4970 files changed, 282029 insertions, 116913 deletions
diff --git a/.gitignore b/.gitignore
index e806c2c..7109373 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,7 +16,7 @@
 # Byte compiled python modules.
 *.pyc
 # vim swap files
-.*.swp
+.*.sw?
 .sw?
 #OS X specific files.
 .DS_store
@@ -24,6 +24,13 @@
 #==============================================================================#
 # Explicit files to ignore (only matches one).
 #==============================================================================#
+# Various tag programs
+/tags
+/TAGS
+/GPATH
+/GRTAGS
+/GSYMS
+/GTAGS
 .gitusers
 autom4te.cache
 cscope.files
@@ -45,6 +52,8 @@ tools/clang
 tools/lldb
 # lld, which is tracked independently.
 tools/lld
+# llgo, which is tracked independently.
+tools/llgo
 # Polly, which is tracked independently.
 tools/polly
 # Sphinx build tree, if building in-source dir.
diff --git a/Android.mk b/Android.mk
index 1f4421c..bc628e1 100644
--- a/Android.mk
+++ b/Android.mk
@@ -14,11 +14,13 @@ subdirs := \
   lib/ExecutionEngine \
   lib/ExecutionEngine/RuntimeDyld \
   lib/ExecutionEngine/MCJIT \
+  lib/ExecutionEngine/Orc \
   lib/ExecutionEngine/Interpreter \
   lib/CodeGen \
   lib/CodeGen/AsmPrinter \
   lib/CodeGen/SelectionDAG \
-  lib/DebugInfo \
+  lib/DebugInfo/DWARF \
+  lib/DebugInfo/PDB \
   lib/IR \
   lib/IRReader \
   lib/Linker \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6691189..ed7aeb0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,11 @@
 
 cmake_minimum_required(VERSION 2.8.8)
 
+if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "No build type selected, default to Debug")
+  set(CMAKE_BUILD_TYPE "Debug")
+endif()
+
 # FIXME: It may be removed when we use 2.8.12.
 if(CMAKE_VERSION VERSION_LESS 2.8.12)
   # Invalidate a couple of keywords.
@@ -16,6 +21,15 @@ else()
   endif()
 endif()
 
+if (POLICY CMP0051)
+  # CMake 3.1 and higher include generator expressions of the form
+  # $<TARGETLIB:obj> in the SOURCES property.  These need to be
+  # stripped everywhere that access the SOURCES property, so we just
+  # defer to the OLD behavior of not including generator expressions
+  # in the output for now.
+  cmake_policy(SET CMP0051 OLD)
+endif()
+
 if(CMAKE_VERSION VERSION_LESS 3.1.20141117)
   set(cmake_3_2_USES_TERMINAL)
 else()
@@ -47,8 +61,9 @@ set(CMAKE_MODULE_PATH
   )
 
 set(LLVM_VERSION_MAJOR 3)
-set(LLVM_VERSION_MINOR 6)
+set(LLVM_VERSION_MINOR 7)
 set(LLVM_VERSION_PATCH 0)
+set(LLVM_VERSION_SUFFIX svn)
 
 if (NOT PACKAGE_VERSION)
   set(PACKAGE_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}svn")
@@ -96,6 +111,9 @@ if(WIN32 AND NOT UNIX)
     "ExecWait '$INSTDIR/tools/msbuild/install.bat'")
   set(CPACK_NSIS_EXTRA_UNINSTALL_COMMANDS
     "ExecWait '$INSTDIR/tools/msbuild/uninstall.bat'")
+  if( CMAKE_CL_64 )
+    set(CPACK_NSIS_INSTALL_ROOT "$PROGRAMFILES64")
+  endif()
 endif()
 include(CPack)
 
@@ -132,9 +150,11 @@ endif()
 
 string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
 
+set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name (32/64)" )
+
 # They are used as destination of target generators.
 set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin)
-set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib)
+set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX})
 if(WIN32 OR CYGWIN)
   # DLL platform -- put DLLs into bin.
   set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_RUNTIME_OUTPUT_INTDIR})
@@ -151,7 +171,6 @@ set(LLVM_BINARY_DIR       ${CMAKE_CURRENT_BINARY_DIR}  ) # --prefix
 
 set(LLVM_EXAMPLES_BINARY_DIR ${LLVM_BINARY_DIR}/examples)
 set(LLVM_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
-set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name (32/64)" )
 
 set(LLVM_ALL_TARGETS
   AArch64
@@ -324,6 +343,10 @@ option (LLVM_BUILD_EXTERNAL_COMPILER_RT
   "Build compiler-rt as an external project." OFF)
 
 option(LLVM_BUILD_LLVM_DYLIB "Build libllvm dynamic library" OFF)
+option(LLVM_DISABLE_LLVM_DYLIB_ATEXIT "Disable llvm-shlib's atexit destructors." ON)
+if(LLVM_DISABLE_LLVM_DYLIB_ATEXIT)
+  set(DISABLE_LLVM_DYLIB_ATEXIT 1)
+endif()
 
 # All options referred to from HandleLLVMOptions have to be specified
 # BEFORE this include, otherwise options will not be correctly set on
@@ -339,7 +362,9 @@ set(TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}")
 include(HandleLLVMOptions)
 
 # Verify that we can find a Python 2 interpreter.  Python 3 is unsupported.
-set(Python_ADDITIONAL_VERSIONS 2.7 2.6 2.5)
+# FIXME: We should support systems with only Python 3, but that requires work
+# on LLDB.
+set(Python_ADDITIONAL_VERSIONS 2.7)
 include(FindPythonInterp)
 if( NOT PYTHONINTERP_FOUND )
   message(FATAL_ERROR
@@ -348,6 +373,10 @@ if( NOT PYTHONINTERP_FOUND )
 Please install Python or specify the PYTHON_EXECUTABLE CMake variable.")
 endif()
 
+if( ${PYTHON_VERSION_STRING} VERSION_LESS 2.7 )
+  message(FATAL_ERROR "Python 2.7 or newer is required")
+endif()
+
 ######
 # LLVMBuild Integration
 #
@@ -473,8 +502,8 @@ configure_file(
 
 # They are not referenced. See set_output_directory().
 set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/bin )
-set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib )
-set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib )
+set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} )
+set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} )
 
 set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
 if (APPLE)
@@ -482,7 +511,7 @@ if (APPLE)
   set(CMAKE_INSTALL_RPATH "@executable_path/../lib")
 else(UNIX)
   if(NOT DEFINED CMAKE_INSTALL_RPATH)
-    set(CMAKE_INSTALL_RPATH "\$ORIGIN/../lib")
+    set(CMAKE_INSTALL_RPATH "\$ORIGIN/../lib${LLVM_LIBDIR_SUFFIX}")
     if (${CMAKE_SYSTEM_NAME} MATCHES FreeBSD)
       set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,origin")
       set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,origin")
@@ -561,6 +590,12 @@ if(LLVM_INCLUDE_TESTS)
   add_subdirectory(utils/unittest)
 endif()
 
+foreach( binding ${LLVM_BINDINGS_LIST} )
+  if( EXISTS "${LLVM_MAIN_SRC_DIR}/bindings/${binding}/CMakeLists.txt" )
+    add_subdirectory(bindings/${binding})
+  endif()
+endforeach()
+
 add_subdirectory(projects)
 
 if(WITH_POLLY)
@@ -610,6 +645,7 @@ add_subdirectory(cmake/modules)
 if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
   install(DIRECTORY include/llvm include/llvm-c
     DESTINATION include
+    COMPONENT llvm-headers
     FILES_MATCHING
     PATTERN "*.def"
     PATTERN "*.h"
@@ -621,6 +657,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
 
   install(DIRECTORY ${LLVM_INCLUDE_DIR}/llvm
     DESTINATION include
+    COMPONENT llvm-headers
     FILES_MATCHING
     PATTERN "*.def"
     PATTERN "*.h"
@@ -631,4 +668,12 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
     PATTERN "config.h" EXCLUDE
     PATTERN ".svn" EXCLUDE
     )
+
+  if (NOT CMAKE_CONFIGURATION_TYPES)
+    add_custom_target(installhdrs
+                      DEPENDS ${name}
+                      COMMAND "${CMAKE_COMMAND}"
+                              -DCMAKE_INSTALL_COMPONENT=llvm-headers
+                              -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
+  endif()
 endif()
diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT
index 071f6c8..b674756 100644
--- a/CODE_OWNERS.TXT
+++ b/CODE_OWNERS.TXT
@@ -20,6 +20,10 @@ N: Rafael Avila de Espindola
 E: rafael.espindola@gmail.com
 D: Gold plugin (tools/gold/*)
 
+N: Justin Bogner
+E: mail@justinbogner.com
+D: InstrProfiling and related parts of ProfileData
+
 N: Chandler Carruth
 E: chandlerc@gmail.com
 E: chandlerc@google.com
@@ -29,10 +33,6 @@ N: Evan Cheng
 E: evan.cheng@apple.com
 D: ARM target, parts of code generator not covered by someone else
 
-N: Renato Golin
-E: renato.golin@linaro.org
-D: ARM Linux support
-
 N: Eric Christopher
 E: echristo@gmail.com
 D: Debug Information, autotools/configure/make build, inline assembly
@@ -41,18 +41,30 @@ N: Greg Clayton
 E: gclayton@apple.com
 D: LLDB
 
+N: Marshall Clow
+E: mclow.lists@gmail.com
+D: libc++
+
 N: Peter Collingbourne
 E: peter@pcc.me.uk
-D: libclc
+D: llgo
 
 N: Anshuman Dasgupta
 E: adasgupt@codeaurora.org
 D: Hexagon Backend
 
+N: Duncan P. N. Exon Smith
+E: dexonsmith@apple.com
+D: Branch weights and BlockFrequencyInfo
+
 N: Hal Finkel
 E: hfinkel@anl.gov
 D: BBVectorize, the loop reroller, alias analysis and the PowerPC target
 
+N: Renato Golin
+E: renato.golin@linaro.org
+D: ARM Linux support
+
 N: Venkatraman Govindaraju
 E: venkatra@cs.wisc.edu
 D: Sparc Backend (lib/Target/Sparc/*)
@@ -65,10 +77,6 @@ N: James Grosbach
 E: grosbach@apple.com
 D: MC layer
 
-N: Marshall Clow
-E: mclow.lists@gmail.com
-D: libc++
-
 N: Justin Holewinski
 E: jholewinski@nvidia.com
 D: NVPTX Target (lib/Target/NVPTX/*)
@@ -102,6 +110,10 @@ N: Tim Northover
 E: t.p.northover@gmail.com
 D: AArch64 backend
 
+N: Diego Novillo
+E: dnovillo@google.com
+D: SampleProfile and related parts of ProfileData
+
 N: Jakob Olesen
 E: stoklund@2pi.dk
 D: Register allocators and TableGen
@@ -114,6 +126,10 @@ N: Chad Rosier
 E: mcrosier@codeaurora.org
 D: Fast-Isel
 
+N: Alex Rosenberg
+E: alexr@leftfield.org
+D: Sony PlayStation®4 support
+
 N: Nadav Rotem
 E: nrotem@apple.com
 D: X86 Backend, Loop Vectorizer
@@ -122,10 +138,6 @@ N: Daniel Sanders
 E: daniel.sanders@imgtec.com
 D: MIPS Backend (lib/Target/Mips/*)
 
-N: Richard Sandiford
-E: rsandifo@linux.vnet.ibm.com
-D: SystemZ Backend
-
 N: Duncan Sands
 E: baldrick@free.fr
 D: DragonEgg
@@ -138,10 +150,14 @@ N: Michael Spencer
 E: bigcheesegs@gmail.com
 D: Windows parts of Support, Object, ar, nm, objdump, ranlib, size
 
+N: Alexei Starovoitov
+E: alexei.starovoitov@gmail.com
+D: BPF backend
+
 N: Tom Stellard
 E: thomas.stellard@amd.com
 E: mesa-dev@lists.freedesktop.org
-D: Release manager for the 3.5 branch, R600 Backend
+D: Release manager for the 3.5 branch, R600 Backend, libclc
 
 N: Evgeniy Stepanov
 E: eugenis@google.com
@@ -151,6 +167,10 @@ N: Andrew Trick
 E: atrick@apple.com
 D: IndVar Simplify, Loop Strength Reduction, Instruction Scheduling
 
+N: Ulrich Weigand
+E: uweigand@de.ibm.com
+D: SystemZ Backend
+
 N: Bill Wendling
 E: isanbard@gmail.com
 D: libLTO, IR Linker
diff --git a/README.txt b/README.txt
index 193330f..6358a06 100644
--- a/README.txt
+++ b/README.txt
@@ -1,8 +1,8 @@
 Low Level Virtual Machine (LLVM)
 ================================
 
-This directory and its subdirectories contain source code for the Low Level
-Virtual Machine, a toolkit for the construction of highly optimized compilers,
+This directory and its subdirectories contain source code for LLVM,
+a toolkit for the construction of highly optimized compilers,
 optimizers, and runtime environments.
 
 LLVM is open source software. You may freely distribute it under the terms of
diff --git a/autoconf/config.sub b/autoconf/config.sub
index 673d62b..2583c90 100755
--- a/autoconf/config.sub
+++ b/autoconf/config.sub
@@ -1354,7 +1354,7 @@ case $os in
 	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
 	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
 	      | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
-	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
+	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* | -bitrig*)
 	# Remember, each alternative MUST END IN *, to match a version number.
 		;;
 	-qnx*)
@@ -1489,6 +1489,8 @@ case $os in
 		;;
 	-nacl*)
 		;;
+	-ps4)
+		;;
 	-none)
 		;;
 	*)
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index c998dd5..e0bc783 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -32,10 +32,10 @@ dnl===-----------------------------------------------------------------------===
 dnl Initialize autoconf and define the package name, version number and
 dnl address for reporting bugs.
 
-AC_INIT([LLVM],[3.6.0svn],[http://llvm.org/bugs/])
+AC_INIT([LLVM],[3.7.0svn],[http://llvm.org/bugs/])
 
 LLVM_VERSION_MAJOR=3
-LLVM_VERSION_MINOR=6
+LLVM_VERSION_MINOR=7
 LLVM_VERSION_PATCH=0
 LLVM_VERSION_SUFFIX=svn
 
@@ -267,6 +267,11 @@ AC_CACHE_CHECK([type of operating system we're going to host on],
     llvm_cv_no_link_all_option="-Wl,--no-whole-archive"
     llvm_cv_os_type="DragonFly"
     llvm_cv_platform_type="Unix" ;;
+  *-*-bitrig*)
+    llvm_cv_link_all_option="-Wl,--whole-archive"
+    llvm_cv_no_link_all_option="-Wl,--no-whole-archive"
+    llvm_cv_os_type="Bitrig"
+    llvm_cv_platform_type="Unix" ;;
   *-*-hpux*)
     llvm_cv_link_all_option="-Wl,--whole-archive"
     llvm_cv_no_link_all_option="-Wl,--no-whole-archive"
@@ -347,6 +352,8 @@ AC_CACHE_CHECK([type of operating system we're going to target],
     llvm_cv_target_os_type="NetBSD" ;;
   *-*-dragonfly*)
     llvm_cv_target_os_type="DragonFly" ;;
+  *-*-bitrig*)
+    llvm_cv_target_os_type="Bitrig" ;;
   *-*-hpux*)
     llvm_cv_target_os_type="HP-UX" ;;
   *-*-interix*)
@@ -369,6 +376,8 @@ AC_CACHE_CHECK([type of operating system we're going to target],
     llvm_cv_target_os_type="NativeClient" ;;
   *-unknown-eabi*)
     llvm_cv_target_os_type="Freestanding" ;;
+  *-*-ps4)
+    llvm_cv_target_os_type="PS4" ;;
   *)
     llvm_cv_target_os_type="Unknown" ;;
 esac])
@@ -1533,11 +1542,11 @@ AC_ARG_WITH(oprofile,
             AC_SEARCH_LIBS(bfd_init, bfd, [], [])
             AC_SEARCH_LIBS(op_open_agent, opagent, [], [
               echo "Error! You need to have libopagent around."
-              exit -1
+              exit 1
             ])
             AC_CHECK_HEADER([opagent.h], [], [
               echo "Error! You need to have opagent.h around."
-              exit -1
+              exit 1
               ])
           fi ;;
         *)
@@ -1714,7 +1723,9 @@ if test "$llvm_cv_os_type" = "MingW" ; then
   AC_CHECK_LIB(gcc,_alloca,AC_DEFINE([HAVE__ALLOCA],[1],[Have host's _alloca]))
   AC_CHECK_LIB(gcc,__alloca,AC_DEFINE([HAVE___ALLOCA],[1],[Have host's __alloca]))
   AC_CHECK_LIB(gcc,__chkstk,AC_DEFINE([HAVE___CHKSTK],[1],[Have host's __chkstk]))
+  AC_CHECK_LIB(gcc,__chkstk_ms,AC_DEFINE([HAVE___CHKSTK_MS],[1],[Have host's __chkstk_ms]))
   AC_CHECK_LIB(gcc,___chkstk,AC_DEFINE([HAVE____CHKSTK],[1],[Have host's ___chkstk]))
+  AC_CHECK_LIB(gcc,___chkstk_ms,AC_DEFINE([HAVE____CHKSTK_MS],[1],[Have host's ___chkstk_ms]))
 
   AC_CHECK_LIB(gcc,__ashldi3,AC_DEFINE([HAVE___ASHLDI3],[1],[Have host's __ashldi3]))
   AC_CHECK_LIB(gcc,__ashrdi3,AC_DEFINE([HAVE___ASHRDI3],[1],[Have host's __ashrdi3]))
@@ -2017,6 +2028,11 @@ if test "${clang_src_root}" = ""; then
   clang_src_root="$srcdir/tools/clang"
 fi
 if test -f ${clang_src_root}/README.txt; then
+  dnl Clang supports build systems which use the multilib libdir suffix.
+  dnl The autoconf system doesn't support this so stub out that variable.
+  AC_DEFINE_UNQUOTED(CLANG_LIBDIR_SUFFIX,"",
+                     [Multilib suffix for libdir.])
+
   dnl Use variables to stay under 80 columns.
   configh="include/clang/Config/config.h"
   doxy="docs/doxygen.cfg"
diff --git a/bindings/go/llvm/DIBuilderBindings.cpp b/bindings/go/llvm/DIBuilderBindings.cpp
index 94fa96f..a7d75a3 100644
--- a/bindings/go/llvm/DIBuilderBindings.cpp
+++ b/bindings/go/llvm/DIBuilderBindings.cpp
@@ -12,21 +12,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "DIBuilderBindings.h"
-
-#include "llvm/IR/Module.h"
+#include "IRBindings.h"
 #include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
 
 using namespace llvm;
 
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(DIBuilder, LLVMDIBuilderRef)
+
 namespace {
-template <typename T>
-T unwrapDI(LLVMValueRef v) {
+template <typename T> T unwrapDI(LLVMMetadataRef v) {
   return v ? T(unwrap<MDNode>(v)) : T();
 }
 }
 
-DEFINE_SIMPLE_CONVERSION_FUNCTIONS(DIBuilder, LLVMDIBuilderRef)
-
 LLVMDIBuilderRef LLVMNewDIBuilder(LLVMModuleRef mref) {
   Module *m = unwrap(mref);
   return wrap(new DIBuilder(*m));
@@ -39,49 +39,50 @@ void LLVMDIBuilderDestroy(LLVMDIBuilderRef dref) {
 
 void LLVMDIBuilderFinalize(LLVMDIBuilderRef dref) { unwrap(dref)->finalize(); }
 
-LLVMValueRef LLVMDIBuilderCreateCompileUnit(LLVMDIBuilderRef Dref,
-                                            unsigned Lang, const char *File,
-                                            const char *Dir,
-                                            const char *Producer, int Optimized,
-                                            const char *Flags,
-                                            unsigned RuntimeVersion) {
+LLVMMetadataRef LLVMDIBuilderCreateCompileUnit(LLVMDIBuilderRef Dref,
+                                               unsigned Lang, const char *File,
+                                               const char *Dir,
+                                               const char *Producer,
+                                               int Optimized, const char *Flags,
+                                               unsigned RuntimeVersion) {
   DIBuilder *D = unwrap(Dref);
   DICompileUnit CU = D->createCompileUnit(Lang, File, Dir, Producer, Optimized,
                                           Flags, RuntimeVersion);
   return wrap(CU);
 }
 
-LLVMValueRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef Dref, const char *File,
-                                     const char *Dir) {
+LLVMMetadataRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef Dref, const char *File,
+                                        const char *Dir) {
   DIBuilder *D = unwrap(Dref);
   DIFile F = D->createFile(File, Dir);
   return wrap(F);
 }
 
-LLVMValueRef LLVMDIBuilderCreateLexicalBlock(LLVMDIBuilderRef Dref,
-                                             LLVMValueRef Scope,
-                                             LLVMValueRef File, unsigned Line,
-                                             unsigned Column) {
+LLVMMetadataRef LLVMDIBuilderCreateLexicalBlock(LLVMDIBuilderRef Dref,
+                                                LLVMMetadataRef Scope,
+                                                LLVMMetadataRef File,
+                                                unsigned Line,
+                                                unsigned Column) {
   DIBuilder *D = unwrap(Dref);
   DILexicalBlock LB = D->createLexicalBlock(
       unwrapDI<DIDescriptor>(Scope), unwrapDI<DIFile>(File), Line, Column);
   return wrap(LB);
 }
 
-LLVMValueRef LLVMDIBuilderCreateLexicalBlockFile(LLVMDIBuilderRef Dref,
-                                                 LLVMValueRef Scope,
-                                                 LLVMValueRef File,
-                                                 unsigned Discriminator) {
+LLVMMetadataRef LLVMDIBuilderCreateLexicalBlockFile(LLVMDIBuilderRef Dref,
+                                                    LLVMMetadataRef Scope,
+                                                    LLVMMetadataRef File,
+                                                    unsigned Discriminator) {
   DIBuilder *D = unwrap(Dref);
   DILexicalBlockFile LBF = D->createLexicalBlockFile(
       unwrapDI<DIDescriptor>(Scope), unwrapDI<DIFile>(File), Discriminator);
   return wrap(LBF);
 }
 
-LLVMValueRef LLVMDIBuilderCreateFunction(
-    LLVMDIBuilderRef Dref, LLVMValueRef Scope, const char *Name,
-    const char *LinkageName, LLVMValueRef File, unsigned Line,
-    LLVMValueRef CompositeType, int IsLocalToUnit, int IsDefinition,
+LLVMMetadataRef LLVMDIBuilderCreateFunction(
+    LLVMDIBuilderRef Dref, LLVMMetadataRef Scope, const char *Name,
+    const char *LinkageName, LLVMMetadataRef File, unsigned Line,
+    LLVMMetadataRef CompositeType, int IsLocalToUnit, int IsDefinition,
     unsigned ScopeLine, unsigned Flags, int IsOptimized, LLVMValueRef Func) {
   DIBuilder *D = unwrap(Dref);
   DISubprogram SP = D->createFunction(
@@ -91,10 +92,10 @@ LLVMValueRef LLVMDIBuilderCreateFunction(
   return wrap(SP);
 }
 
-LLVMValueRef LLVMDIBuilderCreateLocalVariable(
-    LLVMDIBuilderRef Dref, unsigned Tag, LLVMValueRef Scope, const char *Name,
-    LLVMValueRef File, unsigned Line, LLVMValueRef Ty, int AlwaysPreserve,
-    unsigned Flags, unsigned ArgNo) {
+LLVMMetadataRef LLVMDIBuilderCreateLocalVariable(
+    LLVMDIBuilderRef Dref, unsigned Tag, LLVMMetadataRef Scope,
+    const char *Name, LLVMMetadataRef File, unsigned Line, LLVMMetadataRef Ty,
+    int AlwaysPreserve, unsigned Flags, unsigned ArgNo) {
   DIBuilder *D = unwrap(Dref);
   DIVariable V = D->createLocalVariable(
       Tag, unwrapDI<DIDescriptor>(Scope), Name, unwrapDI<DIFile>(File), Line,
@@ -102,39 +103,41 @@ LLVMValueRef LLVMDIBuilderCreateLocalVariable(
   return wrap(V);
 }
 
-LLVMValueRef LLVMDIBuilderCreateBasicType(LLVMDIBuilderRef Dref,
-                                          const char *Name, uint64_t SizeInBits,
-                                          uint64_t AlignInBits,
-                                          unsigned Encoding) {
+LLVMMetadataRef LLVMDIBuilderCreateBasicType(LLVMDIBuilderRef Dref,
+                                             const char *Name,
+                                             uint64_t SizeInBits,
+                                             uint64_t AlignInBits,
+                                             unsigned Encoding) {
   DIBuilder *D = unwrap(Dref);
   DIBasicType T = D->createBasicType(Name, SizeInBits, AlignInBits, Encoding);
   return wrap(T);
 }
 
-LLVMValueRef LLVMDIBuilderCreatePointerType(LLVMDIBuilderRef Dref,
-                                            LLVMValueRef PointeeType,
-                                            uint64_t SizeInBits,
-                                            uint64_t AlignInBits,
-                                            const char *Name) {
+LLVMMetadataRef LLVMDIBuilderCreatePointerType(LLVMDIBuilderRef Dref,
+                                               LLVMMetadataRef PointeeType,
+                                               uint64_t SizeInBits,
+                                               uint64_t AlignInBits,
+                                               const char *Name) {
   DIBuilder *D = unwrap(Dref);
   DIDerivedType T = D->createPointerType(unwrapDI<DIType>(PointeeType),
                                          SizeInBits, AlignInBits, Name);
   return wrap(T);
 }
 
-LLVMValueRef LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef Dref,
-                                               LLVMValueRef File,
-                                               LLVMValueRef ParameterTypes) {
+LLVMMetadataRef
+LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef Dref, LLVMMetadataRef File,
+                                  LLVMMetadataRef ParameterTypes) {
   DIBuilder *D = unwrap(Dref);
   DICompositeType CT = D->createSubroutineType(
       unwrapDI<DIFile>(File), unwrapDI<DITypeArray>(ParameterTypes));
   return wrap(CT);
 }
 
-LLVMValueRef LLVMDIBuilderCreateStructType(
-    LLVMDIBuilderRef Dref, LLVMValueRef Scope, const char *Name,
-    LLVMValueRef File, unsigned Line, uint64_t SizeInBits, uint64_t AlignInBits,
-    unsigned Flags, LLVMValueRef DerivedFrom, LLVMValueRef ElementTypes) {
+LLVMMetadataRef LLVMDIBuilderCreateStructType(
+    LLVMDIBuilderRef Dref, LLVMMetadataRef Scope, const char *Name,
+    LLVMMetadataRef File, unsigned Line, uint64_t SizeInBits,
+    uint64_t AlignInBits, unsigned Flags, LLVMMetadataRef DerivedFrom,
+    LLVMMetadataRef ElementTypes) {
   DIBuilder *D = unwrap(Dref);
   DICompositeType CT = D->createStructType(
       unwrapDI<DIDescriptor>(Scope), Name, unwrapDI<DIFile>(File), Line,
@@ -143,10 +146,12 @@ LLVMValueRef LLVMDIBuilderCreateStructType(
   return wrap(CT);
 }
 
-LLVMValueRef LLVMDIBuilderCreateMemberType(
-    LLVMDIBuilderRef Dref, LLVMValueRef Scope, const char *Name,
-    LLVMValueRef File, unsigned Line, uint64_t SizeInBits, uint64_t AlignInBits,
-    uint64_t OffsetInBits, unsigned Flags, LLVMValueRef Ty) {
+LLVMMetadataRef
+LLVMDIBuilderCreateMemberType(LLVMDIBuilderRef Dref, LLVMMetadataRef Scope,
+                              const char *Name, LLVMMetadataRef File,
+                              unsigned Line, uint64_t SizeInBits,
+                              uint64_t AlignInBits, uint64_t OffsetInBits,
+                              unsigned Flags, LLVMMetadataRef Ty) {
   DIBuilder *D = unwrap(Dref);
   DIDerivedType DT = D->createMemberType(
       unwrapDI<DIDescriptor>(Scope), Name, unwrapDI<DIFile>(File), Line,
@@ -154,11 +159,11 @@ LLVMValueRef LLVMDIBuilderCreateMemberType(
   return wrap(DT);
 }
 
-LLVMValueRef LLVMDIBuilderCreateArrayType(LLVMDIBuilderRef Dref,
-                                          uint64_t SizeInBits,
-                                          uint64_t AlignInBits,
-                                          LLVMValueRef ElementType,
-                                          LLVMValueRef Subscripts) {
+LLVMMetadataRef LLVMDIBuilderCreateArrayType(LLVMDIBuilderRef Dref,
+                                             uint64_t SizeInBits,
+                                             uint64_t AlignInBits,
+                                             LLVMMetadataRef ElementType,
+                                             LLVMMetadataRef Subscripts) {
   DIBuilder *D = unwrap(Dref);
   DICompositeType CT =
       D->createArrayType(SizeInBits, AlignInBits, unwrapDI<DIType>(ElementType),
@@ -166,9 +171,10 @@ LLVMValueRef LLVMDIBuilderCreateArrayType(LLVMDIBuilderRef Dref,
   return wrap(CT);
 }
 
-LLVMValueRef LLVMDIBuilderCreateTypedef(LLVMDIBuilderRef Dref, LLVMValueRef Ty,
-                                        const char *Name, LLVMValueRef File,
-                                        unsigned Line, LLVMValueRef Context) {
+LLVMMetadataRef LLVMDIBuilderCreateTypedef(LLVMDIBuilderRef Dref,
+                                           LLVMMetadataRef Ty, const char *Name,
+                                           LLVMMetadataRef File, unsigned Line,
+                                           LLVMMetadataRef Context) {
   DIBuilder *D = unwrap(Dref);
   DIDerivedType DT =
       D->createTypedef(unwrapDI<DIType>(Ty), Name, unwrapDI<DIFile>(File), Line,
@@ -176,34 +182,35 @@ LLVMValueRef LLVMDIBuilderCreateTypedef(LLVMDIBuilderRef Dref, LLVMValueRef Ty,
   return wrap(DT);
 }
 
-LLVMValueRef LLVMDIBuilderGetOrCreateSubrange(LLVMDIBuilderRef Dref, int64_t Lo,
-                                              int64_t Count) {
+LLVMMetadataRef LLVMDIBuilderGetOrCreateSubrange(LLVMDIBuilderRef Dref,
+                                                 int64_t Lo, int64_t Count) {
   DIBuilder *D = unwrap(Dref);
   DISubrange S = D->getOrCreateSubrange(Lo, Count);
   return wrap(S);
 }
 
-LLVMValueRef LLVMDIBuilderGetOrCreateArray(LLVMDIBuilderRef Dref,
-                                           LLVMValueRef *Data, size_t Length) {
+LLVMMetadataRef LLVMDIBuilderGetOrCreateArray(LLVMDIBuilderRef Dref,
+                                              LLVMMetadataRef *Data,
+                                              size_t Length) {
   DIBuilder *D = unwrap(Dref);
-  Value **DataValue = unwrap(Data);
-  ArrayRef<Value *> Elements(DataValue, Length);
+  Metadata **DataValue = unwrap(Data);
+  ArrayRef<Metadata *> Elements(DataValue, Length);
   DIArray A = D->getOrCreateArray(Elements);
   return wrap(A);
 }
 
-LLVMValueRef LLVMDIBuilderGetOrCreateTypeArray(LLVMDIBuilderRef Dref,
-                                               LLVMValueRef *Data,
-                                               size_t Length) {
+LLVMMetadataRef LLVMDIBuilderGetOrCreateTypeArray(LLVMDIBuilderRef Dref,
+                                                  LLVMMetadataRef *Data,
+                                                  size_t Length) {
   DIBuilder *D = unwrap(Dref);
-  Value **DataValue = unwrap(Data);
-  ArrayRef<Value *> Elements(DataValue, Length);
+  Metadata **DataValue = unwrap(Data);
+  ArrayRef<Metadata *> Elements(DataValue, Length);
   DITypeArray A = D->getOrCreateTypeArray(Elements);
   return wrap(A);
 }
 
-LLVMValueRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Dref, int64_t *Addr,
-                                           size_t Length) {
+LLVMMetadataRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Dref,
+                                              int64_t *Addr, size_t Length) {
   DIBuilder *D = unwrap(Dref);
   DIExpression Expr = D->createExpression(ArrayRef<int64_t>(Addr, Length));
   return wrap(Expr);
@@ -211,8 +218,8 @@ LLVMValueRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Dref, int64_t *Addr,
 
 LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(LLVMDIBuilderRef Dref,
                                              LLVMValueRef Storage,
-                                             LLVMValueRef VarInfo,
-                                             LLVMValueRef Expr,
+                                             LLVMMetadataRef VarInfo,
+                                             LLVMMetadataRef Expr,
                                              LLVMBasicBlockRef Block) {
   DIBuilder *D = unwrap(Dref);
   Instruction *Instr =
@@ -220,3 +227,15 @@ LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(LLVMDIBuilderRef Dref,
                        unwrapDI<DIExpression>(Expr), unwrap(Block));
   return wrap(Instr);
 }
+
+LLVMValueRef LLVMDIBuilderInsertValueAtEnd(LLVMDIBuilderRef Dref,
+                                           LLVMValueRef Val, uint64_t Offset,
+                                           LLVMMetadataRef VarInfo,
+                                           LLVMMetadataRef Expr,
+                                           LLVMBasicBlockRef Block) {
+  DIBuilder *D = unwrap(Dref);
+  Instruction *Instr = D->insertDbgValueIntrinsic(
+      unwrap(Val), Offset, unwrapDI<DIVariable>(VarInfo),
+      unwrapDI<DIExpression>(Expr), unwrap(Block));
+  return wrap(Instr);
+}
diff --git a/bindings/go/llvm/DIBuilderBindings.h b/bindings/go/llvm/DIBuilderBindings.h
index e6fe02a..e268b3c 100644
--- a/bindings/go/llvm/DIBuilderBindings.h
+++ b/bindings/go/llvm/DIBuilderBindings.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_BINDINGS_GO_LLVM_DIBUILDERBINDINGS_H
 #define LLVM_BINDINGS_GO_LLVM_DIBUILDERBINDINGS_H
 
+#include "IRBindings.h"
 #include "llvm-c/Core.h"
 
 #ifdef __cplusplus
@@ -31,93 +32,104 @@ LLVMDIBuilderRef LLVMNewDIBuilder(LLVMModuleRef m);
 void LLVMDIBuilderDestroy(LLVMDIBuilderRef d);
 void LLVMDIBuilderFinalize(LLVMDIBuilderRef d);
 
-LLVMValueRef LLVMDIBuilderCreateCompileUnit(LLVMDIBuilderRef D,
-                                            unsigned Language, const char *File,
-                                            const char *Dir,
-                                            const char *Producer, int Optimized,
-                                            const char *Flags,
-                                            unsigned RuntimeVersion);
-
-LLVMValueRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef D, const char *File,
-                                     const char *Dir);
-
-LLVMValueRef LLVMDIBuilderCreateLexicalBlock(LLVMDIBuilderRef D,
-                                             LLVMValueRef Scope,
-                                             LLVMValueRef File, unsigned Line,
-                                             unsigned Column);
-
-LLVMValueRef LLVMDIBuilderCreateLexicalBlockFile(LLVMDIBuilderRef D,
-                                                 LLVMValueRef Scope,
-                                                 LLVMValueRef File,
-                                                 unsigned Discriminator);
-
-LLVMValueRef LLVMDIBuilderCreateFunction(
-    LLVMDIBuilderRef D, LLVMValueRef Scope, const char *Name,
-    const char *LinkageName, LLVMValueRef File, unsigned Line,
-    LLVMValueRef CompositeType, int IsLocalToUnit, int IsDefinition,
+LLVMMetadataRef
+LLVMDIBuilderCreateCompileUnit(LLVMDIBuilderRef D, unsigned Language,
+                               const char *File, const char *Dir,
+                               const char *Producer, int Optimized,
+                               const char *Flags, unsigned RuntimeVersion);
+
+LLVMMetadataRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef D, const char *File,
+                                        const char *Dir);
+
+LLVMMetadataRef LLVMDIBuilderCreateLexicalBlock(LLVMDIBuilderRef D,
+                                                LLVMMetadataRef Scope,
+                                                LLVMMetadataRef File,
+                                                unsigned Line, unsigned Column);
+
+LLVMMetadataRef LLVMDIBuilderCreateLexicalBlockFile(LLVMDIBuilderRef D,
+                                                    LLVMMetadataRef Scope,
+                                                    LLVMMetadataRef File,
+                                                    unsigned Discriminator);
+
+LLVMMetadataRef LLVMDIBuilderCreateFunction(
+    LLVMDIBuilderRef D, LLVMMetadataRef Scope, const char *Name,
+    const char *LinkageName, LLVMMetadataRef File, unsigned Line,
+    LLVMMetadataRef CompositeType, int IsLocalToUnit, int IsDefinition,
     unsigned ScopeLine, unsigned Flags, int IsOptimized, LLVMValueRef Function);
 
-LLVMValueRef LLVMDIBuilderCreateLocalVariable(
-    LLVMDIBuilderRef D, unsigned Tag, LLVMValueRef Scope, const char *Name,
-    LLVMValueRef File, unsigned Line, LLVMValueRef Ty, int AlwaysPreserve,
+LLVMMetadataRef LLVMDIBuilderCreateLocalVariable(
+    LLVMDIBuilderRef D, unsigned Tag, LLVMMetadataRef Scope, const char *Name,
+    LLVMMetadataRef File, unsigned Line, LLVMMetadataRef Ty, int AlwaysPreserve,
     unsigned Flags, unsigned ArgNo);
 
-LLVMValueRef LLVMDIBuilderCreateBasicType(LLVMDIBuilderRef D, const char *Name,
-                                          uint64_t SizeInBits,
-                                          uint64_t AlignInBits,
-                                          unsigned Encoding);
-
-LLVMValueRef LLVMDIBuilderCreatePointerType(LLVMDIBuilderRef D,
-                                            LLVMValueRef PointeeType,
-                                            uint64_t SizeInBits,
-                                            uint64_t AlignInBits,
-                                            const char *Name);
-
-LLVMValueRef LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef D,
-                                               LLVMValueRef File,
-                                               LLVMValueRef ParameterTypes);
-
-LLVMValueRef LLVMDIBuilderCreateStructType(
-    LLVMDIBuilderRef D, LLVMValueRef Scope, const char *Name, LLVMValueRef File,
-    unsigned Line, uint64_t SizeInBits, uint64_t AlignInBits, unsigned Flags,
-    LLVMValueRef DerivedFrom, LLVMValueRef ElementTypes);
-
-LLVMValueRef LLVMDIBuilderCreateMemberType(
-    LLVMDIBuilderRef D, LLVMValueRef Scope, const char *Name, LLVMValueRef File,
-    unsigned Line, uint64_t SizeInBits, uint64_t AlignInBits,
-    uint64_t OffsetInBits, unsigned Flags, LLVMValueRef Ty);
-
-LLVMValueRef LLVMDIBuilderCreateArrayType(LLVMDIBuilderRef D,
-                                          uint64_t SizeInBits,
-                                          uint64_t AlignInBits,
-                                          LLVMValueRef ElementType,
-                                          LLVMValueRef Subscripts);
-
-LLVMValueRef LLVMDIBuilderCreateTypedef(LLVMDIBuilderRef D, LLVMValueRef Ty,
-                                        const char *Name, LLVMValueRef File,
-                                        unsigned Line, LLVMValueRef Context);
-
-LLVMValueRef LLVMDIBuilderGetOrCreateSubrange(LLVMDIBuilderRef D, int64_t Lo,
-                                              int64_t Count);
-
-LLVMValueRef LLVMDIBuilderGetOrCreateArray(LLVMDIBuilderRef D,
-                                           LLVMValueRef *Data, size_t Length);
-
-LLVMValueRef LLVMDIBuilderGetOrCreateTypeArray(LLVMDIBuilderRef D,
-                                               LLVMValueRef *Data,
-                                               size_t Length);
-
-LLVMValueRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Dref, int64_t *Addr,
-                                           size_t Length);
+LLVMMetadataRef LLVMDIBuilderCreateBasicType(LLVMDIBuilderRef D,
+                                             const char *Name,
+                                             uint64_t SizeInBits,
+                                             uint64_t AlignInBits,
+                                             unsigned Encoding);
+
+LLVMMetadataRef LLVMDIBuilderCreatePointerType(LLVMDIBuilderRef D,
+                                               LLVMMetadataRef PointeeType,
+                                               uint64_t SizeInBits,
+                                               uint64_t AlignInBits,
+                                               const char *Name);
+
+LLVMMetadataRef
+LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef D, LLVMMetadataRef File,
+                                  LLVMMetadataRef ParameterTypes);
+
+LLVMMetadataRef LLVMDIBuilderCreateStructType(
+    LLVMDIBuilderRef D, LLVMMetadataRef Scope, const char *Name,
+    LLVMMetadataRef File, unsigned Line, uint64_t SizeInBits,
+    uint64_t AlignInBits, unsigned Flags, LLVMMetadataRef DerivedFrom,
+    LLVMMetadataRef ElementTypes);
+
+LLVMMetadataRef
+LLVMDIBuilderCreateMemberType(LLVMDIBuilderRef D, LLVMMetadataRef Scope,
+                              const char *Name, LLVMMetadataRef File,
+                              unsigned Line, uint64_t SizeInBits,
+                              uint64_t AlignInBits, uint64_t OffsetInBits,
+                              unsigned Flags, LLVMMetadataRef Ty);
+
+LLVMMetadataRef LLVMDIBuilderCreateArrayType(LLVMDIBuilderRef D,
+                                             uint64_t SizeInBits,
+                                             uint64_t AlignInBits,
+                                             LLVMMetadataRef ElementType,
+                                             LLVMMetadataRef Subscripts);
+
+LLVMMetadataRef LLVMDIBuilderCreateTypedef(LLVMDIBuilderRef D,
+                                           LLVMMetadataRef Ty, const char *Name,
+                                           LLVMMetadataRef File, unsigned Line,
+                                           LLVMMetadataRef Context);
+
+LLVMMetadataRef LLVMDIBuilderGetOrCreateSubrange(LLVMDIBuilderRef D, int64_t Lo,
+                                                 int64_t Count);
+
+LLVMMetadataRef LLVMDIBuilderGetOrCreateArray(LLVMDIBuilderRef D,
+                                              LLVMMetadataRef *Data,
+                                              size_t Length);
+
+LLVMMetadataRef LLVMDIBuilderGetOrCreateTypeArray(LLVMDIBuilderRef D,
+                                                  LLVMMetadataRef *Data,
+                                                  size_t Length);
+
+LLVMMetadataRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Dref,
+                                              int64_t *Addr, size_t Length);
 
 LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(LLVMDIBuilderRef D,
                                              LLVMValueRef Storage,
-                                             LLVMValueRef VarInfo,
-                                             LLVMValueRef Expr,
+                                             LLVMMetadataRef VarInfo,
+                                             LLVMMetadataRef Expr,
                                              LLVMBasicBlockRef Block);
 
+LLVMValueRef LLVMDIBuilderInsertValueAtEnd(LLVMDIBuilderRef D, LLVMValueRef Val,
+                                           uint64_t Offset,
+                                           LLVMMetadataRef VarInfo,
+                                           LLVMMetadataRef Expr,
+                                           LLVMBasicBlockRef Block);
+
 #ifdef __cplusplus
-}  // extern "C"
+} // extern "C"
 #endif
 
 #endif
diff --git a/bindings/go/llvm/IRBindings.cpp b/bindings/go/llvm/IRBindings.cpp
index 67a54a2..fb451ef 100644
--- a/bindings/go/llvm/IRBindings.cpp
+++ b/bindings/go/llvm/IRBindings.cpp
@@ -12,9 +12,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "IRBindings.h"
-
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
 
 using namespace llvm;
 
@@ -45,3 +48,54 @@ void LLVMRemoveFunctionAttr2(LLVMValueRef Fn, uint64_t PA) {
                                            AttributeSet::FunctionIndex, B));
   Func->setAttributes(PALnew);
 }
+
+LLVMMetadataRef LLVMConstantAsMetadata(LLVMValueRef C) {
+  return wrap(ConstantAsMetadata::get(unwrap<Constant>(C)));
+}
+
+LLVMMetadataRef LLVMMDString2(LLVMContextRef C, const char *Str, unsigned SLen) {
+  return wrap(MDString::get(*unwrap(C), StringRef(Str, SLen)));
+}
+
+LLVMMetadataRef LLVMMDNode2(LLVMContextRef C, LLVMMetadataRef *MDs,
+                            unsigned Count) {
+  return wrap(
+      MDNode::get(*unwrap(C), ArrayRef<Metadata *>(unwrap(MDs), Count)));
+}
+
+LLVMMetadataRef LLVMTemporaryMDNode(LLVMContextRef C, LLVMMetadataRef *MDs,
+                                    unsigned Count) {
+  return wrap(MDTuple::getTemporary(*unwrap(C),
+                                    ArrayRef<Metadata *>(unwrap(MDs), Count))
+                  .release());
+}
+
+void LLVMAddNamedMetadataOperand2(LLVMModuleRef M, const char *name,
+                                  LLVMMetadataRef Val) {
+  NamedMDNode *N = unwrap(M)->getOrInsertNamedMetadata(name);
+  if (!N)
+    return;
+  if (!Val)
+    return;
+  N->addOperand(unwrap<MDNode>(Val));
+}
+
+void LLVMSetMetadata2(LLVMValueRef Inst, unsigned KindID, LLVMMetadataRef MD) {
+  MDNode *N = MD ? unwrap<MDNode>(MD) : nullptr;
+  unwrap<Instruction>(Inst)->setMetadata(KindID, N);
+}
+
+void LLVMMetadataReplaceAllUsesWith(LLVMMetadataRef MD, LLVMMetadataRef New) {
+  auto *Node = unwrap<MDTuple>(MD);
+  assert(Node->isTemporary() && "Expected temporary node");
+  Node->replaceAllUsesWith(unwrap<MDNode>(New));
+  MDNode::deleteTemporary(Node);
+}
+
+void LLVMSetCurrentDebugLocation2(LLVMBuilderRef Bref, unsigned Line,
+                                  unsigned Col, LLVMMetadataRef Scope,
+                                  LLVMMetadataRef InlinedAt) {
+  unwrap(Bref)->SetCurrentDebugLocation(
+      DebugLoc::get(Line, Col, Scope ? unwrap<MDNode>(Scope) : nullptr,
+                    InlinedAt ? unwrap<MDNode>(InlinedAt) : nullptr));
+}
diff --git a/bindings/go/llvm/IRBindings.h b/bindings/go/llvm/IRBindings.h
index cc63e4e..a53e178 100644
--- a/bindings/go/llvm/IRBindings.h
+++ b/bindings/go/llvm/IRBindings.h
@@ -15,12 +15,19 @@
 #define LLVM_BINDINGS_GO_LLVM_IRBINDINGS_H
 
 #include "llvm-c/Core.h"
+#ifdef __cplusplus
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CBindingWrapping.h"
+#endif
+
 #include <stdint.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+typedef struct LLVMOpaqueMetadata *LLVMMetadataRef;
+
 // These functions duplicate the LLVM*FunctionAttr functions in the stable C
 // API. We cannot use the existing functions because they take 32-bit attribute
 // values, and the Go bindings expose all of the LLVM attributes, some of which
@@ -30,8 +37,37 @@ void LLVMAddFunctionAttr2(LLVMValueRef Fn, uint64_t PA);
 uint64_t LLVMGetFunctionAttr2(LLVMValueRef Fn);
 void LLVMRemoveFunctionAttr2(LLVMValueRef Fn, uint64_t PA);
 
+LLVMMetadataRef LLVMConstantAsMetadata(LLVMValueRef Val);
+
+LLVMMetadataRef LLVMMDString2(LLVMContextRef C, const char *Str, unsigned SLen);
+LLVMMetadataRef LLVMMDNode2(LLVMContextRef C, LLVMMetadataRef *MDs,
+                            unsigned Count);
+LLVMMetadataRef LLVMTemporaryMDNode(LLVMContextRef C, LLVMMetadataRef *MDs,
+                                    unsigned Count);
+
+void LLVMAddNamedMetadataOperand2(LLVMModuleRef M, const char *name,
+                                  LLVMMetadataRef Val);
+void LLVMSetMetadata2(LLVMValueRef Inst, unsigned KindID, LLVMMetadataRef MD);
+
+void LLVMMetadataReplaceAllUsesWith(LLVMMetadataRef MD, LLVMMetadataRef New);
+
+void LLVMSetCurrentDebugLocation2(LLVMBuilderRef Bref, unsigned Line,
+                                  unsigned Col, LLVMMetadataRef Scope,
+                                  LLVMMetadataRef InlinedAt);
+
 #ifdef __cplusplus
 }
+
+namespace llvm {
+
+DEFINE_ISA_CONVERSION_FUNCTIONS(Metadata, LLVMMetadataRef)
+
+inline Metadata **unwrap(LLVMMetadataRef *Vals) {
+  return reinterpret_cast<Metadata**>(Vals);
+}
+
+}
+
 #endif
 
 #endif
diff --git a/bindings/go/llvm/InstrumentationBindings.cpp b/bindings/go/llvm/InstrumentationBindings.cpp
index b604abb..8b7bafa 100644
--- a/bindings/go/llvm/InstrumentationBindings.cpp
+++ b/bindings/go/llvm/InstrumentationBindings.cpp
@@ -12,10 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstrumentationBindings.h"
-
 #include "llvm-c/Core.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
 #include "llvm/Transforms/Instrumentation.h"
 
 using namespace llvm;
@@ -37,6 +36,11 @@ void LLVMAddMemorySanitizerPass(LLVMPassManagerRef PM) {
 }
 
 void LLVMAddDataFlowSanitizerPass(LLVMPassManagerRef PM,
-                                  const char *ABIListFile) {
-  unwrap(PM)->add(createDataFlowSanitizerPass(ABIListFile));
+                                  int ABIListFilesNum,
+                                  const char **ABIListFiles) {
+  std::vector<std::string> ABIListFilesVec;
+  for (int i = 0; i != ABIListFilesNum; ++i) {
+    ABIListFilesVec.push_back(ABIListFiles[i]);
+  }
+  unwrap(PM)->add(createDataFlowSanitizerPass(ABIListFilesVec));
 }
diff --git a/bindings/go/llvm/InstrumentationBindings.h b/bindings/go/llvm/InstrumentationBindings.h
index e8dbd59..97af2d5 100644
--- a/bindings/go/llvm/InstrumentationBindings.h
+++ b/bindings/go/llvm/InstrumentationBindings.h
@@ -28,8 +28,8 @@ void LLVMAddAddressSanitizerFunctionPass(LLVMPassManagerRef PM);
 void LLVMAddAddressSanitizerModulePass(LLVMPassManagerRef PM);
 void LLVMAddThreadSanitizerPass(LLVMPassManagerRef PM);
 void LLVMAddMemorySanitizerPass(LLVMPassManagerRef PM);
-void LLVMAddDataFlowSanitizerPass(LLVMPassManagerRef PM,
-                                  const char *ABIListFile);
+void LLVMAddDataFlowSanitizerPass(LLVMPassManagerRef PM, int ABIListFilesNum,
+                                  const char **ABIListFiles);
 
 #ifdef __cplusplus
 }
diff --git a/bindings/go/llvm/SupportBindings.cpp b/bindings/go/llvm/SupportBindings.cpp
index df5f865..5e251b7 100644
--- a/bindings/go/llvm/SupportBindings.cpp
+++ b/bindings/go/llvm/SupportBindings.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "SupportBindings.h"
-
 #include "llvm/Support/DynamicLibrary.h"
 #include <stdlib.h>
 #include <string.h>
diff --git a/bindings/go/llvm/dibuilder.go b/bindings/go/llvm/dibuilder.go
index 1d07e98..3b1a1a6 100644
--- a/bindings/go/llvm/dibuilder.go
+++ b/bindings/go/llvm/dibuilder.go
@@ -121,7 +121,7 @@ type DICompileUnit struct {
 }
 
 // CreateCompileUnit creates compile unit debug metadata.
-func (d *DIBuilder) CreateCompileUnit(cu DICompileUnit) Value {
+func (d *DIBuilder) CreateCompileUnit(cu DICompileUnit) Metadata {
 	file := C.CString(cu.File)
 	defer C.free(unsafe.Pointer(file))
 	dir := C.CString(cu.Dir)
@@ -139,28 +139,28 @@ func (d *DIBuilder) CreateCompileUnit(cu DICompileUnit) Value {
 		flags,
 		C.unsigned(cu.RuntimeVersion),
 	)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // CreateCompileUnit creates file debug metadata.
-func (d *DIBuilder) CreateFile(filename, dir string) Value {
+func (d *DIBuilder) CreateFile(filename, dir string) Metadata {
 	cfilename := C.CString(filename)
 	defer C.free(unsafe.Pointer(cfilename))
 	cdir := C.CString(dir)
 	defer C.free(unsafe.Pointer(cdir))
 	result := C.LLVMDIBuilderCreateFile(d.ref, cfilename, cdir)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // DILexicalBlock holds the values for creating lexical block debug metadata.
 type DILexicalBlock struct {
-	File   Value
+	File   Metadata
 	Line   int
 	Column int
 }
 
 // CreateCompileUnit creates lexical block debug metadata.
-func (d *DIBuilder) CreateLexicalBlock(diScope Value, b DILexicalBlock) Value {
+func (d *DIBuilder) CreateLexicalBlock(diScope Metadata, b DILexicalBlock) Metadata {
 	result := C.LLVMDIBuilderCreateLexicalBlock(
 		d.ref,
 		diScope.C,
@@ -168,22 +168,22 @@ func (d *DIBuilder) CreateLexicalBlock(diScope Value, b DILexicalBlock) Value {
 		C.unsigned(b.Line),
 		C.unsigned(b.Column),
 	)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
-func (d *DIBuilder) CreateLexicalBlockFile(diScope Value, diFile Value, discriminator int) Value {
+func (d *DIBuilder) CreateLexicalBlockFile(diScope Metadata, diFile Metadata, discriminator int) Metadata {
 	result := C.LLVMDIBuilderCreateLexicalBlockFile(d.ref, diScope.C, diFile.C,
 		C.unsigned(discriminator))
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // DIFunction holds the values for creating function debug metadata.
 type DIFunction struct {
 	Name         string
 	LinkageName  string
-	File         Value
+	File         Metadata
 	Line         int
-	Type         Value
+	Type         Metadata
 	LocalToUnit  bool
 	IsDefinition bool
 	ScopeLine    int
@@ -193,7 +193,7 @@ type DIFunction struct {
 }
 
 // CreateCompileUnit creates function debug metadata.
-func (d *DIBuilder) CreateFunction(diScope Value, f DIFunction) Value {
+func (d *DIBuilder) CreateFunction(diScope Metadata, f DIFunction) Metadata {
 	name := C.CString(f.Name)
 	defer C.free(unsafe.Pointer(name))
 	linkageName := C.CString(f.LinkageName)
@@ -213,16 +213,16 @@ func (d *DIBuilder) CreateFunction(diScope Value, f DIFunction) Value {
 		boolToCInt(f.Optimized),
 		f.Function.C,
 	)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // DILocalVariable holds the values for creating local variable debug metadata.
 type DILocalVariable struct {
 	Tag            dwarf.Tag
 	Name           string
-	File           Value
+	File           Metadata
 	Line           int
-	Type           Value
+	Type           Metadata
 	AlwaysPreserve bool
 	Flags          int
 
@@ -232,7 +232,7 @@ type DILocalVariable struct {
 }
 
 // CreateLocalVariable creates local variable debug metadata.
-func (d *DIBuilder) CreateLocalVariable(scope Value, v DILocalVariable) Value {
+func (d *DIBuilder) CreateLocalVariable(scope Metadata, v DILocalVariable) Metadata {
 	name := C.CString(v.Name)
 	defer C.free(unsafe.Pointer(name))
 	result := C.LLVMDIBuilderCreateLocalVariable(
@@ -247,7 +247,7 @@ func (d *DIBuilder) CreateLocalVariable(scope Value, v DILocalVariable) Value {
 		C.unsigned(v.Flags),
 		C.unsigned(v.ArgNo),
 	)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // DIBasicType holds the values for creating basic type debug metadata.
@@ -259,7 +259,7 @@ type DIBasicType struct {
 }
 
 // CreateBasicType creates basic type debug metadata.
-func (d *DIBuilder) CreateBasicType(t DIBasicType) Value {
+func (d *DIBuilder) CreateBasicType(t DIBasicType) Metadata {
 	name := C.CString(t.Name)
 	defer C.free(unsafe.Pointer(name))
 	result := C.LLVMDIBuilderCreateBasicType(
@@ -269,19 +269,19 @@ func (d *DIBuilder) CreateBasicType(t DIBasicType) Value {
 		C.uint64_t(t.AlignInBits),
 		C.unsigned(t.Encoding),
 	)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // DIPointerType holds the values for creating pointer type debug metadata.
 type DIPointerType struct {
-	Pointee     Value
+	Pointee     Metadata
 	SizeInBits  uint64
 	AlignInBits uint64 // optional
 	Name        string // optional
 }
 
 // CreateBasicType creates basic type debug metadata.
-func (d *DIBuilder) CreatePointerType(t DIPointerType) Value {
+func (d *DIBuilder) CreatePointerType(t DIPointerType) Metadata {
 	name := C.CString(t.Name)
 	defer C.free(unsafe.Pointer(name))
 	result := C.LLVMDIBuilderCreatePointerType(
@@ -291,40 +291,40 @@ func (d *DIBuilder) CreatePointerType(t DIPointerType) Value {
 		C.uint64_t(t.AlignInBits),
 		name,
 	)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // DISubroutineType holds the values for creating subroutine type debug metadata.
 type DISubroutineType struct {
 	// File is the file in which the subroutine type is defined.
-	File Value
+	File Metadata
 
 	// Parameters contains the subroutine parameter types,
 	// including the return type at the 0th index.
-	Parameters []Value
+	Parameters []Metadata
 }
 
 // CreateSubroutineType creates subroutine type debug metadata.
-func (d *DIBuilder) CreateSubroutineType(t DISubroutineType) Value {
+func (d *DIBuilder) CreateSubroutineType(t DISubroutineType) Metadata {
 	params := d.getOrCreateTypeArray(t.Parameters)
 	result := C.LLVMDIBuilderCreateSubroutineType(d.ref, t.File.C, params.C)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // DIStructType holds the values for creating struct type debug metadata.
 type DIStructType struct {
 	Name        string
-	File        Value
+	File        Metadata
 	Line        int
 	SizeInBits  uint64
 	AlignInBits uint64
 	Flags       int
-	DerivedFrom Value
-	Elements    []Value
+	DerivedFrom Metadata
+	Elements    []Metadata
 }
 
 // CreateStructType creates struct type debug metadata.
-func (d *DIBuilder) CreateStructType(scope Value, t DIStructType) Value {
+func (d *DIBuilder) CreateStructType(scope Metadata, t DIStructType) Metadata {
 	elements := d.getOrCreateArray(t.Elements)
 	name := C.CString(t.Name)
 	defer C.free(unsafe.Pointer(name))
@@ -340,23 +340,23 @@ func (d *DIBuilder) CreateStructType(scope Value, t DIStructType) Value {
 		t.DerivedFrom.C,
 		elements.C,
 	)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // DIMemberType holds the values for creating member type debug metadata.
 type DIMemberType struct {
 	Name         string
-	File         Value
+	File         Metadata
 	Line         int
 	SizeInBits   uint64
 	AlignInBits  uint64
 	OffsetInBits uint64
 	Flags        int
-	Type         Value
+	Type         Metadata
 }
 
 // CreateMemberType creates struct type debug metadata.
-func (d *DIBuilder) CreateMemberType(scope Value, t DIMemberType) Value {
+func (d *DIBuilder) CreateMemberType(scope Metadata, t DIMemberType) Metadata {
 	name := C.CString(t.Name)
 	defer C.free(unsafe.Pointer(name))
 	result := C.LLVMDIBuilderCreateMemberType(
@@ -371,7 +371,7 @@ func (d *DIBuilder) CreateMemberType(scope Value, t DIMemberType) Value {
 		C.unsigned(t.Flags),
 		t.Type.C,
 	)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // DISubrange describes an integer value range.
@@ -384,13 +384,13 @@ type DISubrange struct {
 type DIArrayType struct {
 	SizeInBits  uint64
 	AlignInBits uint64
-	ElementType Value
+	ElementType Metadata
 	Subscripts  []DISubrange
 }
 
 // CreateArrayType creates struct type debug metadata.
-func (d *DIBuilder) CreateArrayType(t DIArrayType) Value {
-	subscriptsSlice := make([]Value, len(t.Subscripts))
+func (d *DIBuilder) CreateArrayType(t DIArrayType) Metadata {
+	subscriptsSlice := make([]Metadata, len(t.Subscripts))
 	for i, s := range t.Subscripts {
 		subscriptsSlice[i] = d.getOrCreateSubrange(s.Lo, s.Count)
 	}
@@ -402,20 +402,20 @@ func (d *DIBuilder) CreateArrayType(t DIArrayType) Value {
 		t.ElementType.C,
 		subscripts.C,
 	)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // DITypedef holds the values for creating typedef type debug metadata.
 type DITypedef struct {
-	Type    Value
+	Type    Metadata
 	Name    string
-	File    Value
+	File    Metadata
 	Line    int
-	Context Value
+	Context Metadata
 }
 
 // CreateTypedef creates typedef type debug metadata.
-func (d *DIBuilder) CreateTypedef(t DITypedef) Value {
+func (d *DIBuilder) CreateTypedef(t DITypedef) Metadata {
 	name := C.CString(t.Name)
 	defer C.free(unsafe.Pointer(name))
 	result := C.LLVMDIBuilderCreateTypedef(
@@ -426,64 +426,63 @@ func (d *DIBuilder) CreateTypedef(t DITypedef) Value {
 		C.unsigned(t.Line),
 		t.Context.C,
 	)
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // getOrCreateSubrange gets a metadata node for the specified subrange,
 // creating if required.
-func (d *DIBuilder) getOrCreateSubrange(lo, count int64) Value {
+func (d *DIBuilder) getOrCreateSubrange(lo, count int64) Metadata {
 	result := C.LLVMDIBuilderGetOrCreateSubrange(d.ref, C.int64_t(lo), C.int64_t(count))
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // getOrCreateArray gets a metadata node containing the specified values,
 // creating if required.
-func (d *DIBuilder) getOrCreateArray(values []Value) Value {
+func (d *DIBuilder) getOrCreateArray(values []Metadata) Metadata {
 	if len(values) == 0 {
-		return Value{}
-	}
-	var data *C.LLVMValueRef
-	length := len(values)
-	if length > 0 {
-		data = &values[0].C
+		return Metadata{}
 	}
+	data, length := llvmMetadataRefs(values)
 	result := C.LLVMDIBuilderGetOrCreateArray(d.ref, data, C.size_t(length))
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // getOrCreateTypeArray gets a metadata node for a type array containing the
 // specified values, creating if required.
-func (d *DIBuilder) getOrCreateTypeArray(values []Value) Value {
+func (d *DIBuilder) getOrCreateTypeArray(values []Metadata) Metadata {
 	if len(values) == 0 {
-		return Value{}
-	}
-	var data *C.LLVMValueRef
-	length := len(values)
-	if length > 0 {
-		data = &values[0].C
+		return Metadata{}
 	}
+	data, length := llvmMetadataRefs(values)
 	result := C.LLVMDIBuilderGetOrCreateTypeArray(d.ref, data, C.size_t(length))
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // CreateExpression creates a new descriptor for the specified
 // variable which has a complex address expression for its address.
-func (d *DIBuilder) CreateExpression(addr []int64) Value {
+func (d *DIBuilder) CreateExpression(addr []int64) Metadata {
 	var data *C.int64_t
 	if len(addr) > 0 {
 		data = (*C.int64_t)(unsafe.Pointer(&addr[0]))
 	}
 	result := C.LLVMDIBuilderCreateExpression(d.ref, data, C.size_t(len(addr)))
-	return Value{C: result}
+	return Metadata{C: result}
 }
 
 // InsertDeclareAtEnd inserts a call to llvm.dbg.declare at the end of the
 // specified basic block for the given value and associated debug metadata.
-func (d *DIBuilder) InsertDeclareAtEnd(v, diVarInfo, expr Value, bb BasicBlock) Value {
+func (d *DIBuilder) InsertDeclareAtEnd(v Value, diVarInfo, expr Metadata, bb BasicBlock) Value {
 	result := C.LLVMDIBuilderInsertDeclareAtEnd(d.ref, v.C, diVarInfo.C, expr.C, bb.C)
 	return Value{C: result}
 }
 
+// InsertValueAtEnd inserts a call to llvm.dbg.value at the end of the
+// specified basic block for the given value and associated debug metadata.
+func (d *DIBuilder) InsertValueAtEnd(v Value, diVarInfo, expr Metadata, offset uint64, bb BasicBlock) Value {
+	result := C.LLVMDIBuilderInsertValueAtEnd(d.ref, v.C, C.uint64_t(offset), diVarInfo.C, expr.C, bb.C)
+	return Value{C: result}
+}
+
 func boolToCInt(v bool) C.int {
 	if v {
 		return 1
diff --git a/bindings/go/llvm/executionengine.go b/bindings/go/llvm/executionengine.go
index 26b7524..94d4e83 100644
--- a/bindings/go/llvm/executionengine.go
+++ b/bindings/go/llvm/executionengine.go
@@ -30,11 +30,25 @@ type GenericValue struct {
 type ExecutionEngine struct {
 	C C.LLVMExecutionEngineRef
 }
+
 type MCJITCompilerOptions struct {
-	OptLevel           uint
-	CodeModel          CodeModel
-	NoFramePointerElim bool
-	EnableFastISel     bool
+	C C.struct_LLVMMCJITCompilerOptions
+}
+
+func (options *MCJITCompilerOptions) SetMCJITOptimizationLevel(level uint) {
+	options.C.OptLevel = C.uint(level)
+}
+
+func (options *MCJITCompilerOptions) SetMCJITNoFramePointerElim(nfp bool) {
+	options.C.NoFramePointerElim = boolToLLVMBool(nfp)
+}
+
+func (options *MCJITCompilerOptions) SetMCJITEnableFastISel(fastisel bool) {
+	options.C.EnableFastISel = boolToLLVMBool(fastisel)
+}
+
+func (options *MCJITCompilerOptions) SetMCJITCodeModel(CodeModel CodeModel) {
+	options.C.CodeModel = C.LLVMCodeModel(CodeModel)
 }
 
 // helpers
@@ -96,15 +110,15 @@ func NewInterpreter(m Module) (ee ExecutionEngine, err error) {
 	return
 }
 
+func NewMCJITCompilerOptions() MCJITCompilerOptions {
+	var options C.struct_LLVMMCJITCompilerOptions
+	C.LLVMInitializeMCJITCompilerOptions(&options, C.size_t(unsafe.Sizeof(C.struct_LLVMMCJITCompilerOptions{})))
+	return MCJITCompilerOptions{options}
+}
+
 func NewMCJITCompiler(m Module, options MCJITCompilerOptions) (ee ExecutionEngine, err error) {
 	var cmsg *C.char
-	copts := C.struct_LLVMMCJITCompilerOptions{
-		OptLevel:           C.unsigned(options.OptLevel),
-		CodeModel:          C.LLVMCodeModel(options.CodeModel),
-		NoFramePointerElim: boolToLLVMBool(options.NoFramePointerElim),
-		EnableFastISel:     boolToLLVMBool(options.EnableFastISel),
-	}
-	fail := C.LLVMCreateMCJITCompilerForModule(&ee.C, m.C, &copts, C.size_t(unsafe.Sizeof(copts)), &cmsg)
+	fail := C.LLVMCreateMCJITCompilerForModule(&ee.C, m.C, &options.C, C.size_t(unsafe.Sizeof(C.struct_LLVMMCJITCompilerOptions{})), &cmsg)
 	if fail != 0 {
 		ee.C = nil
 		err = errors.New(C.GoString(cmsg))
diff --git a/bindings/go/llvm/executionengine_test.go b/bindings/go/llvm/executionengine_test.go
index 1a3fd45..2b6a3ca 100644
--- a/bindings/go/llvm/executionengine_test.go
+++ b/bindings/go/llvm/executionengine_test.go
@@ -66,7 +66,12 @@ func TestFactorial(t *testing.T) {
 		return
 	}
 
-	engine, err := NewMCJITCompiler(mod, MCJITCompilerOptions{OptLevel: 2})
+	options := NewMCJITCompilerOptions()
+	options.SetMCJITOptimizationLevel(2)
+	options.SetMCJITEnableFastISel(true)
+	options.SetMCJITNoFramePointerElim(true)
+	options.SetMCJITCodeModel(CodeModelJITDefault)
+	engine, err := NewMCJITCompiler(mod, options)
 	if err != nil {
 		t.Errorf("Error creating JIT: %s", err)
 		return
diff --git a/bindings/go/llvm/ir.go b/bindings/go/llvm/ir.go
index 7834f5c..e5916a1 100644
--- a/bindings/go/llvm/ir.go
+++ b/bindings/go/llvm/ir.go
@@ -55,6 +55,9 @@ type (
 	Use struct {
 		C C.LLVMUseRef
 	}
+	Metadata struct {
+		C C.LLVMMetadataRef
+	}
 	Attribute        uint64
 	Opcode           C.LLVMOpcode
 	TypeKind         C.LLVMTypeKind
@@ -80,6 +83,9 @@ func (c Use) IsNil() bool            { return c.C == nil }
 // helpers
 func llvmTypeRefPtr(t *Type) *C.LLVMTypeRef    { return (*C.LLVMTypeRef)(unsafe.Pointer(t)) }
 func llvmValueRefPtr(t *Value) *C.LLVMValueRef { return (*C.LLVMValueRef)(unsafe.Pointer(t)) }
+func llvmMetadataRefPtr(t *Metadata) *C.LLVMMetadataRef {
+	return (*C.LLVMMetadataRef)(unsafe.Pointer(t))
+}
 func llvmBasicBlockRefPtr(t *BasicBlock) *C.LLVMBasicBlockRef {
 	return (*C.LLVMBasicBlockRef)(unsafe.Pointer(t))
 }
@@ -99,6 +105,15 @@ func llvmValueRefs(values []Value) (*C.LLVMValueRef, C.unsigned) {
 	return pt, ptlen
 }
 
+func llvmMetadataRefs(mds []Metadata) (*C.LLVMMetadataRef, C.unsigned) {
+	var pt *C.LLVMMetadataRef
+	ptlen := C.unsigned(len(mds))
+	if ptlen > 0 {
+		pt = llvmMetadataRefPtr(&mds[0])
+	}
+	return pt, ptlen
+}
+
 //-------------------------------------------------------------------------
 // llvm.Attribute
 //-------------------------------------------------------------------------
@@ -421,10 +436,10 @@ func (m Module) SetInlineAsm(asm string) {
 	C.LLVMSetModuleInlineAsm(m.C, casm)
 }
 
-func (m Module) AddNamedMetadataOperand(name string, operand Value) {
+func (m Module) AddNamedMetadataOperand(name string, operand Metadata) {
 	cname := C.CString(name)
 	defer C.free(unsafe.Pointer(cname))
-	C.LLVMAddNamedMetadataOperand(m.C, cname, operand.C)
+	C.LLVMAddNamedMetadataOperand2(m.C, cname, operand.C)
 }
 
 func (m Module) Context() (c Context) {
@@ -628,8 +643,8 @@ func (v Value) Metadata(kind int) (rv Value) {
 	rv.C = C.LLVMGetMetadata(v.C, C.unsigned(kind))
 	return
 }
-func (v Value) SetMetadata(kind int, node Value) {
-	C.LLVMSetMetadata(v.C, C.unsigned(kind), node.C)
+func (v Value) SetMetadata(kind int, node Metadata) {
+	C.LLVMSetMetadata2(v.C, C.unsigned(kind), node.C)
 }
 
 // Conversion functions.
@@ -723,26 +738,24 @@ func (v Value) IsUndef() bool           { return C.LLVMIsUndef(v.C) != 0 }
 func ConstPointerNull(t Type) (v Value) { v.C = C.LLVMConstPointerNull(t.C); return }
 
 // Operations on metadata
-func (c Context) MDString(str string) (v Value) {
+func (c Context) MDString(str string) (md Metadata) {
 	cstr := C.CString(str)
 	defer C.free(unsafe.Pointer(cstr))
-	v.C = C.LLVMMDStringInContext(c.C, cstr, C.unsigned(len(str)))
+	md.C = C.LLVMMDString2(c.C, cstr, C.unsigned(len(str)))
 	return
 }
-func MDString(str string) (v Value) {
-	cstr := C.CString(str)
-	defer C.free(unsafe.Pointer(cstr))
-	v.C = C.LLVMMDString(cstr, C.unsigned(len(str)))
+func (c Context) MDNode(mds []Metadata) (md Metadata) {
+	ptr, nvals := llvmMetadataRefs(mds)
+	md.C = C.LLVMMDNode2(c.C, ptr, nvals)
 	return
 }
-func (c Context) MDNode(vals []Value) (v Value) {
-	ptr, nvals := llvmValueRefs(vals)
-	v.C = C.LLVMMDNodeInContext(c.C, ptr, nvals)
+func (c Context) TemporaryMDNode(mds []Metadata) (md Metadata) {
+	ptr, nvals := llvmMetadataRefs(mds)
+	md.C = C.LLVMTemporaryMDNode(c.C, ptr, nvals)
 	return
 }
-func MDNode(vals []Value) (v Value) {
-	ptr, nvals := llvmValueRefs(vals)
-	v.C = C.LLVMMDNode(ptr, nvals)
+func (v Value) ConstantAsMetadata() (md Metadata) {
+	md.C = C.LLVMConstantAsMetadata(v.C)
 	return
 }
 
@@ -1188,8 +1201,9 @@ func (b Builder) InsertWithName(instr Value, name string) {
 func (b Builder) Dispose() { C.LLVMDisposeBuilder(b.C) }
 
 // Metadata
-func (b Builder) SetCurrentDebugLocation(v Value) { C.LLVMSetCurrentDebugLocation(b.C, v.C) }
-func (b Builder) CurrentDebugLocation() (v Value) { v.C = C.LLVMGetCurrentDebugLocation(b.C); return }
+func (b Builder) SetCurrentDebugLocation(line, col uint, scope, inlinedAt Metadata) {
+	C.LLVMSetCurrentDebugLocation2(b.C, C.unsigned(line), C.unsigned(col), scope.C, inlinedAt.C)
+}
 func (b Builder) SetInstDebugLocation(v Value)    { C.LLVMSetInstDebugLocation(b.C, v.C) }
 func (b Builder) InsertDeclare(module Module, storage Value, md Value) Value {
 	f := module.NamedFunction("llvm.dbg.declare")
@@ -1822,3 +1836,11 @@ func (pm PassManager) FinalizeFunc() bool { return C.LLVMFinalizeFunctionPassMan
 // the module provider.
 // See llvm::PassManagerBase::~PassManagerBase.
 func (pm PassManager) Dispose() { C.LLVMDisposePassManager(pm.C) }
+
+//-------------------------------------------------------------------------
+// llvm.Metadata
+//-------------------------------------------------------------------------
+
+func (md Metadata) ReplaceAllUsesWith(new Metadata) {
+	C.LLVMMetadataReplaceAllUsesWith(md.C, new.C)
+}
diff --git a/bindings/go/llvm/linker.go b/bindings/go/llvm/linker.go
index 31e9ad2..64d794e 100644
--- a/bindings/go/llvm/linker.go
+++ b/bindings/go/llvm/linker.go
@@ -20,16 +20,9 @@ package llvm
 import "C"
 import "errors"
 
-type LinkerMode C.LLVMLinkerMode
-
-const (
-	LinkerDestroySource  = C.LLVMLinkerDestroySource
-	LinkerPreserveSource = C.LLVMLinkerPreserveSource
-)
-
-func LinkModules(Dest, Src Module, Mode LinkerMode) error {
+func LinkModules(Dest, Src Module) error {
 	var cmsg *C.char
-	failed := C.LLVMLinkModules(Dest.C, Src.C, C.LLVMLinkerMode(Mode), &cmsg)
+	failed := C.LLVMLinkModules(Dest.C, Src.C, 0, &cmsg)
 	if failed != 0 {
 		err := errors.New(C.GoString(cmsg))
 		C.LLVMDisposeMessage(cmsg)
diff --git a/bindings/go/llvm/transforms_instrumentation.go b/bindings/go/llvm/transforms_instrumentation.go
index 9b191b2..73e2732 100644
--- a/bindings/go/llvm/transforms_instrumentation.go
+++ b/bindings/go/llvm/transforms_instrumentation.go
@@ -36,8 +36,11 @@ func (pm PassManager) AddMemorySanitizerPass() {
 	C.LLVMAddMemorySanitizerPass(pm.C)
 }
 
-func (pm PassManager) AddDataFlowSanitizerPass(abilist string) {
-	cabilist := C.CString(abilist)
-	defer C.free(unsafe.Pointer(cabilist))
-	C.LLVMAddDataFlowSanitizerPass(pm.C, cabilist)
+func (pm PassManager) AddDataFlowSanitizerPass(abilist []string) {
+	abiliststrs := make([]*C.char, len(abilist))
+	for i, arg := range abilist {
+		abiliststrs[i] = C.CString(arg)
+		defer C.free(unsafe.Pointer(abiliststrs[i]))
+	}
+	C.LLVMAddDataFlowSanitizerPass(pm.C, C.int(len(abilist)), &abiliststrs[0])
 }
diff --git a/bindings/ocaml/CMakeLists.txt b/bindings/ocaml/CMakeLists.txt
new file mode 100644
index 0000000..2058368
--- /dev/null
+++ b/bindings/ocaml/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_subdirectory(llvm)
+add_subdirectory(all_backends)
+add_subdirectory(analysis)
+add_subdirectory(backends)
+add_subdirectory(bitreader)
+add_subdirectory(bitwriter)
+add_subdirectory(irreader)
+add_subdirectory(linker)
+add_subdirectory(target)
+add_subdirectory(transforms)
+add_subdirectory(executionengine)
diff --git a/bindings/ocaml/Makefile.ocaml b/bindings/ocaml/Makefile.ocaml
index 5e00cf5..1f65a7b 100644
--- a/bindings/ocaml/Makefile.ocaml
+++ b/bindings/ocaml/Makefile.ocaml
@@ -32,6 +32,12 @@ endif
 
 include $(LEVEL)/Makefile.common
 
+# Used in out-of-tree builds of OCaml bindings only.
+ifdef SYSTEM_LLVM_CONFIG
+LLVM_CONFIG = $(SYSTEM_LLVM_CONFIG)
+LLVMLibsOptions += $(shell $(LLVM_CONFIG) --ldflags)
+endif
+
 # Intentionally ignore PROJ_prefix here. We want the ocaml stdlib. However, the
 # user can override this with OCAML_LIBDIR or configure --with-ocaml-libdir=.
 PROJ_libocamldir := $(DESTDIR)$(OCAML_LIBDIR)
@@ -65,6 +71,10 @@ OCAMLRPATH     := $(RPATH) -Wl,'$$ORIGIN/../../lib'
 endif
 endif
 
+# See http://caml.inria.fr/mantis/view.php?id=6642
+OCAMLORIGIN    := -ccopt -L'$$CAMLORIGIN/..' \
+                  -ccopt $(RPATH) -ccopt -Wl,'$$CAMLORIGIN/..'
+
 # Tools
 OCAMLCFLAGS += -I $(OcamlDir) $(addprefix -package ,$(FindlibPackages))
 
@@ -92,16 +102,18 @@ Compile.CMX  := $(strip $(OCAMLFIND) opt -c $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG) -o)
 ifdef OCAMLSTUBS
 # -dllib is engaged with ocamlc builds, $(OCAMLSTUBFLAGS) in ocamlc -custom builds.
 Archive.CMA  := $(strip $(OCAMLFIND) c -a -dllib -l$(LIBRARYNAME) $(OCAMLSTUBFLAGS) \
-																			 $(OCAMLDEBUGFLAG) -o)
+																			 $(OCAMLDEBUGFLAG) $(OCAMLORIGIN) -o)
 else
 Archive.CMA  := $(strip $(OCAMLFIND) c -a -custom $(OCAMLAFLAGS) $(OCAMLDEBUGFLAG) \
-                                       -o)
+                                       $(OCAMLORIGIN) -o)
 endif
 
 ifdef OCAMLSTUBS
-Archive.CMXA := $(strip $(OCAMLFIND) opt -a $(OCAMLSTUBFLAGS) $(OCAMLDEBUGFLAG) -o)
+Archive.CMXA := $(strip $(OCAMLFIND) opt -a $(OCAMLSTUBFLAGS) $(OCAMLDEBUGFLAG) \
+                                         $(OCAMLORIGIN) -o)
 else
-Archive.CMXA := $(strip $(OCAMLFIND) opt -a $(OCAMLAFLAGS) $(OCAMLDEBUGFLAG) -o)
+Archive.CMXA := $(strip $(OCAMLFIND) opt -a $(OCAMLAFLAGS) $(OCAMLDEBUGFLAG) \
+                                         $(OCAMLORIGIN) -o)
 endif
 
 # Source files
@@ -237,8 +249,8 @@ uninstall-local:: uninstall-shared
 
 $(SharedLib): $(ObjectsO) $(OcamlDir)/.dir
 	$(Echo) "Building $(BuildMode) $(notdir $@)"
-	$(Verb) $(Link) $(SharedLinkOptions) $(OCAMLRPATH) $(LLVMLibsOptions) \
-			-o $@ $(ObjectsO)
+	$(Verb) $(Link) $(SharedLinkOptions) $(OCAMLRPATH) -o $@ $(ObjectsO) \
+			$(LLVMLibsOptions)
 
 clean-shared::
 	-$(Verb) $(RM) -f $(SharedLib)
@@ -255,8 +267,9 @@ uninstall-shared::
 endif
 
 
-##===- Deposit dependent libraries adjacent to Ocaml libs -----------------===##
+##===- Deposit dependent libraries adjacent to OCaml libs -----------------===##
 
+ifndef SYSTEM_LLVM_CONFIG
 all-local:: build-deplibs
 clean-local:: clean-deplibs
 install-local:: install-deplibs
@@ -281,7 +294,7 @@ install-deplibs:
 
 uninstall-deplibs:
 	$(Verb) $(RM) -f $(DestLibs)
-
+endif
 
 ##===- Build ocaml interfaces (.mli's -> .cmi's) --------------------------===##
 
diff --git a/bindings/ocaml/all_backends/CMakeLists.txt b/bindings/ocaml/all_backends/CMakeLists.txt
new file mode 100644
index 0000000..716a49c
--- /dev/null
+++ b/bindings/ocaml/all_backends/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_all_backends
+  OCAML    llvm_all_backends
+  OCAMLDEP llvm
+  C        all_backends_ocaml
+  LLVM     ${LLVM_TARGETS_TO_BUILD})
diff --git a/bindings/ocaml/analysis/CMakeLists.txt b/bindings/ocaml/analysis/CMakeLists.txt
new file mode 100644
index 0000000..f8ca84d
--- /dev/null
+++ b/bindings/ocaml/analysis/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_analysis
+  OCAML    llvm_analysis
+  OCAMLDEP llvm
+  C        analysis_ocaml
+  LLVM     analysis)
diff --git a/bindings/ocaml/backends/CMakeLists.txt b/bindings/ocaml/backends/CMakeLists.txt
new file mode 100644
index 0000000..a980638
--- /dev/null
+++ b/bindings/ocaml/backends/CMakeLists.txt
@@ -0,0 +1,27 @@
+foreach(TARGET ${LLVM_TARGETS_TO_BUILD})
+  set(OCAML_LLVM_TARGET ${TARGET})
+
+  foreach( ext ml mli )
+    configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/llvm_backend.${ext}.in"
+        "${CMAKE_CURRENT_BINARY_DIR}/llvm_${TARGET}.${ext}")
+  endforeach()
+
+  configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/backend_ocaml.c"
+    "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_ocaml.c")
+
+  add_ocaml_library(llvm_${TARGET}
+    OCAML    llvm_${TARGET}
+    C        ${TARGET}_ocaml
+    CFLAGS   -DTARGET=${TARGET}
+    LLVM     ${TARGET}
+    NOCOPY)
+
+  configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/META.llvm_backend.in"
+    "${LLVM_LIBRARY_DIR}/ocaml/META.llvm_${TARGET}")
+
+  install(FILES "${LLVM_LIBRARY_DIR}/ocaml/META.llvm_${TARGET}"
+          DESTINATION lib/ocaml)
+endforeach()
diff --git a/bindings/ocaml/bitreader/CMakeLists.txt b/bindings/ocaml/bitreader/CMakeLists.txt
new file mode 100644
index 0000000..8d16103
--- /dev/null
+++ b/bindings/ocaml/bitreader/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_bitreader
+  OCAML    llvm_bitreader
+  OCAMLDEP llvm
+  C        bitreader_ocaml
+  LLVM     bitreader)
diff --git a/bindings/ocaml/bitwriter/CMakeLists.txt b/bindings/ocaml/bitwriter/CMakeLists.txt
new file mode 100644
index 0000000..5a14498
--- /dev/null
+++ b/bindings/ocaml/bitwriter/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_bitwriter
+  OCAML    llvm_bitwriter
+  OCAMLDEP llvm
+  C        bitwriter_ocaml
+  LLVM     bitwriter)
diff --git a/bindings/ocaml/executionengine/CMakeLists.txt b/bindings/ocaml/executionengine/CMakeLists.txt
new file mode 100644
index 0000000..ae9af08
--- /dev/null
+++ b/bindings/ocaml/executionengine/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_ocaml_library(llvm_executionengine
+  OCAML    llvm_executionengine
+  OCAMLDEP llvm llvm_target
+  C        executionengine_ocaml
+  LLVM     executionengine mcjit native
+  PKG      ctypes)
diff --git a/bindings/ocaml/executionengine/executionengine_ocaml.c b/bindings/ocaml/executionengine/executionengine_ocaml.c
index 0557efc..b799250 100644
--- a/bindings/ocaml/executionengine/executionengine_ocaml.c
+++ b/bindings/ocaml/executionengine/executionengine_ocaml.c
@@ -115,8 +115,12 @@ CAMLprim value llvm_ee_add_global_mapping(LLVMValueRef Global, value Ptr,
   return Val_unit;
 }
 
-/* Llvm.llvalue -> llexecutionengine -> int64 */
-CAMLprim value llvm_ee_get_pointer_to_global(LLVMValueRef Global,
-                                             LLVMExecutionEngineRef EE) {
-  return caml_copy_int64((int64_t) LLVMGetPointerToGlobal(EE, Global));
+CAMLprim value llvm_ee_get_global_value_address(value Name,
+						LLVMExecutionEngineRef EE) {
+  return caml_copy_int64((int64_t) LLVMGetGlobalValueAddress(EE, String_val(Name)));
+}
+
+CAMLprim value llvm_ee_get_function_address(value Name,
+					    LLVMExecutionEngineRef EE) {
+  return caml_copy_int64((int64_t) LLVMGetFunctionAddress(EE, String_val(Name)));
 }
diff --git a/bindings/ocaml/executionengine/llvm_executionengine.ml b/bindings/ocaml/executionengine/llvm_executionengine.ml
index c0ff330..34031be 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.ml
+++ b/bindings/ocaml/executionengine/llvm_executionengine.ml
@@ -45,15 +45,27 @@ external data_layout : llexecutionengine -> Llvm_target.DataLayout.t
   = "llvm_ee_get_data_layout"
 external add_global_mapping_ : Llvm.llvalue -> int64 -> llexecutionengine -> unit
   = "llvm_ee_add_global_mapping"
-external get_pointer_to_global_ : Llvm.llvalue -> llexecutionengine -> int64
-  = "llvm_ee_get_pointer_to_global"
+external get_global_value_address_ : string -> llexecutionengine -> int64
+  = "llvm_ee_get_global_value_address"
+external get_function_address_ : string -> llexecutionengine -> int64
+  = "llvm_ee_get_function_address"
 
 let add_global_mapping llval ptr ee =
   add_global_mapping_ llval (Ctypes.raw_address_of_ptr (Ctypes.to_voidp ptr)) ee
 
-let get_pointer_to_global llval typ ee =
-  Ctypes.coerce (let open Ctypes in ptr void) typ
-                (Ctypes.ptr_of_raw_address (get_pointer_to_global_ llval ee))
+let get_global_value_address name typ ee =
+  let vptr = get_global_value_address_ name ee in
+  if Int64.to_int vptr <> 0 then
+    let open Ctypes in !@ (coerce (ptr void) (ptr typ) (ptr_of_raw_address vptr))
+  else
+    raise (Error ("Value " ^ name ^ " not found"))
+
+let get_function_address name typ ee =
+  let fptr = get_function_address_ name ee in
+  if Int64.to_int fptr <> 0 then
+    let open Ctypes in coerce (ptr void) typ (ptr_of_raw_address fptr)
+  else
+    raise (Error ("Function " ^ name ^ " not found"))
 
 (* The following are not bound. Patches are welcome.
 target_machine : llexecutionengine -> Llvm_target.TargetMachine.t
diff --git a/bindings/ocaml/executionengine/llvm_executionengine.mli b/bindings/ocaml/executionengine/llvm_executionengine.mli
index b07151d..bc076be 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.mli
+++ b/bindings/ocaml/executionengine/llvm_executionengine.mli
@@ -76,9 +76,18 @@ val data_layout : llexecutionengine -> Llvm_target.DataLayout.t
     All uses of [gv] in the compiled code will refer to [ptr]. *)
 val add_global_mapping : Llvm.llvalue -> 'a Ctypes.ptr -> llexecutionengine -> unit
 
-(** [get_pointer_to_global gv typ ee] returns the value of the global
-    variable [gv] in the execution engine [ee] as type [typ], which may
-    be a pointer type (e.g. [int ptr typ]) for global variables or
-    a function (e.g. [(int -> int) typ]) type for functions, and which
-    will be live as long as [gv] and [ee] are. *)
-val get_pointer_to_global : Llvm.llvalue -> 'a Ctypes.typ -> llexecutionengine -> 'a
+(** [get_global_value_address id typ ee] returns a pointer to the
+    identifier [id] as type [typ], which will be a pointer type for a
+    value, and which will be live as long as [id] and [ee]
+    are. Caution: this function finalizes, i.e. forces code
+    generation, all loaded modules.  Further modifications to the
+    modules will not have any effect. *)
+val get_global_value_address : string -> 'a Ctypes.typ -> llexecutionengine -> 'a
+
+(** [get_function_address fn typ ee] returns a pointer to the function
+    [fn] as type [typ], which will be a pointer type for a function
+    (e.g. [(int -> int) typ]), and which will be live as long as [fn]
+    and [ee] are. Caution: this function finalizes, i.e. forces code
+    generation, all loaded modules.  Further modifications to the
+    modules will not have any effect. *)
+val get_function_address : string -> 'a Ctypes.typ -> llexecutionengine -> 'a
diff --git a/bindings/ocaml/irreader/CMakeLists.txt b/bindings/ocaml/irreader/CMakeLists.txt
new file mode 100644
index 0000000..87d269b
--- /dev/null
+++ b/bindings/ocaml/irreader/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_irreader
+  OCAML    llvm_irreader
+  OCAMLDEP llvm
+  C        irreader_ocaml
+  LLVM     irreader)
diff --git a/bindings/ocaml/linker/CMakeLists.txt b/bindings/ocaml/linker/CMakeLists.txt
new file mode 100644
index 0000000..b6bc8ac
--- /dev/null
+++ b/bindings/ocaml/linker/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_linker
+  OCAML    llvm_linker
+  OCAMLDEP llvm
+  C        linker_ocaml
+  LLVM     linker)
diff --git a/bindings/ocaml/linker/linker_ocaml.c b/bindings/ocaml/linker/linker_ocaml.c
index ed37777..3b8512a 100644
--- a/bindings/ocaml/linker/linker_ocaml.c
+++ b/bindings/ocaml/linker/linker_ocaml.c
@@ -23,11 +23,11 @@
 
 void llvm_raise(value Prototype, char *Message);
 
-/* llmodule -> llmodule -> Mode.t -> unit */
-CAMLprim value llvm_link_modules(LLVMModuleRef Dst, LLVMModuleRef Src, value Mode) {
+/* llmodule -> llmodule -> unit */
+CAMLprim value llvm_link_modules(LLVMModuleRef Dst, LLVMModuleRef Src) {
   char* Message;
 
-  if (LLVMLinkModules(Dst, Src, Int_val(Mode), &Message))
+  if (LLVMLinkModules(Dst, Src, 0, &Message))
     llvm_raise(*caml_named_value("Llvm_linker.Error"), Message);
 
   return Val_unit;
diff --git a/bindings/ocaml/linker/llvm_linker.ml b/bindings/ocaml/linker/llvm_linker.ml
index 5854d70..3044abd 100644
--- a/bindings/ocaml/linker/llvm_linker.ml
+++ b/bindings/ocaml/linker/llvm_linker.ml
@@ -11,11 +11,5 @@ exception Error of string
 
 let () = Callback.register_exception "Llvm_linker.Error" (Error "")
 
-module Mode = struct
-  type t =
-  | DestroySource
-  | PreserveSource
-end
-
-external link_modules : Llvm.llmodule -> Llvm.llmodule -> Mode.t -> unit
+external link_modules : Llvm.llmodule -> Llvm.llmodule -> unit
                       = "llvm_link_modules"
diff --git a/bindings/ocaml/linker/llvm_linker.mli b/bindings/ocaml/linker/llvm_linker.mli
index 4def7a8..06c3b92 100644
--- a/bindings/ocaml/linker/llvm_linker.mli
+++ b/bindings/ocaml/linker/llvm_linker.mli
@@ -14,13 +14,6 @@
 
 exception Error of string
 
-(** Linking mode. *)
-module Mode : sig
-  type t =
-  | DestroySource
-  | PreserveSource
-end
-
 (** [link_modules dst src mode] links [src] into [dst], raising [Error]
     if the linking fails. *)
-val link_modules : Llvm.llmodule -> Llvm.llmodule -> Mode.t -> unit
-\ No newline at end of file
+val link_modules : Llvm.llmodule -> Llvm.llmodule -> unit
+\ No newline at end of file
diff --git a/bindings/ocaml/llvm/CMakeLists.txt b/bindings/ocaml/llvm/CMakeLists.txt
new file mode 100644
index 0000000..4956fa4
--- /dev/null
+++ b/bindings/ocaml/llvm/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_ocaml_library(llvm
+  OCAML llvm
+  C     llvm_ocaml
+  LLVM  core support)
+
+configure_file(
+  "${CMAKE_CURRENT_SOURCE_DIR}/META.llvm.in"
+  "${LLVM_LIBRARY_DIR}/ocaml/META.llvm")
+
+install(FILES "${LLVM_LIBRARY_DIR}/ocaml/META.llvm"
+        DESTINATION lib/ocaml)
diff --git a/bindings/ocaml/llvm/META.llvm.in b/bindings/ocaml/llvm/META.llvm.in
index f9808c7..92896e3 100644
--- a/bindings/ocaml/llvm/META.llvm.in
+++ b/bindings/ocaml/llvm/META.llvm.in
@@ -61,6 +61,14 @@ package "scalar_opts" (
     archive(native) = "llvm_scalar_opts.cmxa"
 )
 
+package "transform_utils" (
+    requires = "llvm"
+    version = "@PACKAGE_VERSION@"
+    description = "Transform utilities for LLVM"
+    archive(byte) = "llvm_transform_utils.cma"
+    archive(native) = "llvm_transform_utils.cmxa"
+)
+
 package "vectorize" (
     requires = "llvm"
     version = "@PACKAGE_VERSION@"
diff --git a/bindings/ocaml/llvm/Makefile b/bindings/ocaml/llvm/Makefile
index fb682c7..c0785a1 100644
--- a/bindings/ocaml/llvm/Makefile
+++ b/bindings/ocaml/llvm/Makefile
@@ -13,7 +13,7 @@
 
 LEVEL := ../../..
 LIBRARYNAME := llvm
-UsedComponents := core transformutils
+UsedComponents := core
 UsedOcamlLibs := llvm
 ExtraLibs := -lstdc++
 
diff --git a/bindings/ocaml/llvm/llvm.ml b/bindings/ocaml/llvm/llvm.ml
index 0df4d40..9a3cb1f 100644
--- a/bindings/ocaml/llvm/llvm.ml
+++ b/bindings/ocaml/llvm/llvm.ml
@@ -313,7 +313,6 @@ external mdkind_id : llcontext -> string -> llmdkind = "llvm_mdkind_id"
 (*===-- Modules -----------------------------------------------------------===*)
 external create_module : llcontext -> string -> llmodule = "llvm_create_module"
 external dispose_module : llmodule -> unit = "llvm_dispose_module"
-external clone_module : llmodule -> llmodule = "LLVMCloneModule"
 external target_triple: llmodule -> string
                       = "llvm_target_triple"
 external set_target_triple: string -> llmodule -> unit
@@ -460,6 +459,7 @@ external clear_metadata : llvalue -> llmdkind -> unit = "llvm_clear_metadata"
 (*--... Operations on metadata .......,.....................................--*)
 external mdstring : llcontext -> string -> llvalue = "llvm_mdstring"
 external mdnode : llcontext -> llvalue array -> llvalue = "llvm_mdnode"
+external mdnull : llcontext -> llvalue = "llvm_mdnull"
 external get_mdstring : llvalue -> string option = "llvm_get_mdstring"
 external get_named_metadata : llmodule -> string -> llvalue array
                             = "llvm_get_namedmd"
@@ -1300,6 +1300,8 @@ external build_fcmp : Fcmp.t -> llvalue -> llvalue -> string ->
 (*--... Miscellaneous instructions .........................................--*)
 external build_phi : (llvalue * llbasicblock) list -> string -> llbuilder ->
                      llvalue = "llvm_build_phi"
+external build_empty_phi : lltype -> string -> llbuilder -> llvalue
+                         = "llvm_build_empty_phi"
 external build_call : llvalue -> llvalue array -> string -> llbuilder -> llvalue
                     = "llvm_build_call"
 external build_select : llvalue -> llvalue -> llvalue -> string -> llbuilder ->
diff --git a/bindings/ocaml/llvm/llvm.mli b/bindings/ocaml/llvm/llvm.mli
index e5e90c3..dcda027 100644
--- a/bindings/ocaml/llvm/llvm.mli
+++ b/bindings/ocaml/llvm/llvm.mli
@@ -431,9 +431,6 @@ val create_module : llcontext -> string -> llmodule
     [llvm::Module::~Module]. *)
 val dispose_module : llmodule -> unit
 
-(** [clone_module m] returns an exact copy of module [m]. *)
-val clone_module : llmodule -> llmodule
-
 (** [target_triple m] is the target specifier for the module [m], something like
     [i686-apple-darwin8]. See the method [llvm::Module::getTargetTriple]. *)
 val target_triple: llmodule -> string
@@ -822,6 +819,9 @@ val mdstring : llcontext -> string -> llvalue
     See the method [llvm::MDNode::get]. *)
 val mdnode : llcontext -> llvalue array -> llvalue
 
+(** [mdnull c ] returns a null MDNode in context [c].  *)
+val mdnull : llcontext -> llvalue
+
 (** [get_mdstring v] returns the MDString.
     See the method [llvm::MDString::getString] *)
 val get_mdstring : llvalue -> string option
@@ -2422,6 +2422,12 @@ val build_fcmp : Fcmp.t -> llvalue -> llvalue -> string ->
 val build_phi : (llvalue * llbasicblock) list -> string -> llbuilder ->
                      llvalue
 
+(** [build_empty_phi ty name b] creates a
+    [%name = phi %ty] instruction at the position specified by
+    the instruction builder [b]. [ty] is the type of the instruction.
+    See the method [llvm::LLVMBuilder::CreatePHI]. *)
+val build_empty_phi : lltype -> string -> llbuilder -> llvalue
+
 (** [build_call fn args name b] creates a
     [%name = call %fn(args...)]
     instruction at the position specified by the instruction builder [b].
diff --git a/bindings/ocaml/llvm/llvm_ocaml.c b/bindings/ocaml/llvm/llvm_ocaml.c
index 63c235d..3889f92 100644
--- a/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/bindings/ocaml/llvm/llvm_ocaml.c
@@ -666,6 +666,11 @@ CAMLprim LLVMValueRef llvm_mdnode(LLVMContextRef C, value ElementVals) {
                              Wosize_val(ElementVals));
 }
 
+/* llcontext -> llvalue */
+CAMLprim LLVMValueRef llvm_mdnull(LLVMContextRef C) {
+  return NULL;
+}
+
 /* llvalue -> string option */
 CAMLprim value llvm_get_mdstring(LLVMValueRef V) {
   CAMLparam0();
@@ -2186,6 +2191,15 @@ CAMLprim LLVMValueRef llvm_build_phi(value Incoming, value Name, value B) {
   return PhiNode;
 }
 
+/* lltype -> string -> llbuilder -> value */
+CAMLprim LLVMValueRef llvm_build_empty_phi(LLVMTypeRef Type, value Name, value B) {
+  LLVMValueRef PhiNode;
+
+  return LLVMBuildPhi(Builder_val(B), Type, String_val(Name));
+
+  return PhiNode;
+}
+
 /* llvalue -> llvalue array -> string -> llbuilder -> llvalue */
 CAMLprim LLVMValueRef llvm_build_call(LLVMValueRef Fn, value Params,
                                       value Name, value B) {
diff --git a/bindings/ocaml/target/CMakeLists.txt b/bindings/ocaml/target/CMakeLists.txt
new file mode 100644
index 0000000..adee0fc
--- /dev/null
+++ b/bindings/ocaml/target/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_target
+  OCAML    llvm_target
+  OCAMLDEP llvm
+  C        target_ocaml
+  LLVM     target)
diff --git a/bindings/ocaml/transforms/CMakeLists.txt b/bindings/ocaml/transforms/CMakeLists.txt
new file mode 100644
index 0000000..beb8694
--- /dev/null
+++ b/bindings/ocaml/transforms/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_subdirectory(ipo)
+add_subdirectory(passmgr_builder)
+add_subdirectory(scalar_opts)
+add_subdirectory(utils)
+add_subdirectory(vectorize)
diff --git a/bindings/ocaml/transforms/Makefile b/bindings/ocaml/transforms/Makefile
index f3637a6..15bffb4 100644
--- a/bindings/ocaml/transforms/Makefile
+++ b/bindings/ocaml/transforms/Makefile
@@ -8,7 +8,7 @@
 ##===----------------------------------------------------------------------===##
 
 LEVEL := ../../..
-DIRS = scalar_opts ipo vectorize passmgr_builder
+DIRS = ipo passmgr_builder scalar_opts utils vectorize
 
 ocamldoc:
 	$(Verb) for i in $(DIRS) ; do \
diff --git a/bindings/ocaml/transforms/ipo/CMakeLists.txt b/bindings/ocaml/transforms/ipo/CMakeLists.txt
new file mode 100644
index 0000000..4b8784f
--- /dev/null
+++ b/bindings/ocaml/transforms/ipo/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_ipo
+  OCAML    llvm_ipo
+  OCAMLDEP llvm
+  C        ipo_ocaml
+  LLVM     ipo)
diff --git a/bindings/ocaml/transforms/passmgr_builder/CMakeLists.txt b/bindings/ocaml/transforms/passmgr_builder/CMakeLists.txt
new file mode 100644
index 0000000..b012863
--- /dev/null
+++ b/bindings/ocaml/transforms/passmgr_builder/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_passmgr_builder
+  OCAML    llvm_passmgr_builder
+  OCAMLDEP llvm
+  C        passmgr_builder_ocaml
+  LLVM     ipo)
diff --git a/bindings/ocaml/transforms/scalar_opts/CMakeLists.txt b/bindings/ocaml/transforms/scalar_opts/CMakeLists.txt
new file mode 100644
index 0000000..98c7c68
--- /dev/null
+++ b/bindings/ocaml/transforms/scalar_opts/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_scalar_opts
+  OCAML    llvm_scalar_opts
+  OCAMLDEP llvm
+  C        scalar_opts_ocaml
+  LLVM     scalaropts)
diff --git a/bindings/ocaml/transforms/utils/CMakeLists.txt b/bindings/ocaml/transforms/utils/CMakeLists.txt
new file mode 100644
index 0000000..37f3eb7
--- /dev/null
+++ b/bindings/ocaml/transforms/utils/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_transform_utils
+  OCAML    llvm_transform_utils
+  OCAMLDEP llvm
+  C        transform_utils_ocaml
+  LLVM     transformutils)
diff --git a/bindings/ocaml/transforms/utils/Makefile b/bindings/ocaml/transforms/utils/Makefile
new file mode 100644
index 0000000..76a6f0b
--- /dev/null
+++ b/bindings/ocaml/transforms/utils/Makefile
@@ -0,0 +1,19 @@
+##===- bindings/ocaml/transforms/utils/Makefile ------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# This is the makefile for the Objective Caml Llvm_vectorize interface.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../../../..
+LIBRARYNAME := llvm_transform_utils
+UsedComponents := transformutils
+UsedOcamlInterfaces := llvm
+
+include ../../Makefile.ocaml
diff --git a/bindings/ocaml/transforms/utils/llvm_transform_utils.ml b/bindings/ocaml/transforms/utils/llvm_transform_utils.ml
new file mode 100644
index 0000000..135efe2
--- /dev/null
+++ b/bindings/ocaml/transforms/utils/llvm_transform_utils.ml
@@ -0,0 +1,10 @@
+(*===-- llvm_transform_utils.ml - LLVM OCaml Interface --------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+external clone_module : Llvm.llmodule -> Llvm.llmodule = "llvm_clone_module"
diff --git a/bindings/ocaml/transforms/utils/llvm_transform_utils.mli b/bindings/ocaml/transforms/utils/llvm_transform_utils.mli
new file mode 100644
index 0000000..1c2b07c
--- /dev/null
+++ b/bindings/ocaml/transforms/utils/llvm_transform_utils.mli
@@ -0,0 +1,17 @@
+(*===-- llvm_transform_utils.mli - LLVM OCaml Interface -------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+(** Transform Utilities.
+
+    This interface provides an OCaml API for LLVM transform utilities, the
+    classes in the [LLVMTransformUtils] library. *)
+
+(** [clone_module m] returns an exact copy of module [m].
+    See the [llvm::CloneModule] function. *)
+external clone_module : Llvm.llmodule -> Llvm.llmodule = "llvm_clone_module"
diff --git a/bindings/ocaml/transforms/utils/transform_utils_ocaml.c b/bindings/ocaml/transforms/utils/transform_utils_ocaml.c
new file mode 100644
index 0000000..75b2052
--- /dev/null
+++ b/bindings/ocaml/transforms/utils/transform_utils_ocaml.c
@@ -0,0 +1,31 @@
+/*===-- vectorize_ocaml.c - LLVM OCaml Glue ---------------------*- C++ -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
+|* are by and large transparent wrappers to the corresponding C functions.    *|
+|*                                                                            *|
+|* Note that these functions intentionally take liberties with the CAMLparamX *|
+|* macros, since most of the parameters are not GC heap objects.              *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c/Core.h"
+#include "caml/mlvalues.h"
+#include "caml/misc.h"
+
+/*
+ * Do not move directly into external. This function is here to pull in
+ * -lLLVMTransformUtils, which would otherwise be not linked on static builds,
+ * as ld can't see the reference from OCaml code.
+ */
+
+/* llmodule -> llmodule */
+CAMLprim LLVMModuleRef llvm_clone_module(LLVMModuleRef M) {
+  return LLVMCloneModule(M);
+}
diff --git a/bindings/ocaml/transforms/vectorize/CMakeLists.txt b/bindings/ocaml/transforms/vectorize/CMakeLists.txt
new file mode 100644
index 0000000..af0ffce
--- /dev/null
+++ b/bindings/ocaml/transforms/vectorize/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_ocaml_library(llvm_vectorize
+  OCAML    llvm_vectorize
+  OCAMLDEP llvm
+  C        vectorize_ocaml
+  LLVM     vectorize)
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 5204f6c..d646498 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -79,6 +79,7 @@ check_symbol_exists(FE_INEXACT "fenv.h" HAVE_DECL_FE_INEXACT)
 
 check_include_file(mach/mach.h HAVE_MACH_MACH_H)
 check_include_file(mach-o/dyld.h HAVE_MACH_O_DYLD_H)
+check_include_file(histedit.h HAVE_HISTEDIT_H)
 
 # size_t must be defined before including cxxabi.h on FreeBSD 10.0.
 check_cxx_source_compiles("
@@ -110,7 +111,9 @@ if( NOT PURE_WINDOWS )
   else()
     set(HAVE_LIBZ 0)
   endif()
-  check_library_exists(edit el_init "" HAVE_LIBEDIT)
+  if (HAVE_HISTEDIT_H)
+    check_library_exists(edit el_init "" HAVE_LIBEDIT)
+  endif()
   if(LLVM_ENABLE_TERMINFO)
     set(HAVE_TERMINFO 0)
     foreach(library tinfo terminfo curses ncurses ncursesw)
@@ -160,6 +163,7 @@ if( HAVE_SYS_UIO_H )
   check_symbol_exists(writev sys/uio.h HAVE_WRITEV)
 endif()
 check_symbol_exists(nearbyintf math.h HAVE_NEARBYINTF)
+check_symbol_exists(mallctl malloc_np.h HAVE_MALLCTL)
 check_symbol_exists(mallinfo malloc.h HAVE_MALLINFO)
 check_symbol_exists(malloc_zone_statistics malloc/malloc.h
                     HAVE_MALLOC_ZONE_STATISTICS)
@@ -198,7 +202,9 @@ if( PURE_WINDOWS )
   check_function_exists(_alloca HAVE__ALLOCA)
   check_function_exists(__alloca HAVE___ALLOCA)
   check_function_exists(__chkstk HAVE___CHKSTK)
+  check_function_exists(__chkstk_ms HAVE___CHKSTK_MS)
   check_function_exists(___chkstk HAVE____CHKSTK)
+  check_function_exists(___chkstk_ms HAVE____CHKSTK_MS)
 
   check_function_exists(__ashldi3 HAVE___ASHLDI3)
   check_function_exists(__ashrdi3 HAVE___ASHRDI3)
@@ -424,6 +430,24 @@ if( MSVC )
   set(SHLIBEXT ".lib")
   set(stricmp "_stricmp")
   set(strdup "_strdup")
+
+  # See if the DIA SDK is available and usable.
+  set(MSVC_DIA_SDK_DIR "$ENV{VSINSTALLDIR}DIA SDK")
+
+  # Due to a bug in MSVC 2013's installation software, it is possible
+  # for MSVC 2013 to write the DIA SDK into the Visual Studio 2012
+  # install directory.  If this happens, the installation is corrupt
+  # and there's nothing we can do.  It happens with enough frequency
+  # though that we should handle it.  We do so by simply checking that
+  # the DIA SDK folder exists.  Should this happen you will need to
+  # uninstall VS 2012 and then re-install VS 2013.
+  if (IS_DIRECTORY ${MSVC_DIA_SDK_DIR})
+    set(HAVE_DIA_SDK 1)
+  else()
+    set(HAVE_DIA_SDK 0)
+  endif()
+else()
+  set(HAVE_DIA_SDK 0)
 endif( MSVC )
 
 if( PURE_WINDOWS )
@@ -516,3 +540,35 @@ else()
     endif()
   endif()
 endif()
+
+find_program(GOLD_EXECUTABLE NAMES ld.gold ld DOC "The gold linker")
+if(GOLD_EXECUTABLE)
+	set(LLVM_BINUTILS_INCDIR "" CACHE PATH
+		"PATH to binutils/include containing plugin-api.h for gold plugin.")
+endif()
+
+include(FindOCaml)
+include(AddOCaml)
+if(WIN32)
+  message(STATUS "OCaml bindings disabled.")
+else()
+  find_package(OCaml)
+  if( NOT OCAML_FOUND )
+    message(STATUS "OCaml bindings disabled.")
+  else()
+    if( OCAML_VERSION VERSION_LESS "4.00.0" )
+      message(STATUS "OCaml bindings disabled, need OCaml >=4.00.0.")
+    else()
+      find_ocamlfind_package(ctypes VERSION 0.3 OPTIONAL)
+      if( HAVE_OCAML_CTYPES )
+        message(STATUS "OCaml bindings enabled.")
+        find_ocamlfind_package(oUnit VERSION 2 OPTIONAL)
+        set(LLVM_BINDINGS "${LLVM_BINDINGS} ocaml")
+      else()
+        message(STATUS "OCaml bindings disabled, need ctypes >=0.3.")
+      endif()
+    endif()
+  endif()
+endif()
+
+string(REPLACE " " ";" LLVM_BINDINGS_LIST "${LLVM_BINDINGS}")
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index bc26f06..7f63644 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -10,7 +10,7 @@ function(llvm_update_compile_flags name)
 
   # LLVM_REQUIRES_EH is an internal flag that individual
   # targets can use to force EH
-  if(LLVM_REQUIRES_EH OR LLVM_ENABLE_EH)
+  if((LLVM_REQUIRES_EH OR LLVM_ENABLE_EH) AND NOT CLANG_CL)
     if(NOT (LLVM_REQUIRES_RTTI OR LLVM_ENABLE_RTTI))
       message(AUTHOR_WARNING "Exception handling requires RTTI. Enabling RTTI for ${name}")
       set(LLVM_REQUIRES_RTTI ON)
@@ -22,6 +22,10 @@ function(llvm_update_compile_flags name)
       list(APPEND LLVM_COMPILE_DEFINITIONS _HAS_EXCEPTIONS=0)
       list(APPEND LLVM_COMPILE_FLAGS "/EHs-c-")
     endif()
+    if (CLANG_CL)
+      # FIXME: Remove this once clang-cl supports SEH
+      list(APPEND LLVM_COMPILE_DEFINITIONS "GTEST_HAS_SEH=0")
+    endif()
   endif()
 
   # LLVM_REQUIRES_RTTI is an internal flag that individual
@@ -142,6 +146,17 @@ function(add_llvm_symbol_exports target_name export_file)
   set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE)
 endfunction(add_llvm_symbol_exports)
 
+if(NOT WIN32 AND NOT APPLE)
+  execute_process(
+    COMMAND ${CMAKE_C_COMPILER} -Wl,--version
+    OUTPUT_VARIABLE stdout
+    ERROR_QUIET
+    )
+  if("${stdout}" MATCHES "GNU gold")
+    set(LLVM_LINKER_IS_GOLD ON)
+  endif()
+endif()
+
 function(add_link_opts target_name)
   # Pass -O3 to the linker. This enabled different optimizations on different
   # linkers.
@@ -150,12 +165,20 @@ function(add_link_opts target_name)
                  LINK_FLAGS " -Wl,-O3")
   endif()
 
+  if(LLVM_LINKER_IS_GOLD)
+    # With gold gc-sections is always safe.
+    set_property(TARGET ${target_name} APPEND_STRING PROPERTY
+                 LINK_FLAGS " -Wl,--gc-sections")
+    # Note that there is a bug with -Wl,--icf=safe so it is not safe
+    # to enable. See https://sourceware.org/bugzilla/show_bug.cgi?id=17704.
+  endif()
+
   if(NOT LLVM_NO_DEAD_STRIP)
     if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
       # ld64's implementation of -dead_strip breaks tools that use plugins.
       set_property(TARGET ${target_name} APPEND_STRING PROPERTY
                    LINK_FLAGS " -Wl,-dead_strip")
-    elseif(NOT WIN32)
+    elseif(NOT WIN32 AND NOT LLVM_LINKER_IS_GOLD)
       # Object files are compiled with -ffunction-data-sections.
       # Versions of bfd ld < 2.23.1 have a bug in --gc-sections that breaks
       # tools that use plugins. Always pass --gc-sections once we require
@@ -315,11 +338,11 @@ function(llvm_add_library name)
         PREFIX ""
         )
     endif()
-    if (MSVC)
-      set_target_properties(${name}
-        PROPERTIES
-        IMPORT_SUFFIX ".imp")
-    endif ()
+    
+    set_target_properties(${name}
+      PROPERTIES
+      SOVERSION ${LLVM_VERSION_MAJOR}
+      VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX})
   endif()
 
   if(ARG_MODULE OR ARG_SHARED)
@@ -394,7 +417,16 @@ macro(add_llvm_library name)
         EXPORT LLVMExports
         RUNTIME DESTINATION bin
         LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-        ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+        ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+        COMPONENT ${name})
+
+      if (NOT CMAKE_CONFIGURATION_TYPES)
+        add_custom_target(install-${name}
+                          DEPENDS ${name}
+                          COMMAND "${CMAKE_COMMAND}"
+                                  -DCMAKE_INSTALL_COMPONENT=${name}
+                                  -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
+      endif()
     endif()
     set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS ${name})
   endif()
@@ -475,7 +507,16 @@ macro(add_llvm_tool name)
     if( LLVM_BUILD_TOOLS )
       install(TARGETS ${name}
               EXPORT LLVMExports
-              RUNTIME DESTINATION bin)
+              RUNTIME DESTINATION bin
+              COMPONENT ${name})
+
+      if (NOT CMAKE_CONFIGURATION_TYPES)
+        add_custom_target(install-${name}
+                          DEPENDS ${name}
+                          COMMAND "${CMAKE_COMMAND}"
+                                  -DCMAKE_INSTALL_COMPONENT=${name}
+                                  -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
+      endif()
     endif()
   endif()
   if( LLVM_BUILD_TOOLS )
@@ -570,12 +611,6 @@ function(add_unittest test_suite test_name)
     set(EXCLUDE_FROM_ALL ON)
   endif()
 
-  # Visual Studio 2012 only supports up to 8 template parameters in
-  # std::tr1::tuple by default, but gtest requires 10
-  if (MSVC AND MSVC_VERSION EQUAL 1700)
-    list(APPEND LLVM_COMPILE_DEFINITIONS _VARIADIC_MAX=10)
-  endif ()
-
   include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include)
   if (NOT LLVM_ENABLE_THREADS)
     list(APPEND LLVM_COMPILE_DEFINITIONS GTEST_HAS_PTHREAD=0)
@@ -603,6 +638,36 @@ function(add_unittest test_suite test_name)
   endif ()
 endfunction()
 
+function(llvm_add_go_executable binary pkgpath)
+  cmake_parse_arguments(ARG "ALL" "" "DEPENDS;GOFLAGS" ${ARGN})
+
+  if(LLVM_BINDINGS MATCHES "go")
+    # FIXME: This should depend only on the libraries Go needs.
+    get_property(llvmlibs GLOBAL PROPERTY LLVM_LIBS)
+    set(binpath ${CMAKE_BINARY_DIR}/bin/${binary}${CMAKE_EXECUTABLE_SUFFIX})
+    set(cc "${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ARG1}")
+    set(cxx "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1}")
+    set(cppflags "")
+    get_property(include_dirs DIRECTORY PROPERTY INCLUDE_DIRECTORIES)
+    foreach(d ${include_dirs})
+      set(cppflags "${cppflags} -I${d}")
+    endforeach(d)
+    set(ldflags "${CMAKE_EXE_LINKER_FLAGS}")
+    add_custom_command(OUTPUT ${binpath}
+      COMMAND ${CMAKE_BINARY_DIR}/bin/llvm-go "cc=${cc}" "cxx=${cxx}" "cppflags=${cppflags}" "ldflags=${ldflags}"
+              ${ARG_GOFLAGS} build -o ${binpath} ${pkgpath}
+      DEPENDS llvm-config ${CMAKE_BINARY_DIR}/bin/llvm-go${CMAKE_EXECUTABLE_SUFFIX}
+              ${llvmlibs} ${ARG_DEPENDS}
+      COMMENT "Building Go executable ${binary}"
+      VERBATIM)
+    if (ARG_ALL)
+      add_custom_target(${binary} ALL DEPENDS ${binpath})
+    else()
+      add_custom_target(${binary} DEPENDS ${binpath})
+    endif()
+  endif()
+endfunction()
+
 # This function provides an automatic way to 'configure'-like generate a file
 # based on a set of common and custom variables, specifically targeting the
 # variables needed for the 'lit.site.cfg' files. This function bundles the
@@ -675,18 +740,20 @@ function(add_lit_target target comment)
   foreach(param ${ARG_PARAMS})
     list(APPEND LIT_COMMAND --param ${param})
   endforeach()
-  if( ARG_DEPENDS )
+  if (ARG_DEFAULT_ARGS)
     add_custom_target(${target}
       COMMAND ${LIT_COMMAND} ${ARG_DEFAULT_ARGS}
       COMMENT "${comment}"
       ${cmake_3_2_USES_TERMINAL}
       )
-    add_dependencies(${target} ${ARG_DEPENDS})
   else()
     add_custom_target(${target}
       COMMAND ${CMAKE_COMMAND} -E echo "${target} does nothing, no tools built.")
     message(STATUS "${target} does nothing.")
   endif()
+  if (ARG_DEPENDS)
+    add_dependencies(${target} ${ARG_DEPENDS})
+  endif()
 
   # Tests should be excluded from "Build Solution".
   set_target_properties(${target} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD ON)
diff --git a/cmake/modules/AddOCaml.cmake b/cmake/modules/AddOCaml.cmake
new file mode 100644
index 0000000..c58ac9c
--- /dev/null
+++ b/cmake/modules/AddOCaml.cmake
@@ -0,0 +1,201 @@
+# CMake build rules for the OCaml language.
+# Assumes FindOCaml is used.
+# http://ocaml.org/
+#
+# Example usage:
+#
+# add_ocaml_library(pkg_a OCAML mod_a OCAMLDEP pkg_b C mod_a_stubs PKG ctypes LLVM core)
+#
+# Unnamed parameters:
+#
+#   * Library name.
+#
+# Named parameters:
+#
+# OCAML     OCaml module names. Imply presence of a corresponding .ml and .mli files.
+# OCAMLDEP  Names of libraries this library depends on.
+# C         C stub sources. Imply presence of a corresponding .c file.
+# CFLAGS    Additional arguments passed when compiling C stubs.
+# PKG       Names of ocamlfind packages this library depends on.
+# LLVM      Names of LLVM libraries this library depends on.
+# NOCOPY    Do not automatically copy sources (.c, .ml, .mli) from the source directory,
+#           e.g. if they are generated.
+#
+
+function(add_ocaml_library name)
+  CMAKE_PARSE_ARGUMENTS(ARG "NOCOPY" "" "OCAML;OCAMLDEP;C;CFLAGS;PKG;LLVM" ${ARGN})
+
+  set(src ${CMAKE_CURRENT_SOURCE_DIR})
+  set(bin ${CMAKE_CURRENT_BINARY_DIR})
+
+  set(ocaml_pkgs)
+  foreach( ocaml_pkg ${ARG_PKG} )
+    list(APPEND ocaml_pkgs "-package" "${ocaml_pkg}")
+  endforeach()
+
+  set(sources)
+
+  set(ocaml_inputs)
+
+  set(ocaml_outputs "${bin}/${name}.cma")
+  if( ARG_C )
+    list(APPEND ocaml_outputs
+         "${bin}/lib${name}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+    if ( BUILD_SHARED_LIBS )
+      list(APPEND ocaml_outputs
+           "${bin}/dll${name}${CMAKE_SHARED_LIBRARY_SUFFIX}")
+    endif()
+  endif()
+  if( HAVE_OCAMLOPT )
+    list(APPEND ocaml_outputs
+         "${bin}/${name}.cmxa"
+         "${bin}/${name}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+  endif()
+
+  set(ocaml_flags "-lstdc++" "-ldopt" "-L${LLVM_LIBRARY_DIR}"
+                  "-ccopt" "-L\\$CAMLORIGIN/.."
+                  "-ccopt" "-Wl,-rpath,\\$CAMLORIGIN/.."
+                  ${ocaml_pkgs})
+
+  foreach( ocaml_dep ${ARG_OCAMLDEP} )
+    get_target_property(dep_ocaml_flags "ocaml_${ocaml_dep}" OCAML_FLAGS)
+    list(APPEND ocaml_flags ${dep_ocaml_flags})
+  endforeach()
+
+  if( NOT BUILD_SHARED_LIBS )
+    list(APPEND ocaml_flags "-custom")
+  endif()
+
+  explicit_map_components_to_libraries(llvm_libs ${ARG_LLVM})
+  foreach( llvm_lib ${llvm_libs} )
+    list(APPEND ocaml_flags "-l${llvm_lib}" )
+  endforeach()
+
+  get_property(system_libs TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS)
+  foreach(system_lib ${system_libs})
+    list(APPEND ocaml_flags "-l${system_lib}" )
+  endforeach()
+
+  string(REPLACE ";" " " ARG_CFLAGS "${ARG_CFLAGS}")
+  set(c_flags "${ARG_CFLAGS} ${LLVM_DEFINITIONS}")
+  foreach( include_dir ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR} )
+    set(c_flags "${c_flags} -I${include_dir}")
+  endforeach()
+
+  foreach( ocaml_file ${ARG_OCAML} )
+    list(APPEND sources "${ocaml_file}.mli" "${ocaml_file}.ml")
+
+    list(APPEND ocaml_inputs "${bin}/${ocaml_file}.mli" "${bin}/${ocaml_file}.ml")
+
+    list(APPEND ocaml_outputs "${bin}/${ocaml_file}.cmi" "${bin}/${ocaml_file}.cmo")
+    if( HAVE_OCAMLOPT )
+      list(APPEND ocaml_outputs
+           "${bin}/${ocaml_file}.cmx"
+           "${bin}/${ocaml_file}${CMAKE_C_OUTPUT_EXTENSION}")
+    endif()
+  endforeach()
+
+  foreach( c_file ${ARG_C} )
+    list(APPEND sources "${c_file}.c")
+
+    list(APPEND c_inputs  "${bin}/${c_file}.c")
+    list(APPEND c_outputs "${bin}/${c_file}${CMAKE_C_OUTPUT_EXTENSION}")
+  endforeach()
+
+  if( NOT ARG_NOCOPY )
+    foreach( source ${sources} )
+      add_custom_command(
+          OUTPUT "${bin}/${source}"
+          COMMAND "${CMAKE_COMMAND}" "-E" "copy" "${src}/${source}" "${bin}"
+          DEPENDS "${src}/${source}"
+          COMMENT "Copying ${source} to build area")
+    endforeach()
+  endif()
+
+  foreach( c_input ${c_inputs} )
+    get_filename_component(basename "${c_input}" NAME_WE)
+    add_custom_command(
+      OUTPUT "${basename}${CMAKE_C_OUTPUT_EXTENSION}"
+      COMMAND "${OCAMLFIND}" "ocamlc" "-c" "${c_input}" -ccopt ${c_flags}
+      DEPENDS "${c_input}"
+      COMMENT "Building OCaml stub object file ${basename}${CMAKE_C_OUTPUT_EXTENSION}"
+      VERBATIM)
+  endforeach()
+
+  set(ocaml_params)
+  foreach( ocaml_input ${ocaml_inputs} ${c_outputs})
+    get_filename_component(filename "${ocaml_input}" NAME)
+    list(APPEND ocaml_params "${filename}")
+  endforeach()
+
+  if( APPLE )
+    set(ocaml_rpath "@executable_path/../../lib")
+  elseif( UNIX )
+    set(ocaml_rpath "\\$ORIGIN/../../lib")
+  endif()
+  list(APPEND ocaml_flags "-ldopt" "-Wl,-rpath,${ocaml_rpath}")
+
+  add_custom_command(
+    OUTPUT ${ocaml_outputs}
+    COMMAND "${OCAMLFIND}" "ocamlmklib" "-o" "${name}" ${ocaml_flags} ${ocaml_params}
+    DEPENDS ${ocaml_inputs} ${c_outputs}
+    COMMENT "Building OCaml library ${name}"
+    VERBATIM)
+
+  add_custom_command(
+    OUTPUT "${bin}/${name}.odoc"
+    COMMAND "${OCAMLFIND}" "ocamldoc"
+            "-I" "${bin}"
+            "-I" "${LLVM_LIBRARY_DIR}/ocaml/"
+            "-dump" "${bin}/${name}.odoc"
+            ${ocaml_pkgs} ${ocaml_inputs}
+    DEPENDS ${ocaml_inputs}
+    COMMENT "Building OCaml documentation for ${name}"
+    VERBATIM)
+
+  add_custom_target("ocaml_${name}" ALL DEPENDS ${ocaml_outputs} "${bin}/${name}.odoc")
+
+  set_target_properties("ocaml_${name}" PROPERTIES
+    OCAML_FLAGS "-I;${bin}")
+  set_target_properties("ocaml_${name}" PROPERTIES
+    OCAML_ODOC "${bin}/${name}.odoc")
+
+  foreach( ocaml_dep ${ARG_OCAMLDEP} )
+    add_dependencies("ocaml_${name}" "ocaml_${ocaml_dep}")
+  endforeach()
+
+  foreach( llvm_lib ${llvm_libs} )
+    add_dependencies("ocaml_${name}" "${llvm_lib}")
+  endforeach()
+
+  set(install_files)
+  set(install_shlibs)
+  foreach( ocaml_output ${ocaml_outputs} )
+    get_filename_component(ext "${ocaml_output}" EXT)
+
+    if( NOT (ext STREQUAL ".cmo" OR
+             ext STREQUAL CMAKE_C_OUTPUT_EXTENSION OR
+             ext STREQUAL CMAKE_SHARED_LIBRARY_SUFFIX) )
+      list(APPEND install_files "${ocaml_output}")
+    elseif( ext STREQUAL CMAKE_SHARED_LIBRARY_SUFFIX)
+      list(APPEND install_shlibs "${ocaml_output}")
+    endif()
+  endforeach()
+
+  install(FILES ${install_files}
+          DESTINATION lib/ocaml)
+  install(FILES ${install_shlibs}
+          PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE
+                      GROUP_READ GROUP_EXECUTE
+                      WORLD_READ WORLD_EXECUTE
+          DESTINATION lib/ocaml)
+
+  foreach( install_file ${install_files} ${install_shlibs} )
+    get_filename_component(filename "${install_file}" NAME)
+    add_custom_command(TARGET "ocaml_${name}" POST_BUILD
+      COMMAND "${CMAKE_COMMAND}" "-E" "copy" "${install_file}"
+                                             "${LLVM_LIBRARY_DIR}/ocaml/"
+      COMMENT "Copying OCaml library component ${filename} to intermediate area"
+      VERBATIM)
+  endforeach()
+endfunction()
diff --git a/cmake/modules/CMakeLists.txt b/cmake/modules/CMakeLists.txt
index c87193d..5f3f255 100644
--- a/cmake/modules/CMakeLists.txt
+++ b/cmake/modules/CMakeLists.txt
@@ -58,7 +58,7 @@ foreach(p ${_count})
 get_filename_component(LLVM_INSTALL_PREFIX \"\${LLVM_INSTALL_PREFIX}\" PATH)")
 endforeach(p)
 set(LLVM_CONFIG_INCLUDE_DIRS "\${LLVM_INSTALL_PREFIX}/include")
-set(LLVM_CONFIG_LIBRARY_DIRS "\${LLVM_INSTALL_PREFIX}/lib")
+set(LLVM_CONFIG_LIBRARY_DIRS "\${LLVM_INSTALL_PREFIX}/lib\${LLVM_LIBDIR_SUFFIX}")
 set(LLVM_CONFIG_CMAKE_DIR "\${LLVM_INSTALL_PREFIX}/${LLVM_INSTALL_PACKAGE_DIR}")
 set(LLVM_CONFIG_TOOLS_BINARY_DIR "\${LLVM_INSTALL_PREFIX}/bin")
 set(LLVM_CONFIG_EXPORTS_FILE "\${LLVM_CMAKE_DIR}/LLVMExports.cmake")
diff --git a/cmake/modules/CheckAtomic.cmake b/cmake/modules/CheckAtomic.cmake
index 2ed4819..30a5e31 100644
--- a/cmake/modules/CheckAtomic.cmake
+++ b/cmake/modules/CheckAtomic.cmake
@@ -2,9 +2,13 @@
 
 INCLUDE(CheckCXXSourceCompiles)
 
-check_library_exists(atomic __atomic_fetch_add_4 "" HAVE_LIBATOMIC)
-if (HAVE_LIBATOMIC)
-  list(APPEND CMAKE_REQUIRED_LIBRARIES "atomic")
+check_function_exists(__atomic_fetch_add_4 HAVE___ATOMIC_FETCH_ADD_4)
+if( NOT HAVE___ATOMIC_FETCH_ADD_4 )
+  check_library_exists(atomic __atomic_fetch_add_4 "" HAVE_LIBATOMIC)
+  set(HAVE_LIBATOMIC False)
+  if( HAVE_LIBATOMIC )
+    list(APPEND CMAKE_REQUIRED_LIBRARIES "atomic")
+  endif()
 endif()
 
 CHECK_CXX_SOURCE_COMPILES("
diff --git a/cmake/modules/FindOCaml.cmake b/cmake/modules/FindOCaml.cmake
new file mode 100644
index 0000000..8eba212
--- /dev/null
+++ b/cmake/modules/FindOCaml.cmake
@@ -0,0 +1,103 @@
+# CMake find_package() module for the OCaml language.
+# Assumes ocamlfind will be used for compilation.
+# http://ocaml.org/
+#
+# Example usage:
+#
+# find_package(OCaml)
+#
+# If successful, the following variables will be defined:
+# OCAMLFIND
+# OCAML_VERSION
+# OCAML_STDLIB_PATH
+# HAVE_OCAMLOPT
+#
+# Also provides find_ocamlfind_package() macro.
+#
+# Example usage:
+#
+# find_ocamlfind_package(ctypes)
+#
+# In any case, the following variables are defined:
+#
+# HAVE_OCAML_${pkg}
+#
+# If successful, the following variables will be defined:
+#
+# OCAML_${pkg}_VERSION
+
+include( FindPackageHandleStandardArgs )
+
+find_program(OCAMLFIND
+             NAMES ocamlfind)
+
+if( OCAMLFIND )
+  execute_process(
+    COMMAND ${OCAMLFIND} ocamlc -version
+    OUTPUT_VARIABLE OCAML_VERSION
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  execute_process(
+    COMMAND ${OCAMLFIND} ocamlc -where
+    OUTPUT_VARIABLE OCAML_STDLIB_PATH
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  execute_process(
+    COMMAND ${OCAMLFIND} ocamlc -version
+    OUTPUT_QUIET
+    RESULT_VARIABLE find_ocaml_result)
+  if( find_ocaml_result EQUAL 0 )
+    set(HAVE_OCAMLOPT TRUE)
+  else()
+    set(HAVE_OCAMLOPT FALSE)
+  endif()
+endif()
+
+find_package_handle_standard_args( OCaml DEFAULT_MSG
+  OCAMLFIND
+  OCAML_VERSION
+  OCAML_STDLIB_PATH)
+
+mark_as_advanced(
+  OCAMLFIND)
+
+function(find_ocamlfind_package pkg)
+  CMAKE_PARSE_ARGUMENTS(ARG "OPTIONAL" "VERSION" "" ${ARGN})
+
+  execute_process(
+    COMMAND "${OCAMLFIND}" "query" "${pkg}" "-format" "%v"
+    RESULT_VARIABLE result
+    OUTPUT_VARIABLE version
+    ERROR_VARIABLE error
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    ERROR_STRIP_TRAILING_WHITESPACE)
+
+  if( NOT result EQUAL 0 AND NOT ARG_OPTIONAL )
+    message(FATAL_ERROR ${error})
+  endif()
+
+  if( result EQUAL 0 )
+    set(found TRUE)
+  else()
+    set(found FALSE)
+  endif()
+
+  if( found AND ARG_VERSION )
+    if( version VERSION_LESS ARG_VERSION AND ARG_OPTIONAL )
+      # If it's optional and the constraint is not satisfied, pretend
+      # it wasn't found.
+      set(found FALSE)
+    elseif( version VERSION_LESS ARG_VERSION )
+      message(FATAL_ERROR
+              "ocamlfind package ${pkg} should have version ${ARG_VERSION} or newer")
+    endif()
+  endif()
+
+  string(TOUPPER ${pkg} pkg)
+
+  set(HAVE_OCAML_${pkg} ${found}
+      PARENT_SCOPE)
+
+  set(OCAML_${pkg}_VERSION ${version}
+      PARENT_SCOPE)
+endfunction()
diff --git a/cmake/modules/GetSVN.cmake b/cmake/modules/GetSVN.cmake
index 4e32c09..d512bd2 100644
--- a/cmake/modules/GetSVN.cmake
+++ b/cmake/modules/GetSVN.cmake
@@ -19,13 +19,83 @@ get_filename_component(LLVM_DIR "${LLVM_DIR}" PATH)
 # Handle strange terminals
 set(ENV{TERM} "dumb")
 
+macro(get_source_info_svn path revision repository)
+  # If svn is a bat file, find_program(Subversion) doesn't find it.
+  # Explicitly search for that here; Subversion_SVN_EXECUTABLE will override
+  # the find_program call in FindSubversion.cmake.
+  find_program(Subversion_SVN_EXECUTABLE NAMES svn svn.bat)
+
+  # FindSubversion does not work with symlinks. See PR 8437
+  if (NOT IS_SYMLINK "${path}")
+    find_package(Subversion)
+  endif()
+  if (Subversion_FOUND)
+    subversion_wc_info( ${path} Project )
+    if (Project_WC_REVISION)
+      set(${revision} ${Project_WC_REVISION} PARENT_SCOPE)
+    endif()
+    if (Project_WC_URL)
+      set(${repository} ${Project_WC_URL} PARENT_SCOPE)
+    endif()
+  endif()
+endmacro()
+
+macro(get_source_info_git_svn path revision repository)
+  find_program(git_executable NAMES git git.exe git.cmd)
+  if (git_executable)
+    execute_process(COMMAND ${git_executable} svn info
+      WORKING_DIRECTORY ${path}
+      TIMEOUT 5
+      RESULT_VARIABLE git_result
+      OUTPUT_VARIABLE git_output)
+    if (git_result EQUAL 0)
+      string(REGEX REPLACE "^(.*\n)?Revision: ([^\n]+).*"
+        "\\2" git_svn_rev "${git_output}")
+      set(${revision} ${git_svn_rev} PARENT_SCOPE)
+      string(REGEX REPLACE "^(.*\n)?URL: ([^\n]+).*"
+        "\\2" git_url "${git_output}")
+      set(${repository} ${git_url} PARENT_SCOPE)
+    endif()
+  endif()
+endmacro()
+
+macro(get_source_info_git path revision repository)
+  find_program(git_executable NAMES git git.exe git.cmd)
+  if (git_executable)
+    execute_process(COMMAND ${git_executable} log -1 --pretty=format:%H
+      WORKING_DIRECTORY ${path}
+      TIMEOUT 5
+      RESULT_VARIABLE git_result
+      OUTPUT_VARIABLE git_output)
+    if (git_result EQUAL 0)
+      set(${revision} ${git_output} PARENT_SCOPE)
+    endif()
+    execute_process(COMMAND ${git_executable} remote -v
+      WORKING_DIRECTORY ${path}
+      TIMEOUT 5
+      RESULT_VARIABLE git_result
+      OUTPUT_VARIABLE git_output)
+    if (git_result EQUAL 0)
+      string(REGEX REPLACE "^(.*\n)?[^ \t]+[ \t]+([^ \t\n]+)[ \t]+\\(fetch\\).*"
+        "\\2" git_url "${git_output}")
+      set(${repository} "${git_url}" PARENT_SCOPE)
+    endif()
+  endif()
+endmacro()
+
+function(get_source_info path revision repository)
+  if (EXISTS "${path}/.svn")
+    get_source_info_svn("${path}" revision repository)
+  elseif (EXISTS "${path}/.git/svn")
+    get_source_info_git_svn("${path}" revision repository)
+  elseif (EXISTS "${path}/.git")
+    get_source_info_git("${path}" revision repository)
+  endif()
+endfunction()
+
 function(append_info name path)
-  execute_process(COMMAND "${LLVM_DIR}/utils/GetSourceVersion" "${path}"
-    OUTPUT_VARIABLE revision)
+  get_source_info("${path}" revision repository)
   string(STRIP "${revision}" revision)
-  execute_process(COMMAND "${LLVM_DIR}/utils/GetRepositoryPath" "${path}"
-    OUTPUT_VARIABLE repository
-    OUTPUT_STRIP_TRAILING_WHITESPACE)
   string(STRIP "${repository}" repository)
   file(APPEND "${HEADER_FILE}.txt"
     "#define ${name}_REVISION \"${revision}\"\n")
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 6cc6d65..fdc4ea0 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -21,11 +21,16 @@ if(NOT LLVM_FORCE_USE_OLD_TOOLCHAIN)
       message(FATAL_ERROR "Host Clang version must be at least 3.1!")
     endif()
 
-    # Also test that we aren't using too old of a version of libstdc++ with the
-    # Clang compiler. This is tricky as there is no real way to check the
-    # version of libstdc++ directly. Instead we test for a known bug in
-    # libstdc++4.6 that is fixed in libstdc++4.7.
-    if(NOT LLVM_ENABLE_LIBCXX)
+    if (CMAKE_CXX_SIMULATE_ID MATCHES "MSVC")
+      if (CMAKE_CXX_SIMULATE_VERSION VERSION_LESS 18.0)
+        message(FATAL_ERROR "Host Clang must have at least -fms-compatibility-version=18.0")
+      endif()
+      set(CLANG_CL 1)
+    elseif(NOT LLVM_ENABLE_LIBCXX)
+      # Otherwise, test that we aren't using too old of a version of libstdc++
+      # with the Clang compiler. This is tricky as there is no real way to
+      # check the version of libstdc++ directly. Instead we test for a known
+      # bug in libstdc++4.6 that is fixed in libstdc++4.7.
       set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
       set(OLD_CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES})
       set(CMAKE_REQUIRED_FLAGS "-std=c++0x")
@@ -41,8 +46,11 @@ int main() { return (float)x; }"
       set(CMAKE_REQUIRED_LIBRARIES ${OLD_CMAKE_REQUIRED_LIBRARIES})
     endif()
   elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
-    if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 17.0)
-      message(FATAL_ERROR "Host Visual Studio must be at least 2012 (MSVC 17.0)")
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0)
+      message(FATAL_ERROR "Host Visual Studio must be at least 2013")
+    elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0.31101)
+      message(WARNING "Host Visual Studio should at least be 2013 Update 4 (MSVC 18.0.31101)"
+        "  due to miscompiles from earlier versions")
     endif()
   endif()
 endif()
@@ -79,8 +87,6 @@ if(WIN32)
     set(LLVM_ON_WIN32 1)
     set(LLVM_ON_UNIX 0)
   endif(CYGWIN)
-  # Maximum path length is 160 for non-unicode paths
-  set(MAXPATHLEN 160)
 else(WIN32)
   if(UNIX)
     set(LLVM_ON_WIN32 0)
@@ -90,8 +96,6 @@ else(WIN32)
     else(APPLE)
       set(LLVM_HAVE_LINK_VERSION_SCRIPT 1)
     endif(APPLE)
-    # FIXME: Maximum path length is currently set to 'safe' fixed value
-    set(MAXPATHLEN 2024)
   else(UNIX)
     MESSAGE(SEND_ERROR "Unable to determine platform")
   endif(UNIX)
@@ -108,6 +112,15 @@ if(APPLE)
   set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-flat_namespace -Wl,-undefined -Wl,suppress")
 endif()
 
+# Pass -Wl,-z,defs. This makes sure all symbols are defined. Otherwise a DSO
+# build might work on ELF but fail on MachO/COFF.
+if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" OR WIN32 OR
+        ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") AND
+   NOT LLVM_USE_SANITIZER)
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,defs")
+endif()
+
+
 function(append value)
   foreach(variable ${ARGN})
     set(${variable} "${${variable}} ${value}" PARENT_SCOPE)
@@ -399,12 +412,20 @@ if(LLVM_USE_SANITIZER)
     elseif (LLVM_USE_SANITIZER STREQUAL "Thread")
       append_common_sanitizer_flags()
       append("-fsanitize=thread" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+    elseif (LLVM_USE_SANITIZER STREQUAL "Address;Undefined" OR
+            LLVM_USE_SANITIZER STREQUAL "Undefined;Address")
+      append_common_sanitizer_flags()
+      append("-fsanitize=address,undefined -fno-sanitize=vptr,function -fno-sanitize-recover"
+              CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
     else()
       message(WARNING "Unsupported value of LLVM_USE_SANITIZER: ${LLVM_USE_SANITIZER}")
     endif()
   else()
     message(WARNING "LLVM_USE_SANITIZER is not supported on this platform.")
   endif()
+  if (LLVM_USE_SANITIZE_COVERAGE)
+    append("-fsanitize-coverage=4" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  endif()
 endif()
 
 # Turn on -gsplit-dwarf if requested
diff --git a/cmake/modules/LLVM-Config.cmake b/cmake/modules/LLVM-Config.cmake
index e8c42fc..b24c129 100644
--- a/cmake/modules/LLVM-Config.cmake
+++ b/cmake/modules/LLVM-Config.cmake
@@ -152,29 +152,39 @@ function(llvm_map_components_to_libnames out_libs)
   set(${out_libs} ${expanded_components} PARENT_SCOPE)
 endfunction()
 
+# Perform a post-order traversal of the dependency graph.
+# This duplicates the algorithm used by llvm-config, originally
+# in tools/llvm-config/llvm-config.cpp, function ComputeLibsForComponents.
+function(expand_topologically name required_libs visited_libs)
+  list(FIND visited_libs ${name} found)
+  if( found LESS 0 )
+    list(APPEND visited_libs ${name})
+    set(visited_libs ${visited_libs} PARENT_SCOPE)
+
+    get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${name})
+    foreach( lib_dep ${lib_deps} )
+      expand_topologically(${lib_dep} "${required_libs}" "${visited_libs}")
+      set(required_libs ${required_libs} PARENT_SCOPE)
+      set(visited_libs ${visited_libs} PARENT_SCOPE)
+    endforeach()
+
+    list(APPEND required_libs ${name})
+    set(required_libs ${required_libs} PARENT_SCOPE)
+  endif()
+endfunction()
+
 # Expand dependencies while topologically sorting the list of libraries:
 function(llvm_expand_dependencies out_libs)
   set(expanded_components ${ARGN})
-  list(LENGTH expanded_components lst_size)
-  set(cursor 0)
-  set(processed)
-  while( cursor LESS lst_size )
-    list(GET expanded_components ${cursor} lib)
-    get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${lib})
-    list(APPEND expanded_components ${lib_deps})
-    # Remove duplicates at the front:
-    list(REVERSE expanded_components)
-    list(REMOVE_DUPLICATES expanded_components)
-    list(REVERSE expanded_components)
-    list(APPEND processed ${lib})
-    # Find the maximum index that doesn't have to be re-processed:
-    while(NOT "${expanded_components}" MATCHES "^${processed}.*" )
-      list(REMOVE_AT processed -1)
-    endwhile()
-    list(LENGTH processed cursor)
-    list(LENGTH expanded_components lst_size)
-  endwhile( cursor LESS lst_size )
-  set(${out_libs} ${expanded_components} PARENT_SCOPE)
+
+  set(required_libs)
+  set(visited_libs)
+  foreach( lib ${expanded_components} )
+    expand_topologically(${lib} "${required_libs}" "${visited_libs}")
+  endforeach()
+
+  list(REVERSE required_libs)
+  set(${out_libs} ${required_libs} PARENT_SCOPE)
 endfunction()
 
 function(explicit_map_components_to_libraries out_libs)
diff --git a/cmake/modules/LLVMConfig.cmake.in b/cmake/modules/LLVMConfig.cmake.in
index 780a608..9a9cd85 100644
--- a/cmake/modules/LLVMConfig.cmake.in
+++ b/cmake/modules/LLVMConfig.cmake.in
@@ -40,6 +40,8 @@ set(LLVM_ENABLE_PIC @LLVM_ENABLE_PIC@)
 set(LLVM_ON_UNIX @LLVM_ON_UNIX@)
 set(LLVM_ON_WIN32 @LLVM_ON_WIN32@)
 
+set(LLVM_LIBDIR_SUFFIX @LLVM_LIBDIR_SUFFIX@)
+
 set(LLVM_INCLUDE_DIRS "@LLVM_CONFIG_INCLUDE_DIRS@")
 set(LLVM_LIBRARY_DIRS "@LLVM_CONFIG_LIBRARY_DIRS@")
 set(LLVM_DEFINITIONS "-D__STDC_LIMIT_MACROS" "-D__STDC_CONSTANT_MACROS")
diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake
index 64ebce8..ae1921b 100644
--- a/cmake/modules/LLVMProcessSources.cmake
+++ b/cmake/modules/LLVMProcessSources.cmake
@@ -28,26 +28,39 @@ macro(add_td_sources srcs)
   endif()
 endmacro(add_td_sources)
 
+function(add_header_files_for_glob hdrs_out glob)
+  file(GLOB hds ${glob})
+  set(${hdrs_out} ${hds} PARENT_SCOPE)
+endfunction(add_header_files_for_glob)
 
-macro(add_header_files srcs)
-  file(GLOB hds *.h)
-  if( hds )
-    set_source_files_properties(${hds} PROPERTIES HEADER_FILE_ONLY ON)
-    list(APPEND ${srcs} ${hds})
-  endif()
-endmacro(add_header_files)
+function(find_all_header_files hdrs_out additional_headerdirs)
+  add_header_files_for_glob(hds *.h)
+  list(APPEND all_headers ${hds})
+
+  foreach(additional_dir ${additional_headerdirs})
+    add_header_files_for_glob(hds "${additional_dir}/*.h")
+    list(APPEND all_headers ${hds})
+    add_header_files_for_glob(hds "${additional_dir}/*.inc")
+    list(APPEND all_headers ${hds})
+  endforeach(additional_dir)
+
+  set( ${hdrs_out} ${all_headers} PARENT_SCOPE )
+endfunction(find_all_header_files)
 
 
 function(llvm_process_sources OUT_VAR)
-  cmake_parse_arguments(ARG "" "" "ADDITIONAL_HEADERS" ${ARGN})
+  cmake_parse_arguments(ARG "" "" "ADDITIONAL_HEADERS;ADDITIONAL_HEADER_DIRS" ${ARGN})
   set(sources ${ARG_UNPARSED_ARGUMENTS})
   llvm_check_source_file_list( ${sources} )
   if( MSVC_IDE OR XCODE )
     # This adds .td and .h files to the Visual Studio solution:
     add_td_sources(sources)
-    add_header_files(sources)
+    find_all_header_files(hdrs "${ARG_ADDITIONAL_HEADER_DIRS}")
+    if (hdrs)
+      set_source_files_properties(${hdrs} PROPERTIES HEADER_FILE_ONLY ON)
+    endif()
     set_source_files_properties(${ARG_ADDITIONAL_HEADERS} PROPERTIES HEADER_FILE_ONLY ON)
-    list(APPEND sources ${ARG_ADDITIONAL_HEADERS})
+    list(APPEND sources ${ARG_ADDITIONAL_HEADERS} ${hdrs})
   endif()
 
   set( ${OUT_VAR} ${sources} PARENT_SCOPE )
diff --git a/cmake/modules/Makefile b/cmake/modules/Makefile
index dd31aa7..e38f5a6 100644
--- a/cmake/modules/Makefile
+++ b/cmake/modules/Makefile
@@ -48,7 +48,7 @@ endif
 
 OBJMODS := LLVMConfig.cmake LLVMConfigVersion.cmake LLVMExports.cmake
 
-$(PROJ_OBJ_DIR)/LLVMConfig.cmake: LLVMConfig.cmake.in $(LLVMBuildCMakeFrag)
+$(PROJ_OBJ_DIR)/LLVMConfig.cmake: LLVMConfig.cmake.in Makefile $(LLVMBuildCMakeFrag)
 	$(Echo) 'Generating LLVM CMake package config file'
 	$(Verb) ( \
 	 cat $< | sed \
@@ -73,6 +73,7 @@ $(PROJ_OBJ_DIR)/LLVMConfig.cmake: LLVMConfig.cmake.in $(LLVMBuildCMakeFrag)
 	  -e 's/@LLVM_ENABLE_PIC@/'"$(ENABLE_PIC)"'/' \
 	  -e 's/@LLVM_ON_UNIX@/'"$(LLVM_ON_UNIX)"'/' \
 	  -e 's/@LLVM_ON_WIN32@/'"$(LLVM_ON_WIN32)"'/' \
+	  -e 's/@LLVM_LIBDIR_SUFFIX@//' \
 	  -e 's/@LLVM_CONFIG_INCLUDE_DIRS@/'"$(subst /,\/,$(PROJ_includedir))"'/' \
 	  -e 's/@LLVM_CONFIG_LIBRARY_DIRS@/'"$(subst /,\/,$(PROJ_libdir))"'/' \
 	  -e 's/@LLVM_CONFIG_CMAKE_DIR@/'"$(subst /,\/,$(PROJ_cmake))"'/' \
diff --git a/cmake/platforms/iOS.cmake b/cmake/platforms/iOS.cmake
index 4973643..e18ca6b 100644
--- a/cmake/platforms/iOS.cmake
+++ b/cmake/platforms/iOS.cmake
@@ -18,8 +18,17 @@ IF(NOT DEFINED ENV{SDKROOT})
  MESSAGE(FATAL_ERROR "SDKROOT env var must be set: " $ENV{SDKROOT})
 ENDIF()
 
+IF(EXISTS $ENV{SDKROOT})
+  SET(SDKROOT $ENV{SDKROOT})
+ELSE()
+  execute_process(COMMAND xcodebuild -version -sdk $ENV{SDKROOT} Path
+   OUTPUT_VARIABLE SDKROOT
+   ERROR_QUIET
+   OUTPUT_STRIP_TRAILING_WHITESPACE)
+ENDIF()
+
 IF(NOT CMAKE_C_COMPILER)
-  execute_process(COMMAND xcrun -sdk iphoneos -find clang
+  execute_process(COMMAND xcrun -sdk $ENV{SDKROOT} -find clang
    OUTPUT_VARIABLE CMAKE_C_COMPILER
    ERROR_QUIET
    OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -27,21 +36,30 @@ IF(NOT CMAKE_C_COMPILER)
 ENDIF()
 
 IF(NOT CMAKE_CXX_COMPILER)
-  execute_process(COMMAND xcrun -sdk iphoneos -find clang++
+  execute_process(COMMAND xcrun -sdk $ENV{SDKROOT} -find clang++
    OUTPUT_VARIABLE CMAKE_CXX_COMPILER
    ERROR_QUIET
    OUTPUT_STRIP_TRAILING_WHITESPACE)
   message(STATUS "Using c compiler ${CMAKE_CXX_COMPILER}")
 ENDIF()
 
+IF(NOT CMAKE_AR)
+  execute_process(COMMAND xcrun -sdk $ENV{SDKROOT} -find ar
+   OUTPUT_VARIABLE CMAKE_AR_val
+   ERROR_QUIET
+   OUTPUT_STRIP_TRAILING_WHITESPACE)
+  SET(CMAKE_AR ${CMAKE_AR_val} CACHE FILEPATH "Archiver")
+  message(STATUS "Using ar ${CMAKE_AR}")
+ENDIF()
+
 IF (NOT DEFINED IOS_MIN_TARGET)
-execute_process(COMMAND xcodebuild -sdk iphoneos -version SDKVersion
+execute_process(COMMAND xcodebuild -sdk $ENV{SDKROOT} -version SDKVersion
    OUTPUT_VARIABLE IOS_MIN_TARGET
    ERROR_QUIET
    OUTPUT_STRIP_TRAILING_WHITESPACE)
 ENDIF()
 
-SET(IOS_COMMON_FLAGS "-isysroot $ENV{SDKROOT} -mios-version-min=${IOS_MIN_TARGET}")
+SET(IOS_COMMON_FLAGS "-mios-version-min=${IOS_MIN_TARGET}")
 SET(CMAKE_C_FLAGS "${IOS_COMMON_FLAGS}" CACHE STRING "toolchain_cflags" FORCE)
 SET(CMAKE_CXX_FLAGS "${IOS_COMMON_FLAGS}" CACHE STRING "toolchain_cxxflags" FORCE)
 SET(CMAKE_LINK_FLAGS "${IOS_COMMON_FLAGS}" CACHE STRING "toolchain_linkflags" FORCE)
diff --git a/configure b/configure
index b8cb1e2..cac9bf6 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.60 for LLVM 3.6.0svn.
+# Generated by GNU Autoconf 2.60 for LLVM 3.7.0svn.
 #
 # Report bugs to <http://llvm.org/bugs/>.
 #
@@ -561,8 +561,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='LLVM'
 PACKAGE_TARNAME='llvm'
-PACKAGE_VERSION='3.6.0svn'
-PACKAGE_STRING='LLVM 3.6.0svn'
+PACKAGE_VERSION='3.7.0svn'
+PACKAGE_STRING='LLVM 3.7.0svn'
 PACKAGE_BUGREPORT='http://llvm.org/bugs/'
 
 ac_unique_file="lib/IR/Module.cpp"
@@ -1314,7 +1314,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures LLVM 3.6.0svn to adapt to many kinds of systems.
+\`configure' configures LLVM 3.7.0svn to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1380,7 +1380,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of LLVM 3.6.0svn:";;
+     short | recursive ) echo "Configuration of LLVM 3.7.0svn:";;
    esac
   cat <<\_ACEOF
 
@@ -1550,7 +1550,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-LLVM configure 3.6.0svn
+LLVM configure 3.7.0svn
 generated by GNU Autoconf 2.60
 
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1566,7 +1566,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by LLVM $as_me 3.6.0svn, which was
+It was created by LLVM $as_me 3.7.0svn, which was
 generated by GNU Autoconf 2.60.  Invocation command line was
 
   $ $0 $@
@@ -1921,7 +1921,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
 LLVM_VERSION_MAJOR=3
-LLVM_VERSION_MINOR=6
+LLVM_VERSION_MINOR=7
 LLVM_VERSION_PATCH=0
 LLVM_VERSION_SUFFIX=svn
 
@@ -3966,6 +3966,11 @@ else
     llvm_cv_no_link_all_option="-Wl,--no-whole-archive"
     llvm_cv_os_type="DragonFly"
     llvm_cv_platform_type="Unix" ;;
+  *-*-bitrig*)
+    llvm_cv_link_all_option="-Wl,--whole-archive"
+    llvm_cv_no_link_all_option="-Wl,--no-whole-archive"
+    llvm_cv_os_type="Bitrig"
+    llvm_cv_platform_type="Unix" ;;
   *-*-hpux*)
     llvm_cv_link_all_option="-Wl,--whole-archive"
     llvm_cv_no_link_all_option="-Wl,--no-whole-archive"
@@ -4052,6 +4057,8 @@ else
     llvm_cv_target_os_type="NetBSD" ;;
   *-*-dragonfly*)
     llvm_cv_target_os_type="DragonFly" ;;
+  *-*-bitrig*)
+    llvm_cv_target_os_type="Bitrig" ;;
   *-*-hpux*)
     llvm_cv_target_os_type="HP-UX" ;;
   *-*-interix*)
@@ -4074,6 +4081,8 @@ else
     llvm_cv_target_os_type="NativeClient" ;;
   *-unknown-eabi*)
     llvm_cv_target_os_type="Freestanding" ;;
+  *-*-ps4)
+    llvm_cv_target_os_type="PS4" ;;
   *)
     llvm_cv_target_os_type="Unknown" ;;
 esac
@@ -9908,7 +9917,7 @@ if test "$ac_res" != no; then
 else
 
               echo "Error! You need to have libopagent around."
-              exit -1
+              exit 1
 
 fi
 
@@ -10071,7 +10080,7 @@ if test $ac_cv_header_opagent_h = yes; then
 else
 
               echo "Error! You need to have opagent.h around."
-              exit -1
+              exit 1
 
 fi
 
@@ -15438,6 +15447,91 @@ _ACEOF
 
 fi
 
+  { echo "$as_me:$LINENO: checking for __chkstk_ms in -lgcc" >&5
+echo $ECHO_N "checking for __chkstk_ms in -lgcc... $ECHO_C" >&6; }
+if test "${ac_cv_lib_gcc___chkstk_ms+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lgcc  $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char __chkstk_ms ();
+int
+main ()
+{
+return __chkstk_ms ();
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_lib_gcc___chkstk_ms=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_lib_gcc___chkstk_ms=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext \
+      conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_lib_gcc___chkstk_ms" >&5
+echo "${ECHO_T}$ac_cv_lib_gcc___chkstk_ms" >&6; }
+if test $ac_cv_lib_gcc___chkstk_ms = yes; then
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE___CHKSTK_MS 1
+_ACEOF
+
+fi
+
   { echo "$as_me:$LINENO: checking for ___chkstk in -lgcc" >&5
 echo $ECHO_N "checking for ___chkstk in -lgcc... $ECHO_C" >&6; }
 if test "${ac_cv_lib_gcc____chkstk+set}" = set; then
@@ -15523,6 +15617,91 @@ _ACEOF
 
 fi
 
+  { echo "$as_me:$LINENO: checking for ___chkstk_ms in -lgcc" >&5
+echo $ECHO_N "checking for ___chkstk_ms in -lgcc... $ECHO_C" >&6; }
+if test "${ac_cv_lib_gcc____chkstk_ms+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lgcc  $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char ___chkstk_ms ();
+int
+main ()
+{
+return ___chkstk_ms ();
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_lib_gcc____chkstk_ms=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_lib_gcc____chkstk_ms=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext \
+      conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_lib_gcc____chkstk_ms" >&5
+echo "${ECHO_T}$ac_cv_lib_gcc____chkstk_ms" >&6; }
+if test $ac_cv_lib_gcc____chkstk_ms = yes; then
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE____CHKSTK_MS 1
+_ACEOF
+
+fi
+
 
   { echo "$as_me:$LINENO: checking for __ashldi3 in -lgcc" >&5
 echo $ECHO_N "checking for __ashldi3 in -lgcc... $ECHO_C" >&6; }
@@ -18458,6 +18637,12 @@ if test "${clang_src_root}" = ""; then
   clang_src_root="$srcdir/tools/clang"
 fi
 if test -f ${clang_src_root}/README.txt; then
+
+cat >>confdefs.h <<_ACEOF
+#define CLANG_LIBDIR_SUFFIX ""
+_ACEOF
+
+
     configh="include/clang/Config/config.h"
   doxy="docs/doxygen.cfg"
   ac_config_headers="$ac_config_headers tools/clang/${configh}:${clang_src_root}/${configh}.in"
@@ -18895,7 +19080,7 @@ exec 6>&1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by LLVM $as_me 3.6.0svn, which was
+This file was extended by LLVM $as_me 3.7.0svn, which was
 generated by GNU Autoconf 2.60.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -18948,7 +19133,7 @@ Report bugs to <bug-autoconf@gnu.org>."
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF
 ac_cs_version="\\
-LLVM config.status 3.6.0svn
+LLVM config.status 3.7.0svn
 configured by $0, generated by GNU Autoconf 2.60,
   with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
 
diff --git a/docs/BitCodeFormat.rst b/docs/BitCodeFormat.rst
index 34485b5..4b398a4 100644
--- a/docs/BitCodeFormat.rst
+++ b/docs/BitCodeFormat.rst
@@ -672,7 +672,7 @@ for each library name referenced.
 MODULE_CODE_GLOBALVAR Record
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-``[GLOBALVAR, pointer type, isconst, initid, linkage, alignment, section, visibility, threadlocal, unnamed_addr, dllstorageclass]``
+``[GLOBALVAR, pointer type, isconst, initid, linkage, alignment, section, visibility, threadlocal, unnamed_addr, externally_initialized, dllstorageclass, comdat]``
 
 The ``GLOBALVAR`` record (code 7) marks the declaration or definition of a
 global variable. The operand fields are:
@@ -741,7 +741,7 @@ global variable. The operand fields are:
 MODULE_CODE_FUNCTION Record
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-``[FUNCTION, type, callingconv, isproto, linkage, paramattr, alignment, section, visibility, gc, prefix, dllstorageclass]``
+``[FUNCTION, type, callingconv, isproto, linkage, paramattr, alignment, section, visibility, gc, prologuedata, dllstorageclass, comdat, prefixdata]``
 
 The ``FUNCTION`` record (code 8) marks the declaration or definition of a
 function. The operand fields are:
@@ -784,12 +784,18 @@ function. The operand fields are:
 * *unnamed_addr*: If present and non-zero, indicates that the function has
   ``unnamed_addr``
 
-* *prefix*: If non-zero, the value index of the prefix data for this function,
+* *prologuedata*: If non-zero, the value index of the prologue data for this function,
   plus 1.
 
 * *dllstorageclass*: An encoding of the
   :ref:`dllstorageclass<bcdllstorageclass>` of this function
 
+* *comdat*: An encoding of the COMDAT of this function
+
+* *prefixdata*: If non-zero, the value index of the prefix data for this function,
+  plus 1.
+
+
 MODULE_CODE_ALIAS Record
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/docs/BitSets.rst b/docs/BitSets.rst
new file mode 100644
index 0000000..c6ffdbd
--- /dev/null
+++ b/docs/BitSets.rst
@@ -0,0 +1,70 @@
+=======
+Bitsets
+=======
+
+This is a mechanism that allows IR modules to co-operatively build pointer
+sets corresponding to addresses within a given set of globals. One example
+of a use case for this is to allow a C++ program to efficiently verify (at
+each call site) that a vtable pointer is in the set of valid vtable pointers
+for the type of the class or its derived classes.
+
+To use the mechanism, a client creates a global metadata node named
+``llvm.bitsets``.  Each element is a metadata node with three elements:
+the first is a metadata string containing an identifier for the bitset,
+the second is a global variable and the third is a byte offset into the
+global variable.
+
+This will cause a link-time optimization pass to generate bitsets from the
+memory addresses referenced from the elements of the bitset metadata. The pass
+will lay out the referenced globals consecutively, so their definitions must
+be available at LTO time. The `GlobalLayoutBuilder`_ class is responsible for
+laying out the globals efficiently to minimize the sizes of the underlying
+bitsets. An intrinsic, :ref:`llvm.bitset.test <bitset.test>`, generates code
+to test whether a given pointer is a member of a bitset.
+
+:Example:
+
+::
+
+    target datalayout = "e-p:32:32"
+
+    @a = internal global i32 0
+    @b = internal global i32 0
+    @c = internal global i32 0
+    @d = internal global [2 x i32] [i32 0, i32 0]
+
+    !llvm.bitsets = !{!0, !1, !2, !3, !4}
+
+    !0 = !{!"bitset1", i32* @a, i32 0}
+    !1 = !{!"bitset1", i32* @b, i32 0}
+    !2 = !{!"bitset2", i32* @b, i32 0}
+    !3 = !{!"bitset2", i32* @c, i32 0}
+    !4 = !{!"bitset2", i32* @d, i32 4}
+
+    declare i1 @llvm.bitset.test(i8* %ptr, metadata %bitset) nounwind readnone
+
+    define i1 @foo(i32* %p) {
+      %pi8 = bitcast i32* %p to i8*
+      %x = call i1 @llvm.bitset.test(i8* %pi8, metadata !"bitset1")
+      ret i1 %x
+    }
+
+    define i1 @bar(i32* %p) {
+      %pi8 = bitcast i32* %p to i8*
+      %x = call i1 @llvm.bitset.test(i8* %pi8, metadata !"bitset2")
+      ret i1 %x
+    }
+
+    define void @main() {
+      %a1 = call i1 @foo(i32* @a) ; returns 1
+      %b1 = call i1 @foo(i32* @b) ; returns 1
+      %c1 = call i1 @foo(i32* @c) ; returns 0
+      %a2 = call i1 @bar(i32* @a) ; returns 0
+      %b2 = call i1 @bar(i32* @b) ; returns 1
+      %c2 = call i1 @bar(i32* @c) ; returns 1
+      %d02 = call i1 @bar(i32* getelementptr ([2 x i32]* @d, i32 0, i32 0)) ; returns 0
+      %d12 = call i1 @bar(i32* getelementptr ([2 x i32]* @d, i32 0, i32 1)) ; returns 1
+      ret void
+    }
+
+.. _GlobalLayoutBuilder: http://llvm.org/klaus/llvm/blob/master/include/llvm/Transforms/IPO/LowerBitSets.h
diff --git a/docs/CMake.rst b/docs/CMake.rst
index 653fa16..4d96466 100644
--- a/docs/CMake.rst
+++ b/docs/CMake.rst
@@ -26,7 +26,7 @@ Quick start
 We use here the command-line, non-interactive CMake interface.
 
 #. `Download <http://www.cmake.org/cmake/resources/software.html>`_ and install
-   CMake. Version 2.8 is the minimum required.
+   CMake. Version 2.8.8 is the minimum required.
 
 #. Open a shell. Your development tools must be reachable from this shell
    through the PATH environment variable.
@@ -59,6 +59,36 @@ We use here the command-line, non-interactive CMake interface.
    environment variable, for instance. You can force CMake to use a given build
    tool, see the `Usage`_ section.
 
+#. After CMake has finished running, proceed to use IDE project files or start
+   the build from the build directory:
+
+   .. code-block:: console
+
+     $ cmake --build .
+
+   The ``--build`` option tells ``cmake`` to invoke the underlying build
+   tool (``make``, ``ninja``, ``xcodebuild``, ``msbuild``, etc).
+
+   The underlying build tool can be invoked directly either of course, but
+   the ``--build`` option is portable.
+
+#. After LLVM has finished building, install it from the build directory:
+
+   .. code-block:: console
+
+     $ cmake --build . --target install
+
+   The ``--target`` option with ``install`` parameter in addition to
+   the ``--build`` option tells ``cmake`` to build the ``install`` target.
+
+   It is possible to set a different install prefix at installation time
+   by invoking the ``cmake_install.cmake`` script generated in the
+   build directory:
+
+   .. code-block:: console
+
+     $ cmake -DCMAKE_INSTALL_PREFIX=/tmp/llvm -P cmake_install.cmake
+
 .. _Basic CMake usage:
 .. _Usage:
 
@@ -215,8 +245,8 @@ LLVM-specific variables
   Build in C++1y mode, if available. Defaults to OFF.
 
 **LLVM_ENABLE_ASSERTIONS**:BOOL
-  Enables code assertions. Defaults to OFF if and only if ``CMAKE_BUILD_TYPE``
-  is *Release*.
+  Enables code assertions. Defaults to ON if and only if ``CMAKE_BUILD_TYPE``
+  is *Debug*.
 
 **LLVM_ENABLE_EH**:BOOL
   Build LLVM with exception handling support. This is necessary if you wish to
@@ -234,7 +264,7 @@ LLVM-specific variables
   Enable all compiler warnings. Defaults to ON.
 
 **LLVM_ENABLE_PEDANTIC**:BOOL
-  Enable pedantic mode. This disable compiler specific extensions, is
+  Enable pedantic mode. This disables compiler specific extensions, if
   possible. Defaults to ON.
 
 **LLVM_ENABLE_WERROR**:BOOL
@@ -316,8 +346,8 @@ LLVM-specific variables
   otherwise this has no effect.
 
 **LLVM_DOXYGEN_QCH_FILENAME**:STRING
-  The filename of the Qt Compressed Help file that will be genrated when
-  ``-DLLVM_ENABLE_DOXYGEN=ON`` and 
+  The filename of the Qt Compressed Help file that will be generated when
+  ``-DLLVM_ENABLE_DOXYGEN=ON`` and
   ``-DLLVM_ENABLE_DOXYGEN_QT_HELP=ON`` are given. Defaults to
   ``org.llvm.qch``.
   This option is only useful in combination with
@@ -330,7 +360,7 @@ LLVM-specific variables
   for more information. Defaults to "org.llvm". This option is only useful in
   combination with ``-DLLVM_ENABLE_DOXYGEN_QT_HELP=ON``; otherwise
   this has no effect.
-    
+
 **LLVM_DOXYGEN_QHP_CUST_FILTER_NAME**:STRING
   See `Qt Help Project`_ for
   more information. Defaults to the CMake variable ``${PACKAGE_STRING}`` which
@@ -429,7 +459,7 @@ and uses them to build a simple application ``simple-tool``.
   add_definitions(${LLVM_DEFINITIONS})
 
   # Now build our tools
-  add_excutable(simple-tool tool.cpp)
+  add_executable(simple-tool tool.cpp)
 
   # Find the libraries that correspond to the LLVM components
   # that we wish to use
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index d310a0a..da27627 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -104,3 +104,46 @@ if (LLVM_ENABLE_SPHINX)
 
   endif()
 endif()
+
+list(FIND LLVM_BINDINGS_LIST ocaml uses_ocaml)
+if( NOT uses_ocaml LESS 0 )
+  set(doc_targets
+        ocaml_llvm
+        ocaml_llvm_all_backends
+        ocaml_llvm_analysis
+        ocaml_llvm_bitreader
+        ocaml_llvm_bitwriter
+        ocaml_llvm_executionengine
+        ocaml_llvm_irreader
+        ocaml_llvm_linker
+        ocaml_llvm_target
+        ocaml_llvm_ipo
+        ocaml_llvm_passmgr_builder
+        ocaml_llvm_scalar_opts
+        ocaml_llvm_transform_utils
+        ocaml_llvm_vectorize
+      )
+
+  foreach(llvm_target ${LLVM_TARGETS_TO_BUILD})
+    list(APPEND doc_targets ocaml_llvm_${llvm_target})
+  endforeach()
+
+  set(odoc_files)
+  foreach( doc_target ${doc_targets} )
+    get_target_property(odoc_file ${doc_target} OCAML_ODOC)
+    list(APPEND odoc_files -load ${odoc_file})
+  endforeach()
+
+  add_custom_target(ocaml_doc
+    COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html
+    COMMAND ${OCAMLFIND} ocamldoc -d ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html
+                                  -sort -colorize-code -html ${odoc_files})
+
+  add_dependencies(ocaml_doc ${doc_targets})
+
+  if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
+    install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html
+      DESTINATION docs/ocaml/html)
+  endif()
+endif()
diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst
index b0a1059..75d40db 100644
--- a/docs/CodeGenerator.rst
+++ b/docs/CodeGenerator.rst
@@ -464,7 +464,7 @@ code:
   mov %EAX, %EDX
   sar %EDX, 31
   idiv %ECX
-  ret 
+  ret
 
 This approach is extremely general (if it can handle the X86 architecture, it
 can handle anything!) and allows all of the target specific knowledge about the
@@ -848,6 +848,10 @@ is based on the final SelectionDAG, with nodes that must be scheduled together
 bundled into a single scheduling-unit node, and with immediate operands and
 other nodes that aren't relevant for scheduling omitted.
 
+The option ``-filter-view-dags`` allows to select the name of the basic block
+that you are interested to visualize and filters all the previous
+``view-*-dags`` options.
+
 .. _Build initial DAG:
 
 Initial SelectionDAG Construction
@@ -1336,7 +1340,7 @@ found before being stored or after being reloaded.
 If the indirect strategy is used, after all the virtual registers have been
 mapped to physical registers or stack slots, it is necessary to use a spiller
 object to place load and store instructions in the code. Every virtual that has
-been mapped to a stack slot will be stored to memory after been defined and will
+been mapped to a stack slot will be stored to memory after being defined and will
 be loaded before being used. The implementation of the spiller tries to recycle
 load/store instructions, avoiding unnecessary instructions. For an example of
 how to invoke the spiller, see ``RegAllocLinearScan::runOnMachineFunction`` in
@@ -1349,7 +1353,7 @@ With very rare exceptions (e.g., function calls), the LLVM machine code
 instructions are three address instructions. That is, each instruction is
 expected to define at most one register, and to use at most two registers.
 However, some architectures use two address instructions. In this case, the
-defined register is also one of the used register. For instance, an instruction
+defined register is also one of the used registers. For instance, an instruction
 such as ``ADD %EAX, %EBX``, in X86 is actually equivalent to ``%EAX = %EAX +
 %EBX``.
 
@@ -1574,7 +1578,7 @@ three important things that you have to implement for your target:
    correspond to. The MCInsts that are generated by this are fed into the
    instruction printer or the encoder.
 
-Finally, at your choosing, you can also implement an subclass of MCCodeEmitter
+Finally, at your choosing, you can also implement a subclass of MCCodeEmitter
 which lowers MCInst's into machine code bytes and relocations.  This is
 important if you want to support direct .o file emission, or would like to
 implement an assembler for your target.
diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst
index 0552c71..221c431 100644
--- a/docs/CodingStandards.rst
+++ b/docs/CodingStandards.rst
@@ -83,7 +83,7 @@ Supported C++11 Language and Library Features
 
 While LLVM, Clang, and LLD use C++11, not all features are available in all of
 the toolchains which we support. The set of features supported for use in LLVM
-is the intersection of those supported in MSVC 2012, GCC 4.7, and Clang 3.1.
+is the intersection of those supported in MSVC 2013, GCC 4.7, and Clang 3.1.
 The ultimate definition of this set is what build bots with those respective
 toolchains accept. Don't argue with the build bots. However, we have some
 guidance below to help you know what to expect.
@@ -123,6 +123,12 @@ unlikely to be supported by our host compilers.
 
 * ``override`` and ``final``: N2928_, N3206_, N3272_
 * Atomic operations and the C++11 memory model: N2429_
+* Variadic templates: N2242_
+* Explicit conversion operators: N2437_
+* Defaulted and deleted functions: N2346_
+
+  * But not defaulted move constructors or move assignment operators, MSVC 2013
+    cannot synthesize them.
 
 .. _N2118: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2006/n2118.html
 .. _N2439: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2439.htm
@@ -143,6 +149,9 @@ unlikely to be supported by our host compilers.
 .. _N3206: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2010/n3206.htm
 .. _N3272: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3272.htm
 .. _N2429: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2429.htm
+.. _N2242: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2242.pdf
+.. _N2437: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2437.pdf
+.. _N2346: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2346.htm
 .. _MSVC-compatible RTTI: http://llvm.org/PR18951
 
 The supported features in the C++11 standard libraries are less well tracked,
@@ -251,7 +260,8 @@ The next section in the file is a concise note that defines the license that the
 file is released under.  This makes it perfectly clear what terms the source
 code can be distributed under and should not be modified in any way.
 
-The main body is a ``doxygen`` comment describing the purpose of the file.  It
+The main body is a ``doxygen`` comment (identified by the ``///`` comment
+marker instead of the usual ``//``) describing the purpose of the file.  It
 should have a ``\brief`` command that describes the file in one or two
 sentences.  Any additional information should be separated by a blank line.  If
 an algorithm is being implemented or something tricky is going on, a reference
@@ -281,7 +291,8 @@ happens: does the method return null?  Abort?  Format your hard disk?
 Comment Formatting
 ^^^^^^^^^^^^^^^^^^
 
-In general, prefer C++ style (``//``) comments.  They take less space, require
+In general, prefer C++ style comments (``//`` for normal comments, ``///`` for
+``doxygen`` documentation comments).  They take less space, require
 less typing, don't have nesting problems, etc.  There are a few cases when it is
 useful to use C style (``/* */``) comments however:
 
@@ -710,7 +721,7 @@ the symbol (e.g., MSVC).  This can lead to problems at link time.
   // Bar isn't POD, but it does look like a struct.
   struct Bar {
     int Data;
-    Foo() : Data(0) { }
+    Bar() : Data(0) { }
   };
 
 Do not use Braced Initializer Lists to Call a Constructor
diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst
index 2708e9d..9c63848 100644
--- a/docs/CommandGuide/lit.rst
+++ b/docs/CommandGuide/lit.rst
@@ -341,7 +341,7 @@ LOCAL CONFIGURATION FILES
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
 When :program:`lit` loads a subdirectory in a test suite, it instantiates a
-local test configuration by cloning the configuration for the parent direction
+local test configuration by cloning the configuration for the parent directory
 --- the root of this configuration chain will always be a test suite.  Once the
 test configuration is cloned :program:`lit` checks for a *lit.local.cfg* file
 in the subdirectory.  If present, this file will be loaded and can be used to
diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst
index a012c32..2dfdc9b 100644
--- a/docs/CompilerWriterInfo.rst
+++ b/docs/CompilerWriterInfo.rst
@@ -41,6 +41,8 @@ MIPS
 
 * `MIPS Processor Architecture <http://imgtec.com/mips/mips-architectures.asp>`_
 
+* `MIPS 64-bit ELF Object File Specification <http://techpubs.sgi.com/library/manuals/4000/007-4658-001/pdf/007-4658-001.pdf>`_
+
 PowerPC
 -------
 
diff --git a/docs/ExceptionHandling.rst b/docs/ExceptionHandling.rst
index 64edca7..696b50f 100644
--- a/docs/ExceptionHandling.rst
+++ b/docs/ExceptionHandling.rst
@@ -64,6 +64,21 @@ handling at the expense of slower execution when no exceptions are thrown. As
 exceptions are, by their nature, intended for uncommon code paths, DWARF
 exception handling is generally preferred to SJLJ.
 
+Windows Runtime Exception Handling
+-----------------------------------
+
+Windows runtime based exception handling uses the same basic IR structure as
+Itanium ABI based exception handling, but it relies on the personality
+functions provided by the native Windows runtime library, ``__CxxFrameHandler3``
+for C++ exceptions: ``__C_specific_handler`` for 64-bit SEH or 
+``_frame_handler3/4`` for 32-bit SEH.  This results in a very different
+execution model and requires some minor modifications to the initial IR
+representation and a significant restructuring just before code generation.
+
+General information about the Windows x64 exception handling mechanism can be
+found at `MSDN Exception Handling (x64)
+<https://msdn.microsoft.com/en-us/library/1eyas8tf(v=vs.80).aspx>`_.
+
 Overview
 --------
 
@@ -263,9 +278,9 @@ there are no catches or filters that require it to.
   exceptions and throws a third.
 
 When all cleanups are finished, if the exception is not handled by the current
-function, resume unwinding by calling the `resume
-instruction <LangRef.html#i_resume>`_, passing in the result of the
-``landingpad`` instruction for the original landing pad.
+function, resume unwinding by calling the :ref:`resume instruction <i_resume>`,
+passing in the result of the ``landingpad`` instruction for the original
+landing pad.
 
 Throw Filters
 -------------
@@ -306,6 +321,97 @@ the selector results they understand and then resume exception propagation with
 the `resume instruction <LangRef.html#i_resume>`_ if none of the conditions
 match.
 
+C++ Exception Handling using the Windows Runtime
+=================================================
+
+(Note: Windows C++ exception handling support is a work in progress and is
+ not yet fully implemented.  The text below describes how it will work
+ when completed.)
+
+The Windows runtime function for C++ exception handling uses a multi-phase
+approach.  When an exception occurs it searches the current callstack for a
+frame that has a handler for the exception.  If a handler is found, it then
+calls the cleanup handler for each frame above the handler which has a
+cleanup handler before calling the catch handler.  These calls are all made
+from a stack context different from the original frame in which the handler
+is defined.  Therefore, it is necessary to outline these handlers from their
+original context before code generation.
+
+Catch handlers are called with a pointer to the handler itself as the first
+argument and a pointer to the parent function's stack frame as the second
+argument.  The catch handler uses the `llvm.recoverframe
+<LangRef.html#llvm-frameallocate-and-llvm-framerecover-intrinsics>`_ to get a
+pointer to a frame allocation block that is created in the parent frame using
+the `llvm.allocateframe 
+<LangRef.html#llvm-frameallocate-and-llvm-framerecover-intrinsics>`_ intrinsic.
+The ``WinEHPrepare`` pass will have created a structure definition for the
+contents of this block.  The first two members of the structure will always be
+(1) a 32-bit integer that the runtime uses to track the exception state of the
+parent frame for the purposes of handling chained exceptions and (2) a pointer
+to the object associated with the exception (roughly, the parameter of the
+catch clause). These two members will be followed by any frame variables from
+the parent function which must be accessed in any of the functions unwind or
+catch handlers.  The catch handler returns the address at which execution
+should continue.
+
+Cleanup handlers perform any cleanup necessary as the frame goes out of scope,
+such as calling object destructors.  The runtime handles the actual unwinding
+of the stack.  If an exception occurs in a cleanup handler the runtime manages
+termination of the process. Cleanup handlers are called with the same arguments
+as catch handlers (a pointer to the handler and a pointer to the parent stack
+frame) and use the same mechanism described above to access frame variables
+in the parent function.  Cleanup handlers do not return a value.
+
+The IR generated for Windows runtime based C++ exception handling is initially
+very similar to the ``landingpad`` mechanism described above.  Calls to
+libc++abi functions (such as ``__cxa_begin_catch``/``__cxa_end_catch`` and
+``__cxa_throw_exception`` are replaced with calls to intrinsics or Windows
+runtime functions (such as ``llvm.eh.begincatch``/``llvm.eh.endcatch`` and
+``__CxxThrowException``).
+
+During the WinEHPrepare pass, the handler functions are outlined into handler
+functions and the original landing pad code is replaced with a call to the
+``llvm.eh.actions`` intrinsic that describes the order in which handlers will
+be processed from the logical location of the landing pad and an indirect
+branch to the return value of the ``llvm.eh.actions`` intrinsic. The
+``llvm.eh.actions`` intrinsic is defined as returning the address at which
+execution will continue.  This is a temporary construct which will be removed
+before code generation, but it allows for the accurate tracking of control
+flow until then.
+
+A typical landing pad will look like this after outlining:
+
+.. code-block:: llvm
+
+    lpad:
+      %vals = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+	      cleanup
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+          catch i8* bitcast (i8** @_ZTIf to i8*)
+      %recover = call i8* (...)* @llvm.eh.actions(
+          i32 3, i8* bitcast (i8** @_ZTIi to i8*), i8* (i8*, i8*)* @_Z4testb.catch.1)
+          i32 2, i8* null, void (i8*, i8*)* @_Z4testb.cleanup.1)
+          i32 1, i8* bitcast (i8** @_ZTIf to i8*), i8* (i8*, i8*)* @_Z4testb.catch.0)
+          i32 0, i8* null, void (i8*, i8*)* @_Z4testb.cleanup.0)
+      indirectbr i8* %recover, [label %try.cont1, label %try.cont2]
+
+In this example, the landing pad represents an exception handling context with
+two catch handlers and a cleanup handler that have been outlined.  If an
+exception is thrown with a type that matches ``_ZTIi``, the ``_Z4testb.catch.1``
+handler will be called an no clean-up is needed.  If an exception is thrown
+with a type that matches ``_ZTIf``, first the ``_Z4testb.cleanup.1`` handler
+will be called to perform unwind-related cleanup, then the ``_Z4testb.catch.1``
+handler will be called.  If an exception is throw which does not match either
+of these types and the exception is handled by another frame further up the
+call stack, first the ``_Z4testb.cleanup.1`` handler will be called, then the
+``_Z4testb.cleanup.0`` handler (which corresponds to a different scope) will be
+called, and exception handling will continue at the next frame in the call
+stack will be called.  One of the catch handlers will return the address of
+``%try.cont1`` in the parent function and the other will return the address of
+``%try.cont2``, meaning that execution continues at one of those blocks after
+an exception is caught.
+
+
 Exception Handling Intrinsics
 =============================
 
@@ -329,6 +435,70 @@ function.  This value can be used to compare against the result of
 
 Uses of this intrinsic are generated by the C++ front-end.
 
+.. _llvm.eh.begincatch:
+
+``llvm.eh.begincatch``
+----------------------
+
+.. code-block:: llvm
+
+  i8* @llvm.eh.begincatch(i8* %exn)
+
+
+This intrinsic marks the beginning of catch handling code within the blocks
+following a ``landingpad`` instruction.  The exact behavior of this function
+depends on the compilation target and the personality function associated
+with the ``landingpad`` instruction.
+
+The argument to this intrinsic is a pointer that was previously extracted from
+the aggregate return value of the ``landingpad`` instruction.  The return
+value of the intrinsic is a pointer to the exception object to be used by the
+catch code.  This pointer is returned as an ``i8*`` value, but the actual type
+of the object will depend on the exception that was thrown.
+
+Uses of this intrinsic are generated by the C++ front-end.  Many targets will
+use implementation-specific functions (such as ``__cxa_begin_catch``) instead
+of this intrinsic.  The intrinsic is provided for targets that require a more
+abstract interface.
+
+When used in the native Windows C++ exception handling implementation, this
+intrinsic serves as a placeholder to delimit code before a catch handler is
+outlined.  When the handler is is outlined, this intrinsic will be replaced
+by instructions that retrieve the exception object pointer from the frame
+allocation block.
+
+
+.. _llvm.eh.endcatch:
+
+``llvm.eh.endcatch``
+----------------------
+
+.. code-block:: llvm
+
+  void @llvm.eh.endcatch()
+
+
+This intrinsic marks the end of catch handling code within the current block,
+which will be a successor of a block which called ``llvm.eh.begincatch''.
+The exact behavior of this function depends on the compilation target and the
+personality function associated with the corresponding ``landingpad``
+instruction.
+
+There may be more than one call to ``llvm.eh.endcatch`` for any given call to
+``llvm.eh.begincatch`` with each ``llvm.eh.endcatch`` call corresponding to the
+end of a different control path.  All control paths following a call to
+``llvm.eh.begincatch`` must reach a call to ``llvm.eh.endcatch``.
+
+Uses of this intrinsic are generated by the C++ front-end.  Many targets will
+use implementation-specific functions (such as ``__cxa_begin_catch``) instead
+of this intrinsic.  The intrinsic is provided for targets that require a more
+abstract interface.
+
+When used in the native Windows C++ exception handling implementation, this
+intrinsic serves as a placeholder to delimit code before a catch handler is
+outlined.  After the handler is outlined, this intrinsic is simply removed.
+
+
 SJLJ Intrinsics
 ---------------
 
diff --git a/docs/ExtendingLLVM.rst b/docs/ExtendingLLVM.rst
index 60cbf01..2552c07 100644
--- a/docs/ExtendingLLVM.rst
+++ b/docs/ExtendingLLVM.rst
@@ -58,7 +58,7 @@ function and then be turned into an instruction if warranted.
    If it is possible to constant fold your intrinsic, add support to it in the
    ``canConstantFoldCallTo`` and ``ConstantFoldCall`` functions.
 
-#. ``llvm/test/Regression/*``:
+#. ``llvm/test/*``:
 
    Add test cases for your test cases to the test suite
 
@@ -164,10 +164,10 @@ complicated behavior in a single node (rotate).
 
 #. TODO: document complex patterns.
 
-#. ``llvm/test/Regression/CodeGen/*``:
+#. ``llvm/test/CodeGen/*``:
 
    Add test cases for your new node to the test suite.
-   ``llvm/test/Regression/CodeGen/X86/bswap.ll`` is a good example.
+   ``llvm/test/CodeGen/X86/bswap.ll`` is a good example.
 
 Adding a new instruction
 ========================
@@ -217,7 +217,7 @@ Adding a new instruction
 
    add support for your instruction to code generators, or add a lowering pass.
 
-#. ``llvm/test/Regression/*``:
+#. ``llvm/test/*``:
 
    add your test cases to the test suite.
 
diff --git a/docs/GarbageCollection.rst b/docs/GarbageCollection.rst
index 49d3496..a1557fc 100644
--- a/docs/GarbageCollection.rst
+++ b/docs/GarbageCollection.rst
@@ -1,13 +1,82 @@
 =====================================
-Accurate Garbage Collection with LLVM
+Garbage Collection with LLVM
 =====================================
 
 .. contents::
    :local:
 
+Abstract
+========
+
+This document covers how to integrate LLVM into a compiler for a language which
+supports garbage collection.  **Note that LLVM itself does not provide a 
+garbage collector.**  You must provide your own.  
+
+Quick Start
+============
+
+First, you should pick a collector strategy.  LLVM includes a number of built 
+in ones, but you can also implement a loadable plugin with a custom definition.
+Note that the collector strategy is a description of how LLVM should generate 
+code such that it interacts with your collector and runtime, not a description
+of the collector itself.
+
+Next, mark your generated functions as using your chosen collector strategy.  
+From c++, you can call: 
+
+.. code-block:: c++
+
+  F.setGC(<collector description name>);
+
+
+This will produce IR like the following fragment:
+
+.. code-block:: llvm
+
+  define void @foo() gc "<collector description name>" { ... }
+
+
+When generating LLVM IR for your functions, you will need to:
+
+* Use ``@llvm.gcread`` and/or ``@llvm.gcwrite`` in place of standard load and 
+  store instructions.  These intrinsics are used to represent load and store 
+  barriers.  If you collector does not require such barriers, you can skip 
+  this step.  
+
+* Use the memory allocation routines provided by your garbage collector's 
+  runtime library.
+
+* If your collector requires them, generate type maps according to your 
+  runtime's binary interface.  LLVM is not involved in the process.  In 
+  particular, the LLVM type system is not suitable for conveying such 
+  information though the compiler.
+
+* Insert any coordination code required for interacting with your collector.  
+  Many collectors require running application code to periodically check a
+  flag and conditionally call a runtime function.  This is often referred to 
+  as a safepoint poll.  
+
+You will need to identify roots (i.e. references to heap objects your collector 
+needs to know about) in your generated IR, so that LLVM can encode them into 
+your final stack maps.  Depending on the collector strategy chosen, this is 
+accomplished by using either the ``@llvm.gcroot`` intrinsics or an 
+``gc.statepoint`` relocation sequence. 
+
+Don't forget to create a root for each intermediate value that is generated when
+evaluating an expression.  In ``h(f(), g())``, the result of ``f()`` could 
+easily be collected if evaluating ``g()`` triggers a collection.
+
+Finally, you need to link your runtime library with the generated program 
+executable (for a static compiler) or ensure the appropriate symbols are 
+available for the runtime linker (for a JIT compiler).  
+
+
 Introduction
 ============
 
+What is Garbage Collection?
+---------------------------
+
 Garbage collection is a widely used technique that frees the programmer from
 having to know the lifetimes of heap objects, making software easier to produce
 and maintain.  Many programming languages rely on garbage collection for
@@ -59,31 +128,34 @@ instance, the intrinsics permit:
 
 * generational collectors
 
-* reference counting
-
 * incremental collectors
 
 * concurrent collectors
 
 * cooperative collectors
 
-We hope that the primitive support built into the LLVM IR is sufficient to
-support a broad class of garbage collected languages including Scheme, ML, Java,
-C#, Perl, Python, Lua, Ruby, other scripting languages, and more.
+* reference counting
+
+We hope that the support built into the LLVM IR is sufficient to support a 
+broad class of garbage collected languages including Scheme, ML, Java, C#, 
+Perl, Python, Lua, Ruby, other scripting languages, and more.
 
-However, LLVM does not itself provide a garbage collector --- this should be
-part of your language's runtime library.  LLVM provides a framework for compile
-time :ref:`code generation plugins <plugin>`.  The role of these plugins is to
+Note that LLVM **does not itself provide a garbage collector** --- this should
+be part of your language's runtime library.  LLVM provides a framework for
+describing the garbage collectors requirements to the compiler.  In particular,
+LLVM provides support for generating stack maps at call sites, polling for a 
+safepoint, and emitting load and store barriers.  You can also extend LLVM - 
+possibly through a loadable :ref:`code generation plugins <plugin>` - to
 generate code and data structures which conforms to the *binary interface*
 specified by the *runtime library*.  This is similar to the relationship between
 LLVM and DWARF debugging info, for example.  The difference primarily lies in
 the lack of an established standard in the domain of garbage collection --- thus
-the plugins.
+the need for a flexible extension mechanism.
 
 The aspects of the binary interface with which LLVM's GC support is
 concerned are:
 
-* Creation of GC-safe points within code where collection is allowed to execute
+* Creation of GC safepoints within code where collection is allowed to execute
   safely.
 
 * Computation of the stack map.  For each safe point in the code, object
@@ -111,205 +183,63 @@ There are additional areas that LLVM does not directly address:
 In general, LLVM's support for GC does not include features which can be
 adequately addressed with other features of the IR and does not specify a
 particular binary interface.  On the plus side, this means that you should be
-able to integrate LLVM with an existing runtime.  On the other hand, it leaves a
-lot of work for the developer of a novel language.  However, it's easy to get
-started quickly and scale up to a more sophisticated implementation as your
-compiler matures.
-
-Getting started
-===============
-
-Using a GC with LLVM implies many things, for example:
-
-* Write a runtime library or find an existing one which implements a GC heap.
-
-  #. Implement a memory allocator.
-
-  #. Design a binary interface for the stack map, used to identify references
-     within a stack frame on the machine stack.\*
-
-  #. Implement a stack crawler to discover functions on the call stack.\*
-
-  #. Implement a registry for global roots.
-
-  #. Design a binary interface for type maps, used to identify references
-     within heap objects.
-
-  #. Implement a collection routine bringing together all of the above.
-
-* Emit compatible code from your compiler.
-
-  * Initialization in the main function.
-
-  * Use the ``gc "..."`` attribute to enable GC code generation (or
-    ``F.setGC("...")``).
-
-  * Use ``@llvm.gcroot`` to mark stack roots.
-
-  * Use ``@llvm.gcread`` and/or ``@llvm.gcwrite`` to manipulate GC references,
-    if necessary.
-
-  * Allocate memory using the GC allocation routine provided by the runtime
-    library.
-
-  * Generate type maps according to your runtime's binary interface.
-
-* Write a compiler plugin to interface LLVM with the runtime library.\*
-
-  * Lower ``@llvm.gcread`` and ``@llvm.gcwrite`` to appropriate code
-    sequences.\*
-
-  * Compile LLVM's stack map to the binary form expected by the runtime.
-
-* Load the plugin into the compiler.  Use ``llc -load`` or link the plugin
-  statically with your language's compiler.\*
-
-* Link program executables with the runtime.
-
-To help with several of these tasks (those indicated with a \*), LLVM includes a
-highly portable, built-in ShadowStack code generator.  It is compiled into
-``llc`` and works even with the interpreter and C backends.
-
-In your compiler
-----------------
-
-To turn the shadow stack on for your functions, first call:
-
-.. code-block:: c++
-
-  F.setGC("shadow-stack");
-
-for each function your compiler emits. Since the shadow stack is built into
-LLVM, you do not need to load a plugin.
-
-Your compiler must also use ``@llvm.gcroot`` as documented.  Don't forget to
-create a root for each intermediate value that is generated when evaluating an
-expression.  In ``h(f(), g())``, the result of ``f()`` could easily be collected
-if evaluating ``g()`` triggers a collection.
-
-There's no need to use ``@llvm.gcread`` and ``@llvm.gcwrite`` over plain
-``load`` and ``store`` for now.  You will need them when switching to a more
-advanced GC.
-
-In your runtime
----------------
-
-The shadow stack doesn't imply a memory allocation algorithm.  A semispace
-collector or building atop ``malloc`` are great places to start, and can be
-implemented with very little code.
-
-When it comes time to collect, however, your runtime needs to traverse the stack
-roots, and for this it needs to integrate with the shadow stack.  Luckily, doing
-so is very simple. (This code is heavily commented to help you understand the
-data structure, but there are only 20 lines of meaningful code.)
-
-.. code-block:: c++
-
-  /// @brief The map for a single function's stack frame.  One of these is
-  ///        compiled as constant data into the executable for each function.
-  ///
-  /// Storage of metadata values is elided if the %metadata parameter to
-  /// @llvm.gcroot is null.
-  struct FrameMap {
-    int32_t NumRoots;    //< Number of roots in stack frame.
-    int32_t NumMeta;     //< Number of metadata entries.  May be < NumRoots.
-    const void *Meta[0]; //< Metadata for each root.
-  };
-
-  /// @brief A link in the dynamic shadow stack.  One of these is embedded in
-  ///        the stack frame of each function on the call stack.
-  struct StackEntry {
-    StackEntry *Next;    //< Link to next stack entry (the caller's).
-    const FrameMap *Map; //< Pointer to constant FrameMap.
-    void *Roots[0];      //< Stack roots (in-place array).
-  };
-
-  /// @brief The head of the singly-linked list of StackEntries.  Functions push
-  ///        and pop onto this in their prologue and epilogue.
-  ///
-  /// Since there is only a global list, this technique is not threadsafe.
-  StackEntry *llvm_gc_root_chain;
-
-  /// @brief Calls Visitor(root, meta) for each GC root on the stack.
-  ///        root and meta are exactly the values passed to
-  ///        @llvm.gcroot.
-  ///
-  /// Visitor could be a function to recursively mark live objects.  Or it
-  /// might copy them to another heap or generation.
-  ///
-  /// @param Visitor A function to invoke for every GC root on the stack.
-  void visitGCRoots(void (*Visitor)(void **Root, const void *Meta)) {
-    for (StackEntry *R = llvm_gc_root_chain; R; R = R->Next) {
-      unsigned i = 0;
-
-      // For roots [0, NumMeta), the metadata pointer is in the FrameMap.
-      for (unsigned e = R->Map->NumMeta; i != e; ++i)
-        Visitor(&R->Roots[i], R->Map->Meta[i]);
-
-      // For roots [NumMeta, NumRoots), the metadata pointer is null.
-      for (unsigned e = R->Map->NumRoots; i != e; ++i)
-        Visitor(&R->Roots[i], NULL);
-    }
-  }
-
-About the shadow stack
-----------------------
-
-Unlike many GC algorithms which rely on a cooperative code generator to compile
-stack maps, this algorithm carefully maintains a linked list of stack roots
-[:ref:`Henderson2002 <henderson02>`].  This so-called "shadow stack" mirrors the
-machine stack.  Maintaining this data structure is slower than using a stack map
-compiled into the executable as constant data, but has a significant portability
-advantage because it requires no special support from the target code generator,
-and does not require tricky platform-specific code to crawl the machine stack.
-
-The tradeoff for this simplicity and portability is:
-
-* High overhead per function call.
-
-* Not thread-safe.
-
-Still, it's an easy way to get started.  After your compiler and runtime are up
-and running, writing a :ref:`plugin <plugin>` will allow you to take advantage
-of :ref:`more advanced GC features <collector-algos>` of LLVM in order to
-improve performance.
+able to integrate LLVM with an existing runtime.  On the other hand, it can 
+have the effect of leaving a lot of work for the developer of a novel 
+language.  We try to mitigate this by providing built in collector strategy 
+descriptions that can work with many common collector designs and easy 
+extension points.  If you don't already have a specific binary interface 
+you need to support, we recommend trying to use one of these built in collector 
+strategies.
 
 .. _gc_intrinsics:
 
-IR features
-===========
+LLVM IR Features
+================
 
 This section describes the garbage collection facilities provided by the
 :doc:`LLVM intermediate representation <LangRef>`.  The exact behavior of these
-IR features is specified by the binary interface implemented by a :ref:`code
-generation plugin <plugin>`, not by this document.
-
-These facilities are limited to those strictly necessary; they are not intended
-to be a complete interface to any garbage collector.  A program will need to
-interface with the GC library using the facilities provided by that program.
+IR features is specified by the selected :ref:`GC strategy description 
+<plugin>`. 
 
 Specifying GC code generation: ``gc "..."``
 -------------------------------------------
 
 .. code-block:: llvm
 
-  define ty @name(...) gc "name" { ...
+  define <returntype> @name(...) gc "name" { ... }
 
-The ``gc`` function attribute is used to specify the desired GC style to the
+The ``gc`` function attribute is used to specify the desired GC strategy to the
 compiler.  Its programmatic equivalent is the ``setGC`` method of ``Function``.
 
-Setting ``gc "name"`` on a function triggers a search for a matching code
-generation plugin "*name*"; it is that plugin which defines the exact nature of
-the code generated to support GC.  If none is found, the compiler will raise an
-error.
+Setting ``gc "name"`` on a function triggers a search for a matching subclass
+of GCStrategy.  Some collector strategies are built in.  You can add others 
+using either the loadable plugin mechanism, or by patching your copy of LLVM.
+It is the selected GC strategy which defines the exact nature of the code 
+generated to support GC.  If none is found, the compiler will raise an error.
 
 Specifying the GC style on a per-function basis allows LLVM to link together
 programs that use different garbage collection algorithms (or none at all).
 
 .. _gcroot:
 
-Identifying GC roots on the stack: ``llvm.gcroot``
---------------------------------------------------
+Identifying GC roots on the stack
+----------------------------------
+
+LLVM currently supports two different mechanisms for describing references in
+compiled code at safepoints.  ``llvm.gcroot`` is the older mechanism; 
+``gc.statepoint`` has been added more recently.  At the moment, you can choose 
+either implementation (on a per :ref:`GC strategy <plugin>` basis).  Longer 
+term, we will probably either migrate away from ``llvm.gcroot`` entirely, or 
+substantially merge their implementations. Note that most new development 
+work is focused on ``gc.statepoint``.  
+
+Using ``gc.statepoint``
+^^^^^^^^^^^^^^^^^^^^^^^^
+:doc:`This page <Statepoints>` contains detailed documentation for 
+``gc.statepoint``. 
+
+Using ``llvm.gcwrite``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 .. code-block:: llvm
 
@@ -317,24 +247,27 @@ Identifying GC roots on the stack: ``llvm.gcroot``
 
 The ``llvm.gcroot`` intrinsic is used to inform LLVM that a stack variable
 references an object on the heap and is to be tracked for garbage collection.
-The exact impact on generated code is specified by a :ref:`compiler plugin
-<plugin>`.  All calls to ``llvm.gcroot`` **must** reside inside the first basic
-block.
+The exact impact on generated code is specified by the Function's selected 
+:ref:`GC strategy <plugin>`.  All calls to ``llvm.gcroot`` **must** reside 
+inside the first basic block.
 
-A compiler which uses mem2reg to raise imperative code using ``alloca`` into SSA
-form need only add a call to ``@llvm.gcroot`` for those variables which a
-pointers into the GC heap.
+The first argument **must** be a value referring to an alloca instruction or a
+bitcast of an alloca.  The second contains a pointer to metadata that should be
+associated with the pointer, and **must** be a constant or global value
+address.  If your target collector uses tags, use a null pointer for metadata.
+
+A compiler which performs manual SSA construction **must** ensure that SSA 
+values representing GC references are stored in to the alloca passed to the
+respective ``gcroot`` before every call site and reloaded after every call.  
+A compiler which uses mem2reg to raise imperative code using ``alloca`` into 
+SSA form need only add a call to ``@llvm.gcroot`` for those variables which 
+are pointers into the GC heap.  
 
 It is also important to mark intermediate values with ``llvm.gcroot``.  For
 example, consider ``h(f(), g())``.  Beware leaking the result of ``f()`` in the
 case that ``g()`` triggers a collection.  Note, that stack variables must be
 initialized and marked with ``llvm.gcroot`` in function's prologue.
 
-The first argument **must** be a value referring to an alloca instruction or a
-bitcast of an alloca.  The second contains a pointer to metadata that should be
-associated with the pointer, and **must** be a constant or global value
-address.  If your target collector uses tags, use a null pointer for metadata.
-
 The ``%metadata`` argument can be used to avoid requiring heap objects to have
 'isa' pointers or tag bits. [Appel89_, Goldberg91_, Tolmach94_] If specified,
 its value will be tracked along with the location of the pointer in the stack
@@ -407,12 +340,18 @@ pointer:
   %derived = getelementptr %object, i32 0, i32 2, i32 %n
 
 LLVM does not enforce this relationship between the object and derived pointer
-(although a :ref:`plugin <plugin>` might).  However, it would be an unusual
-collector that violated it.
+(although a particular :ref:`collector strategy <plugin>` might).  However, it
+would be an unusual collector that violated it.
+
+The use of these intrinsics is naturally optional if the target GC does not 
+require the corresponding barrier.  The GC strategy used with such a collector 
+should replace the intrinsic calls with the corresponding ``load`` or 
+``store`` instruction if they are used.
 
-The use of these intrinsics is naturally optional if the target GC does require
-the corresponding barrier.  Such a GC plugin will replace the intrinsic calls
-with the corresponding ``load`` or ``store`` instruction if they are used.
+One known deficiency with the current design is that the barrier intrinsics do 
+not include the size or alignment of the underlying operation performed.  It is 
+currently assumed that the operation is of pointer size and the alignment is
+assumed to be the target machine's default alignment.
 
 Write barrier: ``llvm.gcwrite``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -423,8 +362,8 @@ Write barrier: ``llvm.gcwrite``
 
 For write barriers, LLVM provides the ``llvm.gcwrite`` intrinsic function.  It
 has exactly the same semantics as a non-volatile ``store`` to the derived
-pointer (the third argument).  The exact code generated is specified by a
-compiler :ref:`plugin <plugin>`.
+pointer (the third argument).  The exact code generated is specified by the
+Function's selected :ref:`GC strategy <plugin>`.
 
 Many important algorithms require write barriers, including generational and
 concurrent collectors.  Additionally, write barriers could be used to implement
@@ -439,16 +378,189 @@ Read barrier: ``llvm.gcread``
 
 For read barriers, LLVM provides the ``llvm.gcread`` intrinsic function.  It has
 exactly the same semantics as a non-volatile ``load`` from the derived pointer
-(the second argument).  The exact code generated is specified by a
-:ref:`compiler plugin <plugin>`.
+(the second argument).  The exact code generated is specified by the Function's
+selected :ref:`GC strategy <plugin>`.
 
 Read barriers are needed by fewer algorithms than write barriers, and may have a
 greater performance impact since pointer reads are more frequent than writes.
 
 .. _plugin:
 
+.. _builtin-gc-strategies:
+
+Built In GC Strategies
+======================
+
+LLVM includes built in support for several varieties of garbage collectors.  
+
+The Shadow Stack GC
+----------------------
+
+To use this collector strategy, mark your functions with:
+
+.. code-block:: c++
+
+  F.setGC("shadow-stack");
+
+Unlike many GC algorithms which rely on a cooperative code generator to compile
+stack maps, this algorithm carefully maintains a linked list of stack roots
+[:ref:`Henderson2002 <henderson02>`].  This so-called "shadow stack" mirrors the
+machine stack.  Maintaining this data structure is slower than using a stack map
+compiled into the executable as constant data, but has a significant portability
+advantage because it requires no special support from the target code generator,
+and does not require tricky platform-specific code to crawl the machine stack.
+
+The tradeoff for this simplicity and portability is:
+
+* High overhead per function call.
+
+* Not thread-safe.
+
+Still, it's an easy way to get started.  After your compiler and runtime are up
+and running, writing a :ref:`plugin <plugin>` will allow you to take advantage
+of :ref:`more advanced GC features <collector-algos>` of LLVM in order to
+improve performance.
+
+
+The shadow stack doesn't imply a memory allocation algorithm.  A semispace
+collector or building atop ``malloc`` are great places to start, and can be
+implemented with very little code.
+
+When it comes time to collect, however, your runtime needs to traverse the stack
+roots, and for this it needs to integrate with the shadow stack.  Luckily, doing
+so is very simple. (This code is heavily commented to help you understand the
+data structure, but there are only 20 lines of meaningful code.)
+
+.. code-block:: c++
+
+  /// @brief The map for a single function's stack frame.  One of these is
+  ///        compiled as constant data into the executable for each function.
+  ///
+  /// Storage of metadata values is elided if the %metadata parameter to
+  /// @llvm.gcroot is null.
+  struct FrameMap {
+    int32_t NumRoots;    //< Number of roots in stack frame.
+    int32_t NumMeta;     //< Number of metadata entries.  May be < NumRoots.
+    const void *Meta[0]; //< Metadata for each root.
+  };
+
+  /// @brief A link in the dynamic shadow stack.  One of these is embedded in
+  ///        the stack frame of each function on the call stack.
+  struct StackEntry {
+    StackEntry *Next;    //< Link to next stack entry (the caller's).
+    const FrameMap *Map; //< Pointer to constant FrameMap.
+    void *Roots[0];      //< Stack roots (in-place array).
+  };
+
+  /// @brief The head of the singly-linked list of StackEntries.  Functions push
+  ///        and pop onto this in their prologue and epilogue.
+  ///
+  /// Since there is only a global list, this technique is not threadsafe.
+  StackEntry *llvm_gc_root_chain;
+
+  /// @brief Calls Visitor(root, meta) for each GC root on the stack.
+  ///        root and meta are exactly the values passed to
+  ///        @llvm.gcroot.
+  ///
+  /// Visitor could be a function to recursively mark live objects.  Or it
+  /// might copy them to another heap or generation.
+  ///
+  /// @param Visitor A function to invoke for every GC root on the stack.
+  void visitGCRoots(void (*Visitor)(void **Root, const void *Meta)) {
+    for (StackEntry *R = llvm_gc_root_chain; R; R = R->Next) {
+      unsigned i = 0;
+
+      // For roots [0, NumMeta), the metadata pointer is in the FrameMap.
+      for (unsigned e = R->Map->NumMeta; i != e; ++i)
+        Visitor(&R->Roots[i], R->Map->Meta[i]);
+
+      // For roots [NumMeta, NumRoots), the metadata pointer is null.
+      for (unsigned e = R->Map->NumRoots; i != e; ++i)
+        Visitor(&R->Roots[i], NULL);
+    }
+  }
+
+
+The 'Erlang' and 'Ocaml' GCs
+-----------------------------
+
+LLVM ships with two example collectors which leverage the ``gcroot`` 
+mechanisms.  To our knowledge, these are not actually used by any language 
+runtime, but they do provide a reasonable starting point for someone interested 
+in writing an ``gcroot`` compatible GC plugin.  In particular, these are the 
+only in tree examples of how to produce a custom binary stack map format using 
+a ``gcroot`` strategy.
+
+As there names imply, the binary format produced is intended to model that 
+used by the Erlang and OCaml compilers respectively.  
+
+
+The Statepoint Example GC
+-------------------------
+
+.. code-block:: c++
+
+  F.setGC("statepoint-example");
+
+This GC provides an example of how one might use the infrastructure provided 
+by ``gc.statepoint``. This example GC is compatible with the 
+:ref:`PlaceSafepoints` and :ref:`RewriteStatepointsForGC` utility passes 
+which simplify ``gc.statepoint`` sequence insertion. If you need to build a 
+custom GC strategy around the ``gc.statepoints`` mechanisms, it is recommended
+that you use this one as a starting point.
+
+This GC strategy does not support read or write barriers.  As a result, these 
+intrinsics are lowered to normal loads and stores.
+
+The stack map format generated by this GC strategy can be found in the 
+:ref:`stackmap-section` using a format documented :ref:`here 
+<statepoint-stackmap-format>`. This format is intended to be the standard 
+format supported by LLVM going forward.
+
+
+Custom GC Strategies
+====================
+
+If none of the built in GC strategy descriptions met your needs above, you will
+need to define a custom GCStrategy and possibly, a custom LLVM pass to perform 
+lowering.  Your best example of where to start defining a custom GCStrategy 
+would be to look at one of the built in strategies.
+
+You may be able to structure this additional code as a loadable plugin library.
+Loadable plugins are sufficient if all you need is to enable a different 
+combination of built in functionality, but if you need to provide a custom 
+lowering pass, you will need to build a patched version of LLVM.  If you think 
+you need a patched build, please ask for advice on llvm-dev.  There may be an 
+easy way we can extend the support to make it work for your use case without 
+requiring a custom build.  
+
+Collector Requirements
+----------------------
+
+You should be able to leverage any existing collector library that includes the following elements:
+
+#. A memory allocator which exposes an allocation function your compiled 
+   code can call.
+
+#. A binary format for the stack map.  A stack map describes the location
+   of references at a safepoint and is used by precise collectors to identify
+   references within a stack frame on the machine stack. Note that collectors
+   which conservatively scan the stack don't require such a structure.
+
+#. A stack crawler to discover functions on the call stack, and enumerate the
+   references listed in the stack map for each call site.  
+
+#. A mechanism for identifying references in global locations (e.g. global 
+   variables).
+
+#. If you collector requires them, an LLVM IR implementation of your collectors
+   load and store barriers.  Note that since many collectors don't require 
+   barriers at all, LLVM defaults to lowering such barriers to normal loads 
+   and stores unless you arrange otherwise.
+
+
 Implementing a collector plugin
-===============================
+-------------------------------
 
 User code specifies which GC code generation to use with the ``gc`` function
 attribute or, equivalently, with the ``setGC`` method of ``Function``.
@@ -721,8 +833,9 @@ this feature should be used by all GC plugins.  It is enabled by default.
 Custom lowering of intrinsics: ``CustomRoots``, ``CustomReadBarriers``, and ``CustomWriteBarriers``
 ---------------------------------------------------------------------------------------------------
 
-For GCs which use barriers or unusual treatment of stack roots, these flags
-allow the collector to perform arbitrary transformations of the LLVM IR:
+For GCs which use barriers or unusual treatment of stack roots, these 
+flags allow the collector to perform arbitrary transformations of the
+LLVM IR:
 
 .. code-block:: c++
 
@@ -733,70 +846,18 @@ allow the collector to perform arbitrary transformations of the LLVM IR:
       CustomReadBarriers = true;
       CustomWriteBarriers = true;
     }
-
-    virtual bool initializeCustomLowering(Module &M);
-    virtual bool performCustomLowering(Function &F);
   };
 
-If any of these flags are set, then LLVM suppresses its default lowering for the
-corresponding intrinsics and instead calls ``performCustomLowering``.
-
-LLVM's default action for each intrinsic is as follows:
-
-* ``llvm.gcroot``: Leave it alone.  The code generator must see it or the stack
-  map will not be computed.
-
-* ``llvm.gcread``: Substitute a ``load`` instruction.
-
-* ``llvm.gcwrite``: Substitute a ``store`` instruction.
-
-If ``CustomReadBarriers`` or ``CustomWriteBarriers`` are specified, then
-``performCustomLowering`` **must** eliminate the corresponding barriers.
+If any of these flags are set, LLVM suppresses its default lowering for
+the corresponding intrinsics.  Instead, you must provide a custom Pass
+which lowers the intrinsics as desired.  If you have opted in to custom
+lowering of a particular intrinsic your pass **must** eliminate all 
+instances of the corresponding intrinsic in functions which opt in to
+your GC.  The best example of such a pass is the ShadowStackGC and it's 
+ShadowStackGCLowering pass.  
 
-``performCustomLowering`` must comply with the same restrictions as
-:ref:`FunctionPass::runOnFunction <writing-an-llvm-pass-runOnFunction>`
-Likewise, ``initializeCustomLowering`` has the same semantics as
-:ref:`Pass::doInitialization(Module&)
-<writing-an-llvm-pass-doInitialization-mod>`
-
-The following can be used as a template:
-
-.. code-block:: c++
-
-  #include "llvm/IR/Module.h"
-  #include "llvm/IR/IntrinsicInst.h"
-
-  bool MyGC::initializeCustomLowering(Module &M) {
-    return false;
-  }
-
-  bool MyGC::performCustomLowering(Function &F) {
-    bool MadeChange = false;
-
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
-      for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; )
-        if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++))
-          if (Function *F = CI->getCalledFunction())
-            switch (F->getIntrinsicID()) {
-            case Intrinsic::gcwrite:
-              // Handle llvm.gcwrite.
-              CI->eraseFromParent();
-              MadeChange = true;
-              break;
-            case Intrinsic::gcread:
-              // Handle llvm.gcread.
-              CI->eraseFromParent();
-              MadeChange = true;
-              break;
-            case Intrinsic::gcroot:
-              // Handle llvm.gcroot.
-              CI->eraseFromParent();
-              MadeChange = true;
-              break;
-            }
-
-    return MadeChange;
-  }
+There is currently no way to register such a custom lowering pass 
+without building a custom copy of LLVM.
 
 .. _safe-points:
 
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index 316f1f7..fa55ece 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@@ -230,7 +230,7 @@ our build systems:
 
 * Clang 3.1
 * GCC 4.7
-* Visual Studio 2012
+* Visual Studio 2013
 
 Anything older than these toolchains *may* work, but will require forcing the
 build system with a special option and is not really a supported host platform.
@@ -280,7 +280,7 @@ Getting a Modern Host C++ Toolchain
 
 This section mostly applies to Linux and older BSDs. On Mac OS X, you should
 have a sufficiently modern Xcode, or you will likely need to upgrade until you
-do. On Windows, just use Visual Studio 2012 as the host compiler, it is
+do. On Windows, just use Visual Studio 2013 as the host compiler, it is
 explicitly supported and widely available. FreeBSD 10.0 and newer have a modern
 Clang as the system compiler.
 
diff --git a/docs/GettingStartedVS.rst b/docs/GettingStartedVS.rst
index fa20912..63e81f5 100644
--- a/docs/GettingStartedVS.rst
+++ b/docs/GettingStartedVS.rst
@@ -45,13 +45,13 @@ and software you will need.
 
 Hardware
 --------
-Any system that can adequately run Visual Studio 2012 is fine. The LLVM
+Any system that can adequately run Visual Studio 2013 is fine. The LLVM
 source tree and object files, libraries and executables will consume
 approximately 3GB.
 
 Software
 --------
-You will need Visual Studio 2012 or higher.
+You will need Visual Studio 2013 or higher.
 
 You will also need the `CMake <http://www.cmake.org/>`_ build system since it
 generates the project files you will use to build with.
diff --git a/docs/HowToSetUpLLVMStyleRTTI.rst b/docs/HowToSetUpLLVMStyleRTTI.rst
index 96275e7..3892994 100644
--- a/docs/HowToSetUpLLVMStyleRTTI.rst
+++ b/docs/HowToSetUpLLVMStyleRTTI.rst
@@ -40,14 +40,14 @@ RTTI for this class hierarchy:
      double SideLength;
    public:
      Square(double S) : SideLength(S) {}
-     double computeArea() /* override */;
+     double computeArea() override;
    };
 
    class Circle : public Shape {
      double Radius;
    public:
      Circle(double R) : Radius(R) {}
-     double computeArea() /* override */;
+     double computeArea() override;
    };
 
 The most basic working setup for LLVM-style RTTI requires the following
@@ -135,7 +135,7 @@ steps:
        public:
       -  Square(double S) : SideLength(S) {}
       +  Square(double S) : Shape(SK_Square), SideLength(S) {}
-         double computeArea() /* override */;
+         double computeArea() override;
        };
 
        class Circle : public Shape {
@@ -143,7 +143,7 @@ steps:
        public:
       -  Circle(double R) : Radius(R) {}
       +  Circle(double R) : Shape(SK_Circle), Radius(R) {}
-         double computeArea() /* override */;
+         double computeArea() override;
        };
 
 #. Finally, you need to inform LLVM's RTTI templates how to dynamically
@@ -175,7 +175,7 @@ steps:
          double SideLength;
        public:
          Square(double S) : Shape(SK_Square), SideLength(S) {}
-         double computeArea() /* override */;
+         double computeArea() override;
       +
       +  static bool classof(const Shape *S) {
       +    return S->getKind() == SK_Square;
@@ -186,7 +186,7 @@ steps:
          double Radius;
        public:
          Circle(double R) : Shape(SK_Circle), Radius(R) {}
-         double computeArea() /* override */;
+         double computeArea() override;
       +
       +  static bool classof(const Shape *S) {
       +    return S->getKind() == SK_Circle;
@@ -377,6 +377,20 @@ contract for ``classof`` is "return ``true`` if the dynamic type of the
 argument is-a ``C``".  As long as your implementation fulfills this
 contract, you can tweak and optimize it as much as you want.
 
+For example, LLVM-style RTTI can work fine in the presence of
+multiple-inheritance by defining an appropriate ``classof``.
+An example of this in practice is
+`Decl <http://clang.llvm.org/doxygen/classclang_1_1Decl.html>`_ vs.
+`DeclContext <http://clang.llvm.org/doxygen/classclang_1_1DeclContext.html>`_
+inside Clang.
+The ``Decl`` hierarchy is done very similarly to the example setup
+demonstrated in this tutorial.
+The key part is how to then incorporate ``DeclContext``: all that is needed
+is in ``bool DeclContext::classof(const Decl *)``, which asks the question
+"Given a ``Decl``, how can I determine if it is-a ``DeclContext``?".
+It answers this with a simple switch over the set of ``Decl`` "kinds", and
+returning true for ones that are known to be ``DeclContext``'s.
+
 .. TODO::
 
    Touch on some of the more advanced features, like ``isa_impl`` and
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 3b7d80b..a0e9b18 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -75,7 +75,7 @@ identifiers, for different purposes:
 #. Named values are represented as a string of characters with their
    prefix. For example, ``%foo``, ``@DivisionByZero``,
    ``%a.really.long.identifier``. The actual regular expression used is
-   '``[%@][a-zA-Z$._][a-zA-Z$._0-9]*``'. Identifiers that require other
+   '``[%@][-a-zA-Z$._][-a-zA-Z$._0-9]*``'. Identifiers that require other
    characters in their names can be surrounded with quotes. Special
    characters may be escaped using ``"\xx"`` where ``xx`` is the ASCII
    code for the character in hexadecimal. In this way, any character can
@@ -170,7 +170,7 @@ symbol table entries. Here is an example of the "hello world" module:
     }
 
     ; Named metadata
-    !0 = metadata !{i32 42, null, metadata !"string"}
+    !0 = !{i32 42, null, !"string"}
     !foo = !{!0}
 
 This example is made up of a :ref:`global variable <globalvars>` named
@@ -368,7 +368,7 @@ added in the future:
 
     The idea behind this convention is to support calls to runtime functions
     that have a hot path and a cold path. The hot path is usually a small piece
-    of code that doesn't many registers. The cold path might need to call out to
+    of code that doesn't use many registers. The cold path might need to call out to
     another function and therefore only needs to preserve the caller-saved
     registers, which haven't already been saved by the caller. The
     `PreserveMost` calling convention is very similar to the `cold` calling
@@ -521,7 +521,7 @@ Global Variables
 Global variables define regions of memory allocated at compilation time
 instead of run-time.
 
-Global variables definitions must be initialized.
+Global variable definitions must be initialized.
 
 Global variables in other translation units can also be declared, in which
 case they don't have an initializer.
@@ -588,7 +588,7 @@ iteration. The maximum alignment is ``1 << 29``.
 
 Globals can also have a :ref:`DLL storage class <dllstorageclass>`.
 
-Variables and aliasaes can have a
+Variables and aliases can have a
 :ref:`Thread Local Storage Model <tls_model>`.
 
 Syntax::
@@ -596,7 +596,8 @@ Syntax::
     [@<GlobalVarName> =] [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal]
                          [unnamed_addr] [AddrSpace] [ExternallyInitialized]
                          <global | constant> <Type> [<InitializerConstant>]
-                         [, section "name"] [, align <Alignment>]
+                         [, section "name"] [, comdat [($name)]]
+                         [, align <Alignment>]
 
 For example, the following defines a global in a numbered address space
 with an initializer, section, and alignment:
@@ -633,7 +634,8 @@ name, a (possibly empty) argument list (each with optional :ref:`parameter
 attributes <paramattrs>`), optional :ref:`function attributes <fnattrs>`,
 an optional section, an optional alignment,
 an optional :ref:`comdat <langref_comdats>`,
-an optional :ref:`garbage collector name <gc>`, an optional :ref:`prefix <prefixdata>`, an opening
+an optional :ref:`garbage collector name <gc>`, an optional :ref:`prefix <prefixdata>`,
+an optional :ref:`prologue <prologuedata>`, an opening
 curly brace, a list of basic blocks, and a closing curly brace.
 
 LLVM function declarations consist of the "``declare``" keyword, an
@@ -643,7 +645,8 @@ an optional :ref:`calling convention <callingconv>`,
 an optional ``unnamed_addr`` attribute, a return type, an optional
 :ref:`parameter attribute <paramattrs>` for the return type, a function
 name, a possibly empty list of arguments, an optional alignment, an optional
-:ref:`garbage collector name <gc>` and an optional :ref:`prefix <prefixdata>`.
+:ref:`garbage collector name <gc>`, an optional :ref:`prefix <prefixdata>`,
+and an optional :ref:`prologue <prologuedata>`.
 
 A function definition contains a list of basic blocks, forming the CFG (Control
 Flow Graph) for the function. Each basic block may optionally start with a label
@@ -663,7 +666,7 @@ predecessors, it also cannot have any :ref:`PHI nodes <i_phi>`.
 
 LLVM allows an explicit section to be specified for functions. If the
 target supports it, it will emit functions to the section specified.
-Additionally, the function can placed in a COMDAT.
+Additionally, the function can be placed in a COMDAT.
 
 An explicit alignment may be specified for a function. If not present,
 or if the alignment is set to zero, the alignment of the function is set
@@ -671,7 +674,7 @@ by the target to whatever it feels convenient. If an explicit alignment
 is specified, the function is forced to have at least that much
 alignment. All alignments must be a power of 2.
 
-If the ``unnamed_addr`` attribute is given, the address is know to not
+If the ``unnamed_addr`` attribute is given, the address is known to not
 be significant and two identical functions can be merged.
 
 Syntax::
@@ -679,8 +682,8 @@ Syntax::
     define [linkage] [visibility] [DLLStorageClass]
            [cconv] [ret attrs]
            <ResultType> @<FunctionName> ([argument list])
-           [unnamed_addr] [fn Attrs] [section "name"] [comdat $<ComdatName>]
-           [align N] [gc] [prefix Constant] { ... }
+           [unnamed_addr] [fn Attrs] [section "name"] [comdat [($name)]]
+           [align N] [gc] [prefix Constant] [prologue Constant] { ... }
 
 The argument list is a comma seperated sequence of arguments where each
 argument is of the following form
@@ -713,7 +716,7 @@ The linkage must be one of ``private``, ``internal``, ``linkonce``, ``weak``,
 ``linkonce_odr``, ``weak_odr``, ``external``. Note that some system linkers
 might not correctly handle dropping a weak symbol that is aliased.
 
-Alias that are not ``unnamed_addr`` are guaranteed to have the same address as
+Aliases that are not ``unnamed_addr`` are guaranteed to have the same address as
 the aliasee expression. ``unnamed_addr`` ones are only guaranteed to point
 to the same content.
 
@@ -773,12 +776,21 @@ the COMDAT key's section is the largest:
 .. code-block:: llvm
 
    $foo = comdat largest
-   @foo = global i32 2, comdat $foo
+   @foo = global i32 2, comdat($foo)
 
-   define void @bar() comdat $foo {
+   define void @bar() comdat($foo) {
      ret void
    }
 
+As a syntactic sugar the ``$name`` can be omitted if the name is the same as
+the global name:
+
+.. code-block:: llvm
+
+  $foo = comdat any
+  @foo = global i32 2, comdat
+
+
 In a COFF object file, this will create a COMDAT section with selection kind
 ``IMAGE_COMDAT_SELECT_LARGEST`` containing the contents of the ``@foo`` symbol
 and another COMDAT section with selection kind
@@ -801,8 +813,8 @@ For example:
 
    $foo = comdat any
    $bar = comdat any
-   @g1 = global i32 42, section "sec", comdat $foo
-   @g2 = global i32 42, section "sec", comdat $bar
+   @g1 = global i32 42, section "sec", comdat($foo)
+   @g2 = global i32 42, section "sec", comdat($bar)
 
 From the object file perspective, this requires the creation of two sections
 with the same name.  This is necessary because both globals belong to different
@@ -825,9 +837,9 @@ operands for a named metadata.
 Syntax::
 
     ; Some unnamed metadata nodes, which are referenced by the named metadata.
-    !0 = metadata !{metadata !"zero"}
-    !1 = metadata !{metadata !"one"}
-    !2 = metadata !{metadata !"two"}
+    !0 = !{!"zero"}
+    !1 = !{!"one"}
+    !2 = !{!"two"}
     ; A named metadata.
     !name = !{!0, !1, !2}
 
@@ -941,23 +953,26 @@ Currently, only the following parameter attributes are defined:
 .. _noalias:
 
 ``noalias``
-    This indicates that pointer values :ref:`based <pointeraliasing>` on
-    the argument or return value do not alias pointer values that are
-    not *based* on it, ignoring certain "irrelevant" dependencies. For a
-    call to the parent function, dependencies between memory references
-    from before or after the call and from those during the call are
-    "irrelevant" to the ``noalias`` keyword for the arguments and return
-    value used in that call. The caller shares the responsibility with
-    the callee for ensuring that these requirements are met. For further
-    details, please see the discussion of the NoAlias response in :ref:`alias
-    analysis <Must, May, or No>`.
+    This indicates that objects accessed via pointer values
+    :ref:`based <pointeraliasing>` on the argument or return value are not also
+    accessed, during the execution of the function, via pointer values not
+    *based* on the argument or return value. The attribute on a return value
+    also has additional semantics described below. The caller shares the
+    responsibility with the callee for ensuring that these requirements are met.
+    For further details, please see the discussion of the NoAlias response in
+    :ref:`alias analysis <Must, May, or No>`.
 
     Note that this definition of ``noalias`` is intentionally similar
-    to the definition of ``restrict`` in C99 for function arguments,
-    though it is slightly weaker.
+    to the definition of ``restrict`` in C99 for function arguments.
 
     For function return values, C99's ``restrict`` is not meaningful,
-    while LLVM's ``noalias`` is.
+    while LLVM's ``noalias`` is. Furthermore, the semantics of the ``noalias``
+    attribute on return values are stronger than the semantics of the attribute
+    when used on function arguments. On function return values, the ``noalias``
+    attribute indicates that the function acts like a system memory allocation
+    function, returning a pointer to allocated storage disjoint from the
+    storage for any other object accessible to the caller.
+
 ``nocapture``
     This indicates that the callee does not make any copies of the
     pointer that outlive the callee itself. This is not a valid
@@ -999,66 +1014,101 @@ Currently, only the following parameter attributes are defined:
 
 .. _gc:
 
-Garbage Collector Names
------------------------
+Garbage Collector Strategy Names
+--------------------------------
 
-Each function may specify a garbage collector name, which is simply a
+Each function may specify a garbage collector strategy name, which is simply a
 string:
 
 .. code-block:: llvm
 
     define void @f() gc "name" { ... }
 
-The compiler declares the supported values of *name*. Specifying a
-collector will cause the compiler to alter its output in order to
-support the named garbage collection algorithm.
+The supported values of *name* includes those :ref:`built in to LLVM 
+<builtin-gc-strategies>` and any provided by loaded plugins.  Specifying a GC
+strategy will cause the compiler to alter its output in order to support the 
+named garbage collection algorithm.  Note that LLVM itself does not contain a 
+garbage collector, this functionality is restricted to generating machine code
+which can interoperate with a collector provided externally.  
 
 .. _prefixdata:
 
 Prefix Data
 -----------
 
-Prefix data is data associated with a function which the code generator
-will emit immediately before the function body.  The purpose of this feature
-is to allow frontends to associate language-specific runtime metadata with
-specific functions and make it available through the function pointer while
-still allowing the function pointer to be called.  To access the data for a
-given function, a program may bitcast the function pointer to a pointer to
-the constant's type.  This implies that the IR symbol points to the start
-of the prefix data.
+Prefix data is data associated with a function which the code
+generator will emit immediately before the function's entrypoint.
+The purpose of this feature is to allow frontends to associate
+language-specific runtime metadata with specific functions and make it
+available through the function pointer while still allowing the
+function pointer to be called.
+
+To access the data for a given function, a program may bitcast the
+function pointer to a pointer to the constant's type and dereference
+index -1.  This implies that the IR symbol points just past the end of
+the prefix data. For instance, take the example of a function annotated
+with a single ``i32``,
+
+.. code-block:: llvm
+
+    define void @f() prefix i32 123 { ... }
+
+The prefix data can be referenced as,
+
+.. code-block:: llvm
+
+    %0 = bitcast *void () @f to *i32
+    %a = getelementptr inbounds *i32 %0, i32 -1
+    %b = load i32* %a
+
+Prefix data is laid out as if it were an initializer for a global variable
+of the prefix data's type.  The function will be placed such that the
+beginning of the prefix data is aligned. This means that if the size
+of the prefix data is not a multiple of the alignment size, the
+function's entrypoint will not be aligned. If alignment of the
+function's entrypoint is desired, padding must be added to the prefix
+data.
+
+A function may have prefix data but no body.  This has similar semantics
+to the ``available_externally`` linkage in that the data may be used by the
+optimizers but will not be emitted in the object file.
+
+.. _prologuedata:
 
-To maintain the semantics of ordinary function calls, the prefix data must
+Prologue Data
+-------------
+
+The ``prologue`` attribute allows arbitrary code (encoded as bytes) to
+be inserted prior to the function body. This can be used for enabling
+function hot-patching and instrumentation.
+
+To maintain the semantics of ordinary function calls, the prologue data must
 have a particular format.  Specifically, it must begin with a sequence of
 bytes which decode to a sequence of machine instructions, valid for the
 module's target, which transfer control to the point immediately succeeding
-the prefix data, without performing any other visible action.  This allows
+the prologue data, without performing any other visible action.  This allows
 the inliner and other passes to reason about the semantics of the function
-definition without needing to reason about the prefix data.  Obviously this
-makes the format of the prefix data highly target dependent.
+definition without needing to reason about the prologue data.  Obviously this
+makes the format of the prologue data highly target dependent.
 
-Prefix data is laid out as if it were an initializer for a global variable
-of the prefix data's type.  No padding is automatically placed between the
-prefix data and the function body.  If padding is required, it must be part
-of the prefix data.
-
-A trivial example of valid prefix data for the x86 architecture is ``i8 144``,
+A trivial example of valid prologue data for the x86 architecture is ``i8 144``,
 which encodes the ``nop`` instruction:
 
 .. code-block:: llvm
 
-    define void @f() prefix i8 144 { ... }
+    define void @f() prologue i8 144 { ... }
 
-Generally prefix data can be formed by encoding a relative branch instruction
-which skips the metadata, as in this example of valid prefix data for the
+Generally prologue data can be formed by encoding a relative branch instruction
+which skips the metadata, as in this example of valid prologue data for the
 x86_64 architecture, where the first two bytes encode ``jmp .+10``:
 
 .. code-block:: llvm
 
     %0 = type <{ i8, i8, i8* }>
 
-    define void @f() prefix %0 <{ i8 235, i8 8, i8* @md}> { ... }
+    define void @f() prologue %0 <{ i8 235, i8 8, i8* @md}> { ... }
 
-A function may have prefix data but no body.  This has similar semantics
+A function may have prologue data but no body.  This has similar semantics
 to the ``available_externally`` linkage in that the data may be used by the
 optimizers but will not be emitted in the object file.
 
@@ -1189,9 +1239,12 @@ example:
     normally. This produces undefined behavior at runtime if the
     function ever does dynamically return.
 ``nounwind``
-    This function attribute indicates that the function never returns
-    with an unwind or exceptional control flow. If the function does
-    unwind, its runtime behavior is undefined.
+    This function attribute indicates that the function never raises an
+    exception. If the function does raise an exception, its runtime
+    behavior is undefined. However, functions marked nounwind may still
+    trap or generate asynchronous exceptions. Exception handling schemes
+    that are recognized by LLVM to handle asynchronous exceptions, such
+    as SEH, will still provide their implementation defined semantics.
 ``optnone``
     This function attribute indicates that the function is not optimized
     by any optimization or code generator passes with the
@@ -1732,7 +1785,7 @@ Fast-Math Flags
 
 LLVM IR floating-point binary ops (:ref:`fadd <i_fadd>`,
 :ref:`fsub <i_fsub>`, :ref:`fmul <i_fmul>`, :ref:`fdiv <i_fdiv>`,
-:ref:`frem <i_frem>`) have the following flags that can set to enable
+:ref:`frem <i_frem>`) have the following flags that can be set to enable
 otherwise unsafe floating point operations
 
 ``nnan``
@@ -2293,11 +2346,11 @@ constants and smaller complex constants.
     having to print large zero initializers (e.g. for large arrays) and
     is always exactly equivalent to using explicit zero initializers.
 **Metadata node**
-    A metadata node is a structure-like constant with :ref:`metadata
-    type <t_metadata>`. For example:
-    "``metadata !{ i32 0, metadata !"test" }``". Unlike other
-    constants that are meant to be interpreted as part of the
-    instruction stream, metadata is a place to attach additional
+    A metadata node is a constant tuple without types.  For example:
+    "``!{!0, !{!2, !0}, !"test"}``".  Metadata can reference constant values,
+    for example: "``!{!0, i32 0, i8* @global, i64 (i64)* @function, !"str"}``".
+    Unlike other typed constants that are meant to be interpreted as part of
+    the instruction stream, metadata is a place to attach additional
     information such as debug info.
 
 Global Variable and Function Addresses
@@ -2771,15 +2824,21 @@ occurs on.
 
 .. _metadata:
 
-Metadata Nodes and Metadata Strings
------------------------------------
+Metadata
+========
 
 LLVM IR allows metadata to be attached to instructions in the program
 that can convey extra information about the code to the optimizers and
 code generator. One example application of metadata is source-level
 debug information. There are two metadata primitives: strings and nodes.
-All metadata has the ``metadata`` type and is identified in syntax by a
-preceding exclamation point ('``!``').
+
+Metadata does not have a type, and is not a value.  If referenced from a
+``call`` instruction, it uses the ``metadata`` type.
+
+All metadata are identified in syntax by a exclamation point ('``!``').
+
+Metadata Nodes and Metadata Strings
+-----------------------------------
 
 A metadata string is a string surrounded by double quotes. It can
 contain any character by escaping non-printable characters with
@@ -2793,7 +2852,17 @@ their operand. For example:
 
 .. code-block:: llvm
 
-    !{ metadata !"test\00", i32 10}
+    !{ !"test\00", i32 10}
+
+Metadata nodes that aren't uniqued use the ``distinct`` keyword. For example:
+
+.. code-block:: llvm
+
+    !0 = distinct !{!"test\00", i32 10}
+
+``distinct`` nodes are useful when nodes shouldn't be merged based on their
+content.  They can also occur when transformations cause uniquing collisions
+when metadata operands change.
 
 A :ref:`named metadata <namedmetadatastructure>` is a collection of
 metadata nodes, which can be looked up in the module symbol table. For
@@ -2801,7 +2870,7 @@ example:
 
 .. code-block:: llvm
 
-    !foo =  metadata !{!4, !3}
+    !foo = !{!4, !3}
 
 Metadata can be used as function arguments. Here ``llvm.dbg.value``
 function is using two metadata arguments:
@@ -2820,6 +2889,23 @@ attached to the ``add`` instruction using the ``!dbg`` identifier:
 More information about specific metadata nodes recognized by the
 optimizers and code generator is found below.
 
+Specialized Metadata Nodes
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Specialized metadata nodes are custom data structures in metadata (as opposed
+to generic tuples).  Their fields are labelled, and can be specified in any
+order.
+
+MDLocation
+""""""""""
+
+``MDLocation`` nodes represent source debug locations.  The ``scope:`` field is
+mandatory.
+
+.. code-block:: llvm
+
+    !0 = !MDLocation(line: 2900, column: 42, scope: !1, inlinedAt: !2)
+
 '``tbaa``' Metadata
 ^^^^^^^^^^^^^^^^^^^
 
@@ -2834,10 +2920,10 @@ to three fields, e.g.:
 
 .. code-block:: llvm
 
-    !0 = metadata !{ metadata !"an example type tree" }
-    !1 = metadata !{ metadata !"int", metadata !0 }
-    !2 = metadata !{ metadata !"float", metadata !0 }
-    !3 = metadata !{ metadata !"const float", metadata !2, i64 1 }
+    !0 = !{ !"an example type tree" }
+    !1 = !{ !"int", !0 }
+    !2 = !{ !"float", !0 }
+    !3 = !{ !"const float", !2, i64 1 }
 
 The first field is an identity field. It can be any value, usually a
 metadata string, which uniquely identifies the type. The most important
@@ -2877,7 +2963,7 @@ its tbaa tag. e.g.:
 
 .. code-block:: llvm
 
-    !4 = metadata !{ i64 0, i64 4, metadata !1, i64 8, i64 4, metadata !2 }
+    !4 = !{ i64 0, i64 4, !1, i64 8, i64 4, !2 }
 
 This describes a struct with two fields. The first is at offset 0 bytes
 with size 4 bytes, and has tbaa tag !1. The second is at offset 8 bytes
@@ -2898,7 +2984,7 @@ collection of memory access instructions that carry ``alias.scope`` metadata.
 Each type of metadata specifies a list of scopes where each scope has an id and
 a domain. When evaluating an aliasing query, if for some some domain, the set
 of scopes with that domain in one instruction's ``alias.scope`` list is a
-subset of (or qual to) the set of scopes for that domain in another
+subset of (or equal to) the set of scopes for that domain in another
 instruction's ``noalias`` list, then the two memory accesses are assumed not to
 alias.
 
@@ -2920,18 +3006,18 @@ For example,
 .. code-block:: llvm
 
     ; Two scope domains:
-    !0 = metadata !{metadata !0}
-    !1 = metadata !{metadata !1}
+    !0 = !{!0}
+    !1 = !{!1}
 
     ; Some scopes in these domains:
-    !2 = metadata !{metadata !2, metadata !0}
-    !3 = metadata !{metadata !3, metadata !0}
-    !4 = metadata !{metadata !4, metadata !1}
+    !2 = !{!2, !0}
+    !3 = !{!3, !0}
+    !4 = !{!4, !1}
 
     ; Some scope lists:
-    !5 = metadata !{metadata !4} ; A list containing only scope !4
-    !6 = metadata !{metadata !4, metadata !3, metadata !2}
-    !7 = metadata !{metadata !3}
+    !5 = !{!4} ; A list containing only scope !4
+    !6 = !{!4, !3, !2}
+    !7 = !{!3}
 
     ; These two instructions don't alias:
     %0 = load float* %c, align 4, !alias.scope !5
@@ -2968,7 +3054,7 @@ number representing the maximum relative error, for example:
 
 .. code-block:: llvm
 
-    !0 = metadata !{ float 2.5 } ; maximum acceptable inaccuracy is 2.5 ULPs
+    !0 = !{ float 2.5 } ; maximum acceptable inaccuracy is 2.5 ULPs
 
 '``range``' Metadata
 ^^^^^^^^^^^^^^^^^^^^
@@ -3000,10 +3086,10 @@ Examples:
       %d = invoke i8 @bar() to label %cont
              unwind label %lpad, !range !3 ; Can only be -2, -1, 3, 4 or 5
     ...
-    !0 = metadata !{ i8 0, i8 2 }
-    !1 = metadata !{ i8 255, i8 2 }
-    !2 = metadata !{ i8 0, i8 2, i8 3, i8 6 }
-    !3 = metadata !{ i8 -2, i8 0, i8 3, i8 6 }
+    !0 = !{ i8 0, i8 2 }
+    !1 = !{ i8 255, i8 2 }
+    !2 = !{ i8 0, i8 2, i8 3, i8 6 }
+    !3 = !{ i8 -2, i8 0, i8 3, i8 6 }
 
 '``llvm.loop``'
 ^^^^^^^^^^^^^^^
@@ -3023,8 +3109,8 @@ constructs:
 
 .. code-block:: llvm
 
-    !0 = metadata !{ metadata !0 }
-    !1 = metadata !{ metadata !1 }
+    !0 = !{!0}
+    !1 = !{!1}
 
 The loop identifier metadata can be used to specify additional
 per-loop metadata. Any operands after the first operand can be treated
@@ -3035,8 +3121,8 @@ suggests an unroll factor to the loop unroller:
 
       br i1 %exitcond, label %._crit_edge, label %.lr.ph, !llvm.loop !0
     ...
-    !0 = metadata !{ metadata !0, metadata !1 }
-    !1 = metadata !{ metadata !"llvm.loop.unroll.count", i32 4 }
+    !0 = !{!0, !1}
+    !1 = !{!"llvm.loop.unroll.count", i32 4}
 
 '``llvm.loop.vectorize``' and '``llvm.loop.interleave``'
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -3061,7 +3147,7 @@ example:
 
 .. code-block:: llvm
 
-   !0 = metadata !{ metadata !"llvm.loop.interleave.count", i32 4 }
+   !0 = !{!"llvm.loop.interleave.count", i32 4}
 
 Note that setting ``llvm.loop.interleave.count`` to 1 disables interleaving
 multiple iterations of the loop.  If ``llvm.loop.interleave.count`` is set to 0
@@ -3077,8 +3163,8 @@ is a bit.  If the bit operand value is 1 vectorization is enabled. A value of
 
 .. code-block:: llvm
 
-   !0 = metadata !{ metadata !"llvm.loop.vectorize.enable", i1 0 }
-   !1 = metadata !{ metadata !"llvm.loop.vectorize.enable", i1 1 }
+   !0 = !{!"llvm.loop.vectorize.enable", i1 0}
+   !1 = !{!"llvm.loop.vectorize.enable", i1 1}
 
 '``llvm.loop.vectorize.width``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -3089,7 +3175,7 @@ operand is an integer specifying the width. For example:
 
 .. code-block:: llvm
 
-   !0 = metadata !{ metadata !"llvm.loop.vectorize.width", i32 4 }
+   !0 = !{!"llvm.loop.vectorize.width", i32 4}
 
 Note that setting ``llvm.loop.vectorize.width`` to 1 disables
 vectorization of the loop.  If ``llvm.loop.vectorize.width`` is set to
@@ -3116,7 +3202,7 @@ example:
 
 .. code-block:: llvm
 
-   !0 = metadata !{ metadata !"llvm.loop.unroll.count", i32 4 }
+   !0 = !{!"llvm.loop.unroll.count", i32 4}
 
 If the trip count of the loop is less than the unroll count the loop
 will be partially unrolled.
@@ -3129,7 +3215,7 @@ which is the string ``llvm.loop.unroll.disable``.  For example:
 
 .. code-block:: llvm
 
-   !0 = metadata !{ metadata !"llvm.loop.unroll.disable" }
+   !0 = !{!"llvm.loop.unroll.disable"}
 
 '``llvm.loop.unroll.full``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -3140,7 +3226,7 @@ For example:
 
 .. code-block:: llvm
 
-   !0 = metadata !{ metadata !"llvm.loop.unroll.full" }
+   !0 = !{!"llvm.loop.unroll.full"}
 
 '``llvm.mem``'
 ^^^^^^^^^^^^^^^
@@ -3191,7 +3277,7 @@ metadata types that refer to the same loop identifier metadata.
 
    for.end:
    ...
-   !0 = metadata !{ metadata !0 }
+   !0 = !{!0}
 
 It is also possible to have nested parallel loops. In that case the
 memory accesses refer to a list of loop identifier metadata nodes instead of
@@ -3221,9 +3307,15 @@ the loop identifier metadata node directly:
 
    outer.for.end:                                          ; preds = %for.body
    ...
-   !0 = metadata !{ metadata !1, metadata !2 } ; a list of loop identifiers
-   !1 = metadata !{ metadata !1 } ; an identifier for the inner loop
-   !2 = metadata !{ metadata !2 } ; an identifier for the outer loop
+   !0 = !{!1, !2} ; a list of loop identifiers
+   !1 = !{!1} ; an identifier for the inner loop
+   !2 = !{!2} ; an identifier for the outer loop
+
+'``llvm.bitsets``'
+^^^^^^^^^^^^^^^^^^
+
+The ``llvm.bitsets`` global metadata is used to implement
+:doc:`bitsets <BitSets>`.
 
 Module Flags Metadata
 =====================
@@ -3307,12 +3399,12 @@ An example of module flags:
 
 .. code-block:: llvm
 
-    !0 = metadata !{ i32 1, metadata !"foo", i32 1 }
-    !1 = metadata !{ i32 4, metadata !"bar", i32 37 }
-    !2 = metadata !{ i32 2, metadata !"qux", i32 42 }
-    !3 = metadata !{ i32 3, metadata !"qux",
-      metadata !{
-        metadata !"foo", i32 1
+    !0 = !{ i32 1, !"foo", i32 1 }
+    !1 = !{ i32 4, !"bar", i32 37 }
+    !2 = !{ i32 2, !"qux", i32 42 }
+    !3 = !{ i32 3, !"qux",
+      !{
+        !"foo", i32 1
       }
     }
     !llvm.module.flags = !{ !0, !1, !2, !3 }
@@ -3333,7 +3425,7 @@ An example of module flags:
 
    ::
 
-       metadata !{ metadata !"foo", i32 1 }
+       !{ !"foo", i32 1 }
 
    The behavior is to emit an error if the ``llvm.module.flags`` does not
    contain a flag with the ID ``!"foo"`` that has the value '1' after linking is
@@ -3409,10 +3501,10 @@ For example, the following metadata section specifies two separate sets of
 linker options, presumably to link against ``libz`` and the ``Cocoa``
 framework::
 
-    !0 = metadata !{ i32 6, metadata !"Linker Options",
-       metadata !{
-          metadata !{ metadata !"-lz" },
-          metadata !{ metadata !"-framework", metadata !"Cocoa" } } }
+    !0 = !{ i32 6, !"Linker Options",
+       !{
+          !{ !"-lz" },
+          !{ !"-framework", !"Cocoa" } } }
     !llvm.module.flags = !{ !0 }
 
 The metadata encoding as lists of lists of options, as opposed to a collapsed
@@ -3458,8 +3550,8 @@ compiled with a ``wchar_t`` width of 4 bytes, and the underlying type of an
 enum is the smallest type which can represent all of its values::
 
     !llvm.module.flags = !{!0, !1}
-    !0 = metadata !{i32 1, metadata !"short_wchar", i32 1}
-    !1 = metadata !{i32 1, metadata !"short_enum", i32 0}
+    !0 = !{i32 1, !"short_wchar", i32 1}
+    !1 = !{i32 1, !"short_enum", i32 0}
 
 .. _intrinsicglobalvariables:
 
@@ -5208,10 +5300,11 @@ as the ``MOVNT`` instruction on x86.
 The optional ``!invariant.load`` metadata must reference a single
 metadata name ``<index>`` corresponding to a metadata node with no
 entries. The existence of the ``!invariant.load`` metadata on the
-instruction tells the optimizer and code generator that this load
-address points to memory which does not change value during program
-execution. The optimizer may then move this load around, for example, by
-hoisting it out of loops using loop invariant code motion.
+instruction tells the optimizer and code generator that the address
+operand to this load points to memory which can be assumed unchanged.
+Being invariant does not imply that a location is dereferenceable, 
+but it does imply that once the location is known dereferenceable 
+its value is henceforth unchanging.  
 
 The optional ``!nonnull`` metadata must reference a single
 metadata name ``<index>`` corresponding to a metadata node with no
@@ -7016,18 +7109,28 @@ arbitrarily complex and require, for example, memory allocation.
 Accurate Garbage Collection Intrinsics
 --------------------------------------
 
-LLVM support for `Accurate Garbage Collection <GarbageCollection.html>`_
-(GC) requires the implementation and generation of these intrinsics.
+LLVM's support for `Accurate Garbage Collection <GarbageCollection.html>`_
+(GC) requires the frontend to generate code containing appropriate intrinsic 
+calls and select an appropriate GC strategy which knows how to lower these 
+intrinsics in a manner which is appropriate for the target collector.
+
 These intrinsics allow identification of :ref:`GC roots on the
 stack <int_gcroot>`, as well as garbage collector implementations that
 require :ref:`read <int_gcread>` and :ref:`write <int_gcwrite>` barriers.
-Front-ends for type-safe garbage collected languages should generate
+Frontends for type-safe garbage collected languages should generate
 these intrinsics to make use of the LLVM garbage collectors. For more
-details, see `Accurate Garbage Collection with
-LLVM <GarbageCollection.html>`_.
+details, see `Garbage Collection with LLVM <GarbageCollection.html>`_.
 
-The garbage collection intrinsics only operate on objects in the generic
-address space (address space zero).
+Experimental Statepoint Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+LLVM provides an second experimental set of intrinsics for describing garbage
+collection safepoints in compiled code.  These intrinsics are an alternative 
+to the ``llvm.gcroot`` intrinsics, but are compatible with the ones for 
+:ref:`read <int_gcread>` and :ref:`write <int_gcwrite>` barriers.  The 
+differences in approach are covered in the `Garbage Collection with LLVM 
+<GarbageCollection.html>`_ documentation.  The intrinsics themselves are 
+described in :doc:`Statepoints`.
 
 .. _int_gcroot:
 
@@ -7217,6 +7320,56 @@ Note that calling this intrinsic does not prevent function inlining or
 other aggressive transformations, so the value returned may not be that
 of the obvious source-language caller.
 
+'``llvm.frameallocate``' and '``llvm.framerecover``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i8* @llvm.frameallocate(i32 %size)
+      declare i8* @llvm.framerecover(i8* %func, i8* %fp)
+
+Overview:
+"""""""""
+
+The '``llvm.frameallocate``' intrinsic allocates stack memory at some fixed
+offset from the frame pointer, and the '``llvm.framerecover``'
+intrinsic applies that offset to a live frame pointer to recover the address of
+the allocation. The offset is computed during frame layout of the caller of
+``llvm.frameallocate``.
+
+Arguments:
+""""""""""
+
+The ``size`` argument to '``llvm.frameallocate``' must be a constant integer
+indicating the amount of stack memory to allocate. As with allocas, allocating
+zero bytes is legal, but the result is undefined.
+
+The ``func`` argument to '``llvm.framerecover``' must be a constant
+bitcasted pointer to a function defined in the current module. The code
+generator cannot determine the frame allocation offset of functions defined in
+other modules.
+
+The ``fp`` argument to '``llvm.framerecover``' must be a frame
+pointer of a call frame that is currently live. The return value of
+'``llvm.frameaddress``' is one way to produce such a value, but most platforms
+also expose the frame pointer through stack unwinding mechanisms.
+
+Semantics:
+""""""""""
+
+These intrinsics allow a group of functions to access one stack memory
+allocation in an ancestor stack frame. The memory returned from
+'``llvm.frameallocate``' may be allocated prior to stack realignment, so the
+memory is only aligned to the ABI-required stack alignment.  Each function may
+only call '``llvm.frameallocate``' one or zero times from the function entry
+block.  The frame allocation intrinsic inhibits inlining, as any frame
+allocations in the inlined function frame are likely to be at a different
+offset from the one used by '``llvm.framerecover``' called with the
+uninlined function.
+
 .. _int_read_register:
 .. _int_write_register:
 
@@ -7232,7 +7385,7 @@ Syntax:
       declare i64 @llvm.read_register.i64(metadata)
       declare void @llvm.write_register.i32(metadata, i32 @value)
       declare void @llvm.write_register.i64(metadata, i64 @value)
-      !0 = metadata !{metadata !"sp\00"}
+      !0 = !{!"sp\00"}
 
 Overview:
 """""""""
@@ -7454,6 +7607,50 @@ time library.
 This instrinsic does *not* empty the instruction pipeline. Modifications
 of the current function are outside the scope of the intrinsic.
 
+'``llvm.instrprof_increment``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare void @llvm.instrprof_increment(i8* <name>, i64 <hash>,
+                                             i32 <num-counters>, i32 <index>)
+
+Overview:
+"""""""""
+
+The '``llvm.instrprof_increment``' intrinsic can be emitted by a
+frontend for use with instrumentation based profiling. These will be
+lowered by the ``-instrprof`` pass to generate execution counts of a
+program at runtime.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to a global variable containing the
+name of the entity being instrumented. This should generally be the
+(mangled) function name for a set of counters.
+
+The second argument is a hash value that can be used by the consumer
+of the profile data to detect changes to the instrumented source, and
+the third is the number of counters associated with ``name``. It is an
+error if ``hash`` or ``num-counters`` differ between two instances of
+``instrprof_increment`` that refer to the same name.
+
+The last argument refers to which of the counters for ``name`` should
+be incremented. It should be a value between 0 and ``num-counters``.
+
+Semantics:
+""""""""""
+
+This intrinsic represents an increment of a profiling counter. It will
+cause the ``-instrprof`` pass to generate the appropriate data
+structures and the code to increment the appropriate value, in a
+format that can be written out by a compiler runtime and consumed via
+the ``llvm-profdata`` tool.
+
 Standard C Library Intrinsics
 -----------------------------
 
@@ -8499,7 +8696,7 @@ Arguments:
 """"""""""
 
 The first argument is the value to be counted. This argument may be of
-any integer type, or a vectory with integer element type. The return
+any integer type, or a vector with integer element type. The return
 type must match the first argument type.
 
 The second argument must be a constant and is a flag to indicate whether
@@ -8546,7 +8743,7 @@ Arguments:
 """"""""""
 
 The first argument is the value to be counted. This argument may be of
-any integer type, or a vectory with integer element type. The return
+any integer type, or a vector with integer element type. The return
 type must match the first argument type.
 
 The second argument must be a constant and is a flag to indicate whether
@@ -9142,6 +9339,93 @@ intrinsic returns the executable address corresponding to ``tramp``
 after performing the required machine specific adjustments. The pointer
 returned can then be :ref:`bitcast and executed <int_trampoline>`.
 
+Masked Vector Load and Store Intrinsics
+---------------------------------------
+
+LLVM provides intrinsics for predicated vector load and store operations. The predicate is specified by a mask operand, which holds one bit per vector element, switching the associated vector lane on or off. The memory addresses corresponding to the "off" lanes are not accessed. When all bits of the mask are on, the intrinsic is identical to a regular vector load or store. When all bits are off, no memory is accessed.
+
+.. _int_mload:
+
+'``llvm.masked.load.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The loaded data is a vector of any integer or floating point data type.
+
+::
+
+      declare <16 x float> @llvm.masked.load.v16f32 (<16 x float>* <ptr>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)
+      declare <2 x double> @llvm.masked.load.v2f64  (<2 x double>* <ptr>, i32 <alignment>, <2 x i1>  <mask>, <2 x double> <passthru>)
+
+Overview:
+"""""""""
+
+Reads a vector from memory according to the provided mask. The mask holds a bit for each vector lane, and is used to prevent memory accesses to the masked-off lanes. The masked-off lanes in the result vector are taken from the corresponding lanes in the passthru operand.
+
+
+Arguments:
+""""""""""
+
+The first operand is the base pointer for the load. The second operand is the alignment of the source location. It must be a constant integer value. The third operand, mask, is a vector of boolean 'i1' values with the same number of elements as the return type. The fourth is a pass-through value that is used to fill the masked-off lanes of the result. The return type, underlying type of the base pointer and the type of passthru operand are the same vector types.
+
+
+Semantics:
+""""""""""
+
+The '``llvm.masked.load``' intrinsic is designed for conditional reading of selected vector elements in a single IR operation. It is useful for targets that support vector masked loads and allows vectorizing predicated basic blocks on these targets. Other targets may support this intrinsic differently, for example by lowering it into a sequence of branches that guard scalar load operations.
+The result of this operation is equivalent to a regular vector load instruction followed by a 'select' between the loaded and the passthru values, predicated on the same mask. However, using this intrinsic prevents exceptions on memory access to masked-off lanes.
+
+
+::
+
+       %res = call <16 x float> @llvm.masked.load.v16f32 (<16 x float>* %ptr, i32 4, <16 x i1>%mask, <16 x float> %passthru)
+       
+       ;; The result of the two following instructions is identical aside from potential memory access exception
+       %loadlal = load <16 x float>* %ptr, align 4
+       %res = select <16 x i1> %mask, <16 x float> %loadlal, <16 x float> %passthru
+
+.. _int_mstore:
+
+'``llvm.masked.store.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The data stored in memory is a vector of any integer or floating point data type.
+
+::
+
+       declare void @llvm.masked.store.v8i32 (<8 x i32>  <value>, <8 x i32> * <ptr>, i32 <alignment>,  <8 x i1>  <mask>)
+       declare void @llvm.masked.store.v16f32(<16 x i32> <value>, <16 x i32>* <ptr>, i32 <alignment>,  <16 x i1> <mask>)
+
+Overview:
+"""""""""
+
+Writes a vector to memory according to the provided mask. The mask holds a bit for each vector lane, and is used to prevent memory accesses to the masked-off lanes.
+
+Arguments:
+""""""""""
+
+The first operand is the vector value to be written to memory. The second operand is the base pointer for the store, it has the same underlying type as the value operand. The third operand is the alignment of the destination location. The fourth operand, mask, is a vector of boolean values. The types of the mask and the value operand must have the same number of vector elements.
+
+
+Semantics:
+""""""""""
+
+The '``llvm.masked.store``' intrinsics is designed for conditional writing of selected vector elements in a single IR operation. It is useful for targets that support vector masked store and allows vectorizing predicated basic blocks on these targets. Other targets may support this intrinsic differently, for example by lowering it into a sequence of branches that guard scalar store operations.
+The result of this operation is equivalent to a load-modify-store sequence. However, using this intrinsic prevents exceptions and data races on memory access to masked-off lanes.
+
+::
+
+       call void @llvm.masked.store.v16f32(<16 x float> %value, <16 x float>* %ptr, i32 4,  <16 x i1> %mask)
+       
+       ;; The result of the following instructions is identical aside from potential data races and memory access exceptions
+       %oldval = load <16 x float>* %ptr, align 4
+       %res = select <16 x i1> %mask, <16 x float> %value, <16 x float> %oldval
+       store <16 x float> %res, <16 x float>* %ptr, align 4
+
+
 Memory Use Markers
 ------------------
 
@@ -9617,15 +9901,40 @@ generated for this intrinsic, and instructions that contribute only to the
 provided condition are not used for code generation. If the condition is
 violated during execution, the behavior is undefined.
 
-Please note that optimizer might limit the transformations performed on values
+Note that the optimizer might limit the transformations performed on values
 used by the ``llvm.assume`` intrinsic in order to preserve the instructions
 only used to form the intrinsic's input argument. This might prove undesirable
-if the extra information provided by the ``llvm.assume`` intrinsic does cause
+if the extra information provided by the ``llvm.assume`` intrinsic does not cause
 sufficient overall improvement in code quality. For this reason,
 ``llvm.assume`` should not be used to document basic mathematical invariants
 that the optimizer can otherwise deduce or facts that are of little use to the
 optimizer.
 
+.. _bitset.test:
+
+'``llvm.bitset.test``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i1 @llvm.bitset.test(i8* %ptr, metadata %bitset) nounwind readnone
+
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to be tested. The second argument is a
+metadata string containing the name of a :doc:`bitset <BitSets>`.
+
+Overview:
+"""""""""
+
+The ``llvm.bitset.test`` intrinsic tests whether the given pointer is a
+member of the given bitset.
+
 '``llvm.donothing``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/docs/LinkTimeOptimization.rst b/docs/LinkTimeOptimization.rst
index c15abd3..55a7486 100644
--- a/docs/LinkTimeOptimization.rst
+++ b/docs/LinkTimeOptimization.rst
@@ -134,7 +134,7 @@ Alternative Approaches
 Multi-phase communication between ``libLTO`` and linker
 =======================================================
 
-The linker collects information about symbol defininitions and uses in various
+The linker collects information about symbol definitions and uses in various
 link objects which is more accurate than any information collected by other
 tools during typical build cycles.  The linker collects this information by
 looking at the definitions and uses of symbols in native .o files and using
diff --git a/docs/MergeFunctions.rst b/docs/MergeFunctions.rst
new file mode 100644
index 0000000..6b8012e
--- /dev/null
+++ b/docs/MergeFunctions.rst
@@ -0,0 +1,802 @@
+=================================
+MergeFunctions pass, how it works
+=================================
+
+.. contents::
+   :local:
+
+Introduction
+============
+Sometimes code contains equal functions, or functions that does exactly the same
+thing even though they are non-equal on the IR level (e.g.: multiplication on 2
+and 'shl 1'). It could happen due to several reasons: mainly, the usage of
+templates and automatic code generators. Though, sometimes user itself could
+write the same thing twice :-)
+
+The main purpose of this pass is to recognize such functions and merge them.
+
+Why would I want to read this document?
+---------------------------------------
+Document is the extension to pass comments and describes the pass logic. It
+describes algorithm that is used in order to compare functions, it also
+explains how we could combine equal functions correctly, keeping module valid.
+
+Material is brought in top-down form, so reader could start learn pass from
+ideas and end up with low-level algorithm details, thus preparing him for
+reading the sources.
+
+So main goal is do describe algorithm and logic here; the concept. This document
+is good for you, if you *don't want* to read the source code, but want to
+understand pass algorithms. Author tried not to repeat the source-code and
+cover only common cases, and thus avoid cases when after minor code changes we
+need to update this document.
+
+
+What should I know to be able to follow along with this document?
+-----------------------------------------------------------------
+
+Reader should be familiar with common compile-engineering principles and LLVM
+code fundamentals. In this article we suppose reader is familiar with
+`Single Static Assingment <http://en.wikipedia.org/wiki/Static_single_assignment_form>`_
+concepts. Understanding of
+`IR structure <http://llvm.org/docs/LangRef.html#high-level-structure>`_ is
+also important.
+
+We will use such terms as
+"`module <http://llvm.org/docs/LangRef.html#high-level-structure>`_",
+"`function <http://llvm.org/docs/ProgrammersManual.html#the-function-class>`_",
+"`basic block <http://en.wikipedia.org/wiki/Basic_block>`_",
+"`user <http://llvm.org/docs/ProgrammersManual.html#the-user-class>`_",
+"`value <http://llvm.org/docs/ProgrammersManual.html#the-value-class>`_",
+"`instruction <http://llvm.org/docs/ProgrammersManual.html#the-instruction-class>`_".
+
+As a good start point, Kaleidoscope tutorial could be used:
+
+:doc:`tutorial/index`
+
+Especially it's important to understand chapter 3 of tutorial:
+
+:doc:`tutorial/LangImpl3`
+
+Reader also should know how passes work in LLVM, he could use next article as a
+reference and start point here:
+
+:doc:`WritingAnLLVMPass`
+
+What else? Well perhaps reader also should have some experience in LLVM pass
+debugging and bug-fixing.
+
+What I gain by reading this document?
+-------------------------------------
+Main purpose is to provide reader with comfortable form of algorithms
+description, namely the human reading text. Since it could be hard to
+understand algorithm straight from the source code: pass uses some principles
+that have to be explained first.
+
+Author wishes to everybody to avoid case, when you read code from top to bottom
+again and again, and yet you don't understand why we implemented it that way.
+
+We hope that after this article reader could easily debug and improve
+MergeFunctions pass and thus help LLVM project.
+
+Narrative structure
+-------------------
+Article consists of three parts. First part explains pass functionality on the
+top-level. Second part describes the comparison procedure itself. The third
+part describes the merging process.
+
+In every part author also tried to put the contents into the top-down form.
+First, the top-level methods will be described, while the terminal ones will be
+at the end, in the tail of each part. If reader will see the reference to the
+method that wasn't described yet, he will find its description a bit below.
+
+Basics
+======
+
+How to do it?
+-------------
+Do we need to merge functions? Obvious thing is: yes that's a quite possible
+case, since usually we *do* have duplicates. And it would be good to get rid of
+them. But how to detect such a duplicates? The idea is next: we split functions
+onto small bricks (parts), then we compare "bricks" amount, and if it equal,
+compare "bricks" themselves, and then do our conclusions about functions
+themselves.
+
+What the difference it could be? For example, on machine with 64-bit pointers
+(let's assume we have only one address space),  one function stores 64-bit
+integer, while another one stores a pointer. So if the target is a machine
+mentioned above, and if functions are identical, except the parameter type (we
+could consider it as a part of function type), then we can treat ``uint64_t``
+and``void*`` as equal.
+
+It was just an example; possible details are described a bit below.
+
+As another example reader may imagine two more functions. First function
+performs multiplication on 2, while the second one performs arithmetic right
+shift on 1.
+
+Possible solutions
+^^^^^^^^^^^^^^^^^^
+Let's briefly consider possible options about how and what we have to implement
+in order to create full-featured functions merging, and also what it would
+meant for us.
+
+Equal functions detection, obviously supposes "detector" method to be
+implemented, latter should answer the question "whether functions are equal".
+This "detector" method consists of tiny "sub-detectors", each of them answers
+exactly the same question, but for function parts.
+
+As the second step, we should merge equal functions. So it should be a "merger"
+method. "Merger" accepts two functions *F1* and *F2*, and produces *F1F2*
+function, the result of merging.
+
+Having such a routines in our hands, we can process whole module, and merge all
+equal functions.
+
+In this case, we have to compare every function with every another function. As
+reader could notice, this way seems to be quite expensive. Of course we could
+introduce hashing and other helpers, but it is still just an optimization, and
+thus the level of O(N*N) complexity.
+
+Can we reach another level? Could we introduce logarithmical search, or random
+access lookup? The answer is: "yes".
+
+Random-access
+"""""""""""""
+How it could be done? Just convert each function to number, and gather all of
+them in special hash-table. Functions with equal hash are equal. Good hashing
+means, that every function part must be taken into account. That means we have
+to convert every function part into some number, and then add it into hash.
+Lookup-up time would be small, but such approach adds some delay due to hashing
+routine.
+
+Logarithmical search
+""""""""""""""""""""
+We could introduce total ordering among the functions set, once we had it we
+could then implement a logarithmical search. Lookup time still depends on N,
+but adds a little of delay (*log(N)*).
+
+Present state
+"""""""""""""
+Both of approaches (random-access and logarithmical) has been implemented and
+tested. And both of them gave a very good improvement. And what was most
+surprising, logarithmical search was faster; sometimes up to 15%. Hashing needs
+some extra CPU time, and it is the main reason why it works slower; in most of
+cases total "hashing" time was greater than total "logarithmical-search" time.
+
+So, preference has been granted to the "logarithmical search".
+
+Though in the case of need, *logarithmical-search* (read "total-ordering") could
+be used as a milestone on our way to the *random-access* implementation.
+
+Every comparison is based either on the numbers or on flags comparison. In
+*random-access* approach we could use the same comparison algorithm. During
+comparison we exit once we find the difference, but here we might have to scan
+whole function body every time (note, it could be slower). Like in
+"total-ordering", we will track every numbers and flags, but instead of
+comparison, we should get numbers sequence and then create the hash number. So,
+once again, *total-ordering* could be considered as a milestone for even faster
+(in theory) random-access approach.
+
+MergeFunctions, main fields and runOnModule
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+There are two most important fields in class:
+
+``FnTree``  – the set of all unique functions. It keeps items that couldn't be
+merged with each other. It is defined as:
+
+``std::set<FunctionNode> FnTree;``
+
+Here ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with
+implemented “<” operator among the functions set (below we explain how it works
+exactly; this is a key point in fast functions comparison).
+
+``Deferred`` – merging process can affect bodies of functions that are in
+``FnTree`` already. Obviously such functions should be rechecked again. In this
+case we remove them from ``FnTree``, and mark them as to be rescanned, namely
+put them into ``Deferred`` list.
+
+runOnModule
+"""""""""""
+The algorithm is pretty simple:
+
+1. Put all module's functions into the *worklist*.
+
+2. Scan *worklist*'s functions twice: first enumerate only strong functions and
+then only weak ones:
+
+   2.1. Loop body: take function from *worklist*  (call it *FCur*) and try to
+   insert it into *FnTree*: check whether *FCur* is equal to one of functions
+   in *FnTree*. If there *is* equal function in *FnTree* (call it *FExists*):
+   merge function *FCur* with *FExists*. Otherwise add function from *worklist*
+   to *FnTree*.
+
+3. Once *worklist* scanning and merging operations is complete, check *Deferred*
+list. If it is not empty: refill *worklist* contents with *Deferred* list and
+do step 2 again, if *Deferred* is empty, then exit from method.
+
+Comparison and logarithmical search
+"""""""""""""""""""""""""""""""""""
+Let's recall our task: for every function *F* from module *M*, we have to find
+equal functions *F`* in shortest time, and merge them into the single function.
+
+Defining total ordering among the functions set allows to organize functions
+into the binary tree. The lookup procedure complexity would be estimated as
+O(log(N)) in this case. But how to define *total-ordering*?
+
+We have to introduce a single rule applicable to every pair of functions, and
+following this rule then evaluate which of them is greater. What kind of rule
+it could be? Let's declare it as "compare" method, that returns one of 3
+possible values:
+
+-1, left is *less* than right,
+
+0, left and right are *equal*,
+
+1, left is *greater* than right.
+
+Of course it means, that we have to maintain
+*strict and non-strict order relation properties*:
+
+* reflexivity (``a <= a``, ``a == a``, ``a >= a``),
+* antisymmetry (if ``a <= b`` and ``b <= a`` then ``a == b``),
+* transitivity (``a <= b`` and ``b <= c``, then ``a <= c``)
+* asymmetry (if ``a < b``, then ``a > b`` or ``a == b``).
+
+As it was mentioned before, comparison routine consists of
+"sub-comparison-routines", each of them also consists
+"sub-comparison-routines", and so on, finally it ends up with a primitives
+comparison.
+
+Below, we will use the next operations:
+
+#. ``cmpNumbers(number1, number2)`` is method that returns -1 if left is less
+   than right; 0, if left and right are equal; and 1 otherwise.
+
+#. ``cmpFlags(flag1, flag2)`` is hypothetical method that compares two flags.
+   The logic is the same as in ``cmpNumbers``, where ``true`` is 1, and
+   ``false`` is 0.
+
+The rest of article is based on *MergeFunctions.cpp* source code
+(*<llvm_dir>/lib/Transforms/IPO/MergeFunctions.cpp*). We would like to ask
+reader to keep this file open nearby, so we could use it as a reference for
+further explanations.
+
+Now we're ready to proceed to the next chapter and see how it works.
+
+Functions comparison
+====================
+At first, let's define how exactly we compare complex objects.
+
+Complex objects comparison (function, basic-block, etc) is mostly based on its
+sub-objects comparison results. So it is similar to the next "tree" objects
+comparison:
+
+#. For two trees *T1* and *T2* we perform *depth-first-traversal* and have
+   two sequences as a product: "*T1Items*" and "*T2Items*".
+
+#. Then compare chains "*T1Items*" and "*T2Items*" in
+   most-significant-item-first order. Result of items comparison would be the
+   result of *T1* and *T2* comparison itself.
+
+FunctionComparator::compare(void)
+---------------------------------
+Brief look at the source code tells us, that comparison starts in
+“``int FunctionComparator::compare(void)``” method.
+
+1. First parts to be compared are function's attributes and some properties that
+outsides “attributes” term, but still could make function different without
+changing its body. This part of comparison is usually done within simple
+*cmpNumbers* or *cmpFlags* operations (e.g.
+``cmpFlags(F1->hasGC(), F2->hasGC())``). Below is full list of function's
+properties to be compared on this stage:
+
+  * *Attributes* (those are returned by ``Function::getAttributes()``
+    method).
+
+  * *GC*, for equivalence, *RHS* and *LHS* should be both either without
+    *GC* or with the same one.
+
+  * *Section*, just like a *GC*: *RHS* and *LHS* should be defined in the
+    same section.
+
+  * *Variable arguments*. *LHS* and *RHS* should be both either with or
+    without *var-args*.
+
+  * *Calling convention* should be the same.
+
+2. Function type. Checked by ``FunctionComparator::cmpType(Type*, Type*)``
+method. It checks return type and parameters type; the method itself will be
+described later.
+
+3. Associate function formal parameters with each other. Then comparing function
+bodies, if we see the usage of *LHS*'s *i*-th argument in *LHS*'s body, then,
+we want to see usage of *RHS*'s *i*-th argument at the same place in *RHS*'s
+body, otherwise functions are different. On this stage we grant the preference
+to those we met later in function body (value we met first would be *less*).
+This is done by “``FunctionComparator::cmpValues(const Value*, const Value*)``”
+method (will be described a bit later).
+
+4. Function body comparison. As it written in method comments:
+
+“We do a CFG-ordered walk since the actual ordering of the blocks in the linked
+list is immaterial. Our walk starts at the entry block for both functions, then
+takes each block from each terminator in order. As an artifact, this also means
+that unreachable blocks are ignored.”
+
+So, using this walk we get BBs from *left* and *right* in the same order, and
+compare them by “``FunctionComparator::compare(const BasicBlock*, const
+BasicBlock*)``” method.
+
+We also associate BBs with each other, like we did it with function formal
+arguments (see ``cmpValues`` method below).
+
+FunctionComparator::cmpType
+---------------------------
+Consider how types comparison works.
+
+1. Coerce pointer to integer. If left type is a pointer, try to coerce it to the
+integer type. It could be done if its address space is 0, or if address spaces
+are ignored at all. Do the same thing for the right type.
+
+2. If left and right types are equal, return 0. Otherwise we need to give
+preference to one of them. So proceed to the next step.
+
+3. If types are of different kind (different type IDs). Return result of type
+IDs comparison, treating them as a numbers (use ``cmpNumbers`` operation).
+
+4. If types are vectors or integers, return result of their pointers comparison,
+comparing them as numbers.
+
+5. Check whether type ID belongs to the next group (call it equivalent-group):
+
+   * Void
+
+   * Float
+
+   * Double
+
+   * X86_FP80
+
+   * FP128
+
+   * PPC_FP128
+
+   * Label
+
+   * Metadata.
+
+   If ID belongs to group above, return 0. Since it's enough to see that
+   types has the same ``TypeID``. No additional information is required.
+
+6. Left and right are pointers. Return result of address space comparison
+(numbers comparison).
+
+7. Complex types (structures, arrays, etc.). Follow complex objects comparison
+technique (see the very first paragraph of this chapter). Both *left* and
+*right* are to be expanded and their element types will be checked the same
+way. If we get -1 or 1 on some stage, return it. Otherwise return 0.
+
+8. Steps 1-6 describe all the possible cases, if we passed steps 1-6 and didn't
+get any conclusions, then invoke ``llvm_unreachable``, since it's quite
+unexpectable case.
+
+cmpValues(const Value*, const Value*)
+-------------------------------------
+Method that compares local values.
+
+This method gives us an answer on a very curious quesion: whether we could treat
+local values as equal, and which value is greater otherwise. It's better to
+start from example:
+
+Consider situation when we're looking at the same place in left function "*FL*"
+and in right function "*FR*". And every part of *left* place is equal to the
+corresponding part of *right* place, and (!) both parts use *Value* instances,
+for example:
+
+.. code-block:: llvm
+
+   instr0 i32 %LV   ; left side, function FL
+   instr0 i32 %RV   ; right side, function FR
+
+So, now our conclusion depends on *Value* instances comparison.
+
+Main purpose of this method is to determine relation between such values.
+
+What we expect from equal functions? At the same place, in functions "*FL*" and
+"*FR*" we expect to see *equal* values, or values *defined* at the same place
+in "*FL*" and "*FR*".
+
+Consider small example here:
+
+.. code-block:: llvm
+
+  define void %f(i32 %pf0, i32 %pf1) {
+    instr0 i32 %pf0 instr1 i32 %pf1 instr2 i32 123
+  }
+
+.. code-block:: llvm
+
+  define void %g(i32 %pg0, i32 %pg1) {
+    instr0 i32 %pg0 instr1 i32 %pg0 instr2 i32 123
+  }
+
+In this example, *pf0* is associated with *pg0*, *pf1* is associated with *pg1*,
+and we also declare that *pf0* < *pf1*, and thus *pg0* < *pf1*.
+
+Instructions with opcode "*instr0*" would be *equal*, since their types and
+opcodes are equal, and values are *associated*.
+
+Instruction with opcode "*instr1*" from *f* is *greater* than instruction with
+opcode "*instr1*" from *g*; here we have equal types and opcodes, but "*pf1* is
+greater than "*pg0*".
+
+And instructions with opcode "*instr2*" are equal, because their opcodes and
+types are equal, and the same constant is used as a value.
+
+What we assiciate in cmpValues?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+* Function arguments. *i*-th argument from left function associated with
+  *i*-th argument from right function.
+* BasicBlock instances. In basic-block enumeration loop we associate *i*-th
+  BasicBlock from the left function with *i*-th BasicBlock from the right
+  function.
+* Instructions.
+* Instruction operands. Note, we can meet *Value* here we have never seen
+  before. In this case it is not a function argument, nor *BasicBlock*, nor
+  *Instruction*. It is global value. It is constant, since its the only
+  supposed global here. Method also compares:
+* Constants that are of the same type.
+* If right constant could be losslessly bit-casted to the left one, then we
+  also compare them.
+
+How to implement cmpValues?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+*Association* is a case of equality for us. We just treat such values as equal.
+But, in general, we need to implement antisymmetric relation. As it was
+mentioned above, to understand what is *less*, we can use order in which we
+meet values. If both of values has the same order in function (met at the same
+time), then treat values as *associated*. Otherwise – it depends on who was
+first.
+
+Every time we run top-level compare method, we initialize two identical maps
+(one for the left side, another one for the right side):
+
+``map<Value, int> sn_mapL, sn_mapR;``
+
+The key of the map is the *Value* itself, the *value* – is its order (call it
+*serial number*).
+
+To add value *V* we need to perform the next procedure:
+
+``sn_map.insert(std::make_pair(V, sn_map.size()));``
+
+For the first *Value*, map will return *0*, for second *Value* map will return
+*1*, and so on.
+
+Then we can check whether left and right values met at the same time with simple
+comparison:
+
+``cmpNumbers(sn_mapL[Left], sn_mapR[Right]);``
+
+Of course, we can combine insertion and comparison:
+
+.. code-block:: c++
+
+  std::pair<iterator, bool>
+    LeftRes = sn_mapL.insert(std::make_pair(Left, sn_mapL.size())), RightRes
+    = sn_mapR.insert(std::make_pair(Right, sn_mapR.size()));
+  return cmpNumbers(LeftRes.first->second, RightRes.first->second);
+
+Let's look, how whole method could be implemented.
+
+1. we have to start from the bad news. Consider function self and
+cross-referencing cases:
+
+.. code-block:: c++
+
+  // self-reference unsigned fact0(unsigned n) { return n > 1 ? n
+  * fact0(n-1) : 1; } unsigned fact1(unsigned n) { return n > 1 ? n *
+  fact1(n-1) : 1; }
+
+  // cross-reference unsigned ping(unsigned n) { return n!= 0 ? pong(n-1) : 0;
+  } unsigned pong(unsigned n) { return n!= 0 ? ping(n-1) : 0; }
+
+..
+
+  This comparison has been implemented in initial *MergeFunctions* pass
+  version. But, unfortunately, it is not transitive. And this is the only case
+  we can't convert to less-equal-greater comparison. It is a seldom case, 4-5
+  functions of 10000 (checked on test-suite), and, we hope, reader would
+  forgive us for such a sacrifice in order to get the O(log(N)) pass time.
+
+2. If left/right *Value* is a constant, we have to compare them. Return 0 if it
+is the same constant, or use ``cmpConstants`` method otherwise.
+
+3. If left/right is *InlineAsm* instance. Return result of *Value* pointers
+comparison.
+
+4. Explicit association of *L* (left value) and *R*  (right value). We need to
+find out whether values met at the same time, and thus are *associated*. Or we
+need to put the rule: when we treat *L* < *R*. Now it is easy: just return
+result of numbers comparison:
+
+.. code-block:: c++
+
+   std::pair<iterator, bool>
+     LeftRes = sn_mapL.insert(std::make_pair(Left, sn_mapL.size())),
+     RightRes = sn_mapR.insert(std::make_pair(Right, sn_mapR.size()));
+   if (LeftRes.first->second == RightRes.first->second) return 0;
+   if (LeftRes.first->second < RightRes.first->second) return -1;
+   return 1;
+
+Now when *cmpValues* returns 0, we can proceed comparison procedure. Otherwise,
+if we get (-1 or 1), we need to pass this result to the top level, and finish
+comparison procedure.
+
+cmpConstants
+------------
+Performs constants comparison as follows:
+
+1. Compare constant types using ``cmpType`` method. If result is -1 or 1, goto
+step 2, otherwise proceed to step 3.
+
+2. If types are different, we still can check whether constants could be
+losslessly bitcasted to each other. The further explanation is modification of
+``canLosslesslyBitCastTo`` method.
+
+   2.1 Check whether constants are of the first class types
+   (``isFirstClassType`` check):
+
+   2.1.1. If both constants are *not* of the first class type: return result
+   of ``cmpType``.
+
+   2.1.2. Otherwise, if left type is not of the first class, return -1. If
+   right type is not of the first class, return 1.
+
+   2.1.3. If both types are of the first class type, proceed to the next step
+   (2.1.3.1).
+
+   2.1.3.1. If types are vectors, compare their bitwidth using the
+   *cmpNumbers*. If result is not 0, return it.
+
+   2.1.3.2. Different types, but not a vectors:
+
+   * if both of them are pointers, good for us, we can proceed to step 3.
+   * if one of types is pointer, return result of *isPointer* flags
+     comparison (*cmpFlags* operation).
+   * otherwise we have no methods to prove bitcastability, and thus return
+     result of types comparison (-1 or 1).
+
+Steps below are for the case when types are equal, or case when constants are
+bitcastable:
+
+3. One of constants is a "*null*" value. Return the result of
+``cmpFlags(L->isNullValue, R->isNullValue)`` comparison.
+
+4. Compare value IDs, and return result if it is not 0:
+
+.. code-block:: c++
+
+  if (int Res = cmpNumbers(L->getValueID(), R->getValueID()))
+    return Res;
+
+5. Compare the contents of constants. The comparison depends on kind of
+constants, but on this stage it is just a lexicographical comparison. Just see
+how it was described in the beginning of "*Functions comparison*" paragraph.
+Mathematically it is equal to the next case: we encode left constant and right
+constant (with similar way *bitcode-writer* does). Then compare left code
+sequence and right code sequence.
+
+compare(const BasicBlock*, const BasicBlock*)
+---------------------------------------------
+Compares two *BasicBlock* instances.
+
+It enumerates instructions from left *BB* and right *BB*.
+
+1. It assigns serial numbers to the left and right instructions, using
+``cmpValues`` method.
+
+2. If one of left or right is *GEP* (``GetElementPtr``), then treat *GEP* as
+greater than other instructions, if both instructions are *GEPs* use ``cmpGEP``
+method for comparison. If result is -1 or 1, pass it to the top-level
+comparison (return it).
+
+   3.1. Compare operations. Call ``cmpOperation`` method. If result is -1 or
+   1, return it.
+
+   3.2. Compare number of operands, if result is -1 or 1, return it.
+
+   3.3. Compare operands themselves, use ``cmpValues`` method. Return result
+   if it is -1 or 1.
+
+   3.4. Compare type of operands, using ``cmpType`` method. Return result if
+   it is -1 or 1.
+
+   3.5. Proceed to the next instruction.
+
+4. We can finish instruction enumeration in 3 cases:
+
+   4.1. We reached the end of both left and right basic-blocks. We didn't
+   exit on steps 1-3, so contents is equal, return 0.
+
+   4.2. We have reached the end of the left basic-block. Return -1.
+
+   4.3. Return 1 (the end of the right basic block).
+
+cmpGEP
+------
+Compares two GEPs (``getelementptr`` instructions).
+
+It differs from regular operations comparison with the only thing: possibility
+to use ``accumulateConstantOffset`` method.
+
+So, if we get constant offset for both left and right *GEPs*, then compare it as
+numbers, and return comparison result.
+
+Otherwise treat it like a regular operation (see previous paragraph).
+
+cmpOperation
+------------
+Compares instruction opcodes and some important operation properties.
+
+1. Compare opcodes, if it differs return the result.
+
+2. Compare number of operands. If it differs – return the result.
+
+3. Compare operation types, use *cmpType*. All the same – if types are
+different, return result.
+
+4. Compare *subclassOptionalData*, get it with ``getRawSubclassOptionalData``
+method, and compare it like a numbers.
+
+5. Compare operand types.
+
+6. For some particular instructions check equivalence (relation in our case) of
+some significant attributes. For example we have to compare alignment for
+``load`` instructions.
+
+O(log(N))
+---------
+Methods described above implement order relationship. And latter, could be used
+for nodes comparison in a binary tree. So we can organize functions set into
+the binary tree and reduce the cost of lookup procedure from
+O(N*N) to O(log(N)).
+
+Merging process, mergeTwoFunctions
+==================================
+Once *MergeFunctions* detected that current function (*G*) is equal to one that
+were analyzed before (function *F*) it calls ``mergeTwoFunctions(Function*,
+Function*)``.
+
+Operation affects ``FnTree`` contents with next way: *F* will stay in
+``FnTree``. *G* being equal to *F* will not be added to ``FnTree``. Calls of
+*G* would be replaced with something else. It changes bodies of callers. So,
+functions that calls *G* would be put into ``Deferred`` set and removed from
+``FnTree``, and analyzed again.
+
+The approach is next:
+
+1. Most wished case: when we can use alias and both of *F* and *G* are weak. We
+make both of them with aliases to the third strong function *H*. Actually *H*
+is *F*. See below how it's made (but it's better to look straight into the
+source code). Well, this is a case when we can just replace *G* with *F*
+everywhere, we use ``replaceAllUsesWith`` operation here (*RAUW*).
+
+2. *F* could not be overridden, while *G* could. It would be good to do the
+next: after merging the places where overridable function were used, still use
+overridable stub. So try to make *G* alias to *F*, or create overridable tail
+call wrapper around *F* and replace *G* with that call.
+
+3. Neither *F* nor *G* could be overridden. We can't use *RAUW*. We can just
+change the callers: call *F* instead of *G*.  That's what
+``replaceDirectCallers`` does.
+
+Below is detailed body description.
+
+If “F” may be overridden
+------------------------
+As follows from ``mayBeOverridden`` comments: “whether the definition of this
+global may be replaced by something non-equivalent at link time”. If so, thats
+ok: we can use alias to *F* instead of *G* or change call instructions itself.
+
+HasGlobalAliases, removeUsers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+First consider the case when we have global aliases of one function name to
+another. Our purpose is  make both of them with aliases to the third strong
+function. Though if we keep *F* alive and without major changes we can leave it
+in ``FnTree``. Try to combine these two goals.
+
+Do stub replacement of *F* itself with an alias to *F*.
+
+1. Create stub function *H*, with the same name and attributes like function
+*F*. It takes maximum alignment of *F* and *G*.
+
+2. Replace all uses of function *F* with uses of function *H*. It is the two
+steps procedure instead. First of all, we must take into account, all functions
+from whom *F* is called would be changed: since we change the call argument
+(from *F* to *H*). If so we must to review these caller functions again after
+this procedure. We remove callers from ``FnTree``, method with name
+``removeUsers(F)`` does that (don't confuse with ``replaceAllUsesWith``):
+
+   2.1. ``Inside removeUsers(Value*
+   V)`` we go through the all values that use value *V* (or *F* in our context).
+   If value is instruction, we go to function that holds this instruction and
+   mark it as to-be-analyzed-again (put to ``Deferred`` set), we also remove
+   caller from ``FnTree``.
+
+   2.2. Now we can do the replacement: call ``F->replaceAllUsesWith(H)``.
+
+3. *H* (that now "officially" plays *F*'s role) is replaced with alias to *F*.
+Do the same with *G*: replace it with alias to *F*. So finally everywhere *F*
+was used, we use *H* and it is alias to *F*, and everywhere *G* was used we
+also have alias to *F*.
+
+4. Set *F* linkage to private. Make it strong :-)
+
+No global aliases, replaceDirectCallers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If global aliases are not supported. We call ``replaceDirectCallers`` then. Just
+go through all calls of *G* and replace it with calls of *F*. If you look into
+method you will see that it scans all uses of *G* too, and if use is callee (if
+user is call instruction and *G* is used as what to be called), we replace it
+with use of *F*.
+
+If “F” could not be overridden, fix it!
+"""""""""""""""""""""""""""""""""""""""
+
+We call ``writeThunkOrAlias(Function *F, Function *G)``. Here we try to replace
+*G* with alias to *F* first. Next conditions are essential:
+
+* target should support global aliases,
+* the address itself of  *G* should be not significant, not named and not
+  referenced anywhere,
+* function should come with external, local or weak linkage.
+
+Otherwise we write thunk: some wrapper that has *G's* interface and calls *F*,
+so *G* could be replaced with this wrapper.
+
+*writeAlias*
+
+As follows from *llvm* reference:
+
+“Aliases act as *second name* for the aliasee value”. So we just want to create
+second name for *F* and use it instead of *G*:
+
+1. create global alias itself (*GA*),
+
+2. adjust alignment of *F* so it must be maximum of current and *G's* alignment;
+
+3. replace uses of *G*:
+
+   3.1. first mark all callers of *G* as to-be-analyzed-again, using
+   ``removeUsers`` method (see chapter above),
+
+   3.2. call ``G->replaceAllUsesWith(GA)``.
+
+4. Get rid of *G*.
+
+*writeThunk*
+
+As it written in method comments:
+
+“Replace G with a simple tail call to bitcast(F). Also replace direct uses of G
+with bitcast(F). Deletes G.”
+
+In general it does the same as usual when we want to replace callee, except the
+first point:
+
+1. We generate tail call wrapper around *F*, but with interface that allows use
+it instead of *G*.
+
+2. “As-usual”: ``removeUsers`` and ``replaceAllUsesWith`` then.
+
+3. Get rid of *G*.
+
+That's it.
+==========
+We have described how to detect equal functions, and how to merge them, and in
+first chapter we have described how it works all-together. Author hopes, reader
+have some picture from now, and it helps him improve and debug this pass.
+
+Reader is welcomed to send us any questions and proposals ;-)
diff --git a/docs/Passes.rst b/docs/Passes.rst
index 9f40092..cc0a853 100644
--- a/docs/Passes.rst
+++ b/docs/Passes.rst
@@ -891,17 +891,24 @@ calls, or transforming sets of stores into ``memset``\ s.
 
 This pass looks for equivalent functions that are mergable and folds them.
 
-A hash is computed from the function, based on its type and number of basic
-blocks.
+Total-ordering is introduced among the functions set: we define comparison
+that answers for every two functions which of them is greater. It allows to
+arrange functions into the binary tree.
 
-Once all hashes are computed, we perform an expensive equality comparison on
-each function pair.  This takes n^2/2 comparisons per bucket, so it's important
-that the hash function be high quality.  The equality comparison iterates
-through each instruction in each basic block.
+For every new function we check for equivalent in tree.
 
-When a match is found the functions are folded.  If both functions are
-overridable, we move the functionality into a new internal function and leave
-two overridable thunks to it.
+If equivalent exists we fold such functions. If both functions are overridable,
+we move the functionality into a new internal function and leave two
+overridable thunks to it.
+
+If there is no equivalent, then we add this function to tree.
+
+Lookup routine has O(log(n)) complexity, while whole merging process has
+complexity of O(n*log(n)).
+
+Read
+:doc:`this <MergeFunctions>`
+article for more details.
 
 ``-mergereturn``: Unify function exit nodes
 -------------------------------------------
@@ -1112,13 +1119,6 @@ useful when diffing the effect of an optimization because deleting an unnamed
 instruction can change all other instruction numbering, making the diff very
 noisy.
 
-``-preverify``: Preliminary module verification
------------------------------------------------
-
-Ensures that the module is in the form required by the :ref:`Module Verifier
-<passes-verify>` pass.  Running the verifier runs this pass automatically, so
-there should be no need to use it directly.
-
 .. _passes-verify:
 
 ``-verify``: Module Verifier
diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst
index 8ac9afe..3f4f72a 100644
--- a/docs/Phabricator.rst
+++ b/docs/Phabricator.rst
@@ -66,7 +66,8 @@ To upload a new patch:
 * Leave the drop down on *Create a new Revision...* and click *Continue*.
 * Enter a descriptive title and summary; add reviewers and mailing
   lists that you want to be included in the review. If your patch is
-  for LLVM, cc llvm-commits; if your patch is for Clang, cc cfe-commits.
+  for LLVM, add llvm-commits as a subscriber; if your patch is for Clang,
+  add cfe-commits.
 * Click *Save*.
 
 To submit an updated patch:
diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst
index 85a4ad8..753e658 100644
--- a/docs/ProgrammersManual.rst
+++ b/docs/ProgrammersManual.rst
@@ -488,6 +488,9 @@ gathered, use the '``-stats``' option:
   $ opt -stats -mypassname < program.bc > /dev/null
   ... statistics output ...
 
+Note that in order to use the '``-stats``' option, LLVM must be
+compiled with assertions enabled.
+
 When running ``opt`` on a C file from the SPEC benchmark suite, it gives a
 report that looks like this:
 
@@ -1408,7 +1411,7 @@ llvm/ADT/IntervalMap.h
 
 IntervalMap is a compact map for small keys and values.  It maps key intervals
 instead of single keys, and it will automatically coalesce adjacent intervals.
-When then map only contains a few intervals, they are stored in the map object
+When the map only contains a few intervals, they are stored in the map object
 itself to avoid allocations.
 
 The IntervalMap iterators are quite big, so they should not be passed around as
@@ -2480,6 +2483,76 @@ ensures that the first bytes of ``User`` (if interpreted as a pointer) never has
 the LSBit set. (Portability is relying on the fact that all known compilers
 place the ``vptr`` in the first word of the instances.)
 
+.. _polymorphism:
+
+Designing Type Hiercharies and Polymorphic Interfaces
+-----------------------------------------------------
+
+There are two different design patterns that tend to result in the use of
+virtual dispatch for methods in a type hierarchy in C++ programs. The first is
+a genuine type hierarchy where different types in the hierarchy model
+a specific subset of the functionality and semantics, and these types nest
+strictly within each other. Good examples of this can be seen in the ``Value``
+or ``Type`` type hierarchies.
+
+A second is the desire to dispatch dynamically across a collection of
+polymorphic interface implementations. This latter use case can be modeled with
+virtual dispatch and inheritance by defining an abstract interface base class
+which all implementations derive from and override. However, this
+implementation strategy forces an **"is-a"** relationship to exist that is not
+actually meaningful. There is often not some nested hierarchy of useful
+generalizations which code might interact with and move up and down. Instead,
+there is a singular interface which is dispatched across a range of
+implementations.
+
+The preferred implementation strategy for the second use case is that of
+generic programming (sometimes called "compile-time duck typing" or "static
+polymorphism"). For example, a template over some type parameter ``T`` can be
+instantiated across any particular implementation that conforms to the
+interface or *concept*. A good example here is the highly generic properties of
+any type which models a node in a directed graph. LLVM models these primarily
+through templates and generic programming. Such templates include the
+``LoopInfoBase`` and ``DominatorTreeBase``. When this type of polymorphism
+truly needs **dynamic** dispatch you can generalize it using a technique
+called *concept-based polymorphism*. This pattern emulates the interfaces and
+behaviors of templates using a very limited form of virtual dispatch for type
+erasure inside its implementation. You can find examples of this technique in
+the ``PassManager.h`` system, and there is a more detailed introduction to it
+by Sean Parent in several of his talks and papers:
+
+#. `Inheritance Is The Base Class of Evil
+   <http://channel9.msdn.com/Events/GoingNative/2013/Inheritance-Is-The-Base-Class-of-Evil>`_
+   - The GoingNative 2013 talk describing this technique, and probably the best
+   place to start.
+#. `Value Semantics and Concepts-based Polymorphism
+   <http://www.youtube.com/watch?v=_BpMYeUFXv8>`_ - The C++Now! 2012 talk
+   describing this technique in more detail.
+#. `Sean Parent's Papers and Presentations
+   <http://github.com/sean-parent/sean-parent.github.com/wiki/Papers-and-Presentations>`_
+   - A Github project full of links to slides, video, and sometimes code.
+
+When deciding between creating a type hierarchy (with either tagged or virtual
+dispatch) and using templates or concepts-based polymorphism, consider whether
+there is some refinement of an abstract base class which is a semantically
+meaningful type on an interface boundary. If anything more refined than the
+root abstract interface is meaningless to talk about as a partial extension of
+the semantic model, then your use case likely fits better with polymorphism and
+you should avoid using virtual dispatch. However, there may be some exigent
+circumstances that require one technique or the other to be used.
+
+If you do need to introduce a type hierarchy, we prefer to use explicitly
+closed type hierarchies with manual tagged dispatch and/or RTTI rather than the
+open inheritance model and virtual dispatch that is more common in C++ code.
+This is because LLVM rarely encourages library consumers to extend its core
+types, and leverages the closed and tag-dispatched nature of its hierarchies to
+generate significantly more efficient code. We have also found that a large
+amount of our usage of type hierarchies fits better with tag-based pattern
+matching rather than dynamic dispatch across a common interface. Within LLVM we
+have built custom helpers to facilitate this design. See this document's
+section on :ref:`isa and dyn_cast <isa>` and our :doc:`detailed document
+<HowToSetUpLLVMStyleRTTI>` which describes how you can implement this
+pattern for use with the LLVM helpers.
+
 .. _coreclasses:
 
 The Core LLVM Class Hierarchy Reference
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index be2954c..6a38363 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -1,12 +1,12 @@
 ======================
-LLVM 3.5 Release Notes
+LLVM 3.7 Release Notes
 ======================
 
 .. contents::
     :local:
 
 .. warning::
-   These are in-progress notes for the upcoming LLVM 3.6 release.  You may
+   These are in-progress notes for the upcoming LLVM 3.7 release.  You may
    prefer the `LLVM 3.5 Release Notes <http://llvm.org/releases/3.5.0/docs
    /ReleaseNotes.html>`_.
 
@@ -15,7 +15,7 @@ Introduction
 ============
 
 This document contains the release notes for the LLVM Compiler Infrastructure,
-release 3.6.  Here we describe the status of LLVM, including major improvements
+release 3.7.  Here we describe the status of LLVM, including major improvements
 from the previous release, improvements in various subprojects of LLVM, and
 some of the current users of the code.  All LLVM releases may be downloaded
 from the `LLVM releases web site <http://llvm.org/releases/>`_.
@@ -41,10 +41,8 @@ Non-comprehensive list of changes in this release
    functionality, or simply have a lot to talk about), see the `NOTE` below
    for adding a new subsection.
 
-* Support for AuroraUX has been removed.
-
-* Added support for a `native object file-based bitcode wrapper format
-  <BitCodeFormat.html#native-object-file>`_.
+* The minimum required Visual Studio version for building LLVM is now 2013
+  Update 4.
 
 * ... next change ...
 
@@ -67,19 +65,27 @@ Changes to the ARM Backend
 Changes to the MIPS Target
 --------------------------
 
-During this release ...
+ During this release ...
+
 
 Changes to the PowerPC Target
 -----------------------------
 
-During this release ...
+ During this release ...
+
+
+Changes to the OCaml bindings
+-----------------------------
+
+ During this release ...
+
 
-External Open Source Projects Using LLVM 3.6
+External Open Source Projects Using LLVM 3.7
 ============================================
 
 An exciting aspect of LLVM is that it is used as an enabling technology for
 a lot of other language and tools projects. This section lists some of the
-projects that have already been updated to work with LLVM 3.6.
+projects that have already been updated to work with LLVM 3.7.
 
 * A project
 
diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst
index 3a5fa6e..350604c 100644
--- a/docs/SourceLevelDebugging.rst
+++ b/docs/SourceLevelDebugging.rst
@@ -1807,7 +1807,6 @@ tag is one of:
 * DW_TAG_subrange_type
 * DW_TAG_base_type
 * DW_TAG_const_type
-* DW_TAG_constant
 * DW_TAG_file_type
 * DW_TAG_namelist
 * DW_TAG_packed_type
diff --git a/docs/StackMaps.rst b/docs/StackMaps.rst
index bd0fb94..43c60c9 100644
--- a/docs/StackMaps.rst
+++ b/docs/StackMaps.rst
@@ -221,6 +221,13 @@ lowered according to the calling convention specified at the
 intrinsic's callsite. Variants of the intrinsic with non-void return
 type also return a value according to calling convention.
 
+On PowerPC, note that ``<target>`` must be the actual intended target of
+the indirect call. Specifically, even when compiling for the ELF V1 ABI,
+``<target>`` is not the function-descriptor address normally used as the C/C++
+function-pointer representation. As a result, the call target must be local
+because no adjustment or restoration of the TOC pointer (in register r2) will
+be performed.
+
 Requesting zero patch point arguments is valid. In this case, all
 variable operands are handled just like
 ``llvm.experimental.stackmap.*``. The difference is that space will
diff --git a/docs/Statepoints.rst b/docs/Statepoints.rst
new file mode 100644
index 0000000..9741c93
--- /dev/null
+++ b/docs/Statepoints.rst
@@ -0,0 +1,580 @@
+=====================================
+Garbage Collection Safepoints in LLVM
+=====================================
+
+.. contents::
+   :local:
+   :depth: 2
+
+Status
+=======
+
+This document describes a set of experimental extensions to LLVM. Use
+with caution.  Because the intrinsics have experimental status,
+compatibility across LLVM releases is not guaranteed.
+
+LLVM currently supports an alternate mechanism for conservative
+garbage collection support using the ``gcroot`` intrinsic.  The mechanism
+described here shares little in common with the alternate ``gcroot``
+implementation and it is hoped that this mechanism will eventually
+replace the gc_root mechanism.
+
+Overview
+========
+
+To collect dead objects, garbage collectors must be able to identify
+any references to objects contained within executing code, and,
+depending on the collector, potentially update them.  The collector
+does not need this information at all points in code - that would make
+the problem much harder - but only at well-defined points in the
+execution known as 'safepoints' For most collectors, it is sufficient
+to track at least one copy of each unique pointer value.  However, for
+a collector which wishes to relocate objects directly reachable from
+running code, a higher standard is required.
+
+One additional challenge is that the compiler may compute intermediate
+results ("derived pointers") which point outside of the allocation or
+even into the middle of another allocation.  The eventual use of this
+intermediate value must yield an address within the bounds of the
+allocation, but such "exterior derived pointers" may be visible to the
+collector.  Given this, a garbage collector can not safely rely on the
+runtime value of an address to indicate the object it is associated
+with.  If the garbage collector wishes to move any object, the
+compiler must provide a mapping, for each pointer, to an indication of
+its allocation.
+
+To simplify the interaction between a collector and the compiled code,
+most garbage collectors are organized in terms of three abstractions:
+load barriers, store barriers, and safepoints.
+
+#. A load barrier is a bit of code executed immediately after the
+   machine load instruction, but before any use of the value loaded.
+   Depending on the collector, such a barrier may be needed for all
+   loads, merely loads of a particular type (in the original source
+   language), or none at all.
+
+#. Analogously, a store barrier is a code fragement that runs
+   immediately before the machine store instruction, but after the
+   computation of the value stored.  The most common use of a store
+   barrier is to update a 'card table' in a generational garbage
+   collector.
+
+#. A safepoint is a location at which pointers visible to the compiled
+   code (i.e. currently in registers or on the stack) are allowed to
+   change.  After the safepoint completes, the actual pointer value
+   may differ, but the 'object' (as seen by the source language)
+   pointed to will not.
+
+  Note that the term 'safepoint' is somewhat overloaded.  It refers to
+  both the location at which the machine state is parsable and the
+  coordination protocol involved in bring application threads to a
+  point at which the collector can safely use that information.  The
+  term "statepoint" as used in this document refers exclusively to the
+  former.
+
+This document focuses on the last item - compiler support for
+safepoints in generated code.  We will assume that an outside
+mechanism has decided where to place safepoints.  From our
+perspective, all safepoints will be function calls.  To support
+relocation of objects directly reachable from values in compiled code,
+the collector must be able to:
+
+#. identify every copy of a pointer (including copies introduced by
+   the compiler itself) at the safepoint,
+#. identify which object each pointer relates to, and
+#. potentially update each of those copies.
+
+This document describes the mechanism by which an LLVM based compiler
+can provide this information to a language runtime/collector, and
+ensure that all pointers can be read and updated if desired.  The
+heart of the approach is to construct (or rewrite) the IR in a manner
+where the possible updates performed by the garbage collector are
+explicitly visible in the IR.  Doing so requires that we:
+
+#. create a new SSA value for each potentially relocated pointer, and
+   ensure that no uses of the original (non relocated) value is
+   reachable after the safepoint,
+#. specify the relocation in a way which is opaque to the compiler to
+   ensure that the optimizer can not introduce new uses of an
+   unrelocated value after a statepoint. This prevents the optimizer
+   from performing unsound optimizations.
+#. recording a mapping of live pointers (and the allocation they're
+   associated with) for each statepoint.
+
+At the most abstract level, inserting a safepoint can be thought of as
+replacing a call instruction with a call to a multiple return value
+function which both calls the original target of the call, returns
+it's result, and returns updated values for any live pointers to
+garbage collected objects.
+
+  Note that the task of identifying all live pointers to garbage
+  collected values, transforming the IR to expose a pointer giving the
+  base object for every such live pointer, and inserting all the
+  intrinsics correctly is explicitly out of scope for this document.
+  The recommended approach is to use the :ref:`utility passes 
+  <statepoint-utilities>` described below. 
+
+This abstract function call is concretely represented by a sequence of
+intrinsic calls known collectively as a "statepoint relocation sequence".
+
+Let's consider a simple call in LLVM IR:
+
+.. code-block:: llvm
+
+  define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) 
+         gc "statepoint-example" {
+    call void ()* @foo()
+    ret i8 addrspace(1)* %obj
+  }
+
+Depending on our language we may need to allow a safepoint during the execution 
+of ``foo``. If so, we need to let the collector update local values in the 
+current frame.  If we don't, we'll be accessing a potential invalid reference 
+once we eventually return from the call.
+
+In this example, we need to relocate the SSA value ``%obj``.  Since we can't 
+actually change the value in the SSA value ``%obj``, we need to introduce a new 
+SSA value ``%obj.relocated`` which represents the potentially changed value of
+``%obj`` after the safepoint and update any following uses appropriately.  The 
+resulting relocation sequence is:
+
+.. code-block:: llvm
+
+  define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) 
+         gc "statepoint-example" {
+    %0 = call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i8 addrspace(1)* %obj)
+    %obj.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32 %0, i32 9, i32 9)
+    ret i8 addrspace(1)* %obj.relocated
+  }
+
+Ideally, this sequence would have been represented as a M argument, N
+return value function (where M is the number of values being
+relocated + the original call arguments and N is the original return
+value + each relocated value), but LLVM does not easily support such a
+representation.
+
+Instead, the statepoint intrinsic marks the actual site of the
+safepoint or statepoint.  The statepoint returns a token value (which
+exists only at compile time).  To get back the original return value
+of the call, we use the ``gc.result`` intrinsic.  To get the relocation
+of each pointer in turn, we use the ``gc.relocate`` intrinsic with the
+appropriate index.  Note that both the ``gc.relocate`` and ``gc.result`` are
+tied to the statepoint.  The combination forms a "statepoint relocation 
+sequence" and represents the entitety of a parseable call or 'statepoint'.
+
+When lowered, this example would generate the following x86 assembly:
+
+.. code-block:: gas
+  
+	  .globl	test1
+	  .align	16, 0x90
+	  pushq	%rax
+	  callq	foo
+  .Ltmp1:
+	  movq	(%rsp), %rax  # This load is redundant (oops!)
+	  popq	%rdx
+	  retq
+
+Each of the potentially relocated values has been spilled to the
+stack, and a record of that location has been recorded to the
+:ref:`Stack Map section <stackmap-section>`.  If the garbage collector
+needs to update any of these pointers during the call, it knows
+exactly what to change.
+
+The relevant parts of the StackMap section for our example are:
+
+.. code-block:: gas
+  
+  # This describes the call site
+  # Stack Maps: callsite 2882400000
+	  .quad	2882400000
+	  .long	.Ltmp1-test1
+	  .short	0
+  # .. 8 entries skipped ..
+  # This entry describes the spill slot which is directly addressable
+  # off RSP with offset 0.  Given the value was spilled with a pushq, 
+  # that makes sense.
+  # Stack Maps:   Loc 8: Direct RSP     [encoding: .byte 2, .byte 8, .short 7, .int 0]
+	  .byte	2
+	  .byte	8
+	  .short	7
+	  .long	0
+
+This example was taken from the tests for the :ref:`RewriteStatepointsForGC` utility pass.  As such, it's full StackMap can be easily examined with the following command.
+
+.. code-block:: bash
+
+  opt -rewrite-statepoints-for-gc test/Transforms/RewriteStatepointsForGC/basics.ll -S | llc -debug-only=stackmaps
+
+
+
+
+
+Intrinsics
+===========
+
+'llvm.experimental.gc.statepoint' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32
+        @llvm.experimental.gc.statepoint(func_type <target>, 
+                       i64 <#call args>. i64 <unused>, 
+                       ... (call parameters),
+                       i64 <# deopt args>, ... (deopt parameters),
+                       ... (gc parameters))
+
+Overview:
+"""""""""
+
+The statepoint intrinsic represents a call which is parse-able by the
+runtime.
+
+Operands:
+"""""""""
+
+The 'target' operand is the function actually being called.  The
+target can be specified as either a symbolic LLVM function, or as an
+arbitrary Value of appropriate function type.  Note that the function
+type must match the signature of the callee and the types of the 'call
+parameters' arguments.
+
+The '#call args' operand is the number of arguments to the actual
+call.  It must exactly match the number of arguments passed in the
+'call parameters' variable length section.
+
+The 'unused' operand is unused and likely to be removed.  Please do
+not use.
+
+The 'call parameters' arguments are simply the arguments which need to
+be passed to the call target.  They will be lowered according to the
+specified calling convention and otherwise handled like a normal call
+instruction.  The number of arguments must exactly match what is
+specified in '# call args'.  The types must match the signature of
+'target'.
+
+The 'deopt parameters' arguments contain an arbitrary list of Values
+which is meaningful to the runtime.  The runtime may read any of these
+values, but is assumed not to modify them.  If the garbage collector
+might need to modify one of these values, it must also be listed in
+the 'gc pointer' argument list.  The '# deopt args' field indicates
+how many operands are to be interpreted as 'deopt parameters'.
+
+The 'gc parameters' arguments contain every pointer to a garbage
+collector object which potentially needs to be updated by the garbage
+collector.  Note that the argument list must explicitly contain a base
+pointer for every derived pointer listed.  The order of arguments is
+unimportant.  Unlike the other variable length parameter sets, this
+list is not length prefixed.
+
+Semantics:
+""""""""""
+
+A statepoint is assumed to read and write all memory.  As a result,
+memory operations can not be reordered past a statepoint.  It is
+illegal to mark a statepoint as being either 'readonly' or 'readnone'.
+
+Note that legal IR can not perform any memory operation on a 'gc
+pointer' argument of the statepoint in a location statically reachable
+from the statepoint.  Instead, the explicitly relocated value (from a
+``gc.relocate``) must be used.
+
+'llvm.experimental.gc.result' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare type*
+        @llvm.experimental.gc.result(i32 %statepoint_token)
+
+Overview:
+"""""""""
+
+``gc.result`` extracts the result of the original call instruction
+which was replaced by the ``gc.statepoint``.  The ``gc.result``
+intrinsic is actually a family of three intrinsics due to an
+implementation limitation.  Other than the type of the return value,
+the semantics are the same.
+
+Operands:
+"""""""""
+
+The first and only argument is the ``gc.statepoint`` which starts
+the safepoint sequence of which this ``gc.result`` is a part.
+Despite the typing of this as a generic i32, *only* the value defined
+by a ``gc.statepoint`` is legal here.
+
+Semantics:
+""""""""""
+
+The ``gc.result`` represents the return value of the call target of
+the ``statepoint``.  The type of the ``gc.result`` must exactly match
+the type of the target.  If the call target returns void, there will
+be no ``gc.result``.
+
+A ``gc.result`` is modeled as a 'readnone' pure function.  It has no
+side effects since it is just a projection of the return value of the
+previous call represented by the ``gc.statepoint``.
+
+'llvm.experimental.gc.relocate' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <pointer type>
+        @llvm.experimental.gc.relocate(i32 %statepoint_token, 
+                                       i32 %base_offset, 
+                                       i32 %pointer_offset)
+
+Overview:
+"""""""""
+
+A ``gc.relocate`` returns the potentially relocated value of a pointer
+at the safepoint.
+
+Operands:
+"""""""""
+
+The first argument is the ``gc.statepoint`` which starts the
+safepoint sequence of which this ``gc.relocation`` is a part.
+Despite the typing of this as a generic i32, *only* the value defined
+by a ``gc.statepoint`` is legal here.
+
+The second argument is an index into the statepoints list of arguments
+which specifies the base pointer for the pointer being relocated.
+This index must land within the 'gc parameter' section of the
+statepoint's argument list.
+
+The third argument is an index into the statepoint's list of arguments
+which specify the (potentially) derived pointer being relocated.  It
+is legal for this index to be the same as the second argument
+if-and-only-if a base pointer is being relocated. This index must land
+within the 'gc parameter' section of the statepoint's argument list.
+
+Semantics:
+""""""""""
+
+The return value of ``gc.relocate`` is the potentially relocated value
+of the pointer specified by it's arguments.  It is unspecified how the
+value of the returned pointer relates to the argument to the
+``gc.statepoint`` other than that a) it points to the same source
+language object with the same offset, and b) the 'based-on'
+relationship of the newly relocated pointers is a projection of the
+unrelocated pointers.  In particular, the integer value of the pointer
+returned is unspecified.
+
+A ``gc.relocate`` is modeled as a ``readnone`` pure function.  It has no
+side effects since it is just a way to extract information about work
+done during the actual call modeled by the ``gc.statepoint``.
+
+.. _statepoint-stackmap-format:
+
+Stack Map Format
+================
+
+Locations for each pointer value which may need read and/or updated by
+the runtime or collector are provided via the :ref:`Stack Map format
+<stackmap-format>` specified in the PatchPoint documentation.
+
+Each statepoint generates the following Locations:
+
+* Constant which describes number of following deopt *Locations* (not
+  operands)
+* Variable number of Locations, one for each deopt parameter listed in
+  the IR statepoint (same number as described by previous Constant)
+* Variable number of Locations pairs, one pair for each unique pointer
+  which needs relocated.  The first Location in each pair describes
+  the base pointer for the object.  The second is the derived pointer
+  actually being relocated.  It is guaranteed that the base pointer
+  must also appear explicitly as a relocation pair if used after the
+  statepoint. There may be fewer pairs then gc parameters in the IR
+  statepoint. Each *unique* pair will occur at least once; duplicates
+  are possible.
+
+Note that the Locations used in each section may describe the same
+physical location.  e.g. A stack slot may appear as a deopt location,
+a gc base pointer, and a gc derived pointer.
+
+The ID field of the 'StkMapRecord' for a statepoint is meaningless and
+it's value is explicitly unspecified.
+
+The LiveOut section of the StkMapRecord will be empty for a statepoint
+record.
+
+Safepoint Semantics & Verification
+==================================
+
+The fundamental correctness property for the compiled code's
+correctness w.r.t. the garbage collector is a dynamic one.  It must be
+the case that there is no dynamic trace such that a operation
+involving a potentially relocated pointer is observably-after a
+safepoint which could relocate it.  'observably-after' is this usage
+means that an outside observer could observe this sequence of events
+in a way which precludes the operation being performed before the
+safepoint.
+
+To understand why this 'observable-after' property is required,
+consider a null comparison performed on the original copy of a
+relocated pointer.  Assuming that control flow follows the safepoint,
+there is no way to observe externally whether the null comparison is
+performed before or after the safepoint.  (Remember, the original
+Value is unmodified by the safepoint.)  The compiler is free to make
+either scheduling choice.
+
+The actual correctness property implemented is slightly stronger than
+this.  We require that there be no *static path* on which a
+potentially relocated pointer is 'observably-after' it may have been
+relocated.  This is slightly stronger than is strictly necessary (and
+thus may disallow some otherwise valid programs), but greatly
+simplifies reasoning about correctness of the compiled code.
+
+By construction, this property will be upheld by the optimizer if
+correctly established in the source IR.  This is a key invariant of
+the design.
+
+The existing IR Verifier pass has been extended to check most of the
+local restrictions on the intrinsics mentioned in their respective
+documentation.  The current implementation in LLVM does not check the
+key relocation invariant, but this is ongoing work on developing such
+a verifier.  Please ask on llvmdev if you're interested in
+experimenting with the current version.
+
+.. _statepoint-utilities:
+
+Utility Passes for Safepoint Insertion
+======================================
+
+.. _RewriteStatepointsForGC:
+
+RewriteStatepointsForGC
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The pass RewriteStatepointsForGC transforms a functions IR by replacing a 
+``gc.statepoint`` (with an optional ``gc.result``) with a full relocation 
+sequence, including all required ``gc.relocates``.  To function, the pass 
+requires that the GC strategy specified for the function be able to reliably 
+distinguish between GC references and non-GC references in IR it is given.
+
+As an example, given this code:
+
+.. code-block:: llvm
+
+  define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) 
+         gc "statepoint-example" {
+    call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+    ret i8 addrspace(1)* %obj
+  }
+
+The pass would produce this IR:
+
+.. code-block:: llvm
+
+  define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) 
+         gc "statepoint-example" {
+    %0 = call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i8 addrspace(1)* %obj)
+    %obj.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32 %0, i32 9, i32 9)
+    ret i8 addrspace(1)* %obj.relocated
+  }
+
+In the above examples, the addrspace(1) marker on the pointers is the mechanism
+that the ``statepoint-example`` GC strategy uses to distinguish references from
+non references.  Address space 1 is not globally reserved for this purpose.
+
+This pass can be used an utility function by a language frontend that doesn't 
+want to manually reason about liveness, base pointers, or relocation when 
+constructing IR.  As currently implemented, RewriteStatepointsForGC must be 
+run after SSA construction (i.e. mem2ref).  
+
+
+In practice, RewriteStatepointsForGC can be run much later in the pass 
+pipeline, after most optimization is already done.  This helps to improve 
+the quality of the generated code when compiled with garbage collection support.
+In the long run, this is the intended usage model.  At this time, a few details
+have yet to be worked out about the semantic model required to guarantee this 
+is always correct.  As such, please use with caution and report bugs.
+
+.. _PlaceSafepoints:
+
+PlaceSafepoints
+^^^^^^^^^^^^^^^^
+
+The pass PlaceSafepoints transforms a function's IR by replacing any call or 
+invoke instructions with appropriate ``gc.statepoint`` and ``gc.result`` pairs,
+and inserting safepoint polls sufficient to ensure running code checks for a 
+safepoint request on a timely manner.  This pass is expected to be run before 
+RewriteStatepointsForGC and thus does not produce full relocation sequences.  
+
+As an example, given input IR of the following:
+
+.. code-block:: llvm
+
+  define void @test() gc "statepoint-example" {
+    call void @foo()
+    ret void
+  }
+
+  declare void @do_safepoint()
+  define void @gc.safepoint_poll() {
+    call void @do_safepoint()
+    ret void
+  }
+
+
+This pass would produce the following IR:
+
+.. code-block:: llvm
+
+  define void @test() gc "statepoint-example" {
+    %safepoint_token = call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @do_safepoint, i32 0, i32 0, i32 0)
+    %safepoint_token1 = call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo, i32 0, i32 0, i32 0)
+    ret void
+  }
+
+In this case, we've added an (unconditional) entry safepoint poll and converted the call into a ``gc.statepoint``.  Note that despite appearances, the entry poll is not necessarily redundant.  We'd have to know that ``foo`` and ``test`` were not mutually recursive for the poll to be redundant.  In practice, you'd probably want to your poll definition to contain a conditional branch of some form.
+
+
+At the moment, PlaceSafepoints can insert safepoint polls at method entry and 
+loop backedges locations.  Extending this to work with return polls would be 
+straight forward if desired.
+
+PlaceSafepoints includes a number of optimizations to avoid placing safepoint 
+polls at particular sites unless needed to ensure timely execution of a poll 
+under normal conditions.  PlaceSafepoints does not attempt to ensure timely 
+execution of a poll under worst case conditions such as heavy system paging.
+
+The implementation of a safepoint poll action is specified by looking up a 
+function of the name ``gc.safepoint_poll`` in the containing Module.  The body
+of this function is inserted at each poll site desired.  While calls or invokes
+inside this method are transformed to a ``gc.statepoints``, recursive poll 
+insertion is not performed.
+
+If you are scheduling the RewriteStatepointsForGC pass late in the pass order,
+you should probably schedule this pass immediately before it.  The exception 
+would be if you need to preserve abstract frame information (e.g. for
+deoptimization or introspection) at safepoints.  In that case, ask on the 
+llvmdev mailing list for suggestions.
+
+
+Bugs and Enhancements
+=====================
+
+Currently known bugs and enhancements under consideration can be
+tracked by performing a `bugzilla search
+<http://llvm.org/bugs/buglist.cgi?cmdtype=runnamed&namedcmd=Statepoint%20Bugs&list_id=64342>`_
+for [Statepoint] in the summary field. When filing new bugs, please
+use this tag so that interested parties see the newly filed bug.  As
+with most LLVM features, design discussions take place on `llvmdev
+<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_, and patches
+should be sent to `llvm-commits
+<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_ for review.
+
diff --git a/docs/TableGen/index.rst b/docs/TableGen/index.rst
index cda41b5..9526240 100644
--- a/docs/TableGen/index.rst
+++ b/docs/TableGen/index.rst
@@ -123,7 +123,6 @@ this (at the time of this writing):
     bit hasCtrlDep = 0;
     bit isNotDuplicable = 0;
     bit hasSideEffects = 0;
-    bit neverHasSideEffects = 0;
     InstrItinClass Itinerary = NoItinerary;
     string Constraints = "";
     string DisableEncoding = "";
diff --git a/docs/WritingAnLLVMPass.rst b/docs/WritingAnLLVMPass.rst
index ef2b953..1d5a52f 100644
--- a/docs/WritingAnLLVMPass.rst
+++ b/docs/WritingAnLLVMPass.rst
@@ -853,7 +853,7 @@ Example implementations of ``getAnalysisUsage``
   // This example modifies the program, but does not modify the CFG
   void LICM::getAnalysisUsage(AnalysisUsage &AU) const {
     AU.setPreservesCFG();
-    AU.addRequired<LoopInfo>();
+    AU.addRequired<LoopInfoWrapperPass>();
   }
 
 .. _writing-an-llvm-pass-getAnalysis:
@@ -870,7 +870,7 @@ you want, and returns a reference to that pass.  For example:
 .. code-block:: c++
 
   bool LICM::runOnFunction(Function &F) {
-    LoopInfo &LI = getAnalysis<LoopInfo>();
+    LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     //...
   }
 
diff --git a/docs/conf.py b/docs/conf.py
index 659c3e0..1897282 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -47,9 +47,9 @@ copyright = u'2003-2014, LLVM Project'
 # built documents.
 #
 # The short X.Y version.
-version = '3.6'
+version = '3.7'
 # The full version, including alpha/beta/rc tags.
-release = '3.6'
+release = '3.7'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/index.rst b/docs/index.rst
index 5ac5443..56567db 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -199,6 +199,8 @@ For developers of applications which use LLVM as a library.
   (`classes <http://llvm.org/doxygen/inherits.html>`_)
   (`tarball <http://llvm.org/doxygen/doxygen.tar.gz>`_)
 
+`Documentation for Go bindings <http://godoc.org/llvm.org/llvm/bindings/go/llvm>`_
+
 `ViewVC Repository Browser <http://llvm.org/viewvc/>`_
    ..
 
@@ -240,6 +242,9 @@ For API clients and LLVM developers.
    InAlloca
    BigEndianNEON
    CoverageMappingFormat
+   Statepoints
+   MergeFunctions
+   BitSets
 
 :doc:`WritingAnLLVMPass`
    Information on how to write LLVM transformations and analyses.
@@ -332,6 +337,16 @@ For API clients and LLVM developers.
 :doc:`CoverageMappingFormat`
   This describes the format and encoding used for LLVM’s code coverage mapping.
 
+:doc:`Statepoints`
+  This describes a set of experimental extensions for garbage
+  collection support.
+
+:doc:`MergeFunctions`
+  Describes functions merging optimization.
+
+:doc:`InAlloca`
+  Description of the ``inalloca`` argument attribute.
+
 Development Process Documentation
 =================================
 
diff --git a/docs/tutorial/LangImpl1.rst b/docs/tutorial/LangImpl1.rst
index a2c5eee..f4b0191 100644
--- a/docs/tutorial/LangImpl1.rst
+++ b/docs/tutorial/LangImpl1.rst
@@ -73,14 +73,21 @@ in the various pieces. The structure of the tutorial is:
    about this is how easy and trivial it is to construct SSA form in
    LLVM: no, LLVM does *not* require your front-end to construct SSA
    form!
--  `Chapter #8 <LangImpl8.html>`_: Conclusion and other useful LLVM
+-  `Chapter #8 <LangImpl8.html>`_: Extending the Language: Debug
+   Information - Having built a decent little programming language with
+   control flow, functions and mutable variables, we consider what it
+   takes to add debug information to standalone executables. This debug
+   information will allow you to set breakpoints in Kaleidoscope
+   functions, print out argument variables, and call functions - all
+   from within the debugger!
+-  `Chapter #9 <LangImpl8.html>`_: Conclusion and other useful LLVM
    tidbits - This chapter wraps up the series by talking about
    potential ways to extend the language, but also includes a bunch of
    pointers to info about "special topics" like adding garbage
    collection support, exceptions, debugging, support for "spaghetti
    stacks", and a bunch of other tips and tricks.
 
-By the end of the tutorial, we'll have written a bit less than 700 lines
+By the end of the tutorial, we'll have written a bit less than 1000 lines
 of non-comment, non-blank, lines of code. With this small amount of
 code, we'll have built up a very reasonable compiler for a non-trivial
 language including a hand-written lexer, parser, AST, as well as code
diff --git a/docs/tutorial/LangImpl4.rst b/docs/tutorial/LangImpl4.rst
index aa469ca..cdaac63 100644
--- a/docs/tutorial/LangImpl4.rst
+++ b/docs/tutorial/LangImpl4.rst
@@ -428,7 +428,7 @@ the LLVM JIT and optimizer. To build this example, use:
 .. code-block:: bash
 
     # Compile
-    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core jit native` -O3 -o toy
+    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core mcjit native` -O3 -o toy
     # Run
     ./toy
 
diff --git a/docs/tutorial/LangImpl5.rst b/docs/tutorial/LangImpl5.rst
index 2a3a4ce..72e34b1 100644
--- a/docs/tutorial/LangImpl5.rst
+++ b/docs/tutorial/LangImpl5.rst
@@ -736,7 +736,7 @@ the if/then/else and for expressions.. To build this example, use:
 .. code-block:: bash
 
     # Compile
-    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core jit native` -O3 -o toy
+    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core mcjit native` -O3 -o toy
     # Run
     ./toy
 
diff --git a/docs/tutorial/LangImpl6.rst b/docs/tutorial/LangImpl6.rst
index cdceb03..bf78bde 100644
--- a/docs/tutorial/LangImpl6.rst
+++ b/docs/tutorial/LangImpl6.rst
@@ -729,7 +729,7 @@ the if/then/else and for expressions.. To build this example, use:
 .. code-block:: bash
 
     # Compile
-    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core jit native` -O3 -o toy
+    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core mcjit native` -O3 -o toy
     # Run
     ./toy
 
diff --git a/docs/tutorial/LangImpl7.rst b/docs/tutorial/LangImpl7.rst
index c4c7233..c445908 100644
--- a/docs/tutorial/LangImpl7.rst
+++ b/docs/tutorial/LangImpl7.rst
@@ -847,7 +847,7 @@ mutable variables and var/in support. To build this example, use:
 .. code-block:: bash
 
     # Compile
-    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core jit native` -O3 -o toy
+    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core mcjit native` -O3 -o toy
     # Run
     ./toy
 
@@ -856,5 +856,5 @@ Here is the code:
 .. literalinclude:: ../../examples/Kaleidoscope/Chapter7/toy.cpp
    :language: c++
 
-`Next: Conclusion and other useful LLVM tidbits <LangImpl8.html>`_
+`Next: Adding Debug Information <LangImpl8.html>`_
 
diff --git a/docs/tutorial/LangImpl8.rst b/docs/tutorial/LangImpl8.rst
index 6f69493..4473035 100644
--- a/docs/tutorial/LangImpl8.rst
+++ b/docs/tutorial/LangImpl8.rst
@@ -1,267 +1,459 @@
-======================================================
-Kaleidoscope: Conclusion and other useful LLVM tidbits
-======================================================
+======================================
+Kaleidoscope: Adding Debug Information
+======================================
 
 .. contents::
    :local:
 
-Tutorial Conclusion
-===================
-
-Welcome to the final chapter of the "`Implementing a language with
-LLVM <index.html>`_" tutorial. In the course of this tutorial, we have
-grown our little Kaleidoscope language from being a useless toy, to
-being a semi-interesting (but probably still useless) toy. :)
-
-It is interesting to see how far we've come, and how little code it has
-taken. We built the entire lexer, parser, AST, code generator, and an
-interactive run-loop (with a JIT!) by-hand in under 700 lines of
-(non-comment/non-blank) code.
-
-Our little language supports a couple of interesting features: it
-supports user defined binary and unary operators, it uses JIT
-compilation for immediate evaluation, and it supports a few control flow
-constructs with SSA construction.
-
-Part of the idea of this tutorial was to show you how easy and fun it
-can be to define, build, and play with languages. Building a compiler
-need not be a scary or mystical process! Now that you've seen some of
-the basics, I strongly encourage you to take the code and hack on it.
-For example, try adding:
-
--  **global variables** - While global variables have questional value
-   in modern software engineering, they are often useful when putting
-   together quick little hacks like the Kaleidoscope compiler itself.
-   Fortunately, our current setup makes it very easy to add global
-   variables: just have value lookup check to see if an unresolved
-   variable is in the global variable symbol table before rejecting it.
-   To create a new global variable, make an instance of the LLVM
-   ``GlobalVariable`` class.
--  **typed variables** - Kaleidoscope currently only supports variables
-   of type double. This gives the language a very nice elegance, because
-   only supporting one type means that you never have to specify types.
-   Different languages have different ways of handling this. The easiest
-   way is to require the user to specify types for every variable
-   definition, and record the type of the variable in the symbol table
-   along with its Value\*.
--  **arrays, structs, vectors, etc** - Once you add types, you can start
-   extending the type system in all sorts of interesting ways. Simple
-   arrays are very easy and are quite useful for many different
-   applications. Adding them is mostly an exercise in learning how the
-   LLVM `getelementptr <../LangRef.html#i_getelementptr>`_ instruction
-   works: it is so nifty/unconventional, it `has its own
-   FAQ <../GetElementPtr.html>`_! If you add support for recursive types
-   (e.g. linked lists), make sure to read the `section in the LLVM
-   Programmer's Manual <../ProgrammersManual.html#TypeResolve>`_ that
-   describes how to construct them.
--  **standard runtime** - Our current language allows the user to access
-   arbitrary external functions, and we use it for things like "printd"
-   and "putchard". As you extend the language to add higher-level
-   constructs, often these constructs make the most sense if they are
-   lowered to calls into a language-supplied runtime. For example, if
-   you add hash tables to the language, it would probably make sense to
-   add the routines to a runtime, instead of inlining them all the way.
--  **memory management** - Currently we can only access the stack in
-   Kaleidoscope. It would also be useful to be able to allocate heap
-   memory, either with calls to the standard libc malloc/free interface
-   or with a garbage collector. If you would like to use garbage
-   collection, note that LLVM fully supports `Accurate Garbage
-   Collection <../GarbageCollection.html>`_ including algorithms that
-   move objects and need to scan/update the stack.
--  **debugger support** - LLVM supports generation of `DWARF Debug
-   info <../SourceLevelDebugging.html>`_ which is understood by common
-   debuggers like GDB. Adding support for debug info is fairly
-   straightforward. The best way to understand it is to compile some
-   C/C++ code with "``clang -g -O0``" and taking a look at what it
-   produces.
--  **exception handling support** - LLVM supports generation of `zero
-   cost exceptions <../ExceptionHandling.html>`_ which interoperate with
-   code compiled in other languages. You could also generate code by
-   implicitly making every function return an error value and checking
-   it. You could also make explicit use of setjmp/longjmp. There are
-   many different ways to go here.
--  **object orientation, generics, database access, complex numbers,
-   geometric programming, ...** - Really, there is no end of crazy
-   features that you can add to the language.
--  **unusual domains** - We've been talking about applying LLVM to a
-   domain that many people are interested in: building a compiler for a
-   specific language. However, there are many other domains that can use
-   compiler technology that are not typically considered. For example,
-   LLVM has been used to implement OpenGL graphics acceleration,
-   translate C++ code to ActionScript, and many other cute and clever
-   things. Maybe you will be the first to JIT compile a regular
-   expression interpreter into native code with LLVM?
-
-Have fun - try doing something crazy and unusual. Building a language
-like everyone else always has, is much less fun than trying something a
-little crazy or off the wall and seeing how it turns out. If you get
-stuck or want to talk about it, feel free to email the `llvmdev mailing
-list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_: it has lots
-of people who are interested in languages and are often willing to help
-out.
-
-Before we end this tutorial, I want to talk about some "tips and tricks"
-for generating LLVM IR. These are some of the more subtle things that
-may not be obvious, but are very useful if you want to take advantage of
-LLVM's capabilities.
-
-Properties of the LLVM IR
-=========================
-
-We have a couple common questions about code in the LLVM IR form - lets
-just get these out of the way right now, shall we?
-
-Target Independence
--------------------
-
-Kaleidoscope is an example of a "portable language": any program written
-in Kaleidoscope will work the same way on any target that it runs on.
-Many other languages have this property, e.g. lisp, java, haskell,
-javascript, python, etc (note that while these languages are portable,
-not all their libraries are).
-
-One nice aspect of LLVM is that it is often capable of preserving target
-independence in the IR: you can take the LLVM IR for a
-Kaleidoscope-compiled program and run it on any target that LLVM
-supports, even emitting C code and compiling that on targets that LLVM
-doesn't support natively. You can trivially tell that the Kaleidoscope
-compiler generates target-independent code because it never queries for
-any target-specific information when generating code.
-
-The fact that LLVM provides a compact, target-independent,
-representation for code gets a lot of people excited. Unfortunately,
-these people are usually thinking about C or a language from the C
-family when they are asking questions about language portability. I say
-"unfortunately", because there is really no way to make (fully general)
-C code portable, other than shipping the source code around (and of
-course, C source code is not actually portable in general either - ever
-port a really old application from 32- to 64-bits?).
-
-The problem with C (again, in its full generality) is that it is heavily
-laden with target specific assumptions. As one simple example, the
-preprocessor often destructively removes target-independence from the
-code when it processes the input text:
-
-.. code-block:: c
-
-    #ifdef __i386__
-      int X = 1;
-    #else
-      int X = 42;
-    #endif
-
-While it is possible to engineer more and more complex solutions to
-problems like this, it cannot be solved in full generality in a way that
-is better than shipping the actual source code.
-
-That said, there are interesting subsets of C that can be made portable.
-If you are willing to fix primitive types to a fixed size (say int =
-32-bits, and long = 64-bits), don't care about ABI compatibility with
-existing binaries, and are willing to give up some other minor features,
-you can have portable code. This can make sense for specialized domains
-such as an in-kernel language.
-
-Safety Guarantees
------------------
-
-Many of the languages above are also "safe" languages: it is impossible
-for a program written in Java to corrupt its address space and crash the
-process (assuming the JVM has no bugs). Safety is an interesting
-property that requires a combination of language design, runtime
-support, and often operating system support.
-
-It is certainly possible to implement a safe language in LLVM, but LLVM
-IR does not itself guarantee safety. The LLVM IR allows unsafe pointer
-casts, use after free bugs, buffer over-runs, and a variety of other
-problems. Safety needs to be implemented as a layer on top of LLVM and,
-conveniently, several groups have investigated this. Ask on the `llvmdev
-mailing list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ if
-you are interested in more details.
-
-Language-Specific Optimizations
--------------------------------
-
-One thing about LLVM that turns off many people is that it does not
-solve all the world's problems in one system (sorry 'world hunger',
-someone else will have to solve you some other day). One specific
-complaint is that people perceive LLVM as being incapable of performing
-high-level language-specific optimization: LLVM "loses too much
-information".
-
-Unfortunately, this is really not the place to give you a full and
-unified version of "Chris Lattner's theory of compiler design". Instead,
-I'll make a few observations:
-
-First, you're right that LLVM does lose information. For example, as of
-this writing, there is no way to distinguish in the LLVM IR whether an
-SSA-value came from a C "int" or a C "long" on an ILP32 machine (other
-than debug info). Both get compiled down to an 'i32' value and the
-information about what it came from is lost. The more general issue
-here, is that the LLVM type system uses "structural equivalence" instead
-of "name equivalence". Another place this surprises people is if you
-have two types in a high-level language that have the same structure
-(e.g. two different structs that have a single int field): these types
-will compile down into a single LLVM type and it will be impossible to
-tell what it came from.
-
-Second, while LLVM does lose information, LLVM is not a fixed target: we
-continue to enhance and improve it in many different ways. In addition
-to adding new features (LLVM did not always support exceptions or debug
-info), we also extend the IR to capture important information for
-optimization (e.g. whether an argument is sign or zero extended,
-information about pointers aliasing, etc). Many of the enhancements are
-user-driven: people want LLVM to include some specific feature, so they
-go ahead and extend it.
-
-Third, it is *possible and easy* to add language-specific optimizations,
-and you have a number of choices in how to do it. As one trivial
-example, it is easy to add language-specific optimization passes that
-"know" things about code compiled for a language. In the case of the C
-family, there is an optimization pass that "knows" about the standard C
-library functions. If you call "exit(0)" in main(), it knows that it is
-safe to optimize that into "return 0;" because C specifies what the
-'exit' function does.
-
-In addition to simple library knowledge, it is possible to embed a
-variety of other language-specific information into the LLVM IR. If you
-have a specific need and run into a wall, please bring the topic up on
-the llvmdev list. At the very worst, you can always treat LLVM as if it
-were a "dumb code generator" and implement the high-level optimizations
-you desire in your front-end, on the language-specific AST.
-
-Tips and Tricks
-===============
-
-There is a variety of useful tips and tricks that you come to know after
-working on/with LLVM that aren't obvious at first glance. Instead of
-letting everyone rediscover them, this section talks about some of these
-issues.
-
-Implementing portable offsetof/sizeof
--------------------------------------
-
-One interesting thing that comes up, if you are trying to keep the code
-generated by your compiler "target independent", is that you often need
-to know the size of some LLVM type or the offset of some field in an
-llvm structure. For example, you might need to pass the size of a type
-into a function that allocates memory.
-
-Unfortunately, this can vary widely across targets: for example the
-width of a pointer is trivially target-specific. However, there is a
-`clever way to use the getelementptr
-instruction <http://nondot.org/sabre/LLVMNotes/SizeOf-OffsetOf-VariableSizedStructs.txt>`_
-that allows you to compute this in a portable way.
-
-Garbage Collected Stack Frames
-------------------------------
-
-Some languages want to explicitly manage their stack frames, often so
-that they are garbage collected or to allow easy implementation of
-closures. There are often better ways to implement these features than
-explicit stack frames, but `LLVM does support
-them, <http://nondot.org/sabre/LLVMNotes/ExplicitlyManagedStackFrames.txt>`_
-if you want. It requires your front-end to convert the code into
-`Continuation Passing
-Style <http://en.wikipedia.org/wiki/Continuation-passing_style>`_ and
-the use of tail calls (which LLVM also supports).
+Chapter 8 Introduction
+======================
+
+Welcome to Chapter 8 of the "`Implementing a language with
+LLVM <index.html>`_" tutorial. In chapters 1 through 7, we've built a
+decent little programming language with functions and variables.
+What happens if something goes wrong though, how do you debug your
+program?
+
+Source level debugging uses formatted data that helps a debugger
+translate from binary and the state of the machine back to the
+source that the programmer wrote. In LLVM we generally use a format
+called `DWARF <http://dwarfstd.org>`_. DWARF is a compact encoding
+that represents types, source locations, and variable locations. 
+
+The short summary of this chapter is that we'll go through the
+various things you have to add to a programming language to
+support debug info, and how you translate that into DWARF.
+
+Caveat: For now we can't debug via the JIT, so we'll need to compile
+our program down to something small and standalone. As part of this
+we'll make a few modifications to the running of the language and
+how programs are compiled. This means that we'll have a source file
+with a simple program written in Kaleidoscope rather than the
+interactive JIT. It does involve a limitation that we can only
+have one "top level" command at a time to reduce the number of
+changes necessary.
+
+Here's the sample program we'll be compiling:
+
+.. code-block:: python
+
+   def fib(x)
+     if x < 3 then
+       1
+     else
+       fib(x-1)+fib(x-2);
+
+   fib(10)
+
+
+Why is this a hard problem?
+===========================
+
+Debug information is a hard problem for a few different reasons - mostly
+centered around optimized code. First, optimization makes keeping source
+locations more difficult. In LLVM IR we keep the original source location
+for each IR level instruction on the instruction. Optimization passes
+should keep the source locations for newly created instructions, but merged
+instructions only get to keep a single location - this can cause jumping
+around when stepping through optimized programs. Secondly, optimization
+can move variables in ways that are either optimized out, shared in memory
+with other variables, or difficult to track. For the purposes of this
+tutorial we're going to avoid optimization (as you'll see with one of the
+next sets of patches).
+
+Ahead-of-Time Compilation Mode
+==============================
+
+To highlight only the aspects of adding debug information to a source
+language without needing to worry about the complexities of JIT debugging
+we're going to make a few changes to Kaleidoscope to support compiling
+the IR emitted by the front end into a simple standalone program that
+you can execute, debug, and see results.
+
+First we make our anonymous function that contains our top level
+statement be our "main":
+
+.. code-block:: udiff
+
+  -    PrototypeAST *Proto = new PrototypeAST("", std::vector<std::string>());
+  +    PrototypeAST *Proto = new PrototypeAST("main", std::vector<std::string>());
+
+just with the simple change of giving it a name.
+
+Then we're going to remove the command line code wherever it exists:
+
+.. code-block:: udiff
+
+  @@ -1129,7 +1129,6 @@ static void HandleTopLevelExpression() {
+   /// top ::= definition | external | expression | ';'
+   static void MainLoop() {
+     while (1) {
+  -    fprintf(stderr, "ready> ");
+       switch (CurTok) {
+       case tok_eof:
+         return;
+  @@ -1184,7 +1183,6 @@ int main() {
+     BinopPrecedence['*'] = 40; // highest.
+ 
+     // Prime the first token.
+  -  fprintf(stderr, "ready> ");
+     getNextToken();
+ 
+Lastly we're going to disable all of the optimization passes and the JIT so
+that the only thing that happens after we're done parsing and generating
+code is that the llvm IR goes to standard error:
+
+.. code-block:: udiff
+
+  @@ -1108,17 +1108,8 @@ static void HandleExtern() {
+   static void HandleTopLevelExpression() {
+     // Evaluate a top-level expression into an anonymous function.
+     if (FunctionAST *F = ParseTopLevelExpr()) {
+  -    if (Function *LF = F->Codegen()) {
+  -      // We're just doing this to make sure it executes.
+  -      TheExecutionEngine->finalizeObject();
+  -      // JIT the function, returning a function pointer.
+  -      void *FPtr = TheExecutionEngine->getPointerToFunction(LF);
+  -
+  -      // Cast it to the right type (takes no arguments, returns a double) so we
+  -      // can call it as a native function.
+  -      double (*FP)() = (double (*)())(intptr_t)FPtr;
+  -      // Ignore the return value for this.
+  -      (void)FP;
+  +    if (!F->Codegen()) {
+  +      fprintf(stderr, "Error generating code for top level expr");
+       }
+     } else {
+       // Skip token for error recovery.
+  @@ -1439,11 +1459,11 @@ int main() {
+     // target lays out data structures.
+     TheModule->setDataLayout(TheExecutionEngine->getDataLayout());
+     OurFPM.add(new DataLayoutPass());
+  +#if 0
+     OurFPM.add(createBasicAliasAnalysisPass());
+     // Promote allocas to registers.
+     OurFPM.add(createPromoteMemoryToRegisterPass());
+  @@ -1218,7 +1210,7 @@ int main() {
+     OurFPM.add(createGVNPass());
+     // Simplify the control flow graph (deleting unreachable blocks, etc).
+     OurFPM.add(createCFGSimplificationPass());
+  -
+  +  #endif
+     OurFPM.doInitialization();
+ 
+     // Set the global so the code gen can use this.
+
+This relatively small set of changes get us to the point that we can compile
+our piece of Kaleidoscope language down to an executable program via this
+command line:
+
+.. code-block:: bash
+
+  Kaleidoscope-Ch8 < fib.ks | & clang -x ir -
+
+which gives an a.out/a.exe in the current working directory.
+
+Compile Unit
+============
+
+The top level container for a section of code in DWARF is a compile unit.
+This contains the type and function data for an individual translation unit
+(read: one file of source code). So the first thing we need to do is
+construct one for our fib.ks file.
+
+DWARF Emission Setup
+====================
+
+Similar to the ``IRBuilder`` class we have a
+```DIBuilder`` <http://llvm.org/doxygen/classllvm_1_1DIBuilder.html>`_ class
+that helps in constructing debug metadata for an llvm IR file. It
+corresponds 1:1 similarly to ``IRBuilder`` and llvm IR, but with nicer names.
+Using it does require that you be more familiar with DWARF terminology than
+you needed to be with ``IRBuilder`` and ``Instruction`` names, but if you
+read through the general documentation on the
+```Metadata Format`` <http://llvm.org/docs/SourceLevelDebugging.html>`_ it
+should be a little more clear. We'll be using this class to construct all
+of our IR level descriptions. Construction for it takes a module so we
+need to construct it shortly after we construct our module. We've left it
+as a global static variable to make it a bit easier to use.
+
+Next we're going to create a small container to cache some of our frequent
+data. The first will be our compile unit, but we'll also write a bit of
+code for our one type since we won't have to worry about multiple typed
+expressions:
+
+.. code-block:: c++
+
+  static DIBuilder *DBuilder;
+
+  struct DebugInfo {
+    DICompileUnit TheCU;
+    DIType DblTy;
+
+    DIType getDoubleTy();
+  } KSDbgInfo;
+
+  DIType DebugInfo::getDoubleTy() {
+    if (DblTy.isValid())
+      return DblTy;
+
+    DblTy = DBuilder->createBasicType("double", 64, 64, dwarf::DW_ATE_float);
+    return DblTy;
+  }
+
+And then later on in ``main`` when we're constructing our module:
+
+.. code-block:: c++
+
+  DBuilder = new DIBuilder(*TheModule);
+
+  KSDbgInfo.TheCU = DBuilder->createCompileUnit(
+      dwarf::DW_LANG_C, "fib.ks", ".", "Kaleidoscope Compiler", 0, "", 0);
+
+There are a couple of things to note here. First, while we're producing a
+compile unit for a language called Kaleidoscope we used the language
+constant for C. This is because a debugger wouldn't necessarily understand
+the calling conventions or default ABI for a language it doesn't recognize
+and we follow the C ABI in our llvm code generation so it's the closest
+thing to accurate. This ensures we can actually call functions from the
+debugger and have them execute. Secondly, you'll see the "fib.ks" in the
+call to ``createCompileUnit``. This is a default hard coded value since
+we're using shell redirection to put our source into the Kaleidoscope
+compiler. In a usual front end you'd have an input file name and it would
+go there.
+
+One last thing as part of emitting debug information via DIBuilder is that
+we need to "finalize" the debug information. The reasons are part of the
+underlying API for DIBuilder, but make sure you do this near the end of
+main:
+
+.. code-block:: c++
+
+  DBuilder->finalize();
+
+before you dump out the module.
+
+Functions
+=========
+
+Now that we have our ``Compile Unit`` and our source locations, we can add
+function definitions to the debug info. So in ``PrototypeAST::Codegen`` we
+add a few lines of code to describe a context for our subprogram, in this
+case the "File", and the actual definition of the function itself.
+
+So the context:
+
+.. code-block:: c++
+
+  DIFile Unit = DBuilder->createFile(KSDbgInfo.TheCU.getFilename(),
+                                     KSDbgInfo.TheCU.getDirectory());
+
+giving us a DIFile and asking the ``Compile Unit`` we created above for the
+directory and filename where we are currently. Then, for now, we use some
+source locations of 0 (since our AST doesn't currently have source location
+information) and construct our function definition:
+
+.. code-block:: c++
+
+  DIDescriptor FContext(Unit);
+  unsigned LineNo = 0;
+  unsigned ScopeLine = 0;
+  DISubprogram SP = DBuilder->createFunction(
+      FContext, Name, StringRef(), Unit, LineNo,
+      CreateFunctionType(Args.size(), Unit), false /* internal linkage */,
+      true /* definition */, ScopeLine, DIDescriptor::FlagPrototyped, false, F);
+
+and we now have a DISubprogram that contains a reference to all of our metadata
+for the function.
+
+Source Locations
+================
+
+The most important thing for debug information is accurate source location -
+this makes it possible to map your source code back. We have a problem though,
+Kaleidoscope really doesn't have any source location information in the lexer
+or parser so we'll need to add it.
+
+.. code-block:: c++
+
+   struct SourceLocation {
+     int Line;
+     int Col;
+   };
+   static SourceLocation CurLoc;
+   static SourceLocation LexLoc = {1, 0};
+
+   static int advance() {
+     int LastChar = getchar();
+
+     if (LastChar == '\n' || LastChar == '\r') {
+       LexLoc.Line++;
+       LexLoc.Col = 0;
+     } else
+       LexLoc.Col++;
+     return LastChar;
+   }
+
+In this set of code we've added some functionality on how to keep track of the
+line and column of the "source file". As we lex every token we set our current
+current "lexical location" to the assorted line and column for the beginning
+of the token. We do this by overriding all of the previous calls to
+``getchar()`` with our new ``advance()`` that keeps track of the information
+and then we have added to all of our AST classes a source location:
+
+.. code-block:: c++
+
+   class ExprAST {
+     SourceLocation Loc;
+
+     public:
+       int getLine() const { return Loc.Line; }
+       int getCol() const { return Loc.Col; }
+       ExprAST(SourceLocation Loc = CurLoc) : Loc(Loc) {}
+       virtual std::ostream &dump(std::ostream &out, int ind) {
+         return out << ':' << getLine() << ':' << getCol() << '\n';
+       }
+
+that we pass down through when we create a new expression:
+
+.. code-block:: c++
+
+   LHS = new BinaryExprAST(BinLoc, BinOp, LHS, RHS);
+
+giving us locations for each of our expressions and variables.
+
+From this we can make sure to tell ``DIBuilder`` when we're at a new source
+location so it can use that when we generate the rest of our code and make
+sure that each instruction has source location information. We do this
+by constructing another small function:
+
+.. code-block:: c++
+
+  void DebugInfo::emitLocation(ExprAST *AST) {
+    DIScope *Scope;
+    if (LexicalBlocks.empty())
+      Scope = &TheCU;
+    else
+      Scope = LexicalBlocks.back();
+    Builder.SetCurrentDebugLocation(
+        DebugLoc::get(AST->getLine(), AST->getCol(), DIScope(*Scope)));
+  }
+
+that both tells the main ``IRBuilder`` where we are, but also what scope
+we're in. Since we've just created a function above we can either be in
+the main file scope (like when we created our function), or now we can be
+in the function scope we just created. To represent this we create a stack
+of scopes:
+
+.. code-block:: c++
+
+   std::vector<DIScope *> LexicalBlocks;
+   std::map<const PrototypeAST *, DIScope> FnScopeMap;
+
+and keep a map of each function to the scope that it represents (a DISubprogram
+is also a DIScope).
+
+Then we make sure to:
+
+.. code-block:: c++
+
+   KSDbgInfo.emitLocation(this);
+
+emit the location every time we start to generate code for a new AST, and
+also:
+
+.. code-block:: c++
+
+  KSDbgInfo.FnScopeMap[this] = SP;
+
+store the scope (function) when we create it and use it:
+
+  KSDbgInfo.LexicalBlocks.push_back(&KSDbgInfo.FnScopeMap[Proto]);
+
+when we start generating the code for each function.
+
+also, don't forget to pop the scope back off of your scope stack at the
+end of the code generation for the function:
+
+.. code-block:: c++
+
+  // Pop off the lexical block for the function since we added it
+  // unconditionally.
+  KSDbgInfo.LexicalBlocks.pop_back();
+
+Variables
+=========
+
+Now that we have functions, we need to be able to print out the variables
+we have in scope. Let's get our function arguments set up so we can get
+decent backtraces and see how our functions are being called. It isn't
+a lot of code, and we generally handle it when we're creating the
+argument allocas in ``PrototypeAST::CreateArgumentAllocas``.
+
+.. code-block:: c++
+
+  DIScope *Scope = KSDbgInfo.LexicalBlocks.back();
+  DIFile Unit = DBuilder->createFile(KSDbgInfo.TheCU.getFilename(),
+                                     KSDbgInfo.TheCU.getDirectory());
+  DIVariable D = DBuilder->createLocalVariable(dwarf::DW_TAG_arg_variable,
+                                               *Scope, Args[Idx], Unit, Line,
+                                               KSDbgInfo.getDoubleTy(), Idx);
+
+  Instruction *Call = DBuilder->insertDeclare(
+      Alloca, D, DBuilder->createExpression(), Builder.GetInsertBlock());
+  Call->setDebugLoc(DebugLoc::get(Line, 0, *Scope));
+
+Here we're doing a few things. First, we're grabbing our current scope
+for the variable so we can say what range of code our variable is valid
+through. Second, we're creating the variable, giving it the scope,
+the name, source location, type, and since it's an argument, the argument
+index. Third, we create an ``lvm.dbg.declare`` call to indicate at the IR
+level that we've got a variable in an alloca (and it gives a starting
+location for the variable). Lastly, we set a source location for the
+beginning of the scope on the declare.
+
+One interesting thing to note at this point is that various debuggers have
+assumptions based on how code and debug information was generated for them
+in the past. In this case we need to do a little bit of a hack to avoid
+generating line information for the function prologue so that the debugger
+knows to skip over those instructions when setting a breakpoint. So in
+``FunctionAST::CodeGen`` we add a couple of lines:
+
+.. code-block:: c++
+
+  // Unset the location for the prologue emission (leading instructions with no
+  // location in a function are considered part of the prologue and the debugger
+  // will run past them when breaking on a function)
+  KSDbgInfo.emitLocation(nullptr);
+
+and then emit a new location when we actually start generating code for the
+body of the function:
+
+.. code-block:: c++
+
+  KSDbgInfo.emitLocation(Body);
+
+With this we have enough debug information to set breakpoints in functions,
+print out argument variables, and call functions. Not too bad for just a
+few simple lines of code!
+
+Full Code Listing
+=================
+
+Here is the complete code listing for our running example, enhanced with
+debug information. To build this example, use:
+
+.. code-block:: bash
+
+    # Compile
+    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core mcjit native` -O3 -o toy
+    # Run
+    ./toy
+
+Here is the code:
+
+.. literalinclude:: ../../examples/Kaleidoscope/Chapter8/toy.cpp
+   :language: c++
+
+`Next: Conclusion and other useful LLVM tidbits <LangImpl9.html>`_
 
diff --git a/docs/tutorial/LangImpl9.rst b/docs/tutorial/LangImpl9.rst
new file mode 100644
index 0000000..3398768
--- /dev/null
+++ b/docs/tutorial/LangImpl9.rst
@@ -0,0 +1,262 @@
+======================================================
+Kaleidoscope: Conclusion and other useful LLVM tidbits
+======================================================
+
+.. contents::
+   :local:
+
+Tutorial Conclusion
+===================
+
+Welcome to the final chapter of the "`Implementing a language with
+LLVM <index.html>`_" tutorial. In the course of this tutorial, we have
+grown our little Kaleidoscope language from being a useless toy, to
+being a semi-interesting (but probably still useless) toy. :)
+
+It is interesting to see how far we've come, and how little code it has
+taken. We built the entire lexer, parser, AST, code generator, an
+interactive run-loop (with a JIT!), and emitted debug information in
+standalone executables - all in under 1000 lines of (non-comment/non-blank)
+code.
+
+Our little language supports a couple of interesting features: it
+supports user defined binary and unary operators, it uses JIT
+compilation for immediate evaluation, and it supports a few control flow
+constructs with SSA construction.
+
+Part of the idea of this tutorial was to show you how easy and fun it
+can be to define, build, and play with languages. Building a compiler
+need not be a scary or mystical process! Now that you've seen some of
+the basics, I strongly encourage you to take the code and hack on it.
+For example, try adding:
+
+-  **global variables** - While global variables have questional value
+   in modern software engineering, they are often useful when putting
+   together quick little hacks like the Kaleidoscope compiler itself.
+   Fortunately, our current setup makes it very easy to add global
+   variables: just have value lookup check to see if an unresolved
+   variable is in the global variable symbol table before rejecting it.
+   To create a new global variable, make an instance of the LLVM
+   ``GlobalVariable`` class.
+-  **typed variables** - Kaleidoscope currently only supports variables
+   of type double. This gives the language a very nice elegance, because
+   only supporting one type means that you never have to specify types.
+   Different languages have different ways of handling this. The easiest
+   way is to require the user to specify types for every variable
+   definition, and record the type of the variable in the symbol table
+   along with its Value\*.
+-  **arrays, structs, vectors, etc** - Once you add types, you can start
+   extending the type system in all sorts of interesting ways. Simple
+   arrays are very easy and are quite useful for many different
+   applications. Adding them is mostly an exercise in learning how the
+   LLVM `getelementptr <../LangRef.html#i_getelementptr>`_ instruction
+   works: it is so nifty/unconventional, it `has its own
+   FAQ <../GetElementPtr.html>`_! If you add support for recursive types
+   (e.g. linked lists), make sure to read the `section in the LLVM
+   Programmer's Manual <../ProgrammersManual.html#TypeResolve>`_ that
+   describes how to construct them.
+-  **standard runtime** - Our current language allows the user to access
+   arbitrary external functions, and we use it for things like "printd"
+   and "putchard". As you extend the language to add higher-level
+   constructs, often these constructs make the most sense if they are
+   lowered to calls into a language-supplied runtime. For example, if
+   you add hash tables to the language, it would probably make sense to
+   add the routines to a runtime, instead of inlining them all the way.
+-  **memory management** - Currently we can only access the stack in
+   Kaleidoscope. It would also be useful to be able to allocate heap
+   memory, either with calls to the standard libc malloc/free interface
+   or with a garbage collector. If you would like to use garbage
+   collection, note that LLVM fully supports `Accurate Garbage
+   Collection <../GarbageCollection.html>`_ including algorithms that
+   move objects and need to scan/update the stack.
+-  **exception handling support** - LLVM supports generation of `zero
+   cost exceptions <../ExceptionHandling.html>`_ which interoperate with
+   code compiled in other languages. You could also generate code by
+   implicitly making every function return an error value and checking
+   it. You could also make explicit use of setjmp/longjmp. There are
+   many different ways to go here.
+-  **object orientation, generics, database access, complex numbers,
+   geometric programming, ...** - Really, there is no end of crazy
+   features that you can add to the language.
+-  **unusual domains** - We've been talking about applying LLVM to a
+   domain that many people are interested in: building a compiler for a
+   specific language. However, there are many other domains that can use
+   compiler technology that are not typically considered. For example,
+   LLVM has been used to implement OpenGL graphics acceleration,
+   translate C++ code to ActionScript, and many other cute and clever
+   things. Maybe you will be the first to JIT compile a regular
+   expression interpreter into native code with LLVM?
+
+Have fun - try doing something crazy and unusual. Building a language
+like everyone else always has, is much less fun than trying something a
+little crazy or off the wall and seeing how it turns out. If you get
+stuck or want to talk about it, feel free to email the `llvmdev mailing
+list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_: it has lots
+of people who are interested in languages and are often willing to help
+out.
+
+Before we end this tutorial, I want to talk about some "tips and tricks"
+for generating LLVM IR. These are some of the more subtle things that
+may not be obvious, but are very useful if you want to take advantage of
+LLVM's capabilities.
+
+Properties of the LLVM IR
+=========================
+
+We have a couple common questions about code in the LLVM IR form - lets
+just get these out of the way right now, shall we?
+
+Target Independence
+-------------------
+
+Kaleidoscope is an example of a "portable language": any program written
+in Kaleidoscope will work the same way on any target that it runs on.
+Many other languages have this property, e.g. lisp, java, haskell,
+javascript, python, etc (note that while these languages are portable,
+not all their libraries are).
+
+One nice aspect of LLVM is that it is often capable of preserving target
+independence in the IR: you can take the LLVM IR for a
+Kaleidoscope-compiled program and run it on any target that LLVM
+supports, even emitting C code and compiling that on targets that LLVM
+doesn't support natively. You can trivially tell that the Kaleidoscope
+compiler generates target-independent code because it never queries for
+any target-specific information when generating code.
+
+The fact that LLVM provides a compact, target-independent,
+representation for code gets a lot of people excited. Unfortunately,
+these people are usually thinking about C or a language from the C
+family when they are asking questions about language portability. I say
+"unfortunately", because there is really no way to make (fully general)
+C code portable, other than shipping the source code around (and of
+course, C source code is not actually portable in general either - ever
+port a really old application from 32- to 64-bits?).
+
+The problem with C (again, in its full generality) is that it is heavily
+laden with target specific assumptions. As one simple example, the
+preprocessor often destructively removes target-independence from the
+code when it processes the input text:
+
+.. code-block:: c
+
+    #ifdef __i386__
+      int X = 1;
+    #else
+      int X = 42;
+    #endif
+
+While it is possible to engineer more and more complex solutions to
+problems like this, it cannot be solved in full generality in a way that
+is better than shipping the actual source code.
+
+That said, there are interesting subsets of C that can be made portable.
+If you are willing to fix primitive types to a fixed size (say int =
+32-bits, and long = 64-bits), don't care about ABI compatibility with
+existing binaries, and are willing to give up some other minor features,
+you can have portable code. This can make sense for specialized domains
+such as an in-kernel language.
+
+Safety Guarantees
+-----------------
+
+Many of the languages above are also "safe" languages: it is impossible
+for a program written in Java to corrupt its address space and crash the
+process (assuming the JVM has no bugs). Safety is an interesting
+property that requires a combination of language design, runtime
+support, and often operating system support.
+
+It is certainly possible to implement a safe language in LLVM, but LLVM
+IR does not itself guarantee safety. The LLVM IR allows unsafe pointer
+casts, use after free bugs, buffer over-runs, and a variety of other
+problems. Safety needs to be implemented as a layer on top of LLVM and,
+conveniently, several groups have investigated this. Ask on the `llvmdev
+mailing list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ if
+you are interested in more details.
+
+Language-Specific Optimizations
+-------------------------------
+
+One thing about LLVM that turns off many people is that it does not
+solve all the world's problems in one system (sorry 'world hunger',
+someone else will have to solve you some other day). One specific
+complaint is that people perceive LLVM as being incapable of performing
+high-level language-specific optimization: LLVM "loses too much
+information".
+
+Unfortunately, this is really not the place to give you a full and
+unified version of "Chris Lattner's theory of compiler design". Instead,
+I'll make a few observations:
+
+First, you're right that LLVM does lose information. For example, as of
+this writing, there is no way to distinguish in the LLVM IR whether an
+SSA-value came from a C "int" or a C "long" on an ILP32 machine (other
+than debug info). Both get compiled down to an 'i32' value and the
+information about what it came from is lost. The more general issue
+here, is that the LLVM type system uses "structural equivalence" instead
+of "name equivalence". Another place this surprises people is if you
+have two types in a high-level language that have the same structure
+(e.g. two different structs that have a single int field): these types
+will compile down into a single LLVM type and it will be impossible to
+tell what it came from.
+
+Second, while LLVM does lose information, LLVM is not a fixed target: we
+continue to enhance and improve it in many different ways. In addition
+to adding new features (LLVM did not always support exceptions or debug
+info), we also extend the IR to capture important information for
+optimization (e.g. whether an argument is sign or zero extended,
+information about pointers aliasing, etc). Many of the enhancements are
+user-driven: people want LLVM to include some specific feature, so they
+go ahead and extend it.
+
+Third, it is *possible and easy* to add language-specific optimizations,
+and you have a number of choices in how to do it. As one trivial
+example, it is easy to add language-specific optimization passes that
+"know" things about code compiled for a language. In the case of the C
+family, there is an optimization pass that "knows" about the standard C
+library functions. If you call "exit(0)" in main(), it knows that it is
+safe to optimize that into "return 0;" because C specifies what the
+'exit' function does.
+
+In addition to simple library knowledge, it is possible to embed a
+variety of other language-specific information into the LLVM IR. If you
+have a specific need and run into a wall, please bring the topic up on
+the llvmdev list. At the very worst, you can always treat LLVM as if it
+were a "dumb code generator" and implement the high-level optimizations
+you desire in your front-end, on the language-specific AST.
+
+Tips and Tricks
+===============
+
+There is a variety of useful tips and tricks that you come to know after
+working on/with LLVM that aren't obvious at first glance. Instead of
+letting everyone rediscover them, this section talks about some of these
+issues.
+
+Implementing portable offsetof/sizeof
+-------------------------------------
+
+One interesting thing that comes up, if you are trying to keep the code
+generated by your compiler "target independent", is that you often need
+to know the size of some LLVM type or the offset of some field in an
+llvm structure. For example, you might need to pass the size of a type
+into a function that allocates memory.
+
+Unfortunately, this can vary widely across targets: for example the
+width of a pointer is trivially target-specific. However, there is a
+`clever way to use the getelementptr
+instruction <http://nondot.org/sabre/LLVMNotes/SizeOf-OffsetOf-VariableSizedStructs.txt>`_
+that allows you to compute this in a portable way.
+
+Garbage Collected Stack Frames
+------------------------------
+
+Some languages want to explicitly manage their stack frames, often so
+that they are garbage collected or to allow easy implementation of
+closures. There are often better ways to implement these features than
+explicit stack frames, but `LLVM does support
+them, <http://nondot.org/sabre/LLVMNotes/ExplicitlyManagedStackFrames.txt>`_
+if you want. It requires your front-end to convert the code into
+`Continuation Passing
+Style <http://en.wikipedia.org/wiki/Continuation-passing_style>`_ and
+the use of tail calls (which LLVM also supports).
+
diff --git a/examples/ExceptionDemo/ExceptionDemo.cpp b/examples/ExceptionDemo/ExceptionDemo.cpp
index 17076fa..317a326 100644
--- a/examples/ExceptionDemo/ExceptionDemo.cpp
+++ b/examples/ExceptionDemo/ExceptionDemo.cpp
@@ -56,8 +56,8 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Target/TargetOptions.h"
@@ -1122,14 +1122,11 @@ static llvm::BasicBlock *createCatchBlock(llvm::LLVMContext &context,
 /// @param numExceptionsToCatch length of exceptionTypesToCatch array
 /// @param exceptionTypesToCatch array of type info types to "catch"
 /// @returns generated function
-static
-llvm::Function *createCatchWrappedInvokeFunction(llvm::Module &module,
-                                             llvm::IRBuilder<> &builder,
-                                             llvm::FunctionPassManager &fpm,
-                                             llvm::Function &toInvoke,
-                                             std::string ourId,
-                                             unsigned numExceptionsToCatch,
-                                             unsigned exceptionTypesToCatch[]) {
+static llvm::Function *createCatchWrappedInvokeFunction(
+    llvm::Module &module, llvm::IRBuilder<> &builder,
+    llvm::legacy::FunctionPassManager &fpm, llvm::Function &toInvoke,
+    std::string ourId, unsigned numExceptionsToCatch,
+    unsigned exceptionTypesToCatch[]) {
 
   llvm::LLVMContext &context = module.getContext();
   llvm::Function *toPrint32Int = module.getFunction("print32Int");
@@ -1389,13 +1386,11 @@ llvm::Function *createCatchWrappedInvokeFunction(llvm::Module &module,
 /// @param nativeThrowFunct function which will throw a foreign exception
 ///        if the above nativeThrowType matches generated function's arg.
 /// @returns generated function
-static
-llvm::Function *createThrowExceptionFunction(llvm::Module &module,
-                                             llvm::IRBuilder<> &builder,
-                                             llvm::FunctionPassManager &fpm,
-                                             std::string ourId,
-                                             int32_t nativeThrowType,
-                                             llvm::Function &nativeThrowFunct) {
+static llvm::Function *
+createThrowExceptionFunction(llvm::Module &module, llvm::IRBuilder<> &builder,
+                             llvm::legacy::FunctionPassManager &fpm,
+                             std::string ourId, int32_t nativeThrowType,
+                             llvm::Function &nativeThrowFunct) {
   llvm::LLVMContext &context = module.getContext();
   namedValues.clear();
   ArgTypes unwindArgTypes;
@@ -1508,10 +1503,10 @@ static void createStandardUtilityFunctions(unsigned numTypeInfos,
 /// @param nativeThrowFunctName name of external function which will throw
 ///        a foreign exception
 /// @returns outermost generated test function.
-llvm::Function *createUnwindExceptionTest(llvm::Module &module,
-                                          llvm::IRBuilder<> &builder,
-                                          llvm::FunctionPassManager &fpm,
-                                          std::string nativeThrowFunctName) {
+llvm::Function *
+createUnwindExceptionTest(llvm::Module &module, llvm::IRBuilder<> &builder,
+                          llvm::legacy::FunctionPassManager &fpm,
+                          std::string nativeThrowFunctName) {
   // Number of type infos to generate
   unsigned numTypeInfos = 6;
 
@@ -1961,17 +1956,17 @@ int main(int argc, char *argv[]) {
       llvm::make_unique<llvm::Module>("my cool jit", context);
   llvm::Module *module = Owner.get();
 
-  llvm::RTDyldMemoryManager *MemMgr = new llvm::SectionMemoryManager();
+  std::unique_ptr<llvm::RTDyldMemoryManager> MemMgr(new llvm::SectionMemoryManager());
 
   // Build engine with JIT
   llvm::EngineBuilder factory(std::move(Owner));
   factory.setEngineKind(llvm::EngineKind::JIT);
   factory.setTargetOptions(Opts);
-  factory.setMCJITMemoryManager(MemMgr);
+  factory.setMCJITMemoryManager(std::move(MemMgr));
   llvm::ExecutionEngine *executionEngine = factory.create();
 
   {
-    llvm::FunctionPassManager fpm(module);
+    llvm::legacy::FunctionPassManager fpm(module);
 
     // Set up the optimizer pipeline.
     // Start with registering info about how the
diff --git a/examples/Kaleidoscope/CMakeLists.txt b/examples/Kaleidoscope/CMakeLists.txt
index 8c87ac5..32664aa 100644
--- a/examples/Kaleidoscope/CMakeLists.txt
+++ b/examples/Kaleidoscope/CMakeLists.txt
@@ -1,6 +1,16 @@
+add_custom_target(Kaleidoscope)
+set_target_properties(Kaleidoscope PROPERTIES FOLDER Examples)
+
+macro(add_kaleidoscope_chapter name)
+  add_dependencies(Kaleidoscope ${name})
+  add_llvm_example(${name} ${ARGN})
+endmacro(add_kaleidoscope_chapter name)
+
 add_subdirectory(Chapter2)
 add_subdirectory(Chapter3)
 add_subdirectory(Chapter4)
 add_subdirectory(Chapter5)
 add_subdirectory(Chapter6)
 add_subdirectory(Chapter7)
+add_subdirectory(Chapter8)
+add_subdirectory(Orc)
diff --git a/examples/Kaleidoscope/Chapter2/CMakeLists.txt b/examples/Kaleidoscope/Chapter2/CMakeLists.txt
index 79f2b17..fed3f4b 100644
--- a/examples/Kaleidoscope/Chapter2/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter2/CMakeLists.txt
@@ -1,3 +1,3 @@
-add_llvm_example(Kaleidoscope-Ch2
+add_kaleidoscope_chapter(Kaleidoscope-Ch2
   toy.cpp
   )
diff --git a/examples/Kaleidoscope/Chapter3/CMakeLists.txt b/examples/Kaleidoscope/Chapter3/CMakeLists.txt
index a98d7df..8053c96 100644
--- a/examples/Kaleidoscope/Chapter3/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter3/CMakeLists.txt
@@ -3,6 +3,6 @@ set(LLVM_LINK_COMPONENTS
   Support
   )
 
-add_llvm_example(Kaleidoscope-Ch3
+add_kaleidoscope_chapter(Kaleidoscope-Ch3
   toy.cpp
   )
diff --git a/examples/Kaleidoscope/Chapter4/CMakeLists.txt b/examples/Kaleidoscope/Chapter4/CMakeLists.txt
index 2f828dc..2c01e12 100644
--- a/examples/Kaleidoscope/Chapter4/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter4/CMakeLists.txt
@@ -3,12 +3,14 @@ set(LLVM_LINK_COMPONENTS
   Core
   ExecutionEngine
   InstCombine
-  MC
+  MCJIT
+  RuntimeDyld
   ScalarOpts
   Support
-  nativecodegen
+  TransformUtils
+  native
   )
 
-add_llvm_example(Kaleidoscope-Ch4
+add_kaleidoscope_chapter(Kaleidoscope-Ch4
   toy.cpp
   )
diff --git a/examples/Kaleidoscope/Chapter4/toy.cpp b/examples/Kaleidoscope/Chapter4/toy.cpp
index 3564d75..70fe57f 100644
--- a/examples/Kaleidoscope/Chapter4/toy.cpp
+++ b/examples/Kaleidoscope/Chapter4/toy.cpp
@@ -1,12 +1,14 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/MCJIT.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
 #include <cctype>
@@ -26,14 +28,16 @@ enum Token {
   tok_eof = -1,
 
   // commands
-  tok_def = -2, tok_extern = -3,
+  tok_def = -2,
+  tok_extern = -3,
 
   // primary
-  tok_identifier = -4, tok_number = -5
+  tok_identifier = -4,
+  tok_number = -5
 };
 
-static std::string IdentifierStr;  // Filled in if tok_identifier
-static double NumVal;              // Filled in if tok_number
+static std::string IdentifierStr; // Filled in if tok_identifier
+static double NumVal;             // Filled in if tok_number
 
 /// gettok - Return the next token from standard input.
 static int gettok() {
@@ -48,12 +52,14 @@ static int gettok() {
     while (isalnum((LastChar = getchar())))
       IdentifierStr += LastChar;
 
-    if (IdentifierStr == "def") return tok_def;
-    if (IdentifierStr == "extern") return tok_extern;
+    if (IdentifierStr == "def")
+      return tok_def;
+    if (IdentifierStr == "extern")
+      return tok_extern;
     return tok_identifier;
   }
 
-  if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
+  if (isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+
     std::string NumStr;
     do {
       NumStr += LastChar;
@@ -66,13 +72,14 @@ static int gettok() {
 
   if (LastChar == '#') {
     // Comment until end of line.
-    do LastChar = getchar();
+    do
+      LastChar = getchar();
     while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
-    
+
     if (LastChar != EOF)
       return gettok();
   }
-  
+
   // Check for end of file.  Don't eat the EOF.
   if (LastChar == EOF)
     return tok_eof;
@@ -97,6 +104,7 @@ public:
 /// NumberExprAST - Expression class for numeric literals like "1.0".
 class NumberExprAST : public ExprAST {
   double Val;
+
 public:
   NumberExprAST(double val) : Val(val) {}
   virtual Value *Codegen();
@@ -105,6 +113,7 @@ public:
 /// VariableExprAST - Expression class for referencing a variable, like "a".
 class VariableExprAST : public ExprAST {
   std::string Name;
+
 public:
   VariableExprAST(const std::string &name) : Name(name) {}
   virtual Value *Codegen();
@@ -114,19 +123,21 @@ public:
 class BinaryExprAST : public ExprAST {
   char Op;
   ExprAST *LHS, *RHS;
+
 public:
-  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs) 
-    : Op(op), LHS(lhs), RHS(rhs) {}
+  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs)
+      : Op(op), LHS(lhs), RHS(rhs) {}
   virtual Value *Codegen();
 };
 
 /// CallExprAST - Expression class for function calls.
 class CallExprAST : public ExprAST {
   std::string Callee;
-  std::vector<ExprAST*> Args;
+  std::vector<ExprAST *> Args;
+
 public:
-  CallExprAST(const std::string &callee, std::vector<ExprAST*> &args)
-    : Callee(callee), Args(args) {}
+  CallExprAST(const std::string &callee, std::vector<ExprAST *> &args)
+      : Callee(callee), Args(args) {}
   virtual Value *Codegen();
 };
 
@@ -136,10 +147,11 @@ public:
 class PrototypeAST {
   std::string Name;
   std::vector<std::string> Args;
+
 public:
   PrototypeAST(const std::string &name, const std::vector<std::string> &args)
-    : Name(name), Args(args) {}
-  
+      : Name(name), Args(args) {}
+
   Function *Codegen();
 };
 
@@ -147,10 +159,10 @@ public:
 class FunctionAST {
   PrototypeAST *Proto;
   ExprAST *Body;
+
 public:
-  FunctionAST(PrototypeAST *proto, ExprAST *body)
-    : Proto(proto), Body(body) {}
-  
+  FunctionAST(PrototypeAST *proto, ExprAST *body) : Proto(proto), Body(body) {}
+
   Function *Codegen();
 };
 } // end anonymous namespace
@@ -163,9 +175,7 @@ public:
 /// token the parser is looking at.  getNextToken reads another token from the
 /// lexer and updates CurTok with its results.
 static int CurTok;
-static int getNextToken() {
-  return CurTok = gettok();
-}
+static int getNextToken() { return CurTok = gettok(); }
 
 /// BinopPrecedence - This holds the precedence for each binary operator that is
 /// defined.
@@ -175,17 +185,27 @@ static std::map<char, int> BinopPrecedence;
 static int GetTokPrecedence() {
   if (!isascii(CurTok))
     return -1;
-  
+
   // Make sure it's a declared binop.
   int TokPrec = BinopPrecedence[CurTok];
-  if (TokPrec <= 0) return -1;
+  if (TokPrec <= 0)
+    return -1;
   return TokPrec;
 }
 
 /// Error* - These are little helper functions for error handling.
-ExprAST *Error(const char *Str) { fprintf(stderr, "Error: %s\n", Str);return 0;}
-PrototypeAST *ErrorP(const char *Str) { Error(Str); return 0; }
-FunctionAST *ErrorF(const char *Str) { Error(Str); return 0; }
+ExprAST *Error(const char *Str) {
+  fprintf(stderr, "Error: %s\n", Str);
+  return 0;
+}
+PrototypeAST *ErrorP(const char *Str) {
+  Error(Str);
+  return 0;
+}
+FunctionAST *ErrorF(const char *Str) {
+  Error(Str);
+  return 0;
+}
 
 static ExprAST *ParseExpression();
 
@@ -194,22 +214,24 @@ static ExprAST *ParseExpression();
 ///   ::= identifier '(' expression* ')'
 static ExprAST *ParseIdentifierExpr() {
   std::string IdName = IdentifierStr;
-  
-  getNextToken();  // eat identifier.
-  
+
+  getNextToken(); // eat identifier.
+
   if (CurTok != '(') // Simple variable ref.
     return new VariableExprAST(IdName);
-  
+
   // Call.
-  getNextToken();  // eat (
-  std::vector<ExprAST*> Args;
+  getNextToken(); // eat (
+  std::vector<ExprAST *> Args;
   if (CurTok != ')') {
     while (1) {
       ExprAST *Arg = ParseExpression();
-      if (!Arg) return 0;
+      if (!Arg)
+        return 0;
       Args.push_back(Arg);
 
-      if (CurTok == ')') break;
+      if (CurTok == ')')
+        break;
 
       if (CurTok != ',')
         return Error("Expected ')' or ',' in argument list");
@@ -219,7 +241,7 @@ static ExprAST *ParseIdentifierExpr() {
 
   // Eat the ')'.
   getNextToken();
-  
+
   return new CallExprAST(IdName, Args);
 }
 
@@ -232,13 +254,14 @@ static ExprAST *ParseNumberExpr() {
 
 /// parenexpr ::= '(' expression ')'
 static ExprAST *ParseParenExpr() {
-  getNextToken();  // eat (.
+  getNextToken(); // eat (.
   ExprAST *V = ParseExpression();
-  if (!V) return 0;
-  
+  if (!V)
+    return 0;
+
   if (CurTok != ')')
     return Error("expected ')'");
-  getNextToken();  // eat ).
+  getNextToken(); // eat ).
   return V;
 }
 
@@ -248,10 +271,14 @@ static ExprAST *ParseParenExpr() {
 ///   ::= parenexpr
 static ExprAST *ParsePrimary() {
   switch (CurTok) {
-  default: return Error("unknown token when expecting an expression");
-  case tok_identifier: return ParseIdentifierExpr();
-  case tok_number:     return ParseNumberExpr();
-  case '(':            return ParseParenExpr();
+  default:
+    return Error("unknown token when expecting an expression");
+  case tok_identifier:
+    return ParseIdentifierExpr();
+  case tok_number:
+    return ParseNumberExpr();
+  case '(':
+    return ParseParenExpr();
   }
 }
 
@@ -261,28 +288,30 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
   // If this is a binop, find its precedence.
   while (1) {
     int TokPrec = GetTokPrecedence();
-    
+
     // If this is a binop that binds at least as tightly as the current binop,
     // consume it, otherwise we are done.
     if (TokPrec < ExprPrec)
       return LHS;
-    
+
     // Okay, we know this is a binop.
     int BinOp = CurTok;
-    getNextToken();  // eat binop
-    
+    getNextToken(); // eat binop
+
     // Parse the primary expression after the binary operator.
     ExprAST *RHS = ParsePrimary();
-    if (!RHS) return 0;
-    
+    if (!RHS)
+      return 0;
+
     // If BinOp binds less tightly with RHS than the operator after RHS, let
     // the pending operator take RHS as its LHS.
     int NextPrec = GetTokPrecedence();
     if (TokPrec < NextPrec) {
-      RHS = ParseBinOpRHS(TokPrec+1, RHS);
-      if (RHS == 0) return 0;
+      RHS = ParseBinOpRHS(TokPrec + 1, RHS);
+      if (RHS == 0)
+        return 0;
     }
-    
+
     // Merge LHS/RHS.
     LHS = new BinaryExprAST(BinOp, LHS, RHS);
   }
@@ -293,8 +322,9 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
 ///
 static ExprAST *ParseExpression() {
   ExprAST *LHS = ParsePrimary();
-  if (!LHS) return 0;
-  
+  if (!LHS)
+    return 0;
+
   return ParseBinOpRHS(0, LHS);
 }
 
@@ -306,27 +336,28 @@ static PrototypeAST *ParsePrototype() {
 
   std::string FnName = IdentifierStr;
   getNextToken();
-  
+
   if (CurTok != '(')
     return ErrorP("Expected '(' in prototype");
-  
+
   std::vector<std::string> ArgNames;
   while (getNextToken() == tok_identifier)
     ArgNames.push_back(IdentifierStr);
   if (CurTok != ')')
     return ErrorP("Expected ')' in prototype");
-  
+
   // success.
-  getNextToken();  // eat ')'.
-  
+  getNextToken(); // eat ')'.
+
   return new PrototypeAST(FnName, ArgNames);
 }
 
 /// definition ::= 'def' prototype expression
 static FunctionAST *ParseDefinition() {
-  getNextToken();  // eat def.
+  getNextToken(); // eat def.
   PrototypeAST *Proto = ParsePrototype();
-  if (Proto == 0) return 0;
+  if (Proto == 0)
+    return 0;
 
   if (ExprAST *E = ParseExpression())
     return new FunctionAST(Proto, E);
@@ -345,20 +376,258 @@ static FunctionAST *ParseTopLevelExpr() {
 
 /// external ::= 'extern' prototype
 static PrototypeAST *ParseExtern() {
-  getNextToken();  // eat extern.
+  getNextToken(); // eat extern.
   return ParsePrototype();
 }
 
 //===----------------------------------------------------------------------===//
+// Quick and dirty hack
+//===----------------------------------------------------------------------===//
+
+// FIXME: Obviously we can do better than this
+std::string GenerateUniqueName(const char *root) {
+  static int i = 0;
+  char s[16];
+  sprintf(s, "%s%d", root, i++);
+  std::string S = s;
+  return S;
+}
+
+std::string MakeLegalFunctionName(std::string Name) {
+  std::string NewName;
+  if (!Name.length())
+    return GenerateUniqueName("anon_func_");
+
+  // Start with what we have
+  NewName = Name;
+
+  // Look for a numberic first character
+  if (NewName.find_first_of("0123456789") == 0) {
+    NewName.insert(0, 1, 'n');
+  }
+
+  // Replace illegal characters with their ASCII equivalent
+  std::string legal_elements =
+      "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+  size_t pos;
+  while ((pos = NewName.find_first_not_of(legal_elements)) !=
+         std::string::npos) {
+    char old_c = NewName.at(pos);
+    char new_str[16];
+    sprintf(new_str, "%d", (int)old_c);
+    NewName = NewName.replace(pos, 1, new_str);
+  }
+
+  return NewName;
+}
+
+//===----------------------------------------------------------------------===//
+// MCJIT helper class
+//===----------------------------------------------------------------------===//
+
+class MCJITHelper {
+public:
+  MCJITHelper(LLVMContext &C) : Context(C), OpenModule(NULL) {}
+  ~MCJITHelper();
+
+  Function *getFunction(const std::string FnName);
+  Module *getModuleForNewFunction();
+  void *getPointerToFunction(Function *F);
+  void *getSymbolAddress(const std::string &Name);
+  void dump();
+
+private:
+  typedef std::vector<Module *> ModuleVector;
+  typedef std::vector<ExecutionEngine *> EngineVector;
+
+  LLVMContext &Context;
+  Module *OpenModule;
+  ModuleVector Modules;
+  EngineVector Engines;
+};
+
+class HelpingMemoryManager : public SectionMemoryManager {
+  HelpingMemoryManager(const HelpingMemoryManager &) = delete;
+  void operator=(const HelpingMemoryManager &) = delete;
+
+public:
+  HelpingMemoryManager(MCJITHelper *Helper) : MasterHelper(Helper) {}
+  virtual ~HelpingMemoryManager() {}
+
+  /// This method returns the address of the specified symbol.
+  /// Our implementation will attempt to find symbols in other
+  /// modules associated with the MCJITHelper to cross link symbols
+  /// from one generated module to another.
+  virtual uint64_t getSymbolAddress(const std::string &Name) override;
+
+private:
+  MCJITHelper *MasterHelper;
+};
+
+uint64_t HelpingMemoryManager::getSymbolAddress(const std::string &Name) {
+  uint64_t FnAddr = SectionMemoryManager::getSymbolAddress(Name);
+  if (FnAddr)
+    return FnAddr;
+
+  uint64_t HelperFun = (uint64_t)MasterHelper->getSymbolAddress(Name);
+  if (!HelperFun)
+    report_fatal_error("Program used extern function '" + Name +
+                       "' which could not be resolved!");
+
+  return HelperFun;
+}
+
+MCJITHelper::~MCJITHelper() {
+  if (OpenModule)
+    delete OpenModule;
+  EngineVector::iterator begin = Engines.begin();
+  EngineVector::iterator end = Engines.end();
+  EngineVector::iterator it;
+  for (it = begin; it != end; ++it)
+    delete *it;
+}
+
+Function *MCJITHelper::getFunction(const std::string FnName) {
+  ModuleVector::iterator begin = Modules.begin();
+  ModuleVector::iterator end = Modules.end();
+  ModuleVector::iterator it;
+  for (it = begin; it != end; ++it) {
+    Function *F = (*it)->getFunction(FnName);
+    if (F) {
+      if (*it == OpenModule)
+        return F;
+
+      assert(OpenModule != NULL);
+
+      // This function is in a module that has already been JITed.
+      // We need to generate a new prototype for external linkage.
+      Function *PF = OpenModule->getFunction(FnName);
+      if (PF && !PF->empty()) {
+        ErrorF("redefinition of function across modules");
+        return 0;
+      }
+
+      // If we don't have a prototype yet, create one.
+      if (!PF)
+        PF = Function::Create(F->getFunctionType(), Function::ExternalLinkage,
+                              FnName, OpenModule);
+      return PF;
+    }
+  }
+  return NULL;
+}
+
+Module *MCJITHelper::getModuleForNewFunction() {
+  // If we have a Module that hasn't been JITed, use that.
+  if (OpenModule)
+    return OpenModule;
+
+  // Otherwise create a new Module.
+  std::string ModName = GenerateUniqueName("mcjit_module_");
+  Module *M = new Module(ModName, Context);
+  Modules.push_back(M);
+  OpenModule = M;
+  return M;
+}
+
+void *MCJITHelper::getPointerToFunction(Function *F) {
+  // See if an existing instance of MCJIT has this function.
+  EngineVector::iterator begin = Engines.begin();
+  EngineVector::iterator end = Engines.end();
+  EngineVector::iterator it;
+  for (it = begin; it != end; ++it) {
+    void *P = (*it)->getPointerToFunction(F);
+    if (P)
+      return P;
+  }
+
+  // If we didn't find the function, see if we can generate it.
+  if (OpenModule) {
+    std::string ErrStr;
+    ExecutionEngine *NewEngine =
+        EngineBuilder(std::unique_ptr<Module>(OpenModule))
+            .setErrorStr(&ErrStr)
+            .setMCJITMemoryManager(std::unique_ptr<HelpingMemoryManager>(
+                new HelpingMemoryManager(this)))
+            .create();
+    if (!NewEngine) {
+      fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
+      exit(1);
+    }
+
+    // Create a function pass manager for this engine
+    auto *FPM = new legacy::FunctionPassManager(OpenModule);
+
+    // Set up the optimizer pipeline.  Start with registering info about how the
+    // target lays out data structures.
+    OpenModule->setDataLayout(NewEngine->getDataLayout());
+    FPM->add(new DataLayoutPass());
+    // Provide basic AliasAnalysis support for GVN.
+    FPM->add(createBasicAliasAnalysisPass());
+    // Promote allocas to registers.
+    FPM->add(createPromoteMemoryToRegisterPass());
+    // Do simple "peephole" optimizations and bit-twiddling optzns.
+    FPM->add(createInstructionCombiningPass());
+    // Reassociate expressions.
+    FPM->add(createReassociatePass());
+    // Eliminate Common SubExpressions.
+    FPM->add(createGVNPass());
+    // Simplify the control flow graph (deleting unreachable blocks, etc).
+    FPM->add(createCFGSimplificationPass());
+    FPM->doInitialization();
+
+    // For each function in the module
+    Module::iterator it;
+    Module::iterator end = OpenModule->end();
+    for (it = OpenModule->begin(); it != end; ++it) {
+      // Run the FPM on this function
+      FPM->run(*it);
+    }
+
+    // We don't need this anymore
+    delete FPM;
+
+    OpenModule = NULL;
+    Engines.push_back(NewEngine);
+    NewEngine->finalizeObject();
+    return NewEngine->getPointerToFunction(F);
+  }
+  return NULL;
+}
+
+void *MCJITHelper::getSymbolAddress(const std::string &Name) {
+  // Look for the symbol in each of our execution engines.
+  EngineVector::iterator begin = Engines.begin();
+  EngineVector::iterator end = Engines.end();
+  EngineVector::iterator it;
+  for (it = begin; it != end; ++it) {
+    uint64_t FAddr = (*it)->getFunctionAddress(Name);
+    if (FAddr) {
+      return (void *)FAddr;
+    }
+  }
+  return NULL;
+}
+
+void MCJITHelper::dump() {
+  ModuleVector::iterator begin = Modules.begin();
+  ModuleVector::iterator end = Modules.end();
+  ModuleVector::iterator it;
+  for (it = begin; it != end; ++it)
+    (*it)->dump();
+}
+//===----------------------------------------------------------------------===//
 // Code Generation
 //===----------------------------------------------------------------------===//
 
-static Module *TheModule;
+static MCJITHelper *JITHelper;
 static IRBuilder<> Builder(getGlobalContext());
-static std::map<std::string, Value*> NamedValues;
-static FunctionPassManager *TheFPM;
+static std::map<std::string, Value *> NamedValues;
 
-Value *ErrorV(const char *Str) { Error(Str); return 0; }
+Value *ErrorV(const char *Str) {
+  Error(Str);
+  return 0;
+}
 
 Value *NumberExprAST::Codegen() {
   return ConstantFP::get(getGlobalContext(), APFloat(Val));
@@ -373,93 +642,102 @@ Value *VariableExprAST::Codegen() {
 Value *BinaryExprAST::Codegen() {
   Value *L = LHS->Codegen();
   Value *R = RHS->Codegen();
-  if (L == 0 || R == 0) return 0;
-  
+  if (L == 0 || R == 0)
+    return 0;
+
   switch (Op) {
-  case '+': return Builder.CreateFAdd(L, R, "addtmp");
-  case '-': return Builder.CreateFSub(L, R, "subtmp");
-  case '*': return Builder.CreateFMul(L, R, "multmp");
+  case '+':
+    return Builder.CreateFAdd(L, R, "addtmp");
+  case '-':
+    return Builder.CreateFSub(L, R, "subtmp");
+  case '*':
+    return Builder.CreateFMul(L, R, "multmp");
   case '<':
     L = Builder.CreateFCmpULT(L, R, "cmptmp");
     // Convert bool 0/1 to double 0.0 or 1.0
     return Builder.CreateUIToFP(L, Type::getDoubleTy(getGlobalContext()),
                                 "booltmp");
-  default: return ErrorV("invalid binary operator");
+  default:
+    return ErrorV("invalid binary operator");
   }
 }
 
 Value *CallExprAST::Codegen() {
   // Look up the name in the global module table.
-  Function *CalleeF = TheModule->getFunction(Callee);
+  Function *CalleeF = JITHelper->getFunction(Callee);
   if (CalleeF == 0)
     return ErrorV("Unknown function referenced");
-  
+
   // If argument mismatch error.
   if (CalleeF->arg_size() != Args.size())
     return ErrorV("Incorrect # arguments passed");
 
-  std::vector<Value*> ArgsV;
+  std::vector<Value *> ArgsV;
   for (unsigned i = 0, e = Args.size(); i != e; ++i) {
     ArgsV.push_back(Args[i]->Codegen());
-    if (ArgsV.back() == 0) return 0;
+    if (ArgsV.back() == 0)
+      return 0;
   }
-  
+
   return Builder.CreateCall(CalleeF, ArgsV, "calltmp");
 }
 
 Function *PrototypeAST::Codegen() {
   // Make the function type:  double(double,double) etc.
-  std::vector<Type*> Doubles(Args.size(),
-                             Type::getDoubleTy(getGlobalContext()));
-  FunctionType *FT = FunctionType::get(Type::getDoubleTy(getGlobalContext()),
-                                       Doubles, false);
-  
-  Function *F = Function::Create(FT, Function::ExternalLinkage, Name, TheModule);
-  
+  std::vector<Type *> Doubles(Args.size(),
+                              Type::getDoubleTy(getGlobalContext()));
+  FunctionType *FT =
+      FunctionType::get(Type::getDoubleTy(getGlobalContext()), Doubles, false);
+
+  std::string FnName = MakeLegalFunctionName(Name);
+
+  Module *M = JITHelper->getModuleForNewFunction();
+
+  Function *F = Function::Create(FT, Function::ExternalLinkage, FnName, M);
+
   // If F conflicted, there was already something named 'Name'.  If it has a
   // body, don't allow redefinition or reextern.
-  if (F->getName() != Name) {
+  if (F->getName() != FnName) {
     // Delete the one we just made and get the existing one.
     F->eraseFromParent();
-    F = TheModule->getFunction(Name);
-    
+    F = JITHelper->getFunction(Name);
     // If F already has a body, reject this.
     if (!F->empty()) {
       ErrorF("redefinition of function");
       return 0;
     }
-    
+
     // If F took a different number of args, reject.
     if (F->arg_size() != Args.size()) {
       ErrorF("redefinition of function with different # args");
       return 0;
     }
   }
-  
+
   // Set names for all arguments.
   unsigned Idx = 0;
   for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
        ++AI, ++Idx) {
     AI->setName(Args[Idx]);
-    
+
     // Add arguments to variable symbol table.
     NamedValues[Args[Idx]] = AI;
   }
-  
+
   return F;
 }
 
 Function *FunctionAST::Codegen() {
   NamedValues.clear();
-  
+
   Function *TheFunction = Proto->Codegen();
   if (TheFunction == 0)
     return 0;
-  
+
   // Create a new basic block to start insertion into.
   BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
   Builder.SetInsertPoint(BB);
-  
+
   if (Value *RetVal = Body->Codegen()) {
     // Finish off the function.
     Builder.CreateRet(RetVal);
@@ -467,12 +745,9 @@ Function *FunctionAST::Codegen() {
     // Validate the generated code, checking for consistency.
     verifyFunction(*TheFunction);
 
-    // Optimize the function.
-    TheFPM->run(*TheFunction);
-    
     return TheFunction;
   }
-  
+
   // Error reading body, remove function.
   TheFunction->eraseFromParent();
   return 0;
@@ -482,8 +757,6 @@ Function *FunctionAST::Codegen() {
 // Top-Level parsing and JIT Driver
 //===----------------------------------------------------------------------===//
 
-static ExecutionEngine *TheExecutionEngine;
-
 static void HandleDefinition() {
   if (FunctionAST *F = ParseDefinition()) {
     if (Function *LF = F->Codegen()) {
@@ -513,8 +786,8 @@ static void HandleTopLevelExpression() {
   if (FunctionAST *F = ParseTopLevelExpr()) {
     if (Function *LF = F->Codegen()) {
       // JIT the function, returning a function pointer.
-      void *FPtr = TheExecutionEngine->getPointerToFunction(LF);
-      
+      void *FPtr = JITHelper->getPointerToFunction(LF);
+
       // Cast it to the right type (takes no arguments, returns a double) so we
       // can call it as a native function.
       double (*FP)() = (double (*)())(intptr_t)FPtr;
@@ -531,11 +804,20 @@ static void MainLoop() {
   while (1) {
     fprintf(stderr, "ready> ");
     switch (CurTok) {
-    case tok_eof:    return;
-    case ';':        getNextToken(); break;  // ignore top-level semicolons.
-    case tok_def:    HandleDefinition(); break;
-    case tok_extern: HandleExtern(); break;
-    default:         HandleTopLevelExpression(); break;
+    case tok_eof:
+      return;
+    case ';':
+      getNextToken();
+      break; // ignore top-level semicolons.
+    case tok_def:
+      HandleDefinition();
+      break;
+    case tok_extern:
+      HandleExtern();
+      break;
+    default:
+      HandleTopLevelExpression();
+      break;
     }
   }
 }
@@ -545,8 +827,7 @@ static void MainLoop() {
 //===----------------------------------------------------------------------===//
 
 /// putchard - putchar that takes a double and returns 0.
-extern "C" 
-double putchard(double X) {
+extern "C" double putchard(double X) {
   putchar((char)X);
   return 0;
 }
@@ -557,61 +838,27 @@ double putchard(double X) {
 
 int main() {
   InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  InitializeNativeTargetAsmParser();
   LLVMContext &Context = getGlobalContext();
+  JITHelper = new MCJITHelper(Context);
 
   // Install standard binary operators.
   // 1 is lowest precedence.
   BinopPrecedence['<'] = 10;
   BinopPrecedence['+'] = 20;
   BinopPrecedence['-'] = 20;
-  BinopPrecedence['*'] = 40;  // highest.
+  BinopPrecedence['*'] = 40; // highest.
 
   // Prime the first token.
   fprintf(stderr, "ready> ");
   getNextToken();
 
-  // Make the module, which holds all the code.
-  std::unique_ptr<Module> Owner = make_unique<Module>("my cool jit", Context);
-  TheModule = Owner.get();
-
-  // Create the JIT.  This takes ownership of the module.
-  std::string ErrStr;
-  TheExecutionEngine =
-      EngineBuilder(std::move(Owner)).setErrorStr(&ErrStr).create();
-  if (!TheExecutionEngine) {
-    fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
-    exit(1);
-  }
-
-  FunctionPassManager OurFPM(TheModule);
-
-  // Set up the optimizer pipeline.  Start with registering info about how the
-  // target lays out data structures.
-  TheModule->setDataLayout(TheExecutionEngine->getDataLayout());
-  OurFPM.add(new DataLayoutPass());
-  // Provide basic AliasAnalysis support for GVN.
-  OurFPM.add(createBasicAliasAnalysisPass());
-  // Do simple "peephole" optimizations and bit-twiddling optzns.
-  OurFPM.add(createInstructionCombiningPass());
-  // Reassociate expressions.
-  OurFPM.add(createReassociatePass());
-  // Eliminate Common SubExpressions.
-  OurFPM.add(createGVNPass());
-  // Simplify the control flow graph (deleting unreachable blocks, etc).
-  OurFPM.add(createCFGSimplificationPass());
-
-  OurFPM.doInitialization();
-
-  // Set the global so the code gen can use this.
-  TheFPM = &OurFPM;
-
   // Run the main "interpreter loop" now.
   MainLoop();
 
-  TheFPM = 0;
-
   // Print out all of the generated code.
-  TheModule->dump();
+  JITHelper->dump();
 
   return 0;
 }
diff --git a/examples/Kaleidoscope/Chapter5/CMakeLists.txt b/examples/Kaleidoscope/Chapter5/CMakeLists.txt
index 1912ddc..aac9949 100644
--- a/examples/Kaleidoscope/Chapter5/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter5/CMakeLists.txt
@@ -3,12 +3,12 @@ set(LLVM_LINK_COMPONENTS
   Core
   ExecutionEngine
   InstCombine
-  MC
+  MCJIT
   ScalarOpts
   Support
-  nativecodegen
+  native
   )
 
-add_llvm_example(Kaleidoscope-Ch5
+add_kaleidoscope_chapter(Kaleidoscope-Ch5
   toy.cpp
   )
diff --git a/examples/Kaleidoscope/Chapter5/toy.cpp b/examples/Kaleidoscope/Chapter5/toy.cpp
index 4929a20..728a2f5 100644
--- a/examples/Kaleidoscope/Chapter5/toy.cpp
+++ b/examples/Kaleidoscope/Chapter5/toy.cpp
@@ -1,12 +1,14 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/MCJIT.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
 #include <cctype>
@@ -26,18 +28,23 @@ enum Token {
   tok_eof = -1,
 
   // commands
-  tok_def = -2, tok_extern = -3,
+  tok_def = -2,
+  tok_extern = -3,
 
   // primary
-  tok_identifier = -4, tok_number = -5,
-  
+  tok_identifier = -4,
+  tok_number = -5,
+
   // control
-  tok_if = -6, tok_then = -7, tok_else = -8,
-  tok_for = -9, tok_in = -10
+  tok_if = -6,
+  tok_then = -7,
+  tok_else = -8,
+  tok_for = -9,
+  tok_in = -10
 };
 
-static std::string IdentifierStr;  // Filled in if tok_identifier
-static double NumVal;              // Filled in if tok_number
+static std::string IdentifierStr; // Filled in if tok_identifier
+static double NumVal;             // Filled in if tok_number
 
 /// gettok - Return the next token from standard input.
 static int gettok() {
@@ -52,17 +59,24 @@ static int gettok() {
     while (isalnum((LastChar = getchar())))
       IdentifierStr += LastChar;
 
-    if (IdentifierStr == "def") return tok_def;
-    if (IdentifierStr == "extern") return tok_extern;
-    if (IdentifierStr == "if") return tok_if;
-    if (IdentifierStr == "then") return tok_then;
-    if (IdentifierStr == "else") return tok_else;
-    if (IdentifierStr == "for") return tok_for;
-    if (IdentifierStr == "in") return tok_in;
+    if (IdentifierStr == "def")
+      return tok_def;
+    if (IdentifierStr == "extern")
+      return tok_extern;
+    if (IdentifierStr == "if")
+      return tok_if;
+    if (IdentifierStr == "then")
+      return tok_then;
+    if (IdentifierStr == "else")
+      return tok_else;
+    if (IdentifierStr == "for")
+      return tok_for;
+    if (IdentifierStr == "in")
+      return tok_in;
     return tok_identifier;
   }
 
-  if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
+  if (isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+
     std::string NumStr;
     do {
       NumStr += LastChar;
@@ -75,13 +89,14 @@ static int gettok() {
 
   if (LastChar == '#') {
     // Comment until end of line.
-    do LastChar = getchar();
+    do
+      LastChar = getchar();
     while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
-    
+
     if (LastChar != EOF)
       return gettok();
   }
-  
+
   // Check for end of file.  Don't eat the EOF.
   if (LastChar == EOF)
     return tok_eof;
@@ -106,6 +121,7 @@ public:
 /// NumberExprAST - Expression class for numeric literals like "1.0".
 class NumberExprAST : public ExprAST {
   double Val;
+
 public:
   NumberExprAST(double val) : Val(val) {}
   virtual Value *Codegen();
@@ -114,6 +130,7 @@ public:
 /// VariableExprAST - Expression class for referencing a variable, like "a".
 class VariableExprAST : public ExprAST {
   std::string Name;
+
 public:
   VariableExprAST(const std::string &name) : Name(name) {}
   virtual Value *Codegen();
@@ -123,28 +140,31 @@ public:
 class BinaryExprAST : public ExprAST {
   char Op;
   ExprAST *LHS, *RHS;
+
 public:
-  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs) 
-    : Op(op), LHS(lhs), RHS(rhs) {}
+  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs)
+      : Op(op), LHS(lhs), RHS(rhs) {}
   virtual Value *Codegen();
 };
 
 /// CallExprAST - Expression class for function calls.
 class CallExprAST : public ExprAST {
   std::string Callee;
-  std::vector<ExprAST*> Args;
+  std::vector<ExprAST *> Args;
+
 public:
-  CallExprAST(const std::string &callee, std::vector<ExprAST*> &args)
-    : Callee(callee), Args(args) {}
+  CallExprAST(const std::string &callee, std::vector<ExprAST *> &args)
+      : Callee(callee), Args(args) {}
   virtual Value *Codegen();
 };
 
 /// IfExprAST - Expression class for if/then/else.
 class IfExprAST : public ExprAST {
   ExprAST *Cond, *Then, *Else;
+
 public:
   IfExprAST(ExprAST *cond, ExprAST *then, ExprAST *_else)
-  : Cond(cond), Then(then), Else(_else) {}
+      : Cond(cond), Then(then), Else(_else) {}
   virtual Value *Codegen();
 };
 
@@ -152,10 +172,11 @@ public:
 class ForExprAST : public ExprAST {
   std::string VarName;
   ExprAST *Start, *End, *Step, *Body;
+
 public:
   ForExprAST(const std::string &varname, ExprAST *start, ExprAST *end,
              ExprAST *step, ExprAST *body)
-    : VarName(varname), Start(start), End(end), Step(step), Body(body) {}
+      : VarName(varname), Start(start), End(end), Step(step), Body(body) {}
   virtual Value *Codegen();
 };
 
@@ -165,10 +186,11 @@ public:
 class PrototypeAST {
   std::string Name;
   std::vector<std::string> Args;
+
 public:
   PrototypeAST(const std::string &name, const std::vector<std::string> &args)
-    : Name(name), Args(args) {}
-  
+      : Name(name), Args(args) {}
+
   Function *Codegen();
 };
 
@@ -176,10 +198,10 @@ public:
 class FunctionAST {
   PrototypeAST *Proto;
   ExprAST *Body;
+
 public:
-  FunctionAST(PrototypeAST *proto, ExprAST *body)
-    : Proto(proto), Body(body) {}
-  
+  FunctionAST(PrototypeAST *proto, ExprAST *body) : Proto(proto), Body(body) {}
+
   Function *Codegen();
 };
 } // end anonymous namespace
@@ -192,9 +214,7 @@ public:
 /// token the parser is looking at.  getNextToken reads another token from the
 /// lexer and updates CurTok with its results.
 static int CurTok;
-static int getNextToken() {
-  return CurTok = gettok();
-}
+static int getNextToken() { return CurTok = gettok(); }
 
 /// BinopPrecedence - This holds the precedence for each binary operator that is
 /// defined.
@@ -204,17 +224,27 @@ static std::map<char, int> BinopPrecedence;
 static int GetTokPrecedence() {
   if (!isascii(CurTok))
     return -1;
-  
+
   // Make sure it's a declared binop.
   int TokPrec = BinopPrecedence[CurTok];
-  if (TokPrec <= 0) return -1;
+  if (TokPrec <= 0)
+    return -1;
   return TokPrec;
 }
 
 /// Error* - These are little helper functions for error handling.
-ExprAST *Error(const char *Str) { fprintf(stderr, "Error: %s\n", Str);return 0;}
-PrototypeAST *ErrorP(const char *Str) { Error(Str); return 0; }
-FunctionAST *ErrorF(const char *Str) { Error(Str); return 0; }
+ExprAST *Error(const char *Str) {
+  fprintf(stderr, "Error: %s\n", Str);
+  return 0;
+}
+PrototypeAST *ErrorP(const char *Str) {
+  Error(Str);
+  return 0;
+}
+FunctionAST *ErrorF(const char *Str) {
+  Error(Str);
+  return 0;
+}
 
 static ExprAST *ParseExpression();
 
@@ -223,22 +253,24 @@ static ExprAST *ParseExpression();
 ///   ::= identifier '(' expression* ')'
 static ExprAST *ParseIdentifierExpr() {
   std::string IdName = IdentifierStr;
-  
-  getNextToken();  // eat identifier.
-  
+
+  getNextToken(); // eat identifier.
+
   if (CurTok != '(') // Simple variable ref.
     return new VariableExprAST(IdName);
-  
+
   // Call.
-  getNextToken();  // eat (
-  std::vector<ExprAST*> Args;
+  getNextToken(); // eat (
+  std::vector<ExprAST *> Args;
   if (CurTok != ')') {
     while (1) {
       ExprAST *Arg = ParseExpression();
-      if (!Arg) return 0;
+      if (!Arg)
+        return 0;
       Args.push_back(Arg);
 
-      if (CurTok == ')') break;
+      if (CurTok == ')')
+        break;
 
       if (CurTok != ',')
         return Error("Expected ')' or ',' in argument list");
@@ -248,7 +280,7 @@ static ExprAST *ParseIdentifierExpr() {
 
   // Eat the ')'.
   getNextToken();
-  
+
   return new CallExprAST(IdName, Args);
 }
 
@@ -261,80 +293,87 @@ static ExprAST *ParseNumberExpr() {
 
 /// parenexpr ::= '(' expression ')'
 static ExprAST *ParseParenExpr() {
-  getNextToken();  // eat (.
+  getNextToken(); // eat (.
   ExprAST *V = ParseExpression();
-  if (!V) return 0;
-  
+  if (!V)
+    return 0;
+
   if (CurTok != ')')
     return Error("expected ')'");
-  getNextToken();  // eat ).
+  getNextToken(); // eat ).
   return V;
 }
 
 /// ifexpr ::= 'if' expression 'then' expression 'else' expression
 static ExprAST *ParseIfExpr() {
-  getNextToken();  // eat the if.
-  
+  getNextToken(); // eat the if.
+
   // condition.
   ExprAST *Cond = ParseExpression();
-  if (!Cond) return 0;
-  
+  if (!Cond)
+    return 0;
+
   if (CurTok != tok_then)
     return Error("expected then");
-  getNextToken();  // eat the then
-  
+  getNextToken(); // eat the then
+
   ExprAST *Then = ParseExpression();
-  if (Then == 0) return 0;
-  
+  if (Then == 0)
+    return 0;
+
   if (CurTok != tok_else)
     return Error("expected else");
-  
+
   getNextToken();
-  
+
   ExprAST *Else = ParseExpression();
-  if (!Else) return 0;
-  
+  if (!Else)
+    return 0;
+
   return new IfExprAST(Cond, Then, Else);
 }
 
 /// forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression
 static ExprAST *ParseForExpr() {
-  getNextToken();  // eat the for.
+  getNextToken(); // eat the for.
 
   if (CurTok != tok_identifier)
     return Error("expected identifier after for");
-  
+
   std::string IdName = IdentifierStr;
-  getNextToken();  // eat identifier.
-  
+  getNextToken(); // eat identifier.
+
   if (CurTok != '=')
     return Error("expected '=' after for");
-  getNextToken();  // eat '='.
-  
-  
+  getNextToken(); // eat '='.
+
   ExprAST *Start = ParseExpression();
-  if (Start == 0) return 0;
+  if (Start == 0)
+    return 0;
   if (CurTok != ',')
     return Error("expected ',' after for start value");
   getNextToken();
-  
+
   ExprAST *End = ParseExpression();
-  if (End == 0) return 0;
-  
+  if (End == 0)
+    return 0;
+
   // The step value is optional.
   ExprAST *Step = 0;
   if (CurTok == ',') {
     getNextToken();
     Step = ParseExpression();
-    if (Step == 0) return 0;
+    if (Step == 0)
+      return 0;
   }
-  
+
   if (CurTok != tok_in)
     return Error("expected 'in' after for");
-  getNextToken();  // eat 'in'.
-  
+  getNextToken(); // eat 'in'.
+
   ExprAST *Body = ParseExpression();
-  if (Body == 0) return 0;
+  if (Body == 0)
+    return 0;
 
   return new ForExprAST(IdName, Start, End, Step, Body);
 }
@@ -347,12 +386,18 @@ static ExprAST *ParseForExpr() {
 ///   ::= forexpr
 static ExprAST *ParsePrimary() {
   switch (CurTok) {
-  default: return Error("unknown token when expecting an expression");
-  case tok_identifier: return ParseIdentifierExpr();
-  case tok_number:     return ParseNumberExpr();
-  case '(':            return ParseParenExpr();
-  case tok_if:         return ParseIfExpr();
-  case tok_for:        return ParseForExpr();
+  default:
+    return Error("unknown token when expecting an expression");
+  case tok_identifier:
+    return ParseIdentifierExpr();
+  case tok_number:
+    return ParseNumberExpr();
+  case '(':
+    return ParseParenExpr();
+  case tok_if:
+    return ParseIfExpr();
+  case tok_for:
+    return ParseForExpr();
   }
 }
 
@@ -362,28 +407,30 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
   // If this is a binop, find its precedence.
   while (1) {
     int TokPrec = GetTokPrecedence();
-    
+
     // If this is a binop that binds at least as tightly as the current binop,
     // consume it, otherwise we are done.
     if (TokPrec < ExprPrec)
       return LHS;
-    
+
     // Okay, we know this is a binop.
     int BinOp = CurTok;
-    getNextToken();  // eat binop
-    
+    getNextToken(); // eat binop
+
     // Parse the primary expression after the binary operator.
     ExprAST *RHS = ParsePrimary();
-    if (!RHS) return 0;
-    
+    if (!RHS)
+      return 0;
+
     // If BinOp binds less tightly with RHS than the operator after RHS, let
     // the pending operator take RHS as its LHS.
     int NextPrec = GetTokPrecedence();
     if (TokPrec < NextPrec) {
-      RHS = ParseBinOpRHS(TokPrec+1, RHS);
-      if (RHS == 0) return 0;
+      RHS = ParseBinOpRHS(TokPrec + 1, RHS);
+      if (RHS == 0)
+        return 0;
     }
-    
+
     // Merge LHS/RHS.
     LHS = new BinaryExprAST(BinOp, LHS, RHS);
   }
@@ -394,8 +441,9 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
 ///
 static ExprAST *ParseExpression() {
   ExprAST *LHS = ParsePrimary();
-  if (!LHS) return 0;
-  
+  if (!LHS)
+    return 0;
+
   return ParseBinOpRHS(0, LHS);
 }
 
@@ -407,27 +455,28 @@ static PrototypeAST *ParsePrototype() {
 
   std::string FnName = IdentifierStr;
   getNextToken();
-  
+
   if (CurTok != '(')
     return ErrorP("Expected '(' in prototype");
-  
+
   std::vector<std::string> ArgNames;
   while (getNextToken() == tok_identifier)
     ArgNames.push_back(IdentifierStr);
   if (CurTok != ')')
     return ErrorP("Expected ')' in prototype");
-  
+
   // success.
-  getNextToken();  // eat ')'.
-  
+  getNextToken(); // eat ')'.
+
   return new PrototypeAST(FnName, ArgNames);
 }
 
 /// definition ::= 'def' prototype expression
 static FunctionAST *ParseDefinition() {
-  getNextToken();  // eat def.
+  getNextToken(); // eat def.
   PrototypeAST *Proto = ParsePrototype();
-  if (Proto == 0) return 0;
+  if (Proto == 0)
+    return 0;
 
   if (ExprAST *E = ParseExpression())
     return new FunctionAST(Proto, E);
@@ -446,7 +495,7 @@ static FunctionAST *ParseTopLevelExpr() {
 
 /// external ::= 'extern' prototype
 static PrototypeAST *ParseExtern() {
-  getNextToken();  // eat extern.
+  getNextToken(); // eat extern.
   return ParsePrototype();
 }
 
@@ -456,10 +505,13 @@ static PrototypeAST *ParseExtern() {
 
 static Module *TheModule;
 static IRBuilder<> Builder(getGlobalContext());
-static std::map<std::string, Value*> NamedValues;
-static FunctionPassManager *TheFPM;
+static std::map<std::string, Value *> NamedValues;
+static legacy::FunctionPassManager *TheFPM;
 
-Value *ErrorV(const char *Str) { Error(Str); return 0; }
+Value *ErrorV(const char *Str) {
+  Error(Str);
+  return 0;
+}
 
 Value *NumberExprAST::Codegen() {
   return ConstantFP::get(getGlobalContext(), APFloat(Val));
@@ -474,18 +526,23 @@ Value *VariableExprAST::Codegen() {
 Value *BinaryExprAST::Codegen() {
   Value *L = LHS->Codegen();
   Value *R = RHS->Codegen();
-  if (L == 0 || R == 0) return 0;
-  
+  if (L == 0 || R == 0)
+    return 0;
+
   switch (Op) {
-  case '+': return Builder.CreateFAdd(L, R, "addtmp");
-  case '-': return Builder.CreateFSub(L, R, "subtmp");
-  case '*': return Builder.CreateFMul(L, R, "multmp");
+  case '+':
+    return Builder.CreateFAdd(L, R, "addtmp");
+  case '-':
+    return Builder.CreateFSub(L, R, "subtmp");
+  case '*':
+    return Builder.CreateFMul(L, R, "multmp");
   case '<':
     L = Builder.CreateFCmpULT(L, R, "cmptmp");
     // Convert bool 0/1 to double 0.0 or 1.0
     return Builder.CreateUIToFP(L, Type::getDoubleTy(getGlobalContext()),
                                 "booltmp");
-  default: return ErrorV("invalid binary operator");
+  default:
+    return ErrorV("invalid binary operator");
   }
 }
 
@@ -494,66 +551,70 @@ Value *CallExprAST::Codegen() {
   Function *CalleeF = TheModule->getFunction(Callee);
   if (CalleeF == 0)
     return ErrorV("Unknown function referenced");
-  
+
   // If argument mismatch error.
   if (CalleeF->arg_size() != Args.size())
     return ErrorV("Incorrect # arguments passed");
 
-  std::vector<Value*> ArgsV;
+  std::vector<Value *> ArgsV;
   for (unsigned i = 0, e = Args.size(); i != e; ++i) {
     ArgsV.push_back(Args[i]->Codegen());
-    if (ArgsV.back() == 0) return 0;
+    if (ArgsV.back() == 0)
+      return 0;
   }
-  
+
   return Builder.CreateCall(CalleeF, ArgsV, "calltmp");
 }
 
 Value *IfExprAST::Codegen() {
   Value *CondV = Cond->Codegen();
-  if (CondV == 0) return 0;
-  
+  if (CondV == 0)
+    return 0;
+
   // Convert condition to a bool by comparing equal to 0.0.
-  CondV = Builder.CreateFCmpONE(CondV, 
-                              ConstantFP::get(getGlobalContext(), APFloat(0.0)),
-                                "ifcond");
-  
+  CondV = Builder.CreateFCmpONE(
+      CondV, ConstantFP::get(getGlobalContext(), APFloat(0.0)), "ifcond");
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
-  
+
   // Create blocks for the then and else cases.  Insert the 'then' block at the
   // end of the function.
-  BasicBlock *ThenBB = BasicBlock::Create(getGlobalContext(), "then", TheFunction);
+  BasicBlock *ThenBB =
+      BasicBlock::Create(getGlobalContext(), "then", TheFunction);
   BasicBlock *ElseBB = BasicBlock::Create(getGlobalContext(), "else");
   BasicBlock *MergeBB = BasicBlock::Create(getGlobalContext(), "ifcont");
-  
+
   Builder.CreateCondBr(CondV, ThenBB, ElseBB);
-  
+
   // Emit then value.
   Builder.SetInsertPoint(ThenBB);
-  
+
   Value *ThenV = Then->Codegen();
-  if (ThenV == 0) return 0;
-  
+  if (ThenV == 0)
+    return 0;
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
   ThenBB = Builder.GetInsertBlock();
-  
+
   // Emit else block.
   TheFunction->getBasicBlockList().push_back(ElseBB);
   Builder.SetInsertPoint(ElseBB);
-  
+
   Value *ElseV = Else->Codegen();
-  if (ElseV == 0) return 0;
-  
+  if (ElseV == 0)
+    return 0;
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
   ElseBB = Builder.GetInsertBlock();
-  
+
   // Emit merge block.
   TheFunction->getBasicBlockList().push_back(MergeBB);
   Builder.SetInsertPoint(MergeBB);
-  PHINode *PN = Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2,
-                                  "iftmp");
-  
+  PHINode *PN =
+      Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2, "iftmp");
+
   PN->addIncoming(ThenV, ThenBB);
   PN->addIncoming(ElseV, ElseBB);
   return PN;
@@ -564,7 +625,7 @@ Value *ForExprAST::Codegen() {
   //   ...
   //   start = startexpr
   //   goto loop
-  // loop: 
+  // loop:
   //   variable = phi [start, loopheader], [nextvariable, loopend]
   //   ...
   //   bodyexpr
@@ -575,136 +636,141 @@ Value *ForExprAST::Codegen() {
   //   endcond = endexpr
   //   br endcond, loop, endloop
   // outloop:
-  
+
   // Emit the start code first, without 'variable' in scope.
   Value *StartVal = Start->Codegen();
-  if (StartVal == 0) return 0;
-  
+  if (StartVal == 0)
+    return 0;
+
   // Make the new basic block for the loop header, inserting after current
   // block.
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
   BasicBlock *PreheaderBB = Builder.GetInsertBlock();
-  BasicBlock *LoopBB = BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
-  
+  BasicBlock *LoopBB =
+      BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
+
   // Insert an explicit fall through from the current block to the LoopBB.
   Builder.CreateBr(LoopBB);
 
   // Start insertion in LoopBB.
   Builder.SetInsertPoint(LoopBB);
-  
+
   // Start the PHI node with an entry for Start.
-  PHINode *Variable = Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2, VarName.c_str());
+  PHINode *Variable = Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()),
+                                        2, VarName.c_str());
   Variable->addIncoming(StartVal, PreheaderBB);
-  
+
   // Within the loop, the variable is defined equal to the PHI node.  If it
   // shadows an existing variable, we have to restore it, so save it now.
   Value *OldVal = NamedValues[VarName];
   NamedValues[VarName] = Variable;
-  
+
   // Emit the body of the loop.  This, like any other expr, can change the
   // current BB.  Note that we ignore the value computed by the body, but don't
   // allow an error.
   if (Body->Codegen() == 0)
     return 0;
-  
+
   // Emit the step value.
   Value *StepVal;
   if (Step) {
     StepVal = Step->Codegen();
-    if (StepVal == 0) return 0;
+    if (StepVal == 0)
+      return 0;
   } else {
     // If not specified, use 1.0.
     StepVal = ConstantFP::get(getGlobalContext(), APFloat(1.0));
   }
-  
+
   Value *NextVar = Builder.CreateFAdd(Variable, StepVal, "nextvar");
 
   // Compute the end condition.
   Value *EndCond = End->Codegen();
-  if (EndCond == 0) return EndCond;
-  
+  if (EndCond == 0)
+    return EndCond;
+
   // Convert condition to a bool by comparing equal to 0.0.
-  EndCond = Builder.CreateFCmpONE(EndCond, 
-                              ConstantFP::get(getGlobalContext(), APFloat(0.0)),
-                                  "loopcond");
-  
+  EndCond = Builder.CreateFCmpONE(
+      EndCond, ConstantFP::get(getGlobalContext(), APFloat(0.0)), "loopcond");
+
   // Create the "after loop" block and insert it.
   BasicBlock *LoopEndBB = Builder.GetInsertBlock();
-  BasicBlock *AfterBB = BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
-  
+  BasicBlock *AfterBB =
+      BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
+
   // Insert the conditional branch into the end of LoopEndBB.
   Builder.CreateCondBr(EndCond, LoopBB, AfterBB);
-  
+
   // Any new code will be inserted in AfterBB.
   Builder.SetInsertPoint(AfterBB);
-  
+
   // Add a new entry to the PHI node for the backedge.
   Variable->addIncoming(NextVar, LoopEndBB);
-  
+
   // Restore the unshadowed variable.
   if (OldVal)
     NamedValues[VarName] = OldVal;
   else
     NamedValues.erase(VarName);
 
-  
   // for expr always returns 0.0.
   return Constant::getNullValue(Type::getDoubleTy(getGlobalContext()));
 }
 
 Function *PrototypeAST::Codegen() {
   // Make the function type:  double(double,double) etc.
-  std::vector<Type*> Doubles(Args.size(),
-                             Type::getDoubleTy(getGlobalContext()));
-  FunctionType *FT = FunctionType::get(Type::getDoubleTy(getGlobalContext()),
-                                       Doubles, false);
-  
-  Function *F = Function::Create(FT, Function::ExternalLinkage, Name, TheModule);
-  
+  std::vector<Type *> Doubles(Args.size(),
+                              Type::getDoubleTy(getGlobalContext()));
+  FunctionType *FT =
+      FunctionType::get(Type::getDoubleTy(getGlobalContext()), Doubles, false);
+
+  Function *F =
+      Function::Create(FT, Function::ExternalLinkage, Name, TheModule);
+
   // If F conflicted, there was already something named 'Name'.  If it has a
   // body, don't allow redefinition or reextern.
   if (F->getName() != Name) {
     // Delete the one we just made and get the existing one.
     F->eraseFromParent();
     F = TheModule->getFunction(Name);
-    
+
     // If F already has a body, reject this.
     if (!F->empty()) {
       ErrorF("redefinition of function");
       return 0;
     }
-    
+
     // If F took a different number of args, reject.
     if (F->arg_size() != Args.size()) {
       ErrorF("redefinition of function with different # args");
       return 0;
     }
   }
-  
+
   // Set names for all arguments.
   unsigned Idx = 0;
   for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
        ++AI, ++Idx) {
     AI->setName(Args[Idx]);
-    
+
     // Add arguments to variable symbol table.
     NamedValues[Args[Idx]] = AI;
   }
-  
+
   return F;
 }
 
 Function *FunctionAST::Codegen() {
   NamedValues.clear();
-  
+
   Function *TheFunction = Proto->Codegen();
   if (TheFunction == 0)
     return 0;
-  
+
   // Create a new basic block to start insertion into.
   BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
   Builder.SetInsertPoint(BB);
-  
+
   if (Value *RetVal = Body->Codegen()) {
     // Finish off the function.
     Builder.CreateRet(RetVal);
@@ -714,10 +780,10 @@ Function *FunctionAST::Codegen() {
 
     // Optimize the function.
     TheFPM->run(*TheFunction);
-    
+
     return TheFunction;
   }
-  
+
   // Error reading body, remove function.
   TheFunction->eraseFromParent();
   return 0;
@@ -757,9 +823,10 @@ static void HandleTopLevelExpression() {
   // Evaluate a top-level expression into an anonymous function.
   if (FunctionAST *F = ParseTopLevelExpr()) {
     if (Function *LF = F->Codegen()) {
+      TheExecutionEngine->finalizeObject();
       // JIT the function, returning a function pointer.
       void *FPtr = TheExecutionEngine->getPointerToFunction(LF);
-      
+
       // Cast it to the right type (takes no arguments, returns a double) so we
       // can call it as a native function.
       double (*FP)() = (double (*)())(intptr_t)FPtr;
@@ -776,11 +843,20 @@ static void MainLoop() {
   while (1) {
     fprintf(stderr, "ready> ");
     switch (CurTok) {
-    case tok_eof:    return;
-    case ';':        getNextToken(); break;  // ignore top-level semicolons.
-    case tok_def:    HandleDefinition(); break;
-    case tok_extern: HandleExtern(); break;
-    default:         HandleTopLevelExpression(); break;
+    case tok_eof:
+      return;
+    case ';':
+      getNextToken();
+      break; // ignore top-level semicolons.
+    case tok_def:
+      HandleDefinition();
+      break;
+    case tok_extern:
+      HandleExtern();
+      break;
+    default:
+      HandleTopLevelExpression();
+      break;
     }
   }
 }
@@ -790,8 +866,7 @@ static void MainLoop() {
 //===----------------------------------------------------------------------===//
 
 /// putchard - putchar that takes a double and returns 0.
-extern "C" 
-double putchard(double X) {
+extern "C" double putchard(double X) {
   putchar((char)X);
   return 0;
 }
@@ -802,6 +877,8 @@ double putchard(double X) {
 
 int main() {
   InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  InitializeNativeTargetAsmParser();
   LLVMContext &Context = getGlobalContext();
 
   // Install standard binary operators.
@@ -809,7 +886,7 @@ int main() {
   BinopPrecedence['<'] = 10;
   BinopPrecedence['+'] = 20;
   BinopPrecedence['-'] = 20;
-  BinopPrecedence['*'] = 40;  // highest.
+  BinopPrecedence['*'] = 40; // highest.
 
   // Prime the first token.
   fprintf(stderr, "ready> ");
@@ -822,13 +899,16 @@ int main() {
   // Create the JIT.  This takes ownership of the module.
   std::string ErrStr;
   TheExecutionEngine =
-      EngineBuilder(std::move(Owner)).setErrorStr(&ErrStr).create();
+      EngineBuilder(std::move(Owner))
+          .setErrorStr(&ErrStr)
+          .setMCJITMemoryManager(llvm::make_unique<SectionMemoryManager>())
+          .create();
   if (!TheExecutionEngine) {
     fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
     exit(1);
   }
 
-  FunctionPassManager OurFPM(TheModule);
+  legacy::FunctionPassManager OurFPM(TheModule);
 
   // Set up the optimizer pipeline.  Start with registering info about how the
   // target lays out data structures.
diff --git a/examples/Kaleidoscope/Chapter6/CMakeLists.txt b/examples/Kaleidoscope/Chapter6/CMakeLists.txt
index d36f030..55b9a78 100644
--- a/examples/Kaleidoscope/Chapter6/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter6/CMakeLists.txt
@@ -3,12 +3,12 @@ set(LLVM_LINK_COMPONENTS
   Core
   ExecutionEngine
   InstCombine
-  MC
+  MCJIT
   ScalarOpts
   Support
-  nativecodegen
+  native
   )
 
-add_llvm_example(Kaleidoscope-Ch6
+add_kaleidoscope_chapter(Kaleidoscope-Ch6
   toy.cpp
   )
diff --git a/examples/Kaleidoscope/Chapter6/toy.cpp b/examples/Kaleidoscope/Chapter6/toy.cpp
index 06da9ac..8131aa1 100644
--- a/examples/Kaleidoscope/Chapter6/toy.cpp
+++ b/examples/Kaleidoscope/Chapter6/toy.cpp
@@ -1,12 +1,14 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/MCJIT.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
 #include <cctype>
@@ -26,21 +28,27 @@ enum Token {
   tok_eof = -1,
 
   // commands
-  tok_def = -2, tok_extern = -3,
+  tok_def = -2,
+  tok_extern = -3,
 
   // primary
-  tok_identifier = -4, tok_number = -5,
-  
+  tok_identifier = -4,
+  tok_number = -5,
+
   // control
-  tok_if = -6, tok_then = -7, tok_else = -8,
-  tok_for = -9, tok_in = -10,
-  
+  tok_if = -6,
+  tok_then = -7,
+  tok_else = -8,
+  tok_for = -9,
+  tok_in = -10,
+
   // operators
-  tok_binary = -11, tok_unary = -12
+  tok_binary = -11,
+  tok_unary = -12
 };
 
-static std::string IdentifierStr;  // Filled in if tok_identifier
-static double NumVal;              // Filled in if tok_number
+static std::string IdentifierStr; // Filled in if tok_identifier
+static double NumVal;             // Filled in if tok_number
 
 /// gettok - Return the next token from standard input.
 static int gettok() {
@@ -55,19 +63,28 @@ static int gettok() {
     while (isalnum((LastChar = getchar())))
       IdentifierStr += LastChar;
 
-    if (IdentifierStr == "def") return tok_def;
-    if (IdentifierStr == "extern") return tok_extern;
-    if (IdentifierStr == "if") return tok_if;
-    if (IdentifierStr == "then") return tok_then;
-    if (IdentifierStr == "else") return tok_else;
-    if (IdentifierStr == "for") return tok_for;
-    if (IdentifierStr == "in") return tok_in;
-    if (IdentifierStr == "binary") return tok_binary;
-    if (IdentifierStr == "unary") return tok_unary;
+    if (IdentifierStr == "def")
+      return tok_def;
+    if (IdentifierStr == "extern")
+      return tok_extern;
+    if (IdentifierStr == "if")
+      return tok_if;
+    if (IdentifierStr == "then")
+      return tok_then;
+    if (IdentifierStr == "else")
+      return tok_else;
+    if (IdentifierStr == "for")
+      return tok_for;
+    if (IdentifierStr == "in")
+      return tok_in;
+    if (IdentifierStr == "binary")
+      return tok_binary;
+    if (IdentifierStr == "unary")
+      return tok_unary;
     return tok_identifier;
   }
 
-  if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
+  if (isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+
     std::string NumStr;
     do {
       NumStr += LastChar;
@@ -80,13 +97,14 @@ static int gettok() {
 
   if (LastChar == '#') {
     // Comment until end of line.
-    do LastChar = getchar();
+    do
+      LastChar = getchar();
     while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
-    
+
     if (LastChar != EOF)
       return gettok();
   }
-  
+
   // Check for end of file.  Don't eat the EOF.
   if (LastChar == EOF)
     return tok_eof;
@@ -111,6 +129,7 @@ public:
 /// NumberExprAST - Expression class for numeric literals like "1.0".
 class NumberExprAST : public ExprAST {
   double Val;
+
 public:
   NumberExprAST(double val) : Val(val) {}
   virtual Value *Codegen();
@@ -119,6 +138,7 @@ public:
 /// VariableExprAST - Expression class for referencing a variable, like "a".
 class VariableExprAST : public ExprAST {
   std::string Name;
+
 public:
   VariableExprAST(const std::string &name) : Name(name) {}
   virtual Value *Codegen();
@@ -128,9 +148,10 @@ public:
 class UnaryExprAST : public ExprAST {
   char Opcode;
   ExprAST *Operand;
+
 public:
-  UnaryExprAST(char opcode, ExprAST *operand) 
-    : Opcode(opcode), Operand(operand) {}
+  UnaryExprAST(char opcode, ExprAST *operand)
+      : Opcode(opcode), Operand(operand) {}
   virtual Value *Codegen();
 };
 
@@ -138,28 +159,31 @@ public:
 class BinaryExprAST : public ExprAST {
   char Op;
   ExprAST *LHS, *RHS;
+
 public:
-  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs) 
-    : Op(op), LHS(lhs), RHS(rhs) {}
+  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs)
+      : Op(op), LHS(lhs), RHS(rhs) {}
   virtual Value *Codegen();
 };
 
 /// CallExprAST - Expression class for function calls.
 class CallExprAST : public ExprAST {
   std::string Callee;
-  std::vector<ExprAST*> Args;
+  std::vector<ExprAST *> Args;
+
 public:
-  CallExprAST(const std::string &callee, std::vector<ExprAST*> &args)
-    : Callee(callee), Args(args) {}
+  CallExprAST(const std::string &callee, std::vector<ExprAST *> &args)
+      : Callee(callee), Args(args) {}
   virtual Value *Codegen();
 };
 
 /// IfExprAST - Expression class for if/then/else.
 class IfExprAST : public ExprAST {
   ExprAST *Cond, *Then, *Else;
+
 public:
   IfExprAST(ExprAST *cond, ExprAST *then, ExprAST *_else)
-  : Cond(cond), Then(then), Else(_else) {}
+      : Cond(cond), Then(then), Else(_else) {}
   virtual Value *Codegen();
 };
 
@@ -167,10 +191,11 @@ public:
 class ForExprAST : public ExprAST {
   std::string VarName;
   ExprAST *Start, *End, *Step, *Body;
+
 public:
   ForExprAST(const std::string &varname, ExprAST *start, ExprAST *end,
              ExprAST *step, ExprAST *body)
-    : VarName(varname), Start(start), End(end), Step(step), Body(body) {}
+      : VarName(varname), Start(start), End(end), Step(step), Body(body) {}
   virtual Value *Codegen();
 };
 
@@ -181,22 +206,22 @@ class PrototypeAST {
   std::string Name;
   std::vector<std::string> Args;
   bool isOperator;
-  unsigned Precedence;  // Precedence if a binary op.
+  unsigned Precedence; // Precedence if a binary op.
 public:
   PrototypeAST(const std::string &name, const std::vector<std::string> &args,
                bool isoperator = false, unsigned prec = 0)
-  : Name(name), Args(args), isOperator(isoperator), Precedence(prec) {}
-  
+      : Name(name), Args(args), isOperator(isoperator), Precedence(prec) {}
+
   bool isUnaryOp() const { return isOperator && Args.size() == 1; }
   bool isBinaryOp() const { return isOperator && Args.size() == 2; }
-  
+
   char getOperatorName() const {
     assert(isUnaryOp() || isBinaryOp());
-    return Name[Name.size()-1];
+    return Name[Name.size() - 1];
   }
-  
+
   unsigned getBinaryPrecedence() const { return Precedence; }
-  
+
   Function *Codegen();
 };
 
@@ -204,10 +229,10 @@ public:
 class FunctionAST {
   PrototypeAST *Proto;
   ExprAST *Body;
+
 public:
-  FunctionAST(PrototypeAST *proto, ExprAST *body)
-    : Proto(proto), Body(body) {}
-  
+  FunctionAST(PrototypeAST *proto, ExprAST *body) : Proto(proto), Body(body) {}
+
   Function *Codegen();
 };
 } // end anonymous namespace
@@ -220,9 +245,7 @@ public:
 /// token the parser is looking at.  getNextToken reads another token from the
 /// lexer and updates CurTok with its results.
 static int CurTok;
-static int getNextToken() {
-  return CurTok = gettok();
-}
+static int getNextToken() { return CurTok = gettok(); }
 
 /// BinopPrecedence - This holds the precedence for each binary operator that is
 /// defined.
@@ -232,17 +255,27 @@ static std::map<char, int> BinopPrecedence;
 static int GetTokPrecedence() {
   if (!isascii(CurTok))
     return -1;
-  
+
   // Make sure it's a declared binop.
   int TokPrec = BinopPrecedence[CurTok];
-  if (TokPrec <= 0) return -1;
+  if (TokPrec <= 0)
+    return -1;
   return TokPrec;
 }
 
 /// Error* - These are little helper functions for error handling.
-ExprAST *Error(const char *Str) { fprintf(stderr, "Error: %s\n", Str);return 0;}
-PrototypeAST *ErrorP(const char *Str) { Error(Str); return 0; }
-FunctionAST *ErrorF(const char *Str) { Error(Str); return 0; }
+ExprAST *Error(const char *Str) {
+  fprintf(stderr, "Error: %s\n", Str);
+  return 0;
+}
+PrototypeAST *ErrorP(const char *Str) {
+  Error(Str);
+  return 0;
+}
+FunctionAST *ErrorF(const char *Str) {
+  Error(Str);
+  return 0;
+}
 
 static ExprAST *ParseExpression();
 
@@ -251,22 +284,24 @@ static ExprAST *ParseExpression();
 ///   ::= identifier '(' expression* ')'
 static ExprAST *ParseIdentifierExpr() {
   std::string IdName = IdentifierStr;
-  
-  getNextToken();  // eat identifier.
-  
+
+  getNextToken(); // eat identifier.
+
   if (CurTok != '(') // Simple variable ref.
     return new VariableExprAST(IdName);
-  
+
   // Call.
-  getNextToken();  // eat (
-  std::vector<ExprAST*> Args;
+  getNextToken(); // eat (
+  std::vector<ExprAST *> Args;
   if (CurTok != ')') {
     while (1) {
       ExprAST *Arg = ParseExpression();
-      if (!Arg) return 0;
+      if (!Arg)
+        return 0;
       Args.push_back(Arg);
 
-      if (CurTok == ')') break;
+      if (CurTok == ')')
+        break;
 
       if (CurTok != ',')
         return Error("Expected ')' or ',' in argument list");
@@ -276,7 +311,7 @@ static ExprAST *ParseIdentifierExpr() {
 
   // Eat the ')'.
   getNextToken();
-  
+
   return new CallExprAST(IdName, Args);
 }
 
@@ -289,80 +324,87 @@ static ExprAST *ParseNumberExpr() {
 
 /// parenexpr ::= '(' expression ')'
 static ExprAST *ParseParenExpr() {
-  getNextToken();  // eat (.
+  getNextToken(); // eat (.
   ExprAST *V = ParseExpression();
-  if (!V) return 0;
-  
+  if (!V)
+    return 0;
+
   if (CurTok != ')')
     return Error("expected ')'");
-  getNextToken();  // eat ).
+  getNextToken(); // eat ).
   return V;
 }
 
 /// ifexpr ::= 'if' expression 'then' expression 'else' expression
 static ExprAST *ParseIfExpr() {
-  getNextToken();  // eat the if.
-  
+  getNextToken(); // eat the if.
+
   // condition.
   ExprAST *Cond = ParseExpression();
-  if (!Cond) return 0;
-  
+  if (!Cond)
+    return 0;
+
   if (CurTok != tok_then)
     return Error("expected then");
-  getNextToken();  // eat the then
-  
+  getNextToken(); // eat the then
+
   ExprAST *Then = ParseExpression();
-  if (Then == 0) return 0;
-  
+  if (Then == 0)
+    return 0;
+
   if (CurTok != tok_else)
     return Error("expected else");
-  
+
   getNextToken();
-  
+
   ExprAST *Else = ParseExpression();
-  if (!Else) return 0;
-  
+  if (!Else)
+    return 0;
+
   return new IfExprAST(Cond, Then, Else);
 }
 
 /// forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression
 static ExprAST *ParseForExpr() {
-  getNextToken();  // eat the for.
+  getNextToken(); // eat the for.
 
   if (CurTok != tok_identifier)
     return Error("expected identifier after for");
-  
+
   std::string IdName = IdentifierStr;
-  getNextToken();  // eat identifier.
-  
+  getNextToken(); // eat identifier.
+
   if (CurTok != '=')
     return Error("expected '=' after for");
-  getNextToken();  // eat '='.
-  
-  
+  getNextToken(); // eat '='.
+
   ExprAST *Start = ParseExpression();
-  if (Start == 0) return 0;
+  if (Start == 0)
+    return 0;
   if (CurTok != ',')
     return Error("expected ',' after for start value");
   getNextToken();
-  
+
   ExprAST *End = ParseExpression();
-  if (End == 0) return 0;
-  
+  if (End == 0)
+    return 0;
+
   // The step value is optional.
   ExprAST *Step = 0;
   if (CurTok == ',') {
     getNextToken();
     Step = ParseExpression();
-    if (Step == 0) return 0;
+    if (Step == 0)
+      return 0;
   }
-  
+
   if (CurTok != tok_in)
     return Error("expected 'in' after for");
-  getNextToken();  // eat 'in'.
-  
+  getNextToken(); // eat 'in'.
+
   ExprAST *Body = ParseExpression();
-  if (Body == 0) return 0;
+  if (Body == 0)
+    return 0;
 
   return new ForExprAST(IdName, Start, End, Step, Body);
 }
@@ -375,12 +417,18 @@ static ExprAST *ParseForExpr() {
 ///   ::= forexpr
 static ExprAST *ParsePrimary() {
   switch (CurTok) {
-  default: return Error("unknown token when expecting an expression");
-  case tok_identifier: return ParseIdentifierExpr();
-  case tok_number:     return ParseNumberExpr();
-  case '(':            return ParseParenExpr();
-  case tok_if:         return ParseIfExpr();
-  case tok_for:        return ParseForExpr();
+  default:
+    return Error("unknown token when expecting an expression");
+  case tok_identifier:
+    return ParseIdentifierExpr();
+  case tok_number:
+    return ParseNumberExpr();
+  case '(':
+    return ParseParenExpr();
+  case tok_if:
+    return ParseIfExpr();
+  case tok_for:
+    return ParseForExpr();
   }
 }
 
@@ -391,7 +439,7 @@ static ExprAST *ParseUnary() {
   // If the current token is not an operator, it must be a primary expr.
   if (!isascii(CurTok) || CurTok == '(' || CurTok == ',')
     return ParsePrimary();
-  
+
   // If this is a unary operator, read it.
   int Opc = CurTok;
   getNextToken();
@@ -406,28 +454,30 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
   // If this is a binop, find its precedence.
   while (1) {
     int TokPrec = GetTokPrecedence();
-    
+
     // If this is a binop that binds at least as tightly as the current binop,
     // consume it, otherwise we are done.
     if (TokPrec < ExprPrec)
       return LHS;
-    
+
     // Okay, we know this is a binop.
     int BinOp = CurTok;
-    getNextToken();  // eat binop
-    
+    getNextToken(); // eat binop
+
     // Parse the unary expression after the binary operator.
     ExprAST *RHS = ParseUnary();
-    if (!RHS) return 0;
-    
+    if (!RHS)
+      return 0;
+
     // If BinOp binds less tightly with RHS than the operator after RHS, let
     // the pending operator take RHS as its LHS.
     int NextPrec = GetTokPrecedence();
     if (TokPrec < NextPrec) {
-      RHS = ParseBinOpRHS(TokPrec+1, RHS);
-      if (RHS == 0) return 0;
+      RHS = ParseBinOpRHS(TokPrec + 1, RHS);
+      if (RHS == 0)
+        return 0;
     }
-    
+
     // Merge LHS/RHS.
     LHS = new BinaryExprAST(BinOp, LHS, RHS);
   }
@@ -438,8 +488,9 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
 ///
 static ExprAST *ParseExpression() {
   ExprAST *LHS = ParseUnary();
-  if (!LHS) return 0;
-  
+  if (!LHS)
+    return 0;
+
   return ParseBinOpRHS(0, LHS);
 }
 
@@ -449,10 +500,10 @@ static ExprAST *ParseExpression() {
 ///   ::= unary LETTER (id)
 static PrototypeAST *ParsePrototype() {
   std::string FnName;
-  
+
   unsigned Kind = 0; // 0 = identifier, 1 = unary, 2 = binary.
   unsigned BinaryPrecedence = 30;
-  
+
   switch (CurTok) {
   default:
     return ErrorP("Expected function name in prototype");
@@ -478,7 +529,7 @@ static PrototypeAST *ParsePrototype() {
     FnName += (char)CurTok;
     Kind = 2;
     getNextToken();
-    
+
     // Read the precedence if present.
     if (CurTok == tok_number) {
       if (NumVal < 1 || NumVal > 100)
@@ -488,31 +539,32 @@ static PrototypeAST *ParsePrototype() {
     }
     break;
   }
-  
+
   if (CurTok != '(')
     return ErrorP("Expected '(' in prototype");
-  
+
   std::vector<std::string> ArgNames;
   while (getNextToken() == tok_identifier)
     ArgNames.push_back(IdentifierStr);
   if (CurTok != ')')
     return ErrorP("Expected ')' in prototype");
-  
+
   // success.
-  getNextToken();  // eat ')'.
-  
+  getNextToken(); // eat ')'.
+
   // Verify right number of names for operator.
   if (Kind && ArgNames.size() != Kind)
     return ErrorP("Invalid number of operands for operator");
-  
+
   return new PrototypeAST(FnName, ArgNames, Kind != 0, BinaryPrecedence);
 }
 
 /// definition ::= 'def' prototype expression
 static FunctionAST *ParseDefinition() {
-  getNextToken();  // eat def.
+  getNextToken(); // eat def.
   PrototypeAST *Proto = ParsePrototype();
-  if (Proto == 0) return 0;
+  if (Proto == 0)
+    return 0;
 
   if (ExprAST *E = ParseExpression())
     return new FunctionAST(Proto, E);
@@ -531,7 +583,7 @@ static FunctionAST *ParseTopLevelExpr() {
 
 /// external ::= 'extern' prototype
 static PrototypeAST *ParseExtern() {
-  getNextToken();  // eat extern.
+  getNextToken(); // eat extern.
   return ParsePrototype();
 }
 
@@ -541,10 +593,13 @@ static PrototypeAST *ParseExtern() {
 
 static Module *TheModule;
 static IRBuilder<> Builder(getGlobalContext());
-static std::map<std::string, Value*> NamedValues;
-static FunctionPassManager *TheFPM;
+static std::map<std::string, Value *> NamedValues;
+static legacy::FunctionPassManager *TheFPM;
 
-Value *ErrorV(const char *Str) { Error(Str); return 0; }
+Value *ErrorV(const char *Str) {
+  Error(Str);
+  return 0;
+}
 
 Value *NumberExprAST::Codegen() {
   return ConstantFP::get(getGlobalContext(), APFloat(Val));
@@ -558,37 +613,43 @@ Value *VariableExprAST::Codegen() {
 
 Value *UnaryExprAST::Codegen() {
   Value *OperandV = Operand->Codegen();
-  if (OperandV == 0) return 0;
-  
-  Function *F = TheModule->getFunction(std::string("unary")+Opcode);
+  if (OperandV == 0)
+    return 0;
+
+  Function *F = TheModule->getFunction(std::string("unary") + Opcode);
   if (F == 0)
     return ErrorV("Unknown unary operator");
-  
+
   return Builder.CreateCall(F, OperandV, "unop");
 }
 
 Value *BinaryExprAST::Codegen() {
   Value *L = LHS->Codegen();
   Value *R = RHS->Codegen();
-  if (L == 0 || R == 0) return 0;
-  
+  if (L == 0 || R == 0)
+    return 0;
+
   switch (Op) {
-  case '+': return Builder.CreateFAdd(L, R, "addtmp");
-  case '-': return Builder.CreateFSub(L, R, "subtmp");
-  case '*': return Builder.CreateFMul(L, R, "multmp");
+  case '+':
+    return Builder.CreateFAdd(L, R, "addtmp");
+  case '-':
+    return Builder.CreateFSub(L, R, "subtmp");
+  case '*':
+    return Builder.CreateFMul(L, R, "multmp");
   case '<':
     L = Builder.CreateFCmpULT(L, R, "cmptmp");
     // Convert bool 0/1 to double 0.0 or 1.0
     return Builder.CreateUIToFP(L, Type::getDoubleTy(getGlobalContext()),
                                 "booltmp");
-  default: break;
+  default:
+    break;
   }
-  
+
   // If it wasn't a builtin binary operator, it must be a user defined one. Emit
   // a call to it.
-  Function *F = TheModule->getFunction(std::string("binary")+Op);
+  Function *F = TheModule->getFunction(std::string("binary") + Op);
   assert(F && "binary operator not found!");
-  
+
   Value *Ops[] = { L, R };
   return Builder.CreateCall(F, Ops, "binop");
 }
@@ -598,66 +659,70 @@ Value *CallExprAST::Codegen() {
   Function *CalleeF = TheModule->getFunction(Callee);
   if (CalleeF == 0)
     return ErrorV("Unknown function referenced");
-  
+
   // If argument mismatch error.
   if (CalleeF->arg_size() != Args.size())
     return ErrorV("Incorrect # arguments passed");
 
-  std::vector<Value*> ArgsV;
+  std::vector<Value *> ArgsV;
   for (unsigned i = 0, e = Args.size(); i != e; ++i) {
     ArgsV.push_back(Args[i]->Codegen());
-    if (ArgsV.back() == 0) return 0;
+    if (ArgsV.back() == 0)
+      return 0;
   }
-  
+
   return Builder.CreateCall(CalleeF, ArgsV, "calltmp");
 }
 
 Value *IfExprAST::Codegen() {
   Value *CondV = Cond->Codegen();
-  if (CondV == 0) return 0;
-  
+  if (CondV == 0)
+    return 0;
+
   // Convert condition to a bool by comparing equal to 0.0.
-  CondV = Builder.CreateFCmpONE(CondV, 
-                              ConstantFP::get(getGlobalContext(), APFloat(0.0)),
-                                "ifcond");
-  
+  CondV = Builder.CreateFCmpONE(
+      CondV, ConstantFP::get(getGlobalContext(), APFloat(0.0)), "ifcond");
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
-  
+
   // Create blocks for the then and else cases.  Insert the 'then' block at the
   // end of the function.
-  BasicBlock *ThenBB = BasicBlock::Create(getGlobalContext(), "then", TheFunction);
+  BasicBlock *ThenBB =
+      BasicBlock::Create(getGlobalContext(), "then", TheFunction);
   BasicBlock *ElseBB = BasicBlock::Create(getGlobalContext(), "else");
   BasicBlock *MergeBB = BasicBlock::Create(getGlobalContext(), "ifcont");
-  
+
   Builder.CreateCondBr(CondV, ThenBB, ElseBB);
-  
+
   // Emit then value.
   Builder.SetInsertPoint(ThenBB);
-  
+
   Value *ThenV = Then->Codegen();
-  if (ThenV == 0) return 0;
-  
+  if (ThenV == 0)
+    return 0;
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
   ThenBB = Builder.GetInsertBlock();
-  
+
   // Emit else block.
   TheFunction->getBasicBlockList().push_back(ElseBB);
   Builder.SetInsertPoint(ElseBB);
-  
+
   Value *ElseV = Else->Codegen();
-  if (ElseV == 0) return 0;
-  
+  if (ElseV == 0)
+    return 0;
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
   ElseBB = Builder.GetInsertBlock();
-  
+
   // Emit merge block.
   TheFunction->getBasicBlockList().push_back(MergeBB);
   Builder.SetInsertPoint(MergeBB);
-  PHINode *PN = Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2,
-                                  "iftmp");
-  
+  PHINode *PN =
+      Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2, "iftmp");
+
   PN->addIncoming(ThenV, ThenBB);
   PN->addIncoming(ElseV, ElseBB);
   return PN;
@@ -668,7 +733,7 @@ Value *ForExprAST::Codegen() {
   //   ...
   //   start = startexpr
   //   goto loop
-  // loop: 
+  // loop:
   //   variable = phi [start, loopheader], [nextvariable, loopend]
   //   ...
   //   bodyexpr
@@ -679,140 +744,145 @@ Value *ForExprAST::Codegen() {
   //   endcond = endexpr
   //   br endcond, loop, endloop
   // outloop:
-  
+
   // Emit the start code first, without 'variable' in scope.
   Value *StartVal = Start->Codegen();
-  if (StartVal == 0) return 0;
-  
+  if (StartVal == 0)
+    return 0;
+
   // Make the new basic block for the loop header, inserting after current
   // block.
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
   BasicBlock *PreheaderBB = Builder.GetInsertBlock();
-  BasicBlock *LoopBB = BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
-  
+  BasicBlock *LoopBB =
+      BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
+
   // Insert an explicit fall through from the current block to the LoopBB.
   Builder.CreateBr(LoopBB);
 
   // Start insertion in LoopBB.
   Builder.SetInsertPoint(LoopBB);
-  
+
   // Start the PHI node with an entry for Start.
-  PHINode *Variable = Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2, VarName.c_str());
+  PHINode *Variable = Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()),
+                                        2, VarName.c_str());
   Variable->addIncoming(StartVal, PreheaderBB);
-  
+
   // Within the loop, the variable is defined equal to the PHI node.  If it
   // shadows an existing variable, we have to restore it, so save it now.
   Value *OldVal = NamedValues[VarName];
   NamedValues[VarName] = Variable;
-  
+
   // Emit the body of the loop.  This, like any other expr, can change the
   // current BB.  Note that we ignore the value computed by the body, but don't
   // allow an error.
   if (Body->Codegen() == 0)
     return 0;
-  
+
   // Emit the step value.
   Value *StepVal;
   if (Step) {
     StepVal = Step->Codegen();
-    if (StepVal == 0) return 0;
+    if (StepVal == 0)
+      return 0;
   } else {
     // If not specified, use 1.0.
     StepVal = ConstantFP::get(getGlobalContext(), APFloat(1.0));
   }
-  
+
   Value *NextVar = Builder.CreateFAdd(Variable, StepVal, "nextvar");
 
   // Compute the end condition.
   Value *EndCond = End->Codegen();
-  if (EndCond == 0) return EndCond;
-  
+  if (EndCond == 0)
+    return EndCond;
+
   // Convert condition to a bool by comparing equal to 0.0.
-  EndCond = Builder.CreateFCmpONE(EndCond, 
-                              ConstantFP::get(getGlobalContext(), APFloat(0.0)),
-                                  "loopcond");
-  
+  EndCond = Builder.CreateFCmpONE(
+      EndCond, ConstantFP::get(getGlobalContext(), APFloat(0.0)), "loopcond");
+
   // Create the "after loop" block and insert it.
   BasicBlock *LoopEndBB = Builder.GetInsertBlock();
-  BasicBlock *AfterBB = BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
-  
+  BasicBlock *AfterBB =
+      BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
+
   // Insert the conditional branch into the end of LoopEndBB.
   Builder.CreateCondBr(EndCond, LoopBB, AfterBB);
-  
+
   // Any new code will be inserted in AfterBB.
   Builder.SetInsertPoint(AfterBB);
-  
+
   // Add a new entry to the PHI node for the backedge.
   Variable->addIncoming(NextVar, LoopEndBB);
-  
+
   // Restore the unshadowed variable.
   if (OldVal)
     NamedValues[VarName] = OldVal;
   else
     NamedValues.erase(VarName);
 
-  
   // for expr always returns 0.0.
   return Constant::getNullValue(Type::getDoubleTy(getGlobalContext()));
 }
 
 Function *PrototypeAST::Codegen() {
   // Make the function type:  double(double,double) etc.
-  std::vector<Type*> Doubles(Args.size(),
-                             Type::getDoubleTy(getGlobalContext()));
-  FunctionType *FT = FunctionType::get(Type::getDoubleTy(getGlobalContext()),
-                                       Doubles, false);
-  
-  Function *F = Function::Create(FT, Function::ExternalLinkage, Name, TheModule);
-  
+  std::vector<Type *> Doubles(Args.size(),
+                              Type::getDoubleTy(getGlobalContext()));
+  FunctionType *FT =
+      FunctionType::get(Type::getDoubleTy(getGlobalContext()), Doubles, false);
+
+  Function *F =
+      Function::Create(FT, Function::ExternalLinkage, Name, TheModule);
+
   // If F conflicted, there was already something named 'Name'.  If it has a
   // body, don't allow redefinition or reextern.
   if (F->getName() != Name) {
     // Delete the one we just made and get the existing one.
     F->eraseFromParent();
     F = TheModule->getFunction(Name);
-    
+
     // If F already has a body, reject this.
     if (!F->empty()) {
       ErrorF("redefinition of function");
       return 0;
     }
-    
+
     // If F took a different number of args, reject.
     if (F->arg_size() != Args.size()) {
       ErrorF("redefinition of function with different # args");
       return 0;
     }
   }
-  
+
   // Set names for all arguments.
   unsigned Idx = 0;
   for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
        ++AI, ++Idx) {
     AI->setName(Args[Idx]);
-    
+
     // Add arguments to variable symbol table.
     NamedValues[Args[Idx]] = AI;
   }
-  
+
   return F;
 }
 
 Function *FunctionAST::Codegen() {
   NamedValues.clear();
-  
+
   Function *TheFunction = Proto->Codegen();
   if (TheFunction == 0)
     return 0;
-  
+
   // If this is an operator, install it.
   if (Proto->isBinaryOp())
     BinopPrecedence[Proto->getOperatorName()] = Proto->getBinaryPrecedence();
-  
+
   // Create a new basic block to start insertion into.
   BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
   Builder.SetInsertPoint(BB);
-  
+
   if (Value *RetVal = Body->Codegen()) {
     // Finish off the function.
     Builder.CreateRet(RetVal);
@@ -822,10 +892,10 @@ Function *FunctionAST::Codegen() {
 
     // Optimize the function.
     TheFPM->run(*TheFunction);
-    
+
     return TheFunction;
   }
-  
+
   // Error reading body, remove function.
   TheFunction->eraseFromParent();
 
@@ -868,9 +938,10 @@ static void HandleTopLevelExpression() {
   // Evaluate a top-level expression into an anonymous function.
   if (FunctionAST *F = ParseTopLevelExpr()) {
     if (Function *LF = F->Codegen()) {
+      TheExecutionEngine->finalizeObject();
       // JIT the function, returning a function pointer.
       void *FPtr = TheExecutionEngine->getPointerToFunction(LF);
-      
+
       // Cast it to the right type (takes no arguments, returns a double) so we
       // can call it as a native function.
       double (*FP)() = (double (*)())(intptr_t)FPtr;
@@ -887,11 +958,20 @@ static void MainLoop() {
   while (1) {
     fprintf(stderr, "ready> ");
     switch (CurTok) {
-    case tok_eof:    return;
-    case ';':        getNextToken(); break;  // ignore top-level semicolons.
-    case tok_def:    HandleDefinition(); break;
-    case tok_extern: HandleExtern(); break;
-    default:         HandleTopLevelExpression(); break;
+    case tok_eof:
+      return;
+    case ';':
+      getNextToken();
+      break; // ignore top-level semicolons.
+    case tok_def:
+      HandleDefinition();
+      break;
+    case tok_extern:
+      HandleExtern();
+      break;
+    default:
+      HandleTopLevelExpression();
+      break;
     }
   }
 }
@@ -901,15 +981,13 @@ static void MainLoop() {
 //===----------------------------------------------------------------------===//
 
 /// putchard - putchar that takes a double and returns 0.
-extern "C" 
-double putchard(double X) {
+extern "C" double putchard(double X) {
   putchar((char)X);
   return 0;
 }
 
 /// printd - printf that takes a double prints it as "%f\n", returning 0.
-extern "C" 
-double printd(double X) {
+extern "C" double printd(double X) {
   printf("%f\n", X);
   return 0;
 }
@@ -920,6 +998,8 @@ double printd(double X) {
 
 int main() {
   InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  InitializeNativeTargetAsmParser();
   LLVMContext &Context = getGlobalContext();
 
   // Install standard binary operators.
@@ -927,7 +1007,7 @@ int main() {
   BinopPrecedence['<'] = 10;
   BinopPrecedence['+'] = 20;
   BinopPrecedence['-'] = 20;
-  BinopPrecedence['*'] = 40;  // highest.
+  BinopPrecedence['*'] = 40; // highest.
 
   // Prime the first token.
   fprintf(stderr, "ready> ");
@@ -940,13 +1020,16 @@ int main() {
   // Create the JIT.  This takes ownership of the module.
   std::string ErrStr;
   TheExecutionEngine =
-      EngineBuilder(std::move(Owner)).setErrorStr(&ErrStr).create();
+      EngineBuilder(std::move(Owner))
+          .setErrorStr(&ErrStr)
+          .setMCJITMemoryManager(llvm::make_unique<SectionMemoryManager>())
+          .create();
   if (!TheExecutionEngine) {
     fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
     exit(1);
   }
 
-  FunctionPassManager OurFPM(TheModule);
+  legacy::FunctionPassManager OurFPM(TheModule);
 
   // Set up the optimizer pipeline.  Start with registering info about how the
   // target lays out data structures.
diff --git a/examples/Kaleidoscope/Chapter7/CMakeLists.txt b/examples/Kaleidoscope/Chapter7/CMakeLists.txt
index bdc0e55..4bc1b2c 100644
--- a/examples/Kaleidoscope/Chapter7/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter7/CMakeLists.txt
@@ -3,15 +3,15 @@ set(LLVM_LINK_COMPONENTS
   Core
   ExecutionEngine
   InstCombine
-  MC
+  MCJIT
   ScalarOpts
   Support
   TransformUtils
-  nativecodegen
+  native
   )
 
 set(LLVM_REQUIRES_RTTI 1)
 
-add_llvm_example(Kaleidoscope-Ch7
+add_kaleidoscope_chapter(Kaleidoscope-Ch7
   toy.cpp
   )
diff --git a/examples/Kaleidoscope/Chapter7/toy.cpp b/examples/Kaleidoscope/Chapter7/toy.cpp
index 56a6fa9..82f083a 100644
--- a/examples/Kaleidoscope/Chapter7/toy.cpp
+++ b/examples/Kaleidoscope/Chapter7/toy.cpp
@@ -1,12 +1,14 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/MCJIT.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
 #include <cctype>
@@ -26,24 +28,30 @@ enum Token {
   tok_eof = -1,
 
   // commands
-  tok_def = -2, tok_extern = -3,
+  tok_def = -2,
+  tok_extern = -3,
 
   // primary
-  tok_identifier = -4, tok_number = -5,
-  
+  tok_identifier = -4,
+  tok_number = -5,
+
   // control
-  tok_if = -6, tok_then = -7, tok_else = -8,
-  tok_for = -9, tok_in = -10,
-  
+  tok_if = -6,
+  tok_then = -7,
+  tok_else = -8,
+  tok_for = -9,
+  tok_in = -10,
+
   // operators
-  tok_binary = -11, tok_unary = -12,
-  
+  tok_binary = -11,
+  tok_unary = -12,
+
   // var definition
   tok_var = -13
 };
 
-static std::string IdentifierStr;  // Filled in if tok_identifier
-static double NumVal;              // Filled in if tok_number
+static std::string IdentifierStr; // Filled in if tok_identifier
+static double NumVal;             // Filled in if tok_number
 
 /// gettok - Return the next token from standard input.
 static int gettok() {
@@ -58,20 +66,30 @@ static int gettok() {
     while (isalnum((LastChar = getchar())))
       IdentifierStr += LastChar;
 
-    if (IdentifierStr == "def") return tok_def;
-    if (IdentifierStr == "extern") return tok_extern;
-    if (IdentifierStr == "if") return tok_if;
-    if (IdentifierStr == "then") return tok_then;
-    if (IdentifierStr == "else") return tok_else;
-    if (IdentifierStr == "for") return tok_for;
-    if (IdentifierStr == "in") return tok_in;
-    if (IdentifierStr == "binary") return tok_binary;
-    if (IdentifierStr == "unary") return tok_unary;
-    if (IdentifierStr == "var") return tok_var;
+    if (IdentifierStr == "def")
+      return tok_def;
+    if (IdentifierStr == "extern")
+      return tok_extern;
+    if (IdentifierStr == "if")
+      return tok_if;
+    if (IdentifierStr == "then")
+      return tok_then;
+    if (IdentifierStr == "else")
+      return tok_else;
+    if (IdentifierStr == "for")
+      return tok_for;
+    if (IdentifierStr == "in")
+      return tok_in;
+    if (IdentifierStr == "binary")
+      return tok_binary;
+    if (IdentifierStr == "unary")
+      return tok_unary;
+    if (IdentifierStr == "var")
+      return tok_var;
     return tok_identifier;
   }
 
-  if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
+  if (isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+
     std::string NumStr;
     do {
       NumStr += LastChar;
@@ -84,13 +102,14 @@ static int gettok() {
 
   if (LastChar == '#') {
     // Comment until end of line.
-    do LastChar = getchar();
+    do
+      LastChar = getchar();
     while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
-    
+
     if (LastChar != EOF)
       return gettok();
   }
-  
+
   // Check for end of file.  Don't eat the EOF.
   if (LastChar == EOF)
     return tok_eof;
@@ -115,6 +134,7 @@ public:
 /// NumberExprAST - Expression class for numeric literals like "1.0".
 class NumberExprAST : public ExprAST {
   double Val;
+
 public:
   NumberExprAST(double val) : Val(val) {}
   virtual Value *Codegen();
@@ -123,6 +143,7 @@ public:
 /// VariableExprAST - Expression class for referencing a variable, like "a".
 class VariableExprAST : public ExprAST {
   std::string Name;
+
 public:
   VariableExprAST(const std::string &name) : Name(name) {}
   const std::string &getName() const { return Name; }
@@ -133,9 +154,10 @@ public:
 class UnaryExprAST : public ExprAST {
   char Opcode;
   ExprAST *Operand;
+
 public:
-  UnaryExprAST(char opcode, ExprAST *operand) 
-    : Opcode(opcode), Operand(operand) {}
+  UnaryExprAST(char opcode, ExprAST *operand)
+      : Opcode(opcode), Operand(operand) {}
   virtual Value *Codegen();
 };
 
@@ -143,28 +165,31 @@ public:
 class BinaryExprAST : public ExprAST {
   char Op;
   ExprAST *LHS, *RHS;
+
 public:
-  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs) 
-    : Op(op), LHS(lhs), RHS(rhs) {}
+  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs)
+      : Op(op), LHS(lhs), RHS(rhs) {}
   virtual Value *Codegen();
 };
 
 /// CallExprAST - Expression class for function calls.
 class CallExprAST : public ExprAST {
   std::string Callee;
-  std::vector<ExprAST*> Args;
+  std::vector<ExprAST *> Args;
+
 public:
-  CallExprAST(const std::string &callee, std::vector<ExprAST*> &args)
-    : Callee(callee), Args(args) {}
+  CallExprAST(const std::string &callee, std::vector<ExprAST *> &args)
+      : Callee(callee), Args(args) {}
   virtual Value *Codegen();
 };
 
 /// IfExprAST - Expression class for if/then/else.
 class IfExprAST : public ExprAST {
   ExprAST *Cond, *Then, *Else;
+
 public:
   IfExprAST(ExprAST *cond, ExprAST *then, ExprAST *_else)
-  : Cond(cond), Then(then), Else(_else) {}
+      : Cond(cond), Then(then), Else(_else) {}
   virtual Value *Codegen();
 };
 
@@ -172,22 +197,24 @@ public:
 class ForExprAST : public ExprAST {
   std::string VarName;
   ExprAST *Start, *End, *Step, *Body;
+
 public:
   ForExprAST(const std::string &varname, ExprAST *start, ExprAST *end,
              ExprAST *step, ExprAST *body)
-    : VarName(varname), Start(start), End(end), Step(step), Body(body) {}
+      : VarName(varname), Start(start), End(end), Step(step), Body(body) {}
   virtual Value *Codegen();
 };
 
 /// VarExprAST - Expression class for var/in
 class VarExprAST : public ExprAST {
-  std::vector<std::pair<std::string, ExprAST*> > VarNames;
+  std::vector<std::pair<std::string, ExprAST *> > VarNames;
   ExprAST *Body;
+
 public:
-  VarExprAST(const std::vector<std::pair<std::string, ExprAST*> > &varnames,
+  VarExprAST(const std::vector<std::pair<std::string, ExprAST *> > &varnames,
              ExprAST *body)
-  : VarNames(varnames), Body(body) {}
-  
+      : VarNames(varnames), Body(body) {}
+
   virtual Value *Codegen();
 };
 
@@ -197,24 +224,24 @@ class PrototypeAST {
   std::string Name;
   std::vector<std::string> Args;
   bool isOperator;
-  unsigned Precedence;  // Precedence if a binary op.
+  unsigned Precedence; // Precedence if a binary op.
 public:
   PrototypeAST(const std::string &name, const std::vector<std::string> &args,
                bool isoperator = false, unsigned prec = 0)
-  : Name(name), Args(args), isOperator(isoperator), Precedence(prec) {}
-  
+      : Name(name), Args(args), isOperator(isoperator), Precedence(prec) {}
+
   bool isUnaryOp() const { return isOperator && Args.size() == 1; }
   bool isBinaryOp() const { return isOperator && Args.size() == 2; }
-  
+
   char getOperatorName() const {
     assert(isUnaryOp() || isBinaryOp());
-    return Name[Name.size()-1];
+    return Name[Name.size() - 1];
   }
-  
+
   unsigned getBinaryPrecedence() const { return Precedence; }
-  
+
   Function *Codegen();
-  
+
   void CreateArgumentAllocas(Function *F);
 };
 
@@ -222,10 +249,10 @@ public:
 class FunctionAST {
   PrototypeAST *Proto;
   ExprAST *Body;
+
 public:
-  FunctionAST(PrototypeAST *proto, ExprAST *body)
-    : Proto(proto), Body(body) {}
-  
+  FunctionAST(PrototypeAST *proto, ExprAST *body) : Proto(proto), Body(body) {}
+
   Function *Codegen();
 };
 } // end anonymous namespace
@@ -238,9 +265,7 @@ public:
 /// token the parser is looking at.  getNextToken reads another token from the
 /// lexer and updates CurTok with its results.
 static int CurTok;
-static int getNextToken() {
-  return CurTok = gettok();
-}
+static int getNextToken() { return CurTok = gettok(); }
 
 /// BinopPrecedence - This holds the precedence for each binary operator that is
 /// defined.
@@ -250,17 +275,27 @@ static std::map<char, int> BinopPrecedence;
 static int GetTokPrecedence() {
   if (!isascii(CurTok))
     return -1;
-  
+
   // Make sure it's a declared binop.
   int TokPrec = BinopPrecedence[CurTok];
-  if (TokPrec <= 0) return -1;
+  if (TokPrec <= 0)
+    return -1;
   return TokPrec;
 }
 
 /// Error* - These are little helper functions for error handling.
-ExprAST *Error(const char *Str) { fprintf(stderr, "Error: %s\n", Str);return 0;}
-PrototypeAST *ErrorP(const char *Str) { Error(Str); return 0; }
-FunctionAST *ErrorF(const char *Str) { Error(Str); return 0; }
+ExprAST *Error(const char *Str) {
+  fprintf(stderr, "Error: %s\n", Str);
+  return 0;
+}
+PrototypeAST *ErrorP(const char *Str) {
+  Error(Str);
+  return 0;
+}
+FunctionAST *ErrorF(const char *Str) {
+  Error(Str);
+  return 0;
+}
 
 static ExprAST *ParseExpression();
 
@@ -269,22 +304,24 @@ static ExprAST *ParseExpression();
 ///   ::= identifier '(' expression* ')'
 static ExprAST *ParseIdentifierExpr() {
   std::string IdName = IdentifierStr;
-  
-  getNextToken();  // eat identifier.
-  
+
+  getNextToken(); // eat identifier.
+
   if (CurTok != '(') // Simple variable ref.
     return new VariableExprAST(IdName);
-  
+
   // Call.
-  getNextToken();  // eat (
-  std::vector<ExprAST*> Args;
+  getNextToken(); // eat (
+  std::vector<ExprAST *> Args;
   if (CurTok != ')') {
     while (1) {
       ExprAST *Arg = ParseExpression();
-      if (!Arg) return 0;
+      if (!Arg)
+        return 0;
       Args.push_back(Arg);
 
-      if (CurTok == ')') break;
+      if (CurTok == ')')
+        break;
 
       if (CurTok != ',')
         return Error("Expected ')' or ',' in argument list");
@@ -294,7 +331,7 @@ static ExprAST *ParseIdentifierExpr() {
 
   // Eat the ')'.
   getNextToken();
-  
+
   return new CallExprAST(IdName, Args);
 }
 
@@ -307,126 +344,136 @@ static ExprAST *ParseNumberExpr() {
 
 /// parenexpr ::= '(' expression ')'
 static ExprAST *ParseParenExpr() {
-  getNextToken();  // eat (.
+  getNextToken(); // eat (.
   ExprAST *V = ParseExpression();
-  if (!V) return 0;
-  
+  if (!V)
+    return 0;
+
   if (CurTok != ')')
     return Error("expected ')'");
-  getNextToken();  // eat ).
+  getNextToken(); // eat ).
   return V;
 }
 
 /// ifexpr ::= 'if' expression 'then' expression 'else' expression
 static ExprAST *ParseIfExpr() {
-  getNextToken();  // eat the if.
-  
+  getNextToken(); // eat the if.
+
   // condition.
   ExprAST *Cond = ParseExpression();
-  if (!Cond) return 0;
-  
+  if (!Cond)
+    return 0;
+
   if (CurTok != tok_then)
     return Error("expected then");
-  getNextToken();  // eat the then
-  
+  getNextToken(); // eat the then
+
   ExprAST *Then = ParseExpression();
-  if (Then == 0) return 0;
-  
+  if (Then == 0)
+    return 0;
+
   if (CurTok != tok_else)
     return Error("expected else");
-  
+
   getNextToken();
-  
+
   ExprAST *Else = ParseExpression();
-  if (!Else) return 0;
-  
+  if (!Else)
+    return 0;
+
   return new IfExprAST(Cond, Then, Else);
 }
 
 /// forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression
 static ExprAST *ParseForExpr() {
-  getNextToken();  // eat the for.
+  getNextToken(); // eat the for.
 
   if (CurTok != tok_identifier)
     return Error("expected identifier after for");
-  
+
   std::string IdName = IdentifierStr;
-  getNextToken();  // eat identifier.
-  
+  getNextToken(); // eat identifier.
+
   if (CurTok != '=')
     return Error("expected '=' after for");
-  getNextToken();  // eat '='.
-  
-  
+  getNextToken(); // eat '='.
+
   ExprAST *Start = ParseExpression();
-  if (Start == 0) return 0;
+  if (Start == 0)
+    return 0;
   if (CurTok != ',')
     return Error("expected ',' after for start value");
   getNextToken();
-  
+
   ExprAST *End = ParseExpression();
-  if (End == 0) return 0;
-  
+  if (End == 0)
+    return 0;
+
   // The step value is optional.
   ExprAST *Step = 0;
   if (CurTok == ',') {
     getNextToken();
     Step = ParseExpression();
-    if (Step == 0) return 0;
+    if (Step == 0)
+      return 0;
   }
-  
+
   if (CurTok != tok_in)
     return Error("expected 'in' after for");
-  getNextToken();  // eat 'in'.
-  
+  getNextToken(); // eat 'in'.
+
   ExprAST *Body = ParseExpression();
-  if (Body == 0) return 0;
+  if (Body == 0)
+    return 0;
 
   return new ForExprAST(IdName, Start, End, Step, Body);
 }
 
-/// varexpr ::= 'var' identifier ('=' expression)? 
+/// varexpr ::= 'var' identifier ('=' expression)?
 //                    (',' identifier ('=' expression)?)* 'in' expression
 static ExprAST *ParseVarExpr() {
-  getNextToken();  // eat the var.
+  getNextToken(); // eat the var.
 
-  std::vector<std::pair<std::string, ExprAST*> > VarNames;
+  std::vector<std::pair<std::string, ExprAST *> > VarNames;
 
   // At least one variable name is required.
   if (CurTok != tok_identifier)
     return Error("expected identifier after var");
-  
+
   while (1) {
     std::string Name = IdentifierStr;
-    getNextToken();  // eat identifier.
+    getNextToken(); // eat identifier.
 
     // Read the optional initializer.
     ExprAST *Init = 0;
     if (CurTok == '=') {
       getNextToken(); // eat the '='.
-      
+
       Init = ParseExpression();
-      if (Init == 0) return 0;
+      if (Init == 0)
+        return 0;
     }
-    
+
     VarNames.push_back(std::make_pair(Name, Init));
-    
+
     // End of var list, exit loop.
-    if (CurTok != ',') break;
+    if (CurTok != ',')
+      break;
     getNextToken(); // eat the ','.
-    
+
     if (CurTok != tok_identifier)
       return Error("expected identifier list after var");
   }
-  
+
   // At this point, we have to have 'in'.
   if (CurTok != tok_in)
     return Error("expected 'in' keyword after 'var'");
-  getNextToken();  // eat 'in'.
-  
+  getNextToken(); // eat 'in'.
+
   ExprAST *Body = ParseExpression();
-  if (Body == 0) return 0;
-  
+  if (Body == 0)
+    return 0;
+
   return new VarExprAST(VarNames, Body);
 }
 
@@ -439,13 +486,20 @@ static ExprAST *ParseVarExpr() {
 ///   ::= varexpr
 static ExprAST *ParsePrimary() {
   switch (CurTok) {
-  default: return Error("unknown token when expecting an expression");
-  case tok_identifier: return ParseIdentifierExpr();
-  case tok_number:     return ParseNumberExpr();
-  case '(':            return ParseParenExpr();
-  case tok_if:         return ParseIfExpr();
-  case tok_for:        return ParseForExpr();
-  case tok_var:        return ParseVarExpr();
+  default:
+    return Error("unknown token when expecting an expression");
+  case tok_identifier:
+    return ParseIdentifierExpr();
+  case tok_number:
+    return ParseNumberExpr();
+  case '(':
+    return ParseParenExpr();
+  case tok_if:
+    return ParseIfExpr();
+  case tok_for:
+    return ParseForExpr();
+  case tok_var:
+    return ParseVarExpr();
   }
 }
 
@@ -456,7 +510,7 @@ static ExprAST *ParseUnary() {
   // If the current token is not an operator, it must be a primary expr.
   if (!isascii(CurTok) || CurTok == '(' || CurTok == ',')
     return ParsePrimary();
-  
+
   // If this is a unary operator, read it.
   int Opc = CurTok;
   getNextToken();
@@ -471,28 +525,30 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
   // If this is a binop, find its precedence.
   while (1) {
     int TokPrec = GetTokPrecedence();
-    
+
     // If this is a binop that binds at least as tightly as the current binop,
     // consume it, otherwise we are done.
     if (TokPrec < ExprPrec)
       return LHS;
-    
+
     // Okay, we know this is a binop.
     int BinOp = CurTok;
-    getNextToken();  // eat binop
-    
+    getNextToken(); // eat binop
+
     // Parse the unary expression after the binary operator.
     ExprAST *RHS = ParseUnary();
-    if (!RHS) return 0;
-    
+    if (!RHS)
+      return 0;
+
     // If BinOp binds less tightly with RHS than the operator after RHS, let
     // the pending operator take RHS as its LHS.
     int NextPrec = GetTokPrecedence();
     if (TokPrec < NextPrec) {
-      RHS = ParseBinOpRHS(TokPrec+1, RHS);
-      if (RHS == 0) return 0;
+      RHS = ParseBinOpRHS(TokPrec + 1, RHS);
+      if (RHS == 0)
+        return 0;
     }
-    
+
     // Merge LHS/RHS.
     LHS = new BinaryExprAST(BinOp, LHS, RHS);
   }
@@ -503,8 +559,9 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
 ///
 static ExprAST *ParseExpression() {
   ExprAST *LHS = ParseUnary();
-  if (!LHS) return 0;
-  
+  if (!LHS)
+    return 0;
+
   return ParseBinOpRHS(0, LHS);
 }
 
@@ -514,10 +571,10 @@ static ExprAST *ParseExpression() {
 ///   ::= unary LETTER (id)
 static PrototypeAST *ParsePrototype() {
   std::string FnName;
-  
+
   unsigned Kind = 0; // 0 = identifier, 1 = unary, 2 = binary.
   unsigned BinaryPrecedence = 30;
-  
+
   switch (CurTok) {
   default:
     return ErrorP("Expected function name in prototype");
@@ -543,7 +600,7 @@ static PrototypeAST *ParsePrototype() {
     FnName += (char)CurTok;
     Kind = 2;
     getNextToken();
-    
+
     // Read the precedence if present.
     if (CurTok == tok_number) {
       if (NumVal < 1 || NumVal > 100)
@@ -553,31 +610,32 @@ static PrototypeAST *ParsePrototype() {
     }
     break;
   }
-  
+
   if (CurTok != '(')
     return ErrorP("Expected '(' in prototype");
-  
+
   std::vector<std::string> ArgNames;
   while (getNextToken() == tok_identifier)
     ArgNames.push_back(IdentifierStr);
   if (CurTok != ')')
     return ErrorP("Expected ')' in prototype");
-  
+
   // success.
-  getNextToken();  // eat ')'.
-  
+  getNextToken(); // eat ')'.
+
   // Verify right number of names for operator.
   if (Kind && ArgNames.size() != Kind)
     return ErrorP("Invalid number of operands for operator");
-  
+
   return new PrototypeAST(FnName, ArgNames, Kind != 0, BinaryPrecedence);
 }
 
 /// definition ::= 'def' prototype expression
 static FunctionAST *ParseDefinition() {
-  getNextToken();  // eat def.
+  getNextToken(); // eat def.
   PrototypeAST *Proto = ParsePrototype();
-  if (Proto == 0) return 0;
+  if (Proto == 0)
+    return 0;
 
   if (ExprAST *E = ParseExpression())
     return new FunctionAST(Proto, E);
@@ -596,7 +654,7 @@ static FunctionAST *ParseTopLevelExpr() {
 
 /// external ::= 'extern' prototype
 static PrototypeAST *ParseExtern() {
-  getNextToken();  // eat extern.
+  getNextToken(); // eat extern.
   return ParsePrototype();
 }
 
@@ -606,17 +664,20 @@ static PrototypeAST *ParseExtern() {
 
 static Module *TheModule;
 static IRBuilder<> Builder(getGlobalContext());
-static std::map<std::string, AllocaInst*> NamedValues;
-static FunctionPassManager *TheFPM;
+static std::map<std::string, AllocaInst *> NamedValues;
+static legacy::FunctionPassManager *TheFPM;
 
-Value *ErrorV(const char *Str) { Error(Str); return 0; }
+Value *ErrorV(const char *Str) {
+  Error(Str);
+  return 0;
+}
 
 /// CreateEntryBlockAlloca - Create an alloca instruction in the entry block of
 /// the function.  This is used for mutable variables etc.
 static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction,
                                           const std::string &VarName) {
   IRBuilder<> TmpB(&TheFunction->getEntryBlock(),
-                 TheFunction->getEntryBlock().begin());
+                   TheFunction->getEntryBlock().begin());
   return TmpB.CreateAlloca(Type::getDoubleTy(getGlobalContext()), 0,
                            VarName.c_str());
 }
@@ -628,7 +689,8 @@ Value *NumberExprAST::Codegen() {
 Value *VariableExprAST::Codegen() {
   // Look this variable up in the function.
   Value *V = NamedValues[Name];
-  if (V == 0) return ErrorV("Unknown variable name");
+  if (V == 0)
+    return ErrorV("Unknown variable name");
 
   // Load the value.
   return Builder.CreateLoad(V, Name.c_str());
@@ -636,12 +698,13 @@ Value *VariableExprAST::Codegen() {
 
 Value *UnaryExprAST::Codegen() {
   Value *OperandV = Operand->Codegen();
-  if (OperandV == 0) return 0;
-  
-  Function *F = TheModule->getFunction(std::string("unary")+Opcode);
+  if (OperandV == 0)
+    return 0;
+
+  Function *F = TheModule->getFunction(std::string("unary") + Opcode);
   if (F == 0)
     return ErrorV("Unknown unary operator");
-  
+
   return Builder.CreateCall(F, OperandV, "unop");
 }
 
@@ -649,42 +712,49 @@ Value *BinaryExprAST::Codegen() {
   // Special case '=' because we don't want to emit the LHS as an expression.
   if (Op == '=') {
     // Assignment requires the LHS to be an identifier.
-    VariableExprAST *LHSE = dynamic_cast<VariableExprAST*>(LHS);
+    VariableExprAST *LHSE = dynamic_cast<VariableExprAST *>(LHS);
     if (!LHSE)
       return ErrorV("destination of '=' must be a variable");
     // Codegen the RHS.
     Value *Val = RHS->Codegen();
-    if (Val == 0) return 0;
+    if (Val == 0)
+      return 0;
 
     // Look up the name.
     Value *Variable = NamedValues[LHSE->getName()];
-    if (Variable == 0) return ErrorV("Unknown variable name");
+    if (Variable == 0)
+      return ErrorV("Unknown variable name");
 
     Builder.CreateStore(Val, Variable);
     return Val;
   }
-  
+
   Value *L = LHS->Codegen();
   Value *R = RHS->Codegen();
-  if (L == 0 || R == 0) return 0;
-  
+  if (L == 0 || R == 0)
+    return 0;
+
   switch (Op) {
-  case '+': return Builder.CreateFAdd(L, R, "addtmp");
-  case '-': return Builder.CreateFSub(L, R, "subtmp");
-  case '*': return Builder.CreateFMul(L, R, "multmp");    
+  case '+':
+    return Builder.CreateFAdd(L, R, "addtmp");
+  case '-':
+    return Builder.CreateFSub(L, R, "subtmp");
+  case '*':
+    return Builder.CreateFMul(L, R, "multmp");
   case '<':
     L = Builder.CreateFCmpULT(L, R, "cmptmp");
     // Convert bool 0/1 to double 0.0 or 1.0
     return Builder.CreateUIToFP(L, Type::getDoubleTy(getGlobalContext()),
                                 "booltmp");
-  default: break;
+  default:
+    break;
   }
-  
+
   // If it wasn't a builtin binary operator, it must be a user defined one. Emit
   // a call to it.
-  Function *F = TheModule->getFunction(std::string("binary")+Op);
+  Function *F = TheModule->getFunction(std::string("binary") + Op);
   assert(F && "binary operator not found!");
-  
+
   Value *Ops[] = { L, R };
   return Builder.CreateCall(F, Ops, "binop");
 }
@@ -694,66 +764,70 @@ Value *CallExprAST::Codegen() {
   Function *CalleeF = TheModule->getFunction(Callee);
   if (CalleeF == 0)
     return ErrorV("Unknown function referenced");
-  
+
   // If argument mismatch error.
   if (CalleeF->arg_size() != Args.size())
     return ErrorV("Incorrect # arguments passed");
 
-  std::vector<Value*> ArgsV;
+  std::vector<Value *> ArgsV;
   for (unsigned i = 0, e = Args.size(); i != e; ++i) {
     ArgsV.push_back(Args[i]->Codegen());
-    if (ArgsV.back() == 0) return 0;
+    if (ArgsV.back() == 0)
+      return 0;
   }
-  
+
   return Builder.CreateCall(CalleeF, ArgsV, "calltmp");
 }
 
 Value *IfExprAST::Codegen() {
   Value *CondV = Cond->Codegen();
-  if (CondV == 0) return 0;
-  
+  if (CondV == 0)
+    return 0;
+
   // Convert condition to a bool by comparing equal to 0.0.
-  CondV = Builder.CreateFCmpONE(CondV, 
-                              ConstantFP::get(getGlobalContext(), APFloat(0.0)),
-                                "ifcond");
-  
+  CondV = Builder.CreateFCmpONE(
+      CondV, ConstantFP::get(getGlobalContext(), APFloat(0.0)), "ifcond");
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
-  
+
   // Create blocks for the then and else cases.  Insert the 'then' block at the
   // end of the function.
-  BasicBlock *ThenBB = BasicBlock::Create(getGlobalContext(), "then", TheFunction);
+  BasicBlock *ThenBB =
+      BasicBlock::Create(getGlobalContext(), "then", TheFunction);
   BasicBlock *ElseBB = BasicBlock::Create(getGlobalContext(), "else");
   BasicBlock *MergeBB = BasicBlock::Create(getGlobalContext(), "ifcont");
-  
+
   Builder.CreateCondBr(CondV, ThenBB, ElseBB);
-  
+
   // Emit then value.
   Builder.SetInsertPoint(ThenBB);
-  
+
   Value *ThenV = Then->Codegen();
-  if (ThenV == 0) return 0;
-  
+  if (ThenV == 0)
+    return 0;
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
   ThenBB = Builder.GetInsertBlock();
-  
+
   // Emit else block.
   TheFunction->getBasicBlockList().push_back(ElseBB);
   Builder.SetInsertPoint(ElseBB);
-  
+
   Value *ElseV = Else->Codegen();
-  if (ElseV == 0) return 0;
-  
+  if (ElseV == 0)
+    return 0;
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
   ElseBB = Builder.GetInsertBlock();
-  
+
   // Emit merge block.
   TheFunction->getBasicBlockList().push_back(MergeBB);
   Builder.SetInsertPoint(MergeBB);
-  PHINode *PN = Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2,
-                                  "iftmp");
-  
+  PHINode *PN =
+      Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2, "iftmp");
+
   PN->addIncoming(ThenV, ThenBB);
   PN->addIncoming(ElseV, ElseBB);
   return PN;
@@ -766,7 +840,7 @@ Value *ForExprAST::Codegen() {
   //   start = startexpr
   //   store start -> var
   //   goto loop
-  // loop: 
+  // loop:
   //   ...
   //   bodyexpr
   //   ...
@@ -779,95 +853,98 @@ Value *ForExprAST::Codegen() {
   //   store nextvar -> var
   //   br endcond, loop, endloop
   // outloop:
-  
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
 
   // Create an alloca for the variable in the entry block.
   AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
-  
+
   // Emit the start code first, without 'variable' in scope.
   Value *StartVal = Start->Codegen();
-  if (StartVal == 0) return 0;
-  
+  if (StartVal == 0)
+    return 0;
+
   // Store the value into the alloca.
   Builder.CreateStore(StartVal, Alloca);
-  
+
   // Make the new basic block for the loop header, inserting after current
   // block.
-  BasicBlock *LoopBB = BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
-  
+  BasicBlock *LoopBB =
+      BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
+
   // Insert an explicit fall through from the current block to the LoopBB.
   Builder.CreateBr(LoopBB);
 
   // Start insertion in LoopBB.
   Builder.SetInsertPoint(LoopBB);
-  
+
   // Within the loop, the variable is defined equal to the PHI node.  If it
   // shadows an existing variable, we have to restore it, so save it now.
   AllocaInst *OldVal = NamedValues[VarName];
   NamedValues[VarName] = Alloca;
-  
+
   // Emit the body of the loop.  This, like any other expr, can change the
   // current BB.  Note that we ignore the value computed by the body, but don't
   // allow an error.
   if (Body->Codegen() == 0)
     return 0;
-  
+
   // Emit the step value.
   Value *StepVal;
   if (Step) {
     StepVal = Step->Codegen();
-    if (StepVal == 0) return 0;
+    if (StepVal == 0)
+      return 0;
   } else {
     // If not specified, use 1.0.
     StepVal = ConstantFP::get(getGlobalContext(), APFloat(1.0));
   }
-  
+
   // Compute the end condition.
   Value *EndCond = End->Codegen();
-  if (EndCond == 0) return EndCond;
-  
+  if (EndCond == 0)
+    return EndCond;
+
   // Reload, increment, and restore the alloca.  This handles the case where
   // the body of the loop mutates the variable.
   Value *CurVar = Builder.CreateLoad(Alloca, VarName.c_str());
   Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar");
   Builder.CreateStore(NextVar, Alloca);
-  
+
   // Convert condition to a bool by comparing equal to 0.0.
-  EndCond = Builder.CreateFCmpONE(EndCond, 
-                              ConstantFP::get(getGlobalContext(), APFloat(0.0)),
-                                  "loopcond");
-  
+  EndCond = Builder.CreateFCmpONE(
+      EndCond, ConstantFP::get(getGlobalContext(), APFloat(0.0)), "loopcond");
+
   // Create the "after loop" block and insert it.
-  BasicBlock *AfterBB = BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
-  
+  BasicBlock *AfterBB =
+      BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
+
   // Insert the conditional branch into the end of LoopEndBB.
   Builder.CreateCondBr(EndCond, LoopBB, AfterBB);
-  
+
   // Any new code will be inserted in AfterBB.
   Builder.SetInsertPoint(AfterBB);
-  
+
   // Restore the unshadowed variable.
   if (OldVal)
     NamedValues[VarName] = OldVal;
   else
     NamedValues.erase(VarName);
 
-  
   // for expr always returns 0.0.
   return Constant::getNullValue(Type::getDoubleTy(getGlobalContext()));
 }
 
 Value *VarExprAST::Codegen() {
   std::vector<AllocaInst *> OldBindings;
-  
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
 
   // Register all variables and emit their initializer.
   for (unsigned i = 0, e = VarNames.size(); i != e; ++i) {
     const std::string &VarName = VarNames[i].first;
     ExprAST *Init = VarNames[i].second;
-    
+
     // Emit the initializer before adding the variable to scope, this prevents
     // the initializer from referencing the variable itself, and permits stuff
     // like this:
@@ -876,26 +953,28 @@ Value *VarExprAST::Codegen() {
     Value *InitVal;
     if (Init) {
       InitVal = Init->Codegen();
-      if (InitVal == 0) return 0;
+      if (InitVal == 0)
+        return 0;
     } else { // If not specified, use 0.0.
       InitVal = ConstantFP::get(getGlobalContext(), APFloat(0.0));
     }
-    
+
     AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
     Builder.CreateStore(InitVal, Alloca);
 
     // Remember the old variable binding so that we can restore the binding when
     // we unrecurse.
     OldBindings.push_back(NamedValues[VarName]);
-    
+
     // Remember this binding.
     NamedValues[VarName] = Alloca;
   }
-  
+
   // Codegen the body, now that all vars are in scope.
   Value *BodyVal = Body->Codegen();
-  if (BodyVal == 0) return 0;
-  
+  if (BodyVal == 0)
+    return 0;
+
   // Pop all our variables from scope.
   for (unsigned i = 0, e = VarNames.size(); i != e; ++i)
     NamedValues[VarNames[i].first] = OldBindings[i];
@@ -906,39 +985,40 @@ Value *VarExprAST::Codegen() {
 
 Function *PrototypeAST::Codegen() {
   // Make the function type:  double(double,double) etc.
-  std::vector<Type*> Doubles(Args.size(), 
-                             Type::getDoubleTy(getGlobalContext()));
-  FunctionType *FT = FunctionType::get(Type::getDoubleTy(getGlobalContext()),
-                                       Doubles, false);
-  
-  Function *F = Function::Create(FT, Function::ExternalLinkage, Name, TheModule);
-  
+  std::vector<Type *> Doubles(Args.size(),
+                              Type::getDoubleTy(getGlobalContext()));
+  FunctionType *FT =
+      FunctionType::get(Type::getDoubleTy(getGlobalContext()), Doubles, false);
+
+  Function *F =
+      Function::Create(FT, Function::ExternalLinkage, Name, TheModule);
+
   // If F conflicted, there was already something named 'Name'.  If it has a
   // body, don't allow redefinition or reextern.
   if (F->getName() != Name) {
     // Delete the one we just made and get the existing one.
     F->eraseFromParent();
     F = TheModule->getFunction(Name);
-    
+
     // If F already has a body, reject this.
     if (!F->empty()) {
       ErrorF("redefinition of function");
       return 0;
     }
-    
+
     // If F took a different number of args, reject.
     if (F->arg_size() != Args.size()) {
       ErrorF("redefinition of function with different # args");
       return 0;
     }
   }
-  
+
   // Set names for all arguments.
   unsigned Idx = 0;
   for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
        ++AI, ++Idx)
     AI->setName(Args[Idx]);
-    
+
   return F;
 }
 
@@ -960,19 +1040,19 @@ void PrototypeAST::CreateArgumentAllocas(Function *F) {
 
 Function *FunctionAST::Codegen() {
   NamedValues.clear();
-  
+
   Function *TheFunction = Proto->Codegen();
   if (TheFunction == 0)
     return 0;
-  
+
   // If this is an operator, install it.
   if (Proto->isBinaryOp())
     BinopPrecedence[Proto->getOperatorName()] = Proto->getBinaryPrecedence();
-  
+
   // Create a new basic block to start insertion into.
   BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
   Builder.SetInsertPoint(BB);
-  
+
   // Add all arguments to the symbol table and create their allocas.
   Proto->CreateArgumentAllocas(TheFunction);
 
@@ -985,10 +1065,10 @@ Function *FunctionAST::Codegen() {
 
     // Optimize the function.
     TheFPM->run(*TheFunction);
-    
+
     return TheFunction;
   }
-  
+
   // Error reading body, remove function.
   TheFunction->eraseFromParent();
 
@@ -1031,9 +1111,10 @@ static void HandleTopLevelExpression() {
   // Evaluate a top-level expression into an anonymous function.
   if (FunctionAST *F = ParseTopLevelExpr()) {
     if (Function *LF = F->Codegen()) {
+      TheExecutionEngine->finalizeObject();
       // JIT the function, returning a function pointer.
       void *FPtr = TheExecutionEngine->getPointerToFunction(LF);
-      
+
       // Cast it to the right type (takes no arguments, returns a double) so we
       // can call it as a native function.
       double (*FP)() = (double (*)())(intptr_t)FPtr;
@@ -1050,11 +1131,20 @@ static void MainLoop() {
   while (1) {
     fprintf(stderr, "ready> ");
     switch (CurTok) {
-    case tok_eof:    return;
-    case ';':        getNextToken(); break;  // ignore top-level semicolons.
-    case tok_def:    HandleDefinition(); break;
-    case tok_extern: HandleExtern(); break;
-    default:         HandleTopLevelExpression(); break;
+    case tok_eof:
+      return;
+    case ';':
+      getNextToken();
+      break; // ignore top-level semicolons.
+    case tok_def:
+      HandleDefinition();
+      break;
+    case tok_extern:
+      HandleExtern();
+      break;
+    default:
+      HandleTopLevelExpression();
+      break;
     }
   }
 }
@@ -1064,15 +1154,13 @@ static void MainLoop() {
 //===----------------------------------------------------------------------===//
 
 /// putchard - putchar that takes a double and returns 0.
-extern "C" 
-double putchard(double X) {
+extern "C" double putchard(double X) {
   putchar((char)X);
   return 0;
 }
 
 /// printd - printf that takes a double prints it as "%f\n", returning 0.
-extern "C" 
-double printd(double X) {
+extern "C" double printd(double X) {
   printf("%f\n", X);
   return 0;
 }
@@ -1083,6 +1171,8 @@ double printd(double X) {
 
 int main() {
   InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  InitializeNativeTargetAsmParser();
   LLVMContext &Context = getGlobalContext();
 
   // Install standard binary operators.
@@ -1091,7 +1181,7 @@ int main() {
   BinopPrecedence['<'] = 10;
   BinopPrecedence['+'] = 20;
   BinopPrecedence['-'] = 20;
-  BinopPrecedence['*'] = 40;  // highest.
+  BinopPrecedence['*'] = 40; // highest.
 
   // Prime the first token.
   fprintf(stderr, "ready> ");
@@ -1104,13 +1194,16 @@ int main() {
   // Create the JIT.  This takes ownership of the module.
   std::string ErrStr;
   TheExecutionEngine =
-      EngineBuilder(std::move(Owner)).setErrorStr(&ErrStr).create();
+      EngineBuilder(std::move(Owner))
+          .setErrorStr(&ErrStr)
+          .setMCJITMemoryManager(llvm::make_unique<SectionMemoryManager>())
+          .create();
   if (!TheExecutionEngine) {
     fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
     exit(1);
   }
 
-  FunctionPassManager OurFPM(TheModule);
+  legacy::FunctionPassManager OurFPM(TheModule);
 
   // Set up the optimizer pipeline.  Start with registering info about how the
   // target lays out data structures.
diff --git a/examples/Kaleidoscope/Chapter8/CMakeLists.txt b/examples/Kaleidoscope/Chapter8/CMakeLists.txt
new file mode 100644
index 0000000..42882e9
--- /dev/null
+++ b/examples/Kaleidoscope/Chapter8/CMakeLists.txt
@@ -0,0 +1,13 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  ExecutionEngine
+  MCJIT
+  Support
+  native
+  )
+
+set(LLVM_REQUIRES_RTTI 1)
+
+add_kaleidoscope_chapter(Kaleidoscope-Ch8
+  toy.cpp
+  )
diff --git a/examples/Kaleidoscope/Chapter8/Makefile b/examples/Kaleidoscope/Chapter8/Makefile
new file mode 100644
index 0000000..8e4d422
--- /dev/null
+++ b/examples/Kaleidoscope/Chapter8/Makefile
@@ -0,0 +1,16 @@
+##===- examples/Kaleidoscope/Chapter7/Makefile -------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+TOOLNAME = Kaleidoscope-Ch8
+EXAMPLE_TOOL = 1
+REQUIRES_RTTI := 1
+
+LINK_COMPONENTS := core mcjit native
+
+include $(LEVEL)/Makefile.common
diff --git a/examples/Kaleidoscope/Chapter8/toy.cpp b/examples/Kaleidoscope/Chapter8/toy.cpp
new file mode 100644
index 0000000..30d4669
--- /dev/null
+++ b/examples/Kaleidoscope/Chapter8/toy.cpp
@@ -0,0 +1,1494 @@
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/MCJIT.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Transforms/Scalar.h"
+#include <cctype>
+#include <cstdio>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Lexer
+//===----------------------------------------------------------------------===//
+
+// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
+// of these for known things.
+enum Token {
+  tok_eof = -1,
+
+  // commands
+  tok_def = -2,
+  tok_extern = -3,
+
+  // primary
+  tok_identifier = -4,
+  tok_number = -5,
+
+  // control
+  tok_if = -6,
+  tok_then = -7,
+  tok_else = -8,
+  tok_for = -9,
+  tok_in = -10,
+
+  // operators
+  tok_binary = -11,
+  tok_unary = -12,
+
+  // var definition
+  tok_var = -13
+};
+
+std::string getTokName(int Tok) {
+  switch (Tok) {
+  case tok_eof:
+    return "eof";
+  case tok_def:
+    return "def";
+  case tok_extern:
+    return "extern";
+  case tok_identifier:
+    return "identifier";
+  case tok_number:
+    return "number";
+  case tok_if:
+    return "if";
+  case tok_then:
+    return "then";
+  case tok_else:
+    return "else";
+  case tok_for:
+    return "for";
+  case tok_in:
+    return "in";
+  case tok_binary:
+    return "binary";
+  case tok_unary:
+    return "unary";
+  case tok_var:
+    return "var";
+  }
+  return std::string(1, (char)Tok);
+}
+
+namespace {
+class PrototypeAST;
+class ExprAST;
+}
+static IRBuilder<> Builder(getGlobalContext());
+struct DebugInfo {
+  DICompileUnit TheCU;
+  DIType DblTy;
+  std::vector<DIScope *> LexicalBlocks;
+  std::map<const PrototypeAST *, DIScope> FnScopeMap;
+
+  void emitLocation(ExprAST *AST);
+  DIType getDoubleTy();
+} KSDbgInfo;
+
+static std::string IdentifierStr; // Filled in if tok_identifier
+static double NumVal;             // Filled in if tok_number
+struct SourceLocation {
+  int Line;
+  int Col;
+};
+static SourceLocation CurLoc;
+static SourceLocation LexLoc = { 1, 0 };
+
+static int advance() {
+  int LastChar = getchar();
+
+  if (LastChar == '\n' || LastChar == '\r') {
+    LexLoc.Line++;
+    LexLoc.Col = 0;
+  } else
+    LexLoc.Col++;
+  return LastChar;
+}
+
+/// gettok - Return the next token from standard input.
+static int gettok() {
+  static int LastChar = ' ';
+
+  // Skip any whitespace.
+  while (isspace(LastChar))
+    LastChar = advance();
+
+  CurLoc = LexLoc;
+
+  if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]*
+    IdentifierStr = LastChar;
+    while (isalnum((LastChar = advance())))
+      IdentifierStr += LastChar;
+
+    if (IdentifierStr == "def")
+      return tok_def;
+    if (IdentifierStr == "extern")
+      return tok_extern;
+    if (IdentifierStr == "if")
+      return tok_if;
+    if (IdentifierStr == "then")
+      return tok_then;
+    if (IdentifierStr == "else")
+      return tok_else;
+    if (IdentifierStr == "for")
+      return tok_for;
+    if (IdentifierStr == "in")
+      return tok_in;
+    if (IdentifierStr == "binary")
+      return tok_binary;
+    if (IdentifierStr == "unary")
+      return tok_unary;
+    if (IdentifierStr == "var")
+      return tok_var;
+    return tok_identifier;
+  }
+
+  if (isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+
+    std::string NumStr;
+    do {
+      NumStr += LastChar;
+      LastChar = advance();
+    } while (isdigit(LastChar) || LastChar == '.');
+
+    NumVal = strtod(NumStr.c_str(), 0);
+    return tok_number;
+  }
+
+  if (LastChar == '#') {
+    // Comment until end of line.
+    do
+      LastChar = advance();
+    while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
+
+    if (LastChar != EOF)
+      return gettok();
+  }
+
+  // Check for end of file.  Don't eat the EOF.
+  if (LastChar == EOF)
+    return tok_eof;
+
+  // Otherwise, just return the character as its ascii value.
+  int ThisChar = LastChar;
+  LastChar = advance();
+  return ThisChar;
+}
+
+//===----------------------------------------------------------------------===//
+// Abstract Syntax Tree (aka Parse Tree)
+//===----------------------------------------------------------------------===//
+namespace {
+
+std::ostream &indent(std::ostream &O, int size) {
+  return O << std::string(size, ' ');
+}
+
+/// ExprAST - Base class for all expression nodes.
+class ExprAST {
+  SourceLocation Loc;
+
+public:
+  int getLine() const { return Loc.Line; }
+  int getCol() const { return Loc.Col; }
+  ExprAST(SourceLocation Loc = CurLoc) : Loc(Loc) {}
+  virtual std::ostream &dump(std::ostream &out, int ind) {
+    return out << ':' << getLine() << ':' << getCol() << '\n';
+  }
+  virtual ~ExprAST() {}
+  virtual Value *Codegen() = 0;
+};
+
+/// NumberExprAST - Expression class for numeric literals like "1.0".
+class NumberExprAST : public ExprAST {
+  double Val;
+
+public:
+  NumberExprAST(double val) : Val(val) {}
+  virtual std::ostream &dump(std::ostream &out, int ind) {
+    return ExprAST::dump(out << Val, ind);
+  }
+  virtual Value *Codegen();
+};
+
+/// VariableExprAST - Expression class for referencing a variable, like "a".
+class VariableExprAST : public ExprAST {
+  std::string Name;
+
+public:
+  VariableExprAST(SourceLocation Loc, const std::string &name)
+      : ExprAST(Loc), Name(name) {}
+  const std::string &getName() const { return Name; }
+  virtual std::ostream &dump(std::ostream &out, int ind) {
+    return ExprAST::dump(out << Name, ind);
+  }
+  virtual Value *Codegen();
+};
+
+/// UnaryExprAST - Expression class for a unary operator.
+class UnaryExprAST : public ExprAST {
+  char Opcode;
+  ExprAST *Operand;
+
+public:
+  UnaryExprAST(char opcode, ExprAST *operand)
+      : Opcode(opcode), Operand(operand) {}
+  virtual std::ostream &dump(std::ostream &out, int ind) {
+    ExprAST::dump(out << "unary" << Opcode, ind);
+    Operand->dump(out, ind + 1);
+    return out;
+  }
+  virtual Value *Codegen();
+};
+
+/// BinaryExprAST - Expression class for a binary operator.
+class BinaryExprAST : public ExprAST {
+  char Op;
+  ExprAST *LHS, *RHS;
+
+public:
+  BinaryExprAST(SourceLocation Loc, char op, ExprAST *lhs, ExprAST *rhs)
+      : ExprAST(Loc), Op(op), LHS(lhs), RHS(rhs) {}
+  virtual std::ostream &dump(std::ostream &out, int ind) {
+    ExprAST::dump(out << "binary" << Op, ind);
+    LHS->dump(indent(out, ind) << "LHS:", ind + 1);
+    RHS->dump(indent(out, ind) << "RHS:", ind + 1);
+    return out;
+  }
+  virtual Value *Codegen();
+};
+
+/// CallExprAST - Expression class for function calls.
+class CallExprAST : public ExprAST {
+  std::string Callee;
+  std::vector<ExprAST *> Args;
+
+public:
+  CallExprAST(SourceLocation Loc, const std::string &callee,
+              std::vector<ExprAST *> &args)
+      : ExprAST(Loc), Callee(callee), Args(args) {}
+  virtual std::ostream &dump(std::ostream &out, int ind) {
+    ExprAST::dump(out << "call " << Callee, ind);
+    for (ExprAST *Arg : Args)
+      Arg->dump(indent(out, ind + 1), ind + 1);
+    return out;
+  }
+  virtual Value *Codegen();
+};
+
+/// IfExprAST - Expression class for if/then/else.
+class IfExprAST : public ExprAST {
+  ExprAST *Cond, *Then, *Else;
+
+public:
+  IfExprAST(SourceLocation Loc, ExprAST *cond, ExprAST *then, ExprAST *_else)
+      : ExprAST(Loc), Cond(cond), Then(then), Else(_else) {}
+  virtual std::ostream &dump(std::ostream &out, int ind) {
+    ExprAST::dump(out << "if", ind);
+    Cond->dump(indent(out, ind) << "Cond:", ind + 1);
+    Then->dump(indent(out, ind) << "Then:", ind + 1);
+    Else->dump(indent(out, ind) << "Else:", ind + 1);
+    return out;
+  }
+  virtual Value *Codegen();
+};
+
+/// ForExprAST - Expression class for for/in.
+class ForExprAST : public ExprAST {
+  std::string VarName;
+  ExprAST *Start, *End, *Step, *Body;
+
+public:
+  ForExprAST(const std::string &varname, ExprAST *start, ExprAST *end,
+             ExprAST *step, ExprAST *body)
+      : VarName(varname), Start(start), End(end), Step(step), Body(body) {}
+  virtual std::ostream &dump(std::ostream &out, int ind) {
+    ExprAST::dump(out << "for", ind);
+    Start->dump(indent(out, ind) << "Cond:", ind + 1);
+    End->dump(indent(out, ind) << "End:", ind + 1);
+    Step->dump(indent(out, ind) << "Step:", ind + 1);
+    Body->dump(indent(out, ind) << "Body:", ind + 1);
+    return out;
+  }
+  virtual Value *Codegen();
+};
+
+/// VarExprAST - Expression class for var/in
+class VarExprAST : public ExprAST {
+  std::vector<std::pair<std::string, ExprAST *> > VarNames;
+  ExprAST *Body;
+
+public:
+  VarExprAST(const std::vector<std::pair<std::string, ExprAST *> > &varnames,
+             ExprAST *body)
+      : VarNames(varnames), Body(body) {}
+
+  virtual std::ostream &dump(std::ostream &out, int ind) {
+    ExprAST::dump(out << "var", ind);
+    for (const auto &NamedVar : VarNames)
+      NamedVar.second->dump(indent(out, ind) << NamedVar.first << ':', ind + 1);
+    Body->dump(indent(out, ind) << "Body:", ind + 1);
+    return out;
+  }
+  virtual Value *Codegen();
+};
+
+/// PrototypeAST - This class represents the "prototype" for a function,
+/// which captures its argument names as well as if it is an operator.
+class PrototypeAST {
+  std::string Name;
+  std::vector<std::string> Args;
+  bool isOperator;
+  unsigned Precedence; // Precedence if a binary op.
+  int Line;
+
+public:
+  PrototypeAST(SourceLocation Loc, const std::string &name,
+               const std::vector<std::string> &args, bool isoperator = false,
+               unsigned prec = 0)
+      : Name(name), Args(args), isOperator(isoperator), Precedence(prec),
+        Line(Loc.Line) {}
+
+  bool isUnaryOp() const { return isOperator && Args.size() == 1; }
+  bool isBinaryOp() const { return isOperator && Args.size() == 2; }
+
+  char getOperatorName() const {
+    assert(isUnaryOp() || isBinaryOp());
+    return Name[Name.size() - 1];
+  }
+
+  unsigned getBinaryPrecedence() const { return Precedence; }
+
+  Function *Codegen();
+
+  void CreateArgumentAllocas(Function *F);
+  const std::vector<std::string> &getArgs() const { return Args; }
+};
+
+/// FunctionAST - This class represents a function definition itself.
+class FunctionAST {
+  PrototypeAST *Proto;
+  ExprAST *Body;
+
+public:
+  FunctionAST(PrototypeAST *proto, ExprAST *body) : Proto(proto), Body(body) {}
+
+  std::ostream &dump(std::ostream &out, int ind) {
+    indent(out, ind) << "FunctionAST\n";
+    ++ind;
+    indent(out, ind) << "Body:";
+    return Body ? Body->dump(out, ind) : out << "null\n";
+  }
+
+  Function *Codegen();
+};
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Parser
+//===----------------------------------------------------------------------===//
+
+/// CurTok/getNextToken - Provide a simple token buffer.  CurTok is the current
+/// token the parser is looking at.  getNextToken reads another token from the
+/// lexer and updates CurTok with its results.
+static int CurTok;
+static int getNextToken() { return CurTok = gettok(); }
+
+/// BinopPrecedence - This holds the precedence for each binary operator that is
+/// defined.
+static std::map<char, int> BinopPrecedence;
+
+/// GetTokPrecedence - Get the precedence of the pending binary operator token.
+static int GetTokPrecedence() {
+  if (!isascii(CurTok))
+    return -1;
+
+  // Make sure it's a declared binop.
+  int TokPrec = BinopPrecedence[CurTok];
+  if (TokPrec <= 0)
+    return -1;
+  return TokPrec;
+}
+
+/// Error* - These are little helper functions for error handling.
+ExprAST *Error(const char *Str) {
+  fprintf(stderr, "Error: %s\n", Str);
+  return 0;
+}
+PrototypeAST *ErrorP(const char *Str) {
+  Error(Str);
+  return 0;
+}
+FunctionAST *ErrorF(const char *Str) {
+  Error(Str);
+  return 0;
+}
+
+static ExprAST *ParseExpression();
+
+/// identifierexpr
+///   ::= identifier
+///   ::= identifier '(' expression* ')'
+static ExprAST *ParseIdentifierExpr() {
+  std::string IdName = IdentifierStr;
+
+  SourceLocation LitLoc = CurLoc;
+
+  getNextToken(); // eat identifier.
+
+  if (CurTok != '(') // Simple variable ref.
+    return new VariableExprAST(LitLoc, IdName);
+
+  // Call.
+  getNextToken(); // eat (
+  std::vector<ExprAST *> Args;
+  if (CurTok != ')') {
+    while (1) {
+      ExprAST *Arg = ParseExpression();
+      if (!Arg)
+        return 0;
+      Args.push_back(Arg);
+
+      if (CurTok == ')')
+        break;
+
+      if (CurTok != ',')
+        return Error("Expected ')' or ',' in argument list");
+      getNextToken();
+    }
+  }
+
+  // Eat the ')'.
+  getNextToken();
+
+  return new CallExprAST(LitLoc, IdName, Args);
+}
+
+/// numberexpr ::= number
+static ExprAST *ParseNumberExpr() {
+  ExprAST *Result = new NumberExprAST(NumVal);
+  getNextToken(); // consume the number
+  return Result;
+}
+
+/// parenexpr ::= '(' expression ')'
+static ExprAST *ParseParenExpr() {
+  getNextToken(); // eat (.
+  ExprAST *V = ParseExpression();
+  if (!V)
+    return 0;
+
+  if (CurTok != ')')
+    return Error("expected ')'");
+  getNextToken(); // eat ).
+  return V;
+}
+
+/// ifexpr ::= 'if' expression 'then' expression 'else' expression
+static ExprAST *ParseIfExpr() {
+  SourceLocation IfLoc = CurLoc;
+
+  getNextToken(); // eat the if.
+
+  // condition.
+  ExprAST *Cond = ParseExpression();
+  if (!Cond)
+    return 0;
+
+  if (CurTok != tok_then)
+    return Error("expected then");
+  getNextToken(); // eat the then
+
+  ExprAST *Then = ParseExpression();
+  if (Then == 0)
+    return 0;
+
+  if (CurTok != tok_else)
+    return Error("expected else");
+
+  getNextToken();
+
+  ExprAST *Else = ParseExpression();
+  if (!Else)
+    return 0;
+
+  return new IfExprAST(IfLoc, Cond, Then, Else);
+}
+
+/// forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression
+static ExprAST *ParseForExpr() {
+  getNextToken(); // eat the for.
+
+  if (CurTok != tok_identifier)
+    return Error("expected identifier after for");
+
+  std::string IdName = IdentifierStr;
+  getNextToken(); // eat identifier.
+
+  if (CurTok != '=')
+    return Error("expected '=' after for");
+  getNextToken(); // eat '='.
+
+  ExprAST *Start = ParseExpression();
+  if (Start == 0)
+    return 0;
+  if (CurTok != ',')
+    return Error("expected ',' after for start value");
+  getNextToken();
+
+  ExprAST *End = ParseExpression();
+  if (End == 0)
+    return 0;
+
+  // The step value is optional.
+  ExprAST *Step = 0;
+  if (CurTok == ',') {
+    getNextToken();
+    Step = ParseExpression();
+    if (Step == 0)
+      return 0;
+  }
+
+  if (CurTok != tok_in)
+    return Error("expected 'in' after for");
+  getNextToken(); // eat 'in'.
+
+  ExprAST *Body = ParseExpression();
+  if (Body == 0)
+    return 0;
+
+  return new ForExprAST(IdName, Start, End, Step, Body);
+}
+
+/// varexpr ::= 'var' identifier ('=' expression)?
+//                    (',' identifier ('=' expression)?)* 'in' expression
+static ExprAST *ParseVarExpr() {
+  getNextToken(); // eat the var.
+
+  std::vector<std::pair<std::string, ExprAST *> > VarNames;
+
+  // At least one variable name is required.
+  if (CurTok != tok_identifier)
+    return Error("expected identifier after var");
+
+  while (1) {
+    std::string Name = IdentifierStr;
+    getNextToken(); // eat identifier.
+
+    // Read the optional initializer.
+    ExprAST *Init = 0;
+    if (CurTok == '=') {
+      getNextToken(); // eat the '='.
+
+      Init = ParseExpression();
+      if (Init == 0)
+        return 0;
+    }
+
+    VarNames.push_back(std::make_pair(Name, Init));
+
+    // End of var list, exit loop.
+    if (CurTok != ',')
+      break;
+    getNextToken(); // eat the ','.
+
+    if (CurTok != tok_identifier)
+      return Error("expected identifier list after var");
+  }
+
+  // At this point, we have to have 'in'.
+  if (CurTok != tok_in)
+    return Error("expected 'in' keyword after 'var'");
+  getNextToken(); // eat 'in'.
+
+  ExprAST *Body = ParseExpression();
+  if (Body == 0)
+    return 0;
+
+  return new VarExprAST(VarNames, Body);
+}
+
+/// primary
+///   ::= identifierexpr
+///   ::= numberexpr
+///   ::= parenexpr
+///   ::= ifexpr
+///   ::= forexpr
+///   ::= varexpr
+static ExprAST *ParsePrimary() {
+  switch (CurTok) {
+  default:
+    return Error("unknown token when expecting an expression");
+  case tok_identifier:
+    return ParseIdentifierExpr();
+  case tok_number:
+    return ParseNumberExpr();
+  case '(':
+    return ParseParenExpr();
+  case tok_if:
+    return ParseIfExpr();
+  case tok_for:
+    return ParseForExpr();
+  case tok_var:
+    return ParseVarExpr();
+  }
+}
+
+/// unary
+///   ::= primary
+///   ::= '!' unary
+static ExprAST *ParseUnary() {
+  // If the current token is not an operator, it must be a primary expr.
+  if (!isascii(CurTok) || CurTok == '(' || CurTok == ',')
+    return ParsePrimary();
+
+  // If this is a unary operator, read it.
+  int Opc = CurTok;
+  getNextToken();
+  if (ExprAST *Operand = ParseUnary())
+    return new UnaryExprAST(Opc, Operand);
+  return 0;
+}
+
+/// binoprhs
+///   ::= ('+' unary)*
+static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
+  // If this is a binop, find its precedence.
+  while (1) {
+    int TokPrec = GetTokPrecedence();
+
+    // If this is a binop that binds at least as tightly as the current binop,
+    // consume it, otherwise we are done.
+    if (TokPrec < ExprPrec)
+      return LHS;
+
+    // Okay, we know this is a binop.
+    int BinOp = CurTok;
+    SourceLocation BinLoc = CurLoc;
+    getNextToken(); // eat binop
+
+    // Parse the unary expression after the binary operator.
+    ExprAST *RHS = ParseUnary();
+    if (!RHS)
+      return 0;
+
+    // If BinOp binds less tightly with RHS than the operator after RHS, let
+    // the pending operator take RHS as its LHS.
+    int NextPrec = GetTokPrecedence();
+    if (TokPrec < NextPrec) {
+      RHS = ParseBinOpRHS(TokPrec + 1, RHS);
+      if (RHS == 0)
+        return 0;
+    }
+
+    // Merge LHS/RHS.
+    LHS = new BinaryExprAST(BinLoc, BinOp, LHS, RHS);
+  }
+}
+
+/// expression
+///   ::= unary binoprhs
+///
+static ExprAST *ParseExpression() {
+  ExprAST *LHS = ParseUnary();
+  if (!LHS)
+    return 0;
+
+  return ParseBinOpRHS(0, LHS);
+}
+
+/// prototype
+///   ::= id '(' id* ')'
+///   ::= binary LETTER number? (id, id)
+///   ::= unary LETTER (id)
+static PrototypeAST *ParsePrototype() {
+  std::string FnName;
+
+  SourceLocation FnLoc = CurLoc;
+
+  unsigned Kind = 0; // 0 = identifier, 1 = unary, 2 = binary.
+  unsigned BinaryPrecedence = 30;
+
+  switch (CurTok) {
+  default:
+    return ErrorP("Expected function name in prototype");
+  case tok_identifier:
+    FnName = IdentifierStr;
+    Kind = 0;
+    getNextToken();
+    break;
+  case tok_unary:
+    getNextToken();
+    if (!isascii(CurTok))
+      return ErrorP("Expected unary operator");
+    FnName = "unary";
+    FnName += (char)CurTok;
+    Kind = 1;
+    getNextToken();
+    break;
+  case tok_binary:
+    getNextToken();
+    if (!isascii(CurTok))
+      return ErrorP("Expected binary operator");
+    FnName = "binary";
+    FnName += (char)CurTok;
+    Kind = 2;
+    getNextToken();
+
+    // Read the precedence if present.
+    if (CurTok == tok_number) {
+      if (NumVal < 1 || NumVal > 100)
+        return ErrorP("Invalid precedecnce: must be 1..100");
+      BinaryPrecedence = (unsigned)NumVal;
+      getNextToken();
+    }
+    break;
+  }
+
+  if (CurTok != '(')
+    return ErrorP("Expected '(' in prototype");
+
+  std::vector<std::string> ArgNames;
+  while (getNextToken() == tok_identifier)
+    ArgNames.push_back(IdentifierStr);
+  if (CurTok != ')')
+    return ErrorP("Expected ')' in prototype");
+
+  // success.
+  getNextToken(); // eat ')'.
+
+  // Verify right number of names for operator.
+  if (Kind && ArgNames.size() != Kind)
+    return ErrorP("Invalid number of operands for operator");
+
+  return new PrototypeAST(FnLoc, FnName, ArgNames, Kind != 0, BinaryPrecedence);
+}
+
+/// definition ::= 'def' prototype expression
+static FunctionAST *ParseDefinition() {
+  getNextToken(); // eat def.
+  PrototypeAST *Proto = ParsePrototype();
+  if (Proto == 0)
+    return 0;
+
+  if (ExprAST *E = ParseExpression())
+    return new FunctionAST(Proto, E);
+  return 0;
+}
+
+/// toplevelexpr ::= expression
+static FunctionAST *ParseTopLevelExpr() {
+  SourceLocation FnLoc = CurLoc;
+  if (ExprAST *E = ParseExpression()) {
+    // Make an anonymous proto.
+    PrototypeAST *Proto =
+        new PrototypeAST(FnLoc, "main", std::vector<std::string>());
+    return new FunctionAST(Proto, E);
+  }
+  return 0;
+}
+
+/// external ::= 'extern' prototype
+static PrototypeAST *ParseExtern() {
+  getNextToken(); // eat extern.
+  return ParsePrototype();
+}
+
+//===----------------------------------------------------------------------===//
+// Debug Info Support
+//===----------------------------------------------------------------------===//
+
+static DIBuilder *DBuilder;
+
+DIType DebugInfo::getDoubleTy() {
+  if (DblTy.isValid())
+    return DblTy;
+
+  DblTy = DBuilder->createBasicType("double", 64, 64, dwarf::DW_ATE_float);
+  return DblTy;
+}
+
+void DebugInfo::emitLocation(ExprAST *AST) {
+  if (!AST)
+    return Builder.SetCurrentDebugLocation(DebugLoc());
+  DIScope *Scope;
+  if (LexicalBlocks.empty())
+    Scope = &TheCU;
+  else
+    Scope = LexicalBlocks.back();
+  Builder.SetCurrentDebugLocation(
+      DebugLoc::get(AST->getLine(), AST->getCol(), DIScope(*Scope)));
+}
+
+static DICompositeType CreateFunctionType(unsigned NumArgs, DIFile Unit) {
+  SmallVector<Metadata *, 8> EltTys;
+  DIType DblTy = KSDbgInfo.getDoubleTy();
+
+  // Add the result type.
+  EltTys.push_back(DblTy);
+
+  for (unsigned i = 0, e = NumArgs; i != e; ++i)
+    EltTys.push_back(DblTy);
+
+  DITypeArray EltTypeArray = DBuilder->getOrCreateTypeArray(EltTys);
+  return DBuilder->createSubroutineType(Unit, EltTypeArray);
+}
+
+//===----------------------------------------------------------------------===//
+// Code Generation
+//===----------------------------------------------------------------------===//
+
+static Module *TheModule;
+static std::map<std::string, AllocaInst *> NamedValues;
+static legacy::FunctionPassManager *TheFPM;
+
+Value *ErrorV(const char *Str) {
+  Error(Str);
+  return 0;
+}
+
+/// CreateEntryBlockAlloca - Create an alloca instruction in the entry block of
+/// the function.  This is used for mutable variables etc.
+static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction,
+                                          const std::string &VarName) {
+  IRBuilder<> TmpB(&TheFunction->getEntryBlock(),
+                   TheFunction->getEntryBlock().begin());
+  return TmpB.CreateAlloca(Type::getDoubleTy(getGlobalContext()), 0,
+                           VarName.c_str());
+}
+
+Value *NumberExprAST::Codegen() {
+  KSDbgInfo.emitLocation(this);
+  return ConstantFP::get(getGlobalContext(), APFloat(Val));
+}
+
+Value *VariableExprAST::Codegen() {
+  // Look this variable up in the function.
+  Value *V = NamedValues[Name];
+  if (V == 0)
+    return ErrorV("Unknown variable name");
+
+  KSDbgInfo.emitLocation(this);
+  // Load the value.
+  return Builder.CreateLoad(V, Name.c_str());
+}
+
+Value *UnaryExprAST::Codegen() {
+  Value *OperandV = Operand->Codegen();
+  if (OperandV == 0)
+    return 0;
+
+  Function *F = TheModule->getFunction(std::string("unary") + Opcode);
+  if (F == 0)
+    return ErrorV("Unknown unary operator");
+
+  KSDbgInfo.emitLocation(this);
+  return Builder.CreateCall(F, OperandV, "unop");
+}
+
+Value *BinaryExprAST::Codegen() {
+  KSDbgInfo.emitLocation(this);
+
+  // Special case '=' because we don't want to emit the LHS as an expression.
+  if (Op == '=') {
+    // Assignment requires the LHS to be an identifier.
+    VariableExprAST *LHSE = dynamic_cast<VariableExprAST *>(LHS);
+    if (!LHSE)
+      return ErrorV("destination of '=' must be a variable");
+    // Codegen the RHS.
+    Value *Val = RHS->Codegen();
+    if (Val == 0)
+      return 0;
+
+    // Look up the name.
+    Value *Variable = NamedValues[LHSE->getName()];
+    if (Variable == 0)
+      return ErrorV("Unknown variable name");
+
+    Builder.CreateStore(Val, Variable);
+    return Val;
+  }
+
+  Value *L = LHS->Codegen();
+  Value *R = RHS->Codegen();
+  if (L == 0 || R == 0)
+    return 0;
+
+  switch (Op) {
+  case '+':
+    return Builder.CreateFAdd(L, R, "addtmp");
+  case '-':
+    return Builder.CreateFSub(L, R, "subtmp");
+  case '*':
+    return Builder.CreateFMul(L, R, "multmp");
+  case '<':
+    L = Builder.CreateFCmpULT(L, R, "cmptmp");
+    // Convert bool 0/1 to double 0.0 or 1.0
+    return Builder.CreateUIToFP(L, Type::getDoubleTy(getGlobalContext()),
+                                "booltmp");
+  default:
+    break;
+  }
+
+  // If it wasn't a builtin binary operator, it must be a user defined one. Emit
+  // a call to it.
+  Function *F = TheModule->getFunction(std::string("binary") + Op);
+  assert(F && "binary operator not found!");
+
+  Value *Ops[] = { L, R };
+  return Builder.CreateCall(F, Ops, "binop");
+}
+
+Value *CallExprAST::Codegen() {
+  KSDbgInfo.emitLocation(this);
+
+  // Look up the name in the global module table.
+  Function *CalleeF = TheModule->getFunction(Callee);
+  if (CalleeF == 0)
+    return ErrorV("Unknown function referenced");
+
+  // If argument mismatch error.
+  if (CalleeF->arg_size() != Args.size())
+    return ErrorV("Incorrect # arguments passed");
+
+  std::vector<Value *> ArgsV;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+    ArgsV.push_back(Args[i]->Codegen());
+    if (ArgsV.back() == 0)
+      return 0;
+  }
+
+  return Builder.CreateCall(CalleeF, ArgsV, "calltmp");
+}
+
+Value *IfExprAST::Codegen() {
+  KSDbgInfo.emitLocation(this);
+
+  Value *CondV = Cond->Codegen();
+  if (CondV == 0)
+    return 0;
+
+  // Convert condition to a bool by comparing equal to 0.0.
+  CondV = Builder.CreateFCmpONE(
+      CondV, ConstantFP::get(getGlobalContext(), APFloat(0.0)), "ifcond");
+
+  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+
+  // Create blocks for the then and else cases.  Insert the 'then' block at the
+  // end of the function.
+  BasicBlock *ThenBB =
+      BasicBlock::Create(getGlobalContext(), "then", TheFunction);
+  BasicBlock *ElseBB = BasicBlock::Create(getGlobalContext(), "else");
+  BasicBlock *MergeBB = BasicBlock::Create(getGlobalContext(), "ifcont");
+
+  Builder.CreateCondBr(CondV, ThenBB, ElseBB);
+
+  // Emit then value.
+  Builder.SetInsertPoint(ThenBB);
+
+  Value *ThenV = Then->Codegen();
+  if (ThenV == 0)
+    return 0;
+
+  Builder.CreateBr(MergeBB);
+  // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
+  ThenBB = Builder.GetInsertBlock();
+
+  // Emit else block.
+  TheFunction->getBasicBlockList().push_back(ElseBB);
+  Builder.SetInsertPoint(ElseBB);
+
+  Value *ElseV = Else->Codegen();
+  if (ElseV == 0)
+    return 0;
+
+  Builder.CreateBr(MergeBB);
+  // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
+  ElseBB = Builder.GetInsertBlock();
+
+  // Emit merge block.
+  TheFunction->getBasicBlockList().push_back(MergeBB);
+  Builder.SetInsertPoint(MergeBB);
+  PHINode *PN =
+      Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2, "iftmp");
+
+  PN->addIncoming(ThenV, ThenBB);
+  PN->addIncoming(ElseV, ElseBB);
+  return PN;
+}
+
+Value *ForExprAST::Codegen() {
+  // Output this as:
+  //   var = alloca double
+  //   ...
+  //   start = startexpr
+  //   store start -> var
+  //   goto loop
+  // loop:
+  //   ...
+  //   bodyexpr
+  //   ...
+  // loopend:
+  //   step = stepexpr
+  //   endcond = endexpr
+  //
+  //   curvar = load var
+  //   nextvar = curvar + step
+  //   store nextvar -> var
+  //   br endcond, loop, endloop
+  // outloop:
+
+  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+
+  // Create an alloca for the variable in the entry block.
+  AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
+
+  KSDbgInfo.emitLocation(this);
+
+  // Emit the start code first, without 'variable' in scope.
+  Value *StartVal = Start->Codegen();
+  if (StartVal == 0)
+    return 0;
+
+  // Store the value into the alloca.
+  Builder.CreateStore(StartVal, Alloca);
+
+  // Make the new basic block for the loop header, inserting after current
+  // block.
+  BasicBlock *LoopBB =
+      BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
+
+  // Insert an explicit fall through from the current block to the LoopBB.
+  Builder.CreateBr(LoopBB);
+
+  // Start insertion in LoopBB.
+  Builder.SetInsertPoint(LoopBB);
+
+  // Within the loop, the variable is defined equal to the PHI node.  If it
+  // shadows an existing variable, we have to restore it, so save it now.
+  AllocaInst *OldVal = NamedValues[VarName];
+  NamedValues[VarName] = Alloca;
+
+  // Emit the body of the loop.  This, like any other expr, can change the
+  // current BB.  Note that we ignore the value computed by the body, but don't
+  // allow an error.
+  if (Body->Codegen() == 0)
+    return 0;
+
+  // Emit the step value.
+  Value *StepVal;
+  if (Step) {
+    StepVal = Step->Codegen();
+    if (StepVal == 0)
+      return 0;
+  } else {
+    // If not specified, use 1.0.
+    StepVal = ConstantFP::get(getGlobalContext(), APFloat(1.0));
+  }
+
+  // Compute the end condition.
+  Value *EndCond = End->Codegen();
+  if (EndCond == 0)
+    return EndCond;
+
+  // Reload, increment, and restore the alloca.  This handles the case where
+  // the body of the loop mutates the variable.
+  Value *CurVar = Builder.CreateLoad(Alloca, VarName.c_str());
+  Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar");
+  Builder.CreateStore(NextVar, Alloca);
+
+  // Convert condition to a bool by comparing equal to 0.0.
+  EndCond = Builder.CreateFCmpONE(
+      EndCond, ConstantFP::get(getGlobalContext(), APFloat(0.0)), "loopcond");
+
+  // Create the "after loop" block and insert it.
+  BasicBlock *AfterBB =
+      BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
+
+  // Insert the conditional branch into the end of LoopEndBB.
+  Builder.CreateCondBr(EndCond, LoopBB, AfterBB);
+
+  // Any new code will be inserted in AfterBB.
+  Builder.SetInsertPoint(AfterBB);
+
+  // Restore the unshadowed variable.
+  if (OldVal)
+    NamedValues[VarName] = OldVal;
+  else
+    NamedValues.erase(VarName);
+
+  // for expr always returns 0.0.
+  return Constant::getNullValue(Type::getDoubleTy(getGlobalContext()));
+}
+
+Value *VarExprAST::Codegen() {
+  std::vector<AllocaInst *> OldBindings;
+
+  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+
+  // Register all variables and emit their initializer.
+  for (unsigned i = 0, e = VarNames.size(); i != e; ++i) {
+    const std::string &VarName = VarNames[i].first;
+    ExprAST *Init = VarNames[i].second;
+
+    // Emit the initializer before adding the variable to scope, this prevents
+    // the initializer from referencing the variable itself, and permits stuff
+    // like this:
+    //  var a = 1 in
+    //    var a = a in ...   # refers to outer 'a'.
+    Value *InitVal;
+    if (Init) {
+      InitVal = Init->Codegen();
+      if (InitVal == 0)
+        return 0;
+    } else { // If not specified, use 0.0.
+      InitVal = ConstantFP::get(getGlobalContext(), APFloat(0.0));
+    }
+
+    AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
+    Builder.CreateStore(InitVal, Alloca);
+
+    // Remember the old variable binding so that we can restore the binding when
+    // we unrecurse.
+    OldBindings.push_back(NamedValues[VarName]);
+
+    // Remember this binding.
+    NamedValues[VarName] = Alloca;
+  }
+
+  KSDbgInfo.emitLocation(this);
+
+  // Codegen the body, now that all vars are in scope.
+  Value *BodyVal = Body->Codegen();
+  if (BodyVal == 0)
+    return 0;
+
+  // Pop all our variables from scope.
+  for (unsigned i = 0, e = VarNames.size(); i != e; ++i)
+    NamedValues[VarNames[i].first] = OldBindings[i];
+
+  // Return the body computation.
+  return BodyVal;
+}
+
+Function *PrototypeAST::Codegen() {
+  // Make the function type:  double(double,double) etc.
+  std::vector<Type *> Doubles(Args.size(),
+                              Type::getDoubleTy(getGlobalContext()));
+  FunctionType *FT =
+      FunctionType::get(Type::getDoubleTy(getGlobalContext()), Doubles, false);
+
+  Function *F =
+      Function::Create(FT, Function::ExternalLinkage, Name, TheModule);
+
+  // If F conflicted, there was already something named 'Name'.  If it has a
+  // body, don't allow redefinition or reextern.
+  if (F->getName() != Name) {
+    // Delete the one we just made and get the existing one.
+    F->eraseFromParent();
+    F = TheModule->getFunction(Name);
+
+    // If F already has a body, reject this.
+    if (!F->empty()) {
+      ErrorF("redefinition of function");
+      return 0;
+    }
+
+    // If F took a different number of args, reject.
+    if (F->arg_size() != Args.size()) {
+      ErrorF("redefinition of function with different # args");
+      return 0;
+    }
+  }
+
+  // Set names for all arguments.
+  unsigned Idx = 0;
+  for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
+       ++AI, ++Idx)
+    AI->setName(Args[Idx]);
+
+  // Create a subprogram DIE for this function.
+  DIFile Unit = DBuilder->createFile(KSDbgInfo.TheCU.getFilename(),
+                                     KSDbgInfo.TheCU.getDirectory());
+  DIDescriptor FContext(Unit);
+  unsigned LineNo = Line;
+  unsigned ScopeLine = Line;
+  DISubprogram SP = DBuilder->createFunction(
+      FContext, Name, StringRef(), Unit, LineNo,
+      CreateFunctionType(Args.size(), Unit), false /* internal linkage */,
+      true /* definition */, ScopeLine, DIDescriptor::FlagPrototyped, false, F);
+
+  KSDbgInfo.FnScopeMap[this] = SP;
+  return F;
+}
+
+/// CreateArgumentAllocas - Create an alloca for each argument and register the
+/// argument in the symbol table so that references to it will succeed.
+void PrototypeAST::CreateArgumentAllocas(Function *F) {
+  Function::arg_iterator AI = F->arg_begin();
+  for (unsigned Idx = 0, e = Args.size(); Idx != e; ++Idx, ++AI) {
+    // Create an alloca for this variable.
+    AllocaInst *Alloca = CreateEntryBlockAlloca(F, Args[Idx]);
+
+    // Create a debug descriptor for the variable.
+    DIScope *Scope = KSDbgInfo.LexicalBlocks.back();
+    DIFile Unit = DBuilder->createFile(KSDbgInfo.TheCU.getFilename(),
+                                       KSDbgInfo.TheCU.getDirectory());
+    DIVariable D = DBuilder->createLocalVariable(dwarf::DW_TAG_arg_variable,
+                                                 *Scope, Args[Idx], Unit, Line,
+                                                 KSDbgInfo.getDoubleTy(), Idx);
+
+    Instruction *Call = DBuilder->insertDeclare(
+        Alloca, D, DBuilder->createExpression(), Builder.GetInsertBlock());
+    Call->setDebugLoc(DebugLoc::get(Line, 0, *Scope));
+
+    // Store the initial value into the alloca.
+    Builder.CreateStore(AI, Alloca);
+
+    // Add arguments to variable symbol table.
+    NamedValues[Args[Idx]] = Alloca;
+  }
+}
+
+Function *FunctionAST::Codegen() {
+  NamedValues.clear();
+
+  Function *TheFunction = Proto->Codegen();
+  if (TheFunction == 0)
+    return 0;
+
+  // Push the current scope.
+  KSDbgInfo.LexicalBlocks.push_back(&KSDbgInfo.FnScopeMap[Proto]);
+
+  // Unset the location for the prologue emission (leading instructions with no
+  // location in a function are considered part of the prologue and the debugger
+  // will run past them when breaking on a function)
+  KSDbgInfo.emitLocation(nullptr);
+
+  // If this is an operator, install it.
+  if (Proto->isBinaryOp())
+    BinopPrecedence[Proto->getOperatorName()] = Proto->getBinaryPrecedence();
+
+  // Create a new basic block to start insertion into.
+  BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
+  Builder.SetInsertPoint(BB);
+
+  // Add all arguments to the symbol table and create their allocas.
+  Proto->CreateArgumentAllocas(TheFunction);
+
+  KSDbgInfo.emitLocation(Body);
+
+  if (Value *RetVal = Body->Codegen()) {
+    // Finish off the function.
+    Builder.CreateRet(RetVal);
+
+    // Pop off the lexical block for the function.
+    KSDbgInfo.LexicalBlocks.pop_back();
+
+    // Validate the generated code, checking for consistency.
+    verifyFunction(*TheFunction);
+
+    // Optimize the function.
+    TheFPM->run(*TheFunction);
+
+    return TheFunction;
+  }
+
+  // Error reading body, remove function.
+  TheFunction->eraseFromParent();
+
+  if (Proto->isBinaryOp())
+    BinopPrecedence.erase(Proto->getOperatorName());
+
+  // Pop off the lexical block for the function since we added it
+  // unconditionally.
+  KSDbgInfo.LexicalBlocks.pop_back();
+
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Level parsing and JIT Driver
+//===----------------------------------------------------------------------===//
+
+static ExecutionEngine *TheExecutionEngine;
+
+static void HandleDefinition() {
+  if (FunctionAST *F = ParseDefinition()) {
+    if (!F->Codegen()) {
+      fprintf(stderr, "Error reading function definition:");
+    }
+  } else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+static void HandleExtern() {
+  if (PrototypeAST *P = ParseExtern()) {
+    if (!P->Codegen()) {
+      fprintf(stderr, "Error reading extern");
+    }
+  } else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+static void HandleTopLevelExpression() {
+  // Evaluate a top-level expression into an anonymous function.
+  if (FunctionAST *F = ParseTopLevelExpr()) {
+    if (!F->Codegen()) {
+      fprintf(stderr, "Error generating code for top level expr");
+    }
+  } else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+/// top ::= definition | external | expression | ';'
+static void MainLoop() {
+  while (1) {
+    switch (CurTok) {
+    case tok_eof:
+      return;
+    case ';':
+      getNextToken();
+      break; // ignore top-level semicolons.
+    case tok_def:
+      HandleDefinition();
+      break;
+    case tok_extern:
+      HandleExtern();
+      break;
+    default:
+      HandleTopLevelExpression();
+      break;
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// "Library" functions that can be "extern'd" from user code.
+//===----------------------------------------------------------------------===//
+
+/// putchard - putchar that takes a double and returns 0.
+extern "C" double putchard(double X) {
+  putchar((char)X);
+  return 0;
+}
+
+/// printd - printf that takes a double prints it as "%f\n", returning 0.
+extern "C" double printd(double X) {
+  printf("%f\n", X);
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Main driver code.
+//===----------------------------------------------------------------------===//
+
+int main() {
+  InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  InitializeNativeTargetAsmParser();
+  LLVMContext &Context = getGlobalContext();
+
+  // Install standard binary operators.
+  // 1 is lowest precedence.
+  BinopPrecedence['='] = 2;
+  BinopPrecedence['<'] = 10;
+  BinopPrecedence['+'] = 20;
+  BinopPrecedence['-'] = 20;
+  BinopPrecedence['*'] = 40; // highest.
+
+  // Prime the first token.
+  getNextToken();
+
+  // Make the module, which holds all the code.
+  std::unique_ptr<Module> Owner = make_unique<Module>("my cool jit", Context);
+  TheModule = Owner.get();
+
+  // Add the current debug info version into the module.
+  TheModule->addModuleFlag(Module::Warning, "Debug Info Version",
+                           DEBUG_METADATA_VERSION);
+
+  // Darwin only supports dwarf2.
+  if (Triple(sys::getProcessTriple()).isOSDarwin())
+    TheModule->addModuleFlag(llvm::Module::Warning, "Dwarf Version", 2);
+
+  // Construct the DIBuilder, we do this here because we need the module.
+  DBuilder = new DIBuilder(*TheModule);
+
+  // Create the compile unit for the module.
+  // Currently down as "fib.ks" as a filename since we're redirecting stdin
+  // but we'd like actual source locations.
+  KSDbgInfo.TheCU = DBuilder->createCompileUnit(
+      dwarf::DW_LANG_C, "fib.ks", ".", "Kaleidoscope Compiler", 0, "", 0);
+
+  // Create the JIT.  This takes ownership of the module.
+  std::string ErrStr;
+  TheExecutionEngine =
+      EngineBuilder(std::move(Owner))
+          .setErrorStr(&ErrStr)
+          .setMCJITMemoryManager(llvm::make_unique<SectionMemoryManager>())
+          .create();
+  if (!TheExecutionEngine) {
+    fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
+    exit(1);
+  }
+
+  legacy::FunctionPassManager OurFPM(TheModule);
+
+  // Set up the optimizer pipeline.  Start with registering info about how the
+  // target lays out data structures.
+  TheModule->setDataLayout(TheExecutionEngine->getDataLayout());
+  OurFPM.add(new DataLayoutPass());
+#if 0
+  // Provide basic AliasAnalysis support for GVN.
+  OurFPM.add(createBasicAliasAnalysisPass());
+  // Promote allocas to registers.
+  OurFPM.add(createPromoteMemoryToRegisterPass());
+  // Do simple "peephole" optimizations and bit-twiddling optzns.
+  OurFPM.add(createInstructionCombiningPass());
+  // Reassociate expressions.
+  OurFPM.add(createReassociatePass());
+  // Eliminate Common SubExpressions.
+  OurFPM.add(createGVNPass());
+  // Simplify the control flow graph (deleting unreachable blocks, etc).
+  OurFPM.add(createCFGSimplificationPass());
+  #endif
+  OurFPM.doInitialization();
+
+  // Set the global so the code gen can use this.
+  TheFPM = &OurFPM;
+
+  // Run the main "interpreter loop" now.
+  MainLoop();
+
+  TheFPM = 0;
+
+  // Finalize the debug info.
+  DBuilder->finalize();
+
+  // Print out all of the generated code.
+  TheModule->dump();
+
+  return 0;
+}
diff --git a/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp b/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp
index 00f5b83..7f5ed13 100644
--- a/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp
+++ b/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp
@@ -6,10 +6,10 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRReader/IRReader.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
diff --git a/examples/Kaleidoscope/MCJIT/cached/toy.cpp b/examples/Kaleidoscope/MCJIT/cached/toy.cpp
index af51b4a..f598ca4 100644
--- a/examples/Kaleidoscope/MCJIT/cached/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/cached/toy.cpp
@@ -9,10 +9,10 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRReader/IRReader.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -36,8 +36,8 @@ InputIR("input-IR",
         cl::desc("Specify the name of an IR file to load for function definitions"),
         cl::value_desc("input IR file name"));
 
-cl::opt<bool> 
-UseObjectCache("use-object-cache", 
+cl::opt<bool>
+UseObjectCache("use-object-cache",
                cl::desc("Enable use of the MCJIT object caching"),
                cl::init(false));
 
@@ -55,14 +55,14 @@ enum Token {
 
   // primary
   tok_identifier = -4, tok_number = -5,
-  
+
   // control
   tok_if = -6, tok_then = -7, tok_else = -8,
   tok_for = -9, tok_in = -10,
-  
+
   // operators
   tok_binary = -11, tok_unary = -12,
-  
+
   // var definition
   tok_var = -13
 };
@@ -111,11 +111,11 @@ static int gettok() {
     // Comment until end of line.
     do LastChar = getchar();
     while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
-    
+
     if (LastChar != EOF)
       return gettok();
   }
-  
+
   // Check for end of file.  Don't eat the EOF.
   if (LastChar == EOF)
     return tok_eof;
@@ -159,7 +159,7 @@ class UnaryExprAST : public ExprAST {
   char Opcode;
   ExprAST *Operand;
 public:
-  UnaryExprAST(char opcode, ExprAST *operand) 
+  UnaryExprAST(char opcode, ExprAST *operand)
     : Opcode(opcode), Operand(operand) {}
   virtual Value *Codegen();
 };
@@ -169,7 +169,7 @@ class BinaryExprAST : public ExprAST {
   char Op;
   ExprAST *LHS, *RHS;
 public:
-  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs) 
+  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs)
     : Op(op), LHS(lhs), RHS(rhs) {}
   virtual Value *Codegen();
 };
@@ -212,7 +212,7 @@ public:
   VarExprAST(const std::vector<std::pair<std::string, ExprAST*> > &varnames,
              ExprAST *body)
   : VarNames(varnames), Body(body) {}
-  
+
   virtual Value *Codegen();
 };
 
@@ -227,19 +227,19 @@ public:
   PrototypeAST(const std::string &name, const std::vector<std::string> &args,
                bool isoperator = false, unsigned prec = 0)
   : Name(name), Args(args), isOperator(isoperator), Precedence(prec) {}
-  
+
   bool isUnaryOp() const { return isOperator && Args.size() == 1; }
   bool isBinaryOp() const { return isOperator && Args.size() == 2; }
-  
+
   char getOperatorName() const {
     assert(isUnaryOp() || isBinaryOp());
     return Name[Name.size()-1];
   }
-  
+
   unsigned getBinaryPrecedence() const { return Precedence; }
-  
+
   Function *Codegen();
-  
+
   void CreateArgumentAllocas(Function *F);
 };
 
@@ -250,7 +250,7 @@ class FunctionAST {
 public:
   FunctionAST(PrototypeAST *proto, ExprAST *body)
     : Proto(proto), Body(body) {}
-  
+
   Function *Codegen();
 };
 
@@ -274,7 +274,7 @@ static std::map<char, int> BinopPrecedence;
 static int GetTokPrecedence() {
   if (!isascii(CurTok))
     return -1;
-  
+
   // Make sure it's a declared binop.
   int TokPrec = BinopPrecedence[CurTok];
   if (TokPrec <= 0) return -1;
@@ -293,12 +293,12 @@ static ExprAST *ParseExpression();
 ///   ::= identifier '(' expression* ')'
 static ExprAST *ParseIdentifierExpr() {
   std::string IdName = IdentifierStr;
-  
+
   getNextToken();  // eat identifier.
-  
+
   if (CurTok != '(') // Simple variable ref.
     return new VariableExprAST(IdName);
-  
+
   // Call.
   getNextToken();  // eat (
   std::vector<ExprAST*> Args;
@@ -318,7 +318,7 @@ static ExprAST *ParseIdentifierExpr() {
 
   // Eat the ')'.
   getNextToken();
-  
+
   return new CallExprAST(IdName, Args);
 }
 
@@ -334,7 +334,7 @@ static ExprAST *ParseParenExpr() {
   getNextToken();  // eat (.
   ExprAST *V = ParseExpression();
   if (!V) return 0;
-  
+
   if (CurTok != ')')
     return Error("expected ')'");
   getNextToken();  // eat ).
@@ -344,26 +344,26 @@ static ExprAST *ParseParenExpr() {
 /// ifexpr ::= 'if' expression 'then' expression 'else' expression
 static ExprAST *ParseIfExpr() {
   getNextToken();  // eat the if.
-  
+
   // condition.
   ExprAST *Cond = ParseExpression();
   if (!Cond) return 0;
-  
+
   if (CurTok != tok_then)
     return Error("expected then");
   getNextToken();  // eat the then
-  
+
   ExprAST *Then = ParseExpression();
   if (Then == 0) return 0;
-  
+
   if (CurTok != tok_else)
     return Error("expected else");
-  
+
   getNextToken();
-  
+
   ExprAST *Else = ParseExpression();
   if (!Else) return 0;
-  
+
   return new IfExprAST(Cond, Then, Else);
 }
 
@@ -373,24 +373,24 @@ static ExprAST *ParseForExpr() {
 
   if (CurTok != tok_identifier)
     return Error("expected identifier after for");
-  
+
   std::string IdName = IdentifierStr;
   getNextToken();  // eat identifier.
-  
+
   if (CurTok != '=')
     return Error("expected '=' after for");
   getNextToken();  // eat '='.
-  
-  
+
+
   ExprAST *Start = ParseExpression();
   if (Start == 0) return 0;
   if (CurTok != ',')
     return Error("expected ',' after for start value");
   getNextToken();
-  
+
   ExprAST *End = ParseExpression();
   if (End == 0) return 0;
-  
+
   // The step value is optional.
   ExprAST *Step = 0;
   if (CurTok == ',') {
@@ -398,18 +398,18 @@ static ExprAST *ParseForExpr() {
     Step = ParseExpression();
     if (Step == 0) return 0;
   }
-  
+
   if (CurTok != tok_in)
     return Error("expected 'in' after for");
   getNextToken();  // eat 'in'.
-  
+
   ExprAST *Body = ParseExpression();
   if (Body == 0) return 0;
 
   return new ForExprAST(IdName, Start, End, Step, Body);
 }
 
-/// varexpr ::= 'var' identifier ('=' expression)? 
+/// varexpr ::= 'var' identifier ('=' expression)?
 //                    (',' identifier ('=' expression)?)* 'in' expression
 static ExprAST *ParseVarExpr() {
   getNextToken();  // eat the var.
@@ -419,7 +419,7 @@ static ExprAST *ParseVarExpr() {
   // At least one variable name is required.
   if (CurTok != tok_identifier)
     return Error("expected identifier after var");
-  
+
   while (1) {
     std::string Name = IdentifierStr;
     getNextToken();  // eat identifier.
@@ -428,29 +428,29 @@ static ExprAST *ParseVarExpr() {
     ExprAST *Init = 0;
     if (CurTok == '=') {
       getNextToken(); // eat the '='.
-      
+
       Init = ParseExpression();
       if (Init == 0) return 0;
     }
-    
+
     VarNames.push_back(std::make_pair(Name, Init));
-    
+
     // End of var list, exit loop.
     if (CurTok != ',') break;
     getNextToken(); // eat the ','.
-    
+
     if (CurTok != tok_identifier)
       return Error("expected identifier list after var");
   }
-  
+
   // At this point, we have to have 'in'.
   if (CurTok != tok_in)
     return Error("expected 'in' keyword after 'var'");
   getNextToken();  // eat 'in'.
-  
+
   ExprAST *Body = ParseExpression();
   if (Body == 0) return 0;
-  
+
   return new VarExprAST(VarNames, Body);
 }
 
@@ -480,7 +480,7 @@ static ExprAST *ParseUnary() {
   // If the current token is not an operator, it must be a primary expr.
   if (!isascii(CurTok) || CurTok == '(' || CurTok == ',')
     return ParsePrimary();
-  
+
   // If this is a unary operator, read it.
   int Opc = CurTok;
   getNextToken();
@@ -495,20 +495,20 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
   // If this is a binop, find its precedence.
   while (1) {
     int TokPrec = GetTokPrecedence();
-    
+
     // If this is a binop that binds at least as tightly as the current binop,
     // consume it, otherwise we are done.
     if (TokPrec < ExprPrec)
       return LHS;
-    
+
     // Okay, we know this is a binop.
     int BinOp = CurTok;
     getNextToken();  // eat binop
-    
+
     // Parse the unary expression after the binary operator.
     ExprAST *RHS = ParseUnary();
     if (!RHS) return 0;
-    
+
     // If BinOp binds less tightly with RHS than the operator after RHS, let
     // the pending operator take RHS as its LHS.
     int NextPrec = GetTokPrecedence();
@@ -516,7 +516,7 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
       RHS = ParseBinOpRHS(TokPrec+1, RHS);
       if (RHS == 0) return 0;
     }
-    
+
     // Merge LHS/RHS.
     LHS = new BinaryExprAST(BinOp, LHS, RHS);
   }
@@ -528,7 +528,7 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
 static ExprAST *ParseExpression() {
   ExprAST *LHS = ParseUnary();
   if (!LHS) return 0;
-  
+
   return ParseBinOpRHS(0, LHS);
 }
 
@@ -538,10 +538,10 @@ static ExprAST *ParseExpression() {
 ///   ::= unary LETTER (id)
 static PrototypeAST *ParsePrototype() {
   std::string FnName;
-  
+
   unsigned Kind = 0; // 0 = identifier, 1 = unary, 2 = binary.
   unsigned BinaryPrecedence = 30;
-  
+
   switch (CurTok) {
   default:
     return ErrorP("Expected function name in prototype");
@@ -567,7 +567,7 @@ static PrototypeAST *ParsePrototype() {
     FnName += (char)CurTok;
     Kind = 2;
     getNextToken();
-    
+
     // Read the precedence if present.
     if (CurTok == tok_number) {
       if (NumVal < 1 || NumVal > 100)
@@ -577,23 +577,23 @@ static PrototypeAST *ParsePrototype() {
     }
     break;
   }
-  
+
   if (CurTok != '(')
     return ErrorP("Expected '(' in prototype");
-  
+
   std::vector<std::string> ArgNames;
   while (getNextToken() == tok_identifier)
     ArgNames.push_back(IdentifierStr);
   if (CurTok != ')')
     return ErrorP("Expected ')' in prototype");
-  
+
   // success.
   getNextToken();  // eat ')'.
-  
+
   // Verify right number of names for operator.
   if (Kind && ArgNames.size() != Kind)
     return ErrorP("Invalid number of operands for operator");
-  
+
   return new PrototypeAST(FnName, ArgNames, Kind != 0, BinaryPrecedence);
 }
 
@@ -762,14 +762,14 @@ private:
 
 class HelpingMemoryManager : public SectionMemoryManager
 {
-  HelpingMemoryManager(const HelpingMemoryManager&) LLVM_DELETED_FUNCTION;
-  void operator=(const HelpingMemoryManager&) LLVM_DELETED_FUNCTION;
+  HelpingMemoryManager(const HelpingMemoryManager&) = delete;
+  void operator=(const HelpingMemoryManager&) = delete;
 
 public:
   HelpingMemoryManager(MCJITHelper *Helper) : MasterHelper(Helper) {}
   virtual ~HelpingMemoryManager() {}
 
-  /// This method returns the address of the specified function. 
+  /// This method returns the address of the specified function.
   /// Our implementation will attempt to find functions in other
   /// modules associated with the MCJITHelper to cross link functions
   /// from one generated module to another.
@@ -838,9 +838,9 @@ Function *MCJITHelper::getFunction(const std::string FnName) {
 
       // If we don't have a prototype yet, create one.
       if (!PF)
-        PF = Function::Create(F->getFunctionType(), 
-                                      Function::ExternalLinkage, 
-                                      FnName, 
+        PF = Function::Create(F->getFunctionType(),
+                                      Function::ExternalLinkage,
+                                      FnName,
                                       OpenModule);
       return PF;
     }
@@ -1027,11 +1027,11 @@ Value *VariableExprAST::Codegen() {
 Value *UnaryExprAST::Codegen() {
   Value *OperandV = Operand->Codegen();
   if (OperandV == 0) return 0;
-  
+
   Function *F = TheHelper->getFunction(MakeLegalFunctionName(std::string("unary")+Opcode));
   if (F == 0)
     return ErrorV("Unknown unary operator");
-  
+
   return Builder.CreateCall(F, OperandV, "unop");
 }
 
@@ -1053,11 +1053,11 @@ Value *BinaryExprAST::Codegen() {
     Builder.CreateStore(Val, Variable);
     return Val;
   }
-  
+
   Value *L = LHS->Codegen();
   Value *R = RHS->Codegen();
   if (L == 0 || R == 0) return 0;
-  
+
   switch (Op) {
   case '+': return Builder.CreateFAdd(L, R, "addtmp");
   case '-': return Builder.CreateFSub(L, R, "subtmp");
@@ -1070,12 +1070,12 @@ Value *BinaryExprAST::Codegen() {
                                 "booltmp");
   default: break;
   }
-  
+
   // If it wasn't a builtin binary operator, it must be a user defined one. Emit
   // a call to it.
   Function *F = TheHelper->getFunction(MakeLegalFunctionName(std::string("binary")+Op));
   assert(F && "binary operator not found!");
-  
+
   Value *Ops[] = { L, R };
   return Builder.CreateCall(F, Ops, "binop");
 }
@@ -1085,7 +1085,7 @@ Value *CallExprAST::Codegen() {
   Function *CalleeF = TheHelper->getFunction(Callee);
   if (CalleeF == 0)
     return ErrorV("Unknown function referenced");
-  
+
   // If argument mismatch error.
   if (CalleeF->arg_size() != Args.size())
     return ErrorV("Incorrect # arguments passed");
@@ -1095,56 +1095,56 @@ Value *CallExprAST::Codegen() {
     ArgsV.push_back(Args[i]->Codegen());
     if (ArgsV.back() == 0) return 0;
   }
-  
+
   return Builder.CreateCall(CalleeF, ArgsV, "calltmp");
 }
 
 Value *IfExprAST::Codegen() {
   Value *CondV = Cond->Codegen();
   if (CondV == 0) return 0;
-  
+
   // Convert condition to a bool by comparing equal to 0.0.
-  CondV = Builder.CreateFCmpONE(CondV, 
+  CondV = Builder.CreateFCmpONE(CondV,
                               ConstantFP::get(getGlobalContext(), APFloat(0.0)),
                                 "ifcond");
-  
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
-  
+
   // Create blocks for the then and else cases.  Insert the 'then' block at the
   // end of the function.
   BasicBlock *ThenBB = BasicBlock::Create(getGlobalContext(), "then", TheFunction);
   BasicBlock *ElseBB = BasicBlock::Create(getGlobalContext(), "else");
   BasicBlock *MergeBB = BasicBlock::Create(getGlobalContext(), "ifcont");
-  
+
   Builder.CreateCondBr(CondV, ThenBB, ElseBB);
-  
+
   // Emit then value.
   Builder.SetInsertPoint(ThenBB);
-  
+
   Value *ThenV = Then->Codegen();
   if (ThenV == 0) return 0;
-  
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
   ThenBB = Builder.GetInsertBlock();
-  
+
   // Emit else block.
   TheFunction->getBasicBlockList().push_back(ElseBB);
   Builder.SetInsertPoint(ElseBB);
-  
+
   Value *ElseV = Else->Codegen();
   if (ElseV == 0) return 0;
-  
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
   ElseBB = Builder.GetInsertBlock();
-  
+
   // Emit merge block.
   TheFunction->getBasicBlockList().push_back(MergeBB);
   Builder.SetInsertPoint(MergeBB);
   PHINode *PN = Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2,
                                   "iftmp");
-  
+
   PN->addIncoming(ThenV, ThenBB);
   PN->addIncoming(ElseV, ElseBB);
   return PN;
@@ -1157,7 +1157,7 @@ Value *ForExprAST::Codegen() {
   //   start = startexpr
   //   store start -> var
   //   goto loop
-  // loop: 
+  // loop:
   //   ...
   //   bodyexpr
   //   ...
@@ -1170,40 +1170,40 @@ Value *ForExprAST::Codegen() {
   //   store nextvar -> var
   //   br endcond, loop, endloop
   // outloop:
-  
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
 
   // Create an alloca for the variable in the entry block.
   AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
-  
+
   // Emit the start code first, without 'variable' in scope.
   Value *StartVal = Start->Codegen();
   if (StartVal == 0) return 0;
-  
+
   // Store the value into the alloca.
   Builder.CreateStore(StartVal, Alloca);
-  
+
   // Make the new basic block for the loop header, inserting after current
   // block.
   BasicBlock *LoopBB = BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
-  
+
   // Insert an explicit fall through from the current block to the LoopBB.
   Builder.CreateBr(LoopBB);
 
   // Start insertion in LoopBB.
   Builder.SetInsertPoint(LoopBB);
-  
+
   // Within the loop, the variable is defined equal to the PHI node.  If it
   // shadows an existing variable, we have to restore it, so save it now.
   AllocaInst *OldVal = NamedValues[VarName];
   NamedValues[VarName] = Alloca;
-  
+
   // Emit the body of the loop.  This, like any other expr, can change the
   // current BB.  Note that we ignore the value computed by the body, but don't
   // allow an error.
   if (Body->Codegen() == 0)
     return 0;
-  
+
   // Emit the step value.
   Value *StepVal;
   if (Step) {
@@ -1213,52 +1213,52 @@ Value *ForExprAST::Codegen() {
     // If not specified, use 1.0.
     StepVal = ConstantFP::get(getGlobalContext(), APFloat(1.0));
   }
-  
+
   // Compute the end condition.
   Value *EndCond = End->Codegen();
   if (EndCond == 0) return EndCond;
-  
+
   // Reload, increment, and restore the alloca.  This handles the case where
   // the body of the loop mutates the variable.
   Value *CurVar = Builder.CreateLoad(Alloca, VarName.c_str());
   Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar");
   Builder.CreateStore(NextVar, Alloca);
-  
+
   // Convert condition to a bool by comparing equal to 0.0.
-  EndCond = Builder.CreateFCmpONE(EndCond, 
+  EndCond = Builder.CreateFCmpONE(EndCond,
                               ConstantFP::get(getGlobalContext(), APFloat(0.0)),
                                   "loopcond");
-  
+
   // Create the "after loop" block and insert it.
   BasicBlock *AfterBB = BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
-  
+
   // Insert the conditional branch into the end of LoopEndBB.
   Builder.CreateCondBr(EndCond, LoopBB, AfterBB);
-  
+
   // Any new code will be inserted in AfterBB.
   Builder.SetInsertPoint(AfterBB);
-  
+
   // Restore the unshadowed variable.
   if (OldVal)
     NamedValues[VarName] = OldVal;
   else
     NamedValues.erase(VarName);
 
-  
+
   // for expr always returns 0.0.
   return Constant::getNullValue(Type::getDoubleTy(getGlobalContext()));
 }
 
 Value *VarExprAST::Codegen() {
   std::vector<AllocaInst *> OldBindings;
-  
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
 
   // Register all variables and emit their initializer.
   for (unsigned i = 0, e = VarNames.size(); i != e; ++i) {
     const std::string &VarName = VarNames[i].first;
     ExprAST *Init = VarNames[i].second;
-    
+
     // Emit the initializer before adding the variable to scope, this prevents
     // the initializer from referencing the variable itself, and permits stuff
     // like this:
@@ -1271,22 +1271,22 @@ Value *VarExprAST::Codegen() {
     } else { // If not specified, use 0.0.
       InitVal = ConstantFP::get(getGlobalContext(), APFloat(0.0));
     }
-    
+
     AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
     Builder.CreateStore(InitVal, Alloca);
 
     // Remember the old variable binding so that we can restore the binding when
     // we unrecurse.
     OldBindings.push_back(NamedValues[VarName]);
-    
+
     // Remember this binding.
     NamedValues[VarName] = Alloca;
   }
-  
+
   // Codegen the body, now that all vars are in scope.
   Value *BodyVal = Body->Codegen();
   if (BodyVal == 0) return 0;
-  
+
   // Pop all our variables from scope.
   for (unsigned i = 0, e = VarNames.size(); i != e; ++i)
     NamedValues[VarNames[i].first] = OldBindings[i];
@@ -1297,7 +1297,7 @@ Value *VarExprAST::Codegen() {
 
 Function *PrototypeAST::Codegen() {
   // Make the function type:  double(double,double) etc.
-  std::vector<Type*> Doubles(Args.size(), 
+  std::vector<Type*> Doubles(Args.size(),
                              Type::getDoubleTy(getGlobalContext()));
   FunctionType *FT = FunctionType::get(Type::getDoubleTy(getGlobalContext()),
                                        Doubles, false);
@@ -1314,26 +1314,26 @@ Function *PrototypeAST::Codegen() {
     // Delete the one we just made and get the existing one.
     F->eraseFromParent();
     F = M->getFunction(Name);
-    
+
     // If F already has a body, reject this.
     if (!F->empty()) {
       ErrorF("redefinition of function");
       return 0;
     }
-    
+
     // If F took a different number of args, reject.
     if (F->arg_size() != Args.size()) {
       ErrorF("redefinition of function with different # args");
       return 0;
     }
   }
-  
+
   // Set names for all arguments.
   unsigned Idx = 0;
   for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
        ++AI, ++Idx)
     AI->setName(Args[Idx]);
-    
+
   return F;
 }
 
@@ -1355,19 +1355,19 @@ void PrototypeAST::CreateArgumentAllocas(Function *F) {
 
 Function *FunctionAST::Codegen() {
   NamedValues.clear();
-  
+
   Function *TheFunction = Proto->Codegen();
   if (TheFunction == 0)
     return 0;
-  
+
   // If this is an operator, install it.
   if (Proto->isBinaryOp())
     BinopPrecedence[Proto->getOperatorName()] = Proto->getBinaryPrecedence();
-  
+
   // Create a new basic block to start insertion into.
   BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
   Builder.SetInsertPoint(BB);
-  
+
   // Add all arguments to the symbol table and create their allocas.
   Proto->CreateArgumentAllocas(TheFunction);
 
@@ -1428,7 +1428,7 @@ static void HandleTopLevelExpression() {
     if (Function *LF = F->Codegen()) {
       // JIT the function, returning a function pointer.
       void *FPtr = TheHelper->getPointerToFunction(LF);
-      
+
       // Cast it to the right type (takes no arguments, returns a double) so we
       // can call it as a native function.
       double (*FP)() = (double (*)())(intptr_t)FPtr;
@@ -1465,20 +1465,20 @@ static void MainLoop() {
 //===----------------------------------------------------------------------===//
 
 /// putchard - putchar that takes a double and returns 0.
-extern "C" 
+extern "C"
 double putchard(double X) {
   putchar((char)X);
   return 0;
 }
 
 /// printd - printf that takes a double prints it as "%f\n", returning 0.
-extern "C" 
+extern "C"
 double printd(double X) {
   printf("%f", X);
   return 0;
 }
 
-extern "C" 
+extern "C"
 double printlf() {
   printf("\n");
   return 0;
diff --git a/examples/Kaleidoscope/MCJIT/complete/toy.cpp b/examples/Kaleidoscope/MCJIT/complete/toy.cpp
index 3beb0d8..5294bb7 100644
--- a/examples/Kaleidoscope/MCJIT/complete/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/complete/toy.cpp
@@ -7,10 +7,10 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRReader/IRReader.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -37,7 +37,7 @@ namespace {
               cl::value_desc("input IR file name"));
 
   cl::opt<bool>
-  VerboseOutput("verbose", 
+  VerboseOutput("verbose",
                 cl::desc("Enable verbose output (results, IR, etc.) to stderr"),
                 cl::init(false));
 
@@ -830,8 +830,8 @@ private:
 
 class HelpingMemoryManager : public SectionMemoryManager
 {
-  HelpingMemoryManager(const HelpingMemoryManager&) LLVM_DELETED_FUNCTION;
-  void operator=(const HelpingMemoryManager&) LLVM_DELETED_FUNCTION;
+  HelpingMemoryManager(const HelpingMemoryManager&) = delete;
+  void operator=(const HelpingMemoryManager&) = delete;
 
 public:
   HelpingMemoryManager(MCJITHelper *Helper) : MasterHelper(Helper) {}
diff --git a/examples/Kaleidoscope/MCJIT/initial/toy.cpp b/examples/Kaleidoscope/MCJIT/initial/toy.cpp
index 2c1b297..dd35358 100644
--- a/examples/Kaleidoscope/MCJIT/initial/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/initial/toy.cpp
@@ -6,9 +6,9 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
 #include <cctype>
@@ -32,14 +32,14 @@ enum Token {
 
   // primary
   tok_identifier = -4, tok_number = -5,
-  
+
   // control
   tok_if = -6, tok_then = -7, tok_else = -8,
   tok_for = -9, tok_in = -10,
-  
+
   // operators
   tok_binary = -11, tok_unary = -12,
-  
+
   // var definition
   tok_var = -13
 };
@@ -88,11 +88,11 @@ static int gettok() {
     // Comment until end of line.
     do LastChar = getchar();
     while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
-    
+
     if (LastChar != EOF)
       return gettok();
   }
-  
+
   // Check for end of file.  Don't eat the EOF.
   if (LastChar == EOF)
     return tok_eof;
@@ -136,7 +136,7 @@ class UnaryExprAST : public ExprAST {
   char Opcode;
   ExprAST *Operand;
 public:
-  UnaryExprAST(char opcode, ExprAST *operand) 
+  UnaryExprAST(char opcode, ExprAST *operand)
     : Opcode(opcode), Operand(operand) {}
   virtual Value *Codegen();
 };
@@ -146,7 +146,7 @@ class BinaryExprAST : public ExprAST {
   char Op;
   ExprAST *LHS, *RHS;
 public:
-  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs) 
+  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs)
     : Op(op), LHS(lhs), RHS(rhs) {}
   virtual Value *Codegen();
 };
@@ -189,7 +189,7 @@ public:
   VarExprAST(const std::vector<std::pair<std::string, ExprAST*> > &varnames,
              ExprAST *body)
   : VarNames(varnames), Body(body) {}
-  
+
   virtual Value *Codegen();
 };
 
@@ -204,19 +204,19 @@ public:
   PrototypeAST(const std::string &name, const std::vector<std::string> &args,
                bool isoperator = false, unsigned prec = 0)
   : Name(name), Args(args), isOperator(isoperator), Precedence(prec) {}
-  
+
   bool isUnaryOp() const { return isOperator && Args.size() == 1; }
   bool isBinaryOp() const { return isOperator && Args.size() == 2; }
-  
+
   char getOperatorName() const {
     assert(isUnaryOp() || isBinaryOp());
     return Name[Name.size()-1];
   }
-  
+
   unsigned getBinaryPrecedence() const { return Precedence; }
-  
+
   Function *Codegen();
-  
+
   void CreateArgumentAllocas(Function *F);
 };
 
@@ -227,7 +227,7 @@ class FunctionAST {
 public:
   FunctionAST(PrototypeAST *proto, ExprAST *body)
     : Proto(proto), Body(body) {}
-  
+
   Function *Codegen();
 };
 
@@ -251,7 +251,7 @@ static std::map<char, int> BinopPrecedence;
 static int GetTokPrecedence() {
   if (!isascii(CurTok))
     return -1;
-  
+
   // Make sure it's a declared binop.
   int TokPrec = BinopPrecedence[CurTok];
   if (TokPrec <= 0) return -1;
@@ -270,12 +270,12 @@ static ExprAST *ParseExpression();
 ///   ::= identifier '(' expression* ')'
 static ExprAST *ParseIdentifierExpr() {
   std::string IdName = IdentifierStr;
-  
+
   getNextToken();  // eat identifier.
-  
+
   if (CurTok != '(') // Simple variable ref.
     return new VariableExprAST(IdName);
-  
+
   // Call.
   getNextToken();  // eat (
   std::vector<ExprAST*> Args;
@@ -295,7 +295,7 @@ static ExprAST *ParseIdentifierExpr() {
 
   // Eat the ')'.
   getNextToken();
-  
+
   return new CallExprAST(IdName, Args);
 }
 
@@ -311,7 +311,7 @@ static ExprAST *ParseParenExpr() {
   getNextToken();  // eat (.
   ExprAST *V = ParseExpression();
   if (!V) return 0;
-  
+
   if (CurTok != ')')
     return Error("expected ')'");
   getNextToken();  // eat ).
@@ -321,26 +321,26 @@ static ExprAST *ParseParenExpr() {
 /// ifexpr ::= 'if' expression 'then' expression 'else' expression
 static ExprAST *ParseIfExpr() {
   getNextToken();  // eat the if.
-  
+
   // condition.
   ExprAST *Cond = ParseExpression();
   if (!Cond) return 0;
-  
+
   if (CurTok != tok_then)
     return Error("expected then");
   getNextToken();  // eat the then
-  
+
   ExprAST *Then = ParseExpression();
   if (Then == 0) return 0;
-  
+
   if (CurTok != tok_else)
     return Error("expected else");
-  
+
   getNextToken();
-  
+
   ExprAST *Else = ParseExpression();
   if (!Else) return 0;
-  
+
   return new IfExprAST(Cond, Then, Else);
 }
 
@@ -350,24 +350,24 @@ static ExprAST *ParseForExpr() {
 
   if (CurTok != tok_identifier)
     return Error("expected identifier after for");
-  
+
   std::string IdName = IdentifierStr;
   getNextToken();  // eat identifier.
-  
+
   if (CurTok != '=')
     return Error("expected '=' after for");
   getNextToken();  // eat '='.
-  
-  
+
+
   ExprAST *Start = ParseExpression();
   if (Start == 0) return 0;
   if (CurTok != ',')
     return Error("expected ',' after for start value");
   getNextToken();
-  
+
   ExprAST *End = ParseExpression();
   if (End == 0) return 0;
-  
+
   // The step value is optional.
   ExprAST *Step = 0;
   if (CurTok == ',') {
@@ -375,18 +375,18 @@ static ExprAST *ParseForExpr() {
     Step = ParseExpression();
     if (Step == 0) return 0;
   }
-  
+
   if (CurTok != tok_in)
     return Error("expected 'in' after for");
   getNextToken();  // eat 'in'.
-  
+
   ExprAST *Body = ParseExpression();
   if (Body == 0) return 0;
 
   return new ForExprAST(IdName, Start, End, Step, Body);
 }
 
-/// varexpr ::= 'var' identifier ('=' expression)? 
+/// varexpr ::= 'var' identifier ('=' expression)?
 //                    (',' identifier ('=' expression)?)* 'in' expression
 static ExprAST *ParseVarExpr() {
   getNextToken();  // eat the var.
@@ -396,7 +396,7 @@ static ExprAST *ParseVarExpr() {
   // At least one variable name is required.
   if (CurTok != tok_identifier)
     return Error("expected identifier after var");
-  
+
   while (1) {
     std::string Name = IdentifierStr;
     getNextToken();  // eat identifier.
@@ -405,29 +405,29 @@ static ExprAST *ParseVarExpr() {
     ExprAST *Init = 0;
     if (CurTok == '=') {
       getNextToken(); // eat the '='.
-      
+
       Init = ParseExpression();
       if (Init == 0) return 0;
     }
-    
+
     VarNames.push_back(std::make_pair(Name, Init));
-    
+
     // End of var list, exit loop.
     if (CurTok != ',') break;
     getNextToken(); // eat the ','.
-    
+
     if (CurTok != tok_identifier)
       return Error("expected identifier list after var");
   }
-  
+
   // At this point, we have to have 'in'.
   if (CurTok != tok_in)
     return Error("expected 'in' keyword after 'var'");
   getNextToken();  // eat 'in'.
-  
+
   ExprAST *Body = ParseExpression();
   if (Body == 0) return 0;
-  
+
   return new VarExprAST(VarNames, Body);
 }
 
@@ -457,7 +457,7 @@ static ExprAST *ParseUnary() {
   // If the current token is not an operator, it must be a primary expr.
   if (!isascii(CurTok) || CurTok == '(' || CurTok == ',')
     return ParsePrimary();
-  
+
   // If this is a unary operator, read it.
   int Opc = CurTok;
   getNextToken();
@@ -472,20 +472,20 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
   // If this is a binop, find its precedence.
   while (1) {
     int TokPrec = GetTokPrecedence();
-    
+
     // If this is a binop that binds at least as tightly as the current binop,
     // consume it, otherwise we are done.
     if (TokPrec < ExprPrec)
       return LHS;
-    
+
     // Okay, we know this is a binop.
     int BinOp = CurTok;
     getNextToken();  // eat binop
-    
+
     // Parse the unary expression after the binary operator.
     ExprAST *RHS = ParseUnary();
     if (!RHS) return 0;
-    
+
     // If BinOp binds less tightly with RHS than the operator after RHS, let
     // the pending operator take RHS as its LHS.
     int NextPrec = GetTokPrecedence();
@@ -493,7 +493,7 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
       RHS = ParseBinOpRHS(TokPrec+1, RHS);
       if (RHS == 0) return 0;
     }
-    
+
     // Merge LHS/RHS.
     LHS = new BinaryExprAST(BinOp, LHS, RHS);
   }
@@ -505,7 +505,7 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
 static ExprAST *ParseExpression() {
   ExprAST *LHS = ParseUnary();
   if (!LHS) return 0;
-  
+
   return ParseBinOpRHS(0, LHS);
 }
 
@@ -515,10 +515,10 @@ static ExprAST *ParseExpression() {
 ///   ::= unary LETTER (id)
 static PrototypeAST *ParsePrototype() {
   std::string FnName;
-  
+
   unsigned Kind = 0; // 0 = identifier, 1 = unary, 2 = binary.
   unsigned BinaryPrecedence = 30;
-  
+
   switch (CurTok) {
   default:
     return ErrorP("Expected function name in prototype");
@@ -544,7 +544,7 @@ static PrototypeAST *ParsePrototype() {
     FnName += (char)CurTok;
     Kind = 2;
     getNextToken();
-    
+
     // Read the precedence if present.
     if (CurTok == tok_number) {
       if (NumVal < 1 || NumVal > 100)
@@ -554,23 +554,23 @@ static PrototypeAST *ParsePrototype() {
     }
     break;
   }
-  
+
   if (CurTok != '(')
     return ErrorP("Expected '(' in prototype");
-  
+
   std::vector<std::string> ArgNames;
   while (getNextToken() == tok_identifier)
     ArgNames.push_back(IdentifierStr);
   if (CurTok != ')')
     return ErrorP("Expected ')' in prototype");
-  
+
   // success.
   getNextToken();  // eat ')'.
-  
+
   // Verify right number of names for operator.
   if (Kind && ArgNames.size() != Kind)
     return ErrorP("Invalid number of operands for operator");
-  
+
   return new PrototypeAST(FnName, ArgNames, Kind != 0, BinaryPrecedence);
 }
 
@@ -670,14 +670,14 @@ private:
 
 class HelpingMemoryManager : public SectionMemoryManager
 {
-  HelpingMemoryManager(const HelpingMemoryManager&) LLVM_DELETED_FUNCTION;
-  void operator=(const HelpingMemoryManager&) LLVM_DELETED_FUNCTION;
+  HelpingMemoryManager(const HelpingMemoryManager&) = delete;
+  void operator=(const HelpingMemoryManager&) = delete;
 
 public:
   HelpingMemoryManager(MCJITHelper *Helper) : MasterHelper(Helper) {}
   virtual ~HelpingMemoryManager() {}
 
-  /// This method returns the address of the specified function. 
+  /// This method returns the address of the specified function.
   /// Our implementation will attempt to find functions in other
   /// modules associated with the MCJITHelper to cross link functions
   /// from one generated module to another.
@@ -739,9 +739,9 @@ Function *MCJITHelper::getFunction(const std::string FnName) {
 
       // If we don't have a prototype yet, create one.
       if (!PF)
-        PF = Function::Create(F->getFunctionType(), 
-                                      Function::ExternalLinkage, 
-                                      FnName, 
+        PF = Function::Create(F->getFunctionType(),
+                                      Function::ExternalLinkage,
+                                      FnName,
                                       OpenModule);
       return PF;
     }
@@ -885,11 +885,11 @@ Value *VariableExprAST::Codegen() {
 Value *UnaryExprAST::Codegen() {
   Value *OperandV = Operand->Codegen();
   if (OperandV == 0) return 0;
-  
+
   Function *F = TheHelper->getFunction(MakeLegalFunctionName(std::string("unary")+Opcode));
   if (F == 0)
     return ErrorV("Unknown unary operator");
-  
+
   return Builder.CreateCall(F, OperandV, "unop");
 }
 
@@ -911,11 +911,11 @@ Value *BinaryExprAST::Codegen() {
     Builder.CreateStore(Val, Variable);
     return Val;
   }
-  
+
   Value *L = LHS->Codegen();
   Value *R = RHS->Codegen();
   if (L == 0 || R == 0) return 0;
-  
+
   switch (Op) {
   case '+': return Builder.CreateFAdd(L, R, "addtmp");
   case '-': return Builder.CreateFSub(L, R, "subtmp");
@@ -928,12 +928,12 @@ Value *BinaryExprAST::Codegen() {
                                 "booltmp");
   default: break;
   }
-  
+
   // If it wasn't a builtin binary operator, it must be a user defined one. Emit
   // a call to it.
   Function *F = TheHelper->getFunction(MakeLegalFunctionName(std::string("binary")+Op));
   assert(F && "binary operator not found!");
-  
+
   Value *Ops[] = { L, R };
   return Builder.CreateCall(F, Ops, "binop");
 }
@@ -943,7 +943,7 @@ Value *CallExprAST::Codegen() {
   Function *CalleeF = TheHelper->getFunction(Callee);
   if (CalleeF == 0)
     return ErrorV("Unknown function referenced");
-  
+
   // If argument mismatch error.
   if (CalleeF->arg_size() != Args.size())
     return ErrorV("Incorrect # arguments passed");
@@ -953,56 +953,56 @@ Value *CallExprAST::Codegen() {
     ArgsV.push_back(Args[i]->Codegen());
     if (ArgsV.back() == 0) return 0;
   }
-  
+
   return Builder.CreateCall(CalleeF, ArgsV, "calltmp");
 }
 
 Value *IfExprAST::Codegen() {
   Value *CondV = Cond->Codegen();
   if (CondV == 0) return 0;
-  
+
   // Convert condition to a bool by comparing equal to 0.0.
-  CondV = Builder.CreateFCmpONE(CondV, 
+  CondV = Builder.CreateFCmpONE(CondV,
                               ConstantFP::get(getGlobalContext(), APFloat(0.0)),
                                 "ifcond");
-  
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
-  
+
   // Create blocks for the then and else cases.  Insert the 'then' block at the
   // end of the function.
   BasicBlock *ThenBB = BasicBlock::Create(getGlobalContext(), "then", TheFunction);
   BasicBlock *ElseBB = BasicBlock::Create(getGlobalContext(), "else");
   BasicBlock *MergeBB = BasicBlock::Create(getGlobalContext(), "ifcont");
-  
+
   Builder.CreateCondBr(CondV, ThenBB, ElseBB);
-  
+
   // Emit then value.
   Builder.SetInsertPoint(ThenBB);
-  
+
   Value *ThenV = Then->Codegen();
   if (ThenV == 0) return 0;
-  
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
   ThenBB = Builder.GetInsertBlock();
-  
+
   // Emit else block.
   TheFunction->getBasicBlockList().push_back(ElseBB);
   Builder.SetInsertPoint(ElseBB);
-  
+
   Value *ElseV = Else->Codegen();
   if (ElseV == 0) return 0;
-  
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
   ElseBB = Builder.GetInsertBlock();
-  
+
   // Emit merge block.
   TheFunction->getBasicBlockList().push_back(MergeBB);
   Builder.SetInsertPoint(MergeBB);
   PHINode *PN = Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2,
                                   "iftmp");
-  
+
   PN->addIncoming(ThenV, ThenBB);
   PN->addIncoming(ElseV, ElseBB);
   return PN;
@@ -1015,7 +1015,7 @@ Value *ForExprAST::Codegen() {
   //   start = startexpr
   //   store start -> var
   //   goto loop
-  // loop: 
+  // loop:
   //   ...
   //   bodyexpr
   //   ...
@@ -1028,40 +1028,40 @@ Value *ForExprAST::Codegen() {
   //   store nextvar -> var
   //   br endcond, loop, endloop
   // outloop:
-  
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
 
   // Create an alloca for the variable in the entry block.
   AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
-  
+
   // Emit the start code first, without 'variable' in scope.
   Value *StartVal = Start->Codegen();
   if (StartVal == 0) return 0;
-  
+
   // Store the value into the alloca.
   Builder.CreateStore(StartVal, Alloca);
-  
+
   // Make the new basic block for the loop header, inserting after current
   // block.
   BasicBlock *LoopBB = BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
-  
+
   // Insert an explicit fall through from the current block to the LoopBB.
   Builder.CreateBr(LoopBB);
 
   // Start insertion in LoopBB.
   Builder.SetInsertPoint(LoopBB);
-  
+
   // Within the loop, the variable is defined equal to the PHI node.  If it
   // shadows an existing variable, we have to restore it, so save it now.
   AllocaInst *OldVal = NamedValues[VarName];
   NamedValues[VarName] = Alloca;
-  
+
   // Emit the body of the loop.  This, like any other expr, can change the
   // current BB.  Note that we ignore the value computed by the body, but don't
   // allow an error.
   if (Body->Codegen() == 0)
     return 0;
-  
+
   // Emit the step value.
   Value *StepVal;
   if (Step) {
@@ -1071,52 +1071,52 @@ Value *ForExprAST::Codegen() {
     // If not specified, use 1.0.
     StepVal = ConstantFP::get(getGlobalContext(), APFloat(1.0));
   }
-  
+
   // Compute the end condition.
   Value *EndCond = End->Codegen();
   if (EndCond == 0) return EndCond;
-  
+
   // Reload, increment, and restore the alloca.  This handles the case where
   // the body of the loop mutates the variable.
   Value *CurVar = Builder.CreateLoad(Alloca, VarName.c_str());
   Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar");
   Builder.CreateStore(NextVar, Alloca);
-  
+
   // Convert condition to a bool by comparing equal to 0.0.
-  EndCond = Builder.CreateFCmpONE(EndCond, 
+  EndCond = Builder.CreateFCmpONE(EndCond,
                               ConstantFP::get(getGlobalContext(), APFloat(0.0)),
                                   "loopcond");
-  
+
   // Create the "after loop" block and insert it.
   BasicBlock *AfterBB = BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
-  
+
   // Insert the conditional branch into the end of LoopEndBB.
   Builder.CreateCondBr(EndCond, LoopBB, AfterBB);
-  
+
   // Any new code will be inserted in AfterBB.
   Builder.SetInsertPoint(AfterBB);
-  
+
   // Restore the unshadowed variable.
   if (OldVal)
     NamedValues[VarName] = OldVal;
   else
     NamedValues.erase(VarName);
 
-  
+
   // for expr always returns 0.0.
   return Constant::getNullValue(Type::getDoubleTy(getGlobalContext()));
 }
 
 Value *VarExprAST::Codegen() {
   std::vector<AllocaInst *> OldBindings;
-  
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
 
   // Register all variables and emit their initializer.
   for (unsigned i = 0, e = VarNames.size(); i != e; ++i) {
     const std::string &VarName = VarNames[i].first;
     ExprAST *Init = VarNames[i].second;
-    
+
     // Emit the initializer before adding the variable to scope, this prevents
     // the initializer from referencing the variable itself, and permits stuff
     // like this:
@@ -1129,22 +1129,22 @@ Value *VarExprAST::Codegen() {
     } else { // If not specified, use 0.0.
       InitVal = ConstantFP::get(getGlobalContext(), APFloat(0.0));
     }
-    
+
     AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
     Builder.CreateStore(InitVal, Alloca);
 
     // Remember the old variable binding so that we can restore the binding when
     // we unrecurse.
     OldBindings.push_back(NamedValues[VarName]);
-    
+
     // Remember this binding.
     NamedValues[VarName] = Alloca;
   }
-  
+
   // Codegen the body, now that all vars are in scope.
   Value *BodyVal = Body->Codegen();
   if (BodyVal == 0) return 0;
-  
+
   // Pop all our variables from scope.
   for (unsigned i = 0, e = VarNames.size(); i != e; ++i)
     NamedValues[VarNames[i].first] = OldBindings[i];
@@ -1155,7 +1155,7 @@ Value *VarExprAST::Codegen() {
 
 Function *PrototypeAST::Codegen() {
   // Make the function type:  double(double,double) etc.
-  std::vector<Type*> Doubles(Args.size(), 
+  std::vector<Type*> Doubles(Args.size(),
                              Type::getDoubleTy(getGlobalContext()));
   FunctionType *FT = FunctionType::get(Type::getDoubleTy(getGlobalContext()),
                                        Doubles, false);
@@ -1172,26 +1172,26 @@ Function *PrototypeAST::Codegen() {
     // Delete the one we just made and get the existing one.
     F->eraseFromParent();
     F = M->getFunction(Name);
-    
+
     // If F already has a body, reject this.
     if (!F->empty()) {
       ErrorF("redefinition of function");
       return 0;
     }
-    
+
     // If F took a different number of args, reject.
     if (F->arg_size() != Args.size()) {
       ErrorF("redefinition of function with different # args");
       return 0;
     }
   }
-  
+
   // Set names for all arguments.
   unsigned Idx = 0;
   for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
        ++AI, ++Idx)
     AI->setName(Args[Idx]);
-    
+
   return F;
 }
 
@@ -1213,19 +1213,19 @@ void PrototypeAST::CreateArgumentAllocas(Function *F) {
 
 Function *FunctionAST::Codegen() {
   NamedValues.clear();
-  
+
   Function *TheFunction = Proto->Codegen();
   if (TheFunction == 0)
     return 0;
-  
+
   // If this is an operator, install it.
   if (Proto->isBinaryOp())
     BinopPrecedence[Proto->getOperatorName()] = Proto->getBinaryPrecedence();
-  
+
   // Create a new basic block to start insertion into.
   BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
   Builder.SetInsertPoint(BB);
-  
+
   // Add all arguments to the symbol table and create their allocas.
   Proto->CreateArgumentAllocas(TheFunction);
 
@@ -1238,7 +1238,7 @@ Function *FunctionAST::Codegen() {
 
     return TheFunction;
   }
-  
+
   // Error reading body, remove function.
   TheFunction->eraseFromParent();
 
@@ -1285,7 +1285,7 @@ static void HandleTopLevelExpression() {
     if (Function *LF = F->Codegen()) {
       // JIT the function, returning a function pointer.
       void *FPtr = TheHelper->getPointerToFunction(LF);
-      
+
       // Cast it to the right type (takes no arguments, returns a double) so we
       // can call it as a native function.
       double (*FP)() = (double (*)())(intptr_t)FPtr;
@@ -1322,20 +1322,20 @@ static void MainLoop() {
 //===----------------------------------------------------------------------===//
 
 /// putchard - putchar that takes a double and returns 0.
-extern "C" 
+extern "C"
 double putchard(double X) {
   putchar((char)X);
   return 0;
 }
 
 /// printd - printf that takes a double prints it as "%f\n", returning 0.
-extern "C" 
+extern "C"
 double printd(double X) {
   printf("%f", X);
   return 0;
 }
 
-extern "C" 
+extern "C"
 double printlf() {
   printf("\n");
   return 0;
diff --git a/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp b/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp
index 98c1001..07adbd4 100644
--- a/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp
+++ b/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp
@@ -6,9 +6,9 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
 #include <cctype>
diff --git a/examples/Kaleidoscope/MCJIT/lazy/toy.cpp b/examples/Kaleidoscope/MCJIT/lazy/toy.cpp
index 9c2a0d4..fe7fb61 100644
--- a/examples/Kaleidoscope/MCJIT/lazy/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/lazy/toy.cpp
@@ -8,9 +8,9 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
 #include <cctype>
@@ -34,14 +34,14 @@ enum Token {
 
   // primary
   tok_identifier = -4, tok_number = -5,
-  
+
   // control
   tok_if = -6, tok_then = -7, tok_else = -8,
   tok_for = -9, tok_in = -10,
-  
+
   // operators
   tok_binary = -11, tok_unary = -12,
-  
+
   // var definition
   tok_var = -13
 };
@@ -90,11 +90,11 @@ static int gettok() {
     // Comment until end of line.
     do LastChar = getchar();
     while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
-    
+
     if (LastChar != EOF)
       return gettok();
   }
-  
+
   // Check for end of file.  Don't eat the EOF.
   if (LastChar == EOF)
     return tok_eof;
@@ -138,7 +138,7 @@ class UnaryExprAST : public ExprAST {
   char Opcode;
   ExprAST *Operand;
 public:
-  UnaryExprAST(char opcode, ExprAST *operand) 
+  UnaryExprAST(char opcode, ExprAST *operand)
     : Opcode(opcode), Operand(operand) {}
   virtual Value *Codegen();
 };
@@ -148,7 +148,7 @@ class BinaryExprAST : public ExprAST {
   char Op;
   ExprAST *LHS, *RHS;
 public:
-  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs) 
+  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs)
     : Op(op), LHS(lhs), RHS(rhs) {}
   virtual Value *Codegen();
 };
@@ -191,7 +191,7 @@ public:
   VarExprAST(const std::vector<std::pair<std::string, ExprAST*> > &varnames,
              ExprAST *body)
   : VarNames(varnames), Body(body) {}
-  
+
   virtual Value *Codegen();
 };
 
@@ -206,19 +206,19 @@ public:
   PrototypeAST(const std::string &name, const std::vector<std::string> &args,
                bool isoperator = false, unsigned prec = 0)
   : Name(name), Args(args), isOperator(isoperator), Precedence(prec) {}
-  
+
   bool isUnaryOp() const { return isOperator && Args.size() == 1; }
   bool isBinaryOp() const { return isOperator && Args.size() == 2; }
-  
+
   char getOperatorName() const {
     assert(isUnaryOp() || isBinaryOp());
     return Name[Name.size()-1];
   }
-  
+
   unsigned getBinaryPrecedence() const { return Precedence; }
-  
+
   Function *Codegen();
-  
+
   void CreateArgumentAllocas(Function *F);
 };
 
@@ -229,7 +229,7 @@ class FunctionAST {
 public:
   FunctionAST(PrototypeAST *proto, ExprAST *body)
     : Proto(proto), Body(body) {}
-  
+
   Function *Codegen();
 };
 
@@ -253,7 +253,7 @@ static std::map<char, int> BinopPrecedence;
 static int GetTokPrecedence() {
   if (!isascii(CurTok))
     return -1;
-  
+
   // Make sure it's a declared binop.
   int TokPrec = BinopPrecedence[CurTok];
   if (TokPrec <= 0) return -1;
@@ -272,12 +272,12 @@ static ExprAST *ParseExpression();
 ///   ::= identifier '(' expression* ')'
 static ExprAST *ParseIdentifierExpr() {
   std::string IdName = IdentifierStr;
-  
+
   getNextToken();  // eat identifier.
-  
+
   if (CurTok != '(') // Simple variable ref.
     return new VariableExprAST(IdName);
-  
+
   // Call.
   getNextToken();  // eat (
   std::vector<ExprAST*> Args;
@@ -297,7 +297,7 @@ static ExprAST *ParseIdentifierExpr() {
 
   // Eat the ')'.
   getNextToken();
-  
+
   return new CallExprAST(IdName, Args);
 }
 
@@ -313,7 +313,7 @@ static ExprAST *ParseParenExpr() {
   getNextToken();  // eat (.
   ExprAST *V = ParseExpression();
   if (!V) return 0;
-  
+
   if (CurTok != ')')
     return Error("expected ')'");
   getNextToken();  // eat ).
@@ -323,26 +323,26 @@ static ExprAST *ParseParenExpr() {
 /// ifexpr ::= 'if' expression 'then' expression 'else' expression
 static ExprAST *ParseIfExpr() {
   getNextToken();  // eat the if.
-  
+
   // condition.
   ExprAST *Cond = ParseExpression();
   if (!Cond) return 0;
-  
+
   if (CurTok != tok_then)
     return Error("expected then");
   getNextToken();  // eat the then
-  
+
   ExprAST *Then = ParseExpression();
   if (Then == 0) return 0;
-  
+
   if (CurTok != tok_else)
     return Error("expected else");
-  
+
   getNextToken();
-  
+
   ExprAST *Else = ParseExpression();
   if (!Else) return 0;
-  
+
   return new IfExprAST(Cond, Then, Else);
 }
 
@@ -352,24 +352,24 @@ static ExprAST *ParseForExpr() {
 
   if (CurTok != tok_identifier)
     return Error("expected identifier after for");
-  
+
   std::string IdName = IdentifierStr;
   getNextToken();  // eat identifier.
-  
+
   if (CurTok != '=')
     return Error("expected '=' after for");
   getNextToken();  // eat '='.
-  
-  
+
+
   ExprAST *Start = ParseExpression();
   if (Start == 0) return 0;
   if (CurTok != ',')
     return Error("expected ',' after for start value");
   getNextToken();
-  
+
   ExprAST *End = ParseExpression();
   if (End == 0) return 0;
-  
+
   // The step value is optional.
   ExprAST *Step = 0;
   if (CurTok == ',') {
@@ -377,18 +377,18 @@ static ExprAST *ParseForExpr() {
     Step = ParseExpression();
     if (Step == 0) return 0;
   }
-  
+
   if (CurTok != tok_in)
     return Error("expected 'in' after for");
   getNextToken();  // eat 'in'.
-  
+
   ExprAST *Body = ParseExpression();
   if (Body == 0) return 0;
 
   return new ForExprAST(IdName, Start, End, Step, Body);
 }
 
-/// varexpr ::= 'var' identifier ('=' expression)? 
+/// varexpr ::= 'var' identifier ('=' expression)?
 //                    (',' identifier ('=' expression)?)* 'in' expression
 static ExprAST *ParseVarExpr() {
   getNextToken();  // eat the var.
@@ -398,7 +398,7 @@ static ExprAST *ParseVarExpr() {
   // At least one variable name is required.
   if (CurTok != tok_identifier)
     return Error("expected identifier after var");
-  
+
   while (1) {
     std::string Name = IdentifierStr;
     getNextToken();  // eat identifier.
@@ -407,29 +407,29 @@ static ExprAST *ParseVarExpr() {
     ExprAST *Init = 0;
     if (CurTok == '=') {
       getNextToken(); // eat the '='.
-      
+
       Init = ParseExpression();
       if (Init == 0) return 0;
     }
-    
+
     VarNames.push_back(std::make_pair(Name, Init));
-    
+
     // End of var list, exit loop.
     if (CurTok != ',') break;
     getNextToken(); // eat the ','.
-    
+
     if (CurTok != tok_identifier)
       return Error("expected identifier list after var");
   }
-  
+
   // At this point, we have to have 'in'.
   if (CurTok != tok_in)
     return Error("expected 'in' keyword after 'var'");
   getNextToken();  // eat 'in'.
-  
+
   ExprAST *Body = ParseExpression();
   if (Body == 0) return 0;
-  
+
   return new VarExprAST(VarNames, Body);
 }
 
@@ -459,7 +459,7 @@ static ExprAST *ParseUnary() {
   // If the current token is not an operator, it must be a primary expr.
   if (!isascii(CurTok) || CurTok == '(' || CurTok == ',')
     return ParsePrimary();
-  
+
   // If this is a unary operator, read it.
   int Opc = CurTok;
   getNextToken();
@@ -474,20 +474,20 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
   // If this is a binop, find its precedence.
   while (1) {
     int TokPrec = GetTokPrecedence();
-    
+
     // If this is a binop that binds at least as tightly as the current binop,
     // consume it, otherwise we are done.
     if (TokPrec < ExprPrec)
       return LHS;
-    
+
     // Okay, we know this is a binop.
     int BinOp = CurTok;
     getNextToken();  // eat binop
-    
+
     // Parse the unary expression after the binary operator.
     ExprAST *RHS = ParseUnary();
     if (!RHS) return 0;
-    
+
     // If BinOp binds less tightly with RHS than the operator after RHS, let
     // the pending operator take RHS as its LHS.
     int NextPrec = GetTokPrecedence();
@@ -495,7 +495,7 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
       RHS = ParseBinOpRHS(TokPrec+1, RHS);
       if (RHS == 0) return 0;
     }
-    
+
     // Merge LHS/RHS.
     LHS = new BinaryExprAST(BinOp, LHS, RHS);
   }
@@ -507,7 +507,7 @@ static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
 static ExprAST *ParseExpression() {
   ExprAST *LHS = ParseUnary();
   if (!LHS) return 0;
-  
+
   return ParseBinOpRHS(0, LHS);
 }
 
@@ -517,10 +517,10 @@ static ExprAST *ParseExpression() {
 ///   ::= unary LETTER (id)
 static PrototypeAST *ParsePrototype() {
   std::string FnName;
-  
+
   unsigned Kind = 0; // 0 = identifier, 1 = unary, 2 = binary.
   unsigned BinaryPrecedence = 30;
-  
+
   switch (CurTok) {
   default:
     return ErrorP("Expected function name in prototype");
@@ -546,7 +546,7 @@ static PrototypeAST *ParsePrototype() {
     FnName += (char)CurTok;
     Kind = 2;
     getNextToken();
-    
+
     // Read the precedence if present.
     if (CurTok == tok_number) {
       if (NumVal < 1 || NumVal > 100)
@@ -556,23 +556,23 @@ static PrototypeAST *ParsePrototype() {
     }
     break;
   }
-  
+
   if (CurTok != '(')
     return ErrorP("Expected '(' in prototype");
-  
+
   std::vector<std::string> ArgNames;
   while (getNextToken() == tok_identifier)
     ArgNames.push_back(IdentifierStr);
   if (CurTok != ')')
     return ErrorP("Expected ')' in prototype");
-  
+
   // success.
   getNextToken();  // eat ')'.
-  
+
   // Verify right number of names for operator.
   if (Kind && ArgNames.size() != Kind)
     return ErrorP("Invalid number of operands for operator");
-  
+
   return new PrototypeAST(FnName, ArgNames, Kind != 0, BinaryPrecedence);
 }
 
@@ -673,14 +673,14 @@ private:
 
 class HelpingMemoryManager : public SectionMemoryManager
 {
-  HelpingMemoryManager(const HelpingMemoryManager&) LLVM_DELETED_FUNCTION;
-  void operator=(const HelpingMemoryManager&) LLVM_DELETED_FUNCTION;
+  HelpingMemoryManager(const HelpingMemoryManager&) = delete;
+  void operator=(const HelpingMemoryManager&) = delete;
 
 public:
   HelpingMemoryManager(MCJITHelper *Helper) : MasterHelper(Helper) {}
   virtual ~HelpingMemoryManager() {}
 
-  /// This method returns the address of the specified function. 
+  /// This method returns the address of the specified function.
   /// Our implementation will attempt to find functions in other
   /// modules associated with the MCJITHelper to cross link functions
   /// from one generated module to another.
@@ -749,9 +749,9 @@ Function *MCJITHelper::getFunction(const std::string FnName) {
 
       // If we don't have a prototype yet, create one.
       if (!PF)
-        PF = Function::Create(F->getFunctionType(), 
-                                      Function::ExternalLinkage, 
-                                      FnName, 
+        PF = Function::Create(F->getFunctionType(),
+                                      Function::ExternalLinkage,
+                                      FnName,
                                       OpenModule);
       return PF;
     }
@@ -925,11 +925,11 @@ Value *VariableExprAST::Codegen() {
 Value *UnaryExprAST::Codegen() {
   Value *OperandV = Operand->Codegen();
   if (OperandV == 0) return 0;
-  
+
   Function *F = TheHelper->getFunction(MakeLegalFunctionName(std::string("unary")+Opcode));
   if (F == 0)
     return ErrorV("Unknown unary operator");
-  
+
   return Builder.CreateCall(F, OperandV, "unop");
 }
 
@@ -951,11 +951,11 @@ Value *BinaryExprAST::Codegen() {
     Builder.CreateStore(Val, Variable);
     return Val;
   }
-  
+
   Value *L = LHS->Codegen();
   Value *R = RHS->Codegen();
   if (L == 0 || R == 0) return 0;
-  
+
   switch (Op) {
   case '+': return Builder.CreateFAdd(L, R, "addtmp");
   case '-': return Builder.CreateFSub(L, R, "subtmp");
@@ -968,12 +968,12 @@ Value *BinaryExprAST::Codegen() {
                                 "booltmp");
   default: break;
   }
-  
+
   // If it wasn't a builtin binary operator, it must be a user defined one. Emit
   // a call to it.
   Function *F = TheHelper->getFunction(MakeLegalFunctionName(std::string("binary")+Op));
   assert(F && "binary operator not found!");
-  
+
   Value *Ops[] = { L, R };
   return Builder.CreateCall(F, Ops, "binop");
 }
@@ -983,7 +983,7 @@ Value *CallExprAST::Codegen() {
   Function *CalleeF = TheHelper->getFunction(Callee);
   if (CalleeF == 0)
     return ErrorV("Unknown function referenced");
-  
+
   // If argument mismatch error.
   if (CalleeF->arg_size() != Args.size())
     return ErrorV("Incorrect # arguments passed");
@@ -993,56 +993,56 @@ Value *CallExprAST::Codegen() {
     ArgsV.push_back(Args[i]->Codegen());
     if (ArgsV.back() == 0) return 0;
   }
-  
+
   return Builder.CreateCall(CalleeF, ArgsV, "calltmp");
 }
 
 Value *IfExprAST::Codegen() {
   Value *CondV = Cond->Codegen();
   if (CondV == 0) return 0;
-  
+
   // Convert condition to a bool by comparing equal to 0.0.
-  CondV = Builder.CreateFCmpONE(CondV, 
+  CondV = Builder.CreateFCmpONE(CondV,
                               ConstantFP::get(getGlobalContext(), APFloat(0.0)),
                                 "ifcond");
-  
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
-  
+
   // Create blocks for the then and else cases.  Insert the 'then' block at the
   // end of the function.
   BasicBlock *ThenBB = BasicBlock::Create(getGlobalContext(), "then", TheFunction);
   BasicBlock *ElseBB = BasicBlock::Create(getGlobalContext(), "else");
   BasicBlock *MergeBB = BasicBlock::Create(getGlobalContext(), "ifcont");
-  
+
   Builder.CreateCondBr(CondV, ThenBB, ElseBB);
-  
+
   // Emit then value.
   Builder.SetInsertPoint(ThenBB);
-  
+
   Value *ThenV = Then->Codegen();
   if (ThenV == 0) return 0;
-  
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
   ThenBB = Builder.GetInsertBlock();
-  
+
   // Emit else block.
   TheFunction->getBasicBlockList().push_back(ElseBB);
   Builder.SetInsertPoint(ElseBB);
-  
+
   Value *ElseV = Else->Codegen();
   if (ElseV == 0) return 0;
-  
+
   Builder.CreateBr(MergeBB);
   // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
   ElseBB = Builder.GetInsertBlock();
-  
+
   // Emit merge block.
   TheFunction->getBasicBlockList().push_back(MergeBB);
   Builder.SetInsertPoint(MergeBB);
   PHINode *PN = Builder.CreatePHI(Type::getDoubleTy(getGlobalContext()), 2,
                                   "iftmp");
-  
+
   PN->addIncoming(ThenV, ThenBB);
   PN->addIncoming(ElseV, ElseBB);
   return PN;
@@ -1055,7 +1055,7 @@ Value *ForExprAST::Codegen() {
   //   start = startexpr
   //   store start -> var
   //   goto loop
-  // loop: 
+  // loop:
   //   ...
   //   bodyexpr
   //   ...
@@ -1068,40 +1068,40 @@ Value *ForExprAST::Codegen() {
   //   store nextvar -> var
   //   br endcond, loop, endloop
   // outloop:
-  
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
 
   // Create an alloca for the variable in the entry block.
   AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
-  
+
   // Emit the start code first, without 'variable' in scope.
   Value *StartVal = Start->Codegen();
   if (StartVal == 0) return 0;
-  
+
   // Store the value into the alloca.
   Builder.CreateStore(StartVal, Alloca);
-  
+
   // Make the new basic block for the loop header, inserting after current
   // block.
   BasicBlock *LoopBB = BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
-  
+
   // Insert an explicit fall through from the current block to the LoopBB.
   Builder.CreateBr(LoopBB);
 
   // Start insertion in LoopBB.
   Builder.SetInsertPoint(LoopBB);
-  
+
   // Within the loop, the variable is defined equal to the PHI node.  If it
   // shadows an existing variable, we have to restore it, so save it now.
   AllocaInst *OldVal = NamedValues[VarName];
   NamedValues[VarName] = Alloca;
-  
+
   // Emit the body of the loop.  This, like any other expr, can change the
   // current BB.  Note that we ignore the value computed by the body, but don't
   // allow an error.
   if (Body->Codegen() == 0)
     return 0;
-  
+
   // Emit the step value.
   Value *StepVal;
   if (Step) {
@@ -1111,52 +1111,52 @@ Value *ForExprAST::Codegen() {
     // If not specified, use 1.0.
     StepVal = ConstantFP::get(getGlobalContext(), APFloat(1.0));
   }
-  
+
   // Compute the end condition.
   Value *EndCond = End->Codegen();
   if (EndCond == 0) return EndCond;
-  
+
   // Reload, increment, and restore the alloca.  This handles the case where
   // the body of the loop mutates the variable.
   Value *CurVar = Builder.CreateLoad(Alloca, VarName.c_str());
   Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar");
   Builder.CreateStore(NextVar, Alloca);
-  
+
   // Convert condition to a bool by comparing equal to 0.0.
-  EndCond = Builder.CreateFCmpONE(EndCond, 
+  EndCond = Builder.CreateFCmpONE(EndCond,
                               ConstantFP::get(getGlobalContext(), APFloat(0.0)),
                                   "loopcond");
-  
+
   // Create the "after loop" block and insert it.
   BasicBlock *AfterBB = BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
-  
+
   // Insert the conditional branch into the end of LoopEndBB.
   Builder.CreateCondBr(EndCond, LoopBB, AfterBB);
-  
+
   // Any new code will be inserted in AfterBB.
   Builder.SetInsertPoint(AfterBB);
-  
+
   // Restore the unshadowed variable.
   if (OldVal)
     NamedValues[VarName] = OldVal;
   else
     NamedValues.erase(VarName);
 
-  
+
   // for expr always returns 0.0.
   return Constant::getNullValue(Type::getDoubleTy(getGlobalContext()));
 }
 
 Value *VarExprAST::Codegen() {
   std::vector<AllocaInst *> OldBindings;
-  
+
   Function *TheFunction = Builder.GetInsertBlock()->getParent();
 
   // Register all variables and emit their initializer.
   for (unsigned i = 0, e = VarNames.size(); i != e; ++i) {
     const std::string &VarName = VarNames[i].first;
     ExprAST *Init = VarNames[i].second;
-    
+
     // Emit the initializer before adding the variable to scope, this prevents
     // the initializer from referencing the variable itself, and permits stuff
     // like this:
@@ -1169,22 +1169,22 @@ Value *VarExprAST::Codegen() {
     } else { // If not specified, use 0.0.
       InitVal = ConstantFP::get(getGlobalContext(), APFloat(0.0));
     }
-    
+
     AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
     Builder.CreateStore(InitVal, Alloca);
 
     // Remember the old variable binding so that we can restore the binding when
     // we unrecurse.
     OldBindings.push_back(NamedValues[VarName]);
-    
+
     // Remember this binding.
     NamedValues[VarName] = Alloca;
   }
-  
+
   // Codegen the body, now that all vars are in scope.
   Value *BodyVal = Body->Codegen();
   if (BodyVal == 0) return 0;
-  
+
   // Pop all our variables from scope.
   for (unsigned i = 0, e = VarNames.size(); i != e; ++i)
     NamedValues[VarNames[i].first] = OldBindings[i];
@@ -1195,7 +1195,7 @@ Value *VarExprAST::Codegen() {
 
 Function *PrototypeAST::Codegen() {
   // Make the function type:  double(double,double) etc.
-  std::vector<Type*> Doubles(Args.size(), 
+  std::vector<Type*> Doubles(Args.size(),
                              Type::getDoubleTy(getGlobalContext()));
   FunctionType *FT = FunctionType::get(Type::getDoubleTy(getGlobalContext()),
                                        Doubles, false);
@@ -1212,26 +1212,26 @@ Function *PrototypeAST::Codegen() {
     // Delete the one we just made and get the existing one.
     F->eraseFromParent();
     F = M->getFunction(Name);
-    
+
     // If F already has a body, reject this.
     if (!F->empty()) {
       ErrorF("redefinition of function");
       return 0;
     }
-    
+
     // If F took a different number of args, reject.
     if (F->arg_size() != Args.size()) {
       ErrorF("redefinition of function with different # args");
       return 0;
     }
   }
-  
+
   // Set names for all arguments.
   unsigned Idx = 0;
   for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
        ++AI, ++Idx)
     AI->setName(Args[Idx]);
-    
+
   return F;
 }
 
@@ -1253,19 +1253,19 @@ void PrototypeAST::CreateArgumentAllocas(Function *F) {
 
 Function *FunctionAST::Codegen() {
   NamedValues.clear();
-  
+
   Function *TheFunction = Proto->Codegen();
   if (TheFunction == 0)
     return 0;
-  
+
   // If this is an operator, install it.
   if (Proto->isBinaryOp())
     BinopPrecedence[Proto->getOperatorName()] = Proto->getBinaryPrecedence();
-  
+
   // Create a new basic block to start insertion into.
   BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
   Builder.SetInsertPoint(BB);
-  
+
   // Add all arguments to the symbol table and create their allocas.
   Proto->CreateArgumentAllocas(TheFunction);
 
@@ -1326,7 +1326,7 @@ static void HandleTopLevelExpression() {
     if (Function *LF = F->Codegen()) {
       // JIT the function, returning a function pointer.
       void *FPtr = TheHelper->getPointerToFunction(LF);
-      
+
       // Cast it to the right type (takes no arguments, returns a double) so we
       // can call it as a native function.
       double (*FP)() = (double (*)())(intptr_t)FPtr;
@@ -1363,20 +1363,20 @@ static void MainLoop() {
 //===----------------------------------------------------------------------===//
 
 /// putchard - putchar that takes a double and returns 0.
-extern "C" 
+extern "C"
 double putchard(double X) {
   putchar((char)X);
   return 0;
 }
 
 /// printd - printf that takes a double prints it as "%f\n", returning 0.
-extern "C" 
+extern "C"
 double printd(double X) {
   printf("%f", X);
   return 0;
 }
 
-extern "C" 
+extern "C"
 double printlf() {
   printf("\n");
   return 0;
diff --git a/examples/Kaleidoscope/Makefile b/examples/Kaleidoscope/Makefile
index bd0c252..8c3b1e3 100644
--- a/examples/Kaleidoscope/Makefile
+++ b/examples/Kaleidoscope/Makefile
@@ -10,6 +10,6 @@ LEVEL=../..
 
 include $(LEVEL)/Makefile.config
 
-PARALLEL_DIRS:= Chapter2 Chapter3 Chapter4 Chapter5 Chapter6 Chapter7
+PARALLEL_DIRS:= Chapter2 Chapter3 Chapter4 Chapter5 Chapter6 Chapter7 Chapter8
 
 include $(LEVEL)/Makefile.common
diff --git a/examples/Kaleidoscope/Orc/CMakeLists.txt b/examples/Kaleidoscope/Orc/CMakeLists.txt
new file mode 100644
index 0000000..5aa0454
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_subdirectory(initial)
+add_subdirectory(lazy_codegen)
+add_subdirectory(lazy_irgen)
+add_subdirectory(fully_lazy)
diff --git a/examples/Kaleidoscope/Orc/fully_lazy/CMakeLists.txt b/examples/Kaleidoscope/Orc/fully_lazy/CMakeLists.txt
new file mode 100644
index 0000000..abb0428
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/fully_lazy/CMakeLists.txt
@@ -0,0 +1,13 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  ExecutionEngine
+  Object
+  OrcJIT
+  RuntimeDyld
+  Support
+  native
+  )
+
+add_kaleidoscope_chapter(Kaleidoscope-Orc-fully_lazy
+  toy.cpp
+  )
diff --git a/examples/Kaleidoscope/Orc/fully_lazy/Makefile b/examples/Kaleidoscope/Orc/fully_lazy/Makefile
new file mode 100644
index 0000000..5536314
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/fully_lazy/Makefile
@@ -0,0 +1,17 @@
+UNAME := $(shell uname -s)
+
+ifeq ($(UNAME),Darwin)
+	CXX := xcrun --sdk macosx clang++
+else
+	CXX := clang++
+endif
+
+LLVM_CXXFLAGS := $(shell llvm-config --cxxflags)
+LLVM_LDFLAGS := $(shell llvm-config  --ldflags --system-libs --libs core orcjit native)
+
+toy: toy.cpp
+	$(CXX) $(LLVM_CXXFLAGS) -Wall -std=c++11 -g -O0 -rdynamic -fno-rtti -o toy toy.cpp $(LLVM_LDFLAGS)
+
+.PHONY: clean
+clean:
+	rm -f toy
diff --git a/examples/Kaleidoscope/Orc/fully_lazy/README.txt b/examples/Kaleidoscope/Orc/fully_lazy/README.txt
new file mode 100644
index 0000000..c018931
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/fully_lazy/README.txt
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===/
+//                 Kaleidoscope with Orc - Lazy IRGen Version
+//===----------------------------------------------------------------------===//
+
+This version of Kaleidoscope with Orc demonstrates fully lazy IR-generation.
+Building on the lazy-irgen version of the tutorial, this version injects JIT
+callbacks to defer the bulk of IR-generation and code-generation of functions until
+they are first called.
+
+When a function definition is entered, a JIT callback is created and a stub
+function is built that will call the body of the function indirectly. The body of
+the function is *not* IRGen'd at this point. Instead, the function pointer for
+the indirect call is initialized to point at the JIT callback, and the compile
+action for the callback is initialized with a lambda that IRGens the body of the
+function and adds it to the JIT. The function pointer is updated by the JIT
+callback's update action to point at the newly emitted function body, so future
+calls to the stub will go straight to the body, not through the JIT.
+
+This directory contains a Makefile that allows the code to be built in a
+standalone manner, independent of the larger LLVM build infrastructure. To build
+the program you will need to have 'clang++' and 'llvm-config' in your path.
diff --git a/examples/Kaleidoscope/Orc/fully_lazy/toy.cpp b/examples/Kaleidoscope/Orc/fully_lazy/toy.cpp
new file mode 100644
index 0000000..2e65756
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/fully_lazy/toy.cpp
@@ -0,0 +1,1437 @@
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"
+#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/OrcTargetSupport.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Transforms/Scalar.h"
+#include <cctype>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::orc;
+
+//===----------------------------------------------------------------------===//
+// Lexer
+//===----------------------------------------------------------------------===//
+
+// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
+// of these for known things.
+enum Token {
+  tok_eof = -1,
+
+  // commands
+  tok_def = -2, tok_extern = -3,
+
+  // primary
+  tok_identifier = -4, tok_number = -5,
+  
+  // control
+  tok_if = -6, tok_then = -7, tok_else = -8,
+  tok_for = -9, tok_in = -10,
+  
+  // operators
+  tok_binary = -11, tok_unary = -12,
+  
+  // var definition
+  tok_var = -13
+};
+
+static std::string IdentifierStr;  // Filled in if tok_identifier
+static double NumVal;              // Filled in if tok_number
+
+/// gettok - Return the next token from standard input.
+static int gettok() {
+  static int LastChar = ' ';
+
+  // Skip any whitespace.
+  while (isspace(LastChar))
+    LastChar = getchar();
+
+  if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]*
+    IdentifierStr = LastChar;
+    while (isalnum((LastChar = getchar())))
+      IdentifierStr += LastChar;
+
+    if (IdentifierStr == "def") return tok_def;
+    if (IdentifierStr == "extern") return tok_extern;
+    if (IdentifierStr == "if") return tok_if;
+    if (IdentifierStr == "then") return tok_then;
+    if (IdentifierStr == "else") return tok_else;
+    if (IdentifierStr == "for") return tok_for;
+    if (IdentifierStr == "in") return tok_in;
+    if (IdentifierStr == "binary") return tok_binary;
+    if (IdentifierStr == "unary") return tok_unary;
+    if (IdentifierStr == "var") return tok_var;
+    return tok_identifier;
+  }
+
+  if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
+    std::string NumStr;
+    do {
+      NumStr += LastChar;
+      LastChar = getchar();
+    } while (isdigit(LastChar) || LastChar == '.');
+
+    NumVal = strtod(NumStr.c_str(), 0);
+    return tok_number;
+  }
+
+  if (LastChar == '#') {
+    // Comment until end of line.
+    do LastChar = getchar();
+    while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
+    
+    if (LastChar != EOF)
+      return gettok();
+  }
+  
+  // Check for end of file.  Don't eat the EOF.
+  if (LastChar == EOF)
+    return tok_eof;
+
+  // Otherwise, just return the character as its ascii value.
+  int ThisChar = LastChar;
+  LastChar = getchar();
+  return ThisChar;
+}
+
+//===----------------------------------------------------------------------===//
+// Abstract Syntax Tree (aka Parse Tree)
+//===----------------------------------------------------------------------===//
+
+class IRGenContext;
+
+/// ExprAST - Base class for all expression nodes.
+struct ExprAST {
+  virtual ~ExprAST() {}
+  virtual Value *IRGen(IRGenContext &C) const = 0;
+};
+
+/// NumberExprAST - Expression class for numeric literals like "1.0".
+struct NumberExprAST : public ExprAST {
+  NumberExprAST(double Val) : Val(Val) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  double Val;
+};
+
+/// VariableExprAST - Expression class for referencing a variable, like "a".
+struct VariableExprAST : public ExprAST {
+  VariableExprAST(std::string Name) : Name(std::move(Name)) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string Name;
+};
+
+/// UnaryExprAST - Expression class for a unary operator.
+struct UnaryExprAST : public ExprAST {
+  UnaryExprAST(char Opcode, std::unique_ptr<ExprAST> Operand) 
+    : Opcode(std::move(Opcode)), Operand(std::move(Operand)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  char Opcode;
+  std::unique_ptr<ExprAST> Operand;
+};
+
+/// BinaryExprAST - Expression class for a binary operator.
+struct BinaryExprAST : public ExprAST {
+  BinaryExprAST(char Op, std::unique_ptr<ExprAST> LHS,
+                std::unique_ptr<ExprAST> RHS) 
+    : Op(Op), LHS(std::move(LHS)), RHS(std::move(RHS)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  char Op;
+  std::unique_ptr<ExprAST> LHS, RHS;
+};
+
+/// CallExprAST - Expression class for function calls.
+struct CallExprAST : public ExprAST {
+  CallExprAST(std::string CalleeName,
+              std::vector<std::unique_ptr<ExprAST>> Args)
+    : CalleeName(std::move(CalleeName)), Args(std::move(Args)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string CalleeName;
+  std::vector<std::unique_ptr<ExprAST>> Args;
+};
+
+/// IfExprAST - Expression class for if/then/else.
+struct IfExprAST : public ExprAST {
+  IfExprAST(std::unique_ptr<ExprAST> Cond, std::unique_ptr<ExprAST> Then,
+            std::unique_ptr<ExprAST> Else)
+    : Cond(std::move(Cond)), Then(std::move(Then)), Else(std::move(Else)) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::unique_ptr<ExprAST> Cond, Then, Else;
+};
+
+/// ForExprAST - Expression class for for/in.
+struct ForExprAST : public ExprAST {
+  ForExprAST(std::string VarName, std::unique_ptr<ExprAST> Start,
+             std::unique_ptr<ExprAST> End, std::unique_ptr<ExprAST> Step,
+             std::unique_ptr<ExprAST> Body)
+    : VarName(std::move(VarName)), Start(std::move(Start)), End(std::move(End)),
+      Step(std::move(Step)), Body(std::move(Body)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string VarName;
+  std::unique_ptr<ExprAST> Start, End, Step, Body;
+};
+
+/// VarExprAST - Expression class for var/in
+struct VarExprAST : public ExprAST {
+  typedef std::pair<std::string, std::unique_ptr<ExprAST>> Binding;
+  typedef std::vector<Binding> BindingList;
+
+  VarExprAST(BindingList VarBindings, std::unique_ptr<ExprAST> Body)
+    : VarBindings(std::move(VarBindings)), Body(std::move(Body)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  BindingList VarBindings;
+  std::unique_ptr<ExprAST> Body;
+};
+
+/// PrototypeAST - This class represents the "prototype" for a function,
+/// which captures its argument names as well as if it is an operator.
+struct PrototypeAST {
+  PrototypeAST(std::string Name, std::vector<std::string> Args,
+               bool IsOperator = false, unsigned Precedence = 0)
+    : Name(std::move(Name)), Args(std::move(Args)), IsOperator(IsOperator),
+      Precedence(Precedence) {}
+
+  Function *IRGen(IRGenContext &C) const;
+  void CreateArgumentAllocas(Function *F, IRGenContext &C);
+
+  bool isUnaryOp() const { return IsOperator && Args.size() == 1; }
+  bool isBinaryOp() const { return IsOperator && Args.size() == 2; }
+  
+  char getOperatorName() const {
+    assert(isUnaryOp() || isBinaryOp());
+    return Name[Name.size()-1];
+  }
+
+  std::string Name;
+  std::vector<std::string> Args;
+  bool IsOperator;
+  unsigned Precedence;  // Precedence if a binary op.
+};
+
+/// FunctionAST - This class represents a function definition itself.
+struct FunctionAST {
+  FunctionAST(std::unique_ptr<PrototypeAST> Proto,
+              std::unique_ptr<ExprAST> Body)
+    : Proto(std::move(Proto)), Body(std::move(Body)) {}
+
+  Function *IRGen(IRGenContext &C) const;
+
+  std::unique_ptr<PrototypeAST> Proto;
+  std::unique_ptr<ExprAST> Body;
+};
+
+//===----------------------------------------------------------------------===//
+// Parser
+//===----------------------------------------------------------------------===//
+
+/// CurTok/getNextToken - Provide a simple token buffer.  CurTok is the current
+/// token the parser is looking at.  getNextToken reads another token from the
+/// lexer and updates CurTok with its results.
+static int CurTok;
+static int getNextToken() {
+  return CurTok = gettok();
+}
+
+/// BinopPrecedence - This holds the precedence for each binary operator that is
+/// defined.
+static std::map<char, int> BinopPrecedence;
+
+/// GetTokPrecedence - Get the precedence of the pending binary operator token.
+static int GetTokPrecedence() {
+  if (!isascii(CurTok))
+    return -1;
+  
+  // Make sure it's a declared binop.
+  int TokPrec = BinopPrecedence[CurTok];
+  if (TokPrec <= 0) return -1;
+  return TokPrec;
+}
+
+template <typename T>
+std::unique_ptr<T> ErrorU(const std::string &Str) {
+  std::cerr << "Error: " << Str << "\n";
+  return nullptr;
+}
+
+template <typename T>
+T* ErrorP(const std::string &Str) {
+  std::cerr << "Error: " << Str << "\n";
+  return nullptr;
+}
+
+static std::unique_ptr<ExprAST> ParseExpression();
+
+/// identifierexpr
+///   ::= identifier
+///   ::= identifier '(' expression* ')'
+static std::unique_ptr<ExprAST> ParseIdentifierExpr() {
+  std::string IdName = IdentifierStr;
+  
+  getNextToken();  // eat identifier.
+  
+  if (CurTok != '(') // Simple variable ref.
+    return llvm::make_unique<VariableExprAST>(IdName);
+  
+  // Call.
+  getNextToken();  // eat (
+  std::vector<std::unique_ptr<ExprAST>> Args;
+  if (CurTok != ')') {
+    while (1) {
+      auto Arg = ParseExpression();
+      if (!Arg) return nullptr;
+      Args.push_back(std::move(Arg));
+
+      if (CurTok == ')') break;
+
+      if (CurTok != ',')
+        return ErrorU<CallExprAST>("Expected ')' or ',' in argument list");
+      getNextToken();
+    }
+  }
+
+  // Eat the ')'.
+  getNextToken();
+  
+  return llvm::make_unique<CallExprAST>(IdName, std::move(Args));
+}
+
+/// numberexpr ::= number
+static std::unique_ptr<NumberExprAST> ParseNumberExpr() {
+  auto Result = llvm::make_unique<NumberExprAST>(NumVal);
+  getNextToken(); // consume the number
+  return Result;
+}
+
+/// parenexpr ::= '(' expression ')'
+static std::unique_ptr<ExprAST> ParseParenExpr() {
+  getNextToken();  // eat (.
+  auto V = ParseExpression();
+  if (!V)
+    return nullptr;
+  
+  if (CurTok != ')')
+    return ErrorU<ExprAST>("expected ')'");
+  getNextToken();  // eat ).
+  return V;
+}
+
+/// ifexpr ::= 'if' expression 'then' expression 'else' expression
+static std::unique_ptr<ExprAST> ParseIfExpr() {
+  getNextToken();  // eat the if.
+  
+  // condition.
+  auto Cond = ParseExpression();
+  if (!Cond)
+    return nullptr;
+  
+  if (CurTok != tok_then)
+    return ErrorU<ExprAST>("expected then");
+  getNextToken();  // eat the then
+  
+  auto Then = ParseExpression();
+  if (!Then)
+    return nullptr;
+  
+  if (CurTok != tok_else)
+    return ErrorU<ExprAST>("expected else");
+  
+  getNextToken();
+  
+  auto Else = ParseExpression();
+  if (!Else)
+    return nullptr;
+  
+  return llvm::make_unique<IfExprAST>(std::move(Cond), std::move(Then),
+                                      std::move(Else));
+}
+
+/// forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression
+static std::unique_ptr<ForExprAST> ParseForExpr() {
+  getNextToken();  // eat the for.
+
+  if (CurTok != tok_identifier)
+    return ErrorU<ForExprAST>("expected identifier after for");
+  
+  std::string IdName = IdentifierStr;
+  getNextToken();  // eat identifier.
+  
+  if (CurTok != '=')
+    return ErrorU<ForExprAST>("expected '=' after for");
+  getNextToken();  // eat '='.
+  
+  
+  auto Start = ParseExpression();
+  if (!Start)
+    return nullptr;
+  if (CurTok != ',')
+    return ErrorU<ForExprAST>("expected ',' after for start value");
+  getNextToken();
+  
+  auto End = ParseExpression();
+  if (!End)
+    return nullptr;
+  
+  // The step value is optional.
+  std::unique_ptr<ExprAST> Step;
+  if (CurTok == ',') {
+    getNextToken();
+    Step = ParseExpression();
+    if (!Step)
+      return nullptr;
+  }
+  
+  if (CurTok != tok_in)
+    return ErrorU<ForExprAST>("expected 'in' after for");
+  getNextToken();  // eat 'in'.
+  
+  auto Body = ParseExpression();
+  if (Body)
+    return nullptr;
+
+  return llvm::make_unique<ForExprAST>(IdName, std::move(Start), std::move(End),
+                                       std::move(Step), std::move(Body));
+}
+
+/// varexpr ::= 'var' identifier ('=' expression)? 
+//                    (',' identifier ('=' expression)?)* 'in' expression
+static std::unique_ptr<VarExprAST> ParseVarExpr() {
+  getNextToken();  // eat the var.
+
+  VarExprAST::BindingList VarBindings;
+
+  // At least one variable name is required.
+  if (CurTok != tok_identifier)
+    return ErrorU<VarExprAST>("expected identifier after var");
+  
+  while (1) {
+    std::string Name = IdentifierStr;
+    getNextToken();  // eat identifier.
+
+    // Read the optional initializer.
+    std::unique_ptr<ExprAST> Init;
+    if (CurTok == '=') {
+      getNextToken(); // eat the '='.
+      
+      Init = ParseExpression();
+      if (!Init)
+        return nullptr;
+    }
+    
+    VarBindings.push_back(VarExprAST::Binding(Name, std::move(Init)));
+    
+    // End of var list, exit loop.
+    if (CurTok != ',') break;
+    getNextToken(); // eat the ','.
+    
+    if (CurTok != tok_identifier)
+      return ErrorU<VarExprAST>("expected identifier list after var");
+  }
+  
+  // At this point, we have to have 'in'.
+  if (CurTok != tok_in)
+    return ErrorU<VarExprAST>("expected 'in' keyword after 'var'");
+  getNextToken();  // eat 'in'.
+  
+  auto Body = ParseExpression();
+  if (!Body)
+    return nullptr;
+  
+  return llvm::make_unique<VarExprAST>(std::move(VarBindings), std::move(Body));
+}
+
+/// primary
+///   ::= identifierexpr
+///   ::= numberexpr
+///   ::= parenexpr
+///   ::= ifexpr
+///   ::= forexpr
+///   ::= varexpr
+static std::unique_ptr<ExprAST> ParsePrimary() {
+  switch (CurTok) {
+  default: return ErrorU<ExprAST>("unknown token when expecting an expression");
+  case tok_identifier: return ParseIdentifierExpr();
+  case tok_number:     return ParseNumberExpr();
+  case '(':            return ParseParenExpr();
+  case tok_if:         return ParseIfExpr();
+  case tok_for:        return ParseForExpr();
+  case tok_var:        return ParseVarExpr();
+  }
+}
+
+/// unary
+///   ::= primary
+///   ::= '!' unary
+static std::unique_ptr<ExprAST> ParseUnary() {
+  // If the current token is not an operator, it must be a primary expr.
+  if (!isascii(CurTok) || CurTok == '(' || CurTok == ',')
+    return ParsePrimary();
+  
+  // If this is a unary operator, read it.
+  int Opc = CurTok;
+  getNextToken();
+  if (auto Operand = ParseUnary())
+    return llvm::make_unique<UnaryExprAST>(Opc, std::move(Operand));
+  return nullptr;
+}
+
+/// binoprhs
+///   ::= ('+' unary)*
+static std::unique_ptr<ExprAST> ParseBinOpRHS(int ExprPrec,
+                                              std::unique_ptr<ExprAST> LHS) {
+  // If this is a binop, find its precedence.
+  while (1) {
+    int TokPrec = GetTokPrecedence();
+    
+    // If this is a binop that binds at least as tightly as the current binop,
+    // consume it, otherwise we are done.
+    if (TokPrec < ExprPrec)
+      return LHS;
+    
+    // Okay, we know this is a binop.
+    int BinOp = CurTok;
+    getNextToken();  // eat binop
+    
+    // Parse the unary expression after the binary operator.
+    auto RHS = ParseUnary();
+    if (!RHS)
+      return nullptr;
+    
+    // If BinOp binds less tightly with RHS than the operator after RHS, let
+    // the pending operator take RHS as its LHS.
+    int NextPrec = GetTokPrecedence();
+    if (TokPrec < NextPrec) {
+      RHS = ParseBinOpRHS(TokPrec+1, std::move(RHS));
+      if (!RHS)
+        return nullptr;
+    }
+    
+    // Merge LHS/RHS.
+    LHS = llvm::make_unique<BinaryExprAST>(BinOp, std::move(LHS), std::move(RHS));
+  }
+}
+
+/// expression
+///   ::= unary binoprhs
+///
+static std::unique_ptr<ExprAST> ParseExpression() {
+  auto LHS = ParseUnary();
+  if (!LHS)
+    return nullptr;
+  
+  return ParseBinOpRHS(0, std::move(LHS));
+}
+
+/// prototype
+///   ::= id '(' id* ')'
+///   ::= binary LETTER number? (id, id)
+///   ::= unary LETTER (id)
+static std::unique_ptr<PrototypeAST> ParsePrototype() {
+  std::string FnName;
+  
+  unsigned Kind = 0; // 0 = identifier, 1 = unary, 2 = binary.
+  unsigned BinaryPrecedence = 30;
+  
+  switch (CurTok) {
+  default:
+    return ErrorU<PrototypeAST>("Expected function name in prototype");
+  case tok_identifier:
+    FnName = IdentifierStr;
+    Kind = 0;
+    getNextToken();
+    break;
+  case tok_unary:
+    getNextToken();
+    if (!isascii(CurTok))
+      return ErrorU<PrototypeAST>("Expected unary operator");
+    FnName = "unary";
+    FnName += (char)CurTok;
+    Kind = 1;
+    getNextToken();
+    break;
+  case tok_binary:
+    getNextToken();
+    if (!isascii(CurTok))
+      return ErrorU<PrototypeAST>("Expected binary operator");
+    FnName = "binary";
+    FnName += (char)CurTok;
+    Kind = 2;
+    getNextToken();
+    
+    // Read the precedence if present.
+    if (CurTok == tok_number) {
+      if (NumVal < 1 || NumVal > 100)
+        return ErrorU<PrototypeAST>("Invalid precedecnce: must be 1..100");
+      BinaryPrecedence = (unsigned)NumVal;
+      getNextToken();
+    }
+    break;
+  }
+  
+  if (CurTok != '(')
+    return ErrorU<PrototypeAST>("Expected '(' in prototype");
+  
+  std::vector<std::string> ArgNames;
+  while (getNextToken() == tok_identifier)
+    ArgNames.push_back(IdentifierStr);
+  if (CurTok != ')')
+    return ErrorU<PrototypeAST>("Expected ')' in prototype");
+  
+  // success.
+  getNextToken();  // eat ')'.
+  
+  // Verify right number of names for operator.
+  if (Kind && ArgNames.size() != Kind)
+    return ErrorU<PrototypeAST>("Invalid number of operands for operator");
+  
+  return llvm::make_unique<PrototypeAST>(FnName, std::move(ArgNames), Kind != 0,
+                                         BinaryPrecedence);
+}
+
+/// definition ::= 'def' prototype expression
+static std::unique_ptr<FunctionAST> ParseDefinition() {
+  getNextToken();  // eat def.
+  auto Proto = ParsePrototype();
+  if (!Proto)
+    return nullptr;
+
+  if (auto Body = ParseExpression())
+    return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(Body));
+  return nullptr;
+}
+
+/// toplevelexpr ::= expression
+static std::unique_ptr<FunctionAST> ParseTopLevelExpr() {
+  if (auto E = ParseExpression()) {
+    // Make an anonymous proto.
+    auto Proto =
+      llvm::make_unique<PrototypeAST>("__anon_expr", std::vector<std::string>());
+    return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(E));
+  }
+  return nullptr;
+}
+
+/// external ::= 'extern' prototype
+static std::unique_ptr<PrototypeAST> ParseExtern() {
+  getNextToken();  // eat extern.
+  return ParsePrototype();
+}
+
+//===----------------------------------------------------------------------===//
+// Code Generation
+//===----------------------------------------------------------------------===//
+
+// FIXME: Obviously we can do better than this
+std::string GenerateUniqueName(const std::string &Root) {
+  static int i = 0;
+  std::ostringstream NameStream;
+  NameStream << Root << ++i;
+  return NameStream.str();
+}
+
+std::string MakeLegalFunctionName(std::string Name)
+{
+  std::string NewName;
+  assert(!Name.empty() && "Base name must not be empty");
+
+  // Start with what we have
+  NewName = Name;
+
+  // Look for a numberic first character
+  if (NewName.find_first_of("0123456789") == 0) {
+    NewName.insert(0, 1, 'n');
+  }
+
+  // Replace illegal characters with their ASCII equivalent
+  std::string legal_elements = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+  size_t pos;
+  while ((pos = NewName.find_first_not_of(legal_elements)) != std::string::npos) {
+    std::ostringstream NumStream;
+    NumStream << (int)NewName.at(pos);
+    NewName = NewName.replace(pos, 1, NumStream.str());
+  }
+
+  return NewName;
+}
+
+class SessionContext {
+public:
+  SessionContext(LLVMContext &C)
+    : Context(C), TM(EngineBuilder().selectTarget()) {}
+  LLVMContext& getLLVMContext() const { return Context; }
+  TargetMachine& getTarget() { return *TM; }
+  void addPrototypeAST(std::unique_ptr<PrototypeAST> P);
+  PrototypeAST* getPrototypeAST(const std::string &Name);
+private:
+  typedef std::map<std::string, std::unique_ptr<PrototypeAST>> PrototypeMap;
+  
+  LLVMContext &Context;
+  std::unique_ptr<TargetMachine> TM;
+      
+  PrototypeMap Prototypes;
+};
+
+void SessionContext::addPrototypeAST(std::unique_ptr<PrototypeAST> P) {
+  Prototypes[P->Name] = std::move(P);
+}
+
+PrototypeAST* SessionContext::getPrototypeAST(const std::string &Name) {
+  PrototypeMap::iterator I = Prototypes.find(Name);
+  if (I != Prototypes.end())
+    return I->second.get();
+  return nullptr;
+}
+
+class IRGenContext {
+public:
+
+  IRGenContext(SessionContext &S)
+    : Session(S),
+      M(new Module(GenerateUniqueName("jit_module_"),
+                   Session.getLLVMContext())),
+      Builder(Session.getLLVMContext()) {
+    M->setDataLayout(Session.getTarget().getDataLayout());
+  }
+
+  SessionContext& getSession() { return Session; }
+  Module& getM() const { return *M; }
+  std::unique_ptr<Module> takeM() { return std::move(M); }
+  IRBuilder<>& getBuilder() { return Builder; }
+  LLVMContext& getLLVMContext() { return Session.getLLVMContext(); }
+  Function* getPrototype(const std::string &Name);
+
+  std::map<std::string, AllocaInst*> NamedValues;
+private:
+  SessionContext &Session;
+  std::unique_ptr<Module> M;
+  IRBuilder<> Builder;
+};
+
+Function* IRGenContext::getPrototype(const std::string &Name) {
+  if (Function *ExistingProto = M->getFunction(Name))
+    return ExistingProto;
+  if (PrototypeAST *ProtoAST = Session.getPrototypeAST(Name))
+    return ProtoAST->IRGen(*this);
+  return nullptr;
+}
+
+/// CreateEntryBlockAlloca - Create an alloca instruction in the entry block of
+/// the function.  This is used for mutable variables etc.
+static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction,
+                                          const std::string &VarName) {
+  IRBuilder<> TmpB(&TheFunction->getEntryBlock(),
+                 TheFunction->getEntryBlock().begin());
+  return TmpB.CreateAlloca(Type::getDoubleTy(getGlobalContext()), 0,
+                           VarName.c_str());
+}
+
+Value *NumberExprAST::IRGen(IRGenContext &C) const {
+  return ConstantFP::get(C.getLLVMContext(), APFloat(Val));
+}
+
+Value *VariableExprAST::IRGen(IRGenContext &C) const {
+  // Look this variable up in the function.
+  Value *V = C.NamedValues[Name];
+
+  if (V == 0)
+    return ErrorP<Value>("Unknown variable name '" + Name + "'");
+
+  // Load the value.
+  return C.getBuilder().CreateLoad(V, Name.c_str());
+}
+
+Value *UnaryExprAST::IRGen(IRGenContext &C) const {
+  if (Value *OperandV = Operand->IRGen(C)) {
+    std::string FnName = MakeLegalFunctionName(std::string("unary")+Opcode);
+    if (Function *F = C.getPrototype(FnName))
+      return C.getBuilder().CreateCall(F, OperandV, "unop");
+    return ErrorP<Value>("Unknown unary operator");
+  }
+
+  // Could not codegen operand - return null.
+  return nullptr;
+}
+
+Value *BinaryExprAST::IRGen(IRGenContext &C) const {
+  // Special case '=' because we don't want to emit the LHS as an expression.
+  if (Op == '=') {
+    // Assignment requires the LHS to be an identifier.
+    auto LHSVar = static_cast<VariableExprAST&>(*LHS);
+    // Codegen the RHS.
+    Value *Val = RHS->IRGen(C);
+    if (!Val) return nullptr;
+
+    // Look up the name.
+    if (auto Variable = C.NamedValues[LHSVar.Name]) {
+      C.getBuilder().CreateStore(Val, Variable);
+      return Val;
+    }
+    return ErrorP<Value>("Unknown variable name");
+  }
+  
+  Value *L = LHS->IRGen(C);
+  Value *R = RHS->IRGen(C);
+  if (!L || !R) return nullptr;
+  
+  switch (Op) {
+  case '+': return C.getBuilder().CreateFAdd(L, R, "addtmp");
+  case '-': return C.getBuilder().CreateFSub(L, R, "subtmp");
+  case '*': return C.getBuilder().CreateFMul(L, R, "multmp");
+  case '/': return C.getBuilder().CreateFDiv(L, R, "divtmp");
+  case '<':
+    L = C.getBuilder().CreateFCmpULT(L, R, "cmptmp");
+    // Convert bool 0/1 to double 0.0 or 1.0
+    return C.getBuilder().CreateUIToFP(L, Type::getDoubleTy(getGlobalContext()),
+                                "booltmp");
+  default: break;
+  }
+  
+  // If it wasn't a builtin binary operator, it must be a user defined one. Emit
+  // a call to it.
+  std::string FnName = MakeLegalFunctionName(std::string("binary")+Op);
+  if (Function *F = C.getPrototype(FnName)) {
+    Value *Ops[] = { L, R };
+    return C.getBuilder().CreateCall(F, Ops, "binop");
+  }
+   
+  return ErrorP<Value>("Unknown binary operator");
+}
+
+Value *CallExprAST::IRGen(IRGenContext &C) const {
+  // Look up the name in the global module table.
+  if (auto CalleeF = C.getPrototype(CalleeName)) {
+    // If argument mismatch error.
+    if (CalleeF->arg_size() != Args.size())
+      return ErrorP<Value>("Incorrect # arguments passed");
+
+    std::vector<Value*> ArgsV;
+    for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+      ArgsV.push_back(Args[i]->IRGen(C));
+      if (!ArgsV.back()) return nullptr;
+    }
+    
+    return C.getBuilder().CreateCall(CalleeF, ArgsV, "calltmp");
+  }
+
+  return ErrorP<Value>("Unknown function referenced");
+}
+
+Value *IfExprAST::IRGen(IRGenContext &C) const {
+  Value *CondV = Cond->IRGen(C);
+  if (!CondV) return nullptr;
+  
+  // Convert condition to a bool by comparing equal to 0.0.
+  ConstantFP *FPZero = 
+    ConstantFP::get(C.getLLVMContext(), APFloat(0.0));
+  CondV = C.getBuilder().CreateFCmpONE(CondV, FPZero, "ifcond");
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+  
+  // Create blocks for the then and else cases.  Insert the 'then' block at the
+  // end of the function.
+  BasicBlock *ThenBB = BasicBlock::Create(C.getLLVMContext(), "then", TheFunction);
+  BasicBlock *ElseBB = BasicBlock::Create(C.getLLVMContext(), "else");
+  BasicBlock *MergeBB = BasicBlock::Create(C.getLLVMContext(), "ifcont");
+  
+  C.getBuilder().CreateCondBr(CondV, ThenBB, ElseBB);
+  
+  // Emit then value.
+  C.getBuilder().SetInsertPoint(ThenBB);
+  
+  Value *ThenV = Then->IRGen(C);
+  if (!ThenV) return nullptr;
+  
+  C.getBuilder().CreateBr(MergeBB);
+  // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
+  ThenBB = C.getBuilder().GetInsertBlock();
+  
+  // Emit else block.
+  TheFunction->getBasicBlockList().push_back(ElseBB);
+  C.getBuilder().SetInsertPoint(ElseBB);
+  
+  Value *ElseV = Else->IRGen(C);
+  if (!ElseV) return nullptr;
+  
+  C.getBuilder().CreateBr(MergeBB);
+  // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
+  ElseBB = C.getBuilder().GetInsertBlock();
+  
+  // Emit merge block.
+  TheFunction->getBasicBlockList().push_back(MergeBB);
+  C.getBuilder().SetInsertPoint(MergeBB);
+  PHINode *PN = C.getBuilder().CreatePHI(Type::getDoubleTy(getGlobalContext()), 2,
+                                  "iftmp");
+  
+  PN->addIncoming(ThenV, ThenBB);
+  PN->addIncoming(ElseV, ElseBB);
+  return PN;
+}
+
+Value *ForExprAST::IRGen(IRGenContext &C) const {
+  // Output this as:
+  //   var = alloca double
+  //   ...
+  //   start = startexpr
+  //   store start -> var
+  //   goto loop
+  // loop: 
+  //   ...
+  //   bodyexpr
+  //   ...
+  // loopend:
+  //   step = stepexpr
+  //   endcond = endexpr
+  //
+  //   curvar = load var
+  //   nextvar = curvar + step
+  //   store nextvar -> var
+  //   br endcond, loop, endloop
+  // outloop:
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+
+  // Create an alloca for the variable in the entry block.
+  AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
+  
+  // Emit the start code first, without 'variable' in scope.
+  Value *StartVal = Start->IRGen(C);
+  if (!StartVal) return nullptr;
+  
+  // Store the value into the alloca.
+  C.getBuilder().CreateStore(StartVal, Alloca);
+  
+  // Make the new basic block for the loop header, inserting after current
+  // block.
+  BasicBlock *LoopBB = BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
+  
+  // Insert an explicit fall through from the current block to the LoopBB.
+  C.getBuilder().CreateBr(LoopBB);
+
+  // Start insertion in LoopBB.
+  C.getBuilder().SetInsertPoint(LoopBB);
+  
+  // Within the loop, the variable is defined equal to the PHI node.  If it
+  // shadows an existing variable, we have to restore it, so save it now.
+  AllocaInst *OldVal = C.NamedValues[VarName];
+  C.NamedValues[VarName] = Alloca;
+  
+  // Emit the body of the loop.  This, like any other expr, can change the
+  // current BB.  Note that we ignore the value computed by the body, but don't
+  // allow an error.
+  if (!Body->IRGen(C))
+    return nullptr;
+  
+  // Emit the step value.
+  Value *StepVal;
+  if (Step) {
+    StepVal = Step->IRGen(C);
+    if (!StepVal) return nullptr;
+  } else {
+    // If not specified, use 1.0.
+    StepVal = ConstantFP::get(getGlobalContext(), APFloat(1.0));
+  }
+  
+  // Compute the end condition.
+  Value *EndCond = End->IRGen(C);
+  if (EndCond == 0) return EndCond;
+  
+  // Reload, increment, and restore the alloca.  This handles the case where
+  // the body of the loop mutates the variable.
+  Value *CurVar = C.getBuilder().CreateLoad(Alloca, VarName.c_str());
+  Value *NextVar = C.getBuilder().CreateFAdd(CurVar, StepVal, "nextvar");
+  C.getBuilder().CreateStore(NextVar, Alloca);
+  
+  // Convert condition to a bool by comparing equal to 0.0.
+  EndCond = C.getBuilder().CreateFCmpONE(EndCond, 
+                              ConstantFP::get(getGlobalContext(), APFloat(0.0)),
+                                  "loopcond");
+  
+  // Create the "after loop" block and insert it.
+  BasicBlock *AfterBB = BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
+  
+  // Insert the conditional branch into the end of LoopEndBB.
+  C.getBuilder().CreateCondBr(EndCond, LoopBB, AfterBB);
+  
+  // Any new code will be inserted in AfterBB.
+  C.getBuilder().SetInsertPoint(AfterBB);
+  
+  // Restore the unshadowed variable.
+  if (OldVal)
+    C.NamedValues[VarName] = OldVal;
+  else
+    C.NamedValues.erase(VarName);
+
+  
+  // for expr always returns 0.0.
+  return Constant::getNullValue(Type::getDoubleTy(getGlobalContext()));
+}
+
+Value *VarExprAST::IRGen(IRGenContext &C) const {
+  std::vector<AllocaInst *> OldBindings;
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+
+  // Register all variables and emit their initializer.
+  for (unsigned i = 0, e = VarBindings.size(); i != e; ++i) {
+    auto &VarName = VarBindings[i].first;
+    auto &Init = VarBindings[i].second;
+    
+    // Emit the initializer before adding the variable to scope, this prevents
+    // the initializer from referencing the variable itself, and permits stuff
+    // like this:
+    //  var a = 1 in
+    //    var a = a in ...   # refers to outer 'a'.
+    Value *InitVal;
+    if (Init) {
+      InitVal = Init->IRGen(C);
+      if (!InitVal) return nullptr;
+    } else // If not specified, use 0.0.
+      InitVal = ConstantFP::get(getGlobalContext(), APFloat(0.0));
+    
+    AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
+    C.getBuilder().CreateStore(InitVal, Alloca);
+
+    // Remember the old variable binding so that we can restore the binding when
+    // we unrecurse.
+    OldBindings.push_back(C.NamedValues[VarName]);
+    
+    // Remember this binding.
+    C.NamedValues[VarName] = Alloca;
+  }
+  
+  // Codegen the body, now that all vars are in scope.
+  Value *BodyVal = Body->IRGen(C);
+  if (!BodyVal) return nullptr;
+  
+  // Pop all our variables from scope.
+  for (unsigned i = 0, e = VarBindings.size(); i != e; ++i)
+    C.NamedValues[VarBindings[i].first] = OldBindings[i];
+
+  // Return the body computation.
+  return BodyVal;
+}
+
+Function *PrototypeAST::IRGen(IRGenContext &C) const {
+  std::string FnName = MakeLegalFunctionName(Name);
+
+  // Make the function type:  double(double,double) etc.
+  std::vector<Type*> Doubles(Args.size(), 
+                             Type::getDoubleTy(getGlobalContext()));
+  FunctionType *FT = FunctionType::get(Type::getDoubleTy(getGlobalContext()),
+                                       Doubles, false);
+  Function *F = Function::Create(FT, Function::ExternalLinkage, FnName,
+                                 &C.getM());
+
+  // If F conflicted, there was already something named 'FnName'.  If it has a
+  // body, don't allow redefinition or reextern.
+  if (F->getName() != FnName) {
+    // Delete the one we just made and get the existing one.
+    F->eraseFromParent();
+    F = C.getM().getFunction(Name);
+    
+    // If F already has a body, reject this.
+    if (!F->empty()) {
+      ErrorP<Function>("redefinition of function");
+      return nullptr;
+    }
+    
+    // If F took a different number of args, reject.
+    if (F->arg_size() != Args.size()) {
+      ErrorP<Function>("redefinition of function with different # args");
+      return nullptr;
+    }
+  }
+  
+  // Set names for all arguments.
+  unsigned Idx = 0;
+  for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
+       ++AI, ++Idx)
+    AI->setName(Args[Idx]);
+    
+  return F;
+}
+
+/// CreateArgumentAllocas - Create an alloca for each argument and register the
+/// argument in the symbol table so that references to it will succeed.
+void PrototypeAST::CreateArgumentAllocas(Function *F, IRGenContext &C) {
+  Function::arg_iterator AI = F->arg_begin();
+  for (unsigned Idx = 0, e = Args.size(); Idx != e; ++Idx, ++AI) {
+    // Create an alloca for this variable.
+    AllocaInst *Alloca = CreateEntryBlockAlloca(F, Args[Idx]);
+
+    // Store the initial value into the alloca.
+    C.getBuilder().CreateStore(AI, Alloca);
+
+    // Add arguments to variable symbol table.
+    C.NamedValues[Args[Idx]] = Alloca;
+  }
+}
+
+Function *FunctionAST::IRGen(IRGenContext &C) const {
+  C.NamedValues.clear();
+  
+  Function *TheFunction = Proto->IRGen(C);
+  if (!TheFunction)
+    return nullptr;
+  
+  // If this is an operator, install it.
+  if (Proto->isBinaryOp())
+    BinopPrecedence[Proto->getOperatorName()] = Proto->Precedence;
+  
+  // Create a new basic block to start insertion into.
+  BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
+  C.getBuilder().SetInsertPoint(BB);
+  
+  // Add all arguments to the symbol table and create their allocas.
+  Proto->CreateArgumentAllocas(TheFunction, C);
+
+  if (Value *RetVal = Body->IRGen(C)) {
+    // Finish off the function.
+    C.getBuilder().CreateRet(RetVal);
+
+    // Validate the generated code, checking for consistency.
+    verifyFunction(*TheFunction);
+
+    return TheFunction;
+  }
+  
+  // Error reading body, remove function.
+  TheFunction->eraseFromParent();
+
+  if (Proto->isBinaryOp())
+    BinopPrecedence.erase(Proto->getOperatorName());
+  return nullptr;
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Level parsing and JIT Driver
+//===----------------------------------------------------------------------===//
+
+static std::unique_ptr<llvm::Module> IRGen(SessionContext &S,
+                                           const FunctionAST &F) {
+  IRGenContext C(S);
+  auto LF = F.IRGen(C);
+  if (!LF)
+    return nullptr;
+#ifndef MINIMAL_STDERR_OUTPUT
+  fprintf(stderr, "Read function definition:");
+  LF->dump();
+#endif
+  return C.takeM();
+}
+
+template <typename T>
+static std::vector<T> singletonSet(T t) {
+  std::vector<T> Vec;
+  Vec.push_back(std::move(t));
+  return Vec;
+}
+
+static void EarthShatteringKaboom() {
+  fprintf(stderr, "Earth shattering kaboom.");
+  exit(1);
+}
+
+class KaleidoscopeJIT {
+public:
+  typedef ObjectLinkingLayer<> ObjLayerT;
+  typedef IRCompileLayer<ObjLayerT> CompileLayerT;
+  typedef LazyEmittingLayer<CompileLayerT> LazyEmitLayerT;
+  typedef LazyEmitLayerT::ModuleSetHandleT ModuleHandleT;
+
+  KaleidoscopeJIT(SessionContext &Session)
+    : Session(Session),
+      Mang(Session.getTarget().getDataLayout()),
+      ObjectLayer(
+        [](){ return llvm::make_unique<SectionMemoryManager>(); }),
+      CompileLayer(ObjectLayer, SimpleCompiler(Session.getTarget())),
+      LazyEmitLayer(CompileLayer),
+      CompileCallbacks(LazyEmitLayer, Session.getLLVMContext(),
+                       reinterpret_cast<uintptr_t>(EarthShatteringKaboom),
+                       64) {}
+
+  std::string mangle(const std::string &Name) {
+    std::string MangledName;
+    {
+      raw_string_ostream MangledNameStream(MangledName);
+      Mang.getNameWithPrefix(MangledNameStream, Name);
+    }
+    return MangledName;
+  }
+
+  void addFunctionDefinition(std::unique_ptr<FunctionAST> FnAST) {
+    FunctionDefs[mangle(FnAST->Proto->Name)] = std::move(FnAST);
+  }
+
+  ModuleHandleT addModule(std::unique_ptr<Module> M) {
+    // We need a memory manager to allocate memory and resolve symbols for this
+    // new module. Create one that resolves symbols by looking back into the
+    // JIT.
+    auto MM = createLookasideRTDyldMM<SectionMemoryManager>(
+                [&](const std::string &Name) {
+                  // First try to find 'Name' within the JIT.
+                  if (auto Symbol = findSymbol(Name))
+                    return Symbol.getAddress();
+
+                  // If we don't already have a definition of 'Name' then search
+                  // the ASTs.
+                  return searchUncompiledASTs(Name);
+                },
+                [](const std::string &S) { return 0; } );
+
+    return LazyEmitLayer.addModuleSet(singletonSet(std::move(M)),
+                                      std::move(MM));
+  }
+
+  void removeModule(ModuleHandleT H) { LazyEmitLayer.removeModuleSet(H); }
+
+  JITSymbol findSymbol(const std::string &Name) {
+    return LazyEmitLayer.findSymbol(Name, true);
+  }
+
+  JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name) {
+    return LazyEmitLayer.findSymbolIn(H, Name, true);
+  }
+
+  JITSymbol findUnmangledSymbol(const std::string &Name) {
+    return findSymbol(mangle(Name));
+  }
+
+  JITSymbol findUnmangledSymbolIn(ModuleHandleT H, const std::string &Name) {
+    return findSymbolIn(H, mangle(Name));
+  }
+
+private:
+
+  // This method searches the FunctionDefs map for a definition of 'Name'. If it
+  // finds one it generates a stub for it and returns the address of the stub.
+  TargetAddress searchUncompiledASTs(const std::string &Name) {
+    auto DefI = FunctionDefs.find(Name);
+    if (DefI == FunctionDefs.end())
+      return 0;
+
+    // We have AST for 'Name'. IRGen a stub for it and add it to the JIT.
+    // FIXME: What happens if IRGen fails?
+    auto H = irGenStub(std::move(DefI->second));
+
+    // Remove the function definition's AST now that we're
+    // finished with it.
+    FunctionDefs.erase(DefI);
+
+    // Return the address of the stub.
+    return findSymbolIn(H, Name).getAddress();
+  }
+
+  // This method will take the AST for a function definition and IR-gen a stub
+  // for that function that will, on first call, IR-gen the actual body of the
+  // function.
+  ModuleHandleT irGenStub(std::unique_ptr<FunctionAST> FnAST) {
+    // Step 1) IRGen a prototype for the stub. This will have the same type as
+    //         the function.
+    IRGenContext C(Session);
+    Function *F = FnAST->Proto->IRGen(C);
+
+    // Step 2) Get a compile callback that can be used to compile the body of
+    //         the function. The resulting CallbackInfo type will let us set the
+    //         compile and update actions for the callback, and get a pointer to
+    //         the jit trampoline that we need to call to trigger those actions.
+    auto CallbackInfo =
+      CompileCallbacks.getCompileCallback(*F->getFunctionType());
+
+    // Step 3) Create a stub that will indirectly call the body of this
+    //         function once it is compiled. Initially, set the function
+    //         pointer for the indirection to point at the trampoline.
+    std::string BodyPtrName = (F->getName() + "$address").str();
+    GlobalVariable *FunctionBodyPointer =
+      createImplPointer(*F, BodyPtrName, CallbackInfo.getAddress());
+    makeStub(*F, *FunctionBodyPointer);
+
+    // Step 4) Add the module containing the stub to the JIT.
+    auto H = addModule(C.takeM());
+
+    // Step 5) Set the compile and update actions.
+    //
+    //   The compile action will IRGen the function and add it to the JIT, then
+    // request its address, which will trigger codegen. Since we don't need the
+    // AST after this, we pass ownership of the AST into the compile action:
+    // compile actions (and update actions) are deleted after they're run, so
+    // this will free the AST for us.
+    //
+    //   The update action will update FunctionBodyPointer to point at the newly
+    // compiled function.
+    std::shared_ptr<FunctionAST> Fn = std::move(FnAST);
+    CallbackInfo.setCompileAction([this, Fn]() {
+      auto H = addModule(IRGen(Session, *Fn));
+      return findUnmangledSymbolIn(H, Fn->Proto->Name).getAddress();
+    });
+    CallbackInfo.setUpdateAction(
+      CompileCallbacks.getLocalFPUpdater(H, mangle(BodyPtrName)));
+
+    return H;
+  }
+
+  SessionContext &Session;
+  Mangler Mang;
+  ObjLayerT ObjectLayer;
+  CompileLayerT CompileLayer;
+  LazyEmitLayerT LazyEmitLayer;
+
+  std::map<std::string, std::unique_ptr<FunctionAST>> FunctionDefs;
+
+  JITCompileCallbackManager<LazyEmitLayerT, OrcX86_64> CompileCallbacks;
+};
+
+static void HandleDefinition(SessionContext &S, KaleidoscopeJIT &J) {
+  if (auto F = ParseDefinition()) {
+    S.addPrototypeAST(llvm::make_unique<PrototypeAST>(*F->Proto));
+    J.addFunctionDefinition(std::move(F));
+  } else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+static void HandleExtern(SessionContext &S) {
+  if (auto P = ParseExtern())
+    S.addPrototypeAST(std::move(P));
+  else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+static void HandleTopLevelExpression(SessionContext &S, KaleidoscopeJIT &J) {
+  // Evaluate a top-level expression into an anonymous function.
+  if (auto F = ParseTopLevelExpr()) {
+    IRGenContext C(S);
+    if (auto ExprFunc = F->IRGen(C)) {
+#ifndef MINIMAL_STDERR_OUTPUT
+      std::cerr << "Expression function:\n";
+      ExprFunc->dump();
+#endif
+      // Add the CodeGen'd module to the JIT. Keep a handle to it: We can remove
+      // this module as soon as we've executed Function ExprFunc.
+      auto H = J.addModule(C.takeM());
+
+      // Get the address of the JIT'd function in memory.
+      auto ExprSymbol = J.findUnmangledSymbol("__anon_expr");
+      
+      // Cast it to the right type (takes no arguments, returns a double) so we
+      // can call it as a native function.
+      double (*FP)() = (double (*)())(intptr_t)ExprSymbol.getAddress();
+#ifdef MINIMAL_STDERR_OUTPUT
+      FP();
+#else
+      std::cerr << "Evaluated to " << FP() << "\n";
+#endif
+
+      // Remove the function.
+      J.removeModule(H);
+    }
+  } else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+/// top ::= definition | external | expression | ';'
+static void MainLoop() {
+  SessionContext S(getGlobalContext());
+  KaleidoscopeJIT J(S);
+
+  while (1) {
+    switch (CurTok) {
+    case tok_eof:    return;
+    case ';':        getNextToken(); continue;  // ignore top-level semicolons.
+    case tok_def:    HandleDefinition(S, J); break;
+    case tok_extern: HandleExtern(S); break;
+    default:         HandleTopLevelExpression(S, J); break;
+    }
+#ifndef MINIMAL_STDERR_OUTPUT
+    std::cerr << "ready> ";
+#endif
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// "Library" functions that can be "extern'd" from user code.
+//===----------------------------------------------------------------------===//
+
+/// putchard - putchar that takes a double and returns 0.
+extern "C" 
+double putchard(double X) {
+  putchar((char)X);
+  return 0;
+}
+
+/// printd - printf that takes a double prints it as "%f\n", returning 0.
+extern "C" 
+double printd(double X) {
+  printf("%f", X);
+  return 0;
+}
+
+extern "C" 
+double printlf() {
+  printf("\n");
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Main driver code.
+//===----------------------------------------------------------------------===//
+
+int main() {
+  InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  InitializeNativeTargetAsmParser();
+
+  // Install standard binary operators.
+  // 1 is lowest precedence.
+  BinopPrecedence['='] = 2;
+  BinopPrecedence['<'] = 10;
+  BinopPrecedence['+'] = 20;
+  BinopPrecedence['-'] = 20;
+  BinopPrecedence['/'] = 40;
+  BinopPrecedence['*'] = 40;  // highest.
+
+  // Prime the first token.
+#ifndef MINIMAL_STDERR_OUTPUT
+  std::cerr << "ready> ";
+#endif
+  getNextToken();
+
+  std::cerr << std::fixed;
+
+  // Run the main "interpreter loop" now.
+  MainLoop();
+
+  return 0;
+}
+
diff --git a/examples/Kaleidoscope/Orc/initial/CMakeLists.txt b/examples/Kaleidoscope/Orc/initial/CMakeLists.txt
new file mode 100644
index 0000000..4f21e1c
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/initial/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  ExecutionEngine
+  Object
+  RuntimeDyld
+  Support
+  native
+  )
+
+add_kaleidoscope_chapter(Kaleidoscope-Orc-initial
+  toy.cpp
+  )
diff --git a/examples/Kaleidoscope/Orc/initial/Makefile b/examples/Kaleidoscope/Orc/initial/Makefile
new file mode 100644
index 0000000..5536314
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/initial/Makefile
@@ -0,0 +1,17 @@
+UNAME := $(shell uname -s)
+
+ifeq ($(UNAME),Darwin)
+	CXX := xcrun --sdk macosx clang++
+else
+	CXX := clang++
+endif
+
+LLVM_CXXFLAGS := $(shell llvm-config --cxxflags)
+LLVM_LDFLAGS := $(shell llvm-config  --ldflags --system-libs --libs core orcjit native)
+
+toy: toy.cpp
+	$(CXX) $(LLVM_CXXFLAGS) -Wall -std=c++11 -g -O0 -rdynamic -fno-rtti -o toy toy.cpp $(LLVM_LDFLAGS)
+
+.PHONY: clean
+clean:
+	rm -f toy
diff --git a/examples/Kaleidoscope/Orc/initial/README.txt b/examples/Kaleidoscope/Orc/initial/README.txt
new file mode 100644
index 0000000..5f4cbbf
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/initial/README.txt
@@ -0,0 +1,13 @@
+//===----------------------------------------------------------------------===/
+//                 Kaleidoscope with Orc - Initial Version
+//===----------------------------------------------------------------------===//
+
+This version of Kaleidoscope with Orc demonstrates fully eager compilation. When
+a function definition or top-level expression is entered it is immediately
+translated (IRGen'd) to LLVM IR and added to the JIT, where it is code-gen'd to
+native code and either stored (for function definitions) or executed (for
+top-level expressions).
+
+This directory contain a Makefile that allow the code to be built in a
+standalone manner, independent of the larger LLVM build infrastructure. To build
+the program you will need to have 'clang++' and 'llvm-config' in your path.
diff --git a/examples/Kaleidoscope/Orc/initial/toy.cpp b/examples/Kaleidoscope/Orc/initial/toy.cpp
new file mode 100644
index 0000000..1b65e8c
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/initial/toy.cpp
@@ -0,0 +1,1333 @@
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"
+#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Transforms/Scalar.h"
+#include <cctype>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::orc;
+
+//===----------------------------------------------------------------------===//
+// Lexer
+//===----------------------------------------------------------------------===//
+
+// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
+// of these for known things.
+enum Token {
+  tok_eof = -1,
+
+  // commands
+  tok_def = -2, tok_extern = -3,
+
+  // primary
+  tok_identifier = -4, tok_number = -5,
+  
+  // control
+  tok_if = -6, tok_then = -7, tok_else = -8,
+  tok_for = -9, tok_in = -10,
+  
+  // operators
+  tok_binary = -11, tok_unary = -12,
+  
+  // var definition
+  tok_var = -13
+};
+
+static std::string IdentifierStr;  // Filled in if tok_identifier
+static double NumVal;              // Filled in if tok_number
+
+/// gettok - Return the next token from standard input.
+static int gettok() {
+  static int LastChar = ' ';
+
+  // Skip any whitespace.
+  while (isspace(LastChar))
+    LastChar = getchar();
+
+  if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]*
+    IdentifierStr = LastChar;
+    while (isalnum((LastChar = getchar())))
+      IdentifierStr += LastChar;
+
+    if (IdentifierStr == "def") return tok_def;
+    if (IdentifierStr == "extern") return tok_extern;
+    if (IdentifierStr == "if") return tok_if;
+    if (IdentifierStr == "then") return tok_then;
+    if (IdentifierStr == "else") return tok_else;
+    if (IdentifierStr == "for") return tok_for;
+    if (IdentifierStr == "in") return tok_in;
+    if (IdentifierStr == "binary") return tok_binary;
+    if (IdentifierStr == "unary") return tok_unary;
+    if (IdentifierStr == "var") return tok_var;
+    return tok_identifier;
+  }
+
+  if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
+    std::string NumStr;
+    do {
+      NumStr += LastChar;
+      LastChar = getchar();
+    } while (isdigit(LastChar) || LastChar == '.');
+
+    NumVal = strtod(NumStr.c_str(), 0);
+    return tok_number;
+  }
+
+  if (LastChar == '#') {
+    // Comment until end of line.
+    do LastChar = getchar();
+    while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
+    
+    if (LastChar != EOF)
+      return gettok();
+  }
+  
+  // Check for end of file.  Don't eat the EOF.
+  if (LastChar == EOF)
+    return tok_eof;
+
+  // Otherwise, just return the character as its ascii value.
+  int ThisChar = LastChar;
+  LastChar = getchar();
+  return ThisChar;
+}
+
+//===----------------------------------------------------------------------===//
+// Abstract Syntax Tree (aka Parse Tree)
+//===----------------------------------------------------------------------===//
+
+class IRGenContext;
+
+/// ExprAST - Base class for all expression nodes.
+struct ExprAST {
+  virtual ~ExprAST() {}
+  virtual Value *IRGen(IRGenContext &C) const = 0;
+};
+
+/// NumberExprAST - Expression class for numeric literals like "1.0".
+struct NumberExprAST : public ExprAST {
+  NumberExprAST(double Val) : Val(Val) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  double Val;
+};
+
+/// VariableExprAST - Expression class for referencing a variable, like "a".
+struct VariableExprAST : public ExprAST {
+  VariableExprAST(std::string Name) : Name(std::move(Name)) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string Name;
+};
+
+/// UnaryExprAST - Expression class for a unary operator.
+struct UnaryExprAST : public ExprAST {
+  UnaryExprAST(char Opcode, std::unique_ptr<ExprAST> Operand) 
+    : Opcode(std::move(Opcode)), Operand(std::move(Operand)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  char Opcode;
+  std::unique_ptr<ExprAST> Operand;
+};
+
+/// BinaryExprAST - Expression class for a binary operator.
+struct BinaryExprAST : public ExprAST {
+  BinaryExprAST(char Op, std::unique_ptr<ExprAST> LHS,
+                std::unique_ptr<ExprAST> RHS) 
+    : Op(Op), LHS(std::move(LHS)), RHS(std::move(RHS)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  char Op;
+  std::unique_ptr<ExprAST> LHS, RHS;
+};
+
+/// CallExprAST - Expression class for function calls.
+struct CallExprAST : public ExprAST {
+  CallExprAST(std::string CalleeName,
+              std::vector<std::unique_ptr<ExprAST>> Args)
+    : CalleeName(std::move(CalleeName)), Args(std::move(Args)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string CalleeName;
+  std::vector<std::unique_ptr<ExprAST>> Args;
+};
+
+/// IfExprAST - Expression class for if/then/else.
+struct IfExprAST : public ExprAST {
+  IfExprAST(std::unique_ptr<ExprAST> Cond, std::unique_ptr<ExprAST> Then,
+            std::unique_ptr<ExprAST> Else)
+    : Cond(std::move(Cond)), Then(std::move(Then)), Else(std::move(Else)) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::unique_ptr<ExprAST> Cond, Then, Else;
+};
+
+/// ForExprAST - Expression class for for/in.
+struct ForExprAST : public ExprAST {
+  ForExprAST(std::string VarName, std::unique_ptr<ExprAST> Start,
+             std::unique_ptr<ExprAST> End, std::unique_ptr<ExprAST> Step,
+             std::unique_ptr<ExprAST> Body)
+    : VarName(std::move(VarName)), Start(std::move(Start)), End(std::move(End)),
+      Step(std::move(Step)), Body(std::move(Body)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string VarName;
+  std::unique_ptr<ExprAST> Start, End, Step, Body;
+};
+
+/// VarExprAST - Expression class for var/in
+struct VarExprAST : public ExprAST {
+  typedef std::pair<std::string, std::unique_ptr<ExprAST>> Binding;
+  typedef std::vector<Binding> BindingList;
+
+  VarExprAST(BindingList VarBindings, std::unique_ptr<ExprAST> Body)
+    : VarBindings(std::move(VarBindings)), Body(std::move(Body)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  BindingList VarBindings;
+  std::unique_ptr<ExprAST> Body;
+};
+
+/// PrototypeAST - This class represents the "prototype" for a function,
+/// which captures its argument names as well as if it is an operator.
+struct PrototypeAST {
+  PrototypeAST(std::string Name, std::vector<std::string> Args,
+               bool IsOperator = false, unsigned Precedence = 0)
+    : Name(std::move(Name)), Args(std::move(Args)), IsOperator(IsOperator),
+      Precedence(Precedence) {}
+
+  Function *IRGen(IRGenContext &C) const;
+  void CreateArgumentAllocas(Function *F, IRGenContext &C);
+
+  bool isUnaryOp() const { return IsOperator && Args.size() == 1; }
+  bool isBinaryOp() const { return IsOperator && Args.size() == 2; }
+  
+  char getOperatorName() const {
+    assert(isUnaryOp() || isBinaryOp());
+    return Name[Name.size()-1];
+  }
+
+  std::string Name;
+  std::vector<std::string> Args;
+  bool IsOperator;
+  unsigned Precedence;  // Precedence if a binary op.
+};
+
+/// FunctionAST - This class represents a function definition itself.
+struct FunctionAST {
+  FunctionAST(std::unique_ptr<PrototypeAST> Proto,
+              std::unique_ptr<ExprAST> Body)
+    : Proto(std::move(Proto)), Body(std::move(Body)) {}
+
+  Function *IRGen(IRGenContext &C) const;
+
+  std::unique_ptr<PrototypeAST> Proto;
+  std::unique_ptr<ExprAST> Body;
+};
+
+//===----------------------------------------------------------------------===//
+// Parser
+//===----------------------------------------------------------------------===//
+
+/// CurTok/getNextToken - Provide a simple token buffer.  CurTok is the current
+/// token the parser is looking at.  getNextToken reads another token from the
+/// lexer and updates CurTok with its results.
+static int CurTok;
+static int getNextToken() {
+  return CurTok = gettok();
+}
+
+/// BinopPrecedence - This holds the precedence for each binary operator that is
+/// defined.
+static std::map<char, int> BinopPrecedence;
+
+/// GetTokPrecedence - Get the precedence of the pending binary operator token.
+static int GetTokPrecedence() {
+  if (!isascii(CurTok))
+    return -1;
+  
+  // Make sure it's a declared binop.
+  int TokPrec = BinopPrecedence[CurTok];
+  if (TokPrec <= 0) return -1;
+  return TokPrec;
+}
+
+template <typename T>
+std::unique_ptr<T> ErrorU(const std::string &Str) {
+  std::cerr << "Error: " << Str << "\n";
+  return nullptr;
+}
+
+template <typename T>
+T* ErrorP(const std::string &Str) {
+  std::cerr << "Error: " << Str << "\n";
+  return nullptr;
+}
+
+static std::unique_ptr<ExprAST> ParseExpression();
+
+/// identifierexpr
+///   ::= identifier
+///   ::= identifier '(' expression* ')'
+static std::unique_ptr<ExprAST> ParseIdentifierExpr() {
+  std::string IdName = IdentifierStr;
+  
+  getNextToken();  // eat identifier.
+  
+  if (CurTok != '(') // Simple variable ref.
+    return llvm::make_unique<VariableExprAST>(IdName);
+  
+  // Call.
+  getNextToken();  // eat (
+  std::vector<std::unique_ptr<ExprAST>> Args;
+  if (CurTok != ')') {
+    while (1) {
+      auto Arg = ParseExpression();
+      if (!Arg) return nullptr;
+      Args.push_back(std::move(Arg));
+
+      if (CurTok == ')') break;
+
+      if (CurTok != ',')
+        return ErrorU<CallExprAST>("Expected ')' or ',' in argument list");
+      getNextToken();
+    }
+  }
+
+  // Eat the ')'.
+  getNextToken();
+  
+  return llvm::make_unique<CallExprAST>(IdName, std::move(Args));
+}
+
+/// numberexpr ::= number
+static std::unique_ptr<NumberExprAST> ParseNumberExpr() {
+  auto Result = llvm::make_unique<NumberExprAST>(NumVal);
+  getNextToken(); // consume the number
+  return Result;
+}
+
+/// parenexpr ::= '(' expression ')'
+static std::unique_ptr<ExprAST> ParseParenExpr() {
+  getNextToken();  // eat (.
+  auto V = ParseExpression();
+  if (!V)
+    return nullptr;
+  
+  if (CurTok != ')')
+    return ErrorU<ExprAST>("expected ')'");
+  getNextToken();  // eat ).
+  return V;
+}
+
+/// ifexpr ::= 'if' expression 'then' expression 'else' expression
+static std::unique_ptr<ExprAST> ParseIfExpr() {
+  getNextToken();  // eat the if.
+  
+  // condition.
+  auto Cond = ParseExpression();
+  if (!Cond)
+    return nullptr;
+  
+  if (CurTok != tok_then)
+    return ErrorU<ExprAST>("expected then");
+  getNextToken();  // eat the then
+  
+  auto Then = ParseExpression();
+  if (!Then)
+    return nullptr;
+  
+  if (CurTok != tok_else)
+    return ErrorU<ExprAST>("expected else");
+  
+  getNextToken();
+  
+  auto Else = ParseExpression();
+  if (!Else)
+    return nullptr;
+  
+  return llvm::make_unique<IfExprAST>(std::move(Cond), std::move(Then),
+                                      std::move(Else));
+}
+
+/// forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression
+static std::unique_ptr<ForExprAST> ParseForExpr() {
+  getNextToken();  // eat the for.
+
+  if (CurTok != tok_identifier)
+    return ErrorU<ForExprAST>("expected identifier after for");
+  
+  std::string IdName = IdentifierStr;
+  getNextToken();  // eat identifier.
+  
+  if (CurTok != '=')
+    return ErrorU<ForExprAST>("expected '=' after for");
+  getNextToken();  // eat '='.
+  
+  
+  auto Start = ParseExpression();
+  if (!Start)
+    return nullptr;
+  if (CurTok != ',')
+    return ErrorU<ForExprAST>("expected ',' after for start value");
+  getNextToken();
+  
+  auto End = ParseExpression();
+  if (!End)
+    return nullptr;
+  
+  // The step value is optional.
+  std::unique_ptr<ExprAST> Step;
+  if (CurTok == ',') {
+    getNextToken();
+    Step = ParseExpression();
+    if (!Step)
+      return nullptr;
+  }
+  
+  if (CurTok != tok_in)
+    return ErrorU<ForExprAST>("expected 'in' after for");
+  getNextToken();  // eat 'in'.
+  
+  auto Body = ParseExpression();
+  if (Body)
+    return nullptr;
+
+  return llvm::make_unique<ForExprAST>(IdName, std::move(Start), std::move(End),
+                                       std::move(Step), std::move(Body));
+}
+
+/// varexpr ::= 'var' identifier ('=' expression)? 
+//                    (',' identifier ('=' expression)?)* 'in' expression
+static std::unique_ptr<VarExprAST> ParseVarExpr() {
+  getNextToken();  // eat the var.
+
+  VarExprAST::BindingList VarBindings;
+
+  // At least one variable name is required.
+  if (CurTok != tok_identifier)
+    return ErrorU<VarExprAST>("expected identifier after var");
+  
+  while (1) {
+    std::string Name = IdentifierStr;
+    getNextToken();  // eat identifier.
+
+    // Read the optional initializer.
+    std::unique_ptr<ExprAST> Init;
+    if (CurTok == '=') {
+      getNextToken(); // eat the '='.
+      
+      Init = ParseExpression();
+      if (!Init)
+        return nullptr;
+    }
+    
+    VarBindings.push_back(VarExprAST::Binding(Name, std::move(Init)));
+    
+    // End of var list, exit loop.
+    if (CurTok != ',') break;
+    getNextToken(); // eat the ','.
+    
+    if (CurTok != tok_identifier)
+      return ErrorU<VarExprAST>("expected identifier list after var");
+  }
+  
+  // At this point, we have to have 'in'.
+  if (CurTok != tok_in)
+    return ErrorU<VarExprAST>("expected 'in' keyword after 'var'");
+  getNextToken();  // eat 'in'.
+  
+  auto Body = ParseExpression();
+  if (!Body)
+    return nullptr;
+  
+  return llvm::make_unique<VarExprAST>(std::move(VarBindings), std::move(Body));
+}
+
+/// primary
+///   ::= identifierexpr
+///   ::= numberexpr
+///   ::= parenexpr
+///   ::= ifexpr
+///   ::= forexpr
+///   ::= varexpr
+static std::unique_ptr<ExprAST> ParsePrimary() {
+  switch (CurTok) {
+  default: return ErrorU<ExprAST>("unknown token when expecting an expression");
+  case tok_identifier: return ParseIdentifierExpr();
+  case tok_number:     return ParseNumberExpr();
+  case '(':            return ParseParenExpr();
+  case tok_if:         return ParseIfExpr();
+  case tok_for:        return ParseForExpr();
+  case tok_var:        return ParseVarExpr();
+  }
+}
+
+/// unary
+///   ::= primary
+///   ::= '!' unary
+static std::unique_ptr<ExprAST> ParseUnary() {
+  // If the current token is not an operator, it must be a primary expr.
+  if (!isascii(CurTok) || CurTok == '(' || CurTok == ',')
+    return ParsePrimary();
+  
+  // If this is a unary operator, read it.
+  int Opc = CurTok;
+  getNextToken();
+  if (auto Operand = ParseUnary())
+    return llvm::make_unique<UnaryExprAST>(Opc, std::move(Operand));
+  return nullptr;
+}
+
+/// binoprhs
+///   ::= ('+' unary)*
+static std::unique_ptr<ExprAST> ParseBinOpRHS(int ExprPrec,
+                                              std::unique_ptr<ExprAST> LHS) {
+  // If this is a binop, find its precedence.
+  while (1) {
+    int TokPrec = GetTokPrecedence();
+    
+    // If this is a binop that binds at least as tightly as the current binop,
+    // consume it, otherwise we are done.
+    if (TokPrec < ExprPrec)
+      return LHS;
+    
+    // Okay, we know this is a binop.
+    int BinOp = CurTok;
+    getNextToken();  // eat binop
+    
+    // Parse the unary expression after the binary operator.
+    auto RHS = ParseUnary();
+    if (!RHS)
+      return nullptr;
+    
+    // If BinOp binds less tightly with RHS than the operator after RHS, let
+    // the pending operator take RHS as its LHS.
+    int NextPrec = GetTokPrecedence();
+    if (TokPrec < NextPrec) {
+      RHS = ParseBinOpRHS(TokPrec+1, std::move(RHS));
+      if (!RHS)
+        return nullptr;
+    }
+    
+    // Merge LHS/RHS.
+    LHS = llvm::make_unique<BinaryExprAST>(BinOp, std::move(LHS), std::move(RHS));
+  }
+}
+
+/// expression
+///   ::= unary binoprhs
+///
+static std::unique_ptr<ExprAST> ParseExpression() {
+  auto LHS = ParseUnary();
+  if (!LHS)
+    return nullptr;
+  
+  return ParseBinOpRHS(0, std::move(LHS));
+}
+
+/// prototype
+///   ::= id '(' id* ')'
+///   ::= binary LETTER number? (id, id)
+///   ::= unary LETTER (id)
+static std::unique_ptr<PrototypeAST> ParsePrototype() {
+  std::string FnName;
+  
+  unsigned Kind = 0; // 0 = identifier, 1 = unary, 2 = binary.
+  unsigned BinaryPrecedence = 30;
+  
+  switch (CurTok) {
+  default:
+    return ErrorU<PrototypeAST>("Expected function name in prototype");
+  case tok_identifier:
+    FnName = IdentifierStr;
+    Kind = 0;
+    getNextToken();
+    break;
+  case tok_unary:
+    getNextToken();
+    if (!isascii(CurTok))
+      return ErrorU<PrototypeAST>("Expected unary operator");
+    FnName = "unary";
+    FnName += (char)CurTok;
+    Kind = 1;
+    getNextToken();
+    break;
+  case tok_binary:
+    getNextToken();
+    if (!isascii(CurTok))
+      return ErrorU<PrototypeAST>("Expected binary operator");
+    FnName = "binary";
+    FnName += (char)CurTok;
+    Kind = 2;
+    getNextToken();
+    
+    // Read the precedence if present.
+    if (CurTok == tok_number) {
+      if (NumVal < 1 || NumVal > 100)
+        return ErrorU<PrototypeAST>("Invalid precedecnce: must be 1..100");
+      BinaryPrecedence = (unsigned)NumVal;
+      getNextToken();
+    }
+    break;
+  }
+  
+  if (CurTok != '(')
+    return ErrorU<PrototypeAST>("Expected '(' in prototype");
+  
+  std::vector<std::string> ArgNames;
+  while (getNextToken() == tok_identifier)
+    ArgNames.push_back(IdentifierStr);
+  if (CurTok != ')')
+    return ErrorU<PrototypeAST>("Expected ')' in prototype");
+  
+  // success.
+  getNextToken();  // eat ')'.
+  
+  // Verify right number of names for operator.
+  if (Kind && ArgNames.size() != Kind)
+    return ErrorU<PrototypeAST>("Invalid number of operands for operator");
+  
+  return llvm::make_unique<PrototypeAST>(FnName, std::move(ArgNames), Kind != 0,
+                                         BinaryPrecedence);
+}
+
+/// definition ::= 'def' prototype expression
+static std::unique_ptr<FunctionAST> ParseDefinition() {
+  getNextToken();  // eat def.
+  auto Proto = ParsePrototype();
+  if (!Proto)
+    return nullptr;
+
+  if (auto Body = ParseExpression())
+    return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(Body));
+  return nullptr;
+}
+
+/// toplevelexpr ::= expression
+static std::unique_ptr<FunctionAST> ParseTopLevelExpr() {
+  if (auto E = ParseExpression()) {
+    // Make an anonymous proto.
+    auto Proto =
+      llvm::make_unique<PrototypeAST>("__anon_expr", std::vector<std::string>());
+    return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(E));
+  }
+  return nullptr;
+}
+
+/// external ::= 'extern' prototype
+static std::unique_ptr<PrototypeAST> ParseExtern() {
+  getNextToken();  // eat extern.
+  return ParsePrototype();
+}
+
+//===----------------------------------------------------------------------===//
+// Code Generation
+//===----------------------------------------------------------------------===//
+
+// FIXME: Obviously we can do better than this
+std::string GenerateUniqueName(const std::string &Root) {
+  static int i = 0;
+  std::ostringstream NameStream;
+  NameStream << Root << ++i;
+  return NameStream.str();
+}
+
+std::string MakeLegalFunctionName(std::string Name)
+{
+  std::string NewName;
+  assert(!Name.empty() && "Base name must not be empty");
+
+  // Start with what we have
+  NewName = Name;
+
+  // Look for a numberic first character
+  if (NewName.find_first_of("0123456789") == 0) {
+    NewName.insert(0, 1, 'n');
+  }
+
+  // Replace illegal characters with their ASCII equivalent
+  std::string legal_elements = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+  size_t pos;
+  while ((pos = NewName.find_first_not_of(legal_elements)) != std::string::npos) {
+    std::ostringstream NumStream;
+    NumStream << (int)NewName.at(pos);
+    NewName = NewName.replace(pos, 1, NumStream.str());
+  }
+
+  return NewName;
+}
+
+class SessionContext {
+public:
+  SessionContext(LLVMContext &C)
+    : Context(C), TM(EngineBuilder().selectTarget()) {}
+  LLVMContext& getLLVMContext() const { return Context; }
+  TargetMachine& getTarget() { return *TM; }
+  void addPrototypeAST(std::unique_ptr<PrototypeAST> P);
+  PrototypeAST* getPrototypeAST(const std::string &Name);
+private:
+  typedef std::map<std::string, std::unique_ptr<PrototypeAST>> PrototypeMap;
+  
+  LLVMContext &Context;
+  std::unique_ptr<TargetMachine> TM;
+      
+  PrototypeMap Prototypes;
+};
+
+void SessionContext::addPrototypeAST(std::unique_ptr<PrototypeAST> P) {
+  Prototypes[P->Name] = std::move(P);
+}
+
+PrototypeAST* SessionContext::getPrototypeAST(const std::string &Name) {
+  PrototypeMap::iterator I = Prototypes.find(Name);
+  if (I != Prototypes.end())
+    return I->second.get();
+  return nullptr;
+}
+
+class IRGenContext {
+public:
+
+  IRGenContext(SessionContext &S)
+    : Session(S),
+      M(new Module(GenerateUniqueName("jit_module_"),
+                   Session.getLLVMContext())),
+      Builder(Session.getLLVMContext()) {
+    M->setDataLayout(Session.getTarget().getDataLayout());
+  }
+
+  SessionContext& getSession() { return Session; }
+  Module& getM() const { return *M; }
+  std::unique_ptr<Module> takeM() { return std::move(M); }
+  IRBuilder<>& getBuilder() { return Builder; }
+  LLVMContext& getLLVMContext() { return Session.getLLVMContext(); }
+  Function* getPrototype(const std::string &Name);
+
+  std::map<std::string, AllocaInst*> NamedValues;
+private:
+  SessionContext &Session;
+  std::unique_ptr<Module> M;
+  IRBuilder<> Builder;
+};
+
+Function* IRGenContext::getPrototype(const std::string &Name) {
+  if (Function *ExistingProto = M->getFunction(Name))
+    return ExistingProto;
+  if (PrototypeAST *ProtoAST = Session.getPrototypeAST(Name))
+    return ProtoAST->IRGen(*this);
+  return nullptr;
+}
+
+/// CreateEntryBlockAlloca - Create an alloca instruction in the entry block of
+/// the function.  This is used for mutable variables etc.
+static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction,
+                                          const std::string &VarName) {
+  IRBuilder<> TmpB(&TheFunction->getEntryBlock(),
+                 TheFunction->getEntryBlock().begin());
+  return TmpB.CreateAlloca(Type::getDoubleTy(getGlobalContext()), 0,
+                           VarName.c_str());
+}
+
+Value *NumberExprAST::IRGen(IRGenContext &C) const {
+  return ConstantFP::get(C.getLLVMContext(), APFloat(Val));
+}
+
+Value *VariableExprAST::IRGen(IRGenContext &C) const {
+  // Look this variable up in the function.
+  Value *V = C.NamedValues[Name];
+
+  if (V == 0)
+    return ErrorP<Value>("Unknown variable name '" + Name + "'");
+
+  // Load the value.
+  return C.getBuilder().CreateLoad(V, Name.c_str());
+}
+
+Value *UnaryExprAST::IRGen(IRGenContext &C) const {
+  if (Value *OperandV = Operand->IRGen(C)) {
+    std::string FnName = MakeLegalFunctionName(std::string("unary")+Opcode);
+    if (Function *F = C.getPrototype(FnName))
+      return C.getBuilder().CreateCall(F, OperandV, "unop");
+    return ErrorP<Value>("Unknown unary operator");
+  }
+
+  // Could not codegen operand - return null.
+  return nullptr;
+}
+
+Value *BinaryExprAST::IRGen(IRGenContext &C) const {
+  // Special case '=' because we don't want to emit the LHS as an expression.
+  if (Op == '=') {
+    // Assignment requires the LHS to be an identifier.
+    auto LHSVar = static_cast<VariableExprAST&>(*LHS);
+    // Codegen the RHS.
+    Value *Val = RHS->IRGen(C);
+    if (!Val) return nullptr;
+
+    // Look up the name.
+    if (auto Variable = C.NamedValues[LHSVar.Name]) {
+      C.getBuilder().CreateStore(Val, Variable);
+      return Val;
+    }
+    return ErrorP<Value>("Unknown variable name");
+  }
+  
+  Value *L = LHS->IRGen(C);
+  Value *R = RHS->IRGen(C);
+  if (!L || !R) return nullptr;
+  
+  switch (Op) {
+  case '+': return C.getBuilder().CreateFAdd(L, R, "addtmp");
+  case '-': return C.getBuilder().CreateFSub(L, R, "subtmp");
+  case '*': return C.getBuilder().CreateFMul(L, R, "multmp");
+  case '/': return C.getBuilder().CreateFDiv(L, R, "divtmp");
+  case '<':
+    L = C.getBuilder().CreateFCmpULT(L, R, "cmptmp");
+    // Convert bool 0/1 to double 0.0 or 1.0
+    return C.getBuilder().CreateUIToFP(L, Type::getDoubleTy(getGlobalContext()),
+                                "booltmp");
+  default: break;
+  }
+  
+  // If it wasn't a builtin binary operator, it must be a user defined one. Emit
+  // a call to it.
+  std::string FnName = MakeLegalFunctionName(std::string("binary")+Op);
+  if (Function *F = C.getPrototype(FnName)) {
+    Value *Ops[] = { L, R };
+    return C.getBuilder().CreateCall(F, Ops, "binop");
+  }
+   
+  return ErrorP<Value>("Unknown binary operator");
+}
+
+Value *CallExprAST::IRGen(IRGenContext &C) const {
+  // Look up the name in the global module table.
+  if (auto CalleeF = C.getPrototype(CalleeName)) {
+    // If argument mismatch error.
+    if (CalleeF->arg_size() != Args.size())
+      return ErrorP<Value>("Incorrect # arguments passed");
+
+    std::vector<Value*> ArgsV;
+    for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+      ArgsV.push_back(Args[i]->IRGen(C));
+      if (!ArgsV.back()) return nullptr;
+    }
+    
+    return C.getBuilder().CreateCall(CalleeF, ArgsV, "calltmp");
+  }
+
+  return ErrorP<Value>("Unknown function referenced");
+}
+
+Value *IfExprAST::IRGen(IRGenContext &C) const {
+  Value *CondV = Cond->IRGen(C);
+  if (!CondV) return nullptr;
+  
+  // Convert condition to a bool by comparing equal to 0.0.
+  ConstantFP *FPZero = 
+    ConstantFP::get(C.getLLVMContext(), APFloat(0.0));
+  CondV = C.getBuilder().CreateFCmpONE(CondV, FPZero, "ifcond");
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+  
+  // Create blocks for the then and else cases.  Insert the 'then' block at the
+  // end of the function.
+  BasicBlock *ThenBB = BasicBlock::Create(C.getLLVMContext(), "then", TheFunction);
+  BasicBlock *ElseBB = BasicBlock::Create(C.getLLVMContext(), "else");
+  BasicBlock *MergeBB = BasicBlock::Create(C.getLLVMContext(), "ifcont");
+  
+  C.getBuilder().CreateCondBr(CondV, ThenBB, ElseBB);
+  
+  // Emit then value.
+  C.getBuilder().SetInsertPoint(ThenBB);
+  
+  Value *ThenV = Then->IRGen(C);
+  if (!ThenV) return nullptr;
+  
+  C.getBuilder().CreateBr(MergeBB);
+  // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
+  ThenBB = C.getBuilder().GetInsertBlock();
+  
+  // Emit else block.
+  TheFunction->getBasicBlockList().push_back(ElseBB);
+  C.getBuilder().SetInsertPoint(ElseBB);
+  
+  Value *ElseV = Else->IRGen(C);
+  if (!ElseV) return nullptr;
+  
+  C.getBuilder().CreateBr(MergeBB);
+  // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
+  ElseBB = C.getBuilder().GetInsertBlock();
+  
+  // Emit merge block.
+  TheFunction->getBasicBlockList().push_back(MergeBB);
+  C.getBuilder().SetInsertPoint(MergeBB);
+  PHINode *PN = C.getBuilder().CreatePHI(Type::getDoubleTy(getGlobalContext()), 2,
+                                  "iftmp");
+  
+  PN->addIncoming(ThenV, ThenBB);
+  PN->addIncoming(ElseV, ElseBB);
+  return PN;
+}
+
+Value *ForExprAST::IRGen(IRGenContext &C) const {
+  // Output this as:
+  //   var = alloca double
+  //   ...
+  //   start = startexpr
+  //   store start -> var
+  //   goto loop
+  // loop: 
+  //   ...
+  //   bodyexpr
+  //   ...
+  // loopend:
+  //   step = stepexpr
+  //   endcond = endexpr
+  //
+  //   curvar = load var
+  //   nextvar = curvar + step
+  //   store nextvar -> var
+  //   br endcond, loop, endloop
+  // outloop:
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+
+  // Create an alloca for the variable in the entry block.
+  AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
+  
+  // Emit the start code first, without 'variable' in scope.
+  Value *StartVal = Start->IRGen(C);
+  if (!StartVal) return nullptr;
+  
+  // Store the value into the alloca.
+  C.getBuilder().CreateStore(StartVal, Alloca);
+  
+  // Make the new basic block for the loop header, inserting after current
+  // block.
+  BasicBlock *LoopBB = BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
+  
+  // Insert an explicit fall through from the current block to the LoopBB.
+  C.getBuilder().CreateBr(LoopBB);
+
+  // Start insertion in LoopBB.
+  C.getBuilder().SetInsertPoint(LoopBB);
+  
+  // Within the loop, the variable is defined equal to the PHI node.  If it
+  // shadows an existing variable, we have to restore it, so save it now.
+  AllocaInst *OldVal = C.NamedValues[VarName];
+  C.NamedValues[VarName] = Alloca;
+  
+  // Emit the body of the loop.  This, like any other expr, can change the
+  // current BB.  Note that we ignore the value computed by the body, but don't
+  // allow an error.
+  if (!Body->IRGen(C))
+    return nullptr;
+  
+  // Emit the step value.
+  Value *StepVal;
+  if (Step) {
+    StepVal = Step->IRGen(C);
+    if (!StepVal) return nullptr;
+  } else {
+    // If not specified, use 1.0.
+    StepVal = ConstantFP::get(getGlobalContext(), APFloat(1.0));
+  }
+  
+  // Compute the end condition.
+  Value *EndCond = End->IRGen(C);
+  if (EndCond == 0) return EndCond;
+  
+  // Reload, increment, and restore the alloca.  This handles the case where
+  // the body of the loop mutates the variable.
+  Value *CurVar = C.getBuilder().CreateLoad(Alloca, VarName.c_str());
+  Value *NextVar = C.getBuilder().CreateFAdd(CurVar, StepVal, "nextvar");
+  C.getBuilder().CreateStore(NextVar, Alloca);
+  
+  // Convert condition to a bool by comparing equal to 0.0.
+  EndCond = C.getBuilder().CreateFCmpONE(EndCond, 
+                              ConstantFP::get(getGlobalContext(), APFloat(0.0)),
+                                  "loopcond");
+  
+  // Create the "after loop" block and insert it.
+  BasicBlock *AfterBB = BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
+  
+  // Insert the conditional branch into the end of LoopEndBB.
+  C.getBuilder().CreateCondBr(EndCond, LoopBB, AfterBB);
+  
+  // Any new code will be inserted in AfterBB.
+  C.getBuilder().SetInsertPoint(AfterBB);
+  
+  // Restore the unshadowed variable.
+  if (OldVal)
+    C.NamedValues[VarName] = OldVal;
+  else
+    C.NamedValues.erase(VarName);
+
+  
+  // for expr always returns 0.0.
+  return Constant::getNullValue(Type::getDoubleTy(getGlobalContext()));
+}
+
+Value *VarExprAST::IRGen(IRGenContext &C) const {
+  std::vector<AllocaInst *> OldBindings;
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+
+  // Register all variables and emit their initializer.
+  for (unsigned i = 0, e = VarBindings.size(); i != e; ++i) {
+    auto &VarName = VarBindings[i].first;
+    auto &Init = VarBindings[i].second;
+    
+    // Emit the initializer before adding the variable to scope, this prevents
+    // the initializer from referencing the variable itself, and permits stuff
+    // like this:
+    //  var a = 1 in
+    //    var a = a in ...   # refers to outer 'a'.
+    Value *InitVal;
+    if (Init) {
+      InitVal = Init->IRGen(C);
+      if (!InitVal) return nullptr;
+    } else // If not specified, use 0.0.
+      InitVal = ConstantFP::get(getGlobalContext(), APFloat(0.0));
+    
+    AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
+    C.getBuilder().CreateStore(InitVal, Alloca);
+
+    // Remember the old variable binding so that we can restore the binding when
+    // we unrecurse.
+    OldBindings.push_back(C.NamedValues[VarName]);
+    
+    // Remember this binding.
+    C.NamedValues[VarName] = Alloca;
+  }
+  
+  // Codegen the body, now that all vars are in scope.
+  Value *BodyVal = Body->IRGen(C);
+  if (!BodyVal) return nullptr;
+  
+  // Pop all our variables from scope.
+  for (unsigned i = 0, e = VarBindings.size(); i != e; ++i)
+    C.NamedValues[VarBindings[i].first] = OldBindings[i];
+
+  // Return the body computation.
+  return BodyVal;
+}
+
+Function *PrototypeAST::IRGen(IRGenContext &C) const {
+  std::string FnName = MakeLegalFunctionName(Name);
+
+  // Make the function type:  double(double,double) etc.
+  std::vector<Type*> Doubles(Args.size(), 
+                             Type::getDoubleTy(getGlobalContext()));
+  FunctionType *FT = FunctionType::get(Type::getDoubleTy(getGlobalContext()),
+                                       Doubles, false);
+  Function *F = Function::Create(FT, Function::ExternalLinkage, FnName,
+                                 &C.getM());
+
+  // If F conflicted, there was already something named 'FnName'.  If it has a
+  // body, don't allow redefinition or reextern.
+  if (F->getName() != FnName) {
+    // Delete the one we just made and get the existing one.
+    F->eraseFromParent();
+    F = C.getM().getFunction(Name);
+    
+    // If F already has a body, reject this.
+    if (!F->empty()) {
+      ErrorP<Function>("redefinition of function");
+      return nullptr;
+    }
+    
+    // If F took a different number of args, reject.
+    if (F->arg_size() != Args.size()) {
+      ErrorP<Function>("redefinition of function with different # args");
+      return nullptr;
+    }
+  }
+  
+  // Set names for all arguments.
+  unsigned Idx = 0;
+  for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
+       ++AI, ++Idx)
+    AI->setName(Args[Idx]);
+    
+  return F;
+}
+
+/// CreateArgumentAllocas - Create an alloca for each argument and register the
+/// argument in the symbol table so that references to it will succeed.
+void PrototypeAST::CreateArgumentAllocas(Function *F, IRGenContext &C) {
+  Function::arg_iterator AI = F->arg_begin();
+  for (unsigned Idx = 0, e = Args.size(); Idx != e; ++Idx, ++AI) {
+    // Create an alloca for this variable.
+    AllocaInst *Alloca = CreateEntryBlockAlloca(F, Args[Idx]);
+
+    // Store the initial value into the alloca.
+    C.getBuilder().CreateStore(AI, Alloca);
+
+    // Add arguments to variable symbol table.
+    C.NamedValues[Args[Idx]] = Alloca;
+  }
+}
+
+Function *FunctionAST::IRGen(IRGenContext &C) const {
+  C.NamedValues.clear();
+  
+  Function *TheFunction = Proto->IRGen(C);
+  if (!TheFunction)
+    return nullptr;
+  
+  // If this is an operator, install it.
+  if (Proto->isBinaryOp())
+    BinopPrecedence[Proto->getOperatorName()] = Proto->Precedence;
+  
+  // Create a new basic block to start insertion into.
+  BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
+  C.getBuilder().SetInsertPoint(BB);
+  
+  // Add all arguments to the symbol table and create their allocas.
+  Proto->CreateArgumentAllocas(TheFunction, C);
+
+  if (Value *RetVal = Body->IRGen(C)) {
+    // Finish off the function.
+    C.getBuilder().CreateRet(RetVal);
+
+    // Validate the generated code, checking for consistency.
+    verifyFunction(*TheFunction);
+
+    return TheFunction;
+  }
+  
+  // Error reading body, remove function.
+  TheFunction->eraseFromParent();
+
+  if (Proto->isBinaryOp())
+    BinopPrecedence.erase(Proto->getOperatorName());
+  return nullptr;
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Level parsing and JIT Driver
+//===----------------------------------------------------------------------===//
+
+static std::unique_ptr<llvm::Module> IRGen(SessionContext &S,
+                                           const FunctionAST &F) {
+  IRGenContext C(S);
+  auto LF = F.IRGen(C);
+  if (!LF)
+    return nullptr;
+#ifndef MINIMAL_STDERR_OUTPUT
+  fprintf(stderr, "Read function definition:");
+  LF->dump();
+#endif
+  return C.takeM();
+}
+
+template <typename T>
+static std::vector<T> singletonSet(T t) {
+  std::vector<T> Vec;
+  Vec.push_back(std::move(t));
+  return Vec;
+}
+
+class KaleidoscopeJIT {
+public:
+  typedef ObjectLinkingLayer<> ObjLayerT;
+  typedef IRCompileLayer<ObjLayerT> CompileLayerT;
+  typedef CompileLayerT::ModuleSetHandleT ModuleHandleT;
+
+  KaleidoscopeJIT(SessionContext &Session)
+    : Mang(Session.getTarget().getDataLayout()),
+      CompileLayer(ObjectLayer, SimpleCompiler(Session.getTarget())) {}
+
+  std::string mangle(const std::string &Name) {
+    std::string MangledName;
+    {
+      raw_string_ostream MangledNameStream(MangledName);
+      Mang.getNameWithPrefix(MangledNameStream, Name);
+    }
+    return MangledName;
+  }
+
+  ModuleHandleT addModule(std::unique_ptr<Module> M) {
+    // We need a memory manager to allocate memory and resolve symbols for this
+    // new module. Create one that resolves symbols by looking back into the
+    // JIT.
+    auto MM = createLookasideRTDyldMM<SectionMemoryManager>(
+                [&](const std::string &Name) {
+                  return findSymbol(Name).getAddress();
+                },
+                [](const std::string &S) { return 0; } );
+
+    return CompileLayer.addModuleSet(singletonSet(std::move(M)), std::move(MM));
+  }
+
+  void removeModule(ModuleHandleT H) { CompileLayer.removeModuleSet(H); }
+
+  JITSymbol findSymbol(const std::string &Name) {
+    return CompileLayer.findSymbol(Name, true);
+  }
+
+  JITSymbol findUnmangledSymbol(const std::string Name) {
+    return findSymbol(mangle(Name));
+  }
+
+private:
+
+  Mangler Mang;
+  ObjLayerT ObjectLayer;
+  CompileLayerT CompileLayer;
+};
+
+static void HandleDefinition(SessionContext &S, KaleidoscopeJIT &J) {
+  if (auto F = ParseDefinition()) {
+    if (auto M = IRGen(S, *F)) {
+      S.addPrototypeAST(llvm::make_unique<PrototypeAST>(*F->Proto));
+      J.addModule(std::move(M));
+    }
+  } else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+static void HandleExtern(SessionContext &S) {
+  if (auto P = ParseExtern())
+    S.addPrototypeAST(std::move(P));
+  else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+static void HandleTopLevelExpression(SessionContext &S, KaleidoscopeJIT &J) {
+  // Evaluate a top-level expression into an anonymous function.
+  if (auto F = ParseTopLevelExpr()) {
+    IRGenContext C(S);
+    if (auto ExprFunc = F->IRGen(C)) {
+#ifndef MINIMAL_STDERR_OUTPUT
+      std::cerr << "Expression function:\n";
+      ExprFunc->dump();
+#endif
+      // Add the CodeGen'd module to the JIT. Keep a handle to it: We can remove
+      // this module as soon as we've executed Function ExprFunc.
+      auto H = J.addModule(C.takeM());
+
+      // Get the address of the JIT'd function in memory.
+      auto ExprSymbol = J.findUnmangledSymbol("__anon_expr");
+      
+      // Cast it to the right type (takes no arguments, returns a double) so we
+      // can call it as a native function.
+      double (*FP)() = (double (*)())(intptr_t)ExprSymbol.getAddress();
+#ifdef MINIMAL_STDERR_OUTPUT
+      FP();
+#else
+      std::cerr << "Evaluated to " << FP() << "\n";
+#endif
+
+      // Remove the function.
+      J.removeModule(H);
+    }
+  } else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+/// top ::= definition | external | expression | ';'
+static void MainLoop() {
+  SessionContext S(getGlobalContext());
+  KaleidoscopeJIT J(S);
+
+  while (1) {
+    switch (CurTok) {
+    case tok_eof:    return;
+    case ';':        getNextToken(); continue;  // ignore top-level semicolons.
+    case tok_def:    HandleDefinition(S, J); break;
+    case tok_extern: HandleExtern(S); break;
+    default:         HandleTopLevelExpression(S, J); break;
+    }
+#ifndef MINIMAL_STDERR_OUTPUT
+    std::cerr << "ready> ";
+#endif
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// "Library" functions that can be "extern'd" from user code.
+//===----------------------------------------------------------------------===//
+
+/// putchard - putchar that takes a double and returns 0.
+extern "C" 
+double putchard(double X) {
+  putchar((char)X);
+  return 0;
+}
+
+/// printd - printf that takes a double prints it as "%f\n", returning 0.
+extern "C" 
+double printd(double X) {
+  printf("%f", X);
+  return 0;
+}
+
+extern "C" 
+double printlf() {
+  printf("\n");
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Main driver code.
+//===----------------------------------------------------------------------===//
+
+int main() {
+  InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  InitializeNativeTargetAsmParser();
+
+  // Install standard binary operators.
+  // 1 is lowest precedence.
+  BinopPrecedence['='] = 2;
+  BinopPrecedence['<'] = 10;
+  BinopPrecedence['+'] = 20;
+  BinopPrecedence['-'] = 20;
+  BinopPrecedence['/'] = 40;
+  BinopPrecedence['*'] = 40;  // highest.
+
+  // Prime the first token.
+#ifndef MINIMAL_STDERR_OUTPUT
+  std::cerr << "ready> ";
+#endif
+  getNextToken();
+
+  std::cerr << std::fixed;
+
+  // Run the main "interpreter loop" now.
+  MainLoop();
+
+  return 0;
+}
+
diff --git a/examples/Kaleidoscope/Orc/lazy_codegen/CMakeLists.txt b/examples/Kaleidoscope/Orc/lazy_codegen/CMakeLists.txt
new file mode 100644
index 0000000..faad342
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/lazy_codegen/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  ExecutionEngine
+  Object
+  RuntimeDyld
+  Support
+  native
+  )
+
+add_kaleidoscope_chapter(Kaleidoscope-Orc-lazy_codegen
+  toy.cpp
+  )
diff --git a/examples/Kaleidoscope/Orc/lazy_codegen/Makefile b/examples/Kaleidoscope/Orc/lazy_codegen/Makefile
new file mode 100644
index 0000000..5536314
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/lazy_codegen/Makefile
@@ -0,0 +1,17 @@
+UNAME := $(shell uname -s)
+
+ifeq ($(UNAME),Darwin)
+	CXX := xcrun --sdk macosx clang++
+else
+	CXX := clang++
+endif
+
+LLVM_CXXFLAGS := $(shell llvm-config --cxxflags)
+LLVM_LDFLAGS := $(shell llvm-config  --ldflags --system-libs --libs core orcjit native)
+
+toy: toy.cpp
+	$(CXX) $(LLVM_CXXFLAGS) -Wall -std=c++11 -g -O0 -rdynamic -fno-rtti -o toy toy.cpp $(LLVM_LDFLAGS)
+
+.PHONY: clean
+clean:
+	rm -f toy
diff --git a/examples/Kaleidoscope/Orc/lazy_codegen/README.txt b/examples/Kaleidoscope/Orc/lazy_codegen/README.txt
new file mode 100644
index 0000000..9d62a91
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/lazy_codegen/README.txt
@@ -0,0 +1,13 @@
+//===----------------------------------------------------------------------===/
+//                 Kaleidoscope with Orc - Initial Version
+//===----------------------------------------------------------------------===//
+
+This version of Kaleidoscope with Orc demonstrates lazy code-generation.
+Unlike the first Kaleidoscope-Orc tutorial, where code-gen was performed as soon
+as modules were added to the JIT, this tutorial adds a LazyEmittingLayer to defer
+code-generation until modules are actually referenced. All IR-generation is still
+performed up-front.
+
+This directory contain a Makefile that allow the code to be built in a
+standalone manner, independent of the larger LLVM build infrastructure. To build
+the program you will need to have 'clang++' and 'llvm-config' in your path.
diff --git a/examples/Kaleidoscope/Orc/lazy_codegen/toy.cpp b/examples/Kaleidoscope/Orc/lazy_codegen/toy.cpp
new file mode 100644
index 0000000..1ed267d
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/lazy_codegen/toy.cpp
@@ -0,0 +1,1338 @@
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"
+#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Transforms/Scalar.h"
+#include <cctype>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::orc;
+
+//===----------------------------------------------------------------------===//
+// Lexer
+//===----------------------------------------------------------------------===//
+
+// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
+// of these for known things.
+enum Token {
+  tok_eof = -1,
+
+  // commands
+  tok_def = -2, tok_extern = -3,
+
+  // primary
+  tok_identifier = -4, tok_number = -5,
+  
+  // control
+  tok_if = -6, tok_then = -7, tok_else = -8,
+  tok_for = -9, tok_in = -10,
+  
+  // operators
+  tok_binary = -11, tok_unary = -12,
+  
+  // var definition
+  tok_var = -13
+};
+
+static std::string IdentifierStr;  // Filled in if tok_identifier
+static double NumVal;              // Filled in if tok_number
+
+/// gettok - Return the next token from standard input.
+static int gettok() {
+  static int LastChar = ' ';
+
+  // Skip any whitespace.
+  while (isspace(LastChar))
+    LastChar = getchar();
+
+  if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]*
+    IdentifierStr = LastChar;
+    while (isalnum((LastChar = getchar())))
+      IdentifierStr += LastChar;
+
+    if (IdentifierStr == "def") return tok_def;
+    if (IdentifierStr == "extern") return tok_extern;
+    if (IdentifierStr == "if") return tok_if;
+    if (IdentifierStr == "then") return tok_then;
+    if (IdentifierStr == "else") return tok_else;
+    if (IdentifierStr == "for") return tok_for;
+    if (IdentifierStr == "in") return tok_in;
+    if (IdentifierStr == "binary") return tok_binary;
+    if (IdentifierStr == "unary") return tok_unary;
+    if (IdentifierStr == "var") return tok_var;
+    return tok_identifier;
+  }
+
+  if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
+    std::string NumStr;
+    do {
+      NumStr += LastChar;
+      LastChar = getchar();
+    } while (isdigit(LastChar) || LastChar == '.');
+
+    NumVal = strtod(NumStr.c_str(), 0);
+    return tok_number;
+  }
+
+  if (LastChar == '#') {
+    // Comment until end of line.
+    do LastChar = getchar();
+    while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
+    
+    if (LastChar != EOF)
+      return gettok();
+  }
+  
+  // Check for end of file.  Don't eat the EOF.
+  if (LastChar == EOF)
+    return tok_eof;
+
+  // Otherwise, just return the character as its ascii value.
+  int ThisChar = LastChar;
+  LastChar = getchar();
+  return ThisChar;
+}
+
+//===----------------------------------------------------------------------===//
+// Abstract Syntax Tree (aka Parse Tree)
+//===----------------------------------------------------------------------===//
+
+class IRGenContext;
+
+/// ExprAST - Base class for all expression nodes.
+struct ExprAST {
+  virtual ~ExprAST() {}
+  virtual Value *IRGen(IRGenContext &C) const = 0;
+};
+
+/// NumberExprAST - Expression class for numeric literals like "1.0".
+struct NumberExprAST : public ExprAST {
+  NumberExprAST(double Val) : Val(Val) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  double Val;
+};
+
+/// VariableExprAST - Expression class for referencing a variable, like "a".
+struct VariableExprAST : public ExprAST {
+  VariableExprAST(std::string Name) : Name(std::move(Name)) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string Name;
+};
+
+/// UnaryExprAST - Expression class for a unary operator.
+struct UnaryExprAST : public ExprAST {
+  UnaryExprAST(char Opcode, std::unique_ptr<ExprAST> Operand) 
+    : Opcode(std::move(Opcode)), Operand(std::move(Operand)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  char Opcode;
+  std::unique_ptr<ExprAST> Operand;
+};
+
+/// BinaryExprAST - Expression class for a binary operator.
+struct BinaryExprAST : public ExprAST {
+  BinaryExprAST(char Op, std::unique_ptr<ExprAST> LHS,
+                std::unique_ptr<ExprAST> RHS) 
+    : Op(Op), LHS(std::move(LHS)), RHS(std::move(RHS)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  char Op;
+  std::unique_ptr<ExprAST> LHS, RHS;
+};
+
+/// CallExprAST - Expression class for function calls.
+struct CallExprAST : public ExprAST {
+  CallExprAST(std::string CalleeName,
+              std::vector<std::unique_ptr<ExprAST>> Args)
+    : CalleeName(std::move(CalleeName)), Args(std::move(Args)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string CalleeName;
+  std::vector<std::unique_ptr<ExprAST>> Args;
+};
+
+/// IfExprAST - Expression class for if/then/else.
+struct IfExprAST : public ExprAST {
+  IfExprAST(std::unique_ptr<ExprAST> Cond, std::unique_ptr<ExprAST> Then,
+            std::unique_ptr<ExprAST> Else)
+    : Cond(std::move(Cond)), Then(std::move(Then)), Else(std::move(Else)) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::unique_ptr<ExprAST> Cond, Then, Else;
+};
+
+/// ForExprAST - Expression class for for/in.
+struct ForExprAST : public ExprAST {
+  ForExprAST(std::string VarName, std::unique_ptr<ExprAST> Start,
+             std::unique_ptr<ExprAST> End, std::unique_ptr<ExprAST> Step,
+             std::unique_ptr<ExprAST> Body)
+    : VarName(std::move(VarName)), Start(std::move(Start)), End(std::move(End)),
+      Step(std::move(Step)), Body(std::move(Body)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string VarName;
+  std::unique_ptr<ExprAST> Start, End, Step, Body;
+};
+
+/// VarExprAST - Expression class for var/in
+struct VarExprAST : public ExprAST {
+  typedef std::pair<std::string, std::unique_ptr<ExprAST>> Binding;
+  typedef std::vector<Binding> BindingList;
+
+  VarExprAST(BindingList VarBindings, std::unique_ptr<ExprAST> Body)
+    : VarBindings(std::move(VarBindings)), Body(std::move(Body)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  BindingList VarBindings;
+  std::unique_ptr<ExprAST> Body;
+};
+
+/// PrototypeAST - This class represents the "prototype" for a function,
+/// which captures its argument names as well as if it is an operator.
+struct PrototypeAST {
+  PrototypeAST(std::string Name, std::vector<std::string> Args,
+               bool IsOperator = false, unsigned Precedence = 0)
+    : Name(std::move(Name)), Args(std::move(Args)), IsOperator(IsOperator),
+      Precedence(Precedence) {}
+
+  Function *IRGen(IRGenContext &C) const;
+  void CreateArgumentAllocas(Function *F, IRGenContext &C);
+
+  bool isUnaryOp() const { return IsOperator && Args.size() == 1; }
+  bool isBinaryOp() const { return IsOperator && Args.size() == 2; }
+  
+  char getOperatorName() const {
+    assert(isUnaryOp() || isBinaryOp());
+    return Name[Name.size()-1];
+  }
+
+  std::string Name;
+  std::vector<std::string> Args;
+  bool IsOperator;
+  unsigned Precedence;  // Precedence if a binary op.
+};
+
+/// FunctionAST - This class represents a function definition itself.
+struct FunctionAST {
+  FunctionAST(std::unique_ptr<PrototypeAST> Proto,
+              std::unique_ptr<ExprAST> Body)
+    : Proto(std::move(Proto)), Body(std::move(Body)) {}
+
+  Function *IRGen(IRGenContext &C) const;
+
+  std::unique_ptr<PrototypeAST> Proto;
+  std::unique_ptr<ExprAST> Body;
+};
+
+//===----------------------------------------------------------------------===//
+// Parser
+//===----------------------------------------------------------------------===//
+
+/// CurTok/getNextToken - Provide a simple token buffer.  CurTok is the current
+/// token the parser is looking at.  getNextToken reads another token from the
+/// lexer and updates CurTok with its results.
+static int CurTok;
+static int getNextToken() {
+  return CurTok = gettok();
+}
+
+/// BinopPrecedence - This holds the precedence for each binary operator that is
+/// defined.
+static std::map<char, int> BinopPrecedence;
+
+/// GetTokPrecedence - Get the precedence of the pending binary operator token.
+static int GetTokPrecedence() {
+  if (!isascii(CurTok))
+    return -1;
+  
+  // Make sure it's a declared binop.
+  int TokPrec = BinopPrecedence[CurTok];
+  if (TokPrec <= 0) return -1;
+  return TokPrec;
+}
+
+template <typename T>
+std::unique_ptr<T> ErrorU(const std::string &Str) {
+  std::cerr << "Error: " << Str << "\n";
+  return nullptr;
+}
+
+template <typename T>
+T* ErrorP(const std::string &Str) {
+  std::cerr << "Error: " << Str << "\n";
+  return nullptr;
+}
+
+static std::unique_ptr<ExprAST> ParseExpression();
+
+/// identifierexpr
+///   ::= identifier
+///   ::= identifier '(' expression* ')'
+static std::unique_ptr<ExprAST> ParseIdentifierExpr() {
+  std::string IdName = IdentifierStr;
+  
+  getNextToken();  // eat identifier.
+  
+  if (CurTok != '(') // Simple variable ref.
+    return llvm::make_unique<VariableExprAST>(IdName);
+  
+  // Call.
+  getNextToken();  // eat (
+  std::vector<std::unique_ptr<ExprAST>> Args;
+  if (CurTok != ')') {
+    while (1) {
+      auto Arg = ParseExpression();
+      if (!Arg) return nullptr;
+      Args.push_back(std::move(Arg));
+
+      if (CurTok == ')') break;
+
+      if (CurTok != ',')
+        return ErrorU<CallExprAST>("Expected ')' or ',' in argument list");
+      getNextToken();
+    }
+  }
+
+  // Eat the ')'.
+  getNextToken();
+  
+  return llvm::make_unique<CallExprAST>(IdName, std::move(Args));
+}
+
+/// numberexpr ::= number
+static std::unique_ptr<NumberExprAST> ParseNumberExpr() {
+  auto Result = llvm::make_unique<NumberExprAST>(NumVal);
+  getNextToken(); // consume the number
+  return Result;
+}
+
+/// parenexpr ::= '(' expression ')'
+static std::unique_ptr<ExprAST> ParseParenExpr() {
+  getNextToken();  // eat (.
+  auto V = ParseExpression();
+  if (!V)
+    return nullptr;
+  
+  if (CurTok != ')')
+    return ErrorU<ExprAST>("expected ')'");
+  getNextToken();  // eat ).
+  return V;
+}
+
+/// ifexpr ::= 'if' expression 'then' expression 'else' expression
+static std::unique_ptr<ExprAST> ParseIfExpr() {
+  getNextToken();  // eat the if.
+  
+  // condition.
+  auto Cond = ParseExpression();
+  if (!Cond)
+    return nullptr;
+  
+  if (CurTok != tok_then)
+    return ErrorU<ExprAST>("expected then");
+  getNextToken();  // eat the then
+  
+  auto Then = ParseExpression();
+  if (!Then)
+    return nullptr;
+  
+  if (CurTok != tok_else)
+    return ErrorU<ExprAST>("expected else");
+  
+  getNextToken();
+  
+  auto Else = ParseExpression();
+  if (!Else)
+    return nullptr;
+  
+  return llvm::make_unique<IfExprAST>(std::move(Cond), std::move(Then),
+                                      std::move(Else));
+}
+
+/// forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression
+static std::unique_ptr<ForExprAST> ParseForExpr() {
+  getNextToken();  // eat the for.
+
+  if (CurTok != tok_identifier)
+    return ErrorU<ForExprAST>("expected identifier after for");
+  
+  std::string IdName = IdentifierStr;
+  getNextToken();  // eat identifier.
+  
+  if (CurTok != '=')
+    return ErrorU<ForExprAST>("expected '=' after for");
+  getNextToken();  // eat '='.
+  
+  
+  auto Start = ParseExpression();
+  if (!Start)
+    return nullptr;
+  if (CurTok != ',')
+    return ErrorU<ForExprAST>("expected ',' after for start value");
+  getNextToken();
+  
+  auto End = ParseExpression();
+  if (!End)
+    return nullptr;
+  
+  // The step value is optional.
+  std::unique_ptr<ExprAST> Step;
+  if (CurTok == ',') {
+    getNextToken();
+    Step = ParseExpression();
+    if (!Step)
+      return nullptr;
+  }
+  
+  if (CurTok != tok_in)
+    return ErrorU<ForExprAST>("expected 'in' after for");
+  getNextToken();  // eat 'in'.
+  
+  auto Body = ParseExpression();
+  if (Body)
+    return nullptr;
+
+  return llvm::make_unique<ForExprAST>(IdName, std::move(Start), std::move(End),
+                                       std::move(Step), std::move(Body));
+}
+
+/// varexpr ::= 'var' identifier ('=' expression)? 
+//                    (',' identifier ('=' expression)?)* 'in' expression
+static std::unique_ptr<VarExprAST> ParseVarExpr() {
+  getNextToken();  // eat the var.
+
+  VarExprAST::BindingList VarBindings;
+
+  // At least one variable name is required.
+  if (CurTok != tok_identifier)
+    return ErrorU<VarExprAST>("expected identifier after var");
+  
+  while (1) {
+    std::string Name = IdentifierStr;
+    getNextToken();  // eat identifier.
+
+    // Read the optional initializer.
+    std::unique_ptr<ExprAST> Init;
+    if (CurTok == '=') {
+      getNextToken(); // eat the '='.
+      
+      Init = ParseExpression();
+      if (!Init)
+        return nullptr;
+    }
+    
+    VarBindings.push_back(VarExprAST::Binding(Name, std::move(Init)));
+    
+    // End of var list, exit loop.
+    if (CurTok != ',') break;
+    getNextToken(); // eat the ','.
+    
+    if (CurTok != tok_identifier)
+      return ErrorU<VarExprAST>("expected identifier list after var");
+  }
+  
+  // At this point, we have to have 'in'.
+  if (CurTok != tok_in)
+    return ErrorU<VarExprAST>("expected 'in' keyword after 'var'");
+  getNextToken();  // eat 'in'.
+  
+  auto Body = ParseExpression();
+  if (!Body)
+    return nullptr;
+  
+  return llvm::make_unique<VarExprAST>(std::move(VarBindings), std::move(Body));
+}
+
+/// primary
+///   ::= identifierexpr
+///   ::= numberexpr
+///   ::= parenexpr
+///   ::= ifexpr
+///   ::= forexpr
+///   ::= varexpr
+static std::unique_ptr<ExprAST> ParsePrimary() {
+  switch (CurTok) {
+  default: return ErrorU<ExprAST>("unknown token when expecting an expression");
+  case tok_identifier: return ParseIdentifierExpr();
+  case tok_number:     return ParseNumberExpr();
+  case '(':            return ParseParenExpr();
+  case tok_if:         return ParseIfExpr();
+  case tok_for:        return ParseForExpr();
+  case tok_var:        return ParseVarExpr();
+  }
+}
+
+/// unary
+///   ::= primary
+///   ::= '!' unary
+static std::unique_ptr<ExprAST> ParseUnary() {
+  // If the current token is not an operator, it must be a primary expr.
+  if (!isascii(CurTok) || CurTok == '(' || CurTok == ',')
+    return ParsePrimary();
+  
+  // If this is a unary operator, read it.
+  int Opc = CurTok;
+  getNextToken();
+  if (auto Operand = ParseUnary())
+    return llvm::make_unique<UnaryExprAST>(Opc, std::move(Operand));
+  return nullptr;
+}
+
+/// binoprhs
+///   ::= ('+' unary)*
+static std::unique_ptr<ExprAST> ParseBinOpRHS(int ExprPrec,
+                                              std::unique_ptr<ExprAST> LHS) {
+  // If this is a binop, find its precedence.
+  while (1) {
+    int TokPrec = GetTokPrecedence();
+    
+    // If this is a binop that binds at least as tightly as the current binop,
+    // consume it, otherwise we are done.
+    if (TokPrec < ExprPrec)
+      return LHS;
+    
+    // Okay, we know this is a binop.
+    int BinOp = CurTok;
+    getNextToken();  // eat binop
+    
+    // Parse the unary expression after the binary operator.
+    auto RHS = ParseUnary();
+    if (!RHS)
+      return nullptr;
+    
+    // If BinOp binds less tightly with RHS than the operator after RHS, let
+    // the pending operator take RHS as its LHS.
+    int NextPrec = GetTokPrecedence();
+    if (TokPrec < NextPrec) {
+      RHS = ParseBinOpRHS(TokPrec+1, std::move(RHS));
+      if (!RHS)
+        return nullptr;
+    }
+    
+    // Merge LHS/RHS.
+    LHS = llvm::make_unique<BinaryExprAST>(BinOp, std::move(LHS), std::move(RHS));
+  }
+}
+
+/// expression
+///   ::= unary binoprhs
+///
+static std::unique_ptr<ExprAST> ParseExpression() {
+  auto LHS = ParseUnary();
+  if (!LHS)
+    return nullptr;
+  
+  return ParseBinOpRHS(0, std::move(LHS));
+}
+
+/// prototype
+///   ::= id '(' id* ')'
+///   ::= binary LETTER number? (id, id)
+///   ::= unary LETTER (id)
+static std::unique_ptr<PrototypeAST> ParsePrototype() {
+  std::string FnName;
+  
+  unsigned Kind = 0; // 0 = identifier, 1 = unary, 2 = binary.
+  unsigned BinaryPrecedence = 30;
+  
+  switch (CurTok) {
+  default:
+    return ErrorU<PrototypeAST>("Expected function name in prototype");
+  case tok_identifier:
+    FnName = IdentifierStr;
+    Kind = 0;
+    getNextToken();
+    break;
+  case tok_unary:
+    getNextToken();
+    if (!isascii(CurTok))
+      return ErrorU<PrototypeAST>("Expected unary operator");
+    FnName = "unary";
+    FnName += (char)CurTok;
+    Kind = 1;
+    getNextToken();
+    break;
+  case tok_binary:
+    getNextToken();
+    if (!isascii(CurTok))
+      return ErrorU<PrototypeAST>("Expected binary operator");
+    FnName = "binary";
+    FnName += (char)CurTok;
+    Kind = 2;
+    getNextToken();
+    
+    // Read the precedence if present.
+    if (CurTok == tok_number) {
+      if (NumVal < 1 || NumVal > 100)
+        return ErrorU<PrototypeAST>("Invalid precedecnce: must be 1..100");
+      BinaryPrecedence = (unsigned)NumVal;
+      getNextToken();
+    }
+    break;
+  }
+  
+  if (CurTok != '(')
+    return ErrorU<PrototypeAST>("Expected '(' in prototype");
+  
+  std::vector<std::string> ArgNames;
+  while (getNextToken() == tok_identifier)
+    ArgNames.push_back(IdentifierStr);
+  if (CurTok != ')')
+    return ErrorU<PrototypeAST>("Expected ')' in prototype");
+  
+  // success.
+  getNextToken();  // eat ')'.
+  
+  // Verify right number of names for operator.
+  if (Kind && ArgNames.size() != Kind)
+    return ErrorU<PrototypeAST>("Invalid number of operands for operator");
+  
+  return llvm::make_unique<PrototypeAST>(FnName, std::move(ArgNames), Kind != 0,
+                                         BinaryPrecedence);
+}
+
+/// definition ::= 'def' prototype expression
+static std::unique_ptr<FunctionAST> ParseDefinition() {
+  getNextToken();  // eat def.
+  auto Proto = ParsePrototype();
+  if (!Proto)
+    return nullptr;
+
+  if (auto Body = ParseExpression())
+    return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(Body));
+  return nullptr;
+}
+
+/// toplevelexpr ::= expression
+static std::unique_ptr<FunctionAST> ParseTopLevelExpr() {
+  if (auto E = ParseExpression()) {
+    // Make an anonymous proto.
+    auto Proto =
+      llvm::make_unique<PrototypeAST>("__anon_expr", std::vector<std::string>());
+    return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(E));
+  }
+  return nullptr;
+}
+
+/// external ::= 'extern' prototype
+static std::unique_ptr<PrototypeAST> ParseExtern() {
+  getNextToken();  // eat extern.
+  return ParsePrototype();
+}
+
+//===----------------------------------------------------------------------===//
+// Code Generation
+//===----------------------------------------------------------------------===//
+
+// FIXME: Obviously we can do better than this
+std::string GenerateUniqueName(const std::string &Root) {
+  static int i = 0;
+  std::ostringstream NameStream;
+  NameStream << Root << ++i;
+  return NameStream.str();
+}
+
+std::string MakeLegalFunctionName(std::string Name)
+{
+  std::string NewName;
+  assert(!Name.empty() && "Base name must not be empty");
+
+  // Start with what we have
+  NewName = Name;
+
+  // Look for a numberic first character
+  if (NewName.find_first_of("0123456789") == 0) {
+    NewName.insert(0, 1, 'n');
+  }
+
+  // Replace illegal characters with their ASCII equivalent
+  std::string legal_elements = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+  size_t pos;
+  while ((pos = NewName.find_first_not_of(legal_elements)) != std::string::npos) {
+    std::ostringstream NumStream;
+    NumStream << (int)NewName.at(pos);
+    NewName = NewName.replace(pos, 1, NumStream.str());
+  }
+
+  return NewName;
+}
+
+class SessionContext {
+public:
+  SessionContext(LLVMContext &C)
+    : Context(C), TM(EngineBuilder().selectTarget()) {}
+  LLVMContext& getLLVMContext() const { return Context; }
+  TargetMachine& getTarget() { return *TM; }
+  void addPrototypeAST(std::unique_ptr<PrototypeAST> P);
+  PrototypeAST* getPrototypeAST(const std::string &Name);
+private:
+  typedef std::map<std::string, std::unique_ptr<PrototypeAST>> PrototypeMap;
+  
+  LLVMContext &Context;
+  std::unique_ptr<TargetMachine> TM;
+      
+  PrototypeMap Prototypes;
+};
+
+void SessionContext::addPrototypeAST(std::unique_ptr<PrototypeAST> P) {
+  Prototypes[P->Name] = std::move(P);
+}
+
+PrototypeAST* SessionContext::getPrototypeAST(const std::string &Name) {
+  PrototypeMap::iterator I = Prototypes.find(Name);
+  if (I != Prototypes.end())
+    return I->second.get();
+  return nullptr;
+}
+
+class IRGenContext {
+public:
+
+  IRGenContext(SessionContext &S)
+    : Session(S),
+      M(new Module(GenerateUniqueName("jit_module_"),
+                   Session.getLLVMContext())),
+      Builder(Session.getLLVMContext()) {
+    M->setDataLayout(Session.getTarget().getDataLayout());
+  }
+
+  SessionContext& getSession() { return Session; }
+  Module& getM() const { return *M; }
+  std::unique_ptr<Module> takeM() { return std::move(M); }
+  IRBuilder<>& getBuilder() { return Builder; }
+  LLVMContext& getLLVMContext() { return Session.getLLVMContext(); }
+  Function* getPrototype(const std::string &Name);
+
+  std::map<std::string, AllocaInst*> NamedValues;
+private:
+  SessionContext &Session;
+  std::unique_ptr<Module> M;
+  IRBuilder<> Builder;
+};
+
+Function* IRGenContext::getPrototype(const std::string &Name) {
+  if (Function *ExistingProto = M->getFunction(Name))
+    return ExistingProto;
+  if (PrototypeAST *ProtoAST = Session.getPrototypeAST(Name))
+    return ProtoAST->IRGen(*this);
+  return nullptr;
+}
+
+/// CreateEntryBlockAlloca - Create an alloca instruction in the entry block of
+/// the function.  This is used for mutable variables etc.
+static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction,
+                                          const std::string &VarName) {
+  IRBuilder<> TmpB(&TheFunction->getEntryBlock(),
+                 TheFunction->getEntryBlock().begin());
+  return TmpB.CreateAlloca(Type::getDoubleTy(getGlobalContext()), 0,
+                           VarName.c_str());
+}
+
+Value *NumberExprAST::IRGen(IRGenContext &C) const {
+  return ConstantFP::get(C.getLLVMContext(), APFloat(Val));
+}
+
+Value *VariableExprAST::IRGen(IRGenContext &C) const {
+  // Look this variable up in the function.
+  Value *V = C.NamedValues[Name];
+
+  if (V == 0)
+    return ErrorP<Value>("Unknown variable name '" + Name + "'");
+
+  // Load the value.
+  return C.getBuilder().CreateLoad(V, Name.c_str());
+}
+
+Value *UnaryExprAST::IRGen(IRGenContext &C) const {
+  if (Value *OperandV = Operand->IRGen(C)) {
+    std::string FnName = MakeLegalFunctionName(std::string("unary")+Opcode);
+    if (Function *F = C.getPrototype(FnName))
+      return C.getBuilder().CreateCall(F, OperandV, "unop");
+    return ErrorP<Value>("Unknown unary operator");
+  }
+
+  // Could not codegen operand - return null.
+  return nullptr;
+}
+
+Value *BinaryExprAST::IRGen(IRGenContext &C) const {
+  // Special case '=' because we don't want to emit the LHS as an expression.
+  if (Op == '=') {
+    // Assignment requires the LHS to be an identifier.
+    auto LHSVar = static_cast<VariableExprAST&>(*LHS);
+    // Codegen the RHS.
+    Value *Val = RHS->IRGen(C);
+    if (!Val) return nullptr;
+
+    // Look up the name.
+    if (auto Variable = C.NamedValues[LHSVar.Name]) {
+      C.getBuilder().CreateStore(Val, Variable);
+      return Val;
+    }
+    return ErrorP<Value>("Unknown variable name");
+  }
+  
+  Value *L = LHS->IRGen(C);
+  Value *R = RHS->IRGen(C);
+  if (!L || !R) return nullptr;
+  
+  switch (Op) {
+  case '+': return C.getBuilder().CreateFAdd(L, R, "addtmp");
+  case '-': return C.getBuilder().CreateFSub(L, R, "subtmp");
+  case '*': return C.getBuilder().CreateFMul(L, R, "multmp");
+  case '/': return C.getBuilder().CreateFDiv(L, R, "divtmp");
+  case '<':
+    L = C.getBuilder().CreateFCmpULT(L, R, "cmptmp");
+    // Convert bool 0/1 to double 0.0 or 1.0
+    return C.getBuilder().CreateUIToFP(L, Type::getDoubleTy(getGlobalContext()),
+                                "booltmp");
+  default: break;
+  }
+  
+  // If it wasn't a builtin binary operator, it must be a user defined one. Emit
+  // a call to it.
+  std::string FnName = MakeLegalFunctionName(std::string("binary")+Op);
+  if (Function *F = C.getPrototype(FnName)) {
+    Value *Ops[] = { L, R };
+    return C.getBuilder().CreateCall(F, Ops, "binop");
+  }
+   
+  return ErrorP<Value>("Unknown binary operator");
+}
+
+Value *CallExprAST::IRGen(IRGenContext &C) const {
+  // Look up the name in the global module table.
+  if (auto CalleeF = C.getPrototype(CalleeName)) {
+    // If argument mismatch error.
+    if (CalleeF->arg_size() != Args.size())
+      return ErrorP<Value>("Incorrect # arguments passed");
+
+    std::vector<Value*> ArgsV;
+    for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+      ArgsV.push_back(Args[i]->IRGen(C));
+      if (!ArgsV.back()) return nullptr;
+    }
+    
+    return C.getBuilder().CreateCall(CalleeF, ArgsV, "calltmp");
+  }
+
+  return ErrorP<Value>("Unknown function referenced");
+}
+
+Value *IfExprAST::IRGen(IRGenContext &C) const {
+  Value *CondV = Cond->IRGen(C);
+  if (!CondV) return nullptr;
+  
+  // Convert condition to a bool by comparing equal to 0.0.
+  ConstantFP *FPZero = 
+    ConstantFP::get(C.getLLVMContext(), APFloat(0.0));
+  CondV = C.getBuilder().CreateFCmpONE(CondV, FPZero, "ifcond");
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+  
+  // Create blocks for the then and else cases.  Insert the 'then' block at the
+  // end of the function.
+  BasicBlock *ThenBB = BasicBlock::Create(C.getLLVMContext(), "then", TheFunction);
+  BasicBlock *ElseBB = BasicBlock::Create(C.getLLVMContext(), "else");
+  BasicBlock *MergeBB = BasicBlock::Create(C.getLLVMContext(), "ifcont");
+  
+  C.getBuilder().CreateCondBr(CondV, ThenBB, ElseBB);
+  
+  // Emit then value.
+  C.getBuilder().SetInsertPoint(ThenBB);
+  
+  Value *ThenV = Then->IRGen(C);
+  if (!ThenV) return nullptr;
+  
+  C.getBuilder().CreateBr(MergeBB);
+  // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
+  ThenBB = C.getBuilder().GetInsertBlock();
+  
+  // Emit else block.
+  TheFunction->getBasicBlockList().push_back(ElseBB);
+  C.getBuilder().SetInsertPoint(ElseBB);
+  
+  Value *ElseV = Else->IRGen(C);
+  if (!ElseV) return nullptr;
+  
+  C.getBuilder().CreateBr(MergeBB);
+  // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
+  ElseBB = C.getBuilder().GetInsertBlock();
+  
+  // Emit merge block.
+  TheFunction->getBasicBlockList().push_back(MergeBB);
+  C.getBuilder().SetInsertPoint(MergeBB);
+  PHINode *PN = C.getBuilder().CreatePHI(Type::getDoubleTy(getGlobalContext()), 2,
+                                  "iftmp");
+  
+  PN->addIncoming(ThenV, ThenBB);
+  PN->addIncoming(ElseV, ElseBB);
+  return PN;
+}
+
+Value *ForExprAST::IRGen(IRGenContext &C) const {
+  // Output this as:
+  //   var = alloca double
+  //   ...
+  //   start = startexpr
+  //   store start -> var
+  //   goto loop
+  // loop: 
+  //   ...
+  //   bodyexpr
+  //   ...
+  // loopend:
+  //   step = stepexpr
+  //   endcond = endexpr
+  //
+  //   curvar = load var
+  //   nextvar = curvar + step
+  //   store nextvar -> var
+  //   br endcond, loop, endloop
+  // outloop:
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+
+  // Create an alloca for the variable in the entry block.
+  AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
+  
+  // Emit the start code first, without 'variable' in scope.
+  Value *StartVal = Start->IRGen(C);
+  if (!StartVal) return nullptr;
+  
+  // Store the value into the alloca.
+  C.getBuilder().CreateStore(StartVal, Alloca);
+  
+  // Make the new basic block for the loop header, inserting after current
+  // block.
+  BasicBlock *LoopBB = BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
+  
+  // Insert an explicit fall through from the current block to the LoopBB.
+  C.getBuilder().CreateBr(LoopBB);
+
+  // Start insertion in LoopBB.
+  C.getBuilder().SetInsertPoint(LoopBB);
+  
+  // Within the loop, the variable is defined equal to the PHI node.  If it
+  // shadows an existing variable, we have to restore it, so save it now.
+  AllocaInst *OldVal = C.NamedValues[VarName];
+  C.NamedValues[VarName] = Alloca;
+  
+  // Emit the body of the loop.  This, like any other expr, can change the
+  // current BB.  Note that we ignore the value computed by the body, but don't
+  // allow an error.
+  if (!Body->IRGen(C))
+    return nullptr;
+  
+  // Emit the step value.
+  Value *StepVal;
+  if (Step) {
+    StepVal = Step->IRGen(C);
+    if (!StepVal) return nullptr;
+  } else {
+    // If not specified, use 1.0.
+    StepVal = ConstantFP::get(getGlobalContext(), APFloat(1.0));
+  }
+  
+  // Compute the end condition.
+  Value *EndCond = End->IRGen(C);
+  if (EndCond == 0) return EndCond;
+  
+  // Reload, increment, and restore the alloca.  This handles the case where
+  // the body of the loop mutates the variable.
+  Value *CurVar = C.getBuilder().CreateLoad(Alloca, VarName.c_str());
+  Value *NextVar = C.getBuilder().CreateFAdd(CurVar, StepVal, "nextvar");
+  C.getBuilder().CreateStore(NextVar, Alloca);
+  
+  // Convert condition to a bool by comparing equal to 0.0.
+  EndCond = C.getBuilder().CreateFCmpONE(EndCond, 
+                              ConstantFP::get(getGlobalContext(), APFloat(0.0)),
+                                  "loopcond");
+  
+  // Create the "after loop" block and insert it.
+  BasicBlock *AfterBB = BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
+  
+  // Insert the conditional branch into the end of LoopEndBB.
+  C.getBuilder().CreateCondBr(EndCond, LoopBB, AfterBB);
+  
+  // Any new code will be inserted in AfterBB.
+  C.getBuilder().SetInsertPoint(AfterBB);
+  
+  // Restore the unshadowed variable.
+  if (OldVal)
+    C.NamedValues[VarName] = OldVal;
+  else
+    C.NamedValues.erase(VarName);
+
+  
+  // for expr always returns 0.0.
+  return Constant::getNullValue(Type::getDoubleTy(getGlobalContext()));
+}
+
+Value *VarExprAST::IRGen(IRGenContext &C) const {
+  std::vector<AllocaInst *> OldBindings;
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+
+  // Register all variables and emit their initializer.
+  for (unsigned i = 0, e = VarBindings.size(); i != e; ++i) {
+    auto &VarName = VarBindings[i].first;
+    auto &Init = VarBindings[i].second;
+    
+    // Emit the initializer before adding the variable to scope, this prevents
+    // the initializer from referencing the variable itself, and permits stuff
+    // like this:
+    //  var a = 1 in
+    //    var a = a in ...   # refers to outer 'a'.
+    Value *InitVal;
+    if (Init) {
+      InitVal = Init->IRGen(C);
+      if (!InitVal) return nullptr;
+    } else // If not specified, use 0.0.
+      InitVal = ConstantFP::get(getGlobalContext(), APFloat(0.0));
+    
+    AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
+    C.getBuilder().CreateStore(InitVal, Alloca);
+
+    // Remember the old variable binding so that we can restore the binding when
+    // we unrecurse.
+    OldBindings.push_back(C.NamedValues[VarName]);
+    
+    // Remember this binding.
+    C.NamedValues[VarName] = Alloca;
+  }
+  
+  // Codegen the body, now that all vars are in scope.
+  Value *BodyVal = Body->IRGen(C);
+  if (!BodyVal) return nullptr;
+  
+  // Pop all our variables from scope.
+  for (unsigned i = 0, e = VarBindings.size(); i != e; ++i)
+    C.NamedValues[VarBindings[i].first] = OldBindings[i];
+
+  // Return the body computation.
+  return BodyVal;
+}
+
+Function *PrototypeAST::IRGen(IRGenContext &C) const {
+  std::string FnName = MakeLegalFunctionName(Name);
+
+  // Make the function type:  double(double,double) etc.
+  std::vector<Type*> Doubles(Args.size(), 
+                             Type::getDoubleTy(getGlobalContext()));
+  FunctionType *FT = FunctionType::get(Type::getDoubleTy(getGlobalContext()),
+                                       Doubles, false);
+  Function *F = Function::Create(FT, Function::ExternalLinkage, FnName,
+                                 &C.getM());
+
+  // If F conflicted, there was already something named 'FnName'.  If it has a
+  // body, don't allow redefinition or reextern.
+  if (F->getName() != FnName) {
+    // Delete the one we just made and get the existing one.
+    F->eraseFromParent();
+    F = C.getM().getFunction(Name);
+    
+    // If F already has a body, reject this.
+    if (!F->empty()) {
+      ErrorP<Function>("redefinition of function");
+      return nullptr;
+    }
+    
+    // If F took a different number of args, reject.
+    if (F->arg_size() != Args.size()) {
+      ErrorP<Function>("redefinition of function with different # args");
+      return nullptr;
+    }
+  }
+  
+  // Set names for all arguments.
+  unsigned Idx = 0;
+  for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
+       ++AI, ++Idx)
+    AI->setName(Args[Idx]);
+    
+  return F;
+}
+
+/// CreateArgumentAllocas - Create an alloca for each argument and register the
+/// argument in the symbol table so that references to it will succeed.
+void PrototypeAST::CreateArgumentAllocas(Function *F, IRGenContext &C) {
+  Function::arg_iterator AI = F->arg_begin();
+  for (unsigned Idx = 0, e = Args.size(); Idx != e; ++Idx, ++AI) {
+    // Create an alloca for this variable.
+    AllocaInst *Alloca = CreateEntryBlockAlloca(F, Args[Idx]);
+
+    // Store the initial value into the alloca.
+    C.getBuilder().CreateStore(AI, Alloca);
+
+    // Add arguments to variable symbol table.
+    C.NamedValues[Args[Idx]] = Alloca;
+  }
+}
+
+Function *FunctionAST::IRGen(IRGenContext &C) const {
+  C.NamedValues.clear();
+  
+  Function *TheFunction = Proto->IRGen(C);
+  if (!TheFunction)
+    return nullptr;
+  
+  // If this is an operator, install it.
+  if (Proto->isBinaryOp())
+    BinopPrecedence[Proto->getOperatorName()] = Proto->Precedence;
+  
+  // Create a new basic block to start insertion into.
+  BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
+  C.getBuilder().SetInsertPoint(BB);
+  
+  // Add all arguments to the symbol table and create their allocas.
+  Proto->CreateArgumentAllocas(TheFunction, C);
+
+  if (Value *RetVal = Body->IRGen(C)) {
+    // Finish off the function.
+    C.getBuilder().CreateRet(RetVal);
+
+    // Validate the generated code, checking for consistency.
+    verifyFunction(*TheFunction);
+
+    return TheFunction;
+  }
+  
+  // Error reading body, remove function.
+  TheFunction->eraseFromParent();
+
+  if (Proto->isBinaryOp())
+    BinopPrecedence.erase(Proto->getOperatorName());
+  return nullptr;
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Level parsing and JIT Driver
+//===----------------------------------------------------------------------===//
+
+static std::unique_ptr<llvm::Module> IRGen(SessionContext &S,
+                                           const FunctionAST &F) {
+  IRGenContext C(S);
+  auto LF = F.IRGen(C);
+  if (!LF)
+    return nullptr;
+#ifndef MINIMAL_STDERR_OUTPUT
+  fprintf(stderr, "Read function definition:");
+  LF->dump();
+#endif
+  return C.takeM();
+}
+
+template <typename T>
+static std::vector<T> singletonSet(T t) {
+  std::vector<T> Vec;
+  Vec.push_back(std::move(t));
+  return Vec;
+}
+
+class KaleidoscopeJIT {
+public:
+  typedef ObjectLinkingLayer<> ObjLayerT;
+  typedef IRCompileLayer<ObjLayerT> CompileLayerT;
+  typedef LazyEmittingLayer<CompileLayerT> LazyEmitLayerT;
+
+  typedef LazyEmitLayerT::ModuleSetHandleT ModuleHandleT;
+
+  KaleidoscopeJIT(SessionContext &Session)
+    : Mang(Session.getTarget().getDataLayout()),
+      CompileLayer(ObjectLayer, SimpleCompiler(Session.getTarget())),
+      LazyEmitLayer(CompileLayer) {}
+
+  std::string mangle(const std::string &Name) {
+    std::string MangledName;
+    {
+      raw_string_ostream MangledNameStream(MangledName);
+      Mang.getNameWithPrefix(MangledNameStream, Name);
+    }
+    return MangledName;
+  }
+
+  ModuleHandleT addModule(std::unique_ptr<Module> M) {
+    // We need a memory manager to allocate memory and resolve symbols for this
+    // new module. Create one that resolves symbols by looking back into the
+    // JIT.
+    auto MM = createLookasideRTDyldMM<SectionMemoryManager>(
+                [&](const std::string &Name) {
+                  return findSymbol(Name).getAddress();
+                },
+                [](const std::string &S) { return 0; } );
+
+    return LazyEmitLayer.addModuleSet(singletonSet(std::move(M)),
+                                      std::move(MM));
+  }
+
+  void removeModule(ModuleHandleT H) { LazyEmitLayer.removeModuleSet(H); }
+
+  JITSymbol findSymbol(const std::string &Name) {
+    return LazyEmitLayer.findSymbol(Name, true);
+  }
+
+  JITSymbol findUnmangledSymbol(const std::string Name) {
+    return findSymbol(mangle(Name));
+  }
+
+private:
+
+  Mangler Mang;
+  ObjLayerT ObjectLayer;
+  CompileLayerT CompileLayer;
+  LazyEmitLayerT LazyEmitLayer;
+};
+
+static void HandleDefinition(SessionContext &S, KaleidoscopeJIT &J) {
+  if (auto F = ParseDefinition()) {
+    if (auto M = IRGen(S, *F)) {
+      S.addPrototypeAST(llvm::make_unique<PrototypeAST>(*F->Proto));
+      J.addModule(std::move(M));
+    }
+  } else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+static void HandleExtern(SessionContext &S) {
+  if (auto P = ParseExtern())
+    S.addPrototypeAST(std::move(P));
+  else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+static void HandleTopLevelExpression(SessionContext &S, KaleidoscopeJIT &J) {
+  // Evaluate a top-level expression into an anonymous function.
+  if (auto F = ParseTopLevelExpr()) {
+    IRGenContext C(S);
+    if (auto ExprFunc = F->IRGen(C)) {
+#ifndef MINIMAL_STDERR_OUTPUT
+      std::cerr << "Expression function:\n";
+      ExprFunc->dump();
+#endif
+      // Add the CodeGen'd module to the JIT. Keep a handle to it: We can remove
+      // this module as soon as we've executed Function ExprFunc.
+      auto H = J.addModule(C.takeM());
+
+      // Get the address of the JIT'd function in memory.
+      auto ExprSymbol = J.findUnmangledSymbol("__anon_expr");
+      
+      // Cast it to the right type (takes no arguments, returns a double) so we
+      // can call it as a native function.
+      double (*FP)() = (double (*)())(intptr_t)ExprSymbol.getAddress();
+#ifdef MINIMAL_STDERR_OUTPUT
+      FP();
+#else
+      std::cerr << "Evaluated to " << FP() << "\n";
+#endif
+
+      // Remove the function.
+      J.removeModule(H);
+    }
+  } else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+/// top ::= definition | external | expression | ';'
+static void MainLoop() {
+  SessionContext S(getGlobalContext());
+  KaleidoscopeJIT J(S);
+
+  while (1) {
+    switch (CurTok) {
+    case tok_eof:    return;
+    case ';':        getNextToken(); continue;  // ignore top-level semicolons.
+    case tok_def:    HandleDefinition(S, J); break;
+    case tok_extern: HandleExtern(S); break;
+    default:         HandleTopLevelExpression(S, J); break;
+    }
+#ifndef MINIMAL_STDERR_OUTPUT
+    std::cerr << "ready> ";
+#endif
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// "Library" functions that can be "extern'd" from user code.
+//===----------------------------------------------------------------------===//
+
+/// putchard - putchar that takes a double and returns 0.
+extern "C" 
+double putchard(double X) {
+  putchar((char)X);
+  return 0;
+}
+
+/// printd - printf that takes a double prints it as "%f\n", returning 0.
+extern "C" 
+double printd(double X) {
+  printf("%f", X);
+  return 0;
+}
+
+extern "C" 
+double printlf() {
+  printf("\n");
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Main driver code.
+//===----------------------------------------------------------------------===//
+
+int main() {
+  InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  InitializeNativeTargetAsmParser();
+
+  // Install standard binary operators.
+  // 1 is lowest precedence.
+  BinopPrecedence['='] = 2;
+  BinopPrecedence['<'] = 10;
+  BinopPrecedence['+'] = 20;
+  BinopPrecedence['-'] = 20;
+  BinopPrecedence['/'] = 40;
+  BinopPrecedence['*'] = 40;  // highest.
+
+  // Prime the first token.
+#ifndef MINIMAL_STDERR_OUTPUT
+  std::cerr << "ready> ";
+#endif
+  getNextToken();
+
+  std::cerr << std::fixed;
+
+  // Run the main "interpreter loop" now.
+  MainLoop();
+
+  return 0;
+}
+
diff --git a/examples/Kaleidoscope/Orc/lazy_irgen/CMakeLists.txt b/examples/Kaleidoscope/Orc/lazy_irgen/CMakeLists.txt
new file mode 100644
index 0000000..4488681
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/lazy_irgen/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  ExecutionEngine
+  Object
+  RuntimeDyld
+  Support
+  native
+  )
+
+add_kaleidoscope_chapter(Kaleidoscope-Orc-lazy_irgen
+  toy.cpp
+  )
diff --git a/examples/Kaleidoscope/Orc/lazy_irgen/Makefile b/examples/Kaleidoscope/Orc/lazy_irgen/Makefile
new file mode 100644
index 0000000..5536314
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/lazy_irgen/Makefile
@@ -0,0 +1,17 @@
+UNAME := $(shell uname -s)
+
+ifeq ($(UNAME),Darwin)
+	CXX := xcrun --sdk macosx clang++
+else
+	CXX := clang++
+endif
+
+LLVM_CXXFLAGS := $(shell llvm-config --cxxflags)
+LLVM_LDFLAGS := $(shell llvm-config  --ldflags --system-libs --libs core orcjit native)
+
+toy: toy.cpp
+	$(CXX) $(LLVM_CXXFLAGS) -Wall -std=c++11 -g -O0 -rdynamic -fno-rtti -o toy toy.cpp $(LLVM_LDFLAGS)
+
+.PHONY: clean
+clean:
+	rm -f toy
diff --git a/examples/Kaleidoscope/Orc/lazy_irgen/README.txt b/examples/Kaleidoscope/Orc/lazy_irgen/README.txt
new file mode 100644
index 0000000..9aaa431
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/lazy_irgen/README.txt
@@ -0,0 +1,16 @@
+//===----------------------------------------------------------------------===/
+//                 Kaleidoscope with Orc - Lazy IRGen Version
+//===----------------------------------------------------------------------===//
+
+This version of Kaleidoscope with Orc demonstrates lazy IR-generation.
+Building on the lazy-codegen version of the tutorial, this version reduces the
+amount of up-front work that must be done by lazily IRgen'ing ASTs. When a
+function definition is entered, its AST is added to a map of available
+definitions. No IRGen is performed at this point and nothing is added to the JIT.
+When attempting to resolve symbol addresses, the lambda in
+KaleidoscopeJIT::getSymbolAddress will scan the AST map and generate IR on the
+fly.
+
+This directory contains a Makefile that allows the code to be built in a
+standalone manner, independent of the larger LLVM build infrastructure. To build
+the program you will need to have 'clang++' and 'llvm-config' in your path.
diff --git a/examples/Kaleidoscope/Orc/lazy_irgen/toy.cpp b/examples/Kaleidoscope/Orc/lazy_irgen/toy.cpp
new file mode 100644
index 0000000..d7744ec
--- /dev/null
+++ b/examples/Kaleidoscope/Orc/lazy_irgen/toy.cpp
@@ -0,0 +1,1373 @@
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"
+#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Transforms/Scalar.h"
+#include <cctype>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::orc;
+
+//===----------------------------------------------------------------------===//
+// Lexer
+//===----------------------------------------------------------------------===//
+
+// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
+// of these for known things.
+enum Token {
+  tok_eof = -1,
+
+  // commands
+  tok_def = -2, tok_extern = -3,
+
+  // primary
+  tok_identifier = -4, tok_number = -5,
+  
+  // control
+  tok_if = -6, tok_then = -7, tok_else = -8,
+  tok_for = -9, tok_in = -10,
+  
+  // operators
+  tok_binary = -11, tok_unary = -12,
+  
+  // var definition
+  tok_var = -13
+};
+
+static std::string IdentifierStr;  // Filled in if tok_identifier
+static double NumVal;              // Filled in if tok_number
+
+/// gettok - Return the next token from standard input.
+static int gettok() {
+  static int LastChar = ' ';
+
+  // Skip any whitespace.
+  while (isspace(LastChar))
+    LastChar = getchar();
+
+  if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]*
+    IdentifierStr = LastChar;
+    while (isalnum((LastChar = getchar())))
+      IdentifierStr += LastChar;
+
+    if (IdentifierStr == "def") return tok_def;
+    if (IdentifierStr == "extern") return tok_extern;
+    if (IdentifierStr == "if") return tok_if;
+    if (IdentifierStr == "then") return tok_then;
+    if (IdentifierStr == "else") return tok_else;
+    if (IdentifierStr == "for") return tok_for;
+    if (IdentifierStr == "in") return tok_in;
+    if (IdentifierStr == "binary") return tok_binary;
+    if (IdentifierStr == "unary") return tok_unary;
+    if (IdentifierStr == "var") return tok_var;
+    return tok_identifier;
+  }
+
+  if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
+    std::string NumStr;
+    do {
+      NumStr += LastChar;
+      LastChar = getchar();
+    } while (isdigit(LastChar) || LastChar == '.');
+
+    NumVal = strtod(NumStr.c_str(), 0);
+    return tok_number;
+  }
+
+  if (LastChar == '#') {
+    // Comment until end of line.
+    do LastChar = getchar();
+    while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
+    
+    if (LastChar != EOF)
+      return gettok();
+  }
+  
+  // Check for end of file.  Don't eat the EOF.
+  if (LastChar == EOF)
+    return tok_eof;
+
+  // Otherwise, just return the character as its ascii value.
+  int ThisChar = LastChar;
+  LastChar = getchar();
+  return ThisChar;
+}
+
+//===----------------------------------------------------------------------===//
+// Abstract Syntax Tree (aka Parse Tree)
+//===----------------------------------------------------------------------===//
+
+class IRGenContext;
+
+/// ExprAST - Base class for all expression nodes.
+struct ExprAST {
+  virtual ~ExprAST() {}
+  virtual Value *IRGen(IRGenContext &C) const = 0;
+};
+
+/// NumberExprAST - Expression class for numeric literals like "1.0".
+struct NumberExprAST : public ExprAST {
+  NumberExprAST(double Val) : Val(Val) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  double Val;
+};
+
+/// VariableExprAST - Expression class for referencing a variable, like "a".
+struct VariableExprAST : public ExprAST {
+  VariableExprAST(std::string Name) : Name(std::move(Name)) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string Name;
+};
+
+/// UnaryExprAST - Expression class for a unary operator.
+struct UnaryExprAST : public ExprAST {
+  UnaryExprAST(char Opcode, std::unique_ptr<ExprAST> Operand) 
+    : Opcode(std::move(Opcode)), Operand(std::move(Operand)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  char Opcode;
+  std::unique_ptr<ExprAST> Operand;
+};
+
+/// BinaryExprAST - Expression class for a binary operator.
+struct BinaryExprAST : public ExprAST {
+  BinaryExprAST(char Op, std::unique_ptr<ExprAST> LHS,
+                std::unique_ptr<ExprAST> RHS) 
+    : Op(Op), LHS(std::move(LHS)), RHS(std::move(RHS)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  char Op;
+  std::unique_ptr<ExprAST> LHS, RHS;
+};
+
+/// CallExprAST - Expression class for function calls.
+struct CallExprAST : public ExprAST {
+  CallExprAST(std::string CalleeName,
+              std::vector<std::unique_ptr<ExprAST>> Args)
+    : CalleeName(std::move(CalleeName)), Args(std::move(Args)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string CalleeName;
+  std::vector<std::unique_ptr<ExprAST>> Args;
+};
+
+/// IfExprAST - Expression class for if/then/else.
+struct IfExprAST : public ExprAST {
+  IfExprAST(std::unique_ptr<ExprAST> Cond, std::unique_ptr<ExprAST> Then,
+            std::unique_ptr<ExprAST> Else)
+    : Cond(std::move(Cond)), Then(std::move(Then)), Else(std::move(Else)) {}
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::unique_ptr<ExprAST> Cond, Then, Else;
+};
+
+/// ForExprAST - Expression class for for/in.
+struct ForExprAST : public ExprAST {
+  ForExprAST(std::string VarName, std::unique_ptr<ExprAST> Start,
+             std::unique_ptr<ExprAST> End, std::unique_ptr<ExprAST> Step,
+             std::unique_ptr<ExprAST> Body)
+    : VarName(std::move(VarName)), Start(std::move(Start)), End(std::move(End)),
+      Step(std::move(Step)), Body(std::move(Body)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  std::string VarName;
+  std::unique_ptr<ExprAST> Start, End, Step, Body;
+};
+
+/// VarExprAST - Expression class for var/in
+struct VarExprAST : public ExprAST {
+  typedef std::pair<std::string, std::unique_ptr<ExprAST>> Binding;
+  typedef std::vector<Binding> BindingList;
+
+  VarExprAST(BindingList VarBindings, std::unique_ptr<ExprAST> Body)
+    : VarBindings(std::move(VarBindings)), Body(std::move(Body)) {}
+
+  Value *IRGen(IRGenContext &C) const override;
+
+  BindingList VarBindings;
+  std::unique_ptr<ExprAST> Body;
+};
+
+/// PrototypeAST - This class represents the "prototype" for a function,
+/// which captures its argument names as well as if it is an operator.
+struct PrototypeAST {
+  PrototypeAST(std::string Name, std::vector<std::string> Args,
+               bool IsOperator = false, unsigned Precedence = 0)
+    : Name(std::move(Name)), Args(std::move(Args)), IsOperator(IsOperator),
+      Precedence(Precedence) {}
+
+  Function *IRGen(IRGenContext &C) const;
+  void CreateArgumentAllocas(Function *F, IRGenContext &C);
+
+  bool isUnaryOp() const { return IsOperator && Args.size() == 1; }
+  bool isBinaryOp() const { return IsOperator && Args.size() == 2; }
+  
+  char getOperatorName() const {
+    assert(isUnaryOp() || isBinaryOp());
+    return Name[Name.size()-1];
+  }
+
+  std::string Name;
+  std::vector<std::string> Args;
+  bool IsOperator;
+  unsigned Precedence;  // Precedence if a binary op.
+};
+
+/// FunctionAST - This class represents a function definition itself.
+struct FunctionAST {
+  FunctionAST(std::unique_ptr<PrototypeAST> Proto,
+              std::unique_ptr<ExprAST> Body)
+    : Proto(std::move(Proto)), Body(std::move(Body)) {}
+
+  Function *IRGen(IRGenContext &C) const;
+
+  std::unique_ptr<PrototypeAST> Proto;
+  std::unique_ptr<ExprAST> Body;
+};
+
+//===----------------------------------------------------------------------===//
+// Parser
+//===----------------------------------------------------------------------===//
+
+/// CurTok/getNextToken - Provide a simple token buffer.  CurTok is the current
+/// token the parser is looking at.  getNextToken reads another token from the
+/// lexer and updates CurTok with its results.
+static int CurTok;
+static int getNextToken() {
+  return CurTok = gettok();
+}
+
+/// BinopPrecedence - This holds the precedence for each binary operator that is
+/// defined.
+static std::map<char, int> BinopPrecedence;
+
+/// GetTokPrecedence - Get the precedence of the pending binary operator token.
+static int GetTokPrecedence() {
+  if (!isascii(CurTok))
+    return -1;
+  
+  // Make sure it's a declared binop.
+  int TokPrec = BinopPrecedence[CurTok];
+  if (TokPrec <= 0) return -1;
+  return TokPrec;
+}
+
+template <typename T>
+std::unique_ptr<T> ErrorU(const std::string &Str) {
+  std::cerr << "Error: " << Str << "\n";
+  return nullptr;
+}
+
+template <typename T>
+T* ErrorP(const std::string &Str) {
+  std::cerr << "Error: " << Str << "\n";
+  return nullptr;
+}
+
+static std::unique_ptr<ExprAST> ParseExpression();
+
+/// identifierexpr
+///   ::= identifier
+///   ::= identifier '(' expression* ')'
+static std::unique_ptr<ExprAST> ParseIdentifierExpr() {
+  std::string IdName = IdentifierStr;
+  
+  getNextToken();  // eat identifier.
+  
+  if (CurTok != '(') // Simple variable ref.
+    return llvm::make_unique<VariableExprAST>(IdName);
+  
+  // Call.
+  getNextToken();  // eat (
+  std::vector<std::unique_ptr<ExprAST>> Args;
+  if (CurTok != ')') {
+    while (1) {
+      auto Arg = ParseExpression();
+      if (!Arg) return nullptr;
+      Args.push_back(std::move(Arg));
+
+      if (CurTok == ')') break;
+
+      if (CurTok != ',')
+        return ErrorU<CallExprAST>("Expected ')' or ',' in argument list");
+      getNextToken();
+    }
+  }
+
+  // Eat the ')'.
+  getNextToken();
+  
+  return llvm::make_unique<CallExprAST>(IdName, std::move(Args));
+}
+
+/// numberexpr ::= number
+static std::unique_ptr<NumberExprAST> ParseNumberExpr() {
+  auto Result = llvm::make_unique<NumberExprAST>(NumVal);
+  getNextToken(); // consume the number
+  return Result;
+}
+
+/// parenexpr ::= '(' expression ')'
+static std::unique_ptr<ExprAST> ParseParenExpr() {
+  getNextToken();  // eat (.
+  auto V = ParseExpression();
+  if (!V)
+    return nullptr;
+  
+  if (CurTok != ')')
+    return ErrorU<ExprAST>("expected ')'");
+  getNextToken();  // eat ).
+  return V;
+}
+
+/// ifexpr ::= 'if' expression 'then' expression 'else' expression
+static std::unique_ptr<ExprAST> ParseIfExpr() {
+  getNextToken();  // eat the if.
+  
+  // condition.
+  auto Cond = ParseExpression();
+  if (!Cond)
+    return nullptr;
+  
+  if (CurTok != tok_then)
+    return ErrorU<ExprAST>("expected then");
+  getNextToken();  // eat the then
+  
+  auto Then = ParseExpression();
+  if (!Then)
+    return nullptr;
+  
+  if (CurTok != tok_else)
+    return ErrorU<ExprAST>("expected else");
+  
+  getNextToken();
+  
+  auto Else = ParseExpression();
+  if (!Else)
+    return nullptr;
+  
+  return llvm::make_unique<IfExprAST>(std::move(Cond), std::move(Then),
+                                      std::move(Else));
+}
+
+/// forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression
+static std::unique_ptr<ForExprAST> ParseForExpr() {
+  getNextToken();  // eat the for.
+
+  if (CurTok != tok_identifier)
+    return ErrorU<ForExprAST>("expected identifier after for");
+  
+  std::string IdName = IdentifierStr;
+  getNextToken();  // eat identifier.
+  
+  if (CurTok != '=')
+    return ErrorU<ForExprAST>("expected '=' after for");
+  getNextToken();  // eat '='.
+  
+  
+  auto Start = ParseExpression();
+  if (!Start)
+    return nullptr;
+  if (CurTok != ',')
+    return ErrorU<ForExprAST>("expected ',' after for start value");
+  getNextToken();
+  
+  auto End = ParseExpression();
+  if (!End)
+    return nullptr;
+  
+  // The step value is optional.
+  std::unique_ptr<ExprAST> Step;
+  if (CurTok == ',') {
+    getNextToken();
+    Step = ParseExpression();
+    if (!Step)
+      return nullptr;
+  }
+  
+  if (CurTok != tok_in)
+    return ErrorU<ForExprAST>("expected 'in' after for");
+  getNextToken();  // eat 'in'.
+  
+  auto Body = ParseExpression();
+  if (Body)
+    return nullptr;
+
+  return llvm::make_unique<ForExprAST>(IdName, std::move(Start), std::move(End),
+                                       std::move(Step), std::move(Body));
+}
+
+/// varexpr ::= 'var' identifier ('=' expression)? 
+//                    (',' identifier ('=' expression)?)* 'in' expression
+static std::unique_ptr<VarExprAST> ParseVarExpr() {
+  getNextToken();  // eat the var.
+
+  VarExprAST::BindingList VarBindings;
+
+  // At least one variable name is required.
+  if (CurTok != tok_identifier)
+    return ErrorU<VarExprAST>("expected identifier after var");
+  
+  while (1) {
+    std::string Name = IdentifierStr;
+    getNextToken();  // eat identifier.
+
+    // Read the optional initializer.
+    std::unique_ptr<ExprAST> Init;
+    if (CurTok == '=') {
+      getNextToken(); // eat the '='.
+      
+      Init = ParseExpression();
+      if (!Init)
+        return nullptr;
+    }
+    
+    VarBindings.push_back(VarExprAST::Binding(Name, std::move(Init)));
+    
+    // End of var list, exit loop.
+    if (CurTok != ',') break;
+    getNextToken(); // eat the ','.
+    
+    if (CurTok != tok_identifier)
+      return ErrorU<VarExprAST>("expected identifier list after var");
+  }
+  
+  // At this point, we have to have 'in'.
+  if (CurTok != tok_in)
+    return ErrorU<VarExprAST>("expected 'in' keyword after 'var'");
+  getNextToken();  // eat 'in'.
+  
+  auto Body = ParseExpression();
+  if (!Body)
+    return nullptr;
+  
+  return llvm::make_unique<VarExprAST>(std::move(VarBindings), std::move(Body));
+}
+
+/// primary
+///   ::= identifierexpr
+///   ::= numberexpr
+///   ::= parenexpr
+///   ::= ifexpr
+///   ::= forexpr
+///   ::= varexpr
+static std::unique_ptr<ExprAST> ParsePrimary() {
+  switch (CurTok) {
+  default: return ErrorU<ExprAST>("unknown token when expecting an expression");
+  case tok_identifier: return ParseIdentifierExpr();
+  case tok_number:     return ParseNumberExpr();
+  case '(':            return ParseParenExpr();
+  case tok_if:         return ParseIfExpr();
+  case tok_for:        return ParseForExpr();
+  case tok_var:        return ParseVarExpr();
+  }
+}
+
+/// unary
+///   ::= primary
+///   ::= '!' unary
+static std::unique_ptr<ExprAST> ParseUnary() {
+  // If the current token is not an operator, it must be a primary expr.
+  if (!isascii(CurTok) || CurTok == '(' || CurTok == ',')
+    return ParsePrimary();
+  
+  // If this is a unary operator, read it.
+  int Opc = CurTok;
+  getNextToken();
+  if (auto Operand = ParseUnary())
+    return llvm::make_unique<UnaryExprAST>(Opc, std::move(Operand));
+  return nullptr;
+}
+
+/// binoprhs
+///   ::= ('+' unary)*
+static std::unique_ptr<ExprAST> ParseBinOpRHS(int ExprPrec,
+                                              std::unique_ptr<ExprAST> LHS) {
+  // If this is a binop, find its precedence.
+  while (1) {
+    int TokPrec = GetTokPrecedence();
+    
+    // If this is a binop that binds at least as tightly as the current binop,
+    // consume it, otherwise we are done.
+    if (TokPrec < ExprPrec)
+      return LHS;
+    
+    // Okay, we know this is a binop.
+    int BinOp = CurTok;
+    getNextToken();  // eat binop
+    
+    // Parse the unary expression after the binary operator.
+    auto RHS = ParseUnary();
+    if (!RHS)
+      return nullptr;
+    
+    // If BinOp binds less tightly with RHS than the operator after RHS, let
+    // the pending operator take RHS as its LHS.
+    int NextPrec = GetTokPrecedence();
+    if (TokPrec < NextPrec) {
+      RHS = ParseBinOpRHS(TokPrec+1, std::move(RHS));
+      if (!RHS)
+        return nullptr;
+    }
+    
+    // Merge LHS/RHS.
+    LHS = llvm::make_unique<BinaryExprAST>(BinOp, std::move(LHS), std::move(RHS));
+  }
+}
+
+/// expression
+///   ::= unary binoprhs
+///
+static std::unique_ptr<ExprAST> ParseExpression() {
+  auto LHS = ParseUnary();
+  if (!LHS)
+    return nullptr;
+  
+  return ParseBinOpRHS(0, std::move(LHS));
+}
+
+/// prototype
+///   ::= id '(' id* ')'
+///   ::= binary LETTER number? (id, id)
+///   ::= unary LETTER (id)
+static std::unique_ptr<PrototypeAST> ParsePrototype() {
+  std::string FnName;
+  
+  unsigned Kind = 0; // 0 = identifier, 1 = unary, 2 = binary.
+  unsigned BinaryPrecedence = 30;
+  
+  switch (CurTok) {
+  default:
+    return ErrorU<PrototypeAST>("Expected function name in prototype");
+  case tok_identifier:
+    FnName = IdentifierStr;
+    Kind = 0;
+    getNextToken();
+    break;
+  case tok_unary:
+    getNextToken();
+    if (!isascii(CurTok))
+      return ErrorU<PrototypeAST>("Expected unary operator");
+    FnName = "unary";
+    FnName += (char)CurTok;
+    Kind = 1;
+    getNextToken();
+    break;
+  case tok_binary:
+    getNextToken();
+    if (!isascii(CurTok))
+      return ErrorU<PrototypeAST>("Expected binary operator");
+    FnName = "binary";
+    FnName += (char)CurTok;
+    Kind = 2;
+    getNextToken();
+    
+    // Read the precedence if present.
+    if (CurTok == tok_number) {
+      if (NumVal < 1 || NumVal > 100)
+        return ErrorU<PrototypeAST>("Invalid precedecnce: must be 1..100");
+      BinaryPrecedence = (unsigned)NumVal;
+      getNextToken();
+    }
+    break;
+  }
+  
+  if (CurTok != '(')
+    return ErrorU<PrototypeAST>("Expected '(' in prototype");
+  
+  std::vector<std::string> ArgNames;
+  while (getNextToken() == tok_identifier)
+    ArgNames.push_back(IdentifierStr);
+  if (CurTok != ')')
+    return ErrorU<PrototypeAST>("Expected ')' in prototype");
+  
+  // success.
+  getNextToken();  // eat ')'.
+  
+  // Verify right number of names for operator.
+  if (Kind && ArgNames.size() != Kind)
+    return ErrorU<PrototypeAST>("Invalid number of operands for operator");
+  
+  return llvm::make_unique<PrototypeAST>(FnName, std::move(ArgNames), Kind != 0,
+                                         BinaryPrecedence);
+}
+
+/// definition ::= 'def' prototype expression
+static std::unique_ptr<FunctionAST> ParseDefinition() {
+  getNextToken();  // eat def.
+  auto Proto = ParsePrototype();
+  if (!Proto)
+    return nullptr;
+
+  if (auto Body = ParseExpression())
+    return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(Body));
+  return nullptr;
+}
+
+/// toplevelexpr ::= expression
+static std::unique_ptr<FunctionAST> ParseTopLevelExpr() {
+  if (auto E = ParseExpression()) {
+    // Make an anonymous proto.
+    auto Proto =
+      llvm::make_unique<PrototypeAST>("__anon_expr", std::vector<std::string>());
+    return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(E));
+  }
+  return nullptr;
+}
+
+/// external ::= 'extern' prototype
+static std::unique_ptr<PrototypeAST> ParseExtern() {
+  getNextToken();  // eat extern.
+  return ParsePrototype();
+}
+
+//===----------------------------------------------------------------------===//
+// Code Generation
+//===----------------------------------------------------------------------===//
+
+// FIXME: Obviously we can do better than this
+std::string GenerateUniqueName(const std::string &Root) {
+  static int i = 0;
+  std::ostringstream NameStream;
+  NameStream << Root << ++i;
+  return NameStream.str();
+}
+
+std::string MakeLegalFunctionName(std::string Name)
+{
+  std::string NewName;
+  assert(!Name.empty() && "Base name must not be empty");
+
+  // Start with what we have
+  NewName = Name;
+
+  // Look for a numberic first character
+  if (NewName.find_first_of("0123456789") == 0) {
+    NewName.insert(0, 1, 'n');
+  }
+
+  // Replace illegal characters with their ASCII equivalent
+  std::string legal_elements = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+  size_t pos;
+  while ((pos = NewName.find_first_not_of(legal_elements)) != std::string::npos) {
+    std::ostringstream NumStream;
+    NumStream << (int)NewName.at(pos);
+    NewName = NewName.replace(pos, 1, NumStream.str());
+  }
+
+  return NewName;
+}
+
+class SessionContext {
+public:
+  SessionContext(LLVMContext &C)
+    : Context(C), TM(EngineBuilder().selectTarget()) {}
+  LLVMContext& getLLVMContext() const { return Context; }
+  TargetMachine& getTarget() { return *TM; }
+  void addPrototypeAST(std::unique_ptr<PrototypeAST> P);
+  PrototypeAST* getPrototypeAST(const std::string &Name);
+private:
+  typedef std::map<std::string, std::unique_ptr<PrototypeAST>> PrototypeMap;
+  
+  LLVMContext &Context;
+  std::unique_ptr<TargetMachine> TM;
+      
+  PrototypeMap Prototypes;
+};
+
+void SessionContext::addPrototypeAST(std::unique_ptr<PrototypeAST> P) {
+  Prototypes[P->Name] = std::move(P);
+}
+
+PrototypeAST* SessionContext::getPrototypeAST(const std::string &Name) {
+  PrototypeMap::iterator I = Prototypes.find(Name);
+  if (I != Prototypes.end())
+    return I->second.get();
+  return nullptr;
+}
+
+class IRGenContext {
+public:
+
+  IRGenContext(SessionContext &S)
+    : Session(S),
+      M(new Module(GenerateUniqueName("jit_module_"),
+                   Session.getLLVMContext())),
+      Builder(Session.getLLVMContext()) {
+    M->setDataLayout(Session.getTarget().getDataLayout());
+  }
+
+  SessionContext& getSession() { return Session; }
+  Module& getM() const { return *M; }
+  std::unique_ptr<Module> takeM() { return std::move(M); }
+  IRBuilder<>& getBuilder() { return Builder; }
+  LLVMContext& getLLVMContext() { return Session.getLLVMContext(); }
+  Function* getPrototype(const std::string &Name);
+
+  std::map<std::string, AllocaInst*> NamedValues;
+private:
+  SessionContext &Session;
+  std::unique_ptr<Module> M;
+  IRBuilder<> Builder;
+};
+
+Function* IRGenContext::getPrototype(const std::string &Name) {
+  if (Function *ExistingProto = M->getFunction(Name))
+    return ExistingProto;
+  if (PrototypeAST *ProtoAST = Session.getPrototypeAST(Name))
+    return ProtoAST->IRGen(*this);
+  return nullptr;
+}
+
+/// CreateEntryBlockAlloca - Create an alloca instruction in the entry block of
+/// the function.  This is used for mutable variables etc.
+static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction,
+                                          const std::string &VarName) {
+  IRBuilder<> TmpB(&TheFunction->getEntryBlock(),
+                 TheFunction->getEntryBlock().begin());
+  return TmpB.CreateAlloca(Type::getDoubleTy(getGlobalContext()), 0,
+                           VarName.c_str());
+}
+
+Value *NumberExprAST::IRGen(IRGenContext &C) const {
+  return ConstantFP::get(C.getLLVMContext(), APFloat(Val));
+}
+
+Value *VariableExprAST::IRGen(IRGenContext &C) const {
+  // Look this variable up in the function.
+  Value *V = C.NamedValues[Name];
+
+  if (V == 0)
+    return ErrorP<Value>("Unknown variable name '" + Name + "'");
+
+  // Load the value.
+  return C.getBuilder().CreateLoad(V, Name.c_str());
+}
+
+Value *UnaryExprAST::IRGen(IRGenContext &C) const {
+  if (Value *OperandV = Operand->IRGen(C)) {
+    std::string FnName = MakeLegalFunctionName(std::string("unary")+Opcode);
+    if (Function *F = C.getPrototype(FnName))
+      return C.getBuilder().CreateCall(F, OperandV, "unop");
+    return ErrorP<Value>("Unknown unary operator");
+  }
+
+  // Could not codegen operand - return null.
+  return nullptr;
+}
+
+Value *BinaryExprAST::IRGen(IRGenContext &C) const {
+  // Special case '=' because we don't want to emit the LHS as an expression.
+  if (Op == '=') {
+    // Assignment requires the LHS to be an identifier.
+    auto LHSVar = static_cast<VariableExprAST&>(*LHS);
+    // Codegen the RHS.
+    Value *Val = RHS->IRGen(C);
+    if (!Val) return nullptr;
+
+    // Look up the name.
+    if (auto Variable = C.NamedValues[LHSVar.Name]) {
+      C.getBuilder().CreateStore(Val, Variable);
+      return Val;
+    }
+    return ErrorP<Value>("Unknown variable name");
+  }
+  
+  Value *L = LHS->IRGen(C);
+  Value *R = RHS->IRGen(C);
+  if (!L || !R) return nullptr;
+  
+  switch (Op) {
+  case '+': return C.getBuilder().CreateFAdd(L, R, "addtmp");
+  case '-': return C.getBuilder().CreateFSub(L, R, "subtmp");
+  case '*': return C.getBuilder().CreateFMul(L, R, "multmp");
+  case '/': return C.getBuilder().CreateFDiv(L, R, "divtmp");
+  case '<':
+    L = C.getBuilder().CreateFCmpULT(L, R, "cmptmp");
+    // Convert bool 0/1 to double 0.0 or 1.0
+    return C.getBuilder().CreateUIToFP(L, Type::getDoubleTy(getGlobalContext()),
+                                "booltmp");
+  default: break;
+  }
+  
+  // If it wasn't a builtin binary operator, it must be a user defined one. Emit
+  // a call to it.
+  std::string FnName = MakeLegalFunctionName(std::string("binary")+Op);
+  if (Function *F = C.getPrototype(FnName)) {
+    Value *Ops[] = { L, R };
+    return C.getBuilder().CreateCall(F, Ops, "binop");
+  }
+   
+  return ErrorP<Value>("Unknown binary operator");
+}
+
+Value *CallExprAST::IRGen(IRGenContext &C) const {
+  // Look up the name in the global module table.
+  if (auto CalleeF = C.getPrototype(CalleeName)) {
+    // If argument mismatch error.
+    if (CalleeF->arg_size() != Args.size())
+      return ErrorP<Value>("Incorrect # arguments passed");
+
+    std::vector<Value*> ArgsV;
+    for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+      ArgsV.push_back(Args[i]->IRGen(C));
+      if (!ArgsV.back()) return nullptr;
+    }
+    
+    return C.getBuilder().CreateCall(CalleeF, ArgsV, "calltmp");
+  }
+
+  return ErrorP<Value>("Unknown function referenced");
+}
+
+Value *IfExprAST::IRGen(IRGenContext &C) const {
+  Value *CondV = Cond->IRGen(C);
+  if (!CondV) return nullptr;
+  
+  // Convert condition to a bool by comparing equal to 0.0.
+  ConstantFP *FPZero = 
+    ConstantFP::get(C.getLLVMContext(), APFloat(0.0));
+  CondV = C.getBuilder().CreateFCmpONE(CondV, FPZero, "ifcond");
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+  
+  // Create blocks for the then and else cases.  Insert the 'then' block at the
+  // end of the function.
+  BasicBlock *ThenBB = BasicBlock::Create(C.getLLVMContext(), "then", TheFunction);
+  BasicBlock *ElseBB = BasicBlock::Create(C.getLLVMContext(), "else");
+  BasicBlock *MergeBB = BasicBlock::Create(C.getLLVMContext(), "ifcont");
+  
+  C.getBuilder().CreateCondBr(CondV, ThenBB, ElseBB);
+  
+  // Emit then value.
+  C.getBuilder().SetInsertPoint(ThenBB);
+  
+  Value *ThenV = Then->IRGen(C);
+  if (!ThenV) return nullptr;
+  
+  C.getBuilder().CreateBr(MergeBB);
+  // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
+  ThenBB = C.getBuilder().GetInsertBlock();
+  
+  // Emit else block.
+  TheFunction->getBasicBlockList().push_back(ElseBB);
+  C.getBuilder().SetInsertPoint(ElseBB);
+  
+  Value *ElseV = Else->IRGen(C);
+  if (!ElseV) return nullptr;
+  
+  C.getBuilder().CreateBr(MergeBB);
+  // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
+  ElseBB = C.getBuilder().GetInsertBlock();
+  
+  // Emit merge block.
+  TheFunction->getBasicBlockList().push_back(MergeBB);
+  C.getBuilder().SetInsertPoint(MergeBB);
+  PHINode *PN = C.getBuilder().CreatePHI(Type::getDoubleTy(getGlobalContext()), 2,
+                                  "iftmp");
+  
+  PN->addIncoming(ThenV, ThenBB);
+  PN->addIncoming(ElseV, ElseBB);
+  return PN;
+}
+
+Value *ForExprAST::IRGen(IRGenContext &C) const {
+  // Output this as:
+  //   var = alloca double
+  //   ...
+  //   start = startexpr
+  //   store start -> var
+  //   goto loop
+  // loop: 
+  //   ...
+  //   bodyexpr
+  //   ...
+  // loopend:
+  //   step = stepexpr
+  //   endcond = endexpr
+  //
+  //   curvar = load var
+  //   nextvar = curvar + step
+  //   store nextvar -> var
+  //   br endcond, loop, endloop
+  // outloop:
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+
+  // Create an alloca for the variable in the entry block.
+  AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
+  
+  // Emit the start code first, without 'variable' in scope.
+  Value *StartVal = Start->IRGen(C);
+  if (!StartVal) return nullptr;
+  
+  // Store the value into the alloca.
+  C.getBuilder().CreateStore(StartVal, Alloca);
+  
+  // Make the new basic block for the loop header, inserting after current
+  // block.
+  BasicBlock *LoopBB = BasicBlock::Create(getGlobalContext(), "loop", TheFunction);
+  
+  // Insert an explicit fall through from the current block to the LoopBB.
+  C.getBuilder().CreateBr(LoopBB);
+
+  // Start insertion in LoopBB.
+  C.getBuilder().SetInsertPoint(LoopBB);
+  
+  // Within the loop, the variable is defined equal to the PHI node.  If it
+  // shadows an existing variable, we have to restore it, so save it now.
+  AllocaInst *OldVal = C.NamedValues[VarName];
+  C.NamedValues[VarName] = Alloca;
+  
+  // Emit the body of the loop.  This, like any other expr, can change the
+  // current BB.  Note that we ignore the value computed by the body, but don't
+  // allow an error.
+  if (!Body->IRGen(C))
+    return nullptr;
+  
+  // Emit the step value.
+  Value *StepVal;
+  if (Step) {
+    StepVal = Step->IRGen(C);
+    if (!StepVal) return nullptr;
+  } else {
+    // If not specified, use 1.0.
+    StepVal = ConstantFP::get(getGlobalContext(), APFloat(1.0));
+  }
+  
+  // Compute the end condition.
+  Value *EndCond = End->IRGen(C);
+  if (EndCond == 0) return EndCond;
+  
+  // Reload, increment, and restore the alloca.  This handles the case where
+  // the body of the loop mutates the variable.
+  Value *CurVar = C.getBuilder().CreateLoad(Alloca, VarName.c_str());
+  Value *NextVar = C.getBuilder().CreateFAdd(CurVar, StepVal, "nextvar");
+  C.getBuilder().CreateStore(NextVar, Alloca);
+  
+  // Convert condition to a bool by comparing equal to 0.0.
+  EndCond = C.getBuilder().CreateFCmpONE(EndCond, 
+                              ConstantFP::get(getGlobalContext(), APFloat(0.0)),
+                                  "loopcond");
+  
+  // Create the "after loop" block and insert it.
+  BasicBlock *AfterBB = BasicBlock::Create(getGlobalContext(), "afterloop", TheFunction);
+  
+  // Insert the conditional branch into the end of LoopEndBB.
+  C.getBuilder().CreateCondBr(EndCond, LoopBB, AfterBB);
+  
+  // Any new code will be inserted in AfterBB.
+  C.getBuilder().SetInsertPoint(AfterBB);
+  
+  // Restore the unshadowed variable.
+  if (OldVal)
+    C.NamedValues[VarName] = OldVal;
+  else
+    C.NamedValues.erase(VarName);
+
+  
+  // for expr always returns 0.0.
+  return Constant::getNullValue(Type::getDoubleTy(getGlobalContext()));
+}
+
+Value *VarExprAST::IRGen(IRGenContext &C) const {
+  std::vector<AllocaInst *> OldBindings;
+  
+  Function *TheFunction = C.getBuilder().GetInsertBlock()->getParent();
+
+  // Register all variables and emit their initializer.
+  for (unsigned i = 0, e = VarBindings.size(); i != e; ++i) {
+    auto &VarName = VarBindings[i].first;
+    auto &Init = VarBindings[i].second;
+    
+    // Emit the initializer before adding the variable to scope, this prevents
+    // the initializer from referencing the variable itself, and permits stuff
+    // like this:
+    //  var a = 1 in
+    //    var a = a in ...   # refers to outer 'a'.
+    Value *InitVal;
+    if (Init) {
+      InitVal = Init->IRGen(C);
+      if (!InitVal) return nullptr;
+    } else // If not specified, use 0.0.
+      InitVal = ConstantFP::get(getGlobalContext(), APFloat(0.0));
+    
+    AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
+    C.getBuilder().CreateStore(InitVal, Alloca);
+
+    // Remember the old variable binding so that we can restore the binding when
+    // we unrecurse.
+    OldBindings.push_back(C.NamedValues[VarName]);
+    
+    // Remember this binding.
+    C.NamedValues[VarName] = Alloca;
+  }
+  
+  // Codegen the body, now that all vars are in scope.
+  Value *BodyVal = Body->IRGen(C);
+  if (!BodyVal) return nullptr;
+  
+  // Pop all our variables from scope.
+  for (unsigned i = 0, e = VarBindings.size(); i != e; ++i)
+    C.NamedValues[VarBindings[i].first] = OldBindings[i];
+
+  // Return the body computation.
+  return BodyVal;
+}
+
+Function *PrototypeAST::IRGen(IRGenContext &C) const {
+  std::string FnName = MakeLegalFunctionName(Name);
+
+  // Make the function type:  double(double,double) etc.
+  std::vector<Type*> Doubles(Args.size(), 
+                             Type::getDoubleTy(getGlobalContext()));
+  FunctionType *FT = FunctionType::get(Type::getDoubleTy(getGlobalContext()),
+                                       Doubles, false);
+  Function *F = Function::Create(FT, Function::ExternalLinkage, FnName,
+                                 &C.getM());
+
+  // If F conflicted, there was already something named 'FnName'.  If it has a
+  // body, don't allow redefinition or reextern.
+  if (F->getName() != FnName) {
+    // Delete the one we just made and get the existing one.
+    F->eraseFromParent();
+    F = C.getM().getFunction(Name);
+    
+    // If F already has a body, reject this.
+    if (!F->empty()) {
+      ErrorP<Function>("redefinition of function");
+      return nullptr;
+    }
+    
+    // If F took a different number of args, reject.
+    if (F->arg_size() != Args.size()) {
+      ErrorP<Function>("redefinition of function with different # args");
+      return nullptr;
+    }
+  }
+  
+  // Set names for all arguments.
+  unsigned Idx = 0;
+  for (Function::arg_iterator AI = F->arg_begin(); Idx != Args.size();
+       ++AI, ++Idx)
+    AI->setName(Args[Idx]);
+    
+  return F;
+}
+
+/// CreateArgumentAllocas - Create an alloca for each argument and register the
+/// argument in the symbol table so that references to it will succeed.
+void PrototypeAST::CreateArgumentAllocas(Function *F, IRGenContext &C) {
+  Function::arg_iterator AI = F->arg_begin();
+  for (unsigned Idx = 0, e = Args.size(); Idx != e; ++Idx, ++AI) {
+    // Create an alloca for this variable.
+    AllocaInst *Alloca = CreateEntryBlockAlloca(F, Args[Idx]);
+
+    // Store the initial value into the alloca.
+    C.getBuilder().CreateStore(AI, Alloca);
+
+    // Add arguments to variable symbol table.
+    C.NamedValues[Args[Idx]] = Alloca;
+  }
+}
+
+Function *FunctionAST::IRGen(IRGenContext &C) const {
+  C.NamedValues.clear();
+  
+  Function *TheFunction = Proto->IRGen(C);
+  if (!TheFunction)
+    return nullptr;
+  
+  // If this is an operator, install it.
+  if (Proto->isBinaryOp())
+    BinopPrecedence[Proto->getOperatorName()] = Proto->Precedence;
+  
+  // Create a new basic block to start insertion into.
+  BasicBlock *BB = BasicBlock::Create(getGlobalContext(), "entry", TheFunction);
+  C.getBuilder().SetInsertPoint(BB);
+  
+  // Add all arguments to the symbol table and create their allocas.
+  Proto->CreateArgumentAllocas(TheFunction, C);
+
+  if (Value *RetVal = Body->IRGen(C)) {
+    // Finish off the function.
+    C.getBuilder().CreateRet(RetVal);
+
+    // Validate the generated code, checking for consistency.
+    verifyFunction(*TheFunction);
+
+    return TheFunction;
+  }
+  
+  // Error reading body, remove function.
+  TheFunction->eraseFromParent();
+
+  if (Proto->isBinaryOp())
+    BinopPrecedence.erase(Proto->getOperatorName());
+  return nullptr;
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Level parsing and JIT Driver
+//===----------------------------------------------------------------------===//
+
+static std::unique_ptr<llvm::Module> IRGen(SessionContext &S,
+                                           const FunctionAST &F) {
+  IRGenContext C(S);
+  auto LF = F.IRGen(C);
+  if (!LF)
+    return nullptr;
+#ifndef MINIMAL_STDERR_OUTPUT
+  fprintf(stderr, "Read function definition:");
+  LF->dump();
+#endif
+  return C.takeM();
+}
+
+template <typename T>
+static std::vector<T> singletonSet(T t) {
+  std::vector<T> Vec;
+  Vec.push_back(std::move(t));
+  return Vec;
+}
+
+class KaleidoscopeJIT {
+public:
+  typedef ObjectLinkingLayer<> ObjLayerT;
+  typedef IRCompileLayer<ObjLayerT> CompileLayerT;
+  typedef LazyEmittingLayer<CompileLayerT> LazyEmitLayerT;
+  typedef LazyEmitLayerT::ModuleSetHandleT ModuleHandleT;
+
+  KaleidoscopeJIT(SessionContext &Session)
+    : Session(Session),
+      Mang(Session.getTarget().getDataLayout()),
+      CompileLayer(ObjectLayer, SimpleCompiler(Session.getTarget())),
+      LazyEmitLayer(CompileLayer) {}
+
+  std::string mangle(const std::string &Name) {
+    std::string MangledName;
+    {
+      raw_string_ostream MangledNameStream(MangledName);
+      Mang.getNameWithPrefix(MangledNameStream, Name);
+    }
+    return MangledName;
+  }
+
+  void addFunctionDefinition(std::unique_ptr<FunctionAST> FnAST) {
+    FunctionDefs[mangle(FnAST->Proto->Name)] = std::move(FnAST);
+  }
+
+  ModuleHandleT addModule(std::unique_ptr<Module> M) {
+    // We need a memory manager to allocate memory and resolve symbols for this
+    // new module. Create one that resolves symbols by looking back into the
+    // JIT.
+    auto MM = createLookasideRTDyldMM<SectionMemoryManager>(
+                [&](const std::string &Name) {
+                  // First try to find 'Name' within the JIT.
+                  if (auto Symbol = findSymbol(Name))
+                    return Symbol.getAddress();
+
+                  // If we don't already have a definition of 'Name' then search
+                  // the ASTs.
+                  return searchUncompiledASTs(Name);
+                },
+                [](const std::string &S) { return 0; } );
+
+    return LazyEmitLayer.addModuleSet(singletonSet(std::move(M)),
+                                      std::move(MM));
+  }
+
+  void removeModule(ModuleHandleT H) { LazyEmitLayer.removeModuleSet(H); }
+
+  JITSymbol findSymbol(const std::string &Name) {
+    return LazyEmitLayer.findSymbol(Name, true);
+  }
+
+  JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name) {
+    return LazyEmitLayer.findSymbolIn(H, Name, true);
+  }
+
+  JITSymbol findUnmangledSymbol(const std::string &Name) {
+    return findSymbol(mangle(Name));
+  }
+
+private:
+
+  // This method searches the FunctionDefs map for a definition of 'Name'. If it
+  // finds one it generates a stub for it and returns the address of the stub.
+  TargetAddress searchUncompiledASTs(const std::string &Name) {
+    auto DefI = FunctionDefs.find(Name);
+    if (DefI == FunctionDefs.end())
+      return 0;
+
+    // We have AST for 'Name'. IRGen it, add it to the JIT, and
+    // return the address for it.
+    // FIXME: What happens if IRGen fails?
+    auto H = addModule(IRGen(Session, *DefI->second));
+
+    // Remove the function definition's AST now that we're
+    // finished with it.
+    FunctionDefs.erase(DefI);
+
+    // Return the address of the function.
+    return findSymbolIn(H, Name).getAddress();
+  }
+
+  SessionContext &Session;
+  Mangler Mang;
+  ObjLayerT ObjectLayer;
+  CompileLayerT CompileLayer;
+  LazyEmitLayerT LazyEmitLayer;
+
+  std::map<std::string, std::unique_ptr<FunctionAST>> FunctionDefs;
+};
+
+static void HandleDefinition(SessionContext &S, KaleidoscopeJIT &J) {
+  if (auto F = ParseDefinition()) {
+    S.addPrototypeAST(llvm::make_unique<PrototypeAST>(*F->Proto));
+    J.addFunctionDefinition(std::move(F));
+  } else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+static void HandleExtern(SessionContext &S) {
+  if (auto P = ParseExtern())
+    S.addPrototypeAST(std::move(P));
+  else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+static void HandleTopLevelExpression(SessionContext &S, KaleidoscopeJIT &J) {
+  // Evaluate a top-level expression into an anonymous function.
+  if (auto F = ParseTopLevelExpr()) {
+    IRGenContext C(S);
+    if (auto ExprFunc = F->IRGen(C)) {
+#ifndef MINIMAL_STDERR_OUTPUT
+      std::cerr << "Expression function:\n";
+      ExprFunc->dump();
+#endif
+      // Add the CodeGen'd module to the JIT. Keep a handle to it: We can remove
+      // this module as soon as we've executed Function ExprFunc.
+      auto H = J.addModule(C.takeM());
+
+      // Get the address of the JIT'd function in memory.
+      auto ExprSymbol = J.findUnmangledSymbol("__anon_expr");
+      
+      // Cast it to the right type (takes no arguments, returns a double) so we
+      // can call it as a native function.
+      double (*FP)() = (double (*)())(intptr_t)ExprSymbol.getAddress();
+#ifdef MINIMAL_STDERR_OUTPUT
+      FP();
+#else
+      std::cerr << "Evaluated to " << FP() << "\n";
+#endif
+
+      // Remove the function.
+      J.removeModule(H);
+    }
+  } else {
+    // Skip token for error recovery.
+    getNextToken();
+  }
+}
+
+/// top ::= definition | external | expression | ';'
+static void MainLoop() {
+  SessionContext S(getGlobalContext());
+  KaleidoscopeJIT J(S);
+
+  while (1) {
+    switch (CurTok) {
+    case tok_eof:    return;
+    case ';':        getNextToken(); continue;  // ignore top-level semicolons.
+    case tok_def:    HandleDefinition(S, J); break;
+    case tok_extern: HandleExtern(S); break;
+    default:         HandleTopLevelExpression(S, J); break;
+    }
+#ifndef MINIMAL_STDERR_OUTPUT
+    std::cerr << "ready> ";
+#endif
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// "Library" functions that can be "extern'd" from user code.
+//===----------------------------------------------------------------------===//
+
+/// putchard - putchar that takes a double and returns 0.
+extern "C" 
+double putchard(double X) {
+  putchar((char)X);
+  return 0;
+}
+
+/// printd - printf that takes a double prints it as "%f\n", returning 0.
+extern "C" 
+double printd(double X) {
+  printf("%f", X);
+  return 0;
+}
+
+extern "C" 
+double printlf() {
+  printf("\n");
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Main driver code.
+//===----------------------------------------------------------------------===//
+
+int main() {
+  InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  InitializeNativeTargetAsmParser();
+
+  // Install standard binary operators.
+  // 1 is lowest precedence.
+  BinopPrecedence['='] = 2;
+  BinopPrecedence['<'] = 10;
+  BinopPrecedence['+'] = 20;
+  BinopPrecedence['-'] = 20;
+  BinopPrecedence['/'] = 40;
+  BinopPrecedence['*'] = 40;  // highest.
+
+  // Prime the first token.
+#ifndef MINIMAL_STDERR_OUTPUT
+  std::cerr << "ready> ";
+#endif
+  getNextToken();
+
+  std::cerr << std::fixed;
+
+  // Run the main "interpreter loop" now.
+  MainLoop();
+
+  return 0;
+}
+
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 30c7595..8873fdb 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -1157,8 +1157,6 @@ LLVMTypeRef LLVMX86MMXType(void);
   macro(Argument)                           \
   macro(BasicBlock)                         \
   macro(InlineAsm)                          \
-  macro(MDNode)                             \
-  macro(MDString)                           \
   macro(User)                               \
     macro(Constant)                         \
       macro(BlockAddress)                   \
@@ -1307,6 +1305,9 @@ LLVMBool LLVMIsUndef(LLVMValueRef Val);
   LLVMValueRef LLVMIsA##name(LLVMValueRef Val);
 LLVM_FOR_EACH_VALUE_SUBCLASS(LLVM_DECLARE_VALUE_CAST)
 
+LLVMValueRef LLVMIsAMDNode(LLVMValueRef Val);
+LLVMValueRef LLVMIsAMDString(LLVMValueRef Val);
+
 /**
  * @}
  */
diff --git a/include/llvm-c/ExecutionEngine.h b/include/llvm-c/ExecutionEngine.h
index f1f4cad..eb3ecab 100644
--- a/include/llvm-c/ExecutionEngine.h
+++ b/include/llvm-c/ExecutionEngine.h
@@ -170,6 +170,10 @@ void LLVMAddGlobalMapping(LLVMExecutionEngineRef EE, LLVMValueRef Global,
 
 void *LLVMGetPointerToGlobal(LLVMExecutionEngineRef EE, LLVMValueRef Global);
 
+uint64_t LLVMGetGlobalValueAddress(LLVMExecutionEngineRef EE, const char *Name);
+
+uint64_t LLVMGetFunctionAddress(LLVMExecutionEngineRef EE, const char *Name);
+
 /*===-- Operations on memory managers -------------------------------------===*/
 
 typedef uint8_t *(*LLVMMemoryManagerAllocateCodeSectionCallback)(
diff --git a/include/llvm-c/Linker.h b/include/llvm-c/Linker.h
index 9f337cf..cedde5e 100644
--- a/include/llvm-c/Linker.h
+++ b/include/llvm-c/Linker.h
@@ -20,20 +20,13 @@
 extern "C" {
 #endif
 
-
-typedef enum {
-  LLVMLinkerDestroySource = 0, /* Allow source module to be destroyed. */
-  LLVMLinkerPreserveSource = 1 /* Preserve the source module. */
-} LLVMLinkerMode;
-
-
 /* Links the source module into the destination module, taking ownership
  * of the source module away from the caller. Optionally returns a
  * human-readable description of any errors that occurred in linking.
  * OutMessage must be disposed with LLVMDisposeMessage. The return value
  * is true if an error occurred, false otherwise. */
 LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src,
-                         LLVMLinkerMode Mode, char **OutMessage);
+                         unsigned Unused, char **OutMessage);
 
 #ifdef __cplusplus
 }
diff --git a/include/llvm-c/Transforms/Scalar.h b/include/llvm-c/Transforms/Scalar.h
index 7ad1ad1..48c19a6 100644
--- a/include/llvm-c/Transforms/Scalar.h
+++ b/include/llvm-c/Transforms/Scalar.h
@@ -35,6 +35,9 @@ extern "C" {
 /** See llvm::createAggressiveDCEPass function. */
 void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM);
 
+/** See llvm::createBitTrackingDCEPass function. */
+void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM);
+
 /** See llvm::createAlignmentFromAssumptionsPass function. */
 void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM);
 
diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h
index 3f30d6d..c6acdad 100644
--- a/include/llvm-c/lto.h
+++ b/include/llvm-c/lto.h
@@ -40,7 +40,7 @@ typedef bool lto_bool_t;
  * @{
  */
 
-#define LTO_API_VERSION 11
+#define LTO_API_VERSION 13
 
 /**
  * \since prior to LTO_API_VERSION=3
@@ -396,6 +396,17 @@ extern lto_bool_t
 lto_codegen_add_module(lto_code_gen_t cg, lto_module_t mod);
 
 /**
+ * Sets the object module for code generation. This will transfer the ownship of
+ * the module to code generator.
+ *
+ * \c cg and \c mod must both be in the same context.
+ *
+ * \since prior to LTO_API_VERSION=13
+ */
+extern void
+lto_codegen_set_module(lto_code_gen_t cg, lto_module_t mod);
+
+/**
  * Sets if debug info should be generated.
  * Returns true on error (check lto_get_error_message() for details).
  *
@@ -464,6 +475,8 @@ lto_codegen_write_merged_modules(lto_code_gen_t cg, const char* path);
 
 /**
  * Generates code for all added modules into one native object file.
+ * This calls lto_codegen_optimize then lto_codegen_compile_optimized.
+ *
  * On success returns a pointer to a generated mach-o/ELF buffer and
  * length set to the buffer size.  The buffer is owned by the
  * lto_code_gen_t and will be freed when lto_codegen_dispose()
@@ -477,6 +490,9 @@ lto_codegen_compile(lto_code_gen_t cg, size_t* length);
 
 /**
  * Generates code for all added modules into one native object file.
+ * This calls lto_codegen_optimize then lto_codegen_compile_optimized (instead
+ * of returning a generated mach-o/ELF buffer, it writes to a file).
+ *
  * The name of the file is written to name. Returns true on error.
  *
  * \since LTO_API_VERSION=5
@@ -484,6 +500,36 @@ lto_codegen_compile(lto_code_gen_t cg, size_t* length);
 extern lto_bool_t
 lto_codegen_compile_to_file(lto_code_gen_t cg, const char** name);
 
+/**
+ * Runs optimization for the merged module. Returns true on error.
+ *
+ * \since LTO_API_VERSION=12
+ */
+extern lto_bool_t
+lto_codegen_optimize(lto_code_gen_t cg);
+
+/**
+ * Generates code for the optimized merged module into one native object file.
+ * It will not run any IR optimizations on the merged module.
+ *
+ * On success returns a pointer to a generated mach-o/ELF buffer and length set
+ * to the buffer size.  The buffer is owned by the lto_code_gen_t and will be
+ * freed when lto_codegen_dispose() is called, or
+ * lto_codegen_compile_optimized() is called again. On failure, returns NULL
+ * (check lto_get_error_message() for details).
+ *
+ * \since LTO_API_VERSION=12
+ */
+extern const void*
+lto_codegen_compile_optimized(lto_code_gen_t cg, size_t* length);
+
+/**
+ * Returns the runtime API version.
+ *
+ * \since LTO_API_VERSION=12
+ */
+extern unsigned int
+lto_api_version(void);
 
 /**
  * Sets options to help debug codegen bugs.
diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h
index 26aae77..bf814e0 100644
--- a/include/llvm/ADT/APFloat.h
+++ b/include/llvm/ADT/APFloat.h
@@ -376,7 +376,7 @@ public:
   /// The definition of equality is not straightforward for floating point, so
   /// we won't use operator==.  Use one of the following, or write whatever it
   /// is you really mean.
-  bool operator==(const APFloat &) const LLVM_DELETED_FUNCTION;
+  bool operator==(const APFloat &) const = delete;
 
   /// IEEE comparison with another floating point number (NaNs compare
   /// unordered, 0==-0).
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index f4e7e3c..c2fe094 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -91,6 +91,8 @@ class APInt {
     APINT_WORD_SIZE = static_cast<unsigned int>(sizeof(uint64_t))
   };
 
+  friend struct DenseMapAPIntKeyInfo;
+
   /// \brief Fast internal constructor
   ///
   /// This constructor is used only internally for speed of construction of
@@ -277,7 +279,6 @@ public:
   /// Simply makes *this a copy of that.
   /// @brief Copy Constructor.
   APInt(const APInt &that) : BitWidth(that.BitWidth), VAL(0) {
-    assert(BitWidth && "bitwidth too small");
     if (isSingleWord())
       VAL = that.VAL;
     else
@@ -1355,7 +1356,7 @@ public:
 
   /// \brief Count the number of leading one bits.
   ///
-  /// This function is an APInt version of the countLeadingOnes_{32,64}
+  /// This function is an APInt version of the countLeadingOnes
   /// functions in MathExtras.h. It counts the number of ones from the most
   /// significant bit to the first zero bit.
   ///
@@ -1371,7 +1372,7 @@ public:
 
   /// \brief Count the number of trailing zero bits.
   ///
-  /// This function is an APInt version of the countTrailingZeros_{32,64}
+  /// This function is an APInt version of the countTrailingZeros
   /// functions in MathExtras.h. It counts the number of zeros from the least
   /// significant bit to the first set bit.
   ///
@@ -1381,7 +1382,7 @@ public:
 
   /// \brief Count the number of trailing one bits.
   ///
-  /// This function is an APInt version of the countTrailingOnes_{32,64}
+  /// This function is an APInt version of the countTrailingOnes
   /// functions in MathExtras.h. It counts the number of ones from the least
   /// significant bit to the first zero bit.
   ///
@@ -1389,19 +1390,19 @@ public:
   /// of ones from the least significant bit to the first zero bit.
   unsigned countTrailingOnes() const {
     if (isSingleWord())
-      return CountTrailingOnes_64(VAL);
+      return llvm::countTrailingOnes(VAL);
     return countTrailingOnesSlowCase();
   }
 
   /// \brief Count the number of bits set.
   ///
-  /// This function is an APInt version of the countPopulation_{32,64} functions
+  /// This function is an APInt version of the countPopulation functions
   /// in MathExtras.h. It counts the number of 1 bits in the APInt value.
   ///
   /// \returns 0 if the value is zero, otherwise returns the number of set bits.
   unsigned countPopulation() const {
     if (isSingleWord())
-      return CountPopulation_64(VAL);
+      return llvm::countPopulation(VAL);
     return countPopulationSlowCase();
   }
 
diff --git a/include/llvm/ADT/APSInt.h b/include/llvm/ADT/APSInt.h
index a6693f7..91ccda2 100644
--- a/include/llvm/ADT/APSInt.h
+++ b/include/llvm/ADT/APSInt.h
@@ -62,6 +62,12 @@ public:
   }
   using APInt::toString;
 
+  /// \brief Get the correctly-extended \c int64_t value.
+  int64_t getExtValue() const {
+    assert(getMinSignedBits() <= 64 && "Too many bits for int64_t");
+    return isSigned() ? getSExtValue() : getZExtValue();
+  }
+
   APSInt LLVM_ATTRIBUTE_UNUSED_RESULT trunc(uint32_t width) const {
     return APSInt(APInt::trunc(width), IsUnsigned);
   }
@@ -133,14 +139,27 @@ public:
     assert(IsUnsigned == RHS.IsUnsigned && "Signedness mismatch!");
     return eq(RHS);
   }
-  inline bool operator==(int64_t RHS) const {
-    return isSameValue(*this, APSInt(APInt(64, RHS), true));
-  }
   inline bool operator!=(const APSInt& RHS) const {
     return !((*this) == RHS);
   }
-  inline bool operator!=(int64_t RHS) const {
-    return !((*this) == RHS);
+
+  bool operator==(int64_t RHS) const {
+    return compareValues(*this, get(RHS)) == 0;
+  }
+  bool operator!=(int64_t RHS) const {
+    return compareValues(*this, get(RHS)) != 0;
+  }
+  bool operator<=(int64_t RHS) const {
+    return compareValues(*this, get(RHS)) <= 0;
+  }
+  bool operator>=(int64_t RHS) const {
+    return compareValues(*this, get(RHS)) >= 0;
+  }
+  bool operator<(int64_t RHS) const {
+    return compareValues(*this, get(RHS)) < 0;
+  }
+  bool operator>(int64_t RHS) const {
+    return compareValues(*this, get(RHS)) > 0;
   }
 
   // The remaining operators just wrap the logic of APInt, but retain the
@@ -260,37 +279,49 @@ public:
   /// \brief Determine if two APSInts have the same value, zero- or
   /// sign-extending as needed.  
   static bool isSameValue(const APSInt &I1, const APSInt &I2) {
+    return !compareValues(I1, I2);
+  }
+
+  /// \brief Compare underlying values of two numbers.
+  static int compareValues(const APSInt &I1, const APSInt &I2) {
     if (I1.getBitWidth() == I2.getBitWidth() && I1.isSigned() == I2.isSigned())
-      return I1 == I2;
+      return I1 == I2 ? 0 : I1 > I2 ? 1 : -1;
 
     // Check for a bit-width mismatch.
     if (I1.getBitWidth() > I2.getBitWidth())
-      return isSameValue(I1, I2.extend(I1.getBitWidth()));
+      return compareValues(I1, I2.extend(I1.getBitWidth()));
     else if (I2.getBitWidth() > I1.getBitWidth())
-      return isSameValue(I1.extend(I2.getBitWidth()), I2);
-
-    assert(I1.isSigned() != I2.isSigned());
+      return compareValues(I1.extend(I2.getBitWidth()), I2);
 
     // We have a signedness mismatch. Check for negative values and do an
-    // unsigned compare if signs match.
-    if ((I1.isSigned() && I1.isNegative()) ||
-        (!I1.isSigned() && I2.isNegative()))
-      return false;
+    // unsigned compare if both are positive.
+    if (I1.isSigned()) {
+      assert(!I2.isSigned() && "Expected signed mismatch");
+      if (I1.isNegative())
+        return -1;
+    } else {
+      assert(I2.isSigned() && "Expected signed mismatch");
+      if (I2.isNegative())
+        return 1;
+    }
 
-    return I1.eq(I2);
+    return I1.eq(I2) ? 0 : I1.ugt(I2) ? 1 : -1;
   }
 
+  static APSInt get(int64_t X) { return APSInt(APInt(64, X), false); }
+  static APSInt getUnsigned(uint64_t X) { return APSInt(APInt(64, X), true); }
+
   /// Profile - Used to insert APSInt objects, or objects that contain APSInt
   ///  objects, into FoldingSets.
   void Profile(FoldingSetNodeID& ID) const;
 };
 
-inline bool operator==(int64_t V1, const APSInt& V2) {
-  return V2 == V1;
-}
-inline bool operator!=(int64_t V1, const APSInt& V2) {
-  return V2 != V1;
-}
+inline bool operator==(int64_t V1, const APSInt &V2) { return V2 == V1; }
+inline bool operator!=(int64_t V1, const APSInt &V2) { return V2 != V1; }
+inline bool operator<=(int64_t V1, const APSInt &V2) { return V2 >= V1; }
+inline bool operator>=(int64_t V1, const APSInt &V2) { return V2 <= V1; }
+inline bool operator<(int64_t V1, const APSInt &V2) { return V2 > V1; }
+inline bool operator>(int64_t V1, const APSInt &V2) { return V2 < V1; }
 
 inline raw_ostream &operator<<(raw_ostream &OS, const APSInt &I) {
   I.print(OS, I.isSigned());
diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h
index 8c14a42..5b7ed9c 100644
--- a/include/llvm/ADT/ArrayRef.h
+++ b/include/llvm/ADT/ArrayRef.h
@@ -97,12 +97,10 @@ namespace llvm {
     /*implicit*/ LLVM_CONSTEXPR ArrayRef(const T (&Arr)[N])
       : Data(Arr), Length(N) {}
 
-#if LLVM_HAS_INITIALIZER_LISTS
     /// Construct an ArrayRef from a std::initializer_list.
     /*implicit*/ ArrayRef(const std::initializer_list<T> &Vec)
     : Data(Vec.begin() == Vec.end() ? (T*)0 : Vec.begin()),
       Length(Vec.size()) {}
-#endif
 
     /// Construct an ArrayRef<const T*> from ArrayRef<T*>. This uses SFINAE to
     /// ensure that only ArrayRefs of pointers can be converted.
diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h
index 34e2284..0dbe810 100644
--- a/include/llvm/ADT/BitVector.h
+++ b/include/llvm/ADT/BitVector.h
@@ -29,6 +29,9 @@ class BitVector {
 
   enum { BITWORD_SIZE = (unsigned)sizeof(BitWord) * CHAR_BIT };
 
+  static_assert(BITWORD_SIZE == 64 || BITWORD_SIZE == 32,
+                "Unsupported word size");
+
   BitWord  *Bits;        // Actual bits.
   unsigned Size;         // Size of bitvector in bits.
   unsigned Capacity;     // Size of allocated memory in BitWord.
@@ -118,12 +121,7 @@ public:
   size_type count() const {
     unsigned NumBits = 0;
     for (unsigned i = 0; i < NumBitWords(size()); ++i)
-      if (sizeof(BitWord) == 4)
-        NumBits += CountPopulation_32((uint32_t)Bits[i]);
-      else if (sizeof(BitWord) == 8)
-        NumBits += CountPopulation_64(Bits[i]);
-      else
-        llvm_unreachable("Unsupported!");
+      NumBits += countPopulation(Bits[i]);
     return NumBits;
   }
 
@@ -157,13 +155,8 @@ public:
   /// of the bits are set.
   int find_first() const {
     for (unsigned i = 0; i < NumBitWords(size()); ++i)
-      if (Bits[i] != 0) {
-        if (sizeof(BitWord) == 4)
-          return i * BITWORD_SIZE + countTrailingZeros((uint32_t)Bits[i]);
-        if (sizeof(BitWord) == 8)
-          return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
-        llvm_unreachable("Unsupported!");
-      }
+      if (Bits[i] != 0)
+        return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
     return -1;
   }
 
@@ -180,23 +173,13 @@ public:
     // Mask off previous bits.
     Copy &= ~0UL << BitPos;
 
-    if (Copy != 0) {
-      if (sizeof(BitWord) == 4)
-        return WordPos * BITWORD_SIZE + countTrailingZeros((uint32_t)Copy);
-      if (sizeof(BitWord) == 8)
-        return WordPos * BITWORD_SIZE + countTrailingZeros(Copy);
-      llvm_unreachable("Unsupported!");
-    }
+    if (Copy != 0)
+      return WordPos * BITWORD_SIZE + countTrailingZeros(Copy);
 
     // Check subsequent words.
     for (unsigned i = WordPos+1; i < NumBitWords(size()); ++i)
-      if (Bits[i] != 0) {
-        if (sizeof(BitWord) == 4)
-          return i * BITWORD_SIZE + countTrailingZeros((uint32_t)Bits[i]);
-        if (sizeof(BitWord) == 8)
-          return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
-        llvm_unreachable("Unsupported!");
-      }
+      if (Bits[i] != 0)
+        return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
     return -1;
   }
 
@@ -239,6 +222,7 @@ public:
   }
 
   BitVector &set(unsigned Idx) {
+    assert(Bits && "Bits never allocated");
     Bits[Idx / BITWORD_SIZE] |= BitWord(1) << (Idx % BITWORD_SIZE);
     return *this;
   }
@@ -450,6 +434,7 @@ public:
 
     // Grow the bitvector to have enough elements.
     Capacity = RHSWords;
+    assert(Capacity > 0 && "negative capacity?");
     BitWord *NewBits = (BitWord *)std::malloc(Capacity * sizeof(BitWord));
     std::memcpy(NewBits, RHS.Bits, Capacity * sizeof(BitWord));
 
@@ -545,6 +530,7 @@ private:
 
   void grow(unsigned NewSize) {
     Capacity = std::max(NumBitWords(NewSize), Capacity * 2);
+    assert(Capacity > 0 && "realloc-ing zero space");
     Bits = (BitWord *)std::realloc(Bits, Capacity * sizeof(BitWord));
 
     clear_unused_bits();
@@ -556,7 +542,7 @@ private:
 
   template<bool AddBits, bool InvertMask>
   void applyMask(const uint32_t *Mask, unsigned MaskWords) {
-    assert(BITWORD_SIZE % 32 == 0 && "Unsupported BitWord size.");
+    static_assert(BITWORD_SIZE % 32 == 0, "Unsupported BitWord size.");
     MaskWords = std::min(MaskWords, (size() + 31) / 32);
     const unsigned Scale = BITWORD_SIZE / 32;
     unsigned i;
diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index c44b67a..1699ea3 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -31,26 +31,35 @@
 
 namespace llvm {
 
-template<typename KeyT, typename ValueT,
-         typename KeyInfoT = DenseMapInfo<KeyT>,
-         bool IsConst = false>
+namespace detail {
+// We extend a pair to allow users to override the bucket type with their own
+// implementation without requiring two members.
+template <typename KeyT, typename ValueT>
+struct DenseMapPair : public std::pair<KeyT, ValueT> {
+  KeyT &getFirst() { return std::pair<KeyT, ValueT>::first; }
+  const KeyT &getFirst() const { return std::pair<KeyT, ValueT>::first; }
+  ValueT &getSecond() { return std::pair<KeyT, ValueT>::second; }
+  const ValueT &getSecond() const { return std::pair<KeyT, ValueT>::second; }
+};
+}
+
+template <
+    typename KeyT, typename ValueT, typename KeyInfoT = DenseMapInfo<KeyT>,
+    typename Bucket = detail::DenseMapPair<KeyT, ValueT>, bool IsConst = false>
 class DenseMapIterator;
 
-template<typename DerivedT,
-         typename KeyT, typename ValueT, typename KeyInfoT>
+template <typename DerivedT, typename KeyT, typename ValueT, typename KeyInfoT,
+          typename BucketT>
 class DenseMapBase {
-protected:
-  typedef std::pair<KeyT, ValueT> BucketT;
-
 public:
   typedef unsigned size_type;
   typedef KeyT key_type;
   typedef ValueT mapped_type;
   typedef BucketT value_type;
 
-  typedef DenseMapIterator<KeyT, ValueT, KeyInfoT> iterator;
-  typedef DenseMapIterator<KeyT, ValueT,
-                           KeyInfoT, true> const_iterator;
+  typedef DenseMapIterator<KeyT, ValueT, KeyInfoT, BucketT> iterator;
+  typedef DenseMapIterator<KeyT, ValueT, KeyInfoT, BucketT, true>
+      const_iterator;
   inline iterator begin() {
     // When the map is empty, avoid the overhead of AdvancePastEmptyBuckets().
     return empty() ? end() : iterator(getBuckets(), getBucketsEnd());
@@ -88,12 +97,12 @@ public:
 
     const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
     for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
-      if (!KeyInfoT::isEqual(P->first, EmptyKey)) {
-        if (!KeyInfoT::isEqual(P->first, TombstoneKey)) {
-          P->second.~ValueT();
+      if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey)) {
+        if (!KeyInfoT::isEqual(P->getFirst(), TombstoneKey)) {
+          P->getSecond().~ValueT();
           decrementNumEntries();
         }
-        P->first = EmptyKey;
+        P->getFirst() = EmptyKey;
       }
     }
     assert(getNumEntries() == 0 && "Node count imbalance!");
@@ -144,7 +153,7 @@ public:
   ValueT lookup(const KeyT &Val) const {
     const BucketT *TheBucket;
     if (LookupBucketFor(Val, TheBucket))
-      return TheBucket->second;
+      return TheBucket->getSecond();
     return ValueT();
   }
 
@@ -191,16 +200,16 @@ public:
     if (!LookupBucketFor(Val, TheBucket))
       return false; // not in map.
 
-    TheBucket->second.~ValueT();
-    TheBucket->first = getTombstoneKey();
+    TheBucket->getSecond().~ValueT();
+    TheBucket->getFirst() = getTombstoneKey();
     decrementNumEntries();
     incrementNumTombstones();
     return true;
   }
   void erase(iterator I) {
     BucketT *TheBucket = &*I;
-    TheBucket->second.~ValueT();
-    TheBucket->first = getTombstoneKey();
+    TheBucket->getSecond().~ValueT();
+    TheBucket->getFirst() = getTombstoneKey();
     decrementNumEntries();
     incrementNumTombstones();
   }
@@ -250,10 +259,10 @@ protected:
 
     const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
     for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
-      if (!KeyInfoT::isEqual(P->first, EmptyKey) &&
-          !KeyInfoT::isEqual(P->first, TombstoneKey))
-        P->second.~ValueT();
-      P->first.~KeyT();
+      if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey) &&
+          !KeyInfoT::isEqual(P->getFirst(), TombstoneKey))
+        P->getSecond().~ValueT();
+      P->getFirst().~KeyT();
     }
 
 #ifndef NDEBUG
@@ -269,7 +278,7 @@ protected:
            "# initial buckets must be a power of two!");
     const KeyT EmptyKey = getEmptyKey();
     for (BucketT *B = getBuckets(), *E = getBucketsEnd(); B != E; ++B)
-      new (&B->first) KeyT(EmptyKey);
+      new (&B->getFirst()) KeyT(EmptyKey);
   }
 
   void moveFromOldBuckets(BucketT *OldBucketsBegin, BucketT *OldBucketsEnd) {
@@ -279,21 +288,21 @@ protected:
     const KeyT EmptyKey = getEmptyKey();
     const KeyT TombstoneKey = getTombstoneKey();
     for (BucketT *B = OldBucketsBegin, *E = OldBucketsEnd; B != E; ++B) {
-      if (!KeyInfoT::isEqual(B->first, EmptyKey) &&
-          !KeyInfoT::isEqual(B->first, TombstoneKey)) {
+      if (!KeyInfoT::isEqual(B->getFirst(), EmptyKey) &&
+          !KeyInfoT::isEqual(B->getFirst(), TombstoneKey)) {
         // Insert the key/value into the new table.
         BucketT *DestBucket;
-        bool FoundVal = LookupBucketFor(B->first, DestBucket);
+        bool FoundVal = LookupBucketFor(B->getFirst(), DestBucket);
         (void)FoundVal; // silence warning.
         assert(!FoundVal && "Key already in new map?");
-        DestBucket->first = std::move(B->first);
-        new (&DestBucket->second) ValueT(std::move(B->second));
+        DestBucket->getFirst() = std::move(B->getFirst());
+        new (&DestBucket->getSecond()) ValueT(std::move(B->getSecond()));
         incrementNumEntries();
 
         // Free the value.
-        B->second.~ValueT();
+        B->getSecond().~ValueT();
       }
-      B->first.~KeyT();
+      B->getFirst().~KeyT();
     }
 
 #ifndef NDEBUG
@@ -304,7 +313,8 @@ protected:
   }
 
   template <typename OtherBaseT>
-  void copyFrom(const DenseMapBase<OtherBaseT, KeyT, ValueT, KeyInfoT>& other) {
+  void copyFrom(
+      const DenseMapBase<OtherBaseT, KeyT, ValueT, KeyInfoT, BucketT> &other) {
     assert(&other != this);
     assert(getNumBuckets() == other.getNumBuckets());
 
@@ -316,10 +326,12 @@ protected:
              getNumBuckets() * sizeof(BucketT));
     else
       for (size_t i = 0; i < getNumBuckets(); ++i) {
-        new (&getBuckets()[i].first) KeyT(other.getBuckets()[i].first);
-        if (!KeyInfoT::isEqual(getBuckets()[i].first, getEmptyKey()) &&
-            !KeyInfoT::isEqual(getBuckets()[i].first, getTombstoneKey()))
-          new (&getBuckets()[i].second) ValueT(other.getBuckets()[i].second);
+        new (&getBuckets()[i].getFirst())
+            KeyT(other.getBuckets()[i].getFirst());
+        if (!KeyInfoT::isEqual(getBuckets()[i].getFirst(), getEmptyKey()) &&
+            !KeyInfoT::isEqual(getBuckets()[i].getFirst(), getTombstoneKey()))
+          new (&getBuckets()[i].getSecond())
+              ValueT(other.getBuckets()[i].getSecond());
       }
   }
 
@@ -396,8 +408,8 @@ private:
                             BucketT *TheBucket) {
     TheBucket = InsertIntoBucketImpl(Key, TheBucket);
 
-    TheBucket->first = Key;
-    new (&TheBucket->second) ValueT(Value);
+    TheBucket->getFirst() = Key;
+    new (&TheBucket->getSecond()) ValueT(Value);
     return TheBucket;
   }
 
@@ -405,16 +417,16 @@ private:
                             BucketT *TheBucket) {
     TheBucket = InsertIntoBucketImpl(Key, TheBucket);
 
-    TheBucket->first = Key;
-    new (&TheBucket->second) ValueT(std::move(Value));
+    TheBucket->getFirst() = Key;
+    new (&TheBucket->getSecond()) ValueT(std::move(Value));
     return TheBucket;
   }
 
   BucketT *InsertIntoBucket(KeyT &&Key, ValueT &&Value, BucketT *TheBucket) {
     TheBucket = InsertIntoBucketImpl(Key, TheBucket);
 
-    TheBucket->first = std::move(Key);
-    new (&TheBucket->second) ValueT(std::move(Value));
+    TheBucket->getFirst() = std::move(Key);
+    new (&TheBucket->getSecond()) ValueT(std::move(Value));
     return TheBucket;
   }
 
@@ -430,11 +442,12 @@ private:
     // causing infinite loops in lookup.
     unsigned NewNumEntries = getNumEntries() + 1;
     unsigned NumBuckets = getNumBuckets();
-    if (NewNumEntries*4 >= NumBuckets*3) {
+    if (LLVM_UNLIKELY(NewNumEntries * 4 >= NumBuckets * 3)) {
       this->grow(NumBuckets * 2);
       LookupBucketFor(Key, TheBucket);
       NumBuckets = getNumBuckets();
-    } else if (NumBuckets-(NewNumEntries+getNumTombstones()) <= NumBuckets/8) {
+    } else if (LLVM_UNLIKELY(NumBuckets-(NewNumEntries+getNumTombstones()) <=
+                             NumBuckets/8)) {
       this->grow(NumBuckets);
       LookupBucketFor(Key, TheBucket);
     }
@@ -446,7 +459,7 @@ private:
 
     // If we are writing over a tombstone, remember this.
     const KeyT EmptyKey = getEmptyKey();
-    if (!KeyInfoT::isEqual(TheBucket->first, EmptyKey))
+    if (!KeyInfoT::isEqual(TheBucket->getFirst(), EmptyKey))
       decrementNumTombstones();
 
     return TheBucket;
@@ -480,14 +493,14 @@ private:
     while (1) {
       const BucketT *ThisBucket = BucketsPtr + BucketNo;
       // Found Val's bucket?  If so, return it.
-      if (KeyInfoT::isEqual(Val, ThisBucket->first)) {
+      if (LLVM_LIKELY(KeyInfoT::isEqual(Val, ThisBucket->getFirst()))) {
         FoundBucket = ThisBucket;
         return true;
       }
 
       // If we found an empty bucket, the key doesn't exist in the set.
       // Insert it and return the default value.
-      if (KeyInfoT::isEqual(ThisBucket->first, EmptyKey)) {
+      if (LLVM_LIKELY(KeyInfoT::isEqual(ThisBucket->getFirst(), EmptyKey))) {
         // If we've already seen a tombstone while probing, fill it in instead
         // of the empty bucket we eventually probed to.
         FoundBucket = FoundTombstone ? FoundTombstone : ThisBucket;
@@ -496,7 +509,8 @@ private:
 
       // If this is a tombstone, remember it.  If Val ends up not in the map, we
       // prefer to return it than something that would require more probing.
-      if (KeyInfoT::isEqual(ThisBucket->first, TombstoneKey) && !FoundTombstone)
+      if (KeyInfoT::isEqual(ThisBucket->getFirst(), TombstoneKey) &&
+          !FoundTombstone)
         FoundTombstone = ThisBucket;  // Remember the first tombstone found.
 
       // Otherwise, it's a hash collision or a tombstone, continue quadratic
@@ -525,16 +539,15 @@ public:
   }
 };
 
-template<typename KeyT, typename ValueT,
-         typename KeyInfoT = DenseMapInfo<KeyT> >
-class DenseMap
-    : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT>,
-                          KeyT, ValueT, KeyInfoT> {
+template <typename KeyT, typename ValueT,
+          typename KeyInfoT = DenseMapInfo<KeyT>,
+          typename BucketT = detail::DenseMapPair<KeyT, ValueT>>
+class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>,
+                                     KeyT, ValueT, KeyInfoT, BucketT> {
   // Lift some types from the dependent base class into this class for
   // simplicity of referring to them.
-  typedef DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT> BaseT;
-  typedef typename BaseT::BucketT BucketT;
-  friend class DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT>;
+  typedef DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT, BucketT> BaseT;
+  friend class DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT, BucketT>;
 
   BucketT *Buckets;
   unsigned NumEntries;
@@ -677,17 +690,17 @@ private:
   }
 };
 
-template<typename KeyT, typename ValueT,
-         unsigned InlineBuckets = 4,
-         typename KeyInfoT = DenseMapInfo<KeyT> >
+template <typename KeyT, typename ValueT, unsigned InlineBuckets = 4,
+          typename KeyInfoT = DenseMapInfo<KeyT>,
+          typename BucketT = detail::DenseMapPair<KeyT, ValueT>>
 class SmallDenseMap
-    : public DenseMapBase<SmallDenseMap<KeyT, ValueT, InlineBuckets, KeyInfoT>,
-                          KeyT, ValueT, KeyInfoT> {
+    : public DenseMapBase<
+          SmallDenseMap<KeyT, ValueT, InlineBuckets, KeyInfoT, BucketT>, KeyT,
+          ValueT, KeyInfoT, BucketT> {
   // Lift some types from the dependent base class into this class for
   // simplicity of referring to them.
-  typedef DenseMapBase<SmallDenseMap, KeyT, ValueT, KeyInfoT> BaseT;
-  typedef typename BaseT::BucketT BucketT;
-  friend class DenseMapBase<SmallDenseMap, KeyT, ValueT, KeyInfoT>;
+  typedef DenseMapBase<SmallDenseMap, KeyT, ValueT, KeyInfoT, BucketT> BaseT;
+  friend class DenseMapBase<SmallDenseMap, KeyT, ValueT, KeyInfoT, BucketT>;
 
   unsigned Small : 1;
   unsigned NumEntries : 31;
@@ -744,23 +757,23 @@ public:
       for (unsigned i = 0, e = InlineBuckets; i != e; ++i) {
         BucketT *LHSB = &getInlineBuckets()[i],
                 *RHSB = &RHS.getInlineBuckets()[i];
-        bool hasLHSValue = (!KeyInfoT::isEqual(LHSB->first, EmptyKey) &&
-                            !KeyInfoT::isEqual(LHSB->first, TombstoneKey));
-        bool hasRHSValue = (!KeyInfoT::isEqual(RHSB->first, EmptyKey) &&
-                            !KeyInfoT::isEqual(RHSB->first, TombstoneKey));
+        bool hasLHSValue = (!KeyInfoT::isEqual(LHSB->getFirst(), EmptyKey) &&
+                            !KeyInfoT::isEqual(LHSB->getFirst(), TombstoneKey));
+        bool hasRHSValue = (!KeyInfoT::isEqual(RHSB->getFirst(), EmptyKey) &&
+                            !KeyInfoT::isEqual(RHSB->getFirst(), TombstoneKey));
         if (hasLHSValue && hasRHSValue) {
           // Swap together if we can...
           std::swap(*LHSB, *RHSB);
           continue;
         }
         // Swap separately and handle any assymetry.
-        std::swap(LHSB->first, RHSB->first);
+        std::swap(LHSB->getFirst(), RHSB->getFirst());
         if (hasLHSValue) {
-          new (&RHSB->second) ValueT(std::move(LHSB->second));
-          LHSB->second.~ValueT();
+          new (&RHSB->getSecond()) ValueT(std::move(LHSB->getSecond()));
+          LHSB->getSecond().~ValueT();
         } else if (hasRHSValue) {
-          new (&LHSB->second) ValueT(std::move(RHSB->second));
-          RHSB->second.~ValueT();
+          new (&LHSB->getSecond()) ValueT(std::move(RHSB->getSecond()));
+          RHSB->getSecond().~ValueT();
         }
       }
       return;
@@ -785,12 +798,12 @@ public:
     for (unsigned i = 0, e = InlineBuckets; i != e; ++i) {
       BucketT *NewB = &LargeSide.getInlineBuckets()[i],
               *OldB = &SmallSide.getInlineBuckets()[i];
-      new (&NewB->first) KeyT(std::move(OldB->first));
-      OldB->first.~KeyT();
-      if (!KeyInfoT::isEqual(NewB->first, EmptyKey) &&
-          !KeyInfoT::isEqual(NewB->first, TombstoneKey)) {
-        new (&NewB->second) ValueT(std::move(OldB->second));
-        OldB->second.~ValueT();
+      new (&NewB->getFirst()) KeyT(std::move(OldB->getFirst()));
+      OldB->getFirst().~KeyT();
+      if (!KeyInfoT::isEqual(NewB->getFirst(), EmptyKey) &&
+          !KeyInfoT::isEqual(NewB->getFirst(), TombstoneKey)) {
+        new (&NewB->getSecond()) ValueT(std::move(OldB->getSecond()));
+        OldB->getSecond().~ValueT();
       }
     }
 
@@ -852,16 +865,16 @@ public:
       const KeyT EmptyKey = this->getEmptyKey();
       const KeyT TombstoneKey = this->getTombstoneKey();
       for (BucketT *P = getBuckets(), *E = P + InlineBuckets; P != E; ++P) {
-        if (!KeyInfoT::isEqual(P->first, EmptyKey) &&
-            !KeyInfoT::isEqual(P->first, TombstoneKey)) {
+        if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey) &&
+            !KeyInfoT::isEqual(P->getFirst(), TombstoneKey)) {
           assert(size_t(TmpEnd - TmpBegin) < InlineBuckets &&
                  "Too many inline buckets!");
-          new (&TmpEnd->first) KeyT(std::move(P->first));
-          new (&TmpEnd->second) ValueT(std::move(P->second));
+          new (&TmpEnd->getFirst()) KeyT(std::move(P->getFirst()));
+          new (&TmpEnd->getSecond()) ValueT(std::move(P->getSecond()));
           ++TmpEnd;
-          P->second.~ValueT();
+          P->getSecond().~ValueT();
         }
-        P->first.~KeyT();
+        P->getFirst().~KeyT();
       }
 
       // Now make this map use the large rep, and move all the entries back
@@ -972,13 +985,12 @@ private:
   }
 };
 
-template<typename KeyT, typename ValueT,
-         typename KeyInfoT, bool IsConst>
+template <typename KeyT, typename ValueT, typename KeyInfoT, typename Bucket,
+          bool IsConst>
 class DenseMapIterator {
-  typedef std::pair<KeyT, ValueT> Bucket;
-  typedef DenseMapIterator<KeyT, ValueT,
-                           KeyInfoT, true> ConstIterator;
-  friend class DenseMapIterator<KeyT, ValueT, KeyInfoT, true>;
+  typedef DenseMapIterator<KeyT, ValueT, KeyInfoT, Bucket, true> ConstIterator;
+  friend class DenseMapIterator<KeyT, ValueT, KeyInfoT, Bucket, true>;
+
 public:
   typedef ptrdiff_t difference_type;
   typedef typename std::conditional<IsConst, const Bucket, Bucket>::type
@@ -999,9 +1011,9 @@ public:
   // If IsConst is true this is a converting constructor from iterator to
   // const_iterator and the default copy constructor is used.
   // Otherwise this is a copy constructor for iterator.
-  DenseMapIterator(const DenseMapIterator<KeyT, ValueT,
-                                          KeyInfoT, false>& I)
-    : Ptr(I.Ptr), End(I.End) {}
+  DenseMapIterator(
+      const DenseMapIterator<KeyT, ValueT, KeyInfoT, Bucket, false> &I)
+      : Ptr(I.Ptr), End(I.End) {}
 
   reference operator*() const {
     return *Ptr;
@@ -1031,9 +1043,8 @@ private:
     const KeyT Empty = KeyInfoT::getEmptyKey();
     const KeyT Tombstone = KeyInfoT::getTombstoneKey();
 
-    while (Ptr != End &&
-           (KeyInfoT::isEqual(Ptr->first, Empty) ||
-            KeyInfoT::isEqual(Ptr->first, Tombstone)))
+    while (Ptr != End && (KeyInfoT::isEqual(Ptr->getFirst(), Empty) ||
+                          KeyInfoT::isEqual(Ptr->getFirst(), Tombstone)))
       ++Ptr;
   }
 };
diff --git a/include/llvm/ADT/DenseSet.h b/include/llvm/ADT/DenseSet.h
index 9ab1be2..d340240 100644
--- a/include/llvm/ADT/DenseSet.h
+++ b/include/llvm/ADT/DenseSet.h
@@ -18,13 +18,29 @@
 
 namespace llvm {
 
+namespace detail {
+struct DenseSetEmpty {};
+
+// Use the empty base class trick so we can create a DenseMap where the buckets
+// contain only a single item.
+template <typename KeyT> class DenseSetPair : public DenseSetEmpty {
+  KeyT key;
+
+public:
+  KeyT &getFirst() { return key; }
+  const KeyT &getFirst() const { return key; }
+  DenseSetEmpty &getSecond() { return *this; }
+  const DenseSetEmpty &getSecond() const { return *this; }
+};
+}
+
 /// DenseSet - This implements a dense probed hash-table based set.
-///
-/// FIXME: This is currently implemented directly in terms of DenseMap, this
-/// should be optimized later if there is a need.
 template<typename ValueT, typename ValueInfoT = DenseMapInfo<ValueT> >
 class DenseSet {
-  typedef DenseMap<ValueT, char, ValueInfoT> MapTy;
+  typedef DenseMap<ValueT, detail::DenseSetEmpty, ValueInfoT,
+                   detail::DenseSetPair<ValueT>> MapTy;
+  static_assert(sizeof(typename MapTy::value_type) == sizeof(ValueT),
+                "DenseMap buckets unexpectedly large!");
   MapTy TheMap;
 public:
   typedef ValueT key_type;
@@ -72,8 +88,8 @@ public:
 
     Iterator(const typename MapTy::iterator &i) : I(i) {}
 
-    ValueT& operator*() { return I->first; }
-    ValueT* operator->() { return &I->first; }
+    ValueT &operator*() { return I->getFirst(); }
+    ValueT *operator->() { return &I->getFirst(); }
 
     Iterator& operator++() { ++I; return *this; }
     bool operator==(const Iterator& X) const { return I == X.I; }
@@ -92,8 +108,8 @@ public:
 
     ConstIterator(const typename MapTy::const_iterator &i) : I(i) {}
 
-    const ValueT& operator*() { return I->first; }
-    const ValueT* operator->() { return &I->first; }
+    const ValueT &operator*() { return I->getFirst(); }
+    const ValueT *operator->() { return &I->getFirst(); }
 
     ConstIterator& operator++() { ++I; return *this; }
     bool operator==(const ConstIterator& X) const { return I == X.I; }
@@ -129,7 +145,8 @@ public:
   void erase(ConstIterator CI) { return TheMap.erase(CI.I); }
 
   std::pair<iterator, bool> insert(const ValueT &V) {
-    return TheMap.insert(std::make_pair(V, 0));
+    detail::DenseSetEmpty Empty;
+    return TheMap.insert(std::make_pair(V, Empty));
   }
   
   // Range insertion of values.
diff --git a/include/llvm/ADT/DepthFirstIterator.h b/include/llvm/ADT/DepthFirstIterator.h
index 0f69146..6cd9e68 100644
--- a/include/llvm/ADT/DepthFirstIterator.h
+++ b/include/llvm/ADT/DepthFirstIterator.h
@@ -33,10 +33,10 @@
 #ifndef LLVM_ADT_DEPTHFIRSTITERATOR_H
 #define LLVM_ADT_DEPTHFIRSTITERATOR_H
 
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/iterator_range.h"
 #include <set>
 #include <vector>
 
diff --git a/include/llvm/ADT/EquivalenceClasses.h b/include/llvm/ADT/EquivalenceClasses.h
index e0396c7..d6a26f8 100644
--- a/include/llvm/ADT/EquivalenceClasses.h
+++ b/include/llvm/ADT/EquivalenceClasses.h
@@ -17,6 +17,7 @@
 
 #include "llvm/Support/DataTypes.h"
 #include <cassert>
+#include <cstddef>
 #include <set>
 
 namespace llvm {
@@ -254,7 +255,7 @@ public:
       assert(Node != nullptr && "Dereferencing end()!");
       return Node->getData();
     }
-    reference operator->() const { return operator*(); }
+    pointer operator->() const { return &operator*(); }
 
     member_iterator &operator++() {
       assert(Node != nullptr && "++'d off the end of the list!");
diff --git a/include/llvm/ADT/FoldingSet.h b/include/llvm/ADT/FoldingSet.h
index 14c5933..7ade167 100644
--- a/include/llvm/ADT/FoldingSet.h
+++ b/include/llvm/ADT/FoldingSet.h
@@ -18,6 +18,7 @@
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/DataTypes.h"
 
@@ -532,46 +533,6 @@ public:
 };
 
 //===----------------------------------------------------------------------===//
-/// FoldingSetVectorIterator - This implements an iterator for
-/// FoldingSetVector. It is only necessary because FoldingSetIterator provides
-/// a value_type of T, while the vector in FoldingSetVector exposes
-/// a value_type of T*. Fortunately, FoldingSetIterator doesn't expose very
-/// much besides operator* and operator->, so we just wrap the inner vector
-/// iterator and perform the extra dereference.
-template <class T, class VectorIteratorT>
-class FoldingSetVectorIterator {
-  // Provide a typedef to workaround the lack of correct injected class name
-  // support in older GCCs.
-  typedef FoldingSetVectorIterator<T, VectorIteratorT> SelfT;
-
-  VectorIteratorT Iterator;
-
-public:
-  FoldingSetVectorIterator(VectorIteratorT I) : Iterator(I) {}
-
-  bool operator==(const SelfT &RHS) const {
-    return Iterator == RHS.Iterator;
-  }
-  bool operator!=(const SelfT &RHS) const {
-    return Iterator != RHS.Iterator;
-  }
-
-  T &operator*() const { return **Iterator; }
-
-  T *operator->() const { return *Iterator; }
-
-  inline SelfT &operator++() {
-    ++Iterator;
-    return *this;
-  }
-  SelfT operator++(int) {
-    SelfT tmp = *this;
-    ++*this;
-    return tmp;
-  }
-};
-
-//===----------------------------------------------------------------------===//
 /// FoldingSetVector - This template class combines a FoldingSet and a vector
 /// to provide the interface of FoldingSet but with deterministic iteration
 /// order based on the insertion order. T must be a subclass of FoldingSetNode
@@ -586,12 +547,11 @@ public:
       : Set(Log2InitSize) {
   }
 
-  typedef FoldingSetVectorIterator<T, typename VectorT::iterator> iterator;
+  typedef pointee_iterator<typename VectorT::iterator> iterator;
   iterator begin() { return Vector.begin(); }
   iterator end()   { return Vector.end(); }
 
-  typedef FoldingSetVectorIterator<const T, typename VectorT::const_iterator>
-    const_iterator;
+  typedef pointee_iterator<typename VectorT::const_iterator> const_iterator;
   const_iterator begin() const { return Vector.begin(); }
   const_iterator end()   const { return Vector.end(); }
 
@@ -735,31 +695,9 @@ template <typename T>
 class FoldingSetNodeWrapper : public FoldingSetNode {
   T data;
 public:
-  explicit FoldingSetNodeWrapper(const T &x) : data(x) {}
-  virtual ~FoldingSetNodeWrapper() {}
-
-  template<typename A1>
-  explicit FoldingSetNodeWrapper(const A1 &a1)
-    : data(a1) {}
-
-  template <typename A1, typename A2>
-  explicit FoldingSetNodeWrapper(const A1 &a1, const A2 &a2)
-    : data(a1,a2) {}
-
-  template <typename A1, typename A2, typename A3>
-  explicit FoldingSetNodeWrapper(const A1 &a1, const A2 &a2, const A3 &a3)
-    : data(a1,a2,a3) {}
-
-  template <typename A1, typename A2, typename A3, typename A4>
-  explicit FoldingSetNodeWrapper(const A1 &a1, const A2 &a2, const A3 &a3,
-                                 const A4 &a4)
-    : data(a1,a2,a3,a4) {}
-
-  template <typename A1, typename A2, typename A3, typename A4, typename A5>
-  explicit FoldingSetNodeWrapper(const A1 &a1, const A2 &a2, const A3 &a3,
-                                 const A4 &a4, const A5 &a5)
-  : data(a1,a2,a3,a4,a5) {}
-
+  template <typename... Ts>
+  explicit FoldingSetNodeWrapper(Ts &&... Args)
+      : data(std::forward<Ts>(Args)...) {}
 
   void Profile(FoldingSetNodeID &ID) { FoldingSetTrait<T>::Profile(data, ID); }
 
diff --git a/include/llvm/ADT/Hashing.h b/include/llvm/ADT/Hashing.h
index abf02b8..a1e7864 100644
--- a/include/llvm/ADT/Hashing.h
+++ b/include/llvm/ADT/Hashing.h
@@ -55,12 +55,6 @@
 #include <iterator>
 #include <utility>
 
-// Allow detecting C++11 feature availability when building with Clang without
-// breaking other compilers.
-#ifndef __has_feature
-# define __has_feature(x) 0
-#endif
-
 namespace llvm {
 
 /// \brief An opaque object representing a hash code.
@@ -553,8 +547,6 @@ public:
     return buffer_ptr;
   }
 
-#if defined(__has_feature) && __has_feature(__cxx_variadic_templates__)
-
   /// \brief Recursive, variadic combining method.
   ///
   /// This function recurses through each argument, combining that argument
@@ -568,53 +560,6 @@ public:
     return combine(length, buffer_ptr, buffer_end, args...);
   }
 
-#else
-  // Manually expanded recursive combining methods. See variadic above for
-  // documentation.
-
-  template <typename T1, typename T2, typename T3, typename T4, typename T5,
-            typename T6>
-  hash_code combine(size_t length, char *buffer_ptr, char *buffer_end,
-                    const T1 &arg1, const T2 &arg2, const T3 &arg3,
-                    const T4 &arg4, const T5 &arg5, const T6 &arg6) {
-    buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1));
-    return combine(length, buffer_ptr, buffer_end, arg2, arg3, arg4, arg5, arg6);
-  }
-  template <typename T1, typename T2, typename T3, typename T4, typename T5>
-  hash_code combine(size_t length, char *buffer_ptr, char *buffer_end,
-                    const T1 &arg1, const T2 &arg2, const T3 &arg3,
-                    const T4 &arg4, const T5 &arg5) {
-    buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1));
-    return combine(length, buffer_ptr, buffer_end, arg2, arg3, arg4, arg5);
-  }
-  template <typename T1, typename T2, typename T3, typename T4>
-  hash_code combine(size_t length, char *buffer_ptr, char *buffer_end,
-                    const T1 &arg1, const T2 &arg2, const T3 &arg3,
-                    const T4 &arg4) {
-    buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1));
-    return combine(length, buffer_ptr, buffer_end, arg2, arg3, arg4);
-  }
-  template <typename T1, typename T2, typename T3>
-  hash_code combine(size_t length, char *buffer_ptr, char *buffer_end,
-                    const T1 &arg1, const T2 &arg2, const T3 &arg3) {
-    buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1));
-    return combine(length, buffer_ptr, buffer_end, arg2, arg3);
-  }
-  template <typename T1, typename T2>
-  hash_code combine(size_t length, char *buffer_ptr, char *buffer_end,
-                    const T1 &arg1, const T2 &arg2) {
-    buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1));
-    return combine(length, buffer_ptr, buffer_end, arg2);
-  }
-  template <typename T1>
-  hash_code combine(size_t length, char *buffer_ptr, char *buffer_end,
-                    const T1 &arg1) {
-    buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1));
-    return combine(length, buffer_ptr, buffer_end);
-  }
-
-#endif
-
   /// \brief Base case for recursive, variadic combining.
   ///
   /// The base case when combining arguments recursively is reached when all
@@ -643,9 +588,6 @@ public:
 } // namespace detail
 } // namespace hashing
 
-
-#if __has_feature(__cxx_variadic_templates__)
-
 /// \brief Combine values into a single hash_code.
 ///
 /// This routine accepts a varying number of arguments of any type. It will
@@ -663,52 +605,6 @@ template <typename ...Ts> hash_code hash_combine(const Ts &...args) {
   return helper.combine(0, helper.buffer, helper.buffer + 64, args...);
 }
 
-#else
-
-// What follows are manually exploded overloads for each argument width. See
-// the above variadic definition for documentation and specification.
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6>
-hash_code hash_combine(const T1 &arg1, const T2 &arg2, const T3 &arg3,
-                       const T4 &arg4, const T5 &arg5, const T6 &arg6) {
-  ::llvm::hashing::detail::hash_combine_recursive_helper helper;
-  return helper.combine(0, helper.buffer, helper.buffer + 64,
-                        arg1, arg2, arg3, arg4, arg5, arg6);
-}
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-hash_code hash_combine(const T1 &arg1, const T2 &arg2, const T3 &arg3,
-                       const T4 &arg4, const T5 &arg5) {
-  ::llvm::hashing::detail::hash_combine_recursive_helper helper;
-  return helper.combine(0, helper.buffer, helper.buffer + 64,
-                        arg1, arg2, arg3, arg4, arg5);
-}
-template <typename T1, typename T2, typename T3, typename T4>
-hash_code hash_combine(const T1 &arg1, const T2 &arg2, const T3 &arg3,
-                       const T4 &arg4) {
-  ::llvm::hashing::detail::hash_combine_recursive_helper helper;
-  return helper.combine(0, helper.buffer, helper.buffer + 64,
-                        arg1, arg2, arg3, arg4);
-}
-template <typename T1, typename T2, typename T3>
-hash_code hash_combine(const T1 &arg1, const T2 &arg2, const T3 &arg3) {
-  ::llvm::hashing::detail::hash_combine_recursive_helper helper;
-  return helper.combine(0, helper.buffer, helper.buffer + 64, arg1, arg2, arg3);
-}
-template <typename T1, typename T2>
-hash_code hash_combine(const T1 &arg1, const T2 &arg2) {
-  ::llvm::hashing::detail::hash_combine_recursive_helper helper;
-  return helper.combine(0, helper.buffer, helper.buffer + 64, arg1, arg2);
-}
-template <typename T1>
-hash_code hash_combine(const T1 &arg1) {
-  ::llvm::hashing::detail::hash_combine_recursive_helper helper;
-  return helper.combine(0, helper.buffer, helper.buffer + 64, arg1);
-}
-
-#endif
-
-
 // Implementation details for implementations of hash_value overloads provided
 // here.
 namespace hashing {
diff --git a/include/llvm/ADT/ImmutableList.h b/include/llvm/ADT/ImmutableList.h
index 7f0c239..748d3e4 100644
--- a/include/llvm/ADT/ImmutableList.h
+++ b/include/llvm/ADT/ImmutableList.h
@@ -33,8 +33,8 @@ class ImmutableListImpl : public FoldingSetNode {
 
   friend class ImmutableListFactory<T>;
 
-  void operator=(const ImmutableListImpl&) LLVM_DELETED_FUNCTION;
-  ImmutableListImpl(const ImmutableListImpl&) LLVM_DELETED_FUNCTION;
+  void operator=(const ImmutableListImpl&) = delete;
+  ImmutableListImpl(const ImmutableListImpl&) = delete;
 
 public:
   const T& getHead() const { return Head; }
diff --git a/include/llvm/ADT/ImmutableMap.h b/include/llvm/ADT/ImmutableMap.h
index 11f281b..75fee90 100644
--- a/include/llvm/ADT/ImmutableMap.h
+++ b/include/llvm/ADT/ImmutableMap.h
@@ -122,8 +122,8 @@ public:
     }
 
   private:
-    Factory(const Factory& RHS) LLVM_DELETED_FUNCTION;
-    void operator=(const Factory& RHS) LLVM_DELETED_FUNCTION;
+    Factory(const Factory& RHS) = delete;
+    void operator=(const Factory& RHS) = delete;
   };
 
   bool contains(key_type_ref K) const {
diff --git a/include/llvm/ADT/ImmutableSet.h b/include/llvm/ADT/ImmutableSet.h
index 5a3d8ad..3c6f58c 100644
--- a/include/llvm/ADT/ImmutableSet.h
+++ b/include/llvm/ADT/ImmutableSet.h
@@ -1014,8 +1014,8 @@ public:
     }
 
   private:
-    Factory(const Factory& RHS) LLVM_DELETED_FUNCTION;
-    void operator=(const Factory& RHS) LLVM_DELETED_FUNCTION;
+    Factory(const Factory& RHS) = delete;
+    void operator=(const Factory& RHS) = delete;
   };
 
   friend class Factory;
diff --git a/include/llvm/ADT/IntervalMap.h b/include/llvm/ADT/IntervalMap.h
index 46549ee..99be38f 100644
--- a/include/llvm/ADT/IntervalMap.h
+++ b/include/llvm/ADT/IntervalMap.h
@@ -496,7 +496,7 @@ public:
   NodeRef() {}
 
   /// operator bool - Detect a null ref.
-  LLVM_EXPLICIT operator bool() const { return pip.getOpaqueValue(); }
+  explicit operator bool() const { return pip.getOpaqueValue(); }
 
   /// NodeRef - Create a reference to the node p with n elements.
   template <typename NodeT>
diff --git a/include/llvm/ADT/IntrusiveRefCntPtr.h b/include/llvm/ADT/IntrusiveRefCntPtr.h
index c859c98..9b12d04 100644
--- a/include/llvm/ADT/IntrusiveRefCntPtr.h
+++ b/include/llvm/ADT/IntrusiveRefCntPtr.h
@@ -84,7 +84,7 @@ namespace llvm {
     friend struct IntrusiveRefCntPtrInfo;
   };
 
-  
+
   template <typename T> struct IntrusiveRefCntPtrInfo {
     static void retain(T *obj) { obj->Retain(); }
     static void release(T *obj) { obj->Release(); }
@@ -114,7 +114,7 @@ public:
       delete static_cast<const Derived*>(this);
   }
 };
-  
+
 //===----------------------------------------------------------------------===//
 /// IntrusiveRefCntPtr - A template class that implements a "smart pointer"
 ///  that assumes the wrapped object has a reference count associated
@@ -177,7 +177,7 @@ public:
 
     T* get() const { return Obj; }
 
-    LLVM_EXPLICIT operator bool() const { return Obj; }
+    explicit operator bool() const { return Obj; }
 
     void swap(IntrusiveRefCntPtr& other) {
       T* tmp = other.Obj;
diff --git a/include/llvm/ADT/MapVector.h b/include/llvm/ADT/MapVector.h
index 14c49c5..1331b15 100644
--- a/include/llvm/ADT/MapVector.h
+++ b/include/llvm/ADT/MapVector.h
@@ -18,6 +18,7 @@
 #define LLVM_ADT_MAPVECTOR_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include <vector>
 
 namespace llvm {
@@ -181,6 +182,14 @@ void MapVector<KeyT, ValueT, MapType, VectorType>::remove_if(Function Pred) {
   Vector.erase(O, Vector.end());
 }
 
+/// \brief A MapVector that performs no allocations if smaller than a certain
+/// size.
+template <typename KeyT, typename ValueT, unsigned N>
+struct SmallMapVector
+    : MapVector<KeyT, ValueT, SmallDenseMap<KeyT, unsigned, N>,
+                SmallVector<std::pair<KeyT, ValueT>, N>> {
+};
+
 } // end namespace llvm
 
 #endif
diff --git a/include/llvm/ADT/None.h b/include/llvm/ADT/None.h
index 5793bd2..d69ec17 100644
--- a/include/llvm/ADT/None.h
+++ b/include/llvm/ADT/None.h
@@ -19,9 +19,8 @@
 namespace llvm {
 /// \brief A simple null object to allow implicit construction of Optional<T>
 /// and similar types without having to spell out the specialization's name.
-enum NoneType {
-  None
-};
+enum class NoneType { None };
+const NoneType None = None;
 }
 
 #endif
diff --git a/include/llvm/ADT/Optional.h b/include/llvm/ADT/Optional.h
index 591872e..855ab89 100644
--- a/include/llvm/ADT/Optional.h
+++ b/include/llvm/ADT/Optional.h
@@ -70,8 +70,6 @@ public:
     return *this;
   }
 
-#if LLVM_HAS_VARIADIC_TEMPLATES
-
   /// Create a new object by constructing it in place with the given arguments.
   template<typename ...ArgTypes>
   void emplace(ArgTypes &&...Args) {
@@ -80,51 +78,6 @@ public:
     new (storage.buffer) T(std::forward<ArgTypes>(Args)...);
   }
 
-#else
-  
-  /// Create a new object by default-constructing it in place.
-  void emplace() {
-    reset();
-    hasVal = true;
-    new (storage.buffer) T();
-  }
-  
-  /// Create a new object by constructing it in place with the given arguments.
-  template<typename T1>
-  void emplace(T1 &&A1) {
-    reset();
-    hasVal = true;
-    new (storage.buffer) T(std::forward<T1>(A1));
-  }
-  
-  /// Create a new object by constructing it in place with the given arguments.
-  template<typename T1, typename T2>
-  void emplace(T1 &&A1, T2 &&A2) {
-    reset();
-    hasVal = true;
-    new (storage.buffer) T(std::forward<T1>(A1), std::forward<T2>(A2));
-  }
-  
-  /// Create a new object by constructing it in place with the given arguments.
-  template<typename T1, typename T2, typename T3>
-  void emplace(T1 &&A1, T2 &&A2, T3 &&A3) {
-    reset();
-    hasVal = true;
-    new (storage.buffer) T(std::forward<T1>(A1), std::forward<T2>(A2),
-        std::forward<T3>(A3));
-  }
-  
-  /// Create a new object by constructing it in place with the given arguments.
-  template<typename T1, typename T2, typename T3, typename T4>
-  void emplace(T1 &&A1, T2 &&A2, T3 &&A3, T4 &&A4) {
-    reset();
-    hasVal = true;
-    new (storage.buffer) T(std::forward<T1>(A1), std::forward<T2>(A2),
-        std::forward<T3>(A3), std::forward<T4>(A4));
-  }
-
-#endif // LLVM_HAS_VARIADIC_TEMPLATES
-
   static inline Optional create(const T* y) {
     return y ? Optional(*y) : Optional();
   }
@@ -168,7 +121,7 @@ public:
   const T& getValue() const LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); }
   T& getValue() LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); }
 
-  LLVM_EXPLICIT operator bool() const { return hasVal; }
+  explicit operator bool() const { return hasVal; }
   bool hasValue() const { return hasVal; }
   const T* operator->() const { return getPointer(); }
   T* operator->() { return getPointer(); }
diff --git a/include/llvm/ADT/PointerUnion.h b/include/llvm/ADT/PointerUnion.h
index a6dddd2..f27b811 100644
--- a/include/llvm/ADT/PointerUnion.h
+++ b/include/llvm/ADT/PointerUnion.h
@@ -61,7 +61,7 @@ namespace llvm {
       NumLowBitsAvailable = PT1BitsAv < PT2BitsAv ? PT1BitsAv : PT2BitsAv
     };
   };
-  
+
   /// PointerUnion - This implements a discriminated union of two pointer types,
   /// and keeps the discriminator bit-mangled into the low bits of the pointer.
   /// This allows the implementation to be extremely efficient in space, but
@@ -80,7 +80,7 @@ namespace llvm {
   template <typename PT1, typename PT2>
   class PointerUnion {
   public:
-    typedef PointerIntPair<void*, 1, bool, 
+    typedef PointerIntPair<void*, 1, bool,
                            PointerUnionUIntTraits<PT1,PT2> > ValTy;
   private:
     ValTy Val;
@@ -96,14 +96,14 @@ namespace llvm {
 
   public:
     PointerUnion() {}
-    
+
     PointerUnion(PT1 V) : Val(
       const_cast<void *>(PointerLikeTypeTraits<PT1>::getAsVoidPointer(V))) {
     }
     PointerUnion(PT2 V) : Val(
       const_cast<void *>(PointerLikeTypeTraits<PT2>::getAsVoidPointer(V)), 1) {
     }
-    
+
     /// isNull - Return true if the pointer held in the union is null,
     /// regardless of which type it is.
     bool isNull() const {
@@ -111,7 +111,7 @@ namespace llvm {
       // we recursively strip off low bits if we have a nested PointerUnion.
       return !PointerLikeTypeTraits<PT1>::getFromVoidPointer(Val.getPointer());
     }
-    LLVM_EXPLICIT operator bool() const { return !isNull(); }
+    explicit operator bool() const { return !isNull(); }
 
     /// is<T>() return true if the Union currently holds the type matching T.
     template<typename T>
@@ -123,7 +123,7 @@ namespace llvm {
       int TyNo = Ty::Num;
       return static_cast<int>(Val.getInt()) == TyNo;
     }
-    
+
     /// get<T>() - Return the value of the specified pointer type. If the
     /// specified pointer type is incorrect, assert.
     template<typename T>
@@ -131,7 +131,7 @@ namespace llvm {
       assert(is<T>() && "Invalid accessor called");
       return PointerLikeTypeTraits<T>::getFromVoidPointer(Val.getPointer());
     }
-    
+
     /// dyn_cast<T>() - If the current value is of the specified pointer type,
     /// return it, otherwise return null.
     template<typename T>
@@ -160,7 +160,7 @@ namespace llvm {
       Val.initWithPointer(nullptr);
       return *this;
     }
-    
+
     /// Assignment operators - Allow assigning into this union from either
     /// pointer type, setting the discriminator to remember what it came from.
     const PointerUnion &operator=(const PT1 &RHS) {
@@ -174,7 +174,7 @@ namespace llvm {
         1);
       return *this;
     }
-    
+
     void *getOpaqueValue() const { return Val.getOpaqueValue(); }
     static inline PointerUnion getFromOpaqueValue(void *VP) {
       PointerUnion V;
@@ -195,6 +195,12 @@ namespace llvm {
     return lhs.getOpaqueValue() != rhs.getOpaqueValue();
   }
 
+  template<typename PT1, typename PT2>
+  static bool operator<(PointerUnion<PT1, PT2> lhs,
+                        PointerUnion<PT1, PT2> rhs) {
+    return lhs.getOpaqueValue() < rhs.getOpaqueValue();
+  }
+
   // Teach SmallPtrSet that PointerUnion is "basically a pointer", that has
   // # low bits available = min(PT1bits,PT2bits)-1.
   template<typename PT1, typename PT2>
@@ -208,16 +214,16 @@ namespace llvm {
     getFromVoidPointer(void *P) {
       return PointerUnion<PT1, PT2>::getFromOpaqueValue(P);
     }
-    
+
     // The number of bits available are the min of the two pointer types.
     enum {
-      NumLowBitsAvailable = 
+      NumLowBitsAvailable =
         PointerLikeTypeTraits<typename PointerUnion<PT1,PT2>::ValTy>
           ::NumLowBitsAvailable
     };
   };
-  
-  
+
+
   /// PointerUnion3 - This is a pointer union of three pointer types.  See
   /// documentation for PointerUnion for usage.
   template <typename PT1, typename PT2, typename PT3>
@@ -233,7 +239,7 @@ namespace llvm {
       IsInnerUnion(ValTy val) : Val(val) { }
       template<typename T>
       int is() const {
-        return Val.template is<InnerUnion>() && 
+        return Val.template is<InnerUnion>() &&
                Val.template get<InnerUnion>().template is<T>();
       }
       template<typename T>
@@ -257,7 +263,7 @@ namespace llvm {
 
   public:
     PointerUnion3() {}
-    
+
     PointerUnion3(PT1 V) {
       Val = InnerUnion(V);
     }
@@ -267,12 +273,12 @@ namespace llvm {
     PointerUnion3(PT3 V) {
       Val = V;
     }
-    
+
     /// isNull - Return true if the pointer held in the union is null,
     /// regardless of which type it is.
     bool isNull() const { return Val.isNull(); }
-    LLVM_EXPLICIT operator bool() const { return !isNull(); }
-    
+    explicit operator bool() const { return !isNull(); }
+
     /// is<T>() return true if the Union currently holds the type matching T.
     template<typename T>
     int is() const {
@@ -283,7 +289,7 @@ namespace llvm {
                                                                    >::Return Ty;
       return Ty(Val).template is<T>();
     }
-    
+
     /// get<T>() - Return the value of the specified pointer type. If the
     /// specified pointer type is incorrect, assert.
     template<typename T>
@@ -296,7 +302,7 @@ namespace llvm {
                                                                    >::Return Ty;
       return Ty(Val).template get<T>();
     }
-    
+
     /// dyn_cast<T>() - If the current value is of the specified pointer type,
     /// return it, otherwise return null.
     template<typename T>
@@ -310,7 +316,7 @@ namespace llvm {
       Val = nullptr;
       return *this;
     }
-    
+
     /// Assignment operators - Allow assigning into this union from either
     /// pointer type, setting the discriminator to remember what it came from.
     const PointerUnion3 &operator=(const PT1 &RHS) {
@@ -325,7 +331,7 @@ namespace llvm {
       Val = RHS;
       return *this;
     }
-    
+
     void *getOpaqueValue() const { return Val.getOpaqueValue(); }
     static inline PointerUnion3 getFromOpaqueValue(void *VP) {
       PointerUnion3 V;
@@ -333,7 +339,7 @@ namespace llvm {
       return V;
     }
   };
- 
+
   // Teach SmallPtrSet that PointerUnion3 is "basically a pointer", that has
   // # low bits available = min(PT1bits,PT2bits,PT2bits)-2.
   template<typename PT1, typename PT2, typename PT3>
@@ -347,10 +353,10 @@ namespace llvm {
     getFromVoidPointer(void *P) {
       return PointerUnion3<PT1, PT2, PT3>::getFromOpaqueValue(P);
     }
-    
+
     // The number of bits available are the min of the two pointer types.
     enum {
-      NumLowBitsAvailable = 
+      NumLowBitsAvailable =
         PointerLikeTypeTraits<typename PointerUnion3<PT1, PT2, PT3>::ValTy>
           ::NumLowBitsAvailable
     };
@@ -368,7 +374,7 @@ namespace llvm {
     ValTy Val;
   public:
     PointerUnion4() {}
-    
+
     PointerUnion4(PT1 V) {
       Val = InnerUnion1(V);
     }
@@ -381,12 +387,12 @@ namespace llvm {
     PointerUnion4(PT4 V) {
       Val = InnerUnion2(V);
     }
-    
+
     /// isNull - Return true if the pointer held in the union is null,
     /// regardless of which type it is.
     bool isNull() const { return Val.isNull(); }
-    LLVM_EXPLICIT operator bool() const { return !isNull(); }
-    
+    explicit operator bool() const { return !isNull(); }
+
     /// is<T>() return true if the Union currently holds the type matching T.
     template<typename T>
     int is() const {
@@ -395,10 +401,10 @@ namespace llvm {
         ::llvm::PointerUnionTypeSelector<PT1, T, InnerUnion1,
           ::llvm::PointerUnionTypeSelector<PT2, T, InnerUnion1, InnerUnion2 >
                                                                    >::Return Ty;
-      return Val.template is<Ty>() && 
+      return Val.template is<Ty>() &&
              Val.template get<Ty>().template is<T>();
     }
-    
+
     /// get<T>() - Return the value of the specified pointer type. If the
     /// specified pointer type is incorrect, assert.
     template<typename T>
@@ -411,7 +417,7 @@ namespace llvm {
                                                                    >::Return Ty;
       return Val.template get<Ty>().template get<T>();
     }
-    
+
     /// dyn_cast<T>() - If the current value is of the specified pointer type,
     /// return it, otherwise return null.
     template<typename T>
@@ -425,7 +431,7 @@ namespace llvm {
       Val = nullptr;
       return *this;
     }
-    
+
     /// Assignment operators - Allow assigning into this union from either
     /// pointer type, setting the discriminator to remember what it came from.
     const PointerUnion4 &operator=(const PT1 &RHS) {
@@ -444,7 +450,7 @@ namespace llvm {
       Val = InnerUnion2(RHS);
       return *this;
     }
-    
+
     void *getOpaqueValue() const { return Val.getOpaqueValue(); }
     static inline PointerUnion4 getFromOpaqueValue(void *VP) {
       PointerUnion4 V;
@@ -452,7 +458,7 @@ namespace llvm {
       return V;
     }
   };
-  
+
   // Teach SmallPtrSet that PointerUnion4 is "basically a pointer", that has
   // # low bits available = min(PT1bits,PT2bits,PT2bits)-2.
   template<typename PT1, typename PT2, typename PT3, typename PT4>
@@ -466,10 +472,10 @@ namespace llvm {
     getFromVoidPointer(void *P) {
       return PointerUnion4<PT1, PT2, PT3, PT4>::getFromOpaqueValue(P);
     }
-    
+
     // The number of bits available are the min of the two pointer types.
     enum {
-      NumLowBitsAvailable = 
+      NumLowBitsAvailable =
         PointerLikeTypeTraits<typename PointerUnion4<PT1, PT2, PT3, PT4>::ValTy>
           ::NumLowBitsAvailable
     };
diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 4e56e4d..57af18e 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -18,6 +18,7 @@
 #define LLVM_ADT_STLEXTRAS_H
 
 #include "llvm/Support/Compiler.h"
+#include <cassert>
 #include <cstddef> // for std::size_t
 #include <cstdlib> // for qsort
 #include <functional>
@@ -63,8 +64,6 @@ struct greater_ptr : public std::binary_function<Ty, Ty, bool> {
 /// a function_ref.
 template<typename Fn> class function_ref;
 
-#if LLVM_HAS_VARIADIC_TEMPLATES
-
 template<typename Ret, typename ...Params>
 class function_ref<Ret(Params...)> {
   Ret (*callback)(intptr_t callable, Params ...params);
@@ -89,112 +88,6 @@ public:
   }
 };
 
-#else
-
-template<typename Ret>
-class function_ref<Ret()> {
-  Ret (*callback)(intptr_t callable);
-  intptr_t callable;
-
-  template<typename Callable>
-  static Ret callback_fn(intptr_t callable) {
-    return (*reinterpret_cast<Callable*>(callable))();
-  }
-
-public:
-  template<typename Callable>
-  function_ref(Callable &&callable,
-               typename std::enable_if<
-                   !std::is_same<typename std::remove_reference<Callable>::type,
-                                 function_ref>::value>::type * = nullptr)
-      : callback(callback_fn<typename std::remove_reference<Callable>::type>),
-        callable(reinterpret_cast<intptr_t>(&callable)) {}
-  Ret operator()() const { return callback(callable); }
-};
-
-template<typename Ret, typename Param1>
-class function_ref<Ret(Param1)> {
-  Ret (*callback)(intptr_t callable, Param1 param1);
-  intptr_t callable;
-
-  template<typename Callable>
-  static Ret callback_fn(intptr_t callable, Param1 param1) {
-    return (*reinterpret_cast<Callable*>(callable))(
-        std::forward<Param1>(param1));
-  }
-
-public:
-  template<typename Callable>
-  function_ref(Callable &&callable,
-               typename std::enable_if<
-                   !std::is_same<typename std::remove_reference<Callable>::type,
-                                 function_ref>::value>::type * = nullptr)
-      : callback(callback_fn<typename std::remove_reference<Callable>::type>),
-        callable(reinterpret_cast<intptr_t>(&callable)) {}
-  Ret operator()(Param1 param1) {
-    return callback(callable, std::forward<Param1>(param1));
-  }
-};
-
-template<typename Ret, typename Param1, typename Param2>
-class function_ref<Ret(Param1, Param2)> {
-  Ret (*callback)(intptr_t callable, Param1 param1, Param2 param2);
-  intptr_t callable;
-
-  template<typename Callable>
-  static Ret callback_fn(intptr_t callable, Param1 param1, Param2 param2) {
-    return (*reinterpret_cast<Callable*>(callable))(
-        std::forward<Param1>(param1),
-        std::forward<Param2>(param2));
-  }
-
-public:
-  template<typename Callable>
-  function_ref(Callable &&callable,
-               typename std::enable_if<
-                   !std::is_same<typename std::remove_reference<Callable>::type,
-                                 function_ref>::value>::type * = nullptr)
-      : callback(callback_fn<typename std::remove_reference<Callable>::type>),
-        callable(reinterpret_cast<intptr_t>(&callable)) {}
-  Ret operator()(Param1 param1, Param2 param2) {
-    return callback(callable,
-                    std::forward<Param1>(param1),
-                    std::forward<Param2>(param2));
-  }
-};
-
-template<typename Ret, typename Param1, typename Param2, typename Param3>
-class function_ref<Ret(Param1, Param2, Param3)> {
-  Ret (*callback)(intptr_t callable, Param1 param1, Param2 param2, Param3 param3);
-  intptr_t callable;
-
-  template<typename Callable>
-  static Ret callback_fn(intptr_t callable, Param1 param1, Param2 param2,
-                         Param3 param3) {
-    return (*reinterpret_cast<Callable*>(callable))(
-        std::forward<Param1>(param1),
-        std::forward<Param2>(param2),
-        std::forward<Param3>(param3));
-  }
-
-public:
-  template<typename Callable>
-  function_ref(Callable &&callable,
-               typename std::enable_if<
-                   !std::is_same<typename std::remove_reference<Callable>::type,
-                                 function_ref>::value>::type * = nullptr)
-      : callback(callback_fn<typename std::remove_reference<Callable>::type>),
-        callable(reinterpret_cast<intptr_t>(&callable)) {}
-  Ret operator()(Param1 param1, Param2 param2, Param3 param3) {
-    return callback(callable,
-                    std::forward<Param1>(param1),
-                    std::forward<Param2>(param2),
-                    std::forward<Param3>(param3));
-  }
-};
-
-#endif
-
 // deleter - Very very very simple method that is used to invoke operator
 // delete on something.  It is used like this:
 //
@@ -301,6 +194,28 @@ struct less_second {
   }
 };
 
+// A subset of N3658. More stuff can be added as-needed.
+
+/// \brief Represents a compile-time sequence of integers.
+template <class T, T... I> struct integer_sequence {
+  typedef T value_type;
+
+  static LLVM_CONSTEXPR size_t size() { return sizeof...(I); }
+};
+
+/// \brief Alias for the common case of a sequence of size_ts.
+template <size_t... I>
+struct index_sequence : integer_sequence<std::size_t, I...> {};
+
+template <std::size_t N, std::size_t... I>
+struct build_index_impl : build_index_impl<N - 1, N - 1, I...> {};
+template <std::size_t... I>
+struct build_index_impl<0, I...> : index_sequence<I...> {};
+
+/// \brief Creates a compile-time integer sequence for a parameter pack.
+template <class... Ts>
+struct index_sequence_for : build_index_impl<sizeof...(Ts)> {};
+
 //===----------------------------------------------------------------------===//
 //     Extra additions for arrays
 //===----------------------------------------------------------------------===//
@@ -392,8 +307,6 @@ void DeleteContainerSeconds(Container &C) {
 //     Extra additions to <memory>
 //===----------------------------------------------------------------------===//
 
-#if LLVM_HAS_VARIADIC_TEMPLATES
-
 // Implement make_unique according to N3656.
 
 /// \brief Constructs a `new T()` with the given args and returns a
@@ -427,123 +340,7 @@ make_unique(size_t n) {
 /// This function isn't used and is only here to provide better compile errors.
 template <class T, class... Args>
 typename std::enable_if<std::extent<T>::value != 0>::type
-make_unique(Args &&...) LLVM_DELETED_FUNCTION;
-
-#else
-
-template <class T>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique() {
-  return std::unique_ptr<T>(new T());
-}
-
-template <class T, class Arg1>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(Arg1 &&arg1) {
-  return std::unique_ptr<T>(new T(std::forward<Arg1>(arg1)));
-}
-
-template <class T, class Arg1, class Arg2>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(Arg1 &&arg1, Arg2 &&arg2) {
-  return std::unique_ptr<T>(
-      new T(std::forward<Arg1>(arg1), std::forward<Arg2>(arg2)));
-}
-
-template <class T, class Arg1, class Arg2, class Arg3>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(Arg1 &&arg1, Arg2 &&arg2, Arg3 &&arg3) {
-  return std::unique_ptr<T>(new T(std::forward<Arg1>(arg1),
-                                  std::forward<Arg2>(arg2),
-                                  std::forward<Arg3>(arg3)));
-}
-
-template <class T, class Arg1, class Arg2, class Arg3, class Arg4>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(Arg1 &&arg1, Arg2 &&arg2, Arg3 &&arg3, Arg4 &&arg4) {
-  return std::unique_ptr<T>(
-      new T(std::forward<Arg1>(arg1), std::forward<Arg2>(arg2),
-            std::forward<Arg3>(arg3), std::forward<Arg4>(arg4)));
-}
-
-template <class T, class Arg1, class Arg2, class Arg3, class Arg4, class Arg5>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(Arg1 &&arg1, Arg2 &&arg2, Arg3 &&arg3, Arg4 &&arg4, Arg5 &&arg5) {
-  return std::unique_ptr<T>(
-      new T(std::forward<Arg1>(arg1), std::forward<Arg2>(arg2),
-            std::forward<Arg3>(arg3), std::forward<Arg4>(arg4),
-            std::forward<Arg5>(arg5)));
-}
-
-template <class T, class Arg1, class Arg2, class Arg3, class Arg4, class Arg5,
-          class Arg6>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(Arg1 &&arg1, Arg2 &&arg2, Arg3 &&arg3, Arg4 &&arg4, Arg5 &&arg5,
-            Arg6 &&arg6) {
-  return std::unique_ptr<T>(
-      new T(std::forward<Arg1>(arg1), std::forward<Arg2>(arg2),
-            std::forward<Arg3>(arg3), std::forward<Arg4>(arg4),
-            std::forward<Arg5>(arg5), std::forward<Arg6>(arg6)));
-}
-
-template <class T, class Arg1, class Arg2, class Arg3, class Arg4, class Arg5,
-          class Arg6, class Arg7>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(Arg1 &&arg1, Arg2 &&arg2, Arg3 &&arg3, Arg4 &&arg4, Arg5 &&arg5,
-            Arg6 &&arg6, Arg7 &&arg7) {
-  return std::unique_ptr<T>(
-      new T(std::forward<Arg1>(arg1), std::forward<Arg2>(arg2),
-            std::forward<Arg3>(arg3), std::forward<Arg4>(arg4),
-            std::forward<Arg5>(arg5), std::forward<Arg6>(arg6),
-            std::forward<Arg7>(arg7)));
-}
-
-template <class T, class Arg1, class Arg2, class Arg3, class Arg4, class Arg5,
-          class Arg6, class Arg7, class Arg8>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(Arg1 &&arg1, Arg2 &&arg2, Arg3 &&arg3, Arg4 &&arg4, Arg5 &&arg5,
-            Arg6 &&arg6, Arg7 &&arg7, Arg8 &&arg8) {
-  return std::unique_ptr<T>(
-      new T(std::forward<Arg1>(arg1), std::forward<Arg2>(arg2),
-            std::forward<Arg3>(arg3), std::forward<Arg4>(arg4),
-            std::forward<Arg5>(arg5), std::forward<Arg6>(arg6),
-            std::forward<Arg7>(arg7), std::forward<Arg8>(arg8)));
-}
-
-template <class T, class Arg1, class Arg2, class Arg3, class Arg4, class Arg5,
-          class Arg6, class Arg7, class Arg8, class Arg9>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(Arg1 &&arg1, Arg2 &&arg2, Arg3 &&arg3, Arg4 &&arg4, Arg5 &&arg5,
-            Arg6 &&arg6, Arg7 &&arg7, Arg8 &&arg8, Arg9 &&arg9) {
-  return std::unique_ptr<T>(
-      new T(std::forward<Arg1>(arg1), std::forward<Arg2>(arg2),
-            std::forward<Arg3>(arg3), std::forward<Arg4>(arg4),
-            std::forward<Arg5>(arg5), std::forward<Arg6>(arg6),
-            std::forward<Arg7>(arg7), std::forward<Arg8>(arg8),
-            std::forward<Arg9>(arg9)));
-}
-
-template <class T, class Arg1, class Arg2, class Arg3, class Arg4, class Arg5,
-          class Arg6, class Arg7, class Arg8, class Arg9, class Arg10>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(Arg1 &&arg1, Arg2 &&arg2, Arg3 &&arg3, Arg4 &&arg4, Arg5 &&arg5,
-            Arg6 &&arg6, Arg7 &&arg7, Arg8 &&arg8, Arg9 &&arg9, Arg10 &&arg10) {
-  return std::unique_ptr<T>(
-      new T(std::forward<Arg1>(arg1), std::forward<Arg2>(arg2),
-            std::forward<Arg3>(arg3), std::forward<Arg4>(arg4),
-            std::forward<Arg5>(arg5), std::forward<Arg6>(arg6),
-            std::forward<Arg7>(arg7), std::forward<Arg8>(arg8),
-            std::forward<Arg9>(arg9), std::forward<Arg10>(arg10)));
-}
-
-template <class T>
-typename std::enable_if<std::is_array<T>::value &&std::extent<T>::value == 0,
-                        std::unique_ptr<T>>::type
-make_unique(size_t n) {
-  return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]());
-}
-
-#endif
+make_unique(Args &&...) = delete;
 
 struct FreeDeleter {
   void operator()(void* v) {
@@ -558,6 +355,35 @@ struct pair_hash {
   }
 };
 
+/// A functor like C++14's std::less<void> in its absence.
+struct less {
+  template <typename A, typename B> bool operator()(A &&a, B &&b) const {
+    return std::forward<A>(a) < std::forward<B>(b);
+  }
+};
+
+/// A functor like C++14's std::equal<void> in its absence.
+struct equal {
+  template <typename A, typename B> bool operator()(A &&a, B &&b) const {
+    return std::forward<A>(a) == std::forward<B>(b);
+  }
+};
+
+/// Binary functor that adapts to any other binary functor after dereferencing
+/// operands.
+template <typename T> struct deref {
+  T func;
+  // Could be further improved to cope with non-derivable functors and
+  // non-binary functors (should be a variadic template member function
+  // operator()).
+  template <typename A, typename B>
+  auto operator()(A &lhs, B &rhs) const -> decltype(func(*lhs, *rhs)) {
+    assert(lhs);
+    assert(rhs);
+    return func(*lhs, *rhs);
+  }
+};
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/ADT/ScopedHashTable.h b/include/llvm/ADT/ScopedHashTable.h
index 2f60ecc..5abe76c 100644
--- a/include/llvm/ADT/ScopedHashTable.h
+++ b/include/llvm/ADT/ScopedHashTable.h
@@ -90,8 +90,8 @@ class ScopedHashTableScope {
   /// LastValInScope - This is the last value that was inserted for this scope
   /// or null if none have been inserted yet.
   ScopedHashTableVal<K, V> *LastValInScope;
-  void operator=(ScopedHashTableScope&) LLVM_DELETED_FUNCTION;
-  ScopedHashTableScope(ScopedHashTableScope&) LLVM_DELETED_FUNCTION;
+  void operator=(ScopedHashTableScope&) = delete;
+  ScopedHashTableScope(ScopedHashTableScope&) = delete;
 public:
   ScopedHashTableScope(ScopedHashTable<K, V, KInfo, AllocatorTy> &HT);
   ~ScopedHashTableScope();
diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h
index ababf0f..22e8ccd 100644
--- a/include/llvm/ADT/SmallBitVector.h
+++ b/include/llvm/ADT/SmallBitVector.h
@@ -53,6 +53,9 @@ class SmallBitVector {
     SmallNumDataBits = SmallNumRawBits - SmallNumSizeBits
   };
 
+  static_assert(NumBaseBits == 64 || NumBaseBits == 32,
+                "Unsupported word size");
+
 public:
   typedef unsigned size_type;
   // Encapsulation of a single bit.
@@ -177,11 +180,7 @@ public:
   size_type count() const {
     if (isSmall()) {
       uintptr_t Bits = getSmallBits();
-      if (NumBaseBits == 32)
-        return CountPopulation_32(Bits);
-      if (NumBaseBits == 64)
-        return CountPopulation_64(Bits);
-      llvm_unreachable("Unsupported!");
+      return countPopulation(Bits);
     }
     return getPointer()->count();
   }
@@ -214,11 +213,7 @@ public:
       uintptr_t Bits = getSmallBits();
       if (Bits == 0)
         return -1;
-      if (NumBaseBits == 32)
-        return countTrailingZeros(Bits);
-      if (NumBaseBits == 64)
-        return countTrailingZeros(Bits);
-      llvm_unreachable("Unsupported!");
+      return countTrailingZeros(Bits);
     }
     return getPointer()->find_first();
   }
@@ -232,11 +227,7 @@ public:
       Bits &= ~uintptr_t(0) << (Prev + 1);
       if (Bits == 0 || Prev + 1 >= getSmallSize())
         return -1;
-      if (NumBaseBits == 32)
-        return countTrailingZeros(Bits);
-      if (NumBaseBits == 64)
-        return countTrailingZeros(Bits);
-      llvm_unreachable("Unsupported!");
+      return countTrailingZeros(Bits);
     }
     return getPointer()->find_next(Prev);
   }
@@ -292,8 +283,12 @@ public:
   }
 
   SmallBitVector &set(unsigned Idx) {
-    if (isSmall())
+    if (isSmall()) {
+      assert(Idx <= static_cast<unsigned>(
+                        std::numeric_limits<uintptr_t>::digits) &&
+             "undefined behavior");
       setSmallBits(getSmallBits() | (uintptr_t(1) << Idx));
+    }
     else
       getPointer()->set(Idx);
     return *this;
@@ -556,7 +551,6 @@ public:
 private:
   template<bool AddBits, bool InvertMask>
   void applyMask(const uint32_t *Mask, unsigned MaskWords) {
-    assert((NumBaseBits == 64 || NumBaseBits == 32) && "Unsupported word size");
     if (NumBaseBits == 64 && MaskWords >= 2) {
       uint64_t M = Mask[0] | (uint64_t(Mask[1]) << 32);
       if (InvertMask) M = ~M;
diff --git a/include/llvm/ADT/SmallPtrSet.h b/include/llvm/ADT/SmallPtrSet.h
index b8977fa..3e3c9c1 100644
--- a/include/llvm/ADT/SmallPtrSet.h
+++ b/include/llvm/ADT/SmallPtrSet.h
@@ -132,7 +132,7 @@ private:
   /// Grow - Allocate a larger backing store for the buckets and move it over.
   void Grow(unsigned NewSize);
 
-  void operator=(const SmallPtrSetImplBase &RHS) LLVM_DELETED_FUNCTION;
+  void operator=(const SmallPtrSetImplBase &RHS) = delete;
 protected:
   /// swap - Swaps the elements of two sets.
   /// Note: This method assumes that both sets have the same small size.
@@ -242,7 +242,7 @@ template <typename PtrType>
 class SmallPtrSetImpl : public SmallPtrSetImplBase {
   typedef PointerLikeTypeTraits<PtrType> PtrTraits;
 
-  SmallPtrSetImpl(const SmallPtrSetImpl&) LLVM_DELETED_FUNCTION;
+  SmallPtrSetImpl(const SmallPtrSetImpl&) = delete;
 protected:
   // Constructors that forward to the base.
   SmallPtrSetImpl(const void **SmallStorage, const SmallPtrSetImpl &that)
@@ -257,8 +257,10 @@ public:
   typedef SmallPtrSetIterator<PtrType> iterator;
   typedef SmallPtrSetIterator<PtrType> const_iterator;
 
-  /// insert - This returns true if the pointer was new to the set, false if it
-  /// was already in the set.
+  /// Inserts Ptr if and only if there is no element in the container equal to
+  /// Ptr. The bool component of the returned pair is true if and only if the
+  /// insertion takes place, and the iterator component of the pair points to
+  /// the element equal to Ptr.
   std::pair<iterator, bool> insert(PtrType Ptr) {
     auto p = insert_imp(PtrTraits::getAsVoidPointer(Ptr));
     return std::make_pair(iterator(p.first, CurArray + CurArraySize), p.second);
diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index 2117541..14e2c7b 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h
@@ -135,11 +135,11 @@ public:
   const_pointer data() const { return const_pointer(begin()); }
 
   reference operator[](size_type idx) {
-    assert(begin() + idx < end());
+    assert(idx < size());
     return begin()[idx];
   }
   const_reference operator[](size_type idx) const {
-    assert(begin() + idx < end());
+    assert(idx < size());
     return begin()[idx];
   }
 
@@ -307,8 +307,11 @@ protected:
 
   /// Copy the range [I, E) onto the uninitialized memory
   /// starting with "Dest", constructing elements into it as needed.
-  template<typename T1, typename T2>
-  static void uninitialized_copy(T1 *I, T1 *E, T2 *Dest) {
+  template <typename T1, typename T2>
+  static void uninitialized_copy(
+      T1 *I, T1 *E, T2 *Dest,
+      typename std::enable_if<std::is_same<typename std::remove_const<T1>::type,
+                                           T2>::value>::type * = nullptr) {
     // Use memcpy for PODs iterated by pointers (which includes SmallVector
     // iterators): std::uninitialized_copy optimizes to memmove, but we can
     // use memcpy here.
@@ -340,7 +343,7 @@ template <typename T>
 class SmallVectorImpl : public SmallVectorTemplateBase<T, isPodLike<T>::value> {
   typedef SmallVectorTemplateBase<T, isPodLike<T>::value > SuperClass;
 
-  SmallVectorImpl(const SmallVectorImpl&) LLVM_DELETED_FUNCTION;
+  SmallVectorImpl(const SmallVectorImpl&) = delete;
 public:
   typedef typename SuperClass::iterator iterator;
   typedef typename SuperClass::size_type size_type;
@@ -367,7 +370,7 @@ public:
     this->EndX = this->BeginX;
   }
 
-  void resize(unsigned N) {
+  void resize(size_type N) {
     if (N < this->size()) {
       this->destroy_range(this->begin()+N, this->end());
       this->setEnd(this->begin()+N);
@@ -380,7 +383,7 @@ public:
     }
   }
 
-  void resize(unsigned N, const T &NV) {
+  void resize(size_type N, const T &NV) {
     if (N < this->size()) {
       this->destroy_range(this->begin()+N, this->end());
       this->setEnd(this->begin()+N);
@@ -392,7 +395,7 @@ public:
     }
   }
 
-  void reserve(unsigned N) {
+  void reserve(size_type N) {
     if (this->capacity() < N)
       this->grow(N);
   }
@@ -414,9 +417,7 @@ public:
       this->grow(this->size()+NumInputs);
 
     // Copy the new elements over.
-    // TODO: NEED To compile time dispatch on whether in_iter is a random access
-    // iterator to use the fast uninitialized_copy.
-    std::uninitialized_copy(in_start, in_end, this->end());
+    this->uninitialized_copy(in_start, in_end, this->end());
     this->setEnd(this->end() + NumInputs);
   }
 
@@ -431,7 +432,7 @@ public:
     this->setEnd(this->end() + NumInputs);
   }
 
-  void assign(unsigned NumElts, const T &Elt) {
+  void assign(size_type NumElts, const T &Elt) {
     clear();
     if (this->capacity() < NumElts)
       this->grow(NumElts);
@@ -537,7 +538,7 @@ public:
     assert(I <= this->end() && "Inserting past the end of the vector.");
 
     // Ensure there is enough space.
-    reserve(static_cast<unsigned>(this->size() + NumToInsert));
+    reserve(this->size() + NumToInsert);
 
     // Uninvalidate the iterator.
     I = this->begin()+InsertElt;
@@ -591,7 +592,7 @@ public:
     size_t NumToInsert = std::distance(From, To);
 
     // Ensure there is enough space.
-    reserve(static_cast<unsigned>(this->size() + NumToInsert));
+    reserve(this->size() + NumToInsert);
 
     // Uninvalidate the iterator.
     I = this->begin()+InsertElt;
@@ -632,6 +633,13 @@ public:
     return I;
   }
 
+  template <typename... ArgTypes> void emplace_back(ArgTypes &&... Args) {
+    if (LLVM_UNLIKELY(this->EndX >= this->CapacityX))
+      this->grow();
+    ::new ((void *)this->end()) T(std::forward<ArgTypes>(Args)...);
+    this->setEnd(this->end() + 1);
+  }
+
   SmallVectorImpl &operator=(const SmallVectorImpl &RHS);
 
   SmallVectorImpl &operator=(SmallVectorImpl &&RHS);
@@ -658,7 +666,7 @@ public:
   /// of the buffer when they know that more elements are available, and only
   /// update the size later. This avoids the cost of value initializing elements
   /// which will only be overwritten.
-  void set_size(unsigned N) {
+  void set_size(size_type N) {
     assert(N <= this->capacity());
     this->setEnd(this->begin() + N);
   }
@@ -684,7 +692,7 @@ void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
   // Swap the shared elements.
   size_t NumShared = this->size();
   if (NumShared > RHS.size()) NumShared = RHS.size();
-  for (unsigned i = 0; i != static_cast<unsigned>(NumShared); ++i)
+  for (size_type i = 0; i != NumShared; ++i)
     std::swap((*this)[i], RHS[i]);
 
   // Copy over the extra elts.
@@ -841,7 +849,7 @@ public:
   SmallVector() : SmallVectorImpl<T>(N) {
   }
 
-  explicit SmallVector(unsigned Size, const T &Value = T())
+  explicit SmallVector(size_t Size, const T &Value = T())
     : SmallVectorImpl<T>(N) {
     this->assign(Size, Value);
   }
@@ -876,6 +884,17 @@ public:
     SmallVectorImpl<T>::operator=(::std::move(RHS));
     return *this;
   }
+
+  SmallVector(SmallVectorImpl<T> &&RHS) : SmallVectorImpl<T>(N) {
+    if (!RHS.empty())
+      SmallVectorImpl<T>::operator=(::std::move(RHS));
+  }
+
+  const SmallVector &operator=(SmallVectorImpl<T> &&RHS) {
+    SmallVectorImpl<T>::operator=(::std::move(RHS));
+    return *this;
+  }
+
 };
 
 template<typename T, unsigned N>
diff --git a/include/llvm/ADT/SparseBitVector.h b/include/llvm/ADT/SparseBitVector.h
index d5bde29..20cbe2c 100644
--- a/include/llvm/ADT/SparseBitVector.h
+++ b/include/llvm/ADT/SparseBitVector.h
@@ -124,25 +124,15 @@ public:
   size_type count() const {
     unsigned NumBits = 0;
     for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i)
-      if (sizeof(BitWord) == 4)
-        NumBits += CountPopulation_32(Bits[i]);
-      else if (sizeof(BitWord) == 8)
-        NumBits += CountPopulation_64(Bits[i]);
-      else
-        llvm_unreachable("Unsupported!");
+      NumBits += countPopulation(Bits[i]);
     return NumBits;
   }
 
   /// find_first - Returns the index of the first set bit.
   int find_first() const {
     for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i)
-      if (Bits[i] != 0) {
-        if (sizeof(BitWord) == 4)
-          return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
-        if (sizeof(BitWord) == 8)
-          return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
-        llvm_unreachable("Unsupported!");
-      }
+      if (Bits[i] != 0)
+        return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
     llvm_unreachable("Illegal empty element");
   }
 
@@ -161,23 +151,13 @@ public:
     // Mask off previous bits.
     Copy &= ~0UL << BitPos;
 
-    if (Copy != 0) {
-      if (sizeof(BitWord) == 4)
-        return WordPos * BITWORD_SIZE + countTrailingZeros(Copy);
-      if (sizeof(BitWord) == 8)
-        return WordPos * BITWORD_SIZE + countTrailingZeros(Copy);
-      llvm_unreachable("Unsupported!");
-    }
+    if (Copy != 0)
+      return WordPos * BITWORD_SIZE + countTrailingZeros(Copy);
 
     // Check subsequent words.
     for (unsigned i = WordPos+1; i < BITWORDS_PER_ELEMENT; ++i)
-      if (Bits[i] != 0) {
-        if (sizeof(BitWord) == 4)
-          return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
-        if (sizeof(BitWord) == 8)
-          return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
-        llvm_unreachable("Unsupported!");
-      }
+      if (Bits[i] != 0)
+        return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
     return -1;
   }
 
diff --git a/include/llvm/ADT/SparseMultiSet.h b/include/llvm/ADT/SparseMultiSet.h
index f858536..e3aa258 100644
--- a/include/llvm/ADT/SparseMultiSet.h
+++ b/include/llvm/ADT/SparseMultiSet.h
@@ -133,8 +133,8 @@ class SparseMultiSet {
 
   // Disable copy construction and assignment.
   // This data structure is not meant to be used that way.
-  SparseMultiSet(const SparseMultiSet&) LLVM_DELETED_FUNCTION;
-  SparseMultiSet &operator=(const SparseMultiSet&) LLVM_DELETED_FUNCTION;
+  SparseMultiSet(const SparseMultiSet&) = delete;
+  SparseMultiSet &operator=(const SparseMultiSet&) = delete;
 
   /// Whether the given entry is the head of the list. List heads's previous
   /// pointers are to the tail of the list, allowing for efficient access to the
diff --git a/include/llvm/ADT/SparseSet.h b/include/llvm/ADT/SparseSet.h
index 9a13440..a45d1c8 100644
--- a/include/llvm/ADT/SparseSet.h
+++ b/include/llvm/ADT/SparseSet.h
@@ -133,8 +133,8 @@ class SparseSet {
 
   // Disable copy construction and assignment.
   // This data structure is not meant to be used that way.
-  SparseSet(const SparseSet&) LLVM_DELETED_FUNCTION;
-  SparseSet &operator=(const SparseSet&) LLVM_DELETED_FUNCTION;
+  SparseSet(const SparseSet&) = delete;
+  SparseSet &operator=(const SparseSet&) = delete;
 
 public:
   typedef ValueT value_type;
diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h
index 2feb2ab..8721c73 100644
--- a/include/llvm/ADT/StringMap.h
+++ b/include/llvm/ADT/StringMap.h
@@ -111,7 +111,7 @@ public:
 /// and data.
 template<typename ValueTy>
 class StringMapEntry : public StringMapEntryBase {
-  StringMapEntry(StringMapEntry &E) LLVM_DELETED_FUNCTION;
+  StringMapEntry(StringMapEntry &E) = delete;
 public:
   ValueTy second;
 
@@ -179,19 +179,6 @@ public:
     return Create(Key, ValueTy());
   }
 
-  /// GetStringMapEntryFromValue - Given a value that is known to be embedded
-  /// into a StringMapEntry, return the StringMapEntry itself.
-  static StringMapEntry &GetStringMapEntryFromValue(ValueTy &V) {
-    StringMapEntry *EPtr = 0;
-    char *Ptr = reinterpret_cast<char*>(&V) -
-                  (reinterpret_cast<char*>(&EPtr->second) -
-                   reinterpret_cast<char*>(EPtr));
-    return *reinterpret_cast<StringMapEntry*>(Ptr);
-  }
-  static const StringMapEntry &GetStringMapEntryFromValue(const ValueTy &V) {
-    return GetStringMapEntryFromValue(const_cast<ValueTy&>(V));
-  }
-
   /// GetStringMapEntryFromKeyData - Given key data that is known to be embedded
   /// into a StringMapEntry, return the StringMapEntry itself.
   static StringMapEntry &GetStringMapEntryFromKeyData(const char *KeyData) {
diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h
index 778fa10..6111c42 100644
--- a/include/llvm/ADT/StringRef.h
+++ b/include/llvm/ADT/StringRef.h
@@ -91,6 +91,13 @@ namespace llvm {
 
     iterator end() const { return Data + Length; }
 
+    const unsigned char *bytes_begin() const {
+      return reinterpret_cast<const unsigned char *>(begin());
+    }
+    const unsigned char *bytes_end() const {
+      return reinterpret_cast<const unsigned char *>(end());
+    }
+
     /// @}
     /// @name String Operations
     /// @{
diff --git a/include/llvm/ADT/TinyPtrVector.h b/include/llvm/ADT/TinyPtrVector.h
index e158f9d..f29608f 100644
--- a/include/llvm/ADT/TinyPtrVector.h
+++ b/include/llvm/ADT/TinyPtrVector.h
@@ -27,9 +27,12 @@ class TinyPtrVector {
 public:
   typedef llvm::SmallVector<EltTy, 4> VecTy;
   typedef typename VecTy::value_type value_type;
+  typedef llvm::PointerUnion<EltTy, VecTy *> PtrUnion;
 
-  llvm::PointerUnion<EltTy, VecTy*> Val;
+private:
+  PtrUnion Val;
 
+public:
   TinyPtrVector() {}
   ~TinyPtrVector() {
     if (VecTy *V = Val.template dyn_cast<VecTy*>())
@@ -96,6 +99,14 @@ public:
     return *this;
   }
 
+  /// Constructor from an ArrayRef.
+  ///
+  /// This also is a constructor for individual array elements due to the single
+  /// element constructor for ArrayRef.
+  explicit TinyPtrVector(ArrayRef<EltTy> Elts)
+      : Val(Elts.size() == 1 ? PtrUnion(Elts[0])
+                             : PtrUnion(new VecTy(Elts.begin(), Elts.end()))) {}
+
   // implicit conversion operator to ArrayRef.
   operator ArrayRef<EltTy>() const {
     if (Val.isNull())
@@ -105,6 +116,15 @@ public:
     return *Val.template get<VecTy*>();
   }
 
+  // implicit conversion operator to MutableArrayRef.
+  operator MutableArrayRef<EltTy>() {
+    if (Val.isNull())
+      return None;
+    if (Val.template is<EltTy>())
+      return *Val.getAddrOfPtr1();
+    return *Val.template get<VecTy*>();
+  }
+
   bool empty() const {
     // This vector can be empty if it contains no element, or if it
     // contains a pointer to an empty vector.
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index fbc19f8..886f6fb 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -50,6 +50,7 @@ public:
     armeb,      // ARM (big endian): armeb
     aarch64,    // AArch64 (little endian): aarch64
     aarch64_be, // AArch64 (big endian): aarch64_be
+    bpf,        // eBPF or extended BPF or 64-bit BPF (little endian)
     hexagon,    // Hexagon: hexagon
     mips,       // MIPS: mips, mipsallegrex
     mipsel,     // MIPSEL: mipsel, mipsallegrexel
@@ -60,6 +61,7 @@ public:
     ppc64,      // PPC64: powerpc64, ppu
     ppc64le,    // PPC64LE: powerpc64le
     r600,       // R600: AMD GPUs HD2XXX - HD6XXX
+    amdgcn,     // AMDGCN: AMD GCN GPUs
     sparc,      // Sparc: sparc
     sparcv9,    // Sparcv9: Sparcv9
     systemz,    // SystemZ: s390x
@@ -138,7 +140,9 @@ public:
     Bitrig,
     AIX,
     CUDA,       // NVIDIA CUDA
-    NVCL        // NVIDIA OpenCL
+    NVCL,       // NVIDIA OpenCL
+    AMDHSA,     // AMD HSA Runtime
+    PS4
   };
   enum EnvironmentType {
     UnknownEnvironment,
@@ -198,6 +202,13 @@ public:
   Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr,
          const Twine &EnvironmentStr);
 
+  bool operator==(const Triple &Other) const {
+    return Arch == Other.Arch && SubArch == Other.SubArch &&
+           Vendor == Other.Vendor && OS == Other.OS &&
+           Environment == Other.Environment &&
+           ObjectFormat == Other.ObjectFormat;
+  }
+
   /// @}
   /// @name Normalization
   /// @{
@@ -208,6 +219,9 @@ public:
   /// common case in which otherwise valid components are in the wrong order.
   static std::string normalize(StringRef Str);
 
+  /// \brief Return the normalized form of this triple's string.
+  std::string normalize() const { return normalize(Data); }
+
   /// @}
   /// @name Typed Component Access
   /// @{
@@ -332,6 +346,12 @@ public:
     return false;
   }
 
+  bool isOSVersionLT(const Triple &Other) const {
+    unsigned RHS[3];
+    Other.getOSVersion(RHS[0], RHS[1], RHS[2]);
+    return isOSVersionLT(RHS[0], RHS[1], RHS[2]);
+  }
+
   /// isMacOSXVersionLT - Comparison function for checking OS X version
   /// compatibility, which handles supporting skewed version numbering schemes
   /// used by the "darwin" triples.
@@ -364,10 +384,28 @@ public:
     return isMacOSX() || isiOS();
   }
 
+  bool isOSNetBSD() const {
+    return getOS() == Triple::NetBSD;
+  }
+
+  bool isOSOpenBSD() const {
+    return getOS() == Triple::OpenBSD;
+  }
+
   bool isOSFreeBSD() const {
     return getOS() == Triple::FreeBSD;
   }
 
+  bool isOSDragonFly() const { return getOS() == Triple::DragonFly; }
+
+  bool isOSSolaris() const {
+    return getOS() == Triple::Solaris;
+  }
+
+  bool isOSBitrig() const {
+    return getOS() == Triple::Bitrig;
+  }
+
   bool isWindowsMSVCEnvironment() const {
     return getOS() == Triple::Win32 &&
            (getEnvironment() == Triple::UnknownEnvironment ||
@@ -403,7 +441,7 @@ public:
 
   /// \brief Tests whether the OS is Windows.
   bool isOSWindows() const {
-    return getOS() == Triple::Win32 || isOSCygMing();
+    return getOS() == Triple::Win32;
   }
 
   /// \brief Tests whether the OS is NaCl (Native Client)
@@ -431,6 +469,19 @@ public:
     return getObjectFormat() == Triple::MachO;
   }
 
+  /// \brief Tests whether the target is the PS4 CPU
+  bool isPS4CPU() const {
+    return getArch() == Triple::x86_64 &&
+           getVendor() == Triple::SCEI &&
+           getOS() == Triple::PS4;
+  }
+
+  /// \brief Tests whether the target is the PS4 platform
+  bool isPS4() const {
+    return getVendor() == Triple::SCEI &&
+           getOS() == Triple::PS4;
+  }
+
   /// @}
   /// @name Mutators
   /// @{
diff --git a/include/llvm/ADT/Twine.h b/include/llvm/ADT/Twine.h
index 77d92b4..9e9a4e1 100644
--- a/include/llvm/ADT/Twine.h
+++ b/include/llvm/ADT/Twine.h
@@ -80,7 +80,7 @@ namespace llvm {
   /// StringRef) codegen as desired.
   class Twine {
     /// NodeKind - Represent the type of an argument.
-    enum NodeKind {
+    enum NodeKind : unsigned char {
       /// An empty string; the result of concatenating anything with it is also
       /// empty.
       NullKind,
@@ -153,12 +153,10 @@ namespace llvm {
     /// RHS - The suffix in the concatenation, which may be uninitialized for
     /// Null or Empty kinds.
     Child RHS;
-    // enums stored as unsigned chars to save on space while some compilers
-    // don't support specifying the backing type for an enum
     /// LHSKind - The NodeKind of the left hand side, \see getLHSKind().
-    unsigned char LHSKind;
+    NodeKind LHSKind;
     /// RHSKind - The NodeKind of the right hand side, \see getRHSKind().
-    unsigned char RHSKind;
+    NodeKind RHSKind;
 
   private:
     /// Construct a nullary twine; the kind must be NullKind or EmptyKind.
@@ -184,7 +182,7 @@ namespace llvm {
 
     /// Since the intended use of twines is as temporary objects, assignments
     /// when concatenating might cause undefined behavior or stack corruptions
-    Twine &operator=(const Twine &Other) LLVM_DELETED_FUNCTION;
+    Twine &operator=(const Twine &Other) = delete;
 
     /// isNull - Check for the null twine.
     bool isNull() const {
@@ -238,10 +236,10 @@ namespace llvm {
     }
 
     /// getLHSKind - Get the NodeKind of the left-hand side.
-    NodeKind getLHSKind() const { return (NodeKind) LHSKind; }
+    NodeKind getLHSKind() const { return LHSKind; }
 
     /// getRHSKind - Get the NodeKind of the right-hand side.
-    NodeKind getRHSKind() const { return (NodeKind) RHSKind; }
+    NodeKind getRHSKind() const { return RHSKind; }
 
     /// printOneChild - Print one child from a twine.
     void printOneChild(raw_ostream &OS, Child Ptr, NodeKind Kind) const;
diff --git a/include/llvm/ADT/ilist.h b/include/llvm/ADT/ilist.h
index 8c19a6f..a7b9306 100644
--- a/include/llvm/ADT/ilist.h
+++ b/include/llvm/ADT/ilist.h
@@ -237,14 +237,14 @@ public:
 // These are to catch errors when people try to use them as random access
 // iterators.
 template<typename T>
-void operator-(int, ilist_iterator<T>) LLVM_DELETED_FUNCTION;
+void operator-(int, ilist_iterator<T>) = delete;
 template<typename T>
-void operator-(ilist_iterator<T>,int) LLVM_DELETED_FUNCTION;
+void operator-(ilist_iterator<T>,int) = delete;
 
 template<typename T>
-void operator+(int, ilist_iterator<T>) LLVM_DELETED_FUNCTION;
+void operator+(int, ilist_iterator<T>) = delete;
 template<typename T>
-void operator+(ilist_iterator<T>,int) LLVM_DELETED_FUNCTION;
+void operator+(ilist_iterator<T>,int) = delete;
 
 // operator!=/operator== - Allow mixed comparisons without dereferencing
 // the iterator, which could very likely be pointing to end().
@@ -332,8 +332,8 @@ class iplist : public Traits {
 
   // No fundamental reason why iplist can't be copyable, but the default
   // copy/copy-assign won't do.
-  iplist(const iplist &) LLVM_DELETED_FUNCTION;
-  void operator=(const iplist &) LLVM_DELETED_FUNCTION;
+  iplist(const iplist &) = delete;
+  void operator=(const iplist &) = delete;
 
 public:
   typedef NodeTy *pointer;
diff --git a/include/llvm/ADT/iterator.h b/include/llvm/ADT/iterator.h
index 56041db..e2c9e5e 100644
--- a/include/llvm/ADT/iterator.h
+++ b/include/llvm/ADT/iterator.h
@@ -10,8 +10,8 @@
 #ifndef LLVM_ADT_ITERATOR_H
 #define LLVM_ADT_ITERATOR_H
 
-#include <iterator>
 #include <cstddef>
+#include <iterator>
 
 namespace llvm {
 
diff --git a/include/llvm/ADT/iterator_range.h b/include/llvm/ADT/iterator_range.h
index ecaf4a2..523a86f 100644
--- a/include/llvm/ADT/iterator_range.h
+++ b/include/llvm/ADT/iterator_range.h
@@ -32,7 +32,6 @@ class iterator_range {
   IteratorT begin_iterator, end_iterator;
 
 public:
-  iterator_range() {}
   iterator_range(IteratorT begin_iterator, IteratorT end_iterator)
       : begin_iterator(std::move(begin_iterator)),
         end_iterator(std::move(end_iterator)) {}
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 9bfa045..763f372 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -502,7 +502,7 @@ public:
   ///
 
   /// canBasicBlockModify - Return true if it is possible for execution of the
-  /// specified basic block to modify the value pointed to by Ptr.
+  /// specified basic block to modify the location Loc.
   bool canBasicBlockModify(const BasicBlock &BB, const Location &Loc);
 
   /// canBasicBlockModify - A convenience wrapper.
@@ -510,17 +510,20 @@ public:
     return canBasicBlockModify(BB, Location(P, Size));
   }
 
-  /// canInstructionRangeModify - Return true if it is possible for the
-  /// execution of the specified instructions to modify the value pointed to by
-  /// Ptr.  The instructions to consider are all of the instructions in the
-  /// range of [I1,I2] INCLUSIVE.  I1 and I2 must be in the same basic block.
-  bool canInstructionRangeModify(const Instruction &I1, const Instruction &I2,
-                                 const Location &Loc);
-
-  /// canInstructionRangeModify - A convenience wrapper.
-  bool canInstructionRangeModify(const Instruction &I1, const Instruction &I2,
-                                 const Value *Ptr, uint64_t Size) {
-    return canInstructionRangeModify(I1, I2, Location(Ptr, Size));
+  /// canInstructionRangeModRef - Return true if it is possible for the
+  /// execution of the specified instructions to mod\ref (according to the
+  /// mode) the location Loc. The instructions to consider are all
+  /// of the instructions in the range of [I1,I2] INCLUSIVE.
+  /// I1 and I2 must be in the same basic block.
+  bool canInstructionRangeModRef(const Instruction &I1,
+                                const Instruction &I2, const Location &Loc,
+                                const ModRefResult Mode);
+
+  /// canInstructionRangeModRef - A convenience wrapper.
+  bool canInstructionRangeModRef(const Instruction &I1,
+                                 const Instruction &I2, const Value *Ptr,
+                                 uint64_t Size, const ModRefResult Mode) {
+    return canInstructionRangeModRef(I1, I2, Location(Ptr, Size), Mode);
   }
 
   //===--------------------------------------------------------------------===//
diff --git a/include/llvm/Analysis/AliasSetTracker.h b/include/llvm/Analysis/AliasSetTracker.h
index 036d58d..afa7e6f 100644
--- a/include/llvm/Analysis/AliasSetTracker.h
+++ b/include/llvm/Analysis/AliasSetTracker.h
@@ -226,8 +226,8 @@ private:
       AccessTy(NoModRef), AliasTy(MustAlias), Volatile(false) {
   }
 
-  AliasSet(const AliasSet &AS) LLVM_DELETED_FUNCTION;
-  void operator=(const AliasSet &AS) LLVM_DELETED_FUNCTION;
+  AliasSet(const AliasSet &AS) = delete;
+  void operator=(const AliasSet &AS) = delete;
 
   PointerRec *getSomePointer() const {
     return PtrList;
diff --git a/include/llvm/Analysis/AssumptionCache.h b/include/llvm/Analysis/AssumptionCache.h
new file mode 100644
index 0000000..fc1393f
--- /dev/null
+++ b/include/llvm/Analysis/AssumptionCache.h
@@ -0,0 +1,183 @@
+//===- llvm/Analysis/AssumptionCache.h - Track @llvm.assume ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that keeps track of @llvm.assume intrinsics in
+// the functions of a module (allowing assumptions within any function to be
+// found cheaply by other parts of the optimizer).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_ASSUMPTIONCACHE_H
+#define LLVM_ANALYSIS_ASSUMPTIONCACHE_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include <memory>
+
+namespace llvm {
+
+// FIXME: Replace this brittle forward declaration with the include of the new
+// PassManager.h when doing so doesn't break the PassManagerBuilder.
+template <typename IRUnitT> class AnalysisManager;
+class PreservedAnalyses;
+
+/// \brief A cache of @llvm.assume calls within a function.
+///
+/// This cache provides fast lookup of assumptions within a function by caching
+/// them and amortizing the cost of scanning for them across all queries. The
+/// cache is also conservatively self-updating so that it will never return
+/// incorrect results about a function even as the function is being mutated.
+/// However, flushing the cache and rebuilding it (or explicitly updating it)
+/// may allow it to discover new assumptions.
+class AssumptionCache {
+  /// \brief The function for which this cache is handling assumptions.
+  ///
+  /// We track this to lazily populate our assumptions.
+  Function &F;
+
+  /// \brief Vector of weak value handles to calls of the @llvm.assume
+  /// intrinsic.
+  SmallVector<WeakVH, 4> AssumeHandles;
+
+  /// \brief Flag tracking whether we have scanned the function yet.
+  ///
+  /// We want to be as lazy about this as possible, and so we scan the function
+  /// at the last moment.
+  bool Scanned;
+
+  /// \brief Scan the function for assumptions and add them to the cache.
+  void scanFunction();
+
+public:
+  /// \brief Construct an AssumptionCache from a function by scanning all of
+  /// its instructions.
+  AssumptionCache(Function &F) : F(F), Scanned(false) {}
+
+  /// \brief Add an @llvm.assume intrinsic to this function's cache.
+  ///
+  /// The call passed in must be an instruction within this fuction and must
+  /// not already be in the cache.
+  void registerAssumption(CallInst *CI);
+
+  /// \brief Clear the cache of @llvm.assume intrinsics for a function.
+  ///
+  /// It will be re-scanned the next time it is requested.
+  void clear() {
+    AssumeHandles.clear();
+    Scanned = false;
+  }
+
+  /// \brief Access the list of assumption handles currently tracked for this
+  /// fuction.
+  ///
+  /// Note that these produce weak handles that may be null. The caller must
+  /// handle that case.
+  /// FIXME: We should replace this with pointee_iterator<filter_iterator<...>>
+  /// when we can write that to filter out the null values. Then caller code
+  /// will become simpler.
+  MutableArrayRef<WeakVH> assumptions() {
+    if (!Scanned)
+      scanFunction();
+    return AssumeHandles;
+  }
+};
+
+/// \brief A function analysis which provides an \c AssumptionCache.
+///
+/// This analysis is intended for use with the new pass manager and will vend
+/// assumption caches for a given function.
+class AssumptionAnalysis {
+  static char PassID;
+
+public:
+  typedef AssumptionCache Result;
+
+  /// \brief Opaque, unique identifier for this analysis pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// \brief Provide a name for the analysis for debugging and logging.
+  static StringRef name() { return "AssumptionAnalysis"; }
+
+  AssumptionAnalysis() {}
+  AssumptionAnalysis(const AssumptionAnalysis &Arg) {}
+  AssumptionAnalysis(AssumptionAnalysis &&Arg) {}
+  AssumptionAnalysis &operator=(const AssumptionAnalysis &RHS) { return *this; }
+  AssumptionAnalysis &operator=(AssumptionAnalysis &&RHS) { return *this; }
+
+  AssumptionCache run(Function &F) { return AssumptionCache(F); }
+};
+
+/// \brief Printer pass for the \c AssumptionAnalysis results.
+class AssumptionPrinterPass {
+  raw_ostream &OS;
+
+public:
+  explicit AssumptionPrinterPass(raw_ostream &OS) : OS(OS) {}
+  PreservedAnalyses run(Function &F, AnalysisManager<Function> *AM);
+
+  static StringRef name() { return "AssumptionPrinterPass"; }
+};
+
+/// \brief An immutable pass that tracks lazily created \c AssumptionCache
+/// objects.
+///
+/// This is essentially a workaround for the legacy pass manager's weaknesses
+/// which associates each assumption cache with Function and clears it if the
+/// function is deleted. The nature of the AssumptionCache is that it is not
+/// invalidated by any changes to the function body and so this is sufficient
+/// to be conservatively correct.
+class AssumptionCacheTracker : public ImmutablePass {
+  /// A callback value handle applied to function objects, which we use to
+  /// delete our cache of intrinsics for a function when it is deleted.
+  class FunctionCallbackVH : public CallbackVH {
+    AssumptionCacheTracker *ACT;
+    void deleted() override;
+
+  public:
+    typedef DenseMapInfo<Value *> DMI;
+
+    FunctionCallbackVH(Value *V, AssumptionCacheTracker *ACT = nullptr)
+        : CallbackVH(V), ACT(ACT) {}
+  };
+
+  friend FunctionCallbackVH;
+
+  typedef DenseMap<FunctionCallbackVH, std::unique_ptr<AssumptionCache>,
+                   FunctionCallbackVH::DMI> FunctionCallsMap;
+  FunctionCallsMap AssumptionCaches;
+
+public:
+  /// \brief Get the cached assumptions for a function.
+  ///
+  /// If no assumptions are cached, this will scan the function. Otherwise, the
+  /// existing cache will be returned.
+  AssumptionCache &getAssumptionCache(Function &F);
+
+  AssumptionCacheTracker();
+  ~AssumptionCacheTracker();
+
+  void releaseMemory() override { AssumptionCaches.shrink_and_clear(); }
+
+  void verifyAnalysis() const override;
+  bool doFinalization(Module &) override {
+    verifyAnalysis();
+    return false;
+  }
+
+  static char ID; // Pass identification, replacement for typeid
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/Analysis/AssumptionTracker.h b/include/llvm/Analysis/AssumptionTracker.h
deleted file mode 100644
index 5a050a8..0000000
--- a/include/llvm/Analysis/AssumptionTracker.h
+++ /dev/null
@@ -1,128 +0,0 @@
-//===- llvm/Analysis/AssumptionTracker.h - Track @llvm.assume ---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a pass that keeps track of @llvm.assume intrinsics in
-// the functions of a module (allowing assumptions within any function to be
-// found cheaply by other parts of the optimizer).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_ASSUMPTIONTRACKER_H
-#define LLVM_ANALYSIS_ASSUMPTIONTRACKER_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Pass.h"
-#include <memory>
-
-namespace llvm {
-
-/// An immutable pass that tracks @llvm.assume intrinsics in a module.
-class AssumptionTracker : public ImmutablePass {
-  /// A callback value handle applied to function objects, which we use to
-  /// delete our cache of intrinsics for a function when it is deleted.
-  class FunctionCallbackVH : public CallbackVH {
-    AssumptionTracker *AT;
-    void deleted() override;
-
-    public:
-      typedef DenseMapInfo<Value *> DMI;
-
-      FunctionCallbackVH(Value *V, AssumptionTracker *AT = nullptr)
-        : CallbackVH(V), AT(AT) {}
-  };
-
-  /// A callback value handle applied to call instructions, which keeps
-  /// track of the call's parent function so that we can remove a
-  /// assumption intrinsic call from our cache when the instruction is
-  /// deleted.
-  class CallCallbackVH : public CallbackVH {
-    AssumptionTracker *AT;
-    void deleted() override;
-
-    // We store the function here because we need it to lookup the set
-    // containing this handle when the underlying CallInst is being deleted.
-    Function *F;
-
-    public:
-      typedef DenseMapInfo<Instruction *> DMI;
-
-      CallCallbackVH(Instruction *I, AssumptionTracker *AT = nullptr)
-        : CallbackVH(I), AT(AT), F(nullptr) {
-        if (I != DMI::getEmptyKey() && I != DMI::getTombstoneKey())
-          F = I->getParent()->getParent();
-      }
-
-      operator CallInst*() const {
-        Value *V = getValPtr();
-        if (V == DMI::getEmptyKey() || V == DMI::getTombstoneKey())
-          return reinterpret_cast<CallInst*>(V);
-
-        return cast<CallInst>(V);
-      }
-
-      CallInst *operator->() const { return cast<CallInst>(getValPtr()); }
-      CallInst &operator*() const { return *cast<CallInst>(getValPtr()); }
-  };
-
-  friend FunctionCallbackVH;
-  friend CallCallbackVH;
-
-  // FIXME: SmallSet might be better here, but it currently has no iterators.
-  typedef DenseSet<CallCallbackVH, CallCallbackVH::DMI> CallHandleSet;
-  typedef DenseMap<FunctionCallbackVH, std::unique_ptr<CallHandleSet>,
-                   FunctionCallbackVH::DMI> FunctionCallsMap;
-  FunctionCallsMap CachedAssumeCalls;
-
-  /// Scan the provided function for @llvm.assume intrinsic calls. Returns an
-  /// iterator to the set for this function in the CachedAssumeCalls map.
-  FunctionCallsMap::iterator scanFunction(Function *F);
-
-public:
-  /// Remove the cache of @llvm.assume intrinsics for the given function.
-  void forgetCachedAssumptions(Function *F);
-
-  /// Add an @llvm.assume intrinsic to the cache for its parent function.
-  void registerAssumption(CallInst *CI);
-
-  typedef CallHandleSet::iterator assumption_iterator;
-  typedef iterator_range<assumption_iterator> assumption_range;
-
-  inline assumption_range assumptions(Function *F) {
-    FunctionCallsMap::iterator I = CachedAssumeCalls.find_as(F);
-    if (I == CachedAssumeCalls.end()) {
-      I = scanFunction(F);
-    }
-
-    return assumption_range(I->second->begin(), I->second->end());
-  }
-
-  AssumptionTracker();
-  ~AssumptionTracker();
-
-  void releaseMemory() override {
-    CachedAssumeCalls.shrink_and_clear();
-  }
-
-  void verifyAnalysis() const override;
-  bool doFinalization(Module &) override {
-    verifyAnalysis();
-    return false;
-  }
-
-  static char ID; // Pass identification, replacement for typeid
-};
-
-} // end namespace llvm
-
-#endif
diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h
index 4414c84..89eef68 100644
--- a/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -111,6 +111,10 @@ public:
   void setEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors,
                      uint32_t Weight);
 
+  static uint32_t getBranchWeightStackProtector(bool IsLikely) {
+    return IsLikely ? (1u << 20) - 1 : 1;
+  }
+
 private:
   // Since we allow duplicate edges from one basic block to another, we use
   // a pair (PredBlock and an index in the successors) to specify an edge.
diff --git a/include/llvm/Analysis/CGSCCPassManager.h b/include/llvm/Analysis/CGSCCPassManager.h
index 1533b36..0d4fe93 100644
--- a/include/llvm/Analysis/CGSCCPassManager.h
+++ b/include/llvm/Analysis/CGSCCPassManager.h
@@ -21,135 +21,25 @@
 #ifndef LLVM_ANALYSIS_CGSCCPASSMANAGER_H
 #define LLVM_ANALYSIS_CGSCCPASSMANAGER_H
 
-#include "llvm/IR/PassManager.h"
 #include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/IR/PassManager.h"
 
 namespace llvm {
 
-class CGSCCAnalysisManager;
-
-class CGSCCPassManager {
-public:
-  // We have to explicitly define all the special member functions because MSVC
-  // refuses to generate them.
-  CGSCCPassManager() {}
-  CGSCCPassManager(CGSCCPassManager &&Arg) : Passes(std::move(Arg.Passes)) {}
-  CGSCCPassManager &operator=(CGSCCPassManager &&RHS) {
-    Passes = std::move(RHS.Passes);
-    return *this;
-  }
-
-  /// \brief Run all of the CGSCC passes in this pass manager over a SCC.
-  PreservedAnalyses run(LazyCallGraph::SCC *C,
-                        CGSCCAnalysisManager *AM = nullptr);
-
-  template <typename CGSCCPassT> void addPass(CGSCCPassT Pass) {
-    Passes.emplace_back(new CGSCCPassModel<CGSCCPassT>(std::move(Pass)));
-  }
-
-  static StringRef name() { return "CGSCCPassManager"; }
-
-private:
-  // Pull in the concept type and model template specialized for SCCs.
-  typedef detail::PassConcept<LazyCallGraph::SCC *, CGSCCAnalysisManager>
-  CGSCCPassConcept;
-  template <typename PassT>
-  struct CGSCCPassModel
-      : detail::PassModel<LazyCallGraph::SCC *, CGSCCAnalysisManager, PassT> {
-    CGSCCPassModel(PassT Pass)
-        : detail::PassModel<LazyCallGraph::SCC *, CGSCCAnalysisManager, PassT>(
-              std::move(Pass)) {}
-  };
-
-  CGSCCPassManager(const CGSCCPassManager &) LLVM_DELETED_FUNCTION;
-  CGSCCPassManager &operator=(const CGSCCPassManager &) LLVM_DELETED_FUNCTION;
-
-  std::vector<std::unique_ptr<CGSCCPassConcept>> Passes;
-};
-
-/// \brief A function analysis manager to coordinate and cache analyses run over
-/// a module.
-class CGSCCAnalysisManager : public detail::AnalysisManagerBase<
-                                 CGSCCAnalysisManager, LazyCallGraph::SCC *> {
-  friend class detail::AnalysisManagerBase<CGSCCAnalysisManager,
-                                           LazyCallGraph::SCC *>;
-  typedef detail::AnalysisManagerBase<CGSCCAnalysisManager,
-                                      LazyCallGraph::SCC *> BaseT;
-  typedef BaseT::ResultConceptT ResultConceptT;
-  typedef BaseT::PassConceptT PassConceptT;
-
-public:
-  // Most public APIs are inherited from the CRTP base class.
-
-  // We have to explicitly define all the special member functions because MSVC
-  // refuses to generate them.
-  CGSCCAnalysisManager() {}
-  CGSCCAnalysisManager(CGSCCAnalysisManager &&Arg)
-      : BaseT(std::move(static_cast<BaseT &>(Arg))),
-        CGSCCAnalysisResults(std::move(Arg.CGSCCAnalysisResults)) {}
-  CGSCCAnalysisManager &operator=(CGSCCAnalysisManager &&RHS) {
-    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
-    CGSCCAnalysisResults = std::move(RHS.CGSCCAnalysisResults);
-    return *this;
-  }
-
-  /// \brief Returns true if the analysis manager has an empty results cache.
-  bool empty() const;
-
-  /// \brief Clear the function analysis result cache.
-  ///
-  /// This routine allows cleaning up when the set of functions itself has
-  /// potentially changed, and thus we can't even look up a a result and
-  /// invalidate it directly. Notably, this does *not* call invalidate
-  /// functions as there is nothing to be done for them.
-  void clear();
-
-private:
-  CGSCCAnalysisManager(const CGSCCAnalysisManager &) LLVM_DELETED_FUNCTION;
-  CGSCCAnalysisManager &
-  operator=(const CGSCCAnalysisManager &) LLVM_DELETED_FUNCTION;
-
-  /// \brief Get a function pass result, running the pass if necessary.
-  ResultConceptT &getResultImpl(void *PassID, LazyCallGraph::SCC *C);
-
-  /// \brief Get a cached function pass result or return null.
-  ResultConceptT *getCachedResultImpl(void *PassID,
-                                      LazyCallGraph::SCC *C) const;
-
-  /// \brief Invalidate a function pass result.
-  void invalidateImpl(void *PassID, LazyCallGraph::SCC *C);
-
-  /// \brief Invalidate the results for a function..
-  void invalidateImpl(LazyCallGraph::SCC *C, const PreservedAnalyses &PA);
+/// \brief The CGSCC pass manager.
+///
+/// See the documentation for the PassManager template for details. It runs
+/// a sequency of SCC passes over each SCC that the manager is run over. This
+/// typedef serves as a convenient way to refer to this construct.
+typedef PassManager<LazyCallGraph::SCC> CGSCCPassManager;
 
-  /// \brief List of function analysis pass IDs and associated concept pointers.
-  ///
-  /// Requires iterators to be valid across appending new entries and arbitrary
-  /// erases. Provides both the pass ID and concept pointer such that it is
-  /// half of a bijection and provides storage for the actual result concept.
-  typedef std::list<
-      std::pair<void *, std::unique_ptr<detail::AnalysisResultConcept<
-                            LazyCallGraph::SCC *>>>> CGSCCAnalysisResultListT;
-
-  /// \brief Map type from function pointer to our custom list type.
-  typedef DenseMap<LazyCallGraph::SCC *, CGSCCAnalysisResultListT>
-  CGSCCAnalysisResultListMapT;
-
-  /// \brief Map from function to a list of function analysis results.
-  ///
-  /// Provides linear time removal of all analysis results for a function and
-  /// the ultimate storage for a particular cached analysis result.
-  CGSCCAnalysisResultListMapT CGSCCAnalysisResultLists;
-
-  /// \brief Map type from a pair of analysis ID and function pointer to an
-  /// iterator into a particular result list.
-  typedef DenseMap<std::pair<void *, LazyCallGraph::SCC *>,
-                   CGSCCAnalysisResultListT::iterator> CGSCCAnalysisResultMapT;
-
-  /// \brief Map from an analysis ID and function to a particular cached
-  /// analysis result.
-  CGSCCAnalysisResultMapT CGSCCAnalysisResults;
-};
+/// \brief The CGSCC analysis manager.
+///
+/// See the documentation for the AnalysisManager template for detail
+/// documentation. This typedef serves as a convenient way to refer to this
+/// construct in the adaptors and proxies used to integrate this into the larger
+/// pass manager infrastructure.
+typedef AnalysisManager<LazyCallGraph::SCC> CGSCCAnalysisManager;
 
 /// \brief A module analysis which acts as a proxy for a CGSCC analysis
 /// manager.
@@ -187,7 +77,7 @@ public:
     /// Regardless of whether this analysis is marked as preserved, all of the
     /// analyses in the \c CGSCCAnalysisManager are potentially invalidated
     /// based on the set of preserved analyses.
-    bool invalidate(Module *M, const PreservedAnalyses &PA);
+    bool invalidate(Module &M, const PreservedAnalyses &PA);
 
   private:
     CGSCCAnalysisManager *CGAM;
@@ -195,12 +85,13 @@ public:
 
   static void *ID() { return (void *)&PassID; }
 
+  static StringRef name() { return "CGSCCAnalysisManagerModuleProxy"; }
+
   explicit CGSCCAnalysisManagerModuleProxy(CGSCCAnalysisManager &CGAM)
       : CGAM(&CGAM) {}
   // We have to explicitly define all the special member functions because MSVC
   // refuses to generate them.
-  CGSCCAnalysisManagerModuleProxy(
-      const CGSCCAnalysisManagerModuleProxy &Arg)
+  CGSCCAnalysisManagerModuleProxy(const CGSCCAnalysisManagerModuleProxy &Arg)
       : CGAM(Arg.CGAM) {}
   CGSCCAnalysisManagerModuleProxy(CGSCCAnalysisManagerModuleProxy &&Arg)
       : CGAM(std::move(Arg.CGAM)) {}
@@ -219,7 +110,7 @@ public:
   /// In debug builds, it will also assert that the analysis manager is empty
   /// as no queries should arrive at the CGSCC analysis manager prior to
   /// this analysis being requested.
-  Result run(Module *M);
+  Result run(Module &M);
 
 private:
   static char PassID;
@@ -257,7 +148,7 @@ public:
     const ModuleAnalysisManager &getManager() const { return *MAM; }
 
     /// \brief Handle invalidation by ignoring it, this pass is immutable.
-    bool invalidate(LazyCallGraph::SCC *) { return false; }
+    bool invalidate(LazyCallGraph::SCC &) { return false; }
 
   private:
     const ModuleAnalysisManager *MAM;
@@ -265,12 +156,13 @@ public:
 
   static void *ID() { return (void *)&PassID; }
 
+  static StringRef name() { return "ModuleAnalysisManagerCGSCCProxy"; }
+
   ModuleAnalysisManagerCGSCCProxy(const ModuleAnalysisManager &MAM)
       : MAM(&MAM) {}
   // We have to explicitly define all the special member functions because MSVC
   // refuses to generate them.
-  ModuleAnalysisManagerCGSCCProxy(
-      const ModuleAnalysisManagerCGSCCProxy &Arg)
+  ModuleAnalysisManagerCGSCCProxy(const ModuleAnalysisManagerCGSCCProxy &Arg)
       : MAM(Arg.MAM) {}
   ModuleAnalysisManagerCGSCCProxy(ModuleAnalysisManagerCGSCCProxy &&Arg)
       : MAM(std::move(Arg.MAM)) {}
@@ -283,7 +175,7 @@ public:
   /// \brief Run the analysis pass and create our proxy result object.
   /// Nothing to see here, it just forwards the \c MAM reference into the
   /// result.
-  Result run(LazyCallGraph::SCC *) { return Result(*MAM); }
+  Result run(LazyCallGraph::SCC &) { return Result(*MAM); }
 
 private:
   static char PassID;
@@ -323,7 +215,7 @@ public:
   }
 
   /// \brief Runs the CGSCC pass across every SCC in the module.
-  PreservedAnalyses run(Module *M, ModuleAnalysisManager *AM) {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager *AM) {
     assert(AM && "We need analyses to compute the call graph!");
 
     // Setup the CGSCC analysis manager from its proxy.
@@ -335,15 +227,17 @@ public:
 
     PreservedAnalyses PA = PreservedAnalyses::all();
     for (LazyCallGraph::SCC &C : CG.postorder_sccs()) {
-      PreservedAnalyses PassPA = Pass.run(&C, &CGAM);
+      PreservedAnalyses PassPA = Pass.run(C, &CGAM);
 
       // We know that the CGSCC pass couldn't have invalidated any other
       // SCC's analyses (that's the contract of a CGSCC pass), so
-      // directly handle the CGSCC analysis manager's invalidation here.
+      // directly handle the CGSCC analysis manager's invalidation here. We
+      // also update the preserved set of analyses to reflect that invalidated
+      // analyses are now safe to preserve.
       // FIXME: This isn't quite correct. We need to handle the case where the
       // pass updated the CG, particularly some child of the current SCC, and
       // invalidate its analyses.
-      CGAM.invalidate(&C, PassPA);
+      PassPA = CGAM.invalidate(C, std::move(PassPA));
 
       // Then intersect the preserved set so that invalidation of module
       // analyses will eventually occur when the module pass completes.
@@ -409,7 +303,7 @@ public:
     /// Regardless of whether this analysis is marked as preserved, all of the
     /// analyses in the \c FunctionAnalysisManager are potentially invalidated
     /// based on the set of preserved analyses.
-    bool invalidate(LazyCallGraph::SCC *C, const PreservedAnalyses &PA);
+    bool invalidate(LazyCallGraph::SCC &C, const PreservedAnalyses &PA);
 
   private:
     FunctionAnalysisManager *FAM;
@@ -417,6 +311,8 @@ public:
 
   static void *ID() { return (void *)&PassID; }
 
+  static StringRef name() { return "FunctionAnalysisManagerCGSCCProxy"; }
+
   explicit FunctionAnalysisManagerCGSCCProxy(FunctionAnalysisManager &FAM)
       : FAM(&FAM) {}
   // We have to explicitly define all the special member functions because MSVC
@@ -441,7 +337,7 @@ public:
   /// In debug builds, it will also assert that the analysis manager is empty
   /// as no queries should arrive at the function analysis manager prior to
   /// this analysis being requested.
-  Result run(LazyCallGraph::SCC *C);
+  Result run(LazyCallGraph::SCC &C);
 
 private:
   static char PassID;
@@ -479,7 +375,7 @@ public:
     const CGSCCAnalysisManager &getManager() const { return *CGAM; }
 
     /// \brief Handle invalidation by ignoring it, this pass is immutable.
-    bool invalidate(Function *) { return false; }
+    bool invalidate(Function &) { return false; }
 
   private:
     const CGSCCAnalysisManager *CGAM;
@@ -487,6 +383,8 @@ public:
 
   static void *ID() { return (void *)&PassID; }
 
+  static StringRef name() { return "CGSCCAnalysisManagerFunctionProxy"; }
+
   CGSCCAnalysisManagerFunctionProxy(const CGSCCAnalysisManager &CGAM)
       : CGAM(&CGAM) {}
   // We have to explicitly define all the special member functions because MSVC
@@ -505,7 +403,7 @@ public:
   /// \brief Run the analysis pass and create our proxy result object.
   /// Nothing to see here, it just forwards the \c CGAM reference into the
   /// result.
-  Result run(Function *) { return Result(*CGAM); }
+  Result run(Function &) { return Result(*CGAM); }
 
 private:
   static char PassID;
@@ -531,7 +429,8 @@ public:
       : Pass(Arg.Pass) {}
   CGSCCToFunctionPassAdaptor(CGSCCToFunctionPassAdaptor &&Arg)
       : Pass(std::move(Arg.Pass)) {}
-  friend void swap(CGSCCToFunctionPassAdaptor &LHS, CGSCCToFunctionPassAdaptor &RHS) {
+  friend void swap(CGSCCToFunctionPassAdaptor &LHS,
+                   CGSCCToFunctionPassAdaptor &RHS) {
     using std::swap;
     swap(LHS.Pass, RHS.Pass);
   }
@@ -541,21 +440,23 @@ public:
   }
 
   /// \brief Runs the function pass across every function in the module.
-  PreservedAnalyses run(LazyCallGraph::SCC *C, CGSCCAnalysisManager *AM) {
+  PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager *AM) {
     FunctionAnalysisManager *FAM = nullptr;
     if (AM)
       // Setup the function analysis manager from its proxy.
       FAM = &AM->getResult<FunctionAnalysisManagerCGSCCProxy>(C).getManager();
 
     PreservedAnalyses PA = PreservedAnalyses::all();
-    for (LazyCallGraph::Node *N : *C) {
-      PreservedAnalyses PassPA = Pass.run(&N->getFunction(), FAM);
+    for (LazyCallGraph::Node *N : C) {
+      PreservedAnalyses PassPA = Pass.run(N->getFunction(), FAM);
 
       // We know that the function pass couldn't have invalidated any other
       // function's analyses (that's the contract of a function pass), so
       // directly handle the function analysis manager's invalidation here.
+      // Also, update the preserved analyses to reflect that once invalidated
+      // these can again be preserved.
       if (FAM)
-        FAM->invalidate(&N->getFunction(), PassPA);
+        PassPA = FAM->invalidate(N->getFunction(), std::move(PassPA));
 
       // Then intersect the preserved set so that invalidation of module
       // analyses will eventually occur when the module pass completes.
@@ -585,7 +486,6 @@ CGSCCToFunctionPassAdaptor<FunctionPassT>
 createCGSCCToFunctionPassAdaptor(FunctionPassT Pass) {
   return std::move(CGSCCToFunctionPassAdaptor<FunctionPassT>(std::move(Pass)));
 }
-
 }
 
 #endif
diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h
index 76d9073..64d288a 100644
--- a/include/llvm/Analysis/CallGraph.h
+++ b/include/llvm/Analysis/CallGraph.h
@@ -273,8 +273,8 @@ private:
   /// CalledFunctions array of this or other CallGraphNodes.
   unsigned NumReferences;
 
-  CallGraphNode(const CallGraphNode &) LLVM_DELETED_FUNCTION;
-  void operator=(const CallGraphNode &) LLVM_DELETED_FUNCTION;
+  CallGraphNode(const CallGraphNode &) = delete;
+  void operator=(const CallGraphNode &) = delete;
 
   void DropRef() { --NumReferences; }
   void AddRef() { ++NumReferences; }
diff --git a/include/llvm/Analysis/CodeMetrics.h b/include/llvm/Analysis/CodeMetrics.h
index 59502df..2f59691 100644
--- a/include/llvm/Analysis/CodeMetrics.h
+++ b/include/llvm/Analysis/CodeMetrics.h
@@ -20,7 +20,7 @@
 #include "llvm/IR/CallSite.h"
 
 namespace llvm {
-class AssumptionTracker;
+class AssumptionCache;
 class BasicBlock;
 class Loop;
 class Function;
@@ -93,13 +93,13 @@ struct CodeMetrics {
 
   /// \brief Collect a loop's ephemeral values (those used only by an assume
   /// or similar intrinsics in the loop).
-  static void collectEphemeralValues(const Loop *L, AssumptionTracker *AT,
-                                     SmallPtrSetImpl<const Value*> &EphValues);
+  static void collectEphemeralValues(const Loop *L, AssumptionCache *AC,
+                                     SmallPtrSetImpl<const Value *> &EphValues);
 
   /// \brief Collect a functions's ephemeral values (those used only by an
   /// assume or similar intrinsics in the function).
-  static void collectEphemeralValues(const Function *L, AssumptionTracker *AT,
-                                     SmallPtrSetImpl<const Value*> &EphValues);
+  static void collectEphemeralValues(const Function *L, AssumptionCache *AC,
+                                     SmallPtrSetImpl<const Value *> &EphValues);
 };
 
 }
diff --git a/include/llvm/Analysis/DependenceAnalysis.h b/include/llvm/Analysis/DependenceAnalysis.h
index 1041e3f..e01aa54 100644
--- a/include/llvm/Analysis/DependenceAnalysis.h
+++ b/include/llvm/Analysis/DependenceAnalysis.h
@@ -278,8 +278,8 @@ namespace llvm {
   /// DependenceAnalysis - This class is the main dependence-analysis driver.
   ///
   class DependenceAnalysis : public FunctionPass {
-    void operator=(const DependenceAnalysis &) LLVM_DELETED_FUNCTION;
-    DependenceAnalysis(const DependenceAnalysis &) LLVM_DELETED_FUNCTION;
+    void operator=(const DependenceAnalysis &) = delete;
+    DependenceAnalysis(const DependenceAnalysis &) = delete;
   public:
     /// depends - Tests for a dependence between the Src and Dst instructions.
     /// Returns NULL if no dependence; otherwise, returns a Dependence (or a
diff --git a/include/llvm/Analysis/FindUsedTypes.h b/include/llvm/Analysis/FindUsedTypes.h
deleted file mode 100644
index 574c947..0000000
--- a/include/llvm/Analysis/FindUsedTypes.h
+++ /dev/null
@@ -1,66 +0,0 @@
-//===- llvm/Analysis/FindUsedTypes.h - Find all Types in use ----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass is used to seek out all of the types in use by the program.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_FINDUSEDTYPES_H
-#define LLVM_ANALYSIS_FINDUSEDTYPES_H
-
-#include "llvm/ADT/SetVector.h"
-#include "llvm/Pass.h"
-
-namespace llvm {
-
-class Type;
-class Value;
-
-class FindUsedTypes : public ModulePass {
-  SetVector<Type *> UsedTypes;
-public:
-  static char ID; // Pass identification, replacement for typeid
-  FindUsedTypes() : ModulePass(ID) {
-    initializeFindUsedTypesPass(*PassRegistry::getPassRegistry());
-  }
-
-  /// getTypes - After the pass has been run, return the set containing all of
-  /// the types used in the module.
-  ///
-  const SetVector<Type *> &getTypes() const { return UsedTypes; }
-
-  /// Print the types found in the module.  If the optional Module parameter is
-  /// passed in, then the types are printed symbolically if possible, using the
-  /// symbol table from the module.
-  ///
-  void print(raw_ostream &o, const Module *M) const override;
-
-private:
-  /// IncorporateType - Incorporate one type and all of its subtypes into the
-  /// collection of used types.
-  ///
-  void IncorporateType(Type *Ty);
-
-  /// IncorporateValue - Incorporate all of the types used by this value.
-  ///
-  void IncorporateValue(const Value *V);
-
-public:
-  /// run - This incorporates all types used by the specified module
-  bool runOnModule(Module &M) override;
-
-  /// getAnalysisUsage - We do not modify anything.
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-  }
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/include/llvm/Analysis/FunctionTargetTransformInfo.h b/include/llvm/Analysis/FunctionTargetTransformInfo.h
deleted file mode 100644
index c1654cc..0000000
--- a/include/llvm/Analysis/FunctionTargetTransformInfo.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//===- llvm/Analysis/FunctionTargetTransformInfo.h --------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass wraps a TargetTransformInfo in a FunctionPass so that it can
-// forward along the current Function so that we can make target specific
-// decisions based on the particular subtarget specified for each Function.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_FUNCTIONTARGETTRANSFORMINFO_H
-#define LLVM_ANALYSIS_FUNCTIONTARGETTRANSFORMINFO_H
-
-#include "llvm/Pass.h"
-#include "TargetTransformInfo.h"
-
-namespace llvm {
-class FunctionTargetTransformInfo final : public FunctionPass {
-private:
-  const Function *Fn;
-  const TargetTransformInfo *TTI;
-
-  FunctionTargetTransformInfo(const FunctionTargetTransformInfo &)
-      LLVM_DELETED_FUNCTION;
-  void operator=(const FunctionTargetTransformInfo &) LLVM_DELETED_FUNCTION;
-
-public:
-  static char ID;
-  FunctionTargetTransformInfo();
-
-  // Implementation boilerplate.
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-  void releaseMemory() override;
-  bool runOnFunction(Function &F) override;
-
-  // Shimmed functions from TargetTransformInfo.
-  void
-  getUnrollingPreferences(Loop *L,
-                          TargetTransformInfo::UnrollingPreferences &UP) const {
-    TTI->getUnrollingPreferences(Fn, L, UP);
-  }
-};
-}
-#endif
diff --git a/include/llvm/Analysis/InlineCost.h b/include/llvm/Analysis/InlineCost.h
index 81795ba..fdee9f8 100644
--- a/include/llvm/Analysis/InlineCost.h
+++ b/include/llvm/Analysis/InlineCost.h
@@ -19,11 +19,11 @@
 #include <climits>
 
 namespace llvm {
-class AssumptionTracker;
+class AssumptionCacheTracker;
 class CallSite;
 class DataLayout;
 class Function;
-class TargetTransformInfo;
+class TargetTransformInfoWrapperPass;
 
 namespace InlineConstants {
   // Various magic constants used to adjust heuristics.
@@ -77,7 +77,7 @@ public:
   }
 
   /// \brief Test whether the inline cost is low enough for inlining.
-  LLVM_EXPLICIT operator bool() const {
+  explicit operator bool() const {
     return Cost < Threshold;
   }
 
@@ -100,8 +100,8 @@ public:
 
 /// \brief Cost analyzer used by inliner.
 class InlineCostAnalysis : public CallGraphSCCPass {
-  const TargetTransformInfo *TTI;
-  AssumptionTracker *AT;
+  TargetTransformInfoWrapperPass *TTIWP;
+  AssumptionCacheTracker *ACT;
 
 public:
   static char ID;
diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h
index 51f6e85..1ebf981 100644
--- a/include/llvm/Analysis/InstructionSimplify.h
+++ b/include/llvm/Analysis/InstructionSimplify.h
@@ -37,7 +37,7 @@
 namespace llvm {
   template<typename T>
   class ArrayRef;
-  class AssumptionTracker;
+  class AssumptionCache;
   class DominatorTree;
   class Instruction;
   class DataLayout;
@@ -52,7 +52,7 @@ namespace llvm {
                          const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
                          const DominatorTree *DT = nullptr,
-                         AssumptionTracker *AT = nullptr,
+                         AssumptionCache *AC = nullptr,
                          const Instruction *CxtI = nullptr);
 
   /// SimplifySubInst - Given operands for a Sub, see if we can
@@ -61,35 +61,34 @@ namespace llvm {
                          const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
                          const DominatorTree *DT = nullptr,
-                         AssumptionTracker *AT = nullptr,
+                         AssumptionCache *AC = nullptr,
                          const Instruction *CxtI = nullptr);
 
   /// Given operands for an FAdd, see if we can fold the result.  If not, this
   /// returns null.
   Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF,
-                         const DataLayout *TD = nullptr,
-                         const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr,
-                         AssumptionTracker *AT = nullptr,
-                         const Instruction *CxtI = nullptr);
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr,
+                          AssumptionCache *AC = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// Given operands for an FSub, see if we can fold the result.  If not, this
   /// returns null.
   Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF,
-                         const DataLayout *TD = nullptr,
-                         const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr,
-                         AssumptionTracker *AT = nullptr,
-                         const Instruction *CxtI = nullptr);
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr,
+                          AssumptionCache *AC = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// Given operands for an FMul, see if we can fold the result.  If not, this
   /// returns null.
-  Value *SimplifyFMulInst(Value *LHS, Value *RHS,
-                          FastMathFlags FMF,
+  Value *SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
                           const DominatorTree *DT = nullptr,
-                          AssumptionTracker *AT = nullptr,
+                          AssumptionCache *AC = nullptr,
                           const Instruction *CxtI = nullptr);
 
   /// SimplifyMulInst - Given operands for a Mul, see if we can
@@ -97,7 +96,7 @@ namespace llvm {
   Value *SimplifyMulInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
                          const DominatorTree *DT = nullptr,
-                         AssumptionTracker *AT = nullptr,
+                         AssumptionCache *AC = nullptr,
                          const Instruction *CxtI = nullptr);
 
   /// SimplifySDivInst - Given operands for an SDiv, see if we can
@@ -106,7 +105,7 @@ namespace llvm {
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
                           const DominatorTree *DT = nullptr,
-                          AssumptionTracker *AT = nullptr,
+                          AssumptionCache *AC = nullptr,
                           const Instruction *CxtI = nullptr);
 
   /// SimplifyUDivInst - Given operands for a UDiv, see if we can
@@ -115,16 +114,16 @@ namespace llvm {
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
                           const DominatorTree *DT = nullptr,
-                          AssumptionTracker *AT = nullptr,
+                          AssumptionCache *AC = nullptr,
                           const Instruction *CxtI = nullptr);
 
   /// SimplifyFDivInst - Given operands for an FDiv, see if we can
   /// fold the result.  If not, this returns null.
-  Value *SimplifyFDivInst(Value *LHS, Value *RHS,
+  Value *SimplifyFDivInst(Value *LHS, Value *RHS, FastMathFlags FMF,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
                           const DominatorTree *DT = nullptr,
-                          AssumptionTracker *AT = nullptr,
+                          AssumptionCache *AC = nullptr,
                           const Instruction *CxtI = nullptr);
 
   /// SimplifySRemInst - Given operands for an SRem, see if we can
@@ -133,7 +132,7 @@ namespace llvm {
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
                           const DominatorTree *DT = nullptr,
-                          AssumptionTracker *AT = nullptr,
+                          AssumptionCache *AC = nullptr,
                           const Instruction *CxtI = nullptr);
 
   /// SimplifyURemInst - Given operands for a URem, see if we can
@@ -142,16 +141,16 @@ namespace llvm {
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
                           const DominatorTree *DT = nullptr,
-                          AssumptionTracker *AT = nullptr,
+                          AssumptionCache *AC = nullptr,
                           const Instruction *CxtI = nullptr);
 
   /// SimplifyFRemInst - Given operands for an FRem, see if we can
   /// fold the result.  If not, this returns null.
-  Value *SimplifyFRemInst(Value *LHS, Value *RHS,
+  Value *SimplifyFRemInst(Value *LHS, Value *RHS, FastMathFlags FMF,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
                           const DominatorTree *DT = nullptr,
-                          AssumptionTracker *AT = nullptr,
+                          AssumptionCache *AC = nullptr,
                           const Instruction *CxtI = nullptr);
 
   /// SimplifyShlInst - Given operands for a Shl, see if we can
@@ -160,7 +159,7 @@ namespace llvm {
                          const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
                          const DominatorTree *DT = nullptr,
-                         AssumptionTracker *AT = nullptr,
+                         AssumptionCache *AC = nullptr,
                          const Instruction *CxtI = nullptr);
 
   /// SimplifyLShrInst - Given operands for a LShr, see if we can
@@ -169,7 +168,7 @@ namespace llvm {
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
                           const DominatorTree *DT = nullptr,
-                          AssumptionTracker *AT = nullptr,
+                          AssumptionCache *AC = nullptr,
                           const Instruction *CxtI = nullptr);
 
   /// SimplifyAShrInst - Given operands for a AShr, see if we can
@@ -178,7 +177,7 @@ namespace llvm {
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
                           const DominatorTree *DT = nullptr,
-                          AssumptionTracker *AT = nullptr,
+                          AssumptionCache *AC = nullptr,
                           const Instruction *CxtI = nullptr);
 
   /// SimplifyAndInst - Given operands for an And, see if we can
@@ -186,7 +185,7 @@ namespace llvm {
   Value *SimplifyAndInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
                          const DominatorTree *DT = nullptr,
-                         AssumptionTracker *AT = nullptr,
+                         AssumptionCache *AC = nullptr,
                          const Instruction *CxtI = nullptr);
 
   /// SimplifyOrInst - Given operands for an Or, see if we can
@@ -194,7 +193,7 @@ namespace llvm {
   Value *SimplifyOrInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr,
                         const TargetLibraryInfo *TLI = nullptr,
                         const DominatorTree *DT = nullptr,
-                        AssumptionTracker *AT = nullptr,
+                        AssumptionCache *AC = nullptr,
                         const Instruction *CxtI = nullptr);
 
   /// SimplifyXorInst - Given operands for a Xor, see if we can
@@ -202,7 +201,7 @@ namespace llvm {
   Value *SimplifyXorInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
                          const DominatorTree *DT = nullptr,
-                         AssumptionTracker *AT = nullptr,
+                         AssumptionCache *AC = nullptr,
                          const Instruction *CxtI = nullptr);
 
   /// SimplifyICmpInst - Given operands for an ICmpInst, see if we can
@@ -211,7 +210,7 @@ namespace llvm {
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
                           const DominatorTree *DT = nullptr,
-                          AssumptionTracker *AT = nullptr,
+                          AssumptionCache *AC = nullptr,
                           Instruction *CxtI = nullptr);
 
   /// SimplifyFCmpInst - Given operands for an FCmpInst, see if we can
@@ -220,7 +219,7 @@ namespace llvm {
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
                           const DominatorTree *DT = nullptr,
-                          AssumptionTracker *AT = nullptr,
+                          AssumptionCache *AC = nullptr,
                           const Instruction *CxtI = nullptr);
 
   /// SimplifySelectInst - Given operands for a SelectInst, see if we can fold
@@ -229,7 +228,7 @@ namespace llvm {
                             const DataLayout *TD = nullptr,
                             const TargetLibraryInfo *TLI = nullptr,
                             const DominatorTree *DT = nullptr,
-                            AssumptionTracker *AT = nullptr,
+                            AssumptionCache *AC = nullptr,
                             const Instruction *CxtI = nullptr);
 
   /// SimplifyGEPInst - Given operands for an GetElementPtrInst, see if we can
@@ -237,7 +236,7 @@ namespace llvm {
   Value *SimplifyGEPInst(ArrayRef<Value *> Ops, const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
                          const DominatorTree *DT = nullptr,
-                         AssumptionTracker *AT = nullptr,
+                         AssumptionCache *AC = nullptr,
                          const Instruction *CxtI = nullptr);
 
   /// SimplifyInsertValueInst - Given operands for an InsertValueInst, see if we
@@ -247,7 +246,7 @@ namespace llvm {
                                  const DataLayout *TD = nullptr,
                                  const TargetLibraryInfo *TLI = nullptr,
                                  const DominatorTree *DT = nullptr,
-                                 AssumptionTracker *AT = nullptr,
+                                 AssumptionCache *AC = nullptr,
                                  const Instruction *CxtI = nullptr);
 
   /// SimplifyTruncInst - Given operands for an TruncInst, see if we can fold
@@ -255,7 +254,7 @@ namespace llvm {
   Value *SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout *TD = nullptr,
                            const TargetLibraryInfo *TLI = nullptr,
                            const DominatorTree *DT = nullptr,
-                           AssumptionTracker *AT = nullptr,
+                           AssumptionCache *AC = nullptr,
                            const Instruction *CxtI = nullptr);
 
   //=== Helper functions for higher up the class hierarchy.
@@ -267,7 +266,7 @@ namespace llvm {
                          const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
                          const DominatorTree *DT = nullptr,
-                         AssumptionTracker *AT = nullptr,
+                         AssumptionCache *AC = nullptr,
                          const Instruction *CxtI = nullptr);
 
   /// SimplifyBinOp - Given operands for a BinaryOperator, see if we can
@@ -276,8 +275,19 @@ namespace llvm {
                        const DataLayout *TD = nullptr,
                        const TargetLibraryInfo *TLI = nullptr,
                        const DominatorTree *DT = nullptr,
-                       AssumptionTracker *AT = nullptr,
+                       AssumptionCache *AC = nullptr,
                        const Instruction *CxtI = nullptr);
+  /// SimplifyFPBinOp - Given operands for a BinaryOperator, see if we can
+  /// fold the result.  If not, this returns null.
+  /// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the
+  /// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp.
+  Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS,
+                         const FastMathFlags &FMF,
+                         const DataLayout *TD = nullptr,
+                         const TargetLibraryInfo *TLI = nullptr,
+                         const DominatorTree *DT = nullptr,
+                         AssumptionCache *AC = nullptr,
+                         const Instruction *CxtI = nullptr);
 
   /// \brief Given a function and iterators over arguments, see if we can fold
   /// the result.
@@ -287,7 +297,7 @@ namespace llvm {
                       User::op_iterator ArgEnd, const DataLayout *TD = nullptr,
                       const TargetLibraryInfo *TLI = nullptr,
                       const DominatorTree *DT = nullptr,
-                      AssumptionTracker *AT = nullptr,
+                      AssumptionCache *AC = nullptr,
                       const Instruction *CxtI = nullptr);
 
   /// \brief Given a function and set of arguments, see if we can fold the
@@ -298,7 +308,7 @@ namespace llvm {
                       const DataLayout *TD = nullptr,
                       const TargetLibraryInfo *TLI = nullptr,
                       const DominatorTree *DT = nullptr,
-                      AssumptionTracker *AT = nullptr,
+                      AssumptionCache *AC = nullptr,
                       const Instruction *CxtI = nullptr);
 
   /// SimplifyInstruction - See if we can compute a simplified version of this
@@ -306,8 +316,7 @@ namespace llvm {
   Value *SimplifyInstruction(Instruction *I, const DataLayout *TD = nullptr,
                              const TargetLibraryInfo *TLI = nullptr,
                              const DominatorTree *DT = nullptr,
-                             AssumptionTracker *AT = nullptr);
-
+                             AssumptionCache *AC = nullptr);
 
   /// \brief Replace all uses of 'I' with 'SimpleV' and simplify the uses
   /// recursively.
@@ -321,7 +330,7 @@ namespace llvm {
                                      const DataLayout *TD = nullptr,
                                      const TargetLibraryInfo *TLI = nullptr,
                                      const DominatorTree *DT = nullptr,
-                                     AssumptionTracker *AT = nullptr);
+                                     AssumptionCache *AC = nullptr);
 
   /// \brief Recursively attempt to simplify an instruction.
   ///
@@ -333,7 +342,7 @@ namespace llvm {
                                       const DataLayout *TD = nullptr,
                                       const TargetLibraryInfo *TLI = nullptr,
                                       const DominatorTree *DT = nullptr,
-                                      AssumptionTracker *AT = nullptr);
+                                      AssumptionCache *AC = nullptr);
 } // end namespace llvm
 
 #endif
diff --git a/include/llvm/Analysis/JumpInstrTableInfo.h b/include/llvm/Analysis/JumpInstrTableInfo.h
index 5b0176c..591e794 100644
--- a/include/llvm/Analysis/JumpInstrTableInfo.h
+++ b/include/llvm/Analysis/JumpInstrTableInfo.h
@@ -16,7 +16,6 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Pass.h"
-
 #include <vector>
 
 namespace llvm {
diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h
index 9a59844..b0b9068 100644
--- a/include/llvm/Analysis/LazyCallGraph.h
+++ b/include/llvm/Analysis/LazyCallGraph.h
@@ -46,11 +46,11 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/Support/Allocator.h"
 #include <iterator>
 
 namespace llvm {
-class ModuleAnalysisManager;
 class PreservedAnalyses;
 class raw_ostream;
 
@@ -252,6 +252,12 @@ public:
     /// \brief Test if this SCC is a descendant of \a C.
     bool isDescendantOf(const SCC &C) const;
 
+    /// \brief Short name useful for debugging or logging.
+    ///
+    /// We use the name of the first function in the SCC to name the SCC for
+    /// the purposes of debugging and logging.
+    StringRef getName() const { return (*begin())->getFunction().getName(); }
+
     ///@{
     /// \name Mutation API
     ///
@@ -537,11 +543,13 @@ public:
 
   static void *ID() { return (void *)&PassID; }
 
+  static StringRef name() { return "Lazy CallGraph Analysis"; }
+
   /// \brief Compute the \c LazyCallGraph for the module \c M.
   ///
   /// This just builds the set of entry points to the call graph. The rest is
   /// built lazily as it is walked.
-  LazyCallGraph run(Module *M) { return LazyCallGraph(*M); }
+  LazyCallGraph run(Module &M) { return LazyCallGraph(M); }
 
 private:
   static char PassID;
@@ -556,7 +564,7 @@ class LazyCallGraphPrinterPass {
 public:
   explicit LazyCallGraphPrinterPass(raw_ostream &OS);
 
-  PreservedAnalyses run(Module *M, ModuleAnalysisManager *AM);
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager *AM);
 
   static StringRef name() { return "LazyCallGraphPrinterPass"; }
 };
diff --git a/include/llvm/Analysis/LazyValueInfo.h b/include/llvm/Analysis/LazyValueInfo.h
index 52cc0d1..51f6b0c 100644
--- a/include/llvm/Analysis/LazyValueInfo.h
+++ b/include/llvm/Analysis/LazyValueInfo.h
@@ -18,7 +18,7 @@
 #include "llvm/Pass.h"
 
 namespace llvm {
-  class AssumptionTracker;
+  class AssumptionCache;
   class Constant;
   class DataLayout;
   class DominatorTree;
@@ -26,16 +26,15 @@ namespace llvm {
   class TargetLibraryInfo;
   class Value;
   
-/// LazyValueInfo - This pass computes, caches, and vends lazy value constraint
-/// information.
+/// This pass computes, caches, and vends lazy value constraint information.
 class LazyValueInfo : public FunctionPass {
-  AssumptionTracker *AT;
+  AssumptionCache *AC;
   const DataLayout *DL;
   class TargetLibraryInfo *TLI;
   DominatorTree *DT;
   void *PImpl;
-  LazyValueInfo(const LazyValueInfo&) LLVM_DELETED_FUNCTION;
-  void operator=(const LazyValueInfo&) LLVM_DELETED_FUNCTION;
+  LazyValueInfo(const LazyValueInfo&) = delete;
+  void operator=(const LazyValueInfo&) = delete;
 public:
   static char ID;
   LazyValueInfo() : FunctionPass(ID), PImpl(nullptr) {
@@ -43,7 +42,7 @@ public:
   }
   ~LazyValueInfo() { assert(!PImpl && "releaseMemory not called"); }
 
-  /// Tristate - This is used to return true/false/dunno results.
+  /// This is used to return true/false/dunno results.
   enum Tristate {
     Unknown = -1, False = 0, True = 1
   };
@@ -51,33 +50,33 @@ public:
   
   // Public query interface.
   
-  /// getPredicateOnEdge - Determine whether the specified value comparison
-  /// with a constant is known to be true or false on the specified CFG edge.
+  /// Determine whether the specified value comparison with a constant is known
+  /// to be true or false on the specified CFG edge.
   /// Pred is a CmpInst predicate.
   Tristate getPredicateOnEdge(unsigned Pred, Value *V, Constant *C,
                               BasicBlock *FromBB, BasicBlock *ToBB,
                               Instruction *CxtI = nullptr);
   
-  /// getPredicateAt - Determine whether the specified value comparison
-  /// with a constant is known to be true or false at the specified instruction
+  /// Determine whether the specified value comparison with a constant is known
+  /// to be true or false at the specified instruction
   /// (from an assume intrinsic). Pred is a CmpInst predicate.
   Tristate getPredicateAt(unsigned Pred, Value *V, Constant *C,
                           Instruction *CxtI);
  
-  /// getConstant - Determine whether the specified value is known to be a
+  /// Determine whether the specified value is known to be a
   /// constant at the end of the specified block.  Return null if not.
   Constant *getConstant(Value *V, BasicBlock *BB, Instruction *CxtI = nullptr);
 
-  /// getConstantOnEdge - Determine whether the specified value is known to be a
+  /// Determine whether the specified value is known to be a
   /// constant on the specified edge.  Return null if not.
   Constant *getConstantOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB,
                               Instruction *CxtI = nullptr);
   
-  /// threadEdge - Inform the analysis cache that we have threaded an edge from
+  /// Inform the analysis cache that we have threaded an edge from
   /// PredBB to OldSucc to be from PredBB to NewSucc instead.
   void threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, BasicBlock *NewSucc);
   
-  /// eraseBlock - Inform the analysis cache that we have erased a block.
+  /// Inform the analysis cache that we have erased a block.
   void eraseBlock(BasicBlock *BB);
   
   // Implementation boilerplate.
diff --git a/include/llvm/Analysis/LibCallSemantics.h b/include/llvm/Analysis/LibCallSemantics.h
index 8bd747f..e6427a4 100644
--- a/include/llvm/Analysis/LibCallSemantics.h
+++ b/include/llvm/Analysis/LibCallSemantics.h
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 
 namespace llvm {
+class InvokeInst;
 
   /// LibCallLocationInfo - This struct describes a set of memory locations that
   /// are accessed by libcalls.  Identification of a location is doing with a
@@ -162,6 +163,28 @@ namespace llvm {
     virtual const LibCallFunctionInfo *getFunctionInfoArray() const = 0;
   };
 
+  enum class EHPersonality {
+    Unknown,
+    GNU_Ada,
+    GNU_C,
+    GNU_CXX,
+    GNU_ObjC,
+    MSVC_X86SEH,
+    MSVC_Win64SEH,
+    MSVC_CXX,
+  };
+
+  /// \brief See if the given exception handling personality function is one
+  /// that we understand.  If so, return a description of it; otherwise return
+  /// Unknown.
+  EHPersonality classifyEHPersonality(const Value *Pers);
+
+  /// \brief Returns true if this personality function catches asynchronous
+  /// exceptions.
+  bool isAsynchronousEHPersonality(EHPersonality Pers);
+
+  bool canSimplifyInvokeNoUnwind(const InvokeInst *II);
+
 } // end namespace llvm
 
 #endif
diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
new file mode 100644
index 0000000..323af98
--- /dev/null
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -0,0 +1,290 @@
+//===- llvm/Analysis/LoopAccessAnalysis.h -----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface for the loop memory dependence framework that
+// was originally developed for the Loop Vectorizer.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_LOOPACCESSANALYSIS_H
+#define LLVM_ANALYSIS_LOOPACCESSANALYSIS_H
+
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class Value;
+class DataLayout;
+class AliasAnalysis;
+class ScalarEvolution;
+class Loop;
+class SCEV;
+
+/// Optimization analysis message produced during vectorization. Messages inform
+/// the user why vectorization did not occur.
+class LoopAccessReport {
+  std::string Message;
+  const Instruction *Instr;
+
+protected:
+  LoopAccessReport(const Twine &Message, const Instruction *I)
+      : Message(Message.str()), Instr(I) {}
+
+public:
+  LoopAccessReport(const Instruction *I = nullptr) : Instr(I) {}
+
+  template <typename A> LoopAccessReport &operator<<(const A &Value) {
+    raw_string_ostream Out(Message);
+    Out << Value;
+    return *this;
+  }
+
+  const Instruction *getInstr() const { return Instr; }
+
+  std::string &str() { return Message; }
+  const std::string &str() const { return Message; }
+  operator Twine() { return Message; }
+
+  /// \brief Emit an analysis note for \p PassName with the debug location from
+  /// the instruction in \p Message if available.  Otherwise use the location of
+  /// \p TheLoop.
+  static void emitAnalysis(const LoopAccessReport &Message,
+                           const Function *TheFunction,
+                           const Loop *TheLoop,
+                           const char *PassName);
+};
+
+/// \brief Collection of parameters shared beetween the Loop Vectorizer and the
+/// Loop Access Analysis.
+struct VectorizerParams {
+  /// \brief Maximum SIMD width.
+  static const unsigned MaxVectorWidth;
+
+  /// \brief VF as overridden by the user.
+  static unsigned VectorizationFactor;
+  /// \brief Interleave factor as overridden by the user.
+  static unsigned VectorizationInterleave;
+  /// \brief True if force-vector-interleave was specified by the user.
+  static bool isInterleaveForced();
+
+  /// \\brief When performing memory disambiguation checks at runtime do not
+  /// make more than this number of comparisons.
+  static unsigned RuntimeMemoryCheckThreshold;
+};
+
+/// \brief Drive the analysis of memory accesses in the loop
+///
+/// This class is responsible for analyzing the memory accesses of a loop.  It
+/// collects the accesses and then its main helper the AccessAnalysis class
+/// finds and categorizes the dependences in buildDependenceSets.
+///
+/// For memory dependences that can be analyzed at compile time, it determines
+/// whether the dependence is part of cycle inhibiting vectorization.  This work
+/// is delegated to the MemoryDepChecker class.
+///
+/// For memory dependences that cannot be determined at compile time, it
+/// generates run-time checks to prove independence.  This is done by
+/// AccessAnalysis::canCheckPtrAtRT and the checks are maintained by the
+/// RuntimePointerCheck class.
+class LoopAccessInfo {
+public:
+  /// This struct holds information about the memory runtime legality check that
+  /// a group of pointers do not overlap.
+  struct RuntimePointerCheck {
+    RuntimePointerCheck() : Need(false) {}
+
+    /// Reset the state of the pointer runtime information.
+    void reset() {
+      Need = false;
+      Pointers.clear();
+      Starts.clear();
+      Ends.clear();
+      IsWritePtr.clear();
+      DependencySetId.clear();
+      AliasSetId.clear();
+    }
+
+    /// Insert a pointer and calculate the start and end SCEVs.
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
+                unsigned DepSetId, unsigned ASId,
+                const ValueToValueMap &Strides);
+
+    /// \brief No run-time memory checking is necessary.
+    bool empty() const { return Pointers.empty(); }
+
+    /// \brief Decide whether we need to issue a run-time check for pointer at
+    /// index \p I and \p J to prove their independence.
+    bool needsChecking(unsigned I, unsigned J) const;
+
+    /// \brief Print the list run-time memory checks necessary.
+    void print(raw_ostream &OS, unsigned Depth = 0) const;
+
+    /// This flag indicates if we need to add the runtime check.
+    bool Need;
+    /// Holds the pointers that we need to check.
+    SmallVector<TrackingVH<Value>, 2> Pointers;
+    /// Holds the pointer value at the beginning of the loop.
+    SmallVector<const SCEV*, 2> Starts;
+    /// Holds the pointer value at the end of the loop.
+    SmallVector<const SCEV*, 2> Ends;
+    /// Holds the information if this pointer is used for writing to memory.
+    SmallVector<bool, 2> IsWritePtr;
+    /// Holds the id of the set of pointers that could be dependent because of a
+    /// shared underlying object.
+    SmallVector<unsigned, 2> DependencySetId;
+    /// Holds the id of the disjoint alias set to which this pointer belongs.
+    SmallVector<unsigned, 2> AliasSetId;
+  };
+
+  LoopAccessInfo(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
+                 const TargetLibraryInfo *TLI, AliasAnalysis *AA,
+                 DominatorTree *DT, const ValueToValueMap &Strides);
+
+  /// Return true we can analyze the memory accesses in the loop and there are
+  /// no memory dependence cycles.
+  bool canVectorizeMemory() const { return CanVecMem; }
+
+  const RuntimePointerCheck *getRuntimePointerCheck() const {
+    return &PtrRtCheck;
+  }
+
+  /// Return true if the block BB needs to be predicated in order for the loop
+  /// to be vectorized.
+  static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
+                                    DominatorTree *DT);
+
+  /// Returns true if the value V is uniform within the loop.
+  bool isUniform(Value *V) const;
+
+  unsigned getMaxSafeDepDistBytes() const { return MaxSafeDepDistBytes; }
+  unsigned getNumStores() const { return NumStores; }
+  unsigned getNumLoads() const { return NumLoads;}
+
+  /// \brief Add code that checks at runtime if the accessed arrays overlap.
+  ///
+  /// Returns a pair of instructions where the first element is the first
+  /// instruction generated in possibly a sequence of instructions and the
+  /// second value is the final comparator value or NULL if no check is needed.
+  std::pair<Instruction *, Instruction *>
+    addRuntimeCheck(Instruction *Loc) const;
+
+  /// \brief The diagnostics report generated for the analysis.  E.g. why we
+  /// couldn't analyze the loop.
+  const Optional<LoopAccessReport> &getReport() const { return Report; }
+
+  /// \brief Print the information about the memory accesses in the loop.
+  void print(raw_ostream &OS, unsigned Depth = 0) const;
+
+  /// \brief Used to ensure that if the analysis was run with speculating the
+  /// value of symbolic strides, the client queries it with the same assumption.
+  /// Only used in DEBUG build but we don't want NDEBUG-dependent ABI.
+  unsigned NumSymbolicStrides;
+
+private:
+  /// \brief Analyze the loop.  Substitute symbolic strides using Strides.
+  void analyzeLoop(const ValueToValueMap &Strides);
+
+  /// \brief Check if the structure of the loop allows it to be analyzed by this
+  /// pass.
+  bool canAnalyzeLoop();
+
+  void emitAnalysis(LoopAccessReport &Message);
+
+  /// We need to check that all of the pointers in this list are disjoint
+  /// at runtime.
+  RuntimePointerCheck PtrRtCheck;
+  Loop *TheLoop;
+  ScalarEvolution *SE;
+  const DataLayout *DL;
+  const TargetLibraryInfo *TLI;
+  AliasAnalysis *AA;
+  DominatorTree *DT;
+
+  unsigned NumLoads;
+  unsigned NumStores;
+
+  unsigned MaxSafeDepDistBytes;
+
+  /// \brief Cache the result of analyzeLoop.
+  bool CanVecMem;
+
+  /// \brief The diagnostics report generated for the analysis.  E.g. why we
+  /// couldn't analyze the loop.
+  Optional<LoopAccessReport> Report;
+};
+
+Value *stripIntegerCast(Value *V);
+
+///\brief Return the SCEV corresponding to a pointer with the symbolic stride
+///replaced with constant one.
+///
+/// If \p OrigPtr is not null, use it to look up the stride value instead of \p
+/// Ptr.  \p PtrToStride provides the mapping between the pointer value and its
+/// stride as collected by LoopVectorizationLegality::collectStridedAccess.
+const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
+                                      const ValueToValueMap &PtrToStride,
+                                      Value *Ptr, Value *OrigPtr = nullptr);
+
+/// \brief This analysis provides dependence information for the memory accesses
+/// of a loop.
+///
+/// It runs the analysis for a loop on demand.  This can be initiated by
+/// querying the loop access info via LAA::getInfo.  getInfo return a
+/// LoopAccessInfo object.  See this class for the specifics of what information
+/// is provided.
+class LoopAccessAnalysis : public FunctionPass {
+public:
+  static char ID;
+
+  LoopAccessAnalysis() : FunctionPass(ID) {
+    initializeLoopAccessAnalysisPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  /// \brief Query the result of the loop access information for the loop \p L.
+  ///
+  /// If the client speculates (and then issues run-time checks) for the values
+  /// of symbolic strides, \p Strides provides the mapping (see
+  /// replaceSymbolicStrideSCEV).  If there is no cached result available run
+  /// the analysis.
+  const LoopAccessInfo &getInfo(Loop *L, const ValueToValueMap &Strides);
+
+  void releaseMemory() override {
+    // Invalidate the cache when the pass is freed.
+    LoopAccessInfoMap.clear();
+  }
+
+  /// \brief Print the result of the analysis when invoked with -analyze.
+  void print(raw_ostream &OS, const Module *M = nullptr) const override;
+
+private:
+  /// \brief The cache.
+  DenseMap<Loop *, std::unique_ptr<LoopAccessInfo>> LoopAccessInfoMap;
+
+  // The used analysis passes.
+  ScalarEvolution *SE;
+  const DataLayout *DL;
+  const TargetLibraryInfo *TLI;
+  AliasAnalysis *AA;
+  DominatorTree *DT;
+};
+} // End llvm namespace
+
+#endif
diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h
index bef03e9..85c2da7 100644
--- a/include/llvm/Analysis/LoopInfo.h
+++ b/include/llvm/Analysis/LoopInfo.h
@@ -42,6 +42,11 @@
 
 namespace llvm {
 
+// FIXME: Replace this brittle forward declaration with the include of the new
+// PassManager.h when doing so doesn't break the PassManagerBuilder.
+template <typename IRUnitT> class AnalysisManager;
+class PreservedAnalyses;
+
 template<typename T>
 inline void RemoveFromVector(std::vector<T*> &V, T *N) {
   typename std::vector<T*>::iterator I = std::find(V.begin(), V.end(), N);
@@ -74,9 +79,9 @@ class LoopBase {
 
   SmallPtrSet<const BlockT*, 8> DenseBlockSet;
 
-  LoopBase(const LoopBase<BlockT, LoopT> &) LLVM_DELETED_FUNCTION;
+  LoopBase(const LoopBase<BlockT, LoopT> &) = delete;
   const LoopBase<BlockT, LoopT>&
-    operator=(const LoopBase<BlockT, LoopT> &) LLVM_DELETED_FUNCTION;
+    operator=(const LoopBase<BlockT, LoopT> &) = delete;
 public:
   /// Loop ctor - This creates an empty loop.
   LoopBase() : ParentLoop(nullptr) {}
@@ -496,18 +501,33 @@ class LoopInfoBase {
   friend class LoopBase<BlockT, LoopT>;
   friend class LoopInfo;
 
-  void operator=(const LoopInfoBase &) LLVM_DELETED_FUNCTION;
-  LoopInfoBase(const LoopInfo &) LLVM_DELETED_FUNCTION;
+  void operator=(const LoopInfoBase &) = delete;
+  LoopInfoBase(const LoopInfoBase &) = delete;
 public:
   LoopInfoBase() { }
   ~LoopInfoBase() { releaseMemory(); }
 
+  LoopInfoBase(LoopInfoBase &&Arg)
+      : BBMap(std::move(Arg.BBMap)),
+        TopLevelLoops(std::move(Arg.TopLevelLoops)) {
+    // We have to clear the arguments top level loops as we've taken ownership.
+    Arg.TopLevelLoops.clear();
+  }
+  LoopInfoBase &operator=(LoopInfoBase &&RHS) {
+    BBMap = std::move(RHS.BBMap);
+
+    for (auto *L : TopLevelLoops)
+      delete L;
+    TopLevelLoops = std::move(RHS.TopLevelLoops);
+    RHS.TopLevelLoops.clear();
+    return *this;
+  }
+
   void releaseMemory() {
-    for (typename std::vector<LoopT *>::iterator I =
-         TopLevelLoops.begin(), E = TopLevelLoops.end(); I != E; ++I)
-      delete *I;   // Delete all of the loops...
+    BBMap.clear();
 
-    BBMap.clear();                           // Reset internal state of analysis
+    for (auto *L : TopLevelLoops)
+      delete L;
     TopLevelLoops.clear();
   }
 
@@ -576,8 +596,7 @@ public:
   /// list with the indicated loop.
   void changeTopLevelLoop(LoopT *OldLoop,
                           LoopT *NewLoop) {
-    typename std::vector<LoopT *>::iterator I =
-                 std::find(TopLevelLoops.begin(), TopLevelLoops.end(), OldLoop);
+    auto I = std::find(TopLevelLoops.begin(), TopLevelLoops.end(), OldLoop);
     assert(I != TopLevelLoops.end() && "Old loop not at top level!");
     *I = NewLoop;
     assert(!NewLoop->ParentLoop && !OldLoop->ParentLoop &&
@@ -595,7 +614,7 @@ public:
   /// including all of the Loop objects it is nested in and our mapping from
   /// BasicBlocks to loops.
   void removeBlock(BlockT *BB) {
-    typename DenseMap<BlockT *, LoopT *>::iterator I = BBMap.find(BB);
+    auto I = BBMap.find(BB);
     if (I != BBMap.end()) {
       for (LoopT *L = I->second; L; L = L->getParentLoop())
         L->removeBlockFromLoop(BB);
@@ -617,8 +636,9 @@ public:
   void Analyze(DominatorTreeBase<BlockT> &DomTree);
 
   // Debugging
-
   void print(raw_ostream &OS) const;
+
+  void verify() const;
 };
 
 // Implementation in LoopInfoImpl.h
@@ -626,99 +646,23 @@ public:
 __extension__ extern template class LoopInfoBase<BasicBlock, Loop>;
 #endif
 
-class LoopInfo : public FunctionPass {
-  LoopInfoBase<BasicBlock, Loop> LI;
+class LoopInfo : public LoopInfoBase<BasicBlock, Loop> {
+  typedef LoopInfoBase<BasicBlock, Loop> BaseT;
+
   friend class LoopBase<BasicBlock, Loop>;
 
-  void operator=(const LoopInfo &) LLVM_DELETED_FUNCTION;
-  LoopInfo(const LoopInfo &) LLVM_DELETED_FUNCTION;
+  void operator=(const LoopInfo &) = delete;
+  LoopInfo(const LoopInfo &) = delete;
 public:
-  static char ID; // Pass identification, replacement for typeid
-
-  LoopInfo() : FunctionPass(ID) {
-    initializeLoopInfoPass(*PassRegistry::getPassRegistry());
-  }
-
-  LoopInfoBase<BasicBlock, Loop>& getBase() { return LI; }
+  LoopInfo() {}
 
-  /// iterator/begin/end - The interface to the top-level loops in the current
-  /// function.
-  ///
-  typedef LoopInfoBase<BasicBlock, Loop>::iterator iterator;
-  typedef LoopInfoBase<BasicBlock, Loop>::reverse_iterator reverse_iterator;
-  inline iterator begin() const { return LI.begin(); }
-  inline iterator end() const { return LI.end(); }
-  inline reverse_iterator rbegin() const { return LI.rbegin(); }
-  inline reverse_iterator rend() const { return LI.rend(); }
-  bool empty() const { return LI.empty(); }
-
-  /// getLoopFor - Return the inner most loop that BB lives in.  If a basic
-  /// block is in no loop (for example the entry node), null is returned.
-  ///
-  inline Loop *getLoopFor(const BasicBlock *BB) const {
-    return LI.getLoopFor(BB);
+  LoopInfo(LoopInfo &&Arg) : BaseT(std::move(static_cast<BaseT &>(Arg))) {}
+  LoopInfo &operator=(LoopInfo &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    return *this;
   }
 
-  /// operator[] - same as getLoopFor...
-  ///
-  inline const Loop *operator[](const BasicBlock *BB) const {
-    return LI.getLoopFor(BB);
-  }
-
-  /// getLoopDepth - Return the loop nesting level of the specified block.  A
-  /// depth of 0 means the block is not inside any loop.
-  ///
-  inline unsigned getLoopDepth(const BasicBlock *BB) const {
-    return LI.getLoopDepth(BB);
-  }
-
-  // isLoopHeader - True if the block is a loop header node
-  inline bool isLoopHeader(BasicBlock *BB) const {
-    return LI.isLoopHeader(BB);
-  }
-
-  /// runOnFunction - Calculate the natural loop information.
-  ///
-  bool runOnFunction(Function &F) override;
-
-  void verifyAnalysis() const override;
-
-  void releaseMemory() override { LI.releaseMemory(); }
-
-  void print(raw_ostream &O, const Module* M = nullptr) const override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-  /// removeLoop - This removes the specified top-level loop from this loop info
-  /// object.  The loop is not deleted, as it will presumably be inserted into
-  /// another loop.
-  inline Loop *removeLoop(iterator I) { return LI.removeLoop(I); }
-
-  /// changeLoopFor - Change the top-level loop that contains BB to the
-  /// specified loop.  This should be used by transformations that restructure
-  /// the loop hierarchy tree.
-  inline void changeLoopFor(BasicBlock *BB, Loop *L) {
-    LI.changeLoopFor(BB, L);
-  }
-
-  /// changeTopLevelLoop - Replace the specified loop in the top-level loops
-  /// list with the indicated loop.
-  inline void changeTopLevelLoop(Loop *OldLoop, Loop *NewLoop) {
-    LI.changeTopLevelLoop(OldLoop, NewLoop);
-  }
-
-  /// addTopLevelLoop - This adds the specified loop to the collection of
-  /// top-level loops.
-  inline void addTopLevelLoop(Loop *New) {
-    LI.addTopLevelLoop(New);
-  }
-
-  /// removeBlock - This method completely removes BB from all data structures,
-  /// including all of the Loop objects it is nested in and our mapping from
-  /// BasicBlocks to loops.
-  void removeBlock(BasicBlock *BB) {
-    LI.removeBlock(BB);
-  }
+  // Most of the public interface is provided via LoopInfoBase.
 
   /// updateUnloop - Update LoopInfo after removing the last backedge from a
   /// loop--now the "unloop". This updates the loop forest and parent loops for
@@ -748,7 +692,6 @@ public:
   }
 };
 
-
 // Allow clients to walk the list of nested loops...
 template <> struct GraphTraits<const Loop*> {
   typedef const Loop NodeType;
@@ -776,6 +719,65 @@ template <> struct GraphTraits<Loop*> {
   }
 };
 
+/// \brief Analysis pass that exposes the \c LoopInfo for a function.
+class LoopAnalysis {
+  static char PassID;
+
+public:
+  typedef LoopInfo Result;
+
+  /// \brief Opaque, unique identifier for this analysis pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// \brief Provide a name for the analysis for debugging and logging.
+  static StringRef name() { return "LoopAnalysis"; }
+
+  LoopAnalysis() {}
+  LoopAnalysis(const LoopAnalysis &Arg) {}
+  LoopAnalysis(LoopAnalysis &&Arg) {}
+  LoopAnalysis &operator=(const LoopAnalysis &RHS) { return *this; }
+  LoopAnalysis &operator=(LoopAnalysis &&RHS) { return *this; }
+
+  LoopInfo run(Function &F, AnalysisManager<Function> *AM);
+};
+
+/// \brief Printer pass for the \c LoopAnalysis results.
+class LoopPrinterPass {
+  raw_ostream &OS;
+
+public:
+  explicit LoopPrinterPass(raw_ostream &OS) : OS(OS) {}
+  PreservedAnalyses run(Function &F, AnalysisManager<Function> *AM);
+
+  static StringRef name() { return "LoopPrinterPass"; }
+};
+
+/// \brief The legacy pass manager's analysis pass to compute loop information.
+class LoopInfoWrapperPass : public FunctionPass {
+  LoopInfo LI;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  LoopInfoWrapperPass() : FunctionPass(ID) {
+    initializeLoopInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  LoopInfo &getLoopInfo() { return LI; }
+  const LoopInfo &getLoopInfo() const { return LI; }
+
+  /// \brief Calculate the natural loop information for a given function.
+  bool runOnFunction(Function &F) override;
+
+  void verifyAnalysis() const override;
+
+  void releaseMemory() override { LI.releaseMemory(); }
+
+  void print(raw_ostream &O, const Module *M = nullptr) const override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h
index 948be0f..3321f39 100644
--- a/include/llvm/Analysis/LoopInfoImpl.h
+++ b/include/llvm/Analysis/LoopInfoImpl.h
@@ -545,6 +545,25 @@ void LoopInfoBase<BlockT, LoopT>::print(raw_ostream &OS) const {
 #endif
 }
 
+template<class BlockT, class LoopT>
+void LoopInfoBase<BlockT, LoopT>::verify() const {
+  DenseSet<const LoopT*> Loops;
+  for (iterator I = begin(), E = end(); I != E; ++I) {
+    assert(!(*I)->getParentLoop() && "Top-level loop has a parent!");
+    (*I)->verifyLoopNest(&Loops);
+  }
+
+  // Verify that blocks are mapped to valid loops.
+#ifndef NDEBUG
+  for (auto &Entry : BBMap) {
+    BlockT *BB = Entry.first;
+    LoopT *L = Entry.second;
+    assert(Loops.count(L) && "orphaned loop");
+    assert(L->contains(BB) && "orphaned block");
+  }
+#endif
+}
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h
index 4d315d1..77610b3 100644
--- a/include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h
@@ -28,7 +28,7 @@ namespace llvm {
   class Instruction;
   class CallSite;
   class AliasAnalysis;
-  class AssumptionTracker;
+  class AssumptionCache;
   class DataLayout;
   class MemoryDependenceAnalysis;
   class PredIteratorCache;
@@ -326,7 +326,7 @@ namespace llvm {
     AliasAnalysis *AA;
     const DataLayout *DL;
     DominatorTree *DT;
-    AssumptionTracker *AT;
+    AssumptionCache *AC;
     std::unique_ptr<PredIteratorCache> PredCache;
 
   public:
@@ -366,12 +366,16 @@ namespace llvm {
 
 
     /// getNonLocalPointerDependency - Perform a full dependency query for an
-    /// access to the specified (non-volatile) memory location, returning the
-    /// set of instructions that either define or clobber the value.
+    /// access to the QueryInst's specified memory location, returning the set
+    /// of instructions that either define or clobber the value.
     ///
-    /// This method assumes the pointer has a "NonLocal" dependency within BB.
-    void getNonLocalPointerDependency(const AliasAnalysis::Location &Loc,
-                                      bool isLoad, BasicBlock *BB,
+    /// Warning: For a volatile query instruction, the dependencies will be
+    /// accurate, and thus usable for reordering, but it is never legal to
+    /// remove the query instruction.  
+    ///
+    /// This method assumes the pointer has a "NonLocal" dependency within
+    /// QueryInst's parent basic block.
+    void getNonLocalPointerDependency(Instruction *QueryInst,
                                     SmallVectorImpl<NonLocalDepResult> &Result);
 
     /// removeInstruction - Remove an instruction from the dependence analysis,
@@ -424,13 +428,15 @@ namespace llvm {
     MemDepResult getCallSiteDependencyFrom(CallSite C, bool isReadOnlyCall,
                                            BasicBlock::iterator ScanIt,
                                            BasicBlock *BB);
-    bool getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
+    bool getNonLocalPointerDepFromBB(Instruction *QueryInst,
+                                     const PHITransAddr &Pointer,
                                      const AliasAnalysis::Location &Loc,
                                      bool isLoad, BasicBlock *BB,
                                      SmallVectorImpl<NonLocalDepResult> &Result,
                                      DenseMap<BasicBlock*, Value*> &Visited,
                                      bool SkipFirstBlock = false);
-    MemDepResult GetNonLocalInfoForBlock(const AliasAnalysis::Location &Loc,
+    MemDepResult GetNonLocalInfoForBlock(Instruction *QueryInst,
+                                         const AliasAnalysis::Location &Loc,
                                          bool isLoad, BasicBlock *BB,
                                          NonLocalDepInfo *Cache,
                                          unsigned NumSortedEntries);
diff --git a/include/llvm/Analysis/PHITransAddr.h b/include/llvm/Analysis/PHITransAddr.h
index 0790e97..38730d8 100644
--- a/include/llvm/Analysis/PHITransAddr.h
+++ b/include/llvm/Analysis/PHITransAddr.h
@@ -18,7 +18,7 @@
 #include "llvm/IR/Instruction.h"
 
 namespace llvm {
-  class AssumptionTracker;
+  class AssumptionCache;
   class DominatorTree;
   class DataLayout;
   class TargetLibraryInfo;
@@ -44,13 +44,13 @@ class PHITransAddr {
   const TargetLibraryInfo *TLI;
 
   /// A cache of @llvm.assume calls used by SimplifyInstruction.
-  AssumptionTracker *AT;
-  
+  AssumptionCache *AC;
+
   /// InstInputs - The inputs for our symbolic address.
   SmallVector<Instruction*, 4> InstInputs;
 public:
-  PHITransAddr(Value *addr, const DataLayout *DL, AssumptionTracker *AT)
-      : Addr(addr), DL(DL), TLI(nullptr), AT(AT) {
+  PHITransAddr(Value *addr, const DataLayout *DL, AssumptionCache *AC)
+      : Addr(addr), DL(DL), TLI(nullptr), AC(AC) {
     // If the address is an instruction, the whole thing is considered an input.
     if (Instruction *I = dyn_cast<Instruction>(Addr))
       InstInputs.push_back(I);
diff --git a/include/llvm/Analysis/Passes.h b/include/llvm/Analysis/Passes.h
index 10a5605..530faa7 100644
--- a/include/llvm/Analysis/Passes.h
+++ b/include/llvm/Analysis/Passes.h
@@ -162,6 +162,14 @@ namespace llvm {
   // createJumpInstrTableInfoPass - This creates a pass that stores information
   // about the jump tables created by JumpInstrTables
   ImmutablePass *createJumpInstrTableInfoPass();
+
+  //===--------------------------------------------------------------------===//
+  //
+  // createMemDerefPrinter - This pass collects memory dereferenceability
+  // information and prints it with -analyze.
+  //
+  FunctionPass *createMemDerefPrinter();
+
 }
 
 #endif
diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h
index 6ff7f97..1c7f4d3 100644
--- a/include/llvm/Analysis/RegionInfo.h
+++ b/include/llvm/Analysis/RegionInfo.h
@@ -115,8 +115,8 @@ public:
   typedef typename Tr::RegionT RegionT;
 
 private:
-  RegionNodeBase(const RegionNodeBase &) LLVM_DELETED_FUNCTION;
-  const RegionNodeBase &operator=(const RegionNodeBase &) LLVM_DELETED_FUNCTION;
+  RegionNodeBase(const RegionNodeBase &) = delete;
+  const RegionNodeBase &operator=(const RegionNodeBase &) = delete;
 
   /// This is the entry basic block that starts this region node.  If this is a
   /// BasicBlock RegionNode, then entry is just the basic block, that this
@@ -261,8 +261,8 @@ class RegionBase : public RegionNodeBase<Tr> {
   typedef typename InvBlockTraits::ChildIteratorType PredIterTy;
 
   friend class RegionInfoBase<Tr>;
-  RegionBase(const RegionBase &) LLVM_DELETED_FUNCTION;
-  const RegionBase &operator=(const RegionBase &) LLVM_DELETED_FUNCTION;
+  RegionBase(const RegionBase &) = delete;
+  const RegionBase &operator=(const RegionBase &) = delete;
 
   // Information necessary to manage this Region.
   RegionInfoT *RI;
@@ -674,8 +674,8 @@ class RegionInfoBase {
   RegionInfoBase();
   virtual ~RegionInfoBase();
 
-  RegionInfoBase(const RegionInfoBase &) LLVM_DELETED_FUNCTION;
-  const RegionInfoBase &operator=(const RegionInfoBase &) LLVM_DELETED_FUNCTION;
+  RegionInfoBase(const RegionInfoBase &) = delete;
+  const RegionInfoBase &operator=(const RegionInfoBase &) = delete;
 
   DomTreeT *DT;
   PostDomTreeT *PDT;
diff --git a/include/llvm/Analysis/RegionInfoImpl.h b/include/llvm/Analysis/RegionInfoImpl.h
index b5d0bb3..b0dc263 100644
--- a/include/llvm/Analysis/RegionInfoImpl.h
+++ b/include/llvm/Analysis/RegionInfoImpl.h
@@ -12,11 +12,11 @@
 #ifndef LLVM_ANALYSIS_REGIONINFOIMPL_H
 #define LLVM_ANALYSIS_REGIONINFOIMPL_H
 
-#include "llvm/Analysis/RegionInfo.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/Analysis/DominanceFrontier.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 893402e..c60cea9 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -35,7 +35,7 @@
 
 namespace llvm {
   class APInt;
-  class AssumptionTracker;
+  class AssumptionCache;
   class Constant;
   class ConstantInt;
   class DominatorTree;
@@ -71,8 +71,8 @@ namespace llvm {
     unsigned short SubclassData;
 
   private:
-    SCEV(const SCEV &) LLVM_DELETED_FUNCTION;
-    void operator=(const SCEV &) LLVM_DELETED_FUNCTION;
+    SCEV(const SCEV &) = delete;
+    void operator=(const SCEV &) = delete;
 
   public:
     /// NoWrapFlags are bitfield indices into SubclassData.
@@ -82,12 +82,13 @@ namespace llvm {
     /// operator. NSW is a misnomer that we use to mean no signed overflow or
     /// underflow.
     ///
-    /// AddRec expression may have a no-self-wraparound <NW> property if the
-    /// result can never reach the start value. This property is independent of
-    /// the actual start value and step direction. Self-wraparound is defined
-    /// purely in terms of the recurrence's loop, step size, and
-    /// bitwidth. Formally, a recurrence with no self-wraparound satisfies:
-    /// abs(step) * max-iteration(loop) <= unsigned-max(bitwidth).
+    /// AddRec expressions may have a no-self-wraparound <NW> property if, in
+    /// the integer domain, abs(step) * max-iteration(loop) <=
+    /// unsigned-max(bitwidth).  This means that the recurrence will never reach
+    /// its start value if the step is non-zero.  Computing the same value on
+    /// each iteration is not considered wrapping, and recurrences with step = 0
+    /// are trivially <NW>.  <NW> is independent of the sign of step and the
+    /// value the add recurrence starts with.
     ///
     /// Note that NUW and NSW are also valid properties of a recurrence, and
     /// either implies NW. For convenience, NW will be set for a recurrence
@@ -225,7 +226,7 @@ namespace llvm {
     Function *F;
 
     /// The tracker for @llvm.assume intrinsics in this function.
-    AssumptionTracker *AT;
+    AssumptionCache *AC;
 
     /// LI - The loop information for the function we are currently analyzing.
     ///
@@ -372,14 +373,17 @@ namespace llvm {
 
     /// LoopDispositions - Memoized computeLoopDisposition results.
     DenseMap<const SCEV *,
-             SmallVector<std::pair<const Loop *, LoopDisposition>, 2> > LoopDispositions;
+             SmallVector<PointerIntPair<const Loop *, 2, LoopDisposition>, 2>>
+        LoopDispositions;
 
     /// computeLoopDisposition - Compute a LoopDisposition value.
     LoopDisposition computeLoopDisposition(const SCEV *S, const Loop *L);
 
     /// BlockDispositions - Memoized computeBlockDisposition results.
-    DenseMap<const SCEV *,
-             SmallVector<std::pair<const BasicBlock *, BlockDisposition>, 2> > BlockDispositions;
+    DenseMap<
+        const SCEV *,
+        SmallVector<PointerIntPair<const BasicBlock *, 2, BlockDisposition>, 2>>
+        BlockDispositions;
 
     /// computeBlockDisposition - Compute a BlockDisposition value.
     BlockDisposition computeBlockDisposition(const SCEV *S, const BasicBlock *BB);
diff --git a/include/llvm/Analysis/ScalarEvolutionExpressions.h b/include/llvm/Analysis/ScalarEvolutionExpressions.h
index 94e665f..ff82db1 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpressions.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpressions.h
@@ -14,8 +14,8 @@
 #ifndef LLVM_ANALYSIS_SCALAREVOLUTIONEXPRESSIONS_H
 #define LLVM_ANALYSIS_SCALAREVOLUTIONEXPRESSIONS_H
 
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Support/ErrorHandling.h"
 
diff --git a/include/llvm/Analysis/SparsePropagation.h b/include/llvm/Analysis/SparsePropagation.h
index 65ff2f6..9ccae5f 100644
--- a/include/llvm/Analysis/SparsePropagation.h
+++ b/include/llvm/Analysis/SparsePropagation.h
@@ -131,8 +131,8 @@ class SparseSolver {
   typedef std::pair<BasicBlock*,BasicBlock*> Edge;
   std::set<Edge> KnownFeasibleEdges;
 
-  SparseSolver(const SparseSolver&) LLVM_DELETED_FUNCTION;
-  void operator=(const SparseSolver&) LLVM_DELETED_FUNCTION;
+  SparseSolver(const SparseSolver&) = delete;
+  void operator=(const SparseSolver&) = delete;
 public:
   explicit SparseSolver(AbstractLatticeFunction *Lattice)
     : LatticeFunc(Lattice) {}
diff --git a/include/llvm/Analysis/TargetLibraryInfo.h b/include/llvm/Analysis/TargetLibraryInfo.h
new file mode 100644
index 0000000..5c6f364
--- /dev/null
+++ b/include/llvm/Analysis/TargetLibraryInfo.h
@@ -0,0 +1,935 @@
+//===-- TargetLibraryInfo.h - Library information ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_TARGETLIBRARYINFO_H
+#define LLVM_ANALYSIS_TARGETLIBRARYINFO_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+// BEGIN ANDROID-SPECIFIC
+#ifdef WIN32
+#ifdef fseeko
+#undef fseeko
+#endif
+#ifdef ftello
+#undef ftello
+#endif
+#endif  // WIN32
+// END ANDROID-SPECIFIC
+
+namespace llvm {
+class PreservedAnalyses;
+
+  namespace LibFunc {
+    enum Func {
+      /// int _IO_getc(_IO_FILE * __fp);
+      under_IO_getc,
+      /// int _IO_putc(int __c, _IO_FILE * __fp);
+      under_IO_putc,
+      /// void operator delete[](void*);
+      ZdaPv,
+      /// void operator delete[](void*, nothrow);
+      ZdaPvRKSt9nothrow_t,
+      /// void operator delete[](void*, unsigned int);
+      ZdaPvj,
+      /// void operator delete[](void*, unsigned long);
+      ZdaPvm,
+      /// void operator delete(void*);
+      ZdlPv,
+      /// void operator delete(void*, nothrow);
+      ZdlPvRKSt9nothrow_t,
+      /// void operator delete(void*, unsigned int);
+      ZdlPvj,
+      /// void operator delete(void*, unsigned long);
+      ZdlPvm,
+      /// void *new[](unsigned int);
+      Znaj,
+      /// void *new[](unsigned int, nothrow);
+      ZnajRKSt9nothrow_t,
+      /// void *new[](unsigned long);
+      Znam,
+      /// void *new[](unsigned long, nothrow);
+      ZnamRKSt9nothrow_t,
+      /// void *new(unsigned int);
+      Znwj,
+      /// void *new(unsigned int, nothrow);
+      ZnwjRKSt9nothrow_t,
+      /// void *new(unsigned long);
+      Znwm,
+      /// void *new(unsigned long, nothrow);
+      ZnwmRKSt9nothrow_t,
+      /// double __cospi(double x);
+      cospi,
+      /// float __cospif(float x);
+      cospif,
+      /// int __cxa_atexit(void (*f)(void *), void *p, void *d);
+      cxa_atexit,
+      /// void __cxa_guard_abort(guard_t *guard);
+      /// guard_t is int64_t in Itanium ABI or int32_t on ARM eabi.
+      cxa_guard_abort,
+      /// int __cxa_guard_acquire(guard_t *guard);
+      cxa_guard_acquire,
+      /// void __cxa_guard_release(guard_t *guard);
+      cxa_guard_release,
+      /// int __isoc99_scanf (const char *format, ...)
+      dunder_isoc99_scanf,
+      /// int __isoc99_sscanf(const char *s, const char *format, ...)
+      dunder_isoc99_sscanf,
+      /// void *__memcpy_chk(void *s1, const void *s2, size_t n, size_t s1size);
+      memcpy_chk,
+      /// void *__memmove_chk(void *s1, const void *s2, size_t n,
+      ///                     size_t s1size);
+      memmove_chk,
+      /// void *__memset_chk(void *s, char v, size_t n, size_t s1size);
+      memset_chk,
+      /// double __sincospi_stret(double x);
+      sincospi_stret,
+      /// float __sincospif_stret(float x);
+      sincospif_stret,
+      /// double __sinpi(double x);
+      sinpi,
+      /// float __sinpif(float x);
+      sinpif,
+      /// double __sqrt_finite(double x);
+      sqrt_finite,
+      /// float __sqrt_finite(float x);
+      sqrtf_finite,
+      /// long double __sqrt_finite(long double x);
+      sqrtl_finite,
+      /// char *__stpcpy_chk(char *s1, const char *s2, size_t s1size);
+      stpcpy_chk,
+      /// char *__stpncpy_chk(char *s1, const char *s2, size_t n,
+      ///                     size_t s1size);
+      stpncpy_chk,
+      /// char *__strcpy_chk(char *s1, const char *s2, size_t s1size);
+      strcpy_chk,
+      /// char * __strdup(const char *s);
+      dunder_strdup,
+      /// char *__strncpy_chk(char *s1, const char *s2, size_t n,
+      ///                     size_t s1size);
+      strncpy_chk,
+      /// char *__strndup(const char *s, size_t n);
+      dunder_strndup,
+      /// char * __strtok_r(char *s, const char *delim, char **save_ptr);
+      dunder_strtok_r,
+      /// int abs(int j);
+      abs,
+      /// int access(const char *path, int amode);
+      access,
+      /// double acos(double x);
+      acos,
+      /// float acosf(float x);
+      acosf,
+      /// double acosh(double x);
+      acosh,
+      /// float acoshf(float x);
+      acoshf,
+      /// long double acoshl(long double x);
+      acoshl,
+      /// long double acosl(long double x);
+      acosl,
+      /// double asin(double x);
+      asin,
+      /// float asinf(float x);
+      asinf,
+      /// double asinh(double x);
+      asinh,
+      /// float asinhf(float x);
+      asinhf,
+      /// long double asinhl(long double x);
+      asinhl,
+      /// long double asinl(long double x);
+      asinl,
+      /// double atan(double x);
+      atan,
+      /// double atan2(double y, double x);
+      atan2,
+      /// float atan2f(float y, float x);
+      atan2f,
+      /// long double atan2l(long double y, long double x);
+      atan2l,
+      /// float atanf(float x);
+      atanf,
+      /// double atanh(double x);
+      atanh,
+      /// float atanhf(float x);
+      atanhf,
+      /// long double atanhl(long double x);
+      atanhl,
+      /// long double atanl(long double x);
+      atanl,
+      /// double atof(const char *str);
+      atof,
+      /// int atoi(const char *str);
+      atoi,
+      /// long atol(const char *str);
+      atol,
+      /// long long atoll(const char *nptr);
+      atoll,
+      /// int bcmp(const void *s1, const void *s2, size_t n);
+      bcmp,
+      /// void bcopy(const void *s1, void *s2, size_t n);
+      bcopy,
+      /// void bzero(void *s, size_t n);
+      bzero,
+      /// void *calloc(size_t count, size_t size);
+      calloc,
+      /// double cbrt(double x);
+      cbrt,
+      /// float cbrtf(float x);
+      cbrtf,
+      /// long double cbrtl(long double x);
+      cbrtl,
+      /// double ceil(double x);
+      ceil,
+      /// float ceilf(float x);
+      ceilf,
+      /// long double ceill(long double x);
+      ceill,
+      /// int chmod(const char *path, mode_t mode);
+      chmod,
+      /// int chown(const char *path, uid_t owner, gid_t group);
+      chown,
+      /// void clearerr(FILE *stream);
+      clearerr,
+      /// int closedir(DIR *dirp);
+      closedir,
+      /// double copysign(double x, double y);
+      copysign,
+      /// float copysignf(float x, float y);
+      copysignf,
+      /// long double copysignl(long double x, long double y);
+      copysignl,
+      /// double cos(double x);
+      cos,
+      /// float cosf(float x);
+      cosf,
+      /// double cosh(double x);
+      cosh,
+      /// float coshf(float x);
+      coshf,
+      /// long double coshl(long double x);
+      coshl,
+      /// long double cosl(long double x);
+      cosl,
+      /// char *ctermid(char *s);
+      ctermid,
+      /// double exp(double x);
+      exp,
+      /// double exp10(double x);
+      exp10,
+      /// float exp10f(float x);
+      exp10f,
+      /// long double exp10l(long double x);
+      exp10l,
+      /// double exp2(double x);
+      exp2,
+      /// float exp2f(float x);
+      exp2f,
+      /// long double exp2l(long double x);
+      exp2l,
+      /// float expf(float x);
+      expf,
+      /// long double expl(long double x);
+      expl,
+      /// double expm1(double x);
+      expm1,
+      /// float expm1f(float x);
+      expm1f,
+      /// long double expm1l(long double x);
+      expm1l,
+      /// double fabs(double x);
+      fabs,
+      /// float fabsf(float x);
+      fabsf,
+      /// long double fabsl(long double x);
+      fabsl,
+      /// int fclose(FILE *stream);
+      fclose,
+      /// FILE *fdopen(int fildes, const char *mode);
+      fdopen,
+      /// int feof(FILE *stream);
+      feof,
+      /// int ferror(FILE *stream);
+      ferror,
+      /// int fflush(FILE *stream);
+      fflush,
+      /// int ffs(int i);
+      ffs,
+      /// int ffsl(long int i);
+      ffsl,
+      /// int ffsll(long long int i);
+      ffsll,
+      /// int fgetc(FILE *stream);
+      fgetc,
+      /// int fgetpos(FILE *stream, fpos_t *pos);
+      fgetpos,
+      /// char *fgets(char *s, int n, FILE *stream);
+      fgets,
+      /// int fileno(FILE *stream);
+      fileno,
+      /// int fiprintf(FILE *stream, const char *format, ...);
+      fiprintf,
+      /// void flockfile(FILE *file);
+      flockfile,
+      /// double floor(double x);
+      floor,
+      /// float floorf(float x);
+      floorf,
+      /// long double floorl(long double x);
+      floorl,
+      /// double fmax(double x, double y);
+      fmax,
+      /// float fmaxf(float x, float y);
+      fmaxf,
+      /// long double fmaxl(long double x, long double y);
+      fmaxl,
+      /// double fmin(double x, double y);
+      fmin,
+      /// float fminf(float x, float y);
+      fminf,
+      /// long double fminl(long double x, long double y);
+      fminl,
+      /// double fmod(double x, double y);
+      fmod,
+      /// float fmodf(float x, float y);
+      fmodf,
+      /// long double fmodl(long double x, long double y);
+      fmodl,
+      /// FILE *fopen(const char *filename, const char *mode);
+      fopen,
+      /// FILE *fopen64(const char *filename, const char *opentype)
+      fopen64,
+      /// int fprintf(FILE *stream, const char *format, ...);
+      fprintf,
+      /// int fputc(int c, FILE *stream);
+      fputc,
+      /// int fputs(const char *s, FILE *stream);
+      fputs,
+      /// size_t fread(void *ptr, size_t size, size_t nitems, FILE *stream);
+      fread,
+      /// void free(void *ptr);
+      free,
+      /// double frexp(double num, int *exp);
+      frexp,
+      /// float frexpf(float num, int *exp);
+      frexpf,
+      /// long double frexpl(long double num, int *exp);
+      frexpl,
+      /// int fscanf(FILE *stream, const char *format, ... );
+      fscanf,
+      /// int fseek(FILE *stream, long offset, int whence);
+      fseek,
+      /// int fseeko(FILE *stream, off_t offset, int whence);
+      fseeko,
+      /// int fseeko64(FILE *stream, off64_t offset, int whence)
+      fseeko64,
+      /// int fsetpos(FILE *stream, const fpos_t *pos);
+      fsetpos,
+      /// int fstat(int fildes, struct stat *buf);
+      fstat,
+      /// int fstat64(int filedes, struct stat64 *buf)
+      fstat64,
+      /// int fstatvfs(int fildes, struct statvfs *buf);
+      fstatvfs,
+      /// int fstatvfs64(int fildes, struct statvfs64 *buf);
+      fstatvfs64,
+      /// long ftell(FILE *stream);
+      ftell,
+      /// off_t ftello(FILE *stream);
+      ftello,
+      /// off64_t ftello64(FILE *stream)
+      ftello64,
+      /// int ftrylockfile(FILE *file);
+      ftrylockfile,
+      /// void funlockfile(FILE *file);
+      funlockfile,
+      /// size_t fwrite(const void *ptr, size_t size, size_t nitems,
+      /// FILE *stream);
+      fwrite,
+      /// int getc(FILE *stream);
+      getc,
+      /// int getc_unlocked(FILE *stream);
+      getc_unlocked,
+      /// int getchar(void);
+      getchar,
+      /// char *getenv(const char *name);
+      getenv,
+      /// int getitimer(int which, struct itimerval *value);
+      getitimer,
+      /// int getlogin_r(char *name, size_t namesize);
+      getlogin_r,
+      /// struct passwd *getpwnam(const char *name);
+      getpwnam,
+      /// char *gets(char *s);
+      gets,
+      /// int gettimeofday(struct timeval *tp, void *tzp);
+      gettimeofday,
+      /// uint32_t htonl(uint32_t hostlong);
+      htonl,
+      /// uint16_t htons(uint16_t hostshort);
+      htons,
+      /// int iprintf(const char *format, ...);
+      iprintf,
+      /// int isascii(int c);
+      isascii,
+      /// int isdigit(int c);
+      isdigit,
+      /// long int labs(long int j);
+      labs,
+      /// int lchown(const char *path, uid_t owner, gid_t group);
+      lchown,
+      /// double ldexp(double x, int n);
+      ldexp,
+      /// float ldexpf(float x, int n);
+      ldexpf,
+      /// long double ldexpl(long double x, int n);
+      ldexpl,
+      /// long long int llabs(long long int j);
+      llabs,
+      /// double log(double x);
+      log,
+      /// double log10(double x);
+      log10,
+      /// float log10f(float x);
+      log10f,
+      /// long double log10l(long double x);
+      log10l,
+      /// double log1p(double x);
+      log1p,
+      /// float log1pf(float x);
+      log1pf,
+      /// long double log1pl(long double x);
+      log1pl,
+      /// double log2(double x);
+      log2,
+      /// float log2f(float x);
+      log2f,
+      /// double long double log2l(long double x);
+      log2l,
+      /// double logb(double x);
+      logb,
+      /// float logbf(float x);
+      logbf,
+      /// long double logbl(long double x);
+      logbl,
+      /// float logf(float x);
+      logf,
+      /// long double logl(long double x);
+      logl,
+      /// int lstat(const char *path, struct stat *buf);
+      lstat,
+      /// int lstat64(const char *path, struct stat64 *buf);
+      lstat64,
+      /// void *malloc(size_t size);
+      malloc,
+      /// void *memalign(size_t boundary, size_t size);
+      memalign,
+      /// void *memccpy(void *s1, const void *s2, int c, size_t n);
+      memccpy,
+      /// void *memchr(const void *s, int c, size_t n);
+      memchr,
+      /// int memcmp(const void *s1, const void *s2, size_t n);
+      memcmp,
+      /// void *memcpy(void *s1, const void *s2, size_t n);
+      memcpy,
+      /// void *memmove(void *s1, const void *s2, size_t n);
+      memmove,
+      // void *memrchr(const void *s, int c, size_t n);
+      memrchr,
+      /// void *memset(void *b, int c, size_t len);
+      memset,
+      /// void memset_pattern16(void *b, const void *pattern16, size_t len);
+      memset_pattern16,
+      /// int mkdir(const char *path, mode_t mode);
+      mkdir,
+      /// time_t mktime(struct tm *timeptr);
+      mktime,
+      /// double modf(double x, double *iptr);
+      modf,
+      /// float modff(float, float *iptr);
+      modff,
+      /// long double modfl(long double value, long double *iptr);
+      modfl,
+      /// double nearbyint(double x);
+      nearbyint,
+      /// float nearbyintf(float x);
+      nearbyintf,
+      /// long double nearbyintl(long double x);
+      nearbyintl,
+      /// uint32_t ntohl(uint32_t netlong);
+      ntohl,
+      /// uint16_t ntohs(uint16_t netshort);
+      ntohs,
+      /// int open(const char *path, int oflag, ... );
+      open,
+      /// int open64(const char *filename, int flags[, mode_t mode])
+      open64,
+      /// DIR *opendir(const char *dirname);
+      opendir,
+      /// int pclose(FILE *stream);
+      pclose,
+      /// void perror(const char *s);
+      perror,
+      /// FILE *popen(const char *command, const char *mode);
+      popen,
+      /// int posix_memalign(void **memptr, size_t alignment, size_t size);
+      posix_memalign,
+      /// double pow(double x, double y);
+      pow,
+      /// float powf(float x, float y);
+      powf,
+      /// long double powl(long double x, long double y);
+      powl,
+      /// ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset);
+      pread,
+      /// int printf(const char *format, ...);
+      printf,
+      /// int putc(int c, FILE *stream);
+      putc,
+      /// int putchar(int c);
+      putchar,
+      /// int puts(const char *s);
+      puts,
+      /// ssize_t pwrite(int fildes, const void *buf, size_t nbyte,
+      ///                off_t offset);
+      pwrite,
+      /// void qsort(void *base, size_t nel, size_t width,
+      ///            int (*compar)(const void *, const void *));
+      qsort,
+      /// ssize_t read(int fildes, void *buf, size_t nbyte);
+      read,
+      /// ssize_t readlink(const char *path, char *buf, size_t bufsize);
+      readlink,
+      /// void *realloc(void *ptr, size_t size);
+      realloc,
+      /// void *reallocf(void *ptr, size_t size);
+      reallocf,
+      /// char *realpath(const char *file_name, char *resolved_name);
+      realpath,
+      /// int remove(const char *path);
+      remove,
+      /// int rename(const char *old, const char *new);
+      rename,
+      /// void rewind(FILE *stream);
+      rewind,
+      /// double rint(double x);
+      rint,
+      /// float rintf(float x);
+      rintf,
+      /// long double rintl(long double x);
+      rintl,
+      /// int rmdir(const char *path);
+      rmdir,
+      /// double round(double x);
+      round,
+      /// float roundf(float x);
+      roundf,
+      /// long double roundl(long double x);
+      roundl,
+      /// int scanf(const char *restrict format, ... );
+      scanf,
+      /// void setbuf(FILE *stream, char *buf);
+      setbuf,
+      /// int setitimer(int which, const struct itimerval *value,
+      ///               struct itimerval *ovalue);
+      setitimer,
+      /// int setvbuf(FILE *stream, char *buf, int type, size_t size);
+      setvbuf,
+      /// double sin(double x);
+      sin,
+      /// float sinf(float x);
+      sinf,
+      /// double sinh(double x);
+      sinh,
+      /// float sinhf(float x);
+      sinhf,
+      /// long double sinhl(long double x);
+      sinhl,
+      /// long double sinl(long double x);
+      sinl,
+      /// int siprintf(char *str, const char *format, ...);
+      siprintf,
+      /// int snprintf(char *s, size_t n, const char *format, ...);
+      snprintf,
+      /// int sprintf(char *str, const char *format, ...);
+      sprintf,
+      /// double sqrt(double x);
+      sqrt,
+      /// float sqrtf(float x);
+      sqrtf,
+      /// long double sqrtl(long double x);
+      sqrtl,
+      /// int sscanf(const char *s, const char *format, ... );
+      sscanf,
+      /// int stat(const char *path, struct stat *buf);
+      stat,
+      /// int stat64(const char *path, struct stat64 *buf);
+      stat64,
+      /// int statvfs(const char *path, struct statvfs *buf);
+      statvfs,
+      /// int statvfs64(const char *path, struct statvfs64 *buf)
+      statvfs64,
+      /// char *stpcpy(char *s1, const char *s2);
+      stpcpy,
+      /// char *stpncpy(char *s1, const char *s2, size_t n);
+      stpncpy,
+      /// int strcasecmp(const char *s1, const char *s2);
+      strcasecmp,
+      /// char *strcat(char *s1, const char *s2);
+      strcat,
+      /// char *strchr(const char *s, int c);
+      strchr,
+      /// int strcmp(const char *s1, const char *s2);
+      strcmp,
+      /// int strcoll(const char *s1, const char *s2);
+      strcoll,
+      /// char *strcpy(char *s1, const char *s2);
+      strcpy,
+      /// size_t strcspn(const char *s1, const char *s2);
+      strcspn,
+      /// char *strdup(const char *s1);
+      strdup,
+      /// size_t strlen(const char *s);
+      strlen,
+      /// int strncasecmp(const char *s1, const char *s2, size_t n);
+      strncasecmp,
+      /// char *strncat(char *s1, const char *s2, size_t n);
+      strncat,
+      /// int strncmp(const char *s1, const char *s2, size_t n);
+      strncmp,
+      /// char *strncpy(char *s1, const char *s2, size_t n);
+      strncpy,
+      /// char *strndup(const char *s1, size_t n);
+      strndup,
+      /// size_t strnlen(const char *s, size_t maxlen);
+      strnlen,
+      /// char *strpbrk(const char *s1, const char *s2);
+      strpbrk,
+      /// char *strrchr(const char *s, int c);
+      strrchr,
+      /// size_t strspn(const char *s1, const char *s2);
+      strspn,
+      /// char *strstr(const char *s1, const char *s2);
+      strstr,
+      /// double strtod(const char *nptr, char **endptr);
+      strtod,
+      /// float strtof(const char *nptr, char **endptr);
+      strtof,
+      // char *strtok(char *s1, const char *s2);
+      strtok,
+      // char *strtok_r(char *s, const char *sep, char **lasts);
+      strtok_r,
+      /// long int strtol(const char *nptr, char **endptr, int base);
+      strtol,
+      /// long double strtold(const char *nptr, char **endptr);
+      strtold,
+      /// long long int strtoll(const char *nptr, char **endptr, int base);
+      strtoll,
+      /// unsigned long int strtoul(const char *nptr, char **endptr, int base);
+      strtoul,
+      /// unsigned long long int strtoull(const char *nptr, char **endptr,
+      ///                                 int base);
+      strtoull,
+      /// size_t strxfrm(char *s1, const char *s2, size_t n);
+      strxfrm,
+      /// int system(const char *command);
+      system,
+      /// double tan(double x);
+      tan,
+      /// float tanf(float x);
+      tanf,
+      /// double tanh(double x);
+      tanh,
+      /// float tanhf(float x);
+      tanhf,
+      /// long double tanhl(long double x);
+      tanhl,
+      /// long double tanl(long double x);
+      tanl,
+      /// clock_t times(struct tms *buffer);
+      times,
+      /// FILE *tmpfile(void);
+      tmpfile,
+      /// FILE *tmpfile64(void)
+      tmpfile64,
+      /// int toascii(int c);
+      toascii,
+      /// double trunc(double x);
+      trunc,
+      /// float truncf(float x);
+      truncf,
+      /// long double truncl(long double x);
+      truncl,
+      /// int uname(struct utsname *name);
+      uname,
+      /// int ungetc(int c, FILE *stream);
+      ungetc,
+      /// int unlink(const char *path);
+      unlink,
+      /// int unsetenv(const char *name);
+      unsetenv,
+      /// int utime(const char *path, const struct utimbuf *times);
+      utime,
+      /// int utimes(const char *path, const struct timeval times[2]);
+      utimes,
+      /// void *valloc(size_t size);
+      valloc,
+      /// int vfprintf(FILE *stream, const char *format, va_list ap);
+      vfprintf,
+      /// int vfscanf(FILE *stream, const char *format, va_list arg);
+      vfscanf,
+      /// int vprintf(const char *restrict format, va_list ap);
+      vprintf,
+      /// int vscanf(const char *format, va_list arg);
+      vscanf,
+      /// int vsnprintf(char *s, size_t n, const char *format, va_list ap);
+      vsnprintf,
+      /// int vsprintf(char *s, const char *format, va_list ap);
+      vsprintf,
+      /// int vsscanf(const char *s, const char *format, va_list arg);
+      vsscanf,
+      /// ssize_t write(int fildes, const void *buf, size_t nbyte);
+      write,
+
+      NumLibFuncs
+    };
+  }
+
+/// \brief Implementation of the target library information.
+///
+/// This class constructs tables that hold the target library information and
+/// make it available. However, it is somewhat expensive to compute and only
+/// depends on the triple. So users typicaly interact with the \c
+/// TargetLibraryInfo wrapper below.
+class TargetLibraryInfoImpl {
+  friend class TargetLibraryInfo;
+
+  unsigned char AvailableArray[(LibFunc::NumLibFuncs+3)/4];
+  llvm::DenseMap<unsigned, std::string> CustomNames;
+  static const char* StandardNames[LibFunc::NumLibFuncs];
+
+  enum AvailabilityState {
+    StandardName = 3, // (memset to all ones)
+    CustomName = 1,
+    Unavailable = 0  // (memset to all zeros)
+  };
+  void setState(LibFunc::Func F, AvailabilityState State) {
+    AvailableArray[F/4] &= ~(3 << 2*(F&3));
+    AvailableArray[F/4] |= State << 2*(F&3);
+  }
+  AvailabilityState getState(LibFunc::Func F) const {
+    return static_cast<AvailabilityState>((AvailableArray[F/4] >> 2*(F&3)) & 3);
+  }
+
+public:
+  TargetLibraryInfoImpl();
+  explicit TargetLibraryInfoImpl(const Triple &T);
+
+  // Provide value semantics.
+  TargetLibraryInfoImpl(const TargetLibraryInfoImpl &TLI);
+  TargetLibraryInfoImpl(TargetLibraryInfoImpl &&TLI);
+  TargetLibraryInfoImpl &operator=(const TargetLibraryInfoImpl &TLI);
+  TargetLibraryInfoImpl &operator=(TargetLibraryInfoImpl &&TLI);
+
+  /// \brief Searches for a particular function name.
+  ///
+  /// If it is one of the known library functions, return true and set F to the
+  /// corresponding value.
+  bool getLibFunc(StringRef funcName, LibFunc::Func &F) const;
+
+  /// \brief Forces a function to be marked as unavailable.
+  void setUnavailable(LibFunc::Func F) {
+    setState(F, Unavailable);
+  }
+
+  /// \brief Forces a function to be marked as available.
+  void setAvailable(LibFunc::Func F) {
+    setState(F, StandardName);
+  }
+
+  /// \brief Forces a function to be marked as available and provide an
+  /// alternate name that must be used.
+  void setAvailableWithName(LibFunc::Func F, StringRef Name) {
+    if (StandardNames[F] != Name) {
+      setState(F, CustomName);
+      CustomNames[F] = Name;
+      assert(CustomNames.find(F) != CustomNames.end());
+    } else {
+      setState(F, StandardName);
+    }
+  }
+
+  /// \brief Disables all builtins.
+  ///
+  /// This can be used for options like -fno-builtin.
+  void disableAllFunctions();
+};
+
+/// \brief Provides information about what library functions are available for
+/// the current target.
+///
+/// This both allows optimizations to handle them specially and frontends to
+/// disable such optimizations through -fno-builtin etc.
+class TargetLibraryInfo {
+  friend class TargetLibraryAnalysis;
+  friend class TargetLibraryInfoWrapperPass;
+
+  const TargetLibraryInfoImpl *Impl;
+
+public:
+  explicit TargetLibraryInfo(const TargetLibraryInfoImpl &Impl) : Impl(&Impl) {}
+
+  // Provide value semantics.
+  TargetLibraryInfo(const TargetLibraryInfo &TLI) : Impl(TLI.Impl) {}
+  TargetLibraryInfo(TargetLibraryInfo &&TLI) : Impl(TLI.Impl) {}
+  TargetLibraryInfo &operator=(const TargetLibraryInfo &TLI) {
+    Impl = TLI.Impl;
+    return *this;
+  }
+  TargetLibraryInfo &operator=(TargetLibraryInfo &&TLI) {
+    Impl = TLI.Impl;
+    return *this;
+  }
+
+  /// \brief Searches for a particular function name.
+  ///
+  /// If it is one of the known library functions, return true and set F to the
+  /// corresponding value.
+  bool getLibFunc(StringRef funcName, LibFunc::Func &F) const {
+    return Impl->getLibFunc(funcName, F);
+  }
+
+  /// \brief Tests wether a library function is available.
+  bool has(LibFunc::Func F) const {
+    return Impl->getState(F) != TargetLibraryInfoImpl::Unavailable;
+  }
+
+  /// \brief Tests if the function is both available and a candidate for
+  /// optimized code generation.
+  bool hasOptimizedCodeGen(LibFunc::Func F) const {
+    if (Impl->getState(F) == TargetLibraryInfoImpl::Unavailable)
+      return false;
+    switch (F) {
+    default: break;
+    case LibFunc::copysign:  case LibFunc::copysignf:  case LibFunc::copysignl:
+    case LibFunc::fabs:      case LibFunc::fabsf:      case LibFunc::fabsl:
+    case LibFunc::sin:       case LibFunc::sinf:       case LibFunc::sinl:
+    case LibFunc::cos:       case LibFunc::cosf:       case LibFunc::cosl:
+    case LibFunc::sqrt:      case LibFunc::sqrtf:      case LibFunc::sqrtl:
+    case LibFunc::sqrt_finite: case LibFunc::sqrtf_finite:
+                                                  case LibFunc::sqrtl_finite:
+    case LibFunc::fmax:      case LibFunc::fmaxf:      case LibFunc::fmaxl:
+    case LibFunc::fmin:      case LibFunc::fminf:      case LibFunc::fminl:
+    case LibFunc::floor:     case LibFunc::floorf:     case LibFunc::floorl:
+    case LibFunc::nearbyint: case LibFunc::nearbyintf: case LibFunc::nearbyintl:
+    case LibFunc::ceil:      case LibFunc::ceilf:      case LibFunc::ceill:
+    case LibFunc::rint:      case LibFunc::rintf:      case LibFunc::rintl:
+    case LibFunc::round:     case LibFunc::roundf:     case LibFunc::roundl:
+    case LibFunc::trunc:     case LibFunc::truncf:     case LibFunc::truncl:
+    case LibFunc::log2:      case LibFunc::log2f:      case LibFunc::log2l:
+    case LibFunc::exp2:      case LibFunc::exp2f:      case LibFunc::exp2l:
+    case LibFunc::memcmp:    case LibFunc::strcmp:     case LibFunc::strcpy:
+    case LibFunc::stpcpy:    case LibFunc::strlen:     case LibFunc::strnlen:
+    case LibFunc::memchr:
+      return true;
+    }
+    return false;
+  }
+
+  StringRef getName(LibFunc::Func F) const {
+    auto State = Impl->getState(F);
+    if (State == TargetLibraryInfoImpl::Unavailable)
+      return StringRef();
+    if (State == TargetLibraryInfoImpl::StandardName)
+      return Impl->StandardNames[F];
+    assert(State == TargetLibraryInfoImpl::CustomName);
+    return Impl->CustomNames.find(F)->second;
+  }
+
+  /// \brief Handle invalidation from the pass manager.
+  ///
+  /// If we try to invalidate this info, just return false. It cannot become
+  /// invalid even if the module changes.
+  bool invalidate(Module &, const PreservedAnalyses &) { return false; }
+};
+
+/// \brief Analysis pass providing the \c TargetLibraryInfo.
+///
+/// Note that this pass's result cannot be invalidated, it is immutable for the
+/// life of the module.
+class TargetLibraryAnalysis {
+public:
+  typedef TargetLibraryInfo Result;
+
+  /// \brief Opaque, unique identifier for this analysis pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// \brief Default construct the library analysis.
+  ///
+  /// This will use the module's triple to construct the library info for that
+  /// module.
+  TargetLibraryAnalysis() {}
+
+  /// \brief Construct a library analysis with preset info.
+  ///
+  /// This will directly copy the preset info into the result without
+  /// consulting the module's triple.
+  TargetLibraryAnalysis(TargetLibraryInfoImpl PresetInfoImpl)
+      : PresetInfoImpl(std::move(PresetInfoImpl)) {}
+
+  // Move semantics. We spell out the constructors for MSVC.
+  TargetLibraryAnalysis(TargetLibraryAnalysis &&Arg)
+      : PresetInfoImpl(std::move(Arg.PresetInfoImpl)), Impls(std::move(Arg.Impls)) {}
+  TargetLibraryAnalysis &operator=(TargetLibraryAnalysis &&RHS) {
+    PresetInfoImpl = std::move(RHS.PresetInfoImpl);
+    Impls = std::move(RHS.Impls);
+    return *this;
+  }
+
+  TargetLibraryInfo run(Module &M);
+  TargetLibraryInfo run(Function &F);
+
+  /// \brief Provide access to a name for this pass for debugging purposes.
+  static StringRef name() { return "TargetLibraryAnalysis"; }
+
+private:
+  static char PassID;
+
+  Optional<TargetLibraryInfoImpl> PresetInfoImpl;
+
+  StringMap<std::unique_ptr<TargetLibraryInfoImpl>> Impls;
+
+  TargetLibraryInfoImpl &lookupInfoImpl(Triple T);
+};
+
+class TargetLibraryInfoWrapperPass : public ImmutablePass {
+  TargetLibraryInfoImpl TLIImpl;
+  TargetLibraryInfo TLI;
+
+  virtual void anchor();
+
+public:
+  static char ID;
+  TargetLibraryInfoWrapperPass();
+  explicit TargetLibraryInfoWrapperPass(const Triple &T);
+  explicit TargetLibraryInfoWrapperPass(const TargetLibraryInfoImpl &TLI);
+
+  TargetLibraryInfo &getTLI() { return TLI; }
+  const TargetLibraryInfo &getTLI() const { return TLI; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 9acaaa6..4998141 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -1,4 +1,4 @@
-//===- llvm/Analysis/TargetTransformInfo.h ----------------------*- C++ -*-===//
+//===- TargetTransformInfo.h ------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,22 +6,24 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This pass exposes codegen information to IR-level passes. Every
-// transformation that uses codegen information is broken into three parts:
-// 1. The IR-level analysis pass.
-// 2. The IR-level transformation interface which provides the needed
-//    information.
-// 3. Codegen-level implementation which uses target-specific hooks.
-//
-// This file defines #2, which is the interface that IR-level transformations
-// use for querying the codegen.
-//
+/// \file
+/// This pass exposes codegen information to IR-level passes. Every
+/// transformation that uses codegen information is broken into three parts:
+/// 1. The IR-level analysis pass.
+/// 2. The IR-level transformation interface which provides the needed
+///    information.
+/// 3. Codegen-level implementation which uses target-specific hooks.
+///
+/// This file defines #2, which is the interface that IR-level transformations
+/// use for querying the codegen.
+///
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
 #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
 
+#include "llvm/ADT/Optional.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/DataTypes.h"
@@ -31,41 +33,61 @@ namespace llvm {
 class Function;
 class GlobalValue;
 class Loop;
+class PreservedAnalyses;
 class Type;
 class User;
 class Value;
 
-/// TargetTransformInfo - This pass provides access to the codegen
-/// interfaces that are needed for IR-level transformations.
+/// \brief Information about a load/store intrinsic defined by the target.
+struct MemIntrinsicInfo {
+  MemIntrinsicInfo()
+      : ReadMem(false), WriteMem(false), Vol(false), MatchingId(0),
+        NumMemRefs(0), PtrVal(nullptr) {}
+  bool ReadMem;
+  bool WriteMem;
+  bool Vol;
+  // Same Id is set by the target for corresponding load/store intrinsics.
+  unsigned short MatchingId;
+  int NumMemRefs;
+  Value *PtrVal;
+};
+
+/// \brief This pass provides access to the codegen interfaces that are needed
+/// for IR-level transformations.
 class TargetTransformInfo {
-protected:
-  /// \brief The TTI instance one level down the stack.
+public:
+  /// \brief Construct a TTI object using a type implementing the \c Concept
+  /// API below.
   ///
-  /// This is used to implement the default behavior all of the methods which
-  /// is to delegate up through the stack of TTIs until one can answer the
-  /// query.
-  TargetTransformInfo *PrevTTI;
+  /// This is used by targets to construct a TTI wrapping their target-specific
+  /// implementaion that encodes appropriate costs for their target.
+  template <typename T> TargetTransformInfo(T Impl);
 
-  /// \brief The top of the stack of TTI analyses available.
+  /// \brief Construct a baseline TTI object using a minimal implementation of
+  /// the \c Concept API below.
   ///
-  /// This is a convenience routine maintained as TTI analyses become available
-  /// that complements the PrevTTI delegation chain. When one part of an
-  /// analysis pass wants to query another part of the analysis pass it can use
-  /// this to start back at the top of the stack.
-  TargetTransformInfo *TopTTI;
+  /// The TTI implementation will reflect the information in the DataLayout
+  /// provided if non-null.
+  explicit TargetTransformInfo(const DataLayout *DL);
 
-  /// All pass subclasses must in their initializePass routine call
-  /// pushTTIStack with themselves to update the pointers tracking the previous
-  /// TTI instance in the analysis group's stack, and the top of the analysis
-  /// group's stack.
-  void pushTTIStack(Pass *P);
+  // Provide move semantics.
+  TargetTransformInfo(TargetTransformInfo &&Arg);
+  TargetTransformInfo &operator=(TargetTransformInfo &&RHS);
 
-  /// All pass subclasses must call TargetTransformInfo::getAnalysisUsage.
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  // We need to define the destructor out-of-line to define our sub-classes
+  // out-of-line.
+  ~TargetTransformInfo();
 
-public:
-  /// This class is intended to be subclassed by real implementations.
-  virtual ~TargetTransformInfo() = 0;
+  /// \brief Handle the invalidation of this information.
+  ///
+  /// When used as a result of \c TargetIRAnalysis this method will be called
+  /// when the function this was computed for changes. When it returns false,
+  /// the information is preserved across those changes.
+  bool invalidate(Function &, const PreservedAnalyses &) {
+    // FIXME: We should probably in some way ensure that the subtarget
+    // information for a function hasn't changed.
+    return false;
+  }
 
   /// \name Generic Target Information
   /// @{
@@ -86,9 +108,9 @@ public:
   /// skipped by renaming the registers in the CPU, but they still are encoded
   /// and thus wouldn't be considered 'free' here.
   enum TargetCostConstants {
-    TCC_Free = 0,       ///< Expected to fold away in lowering.
-    TCC_Basic = 1,      ///< The cost of a typical 'add' instruction.
-    TCC_Expensive = 4   ///< The cost of a 'div' instruction on x86.
+    TCC_Free = 0,     ///< Expected to fold away in lowering.
+    TCC_Basic = 1,    ///< The cost of a typical 'add' instruction.
+    TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86.
   };
 
   /// \brief Estimate the cost of a specific operation when lowered.
@@ -105,16 +127,15 @@ public:
   ///
   /// The returned cost is defined in terms of \c TargetCostConstants, see its
   /// comments for a detailed explanation of the cost values.
-  virtual unsigned getOperationCost(unsigned Opcode, Type *Ty,
-                                    Type *OpTy = nullptr) const;
+  unsigned getOperationCost(unsigned Opcode, Type *Ty,
+                            Type *OpTy = nullptr) const;
 
   /// \brief Estimate the cost of a GEP operation when lowered.
   ///
   /// The contract for this function is the same as \c getOperationCost except
   /// that it supports an interface that provides extra information specific to
   /// the GEP operation.
-  virtual unsigned getGEPCost(const Value *Ptr,
-                              ArrayRef<const Value *> Operands) const;
+  unsigned getGEPCost(const Value *Ptr, ArrayRef<const Value *> Operands) const;
 
   /// \brief Estimate the cost of a function call when lowered.
   ///
@@ -125,31 +146,31 @@ public:
   /// This is the most basic query for estimating call cost: it only knows the
   /// function type and (potentially) the number of arguments at the call site.
   /// The latter is only interesting for varargs function types.
-  virtual unsigned getCallCost(FunctionType *FTy, int NumArgs = -1) const;
+  unsigned getCallCost(FunctionType *FTy, int NumArgs = -1) const;
 
   /// \brief Estimate the cost of calling a specific function when lowered.
   ///
   /// This overload adds the ability to reason about the particular function
   /// being called in the event it is a library call with special lowering.
-  virtual unsigned getCallCost(const Function *F, int NumArgs = -1) const;
+  unsigned getCallCost(const Function *F, int NumArgs = -1) const;
 
   /// \brief Estimate the cost of calling a specific function when lowered.
   ///
   /// This overload allows specifying a set of candidate argument values.
-  virtual unsigned getCallCost(const Function *F,
-                               ArrayRef<const Value *> Arguments) const;
+  unsigned getCallCost(const Function *F,
+                       ArrayRef<const Value *> Arguments) const;
 
   /// \brief Estimate the cost of an intrinsic when lowered.
   ///
   /// Mirrors the \c getCallCost method but uses an intrinsic identifier.
-  virtual unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                                    ArrayRef<Type *> ParamTys) const;
+  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<Type *> ParamTys) const;
 
   /// \brief Estimate the cost of an intrinsic when lowered.
   ///
   /// Mirrors the \c getCallCost method but uses an intrinsic identifier.
-  virtual unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                                    ArrayRef<const Value *> Arguments) const;
+  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<const Value *> Arguments) const;
 
   /// \brief Estimate the cost of a given IR user when lowered.
   ///
@@ -166,13 +187,13 @@ public:
   ///
   /// The returned cost is defined in terms of \c TargetCostConstants, see its
   /// comments for a detailed explanation of the cost values.
-  virtual unsigned getUserCost(const User *U) const;
+  unsigned getUserCost(const User *U) const;
 
   /// \brief hasBranchDivergence - Return true if branch divergence exists.
   /// Branch divergence has a significantly negative impact on GPU performance
   /// when threads in the same wavefront take different paths due to conditional
   /// branches.
-  virtual bool hasBranchDivergence() const;
+  bool hasBranchDivergence() const;
 
   /// \brief Test whether calls to a function lower to actual program function
   /// calls.
@@ -186,7 +207,7 @@ public:
   /// and execution-speed costs. This would allow modelling the core of this
   /// query more accurately as a call is a single small instruction, but
   /// incurs significant execution cost.
-  virtual bool isLoweredToCall(const Function *F) const;
+  bool isLoweredToCall(const Function *F) const;
 
   /// Parameters that control the generic loop unrolling transformation.
   struct UnrollingPreferences {
@@ -196,6 +217,13 @@ public:
     /// exceed this cost. Set this to UINT_MAX to disable the loop body cost
     /// restriction.
     unsigned Threshold;
+    /// If complete unrolling could help other optimizations (e.g. InstSimplify)
+    /// to remove N% of instructions, then we can go beyond unroll threshold.
+    /// This value set the minimal percent for allowing that.
+    unsigned MinPercentOfOptimized;
+    /// The absolute cost threshold. We won't go beyond this even if complete
+    /// unrolling could result in optimizing out 90% of instructions.
+    unsigned AbsoluteThreshold;
     /// The cost threshold for the unrolled loop when optimizing for size (set
     /// to UINT_MAX to disable).
     unsigned OptSizeThreshold;
@@ -203,8 +231,8 @@ public:
     /// for partial/runtime unrolling (set to UINT_MAX to disable).
     unsigned PartialThreshold;
     /// The cost threshold for the unrolled loop when optimizing for size, like
-    /// OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX
-    /// to disable).
+    /// OptSizeThreshold, but used for partial/runtime unrolling (set to
+    /// UINT_MAX to disable).
     unsigned PartialOptSizeThreshold;
     /// A forced unrolling factor (the number of concatenated bodies of the
     /// original loop in the unrolled loop body). When set to 0, the unrolling
@@ -218,18 +246,17 @@ public:
     unsigned MaxCount;
     /// Allow partial unrolling (unrolling of loops to expand the size of the
     /// loop body, not only to eliminate small constant-trip-count loops).
-    bool     Partial;
+    bool Partial;
     /// Allow runtime unrolling (unrolling of loops to expand the size of the
-    /// loop body even when the number of loop iterations is not known at compile
-    /// time).
-    bool     Runtime;
+    /// loop body even when the number of loop iterations is not known at
+    /// compile time).
+    bool Runtime;
   };
 
   /// \brief Get target-customized preferences for the generic loop unrolling
   /// transformation. The caller will initialize UP with the current
   /// target-independent defaults.
-  virtual void getUnrollingPreferences(const Function *F, Loop *L,
-                                       UnrollingPreferences &UP) const;
+  void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
 
   /// @}
 
@@ -244,31 +271,33 @@ public:
   /// support is considered as "Fast" if it can outperform, or is on a par
   /// with, SW implementation when the population is sparse; otherwise, it is
   /// considered as "Slow".
-  enum PopcntSupportKind {
-    PSK_Software,
-    PSK_SlowHardware,
-    PSK_FastHardware
-  };
+  enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };
 
   /// \brief Return true if the specified immediate is legal add immediate, that
   /// is the target has add instructions which can add a register with the
   /// immediate without having to materialize the immediate into a register.
-  virtual bool isLegalAddImmediate(int64_t Imm) const;
+  bool isLegalAddImmediate(int64_t Imm) const;
 
   /// \brief Return true if the specified immediate is legal icmp immediate,
   /// that is the target has icmp instructions which can compare a register
   /// against the immediate without having to materialize the immediate into a
   /// register.
-  virtual bool isLegalICmpImmediate(int64_t Imm) const;
+  bool isLegalICmpImmediate(int64_t Imm) const;
 
   /// \brief Return true if the addressing mode represented by AM is legal for
   /// this target, for a load/store of the specified type.
   /// The type may be VoidTy, in which case only return true if the addressing
   /// mode is legal for a load/store of any legal type.
   /// TODO: Handle pre/postinc as well.
-  virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
-                                     int64_t BaseOffset, bool HasBaseReg,
-                                     int64_t Scale) const;
+  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
+                             bool HasBaseReg, int64_t Scale) const;
+
+  /// \brief Return true if the target works with masked instruction
+  /// AVX2 allows masks for consecutive load and store for i32 and i64 elements.
+  /// AVX-512 architecture will also allow masks for non-consecutive memory
+  /// accesses.
+  bool isLegalMaskedStore(Type *DataType, int Consecutive) const;
+  bool isLegalMaskedLoad(Type *DataType, int Consecutive) const;
 
   /// \brief Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
@@ -276,45 +305,52 @@ public:
   /// If the AM is supported, the return value must be >= 0.
   /// If the AM is not supported, it returns a negative value.
   /// TODO: Handle pre/postinc as well.
-  virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                   int64_t BaseOffset, bool HasBaseReg,
-                                   int64_t Scale) const;
+  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
+                           bool HasBaseReg, int64_t Scale) const;
 
   /// \brief Return true if it's free to truncate a value of type Ty1 to type
   /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
   /// by referencing its sub-register AX.
-  virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
+  bool isTruncateFree(Type *Ty1, Type *Ty2) const;
+
+  /// \brief Return true if it is profitable to hoist instruction in the
+  /// then/else to before if.
+  bool isProfitableToHoist(Instruction *I) const;
 
   /// \brief Return true if this type is legal.
-  virtual bool isTypeLegal(Type *Ty) const;
+  bool isTypeLegal(Type *Ty) const;
 
   /// \brief Returns the target's jmp_buf alignment in bytes.
-  virtual unsigned getJumpBufAlignment() const;
+  unsigned getJumpBufAlignment() const;
 
   /// \brief Returns the target's jmp_buf size in bytes.
-  virtual unsigned getJumpBufSize() const;
+  unsigned getJumpBufSize() const;
 
   /// \brief Return true if switches should be turned into lookup tables for the
   /// target.
-  virtual bool shouldBuildLookupTables() const;
+  bool shouldBuildLookupTables() const;
 
   /// \brief Return hardware support for population count.
-  virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
+  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
 
   /// \brief Return true if the hardware has a fast square-root instruction.
-  virtual bool haveFastSqrt(Type *Ty) const;
+  bool haveFastSqrt(Type *Ty) const;
+
+  /// \brief Return the expected cost of supporting the floating point operation
+  /// of the specified type.
+  unsigned getFPOpCost(Type *Ty) const;
 
   /// \brief Return the expected cost of materializing for the given integer
   /// immediate of the specified type.
-  virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) const;
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const;
 
   /// \brief Return the expected cost of materialization for the given integer
   /// immediate of the specified type for a given instruction. The cost can be
   /// zero if the immediate can be folded into the specified instruction.
-  virtual unsigned getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
-                                 Type *Ty) const;
-  virtual unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx,
-                                 const APInt &Imm, Type *Ty) const;
+  unsigned getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const;
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const;
   /// @}
 
   /// \name Vector Target Information
@@ -331,10 +367,10 @@ public:
 
   /// \brief Additional information about an operand's possible values.
   enum OperandValueKind {
-    OK_AnyValue,                 // Operand can have any value.
-    OK_UniformValue,             // Operand is uniform (splat of a value).
-    OK_UniformConstantValue,     // Operand is uniform constant.
-    OK_NonUniformConstantValue   // Operand is a non uniform constant value.
+    OK_AnyValue,               // Operand can have any value.
+    OK_UniformValue,           // Operand is uniform (splat of a value).
+    OK_UniformConstantValue,   // Operand is uniform constant.
+    OK_NonUniformConstantValue // Operand is a non uniform constant value.
   };
 
   /// \brief Additional properties of an operand's values.
@@ -343,18 +379,18 @@ public:
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
-  virtual unsigned getNumberOfRegisters(bool Vector) const;
+  unsigned getNumberOfRegisters(bool Vector) const;
 
   /// \return The width of the largest scalar or vector register type.
-  virtual unsigned getRegisterBitWidth(bool Vector) const;
+  unsigned getRegisterBitWidth(bool Vector) const;
 
   /// \return The maximum interleave factor that any transform should try to
   /// perform for this target. This number depends on the level of parallelism
   /// and the number of execution units in the CPU.
-  virtual unsigned getMaxInterleaveFactor() const;
+  unsigned getMaxInterleaveFactor() const;
 
   /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc.
-  virtual unsigned
+  unsigned
   getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                          OperandValueKind Opd1Info = OK_AnyValue,
                          OperandValueKind Opd2Info = OK_AnyValue,
@@ -364,31 +400,33 @@ public:
   /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
   /// The index and subtype parameters are used by the subvector insertion and
   /// extraction shuffle kinds.
-  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index = 0,
-                                  Type *SubTp = nullptr) const;
+  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index = 0,
+                          Type *SubTp = nullptr) const;
 
   /// \return The expected cost of cast instructions, such as bitcast, trunc,
   /// zext, etc.
-  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
-                                    Type *Src) const;
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const;
 
   /// \return The expected cost of control-flow related instructions such as
   /// Phi, Ret, Br.
-  virtual unsigned getCFInstrCost(unsigned Opcode) const;
+  unsigned getCFInstrCost(unsigned Opcode) const;
 
   /// \returns The expected cost of compare and select instructions.
-  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                      Type *CondTy = nullptr) const;
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                              Type *CondTy = nullptr) const;
 
   /// \return The expected cost of vector Insert and Extract.
   /// Use -1 to indicate that there is no information on the index value.
-  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
-                                      unsigned Index = -1) const;
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                              unsigned Index = -1) const;
 
   /// \return The cost of Load and Store instructions.
-  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
-                                   unsigned Alignment,
-                                   unsigned AddressSpace) const;
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const;
+
+  /// \return The cost of masked Load and Store instructions.
+  unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) const;
 
   /// \brief Calculate the cost of performing a vector reduction.
   ///
@@ -403,16 +441,16 @@ public:
   /// Split:
   ///  (v0, v1, v2, v3)
   ///  ((v0+v2), (v1+v3), undef, undef)
-  virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
-                                    bool IsPairwiseForm) const;
+  unsigned getReductionCost(unsigned Opcode, Type *Ty,
+                            bool IsPairwiseForm) const;
 
   /// \returns The cost of Intrinsic instructions.
-  virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                         ArrayRef<Type *> Tys) const;
+  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                                 ArrayRef<Type *> Tys) const;
 
   /// \returns The number of pieces into which the provided type must be
   /// split during legalization. Zero is returned when the answer is unknown.
-  virtual unsigned getNumberOfParts(Type *Tp) const;
+  unsigned getNumberOfParts(Type *Tp) const;
 
   /// \returns The cost of the address computation. For most targets this can be
   /// merged into the instruction indexing mode. Some targets might want to
@@ -421,28 +459,385 @@ public:
   /// The 'IsComplex' parameter is a hint that the address computation is likely
   /// to involve multiple instructions and as such unlikely to be merged into
   /// the address indexing mode.
-  virtual unsigned getAddressComputationCost(Type *Ty,
-                                             bool IsComplex = false) const;
+  unsigned getAddressComputationCost(Type *Ty, bool IsComplex = false) const;
 
   /// \returns The cost, if any, of keeping values of the given types alive
   /// over a callsite.
   ///
   /// Some types may require the use of register classes that do not have
   /// any callee-saved registers, so would require a spill and fill.
-  virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const;
+  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const;
+
+  /// \returns True if the intrinsic is a supported memory intrinsic.  Info
+  /// will contain additional information - whether the intrinsic may write
+  /// or read to memory, volatility and the pointer.  Info is undefined
+  /// if false is returned.
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
+
+  /// \returns A value which is the result of the given memory intrinsic.  New
+  /// instructions may be created to extract the result from the given intrinsic
+  /// memory operation.  Returns nullptr if the target cannot create a result
+  /// from the given intrinsic.
+  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                           Type *ExpectedType) const;
 
   /// @}
 
-  /// Analysis group identification.
+private:
+  /// \brief The abstract base class used to type erase specific TTI
+  /// implementations.
+  class Concept;
+
+  /// \brief The template model for the base class which wraps a concrete
+  /// implementation in a type erased interface.
+  template <typename T> class Model;
+
+  std::unique_ptr<Concept> TTIImpl;
+};
+
+class TargetTransformInfo::Concept {
+public:
+  virtual ~Concept() = 0;
+
+  virtual unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) = 0;
+  virtual unsigned getGEPCost(const Value *Ptr,
+                              ArrayRef<const Value *> Operands) = 0;
+  virtual unsigned getCallCost(FunctionType *FTy, int NumArgs) = 0;
+  virtual unsigned getCallCost(const Function *F, int NumArgs) = 0;
+  virtual unsigned getCallCost(const Function *F,
+                               ArrayRef<const Value *> Arguments) = 0;
+  virtual unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                                    ArrayRef<Type *> ParamTys) = 0;
+  virtual unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                                    ArrayRef<const Value *> Arguments) = 0;
+  virtual unsigned getUserCost(const User *U) = 0;
+  virtual bool hasBranchDivergence() = 0;
+  virtual bool isLoweredToCall(const Function *F) = 0;
+  virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0;
+  virtual bool isLegalAddImmediate(int64_t Imm) = 0;
+  virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
+  virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
+                                     int64_t BaseOffset, bool HasBaseReg,
+                                     int64_t Scale) = 0;
+  virtual bool isLegalMaskedStore(Type *DataType, int Consecutive) = 0;
+  virtual bool isLegalMaskedLoad(Type *DataType, int Consecutive) = 0;
+  virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+                                   int64_t BaseOffset, bool HasBaseReg,
+                                   int64_t Scale) = 0;
+  virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
+  virtual bool isProfitableToHoist(Instruction *I) = 0;
+  virtual bool isTypeLegal(Type *Ty) = 0;
+  virtual unsigned getJumpBufAlignment() = 0;
+  virtual unsigned getJumpBufSize() = 0;
+  virtual bool shouldBuildLookupTables() = 0;
+  virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
+  virtual bool haveFastSqrt(Type *Ty) = 0;
+  virtual unsigned getFPOpCost(Type *Ty) = 0;
+  virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) = 0;
+  virtual unsigned getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
+                                 Type *Ty) = 0;
+  virtual unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                 const APInt &Imm, Type *Ty) = 0;
+  virtual unsigned getNumberOfRegisters(bool Vector) = 0;
+  virtual unsigned getRegisterBitWidth(bool Vector) = 0;
+  virtual unsigned getMaxInterleaveFactor() = 0;
+  virtual unsigned
+  getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
+                         OperandValueKind Opd2Info,
+                         OperandValueProperties Opd1PropInfo,
+                         OperandValueProperties Opd2PropInfo) = 0;
+  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+                                  Type *SubTp) = 0;
+  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) = 0;
+  virtual unsigned getCFInstrCost(unsigned Opcode) = 0;
+  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                      Type *CondTy) = 0;
+  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                                      unsigned Index) = 0;
+  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+                                   unsigned Alignment,
+                                   unsigned AddressSpace) = 0;
+  virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+                                         unsigned Alignment,
+                                         unsigned AddressSpace) = 0;
+  virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
+                                    bool IsPairwiseForm) = 0;
+  virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                                         ArrayRef<Type *> Tys) = 0;
+  virtual unsigned getNumberOfParts(Type *Tp) = 0;
+  virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) = 0;
+  virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
+  virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                  MemIntrinsicInfo &Info) = 0;
+  virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                                   Type *ExpectedType) = 0;
+};
+
+template <typename T>
+class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
+  T Impl;
+
+public:
+  Model(T Impl) : Impl(std::move(Impl)) {}
+  ~Model() override {}
+
+  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) override {
+    return Impl.getOperationCost(Opcode, Ty, OpTy);
+  }
+  unsigned getGEPCost(const Value *Ptr,
+                      ArrayRef<const Value *> Operands) override {
+    return Impl.getGEPCost(Ptr, Operands);
+  }
+  unsigned getCallCost(FunctionType *FTy, int NumArgs) override {
+    return Impl.getCallCost(FTy, NumArgs);
+  }
+  unsigned getCallCost(const Function *F, int NumArgs) override {
+    return Impl.getCallCost(F, NumArgs);
+  }
+  unsigned getCallCost(const Function *F,
+                       ArrayRef<const Value *> Arguments) override {
+    return Impl.getCallCost(F, Arguments);
+  }
+  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<Type *> ParamTys) override {
+    return Impl.getIntrinsicCost(IID, RetTy, ParamTys);
+  }
+  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<const Value *> Arguments) override {
+    return Impl.getIntrinsicCost(IID, RetTy, Arguments);
+  }
+  unsigned getUserCost(const User *U) override { return Impl.getUserCost(U); }
+  bool hasBranchDivergence() override { return Impl.hasBranchDivergence(); }
+  bool isLoweredToCall(const Function *F) override {
+    return Impl.isLoweredToCall(F);
+  }
+  void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) override {
+    return Impl.getUnrollingPreferences(L, UP);
+  }
+  bool isLegalAddImmediate(int64_t Imm) override {
+    return Impl.isLegalAddImmediate(Imm);
+  }
+  bool isLegalICmpImmediate(int64_t Imm) override {
+    return Impl.isLegalICmpImmediate(Imm);
+  }
+  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
+                             bool HasBaseReg, int64_t Scale) override {
+    return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
+                                      Scale);
+  }
+  bool isLegalMaskedStore(Type *DataType, int Consecutive) override {
+    return Impl.isLegalMaskedStore(DataType, Consecutive);
+  }
+  bool isLegalMaskedLoad(Type *DataType, int Consecutive) override {
+    return Impl.isLegalMaskedLoad(DataType, Consecutive);
+  }
+  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
+                           bool HasBaseReg, int64_t Scale) override {
+    return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale);
+  }
+  bool isTruncateFree(Type *Ty1, Type *Ty2) override {
+    return Impl.isTruncateFree(Ty1, Ty2);
+  }
+  bool isProfitableToHoist(Instruction *I) override {
+    return Impl.isProfitableToHoist(I);
+  }
+  bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
+  unsigned getJumpBufAlignment() override { return Impl.getJumpBufAlignment(); }
+  unsigned getJumpBufSize() override { return Impl.getJumpBufSize(); }
+  bool shouldBuildLookupTables() override {
+    return Impl.shouldBuildLookupTables();
+  }
+  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
+    return Impl.getPopcntSupport(IntTyWidthInBit);
+  }
+  bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
+
+  unsigned getFPOpCost(Type *Ty) override {
+    return Impl.getFPOpCost(Ty);
+  }
+
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) override {
+    return Impl.getIntImmCost(Imm, Ty);
+  }
+  unsigned getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
+                         Type *Ty) override {
+    return Impl.getIntImmCost(Opc, Idx, Imm, Ty);
+  }
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty) override {
+    return Impl.getIntImmCost(IID, Idx, Imm, Ty);
+  }
+  unsigned getNumberOfRegisters(bool Vector) override {
+    return Impl.getNumberOfRegisters(Vector);
+  }
+  unsigned getRegisterBitWidth(bool Vector) override {
+    return Impl.getRegisterBitWidth(Vector);
+  }
+  unsigned getMaxInterleaveFactor() override {
+    return Impl.getMaxInterleaveFactor();
+  }
+  unsigned
+  getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
+                         OperandValueKind Opd2Info,
+                         OperandValueProperties Opd1PropInfo,
+                         OperandValueProperties Opd2PropInfo) override {
+    return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                       Opd1PropInfo, Opd2PropInfo);
+  }
+  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+                          Type *SubTp) override {
+    return Impl.getShuffleCost(Kind, Tp, Index, SubTp);
+  }
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) override {
+    return Impl.getCastInstrCost(Opcode, Dst, Src);
+  }
+  unsigned getCFInstrCost(unsigned Opcode) override {
+    return Impl.getCFInstrCost(Opcode);
+  }
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                              Type *CondTy) override {
+    return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  }
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                              unsigned Index) override {
+    return Impl.getVectorInstrCost(Opcode, Val, Index);
+  }
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) override {
+    return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+  }
+  unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) override {
+    return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+  }
+  unsigned getReductionCost(unsigned Opcode, Type *Ty,
+                            bool IsPairwiseForm) override {
+    return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
+  }
+  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                                 ArrayRef<Type *> Tys) override {
+    return Impl.getIntrinsicInstrCost(ID, RetTy, Tys);
+  }
+  unsigned getNumberOfParts(Type *Tp) override {
+    return Impl.getNumberOfParts(Tp);
+  }
+  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) override {
+    return Impl.getAddressComputationCost(Ty, IsComplex);
+  }
+  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override {
+    return Impl.getCostOfKeepingLiveOverCall(Tys);
+  }
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
+                          MemIntrinsicInfo &Info) override {
+    return Impl.getTgtMemIntrinsic(Inst, Info);
+  }
+  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                           Type *ExpectedType) override {
+    return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
+  }
+};
+
+template <typename T>
+TargetTransformInfo::TargetTransformInfo(T Impl)
+    : TTIImpl(new Model<T>(Impl)) {}
+
+/// \brief Analysis pass providing the \c TargetTransformInfo.
+///
+/// The core idea of the TargetIRAnalysis is to expose an interface through
+/// which LLVM targets can analyze and provide information about the middle
+/// end's target-independent IR. This supports use cases such as target-aware
+/// cost modeling of IR constructs.
+///
+/// This is a function analysis because much of the cost modeling for targets
+/// is done in a subtarget specific way and LLVM supports compiling different
+/// functions targeting different subtargets in order to support runtime
+/// dispatch according to the observed subtarget.
+class TargetIRAnalysis {
+public:
+  typedef TargetTransformInfo Result;
+
+  /// \brief Opaque, unique identifier for this analysis pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// \brief Provide access to a name for this pass for debugging purposes.
+  static StringRef name() { return "TargetIRAnalysis"; }
+
+  /// \brief Default construct a target IR analysis.
+  ///
+  /// This will use the module's datalayout to construct a baseline
+  /// conservative TTI result.
+  TargetIRAnalysis();
+
+  /// \brief Construct an IR analysis pass around a target-provide callback.
+  ///
+  /// The callback will be called with a particular function for which the TTI
+  /// is needed and must return a TTI object for that function.
+  TargetIRAnalysis(std::function<Result(Function &)> TTICallback);
+
+  // Value semantics. We spell out the constructors for MSVC.
+  TargetIRAnalysis(const TargetIRAnalysis &Arg)
+      : TTICallback(Arg.TTICallback) {}
+  TargetIRAnalysis(TargetIRAnalysis &&Arg)
+      : TTICallback(std::move(Arg.TTICallback)) {}
+  TargetIRAnalysis &operator=(const TargetIRAnalysis &RHS) {
+    TTICallback = RHS.TTICallback;
+    return *this;
+  }
+  TargetIRAnalysis &operator=(TargetIRAnalysis &&RHS) {
+    TTICallback = std::move(RHS.TTICallback);
+    return *this;
+  }
+
+  Result run(Function &F);
+
+private:
+  static char PassID;
+
+  /// \brief The callback used to produce a result.
+  ///
+  /// We use a completely opaque callback so that targets can provide whatever
+  /// mechanism they desire for constructing the TTI for a given function.
+  ///
+  /// FIXME: Should we really use std::function? It's relatively inefficient.
+  /// It might be possible to arrange for even stateful callbacks to outlive
+  /// the analysis and thus use a function_ref which would be lighter weight.
+  /// This may also be less error prone as the callback is likely to reference
+  /// the external TargetMachine, and that reference needs to never dangle.
+  std::function<Result(Function &)> TTICallback;
+
+  /// \brief Helper function used as the callback in the default constructor.
+  static Result getDefaultTTI(Function &F);
+};
+
+/// \brief Wrapper pass for TargetTransformInfo.
+///
+/// This pass can be constructed from a TTI object which it stores internally
+/// and is queried by passes.
+class TargetTransformInfoWrapperPass : public ImmutablePass {
+  TargetIRAnalysis TIRA;
+  Optional<TargetTransformInfo> TTI;
+
+  virtual void anchor();
+
+public:
   static char ID;
+
+  /// \brief We must provide a default constructor for the pass but it should
+  /// never be used.
+  ///
+  /// Use the constructor below or call one of the creation routines.
+  TargetTransformInfoWrapperPass();
+
+  explicit TargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
+
+  TargetTransformInfo &getTTI(Function &F);
 };
 
-/// \brief Create the base case instance of a pass in the TTI analysis group.
+/// \brief Create an analysis pass wrapper around a TTI object.
 ///
-/// This class provides the base case for the stack of TTI analyzes. It doesn't
-/// delegate to anything and uses the STTI and VTTI objects passed in to
-/// satisfy the queries.
-ImmutablePass *createNoTargetTransformInfoPass();
+/// This analysis pass just holds the TTI instance and makes it available to
+/// clients.
+ImmutablePass *createTargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
 
 } // End llvm namespace
 
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
new file mode 100644
index 0000000..3e02c0c
--- /dev/null
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -0,0 +1,433 @@
+//===- TargetTransformInfoImpl.h --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides helpers for the implementation of
+/// a TargetTransformInfo-conforming class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFOIMPL_H
+#define LLVM_ANALYSIS_TARGETTRANSFORMINFOIMPL_H
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+
+namespace llvm {
+
+/// \brief Base class for use as a mix-in that aids implementing
+/// a TargetTransformInfo-compatible class.
+class TargetTransformInfoImplBase {
+protected:
+  typedef TargetTransformInfo TTI;
+
+  const DataLayout *DL;
+
+  explicit TargetTransformInfoImplBase(const DataLayout *DL)
+      : DL(DL) {}
+
+public:
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  TargetTransformInfoImplBase(const TargetTransformInfoImplBase &Arg)
+      : DL(Arg.DL) {}
+  TargetTransformInfoImplBase(TargetTransformInfoImplBase &&Arg)
+      : DL(std::move(Arg.DL)) {}
+  TargetTransformInfoImplBase &
+  operator=(const TargetTransformInfoImplBase &RHS) {
+    DL = RHS.DL;
+    return *this;
+  }
+  TargetTransformInfoImplBase &operator=(TargetTransformInfoImplBase &&RHS) {
+    DL = std::move(RHS.DL);
+    return *this;
+  }
+
+  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
+    switch (Opcode) {
+    default:
+      // By default, just classify everything as 'basic'.
+      return TTI::TCC_Basic;
+
+    case Instruction::GetElementPtr:
+      llvm_unreachable("Use getGEPCost for GEP operations!");
+
+    case Instruction::BitCast:
+      assert(OpTy && "Cast instructions must provide the operand type");
+      if (Ty == OpTy || (Ty->isPointerTy() && OpTy->isPointerTy()))
+        // Identity and pointer-to-pointer casts are free.
+        return TTI::TCC_Free;
+
+      // Otherwise, the default basic cost is used.
+      return TTI::TCC_Basic;
+
+    case Instruction::IntToPtr: {
+      if (!DL)
+        return TTI::TCC_Basic;
+
+      // An inttoptr cast is free so long as the input is a legal integer type
+      // which doesn't contain values outside the range of a pointer.
+      unsigned OpSize = OpTy->getScalarSizeInBits();
+      if (DL->isLegalInteger(OpSize) &&
+          OpSize <= DL->getPointerTypeSizeInBits(Ty))
+        return TTI::TCC_Free;
+
+      // Otherwise it's not a no-op.
+      return TTI::TCC_Basic;
+    }
+    case Instruction::PtrToInt: {
+      if (!DL)
+        return TTI::TCC_Basic;
+
+      // A ptrtoint cast is free so long as the result is large enough to store
+      // the pointer, and a legal integer type.
+      unsigned DestSize = Ty->getScalarSizeInBits();
+      if (DL->isLegalInteger(DestSize) &&
+          DestSize >= DL->getPointerTypeSizeInBits(OpTy))
+        return TTI::TCC_Free;
+
+      // Otherwise it's not a no-op.
+      return TTI::TCC_Basic;
+    }
+    case Instruction::Trunc:
+      // trunc to a native type is free (assuming the target has compare and
+      // shift-right of the same width).
+      if (DL && DL->isLegalInteger(DL->getTypeSizeInBits(Ty)))
+        return TTI::TCC_Free;
+
+      return TTI::TCC_Basic;
+    }
+  }
+
+  unsigned getGEPCost(const Value *Ptr, ArrayRef<const Value *> Operands) {
+    // In the basic model, we just assume that all-constant GEPs will be folded
+    // into their uses via addressing modes.
+    for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx)
+      if (!isa<Constant>(Operands[Idx]))
+        return TTI::TCC_Basic;
+
+    return TTI::TCC_Free;
+  }
+
+  unsigned getCallCost(FunctionType *FTy, int NumArgs) {
+    assert(FTy && "FunctionType must be provided to this routine.");
+
+    // The target-independent implementation just measures the size of the
+    // function by approximating that each argument will take on average one
+    // instruction to prepare.
+
+    if (NumArgs < 0)
+      // Set the argument number to the number of explicit arguments in the
+      // function.
+      NumArgs = FTy->getNumParams();
+
+    return TTI::TCC_Basic * (NumArgs + 1);
+  }
+
+  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<Type *> ParamTys) {
+    switch (IID) {
+    default:
+      // Intrinsics rarely (if ever) have normal argument setup constraints.
+      // Model them as having a basic instruction cost.
+      // FIXME: This is wrong for libc intrinsics.
+      return TTI::TCC_Basic;
+
+    case Intrinsic::annotation:
+    case Intrinsic::assume:
+    case Intrinsic::dbg_declare:
+    case Intrinsic::dbg_value:
+    case Intrinsic::invariant_start:
+    case Intrinsic::invariant_end:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::objectsize:
+    case Intrinsic::ptr_annotation:
+    case Intrinsic::var_annotation:
+    case Intrinsic::experimental_gc_result_int:
+    case Intrinsic::experimental_gc_result_float:
+    case Intrinsic::experimental_gc_result_ptr:
+    case Intrinsic::experimental_gc_result:
+    case Intrinsic::experimental_gc_relocate:
+      // These intrinsics don't actually represent code after lowering.
+      return TTI::TCC_Free;
+    }
+  }
+
+  bool hasBranchDivergence() { return false; }
+
+  bool isLoweredToCall(const Function *F) {
+    // FIXME: These should almost certainly not be handled here, and instead
+    // handled with the help of TLI or the target itself. This was largely
+    // ported from existing analysis heuristics here so that such refactorings
+    // can take place in the future.
+
+    if (F->isIntrinsic())
+      return false;
+
+    if (F->hasLocalLinkage() || !F->hasName())
+      return true;
+
+    StringRef Name = F->getName();
+
+    // These will all likely lower to a single selection DAG node.
+    if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" ||
+        Name == "fabs" || Name == "fabsf" || Name == "fabsl" || Name == "sin" ||
+        Name == "fmin" || Name == "fminf" || Name == "fminl" ||
+        Name == "fmax" || Name == "fmaxf" || Name == "fmaxl" ||
+        Name == "sinf" || Name == "sinl" || Name == "cos" || Name == "cosf" ||
+        Name == "cosl" || Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl")
+      return false;
+
+    // These are all likely to be optimized into something smaller.
+    if (Name == "pow" || Name == "powf" || Name == "powl" || Name == "exp2" ||
+        Name == "exp2l" || Name == "exp2f" || Name == "floor" ||
+        Name == "floorf" || Name == "ceil" || Name == "round" ||
+        Name == "ffs" || Name == "ffsl" || Name == "abs" || Name == "labs" ||
+        Name == "llabs")
+      return false;
+
+    return true;
+  }
+
+  void getUnrollingPreferences(Loop *, TTI::UnrollingPreferences &) {}
+
+  bool isLegalAddImmediate(int64_t Imm) { return false; }
+
+  bool isLegalICmpImmediate(int64_t Imm) { return false; }
+
+  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
+                             bool HasBaseReg, int64_t Scale) {
+    // Guess that reg+reg addressing is allowed. This heuristic is taken from
+    // the implementation of LSR.
+    return !BaseGV && BaseOffset == 0 && Scale <= 1;
+  }
+
+  bool isLegalMaskedStore(Type *DataType, int Consecutive) { return false; }
+
+  bool isLegalMaskedLoad(Type *DataType, int Consecutive) { return false; }
+
+  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
+                           bool HasBaseReg, int64_t Scale) {
+    // Guess that all legal addressing mode are free.
+    if (isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale))
+      return 0;
+    return -1;
+  }
+
+  bool isTruncateFree(Type *Ty1, Type *Ty2) { return false; }
+
+  bool isProfitableToHoist(Instruction *I) { return true; }
+
+  bool isTypeLegal(Type *Ty) { return false; }
+
+  unsigned getJumpBufAlignment() { return 0; }
+
+  unsigned getJumpBufSize() { return 0; }
+
+  bool shouldBuildLookupTables() { return true; }
+
+  TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) {
+    return TTI::PSK_Software;
+  }
+
+  bool haveFastSqrt(Type *Ty) { return false; }
+
+  unsigned getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Basic; }
+
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) { return TTI::TCC_Basic; }
+
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty) {
+    return TTI::TCC_Free;
+  }
+
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty) {
+    return TTI::TCC_Free;
+  }
+
+  unsigned getNumberOfRegisters(bool Vector) { return 8; }
+
+  unsigned getRegisterBitWidth(bool Vector) { return 32; }
+
+  unsigned getMaxInterleaveFactor() { return 1; }
+
+  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                  TTI::OperandValueKind Opd1Info,
+                                  TTI::OperandValueKind Opd2Info,
+                                  TTI::OperandValueProperties Opd1PropInfo,
+                                  TTI::OperandValueProperties Opd2PropInfo) {
+    return 1;
+  }
+
+  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Ty, int Index,
+                          Type *SubTp) {
+    return 1;
+  }
+
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { return 1; }
+
+  unsigned getCFInstrCost(unsigned Opcode) { return 1; }
+
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+    return 1;
+  }
+
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+    return 1;
+  }
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) {
+    return 1;
+  }
+
+  unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) {
+    return 1;
+  }
+
+  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                                 ArrayRef<Type *> Tys) {
+    return 1;
+  }
+
+  unsigned getNumberOfParts(Type *Tp) { return 0; }
+
+  unsigned getAddressComputationCost(Type *Tp, bool) { return 0; }
+
+  unsigned getReductionCost(unsigned, Type *, bool) { return 1; }
+
+  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { return 0; }
+
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) {
+    return false;
+  }
+
+  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                           Type *ExpectedType) {
+    return nullptr;
+  }
+};
+
+/// \brief CRTP base class for use as a mix-in that aids implementing
+/// a TargetTransformInfo-compatible class.
+template <typename T>
+class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
+private:
+  typedef TargetTransformInfoImplBase BaseT;
+
+protected:
+  explicit TargetTransformInfoImplCRTPBase(const DataLayout *DL)
+      : BaseT(DL) {}
+
+public:
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  TargetTransformInfoImplCRTPBase(const TargetTransformInfoImplCRTPBase &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)) {}
+  TargetTransformInfoImplCRTPBase(TargetTransformInfoImplCRTPBase &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))) {}
+  TargetTransformInfoImplCRTPBase &
+  operator=(const TargetTransformInfoImplCRTPBase &RHS) {
+    BaseT::operator=(static_cast<const BaseT &>(RHS));
+    return *this;
+  }
+  TargetTransformInfoImplCRTPBase &
+  operator=(TargetTransformInfoImplCRTPBase &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    return *this;
+  }
+
+  using BaseT::getCallCost;
+
+  unsigned getCallCost(const Function *F, int NumArgs) {
+    assert(F && "A concrete function must be provided to this routine.");
+
+    if (NumArgs < 0)
+      // Set the argument number to the number of explicit arguments in the
+      // function.
+      NumArgs = F->arg_size();
+
+    if (Intrinsic::ID IID = (Intrinsic::ID)F->getIntrinsicID()) {
+      FunctionType *FTy = F->getFunctionType();
+      SmallVector<Type *, 8> ParamTys(FTy->param_begin(), FTy->param_end());
+      return static_cast<T *>(this)
+          ->getIntrinsicCost(IID, FTy->getReturnType(), ParamTys);
+    }
+
+    if (!static_cast<T *>(this)->isLoweredToCall(F))
+      return TTI::TCC_Basic; // Give a basic cost if it will be lowered
+                             // directly.
+
+    return static_cast<T *>(this)->getCallCost(F->getFunctionType(), NumArgs);
+  }
+
+  unsigned getCallCost(const Function *F, ArrayRef<const Value *> Arguments) {
+    // Simply delegate to generic handling of the call.
+    // FIXME: We should use instsimplify or something else to catch calls which
+    // will constant fold with these arguments.
+    return static_cast<T *>(this)->getCallCost(F, Arguments.size());
+  }
+
+  using BaseT::getIntrinsicCost;
+
+  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<const Value *> Arguments) {
+    // Delegate to the generic intrinsic handling code. This mostly provides an
+    // opportunity for targets to (for example) special case the cost of
+    // certain intrinsics based on constants used as arguments.
+    SmallVector<Type *, 8> ParamTys;
+    ParamTys.reserve(Arguments.size());
+    for (unsigned Idx = 0, Size = Arguments.size(); Idx != Size; ++Idx)
+      ParamTys.push_back(Arguments[Idx]->getType());
+    return static_cast<T *>(this)->getIntrinsicCost(IID, RetTy, ParamTys);
+  }
+
+  unsigned getUserCost(const User *U) {
+    if (isa<PHINode>(U))
+      return TTI::TCC_Free; // Model all PHI nodes as free.
+
+    if (const GEPOperator *GEP = dyn_cast<GEPOperator>(U)) {
+      SmallVector<const Value *, 4> Indices(GEP->idx_begin(), GEP->idx_end());
+      return static_cast<T *>(this)
+          ->getGEPCost(GEP->getPointerOperand(), Indices);
+    }
+
+    if (ImmutableCallSite CS = U) {
+      const Function *F = CS.getCalledFunction();
+      if (!F) {
+        // Just use the called value type.
+        Type *FTy = CS.getCalledValue()->getType()->getPointerElementType();
+        return static_cast<T *>(this)
+            ->getCallCost(cast<FunctionType>(FTy), CS.arg_size());
+      }
+
+      SmallVector<const Value *, 8> Arguments(CS.arg_begin(), CS.arg_end());
+      return static_cast<T *>(this)->getCallCost(F, Arguments);
+    }
+
+    if (const CastInst *CI = dyn_cast<CastInst>(U)) {
+      // Result of a cmp instruction is often extended (to be used by other
+      // cmp instructions, logical or return instructions). These are usually
+      // nop on most sane targets.
+      if (isa<CmpInst>(CI->getOperand(0)))
+        return TTI::TCC_Free;
+    }
+
+    return static_cast<T *>(this)->getOperationCost(
+        Operator::getOpcode(U), U->getType(),
+        U->getNumOperands() == 1 ? U->getOperand(0)->getType() : nullptr);
+  }
+};
+}
+
+#endif
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h
index 6bbf4f4..ac8c3b7 100644
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h
@@ -25,7 +25,7 @@ namespace llvm {
   class DataLayout;
   class StringRef;
   class MDNode;
-  class AssumptionTracker;
+  class AssumptionCache;
   class DominatorTree;
   class TargetLibraryInfo;
 
@@ -37,9 +37,9 @@ namespace llvm {
   /// where V is a vector, the known zero and known one values are the
   /// same width as the vector element, and the bit is set only if it is true
   /// for all of the elements in the vector.
-  void computeKnownBits(Value *V,  APInt &KnownZero, APInt &KnownOne,
+  void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
                         const DataLayout *TD = nullptr, unsigned Depth = 0,
-                        AssumptionTracker *AT = nullptr,
+                        AssumptionCache *AC = nullptr,
                         const Instruction *CxtI = nullptr,
                         const DominatorTree *DT = nullptr);
   /// Compute known bits from the range metadata.
@@ -51,7 +51,7 @@ namespace llvm {
   /// one.  Convenience wrapper around computeKnownBits.
   void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
                       const DataLayout *TD = nullptr, unsigned Depth = 0,
-                      AssumptionTracker *AT = nullptr,
+                      AssumptionCache *AC = nullptr,
                       const Instruction *CxtI = nullptr,
                       const DominatorTree *DT = nullptr);
 
@@ -61,7 +61,7 @@ namespace llvm {
   /// integer or pointer type and vectors of integers.  If 'OrZero' is set then
   /// returns true if the given value is either a power of two or zero.
   bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero = false, unsigned Depth = 0,
-                              AssumptionTracker *AT = nullptr,
+                              AssumptionCache *AC = nullptr,
                               const Instruction *CxtI = nullptr,
                               const DominatorTree *DT = nullptr);
 
@@ -70,7 +70,7 @@ namespace llvm {
   /// non-zero when defined.  Supports values with integer or pointer type and
   /// vectors of integers.
   bool isKnownNonZero(Value *V, const DataLayout *TD = nullptr,
-                      unsigned Depth = 0, AssumptionTracker *AT = nullptr,
+                      unsigned Depth = 0, AssumptionCache *AC = nullptr,
                       const Instruction *CxtI = nullptr,
                       const DominatorTree *DT = nullptr);
 
@@ -83,13 +83,12 @@ namespace llvm {
   /// where V is a vector, the mask, known zero, and known one values are the
   /// same width as the vector element, and the bit is set only if it is true
   /// for all of the elements in the vector.
-  bool MaskedValueIsZero(Value *V, const APInt &Mask, 
+  bool MaskedValueIsZero(Value *V, const APInt &Mask,
                          const DataLayout *TD = nullptr, unsigned Depth = 0,
-                         AssumptionTracker *AT = nullptr,
+                         AssumptionCache *AC = nullptr,
                          const Instruction *CxtI = nullptr,
                          const DominatorTree *DT = nullptr);
 
-  
   /// ComputeNumSignBits - Return the number of times the sign bit of the
   /// register is replicated into the other bits.  We know that at least 1 bit
   /// is always equal to the sign bit (itself), but other cases can give us
@@ -99,8 +98,7 @@ namespace llvm {
   /// 'Op' must have a scalar integer type.
   ///
   unsigned ComputeNumSignBits(Value *Op, const DataLayout *TD = nullptr,
-                              unsigned Depth = 0,
-                              AssumptionTracker *AT = nullptr,
+                              unsigned Depth = 0, AssumptionCache *AC = nullptr,
                               const Instruction *CxtI = nullptr,
                               const DominatorTree *DT = nullptr);
 
@@ -118,6 +116,11 @@ namespace llvm {
   ///
   bool CannotBeNegativeZero(const Value *V, unsigned Depth = 0);
 
+  /// CannotBeOrderedLessThanZero - Return true if we can prove that the 
+  /// specified FP value is either a NaN or never less than 0.0.
+  ///
+  bool CannotBeOrderedLessThanZero(const Value *V, unsigned Depth = 0);
+
   /// isBytewiseValue - If the specified value can be set by repeating the same
   /// byte in memory, return the i8 value that it is represented with.  This is
   /// true for all i8 values obviously, but is also true for i32 0, i32 -1,
@@ -217,6 +220,17 @@ namespace llvm {
                                const DataLayout *DL = nullptr,
                                const DominatorTree *DT = nullptr);
 
+  enum class OverflowResult { AlwaysOverflows, MayOverflow, NeverOverflows };
+  OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS,
+                                               const DataLayout *DL,
+                                               AssumptionCache *AC,
+                                               const Instruction *CxtI,
+                                               const DominatorTree *DT);
+  OverflowResult computeOverflowForUnsignedAdd(Value *LHS, Value *RHS,
+                                               const DataLayout *DL,
+                                               AssumptionCache *AC,
+                                               const Instruction *CxtI,
+                                               const DominatorTree *DT);
 } // end namespace llvm
 
 #endif
diff --git a/include/llvm/Bitcode/BitCodes.h b/include/llvm/Bitcode/BitCodes.h
index ed2dcf8..3f7a77d 100644
--- a/include/llvm/Bitcode/BitCodes.h
+++ b/include/llvm/Bitcode/BitCodes.h
@@ -125,7 +125,7 @@ public:
     case Blob:
       return false;
     }
-    llvm_unreachable("Invalid encoding");
+    report_fatal_error("Invalid encoding");
   }
 
   /// isChar6 - Return true if this character is legal in the Char6 encoding.
diff --git a/include/llvm/Bitcode/BitcodeWriterPass.h b/include/llvm/Bitcode/BitcodeWriterPass.h
index eb85548..8fe9b7e 100644
--- a/include/llvm/Bitcode/BitcodeWriterPass.h
+++ b/include/llvm/Bitcode/BitcodeWriterPass.h
@@ -41,7 +41,7 @@ public:
 
   /// \brief Run the bitcode writer pass, and output the module to the selected
   /// output stream.
-  PreservedAnalyses run(Module *M);
+  PreservedAnalyses run(Module &M);
 
   static StringRef name() { return "BitcodeWriterPass"; }
 };
diff --git a/include/llvm/Bitcode/BitstreamReader.h b/include/llvm/Bitcode/BitstreamReader.h
index ecf8235..c20a160 100644
--- a/include/llvm/Bitcode/BitstreamReader.h
+++ b/include/llvm/Bitcode/BitstreamReader.h
@@ -50,8 +50,8 @@ private:
   /// information in the BlockInfo block. Only llvm-bcanalyzer uses this.
   bool IgnoreBlockInfoNames;
 
-  BitstreamReader(const BitstreamReader&) LLVM_DELETED_FUNCTION;
-  void operator=(const BitstreamReader&) LLVM_DELETED_FUNCTION;
+  BitstreamReader(const BitstreamReader&) = delete;
+  void operator=(const BitstreamReader&) = delete;
 public:
   BitstreamReader() : IgnoreBlockInfoNames(true) {
   }
@@ -61,9 +61,8 @@ public:
     init(Start, End);
   }
 
-  BitstreamReader(MemoryObject *bytes) : IgnoreBlockInfoNames(true) {
-    BitcodeBytes.reset(bytes);
-  }
+  BitstreamReader(std::unique_ptr<MemoryObject> BitcodeBytes)
+      : BitcodeBytes(std::move(BitcodeBytes)), IgnoreBlockInfoNames(true) {}
 
   BitstreamReader(BitstreamReader &&Other) {
     *this = std::move(Other);
@@ -259,8 +258,8 @@ public:
     AF_DontAutoprocessAbbrevs = 2
   };
 
-      /// Advance the current bitstream, returning the next entry in the stream.
-      BitstreamEntry advance(unsigned Flags = 0) {
+  /// Advance the current bitstream, returning the next entry in the stream.
+  BitstreamEntry advance(unsigned Flags = 0) {
     while (1) {
       unsigned Code = ReadCode();
       if (Code == bitc::END_BLOCK) {
@@ -302,7 +301,7 @@ public:
 
   /// Reset the stream to the specified bit number.
   void JumpToBit(uint64_t BitNo) {
-    uintptr_t ByteNo = uintptr_t(BitNo/8) & ~(sizeof(word_t)-1);
+    size_t ByteNo = size_t(BitNo/8) & ~(sizeof(word_t)-1);
     unsigned WordBitNo = unsigned(BitNo & (sizeof(word_t)*8-1));
     assert(canSkipToPos(ByteNo) && "Invalid location");
 
@@ -316,7 +315,8 @@ public:
   }
 
   void fillCurWord() {
-    assert(Size == 0 || NextChar < (unsigned)Size);
+    if (Size != 0 && NextChar >= Size)
+      report_fatal_error("Unexpected end of file");
 
     // Read the next word from the stream.
     uint8_t Array[sizeof(word_t)] = {0};
@@ -491,11 +491,11 @@ private:
   //===--------------------------------------------------------------------===//
 
 public:
-
   /// Return the abbreviation for the specified AbbrevId.
   const BitCodeAbbrev *getAbbrev(unsigned AbbrevID) {
-    unsigned AbbrevNo = AbbrevID-bitc::FIRST_APPLICATION_ABBREV;
-    assert(AbbrevNo < CurAbbrevs.size() && "Invalid abbrev #!");
+    unsigned AbbrevNo = AbbrevID - bitc::FIRST_APPLICATION_ABBREV;
+    if (AbbrevNo >= CurAbbrevs.size())
+      report_fatal_error("Invalid abbrev number");
     return CurAbbrevs[AbbrevNo].get();
   }
 
diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h
index c42ecfe..d842167 100644
--- a/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/include/llvm/Bitcode/LLVMBitCodes.h
@@ -137,16 +137,36 @@ namespace bitc {
 
   enum MetadataCodes {
     METADATA_STRING        = 1,   // MDSTRING:      [values]
-    // 2 is unused.
-    // 3 is unused.
+    METADATA_VALUE         = 2,   // VALUE:         [type num, value num]
+    METADATA_NODE          = 3,   // NODE:          [n x md num]
     METADATA_NAME          = 4,   // STRING:        [values]
-    // 5 is unused.
+    METADATA_DISTINCT_NODE = 5,   // DISTINCT_NODE: [n x md num]
     METADATA_KIND          = 6,   // [n x [id, name]]
-    // 7 is unused.
-    METADATA_NODE          = 8,   // NODE:          [n x (type num, value num)]
-    METADATA_FN_NODE       = 9,   // FN_NODE:       [n x (type num, value num)]
+    METADATA_LOCATION      = 7,   // [distinct, line, col, scope, inlined-at?]
+    METADATA_OLD_NODE      = 8,   // OLD_NODE:      [n x (type num, value num)]
+    METADATA_OLD_FN_NODE   = 9,   // OLD_FN_NODE:   [n x (type num, value num)]
     METADATA_NAMED_NODE    = 10,  // NAMED_NODE:    [n x mdnodes]
-    METADATA_ATTACHMENT    = 11   // [m x [value, [n x [id, mdnode]]]
+    METADATA_ATTACHMENT    = 11,  // [m x [value, [n x [id, mdnode]]]
+    METADATA_GENERIC_DEBUG = 12,  // [distinct, tag, vers, header, n x md num]
+    METADATA_SUBRANGE      = 13,  // [distinct, count, lo]
+    METADATA_ENUMERATOR    = 14,  // [distinct, value, name]
+    METADATA_BASIC_TYPE    = 15,  // [distinct, tag, name, size, align, enc]
+    METADATA_FILE          = 16,  // [distinct, filename, directory]
+    METADATA_DERIVED_TYPE  = 17,  // [distinct, ...]
+    METADATA_COMPOSITE_TYPE= 18,  // [distinct, ...]
+    METADATA_SUBROUTINE_TYPE=19,  // [distinct, flags, types]
+    METADATA_COMPILE_UNIT  = 20,  // [distinct, ...]
+    METADATA_SUBPROGRAM    = 21,  // [distinct, ...]
+    METADATA_LEXICAL_BLOCK = 22,  // [distinct, scope, file, line, column]
+    METADATA_LEXICAL_BLOCK_FILE=23,//[distinct, scope, file, discriminator]
+    METADATA_NAMESPACE     = 24,  // [distinct, scope, file, name, line]
+    METADATA_TEMPLATE_TYPE = 25,  // [distinct, scope, name, type, ...]
+    METADATA_TEMPLATE_VALUE= 26,  // [distinct, scope, name, type, value, ...]
+    METADATA_GLOBAL_VAR    = 27,  // [distinct, ...]
+    METADATA_LOCAL_VAR     = 28,  // [distinct, ...]
+    METADATA_EXPRESSION    = 29,  // [distinct, n x element]
+    METADATA_OBJC_PROPERTY = 30,  // [distinct, name, file, line, ...]
+    METADATA_IMPORTED_ENTITY=31,  // [distinct, tag, scope, entity, line, name]
   };
 
   // The constants block (CONSTANTS_BLOCK_ID) describes emission for each
@@ -273,7 +293,7 @@ namespace bitc {
 
     FUNC_CODE_INST_BINOP       =  2, // BINOP:      [opcode, ty, opval, opval]
     FUNC_CODE_INST_CAST        =  3, // CAST:       [opcode, ty, opty, opval]
-    FUNC_CODE_INST_GEP         =  4, // GEP:        [n x operands]
+    FUNC_CODE_INST_GEP_OLD     =  4, // GEP:        [n x operands]
     FUNC_CODE_INST_SELECT      =  5, // SELECT:     [ty, opval, opval, opval]
     FUNC_CODE_INST_EXTRACTELT  =  6, // EXTRACTELT: [opty, opval, opval]
     FUNC_CODE_INST_INSERTELT   =  7, // INSERTELT:  [ty, opval, opval, opval]
@@ -307,7 +327,7 @@ namespace bitc {
     FUNC_CODE_INST_CMP2        = 28, // CMP2:       [opty, opval, opval, pred]
     // new select on i1 or [N x i1]
     FUNC_CODE_INST_VSELECT     = 29, // VSELECT:    [ty,opval,opval,predty,pred]
-    FUNC_CODE_INST_INBOUNDS_GEP= 30, // INBOUNDS_GEP: [n x operands]
+    FUNC_CODE_INST_INBOUNDS_GEP_OLD = 30, // INBOUNDS_GEP: [n x operands]
     FUNC_CODE_INST_INDIRECTBR  = 31, // INDIRECTBR: [opty, op0, op1, ...]
     // 32 is unused.
     FUNC_CODE_DEBUG_LOC_AGAIN  = 33, // DEBUG_LOC_AGAIN
@@ -325,8 +345,9 @@ namespace bitc {
     FUNC_CODE_INST_LANDINGPAD  = 40, // LANDINGPAD: [ty,val,val,num,id0,val0...]
     FUNC_CODE_INST_LOADATOMIC  = 41, // LOAD: [opty, op, align, vol,
                                      //        ordering, synchscope]
-    FUNC_CODE_INST_STOREATOMIC = 42  // STORE: [ptrty,ptr,val, align, vol
+    FUNC_CODE_INST_STOREATOMIC = 42, // STORE: [ptrty,ptr,val, align, vol
                                      //         ordering, synchscope]
+    FUNC_CODE_INST_GEP         = 43, // GEP:  [inbounds, n x operands]
   };
 
   enum UseListCodes {
diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h
index 2e8cdc7..48bdabc 100644
--- a/include/llvm/Bitcode/ReaderWriter.h
+++ b/include/llvm/Bitcode/ReaderWriter.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_BITCODE_READERWRITER_H
 #define LLVM_BITCODE_READERWRITER_H
 
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include <memory>
@@ -30,27 +31,28 @@ namespace llvm {
   /// Read the header of the specified bitcode buffer and prepare for lazy
   /// deserialization of function bodies.  If successful, this moves Buffer. On
   /// error, this *does not* move Buffer.
-  ErrorOr<Module *> getLazyBitcodeModule(std::unique_ptr<MemoryBuffer> &&Buffer,
-                                         LLVMContext &Context);
-
-  /// getStreamedBitcodeModule - Read the header of the specified stream
-  /// and prepare for lazy deserialization and streaming of function bodies.
-  /// On error, this returns null, and fills in *ErrMsg with an error
-  /// description if ErrMsg is non-null.
-  Module *getStreamedBitcodeModule(const std::string &name,
-                                   DataStreamer *streamer,
-                                   LLVMContext &Context,
-                                   std::string *ErrMsg = nullptr);
+  ErrorOr<Module *>
+  getLazyBitcodeModule(std::unique_ptr<MemoryBuffer> &&Buffer,
+                       LLVMContext &Context,
+                       DiagnosticHandlerFunction DiagnosticHandler = nullptr);
+
+  /// Read the header of the specified stream and prepare for lazy
+  /// deserialization and streaming of function bodies.
+  ErrorOr<std::unique_ptr<Module>> getStreamedBitcodeModule(
+      StringRef Name, DataStreamer *Streamer, LLVMContext &Context,
+      DiagnosticHandlerFunction DiagnosticHandler = nullptr);
 
   /// Read the header of the specified bitcode buffer and extract just the
   /// triple information. If successful, this returns a string. On error, this
   /// returns "".
-  std::string getBitcodeTargetTriple(MemoryBufferRef Buffer,
-                                     LLVMContext &Context);
+  std::string
+  getBitcodeTargetTriple(MemoryBufferRef Buffer, LLVMContext &Context,
+                         DiagnosticHandlerFunction DiagnosticHandler = nullptr);
 
   /// Read the specified bitcode file, returning the module.
-  ErrorOr<Module *> parseBitcodeFile(MemoryBufferRef Buffer,
-                                     LLVMContext &Context);
+  ErrorOr<Module *>
+  parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context,
+                   DiagnosticHandlerFunction DiagnosticHandler = nullptr);
 
   /// WriteBitcodeToFile - Write the specified module to the specified
   /// raw output stream.  For streams where it matters, the given stream
@@ -141,32 +143,26 @@ namespace llvm {
   }
 
   const std::error_category &BitcodeErrorCategory();
-  enum class BitcodeError {
-    ConflictingMETADATA_KINDRecords,
-    CouldNotFindFunctionInStream,
-    ExpectedConstant,
-    InsufficientFunctionProtos,
-    InvalidBitcodeSignature,
-    InvalidBitcodeWrapperHeader,
-    InvalidConstantReference,
-    InvalidID, // A read identifier is not found in the table it should be in.
-    InvalidInstructionWithNoBB,
-    InvalidRecord, // A read record doesn't have the expected size or structure
-    InvalidTypeForValue, // Type read OK, but is invalid for its use
-    InvalidTYPETable,
-    InvalidType,    // We were unable to read a type
-    MalformedBlock, // We are unable to advance in the stream.
-    MalformedGlobalInitializerSet,
-    InvalidMultipleBlocks, // We found multiple blocks of a kind that should
-                           // have only one
-    NeverResolvedValueFoundInFunction,
-    NeverResolvedFunctionFromBlockAddress,
-    InvalidValue // Invalid version, inst number, attr number, etc
-  };
+  enum class BitcodeError { InvalidBitcodeSignature, CorruptedBitcode };
   inline std::error_code make_error_code(BitcodeError E) {
     return std::error_code(static_cast<int>(E), BitcodeErrorCategory());
   }
 
+  class BitcodeDiagnosticInfo : public DiagnosticInfo {
+    const Twine &Msg;
+    std::error_code EC;
+
+  public:
+    BitcodeDiagnosticInfo(std::error_code EC, DiagnosticSeverity Severity,
+                          const Twine &Msg);
+    void print(DiagnosticPrinter &DP) const override;
+    std::error_code getError() const { return EC; };
+
+    static bool classof(const DiagnosticInfo *DI) {
+      return DI->getKind() == DK_Bitcode;
+    }
+  };
+
 } // End llvm namespace
 
 namespace std {
diff --git a/include/llvm/CodeGen/Analysis.h b/include/llvm/CodeGen/Analysis.h
index 3132999..c4b94ed 100644
--- a/include/llvm/CodeGen/Analysis.h
+++ b/include/llvm/CodeGen/Analysis.h
@@ -31,10 +31,21 @@ class SDValue;
 class SelectionDAG;
 struct EVT;
 
-/// ComputeLinearIndex - Given an LLVM IR aggregate type and a sequence
-/// of insertvalue or extractvalue indices that identify a member, return
-/// the linearized index of the start of the member.
+/// \brief Compute the linearized index of a member in a nested
+/// aggregate/struct/array.
 ///
+/// Given an LLVM IR aggregate type and a sequence of insertvalue or
+/// extractvalue indices that identify a member, return the linearized index of
+/// the start of the member, i.e the number of element in memory before the
+/// seeked one. This is disconnected from the number of bytes.
+///
+/// \param Ty is the type indexed by \p Indices.
+/// \param Indices is an optional pointer in the indices list to the current
+/// index.
+/// \param IndicesEnd is the end of the indices list.
+/// \param CurIndex is the current index in the recursion.
+///
+/// \returns \p CurIndex plus the linear index in \p Ty  the indices list.
 unsigned ComputeLinearIndex(Type *Ty,
                             const unsigned *Indices,
                             const unsigned *IndicesEnd,
diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index 25b99a2..ac224d8 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h
@@ -44,8 +44,8 @@ class MachineModuleInfo;
 class MCAsmInfo;
 class MCCFIInstruction;
 class MCContext;
+class MCExpr;
 class MCInst;
-class MCInstrInfo;
 class MCSection;
 class MCStreamer;
 class MCSubtargetInfo;
@@ -68,7 +68,6 @@ public:
   ///
   const MCAsmInfo *MAI;
 
-  const MCInstrInfo *MII;
   /// This is the context for the output file that we are streaming. This owns
   /// all of the global MC-related objects for the generated translation unit.
   MCContext &OutContext;
@@ -98,6 +97,11 @@ public:
   /// default, this is equal to CurrentFnSym.
   MCSymbol *CurrentFnSymForSize;
 
+  /// Map global GOT equivalent MCSymbols to GlobalVariables and keep track of
+  /// its number of uses by other globals.
+  typedef std::pair<const GlobalVariable *, unsigned> GOTEquivUsePair;
+  DenseMap<const MCSymbol *, GOTEquivUsePair> GlobalGOTEquivs;
+
 private:
   // The garbage collection metadata printer table.
   void *GCMetadataPrinters; // Really a DenseMap.
@@ -126,12 +130,13 @@ private:
   DwarfDebug *DD;
 
 protected:
-  explicit AsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
+  explicit AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
 
 public:
   virtual ~AsmPrinter();
 
   DwarfDebug *getDwarfDebug() { return DD; }
+  DwarfDebug *getDwarfDebug() const { return DD; }
 
   /// Return true if assembly output should contain comments.
   ///
@@ -203,6 +208,8 @@ public:
 
   void emitCFIInstruction(const MachineInstr &MI);
 
+  void emitFrameAlloc(const MachineInstr &MI);
+
   enum CFIMoveType { CFI_M_None, CFI_M_EH, CFI_M_Debug };
   CFIMoveType needsCFIMoves();
 
@@ -234,13 +241,27 @@ public:
   ///
   void EmitAlignment(unsigned NumBits, const GlobalObject *GO = nullptr) const;
 
-  /// This method prints the label for the specified MachineBasicBlock, an
-  /// alignment (if present) and a comment describing it if appropriate.
-  void EmitBasicBlockStart(const MachineBasicBlock &MBB) const;
+  /// Lower the specified LLVM Constant to an MCExpr.
+  const MCExpr *lowerConstant(const Constant *CV);
 
   /// \brief Print a general LLVM constant to the .s file.
   void EmitGlobalConstant(const Constant *CV);
 
+  /// \brief Unnamed constant global variables solely contaning a pointer to
+  /// another globals variable act like a global variable "proxy", or GOT
+  /// equivalents, i.e., it's only used to hold the address of the latter. One
+  /// optimization is to replace accesses to these proxies by using the GOT
+  /// entry for the final global instead. Hence, we select GOT equivalent
+  /// candidates among all the module global variables, avoid emitting them
+  /// unnecessarily and finally replace references to them by pc relative
+  /// accesses to GOT entries.
+  void computeGlobalGOTEquivs(Module &M);
+
+  /// \brief Constant expressions using GOT equivalent globals may not be
+  /// eligible for PC relative GOT entry conversion, in such cases we need to
+  /// emit the proxies we previously omitted in EmitGlobalVariable.
+  void emitGlobalGOTEquivs();
+
   //===------------------------------------------------------------------===//
   // Overridable Hooks
   //===------------------------------------------------------------------===//
@@ -264,6 +285,12 @@ public:
   /// function.
   virtual void EmitFunctionBodyEnd() {}
 
+  /// Targets can override this to emit stuff at the start of a basic block.
+  /// By default, this method prints the label for the specified
+  /// MachineBasicBlock, an alignment (if present) and a comment describing it
+  /// if appropriate.
+  virtual void EmitBasicBlockStart(const MachineBasicBlock &MBB) const;
+
   /// Targets can override this to emit stuff at the end of a basic block.
   virtual void EmitBasicBlockEnd(const MachineBasicBlock &MBB) {}
 
@@ -299,10 +326,10 @@ public:
 public:
   /// Return the MCSymbol corresponding to the assembler temporary label with
   /// the specified stem and unique ID.
-  MCSymbol *GetTempSymbol(Twine Name, unsigned ID) const;
+  MCSymbol *GetTempSymbol(const Twine &Name, unsigned ID) const;
 
   /// Return an assembler temporary label with the specified stem.
-  MCSymbol *GetTempSymbol(Twine Name) const;
+  MCSymbol *GetTempSymbol(const Twine &Name) const;
 
   /// Return the MCSymbol for a private symbol with global value name as its
   /// base, with the specified suffix.
@@ -397,7 +424,7 @@ public:
                          const MCSymbol *SectionLabel) const;
 
   /// Get the value for DW_AT_APPLE_isa. Zero if no isa encoding specified.
-  virtual unsigned getISAEncoding() { return 0; }
+  virtual unsigned getISAEncoding(const Function *) { return 0; }
 
   /// Emit a dwarf register operation for describing
   /// - a small value occupying only part of a register or
@@ -423,9 +450,8 @@ public:
                            unsigned PieceOffset = 0) const;
 
   /// EmitDwarfRegOp - Emit a dwarf register operation.
-  /// \param Indirect   whether this is a register-indirect address
-  virtual void EmitDwarfRegOp(ByteStreamer &BS, const MachineLocation &MLoc,
-                              bool Indirect) const;
+  virtual void EmitDwarfRegOp(ByteStreamer &BS,
+                              const MachineLocation &MLoc) const;
 
   //===------------------------------------------------------------------===//
   // Dwarf Lowering Routines
@@ -465,6 +491,10 @@ public:
                                      unsigned AsmVariant, const char *ExtraCode,
                                      raw_ostream &OS);
 
+  /// Let the target do anything it needs to do before emitting inlineasm.
+  /// \p StartInfo - the subtarget info before parsing inline asm
+  virtual void emitInlineAsmStart() const;
+
   /// Let the target do anything it needs to do after emitting inlineasm.
   /// This callback can be used restore the original mode in case the
   /// inlineasm contains directives to switch modes.
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
new file mode 100644
index 0000000..ff85b06
--- /dev/null
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -0,0 +1,723 @@
+//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides a helper that implements much of the TTI interface in
+/// terms of the target-independent code generator and TargetLowering
+/// interfaces.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
+#define LLVM_CODEGEN_BASICTTIIMPL_H
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfoImpl.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+namespace llvm {
+
+extern cl::opt<unsigned> PartialUnrollingThreshold;
+
+/// \brief Base class which can be used to help build a TTI implementation.
+///
+/// This class provides as much implementation of the TTI interface as is
+/// possible using the target independent parts of the code generator.
+///
+/// In order to subclass it, your class must implement a getST() method to
+/// return the subtarget, and a getTLI() method to return the target lowering.
+/// We need these methods implemented in the derived class so that this class
+/// doesn't have to duplicate storage for them.
+template <typename T>
+class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
+private:
+  typedef TargetTransformInfoImplCRTPBase<T> BaseT;
+  typedef TargetTransformInfo TTI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
+    assert(Ty->isVectorTy() && "Can only scalarize vectors");
+    unsigned Cost = 0;
+
+    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+      if (Insert)
+        Cost += static_cast<T *>(this)
+                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+      if (Extract)
+        Cost += static_cast<T *>(this)
+                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+    }
+
+    return Cost;
+  }
+
+  /// Estimate the cost overhead of SK_Alternate shuffle.
+  unsigned getAltShuffleOverhead(Type *Ty) {
+    assert(Ty->isVectorTy() && "Can only shuffle vectors");
+    unsigned Cost = 0;
+    // Shuffle cost is equal to the cost of extracting element from its argument
+    // plus the cost of inserting them onto the result vector.
+
+    // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from
+    // index 0 of first vector, index 1 of second vector,index 2 of first
+    // vector and finally index 3 of second vector and insert them at index
+    // <0,1,2,3> of result vector.
+    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+      Cost += static_cast<T *>(this)
+                  ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+      Cost += static_cast<T *>(this)
+                  ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+    }
+    return Cost;
+  }
+
+  /// \brief Local query method delegates up to T which *must* implement this!
+  const TargetSubtargetInfo *getST() const {
+    return static_cast<const T *>(this)->getST();
+  }
+
+  /// \brief Local query method delegates up to T which *must* implement this!
+  const TargetLoweringBase *getTLI() const {
+    return static_cast<const T *>(this)->getTLI();
+  }
+
+protected:
+  explicit BasicTTIImplBase(const TargetMachine *TM)
+      : BaseT(TM->getDataLayout()) {}
+
+public:
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  BasicTTIImplBase(const BasicTTIImplBase &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)) {}
+  BasicTTIImplBase(BasicTTIImplBase &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))) {}
+  BasicTTIImplBase &operator=(const BasicTTIImplBase &RHS) {
+    BaseT::operator=(static_cast<const BaseT &>(RHS));
+    return *this;
+  }
+  BasicTTIImplBase &operator=(BasicTTIImplBase &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    return *this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  bool hasBranchDivergence() { return false; }
+
+  bool isLegalAddImmediate(int64_t imm) {
+    return getTLI()->isLegalAddImmediate(imm);
+  }
+
+  bool isLegalICmpImmediate(int64_t imm) {
+    return getTLI()->isLegalICmpImmediate(imm);
+  }
+
+  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
+                             bool HasBaseReg, int64_t Scale) {
+    TargetLoweringBase::AddrMode AM;
+    AM.BaseGV = BaseGV;
+    AM.BaseOffs = BaseOffset;
+    AM.HasBaseReg = HasBaseReg;
+    AM.Scale = Scale;
+    return getTLI()->isLegalAddressingMode(AM, Ty);
+  }
+
+  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
+                           bool HasBaseReg, int64_t Scale) {
+    TargetLoweringBase::AddrMode AM;
+    AM.BaseGV = BaseGV;
+    AM.BaseOffs = BaseOffset;
+    AM.HasBaseReg = HasBaseReg;
+    AM.Scale = Scale;
+    return getTLI()->getScalingFactorCost(AM, Ty);
+  }
+
+  bool isTruncateFree(Type *Ty1, Type *Ty2) {
+    return getTLI()->isTruncateFree(Ty1, Ty2);
+  }
+
+  bool isProfitableToHoist(Instruction *I) {
+    return getTLI()->isProfitableToHoist(I);
+  }
+
+  bool isTypeLegal(Type *Ty) {
+    EVT VT = getTLI()->getValueType(Ty);
+    return getTLI()->isTypeLegal(VT);
+  }
+
+  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<const Value *> Arguments) {
+    return BaseT::getIntrinsicCost(IID, RetTy, Arguments);
+  }
+
+  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<Type *> ParamTys) {
+    if (IID == Intrinsic::cttz) {
+      if (getTLI()->isCheapToSpeculateCttz())
+        return TargetTransformInfo::TCC_Basic;
+      return TargetTransformInfo::TCC_Expensive;
+    }
+
+    if (IID == Intrinsic::ctlz) {
+       if (getTLI()->isCheapToSpeculateCtlz())
+        return TargetTransformInfo::TCC_Basic;
+      return TargetTransformInfo::TCC_Expensive;
+    }
+
+    return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);
+  }
+
+  unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }
+
+  unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }
+
+  bool shouldBuildLookupTables() {
+    const TargetLoweringBase *TLI = getTLI();
+    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
+           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
+  }
+
+  bool haveFastSqrt(Type *Ty) {
+    const TargetLoweringBase *TLI = getTLI();
+    EVT VT = TLI->getValueType(Ty);
+    return TLI->isTypeLegal(VT) &&
+           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
+  }
+
+  unsigned getFPOpCost(Type *Ty) {
+    // By default, FP instructions are no more expensive since they are
+    // implemented in HW.  Target specific TTI can override this.
+    return TargetTransformInfo::TCC_Basic;
+  }
+
+  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
+    const TargetLoweringBase *TLI = getTLI();
+    switch (Opcode) {
+    default: break;
+    case Instruction::Trunc: {
+      if (TLI->isTruncateFree(OpTy, Ty))
+        return TargetTransformInfo::TCC_Free;
+      return TargetTransformInfo::TCC_Basic;
+    }
+    case Instruction::ZExt: {
+      if (TLI->isZExtFree(OpTy, Ty))
+        return TargetTransformInfo::TCC_Free;
+      return TargetTransformInfo::TCC_Basic;
+    }
+    }
+
+    return BaseT::getOperationCost(Opcode, Ty, OpTy);
+  }
+
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) {
+    // This unrolling functionality is target independent, but to provide some
+    // motivation for its intended use, for x86:
+
+    // According to the Intel 64 and IA-32 Architectures Optimization Reference
+    // Manual, Intel Core models and later have a loop stream detector (and
+    // associated uop queue) that can benefit from partial unrolling.
+    // The relevant requirements are:
+    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
+    //    taken, and none of them may be calls.
+    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+    // According to the Software Optimization Guide for AMD Family 15h
+    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
+    // and loop buffer which can benefit from partial unrolling.
+    // The relevant requirements are:
+    //  - The loop must have fewer than 16 branches
+    //  - The loop must have less than 40 uops in all executed loop branches
+
+    // The number of taken branches in a loop is hard to estimate here, and
+    // benchmarking has revealed that it is better not to be conservative when
+    // estimating the branch count. As a result, we'll ignore the branch limits
+    // until someone finds a case where it matters in practice.
+
+    unsigned MaxOps;
+    const TargetSubtargetInfo *ST = getST();
+    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
+      MaxOps = PartialUnrollingThreshold;
+    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
+      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
+    else
+      return;
+
+    // Scan the loop: don't unroll loops with calls.
+    for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
+         ++I) {
+      BasicBlock *BB = *I;
+
+      for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
+        if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
+          ImmutableCallSite CS(J);
+          if (const Function *F = CS.getCalledFunction()) {
+            if (!static_cast<T *>(this)->isLoweredToCall(F))
+              continue;
+          }
+
+          return;
+        }
+    }
+
+    // Enable runtime and partial unrolling up to the specified size.
+    UP.Partial = UP.Runtime = true;
+    UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+  }
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector) { return 1; }
+
+  unsigned getRegisterBitWidth(bool Vector) { return 32; }
+
+  unsigned getMaxInterleaveFactor() { return 1; }
+
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) {
+    // Check if any of the operands are vector operands.
+    const TargetLoweringBase *TLI = getTLI();
+    int ISD = TLI->InstructionOpcodeToISD(Opcode);
+    assert(ISD && "Invalid opcode");
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+    bool IsFloat = Ty->getScalarType()->isFloatingPointTy();
+    // Assume that floating point arithmetic operations cost twice as much as
+    // integer operations.
+    unsigned OpCost = (IsFloat ? 2 : 1);
+
+    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+      // The operation is legal. Assume it costs 1.
+      // If the type is split to multiple registers, assume that there is some
+      // overhead to this.
+      // TODO: Once we have extract/insert subvector cost we need to use them.
+      if (LT.first > 1)
+        return LT.first * 2 * OpCost;
+      return LT.first * 1 * OpCost;
+    }
+
+    if (!TLI->isOperationExpand(ISD, LT.second)) {
+      // If the operation is custom lowered then assume
+      // thare the code is twice as expensive.
+      return LT.first * 2 * OpCost;
+    }
+
+    // Else, assume that we need to scalarize this op.
+    if (Ty->isVectorTy()) {
+      unsigned Num = Ty->getVectorNumElements();
+      unsigned Cost = static_cast<T *>(this)
+                          ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
+      // return the cost of multiple scalar invocation plus the cost of
+      // inserting
+      // and extracting the values.
+      return getScalarizationOverhead(Ty, true, true) + Num * Cost;
+    }
+
+    // We don't know anything about this scalar instruction.
+    return OpCost;
+  }
+
+  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                          Type *SubTp) {
+    if (Kind == TTI::SK_Alternate) {
+      return getAltShuffleOverhead(Tp);
+    }
+    return 1;
+  }
+
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+    const TargetLoweringBase *TLI = getTLI();
+    int ISD = TLI->InstructionOpcodeToISD(Opcode);
+    assert(ISD && "Invalid opcode");
+
+    std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(Src);
+    std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(Dst);
+
+    // Check for NOOP conversions.
+    if (SrcLT.first == DstLT.first &&
+        SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
+
+      // Bitcast between types that are legalized to the same type are free.
+      if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
+        return 0;
+    }
+
+    if (Opcode == Instruction::Trunc &&
+        TLI->isTruncateFree(SrcLT.second, DstLT.second))
+      return 0;
+
+    if (Opcode == Instruction::ZExt &&
+        TLI->isZExtFree(SrcLT.second, DstLT.second))
+      return 0;
+
+    // If the cast is marked as legal (or promote) then assume low cost.
+    if (SrcLT.first == DstLT.first &&
+        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
+      return 1;
+
+    // Handle scalar conversions.
+    if (!Src->isVectorTy() && !Dst->isVectorTy()) {
+
+      // Scalar bitcasts are usually free.
+      if (Opcode == Instruction::BitCast)
+        return 0;
+
+      // Just check the op cost. If the operation is legal then assume it costs
+      // 1.
+      if (!TLI->isOperationExpand(ISD, DstLT.second))
+        return 1;
+
+      // Assume that illegal scalar instruction are expensive.
+      return 4;
+    }
+
+    // Check vector-to-vector casts.
+    if (Dst->isVectorTy() && Src->isVectorTy()) {
+
+      // If the cast is between same-sized registers, then the check is simple.
+      if (SrcLT.first == DstLT.first &&
+          SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
+
+        // Assume that Zext is done using AND.
+        if (Opcode == Instruction::ZExt)
+          return 1;
+
+        // Assume that sext is done using SHL and SRA.
+        if (Opcode == Instruction::SExt)
+          return 2;
+
+        // Just check the op cost. If the operation is legal then assume it
+        // costs
+        // 1 and multiply by the type-legalization overhead.
+        if (!TLI->isOperationExpand(ISD, DstLT.second))
+          return SrcLT.first * 1;
+      }
+
+      // If we are converting vectors and the operation is illegal, or
+      // if the vectors are legalized to different types, estimate the
+      // scalarization costs.
+      unsigned Num = Dst->getVectorNumElements();
+      unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
+          Opcode, Dst->getScalarType(), Src->getScalarType());
+
+      // Return the cost of multiple scalar invocation plus the cost of
+      // inserting and extracting the values.
+      return getScalarizationOverhead(Dst, true, true) + Num * Cost;
+    }
+
+    // We already handled vector-to-vector and scalar-to-scalar conversions.
+    // This
+    // is where we handle bitcast between vectors and scalars. We need to assume
+    //  that the conversion is scalarized in one way or another.
+    if (Opcode == Instruction::BitCast)
+      // Illegal bitcasts are done by storing and loading from a stack slot.
+      return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
+                                : 0) +
+             (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
+                                : 0);
+
+    llvm_unreachable("Unhandled cast");
+  }
+
+  unsigned getCFInstrCost(unsigned Opcode) {
+    // Branches are assumed to be predicted.
+    return 0;
+  }
+
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+    const TargetLoweringBase *TLI = getTLI();
+    int ISD = TLI->InstructionOpcodeToISD(Opcode);
+    assert(ISD && "Invalid opcode");
+
+    // Selects on vectors are actually vector selects.
+    if (ISD == ISD::SELECT) {
+      assert(CondTy && "CondTy must exist");
+      if (CondTy->isVectorTy())
+        ISD = ISD::VSELECT;
+    }
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+
+    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
+        !TLI->isOperationExpand(ISD, LT.second)) {
+      // The operation is legal. Assume it costs 1. Multiply
+      // by the type-legalization overhead.
+      return LT.first * 1;
+    }
+
+    // Otherwise, assume that the cast is scalarized.
+    if (ValTy->isVectorTy()) {
+      unsigned Num = ValTy->getVectorNumElements();
+      if (CondTy)
+        CondTy = CondTy->getScalarType();
+      unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
+          Opcode, ValTy->getScalarType(), CondTy);
+
+      // Return the cost of multiple scalar invocation plus the cost of
+      // inserting
+      // and extracting the values.
+      return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
+    }
+
+    // Unknown scalar opcode.
+    return 1;
+  }
+
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+    std::pair<unsigned, MVT> LT =
+        getTLI()->getTypeLegalizationCost(Val->getScalarType());
+
+    return LT.first;
+  }
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) {
+    assert(!Src->isVoidTy() && "Invalid type");
+    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Src);
+
+    // Assuming that all loads of legal types cost 1.
+    unsigned Cost = LT.first;
+
+    if (Src->isVectorTy() &&
+        Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
+      // This is a vector load that legalizes to a larger type than the vector
+      // itself. Unless the corresponding extending load or truncating store is
+      // legal, then this will scalarize.
+      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
+      EVT MemVT = getTLI()->getValueType(Src, true);
+      if (MemVT.isSimple() && MemVT != MVT::Other) {
+        if (Opcode == Instruction::Store)
+          LA = getTLI()->getTruncStoreAction(LT.second, MemVT.getSimpleVT());
+        else
+          LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
+      }
+
+      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
+        // This is a vector load/store for some illegal type that is scalarized.
+        // We must account for the cost of building or decomposing the vector.
+        Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
+                                         Opcode == Instruction::Store);
+      }
+    }
+
+    return Cost;
+  }
+
+  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                                 ArrayRef<Type *> Tys) {
+    unsigned ISD = 0;
+    switch (IID) {
+    default: {
+      // Assume that we need to scalarize this intrinsic.
+      unsigned ScalarizationCost = 0;
+      unsigned ScalarCalls = 1;
+      if (RetTy->isVectorTy()) {
+        ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
+      }
+      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
+        if (Tys[i]->isVectorTy()) {
+          ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+          ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
+        }
+      }
+
+      return ScalarCalls + ScalarizationCost;
+    }
+    // Look for intrinsics that can be lowered directly or turned into a scalar
+    // intrinsic call.
+    case Intrinsic::sqrt:
+      ISD = ISD::FSQRT;
+      break;
+    case Intrinsic::sin:
+      ISD = ISD::FSIN;
+      break;
+    case Intrinsic::cos:
+      ISD = ISD::FCOS;
+      break;
+    case Intrinsic::exp:
+      ISD = ISD::FEXP;
+      break;
+    case Intrinsic::exp2:
+      ISD = ISD::FEXP2;
+      break;
+    case Intrinsic::log:
+      ISD = ISD::FLOG;
+      break;
+    case Intrinsic::log10:
+      ISD = ISD::FLOG10;
+      break;
+    case Intrinsic::log2:
+      ISD = ISD::FLOG2;
+      break;
+    case Intrinsic::fabs:
+      ISD = ISD::FABS;
+      break;
+    case Intrinsic::minnum:
+      ISD = ISD::FMINNUM;
+      break;
+    case Intrinsic::maxnum:
+      ISD = ISD::FMAXNUM;
+      break;
+    case Intrinsic::copysign:
+      ISD = ISD::FCOPYSIGN;
+      break;
+    case Intrinsic::floor:
+      ISD = ISD::FFLOOR;
+      break;
+    case Intrinsic::ceil:
+      ISD = ISD::FCEIL;
+      break;
+    case Intrinsic::trunc:
+      ISD = ISD::FTRUNC;
+      break;
+    case Intrinsic::nearbyint:
+      ISD = ISD::FNEARBYINT;
+      break;
+    case Intrinsic::rint:
+      ISD = ISD::FRINT;
+      break;
+    case Intrinsic::round:
+      ISD = ISD::FROUND;
+      break;
+    case Intrinsic::pow:
+      ISD = ISD::FPOW;
+      break;
+    case Intrinsic::fma:
+      ISD = ISD::FMA;
+      break;
+    case Intrinsic::fmuladd:
+      ISD = ISD::FMA;
+      break;
+    // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      return 0;
+    case Intrinsic::masked_store:
+      return static_cast<T *>(this)
+          ->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, 0);
+    case Intrinsic::masked_load:
+      return static_cast<T *>(this)
+          ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
+    }
+
+    const TargetLoweringBase *TLI = getTLI();
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(RetTy);
+
+    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+      // The operation is legal. Assume it costs 1.
+      // If the type is split to multiple registers, assume that there is some
+      // overhead to this.
+      // TODO: Once we have extract/insert subvector cost we need to use them.
+      if (LT.first > 1)
+        return LT.first * 2;
+      return LT.first * 1;
+    }
+
+    if (!TLI->isOperationExpand(ISD, LT.second)) {
+      // If the operation is custom lowered then assume
+      // thare the code is twice as expensive.
+      return LT.first * 2;
+    }
+
+    // If we can't lower fmuladd into an FMA estimate the cost as a floating
+    // point mul followed by an add.
+    if (IID == Intrinsic::fmuladd)
+      return static_cast<T *>(this)
+                 ->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
+             static_cast<T *>(this)
+                 ->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
+
+    // Else, assume that we need to scalarize this intrinsic. For math builtins
+    // this will emit a costly libcall, adding call overhead and spills. Make it
+    // very expensive.
+    if (RetTy->isVectorTy()) {
+      unsigned Num = RetTy->getVectorNumElements();
+      unsigned Cost = static_cast<T *>(this)->getIntrinsicInstrCost(
+          IID, RetTy->getScalarType(), Tys);
+      return 10 * Cost * Num;
+    }
+
+    // This is going to be turned into a library call, make it expensive.
+    return 10;
+  }
+
+  unsigned getNumberOfParts(Type *Tp) {
+    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Tp);
+    return LT.first;
+  }
+
+  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) { return 0; }
+
+  unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {
+    assert(Ty->isVectorTy() && "Expect a vector type");
+    unsigned NumVecElts = Ty->getVectorNumElements();
+    unsigned NumReduxLevels = Log2_32(NumVecElts);
+    unsigned ArithCost =
+        NumReduxLevels *
+        static_cast<T *>(this)->getArithmeticInstrCost(Opcode, Ty);
+    // Assume the pairwise shuffles add a cost.
+    unsigned ShuffleCost =
+        NumReduxLevels * (IsPairwise + 1) *
+        static_cast<T *>(this)
+            ->getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
+    return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+  }
+
+  /// @}
+};
+
+/// \brief Concrete BasicTTIImpl that can be used if no further customization
+/// is needed.
+class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
+  typedef BasicTTIImplBase<BasicTTIImpl> BaseT;
+  friend class BasicTTIImplBase<BasicTTIImpl>;
+
+  const TargetSubtargetInfo *ST;
+  const TargetLoweringBase *TLI;
+
+  const TargetSubtargetInfo *getST() const { return ST; }
+  const TargetLoweringBase *getTLI() const { return TLI; }
+
+public:
+  explicit BasicTTIImpl(const TargetMachine *ST, Function &F);
+
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  BasicTTIImpl(const BasicTTIImpl &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+  BasicTTIImpl(BasicTTIImpl &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+        TLI(std::move(Arg.TLI)) {}
+  BasicTTIImpl &operator=(const BasicTTIImpl &RHS) {
+    BaseT::operator=(static_cast<const BaseT &>(RHS));
+    ST = RHS.ST;
+    TLI = RHS.TLI;
+    return *this;
+  }
+  BasicTTIImpl &operator=(BasicTTIImpl &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    ST = std::move(RHS.ST);
+    TLI = std::move(RHS.TLI);
+    return *this;
+  }
+};
+
+}
+
+#endif
diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h
index 0b2ccc6..e2b744f 100644
--- a/include/llvm/CodeGen/CallingConvLower.h
+++ b/include/llvm/CodeGen/CallingConvLower.h
@@ -122,8 +122,8 @@ public:
   // There is no need to differentiate between a pending CCValAssign and other
   // kinds, as they are stored in a different list.
   static CCValAssign getPending(unsigned ValNo, MVT ValVT, MVT LocVT,
-                                LocInfo HTP) {
-    return getReg(ValNo, ValVT, 0, LocVT, HTP);
+                                LocInfo HTP, unsigned ExtraInfo = 0) {
+    return getReg(ValNo, ValVT, ExtraInfo, LocVT, HTP);
   }
 
   void convertToReg(unsigned RegNo) {
@@ -146,6 +146,7 @@ public:
 
   unsigned getLocReg() const { assert(isRegLoc()); return Loc; }
   unsigned getLocMemOffset() const { assert(isMemLoc()); return Loc; }
+  unsigned getExtraInfo() const { return Loc; }
   MVT getLocVT() const { return LocVT; }
 
   LocInfo getLocInfo() const { return HTP; }
@@ -158,6 +159,16 @@ public:
   }
 };
 
+/// Describes a register that needs to be forwarded from the prologue to a
+/// musttail call.
+struct ForwardedRegister {
+  ForwardedRegister(unsigned VReg, MCPhysReg PReg, MVT VT)
+      : VReg(VReg), PReg(PReg), VT(VT) {}
+  unsigned VReg;
+  MCPhysReg PReg;
+  MVT VT;
+};
+
 /// CCAssignFn - This function assigns a location for Val, updating State to
 /// reflect the change.  It returns 'true' if it failed to handle Val.
 typedef bool CCAssignFn(unsigned ValNo, MVT ValVT,
@@ -302,13 +313,13 @@ public:
   /// produce a single value.
   void AnalyzeCallResult(MVT VT, CCAssignFn Fn);
 
-  /// getFirstUnallocated - Return the first unallocated register in the set, or
-  /// NumRegs if they are all allocated.
-  unsigned getFirstUnallocated(const MCPhysReg *Regs, unsigned NumRegs) const {
-    for (unsigned i = 0; i != NumRegs; ++i)
+  /// getFirstUnallocated - Return the index of the first unallocated register
+  /// in the set, or Regs.size() if they are all allocated.
+  unsigned getFirstUnallocated(ArrayRef<MCPhysReg> Regs) const {
+    for (unsigned i = 0; i < Regs.size(); ++i)
       if (!isAllocated(Regs[i]))
         return i;
-    return NumRegs;
+    return Regs.size();
   }
 
   /// AllocateReg - Attempt to allocate one register.  If it is not available,
@@ -331,9 +342,9 @@ public:
   /// AllocateReg - Attempt to allocate one of the specified registers.  If none
   /// are available, return zero.  Otherwise, return the first one available,
   /// marking it and any aliases as allocated.
-  unsigned AllocateReg(const MCPhysReg *Regs, unsigned NumRegs) {
-    unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs);
-    if (FirstUnalloc == NumRegs)
+  unsigned AllocateReg(ArrayRef<MCPhysReg> Regs) {
+    unsigned FirstUnalloc = getFirstUnallocated(Regs);
+    if (FirstUnalloc == Regs.size())
       return 0;    // Didn't find the reg.
 
     // Mark the register and any aliases as allocated.
@@ -345,8 +356,12 @@ public:
   /// AllocateRegBlock - Attempt to allocate a block of RegsRequired consecutive
   /// registers. If this is not possible, return zero. Otherwise, return the first
   /// register of the block that were allocated, marking the entire block as allocated.
-  unsigned AllocateRegBlock(const uint16_t *Regs, unsigned NumRegs, unsigned RegsRequired) {
-    for (unsigned StartIdx = 0; StartIdx <= NumRegs - RegsRequired; ++StartIdx) {
+  unsigned AllocateRegBlock(ArrayRef<uint16_t> Regs, unsigned RegsRequired) {
+    if (RegsRequired > Regs.size())
+      return 0;
+
+    for (unsigned StartIdx = 0; StartIdx <= Regs.size() - RegsRequired;
+         ++StartIdx) {
       bool BlockAvailable = true;
       // Check for already-allocated regs in this block
       for (unsigned BlockIdx = 0; BlockIdx < RegsRequired; ++BlockIdx) {
@@ -368,10 +383,9 @@ public:
   }
 
   /// Version of AllocateReg with list of registers to be shadowed.
-  unsigned AllocateReg(const MCPhysReg *Regs, const MCPhysReg *ShadowRegs,
-                       unsigned NumRegs) {
-    unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs);
-    if (FirstUnalloc == NumRegs)
+  unsigned AllocateReg(ArrayRef<MCPhysReg> Regs, const MCPhysReg *ShadowRegs) {
+    unsigned FirstUnalloc = getFirstUnallocated(Regs);
+    if (FirstUnalloc == Regs.size())
       return 0;    // Didn't find the reg.
 
     // Mark the register and any aliases as allocated.
@@ -401,8 +415,8 @@ public:
   /// Version of AllocateStack with list of extra registers to be shadowed.
   /// Note that, unlike AllocateReg, this shadows ALL of the shadow registers.
   unsigned AllocateStack(unsigned Size, unsigned Align,
-                         const MCPhysReg *ShadowRegs, unsigned NumShadowRegs) {
-    for (unsigned i = 0; i < NumShadowRegs; ++i)
+                         ArrayRef<MCPhysReg> ShadowRegs) {
+    for (unsigned i = 0; i < ShadowRegs.size(); ++i)
       MarkAllocated(ShadowRegs[i]);
     return AllocateStack(Size, Align);
   }
@@ -466,6 +480,19 @@ public:
     return PendingLocs;
   }
 
+  /// Compute the remaining unused register parameters that would be used for
+  /// the given value type. This is useful when varargs are passed in the
+  /// registers that normal prototyped parameters would be passed in, or for
+  /// implementing perfect forwarding.
+  void getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs, MVT VT,
+                                   CCAssignFn Fn);
+
+  /// Compute the set of registers that need to be preserved and forwarded to
+  /// any musttail calls.
+  void analyzeMustTailForwardedRegisters(
+      SmallVectorImpl<ForwardedRegister> &Forwards, ArrayRef<MVT> RegParmTypes,
+      CCAssignFn Fn);
+
 private:
   /// MarkAllocated - Mark a register and all of its aliases as allocated.
   void MarkAllocated(unsigned Reg);
diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h
index 973c595..9f86429 100644
--- a/include/llvm/CodeGen/CommandFlags.h
+++ b/include/llvm/CodeGen/CommandFlags.h
@@ -207,6 +207,10 @@ FunctionSections("function-sections",
                  cl::desc("Emit functions into separate sections"),
                  cl::init(false));
 
+cl::opt<bool> UniqueSectionNames("unique-section-names",
+                                 cl::desc("Give unique names to every section"),
+                                 cl::init(true));
+
 cl::opt<llvm::JumpTable::JumpTableType>
 JTableType("jump-table-type",
           cl::desc("Choose the type of Jump-Instruction Table for jumptable."),
@@ -284,6 +288,7 @@ static inline TargetOptions InitTargetOptionsFromCodeGenFlags() {
   Options.UseInitArray = !UseCtors;
   Options.DataSections = DataSections;
   Options.FunctionSections = FunctionSections;
+  Options.UniqueSectionNames = UniqueSectionNames;
 
   Options.MCOptions = InitMCTargetOptionsFromFlags();
   Options.JTType = JTableType;
diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/include/llvm/CodeGen/DIE.h
index e310aef..e310aef 100644
--- a/lib/CodeGen/AsmPrinter/DIE.h
+++ b/include/llvm/CodeGen/DIE.h
diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h
index b5405f9..1dca2ce 100644
--- a/include/llvm/CodeGen/FastISel.h
+++ b/include/llvm/CodeGen/FastISel.h
@@ -18,9 +18,9 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/Target/TargetLowering.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Target/TargetLowering.h"
 
 namespace llvm {
 
@@ -76,6 +76,8 @@ public:
     unsigned ResultReg;
     unsigned NumResultRegs;
 
+    bool IsPatchPoint;
+
     SmallVector<Value *, 16> OutVals;
     SmallVector<ISD::ArgFlagsTy, 16> OutFlags;
     SmallVector<unsigned, 16> OutRegs;
@@ -87,7 +89,7 @@ public:
           IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true),
           IsTailCall(false), NumFixedArgs(-1), CallConv(CallingConv::C),
           Callee(nullptr), SymName(nullptr), CS(nullptr), Call(nullptr),
-          ResultReg(0), NumResultRegs(0) {}
+          ResultReg(0), NumResultRegs(0), IsPatchPoint(false) {}
 
     CallLoweringInfo &setCallee(Type *ResultTy, FunctionType *FuncTy,
                                 const Value *Target, ArgListTy &&ArgsList,
@@ -162,6 +164,11 @@ public:
       return *this;
     }
 
+    CallLoweringInfo &setIsPatchPoint(bool Value = true) {
+      IsPatchPoint = Value;
+      return *this;
+    }
+
     ArgListTy &getArgs() { return Args; }
 
     void clearOuts() {
diff --git a/include/llvm/CodeGen/ForwardControlFlowIntegrity.h b/include/llvm/CodeGen/ForwardControlFlowIntegrity.h
index a6232c5..ec8e2ef 100644
--- a/include/llvm/CodeGen/ForwardControlFlowIntegrity.h
+++ b/include/llvm/CodeGen/ForwardControlFlowIntegrity.h
@@ -18,7 +18,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetOptions.h"
-
 #include <string>
 
 namespace llvm {
diff --git a/include/llvm/CodeGen/FunctionLoweringInfo.h b/include/llvm/CodeGen/FunctionLoweringInfo.h
index 91f20d0..e12866e 100644
--- a/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -20,8 +20,8 @@
 #include "llvm/ADT/IndexedMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -88,6 +88,12 @@ public:
   /// RegFixups - Registers which need to be replaced after isel is done.
   DenseMap<unsigned, unsigned> RegFixups;
 
+  /// StatepointStackSlots - A list of temporary stack slots (frame indices) 
+  /// used to spill values at a statepoint.  We store them here to enable
+  /// reuse of the same stack slots across different statepoints in different
+  /// basic blocks.
+  SmallVector<unsigned, 50> StatepointStackSlots;
+
   /// MBB - The current block.
   MachineBasicBlock *MBB;
 
@@ -200,6 +206,9 @@ public:
       return;
 
     unsigned Reg = It->second;
+    if (Reg == 0)
+      return;
+
     LiveOutRegInfo.grow(Reg);
     LiveOutRegInfo[Reg].IsValid = false;
   }
@@ -223,11 +232,6 @@ private:
 /// floating-point support.
 void ComputeUsesVAFloatArgument(const CallInst &I, MachineModuleInfo *MMI);
 
-/// AddCatchInfo - Extract the personality and type infos from an eh.selector
-/// call, and add them to the specified machine basic block.
-void AddCatchInfo(const CallInst &I,
-                  MachineModuleInfo *MMI, MachineBasicBlock *MBB);
-
 /// AddLandingPadInfo - Extract the exception handling information from the
 /// landingpad instruction and add them to the specified machine module info.
 void AddLandingPadInfo(const LandingPadInst &I, MachineModuleInfo &MMI,
diff --git a/include/llvm/CodeGen/GCMetadata.h b/include/llvm/CodeGen/GCMetadata.h
index ddcc823..357b2d8 100644
--- a/include/llvm/CodeGen/GCMetadata.h
+++ b/include/llvm/CodeGen/GCMetadata.h
@@ -34,168 +34,173 @@
 #define LLVM_CODEGEN_GCMETADATA_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/GCStrategy.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/Pass.h"
-
 #include <memory>
 
 namespace llvm {
-  class AsmPrinter;
-  class GCStrategy;
-  class Constant;
-  class MCSymbol;
-
-  namespace GC {
-    /// PointKind - The type of a collector-safe point.
-    ///
-    enum PointKind {
-      Loop,    ///< Instr is a loop (backwards branch).
-      Return,  ///< Instr is a return instruction.
-      PreCall, ///< Instr is a call instruction.
-      PostCall ///< Instr is the return address of a call.
-    };
-  }
-
-  /// GCPoint - Metadata for a collector-safe point in machine code.
+class AsmPrinter;
+class Constant;
+class MCSymbol;
+
+/// GCPoint - Metadata for a collector-safe point in machine code.
+///
+struct GCPoint {
+  GC::PointKind Kind; ///< The kind of the safe point.
+  MCSymbol *Label;    ///< A label.
+  DebugLoc Loc;
+
+  GCPoint(GC::PointKind K, MCSymbol *L, DebugLoc DL)
+      : Kind(K), Label(L), Loc(DL) {}
+};
+
+/// GCRoot - Metadata for a pointer to an object managed by the garbage
+/// collector.
+struct GCRoot {
+  int Num;                  ///< Usually a frame index.
+  int StackOffset;          ///< Offset from the stack pointer.
+  const Constant *Metadata; ///< Metadata straight from the call
+                            ///< to llvm.gcroot.
+
+  GCRoot(int N, const Constant *MD) : Num(N), StackOffset(-1), Metadata(MD) {}
+};
+
+/// Garbage collection metadata for a single function.  Currently, this
+/// information only applies to GCStrategies which use GCRoot.
+class GCFunctionInfo {
+public:
+  typedef std::vector<GCPoint>::iterator iterator;
+  typedef std::vector<GCRoot>::iterator roots_iterator;
+  typedef std::vector<GCRoot>::const_iterator live_iterator;
+
+private:
+  const Function &F;
+  GCStrategy &S;
+  uint64_t FrameSize;
+  std::vector<GCRoot> Roots;
+  std::vector<GCPoint> SafePoints;
+
+  // FIXME: Liveness. A 2D BitVector, perhaps?
+  //
+  //   BitVector Liveness;
+  //
+  //   bool islive(int point, int root) =
+  //     Liveness[point * SafePoints.size() + root]
+  //
+  // The bit vector is the more compact representation where >3.2% of roots
+  // are live per safe point (1.5% on 64-bit hosts).
+
+public:
+  GCFunctionInfo(const Function &F, GCStrategy &S);
+  ~GCFunctionInfo();
+
+  /// getFunction - Return the function to which this metadata applies.
   ///
-  struct GCPoint {
-    GC::PointKind Kind; ///< The kind of the safe point.
-    MCSymbol *Label;    ///< A label.
-    DebugLoc Loc;
-
-    GCPoint(GC::PointKind K, MCSymbol *L, DebugLoc DL)
-        : Kind(K), Label(L), Loc(DL) {}
-  };
-
-  /// GCRoot - Metadata for a pointer to an object managed by the garbage
-  /// collector.
-  struct GCRoot {
-    int Num;            ///< Usually a frame index.
-    int StackOffset;    ///< Offset from the stack pointer.
-    const Constant *Metadata; ///< Metadata straight from the call
-                              ///< to llvm.gcroot.
+  const Function &getFunction() const { return F; }
 
-    GCRoot(int N, const Constant *MD) : Num(N), StackOffset(-1), Metadata(MD) {}
-  };
-
-
-  /// GCFunctionInfo - Garbage collection metadata for a single function.
+  /// getStrategy - Return the GC strategy for the function.
   ///
-  class GCFunctionInfo {
-  public:
-    typedef std::vector<GCPoint>::iterator iterator;
-    typedef std::vector<GCRoot>::iterator roots_iterator;
-    typedef std::vector<GCRoot>::const_iterator live_iterator;
-
-  private:
-    const Function &F;
-    GCStrategy &S;
-    uint64_t FrameSize;
-    std::vector<GCRoot> Roots;
-    std::vector<GCPoint> SafePoints;
-
-    // FIXME: Liveness. A 2D BitVector, perhaps?
-    //
-    //   BitVector Liveness;
-    //
-    //   bool islive(int point, int root) =
-    //     Liveness[point * SafePoints.size() + root]
-    //
-    // The bit vector is the more compact representation where >3.2% of roots
-    // are live per safe point (1.5% on 64-bit hosts).
-
-  public:
-    GCFunctionInfo(const Function &F, GCStrategy &S);
-    ~GCFunctionInfo();
-
-    /// getFunction - Return the function to which this metadata applies.
-    ///
-    const Function &getFunction() const { return F; }
-
-    /// getStrategy - Return the GC strategy for the function.
-    ///
-    GCStrategy &getStrategy() { return S; }
-
-    /// addStackRoot - Registers a root that lives on the stack. Num is the
-    ///                stack object ID for the alloca (if the code generator is
-    //                 using  MachineFrameInfo).
-    void addStackRoot(int Num, const Constant *Metadata) {
-      Roots.push_back(GCRoot(Num, Metadata));
-    }
-
-    /// removeStackRoot - Removes a root.
-    roots_iterator removeStackRoot(roots_iterator position) {
-      return Roots.erase(position);
-    }
-
-    /// addSafePoint - Notes the existence of a safe point. Num is the ID of the
-    /// label just prior to the safe point (if the code generator is using
-    /// MachineModuleInfo).
-    void addSafePoint(GC::PointKind Kind, MCSymbol *Label, DebugLoc DL) {
-      SafePoints.push_back(GCPoint(Kind, Label, DL));
-    }
-
-    /// getFrameSize/setFrameSize - Records the function's frame size.
-    ///
-    uint64_t getFrameSize() const { return FrameSize; }
-    void setFrameSize(uint64_t S) { FrameSize = S; }
-
-    /// begin/end - Iterators for safe points.
-    ///
-    iterator begin() { return SafePoints.begin(); }
-    iterator end()   { return SafePoints.end();   }
-    size_t size() const { return SafePoints.size(); }
-
-    /// roots_begin/roots_end - Iterators for all roots in the function.
-    ///
-    roots_iterator roots_begin() { return Roots.begin(); }
-    roots_iterator roots_end  () { return Roots.end();   }
-    size_t roots_size() const { return Roots.size(); }
-
-    /// live_begin/live_end - Iterators for live roots at a given safe point.
-    ///
-    live_iterator live_begin(const iterator &p) { return roots_begin(); }
-    live_iterator live_end  (const iterator &p) { return roots_end();   }
-    size_t live_size(const iterator &p) const { return roots_size(); }
-  };
-
-
-  /// GCModuleInfo - Garbage collection metadata for a whole module.
-  ///
-  class GCModuleInfo : public ImmutablePass {
-    typedef StringMap<GCStrategy*> strategy_map_type;
-    typedef std::vector<std::unique_ptr<GCStrategy>> list_type;
-    typedef DenseMap<const Function*,GCFunctionInfo*> finfo_map_type;
-
-    strategy_map_type StrategyMap;
-    list_type StrategyList;
-    finfo_map_type FInfoMap;
+  GCStrategy &getStrategy() { return S; }
 
-    GCStrategy *getOrCreateStrategy(const Module *M, const std::string &Name);
+  /// addStackRoot - Registers a root that lives on the stack. Num is the
+  ///                stack object ID for the alloca (if the code generator is
+  //                 using  MachineFrameInfo).
+  void addStackRoot(int Num, const Constant *Metadata) {
+    Roots.push_back(GCRoot(Num, Metadata));
+  }
 
-  public:
-    typedef list_type::const_iterator iterator;
+  /// removeStackRoot - Removes a root.
+  roots_iterator removeStackRoot(roots_iterator position) {
+    return Roots.erase(position);
+  }
 
-    static char ID;
+  /// addSafePoint - Notes the existence of a safe point. Num is the ID of the
+  /// label just prior to the safe point (if the code generator is using
+  /// MachineModuleInfo).
+  void addSafePoint(GC::PointKind Kind, MCSymbol *Label, DebugLoc DL) {
+    SafePoints.push_back(GCPoint(Kind, Label, DL));
+  }
 
-    GCModuleInfo();
+  /// getFrameSize/setFrameSize - Records the function's frame size.
+  ///
+  uint64_t getFrameSize() const { return FrameSize; }
+  void setFrameSize(uint64_t S) { FrameSize = S; }
 
-    /// clear - Resets the pass. Any pass, which uses GCModuleInfo, should
-    /// call it in doFinalization().
-    ///
-    void clear();
+  /// begin/end - Iterators for safe points.
+  ///
+  iterator begin() { return SafePoints.begin(); }
+  iterator end() { return SafePoints.end(); }
+  size_t size() const { return SafePoints.size(); }
 
-    /// begin/end - Iterators for used strategies.
-    ///
-    iterator begin() const { return StrategyList.begin(); }
-    iterator end()   const { return StrategyList.end();   }
+  /// roots_begin/roots_end - Iterators for all roots in the function.
+  ///
+  roots_iterator roots_begin() { return Roots.begin(); }
+  roots_iterator roots_end() { return Roots.end(); }
+  size_t roots_size() const { return Roots.size(); }
 
-    /// get - Look up function metadata.
-    ///
-    GCFunctionInfo &getFunctionInfo(const Function &F);
-  };
+  /// live_begin/live_end - Iterators for live roots at a given safe point.
+  ///
+  live_iterator live_begin(const iterator &p) { return roots_begin(); }
+  live_iterator live_end(const iterator &p) { return roots_end(); }
+  size_t live_size(const iterator &p) const { return roots_size(); }
+};
+
+/// An analysis pass which caches information about the entire Module.
+/// Records both the function level information used by GCRoots and a
+/// cache of the 'active' gc strategy objects for the current Module.
+class GCModuleInfo : public ImmutablePass {
+  /// An owning list of all GCStrategies which have been created
+  SmallVector<std::unique_ptr<GCStrategy>, 1> GCStrategyList;
+  /// A helper map to speedup lookups into the above list
+  StringMap<GCStrategy*> GCStrategyMap;
+
+public:
+  /// Lookup the GCStrategy object associated with the given gc name.
+  /// Objects are owned internally; No caller should attempt to delete the
+  /// returned objects. 
+  GCStrategy *getGCStrategy(const StringRef Name);
+  
+  /// List of per function info objects.  In theory, Each of these
+  /// may be associated with a different GC.
+  typedef std::vector<std::unique_ptr<GCFunctionInfo>> FuncInfoVec;
+
+  FuncInfoVec::iterator funcinfo_begin() { return Functions.begin(); }
+  FuncInfoVec::iterator funcinfo_end() { return Functions.end(); }
+
+private:
+  /// Owning list of all GCFunctionInfos associated with this Module
+  FuncInfoVec Functions;
+
+  /// Non-owning map to bypass linear search when finding the GCFunctionInfo
+  /// associated with a particular Function.
+  typedef DenseMap<const Function *, GCFunctionInfo *> finfo_map_type;
+  finfo_map_type FInfoMap;
+
+public:
+  typedef SmallVector<std::unique_ptr<GCStrategy>,1>::const_iterator iterator;
+
+  static char ID;
+
+  GCModuleInfo();
+
+  /// clear - Resets the pass. Any pass, which uses GCModuleInfo, should
+  /// call it in doFinalization().
+  ///
+  void clear();
 
+  /// begin/end - Iterators for used strategies.
+  ///
+  iterator begin() const { return GCStrategyList.begin(); }
+  iterator end() const { return GCStrategyList.end(); }
+
+  /// get - Look up function metadata.  This is currently assumed
+  /// have the side effect of initializing the associated GCStrategy.  That
+  /// will soon change.
+  GCFunctionInfo &getFunctionInfo(const Function &F);
+};
 }
 
 #endif
diff --git a/include/llvm/CodeGen/GCMetadataPrinter.h b/include/llvm/CodeGen/GCMetadataPrinter.h
index 4a6b5ac..2208470 100644
--- a/include/llvm/CodeGen/GCMetadataPrinter.h
+++ b/include/llvm/CodeGen/GCMetadataPrinter.h
@@ -26,49 +26,39 @@
 
 namespace llvm {
 
-  class GCMetadataPrinter;
-
-  /// GCMetadataPrinterRegistry - The GC assembly printer registry uses all the
-  /// defaults from Registry.
-  typedef Registry<GCMetadataPrinter> GCMetadataPrinterRegistry;
-
-  /// GCMetadataPrinter - Emits GC metadata as assembly code.
-  ///
-  class GCMetadataPrinter {
-  public:
-    typedef GCStrategy::list_type list_type;
-    typedef GCStrategy::iterator iterator;
-
-  private:
-    GCStrategy *S;
-
-    friend class AsmPrinter;
-
-  protected:
-    // May only be subclassed.
-    GCMetadataPrinter();
-
-  private:
-    GCMetadataPrinter(const GCMetadataPrinter &) LLVM_DELETED_FUNCTION;
-    GCMetadataPrinter &
-      operator=(const GCMetadataPrinter &) LLVM_DELETED_FUNCTION;
-
-  public:
-    GCStrategy &getStrategy() { return *S; }
-    const Module &getModule() const { return S->getModule(); }
-
-    /// begin/end - Iterate over the collected function metadata.
-    iterator begin() { return S->begin(); }
-    iterator end()   { return S->end();   }
-
-    /// beginAssembly/finishAssembly - Emit module metadata as assembly code.
-    virtual void beginAssembly(AsmPrinter &AP);
-
-    virtual void finishAssembly(AsmPrinter &AP);
-
-    virtual ~GCMetadataPrinter();
-  };
-
+class GCMetadataPrinter;
+
+/// GCMetadataPrinterRegistry - The GC assembly printer registry uses all the
+/// defaults from Registry.
+typedef Registry<GCMetadataPrinter> GCMetadataPrinterRegistry;
+
+/// GCMetadataPrinter - Emits GC metadata as assembly code.  Instances are
+/// created, managed, and owned by the AsmPrinter.
+class GCMetadataPrinter {
+private:
+  GCStrategy *S;
+  friend class AsmPrinter;
+
+protected:
+  // May only be subclassed.
+  GCMetadataPrinter();
+
+private:
+  GCMetadataPrinter(const GCMetadataPrinter &) = delete;
+  GCMetadataPrinter &operator=(const GCMetadataPrinter &) = delete;
+
+public:
+  GCStrategy &getStrategy() { return *S; }
+
+  /// Called before the assembly for the module is generated by
+  /// the AsmPrinter (but after target specific hooks.)
+  virtual void beginAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) {}
+  /// Called after the assembly for the module is generated by
+  /// the AsmPrinter (but before target specific hooks)
+  virtual void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) {}
+
+  virtual ~GCMetadataPrinter();
+};
 }
 
 #endif
diff --git a/include/llvm/CodeGen/GCStrategy.h b/include/llvm/CodeGen/GCStrategy.h
index 81e1f85..869f888 100644
--- a/include/llvm/CodeGen/GCStrategy.h
+++ b/include/llvm/CodeGen/GCStrategy.h
@@ -12,10 +12,15 @@
 // specified in a function's 'gc' attribute. Algorithms are enabled by setting
 // flags in a subclass's constructor, and some virtual methods can be
 // overridden.
-// 
-// When requested, the GCStrategy will be populated with data about each
-// function which uses it. Specifically:
-// 
+//
+// GCStrategy is relevant for implementations using either gc.root or
+// gc.statepoint based lowering strategies, but is currently focused mostly on
+// options for gc.root.  This will change over time.
+//
+// When requested by a subclass of GCStrategy, the gc.root implementation will
+// populate GCModuleInfo and GCFunctionInfo with that about each Function in
+// the Module that opts in to garbage collection.  Specifically:
+//
 // - Safe points
 //   Garbage collection is generally only possible at certain points in code.
 //   GCStrategy can request that the collector insert such points:
@@ -23,7 +28,7 @@
 //     - At and after any call to a subroutine
 //     - Before returning from the current function
 //     - Before backwards branches (loops)
-// 
+//
 // - Roots
 //   When a reference to a GC-allocated object exists on the stack, it must be
 //   stored in an alloca registered with llvm.gcoot.
@@ -31,123 +36,142 @@
 // This information can used to emit the metadata tables which are required by
 // the target garbage collector runtime.
 //
+// When used with gc.statepoint, information about safepoint and roots can be
+// found in the binary StackMap section after code generation.  Safepoint
+// placement is currently the responsibility of the frontend, though late
+// insertion support is planned.  gc.statepoint does not currently support
+// custom stack map formats; such can be generated by parsing the standard
+// stack map section if desired.
+//
+// The read and write barrier support can be used with either implementation.
+//
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_GCSTRATEGY_H
-#define LLVM_CODEGEN_GCSTRATEGY_H
+#ifndef LLVM_IR_GCSTRATEGY_H
+#define LLVM_IR_GCSTRATEGY_H
 
-#include "llvm/CodeGen/GCMetadata.h"
-#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Registry.h"
 #include <string>
 
 namespace llvm {
-  
-  class GCStrategy;
-  
-  /// The GC strategy registry uses all the defaults from Registry.
-  /// 
-  typedef Registry<GCStrategy> GCRegistry;
-  
-  /// GCStrategy describes a garbage collector algorithm's code generation
-  /// requirements, and provides overridable hooks for those needs which cannot
-  /// be abstractly described.
-  class GCStrategy {
-  public:
-    typedef std::vector<std::unique_ptr<GCFunctionInfo>> list_type;
-    typedef list_type::iterator iterator;
-    
-  private:
-    friend class GCModuleInfo;
-    const Module *M;
-    std::string Name;
-    
-    list_type Functions;
-    
-  protected:
-    unsigned NeededSafePoints; ///< Bitmask of required safe points.
-    bool CustomReadBarriers;   ///< Default is to insert loads.
-    bool CustomWriteBarriers;  ///< Default is to insert stores.
-    bool CustomRoots;          ///< Default is to pass through to backend.
-    bool CustomSafePoints;     ///< Default is to use NeededSafePoints
-                               ///< to find safe points.
-    bool InitRoots;            ///< If set, roots are nulled during lowering.
-    bool UsesMetadata;         ///< If set, backend must emit metadata tables.
-    
-  public:
-    GCStrategy();
-    
-    virtual ~GCStrategy() {}
-    
-    
-    /// getName - The name of the GC strategy, for debugging.
-    /// 
-    const std::string &getName() const { return Name; }
-
-    /// getModule - The module within which the GC strategy is operating.
-    /// 
-    const Module &getModule() const { return *M; }
-
-    /// needsSafePoitns - True if safe points of any kind are required. By
-    //                    default, none are recorded.
-    bool needsSafePoints() const {
-      return CustomSafePoints || NeededSafePoints != 0;
-    }
-    
-    /// needsSafePoint(Kind) - True if the given kind of safe point is
-    //                          required. By default, none are recorded.
-    bool needsSafePoint(GC::PointKind Kind) const {
-      return (NeededSafePoints & 1 << Kind) != 0;
-    }
-    
-    /// customWriteBarrier - By default, write barriers are replaced with simple
-    ///                      store instructions. If true, then
-    ///                      performCustomLowering must instead lower them.
-    bool customWriteBarrier() const { return CustomWriteBarriers; }
-    
-    /// customReadBarrier - By default, read barriers are replaced with simple
-    ///                     load instructions. If true, then
-    ///                     performCustomLowering must instead lower them.
-    bool customReadBarrier() const { return CustomReadBarriers; }
-    
-    /// customRoots - By default, roots are left for the code generator so it
-    ///               can generate a stack map. If true, then
-    //                performCustomLowering must delete them.
-    bool customRoots() const { return CustomRoots; }
-
-    /// customSafePoints - By default, the GC analysis will find safe
-    ///                    points according to NeededSafePoints. If true,
-    ///                    then findCustomSafePoints must create them.
-    bool customSafePoints() const { return CustomSafePoints; }
-    
-    /// initializeRoots - If set, gcroot intrinsics should initialize their
-    //                    allocas to null before the first use. This is
-    //                    necessary for most GCs and is enabled by default.
-    bool initializeRoots() const { return InitRoots; }
-    
-    /// usesMetadata - If set, appropriate metadata tables must be emitted by
-    ///                the back-end (assembler, JIT, or otherwise).
-    bool usesMetadata() const { return UsesMetadata; }
-    
-    /// begin/end - Iterators for function metadata.
-    /// 
-    iterator begin() { return Functions.begin(); }
-    iterator end()   { return Functions.end();   }
-
-    /// insertFunctionMetadata - Creates metadata for a function.
-    /// 
-    GCFunctionInfo *insertFunctionInfo(const Function &F);
-
-    /// initializeCustomLowering/performCustomLowering - If any of the actions
-    /// are set to custom, performCustomLowering must be overriden to transform
-    /// the corresponding actions to LLVM IR. initializeCustomLowering is
-    /// optional to override. These are the only GCStrategy methods through
-    /// which the LLVM IR can be modified.
-    virtual bool initializeCustomLowering(Module &F);
-    virtual bool performCustomLowering(Function &F);
-    virtual bool findCustomSafePoints(GCFunctionInfo& FI, MachineFunction& MF);
-  };
-  
+namespace GC {
+/// PointKind - The type of a collector-safe point.
+///
+enum PointKind {
+  Loop,    ///< Instr is a loop (backwards branch).
+  Return,  ///< Instr is a return instruction.
+  PreCall, ///< Instr is a call instruction.
+  PostCall ///< Instr is the return address of a call.
+};
+}
+
+/// GCStrategy describes a garbage collector algorithm's code generation
+/// requirements, and provides overridable hooks for those needs which cannot
+/// be abstractly described.  GCStrategy objects must be looked up through
+/// the Function.  The objects themselves are owned by the Context and must
+/// be immutable.
+class GCStrategy {
+private:
+  std::string Name;
+  friend class GCModuleInfo;
+
+protected:
+  bool UseStatepoints; /// Uses gc.statepoints as opposed to gc.roots,
+                       /// if set, none of the other options can be
+                       /// anything but their default values.
+
+  unsigned NeededSafePoints; ///< Bitmask of required safe points.
+  bool CustomReadBarriers;   ///< Default is to insert loads.
+  bool CustomWriteBarriers;  ///< Default is to insert stores.
+  bool CustomRoots;          ///< Default is to pass through to backend.
+  bool InitRoots;            ///< If set, roots are nulled during lowering.
+  bool UsesMetadata;         ///< If set, backend must emit metadata tables.
+
+public:
+  GCStrategy();
+  virtual ~GCStrategy() {}
+
+  /// Return the name of the GC strategy.  This is the value of the collector
+  /// name string specified on functions which use this strategy.
+  const std::string &getName() const { return Name; }
+
+  /// By default, write barriers are replaced with simple store
+  /// instructions. If true, you must provide a custom pass to lower 
+  /// calls to @llvm.gcwrite.
+  bool customWriteBarrier() const { return CustomWriteBarriers; }
+
+  /// By default, read barriers are replaced with simple load
+  /// instructions. If true, you must provide a custom pass to lower 
+  /// calls to @llvm.gcread.
+  bool customReadBarrier() const { return CustomReadBarriers; }
+
+  /// Returns true if this strategy is expecting the use of gc.statepoints,
+  /// and false otherwise.
+  bool useStatepoints() const { return UseStatepoints; }
+
+  /** @name Statepoint Specific Properties */
+  ///@{
+
+  /// If the value specified can be reliably distinguished, returns true for
+  /// pointers to GC managed locations and false for pointers to non-GC
+  /// managed locations.  Note a GCStrategy can always return 'None' (i.e. an
+  /// empty optional indicating it can't reliably distinguish.
+  virtual Optional<bool> isGCManagedPointer(const Value *V) const {
+    return None;
+  }
+  ///@}
+
+  /** @name GCRoot Specific Properties
+   * These properties and overrides only apply to collector strategies using
+   * GCRoot.
+   */
+  ///@{
+
+  /// True if safe points of any kind are required. By default, none are
+  /// recorded.
+  bool needsSafePoints() const { return NeededSafePoints != 0; }
+
+  /// True if the given kind of safe point is required. By default, none are
+  /// recorded.
+  bool needsSafePoint(GC::PointKind Kind) const {
+    return (NeededSafePoints & 1 << Kind) != 0;
+  }
+
+  /// By default, roots are left for the code generator so it can generate a
+  /// stack map. If true, you must provide a custom pass to lower 
+  /// calls to @llvm.gcroot.
+  bool customRoots() const { return CustomRoots; }
+
+  /// If set, gcroot intrinsics should initialize their allocas to null
+  /// before the first use. This is necessary for most GCs and is enabled by
+  /// default.
+  bool initializeRoots() const { return InitRoots; }
+
+  /// If set, appropriate metadata tables must be emitted by the back-end
+  /// (assembler, JIT, or otherwise). For statepoint, this method is
+  /// currently unsupported.  The stackmap information can be found in the
+  /// StackMap section as described in the documentation.
+  bool usesMetadata() const { return UsesMetadata; }
+
+  ///@}
+};
+
+/// Subclasses of GCStrategy are made available for use during compilation by
+/// adding them to the global GCRegistry.  This can done either within the
+/// LLVM source tree or via a loadable plugin.  An example registeration
+/// would be:
+/// static GCRegistry::Add<CustomGC> X("custom-name",
+///        "my custom supper fancy gc strategy");
+///
+/// Note that to use a custom GCMetadataPrinter w/gc.roots, you must also
+/// register your GCMetadataPrinter subclass with the
+/// GCMetadataPrinterRegistery as well.
+typedef Registry<GCStrategy> GCRegistry;
 }
 
 #endif
diff --git a/include/llvm/CodeGen/GCs.h b/include/llvm/CodeGen/GCs.h
index bb170c8..5bae41e 100644
--- a/include/llvm/CodeGen/GCs.h
+++ b/include/llvm/CodeGen/GCs.h
@@ -15,27 +15,29 @@
 #define LLVM_CODEGEN_GCS_H
 
 namespace llvm {
-  class GCStrategy;
-  class GCMetadataPrinter;
+class GCStrategy;
+class GCMetadataPrinter;
 
-  /// FIXME: Collector instances are not useful on their own. These no longer
-  ///        serve any purpose except to link in the plugins.
+/// FIXME: Collector instances are not useful on their own. These no longer
+///        serve any purpose except to link in the plugins.
 
-  /// Creates an ocaml-compatible garbage collector.
-  void linkOcamlGC();
+/// Creates an ocaml-compatible garbage collector.
+void linkOcamlGC();
 
-  /// Creates an ocaml-compatible metadata printer.
-  void linkOcamlGCPrinter();
+/// Creates an ocaml-compatible metadata printer.
+void linkOcamlGCPrinter();
 
-  /// Creates an erlang-compatible garbage collector.
-  void linkErlangGC();
+/// Creates an erlang-compatible garbage collector.
+void linkErlangGC();
 
-  /// Creates an erlang-compatible metadata printer.
-  void linkErlangGCPrinter();
+/// Creates an erlang-compatible metadata printer.
+void linkErlangGCPrinter();
 
-  /// Creates a shadow stack garbage collector. This collector requires no code
-  /// generator support.
-  void linkShadowStackGC();
+/// Creates a shadow stack garbage collector. This collector requires no code
+/// generator support.
+void linkShadowStackGC();
+
+void linkStatepointExampleGC();
 }
 
 #endif
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index bbf0ad3..8a31f7e 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -72,6 +72,11 @@ namespace ISD {
     /// the parent's frame or return address, and so on.
     FRAMEADDR, RETURNADDR,
 
+    /// FRAME_ALLOC_RECOVER - Represents the llvm.framerecover
+    /// intrinsic. Materializes the offset from the frame pointer of another
+    /// function to the result of llvm.frameallocate.
+    FRAME_ALLOC_RECOVER,
+
     /// READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on
     /// the DAG, which implements the named register global variables extension.
     READ_REGISTER,
@@ -224,7 +229,14 @@ namespace ISD {
     SMULO, UMULO,
 
     /// Simple binary floating point operators.
-    FADD, FSUB, FMUL, FMA, FDIV, FREM,
+    FADD, FSUB, FMUL, FDIV, FREM,
+
+    /// FMA - Perform a * b + c with no intermediate rounding step.
+    FMA,
+
+    /// FMAD - Perform a * b + c, while getting the same result as the
+    /// separately rounded operations.
+    FMAD,
 
     /// FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.  NOTE: This
     /// DAG node does not require that X and Y have the same type, just that the
@@ -675,6 +687,11 @@ namespace ISD {
     ATOMIC_LOAD_UMIN,
     ATOMIC_LOAD_UMAX,
 
+    // Masked load and store
+    MLOAD, MSTORE,
+    // Masked gather and scatter
+    MGATHER, MSCATTER,
+
     /// This corresponds to the llvm.lifetime.* intrinsics. The first operand
     /// is the chain and the second operand is the alloca pointer.
     LIFETIME_START, LIFETIME_END,
@@ -688,7 +705,7 @@ namespace ISD {
   /// which do not reference a specific memory location should be less than
   /// this value. Those that do must not be less than this value, and can
   /// be used with SelectionDAG::getMemIntrinsicNode.
-  static const int FIRST_TARGET_MEMORY_OPCODE = BUILTIN_OP_END+180;
+  static const int FIRST_TARGET_MEMORY_OPCODE = BUILTIN_OP_END+200;
 
   //===--------------------------------------------------------------------===//
   /// MemIndexedMode enum - This enum defines the load / store indexed
@@ -745,7 +762,7 @@ namespace ISD {
     LAST_LOADEXT_TYPE
   };
 
-  NodeType getExtForLoadExtType(LoadExtType);
+  NodeType getExtForLoadExtType(bool IsFP, LoadExtType);
 
   //===--------------------------------------------------------------------===//
   /// ISD::CondCode enum - These are ordered carefully to make the bitfields
diff --git a/include/llvm/CodeGen/LexicalScopes.h b/include/llvm/CodeGen/LexicalScopes.h
index 021fd98..b3a8405 100644
--- a/include/llvm/CodeGen/LexicalScopes.h
+++ b/include/llvm/CodeGen/LexicalScopes.h
@@ -19,14 +19,14 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/ValueHandle.h"
-#include <utility>
 #include <unordered_map>
+#include <utility>
 namespace llvm {
 
 class MachineInstr;
@@ -48,6 +48,8 @@ public:
   LexicalScope(LexicalScope *P, const MDNode *D, const MDNode *I, bool A)
       : Parent(P), Desc(D), InlinedAtLocation(I), AbstractScope(A),
         LastInsn(nullptr), FirstInsn(nullptr), DFSIn(0), DFSOut(0) {
+    assert((!D || D->isResolved()) && "Expected resolved node");
+    assert((!I || I->isResolved()) && "Expected resolved node");
     if (Parent)
       Parent->addChild(this);
   }
@@ -116,8 +118,8 @@ public:
 
 private:
   LexicalScope *Parent;                        // Parent to this scope.
-  AssertingVH<const MDNode> Desc;              // Debug info descriptor.
-  AssertingVH<const MDNode> InlinedAtLocation; // Location at which this
+  const MDNode *Desc;                          // Debug info descriptor.
+  const MDNode *InlinedAtLocation;             // Location at which this
                                                // scope is inlined.
   bool AbstractScope;                          // Abstract Scope
   SmallVector<LexicalScope *, 4> Children;     // Scopes defined in scope.
@@ -178,9 +180,11 @@ public:
     return I != AbstractScopeMap.end() ? &I->second : nullptr;
   }
 
-  /// findInlinedScope - Find an inlined scope for the given DebugLoc or return
-  /// NULL.
-  LexicalScope *findInlinedScope(DebugLoc DL);
+  /// findInlinedScope - Find an inlined scope for the given scope/inlined-at.
+  LexicalScope *findInlinedScope(const MDNode *N, const MDNode *IA) {
+    auto I = InlinedLexicalScopeMap.find(std::make_pair(N, IA));
+    return I != InlinedLexicalScopeMap.end() ? &I->second : nullptr;
+  }
 
   /// findLexicalScope - Find regular lexical scope or return null.
   LexicalScope *findLexicalScope(const MDNode *N) {
diff --git a/include/llvm/CodeGen/LinkAllCodegenComponents.h b/include/llvm/CodeGen/LinkAllCodegenComponents.h
index 372c294..e7ccbfa 100644
--- a/include/llvm/CodeGen/LinkAllCodegenComponents.h
+++ b/include/llvm/CodeGen/LinkAllCodegenComponents.h
@@ -39,6 +39,7 @@ namespace {
       llvm::linkOcamlGC();
       llvm::linkErlangGC();
       llvm::linkShadowStackGC();
+      llvm::linkStatepointExampleGC();
 
       (void) llvm::createBURRListDAGScheduler(nullptr,
                                               llvm::CodeGenOpt::Default);
diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h
index 6629e60..21634cb 100644
--- a/include/llvm/CodeGen/LiveInterval.h
+++ b/include/llvm/CodeGen/LiveInterval.h
@@ -27,6 +27,7 @@
 #include "llvm/Support/Allocator.h"
 #include <cassert>
 #include <climits>
+#include <set>
 
 namespace llvm {
   class CoalescerPair;
@@ -119,6 +120,12 @@ namespace llvm {
       return isDeadDef() ? nullptr : LateVal;
     }
 
+    /// Returns the value alive at the end of the instruction, if any. This can
+    /// be a live-through value, a live def or a dead def.
+    VNInfo *valueOutOrDead() const {
+      return LateVal;
+    }
+
     /// Return the value defined by this instruction, if any. This includes
     /// dead defs, it is the value created by the instruction's def operands.
     VNInfo *valueDefined() const {
@@ -188,6 +195,12 @@ namespace llvm {
     Segments segments;   // the liveness segments
     VNInfoList valnos;   // value#'s
 
+    // The segment set is used temporarily to accelerate initial computation
+    // of live ranges of physical registers in computeRegUnitRange.
+    // After that the set is flushed to the segment vector and deleted.
+    typedef std::set<Segment> SegmentSet;
+    SegmentSet *segmentSet;
+
     typedef Segments::iterator iterator;
     iterator begin() { return segments.begin(); }
     iterator end()   { return segments.end(); }
@@ -204,6 +217,31 @@ namespace llvm {
     const_vni_iterator vni_begin() const { return valnos.begin(); }
     const_vni_iterator vni_end() const   { return valnos.end(); }
 
+    /// Constructs a new LiveRange object.
+    LiveRange(bool UseSegmentSet = false) : segmentSet(nullptr) {
+      if (UseSegmentSet)
+        segmentSet = new SegmentSet();
+    }
+
+    /// Constructs a new LiveRange object by copying segments and valnos from
+    /// another LiveRange.
+    LiveRange(const LiveRange &Other, BumpPtrAllocator &Allocator)
+        : segmentSet(nullptr) {
+      assert(Other.segmentSet == nullptr &&
+             "Copying of LiveRanges with active SegmentSets is not supported");
+
+      // Duplicate valnos.
+      for (const VNInfo *VNI : Other.valnos) {
+        createValueCopy(VNI, Allocator);
+      }
+      // Now we can copy segments and remap their valnos.
+      for (const Segment &S : Other.segments) {
+        segments.push_back(Segment(S.start, S.end, valnos[S.valno->id]));
+      }
+    }
+
+    ~LiveRange() { delete segmentSet; }
+
     /// advanceTo - Advance the specified iterator to point to the Segment
     /// containing the specified position, or end() if the position is past the
     /// end of the range.  If no Segment contains this position, but the
@@ -217,6 +255,14 @@ namespace llvm {
       return I;
     }
 
+    const_iterator advanceTo(const_iterator I, SlotIndex Pos) const {
+      assert(I != end());
+      if (Pos >= endIndex())
+        return end();
+      while (I->end <= Pos) ++I;
+      return I;
+    }
+
     /// find - Return an iterator pointing to the first segment that ends after
     /// Pos, or end(). This is the same as advanceTo(begin(), Pos), but faster
     /// when searching large ranges.
@@ -397,17 +443,21 @@ namespace llvm {
     /// scanning the Other range starting at I.
     bool overlapsFrom(const LiveRange &Other, const_iterator I) const;
 
+    /// Returns true if all segments of the @p Other live range are completely
+    /// covered by this live range.
+    /// Adjacent live ranges do not affect the covering:the liverange
+    /// [1,5](5,10] covers (3,7].
+    bool covers(const LiveRange &Other) const;
+
     /// Add the specified Segment to this range, merging segments as
     /// appropriate.  This returns an iterator to the inserted segment (which
     /// may have grown since it was inserted).
-    iterator addSegment(Segment S) {
-      return addSegmentFrom(S, segments.begin());
-    }
+    iterator addSegment(Segment S);
 
-    /// extendInBlock - If this range is live before Kill in the basic block
-    /// that starts at StartIdx, extend it to be live up to Kill, and return
-    /// the value. If there is no segment before Kill, return NULL.
-    VNInfo *extendInBlock(SlotIndex StartIdx, SlotIndex Kill);
+    /// If this range is live before @p Use in the basic block that starts at
+    /// @p StartIdx, extend it to be live up to @p Use, and return the value. If
+    /// there is no segment before @p Use, return nullptr.
+    VNInfo *extendInBlock(SlotIndex StartIdx, SlotIndex Use);
 
     /// join - Join two live ranges (this, and other) together.  This applies
     /// mappings to the value numbers in the LHS/RHS ranges as specified.  If
@@ -435,6 +485,12 @@ namespace llvm {
       removeSegment(S.start, S.end, RemoveDeadValNo);
     }
 
+    /// Remove segment pointed to by iterator @p I from this range.  This does
+    /// not remove dead value numbers.
+    iterator removeSegment(iterator I) {
+      return segments.erase(I);
+    }
+
     /// Query Liveness at Idx.
     /// The sub-instruction slot of Idx doesn't matter, only the instruction
     /// it refers to is considered.
@@ -484,9 +540,9 @@ namespace llvm {
     /// Returns true if the live range is zero length, i.e. no live segments
     /// span instructions. It doesn't pay to spill such a range.
     bool isZeroLength(SlotIndexes *Indexes) const {
-      for (const_iterator i = begin(), e = end(); i != e; ++i)
-        if (Indexes->getNextNonNullIndex(i->start).getBaseIndex() <
-            i->end.getBaseIndex())
+      for (const Segment &S : segments)
+        if (Indexes->getNextNonNullIndex(S.start).getBaseIndex() <
+            S.end.getBaseIndex())
           return false;
       return true;
     }
@@ -497,6 +553,12 @@ namespace llvm {
       return thisIndex < otherIndex;
     }
 
+    /// Flush segment set into the regular segment vector.
+    /// The method is to be called after the live range
+    /// has been created, if use of the segment set was
+    /// activated in the constructor of the live range.
+    void flushSegmentSet();
+
     void print(raw_ostream &OS) const;
     void dump() const;
 
@@ -509,11 +571,13 @@ namespace llvm {
     void verify() const;
 #endif
 
-  private:
+  protected:
+    /// Append a segment to the list of segments.
+    void append(const LiveRange::Segment S);
 
-    iterator addSegmentFrom(Segment S, iterator From);
-    void extendSegmentEndTo(iterator I, SlotIndex NewEnd);
-    iterator extendSegmentStartTo(iterator I, SlotIndex NewStr);
+  private:
+    friend class LiveRangeUpdater;
+    void addSegmentToSet(Segment S);
     void markValNoForDeletion(VNInfo *V);
 
   };
@@ -529,11 +593,124 @@ namespace llvm {
   public:
     typedef LiveRange super;
 
+    /// A live range for subregisters. The LaneMask specifies which parts of the
+    /// super register are covered by the interval.
+    /// (@sa TargetRegisterInfo::getSubRegIndexLaneMask()).
+    class SubRange : public LiveRange {
+    public:
+      SubRange *Next;
+      unsigned LaneMask;
+
+      /// Constructs a new SubRange object.
+      SubRange(unsigned LaneMask)
+        : Next(nullptr), LaneMask(LaneMask) {
+      }
+
+      /// Constructs a new SubRange object by copying liveness from @p Other.
+      SubRange(unsigned LaneMask, const LiveRange &Other,
+               BumpPtrAllocator &Allocator)
+        : LiveRange(Other, Allocator), Next(nullptr), LaneMask(LaneMask) {
+      }
+    };
+
+  private:
+    SubRange *SubRanges; ///< Single linked list of subregister live ranges.
+
+  public:
     const unsigned reg;  // the register or stack slot of this interval.
     float weight;        // weight of this interval
 
     LiveInterval(unsigned Reg, float Weight)
-      : reg(Reg), weight(Weight) {}
+      : SubRanges(nullptr), reg(Reg), weight(Weight) {}
+
+    ~LiveInterval() {
+      clearSubRanges();
+    }
+
+    template<typename T>
+    class SingleLinkedListIterator {
+      T *P;
+    public:
+      SingleLinkedListIterator<T>(T *P) : P(P) {}
+      SingleLinkedListIterator<T> &operator++() {
+        P = P->Next;
+        return *this;
+      }
+      SingleLinkedListIterator<T> &operator++(int) {
+        SingleLinkedListIterator res = *this;
+        ++*this;
+        return res;
+      }
+      bool operator!=(const SingleLinkedListIterator<T> &Other) {
+        return P != Other.operator->();
+      }
+      bool operator==(const SingleLinkedListIterator<T> &Other) {
+        return P == Other.operator->();
+      }
+      T &operator*() const {
+        return *P;
+      }
+      T *operator->() const {
+        return P;
+      }
+    };
+
+    typedef SingleLinkedListIterator<SubRange> subrange_iterator;
+    subrange_iterator subrange_begin() {
+      return subrange_iterator(SubRanges);
+    }
+    subrange_iterator subrange_end() {
+      return subrange_iterator(nullptr);
+    }
+
+    typedef SingleLinkedListIterator<const SubRange> const_subrange_iterator;
+    const_subrange_iterator subrange_begin() const {
+      return const_subrange_iterator(SubRanges);
+    }
+    const_subrange_iterator subrange_end() const {
+      return const_subrange_iterator(nullptr);
+    }
+
+    iterator_range<subrange_iterator> subranges() {
+      return make_range(subrange_begin(), subrange_end());
+    }
+
+    iterator_range<const_subrange_iterator> subranges() const {
+      return make_range(subrange_begin(), subrange_end());
+    }
+
+    /// Creates a new empty subregister live range. The range is added at the
+    /// beginning of the subrange list; subrange iterators stay valid.
+    SubRange *createSubRange(BumpPtrAllocator &Allocator, unsigned LaneMask) {
+      SubRange *Range = new (Allocator) SubRange(LaneMask);
+      appendSubRange(Range);
+      return Range;
+    }
+
+    /// Like createSubRange() but the new range is filled with a copy of the
+    /// liveness information in @p CopyFrom.
+    SubRange *createSubRangeFrom(BumpPtrAllocator &Allocator, unsigned LaneMask,
+                                 const LiveRange &CopyFrom) {
+      SubRange *Range = new (Allocator) SubRange(LaneMask, CopyFrom, Allocator);
+      appendSubRange(Range);
+      return Range;
+    }
+
+    /// Returns true if subregister liveness information is available.
+    bool hasSubRanges() const {
+      return SubRanges != nullptr;
+    }
+
+    /// Removes all subregister liveness information.
+    void clearSubRanges();
+
+    /// Removes all subranges without any segments (subranges without segments
+    /// are not considered valid and should only exist temporarily).
+    void removeEmptySubRanges();
+
+    /// Construct main live range by merging the SubRanges of @p LI.
+    void constructMainRangeFromSubranges(const SlotIndexes &Indexes,
+                                         VNInfo::Allocator &VNIAllocator);
 
     /// getSize - Returns the sum of sizes of all the LiveRange's.
     ///
@@ -558,9 +735,26 @@ namespace llvm {
     void print(raw_ostream &OS) const;
     void dump() const;
 
+    /// \brief Walks the interval and assert if any invariants fail to hold.
+    ///
+    /// Note that this is a no-op when asserts are disabled.
+#ifdef NDEBUG
+    void verify(const MachineRegisterInfo *MRI = nullptr) const {}
+#else
+    void verify(const MachineRegisterInfo *MRI = nullptr) const;
+#endif
+
   private:
-    LiveInterval& operator=(const LiveInterval& rhs) LLVM_DELETED_FUNCTION;
+    LiveInterval& operator=(const LiveInterval& rhs) = delete;
+
+    /// Appends @p Range to SubRanges list.
+    void appendSubRange(SubRange *Range) {
+      Range->Next = SubRanges;
+      SubRanges = Range;
+    }
 
+    /// Free memory held by SubRange.
+    void freeSubRange(SubRange *S);
   };
 
   inline raw_ostream &operator<<(raw_ostream &OS, const LiveInterval &LI) {
diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h
index f9bd317..dc52c0a 100644
--- a/include/llvm/CodeGen/LiveIntervalAnalysis.h
+++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h
@@ -27,12 +27,15 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <cmath>
 #include <iterator>
 
 namespace llvm {
 
+extern cl::opt<bool> UseSegmentSetForPhysRegs;
+
   class AliasAnalysis;
   class BitVector;
   class BlockFrequency;
@@ -154,16 +157,11 @@ namespace llvm {
     bool shrinkToUses(LiveInterval *li,
                       SmallVectorImpl<MachineInstr*> *dead = nullptr);
 
-    /// \brief Walk the values in the given interval and compute which ones
-    /// are dead.  Dead values are not deleted, however:
-    /// - Dead PHIDef values are marked as unused.
-    /// - New dead machine instructions are added to the dead vector.
-    /// - CanSeparate is set to true if the interval may have been separated
-    ///   into multiple connected components.
-    void computeDeadValues(LiveInterval *li,
-                           LiveRange &LR,
-                           bool *CanSeparate,
-                           SmallVectorImpl<MachineInstr*> *dead);
+    /// Specialized version of
+    /// shrinkToUses(LiveInterval *li, SmallVectorImpl<MachineInstr*> *dead)
+    /// that works on a subregister live range and only looks at uses matching
+    /// the lane mask of the subregister range.
+    void shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg);
 
     /// extendToIndices - Extend the live range of LI to reach all points in
     /// Indices. The points in the Indices array must be jointly dominated by
@@ -175,14 +173,15 @@ namespace llvm {
     /// See also LiveRangeCalc::extend().
     void extendToIndices(LiveRange &LR, ArrayRef<SlotIndex> Indices);
 
-    /// pruneValue - If an LI value is live at Kill, prune its live range by
-    /// removing any liveness reachable from Kill. Add live range end points to
+
+    /// If @p LR has a live value at @p Kill, prune its live range by removing
+    /// any liveness reachable from Kill. Add live range end points to
     /// EndPoints such that extendToIndices(LI, EndPoints) will reconstruct the
     /// value's live range.
     ///
     /// Calling pruneValue() and extendToIndices() can be used to reconstruct
     /// SSA form after adding defs to a virtual register.
-    void pruneValue(LiveInterval *LI, SlotIndex Kill,
+    void pruneValue(LiveRange &LR, SlotIndex Kill,
                     SmallVectorImpl<SlotIndex> *EndPoints);
 
     SlotIndexes *getSlotIndexes() const {
@@ -381,7 +380,8 @@ namespace llvm {
       LiveRange *LR = RegUnitRanges[Unit];
       if (!LR) {
         // Compute missing ranges on demand.
-        RegUnitRanges[Unit] = LR = new LiveRange();
+        // Use segment set to speed-up initial computation of the live range.
+        RegUnitRanges[Unit] = LR = new LiveRange(UseSegmentSetForPhysRegs);
         computeRegUnitRange(*LR, Unit);
       }
       return *LR;
@@ -397,6 +397,15 @@ namespace llvm {
       return RegUnitRanges[Unit];
     }
 
+    /// Remove value numbers and related live segments starting at position
+    /// @p Pos that are part of any liverange of physical register @p Reg or one
+    /// of its subregisters.
+    void removePhysRegDefAt(unsigned Reg, SlotIndex Pos);
+
+    /// Remove value number and related live segments of @p LI and its subranges
+    /// that start at position @p Pos.
+    void removeVRegDefAt(LiveInterval &LI, SlotIndex Pos);
+
   private:
     /// Compute live intervals for all virtual registers.
     void computeVirtRegs();
@@ -404,6 +413,16 @@ namespace llvm {
     /// Compute RegMaskSlots and RegMaskBits.
     void computeRegMasks();
 
+    /// Walk the values in @p LI and check for dead values:
+    /// - Dead PHIDef values are marked as unused.
+    /// - Dead operands are marked as such.
+    /// - Completely dead machine instructions are added to the @p dead vector
+    ///   if it is not nullptr.
+    /// Returns true if any PHI value numbers have been removed which may
+    /// have separated the interval into multiple connected components.
+    bool computeDeadValues(LiveInterval &LI,
+                           SmallVectorImpl<MachineInstr*> *dead);
+
     static LiveInterval* createInterval(unsigned Reg);
 
     void printInstrs(raw_ostream &O) const;
@@ -413,6 +432,16 @@ namespace llvm {
     void computeRegUnitRange(LiveRange&, unsigned Unit);
     void computeVirtRegInterval(LiveInterval&);
 
+
+    /// Helper function for repairIntervalsInRange(), walks backwards and
+    /// creates/modifies live segments in @p LR to match the operands found.
+    /// Only full operands or operands with subregisters matching @p LaneMask
+    /// are considered.
+    void repairOldRegInRange(MachineBasicBlock::iterator Begin,
+                             MachineBasicBlock::iterator End,
+                             const SlotIndex endIdx, LiveRange &LR,
+                             unsigned Reg, unsigned LaneMask = ~0u);
+
     class HMEditor;
   };
 } // End llvm namespace
diff --git a/include/llvm/CodeGen/LiveIntervalUnion.h b/include/llvm/CodeGen/LiveIntervalUnion.h
index 2f40509..967f0cb 100644
--- a/include/llvm/CodeGen/LiveIntervalUnion.h
+++ b/include/llvm/CodeGen/LiveIntervalUnion.h
@@ -84,10 +84,16 @@ public:
   bool changedSince(unsigned tag) const { return tag != Tag; }
 
   // Add a live virtual register to this union and merge its segments.
-  void unify(LiveInterval &VirtReg);
+  void unify(LiveInterval &VirtReg, const LiveRange &Range);
+  void unify(LiveInterval &VirtReg) {
+    unify(VirtReg, VirtReg);
+  }
 
   // Remove a live virtual register's segments from this union.
-  void extract(LiveInterval &VirtReg);
+  void extract(LiveInterval &VirtReg, const LiveRange &Range);
+  void extract(LiveInterval &VirtReg) {
+    extract(VirtReg, VirtReg);
+  }
 
   // Remove all inserted virtual registers.
   void clear() { Segments.clear(); ++Tag; }
@@ -173,8 +179,8 @@ public:
     }
 
   private:
-    Query(const Query&) LLVM_DELETED_FUNCTION;
-    void operator=(const Query&) LLVM_DELETED_FUNCTION;
+    Query(const Query&) = delete;
+    void operator=(const Query&) = delete;
   };
 
   // Array of LiveIntervalUnions.
diff --git a/include/llvm/CodeGen/LivePhysRegs.h b/include/llvm/CodeGen/LivePhysRegs.h
index 91e4ddc..4aa53a9 100644
--- a/include/llvm/CodeGen/LivePhysRegs.h
+++ b/include/llvm/CodeGen/LivePhysRegs.h
@@ -44,8 +44,8 @@ class LivePhysRegs {
   const TargetRegisterInfo *TRI;
   SparseSet<unsigned> LiveRegs;
 
-  LivePhysRegs(const LivePhysRegs&) LLVM_DELETED_FUNCTION;
-  LivePhysRegs &operator=(const LivePhysRegs&) LLVM_DELETED_FUNCTION;
+  LivePhysRegs(const LivePhysRegs&) = delete;
+  LivePhysRegs &operator=(const LivePhysRegs&) = delete;
 public:
   /// \brief Constructs a new empty LivePhysRegs set.
   LivePhysRegs() : TRI(nullptr), LiveRegs() {}
diff --git a/include/llvm/CodeGen/MachineDominanceFrontier.h b/include/llvm/CodeGen/MachineDominanceFrontier.h
index e099e71..4131194 100644
--- a/include/llvm/CodeGen/MachineDominanceFrontier.h
+++ b/include/llvm/CodeGen/MachineDominanceFrontier.h
@@ -26,8 +26,8 @@ public:
   typedef DominanceFrontierBase<MachineBasicBlock>::iterator iterator;
   typedef DominanceFrontierBase<MachineBasicBlock>::const_iterator const_iterator;
 
-  void operator=(const MachineDominanceFrontier &) LLVM_DELETED_FUNCTION;
-  MachineDominanceFrontier(const MachineDominanceFrontier &) LLVM_DELETED_FUNCTION;
+  void operator=(const MachineDominanceFrontier &) = delete;
+  MachineDominanceFrontier(const MachineDominanceFrontier &) = delete;
 
   static char ID;
 
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 1e7fee6..6677360 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -516,6 +516,10 @@ public:
   /// on the stack.  Returns an index with a negative value.
   int CreateFixedSpillStackObject(uint64_t Size, int64_t SPOffset);
 
+  /// Allocates memory at a fixed, target-specific offset from the frame
+  /// pointer. Marks the function as having its frame address taken.
+  int CreateFrameAllocation(uint64_t Size);
+
   /// isFixedObjectIndex - Returns true if the specified index corresponds to a
   /// fixed stack object.
   bool isFixedObjectIndex(int ObjectIdx) const {
diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index 3271410..94610ca 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -72,6 +72,15 @@ private:
 /// MachineFunction is destroyed.
 struct MachineFunctionInfo {
   virtual ~MachineFunctionInfo();
+
+  /// \brief Factory function: default behavior is to call new using the
+  /// supplied allocator.
+  ///
+  /// This function can be overridden in a derive class.
+  template<typename Ty>
+  static Ty *create(BumpPtrAllocator &Allocator, MachineFunction &MF) {
+    return new (Allocator.Allocate<Ty>()) Ty(MF);
+  }
 };
 
 class MachineFunction {
@@ -136,8 +145,8 @@ class MachineFunction {
   /// True if the function includes any inline assembly.
   bool HasInlineAsm;
 
-  MachineFunction(const MachineFunction &) LLVM_DELETED_FUNCTION;
-  void operator=(const MachineFunction&) LLVM_DELETED_FUNCTION;
+  MachineFunction(const MachineFunction &) = delete;
+  void operator=(const MachineFunction&) = delete;
 public:
   MachineFunction(const Function *Fn, const TargetMachine &TM,
                   unsigned FunctionNum, MachineModuleInfo &MMI);
@@ -167,6 +176,13 @@ public:
   const TargetSubtargetInfo &getSubtarget() const { return *STI; }
   void setSubtarget(const TargetSubtargetInfo *ST) { STI = ST; }
 
+  /// getSubtarget - This method returns a pointer to the specified type of
+  /// TargetSubtargetInfo.  In debug builds, it verifies that the object being
+  /// returned is of the correct type.
+  template<typename STC> const STC &getSubtarget() const {
+    return *static_cast<const STC *>(STI);
+  }
+
   /// getRegInfo - Return information about the registers currently in use.
   ///
   MachineRegisterInfo &getRegInfo() { return *RegInfo; }
@@ -239,7 +255,7 @@ public:
   template<typename Ty>
   Ty *getInfo() {
     if (!MFInfo)
-      MFInfo = new (Allocator.Allocate<Ty>()) Ty(*this);
+      MFInfo = Ty::template create<Ty>(Allocator, *this);
     return static_cast<Ty*>(MFInfo);
   }
 
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index d20b45b..4ba4a97 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -93,10 +93,10 @@ private:
 
   DebugLoc debugLoc;                    // Source line information.
 
-  MachineInstr(const MachineInstr&) LLVM_DELETED_FUNCTION;
-  void operator=(const MachineInstr&) LLVM_DELETED_FUNCTION;
+  MachineInstr(const MachineInstr&) = delete;
+  void operator=(const MachineInstr&) = delete;
   // Use MachineFunction::DeleteMachineInstr() instead.
-  ~MachineInstr() LLVM_DELETED_FUNCTION;
+  ~MachineInstr() = delete;
 
   // Intrusive list support
   friend struct ilist_traits<MachineInstr>;
@@ -110,8 +110,8 @@ private:
   /// MachineInstr ctor - This constructor create a MachineInstr and add the
   /// implicit operands.  It reserves space for number of operands specified by
   /// MCInstrDesc.  An explicit DebugLoc is supplied.
-  MachineInstr(MachineFunction&, const MCInstrDesc &MCID,
-               const DebugLoc dl, bool NoImp = false);
+  MachineInstr(MachineFunction &, const MCInstrDesc &MCID, DebugLoc dl,
+               bool NoImp = false);
 
   // MachineInstrs are pool-allocated and owned by MachineFunction.
   friend class MachineFunction;
@@ -242,7 +242,7 @@ public:
 
   /// getDebugLoc - Returns the debug location id of this MachineInstr.
   ///
-  DebugLoc getDebugLoc() const { return debugLoc; }
+  const DebugLoc &getDebugLoc() const { return debugLoc; }
 
   /// \brief Return the debug variable referenced by
   /// this DBG_VALUE instruction.
@@ -1048,6 +1048,14 @@ public:
   bool addRegisterDead(unsigned Reg, const TargetRegisterInfo *RegInfo,
                        bool AddIfNotFound = false);
 
+  /// Clear all dead flags on operands defining register @p Reg.
+  void clearRegisterDeads(unsigned Reg);
+
+  /// Mark all subregister defs of register @p Reg with the undef flag.
+  /// This function is used when we determined to have a subregister def in an
+  /// otherwise undefined super register.
+  void addRegisterDefReadUndef(unsigned Reg);
+
   /// addRegisterDefined - We have determined MI defines a register. Make sure
   /// there is an operand defining Reg.
   void addRegisterDefined(unsigned Reg,
@@ -1139,7 +1147,10 @@ public:
   /// setDebugLoc - Replace current source information with new such.
   /// Avoid using this, the constructor argument is preferable.
   ///
-  void setDebugLoc(const DebugLoc dl) { debugLoc = dl; }
+  void setDebugLoc(DebugLoc dl) {
+    debugLoc = std::move(dl);
+    assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
+  }
 
   /// RemoveOperand - Erase an operand  from an instruction, leaving it with one
   /// fewer operand than it started with.
diff --git a/include/llvm/CodeGen/MachineLoopInfo.h b/include/llvm/CodeGen/MachineLoopInfo.h
index 4fbd46b..f7bcf45 100644
--- a/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/include/llvm/CodeGen/MachineLoopInfo.h
@@ -74,8 +74,8 @@ class MachineLoopInfo : public MachineFunctionPass {
   LoopInfoBase<MachineBasicBlock, MachineLoop> LI;
   friend class LoopBase<MachineBasicBlock, MachineLoop>;
 
-  void operator=(const MachineLoopInfo &) LLVM_DELETED_FUNCTION;
-  MachineLoopInfo(const MachineLoopInfo &) LLVM_DELETED_FUNCTION;
+  void operator=(const MachineLoopInfo &) = delete;
+  MachineLoopInfo(const MachineLoopInfo &) = delete;
 
 public:
   static char ID; // Pass identification, replacement for typeid
diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h
index 6653333..f171df2 100644
--- a/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/include/llvm/CodeGen/MachineModuleInfo.h
@@ -35,6 +35,7 @@
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/LibCallSemantics.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/ValueHandle.h"
@@ -66,6 +67,7 @@ struct LandingPadInfo {
   MachineBasicBlock *LandingPadBlock;    // Landing pad block.
   SmallVector<MCSymbol*, 1> BeginLabels; // Labels prior to invoke.
   SmallVector<MCSymbol*, 1> EndLabels;   // Labels after invoke.
+  SmallVector<MCSymbol*, 1> ClauseLabels; // Labels for each clause.
   MCSymbol *LandingPadLabel;             // Label at beginning of landing pad.
   const Function *Personality;           // Personality function.
   std::vector<int> TypeIds;              // List of type ids (filters negative)
@@ -161,14 +163,26 @@ class MachineModuleInfo : public ImmutablePass {
   /// to _fltused on Windows targets.
   bool UsesVAFloatArgument;
 
+  /// UsesMorestackAddr - True if the module calls the __morestack function
+  /// indirectly, as is required under the large code model on x86. This is used
+  /// to emit a definition of a symbol, __morestack_addr, containing the
+  /// address. See comments in lib/Target/X86/X86FrameLowering.cpp for more
+  /// details.
+  bool UsesMorestackAddr;
+
+  EHPersonality PersonalityTypeCache;
+
 public:
   static char ID; // Pass identification, replacement for typeid
 
   struct VariableDbgInfo {
-    TrackingVH<MDNode> Var;
-    TrackingVH<MDNode> Expr;
+    TrackingMDNodeRef Var;
+    TrackingMDNodeRef Expr;
     unsigned Slot;
     DebugLoc Loc;
+
+    VariableDbgInfo(MDNode *Var, MDNode *Expr, unsigned Slot, DebugLoc Loc)
+        : Var(Var), Expr(Expr), Slot(Slot), Loc(Loc) {}
   };
   typedef SmallVector<VariableDbgInfo, 4> VariableDbgInfoMapTy;
   VariableDbgInfoMapTy VariableDbgInfos;
@@ -231,6 +245,14 @@ public:
     UsesVAFloatArgument = b;
   }
 
+  bool usesMorestackAddr() const {
+    return UsesMorestackAddr;
+  }
+
+  void setUsesMorestackAddr(bool b) {
+    UsesMorestackAddr = b;
+  }
+
   /// \brief Returns a reference to a list of cfi instructions in the current
   /// function's prologue.  Used to construct frame maps for debug and exception
   /// handling comsumers.
@@ -312,6 +334,11 @@ public:
   ///
   void addCleanup(MachineBasicBlock *LandingPad);
 
+  /// Add a clause for a landing pad. Returns a new label for the clause. This
+  /// is used by EH schemes that have more than one landing pad. In this case,
+  /// each clause gets its own basic block.
+  MCSymbol *addClauseForLandingPad(MachineBasicBlock *LandingPad);
+
   /// getTypeIDFor - Return the type id for the specified typeinfo.  This is
   /// function wide.
   unsigned getTypeIDFor(const GlobalValue *TI);
@@ -389,12 +416,14 @@ public:
   /// of one is required to emit exception handling info.
   const Function *getPersonality() const;
 
+  /// Classify the personality function amongst known EH styles.
+  EHPersonality getPersonalityType();
+
   /// setVariableDbgInfo - Collect information used to emit debugging
   /// information of a variable.
   void setVariableDbgInfo(MDNode *Var, MDNode *Expr, unsigned Slot,
                           DebugLoc Loc) {
-    VariableDbgInfo Info = {Var, Expr, Slot, Loc};
-    VariableDbgInfos.push_back(std::move(Info));
+    VariableDbgInfos.emplace_back(Var, Expr, Slot, Loc);
   }
 
   VariableDbgInfoMapTy &getVariableDbgInfo() { return VariableDbgInfos; }
diff --git a/include/llvm/CodeGen/MachinePassRegistry.h b/include/llvm/CodeGen/MachinePassRegistry.h
index c962e68..57d1a6d 100644
--- a/include/llvm/CodeGen/MachinePassRegistry.h
+++ b/include/llvm/CodeGen/MachinePassRegistry.h
@@ -122,11 +122,12 @@ template<class RegistryClass>
 class RegisterPassParser : public MachinePassRegistryListener,
                    public cl::parser<typename RegistryClass::FunctionPassCtor> {
 public:
-  RegisterPassParser() {}
+  RegisterPassParser(cl::Option &O)
+      : cl::parser<typename RegistryClass::FunctionPassCtor>(O) {}
   ~RegisterPassParser() { RegistryClass::setListener(nullptr); }
 
-  void initialize(cl::Option &O) {
-    cl::parser<typename RegistryClass::FunctionPassCtor>::initialize(O);
+  void initialize() {
+    cl::parser<typename RegistryClass::FunctionPassCtor>::initialize();
 
     // Add existing passes to option.
     for (RegistryClass *Node = RegistryClass::getList();
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index 2e7f034..abb04de 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -52,6 +52,9 @@ private:
   /// accurate when after this flag is cleared.
   bool TracksLiveness;
 
+  /// True if subregister liveness is tracked.
+  bool TracksSubRegLiveness;
+
   /// VRegInfo - Information we keep for each virtual register.
   ///
   /// Each element in this list contains the register class of the vreg and the
@@ -120,8 +123,8 @@ private:
   /// second element.
   std::vector<std::pair<unsigned, unsigned> > LiveIns;
 
-  MachineRegisterInfo(const MachineRegisterInfo&) LLVM_DELETED_FUNCTION;
-  void operator=(const MachineRegisterInfo&) LLVM_DELETED_FUNCTION;
+  MachineRegisterInfo(const MachineRegisterInfo&) = delete;
+  void operator=(const MachineRegisterInfo&) = delete;
 public:
   explicit MachineRegisterInfo(const MachineFunction *MF);
 
@@ -179,6 +182,12 @@ public:
   /// information.
   void invalidateLiveness() { TracksLiveness = false; }
 
+  bool tracksSubRegLiveness() const { return TracksSubRegLiveness; }
+
+  void enableSubRegLiveness(bool Enable = true) {
+    TracksSubRegLiveness = Enable;
+  }
+
   //===--------------------------------------------------------------------===//
   // Register Info
   //===--------------------------------------------------------------------===//
@@ -584,7 +593,7 @@ public:
   /// virtual register, for example after removing instructions or splitting
   /// the live range.
   ///
-  bool recomputeRegClass(unsigned Reg, const TargetMachine&);
+  bool recomputeRegClass(unsigned Reg);
 
   /// createVirtualRegister - Create and return a new virtual register in the
   /// function with the specified register class.
@@ -768,6 +777,10 @@ public:
                         const TargetRegisterInfo &TRI,
                         const TargetInstrInfo &TII);
 
+  /// Returns a mask covering all bits that can appear in lane masks of
+  /// subregisters of the virtual register @p Reg.
+  unsigned getMaxLaneMaskForVReg(unsigned Reg) const;
+
   /// defusechain_iterator - This class provides iterator support for machine
   /// operands in the function that use or define a specific register.  If
   /// ReturnUses is true it returns uses of registers, if ReturnDefs is true it
diff --git a/include/llvm/CodeGen/MachineSSAUpdater.h b/include/llvm/CodeGen/MachineSSAUpdater.h
index 486a26e..5f988ad 100644
--- a/include/llvm/CodeGen/MachineSSAUpdater.h
+++ b/include/llvm/CodeGen/MachineSSAUpdater.h
@@ -107,8 +107,8 @@ public:
 private:
   unsigned GetValueAtEndOfBlockInternal(MachineBasicBlock *BB);
 
-  void operator=(const MachineSSAUpdater&) LLVM_DELETED_FUNCTION;
-  MachineSSAUpdater(const MachineSSAUpdater&) LLVM_DELETED_FUNCTION;
+  void operator=(const MachineSSAUpdater&) = delete;
+  MachineSSAUpdater(const MachineSSAUpdater&) = delete;
 };
 
 } // End llvm namespace
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index c5f66a8..a319401 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -80,7 +80,6 @@
 #include "llvm/CodeGen/MachinePassRegistry.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
-
 #include <memory>
 
 namespace llvm {
diff --git a/include/llvm/CodeGen/MachineValueType.h b/include/llvm/CodeGen/MachineValueType.h
index affacb0..7ad782f 100644
--- a/include/llvm/CodeGen/MachineValueType.h
+++ b/include/llvm/CodeGen/MachineValueType.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_CODEGEN_MACHINEVALUETYPE_H
 #define LLVM_CODEGEN_MACHINEVALUETYPE_H
 
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -118,6 +119,7 @@ namespace llvm {
                               // unspecified type.  The register class
                               // will be determined by the opcode.
 
+      FIRST_VALUETYPE = 0,    // This is always the beginning of the list.
       LAST_VALUETYPE =  58,   // This always remains at the end of the list.
 
       // This is the current maximum for LAST_VALUETYPE.
@@ -150,7 +152,11 @@ namespace llvm {
 
       // iPTR - An int value the size of the pointer of the current
       // target.  This should only be used internal to tblgen!
-      iPTR           = 255
+      iPTR           = 255,
+
+      // Any - Any type. This is used for intrinsics that have overloadings.
+      // This is only for tblgen's consumption!
+      Any            = 256
     };
 
     SimpleValueType SimpleTy;
@@ -165,6 +171,12 @@ namespace llvm {
     bool operator>=(const MVT& S) const { return SimpleTy >= S.SimpleTy; }
     bool operator<=(const MVT& S) const { return SimpleTy <= S.SimpleTy; }
 
+    /// isValid - Return true if this is a valid simple valuetype.
+    bool isValid() const {
+      return (SimpleTy >= MVT::FIRST_VALUETYPE &&
+              SimpleTy < MVT::LAST_VALUETYPE);
+    }
+
     /// isFloatingPoint - Return true if this is a FP, or a vector FP type.
     bool isFloatingPoint() const {
       return ((SimpleTy >= MVT::FIRST_FP_VALUETYPE &&
@@ -237,7 +249,8 @@ namespace llvm {
 
     /// isOverloaded - Return true if this is an overloaded type for TableGen.
     bool isOverloaded() const {
-      return (SimpleTy==MVT::iAny || SimpleTy==MVT::fAny ||
+      return (SimpleTy==MVT::Any  ||
+              SimpleTy==MVT::iAny || SimpleTy==MVT::fAny ||
               SimpleTy==MVT::vAny || SimpleTy==MVT::iPTRAny);
     }
 
@@ -372,6 +385,7 @@ namespace llvm {
       case iAny:
       case fAny:
       case vAny:
+      case Any:
         llvm_unreachable("Value type is overloaded.");
       case Metadata:
         llvm_unreachable("Value type is metadata.");
@@ -575,6 +589,52 @@ namespace llvm {
     /// returned as Other, otherwise they are invalid.
     static MVT getVT(Type *Ty, bool HandleUnknown = false);
 
+  private:
+    /// A simple iterator over the MVT::SimpleValueType enum.
+    struct mvt_iterator {
+      SimpleValueType VT;
+      mvt_iterator(SimpleValueType VT) : VT(VT) {}
+      MVT operator*() const { return VT; }
+      bool operator!=(const mvt_iterator &LHS) const { return VT != LHS.VT; }
+      mvt_iterator& operator++() {
+        VT = (MVT::SimpleValueType)((int)VT + 1);
+        assert((int)VT <= MVT::MAX_ALLOWED_VALUETYPE &&
+               "MVT iterator overflowed.");
+        return *this;
+      }
+    };
+    /// A range of the MVT::SimpleValueType enum.
+    typedef iterator_range<mvt_iterator> mvt_range;
+
+  public:
+    /// SimpleValueType Iteration
+    /// @{
+    static mvt_range all_valuetypes() {
+      return mvt_range(MVT::FIRST_VALUETYPE, MVT::LAST_VALUETYPE);
+    }
+    static mvt_range integer_valuetypes() {
+      return mvt_range(MVT::FIRST_INTEGER_VALUETYPE,
+                       (MVT::SimpleValueType)(MVT::LAST_INTEGER_VALUETYPE + 1));
+    }
+    static mvt_range fp_valuetypes() {
+      return mvt_range(MVT::FIRST_FP_VALUETYPE,
+                       (MVT::SimpleValueType)(MVT::LAST_FP_VALUETYPE + 1));
+    }
+    static mvt_range vector_valuetypes() {
+      return mvt_range(MVT::FIRST_VECTOR_VALUETYPE,
+                       (MVT::SimpleValueType)(MVT::LAST_VECTOR_VALUETYPE + 1));
+    }
+    static mvt_range integer_vector_valuetypes() {
+      return mvt_range(
+          MVT::FIRST_INTEGER_VECTOR_VALUETYPE,
+          (MVT::SimpleValueType)(MVT::LAST_INTEGER_VECTOR_VALUETYPE + 1));
+    }
+    static mvt_range fp_vector_valuetypes() {
+      return mvt_range(
+          MVT::FIRST_FP_VECTOR_VALUETYPE,
+          (MVT::SimpleValueType)(MVT::LAST_FP_VECTOR_VALUETYPE + 1));
+    }
+    /// @}
   };
 
 } // End llvm namespace
diff --git a/include/llvm/CodeGen/PBQP/Graph.h b/include/llvm/CodeGen/PBQP/Graph.h
index 4dc5674..efb723c 100644
--- a/include/llvm/CodeGen/PBQP/Graph.h
+++ b/include/llvm/CodeGen/PBQP/Graph.h
@@ -507,14 +507,14 @@ namespace PBQP {
       return getNode(NId).getAdjEdgeIds().size();
     }
 
-    /// @brief Set an edge's cost matrix.
+    /// @brief Update an edge's cost matrix.
     /// @param EId Edge id.
     /// @param Costs New cost matrix.
     template <typename OtherMatrixT>
-    void setEdgeCosts(EdgeId EId, OtherMatrixT Costs) {
+    void updateEdgeCosts(EdgeId EId, OtherMatrixT Costs) {
       MatrixPtr AllocatedCosts = CostAlloc.getMatrix(std::move(Costs));
       if (Solver)
-        Solver->handleSetEdgeCosts(EId, *AllocatedCosts);
+        Solver->handleUpdateCosts(EId, *AllocatedCosts);
       getEdge(EId).Costs = AllocatedCosts;
     }
 
@@ -548,14 +548,14 @@ namespace PBQP {
     /// @brief Get the first node connected to this edge.
     /// @param EId Edge id.
     /// @return The first node connected to the given edge.
-    NodeId getEdgeNode1Id(EdgeId EId) {
+    NodeId getEdgeNode1Id(EdgeId EId) const {
       return getEdge(EId).getN1Id();
     }
 
     /// @brief Get the second node connected to this edge.
     /// @param EId Edge id.
     /// @return The second node connected to the given edge.
-    NodeId getEdgeNode2Id(EdgeId EId) {
+    NodeId getEdgeNode2Id(EdgeId EId) const {
       return getEdge(EId).getN2Id();
     }
 
@@ -672,69 +672,6 @@ namespace PBQP {
       Edges.clear();
       FreeEdgeIds.clear();
     }
-
-    /// @brief Dump a graph to an output stream.
-    template <typename OStream>
-    void dumpToStream(OStream &OS) {
-      OS << nodeIds().size() << " " << edgeIds().size() << "\n";
-
-      for (auto NId : nodeIds()) {
-        const Vector& V = getNodeCosts(NId);
-        OS << "\n" << V.getLength() << "\n";
-        assert(V.getLength() != 0 && "Empty vector in graph.");
-        OS << V[0];
-        for (unsigned i = 1; i < V.getLength(); ++i) {
-          OS << " " << V[i];
-        }
-        OS << "\n";
-      }
-
-      for (auto EId : edgeIds()) {
-        NodeId N1Id = getEdgeNode1Id(EId);
-        NodeId N2Id = getEdgeNode2Id(EId);
-        assert(N1Id != N2Id && "PBQP graphs shound not have self-edges.");
-        const Matrix& M = getEdgeCosts(EId);
-        OS << "\n" << N1Id << " " << N2Id << "\n"
-           << M.getRows() << " " << M.getCols() << "\n";
-        assert(M.getRows() != 0 && "No rows in matrix.");
-        assert(M.getCols() != 0 && "No cols in matrix.");
-        for (unsigned i = 0; i < M.getRows(); ++i) {
-          OS << M[i][0];
-          for (unsigned j = 1; j < M.getCols(); ++j) {
-            OS << " " << M[i][j];
-          }
-          OS << "\n";
-        }
-      }
-    }
-
-    /// @brief Dump this graph to dbgs().
-    void dump() {
-      dumpToStream(dbgs());
-    }
-
-    /// @brief Print a representation of this graph in DOT format.
-    /// @param OS Output stream to print on.
-    template <typename OStream>
-    void printDot(OStream &OS) {
-      OS << "graph {\n";
-      for (auto NId : nodeIds()) {
-        OS << "  node" << NId << " [ label=\""
-           << NId << ": " << getNodeCosts(NId) << "\" ]\n";
-      }
-      OS << "  edge [ len=" << nodeIds().size() << " ]\n";
-      for (auto EId : edgeIds()) {
-        OS << "  node" << getEdgeNode1Id(EId)
-           << " -- node" << getEdgeNode2Id(EId)
-           << " [ label=\"";
-        const Matrix &EdgeCosts = getEdgeCosts(EId);
-        for (unsigned i = 0; i < EdgeCosts.getRows(); ++i) {
-          OS << EdgeCosts.getRowAsVector(i) << "\\n";
-        }
-        OS << "\" ]\n";
-      }
-      OS << "}\n";
-    }
   };
 
 }  // namespace PBQP
diff --git a/include/llvm/CodeGen/PBQP/ReductionRules.h b/include/llvm/CodeGen/PBQP/ReductionRules.h
index 21fde4d..d4a544b 100644
--- a/include/llvm/CodeGen/PBQP/ReductionRules.h
+++ b/include/llvm/CodeGen/PBQP/ReductionRules.h
@@ -132,9 +132,9 @@ namespace PBQP {
     } else {
       const Matrix &YZECosts = G.getEdgeCosts(YZEId);
       if (YNId == G.getEdgeNode1Id(YZEId)) {
-        G.setEdgeCosts(YZEId, Delta + YZECosts);
+        G.updateEdgeCosts(YZEId, Delta + YZECosts);
       } else {
-        G.setEdgeCosts(YZEId, Delta.transpose() + YZECosts);
+        G.updateEdgeCosts(YZEId, Delta.transpose() + YZECosts);
       }
     }
 
@@ -144,6 +144,25 @@ namespace PBQP {
     // TODO: Try to normalize newly added/modified edge.
   }
 
+#ifndef NDEBUG
+  // Does this Cost vector have any register options ?
+  template <typename VectorT>
+  bool hasRegisterOptions(const VectorT &V) {
+    unsigned VL = V.getLength();
+
+    // An empty or spill only cost vector does not provide any register option.
+    if (VL <= 1)
+      return false;
+
+    // If there are registers in the cost vector, but all of them have infinite
+    // costs, then ... there is no available register.
+    for (unsigned i = 1; i < VL; ++i)
+      if (V[i] != std::numeric_limits<PBQP::PBQPNum>::infinity())
+        return true;
+
+    return false;
+  }
+#endif
 
   // \brief Find a solution to a fully reduced graph by backpropagation.
   //
@@ -170,6 +189,15 @@ namespace PBQP {
 
       RawVector v = G.getNodeCosts(NId);
 
+#ifndef NDEBUG
+      // Although a conservatively allocatable node can be allocated to a register,
+      // spilling it may provide a lower cost solution. Assert here that spilling
+      // is done by choice, not because there were no register available.
+      if (G.getNodeMetadata(NId).wasConservativelyAllocatable())
+        assert(hasRegisterOptions(v) && "A conservatively allocatable node "
+                                        "must have available register options");
+#endif
+
       for (auto EId : G.adjEdgeIds(NId)) {
         const Matrix& edgeCosts = G.getEdgeCosts(EId);
         if (NId == G.getEdgeNode1Id(EId)) {
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index b672d9d..65b17d3 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -105,6 +105,7 @@ private:
   AnalysisID StopAfter;
   bool Started;
   bool Stopped;
+  bool AddingMachinePasses;
 
 protected:
   TargetMachine *TM;
@@ -213,7 +214,7 @@ public:
   ///
   /// This can also be used to plug a new MachineSchedStrategy into an instance
   /// of the standard ScheduleDAGMI:
-  ///   return new ScheduleDAGMI(C, new MyStrategy(C))
+  ///   return new ScheduleDAGMI(C, make_unique<MyStrategy>(C), /* IsPostRA= */false)
   ///
   /// Return NULL to select the default (generic) machine scheduler.
   virtual ScheduleDAGInstrs *
@@ -259,12 +260,9 @@ protected:
     return false;
   }
 
-  /// addPreRegAlloc - This method may be implemented by targets that want to
-  /// run passes immediately before register allocation. This should return
-  /// true if -print-machineinstrs should print after these passes.
-  virtual bool addPreRegAlloc() {
-    return false;
-  }
+  /// This method may be implemented by targets that want to run passes
+  /// immediately before register allocation.
+  virtual void addPreRegAlloc() { }
 
   /// createTargetRegisterAllocator - Create the register allocator pass for
   /// this target at the current optimization level.
@@ -290,24 +288,16 @@ protected:
     return false;
   }
 
-  /// addPostRegAlloc - This method may be implemented by targets that want to
-  /// run passes after register allocation pass pipeline but before
-  /// prolog-epilog insertion.  This should return true if -print-machineinstrs
-  /// should print after these passes.
-  virtual bool addPostRegAlloc() {
-    return false;
-  }
+  /// This method may be implemented by targets that want to run passes after
+  /// register allocation pass pipeline but before prolog-epilog insertion.
+  virtual void addPostRegAlloc() { }
 
   /// Add passes that optimize machine instructions after register allocation.
   virtual void addMachineLateOptimization();
 
-  /// addPreSched2 - This method may be implemented by targets that want to
-  /// run passes after prolog-epilog insertion and before the second instruction
-  /// scheduling pass.  This should return true if -print-machineinstrs should
-  /// print after these passes.
-  virtual bool addPreSched2() {
-    return false;
-  }
+  /// This method may be implemented by targets that want to run passes after
+  /// prolog-epilog insertion and before the second instruction scheduling pass.
+  virtual void addPreSched2() { }
 
   /// addGCPasses - Add late codegen passes that analyze code for garbage
   /// collection. This should return true if GC info should be printed after
@@ -317,24 +307,30 @@ protected:
   /// Add standard basic block placement passes.
   virtual void addBlockPlacement();
 
-  /// addPreEmitPass - This pass may be implemented by targets that want to run
-  /// passes immediately before machine code is emitted.  This should return
-  /// true if -print-machineinstrs should print out the code after the passes.
-  virtual bool addPreEmitPass() {
-    return false;
-  }
+  /// This pass may be implemented by targets that want to run passes
+  /// immediately before machine code is emitted.
+  virtual void addPreEmitPass() { }
 
   /// Utilities for targets to add passes to the pass manager.
   ///
 
   /// Add a CodeGen pass at this point in the pipeline after checking overrides.
   /// Return the pass that was added, or zero if no pass was added.
-  AnalysisID addPass(AnalysisID PassID);
+  /// @p printAfter    if true and adding a machine function pass add an extra
+  ///                  machine printer pass afterwards
+  /// @p verifyAfter   if true and adding a machine function pass add an extra
+  ///                  machine verification pass afterwards.
+  AnalysisID addPass(AnalysisID PassID, bool verifyAfter = true,
+                     bool printAfter = true);
 
   /// Add a pass to the PassManager if that pass is supposed to be run, as
   /// determined by the StartAfter and StopAfter options. Takes ownership of the
   /// pass.
-  void addPass(Pass *P);
+  /// @p printAfter    if true and adding a machine function pass add an extra
+  ///                  machine printer pass afterwards
+  /// @p verifyAfter   if true and adding a machine function pass add an extra
+  ///                  machine verification pass afterwards.
+  void addPass(Pass *P, bool verifyAfter = true, bool printAfter = true);
 
   /// addMachinePasses helper to create the target-selected or overriden
   /// regalloc pass.
@@ -343,7 +339,14 @@ protected:
   /// printAndVerify - Add a pass to dump then verify the machine function, if
   /// those steps are enabled.
   ///
-  void printAndVerify(const char *Banner);
+  void printAndVerify(const std::string &Banner);
+
+  /// Add a pass to print the machine function if printing is enabled.
+  void addPrintPass(const std::string &Banner);
+
+  /// Add a pass to perform basic verification of the machine function if
+  /// verification is enabled.
+  void addVerifyPass(const std::string &Banner);
 };
 } // namespace llvm
 
@@ -351,13 +354,6 @@ protected:
 namespace llvm {
   FunctionPass *createAtomicExpandPass(const TargetMachine *TM);
 
-  /// \brief Create a basic TargetTransformInfo analysis pass.
-  ///
-  /// This pass implements the target transform info analysis using the target
-  /// independent information available to the LLVM code generator.
-  ImmutablePass *
-  createBasicTargetTransformInfoPass(const TargetMachine *TM);
-
   /// createUnreachableBlockEliminationPass - The LLVM code generator does not
   /// work well with unreachable basic blocks (what live ranges make sense for a
   /// block that cannot be reached?).  As such, a code generator should either
@@ -514,11 +510,15 @@ namespace llvm {
   /// information.
   extern char &MachineBlockPlacementStatsID;
 
-  /// GCLowering Pass - Performs target-independent LLVM IR transformations for
-  /// highly portable strategies.
-  ///
+  /// GCLowering Pass - Used by gc.root to perform its default lowering
+  /// operations.
   FunctionPass *createGCLoweringPass();
 
+  /// ShadowStackGCLowering - Implements the custom lowering mechanism
+  /// used by the shadow stack GC.  Only runs on functions which opt in to
+  /// the shadow stack collector.
+  FunctionPass *createShadowStackGCLoweringPass();
+
   /// GCMachineCodeAnalysis - Target-independent pass to mark safe points
   /// in machine code. Must be added very late during code generation, just
   /// prior to output, and importantly after all CFG transformations (such as
@@ -560,12 +560,16 @@ namespace llvm {
   /// createMachineVerifierPass - This pass verifies cenerated machine code
   /// instructions for correctness.
   ///
-  FunctionPass *createMachineVerifierPass(const char *Banner = nullptr);
+  FunctionPass *createMachineVerifierPass(const std::string& Banner);
 
   /// createDwarfEHPass - This pass mulches exception handling code into a form
   /// adapted to code generation.  Required if using dwarf exception handling.
   FunctionPass *createDwarfEHPass(const TargetMachine *TM);
 
+  /// createWinEHPass - Prepares personality functions used by MSVC on Windows,
+  /// in addition to the Itanium LSDA based personalities.
+  FunctionPass *createWinEHPass(const TargetMachine *TM);
+
   /// createSjLjEHPreparePass - This pass adapts exception handling code to use
   /// the GCC-style builtin setjmp/longjmp (sjlj) to handling EH control flow.
   ///
diff --git a/include/llvm/CodeGen/RegAllocPBQP.h b/include/llvm/CodeGen/RegAllocPBQP.h
index 540af08..c7bb07b 100644
--- a/include/llvm/CodeGen/RegAllocPBQP.h
+++ b/include/llvm/CodeGen/RegAllocPBQP.h
@@ -17,12 +17,15 @@
 #define LLVM_CODEGEN_REGALLOCPBQP_H
 
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/PBQPRAConstraint.h"
 #include "llvm/CodeGen/PBQP/CostAllocator.h"
 #include "llvm/CodeGen/PBQP/ReductionRules.h"
+#include "llvm/CodeGen/PBQPRAConstraint.h"
 #include "llvm/Support/ErrorHandling.h"
 
 namespace llvm {
+
+class raw_ostream;
+
 namespace PBQP {
 namespace RegAlloc {
 
@@ -177,23 +180,38 @@ class NodeMetadata {
 public:
   typedef RegAlloc::AllowedRegVector AllowedRegVector;
 
-  typedef enum { Unprocessed,
-                 OptimallyReducible,
-                 ConservativelyAllocatable,
-                 NotProvablyAllocatable } ReductionState;
+  // The node's reduction state. The order in this enum is important,
+  // as it is assumed nodes can only progress up (i.e. towards being
+  // optimally reducible) when reducing the graph.
+  typedef enum {
+    Unprocessed,
+    NotProvablyAllocatable,
+    ConservativelyAllocatable,
+    OptimallyReducible
+  } ReductionState;
 
   NodeMetadata()
     : RS(Unprocessed), NumOpts(0), DeniedOpts(0), OptUnsafeEdges(nullptr),
-      VReg(0) {}
+      VReg(0)
+#ifndef NDEBUG
+      , everConservativelyAllocatable(false)
+#endif
+      {}
 
   // FIXME: Re-implementing default behavior to work around MSVC. Remove once
   // MSVC synthesizes move constructors properly.
   NodeMetadata(const NodeMetadata &Other)
     : RS(Other.RS), NumOpts(Other.NumOpts), DeniedOpts(Other.DeniedOpts),
       OptUnsafeEdges(new unsigned[NumOpts]), VReg(Other.VReg),
-      AllowedRegs(Other.AllowedRegs) {
-    std::copy(&Other.OptUnsafeEdges[0], &Other.OptUnsafeEdges[NumOpts],
-              &OptUnsafeEdges[0]);
+      AllowedRegs(Other.AllowedRegs)
+#ifndef NDEBUG
+      , everConservativelyAllocatable(Other.everConservativelyAllocatable)
+#endif
+  {
+    if (NumOpts > 0) {
+      std::copy(&Other.OptUnsafeEdges[0], &Other.OptUnsafeEdges[NumOpts],
+                &OptUnsafeEdges[0]);
+    }
   }
 
   // FIXME: Re-implementing default behavior to work around MSVC. Remove once
@@ -201,7 +219,11 @@ public:
   NodeMetadata(NodeMetadata &&Other)
     : RS(Other.RS), NumOpts(Other.NumOpts), DeniedOpts(Other.DeniedOpts),
       OptUnsafeEdges(std::move(Other.OptUnsafeEdges)), VReg(Other.VReg),
-      AllowedRegs(std::move(Other.AllowedRegs)) {}
+      AllowedRegs(std::move(Other.AllowedRegs))
+#ifndef NDEBUG
+      , everConservativelyAllocatable(Other.everConservativelyAllocatable)
+#endif
+  {}
 
   // FIXME: Re-implementing default behavior to work around MSVC. Remove once
   // MSVC synthesizes move constructors properly.
@@ -214,6 +236,9 @@ public:
               OptUnsafeEdges.get());
     VReg = Other.VReg;
     AllowedRegs = Other.AllowedRegs;
+#ifndef NDEBUG
+    everConservativelyAllocatable = Other.everConservativelyAllocatable;
+#endif
     return *this;
   }
 
@@ -226,6 +251,9 @@ public:
     OptUnsafeEdges = std::move(Other.OptUnsafeEdges);
     VReg = Other.VReg;
     AllowedRegs = std::move(Other.AllowedRegs);
+#ifndef NDEBUG
+    everConservativelyAllocatable = Other.everConservativelyAllocatable;
+#endif
     return *this;
   }
 
@@ -243,10 +271,21 @@ public:
   }
 
   ReductionState getReductionState() const { return RS; }
-  void setReductionState(ReductionState RS) { this->RS = RS; }
+  void setReductionState(ReductionState RS) {
+    assert(RS >= this->RS && "A node's reduction state can not be downgraded");
+    this->RS = RS;
+
+#ifndef NDEBUG
+    // Remember this state to assert later that a non-infinite register
+    // option was available.
+    if (RS == ConservativelyAllocatable)
+      everConservativelyAllocatable = true;
+#endif
+  }
+
 
   void handleAddEdge(const MatrixMetadata& MD, bool Transpose) {
-    DeniedOpts += Transpose ? MD.getWorstCol() : MD.getWorstRow();
+    DeniedOpts += Transpose ? MD.getWorstRow() : MD.getWorstCol();
     const bool* UnsafeOpts =
       Transpose ? MD.getUnsafeCols() : MD.getUnsafeRows();
     for (unsigned i = 0; i < NumOpts; ++i)
@@ -254,7 +293,7 @@ public:
   }
 
   void handleRemoveEdge(const MatrixMetadata& MD, bool Transpose) {
-    DeniedOpts -= Transpose ? MD.getWorstCol() : MD.getWorstRow();
+    DeniedOpts -= Transpose ? MD.getWorstRow() : MD.getWorstCol();
     const bool* UnsafeOpts =
       Transpose ? MD.getUnsafeCols() : MD.getUnsafeRows();
     for (unsigned i = 0; i < NumOpts; ++i)
@@ -267,6 +306,12 @@ public:
        &OptUnsafeEdges[NumOpts]);
   }
 
+#ifndef NDEBUG
+  bool wasConservativelyAllocatable() const {
+    return everConservativelyAllocatable;
+  }
+#endif
+
 private:
   ReductionState RS;
   unsigned NumOpts;
@@ -274,6 +319,10 @@ private:
   std::unique_ptr<unsigned[]> OptUnsafeEdges;
   unsigned VReg;
   GraphMetadata::AllowedRegVecRef AllowedRegs;
+
+#ifndef NDEBUG
+  bool everConservativelyAllocatable;
+#endif
 };
 
 class RegAllocSolverImpl {
@@ -307,6 +356,8 @@ public:
   }
 
   void handleAddNode(NodeId NId) {
+    assert(G.getNodeCosts(NId).getLength() > 1 &&
+           "PBQP Graph should not contain single or zero-option nodes");
     G.getNodeMetadata(NId).setup(G.getNodeCosts(NId));
   }
   void handleRemoveNode(NodeId NId) {}
@@ -326,15 +377,7 @@ public:
     NodeMetadata& NMd = G.getNodeMetadata(NId);
     const MatrixMetadata& MMd = G.getEdgeCosts(EId).getMetadata();
     NMd.handleRemoveEdge(MMd, NId == G.getEdgeNode2Id(EId));
-    if (G.getNodeDegree(NId) == 3) {
-      // This node is becoming optimally reducible.
-      moveToOptimallyReducibleNodes(NId);
-    } else if (NMd.getReductionState() ==
-               NodeMetadata::NotProvablyAllocatable &&
-               NMd.isConservativelyAllocatable()) {
-      // This node just became conservatively allocatable.
-      moveToConservativelyAllocatableNodes(NId);
-    }
+    promote(NId, NMd);
   }
 
   void handleReconnectEdge(EdgeId EId, NodeId NId) {
@@ -343,20 +386,44 @@ public:
     NMd.handleAddEdge(MMd, NId == G.getEdgeNode2Id(EId));
   }
 
-  void handleSetEdgeCosts(EdgeId EId, const Matrix& NewCosts) {
-    handleRemoveEdge(EId);
-
+  void handleUpdateCosts(EdgeId EId, const Matrix& NewCosts) {
     NodeId N1Id = G.getEdgeNode1Id(EId);
     NodeId N2Id = G.getEdgeNode2Id(EId);
     NodeMetadata& N1Md = G.getNodeMetadata(N1Id);
     NodeMetadata& N2Md = G.getNodeMetadata(N2Id);
+    bool Transpose = N1Id != G.getEdgeNode1Id(EId);
+
+    // Metadata are computed incrementally. First, update them
+    // by removing the old cost.
+    const MatrixMetadata& OldMMd = G.getEdgeCosts(EId).getMetadata();
+    N1Md.handleRemoveEdge(OldMMd, Transpose);
+    N2Md.handleRemoveEdge(OldMMd, !Transpose);
+
+    // And update now the metadata with the new cost.
     const MatrixMetadata& MMd = NewCosts.getMetadata();
-    N1Md.handleAddEdge(MMd, N1Id != G.getEdgeNode1Id(EId));
-    N2Md.handleAddEdge(MMd, N2Id != G.getEdgeNode1Id(EId));
+    N1Md.handleAddEdge(MMd, Transpose);
+    N2Md.handleAddEdge(MMd, !Transpose);
+
+    // As the metadata may have changed with the update, the nodes may have
+    // become ConservativelyAllocatable or OptimallyReducible.
+    promote(N1Id, N1Md);
+    promote(N2Id, N2Md);
   }
 
 private:
 
+  void promote(NodeId NId, NodeMetadata& NMd) {
+    if (G.getNodeDegree(NId) == 3) {
+      // This node is becoming optimally reducible.
+      moveToOptimallyReducibleNodes(NId);
+    } else if (NMd.getReductionState() ==
+               NodeMetadata::NotProvablyAllocatable &&
+               NMd.isConservativelyAllocatable()) {
+      // This node just became conservatively allocatable.
+      moveToConservativelyAllocatableNodes(NId);
+    }
+  }
+
   void removeFromCurrentSet(NodeId NId) {
     switch (G.getNodeMetadata(NId).getReductionState()) {
     case NodeMetadata::Unprocessed: break;
@@ -497,6 +564,17 @@ private:
   typedef PBQP::Graph<RegAllocSolverImpl> BaseT;
 public:
   PBQPRAGraph(GraphMetadata Metadata) : BaseT(Metadata) {}
+
+  /// @brief Dump this graph to dbgs().
+  void dump() const;
+
+  /// @brief Dump this graph to an output stream.
+  /// @param OS Output stream to print on.
+  void dump(raw_ostream &OS) const;
+
+  /// @brief Print a representation of this graph in DOT format.
+  /// @param OS Output stream to print on.
+  void printDot(raw_ostream &OS) const;
 };
 
 inline Solution solve(PBQPRAGraph& G) {
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index 5a65d59..80aee8c 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -190,6 +190,12 @@ namespace llvm {
       return getKind() == Order && Contents.OrdKind == Barrier;
     }
 
+    /// isNormalMemoryOrBarrier - Test if this is could be any kind of memory
+    /// dependence.
+    bool isNormalMemoryOrBarrier() const {
+      return (isNormalMemory() || isBarrier());
+    }
+
     /// isMustAlias - Test if this is an Order dependence that is marked
     /// as "must alias", meaning that the SUnits at either end of the edge
     /// have a memory dependence on a known memory location.
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index fbdaf0d..dc1c80d 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -115,8 +115,8 @@ class SDDbgInfo {
   typedef DenseMap<const SDNode*, SmallVector<SDDbgValue*, 2> > DbgValMapType;
   DbgValMapType DbgValMap;
 
-  void operator=(const SDDbgInfo&) LLVM_DELETED_FUNCTION;
-  SDDbgInfo(const SDDbgInfo&) LLVM_DELETED_FUNCTION;
+  void operator=(const SDDbgInfo&) = delete;
+  SDDbgInfo(const SDDbgInfo&) = delete;
 public:
   SDDbgInfo() {}
 
@@ -262,8 +262,8 @@ private:
                               DenseSet<SDNode *> &visited,
                               int level, bool &printed);
 
-  void operator=(const SelectionDAG&) LLVM_DELETED_FUNCTION;
-  SelectionDAG(const SelectionDAG&) LLVM_DELETED_FUNCTION;
+  void operator=(const SelectionDAG&) = delete;
+  SelectionDAG(const SelectionDAG&) = delete;
 
 public:
   explicit SelectionDAG(const TargetMachine &TM, llvm::CodeGenOpt::Level);
@@ -866,6 +866,12 @@ public:
   SDValue getIndexedStore(SDValue OrigStoe, SDLoc dl, SDValue Base,
                            SDValue Offset, ISD::MemIndexedMode AM);
 
+  SDValue getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr,
+                        SDValue Mask, SDValue Src0, EVT MemVT,
+                        MachineMemOperand *MMO, ISD::LoadExtType);
+  SDValue getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
+                         SDValue Ptr, SDValue Mask, EVT MemVT,
+                         MachineMemOperand *MMO, bool IsTrunc);
   /// getSrcValue - Construct a node to track a Value* through the backend.
   SDValue getSrcValue(const Value *v);
 
diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h
index 2639402..d53e66d 100644
--- a/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/include/llvm/CodeGen/SelectionDAGISel.h
@@ -18,8 +18,8 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/BasicBlock.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Pass.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 namespace llvm {
   class FastISel;
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 4715827..0b6240f 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -19,7 +19,6 @@
 #ifndef LLVM_CODEGEN_SELECTIONDAGNODES_H
 #define LLVM_CODEGEN_SELECTIONDAGNODES_H
 
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/GraphTraits.h"
@@ -27,6 +26,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/ilist_node.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/ValueTypes.h"
@@ -145,7 +145,7 @@ public:
   bool operator<(const SDValue &O) const {
     return std::tie(Node, ResNo) < std::tie(O.Node, O.ResNo);
   }
-  LLVM_EXPLICIT operator bool() const {
+  explicit operator bool() const {
     return Node != nullptr;
   }
 
@@ -184,7 +184,7 @@ public:
   inline bool isTargetOpcode() const;
   inline bool isMachineOpcode() const;
   inline unsigned getMachineOpcode() const;
-  inline const DebugLoc getDebugLoc() const;
+  inline const DebugLoc &getDebugLoc() const;
   inline void dump() const;
   inline void dumpr() const;
 
@@ -259,8 +259,8 @@ class SDUse {
   /// this operand.
   SDUse **Prev, *Next;
 
-  SDUse(const SDUse &U) LLVM_DELETED_FUNCTION;
-  void operator=(const SDUse &U) LLVM_DELETED_FUNCTION;
+  SDUse(const SDUse &U) = delete;
+  void operator=(const SDUse &U) = delete;
 
 public:
   SDUse() : Val(), User(nullptr), Prev(nullptr), Next(nullptr) {}
@@ -476,11 +476,11 @@ public:
   void setIROrder(unsigned Order) { IROrder = Order; }
 
   /// getDebugLoc - Return the source location info.
-  const DebugLoc getDebugLoc() const { return debugLoc; }
+  const DebugLoc &getDebugLoc() const { return debugLoc; }
 
   /// setDebugLoc - Set source location info.  Try to avoid this, putting
   /// it in the constructor is preferable.
-  void setDebugLoc(const DebugLoc dl) { debugLoc = dl; }
+  void setDebugLoc(DebugLoc dl) { debugLoc = std::move(dl); }
 
   /// use_iterator - This class provides iterator support for SDUse
   /// operands that use a specific SDNode.
@@ -754,19 +754,20 @@ protected:
     return Ret;
   }
 
-  SDNode(unsigned Opc, unsigned Order, const DebugLoc dl, SDVTList VTs,
+  SDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
          ArrayRef<SDValue> Ops)
-    : NodeType(Opc), OperandsNeedDelete(true), HasDebugValue(false),
-      SubclassData(0), NodeId(-1),
-      OperandList(Ops.size() ? new SDUse[Ops.size()] : nullptr),
-      ValueList(VTs.VTs), UseList(nullptr),
-      NumOperands(Ops.size()), NumValues(VTs.NumVTs),
-      debugLoc(dl), IROrder(Order) {
+      : NodeType(Opc), OperandsNeedDelete(true), HasDebugValue(false),
+        SubclassData(0), NodeId(-1),
+        OperandList(Ops.size() ? new SDUse[Ops.size()] : nullptr),
+        ValueList(VTs.VTs), UseList(nullptr), NumOperands(Ops.size()),
+        NumValues(VTs.NumVTs), debugLoc(std::move(dl)), IROrder(Order) {
+    assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
     assert(NumOperands == Ops.size() &&
            "NumOperands wasn't wide enough for its operands!");
     assert(NumValues == VTs.NumVTs &&
            "NumValues wasn't wide enough for its operands!");
     for (unsigned i = 0; i != Ops.size(); ++i) {
+      assert(OperandList && "no operands available");
       OperandList[i].setUser(this);
       OperandList[i].setInitial(Ops[i]);
     }
@@ -775,11 +776,12 @@ protected:
 
   /// This constructor adds no operands itself; operands can be
   /// set later with InitOperands.
-  SDNode(unsigned Opc, unsigned Order, const DebugLoc dl, SDVTList VTs)
-    : NodeType(Opc), OperandsNeedDelete(false), HasDebugValue(false),
-      SubclassData(0), NodeId(-1), OperandList(nullptr), ValueList(VTs.VTs),
-      UseList(nullptr), NumOperands(0), NumValues(VTs.NumVTs), debugLoc(dl),
-      IROrder(Order) {
+  SDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs)
+      : NodeType(Opc), OperandsNeedDelete(false), HasDebugValue(false),
+        SubclassData(0), NodeId(-1), OperandList(nullptr), ValueList(VTs.VTs),
+        UseList(nullptr), NumOperands(0), NumValues(VTs.NumVTs),
+        debugLoc(std::move(dl)), IROrder(Order) {
+    assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
     assert(NumValues == VTs.NumVTs &&
            "NumValues wasn't wide enough for its operands!");
   }
@@ -942,7 +944,7 @@ inline bool SDValue::use_empty() const {
 inline bool SDValue::hasOneUse() const {
   return Node->hasNUsesOfValue(1, ResNo);
 }
-inline const DebugLoc SDValue::getDebugLoc() const {
+inline const DebugLoc &SDValue::getDebugLoc() const {
   return Node->getDebugLoc();
 }
 inline void SDValue::dump() const {
@@ -1177,6 +1179,8 @@ public:
            N->getOpcode() == ISD::ATOMIC_LOAD_UMAX    ||
            N->getOpcode() == ISD::ATOMIC_LOAD         ||
            N->getOpcode() == ISD::ATOMIC_STORE        ||
+           N->getOpcode() == ISD::MLOAD               ||
+           N->getOpcode() == ISD::MSTORE              ||
            N->isMemIntrinsic()                        ||
            N->isTargetMemoryOpcode();
   }
@@ -1413,6 +1417,12 @@ public:
   /// isNaN - Return true if the value is a NaN.
   bool isNaN() const { return Value->isNaN(); }
 
+  /// isInfinity - Return true if the value is an infinity
+  bool isInfinity() const { return Value->isInfinity(); }
+
+  /// isNegative - Return true if the value is negative.
+  bool isNegative() const { return Value->isNegative(); }
+
   /// isExactlyValue - We don't rely on operator== working on double values, as
   /// it returns true for things that are clearly not equal, like -0.0 and 0.0.
   /// As such, this method can be used to do an exact bit-for-bit comparison of
@@ -1601,7 +1611,7 @@ public:
 /// BUILD_VECTORs.
 class BuildVectorSDNode : public SDNode {
   // These are constructed as SDNodes and then cast to BuildVectorSDNodes.
-  explicit BuildVectorSDNode() LLVM_DELETED_FUNCTION;
+  explicit BuildVectorSDNode() = delete;
 public:
   /// isConstantSplat - Check if this is a constant splat, and if so, find the
   /// smallest element size that splats the vector.  If MinSplatBits is
@@ -1926,6 +1936,81 @@ public:
   }
 };
 
+/// MaskedLoadStoreSDNode - This is a base class is used to represent MLOAD and
+/// MSTORE nodes
+///
+class MaskedLoadStoreSDNode : public MemSDNode {
+  // Operands
+  SDUse Ops[4];
+public:
+  friend class SelectionDAG;
+  MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl,
+                   SDValue *Operands, unsigned numOperands, 
+                   SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+    : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
+    InitOperands(Ops, Operands, numOperands);
+  }
+
+  // In the both nodes address is Op1, mask is Op2:
+  // MaskedLoadSDNode (Chain, ptr, mask, src0), src0 is a passthru value
+  // MaskedStoreSDNode (Chain, ptr, mask, data)
+  // Mask is a vector of i1 elements
+  const SDValue &getBasePtr() const { return getOperand(1); }
+  const SDValue &getMask() const    { return getOperand(2); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::MLOAD ||
+           N->getOpcode() == ISD::MSTORE;
+  }
+};
+
+/// MaskedLoadSDNode - This class is used to represent an MLOAD node
+///
+class MaskedLoadSDNode : public MaskedLoadStoreSDNode {
+public:
+  friend class SelectionDAG;
+  MaskedLoadSDNode(unsigned Order, DebugLoc dl, SDValue *Operands,
+                   unsigned numOperands, SDVTList VTs, ISD::LoadExtType ETy,
+                   EVT MemVT, MachineMemOperand *MMO)
+    : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, Operands, numOperands,
+                            VTs, MemVT, MMO) {
+    SubclassData |= (unsigned short)ETy;
+  }
+
+  ISD::LoadExtType getExtensionType() const {
+    return ISD::LoadExtType(SubclassData & 3);
+  } 
+  const SDValue &getSrc0() const { return getOperand(3); }
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::MLOAD;
+  }
+};
+
+/// MaskedStoreSDNode - This class is used to represent an MSTORE node
+///
+class MaskedStoreSDNode : public MaskedLoadStoreSDNode {
+
+public:
+  friend class SelectionDAG;
+  MaskedStoreSDNode(unsigned Order, DebugLoc dl, SDValue *Operands,
+                    unsigned numOperands, SDVTList VTs, bool isTrunc, EVT MemVT,
+                    MachineMemOperand *MMO)
+    : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, Operands, numOperands,
+                            VTs, MemVT, MMO) {
+      SubclassData |= (unsigned short)isTrunc;
+  }
+  /// isTruncatingStore - Return true if the op does a truncation before store.
+  /// For integers this is the same as doing a TRUNCATE and storing the result.
+  /// For floats, it is the same as doing an FP_ROUND and storing the result.
+  bool isTruncatingStore() const { return SubclassData & 1; }
+
+  const SDValue &getValue() const { return getOperand(3); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::MSTORE;
+  }
+};
+
 /// MachineSDNode - An SDNode that represents everything that will be needed
 /// to construct a MachineInstr. These nodes are created during the
 /// instruction selection proper phase.
diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h
index 00bb22b..9d6d6f5 100644
--- a/include/llvm/CodeGen/SlotIndexes.h
+++ b/include/llvm/CodeGen/SlotIndexes.h
@@ -162,7 +162,7 @@ namespace llvm {
     }
 
     /// Return true for a valid index.
-    LLVM_EXPLICIT operator bool() const { return isValid(); }
+    explicit operator bool() const { return isValid(); }
 
     /// Print this index to the given raw_ostream.
     void print(raw_ostream &os) const;
diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h
index e343980..4e48afe 100644
--- a/include/llvm/CodeGen/StackMaps.h
+++ b/include/llvm/CodeGen/StackMaps.h
@@ -81,6 +81,52 @@ public:
   unsigned getNextScratchIdx(unsigned StartIdx = 0) const;
 };
 
+/// MI-level Statepoint operands
+///
+/// Statepoint operands take the form:
+///   <num call arguments>, <call target>, [call arguments],
+///   <StackMaps::ConstantOp>, <flags>,
+///   <StackMaps::ConstantOp>, <num other args>, [other args],
+///   [gc values]
+class StatepointOpers {
+private:
+  enum {
+    NCallArgsPos = 0,
+    CallTargetPos = 1
+  };
+
+public:
+  explicit StatepointOpers(const MachineInstr *MI):
+    MI(MI) { }
+
+  /// Get starting index of non call related arguments
+  /// (statepoint flags, vm state and gc state).
+  unsigned getVarIdx() const {
+    return MI->getOperand(NCallArgsPos).getImm() + 2;
+  }
+
+  /// Returns the index of the operand containing the number of non-gc non-call
+  /// arguments. 
+  unsigned getNumVMSArgsIdx() const {
+    return getVarIdx() + 3;
+  }
+
+  /// Returns the number of non-gc non-call arguments attached to the
+  /// statepoint.  Note that this is the number of arguments, not the number of
+  /// operands required to represent those arguments.
+  unsigned getNumVMSArgs() const {
+    return MI->getOperand(getNumVMSArgsIdx()).getImm();
+  }
+
+  /// Returns the target of the underlying call.
+  const MachineOperand &getCallTarget() const {
+    return MI->getOperand(CallTargetPos);
+  }
+
+private:
+  const MachineInstr *MI;
+};
+
 class StackMaps {
 public:
   struct Location {
@@ -132,6 +178,9 @@ public:
   /// \brief Generate a stackmap record for a patchpoint instruction.
   void recordPatchPoint(const MachineInstr &MI);
 
+  /// \brief Generate a stackmap record for a statepoint instruction.
+  void recordStatepoint(const MachineInstr &MI);
+
   /// If there is any stack map data, create a stack map section and serialize
   /// the map info into it. This clears the stack map data structures
   /// afterwards.
@@ -139,7 +188,6 @@ public:
 
 private:
   static const char *WSMP;
-
   typedef SmallVector<Location, 8> LocationVec;
   typedef SmallVector<LiveOutReg, 8> LiveOutVec;
   typedef MapVector<uint64_t, uint64_t> ConstantPool;
diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index 87f1401..348c634 100644
--- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -36,6 +36,8 @@ class TargetLoweringObjectFileELF : public TargetLoweringObjectFile {
   bool UseInitArray;
 
 public:
+  TargetLoweringObjectFileELF() : UseInitArray(false) {}
+
   virtual ~TargetLoweringObjectFileELF() {}
 
   void emitPersonalityValue(MCStreamer &Streamer, const TargetMachine &TM,
@@ -54,6 +56,13 @@ public:
                                         SectionKind Kind, Mangler &Mang,
                                         const TargetMachine &TM) const override;
 
+  const MCSection *
+  getSectionForJumpTable(const Function &F, Mangler &Mang,
+                         const TargetMachine &TM) const override;
+
+  bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference,
+                                           const Function &F) const override;
+
   /// Return an MCExpr to use for a reference to the specified type info global
   /// variable from exception handling information.
   const MCExpr *
@@ -89,8 +98,6 @@ public:
                        ArrayRef<Module::ModuleFlagEntry> ModuleFlags,
                        Mangler &Mang, const TargetMachine &TM) const override;
 
-  bool isSectionAtomizableBySymbols(const MCSection &Section) const override;
-
   const MCSection *
     SelectSectionForGlobal(const GlobalValue *GV,
                            SectionKind Kind, Mangler &Mang,
diff --git a/include/llvm/CodeGen/ValueTypes.td b/include/llvm/CodeGen/ValueTypes.td
index b5fa0e8..756262f 100644
--- a/include/llvm/CodeGen/ValueTypes.td
+++ b/include/llvm/CodeGen/ValueTypes.td
@@ -98,3 +98,6 @@ def iAny   : ValueType<0  , 254>;
 
 // Pseudo valuetype mapped to the current pointer size.
 def iPTR   : ValueType<0  , 255>;
+
+// Pseudo valuetype to represent "any type of any size".
+def Any    : ValueType<0  , 256>;
diff --git a/include/llvm/CodeGen/VirtRegMap.h b/include/llvm/CodeGen/VirtRegMap.h
index eceb875..d7e9209 100644
--- a/include/llvm/CodeGen/VirtRegMap.h
+++ b/include/llvm/CodeGen/VirtRegMap.h
@@ -63,8 +63,8 @@ namespace llvm {
     /// createSpillSlot - Allocate a spill slot for RC from MFI.
     unsigned createSpillSlot(const TargetRegisterClass *RC);
 
-    VirtRegMap(const VirtRegMap&) LLVM_DELETED_FUNCTION;
-    void operator=(const VirtRegMap&) LLVM_DELETED_FUNCTION;
+    VirtRegMap(const VirtRegMap&) = delete;
+    void operator=(const VirtRegMap&) = delete;
 
   public:
     static char ID;
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index 37696eb..86ae8bb 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -6,9 +6,6 @@
 /* Exported configuration */
 #include "llvm/Config/llvm-config.h"
 
-/* Patch version of the LLVM API */
-#cmakedefine LLVM_VERSION_PATCH ${LLVM_VERSION_PATCH}
-
 /* Bug report URL. */
 #define BUG_REPORT_URL "${BUG_REPORT_URL}"
 
@@ -18,6 +15,9 @@
 /* Define to enable crash overrides */
 #cmakedefine ENABLE_CRASH_OVERRIDES
 
+/* Define to disable C++ atexit */
+#cmakedefine DISABLE_LLVM_DYLIB_ATEXIT
+
 /* Define if position independent code is enabled */
 #cmakedefine ENABLE_PIC
 
@@ -52,6 +52,9 @@
    don't. */
 #cmakedefine01 HAVE_DECL_STRERROR_S
 
+/* Define to 1 if you have the DIA SDK installed, and to 0 if you don't. */
+#cmakedefine HAVE_DIA_SDK ${HAVE_DIA_SDK}
+
 /* Define to 1 if you have the <dirent.h> header file, and it defines `DIR'.
    */
 #cmakedefine HAVE_DIRENT_H ${HAVE_DIRENT_H}
@@ -219,6 +222,9 @@
 /* Define to 1 if you have the `malloc_zone_statistics' function. */
 #cmakedefine HAVE_MALLOC_ZONE_STATISTICS ${HAVE_MALLOC_ZONE_STATISTICS}
 
+/* Define to 1 if you have the `mallctl` function. */
+#cmakedefine HAVE_MALLCTL ${HAVE_MALLCTL}
+
 /* Define to 1 if you have the `mkdtemp' function. */
 #cmakedefine HAVE_MKDTEMP ${HAVE_MKDTEMP}
 
@@ -423,6 +429,9 @@
 /* Have host's __chkstk */
 #cmakedefine HAVE___CHKSTK ${HAVE___CHKSTK}
 
+/* Have host's __chkstk_ms */
+#cmakedefine HAVE___CHKSTK_MS ${HAVE___CHKSTK_MS}
+
 /* Have host's __cmpdi2 */
 #cmakedefine HAVE___CMPDI2 ${HAVE___CMPDI2}
 
@@ -459,6 +468,9 @@
 /* Have host's ___chkstk */
 #cmakedefine HAVE____CHKSTK ${HAVE____CHKSTK}
 
+/* Have host's ___chkstk_ms */
+#cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
+
 /* Define if we link Polly to the tools */
 #cmakedefine LINK_POLLY_INTO_TOOLS
 
@@ -518,9 +530,6 @@
 /* Type of 1st arg on ELM Callback */
 #cmakedefine WIN32_ELMCB_PCSTR ${WIN32_ELMCB_PCSTR}
 
-/* Define to empty if `const' does not conform to ANSI C. */
-#undef const
-
 /* Define to `int' if <sys/types.h> does not define. */
 #undef pid_t
 
@@ -542,7 +551,4 @@
 /* Define to 1 if you have the `_chsize_s' function. */
 #cmakedefine HAVE__CHSIZE_S ${HAVE__CHSIZE_S}
 
-/* Maximum path length */
-#cmakedefine MAXPATHLEN ${MAXPATHLEN}
-
 #endif
diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in
index 8fcf145..c317bb1 100644
--- a/include/llvm/Config/config.h.in
+++ b/include/llvm/Config/config.h.in
@@ -9,6 +9,9 @@
 /* Define if we have libxml2 */
 #undef CLANG_HAVE_LIBXML
 
+/* Multilib suffix for libdir. */
+#undef CLANG_LIBDIR_SUFFIX
+
 /* Relative directory for resource files */
 #undef CLANG_RESOURCE_DIR
 
@@ -64,6 +67,9 @@
    don't. */
 #undef HAVE_DECL_STRERROR_S
 
+/* Define to 1 if you have the DIA SDK installed, and to 0 if you don't. */
+#undef HAVE_DIA_SDK
+
 /* Define to 1 if you have the <dirent.h> header file, and it defines `DIR'.
    */
 #undef HAVE_DIRENT_H
@@ -204,6 +210,9 @@
 /* Define if mallinfo() is available on this platform. */
 #undef HAVE_MALLINFO
 
+/* Define if mallctl() is available on this plaform. */
+#undef HAVE_MALLCTL
+
 /* Define to 1 if you have the <malloc.h> header file. */
 #undef HAVE_MALLOC_H
 
@@ -417,6 +426,9 @@
 /* Have host's __chkstk */
 #undef HAVE___CHKSTK
 
+/* Have host's __chkstk_ms */
+#undef HAVE___CHKSTK_MS
+
 /* Have host's __cmpdi2 */
 #undef HAVE___CMPDI2
 
@@ -453,6 +465,9 @@
 /* Have host's ___chkstk */
 #undef HAVE____CHKSTK
 
+/* Have host's ___chkstk_ms */
+#undef HAVE____CHKSTK_MS
+
 /* Linker version detected at compile time. */
 #undef HOST_LINK_VERSION
 
diff --git a/include/llvm/Config/llvm-config.h.cmake b/include/llvm/Config/llvm-config.h.cmake
index 77201e6..d54003d 100644
--- a/include/llvm/Config/llvm-config.h.cmake
+++ b/include/llvm/Config/llvm-config.h.cmake
@@ -87,10 +87,13 @@
 #cmakedefine LLVM_USE_OPROFILE 1
 
 /* Major version of the LLVM API */
-#cmakedefine LLVM_VERSION_MAJOR ${LLVM_VERSION_MAJOR}
+#define LLVM_VERSION_MAJOR ${LLVM_VERSION_MAJOR}
 
 /* Minor version of the LLVM API */
-#cmakedefine LLVM_VERSION_MINOR ${LLVM_VERSION_MINOR}
+#define LLVM_VERSION_MINOR ${LLVM_VERSION_MINOR}
+
+/* Patch version of the LLVM API */
+#define LLVM_VERSION_PATCH ${LLVM_VERSION_PATCH}
 
 /* LLVM version string */
 #define LLVM_VERSION_STRING "${PACKAGE_VERSION}"
diff --git a/include/llvm/Config/llvm-config.h.in b/include/llvm/Config/llvm-config.h.in
index 2d6add7..25a9295 100644
--- a/include/llvm/Config/llvm-config.h.in
+++ b/include/llvm/Config/llvm-config.h.in
@@ -92,6 +92,9 @@
 /* Minor version of the LLVM API */
 #undef LLVM_VERSION_MINOR
 
+/* Patch version of the LLVM API */
+#undef LLVM_VERSION_PATCH
+
 /* LLVM version string */
 #undef LLVM_VERSION_STRING
 
diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h
deleted file mode 100644
index 3aa098d..0000000
--- a/include/llvm/DebugInfo/DIContext.h
+++ /dev/null
@@ -1,151 +0,0 @@
-//===-- DIContext.h ---------------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines DIContext, an abstract data structure that holds
-// debug information data.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_DICONTEXT_H
-#define LLVM_DEBUGINFO_DICONTEXT_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Object/RelocVisitor.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/DataTypes.h"
-
-#include <string>
-
-namespace llvm {
-
-class raw_ostream;
-
-/// DILineInfo - a format-neutral container for source line information.
-struct DILineInfo {
-  std::string FileName;
-  std::string FunctionName;
-  uint32_t Line;
-  uint32_t Column;
-
-  DILineInfo()
-      : FileName("<invalid>"), FunctionName("<invalid>"), Line(0), Column(0) {}
-
-  bool operator==(const DILineInfo &RHS) const {
-    return Line == RHS.Line && Column == RHS.Column &&
-           FileName == RHS.FileName && FunctionName == RHS.FunctionName;
-  }
-  bool operator!=(const DILineInfo &RHS) const {
-    return !(*this == RHS);
-  }
-};
-
-typedef SmallVector<std::pair<uint64_t, DILineInfo>, 16> DILineInfoTable;
-
-/// DIInliningInfo - a format-neutral container for inlined code description.
-class DIInliningInfo {
-  SmallVector<DILineInfo, 4> Frames;
- public:
-  DIInliningInfo() {}
-  DILineInfo getFrame(unsigned Index) const {
-    assert(Index < Frames.size());
-    return Frames[Index];
-  }
-  uint32_t getNumberOfFrames() const {
-    return Frames.size();
-  }
-  void addFrame(const DILineInfo &Frame) {
-    Frames.push_back(Frame);
-  }
-};
-
-/// A DINameKind is passed to name search methods to specify a
-/// preference regarding the type of name resolution the caller wants.
-enum class DINameKind { None, ShortName, LinkageName };
-
-/// DILineInfoSpecifier - controls which fields of DILineInfo container
-/// should be filled with data.
-struct DILineInfoSpecifier {
-  enum class FileLineInfoKind { None, Default, AbsoluteFilePath };
-  typedef DINameKind FunctionNameKind;
-
-  FileLineInfoKind FLIKind;
-  FunctionNameKind FNKind;
-
-  DILineInfoSpecifier(FileLineInfoKind FLIKind = FileLineInfoKind::Default,
-                      FunctionNameKind FNKind = FunctionNameKind::None)
-      : FLIKind(FLIKind), FNKind(FNKind) {}
-};
-
-/// Selects which debug sections get dumped.
-enum DIDumpType {
-  DIDT_Null,
-  DIDT_All,
-  DIDT_Abbrev,
-  DIDT_AbbrevDwo,
-  DIDT_Aranges,
-  DIDT_Frames,
-  DIDT_Info,
-  DIDT_InfoDwo,
-  DIDT_Types,
-  DIDT_TypesDwo,
-  DIDT_Line,
-  DIDT_LineDwo,
-  DIDT_Loc,
-  DIDT_LocDwo,
-  DIDT_Ranges,
-  DIDT_Pubnames,
-  DIDT_Pubtypes,
-  DIDT_GnuPubnames,
-  DIDT_GnuPubtypes,
-  DIDT_Str,
-  DIDT_StrDwo,
-  DIDT_StrOffsetsDwo,
-  DIDT_AppleNames,
-  DIDT_AppleTypes,
-  DIDT_AppleNamespaces,
-  DIDT_AppleObjC
-};
-
-// In place of applying the relocations to the data we've read from disk we use
-// a separate mapping table to the side and checking that at locations in the
-// dwarf where we expect relocated values. This adds a bit of complexity to the
-// dwarf parsing/extraction at the benefit of not allocating memory for the
-// entire size of the debug info sections.
-typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t> > RelocAddrMap;
-
-class DIContext {
-public:
-  enum DIContextKind {
-    CK_DWARF
-  };
-  DIContextKind getKind() const { return Kind; }
-
-  DIContext(DIContextKind K) : Kind(K) {}
-  virtual ~DIContext();
-
-  /// getDWARFContext - get a context for binary DWARF data.
-  static DIContext *getDWARFContext(const object::ObjectFile &Obj);
-
-  virtual void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All) = 0;
-
-  virtual DILineInfo getLineInfoForAddress(uint64_t Address,
-      DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0;
-  virtual DILineInfoTable getLineInfoForAddressRange(uint64_t Address,
-      uint64_t Size, DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0;
-  virtual DIInliningInfo getInliningInfoForAddress(uint64_t Address,
-      DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0;
-private:
-  const DIContextKind Kind;
-};
-
-}
-
-#endif
diff --git a/include/llvm/DebugInfo/DWARF/DIContext.h b/include/llvm/DebugInfo/DWARF/DIContext.h
new file mode 100644
index 0000000..622aa69
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DIContext.h
@@ -0,0 +1,150 @@
+//===-- DIContext.h ---------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines DIContext, an abstract data structure that holds
+// debug information data.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DICONTEXT_H
+#define LLVM_DEBUGINFO_DICONTEXT_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/RelocVisitor.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+
+class raw_ostream;
+
+/// DILineInfo - a format-neutral container for source line information.
+struct DILineInfo {
+  std::string FileName;
+  std::string FunctionName;
+  uint32_t Line;
+  uint32_t Column;
+
+  DILineInfo()
+      : FileName("<invalid>"), FunctionName("<invalid>"), Line(0), Column(0) {}
+
+  bool operator==(const DILineInfo &RHS) const {
+    return Line == RHS.Line && Column == RHS.Column &&
+           FileName == RHS.FileName && FunctionName == RHS.FunctionName;
+  }
+  bool operator!=(const DILineInfo &RHS) const {
+    return !(*this == RHS);
+  }
+};
+
+typedef SmallVector<std::pair<uint64_t, DILineInfo>, 16> DILineInfoTable;
+
+/// DIInliningInfo - a format-neutral container for inlined code description.
+class DIInliningInfo {
+  SmallVector<DILineInfo, 4> Frames;
+ public:
+  DIInliningInfo() {}
+  DILineInfo getFrame(unsigned Index) const {
+    assert(Index < Frames.size());
+    return Frames[Index];
+  }
+  uint32_t getNumberOfFrames() const {
+    return Frames.size();
+  }
+  void addFrame(const DILineInfo &Frame) {
+    Frames.push_back(Frame);
+  }
+};
+
+/// A DINameKind is passed to name search methods to specify a
+/// preference regarding the type of name resolution the caller wants.
+enum class DINameKind { None, ShortName, LinkageName };
+
+/// DILineInfoSpecifier - controls which fields of DILineInfo container
+/// should be filled with data.
+struct DILineInfoSpecifier {
+  enum class FileLineInfoKind { None, Default, AbsoluteFilePath };
+  typedef DINameKind FunctionNameKind;
+
+  FileLineInfoKind FLIKind;
+  FunctionNameKind FNKind;
+
+  DILineInfoSpecifier(FileLineInfoKind FLIKind = FileLineInfoKind::Default,
+                      FunctionNameKind FNKind = FunctionNameKind::None)
+      : FLIKind(FLIKind), FNKind(FNKind) {}
+};
+
+/// Selects which debug sections get dumped.
+enum DIDumpType {
+  DIDT_Null,
+  DIDT_All,
+  DIDT_Abbrev,
+  DIDT_AbbrevDwo,
+  DIDT_Aranges,
+  DIDT_Frames,
+  DIDT_Info,
+  DIDT_InfoDwo,
+  DIDT_Types,
+  DIDT_TypesDwo,
+  DIDT_Line,
+  DIDT_LineDwo,
+  DIDT_Loc,
+  DIDT_LocDwo,
+  DIDT_Ranges,
+  DIDT_Pubnames,
+  DIDT_Pubtypes,
+  DIDT_GnuPubnames,
+  DIDT_GnuPubtypes,
+  DIDT_Str,
+  DIDT_StrDwo,
+  DIDT_StrOffsetsDwo,
+  DIDT_AppleNames,
+  DIDT_AppleTypes,
+  DIDT_AppleNamespaces,
+  DIDT_AppleObjC
+};
+
+// In place of applying the relocations to the data we've read from disk we use
+// a separate mapping table to the side and checking that at locations in the
+// dwarf where we expect relocated values. This adds a bit of complexity to the
+// dwarf parsing/extraction at the benefit of not allocating memory for the
+// entire size of the debug info sections.
+typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t> > RelocAddrMap;
+
+class DIContext {
+public:
+  enum DIContextKind {
+    CK_DWARF
+  };
+  DIContextKind getKind() const { return Kind; }
+
+  DIContext(DIContextKind K) : Kind(K) {}
+  virtual ~DIContext();
+
+  /// getDWARFContext - get a context for binary DWARF data.
+  static DIContext *getDWARFContext(const object::ObjectFile &Obj);
+
+  virtual void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All) = 0;
+
+  virtual DILineInfo getLineInfoForAddress(uint64_t Address,
+      DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0;
+  virtual DILineInfoTable getLineInfoForAddressRange(uint64_t Address,
+      uint64_t Size, DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0;
+  virtual DIInliningInfo getInliningInfoForAddress(uint64_t Address,
+      DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0;
+private:
+  const DIContextKind Kind;
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFAbbreviationDeclaration.h b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h
index bb05c30..bb05c30 100644
--- a/lib/DebugInfo/DWARFAbbreviationDeclaration.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h
diff --git a/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h b/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
new file mode 100644
index 0000000..e34f096
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
@@ -0,0 +1,49 @@
+//===--- DWARFAcceleratorTable.h --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
+#include <cstdint>
+
+namespace llvm {
+
+class DWARFAcceleratorTable {
+
+  struct Header {
+    uint32_t Magic;
+    uint16_t Version;
+    uint16_t HashFunction;
+    uint32_t NumBuckets;
+    uint32_t NumHashes;
+    uint32_t HeaderDataLength;
+  };
+
+  struct HeaderData {
+    typedef uint16_t AtomType;
+    typedef uint16_t Form;
+    uint32_t DIEOffsetBase;
+    SmallVector<std::pair<AtomType, Form>, 3> Atoms;
+  };
+
+  struct Header Hdr;
+  struct HeaderData HdrData;
+  DataExtractor AccelSection;
+  DataExtractor StringSection;
+  const RelocAddrMap& Relocs;
+public:
+  DWARFAcceleratorTable(DataExtractor AccelSection, DataExtractor StringSection,
+                        const RelocAddrMap &Relocs)
+    : AccelSection(AccelSection), StringSection(StringSection), Relocs(Relocs) {}
+
+  bool extract();
+  void dump(raw_ostream &OS) const;
+};
+
+}
diff --git a/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
new file mode 100644
index 0000000..743f9c6
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
@@ -0,0 +1,31 @@
+//===-- DWARFCompileUnit.h --------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_DEBUGINFO_DWARFCOMPILEUNIT_H
+#define LLVM_LIB_DEBUGINFO_DWARFCOMPILEUNIT_H
+
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+
+namespace llvm {
+
+class DWARFCompileUnit : public DWARFUnit {
+public:
+  DWARFCompileUnit(DWARFContext &Context, const DWARFSection &Section,
+                   const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+                   StringRef SOS, StringRef AOS, bool LE,
+                   const DWARFUnitSectionBase &UnitSection)
+      : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LE, UnitSection) {}
+  void dump(raw_ostream &OS);
+  // VTable anchor.
+  ~DWARFCompileUnit() override;
+};
+
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h
new file mode 100644
index 0000000..677242b
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -0,0 +1,292 @@
+//===-- DWARFContext.h ------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===/
+
+#ifndef LLVM_LIB_DEBUGINFO_DWARFCONTEXT_H
+#define LLVM_LIB_DEBUGINFO_DWARFCONTEXT_H
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/DWARF/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAranges.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFSection.h"
+#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h"
+#include <vector>
+
+namespace llvm {
+
+/// DWARFContext
+/// This data structure is the top level entity that deals with dwarf debug
+/// information parsing. The actual data is supplied through pure virtual
+/// methods that a concrete implementation provides.
+class DWARFContext : public DIContext {
+
+  DWARFUnitSection<DWARFCompileUnit> CUs;
+  std::vector<DWARFUnitSection<DWARFTypeUnit>> TUs;
+  std::unique_ptr<DWARFDebugAbbrev> Abbrev;
+  std::unique_ptr<DWARFDebugLoc> Loc;
+  std::unique_ptr<DWARFDebugAranges> Aranges;
+  std::unique_ptr<DWARFDebugLine> Line;
+  std::unique_ptr<DWARFDebugFrame> DebugFrame;
+
+  DWARFUnitSection<DWARFCompileUnit> DWOCUs;
+  std::vector<DWARFUnitSection<DWARFTypeUnit>> DWOTUs;
+  std::unique_ptr<DWARFDebugAbbrev> AbbrevDWO;
+  std::unique_ptr<DWARFDebugLocDWO> LocDWO;
+
+  DWARFContext(DWARFContext &) = delete;
+  DWARFContext &operator=(DWARFContext &) = delete;
+
+  /// Read compile units from the debug_info section (if necessary)
+  /// and store them in CUs.
+  void parseCompileUnits();
+
+  /// Read type units from the debug_types sections (if necessary)
+  /// and store them in TUs.
+  void parseTypeUnits();
+
+  /// Read compile units from the debug_info.dwo section (if necessary)
+  /// and store them in DWOCUs.
+  void parseDWOCompileUnits();
+
+  /// Read type units from the debug_types.dwo section (if necessary)
+  /// and store them in DWOTUs.
+  void parseDWOTypeUnits();
+
+public:
+  DWARFContext() : DIContext(CK_DWARF) {}
+
+  static bool classof(const DIContext *DICtx) {
+    return DICtx->getKind() == CK_DWARF;
+  }
+
+  void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All) override;
+
+  typedef DWARFUnitSection<DWARFCompileUnit>::iterator_range cu_iterator_range;
+  typedef DWARFUnitSection<DWARFTypeUnit>::iterator_range tu_iterator_range;
+  typedef iterator_range<std::vector<DWARFUnitSection<DWARFTypeUnit>>::iterator> tu_section_iterator_range;
+
+  /// Get compile units in this context.
+  cu_iterator_range compile_units() {
+    parseCompileUnits();
+    return cu_iterator_range(CUs.begin(), CUs.end());
+  }
+
+  /// Get type units in this context.
+  tu_section_iterator_range type_unit_sections() {
+    parseTypeUnits();
+    return tu_section_iterator_range(TUs.begin(), TUs.end());
+  }
+
+  /// Get compile units in the DWO context.
+  cu_iterator_range dwo_compile_units() {
+    parseDWOCompileUnits();
+    return cu_iterator_range(DWOCUs.begin(), DWOCUs.end());
+  }
+
+  /// Get type units in the DWO context.
+  tu_section_iterator_range dwo_type_unit_sections() {
+    parseDWOTypeUnits();
+    return tu_section_iterator_range(DWOTUs.begin(), DWOTUs.end());
+  }
+
+  /// Get the number of compile units in this context.
+  unsigned getNumCompileUnits() {
+    parseCompileUnits();
+    return CUs.size();
+  }
+
+  /// Get the number of compile units in this context.
+  unsigned getNumTypeUnits() {
+    parseTypeUnits();
+    return TUs.size();
+  }
+
+  /// Get the number of compile units in the DWO context.
+  unsigned getNumDWOCompileUnits() {
+    parseDWOCompileUnits();
+    return DWOCUs.size();
+  }
+
+  /// Get the number of compile units in the DWO context.
+  unsigned getNumDWOTypeUnits() {
+    parseDWOTypeUnits();
+    return DWOTUs.size();
+  }
+
+  /// Get the compile unit at the specified index for this compile unit.
+  DWARFCompileUnit *getCompileUnitAtIndex(unsigned index) {
+    parseCompileUnits();
+    return CUs[index].get();
+  }
+
+  /// Get the compile unit at the specified index for the DWO compile units.
+  DWARFCompileUnit *getDWOCompileUnitAtIndex(unsigned index) {
+    parseDWOCompileUnits();
+    return DWOCUs[index].get();
+  }
+
+  /// Get a pointer to the parsed DebugAbbrev object.
+  const DWARFDebugAbbrev *getDebugAbbrev();
+
+  /// Get a pointer to the parsed DebugLoc object.
+  const DWARFDebugLoc *getDebugLoc();
+
+  /// Get a pointer to the parsed dwo abbreviations object.
+  const DWARFDebugAbbrev *getDebugAbbrevDWO();
+
+  /// Get a pointer to the parsed DebugLoc object.
+  const DWARFDebugLocDWO *getDebugLocDWO();
+
+  /// Get a pointer to the parsed DebugAranges object.
+  const DWARFDebugAranges *getDebugAranges();
+
+  /// Get a pointer to the parsed frame information object.
+  const DWARFDebugFrame *getDebugFrame();
+
+  /// Get a pointer to a parsed line table corresponding to a compile unit.
+  const DWARFDebugLine::LineTable *getLineTableForUnit(DWARFUnit *cu);
+
+  DILineInfo getLineInfoForAddress(uint64_t Address,
+      DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override;
+  DILineInfoTable getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
+      DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override;
+  DIInliningInfo getInliningInfoForAddress(uint64_t Address,
+      DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override;
+
+  virtual bool isLittleEndian() const = 0;
+  virtual uint8_t getAddressSize() const = 0;
+  virtual const DWARFSection &getInfoSection() = 0;
+  typedef MapVector<object::SectionRef, DWARFSection,
+                    std::map<object::SectionRef, unsigned>> TypeSectionMap;
+  virtual const TypeSectionMap &getTypesSections() = 0;
+  virtual StringRef getAbbrevSection() = 0;
+  virtual const DWARFSection &getLocSection() = 0;
+  virtual StringRef getARangeSection() = 0;
+  virtual StringRef getDebugFrameSection() = 0;
+  virtual const DWARFSection &getLineSection() = 0;
+  virtual StringRef getStringSection() = 0;
+  virtual StringRef getRangeSection() = 0;
+  virtual StringRef getPubNamesSection() = 0;
+  virtual StringRef getPubTypesSection() = 0;
+  virtual StringRef getGnuPubNamesSection() = 0;
+  virtual StringRef getGnuPubTypesSection() = 0;
+
+  // Sections for DWARF5 split dwarf proposal.
+  virtual const DWARFSection &getInfoDWOSection() = 0;
+  virtual const TypeSectionMap &getTypesDWOSections() = 0;
+  virtual StringRef getAbbrevDWOSection() = 0;
+  virtual const DWARFSection &getLineDWOSection() = 0;
+  virtual const DWARFSection &getLocDWOSection() = 0;
+  virtual StringRef getStringDWOSection() = 0;
+  virtual StringRef getStringOffsetDWOSection() = 0;
+  virtual StringRef getRangeDWOSection() = 0;
+  virtual StringRef getAddrSection() = 0;
+  virtual const DWARFSection& getAppleNamesSection() = 0;
+  virtual const DWARFSection& getAppleTypesSection() = 0;
+  virtual const DWARFSection& getAppleNamespacesSection() = 0;
+  virtual const DWARFSection& getAppleObjCSection() = 0;
+
+  static bool isSupportedVersion(unsigned version) {
+    return version == 2 || version == 3 || version == 4;
+  }
+private:
+  /// Return the compile unit that includes an offset (relative to .debug_info).
+  DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset);
+
+  /// Return the compile unit which contains instruction with provided
+  /// address.
+  DWARFCompileUnit *getCompileUnitForAddress(uint64_t Address);
+};
+
+/// DWARFContextInMemory is the simplest possible implementation of a
+/// DWARFContext. It assumes all content is available in memory and stores
+/// pointers to it.
+class DWARFContextInMemory : public DWARFContext {
+  virtual void anchor();
+  bool IsLittleEndian;
+  uint8_t AddressSize;
+  DWARFSection InfoSection;
+  TypeSectionMap TypesSections;
+  StringRef AbbrevSection;
+  DWARFSection LocSection;
+  StringRef ARangeSection;
+  StringRef DebugFrameSection;
+  DWARFSection LineSection;
+  StringRef StringSection;
+  StringRef RangeSection;
+  StringRef PubNamesSection;
+  StringRef PubTypesSection;
+  StringRef GnuPubNamesSection;
+  StringRef GnuPubTypesSection;
+
+  // Sections for DWARF5 split dwarf proposal.
+  DWARFSection InfoDWOSection;
+  TypeSectionMap TypesDWOSections;
+  StringRef AbbrevDWOSection;
+  DWARFSection LineDWOSection;
+  DWARFSection LocDWOSection;
+  StringRef StringDWOSection;
+  StringRef StringOffsetDWOSection;
+  StringRef RangeDWOSection;
+  StringRef AddrSection;
+  DWARFSection AppleNamesSection;
+  DWARFSection AppleTypesSection;
+  DWARFSection AppleNamespacesSection;
+  DWARFSection AppleObjCSection;
+
+  SmallVector<SmallString<32>, 4> UncompressedSections;
+
+public:
+  DWARFContextInMemory(const object::ObjectFile &Obj);
+  bool isLittleEndian() const override { return IsLittleEndian; }
+  uint8_t getAddressSize() const override { return AddressSize; }
+  const DWARFSection &getInfoSection() override { return InfoSection; }
+  const TypeSectionMap &getTypesSections() override { return TypesSections; }
+  StringRef getAbbrevSection() override { return AbbrevSection; }
+  const DWARFSection &getLocSection() override { return LocSection; }
+  StringRef getARangeSection() override { return ARangeSection; }
+  StringRef getDebugFrameSection() override { return DebugFrameSection; }
+  const DWARFSection &getLineSection() override { return LineSection; }
+  StringRef getStringSection() override { return StringSection; }
+  StringRef getRangeSection() override { return RangeSection; }
+  StringRef getPubNamesSection() override { return PubNamesSection; }
+  StringRef getPubTypesSection() override { return PubTypesSection; }
+  StringRef getGnuPubNamesSection() override { return GnuPubNamesSection; }
+  StringRef getGnuPubTypesSection() override { return GnuPubTypesSection; }
+  const DWARFSection& getAppleNamesSection() override { return AppleNamesSection; }
+  const DWARFSection& getAppleTypesSection() override { return AppleTypesSection; }
+  const DWARFSection& getAppleNamespacesSection() override { return AppleNamespacesSection; }
+  const DWARFSection& getAppleObjCSection() override { return AppleObjCSection; }
+
+  // Sections for DWARF5 split dwarf proposal.
+  const DWARFSection &getInfoDWOSection() override { return InfoDWOSection; }
+  const TypeSectionMap &getTypesDWOSections() override {
+    return TypesDWOSections;
+  }
+  StringRef getAbbrevDWOSection() override { return AbbrevDWOSection; }
+  const DWARFSection &getLineDWOSection() override { return LineDWOSection; }
+  const DWARFSection &getLocDWOSection() override { return LocDWOSection; }
+  StringRef getStringDWOSection() override { return StringDWOSection; }
+  StringRef getStringOffsetDWOSection() override {
+    return StringOffsetDWOSection;
+  }
+  StringRef getRangeDWOSection() override { return RangeDWOSection; }
+  StringRef getAddrSection() override {
+    return AddrSection;
+  }
+};
+
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h b/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h
new file mode 100644
index 0000000..2114208
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h
@@ -0,0 +1,63 @@
+//===-- DWARFDebugAbbrev.h --------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGABBREV_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGABBREV_H
+
+#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
+#include <list>
+#include <map>
+#include <vector>
+
+namespace llvm {
+
+class DWARFAbbreviationDeclarationSet {
+  uint32_t Offset;
+  /// Code of the first abbreviation, if all abbreviations in the set have
+  /// consecutive codes. UINT32_MAX otherwise.
+  uint32_t FirstAbbrCode;
+  std::vector<DWARFAbbreviationDeclaration> Decls;
+
+public:
+  DWARFAbbreviationDeclarationSet();
+
+  uint32_t getOffset() const { return Offset; }
+  void dump(raw_ostream &OS) const;
+  bool extract(DataExtractor Data, uint32_t *OffsetPtr);
+
+  const DWARFAbbreviationDeclaration *
+  getAbbreviationDeclaration(uint32_t AbbrCode) const;
+
+private:
+  void clear();
+};
+
+class DWARFDebugAbbrev {
+  typedef std::map<uint64_t, DWARFAbbreviationDeclarationSet>
+    DWARFAbbreviationDeclarationSetMap;
+
+  DWARFAbbreviationDeclarationSetMap AbbrDeclSets;
+  mutable DWARFAbbreviationDeclarationSetMap::const_iterator PrevAbbrOffsetPos;
+
+public:
+  DWARFDebugAbbrev();
+
+  const DWARFAbbreviationDeclarationSet *
+  getAbbreviationDeclarationSet(uint64_t CUAbbrOffset) const;
+
+  void dump(raw_ostream &OS) const;
+  void extract(DataExtractor Data);
+
+private:
+  void clear();
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFDebugArangeSet.h b/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
index 837a8e6..837a8e6 100644
--- a/lib/DebugInfo/DWARFDebugArangeSet.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
diff --git a/lib/DebugInfo/DWARFDebugAranges.h b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h
index 791f010..791f010 100644
--- a/lib/DebugInfo/DWARFDebugAranges.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h
diff --git a/lib/DebugInfo/DWARFDebugFrame.h b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h
index be925cb..be925cb 100644
--- a/lib/DebugInfo/DWARFDebugFrame.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h b/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h
new file mode 100644
index 0000000..1080327
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h
@@ -0,0 +1,160 @@
+//===-- DWARFDebugInfoEntry.h -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGINFOENTRY_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGINFOENTRY_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/DWARF/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class DWARFDebugAranges;
+class DWARFCompileUnit;
+class DWARFUnit;
+class DWARFContext;
+class DWARFFormValue;
+struct DWARFDebugInfoEntryInlinedChain;
+
+/// DWARFDebugInfoEntryMinimal - A DIE with only the minimum required data.
+class DWARFDebugInfoEntryMinimal {
+  /// Offset within the .debug_info of the start of this entry.
+  uint32_t Offset;
+
+  /// How many to add to "this" to get the sibling.
+  uint32_t SiblingIdx;
+
+  const DWARFAbbreviationDeclaration *AbbrevDecl;
+public:
+  DWARFDebugInfoEntryMinimal()
+    : Offset(0), SiblingIdx(0), AbbrevDecl(nullptr) {}
+
+  void dump(raw_ostream &OS, DWARFUnit *u, unsigned recurseDepth,
+            unsigned indent = 0) const;
+  void dumpAttribute(raw_ostream &OS, DWARFUnit *u, uint32_t *offset_ptr,
+                     uint16_t attr, uint16_t form, unsigned indent = 0) const;
+
+  /// Extracts a debug info entry, which is a child of a given unit,
+  /// starting at a given offset. If DIE can't be extracted, returns false and
+  /// doesn't change OffsetPtr.
+  bool extractFast(const DWARFUnit *U, uint32_t *OffsetPtr);
+
+  uint32_t getTag() const { return AbbrevDecl ? AbbrevDecl->getTag() : 0; }
+  bool isNULL() const { return AbbrevDecl == nullptr; }
+
+  /// Returns true if DIE represents a subprogram (not inlined).
+  bool isSubprogramDIE() const;
+  /// Returns true if DIE represents a subprogram or an inlined
+  /// subroutine.
+  bool isSubroutineDIE() const;
+
+  uint32_t getOffset() const { return Offset; }
+  bool hasChildren() const { return !isNULL() && AbbrevDecl->hasChildren(); }
+
+  // We know we are kept in a vector of contiguous entries, so we know
+  // our sibling will be some index after "this".
+  const DWARFDebugInfoEntryMinimal *getSibling() const {
+    return SiblingIdx > 0 ? this + SiblingIdx : nullptr;
+  }
+
+  // We know we are kept in a vector of contiguous entries, so we know
+  // we don't need to store our child pointer, if we have a child it will
+  // be the next entry in the list...
+  const DWARFDebugInfoEntryMinimal *getFirstChild() const {
+    return hasChildren() ? this + 1 : nullptr;
+  }
+
+  void setSibling(const DWARFDebugInfoEntryMinimal *Sibling) {
+    if (Sibling) {
+      // We know we are kept in a vector of contiguous entries, so we know
+      // our sibling will be some index after "this".
+      SiblingIdx = Sibling - this;
+    } else
+      SiblingIdx = 0;
+  }
+
+  const DWARFAbbreviationDeclaration *getAbbreviationDeclarationPtr() const {
+    return AbbrevDecl;
+  }
+
+  bool getAttributeValue(const DWARFUnit *U, const uint16_t Attr,
+                         DWARFFormValue &FormValue) const;
+
+  const char *getAttributeValueAsString(const DWARFUnit *U, const uint16_t Attr,
+                                        const char *FailValue) const;
+
+  uint64_t getAttributeValueAsAddress(const DWARFUnit *U, const uint16_t Attr,
+                                      uint64_t FailValue) const;
+
+  uint64_t getAttributeValueAsUnsignedConstant(const DWARFUnit *U,
+                                               const uint16_t Attr,
+                                               uint64_t FailValue) const;
+
+  uint64_t getAttributeValueAsReference(const DWARFUnit *U, const uint16_t Attr,
+                                        uint64_t FailValue) const;
+
+  uint64_t getAttributeValueAsSectionOffset(const DWARFUnit *U,
+                                            const uint16_t Attr,
+                                            uint64_t FailValue) const;
+
+  uint64_t getRangesBaseAttribute(const DWARFUnit *U, uint64_t FailValue) const;
+
+  /// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU.
+  /// Returns true if both attributes are present.
+  bool getLowAndHighPC(const DWARFUnit *U, uint64_t &LowPC,
+                       uint64_t &HighPC) const;
+
+  DWARFAddressRangesVector getAddressRanges(const DWARFUnit *U) const;
+
+  void collectChildrenAddressRanges(const DWARFUnit *U,
+                                    DWARFAddressRangesVector &Ranges) const;
+
+  bool addressRangeContainsAddress(const DWARFUnit *U,
+                                   const uint64_t Address) const;
+
+  /// If a DIE represents a subprogram (or inlined subroutine),
+  /// returns its mangled name (or short name, if mangled is missing).
+  /// This name may be fetched from specification or abstract origin
+  /// for this subprogram. Returns null if no name is found.
+  const char *getSubroutineName(const DWARFUnit *U, DINameKind Kind) const;
+
+  /// Return the DIE name resolving DW_AT_sepcification or
+  /// DW_AT_abstract_origin references if necessary.
+  /// Returns null if no name is found.
+  const char *getName(const DWARFUnit *U, DINameKind Kind) const;
+
+  /// Retrieves values of DW_AT_call_file, DW_AT_call_line and
+  /// DW_AT_call_column from DIE (or zeroes if they are missing).
+  void getCallerFrame(const DWARFUnit *U, uint32_t &CallFile,
+                      uint32_t &CallLine, uint32_t &CallColumn) const;
+
+  /// Get inlined chain for a given address, rooted at the current DIE.
+  /// Returns empty chain if address is not contained in address range
+  /// of current DIE.
+  DWARFDebugInfoEntryInlinedChain
+  getInlinedChainForAddress(const DWARFUnit *U, const uint64_t Address) const;
+};
+
+/// DWARFDebugInfoEntryInlinedChain - represents a chain of inlined_subroutine
+/// DIEs, (possibly ending with subprogram DIE), all of which are contained
+/// in some concrete inlined instance tree. Address range for each DIE
+/// (except the last DIE) in this chain is contained in address
+/// range for next DIE in the chain.
+struct DWARFDebugInfoEntryInlinedChain {
+  DWARFDebugInfoEntryInlinedChain() : U(nullptr) {}
+  SmallVector<DWARFDebugInfoEntryMinimal, 4> DIEs;
+  const DWARFUnit *U;
+};
+
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
new file mode 100644
index 0000000..0c564c4
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
@@ -0,0 +1,238 @@
+//===-- DWARFDebugLine.h ----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGLINE_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGLINE_H
+
+#include "llvm/DebugInfo/DWARF/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
+#include "llvm/Support/DataExtractor.h"
+#include <map>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFDebugLine {
+public:
+  DWARFDebugLine(const RelocAddrMap* LineInfoRelocMap) : RelocMap(LineInfoRelocMap) {}
+  struct FileNameEntry {
+    FileNameEntry() : Name(nullptr), DirIdx(0), ModTime(0), Length(0) {}
+
+    const char *Name;
+    uint64_t DirIdx;
+    uint64_t ModTime;
+    uint64_t Length;
+  };
+
+  struct Prologue {
+    Prologue();
+
+    // The size in bytes of the statement information for this compilation unit
+    // (not including the total_length field itself).
+    uint32_t TotalLength;
+    // Version identifier for the statement information format.
+    uint16_t Version;
+    // The number of bytes following the prologue_length field to the beginning
+    // of the first byte of the statement program itself.
+    uint32_t PrologueLength;
+    // The size in bytes of the smallest target machine instruction. Statement
+    // program opcodes that alter the address register first multiply their
+    // operands by this value.
+    uint8_t MinInstLength;
+    // The maximum number of individual operations that may be encoded in an
+    // instruction.
+    uint8_t MaxOpsPerInst;
+    // The initial value of theis_stmtregister.
+    uint8_t DefaultIsStmt;
+    // This parameter affects the meaning of the special opcodes. See below.
+    int8_t LineBase;
+    // This parameter affects the meaning of the special opcodes. See below.
+    uint8_t LineRange;
+    // The number assigned to the first special opcode.
+    uint8_t OpcodeBase;
+    std::vector<uint8_t> StandardOpcodeLengths;
+    std::vector<const char*> IncludeDirectories;
+    std::vector<FileNameEntry> FileNames;
+
+    // Length of the prologue in bytes.
+    uint32_t getLength() const {
+      return PrologueLength + sizeof(TotalLength) + sizeof(Version) +
+             sizeof(PrologueLength);
+    }
+    // Length of the line table data in bytes (not including the prologue).
+    uint32_t getStatementTableLength() const {
+      return TotalLength + sizeof(TotalLength) - getLength();
+    }
+    int32_t getMaxLineIncrementForSpecialOpcode() const {
+      return LineBase + (int8_t)LineRange - 1;
+    }
+
+    void clear();
+    void dump(raw_ostream &OS) const;
+    bool parse(DataExtractor debug_line_data, uint32_t *offset_ptr);
+  };
+
+  // Standard .debug_line state machine structure.
+  struct Row {
+    explicit Row(bool default_is_stmt = false);
+
+    /// Called after a row is appended to the matrix.
+    void postAppend();
+    void reset(bool default_is_stmt);
+    void dump(raw_ostream &OS) const;
+
+    static bool orderByAddress(const Row& LHS, const Row& RHS) {
+      return LHS.Address < RHS.Address;
+    }
+
+    // The program-counter value corresponding to a machine instruction
+    // generated by the compiler.
+    uint64_t Address;
+    // An unsigned integer indicating a source line number. Lines are numbered
+    // beginning at 1. The compiler may emit the value 0 in cases where an
+    // instruction cannot be attributed to any source line.
+    uint32_t Line;
+    // An unsigned integer indicating a column number within a source line.
+    // Columns are numbered beginning at 1. The value 0 is reserved to indicate
+    // that a statement begins at the 'left edge' of the line.
+    uint16_t Column;
+    // An unsigned integer indicating the identity of the source file
+    // corresponding to a machine instruction.
+    uint16_t File;
+    // An unsigned integer whose value encodes the applicable instruction set
+    // architecture for the current instruction.
+    uint8_t Isa;
+    // An unsigned integer representing the DWARF path discriminator value
+    // for this location.
+    uint32_t Discriminator;
+    // A boolean indicating that the current instruction is the beginning of a
+    // statement.
+    uint8_t IsStmt:1,
+            // A boolean indicating that the current instruction is the
+            // beginning of a basic block.
+            BasicBlock:1,
+            // A boolean indicating that the current address is that of the
+            // first byte after the end of a sequence of target machine
+            // instructions.
+            EndSequence:1,
+            // A boolean indicating that the current address is one (of possibly
+            // many) where execution should be suspended for an entry breakpoint
+            // of a function.
+            PrologueEnd:1,
+            // A boolean indicating that the current address is one (of possibly
+            // many) where execution should be suspended for an exit breakpoint
+            // of a function.
+            EpilogueBegin:1;
+  };
+
+  // Represents a series of contiguous machine instructions. Line table for each
+  // compilation unit may consist of multiple sequences, which are not
+  // guaranteed to be in the order of ascending instruction address.
+  struct Sequence {
+    // Sequence describes instructions at address range [LowPC, HighPC)
+    // and is described by line table rows [FirstRowIndex, LastRowIndex).
+    uint64_t LowPC;
+    uint64_t HighPC;
+    unsigned FirstRowIndex;
+    unsigned LastRowIndex;
+    bool Empty;
+
+    Sequence();
+    void reset();
+
+    static bool orderByLowPC(const Sequence& LHS, const Sequence& RHS) {
+      return LHS.LowPC < RHS.LowPC;
+    }
+    bool isValid() const {
+      return !Empty && (LowPC < HighPC) && (FirstRowIndex < LastRowIndex);
+    }
+    bool containsPC(uint64_t pc) const {
+      return (LowPC <= pc && pc < HighPC);
+    }
+  };
+
+  struct LineTable {
+    LineTable();
+
+    void appendRow(const DWARFDebugLine::Row &R) {
+      Rows.push_back(R);
+    }
+    void appendSequence(const DWARFDebugLine::Sequence &S) {
+      Sequences.push_back(S);
+    }
+
+    // Returns the index of the row with file/line info for a given address,
+    // or -1 if there is no such row.
+    uint32_t lookupAddress(uint64_t address) const;
+
+    bool lookupAddressRange(uint64_t address, uint64_t size,
+                            std::vector<uint32_t> &result) const;
+
+    // Extracts filename by its index in filename table in prologue.
+    // Returns true on success.
+    bool getFileNameByIndex(uint64_t FileIndex, const char *CompDir,
+                            DILineInfoSpecifier::FileLineInfoKind Kind,
+                            std::string &Result) const;
+
+    // Fills the Result argument with the file and line information
+    // corresponding to Address. Returns true on success.
+    bool getFileLineInfoForAddress(uint64_t Address, const char *CompDir, 
+                                   DILineInfoSpecifier::FileLineInfoKind Kind,
+                                   DILineInfo &Result) const;
+
+    void dump(raw_ostream &OS) const;
+    void clear();
+
+    /// Parse prologue and all rows.
+    bool parse(DataExtractor debug_line_data, const RelocAddrMap *RMap,
+               uint32_t *offset_ptr);
+
+    struct Prologue Prologue;
+    typedef std::vector<Row> RowVector;
+    typedef RowVector::const_iterator RowIter;
+    typedef std::vector<Sequence> SequenceVector;
+    typedef SequenceVector::const_iterator SequenceIter;
+    RowVector Rows;
+    SequenceVector Sequences;
+  };
+
+  const LineTable *getLineTable(uint32_t offset) const;
+  const LineTable *getOrParseLineTable(DataExtractor debug_line_data,
+                                       uint32_t offset);
+
+private:
+  struct ParsingState {
+    ParsingState(struct LineTable *LT);
+
+    void resetRowAndSequence();
+    void appendRowToMatrix(uint32_t offset);
+
+    // Line table we're currently parsing.
+    struct LineTable *LineTable;
+    // The row number that starts at zero for the prologue, and increases for
+    // each row added to the matrix.
+    unsigned RowNumber;
+    struct Row Row;
+    struct Sequence Sequence;
+  };
+
+  typedef std::map<uint32_t, LineTable> LineTableMapTy;
+  typedef LineTableMapTy::iterator LineTableIter;
+  typedef LineTableMapTy::const_iterator LineTableConstIter;
+
+  const RelocAddrMap *RelocMap;
+  LineTableMapTy LineTableMap;
+};
+
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
new file mode 100644
index 0000000..bd44c2e5
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
@@ -0,0 +1,81 @@
+//===-- DWARFDebugLoc.h -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGLOC_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGLOC_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
+#include "llvm/Support/DataExtractor.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFDebugLoc {
+  /// A single location within a location list.
+  struct Entry {
+    /// The beginning address of the instruction range.
+    uint64_t Begin;
+    /// The ending address of the instruction range.
+    uint64_t End;
+    /// The location of the variable within the specified range.
+    SmallVector<unsigned char, 4> Loc;
+  };
+
+  /// A list of locations that contain one variable.
+  struct LocationList {
+    /// The beginning offset where this location list is stored in the debug_loc
+    /// section.
+    unsigned Offset;
+    /// All the locations in which the variable is stored.
+    SmallVector<Entry, 2> Entries;
+  };
+
+  typedef SmallVector<LocationList, 4> LocationLists;
+
+  /// A list of all the variables in the debug_loc section, each one describing
+  /// the locations in which the variable is stored.
+  LocationLists Locations;
+
+  /// A map used to resolve binary relocations.
+  const RelocAddrMap &RelocMap;
+
+public:
+  DWARFDebugLoc(const RelocAddrMap &LocRelocMap) : RelocMap(LocRelocMap) {}
+  /// Print the location lists found within the debug_loc section.
+  void dump(raw_ostream &OS) const;
+  /// Parse the debug_loc section accessible via the 'data' parameter using the
+  /// specified address size to interpret the address ranges.
+  void parse(DataExtractor data, unsigned AddressSize);
+};
+
+class DWARFDebugLocDWO {
+  struct Entry {
+    uint64_t Start;
+    uint32_t Length;
+    SmallVector<unsigned char, 4> Loc;
+  };
+
+  struct LocationList {
+    unsigned Offset;
+    SmallVector<Entry, 2> Entries;
+  };
+
+  typedef SmallVector<LocationList, 4> LocationLists;
+
+  LocationLists Locations;
+
+public:
+  void parse(DataExtractor data);
+  void dump(raw_ostream &OS) const;
+};
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
index 4ee3bda..4ee3bda 100644
--- a/lib/DebugInfo/DWARFDebugRangeList.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
diff --git a/include/llvm/DebugInfo/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
index 5bb6d1b..5bb6d1b 100644
--- a/include/llvm/DebugInfo/DWARFFormValue.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
diff --git a/lib/DebugInfo/DWARFRelocMap.h b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
index d7fe303..d7fe303 100644
--- a/lib/DebugInfo/DWARFRelocMap.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
diff --git a/include/llvm/DebugInfo/DWARF/DWARFSection.h b/include/llvm/DebugInfo/DWARF/DWARFSection.h
new file mode 100644
index 0000000..f52004c
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DWARFSection.h
@@ -0,0 +1,24 @@
+//===-- DWARFSection.h ------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_DEBUGINFO_DWARFSECTION_H
+#define LLVM_LIB_DEBUGINFO_DWARFSECTION_H
+
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
+
+namespace llvm {
+
+struct DWARFSection {
+  StringRef Data;
+  RelocAddrMap Relocs;
+};
+
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
new file mode 100644
index 0000000..f24e278
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
@@ -0,0 +1,38 @@
+//===-- DWARFTypeUnit.h -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_DEBUGINFO_DWARFTYPEUNIT_H
+#define LLVM_LIB_DEBUGINFO_DWARFTYPEUNIT_H
+
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+
+namespace llvm {
+
+class DWARFTypeUnit : public DWARFUnit {
+private:
+  uint64_t TypeHash;
+  uint32_t TypeOffset;
+public:
+  DWARFTypeUnit(DWARFContext &Context, const DWARFSection &Section,
+                const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+                StringRef SOS, StringRef AOS, bool LE,
+                const DWARFUnitSectionBase &UnitSection)
+      : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LE, UnitSection) {}
+  uint32_t getHeaderSize() const override {
+    return DWARFUnit::getHeaderSize() + 12;
+  }
+  void dump(raw_ostream &OS);
+protected:
+  bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) override;
+};
+
+}
+
+#endif
+
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
new file mode 100644
index 0000000..628852f
--- /dev/null
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -0,0 +1,285 @@
+//===-- DWARFUnit.h ---------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_DEBUGINFO_DWARFUNIT_H
+#define LLVM_LIB_DEBUGINFO_DWARFUNIT_H
+
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
+#include "llvm/DebugInfo/DWARF/DWARFSection.h"
+#include <vector>
+
+namespace llvm {
+
+namespace object {
+class ObjectFile;
+}
+
+class DWARFContext;
+class DWARFDebugAbbrev;
+class DWARFUnit;
+class StringRef;
+class raw_ostream;
+
+/// Base class for all DWARFUnitSection classes. This provides the
+/// functionality common to all unit types.
+class DWARFUnitSectionBase {
+public:
+  /// Returns the Unit that contains the given section offset in the
+  /// same section this Unit originated from.
+  virtual DWARFUnit *getUnitForOffset(uint32_t Offset) const = 0;
+
+  void parse(DWARFContext &C, const DWARFSection &Section);
+  void parseDWO(DWARFContext &C, const DWARFSection &DWOSection);
+
+protected:
+  virtual void parseImpl(DWARFContext &Context, const DWARFSection &Section,
+                         const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+                         StringRef SOS, StringRef AOS, bool isLittleEndian) = 0;
+
+  ~DWARFUnitSectionBase() {}
+};
+
+/// Concrete instance of DWARFUnitSection, specialized for one Unit type.
+template<typename UnitType>
+class DWARFUnitSection final : public SmallVector<std::unique_ptr<UnitType>, 1>,
+                               public DWARFUnitSectionBase {
+
+  struct UnitOffsetComparator {
+    bool operator()(uint32_t LHS,
+                    const std::unique_ptr<UnitType> &RHS) const {
+      return LHS < RHS->getNextUnitOffset();
+    }
+  };
+
+  bool Parsed;
+
+public:
+  DWARFUnitSection() : Parsed(false) {}
+  DWARFUnitSection(DWARFUnitSection &&DUS) :
+    SmallVector<std::unique_ptr<UnitType>, 1>(std::move(DUS)), Parsed(DUS.Parsed) {}
+
+  typedef llvm::SmallVectorImpl<std::unique_ptr<UnitType>> UnitVector;
+  typedef typename UnitVector::iterator iterator;
+  typedef llvm::iterator_range<typename UnitVector::iterator> iterator_range;
+
+  UnitType *getUnitForOffset(uint32_t Offset) const override {
+    auto *CU = std::upper_bound(this->begin(), this->end(), Offset,
+                                UnitOffsetComparator());
+    if (CU != this->end())
+      return CU->get();
+    return nullptr;
+  }
+
+private:
+  void parseImpl(DWARFContext &Context, const DWARFSection &Section,
+                 const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+                 StringRef SOS, StringRef AOS, bool LE) override {
+    if (Parsed)
+      return;
+    DataExtractor Data(Section.Data, LE, 0);
+    uint32_t Offset = 0;
+    while (Data.isValidOffset(Offset)) {
+      auto U = llvm::make_unique<UnitType>(Context, Section, DA, RS, SS, SOS,
+                                           AOS, LE, *this);
+      if (!U->extract(Data, &Offset))
+        break;
+      this->push_back(std::move(U));
+      Offset = this->back()->getNextUnitOffset();
+    }
+    Parsed = true;
+  }
+};
+
+class DWARFUnit {
+  DWARFContext &Context;
+  // Section containing this DWARFUnit.
+  const DWARFSection &InfoSection;
+
+  const DWARFDebugAbbrev *Abbrev;
+  StringRef RangeSection;
+  uint32_t RangeSectionBase;
+  StringRef StringSection;
+  StringRef StringOffsetSection;
+  StringRef AddrOffsetSection;
+  uint32_t AddrOffsetSectionBase;
+  bool isLittleEndian;
+  const DWARFUnitSectionBase &UnitSection;
+
+  uint32_t Offset;
+  uint32_t Length;
+  uint16_t Version;
+  const DWARFAbbreviationDeclarationSet *Abbrevs;
+  uint8_t AddrSize;
+  uint64_t BaseAddr;
+  // The compile unit debug information entry items.
+  std::vector<DWARFDebugInfoEntryMinimal> DieArray;
+
+  class DWOHolder {
+    object::OwningBinary<object::ObjectFile> DWOFile;
+    std::unique_ptr<DWARFContext> DWOContext;
+    DWARFUnit *DWOU;
+  public:
+    DWOHolder(StringRef DWOPath);
+    DWARFUnit *getUnit() const { return DWOU; }
+  };
+  std::unique_ptr<DWOHolder> DWO;
+
+protected:
+  virtual bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr);
+  /// Size in bytes of the unit header.
+  virtual uint32_t getHeaderSize() const { return 11; }
+
+public:
+  DWARFUnit(DWARFContext &Context, const DWARFSection &Section,
+            const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+            StringRef SOS, StringRef AOS, bool LE,
+            const DWARFUnitSectionBase &UnitSection);
+
+  virtual ~DWARFUnit();
+
+  DWARFContext& getContext() const { return Context; }
+
+  StringRef getStringSection() const { return StringSection; }
+  StringRef getStringOffsetSection() const { return StringOffsetSection; }
+  void setAddrOffsetSection(StringRef AOS, uint32_t Base) {
+    AddrOffsetSection = AOS;
+    AddrOffsetSectionBase = Base;
+  }
+  void setRangesSection(StringRef RS, uint32_t Base) {
+    RangeSection = RS;
+    RangeSectionBase = Base;
+  }
+
+  bool getAddrOffsetSectionItem(uint32_t Index, uint64_t &Result) const;
+  // FIXME: Result should be uint64_t in DWARF64.
+  bool getStringOffsetSectionItem(uint32_t Index, uint32_t &Result) const;
+
+  DataExtractor getDebugInfoExtractor() const {
+    return DataExtractor(InfoSection.Data, isLittleEndian, AddrSize);
+  }
+  DataExtractor getStringExtractor() const {
+    return DataExtractor(StringSection, false, 0);
+  }
+
+  const RelocAddrMap *getRelocMap() const { return &InfoSection.Relocs; }
+
+  bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
+
+  /// extractRangeList - extracts the range list referenced by this compile
+  /// unit from .debug_ranges section. Returns true on success.
+  /// Requires that compile unit is already extracted.
+  bool extractRangeList(uint32_t RangeListOffset,
+                        DWARFDebugRangeList &RangeList) const;
+  void clear();
+  uint32_t getOffset() const { return Offset; }
+  uint32_t getNextUnitOffset() const { return Offset + Length + 4; }
+  uint32_t getLength() const { return Length; }
+  uint16_t getVersion() const { return Version; }
+  const DWARFAbbreviationDeclarationSet *getAbbreviations() const {
+    return Abbrevs;
+  }
+  uint8_t getAddressByteSize() const { return AddrSize; }
+  uint64_t getBaseAddress() const { return BaseAddr; }
+
+  void setBaseAddress(uint64_t base_addr) {
+    BaseAddr = base_addr;
+  }
+
+  const DWARFDebugInfoEntryMinimal *
+  getCompileUnitDIE(bool extract_cu_die_only = true) {
+    extractDIEsIfNeeded(extract_cu_die_only);
+    return DieArray.empty() ? nullptr : &DieArray[0];
+  }
+
+  const char *getCompilationDir();
+  uint64_t getDWOId();
+
+  void collectAddressRanges(DWARFAddressRangesVector &CURanges);
+
+  /// getInlinedChainForAddress - fetches inlined chain for a given address.
+  /// Returns empty chain if there is no subprogram containing address. The
+  /// chain is valid as long as parsed compile unit DIEs are not cleared.
+  DWARFDebugInfoEntryInlinedChain getInlinedChainForAddress(uint64_t Address);
+
+  /// getUnitSection - Return the DWARFUnitSection containing this unit.
+  const DWARFUnitSectionBase &getUnitSection() const { return UnitSection; }
+
+  /// \brief Returns the number of DIEs in the unit. Parses the unit
+  /// if necessary.
+  unsigned getNumDIEs() {
+    extractDIEsIfNeeded(false);
+    return DieArray.size();
+  }
+
+  /// \brief Return the index of a DIE inside the unit's DIE vector.
+  ///
+  /// It is illegal to call this method with a DIE that hasn't be
+  /// created by this unit. In other word, it's illegal to call this
+  /// method on a DIE that isn't accessible by following
+  /// children/sibling links starting from this unit's
+  /// getCompileUnitDIE().
+  uint32_t getDIEIndex(const DWARFDebugInfoEntryMinimal *DIE) {
+    assert(!DieArray.empty() && DIE >= &DieArray[0] &&
+           DIE < &DieArray[0] + DieArray.size());
+    return DIE - &DieArray[0];
+  }
+
+  /// \brief Return the DIE object at the given index.
+  const DWARFDebugInfoEntryMinimal *getDIEAtIndex(unsigned Index) const {
+    assert(Index < DieArray.size());
+    return &DieArray[Index];
+  }
+
+  /// \brief Return the DIE object for a given offset inside the
+  /// unit's DIE vector.
+  ///
+  /// The unit needs to have his DIEs extracted for this method to work.
+  const DWARFDebugInfoEntryMinimal *getDIEForOffset(uint32_t Offset) const {
+    assert(!DieArray.empty());
+    auto it = std::lower_bound(
+        DieArray.begin(), DieArray.end(), Offset,
+        [=](const DWARFDebugInfoEntryMinimal &LHS, uint32_t Offset) {
+          return LHS.getOffset() < Offset;
+        });
+    return it == DieArray.end() ? nullptr : &*it;
+  }
+
+private:
+  /// Size in bytes of the .debug_info data associated with this compile unit.
+  size_t getDebugInfoSize() const { return Length + 4 - getHeaderSize(); }
+
+  /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
+  /// hasn't already been done. Returns the number of DIEs parsed at this call.
+  size_t extractDIEsIfNeeded(bool CUDieOnly);
+  /// extractDIEsToVector - Appends all parsed DIEs to a vector.
+  void extractDIEsToVector(bool AppendCUDie, bool AppendNonCUDIEs,
+                           std::vector<DWARFDebugInfoEntryMinimal> &DIEs) const;
+  /// setDIERelations - We read in all of the DIE entries into our flat list
+  /// of DIE entries and now we need to go back through all of them and set the
+  /// parent, sibling and child pointers for quick DIE navigation.
+  void setDIERelations();
+  /// clearDIEs - Clear parsed DIEs to keep memory usage low.
+  void clearDIEs(bool KeepCUDie);
+
+  /// parseDWO - Parses .dwo file for current compile unit. Returns true if
+  /// it was actually constructed.
+  bool parseDWO();
+
+  /// getSubprogramForAddress - Returns subprogram DIE with address range
+  /// encompassing the provided address. The pointer is alive as long as parsed
+  /// compile unit DIEs are not cleared.
+  const DWARFDebugInfoEntryMinimal *getSubprogramForAddress(uint64_t Address);
+};
+
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h b/include/llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h
new file mode 100644
index 0000000..96ce12f
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h
@@ -0,0 +1,59 @@
+//===- ConcreteSymbolEnumerator.h -------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_CONCRETESYMBOLENUMERATOR_H
+#define LLVM_DEBUGINFO_PDB_CONCRETESYMBOLENUMERATOR_H
+
+#include "IPDBEnumChildren.h"
+#include "llvm/Support/Casting.h"
+#include <memory>
+
+namespace llvm {
+
+template <typename ChildType>
+class ConcreteSymbolEnumerator : public IPDBEnumChildren<ChildType> {
+public:
+  ConcreteSymbolEnumerator(std::unique_ptr<IPDBEnumSymbols> SymbolEnumerator)
+      : Enumerator(std::move(SymbolEnumerator)) {}
+
+  virtual ~ConcreteSymbolEnumerator() {}
+
+  uint32_t getChildCount() const override {
+    return Enumerator->getChildCount();
+  }
+
+  std::unique_ptr<ChildType> getChildAtIndex(uint32_t Index) const override {
+    std::unique_ptr<PDBSymbol> Child = Enumerator->getChildAtIndex(Index);
+    return make_concrete_child(std::move(Child));
+  }
+
+  std::unique_ptr<ChildType> getNext() override {
+    std::unique_ptr<PDBSymbol> Child = Enumerator->getNext();
+    return make_concrete_child(std::move(Child));
+  }
+
+  void reset() override { Enumerator->reset(); }
+
+  ConcreteSymbolEnumerator<ChildType> *clone() const override {
+    std::unique_ptr<IPDBEnumSymbols> WrappedClone(Enumerator->clone());
+    return new ConcreteSymbolEnumerator<ChildType>(std::move(WrappedClone));
+  }
+
+private:
+  std::unique_ptr<ChildType>
+  make_concrete_child(std::unique_ptr<PDBSymbol> Child) const {
+    ChildType *ConcreteChild = dyn_cast_or_null<ChildType>(Child.release());
+    return std::unique_ptr<ChildType>(ConcreteChild);
+  }
+
+  std::unique_ptr<IPDBEnumSymbols> Enumerator;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIADataStream.h b/include/llvm/DebugInfo/PDB/DIA/DIADataStream.h
new file mode 100644
index 0000000..7b2bc14
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIADataStream.h
@@ -0,0 +1,33 @@
+//===- DIADataStream.h - DIA implementation of IPDBDataStream ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIADATASTREAM_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIADATASTREAM_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBDataStream.h"
+
+namespace llvm {
+class DIADataStream : public IPDBDataStream {
+public:
+  explicit DIADataStream(CComPtr<IDiaEnumDebugStreamData> DiaStreamData);
+
+  uint32_t getRecordCount() const override;
+  std::string getName() const override;
+  llvm::Optional<RecordType> getItemAtIndex(uint32_t Index) const override;
+  bool getNext(RecordType &Record) override;
+  void reset() override;
+  DIADataStream *clone() const override;
+
+private:
+  CComPtr<IDiaEnumDebugStreamData> StreamData;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h b/include/llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h
new file mode 100644
index 0000000..375bcdd
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h
@@ -0,0 +1,35 @@
+//==- DIAEnumDebugStreams.h - DIA Debug Stream Enumerator impl ---*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAENUMDEBUGSTREAMS_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAENUMDEBUGSTREAMS_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+
+namespace llvm {
+
+class IPDBDataStream;
+
+class DIAEnumDebugStreams : public IPDBEnumChildren<IPDBDataStream> {
+public:
+  explicit DIAEnumDebugStreams(CComPtr<IDiaEnumDebugStreams> DiaEnumerator);
+
+  uint32_t getChildCount() const override;
+  ChildTypePtr getChildAtIndex(uint32_t Index) const override;
+  ChildTypePtr getNext() override;
+  void reset() override;
+  DIAEnumDebugStreams *clone() const override;
+
+private:
+  CComPtr<IDiaEnumDebugStreams> Enumerator;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h b/include/llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h
new file mode 100644
index 0000000..4cc85ed
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h
@@ -0,0 +1,35 @@
+//==- DIAEnumLineNumbers.h - DIA Line Number Enumerator impl -----*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAENUMLINENUMBERS_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAENUMLINENUMBERS_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+
+namespace llvm {
+
+class IPDBLineNumber;
+
+class DIAEnumLineNumbers : public IPDBEnumChildren<IPDBLineNumber> {
+public:
+  explicit DIAEnumLineNumbers(CComPtr<IDiaEnumLineNumbers> DiaEnumerator);
+
+  uint32_t getChildCount() const override;
+  ChildTypePtr getChildAtIndex(uint32_t Index) const override;
+  ChildTypePtr getNext() override;
+  void reset() override;
+  DIAEnumLineNumbers *clone() const override;
+
+private:
+  CComPtr<IDiaEnumLineNumbers> Enumerator;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAEnumSourceFiles.h b/include/llvm/DebugInfo/PDB/DIA/DIAEnumSourceFiles.h
new file mode 100644
index 0000000..88625f6
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAEnumSourceFiles.h
@@ -0,0 +1,37 @@
+//==- DIAEnumSourceFiles.h - DIA Source File Enumerator impl -----*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAENUMSOURCEFILES_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAENUMSOURCEFILES_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+
+namespace llvm {
+
+class DIASession;
+
+class DIAEnumSourceFiles : public IPDBEnumChildren<IPDBSourceFile> {
+public:
+  explicit DIAEnumSourceFiles(const DIASession &PDBSession,
+                              CComPtr<IDiaEnumSourceFiles> DiaEnumerator);
+
+  uint32_t getChildCount() const override;
+  ChildTypePtr getChildAtIndex(uint32_t Index) const override;
+  ChildTypePtr getNext() override;
+  void reset() override;
+  DIAEnumSourceFiles *clone() const override;
+
+private:
+  const DIASession &Session;
+  CComPtr<IDiaEnumSourceFiles> Enumerator;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h b/include/llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h
new file mode 100644
index 0000000..fe343f7
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h
@@ -0,0 +1,37 @@
+//==- DIAEnumSymbols.h - DIA Symbol Enumerator impl --------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAENUMSYMBOLS_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAENUMSYMBOLS_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+
+namespace llvm {
+
+class DIASession;
+
+class DIAEnumSymbols : public IPDBEnumChildren<PDBSymbol> {
+public:
+  explicit DIAEnumSymbols(const DIASession &Session,
+                          CComPtr<IDiaEnumSymbols> DiaEnumerator);
+
+  uint32_t getChildCount() const override;
+  std::unique_ptr<PDBSymbol> getChildAtIndex(uint32_t Index) const override;
+  std::unique_ptr<PDBSymbol> getNext() override;
+  void reset() override;
+  DIAEnumSymbols *clone() const override;
+
+private:
+  const DIASession &Session;
+  CComPtr<IDiaEnumSymbols> Enumerator;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIALineNumber.h b/include/llvm/DebugInfo/PDB/DIA/DIALineNumber.h
new file mode 100644
index 0000000..5950a0d
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIALineNumber.h
@@ -0,0 +1,39 @@
+//===- DIALineNumber.h - DIA implementation of IPDBLineNumber ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIALINENUMBER_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIALINENUMBER_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
+
+namespace llvm {
+class DIALineNumber : public IPDBLineNumber {
+public:
+  explicit DIALineNumber(CComPtr<IDiaLineNumber> DiaLineNumber);
+
+  uint32_t getLineNumber() const override;
+  uint32_t getLineNumberEnd() const override;
+  uint32_t getColumnNumber() const override;
+  uint32_t getColumnNumberEnd() const override;
+  uint32_t getAddressSection() const override;
+  uint32_t getAddressOffset() const override;
+  uint32_t getRelativeVirtualAddress() const override;
+  uint64_t getVirtualAddress() const override;
+  uint32_t getLength() const override;
+  uint32_t getSourceFileId() const override;
+  uint32_t getCompilandId() const override;
+  bool isStatement() const override;
+
+private:
+  CComPtr<IDiaLineNumber> LineNumber;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h b/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h
new file mode 100644
index 0000000..c71f35c
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h
@@ -0,0 +1,206 @@
+//===- DIARawSymbol.h - DIA implementation of IPDBRawSymbol ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIARAWSYMBOL_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIARAWSYMBOL_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+
+namespace llvm {
+class DIASession;
+class DIARawSymbol : public IPDBRawSymbol {
+public:
+  DIARawSymbol(const DIASession &PDBSession, CComPtr<IDiaSymbol> DiaSymbol);
+
+  void dump(raw_ostream &OS, int Indent) const override;
+
+  CComPtr<IDiaSymbol> getDiaSymbol() const { return Symbol; }
+
+  std::unique_ptr<IPDBEnumSymbols>
+  DIARawSymbol::findChildren(PDB_SymType Type) const override;
+  std::unique_ptr<IPDBEnumSymbols>
+  findChildren(PDB_SymType Type, StringRef Name,
+               PDB_NameSearchFlags Flags) const override;
+  std::unique_ptr<IPDBEnumSymbols>
+  findChildrenByRVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags,
+                    uint32_t RVA) const override;
+  std::unique_ptr<IPDBEnumSymbols>
+  findInlineFramesByRVA(uint32_t RVA) const override;
+
+  void getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) const override;
+  void getFrontEndVersion(VersionInfo &Version) const override;
+  void getBackEndVersion(VersionInfo &Version) const override;
+  PDB_MemberAccess getAccess() const override;
+  uint32_t getAddressOffset() const override;
+  uint32_t getAddressSection() const override;
+  uint32_t getAge() const override;
+  uint32_t getArrayIndexTypeId() const override;
+  uint32_t getBaseDataOffset() const override;
+  uint32_t getBaseDataSlot() const override;
+  uint32_t getBaseSymbolId() const override;
+  PDB_BuiltinType getBuiltinType() const override;
+  uint32_t getBitPosition() const override;
+  PDB_CallingConv getCallingConvention() const override;
+  uint32_t getClassParentId() const override;
+  std::string getCompilerName() const override;
+  uint32_t getCount() const override;
+  uint32_t getCountLiveRanges() const override;
+  PDB_Lang getLanguage() const override;
+  uint32_t getLexicalParentId() const override;
+  std::string getLibraryName() const override;
+  uint32_t getLiveRangeStartAddressOffset() const override;
+  uint32_t getLiveRangeStartAddressSection() const override;
+  uint32_t getLiveRangeStartRelativeVirtualAddress() const override;
+  PDB_RegisterId getLocalBasePointerRegisterId() const override;
+  uint32_t getLowerBoundId() const override;
+  uint32_t getMemorySpaceKind() const override;
+  std::string getName() const override;
+  uint32_t getNumberOfAcceleratorPointerTags() const override;
+  uint32_t getNumberOfColumns() const override;
+  uint32_t getNumberOfModifiers() const override;
+  uint32_t getNumberOfRegisterIndices() const override;
+  uint32_t getNumberOfRows() const override;
+  std::string getObjectFileName() const override;
+  uint32_t getOemId() const override;
+  uint32_t getOemSymbolId() const override;
+  uint32_t getOffsetInUdt() const override;
+  PDB_Cpu getPlatform() const override;
+  uint32_t getRank() const override;
+  PDB_RegisterId getRegisterId() const override;
+  uint32_t getRegisterType() const override;
+  uint32_t getRelativeVirtualAddress() const override;
+  uint32_t getSamplerSlot() const override;
+  uint32_t getSignature() const override;
+  uint32_t getSizeInUdt() const override;
+  uint32_t getSlot() const override;
+  std::string getSourceFileName() const override;
+  uint32_t getStride() const override;
+  uint32_t getSubTypeId() const override;
+  std::string getSymbolsFileName() const override;
+  uint32_t getSymIndexId() const override;
+  uint32_t getTargetOffset() const override;
+  uint32_t getTargetRelativeVirtualAddress() const override;
+  uint64_t getTargetVirtualAddress() const override;
+  uint32_t getTargetSection() const override;
+  uint32_t getTextureSlot() const override;
+  uint32_t getTimeStamp() const override;
+  uint32_t getToken() const override;
+  uint32_t getTypeId() const override;
+  uint32_t getUavSlot() const override;
+  std::string getUndecoratedName() const override;
+  uint32_t getUnmodifiedTypeId() const override;
+  uint32_t getUpperBoundId() const override;
+  Variant getValue() const override;
+  uint32_t getVirtualBaseDispIndex() const override;
+  uint32_t getVirtualBaseOffset() const override;
+  uint32_t getVirtualTableShapeId() const override;
+  PDB_DataKind getDataKind() const override;
+  PDB_SymType getSymTag() const override;
+  PDB_UniqueId getGuid() const override;
+  int32_t getOffset() const override;
+  int32_t getThisAdjust() const override;
+  int32_t getVirtualBasePointerOffset() const override;
+  PDB_LocType getLocationType() const override;
+  PDB_Machine getMachineType() const override;
+  PDB_ThunkOrdinal getThunkOrdinal() const override;
+  uint64_t getLength() const override;
+  uint64_t getLiveRangeLength() const override;
+  uint64_t getVirtualAddress() const override;
+  PDB_UdtType getUdtKind() const override;
+  bool hasConstructor() const override;
+  bool hasCustomCallingConvention() const override;
+  bool hasFarReturn() const override;
+  bool isCode() const override;
+  bool isCompilerGenerated() const override;
+  bool isConstType() const override;
+  bool isEditAndContinueEnabled() const override;
+  bool isFunction() const override;
+  bool getAddressTaken() const override;
+  bool getNoStackOrdering() const override;
+  bool hasAlloca() const override;
+  bool hasAssignmentOperator() const override;
+  bool hasCTypes() const override;
+  bool hasCastOperator() const override;
+  bool hasDebugInfo() const override;
+  bool hasEH() const override;
+  bool hasEHa() const override;
+  bool hasInlAsm() const override;
+  bool hasInlineAttribute() const override;
+  bool hasInterruptReturn() const override;
+  bool hasFramePointer() const override;
+  bool hasLongJump() const override;
+  bool hasManagedCode() const override;
+  bool hasNestedTypes() const override;
+  bool hasNoInlineAttribute() const override;
+  bool hasNoReturnAttribute() const override;
+  bool hasOptimizedCodeDebugInfo() const override;
+  bool hasOverloadedOperator() const override;
+  bool hasSEH() const override;
+  bool hasSecurityChecks() const override;
+  bool hasSetJump() const override;
+  bool hasStrictGSCheck() const override;
+  bool isAcceleratorGroupSharedLocal() const override;
+  bool isAcceleratorPointerTagLiveRange() const override;
+  bool isAcceleratorStubFunction() const override;
+  bool isAggregated() const override;
+  bool isIntroVirtualFunction() const override;
+  bool isCVTCIL() const override;
+  bool isConstructorVirtualBase() const override;
+  bool isCxxReturnUdt() const override;
+  bool isDataAligned() const override;
+  bool isHLSLData() const override;
+  bool isHotpatchable() const override;
+  bool isIndirectVirtualBaseClass() const override;
+  bool isInterfaceUdt() const override;
+  bool isIntrinsic() const override;
+  bool isLTCG() const override;
+  bool isLocationControlFlowDependent() const override;
+  bool isMSILNetmodule() const override;
+  bool isMatrixRowMajor() const override;
+  bool isManagedCode() const override;
+  bool isMSILCode() const override;
+  bool isMultipleInheritance() const override;
+  bool isNaked() const override;
+  bool isNested() const override;
+  bool isOptimizedAway() const override;
+  bool isPacked() const override;
+  bool isPointerBasedOnSymbolValue() const override;
+  bool isPointerToDataMember() const override;
+  bool isPointerToMemberFunction() const override;
+  bool isPureVirtual() const override;
+  bool isRValueReference() const override;
+  bool isRefUdt() const override;
+  bool isReference() const override;
+  bool isRestrictedType() const override;
+  bool isReturnValue() const override;
+  bool isSafeBuffers() const override;
+  bool isScoped() const override;
+  bool isSdl() const override;
+  bool isSingleInheritance() const override;
+  bool isSplitted() const override;
+  bool isStatic() const override;
+  bool hasPrivateSymbols() const override;
+  bool isUnalignedType() const override;
+  bool isUnreached() const override;
+  bool isValueUdt() const override;
+  bool isVirtual() const override;
+  bool isVirtualBaseClass() const override;
+  bool isVirtualInheritance() const override;
+  bool isVolatileType() const override;
+  bool wasInlined() const override;
+  std::string getUnused() const override;
+
+private:
+  const DIASession &Session;
+  CComPtr<IDiaSymbol> Symbol;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIASession.h b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
new file mode 100644
index 0000000..141b9b0
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
@@ -0,0 +1,42 @@
+//===- DIASession.h - DIA implementation of IPDBSession ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIASESSION_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIASESSION_H
+
+#include "DIASupport.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+
+namespace llvm {
+class DIASession : public IPDBSession {
+public:
+  explicit DIASession(CComPtr<IDiaSession> DiaSession);
+
+  static DIASession *createFromPdb(StringRef Path);
+
+  uint64_t getLoadAddress() const override;
+  void setLoadAddress(uint64_t Address) override;
+  std::unique_ptr<PDBSymbolExe> getGlobalScope() const override;
+  std::unique_ptr<PDBSymbol> getSymbolById(uint32_t SymbolId) const override;
+
+  std::unique_ptr<IPDBEnumSourceFiles> getAllSourceFiles() const override;
+  std::unique_ptr<IPDBEnumSourceFiles> getSourceFilesForCompiland(
+      const PDBSymbolCompiland &Compiland) const override;
+  std::unique_ptr<IPDBSourceFile>
+  getSourceFileById(uint32_t FileId) const override;
+
+  std::unique_ptr<IPDBEnumDataStreams> getDebugStreams() const override;
+
+private:
+  CComPtr<IDiaSession> Session;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIASourceFile.h b/include/llvm/DebugInfo/PDB/DIA/DIASourceFile.h
new file mode 100644
index 0000000..c424e27
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIASourceFile.h
@@ -0,0 +1,36 @@
+//===- DIASourceFile.h - DIA implementation of IPDBSourceFile ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIASOURCEFILE_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIASOURCEFILE_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+
+namespace llvm {
+class DIASession;
+
+class DIASourceFile : public IPDBSourceFile {
+public:
+  explicit DIASourceFile(const DIASession &Session,
+                         CComPtr<IDiaSourceFile> DiaSourceFile);
+
+  std::string getFileName() const override;
+  uint32_t getUniqueId() const override;
+  std::string getChecksum() const override;
+  PDB_Checksum getChecksumType() const override;
+  std::unique_ptr<IPDBEnumSymbols> getCompilands() const override;
+
+private:
+  const DIASession &Session;
+  CComPtr<IDiaSourceFile> SourceFile;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIASupport.h b/include/llvm/DebugInfo/PDB/DIA/DIASupport.h
new file mode 100644
index 0000000..407a345
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIASupport.h
@@ -0,0 +1,33 @@
+//===- DIASupport.h - Common header includes for DIA ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Common defines and header includes for all LLVMDebugInfoPDBDIA.  The
+// definitions here configure the necessary #defines and include system headers
+// in the proper order for using DIA.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIASUPPORT_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIASUPPORT_H
+
+// Require at least Vista
+#define NTDDI_VERSION NTDDI_VISTA
+#define _WIN32_WINNT _WIN32_WINNT_VISTA
+#define WINVER _WIN32_WINNT_VISTA
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+
+// atlbase.h has to come before windows.h
+#include <atlbase.h>
+#include <windows.h>
+
+// DIA headers must come after windows headers.
+#include <cvconst.h>
+#include <dia2.h>
+
+#endif // LLVM_DEBUGINFO_PDB_DIA_DIASUPPORT_H
diff --git a/include/llvm/DebugInfo/PDB/IPDBDataStream.h b/include/llvm/DebugInfo/PDB/IPDBDataStream.h
new file mode 100644
index 0000000..808a0f3
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/IPDBDataStream.h
@@ -0,0 +1,37 @@
+//===- IPDBDataStream.h - base interface for child enumerator -*- C++ ---*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBDATASTREAM_H
+#define LLVM_DEBUGINFO_PDB_IPDBDATASTREAM_H
+
+#include "PDBTypes.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+/// IPDBDataStream defines an interface used to represent a stream consisting
+/// of a name and a series of records whose formats depend on the particular
+/// stream type.
+class IPDBDataStream {
+public:
+  typedef llvm::SmallVector<uint8_t, 32> RecordType;
+
+  virtual ~IPDBDataStream();
+
+  virtual uint32_t getRecordCount() const = 0;
+  virtual std::string getName() const = 0;
+  virtual llvm::Optional<RecordType> getItemAtIndex(uint32_t Index) const = 0;
+  virtual bool getNext(RecordType &Record) = 0;
+  virtual void reset() = 0;
+  virtual IPDBDataStream *clone() const = 0;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/IPDBEnumChildren.h b/include/llvm/DebugInfo/PDB/IPDBEnumChildren.h
new file mode 100644
index 0000000..645ac96
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/IPDBEnumChildren.h
@@ -0,0 +1,33 @@
+//===- IPDBEnumChildren.h - base interface for child enumerator -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBENUMCHILDREN_H
+#define LLVM_DEBUGINFO_PDB_IPDBENUMCHILDREN_H
+
+#include "PDBTypes.h"
+#include <memory>
+
+namespace llvm {
+
+template <typename ChildType> class IPDBEnumChildren {
+public:
+  typedef std::unique_ptr<ChildType> ChildTypePtr;
+  typedef IPDBEnumChildren<ChildType> MyType;
+
+  virtual ~IPDBEnumChildren() {}
+
+  virtual uint32_t getChildCount() const = 0;
+  virtual ChildTypePtr getChildAtIndex(uint32_t Index) const = 0;
+  virtual ChildTypePtr getNext() = 0;
+  virtual void reset() = 0;
+  virtual MyType *clone() const = 0;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/IPDBLineNumber.h b/include/llvm/DebugInfo/PDB/IPDBLineNumber.h
new file mode 100644
index 0000000..92cd58d
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/IPDBLineNumber.h
@@ -0,0 +1,36 @@
+//===- IPDBLineNumber.h - base interface for PDB line no. info ---*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBLINENUMBER_H
+#define LLVM_DEBUGINFO_PDB_IPDBLINENUMBER_H
+
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class IPDBLineNumber {
+public:
+  virtual ~IPDBLineNumber();
+
+  virtual uint32_t getLineNumber() const = 0;
+  virtual uint32_t getLineNumberEnd() const = 0;
+  virtual uint32_t getColumnNumber() const = 0;
+  virtual uint32_t getColumnNumberEnd() const = 0;
+  virtual uint32_t getAddressSection() const = 0;
+  virtual uint32_t getAddressOffset() const = 0;
+  virtual uint32_t getRelativeVirtualAddress() const = 0;
+  virtual uint64_t getVirtualAddress() const = 0;
+  virtual uint32_t getLength() const = 0;
+  virtual uint32_t getSourceFileId() const = 0;
+  virtual uint32_t getCompilandId() const = 0;
+  virtual bool isStatement() const = 0;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h b/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h
new file mode 100644
index 0000000..139bff5
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h
@@ -0,0 +1,211 @@
+//===- IPDBRawSymbol.h - base interface for PDB symbol types ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBRAWSYMBOL_H
+#define LLVM_DEBUGINFO_PDB_IPDBRAWSYMBOL_H
+
+#include "PDBTypes.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include <memory>
+
+namespace llvm {
+
+class raw_ostream;
+
+/// IPDBRawSymbol defines an interface used to represent an arbitrary symbol.
+/// It exposes a monolithic interface consisting of accessors for the union of
+/// all properties that are valid for any symbol type.  This interface is then
+/// wrapped by a concrete class which exposes only those set of methods valid
+/// for this particular symbol type.  See PDBSymbol.h for more details.
+class IPDBRawSymbol {
+public:
+  virtual ~IPDBRawSymbol();
+
+  virtual void dump(raw_ostream &OS, int Indent) const = 0;
+
+  virtual std::unique_ptr<IPDBEnumSymbols>
+  findChildren(PDB_SymType Type) const = 0;
+
+  virtual std::unique_ptr<IPDBEnumSymbols>
+  findChildren(PDB_SymType Type, StringRef Name,
+               PDB_NameSearchFlags Flags) const = 0;
+  virtual std::unique_ptr<IPDBEnumSymbols>
+  findChildrenByRVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags,
+                    uint32_t RVA) const = 0;
+  virtual std::unique_ptr<IPDBEnumSymbols>
+  findInlineFramesByRVA(uint32_t RVA) const = 0;
+
+  virtual void getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) const = 0;
+  virtual void getBackEndVersion(VersionInfo &Version) const = 0;
+  virtual PDB_MemberAccess getAccess() const = 0;
+  virtual uint32_t getAddressOffset() const = 0;
+  virtual uint32_t getAddressSection() const = 0;
+  virtual uint32_t getAge() const = 0;
+  virtual uint32_t getArrayIndexTypeId() const = 0;
+  virtual uint32_t getBaseDataOffset() const = 0;
+  virtual uint32_t getBaseDataSlot() const = 0;
+  virtual uint32_t getBaseSymbolId() const = 0;
+  virtual PDB_BuiltinType getBuiltinType() const = 0;
+  virtual uint32_t getBitPosition() const = 0;
+  virtual PDB_CallingConv getCallingConvention() const = 0;
+  virtual uint32_t getClassParentId() const = 0;
+  virtual std::string getCompilerName() const = 0;
+  virtual uint32_t getCount() const = 0;
+  virtual uint32_t getCountLiveRanges() const = 0;
+  virtual void getFrontEndVersion(VersionInfo &Version) const = 0;
+  virtual PDB_Lang getLanguage() const = 0;
+  virtual uint32_t getLexicalParentId() const = 0;
+  virtual std::string getLibraryName() const = 0;
+  virtual uint32_t getLiveRangeStartAddressOffset() const = 0;
+  virtual uint32_t getLiveRangeStartAddressSection() const = 0;
+  virtual uint32_t getLiveRangeStartRelativeVirtualAddress() const = 0;
+  virtual PDB_RegisterId getLocalBasePointerRegisterId() const = 0;
+  virtual uint32_t getLowerBoundId() const = 0;
+  virtual uint32_t getMemorySpaceKind() const = 0;
+  virtual std::string getName() const = 0;
+  virtual uint32_t getNumberOfAcceleratorPointerTags() const = 0;
+  virtual uint32_t getNumberOfColumns() const = 0;
+  virtual uint32_t getNumberOfModifiers() const = 0;
+  virtual uint32_t getNumberOfRegisterIndices() const = 0;
+  virtual uint32_t getNumberOfRows() const = 0;
+  virtual std::string getObjectFileName() const = 0;
+  virtual uint32_t getOemId() const = 0;
+  virtual uint32_t getOemSymbolId() const = 0;
+  virtual uint32_t getOffsetInUdt() const = 0;
+  virtual PDB_Cpu getPlatform() const = 0;
+  virtual uint32_t getRank() const = 0;
+  virtual PDB_RegisterId getRegisterId() const = 0;
+  virtual uint32_t getRegisterType() const = 0;
+  virtual uint32_t getRelativeVirtualAddress() const = 0;
+  virtual uint32_t getSamplerSlot() const = 0;
+  virtual uint32_t getSignature() const = 0;
+  virtual uint32_t getSizeInUdt() const = 0;
+  virtual uint32_t getSlot() const = 0;
+  virtual std::string getSourceFileName() const = 0;
+  virtual uint32_t getStride() const = 0;
+  virtual uint32_t getSubTypeId() const = 0;
+  virtual std::string getSymbolsFileName() const = 0;
+  virtual uint32_t getSymIndexId() const = 0;
+  virtual uint32_t getTargetOffset() const = 0;
+  virtual uint32_t getTargetRelativeVirtualAddress() const = 0;
+  virtual uint64_t getTargetVirtualAddress() const = 0;
+  virtual uint32_t getTargetSection() const = 0;
+  virtual uint32_t getTextureSlot() const = 0;
+  virtual uint32_t getTimeStamp() const = 0;
+  virtual uint32_t getToken() const = 0;
+  virtual uint32_t getTypeId() const = 0;
+  virtual uint32_t getUavSlot() const = 0;
+  virtual std::string getUndecoratedName() const = 0;
+  virtual uint32_t getUnmodifiedTypeId() const = 0;
+  virtual uint32_t getUpperBoundId() const = 0;
+  virtual Variant getValue() const = 0;
+  virtual uint32_t getVirtualBaseDispIndex() const = 0;
+  virtual uint32_t getVirtualBaseOffset() const = 0;
+  virtual uint32_t getVirtualTableShapeId() const = 0;
+  virtual PDB_DataKind getDataKind() const = 0;
+  virtual PDB_SymType getSymTag() const = 0;
+  virtual PDB_UniqueId getGuid() const = 0;
+  virtual int32_t getOffset() const = 0;
+  virtual int32_t getThisAdjust() const = 0;
+  virtual int32_t getVirtualBasePointerOffset() const = 0;
+  virtual PDB_LocType getLocationType() const = 0;
+  virtual PDB_Machine getMachineType() const = 0;
+  virtual PDB_ThunkOrdinal getThunkOrdinal() const = 0;
+  virtual uint64_t getLength() const = 0;
+  virtual uint64_t getLiveRangeLength() const = 0;
+  virtual uint64_t getVirtualAddress() const = 0;
+  virtual PDB_UdtType getUdtKind() const = 0;
+  virtual bool hasConstructor() const = 0;
+  virtual bool hasCustomCallingConvention() const = 0;
+  virtual bool hasFarReturn() const = 0;
+  virtual bool isCode() const = 0;
+  virtual bool isCompilerGenerated() const = 0;
+  virtual bool isConstType() const = 0;
+  virtual bool isEditAndContinueEnabled() const = 0;
+  virtual bool isFunction() const = 0;
+  virtual bool getAddressTaken() const = 0;
+  virtual bool getNoStackOrdering() const = 0;
+  virtual bool hasAlloca() const = 0;
+  virtual bool hasAssignmentOperator() const = 0;
+  virtual bool hasCTypes() const = 0;
+  virtual bool hasCastOperator() const = 0;
+  virtual bool hasDebugInfo() const = 0;
+  virtual bool hasEH() const = 0;
+  virtual bool hasEHa() const = 0;
+  virtual bool hasFramePointer() const = 0;
+  virtual bool hasInlAsm() const = 0;
+  virtual bool hasInlineAttribute() const = 0;
+  virtual bool hasInterruptReturn() const = 0;
+  virtual bool hasLongJump() const = 0;
+  virtual bool hasManagedCode() const = 0;
+  virtual bool hasNestedTypes() const = 0;
+  virtual bool hasNoInlineAttribute() const = 0;
+  virtual bool hasNoReturnAttribute() const = 0;
+  virtual bool hasOptimizedCodeDebugInfo() const = 0;
+  virtual bool hasOverloadedOperator() const = 0;
+  virtual bool hasSEH() const = 0;
+  virtual bool hasSecurityChecks() const = 0;
+  virtual bool hasSetJump() const = 0;
+  virtual bool hasStrictGSCheck() const = 0;
+  virtual bool isAcceleratorGroupSharedLocal() const = 0;
+  virtual bool isAcceleratorPointerTagLiveRange() const = 0;
+  virtual bool isAcceleratorStubFunction() const = 0;
+  virtual bool isAggregated() const = 0;
+  virtual bool isIntroVirtualFunction() const = 0;
+  virtual bool isCVTCIL() const = 0;
+  virtual bool isConstructorVirtualBase() const = 0;
+  virtual bool isCxxReturnUdt() const = 0;
+  virtual bool isDataAligned() const = 0;
+  virtual bool isHLSLData() const = 0;
+  virtual bool isHotpatchable() const = 0;
+  virtual bool isIndirectVirtualBaseClass() const = 0;
+  virtual bool isInterfaceUdt() const = 0;
+  virtual bool isIntrinsic() const = 0;
+  virtual bool isLTCG() const = 0;
+  virtual bool isLocationControlFlowDependent() const = 0;
+  virtual bool isMSILNetmodule() const = 0;
+  virtual bool isMatrixRowMajor() const = 0;
+  virtual bool isManagedCode() const = 0;
+  virtual bool isMSILCode() const = 0;
+  virtual bool isMultipleInheritance() const = 0;
+  virtual bool isNaked() const = 0;
+  virtual bool isNested() const = 0;
+  virtual bool isOptimizedAway() const = 0;
+  virtual bool isPacked() const = 0;
+  virtual bool isPointerBasedOnSymbolValue() const = 0;
+  virtual bool isPointerToDataMember() const = 0;
+  virtual bool isPointerToMemberFunction() const = 0;
+  virtual bool isPureVirtual() const = 0;
+  virtual bool isRValueReference() const = 0;
+  virtual bool isRefUdt() const = 0;
+  virtual bool isReference() const = 0;
+  virtual bool isRestrictedType() const = 0;
+  virtual bool isReturnValue() const = 0;
+  virtual bool isSafeBuffers() const = 0;
+  virtual bool isScoped() const = 0;
+  virtual bool isSdl() const = 0;
+  virtual bool isSingleInheritance() const = 0;
+  virtual bool isSplitted() const = 0;
+  virtual bool isStatic() const = 0;
+  virtual bool hasPrivateSymbols() const = 0;
+  virtual bool isUnalignedType() const = 0;
+  virtual bool isUnreached() const = 0;
+  virtual bool isValueUdt() const = 0;
+  virtual bool isVirtual() const = 0;
+  virtual bool isVirtualBaseClass() const = 0;
+  virtual bool isVirtualInheritance() const = 0;
+  virtual bool isVolatileType() const = 0;
+  virtual bool wasInlined() const = 0;
+  virtual std::string getUnused() const = 0;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/IPDBSession.h b/include/llvm/DebugInfo/PDB/IPDBSession.h
new file mode 100644
index 0000000..60d6f62
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/IPDBSession.h
@@ -0,0 +1,56 @@
+//===- IPDBSession.h - base interface for a PDB symbol context --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBSESSION_H
+#define LLVM_DEBUGINFO_PDB_IPDBSESSION_H
+
+#include "PDBTypes.h"
+#include "llvm/Support/Casting.h"
+#include <memory>
+
+namespace llvm {
+
+class PDBSymbolCompiland;
+class PDBSymbolExe;
+
+/// IPDBSession defines an interface used to provide a context for querying
+/// debug information from a debug data source (for example, a PDB).
+class IPDBSession {
+public:
+  virtual ~IPDBSession();
+
+  virtual uint64_t getLoadAddress() const = 0;
+  virtual void setLoadAddress(uint64_t Address) = 0;
+  virtual std::unique_ptr<PDBSymbolExe> getGlobalScope() const = 0;
+  virtual std::unique_ptr<PDBSymbol> getSymbolById(uint32_t SymbolId) const = 0;
+
+  template <typename T>
+  std::unique_ptr<T> getConcreteSymbolById(uint32_t SymbolId) const {
+    auto Symbol(getSymbolById(SymbolId));
+    if (!Symbol)
+      return nullptr;
+
+    T *ConcreteSymbol = dyn_cast<T>(Symbol.get());
+    if (!ConcreteSymbol)
+      return nullptr;
+    Symbol.release();
+    return std::unique_ptr<T>(ConcreteSymbol);
+  }
+
+  virtual std::unique_ptr<IPDBEnumSourceFiles> getAllSourceFiles() const = 0;
+  virtual std::unique_ptr<IPDBEnumSourceFiles>
+  getSourceFilesForCompiland(const PDBSymbolCompiland &Compiland) const = 0;
+  virtual std::unique_ptr<IPDBSourceFile>
+  getSourceFileById(uint32_t FileId) const = 0;
+
+  virtual std::unique_ptr<IPDBEnumDataStreams> getDebugStreams() const = 0;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/IPDBSourceFile.h b/include/llvm/DebugInfo/PDB/IPDBSourceFile.h
new file mode 100644
index 0000000..55000ef
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/IPDBSourceFile.h
@@ -0,0 +1,37 @@
+//===- IPDBSourceFile.h - base interface for a PDB source file --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBSOURCEFILE_H
+#define LLVM_DEBUGINFO_PDB_IPDBSOURCEFILE_H
+
+#include "PDBTypes.h"
+#include <memory>
+#include <string>
+
+namespace llvm {
+
+class raw_ostream;
+
+/// IPDBSourceFile defines an interface used to represent source files whose
+/// information are stored in the PDB.
+class IPDBSourceFile {
+public:
+  virtual ~IPDBSourceFile();
+
+  void dump(raw_ostream &OS, int Indent) const;
+
+  virtual std::string getFileName() const = 0;
+  virtual uint32_t getUniqueId() const = 0;
+  virtual std::string getChecksum() const = 0;
+  virtual PDB_Checksum getChecksumType() const = 0;
+  virtual std::unique_ptr<IPDBEnumSymbols> getCompilands() const = 0;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/PDB.h b/include/llvm/DebugInfo/PDB/PDB.h
new file mode 100644
index 0000000..7193eef
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDB.h
@@ -0,0 +1,23 @@
+//===- PDB.h - base header file for creating a PDB reader -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDB_H
+#define LLVM_DEBUGINFO_PDB_PDB_H
+
+#include "PDBTypes.h"
+#include <memory>
+
+namespace llvm {
+class StringRef;
+
+std::unique_ptr<IPDBSession> createPDBReader(PDB_ReaderType Type,
+                                             StringRef Path);
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/PDBExtras.h b/include/llvm/DebugInfo/PDB/PDBExtras.h
new file mode 100644
index 0000000..cbbe171
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBExtras.h
@@ -0,0 +1,37 @@
+//===- PDBExtras.h - helper functions and classes for PDBs -------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBEXTRAS_H
+#define LLVM_DEBUGINFO_PDB_PDBEXTRAS_H
+
+#include "PDBTypes.h"
+#include "llvm/Support/raw_ostream.h"
+#include <unordered_map>
+
+namespace llvm {
+typedef std::unordered_map<PDB_SymType, int> TagStats;
+
+raw_ostream &operator<<(raw_ostream &OS, const PDB_VariantType &Value);
+raw_ostream &operator<<(raw_ostream &OS, const PDB_CallingConv &Conv);
+raw_ostream &operator<<(raw_ostream &OS, const PDB_DataKind &Data);
+raw_ostream &operator<<(raw_ostream &OS, const PDB_RegisterId &Reg);
+raw_ostream &operator<<(raw_ostream &OS, const PDB_LocType &Loc);
+raw_ostream &operator<<(raw_ostream &OS, const PDB_ThunkOrdinal &Thunk);
+raw_ostream &operator<<(raw_ostream &OS, const PDB_Checksum &Checksum);
+raw_ostream &operator<<(raw_ostream &OS, const PDB_Lang &Lang);
+raw_ostream &operator<<(raw_ostream &OS, const PDB_SymType &Tag);
+raw_ostream &operator<<(raw_ostream &OS, const PDB_BuiltinType &Type);
+raw_ostream &operator<<(raw_ostream &OS, const PDB_UniqueId &Id);
+
+raw_ostream &operator<<(raw_ostream &OS, const Variant &Value);
+raw_ostream &operator<<(raw_ostream &OS, const VersionInfo &Version);
+raw_ostream &operator<<(raw_ostream &OS, const TagStats &Stats);
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/PDBSymDumper.h b/include/llvm/DebugInfo/PDB/PDBSymDumper.h
new file mode 100644
index 0000000..dee601c
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymDumper.h
@@ -0,0 +1,85 @@
+//===- PDBSymDumper.h - base interface for PDB symbol dumper *- C++ -----*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMDUMPER_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMDUMPER_H
+
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymDumper {
+public:
+  PDBSymDumper(bool ShouldRequireImpl);
+  virtual ~PDBSymDumper();
+
+  virtual void dump(const PDBSymbolAnnotation &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolBlock &Symbol, raw_ostream &OS, int Indent);
+  virtual void dump(const PDBSymbolCompiland &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolCompilandDetails &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolCompilandEnv &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolCustom &Symbol, raw_ostream &OS, int Indent);
+  virtual void dump(const PDBSymbolData &Symbol, raw_ostream &OS, int Indent);
+  virtual void dump(const PDBSymbolExe &Symbol, raw_ostream &OS, int Indent);
+  virtual void dump(const PDBSymbolFunc &Symbol, raw_ostream &OS, int Indent);
+  virtual void dump(const PDBSymbolFuncDebugEnd &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolFuncDebugStart &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolLabel &Symbol, raw_ostream &OS, int Indent);
+  virtual void dump(const PDBSymbolPublicSymbol &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolThunk &Symbol, raw_ostream &OS, int Indent);
+  virtual void dump(const PDBSymbolTypeArray &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeBaseClass &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeBuiltin &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeCustom &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeDimension &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeEnum &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeFriend &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeFunctionArg &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeFunctionSig &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeManaged &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypePointer &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeUDT &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeVTable &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolTypeVTableShape &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolUnknown &Symbol, raw_ostream &OS,
+                    int Indent);
+  virtual void dump(const PDBSymbolUsingNamespace &Symbol, raw_ostream &OS,
+                    int Indent);
+
+private:
+  bool RequireImpl;
+};
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbol.h b/include/llvm/DebugInfo/PDB/PDBSymbol.h
new file mode 100644
index 0000000..36005eb
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbol.h
@@ -0,0 +1,98 @@
+//===- PDBSymbol.h - base class for user-facing symbol types -----*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBSYMBOL_H
+#define LLVM_DEBUGINFO_PDB_IPDBSYMBOL_H
+
+#include "ConcreteSymbolEnumerator.h"
+#include "IPDBRawSymbol.h"
+#include "PDBExtras.h"
+#include "PDBTypes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
+#include <unordered_map>
+
+#define FORWARD_SYMBOL_METHOD(MethodName)                                      \
+  auto MethodName() const->decltype(RawSymbol->MethodName()) {                 \
+    return RawSymbol->MethodName();                                            \
+  }
+
+namespace llvm {
+
+class IPDBRawSymbol;
+class raw_ostream;
+
+#define DECLARE_PDB_SYMBOL_CONCRETE_TYPE(TagValue)                             \
+  static const PDB_SymType Tag = TagValue;                                     \
+  static bool classof(const PDBSymbol *S) { return S->getSymTag() == Tag; }
+
+/// PDBSymbol defines the base of the inheritance hierarchy for concrete symbol
+/// types (e.g. functions, executables, vtables, etc).  All concrete symbol
+/// types inherit from PDBSymbol and expose the exact set of methods that are
+/// valid for that particular symbol type, as described in the Microsoft
+/// reference "Lexical and Class Hierarchy of Symbol Types":
+/// https://msdn.microsoft.com/en-us/library/370hs6k4.aspx
+class PDBSymbol {
+protected:
+  PDBSymbol(const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol);
+
+public:
+  static std::unique_ptr<PDBSymbol>
+  create(const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  virtual ~PDBSymbol();
+
+  /// Dumps the contents of a symbol a raw_ostream.  By default this will just
+  /// call dump() on the underlying RawSymbol, which allows us to discover
+  /// unknown properties, but individual implementations of PDBSymbol may
+  /// override the behavior to only dump known fields.
+  virtual void dump(raw_ostream &OS, int Indent,
+                    PDBSymDumper &Dumper) const = 0;
+  void defaultDump(raw_ostream &OS, int Indent) const;
+
+  PDB_SymType getSymTag() const;
+
+  template <typename T> std::unique_ptr<T> findOneChild() const {
+    auto Enumerator(findAllChildren<T>());
+    return Enumerator->getNext();
+  }
+
+  template <typename T>
+  std::unique_ptr<ConcreteSymbolEnumerator<T>> findAllChildren() const {
+    auto BaseIter = RawSymbol->findChildren(T::Tag);
+    return llvm::make_unique<ConcreteSymbolEnumerator<T>>(std::move(BaseIter));
+  }
+  std::unique_ptr<IPDBEnumSymbols> findAllChildren(PDB_SymType Type) const;
+  std::unique_ptr<IPDBEnumSymbols> findAllChildren() const;
+
+  std::unique_ptr<IPDBEnumSymbols>
+  findChildren(PDB_SymType Type, StringRef Name,
+               PDB_NameSearchFlags Flags) const;
+  std::unique_ptr<IPDBEnumSymbols> findChildrenByRVA(PDB_SymType Type,
+                                                     StringRef Name,
+                                                     PDB_NameSearchFlags Flags,
+                                                     uint32_t RVA) const;
+  std::unique_ptr<IPDBEnumSymbols> findInlineFramesByRVA(uint32_t RVA) const;
+
+  const IPDBRawSymbol &getRawSymbol() const { return *RawSymbol; }
+  IPDBRawSymbol &getRawSymbol() { return *RawSymbol; }
+
+  const IPDBSession &getSession() const { return Session; }
+
+  std::unique_ptr<IPDBEnumSymbols> getChildStats(TagStats &Stats) const;
+
+protected:
+  const IPDBSession &Session;
+  const std::unique_ptr<IPDBRawSymbol> RawSymbol;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolAnnotation.h b/include/llvm/DebugInfo/PDB/PDBSymbolAnnotation.h
new file mode 100644
index 0000000..347e30a
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolAnnotation.h
@@ -0,0 +1,39 @@
+//===- PDBSymbolAnnotation.h - Accessors for querying PDB annotations ---*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLANNOTATION_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLANNOTATION_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+#include <string>
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolAnnotation : public PDBSymbol {
+public:
+  PDBSymbolAnnotation(const IPDBSession &PDBSession,
+                      std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Annotation)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getAddressOffset)
+  FORWARD_SYMBOL_METHOD(getAddressSection)
+  FORWARD_SYMBOL_METHOD(getDataKind)
+  FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  // FORWARD_SYMBOL_METHOD(getValue)
+  FORWARD_SYMBOL_METHOD(getVirtualAddress)
+};
+}
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLANNOTATION_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h b/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h
new file mode 100644
index 0000000..a0091be
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h
@@ -0,0 +1,41 @@
+//===- PDBSymbolBlock.h - Accessors for querying PDB blocks -------------*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLBLOCK_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLBLOCK_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+#include <string>
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolBlock : public PDBSymbol {
+public:
+  PDBSymbolBlock(const IPDBSession &PDBSession,
+                 std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Block)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getAddressOffset)
+  FORWARD_SYMBOL_METHOD(getAddressSection)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getLocationType)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getVirtualAddress)
+};
+}
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLBLOCK_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h b/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h
new file mode 100644
index 0000000..055e444
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h
@@ -0,0 +1,38 @@
+//===- PDBSymbolCompiland.h - Accessors for querying PDB compilands -----*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLCOMPILAND_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLCOMPILAND_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+#include <string>
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolCompiland : public PDBSymbol {
+public:
+  PDBSymbolCompiland(const IPDBSession &PDBSession,
+                     std::unique_ptr<IPDBRawSymbol> CompilandSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Compiland)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(isEditAndContinueEnabled)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getLibraryName)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(getSourceFileName)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+};
+}
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLCOMPILAND_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h b/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h
new file mode 100644
index 0000000..8836828
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h
@@ -0,0 +1,55 @@
+//===- PDBSymbolCompilandDetails.h - PDB compiland details ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLCOMPILANDDETAILS_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLCOMPILANDDETAILS_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolCompilandDetails : public PDBSymbol {
+public:
+  PDBSymbolCompilandDetails(const IPDBSession &PDBSession,
+                            std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::CompilandDetails)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  void getFrontEndVersion(VersionInfo &Version) const {
+    RawSymbol->getFrontEndVersion(Version);
+  }
+
+  void getBackEndVersion(VersionInfo &Version) const {
+    RawSymbol->getBackEndVersion(Version);
+  }
+
+  FORWARD_SYMBOL_METHOD(getCompilerName)
+  FORWARD_SYMBOL_METHOD(isEditAndContinueEnabled)
+  FORWARD_SYMBOL_METHOD(hasDebugInfo)
+  FORWARD_SYMBOL_METHOD(hasManagedCode)
+  FORWARD_SYMBOL_METHOD(hasSecurityChecks)
+  FORWARD_SYMBOL_METHOD(isCVTCIL)
+  FORWARD_SYMBOL_METHOD(isDataAligned)
+  FORWARD_SYMBOL_METHOD(isHotpatchable)
+  FORWARD_SYMBOL_METHOD(isLTCG)
+  FORWARD_SYMBOL_METHOD(isMSILNetmodule)
+  FORWARD_SYMBOL_METHOD(getLanguage)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getPlatform)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBFUNCTION_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h b/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h
new file mode 100644
index 0000000..c3502a0
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h
@@ -0,0 +1,37 @@
+//===- PDBSymbolCompilandEnv.h - compiland environment variables *- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLCOMPILANDENV_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLCOMPILANDENV_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolCompilandEnv : public PDBSymbol {
+public:
+  PDBSymbolCompilandEnv(const IPDBSession &PDBSession,
+                        std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::CompilandEnv)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  std::string getValue() const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLCOMPILANDENV_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h b/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h
new file mode 100644
index 0000000..b433dde
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h
@@ -0,0 +1,39 @@
+//===- PDBSymbolCustom.h - compiler-specific types --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLCUSTOM_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLCUSTOM_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+/// PDBSymbolCustom represents symbols that are compiler-specific and do not
+/// fit anywhere else in the lexical hierarchy.
+/// https://msdn.microsoft.com/en-us/library/d88sf09h.aspx
+class PDBSymbolCustom : public PDBSymbol {
+public:
+  PDBSymbolCustom(const IPDBSession &PDBSession,
+                  std::unique_ptr<IPDBRawSymbol> CustomSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Custom)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  void getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes);
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLCUSTOM_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolData.h b/include/llvm/DebugInfo/PDB/PDBSymbolData.h
new file mode 100644
index 0000000..8b9a657
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolData.h
@@ -0,0 +1,61 @@
+//===- PDBSymbolData.h - PDB data (e.g. variable) accessors -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLDATA_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLDATA_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolData : public PDBSymbol {
+public:
+  PDBSymbolData(const IPDBSession &PDBSession,
+                std::unique_ptr<IPDBRawSymbol> DataSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Data)
+
+  std::unique_ptr<PDBSymbol> getType() const;
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getAccess)
+  FORWARD_SYMBOL_METHOD(getAddressOffset)
+  FORWARD_SYMBOL_METHOD(getAddressSection)
+  FORWARD_SYMBOL_METHOD(getAddressTaken)
+  FORWARD_SYMBOL_METHOD(getBitPosition)
+  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_METHOD(isCompilerGenerated)
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(getDataKind)
+  FORWARD_SYMBOL_METHOD(isAggregated)
+  FORWARD_SYMBOL_METHOD(isSplitted)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getLocationType)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(getOffset)
+  FORWARD_SYMBOL_METHOD(getRegisterId)
+  FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getSlot)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getToken)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(getValue)
+  FORWARD_SYMBOL_METHOD(getVirtualAddress)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLDATA_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolExe.h b/include/llvm/DebugInfo/PDB/PDBSymbolExe.h
new file mode 100644
index 0000000..33046f3
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolExe.h
@@ -0,0 +1,46 @@
+//===- PDBSymbolExe.h - Accessors for querying executables in a PDB ----*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLEXE_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLEXE_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+#include <string>
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolExe : public PDBSymbol {
+public:
+  PDBSymbolExe(const IPDBSession &PDBSession,
+               std::unique_ptr<IPDBRawSymbol> ExeSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Exe)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getAge)
+  FORWARD_SYMBOL_METHOD(getGuid)
+  FORWARD_SYMBOL_METHOD(hasCTypes)
+  FORWARD_SYMBOL_METHOD(hasPrivateSymbols)
+  FORWARD_SYMBOL_METHOD(getMachineType)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(getSignature)
+  FORWARD_SYMBOL_METHOD(getSymbolsFileName)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+
+private:
+  void dumpChildren(raw_ostream &OS, StringRef Label, PDB_SymType ChildType,
+                    int Indent) const;
+};
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLEXE_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h b/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
new file mode 100644
index 0000000..22ae6e0
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
@@ -0,0 +1,80 @@
+//===- PDBSymbolFunc.h - class representing a function instance -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNC_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNC_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolFunc : public PDBSymbol {
+public:
+  PDBSymbolFunc(const IPDBSession &PDBSession,
+                std::unique_ptr<IPDBRawSymbol> FuncSymbol);
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  std::unique_ptr<PDBSymbolTypeFunctionSig> getSignature() const;
+  std::unique_ptr<PDBSymbolTypeUDT> getClassParent() const;
+  std::unique_ptr<IPDBEnumChildren<PDBSymbolData>> getArguments() const;
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Function)
+
+  FORWARD_SYMBOL_METHOD(getAccess)
+  FORWARD_SYMBOL_METHOD(getAddressOffset)
+  FORWARD_SYMBOL_METHOD(getAddressSection)
+  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_METHOD(isCompilerGenerated)
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(hasCustomCallingConvention)
+  FORWARD_SYMBOL_METHOD(hasFarReturn)
+  FORWARD_SYMBOL_METHOD(hasAlloca)
+  FORWARD_SYMBOL_METHOD(hasEH)
+  FORWARD_SYMBOL_METHOD(hasEHa)
+  FORWARD_SYMBOL_METHOD(hasInlAsm)
+  FORWARD_SYMBOL_METHOD(hasLongJump)
+  FORWARD_SYMBOL_METHOD(hasSEH)
+  FORWARD_SYMBOL_METHOD(hasSecurityChecks)
+  FORWARD_SYMBOL_METHOD(hasSetJump)
+  FORWARD_SYMBOL_METHOD(hasInterruptReturn)
+  FORWARD_SYMBOL_METHOD(isIntroVirtualFunction)
+  FORWARD_SYMBOL_METHOD(hasInlineAttribute)
+  FORWARD_SYMBOL_METHOD(isNaked)
+  FORWARD_SYMBOL_METHOD(isStatic)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getLocalBasePointerRegisterId)
+  FORWARD_SYMBOL_METHOD(getLocationType)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(hasFramePointer)
+  FORWARD_SYMBOL_METHOD(hasNoInlineAttribute)
+  FORWARD_SYMBOL_METHOD(hasNoReturnAttribute)
+  FORWARD_SYMBOL_METHOD(isUnreached)
+  FORWARD_SYMBOL_METHOD(getNoStackOrdering)
+  FORWARD_SYMBOL_METHOD(hasOptimizedCodeDebugInfo)
+  FORWARD_SYMBOL_METHOD(isPureVirtual)
+  FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getToken)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(getUndecoratedName)
+  FORWARD_SYMBOL_METHOD(isVirtual)
+  FORWARD_SYMBOL_METHOD(getVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getVirtualBaseOffset)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNC_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h b/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h
new file mode 100644
index 0000000..bd49314
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h
@@ -0,0 +1,49 @@
+//===- PDBSymbolFuncDebugEnd.h - function end bounds info -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNCDEBUGEND_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNCDEBUGEND_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolFuncDebugEnd : public PDBSymbol {
+public:
+  PDBSymbolFuncDebugEnd(const IPDBSession &PDBSession,
+                        std::unique_ptr<IPDBRawSymbol> FuncDebugEndSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::FuncDebugEnd)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getAddressOffset)
+  FORWARD_SYMBOL_METHOD(getAddressSection)
+  FORWARD_SYMBOL_METHOD(hasCustomCallingConvention)
+  FORWARD_SYMBOL_METHOD(hasFarReturn)
+  FORWARD_SYMBOL_METHOD(hasInterruptReturn)
+  FORWARD_SYMBOL_METHOD(isStatic)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getLocationType)
+  FORWARD_SYMBOL_METHOD(hasNoInlineAttribute)
+  FORWARD_SYMBOL_METHOD(hasNoReturnAttribute)
+  FORWARD_SYMBOL_METHOD(isUnreached)
+  FORWARD_SYMBOL_METHOD(getOffset)
+  FORWARD_SYMBOL_METHOD(hasOptimizedCodeDebugInfo)
+  FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getVirtualAddress)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNCDEBUGEND_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h b/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h
new file mode 100644
index 0000000..a62eada
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h
@@ -0,0 +1,49 @@
+//===- PDBSymbolFuncDebugStart.h - function start bounds info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNCDEBUGSTART_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNCDEBUGSTART_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolFuncDebugStart : public PDBSymbol {
+public:
+  PDBSymbolFuncDebugStart(const IPDBSession &PDBSession,
+                          std::unique_ptr<IPDBRawSymbol> FuncDebugStartSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::FuncDebugStart)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getAddressOffset)
+  FORWARD_SYMBOL_METHOD(getAddressSection)
+  FORWARD_SYMBOL_METHOD(hasCustomCallingConvention)
+  FORWARD_SYMBOL_METHOD(hasFarReturn)
+  FORWARD_SYMBOL_METHOD(hasInterruptReturn)
+  FORWARD_SYMBOL_METHOD(isStatic)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getLocationType)
+  FORWARD_SYMBOL_METHOD(hasNoInlineAttribute)
+  FORWARD_SYMBOL_METHOD(hasNoReturnAttribute)
+  FORWARD_SYMBOL_METHOD(isUnreached)
+  FORWARD_SYMBOL_METHOD(getOffset)
+  FORWARD_SYMBOL_METHOD(hasOptimizedCodeDebugInfo)
+  FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getVirtualAddress)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNCDEBUGSTART_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h b/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h
new file mode 100644
index 0000000..d006495
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h
@@ -0,0 +1,49 @@
+//===- PDBSymbolLabel.h - label info ----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLLABEL_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLLABEL_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolLabel : public PDBSymbol {
+public:
+  PDBSymbolLabel(const IPDBSession &PDBSession,
+                 std::unique_ptr<IPDBRawSymbol> LabelSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Label)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getAddressOffset)
+  FORWARD_SYMBOL_METHOD(getAddressSection)
+  FORWARD_SYMBOL_METHOD(hasCustomCallingConvention)
+  FORWARD_SYMBOL_METHOD(hasFarReturn)
+  FORWARD_SYMBOL_METHOD(hasInterruptReturn)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getLocationType)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(hasNoInlineAttribute)
+  FORWARD_SYMBOL_METHOD(hasNoReturnAttribute)
+  FORWARD_SYMBOL_METHOD(isUnreached)
+  FORWARD_SYMBOL_METHOD(getOffset)
+  FORWARD_SYMBOL_METHOD(hasOptimizedCodeDebugInfo)
+  FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getVirtualAddress)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLLABEL_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h b/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h
new file mode 100644
index 0000000..a8de89d
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h
@@ -0,0 +1,47 @@
+//===- PDBSymbolPublicSymbol.h - public symbol info -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLPUBLICSYMBOL_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLPUBLICSYMBOL_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolPublicSymbol : public PDBSymbol {
+public:
+  PDBSymbolPublicSymbol(const IPDBSession &PDBSession,
+                        std::unique_ptr<IPDBRawSymbol> PublicSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::PublicSymbol)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getAddressOffset)
+  FORWARD_SYMBOL_METHOD(getAddressSection)
+  FORWARD_SYMBOL_METHOD(isCode)
+  FORWARD_SYMBOL_METHOD(isFunction)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getLocationType)
+  FORWARD_SYMBOL_METHOD(isManagedCode)
+  FORWARD_SYMBOL_METHOD(isMSILCode)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getUndecoratedName)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLPUBLICSYMBOL_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h b/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h
new file mode 100644
index 0000000..88588f1
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h
@@ -0,0 +1,57 @@
+//===- PDBSymbolThunk.h - Support for querying PDB thunks ---------------*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTHUNK_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTHUNK_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+#include <string>
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolThunk : public PDBSymbol {
+public:
+  PDBSymbolThunk(const IPDBSession &PDBSession,
+                 std::unique_ptr<IPDBRawSymbol> ThunkSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Thunk)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getAccess)
+  FORWARD_SYMBOL_METHOD(getAddressOffset)
+  FORWARD_SYMBOL_METHOD(getAddressSection)
+  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(isIntroVirtualFunction)
+  FORWARD_SYMBOL_METHOD(isStatic)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(isPureVirtual)
+  FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getTargetOffset)
+  FORWARD_SYMBOL_METHOD(getTargetRelativeVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getTargetVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getTargetSection)
+  FORWARD_SYMBOL_METHOD(getThunkOrdinal)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(isVirtual)
+  FORWARD_SYMBOL_METHOD(getVirtualAddress)
+  FORWARD_SYMBOL_METHOD(getVirtualBaseOffset)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTHUNK_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h
new file mode 100644
index 0000000..ca925f9
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h
@@ -0,0 +1,45 @@
+//===- PDBSymbolTypeArray.h - array type information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEARRAY_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEARRAY_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeArray : public PDBSymbol {
+public:
+  PDBSymbolTypeArray(const IPDBSession &PDBSession,
+                     std::unique_ptr<IPDBRawSymbol> ArrayTypeSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::ArrayType)
+
+  std::unique_ptr<PDBSymbol> getElementType() const;
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getArrayIndexTypeId)
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(getCount)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getRank)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEARRAY_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h
new file mode 100644
index 0000000..a3dcc02
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h
@@ -0,0 +1,60 @@
+//===- PDBSymbolTypeBaseClass.h - base class type information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEBASECLASS_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEBASECLASS_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeBaseClass : public PDBSymbol {
+public:
+  PDBSymbolTypeBaseClass(const IPDBSession &PDBSession,
+                         std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::BaseClass)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getAccess)
+  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_METHOD(hasConstructor)
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(hasAssignmentOperator)
+  FORWARD_SYMBOL_METHOD(hasCastOperator)
+  FORWARD_SYMBOL_METHOD(hasNestedTypes)
+  FORWARD_SYMBOL_METHOD(isIndirectVirtualBaseClass)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(isNested)
+  FORWARD_SYMBOL_METHOD(getOffset)
+  FORWARD_SYMBOL_METHOD(hasOverloadedOperator)
+  FORWARD_SYMBOL_METHOD(isPacked)
+  FORWARD_SYMBOL_METHOD(isScoped)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_METHOD(getUdtKind)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+
+  FORWARD_SYMBOL_METHOD(isVirtualBaseClass)
+  FORWARD_SYMBOL_METHOD(getVirtualBaseDispIndex)
+  FORWARD_SYMBOL_METHOD(getVirtualBasePointerOffset)
+  // FORWARD_SYMBOL_METHOD(getVirtualBaseTableType)
+  FORWARD_SYMBOL_METHOD(getVirtualTableShapeId)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEBASECLASS_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h
new file mode 100644
index 0000000..1cede08
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h
@@ -0,0 +1,40 @@
+//===- PDBSymbolTypeBuiltin.h - builtin type information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEBUILTIN_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEBUILTIN_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeBuiltin : public PDBSymbol {
+public:
+  PDBSymbolTypeBuiltin(const IPDBSession &PDBSession,
+                       std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::BuiltinType)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getBuiltinType)
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEBUILTIN_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h
new file mode 100644
index 0000000..90a043f
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h
@@ -0,0 +1,36 @@
+//===- PDBSymbolTypeCustom.h - custom compiler type information -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPECUSTOM_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPECUSTOM_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeCustom : public PDBSymbol {
+public:
+  PDBSymbolTypeCustom(const IPDBSession &PDBSession,
+                      std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::CustomType)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getOemId)
+  FORWARD_SYMBOL_METHOD(getOemSymbolId)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPECUSTOM_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h
new file mode 100644
index 0000000..f871681
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h
@@ -0,0 +1,36 @@
+//===- PDBSymbolTypeDimension.h - array dimension type info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEDIMENSION_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEDIMENSION_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeDimension : public PDBSymbol {
+public:
+  PDBSymbolTypeDimension(const IPDBSession &PDBSession,
+                         std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Dimension)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getLowerBoundId)
+  FORWARD_SYMBOL_METHOD(getUpperBoundId)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEDIMENSION_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
new file mode 100644
index 0000000..2479f46
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
@@ -0,0 +1,52 @@
+//===- PDBSymbolTypeEnum.h - enum type info ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEENUM_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEENUM_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeEnum : public PDBSymbol {
+public:
+  PDBSymbolTypeEnum(const IPDBSession &PDBSession,
+                    std::unique_ptr<IPDBRawSymbol> EnumTypeSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Enum)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getBuiltinType)
+  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_METHOD(getUnmodifiedTypeId)
+  FORWARD_SYMBOL_METHOD(hasConstructor)
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(hasAssignmentOperator)
+  FORWARD_SYMBOL_METHOD(hasCastOperator)
+  FORWARD_SYMBOL_METHOD(hasNestedTypes)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(isNested)
+  FORWARD_SYMBOL_METHOD(hasOverloadedOperator)
+  FORWARD_SYMBOL_METHOD(isPacked)
+  FORWARD_SYMBOL_METHOD(isScoped)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEENUM_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h
new file mode 100644
index 0000000..964246f
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h
@@ -0,0 +1,37 @@
+//===- PDBSymbolTypeFriend.h - friend type info -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEFRIEND_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEFRIEND_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeFriend : public PDBSymbol {
+public:
+  PDBSymbolTypeFriend(const IPDBSession &PDBSession,
+                      std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Friend)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEFRIEND_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h
new file mode 100644
index 0000000..22f1455
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h
@@ -0,0 +1,37 @@
+//===- PDBSymbolTypeFunctionArg.h - function arg type info ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEFUNCTIONARG_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEFUNCTIONARG_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeFunctionArg : public PDBSymbol {
+public:
+  PDBSymbolTypeFunctionArg(const IPDBSession &PDBSession,
+                           std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::FunctionArg)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEFUNCTIONARG_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h
new file mode 100644
index 0000000..82bb1fd
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h
@@ -0,0 +1,50 @@
+//===- PDBSymbolTypeFunctionSig.h - function signature type info *- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEFUNCTIONSIG_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEFUNCTIONSIG_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeFunctionSig : public PDBSymbol {
+public:
+  PDBSymbolTypeFunctionSig(const IPDBSession &PDBSession,
+                           std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::FunctionSig)
+
+  std::unique_ptr<PDBSymbol> getReturnType() const;
+  std::unique_ptr<IPDBEnumSymbols> getArguments() const;
+  std::unique_ptr<PDBSymbol> getClassParent() const;
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+  void dumpArgList(raw_ostream &OS) const;
+
+  FORWARD_SYMBOL_METHOD(getCallingConvention)
+  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_METHOD(getUnmodifiedTypeId)
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(getCount)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  // FORWARD_SYMBOL_METHOD(getObjectPointerType)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getThisAdjust)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEFUNCTIONSIG_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h
new file mode 100644
index 0000000..42f5867
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h
@@ -0,0 +1,35 @@
+//===- PDBSymbolTypeManaged.h - managed type info ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEMANAGED_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEMANAGED_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeManaged : public PDBSymbol {
+public:
+  PDBSymbolTypeManaged(const IPDBSession &PDBSession,
+                       std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::ManagedType)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEMANAGED_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
new file mode 100644
index 0000000..8b2806f
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
@@ -0,0 +1,43 @@
+//===- PDBSymbolTypePointer.h - pointer type info ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEPOINTER_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEPOINTER_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypePointer : public PDBSymbol {
+public:
+  PDBSymbolTypePointer(const IPDBSession &PDBSession,
+                       std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::PointerType)
+
+  std::unique_ptr<PDBSymbol> getPointeeType() const;
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(isReference)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEPOINTER_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h
new file mode 100644
index 0000000..d3a9ca2
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h
@@ -0,0 +1,54 @@
+//===- PDBSymbolTypeTypedef.h - typedef type info ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPETYPEDEF_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPETYPEDEF_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeTypedef : public PDBSymbol {
+public:
+  PDBSymbolTypeTypedef(const IPDBSession &PDBSession,
+                       std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Typedef)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getBuiltinType)
+  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_METHOD(hasConstructor)
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(hasAssignmentOperator)
+  FORWARD_SYMBOL_METHOD(hasCastOperator)
+  FORWARD_SYMBOL_METHOD(hasNestedTypes)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(isNested)
+  FORWARD_SYMBOL_METHOD(hasOverloadedOperator)
+  FORWARD_SYMBOL_METHOD(isPacked)
+  FORWARD_SYMBOL_METHOD(isReference)
+  FORWARD_SYMBOL_METHOD(isScoped)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_METHOD(getUdtKind)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(getVirtualTableShapeId)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPETYPEDEF_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
new file mode 100644
index 0000000..bf912b8
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
@@ -0,0 +1,52 @@
+//===- PDBSymbolTypeUDT.h - UDT type info -----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEUDT_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEUDT_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeUDT : public PDBSymbol {
+public:
+  PDBSymbolTypeUDT(const IPDBSession &PDBSession,
+                   std::unique_ptr<IPDBRawSymbol> UDTSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::UDT)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_METHOD(getUnmodifiedTypeId)
+  FORWARD_SYMBOL_METHOD(hasConstructor)
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(hasAssignmentOperator)
+  FORWARD_SYMBOL_METHOD(hasCastOperator)
+  FORWARD_SYMBOL_METHOD(hasNestedTypes)
+  FORWARD_SYMBOL_METHOD(getLength)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(isNested)
+  FORWARD_SYMBOL_METHOD(hasOverloadedOperator)
+  FORWARD_SYMBOL_METHOD(isPacked)
+  FORWARD_SYMBOL_METHOD(isScoped)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getUdtKind)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(getVirtualTableShapeId)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEUDT_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h
new file mode 100644
index 0000000..6b6d99b
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h
@@ -0,0 +1,40 @@
+//===- PDBSymbolTypeVTable.h - VTable type info -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEVTABLE_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEVTABLE_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeVTable : public PDBSymbol {
+public:
+  PDBSymbolTypeVTable(const IPDBSession &PDBSession,
+                      std::unique_ptr<IPDBRawSymbol> VtblSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::VTable)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEVTABLE_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h
new file mode 100644
index 0000000..e866106
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h
@@ -0,0 +1,39 @@
+//===- PDBSymbolTypeVTableShape.h - VTable shape info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEVTABLESHAPE_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEVTABLESHAPE_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolTypeVTableShape : public PDBSymbol {
+public:
+  PDBSymbolTypeVTableShape(const IPDBSession &PDBSession,
+                           std::unique_ptr<IPDBRawSymbol> VtblShapeSymbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::VTableShape)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(isConstType)
+  FORWARD_SYMBOL_METHOD(getCount)
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+  FORWARD_SYMBOL_METHOD(isUnalignedType)
+  FORWARD_SYMBOL_METHOD(isVolatileType)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEVTABLESHAPE_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h b/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h
new file mode 100644
index 0000000..7f8c6f9
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h
@@ -0,0 +1,34 @@
+//===- PDBSymbolUnknown.h - unknown symbol type -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLUNKNOWN_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLUNKNOWN_H
+
+#include "PDBSymbol.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolUnknown : public PDBSymbol {
+public:
+  PDBSymbolUnknown(const IPDBSession &PDBSession,
+                   std::unique_ptr<IPDBRawSymbol> UnknownSymbol);
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  static bool classof(const PDBSymbol *S) {
+    return (S->getSymTag() == PDB_SymType::None ||
+            S->getSymTag() >= PDB_SymType::Max);
+  }
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLUNKNOWN_H
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h b/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h
new file mode 100644
index 0000000..59ec16b
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h
@@ -0,0 +1,36 @@
+//===- PDBSymbolUsingNamespace.h - using namespace info ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLUSINGNAMESPACE_H
+#define LLVM_DEBUGINFO_PDB_PDBSYMBOLUSINGNAMESPACE_H
+
+#include "PDBSymbol.h"
+#include "PDBTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class PDBSymbolUsingNamespace : public PDBSymbol {
+public:
+  PDBSymbolUsingNamespace(const IPDBSession &PDBSession,
+                          std::unique_ptr<IPDBRawSymbol> Symbol);
+
+  DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::UsingNamespace)
+
+  void dump(raw_ostream &OS, int Indent, PDBSymDumper &Dumper) const override;
+
+  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_METHOD(getName)
+  FORWARD_SYMBOL_METHOD(getSymIndexId)
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_PDBSYMBOLUSINGNAMESPACE_H
diff --git a/include/llvm/DebugInfo/PDB/PDBTypes.h b/include/llvm/DebugInfo/PDB/PDBTypes.h
new file mode 100644
index 0000000..75b6475
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/PDBTypes.h
@@ -0,0 +1,479 @@
+//===- PDBTypes.h - Defines enums for various fields contained in PDB ---*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBTYPES_H
+#define LLVM_DEBUGINFO_PDB_PDBTYPES_H
+
+#include "llvm/Config/llvm-config.h"
+#include <functional>
+#include <stdint.h>
+
+namespace llvm {
+
+class PDBSymDumper;
+class PDBSymbol;
+
+class IPDBDataStream;
+template <class T> class IPDBEnumChildren;
+class IPDBRawSymbol;
+class IPDBSession;
+class IPDBSourceFile;
+
+typedef IPDBEnumChildren<PDBSymbol> IPDBEnumSymbols;
+typedef IPDBEnumChildren<IPDBSourceFile> IPDBEnumSourceFiles;
+typedef IPDBEnumChildren<IPDBDataStream> IPDBEnumDataStreams;
+
+class PDBSymbolExe;
+class PDBSymbolCompiland;
+class PDBSymbolCompilandDetails;
+class PDBSymbolCompilandEnv;
+class PDBSymbolFunc;
+class PDBSymbolBlock;
+class PDBSymbolData;
+class PDBSymbolAnnotation;
+class PDBSymbolLabel;
+class PDBSymbolPublicSymbol;
+class PDBSymbolTypeUDT;
+class PDBSymbolTypeEnum;
+class PDBSymbolTypeFunctionSig;
+class PDBSymbolTypePointer;
+class PDBSymbolTypeArray;
+class PDBSymbolTypeBuiltin;
+class PDBSymbolTypeTypedef;
+class PDBSymbolTypeBaseClass;
+class PDBSymbolTypeFriend;
+class PDBSymbolTypeFunctionArg;
+class PDBSymbolFuncDebugStart;
+class PDBSymbolFuncDebugEnd;
+class PDBSymbolUsingNamespace;
+class PDBSymbolTypeVTableShape;
+class PDBSymbolTypeVTable;
+class PDBSymbolCustom;
+class PDBSymbolThunk;
+class PDBSymbolTypeCustom;
+class PDBSymbolTypeManaged;
+class PDBSymbolTypeDimension;
+class PDBSymbolUnknown;
+
+/// Specifies which PDB reader implementation is to be used.  Only a value
+/// of PDB_ReaderType::DIA is supported.
+enum class PDB_ReaderType {
+  DIA = 0,
+};
+
+/// Defines a 128-bit unique identifier.  This maps to a GUID on Windows, but
+/// is abstracted here for the purposes of non-Windows platforms that don't have
+/// the GUID structure defined.
+struct PDB_UniqueId {
+  uint64_t HighPart;
+  uint64_t LowPart;
+};
+
+/// An enumeration indicating the type of data contained in this table.
+enum class PDB_TableType {
+  Symbols,
+  SourceFiles,
+  LineNumbers,
+  SectionContribs,
+  Segments,
+  InjectedSources,
+  FrameData
+};
+
+/// Defines flags used for enumerating child symbols.  This corresponds to the
+/// NameSearchOptions enumeration which is documented here:
+/// https://msdn.microsoft.com/en-us/library/yat28ads.aspx
+enum PDB_NameSearchFlags {
+  NS_Default = 0x0,
+  NS_CaseSensitive = 0x1,
+  NS_CaseInsensitive = 0x2,
+  NS_FileNameExtMatch = 0x4,
+  NS_Regex = 0x8,
+  NS_UndecoratedName = 0x10
+};
+
+/// Specifies the hash algorithm that a source file from a PDB was hashed with.
+/// This corresponds to the CV_SourceChksum_t enumeration and are documented
+/// here: https://msdn.microsoft.com/en-us/library/e96az21x.aspx
+enum class PDB_Checksum { None = 0, MD5 = 1, SHA1 = 2 };
+
+/// These values correspond to the CV_CPU_TYPE_e enumeration, and are documented
+/// here: https://msdn.microsoft.com/en-us/library/b2fc64ek.aspx
+enum class PDB_Cpu {
+  Intel8080 = 0x0,
+  Intel8086 = 0x1,
+  Intel80286 = 0x2,
+  Intel80386 = 0x3,
+  Intel80486 = 0x4,
+  Pentium = 0x5,
+  PentiumPro = 0x6,
+  Pentium3 = 0x7,
+  MIPS = 0x10,
+  MIPS16 = 0x11,
+  MIPS32 = 0x12,
+  MIPS64 = 0x13,
+  MIPSI = 0x14,
+  MIPSII = 0x15,
+  MIPSIII = 0x16,
+  MIPSIV = 0x17,
+  MIPSV = 0x18,
+  M68000 = 0x20,
+  M68010 = 0x21,
+  M68020 = 0x22,
+  M68030 = 0x23,
+  M68040 = 0x24,
+  Alpha = 0x30,
+  Alpha21164 = 0x31,
+  Alpha21164A = 0x32,
+  Alpha21264 = 0x33,
+  Alpha21364 = 0x34,
+  PPC601 = 0x40,
+  PPC603 = 0x41,
+  PPC604 = 0x42,
+  PPC620 = 0x43,
+  PPCFP = 0x44,
+  PPCBE = 0x45,
+  SH3 = 0x50,
+  SH3E = 0x51,
+  SH3DSP = 0x52,
+  SH4 = 0x53,
+  SHMedia = 0x54,
+  ARM3 = 0x60,
+  ARM4 = 0x61,
+  ARM4T = 0x62,
+  ARM5 = 0x63,
+  ARM5T = 0x64,
+  ARM6 = 0x65,
+  ARM_XMAC = 0x66,
+  ARM_WMMX = 0x67,
+  ARM7 = 0x68,
+  Omni = 0x70,
+  Ia64 = 0x80,
+  Ia64_2 = 0x81,
+  CEE = 0x90,
+  AM33 = 0xa0,
+  M32R = 0xb0,
+  TriCore = 0xc0,
+  X64 = 0xd0,
+  EBC = 0xe0,
+  Thumb = 0xf0,
+  ARMNT = 0xf4,
+  D3D11_Shader = 0x100,
+};
+
+enum class PDB_Machine {
+  Invalid = 0xffff,
+  Unknown = 0x0,
+  Am33 = 0x13,
+  Amd64 = 0x8664,
+  Arm = 0x1C0,
+  ArmNT = 0x1C4,
+  Ebc = 0xEBC,
+  x86 = 0x14C,
+  Ia64 = 0x200,
+  M32R = 0x9041,
+  Mips16 = 0x266,
+  MipsFpu = 0x366,
+  MipsFpu16 = 0x466,
+  PowerPC = 0x1F0,
+  PowerPCFP = 0x1F1,
+  R4000 = 0x166,
+  SH3 = 0x1A2,
+  SH3DSP = 0x1A3,
+  SH4 = 0x1A6,
+  SH5 = 0x1A8,
+  Thumb = 0x1C2,
+  WceMipsV2 = 0x169
+};
+
+/// These values correspond to the CV_call_e enumeration, and are documented
+/// at the following locations:
+///   https://msdn.microsoft.com/en-us/library/b2fc64ek.aspx
+///   https://msdn.microsoft.com/en-us/library/windows/desktop/ms680207(v=vs.85).aspx
+///
+enum class PDB_CallingConv {
+  NearCdecl = 0x00,
+  FarCdecl = 0x01,
+  NearPascal = 0x02,
+  FarPascal = 0x03,
+  NearFastcall = 0x04,
+  FarFastcall = 0x05,
+  Skipped = 0x06,
+  NearStdcall = 0x07,
+  FarStdcall = 0x08,
+  NearSyscall = 0x09,
+  FarSyscall = 0x0a,
+  Thiscall = 0x0b,
+  MipsCall = 0x0c,
+  Generic = 0x0d,
+  Alphacall = 0x0e,
+  Ppccall = 0x0f,
+  SuperHCall = 0x10,
+  Armcall = 0x11,
+  AM33call = 0x12,
+  Tricall = 0x13,
+  Sh5call = 0x14,
+  M32R = 0x15,
+  Clrcall = 0x16,
+  Inline = 0x17,
+  NearVectorcall = 0x18,
+  Reserved = 0x19,
+};
+
+/// These values correspond to the CV_CFL_LANG enumeration, and are documented
+/// here: https://msdn.microsoft.com/en-us/library/bw3aekw6.aspx
+enum class PDB_Lang {
+  C = 0x00,
+  Cpp = 0x01,
+  Fortran = 0x02,
+  Masm = 0x03,
+  Pascal = 0x04,
+  Basic = 0x05,
+  Cobol = 0x06,
+  Link = 0x07,
+  Cvtres = 0x08,
+  Cvtpgd = 0x09,
+  CSharp = 0x0a,
+  VB = 0x0b,
+  ILAsm = 0x0c,
+  Java = 0x0d,
+  JScript = 0x0e,
+  MSIL = 0x0f,
+  HLSL = 0x10
+};
+
+/// These values correspond to the DataKind enumeration, and are documented
+/// here: https://msdn.microsoft.com/en-us/library/b2x2t313.aspx
+enum class PDB_DataKind {
+  Unknown,
+  Local,
+  StaticLocal,
+  Param,
+  ObjectPtr,
+  FileStatic,
+  Global,
+  Member,
+  StaticMember,
+  Constant
+};
+
+/// These values correspond to the SymTagEnum enumeration, and are documented
+/// here: https://msdn.microsoft.com/en-us/library/bkedss5f.aspx
+enum class PDB_SymType {
+  None,
+  Exe,
+  Compiland,
+  CompilandDetails,
+  CompilandEnv,
+  Function,
+  Block,
+  Data,
+  Annotation,
+  Label,
+  PublicSymbol,
+  UDT,
+  Enum,
+  FunctionSig,
+  PointerType,
+  ArrayType,
+  BuiltinType,
+  Typedef,
+  BaseClass,
+  Friend,
+  FunctionArg,
+  FuncDebugStart,
+  FuncDebugEnd,
+  UsingNamespace,
+  VTableShape,
+  VTable,
+  Custom,
+  Thunk,
+  CustomType,
+  ManagedType,
+  Dimension,
+  Max
+};
+
+/// These values correspond to the LocationType enumeration, and are documented
+/// here: https://msdn.microsoft.com/en-us/library/f57kaez3.aspx
+enum class PDB_LocType {
+  Null,
+  Static,
+  TLS,
+  RegRel,
+  ThisRel,
+  Enregistered,
+  BitField,
+  Slot,
+  IlRel,
+  MetaData,
+  Constant,
+  Max
+};
+
+/// These values correspond to the THUNK_ORDINAL enumeration, and are documented
+/// here: https://msdn.microsoft.com/en-us/library/dh0k8hft.aspx
+enum class PDB_ThunkOrdinal {
+  Standard,
+  ThisAdjustor,
+  Vcall,
+  Pcode,
+  UnknownLoad,
+  TrampIncremental,
+  BranchIsland
+};
+
+/// These values correspond to the UdtKind enumeration, and are documented
+/// here: https://msdn.microsoft.com/en-us/library/wcstk66t.aspx
+enum class PDB_UdtType { Struct, Class, Union, Interface };
+
+/// These values correspond to the StackFrameTypeEnum enumeration, and are
+/// documented here: https://msdn.microsoft.com/en-us/library/bc5207xw.aspx.
+enum class PDB_StackFrameType { FPO, KernelTrap, KernelTSS, EBP, FrameData };
+
+/// These values correspond to the StackFrameTypeEnum enumeration, and are
+/// documented here: https://msdn.microsoft.com/en-us/library/bc5207xw.aspx.
+enum class PDB_MemoryType { Code, Data, Stack, HeapCode };
+
+/// These values correspond to the Basictype enumeration, and are documented
+/// here: https://msdn.microsoft.com/en-us/library/4szdtzc3.aspx
+enum class PDB_BuiltinType {
+  None = 0,
+  Void = 1,
+  Char = 2,
+  WCharT = 3,
+  Int = 6,
+  UInt = 7,
+  Float = 8,
+  BCD = 9,
+  Bool = 10,
+  Long = 13,
+  ULong = 14,
+  Currency = 25,
+  Date = 26,
+  Variant = 27,
+  Complex = 28,
+  Bitfield = 29,
+  BSTR = 30,
+  HResult = 31
+};
+
+enum class PDB_RegisterId {
+  Unknown = 0,
+  VFrame = 30006,
+  AL = 1,
+  CL = 2,
+  DL = 3,
+  BL = 4,
+  AH = 5,
+  CH = 6,
+  DH = 7,
+  BH = 8,
+  AX = 9,
+  CX = 10,
+  DX = 11,
+  BX = 12,
+  SP = 13,
+  BP = 14,
+  SI = 15,
+  DI = 16,
+  EAX = 17,
+  ECX = 18,
+  EDX = 19,
+  EBX = 20,
+  ESP = 21,
+  EBP = 22,
+  ESI = 23,
+  EDI = 24,
+  ES = 25,
+  CS = 26,
+  SS = 27,
+  DS = 28,
+  FS = 29,
+  GS = 30,
+  IP = 31,
+  RAX = 328,
+  RBX = 329,
+  RCX = 330,
+  RDX = 331,
+  RSI = 332,
+  RDI = 333,
+  RBP = 334,
+  RSP = 335,
+  R8 = 336,
+  R9 = 337,
+  R10 = 338,
+  R11 = 339,
+  R12 = 340,
+  R13 = 341,
+  R14 = 342,
+  R15 = 343,
+};
+
+enum class PDB_MemberAccess { Private = 1, Protected = 2, Public = 3 };
+
+struct VersionInfo {
+  uint32_t Major;
+  uint32_t Minor;
+  uint32_t Build;
+  uint32_t QFE;
+};
+
+enum PDB_VariantType {
+  Empty,
+  Unknown,
+  Int8,
+  Int16,
+  Int32,
+  Int64,
+  Single,
+  Double,
+  UInt8,
+  UInt16,
+  UInt32,
+  UInt64,
+  Bool,
+};
+
+struct Variant {
+  Variant()
+    : Type(PDB_VariantType::Empty) {
+  }
+
+  PDB_VariantType Type;
+  union {
+    bool Bool;
+    int8_t Int8;
+    int16_t Int16;
+    int32_t Int32;
+    int64_t Int64;
+    float Single;
+    double Double;
+    uint8_t UInt8;
+    uint16_t UInt16;
+    uint32_t UInt32;
+    uint64_t UInt64;
+    void* Pointer;
+  };
+};
+
+} // namespace llvm
+
+namespace std {
+template <> struct hash<llvm::PDB_SymType> {
+  typedef llvm::PDB_SymType argument_type;
+  typedef std::size_t result_type;
+
+  result_type operator()(const argument_type &Arg) const {
+    return std::hash<int>()(static_cast<int>(Arg));
+  }
+};
+}
+
+#endif
diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
index b9c0b61..17de5c7 100644
--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -138,10 +138,17 @@ protected:
   /// getMemoryforGV - Allocate memory for a global variable.
   virtual char *getMemoryForGV(const GlobalVariable *GV);
 
-  static ExecutionEngine *(*MCJITCtor)(std::unique_ptr<Module> M,
-                                       std::string *ErrorStr,
-                                       RTDyldMemoryManager *MCJMM,
-                                       std::unique_ptr<TargetMachine> TM);
+  static ExecutionEngine *(*MCJITCtor)(
+                                     std::unique_ptr<Module> M,
+                                     std::string *ErrorStr,
+                                     std::unique_ptr<RTDyldMemoryManager> MCJMM,
+                                     std::unique_ptr<TargetMachine> TM);
+
+  static ExecutionEngine *(*OrcMCJITReplacementCtor)(
+                                    std::string *ErrorStr,
+                                    std::unique_ptr<RTDyldMemoryManager> OrcJMM,
+                                    std::unique_ptr<TargetMachine> TM);
+
   static ExecutionEngine *(*InterpCtor)(std::unique_ptr<Module> M,
                                         std::string *ErrorStr);
 
@@ -463,6 +470,7 @@ public:
   }
 
 protected:
+  ExecutionEngine() : EEState(*this) {}
   explicit ExecutionEngine(std::unique_ptr<Module> M);
 
   void emitGlobals();
@@ -492,7 +500,7 @@ private:
   EngineKind::Kind WhichEngine;
   std::string *ErrorStr;
   CodeGenOpt::Level OptLevel;
-  RTDyldMemoryManager *MCJMM;
+  std::unique_ptr<RTDyldMemoryManager> MCJMM;
   TargetOptions Options;
   Reloc::Model RelocModel;
   CodeModel::Model CMModel;
@@ -500,15 +508,20 @@ private:
   std::string MCPU;
   SmallVector<std::string, 4> MAttrs;
   bool VerifyModules;
+  bool UseOrcMCJITReplacement;
 
   /// InitEngine - Does the common initialization of default options.
   void InitEngine();
 
 public:
+  /// Default constructor for EngineBuilder.
+  EngineBuilder();
+
   /// Constructor for EngineBuilder.
-  EngineBuilder(std::unique_ptr<Module> M) : M(std::move(M)) {
-    InitEngine();
-  }
+  EngineBuilder(std::unique_ptr<Module> M);
+
+  // Out-of-line since we don't have the def'n of RTDyldMemoryManager here.
+  ~EngineBuilder();
 
   /// setEngineKind - Controls whether the user wants the interpreter, the JIT,
   /// or whichever engine works.  This option defaults to EngineKind::Either.
@@ -523,10 +536,7 @@ public:
   /// to create anything other than MCJIT will cause a runtime error. If create()
   /// is called and is successful, the created engine takes ownership of the
   /// memory manager. This option defaults to NULL.
-  EngineBuilder &setMCJITMemoryManager(RTDyldMemoryManager *mcjmm) {
-    MCJMM = mcjmm;
-    return *this;
-  }
+  EngineBuilder &setMCJITMemoryManager(std::unique_ptr<RTDyldMemoryManager> mcjmm);
 
   /// setErrorStr - Set the error string to write to on error.  This option
   /// defaults to NULL.
@@ -591,6 +601,11 @@ public:
     return *this;
   }
 
+  // \brief Use OrcMCJITReplacement instead of MCJIT. Off by default.
+  void setUseOrcMCJITReplacement(bool UseOrcMCJITReplacement) {
+    this->UseOrcMCJITReplacement = UseOrcMCJITReplacement;
+  }
+
   TargetMachine *selectTarget();
 
   /// selectTarget - Pick a target either via -march or by guessing the native
diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h
index cef3aa2..c3edec8 100644
--- a/include/llvm/ExecutionEngine/JITEventListener.h
+++ b/include/llvm/ExecutionEngine/JITEventListener.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_EXECUTIONENGINE_JITEVENTLISTENER_H
 #define LLVM_EXECUTIONENGINE_JITEVENTLISTENER_H
 
+#include "RuntimeDyld.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/Support/DataTypes.h"
@@ -25,7 +26,10 @@ class Function;
 class MachineFunction;
 class OProfileWrapper;
 class IntelJITEventsWrapper;
-class ObjectImage;
+
+namespace object {
+  class ObjectFile;
+}
 
 /// JITEvent_EmittedFunctionDetails - Helper struct for containing information
 /// about a generated machine code function.
@@ -57,7 +61,7 @@ public:
 
 public:
   JITEventListener() {}
-  virtual ~JITEventListener();
+  virtual ~JITEventListener() {}
 
   /// NotifyObjectEmitted - Called after an object has been successfully
   /// emitted to memory.  NotifyFunctionEmitted will not be called for
@@ -67,11 +71,15 @@ public:
   /// The ObjectImage contains the generated object image
   /// with section headers updated to reflect the address at which sections
   /// were loaded and with relocations performed in-place on debug sections.
-  virtual void NotifyObjectEmitted(const ObjectImage &Obj) {}
+  virtual void NotifyObjectEmitted(const object::ObjectFile &Obj,
+                                   const RuntimeDyld::LoadedObjectInfo &L) {}
 
   /// NotifyFreeingObject - Called just before the memory associated with
   /// a previously emitted object is released.
-  virtual void NotifyFreeingObject(const ObjectImage &Obj) {}
+  virtual void NotifyFreeingObject(const object::ObjectFile &Obj) {}
+
+  // Get a pointe to the GDB debugger registration listener.
+  static JITEventListener *createGDBRegistrationListener();
 
 #if LLVM_USE_INTEL_JITEVENTS
   // Construct an IntelJITEventListener
@@ -105,7 +113,8 @@ public:
     return nullptr;
   }
 #endif // USE_OPROFILE
-
+private:
+  virtual void anchor();
 };
 
 } // end namespace llvm.
diff --git a/include/llvm/ExecutionEngine/ObjectBuffer.h b/include/llvm/ExecutionEngine/ObjectBuffer.h
deleted file mode 100644
index ee4820a..0000000
--- a/include/llvm/ExecutionEngine/ObjectBuffer.h
+++ /dev/null
@@ -1,76 +0,0 @@
-//===---- ObjectBuffer.h - Utility class to wrap object image memory -----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares a wrapper class to hold the memory into which an
-// object will be generated.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_OBJECTBUFFER_H
-#define LLVM_EXECUTIONENGINE_OBJECTBUFFER_H
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace llvm {
-
-/// This class acts as a container for the memory buffer used during generation
-/// and loading of executable objects using MCJIT and RuntimeDyld. The
-/// underlying memory for the object will be owned by the ObjectBuffer instance
-/// throughout its lifetime.
-class ObjectBuffer {
-  virtual void anchor();
-public:
-  ObjectBuffer() {}
-  ObjectBuffer(std::unique_ptr<MemoryBuffer> Buf) : Buffer(std::move(Buf)) {}
-  virtual ~ObjectBuffer() {}
-
-  MemoryBufferRef getMemBuffer() const { return Buffer->getMemBufferRef(); }
-
-  const char *getBufferStart() const { return Buffer->getBufferStart(); }
-  size_t getBufferSize() const { return Buffer->getBufferSize(); }
-  StringRef getBuffer() const { return Buffer->getBuffer(); }
-  StringRef getBufferIdentifier() const {
-    return Buffer->getBufferIdentifier();
-  }
-
-protected:
-  // The memory contained in an ObjectBuffer
-  std::unique_ptr<MemoryBuffer> Buffer;
-};
-
-/// This class encapsulates the SmallVector and raw_svector_ostream needed to
-/// generate an object using MC code emission while providing a common
-/// ObjectBuffer interface for access to the memory once the object has been
-/// generated.
-class ObjectBufferStream : public ObjectBuffer {
-  void anchor() override;
-public:
-  ObjectBufferStream() : OS(SV) {}
-  virtual ~ObjectBufferStream() {}
-
-  raw_ostream &getOStream() { return OS; }
-  void flush()
-  {
-    OS.flush();
-
-    // Make the data accessible via the ObjectBuffer::Buffer
-    Buffer =
-        MemoryBuffer::getMemBuffer(StringRef(SV.data(), SV.size()), "", false);
-  }
-
-protected:
-  SmallVector<char, 4096> SV; // Working buffer into which we JIT.
-  raw_svector_ostream     OS; // streaming wrapper
-};
-
-} // namespace llvm
-
-#endif
diff --git a/include/llvm/ExecutionEngine/ObjectImage.h b/include/llvm/ExecutionEngine/ObjectImage.h
deleted file mode 100644
index dc142bd..0000000
--- a/include/llvm/ExecutionEngine/ObjectImage.h
+++ /dev/null
@@ -1,76 +0,0 @@
-//===---- ObjectImage.h - Format independent executuable object image -----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares a file format independent ObjectImage class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_OBJECTIMAGE_H
-#define LLVM_EXECUTIONENGINE_OBJECTIMAGE_H
-
-#include "llvm/ExecutionEngine/ObjectBuffer.h"
-#include "llvm/Object/ObjectFile.h"
-
-namespace llvm {
-
-
-/// ObjectImage - A container class that represents an ObjectFile that has been
-/// or is in the process of being loaded into memory for execution.
-class ObjectImage {
-  ObjectImage() LLVM_DELETED_FUNCTION;
-  ObjectImage(const ObjectImage &other) LLVM_DELETED_FUNCTION;
-  virtual void anchor();
-
-protected:
-  std::unique_ptr<ObjectBuffer> Buffer;
-
-public:
-  ObjectImage(std::unique_ptr<ObjectBuffer> Input) : Buffer(std::move(Input)) {}
-  virtual ~ObjectImage() {}
-
-  virtual object::symbol_iterator begin_symbols() const = 0;
-  virtual object::symbol_iterator end_symbols() const = 0;
-  iterator_range<object::symbol_iterator> symbols() const {
-    return iterator_range<object::symbol_iterator>(begin_symbols(),
-                                                   end_symbols());
-  }
-
-  virtual object::section_iterator begin_sections() const = 0;
-  virtual object::section_iterator end_sections() const  = 0;
-  iterator_range<object::section_iterator> sections() const {
-    return iterator_range<object::section_iterator>(begin_sections(),
-                                                    end_sections());
-  }
-
-  virtual /* Triple::ArchType */ unsigned getArch() const = 0;
-
-  // Return the name associated with this ObjectImage.
-  // This is usually the name of the file or MemoryBuffer that the the
-  // ObjectBuffer was constructed from.
-  StringRef getImageName() const { return Buffer->getBufferIdentifier(); }
-
-  // Subclasses can override these methods to update the image with loaded
-  // addresses for sections and common symbols
-  virtual void updateSectionAddress(const object::SectionRef &Sec,
-                                    uint64_t Addr) = 0;
-  virtual void updateSymbolAddress(const object::SymbolRef &Sym,
-                                   uint64_t Addr) = 0;
-
-  virtual StringRef getData() const = 0;
-
-  virtual object::ObjectFile* getObjectFile() const = 0;
-
-  // Subclasses can override these methods to provide JIT debugging support
-  virtual void registerWithDebugger() = 0;
-  virtual void deregisterWithDebugger() = 0;
-};
-
-} // end namespace llvm
-
-#endif // LLVM_EXECUTIONENGINE_OBJECTIMAGE_H
diff --git a/include/llvm/ExecutionEngine/ObjectMemoryBuffer.h b/include/llvm/ExecutionEngine/ObjectMemoryBuffer.h
new file mode 100644
index 0000000..b075611
--- /dev/null
+++ b/include/llvm/ExecutionEngine/ObjectMemoryBuffer.h
@@ -0,0 +1,63 @@
+//===- ObjectMemoryBuffer.h - SmallVector-backed MemoryBuffrer  -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a wrapper class to hold the memory into which an
+// object will be generated.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_OBJECTMEMORYBUFFER_H
+#define LLVM_EXECUTIONENGINE_OBJECTMEMORYBUFFER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+/// \brief SmallVector-backed MemoryBuffer instance.
+///
+/// This class enables efficient construction of MemoryBuffers from SmallVector
+/// instances. This is useful for MCJIT and Orc, where object files are streamed
+/// into SmallVectors, then inspected using ObjectFile (which takes a
+/// MemoryBuffer).
+class ObjectMemoryBuffer : public MemoryBuffer {
+public:
+
+  /// \brief Construct an ObjectMemoryBuffer from the given SmallVector r-value.
+  ///
+  /// FIXME: It'd be nice for this to be a non-templated constructor taking a
+  /// SmallVectorImpl here instead of a templated one taking a SmallVector<N>,
+  /// but SmallVector's move-construction/assignment currently only take
+  /// SmallVectors. If/when that is fixed we can simplify this constructor and
+  /// the following one.
+  ObjectMemoryBuffer(SmallVectorImpl<char> &&SV)
+    : SV(std::move(SV)), BufferName("<in-memory object>") {
+    init(this->SV.begin(), this->SV.end(), false);
+  }
+
+  /// \brief Construct a named ObjectMemoryBuffer from the given SmallVector
+  ///        r-value and StringRef.
+  ObjectMemoryBuffer(SmallVectorImpl<char> &&SV, StringRef Name)
+    : SV(std::move(SV)), BufferName(Name) {
+    init(this->SV.begin(), this->SV.end(), false);
+  }
+
+  const char* getBufferIdentifier() const override { return BufferName.c_str(); }
+
+  BufferKind getBufferKind() const override { return MemoryBuffer_Malloc; }
+
+private:
+  SmallVector<char, 0> SV;
+  std::string BufferName;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/ExecutionEngine/Orc/CloneSubModule.h b/include/llvm/ExecutionEngine/Orc/CloneSubModule.h
new file mode 100644
index 0000000..1bd3955
--- /dev/null
+++ b/include/llvm/ExecutionEngine/Orc/CloneSubModule.h
@@ -0,0 +1,60 @@
+//===-- CloneSubModule.h - Utilities for extracting sub-modules -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains utilities for extracting sub-modules. Useful for breaking up modules
+// for lazy jitting.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_CLONESUBMODULE_H
+#define LLVM_EXECUTIONENGINE_ORC_CLONESUBMODULE_H
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <functional>
+
+namespace llvm {
+
+class Function;
+class GlobalVariable;
+class Module;
+
+namespace orc {
+
+/// @brief Functor type for describing how CloneSubModule should mutate a
+///        GlobalVariable.
+typedef std::function<void(GlobalVariable &, const GlobalVariable &,
+                           ValueToValueMapTy &)> HandleGlobalVariableFtor;
+
+/// @brief Functor type for describing how CloneSubModule should mutate a
+///        Function.
+typedef std::function<void(Function &, const Function &, ValueToValueMapTy &)>
+    HandleFunctionFtor;
+
+/// @brief Copies the initializer from Orig to New.
+///
+///   Type is suitable for implicit conversion to a HandleGlobalVariableFtor.
+void copyGVInitializer(GlobalVariable &New, const GlobalVariable &Orig,
+                       ValueToValueMapTy &VMap);
+
+/// @brief Copies the body of Orig to New.
+///
+///   Type is suitable for implicit conversion to a HandleFunctionFtor.
+void copyFunctionBody(Function &New, const Function &Orig,
+                      ValueToValueMapTy &VMap);
+
+/// @brief Clone a subset of the module Src into Dst.
+void CloneSubModule(Module &Dst, const Module &Src,
+                    HandleGlobalVariableFtor HandleGlobalVariable,
+                    HandleFunctionFtor HandleFunction, bool KeepInlineAsm);
+
+} // End namespace orc.
+} // End namespace llvm.
+
+#endif // LLVM_EXECUTIONENGINE_ORC_CLONESUBMODULE_H
diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
new file mode 100644
index 0000000..0e218e2
--- /dev/null
+++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
@@ -0,0 +1,355 @@
+//===- CompileOnDemandLayer.h - Compile each function on demand -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// JIT layer for breaking up modules and inserting callbacks to allow
+// individual functions to be compiled on demand.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_COMPILEONDEMANDLAYER_H
+#define LLVM_EXECUTIONENGINE_ORC_COMPILEONDEMANDLAYER_H
+
+#include "IndirectionUtils.h"
+#include "LookasideRTDyldMM.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include <list>
+
+namespace llvm {
+namespace orc {
+
+/// @brief Compile-on-demand layer.
+///
+///   Modules added to this layer have their calls indirected, and are then
+/// broken up into a set of single-function modules, each of which is added
+/// to the layer below in a singleton set. The lower layer can be any layer that
+/// accepts IR module sets.
+///
+/// It is expected that this layer will frequently be used on top of a
+/// LazyEmittingLayer. The combination of the two ensures that each function is
+/// compiled only when it is first called.
+template <typename BaseLayerT, typename CompileCallbackMgrT>
+class CompileOnDemandLayer {
+public:
+  /// @brief Lookup helper that provides compatibility with the classic
+  ///        static-compilation symbol resolution process.
+  ///
+  ///   The CompileOnDemand (COD) layer splits modules up into multiple
+  /// sub-modules, each held in its own llvm::Module instance, in order to
+  /// support lazy compilation. When a module that contains private symbols is
+  /// broken up symbol linkage changes may be required to enable access to
+  /// "private" data that now resides in a different llvm::Module instance. To
+  /// retain expected symbol resolution behavior for clients of the COD layer,
+  /// the CODScopedLookup class uses a two-tiered lookup system to resolve
+  /// symbols. Lookup first scans sibling modules that were split from the same
+  /// original module (logical-module scoped lookup), then scans all other
+  /// modules that have been added to the lookup scope (logical-dylib scoped
+  /// lookup).
+  class CODScopedLookup {
+  private:
+    typedef typename BaseLayerT::ModuleSetHandleT BaseLayerModuleSetHandleT;
+    typedef std::vector<BaseLayerModuleSetHandleT> SiblingHandlesList;
+    typedef std::list<SiblingHandlesList> PseudoDylibModuleSetHandlesList;
+
+  public:
+    /// @brief Handle for a logical module.
+    typedef typename PseudoDylibModuleSetHandlesList::iterator LMHandle;
+
+    /// @brief Construct a scoped lookup.
+    CODScopedLookup(BaseLayerT &BaseLayer) : BaseLayer(BaseLayer) {}
+
+    /// @brief Start a new context for a single logical module.
+    LMHandle createLogicalModule() {
+      Handles.push_back(SiblingHandlesList());
+      return std::prev(Handles.end());
+    }
+
+    /// @brief Add a concrete Module's handle to the given logical Module's
+    ///        lookup scope.
+    void addToLogicalModule(LMHandle LMH, BaseLayerModuleSetHandleT H) {
+      LMH->push_back(H);
+    }
+
+    /// @brief Remove a logical Module from the CODScopedLookup entirely.
+    void removeLogicalModule(LMHandle LMH) { Handles.erase(LMH); }
+
+    /// @brief Look up a symbol in this context.
+    JITSymbol findSymbol(LMHandle LMH, const std::string &Name) {
+      if (auto Symbol = findSymbolIn(LMH, Name))
+        return Symbol;
+
+      for (auto I = Handles.begin(), E = Handles.end(); I != E; ++I)
+        if (I != LMH)
+          if (auto Symbol = findSymbolIn(I, Name))
+            return Symbol;
+
+      return nullptr;
+    }
+
+  private:
+
+    JITSymbol findSymbolIn(LMHandle LMH, const std::string &Name) {
+      for (auto H : *LMH)
+        if (auto Symbol = BaseLayer.findSymbolIn(H, Name, false))
+          return Symbol;
+      return nullptr;
+    }
+
+    BaseLayerT &BaseLayer;
+    PseudoDylibModuleSetHandlesList Handles;
+  };
+
+private:
+  typedef typename BaseLayerT::ModuleSetHandleT BaseLayerModuleSetHandleT;
+  typedef std::vector<BaseLayerModuleSetHandleT> BaseLayerModuleSetHandleListT;
+
+  struct ModuleSetInfo {
+    // Symbol lookup - just one for the whole module set.
+    std::shared_ptr<CODScopedLookup> Lookup;
+
+    // Logical module handles.
+    std::vector<typename CODScopedLookup::LMHandle> LMHandles;
+
+    // List of vectors of module set handles:
+    // One vector per logical module - each vector holds the handles for the
+    // exploded modules for that logical module in the base layer.
+    BaseLayerModuleSetHandleListT BaseLayerModuleSetHandles;
+
+    ModuleSetInfo(std::shared_ptr<CODScopedLookup> Lookup)
+        : Lookup(std::move(Lookup)) {}
+
+    void releaseResources(BaseLayerT &BaseLayer) {
+      for (auto LMH : LMHandles)
+        Lookup->removeLogicalModule(LMH);
+      for (auto H : BaseLayerModuleSetHandles)
+        BaseLayer.removeModuleSet(H);
+    }
+  };
+
+  typedef std::list<ModuleSetInfo> ModuleSetInfoListT;
+
+public:
+  /// @brief Handle to a set of loaded modules.
+  typedef typename ModuleSetInfoListT::iterator ModuleSetHandleT;
+
+  // @brief Fallback lookup functor.
+  typedef std::function<uint64_t(const std::string &)> LookupFtor;
+
+  /// @brief Construct a compile-on-demand layer instance.
+  CompileOnDemandLayer(BaseLayerT &BaseLayer, LLVMContext &Context)
+    : BaseLayer(BaseLayer),
+      CompileCallbackMgr(BaseLayer, Context, 0, 64) {}
+
+  /// @brief Add a module to the compile-on-demand layer.
+  template <typename ModuleSetT>
+  ModuleSetHandleT addModuleSet(ModuleSetT Ms,
+                                LookupFtor FallbackLookup = nullptr) {
+
+    // If the user didn't supply a fallback lookup then just use
+    // getSymbolAddress.
+    if (!FallbackLookup)
+      FallbackLookup = [=](const std::string &Name) {
+                         return findSymbol(Name, true).getAddress();
+                       };
+
+    // Create a lookup context and ModuleSetInfo for this module set.
+    // For the purposes of symbol resolution the set Ms will be treated as if
+    // the modules it contained had been linked together as a dylib.
+    auto DylibLookup = std::make_shared<CODScopedLookup>(BaseLayer);
+    ModuleSetHandleT H =
+        ModuleSetInfos.insert(ModuleSetInfos.end(), ModuleSetInfo(DylibLookup));
+    ModuleSetInfo &MSI = ModuleSetInfos.back();
+
+    // Process each of the modules in this module set.
+    for (auto &M : Ms)
+      partitionAndAdd(*M, MSI, FallbackLookup);
+
+    return H;
+  }
+
+  /// @brief Remove the module represented by the given handle.
+  ///
+  ///   This will remove all modules in the layers below that were derived from
+  /// the module represented by H.
+  void removeModuleSet(ModuleSetHandleT H) {
+    H->releaseResources(BaseLayer);
+    ModuleSetInfos.erase(H);
+  }
+
+  /// @brief Search for the given named symbol.
+  /// @param Name The name of the symbol to search for.
+  /// @param ExportedSymbolsOnly If true, search only for exported symbols.
+  /// @return A handle for the given named symbol, if it exists.
+  JITSymbol findSymbol(StringRef Name, bool ExportedSymbolsOnly) {
+    return BaseLayer.findSymbol(Name, ExportedSymbolsOnly);
+  }
+
+  /// @brief Get the address of a symbol provided by this layer, or some layer
+  ///        below this one.
+  JITSymbol findSymbolIn(ModuleSetHandleT H, const std::string &Name,
+                         bool ExportedSymbolsOnly) {
+    BaseLayerModuleSetHandleListT &BaseLayerHandles = H->second;
+    for (auto &BH : BaseLayerHandles) {
+      if (auto Symbol = BaseLayer.findSymbolIn(BH, Name, ExportedSymbolsOnly))
+        return Symbol;
+    }
+    return nullptr;
+  }
+
+private:
+
+  void partitionAndAdd(Module &M, ModuleSetInfo &MSI,
+                       LookupFtor FallbackLookup) {
+    const char *AddrSuffix = "$orc_addr";
+    const char *BodySuffix = "$orc_body";
+
+    // We're going to break M up into a bunch of sub-modules, but we want
+    // internal linkage symbols to still resolve sensibly. CODScopedLookup
+    // provides the "logical module" concept to make this work, so create a
+    // new logical module for M.
+    auto DylibLookup = MSI.Lookup;
+    auto LogicalModule = DylibLookup->createLogicalModule();
+    MSI.LMHandles.push_back(LogicalModule);
+
+    // Partition M into a "globals and stubs" module, a "common symbols" module,
+    // and a list of single-function modules.
+    auto PartitionedModule = fullyPartition(M);
+    auto StubsModule = std::move(PartitionedModule.GlobalVars);
+    auto CommonsModule = std::move(PartitionedModule.Commons);
+    auto FunctionModules = std::move(PartitionedModule.Functions);
+
+    // Emit the commons stright away.
+    auto CommonHandle = addModule(std::move(CommonsModule), MSI, LogicalModule,
+                                  FallbackLookup);
+    BaseLayer.emitAndFinalize(CommonHandle);
+
+    // Map of definition names to callback-info data structures. We'll use
+    // this to build the compile actions for the stubs below.
+    typedef std::map<std::string,
+                     typename CompileCallbackMgrT::CompileCallbackInfo>
+      StubInfoMap;
+    StubInfoMap StubInfos;
+
+    // Now we need to take each of the extracted Modules and add them to
+    // base layer. Each Module will be added individually to make sure they
+    // can be compiled separately, and each will get its own lookaside
+    // memory manager that will resolve within this logical module first.
+    for (auto &SubM : FunctionModules) {
+
+      // Keep track of the stubs we create for this module so that we can set
+      // their compile actions.
+      std::vector<typename StubInfoMap::iterator> NewStubInfos;
+
+      // Search for function definitions and insert stubs into the stubs
+      // module.
+      for (auto &F : *SubM) {
+        if (F.isDeclaration())
+          continue;
+
+        std::string Name = F.getName();
+        Function *Proto = StubsModule->getFunction(Name);
+        assert(Proto && "Failed to clone function decl into stubs module.");
+        auto CallbackInfo =
+          CompileCallbackMgr.getCompileCallback(*Proto->getFunctionType());
+        GlobalVariable *FunctionBodyPointer =
+          createImplPointer(*Proto, Name + AddrSuffix,
+                            CallbackInfo.getAddress());
+        makeStub(*Proto, *FunctionBodyPointer);
+
+        F.setName(Name + BodySuffix);
+        F.setVisibility(GlobalValue::HiddenVisibility);
+
+        auto KV = std::make_pair(std::move(Name), std::move(CallbackInfo));
+        NewStubInfos.push_back(StubInfos.insert(StubInfos.begin(), KV));
+      }
+
+      auto H = addModule(std::move(SubM), MSI, LogicalModule, FallbackLookup);
+
+      // Set the compile actions for this module:
+      for (auto &KVPair : NewStubInfos) {
+        std::string BodyName = Mangle(KVPair->first + BodySuffix,
+                                      *M.getDataLayout());
+        auto &CCInfo = KVPair->second;
+        CCInfo.setCompileAction(
+          [=](){
+            return BaseLayer.findSymbolIn(H, BodyName, false).getAddress();
+          });
+      }
+
+    }
+
+    // Ok - we've processed all the partitioned modules. Now add the
+    // stubs/globals module and set the update actions.
+    auto StubsH =
+      addModule(std::move(StubsModule), MSI, LogicalModule, FallbackLookup);
+
+    for (auto &KVPair : StubInfos) {
+      std::string AddrName = Mangle(KVPair.first + AddrSuffix,
+                                    *M.getDataLayout());
+      auto &CCInfo = KVPair.second;
+      CCInfo.setUpdateAction(
+        CompileCallbackMgr.getLocalFPUpdater(StubsH, AddrName));
+    }
+  }
+
+  // Add the given Module to the base layer using a memory manager that will
+  // perform the appropriate scoped lookup (i.e. will look first with in the
+  // module from which it was extracted, then into the set to which that module
+  // belonged, and finally externally).
+  BaseLayerModuleSetHandleT addModule(
+                               std::unique_ptr<Module> M,
+                               ModuleSetInfo &MSI,
+                               typename CODScopedLookup::LMHandle LogicalModule,
+                               LookupFtor FallbackLookup) {
+
+    // Add this module to the JIT with a memory manager that uses the
+    // DylibLookup to resolve symbols.
+    std::vector<std::unique_ptr<Module>> MSet;
+    MSet.push_back(std::move(M));
+
+    auto DylibLookup = MSI.Lookup;
+    auto MM =
+      createLookasideRTDyldMM<SectionMemoryManager>(
+        [=](const std::string &Name) {
+          if (auto Symbol = DylibLookup->findSymbol(LogicalModule, Name))
+            return Symbol.getAddress();
+          return FallbackLookup(Name);
+        },
+        [=](const std::string &Name) {
+          return DylibLookup->findSymbol(LogicalModule, Name).getAddress();
+        });
+
+    BaseLayerModuleSetHandleT H =
+      BaseLayer.addModuleSet(std::move(MSet), std::move(MM));
+    // Add this module to the logical module lookup.
+    DylibLookup->addToLogicalModule(LogicalModule, H);
+    MSI.BaseLayerModuleSetHandles.push_back(H);
+
+    return H;
+  }
+
+  static std::string Mangle(StringRef Name, const DataLayout &DL) {
+    Mangler M(&DL);
+    std::string MangledName;
+    {
+      raw_string_ostream MangledNameStream(MangledName);
+      M.getNameWithPrefix(MangledNameStream, Name);
+    }
+    return MangledName;
+  }
+
+  BaseLayerT &BaseLayer;
+  CompileCallbackMgrT CompileCallbackMgr;
+  ModuleSetInfoListT ModuleSetInfos;
+};
+
+} // End namespace orc.
+} // End namespace llvm.
+
+#endif // LLVM_EXECUTIONENGINE_ORC_COMPILEONDEMANDLAYER_H
diff --git a/include/llvm/ExecutionEngine/Orc/CompileUtils.h b/include/llvm/ExecutionEngine/Orc/CompileUtils.h
new file mode 100644
index 0000000..49a1fba
--- /dev/null
+++ b/include/llvm/ExecutionEngine/Orc/CompileUtils.h
@@ -0,0 +1,62 @@
+//===-- CompileUtils.h - Utilities for compiling IR in the JIT --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains utilities for compiling IR to object files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_COMPILEUTILS_H
+#define LLVM_EXECUTIONENGINE_ORC_COMPILEUTILS_H
+
+#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+namespace orc {
+
+/// @brief Simple compile functor: Takes a single IR module and returns an
+///        ObjectFile.
+class SimpleCompiler {
+public:
+  /// @brief Construct a simple compile functor with the given target.
+  SimpleCompiler(TargetMachine &TM) : TM(TM) {}
+
+  /// @brief Compile a Module to an ObjectFile.
+  object::OwningBinary<object::ObjectFile> operator()(Module &M) const {
+    SmallVector<char, 0> ObjBufferSV;
+    raw_svector_ostream ObjStream(ObjBufferSV);
+
+    legacy::PassManager PM;
+    MCContext *Ctx;
+    if (TM.addPassesToEmitMC(PM, Ctx, ObjStream))
+      llvm_unreachable("Target does not support MC emission.");
+    PM.run(M);
+    ObjStream.flush();
+    std::unique_ptr<MemoryBuffer> ObjBuffer(
+        new ObjectMemoryBuffer(std::move(ObjBufferSV)));
+    ErrorOr<std::unique_ptr<object::ObjectFile>> Obj =
+        object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef());
+    // TODO: Actually report errors helpfully.
+    typedef object::OwningBinary<object::ObjectFile> OwningObj;
+    if (Obj)
+      return OwningObj(std::move(*Obj), std::move(ObjBuffer));
+    return OwningObj(nullptr, nullptr);
+  }
+
+private:
+  TargetMachine &TM;
+};
+
+} // End namespace orc.
+} // End namespace llvm.
+
+#endif // LLVM_EXECUTIONENGINE_ORC_COMPILEUTILS_H
diff --git a/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h b/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
new file mode 100644
index 0000000..6a47622
--- /dev/null
+++ b/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
@@ -0,0 +1,146 @@
+//===------ IRCompileLayer.h -- Eagerly compile IR for JIT ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains the definition for a basic, eagerly compiling layer of the JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_IRCOMPILELAYER_H
+#define LLVM_EXECUTIONENGINE_ORC_IRCOMPILELAYER_H
+
+#include "JITSymbol.h"
+#include "llvm/ExecutionEngine/ObjectCache.h"
+#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
+#include "llvm/Object/ObjectFile.h"
+#include <memory>
+
+namespace llvm {
+namespace orc {
+
+/// @brief Eager IR compiling layer.
+///
+///   This layer accepts sets of LLVM IR Modules (via addModuleSet). It
+/// immediately compiles each IR module to an object file (each IR Module is
+/// compiled separately). The resulting set of object files is then added to
+/// the layer below, which must implement the object layer concept.
+template <typename BaseLayerT> class IRCompileLayer {
+public:
+  typedef std::function<object::OwningBinary<object::ObjectFile>(Module &)>
+      CompileFtor;
+
+private:
+  typedef typename BaseLayerT::ObjSetHandleT ObjSetHandleT;
+
+  typedef std::vector<std::unique_ptr<object::ObjectFile>> OwningObjectVec;
+  typedef std::vector<std::unique_ptr<MemoryBuffer>> OwningBufferVec;
+
+public:
+  /// @brief Handle to a set of compiled modules.
+  typedef ObjSetHandleT ModuleSetHandleT;
+
+  /// @brief Construct an IRCompileLayer with the given BaseLayer, which must
+  ///        implement the ObjectLayer concept.
+  IRCompileLayer(BaseLayerT &BaseLayer, CompileFtor Compile)
+      : BaseLayer(BaseLayer), Compile(std::move(Compile)), ObjCache(nullptr) {}
+
+  /// @brief Set an ObjectCache to query before compiling.
+  void setObjectCache(ObjectCache *NewCache) { ObjCache = NewCache; }
+
+  /// @brief Compile each module in the given module set, then then add the
+  ///        resulting set of objects to the base layer, along with the memory
+  //         manager MM.
+  ///
+  /// @return A handle for the added modules.
+  template <typename ModuleSetT>
+  ModuleSetHandleT addModuleSet(ModuleSetT Ms,
+                                std::unique_ptr<RTDyldMemoryManager> MM) {
+    OwningObjectVec Objects;
+    OwningBufferVec Buffers;
+
+    for (const auto &M : Ms) {
+      std::unique_ptr<object::ObjectFile> Object;
+      std::unique_ptr<MemoryBuffer> Buffer;
+
+      if (ObjCache)
+        std::tie(Object, Buffer) = tryToLoadFromObjectCache(*M).takeBinary();
+
+      if (!Object) {
+        std::tie(Object, Buffer) = Compile(*M).takeBinary();
+        if (ObjCache)
+          ObjCache->notifyObjectCompiled(&*M, Buffer->getMemBufferRef());
+      }
+
+      Objects.push_back(std::move(Object));
+      Buffers.push_back(std::move(Buffer));
+    }
+
+    ModuleSetHandleT H =
+      BaseLayer.addObjectSet(Objects, std::move(MM));
+
+    BaseLayer.takeOwnershipOfBuffers(H, std::move(Buffers));
+
+    return H;
+  }
+
+  /// @brief Remove the module set associated with the handle H.
+  void removeModuleSet(ModuleSetHandleT H) { BaseLayer.removeObjectSet(H); }
+
+  /// @brief Search for the given named symbol.
+  /// @param Name The name of the symbol to search for.
+  /// @param ExportedSymbolsOnly If true, search only for exported symbols.
+  /// @return A handle for the given named symbol, if it exists.
+  JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) {
+    return BaseLayer.findSymbol(Name, ExportedSymbolsOnly);
+  }
+
+  /// @brief Get the address of the given symbol in the context of the set of
+  ///        compiled modules represented by the handle H. This call is
+  ///        forwarded to the base layer's implementation.
+  /// @param H The handle for the module set to search in.
+  /// @param Name The name of the symbol to search for.
+  /// @param ExportedSymbolsOnly If true, search only for exported symbols.
+  /// @return A handle for the given named symbol, if it is found in the
+  ///         given module set.
+  JITSymbol findSymbolIn(ModuleSetHandleT H, const std::string &Name,
+                         bool ExportedSymbolsOnly) {
+    return BaseLayer.findSymbolIn(H, Name, ExportedSymbolsOnly);
+  }
+
+  /// @brief Immediately emit and finalize the moduleOB set represented by the
+  ///        given handle.
+  /// @param H Handle for module set to emit/finalize.
+  void emitAndFinalize(ModuleSetHandleT H) {
+    BaseLayer.emitAndFinalize(H);
+  }
+
+private:
+  object::OwningBinary<object::ObjectFile>
+  tryToLoadFromObjectCache(const Module &M) {
+    std::unique_ptr<MemoryBuffer> ObjBuffer = ObjCache->getObject(&M);
+    if (!ObjBuffer)
+      return object::OwningBinary<object::ObjectFile>();
+
+    ErrorOr<std::unique_ptr<object::ObjectFile>> Obj =
+        object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef());
+    if (!Obj)
+      return object::OwningBinary<object::ObjectFile>();
+
+    return object::OwningBinary<object::ObjectFile>(std::move(*Obj),
+                                                    std::move(ObjBuffer));
+  }
+
+  BaseLayerT &BaseLayer;
+  CompileFtor Compile;
+  ObjectCache *ObjCache;
+};
+
+} // End namespace orc.
+} // End namespace llvm.
+
+#endif // LLVM_EXECUTIONENGINE_ORC_IRCOMPILINGLAYER_H
diff --git a/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h
new file mode 100644
index 0000000..e9d3d34
--- /dev/null
+++ b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h
@@ -0,0 +1,246 @@
+//===-- IndirectionUtils.h - Utilities for adding indirections --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains utilities for adding indirections and breaking up modules.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_INDIRECTIONUTILS_H
+#define LLVM_EXECUTIONENGINE_ORC_INDIRECTIONUTILS_H
+
+#include "JITSymbol.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include <sstream>
+
+namespace llvm {
+namespace orc {
+
+/// @brief Base class for JITLayer independent aspects of
+///        JITCompileCallbackManager.
+template <typename TargetT>
+class JITCompileCallbackManagerBase {
+public:
+
+  /// @brief Construct a JITCompileCallbackManagerBase.
+  /// @param ErrorHandlerAddress The address of an error handler in the target
+  ///                            process to be used if a compile callback fails.
+  /// @param NumTrampolinesPerBlock Number of trampolines to emit if there is no
+  ///                             available trampoline when getCompileCallback is
+  ///                             called.
+  JITCompileCallbackManagerBase(TargetAddress ErrorHandlerAddress,
+                                unsigned NumTrampolinesPerBlock)
+    : ErrorHandlerAddress(ErrorHandlerAddress),
+      NumTrampolinesPerBlock(NumTrampolinesPerBlock) {}
+
+  /// @brief Execute the callback for the given trampoline id. Called by the JIT
+  ///        to compile functions on demand.
+  TargetAddress executeCompileCallback(TargetAddress TrampolineID) {
+    typename TrampolineMapT::iterator I = ActiveTrampolines.find(TrampolineID);
+    // FIXME: Also raise an error in the Orc error-handler when we finally have
+    //        one.
+    if (I == ActiveTrampolines.end())
+      return ErrorHandlerAddress;
+
+    // Found a callback handler. Yank this trampoline out of the active list and
+    // put it back in the available trampolines list, then try to run the
+    // handler's compile and update actions.
+    // Moving the trampoline ID back to the available list first means there's at
+    // least one available trampoline if the compile action triggers a request for
+    // a new one.
+    AvailableTrampolines.push_back(I->first - TargetT::CallSize);
+    auto CallbackHandler = std::move(I->second);
+    ActiveTrampolines.erase(I);
+
+    if (auto Addr = CallbackHandler.Compile()) {
+      CallbackHandler.Update(Addr);
+      return Addr;
+    }
+    return ErrorHandlerAddress;
+  }
+
+protected:
+
+  typedef std::function<TargetAddress()> CompileFtorT;
+  typedef std::function<void(TargetAddress)> UpdateFtorT;
+
+  struct CallbackHandler {
+    CompileFtorT Compile;
+    UpdateFtorT Update;
+  };
+
+  TargetAddress ErrorHandlerAddress;
+  unsigned NumTrampolinesPerBlock;
+
+  typedef std::map<TargetAddress, CallbackHandler> TrampolineMapT;
+  TrampolineMapT ActiveTrampolines;
+  std::vector<TargetAddress> AvailableTrampolines;
+};
+
+/// @brief Manage compile callbacks.
+template <typename JITLayerT, typename TargetT>
+class JITCompileCallbackManager :
+    public JITCompileCallbackManagerBase<TargetT> {
+public:
+
+  typedef typename JITCompileCallbackManagerBase<TargetT>::CompileFtorT
+    CompileFtorT;
+  typedef typename JITCompileCallbackManagerBase<TargetT>::UpdateFtorT
+    UpdateFtorT;
+
+  /// @brief Construct a JITCompileCallbackManager.
+  /// @param JIT JIT layer to emit callback trampolines, etc. into.
+  /// @param Context LLVMContext to use for trampoline & resolve block modules.
+  /// @param ErrorHandlerAddress The address of an error handler in the target
+  ///                            process to be used if a compile callback fails.
+  /// @param NumTrampolinesPerBlock Number of trampolines to allocate whenever
+  ///                               there is no existing callback trampoline.
+  ///                               (Trampolines are allocated in blocks for
+  ///                               efficiency.)
+  JITCompileCallbackManager(JITLayerT &JIT, LLVMContext &Context,
+                            TargetAddress ErrorHandlerAddress,
+                            unsigned NumTrampolinesPerBlock)
+    : JITCompileCallbackManagerBase<TargetT>(ErrorHandlerAddress,
+                                             NumTrampolinesPerBlock),
+      JIT(JIT) {
+    emitResolverBlock(Context);
+  }
+
+  /// @brief Handle to a newly created compile callback. Can be used to get an
+  ///        IR constant representing the address of the trampoline, and to set
+  ///        the compile and update actions for the callback.
+  class CompileCallbackInfo {
+  public:
+    CompileCallbackInfo(Constant *Addr, CompileFtorT &Compile,
+                        UpdateFtorT &Update)
+      : Addr(Addr), Compile(Compile), Update(Update) {}
+
+    Constant* getAddress() const { return Addr; }
+    void setCompileAction(CompileFtorT Compile) {
+      this->Compile = std::move(Compile);
+    }
+    void setUpdateAction(UpdateFtorT Update) {
+      this->Update = std::move(Update);
+    }
+  private:
+    Constant *Addr;
+    CompileFtorT &Compile;
+    UpdateFtorT &Update;
+  };
+
+  /// @brief Get/create a compile callback with the given signature.
+  CompileCallbackInfo getCompileCallback(FunctionType &FT) {
+    TargetAddress TrampolineAddr = getAvailableTrampolineAddr(FT.getContext());
+    auto &CallbackHandler =
+      this->ActiveTrampolines[TrampolineAddr + TargetT::CallSize];
+    Constant *AddrIntVal =
+      ConstantInt::get(Type::getInt64Ty(FT.getContext()), TrampolineAddr);
+    Constant *AddrPtrVal =
+      ConstantExpr::getCast(Instruction::IntToPtr, AddrIntVal,
+                            PointerType::get(&FT, 0));
+
+    return CompileCallbackInfo(AddrPtrVal, CallbackHandler.Compile,
+                               CallbackHandler.Update);
+  }
+
+  /// @brief Get a functor for updating the value of a named function pointer.
+  UpdateFtorT getLocalFPUpdater(typename JITLayerT::ModuleSetHandleT H,
+                                std::string Name) {
+    // FIXME: Move-capture Name once we can use C++14.
+    return [=](TargetAddress Addr) {
+      auto FPSym = JIT.findSymbolIn(H, Name, true);
+      assert(FPSym && "Cannot find function pointer to update.");
+      void *FPAddr = reinterpret_cast<void*>(
+                       static_cast<uintptr_t>(FPSym.getAddress()));
+      memcpy(FPAddr, &Addr, sizeof(uintptr_t));
+    };
+  }
+
+private:
+
+  std::vector<std::unique_ptr<Module>>
+  SingletonSet(std::unique_ptr<Module> M) {
+    std::vector<std::unique_ptr<Module>> Ms;
+    Ms.push_back(std::move(M));
+    return Ms;
+  }
+
+  void emitResolverBlock(LLVMContext &Context) {
+    std::unique_ptr<Module> M(new Module("resolver_block_module",
+                                         Context));
+    TargetT::insertResolverBlock(*M, *this);
+    auto H = JIT.addModuleSet(SingletonSet(std::move(M)), nullptr);
+    JIT.emitAndFinalize(H);
+    auto ResolverBlockSymbol =
+      JIT.findSymbolIn(H, TargetT::ResolverBlockName, false);
+    assert(ResolverBlockSymbol && "Failed to insert resolver block");
+    ResolverBlockAddr = ResolverBlockSymbol.getAddress();
+  }
+
+  TargetAddress getAvailableTrampolineAddr(LLVMContext &Context) {
+    if (this->AvailableTrampolines.empty())
+      grow(Context);
+    assert(!this->AvailableTrampolines.empty() &&
+           "Failed to grow available trampolines.");
+    TargetAddress TrampolineAddr = this->AvailableTrampolines.back();
+    this->AvailableTrampolines.pop_back();
+    return TrampolineAddr;
+  }
+
+  void grow(LLVMContext &Context) {
+    assert(this->AvailableTrampolines.empty() && "Growing prematurely?");
+    std::unique_ptr<Module> M(new Module("trampoline_block", Context));
+    auto GetLabelName =
+      TargetT::insertCompileCallbackTrampolines(*M, ResolverBlockAddr,
+                                                this->NumTrampolinesPerBlock,
+                                                this->ActiveTrampolines.size());
+    auto H = JIT.addModuleSet(SingletonSet(std::move(M)), nullptr);
+    JIT.emitAndFinalize(H);
+    for (unsigned I = 0; I < this->NumTrampolinesPerBlock; ++I) {
+      std::string Name = GetLabelName(I);
+      auto TrampolineSymbol = JIT.findSymbolIn(H, Name, false);
+      assert(TrampolineSymbol && "Failed to emit trampoline.");
+      this->AvailableTrampolines.push_back(TrampolineSymbol.getAddress());
+    }
+  }
+
+  JITLayerT &JIT;
+  TargetAddress ResolverBlockAddr;
+};
+
+GlobalVariable* createImplPointer(Function &F, const Twine &Name,
+                                  Constant *Initializer);
+
+void makeStub(Function &F, GlobalVariable &ImplPointer);
+
+typedef std::map<Module*, DenseSet<const GlobalValue*>> ModulePartitionMap;
+
+void partition(Module &M, const ModulePartitionMap &PMap);
+
+/// @brief Struct for trivial "complete" partitioning of a module.
+class FullyPartitionedModule {
+public:
+  std::unique_ptr<Module> GlobalVars;
+  std::unique_ptr<Module> Commons;
+  std::vector<std::unique_ptr<Module>> Functions;
+
+  FullyPartitionedModule() = default;
+  FullyPartitionedModule(FullyPartitionedModule &&S)
+      : GlobalVars(std::move(S.GlobalVars)), Commons(std::move(S.Commons)),
+        Functions(std::move(S.Functions)) {}
+};
+
+FullyPartitionedModule fullyPartition(Module &M);
+
+} // End namespace orc.
+} // End namespace llvm.
+
+#endif // LLVM_EXECUTIONENGINE_ORC_INDIRECTIONUTILS_H
diff --git a/include/llvm/ExecutionEngine/Orc/JITSymbol.h b/include/llvm/ExecutionEngine/Orc/JITSymbol.h
new file mode 100644
index 0000000..a670222
--- /dev/null
+++ b/include/llvm/ExecutionEngine/Orc/JITSymbol.h
@@ -0,0 +1,74 @@
+//===----------- JITSymbol.h - JIT symbol abstraction -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Abstraction for target process addresses.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_JITSYMBOL_H
+#define LLVM_EXECUTIONENGINE_ORC_JITSYMBOL_H
+
+#include "llvm/Support/DataTypes.h"
+#include <cassert>
+#include <functional>
+
+namespace llvm {
+namespace orc {
+
+/// @brief Represents an address in the target process's address space.
+typedef uint64_t TargetAddress;
+
+/// @brief Represents a symbol in the JIT.
+class JITSymbol {
+public:
+  typedef std::function<TargetAddress()> GetAddressFtor;
+
+  /// @brief Create a 'null' symbol that represents failure to find a symbol
+  ///        definition.
+  JITSymbol(std::nullptr_t) : CachedAddr(0) {}
+
+  /// @brief Create a symbol for a definition with a known address.
+  JITSymbol(TargetAddress Addr)
+    : CachedAddr(Addr) {}
+
+  /// @brief Create a symbol for a definition that doesn't have a known address
+  ///        yet.
+  /// @param GetAddress A functor to materialize a definition (fixing the
+  ///        address) on demand.
+  ///
+  ///   This constructor allows a JIT layer to provide a reference to a symbol
+  /// definition without actually materializing the definition up front. The
+  /// user can materialize the definition at any time by calling the getAddress
+  /// method.
+  JITSymbol(GetAddressFtor GetAddress)
+    : CachedAddr(0), GetAddress(std::move(GetAddress)) {}
+
+  /// @brief Returns true if the symbol exists, false otherwise.
+  explicit operator bool() const { return CachedAddr || GetAddress; }
+
+  /// @brief Get the address of the symbol in the target address space. Returns
+  ///        '0' if the symbol does not exist.
+  TargetAddress getAddress() {
+    if (GetAddress) {
+      CachedAddr = GetAddress();
+      assert(CachedAddr && "Symbol could not be materialized.");
+      GetAddress = nullptr;
+    }
+    return CachedAddr;
+  }
+
+private:
+  TargetAddress CachedAddr;
+  GetAddressFtor GetAddress;
+};
+
+} // End namespace orc.
+} // End namespace llvm.
+
+#endif // LLVM_EXECUTIONENGINE_ORC_JITSYMBOL_H
diff --git a/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h b/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h
new file mode 100644
index 0000000..2a94abe
--- /dev/null
+++ b/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h
@@ -0,0 +1,283 @@
+//===- LazyEmittingLayer.h - Lazily emit IR to lower JIT layers -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains the definition for a lazy-emitting layer for the JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_LAZYEMITTINGLAYER_H
+#define LLVM_EXECUTIONENGINE_ORC_LAZYEMITTINGLAYER_H
+
+#include "JITSymbol.h"
+#include "LookasideRTDyldMM.h"
+#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/ADT/StringMap.h"
+#include <list>
+
+namespace llvm {
+namespace orc {
+
+/// @brief Lazy-emitting IR layer.
+///
+///   This layer accepts sets of LLVM IR Modules (via addModuleSet), but does
+/// not immediately emit them the layer below. Instead, emissing to the base
+/// layer is deferred until the first time the client requests the address
+/// (via JITSymbol::getAddress) for a symbol contained in this layer.
+template <typename BaseLayerT> class LazyEmittingLayer {
+public:
+  typedef typename BaseLayerT::ModuleSetHandleT BaseLayerHandleT;
+
+private:
+  class EmissionDeferredSet {
+  public:
+    EmissionDeferredSet() : EmitState(NotEmitted) {}
+    virtual ~EmissionDeferredSet() {}
+
+    JITSymbol find(StringRef Name, bool ExportedSymbolsOnly, BaseLayerT &B) {
+      switch (EmitState) {
+      case NotEmitted:
+        if (provides(Name, ExportedSymbolsOnly)) {
+          // Create a std::string version of Name to capture here - the argument
+          // (a StringRef) may go away before the lambda is executed.
+          // FIXME: Use capture-init when we move to C++14. 
+          std::string PName = Name;
+          return JITSymbol(
+              [this, ExportedSymbolsOnly, PName, &B]() -> TargetAddress {
+                if (this->EmitState == Emitting)
+                  return 0;
+                else if (this->EmitState == NotEmitted) {
+                  this->EmitState = Emitting;
+                  Handle = this->emitToBaseLayer(B);
+                  this->EmitState = Emitted;
+                }
+                return B.findSymbolIn(Handle, PName, ExportedSymbolsOnly)
+                          .getAddress();
+              });
+        } else
+          return nullptr;
+      case Emitting:
+        // Calling "emit" can trigger external symbol lookup (e.g. to check for
+        // pre-existing definitions of common-symbol), but it will never find in
+        // this module that it would not have found already, so return null from
+        // here.
+        return nullptr;
+      case Emitted:
+        return B.findSymbolIn(Handle, Name, ExportedSymbolsOnly);
+      }
+      llvm_unreachable("Invalid emit-state.");
+    }
+
+    void removeModulesFromBaseLayer(BaseLayerT &BaseLayer) {
+      if (EmitState != NotEmitted)
+        BaseLayer.removeModuleSet(Handle);
+    }
+
+    void emitAndFinalize(BaseLayerT &BaseLayer) {
+      assert(EmitState != Emitting &&
+             "Cannot emitAndFinalize while already emitting");
+      if (EmitState == NotEmitted) {
+        EmitState = Emitting;
+        Handle = emitToBaseLayer(BaseLayer);
+        EmitState = Emitted;
+      }
+      BaseLayer.emitAndFinalize(Handle);
+    }
+
+    template <typename ModuleSetT>
+    static std::unique_ptr<EmissionDeferredSet>
+    create(BaseLayerT &B, ModuleSetT Ms,
+           std::unique_ptr<RTDyldMemoryManager> MM);
+
+  protected:
+    virtual bool provides(StringRef Name, bool ExportedSymbolsOnly) const = 0;
+    virtual BaseLayerHandleT emitToBaseLayer(BaseLayerT &BaseLayer) = 0;
+
+  private:
+    enum { NotEmitted, Emitting, Emitted } EmitState;
+    BaseLayerHandleT Handle;
+  };
+
+  template <typename ModuleSetT>
+  class EmissionDeferredSetImpl : public EmissionDeferredSet {
+  public:
+    EmissionDeferredSetImpl(ModuleSetT Ms,
+                            std::unique_ptr<RTDyldMemoryManager> MM)
+        : Ms(std::move(Ms)), MM(std::move(MM)) {}
+
+  protected:
+
+    BaseLayerHandleT emitToBaseLayer(BaseLayerT &BaseLayer) override {
+      // We don't need the mangled names set any more: Once we've emitted this
+      // to the base layer we'll just look for symbols there.
+      MangledNames.reset();
+      return BaseLayer.addModuleSet(std::move(Ms), std::move(MM));
+    }
+
+    bool provides(StringRef Name, bool ExportedSymbolsOnly) const override {
+      // FIXME: We could clean all this up if we had a way to reliably demangle
+      //        names: We could just demangle name and search, rather than
+      //        mangling everything else.
+
+      // If we have already built the mangled name set then just search it.
+      if (MangledNames) {
+        auto VI = MangledNames->find(Name);
+        if (VI == MangledNames->end())
+          return false;
+        return !ExportedSymbolsOnly || VI->second;
+      }
+
+      // If we haven't built the mangled name set yet, try to build it. As an
+      // optimization this will leave MangledNames set to nullptr if we find
+      // Name in the process of building the set.
+      buildMangledNames(Name, ExportedSymbolsOnly);
+      if (!MangledNames)
+        return true;
+      return false;
+    }
+
+  private:
+    // If the mangled name of the given GlobalValue matches the given search
+    // name (and its visibility conforms to the ExportedSymbolsOnly flag) then
+    // just return 'true'. Otherwise, add the mangled name to the Names map and
+    // return 'false'.
+    bool addGlobalValue(StringMap<bool> &Names, const GlobalValue &GV,
+                        const Mangler &Mang, StringRef SearchName,
+                        bool ExportedSymbolsOnly) const {
+      // Modules don't "provide" decls or common symbols.
+      if (GV.isDeclaration() || GV.hasCommonLinkage())
+        return false;
+
+      // Mangle the GV name.
+      std::string MangledName;
+      {
+        raw_string_ostream MangledNameStream(MangledName);
+        Mang.getNameWithPrefix(MangledNameStream, &GV, false);
+      }
+
+      // Check whether this is the name we were searching for, and if it is then
+      // bail out early.
+      if (MangledName == SearchName)
+        if (!ExportedSymbolsOnly || GV.hasDefaultVisibility())
+          return true;
+
+      // Otherwise add this to the map for later.
+      Names[MangledName] = GV.hasDefaultVisibility();
+      return false;
+    }
+
+    // Build the MangledNames map. Bails out early (with MangledNames left set
+    // to nullptr) if the given SearchName is found while building the map.
+    void buildMangledNames(StringRef SearchName,
+                           bool ExportedSymbolsOnly) const {
+      assert(!MangledNames && "Mangled names map already exists?");
+
+      auto Names = llvm::make_unique<StringMap<bool>>();
+
+      for (const auto &M : Ms) {
+        Mangler Mang(M->getDataLayout());
+
+        for (const auto &GV : M->globals())
+          if (addGlobalValue(*Names, GV, Mang, SearchName, ExportedSymbolsOnly))
+            return;
+
+        for (const auto &F : *M)
+          if (addGlobalValue(*Names, F, Mang, SearchName, ExportedSymbolsOnly))
+            return;
+      }
+
+      MangledNames = std::move(Names);
+    }
+
+    ModuleSetT Ms;
+    std::unique_ptr<RTDyldMemoryManager> MM;
+    mutable std::unique_ptr<StringMap<bool>> MangledNames;
+  };
+
+  typedef std::list<std::unique_ptr<EmissionDeferredSet>> ModuleSetListT;
+
+  BaseLayerT &BaseLayer;
+  ModuleSetListT ModuleSetList;
+
+public:
+  /// @brief Handle to a set of loaded modules.
+  typedef typename ModuleSetListT::iterator ModuleSetHandleT;
+
+  /// @brief Construct a lazy emitting layer.
+  LazyEmittingLayer(BaseLayerT &BaseLayer) : BaseLayer(BaseLayer) {}
+
+  /// @brief Add the given set of modules to the lazy emitting layer.
+  template <typename ModuleSetT>
+  ModuleSetHandleT addModuleSet(ModuleSetT Ms,
+                                std::unique_ptr<RTDyldMemoryManager> MM) {
+    return ModuleSetList.insert(
+        ModuleSetList.end(),
+        EmissionDeferredSet::create(BaseLayer, std::move(Ms), std::move(MM)));
+  }
+
+  /// @brief Remove the module set represented by the given handle.
+  ///
+  ///   This method will free the memory associated with the given module set,
+  /// both in this layer, and the base layer.
+  void removeModuleSet(ModuleSetHandleT H) {
+    (*H)->removeModulesFromBaseLayer(BaseLayer);
+    ModuleSetList.erase(H);
+  }
+
+  /// @brief Search for the given named symbol.
+  /// @param Name The name of the symbol to search for.
+  /// @param ExportedSymbolsOnly If true, search only for exported symbols.
+  /// @return A handle for the given named symbol, if it exists.
+  JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) {
+    // Look for the symbol among existing definitions.
+    if (auto Symbol = BaseLayer.findSymbol(Name, ExportedSymbolsOnly))
+      return Symbol;
+
+    // If not found then search the deferred sets. If any of these contain a
+    // definition of 'Name' then they will return a JITSymbol that will emit
+    // the corresponding module when the symbol address is requested.
+    for (auto &DeferredSet : ModuleSetList)
+      if (auto Symbol = DeferredSet->find(Name, ExportedSymbolsOnly, BaseLayer))
+        return Symbol;
+
+    // If no definition found anywhere return a null symbol.
+    return nullptr;
+  }
+
+  /// @brief Get the address of the given symbol in the context of the set of
+  ///        compiled modules represented by the handle H.
+  JITSymbol findSymbolIn(ModuleSetHandleT H, const std::string &Name,
+                         bool ExportedSymbolsOnly) {
+    return (*H)->find(Name, ExportedSymbolsOnly, BaseLayer);
+  }
+
+  /// @brief Immediately emit and finalize the moduleOB set represented by the
+  ///        given handle.
+  /// @param H Handle for module set to emit/finalize.
+  void emitAndFinalize(ModuleSetHandleT H) {
+    (*H)->emitAndFinalize(BaseLayer);
+  }
+
+};
+
+template <typename BaseLayerT>
+template <typename ModuleSetT>
+std::unique_ptr<typename LazyEmittingLayer<BaseLayerT>::EmissionDeferredSet>
+LazyEmittingLayer<BaseLayerT>::EmissionDeferredSet::create(
+    BaseLayerT &B, ModuleSetT Ms, std::unique_ptr<RTDyldMemoryManager> MM) {
+  return llvm::make_unique<EmissionDeferredSetImpl<ModuleSetT>>(std::move(Ms),
+                                                                std::move(MM));
+}
+
+} // End namespace orc.
+} // End namespace llvm.
+
+#endif // LLVM_EXECUTIONENGINE_ORC_LAZYEMITTINGLAYER_H
diff --git a/include/llvm/ExecutionEngine/Orc/LookasideRTDyldMM.h b/include/llvm/ExecutionEngine/Orc/LookasideRTDyldMM.h
new file mode 100644
index 0000000..4456404
--- /dev/null
+++ b/include/llvm/ExecutionEngine/Orc/LookasideRTDyldMM.h
@@ -0,0 +1,92 @@
+//===- LookasideRTDyldMM - Redirect symbol lookup via a functor -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//   Defines an adapter for RuntimeDyldMM that allows lookups for external
+// symbols to go via a functor.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_LOOKASIDERTDYLDMM_H
+#define LLVM_EXECUTIONENGINE_ORC_LOOKASIDERTDYLDMM_H
+
+#include "llvm/ADT/STLExtras.h"
+#include <memory>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+/// @brief Defines an adapter for RuntimeDyldMM that allows lookups for external
+///        symbols to go via a functor, before falling back to the lookup logic
+///        provided by the underlying RuntimeDyldMM instance.
+///
+///   This class is useful for redirecting symbol lookup back to various layers
+/// of a JIT component stack, e.g. to enable lazy module emission.
+///
+template <typename BaseRTDyldMM, typename ExternalLookupFtor,
+          typename DylibLookupFtor>
+class LookasideRTDyldMM : public BaseRTDyldMM {
+public:
+  /// @brief Create a LookasideRTDyldMM intance.
+  LookasideRTDyldMM(ExternalLookupFtor ExternalLookup,
+                    DylibLookupFtor DylibLookup)
+      : ExternalLookup(std::move(ExternalLookup)),
+        DylibLookup(std::move(DylibLookup)) {}
+
+  /// @brief Look up the given symbol address, first via the functor this
+  ///        instance was created with, then (if the symbol isn't found)
+  ///        via the underlying RuntimeDyldMM.
+  uint64_t getSymbolAddress(const std::string &Name) override {
+    if (uint64_t Addr = ExternalLookup(Name))
+      return Addr;
+    return BaseRTDyldMM::getSymbolAddress(Name);
+  }
+
+  uint64_t getSymbolAddressInLogicalDylib(const std::string &Name) override {
+    if (uint64_t Addr = DylibLookup(Name))
+      return Addr;
+    return BaseRTDyldMM::getSymbolAddressInLogicalDylib(Name);
+  };
+
+  /// @brief Get a reference to the ExternalLookup functor.
+  ExternalLookupFtor &getExternalLookup() { return ExternalLookup; }
+
+  /// @brief Get a const-reference to the ExternalLookup functor.
+  const ExternalLookupFtor &getExternalLookup() const { return ExternalLookup; }
+
+  /// @brief Get a reference to the DylibLookup functor.
+  DylibLookupFtor &getDylibLookup() { return DylibLookup; }
+
+  /// @brief Get a const-reference to the DylibLookup functor.
+  const DylibLookupFtor &getDylibLookup() const { return DylibLookup; }
+
+private:
+  ExternalLookupFtor ExternalLookup;
+  DylibLookupFtor DylibLookup;
+};
+
+/// @brief Create a LookasideRTDyldMM from a base memory manager type, an
+///        external lookup functor, and a dylib lookup functor.
+template <typename BaseRTDyldMM, typename ExternalLookupFtor,
+          typename DylibLookupFtor>
+std::unique_ptr<
+    LookasideRTDyldMM<BaseRTDyldMM, ExternalLookupFtor, DylibLookupFtor>>
+createLookasideRTDyldMM(ExternalLookupFtor &&ExternalLookup,
+                        DylibLookupFtor &&DylibLookup) {
+  typedef LookasideRTDyldMM<BaseRTDyldMM, ExternalLookupFtor, DylibLookupFtor>
+      ThisLookasideMM;
+  return llvm::make_unique<ThisLookasideMM>(
+      std::forward<ExternalLookupFtor>(ExternalLookup),
+      std::forward<DylibLookupFtor>(DylibLookup));
+}
+
+} // End namespace orc.
+} // End namespace llvm.
+
+#endif // LLVM_EXECUTIONENGINE_ORC_LOOKASIDERTDYLDMM_H
diff --git a/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
new file mode 100644
index 0000000..36af0fe
--- /dev/null
+++ b/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
@@ -0,0 +1,267 @@
+//===- ObjectLinkingLayer.h - Add object files to a JIT process -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains the definition for the object layer of the JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_OBJECTLINKINGLAYER_H
+#define LLVM_EXECUTIONENGINE_ORC_OBJECTLINKINGLAYER_H
+
+#include "JITSymbol.h"
+#include "LookasideRTDyldMM.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include <list>
+#include <memory>
+
+namespace llvm {
+namespace orc {
+
+class ObjectLinkingLayerBase {
+protected:
+
+  /// @brief Holds a set of objects to be allocated/linked as a unit in the JIT.
+  ///
+  /// An instance of this class will be created for each set of objects added
+  /// via JITObjectLayer::addObjectSet. Deleting the instance (via
+  /// removeObjectSet) frees its memory, removing all symbol definitions that
+  /// had been provided by this instance. Higher level layers are responsible
+  /// for taking any action required to handle the missing symbols.
+  class LinkedObjectSet {
+    LinkedObjectSet(const LinkedObjectSet&) = delete;
+    void operator=(const LinkedObjectSet&) = delete;
+  public:
+    LinkedObjectSet(std::unique_ptr<RTDyldMemoryManager> MM)
+        : MM(std::move(MM)), RTDyld(llvm::make_unique<RuntimeDyld>(&*this->MM)),
+          State(Raw) {}
+
+    // MSVC 2012 cannot infer a move constructor, so write it out longhand.
+    LinkedObjectSet(LinkedObjectSet &&O)
+        : MM(std::move(O.MM)), RTDyld(std::move(O.RTDyld)), State(O.State) {}
+
+    std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
+    addObject(const object::ObjectFile &Obj) {
+      return RTDyld->loadObject(Obj);
+    }
+
+    TargetAddress getSymbolAddress(StringRef Name, bool ExportedSymbolsOnly) {
+      if (ExportedSymbolsOnly)
+        return RTDyld->getExportedSymbolLoadAddress(Name);
+      return RTDyld->getSymbolLoadAddress(Name);
+    }
+
+    bool NeedsFinalization() const { return (State == Raw); }
+
+    void Finalize() {
+      State = Finalizing;
+      RTDyld->resolveRelocations();
+      RTDyld->registerEHFrames();
+      MM->finalizeMemory();
+      OwnedBuffers.clear();
+      State = Finalized;
+    }
+
+    void mapSectionAddress(const void *LocalAddress, TargetAddress TargetAddr) {
+      assert((State != Finalized) &&
+             "Attempting to remap sections for finalized objects.");
+      RTDyld->mapSectionAddress(LocalAddress, TargetAddr);
+    }
+
+    void takeOwnershipOfBuffer(std::unique_ptr<MemoryBuffer> B) {
+      OwnedBuffers.push_back(std::move(B));
+    }
+
+  private:
+    std::unique_ptr<RTDyldMemoryManager> MM;
+    std::unique_ptr<RuntimeDyld> RTDyld;
+    enum { Raw, Finalizing, Finalized } State;
+
+    // FIXME: This ownership hack only exists because RuntimeDyldELF still
+    //        wants to be able to inspect the original object when resolving
+    //        relocations. As soon as that can be fixed this should be removed.
+    std::vector<std::unique_ptr<MemoryBuffer>> OwnedBuffers;
+  };
+
+  typedef std::list<LinkedObjectSet> LinkedObjectSetListT;
+
+public:
+  /// @brief Handle to a set of loaded objects.
+  typedef LinkedObjectSetListT::iterator ObjSetHandleT;
+
+  // Ownership hack.
+  // FIXME: Remove this as soon as RuntimeDyldELF can apply relocations without
+  //        referencing the original object.
+  template <typename OwningMBSet>
+  void takeOwnershipOfBuffers(ObjSetHandleT H, OwningMBSet MBs) {
+    for (auto &MB : MBs)
+      H->takeOwnershipOfBuffer(std::move(MB));
+  }
+
+};
+
+/// @brief Default (no-op) action to perform when loading objects.
+class DoNothingOnNotifyLoaded {
+public:
+  template <typename ObjSetT, typename LoadResult>
+  void operator()(ObjectLinkingLayerBase::ObjSetHandleT, const ObjSetT &,
+                  const LoadResult &) {}
+};
+
+/// @brief Bare bones object linking layer.
+///
+///   This class is intended to be used as the base layer for a JIT. It allows
+/// object files to be loaded into memory, linked, and the addresses of their
+/// symbols queried. All objects added to this layer can see each other's
+/// symbols.
+template <typename NotifyLoadedFtor = DoNothingOnNotifyLoaded>
+class ObjectLinkingLayer : public ObjectLinkingLayerBase {
+public:
+
+  /// @brief LoadedObjectInfo list. Contains a list of owning pointers to
+  ///        RuntimeDyld::LoadedObjectInfo instances.
+  typedef std::vector<std::unique_ptr<RuntimeDyld::LoadedObjectInfo>>
+      LoadedObjInfoList;
+
+  /// @brief Functor to create RTDyldMemoryManager instances.
+  typedef std::function<std::unique_ptr<RTDyldMemoryManager>()> CreateRTDyldMMFtor;
+
+  /// @brief Functor for receiving finalization notifications.
+  typedef std::function<void(ObjSetHandleT)> NotifyFinalizedFtor;
+
+  /// @brief Construct an ObjectLinkingLayer with the given NotifyLoaded,
+  ///        NotifyFinalized and CreateMemoryManager functors.
+  ObjectLinkingLayer(
+      CreateRTDyldMMFtor CreateMemoryManager = CreateRTDyldMMFtor(),
+      NotifyLoadedFtor NotifyLoaded = NotifyLoadedFtor(),
+      NotifyFinalizedFtor NotifyFinalized = NotifyFinalizedFtor())
+      : NotifyLoaded(std::move(NotifyLoaded)),
+        NotifyFinalized(std::move(NotifyFinalized)),
+        CreateMemoryManager(std::move(CreateMemoryManager)) {}
+
+  /// @brief Add a set of objects (or archives) that will be treated as a unit
+  ///        for the purposes of symbol lookup and memory management.
+  ///
+  /// @return A pair containing (1) A handle that can be used to free the memory
+  ///         allocated for the objects, and (2) a LoadedObjInfoList containing
+  ///         one LoadedObjInfo instance for each object at the corresponding
+  ///         index in the Objects list.
+  ///
+  ///   This version of this method allows the client to pass in an
+  /// RTDyldMemoryManager instance that will be used to allocate memory and look
+  /// up external symbol addresses for the given objects.
+  template <typename ObjSetT>
+  ObjSetHandleT addObjectSet(const ObjSetT &Objects,
+                             std::unique_ptr<RTDyldMemoryManager> MM) {
+
+    if (!MM) {
+      assert(CreateMemoryManager &&
+             "No memory manager or memory manager creator provided.");
+      MM = CreateMemoryManager();
+    }
+
+    ObjSetHandleT Handle = LinkedObjSetList.insert(
+        LinkedObjSetList.end(), LinkedObjectSet(std::move(MM)));
+    LinkedObjectSet &LOS = *Handle;
+    LoadedObjInfoList LoadedObjInfos;
+
+    for (auto &Obj : Objects)
+      LoadedObjInfos.push_back(LOS.addObject(*Obj));
+
+    NotifyLoaded(Handle, Objects, LoadedObjInfos);
+
+    return Handle;
+  }
+
+  /// @brief Remove the set of objects associated with handle H.
+  ///
+  ///   All memory allocated for the objects will be freed, and the sections and
+  /// symbols they provided will no longer be available. No attempt is made to
+  /// re-emit the missing symbols, and any use of these symbols (directly or
+  /// indirectly) will result in undefined behavior. If dependence tracking is
+  /// required to detect or resolve such issues it should be added at a higher
+  /// layer.
+  void removeObjectSet(ObjSetHandleT H) {
+    // How do we invalidate the symbols in H?
+    LinkedObjSetList.erase(H);
+  }
+
+  /// @brief Search for the given named symbol.
+  /// @param Name The name of the symbol to search for.
+  /// @param ExportedSymbolsOnly If true, search only for exported symbols.
+  /// @return A handle for the given named symbol, if it exists.
+  JITSymbol findSymbol(StringRef Name, bool ExportedSymbolsOnly) {
+    for (auto I = LinkedObjSetList.begin(), E = LinkedObjSetList.end(); I != E;
+         ++I)
+      if (auto Symbol = findSymbolIn(I, Name, ExportedSymbolsOnly))
+        return Symbol;
+
+    return nullptr;
+  }
+
+  /// @brief Search for the given named symbol in the context of the set of
+  ///        loaded objects represented by the handle H.
+  /// @param H The handle for the object set to search in.
+  /// @param Name The name of the symbol to search for.
+  /// @param ExportedSymbolsOnly If true, search only for exported symbols.
+  /// @return A handle for the given named symbol, if it is found in the
+  ///         given object set.
+  JITSymbol findSymbolIn(ObjSetHandleT H, StringRef Name,
+                         bool ExportedSymbolsOnly) {
+    if (auto Addr = H->getSymbolAddress(Name, ExportedSymbolsOnly)) {
+      if (!H->NeedsFinalization()) {
+        // If this instance has already been finalized then we can just return
+        // the address.
+        return JITSymbol(Addr);
+      } else {
+        // If this instance needs finalization return a functor that will do it.
+        // The functor still needs to double-check whether finalization is
+        // required, in case someone else finalizes this set before the functor
+        // is called.
+        return JITSymbol(
+          [this, Addr, H]() {
+            if (H->NeedsFinalization()) {
+              H->Finalize();
+              if (NotifyFinalized)
+                NotifyFinalized(H);
+            }
+            return Addr;
+          });
+      }
+    }
+
+    return nullptr;
+  }
+
+  /// @brief Map section addresses for the objects associated with the handle H.
+  void mapSectionAddress(ObjSetHandleT H, const void *LocalAddress,
+                         TargetAddress TargetAddr) {
+    H->mapSectionAddress(LocalAddress, TargetAddr);
+  }
+
+  /// @brief Immediately emit and finalize the object set represented by the
+  ///        given handle.
+  /// @param H Handle for object set to emit/finalize.
+  void emitAndFinalize(ObjSetHandleT H) {
+    H->Finalize();
+    if (NotifyFinalized)
+      NotifyFinalized(H);
+  }
+
+private:
+  LinkedObjectSetListT LinkedObjSetList;
+  NotifyLoadedFtor NotifyLoaded;
+  NotifyFinalizedFtor NotifyFinalized;
+  CreateRTDyldMMFtor CreateMemoryManager;
+};
+
+} // End namespace orc.
+} // End namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_OBJECTLINKINGLAYER_H
diff --git a/include/llvm/ExecutionEngine/Orc/OrcTargetSupport.h b/include/llvm/ExecutionEngine/Orc/OrcTargetSupport.h
new file mode 100644
index 0000000..c6f866a
--- /dev/null
+++ b/include/llvm/ExecutionEngine/Orc/OrcTargetSupport.h
@@ -0,0 +1,56 @@
+//===-- OrcTargetSupport.h - Code to support specific targets  --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Target specific code for Orc, e.g. callback assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_ORCTARGETSUPPORT_H
+#define LLVM_EXECUTIONENGINE_ORC_ORCTARGETSUPPORT_H
+
+#include "IndirectionUtils.h"
+
+namespace llvm {
+namespace orc {
+
+class OrcX86_64 {
+public:
+  static const char *ResolverBlockName;
+
+  /// @brief Insert module-level inline callback asm into module M for the
+  /// symbols managed by JITResolveCallbackHandler J.
+  static void insertResolverBlock(
+                                 Module &M,
+                                 JITCompileCallbackManagerBase<OrcX86_64> &JCBM);
+
+  /// @brief Get a label name from the given index.
+  typedef std::function<std::string(unsigned)> LabelNameFtor;
+
+  static const unsigned CallSize = 6;
+
+  /// @brief Insert the requested number of trampolines into the given module.
+  /// @param M Module to insert the call block into.
+  /// @param NumCalls Number of calls to create in the call block.
+  /// @param StartIndex Optional argument specifying the index suffix to start
+  ///                   with.
+  /// @return A functor that provides the symbol name for each entry in the call
+  ///         block.
+  ///
+  static LabelNameFtor insertCompileCallbackTrampolines(
+                                                    Module &M,
+                                                    TargetAddress TrampolineAddr,
+                                                    unsigned NumCalls,
+                                                    unsigned StartIndex = 0);
+
+};
+
+} // End namespace orc.
+} // End namespace llvm.
+
+#endif // LLVM_EXECUTIONENGINE_ORC_ORCTARGETSUPPORT_H
diff --git a/include/llvm/ExecutionEngine/OrcMCJITReplacement.h b/include/llvm/ExecutionEngine/OrcMCJITReplacement.h
new file mode 100644
index 0000000..4cd5648
--- /dev/null
+++ b/include/llvm/ExecutionEngine/OrcMCJITReplacement.h
@@ -0,0 +1,38 @@
+//===---- OrcMCJITReplacement.h - Orc-based MCJIT replacement ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file forces OrcMCJITReplacement to link in on certain operating systems.
+// (Windows).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORCMCJITREPLACEMENT_H
+#define LLVM_EXECUTIONENGINE_ORCMCJITREPLACEMENT_H
+
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include <cstdlib>
+
+extern "C" void LLVMLinkInOrcMCJITReplacement();
+
+namespace {
+  struct ForceOrcMCJITReplacementLinking {
+    ForceOrcMCJITReplacementLinking() {
+      // We must reference OrcMCJITReplacement in such a way that compilers will
+      // not delete it all as dead code, even with whole program optimization,
+      // yet is effectively a NO-OP. As the compiler isn't smart enough to know
+      // that getenv() never returns -1, this will do the job.
+      if (std::getenv("bar") != (char*) -1)
+        return;
+
+      LLVMLinkInOrcMCJITReplacement();
+    }
+  } ForceOrcMCJITReplacementLinking;
+}
+
+#endif
diff --git a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
index b941efc..792a499 100644
--- a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
+++ b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
@@ -22,7 +22,10 @@
 namespace llvm {
 
 class ExecutionEngine;
-class ObjectImage;
+
+  namespace object {
+    class ObjectFile;
+  }
 
 // RuntimeDyld clients often want to handle the memory management of
 // what gets placed where. For JIT clients, this is the subset of
@@ -31,8 +34,8 @@ class ObjectImage;
 // FIXME: As the RuntimeDyld fills out, additional routines will be needed
 //        for the varying types of objects to be allocated.
 class RTDyldMemoryManager {
-  RTDyldMemoryManager(const RTDyldMemoryManager&) LLVM_DELETED_FUNCTION;
-  void operator=(const RTDyldMemoryManager&) LLVM_DELETED_FUNCTION;
+  RTDyldMemoryManager(const RTDyldMemoryManager&) = delete;
+  void operator=(const RTDyldMemoryManager&) = delete;
 public:
   RTDyldMemoryManager() {}
   virtual ~RTDyldMemoryManager();
@@ -86,6 +89,27 @@ public:
     return getSymbolAddressInProcess(Name);
   }
 
+  /// This method returns the address of the specified symbol if it exists
+  /// within the logical dynamic library represented by this
+  /// RTDyldMemoryManager. Unlike getSymbolAddress, queries through this
+  /// interface should return addresses for hidden symbols.
+  ///
+  /// This is of particular importance for the Orc JIT APIs, which support lazy
+  /// compilation by breaking up modules: Each of those broken out modules
+  /// must be able to resolve hidden symbols provided by the others. Clients
+  /// writing memory managers for MCJIT can usually ignore this method.
+  ///
+  /// This method will be queried by RuntimeDyld when checking for previous
+  /// definitions of common symbols. It will *not* be queried by default when
+  /// resolving external symbols (this minimises the link-time overhead for
+  /// MCJIT clients who don't care about Orc features). If you are writing a
+  /// RTDyldMemoryManager for Orc and want "external" symbol resolution to
+  /// search the logical dylib, you should override your getSymbolAddress
+  /// method call this method directly.
+  virtual uint64_t getSymbolAddressInLogicalDylib(const std::string &Name) {
+    return 0;
+  }
+
   /// This method returns the address of the specified function. As such it is
   /// only useful for resolving library symbols, not code generated symbols.
   ///
@@ -109,7 +133,7 @@ public:
   /// address space can use this call to remap the section addresses for the
   /// newly loaded object.
   virtual void notifyObjectLoaded(ExecutionEngine *EE,
-                                  const ObjectImage *) {}
+                                  const object::ObjectFile &) {}
 
   /// This method is called when object loading is complete and section page
   /// permissions can be applied.  It is up to the memory manager implementation
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index 3605b9e..08cfa39 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -15,25 +15,25 @@
 #define LLVM_EXECUTIONENGINE_RUNTIMEDYLD_H
 
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ExecutionEngine/ObjectBuffer.h"
 #include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
 #include "llvm/Support/Memory.h"
+#include <memory>
 
 namespace llvm {
 
 namespace object {
   class ObjectFile;
+  template <typename T> class OwningBinary;
 }
 
 class RuntimeDyldImpl;
 class RuntimeDyldCheckerImpl;
-class ObjectImage;
 
 class RuntimeDyld {
   friend class RuntimeDyldCheckerImpl;
 
-  RuntimeDyld(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
-  void operator=(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
+  RuntimeDyld(const RuntimeDyld &) = delete;
+  void operator=(const RuntimeDyld &) = delete;
 
   // RuntimeDyldImpl is the actual class. RuntimeDyld is just the public
   // interface.
@@ -46,32 +46,49 @@ protected:
   // Any relocations already associated with the symbol will be re-resolved.
   void reassignSectionAddress(unsigned SectionID, uint64_t Addr);
 public:
+
+  /// \brief Information about the loaded object.
+  class LoadedObjectInfo {
+    friend class RuntimeDyldImpl;
+  public:
+    LoadedObjectInfo(RuntimeDyldImpl &RTDyld, unsigned BeginIdx,
+                     unsigned EndIdx)
+      : RTDyld(RTDyld), BeginIdx(BeginIdx), EndIdx(EndIdx) { }
+
+    virtual ~LoadedObjectInfo() {}
+
+    virtual object::OwningBinary<object::ObjectFile>
+    getObjectForDebug(const object::ObjectFile &Obj) const = 0;
+
+    uint64_t getSectionLoadAddress(StringRef Name) const;
+
+  protected:
+    virtual void anchor();
+
+    RuntimeDyldImpl &RTDyld;
+    unsigned BeginIdx, EndIdx;
+  };
+
   RuntimeDyld(RTDyldMemoryManager *);
   ~RuntimeDyld();
 
-  /// Prepare the object contained in the input buffer for execution.
-  /// Ownership of the input buffer is transferred to the ObjectImage
-  /// instance returned from this function if successful. In the case of load
-  /// failure, the input buffer will be deleted.
-  std::unique_ptr<ObjectImage>
-  loadObject(std::unique_ptr<ObjectBuffer> InputBuffer);
-
-  /// Prepare the referenced object file for execution.
-  /// Ownership of the input object is transferred to the ObjectImage
-  /// instance returned from this function if successful. In the case of load
-  /// failure, the input object will be deleted.
-  std::unique_ptr<ObjectImage>
-  loadObject(std::unique_ptr<object::ObjectFile> InputObject);
+  /// Add the referenced object file to the list of objects to be loaded and
+  /// relocated.
+  std::unique_ptr<LoadedObjectInfo> loadObject(const object::ObjectFile &O);
 
   /// Get the address of our local copy of the symbol. This may or may not
   /// be the address used for relocation (clients can copy the data around
   /// and resolve relocatons based on where they put it).
   void *getSymbolAddress(StringRef Name) const;
 
-  /// Get the address of the target copy of the symbol. This is the address
-  /// used for relocation.
+  /// Get the address of the target copy of the symbol (works for both exported
+  /// and non-exported symbols). This is the address used for relocation.
   uint64_t getSymbolLoadAddress(StringRef Name) const;
 
+  /// Get the address of the target copy of the symbol (works for exported
+  /// symbols only). This is the address used for relocation.
+  uint64_t getExportedSymbolLoadAddress(StringRef Name) const;
+
   /// Resolve the relocations for all symbols we currently know about.
   void resolveRelocations();
 
diff --git a/include/llvm/ExecutionEngine/RuntimeDyldChecker.h b/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
index 35ceba2..23936a6 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
@@ -52,6 +52,7 @@ class raw_ostream;
 ///
 /// ident_expr = 'decode_operand' '(' symbol ',' operand-index ')'
 ///            | 'next_pc'        '(' symbol ')'
+///            | 'stub_addr' '(' file-name ',' section-name ',' symbol ')'
 ///            | symbol
 ///
 /// binary_expr = expr '+' expr
diff --git a/include/llvm/ExecutionEngine/SectionMemoryManager.h b/include/llvm/ExecutionEngine/SectionMemoryManager.h
index 1368563..b825aff 100644
--- a/include/llvm/ExecutionEngine/SectionMemoryManager.h
+++ b/include/llvm/ExecutionEngine/SectionMemoryManager.h
@@ -35,8 +35,8 @@ namespace llvm {
 /// MCJIT::finalizeObject or by calling SectionMemoryManager::finalizeMemory
 /// directly.  Clients of MCJIT should call MCJIT::finalizeObject.
 class SectionMemoryManager : public RTDyldMemoryManager {
-  SectionMemoryManager(const SectionMemoryManager&) LLVM_DELETED_FUNCTION;
-  void operator=(const SectionMemoryManager&) LLVM_DELETED_FUNCTION;
+  SectionMemoryManager(const SectionMemoryManager&) = delete;
+  void operator=(const SectionMemoryManager&) = delete;
 
 public:
   SectionMemoryManager() { }
diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h
index 5ff48d6..443892b 100644
--- a/include/llvm/IR/Attributes.h
+++ b/include/llvm/IR/Attributes.h
@@ -282,6 +282,11 @@ public:
   AttributeSet removeAttributes(LLVMContext &C, unsigned Index, 
                                 AttributeSet Attrs) const;
 
+  /// \brief Add the dereferenceable attribute to the attribute set at the given
+  /// index. Since attribute sets are immutable, this returns a new set.
+  AttributeSet addDereferenceableAttr(LLVMContext &C, unsigned Index,
+                                      uint64_t Bytes) const;
+
   //===--------------------------------------------------------------------===//
   // AttributeSet Accessors
   //===--------------------------------------------------------------------===//
diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h
index 7c7dd2c..185fc29 100644
--- a/include/llvm/IR/BasicBlock.h
+++ b/include/llvm/IR/BasicBlock.h
@@ -82,8 +82,8 @@ private:
   void setParent(Function *parent);
   friend class SymbolTableListTraits<BasicBlock, Function>;
 
-  BasicBlock(const BasicBlock &) LLVM_DELETED_FUNCTION;
-  void operator=(const BasicBlock &) LLVM_DELETED_FUNCTION;
+  BasicBlock(const BasicBlock &) = delete;
+  void operator=(const BasicBlock &) = delete;
 
   /// \brief Constructor.
   ///
@@ -208,6 +208,14 @@ public:
     return const_cast<BasicBlock*>(this)->getUniquePredecessor();
   }
 
+  /// Return the successor of this block if it has a unique successor.
+  /// Otherwise return a null pointer.  This method is analogous to
+  /// getUniquePredeccessor above.
+  BasicBlock *getUniqueSuccessor();
+  const BasicBlock *getUniqueSuccessor() const {
+    return const_cast<BasicBlock*>(this)->getUniqueSuccessor();
+  }
+
   //===--------------------------------------------------------------------===//
   /// Instruction iterator methods
   ///
diff --git a/include/llvm/IR/CFG.h b/include/llvm/IR/CFG.h
index c8be8bd..8476431 100644
--- a/include/llvm/IR/CFG.h
+++ b/include/llvm/IR/CFG.h
@@ -16,6 +16,7 @@
 #define LLVM_IR_CFG_H
 
 #include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstrTypes.h"
 
@@ -84,6 +85,8 @@ public:
 typedef PredIterator<BasicBlock, Value::user_iterator> pred_iterator;
 typedef PredIterator<const BasicBlock,
                      Value::const_user_iterator> const_pred_iterator;
+typedef llvm::iterator_range<pred_iterator> pred_range;
+typedef llvm::iterator_range<const_pred_iterator> pred_const_range;
 
 inline pred_iterator pred_begin(BasicBlock *BB) { return pred_iterator(BB); }
 inline const_pred_iterator pred_begin(const BasicBlock *BB) {
@@ -93,8 +96,15 @@ inline pred_iterator pred_end(BasicBlock *BB) { return pred_iterator(BB, true);}
 inline const_pred_iterator pred_end(const BasicBlock *BB) {
   return const_pred_iterator(BB, true);
 }
-
-
+inline bool pred_empty(const BasicBlock *BB) {
+  return pred_begin(BB) == pred_end(BB);
+}
+inline pred_range predecessors(BasicBlock *BB) {
+  return pred_range(pred_begin(BB), pred_end(BB));
+}
+inline pred_const_range predecessors(const BasicBlock *BB) {
+  return pred_const_range(pred_begin(BB), pred_end(BB));
+}
 
 //===----------------------------------------------------------------------===//
 // BasicBlock succ_iterator definition
@@ -244,6 +254,8 @@ public:
 typedef SuccIterator<TerminatorInst*, BasicBlock> succ_iterator;
 typedef SuccIterator<const TerminatorInst*,
                      const BasicBlock> succ_const_iterator;
+typedef llvm::iterator_range<succ_iterator> succ_range;
+typedef llvm::iterator_range<succ_const_iterator> succ_const_range;
 
 inline succ_iterator succ_begin(BasicBlock *BB) {
   return succ_iterator(BB->getTerminator());
@@ -257,6 +269,16 @@ inline succ_iterator succ_end(BasicBlock *BB) {
 inline succ_const_iterator succ_end(const BasicBlock *BB) {
   return succ_const_iterator(BB->getTerminator(), true);
 }
+inline bool succ_empty(const BasicBlock *BB) {
+  return succ_begin(BB) == succ_end(BB);
+}
+inline succ_range successors(BasicBlock *BB) {
+  return succ_range(succ_begin(BB), succ_end(BB));
+}
+inline succ_const_range successors(const BasicBlock *BB) {
+  return succ_const_range(succ_begin(BB), succ_end(BB));
+}
+
 
 template <typename T, typename U> struct isPodLike<SuccIterator<T, U> > {
   static const bool value = isPodLike<T>::value;
diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h
index df08257..a4ea243 100644
--- a/include/llvm/IR/CallSite.h
+++ b/include/llvm/IR/CallSite.h
@@ -78,7 +78,7 @@ public:
 
   InstrTy *getInstruction() const { return I.getPointer(); }
   InstrTy *operator->() const { return I.getPointer(); }
-  LLVM_EXPLICIT operator bool() const { return I.getPointer(); }
+  explicit operator bool() const { return I.getPointer(); }
 
   /// getCalledValue - Return the pointer to function that is being called.
   ///
diff --git a/include/llvm/IR/Comdat.h b/include/llvm/IR/Comdat.h
index 3e77a77..4d4c15f 100644
--- a/include/llvm/IR/Comdat.h
+++ b/include/llvm/IR/Comdat.h
@@ -49,7 +49,7 @@ private:
   friend class Module;
   Comdat();
   Comdat(SelectionKind SK, StringMapEntry<Comdat> *Name);
-  Comdat(const Comdat &) LLVM_DELETED_FUNCTION;
+  Comdat(const Comdat &) = delete;
 
   // Points to the map in Module.
   StringMapEntry<Comdat> *Name;
diff --git a/include/llvm/IR/Constant.h b/include/llvm/IR/Constant.h
index d26991e..75499e0 100644
--- a/include/llvm/IR/Constant.h
+++ b/include/llvm/IR/Constant.h
@@ -39,8 +39,8 @@ namespace llvm {
 /// don't have to worry about the lifetime of the objects.
 /// @brief LLVM Constant Representation
 class Constant : public User {
-  void operator=(const Constant &) LLVM_DELETED_FUNCTION;
-  Constant(const Constant &) LLVM_DELETED_FUNCTION;
+  void operator=(const Constant &) = delete;
+  Constant(const Constant &) = delete;
   void anchor() override;
 
 protected:
diff --git a/include/llvm/IR/ConstantRange.h b/include/llvm/IR/ConstantRange.h
index 3d39289..5e8cd34 100644
--- a/include/llvm/IR/ConstantRange.h
+++ b/include/llvm/IR/ConstantRange.h
@@ -37,7 +37,7 @@
 
 namespace llvm {
 
-/// ConstantRange - This class represents an range of values.
+/// This class represents a range of values.
 ///
 class ConstantRange {
   APInt Lower, Upper;
@@ -59,7 +59,7 @@ public:
   /// assert out if the two APInt's are not the same bit width.
   ConstantRange(APIntMoveTy Lower, APIntMoveTy Upper);
 
-  /// makeICmpRegion - Produce the smallest range that contains all values that
+  /// Produce the smallest range that contains all values that
   /// might satisfy the comparison specified by Pred when compared to any value
   /// contained within Other.
   ///
@@ -69,47 +69,46 @@ public:
   static ConstantRange makeICmpRegion(unsigned Pred,
                                       const ConstantRange &Other);
 
-  /// getLower - Return the lower value for this range...
+  /// Return the lower value for this range.
   ///
   const APInt &getLower() const { return Lower; }
 
-  /// getUpper - Return the upper value for this range...
+  /// Return the upper value for this range.
   ///
   const APInt &getUpper() const { return Upper; }
 
-  /// getBitWidth - get the bit width of this ConstantRange
+  /// Get the bit width of this ConstantRange.
   ///
   uint32_t getBitWidth() const { return Lower.getBitWidth(); }
 
-  /// isFullSet - Return true if this set contains all of the elements possible
-  /// for this data-type
+  /// Return true if this set contains all of the elements possible
+  /// for this data-type.
   ///
   bool isFullSet() const;
 
-  /// isEmptySet - Return true if this set contains no members.
+  /// Return true if this set contains no members.
   ///
   bool isEmptySet() const;
 
-  /// isWrappedSet - Return true if this set wraps around the top of the range,
-  /// for example: [100, 8)
+  /// Return true if this set wraps around the top of the range.
+  /// For example: [100, 8).
   ///
   bool isWrappedSet() const;
 
-  /// isSignWrappedSet - Return true if this set wraps around the INT_MIN of
-  /// its bitwidth, for example: i8 [120, 140).
+  /// Return true if this set wraps around the INT_MIN of
+  /// its bitwidth. For example: i8 [120, 140).
   ///
   bool isSignWrappedSet() const;
 
-  /// contains - Return true if the specified value is in the set.
+  /// Return true if the specified value is in the set.
   ///
   bool contains(const APInt &Val) const;
 
-  /// contains - Return true if the other range is a subset of this one.
+  /// Return true if the other range is a subset of this one.
   ///
   bool contains(const ConstantRange &CR) const;
 
-  /// getSingleElement - If this set contains a single element, return it,
-  /// otherwise return null.
+  /// If this set contains a single element, return it, otherwise return null.
   ///
   const APInt *getSingleElement() const {
     if (Upper == Lower + 1)
@@ -117,35 +116,31 @@ public:
     return nullptr;
   }
 
-  /// isSingleElement - Return true if this set contains exactly one member.
+  /// Return true if this set contains exactly one member.
   ///
   bool isSingleElement() const { return getSingleElement() != nullptr; }
 
-  /// getSetSize - Return the number of elements in this set.
+  /// Return the number of elements in this set.
   ///
   APInt getSetSize() const;
 
-  /// getUnsignedMax - Return the largest unsigned value contained in the
-  /// ConstantRange.
+  /// Return the largest unsigned value contained in the ConstantRange.
   ///
   APInt getUnsignedMax() const;
 
-  /// getUnsignedMin - Return the smallest unsigned value contained in the
-  /// ConstantRange.
+  /// Return the smallest unsigned value contained in the ConstantRange.
   ///
   APInt getUnsignedMin() const;
 
-  /// getSignedMax - Return the largest signed value contained in the
-  /// ConstantRange.
+  /// Return the largest signed value contained in the ConstantRange.
   ///
   APInt getSignedMax() const;
 
-  /// getSignedMin - Return the smallest signed value contained in the
-  /// ConstantRange.
+  /// Return the smallest signed value contained in the ConstantRange.
   ///
   APInt getSignedMin() const;
 
-  /// operator== - Return true if this range is equal to another range.
+  /// Return true if this range is equal to another range.
   ///
   bool operator==(const ConstantRange &CR) const {
     return Lower == CR.Lower && Upper == CR.Upper;
@@ -154,15 +149,14 @@ public:
     return !operator==(CR);
   }
 
-  /// subtract - Subtract the specified constant from the endpoints of this
-  /// constant range.
+  /// Subtract the specified constant from the endpoints of this constant range.
   ConstantRange subtract(const APInt &CI) const;
 
   /// \brief Subtract the specified range from this range (aka relative
   /// complement of the sets).
   ConstantRange difference(const ConstantRange &CR) const;
 
-  /// intersectWith - Return the range that results from the intersection of
+  /// Return the range that results from the intersection of
   /// this range with another range.  The resultant range is guaranteed to
   /// include all elements contained in both input ranges, and to have the
   /// smallest possible set size that does so.  Because there may be two
@@ -171,7 +165,7 @@ public:
   ///
   ConstantRange intersectWith(const ConstantRange &CR) const;
 
-  /// unionWith - Return the range that results from the union of this range
+  /// Return the range that results from the union of this range
   /// with another range.  The resultant range is guaranteed to include the
   /// elements of both sets, but may contain more.  For example, [3, 9) union
   /// [12,15) is [3, 15), which includes 9, 10, and 11, which were not included
@@ -179,85 +173,84 @@ public:
   ///
   ConstantRange unionWith(const ConstantRange &CR) const;
 
-  /// zeroExtend - Return a new range in the specified integer type, which must
+  /// Return a new range in the specified integer type, which must
   /// be strictly larger than the current type.  The returned range will
   /// correspond to the possible range of values if the source range had been
   /// zero extended to BitWidth.
   ConstantRange zeroExtend(uint32_t BitWidth) const;
 
-  /// signExtend - Return a new range in the specified integer type, which must
+  /// Return a new range in the specified integer type, which must
   /// be strictly larger than the current type.  The returned range will
   /// correspond to the possible range of values if the source range had been
   /// sign extended to BitWidth.
   ConstantRange signExtend(uint32_t BitWidth) const;
 
-  /// truncate - Return a new range in the specified integer type, which must be
+  /// Return a new range in the specified integer type, which must be
   /// strictly smaller than the current type.  The returned range will
   /// correspond to the possible range of values if the source range had been
   /// truncated to the specified type.
   ConstantRange truncate(uint32_t BitWidth) const;
 
-  /// zextOrTrunc - make this range have the bit width given by \p BitWidth. The
+  /// Make this range have the bit width given by \p BitWidth. The
   /// value is zero extended, truncated, or left alone to make it that width.
   ConstantRange zextOrTrunc(uint32_t BitWidth) const;
   
-  /// sextOrTrunc - make this range have the bit width given by \p BitWidth. The
+  /// Make this range have the bit width given by \p BitWidth. The
   /// value is sign extended, truncated, or left alone to make it that width.
   ConstantRange sextOrTrunc(uint32_t BitWidth) const;
 
-  /// add - Return a new range representing the possible values resulting
+  /// Return a new range representing the possible values resulting
   /// from an addition of a value in this range and a value in \p Other.
   ConstantRange add(const ConstantRange &Other) const;
 
-  /// sub - Return a new range representing the possible values resulting
+  /// Return a new range representing the possible values resulting
   /// from a subtraction of a value in this range and a value in \p Other.
   ConstantRange sub(const ConstantRange &Other) const;
 
-  /// multiply - Return a new range representing the possible values resulting
+  /// Return a new range representing the possible values resulting
   /// from a multiplication of a value in this range and a value in \p Other.
   /// TODO: This isn't fully implemented yet.
   ConstantRange multiply(const ConstantRange &Other) const;
 
-  /// smax - Return a new range representing the possible values resulting
+  /// Return a new range representing the possible values resulting
   /// from a signed maximum of a value in this range and a value in \p Other.
   ConstantRange smax(const ConstantRange &Other) const;
 
-  /// umax - Return a new range representing the possible values resulting
+  /// Return a new range representing the possible values resulting
   /// from an unsigned maximum of a value in this range and a value in \p Other.
   ConstantRange umax(const ConstantRange &Other) const;
 
-  /// udiv - Return a new range representing the possible values resulting
+  /// Return a new range representing the possible values resulting
   /// from an unsigned division of a value in this range and a value in
   /// \p Other.
   ConstantRange udiv(const ConstantRange &Other) const;
 
-  /// binaryAnd - return a new range representing the possible values resulting
+  /// Return a new range representing the possible values resulting
   /// from a binary-and of a value in this range by a value in \p Other.
   ConstantRange binaryAnd(const ConstantRange &Other) const;
 
-  /// binaryOr - return a new range representing the possible values resulting
+  /// Return a new range representing the possible values resulting
   /// from a binary-or of a value in this range by a value in \p Other.
   ConstantRange binaryOr(const ConstantRange &Other) const;
 
-  /// shl - Return a new range representing the possible values resulting
+  /// Return a new range representing the possible values resulting
   /// from a left shift of a value in this range by a value in \p Other.
   /// TODO: This isn't fully implemented yet.
   ConstantRange shl(const ConstantRange &Other) const;
 
-  /// lshr - Return a new range representing the possible values resulting
-  /// from a logical right shift of a value in this range and a value in
-  /// \p Other.
+  /// Return a new range representing the possible values resulting from a
+  /// logical right shift of a value in this range and a value in \p Other.
   ConstantRange lshr(const ConstantRange &Other) const;
 
-  /// inverse - Return a new range that is the logical not of the current set.
+  /// Return a new range that is the logical not of the current set.
   ///
   ConstantRange inverse() const;
   
-  /// print - Print out the bounds to a stream...
+  /// Print out the bounds to a stream.
   ///
   void print(raw_ostream &OS) const;
 
-  /// dump - Allow printing from a debugger easily...
+  /// Allow printing from a debugger easily.
   ///
   void dump() const;
 };
diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h
index 1b0e1b7..59be653 100644
--- a/include/llvm/IR/Constants.h
+++ b/include/llvm/IR/Constants.h
@@ -46,8 +46,8 @@ template <class ConstantClass> struct ConstantAggrKeyType;
 /// @brief Class for constant integers.
 class ConstantInt : public Constant {
   void anchor() override;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
-  ConstantInt(const ConstantInt &) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
+  ConstantInt(const ConstantInt &) = delete;
   ConstantInt(IntegerType *Ty, const APInt& V);
   APInt Val;
 protected:
@@ -228,8 +228,8 @@ public:
 class ConstantFP : public Constant {
   APFloat Val;
   void anchor() override;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
-  ConstantFP(const ConstantFP &) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
+  ConstantFP(const ConstantFP &) = delete;
   friend class LLVMContextImpl;
 protected:
   ConstantFP(Type *Ty, const APFloat& V);
@@ -294,8 +294,8 @@ public:
 /// ConstantAggregateZero - All zero aggregate value
 ///
 class ConstantAggregateZero : public Constant {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
-  ConstantAggregateZero(const ConstantAggregateZero &) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
+  ConstantAggregateZero(const ConstantAggregateZero &) = delete;
 protected:
   explicit ConstantAggregateZero(Type *ty)
     : Constant(ty, ConstantAggregateZeroVal, nullptr, 0) {}
@@ -325,6 +325,9 @@ public:
   /// index.
   Constant *getElementValue(unsigned Idx) const;
 
+  /// \brief Return the number of elements in the array, vector, or struct.
+  unsigned getNumElements() const;
+
   /// Methods for support type inquiry through isa, cast, and dyn_cast:
   ///
   static bool classof(const Value *V) {
@@ -338,7 +341,7 @@ public:
 ///
 class ConstantArray : public Constant {
   friend struct ConstantAggrKeyType<ConstantArray>;
-  ConstantArray(const ConstantArray &) LLVM_DELETED_FUNCTION;
+  ConstantArray(const ConstantArray &) = delete;
 protected:
   ConstantArray(ArrayType *T, ArrayRef<Constant *> Val);
 public:
@@ -380,7 +383,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantArray, Constant)
 //
 class ConstantStruct : public Constant {
   friend struct ConstantAggrKeyType<ConstantStruct>;
-  ConstantStruct(const ConstantStruct &) LLVM_DELETED_FUNCTION;
+  ConstantStruct(const ConstantStruct &) = delete;
 protected:
   ConstantStruct(StructType *T, ArrayRef<Constant *> Val);
 public:
@@ -439,7 +442,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantStruct, Constant)
 ///
 class ConstantVector : public Constant {
   friend struct ConstantAggrKeyType<ConstantVector>;
-  ConstantVector(const ConstantVector &) LLVM_DELETED_FUNCTION;
+  ConstantVector(const ConstantVector &) = delete;
 protected:
   ConstantVector(VectorType *T, ArrayRef<Constant *> Val);
 public:
@@ -488,8 +491,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantVector, Constant)
 /// ConstantPointerNull - a constant pointer value that points to null
 ///
 class ConstantPointerNull : public Constant {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
-  ConstantPointerNull(const ConstantPointerNull &) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
+  ConstantPointerNull(const ConstantPointerNull &) = delete;
 protected:
   explicit ConstantPointerNull(PointerType *T)
     : Constant(T,
@@ -539,8 +542,8 @@ class ConstantDataSequential : public Constant {
   /// element array of i8, or a 1-element array of i32.  They'll both end up in
   /// the same StringMap bucket, linked up.
   ConstantDataSequential *Next;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
-  ConstantDataSequential(const ConstantDataSequential &) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
+  ConstantDataSequential(const ConstantDataSequential &) = delete;
 protected:
   explicit ConstantDataSequential(Type *ty, ValueTy VT, const char *Data)
     : Constant(ty, VT, nullptr, 0), DataElements(Data), Next(nullptr) {}
@@ -650,8 +653,8 @@ private:
 /// operands because it stores all of the elements of the constant as densely
 /// packed data, instead of as Value*'s.
 class ConstantDataArray : public ConstantDataSequential {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
-  ConstantDataArray(const ConstantDataArray &) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
+  ConstantDataArray(const ConstantDataArray &) = delete;
   void anchor() override;
   friend class ConstantDataSequential;
   explicit ConstantDataArray(Type *ty, const char *Data)
@@ -673,6 +676,15 @@ public:
   static Constant *get(LLVMContext &Context, ArrayRef<float> Elts);
   static Constant *get(LLVMContext &Context, ArrayRef<double> Elts);
 
+  /// getFP() constructors - Return a constant with array type with an element
+  /// count and element type of float with precision matching the number of
+  /// bits in the ArrayRef passed in. (i.e. half for 16bits, float for 32bits,
+  /// double for 64bits) Note that this can return a ConstantAggregateZero
+  /// object.
+  static Constant *getFP(LLVMContext &Context, ArrayRef<uint16_t> Elts);
+  static Constant *getFP(LLVMContext &Context, ArrayRef<uint32_t> Elts);
+  static Constant *getFP(LLVMContext &Context, ArrayRef<uint64_t> Elts);
+
   /// getString - This method constructs a CDS and initializes it with a text
   /// string. The default behavior (AddNull==true) causes a null terminator to
   /// be placed at the end of the array (increasing the length of the string by
@@ -702,8 +714,8 @@ public:
 /// operands because it stores all of the elements of the constant as densely
 /// packed data, instead of as Value*'s.
 class ConstantDataVector : public ConstantDataSequential {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
-  ConstantDataVector(const ConstantDataVector &) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
+  ConstantDataVector(const ConstantDataVector &) = delete;
   void anchor() override;
   friend class ConstantDataSequential;
   explicit ConstantDataVector(Type *ty, const char *Data)
@@ -725,6 +737,15 @@ public:
   static Constant *get(LLVMContext &Context, ArrayRef<float> Elts);
   static Constant *get(LLVMContext &Context, ArrayRef<double> Elts);
 
+  /// getFP() constructors - Return a constant with vector type with an element
+  /// count and element type of float with the precision matching the number of
+  /// bits in the ArrayRef passed in.  (i.e. half for 16bits, float for 32bits,
+  /// double for 64bits) Note that this can return a ConstantAggregateZero
+  /// object.
+  static Constant *getFP(LLVMContext &Context, ArrayRef<uint16_t> Elts);
+  static Constant *getFP(LLVMContext &Context, ArrayRef<uint32_t> Elts);
+  static Constant *getFP(LLVMContext &Context, ArrayRef<uint64_t> Elts);
+
   /// getSplat - Return a ConstantVector with the specified constant in each
   /// element.  The specified constant has to be a of a compatible type (i8/i16/
   /// i32/i64/float/double) and must be a ConstantFP or ConstantInt.
@@ -753,7 +774,7 @@ public:
 /// BlockAddress - The address of a basic block.
 ///
 class BlockAddress : public Constant {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
   void *operator new(size_t s) { return User::operator new(s, 2); }
   BlockAddress(Function *F, BasicBlock *BB);
 public:
@@ -1165,8 +1186,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantExpr, Constant)
 /// LangRef.html#undefvalues for details.
 ///
 class UndefValue : public Constant {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
-  UndefValue(const UndefValue &) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
+  UndefValue(const UndefValue &) = delete;
 protected:
   explicit UndefValue(Type *T) : Constant(T, UndefValueVal, nullptr, 0) {}
 protected:
@@ -1196,6 +1217,9 @@ public:
   /// index.
   UndefValue *getElementValue(unsigned Idx) const;
 
+  /// \brief Return the number of elements in the array, vector, or struct.
+  unsigned getNumElements() const;
+
   void destroyConstant() override;
 
   /// Methods for support type inquiry through isa, cast, and dyn_cast:
diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h
index 3a50609..97a7b83 100644
--- a/include/llvm/IR/DIBuilder.h
+++ b/include/llvm/IR/DIBuilder.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/TrackingMDRef.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/DataTypes.h"
 
@@ -65,22 +66,34 @@ namespace llvm {
     Function *DeclareFn;     // llvm.dbg.declare
     Function *ValueFn;       // llvm.dbg.value
 
-    SmallVector<Value *, 4> AllEnumTypes;
-    /// Use TrackingVH to collect RetainTypes, since they can be updated
-    /// later on.
-    SmallVector<TrackingVH<MDNode>, 4> AllRetainTypes;
-    SmallVector<Value *, 4> AllSubprograms;
-    SmallVector<Value *, 4> AllGVs;
-    SmallVector<TrackingVH<MDNode>, 4> AllImportedModules;
+    SmallVector<Metadata *, 4> AllEnumTypes;
+    /// Track the RetainTypes, since they can be updated later on.
+    SmallVector<TrackingMDNodeRef, 4> AllRetainTypes;
+    SmallVector<Metadata *, 4> AllSubprograms;
+    SmallVector<Metadata *, 4> AllGVs;
+    SmallVector<TrackingMDNodeRef, 4> AllImportedModules;
+
+    /// \brief Track nodes that may be unresolved.
+    SmallVector<TrackingMDNodeRef, 4> UnresolvedNodes;
+    bool AllowUnresolvedNodes;
 
     /// Each subprogram's preserved local variables.
-    DenseMap<MDNode *, std::vector<TrackingVH<MDNode>>> PreservedVariables;
+    DenseMap<MDNode *, std::vector<TrackingMDNodeRef>> PreservedVariables;
+
+    DIBuilder(const DIBuilder &) = delete;
+    void operator=(const DIBuilder &) = delete;
 
-    DIBuilder(const DIBuilder &) LLVM_DELETED_FUNCTION;
-    void operator=(const DIBuilder &) LLVM_DELETED_FUNCTION;
+    /// \brief Create a temporary.
+    ///
+    /// Create an \a temporary node and track it in \a UnresolvedNodes.
+    void trackIfUnresolved(MDNode *N);
 
   public:
-    explicit DIBuilder(Module &M);
+    /// \brief Construct a builder for a module.
+    ///
+    /// If \c AllowUnresolved, collect unresolved nodes attached to the module
+    /// in order to resolve cycles during \a finalize().
+    explicit DIBuilder(Module &M, bool AllowUnresolved = true);
     enum DebugEmissionKind { FullDebug=1, LineTablesOnly };
 
     /// finalize - Construct any deferred debug info descriptors.
@@ -159,8 +172,12 @@ namespace llvm {
 
     /// \brief Create debugging information entry for a pointer to member.
     /// @param PointeeTy Type pointed to by this pointer.
+    /// @param SizeInBits  Size.
+    /// @param AlignInBits Alignment. (optional)
     /// @param Class Type for which this pointer points to members of.
-    DIDerivedType createMemberPointerType(DIType PointeeTy, DIType Class);
+    DIDerivedType createMemberPointerType(DIType PointeeTy, DIType Class,
+                                          uint64_t SizeInBits,
+                                          uint64_t AlignInBits = 0);
 
     /// createReferenceType - Create debugging information entry for a c++
     /// style reference or rvalue reference type.
@@ -316,13 +333,8 @@ namespace llvm {
     /// @param Scope        Scope in which this type is defined.
     /// @param Name         Type parameter name.
     /// @param Ty           Parameter type.
-    /// @param File         File where this type parameter is defined.
-    /// @param LineNo       Line number.
-    /// @param ColumnNo     Column Number.
     DITemplateTypeParameter
-    createTemplateTypeParameter(DIDescriptor Scope, StringRef Name, DIType Ty,
-                                MDNode *File = nullptr, unsigned LineNo = 0,
-                                unsigned ColumnNo = 0);
+    createTemplateTypeParameter(DIDescriptor Scope, StringRef Name, DIType Ty);
 
     /// createTemplateValueParameter - Create debugging information for template
     /// value parameter.
@@ -330,40 +342,30 @@ namespace llvm {
     /// @param Name         Value parameter name.
     /// @param Ty           Parameter type.
     /// @param Val          Constant parameter value.
-    /// @param File         File where this type parameter is defined.
-    /// @param LineNo       Line number.
-    /// @param ColumnNo     Column Number.
-    DITemplateValueParameter
-    createTemplateValueParameter(DIDescriptor Scope, StringRef Name, DIType Ty,
-                                 Constant *Val, MDNode *File = nullptr,
-                                 unsigned LineNo = 0, unsigned ColumnNo = 0);
+    DITemplateValueParameter createTemplateValueParameter(DIDescriptor Scope,
+                                                          StringRef Name,
+                                                          DIType Ty,
+                                                          Constant *Val);
 
     /// \brief Create debugging information for a template template parameter.
     /// @param Scope        Scope in which this type is defined.
     /// @param Name         Value parameter name.
     /// @param Ty           Parameter type.
     /// @param Val          The fully qualified name of the template.
-    /// @param File         File where this type parameter is defined.
-    /// @param LineNo       Line number.
-    /// @param ColumnNo     Column Number.
-    DITemplateValueParameter
-    createTemplateTemplateParameter(DIDescriptor Scope, StringRef Name,
-                                    DIType Ty, StringRef Val,
-                                    MDNode *File = nullptr, unsigned LineNo = 0,
-                                    unsigned ColumnNo = 0);
+    DITemplateValueParameter createTemplateTemplateParameter(DIDescriptor Scope,
+                                                             StringRef Name,
+                                                             DIType Ty,
+                                                             StringRef Val);
 
     /// \brief Create debugging information for a template parameter pack.
     /// @param Scope        Scope in which this type is defined.
     /// @param Name         Value parameter name.
     /// @param Ty           Parameter type.
     /// @param Val          An array of types in the pack.
-    /// @param File         File where this type parameter is defined.
-    /// @param LineNo       Line number.
-    /// @param ColumnNo     Column Number.
-    DITemplateValueParameter
-    createTemplateParameterPack(DIDescriptor Scope, StringRef Name,
-                                DIType Ty, DIArray Val, MDNode *File = nullptr,
-                                unsigned LineNo = 0, unsigned ColumnNo = 0);
+    DITemplateValueParameter createTemplateParameterPack(DIDescriptor Scope,
+                                                         StringRef Name,
+                                                         DIType Ty,
+                                                         DIArray Val);
 
     /// createArrayType - Create debugging information entry for an array.
     /// @param Size         Array size.
@@ -423,10 +425,11 @@ namespace llvm {
                                       StringRef UniqueIdentifier = StringRef());
 
     /// \brief Create a temporary forward-declared type.
-    DICompositeType createReplaceableForwardDecl(
+    DICompositeType createReplaceableCompositeType(
         unsigned Tag, StringRef Name, DIDescriptor Scope, DIFile F,
         unsigned Line, unsigned RuntimeLang = 0, uint64_t SizeInBits = 0,
-        uint64_t AlignInBits = 0, StringRef UniqueIdentifier = StringRef());
+        uint64_t AlignInBits = 0, unsigned Flags = DIDescriptor::FlagFwdDecl,
+        StringRef UniqueIdentifier = StringRef());
 
     /// retainType - Retain DIType in a module even if it is not referenced
     /// through debug info anchors.
@@ -437,10 +440,10 @@ namespace llvm {
     DIBasicType createUnspecifiedParameter();
 
     /// getOrCreateArray - Get a DIArray, create one if required.
-    DIArray getOrCreateArray(ArrayRef<Value *> Elements);
+    DIArray getOrCreateArray(ArrayRef<Metadata *> Elements);
 
     /// getOrCreateTypeArray - Get a DITypeArray, create one if required.
-    DITypeArray getOrCreateTypeArray(ArrayRef<Value *> Elements);
+    DITypeArray getOrCreateTypeArray(ArrayRef<Metadata *> Elements);
 
     /// getOrCreateSubrange - Create a descriptor for a value range.  This
     /// implicitly uniques the values returned.
@@ -497,15 +500,16 @@ namespace llvm {
     /// createExpression - Create a new descriptor for the specified
     /// variable which has a complex address expression for its address.
     /// @param Addr        An array of complex address operations.
-    DIExpression createExpression(ArrayRef<int64_t> Addr = None);
+    DIExpression createExpression(ArrayRef<uint64_t> Addr = None);
+    DIExpression createExpression(ArrayRef<int64_t> Addr);
 
-    /// createPieceExpression - Create a descriptor to describe one part
+    /// createBitPieceExpression - Create a descriptor to describe one part
     /// of aggregate variable that is fragmented across multiple Values.
     ///
-    /// @param OffsetInBytes Offset of the piece in bytes.
-    /// @param SizeInBytes   Size of the piece in bytes.
-    DIExpression createPieceExpression(unsigned OffsetInBytes,
-                                       unsigned SizeInBytes);
+    /// @param OffsetInBits Offset of the piece in bits.
+    /// @param SizeInBits   Size of the piece in bits.
+    DIExpression createBitPieceExpression(unsigned OffsetInBits,
+                                          unsigned SizeInBits);
 
     /// createFunction - Create a new descriptor for the specified subprogram.
     /// See comments in DISubprogram for descriptions of these fields.
@@ -685,6 +689,20 @@ namespace llvm {
     Instruction *insertDbgValueIntrinsic(llvm::Value *Val, uint64_t Offset,
                                          DIVariable VarInfo, DIExpression Expr,
                                          Instruction *InsertBefore);
+
+    /// \brief Replace the vtable holder in the given composite type.
+    ///
+    /// If this creates a self reference, it may orphan some unresolved cycles
+    /// in the operands of \c T, so \a DIBuilder needs to track that.
+    void replaceVTableHolder(DICompositeType &T, DICompositeType VTableHolder);
+
+    /// \brief Replace arrays on a composite type.
+    ///
+    /// If \c T is resolved, but the arrays aren't -- which can happen if \c T
+    /// has a self-reference -- \a DIBuilder needs to track the array to
+    /// resolve cycles.
+    void replaceArrays(DICompositeType &T, DIArray Elements,
+                       DIArray TParems = DIArray());
   };
 } // end namespace llvm
 
diff --git a/include/llvm/IR/DataLayout.h b/include/llvm/IR/DataLayout.h
index 4580a4f..9479ba4 100644
--- a/include/llvm/IR/DataLayout.h
+++ b/include/llvm/IR/DataLayout.h
@@ -53,6 +53,11 @@ enum AlignTypeEnum {
   AGGREGATE_ALIGN = 'a'
 };
 
+// FIXME: Currently the DataLayout string carries a "preferred alignment"
+// for types. As the DataLayout is module/global, this should likely be
+// sunk down to an FTTI element that is queried rather than a global
+// preference.
+
 /// \brief Layout alignment element.
 ///
 /// Stores the alignment data associated with a given alignment type (integer,
@@ -228,6 +233,8 @@ public:
     return (StackNaturalAlign != 0) && (Align > StackNaturalAlign);
   }
 
+  unsigned getStackAlignment() const { return StackNaturalAlign; }
+
   bool hasMicrosoftFastStdCallMangling() const {
     return ManglingMode == MM_WINCOFF;
   }
diff --git a/include/llvm/IR/DebugInfo.h b/include/llvm/IR/DebugInfo.h
index 22a2138..d2e5975 100644
--- a/include/llvm/IR/DebugInfo.h
+++ b/include/llvm/IR/DebugInfo.h
@@ -18,11 +18,11 @@
 #define LLVM_IR_DEBUGINFO_H
 
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Metadata.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -66,10 +66,10 @@ class DIHeaderFieldIterator
 
 public:
   DIHeaderFieldIterator() {}
-  DIHeaderFieldIterator(StringRef Header)
+  explicit DIHeaderFieldIterator(StringRef Header)
       : Header(Header), Current(Header.slice(0, Header.find('\0'))) {}
   StringRef operator*() const { return Current; }
-  const StringRef * operator->() const { return &Current; }
+  const StringRef *operator->() const { return &Current; }
   DIHeaderFieldIterator &operator++() {
     increment();
     return *this;
@@ -99,6 +99,16 @@ public:
     return Header.slice(Current.end() - Header.begin() + 1, StringRef::npos);
   }
 
+  /// \brief Get the current field as a number.
+  ///
+  /// Convert the current field into a number.  Return \c 0 on error.
+  template <class T> T getNumber() const {
+    T Int;
+    if (getCurrent().getAsInteger(0, Int))
+      return 0;
+    return Int;
+  }
+
 private:
   void increment() {
     assert(Current.data() != nullptr && "Cannot increment past the end");
@@ -122,27 +132,21 @@ public:
   /// The three accessibility flags are mutually exclusive and rolled together
   /// in the first two bits.
   enum {
-    FlagAccessibility     = 1 << 0 | 1 << 1,
-    FlagPrivate           = 1,
-    FlagProtected         = 2,
-    FlagPublic            = 3,
-
-    FlagFwdDecl           = 1 << 2,
-    FlagAppleBlock        = 1 << 3,
-    FlagBlockByrefStruct  = 1 << 4,
-    FlagVirtual           = 1 << 5,
-    FlagArtificial        = 1 << 6,
-    FlagExplicit          = 1 << 7,
-    FlagPrototyped        = 1 << 8,
-    FlagObjcClassComplete = 1 << 9,
-    FlagObjectPointer     = 1 << 10,
-    FlagVector            = 1 << 11,
-    FlagStaticMember      = 1 << 12,
-    FlagIndirectVariable  = 1 << 13,
-    FlagLValueReference   = 1 << 14,
-    FlagRValueReference   = 1 << 15
+#define HANDLE_DI_FLAG(ID, NAME) Flag##NAME = ID,
+#include "llvm/IR/DebugInfoFlags.def"
+    FlagAccessibility = FlagPrivate | FlagProtected | FlagPublic
   };
 
+  static unsigned getFlag(StringRef Flag);
+  static const char *getFlagString(unsigned Flag);
+
+  /// \brief Split up a flags bitfield.
+  ///
+  /// Split \c Flags into \c SplitFlags, a vector of its components.  Returns
+  /// any remaining (unrecognized) bits.
+  static unsigned splitFlags(unsigned Flags,
+                             SmallVectorImpl<unsigned> &SplitFlags);
+
 protected:
   const MDNode *DbgNode;
 
@@ -168,42 +172,47 @@ public:
 
   bool Verify() const;
 
-  operator MDNode *() const { return const_cast<MDNode *>(DbgNode); }
-  MDNode *operator->() const { return const_cast<MDNode *>(DbgNode); }
+  MDNode *get() const { return const_cast<MDNode *>(DbgNode); }
+  operator MDNode *() const { return get(); }
+  MDNode *operator->() const { return get(); }
 
   // An explicit operator bool so that we can do testing of DI values
   // easily.
   // FIXME: This operator bool isn't actually protecting anything at the
   // moment due to the conversion operator above making DIDescriptor nodes
   // implicitly convertable to bool.
-  LLVM_EXPLICIT operator bool() const { return DbgNode != nullptr; }
+  explicit operator bool() const { return DbgNode != nullptr; }
 
   bool operator==(DIDescriptor Other) const { return DbgNode == Other.DbgNode; }
   bool operator!=(DIDescriptor Other) const { return !operator==(Other); }
 
-  StringRef getHeader() const {
-    return getStringField(0);
-  }
+  StringRef getHeader() const { return getStringField(0); }
 
   size_t getNumHeaderFields() const {
     return std::distance(DIHeaderFieldIterator(getHeader()),
                          DIHeaderFieldIterator());
   }
 
-  StringRef getHeaderField(unsigned Index) const {
+  DIHeaderFieldIterator header_begin() const {
+    return DIHeaderFieldIterator(getHeader());
+  }
+  DIHeaderFieldIterator header_end() const { return DIHeaderFieldIterator(); }
+
+  DIHeaderFieldIterator getHeaderIterator(unsigned Index) const {
     // Since callers expect an empty string for out-of-range accesses, we can't
     // use std::advance() here.
-    for (DIHeaderFieldIterator I(getHeader()), E; I != E; ++I, --Index)
+    for (auto I = header_begin(), E = header_end(); I != E; ++I, --Index)
       if (!Index)
-        return *I;
-    return StringRef();
+        return I;
+    return header_end();
+  }
+
+  StringRef getHeaderField(unsigned Index) const {
+    return *getHeaderIterator(Index);
   }
 
   template <class T> T getHeaderFieldAs(unsigned Index) const {
-    T Int;
-    if (getHeaderField(Index).getAsInteger(0, Int))
-      return 0;
-    return Int;
+    return getHeaderIterator(Index).getNumber<T>();
   }
 
   uint16_t getTag() const { return getHeaderFieldAs<uint16_t>(0); }
@@ -238,6 +247,19 @@ public:
   void replaceAllUsesWith(MDNode *D);
 };
 
+#define RETURN_FROM_RAW(VALID, DEFAULT)                                        \
+  do {                                                                         \
+    if (auto *N = getRaw())                                                    \
+      return VALID;                                                            \
+    return DEFAULT;                                                            \
+  } while (false)
+#define RETURN_DESCRIPTOR_FROM_RAW(DESC, VALID)                                \
+  do {                                                                         \
+    if (auto *N = getRaw())                                                    \
+      return DESC(dyn_cast_or_null<MDNode>(VALID));                            \
+    return DESC(static_cast<const MDNode *>(nullptr));                         \
+  } while (false)
+
 /// \brief This is used to represent ranges, for array bounds.
 class DISubrange : public DIDescriptor {
   friend class DIDescriptor;
@@ -258,9 +280,7 @@ public:
   unsigned getNumElements() const {
     return DbgNode ? DbgNode->getNumOperands() : 0;
   }
-  T getElement(unsigned Idx) const {
-    return getFieldAs<T>(Idx);
-  }
+  T getElement(unsigned Idx) const { return getFieldAs<T>(Idx); }
 };
 
 typedef DITypedArray<DIDescriptor> DIArray;
@@ -282,6 +302,7 @@ public:
 };
 
 template <typename T> class DIRef;
+typedef DIRef<DIDescriptor> DIDescriptorRef;
 typedef DIRef<DIScope> DIScopeRef;
 typedef DIRef<DIType> DITypeRef;
 typedef DITypedArray<DITypeRef> DITypeArray;
@@ -371,6 +392,12 @@ template <typename T> StringRef DIRef<T>::getName() const {
   return MS->getString();
 }
 
+/// \brief Handle fields that are references to DIDescriptors.
+template <>
+DIDescriptorRef DIDescriptor::getFieldAs<DIDescriptorRef>(unsigned Elt) const;
+/// \brief Specialize DIRef constructor for DIDescriptorRef.
+template <> DIRef<DIDescriptor>::DIRef(const Metadata *V);
+
 /// \brief Handle fields that are references to DIScopes.
 template <> DIScopeRef DIDescriptor::getFieldAs<DIScopeRef>(unsigned Elt) const;
 /// \brief Specialize DIRef constructor for DIScopeRef.
@@ -381,7 +408,7 @@ template <> DITypeRef DIDescriptor::getFieldAs<DITypeRef>(unsigned Elt) const;
 /// \brief Specialize DIRef constructor for DITypeRef.
 template <> DIRef<DIType>::DIRef(const Metadata *V);
 
-/// \briefThis is a wrapper for a type.
+/// \brief This is a wrapper for a type.
 ///
 /// FIXME: Types should be factored much better so that CV qualifiers and
 /// others do not require a huge and empty descriptor full of zeros.
@@ -392,7 +419,7 @@ protected:
 
 public:
   explicit DIType(const MDNode *N = nullptr) : DIScope(N) {}
-  operator DITypeRef () const {
+  operator DITypeRef() const {
     assert(isType() &&
            "constructing DITypeRef from an MDNode that is not a type");
     return DITypeRef(&*getRef());
@@ -402,20 +429,12 @@ public:
 
   DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(2); }
   StringRef getName() const { return getHeaderField(1); }
-  unsigned getLineNumber() const {
-    return getHeaderFieldAs<unsigned>(2);
-  }
-  uint64_t getSizeInBits() const {
-    return getHeaderFieldAs<unsigned>(3);
-  }
-  uint64_t getAlignInBits() const {
-    return getHeaderFieldAs<unsigned>(4);
-  }
+  unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(2); }
+  uint64_t getSizeInBits() const { return getHeaderFieldAs<unsigned>(3); }
+  uint64_t getAlignInBits() const { return getHeaderFieldAs<unsigned>(4); }
   // FIXME: Offset is only used for DW_TAG_member nodes.  Making every type
   // carry this is just plain insane.
-  uint64_t getOffsetInBits() const {
-    return getHeaderFieldAs<unsigned>(5);
-  }
+  uint64_t getOffsetInBits() const { return getHeaderFieldAs<unsigned>(5); }
   unsigned getFlags() const { return getHeaderFieldAs<unsigned>(6); }
   bool isPrivate() const {
     return (getFlags() & FlagAccessibility) == FlagPrivate;
@@ -499,6 +518,7 @@ public:
 // FIXME: Make this derive from DIType directly & just store the
 // base type in a single DIType field.
 class DICompositeType : public DIDerivedType {
+  friend class DIBuilder;
   friend class DIDescriptor;
   void printInternal(raw_ostream &OS) const;
 
@@ -512,20 +532,26 @@ public:
     assert(!isSubroutineType() && "no elements for DISubroutineType");
     return getFieldAs<DIArray>(4);
   }
+
+private:
   template <typename T>
   void setArrays(DITypedArray<T> Elements, DIArray TParams = DIArray()) {
-    assert((!TParams || DbgNode->getNumOperands() == 8) &&
-           "If you're setting the template parameters this should include a slot "
-           "for that!");
+    assert(
+        (!TParams || DbgNode->getNumOperands() == 8) &&
+        "If you're setting the template parameters this should include a slot "
+        "for that!");
     setArraysHelper(Elements, TParams);
   }
-  unsigned getRunTimeLang() const {
-    return getHeaderFieldAs<unsigned>(7);
-  }
+
+public:
+  unsigned getRunTimeLang() const { return getHeaderFieldAs<unsigned>(7); }
   DITypeRef getContainingType() const { return getFieldAs<DITypeRef>(5); }
 
+private:
   /// \brief Set the containing type.
   void setContainingType(DICompositeType ContainingType);
+
+public:
   DIArray getTemplateParams() const { return getFieldAs<DIArray>(6); }
   MDString *getIdentifier() const;
 
@@ -663,7 +689,6 @@ public:
   unsigned isRValueReference() const {
     return (getFlags() & FlagRValueReference) != 0;
   }
-
 };
 
 /// \brief This is a wrapper for a lexical block.
@@ -671,12 +696,8 @@ class DILexicalBlock : public DIScope {
 public:
   explicit DILexicalBlock(const MDNode *N = nullptr) : DIScope(N) {}
   DIScope getContext() const { return getFieldAs<DIScope>(2); }
-  unsigned getLineNumber() const {
-    return getHeaderFieldAs<unsigned>(1);
-  }
-  unsigned getColumnNumber() const {
-    return getHeaderFieldAs<unsigned>(2);
-  }
+  unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(1); }
+  unsigned getColumnNumber() const { return getHeaderFieldAs<unsigned>(2); }
   bool Verify() const;
 };
 
@@ -685,6 +706,8 @@ class DILexicalBlockFile : public DIScope {
 public:
   explicit DILexicalBlockFile(const MDNode *N = nullptr) : DIScope(N) {}
   DIScope getContext() const {
+    // FIXME: This logic is horrible.  getScope() returns a DILexicalBlock, but
+    // then we check if it's a subprogram?  WHAT?!?
     if (getScope().isSubprogram())
       return getScope();
     return getScope().getContext();
@@ -713,18 +736,11 @@ public:
 class DITemplateTypeParameter : public DIDescriptor {
 public:
   explicit DITemplateTypeParameter(const MDNode *N = nullptr)
-    : DIDescriptor(N) {}
+      : DIDescriptor(N) {}
 
   StringRef getName() const { return getHeaderField(1); }
-  unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(2); }
-  unsigned getColumnNumber() const { return getHeaderFieldAs<unsigned>(3); }
 
-  DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(1); }
   DITypeRef getType() const { return getFieldAs<DITypeRef>(2); }
-  StringRef getFilename() const { return getFieldAs<DIFile>(3).getFilename(); }
-  StringRef getDirectory() const {
-    return getFieldAs<DIFile>(3).getDirectory();
-  }
   bool Verify() const;
 };
 
@@ -732,19 +748,12 @@ public:
 class DITemplateValueParameter : public DIDescriptor {
 public:
   explicit DITemplateValueParameter(const MDNode *N = nullptr)
-    : DIDescriptor(N) {}
+      : DIDescriptor(N) {}
 
   StringRef getName() const { return getHeaderField(1); }
-  unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(2); }
-  unsigned getColumnNumber() const { return getHeaderFieldAs<unsigned>(3); }
 
-  DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(1); }
   DITypeRef getType() const { return getFieldAs<DITypeRef>(2); }
-  Value *getValue() const;
-  StringRef getFilename() const { return getFieldAs<DIFile>(4).getFilename(); }
-  StringRef getDirectory() const {
-    return getFieldAs<DIFile>(4).getDirectory();
-  }
+  Metadata *getValue() const;
   bool Verify() const;
 };
 
@@ -763,7 +772,7 @@ public:
   unsigned isLocalToUnit() const { return getHeaderFieldAs<bool>(5); }
   unsigned isDefinition() const { return getHeaderFieldAs<bool>(6); }
 
-  DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(1); }
+  DIScope getContext() const { return getFieldAs<DIScope>(1); }
   StringRef getFilename() const { return getFieldAs<DIFile>(2).getFilename(); }
   StringRef getDirectory() const {
     return getFieldAs<DIFile>(2).getDirectory();
@@ -807,11 +816,6 @@ public:
     return (getHeaderFieldAs<unsigned>(3) & FlagObjectPointer) != 0;
   }
 
-  /// \brief Return true if this variable is represented as a pointer.
-  bool isIndirect() const {
-    return (getHeaderFieldAs<unsigned>(3) & FlagIndirectVariable) != 0;
-  }
-
   /// \brief If this variable is inlined then return inline location.
   MDNode *getInlinedAt() const;
 
@@ -831,7 +835,13 @@ public:
   void printExtendedName(raw_ostream &OS) const;
 };
 
-/// \brief A complex location expression.
+/// \brief A complex location expression in postfix notation.
+///
+/// This is (almost) a DWARF expression that modifies the location of a
+/// variable or (or the location of a single piece of a variable).
+///
+/// FIXME: Instead of DW_OP_plus taking an argument, this should use DW_OP_const
+/// and have DW_OP_plus consume the topmost elements on the stack.
 class DIExpression : public DIDescriptor {
   friend class DIDescriptor;
   void printInternal(raw_ostream &OS) const;
@@ -854,24 +864,91 @@ public:
   uint64_t getElement(unsigned Idx) const;
 
   /// \brief Return whether this is a piece of an aggregate variable.
-  bool isVariablePiece() const;
-  /// \brief Return the offset of this piece in bytes.
-  uint64_t getPieceOffset() const;
-  /// \brief Return the size of this piece in bytes.
-  uint64_t getPieceSize() const;
+  bool isBitPiece() const;
+  /// \brief Return the offset of this piece in bits.
+  uint64_t getBitPieceOffset() const;
+  /// \brief Return the size of this piece in bits.
+  uint64_t getBitPieceSize() const;
+
+  class iterator;
+  /// \brief A lightweight wrapper around an element of a DIExpression.
+  class Operand {
+    friend class iterator;
+    DIHeaderFieldIterator I;
+    Operand() {}
+    Operand(DIHeaderFieldIterator I) : I(I) {}
+  public:
+    /// \brief Operands such as DW_OP_piece have explicit (non-stack) arguments.
+    /// Argument 0 is the operand itself.
+    uint64_t getArg(unsigned N) const {
+      DIHeaderFieldIterator In = I;
+      std::advance(In, N);
+      return In.getNumber<uint64_t>();
+    }
+    operator uint64_t () const { return I.getNumber<uint64_t>(); }
+    /// \brief Returns underlying DIHeaderFieldIterator.
+    const DIHeaderFieldIterator &getBase() const { return I; }
+    /// \brief Returns the next operand.
+    Operand getNext() const;
+  };
+
+  /// \brief An iterator for DIExpression elements.
+  class iterator : public std::iterator<std::input_iterator_tag, StringRef,
+                                        unsigned, const Operand*, Operand> {
+    friend class Operand;
+    DIHeaderFieldIterator I;
+    Operand Tmp;
+    iterator(DIHeaderFieldIterator I) : I(I) {}
+  public:
+    iterator() {}
+    iterator(const DIExpression &Expr) : I(++Expr.header_begin()) {}
+    const Operand &operator*() { return Tmp = Operand(I); }
+    const Operand *operator->() { return &(Tmp = Operand(I)); }
+    iterator &operator++() {
+      increment();
+      return *this;
+    }
+    iterator operator++(int) {
+      iterator X(*this);
+      increment();
+      return X;
+    }
+    bool operator==(const iterator &X) const { return I == X.I; }
+    bool operator!=(const iterator &X) const { return !(*this == X); }
+
+  private:
+    void increment() {
+      switch (**this) {
+      case dwarf::DW_OP_bit_piece: std::advance(I, 3); break;
+      case dwarf::DW_OP_plus:      std::advance(I, 2); break;
+      case dwarf::DW_OP_deref:     std::advance(I, 1); break;
+      default:
+        llvm_unreachable("unsupported operand");
+      }
+    }
+  };
+
+  iterator begin() const;
+  iterator end() const;
 };
 
 /// \brief This object holds location information.
 ///
 /// This object is not associated with any DWARF tag.
 class DILocation : public DIDescriptor {
+  MDLocation *getRaw() const { return dyn_cast_or_null<MDLocation>(get()); }
+
 public:
   explicit DILocation(const MDNode *N) : DIDescriptor(N) {}
 
-  unsigned getLineNumber() const { return getUnsignedField(0); }
-  unsigned getColumnNumber() const { return getUnsignedField(1); }
-  DIScope getScope() const { return getFieldAs<DIScope>(2); }
-  DILocation getOrigLocation() const { return getFieldAs<DILocation>(3); }
+  unsigned getLineNumber() const { RETURN_FROM_RAW(N->getLine(), 0); }
+  unsigned getColumnNumber() const { RETURN_FROM_RAW(N->getColumn(), 0); }
+  DIScope getScope() const {
+    RETURN_DESCRIPTOR_FROM_RAW(DIScope, N->getScope());
+  }
+  DILocation getOrigLocation() const {
+    RETURN_DESCRIPTOR_FROM_RAW(DILocation, N->getInlinedAt());
+  }
   StringRef getFilename() const { return getScope().getFilename(); }
   StringRef getDirectory() const { return getScope().getDirectory(); }
   bool Verify() const;
@@ -892,7 +969,9 @@ public:
     // sure this location is a lexical block before retrieving its
     // value.
     return getScope().isLexicalBlockFile()
-               ? getFieldAs<DILexicalBlockFile>(2).getDiscriminator()
+               ? DILexicalBlockFile(
+                     cast<MDNode>(cast<MDLocation>(DbgNode)->getScope()))
+                     .getDiscriminator()
                : 0;
   }
 
@@ -951,14 +1030,18 @@ class DIImportedEntity : public DIDescriptor {
   void printInternal(raw_ostream &OS) const;
 
 public:
+  DIImportedEntity() = default;
   explicit DIImportedEntity(const MDNode *N) : DIDescriptor(N) {}
   DIScope getContext() const { return getFieldAs<DIScope>(1); }
-  DIScopeRef getEntity() const { return getFieldAs<DIScopeRef>(2); }
+  DIDescriptorRef getEntity() const { return getFieldAs<DIDescriptorRef>(2); }
   unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(1); }
   StringRef getName() const { return getHeaderField(2); }
   bool Verify() const;
 };
 
+#undef RETURN_FROM_RAW
+#undef RETURN_DESCRIPTOR_FROM_RAW
+
 /// \brief Find subprogram that is enclosing this scope.
 DISubprogram getDISubprogram(const MDNode *Scope);
 
@@ -1015,6 +1098,9 @@ public:
   /// \brief Process DILocation.
   void processLocation(const Module &M, DILocation Loc);
 
+  /// \brief Process DIExpression.
+  void processExpression(DIExpression Expr);
+
   /// \brief Clear all lists.
   void reset();
 
@@ -1033,7 +1119,8 @@ private:
 public:
   typedef SmallVectorImpl<DICompileUnit>::const_iterator compile_unit_iterator;
   typedef SmallVectorImpl<DISubprogram>::const_iterator subprogram_iterator;
-  typedef SmallVectorImpl<DIGlobalVariable>::const_iterator global_variable_iterator;
+  typedef SmallVectorImpl<DIGlobalVariable>::const_iterator
+      global_variable_iterator;
   typedef SmallVectorImpl<DIType>::const_iterator type_iterator;
   typedef SmallVectorImpl<DIScope>::const_iterator scope_iterator;
 
diff --git a/include/llvm/IR/DebugInfoFlags.def b/include/llvm/IR/DebugInfoFlags.def
new file mode 100644
index 0000000..d5de868
--- /dev/null
+++ b/include/llvm/IR/DebugInfoFlags.def
@@ -0,0 +1,36 @@
+//===- llvm/IR/DebugInfoFlags.def - Debug info flag definitions -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Macros for running through debug info flags.
+//
+//===----------------------------------------------------------------------===//
+
+// TODO: Add other DW-based macros.
+#ifndef HANDLE_DI_FLAG
+#error "Missing macro definition of HANDLE_DI_FLAG"
+#endif
+
+HANDLE_DI_FLAG(1, Private)
+HANDLE_DI_FLAG(2, Protected)
+HANDLE_DI_FLAG(3, Public)
+HANDLE_DI_FLAG((1 << 2), FwdDecl)
+HANDLE_DI_FLAG((1 << 3), AppleBlock)
+HANDLE_DI_FLAG((1 << 4), BlockByrefStruct)
+HANDLE_DI_FLAG((1 << 5), Virtual)
+HANDLE_DI_FLAG((1 << 6), Artificial)
+HANDLE_DI_FLAG((1 << 7), Explicit)
+HANDLE_DI_FLAG((1 << 8), Prototyped)
+HANDLE_DI_FLAG((1 << 9), ObjcClassComplete)
+HANDLE_DI_FLAG((1 << 10), ObjectPointer)
+HANDLE_DI_FLAG((1 << 11), Vector)
+HANDLE_DI_FLAG((1 << 12), StaticMember)
+HANDLE_DI_FLAG((1 << 13), LValueReference)
+HANDLE_DI_FLAG((1 << 14), RValueReference)
+
+#undef HANDLE_DI_FLAG
diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h
new file mode 100644
index 0000000..4534a14
--- /dev/null
+++ b/include/llvm/IR/DebugInfoMetadata.h
@@ -0,0 +1,1669 @@
+//===- llvm/IR/DebugInfoMetadata.h - Debug info metadata --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declarations for metadata specific to debug info.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_DEBUGINFOMETADATA_H
+#define LLVM_IR_DEBUGINFOMETADATA_H
+
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Dwarf.h"
+
+// Helper macros for defining get() overrides.
+#define DEFINE_MDNODE_GET_UNPACK_IMPL(...) __VA_ARGS__
+#define DEFINE_MDNODE_GET_UNPACK(ARGS) DEFINE_MDNODE_GET_UNPACK_IMPL ARGS
+#define DEFINE_MDNODE_GET(CLASS, FORMAL, ARGS)                                 \
+  static CLASS *get(LLVMContext &Context, DEFINE_MDNODE_GET_UNPACK(FORMAL)) {  \
+    return getImpl(Context, DEFINE_MDNODE_GET_UNPACK(ARGS), Uniqued);          \
+  }                                                                            \
+  static CLASS *getIfExists(LLVMContext &Context,                              \
+                            DEFINE_MDNODE_GET_UNPACK(FORMAL)) {                \
+    return getImpl(Context, DEFINE_MDNODE_GET_UNPACK(ARGS), Uniqued,           \
+                   /* ShouldCreate */ false);                                  \
+  }                                                                            \
+  static CLASS *getDistinct(LLVMContext &Context,                              \
+                            DEFINE_MDNODE_GET_UNPACK(FORMAL)) {                \
+    return getImpl(Context, DEFINE_MDNODE_GET_UNPACK(ARGS), Distinct);         \
+  }                                                                            \
+  static Temp##CLASS getTemporary(LLVMContext &Context,                        \
+                                  DEFINE_MDNODE_GET_UNPACK(FORMAL)) {          \
+    return Temp##CLASS(                                                        \
+        getImpl(Context, DEFINE_MDNODE_GET_UNPACK(ARGS), Temporary));          \
+  }
+
+namespace llvm {
+
+/// \brief Debug location.
+///
+/// A debug location in source code, used for debug info and otherwise.
+class MDLocation : public MDNode {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  MDLocation(LLVMContext &C, StorageType Storage, unsigned Line,
+             unsigned Column, ArrayRef<Metadata *> MDs);
+  ~MDLocation() { dropAllReferences(); }
+
+  static MDLocation *getImpl(LLVMContext &Context, unsigned Line,
+                             unsigned Column, Metadata *Scope,
+                             Metadata *InlinedAt, StorageType Storage,
+                             bool ShouldCreate = true);
+
+  TempMDLocation cloneImpl() const {
+    return getTemporary(getContext(), getLine(), getColumn(), getScope(),
+                        getInlinedAt());
+  }
+
+  // Disallow replacing operands.
+  void replaceOperandWith(unsigned I, Metadata *New) = delete;
+
+public:
+  DEFINE_MDNODE_GET(MDLocation,
+                    (unsigned Line, unsigned Column, Metadata *Scope,
+                     Metadata *InlinedAt = nullptr),
+                    (Line, Column, Scope, InlinedAt))
+
+  /// \brief Return a (temporary) clone of this.
+  TempMDLocation clone() const { return cloneImpl(); }
+
+  unsigned getLine() const { return SubclassData32; }
+  unsigned getColumn() const { return SubclassData16; }
+  Metadata *getScope() const { return getOperand(0); }
+  Metadata *getInlinedAt() const {
+    if (getNumOperands() == 2)
+      return getOperand(1);
+    return nullptr;
+  }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDLocationKind;
+  }
+};
+
+/// \brief Tagged DWARF-like metadata node.
+///
+/// A metadata node with a DWARF tag (i.e., a constant named \c DW_TAG_*,
+/// defined in llvm/Support/Dwarf.h).  Called \a DebugNode because it's
+/// potentially used for non-DWARF output.
+class DebugNode : public MDNode {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+protected:
+  DebugNode(LLVMContext &C, unsigned ID, StorageType Storage, unsigned Tag,
+            ArrayRef<Metadata *> Ops1, ArrayRef<Metadata *> Ops2 = None)
+      : MDNode(C, ID, Storage, Ops1, Ops2) {
+    assert(Tag < 1u << 16);
+    SubclassData16 = Tag;
+  }
+  ~DebugNode() {}
+
+  template <class Ty> Ty *getOperandAs(unsigned I) const {
+    return cast_or_null<Ty>(getOperand(I));
+  }
+
+  StringRef getStringOperand(unsigned I) const {
+    if (auto *S = getOperandAs<MDString>(I))
+      return S->getString();
+    return StringRef();
+  }
+
+  static MDString *getCanonicalMDString(LLVMContext &Context, StringRef S) {
+    if (S.empty())
+      return nullptr;
+    return MDString::get(Context, S);
+  }
+
+public:
+  unsigned getTag() const { return SubclassData16; }
+
+  static bool classof(const Metadata *MD) {
+    switch (MD->getMetadataID()) {
+    default:
+      return false;
+    case GenericDebugNodeKind:
+    case MDSubrangeKind:
+    case MDEnumeratorKind:
+    case MDBasicTypeKind:
+    case MDDerivedTypeKind:
+    case MDCompositeTypeKind:
+    case MDSubroutineTypeKind:
+    case MDFileKind:
+    case MDCompileUnitKind:
+    case MDSubprogramKind:
+    case MDLexicalBlockKind:
+    case MDLexicalBlockFileKind:
+    case MDNamespaceKind:
+    case MDTemplateTypeParameterKind:
+    case MDTemplateValueParameterKind:
+    case MDGlobalVariableKind:
+    case MDLocalVariableKind:
+    case MDExpressionKind:
+    case MDObjCPropertyKind:
+    case MDImportedEntityKind:
+      return true;
+    }
+  }
+};
+
+/// \brief Generic tagged DWARF-like metadata node.
+///
+/// An un-specialized DWARF-like metadata node.  The first operand is a
+/// (possibly empty) null-separated \a MDString header that contains arbitrary
+/// fields.  The remaining operands are \a dwarf_operands(), and are pointers
+/// to other metadata.
+class GenericDebugNode : public DebugNode {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  GenericDebugNode(LLVMContext &C, StorageType Storage, unsigned Hash,
+                   unsigned Tag, ArrayRef<Metadata *> Ops1,
+                   ArrayRef<Metadata *> Ops2)
+      : DebugNode(C, GenericDebugNodeKind, Storage, Tag, Ops1, Ops2) {
+    setHash(Hash);
+  }
+  ~GenericDebugNode() { dropAllReferences(); }
+
+  void setHash(unsigned Hash) { SubclassData32 = Hash; }
+  void recalculateHash();
+
+  static GenericDebugNode *getImpl(LLVMContext &Context, unsigned Tag,
+                                   StringRef Header,
+                                   ArrayRef<Metadata *> DwarfOps,
+                                   StorageType Storage,
+                                   bool ShouldCreate = true) {
+    return getImpl(Context, Tag, getCanonicalMDString(Context, Header),
+                   DwarfOps, Storage, ShouldCreate);
+  }
+
+  static GenericDebugNode *getImpl(LLVMContext &Context, unsigned Tag,
+                                   MDString *Header,
+                                   ArrayRef<Metadata *> DwarfOps,
+                                   StorageType Storage,
+                                   bool ShouldCreate = true);
+
+  TempGenericDebugNode cloneImpl() const {
+    return getTemporary(
+        getContext(), getTag(), getHeader(),
+        SmallVector<Metadata *, 4>(dwarf_op_begin(), dwarf_op_end()));
+  }
+
+public:
+  unsigned getHash() const { return SubclassData32; }
+
+  DEFINE_MDNODE_GET(GenericDebugNode, (unsigned Tag, StringRef Header,
+                                       ArrayRef<Metadata *> DwarfOps),
+                    (Tag, Header, DwarfOps))
+  DEFINE_MDNODE_GET(GenericDebugNode, (unsigned Tag, MDString *Header,
+                                       ArrayRef<Metadata *> DwarfOps),
+                    (Tag, Header, DwarfOps))
+
+  /// \brief Return a (temporary) clone of this.
+  TempGenericDebugNode clone() const { return cloneImpl(); }
+
+  unsigned getTag() const { return SubclassData16; }
+  StringRef getHeader() const { return getStringOperand(0); }
+
+  op_iterator dwarf_op_begin() const { return op_begin() + 1; }
+  op_iterator dwarf_op_end() const { return op_end(); }
+  op_range dwarf_operands() const {
+    return op_range(dwarf_op_begin(), dwarf_op_end());
+  }
+
+  unsigned getNumDwarfOperands() const { return getNumOperands() - 1; }
+  const MDOperand &getDwarfOperand(unsigned I) const {
+    return getOperand(I + 1);
+  }
+  void replaceDwarfOperandWith(unsigned I, Metadata *New) {
+    replaceOperandWith(I + 1, New);
+  }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == GenericDebugNodeKind;
+  }
+};
+
+/// \brief Array subrange.
+///
+/// TODO: Merge into node for DW_TAG_array_type, which should have a custom
+/// type.
+class MDSubrange : public DebugNode {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  int64_t Count;
+  int64_t Lo;
+
+  MDSubrange(LLVMContext &C, StorageType Storage, int64_t Count, int64_t Lo)
+      : DebugNode(C, MDSubrangeKind, Storage, dwarf::DW_TAG_subrange_type,
+                  None),
+        Count(Count), Lo(Lo) {}
+  ~MDSubrange() {}
+
+  static MDSubrange *getImpl(LLVMContext &Context, int64_t Count, int64_t Lo,
+                             StorageType Storage, bool ShouldCreate = true);
+
+  TempMDSubrange cloneImpl() const {
+    return getTemporary(getContext(), getCount(), getLo());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDSubrange, (int64_t Count, int64_t Lo = 0), (Count, Lo))
+
+  TempMDSubrange clone() const { return cloneImpl(); }
+
+  int64_t getLo() const { return Lo; }
+  int64_t getCount() const { return Count; }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDSubrangeKind;
+  }
+};
+
+/// \brief Enumeration value.
+///
+/// TODO: Add a pointer to the context (DW_TAG_enumeration_type) once that no
+/// longer creates a type cycle.
+class MDEnumerator : public DebugNode {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  int64_t Value;
+
+  MDEnumerator(LLVMContext &C, StorageType Storage, int64_t Value,
+               ArrayRef<Metadata *> Ops)
+      : DebugNode(C, MDEnumeratorKind, Storage, dwarf::DW_TAG_enumerator, Ops),
+        Value(Value) {}
+  ~MDEnumerator() {}
+
+  static MDEnumerator *getImpl(LLVMContext &Context, int64_t Value,
+                               StringRef Name, StorageType Storage,
+                               bool ShouldCreate = true) {
+    return getImpl(Context, Value, getCanonicalMDString(Context, Name), Storage,
+                   ShouldCreate);
+  }
+  static MDEnumerator *getImpl(LLVMContext &Context, int64_t Value,
+                               MDString *Name, StorageType Storage,
+                               bool ShouldCreate = true);
+
+  TempMDEnumerator cloneImpl() const {
+    return getTemporary(getContext(), getValue(), getName());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDEnumerator, (int64_t Value, StringRef Name),
+                    (Value, Name))
+  DEFINE_MDNODE_GET(MDEnumerator, (int64_t Value, MDString *Name),
+                    (Value, Name))
+
+  TempMDEnumerator clone() const { return cloneImpl(); }
+
+  int64_t getValue() const { return Value; }
+  StringRef getName() const { return getStringOperand(0); }
+
+  MDString *getRawName() const { return getOperandAs<MDString>(0); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDEnumeratorKind;
+  }
+};
+
+/// \brief Base class for scope-like contexts.
+///
+/// Base class for lexical scopes and types (which are also declaration
+/// contexts).
+///
+/// TODO: Separate the concepts of declaration contexts and lexical scopes.
+class MDScope : public DebugNode {
+protected:
+  MDScope(LLVMContext &C, unsigned ID, StorageType Storage, unsigned Tag,
+          ArrayRef<Metadata *> Ops)
+      : DebugNode(C, ID, Storage, Tag, Ops) {}
+  ~MDScope() {}
+
+public:
+  Metadata *getFile() const { return getOperand(0); }
+
+  static bool classof(const Metadata *MD) {
+    switch (MD->getMetadataID()) {
+    default:
+      return false;
+    case MDBasicTypeKind:
+    case MDDerivedTypeKind:
+    case MDCompositeTypeKind:
+    case MDSubroutineTypeKind:
+    case MDFileKind:
+    case MDCompileUnitKind:
+    case MDSubprogramKind:
+    case MDLexicalBlockKind:
+    case MDLexicalBlockFileKind:
+    case MDNamespaceKind:
+      return true;
+    }
+  }
+};
+
+/// \brief Base class for types.
+///
+/// TODO: Remove the hardcoded name and context, since many types don't use
+/// them.
+/// TODO: Split up flags.
+class MDType : public MDScope {
+  unsigned Line;
+  unsigned Flags;
+  uint64_t SizeInBits;
+  uint64_t AlignInBits;
+  uint64_t OffsetInBits;
+
+protected:
+  MDType(LLVMContext &C, unsigned ID, StorageType Storage, unsigned Tag,
+         unsigned Line, uint64_t SizeInBits, uint64_t AlignInBits,
+         uint64_t OffsetInBits, unsigned Flags, ArrayRef<Metadata *> Ops)
+      : MDScope(C, ID, Storage, Tag, Ops), Line(Line), Flags(Flags),
+        SizeInBits(SizeInBits), AlignInBits(AlignInBits),
+        OffsetInBits(OffsetInBits) {}
+  ~MDType() {}
+
+public:
+  unsigned getLine() const { return Line; }
+  uint64_t getSizeInBits() const { return SizeInBits; }
+  uint64_t getAlignInBits() const { return AlignInBits; }
+  uint64_t getOffsetInBits() const { return OffsetInBits; }
+  unsigned getFlags() const { return Flags; }
+
+  Metadata *getScope() const { return getOperand(1); }
+  StringRef getName() const { return getStringOperand(2); }
+
+  MDString *getRawName() const { return getOperandAs<MDString>(2); }
+
+  static bool classof(const Metadata *MD) {
+    switch (MD->getMetadataID()) {
+    default:
+      return false;
+    case MDBasicTypeKind:
+    case MDDerivedTypeKind:
+    case MDCompositeTypeKind:
+    case MDSubroutineTypeKind:
+      return true;
+    }
+  }
+};
+
+/// \brief Basic type.
+///
+/// TODO: Split out DW_TAG_unspecified_type.
+/// TODO: Drop unused accessors.
+class MDBasicType : public MDType {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  unsigned Encoding;
+
+  MDBasicType(LLVMContext &C, StorageType Storage, unsigned Tag,
+              uint64_t SizeInBits, uint64_t AlignInBits, unsigned Encoding,
+              ArrayRef<Metadata *> Ops)
+      : MDType(C, MDBasicTypeKind, Storage, Tag, 0, SizeInBits, AlignInBits, 0,
+               0, Ops),
+        Encoding(Encoding) {}
+  ~MDBasicType() {}
+
+  static MDBasicType *getImpl(LLVMContext &Context, unsigned Tag,
+                              StringRef Name, uint64_t SizeInBits,
+                              uint64_t AlignInBits, unsigned Encoding,
+                              StorageType Storage, bool ShouldCreate = true) {
+    return getImpl(Context, Tag, getCanonicalMDString(Context, Name),
+                   SizeInBits, AlignInBits, Encoding, Storage, ShouldCreate);
+  }
+  static MDBasicType *getImpl(LLVMContext &Context, unsigned Tag,
+                              MDString *Name, uint64_t SizeInBits,
+                              uint64_t AlignInBits, unsigned Encoding,
+                              StorageType Storage, bool ShouldCreate = true);
+
+  TempMDBasicType cloneImpl() const {
+    return getTemporary(getContext(), getTag(), getName(), getSizeInBits(),
+                        getAlignInBits(), getEncoding());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDBasicType,
+                    (unsigned Tag, StringRef Name, uint64_t SizeInBits,
+                     uint64_t AlignInBits, unsigned Encoding),
+                    (Tag, Name, SizeInBits, AlignInBits, Encoding))
+  DEFINE_MDNODE_GET(MDBasicType,
+                    (unsigned Tag, MDString *Name, uint64_t SizeInBits,
+                     uint64_t AlignInBits, unsigned Encoding),
+                    (Tag, Name, SizeInBits, AlignInBits, Encoding))
+
+  TempMDBasicType clone() const { return cloneImpl(); }
+
+  unsigned getEncoding() const { return Encoding; }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDBasicTypeKind;
+  }
+};
+
+/// \brief Base class for MDDerivedType and MDCompositeType.
+///
+/// TODO: Delete; they're not really related.
+class MDDerivedTypeBase : public MDType {
+protected:
+  MDDerivedTypeBase(LLVMContext &C, unsigned ID, StorageType Storage,
+                    unsigned Tag, unsigned Line, uint64_t SizeInBits,
+                    uint64_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
+                    ArrayRef<Metadata *> Ops)
+      : MDType(C, ID, Storage, Tag, Line, SizeInBits, AlignInBits, OffsetInBits,
+               Flags, Ops) {}
+  ~MDDerivedTypeBase() {}
+
+public:
+  Metadata *getBaseType() const { return getOperand(3); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDDerivedTypeKind ||
+           MD->getMetadataID() == MDCompositeTypeKind ||
+           MD->getMetadataID() == MDSubroutineTypeKind;
+  }
+};
+
+/// \brief Derived types.
+///
+/// This includes qualified types, pointers, references, friends, typedefs, and
+/// class members.
+///
+/// TODO: Split out members (inheritance, fields, methods, etc.).
+class MDDerivedType : public MDDerivedTypeBase {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  MDDerivedType(LLVMContext &C, StorageType Storage, unsigned Tag,
+                unsigned Line, uint64_t SizeInBits, uint64_t AlignInBits,
+                uint64_t OffsetInBits, unsigned Flags, ArrayRef<Metadata *> Ops)
+      : MDDerivedTypeBase(C, MDDerivedTypeKind, Storage, Tag, Line, SizeInBits,
+                          AlignInBits, OffsetInBits, Flags, Ops) {}
+  ~MDDerivedType() {}
+
+  static MDDerivedType *getImpl(LLVMContext &Context, unsigned Tag,
+                                StringRef Name, Metadata *File, unsigned Line,
+                                Metadata *Scope, Metadata *BaseType,
+                                uint64_t SizeInBits, uint64_t AlignInBits,
+                                uint64_t OffsetInBits, unsigned Flags,
+                                Metadata *ExtraData, StorageType Storage,
+                                bool ShouldCreate = true) {
+    return getImpl(Context, Tag, getCanonicalMDString(Context, Name), File,
+                   Line, Scope, BaseType, SizeInBits, AlignInBits, OffsetInBits,
+                   Flags, ExtraData, Storage, ShouldCreate);
+  }
+  static MDDerivedType *getImpl(LLVMContext &Context, unsigned Tag,
+                                MDString *Name, Metadata *File, unsigned Line,
+                                Metadata *Scope, Metadata *BaseType,
+                                uint64_t SizeInBits, uint64_t AlignInBits,
+                                uint64_t OffsetInBits, unsigned Flags,
+                                Metadata *ExtraData, StorageType Storage,
+                                bool ShouldCreate = true);
+
+  TempMDDerivedType cloneImpl() const {
+    return getTemporary(getContext(), getTag(), getName(), getFile(), getLine(),
+                        getScope(), getBaseType(), getSizeInBits(),
+                        getAlignInBits(), getOffsetInBits(), getFlags(),
+                        getExtraData());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDDerivedType,
+                    (unsigned Tag, MDString *Name, Metadata *File,
+                     unsigned Line, Metadata *Scope, Metadata *BaseType,
+                     uint64_t SizeInBits, uint64_t AlignInBits,
+                     uint64_t OffsetInBits, unsigned Flags,
+                     Metadata *ExtraData = nullptr),
+                    (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                     AlignInBits, OffsetInBits, Flags, ExtraData))
+  DEFINE_MDNODE_GET(MDDerivedType,
+                    (unsigned Tag, StringRef Name, Metadata *File,
+                     unsigned Line, Metadata *Scope, Metadata *BaseType,
+                     uint64_t SizeInBits, uint64_t AlignInBits,
+                     uint64_t OffsetInBits, unsigned Flags,
+                     Metadata *ExtraData = nullptr),
+                    (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                     AlignInBits, OffsetInBits, Flags, ExtraData))
+
+  TempMDDerivedType clone() const { return cloneImpl(); }
+
+  /// \brief Get extra data associated with this derived type.
+  ///
+  /// Class type for pointer-to-members, objective-c property node for ivars,
+  /// or global constant wrapper for static members.
+  ///
+  /// TODO: Separate out types that need this extra operand: pointer-to-member
+  /// types and member fields (static members and ivars).
+  Metadata *getExtraData() const { return getOperand(4); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDDerivedTypeKind;
+  }
+};
+
+/// \brief Base class for MDCompositeType and MDSubroutineType.
+///
+/// TODO: Delete; they're not really related.
+class MDCompositeTypeBase : public MDDerivedTypeBase {
+  unsigned RuntimeLang;
+
+protected:
+  MDCompositeTypeBase(LLVMContext &C, unsigned ID, StorageType Storage,
+                      unsigned Tag, unsigned Line, unsigned RuntimeLang,
+                      uint64_t SizeInBits, uint64_t AlignInBits,
+                      uint64_t OffsetInBits, unsigned Flags,
+                      ArrayRef<Metadata *> Ops)
+      : MDDerivedTypeBase(C, ID, Storage, Tag, Line, SizeInBits, AlignInBits,
+                          OffsetInBits, Flags, Ops),
+        RuntimeLang(RuntimeLang) {}
+  ~MDCompositeTypeBase() {}
+
+public:
+  Metadata *getElements() const { return getOperand(4); }
+  Metadata *getVTableHolder() const { return getOperand(5); }
+  Metadata *getTemplateParams() const { return getOperand(6); }
+  StringRef getIdentifier() const { return getStringOperand(7); }
+  unsigned getRuntimeLang() const { return RuntimeLang; }
+
+  MDString *getRawIdentifier() const { return getOperandAs<MDString>(7); }
+
+  /// \brief Replace operands.
+  ///
+  /// If this \a isUniqued() and not \a isResolved(), on a uniquing collision
+  /// this will be RAUW'ed and deleted.  Use a \a TrackingMDRef to keep track
+  /// of its movement if necessary.
+  /// @{
+  void replaceElements(MDTuple *Elements) {
+#ifndef NDEBUG
+    if (auto *Old = cast_or_null<MDTuple>(getElements()))
+      for (const auto &Op : Old->operands())
+        assert(std::find(Elements->op_begin(), Elements->op_end(), Op) &&
+               "Lost a member during member list replacement");
+#endif
+    replaceOperandWith(4, Elements);
+  }
+  void replaceVTableHolder(Metadata *VTableHolder) {
+    replaceOperandWith(5, VTableHolder);
+  }
+  void replaceTemplateParams(MDTuple *TemplateParams) {
+    replaceOperandWith(6, TemplateParams);
+  }
+  /// @}
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDCompositeTypeKind ||
+           MD->getMetadataID() == MDSubroutineTypeKind;
+  }
+};
+
+/// \brief Composite types.
+///
+/// TODO: Detach from DerivedTypeBase (split out MDEnumType?).
+/// TODO: Create a custom, unrelated node for DW_TAG_array_type.
+class MDCompositeType : public MDCompositeTypeBase {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  MDCompositeType(LLVMContext &C, StorageType Storage, unsigned Tag,
+                  unsigned Line, unsigned RuntimeLang, uint64_t SizeInBits,
+                  uint64_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
+                  ArrayRef<Metadata *> Ops)
+      : MDCompositeTypeBase(C, MDCompositeTypeKind, Storage, Tag, Line,
+                            RuntimeLang, SizeInBits, AlignInBits, OffsetInBits,
+                            Flags, Ops) {}
+  ~MDCompositeType() {}
+
+  static MDCompositeType *
+  getImpl(LLVMContext &Context, unsigned Tag, StringRef Name, Metadata *File,
+          unsigned Line, Metadata *Scope, Metadata *BaseType,
+          uint64_t SizeInBits, uint64_t AlignInBits, uint64_t OffsetInBits,
+          uint64_t Flags, Metadata *Elements, unsigned RuntimeLang,
+          Metadata *VTableHolder, Metadata *TemplateParams,
+          StringRef Identifier, StorageType Storage, bool ShouldCreate = true) {
+    return getImpl(Context, Tag, getCanonicalMDString(Context, Name), File,
+                   Line, Scope, BaseType, SizeInBits, AlignInBits, OffsetInBits,
+                   Flags, Elements, RuntimeLang, VTableHolder, TemplateParams,
+                   getCanonicalMDString(Context, Identifier), Storage,
+                   ShouldCreate);
+  }
+  static MDCompositeType *
+  getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
+          unsigned Line, Metadata *Scope, Metadata *BaseType,
+          uint64_t SizeInBits, uint64_t AlignInBits, uint64_t OffsetInBits,
+          unsigned Flags, Metadata *Elements, unsigned RuntimeLang,
+          Metadata *VTableHolder, Metadata *TemplateParams,
+          MDString *Identifier, StorageType Storage, bool ShouldCreate = true);
+
+  TempMDCompositeType cloneImpl() const {
+    return getTemporary(getContext(), getTag(), getName(), getFile(), getLine(),
+                        getScope(), getBaseType(), getSizeInBits(),
+                        getAlignInBits(), getOffsetInBits(), getFlags(),
+                        getElements(), getRuntimeLang(), getVTableHolder(),
+                        getTemplateParams(), getIdentifier());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDCompositeType,
+                    (unsigned Tag, StringRef Name, Metadata *File,
+                     unsigned Line, Metadata *Scope, Metadata *BaseType,
+                     uint64_t SizeInBits, uint64_t AlignInBits,
+                     uint64_t OffsetInBits, unsigned Flags, Metadata *Elements,
+                     unsigned RuntimeLang, Metadata *VTableHolder,
+                     Metadata *TemplateParams = nullptr,
+                     StringRef Identifier = ""),
+                    (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                     AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
+                     VTableHolder, TemplateParams, Identifier))
+  DEFINE_MDNODE_GET(MDCompositeType,
+                    (unsigned Tag, MDString *Name, Metadata *File,
+                     unsigned Line, Metadata *Scope, Metadata *BaseType,
+                     uint64_t SizeInBits, uint64_t AlignInBits,
+                     uint64_t OffsetInBits, unsigned Flags, Metadata *Elements,
+                     unsigned RuntimeLang, Metadata *VTableHolder,
+                     Metadata *TemplateParams = nullptr,
+                     MDString *Identifier = nullptr),
+                    (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                     AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
+                     VTableHolder, TemplateParams, Identifier))
+
+  TempMDCompositeType clone() const { return cloneImpl(); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDCompositeTypeKind;
+  }
+};
+
+/// \brief Type array for a subprogram.
+///
+/// TODO: Detach from CompositeType, and fold the array of types in directly
+/// as operands.
+class MDSubroutineType : public MDCompositeTypeBase {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  MDSubroutineType(LLVMContext &C, StorageType Storage, unsigned Flags,
+                   ArrayRef<Metadata *> Ops)
+      : MDCompositeTypeBase(C, MDSubroutineTypeKind, Storage,
+                            dwarf::DW_TAG_subroutine_type, 0, 0, 0, 0, 0, Flags,
+                            Ops) {}
+  ~MDSubroutineType() {}
+
+  static MDSubroutineType *getImpl(LLVMContext &Context, unsigned Flags,
+                                   Metadata *TypeArray, StorageType Storage,
+                                   bool ShouldCreate = true);
+
+  TempMDSubroutineType cloneImpl() const {
+    return getTemporary(getContext(), getFlags(), getTypeArray());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDSubroutineType, (unsigned Flags, Metadata *TypeArray),
+                    (Flags, TypeArray))
+
+  TempMDSubroutineType clone() const { return cloneImpl(); }
+
+  Metadata *getTypeArray() const { return getElements(); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDSubroutineTypeKind;
+  }
+};
+
+/// \brief File.
+///
+/// TODO: Merge with directory/file node (including users).
+/// TODO: Canonicalize paths on creation.
+class MDFile : public MDScope {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  MDFile(LLVMContext &C, StorageType Storage, ArrayRef<Metadata *> Ops)
+      : MDScope(C, MDFileKind, Storage, dwarf::DW_TAG_file_type, Ops) {}
+  ~MDFile() {}
+
+  static MDFile *getImpl(LLVMContext &Context, StringRef Filename,
+                         StringRef Directory, StorageType Storage,
+                         bool ShouldCreate = true) {
+    return getImpl(Context, getCanonicalMDString(Context, Filename),
+                   getCanonicalMDString(Context, Directory), Storage,
+                   ShouldCreate);
+  }
+  static MDFile *getImpl(LLVMContext &Context, MDString *Filename,
+                         MDString *Directory, StorageType Storage,
+                         bool ShouldCreate = true);
+
+  TempMDFile cloneImpl() const {
+    return getTemporary(getContext(), getFilename(), getDirectory());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDFile, (StringRef Filename, StringRef Directory),
+                    (Filename, Directory))
+  DEFINE_MDNODE_GET(MDFile, (MDString * Filename, MDString *Directory),
+                    (Filename, Directory))
+
+  TempMDFile clone() const { return cloneImpl(); }
+
+  StringRef getFilename() const { return getStringOperand(0); }
+  StringRef getDirectory() const { return getStringOperand(1); }
+
+  MDString *getRawFilename() const { return getOperandAs<MDString>(0); }
+  MDString *getRawDirectory() const { return getOperandAs<MDString>(1); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDFileKind;
+  }
+};
+
+/// \brief Compile unit.
+class MDCompileUnit : public MDScope {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  unsigned SourceLanguage;
+  bool IsOptimized;
+  unsigned RuntimeVersion;
+  unsigned EmissionKind;
+
+  MDCompileUnit(LLVMContext &C, StorageType Storage, unsigned SourceLanguage,
+                bool IsOptimized, unsigned RuntimeVersion,
+                unsigned EmissionKind, ArrayRef<Metadata *> Ops)
+      : MDScope(C, MDCompileUnitKind, Storage, dwarf::DW_TAG_compile_unit, Ops),
+        SourceLanguage(SourceLanguage), IsOptimized(IsOptimized),
+        RuntimeVersion(RuntimeVersion), EmissionKind(EmissionKind) {}
+  ~MDCompileUnit() {}
+
+  static MDCompileUnit *
+  getImpl(LLVMContext &Context, unsigned SourceLanguage, Metadata *File,
+          StringRef Producer, bool IsOptimized, StringRef Flags,
+          unsigned RuntimeVersion, StringRef SplitDebugFilename,
+          unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
+          Metadata *Subprograms, Metadata *GlobalVariables,
+          Metadata *ImportedEntities, StorageType Storage,
+          bool ShouldCreate = true) {
+    return getImpl(Context, SourceLanguage, File,
+                   getCanonicalMDString(Context, Producer), IsOptimized,
+                   getCanonicalMDString(Context, Flags), RuntimeVersion,
+                   getCanonicalMDString(Context, SplitDebugFilename),
+                   EmissionKind, EnumTypes, RetainedTypes, Subprograms,
+                   GlobalVariables, ImportedEntities, Storage, ShouldCreate);
+  }
+  static MDCompileUnit *
+  getImpl(LLVMContext &Context, unsigned SourceLanguage, Metadata *File,
+          MDString *Producer, bool IsOptimized, MDString *Flags,
+          unsigned RuntimeVersion, MDString *SplitDebugFilename,
+          unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
+          Metadata *Subprograms, Metadata *GlobalVariables,
+          Metadata *ImportedEntities, StorageType Storage,
+          bool ShouldCreate = true);
+
+  TempMDCompileUnit cloneImpl() const {
+    return getTemporary(
+        getContext(), getSourceLanguage(), getFile(), getProducer(),
+        isOptimized(), getFlags(), getRuntimeVersion(), getSplitDebugFilename(),
+        getEmissionKind(), getEnumTypes(), getRetainedTypes(), getSubprograms(),
+        getGlobalVariables(), getImportedEntities());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDCompileUnit,
+                    (unsigned SourceLanguage, Metadata *File,
+                     StringRef Producer, bool IsOptimized, StringRef Flags,
+                     unsigned RuntimeVersion, StringRef SplitDebugFilename,
+                     unsigned EmissionKind, Metadata *EnumTypes,
+                     Metadata *RetainedTypes, Metadata *Subprograms,
+                     Metadata *GlobalVariables, Metadata *ImportedEntities),
+                    (SourceLanguage, File, Producer, IsOptimized, Flags,
+                     RuntimeVersion, SplitDebugFilename, EmissionKind,
+                     EnumTypes, RetainedTypes, Subprograms, GlobalVariables,
+                     ImportedEntities))
+  DEFINE_MDNODE_GET(MDCompileUnit,
+                    (unsigned SourceLanguage, Metadata *File,
+                     MDString *Producer, bool IsOptimized, MDString *Flags,
+                     unsigned RuntimeVersion, MDString *SplitDebugFilename,
+                     unsigned EmissionKind, Metadata *EnumTypes,
+                     Metadata *RetainedTypes, Metadata *Subprograms,
+                     Metadata *GlobalVariables, Metadata *ImportedEntities),
+                    (SourceLanguage, File, Producer, IsOptimized, Flags,
+                     RuntimeVersion, SplitDebugFilename, EmissionKind,
+                     EnumTypes, RetainedTypes, Subprograms, GlobalVariables,
+                     ImportedEntities))
+
+  TempMDCompileUnit clone() const { return cloneImpl(); }
+
+  unsigned getSourceLanguage() const { return SourceLanguage; }
+  bool isOptimized() const { return IsOptimized; }
+  unsigned getRuntimeVersion() const { return RuntimeVersion; }
+  unsigned getEmissionKind() const { return EmissionKind; }
+  StringRef getProducer() const { return getStringOperand(1); }
+  StringRef getFlags() const { return getStringOperand(2); }
+  StringRef getSplitDebugFilename() const { return getStringOperand(3); }
+  Metadata *getEnumTypes() const { return getOperand(4); }
+  Metadata *getRetainedTypes() const { return getOperand(5); }
+  Metadata *getSubprograms() const { return getOperand(6); }
+  Metadata *getGlobalVariables() const { return getOperand(7); }
+  Metadata *getImportedEntities() const { return getOperand(8); }
+
+  MDString *getRawProducer() const { return getOperandAs<MDString>(1); }
+  MDString *getRawFlags() const { return getOperandAs<MDString>(2); }
+  MDString *getRawSplitDebugFilename() const {
+    return getOperandAs<MDString>(3);
+  }
+
+  /// \brief Replace arrays.
+  ///
+  /// If this \a isUniqued() and not \a isResolved(), it will be RAUW'ed and
+  /// deleted on a uniquing collision.  In practice, uniquing collisions on \a
+  /// MDCompileUnit should be fairly rare.
+  /// @{
+  void replaceSubprograms(MDTuple *N) { replaceOperandWith(6, N); }
+  void replaceGlobalVariables(MDTuple *N) { replaceOperandWith(7, N); }
+  /// @}
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDCompileUnitKind;
+  }
+};
+
+/// \brief Subprogram description.
+///
+/// TODO: Remove DisplayName.  It's always equal to Name.
+/// TODO: Split up flags.
+class MDSubprogram : public MDScope {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  unsigned Line;
+  unsigned ScopeLine;
+  unsigned Virtuality;
+  unsigned VirtualIndex;
+  unsigned Flags;
+  bool IsLocalToUnit;
+  bool IsDefinition;
+  bool IsOptimized;
+
+  MDSubprogram(LLVMContext &C, StorageType Storage, unsigned Line,
+               unsigned ScopeLine, unsigned Virtuality, unsigned VirtualIndex,
+               unsigned Flags, bool IsLocalToUnit, bool IsDefinition,
+               bool IsOptimized, ArrayRef<Metadata *> Ops)
+      : MDScope(C, MDSubprogramKind, Storage, dwarf::DW_TAG_subprogram, Ops),
+        Line(Line), ScopeLine(ScopeLine), Virtuality(Virtuality),
+        VirtualIndex(VirtualIndex), Flags(Flags), IsLocalToUnit(IsLocalToUnit),
+        IsDefinition(IsDefinition), IsOptimized(IsOptimized) {}
+  ~MDSubprogram() {}
+
+  static MDSubprogram *
+  getImpl(LLVMContext &Context, Metadata *Scope, StringRef Name,
+          StringRef LinkageName, Metadata *File, unsigned Line, Metadata *Type,
+          bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
+          Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex,
+          unsigned Flags, bool IsOptimized, Metadata *Function,
+          Metadata *TemplateParams, Metadata *Declaration, Metadata *Variables,
+          StorageType Storage, bool ShouldCreate = true) {
+    return getImpl(Context, Scope, getCanonicalMDString(Context, Name),
+                   getCanonicalMDString(Context, LinkageName), File, Line, Type,
+                   IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
+                   Virtuality, VirtualIndex, Flags, IsOptimized, Function,
+                   TemplateParams, Declaration, Variables, Storage,
+                   ShouldCreate);
+  }
+  static MDSubprogram *
+  getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
+          MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type,
+          bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
+          Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex,
+          unsigned Flags, bool IsOptimized, Metadata *Function,
+          Metadata *TemplateParams, Metadata *Declaration, Metadata *Variables,
+          StorageType Storage, bool ShouldCreate = true);
+
+  TempMDSubprogram cloneImpl() const {
+    return getTemporary(getContext(), getScope(), getName(), getLinkageName(),
+                        getFile(), getLine(), getType(), isLocalToUnit(),
+                        isDefinition(), getScopeLine(), getContainingType(),
+                        getVirtuality(), getVirtualIndex(), getFlags(),
+                        isOptimized(), getFunction(), getTemplateParams(),
+                        getDeclaration(), getVariables());
+  }
+
+public:
+  DEFINE_MDNODE_GET(
+      MDSubprogram,
+      (Metadata * Scope, StringRef Name, StringRef LinkageName, Metadata *File,
+       unsigned Line, Metadata *Type, bool IsLocalToUnit, bool IsDefinition,
+       unsigned ScopeLine, Metadata *ContainingType, unsigned Virtuality,
+       unsigned VirtualIndex, unsigned Flags, bool IsOptimized,
+       Metadata *Function = nullptr, Metadata *TemplateParams = nullptr,
+       Metadata *Declaration = nullptr, Metadata *Variables = nullptr),
+      (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition,
+       ScopeLine, ContainingType, Virtuality, VirtualIndex, Flags, IsOptimized,
+       Function, TemplateParams, Declaration, Variables))
+  DEFINE_MDNODE_GET(
+      MDSubprogram,
+      (Metadata * Scope, MDString *Name, MDString *LinkageName, Metadata *File,
+       unsigned Line, Metadata *Type, bool IsLocalToUnit, bool IsDefinition,
+       unsigned ScopeLine, Metadata *ContainingType, unsigned Virtuality,
+       unsigned VirtualIndex, unsigned Flags, bool IsOptimized,
+       Metadata *Function = nullptr, Metadata *TemplateParams = nullptr,
+       Metadata *Declaration = nullptr, Metadata *Variables = nullptr),
+      (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition,
+       ScopeLine, ContainingType, Virtuality, VirtualIndex, Flags, IsOptimized,
+       Function, TemplateParams, Declaration, Variables))
+
+  TempMDSubprogram clone() const { return cloneImpl(); }
+
+public:
+  unsigned getLine() const { return Line; }
+  unsigned getVirtuality() const { return Virtuality; }
+  unsigned getVirtualIndex() const { return VirtualIndex; }
+  unsigned getScopeLine() const { return ScopeLine; }
+  unsigned getFlags() const { return Flags; }
+  bool isLocalToUnit() const { return IsLocalToUnit; }
+  bool isDefinition() const { return IsDefinition; }
+  bool isOptimized() const { return IsOptimized; }
+
+  Metadata *getScope() const { return getOperand(1); }
+
+  StringRef getName() const { return getStringOperand(2); }
+  StringRef getDisplayName() const { return getStringOperand(3); }
+  StringRef getLinkageName() const { return getStringOperand(4); }
+
+  MDString *getRawName() const { return getOperandAs<MDString>(2); }
+  MDString *getRawLinkageName() const { return getOperandAs<MDString>(4); }
+
+  Metadata *getType() const { return getOperand(5); }
+  Metadata *getContainingType() const { return getOperand(6); }
+
+  Metadata *getFunction() const { return getOperand(7); }
+  Metadata *getTemplateParams() const { return getOperand(8); }
+  Metadata *getDeclaration() const { return getOperand(9); }
+  Metadata *getVariables() const { return getOperand(10); }
+
+  /// \brief Replace the function.
+  ///
+  /// If \a isUniqued() and not \a isResolved(), this could node will be
+  /// RAUW'ed and deleted out from under the caller.  Use a \a TrackingMDRef if
+  /// that's a problem.
+  /// @{
+  void replaceFunction(Function *F);
+  void replaceFunction(ConstantAsMetadata *MD) { replaceOperandWith(7, MD); }
+  void replaceFunction(std::nullptr_t) { replaceOperandWith(7, nullptr); }
+  /// @}
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDSubprogramKind;
+  }
+};
+
+class MDLexicalBlockBase : public MDScope {
+protected:
+  MDLexicalBlockBase(LLVMContext &C, unsigned ID, StorageType Storage,
+                     ArrayRef<Metadata *> Ops)
+      : MDScope(C, ID, Storage, dwarf::DW_TAG_lexical_block, Ops) {}
+  ~MDLexicalBlockBase() {}
+
+public:
+  Metadata *getScope() const { return getOperand(1); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDLexicalBlockKind ||
+           MD->getMetadataID() == MDLexicalBlockFileKind;
+  }
+};
+
+class MDLexicalBlock : public MDLexicalBlockBase {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  unsigned Line;
+  unsigned Column;
+
+  MDLexicalBlock(LLVMContext &C, StorageType Storage, unsigned Line,
+                 unsigned Column, ArrayRef<Metadata *> Ops)
+      : MDLexicalBlockBase(C, MDLexicalBlockKind, Storage, Ops), Line(Line),
+        Column(Column) {}
+  ~MDLexicalBlock() {}
+
+  static MDLexicalBlock *getImpl(LLVMContext &Context, Metadata *Scope,
+                                 Metadata *File, unsigned Line, unsigned Column,
+                                 StorageType Storage, bool ShouldCreate = true);
+
+  TempMDLexicalBlock cloneImpl() const {
+    return getTemporary(getContext(), getScope(), getFile(), getLine(),
+                        getColumn());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDLexicalBlock, (Metadata * Scope, Metadata *File,
+                                     unsigned Line, unsigned Column),
+                    (Scope, File, Line, Column))
+
+  TempMDLexicalBlock clone() const { return cloneImpl(); }
+
+  unsigned getLine() const { return Line; }
+  unsigned getColumn() const { return Column; }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDLexicalBlockKind;
+  }
+};
+
+class MDLexicalBlockFile : public MDLexicalBlockBase {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  unsigned Discriminator;
+
+  MDLexicalBlockFile(LLVMContext &C, StorageType Storage,
+                     unsigned Discriminator, ArrayRef<Metadata *> Ops)
+      : MDLexicalBlockBase(C, MDLexicalBlockFileKind, Storage, Ops),
+        Discriminator(Discriminator) {}
+  ~MDLexicalBlockFile() {}
+
+  static MDLexicalBlockFile *getImpl(LLVMContext &Context, Metadata *Scope,
+                                     Metadata *File, unsigned Discriminator,
+                                     StorageType Storage,
+                                     bool ShouldCreate = true);
+
+  TempMDLexicalBlockFile cloneImpl() const {
+    return getTemporary(getContext(), getScope(), getFile(),
+                        getDiscriminator());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDLexicalBlockFile,
+                    (Metadata * Scope, Metadata *File, unsigned Discriminator),
+                    (Scope, File, Discriminator))
+
+  TempMDLexicalBlockFile clone() const { return cloneImpl(); }
+
+  unsigned getDiscriminator() const { return Discriminator; }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDLexicalBlockFileKind;
+  }
+};
+
+class MDNamespace : public MDScope {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  unsigned Line;
+
+  MDNamespace(LLVMContext &Context, StorageType Storage, unsigned Line,
+              ArrayRef<Metadata *> Ops)
+      : MDScope(Context, MDNamespaceKind, Storage, dwarf::DW_TAG_namespace,
+                Ops),
+        Line(Line) {}
+  ~MDNamespace() {}
+
+  static MDNamespace *getImpl(LLVMContext &Context, Metadata *Scope,
+                              Metadata *File, StringRef Name, unsigned Line,
+                              StorageType Storage, bool ShouldCreate = true) {
+    return getImpl(Context, Scope, File, getCanonicalMDString(Context, Name),
+                   Line, Storage, ShouldCreate);
+  }
+  static MDNamespace *getImpl(LLVMContext &Context, Metadata *Scope,
+                              Metadata *File, MDString *Name, unsigned Line,
+                              StorageType Storage, bool ShouldCreate = true);
+
+  TempMDNamespace cloneImpl() const {
+    return getTemporary(getContext(), getScope(), getFile(), getName(),
+                        getLine());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDNamespace, (Metadata * Scope, Metadata *File,
+                                  StringRef Name, unsigned Line),
+                    (Scope, File, Name, Line))
+  DEFINE_MDNODE_GET(MDNamespace, (Metadata * Scope, Metadata *File,
+                                  MDString *Name, unsigned Line),
+                    (Scope, File, Name, Line))
+
+  TempMDNamespace clone() const { return cloneImpl(); }
+
+  unsigned getLine() const { return Line; }
+  Metadata *getScope() const { return getOperand(1); }
+  StringRef getName() const { return getStringOperand(2); }
+
+  MDString *getRawName() const { return getOperandAs<MDString>(2); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDNamespaceKind;
+  }
+};
+
+/// \brief Base class for template parameters.
+class MDTemplateParameter : public DebugNode {
+protected:
+  MDTemplateParameter(LLVMContext &Context, unsigned ID, StorageType Storage,
+                      unsigned Tag, ArrayRef<Metadata *> Ops)
+      : DebugNode(Context, ID, Storage, Tag, Ops) {}
+  ~MDTemplateParameter() {}
+
+public:
+  StringRef getName() const { return getStringOperand(0); }
+  Metadata *getType() const { return getOperand(1); }
+
+  MDString *getRawName() const { return getOperandAs<MDString>(0); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDTemplateTypeParameterKind ||
+           MD->getMetadataID() == MDTemplateValueParameterKind;
+  }
+};
+
+class MDTemplateTypeParameter : public MDTemplateParameter {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  MDTemplateTypeParameter(LLVMContext &Context, StorageType Storage,
+                          ArrayRef<Metadata *> Ops)
+      : MDTemplateParameter(Context, MDTemplateTypeParameterKind, Storage,
+                            dwarf::DW_TAG_template_type_parameter, Ops) {}
+  ~MDTemplateTypeParameter() {}
+
+  static MDTemplateTypeParameter *getImpl(LLVMContext &Context, StringRef Name,
+                                          Metadata *Type, StorageType Storage,
+                                          bool ShouldCreate = true) {
+    return getImpl(Context, getCanonicalMDString(Context, Name), Type, Storage,
+                   ShouldCreate);
+  }
+  static MDTemplateTypeParameter *getImpl(LLVMContext &Context, MDString *Name,
+                                          Metadata *Type, StorageType Storage,
+                                          bool ShouldCreate = true);
+
+  TempMDTemplateTypeParameter cloneImpl() const {
+    return getTemporary(getContext(), getName(), getType());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDTemplateTypeParameter, (StringRef Name, Metadata *Type),
+                    (Name, Type))
+  DEFINE_MDNODE_GET(MDTemplateTypeParameter, (MDString * Name, Metadata *Type),
+                    (Name, Type))
+
+  TempMDTemplateTypeParameter clone() const { return cloneImpl(); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDTemplateTypeParameterKind;
+  }
+};
+
+class MDTemplateValueParameter : public MDTemplateParameter {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  MDTemplateValueParameter(LLVMContext &Context, StorageType Storage,
+                           unsigned Tag, ArrayRef<Metadata *> Ops)
+      : MDTemplateParameter(Context, MDTemplateValueParameterKind, Storage, Tag,
+                            Ops) {}
+  ~MDTemplateValueParameter() {}
+
+  static MDTemplateValueParameter *getImpl(LLVMContext &Context, unsigned Tag,
+                                           StringRef Name, Metadata *Type,
+                                           Metadata *Value, StorageType Storage,
+                                           bool ShouldCreate = true) {
+    return getImpl(Context, Tag, getCanonicalMDString(Context, Name), Type,
+                   Value, Storage, ShouldCreate);
+  }
+  static MDTemplateValueParameter *getImpl(LLVMContext &Context, unsigned Tag,
+                                           MDString *Name, Metadata *Type,
+                                           Metadata *Value, StorageType Storage,
+                                           bool ShouldCreate = true);
+
+  TempMDTemplateValueParameter cloneImpl() const {
+    return getTemporary(getContext(), getTag(), getName(), getType(),
+                        getValue());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDTemplateValueParameter, (unsigned Tag, StringRef Name,
+                                               Metadata *Type, Metadata *Value),
+                    (Tag, Name, Type, Value))
+  DEFINE_MDNODE_GET(MDTemplateValueParameter, (unsigned Tag, MDString *Name,
+                                               Metadata *Type, Metadata *Value),
+                    (Tag, Name, Type, Value))
+
+  TempMDTemplateValueParameter clone() const { return cloneImpl(); }
+
+  Metadata *getValue() const { return getOperand(2); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDTemplateValueParameterKind;
+  }
+};
+
+/// \brief Base class for variables.
+///
+/// TODO: Hardcode to DW_TAG_variable.
+class MDVariable : public DebugNode {
+  unsigned Line;
+
+protected:
+  MDVariable(LLVMContext &C, unsigned ID, StorageType Storage, unsigned Tag,
+             unsigned Line, ArrayRef<Metadata *> Ops)
+      : DebugNode(C, ID, Storage, Tag, Ops), Line(Line) {}
+  ~MDVariable() {}
+
+public:
+  unsigned getLine() const { return Line; }
+  Metadata *getScope() const { return getOperand(0); }
+  StringRef getName() const { return getStringOperand(1); }
+  Metadata *getFile() const { return getOperand(2); }
+  Metadata *getType() const { return getOperand(3); }
+
+  MDString *getRawName() const { return getOperandAs<MDString>(1); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDLocalVariableKind ||
+           MD->getMetadataID() == MDGlobalVariableKind;
+  }
+};
+
+/// \brief Global variables.
+///
+/// TODO: Remove DisplayName.  It's always equal to Name.
+class MDGlobalVariable : public MDVariable {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  bool IsLocalToUnit;
+  bool IsDefinition;
+
+  MDGlobalVariable(LLVMContext &C, StorageType Storage, unsigned Line,
+                   bool IsLocalToUnit, bool IsDefinition,
+                   ArrayRef<Metadata *> Ops)
+      : MDVariable(C, MDGlobalVariableKind, Storage, dwarf::DW_TAG_variable,
+                   Line, Ops),
+        IsLocalToUnit(IsLocalToUnit), IsDefinition(IsDefinition) {}
+  ~MDGlobalVariable() {}
+
+  static MDGlobalVariable *
+  getImpl(LLVMContext &Context, Metadata *Scope, StringRef Name,
+          StringRef LinkageName, Metadata *File, unsigned Line, Metadata *Type,
+          bool IsLocalToUnit, bool IsDefinition, Metadata *Variable,
+          Metadata *StaticDataMemberDeclaration, StorageType Storage,
+          bool ShouldCreate = true) {
+    return getImpl(Context, Scope, getCanonicalMDString(Context, Name),
+                   getCanonicalMDString(Context, LinkageName), File, Line, Type,
+                   IsLocalToUnit, IsDefinition, Variable,
+                   StaticDataMemberDeclaration, Storage, ShouldCreate);
+  }
+  static MDGlobalVariable *
+  getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
+          MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type,
+          bool IsLocalToUnit, bool IsDefinition, Metadata *Variable,
+          Metadata *StaticDataMemberDeclaration, StorageType Storage,
+          bool ShouldCreate = true);
+
+  TempMDGlobalVariable cloneImpl() const {
+    return getTemporary(getContext(), getScope(), getName(), getLinkageName(),
+                        getFile(), getLine(), getType(), isLocalToUnit(),
+                        isDefinition(), getVariable(),
+                        getStaticDataMemberDeclaration());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDGlobalVariable,
+                    (Metadata * Scope, StringRef Name, StringRef LinkageName,
+                     Metadata *File, unsigned Line, Metadata *Type,
+                     bool IsLocalToUnit, bool IsDefinition, Metadata *Variable,
+                     Metadata *StaticDataMemberDeclaration),
+                    (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
+                     IsDefinition, Variable, StaticDataMemberDeclaration))
+  DEFINE_MDNODE_GET(MDGlobalVariable,
+                    (Metadata * Scope, MDString *Name, MDString *LinkageName,
+                     Metadata *File, unsigned Line, Metadata *Type,
+                     bool IsLocalToUnit, bool IsDefinition, Metadata *Variable,
+                     Metadata *StaticDataMemberDeclaration),
+                    (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
+                     IsDefinition, Variable, StaticDataMemberDeclaration))
+
+  TempMDGlobalVariable clone() const { return cloneImpl(); }
+
+  bool isLocalToUnit() const { return IsLocalToUnit; }
+  bool isDefinition() const { return IsDefinition; }
+  StringRef getDisplayName() const { return getStringOperand(4); }
+  StringRef getLinkageName() const { return getStringOperand(5); }
+  Metadata *getVariable() const { return getOperand(6); }
+  Metadata *getStaticDataMemberDeclaration() const { return getOperand(7); }
+
+  MDString *getRawLinkageName() const { return getOperandAs<MDString>(5); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDGlobalVariableKind;
+  }
+};
+
+/// \brief Local variable.
+///
+/// TODO: Split between arguments and otherwise.
+/// TODO: Use \c DW_TAG_variable instead of fake tags.
+/// TODO: Split up flags.
+class MDLocalVariable : public MDVariable {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  unsigned Arg;
+  unsigned Flags;
+
+  MDLocalVariable(LLVMContext &C, StorageType Storage, unsigned Tag,
+                  unsigned Line, unsigned Arg, unsigned Flags,
+                  ArrayRef<Metadata *> Ops)
+      : MDVariable(C, MDLocalVariableKind, Storage, Tag, Line, Ops), Arg(Arg),
+        Flags(Flags) {}
+  ~MDLocalVariable() {}
+
+  static MDLocalVariable *getImpl(LLVMContext &Context, unsigned Tag,
+                                  Metadata *Scope, StringRef Name,
+                                  Metadata *File, unsigned Line, Metadata *Type,
+                                  unsigned Arg, unsigned Flags,
+                                  Metadata *InlinedAt, StorageType Storage,
+                                  bool ShouldCreate = true) {
+    return getImpl(Context, Tag, Scope, getCanonicalMDString(Context, Name),
+                   File, Line, Type, Arg, Flags, InlinedAt, Storage,
+                   ShouldCreate);
+  }
+  static MDLocalVariable *getImpl(LLVMContext &Context, unsigned Tag,
+                                  Metadata *Scope, MDString *Name,
+                                  Metadata *File, unsigned Line, Metadata *Type,
+                                  unsigned Arg, unsigned Flags,
+                                  Metadata *InlinedAt, StorageType Storage,
+                                  bool ShouldCreate = true);
+
+  TempMDLocalVariable cloneImpl() const {
+    return getTemporary(getContext(), getTag(), getScope(), getName(),
+                        getFile(), getLine(), getType(), getArg(), getFlags(),
+                        getInlinedAt());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDLocalVariable,
+                    (unsigned Tag, Metadata *Scope, StringRef Name,
+                     Metadata *File, unsigned Line, Metadata *Type,
+                     unsigned Arg, unsigned Flags,
+                     Metadata *InlinedAt = nullptr),
+                    (Tag, Scope, Name, File, Line, Type, Arg, Flags, InlinedAt))
+  DEFINE_MDNODE_GET(MDLocalVariable,
+                    (unsigned Tag, Metadata *Scope, MDString *Name,
+                     Metadata *File, unsigned Line, Metadata *Type,
+                     unsigned Arg, unsigned Flags,
+                     Metadata *InlinedAt = nullptr),
+                    (Tag, Scope, Name, File, Line, Type, Arg, Flags, InlinedAt))
+
+  TempMDLocalVariable clone() const { return cloneImpl(); }
+
+  unsigned getArg() const { return Arg; }
+  unsigned getFlags() const { return Flags; }
+  Metadata *getInlinedAt() const { return getOperand(4); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDLocalVariableKind;
+  }
+};
+
+/// \brief DWARF expression.
+///
+/// TODO: Co-allocate the expression elements.
+/// TODO: Drop fake DW_TAG_expression and separate from DebugNode.
+/// TODO: Separate from MDNode, or otherwise drop Distinct and Temporary
+/// storage types.
+class MDExpression : public DebugNode {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  std::vector<uint64_t> Elements;
+
+  MDExpression(LLVMContext &C, StorageType Storage, ArrayRef<uint64_t> Elements)
+      : DebugNode(C, MDExpressionKind, Storage, dwarf::DW_TAG_expression, None),
+        Elements(Elements.begin(), Elements.end()) {}
+  ~MDExpression() {}
+
+  static MDExpression *getImpl(LLVMContext &Context,
+                               ArrayRef<uint64_t> Elements, StorageType Storage,
+                               bool ShouldCreate = true);
+
+  TempMDExpression cloneImpl() const {
+    return getTemporary(getContext(), getElements());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDExpression, (ArrayRef<uint64_t> Elements), (Elements))
+
+  TempMDExpression clone() const { return cloneImpl(); }
+
+  ArrayRef<uint64_t> getElements() const { return Elements; }
+
+  unsigned getNumElements() const { return Elements.size(); }
+  uint64_t getElement(unsigned I) const {
+    assert(I < Elements.size() && "Index out of range");
+    return Elements[I];
+  }
+
+  typedef ArrayRef<uint64_t>::iterator element_iterator;
+  element_iterator elements_begin() const { return getElements().begin(); }
+  element_iterator elements_end() const { return getElements().end(); }
+
+  /// \brief A lightweight wrapper around an expression operand.
+  ///
+  /// TODO: Store arguments directly and change \a MDExpression to store a
+  /// range of these.
+  class ExprOperand {
+    const uint64_t *Op;
+
+  public:
+    explicit ExprOperand(const uint64_t *Op) : Op(Op) {}
+
+    const uint64_t *get() const { return Op; }
+
+    /// \brief Get the operand code.
+    uint64_t getOp() const { return *Op; }
+
+    /// \brief Get an argument to the operand.
+    ///
+    /// Never returns the operand itself.
+    uint64_t getArg(unsigned I) const { return Op[I + 1]; }
+
+    unsigned getNumArgs() const { return getSize() - 1; }
+
+    /// \brief Return the size of the operand.
+    ///
+    /// Return the number of elements in the operand (1 + args).
+    unsigned getSize() const;
+  };
+
+  /// \brief An iterator for expression operands.
+  class expr_op_iterator
+      : public std::iterator<std::input_iterator_tag, ExprOperand> {
+    ExprOperand Op;
+
+  public:
+    explicit expr_op_iterator(element_iterator I) : Op(I) {}
+
+    element_iterator getBase() const { return Op.get(); }
+    const ExprOperand &operator*() const { return Op; }
+    const ExprOperand *operator->() const { return &Op; }
+
+    expr_op_iterator &operator++() {
+      increment();
+      return *this;
+    }
+    expr_op_iterator operator++(int) {
+      expr_op_iterator T(*this);
+      increment();
+      return T;
+    }
+
+    bool operator==(const expr_op_iterator &X) const {
+      return getBase() == X.getBase();
+    }
+    bool operator!=(const expr_op_iterator &X) const {
+      return getBase() != X.getBase();
+    }
+
+  private:
+    void increment() { Op = ExprOperand(getBase() + Op.getSize()); }
+  };
+
+  /// \brief Visit the elements via ExprOperand wrappers.
+  ///
+  /// These range iterators visit elements through \a ExprOperand wrappers.
+  /// This is not guaranteed to be a valid range unless \a isValid() gives \c
+  /// true.
+  ///
+  /// \pre \a isValid() gives \c true.
+  /// @{
+  expr_op_iterator expr_op_begin() const {
+    return expr_op_iterator(elements_begin());
+  }
+  expr_op_iterator expr_op_end() const {
+    return expr_op_iterator(elements_end());
+  }
+  /// @}
+
+  bool isValid() const;
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDExpressionKind;
+  }
+};
+
+class MDObjCProperty : public DebugNode {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  unsigned Line;
+  unsigned Attributes;
+
+  MDObjCProperty(LLVMContext &C, StorageType Storage, unsigned Line,
+                 unsigned Attributes, ArrayRef<Metadata *> Ops)
+      : DebugNode(C, MDObjCPropertyKind, Storage, dwarf::DW_TAG_APPLE_property,
+                  Ops),
+        Line(Line), Attributes(Attributes) {}
+  ~MDObjCProperty() {}
+
+  static MDObjCProperty *
+  getImpl(LLVMContext &Context, StringRef Name, Metadata *File, unsigned Line,
+          StringRef GetterName, StringRef SetterName, unsigned Attributes,
+          Metadata *Type, StorageType Storage, bool ShouldCreate = true) {
+    return getImpl(Context, getCanonicalMDString(Context, Name), File, Line,
+                   getCanonicalMDString(Context, GetterName),
+                   getCanonicalMDString(Context, SetterName), Attributes, Type,
+                   Storage, ShouldCreate);
+  }
+  static MDObjCProperty *getImpl(LLVMContext &Context, MDString *Name,
+                                 Metadata *File, unsigned Line,
+                                 MDString *GetterName, MDString *SetterName,
+                                 unsigned Attributes, Metadata *Type,
+                                 StorageType Storage, bool ShouldCreate = true);
+
+  TempMDObjCProperty cloneImpl() const {
+    return getTemporary(getContext(), getName(), getFile(), getLine(),
+                        getGetterName(), getSetterName(), getAttributes(),
+                        getType());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDObjCProperty,
+                    (StringRef Name, Metadata *File, unsigned Line,
+                     StringRef GetterName, StringRef SetterName,
+                     unsigned Attributes, Metadata *Type),
+                    (Name, File, Line, GetterName, SetterName, Attributes,
+                     Type))
+  DEFINE_MDNODE_GET(MDObjCProperty,
+                    (MDString * Name, Metadata *File, unsigned Line,
+                     MDString *GetterName, MDString *SetterName,
+                     unsigned Attributes, Metadata *Type),
+                    (Name, File, Line, GetterName, SetterName, Attributes,
+                     Type))
+
+  TempMDObjCProperty clone() const { return cloneImpl(); }
+
+  unsigned getLine() const { return Line; }
+  unsigned getAttributes() const { return Attributes; }
+  StringRef getName() const { return getStringOperand(0); }
+  Metadata *getFile() const { return getOperand(1); }
+  StringRef getGetterName() const { return getStringOperand(2); }
+  StringRef getSetterName() const { return getStringOperand(3); }
+  Metadata *getType() const { return getOperand(4); }
+
+  MDString *getRawName() const { return getOperandAs<MDString>(0); }
+  MDString *getRawGetterName() const { return getOperandAs<MDString>(2); }
+  MDString *getRawSetterName() const { return getOperandAs<MDString>(3); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDObjCPropertyKind;
+  }
+};
+
+class MDImportedEntity : public DebugNode {
+  friend class LLVMContextImpl;
+  friend class MDNode;
+
+  unsigned Line;
+
+  MDImportedEntity(LLVMContext &C, StorageType Storage, unsigned Tag,
+                   unsigned Line, ArrayRef<Metadata *> Ops)
+      : DebugNode(C, MDImportedEntityKind, Storage, Tag, Ops), Line(Line) {}
+  ~MDImportedEntity() {}
+
+  static MDImportedEntity *getImpl(LLVMContext &Context, unsigned Tag,
+                                   Metadata *Scope, Metadata *Entity,
+                                   unsigned Line, StringRef Name,
+                                   StorageType Storage,
+                                   bool ShouldCreate = true) {
+    return getImpl(Context, Tag, Scope, Entity, Line,
+                   getCanonicalMDString(Context, Name), Storage, ShouldCreate);
+  }
+  static MDImportedEntity *getImpl(LLVMContext &Context, unsigned Tag,
+                                   Metadata *Scope, Metadata *Entity,
+                                   unsigned Line, MDString *Name,
+                                   StorageType Storage,
+                                   bool ShouldCreate = true);
+
+  TempMDImportedEntity cloneImpl() const {
+    return getTemporary(getContext(), getTag(), getScope(), getEntity(),
+                        getLine(), getName());
+  }
+
+public:
+  DEFINE_MDNODE_GET(MDImportedEntity,
+                    (unsigned Tag, Metadata *Scope, Metadata *Entity,
+                     unsigned Line, StringRef Name = ""),
+                    (Tag, Scope, Entity, Line, Name))
+  DEFINE_MDNODE_GET(MDImportedEntity,
+                    (unsigned Tag, Metadata *Scope, Metadata *Entity,
+                     unsigned Line, MDString *Name),
+                    (Tag, Scope, Entity, Line, Name))
+
+  TempMDImportedEntity clone() const { return cloneImpl(); }
+
+  unsigned getLine() const { return Line; }
+  Metadata *getScope() const { return getOperand(0); }
+  Metadata *getEntity() const { return getOperand(1); }
+  StringRef getName() const { return getStringOperand(2); }
+
+  MDString *getRawName() const { return getOperandAs<MDString>(2); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDImportedEntityKind;
+  }
+};
+
+} // end namespace llvm
+
+#undef DEFINE_MDNODE_GET_UNPACK_IMPL
+#undef DEFINE_MDNODE_GET_UNPACK
+#undef DEFINE_MDNODE_GET
+
+#endif
diff --git a/include/llvm/IR/DebugLoc.h b/include/llvm/IR/DebugLoc.h
index 3d969a8..86e6441 100644
--- a/include/llvm/IR/DebugLoc.h
+++ b/include/llvm/IR/DebugLoc.h
@@ -15,51 +15,41 @@
 #ifndef LLVM_IR_DEBUGLOC_H
 #define LLVM_IR_DEBUGLOC_H
 
+#include "llvm/IR/TrackingMDRef.h"
 #include "llvm/Support/DataTypes.h"
 
 namespace llvm {
-  template <typename T> struct DenseMapInfo;
-  class MDNode;
+
   class LLVMContext;
   class raw_ostream;
+  class MDNode;
 
   /// DebugLoc - Debug location id.  This is carried by Instruction, SDNode,
   /// and MachineInstr to compactly encode file/line/scope information for an
   /// operation.
   class DebugLoc {
-    friend struct DenseMapInfo<DebugLoc>;
-
-    /// getEmptyKey() - A private constructor that returns an unknown that is
-    /// not equal to the tombstone key or DebugLoc().
-    static DebugLoc getEmptyKey() {
-      DebugLoc DL;
-      DL.LineCol = 1;
-      return DL;
-    }
+    TrackingMDNodeRef Loc;
 
-    /// getTombstoneKey() - A private constructor that returns an unknown that
-    /// is not equal to the empty key or DebugLoc().
-    static DebugLoc getTombstoneKey() {
-      DebugLoc DL;
-      DL.LineCol = 2;
-      return DL;
+  public:
+    DebugLoc() {}
+    DebugLoc(DebugLoc &&X) : Loc(std::move(X.Loc)) {}
+    DebugLoc(const DebugLoc &X) : Loc(X.Loc) {}
+    DebugLoc &operator=(DebugLoc &&X) {
+      Loc = std::move(X.Loc);
+      return *this;
+    }
+    DebugLoc &operator=(const DebugLoc &X) {
+      Loc = X.Loc;
+      return *this;
     }
 
-    /// LineCol - This 32-bit value encodes the line and column number for the
-    /// location, encoded as 24-bits for line and 8 bits for col.  A value of 0
-    /// for either means unknown.
-    uint32_t LineCol;
-
-    /// ScopeIdx - This is an opaque ID# for Scope/InlinedAt information,
-    /// decoded by LLVMContext.  0 is unknown.
-    int ScopeIdx;
-  public:
-    DebugLoc() : LineCol(0), ScopeIdx(0) {}  // Defaults to unknown.
+    /// \brief Check whether this has a trivial destructor.
+    bool hasTrivialDestructor() const { return Loc.hasTrivialDestructor(); }
 
     /// get - Get a new DebugLoc that corresponds to the specified line/col
     /// scope/inline location.
-    static DebugLoc get(unsigned Line, unsigned Col,
-                        MDNode *Scope, MDNode *InlinedAt = nullptr);
+    static DebugLoc get(unsigned Line, unsigned Col, MDNode *Scope,
+                        MDNode *InlinedAt = nullptr);
 
     /// getFromDILocation - Translate the DILocation quad into a DebugLoc.
     static DebugLoc getFromDILocation(MDNode *N);
@@ -68,56 +58,54 @@ namespace llvm {
     static DebugLoc getFromDILexicalBlock(MDNode *N);
 
     /// isUnknown - Return true if this is an unknown location.
-    bool isUnknown() const { return ScopeIdx == 0; }
+    bool isUnknown() const { return !Loc; }
 
-    unsigned getLine() const {
-      return (LineCol << 8) >> 8;  // Mask out column.
-    }
-
-    unsigned getCol() const {
-      return LineCol >> 24;
-    }
+    unsigned getLine() const;
+    unsigned getCol() const;
 
     /// getScope - This returns the scope pointer for this DebugLoc, or null if
     /// invalid.
-    MDNode *getScope(const LLVMContext &Ctx) const;
+    MDNode *getScope() const;
+    MDNode *getScope(const LLVMContext &) const { return getScope(); }
 
     /// getInlinedAt - This returns the InlinedAt pointer for this DebugLoc, or
     /// null if invalid or not present.
-    MDNode *getInlinedAt(const LLVMContext &Ctx) const;
+    MDNode *getInlinedAt() const;
+    MDNode *getInlinedAt(const LLVMContext &) const { return getInlinedAt(); }
 
     /// getScopeAndInlinedAt - Return both the Scope and the InlinedAt values.
+    void getScopeAndInlinedAt(MDNode *&Scope, MDNode *&IA) const;
     void getScopeAndInlinedAt(MDNode *&Scope, MDNode *&IA,
-                              const LLVMContext &Ctx) const;
+                              const LLVMContext &) const {
+      return getScopeAndInlinedAt(Scope, IA);
+    }
 
     /// getScopeNode - Get MDNode for DebugLoc's scope, or null if invalid.
-    MDNode *getScopeNode(const LLVMContext &Ctx) const;
+    MDNode *getScopeNode() const;
+    MDNode *getScopeNode(const LLVMContext &) const { return getScopeNode(); }
 
     // getFnDebugLoc - Walk up the scope chain of given debug loc and find line
     // number info for the function.
-    DebugLoc getFnDebugLoc(const LLVMContext &Ctx) const;
+    DebugLoc getFnDebugLoc() const;
+    DebugLoc getFnDebugLoc(const LLVMContext &) const {
+      return getFnDebugLoc();
+    }
 
     /// getAsMDNode - This method converts the compressed DebugLoc node into a
     /// DILocation compatible MDNode.
-    MDNode *getAsMDNode(const LLVMContext &Ctx) const;
+    MDNode *getAsMDNode() const;
+    MDNode *getAsMDNode(LLVMContext &) const { return getAsMDNode(); }
 
-    bool operator==(const DebugLoc &DL) const {
-      return LineCol == DL.LineCol && ScopeIdx == DL.ScopeIdx;
-    }
+    bool operator==(const DebugLoc &DL) const { return Loc == DL.Loc; }
     bool operator!=(const DebugLoc &DL) const { return !(*this == DL); }
 
-    void dump(const LLVMContext &Ctx) const;
+    void dump() const;
+    void dump(const LLVMContext &) const { dump(); }
     /// \brief prints source location /path/to/file.exe:line:col @[inlined at]
-    void print(const LLVMContext &Ctx, raw_ostream &OS) const;
+    void print(raw_ostream &OS) const;
+    void print(const LLVMContext &, raw_ostream &OS) const { print(OS); }
   };
 
-  template <>
-  struct DenseMapInfo<DebugLoc> {
-    static DebugLoc getEmptyKey() { return DebugLoc::getEmptyKey(); }
-    static DebugLoc getTombstoneKey() { return DebugLoc::getTombstoneKey(); }
-    static unsigned getHashValue(const DebugLoc &Key);
-    static bool isEqual(DebugLoc LHS, DebugLoc RHS) { return LHS == RHS; }
-  };
 } // end namespace llvm
 
 #endif /* LLVM_SUPPORT_DEBUGLOC_H */
diff --git a/include/llvm/IR/DerivedTypes.h b/include/llvm/IR/DerivedTypes.h
index 534d1e5..76f8064 100644
--- a/include/llvm/IR/DerivedTypes.h
+++ b/include/llvm/IR/DerivedTypes.h
@@ -94,8 +94,8 @@ public:
 /// FunctionType - Class to represent function types
 ///
 class FunctionType : public Type {
-  FunctionType(const FunctionType &) LLVM_DELETED_FUNCTION;
-  const FunctionType &operator=(const FunctionType &) LLVM_DELETED_FUNCTION;
+  FunctionType(const FunctionType &) = delete;
+  const FunctionType &operator=(const FunctionType &) = delete;
   FunctionType(Type *Result, ArrayRef<Type*> Params, bool IsVarArgs);
 
 public:
@@ -123,6 +123,9 @@ public:
   typedef Type::subtype_iterator param_iterator;
   param_iterator param_begin() const { return ContainedTys + 1; }
   param_iterator param_end() const { return &ContainedTys[NumContainedTys]; }
+  ArrayRef<Type *> params() const {
+    return makeArrayRef(param_begin(), param_end());
+  }
 
   /// Parameter type accessors.
   Type *getParamType(unsigned i) const { return ContainedTys[i+1]; }
@@ -185,8 +188,8 @@ public:
 /// generator for a target expects).
 ///
 class StructType : public CompositeType {
-  StructType(const StructType &) LLVM_DELETED_FUNCTION;
-  const StructType &operator=(const StructType &) LLVM_DELETED_FUNCTION;
+  StructType(const StructType &) = delete;
+  const StructType &operator=(const StructType &) = delete;
   StructType(LLVMContext &C)
     : CompositeType(C, StructTyID), SymbolTableEntry(nullptr) {}
   enum {
@@ -274,6 +277,9 @@ public:
   typedef Type::subtype_iterator element_iterator;
   element_iterator element_begin() const { return ContainedTys; }
   element_iterator element_end() const { return &ContainedTys[NumContainedTys];}
+  ArrayRef<Type *> const elements() const {
+    return makeArrayRef(element_begin(), element_end());
+  }
 
   /// isLayoutIdentical - Return true if this is layout identical to the
   /// specified struct.
@@ -302,8 +308,8 @@ public:
 ///
 class SequentialType : public CompositeType {
   Type *ContainedType;               ///< Storage for the single contained type.
-  SequentialType(const SequentialType &) LLVM_DELETED_FUNCTION;
-  const SequentialType &operator=(const SequentialType &) LLVM_DELETED_FUNCTION;
+  SequentialType(const SequentialType &) = delete;
+  const SequentialType &operator=(const SequentialType &) = delete;
 
 protected:
   SequentialType(TypeID TID, Type *ElType)
@@ -329,8 +335,8 @@ public:
 class ArrayType : public SequentialType {
   uint64_t NumElements;
 
-  ArrayType(const ArrayType &) LLVM_DELETED_FUNCTION;
-  const ArrayType &operator=(const ArrayType &) LLVM_DELETED_FUNCTION;
+  ArrayType(const ArrayType &) = delete;
+  const ArrayType &operator=(const ArrayType &) = delete;
   ArrayType(Type *ElType, uint64_t NumEl);
 public:
   /// ArrayType::get - This static method is the primary way to construct an
@@ -355,8 +361,8 @@ public:
 class VectorType : public SequentialType {
   unsigned NumElements;
 
-  VectorType(const VectorType &) LLVM_DELETED_FUNCTION;
-  const VectorType &operator=(const VectorType &) LLVM_DELETED_FUNCTION;
+  VectorType(const VectorType &) = delete;
+  const VectorType &operator=(const VectorType &) = delete;
   VectorType(Type *ElType, unsigned NumEl);
 public:
   /// VectorType::get - This static method is the primary way to construct an
@@ -440,8 +446,8 @@ public:
 /// PointerType - Class to represent pointers.
 ///
 class PointerType : public SequentialType {
-  PointerType(const PointerType &) LLVM_DELETED_FUNCTION;
-  const PointerType &operator=(const PointerType &) LLVM_DELETED_FUNCTION;
+  PointerType(const PointerType &) = delete;
+  const PointerType &operator=(const PointerType &) = delete;
   explicit PointerType(Type *ElType, unsigned AddrSpace);
 public:
   /// PointerType::get - This constructs a pointer to an object of the specified
diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h
index b592f89..c6a8854 100644
--- a/include/llvm/IR/DiagnosticInfo.h
+++ b/include/llvm/IR/DiagnosticInfo.h
@@ -45,6 +45,7 @@ enum DiagnosticSeverity {
 /// \brief Defines the different supported kind of a diagnostic.
 /// This enum should be extended with a new ID for each added concrete subclass.
 enum DiagnosticKind {
+  DK_Bitcode,
   DK_InlineAsm,
   DK_StackSize,
   DK_Linker,
@@ -97,6 +98,8 @@ public:
   virtual void print(DiagnosticPrinter &DP) const = 0;
 };
 
+typedef std::function<void(const DiagnosticInfo &)> DiagnosticHandlerFunction;
+
 /// Diagnostic information for inline asm reporting.
 /// This is basically a message and an optional location.
 class DiagnosticInfoInlineAsm : public DiagnosticInfo {
diff --git a/include/llvm/IR/Dominators.h b/include/llvm/IR/Dominators.h
index e2d1ccc..c1f208e 100644
--- a/include/llvm/IR/Dominators.h
+++ b/include/llvm/IR/Dominators.h
@@ -31,6 +31,11 @@
 
 namespace llvm {
 
+// FIXME: Replace this brittle forward declaration with the include of the new
+// PassManager.h when doing so doesn't break the PassManagerBuilder.
+template <typename IRUnitT> class AnalysisManager;
+class PreservedAnalyses;
+
 EXTERN_TEMPLATE_INSTANTIATION(class DomTreeNodeBase<BasicBlock>);
 EXTERN_TEMPLATE_INSTANTIATION(class DominatorTreeBase<BasicBlock>);
 
@@ -69,6 +74,13 @@ public:
 
   DominatorTree() : DominatorTreeBase<BasicBlock>(false) {}
 
+  DominatorTree(DominatorTree &&Arg)
+      : Base(std::move(static_cast<Base &>(Arg))) {}
+  DominatorTree &operator=(DominatorTree &&RHS) {
+    Base::operator=(std::move(static_cast<Base &>(RHS)));
+    return *this;
+  }
+
   /// \brief Returns *false* if the other dominator tree matches this dominator
   /// tree.
   inline bool compare(const DominatorTree &Other) const {
@@ -155,6 +167,43 @@ template <> struct GraphTraits<DominatorTree*>
 };
 
 /// \brief Analysis pass which computes a \c DominatorTree.
+class DominatorTreeAnalysis {
+public:
+  /// \brief Provide the result typedef for this analysis pass.
+  typedef DominatorTree Result;
+
+  /// \brief Opaque, unique identifier for this analysis pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// \brief Run the analysis pass over a function and produce a dominator tree.
+  DominatorTree run(Function &F);
+
+  /// \brief Provide access to a name for this pass for debugging purposes.
+  static StringRef name() { return "DominatorTreeAnalysis"; }
+
+private:
+  static char PassID;
+};
+
+/// \brief Printer pass for the \c DominatorTree.
+class DominatorTreePrinterPass {
+  raw_ostream &OS;
+
+public:
+  explicit DominatorTreePrinterPass(raw_ostream &OS);
+  PreservedAnalyses run(Function &F, AnalysisManager<Function> *AM);
+
+  static StringRef name() { return "DominatorTreePrinterPass"; }
+};
+
+/// \brief Verifier pass for the \c DominatorTree.
+struct DominatorTreeVerifierPass {
+  PreservedAnalyses run(Function &F, AnalysisManager<Function> *AM);
+
+  static StringRef name() { return "DominatorTreeVerifierPass"; }
+};
+
+/// \brief Legacy analysis pass which computes a \c DominatorTree.
 class DominatorTreeWrapperPass : public FunctionPass {
   DominatorTree DT;
 
diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h
index 26d893b..0cd5afb 100644
--- a/include/llvm/IR/Function.h
+++ b/include/llvm/IR/Function.h
@@ -87,11 +87,14 @@ private:
   ValueSymbolTable *SymTab;               ///< Symbol table of args/instructions
   AttributeSet AttributeSets;             ///< Parameter attributes
 
-  // HasLazyArguments is stored in Value::SubclassData.
-  /*bool HasLazyArguments;*/
-
-  // The Calling Convention is stored in Value::SubclassData.
-  /*CallingConv::ID CallingConvention;*/
+  /*
+   * Value::SubclassData
+   *
+   * bit 0  : HasLazyArguments
+   * bit 1  : HasPrefixData
+   * bit 2  : HasPrologueData
+   * bit 3-6: CallingConvention
+   */
 
   friend class SymbolTableListTraits<Function, Module>;
 
@@ -102,7 +105,7 @@ private:
   /// needs it.  The hasLazyArguments predicate returns true if the arg list
   /// hasn't been set up yet.
   bool hasLazyArguments() const {
-    return getSubclassDataFromValue() & 1;
+    return getSubclassDataFromValue() & (1<<0);
   }
   void CheckLazyArguments() const {
     if (hasLazyArguments())
@@ -110,8 +113,8 @@ private:
   }
   void BuildLazyArguments() const;
 
-  Function(const Function&) LLVM_DELETED_FUNCTION;
-  void operator=(const Function&) LLVM_DELETED_FUNCTION;
+  Function(const Function&) = delete;
+  void operator=(const Function&) = delete;
 
   /// Do the actual lookup of an intrinsic ID when the query could not be
   /// answered from the cache.
@@ -162,11 +165,11 @@ public:
   /// calling convention of this function.  The enum values for the known
   /// calling conventions are defined in CallingConv.h.
   CallingConv::ID getCallingConv() const {
-    return static_cast<CallingConv::ID>(getSubclassDataFromValue() >> 2);
+    return static_cast<CallingConv::ID>(getSubclassDataFromValue() >> 3);
   }
   void setCallingConv(CallingConv::ID CC) {
-    setValueSubclassData((getSubclassDataFromValue() & 3) |
-                         (static_cast<unsigned>(CC) << 2));
+    setValueSubclassData((getSubclassDataFromValue() & 7) |
+                         (static_cast<unsigned>(CC) << 3));
   }
 
   /// @brief Return the attribute list for this Function.
@@ -215,6 +218,11 @@ public:
     return AttributeSets.getAttribute(AttributeSet::FunctionIndex, Kind);
   }
 
+  /// \brief Return the stack alignment for the function.
+  unsigned getFnStackAlignment() const {
+    return AttributeSets.getStackAlignment(AttributeSet::FunctionIndex);
+  }
+
   /// hasGC/getGC/setGC/clearGC - The name of the garbage collection algorithm
   ///                             to use during code generation.
   bool hasGC() const;
@@ -231,6 +239,9 @@ public:
   /// @brief removes the attributes from the list of attributes.
   void removeAttributes(unsigned i, AttributeSet attr);
 
+  /// @brief adds the dereferenceable attribute to the list of attributes.
+  void addDereferenceableAttr(unsigned i, uint64_t Bytes);
+
   /// @brief Extract the alignment for a call or parameter (0=unknown).
   unsigned getParamAlignment(unsigned i) const {
     return AttributeSets.getParamAlignment(i);
@@ -448,12 +459,19 @@ public:
   bool arg_empty() const;
 
   bool hasPrefixData() const {
-    return getSubclassDataFromValue() & 2;
+    return getSubclassDataFromValue() & (1<<1);
   }
 
   Constant *getPrefixData() const;
   void setPrefixData(Constant *PrefixData);
 
+  bool hasPrologueData() const {
+    return getSubclassDataFromValue() & (1<<2);
+  }
+
+  Constant *getPrologueData() const;
+  void setPrologueData(Constant *PrologueData);
+
   /// viewCFG - This function is meant for use from the debugger.  You can just
   /// say 'call F->viewCFG()' and a ghostview window should pop up from the
   /// program, displaying the CFG of the current function with the code for each
diff --git a/include/llvm/IR/GVMaterializer.h b/include/llvm/IR/GVMaterializer.h
index a7d68ec..6f57dc2 100644
--- a/include/llvm/IR/GVMaterializer.h
+++ b/include/llvm/IR/GVMaterializer.h
@@ -19,11 +19,13 @@
 #define LLVM_IR_GVMATERIALIZER_H
 
 #include <system_error>
+#include <vector>
 
 namespace llvm {
 class Function;
 class GlobalValue;
 class Module;
+class StructType;
 
 class GVMaterializer {
 protected:
@@ -50,6 +52,8 @@ public:
   /// Make sure the entire Module has been completely read.
   ///
   virtual std::error_code MaterializeModule(Module *M) = 0;
+
+  virtual std::vector<StructType *> getIdentifiedStructTypes() const = 0;
 };
 
 } // End llvm namespace
diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h
index 075b570..d0672c8 100644
--- a/include/llvm/IR/GlobalAlias.h
+++ b/include/llvm/IR/GlobalAlias.h
@@ -28,8 +28,8 @@ template<typename ValueSubClass, typename ItemParentClass>
 
 class GlobalAlias : public GlobalValue, public ilist_node<GlobalAlias> {
   friend class SymbolTableListTraits<GlobalAlias, Module>;
-  void operator=(const GlobalAlias &) LLVM_DELETED_FUNCTION;
-  GlobalAlias(const GlobalAlias &) LLVM_DELETED_FUNCTION;
+  void operator=(const GlobalAlias &) = delete;
+  GlobalAlias(const GlobalAlias &) = delete;
 
   void setParent(Module *parent);
 
diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h
index 546fea2..50deb08 100644
--- a/include/llvm/IR/GlobalObject.h
+++ b/include/llvm/IR/GlobalObject.h
@@ -24,7 +24,7 @@ class Comdat;
 class Module;
 
 class GlobalObject : public GlobalValue {
-  GlobalObject(const GlobalObject &) LLVM_DELETED_FUNCTION;
+  GlobalObject(const GlobalObject &) = delete;
 
 protected:
   GlobalObject(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps,
diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h
index e7b5d58..aaecc1d 100644
--- a/include/llvm/IR/GlobalValue.h
+++ b/include/llvm/IR/GlobalValue.h
@@ -20,7 +20,6 @@
 
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/DerivedTypes.h"
-
 #include <system_error>
 
 namespace llvm {
@@ -30,7 +29,7 @@ class PointerType;
 class Module;
 
 class GlobalValue : public Constant {
-  GlobalValue(const GlobalValue &) LLVM_DELETED_FUNCTION;
+  GlobalValue(const GlobalValue &) = delete;
 public:
   /// @brief An enumeration for the kinds of linkage for global values.
   enum LinkageTypes {
diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h
index 4189ccb..d7b81a2 100644
--- a/include/llvm/IR/GlobalVariable.h
+++ b/include/llvm/IR/GlobalVariable.h
@@ -34,9 +34,9 @@ template<typename ValueSubClass, typename ItemParentClass>
 
 class GlobalVariable : public GlobalObject, public ilist_node<GlobalVariable> {
   friend class SymbolTableListTraits<GlobalVariable, Module>;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
-  void operator=(const GlobalVariable &) LLVM_DELETED_FUNCTION;
-  GlobalVariable(const GlobalVariable &) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
+  void operator=(const GlobalVariable &) = delete;
+  GlobalVariable(const GlobalVariable &) = delete;
 
   void setParent(Module *parent);
 
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index 088c7b4..33649d7 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -115,12 +115,10 @@ public:
   }
 
   /// \brief Set location information used by debugging information.
-  void SetCurrentDebugLocation(const DebugLoc &L) {
-    CurDbgLocation = L;
-  }
+  void SetCurrentDebugLocation(DebugLoc L) { CurDbgLocation = std::move(L); }
 
   /// \brief Get location information used by debugging information.
-  DebugLoc getCurrentDebugLocation() const { return CurDbgLocation; }
+  const DebugLoc &getCurrentDebugLocation() const { return CurDbgLocation; }
 
   /// \brief If this builder has a current debug location, set it on the
   /// specified instruction.
@@ -200,8 +198,8 @@ public:
     BasicBlock::iterator Point;
     DebugLoc DbgLoc;
 
-    InsertPointGuard(const InsertPointGuard &) LLVM_DELETED_FUNCTION;
-    InsertPointGuard &operator=(const InsertPointGuard &) LLVM_DELETED_FUNCTION;
+    InsertPointGuard(const InsertPointGuard &) = delete;
+    InsertPointGuard &operator=(const InsertPointGuard &) = delete;
 
   public:
     InsertPointGuard(IRBuilderBase &B)
@@ -221,9 +219,9 @@ public:
     FastMathFlags FMF;
     MDNode *FPMathTag;
 
-    FastMathFlagGuard(const FastMathFlagGuard &) LLVM_DELETED_FUNCTION;
+    FastMathFlagGuard(const FastMathFlagGuard &) = delete;
     FastMathFlagGuard &operator=(
-        const FastMathFlagGuard &) LLVM_DELETED_FUNCTION;
+        const FastMathFlagGuard &) = delete;
 
   public:
     FastMathFlagGuard(IRBuilderBase &B)
@@ -429,11 +427,54 @@ public:
   /// If the pointer isn't i8* it will be converted.
   CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = nullptr);
 
+  /// \brief Create a call to Masked Load intrinsic
+  CallInst *CreateMaskedLoad(Value *Ptr, unsigned Align, Value *Mask,
+                             Value *PassThru = 0, const Twine &Name = "");
+
+  /// \brief Create a call to Masked Store intrinsic
+  CallInst *CreateMaskedStore(Value *Val, Value *Ptr, unsigned Align,
+                              Value *Mask);
+
   /// \brief Create an assume intrinsic call that allows the optimizer to
   /// assume that the provided condition will be true.
   CallInst *CreateAssumption(Value *Cond);
 
+  /// \brief Create a call to the experimental.gc.statepoint intrinsic to
+  /// start a new statepoint sequence.
+  CallInst *CreateGCStatepoint(Value *ActualCallee,
+                               ArrayRef<Value *> CallArgs,
+                               ArrayRef<Value *> DeoptArgs,
+                               ArrayRef<Value *> GCArgs,
+                               const Twine &Name = "");
+
+  // Conveninence function for the common case when CallArgs are filled in using
+  // makeArrayRef(CS.arg_begin(), .arg_end()); Use needs to be .get()'ed to get
+  // the Value *.
+  CallInst *CreateGCStatepoint(Value *ActualCallee, ArrayRef<Use> CallArgs,
+                               ArrayRef<Value *> DeoptArgs,
+                               ArrayRef<Value *> GCArgs,
+                               const Twine &Name = "");
+
+  /// \brief Create a call to the experimental.gc.result intrinsic to extract
+  /// the result from a call wrapped in a statepoint.
+  CallInst *CreateGCResult(Instruction *Statepoint,
+                           Type *ResultType,
+                           const Twine &Name = "");
+
+  /// \brief Create a call to the experimental.gc.relocate intrinsics to
+  /// project the relocated value of one pointer from the statepoint.
+  CallInst *CreateGCRelocate(Instruction *Statepoint,
+                             int BaseOffset,
+                             int DerivedOffset,
+                             Type *ResultType,
+                             const Twine &Name = "");
+
 private:
+  /// \brief Create a call to a masked intrinsic with given Id.
+  /// Masked intrinsic has only one overloaded type - data type.
+  CallInst *CreateMaskedIntrinsic(unsigned Id, ArrayRef<Value *> Ops,
+                                  Type *DataTy, const Twine &Name = "");
+
   Value *getCastedInt8PtrValue(Value *Ptr);
 };
 
@@ -1246,11 +1287,23 @@ public:
       return Insert(Folder.CreateIntCast(VC, DestTy, isSigned), Name);
     return Insert(CastInst::CreateIntegerCast(V, DestTy, isSigned), Name);
   }
+
+  Value *CreateBitOrPointerCast(Value *V, Type *DestTy,
+                                const Twine &Name = "") {
+    if (V->getType() == DestTy)
+      return V;
+    if (V->getType()->isPointerTy() && DestTy->isIntegerTy())
+      return CreatePtrToInt(V, DestTy, Name);
+    if (V->getType()->isIntegerTy() && DestTy->isPointerTy())
+      return CreateIntToPtr(V, DestTy, Name);
+
+    return CreateBitCast(V, DestTy, Name);
+  }
 private:
   // \brief Provided to resolve 'CreateIntCast(Ptr, Ptr, "...")', giving a
   // compile time error, instead of converting the string to bool for the
   // isSigned parameter.
-  Value *CreateIntCast(Value *, Type *, const char *) LLVM_DELETED_FUNCTION;
+  Value *CreateIntCast(Value *, Type *, const char *) = delete;
 public:
   Value *CreateFPCast(Value *V, Type *DestTy, const Twine &Name = "") {
     if (V->getType() == DestTy)
diff --git a/include/llvm/IR/IRPrintingPasses.h b/include/llvm/IR/IRPrintingPasses.h
index afea0c3..7f2027b 100644
--- a/include/llvm/IR/IRPrintingPasses.h
+++ b/include/llvm/IR/IRPrintingPasses.h
@@ -58,7 +58,7 @@ public:
   PrintModulePass();
   PrintModulePass(raw_ostream &OS, const std::string &Banner = "");
 
-  PreservedAnalyses run(Module *M);
+  PreservedAnalyses run(Module &M);
 
   static StringRef name() { return "PrintModulePass"; }
 };
@@ -75,7 +75,7 @@ public:
   PrintFunctionPass();
   PrintFunctionPass(raw_ostream &OS, const std::string &Banner = "");
 
-  PreservedAnalyses run(Function *F);
+  PreservedAnalyses run(Function &F);
 
   static StringRef name() { return "PrintFunctionPass"; }
 };
diff --git a/include/llvm/IR/InlineAsm.h b/include/llvm/IR/InlineAsm.h
index b2d79d0..84ae9df 100644
--- a/include/llvm/IR/InlineAsm.h
+++ b/include/llvm/IR/InlineAsm.h
@@ -40,8 +40,8 @@ private:
   friend struct InlineAsmKeyType;
   friend class ConstantUniqueMap<InlineAsm>;
 
-  InlineAsm(const InlineAsm &) LLVM_DELETED_FUNCTION;
-  void operator=(const InlineAsm&) LLVM_DELETED_FUNCTION;
+  InlineAsm(const InlineAsm &) = delete;
+  void operator=(const InlineAsm&) = delete;
 
   std::string AsmString, Constraints;
   bool HasSideEffects;
diff --git a/include/llvm/IR/InstIterator.h b/include/llvm/IR/InstIterator.h
index 75e93bd..0bb2854 100644
--- a/include/llvm/IR/InstIterator.h
+++ b/include/llvm/IR/InstIterator.h
@@ -127,20 +127,32 @@ typedef InstIterator<const iplist<BasicBlock>,
 
 inline inst_iterator inst_begin(Function *F) { return inst_iterator(*F); }
 inline inst_iterator inst_end(Function *F)   { return inst_iterator(*F, true); }
+inline iterator_range<inst_iterator> inst_range(Function *F) {
+  return iterator_range<inst_iterator>(inst_begin(F), inst_end(F));
+}
 inline const_inst_iterator inst_begin(const Function *F) {
   return const_inst_iterator(*F);
 }
 inline const_inst_iterator inst_end(const Function *F) {
   return const_inst_iterator(*F, true);
 }
+inline iterator_range<const_inst_iterator> inst_range(const Function *F) {
+  return iterator_range<const_inst_iterator>(inst_begin(F), inst_end(F));
+}
 inline inst_iterator inst_begin(Function &F) { return inst_iterator(F); }
 inline inst_iterator inst_end(Function &F)   { return inst_iterator(F, true); }
+inline iterator_range<inst_iterator> inst_range(Function &F) {
+  return iterator_range<inst_iterator>(inst_begin(F), inst_end(F));
+}
 inline const_inst_iterator inst_begin(const Function &F) {
   return const_inst_iterator(F);
 }
 inline const_inst_iterator inst_end(const Function &F) {
   return const_inst_iterator(F, true);
 }
+inline iterator_range<const_inst_iterator> inst_range(const Function &F) {
+  return iterator_range<const_inst_iterator>(inst_begin(F), inst_end(F));
+}
 
 } // End llvm namespace
 
diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index 186fc88..e086282 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h
@@ -83,7 +83,7 @@ public:
 //===----------------------------------------------------------------------===//
 
 class UnaryInstruction : public Instruction {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 
 protected:
   UnaryInstruction(Type *Ty, unsigned iType, Value *V,
@@ -132,7 +132,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(UnaryInstruction, Value)
 //===----------------------------------------------------------------------===//
 
 class BinaryOperator : public Instruction {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 protected:
   void init(BinaryOps iType);
   BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty,
@@ -489,6 +489,19 @@ public:
     Instruction *InsertBefore = 0 ///< Place to insert the instruction
   );
 
+  /// @brief Create a BitCast, a PtrToInt, or an IntToPTr cast instruction.
+  ///
+  /// If the value is a pointer type and the destination an integer type,
+  /// creates a PtrToInt cast. If the value is an integer type and the
+  /// destination a pointer type, creates an IntToPtr cast. Otherwise, creates
+  /// a bitcast.
+  static CastInst *CreateBitOrPointerCast(
+    Value *S,                ///< The pointer value to be casted (operand 0)
+    Type *Ty,          ///< The type to which cast should be made
+    const Twine &Name = "", ///< Name for the instruction
+    Instruction *InsertBefore = 0 ///< Place to insert the instruction
+  );
+
   /// @brief Create a ZExt, BitCast, or Trunc for int -> int casts.
   static CastInst *CreateIntegerCast(
     Value *S,                ///< The pointer value to be casted (operand 0)
@@ -551,6 +564,17 @@ public:
     Type *DestTy ///< The Type to which the value should be cast.
   );
 
+  /// @brief Check whether a bitcast, inttoptr, or ptrtoint cast between these
+  /// types is valid and a no-op.
+  ///
+  /// This ensures that any pointer<->integer cast has enough bits in the
+  /// integer and any other cast is a bitcast.
+  static bool isBitOrNoopPointerCastable(
+    Type *SrcTy, ///< The Type from which the value should be cast.
+    Type *DestTy, ///< The Type to which the value should be cast.
+    const DataLayout *Layout = 0 ///< Optional DataLayout.
+  );
+
   /// Returns the opcode necessary to cast Val into Ty using usual casting
   /// rules.
   /// @brief Infer the opcode for cast operand and type
@@ -650,8 +674,8 @@ public:
 /// This class is the base class for the comparison instructions.
 /// @brief Abstract base class of comparison instructions.
 class CmpInst : public Instruction {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
-  CmpInst() LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
+  CmpInst() = delete;
 protected:
   CmpInst(Type *ty, Instruction::OtherOps op, unsigned short pred,
           Value *LHS, Value *RHS, const Twine &Name = "",
diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index ba7791c..dfece3e 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h
@@ -31,8 +31,8 @@ template<typename ValueSubClass, typename ItemParentClass>
   class SymbolTableListTraits;
 
 class Instruction : public User, public ilist_node<Instruction> {
-  void operator=(const Instruction &) LLVM_DELETED_FUNCTION;
-  Instruction(const Instruction &) LLVM_DELETED_FUNCTION;
+  void operator=(const Instruction &) = delete;
+  Instruction(const Instruction &) = delete;
 
   BasicBlock *Parent;
   DebugLoc DbgLoc;                         // 'dbg' Metadata cache.
@@ -201,7 +201,7 @@ public:
   void setAAMetadata(const AAMDNodes &N);
 
   /// setDebugLoc - Set the debug location information for this instruction.
-  void setDebugLoc(const DebugLoc &Loc) { DbgLoc = Loc; }
+  void setDebugLoc(DebugLoc Loc) { DbgLoc = std::move(Loc); }
 
   /// getDebugLoc - Return the debug location for this node as a DebugLoc.
   const DebugLoc &getDebugLoc() const { return DbgLoc; }
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index dcf19e0..83f9d04 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -17,8 +17,8 @@
 #define LLVM_IR_INSTRUCTIONS_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -285,7 +285,7 @@ private:
 /// StoreInst - an instruction for storing to memory
 ///
 class StoreInst : public Instruction {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
   void AssertOK();
 protected:
   StoreInst *clone_impl() const override;
@@ -411,7 +411,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(StoreInst, Value)
 /// FenceInst - an instruction for ordering other memory operations
 ///
 class FenceInst : public Instruction {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
   void Init(AtomicOrdering Ordering, SynchronizationScope SynchScope);
 protected:
   FenceInst *clone_impl() const override;
@@ -478,7 +478,7 @@ private:
 /// there.  Returns the value that was loaded.
 ///
 class AtomicCmpXchgInst : public Instruction {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
   void Init(Value *Ptr, Value *Cmp, Value *NewVal,
             AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering,
             SynchronizationScope SynchScope);
@@ -634,7 +634,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(AtomicCmpXchgInst, Value)
 /// the old value.
 ///
 class AtomicRMWInst : public Instruction {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 protected:
   AtomicRMWInst *clone_impl() const override;
 public:
@@ -845,6 +845,13 @@ public:
     return cast<SequentialType>(Instruction::getType());
   }
 
+  Type *getSourceElementType() const {
+    SequentialType *Ty = cast<SequentialType>(getPointerOperandType());
+    if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+      Ty = cast<SequentialType>(VTy->getElementType());
+    return Ty->getElementType();
+  }
+
   /// \brief Returns the address space of this instruction's pointer type.
   unsigned getAddressSpace() const {
     // Note that this is always the same as the pointer operand's address space
@@ -1375,6 +1382,9 @@ public:
   /// removeAttribute - removes the attribute from the list of attributes.
   void removeAttribute(unsigned i, Attribute attr);
 
+  /// \brief adds the dereferenceable attribute to the list of attributes.
+  void addDereferenceableAttr(unsigned i, uint64_t Bytes);
+
   /// \brief Determine whether this call has the given attribute.
   bool hasFnAttr(Attribute::AttrKind A) const {
     assert(A != Attribute::NoBuiltin &&
@@ -1958,7 +1968,7 @@ ExtractValueInst::ExtractValueInst(Value *Agg,
 class InsertValueInst : public Instruction {
   SmallVector<unsigned, 4> Indices;
 
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
   InsertValueInst(const InsertValueInst &IVI);
   void init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
             const Twine &NameStr);
@@ -2088,7 +2098,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertValueInst, Value)
 // scientist's overactive imagination.
 //
 class PHINode : public Instruction {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
   /// ReservedSpace - The number of operands actually allocated.  NumOperands is
   /// the number actually in use.
   unsigned ReservedSpace;
@@ -2163,6 +2173,8 @@ public:
     return block_begin() + getNumOperands();
   }
 
+  op_range incoming_values() { return operands(); }
+
   /// getNumIncomingValues - Return the number of incoming edges
   ///
   unsigned getNumIncomingValues() const { return getNumOperands(); }
@@ -2295,7 +2307,7 @@ class LandingPadInst : public Instruction {
 public:
   enum ClauseType { Catch, Filter };
 private:
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
   // Allocate space for exactly zero operands.
   void *operator new(size_t s) {
     return User::operator new(s, 0);
@@ -2562,7 +2574,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value)
 /// SwitchInst - Multiway switch
 ///
 class SwitchInst : public TerminatorInst {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
   unsigned ReservedSpace;
   // Operand[0]    = Value to switch on
   // Operand[1]    = Default basic block destination
@@ -2871,7 +2883,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SwitchInst, Value)
 /// IndirectBrInst - Indirect Branch Instruction.
 ///
 class IndirectBrInst : public TerminatorInst {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
   unsigned ReservedSpace;
   // Operand[0]    = Value to switch on
   // Operand[1]    = Default basic block destination
@@ -3056,6 +3068,9 @@ public:
   /// removeAttribute - removes the attribute from the list of attributes.
   void removeAttribute(unsigned i, Attribute attr);
 
+  /// \brief removes the dereferenceable attribute to the list of attributes.
+  void addDereferenceableAttr(unsigned i, uint64_t Bytes);
+
   /// \brief Determine whether this call has the given attribute.
   bool hasFnAttr(Attribute::AttrKind A) const {
     assert(A != Attribute::NoBuiltin &&
@@ -3296,7 +3311,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ResumeInst, Value)
 /// end of the block cannot be reached.
 ///
 class UnreachableInst : public TerminatorInst {
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 protected:
   UnreachableInst *clone_impl() const override;
 
diff --git a/include/llvm/IR/IntrinsicInst.h b/include/llvm/IR/IntrinsicInst.h
index e3d7999..d434432 100644
--- a/include/llvm/IR/IntrinsicInst.h
+++ b/include/llvm/IR/IntrinsicInst.h
@@ -28,15 +28,16 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Metadata.h"
 
 namespace llvm {
   /// IntrinsicInst - A useful wrapper class for inspecting calls to intrinsic
   /// functions.  This allows the standard isa/dyncast/cast functionality to
   /// work with calls to intrinsic functions.
   class IntrinsicInst : public CallInst {
-    IntrinsicInst() LLVM_DELETED_FUNCTION;
-    IntrinsicInst(const IntrinsicInst&) LLVM_DELETED_FUNCTION;
-    void operator=(const IntrinsicInst&) LLVM_DELETED_FUNCTION;
+    IntrinsicInst() = delete;
+    IntrinsicInst(const IntrinsicInst&) = delete;
+    void operator=(const IntrinsicInst&) = delete;
   public:
     /// getIntrinsicID - Return the intrinsic ID of this intrinsic.
     ///
@@ -81,8 +82,14 @@ namespace llvm {
   class DbgDeclareInst : public DbgInfoIntrinsic {
   public:
     Value *getAddress() const;
-    MDNode *getVariable() const { return cast<MDNode>(getArgOperand(1)); }
-    MDNode *getExpression() const { return cast<MDNode>(getArgOperand(2)); }
+    MDNode *getVariable() const {
+      return cast<MDNode>(
+          cast<MetadataAsValue>(getArgOperand(1))->getMetadata());
+    }
+    MDNode *getExpression() const {
+      return cast<MDNode>(
+          cast<MetadataAsValue>(getArgOperand(2))->getMetadata());
+    }
 
     // Methods for support type inquiry through isa, cast, and dyn_cast:
     static inline bool classof(const IntrinsicInst *I) {
@@ -103,8 +110,14 @@ namespace llvm {
       return cast<ConstantInt>(
                           const_cast<Value*>(getArgOperand(1)))->getZExtValue();
     }
-    MDNode *getVariable() const { return cast<MDNode>(getArgOperand(2)); }
-    MDNode *getExpression() const { return cast<MDNode>(getArgOperand(3)); }
+    MDNode *getVariable() const {
+      return cast<MDNode>(
+          cast<MetadataAsValue>(getArgOperand(2))->getMetadata());
+    }
+    MDNode *getExpression() const {
+      return cast<MDNode>(
+          cast<MetadataAsValue>(getArgOperand(3))->getMetadata());
+    }
 
     // Methods for support type inquiry through isa, cast, and dyn_cast:
     static inline bool classof(const IntrinsicInst *I) {
@@ -322,6 +335,33 @@ namespace llvm {
     Value *getSrc() const { return const_cast<Value*>(getArgOperand(1)); }
   };
 
+  /// This represents the llvm.instrprof_increment intrinsic.
+  class InstrProfIncrementInst : public IntrinsicInst {
+  public:
+    static inline bool classof(const IntrinsicInst *I) {
+      return I->getIntrinsicID() == Intrinsic::instrprof_increment;
+    }
+    static inline bool classof(const Value *V) {
+      return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+    }
+
+    GlobalVariable *getName() const {
+      return cast<GlobalVariable>(
+          const_cast<Value *>(getArgOperand(0))->stripPointerCasts());
+    }
+
+    ConstantInt *getHash() const {
+      return cast<ConstantInt>(const_cast<Value *>(getArgOperand(1)));
+    }
+
+    ConstantInt *getNumCounters() const {
+      return cast<ConstantInt>(const_cast<Value *>(getArgOperand(2)));
+    }
+
+    ConstantInt *getIndex() const {
+      return cast<ConstantInt>(const_cast<Value *>(getArgOperand(3)));
+    }
+  };
 }
 
 #endif
diff --git a/include/llvm/IR/Intrinsics.h b/include/llvm/IR/Intrinsics.h
index acc0e9e..a66bd2c 100644
--- a/include/llvm/IR/Intrinsics.h
+++ b/include/llvm/IR/Intrinsics.h
@@ -41,7 +41,7 @@ namespace Intrinsic {
 #undef GET_INTRINSIC_ENUM_VALUES
     , num_intrinsics
   };
-  
+
   /// Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
   std::string getName(ID id, ArrayRef<Type*> Tys = None);
 
@@ -69,16 +69,17 @@ namespace Intrinsic {
 
   /// Map a MS builtin name to an intrinsic ID.
   ID getIntrinsicForMSBuiltin(const char *Prefix, const char *BuiltinName);
-  
+
   /// This is a type descriptor which explains the type requirements of an
   /// intrinsic. This is returned by getIntrinsicInfoTableEntries.
   struct IITDescriptor {
     enum IITDescriptorKind {
       Void, VarArg, MMX, Metadata, Half, Float, Double,
       Integer, Vector, Pointer, Struct,
-      Argument, ExtendArgument, TruncArgument, HalfVecArgument
+      Argument, ExtendArgument, TruncArgument, HalfVecArgument,
+      SameVecWidthArgument, PtrToArgument, VecOfPtrsToElt
     } Kind;
-    
+
     union {
       unsigned Integer_Width;
       unsigned Float_Width;
@@ -87,8 +88,9 @@ namespace Intrinsic {
       unsigned Struct_NumElements;
       unsigned Argument_Info;
     };
-    
+
     enum ArgKind {
+      AK_Any,
       AK_AnyInteger,
       AK_AnyFloat,
       AK_AnyVector,
@@ -96,25 +98,29 @@ namespace Intrinsic {
     };
     unsigned getArgumentNumber() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument);
-      return Argument_Info >> 2;
+             Kind == TruncArgument || Kind == HalfVecArgument ||
+             Kind == SameVecWidthArgument || Kind == PtrToArgument ||
+             Kind == VecOfPtrsToElt);
+      return Argument_Info >> 3;
     }
     ArgKind getArgumentKind() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument);
-      return (ArgKind)(Argument_Info&3);
+             Kind == TruncArgument || Kind == HalfVecArgument ||
+             Kind == SameVecWidthArgument || Kind == PtrToArgument ||
+             Kind == VecOfPtrsToElt);
+      return (ArgKind)(Argument_Info & 7);
     }
-    
+
     static IITDescriptor get(IITDescriptorKind K, unsigned Field) {
       IITDescriptor Result = { K, { Field } };
       return Result;
     }
   };
-  
+
   /// Return the IIT table descriptor for the specified intrinsic into an array
   /// of IITDescriptors.
   void getIntrinsicInfoTableEntries(ID id, SmallVectorImpl<IITDescriptor> &T);
-  
+
 } // End Intrinsic namespace
 
 } // End llvm namespace
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 98d48de..e89e65d 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -112,12 +112,19 @@ class LLVMMatchType<int num>
 // the intrinsic is overloaded, so the matched type should be declared as iAny.
 class LLVMExtendedType<int num> : LLVMMatchType<num>;
 class LLVMTruncatedType<int num> : LLVMMatchType<num>;
+class LLVMVectorSameWidth<int num, LLVMType elty>
+  : LLVMMatchType<num> {
+  ValueType ElTy = elty.VT;
+}
+class LLVMPointerTo<int num> : LLVMMatchType<num>;
+class LLVMVectorOfPointersToElt<int num> : LLVMMatchType<num>;
 
 // Match the type of another intrinsic parameter that is expected to be a
 // vector type, but change the element count to be half as many
 class LLVMHalfElementsVectorType<int num> : LLVMMatchType<num>;
 
 def llvm_void_ty       : LLVMType<isVoid>;
+def llvm_any_ty        : LLVMType<Any>;
 def llvm_anyint_ty     : LLVMType<iAny>;
 def llvm_anyfloat_ty   : LLVMType<fAny>;
 def llvm_anyvector_ty  : LLVMType<vAny>;
@@ -254,6 +261,10 @@ def int_gcwrite : Intrinsic<[],
 //
 def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_frameaddress  : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_frameallocate : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty]>;
+def int_framerecover : Intrinsic<[llvm_ptr_ty],
+                                             [llvm_ptr_ty, llvm_ptr_ty],
+                                             [IntrNoMem]>;
 def int_read_register  : Intrinsic<[llvm_anyint_ty], [llvm_metadata_ty],
                                    [IntrNoMem], "llvm.read_register">;
 def int_write_register : Intrinsic<[], [llvm_metadata_ty, llvm_anyint_ty],
@@ -287,6 +298,12 @@ def int_stackprotector : Intrinsic<[], [llvm_ptr_ty, llvm_ptrptr_ty], []>;
 def int_stackprotectorcheck : Intrinsic<[], [llvm_ptrptr_ty],
                                         [IntrReadWriteArgMem]>;
 
+// A counter increment for instrumentation based profiling.
+def int_instrprof_increment : Intrinsic<[],
+                                        [llvm_ptr_ty, llvm_i64_ty,
+                                         llvm_i32_ty, llvm_i32_ty],
+                                        []>;
+
 //===------------------- Standard C Library Intrinsics --------------------===//
 //
 
@@ -394,6 +411,11 @@ def int_eh_typeid_for : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>;
 def int_eh_return_i32 : Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty]>;
 def int_eh_return_i64 : Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty]>;
 
+// eh.begincatch takes a pointer returned by a landingpad instruction and
+// returns the exception object pointer for the exception to be handled.
+def int_eh_begincatch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty]>;
+def int_eh_endcatch : Intrinsic<[], []>;
+
 // __builtin_unwind_init is an undocumented GCC intrinsic that causes all
 // callee-saved registers to be saved and restored (regardless of whether they
 // are used) in the calling function. It is used by libgcc_eh.
@@ -493,6 +515,24 @@ def int_experimental_patchpoint_i64 : Intrinsic<[llvm_i64_ty],
                                                  llvm_vararg_ty],
                                                  [Throws]>;
 
+
+//===------------------------ Garbage Collection Intrinsics ---------------===//
+// These are documented in docs/Statepoint.rst
+
+def int_experimental_gc_statepoint : Intrinsic<[llvm_i32_ty],
+                               [llvm_anyptr_ty, llvm_i32_ty,
+                                llvm_i32_ty, llvm_vararg_ty]>;
+
+def int_experimental_gc_result   : Intrinsic<[llvm_any_ty], [llvm_i32_ty]>;
+def int_experimental_gc_relocate : Intrinsic<[llvm_anyptr_ty],
+                                [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+
+// Deprecated: will be removed in a couple of weeks
+def int_experimental_gc_result_int : Intrinsic<[llvm_anyint_ty], [llvm_i32_ty]>;
+def int_experimental_gc_result_float : Intrinsic<[llvm_anyfloat_ty],
+                                                 [llvm_i32_ty]>;
+def int_experimental_gc_result_ptr : Intrinsic<[llvm_anyptr_ty], [llvm_i32_ty]>;
+
 //===-------------------------- Other Intrinsics --------------------------===//
 //
 def int_flt_rounds : Intrinsic<[llvm_i32_ty]>,
@@ -539,6 +579,34 @@ def int_convertuu  : Intrinsic<[llvm_anyint_ty],
 def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
                                 [], "llvm.clear_cache">;
 
+//===-------------------------- Masked Intrinsics -------------------------===//
+//
+def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>,
+                                      llvm_i32_ty,
+                                      LLVMVectorSameWidth<0, llvm_i1_ty>],
+                                 [IntrReadWriteArgMem]>;
+
+def int_masked_load  : Intrinsic<[llvm_anyvector_ty],
+                                 [LLVMPointerTo<0>, llvm_i32_ty,
+                                  LLVMVectorSameWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
+                                 [IntrReadArgMem]>;
+
+def int_masked_gather: Intrinsic<[llvm_anyvector_ty],
+                                 [LLVMVectorOfPointersToElt<0>, llvm_i32_ty,
+                                  LLVMVectorSameWidth<0, llvm_i1_ty>,
+                                  LLVMMatchType<0>],
+                                 [IntrReadArgMem]>;
+
+def int_masked_scatter: Intrinsic<[],
+                                  [llvm_anyvector_ty,
+                                   LLVMVectorOfPointersToElt<0>, llvm_i32_ty,
+                                   LLVMVectorSameWidth<0, llvm_i1_ty>],
+                                  [IntrReadWriteArgMem]>;
+
+// Intrinsics to support bit sets.
+def int_bitset_test : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty],
+                                [IntrNoMem]>;
+
 //===----------------------------------------------------------------------===//
 // Target-specific intrinsics
 //===----------------------------------------------------------------------===//
@@ -552,3 +620,4 @@ include "llvm/IR/IntrinsicsHexagon.td"
 include "llvm/IR/IntrinsicsNVVM.td"
 include "llvm/IR/IntrinsicsMips.td"
 include "llvm/IR/IntrinsicsR600.td"
+include "llvm/IR/IntrinsicsBPF.td"
diff --git a/include/llvm/IR/IntrinsicsBPF.td b/include/llvm/IR/IntrinsicsBPF.td
new file mode 100644
index 0000000..6b5110b
--- /dev/null
+++ b/include/llvm/IR/IntrinsicsBPF.td
@@ -0,0 +1,22 @@
+//===- IntrinsicsBPF.td - Defines BPF intrinsics -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the BPF-specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+// Specialized loads from packet
+let TargetPrefix = "bpf" in {  // All intrinsics start with "llvm.bpf."
+  def int_bpf_load_byte : GCCBuiltin<"__builtin_bpf_load_byte">,
+              Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrReadMem]>;
+  def int_bpf_load_half : GCCBuiltin<"__builtin_bpf_load_half">,
+              Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrReadMem]>;
+  def int_bpf_load_word : GCCBuiltin<"__builtin_bpf_load_word">,
+              Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrReadMem]>;
+}
diff --git a/include/llvm/IR/IntrinsicsHexagon.td b/include/llvm/IR/IntrinsicsHexagon.td
index 8a88729..b566956 100644
--- a/include/llvm/IR/IntrinsicsHexagon.td
+++ b/include/llvm/IR/IntrinsicsHexagon.td
@@ -254,7 +254,7 @@ class Hexagon_qi_qiqi_Intrinsic<string GCCIntSuffix>
 //
 class Hexagon_qi_qiqiqi_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+                          [llvm_i1_ty], [llvm_i1_ty, llvm_i1_ty, llvm_i1_ty],
                           [IntrNoMem]>;
 //
 // DEF_FUNCTION_TYPE_2(SI_ftype_QIQI,BT_INT,BT_BOOL,BT_BOOL) ->
@@ -434,7 +434,7 @@ class Hexagon_mem_memmemsisi_Intrinsic<string GCCIntSuffix>
 class Hexagon_sf_si_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
                           [llvm_float_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+                          [IntrNoMem, Throws]>;
 //
 // Hexagon_sf_df_Intrinsic<string GCCIntSuffix>
 //
@@ -490,21 +490,21 @@ class Hexagon_si_df_Intrinsic<string GCCIntSuffix>
 class Hexagon_sf_sfsf_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
                           [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
-                          [IntrNoMem]>;
+                          [IntrNoMem, Throws]>;
 //
-// Hexagon_qi_sfsf_Intrinsic<string GCCIntSuffix>
+// Hexagon_si_sfsf_Intrinsic<string GCCIntSuffix>
 //
-class Hexagon_qi_sfsf_Intrinsic<string GCCIntSuffix>
+class Hexagon_si_sfsf_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_float_ty, llvm_float_ty],
-                          [IntrNoMem]>;
+                          [llvm_i32_ty], [llvm_float_ty, llvm_float_ty],
+                          [IntrNoMem, Throws]>;
 //
-// Hexagon_qi_sfsi_Intrinsic<string GCCIntSuffix>
+// Hexagon_si_sfsi_Intrinsic<string GCCIntSuffix>
 //
-class Hexagon_qi_sfsi_Intrinsic<string GCCIntSuffix>
+class Hexagon_si_sfsi_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_float_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
+                          [llvm_i32_ty], [llvm_float_ty, llvm_i32_ty],
+                          [IntrNoMem, Throws]>;
 //
 // Hexagon_qi_sfqi_Intrinsic<string GCCIntSuffix>
 //
@@ -519,7 +519,7 @@ class Hexagon_sf_sfsfsf_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
                           [llvm_float_ty], [llvm_float_ty, llvm_float_ty,
                                             llvm_float_ty],
-                          [IntrNoMem]>;
+                          [IntrNoMem, Throws]>;
 //
 // Hexagon_sf_sfsfsfqi_Intrinsic<string GCCIntSuffix>
 //
@@ -528,7 +528,7 @@ class Hexagon_sf_sfsfsfqi_Intrinsic<string GCCIntSuffix>
                           [llvm_float_ty], [llvm_float_ty, llvm_float_ty,
                                             llvm_float_ty,
                            llvm_i32_ty],
-                          [IntrNoMem]>;
+                          [IntrNoMem, Throws]>;
 //
 // Hexagon_di_dididi_Intrinsic<string GCCIntSuffix>
 //
@@ -543,7 +543,7 @@ class Hexagon_di_dididisi_Intrinsic<string GCCIntSuffix>
 class Hexagon_df_si_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
                           [llvm_double_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+                          [IntrNoMem, Throws]>;
 //
 // Hexagon_df_di_Intrinsic<string GCCIntSuffix>
 //
@@ -571,21 +571,21 @@ class Hexagon_df_df_Intrinsic<string GCCIntSuffix>
 class Hexagon_df_dfdf_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
                           [llvm_double_ty], [llvm_double_ty, llvm_double_ty],
-                          [IntrNoMem]>;
+                          [IntrNoMem, Throws]>;
 //
-// Hexagon_qi_dfdf_Intrinsic<string GCCIntSuffix>
+// Hexagon_si_dfdf_Intrinsic<string GCCIntSuffix>
 //
-class Hexagon_qi_dfdf_Intrinsic<string GCCIntSuffix>
+class Hexagon_si_dfdf_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_double_ty, llvm_double_ty],
-                          [IntrNoMem]>;
+                          [llvm_i32_ty], [llvm_double_ty, llvm_double_ty],
+                          [IntrNoMem, Throws]>;
 //
-// Hexagon_qi_dfsi_Intrinsic<string GCCIntSuffix>
+// Hexagon_si_dfsi_Intrinsic<string GCCIntSuffix>
 //
-class Hexagon_qi_dfsi_Intrinsic<string GCCIntSuffix>
+class Hexagon_si_dfsi_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i1_ty], [llvm_double_ty, llvm_i32_ty],
-                          [IntrNoMem]>;
+                          [llvm_i32_ty], [llvm_double_ty, llvm_i32_ty],
+                          [IntrNoMem, Throws]>;
 //
 //
 // Hexagon_df_dfdfdf_Intrinsic<string GCCIntSuffix>
@@ -594,7 +594,7 @@ class Hexagon_df_dfdfdf_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
                           [llvm_double_ty], [llvm_double_ty, llvm_double_ty,
                                              llvm_double_ty],
-                          [IntrNoMem]>;
+                          [IntrNoMem, Throws]>;
 //
 // Hexagon_df_dfdfdf_Intrinsic<string GCCIntSuffix>
 //
@@ -603,7 +603,7 @@ class Hexagon_df_dfdfdfqi_Intrinsic<string GCCIntSuffix>
                           [llvm_double_ty], [llvm_double_ty, llvm_double_ty,
                                              llvm_double_ty,
                           llvm_i32_ty],
-                          [IntrNoMem]>;
+                          [IntrNoMem, Throws]>;
 
 
 // This one below will not be generated from iset.py.
@@ -624,32 +624,32 @@ Hexagon_mem_memmemsisi_Intrinsic<"circ_ldd">;
 // BUILTIN_INFO(HEXAGON.C2_cmpeq,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_cmpeq :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpeq">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpeq">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmpgt,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_cmpgt :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgt">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgt">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmpgtu,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_cmpgtu :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgtu">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgtu">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmpeqp,QI_ftype_DIDI,2)
 //
 def int_hexagon_C2_cmpeqp :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_C2_cmpeqp">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_C2_cmpeqp">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmpgtp,QI_ftype_DIDI,2)
 //
 def int_hexagon_C2_cmpgtp :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_C2_cmpgtp">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_C2_cmpgtp">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmpgtup,QI_ftype_DIDI,2)
 //
 def int_hexagon_C2_cmpgtup :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_C2_cmpgtup">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_C2_cmpgtup">;
 //
 // BUILTIN_INFO(HEXAGON.A4_rcmpeqi,SI_ftype_SISI,2)
 //
@@ -674,182 +674,182 @@ Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpneq">;
 // BUILTIN_INFO(HEXAGON.C2_bitsset,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_bitsset :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_bitsset">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_bitsset">;
 //
 // BUILTIN_INFO(HEXAGON.C2_bitsclr,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_bitsclr :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_bitsclr">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_bitsclr">;
 //
 // BUILTIN_INFO(HEXAGON.C4_nbitsset,QI_ftype_SISI,2)
 //
 def int_hexagon_C4_nbitsset :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_nbitsset">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_nbitsset">;
 //
 // BUILTIN_INFO(HEXAGON.C4_nbitsclr,QI_ftype_SISI,2)
 //
 def int_hexagon_C4_nbitsclr :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_nbitsclr">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_nbitsclr">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmpeqi,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_cmpeqi :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpeqi">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpeqi">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmpgti,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_cmpgti :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgti">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgti">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmpgtui,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_cmpgtui :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgtui">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgtui">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmpgei,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_cmpgei :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgei">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgei">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmpgeui,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_cmpgeui :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpgeui">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgeui">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmplt,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_cmplt :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmplt">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmplt">;
 //
 // BUILTIN_INFO(HEXAGON.C2_cmpltu,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_cmpltu :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_cmpltu">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpltu">;
 //
 // BUILTIN_INFO(HEXAGON.C2_bitsclri,QI_ftype_SISI,2)
 //
 def int_hexagon_C2_bitsclri :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C2_bitsclri">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_bitsclri">;
 //
 // BUILTIN_INFO(HEXAGON.C4_nbitsclri,QI_ftype_SISI,2)
 //
 def int_hexagon_C4_nbitsclri :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_nbitsclri">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_nbitsclri">;
 //
 // BUILTIN_INFO(HEXAGON.C4_cmpneqi,QI_ftype_SISI,2)
 //
 def int_hexagon_C4_cmpneqi :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmpneqi">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmpneqi">;
 //
 // BUILTIN_INFO(HEXAGON.C4_cmpltei,QI_ftype_SISI,2)
 //
 def int_hexagon_C4_cmpltei :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmpltei">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmpltei">;
 //
 // BUILTIN_INFO(HEXAGON.C4_cmplteui,QI_ftype_SISI,2)
 //
 def int_hexagon_C4_cmplteui :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmplteui">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmplteui">;
 //
 // BUILTIN_INFO(HEXAGON.C4_cmpneq,QI_ftype_SISI,2)
 //
 def int_hexagon_C4_cmpneq :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmpneq">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmpneq">;
 //
 // BUILTIN_INFO(HEXAGON.C4_cmplte,QI_ftype_SISI,2)
 //
 def int_hexagon_C4_cmplte :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmplte">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmplte">;
 //
 // BUILTIN_INFO(HEXAGON.C4_cmplteu,QI_ftype_SISI,2)
 //
 def int_hexagon_C4_cmplteu :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_C4_cmplteu">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmplteu">;
 //
 // BUILTIN_INFO(HEXAGON.C2_and,QI_ftype_QIQI,2)
 //
 def int_hexagon_C2_and :
-Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C2_and">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_and">;
 //
 // BUILTIN_INFO(HEXAGON.C2_or,QI_ftype_QIQI,2)
 //
 def int_hexagon_C2_or :
-Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C2_or">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_or">;
 //
 // BUILTIN_INFO(HEXAGON.C2_xor,QI_ftype_QIQI,2)
 //
 def int_hexagon_C2_xor :
-Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C2_xor">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_xor">;
 //
 // BUILTIN_INFO(HEXAGON.C2_andn,QI_ftype_QIQI,2)
 //
 def int_hexagon_C2_andn :
-Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C2_andn">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_andn">;
 //
 // BUILTIN_INFO(HEXAGON.C2_not,QI_ftype_QI,1)
 //
 def int_hexagon_C2_not :
-Hexagon_qi_qi_Intrinsic<"HEXAGON_C2_not">;
+Hexagon_si_si_Intrinsic<"HEXAGON_C2_not">;
 //
 // BUILTIN_INFO(HEXAGON.C2_orn,QI_ftype_QIQI,2)
 //
 def int_hexagon_C2_orn :
-Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C2_orn">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_orn">;
 //
 // BUILTIN_INFO(HEXAGON.C4_and_and,QI_ftype_QIQIQI,3)
 //
 def int_hexagon_C4_and_and :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_and_and">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_and">;
 //
 // BUILTIN_INFO(HEXAGON.C4_and_or,QI_ftype_QIQIQI,3)
 //
 def int_hexagon_C4_and_or :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_and_or">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_or">;
 //
 // BUILTIN_INFO(HEXAGON.C4_or_and,QI_ftype_QIQIQI,3)
 //
 def int_hexagon_C4_or_and :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_or_and">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_and">;
 //
 // BUILTIN_INFO(HEXAGON.C4_or_or,QI_ftype_QIQIQI,3)
 //
 def int_hexagon_C4_or_or :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_or_or">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_or">;
 //
 // BUILTIN_INFO(HEXAGON.C4_and_andn,QI_ftype_QIQIQI,3)
 //
 def int_hexagon_C4_and_andn :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_and_andn">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_andn">;
 //
 // BUILTIN_INFO(HEXAGON.C4_and_orn,QI_ftype_QIQIQI,3)
 //
 def int_hexagon_C4_and_orn :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_and_orn">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_orn">;
 //
 // BUILTIN_INFO(HEXAGON.C4_or_andn,QI_ftype_QIQIQI,3)
 //
 def int_hexagon_C4_or_andn :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_or_andn">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_andn">;
 //
 // BUILTIN_INFO(HEXAGON.C4_or_orn,QI_ftype_QIQIQI,3)
 //
 def int_hexagon_C4_or_orn :
-Hexagon_qi_qiqiqi_Intrinsic<"HEXAGON_C4_or_orn">;
+Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_orn">;
 //
 // BUILTIN_INFO(HEXAGON.C2_pxfer_map,QI_ftype_QI,1)
 //
 def int_hexagon_C2_pxfer_map :
-Hexagon_qi_qi_Intrinsic<"HEXAGON_C2_pxfer_map">;
+Hexagon_si_qi_Intrinsic<"HEXAGON_C2_pxfer_map">;
 //
 // BUILTIN_INFO(HEXAGON.C2_any8,QI_ftype_QI,1)
 //
 def int_hexagon_C2_any8 :
-Hexagon_qi_qi_Intrinsic<"HEXAGON_C2_any8">;
+Hexagon_si_qi_Intrinsic<"HEXAGON_C2_any8">;
 //
 // BUILTIN_INFO(HEXAGON.C2_all8,QI_ftype_QI,1)
 //
 def int_hexagon_C2_all8 :
-Hexagon_qi_qi_Intrinsic<"HEXAGON_C2_all8">;
+Hexagon_si_qi_Intrinsic<"HEXAGON_C2_all8">;
 //
 // BUILTIN_INFO(HEXAGON.C2_vitpack,SI_ftype_QIQI,2)
 //
@@ -889,167 +889,167 @@ Hexagon_di_qi_Intrinsic<"HEXAGON_C2_mask">;
 // BUILTIN_INFO(HEXAGON.A2_vcmpbeq,QI_ftype_DIDI,2)
 //
 def int_hexagon_A2_vcmpbeq :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpbeq">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpbeq">;
 //
 // BUILTIN_INFO(HEXAGON.A4_vcmpbeqi,QI_ftype_DISI,2)
 //
 def int_hexagon_A4_vcmpbeqi :
-Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpbeqi">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpbeqi">;
 //
 // BUILTIN_INFO(HEXAGON.A4_vcmpbeq_any,QI_ftype_DIDI,2)
 //
 def int_hexagon_A4_vcmpbeq_any :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_A4_vcmpbeq_any">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_A4_vcmpbeq_any">;
 //
 // BUILTIN_INFO(HEXAGON.A2_vcmpbgtu,QI_ftype_DIDI,2)
 //
 def int_hexagon_A2_vcmpbgtu :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpbgtu">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpbgtu">;
 //
 // BUILTIN_INFO(HEXAGON.A4_vcmpbgtui,QI_ftype_DISI,2)
 //
 def int_hexagon_A4_vcmpbgtui :
-Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpbgtui">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpbgtui">;
 //
 // BUILTIN_INFO(HEXAGON.A4_vcmpbgt,QI_ftype_DIDI,2)
 //
 def int_hexagon_A4_vcmpbgt :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_A4_vcmpbgt">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_A4_vcmpbgt">;
 //
 // BUILTIN_INFO(HEXAGON.A4_vcmpbgti,QI_ftype_DISI,2)
 //
 def int_hexagon_A4_vcmpbgti :
-Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpbgti">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpbgti">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmpbeq,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmpbeq :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbeq">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbeq">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmpbeqi,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmpbeqi :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbeqi">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbeqi">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmpbgtu,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmpbgtu :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbgtu">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgtu">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmpbgtui,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmpbgtui :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbgtui">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgtui">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmpbgt,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmpbgt :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbgt">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgt">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmpbgti,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmpbgti :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpbgti">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgti">;
 //
 // BUILTIN_INFO(HEXAGON.A2_vcmpheq,QI_ftype_DIDI,2)
 //
 def int_hexagon_A2_vcmpheq :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpheq">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpheq">;
 //
 // BUILTIN_INFO(HEXAGON.A2_vcmphgt,QI_ftype_DIDI,2)
 //
 def int_hexagon_A2_vcmphgt :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmphgt">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmphgt">;
 //
 // BUILTIN_INFO(HEXAGON.A2_vcmphgtu,QI_ftype_DIDI,2)
 //
 def int_hexagon_A2_vcmphgtu :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmphgtu">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmphgtu">;
 //
 // BUILTIN_INFO(HEXAGON.A4_vcmpheqi,QI_ftype_DISI,2)
 //
 def int_hexagon_A4_vcmpheqi :
-Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpheqi">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpheqi">;
 //
 // BUILTIN_INFO(HEXAGON.A4_vcmphgti,QI_ftype_DISI,2)
 //
 def int_hexagon_A4_vcmphgti :
-Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmphgti">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmphgti">;
 //
 // BUILTIN_INFO(HEXAGON.A4_vcmphgtui,QI_ftype_DISI,2)
 //
 def int_hexagon_A4_vcmphgtui :
-Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmphgtui">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmphgtui">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmpheq,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmpheq :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpheq">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpheq">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmphgt,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmphgt :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmphgt">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgt">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmphgtu,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmphgtu :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmphgtu">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgtu">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmpheqi,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmpheqi :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmpheqi">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpheqi">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmphgti,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmphgti :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmphgti">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgti">;
 //
 // BUILTIN_INFO(HEXAGON.A4_cmphgtui,QI_ftype_SISI,2)
 //
 def int_hexagon_A4_cmphgtui :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_A4_cmphgtui">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgtui">;
 //
 // BUILTIN_INFO(HEXAGON.A2_vcmpweq,QI_ftype_DIDI,2)
 //
 def int_hexagon_A2_vcmpweq :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpweq">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpweq">;
 //
 // BUILTIN_INFO(HEXAGON.A2_vcmpwgt,QI_ftype_DIDI,2)
 //
 def int_hexagon_A2_vcmpwgt :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpwgt">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpwgt">;
 //
 // BUILTIN_INFO(HEXAGON.A2_vcmpwgtu,QI_ftype_DIDI,2)
 //
 def int_hexagon_A2_vcmpwgtu :
-Hexagon_qi_didi_Intrinsic<"HEXAGON_A2_vcmpwgtu">;
+Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpwgtu">;
 //
 // BUILTIN_INFO(HEXAGON.A4_vcmpweqi,QI_ftype_DISI,2)
 //
 def int_hexagon_A4_vcmpweqi :
-Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpweqi">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpweqi">;
 //
 // BUILTIN_INFO(HEXAGON.A4_vcmpwgti,QI_ftype_DISI,2)
 //
 def int_hexagon_A4_vcmpwgti :
-Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpwgti">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpwgti">;
 //
 // BUILTIN_INFO(HEXAGON.A4_vcmpwgtui,QI_ftype_DISI,2)
 //
 def int_hexagon_A4_vcmpwgtui :
-Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_vcmpwgtui">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpwgtui">;
 //
 // BUILTIN_INFO(HEXAGON.A4_boundscheck,QI_ftype_SIDI,2)
 //
 def int_hexagon_A4_boundscheck :
-Hexagon_qi_sidi_Intrinsic<"HEXAGON_A4_boundscheck">;
+Hexagon_si_sidi_Intrinsic<"HEXAGON_A4_boundscheck">;
 //
 // BUILTIN_INFO(HEXAGON.A4_tlbmatch,QI_ftype_DISI,2)
 //
 def int_hexagon_A4_tlbmatch :
-Hexagon_qi_disi_Intrinsic<"HEXAGON_A4_tlbmatch">;
+Hexagon_si_disi_Intrinsic<"HEXAGON_A4_tlbmatch">;
 //
 // BUILTIN_INFO(HEXAGON.C2_tfrpr,SI_ftype_QI,1)
 //
@@ -1059,17 +1059,17 @@ Hexagon_si_qi_Intrinsic<"HEXAGON_C2_tfrpr">;
 // BUILTIN_INFO(HEXAGON.C2_tfrrp,QI_ftype_SI,1)
 //
 def int_hexagon_C2_tfrrp :
-Hexagon_qi_si_Intrinsic<"HEXAGON_C2_tfrrp">;
+Hexagon_si_si_Intrinsic<"HEXAGON_C2_tfrrp">;
 //
 // BUILTIN_INFO(HEXAGON.C4_fastcorner9,QI_ftype_QIQI,2)
 //
 def int_hexagon_C4_fastcorner9 :
-Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C4_fastcorner9">;
+Hexagon_si_qiqi_Intrinsic<"HEXAGON_C4_fastcorner9">;
 //
 // BUILTIN_INFO(HEXAGON.C4_fastcorner9_not,QI_ftype_QIQI,2)
 //
 def int_hexagon_C4_fastcorner9_not :
-Hexagon_qi_qiqi_Intrinsic<"HEXAGON_C4_fastcorner9_not">;
+Hexagon_si_qiqi_Intrinsic<"HEXAGON_C4_fastcorner9_not">;
 //
 // BUILTIN_INFO(HEXAGON.M2_mpy_acc_hh_s0,SI_ftype_SISISI,3)
 //
@@ -2849,7 +2849,7 @@ Hexagon_di_di_Intrinsic<"HEXAGON_A2_tfrp">;
 // BUILTIN_INFO(HEXAGON.A2_tfrpi,DI_ftype_SI,1)
 //
 def int_hexagon_A2_tfrpi :
-Hexagon_di_si_Intrinsic<"HEXAGON_A2_tfrpi">;
+Hexagon_di_di_Intrinsic<"HEXAGON_A2_tfrpi">;
 //
 // BUILTIN_INFO(HEXAGON.A2_zxtb,SI_ftype_SI,1)
 //
@@ -3609,22 +3609,22 @@ Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffms_lib">;
 // BUILTIN_INFO(HEXAGON.F2_sfcmpeq,QI_ftype_SFSF,2)
 //
 def int_hexagon_F2_sfcmpeq :
-Hexagon_qi_sfsf_Intrinsic<"HEXAGON_F2_sfcmpeq">;
+Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpeq">;
 //
 // BUILTIN_INFO(HEXAGON.F2_sfcmpgt,QI_ftype_SFSF,2)
 //
 def int_hexagon_F2_sfcmpgt :
-Hexagon_qi_sfsf_Intrinsic<"HEXAGON_F2_sfcmpgt">;
+Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpgt">;
 //
 // BUILTIN_INFO(HEXAGON.F2_sfcmpge,QI_ftype_SFSF,2)
 //
 def int_hexagon_F2_sfcmpge :
-Hexagon_qi_sfsf_Intrinsic<"HEXAGON_F2_sfcmpge">;
+Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpge">;
 //
 // BUILTIN_INFO(HEXAGON.F2_sfcmpuo,QI_ftype_SFSF,2)
 //
 def int_hexagon_F2_sfcmpuo :
-Hexagon_qi_sfsf_Intrinsic<"HEXAGON_F2_sfcmpuo">;
+Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpuo">;
 //
 // BUILTIN_INFO(HEXAGON.F2_sfmax,SF_ftype_SFSF,2)
 //
@@ -3639,7 +3639,7 @@ Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfmin">;
 // BUILTIN_INFO(HEXAGON.F2_sfclass,QI_ftype_SFSI,2)
 //
 def int_hexagon_F2_sfclass :
-Hexagon_qi_sfsi_Intrinsic<"HEXAGON_F2_sfclass">;
+Hexagon_si_sfsi_Intrinsic<"HEXAGON_F2_sfclass">;
 //
 // BUILTIN_INFO(HEXAGON.F2_sfimm_p,SF_ftype_SI,1)
 //
@@ -3666,80 +3666,30 @@ Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sffixupd">;
 def int_hexagon_F2_sffixupr :
 Hexagon_sf_sf_Intrinsic<"HEXAGON_F2_sffixupr">;
 //
-// BUILTIN_INFO(HEXAGON.F2_dfadd,DF_ftype_DFDF,2)
-//
-def int_hexagon_F2_dfadd :
-Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dfadd">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dfsub,DF_ftype_DFDF,2)
-//
-def int_hexagon_F2_dfsub :
-Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dfsub">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dfmpy,DF_ftype_DFDF,2)
-//
-def int_hexagon_F2_dfmpy :
-Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dfmpy">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dffma,DF_ftype_DFDFDF,3)
-//
-def int_hexagon_F2_dffma :
-Hexagon_df_dfdfdf_Intrinsic<"HEXAGON_F2_dffma">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dffms,DF_ftype_DFDFDF,3)
-//
-def int_hexagon_F2_dffms :
-Hexagon_df_dfdfdf_Intrinsic<"HEXAGON_F2_dffms">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dffma_lib,DF_ftype_DFDFDF,3)
-//
-def int_hexagon_F2_dffma_lib :
-Hexagon_df_dfdfdf_Intrinsic<"HEXAGON_F2_dffma_lib">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dffms_lib,DF_ftype_DFDFDF,3)
-//
-def int_hexagon_F2_dffms_lib :
-Hexagon_df_dfdfdf_Intrinsic<"HEXAGON_F2_dffms_lib">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dffma_sc,DF_ftype_DFDFDFQI,4)
-//
-def int_hexagon_F2_dffma_sc :
-Hexagon_df_dfdfdfqi_Intrinsic<"HEXAGON_F2_dffma_sc">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dfmax,DF_ftype_DFDF,2)
-//
-def int_hexagon_F2_dfmax :
-Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dfmax">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dfmin,DF_ftype_DFDF,2)
-//
-def int_hexagon_F2_dfmin :
-Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dfmin">;
-//
 // BUILTIN_INFO(HEXAGON.F2_dfcmpeq,QI_ftype_DFDF,2)
 //
 def int_hexagon_F2_dfcmpeq :
-Hexagon_qi_dfdf_Intrinsic<"HEXAGON_F2_dfcmpeq">;
+Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpeq">;
 //
 // BUILTIN_INFO(HEXAGON.F2_dfcmpgt,QI_ftype_DFDF,2)
 //
 def int_hexagon_F2_dfcmpgt :
-Hexagon_qi_dfdf_Intrinsic<"HEXAGON_F2_dfcmpgt">;
+Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpgt">;
 //
 // BUILTIN_INFO(HEXAGON.F2_dfcmpge,QI_ftype_DFDF,2)
 //
 def int_hexagon_F2_dfcmpge :
-Hexagon_qi_dfdf_Intrinsic<"HEXAGON_F2_dfcmpge">;
+Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpge">;
 //
 // BUILTIN_INFO(HEXAGON.F2_dfcmpuo,QI_ftype_DFDF,2)
 //
 def int_hexagon_F2_dfcmpuo :
-Hexagon_qi_dfdf_Intrinsic<"HEXAGON_F2_dfcmpuo">;
+Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpuo">;
 //
 // BUILTIN_INFO(HEXAGON.F2_dfclass,QI_ftype_DFSI,2)
 //
 def int_hexagon_F2_dfclass :
-Hexagon_qi_dfsi_Intrinsic<"HEXAGON_F2_dfclass">;
+Hexagon_si_dfsi_Intrinsic<"HEXAGON_F2_dfclass">;
 //
 // BUILTIN_INFO(HEXAGON.F2_dfimm_p,DF_ftype_SI,1)
 //
@@ -3751,21 +3701,6 @@ Hexagon_df_si_Intrinsic<"HEXAGON_F2_dfimm_p">;
 def int_hexagon_F2_dfimm_n :
 Hexagon_df_si_Intrinsic<"HEXAGON_F2_dfimm_n">;
 //
-// BUILTIN_INFO(HEXAGON.F2_dffixupn,DF_ftype_DFDF,2)
-//
-def int_hexagon_F2_dffixupn :
-Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dffixupn">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dffixupd,DF_ftype_DFDF,2)
-//
-def int_hexagon_F2_dffixupd :
-Hexagon_df_dfdf_Intrinsic<"HEXAGON_F2_dffixupd">;
-//
-// BUILTIN_INFO(HEXAGON.F2_dffixupr,DF_ftype_DF,1)
-//
-def int_hexagon_F2_dffixupr :
-Hexagon_df_df_Intrinsic<"HEXAGON_F2_dffixupr">;
-//
 // BUILTIN_INFO(HEXAGON.F2_conv_sf2df,DF_ftype_SF,1)
 //
 def int_hexagon_F2_conv_sf2df :
@@ -4494,12 +4429,12 @@ Hexagon_di_didi_Intrinsic<"HEXAGON_S2_extractup_rp">;
 // BUILTIN_INFO(HEXAGON.S2_tstbit_i,QI_ftype_SISI,2)
 //
 def int_hexagon_S2_tstbit_i :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_S2_tstbit_i">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_tstbit_i">;
 //
 // BUILTIN_INFO(HEXAGON.S4_ntstbit_i,QI_ftype_SISI,2)
 //
 def int_hexagon_S4_ntstbit_i :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_S4_ntstbit_i">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_ntstbit_i">;
 //
 // BUILTIN_INFO(HEXAGON.S2_setbit_i,SI_ftype_SISI,2)
 //
@@ -4519,12 +4454,12 @@ Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_clrbit_i">;
 // BUILTIN_INFO(HEXAGON.S2_tstbit_r,QI_ftype_SISI,2)
 //
 def int_hexagon_S2_tstbit_r :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_S2_tstbit_r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_tstbit_r">;
 //
 // BUILTIN_INFO(HEXAGON.S4_ntstbit_r,QI_ftype_SISI,2)
 //
 def int_hexagon_S4_ntstbit_r :
-Hexagon_qi_sisi_Intrinsic<"HEXAGON_S4_ntstbit_r">;
+Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_ntstbit_r">;
 //
 // BUILTIN_INFO(HEXAGON.S2_setbit_r,SI_ftype_SISI,2)
 //
@@ -4875,3 +4810,21 @@ Hexagon_di_di_Intrinsic<"HEXAGON_S2_interleave">;
 //
 def int_hexagon_S2_deinterleave :
 Hexagon_di_di_Intrinsic<"HEXAGON_S2_deinterleave">;
+
+def llvm_ptr32_ty : LLVMPointerType<llvm_i32_ty>;
+def llvm_ptr64_ty : LLVMPointerType<llvm_i64_ty>;
+
+// Mark locked loads as read/write to prevent any accidental reordering.
+def int_hexagon_L2_loadw_locked :
+Hexagon_Intrinsic<"HEXAGON_L2_loadw_locked", [llvm_i32_ty], [llvm_ptr32_ty],
+      [IntrReadWriteArgMem, NoCapture<0>]>;
+def int_hexagon_L4_loadd_locked :
+Hexagon_Intrinsic<"HEXAGON_L4_loadd_locked", [llvm_i64_ty], [llvm_ptr64_ty],
+      [IntrReadWriteArgMem, NoCapture<0>]>;
+
+def int_hexagon_S2_storew_locked :
+Hexagon_Intrinsic<"HEXAGON_S2_storew_locked", [llvm_i32_ty],
+      [llvm_ptr32_ty, llvm_i32_ty], [IntrReadWriteArgMem, NoCapture<0>]>;
+def int_hexagon_S4_stored_locked :
+Hexagon_Intrinsic<"HEXAGON_S4_stored_locked", [llvm_i32_ty],
+      [llvm_ptr64_ty, llvm_i64_ty], [IntrReadWriteArgMem, NoCapture<0>]>;
diff --git a/include/llvm/IR/IntrinsicsPowerPC.td b/include/llvm/IR/IntrinsicsPowerPC.td
index 5cdabde..110d55d 100644
--- a/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/include/llvm/IR/IntrinsicsPowerPC.td
@@ -542,3 +542,180 @@ def int_ppc_vsx_xsmindp : PowerPC_VSX_Sca_DDD_Intrinsic<"xsmindp">;
 def int_ppc_vsx_xvdivdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvdivdp">;
 def int_ppc_vsx_xvdivsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvdivsp">;
 }
+
+//===----------------------------------------------------------------------===//
+// PowerPC QPX Intrinsics.
+//
+
+let TargetPrefix = "ppc" in {  // All PPC intrinsics start with "llvm.ppc.".
+  /// PowerPC_QPX_Intrinsic - Base class for all QPX intrinsics.
+  class PowerPC_QPX_Intrinsic<string GCCIntSuffix, list<LLVMType> ret_types,
+                              list<LLVMType> param_types,
+                              list<IntrinsicProperty> properties>
+    : GCCBuiltin<!strconcat("__builtin_qpx_", GCCIntSuffix)>,
+      Intrinsic<ret_types, param_types, properties>;
+}
+
+//===----------------------------------------------------------------------===//
+// PowerPC QPX Intrinsic Class Definitions.
+//
+
+/// PowerPC_QPX_FF_Intrinsic - A PowerPC intrinsic that takes one v4f64
+/// vector and returns one.  These intrinsics have no side effects.
+class PowerPC_QPX_FF_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>;
+
+/// PowerPC_QPX_FFF_Intrinsic - A PowerPC intrinsic that takes two v4f64
+/// vectors and returns one.  These intrinsics have no side effects.
+class PowerPC_QPX_FFF_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty],
+                          [IntrNoMem]>;
+
+/// PowerPC_QPX_FFFF_Intrinsic - A PowerPC intrinsic that takes three v4f64
+/// vectors and returns one.  These intrinsics have no side effects.
+class PowerPC_QPX_FFFF_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [llvm_v4f64_ty],
+                          [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
+                          [IntrNoMem]>;
+
+/// PowerPC_QPX_Load_Intrinsic - A PowerPC intrinsic that takes a pointer
+/// and returns a v4f64.
+class PowerPC_QPX_Load_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+
+/// PowerPC_QPX_LoadPerm_Intrinsic - A PowerPC intrinsic that takes a pointer
+/// and returns a v4f64 permutation.
+class PowerPC_QPX_LoadPerm_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [llvm_v4f64_ty], [llvm_ptr_ty], [IntrNoMem]>;
+
+/// PowerPC_QPX_Store_Intrinsic - A PowerPC intrinsic that takes a pointer
+/// and stores a v4f64.
+class PowerPC_QPX_Store_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [], [llvm_v4f64_ty, llvm_ptr_ty],
+                          [IntrReadWriteArgMem]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC QPX Intrinsic Definitions.
+
+let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
+  // Add Instructions
+  def int_ppc_qpx_qvfadd : PowerPC_QPX_FFF_Intrinsic<"qvfadd">;
+  def int_ppc_qpx_qvfadds : PowerPC_QPX_FFF_Intrinsic<"qvfadds">;
+  def int_ppc_qpx_qvfsub : PowerPC_QPX_FFF_Intrinsic<"qvfsub">;
+  def int_ppc_qpx_qvfsubs : PowerPC_QPX_FFF_Intrinsic<"qvfsubs">;
+
+  // Estimate Instructions
+  def int_ppc_qpx_qvfre : PowerPC_QPX_FF_Intrinsic<"qvfre">;
+  def int_ppc_qpx_qvfres : PowerPC_QPX_FF_Intrinsic<"qvfres">;
+  def int_ppc_qpx_qvfrsqrte : PowerPC_QPX_FF_Intrinsic<"qvfrsqrte">;
+  def int_ppc_qpx_qvfrsqrtes : PowerPC_QPX_FF_Intrinsic<"qvfrsqrtes">;
+
+  // Multiply Instructions
+  def int_ppc_qpx_qvfmul : PowerPC_QPX_FFF_Intrinsic<"qvfmul">;
+  def int_ppc_qpx_qvfmuls : PowerPC_QPX_FFF_Intrinsic<"qvfmuls">;
+  def int_ppc_qpx_qvfxmul : PowerPC_QPX_FFF_Intrinsic<"qvfxmul">;
+  def int_ppc_qpx_qvfxmuls : PowerPC_QPX_FFF_Intrinsic<"qvfxmuls">;
+
+  // Multiply-add instructions
+  def int_ppc_qpx_qvfmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfmadd">;
+  def int_ppc_qpx_qvfmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfmadds">;
+  def int_ppc_qpx_qvfnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadd">;
+  def int_ppc_qpx_qvfnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadds">;
+  def int_ppc_qpx_qvfmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfmsub">;
+  def int_ppc_qpx_qvfmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfmsubs">;
+  def int_ppc_qpx_qvfnmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsub">;
+  def int_ppc_qpx_qvfnmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsubs">;
+  def int_ppc_qpx_qvfxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadd">;
+  def int_ppc_qpx_qvfxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadds">;
+  def int_ppc_qpx_qvfxxnpmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadd">;
+  def int_ppc_qpx_qvfxxnpmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadds">;
+  def int_ppc_qpx_qvfxxcpnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadd">;
+  def int_ppc_qpx_qvfxxcpnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadds">;
+  def int_ppc_qpx_qvfxxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadd">;
+  def int_ppc_qpx_qvfxxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadds">;
+
+  // Select Instruction
+  def int_ppc_qpx_qvfsel : PowerPC_QPX_FFFF_Intrinsic<"qvfsel">;
+
+  // Permute Instruction
+  def int_ppc_qpx_qvfperm : PowerPC_QPX_FFFF_Intrinsic<"qvfperm">;
+
+  // Convert and Round Instructions
+  def int_ppc_qpx_qvfctid : PowerPC_QPX_FF_Intrinsic<"qvfctid">;
+  def int_ppc_qpx_qvfctidu : PowerPC_QPX_FF_Intrinsic<"qvfctidu">;
+  def int_ppc_qpx_qvfctidz : PowerPC_QPX_FF_Intrinsic<"qvfctidz">;
+  def int_ppc_qpx_qvfctiduz : PowerPC_QPX_FF_Intrinsic<"qvfctiduz">;
+  def int_ppc_qpx_qvfctiw : PowerPC_QPX_FF_Intrinsic<"qvfctiw">;
+  def int_ppc_qpx_qvfctiwu : PowerPC_QPX_FF_Intrinsic<"qvfctiwu">;
+  def int_ppc_qpx_qvfctiwz : PowerPC_QPX_FF_Intrinsic<"qvfctiwz">;
+  def int_ppc_qpx_qvfctiwuz : PowerPC_QPX_FF_Intrinsic<"qvfctiwuz">;
+  def int_ppc_qpx_qvfcfid : PowerPC_QPX_FF_Intrinsic<"qvfcfid">;
+  def int_ppc_qpx_qvfcfidu : PowerPC_QPX_FF_Intrinsic<"qvfcfidu">;
+  def int_ppc_qpx_qvfcfids : PowerPC_QPX_FF_Intrinsic<"qvfcfids">;
+  def int_ppc_qpx_qvfcfidus : PowerPC_QPX_FF_Intrinsic<"qvfcfidus">;
+  def int_ppc_qpx_qvfrsp : PowerPC_QPX_FF_Intrinsic<"qvfrsp">;
+  def int_ppc_qpx_qvfriz : PowerPC_QPX_FF_Intrinsic<"qvfriz">;
+  def int_ppc_qpx_qvfrin : PowerPC_QPX_FF_Intrinsic<"qvfrin">;
+  def int_ppc_qpx_qvfrip : PowerPC_QPX_FF_Intrinsic<"qvfrip">;
+  def int_ppc_qpx_qvfrim : PowerPC_QPX_FF_Intrinsic<"qvfrim">;
+
+  // Move Instructions
+  def int_ppc_qpx_qvfneg : PowerPC_QPX_FF_Intrinsic<"qvfneg">;
+  def int_ppc_qpx_qvfabs : PowerPC_QPX_FF_Intrinsic<"qvfabs">;
+  def int_ppc_qpx_qvfnabs : PowerPC_QPX_FF_Intrinsic<"qvfnabs">;
+  def int_ppc_qpx_qvfcpsgn : PowerPC_QPX_FFF_Intrinsic<"qvfcpsgn">;
+
+  // Compare Instructions
+  def int_ppc_qpx_qvftstnan : PowerPC_QPX_FFF_Intrinsic<"qvftstnan">;
+  def int_ppc_qpx_qvfcmplt : PowerPC_QPX_FFF_Intrinsic<"qvfcmplt">;
+  def int_ppc_qpx_qvfcmpgt : PowerPC_QPX_FFF_Intrinsic<"qvfcmpgt">;
+  def int_ppc_qpx_qvfcmpeq : PowerPC_QPX_FFF_Intrinsic<"qvfcmpeq">;
+
+  // Load instructions
+  def int_ppc_qpx_qvlfd : PowerPC_QPX_Load_Intrinsic<"qvlfd">;
+  def int_ppc_qpx_qvlfda : PowerPC_QPX_Load_Intrinsic<"qvlfda">;
+  def int_ppc_qpx_qvlfs : PowerPC_QPX_Load_Intrinsic<"qvlfs">;
+  def int_ppc_qpx_qvlfsa : PowerPC_QPX_Load_Intrinsic<"qvlfsa">;
+
+  def int_ppc_qpx_qvlfcda : PowerPC_QPX_Load_Intrinsic<"qvlfcda">;
+  def int_ppc_qpx_qvlfcd : PowerPC_QPX_Load_Intrinsic<"qvlfcd">;
+  def int_ppc_qpx_qvlfcsa : PowerPC_QPX_Load_Intrinsic<"qvlfcsa">;
+  def int_ppc_qpx_qvlfcs : PowerPC_QPX_Load_Intrinsic<"qvlfcs">;
+  def int_ppc_qpx_qvlfiwaa : PowerPC_QPX_Load_Intrinsic<"qvlfiwaa">;
+  def int_ppc_qpx_qvlfiwa : PowerPC_QPX_Load_Intrinsic<"qvlfiwa">;
+  def int_ppc_qpx_qvlfiwza : PowerPC_QPX_Load_Intrinsic<"qvlfiwza">;
+  def int_ppc_qpx_qvlfiwz : PowerPC_QPX_Load_Intrinsic<"qvlfiwz">;
+
+  def int_ppc_qpx_qvlpcld : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcld">;
+  def int_ppc_qpx_qvlpcls : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcls">;
+  def int_ppc_qpx_qvlpcrd : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrd">;
+  def int_ppc_qpx_qvlpcrs : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrs">;
+
+  // Store instructions
+  def int_ppc_qpx_qvstfd : PowerPC_QPX_Store_Intrinsic<"qvstfd">;
+  def int_ppc_qpx_qvstfda : PowerPC_QPX_Store_Intrinsic<"qvstfda">;
+  def int_ppc_qpx_qvstfs : PowerPC_QPX_Store_Intrinsic<"qvstfs">;
+  def int_ppc_qpx_qvstfsa : PowerPC_QPX_Store_Intrinsic<"qvstfsa">;
+
+  def int_ppc_qpx_qvstfcda : PowerPC_QPX_Store_Intrinsic<"qvstfcda">;
+  def int_ppc_qpx_qvstfcd : PowerPC_QPX_Store_Intrinsic<"qvstfcd">;
+  def int_ppc_qpx_qvstfcsa : PowerPC_QPX_Store_Intrinsic<"qvstfcsa">;
+  def int_ppc_qpx_qvstfcs : PowerPC_QPX_Store_Intrinsic<"qvstfcs">;
+  def int_ppc_qpx_qvstfiwa : PowerPC_QPX_Store_Intrinsic<"qvstfiwa">;
+  def int_ppc_qpx_qvstfiw : PowerPC_QPX_Store_Intrinsic<"qvstfiw">;
+
+  // Logical and permutation formation
+  def int_ppc_qpx_qvflogical : PowerPC_QPX_Intrinsic<"qvflogical",
+                          [llvm_v4f64_ty],
+                          [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i32_ty],
+                          [IntrNoMem]>;
+  def int_ppc_qpx_qvgpci : PowerPC_QPX_Intrinsic<"qvgpci",
+                          [llvm_v4f64_ty], [llvm_i32_ty], [IntrNoMem]>;
+}
+
diff --git a/include/llvm/IR/IntrinsicsR600.td b/include/llvm/IR/IntrinsicsR600.td
index d99c42d..5055667 100644
--- a/include/llvm/IR/IntrinsicsR600.td
+++ b/include/llvm/IR/IntrinsicsR600.td
@@ -76,6 +76,9 @@ def int_AMDGPU_rsq_clamped : GCCBuiltin<"__builtin_amdgpu_rsq_clamped">,
 def int_AMDGPU_ldexp : GCCBuiltin<"__builtin_amdgpu_ldexp">,
   Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>;
 
+def int_AMDGPU_class : GCCBuiltin<"__builtin_amdgpu_class">,
+  Intrinsic<[llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>;
+
 def int_AMDGPU_read_workdim : AMDGPUReadPreloadRegisterIntrinsic <
                                        "__builtin_amdgpu_read_workdim">;
 
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 59ff946..60deb32 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -453,19 +453,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_psrai_d : GCCBuiltin<"__builtin_ia32_psradi128">,
               Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
                          llvm_i32_ty], [IntrNoMem]>;
-
-  def int_x86_sse2_psll_dq : GCCBuiltin<"__builtin_ia32_pslldqi128">,
-              Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_psrl_dq : GCCBuiltin<"__builtin_ia32_psrldqi128">,
-              Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_psll_dq_bs : GCCBuiltin<"__builtin_ia32_pslldqi128_byteshift">,
-              Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse2_psrl_dq_bs : GCCBuiltin<"__builtin_ia32_psrldqi128_byteshift">,
-              Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
 }
 
 // Conversion ops
@@ -1363,6 +1350,12 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_mask_loadu_pd_512 : GCCBuiltin<"__builtin_ia32_loadupd512_mask">,
         Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
                   [IntrReadArgMem]>;
+  def int_x86_avx512_mask_load_ps_512 : GCCBuiltin<"__builtin_ia32_loadaps512_mask">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
+                  [IntrReadArgMem]>;
+  def int_x86_avx512_mask_load_pd_512 : GCCBuiltin<"__builtin_ia32_loadapd512_mask">,
+        Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
+                  [IntrReadArgMem]>;
 }
 
 // Conditional store ops
@@ -1389,6 +1382,14 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_storeupd512_mask">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
                   [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_store_ps_512 :
+        GCCBuiltin<"__builtin_ia32_storeaps512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_store_pd_512 :
+        GCCBuiltin<"__builtin_ia32_storeapd512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
   def int_x86_avx512_mask_store_ss :
         GCCBuiltin<"__builtin_ia32_storess_mask">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty],
@@ -1572,19 +1573,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
                          llvm_i32_ty], [IntrNoMem]>;
 
-  def int_x86_avx2_psll_dq : GCCBuiltin<"__builtin_ia32_pslldqi256">,
-              Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx2_psrl_dq : GCCBuiltin<"__builtin_ia32_psrldqi256">,
-              Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx2_psll_dq_bs : GCCBuiltin<"__builtin_ia32_pslldqi256_byteshift">,
-              Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx2_psrl_dq_bs : GCCBuiltin<"__builtin_ia32_psrldqi256_byteshift">,
-              Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-
   def int_x86_avx512_mask_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi512">,
               Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
                          llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
@@ -1603,6 +1591,25 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_mask_psrai_q : GCCBuiltin<"__builtin_ia32_psraqi512">,
               Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
                          llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_psll_d : GCCBuiltin<"__builtin_ia32_pslld512_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                         llvm_v4i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psll_q : GCCBuiltin<"__builtin_ia32_psllq512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_v2i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrl_d : GCCBuiltin<"__builtin_ia32_psrld512_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                         llvm_v4i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrl_q : GCCBuiltin<"__builtin_ia32_psrlq512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_v2i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_d : GCCBuiltin<"__builtin_ia32_psrad512_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                         llvm_v4i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_q : GCCBuiltin<"__builtin_ia32_psraq512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_v2i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
 }
 
 // Pack ops.
@@ -1911,6 +1918,31 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_psrav_d_256 : GCCBuiltin<"__builtin_ia32_psrav8si">,
               Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty],
                         [IntrNoMem]>;
+
+  def int_x86_avx512_mask_psllv_d : GCCBuiltin<"__builtin_ia32_psllv16si_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                         llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_mask_psllv_q : GCCBuiltin<"__builtin_ia32_psllv8di_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], 
+                        [IntrNoMem]>;
+  def int_x86_avx512_mask_psrav_d : GCCBuiltin<"__builtin_ia32_psrav16si_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                         llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_mask_psrav_q : GCCBuiltin<"__builtin_ia32_psrav8di_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_mask_psrlv_d : GCCBuiltin<"__builtin_ia32_psrlv16si_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                         llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_mask_psrlv_q : GCCBuiltin<"__builtin_ia32_psrlv8di_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], 
+                        [IntrNoMem]>;
 }
 
 // Gather ops
@@ -2029,11 +2061,31 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                         [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
                          llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddps256_mask">,
+              Intrinsic<[llvm_v8f32_ty],
+                        [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfmadd_ps_128 : GCCBuiltin<"__builtin_ia32_vfmaddps128_mask">,
+              Intrinsic<[llvm_v4f32_ty],
+                        [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_mask_vfmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
                         [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
                          llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddpd256_mask">,
+              Intrinsic<[llvm_v4f64_ty],
+                        [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfmadd_pd_128 : GCCBuiltin<"__builtin_ia32_vfmaddpd128_mask">,
+              Intrinsic<[llvm_v2f64_ty],
+                        [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_vfmsub_ss : GCCBuiltin<"__builtin_ia32_vfmsubss">,
               Intrinsic<[llvm_v4f32_ty],
                         [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
@@ -2063,11 +2115,31 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                         [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
                          llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfmsubps256_mask">,
+              Intrinsic<[llvm_v8f32_ty],
+                        [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfmsub_ps_128 : GCCBuiltin<"__builtin_ia32_vfmsubps128_mask">,
+              Intrinsic<[llvm_v4f32_ty],
+                        [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_mask_vfmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
                         [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
                          llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfmsubpd256_mask">,
+              Intrinsic<[llvm_v4f64_ty],
+                        [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfmsub_pd_128 : GCCBuiltin<"__builtin_ia32_vfmsubpd128_mask">,
+              Intrinsic<[llvm_v2f64_ty],
+                        [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_vfnmadd_ss : GCCBuiltin<"__builtin_ia32_vfnmaddss">,
               Intrinsic<[llvm_v4f32_ty],
                         [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
@@ -2097,11 +2169,31 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                         [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
                          llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfnmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmaddps256_mask">,
+              Intrinsic<[llvm_v8f32_ty],
+                        [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfnmadd_ps_128 : GCCBuiltin<"__builtin_ia32_vfnmaddps128_mask">,
+              Intrinsic<[llvm_v4f32_ty],
+                        [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_mask_vfnmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmaddpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
                         [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
                          llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfnmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmaddpd256_mask">,
+              Intrinsic<[llvm_v4f64_ty],
+                        [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfnmadd_pd_128 : GCCBuiltin<"__builtin_ia32_vfnmaddpd128_mask">,
+              Intrinsic<[llvm_v2f64_ty],
+                        [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_vfnmsub_ss : GCCBuiltin<"__builtin_ia32_vfnmsubss">,
               Intrinsic<[llvm_v4f32_ty],
                         [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
@@ -2131,11 +2223,31 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                         [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
                          llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfnmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmsubps256_mask">,
+              Intrinsic<[llvm_v8f32_ty],
+                        [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfnmsub_ps_128 : GCCBuiltin<"__builtin_ia32_vfnmsubps128_mask">,
+              Intrinsic<[llvm_v4f32_ty],
+                        [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_mask_vfnmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmsubpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
                         [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
                          llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfnmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmsubpd256_mask">,
+              Intrinsic<[llvm_v4f64_ty],
+                        [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfnmsub_pd_128 : GCCBuiltin<"__builtin_ia32_vfnmsubpd128_mask">,
+              Intrinsic<[llvm_v2f64_ty],
+                        [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">,
               Intrinsic<[llvm_v4f32_ty],
                         [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
@@ -2159,11 +2271,31 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                         [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
                          llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfmaddsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddsubps256_mask">,
+              Intrinsic<[llvm_v8f32_ty],
+                        [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfmaddsub_ps_128 : GCCBuiltin<"__builtin_ia32_vfmaddsubps128_mask">,
+              Intrinsic<[llvm_v4f32_ty],
+                        [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_mask_vfmaddsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
                         [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
                          llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfmaddsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd256_mask">,
+              Intrinsic<[llvm_v4f64_ty],
+                        [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfmaddsub_pd_128 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd128_mask">,
+              Intrinsic<[llvm_v2f64_ty],
+                        [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_vfmsubadd_ps : GCCBuiltin<"__builtin_ia32_vfmsubaddps">,
               Intrinsic<[llvm_v4f32_ty],
                         [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
@@ -2187,11 +2319,31 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                         [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
                          llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfmsubadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmsubaddps256_mask">,
+              Intrinsic<[llvm_v8f32_ty],
+                        [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfmsubadd_ps_128 : GCCBuiltin<"__builtin_ia32_vfmsubaddps128_mask">,
+              Intrinsic<[llvm_v4f32_ty],
+                        [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_mask_vfmsubadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
                         [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
                          llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_mask_vfmsubadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd256_mask">,
+              Intrinsic<[llvm_v4f64_ty],
+                        [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_mask_vfmsubadd_pd_128 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd128_mask">,
+              Intrinsic<[llvm_v2f64_ty],
+                        [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
+                         llvm_i8_ty],
+                        [IntrNoMem]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -3003,6 +3155,31 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Arithmetic ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+
+  def int_x86_avx512_mask_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512_mask">,
+          Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+                     llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512_mask">,
+          Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+                     llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512_mask">,
+          Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+                     llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512_mask">,
+          Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+                     llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512_mask">,
+          Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+                     llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512_mask">,
+          Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+                     llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512_mask">,
+          Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+                     llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512_mask">,
+          Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+                     llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512_mask">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                      llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
@@ -3016,12 +3193,14 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                      llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
 
-  def int_x86_avx512_rndscale_ss        : GCCBuiltin<"__builtin_ia32_rndscaless">,
-              Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_rndscale_sd        : GCCBuiltin<"__builtin_ia32_rndscalesd">,
-              Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_mask">,
+          Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
+                                     llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
+                                     [IntrNoMem]>;
+  def int_x86_avx512_mask_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd_mask">,
+          Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
+                                      llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
+                                     [IntrNoMem]>;
   def int_x86_avx512_sqrt_ss        : GCCBuiltin<"__builtin_ia32_sqrtrndss">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
                         [IntrNoMem]>;
@@ -3102,22 +3281,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                       [IntrNoMem]>;
 }
 
-// Integer shift ops.
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_psll_dq : GCCBuiltin<"__builtin_ia32_pslldqi512">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_psrl_dq : GCCBuiltin<"__builtin_ia32_psrldqi512">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_psll_dq_bs : GCCBuiltin<"__builtin_ia32_pslldqi512_byteshift">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_psrl_dq_bs : GCCBuiltin<"__builtin_ia32_psrldqi512_byteshift">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
-}
-
 // Gather and Scatter ops
 let TargetPrefix = "x86" in {
   def int_x86_avx512_gather_dpd_512  : GCCBuiltin<"__builtin_ia32_gathersiv8df">,
@@ -3226,22 +3389,22 @@ let TargetPrefix = "x86" in {
           GCCBuiltin<"__builtin_ia32_vpconflictsi_512_mask">,
           Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
                     llvm_v16i32_ty, llvm_i16_ty],
-                    []>;
+                    [IntrNoMem]>;
   def int_x86_avx512_mask_conflict_q_512 :
           GCCBuiltin<"__builtin_ia32_vpconflictdi_512_mask">,
           Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
                     llvm_v8i64_ty, llvm_i8_ty],
-                    []>;
+                    [IntrNoMem]>;
   def int_x86_avx512_mask_lzcnt_d_512 :
           GCCBuiltin<"__builtin_ia32_vplzcntd_512_mask">,
           Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
                     llvm_v16i32_ty, llvm_i16_ty],
-                    []>;
+                    [IntrNoMem]>;
   def int_x86_avx512_mask_lzcnt_q_512 :
           GCCBuiltin<"__builtin_ia32_vplzcntq_512_mask">,
           Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
                     llvm_v8i64_ty, llvm_i8_ty],
-                    []>;
+                    [IntrNoMem]>;
 }
 
 // Vector blend
@@ -3250,10 +3413,26 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         Intrinsic<[llvm_v16f32_ty],
                   [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty],
                   [IntrNoMem]>;
+  def int_x86_avx512_mask_blend_ps_256 : GCCBuiltin<"__builtin_ia32_blendmps_256_mask">,
+        Intrinsic<[llvm_v8f32_ty],
+                  [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_blend_ps_128 : GCCBuiltin<"__builtin_ia32_blendmps_128_mask">,
+        Intrinsic<[llvm_v4f32_ty],
+                  [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
   def int_x86_avx512_mask_blend_pd_512 : GCCBuiltin<"__builtin_ia32_blendmpd_512_mask">,
         Intrinsic<[llvm_v8f64_ty],
                   [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty],
                   [IntrNoMem]>;
+  def int_x86_avx512_mask_blend_pd_256 : GCCBuiltin<"__builtin_ia32_blendmpd_256_mask">,
+        Intrinsic<[llvm_v4f64_ty],
+                  [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_blend_pd_128 : GCCBuiltin<"__builtin_ia32_blendmpd_128_mask">,
+        Intrinsic<[llvm_v2f64_ty],
+                  [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
 
   def int_x86_avx512_mask_blend_d_512 : GCCBuiltin<"__builtin_ia32_blendmd_512_mask">,
         Intrinsic<[llvm_v16i32_ty],
@@ -3263,6 +3442,48 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         Intrinsic<[llvm_v8i64_ty],
                   [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
                   [IntrNoMem]>;
+  def int_x86_avx512_mask_blend_d_256 : GCCBuiltin<"__builtin_ia32_blendmd_256_mask">,
+        Intrinsic<[llvm_v8i32_ty],
+                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_blend_q_256 : GCCBuiltin<"__builtin_ia32_blendmq_256_mask">,
+        Intrinsic<[llvm_v4i64_ty],
+                  [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_blend_d_128 : GCCBuiltin<"__builtin_ia32_blendmd_128_mask">,
+        Intrinsic<[llvm_v4i32_ty],
+                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_blend_q_128 : GCCBuiltin<"__builtin_ia32_blendmq_128_mask">,
+        Intrinsic<[llvm_v2i64_ty],
+                  [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+
+   def int_x86_avx512_mask_blend_w_512 : GCCBuiltin<"__builtin_ia32_blendmw_512_mask">,
+        Intrinsic<[llvm_v32i16_ty],
+                  [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
+                  [IntrNoMem]>;
+   def int_x86_avx512_mask_blend_w_256 : GCCBuiltin<"__builtin_ia32_blendmw_256_mask">,
+        Intrinsic<[llvm_v16i16_ty],
+                  [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
+                  [IntrNoMem]>;
+   def int_x86_avx512_mask_blend_w_128 : GCCBuiltin<"__builtin_ia32_blendmw_128_mask">,
+        Intrinsic<[llvm_v8i16_ty],
+                  [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+   def int_x86_avx512_mask_blend_b_512 : GCCBuiltin<"__builtin_ia32_blendmb_512_mask">,
+        Intrinsic<[llvm_v64i8_ty],
+                  [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
+                  [IntrNoMem]>;
+   def int_x86_avx512_mask_blend_b_256 : GCCBuiltin<"__builtin_ia32_blendmb_256_mask">,
+        Intrinsic<[llvm_v32i8_ty],
+                  [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
+                  [IntrNoMem]>;
+   def int_x86_avx512_mask_blend_b_128 : GCCBuiltin<"__builtin_ia32_blendmb_128_mask">,
+        Intrinsic<[llvm_v16i8_ty],
+                  [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
+                  [IntrNoMem]>;
+
 }
 
 let TargetPrefix = "x86" in {
@@ -3307,29 +3528,29 @@ let TargetPrefix = "x86" in {
                   [IntrNoMem]>;
 
   def int_x86_avx512_mask_cmp_b_512: GCCBuiltin<"__builtin_ia32_cmpb512_mask">,
-        Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i8_ty,
                   llvm_i64_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_w_512: GCCBuiltin<"__builtin_ia32_cmpw512_mask">,
-        Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i8_ty,
                   llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_d_512: GCCBuiltin<"__builtin_ia32_cmpd512_mask">,
-        Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i8_ty,
                   llvm_i16_ty], [IntrNoMem ]>;
   def int_x86_avx512_mask_cmp_q_512: GCCBuiltin<"__builtin_ia32_cmpq512_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
 
   def int_x86_avx512_mask_ucmp_b_512: GCCBuiltin<"__builtin_ia32_ucmpb512_mask">,
-        Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i8_ty,
                   llvm_i64_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_ucmp_w_512: GCCBuiltin<"__builtin_ia32_ucmpw512_mask">,
-        Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i8_ty,
                   llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_ucmp_d_512: GCCBuiltin<"__builtin_ia32_ucmpd512_mask">,
-        Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i8_ty,
                   llvm_i16_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_ucmp_q_512: GCCBuiltin<"__builtin_ia32_ucmpq512_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
 
   // 256-bit
@@ -3360,29 +3581,29 @@ let TargetPrefix = "x86" in {
                   [IntrNoMem]>;
 
   def int_x86_avx512_mask_cmp_b_256: GCCBuiltin<"__builtin_ia32_cmpb256_mask">,
-        Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty,
                   llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_w_256: GCCBuiltin<"__builtin_ia32_cmpw256_mask">,
-        Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i8_ty,
                   llvm_i16_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_d_256: GCCBuiltin<"__builtin_ia32_cmpd256_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_q_256: GCCBuiltin<"__builtin_ia32_cmpq256_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
 
   def int_x86_avx512_mask_ucmp_b_256: GCCBuiltin<"__builtin_ia32_ucmpb256_mask">,
-        Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty,
                   llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_ucmp_w_256: GCCBuiltin<"__builtin_ia32_ucmpw256_mask">,
-        Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i8_ty,
                   llvm_i16_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_ucmp_d_256: GCCBuiltin<"__builtin_ia32_ucmpd256_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_ucmp_q_256: GCCBuiltin<"__builtin_ia32_ucmpq256_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
 
   // 128-bit
@@ -3413,39 +3634,243 @@ let TargetPrefix = "x86" in {
                   [IntrNoMem]>;
 
   def int_x86_avx512_mask_cmp_b_128: GCCBuiltin<"__builtin_ia32_cmpb128_mask">,
-        Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty,
                   llvm_i16_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_w_128: GCCBuiltin<"__builtin_ia32_cmpw128_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_d_128: GCCBuiltin<"__builtin_ia32_cmpd128_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_q_128: GCCBuiltin<"__builtin_ia32_cmpq128_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
 
   def int_x86_avx512_mask_ucmp_b_128: GCCBuiltin<"__builtin_ia32_ucmpb128_mask">,
-        Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty,
                   llvm_i16_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_ucmp_w_128: GCCBuiltin<"__builtin_ia32_ucmpw128_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_ucmp_d_128: GCCBuiltin<"__builtin_ia32_ucmpd128_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_ucmp_q_128: GCCBuiltin<"__builtin_ia32_ucmpq128_mask">,
-        Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
+        Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty,
                   llvm_i8_ty], [IntrNoMem]>;
 }
 
+// Compress, Expand
+let TargetPrefix = "x86" in {
+  def int_x86_avx512_mask_compress_ps_512 :
+                             GCCBuiltin<"__builtin_ia32_compresssf512_mask">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+                   llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_compress_pd_512 :
+                             GCCBuiltin<"__builtin_ia32_compressdf512_mask">,
+        Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_compress_ps_256 :
+                             GCCBuiltin<"__builtin_ia32_compresssf256_mask">,
+        Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_compress_pd_256 :
+                             GCCBuiltin<"__builtin_ia32_compressdf256_mask">,
+        Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_compress_ps_128 :
+                             GCCBuiltin<"__builtin_ia32_compresssf128_mask">,
+        Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_compress_pd_128 :
+                             GCCBuiltin<"__builtin_ia32_compressdf128_mask">,
+        Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_compress_store_ps_512 :
+                            GCCBuiltin<"__builtin_ia32_compressstoresf512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty,
+                   llvm_i16_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_compress_store_pd_512 :
+                            GCCBuiltin<"__builtin_ia32_compressstoredf512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty,
+                   llvm_i8_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_compress_store_ps_256 :
+                            GCCBuiltin<"__builtin_ia32_compressstoresf256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty,
+                   llvm_i8_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_compress_store_pd_256 :
+                            GCCBuiltin<"__builtin_ia32_compressstoredf256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty,
+                   llvm_i8_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_compress_store_ps_128 :
+                            GCCBuiltin<"__builtin_ia32_compressstoresf128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty,
+                   llvm_i8_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_compress_store_pd_128 :
+                            GCCBuiltin<"__builtin_ia32_compressstoredf128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty,
+                   llvm_i8_ty], [IntrReadWriteArgMem]>;
+
+  def int_x86_avx512_mask_compress_d_512 :
+                             GCCBuiltin<"__builtin_ia32_compresssi512_mask">,
+        Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
+                   llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_compress_q_512 :
+                             GCCBuiltin<"__builtin_ia32_compressdi512_mask">,
+        Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_compress_d_256 :
+                             GCCBuiltin<"__builtin_ia32_compresssi256_mask">,
+        Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_compress_q_256 :
+                             GCCBuiltin<"__builtin_ia32_compressdi256_mask">,
+        Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_compress_d_128 :
+                             GCCBuiltin<"__builtin_ia32_compresssi128_mask">,
+        Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_compress_q_128 :
+                             GCCBuiltin<"__builtin_ia32_compressdi128_mask">,
+        Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_compress_store_d_512 :
+                            GCCBuiltin<"__builtin_ia32_compressstoresi512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty,
+                   llvm_i16_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_compress_store_q_512 :
+                            GCCBuiltin<"__builtin_ia32_compressstoredi512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty,
+                   llvm_i8_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_compress_store_d_256 :
+                            GCCBuiltin<"__builtin_ia32_compressstoresi256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty,
+                   llvm_i8_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_compress_store_q_256 :
+                            GCCBuiltin<"__builtin_ia32_compressstoredi256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty,
+                   llvm_i8_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_compress_store_d_128 :
+                            GCCBuiltin<"__builtin_ia32_compressstoresi128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty,
+                   llvm_i8_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_compress_store_q_128 :
+                            GCCBuiltin<"__builtin_ia32_compressstoredi128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty,
+                   llvm_i8_ty], [IntrReadWriteArgMem]>;
+
+// expand
+  def int_x86_avx512_mask_expand_ps_512 :
+                             GCCBuiltin<"__builtin_ia32_expandsf512_mask">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+                   llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_expand_pd_512 :
+                             GCCBuiltin<"__builtin_ia32_expanddf512_mask">,
+        Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_expand_ps_256 :
+                             GCCBuiltin<"__builtin_ia32_expandsf256_mask">,
+        Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_expand_pd_256 :
+                             GCCBuiltin<"__builtin_ia32_expanddf256_mask">,
+        Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_expand_ps_128 :
+                             GCCBuiltin<"__builtin_ia32_expandsf128_mask">,
+        Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_expand_pd_128 :
+                             GCCBuiltin<"__builtin_ia32_expanddf128_mask">,
+        Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_expand_load_ps_512 :
+                            GCCBuiltin<"__builtin_ia32_expandloadsf512_mask">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty,
+                   llvm_i16_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_expand_load_pd_512 :
+                            GCCBuiltin<"__builtin_ia32_expandloaddf512_mask">,
+        Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty,
+                   llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_expand_load_ps_256 :
+                            GCCBuiltin<"__builtin_ia32_expandloadsf256_mask">,
+        Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8f32_ty,
+                   llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_expand_load_pd_256 :
+                            GCCBuiltin<"__builtin_ia32_expandloaddf256_mask">,
+        Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4f64_ty,
+                   llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_expand_load_ps_128 :
+                            GCCBuiltin<"__builtin_ia32_expandloadsf128_mask">,
+        Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4f32_ty,
+                   llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_expand_load_pd_128 :
+                            GCCBuiltin<"__builtin_ia32_expandloaddf128_mask">,
+        Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2f64_ty,
+                   llvm_i8_ty], [IntrReadArgMem]>;
+
+  def int_x86_avx512_mask_expand_d_512 :
+                             GCCBuiltin<"__builtin_ia32_expandsi512_mask">,
+        Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
+                   llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_expand_q_512 :
+                             GCCBuiltin<"__builtin_ia32_expanddi512_mask">,
+        Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_expand_d_256 :
+                             GCCBuiltin<"__builtin_ia32_expandsi256_mask">,
+        Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_expand_q_256 :
+                             GCCBuiltin<"__builtin_ia32_expanddi256_mask">,
+        Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_expand_d_128 :
+                             GCCBuiltin<"__builtin_ia32_expandsi128_mask">,
+        Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_expand_q_128 :
+                             GCCBuiltin<"__builtin_ia32_expanddi128_mask">,
+        Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
+                   llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_expand_load_d_512 :
+                            GCCBuiltin<"__builtin_ia32_expandloadsi512_mask">,
+        Intrinsic<[llvm_v16i32_ty], [llvm_ptr_ty, llvm_v16i32_ty,
+                   llvm_i16_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_expand_load_q_512 :
+                            GCCBuiltin<"__builtin_ia32_expandloaddi512_mask">,
+        Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty, llvm_v8i64_ty,
+                   llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_expand_load_d_256 :
+                            GCCBuiltin<"__builtin_ia32_expandloadsi256_mask">,
+        Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty,
+                   llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_expand_load_q_256 :
+                            GCCBuiltin<"__builtin_ia32_expandloaddi256_mask">,
+        Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty,
+                   llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_expand_load_d_128 :
+                            GCCBuiltin<"__builtin_ia32_expandloadsi128_mask">,
+        Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty,
+                   llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_expand_load_q_128 :
+                            GCCBuiltin<"__builtin_ia32_expandloaddi128_mask">,
+        Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty,
+                   llvm_i8_ty], [IntrReadArgMem]>;
+
+}
 // Misc.
 let TargetPrefix = "x86" in {
   def int_x86_avx512_mask_cmp_ps_512 : GCCBuiltin<"__builtin_ia32_cmpps512_mask">,
-            Intrinsic<[llvm_i16_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty,
+            Intrinsic<[llvm_i16_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i8_ty,
                                       llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_pd_512 : GCCBuiltin<"__builtin_ia32_cmppd512_mask">,
-            Intrinsic<[llvm_i8_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty,
+            Intrinsic<[llvm_i8_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
                                       llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_pand_d_512 : GCCBuiltin<"__builtin_ia32_pandd512_mask">,
             Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h
index 2f18782..8c04b08 100644
--- a/include/llvm/IR/LLVMContext.h
+++ b/include/llvm/IR/LLVMContext.h
@@ -176,8 +176,8 @@ public:
   }
 
 private:
-  LLVMContext(LLVMContext&) LLVM_DELETED_FUNCTION;
-  void operator=(LLVMContext&) LLVM_DELETED_FUNCTION;
+  LLVMContext(LLVMContext&) = delete;
+  void operator=(LLVMContext&) = delete;
 
   /// addModule - Register a module as being instantiated in this context.  If
   /// the context is deleted, the module will be deleted as well.
diff --git a/include/llvm/IR/LeakDetector.h b/include/llvm/IR/LeakDetector.h
deleted file mode 100644
index cb18df8..0000000
--- a/include/llvm/IR/LeakDetector.h
+++ /dev/null
@@ -1,92 +0,0 @@
-//===- LeakDetector.h - Provide leak detection ------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a class that can be used to provide very simple memory leak
-// checks for an API.  Basically LLVM uses this to make sure that Instructions,
-// for example, are deleted when they are supposed to be, and not leaked away.
-//
-// When compiling with NDEBUG (Release build), this class does nothing, thus
-// adding no checking overhead to release builds.  Note that this class is
-// implemented in a very simple way, requiring completely manual manipulation
-// and checking for garbage, but this is intentional: users should not be using
-// this API, only other APIs should.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_IR_LEAKDETECTOR_H
-#define LLVM_IR_LEAKDETECTOR_H
-
-#include <string>
-
-namespace llvm {
-
-class LLVMContext;
-class Value;
-
-struct LeakDetector {
-  /// addGarbageObject - Add a pointer to the internal set of "garbage" object
-  /// pointers.  This should be called when objects are created, or if they are
-  /// taken out of an owning collection.
-  ///
-  static void addGarbageObject(void *Object) {
-#ifndef NDEBUG
-    addGarbageObjectImpl(Object);
-#endif
-  }
-
-  /// removeGarbageObject - Remove a pointer from our internal representation of
-  /// our "garbage" objects.  This should be called when an object is added to
-  /// an "owning" collection.
-  ///
-  static void removeGarbageObject(void *Object) {
-#ifndef NDEBUG
-    removeGarbageObjectImpl(Object);
-#endif
-  }
-
-  /// checkForGarbage - Traverse the internal representation of garbage
-  /// pointers.  If there are any pointers that have been add'ed, but not
-  /// remove'd, big obnoxious warnings about memory leaks are issued.
-  ///
-  /// The specified message will be printed indicating when the check was
-  /// performed.
-  ///
-  static void checkForGarbage(LLVMContext &C, const std::string &Message) {
-#ifndef NDEBUG
-    checkForGarbageImpl(C, Message);
-#endif
-  }
-
-  /// Overload the normal methods to work better with Value*'s because they are
-  /// by far the most common in LLVM.  This does not affect the actual
-  /// functioning of this class, it just makes the warning messages nicer.
-  ///
-  static void addGarbageObject(const Value *Object) {
-#ifndef NDEBUG
-    addGarbageObjectImpl(Object);
-#endif
-  }
-  static void removeGarbageObject(const Value *Object) {
-#ifndef NDEBUG
-    removeGarbageObjectImpl(Object);
-#endif
-  }
-
-private:
-  // If we are debugging, the actual implementations will be called...
-  static void addGarbageObjectImpl(const Value *Object);
-  static void removeGarbageObjectImpl(const Value *Object);
-  static void addGarbageObjectImpl(void *Object);
-  static void removeGarbageObjectImpl(void *Object);
-  static void checkForGarbageImpl(LLVMContext &C, const std::string &Message);
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/include/llvm/IR/LegacyPassManager.h b/include/llvm/IR/LegacyPassManager.h
index c967a6b..6c04e9d 100644
--- a/include/llvm/IR/LegacyPassManager.h
+++ b/include/llvm/IR/LegacyPassManager.h
@@ -37,9 +37,10 @@ class PassManagerBase {
 public:
   virtual ~PassManagerBase();
 
-  /// add - Add a pass to the queue of passes to run.  This passes ownership of
+  /// Add a pass to the queue of passes to run.  This passes ownership of
   /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
   /// will be destroyed as well, so there is no need to delete the pass.  This
+  /// may even destroy the pass right away if it is found to be redundant. This
   /// implies that all passes MUST be allocated with 'new'.
   virtual void add(Pass *P) = 0;
 };
@@ -51,10 +52,6 @@ public:
   PassManager();
   ~PassManager();
 
-  /// add - Add a pass to the queue of passes to run.  This passes ownership of
-  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-  /// will be destroyed as well, so there is no need to delete the pass.  This
-  /// implies that all passes MUST be allocated with 'new'.
   void add(Pass *P) override;
 
   /// run - Execute all of the passes scheduled for execution.  Keep track of
@@ -75,11 +72,6 @@ public:
   explicit FunctionPassManager(Module *M);
   ~FunctionPassManager();
 
-  /// add - Add a pass to the queue of passes to run.  This passes
-  /// ownership of the Pass to the PassManager.  When the
-  /// PassManager_X is destroyed, the pass will be destroyed as well, so
-  /// there is no need to delete the pass.
-  /// This implies that all passes MUST be allocated with 'new'.
   void add(Pass *P) override;
 
   /// run - Execute all of the passes scheduled for execution.  Keep
diff --git a/include/llvm/IR/LegacyPassManagers.h b/include/llvm/IR/LegacyPassManagers.h
index ab500a1..7f7889a 100644
--- a/include/llvm/IR/LegacyPassManagers.h
+++ b/include/llvm/IR/LegacyPassManagers.h
@@ -195,6 +195,9 @@ public:
   /// then return NULL.
   Pass *findAnalysisPass(AnalysisID AID);
 
+  /// Retrieve the PassInfo for an analysis.
+  const PassInfo *findAnalysisPassInfo(AnalysisID AID) const;
+
   /// Find analysis usage information for the pass P.
   AnalysisUsage *findAnalysisUsage(Pass *P);
 
@@ -251,6 +254,12 @@ private:
   SmallVector<ImmutablePass *, 16> ImmutablePasses;
 
   DenseMap<Pass *, AnalysisUsage *> AnUsageMap;
+
+  /// Collection of PassInfo objects found via analysis IDs and in this top
+  /// level manager. This is used to memoize queries to the pass registry.
+  /// FIXME: This is an egregious hack because querying the pass registry is
+  /// either slow or racy.
+  mutable DenseMap<AnalysisID, const PassInfo *> AnalysisPassInfos;
 };
 
 
diff --git a/include/llvm/IR/LegacyPassNameParser.h b/include/llvm/IR/LegacyPassNameParser.h
index e2e4912..52db1c3 100644
--- a/include/llvm/IR/LegacyPassNameParser.h
+++ b/include/llvm/IR/LegacyPassNameParser.h
@@ -41,14 +41,12 @@ namespace llvm {
 //
 class PassNameParser : public PassRegistrationListener,
                        public cl::parser<const PassInfo*> {
-  cl::Option *Opt;
 public:
-  PassNameParser();
+  PassNameParser(cl::Option &O);
   virtual ~PassNameParser();
 
-  void initialize(cl::Option &O) {
-    Opt = &O;
-    cl::parser<const PassInfo*>::initialize(O);
+  void initialize() {
+    cl::parser<const PassInfo*>::initialize();
 
     // Add all of the passes to the map that got initialized before 'this' did.
     enumeratePasses();
@@ -69,7 +67,7 @@ public:
   // Implement the PassRegistrationListener callbacks used to populate our map
   //
   void passRegistered(const PassInfo *P) override {
-    if (ignorablePass(P) || !Opt) return;
+    if (ignorablePass(P)) return;
     if (findOption(P->getPassArgument()) != getNumOptions()) {
       errs() << "Two passes with the same argument (-"
            << P->getPassArgument() << ") attempted to be registered!\n";
diff --git a/include/llvm/IR/MDBuilder.h b/include/llvm/IR/MDBuilder.h
index d29512c..51b8b02 100644
--- a/include/llvm/IR/MDBuilder.h
+++ b/include/llvm/IR/MDBuilder.h
@@ -24,6 +24,8 @@ namespace llvm {
 class APInt;
 template <typename T> class ArrayRef;
 class LLVMContext;
+class Constant;
+class ConstantAsMetadata;
 class MDNode;
 class MDString;
 
@@ -36,6 +38,9 @@ public:
   /// \brief Return the given string as metadata.
   MDString *createString(StringRef Str);
 
+  /// \brief Return the given constant as metadata.
+  ConstantAsMetadata *createConstant(Constant *C);
+
   //===------------------------------------------------------------------===//
   // FPMath metadata.
   //===------------------------------------------------------------------===//
@@ -62,6 +67,9 @@ public:
   /// \brief Return metadata describing the range [Lo, Hi).
   MDNode *createRange(const APInt &Lo, const APInt &Hi);
 
+  /// \brief Return metadata describing the range [Lo, Hi).
+  MDNode *createRange(Constant *Lo, Constant *Hi);
+
   //===------------------------------------------------------------------===//
   // AA metadata.
   //===------------------------------------------------------------------===//
diff --git a/include/llvm/IR/Metadata.def b/include/llvm/IR/Metadata.def
new file mode 100644
index 0000000..66589e0
--- /dev/null
+++ b/include/llvm/IR/Metadata.def
@@ -0,0 +1,99 @@
+//===- llvm/IR/Metadata.def - Metadata definitions --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Macros for running through all types of metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#if !(defined HANDLE_METADATA || defined HANDLE_METADATA_LEAF ||               \
+      defined HANDLE_METADATA_BRANCH || defined HANDLE_MDNODE_LEAF ||          \
+      defined HANDLE_MDNODE_BRANCH ||                                          \
+      defined HANDLE_SPECIALIZED_MDNODE_LEAF ||                                \
+      defined HANDLE_SPECIALIZED_MDNODE_BRANCH)
+#error "Missing macro definition of HANDLE_METADATA*"
+#endif
+
+// Handler for all types of metadata.
+#ifndef HANDLE_METADATA
+#define HANDLE_METADATA(CLASS)
+#endif
+
+// Handler for leaf nodes in the class hierarchy.
+#ifndef HANDLE_METADATA_LEAF
+#define HANDLE_METADATA_LEAF(CLASS) HANDLE_METADATA(CLASS)
+#endif
+
+// Handler for non-leaf nodes in the class hierarchy.
+#ifndef HANDLE_METADATA_BRANCH
+#define HANDLE_METADATA_BRANCH(CLASS) HANDLE_METADATA(CLASS)
+#endif
+
+// Handler for leaf nodes under MDNode.
+#ifndef HANDLE_MDNODE_LEAF
+#define HANDLE_MDNODE_LEAF(CLASS) HANDLE_METADATA_LEAF(CLASS)
+#endif
+
+// Handler for non-leaf nodes under MDNode.
+#ifndef HANDLE_MDNODE_BRANCH
+#define HANDLE_MDNODE_BRANCH(CLASS) HANDLE_METADATA_BRANCH(CLASS)
+#endif
+
+// Handler for specialized leaf nodes under MDNode.
+#ifndef HANDLE_SPECIALIZED_MDNODE_LEAF
+#define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) HANDLE_MDNODE_LEAF(CLASS)
+#endif
+
+// Handler for specialized non-leaf nodes under MDNode.
+#ifndef HANDLE_SPECIALIZED_MDNODE_BRANCH
+#define HANDLE_SPECIALIZED_MDNODE_BRANCH(CLASS) HANDLE_MDNODE_BRANCH(CLASS)
+#endif
+
+HANDLE_METADATA_LEAF(MDString)
+HANDLE_METADATA_BRANCH(ValueAsMetadata)
+HANDLE_METADATA_LEAF(ConstantAsMetadata)
+HANDLE_METADATA_LEAF(LocalAsMetadata)
+HANDLE_MDNODE_BRANCH(MDNode)
+HANDLE_MDNODE_LEAF(MDTuple)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDLocation)
+HANDLE_SPECIALIZED_MDNODE_BRANCH(DebugNode)
+HANDLE_SPECIALIZED_MDNODE_LEAF(GenericDebugNode)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDSubrange)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDEnumerator)
+HANDLE_SPECIALIZED_MDNODE_BRANCH(MDScope)
+HANDLE_SPECIALIZED_MDNODE_BRANCH(MDType)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDBasicType)
+HANDLE_SPECIALIZED_MDNODE_BRANCH(MDDerivedTypeBase)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDDerivedType)
+HANDLE_SPECIALIZED_MDNODE_BRANCH(MDCompositeTypeBase)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDCompositeType)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDSubroutineType)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDFile)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDCompileUnit)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDSubprogram)
+HANDLE_SPECIALIZED_MDNODE_BRANCH(MDLexicalBlockBase)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDLexicalBlock)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDLexicalBlockFile)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDNamespace)
+HANDLE_SPECIALIZED_MDNODE_BRANCH(MDTemplateParameter)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDTemplateTypeParameter)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDTemplateValueParameter)
+HANDLE_SPECIALIZED_MDNODE_BRANCH(MDVariable)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDGlobalVariable)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDLocalVariable)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDExpression)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDObjCProperty)
+HANDLE_SPECIALIZED_MDNODE_LEAF(MDImportedEntity)
+
+#undef HANDLE_METADATA
+#undef HANDLE_METADATA_LEAF
+#undef HANDLE_METADATA_BRANCH
+#undef HANDLE_MDNODE_LEAF
+#undef HANDLE_MDNODE_BRANCH
+#undef HANDLE_SPECIALIZED_MDNODE_LEAF
+#undef HANDLE_SPECIALIZED_MDNODE_BRANCH
diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h
index a056b0d..ff8f837 100644
--- a/include/llvm/IR/Metadata.h
+++ b/include/llvm/IR/Metadata.h
@@ -1,4 +1,4 @@
-//===-- llvm/Metadata.h - Metadata definitions ------------------*- C++ -*-===//
+//===- llvm/IR/Metadata.h - Metadata definitions ----------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -18,11 +18,13 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/ilist_node.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/MetadataTracking.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <type_traits>
 
 namespace llvm {
 class LLVMContext;
@@ -38,20 +40,422 @@ enum LLVMConstants : uint32_t {
 /// \brief Root of the metadata hierarchy.
 ///
 /// This is a root class for typeless data in the IR.
-///
-/// TODO: Detach from the Value hierarchy.
-class Metadata : public Value {
+class Metadata {
+  friend class ReplaceableMetadataImpl;
+
+  /// \brief RTTI.
+  const unsigned char SubclassID;
+
+protected:
+  /// \brief Active type of storage.
+  enum StorageType { Uniqued, Distinct, Temporary };
+
+  /// \brief Storage flag for non-uniqued, otherwise unowned, metadata.
+  unsigned Storage : 2;
+  // TODO: expose remaining bits to subclasses.
+
+  unsigned short SubclassData16;
+  unsigned SubclassData32;
+
+public:
+  enum MetadataKind {
+    MDTupleKind,
+    MDLocationKind,
+    GenericDebugNodeKind,
+    MDSubrangeKind,
+    MDEnumeratorKind,
+    MDBasicTypeKind,
+    MDDerivedTypeKind,
+    MDCompositeTypeKind,
+    MDSubroutineTypeKind,
+    MDFileKind,
+    MDCompileUnitKind,
+    MDSubprogramKind,
+    MDLexicalBlockKind,
+    MDLexicalBlockFileKind,
+    MDNamespaceKind,
+    MDTemplateTypeParameterKind,
+    MDTemplateValueParameterKind,
+    MDGlobalVariableKind,
+    MDLocalVariableKind,
+    MDExpressionKind,
+    MDObjCPropertyKind,
+    MDImportedEntityKind,
+    ConstantAsMetadataKind,
+    LocalAsMetadataKind,
+    MDStringKind
+  };
+
 protected:
-  Metadata(LLVMContext &Context, unsigned ID);
+  Metadata(unsigned ID, StorageType Storage)
+      : SubclassID(ID), Storage(Storage), SubclassData16(0), SubclassData32(0) {
+  }
+  ~Metadata() {}
+
+  /// \brief Default handling of a changed operand, which asserts.
+  ///
+  /// If subclasses pass themselves in as owners to a tracking node reference,
+  /// they must provide an implementation of this method.
+  void handleChangedOperand(void *, Metadata *) {
+    llvm_unreachable("Unimplemented in Metadata subclass");
+  }
+
+public:
+  unsigned getMetadataID() const { return SubclassID; }
+
+  /// \brief User-friendly dump.
+  void dump() const;
+  void print(raw_ostream &OS) const;
+  void printAsOperand(raw_ostream &OS, bool PrintType = true,
+                      const Module *M = nullptr) const;
+};
+
+#define HANDLE_METADATA(CLASS) class CLASS;
+#include "llvm/IR/Metadata.def"
+
+inline raw_ostream &operator<<(raw_ostream &OS, const Metadata &MD) {
+  MD.print(OS);
+  return OS;
+}
+
+/// \brief Metadata wrapper in the Value hierarchy.
+///
+/// A member of the \a Value hierarchy to represent a reference to metadata.
+/// This allows, e.g., instrinsics to have metadata as operands.
+///
+/// Notably, this is the only thing in either hierarchy that is allowed to
+/// reference \a LocalAsMetadata.
+class MetadataAsValue : public Value {
+  friend class ReplaceableMetadataImpl;
+  friend class LLVMContextImpl;
+
+  Metadata *MD;
+
+  MetadataAsValue(Type *Ty, Metadata *MD);
+  ~MetadataAsValue();
+
+  /// \brief Drop use of metadata (during teardown).
+  void dropUse() { MD = nullptr; }
 
 public:
+  static MetadataAsValue *get(LLVMContext &Context, Metadata *MD);
+  static MetadataAsValue *getIfExists(LLVMContext &Context, Metadata *MD);
+  Metadata *getMetadata() const { return MD; }
+
   static bool classof(const Value *V) {
-    return V->getValueID() == GenericMDNodeVal ||
-           V->getValueID() == MDNodeFwdDeclVal ||
-           V->getValueID() == MDStringVal;
+    return V->getValueID() == MetadataAsValueVal;
   }
+
+private:
+  void handleChangedMetadata(Metadata *MD);
+  void track();
+  void untrack();
 };
 
+/// \brief Shared implementation of use-lists for replaceable metadata.
+///
+/// Most metadata cannot be RAUW'ed.  This is a shared implementation of
+/// use-lists and associated API for the two that support it (\a ValueAsMetadata
+/// and \a TempMDNode).
+class ReplaceableMetadataImpl {
+  friend class MetadataTracking;
+
+public:
+  typedef MetadataTracking::OwnerTy OwnerTy;
+
+private:
+  LLVMContext &Context;
+  uint64_t NextIndex;
+  SmallDenseMap<void *, std::pair<OwnerTy, uint64_t>, 4> UseMap;
+
+public:
+  ReplaceableMetadataImpl(LLVMContext &Context)
+      : Context(Context), NextIndex(0) {}
+  ~ReplaceableMetadataImpl() {
+    assert(UseMap.empty() && "Cannot destroy in-use replaceable metadata");
+  }
+
+  LLVMContext &getContext() const { return Context; }
+
+  /// \brief Replace all uses of this with MD.
+  ///
+  /// Replace all uses of this with \c MD, which is allowed to be null.
+  void replaceAllUsesWith(Metadata *MD);
+
+  /// \brief Resolve all uses of this.
+  ///
+  /// Resolve all uses of this, turning off RAUW permanently.  If \c
+  /// ResolveUsers, call \a MDNode::resolve() on any users whose last operand
+  /// is resolved.
+  void resolveAllUses(bool ResolveUsers = true);
+
+private:
+  void addRef(void *Ref, OwnerTy Owner);
+  void dropRef(void *Ref);
+  void moveRef(void *Ref, void *New, const Metadata &MD);
+
+  static ReplaceableMetadataImpl *get(Metadata &MD);
+};
+
+/// \brief Value wrapper in the Metadata hierarchy.
+///
+/// This is a custom value handle that allows other metadata to refer to
+/// classes in the Value hierarchy.
+///
+/// Because of full uniquing support, each value is only wrapped by a single \a
+/// ValueAsMetadata object, so the lookup maps are far more efficient than
+/// those using ValueHandleBase.
+class ValueAsMetadata : public Metadata, ReplaceableMetadataImpl {
+  friend class ReplaceableMetadataImpl;
+  friend class LLVMContextImpl;
+
+  Value *V;
+
+  /// \brief Drop users without RAUW (during teardown).
+  void dropUsers() {
+    ReplaceableMetadataImpl::resolveAllUses(/* ResolveUsers */ false);
+  }
+
+protected:
+  ValueAsMetadata(unsigned ID, Value *V)
+      : Metadata(ID, Uniqued), ReplaceableMetadataImpl(V->getContext()), V(V) {
+    assert(V && "Expected valid value");
+  }
+  ~ValueAsMetadata() {}
+
+public:
+  static ValueAsMetadata *get(Value *V);
+  static ConstantAsMetadata *getConstant(Value *C) {
+    return cast<ConstantAsMetadata>(get(C));
+  }
+  static LocalAsMetadata *getLocal(Value *Local) {
+    return cast<LocalAsMetadata>(get(Local));
+  }
+
+  static ValueAsMetadata *getIfExists(Value *V);
+  static ConstantAsMetadata *getConstantIfExists(Value *C) {
+    return cast_or_null<ConstantAsMetadata>(getIfExists(C));
+  }
+  static LocalAsMetadata *getLocalIfExists(Value *Local) {
+    return cast_or_null<LocalAsMetadata>(getIfExists(Local));
+  }
+
+  Value *getValue() const { return V; }
+  Type *getType() const { return V->getType(); }
+  LLVMContext &getContext() const { return V->getContext(); }
+
+  static void handleDeletion(Value *V);
+  static void handleRAUW(Value *From, Value *To);
+
+protected:
+  /// \brief Handle collisions after \a Value::replaceAllUsesWith().
+  ///
+  /// RAUW isn't supported directly for \a ValueAsMetadata, but if the wrapped
+  /// \a Value gets RAUW'ed and the target already exists, this is used to
+  /// merge the two metadata nodes.
+  void replaceAllUsesWith(Metadata *MD) {
+    ReplaceableMetadataImpl::replaceAllUsesWith(MD);
+  }
+
+public:
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == LocalAsMetadataKind ||
+           MD->getMetadataID() == ConstantAsMetadataKind;
+  }
+};
+
+class ConstantAsMetadata : public ValueAsMetadata {
+  friend class ValueAsMetadata;
+
+  ConstantAsMetadata(Constant *C)
+      : ValueAsMetadata(ConstantAsMetadataKind, C) {}
+
+public:
+  static ConstantAsMetadata *get(Constant *C) {
+    return ValueAsMetadata::getConstant(C);
+  }
+  static ConstantAsMetadata *getIfExists(Constant *C) {
+    return ValueAsMetadata::getConstantIfExists(C);
+  }
+
+  Constant *getValue() const {
+    return cast<Constant>(ValueAsMetadata::getValue());
+  }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == ConstantAsMetadataKind;
+  }
+};
+
+class LocalAsMetadata : public ValueAsMetadata {
+  friend class ValueAsMetadata;
+
+  LocalAsMetadata(Value *Local)
+      : ValueAsMetadata(LocalAsMetadataKind, Local) {
+    assert(!isa<Constant>(Local) && "Expected local value");
+  }
+
+public:
+  static LocalAsMetadata *get(Value *Local) {
+    return ValueAsMetadata::getLocal(Local);
+  }
+  static LocalAsMetadata *getIfExists(Value *Local) {
+    return ValueAsMetadata::getLocalIfExists(Local);
+  }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == LocalAsMetadataKind;
+  }
+};
+
+/// \brief Transitional API for extracting constants from Metadata.
+///
+/// This namespace contains transitional functions for metadata that points to
+/// \a Constants.
+///
+/// In prehistory -- when metadata was a subclass of \a Value -- \a MDNode
+/// operands could refer to any \a Value.  There's was a lot of code like this:
+///
+/// \code
+///     MDNode *N = ...;
+///     auto *CI = dyn_cast<ConstantInt>(N->getOperand(2));
+/// \endcode
+///
+/// Now that \a Value and \a Metadata are in separate hierarchies, maintaining
+/// the semantics for \a isa(), \a cast(), \a dyn_cast() (etc.) requires three
+/// steps: cast in the \a Metadata hierarchy, extraction of the \a Value, and
+/// cast in the \a Value hierarchy.  Besides creating boiler-plate, this
+/// requires subtle control flow changes.
+///
+/// The end-goal is to create a new type of metadata, called (e.g.) \a MDInt,
+/// so that metadata can refer to numbers without traversing a bridge to the \a
+/// Value hierarchy.  In this final state, the code above would look like this:
+///
+/// \code
+///     MDNode *N = ...;
+///     auto *MI = dyn_cast<MDInt>(N->getOperand(2));
+/// \endcode
+///
+/// The API in this namespace supports the transition.  \a MDInt doesn't exist
+/// yet, and even once it does, changing each metadata schema to use it is its
+/// own mini-project.  In the meantime this API prevents us from introducing
+/// complex and bug-prone control flow that will disappear in the end.  In
+/// particular, the above code looks like this:
+///
+/// \code
+///     MDNode *N = ...;
+///     auto *CI = mdconst::dyn_extract<ConstantInt>(N->getOperand(2));
+/// \endcode
+///
+/// The full set of provided functions includes:
+///
+///   mdconst::hasa                <=> isa
+///   mdconst::extract             <=> cast
+///   mdconst::extract_or_null     <=> cast_or_null
+///   mdconst::dyn_extract         <=> dyn_cast
+///   mdconst::dyn_extract_or_null <=> dyn_cast_or_null
+///
+/// The target of the cast must be a subclass of \a Constant.
+namespace mdconst {
+
+namespace detail {
+template <class T> T &make();
+template <class T, class Result> struct HasDereference {
+  typedef char Yes[1];
+  typedef char No[2];
+  template <size_t N> struct SFINAE {};
+
+  template <class U, class V>
+  static Yes &hasDereference(SFINAE<sizeof(static_cast<V>(*make<U>()))> * = 0);
+  template <class U, class V> static No &hasDereference(...);
+
+  static const bool value =
+      sizeof(hasDereference<T, Result>(nullptr)) == sizeof(Yes);
+};
+template <class V, class M> struct IsValidPointer {
+  static const bool value = std::is_base_of<Constant, V>::value &&
+                            HasDereference<M, const Metadata &>::value;
+};
+template <class V, class M> struct IsValidReference {
+  static const bool value = std::is_base_of<Constant, V>::value &&
+                            std::is_convertible<M, const Metadata &>::value;
+};
+} // end namespace detail
+
+/// \brief Check whether Metadata has a Value.
+///
+/// As an analogue to \a isa(), check whether \c MD has an \a Value inside of
+/// type \c X.
+template <class X, class Y>
+inline typename std::enable_if<detail::IsValidPointer<X, Y>::value, bool>::type
+hasa(Y &&MD) {
+  assert(MD && "Null pointer sent into hasa");
+  if (auto *V = dyn_cast<ConstantAsMetadata>(MD))
+    return isa<X>(V->getValue());
+  return false;
+}
+template <class X, class Y>
+inline
+    typename std::enable_if<detail::IsValidReference<X, Y &>::value, bool>::type
+    hasa(Y &MD) {
+  return hasa(&MD);
+}
+
+/// \brief Extract a Value from Metadata.
+///
+/// As an analogue to \a cast(), extract the \a Value subclass \c X from \c MD.
+template <class X, class Y>
+inline typename std::enable_if<detail::IsValidPointer<X, Y>::value, X *>::type
+extract(Y &&MD) {
+  return cast<X>(cast<ConstantAsMetadata>(MD)->getValue());
+}
+template <class X, class Y>
+inline
+    typename std::enable_if<detail::IsValidReference<X, Y &>::value, X *>::type
+    extract(Y &MD) {
+  return extract(&MD);
+}
+
+/// \brief Extract a Value from Metadata, allowing null.
+///
+/// As an analogue to \a cast_or_null(), extract the \a Value subclass \c X
+/// from \c MD, allowing \c MD to be null.
+template <class X, class Y>
+inline typename std::enable_if<detail::IsValidPointer<X, Y>::value, X *>::type
+extract_or_null(Y &&MD) {
+  if (auto *V = cast_or_null<ConstantAsMetadata>(MD))
+    return cast<X>(V->getValue());
+  return nullptr;
+}
+
+/// \brief Extract a Value from Metadata, if any.
+///
+/// As an analogue to \a dyn_cast_or_null(), extract the \a Value subclass \c X
+/// from \c MD, return null if \c MD doesn't contain a \a Value or if the \a
+/// Value it does contain is of the wrong subclass.
+template <class X, class Y>
+inline typename std::enable_if<detail::IsValidPointer<X, Y>::value, X *>::type
+dyn_extract(Y &&MD) {
+  if (auto *V = dyn_cast<ConstantAsMetadata>(MD))
+    return dyn_cast<X>(V->getValue());
+  return nullptr;
+}
+
+/// \brief Extract a Value from Metadata, if any, allowing null.
+///
+/// As an analogue to \a dyn_cast_or_null(), extract the \a Value subclass \c X
+/// from \c MD, return null if \c MD doesn't contain a \a Value or if the \a
+/// Value it does contain is of the wrong subclass, allowing \c MD to be null.
+template <class X, class Y>
+inline typename std::enable_if<detail::IsValidPointer<X, Y>::value, X *>::type
+dyn_extract_or_null(Y &&MD) {
+  if (auto *V = dyn_cast_or_null<ConstantAsMetadata>(MD))
+    return dyn_cast<X>(V->getValue());
+  return nullptr;
+}
+
+} // end namespace mdconst
+
 //===----------------------------------------------------------------------===//
 /// \brief A single uniqued string.
 ///
@@ -60,14 +464,13 @@ public:
 class MDString : public Metadata {
   friend class StringMapEntry<MDString>;
 
-  virtual void anchor();
-  MDString(const MDString &) LLVM_DELETED_FUNCTION;
-
-  explicit MDString(LLVMContext &Context)
-      : Metadata(Context, Value::MDStringVal) {}
+  MDString(const MDString &) = delete;
+  MDString &operator=(MDString &&) = delete;
+  MDString &operator=(const MDString &) = delete;
 
-  /// \brief Shadow Value::getName() to prevent its use.
-  StringRef getName() const LLVM_DELETED_FUNCTION;
+  StringMapEntry<MDString> *Entry;
+  MDString() : Metadata(MDStringKind, Uniqued), Entry(nullptr) {}
+  MDString(MDString &&) : Metadata(MDStringKind, Uniqued) {}
 
 public:
   static MDString *get(LLVMContext &Context, StringRef Str);
@@ -87,9 +490,12 @@ public:
   /// \brief Pointer to one byte past the end of the string.
   iterator end() const { return getString().end(); }
 
+  const unsigned char *bytes_begin() const { return getString().bytes_begin(); }
+  const unsigned char *bytes_end() const { return getString().bytes_end(); }
+
   /// \brief Methods for support type inquiry through isa, cast, and dyn_cast.
-  static bool classof(const Value *V) {
-    return V->getValueID() == MDStringVal;
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDStringKind;
   }
 };
 
@@ -106,7 +512,7 @@ struct AAMDNodes {
 
   bool operator!=(const AAMDNodes &A) const { return !(*this == A); }
 
-  LLVM_EXPLICIT operator bool() const { return TBAA || Scope || NoAlias; }
+  explicit operator bool() const { return TBAA || Scope || NoAlias; }
 
   /// \brief The tag for type-based alias analysis.
   MDNode *TBAA;
@@ -137,21 +543,171 @@ struct DenseMapInfo<AAMDNodes> {
   }
 };
 
-class MDNodeOperand;
+/// \brief Tracking metadata reference owned by Metadata.
+///
+/// Similar to \a TrackingMDRef, but it's expected to be owned by an instance
+/// of \a Metadata, which has the option of registering itself for callbacks to
+/// re-unique itself.
+///
+/// In particular, this is used by \a MDNode.
+class MDOperand {
+  MDOperand(MDOperand &&) = delete;
+  MDOperand(const MDOperand &) = delete;
+  MDOperand &operator=(MDOperand &&) = delete;
+  MDOperand &operator=(const MDOperand &) = delete;
 
-//===----------------------------------------------------------------------===//
-/// \brief Tuple of metadata.
+  Metadata *MD;
+
+public:
+  MDOperand() : MD(nullptr) {}
+  ~MDOperand() { untrack(); }
+
+  Metadata *get() const { return MD; }
+  operator Metadata *() const { return get(); }
+  Metadata *operator->() const { return get(); }
+  Metadata &operator*() const { return *get(); }
+
+  void reset() {
+    untrack();
+    MD = nullptr;
+  }
+  void reset(Metadata *MD, Metadata *Owner) {
+    untrack();
+    this->MD = MD;
+    track(Owner);
+  }
+
+private:
+  void track(Metadata *Owner) {
+    if (MD) {
+      if (Owner)
+        MetadataTracking::track(this, *MD, *Owner);
+      else
+        MetadataTracking::track(MD);
+    }
+  }
+  void untrack() {
+    assert(static_cast<void *>(this) == &MD && "Expected same address");
+    if (MD)
+      MetadataTracking::untrack(MD);
+  }
+};
+
+template <> struct simplify_type<MDOperand> {
+  typedef Metadata *SimpleType;
+  static SimpleType getSimplifiedValue(MDOperand &MD) { return MD.get(); }
+};
+
+template <> struct simplify_type<const MDOperand> {
+  typedef Metadata *SimpleType;
+  static SimpleType getSimplifiedValue(const MDOperand &MD) { return MD.get(); }
+};
+
+/// \brief Pointer to the context, with optional RAUW support.
+///
+/// Either a raw (non-null) pointer to the \a LLVMContext, or an owned pointer
+/// to \a ReplaceableMetadataImpl (which has a reference to \a LLVMContext).
+class ContextAndReplaceableUses {
+  PointerUnion<LLVMContext *, ReplaceableMetadataImpl *> Ptr;
+
+  ContextAndReplaceableUses() = delete;
+  ContextAndReplaceableUses(ContextAndReplaceableUses &&) = delete;
+  ContextAndReplaceableUses(const ContextAndReplaceableUses &) = delete;
+  ContextAndReplaceableUses &operator=(ContextAndReplaceableUses &&) = delete;
+  ContextAndReplaceableUses &
+  operator=(const ContextAndReplaceableUses &) = delete;
+
+public:
+  ContextAndReplaceableUses(LLVMContext &Context) : Ptr(&Context) {}
+  ContextAndReplaceableUses(
+      std::unique_ptr<ReplaceableMetadataImpl> ReplaceableUses)
+      : Ptr(ReplaceableUses.release()) {
+    assert(getReplaceableUses() && "Expected non-null replaceable uses");
+  }
+  ~ContextAndReplaceableUses() { delete getReplaceableUses(); }
+
+  operator LLVMContext &() { return getContext(); }
+
+  /// \brief Whether this contains RAUW support.
+  bool hasReplaceableUses() const {
+    return Ptr.is<ReplaceableMetadataImpl *>();
+  }
+  LLVMContext &getContext() const {
+    if (hasReplaceableUses())
+      return getReplaceableUses()->getContext();
+    return *Ptr.get<LLVMContext *>();
+  }
+  ReplaceableMetadataImpl *getReplaceableUses() const {
+    if (hasReplaceableUses())
+      return Ptr.get<ReplaceableMetadataImpl *>();
+    return nullptr;
+  }
+
+  /// \brief Assign RAUW support to this.
+  ///
+  /// Make this replaceable, taking ownership of \c ReplaceableUses (which must
+  /// not be null).
+  void
+  makeReplaceable(std::unique_ptr<ReplaceableMetadataImpl> ReplaceableUses) {
+    assert(ReplaceableUses && "Expected non-null replaceable uses");
+    assert(&ReplaceableUses->getContext() == &getContext() &&
+           "Expected same context");
+    delete getReplaceableUses();
+    Ptr = ReplaceableUses.release();
+  }
+
+  /// \brief Drop RAUW support.
+  ///
+  /// Cede ownership of RAUW support, returning it.
+  std::unique_ptr<ReplaceableMetadataImpl> takeReplaceableUses() {
+    assert(hasReplaceableUses() && "Expected to own replaceable uses");
+    std::unique_ptr<ReplaceableMetadataImpl> ReplaceableUses(
+        getReplaceableUses());
+    Ptr = &ReplaceableUses->getContext();
+    return ReplaceableUses;
+  }
+};
+
+struct TempMDNodeDeleter {
+  inline void operator()(MDNode *Node) const;
+};
+
+#define HANDLE_MDNODE_LEAF(CLASS)                                              \
+  typedef std::unique_ptr<CLASS, TempMDNodeDeleter> Temp##CLASS;
+#define HANDLE_MDNODE_BRANCH(CLASS) HANDLE_MDNODE_LEAF(CLASS)
+#include "llvm/IR/Metadata.def"
+
+/// \brief Metadata node.
+///
+/// Metadata nodes can be uniqued, like constants, or distinct.  Temporary
+/// metadata nodes (with full support for RAUW) can be used to delay uniquing
+/// until forward references are known.  The basic metadata node is an \a
+/// MDTuple.
+///
+/// There is limited support for RAUW at construction time.  At construction
+/// time, if any operand is a temporary node (or an unresolved uniqued node,
+/// which indicates a transitive temporary operand), the node itself will be
+/// unresolved.  As soon as all operands become resolved, it will drop RAUW
+/// support permanently.
+///
+/// If an unresolved node is part of a cycle, \a resolveCycles() needs
+/// to be called on some member of the cycle once all temporary nodes have been
+/// replaced.
 class MDNode : public Metadata {
-  MDNode(const MDNode &) LLVM_DELETED_FUNCTION;
-  void operator=(const MDNode &) LLVM_DELETED_FUNCTION;
-  friend class MDNodeOperand;
+  friend class ReplaceableMetadataImpl;
   friend class LLVMContextImpl;
-  void *operator new(size_t) LLVM_DELETED_FUNCTION;
+
+  MDNode(const MDNode &) = delete;
+  void operator=(const MDNode &) = delete;
+  void *operator new(size_t) = delete;
+
+  unsigned NumOperands;
+  unsigned NumUnresolved;
 
 protected:
-  void *operator new(size_t Size, unsigned NumOps);
+  ContextAndReplaceableUses Context;
 
-  /// \brief Required by std, but never called.
+  void *operator new(size_t Size, unsigned NumOps);
   void operator delete(void *Mem);
 
   /// \brief Required by std, but never called.
@@ -164,83 +720,188 @@ protected:
     llvm_unreachable("Constructor throws?");
   }
 
-  /// \brief Subclass data enums.
-  enum {
-    /// FunctionLocalBit - This bit is set if this MDNode is function local.
-    /// This is true when it (potentially transitively) contains a reference to
-    /// something in a function, like an argument, basicblock, or instruction.
-    FunctionLocalBit = 1 << 0,
+  MDNode(LLVMContext &Context, unsigned ID, StorageType Storage,
+         ArrayRef<Metadata *> Ops1, ArrayRef<Metadata *> Ops2 = None);
+  ~MDNode() {}
 
-    /// NotUniquedBit - This is set on MDNodes that are not uniqued because they
-    /// have a null operand.
-    NotUniquedBit    = 1 << 1
-  };
+  void dropAllReferences();
 
-  /// \brief FunctionLocal enums.
-  enum FunctionLocalness {
-    FL_Unknown = -1,
-    FL_No = 0,
-    FL_Yes = 1
-  };
+  MDOperand *mutable_begin() { return mutable_end() - NumOperands; }
+  MDOperand *mutable_end() { return reinterpret_cast<MDOperand *>(this); }
 
-  /// \brief Replace each instance of the given operand with a new value.
-  void replaceOperand(MDNodeOperand *Op, Value *NewVal);
+public:
+  static inline MDTuple *get(LLVMContext &Context, ArrayRef<Metadata *> MDs);
+  static inline MDTuple *getIfExists(LLVMContext &Context,
+                                     ArrayRef<Metadata *> MDs);
+  static inline MDTuple *getDistinct(LLVMContext &Context,
+                                     ArrayRef<Metadata *> MDs);
+  static inline TempMDTuple getTemporary(LLVMContext &Context,
+                                         ArrayRef<Metadata *> MDs);
 
-  MDNode(LLVMContext &C, unsigned ID, ArrayRef<Value *> Vals,
-         bool isFunctionLocal);
-  ~MDNode() {}
+  /// \brief Create a (temporary) clone of this.
+  TempMDNode clone() const;
 
-  static MDNode *getMDNode(LLVMContext &C, ArrayRef<Value*> Vals,
-                           FunctionLocalness FL, bool Insert = true);
-public:
-  static MDNode *get(LLVMContext &Context, ArrayRef<Value*> Vals);
-  /// \brief Construct MDNode with an explicit function-localness.
+  /// \brief Deallocate a node created by getTemporary.
   ///
-  /// Don't analyze Vals; trust isFunctionLocal.
-  static MDNode *getWhenValsUnresolved(LLVMContext &Context,
-                                       ArrayRef<Value*> Vals,
-                                       bool isFunctionLocal);
+  /// Calls \c replaceAllUsesWith(nullptr) before deleting, so any remaining
+  /// references will be reset.
+  static void deleteTemporary(MDNode *N);
 
-  static MDNode *getIfExists(LLVMContext &Context, ArrayRef<Value*> Vals);
+  LLVMContext &getContext() const { return Context.getContext(); }
+
+  /// \brief Replace a specific operand.
+  void replaceOperandWith(unsigned I, Metadata *New);
 
-  /// \brief Return a temporary MDNode
+  /// \brief Check if node is fully resolved.
   ///
-  /// For use in constructing cyclic MDNode structures. A temporary MDNode is
-  /// not uniqued, may be RAUW'd, and must be manually deleted with
-  /// deleteTemporary.
-  static MDNode *getTemporary(LLVMContext &Context, ArrayRef<Value*> Vals);
+  /// If \a isTemporary(), this always returns \c false; if \a isDistinct(),
+  /// this always returns \c true.
+  ///
+  /// If \a isUniqued(), returns \c true if this has already dropped RAUW
+  /// support (because all operands are resolved).
+  ///
+  /// As forward declarations are resolved, their containers should get
+  /// resolved automatically.  However, if this (or one of its operands) is
+  /// involved in a cycle, \a resolveCycles() needs to be called explicitly.
+  bool isResolved() const { return !Context.hasReplaceableUses(); }
 
-  /// \brief Deallocate a node created by getTemporary.
+  bool isUniqued() const { return Storage == Uniqued; }
+  bool isDistinct() const { return Storage == Distinct; }
+  bool isTemporary() const { return Storage == Temporary; }
+
+  /// \brief RAUW a temporary.
   ///
-  /// The node must not have any users.
-  static void deleteTemporary(MDNode *N);
+  /// \pre \a isTemporary() must be \c true.
+  void replaceAllUsesWith(Metadata *MD) {
+    assert(isTemporary() && "Expected temporary node");
+    assert(!isResolved() && "Expected RAUW support");
+    Context.getReplaceableUses()->replaceAllUsesWith(MD);
+  }
 
-  /// \brief Replace a specific operand.
-  void replaceOperandWith(unsigned i, Value *NewVal);
+  /// \brief Resolve cycles.
+  ///
+  /// Once all forward declarations have been resolved, force cycles to be
+  /// resolved.
+  ///
+  /// \pre No operands (or operands' operands, etc.) have \a isTemporary().
+  void resolveCycles();
 
-  /// \brief Return specified operand.
-  Value *getOperand(unsigned i) const LLVM_READONLY;
+  /// \brief Replace a temporary node with a permanent one.
+  ///
+  /// Try to create a uniqued version of \c N -- in place, if possible -- and
+  /// return it.  If \c N cannot be uniqued, return a distinct node instead.
+  template <class T>
+  static typename std::enable_if<std::is_base_of<MDNode, T>::value, T *>::type
+  replaceWithPermanent(std::unique_ptr<T, TempMDNodeDeleter> N) {
+    return cast<T>(N.release()->replaceWithPermanentImpl());
+  }
 
-  /// \brief Return number of MDNode operands.
-  unsigned getNumOperands() const { return NumOperands; }
+  /// \brief Replace a temporary node with a uniqued one.
+  ///
+  /// Create a uniqued version of \c N -- in place, if possible -- and return
+  /// it.  Takes ownership of the temporary node.
+  ///
+  /// \pre N does not self-reference.
+  template <class T>
+  static typename std::enable_if<std::is_base_of<MDNode, T>::value, T *>::type
+  replaceWithUniqued(std::unique_ptr<T, TempMDNodeDeleter> N) {
+    return cast<T>(N.release()->replaceWithUniquedImpl());
+  }
 
-  /// \brief Return whether MDNode is local to a function.
-  bool isFunctionLocal() const {
-    return (getSubclassDataFromValue() & FunctionLocalBit) != 0;
+  /// \brief Replace a temporary node with a distinct one.
+  ///
+  /// Create a distinct version of \c N -- in place, if possible -- and return
+  /// it.  Takes ownership of the temporary node.
+  template <class T>
+  static typename std::enable_if<std::is_base_of<MDNode, T>::value, T *>::type
+  replaceWithDistinct(std::unique_ptr<T, TempMDNodeDeleter> N) {
+    return cast<T>(N.release()->replaceWithDistinctImpl());
   }
 
-  /// \brief Return the first function-local operand's function.
+private:
+  MDNode *replaceWithPermanentImpl();
+  MDNode *replaceWithUniquedImpl();
+  MDNode *replaceWithDistinctImpl();
+
+protected:
+  /// \brief Set an operand.
+  ///
+  /// Sets the operand directly, without worrying about uniquing.
+  void setOperand(unsigned I, Metadata *New);
+
+  void storeDistinctInContext();
+  template <class T, class StoreT>
+  static T *storeImpl(T *N, StorageType Storage, StoreT &Store);
+
+private:
+  void handleChangedOperand(void *Ref, Metadata *New);
+
+  void resolve();
+  void resolveAfterOperandChange(Metadata *Old, Metadata *New);
+  void decrementUnresolvedOperandCount();
+  unsigned countUnresolvedOperands();
+
+  /// \brief Mutate this to be "uniqued".
   ///
-  /// If this metadata is function-local and recursively has a function-local
-  /// operand, return the first such operand's parent function.  Otherwise,
-  /// return null. getFunction() should not be used for performance- critical
-  /// code because it recursively visits all the MDNode's operands.
-  const Function *getFunction() const;
+  /// Mutate this so that \a isUniqued().
+  /// \pre \a isTemporary().
+  /// \pre already added to uniquing set.
+  void makeUniqued();
+
+  /// \brief Mutate this to be "distinct".
+  ///
+  /// Mutate this so that \a isDistinct().
+  /// \pre \a isTemporary().
+  void makeDistinct();
+
+  void deleteAsSubclass();
+  MDNode *uniquify();
+  void eraseFromStore();
+
+  template <class NodeTy> struct HasCachedHash;
+  template <class NodeTy>
+  static void dispatchRecalculateHash(NodeTy *N, std::true_type) {
+    N->recalculateHash();
+  }
+  template <class NodeTy>
+  static void dispatchRecalculateHash(NodeTy *N, std::false_type) {}
+  template <class NodeTy>
+  static void dispatchResetHash(NodeTy *N, std::true_type) {
+    N->setHash(0);
+  }
+  template <class NodeTy>
+  static void dispatchResetHash(NodeTy *N, std::false_type) {}
+
+public:
+  typedef const MDOperand *op_iterator;
+  typedef iterator_range<op_iterator> op_range;
+
+  op_iterator op_begin() const {
+    return const_cast<MDNode *>(this)->mutable_begin();
+  }
+  op_iterator op_end() const {
+    return const_cast<MDNode *>(this)->mutable_end();
+  }
+  op_range operands() const { return op_range(op_begin(), op_end()); }
+
+  const MDOperand &getOperand(unsigned I) const {
+    assert(I < NumOperands && "Out of range");
+    return op_begin()[I];
+  }
+
+  /// \brief Return number of MDNode operands.
+  unsigned getNumOperands() const { return NumOperands; }
 
   /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
-  static bool classof(const Value *V) {
-    return V->getValueID() == GenericMDNodeVal ||
-           V->getValueID() == MDNodeFwdDeclVal;
+  static bool classof(const Metadata *MD) {
+    switch (MD->getMetadataID()) {
+    default:
+      return false;
+#define HANDLE_MDNODE_LEAF(CLASS)                                              \
+  case CLASS##Kind:                                                            \
+    return true;
+#include "llvm/IR/Metadata.def"
+    }
   }
 
   /// \brief Check whether MDNode is a vtable access.
@@ -250,72 +911,91 @@ public:
   static MDNode *concatenate(MDNode *A, MDNode *B);
   static MDNode *intersect(MDNode *A, MDNode *B);
   static MDNode *getMostGenericTBAA(MDNode *A, MDNode *B);
-  static AAMDNodes getMostGenericAA(const AAMDNodes &A, const AAMDNodes &B);
   static MDNode *getMostGenericFPMath(MDNode *A, MDNode *B);
   static MDNode *getMostGenericRange(MDNode *A, MDNode *B);
-
-protected:
-  bool isNotUniqued() const {
-    return (getSubclassDataFromValue() & NotUniquedBit) != 0;
-  }
-  void setIsNotUniqued();
-
-  // Shadow Value::setValueSubclassData with a private forwarding method so that
-  // any future subclasses cannot accidentally use it.
-  void setValueSubclassData(unsigned short D) {
-    Value::setValueSubclassData(D);
-  }
+  static MDNode *getMostGenericAliasScope(MDNode *A, MDNode *B);
 };
 
-/// \brief Generic metadata node.
-///
-/// Generic metadata nodes, with opt-out support for uniquing.
-///
-/// Although nodes are uniqued by default, \a GenericMDNode has no support for
-/// RAUW.  If an operand change (due to RAUW or otherwise) causes a uniquing
-/// collision, the uniquing bit is dropped.
+/// \brief Tuple of metadata.
 ///
-/// TODO: Make uniquing opt-out (status: mandatory, sometimes dropped).
-/// TODO: Drop support for RAUW.
-class GenericMDNode : public MDNode {
-  friend class MDNode;
+/// This is the simple \a MDNode arbitrary tuple.  Nodes are uniqued by
+/// default based on their operands.
+class MDTuple : public MDNode {
   friend class LLVMContextImpl;
+  friend class MDNode;
+
+  MDTuple(LLVMContext &C, StorageType Storage, unsigned Hash,
+          ArrayRef<Metadata *> Vals)
+      : MDNode(C, MDTupleKind, Storage, Vals) {
+    setHash(Hash);
+  }
+  ~MDTuple() { dropAllReferences(); }
 
-  unsigned Hash;
+  void setHash(unsigned Hash) { SubclassData32 = Hash; }
+  void recalculateHash();
 
-  GenericMDNode(LLVMContext &C, ArrayRef<Value *> Vals, bool isFunctionLocal)
-      : MDNode(C, GenericMDNodeVal, Vals, isFunctionLocal), Hash(0) {}
-  ~GenericMDNode();
+  static MDTuple *getImpl(LLVMContext &Context, ArrayRef<Metadata *> MDs,
+                          StorageType Storage, bool ShouldCreate = true);
 
-  void dropAllReferences();
+  TempMDTuple cloneImpl() const {
+    return getTemporary(getContext(),
+                        SmallVector<Metadata *, 4>(op_begin(), op_end()));
+  }
 
 public:
   /// \brief Get the hash, if any.
-  unsigned getHash() const { return Hash; }
+  unsigned getHash() const { return SubclassData32; }
 
-  static bool classof(const Value *V) {
-    return V->getValueID() == GenericMDNodeVal;
+  static MDTuple *get(LLVMContext &Context, ArrayRef<Metadata *> MDs) {
+    return getImpl(Context, MDs, Uniqued);
+  }
+  static MDTuple *getIfExists(LLVMContext &Context, ArrayRef<Metadata *> MDs) {
+    return getImpl(Context, MDs, Uniqued, /* ShouldCreate */ false);
   }
-};
 
-/// \brief Forward declaration of metadata.
-///
-/// Forward declaration of metadata, in the form of a metadata node.  Unlike \a
-/// GenericMDNode, this class has support for RAUW and is suitable for forward
-/// references.
-class MDNodeFwdDecl : public MDNode {
-  friend class MDNode;
+  /// \brief Return a distinct node.
+  ///
+  /// Return a distinct node -- i.e., a node that is not uniqued.
+  static MDTuple *getDistinct(LLVMContext &Context, ArrayRef<Metadata *> MDs) {
+    return getImpl(Context, MDs, Distinct);
+  }
 
-  MDNodeFwdDecl(LLVMContext &C, ArrayRef<Value *> Vals, bool isFunctionLocal)
-      : MDNode(C, MDNodeFwdDeclVal, Vals, isFunctionLocal) {}
-  ~MDNodeFwdDecl() {}
+  /// \brief Return a temporary node.
+  ///
+  /// For use in constructing cyclic MDNode structures. A temporary MDNode is
+  /// not uniqued, may be RAUW'd, and must be manually deleted with
+  /// deleteTemporary.
+  static TempMDTuple getTemporary(LLVMContext &Context,
+                                  ArrayRef<Metadata *> MDs) {
+    return TempMDTuple(getImpl(Context, MDs, Temporary));
+  }
 
-public:
-  static bool classof(const Value *V) {
-    return V->getValueID() == MDNodeFwdDeclVal;
+  /// \brief Return a (temporary) clone of this.
+  TempMDTuple clone() const { return cloneImpl(); }
+
+  static bool classof(const Metadata *MD) {
+    return MD->getMetadataID() == MDTupleKind;
   }
 };
 
+MDTuple *MDNode::get(LLVMContext &Context, ArrayRef<Metadata *> MDs) {
+  return MDTuple::get(Context, MDs);
+}
+MDTuple *MDNode::getIfExists(LLVMContext &Context, ArrayRef<Metadata *> MDs) {
+  return MDTuple::getIfExists(Context, MDs);
+}
+MDTuple *MDNode::getDistinct(LLVMContext &Context, ArrayRef<Metadata *> MDs) {
+  return MDTuple::getDistinct(Context, MDs);
+}
+TempMDTuple MDNode::getTemporary(LLVMContext &Context,
+                                 ArrayRef<Metadata *> MDs) {
+  return MDTuple::getTemporary(Context, MDs);
+}
+
+void TempMDNodeDeleter::operator()(MDNode *Node) const {
+  MDNode::deleteTemporary(Node);
+}
+
 //===----------------------------------------------------------------------===//
 /// \brief A tuple of MDNodes.
 ///
@@ -328,11 +1008,11 @@ class NamedMDNode : public ilist_node<NamedMDNode> {
   friend struct ilist_traits<NamedMDNode>;
   friend class LLVMContextImpl;
   friend class Module;
-  NamedMDNode(const NamedMDNode &) LLVM_DELETED_FUNCTION;
+  NamedMDNode(const NamedMDNode &) = delete;
 
   std::string Name;
   Module *Parent;
-  void *Operands; // SmallVector<TrackingVH<MDNode>, 4>
+  void *Operands; // SmallVector<TrackingMDRef, 4>
 
   void setParent(Module *M) { Parent = M; }
 
@@ -390,6 +1070,7 @@ public:
   MDNode *getOperand(unsigned i) const;
   unsigned getNumOperands() const;
   void addOperand(MDNode *M);
+  void setOperand(unsigned I, MDNode *New);
   StringRef getName() const;
   void print(raw_ostream &ROS) const;
   void dump() const;
diff --git a/include/llvm/IR/MetadataTracking.h b/include/llvm/IR/MetadataTracking.h
new file mode 100644
index 0000000..541d9b3
--- /dev/null
+++ b/include/llvm/IR/MetadataTracking.h
@@ -0,0 +1,99 @@
+//===- llvm/IR/MetadataTracking.h - Metadata tracking ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Low-level functions to enable tracking of metadata that could RAUW.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_METADATATRACKING_H
+#define LLVM_IR_METADATATRACKING_H
+
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/Support/Casting.h"
+#include <type_traits>
+
+namespace llvm {
+
+class Metadata;
+class MetadataAsValue;
+
+/// \brief API for tracking metadata references through RAUW and deletion.
+///
+/// Shared API for updating \a Metadata pointers in subclasses that support
+/// RAUW.
+///
+/// This API is not meant to be used directly.  See \a TrackingMDRef for a
+/// user-friendly tracking reference.
+class MetadataTracking {
+public:
+  /// \brief Track the reference to metadata.
+  ///
+  /// Register \c MD with \c *MD, if the subclass supports tracking.  If \c *MD
+  /// gets RAUW'ed, \c MD will be updated to the new address.  If \c *MD gets
+  /// deleted, \c MD will be set to \c nullptr.
+  ///
+  /// If tracking isn't supported, \c *MD will not change.
+  ///
+  /// \return true iff tracking is supported by \c MD.
+  static bool track(Metadata *&MD) {
+    return track(&MD, *MD, static_cast<Metadata *>(nullptr));
+  }
+
+  /// \brief Track the reference to metadata for \a Metadata.
+  ///
+  /// As \a track(Metadata*&), but with support for calling back to \c Owner to
+  /// tell it that its operand changed.  This could trigger \c Owner being
+  /// re-uniqued.
+  static bool track(void *Ref, Metadata &MD, Metadata &Owner) {
+    return track(Ref, MD, &Owner);
+  }
+
+  /// \brief Track the reference to metadata for \a MetadataAsValue.
+  ///
+  /// As \a track(Metadata*&), but with support for calling back to \c Owner to
+  /// tell it that its operand changed.  This could trigger \c Owner being
+  /// re-uniqued.
+  static bool track(void *Ref, Metadata &MD, MetadataAsValue &Owner) {
+    return track(Ref, MD, &Owner);
+  }
+
+  /// \brief Stop tracking a reference to metadata.
+  ///
+  /// Stops \c *MD from tracking \c MD.
+  static void untrack(Metadata *&MD) { untrack(&MD, *MD); }
+  static void untrack(void *Ref, Metadata &MD);
+
+  /// \brief Move tracking from one reference to another.
+  ///
+  /// Semantically equivalent to \c untrack(MD) followed by \c track(New),
+  /// except that ownership callbacks are maintained.
+  ///
+  /// Note: it is an error if \c *MD does not equal \c New.
+  ///
+  /// \return true iff tracking is supported by \c MD.
+  static bool retrack(Metadata *&MD, Metadata *&New) {
+    return retrack(&MD, *MD, &New);
+  }
+  static bool retrack(void *Ref, Metadata &MD, void *New);
+
+  /// \brief Check whether metadata is replaceable.
+  static bool isReplaceable(const Metadata &MD);
+
+  typedef PointerUnion<MetadataAsValue *, Metadata *> OwnerTy;
+
+private:
+  /// \brief Track a reference to metadata for an owner.
+  ///
+  /// Generalized version of tracking.
+  static bool track(void *Ref, Metadata &MD, OwnerTy Owner);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h
index 7fff80a..62f4194 100644
--- a/include/llvm/IR/Module.h
+++ b/include/llvm/IR/Module.h
@@ -33,8 +33,6 @@ class GVMaterializer;
 class LLVMContext;
 class RandomNumberGenerator;
 class StructType;
-template<typename T> struct DenseMapInfo;
-template<typename KeyT, typename ValueT, typename KeyInfoT> class DenseMap;
 
 template<> struct ilist_traits<Function>
   : public SymbolTableListTraits<Function, Module> {
@@ -150,7 +148,7 @@ public:
 
   /// The named metadata iterators.
   typedef NamedMDListType::iterator             named_metadata_iterator;
-  /// The named metadata constant interators.
+  /// The named metadata constant iterators.
   typedef NamedMDListType::const_iterator const_named_metadata_iterator;
 
   /// This enumeration defines the supported behaviors of module flags.
@@ -190,16 +188,16 @@ public:
     ModFlagBehaviorLastVal = AppendUnique
   };
 
-  /// Checks if Value represents a valid ModFlagBehavior, and stores the
+  /// Checks if Metadata represents a valid ModFlagBehavior, and stores the
   /// converted result in MFB.
-  static bool isValidModFlagBehavior(Value *V, ModFlagBehavior &MFB);
+  static bool isValidModFlagBehavior(Metadata *MD, ModFlagBehavior &MFB);
 
   struct ModuleFlagEntry {
     ModFlagBehavior Behavior;
     MDString *Key;
-    Value *Val;
-    ModuleFlagEntry(ModFlagBehavior B, MDString *K, Value *V)
-      : Behavior(B), Key(K), Val(V) {}
+    Metadata *Val;
+    ModuleFlagEntry(ModFlagBehavior B, MDString *K, Metadata *V)
+        : Behavior(B), Key(K), Val(V) {}
   };
 
 /// @}
@@ -219,9 +217,8 @@ private:
   Materializer;                   ///< Used to materialize GlobalValues
   std::string ModuleID;           ///< Human readable identifier for the module
   std::string TargetTriple;       ///< Platform target triple Module compiled on
+                                  ///< Format: (arch)(sub)-(vendor)-(sys0-(abi)
   void *NamedMDSymTab;            ///< NamedMDNode names.
-  // Allow lazy initialization in const method.
-  mutable RandomNumberGenerator *RNG; ///< The random number generator for this module.
 
   // We need to keep the string because the C API expects us to own the string
   // representation.
@@ -251,6 +248,12 @@ public:
   /// @returns the module identifier as a string
   const std::string &getModuleIdentifier() const { return ModuleID; }
 
+  /// \brief Get a short "name" for the module.
+  ///
+  /// This is useful for debugging or logging. It is essentially a convenience
+  /// wrapper around getModuleIdentifier().
+  StringRef getName() const { return ModuleID; }
+
   /// Get the data layout string for the module's target platform. This is
   /// equivalent to getDataLayout()->getStringRepresentation().
   const std::string &getDataLayoutStr() const { return DataLayoutStr; }
@@ -270,10 +273,16 @@ public:
   /// @returns a string containing the module-scope inline assembly blocks.
   const std::string &getModuleInlineAsm() const { return GlobalScopeAsm; }
 
-  /// Get the RandomNumberGenerator for this module. The RNG can be
-  /// seeded via -rng-seed=<uint64> and is salted with the ModuleID.
-  /// The returned RNG should not be shared across threads.
-  RandomNumberGenerator &getRNG() const;
+  /// Get a RandomNumberGenerator salted for use with this module. The
+  /// RNG can be seeded via -rng-seed=<uint64> and is salted with the
+  /// ModuleID and the provided pass salt. The returned RNG should not
+  /// be shared across threads or passes.
+  ///
+  /// A unique RNG per pass ensures a reproducible random stream even
+  /// when other randomness consuming passes are added or removed. In
+  /// addition, the random stream will be reproducible across LLVM
+  /// versions when the pass does not change.
+  RandomNumberGenerator *createRNG(const Pass* P) const;
 
 /// @}
 /// @name Module Level Mutators
@@ -327,6 +336,8 @@ public:
   /// name.
   StructType *getTypeByName(StringRef Name) const;
 
+  std::vector<StructType *> getIdentifiedStructTypes() const;
+
 /// @}
 /// @name Function Accessors
 /// @{
@@ -441,7 +452,7 @@ public:
 
   /// Return the corresponding value if Key appears in module flags, otherwise
   /// return null.
-  Value *getModuleFlag(StringRef Key) const;
+  Metadata *getModuleFlag(StringRef Key) const;
 
   /// Returns the NamedMDNode in the module that represents module-level flags.
   /// This method returns null if there are no module-level flags.
@@ -454,7 +465,8 @@ public:
 
   /// Add a module-level flag to the module-level flags metadata. It will create
   /// the module-level flags named metadata if it doesn't already exist.
-  void addModuleFlag(ModFlagBehavior Behavior, StringRef Key, Value *Val);
+  void addModuleFlag(ModFlagBehavior Behavior, StringRef Key, Metadata *Val);
+  void addModuleFlag(ModFlagBehavior Behavior, StringRef Key, Constant *Val);
   void addModuleFlag(ModFlagBehavior Behavior, StringRef Key, uint32_t Val);
   void addModuleFlag(MDNode *Node);
 
@@ -483,7 +495,7 @@ public:
   std::error_code materialize(GlobalValue *GV);
   /// If the GlobalValue is read in, and if the GVMaterializer supports it,
   /// release the memory for the function, and set it up to be materialized
-  /// lazily. If !isDematerializable(), this method is a noop.
+  /// lazily. If !isDematerializable(), this method is a no-op.
   void Dematerialize(GlobalValue *GV);
 
   /// Make sure all GlobalValues in this Module are fully read.
@@ -618,6 +630,15 @@ public:
                                                          named_metadata_end());
   }
 
+  /// Destroy ConstantArrays in LLVMContext if they are not used.
+  /// ConstantArrays constructed during linking can cause quadratic memory
+  /// explosion. Releasing all unused constants can cause a 20% LTO compile-time
+  /// slowdown for a large application.
+  ///
+  /// NOTE: Constants are currently owned by LLVMContext. This can then only
+  /// be called where all uses of the LLVMContext are understood.
+  void dropTriviallyDeadConstantArrays();
+
 /// @}
 /// @name Utility functions for printing and dumping Module objects
 /// @{
diff --git a/include/llvm/IR/Operator.h b/include/llvm/IR/Operator.h
index 0933f21..46935ce 100644
--- a/include/llvm/IR/Operator.h
+++ b/include/llvm/IR/Operator.h
@@ -34,12 +34,12 @@ class Operator : public User {
 private:
   // The Operator class is intended to be used as a utility, and is never itself
   // instantiated.
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
-  void *operator new(size_t s) LLVM_DELETED_FUNCTION;
-  Operator() LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
+  void *operator new(size_t s) = delete;
+  Operator() = delete;
 
 protected:
-  // NOTE: Cannot use LLVM_DELETED_FUNCTION because it's not legal to delete
+  // NOTE: Cannot use = delete because it's not legal to delete
   // an overridden method that's not deleted in the base class. Cannot leave
   // this unimplemented because that leads to an ODR-violation.
   ~Operator();
@@ -504,6 +504,20 @@ public:
   }
 };
 
+class BitCastOperator
+    : public ConcreteOperator<Operator, Instruction::BitCast> {
+  friend class BitCastInst;
+  friend class ConstantExpr;
+
+public:
+  Type *getSrcTy() const {
+    return getOperand(0)->getType();
+  }
+
+  Type *getDestTy() const {
+    return getType();
+  }
+};
 
 } // End llvm namespace
 
diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h
index 45985e1..c92e7c9 100644
--- a/include/llvm/IR/PassManager.h
+++ b/include/llvm/IR/PassManager.h
@@ -43,6 +43,9 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PassManagerInternal.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/type_traits.h"
 #include <list>
 #include <memory>
@@ -91,9 +94,12 @@ public:
   }
 
   /// \brief Mark a particular pass as preserved, adding it to the set.
-  template <typename PassT> void preserve() {
+  template <typename PassT> void preserve() { preserve(PassT::ID()); }
+
+  /// \brief Mark an abstract PassID as preserved, adding it to the set.
+  void preserve(void *PassID) {
     if (!areAllPreserved())
-      PreservedPassIDs.insert(PassT::ID());
+      PreservedPassIDs.insert(PassID);
   }
 
   /// \brief Intersect this set with another in place.
@@ -140,408 +146,115 @@ public:
            PreservedPassIDs.count(PassID);
   }
 
+  /// \brief Test whether all passes are preserved.
+  ///
+  /// This is used primarily to optimize for the case of no changes which will
+  /// common in many scenarios.
+  bool areAllPreserved() const {
+    return PreservedPassIDs.count((void *)AllPassesID);
+  }
+
 private:
   // Note that this must not be -1 or -2 as those are already used by the
   // SmallPtrSet.
   static const uintptr_t AllPassesID = (intptr_t)(-3);
 
-  bool areAllPreserved() const {
-    return PreservedPassIDs.count((void *)AllPassesID);
-  }
-
   SmallPtrSet<void *, 2> PreservedPassIDs;
 };
 
-/// \brief Implementation details of the pass manager interfaces.
-namespace detail {
-
-/// \brief Template for the abstract base class used to dispatch
-/// polymorphically over pass objects.
-template <typename IRUnitT, typename AnalysisManagerT> struct PassConcept {
-  // Boiler plate necessary for the container of derived classes.
-  virtual ~PassConcept() {}
-
-  /// \brief The polymorphic API which runs the pass over a given IR entity.
-  ///
-  /// Note that actual pass object can omit the analysis manager argument if
-  /// desired. Also that the analysis manager may be null if there is no
-  /// analysis manager in the pass pipeline.
-  virtual PreservedAnalyses run(IRUnitT IR, AnalysisManagerT *AM) = 0;
-
-  /// \brief Polymorphic method to access the name of a pass.
-  virtual StringRef name() = 0;
-};
-
-/// \brief SFINAE metafunction for computing whether \c PassT has a run method
-/// accepting an \c AnalysisManagerT.
-template <typename IRUnitT, typename AnalysisManagerT, typename PassT,
-          typename ResultT>
-class PassRunAcceptsAnalysisManager {
-  typedef char SmallType;
-  struct BigType {
-    char a, b;
-  };
-
-  template <typename T, ResultT (T::*)(IRUnitT, AnalysisManagerT *)>
-  struct Checker;
-
-  template <typename T> static SmallType f(Checker<T, &T::run> *);
-  template <typename T> static BigType f(...);
+// Forward declare the analysis manager template.
+template <typename IRUnitT> class AnalysisManager;
 
-public:
-  enum { Value = sizeof(f<PassT>(nullptr)) == sizeof(SmallType) };
-};
-
-/// \brief A template wrapper used to implement the polymorphic API.
+/// \brief Manages a sequence of passes over units of IR.
 ///
-/// Can be instantiated for any object which provides a \c run method accepting
-/// an \c IRUnitT. It requires the pass to be a copyable object. When the
-/// \c run method also accepts an \c AnalysisManagerT*, we pass it along.
-template <typename IRUnitT, typename AnalysisManagerT, typename PassT,
-          bool AcceptsAnalysisManager = PassRunAcceptsAnalysisManager<
-              IRUnitT, AnalysisManagerT, PassT, PreservedAnalyses>::Value>
-struct PassModel;
-
-/// \brief Specialization of \c PassModel for passes that accept an analyis
-/// manager.
-template <typename IRUnitT, typename AnalysisManagerT, typename PassT>
-struct PassModel<IRUnitT, AnalysisManagerT, PassT, true>
-    : PassConcept<IRUnitT, AnalysisManagerT> {
-  explicit PassModel(PassT Pass) : Pass(std::move(Pass)) {}
-  // We have to explicitly define all the special member functions because MSVC
-  // refuses to generate them.
-  PassModel(const PassModel &Arg) : Pass(Arg.Pass) {}
-  PassModel(PassModel &&Arg) : Pass(std::move(Arg.Pass)) {}
-  friend void swap(PassModel &LHS, PassModel &RHS) {
-    using std::swap;
-    swap(LHS.Pass, RHS.Pass);
-  }
-  PassModel &operator=(PassModel RHS) {
-    swap(*this, RHS);
-    return *this;
-  }
-
-  PreservedAnalyses run(IRUnitT IR, AnalysisManagerT *AM) override {
-    return Pass.run(IR, AM);
-  }
-  StringRef name() override { return PassT::name(); }
-  PassT Pass;
-};
-
-/// \brief Specialization of \c PassModel for passes that accept an analyis
-/// manager.
-template <typename IRUnitT, typename AnalysisManagerT, typename PassT>
-struct PassModel<IRUnitT, AnalysisManagerT, PassT, false>
-    : PassConcept<IRUnitT, AnalysisManagerT> {
-  explicit PassModel(PassT Pass) : Pass(std::move(Pass)) {}
-  // We have to explicitly define all the special member functions because MSVC
-  // refuses to generate them.
-  PassModel(const PassModel &Arg) : Pass(Arg.Pass) {}
-  PassModel(PassModel &&Arg) : Pass(std::move(Arg.Pass)) {}
-  friend void swap(PassModel &LHS, PassModel &RHS) {
-    using std::swap;
-    swap(LHS.Pass, RHS.Pass);
-  }
-  PassModel &operator=(PassModel RHS) {
-    swap(*this, RHS);
-    return *this;
-  }
-
-  PreservedAnalyses run(IRUnitT IR, AnalysisManagerT *AM) override {
-    return Pass.run(IR);
-  }
-  StringRef name() override { return PassT::name(); }
-  PassT Pass;
-};
-
-/// \brief Abstract concept of an analysis result.
+/// A pass manager contains a sequence of passes to run over units of IR. It is
+/// itself a valid pass over that unit of IR, and when over some given IR will
+/// run each pass in sequence. This is the primary and most basic building
+/// block of a pass pipeline.
 ///
-/// This concept is parameterized over the IR unit that this result pertains
-/// to.
-template <typename IRUnitT> struct AnalysisResultConcept {
-  virtual ~AnalysisResultConcept() {}
-
-  /// \brief Method to try and mark a result as invalid.
-  ///
-  /// When the outer analysis manager detects a change in some underlying
-  /// unit of the IR, it will call this method on all of the results cached.
-  ///
-  /// This method also receives a set of preserved analyses which can be used
-  /// to avoid invalidation because the pass which changed the underlying IR
-  /// took care to update or preserve the analysis result in some way.
-  ///
-  /// \returns true if the result is indeed invalid (the default).
-  virtual bool invalidate(IRUnitT IR, const PreservedAnalyses &PA) = 0;
-};
-
-/// \brief SFINAE metafunction for computing whether \c ResultT provides an
-/// \c invalidate member function.
-template <typename IRUnitT, typename ResultT> class ResultHasInvalidateMethod {
-  typedef char SmallType;
-  struct BigType {
-    char a, b;
-  };
-
-  template <typename T, bool (T::*)(IRUnitT, const PreservedAnalyses &)>
-  struct Checker;
-
-  template <typename T> static SmallType f(Checker<T, &T::invalidate> *);
-  template <typename T> static BigType f(...);
-
+/// If it is run with an \c AnalysisManager<IRUnitT> argument, it will propagate
+/// that analysis manager to each pass it runs, as well as calling the analysis
+/// manager's invalidation routine with the PreservedAnalyses of each pass it
+/// runs.
+template <typename IRUnitT> class PassManager {
 public:
-  enum { Value = sizeof(f<ResultT>(nullptr)) == sizeof(SmallType) };
-};
-
-/// \brief Wrapper to model the analysis result concept.
-///
-/// By default, this will implement the invalidate method with a trivial
-/// implementation so that the actual analysis result doesn't need to provide
-/// an invalidation handler. It is only selected when the invalidation handler
-/// is not part of the ResultT's interface.
-template <typename IRUnitT, typename PassT, typename ResultT,
-          bool HasInvalidateHandler =
-              ResultHasInvalidateMethod<IRUnitT, ResultT>::Value>
-struct AnalysisResultModel;
-
-/// \brief Specialization of \c AnalysisResultModel which provides the default
-/// invalidate functionality.
-template <typename IRUnitT, typename PassT, typename ResultT>
-struct AnalysisResultModel<IRUnitT, PassT, ResultT, false>
-    : AnalysisResultConcept<IRUnitT> {
-  explicit AnalysisResultModel(ResultT Result) : Result(std::move(Result)) {}
-  // We have to explicitly define all the special member functions because MSVC
-  // refuses to generate them.
-  AnalysisResultModel(const AnalysisResultModel &Arg) : Result(Arg.Result) {}
-  AnalysisResultModel(AnalysisResultModel &&Arg)
-      : Result(std::move(Arg.Result)) {}
-  friend void swap(AnalysisResultModel &LHS, AnalysisResultModel &RHS) {
-    using std::swap;
-    swap(LHS.Result, RHS.Result);
-  }
-  AnalysisResultModel &operator=(AnalysisResultModel RHS) {
-    swap(*this, RHS);
-    return *this;
-  }
-
-  /// \brief The model bases invalidation solely on being in the preserved set.
-  //
-  // FIXME: We should actually use two different concepts for analysis results
-  // rather than two different models, and avoid the indirect function call for
-  // ones that use the trivial behavior.
-  bool invalidate(IRUnitT, const PreservedAnalyses &PA) override {
-    return !PA.preserved(PassT::ID());
-  }
-
-  ResultT Result;
-};
-
-/// \brief Specialization of \c AnalysisResultModel which delegates invalidate
-/// handling to \c ResultT.
-template <typename IRUnitT, typename PassT, typename ResultT>
-struct AnalysisResultModel<IRUnitT, PassT, ResultT, true>
-    : AnalysisResultConcept<IRUnitT> {
-  explicit AnalysisResultModel(ResultT Result) : Result(std::move(Result)) {}
+  /// \brief Construct a pass manager.
+  ///
+  /// It can be passed a flag to get debug logging as the passes are run.
+  PassManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {}
   // We have to explicitly define all the special member functions because MSVC
   // refuses to generate them.
-  AnalysisResultModel(const AnalysisResultModel &Arg) : Result(Arg.Result) {}
-  AnalysisResultModel(AnalysisResultModel &&Arg)
-      : Result(std::move(Arg.Result)) {}
-  friend void swap(AnalysisResultModel &LHS, AnalysisResultModel &RHS) {
-    using std::swap;
-    swap(LHS.Result, RHS.Result);
-  }
-  AnalysisResultModel &operator=(AnalysisResultModel RHS) {
-    swap(*this, RHS);
+  PassManager(PassManager &&Arg)
+      : Passes(std::move(Arg.Passes)),
+        DebugLogging(std::move(Arg.DebugLogging)) {}
+  PassManager &operator=(PassManager &&RHS) {
+    Passes = std::move(RHS.Passes);
+    DebugLogging = std::move(RHS.DebugLogging);
     return *this;
   }
 
-  /// \brief The model delegates to the \c ResultT method.
-  bool invalidate(IRUnitT IR, const PreservedAnalyses &PA) override {
-    return Result.invalidate(IR, PA);
-  }
+  /// \brief Run all of the passes in this manager over the IR.
+  PreservedAnalyses run(IRUnitT &IR, AnalysisManager<IRUnitT> *AM = nullptr) {
+    PreservedAnalyses PA = PreservedAnalyses::all();
 
-  ResultT Result;
-};
+    if (DebugLogging)
+      dbgs() << "Starting pass manager run.\n";
 
-/// \brief Abstract concept of an analysis pass.
-///
-/// This concept is parameterized over the IR unit that it can run over and
-/// produce an analysis result.
-template <typename IRUnitT, typename AnalysisManagerT>
-struct AnalysisPassConcept {
-  virtual ~AnalysisPassConcept() {}
-
-  /// \brief Method to run this analysis over a unit of IR.
-  /// \returns A unique_ptr to the analysis result object to be queried by
-  /// users.
-  virtual std::unique_ptr<AnalysisResultConcept<IRUnitT>>
-  run(IRUnitT IR, AnalysisManagerT *AM) = 0;
-};
+    for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx) {
+      if (DebugLogging)
+        dbgs() << "Running pass: " << Passes[Idx]->name() << "\n";
 
-/// \brief Wrapper to model the analysis pass concept.
-///
-/// Can wrap any type which implements a suitable \c run method. The method
-/// must accept the IRUnitT as an argument and produce an object which can be
-/// wrapped in a \c AnalysisResultModel.
-template <typename IRUnitT, typename AnalysisManagerT, typename PassT,
-          bool AcceptsAnalysisManager = PassRunAcceptsAnalysisManager<
-              IRUnitT, AnalysisManagerT, PassT, typename PassT::Result>::Value>
-struct AnalysisPassModel;
-
-/// \brief Specialization of \c AnalysisPassModel which passes an
-/// \c AnalysisManager to PassT's run method.
-template <typename IRUnitT, typename AnalysisManagerT, typename PassT>
-struct AnalysisPassModel<IRUnitT, AnalysisManagerT, PassT, true>
-    : AnalysisPassConcept<IRUnitT, AnalysisManagerT> {
-  explicit AnalysisPassModel(PassT Pass) : Pass(std::move(Pass)) {}
-  // We have to explicitly define all the special member functions because MSVC
-  // refuses to generate them.
-  AnalysisPassModel(const AnalysisPassModel &Arg) : Pass(Arg.Pass) {}
-  AnalysisPassModel(AnalysisPassModel &&Arg) : Pass(std::move(Arg.Pass)) {}
-  friend void swap(AnalysisPassModel &LHS, AnalysisPassModel &RHS) {
-    using std::swap;
-    swap(LHS.Pass, RHS.Pass);
-  }
-  AnalysisPassModel &operator=(AnalysisPassModel RHS) {
-    swap(*this, RHS);
-    return *this;
-  }
-
-  // FIXME: Replace PassT::Result with type traits when we use C++11.
-  typedef AnalysisResultModel<IRUnitT, PassT, typename PassT::Result>
-      ResultModelT;
-
-  /// \brief The model delegates to the \c PassT::run method.
-  ///
-  /// The return is wrapped in an \c AnalysisResultModel.
-  std::unique_ptr<AnalysisResultConcept<IRUnitT>>
-  run(IRUnitT IR, AnalysisManagerT *AM) override {
-    return make_unique<ResultModelT>(Pass.run(IR, AM));
-  }
+      PreservedAnalyses PassPA = Passes[Idx]->run(IR, AM);
 
-  PassT Pass;
-};
+      // If we have an active analysis manager at this level we want to ensure
+      // we update it as each pass runs and potentially invalidates analyses.
+      // We also update the preserved set of analyses based on what analyses we
+      // have already handled the invalidation for here and don't need to
+      // invalidate when finished.
+      if (AM)
+        PassPA = AM->invalidate(IR, std::move(PassPA));
 
-/// \brief Specialization of \c AnalysisPassModel which does not pass an
-/// \c AnalysisManager to PassT's run method.
-template <typename IRUnitT, typename AnalysisManagerT, typename PassT>
-struct AnalysisPassModel<IRUnitT, AnalysisManagerT, PassT, false>
-    : AnalysisPassConcept<IRUnitT, AnalysisManagerT> {
-  explicit AnalysisPassModel(PassT Pass) : Pass(std::move(Pass)) {}
-  // We have to explicitly define all the special member functions because MSVC
-  // refuses to generate them.
-  AnalysisPassModel(const AnalysisPassModel &Arg) : Pass(Arg.Pass) {}
-  AnalysisPassModel(AnalysisPassModel &&Arg) : Pass(std::move(Arg.Pass)) {}
-  friend void swap(AnalysisPassModel &LHS, AnalysisPassModel &RHS) {
-    using std::swap;
-    swap(LHS.Pass, RHS.Pass);
-  }
-  AnalysisPassModel &operator=(AnalysisPassModel RHS) {
-    swap(*this, RHS);
-    return *this;
-  }
-
-  // FIXME: Replace PassT::Result with type traits when we use C++11.
-  typedef AnalysisResultModel<IRUnitT, PassT, typename PassT::Result>
-      ResultModelT;
-
-  /// \brief The model delegates to the \c PassT::run method.
-  ///
-  /// The return is wrapped in an \c AnalysisResultModel.
-  std::unique_ptr<AnalysisResultConcept<IRUnitT>>
-  run(IRUnitT IR, AnalysisManagerT *) override {
-    return make_unique<ResultModelT>(Pass.run(IR));
-  }
-
-  PassT Pass;
-};
+      // Finally, we intersect the final preserved analyses to compute the
+      // aggregate preserved set for this pass manager.
+      PA.intersect(std::move(PassPA));
 
-} // End namespace detail
+      // FIXME: Historically, the pass managers all called the LLVM context's
+      // yield function here. We don't have a generic way to acquire the
+      // context and it isn't yet clear what the right pattern is for yielding
+      // in the new pass manager so it is currently omitted.
+      //IR.getContext().yield();
+    }
 
-class ModuleAnalysisManager;
+    if (DebugLogging)
+      dbgs() << "Finished pass manager run.\n";
 
-class ModulePassManager {
-public:
-  // We have to explicitly define all the special member functions because MSVC
-  // refuses to generate them.
-  ModulePassManager() {}
-  ModulePassManager(ModulePassManager &&Arg) : Passes(std::move(Arg.Passes)) {}
-  ModulePassManager &operator=(ModulePassManager &&RHS) {
-    Passes = std::move(RHS.Passes);
-    return *this;
+    return PA;
   }
 
-  /// \brief Run all of the module passes in this module pass manager over
-  /// a module.
-  ///
-  /// This method should only be called for a single module as there is the
-  /// expectation that the lifetime of a pass is bounded to that of a module.
-  PreservedAnalyses run(Module *M, ModuleAnalysisManager *AM = nullptr);
-
-  template <typename ModulePassT> void addPass(ModulePassT Pass) {
-    Passes.emplace_back(new ModulePassModel<ModulePassT>(std::move(Pass)));
+  template <typename PassT> void addPass(PassT Pass) {
+    typedef detail::PassModel<IRUnitT, PassT> PassModelT;
+    Passes.emplace_back(new PassModelT(std::move(Pass)));
   }
 
-  static StringRef name() { return "ModulePassManager"; }
+  static StringRef name() { return "PassManager"; }
 
 private:
-  // Pull in the concept type and model template specialized for modules.
-  typedef detail::PassConcept<Module *, ModuleAnalysisManager>
-  ModulePassConcept;
-  template <typename PassT>
-  struct ModulePassModel
-      : detail::PassModel<Module *, ModuleAnalysisManager, PassT> {
-    ModulePassModel(PassT Pass)
-        : detail::PassModel<Module *, ModuleAnalysisManager, PassT>(
-              std::move(Pass)) {}
-  };
-
-  ModulePassManager(const ModulePassManager &) LLVM_DELETED_FUNCTION;
-  ModulePassManager &operator=(const ModulePassManager &) LLVM_DELETED_FUNCTION;
+  typedef detail::PassConcept<IRUnitT> PassConceptT;
 
-  std::vector<std::unique_ptr<ModulePassConcept>> Passes;
-};
+  PassManager(const PassManager &) = delete;
+  PassManager &operator=(const PassManager &) = delete;
 
-class FunctionAnalysisManager;
+  std::vector<std::unique_ptr<PassConceptT>> Passes;
 
-class FunctionPassManager {
-public:
-  // We have to explicitly define all the special member functions because MSVC
-  // refuses to generate them.
-  FunctionPassManager() {}
-  FunctionPassManager(FunctionPassManager &&Arg)
-      : Passes(std::move(Arg.Passes)) {}
-  FunctionPassManager &operator=(FunctionPassManager &&RHS) {
-    Passes = std::move(RHS.Passes);
-    return *this;
-  }
-
-  template <typename FunctionPassT> void addPass(FunctionPassT Pass) {
-    Passes.emplace_back(new FunctionPassModel<FunctionPassT>(std::move(Pass)));
-  }
-
-  PreservedAnalyses run(Function *F, FunctionAnalysisManager *AM = nullptr);
-
-  static StringRef name() { return "FunctionPassManager"; }
+  /// \brief Flag indicating whether we should do debug logging.
+  bool DebugLogging;
+};
 
-private:
-  // Pull in the concept type and model template specialized for functions.
-  typedef detail::PassConcept<Function *, FunctionAnalysisManager>
-  FunctionPassConcept;
-  template <typename PassT>
-  struct FunctionPassModel
-      : detail::PassModel<Function *, FunctionAnalysisManager, PassT> {
-    FunctionPassModel(PassT Pass)
-        : detail::PassModel<Function *, FunctionAnalysisManager, PassT>(
-              std::move(Pass)) {}
-  };
+/// \brief Convenience typedef for a pass manager over modules.
+typedef PassManager<Module> ModulePassManager;
 
-  FunctionPassManager(const FunctionPassManager &) LLVM_DELETED_FUNCTION;
-  FunctionPassManager &
-  operator=(const FunctionPassManager &) LLVM_DELETED_FUNCTION;
-
-  std::vector<std::unique_ptr<FunctionPassConcept>> Passes;
-};
+/// \brief Convenience typedef for a pass manager over functions.
+typedef PassManager<Function> FunctionPassManager;
 
 namespace detail {
 
@@ -556,19 +269,25 @@ namespace detail {
 /// - invalidateImpl
 ///
 /// The details of the call pattern are within.
+///
+/// Note that there is also a generic analysis manager template which implements
+/// the above required functions along with common datastructures used for
+/// managing analyses. This base class is factored so that if you need to
+/// customize the handling of a specific IR unit, you can do so without
+/// replicating *all* of the boilerplate.
 template <typename DerivedT, typename IRUnitT> class AnalysisManagerBase {
   DerivedT *derived_this() { return static_cast<DerivedT *>(this); }
   const DerivedT *derived_this() const {
     return static_cast<const DerivedT *>(this);
   }
 
-  AnalysisManagerBase(const AnalysisManagerBase &) LLVM_DELETED_FUNCTION;
+  AnalysisManagerBase(const AnalysisManagerBase &) = delete;
   AnalysisManagerBase &
-  operator=(const AnalysisManagerBase &) LLVM_DELETED_FUNCTION;
+  operator=(const AnalysisManagerBase &) = delete;
 
 protected:
   typedef detail::AnalysisResultConcept<IRUnitT> ResultConceptT;
-  typedef detail::AnalysisPassConcept<IRUnitT, DerivedT> PassConceptT;
+  typedef detail::AnalysisPassConcept<IRUnitT> PassConceptT;
 
   // FIXME: Provide template aliases for the models when we're using C++11 in
   // a mode supporting them.
@@ -588,7 +307,7 @@ public:
   ///
   /// If there is not a valid cached result in the manager already, this will
   /// re-run the analysis to produce a valid result.
-  template <typename PassT> typename PassT::Result &getResult(IRUnitT IR) {
+  template <typename PassT> typename PassT::Result &getResult(IRUnitT &IR) {
     assert(AnalysisPasses.count(PassT::ID()) &&
            "This analysis pass was not registered prior to being queried");
 
@@ -605,7 +324,7 @@ public:
   ///
   /// \returns null if there is no cached result.
   template <typename PassT>
-  typename PassT::Result *getCachedResult(IRUnitT IR) const {
+  typename PassT::Result *getCachedResult(IRUnitT &IR) const {
     assert(AnalysisPasses.count(PassT::ID()) &&
            "This analysis pass was not registered prior to being queried");
 
@@ -627,25 +346,28 @@ public:
   template <typename PassT> void registerPass(PassT Pass) {
     assert(!AnalysisPasses.count(PassT::ID()) &&
            "Registered the same analysis pass twice!");
-    typedef detail::AnalysisPassModel<IRUnitT, DerivedT, PassT> PassModelT;
+    typedef detail::AnalysisPassModel<IRUnitT, PassT> PassModelT;
     AnalysisPasses[PassT::ID()].reset(new PassModelT(std::move(Pass)));
   }
 
   /// \brief Invalidate a specific analysis pass for an IR module.
   ///
   /// Note that the analysis result can disregard invalidation.
-  template <typename PassT> void invalidate(Module *M) {
+  template <typename PassT> void invalidate(IRUnitT &IR) {
     assert(AnalysisPasses.count(PassT::ID()) &&
            "This analysis pass was not registered prior to being invalidated");
-    derived_this()->invalidateImpl(PassT::ID(), M);
+    derived_this()->invalidateImpl(PassT::ID(), IR);
   }
 
   /// \brief Invalidate analyses cached for an IR unit.
   ///
   /// Walk through all of the analyses pertaining to this unit of IR and
   /// invalidate them unless they are preserved by the PreservedAnalyses set.
-  void invalidate(IRUnitT IR, const PreservedAnalyses &PA) {
-    derived_this()->invalidateImpl(IR, PA);
+  /// We accept the PreservedAnalyses set by value and update it with each
+  /// analyis pass which has been successfully invalidated and thus can be
+  /// preserved going forward. The updated set is returned.
+  PreservedAnalyses invalidate(IRUnitT &IR, PreservedAnalyses PA) {
+    return derived_this()->invalidateImpl(IR, std::move(PA));
   }
 
 protected:
@@ -675,108 +397,153 @@ private:
 
 } // End namespace detail
 
-/// \brief A module analysis pass manager with lazy running and caching of
+/// \brief A generic analysis pass manager with lazy running and caching of
 /// results.
-class ModuleAnalysisManager
-    : public detail::AnalysisManagerBase<ModuleAnalysisManager, Module *> {
-  friend class detail::AnalysisManagerBase<ModuleAnalysisManager, Module *>;
-  typedef detail::AnalysisManagerBase<ModuleAnalysisManager, Module *> BaseT;
-  typedef BaseT::ResultConceptT ResultConceptT;
-  typedef BaseT::PassConceptT PassConceptT;
-
-public:
-  // We have to explicitly define all the special member functions because MSVC
-  // refuses to generate them.
-  ModuleAnalysisManager() {}
-  ModuleAnalysisManager(ModuleAnalysisManager &&Arg)
-      : BaseT(std::move(static_cast<BaseT &>(Arg))),
-        ModuleAnalysisResults(std::move(Arg.ModuleAnalysisResults)) {}
-  ModuleAnalysisManager &operator=(ModuleAnalysisManager &&RHS) {
-    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
-    ModuleAnalysisResults = std::move(RHS.ModuleAnalysisResults);
-    return *this;
-  }
-
-private:
-  ModuleAnalysisManager(const ModuleAnalysisManager &) LLVM_DELETED_FUNCTION;
-  ModuleAnalysisManager &
-  operator=(const ModuleAnalysisManager &) LLVM_DELETED_FUNCTION;
-
-  /// \brief Get a module pass result, running the pass if necessary.
-  ResultConceptT &getResultImpl(void *PassID, Module *M);
-
-  /// \brief Get a cached module pass result or return null.
-  ResultConceptT *getCachedResultImpl(void *PassID, Module *M) const;
-
-  /// \brief Invalidate a module pass result.
-  void invalidateImpl(void *PassID, Module *M);
-
-  /// \brief Invalidate results across a module.
-  void invalidateImpl(Module *M, const PreservedAnalyses &PA);
-
-  /// \brief Map type from module analysis pass ID to pass result concept
-  /// pointer.
-  typedef DenseMap<void *,
-                   std::unique_ptr<detail::AnalysisResultConcept<Module *>>>
-      ModuleAnalysisResultMapT;
-
-  /// \brief Cache of computed module analysis results for this module.
-  ModuleAnalysisResultMapT ModuleAnalysisResults;
-};
-
-/// \brief A function analysis manager to coordinate and cache analyses run over
-/// a module.
-class FunctionAnalysisManager
-    : public detail::AnalysisManagerBase<FunctionAnalysisManager, Function *> {
-  friend class detail::AnalysisManagerBase<FunctionAnalysisManager, Function *>;
-  typedef detail::AnalysisManagerBase<FunctionAnalysisManager, Function *>
-      BaseT;
-  typedef BaseT::ResultConceptT ResultConceptT;
-  typedef BaseT::PassConceptT PassConceptT;
+///
+/// This analysis manager can be used for any IR unit where the address of the
+/// IR unit sufficies as its identity. It manages the cache for a unit of IR via
+/// the address of each unit of IR cached.
+template <typename IRUnitT>
+class AnalysisManager
+    : public detail::AnalysisManagerBase<AnalysisManager<IRUnitT>, IRUnitT> {
+  friend class detail::AnalysisManagerBase<AnalysisManager<IRUnitT>, IRUnitT>;
+  typedef detail::AnalysisManagerBase<AnalysisManager<IRUnitT>, IRUnitT> BaseT;
+  typedef typename BaseT::ResultConceptT ResultConceptT;
+  typedef typename BaseT::PassConceptT PassConceptT;
 
 public:
   // Most public APIs are inherited from the CRTP base class.
 
+  /// \brief Construct an empty analysis manager.
+  ///
+  /// A flag can be passed to indicate that the manager should perform debug
+  /// logging.
+  AnalysisManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {}
+
   // We have to explicitly define all the special member functions because MSVC
   // refuses to generate them.
-  FunctionAnalysisManager() {}
-  FunctionAnalysisManager(FunctionAnalysisManager &&Arg)
+  AnalysisManager(AnalysisManager &&Arg)
       : BaseT(std::move(static_cast<BaseT &>(Arg))),
-        FunctionAnalysisResults(std::move(Arg.FunctionAnalysisResults)) {}
-  FunctionAnalysisManager &operator=(FunctionAnalysisManager &&RHS) {
+        AnalysisResults(std::move(Arg.AnalysisResults)),
+        DebugLogging(std::move(Arg.DebugLogging)) {}
+  AnalysisManager &operator=(AnalysisManager &&RHS) {
     BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
-    FunctionAnalysisResults = std::move(RHS.FunctionAnalysisResults);
+    AnalysisResults = std::move(RHS.AnalysisResults);
+    DebugLogging = std::move(RHS.DebugLogging);
     return *this;
   }
 
   /// \brief Returns true if the analysis manager has an empty results cache.
-  bool empty() const;
+  bool empty() const {
+    assert(AnalysisResults.empty() == AnalysisResultLists.empty() &&
+           "The storage and index of analysis results disagree on how many "
+           "there are!");
+    return AnalysisResults.empty();
+  }
 
-  /// \brief Clear the function analysis result cache.
+  /// \brief Clear the analysis result cache.
   ///
-  /// This routine allows cleaning up when the set of functions itself has
+  /// This routine allows cleaning up when the set of IR units itself has
   /// potentially changed, and thus we can't even look up a a result and
-  /// invalidate it directly. Notably, this does *not* call invalidate
-  /// functions as there is nothing to be done for them.
-  void clear();
+  /// invalidate it directly. Notably, this does *not* call invalidate functions
+  /// as there is nothing to be done for them.
+  void clear() {
+    AnalysisResults.clear();
+    AnalysisResultLists.clear();
+  }
 
 private:
-  FunctionAnalysisManager(const FunctionAnalysisManager &)
-      LLVM_DELETED_FUNCTION;
-  FunctionAnalysisManager &
-  operator=(const FunctionAnalysisManager &) LLVM_DELETED_FUNCTION;
+  AnalysisManager(const AnalysisManager &) = delete;
+  AnalysisManager &operator=(const AnalysisManager &) = delete;
+
+  /// \brief Get an analysis result, running the pass if necessary.
+  ResultConceptT &getResultImpl(void *PassID, IRUnitT &IR) {
+    typename AnalysisResultMapT::iterator RI;
+    bool Inserted;
+    std::tie(RI, Inserted) = AnalysisResults.insert(std::make_pair(
+        std::make_pair(PassID, &IR), typename AnalysisResultListT::iterator()));
+
+    // If we don't have a cached result for this function, look up the pass and
+    // run it to produce a result, which we then add to the cache.
+    if (Inserted) {
+      auto &P = this->lookupPass(PassID);
+      if (DebugLogging)
+        dbgs() << "Running analysis: " << P.name() << "\n";
+      AnalysisResultListT &ResultList = AnalysisResultLists[&IR];
+      ResultList.emplace_back(PassID, P.run(IR, this));
+      RI->second = std::prev(ResultList.end());
+    }
 
-  /// \brief Get a function pass result, running the pass if necessary.
-  ResultConceptT &getResultImpl(void *PassID, Function *F);
+    return *RI->second->second;
+  }
 
-  /// \brief Get a cached function pass result or return null.
-  ResultConceptT *getCachedResultImpl(void *PassID, Function *F) const;
+  /// \brief Get a cached analysis result or return null.
+  ResultConceptT *getCachedResultImpl(void *PassID, IRUnitT &IR) const {
+    typename AnalysisResultMapT::const_iterator RI =
+        AnalysisResults.find(std::make_pair(PassID, &IR));
+    return RI == AnalysisResults.end() ? nullptr : &*RI->second->second;
+  }
 
   /// \brief Invalidate a function pass result.
-  void invalidateImpl(void *PassID, Function *F);
+  void invalidateImpl(void *PassID, IRUnitT &IR) {
+    typename AnalysisResultMapT::iterator RI =
+        AnalysisResults.find(std::make_pair(PassID, &IR));
+    if (RI == AnalysisResults.end())
+      return;
+
+    if (DebugLogging)
+      dbgs() << "Invalidating analysis: " << this->lookupPass(PassID).name()
+             << "\n";
+    AnalysisResultLists[&IR].erase(RI->second);
+    AnalysisResults.erase(RI);
+  }
 
   /// \brief Invalidate the results for a function..
-  void invalidateImpl(Function *F, const PreservedAnalyses &PA);
+  PreservedAnalyses invalidateImpl(IRUnitT &IR, PreservedAnalyses PA) {
+    // Short circuit for a common case of all analyses being preserved.
+    if (PA.areAllPreserved())
+      return std::move(PA);
+
+    if (DebugLogging)
+      dbgs() << "Invalidating all non-preserved analyses for: "
+             << IR.getName() << "\n";
+
+    // Clear all the invalidated results associated specifically with this
+    // function.
+    SmallVector<void *, 8> InvalidatedPassIDs;
+    AnalysisResultListT &ResultsList = AnalysisResultLists[&IR];
+    for (typename AnalysisResultListT::iterator I = ResultsList.begin(),
+                                                E = ResultsList.end();
+         I != E;) {
+      void *PassID = I->first;
+
+      // Pass the invalidation down to the pass itself to see if it thinks it is
+      // necessary. The analysis pass can return false if no action on the part
+      // of the analysis manager is required for this invalidation event.
+      if (I->second->invalidate(IR, PA)) {
+        if (DebugLogging)
+          dbgs() << "Invalidating analysis: " << this->lookupPass(PassID).name()
+                 << "\n";
+
+        InvalidatedPassIDs.push_back(I->first);
+        I = ResultsList.erase(I);
+      } else {
+        ++I;
+      }
+
+      // After handling each pass, we mark it as preserved. Once we've
+      // invalidated any stale results, the rest of the system is allowed to
+      // start preserving this analysis again.
+      PA.preserve(PassID);
+    }
+    while (!InvalidatedPassIDs.empty())
+      AnalysisResults.erase(
+          std::make_pair(InvalidatedPassIDs.pop_back_val(), &IR));
+    if (ResultsList.empty())
+      AnalysisResultLists.erase(&IR);
+
+    return std::move(PA);
+  }
 
   /// \brief List of function analysis pass IDs and associated concept pointers.
   ///
@@ -784,30 +551,37 @@ private:
   /// erases. Provides both the pass ID and concept pointer such that it is
   /// half of a bijection and provides storage for the actual result concept.
   typedef std::list<std::pair<
-      void *, std::unique_ptr<detail::AnalysisResultConcept<Function *>>>>
-          FunctionAnalysisResultListT;
+      void *, std::unique_ptr<detail::AnalysisResultConcept<IRUnitT>>>>
+      AnalysisResultListT;
 
   /// \brief Map type from function pointer to our custom list type.
-  typedef DenseMap<Function *, FunctionAnalysisResultListT>
-      FunctionAnalysisResultListMapT;
+  typedef DenseMap<IRUnitT *, AnalysisResultListT> AnalysisResultListMapT;
 
   /// \brief Map from function to a list of function analysis results.
   ///
   /// Provides linear time removal of all analysis results for a function and
   /// the ultimate storage for a particular cached analysis result.
-  FunctionAnalysisResultListMapT FunctionAnalysisResultLists;
+  AnalysisResultListMapT AnalysisResultLists;
 
   /// \brief Map type from a pair of analysis ID and function pointer to an
   /// iterator into a particular result list.
-  typedef DenseMap<std::pair<void *, Function *>,
-                   FunctionAnalysisResultListT::iterator>
-      FunctionAnalysisResultMapT;
+  typedef DenseMap<std::pair<void *, IRUnitT *>,
+                   typename AnalysisResultListT::iterator> AnalysisResultMapT;
 
   /// \brief Map from an analysis ID and function to a particular cached
   /// analysis result.
-  FunctionAnalysisResultMapT FunctionAnalysisResults;
+  AnalysisResultMapT AnalysisResults;
+
+  /// \brief A flag indicating whether debug logging is enabled.
+  bool DebugLogging;
 };
 
+/// \brief Convenience typedef for the Module analysis manager.
+typedef AnalysisManager<Module> ModuleAnalysisManager;
+
+/// \brief Convenience typedef for the Function analysis manager.
+typedef AnalysisManager<Function> FunctionAnalysisManager;
+
 /// \brief A module analysis which acts as a proxy for a function analysis
 /// manager.
 ///
@@ -822,6 +596,8 @@ public:
 
   static void *ID() { return (void *)&PassID; }
 
+  static StringRef name() { return "FunctionAnalysisManagerModuleProxy"; }
+
   explicit FunctionAnalysisManagerModuleProxy(FunctionAnalysisManager &FAM)
       : FAM(&FAM) {}
   // We have to explicitly define all the special member functions because MSVC
@@ -846,7 +622,7 @@ public:
   /// In debug builds, it will also assert that the analysis manager is empty
   /// as no queries should arrive at the function analysis manager prior to
   /// this analysis being requested.
-  Result run(Module *M);
+  Result run(Module &M);
 
 private:
   static char PassID;
@@ -884,7 +660,7 @@ public:
   /// Regardless of whether this analysis is marked as preserved, all of the
   /// analyses in the \c FunctionAnalysisManager are potentially invalidated
   /// based on the set of preserved analyses.
-  bool invalidate(Module *M, const PreservedAnalyses &PA);
+  bool invalidate(Module &M, const PreservedAnalyses &PA);
 
 private:
   FunctionAnalysisManager *FAM;
@@ -920,7 +696,7 @@ public:
     const ModuleAnalysisManager &getManager() const { return *MAM; }
 
     /// \brief Handle invalidation by ignoring it, this pass is immutable.
-    bool invalidate(Function *) { return false; }
+    bool invalidate(Function &) { return false; }
 
   private:
     const ModuleAnalysisManager *MAM;
@@ -928,6 +704,8 @@ public:
 
   static void *ID() { return (void *)&PassID; }
 
+  static StringRef name() { return "ModuleAnalysisManagerFunctionProxy"; }
+
   ModuleAnalysisManagerFunctionProxy(const ModuleAnalysisManager &MAM)
       : MAM(&MAM) {}
   // We have to explicitly define all the special member functions because MSVC
@@ -946,7 +724,7 @@ public:
   /// \brief Run the analysis pass and create our proxy result object.
   /// Nothing to see here, it just forwards the \c MAM reference into the
   /// result.
-  Result run(Function *) { return Result(*MAM); }
+  Result run(Function &) { return Result(*MAM); }
 
 private:
   static char PassID;
@@ -962,6 +740,20 @@ private:
 /// \c FunctionAnalysisManagerModuleProxy analysis prior to running the function
 /// pass over the module to enable a \c FunctionAnalysisManager to be used
 /// within this run safely.
+///
+/// Function passes run within this adaptor can rely on having exclusive access
+/// to the function they are run over. They should not read or modify any other
+/// functions! Other threads or systems may be manipulating other functions in
+/// the module, and so their state should never be relied on.
+/// FIXME: Make the above true for all of LLVM's actual passes, some still
+/// violate this principle.
+///
+/// Function passes can also read the module containing the function, but they
+/// should not modify that module outside of the use lists of various globals.
+/// For example, a function pass is not permitted to add functions to the
+/// module.
+/// FIXME: Make the above true for all of LLVM's actual passes, some still
+/// violate this principle.
 template <typename FunctionPassT> class ModuleToFunctionPassAdaptor {
 public:
   explicit ModuleToFunctionPassAdaptor(FunctionPassT Pass)
@@ -972,7 +764,8 @@ public:
       : Pass(Arg.Pass) {}
   ModuleToFunctionPassAdaptor(ModuleToFunctionPassAdaptor &&Arg)
       : Pass(std::move(Arg.Pass)) {}
-  friend void swap(ModuleToFunctionPassAdaptor &LHS, ModuleToFunctionPassAdaptor &RHS) {
+  friend void swap(ModuleToFunctionPassAdaptor &LHS,
+                   ModuleToFunctionPassAdaptor &RHS) {
     using std::swap;
     swap(LHS.Pass, RHS.Pass);
   }
@@ -982,21 +775,26 @@ public:
   }
 
   /// \brief Runs the function pass across every function in the module.
-  PreservedAnalyses run(Module *M, ModuleAnalysisManager *AM) {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager *AM) {
     FunctionAnalysisManager *FAM = nullptr;
     if (AM)
       // Setup the function analysis manager from its proxy.
       FAM = &AM->getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
 
     PreservedAnalyses PA = PreservedAnalyses::all();
-    for (Module::iterator I = M->begin(), E = M->end(); I != E; ++I) {
-      PreservedAnalyses PassPA = Pass.run(I, FAM);
+    for (Function &F : M) {
+      if (F.isDeclaration())
+        continue;
+
+      PreservedAnalyses PassPA = Pass.run(F, FAM);
 
       // We know that the function pass couldn't have invalidated any other
       // function's analyses (that's the contract of a function pass), so
-      // directly handle the function analysis manager's invalidation here.
+      // directly handle the function analysis manager's invalidation here and
+      // update our preserved set to reflect that these have already been
+      // handled.
       if (FAM)
-        FAM->invalidate(I, PassPA);
+        PassPA = FAM->invalidate(F, std::move(PassPA));
 
       // Then intersect the preserved set so that invalidation of module
       // analyses will eventually occur when the module pass completes.
@@ -1025,6 +823,66 @@ createModuleToFunctionPassAdaptor(FunctionPassT Pass) {
   return std::move(ModuleToFunctionPassAdaptor<FunctionPassT>(std::move(Pass)));
 }
 
+/// \brief A template utility pass to force an analysis result to be available.
+///
+/// This is a no-op pass which simply forces a specific analysis pass's result
+/// to be available when it is run.
+template <typename AnalysisT> struct RequireAnalysisPass {
+  /// \brief Run this pass over some unit of IR.
+  ///
+  /// This pass can be run over any unit of IR and use any analysis manager
+  /// provided they satisfy the basic API requirements. When this pass is
+  /// created, these methods can be instantiated to satisfy whatever the
+  /// context requires.
+  template <typename IRUnitT>
+  PreservedAnalyses run(IRUnitT &Arg, AnalysisManager<IRUnitT> *AM) {
+    if (AM)
+      (void)AM->template getResult<AnalysisT>(Arg);
+
+    return PreservedAnalyses::all();
+  }
+
+  static StringRef name() { return "RequireAnalysisPass"; }
+};
+
+/// \brief A template utility pass to force an analysis result to be
+/// invalidated.
+///
+/// This is a no-op pass which simply forces a specific analysis result to be
+/// invalidated when it is run.
+template <typename AnalysisT> struct InvalidateAnalysisPass {
+  /// \brief Run this pass over some unit of IR.
+  ///
+  /// This pass can be run over any unit of IR and use any analysis manager
+  /// provided they satisfy the basic API requirements. When this pass is
+  /// created, these methods can be instantiated to satisfy whatever the
+  /// context requires.
+  template <typename IRUnitT>
+  PreservedAnalyses run(IRUnitT &Arg, AnalysisManager<IRUnitT> *AM) {
+    if (AM)
+      // We have to directly invalidate the analysis result as we can't
+      // enumerate all other analyses and use the preserved set to control it.
+      (void)AM->template invalidate<AnalysisT>(Arg);
+
+    return PreservedAnalyses::all();
+  }
+
+  static StringRef name() { return "InvalidateAnalysisPass"; }
+};
+
+/// \brief A utility pass that does nothing but preserves no analyses.
+///
+/// As a consequence fo not preserving any analyses, this pass will force all
+/// analysis passes to be re-run to produce fresh results if any are needed.
+struct InvalidateAllAnalysesPass {
+  /// \brief Run this pass over some unit of IR.
+  template <typename IRUnitT> PreservedAnalyses run(IRUnitT &Arg) {
+    return PreservedAnalyses::none();
+  }
+
+  static StringRef name() { return "InvalidateAllAnalysesPass"; }
+};
+
 }
 
 #endif
diff --git a/include/llvm/IR/PassManagerInternal.h b/include/llvm/IR/PassManagerInternal.h
new file mode 100644
index 0000000..297f5f4
--- /dev/null
+++ b/include/llvm/IR/PassManagerInternal.h
@@ -0,0 +1,349 @@
+//===- PassManager internal APIs and implementation details -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This header provides internal APIs and implementation details used by the
+/// pass management interfaces exposed in PassManager.h. To understand more
+/// context of why these particular interfaces are needed, see that header
+/// file. None of these APIs should be used elsewhere.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_PASSMANAGERINTERNAL_H
+#define LLVM_IR_PASSMANAGERINTERNAL_H
+
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+
+template <typename IRUnitT> class AnalysisManager;
+class PreservedAnalyses;
+
+/// \brief Implementation details of the pass manager interfaces.
+namespace detail {
+
+/// \brief Template for the abstract base class used to dispatch
+/// polymorphically over pass objects.
+template <typename IRUnitT> struct PassConcept {
+  // Boiler plate necessary for the container of derived classes.
+  virtual ~PassConcept() {}
+
+  /// \brief The polymorphic API which runs the pass over a given IR entity.
+  ///
+  /// Note that actual pass object can omit the analysis manager argument if
+  /// desired. Also that the analysis manager may be null if there is no
+  /// analysis manager in the pass pipeline.
+  virtual PreservedAnalyses run(IRUnitT &IR, AnalysisManager<IRUnitT> *AM) = 0;
+
+  /// \brief Polymorphic method to access the name of a pass.
+  virtual StringRef name() = 0;
+};
+
+/// \brief SFINAE metafunction for computing whether \c PassT has a run method
+/// accepting an \c AnalysisManager<IRUnitT>.
+template <typename IRUnitT, typename PassT, typename ResultT>
+class PassRunAcceptsAnalysisManager {
+  typedef char SmallType;
+  struct BigType {
+    char a, b;
+  };
+
+  template <typename T, ResultT (T::*)(IRUnitT &, AnalysisManager<IRUnitT> *)>
+  struct Checker;
+
+  template <typename T> static SmallType f(Checker<T, &T::run> *);
+  template <typename T> static BigType f(...);
+
+public:
+  enum { Value = sizeof(f<PassT>(nullptr)) == sizeof(SmallType) };
+};
+
+/// \brief A template wrapper used to implement the polymorphic API.
+///
+/// Can be instantiated for any object which provides a \c run method accepting
+/// an \c IRUnitT. It requires the pass to be a copyable object. When the
+/// \c run method also accepts an \c AnalysisManager<IRUnitT>*, we pass it
+/// along.
+template <typename IRUnitT, typename PassT,
+          typename PreservedAnalysesT = PreservedAnalyses,
+          bool AcceptsAnalysisManager = PassRunAcceptsAnalysisManager<
+              IRUnitT, PassT, PreservedAnalysesT>::Value>
+struct PassModel;
+
+/// \brief Specialization of \c PassModel for passes that accept an analyis
+/// manager.
+template <typename IRUnitT, typename PassT, typename PreservedAnalysesT>
+struct PassModel<IRUnitT, PassT, PreservedAnalysesT, true>
+    : PassConcept<IRUnitT> {
+  explicit PassModel(PassT Pass) : Pass(std::move(Pass)) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  PassModel(const PassModel &Arg) : Pass(Arg.Pass) {}
+  PassModel(PassModel &&Arg) : Pass(std::move(Arg.Pass)) {}
+  friend void swap(PassModel &LHS, PassModel &RHS) {
+    using std::swap;
+    swap(LHS.Pass, RHS.Pass);
+  }
+  PassModel &operator=(PassModel RHS) {
+    swap(*this, RHS);
+    return *this;
+  }
+
+  PreservedAnalysesT run(IRUnitT &IR, AnalysisManager<IRUnitT> *AM) override {
+    return Pass.run(IR, AM);
+  }
+  StringRef name() override { return PassT::name(); }
+  PassT Pass;
+};
+
+/// \brief Specialization of \c PassModel for passes that accept an analyis
+/// manager.
+template <typename IRUnitT, typename PassT, typename PreservedAnalysesT>
+struct PassModel<IRUnitT, PassT, PreservedAnalysesT, false>
+    : PassConcept<IRUnitT> {
+  explicit PassModel(PassT Pass) : Pass(std::move(Pass)) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  PassModel(const PassModel &Arg) : Pass(Arg.Pass) {}
+  PassModel(PassModel &&Arg) : Pass(std::move(Arg.Pass)) {}
+  friend void swap(PassModel &LHS, PassModel &RHS) {
+    using std::swap;
+    swap(LHS.Pass, RHS.Pass);
+  }
+  PassModel &operator=(PassModel RHS) {
+    swap(*this, RHS);
+    return *this;
+  }
+
+  PreservedAnalysesT run(IRUnitT &IR, AnalysisManager<IRUnitT> *AM) override {
+    return Pass.run(IR);
+  }
+  StringRef name() override { return PassT::name(); }
+  PassT Pass;
+};
+
+/// \brief Abstract concept of an analysis result.
+///
+/// This concept is parameterized over the IR unit that this result pertains
+/// to.
+template <typename IRUnitT> struct AnalysisResultConcept {
+  virtual ~AnalysisResultConcept() {}
+
+  /// \brief Method to try and mark a result as invalid.
+  ///
+  /// When the outer analysis manager detects a change in some underlying
+  /// unit of the IR, it will call this method on all of the results cached.
+  ///
+  /// This method also receives a set of preserved analyses which can be used
+  /// to avoid invalidation because the pass which changed the underlying IR
+  /// took care to update or preserve the analysis result in some way.
+  ///
+  /// \returns true if the result is indeed invalid (the default).
+  virtual bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA) = 0;
+};
+
+/// \brief SFINAE metafunction for computing whether \c ResultT provides an
+/// \c invalidate member function.
+template <typename IRUnitT, typename ResultT> class ResultHasInvalidateMethod {
+  typedef char SmallType;
+  struct BigType {
+    char a, b;
+  };
+
+  template <typename T, bool (T::*)(IRUnitT &, const PreservedAnalyses &)>
+  struct Checker;
+
+  template <typename T> static SmallType f(Checker<T, &T::invalidate> *);
+  template <typename T> static BigType f(...);
+
+public:
+  enum { Value = sizeof(f<ResultT>(nullptr)) == sizeof(SmallType) };
+};
+
+/// \brief Wrapper to model the analysis result concept.
+///
+/// By default, this will implement the invalidate method with a trivial
+/// implementation so that the actual analysis result doesn't need to provide
+/// an invalidation handler. It is only selected when the invalidation handler
+/// is not part of the ResultT's interface.
+template <typename IRUnitT, typename PassT, typename ResultT,
+          typename PreservedAnalysesT = PreservedAnalyses,
+          bool HasInvalidateHandler =
+              ResultHasInvalidateMethod<IRUnitT, ResultT>::Value>
+struct AnalysisResultModel;
+
+/// \brief Specialization of \c AnalysisResultModel which provides the default
+/// invalidate functionality.
+template <typename IRUnitT, typename PassT, typename ResultT,
+          typename PreservedAnalysesT>
+struct AnalysisResultModel<IRUnitT, PassT, ResultT, PreservedAnalysesT, false>
+    : AnalysisResultConcept<IRUnitT> {
+  explicit AnalysisResultModel(ResultT Result) : Result(std::move(Result)) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  AnalysisResultModel(const AnalysisResultModel &Arg) : Result(Arg.Result) {}
+  AnalysisResultModel(AnalysisResultModel &&Arg)
+      : Result(std::move(Arg.Result)) {}
+  friend void swap(AnalysisResultModel &LHS, AnalysisResultModel &RHS) {
+    using std::swap;
+    swap(LHS.Result, RHS.Result);
+  }
+  AnalysisResultModel &operator=(AnalysisResultModel RHS) {
+    swap(*this, RHS);
+    return *this;
+  }
+
+  /// \brief The model bases invalidation solely on being in the preserved set.
+  //
+  // FIXME: We should actually use two different concepts for analysis results
+  // rather than two different models, and avoid the indirect function call for
+  // ones that use the trivial behavior.
+  bool invalidate(IRUnitT &, const PreservedAnalysesT &PA) override {
+    return !PA.preserved(PassT::ID());
+  }
+
+  ResultT Result;
+};
+
+/// \brief Specialization of \c AnalysisResultModel which delegates invalidate
+/// handling to \c ResultT.
+template <typename IRUnitT, typename PassT, typename ResultT,
+          typename PreservedAnalysesT>
+struct AnalysisResultModel<IRUnitT, PassT, ResultT, PreservedAnalysesT, true>
+    : AnalysisResultConcept<IRUnitT> {
+  explicit AnalysisResultModel(ResultT Result) : Result(std::move(Result)) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  AnalysisResultModel(const AnalysisResultModel &Arg) : Result(Arg.Result) {}
+  AnalysisResultModel(AnalysisResultModel &&Arg)
+      : Result(std::move(Arg.Result)) {}
+  friend void swap(AnalysisResultModel &LHS, AnalysisResultModel &RHS) {
+    using std::swap;
+    swap(LHS.Result, RHS.Result);
+  }
+  AnalysisResultModel &operator=(AnalysisResultModel RHS) {
+    swap(*this, RHS);
+    return *this;
+  }
+
+  /// \brief The model delegates to the \c ResultT method.
+  bool invalidate(IRUnitT &IR, const PreservedAnalysesT &PA) override {
+    return Result.invalidate(IR, PA);
+  }
+
+  ResultT Result;
+};
+
+/// \brief Abstract concept of an analysis pass.
+///
+/// This concept is parameterized over the IR unit that it can run over and
+/// produce an analysis result.
+template <typename IRUnitT> struct AnalysisPassConcept {
+  virtual ~AnalysisPassConcept() {}
+
+  /// \brief Method to run this analysis over a unit of IR.
+  /// \returns A unique_ptr to the analysis result object to be queried by
+  /// users.
+  virtual std::unique_ptr<AnalysisResultConcept<IRUnitT>>
+  run(IRUnitT &IR, AnalysisManager<IRUnitT> *AM) = 0;
+
+  /// \brief Polymorphic method to access the name of a pass.
+  virtual StringRef name() = 0;
+};
+
+/// \brief Wrapper to model the analysis pass concept.
+///
+/// Can wrap any type which implements a suitable \c run method. The method
+/// must accept the IRUnitT as an argument and produce an object which can be
+/// wrapped in a \c AnalysisResultModel.
+template <typename IRUnitT, typename PassT,
+          bool AcceptsAnalysisManager = PassRunAcceptsAnalysisManager<
+              IRUnitT, PassT, typename PassT::Result>::Value>
+struct AnalysisPassModel;
+
+/// \brief Specialization of \c AnalysisPassModel which passes an
+/// \c AnalysisManager to PassT's run method.
+template <typename IRUnitT, typename PassT>
+struct AnalysisPassModel<IRUnitT, PassT, true> : AnalysisPassConcept<IRUnitT> {
+  explicit AnalysisPassModel(PassT Pass) : Pass(std::move(Pass)) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  AnalysisPassModel(const AnalysisPassModel &Arg) : Pass(Arg.Pass) {}
+  AnalysisPassModel(AnalysisPassModel &&Arg) : Pass(std::move(Arg.Pass)) {}
+  friend void swap(AnalysisPassModel &LHS, AnalysisPassModel &RHS) {
+    using std::swap;
+    swap(LHS.Pass, RHS.Pass);
+  }
+  AnalysisPassModel &operator=(AnalysisPassModel RHS) {
+    swap(*this, RHS);
+    return *this;
+  }
+
+  // FIXME: Replace PassT::Result with type traits when we use C++11.
+  typedef AnalysisResultModel<IRUnitT, PassT, typename PassT::Result>
+      ResultModelT;
+
+  /// \brief The model delegates to the \c PassT::run method.
+  ///
+  /// The return is wrapped in an \c AnalysisResultModel.
+  std::unique_ptr<AnalysisResultConcept<IRUnitT>>
+  run(IRUnitT &IR, AnalysisManager<IRUnitT> *AM) override {
+    return make_unique<ResultModelT>(Pass.run(IR, AM));
+  }
+
+  /// \brief The model delegates to a static \c PassT::name method.
+  ///
+  /// The returned string ref must point to constant immutable data!
+  StringRef name() override { return PassT::name(); }
+
+  PassT Pass;
+};
+
+/// \brief Specialization of \c AnalysisPassModel which does not pass an
+/// \c AnalysisManager to PassT's run method.
+template <typename IRUnitT, typename PassT>
+struct AnalysisPassModel<IRUnitT, PassT, false> : AnalysisPassConcept<IRUnitT> {
+  explicit AnalysisPassModel(PassT Pass) : Pass(std::move(Pass)) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  AnalysisPassModel(const AnalysisPassModel &Arg) : Pass(Arg.Pass) {}
+  AnalysisPassModel(AnalysisPassModel &&Arg) : Pass(std::move(Arg.Pass)) {}
+  friend void swap(AnalysisPassModel &LHS, AnalysisPassModel &RHS) {
+    using std::swap;
+    swap(LHS.Pass, RHS.Pass);
+  }
+  AnalysisPassModel &operator=(AnalysisPassModel RHS) {
+    swap(*this, RHS);
+    return *this;
+  }
+
+  // FIXME: Replace PassT::Result with type traits when we use C++11.
+  typedef AnalysisResultModel<IRUnitT, PassT, typename PassT::Result>
+      ResultModelT;
+
+  /// \brief The model delegates to the \c PassT::run method.
+  ///
+  /// The return is wrapped in an \c AnalysisResultModel.
+  std::unique_ptr<AnalysisResultConcept<IRUnitT>>
+  run(IRUnitT &IR, AnalysisManager<IRUnitT> *) override {
+    return make_unique<ResultModelT>(Pass.run(IR));
+  }
+
+  /// \brief The model delegates to a static \c PassT::name method.
+  ///
+  /// The returned string ref must point to constant immutable data!
+  StringRef name() override { return PassT::name(); }
+
+  PassT Pass;
+};
+
+} // End namespace detail
+}
+
+#endif
diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h
index 4783062..f94e105 100644
--- a/include/llvm/IR/PatternMatch.h
+++ b/include/llvm/IR/PatternMatch.h
@@ -38,55 +38,58 @@
 namespace llvm {
 namespace PatternMatch {
 
-template<typename Val, typename Pattern>
-bool match(Val *V, const Pattern &P) {
-  return const_cast<Pattern&>(P).match(V);
+template <typename Val, typename Pattern> bool match(Val *V, const Pattern &P) {
+  return const_cast<Pattern &>(P).match(V);
 }
 
-
-template<typename SubPattern_t>
-struct OneUse_match {
+template <typename SubPattern_t> struct OneUse_match {
   SubPattern_t SubPattern;
 
   OneUse_match(const SubPattern_t &SP) : SubPattern(SP) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
+  template <typename OpTy> bool match(OpTy *V) {
     return V->hasOneUse() && SubPattern.match(V);
   }
 };
 
-template<typename T>
-inline OneUse_match<T> m_OneUse(const T &SubPattern) { return SubPattern; }
-
+template <typename T> inline OneUse_match<T> m_OneUse(const T &SubPattern) {
+  return SubPattern;
+}
 
-template<typename Class>
-struct class_match {
-  template<typename ITy>
-  bool match(ITy *V) { return isa<Class>(V); }
+template <typename Class> struct class_match {
+  template <typename ITy> bool match(ITy *V) { return isa<Class>(V); }
 };
 
-/// m_Value() - Match an arbitrary value and ignore it.
+/// \brief Match an arbitrary value and ignore it.
 inline class_match<Value> m_Value() { return class_match<Value>(); }
-/// m_ConstantInt() - Match an arbitrary ConstantInt and ignore it.
+
+/// \brief Match an arbitrary binary operation and ignore it.
+inline class_match<BinaryOperator> m_BinOp() {
+  return class_match<BinaryOperator>();
+}
+
+/// \brief Matches any compare instruction and ignore it.
+inline class_match<CmpInst> m_Cmp() { return class_match<CmpInst>(); }
+
+/// \brief Match an arbitrary ConstantInt and ignore it.
 inline class_match<ConstantInt> m_ConstantInt() {
   return class_match<ConstantInt>();
 }
-/// m_Undef() - Match an arbitrary undef constant.
+
+/// \brief Match an arbitrary undef constant.
 inline class_match<UndefValue> m_Undef() { return class_match<UndefValue>(); }
 
+/// \brief Match an arbitrary Constant and ignore it.
 inline class_match<Constant> m_Constant() { return class_match<Constant>(); }
 
 /// Matching combinators
-template<typename LTy, typename RTy>
-struct match_combine_or {
+template <typename LTy, typename RTy> struct match_combine_or {
   LTy L;
   RTy R;
 
-  match_combine_or(const LTy &Left, const RTy &Right) : L(Left), R(Right) { }
+  match_combine_or(const LTy &Left, const RTy &Right) : L(Left), R(Right) {}
 
-  template<typename ITy>
-  bool match(ITy *V) {
+  template <typename ITy> bool match(ITy *V) {
     if (L.match(V))
       return true;
     if (R.match(V))
@@ -95,15 +98,13 @@ struct match_combine_or {
   }
 };
 
-template<typename LTy, typename RTy>
-struct match_combine_and {
+template <typename LTy, typename RTy> struct match_combine_and {
   LTy L;
   RTy R;
 
-  match_combine_and(const LTy &Left, const RTy &Right) : L(Left), R(Right) { }
+  match_combine_and(const LTy &Left, const RTy &Right) : L(Left), R(Right) {}
 
-  template<typename ITy>
-  bool match(ITy *V) {
+  template <typename ITy> bool match(ITy *V) {
     if (L.match(V))
       if (R.match(V))
         return true;
@@ -112,46 +113,44 @@ struct match_combine_and {
 };
 
 /// Combine two pattern matchers matching L || R
-template<typename LTy, typename RTy>
+template <typename LTy, typename RTy>
 inline match_combine_or<LTy, RTy> m_CombineOr(const LTy &L, const RTy &R) {
   return match_combine_or<LTy, RTy>(L, R);
 }
 
 /// Combine two pattern matchers matching L && R
-template<typename LTy, typename RTy>
+template <typename LTy, typename RTy>
 inline match_combine_and<LTy, RTy> m_CombineAnd(const LTy &L, const RTy &R) {
   return match_combine_and<LTy, RTy>(L, R);
 }
 
 struct match_zero {
-  template<typename ITy>
-  bool match(ITy *V) {
-    if (const Constant *C = dyn_cast<Constant>(V))
+  template <typename ITy> bool match(ITy *V) {
+    if (const auto *C = dyn_cast<Constant>(V))
       return C->isNullValue();
     return false;
   }
 };
 
-/// m_Zero() - Match an arbitrary zero/null constant.  This includes
+/// \brief Match an arbitrary zero/null constant.  This includes
 /// zero_initializer for vectors and ConstantPointerNull for pointers.
 inline match_zero m_Zero() { return match_zero(); }
 
 struct match_neg_zero {
-  template<typename ITy>
-  bool match(ITy *V) {
-    if (const Constant *C = dyn_cast<Constant>(V))
+  template <typename ITy> bool match(ITy *V) {
+    if (const auto *C = dyn_cast<Constant>(V))
       return C->isNegativeZeroValue();
     return false;
   }
 };
 
-/// m_NegZero() - Match an arbitrary zero/null constant.  This includes
+/// \brief Match an arbitrary zero/null constant.  This includes
 /// zero_initializer for vectors and ConstantPointerNull for pointers. For
 /// floating point constants, this will match negative zero but not positive
 /// zero
 inline match_neg_zero m_NegZero() { return match_neg_zero(); }
 
-/// m_AnyZero() - Match an arbitrary zero/null constant.  This includes
+/// \brief - Match an arbitrary zero/null constant.  This includes
 /// zero_initializer for vectors and ConstantPointerNull for pointers. For
 /// floating point constants, this will match negative zero and positive zero
 inline match_combine_or<match_zero, match_neg_zero> m_AnyZero() {
@@ -161,16 +160,14 @@ inline match_combine_or<match_zero, match_neg_zero> m_AnyZero() {
 struct apint_match {
   const APInt *&Res;
   apint_match(const APInt *&R) : Res(R) {}
-  template<typename ITy>
-  bool match(ITy *V) {
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+  template <typename ITy> bool match(ITy *V) {
+    if (auto *CI = dyn_cast<ConstantInt>(V)) {
       Res = &CI->getValue();
       return true;
     }
     if (V->getType()->isVectorTy())
-      if (const Constant *C = dyn_cast<Constant>(V))
-        if (ConstantInt *CI =
-            dyn_cast_or_null<ConstantInt>(C->getSplatValue())) {
+      if (const auto *C = dyn_cast<Constant>(V))
+        if (auto *CI = dyn_cast_or_null<ConstantInt>(C->getSplatValue())) {
           Res = &CI->getValue();
           return true;
         }
@@ -178,16 +175,13 @@ struct apint_match {
   }
 };
 
-/// m_APInt - Match a ConstantInt or splatted ConstantVector, binding the
+/// \brief Match a ConstantInt or splatted ConstantVector, binding the
 /// specified pointer to the contained APInt.
 inline apint_match m_APInt(const APInt *&Res) { return Res; }
 
-
-template<int64_t Val>
-struct constantint_match {
-  template<typename ITy>
-  bool match(ITy *V) {
-    if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+template <int64_t Val> struct constantint_match {
+  template <typename ITy> bool match(ITy *V) {
+    if (const auto *CI = dyn_cast<ConstantInt>(V)) {
       const APInt &CIV = CI->getValue();
       if (Val >= 0)
         return CIV == static_cast<uint64_t>(Val);
@@ -200,45 +194,39 @@ struct constantint_match {
   }
 };
 
-/// m_ConstantInt<int64_t> - Match a ConstantInt with a specific value.
-template<int64_t Val>
-inline constantint_match<Val> m_ConstantInt() {
+/// \brief Match a ConstantInt with a specific value.
+template <int64_t Val> inline constantint_match<Val> m_ConstantInt() {
   return constantint_match<Val>();
 }
 
-/// cst_pred_ty - This helper class is used to match scalar and vector constants
-/// that satisfy a specified predicate.
-template<typename Predicate>
-struct cst_pred_ty : public Predicate {
-  template<typename ITy>
-  bool match(ITy *V) {
-    if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
+/// \brief This helper class is used to match scalar and vector constants that
+/// satisfy a specified predicate.
+template <typename Predicate> struct cst_pred_ty : public Predicate {
+  template <typename ITy> bool match(ITy *V) {
+    if (const auto *CI = dyn_cast<ConstantInt>(V))
       return this->isValue(CI->getValue());
     if (V->getType()->isVectorTy())
-      if (const Constant *C = dyn_cast<Constant>(V))
-        if (const ConstantInt *CI =
-            dyn_cast_or_null<ConstantInt>(C->getSplatValue()))
+      if (const auto *C = dyn_cast<Constant>(V))
+        if (const auto *CI = dyn_cast_or_null<ConstantInt>(C->getSplatValue()))
           return this->isValue(CI->getValue());
     return false;
   }
 };
 
-/// api_pred_ty - This helper class is used to match scalar and vector constants
-/// that satisfy a specified predicate, and bind them to an APInt.
-template<typename Predicate>
-struct api_pred_ty : public Predicate {
+/// \brief This helper class is used to match scalar and vector constants that
+/// satisfy a specified predicate, and bind them to an APInt.
+template <typename Predicate> struct api_pred_ty : public Predicate {
   const APInt *&Res;
   api_pred_ty(const APInt *&R) : Res(R) {}
-  template<typename ITy>
-  bool match(ITy *V) {
-    if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
+  template <typename ITy> bool match(ITy *V) {
+    if (const auto *CI = dyn_cast<ConstantInt>(V))
       if (this->isValue(CI->getValue())) {
         Res = &CI->getValue();
         return true;
       }
     if (V->getType()->isVectorTy())
-      if (const Constant *C = dyn_cast<Constant>(V))
-        if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C->getSplatValue()))
+      if (const auto *C = dyn_cast<Constant>(V))
+        if (auto *CI = dyn_cast_or_null<ConstantInt>(C->getSplatValue()))
           if (this->isValue(CI->getValue())) {
             Res = &CI->getValue();
             return true;
@@ -248,12 +236,11 @@ struct api_pred_ty : public Predicate {
   }
 };
 
-
 struct is_one {
   bool isValue(const APInt &C) { return C == 1; }
 };
 
-/// m_One() - Match an integer 1 or a vector with all elements equal to 1.
+/// \brief Match an integer 1 or a vector with all elements equal to 1.
 inline cst_pred_ty<is_one> m_One() { return cst_pred_ty<is_one>(); }
 inline api_pred_ty<is_one> m_One(const APInt *&V) { return V; }
 
@@ -261,34 +248,43 @@ struct is_all_ones {
   bool isValue(const APInt &C) { return C.isAllOnesValue(); }
 };
 
-/// m_AllOnes() - Match an integer or vector with all bits set to true.
-inline cst_pred_ty<is_all_ones> m_AllOnes() {return cst_pred_ty<is_all_ones>();}
+/// \brief Match an integer or vector with all bits set to true.
+inline cst_pred_ty<is_all_ones> m_AllOnes() {
+  return cst_pred_ty<is_all_ones>();
+}
 inline api_pred_ty<is_all_ones> m_AllOnes(const APInt *&V) { return V; }
 
 struct is_sign_bit {
   bool isValue(const APInt &C) { return C.isSignBit(); }
 };
 
-/// m_SignBit() - Match an integer or vector with only the sign bit(s) set.
-inline cst_pred_ty<is_sign_bit> m_SignBit() {return cst_pred_ty<is_sign_bit>();}
+/// \brief Match an integer or vector with only the sign bit(s) set.
+inline cst_pred_ty<is_sign_bit> m_SignBit() {
+  return cst_pred_ty<is_sign_bit>();
+}
 inline api_pred_ty<is_sign_bit> m_SignBit(const APInt *&V) { return V; }
 
 struct is_power2 {
   bool isValue(const APInt &C) { return C.isPowerOf2(); }
 };
 
-/// m_Power2() - Match an integer or vector power of 2.
+/// \brief Match an integer or vector power of 2.
 inline cst_pred_ty<is_power2> m_Power2() { return cst_pred_ty<is_power2>(); }
 inline api_pred_ty<is_power2> m_Power2(const APInt *&V) { return V; }
 
-template<typename Class>
-struct bind_ty {
+struct is_maxsignedvalue {
+  bool isValue(const APInt &C) { return C.isMaxSignedValue(); }
+};
+
+inline cst_pred_ty<is_maxsignedvalue> m_MaxSignedValue() { return cst_pred_ty<is_maxsignedvalue>(); }
+inline api_pred_ty<is_maxsignedvalue> m_MaxSignedValue(const APInt *&V) { return V; }
+
+template <typename Class> struct bind_ty {
   Class *&VR;
   bind_ty(Class *&V) : VR(V) {}
 
-  template<typename ITy>
-  bool match(ITy *V) {
-    if (Class *CV = dyn_cast<Class>(V)) {
+  template <typename ITy> bool match(ITy *V) {
+    if (auto *CV = dyn_cast<Class>(V)) {
       VR = CV;
       return true;
     }
@@ -296,64 +292,62 @@ struct bind_ty {
   }
 };
 
-/// m_Value - Match a value, capturing it if we match.
+/// \brief Match a value, capturing it if we match.
 inline bind_ty<Value> m_Value(Value *&V) { return V; }
 
-/// m_ConstantInt - Match a ConstantInt, capturing the value if we match.
+/// \brief Match a binary operator, capturing it if we match.
+inline bind_ty<BinaryOperator> m_BinOp(BinaryOperator *&I) { return I; }
+
+/// \brief Match a ConstantInt, capturing the value if we match.
 inline bind_ty<ConstantInt> m_ConstantInt(ConstantInt *&CI) { return CI; }
 
-/// m_Constant - Match a Constant, capturing the value if we match.
+/// \brief Match a Constant, capturing the value if we match.
 inline bind_ty<Constant> m_Constant(Constant *&C) { return C; }
 
-/// m_ConstantFP - Match a ConstantFP, capturing the value if we match.
+/// \brief Match a ConstantFP, capturing the value if we match.
 inline bind_ty<ConstantFP> m_ConstantFP(ConstantFP *&C) { return C; }
 
-/// specificval_ty - Match a specified Value*.
+/// \brief Match a specified Value*.
 struct specificval_ty {
   const Value *Val;
   specificval_ty(const Value *V) : Val(V) {}
 
-  template<typename ITy>
-  bool match(ITy *V) {
-    return V == Val;
-  }
+  template <typename ITy> bool match(ITy *V) { return V == Val; }
 };
 
-/// m_Specific - Match if we have a specific specified value.
+/// \brief Match if we have a specific specified value.
 inline specificval_ty m_Specific(const Value *V) { return V; }
 
-/// Match a specified floating point value or vector of all elements of that
-/// value.
+/// \brief Match a specified floating point value or vector of all elements of
+/// that value.
 struct specific_fpval {
   double Val;
   specific_fpval(double V) : Val(V) {}
 
-  template<typename ITy>
-  bool match(ITy *V) {
-    if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
+  template <typename ITy> bool match(ITy *V) {
+    if (const auto *CFP = dyn_cast<ConstantFP>(V))
       return CFP->isExactlyValue(Val);
     if (V->getType()->isVectorTy())
-      if (const Constant *C = dyn_cast<Constant>(V))
-        if (ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getSplatValue()))
+      if (const auto *C = dyn_cast<Constant>(V))
+        if (auto *CFP = dyn_cast_or_null<ConstantFP>(C->getSplatValue()))
           return CFP->isExactlyValue(Val);
     return false;
   }
 };
 
-/// Match a specific floating point value or vector with all elements equal to
-/// the value.
+/// \brief Match a specific floating point value or vector with all elements
+/// equal to the value.
 inline specific_fpval m_SpecificFP(double V) { return specific_fpval(V); }
 
-/// Match a float 1.0 or vector with all elements equal to 1.0.
+/// \brief Match a float 1.0 or vector with all elements equal to 1.0.
 inline specific_fpval m_FPOne() { return m_SpecificFP(1.0); }
 
 struct bind_const_intval_ty {
   uint64_t &VR;
   bind_const_intval_ty(uint64_t &V) : VR(V) {}
 
-  template<typename ITy>
-  bool match(ITy *V) {
-    if (ConstantInt *CV = dyn_cast<ConstantInt>(V))
+  template <typename ITy> bool match(ITy *V) {
+    if (const auto *CV = dyn_cast<ConstantInt>(V))
       if (CV->getBitWidth() <= 64) {
         VR = CV->getZExtValue();
         return true;
@@ -362,14 +356,14 @@ struct bind_const_intval_ty {
   }
 };
 
-/// Match a specified integer value or vector of all elements of that value.
+/// \brief Match a specified integer value or vector of all elements of that
+// value.
 struct specific_intval {
   uint64_t Val;
   specific_intval(uint64_t V) : Val(V) {}
 
-  template<typename ITy>
-  bool match(ITy *V) {
-    ConstantInt *CI = dyn_cast<ConstantInt>(V);
+  template <typename ITy> bool match(ITy *V) {
+    const auto *CI = dyn_cast<ConstantInt>(V);
     if (!CI && V->getType()->isVectorTy())
       if (const auto *C = dyn_cast<Constant>(V))
         CI = dyn_cast_or_null<ConstantInt>(C->getSplatValue());
@@ -381,156 +375,177 @@ struct specific_intval {
   }
 };
 
-/// Match a specific integer value or vector with all elements equal to the
-/// value.
+/// \brief Match a specific integer value or vector with all elements equal to
+/// the value.
 inline specific_intval m_SpecificInt(uint64_t V) { return specific_intval(V); }
 
-/// m_ConstantInt - Match a ConstantInt and bind to its value.  This does not
-/// match ConstantInts wider than 64-bits.
+/// \brief Match a ConstantInt and bind to its value.  This does not match
+/// ConstantInts wider than 64-bits.
 inline bind_const_intval_ty m_ConstantInt(uint64_t &V) { return V; }
 
 //===----------------------------------------------------------------------===//
+// Matcher for any binary operator.
+//
+template <typename LHS_t, typename RHS_t> struct AnyBinaryOp_match {
+  LHS_t L;
+  RHS_t R;
+
+  AnyBinaryOp_match(const LHS_t &LHS, const RHS_t &RHS) : L(LHS), R(RHS) {}
+
+  template <typename OpTy> bool match(OpTy *V) {
+    if (auto *I = dyn_cast<BinaryOperator>(V))
+      return L.match(I->getOperand(0)) && R.match(I->getOperand(1));
+    return false;
+  }
+};
+
+template <typename LHS, typename RHS>
+inline AnyBinaryOp_match<LHS, RHS> m_BinOp(const LHS &L, const RHS &R) {
+  return AnyBinaryOp_match<LHS, RHS>(L, R);
+}
+
+//===----------------------------------------------------------------------===//
 // Matchers for specific binary operators.
 //
 
-template<typename LHS_t, typename RHS_t, unsigned Opcode>
+template <typename LHS_t, typename RHS_t, unsigned Opcode>
 struct BinaryOp_match {
   LHS_t L;
   RHS_t R;
 
   BinaryOp_match(const LHS_t &LHS, const RHS_t &RHS) : L(LHS), R(RHS) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
+  template <typename OpTy> bool match(OpTy *V) {
     if (V->getValueID() == Value::InstructionVal + Opcode) {
-      BinaryOperator *I = cast<BinaryOperator>(V);
+      auto *I = cast<BinaryOperator>(V);
       return L.match(I->getOperand(0)) && R.match(I->getOperand(1));
     }
-    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (auto *CE = dyn_cast<ConstantExpr>(V))
       return CE->getOpcode() == Opcode && L.match(CE->getOperand(0)) &&
              R.match(CE->getOperand(1));
     return false;
   }
 };
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::Add>
-m_Add(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::Add> m_Add(const LHS &L,
+                                                        const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::Add>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::FAdd>
-m_FAdd(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::FAdd> m_FAdd(const LHS &L,
+                                                          const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::FAdd>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::Sub>
-m_Sub(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::Sub> m_Sub(const LHS &L,
+                                                        const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::Sub>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::FSub>
-m_FSub(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::FSub> m_FSub(const LHS &L,
+                                                          const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::FSub>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::Mul>
-m_Mul(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::Mul> m_Mul(const LHS &L,
+                                                        const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::Mul>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::FMul>
-m_FMul(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::FMul> m_FMul(const LHS &L,
+                                                          const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::FMul>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::UDiv>
-m_UDiv(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::UDiv> m_UDiv(const LHS &L,
+                                                          const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::UDiv>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::SDiv>
-m_SDiv(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::SDiv> m_SDiv(const LHS &L,
+                                                          const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::SDiv>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::FDiv>
-m_FDiv(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::FDiv> m_FDiv(const LHS &L,
+                                                          const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::FDiv>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::URem>
-m_URem(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::URem> m_URem(const LHS &L,
+                                                          const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::URem>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::SRem>
-m_SRem(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::SRem> m_SRem(const LHS &L,
+                                                          const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::SRem>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::FRem>
-m_FRem(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::FRem> m_FRem(const LHS &L,
+                                                          const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::FRem>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::And>
-m_And(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::And> m_And(const LHS &L,
+                                                        const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::And>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::Or>
-m_Or(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::Or> m_Or(const LHS &L,
+                                                      const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::Or>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::Xor>
-m_Xor(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::Xor> m_Xor(const LHS &L,
+                                                        const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::Xor>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::Shl>
-m_Shl(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::Shl> m_Shl(const LHS &L,
+                                                        const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::Shl>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::LShr>
-m_LShr(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::LShr> m_LShr(const LHS &L,
+                                                          const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::LShr>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, Instruction::AShr>
-m_AShr(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, Instruction::AShr> m_AShr(const LHS &L,
+                                                          const RHS &R) {
   return BinaryOp_match<LHS, RHS, Instruction::AShr>(L, R);
 }
 
-template<typename LHS_t, typename RHS_t, unsigned Opcode, unsigned WrapFlags = 0>
+template <typename LHS_t, typename RHS_t, unsigned Opcode,
+          unsigned WrapFlags = 0>
 struct OverflowingBinaryOp_match {
   LHS_t L;
   RHS_t R;
 
-  OverflowingBinaryOp_match(const LHS_t &LHS, const RHS_t &RHS) : L(LHS), R(RHS) {}
+  OverflowingBinaryOp_match(const LHS_t &LHS, const RHS_t &RHS)
+      : L(LHS), R(RHS) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
-    if (OverflowingBinaryOperator *Op = dyn_cast<OverflowingBinaryOperator>(V)) {
+  template <typename OpTy> bool match(OpTy *V) {
+    if (auto *Op = dyn_cast<OverflowingBinaryOperator>(V)) {
       if (Op->getOpcode() != Opcode)
         return false;
       if (WrapFlags & OverflowingBinaryOperator::NoUnsignedWrap &&
@@ -614,43 +629,42 @@ m_NUWShl(const LHS &L, const RHS &R) {
 //===----------------------------------------------------------------------===//
 // Class that matches two different binary ops.
 //
-template<typename LHS_t, typename RHS_t, unsigned Opc1, unsigned Opc2>
+template <typename LHS_t, typename RHS_t, unsigned Opc1, unsigned Opc2>
 struct BinOp2_match {
   LHS_t L;
   RHS_t R;
 
   BinOp2_match(const LHS_t &LHS, const RHS_t &RHS) : L(LHS), R(RHS) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
+  template <typename OpTy> bool match(OpTy *V) {
     if (V->getValueID() == Value::InstructionVal + Opc1 ||
         V->getValueID() == Value::InstructionVal + Opc2) {
-      BinaryOperator *I = cast<BinaryOperator>(V);
+      auto *I = cast<BinaryOperator>(V);
       return L.match(I->getOperand(0)) && R.match(I->getOperand(1));
     }
-    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (auto *CE = dyn_cast<ConstantExpr>(V))
       return (CE->getOpcode() == Opc1 || CE->getOpcode() == Opc2) &&
              L.match(CE->getOperand(0)) && R.match(CE->getOperand(1));
     return false;
   }
 };
 
-/// m_Shr - Matches LShr or AShr.
-template<typename LHS, typename RHS>
+/// \brief Matches LShr or AShr.
+template <typename LHS, typename RHS>
 inline BinOp2_match<LHS, RHS, Instruction::LShr, Instruction::AShr>
 m_Shr(const LHS &L, const RHS &R) {
   return BinOp2_match<LHS, RHS, Instruction::LShr, Instruction::AShr>(L, R);
 }
 
-/// m_LogicalShift - Matches LShr or Shl.
-template<typename LHS, typename RHS>
+/// \brief Matches LShr or Shl.
+template <typename LHS, typename RHS>
 inline BinOp2_match<LHS, RHS, Instruction::LShr, Instruction::Shl>
 m_LogicalShift(const LHS &L, const RHS &R) {
   return BinOp2_match<LHS, RHS, Instruction::LShr, Instruction::Shl>(L, R);
 }
 
-/// m_IDiv - Matches UDiv and SDiv.
-template<typename LHS, typename RHS>
+/// \brief Matches UDiv and SDiv.
+template <typename LHS, typename RHS>
 inline BinOp2_match<LHS, RHS, Instruction::SDiv, Instruction::UDiv>
 m_IDiv(const LHS &L, const RHS &R) {
   return BinOp2_match<LHS, RHS, Instruction::SDiv, Instruction::UDiv>(L, R);
@@ -659,38 +673,36 @@ m_IDiv(const LHS &L, const RHS &R) {
 //===----------------------------------------------------------------------===//
 // Class that matches exact binary ops.
 //
-template<typename SubPattern_t>
-struct Exact_match {
+template <typename SubPattern_t> struct Exact_match {
   SubPattern_t SubPattern;
 
   Exact_match(const SubPattern_t &SP) : SubPattern(SP) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
+  template <typename OpTy> bool match(OpTy *V) {
     if (PossiblyExactOperator *PEO = dyn_cast<PossiblyExactOperator>(V))
       return PEO->isExact() && SubPattern.match(V);
     return false;
   }
 };
 
-template<typename T>
-inline Exact_match<T> m_Exact(const T &SubPattern) { return SubPattern; }
+template <typename T> inline Exact_match<T> m_Exact(const T &SubPattern) {
+  return SubPattern;
+}
 
 //===----------------------------------------------------------------------===//
 // Matchers for CmpInst classes
 //
 
-template<typename LHS_t, typename RHS_t, typename Class, typename PredicateTy>
+template <typename LHS_t, typename RHS_t, typename Class, typename PredicateTy>
 struct CmpClass_match {
   PredicateTy &Predicate;
   LHS_t L;
   RHS_t R;
 
   CmpClass_match(PredicateTy &Pred, const LHS_t &LHS, const RHS_t &RHS)
-    : Predicate(Pred), L(LHS), R(RHS) {}
+      : Predicate(Pred), L(LHS), R(RHS) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
+  template <typename OpTy> bool match(OpTy *V) {
     if (Class *I = dyn_cast<Class>(V))
       if (L.match(I->getOperand(0)) && R.match(I->getOperand(1))) {
         Predicate = I->getPredicate();
@@ -700,123 +712,114 @@ struct CmpClass_match {
   }
 };
 
-template<typename LHS, typename RHS>
+template <typename LHS, typename RHS>
+inline CmpClass_match<LHS, RHS, CmpInst, CmpInst::Predicate>
+m_Cmp(CmpInst::Predicate &Pred, const LHS &L, const RHS &R) {
+  return CmpClass_match<LHS, RHS, CmpInst, CmpInst::Predicate>(Pred, L, R);
+}
+
+template <typename LHS, typename RHS>
 inline CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate>
 m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R) {
-  return CmpClass_match<LHS, RHS,
-                        ICmpInst, ICmpInst::Predicate>(Pred, L, R);
+  return CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate>(Pred, L, R);
 }
 
-template<typename LHS, typename RHS>
+template <typename LHS, typename RHS>
 inline CmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate>
 m_FCmp(FCmpInst::Predicate &Pred, const LHS &L, const RHS &R) {
-  return CmpClass_match<LHS, RHS,
-                        FCmpInst, FCmpInst::Predicate>(Pred, L, R);
+  return CmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate>(Pred, L, R);
 }
 
 //===----------------------------------------------------------------------===//
 // Matchers for SelectInst classes
 //
 
-template<typename Cond_t, typename LHS_t, typename RHS_t>
+template <typename Cond_t, typename LHS_t, typename RHS_t>
 struct SelectClass_match {
   Cond_t C;
   LHS_t L;
   RHS_t R;
 
-  SelectClass_match(const Cond_t &Cond, const LHS_t &LHS,
-                    const RHS_t &RHS)
-    : C(Cond), L(LHS), R(RHS) {}
+  SelectClass_match(const Cond_t &Cond, const LHS_t &LHS, const RHS_t &RHS)
+      : C(Cond), L(LHS), R(RHS) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
-    if (SelectInst *I = dyn_cast<SelectInst>(V))
-      return C.match(I->getOperand(0)) &&
-             L.match(I->getOperand(1)) &&
+  template <typename OpTy> bool match(OpTy *V) {
+    if (auto *I = dyn_cast<SelectInst>(V))
+      return C.match(I->getOperand(0)) && L.match(I->getOperand(1)) &&
              R.match(I->getOperand(2));
     return false;
   }
 };
 
-template<typename Cond, typename LHS, typename RHS>
-inline SelectClass_match<Cond, LHS, RHS>
-m_Select(const Cond &C, const LHS &L, const RHS &R) {
+template <typename Cond, typename LHS, typename RHS>
+inline SelectClass_match<Cond, LHS, RHS> m_Select(const Cond &C, const LHS &L,
+                                                  const RHS &R) {
   return SelectClass_match<Cond, LHS, RHS>(C, L, R);
 }
 
-/// m_SelectCst - This matches a select of two constants, e.g.:
-///    m_SelectCst<-1, 0>(m_Value(V))
-template<int64_t L, int64_t R, typename Cond>
-inline SelectClass_match<Cond, constantint_match<L>, constantint_match<R> >
+/// \brief This matches a select of two constants, e.g.:
+/// m_SelectCst<-1, 0>(m_Value(V))
+template <int64_t L, int64_t R, typename Cond>
+inline SelectClass_match<Cond, constantint_match<L>, constantint_match<R>>
 m_SelectCst(const Cond &C) {
   return m_Select(C, m_ConstantInt<L>(), m_ConstantInt<R>());
 }
 
-
 //===----------------------------------------------------------------------===//
 // Matchers for CastInst classes
 //
 
-template<typename Op_t, unsigned Opcode>
-struct CastClass_match {
+template <typename Op_t, unsigned Opcode> struct CastClass_match {
   Op_t Op;
 
   CastClass_match(const Op_t &OpMatch) : Op(OpMatch) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
-    if (Operator *O = dyn_cast<Operator>(V))
+  template <typename OpTy> bool match(OpTy *V) {
+    if (auto *O = dyn_cast<Operator>(V))
       return O->getOpcode() == Opcode && Op.match(O->getOperand(0));
     return false;
   }
 };
 
-/// m_BitCast
-template<typename OpTy>
-inline CastClass_match<OpTy, Instruction::BitCast>
-m_BitCast(const OpTy &Op) {
+/// \brief Matches BitCast.
+template <typename OpTy>
+inline CastClass_match<OpTy, Instruction::BitCast> m_BitCast(const OpTy &Op) {
   return CastClass_match<OpTy, Instruction::BitCast>(Op);
 }
 
-/// m_PtrToInt
-template<typename OpTy>
-inline CastClass_match<OpTy, Instruction::PtrToInt>
-m_PtrToInt(const OpTy &Op) {
+/// \brief Matches PtrToInt.
+template <typename OpTy>
+inline CastClass_match<OpTy, Instruction::PtrToInt> m_PtrToInt(const OpTy &Op) {
   return CastClass_match<OpTy, Instruction::PtrToInt>(Op);
 }
 
-/// m_Trunc
-template<typename OpTy>
-inline CastClass_match<OpTy, Instruction::Trunc>
-m_Trunc(const OpTy &Op) {
+/// \brief Matches Trunc.
+template <typename OpTy>
+inline CastClass_match<OpTy, Instruction::Trunc> m_Trunc(const OpTy &Op) {
   return CastClass_match<OpTy, Instruction::Trunc>(Op);
 }
 
-/// m_SExt
-template<typename OpTy>
-inline CastClass_match<OpTy, Instruction::SExt>
-m_SExt(const OpTy &Op) {
+/// \brief Matches SExt.
+template <typename OpTy>
+inline CastClass_match<OpTy, Instruction::SExt> m_SExt(const OpTy &Op) {
   return CastClass_match<OpTy, Instruction::SExt>(Op);
 }
 
-/// m_ZExt
-template<typename OpTy>
-inline CastClass_match<OpTy, Instruction::ZExt>
-m_ZExt(const OpTy &Op) {
+/// \brief Matches ZExt.
+template <typename OpTy>
+inline CastClass_match<OpTy, Instruction::ZExt> m_ZExt(const OpTy &Op) {
   return CastClass_match<OpTy, Instruction::ZExt>(Op);
 }
 
-/// m_UIToFP
-template<typename OpTy>
-inline CastClass_match<OpTy, Instruction::UIToFP>
-m_UIToFP(const OpTy &Op) {
+/// \brief Matches UIToFP.
+template <typename OpTy>
+inline CastClass_match<OpTy, Instruction::UIToFP> m_UIToFP(const OpTy &Op) {
   return CastClass_match<OpTy, Instruction::UIToFP>(Op);
 }
 
-/// m_SIToFP
-template<typename OpTy>
-inline CastClass_match<OpTy, Instruction::SIToFP>
-m_SIToFP(const OpTy &Op) {
+/// \brief Matches SIToFP.
+template <typename OpTy>
+inline CastClass_match<OpTy, Instruction::SIToFP> m_SIToFP(const OpTy &Op) {
   return CastClass_match<OpTy, Instruction::SIToFP>(Op);
 }
 
@@ -824,46 +827,41 @@ m_SIToFP(const OpTy &Op) {
 // Matchers for unary operators
 //
 
-template<typename LHS_t>
-struct not_match {
+template <typename LHS_t> struct not_match {
   LHS_t L;
 
   not_match(const LHS_t &LHS) : L(LHS) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
-    if (Operator *O = dyn_cast<Operator>(V))
+  template <typename OpTy> bool match(OpTy *V) {
+    if (auto *O = dyn_cast<Operator>(V))
       if (O->getOpcode() == Instruction::Xor)
         return matchIfNot(O->getOperand(0), O->getOperand(1));
     return false;
   }
+
 private:
   bool matchIfNot(Value *LHS, Value *RHS) {
     return (isa<ConstantInt>(RHS) || isa<ConstantDataVector>(RHS) ||
             // FIXME: Remove CV.
             isa<ConstantVector>(RHS)) &&
-           cast<Constant>(RHS)->isAllOnesValue() &&
-           L.match(LHS);
+           cast<Constant>(RHS)->isAllOnesValue() && L.match(LHS);
   }
 };
 
-template<typename LHS>
-inline not_match<LHS> m_Not(const LHS &L) { return L; }
+template <typename LHS> inline not_match<LHS> m_Not(const LHS &L) { return L; }
 
-
-template<typename LHS_t>
-struct neg_match {
+template <typename LHS_t> struct neg_match {
   LHS_t L;
 
   neg_match(const LHS_t &LHS) : L(LHS) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
-    if (Operator *O = dyn_cast<Operator>(V))
+  template <typename OpTy> bool match(OpTy *V) {
+    if (auto *O = dyn_cast<Operator>(V))
       if (O->getOpcode() == Instruction::Sub)
         return matchIfNeg(O->getOperand(0), O->getOperand(1));
     return false;
   }
+
 private:
   bool matchIfNeg(Value *LHS, Value *RHS) {
     return ((isa<ConstantInt>(LHS) && cast<ConstantInt>(LHS)->isZero()) ||
@@ -872,36 +870,33 @@ private:
   }
 };
 
-/// m_Neg - Match an integer negate.
-template<typename LHS>
-inline neg_match<LHS> m_Neg(const LHS &L) { return L; }
-
+/// \brief Match an integer negate.
+template <typename LHS> inline neg_match<LHS> m_Neg(const LHS &L) { return L; }
 
-template<typename LHS_t>
-struct fneg_match {
+template <typename LHS_t> struct fneg_match {
   LHS_t L;
 
   fneg_match(const LHS_t &LHS) : L(LHS) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
-    if (Operator *O = dyn_cast<Operator>(V))
+  template <typename OpTy> bool match(OpTy *V) {
+    if (auto *O = dyn_cast<Operator>(V))
       if (O->getOpcode() == Instruction::FSub)
         return matchIfFNeg(O->getOperand(0), O->getOperand(1));
     return false;
   }
+
 private:
   bool matchIfFNeg(Value *LHS, Value *RHS) {
-    if (ConstantFP *C = dyn_cast<ConstantFP>(LHS))
+    if (const auto *C = dyn_cast<ConstantFP>(LHS))
       return C->isNegativeZeroValue() && L.match(RHS);
     return false;
   }
 };
 
-/// m_FNeg - Match a floating point negate.
-template<typename LHS>
-inline fneg_match<LHS> m_FNeg(const LHS &L) { return L; }
-
+/// \brief Match a floating point negate.
+template <typename LHS> inline fneg_match<LHS> m_FNeg(const LHS &L) {
+  return L;
+}
 
 //===----------------------------------------------------------------------===//
 // Matchers for control flow.
@@ -909,13 +904,10 @@ inline fneg_match<LHS> m_FNeg(const LHS &L) { return L; }
 
 struct br_match {
   BasicBlock *&Succ;
-  br_match(BasicBlock *&Succ)
-    : Succ(Succ) {
-  }
+  br_match(BasicBlock *&Succ) : Succ(Succ) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
-    if (BranchInst *BI = dyn_cast<BranchInst>(V))
+  template <typename OpTy> bool match(OpTy *V) {
+    if (auto *BI = dyn_cast<BranchInst>(V))
       if (BI->isUnconditional()) {
         Succ = BI->getSuccessor(0);
         return true;
@@ -926,17 +918,14 @@ struct br_match {
 
 inline br_match m_UnconditionalBr(BasicBlock *&Succ) { return br_match(Succ); }
 
-template<typename Cond_t>
-struct brc_match {
+template <typename Cond_t> struct brc_match {
   Cond_t Cond;
   BasicBlock *&T, *&F;
   brc_match(const Cond_t &C, BasicBlock *&t, BasicBlock *&f)
-    : Cond(C), T(t), F(f) {
-  }
+      : Cond(C), T(t), F(f) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
-    if (BranchInst *BI = dyn_cast<BranchInst>(V))
+  template <typename OpTy> bool match(OpTy *V) {
+    if (auto *BI = dyn_cast<BranchInst>(V))
       if (BI->isConditional() && Cond.match(BI->getCondition())) {
         T = BI->getSuccessor(0);
         F = BI->getSuccessor(1);
@@ -946,31 +935,28 @@ struct brc_match {
   }
 };
 
-template<typename Cond_t>
+template <typename Cond_t>
 inline brc_match<Cond_t> m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F) {
   return brc_match<Cond_t>(C, T, F);
 }
 
-
 //===----------------------------------------------------------------------===//
 // Matchers for max/min idioms, eg: "select (sgt x, y), x, y" -> smax(x,y).
 //
 
-template<typename CmpInst_t, typename LHS_t, typename RHS_t, typename Pred_t>
+template <typename CmpInst_t, typename LHS_t, typename RHS_t, typename Pred_t>
 struct MaxMin_match {
   LHS_t L;
   RHS_t R;
 
-  MaxMin_match(const LHS_t &LHS, const RHS_t &RHS)
-    : L(LHS), R(RHS) {}
+  MaxMin_match(const LHS_t &LHS, const RHS_t &RHS) : L(LHS), R(RHS) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
+  template <typename OpTy> bool match(OpTy *V) {
     // Look for "(x pred y) ? x : y" or "(x pred y) ? y : x".
-    SelectInst *SI = dyn_cast<SelectInst>(V);
+    auto *SI = dyn_cast<SelectInst>(V);
     if (!SI)
       return false;
-    CmpInst_t *Cmp = dyn_cast<CmpInst_t>(SI->getCondition());
+    auto *Cmp = dyn_cast<CmpInst_t>(SI->getCondition());
     if (!Cmp)
       return false;
     // At this point we have a select conditioned on a comparison.  Check that
@@ -982,8 +968,8 @@ struct MaxMin_match {
     if ((TrueVal != LHS || FalseVal != RHS) &&
         (TrueVal != RHS || FalseVal != LHS))
       return false;
-    typename CmpInst_t::Predicate Pred = LHS == TrueVal ?
-      Cmp->getPredicate() : Cmp->getSwappedPredicate();
+    typename CmpInst_t::Predicate Pred =
+        LHS == TrueVal ? Cmp->getPredicate() : Cmp->getSwappedPredicate();
     // Does "(x pred y) ? x : y" represent the desired max/min operation?
     if (!Pred_t::match(Pred))
       return false;
@@ -992,83 +978,83 @@ struct MaxMin_match {
   }
 };
 
-/// smax_pred_ty - Helper class for identifying signed max predicates.
+/// \brief Helper class for identifying signed max predicates.
 struct smax_pred_ty {
   static bool match(ICmpInst::Predicate Pred) {
     return Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE;
   }
 };
 
-/// smin_pred_ty - Helper class for identifying signed min predicates.
+/// \brief Helper class for identifying signed min predicates.
 struct smin_pred_ty {
   static bool match(ICmpInst::Predicate Pred) {
     return Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_SLE;
   }
 };
 
-/// umax_pred_ty - Helper class for identifying unsigned max predicates.
+/// \brief Helper class for identifying unsigned max predicates.
 struct umax_pred_ty {
   static bool match(ICmpInst::Predicate Pred) {
     return Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_UGE;
   }
 };
 
-/// umin_pred_ty - Helper class for identifying unsigned min predicates.
+/// \brief Helper class for identifying unsigned min predicates.
 struct umin_pred_ty {
   static bool match(ICmpInst::Predicate Pred) {
     return Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_ULE;
   }
 };
 
-/// ofmax_pred_ty - Helper class for identifying ordered max predicates.
+/// \brief Helper class for identifying ordered max predicates.
 struct ofmax_pred_ty {
   static bool match(FCmpInst::Predicate Pred) {
     return Pred == CmpInst::FCMP_OGT || Pred == CmpInst::FCMP_OGE;
   }
 };
 
-/// ofmin_pred_ty - Helper class for identifying ordered min predicates.
+/// \brief Helper class for identifying ordered min predicates.
 struct ofmin_pred_ty {
   static bool match(FCmpInst::Predicate Pred) {
     return Pred == CmpInst::FCMP_OLT || Pred == CmpInst::FCMP_OLE;
   }
 };
 
-/// ufmax_pred_ty - Helper class for identifying unordered max predicates.
+/// \brief Helper class for identifying unordered max predicates.
 struct ufmax_pred_ty {
   static bool match(FCmpInst::Predicate Pred) {
     return Pred == CmpInst::FCMP_UGT || Pred == CmpInst::FCMP_UGE;
   }
 };
 
-/// ufmin_pred_ty - Helper class for identifying unordered min predicates.
+/// \brief Helper class for identifying unordered min predicates.
 struct ufmin_pred_ty {
   static bool match(FCmpInst::Predicate Pred) {
     return Pred == CmpInst::FCMP_ULT || Pred == CmpInst::FCMP_ULE;
   }
 };
 
-template<typename LHS, typename RHS>
-inline MaxMin_match<ICmpInst, LHS, RHS, smax_pred_ty>
-m_SMax(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline MaxMin_match<ICmpInst, LHS, RHS, smax_pred_ty> m_SMax(const LHS &L,
+                                                             const RHS &R) {
   return MaxMin_match<ICmpInst, LHS, RHS, smax_pred_ty>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline MaxMin_match<ICmpInst, LHS, RHS, smin_pred_ty>
-m_SMin(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline MaxMin_match<ICmpInst, LHS, RHS, smin_pred_ty> m_SMin(const LHS &L,
+                                                             const RHS &R) {
   return MaxMin_match<ICmpInst, LHS, RHS, smin_pred_ty>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline MaxMin_match<ICmpInst, LHS, RHS, umax_pred_ty>
-m_UMax(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline MaxMin_match<ICmpInst, LHS, RHS, umax_pred_ty> m_UMax(const LHS &L,
+                                                             const RHS &R) {
   return MaxMin_match<ICmpInst, LHS, RHS, umax_pred_ty>(L, R);
 }
 
-template<typename LHS, typename RHS>
-inline MaxMin_match<ICmpInst, LHS, RHS, umin_pred_ty>
-m_UMin(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline MaxMin_match<ICmpInst, LHS, RHS, umin_pred_ty> m_UMin(const LHS &L,
+                                                             const RHS &R) {
   return MaxMin_match<ICmpInst, LHS, RHS, umin_pred_ty>(L, R);
 }
 
@@ -1081,9 +1067,9 @@ m_UMin(const LHS &L, const RHS &R) {
 ///
 ///                         max(L, R)  iff L and R are not NaN
 ///  m_OrdFMax(L, R) =      R          iff L or R are NaN
-template<typename LHS, typename RHS>
-inline MaxMin_match<FCmpInst, LHS, RHS, ofmax_pred_ty>
-m_OrdFMax(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline MaxMin_match<FCmpInst, LHS, RHS, ofmax_pred_ty> m_OrdFMax(const LHS &L,
+                                                                 const RHS &R) {
   return MaxMin_match<FCmpInst, LHS, RHS, ofmax_pred_ty>(L, R);
 }
 
@@ -1096,9 +1082,9 @@ m_OrdFMax(const LHS &L, const RHS &R) {
 ///
 ///                         max(L, R)  iff L and R are not NaN
 ///  m_OrdFMin(L, R) =      R          iff L or R are NaN
-template<typename LHS, typename RHS>
-inline MaxMin_match<FCmpInst, LHS, RHS, ofmin_pred_ty>
-m_OrdFMin(const LHS &L, const RHS &R) {
+template <typename LHS, typename RHS>
+inline MaxMin_match<FCmpInst, LHS, RHS, ofmin_pred_ty> m_OrdFMin(const LHS &L,
+                                                                 const RHS &R) {
   return MaxMin_match<FCmpInst, LHS, RHS, ofmin_pred_ty>(L, R);
 }
 
@@ -1111,7 +1097,7 @@ m_OrdFMin(const LHS &L, const RHS &R) {
 ///
 ///                         max(L, R)  iff L and R are not NaN
 ///  m_UnordFMin(L, R) =    L          iff L or R are NaN
-template<typename LHS, typename RHS>
+template <typename LHS, typename RHS>
 inline MaxMin_match<FCmpInst, LHS, RHS, ufmax_pred_ty>
 m_UnordFMax(const LHS &L, const RHS &R) {
   return MaxMin_match<FCmpInst, LHS, RHS, ufmax_pred_ty>(L, R);
@@ -1126,40 +1112,37 @@ m_UnordFMax(const LHS &L, const RHS &R) {
 ///
 ///                          max(L, R)  iff L and R are not NaN
 ///  m_UnordFMin(L, R) =     L          iff L or R are NaN
-template<typename LHS, typename RHS>
+template <typename LHS, typename RHS>
 inline MaxMin_match<FCmpInst, LHS, RHS, ufmin_pred_ty>
 m_UnordFMin(const LHS &L, const RHS &R) {
   return MaxMin_match<FCmpInst, LHS, RHS, ufmin_pred_ty>(L, R);
 }
 
-template<typename Opnd_t>
-struct Argument_match {
+template <typename Opnd_t> struct Argument_match {
   unsigned OpI;
   Opnd_t Val;
-  Argument_match(unsigned OpIdx, const Opnd_t &V) : OpI(OpIdx), Val(V) { }
+  Argument_match(unsigned OpIdx, const Opnd_t &V) : OpI(OpIdx), Val(V) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
+  template <typename OpTy> bool match(OpTy *V) {
     CallSite CS(V);
     return CS.isCall() && Val.match(CS.getArgument(OpI));
   }
 };
 
-/// Match an argument
-template<unsigned OpI, typename Opnd_t>
+/// \brief Match an argument.
+template <unsigned OpI, typename Opnd_t>
 inline Argument_match<Opnd_t> m_Argument(const Opnd_t &Op) {
   return Argument_match<Opnd_t>(OpI, Op);
 }
 
-/// Intrinsic matchers.
+/// \brief Intrinsic matchers.
 struct IntrinsicID_match {
   unsigned ID;
-  IntrinsicID_match(Intrinsic::ID IntrID) : ID(IntrID) { }
+  IntrinsicID_match(Intrinsic::ID IntrID) : ID(IntrID) {}
 
-  template<typename OpTy>
-  bool match(OpTy *V) {
-    if (const CallInst *CI = dyn_cast<CallInst>(V))
-      if (const Function *F = CI->getCalledFunction())
+  template <typename OpTy> bool match(OpTy *V) {
+    if (const auto *CI = dyn_cast<CallInst>(V))
+      if (const auto *F = CI->getCalledFunction())
         return F->getIntrinsicID() == ID;
     return false;
   }
@@ -1172,73 +1155,71 @@ struct IntrinsicID_match {
 template <typename T0 = void, typename T1 = void, typename T2 = void,
           typename T3 = void, typename T4 = void, typename T5 = void,
           typename T6 = void, typename T7 = void, typename T8 = void,
-          typename T9 = void, typename T10 = void> struct m_Intrinsic_Ty;
-template <typename T0>
-struct m_Intrinsic_Ty<T0> {
-  typedef match_combine_and<IntrinsicID_match, Argument_match<T0> > Ty;
+          typename T9 = void, typename T10 = void>
+struct m_Intrinsic_Ty;
+template <typename T0> struct m_Intrinsic_Ty<T0> {
+  typedef match_combine_and<IntrinsicID_match, Argument_match<T0>> Ty;
 };
-template <typename T0, typename T1>
-struct m_Intrinsic_Ty<T0, T1> {
-  typedef match_combine_and<typename m_Intrinsic_Ty<T0>::Ty,
-                            Argument_match<T1> > Ty;
+template <typename T0, typename T1> struct m_Intrinsic_Ty<T0, T1> {
+  typedef match_combine_and<typename m_Intrinsic_Ty<T0>::Ty, Argument_match<T1>>
+      Ty;
 };
 template <typename T0, typename T1, typename T2>
 struct m_Intrinsic_Ty<T0, T1, T2> {
   typedef match_combine_and<typename m_Intrinsic_Ty<T0, T1>::Ty,
-                            Argument_match<T2> > Ty;
+                            Argument_match<T2>> Ty;
 };
 template <typename T0, typename T1, typename T2, typename T3>
 struct m_Intrinsic_Ty<T0, T1, T2, T3> {
   typedef match_combine_and<typename m_Intrinsic_Ty<T0, T1, T2>::Ty,
-                            Argument_match<T3> > Ty;
+                            Argument_match<T3>> Ty;
 };
 
-/// Match intrinsic calls like this:
-///   m_Intrinsic<Intrinsic::fabs>(m_Value(X))
-template <Intrinsic::ID IntrID>
-inline IntrinsicID_match
-m_Intrinsic() { return IntrinsicID_match(IntrID); }
+/// \brief Match intrinsic calls like this:
+/// m_Intrinsic<Intrinsic::fabs>(m_Value(X))
+template <Intrinsic::ID IntrID> inline IntrinsicID_match m_Intrinsic() {
+  return IntrinsicID_match(IntrID);
+}
 
-template<Intrinsic::ID IntrID, typename T0>
-inline typename m_Intrinsic_Ty<T0>::Ty
-m_Intrinsic(const T0 &Op0) {
+template <Intrinsic::ID IntrID, typename T0>
+inline typename m_Intrinsic_Ty<T0>::Ty m_Intrinsic(const T0 &Op0) {
   return m_CombineAnd(m_Intrinsic<IntrID>(), m_Argument<0>(Op0));
 }
 
-template<Intrinsic::ID IntrID, typename T0, typename T1>
-inline typename m_Intrinsic_Ty<T0, T1>::Ty
-m_Intrinsic(const T0 &Op0, const T1 &Op1) {
+template <Intrinsic::ID IntrID, typename T0, typename T1>
+inline typename m_Intrinsic_Ty<T0, T1>::Ty m_Intrinsic(const T0 &Op0,
+                                                       const T1 &Op1) {
   return m_CombineAnd(m_Intrinsic<IntrID>(Op0), m_Argument<1>(Op1));
 }
 
-template<Intrinsic::ID IntrID, typename T0, typename T1, typename T2>
+template <Intrinsic::ID IntrID, typename T0, typename T1, typename T2>
 inline typename m_Intrinsic_Ty<T0, T1, T2>::Ty
 m_Intrinsic(const T0 &Op0, const T1 &Op1, const T2 &Op2) {
   return m_CombineAnd(m_Intrinsic<IntrID>(Op0, Op1), m_Argument<2>(Op2));
 }
 
-template<Intrinsic::ID IntrID, typename T0, typename T1, typename T2, typename T3>
+template <Intrinsic::ID IntrID, typename T0, typename T1, typename T2,
+          typename T3>
 inline typename m_Intrinsic_Ty<T0, T1, T2, T3>::Ty
 m_Intrinsic(const T0 &Op0, const T1 &Op1, const T2 &Op2, const T3 &Op3) {
   return m_CombineAnd(m_Intrinsic<IntrID>(Op0, Op1, Op2), m_Argument<3>(Op3));
 }
 
-// Helper intrinsic matching specializations
-template<typename Opnd0>
-inline typename m_Intrinsic_Ty<Opnd0>::Ty
-m_BSwap(const Opnd0 &Op0) {
+// Helper intrinsic matching specializations.
+template <typename Opnd0>
+inline typename m_Intrinsic_Ty<Opnd0>::Ty m_BSwap(const Opnd0 &Op0) {
   return m_Intrinsic<Intrinsic::bswap>(Op0);
 }
 
-template<typename Opnd0, typename Opnd1>
-inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty
-m_FMin(const Opnd0 &Op0, const Opnd1 &Op1) {
+template <typename Opnd0, typename Opnd1>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty m_FMin(const Opnd0 &Op0,
+                                                        const Opnd1 &Op1) {
   return m_Intrinsic<Intrinsic::minnum>(Op0, Op1);
 }
 
-template<typename Opnd0, typename Opnd1>
-inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty
-m_FMax(const Opnd0 &Op0, const Opnd1 &Op1) {
+template <typename Opnd0, typename Opnd1>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty m_FMax(const Opnd0 &Op0,
+                                                        const Opnd1 &Op1) {
   return m_Intrinsic<Intrinsic::maxnum>(Op0, Op1);
 }
 
diff --git a/include/llvm/IR/Statepoint.h b/include/llvm/IR/Statepoint.h
new file mode 100644
index 0000000..38720ed
--- /dev/null
+++ b/include/llvm/IR/Statepoint.h
@@ -0,0 +1,298 @@
+//===-- llvm/IR/Statepoint.h - gc.statepoint utilities ------ --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains utility functions and a wrapper class analogous to
+// CallSite for accessing the fields of gc.statepoint, gc.relocate, and
+// gc.result intrinsics
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __LLVM_IR_STATEPOINT_H
+#define __LLVM_IR_STATEPOINT_H
+
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class GCRelocateOperands;
+class ImmutableStatepoint;
+
+bool isStatepoint(const ImmutableCallSite &CS);
+bool isStatepoint(const Value *inst);
+bool isStatepoint(const Value &inst);
+
+bool isGCRelocate(const Value *inst);
+bool isGCRelocate(const ImmutableCallSite &CS);
+
+bool isGCResult(const Value *inst);
+bool isGCResult(const ImmutableCallSite &CS);
+
+/// Analogous to CallSiteBase, this provides most of the actual
+/// functionality for Statepoint and ImmutableStatepoint.  It is
+/// templatized to allow easily specializing of const and non-const
+/// concrete subtypes.  This is structured analogous to CallSite
+/// rather than the IntrinsicInst.h helpers since we want to support
+/// invokable statepoints in the near future.
+/// TODO: This does not currently allow the if(Statepoint S = ...)
+///   idiom used with CallSites.  Consider refactoring to support.
+template <typename InstructionTy, typename ValueTy, typename CallSiteTy>
+class StatepointBase {
+  CallSiteTy StatepointCS;
+  void *operator new(size_t, unsigned) = delete;
+  void *operator new(size_t s) = delete;
+
+ protected:
+  explicit StatepointBase(InstructionTy *I) : StatepointCS(I) {
+    assert(isStatepoint(I));
+  }
+  explicit StatepointBase(CallSiteTy CS) : StatepointCS(CS) {
+    assert(isStatepoint(CS));
+  }
+
+ public:
+  typedef typename CallSiteTy::arg_iterator arg_iterator;
+
+  /// Return the underlying CallSite.
+  CallSiteTy getCallSite() {
+    return StatepointCS;
+  }
+
+  /// Return the value actually being called or invoked.
+  ValueTy *actualCallee() {
+    return StatepointCS.getArgument(0);
+  }
+  /// Number of arguments to be passed to the actual callee.
+  int numCallArgs() {
+    return cast<ConstantInt>(StatepointCS.getArgument(1))->getZExtValue();
+  }
+  /// Number of additional arguments excluding those intended
+  /// for garbage collection.
+  int numTotalVMSArgs() {
+    return cast<ConstantInt>(StatepointCS.getArgument(3 + numCallArgs()))->getZExtValue();
+  }
+
+  typename CallSiteTy::arg_iterator call_args_begin() {
+    // 3 = callTarget, #callArgs, flag
+    int Offset = 3;
+    assert(Offset <= (int)StatepointCS.arg_size());
+    return StatepointCS.arg_begin() + Offset;
+  }
+  typename CallSiteTy::arg_iterator call_args_end() {
+    int Offset = 3 + numCallArgs();
+    assert(Offset <= (int)StatepointCS.arg_size());
+    return StatepointCS.arg_begin() + Offset;
+  }
+
+  /// range adapter for call arguments
+  iterator_range<arg_iterator> call_args() {
+    return iterator_range<arg_iterator>(call_args_begin(), call_args_end());
+  }
+
+  typename CallSiteTy::arg_iterator vm_state_begin() {
+    return call_args_end();
+  }
+  typename CallSiteTy::arg_iterator vm_state_end() {
+    int Offset = 3 + numCallArgs() + 1 + numTotalVMSArgs();
+    assert(Offset <= (int)StatepointCS.arg_size());
+    return StatepointCS.arg_begin() + Offset;
+  }
+
+  /// range adapter for vm state arguments
+  iterator_range<arg_iterator> vm_state_args() {
+    return iterator_range<arg_iterator>(vm_state_begin(), vm_state_end());
+  }
+
+  typename CallSiteTy::arg_iterator first_vm_state_stack_begin() {
+    // 6 = numTotalVMSArgs, 1st_objectID, 1st_bci,
+    //     1st_#stack, 1st_#local, 1st_#monitor
+    return vm_state_begin() + 6;
+  }
+
+  typename CallSiteTy::arg_iterator gc_args_begin() {
+    return vm_state_end();
+  }
+  typename CallSiteTy::arg_iterator gc_args_end() {
+    return StatepointCS.arg_end();
+  }
+
+  /// range adapter for gc arguments
+  iterator_range<arg_iterator> gc_args() {
+    return iterator_range<arg_iterator>(gc_args_begin(), gc_args_end());
+  }
+
+  /// Get list of all gc reloactes linked to this statepoint
+  /// May contain several relocations for the same base/derived pair.
+  /// For example this could happen due to relocations on unwinding
+  /// path of invoke.
+  std::vector<GCRelocateOperands> getRelocates(ImmutableStatepoint &IS);
+
+#ifndef NDEBUG
+  /// Asserts if this statepoint is malformed.  Common cases for failure
+  /// include incorrect length prefixes for variable length sections or
+  /// illegal values for parameters.
+  void verify() {
+    assert(numCallArgs() >= 0 &&
+           "number of arguments to actually callee can't be negative");
+
+    // The internal asserts in the iterator accessors do the rest.
+    (void)call_args_begin();
+    (void)call_args_end();
+    (void)vm_state_begin();
+    (void)vm_state_end();
+    (void)gc_args_begin();
+    (void)gc_args_end();
+  }
+#endif
+};
+
+/// A specialization of it's base class for read only access
+/// to a gc.statepoint.
+class ImmutableStatepoint
+    : public StatepointBase<const Instruction, const Value,
+                            ImmutableCallSite> {
+  typedef StatepointBase<const Instruction, const Value, ImmutableCallSite>
+      Base;
+
+public:
+  explicit ImmutableStatepoint(const Instruction *I) : Base(I) {}
+  explicit ImmutableStatepoint(ImmutableCallSite CS) : Base(CS) {}
+};
+
+/// A specialization of it's base class for read-write access
+/// to a gc.statepoint.
+class Statepoint : public StatepointBase<Instruction, Value, CallSite> {
+  typedef StatepointBase<Instruction, Value, CallSite> Base;
+
+public:
+  explicit Statepoint(Instruction *I) : Base(I) {}
+  explicit Statepoint(CallSite CS) : Base(CS) {}
+};
+
+/// Wraps a call to a gc.relocate and provides access to it's operands.
+/// TODO: This should likely be refactored to resememble the wrappers in
+/// InstrinsicInst.h.
+class GCRelocateOperands {
+  ImmutableCallSite RelocateCS;
+
+ public:
+  GCRelocateOperands(const User* U) : RelocateCS(U) {
+    assert(isGCRelocate(U));
+  }
+  GCRelocateOperands(const Instruction *inst) : RelocateCS(inst) {
+    assert(isGCRelocate(inst));
+  }
+  GCRelocateOperands(CallSite CS) : RelocateCS(CS) {
+    assert(isGCRelocate(CS));
+  }
+
+  /// Return true if this relocate is tied to the invoke statepoint.
+  /// This includes relocates which are on the unwinding path.
+  bool isTiedToInvoke() const {
+    const Value *Token = RelocateCS.getArgument(0);
+
+    return isa<ExtractValueInst>(Token) ||
+      isa<InvokeInst>(Token);
+  }
+
+  /// Get enclosed relocate intrinsic
+  ImmutableCallSite getUnderlyingCallSite() {
+    return RelocateCS;
+  }
+
+  /// The statepoint with which this gc.relocate is associated.
+  const Instruction *statepoint() {
+    const Value *token = RelocateCS.getArgument(0);
+
+    // This takes care both of relocates for call statepoints and relocates
+    // on normal path of invoke statepoint.
+    if (!isa<ExtractValueInst>(token)) {
+      return cast<Instruction>(token);
+    }
+
+    // This relocate is on exceptional path of an invoke statepoint
+    const BasicBlock *invokeBB =
+      cast<Instruction>(token)->getParent()->getUniquePredecessor();
+
+    assert(invokeBB && "safepoints should have unique landingpads");
+    assert(invokeBB->getTerminator() && "safepoint block should be well formed");
+    assert(isStatepoint(invokeBB->getTerminator()));
+
+    return invokeBB->getTerminator();
+  }
+  /// The index into the associate statepoint's argument list
+  /// which contains the base pointer of the pointer whose
+  /// relocation this gc.relocate describes.
+  unsigned basePtrIndex() {
+    return cast<ConstantInt>(RelocateCS.getArgument(1))->getZExtValue();
+  }
+  /// The index into the associate statepoint's argument list which
+  /// contains the pointer whose relocation this gc.relocate describes.
+  unsigned derivedPtrIndex() {
+    return cast<ConstantInt>(RelocateCS.getArgument(2))->getZExtValue();
+  }
+  Value *basePtr() {
+    ImmutableCallSite CS(statepoint());
+    return *(CS.arg_begin() + basePtrIndex());
+  }
+  Value *derivedPtr() {
+    ImmutableCallSite CS(statepoint());
+    return *(CS.arg_begin() + derivedPtrIndex());
+  }
+};
+
+template <typename InstructionTy, typename ValueTy, typename CallSiteTy>
+std::vector<GCRelocateOperands>
+  StatepointBase<InstructionTy, ValueTy, CallSiteTy>::
+    getRelocates(ImmutableStatepoint &IS) {
+
+  std::vector<GCRelocateOperands> res;
+
+  ImmutableCallSite StatepointCS = IS.getCallSite();
+
+  // Search for relocated pointers.  Note that working backwards from the
+  // gc_relocates ensures that we only get pairs which are actually relocated
+  // and used after the statepoint.
+  for (const User *U : StatepointCS.getInstruction()->users()) {
+    if (isGCRelocate(U)) {
+      res.push_back(GCRelocateOperands(U));
+    }
+  }
+
+  if (!StatepointCS.isInvoke()) {
+    return res;
+  }
+
+  // We need to scan thorough exceptional relocations if it is invoke statepoint
+  LandingPadInst *LandingPad =
+    cast<InvokeInst>(StatepointCS.getInstruction())->getLandingPadInst();
+
+  // Search for extract value from landingpad instruction to which
+  // gc relocates will be attached
+  for (const User *LandingPadUser : LandingPad->users()) {
+    if (!isa<ExtractValueInst>(LandingPadUser)) {
+      continue;
+    }
+
+    // gc relocates should be attached to this extract value
+    for (const User *U : LandingPadUser->users()) {
+      if (isGCRelocate(U)) {
+        res.push_back(GCRelocateOperands(U));
+      }
+    }
+  }
+  return res;
+}
+
+}
+#endif
diff --git a/include/llvm/IR/TrackingMDRef.h b/include/llvm/IR/TrackingMDRef.h
new file mode 100644
index 0000000..e241121
--- /dev/null
+++ b/include/llvm/IR/TrackingMDRef.h
@@ -0,0 +1,170 @@
+//===- llvm/IR/TrackingMDRef.h - Tracking Metadata references ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// References to metadata that track RAUW.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_TRACKINGMDREF_H
+#define LLVM_IR_TRACKINGMDREF_H
+
+#include "llvm/IR/MetadataTracking.h"
+#include "llvm/Support/Casting.h"
+
+namespace llvm {
+
+class Metadata;
+class MDNode;
+class ValueAsMetadata;
+
+/// \brief Tracking metadata reference.
+///
+/// This class behaves like \a TrackingVH, but for metadata.
+class TrackingMDRef {
+  Metadata *MD;
+
+public:
+  TrackingMDRef() : MD(nullptr) {}
+  explicit TrackingMDRef(Metadata *MD) : MD(MD) { track(); }
+
+  TrackingMDRef(TrackingMDRef &&X) : MD(X.MD) { retrack(X); }
+  TrackingMDRef(const TrackingMDRef &X) : MD(X.MD) { track(); }
+  TrackingMDRef &operator=(TrackingMDRef &&X) {
+    if (&X == this)
+      return *this;
+
+    untrack();
+    MD = X.MD;
+    retrack(X);
+    return *this;
+  }
+  TrackingMDRef &operator=(const TrackingMDRef &X) {
+    if (&X == this)
+      return *this;
+
+    untrack();
+    MD = X.MD;
+    track();
+    return *this;
+  }
+  ~TrackingMDRef() { untrack(); }
+
+  Metadata *get() const { return MD; }
+  operator Metadata *() const { return get(); }
+  Metadata *operator->() const { return get(); }
+  Metadata &operator*() const { return *get(); }
+
+  void reset() {
+    untrack();
+    MD = nullptr;
+  }
+  void reset(Metadata *MD) {
+    untrack();
+    this->MD = MD;
+    track();
+  }
+
+  /// \brief Check whether this has a trivial destructor.
+  ///
+  /// If \c MD isn't replaceable, the destructor will be a no-op.
+  bool hasTrivialDestructor() const {
+    return !MD || !MetadataTracking::isReplaceable(*MD);
+  }
+
+  bool operator==(const TrackingMDRef &X) const { return MD == X.MD; }
+  bool operator!=(const TrackingMDRef &X) const { return MD != X.MD; }
+
+private:
+  void track() {
+    if (MD)
+      MetadataTracking::track(MD);
+  }
+  void untrack() {
+    if (MD)
+      MetadataTracking::untrack(MD);
+  }
+  void retrack(TrackingMDRef &X) {
+    assert(MD == X.MD && "Expected values to match");
+    if (X.MD) {
+      MetadataTracking::retrack(X.MD, MD);
+      X.MD = nullptr;
+    }
+  }
+};
+
+/// \brief Typed tracking ref.
+///
+/// Track refererences of a particular type.  It's useful to use this for \a
+/// MDNode and \a ValueAsMetadata.
+template <class T> class TypedTrackingMDRef {
+  TrackingMDRef Ref;
+
+public:
+  TypedTrackingMDRef() {}
+  explicit TypedTrackingMDRef(T *MD) : Ref(static_cast<Metadata *>(MD)) {}
+
+  TypedTrackingMDRef(TypedTrackingMDRef &&X) : Ref(std::move(X.Ref)) {}
+  TypedTrackingMDRef(const TypedTrackingMDRef &X) : Ref(X.Ref) {}
+  TypedTrackingMDRef &operator=(TypedTrackingMDRef &&X) {
+    Ref = std::move(X.Ref);
+    return *this;
+  }
+  TypedTrackingMDRef &operator=(const TypedTrackingMDRef &X) {
+    Ref = X.Ref;
+    return *this;
+  }
+
+  T *get() const { return (T *)Ref.get(); }
+  operator T *() const { return get(); }
+  T *operator->() const { return get(); }
+  T &operator*() const { return *get(); }
+
+  bool operator==(const TypedTrackingMDRef &X) const { return Ref == X.Ref; }
+  bool operator!=(const TypedTrackingMDRef &X) const { return Ref != X.Ref; }
+
+  void reset() { Ref.reset(); }
+  void reset(T *MD) { Ref.reset(static_cast<Metadata *>(MD)); }
+
+  /// \brief Check whether this has a trivial destructor.
+  bool hasTrivialDestructor() const { return Ref.hasTrivialDestructor(); }
+};
+
+typedef TypedTrackingMDRef<MDNode> TrackingMDNodeRef;
+typedef TypedTrackingMDRef<ValueAsMetadata> TrackingValueAsMetadataRef;
+
+// Expose the underlying metadata to casting.
+template <> struct simplify_type<TrackingMDRef> {
+  typedef Metadata *SimpleType;
+  static SimpleType getSimplifiedValue(TrackingMDRef &MD) { return MD.get(); }
+};
+
+template <> struct simplify_type<const TrackingMDRef> {
+  typedef Metadata *SimpleType;
+  static SimpleType getSimplifiedValue(const TrackingMDRef &MD) {
+    return MD.get();
+  }
+};
+
+template <class T> struct simplify_type<TypedTrackingMDRef<T>> {
+  typedef T *SimpleType;
+  static SimpleType getSimplifiedValue(TypedTrackingMDRef<T> &MD) {
+    return MD.get();
+  }
+};
+
+template <class T> struct simplify_type<const TypedTrackingMDRef<T>> {
+  typedef T *SimpleType;
+  static SimpleType getSimplifiedValue(const TypedTrackingMDRef<T> &MD) {
+    return MD.get();
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/IR/Type.h b/include/llvm/IR/Type.h
index a36fb0f..c2073c7 100644
--- a/include/llvm/IR/Type.h
+++ b/include/llvm/IR/Type.h
@@ -313,6 +313,9 @@ public:
   typedef Type * const *subtype_iterator;
   subtype_iterator subtype_begin() const { return ContainedTys; }
   subtype_iterator subtype_end() const { return &ContainedTys[NumContainedTys];}
+  ArrayRef<Type*> subtypes() const {
+    return makeArrayRef(subtype_begin(), subtype_end());
+  }
 
   typedef std::reverse_iterator<subtype_iterator> subtype_reverse_iterator;
   subtype_reverse_iterator subtype_rbegin() const {
diff --git a/include/llvm/IR/TypeFinder.h b/include/llvm/IR/TypeFinder.h
index cea66a4..73a63ad 100644
--- a/include/llvm/IR/TypeFinder.h
+++ b/include/llvm/IR/TypeFinder.h
@@ -31,6 +31,7 @@ class TypeFinder {
   // To avoid walking constant expressions multiple times and other IR
   // objects, we keep several helper maps.
   DenseSet<const Value*> VisitedConstants;
+  DenseSet<const MDNode *> VisitedMetadata;
   DenseSet<Type*> VisitedTypes;
 
   std::vector<StructType*> StructTypes;
diff --git a/include/llvm/IR/Use.h b/include/llvm/IR/Use.h
index 033cd3e..160d71b 100644
--- a/include/llvm/IR/Use.h
+++ b/include/llvm/IR/Use.h
@@ -77,7 +77,7 @@ public:
   typedef PointerIntPair<User *, 1, unsigned> UserRef;
 
 private:
-  Use(const Use &U) LLVM_DELETED_FUNCTION;
+  Use(const Use &U) = delete;
 
   /// Destructor - Only for zap()
   ~Use() {
diff --git a/include/llvm/IR/UseListOrder.h b/include/llvm/IR/UseListOrder.h
index 5df459b..7d8205d 100644
--- a/include/llvm/IR/UseListOrder.h
+++ b/include/llvm/IR/UseListOrder.h
@@ -45,8 +45,8 @@ struct UseListOrder {
   }
 
 private:
-  UseListOrder(const UseListOrder &X) LLVM_DELETED_FUNCTION;
-  UseListOrder &operator=(const UseListOrder &X) LLVM_DELETED_FUNCTION;
+  UseListOrder(const UseListOrder &X) = delete;
+  UseListOrder &operator=(const UseListOrder &X) = delete;
 };
 
 typedef std::vector<UseListOrder> UseListOrderStack;
diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h
index f578227..d39378d 100644
--- a/include/llvm/IR/User.h
+++ b/include/llvm/IR/User.h
@@ -33,8 +33,8 @@ template <class>
 struct OperandTraits;
 
 class User : public Value {
-  User(const User &) LLVM_DELETED_FUNCTION;
-  void *operator new(size_t) LLVM_DELETED_FUNCTION;
+  User(const User &) = delete;
+  void *operator new(size_t) = delete;
   template <unsigned>
   friend struct HungoffOperandTraits;
   virtual void anchor();
diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h
index 67665be..b7213a6 100644
--- a/include/llvm/IR/Value.h
+++ b/include/llvm/IR/Value.h
@@ -37,7 +37,6 @@ class GlobalVariable;
 class InlineAsm;
 class Instruction;
 class LLVMContext;
-class MDNode;
 class Module;
 class StringRef;
 class Twine;
@@ -70,9 +69,9 @@ class Value {
   Type *VTy;
   Use *UseList;
 
-  friend class ValueSymbolTable; // Allow ValueSymbolTable to directly mod Name.
+  friend class ValueAsMetadata; // Allow access to NameAndIsUsedByMD.
   friend class ValueHandleBase;
-  ValueName *Name;
+  PointerIntPair<ValueName *, 1> NameAndIsUsedByMD;
 
   const unsigned char SubclassID;   // Subclass identifier (for isa/dyn_cast)
   unsigned char HasValueHandle : 1; // Has a ValueHandle pointing to this?
@@ -107,17 +106,12 @@ protected:
 private:
   template <typename UseT> // UseT == 'Use' or 'const Use'
   class use_iterator_impl
-      : public std::iterator<std::forward_iterator_tag, UseT *, ptrdiff_t> {
-    typedef std::iterator<std::forward_iterator_tag, UseT *, ptrdiff_t> super;
-
+      : public std::iterator<std::forward_iterator_tag, UseT *> {
     UseT *U;
     explicit use_iterator_impl(UseT *u) : U(u) {}
     friend class Value;
 
   public:
-    typedef typename super::reference reference;
-    typedef typename super::pointer pointer;
-
     use_iterator_impl() : U() {}
 
     bool operator==(const use_iterator_impl &x) const { return U == x.U; }
@@ -148,17 +142,12 @@ private:
 
   template <typename UserTy> // UserTy == 'User' or 'const User'
   class user_iterator_impl
-      : public std::iterator<std::forward_iterator_tag, UserTy *, ptrdiff_t> {
-    typedef std::iterator<std::forward_iterator_tag, UserTy *, ptrdiff_t> super;
-
+      : public std::iterator<std::forward_iterator_tag, UserTy *> {
     use_iterator_impl<Use> UI;
     explicit user_iterator_impl(Use *U) : UI(U) {}
     friend class Value;
 
   public:
-    typedef typename super::reference reference;
-    typedef typename super::pointer pointer;
-
     user_iterator_impl() {}
 
     bool operator==(const user_iterator_impl &x) const { return UI == x.UI; }
@@ -189,15 +178,10 @@ private:
     }
 
     Use &getUse() const { return *UI; }
-
-    /// \brief Return the operand # of this use in its User.
-    ///
-    /// FIXME: Replace all callers with a direct call to Use::getOperandNo.
-    unsigned getOperandNo() const { return UI->getOperandNo(); }
   };
 
-  void operator=(const Value &) LLVM_DELETED_FUNCTION;
-  Value(const Value &) LLVM_DELETED_FUNCTION;
+  void operator=(const Value &) = delete;
+  Value(const Value &) = delete;
 
 protected:
   Value(Type *Ty, unsigned scid);
@@ -226,10 +210,14 @@ public:
   LLVMContext &getContext() const;
 
   // \brief All values can potentially be named.
-  bool hasName() const { return Name != nullptr; }
-  ValueName *getValueName() const { return Name; }
-  void setValueName(ValueName *VN) { Name = VN; }
+  bool hasName() const { return getValueName() != nullptr; }
+  ValueName *getValueName() const { return NameAndIsUsedByMD.getPointer(); }
+  void setValueName(ValueName *VN) { NameAndIsUsedByMD.setPointer(VN); }
+
+private:
+  void destroyValueName();
 
+public:
   /// \brief Return a constant reference to the value's name.
   ///
   /// This is cheap and guaranteed to return the same reference as long as the
@@ -258,6 +246,13 @@ public:
   /// guaranteed to be empty.
   void replaceAllUsesWith(Value *V);
 
+  /// replaceUsesOutsideBlock - Go through the uses list for this definition and
+  /// make each use point to "V" instead of "this" when the use is outside the
+  /// block. 'This's use list is expected to have at least one element.
+  /// Unlike replaceAllUsesWith this function does not support basic block
+  /// values or constant users.
+  void replaceUsesOutsideBlock(Value *V, BasicBlock *BB);
+
   //----------------------------------------------------------------------
   // Methods for handling the chain of uses of this Value.
   //
@@ -276,6 +271,8 @@ public:
     return iterator_range<const_use_iterator>(use_begin(), use_end());
   }
 
+  bool               user_empty() const { return UseList == nullptr; }
+
   typedef user_iterator_impl<User>       user_iterator;
   typedef user_iterator_impl<const User> const_user_iterator;
   user_iterator       user_begin()       { return user_iterator(UseList); }
@@ -345,9 +342,7 @@ public:
     ConstantStructVal,        // This is an instance of ConstantStruct
     ConstantVectorVal,        // This is an instance of ConstantVector
     ConstantPointerNullVal,   // This is an instance of ConstantPointerNull
-    GenericMDNodeVal,         // This is an instance of GenericMDNode
-    MDNodeFwdDeclVal,         // This is an instance of MDNodeFwdDecl
-    MDStringVal,              // This is an instance of MDString
+    MetadataAsValueVal,       // This is an instance of MetadataAsValue
     InlineAsmVal,             // This is an instance of InlineAsm
     InstructionVal,           // This is an instance of Instruction
     // Enum values starting at InstructionVal are used for Instructions;
@@ -397,6 +392,9 @@ public:
   /// \brief Return true if there is a value handle associated with this value.
   bool hasValueHandle() const { return HasValueHandle; }
 
+  /// \brief Return true if there is metadata referencing this value.
+  bool isUsedByMetadata() const { return NameAndIsUsedByMD.getInt(); }
+
   /// \brief Strip off pointer casts, all-zero GEPs, and aliases.
   ///
   /// Returns the original uncasted value.  If this is called on a non-pointer
@@ -471,7 +469,8 @@ public:
   ///
   /// This is the greatest alignment value supported by load, store, and alloca
   /// instructions, and global values.
-  static const unsigned MaximumAlignment = 1u << 29;
+  static const unsigned MaxAlignmentExponent = 29;
+  static const unsigned MaximumAlignment = 1u << MaxAlignmentExponent;
 
   /// \brief Mutate the type of this Value to be of the specified type.
   ///
@@ -680,13 +679,6 @@ template <> struct isa_impl<GlobalObject, Value> {
   }
 };
 
-template <> struct isa_impl<MDNode, Value> {
-  static inline bool doit(const Value &Val) {
-    return Val.getValueID() == Value::GenericMDNodeVal ||
-           Val.getValueID() == Value::MDNodeFwdDeclVal;
-  }
-};
-
 // Value* is only 4-byte aligned.
 template<>
 class PointerLikeTypeTraits<Value*> {
diff --git a/include/llvm/IR/ValueHandle.h b/include/llvm/IR/ValueHandle.h
index 460210e..355748e 100644
--- a/include/llvm/IR/ValueHandle.h
+++ b/include/llvm/IR/ValueHandle.h
@@ -56,55 +56,48 @@ private:
   PointerIntPair<ValueHandleBase**, 2, HandleBaseKind> PrevPair;
   ValueHandleBase *Next;
 
-  // A subclass may want to store some information along with the value
-  // pointer. Allow them to do this by making the value pointer a pointer-int
-  // pair. The 'setValPtrInt' and 'getValPtrInt' methods below give them this
-  // access.
-  PointerIntPair<Value*, 2> VP;
+  Value* V;
 
-  ValueHandleBase(const ValueHandleBase&) LLVM_DELETED_FUNCTION;
+  ValueHandleBase(const ValueHandleBase&) = delete;
 public:
   explicit ValueHandleBase(HandleBaseKind Kind)
-    : PrevPair(nullptr, Kind), Next(nullptr), VP(nullptr, 0) {}
+    : PrevPair(nullptr, Kind), Next(nullptr), V(nullptr) {}
   ValueHandleBase(HandleBaseKind Kind, Value *V)
-    : PrevPair(nullptr, Kind), Next(nullptr), VP(V, 0) {
-    if (isValid(VP.getPointer()))
+    : PrevPair(nullptr, Kind), Next(nullptr), V(V) {
+    if (isValid(V))
       AddToUseList();
   }
   ValueHandleBase(HandleBaseKind Kind, const ValueHandleBase &RHS)
-    : PrevPair(nullptr, Kind), Next(nullptr), VP(RHS.VP) {
-    if (isValid(VP.getPointer()))
+    : PrevPair(nullptr, Kind), Next(nullptr), V(RHS.V) {
+    if (isValid(V))
       AddToExistingUseList(RHS.getPrevPtr());
   }
   ~ValueHandleBase() {
-    if (isValid(VP.getPointer()))
+    if (isValid(V))
       RemoveFromUseList();
   }
 
   Value *operator=(Value *RHS) {
-    if (VP.getPointer() == RHS) return RHS;
-    if (isValid(VP.getPointer())) RemoveFromUseList();
-    VP.setPointer(RHS);
-    if (isValid(VP.getPointer())) AddToUseList();
+    if (V == RHS) return RHS;
+    if (isValid(V)) RemoveFromUseList();
+    V = RHS;
+    if (isValid(V)) AddToUseList();
     return RHS;
   }
 
   Value *operator=(const ValueHandleBase &RHS) {
-    if (VP.getPointer() == RHS.VP.getPointer()) return RHS.VP.getPointer();
-    if (isValid(VP.getPointer())) RemoveFromUseList();
-    VP.setPointer(RHS.VP.getPointer());
-    if (isValid(VP.getPointer())) AddToExistingUseList(RHS.getPrevPtr());
-    return VP.getPointer();
+    if (V == RHS.V) return RHS.V;
+    if (isValid(V)) RemoveFromUseList();
+    V = RHS.V;
+    if (isValid(V)) AddToExistingUseList(RHS.getPrevPtr());
+    return V;
   }
 
-  Value *operator->() const { return getValPtr(); }
-  Value &operator*() const { return *getValPtr(); }
+  Value *operator->() const { return V; }
+  Value &operator*() const { return *V; }
 
 protected:
-  Value *getValPtr() const { return VP.getPointer(); }
-
-  void setValPtrInt(unsigned K) { VP.setInt(K); }
-  unsigned getValPtrInt() const { return VP.getInt(); }
+  Value *getValPtr() const { return V; }
 
   static bool isValid(Value *V) {
     return V &&
@@ -123,7 +116,7 @@ private:
   HandleBaseKind getKind() const { return PrevPair.getInt(); }
   void setPrevPtr(ValueHandleBase **Ptr) { PrevPair.setPointer(Ptr); }
 
-  /// \brief Add this ValueHandle to the use list for VP.
+  /// \brief Add this ValueHandle to the use list for V.
   ///
   /// List is the address of either the head of the list or a Next node within
   /// the existing use list.
@@ -132,7 +125,7 @@ private:
   /// \brief Add this ValueHandle to the use list after Node.
   void AddToExistingUseListAfter(ValueHandleBase *Node);
 
-  /// \brief Add this ValueHandle to the use list for VP.
+  /// \brief Add this ValueHandle to the use list for V.
   void AddToUseList();
   /// \brief Remove this ValueHandle from its current use list.
   void RemoveFromUseList();
@@ -197,23 +190,20 @@ class AssertingVH
   friend struct DenseMapInfo<AssertingVH<ValueTy> >;
 
 #ifndef NDEBUG
-  ValueTy *getValPtr() const {
-    return static_cast<ValueTy*>(ValueHandleBase::getValPtr());
-  }
-  void setValPtr(ValueTy *P) {
-    ValueHandleBase::operator=(GetAsValue(P));
-  }
+  Value *getRawValPtr() const { return ValueHandleBase::getValPtr(); }
+  void setRawValPtr(Value *P) { ValueHandleBase::operator=(P); }
 #else
-  ValueTy *ThePtr;
-  ValueTy *getValPtr() const { return ThePtr; }
-  void setValPtr(ValueTy *P) { ThePtr = P; }
+  Value *ThePtr;
+  Value *getRawValPtr() const { return ThePtr; }
+  void setRawValPtr(Value *P) { ThePtr = P; }
 #endif
-
-  // Convert a ValueTy*, which may be const, to the type the base
-  // class expects.
+  // Convert a ValueTy*, which may be const, to the raw Value*.
   static Value *GetAsValue(Value *V) { return V; }
   static Value *GetAsValue(const Value *V) { return const_cast<Value*>(V); }
 
+  ValueTy *getValPtr() const { return static_cast<ValueTy *>(getRawValPtr()); }
+  void setValPtr(ValueTy *P) { setRawValPtr(GetAsValue(P)); }
+
 public:
 #ifndef NDEBUG
   AssertingVH() : ValueHandleBase(Assert) {}
@@ -221,7 +211,7 @@ public:
   AssertingVH(const AssertingVH &RHS) : ValueHandleBase(Assert, RHS) {}
 #else
   AssertingVH() : ThePtr(nullptr) {}
-  AssertingVH(ValueTy *P) : ThePtr(P) {}
+  AssertingVH(ValueTy *P) : ThePtr(GetAsValue(P)) {}
 #endif
 
   operator ValueTy*() const {
@@ -244,27 +234,23 @@ public:
 // Specialize DenseMapInfo to allow AssertingVH to participate in DenseMap.
 template<typename T>
 struct DenseMapInfo<AssertingVH<T> > {
-  typedef DenseMapInfo<T*> PointerInfo;
   static inline AssertingVH<T> getEmptyKey() {
-    return AssertingVH<T>(PointerInfo::getEmptyKey());
+    AssertingVH<T> Res;
+    Res.setRawValPtr(DenseMapInfo<Value *>::getEmptyKey());
+    return Res;
   }
-  static inline T* getTombstoneKey() {
-    return AssertingVH<T>(PointerInfo::getTombstoneKey());
+  static inline AssertingVH<T> getTombstoneKey() {
+    AssertingVH<T> Res;
+    Res.setRawValPtr(DenseMapInfo<Value *>::getTombstoneKey());
+    return Res;
   }
   static unsigned getHashValue(const AssertingVH<T> &Val) {
-    return PointerInfo::getHashValue(Val);
+    return DenseMapInfo<Value *>::getHashValue(Val.getRawValPtr());
   }
-#ifndef NDEBUG
   static bool isEqual(const AssertingVH<T> &LHS, const AssertingVH<T> &RHS) {
-    // Avoid downcasting AssertingVH<T> to T*, as empty/tombstone keys may not
-    // be properly aligned pointers to T*.
-    return LHS.ValueHandleBase::getValPtr() == RHS.ValueHandleBase::getValPtr();
+    return DenseMapInfo<Value *>::isEqual(LHS.getRawValPtr(),
+                                          RHS.getRawValPtr());
   }
-#else
-  static bool isEqual(const AssertingVH<T> &LHS, const AssertingVH<T> &RHS) {
-    return LHS == RHS;
-  }
-#endif
 };
 
 template <typename T>
diff --git a/include/llvm/IR/ValueMap.h b/include/llvm/IR/ValueMap.h
index aa8a29d..08d6d17 100644
--- a/include/llvm/IR/ValueMap.h
+++ b/include/llvm/IR/ValueMap.h
@@ -27,11 +27,13 @@
 #define LLVM_IR_VALUEMAP_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/TrackingMDRef.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/UniqueLock.h"
 #include "llvm/Support/type_traits.h"
 #include <iterator>
+#include <memory>
 
 namespace llvm {
 
@@ -79,11 +81,13 @@ class ValueMap {
   friend class ValueMapCallbackVH<KeyT, ValueT, Config>;
   typedef ValueMapCallbackVH<KeyT, ValueT, Config> ValueMapCVH;
   typedef DenseMap<ValueMapCVH, ValueT, DenseMapInfo<ValueMapCVH> > MapT;
+  typedef DenseMap<const Metadata *, TrackingMDRef> MDMapT;
   typedef typename Config::ExtraData ExtraData;
   MapT Map;
+  std::unique_ptr<MDMapT> MDMap;
   ExtraData Data;
-  ValueMap(const ValueMap&) LLVM_DELETED_FUNCTION;
-  ValueMap& operator=(const ValueMap&) LLVM_DELETED_FUNCTION;
+  ValueMap(const ValueMap&) = delete;
+  ValueMap& operator=(const ValueMap&) = delete;
 public:
   typedef KeyT key_type;
   typedef ValueT mapped_type;
@@ -91,12 +95,19 @@ public:
   typedef unsigned size_type;
 
   explicit ValueMap(unsigned NumInitBuckets = 64)
-    : Map(NumInitBuckets), Data() {}
+      : Map(NumInitBuckets), Data() {}
   explicit ValueMap(const ExtraData &Data, unsigned NumInitBuckets = 64)
-    : Map(NumInitBuckets), Data(Data) {}
+      : Map(NumInitBuckets), Data(Data) {}
 
   ~ValueMap() {}
 
+  bool hasMD() const { return MDMap; }
+  MDMapT &MD() {
+    if (!MDMap)
+      MDMap.reset(new MDMapT);
+    return *MDMap;
+  }
+
   typedef ValueMapIterator<MapT, KeyT> iterator;
   typedef ValueMapConstIterator<MapT, KeyT> const_iterator;
   inline iterator begin() { return iterator(Map.begin()); }
@@ -110,7 +121,10 @@ public:
   /// Grow the map so that it has at least Size buckets. Does not shrink
   void resize(size_t Size) { Map.resize(Size); }
 
-  void clear() { Map.clear(); }
+  void clear() {
+    Map.clear();
+    MDMap.reset();
+  }
 
   /// Return 1 if the specified key is in the map, 0 otherwise.
   size_type count(const KeyT &Val) const {
@@ -210,6 +224,9 @@ class ValueMapCallbackVH : public CallbackVH {
       : CallbackVH(const_cast<Value*>(static_cast<const Value*>(Key))),
         Map(Map) {}
 
+  // Private constructor used to create empty/tombstone DenseMap keys.
+  ValueMapCallbackVH(Value *V) : CallbackVH(V), Map(nullptr) {}
+
 public:
   KeyT Unwrap() const { return cast_or_null<KeySansPointerT>(getValPtr()); }
 
@@ -252,19 +269,18 @@ public:
 template<typename KeyT, typename ValueT, typename Config>
 struct DenseMapInfo<ValueMapCallbackVH<KeyT, ValueT, Config> > {
   typedef ValueMapCallbackVH<KeyT, ValueT, Config> VH;
-  typedef DenseMapInfo<KeyT> PointerInfo;
 
   static inline VH getEmptyKey() {
-    return VH(PointerInfo::getEmptyKey(), nullptr);
+    return VH(DenseMapInfo<Value *>::getEmptyKey());
   }
   static inline VH getTombstoneKey() {
-    return VH(PointerInfo::getTombstoneKey(), nullptr);
+    return VH(DenseMapInfo<Value *>::getTombstoneKey());
   }
   static unsigned getHashValue(const VH &Val) {
-    return PointerInfo::getHashValue(Val.Unwrap());
+    return DenseMapInfo<KeyT>::getHashValue(Val.Unwrap());
   }
   static unsigned getHashValue(const KeyT &Val) {
-    return PointerInfo::getHashValue(Val);
+    return DenseMapInfo<KeyT>::getHashValue(Val);
   }
   static bool isEqual(const VH &LHS, const VH &RHS) {
     return LHS == RHS;
diff --git a/include/llvm/IR/Verifier.h b/include/llvm/IR/Verifier.h
index 0272e20..43bd123 100644
--- a/include/llvm/IR/Verifier.h
+++ b/include/llvm/IR/Verifier.h
@@ -77,8 +77,8 @@ class VerifierPass {
 public:
   explicit VerifierPass(bool FatalErrors = true) : FatalErrors(FatalErrors) {}
 
-  PreservedAnalyses run(Module *M);
-  PreservedAnalyses run(Function *F);
+  PreservedAnalyses run(Module &M);
+  PreservedAnalyses run(Function &F);
 
   static StringRef name() { return "VerifierPass"; }
 };
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index a4bc598..bcf8cf2 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -40,7 +40,7 @@ void initializeObjCARCOpts(PassRegistry&);
 void initializeVectorization(PassRegistry&);
 
 /// initializeInstCombine - Initialize all passes linked into the
-/// ScalarOpts library.
+/// InstCombine library.
 void initializeInstCombine(PassRegistry&);
 
 /// initializeIPO - Initialize all passes linked into the IPO library.
@@ -65,6 +65,7 @@ void initializeTarget(PassRegistry&);
 void initializeAAEvalPass(PassRegistry&);
 void initializeAddDiscriminatorsPass(PassRegistry&);
 void initializeADCEPass(PassRegistry&);
+void initializeBDCEPass(PassRegistry&);
 void initializeAliasAnalysisAnalysisGroup(PassRegistry&);
 void initializeAliasAnalysisCounterPass(PassRegistry&);
 void initializeAliasDebuggerPass(PassRegistry&);
@@ -77,7 +78,6 @@ void initializeAlignmentFromAssumptionsPass(PassRegistry&);
 void initializeBarrierNoopPass(PassRegistry&);
 void initializeBasicAliasAnalysisPass(PassRegistry&);
 void initializeCallGraphWrapperPassPass(PassRegistry &);
-void initializeBasicTTIPass(PassRegistry&);
 void initializeBlockExtractorPassPass(PassRegistry&);
 void initializeBlockFrequencyInfoPass(PassRegistry&);
 void initializeBoundsCheckingPass(PassRegistry&);
@@ -106,7 +106,6 @@ void initializeDAEPass(PassRegistry&);
 void initializeDAHPass(PassRegistry&);
 void initializeDCEPass(PassRegistry&);
 void initializeDSEPass(PassRegistry&);
-void initializeDebugIRPass(PassRegistry&);
 void initializeDebugInfoVerifierLegacyPassPass(PassRegistry &);
 void initializeDeadInstEliminationPass(PassRegistry&);
 void initializeDeadMachineInstructionElimPass(PassRegistry&);
@@ -122,6 +121,7 @@ void initializeEarlyIfConverterPass(PassRegistry&);
 void initializeEdgeBundlesPass(PassRegistry&);
 void initializeExpandPostRAPass(PassRegistry&);
 void initializeGCOVProfilerPass(PassRegistry&);
+void initializeInstrProfilingPass(PassRegistry&);
 void initializeAddressSanitizerPass(PassRegistry&);
 void initializeAddressSanitizerModulePass(PassRegistry&);
 void initializeMemorySanitizerPass(PassRegistry&);
@@ -129,9 +129,8 @@ void initializeThreadSanitizerPass(PassRegistry&);
 void initializeSanitizerCoverageModulePass(PassRegistry&);
 void initializeDataFlowSanitizerPass(PassRegistry&);
 void initializeScalarizerPass(PassRegistry&);
-void initializeEarlyCSEPass(PassRegistry&);
+void initializeEarlyCSELegacyPassPass(PassRegistry &);
 void initializeExpandISelPseudosPass(PassRegistry&);
-void initializeFindUsedTypesPass(PassRegistry&);
 void initializeFunctionAttrsPass(PassRegistry&);
 void initializeGCMachineCodeAnalysisPass(PassRegistry&);
 void initializeGCModuleInfoPass(PassRegistry&);
@@ -143,9 +142,10 @@ void initializeIPCPPass(PassRegistry&);
 void initializeIPSCCPPass(PassRegistry&);
 void initializeIVUsersPass(PassRegistry&);
 void initializeIfConverterPass(PassRegistry&);
+void initializeInductiveRangeCheckEliminationPass(PassRegistry&);
 void initializeIndVarSimplifyPass(PassRegistry&);
 void initializeInlineCostAnalysisPass(PassRegistry&);
-void initializeInstCombinerPass(PassRegistry&);
+void initializeInstructionCombiningPassPass(PassRegistry&);
 void initializeInstCountPass(PassRegistry&);
 void initializeInstNamerPass(PassRegistry&);
 void initializeInternalizePassPass(PassRegistry&);
@@ -167,7 +167,7 @@ void initializeLoaderPassPass(PassRegistry&);
 void initializeLocalStackSlotPassPass(PassRegistry&);
 void initializeLoopDeletionPass(PassRegistry&);
 void initializeLoopExtractorPass(PassRegistry&);
-void initializeLoopInfoPass(PassRegistry&);
+void initializeLoopInfoWrapperPassPass(PassRegistry&);
 void initializeLoopInstSimplifyPass(PassRegistry&);
 void initializeLoopRotatePass(PassRegistry&);
 void initializeLoopSimplifyPass(PassRegistry&);
@@ -178,6 +178,7 @@ void initializeLoopUnrollPass(PassRegistry&);
 void initializeLoopUnswitchPass(PassRegistry&);
 void initializeLoopIdiomRecognizePass(PassRegistry&);
 void initializeLowerAtomicPass(PassRegistry&);
+void initializeLowerBitSetsPass(PassRegistry&);
 void initializeLowerExpectIntrinsicPass(PassRegistry&);
 void initializeLowerIntrinsicsPass(PassRegistry&);
 void initializeLowerInvokePass(PassRegistry&);
@@ -200,6 +201,7 @@ void initializeMachineTraceMetricsPass(PassRegistry&);
 void initializeMachineVerifierPassPass(PassRegistry&);
 void initializeMemCpyOptPass(PassRegistry&);
 void initializeMemDepPrinterPass(PassRegistry&);
+void initializeMemDerefPrinterPass(PassRegistry&);
 void initializeMemoryDependenceAnalysisPass(PassRegistry&);
 void initializeMergedLoadStoreMotionPass(PassRegistry &);
 void initializeMetaRenamerPass(PassRegistry&);
@@ -238,6 +240,7 @@ void initializeRegionOnlyPrinterPass(PassRegistry&);
 void initializeRegionOnlyViewerPass(PassRegistry&);
 void initializeRegionPrinterPass(PassRegistry&);
 void initializeRegionViewerPass(PassRegistry&);
+void initializeRewriteStatepointsForGCPass(PassRegistry&);
 void initializeSCCPPass(PassRegistry&);
 void initializeSROAPass(PassRegistry&);
 void initializeSROA_DTPass(PassRegistry&);
@@ -245,6 +248,7 @@ void initializeSROA_SSAUpPass(PassRegistry&);
 void initializeScalarEvolutionAliasAnalysisPass(PassRegistry&);
 void initializeScalarEvolutionPass(PassRegistry&);
 void initializeSimpleInlinerPass(PassRegistry&);
+void initializeShadowStackGCLoweringPass(PassRegistry&);  
 void initializeRegisterCoalescerPass(PassRegistry&);
 void initializeSingleLoopExtractorPass(PassRegistry&);
 void initializeSinkingPass(PassRegistry&);
@@ -254,6 +258,7 @@ void initializeSpillPlacementPass(PassRegistry&);
 void initializeStackProtectorPass(PassRegistry&);
 void initializeStackColoringPass(PassRegistry&);
 void initializeStackSlotColoringPass(PassRegistry&);
+void initializeStraightLineStrengthReducePass(PassRegistry &);
 void initializeStripDeadDebugInfoPass(PassRegistry&);
 void initializeStripDeadPrototypesPassPass(PassRegistry&);
 void initializeStripDebugDeclarePass(PassRegistry&);
@@ -263,11 +268,9 @@ void initializeTailCallElimPass(PassRegistry&);
 void initializeTailDuplicatePassPass(PassRegistry&);
 void initializeTargetPassConfigPass(PassRegistry&);
 void initializeDataLayoutPassPass(PassRegistry &);
-void initializeTargetTransformInfoAnalysisGroup(PassRegistry&);
-void initializeFunctionTargetTransformInfoPass(PassRegistry &);
-void initializeNoTTIPass(PassRegistry&);
-void initializeTargetLibraryInfoPass(PassRegistry&);
-void initializeAssumptionTrackerPass(PassRegistry &);
+void initializeTargetTransformInfoWrapperPassPass(PassRegistry &);
+void initializeTargetLibraryInfoWrapperPassPass(PassRegistry &);
+void initializeAssumptionCacheTrackerPass(PassRegistry &);
 void initializeTwoAddressInstructionPassPass(PassRegistry&);
 void initializeTypeBasedAliasAnalysisPass(PassRegistry&);
 void initializeScopedNoAliasAAPass(PassRegistry&);
@@ -280,6 +283,7 @@ void initializeVirtRegRewriterPass(PassRegistry&);
 void initializeInstSimplifierPass(PassRegistry&);
 void initializeUnpackMachineBundlesPass(PassRegistry&);
 void initializeFinalizeMachineBundlesPass(PassRegistry&);
+void initializeLoopAccessAnalysisPass(PassRegistry&);
 void initializeLoopVectorizePass(PassRegistry&);
 void initializeSLPVectorizerPass(PassRegistry&);
 void initializeBBVectorizePass(PassRegistry&);
@@ -288,6 +292,10 @@ void initializeStackMapLivenessPass(PassRegistry&);
 void initializeMachineCombinerPass(PassRegistry &);
 void initializeLoadCombinePass(PassRegistry&);
 void initializeRewriteSymbolsPass(PassRegistry&);
+void initializeWinEHPreparePass(PassRegistry&);
+void initializePlaceBackedgeSafepointsImplPass(PassRegistry&);
+void initializePlaceSafepointsPass(PassRegistry&);
+void initializeDwarfEHPreparePass(PassRegistry&);
 }
 
 #endif
diff --git a/include/llvm/LTO/LTOCodeGenerator.h b/include/llvm/LTO/LTOCodeGenerator.h
index 0c9ce4a..56d27ca 100644
--- a/include/llvm/LTO/LTOCodeGenerator.h
+++ b/include/llvm/LTO/LTOCodeGenerator.h
@@ -67,6 +67,9 @@ struct LTOCodeGenerator {
   // Merge given module, return true on success.
   bool addModule(struct LTOModule *);
 
+  // Set the destination module.
+  void setModule(struct LTOModule *);
+
   void setTargetOptions(TargetOptions options);
   void setDebugInfo(lto_debug_model);
   void setCodePICModel(lto_codegen_model);
@@ -117,6 +120,18 @@ struct LTOCodeGenerator {
                       bool disableVectorization,
                       std::string &errMsg);
 
+  // Optimizes the merged module. Returns true on success.
+  bool optimize(bool disableOpt,
+                bool disableInline,
+                bool disableGVNLoadPRE,
+                bool disableVectorization,
+                std::string &errMsg);
+
+  // Compiles the merged optimized module into a single object file. It brings
+  // the object to a buffer, and returns the buffer to the caller. Return NULL
+  // if the compilation was not successful.
+  const void *compileOptimized(size_t *length, std::string &errMsg);
+
   void setDiagnosticHandler(lto_diagnostic_handler_t, void *);
 
   LLVMContext &getContext() { return Context; }
@@ -124,9 +139,8 @@ struct LTOCodeGenerator {
 private:
   void initializeLTOPasses();
 
-  bool generateObjectFile(raw_ostream &out, bool disableOpt, bool disableInline,
-                          bool disableGVNLoadPRE, bool disableVectorization,
-                          std::string &errMsg);
+  bool compileOptimized(raw_ostream &out, std::string &errMsg);
+  bool compileOptimizedToFile(const char **name, std::string &errMsg);
   void applyScopeRestrictions();
   void applyRestriction(GlobalValue &GV, ArrayRef<StringRef> Libcalls,
                         std::vector<const char *> &MustPreserveList,
@@ -141,6 +155,7 @@ private:
   typedef StringMap<uint8_t> StringSet;
 
   void initialize();
+  void destroyMergedModule();
   std::unique_ptr<LLVMContext> OwnedContext;
   LLVMContext &Context;
   Linker IRLinker;
@@ -158,6 +173,7 @@ private:
   TargetOptions Options;
   lto_diagnostic_handler_t DiagHandler;
   void *DiagContext;
+  LTOModule *OwnedModule;
 };
 }
 #endif
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 66e4e9c..18343e2 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -18,7 +18,6 @@
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/CallPrinter.h"
 #include "llvm/Analysis/DomPrinter.h"
-#include "llvm/Analysis/FindUsedTypes.h"
 #include "llvm/Analysis/IntervalPartition.h"
 #include "llvm/Analysis/Lint.h"
 #include "llvm/Analysis/Passes.h"
@@ -33,8 +32,8 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/ObjCARC.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
 #include "llvm/Transforms/Utils/SymbolRewriter.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <cstdlib>
 
@@ -50,6 +49,7 @@ namespace {
 
       (void) llvm::createAAEvalPass();
       (void) llvm::createAggressiveDCEPass();
+      (void) llvm::createBitTrackingDCEPass();
       (void) llvm::createAliasAnalysisCounterPass();
       (void) llvm::createAliasDebugger();
       (void) llvm::createArgumentPromotionPass();
@@ -79,6 +79,7 @@ namespace {
       (void) llvm::createDomOnlyViewerPass();
       (void) llvm::createDomViewerPass();
       (void) llvm::createGCOVProfilerPass();
+      (void) llvm::createInstrProfilingPass();
       (void) llvm::createFunctionInliningPass();
       (void) llvm::createAlwaysInlinerPass();
       (void) llvm::createGlobalDCEPass();
@@ -86,6 +87,7 @@ namespace {
       (void) llvm::createGlobalsModRefPass();
       (void) llvm::createIPConstantPropagationPass();
       (void) llvm::createIPSCCPPass();
+      (void) llvm::createInductiveRangeCheckEliminationPass();
       (void) llvm::createIndVarSimplifyPass();
       (void) llvm::createInstructionCombiningPass();
       (void) llvm::createInternalizePass();
@@ -166,9 +168,10 @@ namespace {
       (void) llvm::createScalarizerPass();
       (void) llvm::createSeparateConstOffsetFromGEPPass();
       (void) llvm::createRewriteSymbolsPass();
+      (void) llvm::createStraightLineStrengthReducePass();
+      (void) llvm::createMemDerefPrinter();
 
       (void)new llvm::IntervalPartition();
-      (void)new llvm::FindUsedTypes();
       (void)new llvm::ScalarEvolution();
       ((llvm::Function*)nullptr)->viewCFGOnly();
       llvm::RGPassManager RGM;
diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h
index c957cc2..aac9dcd 100644
--- a/include/llvm/Linker/Linker.h
+++ b/include/llvm/Linker/Linker.h
@@ -10,45 +10,81 @@
 #ifndef LLVM_LINKER_LINKER_H
 #define LLVM_LINKER_LINKER_H
 
-#include "llvm/ADT/SmallPtrSet.h"
-
-#include <functional>
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/DiagnosticInfo.h"
 
 namespace llvm {
-class DiagnosticInfo;
 class Module;
 class StructType;
+class Type;
 
 /// This class provides the core functionality of linking in LLVM. It keeps a
 /// pointer to the merged module so far. It doesn't take ownership of the
 /// module since it is assumed that the user of this class will want to do
 /// something with it after the linking.
 class Linker {
-  public:
-    typedef std::function<void(const DiagnosticInfo &)>
-        DiagnosticHandlerFunction;
+public:
+  struct StructTypeKeyInfo {
+    struct KeyTy {
+      ArrayRef<Type *> ETypes;
+      bool IsPacked;
+      KeyTy(ArrayRef<Type *> E, bool P);
+      KeyTy(const StructType *ST);
+      bool operator==(const KeyTy &that) const;
+      bool operator!=(const KeyTy &that) const;
+    };
+    static StructType *getEmptyKey();
+    static StructType *getTombstoneKey();
+    static unsigned getHashValue(const KeyTy &Key);
+    static unsigned getHashValue(const StructType *ST);
+    static bool isEqual(const KeyTy &LHS, const StructType *RHS);
+    static bool isEqual(const StructType *LHS, const StructType *RHS);
+  };
+
+  typedef DenseSet<StructType *, StructTypeKeyInfo> NonOpaqueStructTypeSet;
+  typedef DenseSet<StructType *> OpaqueStructTypeSet;
+
+  struct IdentifiedStructTypeSet {
+    // The set of opaque types is the composite module.
+    OpaqueStructTypeSet OpaqueStructTypes;
+
+    // The set of identified but non opaque structures in the composite module.
+    NonOpaqueStructTypeSet NonOpaqueStructTypes;
+
+    void addNonOpaque(StructType *Ty);
+    void addOpaque(StructType *Ty);
+    StructType *findNonOpaque(ArrayRef<Type *> ETypes, bool IsPacked);
+    bool hasType(StructType *Ty);
+  };
+
+  Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler);
+  Linker(Module *M);
+  ~Linker();
+
+  Module *getModule() const { return Composite; }
+  void deleteModule();
+
+  /// \brief Link \p Src into the composite. The source is destroyed.
+  /// Returns true on error.
+  bool linkInModule(Module *Src);
 
-    Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler);
-    Linker(Module *M);
-    ~Linker();
+  /// \brief Set the composite to the passed-in module.
+  void setModule(Module *Dst);
 
-    Module *getModule() const { return Composite; }
-    void deleteModule();
+  static bool LinkModules(Module *Dest, Module *Src,
+                          DiagnosticHandlerFunction DiagnosticHandler);
 
-    /// \brief Link \p Src into the composite. The source is destroyed.
-    /// Returns true on error.
-    bool linkInModule(Module *Src);
+  static bool LinkModules(Module *Dest, Module *Src);
 
-    static bool LinkModules(Module *Dest, Module *Src,
-                            DiagnosticHandlerFunction DiagnosticHandler);
+private:
+  void init(Module *M, DiagnosticHandlerFunction DiagnosticHandler);
+  Module *Composite;
 
-    static bool LinkModules(Module *Dest, Module *Src);
+  IdentifiedStructTypeSet IdentifiedStructTypes;
 
-  private:
-    void init(Module *M, DiagnosticHandlerFunction DiagnosticHandler);
-    Module *Composite;
-    SmallPtrSet<StructType*, 32> IdentifiedStructTypes;
-    DiagnosticHandlerFunction DiagnosticHandler;
+  DiagnosticHandlerFunction DiagnosticHandler;
 };
 
 } // End llvm namespace
diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h
index 15a956b..56da95d 100644
--- a/include/llvm/MC/MCAsmBackend.h
+++ b/include/llvm/MC/MCAsmBackend.h
@@ -32,8 +32,8 @@ class raw_ostream;
 
 /// MCAsmBackend - Generic interface to target specific assembler backends.
 class MCAsmBackend {
-  MCAsmBackend(const MCAsmBackend &) LLVM_DELETED_FUNCTION;
-  void operator=(const MCAsmBackend &) LLVM_DELETED_FUNCTION;
+  MCAsmBackend(const MCAsmBackend &) = delete;
+  void operator=(const MCAsmBackend &) = delete;
 
 protected: // Can only create subclasses.
   MCAsmBackend();
@@ -61,20 +61,6 @@ public:
   /// markers. If not, data region directives will be ignored.
   bool hasDataInCodeSupport() const { return HasDataInCodeSupport; }
 
-  /// doesSectionRequireSymbols - Check whether the given section requires that
-  /// all symbols (even temporaries) have symbol table entries.
-  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
-    return false;
-  }
-
-  /// isSectionAtomizable - Check whether the given section can be split into
-  /// atoms.
-  ///
-  /// \see MCAssembler::isSymbolLinkerVisible().
-  virtual bool isSectionAtomizable(const MCSection &Section) const {
-    return true;
-  }
-
   /// @name Target Fixup Interfaces
   /// @{
 
diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h
index 4f38aac..ee245e9 100644
--- a/include/llvm/MC/MCAsmInfo.h
+++ b/include/llvm/MC/MCAsmInfo.h
@@ -42,11 +42,11 @@ enum class EncodingType {
 }
 
 enum class ExceptionHandling {
-  None,         /// No exception support
-  DwarfCFI,     /// DWARF-like instruction based exceptions
-  SjLj,         /// setjmp/longjmp based exceptions
-  ARM,          /// ARM EHABI
-  ItaniumWinEH, /// Itanium EH built on Windows unwind info (.pdata and .xdata)
+  None,     /// No exception support
+  DwarfCFI, /// DWARF-like instruction based exceptions
+  SjLj,     /// setjmp/longjmp based exceptions
+  ARM,      /// ARM EHABI
+  WinEH,    /// Windows Exception Handling
 };
 
 namespace LCOMM {
@@ -123,6 +123,10 @@ protected:
   /// file.  Defaults to "L"
   const char *PrivateGlobalPrefix;
 
+  /// This prefix is used for labels for basic blocks. Defaults to the same as
+  /// PrivateGlobalPrefix.
+  const char *PrivateLabelPrefix;
+
   /// This prefix is used for symbols that should be passed through the
   /// assembler but be removed by the linker.  This is 'l' on Darwin, currently
   /// used for some ObjC metadata.  The default of "" meast that for this system
@@ -215,7 +219,8 @@ protected:
 
   //===--- Global Variable Emission Directives --------------------------===//
 
-  /// This is the directive used to declare a global entity.  Defaults to NULL.
+  /// This is the directive used to declare a global entity. Defaults to
+  /// ".globl".
   const char *GlobalDirective;
 
   /// True if the expression
@@ -264,6 +269,9 @@ protected:
   /// to false.
   bool HasNoDeadStrip;
 
+  /// Used to declare a global as being a weak symbol. Defaults to ".weak".
+  const char *WeakDirective;
+
   /// This directive, if non-null, is used to declare a global as being a weak
   /// undefined symbol.  Defaults to NULL.
   const char *WeakRefDirective;
@@ -373,6 +381,12 @@ public:
     return nullptr;
   }
 
+  /// \brief True if the section is atomized using the symbols in it.
+  /// This is false if the section is not atomized at all (most ELF sections) or
+  /// if it is atomized based on its contents (MachO' __TEXT,__cstring for
+  /// example).
+  virtual bool isSectionAtomizableBySymbols(const MCSection &Section) const;
+
   virtual const MCExpr *getExprForPersonalitySymbol(const MCSymbol *Sym,
                                                     unsigned Encoding,
                                                     MCStreamer &Streamer) const;
@@ -414,6 +428,7 @@ public:
 
   bool useAssignmentForEHBegin() const { return UseAssignmentForEHBegin; }
   const char *getPrivateGlobalPrefix() const { return PrivateGlobalPrefix; }
+  const char *getPrivateLabelPrefix() const { return PrivateLabelPrefix; }
   bool hasLinkerPrivateGlobalPrefix() const {
     return LinkerPrivateGlobalPrefix[0] != '\0';
   }
@@ -452,6 +467,7 @@ public:
   bool hasSingleParameterDotFile() const { return HasSingleParameterDotFile; }
   bool hasIdentDirective() const { return HasIdentDirective; }
   bool hasNoDeadStrip() const { return HasNoDeadStrip; }
+  const char *getWeakDirective() const { return WeakDirective; }
   const char *getWeakRefDirective() const { return WeakRefDirective; }
   bool hasWeakDefDirective() const { return HasWeakDefDirective; }
   bool hasWeakDefCanBeHiddenDirective() const {
@@ -473,13 +489,16 @@ public:
   ExceptionHandling getExceptionHandlingType() const { return ExceptionsType; }
   WinEH::EncodingType getWinEHEncodingType() const { return WinEHEncodingType; }
 
-  /// Return true if the exception handling type uses the language-specific data
-  /// area (LSDA) format specified by the Itanium C++ ABI.
-  bool usesItaniumLSDAForExceptions() const {
+  /// Returns true if the exception handling method for the platform uses call
+  /// frame information to unwind.
+  bool usesCFIForEH() const {
     return (ExceptionsType == ExceptionHandling::DwarfCFI ||
             ExceptionsType == ExceptionHandling::ARM ||
-            // This Windows EH type uses the Itanium LSDA encoding.
-            ExceptionsType == ExceptionHandling::ItaniumWinEH);
+            ExceptionsType == ExceptionHandling::WinEH);
+  }
+
+  bool usesWindowsCFI() const {
+    return ExceptionsType == ExceptionHandling::WinEH;
   }
 
   bool doesDwarfUseRelocationsAcrossSections() const {
diff --git a/include/llvm/MC/MCAsmInfoDarwin.h b/include/llvm/MC/MCAsmInfoDarwin.h
index 3d249f9..d587c3c 100644
--- a/include/llvm/MC/MCAsmInfoDarwin.h
+++ b/include/llvm/MC/MCAsmInfoDarwin.h
@@ -19,9 +19,9 @@
 
 namespace llvm {
   class MCAsmInfoDarwin : public MCAsmInfo {
-    virtual void anchor();
   public:
     explicit MCAsmInfoDarwin();
+    bool isSectionAtomizableBySymbols(const MCSection &Section) const override;
   };
 }
 
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index 681a317..8c56f88 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -11,6 +11,7 @@
 #define LLVM_MC_MCASSEMBLER_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
@@ -46,8 +47,8 @@ class MCAsmBackend;
 class MCFragment : public ilist_node<MCFragment> {
   friend class MCAsmLayout;
 
-  MCFragment(const MCFragment&) LLVM_DELETED_FUNCTION;
-  void operator=(const MCFragment&) LLVM_DELETED_FUNCTION;
+  MCFragment(const MCFragment&) = delete;
+  void operator=(const MCFragment&) = delete;
 
 public:
   enum FragmentType {
@@ -562,8 +563,8 @@ public:
 class MCSectionData : public ilist_node<MCSectionData> {
   friend class MCAsmLayout;
 
-  MCSectionData(const MCSectionData&) LLVM_DELETED_FUNCTION;
-  void operator=(const MCSectionData&) LLVM_DELETED_FUNCTION;
+  MCSectionData(const MCSectionData&) = delete;
+  void operator=(const MCSectionData&) = delete;
 
 public:
   typedef iplist<MCFragment> FragmentListType;
@@ -864,8 +865,8 @@ public:
     unsigned Update;
   } VersionMinInfoType;
 private:
-  MCAssembler(const MCAssembler&) LLVM_DELETED_FUNCTION;
-  void operator=(const MCAssembler&) LLVM_DELETED_FUNCTION;
+  MCAssembler(const MCAssembler&) = delete;
+  void operator=(const MCAssembler&) = delete;
 
   MCContext &Context;
 
@@ -881,6 +882,8 @@ private:
 
   iplist<MCSymbolData> Symbols;
 
+  DenseSet<const MCSymbol *> LocalsUsedInReloc;
+
   /// The map of sections to their associated assembler backend data.
   //
   // FIXME: Avoid this indirection?
@@ -980,6 +983,9 @@ private:
                                         MCFragment &F, const MCFixup &Fixup);
 
 public:
+  void addLocalUsedInReloc(const MCSymbol &Sym);
+  bool isLocalUsedInReloc(const MCSymbol &Sym) const;
+
   /// Compute the effective fragment size assuming it is laid out at the given
   /// \p SectionAddress and \p FragmentOffset.
   uint64_t computeFragmentSize(const MCAsmLayout &Layout,
diff --git a/include/llvm/MC/MCCodeEmitter.h b/include/llvm/MC/MCCodeEmitter.h
index d3b5617..ddf0f82 100644
--- a/include/llvm/MC/MCCodeEmitter.h
+++ b/include/llvm/MC/MCCodeEmitter.h
@@ -22,8 +22,8 @@ template<typename T> class SmallVectorImpl;
 /// MCCodeEmitter - Generic instruction encoding interface.
 class MCCodeEmitter {
 private:
-  MCCodeEmitter(const MCCodeEmitter &) LLVM_DELETED_FUNCTION;
-  void operator=(const MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  MCCodeEmitter(const MCCodeEmitter &) = delete;
+  void operator=(const MCCodeEmitter &) = delete;
 protected: // Can only create subclasses.
   MCCodeEmitter();
 
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index f209448..cd96dfd 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -47,8 +47,8 @@ namespace llvm {
   /// of the sections that it creates.
   ///
   class MCContext {
-    MCContext(const MCContext&) LLVM_DELETED_FUNCTION;
-    MCContext &operator=(const MCContext&) LLVM_DELETED_FUNCTION;
+    MCContext(const MCContext&) = delete;
+    MCContext &operator=(const MCContext&) = delete;
   public:
     typedef StringMap<MCSymbol*, BumpPtrAllocator&> SymbolTable;
   private:
@@ -237,6 +237,8 @@ namespace llvm {
 
     MCSymbol *getOrCreateSectionSymbol(const MCSectionELF &Section);
 
+    MCSymbol *getOrCreateFrameAllocSymbol(StringRef FuncName);
+
     /// LookupSymbol - Get the symbol for \p Name, or null.
     MCSymbol *LookupSymbol(StringRef Name) const;
     MCSymbol *LookupSymbol(const Twine &Name) const;
@@ -269,11 +271,15 @@ namespace llvm {
     }
 
     const MCSectionELF *getELFSection(StringRef Section, unsigned Type,
-                                      unsigned Flags, SectionKind Kind);
+                                      unsigned Flags);
+
+    const MCSectionELF *getELFSection(StringRef Section, unsigned Type,
+                                      unsigned Flags, unsigned EntrySize,
+                                      StringRef Group);
 
     const MCSectionELF *getELFSection(StringRef Section, unsigned Type,
-                                      unsigned Flags, SectionKind Kind,
-                                      unsigned EntrySize, StringRef Group);
+                                      unsigned Flags, unsigned EntrySize,
+                                      StringRef Group, bool Unique);
 
     void renameELFSection(const MCSectionELF *Section, StringRef Name);
 
diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index a221d26..c266acf 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h
@@ -16,16 +16,16 @@
 #define LLVM_MC_MCDWARF_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/raw_ostream.h"
 #include <map>
-#include <vector>
 #include <string>
 #include <utility>
+#include <vector>
 
 namespace llvm {
 class MCAsmBackend;
diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h
index 421e7a0..9763635 100644
--- a/include/llvm/MC/MCELFObjectWriter.h
+++ b/include/llvm/MC/MCELFObjectWriter.h
@@ -41,6 +41,7 @@ protected:
 public:
   static uint8_t getOSABI(Triple::OSType OSType) {
     switch (OSType) {
+      case Triple::PS4:
       case Triple::FreeBSD:
         return ELF::ELFOSABI_FREEBSD;
       case Triple::Linux:
diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index f0e8611..5e2ef3f 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h
@@ -44,8 +44,8 @@ public:
 private:
   ExprKind Kind;
 
-  MCExpr(const MCExpr&) LLVM_DELETED_FUNCTION;
-  void operator=(const MCExpr&) LLVM_DELETED_FUNCTION;
+  MCExpr(const MCExpr&) = delete;
+  void operator=(const MCExpr&) = delete;
 
   bool EvaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm,
                           const MCAsmLayout *Layout,
@@ -194,6 +194,7 @@ public:
     VK_ARM_TARGET1,
     VK_ARM_TARGET2,
     VK_ARM_PREL31,
+    VK_ARM_SBREL,          // symbol(sbrel)
     VK_ARM_TLSLDO,         // symbol(tlsldo)
     VK_ARM_TLSCALL,        // symbol(tlscall)
     VK_ARM_TLSDESC,        // symbol(tlsdesc)
diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h
index 6918280..aed7347 100644
--- a/include/llvm/MC/MCInst.h
+++ b/include/llvm/MC/MCInst.h
@@ -31,7 +31,7 @@ class MCInst;
 /// MCOperand - Instances of this class represent operands of the MCInst class.
 /// This is a simple discriminated union.
 class MCOperand {
-  enum MachineOperandType {
+  enum MachineOperandType : unsigned char {
     kInvalid,                 ///< Uninitialized.
     kRegister,                ///< Register operand.
     kImmediate,               ///< Immediate operand.
@@ -39,7 +39,7 @@ class MCOperand {
     kExpr,                    ///< Relocatable immediate operand.
     kInst                     ///< Sub-instruction operand.
   };
-  unsigned char Kind;
+  MachineOperandType Kind;
 
   union {
     unsigned RegVal;
@@ -139,7 +139,7 @@ public:
     return Op;
   }
 
-  void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
+  void print(raw_ostream &OS) const;
   void dump() const;
 };
 
@@ -169,33 +169,35 @@ public:
   }
 
   void clear() { Operands.clear(); }
-  size_t size() { return Operands.size(); }
+  size_t size() const { return Operands.size(); }
 
   typedef SmallVectorImpl<MCOperand>::iterator iterator;
+  typedef SmallVectorImpl<MCOperand>::const_iterator const_iterator;
   iterator begin() { return Operands.begin(); }
-  iterator end()   { return Operands.end();   }
+  const_iterator begin() const { return Operands.begin(); }
+  iterator end()   { return Operands.end(); }
+  const_iterator end() const { return Operands.end(); }
   iterator insert(iterator I, const MCOperand &Op) {
     return Operands.insert(I, Op);
   }
 
-  void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
+  void print(raw_ostream &OS) const;
   void dump() const;
 
   /// \brief Dump the MCInst as prettily as possible using the additional MC
   /// structures, if given. Operators are separated by the \p Separator
   /// string.
-  void dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI = nullptr,
-                   const MCInstPrinter *Printer = nullptr,
+  void dump_pretty(raw_ostream &OS, const MCInstPrinter *Printer = nullptr,
                    StringRef Separator = " ") const;
 };
 
 inline raw_ostream& operator<<(raw_ostream &OS, const MCOperand &MO) {
-  MO.print(OS, nullptr);
+  MO.print(OS);
   return OS;
 }
 
 inline raw_ostream& operator<<(raw_ostream &OS, const MCInst &MI) {
-  MI.print(OS, nullptr);
+  MI.print(OS);
   return OS;
 }
 
diff --git a/include/llvm/MC/MCInstPrinter.h b/include/llvm/MC/MCInstPrinter.h
index 95124c3..dce3a06 100644
--- a/include/llvm/MC/MCInstPrinter.h
+++ b/include/llvm/MC/MCInstPrinter.h
@@ -95,12 +95,14 @@ public:
   void setPrintImmHex(HexStyle::Style Value) { PrintHexStyle = Value; }
 
   /// Utility function to print immediates in decimal or hex.
-  format_object1<int64_t> formatImm(const int64_t Value) const { return PrintImmHex ? formatHex(Value) : formatDec(Value); }
+  format_object<int64_t> formatImm(int64_t Value) const {
+    return PrintImmHex ? formatHex(Value) : formatDec(Value);
+  }
 
   /// Utility functions to print decimal/hexadecimal values.
-  format_object1<int64_t> formatDec(const int64_t Value) const;
-  format_object1<int64_t> formatHex(const int64_t Value) const;
-  format_object1<uint64_t> formatHex(const uint64_t Value) const;
+  format_object<int64_t> formatDec(int64_t Value) const;
+  format_object<int64_t> formatHex(int64_t Value) const;
+  format_object<uint64_t> formatHex(uint64_t Value) const;
 };
 
 } // namespace llvm
diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h
index d4f93c1..3609893 100644
--- a/include/llvm/MC/MCInstrDesc.h
+++ b/include/llvm/MC/MCInstrDesc.h
@@ -44,11 +44,12 @@ namespace MCOI {
 
   /// Operand Type - Operands are tagged with one of the values of this enum.
   enum OperandType {
-    OPERAND_UNKNOWN,
-    OPERAND_IMMEDIATE,
-    OPERAND_REGISTER,
-    OPERAND_MEMORY,
-    OPERAND_PCREL
+    OPERAND_UNKNOWN      = 0,
+    OPERAND_IMMEDIATE    = 1,
+    OPERAND_REGISTER     = 2,
+    OPERAND_MEMORY       = 3,
+    OPERAND_PCREL        = 4,
+    OPERAND_FIRST_TARGET = 5
   };
 }
 
diff --git a/include/llvm/MC/MCLabel.h b/include/llvm/MC/MCLabel.h
index f531de8..a386deb 100644
--- a/include/llvm/MC/MCLabel.h
+++ b/include/llvm/MC/MCLabel.h
@@ -32,8 +32,8 @@ namespace llvm {
     MCLabel(unsigned instance)
       : Instance(instance) {}
 
-    MCLabel(const MCLabel&) LLVM_DELETED_FUNCTION;
-    void operator=(const MCLabel&) LLVM_DELETED_FUNCTION;
+    MCLabel(const MCLabel&) = delete;
+    void operator=(const MCLabel&) = delete;
   public:
     /// getInstance - Get the current instance of this Directional Local Label.
     unsigned getInstance() const { return Instance; }
diff --git a/include/llvm/MC/MCLinkerOptimizationHint.h b/include/llvm/MC/MCLinkerOptimizationHint.h
index 1f91b0d..890d638 100644
--- a/include/llvm/MC/MCLinkerOptimizationHint.h
+++ b/include/llvm/MC/MCLinkerOptimizationHint.h
@@ -18,8 +18,8 @@
 #define LLVM_MC_MCLINKEROPTIMIZATIONHINT_H
 
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCMachObjectWriter.h"
 #include "llvm/Support/raw_ostream.h"
 
diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h
index 0c5aa8a..e4681c0 100644
--- a/include/llvm/MC/MCMachObjectWriter.h
+++ b/include/llvm/MC/MCMachObjectWriter.h
@@ -68,12 +68,10 @@ public:
   /// @name API
   /// @{
 
-  virtual void RecordRelocation(MachObjectWriter *Writer,
-                                const MCAssembler &Asm,
+  virtual void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
                                 const MCAsmLayout &Layout,
                                 const MCFragment *Fragment,
-                                const MCFixup &Fixup,
-                                MCValue Target,
+                                const MCFixup &Fixup, MCValue Target,
                                 uint64_t &FixedValue) = 0;
 
   /// @}
@@ -97,8 +95,14 @@ class MachObjectWriter : public MCObjectWriter {
   /// @name Relocation Data
   /// @{
 
-  llvm::DenseMap<const MCSectionData*,
-                 std::vector<MachO::any_relocation_info> > Relocations;
+  struct RelAndSymbol {
+    const MCSymbolData *Sym;
+    MachO::any_relocation_info MRE;
+    RelAndSymbol(const MCSymbolData *Sym, const MachO::any_relocation_info &MRE)
+        : Sym(Sym), MRE(MRE) {}
+  };
+
+  llvm::DenseMap<const MCSectionData *, std::vector<RelAndSymbol>> Relocations;
   llvm::DenseMap<const MCSectionData*, unsigned> IndirectSymBase;
 
   /// @}
@@ -213,9 +217,15 @@ public:
   //  - Input errors, where something cannot be correctly encoded. 'as' allows
   //    these through in many cases.
 
-  void addRelocation(const MCSectionData *SD,
+  // Add a relocation to be output in the object file. At the time this is
+  // called, the symbol indexes are not know, so if the relocation refers
+  // to a symbol it should be passed as \p RelSymbol so that it can be updated
+  // afterwards. If the relocation doesn't refer to a symbol, nullptr should be
+  // used.
+  void addRelocation(const MCSymbolData *RelSymbol, const MCSectionData *SD,
                      MachO::any_relocation_info &MRE) {
-    Relocations[SD].push_back(MRE);
+    RelAndSymbol P(RelSymbol, MRE);
+    Relocations[SD].push_back(P);
   }
 
   void RecordScatteredRelocation(const MCAssembler &Asm,
@@ -231,7 +241,7 @@ public:
                             const MCFixup &Fixup, MCValue Target,
                             uint64_t &FixedValue);
 
-  void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
+  void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
                         const MCFragment *Fragment, const MCFixup &Fixup,
                         MCValue Target, bool &IsPCRel,
                         uint64_t &FixedValue) override;
diff --git a/include/llvm/MC/MCObjectWriter.h b/include/llvm/MC/MCObjectWriter.h
index 55c828c..430075c 100644
--- a/include/llvm/MC/MCObjectWriter.h
+++ b/include/llvm/MC/MCObjectWriter.h
@@ -37,8 +37,8 @@ class MCValue;
 /// The object writer also contains a number of helper methods for writing
 /// binary data to the output stream.
 class MCObjectWriter {
-  MCObjectWriter(const MCObjectWriter &) LLVM_DELETED_FUNCTION;
-  void operator=(const MCObjectWriter &) LLVM_DELETED_FUNCTION;
+  MCObjectWriter(const MCObjectWriter &) = delete;
+  void operator=(const MCObjectWriter &) = delete;
 
 protected:
   raw_ostream &OS;
@@ -76,12 +76,10 @@ public:
   /// post layout binding. The implementation is responsible for storing
   /// information about the relocation so that it can be emitted during
   /// WriteObject().
-  virtual void RecordRelocation(const MCAssembler &Asm,
-                                const MCAsmLayout &Layout,
+  virtual void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
                                 const MCFragment *Fragment,
                                 const MCFixup &Fixup, MCValue Target,
-                                bool &IsPCRel,
-                                uint64_t &FixedValue) = 0;
+                                bool &IsPCRel, uint64_t &FixedValue) = 0;
 
   /// \brief Check whether the difference (A - B) between two symbol
   /// references is fully resolved.
diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h
index a9a30f1..2f681d6 100644
--- a/include/llvm/MC/MCParser/AsmLexer.h
+++ b/include/llvm/MC/MCParser/AsmLexer.h
@@ -31,8 +31,8 @@ class AsmLexer : public MCAsmLexer {
   StringRef CurBuf;
   bool isAtStartOfLine;
 
-  void operator=(const AsmLexer&) LLVM_DELETED_FUNCTION;
-  AsmLexer(const AsmLexer&) LLVM_DELETED_FUNCTION;
+  void operator=(const AsmLexer&) = delete;
+  AsmLexer(const AsmLexer&) = delete;
 
 protected:
   /// LexToken - Read the next token and return its code.
diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
index b05891c..14f8ade 100644
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -124,8 +124,8 @@ class MCAsmLexer {
   SMLoc ErrLoc;
   std::string Err;
 
-  MCAsmLexer(const MCAsmLexer &) LLVM_DELETED_FUNCTION;
-  void operator=(const MCAsmLexer &) LLVM_DELETED_FUNCTION;
+  MCAsmLexer(const MCAsmLexer &) = delete;
+  void operator=(const MCAsmLexer &) = delete;
 protected: // Can only create subclasses.
   const char *TokStart;
   bool SkipSpace;
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index 34188e6..83ffbb5 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -68,8 +68,8 @@ public:
     ExtensionDirectiveHandler;
 
 private:
-  MCAsmParser(const MCAsmParser &) LLVM_DELETED_FUNCTION;
-  void operator=(const MCAsmParser &) LLVM_DELETED_FUNCTION;
+  MCAsmParser(const MCAsmParser &) = delete;
+  void operator=(const MCAsmParser &) = delete;
 
   MCTargetAsmParser *TargetParser;
 
diff --git a/include/llvm/MC/MCParser/MCAsmParserExtension.h b/include/llvm/MC/MCParser/MCAsmParserExtension.h
index bfc0afa..53515d7 100644
--- a/include/llvm/MC/MCParser/MCAsmParserExtension.h
+++ b/include/llvm/MC/MCParser/MCAsmParserExtension.h
@@ -21,8 +21,8 @@ class Twine;
 /// which is implemented by target and object file assembly parser
 /// implementations.
 class MCAsmParserExtension {
-  MCAsmParserExtension(const MCAsmParserExtension &) LLVM_DELETED_FUNCTION;
-  void operator=(const MCAsmParserExtension &) LLVM_DELETED_FUNCTION;
+  MCAsmParserExtension(const MCAsmParserExtension &) = delete;
+  void operator=(const MCAsmParserExtension &) = delete;
 
   MCAsmParser *Parser;
 
diff --git a/include/llvm/MC/MCRegisterInfo.h b/include/llvm/MC/MCRegisterInfo.h
index df556e7..8e25ee1 100644
--- a/include/llvm/MC/MCRegisterInfo.h
+++ b/include/llvm/MC/MCRegisterInfo.h
@@ -114,6 +114,10 @@ struct MCRegisterDesc {
   // RegUnits - Points to the list of register units. The low 4 bits holds the
   // Scale, the high bits hold an offset into DiffLists. See MCRegUnitIterator.
   uint32_t RegUnits;
+
+  /// Index into list with lane mask sequences. The sequence contains a lanemask
+  /// for every register unit.
+  uint16_t RegUnitLaneMasks;
 };
 
 /// MCRegisterInfo base class - We assume that the target defines a static
@@ -157,6 +161,8 @@ private:
   unsigned NumRegUnits;                       // Number of regunits.
   const MCPhysReg (*RegUnitRoots)[2];         // Pointer to regunit root table.
   const MCPhysReg *DiffLists;                 // Pointer to the difflists array
+  const unsigned *RegUnitMaskSequences;       // Pointer to lane mask sequences
+                                              // for register units.
   const char *RegStrings;                     // Pointer to the string table.
   const char *RegClassStrings;                // Pointer to the class strings.
   const uint16_t *SubRegIndices;              // Pointer to the subreg lookup
@@ -227,8 +233,10 @@ public:
   // These iterators are allowed to sub-class DiffListIterator and access
   // internal list pointers.
   friend class MCSubRegIterator;
+  friend class MCSubRegIndexIterator;
   friend class MCSuperRegIterator;
   friend class MCRegUnitIterator;
+  friend class MCRegUnitMaskIterator;
   friend class MCRegUnitRootIterator;
 
   /// \brief Initialize MCRegisterInfo, called by TableGen
@@ -239,6 +247,7 @@ public:
                           const MCPhysReg (*RURoots)[2],
                           unsigned NRU,
                           const MCPhysReg *DL,
+                          const unsigned *RUMS,
                           const char *Strings,
                           const char *ClassStrings,
                           const uint16_t *SubIndices,
@@ -251,6 +260,7 @@ public:
     PCReg = PC;
     Classes = C;
     DiffLists = DL;
+    RegUnitMaskSequences = RUMS;
     RegStrings = Strings;
     RegClassStrings = ClassStrings;
     NumClasses = NC;
@@ -452,6 +462,38 @@ public:
   }
 };
 
+/// Iterator that enumerates the sub-registers of a Reg and the associated
+/// sub-register indices.
+class MCSubRegIndexIterator {
+  MCSubRegIterator SRIter;
+  const uint16_t *SRIndex;
+public:
+  /// Constructs an iterator that traverses subregisters and their
+  /// associated subregister indices.
+  MCSubRegIndexIterator(unsigned Reg, const MCRegisterInfo *MCRI)
+    : SRIter(Reg, MCRI) {
+    SRIndex = MCRI->SubRegIndices + MCRI->get(Reg).SubRegIndices;
+  }
+
+  /// Returns current sub-register.
+  unsigned getSubReg() const {
+    return *SRIter;
+  }
+  /// Returns sub-register index of the current sub-register.
+  unsigned getSubRegIndex() const {
+    return *SRIndex;
+  }
+
+  /// Returns true if this iterator is not yet at the end.
+  bool isValid() const { return SRIter.isValid(); }
+
+  /// Moves to the next position.
+  void operator++() {
+    ++SRIter;
+    ++SRIndex;
+  }
+};
+
 /// MCSuperRegIterator enumerates all super-registers of Reg.
 /// If IncludeSelf is set, Reg itself is included in the list.
 class MCSuperRegIterator : public MCRegisterInfo::DiffListIterator {
@@ -513,6 +555,36 @@ public:
   }
 };
 
+/// MCRegUnitIterator enumerates a list of register units and their associated
+/// lane masks for Reg. The register units are in ascending numerical order.
+class MCRegUnitMaskIterator {
+  MCRegUnitIterator RUIter;
+  const unsigned *MaskListIter;
+public:
+  MCRegUnitMaskIterator() {}
+  /// Constructs an iterator that traverses the register units and their
+  /// associated LaneMasks in Reg.
+  MCRegUnitMaskIterator(unsigned Reg, const MCRegisterInfo *MCRI)
+    : RUIter(Reg, MCRI) {
+      uint16_t Idx = MCRI->get(Reg).RegUnitLaneMasks;
+      MaskListIter = &MCRI->RegUnitMaskSequences[Idx];
+  }
+
+  /// Returns a (RegUnit, LaneMask) pair.
+  std::pair<unsigned,unsigned> operator*() const {
+    return std::make_pair(*RUIter, *MaskListIter);
+  }
+
+  /// Returns true if this iterator is not yet at the end.
+  bool isValid() const { return RUIter.isValid(); }
+
+  /// Moves to the next position.
+  void operator++() {
+    ++MaskListIter;
+    ++RUIter;
+  }
+};
+
 // Each register unit has one or two root registers. The complete set of
 // registers containing a register unit is the union of the roots and their
 // super-registers. All registers aliasing Unit can be visited like this:
diff --git a/include/llvm/MC/MCRelocationInfo.h b/include/llvm/MC/MCRelocationInfo.h
index 9dab900..40e0217 100644
--- a/include/llvm/MC/MCRelocationInfo.h
+++ b/include/llvm/MC/MCRelocationInfo.h
@@ -28,8 +28,8 @@ class MCContext;
 
 /// \brief Create MCExprs from relocations found in an object file.
 class MCRelocationInfo {
-  MCRelocationInfo(const MCRelocationInfo &) LLVM_DELETED_FUNCTION;
-  void operator=(const MCRelocationInfo &) LLVM_DELETED_FUNCTION;
+  MCRelocationInfo(const MCRelocationInfo &) = delete;
+  void operator=(const MCRelocationInfo &) = delete;
 
 protected:
   MCContext &Ctx;
diff --git a/include/llvm/MC/MCSection.h b/include/llvm/MC/MCSection.h
index de2678a..8aec9c8 100644
--- a/include/llvm/MC/MCSection.h
+++ b/include/llvm/MC/MCSection.h
@@ -35,8 +35,8 @@ namespace llvm {
     };
 
   private:
-    MCSection(const MCSection&) LLVM_DELETED_FUNCTION;
-    void operator=(const MCSection&) LLVM_DELETED_FUNCTION;
+    MCSection(const MCSection&) = delete;
+    void operator=(const MCSection&) = delete;
   protected:
     MCSection(SectionVariant V, SectionKind K) : Variant(V), Kind(K) {}
     SectionVariant Variant;
diff --git a/include/llvm/MC/MCSectionELF.h b/include/llvm/MC/MCSectionELF.h
index 5ec23f1..6d864b4 100644
--- a/include/llvm/MC/MCSectionELF.h
+++ b/include/llvm/MC/MCSectionELF.h
@@ -39,6 +39,8 @@ class MCSectionELF : public MCSection {
   /// below.
   unsigned Flags;
 
+  bool Unique;
+
   /// EntrySize - The size of each entry in this section. This size only
   /// makes sense for sections that contain fixed-sized entries. If a
   /// section does not contain fixed-sized entries 'EntrySize' will be 0.
@@ -48,10 +50,10 @@ class MCSectionELF : public MCSection {
 
 private:
   friend class MCContext;
-  MCSectionELF(StringRef Section, unsigned type, unsigned flags,
-               SectionKind K, unsigned entrySize, const MCSymbol *group)
-    : MCSection(SV_ELF, K), SectionName(Section), Type(type), Flags(flags),
-      EntrySize(entrySize), Group(group) {}
+  MCSectionELF(StringRef Section, unsigned type, unsigned flags, SectionKind K,
+               unsigned entrySize, const MCSymbol *group, bool Unique)
+      : MCSection(SV_ELF, K), SectionName(Section), Type(type), Flags(flags),
+        Unique(Unique), EntrySize(entrySize), Group(group) {}
   ~MCSectionELF();
 
   void setSectionName(StringRef Name) { SectionName = Name; }
@@ -92,10 +94,6 @@ public:
   static bool classof(const MCSection *S) {
     return S->getVariant() == SV_ELF;
   }
-
-  // Return the entry size for sections with fixed-width data.
-  static unsigned DetermineEntrySize(SectionKind Kind);
-
 };
 
 } // end namespace llvm
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index df896a6..f0be77f 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -49,14 +49,14 @@ typedef std::pair<const MCSection *, const MCExpr *> MCSectionSubPair;
 ///
 /// If target foo wants to use this, it should implement 3 classes:
 /// * FooTargetStreamer : public MCTargetStreamer
-/// * FooTargetAsmSreamer : public FooTargetStreamer
+/// * FooTargetAsmStreamer : public FooTargetStreamer
 /// * FooTargetELFStreamer : public FooTargetStreamer
 ///
 /// FooTargetStreamer should have a pure virtual method for each directive. For
 /// example, for a ".bar symbol_name" directive, it should have
 /// virtual emitBar(const MCSymbol &Symbol) = 0;
 ///
-/// The FooTargetAsmSreamer and FooTargetELFStreamer classes implement the
+/// The FooTargetAsmStreamer and FooTargetELFStreamer classes implement the
 /// method. The assembly streamer just prints ".bar symbol_name". The object
 /// streamer does whatever is needed to implement .bar in the object file.
 ///
@@ -66,8 +66,9 @@ typedef std::pair<const MCSection *, const MCExpr *> MCSectionSubPair;
 /// MCTargetStreamer &TS = OutStreamer.getTargetStreamer();
 /// FooTargetStreamer &ATS = static_cast<FooTargetStreamer &>(TS);
 ///
-/// The base classes FooTargetAsmSreamer and FooTargetELFStreamer should *never*
-/// be treated differently. Callers should always talk to a FooTargetStreamer.
+/// The base classes FooTargetAsmStreamer and FooTargetELFStreamer should
+/// *never* be treated differently. Callers should always talk to a
+/// FooTargetStreamer.
 class MCTargetStreamer {
 protected:
   MCStreamer &Streamer;
@@ -138,6 +139,7 @@ public:
                                     StringRef StringValue = "");
   virtual void emitFPU(unsigned FPU);
   virtual void emitArch(unsigned Arch);
+  virtual void emitArchExtension(unsigned ArchExt);
   virtual void emitObjectArch(unsigned Arch);
   virtual void finishAttributeSection();
   virtual void emitInst(uint32_t Inst, char Suffix = '\0');
@@ -174,8 +176,8 @@ class MCStreamer {
   MCContext &Context;
   std::unique_ptr<MCTargetStreamer> TargetStreamer;
 
-  MCStreamer(const MCStreamer &) LLVM_DELETED_FUNCTION;
-  MCStreamer &operator=(const MCStreamer &) LLVM_DELETED_FUNCTION;
+  MCStreamer(const MCStreamer &) = delete;
+  MCStreamer &operator=(const MCStreamer &) = delete;
 
   std::vector<MCDwarfFrameInfo> DwarfFrameInfos;
   MCDwarfFrameInfo *getCurrentDwarfFrameInfo();
diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h
index 9d09bd8..3984a1f 100644
--- a/include/llvm/MC/MCSubtargetInfo.h
+++ b/include/llvm/MC/MCSubtargetInfo.h
@@ -28,6 +28,7 @@ class StringRef;
 ///
 class MCSubtargetInfo {
   std::string TargetTriple;            // Target triple
+  std::string CPU; // CPU being targeted.
   ArrayRef<SubtargetFeatureKV> ProcFeatures;  // Processor feature list
   ArrayRef<SubtargetFeatureKV> ProcDesc;  // Processor descriptions
 
@@ -59,6 +60,11 @@ public:
     return TargetTriple;
   }
 
+  /// getCPU - Return the CPU string.
+  StringRef getCPU() const {
+    return CPU;
+  }
+
   /// getFeatureBits - Return the feature bits.
   ///
   uint64_t getFeatureBits() const {
@@ -136,6 +142,15 @@ public:
 
   /// Initialize an InstrItineraryData instance.
   void initInstrItins(InstrItineraryData &InstrItins) const;
+
+  /// Check whether the CPU string is valid.
+  bool isCPUStringValid(StringRef CPU) {
+    auto Found = std::find_if(ProcDesc.begin(), ProcDesc.end(),
+                              [=](const SubtargetFeatureKV &KV) {
+                                return CPU == KV.Key; 
+                              });
+    return Found != ProcDesc.end();
+  }
 };
 
 } // End llvm namespace
diff --git a/include/llvm/MC/MCSymbol.h b/include/llvm/MC/MCSymbol.h
index 0b3c3ce..53443b0 100644
--- a/include/llvm/MC/MCSymbol.h
+++ b/include/llvm/MC/MCSymbol.h
@@ -53,6 +53,9 @@ namespace llvm {
     /// "Lfoo" or ".foo".
     unsigned IsTemporary : 1;
 
+    /// \brief True if this symbol can be redefined.
+    unsigned IsRedefinable : 1;
+
     /// IsUsed - True if this symbol has been used.
     mutable unsigned IsUsed : 1;
 
@@ -61,10 +64,10 @@ namespace llvm {
     friend class MCContext;
     MCSymbol(StringRef name, bool isTemporary)
       : Name(name), Section(nullptr), Value(nullptr),
-        IsTemporary(isTemporary), IsUsed(false) {}
+        IsTemporary(isTemporary), IsRedefinable(false), IsUsed(false) {}
 
-    MCSymbol(const MCSymbol&) LLVM_DELETED_FUNCTION;
-    void operator=(const MCSymbol&) LLVM_DELETED_FUNCTION;
+    MCSymbol(const MCSymbol&) = delete;
+    void operator=(const MCSymbol&) = delete;
   public:
     /// getName - Get the symbol name.
     StringRef getName() const { return Name; }
@@ -79,6 +82,19 @@ namespace llvm {
     bool isUsed() const { return IsUsed; }
     void setUsed(bool Value) const { IsUsed = Value; }
 
+    /// \brief Check if this symbol is redefinable.
+    bool isRedefinable() const { return IsRedefinable; }
+    /// \brief Mark this symbol as redefinable.
+    void setRedefinable(bool Value) { IsRedefinable = Value; }
+    /// \brief Prepare this symbol to be redefined.
+    void redefineIfPossible() {
+      if (IsRedefinable) {
+        Value = nullptr;
+        Section = nullptr;
+        IsRedefinable = false;
+      }
+    }
+
     /// @}
     /// @name Associated Sections
     /// @{
diff --git a/include/llvm/MC/MCSymbolizer.h b/include/llvm/MC/MCSymbolizer.h
index cbbb591..20e7b81 100644
--- a/include/llvm/MC/MCSymbolizer.h
+++ b/include/llvm/MC/MCSymbolizer.h
@@ -38,8 +38,8 @@ class raw_ostream;
 /// operands are actually symbolizable, and in what way. I don't think this
 /// information exists right now.
 class MCSymbolizer {
-  MCSymbolizer(const MCSymbolizer &) LLVM_DELETED_FUNCTION;
-  void operator=(const MCSymbolizer &) LLVM_DELETED_FUNCTION;
+  MCSymbolizer(const MCSymbolizer &) = delete;
+  void operator=(const MCSymbolizer &) = delete;
 
 protected:
   MCContext &Ctx;
diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h
index cf92307..8412b1d 100644
--- a/include/llvm/MC/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCTargetAsmParser.h
@@ -13,7 +13,6 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
 #include "llvm/MC/MCTargetOptions.h"
-
 #include <memory>
 
 namespace llvm {
@@ -92,8 +91,8 @@ public:
   };
 
 private:
-  MCTargetAsmParser(const MCTargetAsmParser &) LLVM_DELETED_FUNCTION;
-  void operator=(const MCTargetAsmParser &) LLVM_DELETED_FUNCTION;
+  MCTargetAsmParser(const MCTargetAsmParser &) = delete;
+  void operator=(const MCTargetAsmParser &) = delete;
 protected: // Can only create subclasses.
   MCTargetAsmParser();
 
diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h
index de79bae..ce28a19 100644
--- a/include/llvm/MC/MCTargetOptions.h
+++ b/include/llvm/MC/MCTargetOptions.h
@@ -10,8 +10,12 @@
 #ifndef LLVM_MC_MCTARGETOPTIONS_H
 #define LLVM_MC_MCTARGETOPTIONS_H
 
+#include <string>
+
 namespace llvm {
 
+class StringRef;
+
 class MCTargetOptions {
 public:
   enum AsmInstrumentation {
@@ -31,6 +35,11 @@ public:
   bool ShowMCInst : 1;
   bool AsmVerbose : 1;
   int DwarfVersion;
+  /// getABIName - If this returns a non-empty string this represents the
+  /// textual name of the ABI that we want the backend to use, e.g. o32, or
+  /// aapcs-linux.
+  StringRef getABIName() const;
+  std::string ABIName;
   MCTargetOptions();
 };
 
@@ -45,7 +54,8 @@ inline bool operator==(const MCTargetOptions &LHS, const MCTargetOptions &RHS) {
           ARE_EQUAL(ShowMCEncoding) &&
           ARE_EQUAL(ShowMCInst) &&
           ARE_EQUAL(AsmVerbose) &&
-          ARE_EQUAL(DwarfVersion));
+          ARE_EQUAL(DwarfVersion) &&
+	  ARE_EQUAL(ABIName));
 #undef ARE_EQUAL
 }
 
diff --git a/include/llvm/MC/MCTargetOptionsCommandFlags.h b/include/llvm/MC/MCTargetOptionsCommandFlags.h
index 6d4eb0e..af23a92 100644
--- a/include/llvm/MC/MCTargetOptionsCommandFlags.h
+++ b/include/llvm/MC/MCTargetOptionsCommandFlags.h
@@ -15,8 +15,8 @@
 #ifndef LLVM_MC_MCTARGETOPTIONSCOMMANDFLAGS_H
 #define LLVM_MC_MCTARGETOPTIONSCOMMANDFLAGS_H
 
-#include "llvm/Support/CommandLine.h"
 #include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/CommandLine.h"
 using namespace llvm;
 
 cl::opt<MCTargetOptions::AsmInstrumentation> AsmInstrumentation(
@@ -40,6 +40,11 @@ cl::opt<bool> ShowMCInst("asm-show-inst",
                          cl::desc("Emit internal instruction representation to "
                                   "assembly file"));
 
+cl::opt<std::string>
+ABIName("target-abi", cl::Hidden,
+        cl::desc("The name of the ABI to be targeted from the backend."),
+        cl::init(""));
+
 static inline MCTargetOptions InitMCTargetOptionsFromFlags() {
   MCTargetOptions Options;
   Options.SanitizeAddress =
@@ -47,6 +52,7 @@ static inline MCTargetOptions InitMCTargetOptionsFromFlags() {
   Options.MCRelaxAll = RelaxAll;
   Options.DwarfVersion = DwarfVersion;
   Options.ShowMCInst = ShowMCInst;
+  Options.ABIName = ABIName;
   return Options;
 }
 
diff --git a/include/llvm/MC/MCValue.h b/include/llvm/MC/MCValue.h
index dd86979..175fed0 100644
--- a/include/llvm/MC/MCValue.h
+++ b/include/llvm/MC/MCValue.h
@@ -55,7 +55,7 @@ public:
   bool isAbsolute() const { return !SymA && !SymB; }
 
   /// print - Print the value to the stream \p OS.
-  void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
+  void print(raw_ostream &OS) const;
 
   /// dump - Print the value to stderr.
   void dump() const;
diff --git a/include/llvm/MC/MCWinCOFFObjectWriter.h b/include/llvm/MC/MCWinCOFFObjectWriter.h
index dad7bb5..956e436 100644
--- a/include/llvm/MC/MCWinCOFFObjectWriter.h
+++ b/include/llvm/MC/MCWinCOFFObjectWriter.h
@@ -11,10 +11,11 @@
 #define LLVM_MC_MCWINCOFFOBJECTWRITER_H
 
 namespace llvm {
-  class MCFixup;
-  class MCObjectWriter;
-  class MCValue;
-  class raw_ostream;
+class MCAsmBackend;
+class MCFixup;
+class MCObjectWriter;
+class MCValue;
+class raw_ostream;
 
   class MCWinCOFFObjectTargetWriter {
     virtual void anchor();
@@ -27,9 +28,9 @@ namespace llvm {
     virtual ~MCWinCOFFObjectTargetWriter() {}
 
     unsigned getMachine() const { return Machine; }
-    virtual unsigned getRelocType(const MCValue &Target,
-                                  const MCFixup &Fixup,
-                                  bool IsCrossSection) const = 0;
+    virtual unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+                                  bool IsCrossSection,
+                                  const MCAsmBackend &MAB) const = 0;
     virtual bool recordRelocation(const MCFixup &) const { return true; }
   };
 
diff --git a/include/llvm/MC/SectionKind.h b/include/llvm/MC/SectionKind.h
index 85a91c6..9e8b68f 100644
--- a/include/llvm/MC/SectionKind.h
+++ b/include/llvm/MC/SectionKind.h
@@ -55,7 +55,6 @@ class SectionKind {
         /// MergeableConst - These are sections for merging fixed-length
         /// constants together.  For example, this can be used to unique
         /// constant pool entries etc.
-        MergeableConst,
 
             /// MergeableConst4 - This is a section used by 4-byte constants,
             /// for example, floats.
@@ -151,8 +150,8 @@ public:
   bool isMergeable4ByteCString() const { return K == Mergeable4ByteCString; }
 
   bool isMergeableConst() const {
-    return K == MergeableConst || K == MergeableConst4 ||
-           K == MergeableConst8 || K == MergeableConst16;
+    return K == MergeableConst4 || K == MergeableConst8 ||
+           K == MergeableConst16;
   }
   bool isMergeableConst4() const { return K == MergeableConst4; }
   bool isMergeableConst8() const { return K == MergeableConst8; }
@@ -216,7 +215,6 @@ public:
   static SectionKind getMergeable4ByteCString() {
     return get(Mergeable4ByteCString);
   }
-  static SectionKind getMergeableConst() { return get(MergeableConst); }
   static SectionKind getMergeableConst4() { return get(MergeableConst4); }
   static SectionKind getMergeableConst8() { return get(MergeableConst8); }
   static SectionKind getMergeableConst16() { return get(MergeableConst16); }
diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h
index 7c03dcd..4f8e281 100644
--- a/include/llvm/Object/Archive.h
+++ b/include/llvm/Object/Archive.h
@@ -14,8 +14,8 @@
 #ifndef LLVM_OBJECT_ARCHIVE_H
 #define LLVM_OBJECT_ARCHIVE_H
 
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/Object/Binary.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
@@ -41,6 +41,9 @@ struct ArchiveMemberHeader {
 
   sys::fs::perms getAccessMode() const;
   sys::TimeValue getLastModified() const;
+  llvm::StringRef getRawLastModified() const {
+    return StringRef(LastModified, sizeof(LastModified)).rtrim(" ");
+  }
   unsigned getUID() const;
   unsigned getGID() const;
 };
@@ -78,17 +81,23 @@ public:
     sys::TimeValue getLastModified() const {
       return getHeader()->getLastModified();
     }
+    StringRef getRawLastModified() const {
+      return getHeader()->getRawLastModified();
+    }
     unsigned getUID() const { return getHeader()->getUID(); }
     unsigned getGID() const { return getHeader()->getGID(); }
     sys::fs::perms getAccessMode() const {
       return getHeader()->getAccessMode();
     }
     /// \return the size of the archive member without the header or padding.
-    uint64_t getSize() const { return Data.size() - StartOfFile; }
+    uint64_t getSize() const;
+    /// \return the size in the archive header for this member.
+    uint64_t getRawSize() const;
 
     StringRef getBuffer() const {
       return StringRef(Data.data() + StartOfFile, getSize());
     }
+    uint64_t getChildOffset() const;
 
     ErrorOr<MemoryBufferRef> getMemoryBufferRef() const;
 
@@ -169,13 +178,12 @@ public:
 
   enum Kind {
     K_GNU,
+    K_MIPS64,
     K_BSD,
     K_COFF
   };
 
-  Kind kind() const { 
-    return Format;
-  }
+  Kind kind() const { return (Kind)Format; }
 
   child_iterator child_begin(bool SkipInternal = true) const;
   child_iterator child_end() const;
@@ -196,12 +204,14 @@ public:
   child_iterator findSym(StringRef name) const;
 
   bool hasSymbolTable() const;
+  child_iterator getSymbolTableChild() const { return SymbolTable; }
 
 private:
   child_iterator SymbolTable;
   child_iterator StringTable;
   child_iterator FirstRegular;
-  Kind Format;
+  unsigned Format : 2;
+  unsigned IsThin : 1;
 };
 
 }
diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h
index 4b2b7e6..a3d6d0d 100644
--- a/include/llvm/Object/Binary.h
+++ b/include/llvm/Object/Binary.h
@@ -28,8 +28,8 @@ namespace object {
 
 class Binary {
 private:
-  Binary() LLVM_DELETED_FUNCTION;
-  Binary(const Binary &other) LLVM_DELETED_FUNCTION;
+  Binary() = delete;
+  Binary(const Binary &other) = delete;
 
   unsigned int TypeID;
 
diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h
index 3368d68..522bf68 100644
--- a/include/llvm/Object/COFF.h
+++ b/include/llvm/Object/COFF.h
@@ -276,12 +276,14 @@ public:
   }
 
   const StringTableOffset &getStringTableOffset() const {
+    assert(isSet() && "COFFSymbolRef points to nothing!");
     return CS16 ? CS16->Name.Offset : CS32->Name.Offset;
   }
 
   uint32_t getValue() const { return CS16 ? CS16->Value : CS32->Value; }
 
   int32_t getSectionNumber() const {
+    assert(isSet() && "COFFSymbolRef points to nothing!");
     if (CS16) {
       // Reserved sections are returned as negative numbers.
       if (CS16->SectionNumber <= COFF::MaxNumberOfSections16)
@@ -291,13 +293,18 @@ public:
     return static_cast<int32_t>(CS32->SectionNumber);
   }
 
-  uint16_t getType() const { return CS16 ? CS16->Type : CS32->Type; }
+  uint16_t getType() const {
+    assert(isSet() && "COFFSymbolRef points to nothing!");
+    return CS16 ? CS16->Type : CS32->Type;
+  }
 
   uint8_t getStorageClass() const {
+    assert(isSet() && "COFFSymbolRef points to nothing!");
     return CS16 ? CS16->StorageClass : CS32->StorageClass;
   }
 
   uint8_t getNumberOfAuxSymbols() const {
+    assert(isSet() && "COFFSymbolRef points to nothing!");
     return CS16 ? CS16->NumberOfAuxSymbols : CS32->NumberOfAuxSymbols;
   }
 
@@ -360,6 +367,8 @@ public:
   }
 
 private:
+  bool isSet() const { return CS16 || CS32; }
+
   const coff_symbol16 *CS16;
   const coff_symbol32 *CS32;
 };
@@ -593,9 +602,6 @@ protected:
   bool isSectionData(DataRefImpl Sec) const override;
   bool isSectionBSS(DataRefImpl Sec) const override;
   bool isSectionVirtual(DataRefImpl Sec) const override;
-  bool isSectionZeroInit(DataRefImpl Sec) const override;
-  bool isSectionReadOnlyData(DataRefImpl Sec) const override;
-  bool isSectionRequiredForExecution(DataRefImpl Sec) const override;
   bool sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb) const override;
   relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
   relocation_iterator section_rel_end(DataRefImpl Sec) const override;
diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
index 3fcd98d..e7eba97 100644
--- a/include/llvm/Object/ELFObjectFile.h
+++ b/include/llvm/Object/ELFObjectFile.h
@@ -48,6 +48,9 @@ public:
   virtual std::error_code getSymbolVersion(SymbolRef Symb, StringRef &Version,
                                            bool &IsDefault) const = 0;
 
+  virtual uint64_t getSectionFlags(SectionRef Sec) const = 0;
+  virtual uint32_t getSectionType(SectionRef Sec) const = 0;
+
   static inline bool classof(const Binary *v) { return v->isELF(); }
 };
 
@@ -97,10 +100,7 @@ protected:
   bool isSectionText(DataRefImpl Sec) const override;
   bool isSectionData(DataRefImpl Sec) const override;
   bool isSectionBSS(DataRefImpl Sec) const override;
-  bool isSectionRequiredForExecution(DataRefImpl Sec) const override;
   bool isSectionVirtual(DataRefImpl Sec) const override;
-  bool isSectionZeroInit(DataRefImpl Sec) const override;
-  bool isSectionReadOnlyData(DataRefImpl Sec) const override;
   bool sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb) const override;
   relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
   relocation_iterator section_rel_end(DataRefImpl Sec) const override;
@@ -177,6 +177,20 @@ protected:
     return DRI;
   }
 
+  bool isExportedToOtherDSO(const Elf_Sym *ESym) const {
+    unsigned char Binding = ESym->getBinding();
+    unsigned char Visibility = ESym->getVisibility();
+
+    // A symbol is exported if its binding is either GLOBAL or WEAK, and its
+    // visibility is either DEFAULT or PROTECTED. All other symbols are not
+    // exported.
+    if ((Binding == ELF::STB_GLOBAL || Binding == ELF::STB_WEAK) &&
+        (Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_PROTECTED))
+      return true;
+
+    return false;
+  }
+
   // This flag is used for classof, to distinguish ELFObjectFile from
   // its subclass. If more subclasses will be created, this flag will
   // have to become an enum.
@@ -201,6 +215,9 @@ public:
   std::error_code getSymbolVersion(SymbolRef Symb, StringRef &Version,
                                    bool &IsDefault) const override;
 
+  uint64_t getSectionFlags(SectionRef Sec) const override;
+  uint32_t getSectionType(SectionRef Sec) const override;
+
   uint8_t getBytesInAddress() const override;
   StringRef getFileFormatName() const override;
   unsigned getArch() const override;
@@ -262,6 +279,18 @@ std::error_code ELFObjectFile<ELFT>::getSymbolVersion(SymbolRef SymRef,
 }
 
 template <class ELFT>
+uint64_t ELFObjectFile<ELFT>::getSectionFlags(SectionRef Sec) const {
+  DataRefImpl DRI = Sec.getRawDataRefImpl();
+  return toELFShdrIter(DRI)->sh_flags;
+}
+
+template <class ELFT>
+uint32_t ELFObjectFile<ELFT>::getSectionType(SectionRef Sec) const {
+  DataRefImpl DRI = Sec.getRawDataRefImpl();
+  return toELFShdrIter(DRI)->sh_type;
+}
+
+template <class ELFT>
 std::error_code ELFObjectFile<ELFT>::getSymbolAddress(DataRefImpl Symb,
                                                       uint64_t &Result) const {
   const Elf_Sym *ESym = getSymbol(Symb);
@@ -280,12 +309,16 @@ std::error_code ELFObjectFile<ELFT>::getSymbolAddress(DataRefImpl Symb,
   const Elf_Ehdr *Header = EF.getHeader();
   Result = ESym->st_value;
 
-  // Clear the ARM/Thumb indicator flag.
-  if (Header->e_machine == ELF::EM_ARM && ESym->getType() == ELF::STT_FUNC)
+  // Clear the ARM/Thumb or microMIPS indicator flag.
+  if ((Header->e_machine == ELF::EM_ARM || Header->e_machine == ELF::EM_MIPS) &&
+      ESym->getType() == ELF::STT_FUNC)
     Result &= ~1;
 
-  if (Header->e_type == ELF::ET_REL)
-    Result += EF.getSection(ESym)->sh_addr;
+  if (Header->e_type == ELF::ET_REL) {
+    const typename ELFFile<ELFT>::Elf_Shdr * Section = EF.getSection(ESym);
+    if (Section != nullptr)
+      Result += Section->sh_addr;
+  }
 
   return object_error::success;
 }
@@ -373,6 +406,9 @@ uint32_t ELFObjectFile<ELFT>::getSymbolFlags(DataRefImpl Symb) const {
       EF.getSymbolTableIndex(ESym) == ELF::SHN_COMMON)
     Result |= SymbolRef::SF_Common;
 
+  if (isExportedToOtherDSO(ESym))
+    Result |= SymbolRef::SF_Exported;
+
   return Result;
 }
 
@@ -451,27 +487,11 @@ bool ELFObjectFile<ELFT>::isSectionBSS(DataRefImpl Sec) const {
 }
 
 template <class ELFT>
-bool ELFObjectFile<ELFT>::isSectionRequiredForExecution(DataRefImpl Sec) const {
-  return toELFShdrIter(Sec)->sh_flags & ELF::SHF_ALLOC;
-}
-
-template <class ELFT>
 bool ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec) const {
   return toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS;
 }
 
 template <class ELFT>
-bool ELFObjectFile<ELFT>::isSectionZeroInit(DataRefImpl Sec) const {
-  return toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS;
-}
-
-template <class ELFT>
-bool ELFObjectFile<ELFT>::isSectionReadOnlyData(DataRefImpl Sec) const {
-  Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
-  return !(EShdr->sh_flags & (ELF::SHF_WRITE | ELF::SHF_EXECINSTR));
-}
-
-template <class ELFT>
 bool ELFObjectFile<ELFT>::sectionContainsSymbol(DataRefImpl Sec,
                                                 DataRefImpl Symb) const {
   Elf_Sym_Iter ESym = toELFSymIter(Symb);
diff --git a/include/llvm/Object/ELFTypes.h b/include/llvm/Object/ELFTypes.h
index 4bc0c7c..9a97f7b 100644
--- a/include/llvm/Object/ELFTypes.h
+++ b/include/llvm/Object/ELFTypes.h
@@ -302,7 +302,10 @@ struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, false>, false> {
     assert(!isMips64EL);
     return r_info;
   }
-  void setRInfo(uint32_t R) { r_info = R; }
+  void setRInfo(uint32_t R, bool IsMips64EL) {
+    assert(!IsMips64EL);
+    r_info = R;
+  }
 };
 
 template <endianness TargetEndianness, std::size_t MaxAlign>
@@ -321,9 +324,12 @@ struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, true>, false> {
     return (t << 32) | ((t >> 8) & 0xff000000) | ((t >> 24) & 0x00ff0000) |
            ((t >> 40) & 0x0000ff00) | ((t >> 56) & 0x000000ff);
   }
-  void setRInfo(uint64_t R) {
-    // FIXME: Add mips64el support.
-    r_info = R;
+  void setRInfo(uint64_t R, bool IsMips64EL) {
+    if (IsMips64EL)
+      r_info = (R >> 32) | ((R & 0xff000000) << 8) | ((R & 0x00ff0000) << 24) |
+               ((R & 0x0000ff00) << 40) | ((R & 0x000000ff) << 56);
+    else
+      r_info = R;
   }
 };
 
@@ -338,7 +344,10 @@ struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, false>, true> {
     assert(!isMips64EL);
     return r_info;
   }
-  void setRInfo(uint32_t R) { r_info = R; }
+  void setRInfo(uint32_t R, bool IsMips64EL) {
+    assert(!IsMips64EL);
+    r_info = R;
+  }
 };
 
 template <endianness TargetEndianness, std::size_t MaxAlign>
@@ -358,9 +367,12 @@ struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, true>, true> {
     return (t << 32) | ((t >> 8) & 0xff000000) | ((t >> 24) & 0x00ff0000) |
            ((t >> 40) & 0x0000ff00) | ((t >> 56) & 0x000000ff);
   }
-  void setRInfo(uint64_t R) {
-    // FIXME: Add mips64el support.
-    r_info = R;
+  void setRInfo(uint64_t R, bool IsMips64EL) {
+    if (IsMips64EL)
+      r_info = (R >> 32) | ((R & 0xff000000) << 8) | ((R & 0x00ff0000) << 24) |
+               ((R & 0x0000ff00) << 40) | ((R & 0x000000ff) << 56);
+    else
+      r_info = R;
   }
 };
 
@@ -380,10 +392,14 @@ struct Elf_Rel_Impl<ELFType<TargetEndianness, MaxAlign, true>,
   uint32_t getType(bool isMips64EL) const {
     return (uint32_t)(this->getRInfo(isMips64EL) & 0xffffffffL);
   }
-  void setSymbol(uint32_t s) { setSymbolAndType(s, getType()); }
-  void setType(uint32_t t) { setSymbolAndType(getSymbol(), t); }
-  void setSymbolAndType(uint32_t s, uint32_t t) {
-    this->setRInfo(((uint64_t)s << 32) + (t & 0xffffffffL));
+  void setSymbol(uint32_t s, bool IsMips64EL) {
+    setSymbolAndType(s, getType(), IsMips64EL);
+  }
+  void setType(uint32_t t, bool IsMips64EL) {
+    setSymbolAndType(getSymbol(), t, IsMips64EL);
+  }
+  void setSymbolAndType(uint32_t s, uint32_t t, bool IsMips64EL) {
+    this->setRInfo(((uint64_t)s << 32) + (t & 0xffffffffL), IsMips64EL);
   }
 };
 
@@ -401,10 +417,14 @@ struct Elf_Rel_Impl<ELFType<TargetEndianness, MaxAlign, false>,
   unsigned char getType(bool isMips64EL) const {
     return (unsigned char)(this->getRInfo(isMips64EL) & 0x0ff);
   }
-  void setSymbol(uint32_t s) { setSymbolAndType(s, getType()); }
-  void setType(unsigned char t) { setSymbolAndType(getSymbol(), t); }
-  void setSymbolAndType(uint32_t s, unsigned char t) {
-    this->setRInfo((s << 8) + t);
+  void setSymbol(uint32_t s, bool IsMips64EL) {
+    setSymbolAndType(s, getType(), IsMips64EL);
+  }
+  void setType(unsigned char t, bool IsMips64EL) {
+    setSymbolAndType(getSymbol(), t, IsMips64EL);
+  }
+  void setSymbolAndType(uint32_t s, unsigned char t, bool IsMips64EL) {
+    this->setRInfo((s << 8) + t, IsMips64EL);
   }
 };
 
diff --git a/include/llvm/Object/ELFYAML.h b/include/llvm/Object/ELFYAML.h
index 687611d..45cbdbd 100644
--- a/include/llvm/Object/ELFYAML.h
+++ b/include/llvm/Object/ELFYAML.h
@@ -40,7 +40,8 @@ LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_ELFOSABI)
 // Just use 64, since it can hold 32-bit values too.
 LLVM_YAML_STRONG_TYPEDEF(uint64_t, ELF_EF)
 LLVM_YAML_STRONG_TYPEDEF(uint32_t, ELF_SHT)
-LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_REL)
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, ELF_REL)
+LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_RSS)
 // Just use 64, since it can hold 32-bit values too.
 LLVM_YAML_STRONG_TYPEDEF(uint64_t, ELF_SHF)
 LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STT)
@@ -71,14 +72,20 @@ struct LocalGlobalWeakSymbols {
   std::vector<Symbol> Global;
   std::vector<Symbol> Weak;
 };
+
+struct SectionOrType {
+  StringRef sectionNameOrType;
+};
+
 struct Section {
-  enum class SectionKind { RawContent, Relocation };
+  enum class SectionKind { Group, RawContent, Relocation };
   SectionKind Kind;
   StringRef Name;
   ELF_SHT Type;
   ELF_SHF Flags;
   llvm::yaml::Hex64 Address;
   StringRef Link;
+  StringRef Info;
   llvm::yaml::Hex64 AddressAlign;
   Section(SectionKind Kind) : Kind(Kind) {}
   virtual ~Section();
@@ -91,6 +98,17 @@ struct RawContentSection : Section {
     return S->Kind == SectionKind::RawContent;
   }
 };
+
+struct Group : Section {
+  // Members of a group contain a flag and a list of section indices
+  // that are part of the group.
+  std::vector<SectionOrType> Members;
+  Group() : Section(SectionKind::Group) {}
+  static bool classof(const Section *S) {
+    return S->Kind == SectionKind::Group;
+  }
+};
+
 struct Relocation {
   llvm::yaml::Hex64 Offset;
   int64_t Addend;
@@ -98,7 +116,6 @@ struct Relocation {
   StringRef Symbol;
 };
 struct RelocationSection : Section {
-  StringRef Info;
   std::vector<Relocation> Relocations;
   RelocationSection() : Section(SectionKind::Relocation) {}
   static bool classof(const Section *S) {
@@ -121,6 +138,7 @@ struct Object {
 LLVM_YAML_IS_SEQUENCE_VECTOR(std::unique_ptr<llvm::ELFYAML::Section>)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::Symbol)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::Relocation)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::SectionOrType)
 
 namespace llvm {
 namespace yaml {
@@ -186,6 +204,11 @@ struct ScalarEnumerationTraits<ELFYAML::ELF_REL> {
 };
 
 template <>
+struct ScalarEnumerationTraits<ELFYAML::ELF_RSS> {
+  static void enumeration(IO &IO, ELFYAML::ELF_RSS &Value);
+};
+
+template <>
 struct MappingTraits<ELFYAML::FileHeader> {
   static void mapping(IO &IO, ELFYAML::FileHeader &FileHdr);
 };
@@ -215,6 +238,10 @@ struct MappingTraits<ELFYAML::Object> {
   static void mapping(IO &IO, ELFYAML::Object &Object);
 };
 
+template <> struct MappingTraits<ELFYAML::SectionOrType> {
+  static void mapping(IO &IO, ELFYAML::SectionOrType &sectionOrType);
+};
+
 } // end namespace yaml
 } // end namespace llvm
 
diff --git a/include/llvm/Object/IRObjectFile.h b/include/llvm/Object/IRObjectFile.h
index b650d5d..74f4666 100644
--- a/include/llvm/Object/IRObjectFile.h
+++ b/include/llvm/Object/IRObjectFile.h
@@ -36,7 +36,10 @@ public:
   std::error_code printSymbolName(raw_ostream &OS,
                                   DataRefImpl Symb) const override;
   uint32_t getSymbolFlags(DataRefImpl Symb) const override;
-  const GlobalValue *getSymbolGV(DataRefImpl Symb) const;
+  GlobalValue *getSymbolGV(DataRefImpl Symb);
+  const GlobalValue *getSymbolGV(DataRefImpl Symb) const {
+    return const_cast<IRObjectFile *>(this)->getSymbolGV(Symb);
+  }
   basic_symbol_iterator symbol_begin_impl() const override;
   basic_symbol_iterator symbol_end_impl() const override;
 
@@ -46,6 +49,7 @@ public:
   Module &getModule() {
     return *M;
   }
+  std::unique_ptr<Module> takeModule();
 
   static inline bool classof(const Binary *v) {
     return v->isIR();
@@ -61,8 +65,8 @@ public:
   static ErrorOr<MemoryBufferRef>
   findBitcodeInMemBuffer(MemoryBufferRef Object);
 
-  static ErrorOr<std::unique_ptr<IRObjectFile>>
-  createIRObjectFile(MemoryBufferRef Object, LLVMContext &Context);
+  static ErrorOr<std::unique_ptr<IRObjectFile>> create(MemoryBufferRef Object,
+                                                       LLVMContext &Context);
 };
 }
 }
diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h
index 768cda6..a552aca 100644
--- a/include/llvm/Object/MachO.h
+++ b/include/llvm/Object/MachO.h
@@ -200,6 +200,7 @@ public:
 
   // MachO specific.
   std::error_code getIndirectName(DataRefImpl Symb, StringRef &Res) const;
+  unsigned getSectionType(SectionRef Sec) const;
 
   std::error_code getSymbolAddress(DataRefImpl Symb,
                                    uint64_t &Res) const override;
@@ -223,10 +224,7 @@ public:
   bool isSectionText(DataRefImpl Sec) const override;
   bool isSectionData(DataRefImpl Sec) const override;
   bool isSectionBSS(DataRefImpl Sec) const override;
-  bool isSectionRequiredForExecution(DataRefImpl Sec) const override;
   bool isSectionVirtual(DataRefImpl Sec) const override;
-  bool isSectionZeroInit(DataRefImpl Sec) const override;
-  bool isSectionReadOnlyData(DataRefImpl Sec) const override;
   bool sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb) const override;
   relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
   relocation_iterator section_rel_end(DataRefImpl Sec) const override;
@@ -348,8 +346,8 @@ public:
   getSegmentLoadCommand(const LoadCommandInfo &L) const;
   MachO::segment_command_64
   getSegment64LoadCommand(const LoadCommandInfo &L) const;
-  MachO::linker_options_command
-  getLinkerOptionsLoadCommand(const LoadCommandInfo &L) const;
+  MachO::linker_option_command
+  getLinkerOptionLoadCommand(const LoadCommandInfo &L) const;
   MachO::version_min_command
   getVersionMinLoadCommand(const LoadCommandInfo &L) const;
   MachO::dylib_command
@@ -360,10 +358,30 @@ public:
   getDylinkerCommand(const LoadCommandInfo &L) const;
   MachO::uuid_command
   getUuidCommand(const LoadCommandInfo &L) const;
+  MachO::rpath_command
+  getRpathCommand(const LoadCommandInfo &L) const;
   MachO::source_version_command
   getSourceVersionCommand(const LoadCommandInfo &L) const;
   MachO::entry_point_command
   getEntryPointCommand(const LoadCommandInfo &L) const;
+  MachO::encryption_info_command
+  getEncryptionInfoCommand(const LoadCommandInfo &L) const;
+  MachO::encryption_info_command_64
+  getEncryptionInfoCommand64(const LoadCommandInfo &L) const;
+  MachO::sub_framework_command
+  getSubFrameworkCommand(const LoadCommandInfo &L) const;
+  MachO::sub_umbrella_command
+  getSubUmbrellaCommand(const LoadCommandInfo &L) const;
+  MachO::sub_library_command
+  getSubLibraryCommand(const LoadCommandInfo &L) const;
+  MachO::sub_client_command
+  getSubClientCommand(const LoadCommandInfo &L) const;
+  MachO::routines_command
+  getRoutinesCommand(const LoadCommandInfo &L) const;
+  MachO::routines_command_64
+  getRoutinesCommand64(const LoadCommandInfo &L) const;
+  MachO::thread_command
+  getThreadCommand(const LoadCommandInfo &L) const;
 
   MachO::any_relocation_info getRelocation(DataRefImpl Rel) const;
   MachO::data_in_code_entry getDice(DataRefImpl Rel) const;
@@ -377,6 +395,7 @@ public:
   MachO::symtab_command getSymtabLoadCommand() const;
   MachO::dysymtab_command getDysymtabLoadCommand() const;
   MachO::linkedit_data_command getDataInCodeLoadCommand() const;
+  MachO::linkedit_data_command getLinkOptHintsLoadCommand() const;
   ArrayRef<uint8_t> getDyldInfoRebaseOpcodes() const;
   ArrayRef<uint8_t> getDyldInfoBindOpcodes() const;
   ArrayRef<uint8_t> getDyldInfoWeakBindOpcodes() const;
@@ -419,6 +438,7 @@ private:
   const char *SymtabLoadCmd;
   const char *DysymtabLoadCmd;
   const char *DataInCodeLoadCmd;
+  const char *LinkOptHintsLoadCmd;
   const char *DyldInfoLoadCmd;
   const char *UuidLoadCmd;
   bool HasPageZeroSegment;
diff --git a/include/llvm/Object/MachOUniversal.h b/include/llvm/Object/MachOUniversal.h
index 46cf3fb..93f6654 100644
--- a/include/llvm/Object/MachOUniversal.h
+++ b/include/llvm/Object/MachOUniversal.h
@@ -16,8 +16,8 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/Object/Binary.h"
 #include "llvm/Object/Archive.h"
+#include "llvm/Object/Binary.h"
 #include "llvm/Object/MachO.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/MachO.h"
@@ -51,6 +51,10 @@ public:
 
     ObjectForArch getNext() const { return ObjectForArch(Parent, Index + 1); }
     uint32_t getCPUType() const { return Header.cputype; }
+    uint32_t getCPUSubType() const { return Header.cpusubtype; }
+    uint32_t getOffset() const { return Header.offset; }
+    uint32_t getSize() const { return Header.size; }
+    uint32_t getAlign() const { return Header.align; }
     std::string getArchTypeName() const {
       Triple T = MachOObjectFile::getArch(Header.cputype, Header.cpusubtype);
       return T.getArchName();
@@ -58,7 +62,7 @@ public:
 
     ErrorOr<std::unique_ptr<MachOObjectFile>> getAsObjectFile() const;
 
-    std::error_code getAsArchive(std::unique_ptr<Archive> &Result) const;
+    ErrorOr<std::unique_ptr<Archive>> getAsArchive() const;
   };
 
   class object_iterator {
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index 68b873a..b7e19b9 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h
@@ -105,10 +105,7 @@ public:
   bool isText() const;
   bool isData() const;
   bool isBSS() const;
-  bool isRequiredForExecution() const;
   bool isVirtual() const;
-  bool isZeroInit() const;
-  bool isReadOnlyData() const;
 
   bool containsSymbol(SymbolRef S) const;
 
@@ -121,6 +118,7 @@ public:
   section_iterator getRelocatedSection() const;
 
   DataRefImpl getRawDataRefImpl() const;
+  const ObjectFile *getObject() const;
 };
 
 /// SymbolRef - This is a value type class that represents a single symbol in
@@ -182,8 +180,8 @@ public:
 /// figures out which type to create.
 class ObjectFile : public SymbolicFile {
   virtual void anchor();
-  ObjectFile() LLVM_DELETED_FUNCTION;
-  ObjectFile(const ObjectFile &other) LLVM_DELETED_FUNCTION;
+  ObjectFile() = delete;
+  ObjectFile(const ObjectFile &other) = delete;
 
 protected:
   ObjectFile(unsigned int Type, MemoryBufferRef Source);
@@ -233,11 +231,8 @@ protected:
   virtual bool isSectionText(DataRefImpl Sec) const = 0;
   virtual bool isSectionData(DataRefImpl Sec) const = 0;
   virtual bool isSectionBSS(DataRefImpl Sec) const = 0;
-  virtual bool isSectionRequiredForExecution(DataRefImpl Sec) const = 0;
   // A section is 'virtual' if its contents aren't present in the object image.
   virtual bool isSectionVirtual(DataRefImpl Sec) const = 0;
-  virtual bool isSectionZeroInit(DataRefImpl Sec) const = 0;
-  virtual bool isSectionReadOnlyData(DataRefImpl Sec) const = 0;
   virtual bool sectionContainsSymbol(DataRefImpl Sec,
                                      DataRefImpl Symb) const = 0;
   virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const = 0;
@@ -417,22 +412,10 @@ inline bool SectionRef::isBSS() const {
   return OwningObject->isSectionBSS(SectionPimpl);
 }
 
-inline bool SectionRef::isRequiredForExecution() const {
-  return OwningObject->isSectionRequiredForExecution(SectionPimpl);
-}
-
 inline bool SectionRef::isVirtual() const {
   return OwningObject->isSectionVirtual(SectionPimpl);
 }
 
-inline bool SectionRef::isZeroInit() const {
-  return OwningObject->isSectionZeroInit(SectionPimpl);
-}
-
-inline bool SectionRef::isReadOnlyData() const {
-  return OwningObject->isSectionReadOnlyData(SectionPimpl);
-}
-
 inline bool SectionRef::containsSymbol(SymbolRef S) const {
   return OwningObject->sectionContainsSymbol(SectionPimpl,
                                              S.getRawDataRefImpl());
@@ -454,6 +437,10 @@ inline DataRefImpl SectionRef::getRawDataRefImpl() const {
   return SectionPimpl;
 }
 
+inline const ObjectFile *SectionRef::getObject() const {
+  return OwningObject;
+}
+
 /// RelocationRef
 inline RelocationRef::RelocationRef(DataRefImpl RelocationP,
                               const ObjectFile *Owner)
diff --git a/include/llvm/Object/SymbolicFile.h b/include/llvm/Object/SymbolicFile.h
index 435799a..f7b7cb4 100644
--- a/include/llvm/Object/SymbolicFile.h
+++ b/include/llvm/Object/SymbolicFile.h
@@ -87,9 +87,10 @@ public:
     SF_Absolute = 1U << 3,       // Absolute symbol
     SF_Common = 1U << 4,         // Symbol has common linkage
     SF_Indirect = 1U << 5,       // Symbol is an alias to another symbol
-    SF_FormatSpecific = 1U << 6, // Specific to the object file format
+    SF_Exported = 1U << 6,       // Symbol is visible to other DSOs
+    SF_FormatSpecific = 1U << 7, // Specific to the object file format
                                  // (e.g. section symbols)
-    SF_Thumb = 1U << 7           // Thumb symbol in a 32-bit ARM binary
+    SF_Thumb = 1U << 8,          // Thumb symbol in a 32-bit ARM binary
   };
 
   BasicSymbolRef() : OwningObject(nullptr) { }
diff --git a/include/llvm/Option/Arg.h b/include/llvm/Option/Arg.h
index dcaa540..9459a3d 100644
--- a/include/llvm/Option/Arg.h
+++ b/include/llvm/Option/Arg.h
@@ -29,8 +29,8 @@ class ArgList;
 /// The Arg class encodes just enough information to be able to
 /// derive the argument values efficiently.
 class Arg {
-  Arg(const Arg &) LLVM_DELETED_FUNCTION;
-  void operator=(const Arg &) LLVM_DELETED_FUNCTION;
+  Arg(const Arg &) = delete;
+  void operator=(const Arg &) = delete;
 
 private:
   /// \brief The option this argument is an instance of.
diff --git a/include/llvm/Option/ArgList.h b/include/llvm/Option/ArgList.h
index 3f8547e..f5ded94 100644
--- a/include/llvm/Option/ArgList.h
+++ b/include/llvm/Option/ArgList.h
@@ -91,8 +91,8 @@ public:
 /// and to iterate over groups of arguments.
 class ArgList {
 private:
-  ArgList(const ArgList &) LLVM_DELETED_FUNCTION;
-  void operator=(const ArgList &) LLVM_DELETED_FUNCTION;
+  ArgList(const ArgList &) = delete;
+  void operator=(const ArgList &) = delete;
 
 public:
   typedef SmallVector<Arg*, 16> arglist_type;
diff --git a/include/llvm/Option/OptSpecifier.h b/include/llvm/Option/OptSpecifier.h
index b7caa6e..7206bf2 100644
--- a/include/llvm/Option/OptSpecifier.h
+++ b/include/llvm/Option/OptSpecifier.h
@@ -21,7 +21,7 @@ namespace opt {
     unsigned ID;
 
   private:
-    explicit OptSpecifier(bool) LLVM_DELETED_FUNCTION;
+    explicit OptSpecifier(bool) = delete;
 
   public:
     OptSpecifier() : ID(0) {}
diff --git a/include/llvm/Pass.h b/include/llvm/Pass.h
index c2b9f95..6e92d96 100644
--- a/include/llvm/Pass.h
+++ b/include/llvm/Pass.h
@@ -83,8 +83,8 @@ class Pass {
   AnalysisResolver *Resolver;  // Used to resolve analysis
   const void *PassID;
   PassKind Kind;
-  void operator=(const Pass&) LLVM_DELETED_FUNCTION;
-  Pass(const Pass &) LLVM_DELETED_FUNCTION;
+  void operator=(const Pass&) = delete;
+  Pass(const Pass &) = delete;
 
 public:
   explicit Pass(PassKind K, char &pid)
diff --git a/include/llvm/PassAnalysisSupport.h b/include/llvm/PassAnalysisSupport.h
index 9164305..38adb2d 100644
--- a/include/llvm/PassAnalysisSupport.h
+++ b/include/llvm/PassAnalysisSupport.h
@@ -120,7 +120,7 @@ public:
 class PMDataManager;
 class AnalysisResolver {
 private:
-  AnalysisResolver() LLVM_DELETED_FUNCTION;
+  AnalysisResolver() = delete;
 
 public:
   explicit AnalysisResolver(PMDataManager &P) : PM(P) { }
diff --git a/include/llvm/PassInfo.h b/include/llvm/PassInfo.h
index d53daf1..d107618 100644
--- a/include/llvm/PassInfo.h
+++ b/include/llvm/PassInfo.h
@@ -138,8 +138,8 @@ public:
   }
 
 private:
-  void operator=(const PassInfo &) LLVM_DELETED_FUNCTION;
-  PassInfo(const PassInfo &) LLVM_DELETED_FUNCTION;
+  void operator=(const PassInfo &) = delete;
+  PassInfo(const PassInfo &) = delete;
 };
 
 }
diff --git a/include/llvm/PassManager.h b/include/llvm/PassManager.h
deleted file mode 100644
index 2a191b3..0000000
--- a/include/llvm/PassManager.h
+++ /dev/null
@@ -1,39 +0,0 @@
-//===- llvm/PassManager.h - Container for Passes ----------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a legacy redirect header for the old PassManager. It is intended to
-// be used by clients that have not been converted to be aware of the new pass
-// management infrastructure being built for LLVM, which is every client
-// initially. Eventually this header (and the legacy management layer) will go
-// away, but we want to minimize changes to out-of-tree users of LLVM in the
-// interim.
-//
-// Note that this header *must not* be included into the same file as the new
-// pass management infrastructure is included. Things will break spectacularly.
-// If you are starting that conversion, you should switch to explicitly
-// including LegacyPassManager.h and using the legacy namespace.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_PASSMANAGER_H
-#define LLVM_PASSMANAGER_H
-
-#include "llvm/IR/LegacyPassManager.h"
-
-namespace llvm {
-
-// Pull these into the llvm namespace so that existing code that expects it
-// there can find it.
-using legacy::PassManagerBase;
-using legacy::PassManager;
-using legacy::FunctionPassManager;
-
-}
-
-#endif
diff --git a/include/llvm/ProfileData/CoverageMapping.h b/include/llvm/ProfileData/CoverageMapping.h
index 38fc8ca..4d393b3 100644
--- a/include/llvm/ProfileData/CoverageMapping.h
+++ b/include/llvm/ProfileData/CoverageMapping.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/iterator.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/raw_ostream.h"
 #include <system_error>
@@ -27,7 +28,7 @@ namespace llvm {
 class IndexedInstrProfReader;
 namespace coverage {
 
-class ObjectFileCoverageMappingReader;
+class CoverageMappingReader;
 
 class CoverageMapping;
 struct CounterExpressions;
@@ -62,8 +63,12 @@ public:
 
   unsigned getExpressionID() const { return ID; }
 
-  bool operator==(const Counter &Other) const {
-    return Kind == Other.Kind && ID == Other.ID;
+  friend bool operator==(const Counter &LHS, const Counter &RHS) {
+    return LHS.Kind == RHS.Kind && LHS.ID == RHS.ID;
+  }
+
+  friend bool operator!=(const Counter &LHS, const Counter &RHS) {
+    return !(LHS == RHS);
   }
 
   friend bool operator<(const Counter &LHS, const Counter &RHS) {
@@ -153,24 +158,40 @@ struct CounterMappingRegion {
     SkippedRegion
   };
 
-  static const unsigned EncodingHasCodeBeforeBits = 1;
-
   Counter Count;
   unsigned FileID, ExpandedFileID;
   unsigned LineStart, ColumnStart, LineEnd, ColumnEnd;
   RegionKind Kind;
-  /// \brief A flag that is set to true when there is already code before
-  /// this region on the same line.
-  /// This is useful to accurately compute the execution counts for a line.
-  bool HasCodeBefore;
-
-  CounterMappingRegion(Counter Count, unsigned FileID, unsigned LineStart,
-                       unsigned ColumnStart, unsigned LineEnd,
-                       unsigned ColumnEnd, bool HasCodeBefore = false,
-                       RegionKind Kind = CodeRegion)
-      : Count(Count), FileID(FileID), ExpandedFileID(0), LineStart(LineStart),
-        ColumnStart(ColumnStart), LineEnd(LineEnd), ColumnEnd(ColumnEnd),
-        Kind(Kind), HasCodeBefore(HasCodeBefore) {}
+
+  CounterMappingRegion(Counter Count, unsigned FileID, unsigned ExpandedFileID,
+                       unsigned LineStart, unsigned ColumnStart,
+                       unsigned LineEnd, unsigned ColumnEnd, RegionKind Kind)
+      : Count(Count), FileID(FileID), ExpandedFileID(ExpandedFileID),
+        LineStart(LineStart), ColumnStart(ColumnStart), LineEnd(LineEnd),
+        ColumnEnd(ColumnEnd), Kind(Kind) {}
+
+  static CounterMappingRegion
+  makeRegion(Counter Count, unsigned FileID, unsigned LineStart,
+             unsigned ColumnStart, unsigned LineEnd, unsigned ColumnEnd) {
+    return CounterMappingRegion(Count, FileID, 0, LineStart, ColumnStart,
+                                LineEnd, ColumnEnd, CodeRegion);
+  }
+
+  static CounterMappingRegion
+  makeExpansion(unsigned FileID, unsigned ExpandedFileID, unsigned LineStart,
+                unsigned ColumnStart, unsigned LineEnd, unsigned ColumnEnd) {
+    return CounterMappingRegion(Counter(), FileID, ExpandedFileID, LineStart,
+                                ColumnStart, LineEnd, ColumnEnd,
+                                ExpansionRegion);
+  }
+
+  static CounterMappingRegion
+  makeSkipped(unsigned FileID, unsigned LineStart, unsigned ColumnStart,
+              unsigned LineEnd, unsigned ColumnEnd) {
+    return CounterMappingRegion(Counter(), FileID, 0, LineStart, ColumnStart,
+                                LineEnd, ColumnEnd, SkippedRegion);
+  }
+
 
   inline std::pair<unsigned, unsigned> startLoc() const {
     return std::pair<unsigned, unsigned>(LineStart, ColumnStart);
@@ -216,8 +237,10 @@ public:
                         ArrayRef<uint64_t> CounterValues = ArrayRef<uint64_t>())
       : Expressions(Expressions), CounterValues(CounterValues) {}
 
+  void setCounts(ArrayRef<uint64_t> Counts) { CounterValues = Counts; }
+
   void dump(const Counter &C, llvm::raw_ostream &OS) const;
-  void dump(const Counter &C) const { dump(C, llvm::outs()); }
+  void dump(const Counter &C) const { dump(C, dbgs()); }
 
   /// \brief Return the number of times that a region of code associated with
   /// this counter was executed.
@@ -235,10 +258,14 @@ struct FunctionRecord {
   /// \brief The number of times this function was executed.
   uint64_t ExecutionCount;
 
-  FunctionRecord(StringRef Name, ArrayRef<StringRef> Filenames,
-                 uint64_t ExecutionCount)
-      : Name(Name), Filenames(Filenames.begin(), Filenames.end()),
-        ExecutionCount(ExecutionCount) {}
+  FunctionRecord(StringRef Name, ArrayRef<StringRef> Filenames)
+      : Name(Name), Filenames(Filenames.begin(), Filenames.end()) {}
+
+  void pushRegion(CounterMappingRegion Region, uint64_t Count) {
+    if (CountedRegions.empty())
+      ExecutionCount = Count;
+    CountedRegions.emplace_back(Region, Count);
+  }
 };
 
 /// \brief Iterator over Functions, optionally filtered to a single file.
@@ -312,10 +339,22 @@ struct CoverageSegment {
   CoverageSegment(unsigned Line, unsigned Col, bool IsRegionEntry)
       : Line(Line), Col(Col), Count(0), HasCount(false),
         IsRegionEntry(IsRegionEntry) {}
+
+  CoverageSegment(unsigned Line, unsigned Col, uint64_t Count,
+                  bool IsRegionEntry)
+      : Line(Line), Col(Col), Count(Count), HasCount(true),
+        IsRegionEntry(IsRegionEntry) {}
+
+  friend bool operator==(const CoverageSegment &L, const CoverageSegment &R) {
+    return std::tie(L.Line, L.Col, L.Count, L.HasCount, L.IsRegionEntry) ==
+           std::tie(R.Line, R.Col, R.Count, R.HasCount, R.IsRegionEntry);
+  }
+
   void setCount(uint64_t NewCount) {
     Count = NewCount;
     HasCount = true;
   }
+
   void addCount(uint64_t NewCount) { setCount(Count + NewCount); }
 };
 
@@ -363,7 +402,7 @@ class CoverageMapping {
 public:
   /// \brief Load the coverage mapping using the given readers.
   static ErrorOr<std::unique_ptr<CoverageMapping>>
-  load(ObjectFileCoverageMappingReader &CoverageReader,
+  load(CoverageMappingReader &CoverageReader,
        IndexedInstrProfReader &ProfileReader);
 
   /// \brief Load the coverage mapping from the given files.
diff --git a/include/llvm/ProfileData/CoverageMappingReader.h b/include/llvm/ProfileData/CoverageMappingReader.h
index 73b0248..6714366 100644
--- a/include/llvm/ProfileData/CoverageMappingReader.h
+++ b/include/llvm/ProfileData/CoverageMappingReader.h
@@ -15,20 +15,19 @@
 #ifndef LLVM_PROFILEDATA_COVERAGEMAPPINGREADER_H
 #define LLVM_PROFILEDATA_COVERAGEMAPPINGREADER_H
 
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/ProfileData/CoverageMapping.h"
-#include "llvm/Object/ObjectFile.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/ProfileData/CoverageMapping.h"
+#include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/FileSystem.h"
-
+#include "llvm/Support/MemoryBuffer.h"
 #include <iterator>
 
 namespace llvm {
 namespace coverage {
 
-class ObjectFileCoverageMappingReader;
+class CoverageMappingReader;
 
 /// \brief Coverage mapping information for a single function.
 struct CoverageMappingRecord {
@@ -42,15 +41,14 @@ struct CoverageMappingRecord {
 /// \brief A file format agnostic iterator over coverage mapping data.
 class CoverageMappingIterator
     : public std::iterator<std::input_iterator_tag, CoverageMappingRecord> {
-  ObjectFileCoverageMappingReader *Reader;
+  CoverageMappingReader *Reader;
   CoverageMappingRecord Record;
 
   void increment();
 
 public:
   CoverageMappingIterator() : Reader(nullptr) {}
-  CoverageMappingIterator(ObjectFileCoverageMappingReader *Reader)
-      : Reader(Reader) {
+  CoverageMappingIterator(CoverageMappingReader *Reader) : Reader(Reader) {
     increment();
   }
 
@@ -68,6 +66,14 @@ public:
   CoverageMappingRecord *operator->() { return &Record; }
 };
 
+class CoverageMappingReader {
+public:
+  virtual std::error_code readNextRecord(CoverageMappingRecord &Record) = 0;
+  CoverageMappingIterator begin() { return CoverageMappingIterator(this); }
+  CoverageMappingIterator end() { return CoverageMappingIterator(); }
+  virtual ~CoverageMappingReader() {}
+};
+
 /// \brief Base class for the raw coverage mapping and filenames data readers.
 class RawCoverageReader {
 protected:
@@ -91,10 +97,9 @@ protected:
 class RawCoverageFilenamesReader : public RawCoverageReader {
   std::vector<StringRef> &Filenames;
 
-  RawCoverageFilenamesReader(const RawCoverageFilenamesReader &)
-      LLVM_DELETED_FUNCTION;
+  RawCoverageFilenamesReader(const RawCoverageFilenamesReader &) = delete;
   RawCoverageFilenamesReader &
-  operator=(const RawCoverageFilenamesReader &) LLVM_DELETED_FUNCTION;
+  operator=(const RawCoverageFilenamesReader &) = delete;
 
 public:
   RawCoverageFilenamesReader(StringRef Data, std::vector<StringRef> &Filenames)
@@ -105,29 +110,27 @@ public:
 
 /// \brief Reader for the raw coverage mapping data.
 class RawCoverageMappingReader : public RawCoverageReader {
-  StringRef FunctionName;
   ArrayRef<StringRef> TranslationUnitFilenames;
   std::vector<StringRef> &Filenames;
   std::vector<CounterExpression> &Expressions;
   std::vector<CounterMappingRegion> &MappingRegions;
 
-  RawCoverageMappingReader(const RawCoverageMappingReader &)
-      LLVM_DELETED_FUNCTION;
+  RawCoverageMappingReader(const RawCoverageMappingReader &) = delete;
   RawCoverageMappingReader &
-  operator=(const RawCoverageMappingReader &) LLVM_DELETED_FUNCTION;
+  operator=(const RawCoverageMappingReader &) = delete;
 
 public:
-  RawCoverageMappingReader(StringRef FunctionName, StringRef MappingData,
+  RawCoverageMappingReader(StringRef MappingData,
                            ArrayRef<StringRef> TranslationUnitFilenames,
                            std::vector<StringRef> &Filenames,
                            std::vector<CounterExpression> &Expressions,
                            std::vector<CounterMappingRegion> &MappingRegions)
-      : RawCoverageReader(MappingData), FunctionName(FunctionName),
+      : RawCoverageReader(MappingData),
         TranslationUnitFilenames(TranslationUnitFilenames),
         Filenames(Filenames), Expressions(Expressions),
         MappingRegions(MappingRegions) {}
 
-  std::error_code read(CoverageMappingRecord &Record);
+  std::error_code read();
 
 private:
   std::error_code decodeCounter(unsigned Value, Counter &C);
@@ -139,7 +142,7 @@ private:
 
 /// \brief Reader for the coverage mapping data that is emitted by the
 /// frontend and stored in an object file.
-class ObjectFileCoverageMappingReader {
+class BinaryCoverageReader : public CoverageMappingReader {
 public:
   struct ProfileMappingRecord {
     CoverageMappingVersion Version;
@@ -158,8 +161,6 @@ public:
   };
 
 private:
-  std::error_code LastError;
-  object::OwningBinary<object::ObjectFile> Object;
   std::vector<StringRef> Filenames;
   std::vector<ProfileMappingRecord> MappingRecords;
   size_t CurrentRecord;
@@ -167,40 +168,16 @@ private:
   std::vector<CounterExpression> Expressions;
   std::vector<CounterMappingRegion> MappingRegions;
 
-  ObjectFileCoverageMappingReader(const ObjectFileCoverageMappingReader &)
-      LLVM_DELETED_FUNCTION;
-  ObjectFileCoverageMappingReader &
-  operator=(const ObjectFileCoverageMappingReader &) LLVM_DELETED_FUNCTION;
-
-  /// \brief Set the current error_code and return same.
-  std::error_code error(std::error_code EC) {
-    LastError = EC;
-    return EC;
-  }
+  BinaryCoverageReader(const BinaryCoverageReader &) = delete;
+  BinaryCoverageReader &operator=(const BinaryCoverageReader &) = delete;
 
-  /// \brief Clear the current error code and return a successful one.
-  std::error_code success() { return error(instrprof_error::success); }
+  BinaryCoverageReader() : CurrentRecord(0) {}
 
 public:
-  ObjectFileCoverageMappingReader(StringRef FileName);
-  ObjectFileCoverageMappingReader(
-      std::unique_ptr<MemoryBuffer> &ObjectBuffer,
-      sys::fs::file_magic Type = sys::fs::file_magic::unknown);
-
-  std::error_code readHeader();
-  std::error_code readNextRecord(CoverageMappingRecord &Record);
-
-  /// Iterator over profile data.
-  CoverageMappingIterator begin() { return CoverageMappingIterator(this); }
-  CoverageMappingIterator end() { return CoverageMappingIterator(); }
+  static ErrorOr<std::unique_ptr<BinaryCoverageReader>>
+  create(std::unique_ptr<MemoryBuffer> &ObjectBuffer);
 
-  /// \brief Return true if the reader has finished reading the profile data.
-  bool isEOF() { return LastError == instrprof_error::eof; }
-  /// \brief Return true if the reader encountered an error reading profiling
-  /// data.
-  bool hasError() { return LastError && !isEOF(); }
-  /// \brief Get the current error code.
-  std::error_code getError() { return LastError; }
+  std::error_code readNextRecord(CoverageMappingRecord &Record) override;
 };
 
 } // end namespace coverage
diff --git a/include/llvm/ProfileData/CoverageMappingWriter.h b/include/llvm/ProfileData/CoverageMappingWriter.h
index cf16140..2e3b037 100644
--- a/include/llvm/ProfileData/CoverageMappingWriter.h
+++ b/include/llvm/ProfileData/CoverageMappingWriter.h
@@ -15,9 +15,9 @@
 #ifndef LLVM_PROFILEDATA_COVERAGEMAPPINGWRITER_H
 #define LLVM_PROFILEDATA_COVERAGEMAPPINGWRITER_H
 
-#include "llvm/ProfileData/CoverageMapping.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ProfileData/CoverageMapping.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h
index 38c5310..63a6ac6 100644
--- a/include/llvm/ProfileData/InstrProfReader.h
+++ b/include/llvm/ProfileData/InstrProfReader.h
@@ -18,12 +18,11 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/EndianStream.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/LineIterator.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/EndianStream.h"
 #include "llvm/Support/OnDiskHashTable.h"
-
 #include <iterator>
 
 namespace llvm {
@@ -96,6 +95,9 @@ public:
   /// Factory method to create an appropriately typed reader for the given
   /// instrprof file.
   static ErrorOr<std::unique_ptr<InstrProfReader>> create(std::string Path);
+
+  static ErrorOr<std::unique_ptr<InstrProfReader>>
+  create(std::unique_ptr<MemoryBuffer> Buffer);
 };
 
 /// Reader for the simple text based instrprof format.
@@ -115,9 +117,8 @@ private:
   /// The current set of counter values.
   std::vector<uint64_t> Counts;
 
-  TextInstrProfReader(const TextInstrProfReader &) LLVM_DELETED_FUNCTION;
-  TextInstrProfReader &operator=(const TextInstrProfReader &)
-    LLVM_DELETED_FUNCTION;
+  TextInstrProfReader(const TextInstrProfReader &) = delete;
+  TextInstrProfReader &operator=(const TextInstrProfReader &) = delete;
 public:
   TextInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer_)
       : DataBuffer(std::move(DataBuffer_)), Line(*DataBuffer, true, '#') {}
@@ -168,9 +169,8 @@ private:
   const char *NamesStart;
   const char *ProfileEnd;
 
-  RawInstrProfReader(const RawInstrProfReader &) LLVM_DELETED_FUNCTION;
-  RawInstrProfReader &operator=(const RawInstrProfReader &)
-    LLVM_DELETED_FUNCTION;
+  RawInstrProfReader(const RawInstrProfReader &) = delete;
+  RawInstrProfReader &operator=(const RawInstrProfReader &) = delete;
 public:
   RawInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
       : DataBuffer(std::move(DataBuffer)) { }
@@ -274,9 +274,8 @@ private:
   /// The maximal execution count among all functions.
   uint64_t MaxFunctionCount;
 
-  IndexedInstrProfReader(const IndexedInstrProfReader &) LLVM_DELETED_FUNCTION;
-  IndexedInstrProfReader &operator=(const IndexedInstrProfReader &)
-    LLVM_DELETED_FUNCTION;
+  IndexedInstrProfReader(const IndexedInstrProfReader &) = delete;
+  IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete;
 public:
   IndexedInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
       : DataBuffer(std::move(DataBuffer)), Index(nullptr), CurrentOffset(0) {}
@@ -296,8 +295,11 @@ public:
   uint64_t getMaximumFunctionCount() { return MaxFunctionCount; }
 
   /// Factory method to create an indexed reader.
-  static std::error_code
-  create(std::string Path, std::unique_ptr<IndexedInstrProfReader> &Result);
+  static ErrorOr<std::unique_ptr<IndexedInstrProfReader>>
+  create(std::string Path);
+
+  static ErrorOr<std::unique_ptr<IndexedInstrProfReader>>
+  create(std::unique_ptr<MemoryBuffer> Buffer);
 };
 
 } // end namespace llvm
diff --git a/include/llvm/ProfileData/InstrProfWriter.h b/include/llvm/ProfileData/InstrProfWriter.h
index e76f668..ce0bb52 100644
--- a/include/llvm/ProfileData/InstrProfWriter.h
+++ b/include/llvm/ProfileData/InstrProfWriter.h
@@ -20,8 +20,8 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/DataTypes.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
-
 #include <vector>
 
 namespace llvm {
@@ -42,8 +42,13 @@ public:
   std::error_code addFunctionCounts(StringRef FunctionName,
                                     uint64_t FunctionHash,
                                     ArrayRef<uint64_t> Counters);
-  /// Ensure that all data is written to disk.
+  /// Write the profile to \c OS
   void write(raw_fd_ostream &OS);
+  /// Write the profile, returning the raw data. For testing.
+  std::unique_ptr<MemoryBuffer> writeBuffer();
+
+private:
+  std::pair<uint64_t, uint64_t> writeImpl(raw_ostream &OS);
 };
 
 } // end namespace llvm
diff --git a/include/llvm/ProfileData/SampleProf.h b/include/llvm/ProfileData/SampleProf.h
index 5c70b31..df0a055 100644
--- a/include/llvm/ProfileData/SampleProf.h
+++ b/include/llvm/ProfileData/SampleProf.h
@@ -19,7 +19,6 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-
 #include <system_error>
 
 namespace llvm {
diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h
index c20b815..c082a1a 100644
--- a/include/llvm/ProfileData/SampleProfReader.h
+++ b/include/llvm/ProfileData/SampleProfReader.h
@@ -14,12 +14,12 @@
 #define LLVM_PROFILEDATA_SAMPLEPROFREADER_H
 
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/ProfileData/SampleProf.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
diff --git a/include/llvm/Support/ARMBuildAttributes.h b/include/llvm/Support/ARMBuildAttributes.h
index 07340de..96a8219 100644
--- a/include/llvm/Support/ARMBuildAttributes.h
+++ b/include/llvm/Support/ARMBuildAttributes.h
@@ -171,6 +171,8 @@ enum {
   WCharWidth4Bytes = 4, // sizeof(wchar_t) == 4
 
   // Tag_ABI_FP_denormal, (=20), uleb128
+  PositiveZero = 0,
+  IEEEDenormals = 1,
   PreserveFPSign = 2, // sign when flushed-to-zero is preserved
 
   // Tag_ABI_FP_number_model, (=23), uleb128
@@ -198,6 +200,9 @@ enum {
   // Tag_FP_HP_extension, (=36), uleb128
   AllowHPFP = 1, // Allow use of Half Precision FP
 
+  // Tag_FP_16bit_format, (=38), uleb128
+  FP16FormatIEEE = 1,
+
   // Tag_MPextension_use, (=42), uleb128
   AllowMP = 1, // Allow use of MP extensions
 
diff --git a/include/llvm/Support/COFF.h b/include/llvm/Support/COFF.h
index 150bce5..7f54822 100644
--- a/include/llvm/Support/COFF.h
+++ b/include/llvm/Support/COFF.h
@@ -664,16 +664,16 @@ namespace COFF {
     }
   };
 
-  enum CodeViewLineTableIdentifiers {
-    DEBUG_SECTION_MAGIC           = 0x4,
-    DEBUG_SYMBOL_SUBSECTION       = 0xF1,
-    DEBUG_LINE_TABLE_SUBSECTION   = 0xF2,
+  enum CodeViewIdentifiers {
+    DEBUG_SECTION_MAGIC = 0x4,
+    DEBUG_SYMBOL_SUBSECTION = 0xF1,
+    DEBUG_LINE_TABLE_SUBSECTION = 0xF2,
     DEBUG_STRING_TABLE_SUBSECTION = 0xF3,
-    DEBUG_INDEX_SUBSECTION        = 0xF4,
+    DEBUG_INDEX_SUBSECTION = 0xF4,
 
     // Symbol subsections are split into records of different types.
     DEBUG_SYMBOL_TYPE_PROC_START = 0x1147,
-    DEBUG_SYMBOL_TYPE_PROC_END   = 0x114F
+    DEBUG_SYMBOL_TYPE_PROC_END = 0x114F
   };
 
   inline bool isReservedSectionNumber(int32_t SectionNumber) {
diff --git a/include/llvm/Support/Casting.h b/include/llvm/Support/Casting.h
index beed31a..6ba5efa 100644
--- a/include/llvm/Support/Casting.h
+++ b/include/llvm/Support/Casting.h
@@ -243,6 +243,26 @@ inline typename cast_retty<X, Y *>::ret_type cast(Y *Val) {
 // accepted.
 //
 template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename std::enable_if<
+    !is_simple_type<Y>::value, typename cast_retty<X, const Y>::ret_type>::type
+cast_or_null(const Y &Val) {
+  if (!Val)
+    return nullptr;
+  assert(isa<X>(Val) && "cast_or_null<Ty>() argument of incompatible type!");
+  return cast<X>(Val);
+}
+
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename std::enable_if<
+    !is_simple_type<Y>::value, typename cast_retty<X, Y>::ret_type>::type
+cast_or_null(Y &Val) {
+  if (!Val)
+    return nullptr;
+  assert(isa<X>(Val) && "cast_or_null<Ty>() argument of incompatible type!");
+  return cast<X>(Val);
+}
+
+template <class X, class Y>
 LLVM_ATTRIBUTE_UNUSED_RESULT inline typename cast_retty<X, Y *>::ret_type
 cast_or_null(Y *Val) {
   if (!Val) return nullptr;
@@ -282,6 +302,20 @@ dyn_cast(Y *Val) {
 // value is accepted.
 //
 template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename std::enable_if<
+    !is_simple_type<Y>::value, typename cast_retty<X, const Y>::ret_type>::type
+dyn_cast_or_null(const Y &Val) {
+  return (Val && isa<X>(Val)) ? cast<X>(Val) : nullptr;
+}
+
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename std::enable_if<
+    !is_simple_type<Y>::value, typename cast_retty<X, Y>::ret_type>::type
+dyn_cast_or_null(Y &Val) {
+  return (Val && isa<X>(Val)) ? cast<X>(Val) : nullptr;
+}
+
+template <class X, class Y>
 LLVM_ATTRIBUTE_UNUSED_RESULT inline typename cast_retty<X, Y *>::ret_type
 dyn_cast_or_null(Y *Val) {
   return (Val && isa<X>(Val)) ? cast<X>(Val) : nullptr;
diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index 2b5c9c5..64c5d96 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h
@@ -20,6 +20,7 @@
 #ifndef LLVM_SUPPORT_COMMANDLINE_H
 #define LLVM_SUPPORT_COMMANDLINE_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/Twine.h"
@@ -40,7 +41,7 @@ namespace cl {
 //===----------------------------------------------------------------------===//
 // ParseCommandLineOptions - Command line option processing entry point.
 //
-void ParseCommandLineOptions(int argc, const char * const *argv,
+void ParseCommandLineOptions(int argc, const char *const *argv,
                              const char *Overview = nullptr);
 
 //===----------------------------------------------------------------------===//
@@ -66,25 +67,33 @@ void SetVersionPrinter(void (*func)());
 ///                          information specific to the tool.
 void AddExtraVersionPrinter(void (*func)());
 
-
 // PrintOptionValues - Print option values.
 // With -print-options print the difference between option values and defaults.
 // With -print-all-options print all option values.
 // (Currently not perfect, but best-effort.)
 void PrintOptionValues();
 
-// MarkOptionsChanged - Internal helper function.
-void MarkOptionsChanged();
+// Forward declaration - AddLiteralOption needs to be up here to make gcc happy.
+class Option;
+
+/// \brief Adds a new option for parsing and provides the option it refers to.
+///
+/// \param O pointer to the option
+/// \param Name the string name for the option to handle during parsing
+///
+/// Literal options are used by some parsers to register special option values.
+/// This is how the PassNameParser registers pass names for opt.
+void AddLiteralOption(Option &O, const char *Name);
 
 //===----------------------------------------------------------------------===//
 // Flags permitted to be passed to command line arguments
 //
 
-enum NumOccurrencesFlag {      // Flags for the number of occurrences allowed
-  Optional        = 0x00,      // Zero or One occurrence
-  ZeroOrMore      = 0x01,      // Zero or more occurrences allowed
-  Required        = 0x02,      // One occurrence required
-  OneOrMore       = 0x03,      // One or more occurrences required
+enum NumOccurrencesFlag { // Flags for the number of occurrences allowed
+  Optional = 0x00,        // Zero or One occurrence
+  ZeroOrMore = 0x01,      // Zero or more occurrences allowed
+  Required = 0x02,        // One occurrence required
+  OneOrMore = 0x03,       // One or more occurrences required
 
   // ConsumeAfter - Indicates that this option is fed anything that follows the
   // last positional argument required by the application (it is an error if
@@ -93,20 +102,20 @@ enum NumOccurrencesFlag {      // Flags for the number of occurrences allowed
   // found.  Once a filename is found, all of the succeeding arguments are
   // passed, unprocessed, to the ConsumeAfter option.
   //
-  ConsumeAfter    = 0x04
+  ConsumeAfter = 0x04
 };
 
-enum ValueExpected {           // Is a value required for the option?
+enum ValueExpected { // Is a value required for the option?
   // zero reserved for the unspecified value
-  ValueOptional   = 0x01,      // The value can appear... or not
-  ValueRequired   = 0x02,      // The value is required to appear!
-  ValueDisallowed = 0x03       // A value may not be specified (for flags)
+  ValueOptional = 0x01,  // The value can appear... or not
+  ValueRequired = 0x02,  // The value is required to appear!
+  ValueDisallowed = 0x03 // A value may not be specified (for flags)
 };
 
-enum OptionHidden {            // Control whether -help shows this option
-  NotHidden       = 0x00,      // Option included in -help & -help-hidden
-  Hidden          = 0x01,      // -help doesn't, but -help-hidden does
-  ReallyHidden    = 0x02       // Neither -help nor -help-hidden show this arg
+enum OptionHidden {   // Control whether -help shows this option
+  NotHidden = 0x00,   // Option included in -help & -help-hidden
+  Hidden = 0x01,      // -help doesn't, but -help-hidden does
+  ReallyHidden = 0x02 // Neither -help nor -help-hidden show this arg
 };
 
 // Formatting flags - This controls special features that the option might have
@@ -125,16 +134,16 @@ enum OptionHidden {            // Control whether -help shows this option
 //
 
 enum FormattingFlags {
-  NormalFormatting = 0x00,     // Nothing special
-  Positional       = 0x01,     // Is a positional argument, no '-' required
-  Prefix           = 0x02,     // Can this option directly prefix its value?
-  Grouping         = 0x03      // Can this option group with other options?
+  NormalFormatting = 0x00, // Nothing special
+  Positional = 0x01,       // Is a positional argument, no '-' required
+  Prefix = 0x02,           // Can this option directly prefix its value?
+  Grouping = 0x03          // Can this option group with other options?
 };
 
-enum MiscFlags {               // Miscellaneous flags to adjust argument
-  CommaSeparated     = 0x01,  // Should this cl::list split between commas?
-  PositionalEatsArgs = 0x02,  // Should this positional cl::list eat -args?
-  Sink               = 0x04   // Should this cl::list eat all unknown options?
+enum MiscFlags {             // Miscellaneous flags to adjust argument
+  CommaSeparated = 0x01,     // Should this cl::list split between commas?
+  PositionalEatsArgs = 0x02, // Should this positional cl::list eat -args?
+  Sink = 0x04                // Should this cl::list eat all unknown options?
 };
 
 //===----------------------------------------------------------------------===//
@@ -145,9 +154,13 @@ private:
   const char *const Name;
   const char *const Description;
   void registerCategory();
+
 public:
-  OptionCategory(const char *const Name, const char *const Description = nullptr)
-      : Name(Name), Description(Description) { registerCategory(); }
+  OptionCategory(const char *const Name,
+                 const char *const Description = nullptr)
+      : Name(Name), Description(Description) {
+    registerCategory();
+  }
   const char *getName() const { return Name; }
   const char *getDescription() const { return Description; }
 };
@@ -176,7 +189,7 @@ class Option {
   // Out of line virtual function to provide home for the class.
   virtual void anchor();
 
-  int NumOccurrences;     // The number of times specified
+  int NumOccurrences; // The number of times specified
   // Occurrences, HiddenFlag, and Formatting are all enum types but to avoid
   // problems with signed enums in bitfields.
   unsigned Occurrences : 3; // enum NumOccurrencesFlag
@@ -186,22 +199,21 @@ class Option {
   unsigned HiddenFlag : 2; // enum OptionHidden
   unsigned Formatting : 2; // enum FormattingFlags
   unsigned Misc : 3;
-  unsigned Position;      // Position of last occurrence of the option
-  unsigned AdditionalVals;// Greater than 0 for multi-valued option.
-  Option *NextRegistered; // Singly linked list of registered options.
+  unsigned Position;       // Position of last occurrence of the option
+  unsigned AdditionalVals; // Greater than 0 for multi-valued option.
 
 public:
   const char *ArgStr;   // The argument string itself (ex: "help", "o")
   const char *HelpStr;  // The descriptive text message for -help
   const char *ValueStr; // String describing what the value of this option is
   OptionCategory *Category; // The Category this option belongs to
+  bool FullyInitialized;    // Has addArguemnt been called?
 
   inline enum NumOccurrencesFlag getNumOccurrencesFlag() const {
     return (enum NumOccurrencesFlag)Occurrences;
   }
   inline enum ValueExpected getValueExpectedFlag() const {
-    return Value ? ((enum ValueExpected)Value)
-              : getValueExpectedFlagDefault();
+    return Value ? ((enum ValueExpected)Value) : getValueExpectedFlagDefault();
   }
   inline enum OptionHidden getOptionHiddenFlag() const {
     return (enum OptionHidden)HiddenFlag;
@@ -209,9 +221,7 @@ public:
   inline enum FormattingFlags getFormattingFlag() const {
     return (enum FormattingFlags)Formatting;
   }
-  inline unsigned getMiscFlags() const {
-    return Misc;
-  }
+  inline unsigned getMiscFlags() const { return Misc; }
   inline unsigned getPosition() const { return Position; }
   inline unsigned getNumAdditionalVals() const { return AdditionalVals; }
 
@@ -221,28 +231,27 @@ public:
   //-------------------------------------------------------------------------===
   // Accessor functions set by OptionModifiers
   //
-  void setArgStr(const char *S) { ArgStr = S; }
+  void setArgStr(const char *S);
   void setDescription(const char *S) { HelpStr = S; }
   void setValueStr(const char *S) { ValueStr = S; }
-  void setNumOccurrencesFlag(enum NumOccurrencesFlag Val) {
-    Occurrences = Val;
-  }
+  void setNumOccurrencesFlag(enum NumOccurrencesFlag Val) { Occurrences = Val; }
   void setValueExpectedFlag(enum ValueExpected Val) { Value = Val; }
   void setHiddenFlag(enum OptionHidden Val) { HiddenFlag = Val; }
   void setFormattingFlag(enum FormattingFlags V) { Formatting = V; }
   void setMiscFlag(enum MiscFlags M) { Misc |= M; }
   void setPosition(unsigned pos) { Position = pos; }
   void setCategory(OptionCategory &C) { Category = &C; }
+
 protected:
   explicit Option(enum NumOccurrencesFlag OccurrencesFlag,
                   enum OptionHidden Hidden)
-    : NumOccurrences(0), Occurrences(OccurrencesFlag), Value(0),
-      HiddenFlag(Hidden), Formatting(NormalFormatting), Misc(0),
-      Position(0), AdditionalVals(0), NextRegistered(nullptr),
-      ArgStr(""), HelpStr(""), ValueStr(""), Category(&GeneralCategory) {
-  }
+      : NumOccurrences(0), Occurrences(OccurrencesFlag), Value(0),
+        HiddenFlag(Hidden), Formatting(NormalFormatting), Misc(0), Position(0),
+        AdditionalVals(0), ArgStr(""), HelpStr(""), ValueStr(""),
+        Category(&GeneralCategory), FullyInitialized(false) {}
 
   inline void setNumAdditionalVals(unsigned n) { AdditionalVals = n; }
+
 public:
   // addArgument - Register this argument with the commandline system.
   //
@@ -254,8 +263,6 @@ public:
   /// For testing purposes only.
   void removeArgument();
 
-  Option *getNextRegisteredOption() const { return NextRegistered; }
-
   // Return the width of the option tag for printing...
   virtual size_t getOptionWidth() const = 0;
 
@@ -266,12 +273,12 @@ public:
 
   virtual void printOptionValue(size_t GlobalWidth, bool Force) const = 0;
 
-  virtual void getExtraOptionNames(SmallVectorImpl<const char*> &) {}
+  virtual void getExtraOptionNames(SmallVectorImpl<const char *> &) {}
 
   // addOccurrence - Wrapper around handleOccurrence that enforces Flags.
   //
-  virtual bool addOccurrence(unsigned pos, StringRef ArgName,
-                             StringRef Value, bool MultiArg = false);
+  virtual bool addOccurrence(unsigned pos, StringRef ArgName, StringRef Value,
+                             bool MultiArg = false);
 
   // Prints option name followed by message.  Always returns true.
   bool error(const Twine &Message, StringRef ArgName = StringRef());
@@ -281,7 +288,6 @@ public:
   virtual ~Option() {}
 };
 
-
 //===----------------------------------------------------------------------===//
 // Command line option modifiers that can be used to modify the behavior of
 // command line option parsers...
@@ -306,36 +312,31 @@ struct value_desc {
 // the default constructor for the argument type does not give you what you
 // want.  This is only valid on "opt" arguments, not on "list" arguments.
 //
-template<class Ty>
-struct initializer {
+template <class Ty> struct initializer {
   const Ty &Init;
   initializer(const Ty &Val) : Init(Val) {}
 
-  template<class Opt>
-  void apply(Opt &O) const { O.setInitialValue(Init); }
+  template <class Opt> void apply(Opt &O) const { O.setInitialValue(Init); }
 };
 
-template<class Ty>
-initializer<Ty> init(const Ty &Val) {
+template <class Ty> initializer<Ty> init(const Ty &Val) {
   return initializer<Ty>(Val);
 }
 
-
 // location - Allow the user to specify which external variable they want to
 // store the results of the command line argument processing into, if they don't
 // want to store it in the option itself.
 //
-template<class Ty>
-struct LocationClass {
+template <class Ty> struct LocationClass {
   Ty &Loc;
   LocationClass(Ty &L) : Loc(L) {}
 
-  template<class Opt>
-  void apply(Opt &O) const { O.setLocation(O, Loc); }
+  template <class Opt> void apply(Opt &O) const { O.setLocation(O, Loc); }
 };
 
-template<class Ty>
-LocationClass<Ty> location(Ty &L) { return LocationClass<Ty>(L); }
+template <class Ty> LocationClass<Ty> location(Ty &L) {
+  return LocationClass<Ty>(L);
+}
 
 // cat - Specifiy the Option category for the command line argument to belong
 // to.
@@ -343,11 +344,9 @@ struct cat {
   OptionCategory &Category;
   cat(OptionCategory &c) : Category(c) {}
 
-  template<class Opt>
-  void apply(Opt &O) const { O.setCategory(Category); }
+  template <class Opt> void apply(Opt &O) const { O.setCategory(Category); }
 };
 
-
 //===----------------------------------------------------------------------===//
 // OptionValue class
 
@@ -360,11 +359,11 @@ private:
   virtual void anchor();
 };
 
-template<class DataType> struct OptionValue;
+template <class DataType> struct OptionValue;
 
 // The default value safely does nothing. Option value printing is only
 // best-effort.
-template<class DataType, bool isClass>
+template <class DataType, bool isClass>
 struct OptionValueBase : public GenericOptionValue {
   // Temporary storage for argument passing.
   typedef OptionValue<DataType> WrapperType;
@@ -374,21 +373,20 @@ struct OptionValueBase : public GenericOptionValue {
   const DataType &getValue() const { llvm_unreachable("no default value"); }
 
   // Some options may take their value from a different data type.
-  template<class DT>
-  void setValue(const DT& /*V*/) {}
+  template <class DT> void setValue(const DT & /*V*/) {}
 
-  bool compare(const DataType &/*V*/) const { return false; }
+  bool compare(const DataType & /*V*/) const { return false; }
 
-  bool compare(const GenericOptionValue& /*V*/) const override {
+  bool compare(const GenericOptionValue & /*V*/) const override {
     return false;
   }
 };
 
 // Simple copy of the option value.
-template<class DataType>
-class OptionValueCopy : public GenericOptionValue {
+template <class DataType> class OptionValueCopy : public GenericOptionValue {
   DataType Value;
   bool Valid;
+
 public:
   OptionValueCopy() : Valid(false) {}
 
@@ -399,37 +397,36 @@ public:
     return Value;
   }
 
-  void setValue(const DataType &V) { Valid = true; Value = V; }
-
-  bool compare(const DataType &V) const {
-    return Valid && (Value != V);
+  void setValue(const DataType &V) {
+    Valid = true;
+    Value = V;
   }
 
+  bool compare(const DataType &V) const { return Valid && (Value != V); }
+
   bool compare(const GenericOptionValue &V) const override {
     const OptionValueCopy<DataType> &VC =
-      static_cast< const OptionValueCopy<DataType>& >(V);
-    if (!VC.hasValue()) return false;
+        static_cast<const OptionValueCopy<DataType> &>(V);
+    if (!VC.hasValue())
+      return false;
     return compare(VC.getValue());
   }
 };
 
 // Non-class option values.
-template<class DataType>
+template <class DataType>
 struct OptionValueBase<DataType, false> : OptionValueCopy<DataType> {
   typedef DataType WrapperType;
 };
 
 // Top-level option class.
-template<class DataType>
+template <class DataType>
 struct OptionValue : OptionValueBase<DataType, std::is_class<DataType>::value> {
   OptionValue() {}
 
-  OptionValue(const DataType& V) {
-    this->setValue(V);
-  }
+  OptionValue(const DataType &V) { this->setValue(V); }
   // Some options may take their value from a different data type.
-  template<class DT>
-  OptionValue<DataType> &operator=(const DT& V) {
+  template <class DT> OptionValue<DataType> &operator=(const DT &V) {
     this->setValue(V);
     return *this;
   }
@@ -437,36 +434,33 @@ struct OptionValue : OptionValueBase<DataType, std::is_class<DataType>::value> {
 
 // Other safe-to-copy-by-value common option types.
 enum boolOrDefault { BOU_UNSET, BOU_TRUE, BOU_FALSE };
-template<>
+template <>
 struct OptionValue<cl::boolOrDefault> : OptionValueCopy<cl::boolOrDefault> {
   typedef cl::boolOrDefault WrapperType;
 
   OptionValue() {}
 
-  OptionValue(const cl::boolOrDefault& V) {
-    this->setValue(V);
-  }
-  OptionValue<cl::boolOrDefault> &operator=(const cl::boolOrDefault& V) {
+  OptionValue(const cl::boolOrDefault &V) { this->setValue(V); }
+  OptionValue<cl::boolOrDefault> &operator=(const cl::boolOrDefault &V) {
     setValue(V);
     return *this;
   }
+
 private:
   void anchor() override;
 };
 
-template<>
-struct OptionValue<std::string> : OptionValueCopy<std::string> {
+template <> struct OptionValue<std::string> : OptionValueCopy<std::string> {
   typedef StringRef WrapperType;
 
   OptionValue() {}
 
-  OptionValue(const std::string& V) {
-    this->setValue(V);
-  }
-  OptionValue<std::string> &operator=(const std::string& V) {
+  OptionValue(const std::string &V) { this->setValue(V); }
+  OptionValue<std::string> &operator=(const std::string &V) {
     setValue(V);
     return *this;
   }
+
 private:
   void anchor() override;
 };
@@ -476,20 +470,20 @@ private:
 //
 #define clEnumVal(ENUMVAL, DESC) #ENUMVAL, int(ENUMVAL), DESC
 #define clEnumValN(ENUMVAL, FLAGNAME, DESC) FLAGNAME, int(ENUMVAL), DESC
-#define clEnumValEnd (reinterpret_cast<void*>(0))
+#define clEnumValEnd (reinterpret_cast<void *>(0))
 
 // values - For custom data types, allow specifying a group of values together
 // as the values that go into the mapping that the option handler uses.  Note
 // that the values list must always have a 0 at the end of the list to indicate
 // that the list has ended.
 //
-template<class DataType>
-class ValuesClass {
+template <class DataType> class ValuesClass {
   // Use a vector instead of a map, because the lists should be short,
   // the overhead is less, and most importantly, it keeps them in the order
   // inserted so we can print our option out nicely.
-  SmallVector<std::pair<const char *, std::pair<int, const char *> >,4> Values;
+  SmallVector<std::pair<const char *, std::pair<int, const char *>>, 4> Values;
   void processValues(va_list Vals);
+
 public:
   ValuesClass(const char *EnumName, DataType Val, const char *Desc,
               va_list ValueArgs) {
@@ -500,13 +494,12 @@ public:
     while (const char *enumName = va_arg(ValueArgs, const char *)) {
       DataType EnumVal = static_cast<DataType>(va_arg(ValueArgs, int));
       const char *EnumDesc = va_arg(ValueArgs, const char *);
-      Values.push_back(std::make_pair(enumName,      // Add value to value map
+      Values.push_back(std::make_pair(enumName, // Add value to value map
                                       std::make_pair(EnumVal, EnumDesc)));
     }
   }
 
-  template<class Opt>
-  void apply(Opt &O) const {
+  template <class Opt> void apply(Opt &O) const {
     for (size_t i = 0, e = Values.size(); i != e; ++i)
       O.getParser().addLiteralOption(Values[i].first, Values[i].second.first,
                                      Values[i].second.second);
@@ -516,11 +509,11 @@ public:
 template <class DataType>
 ValuesClass<DataType> LLVM_END_WITH_NULL
 values(const char *Arg, DataType Val, const char *Desc, ...) {
-    va_list ValueArgs;
-    va_start(ValueArgs, Desc);
-    ValuesClass<DataType> Vals(Arg, Val, Desc, ValueArgs);
-    va_end(ValueArgs);
-    return Vals;
+  va_list ValueArgs;
+  va_start(ValueArgs, Desc);
+  ValuesClass<DataType> Vals(Arg, Val, Desc, ValueArgs);
+  va_end(ValueArgs);
+  return Vals;
 }
 
 //===----------------------------------------------------------------------===//
@@ -539,13 +532,16 @@ class generic_parser_base {
 protected:
   class GenericOptionInfo {
   public:
-    GenericOptionInfo(const char *name, const char *helpStr) :
-      Name(name), HelpStr(helpStr) {}
+    GenericOptionInfo(const char *name, const char *helpStr)
+        : Name(name), HelpStr(helpStr) {}
     const char *Name;
     const char *HelpStr;
   };
+
 public:
-  virtual ~generic_parser_base() {}  // Base class should have virtual-dtor
+  generic_parser_base(Option &O) : Owner(O) {}
+
+  virtual ~generic_parser_base() {} // Base class should have virtual-dtor
 
   // getNumOptions - Virtual function implemented by generic subclass to
   // indicate how many entries are in Values.
@@ -576,30 +572,24 @@ public:
   //
   // Template definition ensures that the option and default have the same
   // DataType (via the same AnyOptionValue).
-  template<class AnyOptionValue>
+  template <class AnyOptionValue>
   void printOptionDiff(const Option &O, const AnyOptionValue &V,
                        const AnyOptionValue &Default,
                        size_t GlobalWidth) const {
     printGenericOptionDiff(O, V, Default, GlobalWidth);
   }
 
-  void initialize(Option &O) {
-    // All of the modifiers for the option have been processed by now, so the
-    // argstr field should be stable, copy it down now.
-    //
-    hasArgStr = O.hasArgStr();
-  }
+  void initialize() {}
 
-  void getExtraOptionNames(SmallVectorImpl<const char*> &OptionNames) {
+  void getExtraOptionNames(SmallVectorImpl<const char *> &OptionNames) {
     // If there has been no argstr specified, that means that we need to add an
     // argument for every possible option.  This ensures that our options are
     // vectored to us.
-    if (!hasArgStr)
+    if (!Owner.hasArgStr())
       for (unsigned i = 0, e = getNumOptions(); i != e; ++i)
         OptionNames.push_back(getOption(i));
   }
 
-
   enum ValueExpected getValueExpectedFlagDefault() const {
     // If there is an ArgStr specified, then we are of the form:
     //
@@ -612,7 +602,7 @@ public:
     //
     // If this is the case, we cannot allow a value.
     //
-    if (hasArgStr)
+    if (Owner.hasArgStr())
       return ValueRequired;
     else
       return ValueDisallowed;
@@ -624,7 +614,7 @@ public:
   unsigned findOption(const char *Name);
 
 protected:
-  bool hasArgStr;
+  Option &Owner;
 };
 
 // Default parser implementation - This implementation depends on having a
@@ -633,17 +623,18 @@ protected:
 // command line option for -help.  Because this is a simple mapping parser, the
 // data type can be any unsupported type.
 //
-template <class DataType>
-class parser : public generic_parser_base {
+template <class DataType> class parser : public generic_parser_base {
 protected:
   class OptionInfo : public GenericOptionInfo {
   public:
-    OptionInfo(const char *name, DataType v, const char *helpStr) :
-      GenericOptionInfo(name, helpStr), V(v) {}
+    OptionInfo(const char *name, DataType v, const char *helpStr)
+        : GenericOptionInfo(name, helpStr), V(v) {}
     OptionValue<DataType> V;
   };
   SmallVector<OptionInfo, 8> Values;
+
 public:
+  parser(Option &O) : generic_parser_base(O) {}
   typedef DataType parser_data_type;
 
   // Implement virtual functions needed by generic_parser_base
@@ -661,7 +652,7 @@ public:
   // parse - Return true on error.
   bool parse(Option &O, StringRef ArgName, StringRef Arg, DataType &V) {
     StringRef ArgVal;
-    if (hasArgStr)
+    if (Owner.hasArgStr())
       ArgVal = Arg;
     else
       ArgVal = ArgName;
@@ -682,7 +673,7 @@ public:
     assert(findOption(Name) == Values.size() && "Option already exists!");
     OptionInfo X(Name, static_cast<DataType>(V), HelpStr);
     Values.push_back(X);
-    MarkOptionsChanged();
+    AddLiteralOption(Owner, Name);
   }
 
   /// removeLiteralOption - Remove the specified option.
@@ -690,24 +681,26 @@ public:
   void removeLiteralOption(const char *Name) {
     unsigned N = findOption(Name);
     assert(N != Values.size() && "Option not found!");
-    Values.erase(Values.begin()+N);
+    Values.erase(Values.begin() + N);
   }
 };
 
 //--------------------------------------------------
 // basic_parser - Super class of parsers to provide boilerplate code
 //
-class basic_parser_impl {  // non-template implementation of basic_parser<t>
+class basic_parser_impl { // non-template implementation of basic_parser<t>
 public:
+  basic_parser_impl(Option &O) {}
+
   virtual ~basic_parser_impl() {}
 
   enum ValueExpected getValueExpectedFlagDefault() const {
     return ValueRequired;
   }
 
-  void getExtraOptionNames(SmallVectorImpl<const char*> &) {}
+  void getExtraOptionNames(SmallVectorImpl<const char *> &) {}
 
-  void initialize(Option &) {}
+  void initialize() {}
 
   // Return the width of the option tag for printing...
   size_t getOptionWidth(const Option &O) const;
@@ -735,9 +728,9 @@ protected:
 // basic_parser - The real basic parser is just a template wrapper that provides
 // a typedef for the provided data type.
 //
-template<class DataType>
-class basic_parser : public basic_parser_impl {
+template <class DataType> class basic_parser : public basic_parser_impl {
 public:
+  basic_parser(Option &O) : basic_parser_impl(O) {}
   typedef DataType parser_data_type;
   typedef OptionValue<DataType> OptVal;
 };
@@ -745,18 +738,14 @@ public:
 //--------------------------------------------------
 // parser<bool>
 //
-template<>
-class parser<bool> : public basic_parser<bool> {
-  const char *ArgStr;
+template <> class parser<bool> : public basic_parser<bool> {
 public:
+  parser(Option &O) : basic_parser(O) {}
 
   // parse - Return true on error.
   bool parse(Option &O, StringRef ArgName, StringRef Arg, bool &Val);
 
-  template <class Opt>
-  void initialize(Opt &O) {
-    ArgStr = O.ArgStr;
-  }
+  void initialize() {}
 
   enum ValueExpected getValueExpectedFlagDefault() const {
     return ValueOptional;
@@ -776,9 +765,10 @@ EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<bool>);
 
 //--------------------------------------------------
 // parser<boolOrDefault>
-template<>
-class parser<boolOrDefault> : public basic_parser<boolOrDefault> {
+template <> class parser<boolOrDefault> : public basic_parser<boolOrDefault> {
 public:
+  parser(Option &O) : basic_parser(O) {}
+
   // parse - Return true on error.
   bool parse(Option &O, StringRef ArgName, StringRef Arg, boolOrDefault &Val);
 
@@ -801,9 +791,10 @@ EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<boolOrDefault>);
 //--------------------------------------------------
 // parser<int>
 //
-template<>
-class parser<int> : public basic_parser<int> {
+template <> class parser<int> : public basic_parser<int> {
 public:
+  parser(Option &O) : basic_parser(O) {}
+
   // parse - Return true on error.
   bool parse(Option &O, StringRef ArgName, StringRef Arg, int &Val);
 
@@ -819,13 +810,13 @@ public:
 
 EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<int>);
 
-
 //--------------------------------------------------
 // parser<unsigned>
 //
-template<>
-class parser<unsigned> : public basic_parser<unsigned> {
+template <> class parser<unsigned> : public basic_parser<unsigned> {
 public:
+  parser(Option &O) : basic_parser(O) {}
+
   // parse - Return true on error.
   bool parse(Option &O, StringRef ArgName, StringRef Arg, unsigned &Val);
 
@@ -844,9 +835,11 @@ EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<unsigned>);
 //--------------------------------------------------
 // parser<unsigned long long>
 //
-template<>
+template <>
 class parser<unsigned long long> : public basic_parser<unsigned long long> {
 public:
+  parser(Option &O) : basic_parser(O) {}
+
   // parse - Return true on error.
   bool parse(Option &O, StringRef ArgName, StringRef Arg,
              unsigned long long &Val);
@@ -866,9 +859,10 @@ EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<unsigned long long>);
 //--------------------------------------------------
 // parser<double>
 //
-template<>
-class parser<double> : public basic_parser<double> {
+template <> class parser<double> : public basic_parser<double> {
 public:
+  parser(Option &O) : basic_parser(O) {}
+
   // parse - Return true on error.
   bool parse(Option &O, StringRef ArgName, StringRef Arg, double &Val);
 
@@ -887,9 +881,10 @@ EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<double>);
 //--------------------------------------------------
 // parser<float>
 //
-template<>
-class parser<float> : public basic_parser<float> {
+template <> class parser<float> : public basic_parser<float> {
 public:
+  parser(Option &O) : basic_parser(O) {}
+
   // parse - Return true on error.
   bool parse(Option &O, StringRef ArgName, StringRef Arg, float &Val);
 
@@ -908,9 +903,10 @@ EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<float>);
 //--------------------------------------------------
 // parser<std::string>
 //
-template<>
-class parser<std::string> : public basic_parser<std::string> {
+template <> class parser<std::string> : public basic_parser<std::string> {
 public:
+  parser(Option &O) : basic_parser(O) {}
+
   // parse - Return true on error.
   bool parse(Option &, StringRef, StringRef Arg, std::string &Value) {
     Value = Arg.str();
@@ -932,9 +928,10 @@ EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<std::string>);
 //--------------------------------------------------
 // parser<char>
 //
-template<>
-class parser<char> : public basic_parser<char> {
+template <> class parser<char> : public basic_parser<char> {
 public:
+  parser(Option &O) : basic_parser(O) {}
+
   // parse - Return true on error.
   bool parse(Option &, StringRef, StringRef Arg, char &Value) {
     Value = Arg[0];
@@ -960,7 +957,7 @@ EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<char>);
 // parser to handle all the template nastiness.
 
 // This overloaded function is selected by the generic parser.
-template<class ParserClass, class DT>
+template <class ParserClass, class DT>
 void printOptionDiff(const Option &O, const generic_parser_base &P, const DT &V,
                      const OptionValue<DT> &Default, size_t GlobalWidth) {
   OptionValue<DT> OV = V;
@@ -969,18 +966,16 @@ void printOptionDiff(const Option &O, const generic_parser_base &P, const DT &V,
 
 // This is instantiated for basic parsers when the parsed value has a different
 // type than the option value. e.g. HelpPrinter.
-template<class ParserDT, class ValDT>
-struct OptionDiffPrinter {
-  void print(const Option &O, const parser<ParserDT> P, const ValDT &/*V*/,
-             const OptionValue<ValDT> &/*Default*/, size_t GlobalWidth) {
+template <class ParserDT, class ValDT> struct OptionDiffPrinter {
+  void print(const Option &O, const parser<ParserDT> P, const ValDT & /*V*/,
+             const OptionValue<ValDT> & /*Default*/, size_t GlobalWidth) {
     P.printOptionNoValue(O, GlobalWidth);
   }
 };
 
 // This is instantiated for basic parsers when the parsed value has the same
 // type as the option value.
-template<class DT>
-struct OptionDiffPrinter<DT, DT> {
+template <class DT> struct OptionDiffPrinter<DT, DT> {
   void print(const Option &O, const parser<DT> P, const DT &V,
              const OptionValue<DT> &Default, size_t GlobalWidth) {
     P.printOptionDiff(O, V, Default, GlobalWidth);
@@ -989,15 +984,14 @@ struct OptionDiffPrinter<DT, DT> {
 
 // This overloaded function is selected by the basic parser, which may parse a
 // different type than the option type.
-template<class ParserClass, class ValDT>
+template <class ParserClass, class ValDT>
 void printOptionDiff(
-  const Option &O,
-  const basic_parser<typename ParserClass::parser_data_type> &P,
-  const ValDT &V, const OptionValue<ValDT> &Default,
-  size_t GlobalWidth) {
+    const Option &O,
+    const basic_parser<typename ParserClass::parser_data_type> &P,
+    const ValDT &V, const OptionValue<ValDT> &Default, size_t GlobalWidth) {
 
   OptionDiffPrinter<typename ParserClass::parser_data_type, ValDT> printer;
-  printer.print(O, static_cast<const ParserClass&>(P), V, Default,
+  printer.print(O, static_cast<const ParserClass &>(P), V, Default,
                 GlobalWidth);
 }
 
@@ -1007,46 +1001,53 @@ void printOptionDiff(
 // not correctly respond to the apply method).  Because the syntax to use this
 // is a pain, we have the 'apply' method below to handle the nastiness...
 //
-template<class Mod> struct applicator {
-  template<class Opt>
-  static void opt(const Mod &M, Opt &O) { M.apply(O); }
+template <class Mod> struct applicator {
+  template <class Opt> static void opt(const Mod &M, Opt &O) { M.apply(O); }
 };
 
 // Handle const char* as a special case...
-template<unsigned n> struct applicator<char[n]> {
-  template<class Opt>
-  static void opt(const char *Str, Opt &O) { O.setArgStr(Str); }
+template <unsigned n> struct applicator<char[n]> {
+  template <class Opt> static void opt(const char *Str, Opt &O) {
+    O.setArgStr(Str);
+  }
 };
-template<unsigned n> struct applicator<const char[n]> {
-  template<class Opt>
-  static void opt(const char *Str, Opt &O) { O.setArgStr(Str); }
+template <unsigned n> struct applicator<const char[n]> {
+  template <class Opt> static void opt(const char *Str, Opt &O) {
+    O.setArgStr(Str);
+  }
 };
-template<> struct applicator<const char*> {
-  template<class Opt>
-  static void opt(const char *Str, Opt &O) { O.setArgStr(Str); }
+template <> struct applicator<const char *> {
+  template <class Opt> static void opt(const char *Str, Opt &O) {
+    O.setArgStr(Str);
+  }
 };
 
-template<> struct applicator<NumOccurrencesFlag> {
+template <> struct applicator<NumOccurrencesFlag> {
   static void opt(NumOccurrencesFlag N, Option &O) {
     O.setNumOccurrencesFlag(N);
   }
 };
-template<> struct applicator<ValueExpected> {
+template <> struct applicator<ValueExpected> {
   static void opt(ValueExpected VE, Option &O) { O.setValueExpectedFlag(VE); }
 };
-template<> struct applicator<OptionHidden> {
+template <> struct applicator<OptionHidden> {
   static void opt(OptionHidden OH, Option &O) { O.setHiddenFlag(OH); }
 };
-template<> struct applicator<FormattingFlags> {
+template <> struct applicator<FormattingFlags> {
   static void opt(FormattingFlags FF, Option &O) { O.setFormattingFlag(FF); }
 };
-template<> struct applicator<MiscFlags> {
+template <> struct applicator<MiscFlags> {
   static void opt(MiscFlags MF, Option &O) { O.setMiscFlag(MF); }
 };
 
-// apply method - Apply a modifier to an option in a type safe way.
-template<class Mod, class Opt>
-void apply(const Mod &M, Opt *O) {
+// apply method - Apply modifiers to an option in a type safe way.
+template <class Opt, class Mod, class... Mods>
+void apply(Opt *O, const Mod &M, const Mods &... Ms) {
+  applicator<Mod>::opt(M, *O);
+  apply(O, Ms...);
+}
+
+template <class Opt, class Mod> void apply(Opt *O, const Mod &M) {
   applicator<Mod>::opt(M, *O);
 }
 
@@ -1057,16 +1058,17 @@ void apply(const Mod &M, Opt *O) {
 // assumes the user will specify a variable to store the data into with the
 // cl::location(x) modifier.
 //
-template<class DataType, bool ExternalStorage, bool isClass>
+template <class DataType, bool ExternalStorage, bool isClass>
 class opt_storage {
-  DataType *Location;   // Where to store the object...
+  DataType *Location; // Where to store the object...
   OptionValue<DataType> Default;
 
   void check_location() const {
     assert(Location && "cl::location(...) not specified for a command "
-           "line option with external storage, "
-           "or cl::init specified before cl::location()!!");
+                       "line option with external storage, "
+                       "or cl::init specified before cl::location()!!");
   }
+
 public:
   opt_storage() : Location(nullptr) {}
 
@@ -1078,16 +1080,21 @@ public:
     return false;
   }
 
-  template<class T>
-  void setValue(const T &V, bool initial = false) {
+  template <class T> void setValue(const T &V, bool initial = false) {
     check_location();
     *Location = V;
     if (initial)
       Default = V;
   }
 
-  DataType &getValue() { check_location(); return *Location; }
-  const DataType &getValue() const { check_location(); return *Location; }
+  DataType &getValue() {
+    check_location();
+    return *Location;
+  }
+  const DataType &getValue() const {
+    check_location();
+    return *Location;
+  }
 
   operator DataType() const { return this->getValue(); }
 
@@ -1098,13 +1105,12 @@ public:
 // inherit from a class, we do so.  This makes us exactly compatible with the
 // object in all cases that it is used.
 //
-template<class DataType>
-class opt_storage<DataType,false,true> : public DataType {
+template <class DataType>
+class opt_storage<DataType, false, true> : public DataType {
 public:
   OptionValue<DataType> Default;
 
-  template<class T>
-  void setValue(const T &V, bool initial = false) {
+  template <class T> void setValue(const T &V, bool initial = false) {
     DataType::operator=(V);
     if (initial)
       Default = V;
@@ -1120,8 +1126,7 @@ public:
 // this case, we store an instance through containment, and overload operators
 // to get at the value.
 //
-template<class DataType>
-class opt_storage<DataType, false, false> {
+template <class DataType> class opt_storage<DataType, false, false> {
 public:
   DataType Value;
   OptionValue<DataType> Default;
@@ -1130,8 +1135,7 @@ public:
   // type.
   opt_storage() : Value(DataType()), Default(DataType()) {}
 
-  template<class T>
-  void setValue(const T &V, bool initial = false) {
+  template <class T> void setValue(const T &V, bool initial = false) {
     Value = V;
     if (initial)
       Default = V;
@@ -1147,12 +1151,11 @@ public:
   DataType operator->() const { return Value; }
 };
 
-
 //===----------------------------------------------------------------------===//
 // opt - A scalar command line option.
 //
 template <class DataType, bool ExternalStorage = false,
-          class ParserClass = parser<DataType> >
+          class ParserClass = parser<DataType>>
 class opt : public Option,
             public opt_storage<DataType, ExternalStorage,
                                std::is_class<DataType>::value> {
@@ -1161,9 +1164,9 @@ class opt : public Option,
   bool handleOccurrence(unsigned pos, StringRef ArgName,
                         StringRef Arg) override {
     typename ParserClass::parser_data_type Val =
-       typename ParserClass::parser_data_type();
+        typename ParserClass::parser_data_type();
     if (Parser.parse(*this, ArgName, Arg, Val))
-      return true;                            // Parse error!
+      return true; // Parse error!
     this->setValue(Val);
     this->setPosition(pos);
     return false;
@@ -1172,102 +1175,50 @@ class opt : public Option,
   enum ValueExpected getValueExpectedFlagDefault() const override {
     return Parser.getValueExpectedFlagDefault();
   }
-  void getExtraOptionNames(SmallVectorImpl<const char*> &OptionNames) override {
+  void
+  getExtraOptionNames(SmallVectorImpl<const char *> &OptionNames) override {
     return Parser.getExtraOptionNames(OptionNames);
   }
 
   // Forward printing stuff to the parser...
-  size_t getOptionWidth() const override {return Parser.getOptionWidth(*this);}
+  size_t getOptionWidth() const override {
+    return Parser.getOptionWidth(*this);
+  }
   void printOptionInfo(size_t GlobalWidth) const override {
     Parser.printOptionInfo(*this, GlobalWidth);
   }
 
   void printOptionValue(size_t GlobalWidth, bool Force) const override {
     if (Force || this->getDefault().compare(this->getValue())) {
-      cl::printOptionDiff<ParserClass>(
-        *this, Parser, this->getValue(), this->getDefault(), GlobalWidth);
+      cl::printOptionDiff<ParserClass>(*this, Parser, this->getValue(),
+                                       this->getDefault(), GlobalWidth);
     }
   }
 
   void done() {
     addArgument();
-    Parser.initialize(*this);
+    Parser.initialize();
   }
+
+  // Command line options should not be copyable
+  opt(const opt &) = delete;
+  opt &operator=(const opt &) = delete;
+
 public:
   // setInitialValue - Used by the cl::init modifier...
   void setInitialValue(const DataType &V) { this->setValue(V, true); }
 
   ParserClass &getParser() { return Parser; }
 
-  template<class T>
-  DataType &operator=(const T &Val) {
+  template <class T> DataType &operator=(const T &Val) {
     this->setValue(Val);
     return this->getValue();
   }
 
-  // One option...
-  template<class M0t>
-  explicit opt(const M0t &M0) : Option(Optional, NotHidden) {
-    apply(M0, this);
-    done();
-  }
-
-  // Two options...
-  template<class M0t, class M1t>
-  opt(const M0t &M0, const M1t &M1) : Option(Optional, NotHidden) {
-    apply(M0, this); apply(M1, this);
-    done();
-  }
-
-  // Three options...
-  template<class M0t, class M1t, class M2t>
-  opt(const M0t &M0, const M1t &M1,
-      const M2t &M2) : Option(Optional, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this);
-    done();
-  }
-  // Four options...
-  template<class M0t, class M1t, class M2t, class M3t>
-  opt(const M0t &M0, const M1t &M1, const M2t &M2,
-      const M3t &M3) : Option(Optional, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    done();
-  }
-  // Five options...
-  template<class M0t, class M1t, class M2t, class M3t, class M4t>
-  opt(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-      const M4t &M4) : Option(Optional, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this);
-    done();
-  }
-  // Six options...
-  template<class M0t, class M1t, class M2t, class M3t,
-           class M4t, class M5t>
-  opt(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-      const M4t &M4, const M5t &M5) : Option(Optional, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this); apply(M5, this);
-    done();
-  }
-  // Seven options...
-  template<class M0t, class M1t, class M2t, class M3t,
-           class M4t, class M5t, class M6t>
-  opt(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-      const M4t &M4, const M5t &M5,
-      const M6t &M6) : Option(Optional, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this); apply(M5, this); apply(M6, this);
-    done();
-  }
-  // Eight options...
-  template<class M0t, class M1t, class M2t, class M3t,
-           class M4t, class M5t, class M6t, class M7t>
-  opt(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-      const M4t &M4, const M5t &M5, const M6t &M6,
-      const M7t &M7) : Option(Optional, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this); apply(M5, this); apply(M6, this); apply(M7, this);
+  template <class... Mods>
+  explicit opt(const Mods &... Ms)
+      : Option(Optional, NotHidden), Parser(*this) {
+    apply(this, Ms...);
     done();
   }
 };
@@ -1285,9 +1236,8 @@ EXTERN_TEMPLATE_INSTANTIATION(class opt<bool>);
 // assumes the user will specify a variable to store the data into with the
 // cl::location(x) modifier.
 //
-template<class DataType, class StorageClass>
-class list_storage {
-  StorageClass *Location;   // Where to store the object...
+template <class DataType, class StorageClass> class list_storage {
+  StorageClass *Location; // Where to store the object...
 
 public:
   list_storage() : Location(0) {}
@@ -1299,32 +1249,30 @@ public:
     return false;
   }
 
-  template<class T>
-  void addValue(const T &V) {
+  template <class T> void addValue(const T &V) {
     assert(Location != 0 && "cl::location(...) not specified for a command "
-           "line option with external storage!");
+                            "line option with external storage!");
     Location->push_back(V);
   }
 };
 
-
 // Define how to hold a class type object, such as a string.  Since we can
 // inherit from a class, we do so.  This makes us exactly compatible with the
 // object in all cases that it is used.
 //
-template<class DataType>
+template <class DataType>
 class list_storage<DataType, bool> : public std::vector<DataType> {
 public:
-  template<class T>
-  void addValue(const T &V) { std::vector<DataType>::push_back(V); }
+  template <class T> void addValue(const T &V) {
+    std::vector<DataType>::push_back(V);
+  }
 };
 
-
 //===----------------------------------------------------------------------===//
 // list - A list of command line options.
 //
 template <class DataType, class Storage = bool,
-          class ParserClass = parser<DataType> >
+          class ParserClass = parser<DataType>>
 class list : public Option, public list_storage<DataType, Storage> {
   std::vector<unsigned> Positions;
   ParserClass Parser;
@@ -1332,16 +1280,17 @@ class list : public Option, public list_storage<DataType, Storage> {
   enum ValueExpected getValueExpectedFlagDefault() const override {
     return Parser.getValueExpectedFlagDefault();
   }
-  void getExtraOptionNames(SmallVectorImpl<const char*> &OptionNames) override {
+  void
+  getExtraOptionNames(SmallVectorImpl<const char *> &OptionNames) override {
     return Parser.getExtraOptionNames(OptionNames);
   }
 
   bool handleOccurrence(unsigned pos, StringRef ArgName,
                         StringRef Arg) override {
     typename ParserClass::parser_data_type Val =
-      typename ParserClass::parser_data_type();
+        typename ParserClass::parser_data_type();
     if (Parser.parse(*this, ArgName, Arg, Val))
-      return true;  // Parse Error!
+      return true; // Parse Error!
     list_storage<DataType, Storage>::addValue(Val);
     setPosition(pos);
     Positions.push_back(pos);
@@ -1349,19 +1298,26 @@ class list : public Option, public list_storage<DataType, Storage> {
   }
 
   // Forward printing stuff to the parser...
-  size_t getOptionWidth() const override {return Parser.getOptionWidth(*this);}
+  size_t getOptionWidth() const override {
+    return Parser.getOptionWidth(*this);
+  }
   void printOptionInfo(size_t GlobalWidth) const override {
     Parser.printOptionInfo(*this, GlobalWidth);
   }
 
   // Unimplemented: list options don't currently store their default value.
-  void printOptionValue(size_t /*GlobalWidth*/,
-                        bool /*Force*/) const override {}
+  void printOptionValue(size_t /*GlobalWidth*/, bool /*Force*/) const override {
+  }
 
   void done() {
     addArgument();
-    Parser.initialize(*this);
+    Parser.initialize();
   }
+
+  // Command line options should not be copyable
+  list(const list &) = delete;
+  list &operator=(const list &) = delete;
+
 public:
   ParserClass &getParser() { return Parser; }
 
@@ -1370,71 +1326,12 @@ public:
     return Positions[optnum];
   }
 
-  void setNumAdditionalVals(unsigned n) {
-    Option::setNumAdditionalVals(n);
-  }
+  void setNumAdditionalVals(unsigned n) { Option::setNumAdditionalVals(n); }
 
-  // One option...
-  template<class M0t>
-  explicit list(const M0t &M0) : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this);
-    done();
-  }
-  // Two options...
-  template<class M0t, class M1t>
-  list(const M0t &M0, const M1t &M1) : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this);
-    done();
-  }
-  // Three options...
-  template<class M0t, class M1t, class M2t>
-  list(const M0t &M0, const M1t &M1, const M2t &M2)
-    : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this);
-    done();
-  }
-  // Four options...
-  template<class M0t, class M1t, class M2t, class M3t>
-  list(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3)
-    : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    done();
-  }
-  // Five options...
-  template<class M0t, class M1t, class M2t, class M3t, class M4t>
-  list(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-       const M4t &M4) : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this);
-    done();
-  }
-  // Six options...
-  template<class M0t, class M1t, class M2t, class M3t,
-           class M4t, class M5t>
-  list(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-       const M4t &M4, const M5t &M5) : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this); apply(M5, this);
-    done();
-  }
-  // Seven options...
-  template<class M0t, class M1t, class M2t, class M3t,
-           class M4t, class M5t, class M6t>
-  list(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-       const M4t &M4, const M5t &M5, const M6t &M6)
-    : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this); apply(M5, this); apply(M6, this);
-    done();
-  }
-  // Eight options...
-  template<class M0t, class M1t, class M2t, class M3t,
-           class M4t, class M5t, class M6t, class M7t>
-  list(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-       const M4t &M4, const M5t &M5, const M6t &M6,
-       const M7t &M7) : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this); apply(M5, this); apply(M6, this); apply(M7, this);
+  template <class... Mods>
+  explicit list(const Mods &... Ms)
+      : Option(ZeroOrMore, NotHidden), Parser(*this) {
+    apply(this, Ms...);
     done();
   }
 };
@@ -1445,10 +1342,11 @@ struct multi_val {
   explicit multi_val(unsigned N) : AdditionalVals(N) {}
 
   template <typename D, typename S, typename P>
-  void apply(list<D, S, P> &L) const { L.setNumAdditionalVals(AdditionalVals); }
+  void apply(list<D, S, P> &L) const {
+    L.setNumAdditionalVals(AdditionalVals);
+  }
 };
 
-
 //===----------------------------------------------------------------------===//
 // bits_storage class
 
@@ -1456,15 +1354,13 @@ struct multi_val {
 // assumes the user will specify a variable to store the data into with the
 // cl::location(x) modifier.
 //
-template<class DataType, class StorageClass>
-class bits_storage {
-  unsigned *Location;   // Where to store the bits...
+template <class DataType, class StorageClass> class bits_storage {
+  unsigned *Location; // Where to store the bits...
 
-  template<class T>
-  static unsigned Bit(const T &V) {
+  template <class T> static unsigned Bit(const T &V) {
     unsigned BitPos = reinterpret_cast<unsigned>(V);
     assert(BitPos < sizeof(unsigned) * CHAR_BIT &&
-          "enum exceeds width of bit vector!");
+           "enum exceeds width of bit vector!");
     return 1 << BitPos;
   }
 
@@ -1478,57 +1374,45 @@ public:
     return false;
   }
 
-  template<class T>
-  void addValue(const T &V) {
+  template <class T> void addValue(const T &V) {
     assert(Location != 0 && "cl::location(...) not specified for a command "
-           "line option with external storage!");
+                            "line option with external storage!");
     *Location |= Bit(V);
   }
 
   unsigned getBits() { return *Location; }
 
-  template<class T>
-  bool isSet(const T &V) {
+  template <class T> bool isSet(const T &V) {
     return (*Location & Bit(V)) != 0;
   }
 };
 
-
 // Define how to hold bits.  Since we can inherit from a class, we do so.
 // This makes us exactly compatible with the bits in all cases that it is used.
 //
-template<class DataType>
-class bits_storage<DataType, bool> {
-  unsigned Bits;   // Where to store the bits...
+template <class DataType> class bits_storage<DataType, bool> {
+  unsigned Bits; // Where to store the bits...
 
-  template<class T>
-  static unsigned Bit(const T &V) {
+  template <class T> static unsigned Bit(const T &V) {
     unsigned BitPos = (unsigned)V;
     assert(BitPos < sizeof(unsigned) * CHAR_BIT &&
-          "enum exceeds width of bit vector!");
+           "enum exceeds width of bit vector!");
     return 1 << BitPos;
   }
 
 public:
-  template<class T>
-  void addValue(const T &V) {
-    Bits |=  Bit(V);
-  }
+  template <class T> void addValue(const T &V) { Bits |= Bit(V); }
 
   unsigned getBits() { return Bits; }
 
-  template<class T>
-  bool isSet(const T &V) {
-    return (Bits & Bit(V)) != 0;
-  }
+  template <class T> bool isSet(const T &V) { return (Bits & Bit(V)) != 0; }
 };
 
-
 //===----------------------------------------------------------------------===//
 // bits - A bit vector of command options.
 //
 template <class DataType, class Storage = bool,
-          class ParserClass = parser<DataType> >
+          class ParserClass = parser<DataType>>
 class bits : public Option, public bits_storage<DataType, Storage> {
   std::vector<unsigned> Positions;
   ParserClass Parser;
@@ -1536,16 +1420,17 @@ class bits : public Option, public bits_storage<DataType, Storage> {
   enum ValueExpected getValueExpectedFlagDefault() const override {
     return Parser.getValueExpectedFlagDefault();
   }
-  void getExtraOptionNames(SmallVectorImpl<const char*> &OptionNames) override {
+  void
+  getExtraOptionNames(SmallVectorImpl<const char *> &OptionNames) override {
     return Parser.getExtraOptionNames(OptionNames);
   }
 
   bool handleOccurrence(unsigned pos, StringRef ArgName,
                         StringRef Arg) override {
     typename ParserClass::parser_data_type Val =
-      typename ParserClass::parser_data_type();
+        typename ParserClass::parser_data_type();
     if (Parser.parse(*this, ArgName, Arg, Val))
-      return true;  // Parse Error!
+      return true; // Parse Error!
     this->addValue(Val);
     setPosition(pos);
     Positions.push_back(pos);
@@ -1553,19 +1438,26 @@ class bits : public Option, public bits_storage<DataType, Storage> {
   }
 
   // Forward printing stuff to the parser...
-  size_t getOptionWidth() const override {return Parser.getOptionWidth(*this);}
+  size_t getOptionWidth() const override {
+    return Parser.getOptionWidth(*this);
+  }
   void printOptionInfo(size_t GlobalWidth) const override {
     Parser.printOptionInfo(*this, GlobalWidth);
   }
 
   // Unimplemented: bits options don't currently store their default values.
-  void printOptionValue(size_t /*GlobalWidth*/,
-                        bool /*Force*/) const override {}
+  void printOptionValue(size_t /*GlobalWidth*/, bool /*Force*/) const override {
+  }
 
   void done() {
     addArgument();
-    Parser.initialize(*this);
+    Parser.initialize();
   }
+
+  // Command line options should not be copyable
+  bits(const bits &) = delete;
+  bits &operator=(const bits &) = delete;
+
 public:
   ParserClass &getParser() { return Parser; }
 
@@ -1574,67 +1466,10 @@ public:
     return Positions[optnum];
   }
 
-  // One option...
-  template<class M0t>
-  explicit bits(const M0t &M0) : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this);
-    done();
-  }
-  // Two options...
-  template<class M0t, class M1t>
-  bits(const M0t &M0, const M1t &M1) : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this);
-    done();
-  }
-  // Three options...
-  template<class M0t, class M1t, class M2t>
-  bits(const M0t &M0, const M1t &M1, const M2t &M2)
-    : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this);
-    done();
-  }
-  // Four options...
-  template<class M0t, class M1t, class M2t, class M3t>
-  bits(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3)
-    : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    done();
-  }
-  // Five options...
-  template<class M0t, class M1t, class M2t, class M3t, class M4t>
-  bits(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-       const M4t &M4) : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this);
-    done();
-  }
-  // Six options...
-  template<class M0t, class M1t, class M2t, class M3t,
-           class M4t, class M5t>
-  bits(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-       const M4t &M4, const M5t &M5) : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this); apply(M5, this);
-    done();
-  }
-  // Seven options...
-  template<class M0t, class M1t, class M2t, class M3t,
-           class M4t, class M5t, class M6t>
-  bits(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-       const M4t &M4, const M5t &M5, const M6t &M6)
-    : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this); apply(M5, this); apply(M6, this);
-    done();
-  }
-  // Eight options...
-  template<class M0t, class M1t, class M2t, class M3t,
-           class M4t, class M5t, class M6t, class M7t>
-  bits(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3,
-       const M4t &M4, const M5t &M5, const M6t &M6,
-       const M7t &M7) : Option(ZeroOrMore, NotHidden) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
-    apply(M4, this); apply(M5, this); apply(M6, this); apply(M7, this);
+  template <class... Mods>
+  explicit bits(const Mods &... Ms)
+      : Option(ZeroOrMore, NotHidden), Parser(*this) {
+    apply(this, Ms...);
     done();
   }
 };
@@ -1646,11 +1481,11 @@ public:
 class alias : public Option {
   Option *AliasFor;
   bool handleOccurrence(unsigned pos, StringRef /*ArgName*/,
-                                StringRef Arg) override {
+                        StringRef Arg) override {
     return AliasFor->handleOccurrence(pos, AliasFor->ArgStr, Arg);
   }
-  bool addOccurrence(unsigned pos, StringRef /*ArgName*/,
-                     StringRef Value, bool MultiArg = false) override {
+  bool addOccurrence(unsigned pos, StringRef /*ArgName*/, StringRef Value,
+                     bool MultiArg = false) override {
     return AliasFor->addOccurrence(pos, AliasFor->ArgStr, Value, MultiArg);
   }
   // Handle printing stuff...
@@ -1658,8 +1493,8 @@ class alias : public Option {
   void printOptionInfo(size_t GlobalWidth) const override;
 
   // Aliases do not need to print their values.
-  void printOptionValue(size_t /*GlobalWidth*/,
-                        bool /*Force*/) const override {}
+  void printOptionValue(size_t /*GlobalWidth*/, bool /*Force*/) const override {
+  }
 
   ValueExpected getValueExpectedFlagDefault() const override {
     return AliasFor->getValueExpectedFlag();
@@ -1670,8 +1505,13 @@ class alias : public Option {
       error("cl::alias must have argument name specified!");
     if (!AliasFor)
       error("cl::alias must have an cl::aliasopt(option) specified!");
-      addArgument();
+    addArgument();
   }
+
+  // Command line options should not be copyable
+  alias(const alias &) = delete;
+  alias &operator=(const alias &) = delete;
+
 public:
   void setAliasFor(Option &O) {
     if (AliasFor)
@@ -1679,31 +1519,10 @@ public:
     AliasFor = &O;
   }
 
-  // One option...
-  template<class M0t>
-  explicit alias(const M0t &M0) : Option(Optional, Hidden), AliasFor(nullptr) {
-    apply(M0, this);
-    done();
-  }
-  // Two options...
-  template<class M0t, class M1t>
-  alias(const M0t &M0, const M1t &M1)
-    : Option(Optional, Hidden), AliasFor(nullptr) {
-    apply(M0, this); apply(M1, this);
-    done();
-  }
-  // Three options...
-  template<class M0t, class M1t, class M2t>
-  alias(const M0t &M0, const M1t &M1, const M2t &M2)
-    : Option(Optional, Hidden), AliasFor(nullptr) {
-    apply(M0, this); apply(M1, this); apply(M2, this);
-    done();
-  }
-  // Four options...
-  template<class M0t, class M1t, class M2t, class M3t>
-  alias(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3)
-    : Option(Optional, Hidden), AliasFor(nullptr) {
-    apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this);
+  template <class... Mods>
+  explicit alias(const Mods &... Ms)
+      : Option(Optional, Hidden), AliasFor(nullptr) {
+    apply(this, Ms...);
     done();
   }
 };
@@ -1720,8 +1539,8 @@ struct aliasopt {
 // printed to stderr at the end of the regular help, just before
 // exit is called.
 struct extrahelp {
-  const char * morehelp;
-  explicit extrahelp(const char* help);
+  const char *morehelp;
+  explicit extrahelp(const char *help);
 };
 
 void PrintVersionMessage();
@@ -1733,8 +1552,7 @@ void PrintVersionMessage();
 ///
 /// \param Hidden if true will print hidden options
 /// \param Categorized if true print options in categories
-void PrintHelpMessage(bool Hidden=false, bool Categorized=false);
-
+void PrintHelpMessage(bool Hidden = false, bool Categorized = false);
 
 //===----------------------------------------------------------------------===//
 // Public interface for accessing registered options.
@@ -1743,9 +1561,7 @@ void PrintHelpMessage(bool Hidden=false, bool Categorized=false);
 /// \brief Use this to get a StringMap to all registered named options
 /// (e.g. -help). Note \p Map Should be an empty StringMap.
 ///
-/// \param [out] Map will be filled with mappings where the key is the
-/// Option argument string (e.g. "help") and value is the corresponding
-/// Option*.
+/// \return A reference to the StringMap used by the cl APIs to parse options.
 ///
 /// Access to unnamed arguments (i.e. positional) are not provided because
 /// it is expected that the client already has access to these.
@@ -1753,8 +1569,7 @@ void PrintHelpMessage(bool Hidden=false, bool Categorized=false);
 /// Typical usage:
 /// \code
 /// main(int argc,char* argv[]) {
-/// StringMap<llvm::cl::Option*> opts;
-/// llvm::cl::getRegisteredOptions(opts);
+/// StringMap<llvm::cl::Option*> &opts = llvm::cl::getRegisteredOptions();
 /// assert(opts.count("help") == 1)
 /// opts["help"]->setDescription("Show alphabetical help information")
 /// // More code
@@ -1766,7 +1581,11 @@ void PrintHelpMessage(bool Hidden=false, bool Categorized=false);
 /// This interface is useful for modifying options in libraries that are out of
 /// the control of the client. The options should be modified before calling
 /// llvm::cl::ParseCommandLineOptions().
-void getRegisteredOptions(StringMap<Option*> &Map);
+///
+/// Hopefully this API can be depricated soon. Any situation where options need
+/// to be modified by tools or libraries should be handled by sane APIs rather
+/// than just handing around a global list.
+StringMap<Option *> &getRegisteredOptions();
 
 //===----------------------------------------------------------------------===//
 // Standalone command line processing utilities.
@@ -1776,9 +1595,10 @@ void getRegisteredOptions(StringMap<Option*> &Map);
 /// raw character pointer.
 class StringSaver {
   virtual void anchor();
+
 public:
   virtual const char *SaveString(const char *Str) = 0;
-  virtual ~StringSaver() {};  // Pacify -Wnon-virtual-dtor.
+  virtual ~StringSaver(){}; // Pacify -Wnon-virtual-dtor.
 };
 
 /// \brief Tokenizes a command line that can contain escapes and quotes.
@@ -1836,6 +1656,24 @@ bool ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
                          SmallVectorImpl<const char *> &Argv,
                          bool MarkEOLs = false);
 
+/// \brief Mark all options not part of this category as cl::ReallyHidden.
+///
+/// \param Category the category of options to keep displaying
+///
+/// Some tools (like clang-format) like to be able to hide all options that are
+/// not specific to the tool. This function allows a tool to specify a single
+/// option category to display in the -help output.
+void HideUnrelatedOptions(cl::OptionCategory &Category);
+
+/// \brief Mark all options not part of the categories as cl::ReallyHidden.
+///
+/// \param Categories the categories of options to keep displaying.
+///
+/// Some tools (like clang-format) like to be able to hide all options that are
+/// not specific to the tool. This function allows a tool to specify a single
+/// option category to display in the -help output.
+void HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *> Categories);
+
 } // End namespace cl
 
 } // End namespace llvm
diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h
index d008fec..f7de840 100644
--- a/include/llvm/Support/Compiler.h
+++ b/include/llvm/Support/Compiler.h
@@ -52,14 +52,14 @@
 /// \macro LLVM_MSC_PREREQ
 /// \brief Is the compiler MSVC of at least the specified version?
 /// The common \param version values to check for are:
-///  * 1700: Microsoft Visual Studio 2012 / 11.0
 ///  * 1800: Microsoft Visual Studio 2013 / 12.0
+///  * 1900: Microsoft Visual Studio 2015 / 14.0
 #ifdef _MSC_VER
 #define LLVM_MSC_PREREQ(version) (_MSC_VER >= (version))
 
-// We require at least MSVC 2012.
-#if !LLVM_MSC_PREREQ(1700)
-#error LLVM requires at least MSVC 2012.
+// We require at least MSVC 2013.
+#if !LLVM_MSC_PREREQ(1800)
+#error LLVM requires at least MSVC 2013.
 #endif
 
 #else
@@ -82,16 +82,6 @@
 #define LLVM_HAS_RVALUE_REFERENCE_THIS 0
 #endif
 
-/// \macro LLVM_HAS_VARIADIC_TEMPLATES
-/// \brief Does this compiler support variadic templates.
-///
-/// Implies LLVM_HAS_RVALUE_REFERENCES and the existence of std::forward.
-#if __has_feature(cxx_variadic_templates) || LLVM_MSC_PREREQ(1800)
-# define LLVM_HAS_VARIADIC_TEMPLATES 1
-#else
-# define LLVM_HAS_VARIADIC_TEMPLATES 0
-#endif
-
 /// Expands to '&' if r-value references are supported.
 ///
 /// This can be used to provide l-value/r-value overrides of member functions.
@@ -102,24 +92,6 @@
 #define LLVM_LVALUE_FUNCTION
 #endif
 
-/// LLVM_DELETED_FUNCTION - Expands to = delete if the compiler supports it.
-/// Use to mark functions as uncallable. Member functions with this should
-/// be declared private so that some behavior is kept in C++03 mode.
-///
-/// class DontCopy {
-/// private:
-///   DontCopy(const DontCopy&) LLVM_DELETED_FUNCTION;
-///   DontCopy &operator =(const DontCopy&) LLVM_DELETED_FUNCTION;
-/// public:
-///   ...
-/// };
-#if __has_feature(cxx_deleted_functions) || \
-    defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1800)
-#define LLVM_DELETED_FUNCTION = delete
-#else
-#define LLVM_DELETED_FUNCTION
-#endif
-
 #if __has_feature(cxx_constexpr) || defined(__GXX_EXPERIMENTAL_CXX0X__)
 # define LLVM_CONSTEXPR constexpr
 #else
@@ -287,6 +259,12 @@
 /// which causes the program to exit abnormally.
 #if __has_builtin(__builtin_trap) || LLVM_GNUC_PREREQ(4, 3, 0)
 # define LLVM_BUILTIN_TRAP __builtin_trap()
+#elif defined(_MSC_VER)
+// The __debugbreak intrinsic is supported by MSVC, does not require forward
+// declarations involving platform-specific typedefs (unlike RaiseException),
+// results in a call to vectored exception handlers, and encodes to a short
+// instruction that still causes the trapping behavior we want.
+# define LLVM_BUILTIN_TRAP __debugbreak()
 #else
 # define LLVM_BUILTIN_TRAP *(volatile int*)0x11 = 0
 #endif
@@ -344,26 +322,6 @@
 # define LLVM_IS_UNALIGNED_ACCESS_FAST 0
 #endif
 
-/// \macro LLVM_EXPLICIT
-/// \brief Expands to explicit on compilers which support explicit conversion
-/// operators. Otherwise expands to nothing.
-#if __has_feature(cxx_explicit_conversions) || \
-    defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1800)
-#define LLVM_EXPLICIT explicit
-#else
-#define LLVM_EXPLICIT
-#endif
-
-/// \brief Does the compiler support generalized initializers (using braced
-/// lists and std::initializer_list).  While clang may claim it supports general
-/// initializers, if we're using MSVC's headers, we might not have a usable
-/// std::initializer list type from the STL.  Disable this for now.
-#if __has_feature(cxx_generalized_initializers) && !defined(_MSC_VER)
-#define LLVM_HAS_INITIALIZER_LISTS 1
-#else
-#define LLVM_HAS_INITIALIZER_LISTS 0
-#endif
-
 /// \brief Mark debug helper function definitions like dump() that should not be
 /// stripped from debug builds.
 // FIXME: Move this to a private config.h as it's not usable in public headers.
@@ -373,4 +331,33 @@
 #define LLVM_DUMP_METHOD LLVM_ATTRIBUTE_NOINLINE
 #endif
 
+/// \macro LLVM_THREAD_LOCAL
+/// \brief A thread-local storage specifier which can be used with globals,
+/// extern globals, and static globals.
+///
+/// This is essentially an extremely restricted analog to C++11's thread_local
+/// support, and uses that when available. However, it falls back on
+/// platform-specific or vendor-provided extensions when necessary. These
+/// extensions don't support many of the C++11 thread_local's features. You
+/// should only use this for PODs that you can statically initialize to
+/// some constant value. In almost all circumstances this is most appropriate
+/// for use with a pointer, integer, or small aggregation of pointers and
+/// integers.
+#if LLVM_ENABLE_THREADS
+#if __has_feature(cxx_thread_local)
+#define LLVM_THREAD_LOCAL thread_local
+#elif defined(_MSC_VER)
+// MSVC supports this with a __declspec.
+#define LLVM_THREAD_LOCAL __declspec(thread)
+#else
+// Clang, GCC, and other compatible compilers used __thread prior to C++11 and
+// we only need the restricted functionality that provides.
+#define LLVM_THREAD_LOCAL __thread
+#endif
+#else // !LLVM_ENABLE_THREADS
+// If threading is disabled entirely, this compiles to nothing and you get
+// a normal global variable.
+#define LLVM_THREAD_LOCAL
+#endif
+
 #endif
diff --git a/include/llvm/Support/Compression.h b/include/llvm/Support/Compression.h
index 8152b60..88727fa 100644
--- a/include/llvm/Support/Compression.h
+++ b/include/llvm/Support/Compression.h
@@ -14,9 +14,9 @@
 #ifndef LLVM_SUPPORT_COMPRESSION_H
 #define LLVM_SUPPORT_COMPRESSION_H
 
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/DataTypes.h"
 #include <memory>
-#include "llvm/ADT/SmallVector.h"
 
 namespace llvm {
 
diff --git a/include/llvm/Support/ConvertUTF.h b/include/llvm/Support/ConvertUTF.h
index a184d0d..38952ec 100644
--- a/include/llvm/Support/ConvertUTF.h
+++ b/include/llvm/Support/ConvertUTF.h
@@ -251,6 +251,14 @@ bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
  */
 bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
 
+/**
+ * Converts a UTF-8 string into a UTF-16 string with native endianness.
+ *
+ * \returns true on success
+ */
+bool convertUTF8ToUTF16String(StringRef SrcUTF8,
+                              SmallVectorImpl<UTF16> &DstUTF16);
+
 } /* end namespace llvm */
 
 #endif
diff --git a/include/llvm/Support/CrashRecoveryContext.h b/include/llvm/Support/CrashRecoveryContext.h
index f1e636d..1f3965c 100644
--- a/include/llvm/Support/CrashRecoveryContext.h
+++ b/include/llvm/Support/CrashRecoveryContext.h
@@ -10,9 +10,8 @@
 #ifndef LLVM_SUPPORT_CRASHRECOVERYCONTEXT_H
 #define LLVM_SUPPORT_CRASHRECOVERYCONTEXT_H
 
-#include <string>
-
 #include "llvm/ADT/STLExtras.h"
+#include <string>
 
 namespace llvm {
 class StringRef;
diff --git a/include/llvm/Support/Dwarf.def b/include/llvm/Support/Dwarf.def
new file mode 100644
index 0000000..c663af9
--- /dev/null
+++ b/include/llvm/Support/Dwarf.def
@@ -0,0 +1,351 @@
+//===- llvm/Support/Dwarf.def - Dwarf definitions ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Macros for running through Dwarf enumerators.
+//
+//===----------------------------------------------------------------------===//
+
+// TODO: Add other DW-based macros.
+#if !(defined HANDLE_DW_TAG || defined HANDLE_DW_OP ||                         \
+      defined HANDLE_DW_LANG || defined HANDLE_DW_ATE ||                       \
+      defined HANDLE_DW_VIRTUALITY)
+#error "Missing macro definition of HANDLE_DW*"
+#endif
+
+#ifndef HANDLE_DW_TAG
+#define HANDLE_DW_TAG(ID, NAME)
+#endif
+
+#ifndef HANDLE_DW_OP
+#define HANDLE_DW_OP(ID, NAME)
+#endif
+
+#ifndef HANDLE_DW_LANG
+#define HANDLE_DW_LANG(ID, NAME)
+#endif
+
+#ifndef HANDLE_DW_ATE
+#define HANDLE_DW_ATE(ID, NAME)
+#endif
+
+#ifndef HANDLE_DW_VIRTUALITY
+#define HANDLE_DW_VIRTUALITY(ID, NAME)
+#endif
+
+HANDLE_DW_TAG(0x0001, array_type)
+HANDLE_DW_TAG(0x0002, class_type)
+HANDLE_DW_TAG(0x0003, entry_point)
+HANDLE_DW_TAG(0x0004, enumeration_type)
+HANDLE_DW_TAG(0x0005, formal_parameter)
+HANDLE_DW_TAG(0x0008, imported_declaration)
+HANDLE_DW_TAG(0x000a, label)
+HANDLE_DW_TAG(0x000b, lexical_block)
+HANDLE_DW_TAG(0x000d, member)
+HANDLE_DW_TAG(0x000f, pointer_type)
+HANDLE_DW_TAG(0x0010, reference_type)
+HANDLE_DW_TAG(0x0011, compile_unit)
+HANDLE_DW_TAG(0x0012, string_type)
+HANDLE_DW_TAG(0x0013, structure_type)
+HANDLE_DW_TAG(0x0015, subroutine_type)
+HANDLE_DW_TAG(0x0016, typedef)
+HANDLE_DW_TAG(0x0017, union_type)
+HANDLE_DW_TAG(0x0018, unspecified_parameters)
+HANDLE_DW_TAG(0x0019, variant)
+HANDLE_DW_TAG(0x001a, common_block)
+HANDLE_DW_TAG(0x001b, common_inclusion)
+HANDLE_DW_TAG(0x001c, inheritance)
+HANDLE_DW_TAG(0x001d, inlined_subroutine)
+HANDLE_DW_TAG(0x001e, module)
+HANDLE_DW_TAG(0x001f, ptr_to_member_type)
+HANDLE_DW_TAG(0x0020, set_type)
+HANDLE_DW_TAG(0x0021, subrange_type)
+HANDLE_DW_TAG(0x0022, with_stmt)
+HANDLE_DW_TAG(0x0023, access_declaration)
+HANDLE_DW_TAG(0x0024, base_type)
+HANDLE_DW_TAG(0x0025, catch_block)
+HANDLE_DW_TAG(0x0026, const_type)
+HANDLE_DW_TAG(0x0027, constant)
+HANDLE_DW_TAG(0x0028, enumerator)
+HANDLE_DW_TAG(0x0029, file_type)
+HANDLE_DW_TAG(0x002a, friend)
+HANDLE_DW_TAG(0x002b, namelist)
+HANDLE_DW_TAG(0x002c, namelist_item)
+HANDLE_DW_TAG(0x002d, packed_type)
+HANDLE_DW_TAG(0x002e, subprogram)
+HANDLE_DW_TAG(0x002f, template_type_parameter)
+HANDLE_DW_TAG(0x0030, template_value_parameter)
+HANDLE_DW_TAG(0x0031, thrown_type)
+HANDLE_DW_TAG(0x0032, try_block)
+HANDLE_DW_TAG(0x0033, variant_part)
+HANDLE_DW_TAG(0x0034, variable)
+HANDLE_DW_TAG(0x0035, volatile_type)
+HANDLE_DW_TAG(0x0036, dwarf_procedure)
+HANDLE_DW_TAG(0x0037, restrict_type)
+HANDLE_DW_TAG(0x0038, interface_type)
+HANDLE_DW_TAG(0x0039, namespace)
+HANDLE_DW_TAG(0x003a, imported_module)
+HANDLE_DW_TAG(0x003b, unspecified_type)
+HANDLE_DW_TAG(0x003c, partial_unit)
+HANDLE_DW_TAG(0x003d, imported_unit)
+HANDLE_DW_TAG(0x003f, condition)
+HANDLE_DW_TAG(0x0040, shared_type)
+HANDLE_DW_TAG(0x0041, type_unit)
+HANDLE_DW_TAG(0x0042, rvalue_reference_type)
+HANDLE_DW_TAG(0x0043, template_alias)
+
+// Mock tags we use as discriminators.
+HANDLE_DW_TAG(0x0100, auto_variable) // Tag for local (auto) variables.
+HANDLE_DW_TAG(0x0101, arg_variable)  // Tag for argument variables.
+HANDLE_DW_TAG(0x0102, expression)    // Tag for complex address expressions.
+
+// New in DWARF v5.
+HANDLE_DW_TAG(0x0044, coarray_type)
+HANDLE_DW_TAG(0x0045, generic_subrange)
+HANDLE_DW_TAG(0x0046, dynamic_type)
+
+// User-defined tags.
+HANDLE_DW_TAG(0x4081, MIPS_loop)
+HANDLE_DW_TAG(0x4101, format_label)
+HANDLE_DW_TAG(0x4102, function_template)
+HANDLE_DW_TAG(0x4103, class_template)
+HANDLE_DW_TAG(0x4106, GNU_template_template_param)
+HANDLE_DW_TAG(0x4107, GNU_template_parameter_pack)
+HANDLE_DW_TAG(0x4108, GNU_formal_parameter_pack)
+HANDLE_DW_TAG(0x4200, APPLE_property)
+
+HANDLE_DW_OP(0x03, addr)
+HANDLE_DW_OP(0x06, deref)
+HANDLE_DW_OP(0x08, const1u)
+HANDLE_DW_OP(0x09, const1s)
+HANDLE_DW_OP(0x0a, const2u)
+HANDLE_DW_OP(0x0b, const2s)
+HANDLE_DW_OP(0x0c, const4u)
+HANDLE_DW_OP(0x0d, const4s)
+HANDLE_DW_OP(0x0e, const8u)
+HANDLE_DW_OP(0x0f, const8s)
+HANDLE_DW_OP(0x10, constu)
+HANDLE_DW_OP(0x11, consts)
+HANDLE_DW_OP(0x12, dup)
+HANDLE_DW_OP(0x13, drop)
+HANDLE_DW_OP(0x14, over)
+HANDLE_DW_OP(0x15, pick)
+HANDLE_DW_OP(0x16, swap)
+HANDLE_DW_OP(0x17, rot)
+HANDLE_DW_OP(0x18, xderef)
+HANDLE_DW_OP(0x19, abs)
+HANDLE_DW_OP(0x1a, and)
+HANDLE_DW_OP(0x1b, div)
+HANDLE_DW_OP(0x1c, minus)
+HANDLE_DW_OP(0x1d, mod)
+HANDLE_DW_OP(0x1e, mul)
+HANDLE_DW_OP(0x1f, neg)
+HANDLE_DW_OP(0x20, not)
+HANDLE_DW_OP(0x21, or )
+HANDLE_DW_OP(0x22, plus)
+HANDLE_DW_OP(0x23, plus_uconst)
+HANDLE_DW_OP(0x24, shl)
+HANDLE_DW_OP(0x25, shr)
+HANDLE_DW_OP(0x26, shra)
+HANDLE_DW_OP(0x27, xor)
+HANDLE_DW_OP(0x2f, skip)
+HANDLE_DW_OP(0x28, bra)
+HANDLE_DW_OP(0x29, eq)
+HANDLE_DW_OP(0x2a, ge)
+HANDLE_DW_OP(0x2b, gt)
+HANDLE_DW_OP(0x2c, le)
+HANDLE_DW_OP(0x2d, lt)
+HANDLE_DW_OP(0x2e, ne)
+HANDLE_DW_OP(0x30, lit0)
+HANDLE_DW_OP(0x31, lit1)
+HANDLE_DW_OP(0x32, lit2)
+HANDLE_DW_OP(0x33, lit3)
+HANDLE_DW_OP(0x34, lit4)
+HANDLE_DW_OP(0x35, lit5)
+HANDLE_DW_OP(0x36, lit6)
+HANDLE_DW_OP(0x37, lit7)
+HANDLE_DW_OP(0x38, lit8)
+HANDLE_DW_OP(0x39, lit9)
+HANDLE_DW_OP(0x3a, lit10)
+HANDLE_DW_OP(0x3b, lit11)
+HANDLE_DW_OP(0x3c, lit12)
+HANDLE_DW_OP(0x3d, lit13)
+HANDLE_DW_OP(0x3e, lit14)
+HANDLE_DW_OP(0x3f, lit15)
+HANDLE_DW_OP(0x40, lit16)
+HANDLE_DW_OP(0x41, lit17)
+HANDLE_DW_OP(0x42, lit18)
+HANDLE_DW_OP(0x43, lit19)
+HANDLE_DW_OP(0x44, lit20)
+HANDLE_DW_OP(0x45, lit21)
+HANDLE_DW_OP(0x46, lit22)
+HANDLE_DW_OP(0x47, lit23)
+HANDLE_DW_OP(0x48, lit24)
+HANDLE_DW_OP(0x49, lit25)
+HANDLE_DW_OP(0x4a, lit26)
+HANDLE_DW_OP(0x4b, lit27)
+HANDLE_DW_OP(0x4c, lit28)
+HANDLE_DW_OP(0x4d, lit29)
+HANDLE_DW_OP(0x4e, lit30)
+HANDLE_DW_OP(0x4f, lit31)
+HANDLE_DW_OP(0x50, reg0)
+HANDLE_DW_OP(0x51, reg1)
+HANDLE_DW_OP(0x52, reg2)
+HANDLE_DW_OP(0x53, reg3)
+HANDLE_DW_OP(0x54, reg4)
+HANDLE_DW_OP(0x55, reg5)
+HANDLE_DW_OP(0x56, reg6)
+HANDLE_DW_OP(0x57, reg7)
+HANDLE_DW_OP(0x58, reg8)
+HANDLE_DW_OP(0x59, reg9)
+HANDLE_DW_OP(0x5a, reg10)
+HANDLE_DW_OP(0x5b, reg11)
+HANDLE_DW_OP(0x5c, reg12)
+HANDLE_DW_OP(0x5d, reg13)
+HANDLE_DW_OP(0x5e, reg14)
+HANDLE_DW_OP(0x5f, reg15)
+HANDLE_DW_OP(0x60, reg16)
+HANDLE_DW_OP(0x61, reg17)
+HANDLE_DW_OP(0x62, reg18)
+HANDLE_DW_OP(0x63, reg19)
+HANDLE_DW_OP(0x64, reg20)
+HANDLE_DW_OP(0x65, reg21)
+HANDLE_DW_OP(0x66, reg22)
+HANDLE_DW_OP(0x67, reg23)
+HANDLE_DW_OP(0x68, reg24)
+HANDLE_DW_OP(0x69, reg25)
+HANDLE_DW_OP(0x6a, reg26)
+HANDLE_DW_OP(0x6b, reg27)
+HANDLE_DW_OP(0x6c, reg28)
+HANDLE_DW_OP(0x6d, reg29)
+HANDLE_DW_OP(0x6e, reg30)
+HANDLE_DW_OP(0x6f, reg31)
+HANDLE_DW_OP(0x70, breg0)
+HANDLE_DW_OP(0x71, breg1)
+HANDLE_DW_OP(0x72, breg2)
+HANDLE_DW_OP(0x73, breg3)
+HANDLE_DW_OP(0x74, breg4)
+HANDLE_DW_OP(0x75, breg5)
+HANDLE_DW_OP(0x76, breg6)
+HANDLE_DW_OP(0x77, breg7)
+HANDLE_DW_OP(0x78, breg8)
+HANDLE_DW_OP(0x79, breg9)
+HANDLE_DW_OP(0x7a, breg10)
+HANDLE_DW_OP(0x7b, breg11)
+HANDLE_DW_OP(0x7c, breg12)
+HANDLE_DW_OP(0x7d, breg13)
+HANDLE_DW_OP(0x7e, breg14)
+HANDLE_DW_OP(0x7f, breg15)
+HANDLE_DW_OP(0x80, breg16)
+HANDLE_DW_OP(0x81, breg17)
+HANDLE_DW_OP(0x82, breg18)
+HANDLE_DW_OP(0x83, breg19)
+HANDLE_DW_OP(0x84, breg20)
+HANDLE_DW_OP(0x85, breg21)
+HANDLE_DW_OP(0x86, breg22)
+HANDLE_DW_OP(0x87, breg23)
+HANDLE_DW_OP(0x88, breg24)
+HANDLE_DW_OP(0x89, breg25)
+HANDLE_DW_OP(0x8a, breg26)
+HANDLE_DW_OP(0x8b, breg27)
+HANDLE_DW_OP(0x8c, breg28)
+HANDLE_DW_OP(0x8d, breg29)
+HANDLE_DW_OP(0x8e, breg30)
+HANDLE_DW_OP(0x8f, breg31)
+HANDLE_DW_OP(0x90, regx)
+HANDLE_DW_OP(0x91, fbreg)
+HANDLE_DW_OP(0x92, bregx)
+HANDLE_DW_OP(0x93, piece)
+HANDLE_DW_OP(0x94, deref_size)
+HANDLE_DW_OP(0x95, xderef_size)
+HANDLE_DW_OP(0x96, nop)
+HANDLE_DW_OP(0x97, push_object_address)
+HANDLE_DW_OP(0x98, call2)
+HANDLE_DW_OP(0x99, call4)
+HANDLE_DW_OP(0x9a, call_ref)
+HANDLE_DW_OP(0x9b, form_tls_address)
+HANDLE_DW_OP(0x9c, call_frame_cfa)
+HANDLE_DW_OP(0x9d, bit_piece)
+HANDLE_DW_OP(0x9e, implicit_value)
+HANDLE_DW_OP(0x9f, stack_value)
+
+// Extensions for GNU-style thread-local storage.
+HANDLE_DW_OP(0xe0, GNU_push_tls_address)
+
+// Extensions for Fission proposal.
+HANDLE_DW_OP(0xfb, GNU_addr_index)
+HANDLE_DW_OP(0xfc, GNU_const_index)
+
+// DWARF languages.
+HANDLE_DW_LANG(0x0001, C89)
+HANDLE_DW_LANG(0x0002, C)
+HANDLE_DW_LANG(0x0003, Ada83)
+HANDLE_DW_LANG(0x0004, C_plus_plus)
+HANDLE_DW_LANG(0x0005, Cobol74)
+HANDLE_DW_LANG(0x0006, Cobol85)
+HANDLE_DW_LANG(0x0007, Fortran77)
+HANDLE_DW_LANG(0x0008, Fortran90)
+HANDLE_DW_LANG(0x0009, Pascal83)
+HANDLE_DW_LANG(0x000a, Modula2)
+HANDLE_DW_LANG(0x000b, Java)
+HANDLE_DW_LANG(0x000c, C99)
+HANDLE_DW_LANG(0x000d, Ada95)
+HANDLE_DW_LANG(0x000e, Fortran95)
+HANDLE_DW_LANG(0x000f, PLI)
+HANDLE_DW_LANG(0x0010, ObjC)
+HANDLE_DW_LANG(0x0011, ObjC_plus_plus)
+HANDLE_DW_LANG(0x0012, UPC)
+HANDLE_DW_LANG(0x0013, D)
+
+// New in DWARF 5:
+HANDLE_DW_LANG(0x0014, Python)
+HANDLE_DW_LANG(0x0015, OpenCL)
+HANDLE_DW_LANG(0x0016, Go)
+HANDLE_DW_LANG(0x0017, Modula3)
+HANDLE_DW_LANG(0x0018, Haskell)
+HANDLE_DW_LANG(0x0019, C_plus_plus_03)
+HANDLE_DW_LANG(0x001a, C_plus_plus_11)
+HANDLE_DW_LANG(0x001b, OCaml)
+HANDLE_DW_LANG(0x001c, Rust)
+HANDLE_DW_LANG(0x001d, C11)
+HANDLE_DW_LANG(0x001e, Swift)
+HANDLE_DW_LANG(0x001f, Julia)
+HANDLE_DW_LANG(0x0020, Dylan)
+HANDLE_DW_LANG(0x0021, C_plus_plus_14)
+HANDLE_DW_LANG(0x0022, Fortran03)
+HANDLE_DW_LANG(0x0023, Fortran08)
+HANDLE_DW_LANG(0x8001, Mips_Assembler)
+
+// DWARF attribute type encodings.
+HANDLE_DW_ATE(0x01, address)
+HANDLE_DW_ATE(0x02, boolean)
+HANDLE_DW_ATE(0x03, complex_float)
+HANDLE_DW_ATE(0x04, float)
+HANDLE_DW_ATE(0x05, signed)
+HANDLE_DW_ATE(0x06, signed_char)
+HANDLE_DW_ATE(0x07, unsigned)
+HANDLE_DW_ATE(0x08, unsigned_char)
+HANDLE_DW_ATE(0x09, imaginary_float)
+HANDLE_DW_ATE(0x0a, packed_decimal)
+HANDLE_DW_ATE(0x0b, numeric_string)
+HANDLE_DW_ATE(0x0c, edited)
+HANDLE_DW_ATE(0x0d, signed_fixed)
+HANDLE_DW_ATE(0x0e, unsigned_fixed)
+HANDLE_DW_ATE(0x0f, decimal_float)
+HANDLE_DW_ATE(0x10, UTF)
+
+// DWARF virtuality codes.
+HANDLE_DW_VIRTUALITY(0x00, none)
+HANDLE_DW_VIRTUALITY(0x01, virtual)
+HANDLE_DW_VIRTUALITY(0x02, pure_virtual)
+
+#undef HANDLE_DW_TAG
+#undef HANDLE_DW_OP
+#undef HANDLE_DW_LANG
+#undef HANDLE_DW_ATE
+#undef HANDLE_DW_VIRTUALITY
diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h
index 47b00b1..c294a72 100644
--- a/include/llvm/Support/Dwarf.h
+++ b/include/llvm/Support/Dwarf.h
@@ -20,6 +20,7 @@
 #ifndef LLVM_SUPPORT_DWARF_H
 #define LLVM_SUPPORT_DWARF_H
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
 
@@ -36,15 +37,11 @@ namespace dwarf {
 // enumeration base type.
 
 enum LLVMConstants : uint32_t {
-  // llvm mock tags
-  DW_TAG_invalid = ~0U, // Tag for invalid results.
-
-  DW_TAG_auto_variable = 0x100, // Tag for local (auto) variables.
-  DW_TAG_arg_variable = 0x101,  // Tag for argument variables.
-  DW_TAG_expression = 0x102,    // Tag for complex address expressions.
-
-  DW_TAG_user_base = 0x1000, // Recommended base for user tags.
+  // LLVM mock tags (see also llvm/Support/Dwarf.def).
+  DW_TAG_invalid = ~0U,        // Tag for invalid results.
+  DW_VIRTUALITY_invalid = ~0U, // Virtuality for invalid results.
 
+  // Other constants.
   DWARF_VERSION = 4,       // Default dwarf version we output.
   DW_PUBTYPES_VERSION = 2, // Section version number for .debug_pubtypes.
   DW_PUBNAMES_VERSION = 2, // Section version number for .debug_pubnames.
@@ -57,82 +54,11 @@ const uint32_t DW_CIE_ID = UINT32_MAX;
 const uint64_t DW64_CIE_ID = UINT64_MAX;
 
 enum Tag : uint16_t {
-  DW_TAG_array_type = 0x01,
-  DW_TAG_class_type = 0x02,
-  DW_TAG_entry_point = 0x03,
-  DW_TAG_enumeration_type = 0x04,
-  DW_TAG_formal_parameter = 0x05,
-  DW_TAG_imported_declaration = 0x08,
-  DW_TAG_label = 0x0a,
-  DW_TAG_lexical_block = 0x0b,
-  DW_TAG_member = 0x0d,
-  DW_TAG_pointer_type = 0x0f,
-  DW_TAG_reference_type = 0x10,
-  DW_TAG_compile_unit = 0x11,
-  DW_TAG_string_type = 0x12,
-  DW_TAG_structure_type = 0x13,
-  DW_TAG_subroutine_type = 0x15,
-  DW_TAG_typedef = 0x16,
-  DW_TAG_union_type = 0x17,
-  DW_TAG_unspecified_parameters = 0x18,
-  DW_TAG_variant = 0x19,
-  DW_TAG_common_block = 0x1a,
-  DW_TAG_common_inclusion = 0x1b,
-  DW_TAG_inheritance = 0x1c,
-  DW_TAG_inlined_subroutine = 0x1d,
-  DW_TAG_module = 0x1e,
-  DW_TAG_ptr_to_member_type = 0x1f,
-  DW_TAG_set_type = 0x20,
-  DW_TAG_subrange_type = 0x21,
-  DW_TAG_with_stmt = 0x22,
-  DW_TAG_access_declaration = 0x23,
-  DW_TAG_base_type = 0x24,
-  DW_TAG_catch_block = 0x25,
-  DW_TAG_const_type = 0x26,
-  DW_TAG_constant = 0x27,
-  DW_TAG_enumerator = 0x28,
-  DW_TAG_file_type = 0x29,
-  DW_TAG_friend = 0x2a,
-  DW_TAG_namelist = 0x2b,
-  DW_TAG_namelist_item = 0x2c,
-  DW_TAG_packed_type = 0x2d,
-  DW_TAG_subprogram = 0x2e,
-  DW_TAG_template_type_parameter = 0x2f,
-  DW_TAG_template_value_parameter = 0x30,
-  DW_TAG_thrown_type = 0x31,
-  DW_TAG_try_block = 0x32,
-  DW_TAG_variant_part = 0x33,
-  DW_TAG_variable = 0x34,
-  DW_TAG_volatile_type = 0x35,
-  DW_TAG_dwarf_procedure = 0x36,
-  DW_TAG_restrict_type = 0x37,
-  DW_TAG_interface_type = 0x38,
-  DW_TAG_namespace = 0x39,
-  DW_TAG_imported_module = 0x3a,
-  DW_TAG_unspecified_type = 0x3b,
-  DW_TAG_partial_unit = 0x3c,
-  DW_TAG_imported_unit = 0x3d,
-  DW_TAG_condition = 0x3f,
-  DW_TAG_shared_type = 0x40,
-  DW_TAG_type_unit = 0x41,
-  DW_TAG_rvalue_reference_type = 0x42,
-  DW_TAG_template_alias = 0x43,
-
-  // New in DWARF 5:
-  DW_TAG_coarray_type = 0x44,
-  DW_TAG_generic_subrange = 0x45,
-  DW_TAG_dynamic_type = 0x46,
-
-  DW_TAG_MIPS_loop = 0x4081,
-  DW_TAG_format_label = 0x4101,
-  DW_TAG_function_template = 0x4102,
-  DW_TAG_class_template = 0x4103,
-  DW_TAG_GNU_template_template_param = 0x4106,
-  DW_TAG_GNU_template_parameter_pack = 0x4107,
-  DW_TAG_GNU_formal_parameter_pack = 0x4108,
+#define HANDLE_DW_TAG(ID, NAME) DW_TAG_##NAME = ID,
+#include "llvm/Support/Dwarf.def"
   DW_TAG_lo_user = 0x4080,
-  DW_TAG_APPLE_property = 0x4200,
-  DW_TAG_hi_user = 0xffff
+  DW_TAG_hi_user = 0xffff,
+  DW_TAG_user_base = 0x1000 // Recommended base for user tags.
 };
 
 inline bool isType(Tag T) {
@@ -363,190 +289,15 @@ enum Form : uint16_t {
 };
 
 enum LocationAtom {
-  // Operation encodings
-  DW_OP_addr = 0x03,
-  DW_OP_deref = 0x06,
-  DW_OP_const1u = 0x08,
-  DW_OP_const1s = 0x09,
-  DW_OP_const2u = 0x0a,
-  DW_OP_const2s = 0x0b,
-  DW_OP_const4u = 0x0c,
-  DW_OP_const4s = 0x0d,
-  DW_OP_const8u = 0x0e,
-  DW_OP_const8s = 0x0f,
-  DW_OP_constu = 0x10,
-  DW_OP_consts = 0x11,
-  DW_OP_dup = 0x12,
-  DW_OP_drop = 0x13,
-  DW_OP_over = 0x14,
-  DW_OP_pick = 0x15,
-  DW_OP_swap = 0x16,
-  DW_OP_rot = 0x17,
-  DW_OP_xderef = 0x18,
-  DW_OP_abs = 0x19,
-  DW_OP_and = 0x1a,
-  DW_OP_div = 0x1b,
-  DW_OP_minus = 0x1c,
-  DW_OP_mod = 0x1d,
-  DW_OP_mul = 0x1e,
-  DW_OP_neg = 0x1f,
-  DW_OP_not = 0x20,
-  DW_OP_or = 0x21,
-  DW_OP_plus = 0x22,
-  DW_OP_plus_uconst = 0x23,
-  DW_OP_shl = 0x24,
-  DW_OP_shr = 0x25,
-  DW_OP_shra = 0x26,
-  DW_OP_xor = 0x27,
-  DW_OP_skip = 0x2f,
-  DW_OP_bra = 0x28,
-  DW_OP_eq = 0x29,
-  DW_OP_ge = 0x2a,
-  DW_OP_gt = 0x2b,
-  DW_OP_le = 0x2c,
-  DW_OP_lt = 0x2d,
-  DW_OP_ne = 0x2e,
-  DW_OP_lit0 = 0x30,
-  DW_OP_lit1 = 0x31,
-  DW_OP_lit2 = 0x32,
-  DW_OP_lit3 = 0x33,
-  DW_OP_lit4 = 0x34,
-  DW_OP_lit5 = 0x35,
-  DW_OP_lit6 = 0x36,
-  DW_OP_lit7 = 0x37,
-  DW_OP_lit8 = 0x38,
-  DW_OP_lit9 = 0x39,
-  DW_OP_lit10 = 0x3a,
-  DW_OP_lit11 = 0x3b,
-  DW_OP_lit12 = 0x3c,
-  DW_OP_lit13 = 0x3d,
-  DW_OP_lit14 = 0x3e,
-  DW_OP_lit15 = 0x3f,
-  DW_OP_lit16 = 0x40,
-  DW_OP_lit17 = 0x41,
-  DW_OP_lit18 = 0x42,
-  DW_OP_lit19 = 0x43,
-  DW_OP_lit20 = 0x44,
-  DW_OP_lit21 = 0x45,
-  DW_OP_lit22 = 0x46,
-  DW_OP_lit23 = 0x47,
-  DW_OP_lit24 = 0x48,
-  DW_OP_lit25 = 0x49,
-  DW_OP_lit26 = 0x4a,
-  DW_OP_lit27 = 0x4b,
-  DW_OP_lit28 = 0x4c,
-  DW_OP_lit29 = 0x4d,
-  DW_OP_lit30 = 0x4e,
-  DW_OP_lit31 = 0x4f,
-  DW_OP_reg0 = 0x50,
-  DW_OP_reg1 = 0x51,
-  DW_OP_reg2 = 0x52,
-  DW_OP_reg3 = 0x53,
-  DW_OP_reg4 = 0x54,
-  DW_OP_reg5 = 0x55,
-  DW_OP_reg6 = 0x56,
-  DW_OP_reg7 = 0x57,
-  DW_OP_reg8 = 0x58,
-  DW_OP_reg9 = 0x59,
-  DW_OP_reg10 = 0x5a,
-  DW_OP_reg11 = 0x5b,
-  DW_OP_reg12 = 0x5c,
-  DW_OP_reg13 = 0x5d,
-  DW_OP_reg14 = 0x5e,
-  DW_OP_reg15 = 0x5f,
-  DW_OP_reg16 = 0x60,
-  DW_OP_reg17 = 0x61,
-  DW_OP_reg18 = 0x62,
-  DW_OP_reg19 = 0x63,
-  DW_OP_reg20 = 0x64,
-  DW_OP_reg21 = 0x65,
-  DW_OP_reg22 = 0x66,
-  DW_OP_reg23 = 0x67,
-  DW_OP_reg24 = 0x68,
-  DW_OP_reg25 = 0x69,
-  DW_OP_reg26 = 0x6a,
-  DW_OP_reg27 = 0x6b,
-  DW_OP_reg28 = 0x6c,
-  DW_OP_reg29 = 0x6d,
-  DW_OP_reg30 = 0x6e,
-  DW_OP_reg31 = 0x6f,
-  DW_OP_breg0 = 0x70,
-  DW_OP_breg1 = 0x71,
-  DW_OP_breg2 = 0x72,
-  DW_OP_breg3 = 0x73,
-  DW_OP_breg4 = 0x74,
-  DW_OP_breg5 = 0x75,
-  DW_OP_breg6 = 0x76,
-  DW_OP_breg7 = 0x77,
-  DW_OP_breg8 = 0x78,
-  DW_OP_breg9 = 0x79,
-  DW_OP_breg10 = 0x7a,
-  DW_OP_breg11 = 0x7b,
-  DW_OP_breg12 = 0x7c,
-  DW_OP_breg13 = 0x7d,
-  DW_OP_breg14 = 0x7e,
-  DW_OP_breg15 = 0x7f,
-  DW_OP_breg16 = 0x80,
-  DW_OP_breg17 = 0x81,
-  DW_OP_breg18 = 0x82,
-  DW_OP_breg19 = 0x83,
-  DW_OP_breg20 = 0x84,
-  DW_OP_breg21 = 0x85,
-  DW_OP_breg22 = 0x86,
-  DW_OP_breg23 = 0x87,
-  DW_OP_breg24 = 0x88,
-  DW_OP_breg25 = 0x89,
-  DW_OP_breg26 = 0x8a,
-  DW_OP_breg27 = 0x8b,
-  DW_OP_breg28 = 0x8c,
-  DW_OP_breg29 = 0x8d,
-  DW_OP_breg30 = 0x8e,
-  DW_OP_breg31 = 0x8f,
-  DW_OP_regx = 0x90,
-  DW_OP_fbreg = 0x91,
-  DW_OP_bregx = 0x92,
-  DW_OP_piece = 0x93,
-  DW_OP_deref_size = 0x94,
-  DW_OP_xderef_size = 0x95,
-  DW_OP_nop = 0x96,
-  DW_OP_push_object_address = 0x97,
-  DW_OP_call2 = 0x98,
-  DW_OP_call4 = 0x99,
-  DW_OP_call_ref = 0x9a,
-  DW_OP_form_tls_address = 0x9b,
-  DW_OP_call_frame_cfa = 0x9c,
-  DW_OP_bit_piece = 0x9d,
-  DW_OP_implicit_value = 0x9e,
-  DW_OP_stack_value = 0x9f,
+#define HANDLE_DW_OP(ID, NAME) DW_OP_##NAME = ID,
+#include "llvm/Support/Dwarf.def"
   DW_OP_lo_user = 0xe0,
-  DW_OP_hi_user = 0xff,
-
-  // Extensions for GNU-style thread-local storage.
-  DW_OP_GNU_push_tls_address = 0xe0,
-
-  // Extensions for Fission proposal.
-  DW_OP_GNU_addr_index = 0xfb,
-  DW_OP_GNU_const_index = 0xfc
+  DW_OP_hi_user = 0xff
 };
 
 enum TypeKind {
-  // Encoding attribute values
-  DW_ATE_address = 0x01,
-  DW_ATE_boolean = 0x02,
-  DW_ATE_complex_float = 0x03,
-  DW_ATE_float = 0x04,
-  DW_ATE_signed = 0x05,
-  DW_ATE_signed_char = 0x06,
-  DW_ATE_unsigned = 0x07,
-  DW_ATE_unsigned_char = 0x08,
-  DW_ATE_imaginary_float = 0x09,
-  DW_ATE_packed_decimal = 0x0a,
-  DW_ATE_numeric_string = 0x0b,
-  DW_ATE_edited = 0x0c,
-  DW_ATE_signed_fixed = 0x0d,
-  DW_ATE_unsigned_fixed = 0x0e,
-  DW_ATE_decimal_float = 0x0f,
-  DW_ATE_UTF = 0x10,
+#define HANDLE_DW_ATE(ID, NAME) DW_ATE_##NAME = ID,
+#include "llvm/Support/Dwarf.def"
   DW_ATE_lo_user = 0x80,
   DW_ATE_hi_user = 0xff
 };
@@ -584,45 +335,15 @@ enum VisibilityAttribute {
 };
 
 enum VirtualityAttribute {
-  // Virtuality codes
-  DW_VIRTUALITY_none = 0x00,
-  DW_VIRTUALITY_virtual = 0x01,
-  DW_VIRTUALITY_pure_virtual = 0x02
+#define HANDLE_DW_VIRTUALITY(ID, NAME) DW_VIRTUALITY_##NAME = ID,
+#include "llvm/Support/Dwarf.def"
+  DW_VIRTUALITY_max = 0x02
 };
 
 enum SourceLanguage {
-  // Language names
-  DW_LANG_C89 = 0x0001,
-  DW_LANG_C = 0x0002,
-  DW_LANG_Ada83 = 0x0003,
-  DW_LANG_C_plus_plus = 0x0004,
-  DW_LANG_Cobol74 = 0x0005,
-  DW_LANG_Cobol85 = 0x0006,
-  DW_LANG_Fortran77 = 0x0007,
-  DW_LANG_Fortran90 = 0x0008,
-  DW_LANG_Pascal83 = 0x0009,
-  DW_LANG_Modula2 = 0x000a,
-  DW_LANG_Java = 0x000b,
-  DW_LANG_C99 = 0x000c,
-  DW_LANG_Ada95 = 0x000d,
-  DW_LANG_Fortran95 = 0x000e,
-  DW_LANG_PLI = 0x000f,
-  DW_LANG_ObjC = 0x0010,
-  DW_LANG_ObjC_plus_plus = 0x0011,
-  DW_LANG_UPC = 0x0012,
-  DW_LANG_D = 0x0013,
-  // New in DWARF 5:
-  DW_LANG_Python = 0x0014,
-  DW_LANG_OpenCL = 0x0015,
-  DW_LANG_Go = 0x0016,
-  DW_LANG_Modula3 = 0x0017,
-  DW_LANG_Haskell = 0x0018,
-  DW_LANG_C_plus_plus_03 = 0x0019,
-  DW_LANG_C_plus_plus_11 = 0x001a,
-  DW_LANG_OCaml = 0x001b,
-
+#define HANDLE_DW_LANG(ID, NAME) DW_LANG_##NAME = ID,
+#include "llvm/Support/Dwarf.def"
   DW_LANG_lo_user = 0x8000,
-  DW_LANG_Mips_Assembler = 0x8001,
   DW_LANG_hi_user = 0xffff
 };
 
@@ -859,6 +580,22 @@ const char *GDBIndexEntryKindString(GDBIndexEntryKind Kind);
 const char *GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage);
 /// @}
 
+/// \defgroup DwarfConstantsParsing Dwarf constants parsing functions
+///
+/// These functions map their strings back to the corresponding enumeration
+/// value or return 0 if there is none, except for these exceptions:
+///
+/// \li \a getTag() returns \a DW_TAG_invalid on invalid input.
+/// \li \a getVirtuality() returns \a DW_VIRTUALITY_invalid on invalid input.
+///
+/// @{
+unsigned getTag(StringRef TagString);
+unsigned getOperationEncoding(StringRef OperationEncodingString);
+unsigned getVirtuality(StringRef VirtualityString);
+unsigned getLanguage(StringRef LanguageString);
+unsigned getAttributeEncoding(StringRef EncodingString);
+/// @}
+
 /// \brief Returns the symbolic string representing Val when used as a value
 /// for attribute Attr.
 const char *AttributeValueString(uint16_t Attr, unsigned Val);
diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h
index 5f78cc2..fc6f314 100644
--- a/include/llvm/Support/ELF.h
+++ b/include/llvm/Support/ELF.h
@@ -350,152 +350,21 @@ enum {
   ELFOSABI_STANDALONE = 255   // Standalone (embedded) application
 };
 
+#define ELF_RELOC(name, value) name = value,
+
 // X86_64 relocations.
 enum {
-  R_X86_64_NONE       = 0,
-  R_X86_64_64         = 1,
-  R_X86_64_PC32       = 2,
-  R_X86_64_GOT32      = 3,
-  R_X86_64_PLT32      = 4,
-  R_X86_64_COPY       = 5,
-  R_X86_64_GLOB_DAT   = 6,
-  R_X86_64_JUMP_SLOT  = 7,
-  R_X86_64_RELATIVE   = 8,
-  R_X86_64_GOTPCREL   = 9,
-  R_X86_64_32         = 10,
-  R_X86_64_32S        = 11,
-  R_X86_64_16         = 12,
-  R_X86_64_PC16       = 13,
-  R_X86_64_8          = 14,
-  R_X86_64_PC8        = 15,
-  R_X86_64_DTPMOD64   = 16,
-  R_X86_64_DTPOFF64   = 17,
-  R_X86_64_TPOFF64    = 18,
-  R_X86_64_TLSGD      = 19,
-  R_X86_64_TLSLD      = 20,
-  R_X86_64_DTPOFF32   = 21,
-  R_X86_64_GOTTPOFF   = 22,
-  R_X86_64_TPOFF32    = 23,
-  R_X86_64_PC64       = 24,
-  R_X86_64_GOTOFF64   = 25,
-  R_X86_64_GOTPC32    = 26,
-  R_X86_64_GOT64      = 27,
-  R_X86_64_GOTPCREL64 = 28,
-  R_X86_64_GOTPC64    = 29,
-  R_X86_64_GOTPLT64   = 30,
-  R_X86_64_PLTOFF64   = 31,
-  R_X86_64_SIZE32     = 32,
-  R_X86_64_SIZE64     = 33,
-  R_X86_64_GOTPC32_TLSDESC = 34,
-  R_X86_64_TLSDESC_CALL    = 35,
-  R_X86_64_TLSDESC    = 36,
-  R_X86_64_IRELATIVE  = 37
+#include "ELFRelocs/x86_64.def"
 };
 
 // i386 relocations.
-// TODO: this is just a subset
 enum {
-  R_386_NONE          = 0,
-  R_386_32            = 1,
-  R_386_PC32          = 2,
-  R_386_GOT32         = 3,
-  R_386_PLT32         = 4,
-  R_386_COPY          = 5,
-  R_386_GLOB_DAT      = 6,
-  R_386_JUMP_SLOT     = 7,
-  R_386_RELATIVE      = 8,
-  R_386_GOTOFF        = 9,
-  R_386_GOTPC         = 10,
-  R_386_32PLT         = 11,
-  R_386_TLS_TPOFF     = 14,
-  R_386_TLS_IE        = 15,
-  R_386_TLS_GOTIE     = 16,
-  R_386_TLS_LE        = 17,
-  R_386_TLS_GD        = 18,
-  R_386_TLS_LDM       = 19,
-  R_386_16            = 20,
-  R_386_PC16          = 21,
-  R_386_8             = 22,
-  R_386_PC8           = 23,
-  R_386_TLS_GD_32     = 24,
-  R_386_TLS_GD_PUSH   = 25,
-  R_386_TLS_GD_CALL   = 26,
-  R_386_TLS_GD_POP    = 27,
-  R_386_TLS_LDM_32    = 28,
-  R_386_TLS_LDM_PUSH  = 29,
-  R_386_TLS_LDM_CALL  = 30,
-  R_386_TLS_LDM_POP   = 31,
-  R_386_TLS_LDO_32    = 32,
-  R_386_TLS_IE_32     = 33,
-  R_386_TLS_LE_32     = 34,
-  R_386_TLS_DTPMOD32  = 35,
-  R_386_TLS_DTPOFF32  = 36,
-  R_386_TLS_TPOFF32   = 37,
-  R_386_TLS_GOTDESC   = 39,
-  R_386_TLS_DESC_CALL = 40,
-  R_386_TLS_DESC      = 41,
-  R_386_IRELATIVE     = 42,
-  R_386_NUM           = 43
+#include "ELFRelocs/i386.def"
 };
 
 // ELF Relocation types for PPC32
 enum {
-  R_PPC_NONE                  = 0,      /* No relocation. */
-  R_PPC_ADDR32                = 1,
-  R_PPC_ADDR24                = 2,
-  R_PPC_ADDR16                = 3,
-  R_PPC_ADDR16_LO             = 4,
-  R_PPC_ADDR16_HI             = 5,
-  R_PPC_ADDR16_HA             = 6,
-  R_PPC_ADDR14                = 7,
-  R_PPC_ADDR14_BRTAKEN        = 8,
-  R_PPC_ADDR14_BRNTAKEN       = 9,
-  R_PPC_REL24                 = 10,
-  R_PPC_REL14                 = 11,
-  R_PPC_REL14_BRTAKEN         = 12,
-  R_PPC_REL14_BRNTAKEN        = 13,
-  R_PPC_GOT16                 = 14,
-  R_PPC_GOT16_LO              = 15,
-  R_PPC_GOT16_HI              = 16,
-  R_PPC_GOT16_HA              = 17,
-  R_PPC_PLTREL24              = 18,
-  R_PPC_JMP_SLOT              = 21,
-  R_PPC_LOCAL24PC             = 23,
-  R_PPC_REL32                 = 26,
-  R_PPC_TLS                   = 67,
-  R_PPC_DTPMOD32              = 68,
-  R_PPC_TPREL16               = 69,
-  R_PPC_TPREL16_LO            = 70,
-  R_PPC_TPREL16_HI            = 71,
-  R_PPC_TPREL16_HA            = 72,
-  R_PPC_TPREL32               = 73,
-  R_PPC_DTPREL16              = 74,
-  R_PPC_DTPREL16_LO           = 75,
-  R_PPC_DTPREL16_HI           = 76,
-  R_PPC_DTPREL16_HA           = 77,
-  R_PPC_DTPREL32              = 78,
-  R_PPC_GOT_TLSGD16           = 79,
-  R_PPC_GOT_TLSGD16_LO        = 80,
-  R_PPC_GOT_TLSGD16_HI        = 81,
-  R_PPC_GOT_TLSGD16_HA        = 82,
-  R_PPC_GOT_TLSLD16           = 83,
-  R_PPC_GOT_TLSLD16_LO        = 84,
-  R_PPC_GOT_TLSLD16_HI        = 85,
-  R_PPC_GOT_TLSLD16_HA        = 86,
-  R_PPC_GOT_TPREL16           = 87,
-  R_PPC_GOT_TPREL16_LO        = 88,
-  R_PPC_GOT_TPREL16_HI        = 89,
-  R_PPC_GOT_TPREL16_HA        = 90,
-  R_PPC_GOT_DTPREL16          = 91,
-  R_PPC_GOT_DTPREL16_LO       = 92,
-  R_PPC_GOT_DTPREL16_HI       = 93,
-  R_PPC_GOT_DTPREL16_HA       = 94,
-  R_PPC_TLSGD                 = 95,
-  R_PPC_TLSLD                 = 96,
-  R_PPC_REL16                 = 249,
-  R_PPC_REL16_LO              = 250,
-  R_PPC_REL16_HI              = 251,
-  R_PPC_REL16_HA              = 252
+#include "ELFRelocs/PowerPC.def"
 };
 
 // Specific e_flags for PPC64
@@ -531,192 +400,12 @@ encodePPC64LocalEntryOffset(int64_t Offset) {
 
 // ELF Relocation types for PPC64
 enum {
-  R_PPC64_NONE                = 0,
-  R_PPC64_ADDR32              = 1,
-  R_PPC64_ADDR24              = 2,
-  R_PPC64_ADDR16              = 3,
-  R_PPC64_ADDR16_LO           = 4,
-  R_PPC64_ADDR16_HI           = 5,
-  R_PPC64_ADDR16_HA           = 6,
-  R_PPC64_ADDR14              = 7,
-  R_PPC64_ADDR14_BRTAKEN      = 8,
-  R_PPC64_ADDR14_BRNTAKEN     = 9,
-  R_PPC64_REL24               = 10,
-  R_PPC64_REL14               = 11,
-  R_PPC64_REL14_BRTAKEN       = 12,
-  R_PPC64_REL14_BRNTAKEN      = 13,
-  R_PPC64_GOT16               = 14,
-  R_PPC64_GOT16_LO            = 15,
-  R_PPC64_GOT16_HI            = 16,
-  R_PPC64_GOT16_HA            = 17,
-  R_PPC64_JMP_SLOT            = 21,
-  R_PPC64_REL32               = 26,
-  R_PPC64_ADDR64              = 38,
-  R_PPC64_ADDR16_HIGHER       = 39,
-  R_PPC64_ADDR16_HIGHERA      = 40,
-  R_PPC64_ADDR16_HIGHEST      = 41,
-  R_PPC64_ADDR16_HIGHESTA     = 42,
-  R_PPC64_REL64               = 44,
-  R_PPC64_TOC16               = 47,
-  R_PPC64_TOC16_LO            = 48,
-  R_PPC64_TOC16_HI            = 49,
-  R_PPC64_TOC16_HA            = 50,
-  R_PPC64_TOC                 = 51,
-  R_PPC64_ADDR16_DS           = 56,
-  R_PPC64_ADDR16_LO_DS        = 57,
-  R_PPC64_GOT16_DS            = 58,
-  R_PPC64_GOT16_LO_DS         = 59,
-  R_PPC64_TOC16_DS            = 63,
-  R_PPC64_TOC16_LO_DS         = 64,
-  R_PPC64_TLS                 = 67,
-  R_PPC64_DTPMOD64            = 68,
-  R_PPC64_TPREL16             = 69,
-  R_PPC64_TPREL16_LO          = 70,
-  R_PPC64_TPREL16_HI          = 71,
-  R_PPC64_TPREL16_HA          = 72,
-  R_PPC64_TPREL64             = 73,
-  R_PPC64_DTPREL16            = 74,
-  R_PPC64_DTPREL16_LO         = 75,
-  R_PPC64_DTPREL16_HI         = 76,
-  R_PPC64_DTPREL16_HA         = 77,
-  R_PPC64_DTPREL64            = 78,
-  R_PPC64_GOT_TLSGD16         = 79,
-  R_PPC64_GOT_TLSGD16_LO      = 80,
-  R_PPC64_GOT_TLSGD16_HI      = 81,
-  R_PPC64_GOT_TLSGD16_HA      = 82,
-  R_PPC64_GOT_TLSLD16         = 83,
-  R_PPC64_GOT_TLSLD16_LO      = 84,
-  R_PPC64_GOT_TLSLD16_HI      = 85,
-  R_PPC64_GOT_TLSLD16_HA      = 86,
-  R_PPC64_GOT_TPREL16_DS      = 87,
-  R_PPC64_GOT_TPREL16_LO_DS   = 88,
-  R_PPC64_GOT_TPREL16_HI      = 89,
-  R_PPC64_GOT_TPREL16_HA      = 90,
-  R_PPC64_GOT_DTPREL16_DS     = 91,
-  R_PPC64_GOT_DTPREL16_LO_DS  = 92,
-  R_PPC64_GOT_DTPREL16_HI     = 93,
-  R_PPC64_GOT_DTPREL16_HA     = 94,
-  R_PPC64_TPREL16_DS          = 95,
-  R_PPC64_TPREL16_LO_DS       = 96,
-  R_PPC64_TPREL16_HIGHER      = 97,
-  R_PPC64_TPREL16_HIGHERA     = 98,
-  R_PPC64_TPREL16_HIGHEST     = 99,
-  R_PPC64_TPREL16_HIGHESTA    = 100,
-  R_PPC64_DTPREL16_DS         = 101,
-  R_PPC64_DTPREL16_LO_DS      = 102,
-  R_PPC64_DTPREL16_HIGHER     = 103,
-  R_PPC64_DTPREL16_HIGHERA    = 104,
-  R_PPC64_DTPREL16_HIGHEST    = 105,
-  R_PPC64_DTPREL16_HIGHESTA   = 106,
-  R_PPC64_TLSGD               = 107,
-  R_PPC64_TLSLD               = 108,
-  R_PPC64_REL16               = 249,
-  R_PPC64_REL16_LO            = 250,
-  R_PPC64_REL16_HI            = 251,
-  R_PPC64_REL16_HA            = 252
+#include "ELFRelocs/PowerPC64.def"
 };
 
 // ELF Relocation types for AArch64
-
 enum {
-  R_AARCH64_NONE                        = 0x100,
-
-  R_AARCH64_ABS64                       = 0x101,
-  R_AARCH64_ABS32                       = 0x102,
-  R_AARCH64_ABS16                       = 0x103,
-  R_AARCH64_PREL64                      = 0x104,
-  R_AARCH64_PREL32                      = 0x105,
-  R_AARCH64_PREL16                      = 0x106,
-
-  R_AARCH64_MOVW_UABS_G0                = 0x107,
-  R_AARCH64_MOVW_UABS_G0_NC             = 0x108,
-  R_AARCH64_MOVW_UABS_G1                = 0x109,
-  R_AARCH64_MOVW_UABS_G1_NC             = 0x10a,
-  R_AARCH64_MOVW_UABS_G2                = 0x10b,
-  R_AARCH64_MOVW_UABS_G2_NC             = 0x10c,
-  R_AARCH64_MOVW_UABS_G3                = 0x10d,
-  R_AARCH64_MOVW_SABS_G0                = 0x10e,
-  R_AARCH64_MOVW_SABS_G1                = 0x10f,
-  R_AARCH64_MOVW_SABS_G2                = 0x110,
-
-  R_AARCH64_LD_PREL_LO19                = 0x111,
-  R_AARCH64_ADR_PREL_LO21               = 0x112,
-  R_AARCH64_ADR_PREL_PG_HI21            = 0x113,
-  R_AARCH64_ADD_ABS_LO12_NC             = 0x115,
-  R_AARCH64_LDST8_ABS_LO12_NC           = 0x116,
-
-  R_AARCH64_TSTBR14                     = 0x117,
-  R_AARCH64_CONDBR19                    = 0x118,
-  R_AARCH64_JUMP26                      = 0x11a,
-  R_AARCH64_CALL26                      = 0x11b,
-
-  R_AARCH64_LDST16_ABS_LO12_NC          = 0x11c,
-  R_AARCH64_LDST32_ABS_LO12_NC          = 0x11d,
-  R_AARCH64_LDST64_ABS_LO12_NC          = 0x11e,
-
-  R_AARCH64_LDST128_ABS_LO12_NC         = 0x12b,
-
-  R_AARCH64_GOTREL64                    = 0x133,
-  R_AARCH64_GOTREL32                    = 0x134,
-
-  R_AARCH64_ADR_GOT_PAGE                = 0x137,
-  R_AARCH64_LD64_GOT_LO12_NC            = 0x138,
-
-  R_AARCH64_TLSLD_MOVW_DTPREL_G2        = 0x20b,
-  R_AARCH64_TLSLD_MOVW_DTPREL_G1        = 0x20c,
-  R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC     = 0x20d,
-  R_AARCH64_TLSLD_MOVW_DTPREL_G0        = 0x20e,
-  R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC     = 0x20f,
-  R_AARCH64_TLSLD_ADD_DTPREL_HI12       = 0x210,
-  R_AARCH64_TLSLD_ADD_DTPREL_LO12       = 0x211,
-  R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC    = 0x212,
-  R_AARCH64_TLSLD_LDST8_DTPREL_LO12     = 0x213,
-  R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC  = 0x214,
-  R_AARCH64_TLSLD_LDST16_DTPREL_LO12    = 0x215,
-  R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC = 0x216,
-  R_AARCH64_TLSLD_LDST32_DTPREL_LO12    = 0x217,
-  R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC = 0x218,
-  R_AARCH64_TLSLD_LDST64_DTPREL_LO12    = 0x219,
-  R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC = 0x21a,
-
-  R_AARCH64_TLSIE_MOVW_GOTTPREL_G1      = 0x21b,
-  R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC   = 0x21c,
-  R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21   = 0x21d,
-  R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC = 0x21e,
-  R_AARCH64_TLSIE_LD_GOTTPREL_PREL19    = 0x21f,
-
-  R_AARCH64_TLSLE_MOVW_TPREL_G2         = 0x220,
-  R_AARCH64_TLSLE_MOVW_TPREL_G1         = 0x221,
-  R_AARCH64_TLSLE_MOVW_TPREL_G1_NC      = 0x222,
-  R_AARCH64_TLSLE_MOVW_TPREL_G0         = 0x223,
-  R_AARCH64_TLSLE_MOVW_TPREL_G0_NC      = 0x224,
-  R_AARCH64_TLSLE_ADD_TPREL_HI12        = 0x225,
-  R_AARCH64_TLSLE_ADD_TPREL_LO12        = 0x226,
-  R_AARCH64_TLSLE_ADD_TPREL_LO12_NC     = 0x227,
-  R_AARCH64_TLSLE_LDST8_TPREL_LO12      = 0x228,
-  R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC   = 0x229,
-  R_AARCH64_TLSLE_LDST16_TPREL_LO12     = 0x22a,
-  R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC  = 0x22b,
-  R_AARCH64_TLSLE_LDST32_TPREL_LO12     = 0x22c,
-  R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC  = 0x22d,
-  R_AARCH64_TLSLE_LDST64_TPREL_LO12     = 0x22e,
-  R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC  = 0x22f,
-
-  R_AARCH64_TLSDESC_ADR_PAGE            = 0x232,
-  R_AARCH64_TLSDESC_LD64_LO12_NC        = 0x233,
-  R_AARCH64_TLSDESC_ADD_LO12_NC         = 0x234,
-
-  R_AARCH64_TLSDESC_CALL                = 0x239,
-
-  R_AARCH64_COPY                        = 0x400,
-  R_AARCH64_GLOB_DAT                    = 0x401,
-  R_AARCH64_JUMP_SLOT                   = 0x402,
-  R_AARCH64_RELATIVE                    = 0x403,
-  R_AARCH64_TLS_DTPREL64                = 0x404,
-  R_AARCH64_TLS_DTPMOD64                = 0x405,
-  R_AARCH64_TLS_TPREL64                 = 0x406,
-  R_AARCH64_TLSDESC                     = 0x407,
-  R_AARCH64_IRELATIVE                   = 0x408
+#include "ELFRelocs/AArch64.def"
 };
 
 // ARM Specific e_flags
@@ -733,140 +422,8 @@ enum : unsigned {
 };
 
 // ELF Relocation types for ARM
-// Meets 2.08 ABI Specs.
-
 enum {
-  R_ARM_NONE                  = 0x00,
-  R_ARM_PC24                  = 0x01,
-  R_ARM_ABS32                 = 0x02,
-  R_ARM_REL32                 = 0x03,
-  R_ARM_LDR_PC_G0             = 0x04,
-  R_ARM_ABS16                 = 0x05,
-  R_ARM_ABS12                 = 0x06,
-  R_ARM_THM_ABS5              = 0x07,
-  R_ARM_ABS8                  = 0x08,
-  R_ARM_SBREL32               = 0x09,
-  R_ARM_THM_CALL              = 0x0a,
-  R_ARM_THM_PC8               = 0x0b,
-  R_ARM_BREL_ADJ              = 0x0c,
-  R_ARM_TLS_DESC              = 0x0d,
-  R_ARM_THM_SWI8              = 0x0e,
-  R_ARM_XPC25                 = 0x0f,
-  R_ARM_THM_XPC22             = 0x10,
-  R_ARM_TLS_DTPMOD32          = 0x11,
-  R_ARM_TLS_DTPOFF32          = 0x12,
-  R_ARM_TLS_TPOFF32           = 0x13,
-  R_ARM_COPY                  = 0x14,
-  R_ARM_GLOB_DAT              = 0x15,
-  R_ARM_JUMP_SLOT             = 0x16,
-  R_ARM_RELATIVE              = 0x17,
-  R_ARM_GOTOFF32              = 0x18,
-  R_ARM_BASE_PREL             = 0x19,
-  R_ARM_GOT_BREL              = 0x1a,
-  R_ARM_PLT32                 = 0x1b,
-  R_ARM_CALL                  = 0x1c,
-  R_ARM_JUMP24                = 0x1d,
-  R_ARM_THM_JUMP24            = 0x1e,
-  R_ARM_BASE_ABS              = 0x1f,
-  R_ARM_ALU_PCREL_7_0         = 0x20,
-  R_ARM_ALU_PCREL_15_8        = 0x21,
-  R_ARM_ALU_PCREL_23_15       = 0x22,
-  R_ARM_LDR_SBREL_11_0_NC     = 0x23,
-  R_ARM_ALU_SBREL_19_12_NC    = 0x24,
-  R_ARM_ALU_SBREL_27_20_CK    = 0x25,
-  R_ARM_TARGET1               = 0x26,
-  R_ARM_SBREL31               = 0x27,
-  R_ARM_V4BX                  = 0x28,
-  R_ARM_TARGET2               = 0x29,
-  R_ARM_PREL31                = 0x2a,
-  R_ARM_MOVW_ABS_NC           = 0x2b,
-  R_ARM_MOVT_ABS              = 0x2c,
-  R_ARM_MOVW_PREL_NC          = 0x2d,
-  R_ARM_MOVT_PREL             = 0x2e,
-  R_ARM_THM_MOVW_ABS_NC       = 0x2f,
-  R_ARM_THM_MOVT_ABS          = 0x30,
-  R_ARM_THM_MOVW_PREL_NC      = 0x31,
-  R_ARM_THM_MOVT_PREL         = 0x32,
-  R_ARM_THM_JUMP19            = 0x33,
-  R_ARM_THM_JUMP6             = 0x34,
-  R_ARM_THM_ALU_PREL_11_0     = 0x35,
-  R_ARM_THM_PC12              = 0x36,
-  R_ARM_ABS32_NOI             = 0x37,
-  R_ARM_REL32_NOI             = 0x38,
-  R_ARM_ALU_PC_G0_NC          = 0x39,
-  R_ARM_ALU_PC_G0             = 0x3a,
-  R_ARM_ALU_PC_G1_NC          = 0x3b,
-  R_ARM_ALU_PC_G1             = 0x3c,
-  R_ARM_ALU_PC_G2             = 0x3d,
-  R_ARM_LDR_PC_G1             = 0x3e,
-  R_ARM_LDR_PC_G2             = 0x3f,
-  R_ARM_LDRS_PC_G0            = 0x40,
-  R_ARM_LDRS_PC_G1            = 0x41,
-  R_ARM_LDRS_PC_G2            = 0x42,
-  R_ARM_LDC_PC_G0             = 0x43,
-  R_ARM_LDC_PC_G1             = 0x44,
-  R_ARM_LDC_PC_G2             = 0x45,
-  R_ARM_ALU_SB_G0_NC          = 0x46,
-  R_ARM_ALU_SB_G0             = 0x47,
-  R_ARM_ALU_SB_G1_NC          = 0x48,
-  R_ARM_ALU_SB_G1             = 0x49,
-  R_ARM_ALU_SB_G2             = 0x4a,
-  R_ARM_LDR_SB_G0             = 0x4b,
-  R_ARM_LDR_SB_G1             = 0x4c,
-  R_ARM_LDR_SB_G2             = 0x4d,
-  R_ARM_LDRS_SB_G0            = 0x4e,
-  R_ARM_LDRS_SB_G1            = 0x4f,
-  R_ARM_LDRS_SB_G2            = 0x50,
-  R_ARM_LDC_SB_G0             = 0x51,
-  R_ARM_LDC_SB_G1             = 0x52,
-  R_ARM_LDC_SB_G2             = 0x53,
-  R_ARM_MOVW_BREL_NC          = 0x54,
-  R_ARM_MOVT_BREL             = 0x55,
-  R_ARM_MOVW_BREL             = 0x56,
-  R_ARM_THM_MOVW_BREL_NC      = 0x57,
-  R_ARM_THM_MOVT_BREL         = 0x58,
-  R_ARM_THM_MOVW_BREL         = 0x59,
-  R_ARM_TLS_GOTDESC           = 0x5a,
-  R_ARM_TLS_CALL              = 0x5b,
-  R_ARM_TLS_DESCSEQ           = 0x5c,
-  R_ARM_THM_TLS_CALL          = 0x5d,
-  R_ARM_PLT32_ABS             = 0x5e,
-  R_ARM_GOT_ABS               = 0x5f,
-  R_ARM_GOT_PREL              = 0x60,
-  R_ARM_GOT_BREL12            = 0x61,
-  R_ARM_GOTOFF12              = 0x62,
-  R_ARM_GOTRELAX              = 0x63,
-  R_ARM_GNU_VTENTRY           = 0x64,
-  R_ARM_GNU_VTINHERIT         = 0x65,
-  R_ARM_THM_JUMP11            = 0x66,
-  R_ARM_THM_JUMP8             = 0x67,
-  R_ARM_TLS_GD32              = 0x68,
-  R_ARM_TLS_LDM32             = 0x69,
-  R_ARM_TLS_LDO32             = 0x6a,
-  R_ARM_TLS_IE32              = 0x6b,
-  R_ARM_TLS_LE32              = 0x6c,
-  R_ARM_TLS_LDO12             = 0x6d,
-  R_ARM_TLS_LE12              = 0x6e,
-  R_ARM_TLS_IE12GP            = 0x6f,
-  R_ARM_PRIVATE_0             = 0x70,
-  R_ARM_PRIVATE_1             = 0x71,
-  R_ARM_PRIVATE_2             = 0x72,
-  R_ARM_PRIVATE_3             = 0x73,
-  R_ARM_PRIVATE_4             = 0x74,
-  R_ARM_PRIVATE_5             = 0x75,
-  R_ARM_PRIVATE_6             = 0x76,
-  R_ARM_PRIVATE_7             = 0x77,
-  R_ARM_PRIVATE_8             = 0x78,
-  R_ARM_PRIVATE_9             = 0x79,
-  R_ARM_PRIVATE_10            = 0x7a,
-  R_ARM_PRIVATE_11            = 0x7b,
-  R_ARM_PRIVATE_12            = 0x7c,
-  R_ARM_PRIVATE_13            = 0x7d,
-  R_ARM_PRIVATE_14            = 0x7e,
-  R_ARM_PRIVATE_15            = 0x7f,
-  R_ARM_ME_TOO                = 0x80,
-  R_ARM_THM_TLS_DESCSEQ16     = 0x81,
-  R_ARM_THM_TLS_DESCSEQ32     = 0x82
+#include "ELFRelocs/ARM.def"
 };
 
 // Mips Specific e_flags
@@ -897,8 +454,8 @@ enum : unsigned {
   EF_MIPS_ARCH_5    = 0x40000000, // MIPS5 instruction set
   EF_MIPS_ARCH_32   = 0x50000000, // MIPS32 instruction set per linux not elf.h
   EF_MIPS_ARCH_64   = 0x60000000, // MIPS64 instruction set per linux not elf.h
-  EF_MIPS_ARCH_32R2 = 0x70000000, // mips32r2
-  EF_MIPS_ARCH_64R2 = 0x80000000, // mips64r2
+  EF_MIPS_ARCH_32R2 = 0x70000000, // mips32r2, mips32r3, mips32r5
+  EF_MIPS_ARCH_64R2 = 0x80000000, // mips64r2, mips64r3, mips64r5
   EF_MIPS_ARCH_32R6 = 0x90000000, // mips32r6
   EF_MIPS_ARCH_64R6 = 0xa0000000, // mips64r6
   EF_MIPS_ARCH      = 0xf0000000  // Mask for applying EF_MIPS_ARCH_ variant
@@ -906,85 +463,7 @@ enum : unsigned {
 
 // ELF Relocation types for Mips
 enum {
-  R_MIPS_NONE              =  0,
-  R_MIPS_16                =  1,
-  R_MIPS_32                =  2,
-  R_MIPS_REL32             =  3,
-  R_MIPS_26                =  4,
-  R_MIPS_HI16              =  5,
-  R_MIPS_LO16              =  6,
-  R_MIPS_GPREL16           =  7,
-  R_MIPS_LITERAL           =  8,
-  R_MIPS_GOT16             =  9,
-  R_MIPS_PC16              = 10,
-  R_MIPS_CALL16            = 11,
-  R_MIPS_GPREL32           = 12,
-  R_MIPS_UNUSED1           = 13,
-  R_MIPS_UNUSED2           = 14,
-  R_MIPS_SHIFT5            = 16,
-  R_MIPS_SHIFT6            = 17,
-  R_MIPS_64                = 18,
-  R_MIPS_GOT_DISP          = 19,
-  R_MIPS_GOT_PAGE          = 20,
-  R_MIPS_GOT_OFST          = 21,
-  R_MIPS_GOT_HI16          = 22,
-  R_MIPS_GOT_LO16          = 23,
-  R_MIPS_SUB               = 24,
-  R_MIPS_INSERT_A          = 25,
-  R_MIPS_INSERT_B          = 26,
-  R_MIPS_DELETE            = 27,
-  R_MIPS_HIGHER            = 28,
-  R_MIPS_HIGHEST           = 29,
-  R_MIPS_CALL_HI16         = 30,
-  R_MIPS_CALL_LO16         = 31,
-  R_MIPS_SCN_DISP          = 32,
-  R_MIPS_REL16             = 33,
-  R_MIPS_ADD_IMMEDIATE     = 34,
-  R_MIPS_PJUMP             = 35,
-  R_MIPS_RELGOT            = 36,
-  R_MIPS_JALR              = 37,
-  R_MIPS_TLS_DTPMOD32      = 38,
-  R_MIPS_TLS_DTPREL32      = 39,
-  R_MIPS_TLS_DTPMOD64      = 40,
-  R_MIPS_TLS_DTPREL64      = 41,
-  R_MIPS_TLS_GD            = 42,
-  R_MIPS_TLS_LDM           = 43,
-  R_MIPS_TLS_DTPREL_HI16   = 44,
-  R_MIPS_TLS_DTPREL_LO16   = 45,
-  R_MIPS_TLS_GOTTPREL      = 46,
-  R_MIPS_TLS_TPREL32       = 47,
-  R_MIPS_TLS_TPREL64       = 48,
-  R_MIPS_TLS_TPREL_HI16    = 49,
-  R_MIPS_TLS_TPREL_LO16    = 50,
-  R_MIPS_GLOB_DAT          = 51,
-  R_MIPS_PC21_S2           = 60,
-  R_MIPS_PC26_S2           = 61,
-  R_MIPS_PC18_S3           = 62,
-  R_MIPS_PC19_S2           = 63,
-  R_MIPS_PCHI16            = 64,
-  R_MIPS_PCLO16            = 65,
-  R_MIPS16_GOT16           = 102,
-  R_MIPS16_HI16            = 104,
-  R_MIPS16_LO16            = 105,
-  R_MIPS_COPY              = 126,
-  R_MIPS_JUMP_SLOT         = 127,
-  R_MICROMIPS_26_S1        = 133,
-  R_MICROMIPS_HI16         = 134,
-  R_MICROMIPS_LO16         = 135,
-  R_MICROMIPS_GOT16        = 138,
-  R_MICROMIPS_PC16_S1      = 141,
-  R_MICROMIPS_CALL16       = 142,
-  R_MICROMIPS_GOT_DISP     = 145,
-  R_MICROMIPS_GOT_PAGE     = 146,
-  R_MICROMIPS_GOT_OFST     = 147,
-  R_MICROMIPS_TLS_GD          = 162,
-  R_MICROMIPS_TLS_LDM         = 163,
-  R_MICROMIPS_TLS_DTPREL_HI16 = 164,
-  R_MICROMIPS_TLS_DTPREL_LO16 = 165,
-  R_MICROMIPS_TLS_TPREL_HI16  = 169,
-  R_MICROMIPS_TLS_TPREL_LO16  = 170,
-  R_MIPS_NUM               = 218,
-  R_MIPS_PC32              = 248
+#include "ELFRelocs/Mips.def"
 };
 
 // Special values for the st_other field in the symbol table entry for MIPS.
@@ -996,6 +475,22 @@ enum {
   STO_MIPS_MIPS16          = 0xf0   // MIPS Specific ISA for Mips16
 };
 
+// .MIPS.options section descriptor kinds
+enum {
+  ODK_NULL       = 0,   // Undefined
+  ODK_REGINFO    = 1,   // Register usage information
+  ODK_EXCEPTIONS = 2,   // Exception processing options
+  ODK_PAD        = 3,   // Section padding options
+  ODK_HWPATCH    = 4,   // Hardware patches applied
+  ODK_FILL       = 5,   // Linker fill value
+  ODK_TAGS       = 6,   // Space for tool identification
+  ODK_HWAND      = 7,   // Hardware AND patches applied
+  ODK_HWOR       = 8,   // Hardware OR patches applied
+  ODK_GP_GROUP   = 9,   // GP group to use for text/data sections
+  ODK_IDENT      = 10,  // ID information
+  ODK_PAGESIZE   = 11   // Page size information
+};
+
 // Hexagon Specific e_flags
 // Release 5 ABI
 enum {
@@ -1025,250 +520,22 @@ enum {
 };
 
 // ELF Relocation types for Hexagon
-// Release 5 ABI
 enum {
-  R_HEX_NONE              =  0,
-  R_HEX_B22_PCREL         =  1,
-  R_HEX_B15_PCREL         =  2,
-  R_HEX_B7_PCREL          =  3,
-  R_HEX_LO16              =  4,
-  R_HEX_HI16              =  5,
-  R_HEX_32                =  6,
-  R_HEX_16                =  7,
-  R_HEX_8                 =  8,
-  R_HEX_GPREL16_0         =  9,
-  R_HEX_GPREL16_1         =  10,
-  R_HEX_GPREL16_2         =  11,
-  R_HEX_GPREL16_3         =  12,
-  R_HEX_HL16              =  13,
-  R_HEX_B13_PCREL         =  14,
-  R_HEX_B9_PCREL          =  15,
-  R_HEX_B32_PCREL_X       =  16,
-  R_HEX_32_6_X            =  17,
-  R_HEX_B22_PCREL_X       =  18,
-  R_HEX_B15_PCREL_X       =  19,
-  R_HEX_B13_PCREL_X       =  20,
-  R_HEX_B9_PCREL_X        =  21,
-  R_HEX_B7_PCREL_X        =  22,
-  R_HEX_16_X              =  23,
-  R_HEX_12_X              =  24,
-  R_HEX_11_X              =  25,
-  R_HEX_10_X              =  26,
-  R_HEX_9_X               =  27,
-  R_HEX_8_X               =  28,
-  R_HEX_7_X               =  29,
-  R_HEX_6_X               =  30,
-  R_HEX_32_PCREL          =  31,
-  R_HEX_COPY              =  32,
-  R_HEX_GLOB_DAT          =  33,
-  R_HEX_JMP_SLOT          =  34,
-  R_HEX_RELATIVE          =  35,
-  R_HEX_PLT_B22_PCREL     =  36,
-  R_HEX_GOTREL_LO16       =  37,
-  R_HEX_GOTREL_HI16       =  38,
-  R_HEX_GOTREL_32         =  39,
-  R_HEX_GOT_LO16          =  40,
-  R_HEX_GOT_HI16          =  41,
-  R_HEX_GOT_32            =  42,
-  R_HEX_GOT_16            =  43,
-  R_HEX_DTPMOD_32         =  44,
-  R_HEX_DTPREL_LO16       =  45,
-  R_HEX_DTPREL_HI16       =  46,
-  R_HEX_DTPREL_32         =  47,
-  R_HEX_DTPREL_16         =  48,
-  R_HEX_GD_PLT_B22_PCREL  =  49,
-  R_HEX_GD_GOT_LO16       =  50,
-  R_HEX_GD_GOT_HI16       =  51,
-  R_HEX_GD_GOT_32         =  52,
-  R_HEX_GD_GOT_16         =  53,
-  R_HEX_IE_LO16           =  54,
-  R_HEX_IE_HI16           =  55,
-  R_HEX_IE_32             =  56,
-  R_HEX_IE_GOT_LO16       =  57,
-  R_HEX_IE_GOT_HI16       =  58,
-  R_HEX_IE_GOT_32         =  59,
-  R_HEX_IE_GOT_16         =  60,
-  R_HEX_TPREL_LO16        =  61,
-  R_HEX_TPREL_HI16        =  62,
-  R_HEX_TPREL_32          =  63,
-  R_HEX_TPREL_16          =  64,
-  R_HEX_6_PCREL_X         =  65,
-  R_HEX_GOTREL_32_6_X     =  66,
-  R_HEX_GOTREL_16_X       =  67,
-  R_HEX_GOTREL_11_X       =  68,
-  R_HEX_GOT_32_6_X        =  69,
-  R_HEX_GOT_16_X          =  70,
-  R_HEX_GOT_11_X          =  71,
-  R_HEX_DTPREL_32_6_X     =  72,
-  R_HEX_DTPREL_16_X       =  73,
-  R_HEX_DTPREL_11_X       =  74,
-  R_HEX_GD_GOT_32_6_X     =  75,
-  R_HEX_GD_GOT_16_X       =  76,
-  R_HEX_GD_GOT_11_X       =  77,
-  R_HEX_IE_32_6_X         =  78,
-  R_HEX_IE_16_X           =  79,
-  R_HEX_IE_GOT_32_6_X     =  80,
-  R_HEX_IE_GOT_16_X       =  81,
-  R_HEX_IE_GOT_11_X       =  82,
-  R_HEX_TPREL_32_6_X      =  83,
-  R_HEX_TPREL_16_X        =  84,
-  R_HEX_TPREL_11_X        =  85
+#include "ELFRelocs/Hexagon.def"
 };
 
 // ELF Relocation types for S390/zSeries
 enum {
-  R_390_NONE        =  0,
-  R_390_8           =  1,
-  R_390_12          =  2,
-  R_390_16          =  3,
-  R_390_32          =  4,
-  R_390_PC32        =  5,
-  R_390_GOT12       =  6,
-  R_390_GOT32       =  7,
-  R_390_PLT32       =  8,
-  R_390_COPY        =  9,
-  R_390_GLOB_DAT    = 10,
-  R_390_JMP_SLOT    = 11,
-  R_390_RELATIVE    = 12,
-  R_390_GOTOFF      = 13,
-  R_390_GOTPC       = 14,
-  R_390_GOT16       = 15,
-  R_390_PC16        = 16,
-  R_390_PC16DBL     = 17,
-  R_390_PLT16DBL    = 18,
-  R_390_PC32DBL     = 19,
-  R_390_PLT32DBL    = 20,
-  R_390_GOTPCDBL    = 21,
-  R_390_64          = 22,
-  R_390_PC64        = 23,
-  R_390_GOT64       = 24,
-  R_390_PLT64       = 25,
-  R_390_GOTENT      = 26,
-  R_390_GOTOFF16    = 27,
-  R_390_GOTOFF64    = 28,
-  R_390_GOTPLT12    = 29,
-  R_390_GOTPLT16    = 30,
-  R_390_GOTPLT32    = 31,
-  R_390_GOTPLT64    = 32,
-  R_390_GOTPLTENT   = 33,
-  R_390_PLTOFF16    = 34,
-  R_390_PLTOFF32    = 35,
-  R_390_PLTOFF64    = 36,
-  R_390_TLS_LOAD    = 37,
-  R_390_TLS_GDCALL  = 38,
-  R_390_TLS_LDCALL  = 39,
-  R_390_TLS_GD32    = 40,
-  R_390_TLS_GD64    = 41,
-  R_390_TLS_GOTIE12 = 42,
-  R_390_TLS_GOTIE32 = 43,
-  R_390_TLS_GOTIE64 = 44,
-  R_390_TLS_LDM32   = 45,
-  R_390_TLS_LDM64   = 46,
-  R_390_TLS_IE32    = 47,
-  R_390_TLS_IE64    = 48,
-  R_390_TLS_IEENT   = 49,
-  R_390_TLS_LE32    = 50,
-  R_390_TLS_LE64    = 51,
-  R_390_TLS_LDO32   = 52,
-  R_390_TLS_LDO64   = 53,
-  R_390_TLS_DTPMOD  = 54,
-  R_390_TLS_DTPOFF  = 55,
-  R_390_TLS_TPOFF   = 56,
-  R_390_20          = 57,
-  R_390_GOT20       = 58,
-  R_390_GOTPLT20    = 59,
-  R_390_TLS_GOTIE20 = 60,
-  R_390_IRELATIVE   = 61
+#include "ELFRelocs/SystemZ.def"
 };
 
 // ELF Relocation type for Sparc.
 enum {
-  R_SPARC_NONE        = 0,
-  R_SPARC_8           = 1,
-  R_SPARC_16          = 2,
-  R_SPARC_32          = 3,
-  R_SPARC_DISP8       = 4,
-  R_SPARC_DISP16      = 5,
-  R_SPARC_DISP32      = 6,
-  R_SPARC_WDISP30     = 7,
-  R_SPARC_WDISP22     = 8,
-  R_SPARC_HI22        = 9,
-  R_SPARC_22          = 10,
-  R_SPARC_13          = 11,
-  R_SPARC_LO10        = 12,
-  R_SPARC_GOT10       = 13,
-  R_SPARC_GOT13       = 14,
-  R_SPARC_GOT22       = 15,
-  R_SPARC_PC10        = 16,
-  R_SPARC_PC22        = 17,
-  R_SPARC_WPLT30      = 18,
-  R_SPARC_COPY        = 19,
-  R_SPARC_GLOB_DAT    = 20,
-  R_SPARC_JMP_SLOT    = 21,
-  R_SPARC_RELATIVE    = 22,
-  R_SPARC_UA32        = 23,
-  R_SPARC_PLT32       = 24,
-  R_SPARC_HIPLT22     = 25,
-  R_SPARC_LOPLT10     = 26,
-  R_SPARC_PCPLT32     = 27,
-  R_SPARC_PCPLT22     = 28,
-  R_SPARC_PCPLT10     = 29,
-  R_SPARC_10          = 30,
-  R_SPARC_11          = 31,
-  R_SPARC_64          = 32,
-  R_SPARC_OLO10       = 33,
-  R_SPARC_HH22        = 34,
-  R_SPARC_HM10        = 35,
-  R_SPARC_LM22        = 36,
-  R_SPARC_PC_HH22     = 37,
-  R_SPARC_PC_HM10     = 38,
-  R_SPARC_PC_LM22     = 39,
-  R_SPARC_WDISP16     = 40,
-  R_SPARC_WDISP19     = 41,
-  R_SPARC_7           = 43,
-  R_SPARC_5           = 44,
-  R_SPARC_6           = 45,
-  R_SPARC_DISP64      = 46,
-  R_SPARC_PLT64       = 47,
-  R_SPARC_HIX22       = 48,
-  R_SPARC_LOX10       = 49,
-  R_SPARC_H44         = 50,
-  R_SPARC_M44         = 51,
-  R_SPARC_L44         = 52,
-  R_SPARC_REGISTER    = 53,
-  R_SPARC_UA64        = 54,
-  R_SPARC_UA16        = 55,
-  R_SPARC_TLS_GD_HI22   = 56,
-  R_SPARC_TLS_GD_LO10   = 57,
-  R_SPARC_TLS_GD_ADD    = 58,
-  R_SPARC_TLS_GD_CALL   = 59,
-  R_SPARC_TLS_LDM_HI22  = 60,
-  R_SPARC_TLS_LDM_LO10  = 61,
-  R_SPARC_TLS_LDM_ADD   = 62,
-  R_SPARC_TLS_LDM_CALL  = 63,
-  R_SPARC_TLS_LDO_HIX22 = 64,
-  R_SPARC_TLS_LDO_LOX10 = 65,
-  R_SPARC_TLS_LDO_ADD   = 66,
-  R_SPARC_TLS_IE_HI22   = 67,
-  R_SPARC_TLS_IE_LO10   = 68,
-  R_SPARC_TLS_IE_LD     = 69,
-  R_SPARC_TLS_IE_LDX    = 70,
-  R_SPARC_TLS_IE_ADD    = 71,
-  R_SPARC_TLS_LE_HIX22  = 72,
-  R_SPARC_TLS_LE_LOX10  = 73,
-  R_SPARC_TLS_DTPMOD32  = 74,
-  R_SPARC_TLS_DTPMOD64  = 75,
-  R_SPARC_TLS_DTPOFF32  = 76,
-  R_SPARC_TLS_DTPOFF64  = 77,
-  R_SPARC_TLS_TPOFF32   = 78,
-  R_SPARC_TLS_TPOFF64   = 79,
-  R_SPARC_GOTDATA_HIX22 = 80,
-  R_SPARC_GOTDATA_LOX22 = 81,
-  R_SPARC_GOTDATA_OP_HIX22 = 82,
-  R_SPARC_GOTDATA_OP_LOX22 = 83,
-  R_SPARC_GOTDATA_OP    = 84
+#include "ELFRelocs/Sparc.def"
 };
 
+#undef ELF_RELOC
+
 // Section header.
 struct Elf32_Shdr {
   Elf32_Word sh_name;      // Section name (index into string table)
@@ -1510,6 +777,7 @@ enum {
   STB_LOCAL = 0,   // Local symbol, not visible outside obj file containing def
   STB_GLOBAL = 1,  // Global symbol, visible to all object files being combined
   STB_WEAK = 2,    // Weak symbol, like global but lower-precedence
+  STB_GNU_UNIQUE = 10,
   STB_LOOS   = 10, // Lowest operating system-specific binding type
   STB_HIOS   = 12, // Highest operating system-specific binding type
   STB_LOPROC = 13, // Lowest processor-specific binding type
@@ -1544,6 +812,14 @@ enum {
   STN_UNDEF = 0
 };
 
+// Special relocation symbols used in the MIPS64 ELF relocation entries
+enum {
+  RSS_UNDEF = 0, // None
+  RSS_GP = 1,    // Value of gp
+  RSS_GP0 = 2,   // Value of gp used to create object being relocated
+  RSS_LOC = 3    // Address of location being relocated
+};
+
 // Relocation entry, without explicit addend.
 struct Elf32_Rel {
   Elf32_Addr r_offset; // Location (file byte offset, or program virtual addr)
diff --git a/include/llvm/Support/ELFRelocs/AArch64.def b/include/llvm/Support/ELFRelocs/AArch64.def
new file mode 100644
index 0000000..aa0c560
--- /dev/null
+++ b/include/llvm/Support/ELFRelocs/AArch64.def
@@ -0,0 +1,147 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+// ABI release 1.0
+ELF_RELOC(R_AARCH64_NONE,                             0)
+
+ELF_RELOC(R_AARCH64_ABS64,                        0x101)
+ELF_RELOC(R_AARCH64_ABS32,                        0x102)
+ELF_RELOC(R_AARCH64_ABS16,                        0x103)
+ELF_RELOC(R_AARCH64_PREL64,                       0x104)
+ELF_RELOC(R_AARCH64_PREL32,                       0x105)
+ELF_RELOC(R_AARCH64_PREL16,                       0x106)
+
+ELF_RELOC(R_AARCH64_MOVW_UABS_G0,                 0x107)
+ELF_RELOC(R_AARCH64_MOVW_UABS_G0_NC,              0x108)
+ELF_RELOC(R_AARCH64_MOVW_UABS_G1,                 0x109)
+ELF_RELOC(R_AARCH64_MOVW_UABS_G1_NC,              0x10a)
+ELF_RELOC(R_AARCH64_MOVW_UABS_G2,                 0x10b)
+ELF_RELOC(R_AARCH64_MOVW_UABS_G2_NC,              0x10c)
+ELF_RELOC(R_AARCH64_MOVW_UABS_G3,                 0x10d)
+ELF_RELOC(R_AARCH64_MOVW_SABS_G0,                 0x10e)
+ELF_RELOC(R_AARCH64_MOVW_SABS_G1,                 0x10f)
+ELF_RELOC(R_AARCH64_MOVW_SABS_G2,                 0x110)
+
+ELF_RELOC(R_AARCH64_LD_PREL_LO19,                 0x111)
+ELF_RELOC(R_AARCH64_ADR_PREL_LO21,                0x112)
+ELF_RELOC(R_AARCH64_ADR_PREL_PG_HI21,             0x113)
+ELF_RELOC(R_AARCH64_ADR_PREL_PG_HI21_NC,          0x114)
+ELF_RELOC(R_AARCH64_ADD_ABS_LO12_NC,              0x115)
+ELF_RELOC(R_AARCH64_LDST8_ABS_LO12_NC,            0x116)
+
+ELF_RELOC(R_AARCH64_TSTBR14,                      0x117)
+ELF_RELOC(R_AARCH64_CONDBR19,                     0x118)
+ELF_RELOC(R_AARCH64_JUMP26,                       0x11a)
+ELF_RELOC(R_AARCH64_CALL26,                       0x11b)
+
+ELF_RELOC(R_AARCH64_LDST16_ABS_LO12_NC,           0x11c)
+ELF_RELOC(R_AARCH64_LDST32_ABS_LO12_NC,           0x11d)
+ELF_RELOC(R_AARCH64_LDST64_ABS_LO12_NC,           0x11e)
+
+ELF_RELOC(R_AARCH64_MOVW_PREL_G0,                 0x11f)
+ELF_RELOC(R_AARCH64_MOVW_PREL_G0_NC,              0x120)
+ELF_RELOC(R_AARCH64_MOVW_PREL_G1,                 0x121)
+ELF_RELOC(R_AARCH64_MOVW_PREL_G1_NC,              0x122)
+ELF_RELOC(R_AARCH64_MOVW_PREL_G2,                 0x123)
+ELF_RELOC(R_AARCH64_MOVW_PREL_G2_NC,              0x124)
+ELF_RELOC(R_AARCH64_MOVW_PREL_G3,                 0x125)
+
+ELF_RELOC(R_AARCH64_LDST128_ABS_LO12_NC,          0x12b)
+
+ELF_RELOC(R_AARCH64_MOVW_GOTOFF_G0,               0x12c)
+ELF_RELOC(R_AARCH64_MOVW_GOTOFF_G0_NC,            0x12d)
+ELF_RELOC(R_AARCH64_MOVW_GOTOFF_G1,               0x12e)
+ELF_RELOC(R_AARCH64_MOVW_GOTOFF_G1_NC,            0x12f)
+ELF_RELOC(R_AARCH64_MOVW_GOTOFF_G2,               0x130)
+ELF_RELOC(R_AARCH64_MOVW_GOTOFF_G2_NC,            0x131)
+ELF_RELOC(R_AARCH64_MOVW_GOTOFF_G3,               0x132)
+
+ELF_RELOC(R_AARCH64_GOTREL64,                     0x133)
+ELF_RELOC(R_AARCH64_GOTREL32,                     0x134)
+
+ELF_RELOC(R_AARCH64_GOT_LD_PREL19,                0x135)
+ELF_RELOC(R_AARCH64_LD64_GOTOFF_LO15,             0x136)
+ELF_RELOC(R_AARCH64_ADR_GOT_PAGE,                 0x137)
+ELF_RELOC(R_AARCH64_LD64_GOT_LO12_NC,             0x138)
+ELF_RELOC(R_AARCH64_LD64_GOTPAGE_LO15,            0x139)
+
+ELF_RELOC(R_AARCH64_TLSGD_ADR_PREL21,             0x200)
+ELF_RELOC(R_AARCH64_TLSGD_ADR_PAGE21,             0x201)
+ELF_RELOC(R_AARCH64_TLSGD_ADD_LO12_NC,            0x202)
+ELF_RELOC(R_AARCH64_TLSGD_MOVW_G1,                0x203)
+ELF_RELOC(R_AARCH64_TLSGD_MOVW_G0_NC,             0x204)
+
+ELF_RELOC(R_AARCH64_TLSLD_ADR_PREL21,             0x205)
+ELF_RELOC(R_AARCH64_TLSLD_ADR_PAGE21,             0x206)
+ELF_RELOC(R_AARCH64_TLSLD_ADD_LO12_NC,            0x207)
+ELF_RELOC(R_AARCH64_TLSLD_MOVW_G1,                0x208)
+ELF_RELOC(R_AARCH64_TLSLD_MOVW_G0_NC,             0x209)
+ELF_RELOC(R_AARCH64_TLSLD_LD_PREL19,              0x20a)
+ELF_RELOC(R_AARCH64_TLSLD_MOVW_DTPREL_G2,         0x20b)
+ELF_RELOC(R_AARCH64_TLSLD_MOVW_DTPREL_G1,         0x20c)
+ELF_RELOC(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC,      0x20d)
+ELF_RELOC(R_AARCH64_TLSLD_MOVW_DTPREL_G0,         0x20e)
+ELF_RELOC(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC,      0x20f)
+ELF_RELOC(R_AARCH64_TLSLD_ADD_DTPREL_HI12,        0x210)
+ELF_RELOC(R_AARCH64_TLSLD_ADD_DTPREL_LO12,        0x211)
+ELF_RELOC(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC,     0x212)
+ELF_RELOC(R_AARCH64_TLSLD_LDST8_DTPREL_LO12,      0x213)
+ELF_RELOC(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC,   0x214)
+ELF_RELOC(R_AARCH64_TLSLD_LDST16_DTPREL_LO12,     0x215)
+ELF_RELOC(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC,  0x216)
+ELF_RELOC(R_AARCH64_TLSLD_LDST32_DTPREL_LO12,     0x217)
+ELF_RELOC(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC,  0x218)
+ELF_RELOC(R_AARCH64_TLSLD_LDST64_DTPREL_LO12,     0x219)
+ELF_RELOC(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC,  0x21a)
+
+ELF_RELOC(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1,       0x21b)
+ELF_RELOC(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC,    0x21c)
+ELF_RELOC(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21,    0x21d)
+ELF_RELOC(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC,  0x21e)
+ELF_RELOC(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19,     0x21f)
+
+ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G2,          0x220)
+ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G1,          0x221)
+ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC,       0x222)
+ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G0,          0x223)
+ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC,       0x224)
+ELF_RELOC(R_AARCH64_TLSLE_ADD_TPREL_HI12,         0x225)
+ELF_RELOC(R_AARCH64_TLSLE_ADD_TPREL_LO12,         0x226)
+ELF_RELOC(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC,      0x227)
+ELF_RELOC(R_AARCH64_TLSLE_LDST8_TPREL_LO12,       0x228)
+ELF_RELOC(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC,    0x229)
+ELF_RELOC(R_AARCH64_TLSLE_LDST16_TPREL_LO12,      0x22a)
+ELF_RELOC(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC,   0x22b)
+ELF_RELOC(R_AARCH64_TLSLE_LDST32_TPREL_LO12,      0x22c)
+ELF_RELOC(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC,   0x22d)
+ELF_RELOC(R_AARCH64_TLSLE_LDST64_TPREL_LO12,      0x22e)
+ELF_RELOC(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC,   0x22f)
+
+ELF_RELOC(R_AARCH64_TLSDESC_LD_PREL19,            0x230)
+ELF_RELOC(R_AARCH64_TLSDESC_ADR_PREL21,           0x231)
+ELF_RELOC(R_AARCH64_TLSDESC_ADR_PAGE21,           0x232)
+ELF_RELOC(R_AARCH64_TLSDESC_LD64_LO12_NC,         0x233)
+ELF_RELOC(R_AARCH64_TLSDESC_ADD_LO12_NC,          0x234)
+ELF_RELOC(R_AARCH64_TLSDESC_OFF_G1,               0x235)
+ELF_RELOC(R_AARCH64_TLSDESC_OFF_G0_NC,            0x236)
+ELF_RELOC(R_AARCH64_TLSDESC_LDR,                  0x237)
+ELF_RELOC(R_AARCH64_TLSDESC_ADD,                  0x238)
+ELF_RELOC(R_AARCH64_TLSDESC_CALL,                 0x239)
+
+ELF_RELOC(R_AARCH64_TLSLE_LDST128_TPREL_LO12,     0x23a)
+ELF_RELOC(R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC,  0x23b)
+
+ELF_RELOC(R_AARCH64_TLSLD_LDST128_DTPREL_LO12,    0x23c)
+ELF_RELOC(R_AARCH64_TLSLD_LDST128_DTPREL_LO12_NC, 0x23d)
+
+ELF_RELOC(R_AARCH64_COPY,                         0x400)
+ELF_RELOC(R_AARCH64_GLOB_DAT,                     0x401)
+ELF_RELOC(R_AARCH64_JUMP_SLOT,                    0x402)
+ELF_RELOC(R_AARCH64_RELATIVE,                     0x403)
+ELF_RELOC(R_AARCH64_TLS_DTPREL64,                 0x404)
+ELF_RELOC(R_AARCH64_TLS_DTPMOD64,                 0x405)
+ELF_RELOC(R_AARCH64_TLS_TPREL64,                  0x406)
+ELF_RELOC(R_AARCH64_TLSDESC,                      0x407)
+ELF_RELOC(R_AARCH64_IRELATIVE,                    0x408)
diff --git a/include/llvm/Support/ELFRelocs/ARM.def b/include/llvm/Support/ELFRelocs/ARM.def
new file mode 100644
index 0000000..730fc5b
--- /dev/null
+++ b/include/llvm/Support/ELFRelocs/ARM.def
@@ -0,0 +1,138 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+// Meets 2.09 ABI Specs.
+ELF_RELOC(R_ARM_NONE,                   0x00)
+ELF_RELOC(R_ARM_PC24,                   0x01)
+ELF_RELOC(R_ARM_ABS32,                  0x02)
+ELF_RELOC(R_ARM_REL32,                  0x03)
+ELF_RELOC(R_ARM_LDR_PC_G0,              0x04)
+ELF_RELOC(R_ARM_ABS16,                  0x05)
+ELF_RELOC(R_ARM_ABS12,                  0x06)
+ELF_RELOC(R_ARM_THM_ABS5,               0x07)
+ELF_RELOC(R_ARM_ABS8,                   0x08)
+ELF_RELOC(R_ARM_SBREL32,                0x09)
+ELF_RELOC(R_ARM_THM_CALL,               0x0a)
+ELF_RELOC(R_ARM_THM_PC8,                0x0b)
+ELF_RELOC(R_ARM_BREL_ADJ,               0x0c)
+ELF_RELOC(R_ARM_TLS_DESC,               0x0d)
+ELF_RELOC(R_ARM_THM_SWI8,               0x0e)
+ELF_RELOC(R_ARM_XPC25,                  0x0f)
+ELF_RELOC(R_ARM_THM_XPC22,              0x10)
+ELF_RELOC(R_ARM_TLS_DTPMOD32,           0x11)
+ELF_RELOC(R_ARM_TLS_DTPOFF32,           0x12)
+ELF_RELOC(R_ARM_TLS_TPOFF32,            0x13)
+ELF_RELOC(R_ARM_COPY,                   0x14)
+ELF_RELOC(R_ARM_GLOB_DAT,               0x15)
+ELF_RELOC(R_ARM_JUMP_SLOT,              0x16)
+ELF_RELOC(R_ARM_RELATIVE,               0x17)
+ELF_RELOC(R_ARM_GOTOFF32,               0x18)
+ELF_RELOC(R_ARM_BASE_PREL,              0x19)
+ELF_RELOC(R_ARM_GOT_BREL,               0x1a)
+ELF_RELOC(R_ARM_PLT32,                  0x1b)
+ELF_RELOC(R_ARM_CALL,                   0x1c)
+ELF_RELOC(R_ARM_JUMP24,                 0x1d)
+ELF_RELOC(R_ARM_THM_JUMP24,             0x1e)
+ELF_RELOC(R_ARM_BASE_ABS,               0x1f)
+ELF_RELOC(R_ARM_ALU_PCREL_7_0,          0x20)
+ELF_RELOC(R_ARM_ALU_PCREL_15_8,         0x21)
+ELF_RELOC(R_ARM_ALU_PCREL_23_15,        0x22)
+ELF_RELOC(R_ARM_LDR_SBREL_11_0_NC,      0x23)
+ELF_RELOC(R_ARM_ALU_SBREL_19_12_NC,     0x24)
+ELF_RELOC(R_ARM_ALU_SBREL_27_20_CK,     0x25)
+ELF_RELOC(R_ARM_TARGET1,                0x26)
+ELF_RELOC(R_ARM_SBREL31,                0x27)
+ELF_RELOC(R_ARM_V4BX,                   0x28)
+ELF_RELOC(R_ARM_TARGET2,                0x29)
+ELF_RELOC(R_ARM_PREL31,                 0x2a)
+ELF_RELOC(R_ARM_MOVW_ABS_NC,            0x2b)
+ELF_RELOC(R_ARM_MOVT_ABS,               0x2c)
+ELF_RELOC(R_ARM_MOVW_PREL_NC,           0x2d)
+ELF_RELOC(R_ARM_MOVT_PREL,              0x2e)
+ELF_RELOC(R_ARM_THM_MOVW_ABS_NC,        0x2f)
+ELF_RELOC(R_ARM_THM_MOVT_ABS,           0x30)
+ELF_RELOC(R_ARM_THM_MOVW_PREL_NC,       0x31)
+ELF_RELOC(R_ARM_THM_MOVT_PREL,          0x32)
+ELF_RELOC(R_ARM_THM_JUMP19,             0x33)
+ELF_RELOC(R_ARM_THM_JUMP6,              0x34)
+ELF_RELOC(R_ARM_THM_ALU_PREL_11_0,      0x35)
+ELF_RELOC(R_ARM_THM_PC12,               0x36)
+ELF_RELOC(R_ARM_ABS32_NOI,              0x37)
+ELF_RELOC(R_ARM_REL32_NOI,              0x38)
+ELF_RELOC(R_ARM_ALU_PC_G0_NC,           0x39)
+ELF_RELOC(R_ARM_ALU_PC_G0,              0x3a)
+ELF_RELOC(R_ARM_ALU_PC_G1_NC,           0x3b)
+ELF_RELOC(R_ARM_ALU_PC_G1,              0x3c)
+ELF_RELOC(R_ARM_ALU_PC_G2,              0x3d)
+ELF_RELOC(R_ARM_LDR_PC_G1,              0x3e)
+ELF_RELOC(R_ARM_LDR_PC_G2,              0x3f)
+ELF_RELOC(R_ARM_LDRS_PC_G0,             0x40)
+ELF_RELOC(R_ARM_LDRS_PC_G1,             0x41)
+ELF_RELOC(R_ARM_LDRS_PC_G2,             0x42)
+ELF_RELOC(R_ARM_LDC_PC_G0,              0x43)
+ELF_RELOC(R_ARM_LDC_PC_G1,              0x44)
+ELF_RELOC(R_ARM_LDC_PC_G2,              0x45)
+ELF_RELOC(R_ARM_ALU_SB_G0_NC,           0x46)
+ELF_RELOC(R_ARM_ALU_SB_G0,              0x47)
+ELF_RELOC(R_ARM_ALU_SB_G1_NC,           0x48)
+ELF_RELOC(R_ARM_ALU_SB_G1,              0x49)
+ELF_RELOC(R_ARM_ALU_SB_G2,              0x4a)
+ELF_RELOC(R_ARM_LDR_SB_G0,              0x4b)
+ELF_RELOC(R_ARM_LDR_SB_G1,              0x4c)
+ELF_RELOC(R_ARM_LDR_SB_G2,              0x4d)
+ELF_RELOC(R_ARM_LDRS_SB_G0,             0x4e)
+ELF_RELOC(R_ARM_LDRS_SB_G1,             0x4f)
+ELF_RELOC(R_ARM_LDRS_SB_G2,             0x50)
+ELF_RELOC(R_ARM_LDC_SB_G0,              0x51)
+ELF_RELOC(R_ARM_LDC_SB_G1,              0x52)
+ELF_RELOC(R_ARM_LDC_SB_G2,              0x53)
+ELF_RELOC(R_ARM_MOVW_BREL_NC,           0x54)
+ELF_RELOC(R_ARM_MOVT_BREL,              0x55)
+ELF_RELOC(R_ARM_MOVW_BREL,              0x56)
+ELF_RELOC(R_ARM_THM_MOVW_BREL_NC,       0x57)
+ELF_RELOC(R_ARM_THM_MOVT_BREL,          0x58)
+ELF_RELOC(R_ARM_THM_MOVW_BREL,          0x59)
+ELF_RELOC(R_ARM_TLS_GOTDESC,            0x5a)
+ELF_RELOC(R_ARM_TLS_CALL,               0x5b)
+ELF_RELOC(R_ARM_TLS_DESCSEQ,            0x5c)
+ELF_RELOC(R_ARM_THM_TLS_CALL,           0x5d)
+ELF_RELOC(R_ARM_PLT32_ABS,              0x5e)
+ELF_RELOC(R_ARM_GOT_ABS,                0x5f)
+ELF_RELOC(R_ARM_GOT_PREL,               0x60)
+ELF_RELOC(R_ARM_GOT_BREL12,             0x61)
+ELF_RELOC(R_ARM_GOTOFF12,               0x62)
+ELF_RELOC(R_ARM_GOTRELAX,               0x63)
+ELF_RELOC(R_ARM_GNU_VTENTRY,            0x64)
+ELF_RELOC(R_ARM_GNU_VTINHERIT,          0x65)
+ELF_RELOC(R_ARM_THM_JUMP11,             0x66)
+ELF_RELOC(R_ARM_THM_JUMP8,              0x67)
+ELF_RELOC(R_ARM_TLS_GD32,               0x68)
+ELF_RELOC(R_ARM_TLS_LDM32,              0x69)
+ELF_RELOC(R_ARM_TLS_LDO32,              0x6a)
+ELF_RELOC(R_ARM_TLS_IE32,               0x6b)
+ELF_RELOC(R_ARM_TLS_LE32,               0x6c)
+ELF_RELOC(R_ARM_TLS_LDO12,              0x6d)
+ELF_RELOC(R_ARM_TLS_LE12,               0x6e)
+ELF_RELOC(R_ARM_TLS_IE12GP,             0x6f)
+ELF_RELOC(R_ARM_PRIVATE_0,              0x70)
+ELF_RELOC(R_ARM_PRIVATE_1,              0x71)
+ELF_RELOC(R_ARM_PRIVATE_2,              0x72)
+ELF_RELOC(R_ARM_PRIVATE_3,              0x73)
+ELF_RELOC(R_ARM_PRIVATE_4,              0x74)
+ELF_RELOC(R_ARM_PRIVATE_5,              0x75)
+ELF_RELOC(R_ARM_PRIVATE_6,              0x76)
+ELF_RELOC(R_ARM_PRIVATE_7,              0x77)
+ELF_RELOC(R_ARM_PRIVATE_8,              0x78)
+ELF_RELOC(R_ARM_PRIVATE_9,              0x79)
+ELF_RELOC(R_ARM_PRIVATE_10,             0x7a)
+ELF_RELOC(R_ARM_PRIVATE_11,             0x7b)
+ELF_RELOC(R_ARM_PRIVATE_12,             0x7c)
+ELF_RELOC(R_ARM_PRIVATE_13,             0x7d)
+ELF_RELOC(R_ARM_PRIVATE_14,             0x7e)
+ELF_RELOC(R_ARM_PRIVATE_15,             0x7f)
+ELF_RELOC(R_ARM_ME_TOO,                 0x80)
+ELF_RELOC(R_ARM_THM_TLS_DESCSEQ16,      0x81)
+ELF_RELOC(R_ARM_THM_TLS_DESCSEQ32,      0x82)
+ELF_RELOC(R_ARM_IRELATIVE,              0xa0)
diff --git a/include/llvm/Support/ELFRelocs/Hexagon.def b/include/llvm/Support/ELFRelocs/Hexagon.def
new file mode 100644
index 0000000..c9d35b8
--- /dev/null
+++ b/include/llvm/Support/ELFRelocs/Hexagon.def
@@ -0,0 +1,92 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+// Release 5 ABI
+ELF_RELOC(R_HEX_NONE,                0)
+ELF_RELOC(R_HEX_B22_PCREL,           1)
+ELF_RELOC(R_HEX_B15_PCREL,           2)
+ELF_RELOC(R_HEX_B7_PCREL,            3)
+ELF_RELOC(R_HEX_LO16,                4)
+ELF_RELOC(R_HEX_HI16,                5)
+ELF_RELOC(R_HEX_32,                  6)
+ELF_RELOC(R_HEX_16,                  7)
+ELF_RELOC(R_HEX_8,                   8)
+ELF_RELOC(R_HEX_GPREL16_0,           9)
+ELF_RELOC(R_HEX_GPREL16_1,           10)
+ELF_RELOC(R_HEX_GPREL16_2,           11)
+ELF_RELOC(R_HEX_GPREL16_3,           12)
+ELF_RELOC(R_HEX_HL16,                13)
+ELF_RELOC(R_HEX_B13_PCREL,           14)
+ELF_RELOC(R_HEX_B9_PCREL,            15)
+ELF_RELOC(R_HEX_B32_PCREL_X,         16)
+ELF_RELOC(R_HEX_32_6_X,              17)
+ELF_RELOC(R_HEX_B22_PCREL_X,         18)
+ELF_RELOC(R_HEX_B15_PCREL_X,         19)
+ELF_RELOC(R_HEX_B13_PCREL_X,         20)
+ELF_RELOC(R_HEX_B9_PCREL_X,          21)
+ELF_RELOC(R_HEX_B7_PCREL_X,          22)
+ELF_RELOC(R_HEX_16_X,                23)
+ELF_RELOC(R_HEX_12_X,                24)
+ELF_RELOC(R_HEX_11_X,                25)
+ELF_RELOC(R_HEX_10_X,                26)
+ELF_RELOC(R_HEX_9_X,                 27)
+ELF_RELOC(R_HEX_8_X,                 28)
+ELF_RELOC(R_HEX_7_X,                 29)
+ELF_RELOC(R_HEX_6_X,                 30)
+ELF_RELOC(R_HEX_32_PCREL,            31)
+ELF_RELOC(R_HEX_COPY,                32)
+ELF_RELOC(R_HEX_GLOB_DAT,            33)
+ELF_RELOC(R_HEX_JMP_SLOT,            34)
+ELF_RELOC(R_HEX_RELATIVE,            35)
+ELF_RELOC(R_HEX_PLT_B22_PCREL,       36)
+ELF_RELOC(R_HEX_GOTREL_LO16,         37)
+ELF_RELOC(R_HEX_GOTREL_HI16,         38)
+ELF_RELOC(R_HEX_GOTREL_32,           39)
+ELF_RELOC(R_HEX_GOT_LO16,            40)
+ELF_RELOC(R_HEX_GOT_HI16,            41)
+ELF_RELOC(R_HEX_GOT_32,              42)
+ELF_RELOC(R_HEX_GOT_16,              43)
+ELF_RELOC(R_HEX_DTPMOD_32,           44)
+ELF_RELOC(R_HEX_DTPREL_LO16,         45)
+ELF_RELOC(R_HEX_DTPREL_HI16,         46)
+ELF_RELOC(R_HEX_DTPREL_32,           47)
+ELF_RELOC(R_HEX_DTPREL_16,           48)
+ELF_RELOC(R_HEX_GD_PLT_B22_PCREL,    49)
+ELF_RELOC(R_HEX_GD_GOT_LO16,         50)
+ELF_RELOC(R_HEX_GD_GOT_HI16,         51)
+ELF_RELOC(R_HEX_GD_GOT_32,           52)
+ELF_RELOC(R_HEX_GD_GOT_16,           53)
+ELF_RELOC(R_HEX_IE_LO16,             54)
+ELF_RELOC(R_HEX_IE_HI16,             55)
+ELF_RELOC(R_HEX_IE_32,               56)
+ELF_RELOC(R_HEX_IE_GOT_LO16,         57)
+ELF_RELOC(R_HEX_IE_GOT_HI16,         58)
+ELF_RELOC(R_HEX_IE_GOT_32,           59)
+ELF_RELOC(R_HEX_IE_GOT_16,           60)
+ELF_RELOC(R_HEX_TPREL_LO16,          61)
+ELF_RELOC(R_HEX_TPREL_HI16,          62)
+ELF_RELOC(R_HEX_TPREL_32,            63)
+ELF_RELOC(R_HEX_TPREL_16,            64)
+ELF_RELOC(R_HEX_6_PCREL_X,           65)
+ELF_RELOC(R_HEX_GOTREL_32_6_X,       66)
+ELF_RELOC(R_HEX_GOTREL_16_X,         67)
+ELF_RELOC(R_HEX_GOTREL_11_X,         68)
+ELF_RELOC(R_HEX_GOT_32_6_X,          69)
+ELF_RELOC(R_HEX_GOT_16_X,            70)
+ELF_RELOC(R_HEX_GOT_11_X,            71)
+ELF_RELOC(R_HEX_DTPREL_32_6_X,       72)
+ELF_RELOC(R_HEX_DTPREL_16_X,         73)
+ELF_RELOC(R_HEX_DTPREL_11_X,         74)
+ELF_RELOC(R_HEX_GD_GOT_32_6_X,       75)
+ELF_RELOC(R_HEX_GD_GOT_16_X,         76)
+ELF_RELOC(R_HEX_GD_GOT_11_X,         77)
+ELF_RELOC(R_HEX_IE_32_6_X,           78)
+ELF_RELOC(R_HEX_IE_16_X,             79)
+ELF_RELOC(R_HEX_IE_GOT_32_6_X,       80)
+ELF_RELOC(R_HEX_IE_GOT_16_X,         81)
+ELF_RELOC(R_HEX_IE_GOT_11_X,         82)
+ELF_RELOC(R_HEX_TPREL_32_6_X,        83)
+ELF_RELOC(R_HEX_TPREL_16_X,          84)
+ELF_RELOC(R_HEX_TPREL_11_X,          85)
diff --git a/include/llvm/Support/ELFRelocs/Mips.def b/include/llvm/Support/ELFRelocs/Mips.def
new file mode 100644
index 0000000..dc57346
--- /dev/null
+++ b/include/llvm/Support/ELFRelocs/Mips.def
@@ -0,0 +1,112 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+ELF_RELOC(R_MIPS_NONE,                0)
+ELF_RELOC(R_MIPS_16,                  1)
+ELF_RELOC(R_MIPS_32,                  2)
+ELF_RELOC(R_MIPS_REL32,               3)
+ELF_RELOC(R_MIPS_26,                  4)
+ELF_RELOC(R_MIPS_HI16,                5)
+ELF_RELOC(R_MIPS_LO16,                6)
+ELF_RELOC(R_MIPS_GPREL16,             7)
+ELF_RELOC(R_MIPS_LITERAL,             8)
+ELF_RELOC(R_MIPS_GOT16,               9)
+ELF_RELOC(R_MIPS_PC16,               10)
+ELF_RELOC(R_MIPS_CALL16,             11)
+ELF_RELOC(R_MIPS_GPREL32,            12)
+ELF_RELOC(R_MIPS_UNUSED1,            13)
+ELF_RELOC(R_MIPS_UNUSED2,            14)
+ELF_RELOC(R_MIPS_UNUSED3,            15)
+ELF_RELOC(R_MIPS_SHIFT5,             16)
+ELF_RELOC(R_MIPS_SHIFT6,             17)
+ELF_RELOC(R_MIPS_64,                 18)
+ELF_RELOC(R_MIPS_GOT_DISP,           19)
+ELF_RELOC(R_MIPS_GOT_PAGE,           20)
+ELF_RELOC(R_MIPS_GOT_OFST,           21)
+ELF_RELOC(R_MIPS_GOT_HI16,           22)
+ELF_RELOC(R_MIPS_GOT_LO16,           23)
+ELF_RELOC(R_MIPS_SUB,                24)
+ELF_RELOC(R_MIPS_INSERT_A,           25)
+ELF_RELOC(R_MIPS_INSERT_B,           26)
+ELF_RELOC(R_MIPS_DELETE,             27)
+ELF_RELOC(R_MIPS_HIGHER,             28)
+ELF_RELOC(R_MIPS_HIGHEST,            29)
+ELF_RELOC(R_MIPS_CALL_HI16,          30)
+ELF_RELOC(R_MIPS_CALL_LO16,          31)
+ELF_RELOC(R_MIPS_SCN_DISP,           32)
+ELF_RELOC(R_MIPS_REL16,              33)
+ELF_RELOC(R_MIPS_ADD_IMMEDIATE,      34)
+ELF_RELOC(R_MIPS_PJUMP,              35)
+ELF_RELOC(R_MIPS_RELGOT,             36)
+ELF_RELOC(R_MIPS_JALR,               37)
+ELF_RELOC(R_MIPS_TLS_DTPMOD32,       38)
+ELF_RELOC(R_MIPS_TLS_DTPREL32,       39)
+ELF_RELOC(R_MIPS_TLS_DTPMOD64,       40)
+ELF_RELOC(R_MIPS_TLS_DTPREL64,       41)
+ELF_RELOC(R_MIPS_TLS_GD,             42)
+ELF_RELOC(R_MIPS_TLS_LDM,            43)
+ELF_RELOC(R_MIPS_TLS_DTPREL_HI16,    44)
+ELF_RELOC(R_MIPS_TLS_DTPREL_LO16,    45)
+ELF_RELOC(R_MIPS_TLS_GOTTPREL,       46)
+ELF_RELOC(R_MIPS_TLS_TPREL32,        47)
+ELF_RELOC(R_MIPS_TLS_TPREL64,        48)
+ELF_RELOC(R_MIPS_TLS_TPREL_HI16,     49)
+ELF_RELOC(R_MIPS_TLS_TPREL_LO16,     50)
+ELF_RELOC(R_MIPS_GLOB_DAT,           51)
+ELF_RELOC(R_MIPS_PC21_S2,            60)
+ELF_RELOC(R_MIPS_PC26_S2,            61)
+ELF_RELOC(R_MIPS_PC18_S3,            62)
+ELF_RELOC(R_MIPS_PC19_S2,            63)
+ELF_RELOC(R_MIPS_PCHI16,             64)
+ELF_RELOC(R_MIPS_PCLO16,             65)
+ELF_RELOC(R_MIPS16_26,               100)
+ELF_RELOC(R_MIPS16_GPREL,            101)
+ELF_RELOC(R_MIPS16_GOT16,            102)
+ELF_RELOC(R_MIPS16_CALL16,           103)
+ELF_RELOC(R_MIPS16_HI16,             104)
+ELF_RELOC(R_MIPS16_LO16,             105)
+ELF_RELOC(R_MIPS16_TLS_GD,           106)
+ELF_RELOC(R_MIPS16_TLS_LDM,          107)
+ELF_RELOC(R_MIPS16_TLS_DTPREL_HI16,  108)
+ELF_RELOC(R_MIPS16_TLS_DTPREL_LO16,  109)
+ELF_RELOC(R_MIPS16_TLS_GOTTPREL,     110)
+ELF_RELOC(R_MIPS16_TLS_TPREL_HI16,   111)
+ELF_RELOC(R_MIPS16_TLS_TPREL_LO16,   112)
+ELF_RELOC(R_MIPS_COPY,               126)
+ELF_RELOC(R_MIPS_JUMP_SLOT,          127)
+ELF_RELOC(R_MICROMIPS_26_S1,         133)
+ELF_RELOC(R_MICROMIPS_HI16,          134)
+ELF_RELOC(R_MICROMIPS_LO16,          135)
+ELF_RELOC(R_MICROMIPS_GPREL16,       136)
+ELF_RELOC(R_MICROMIPS_LITERAL,       137)
+ELF_RELOC(R_MICROMIPS_GOT16,         138)
+ELF_RELOC(R_MICROMIPS_PC7_S1,        139)
+ELF_RELOC(R_MICROMIPS_PC10_S1,       140)
+ELF_RELOC(R_MICROMIPS_PC16_S1,       141)
+ELF_RELOC(R_MICROMIPS_CALL16,        142)
+ELF_RELOC(R_MICROMIPS_GOT_DISP,      145)
+ELF_RELOC(R_MICROMIPS_GOT_PAGE,      146)
+ELF_RELOC(R_MICROMIPS_GOT_OFST,      147)
+ELF_RELOC(R_MICROMIPS_GOT_HI16,      148)
+ELF_RELOC(R_MICROMIPS_GOT_LO16,      149)
+ELF_RELOC(R_MICROMIPS_SUB,           150)
+ELF_RELOC(R_MICROMIPS_HIGHER,        151)
+ELF_RELOC(R_MICROMIPS_HIGHEST,       152)
+ELF_RELOC(R_MICROMIPS_CALL_HI16,     153)
+ELF_RELOC(R_MICROMIPS_CALL_LO16,     154)
+ELF_RELOC(R_MICROMIPS_SCN_DISP,      155)
+ELF_RELOC(R_MICROMIPS_JALR,          156)
+ELF_RELOC(R_MICROMIPS_HI0_LO16,      157)
+ELF_RELOC(R_MICROMIPS_TLS_GD,           162)
+ELF_RELOC(R_MICROMIPS_TLS_LDM,          163)
+ELF_RELOC(R_MICROMIPS_TLS_DTPREL_HI16,  164)
+ELF_RELOC(R_MICROMIPS_TLS_DTPREL_LO16,  165)
+ELF_RELOC(R_MICROMIPS_TLS_GOTTPREL,     166)
+ELF_RELOC(R_MICROMIPS_TLS_TPREL_HI16,   169)
+ELF_RELOC(R_MICROMIPS_TLS_TPREL_LO16,   170)
+ELF_RELOC(R_MICROMIPS_GPREL7_S2,        172)
+ELF_RELOC(R_MICROMIPS_PC23_S2,          173)
+ELF_RELOC(R_MIPS_NUM,                218)
+ELF_RELOC(R_MIPS_PC32,               248)
diff --git a/include/llvm/Support/ELFRelocs/PowerPC.def b/include/llvm/Support/ELFRelocs/PowerPC.def
new file mode 100644
index 0000000..b6c3941
--- /dev/null
+++ b/include/llvm/Support/ELFRelocs/PowerPC.def
@@ -0,0 +1,61 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+ELF_RELOC(R_PPC_NONE,                   0)      /* No relocation. */
+ELF_RELOC(R_PPC_ADDR32,                 1)
+ELF_RELOC(R_PPC_ADDR24,                 2)
+ELF_RELOC(R_PPC_ADDR16,                 3)
+ELF_RELOC(R_PPC_ADDR16_LO,              4)
+ELF_RELOC(R_PPC_ADDR16_HI,              5)
+ELF_RELOC(R_PPC_ADDR16_HA,              6)
+ELF_RELOC(R_PPC_ADDR14,                 7)
+ELF_RELOC(R_PPC_ADDR14_BRTAKEN,         8)
+ELF_RELOC(R_PPC_ADDR14_BRNTAKEN,        9)
+ELF_RELOC(R_PPC_REL24,                  10)
+ELF_RELOC(R_PPC_REL14,                  11)
+ELF_RELOC(R_PPC_REL14_BRTAKEN,          12)
+ELF_RELOC(R_PPC_REL14_BRNTAKEN,         13)
+ELF_RELOC(R_PPC_GOT16,                  14)
+ELF_RELOC(R_PPC_GOT16_LO,               15)
+ELF_RELOC(R_PPC_GOT16_HI,               16)
+ELF_RELOC(R_PPC_GOT16_HA,               17)
+ELF_RELOC(R_PPC_PLTREL24,               18)
+ELF_RELOC(R_PPC_JMP_SLOT,               21)
+ELF_RELOC(R_PPC_LOCAL24PC,              23)
+ELF_RELOC(R_PPC_REL32,                  26)
+ELF_RELOC(R_PPC_TLS,                    67)
+ELF_RELOC(R_PPC_DTPMOD32,               68)
+ELF_RELOC(R_PPC_TPREL16,                69)
+ELF_RELOC(R_PPC_TPREL16_LO,             70)
+ELF_RELOC(R_PPC_TPREL16_HI,             71)
+ELF_RELOC(R_PPC_TPREL16_HA,             72)
+ELF_RELOC(R_PPC_TPREL32,                73)
+ELF_RELOC(R_PPC_DTPREL16,               74)
+ELF_RELOC(R_PPC_DTPREL16_LO,            75)
+ELF_RELOC(R_PPC_DTPREL16_HI,            76)
+ELF_RELOC(R_PPC_DTPREL16_HA,            77)
+ELF_RELOC(R_PPC_DTPREL32,               78)
+ELF_RELOC(R_PPC_GOT_TLSGD16,            79)
+ELF_RELOC(R_PPC_GOT_TLSGD16_LO,         80)
+ELF_RELOC(R_PPC_GOT_TLSGD16_HI,         81)
+ELF_RELOC(R_PPC_GOT_TLSGD16_HA,         82)
+ELF_RELOC(R_PPC_GOT_TLSLD16,            83)
+ELF_RELOC(R_PPC_GOT_TLSLD16_LO,         84)
+ELF_RELOC(R_PPC_GOT_TLSLD16_HI,         85)
+ELF_RELOC(R_PPC_GOT_TLSLD16_HA,         86)
+ELF_RELOC(R_PPC_GOT_TPREL16,            87)
+ELF_RELOC(R_PPC_GOT_TPREL16_LO,         88)
+ELF_RELOC(R_PPC_GOT_TPREL16_HI,         89)
+ELF_RELOC(R_PPC_GOT_TPREL16_HA,         90)
+ELF_RELOC(R_PPC_GOT_DTPREL16,           91)
+ELF_RELOC(R_PPC_GOT_DTPREL16_LO,        92)
+ELF_RELOC(R_PPC_GOT_DTPREL16_HI,        93)
+ELF_RELOC(R_PPC_GOT_DTPREL16_HA,        94)
+ELF_RELOC(R_PPC_TLSGD,                  95)
+ELF_RELOC(R_PPC_TLSLD,                  96)
+ELF_RELOC(R_PPC_REL16,                  249)
+ELF_RELOC(R_PPC_REL16_LO,               250)
+ELF_RELOC(R_PPC_REL16_HI,               251)
+ELF_RELOC(R_PPC_REL16_HA,               252)
diff --git a/include/llvm/Support/ELFRelocs/PowerPC64.def b/include/llvm/Support/ELFRelocs/PowerPC64.def
new file mode 100644
index 0000000..7b2a3cb
--- /dev/null
+++ b/include/llvm/Support/ELFRelocs/PowerPC64.def
@@ -0,0 +1,88 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+ELF_RELOC(R_PPC64_NONE,                 0)
+ELF_RELOC(R_PPC64_ADDR32,               1)
+ELF_RELOC(R_PPC64_ADDR24,               2)
+ELF_RELOC(R_PPC64_ADDR16,               3)
+ELF_RELOC(R_PPC64_ADDR16_LO,            4)
+ELF_RELOC(R_PPC64_ADDR16_HI,            5)
+ELF_RELOC(R_PPC64_ADDR16_HA,            6)
+ELF_RELOC(R_PPC64_ADDR14,               7)
+ELF_RELOC(R_PPC64_ADDR14_BRTAKEN,       8)
+ELF_RELOC(R_PPC64_ADDR14_BRNTAKEN,      9)
+ELF_RELOC(R_PPC64_REL24,                10)
+ELF_RELOC(R_PPC64_REL14,                11)
+ELF_RELOC(R_PPC64_REL14_BRTAKEN,        12)
+ELF_RELOC(R_PPC64_REL14_BRNTAKEN,       13)
+ELF_RELOC(R_PPC64_GOT16,                14)
+ELF_RELOC(R_PPC64_GOT16_LO,             15)
+ELF_RELOC(R_PPC64_GOT16_HI,             16)
+ELF_RELOC(R_PPC64_GOT16_HA,             17)
+ELF_RELOC(R_PPC64_JMP_SLOT,             21)
+ELF_RELOC(R_PPC64_REL32,                26)
+ELF_RELOC(R_PPC64_ADDR64,               38)
+ELF_RELOC(R_PPC64_ADDR16_HIGHER,        39)
+ELF_RELOC(R_PPC64_ADDR16_HIGHERA,       40)
+ELF_RELOC(R_PPC64_ADDR16_HIGHEST,       41)
+ELF_RELOC(R_PPC64_ADDR16_HIGHESTA,      42)
+ELF_RELOC(R_PPC64_REL64,                44)
+ELF_RELOC(R_PPC64_TOC16,                47)
+ELF_RELOC(R_PPC64_TOC16_LO,             48)
+ELF_RELOC(R_PPC64_TOC16_HI,             49)
+ELF_RELOC(R_PPC64_TOC16_HA,             50)
+ELF_RELOC(R_PPC64_TOC,                  51)
+ELF_RELOC(R_PPC64_ADDR16_DS,            56)
+ELF_RELOC(R_PPC64_ADDR16_LO_DS,         57)
+ELF_RELOC(R_PPC64_GOT16_DS,             58)
+ELF_RELOC(R_PPC64_GOT16_LO_DS,          59)
+ELF_RELOC(R_PPC64_TOC16_DS,             63)
+ELF_RELOC(R_PPC64_TOC16_LO_DS,          64)
+ELF_RELOC(R_PPC64_TLS,                  67)
+ELF_RELOC(R_PPC64_DTPMOD64,             68)
+ELF_RELOC(R_PPC64_TPREL16,              69)
+ELF_RELOC(R_PPC64_TPREL16_LO,           70)
+ELF_RELOC(R_PPC64_TPREL16_HI,           71)
+ELF_RELOC(R_PPC64_TPREL16_HA,           72)
+ELF_RELOC(R_PPC64_TPREL64,              73)
+ELF_RELOC(R_PPC64_DTPREL16,             74)
+ELF_RELOC(R_PPC64_DTPREL16_LO,          75)
+ELF_RELOC(R_PPC64_DTPREL16_HI,          76)
+ELF_RELOC(R_PPC64_DTPREL16_HA,          77)
+ELF_RELOC(R_PPC64_DTPREL64,             78)
+ELF_RELOC(R_PPC64_GOT_TLSGD16,          79)
+ELF_RELOC(R_PPC64_GOT_TLSGD16_LO,       80)
+ELF_RELOC(R_PPC64_GOT_TLSGD16_HI,       81)
+ELF_RELOC(R_PPC64_GOT_TLSGD16_HA,       82)
+ELF_RELOC(R_PPC64_GOT_TLSLD16,          83)
+ELF_RELOC(R_PPC64_GOT_TLSLD16_LO,       84)
+ELF_RELOC(R_PPC64_GOT_TLSLD16_HI,       85)
+ELF_RELOC(R_PPC64_GOT_TLSLD16_HA,       86)
+ELF_RELOC(R_PPC64_GOT_TPREL16_DS,       87)
+ELF_RELOC(R_PPC64_GOT_TPREL16_LO_DS,    88)
+ELF_RELOC(R_PPC64_GOT_TPREL16_HI,       89)
+ELF_RELOC(R_PPC64_GOT_TPREL16_HA,       90)
+ELF_RELOC(R_PPC64_GOT_DTPREL16_DS,      91)
+ELF_RELOC(R_PPC64_GOT_DTPREL16_LO_DS,   92)
+ELF_RELOC(R_PPC64_GOT_DTPREL16_HI,      93)
+ELF_RELOC(R_PPC64_GOT_DTPREL16_HA,      94)
+ELF_RELOC(R_PPC64_TPREL16_DS,           95)
+ELF_RELOC(R_PPC64_TPREL16_LO_DS,        96)
+ELF_RELOC(R_PPC64_TPREL16_HIGHER,       97)
+ELF_RELOC(R_PPC64_TPREL16_HIGHERA,      98)
+ELF_RELOC(R_PPC64_TPREL16_HIGHEST,      99)
+ELF_RELOC(R_PPC64_TPREL16_HIGHESTA,     100)
+ELF_RELOC(R_PPC64_DTPREL16_DS,          101)
+ELF_RELOC(R_PPC64_DTPREL16_LO_DS,       102)
+ELF_RELOC(R_PPC64_DTPREL16_HIGHER,      103)
+ELF_RELOC(R_PPC64_DTPREL16_HIGHERA,     104)
+ELF_RELOC(R_PPC64_DTPREL16_HIGHEST,     105)
+ELF_RELOC(R_PPC64_DTPREL16_HIGHESTA,    106)
+ELF_RELOC(R_PPC64_TLSGD,                107)
+ELF_RELOC(R_PPC64_TLSLD,                108)
+ELF_RELOC(R_PPC64_REL16,                249)
+ELF_RELOC(R_PPC64_REL16_LO,             250)
+ELF_RELOC(R_PPC64_REL16_HI,             251)
+ELF_RELOC(R_PPC64_REL16_HA,             252)
diff --git a/include/llvm/Support/ELFRelocs/Sparc.def b/include/llvm/Support/ELFRelocs/Sparc.def
new file mode 100644
index 0000000..d6772ea
--- /dev/null
+++ b/include/llvm/Support/ELFRelocs/Sparc.def
@@ -0,0 +1,89 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+ELF_RELOC(R_SPARC_NONE,         0)
+ELF_RELOC(R_SPARC_8,            1)
+ELF_RELOC(R_SPARC_16,           2)
+ELF_RELOC(R_SPARC_32,           3)
+ELF_RELOC(R_SPARC_DISP8,        4)
+ELF_RELOC(R_SPARC_DISP16,       5)
+ELF_RELOC(R_SPARC_DISP32,       6)
+ELF_RELOC(R_SPARC_WDISP30,      7)
+ELF_RELOC(R_SPARC_WDISP22,      8)
+ELF_RELOC(R_SPARC_HI22,         9)
+ELF_RELOC(R_SPARC_22,           10)
+ELF_RELOC(R_SPARC_13,           11)
+ELF_RELOC(R_SPARC_LO10,         12)
+ELF_RELOC(R_SPARC_GOT10,        13)
+ELF_RELOC(R_SPARC_GOT13,        14)
+ELF_RELOC(R_SPARC_GOT22,        15)
+ELF_RELOC(R_SPARC_PC10,         16)
+ELF_RELOC(R_SPARC_PC22,         17)
+ELF_RELOC(R_SPARC_WPLT30,       18)
+ELF_RELOC(R_SPARC_COPY,         19)
+ELF_RELOC(R_SPARC_GLOB_DAT,     20)
+ELF_RELOC(R_SPARC_JMP_SLOT,     21)
+ELF_RELOC(R_SPARC_RELATIVE,     22)
+ELF_RELOC(R_SPARC_UA32,         23)
+ELF_RELOC(R_SPARC_PLT32,        24)
+ELF_RELOC(R_SPARC_HIPLT22,      25)
+ELF_RELOC(R_SPARC_LOPLT10,      26)
+ELF_RELOC(R_SPARC_PCPLT32,      27)
+ELF_RELOC(R_SPARC_PCPLT22,      28)
+ELF_RELOC(R_SPARC_PCPLT10,      29)
+ELF_RELOC(R_SPARC_10,           30)
+ELF_RELOC(R_SPARC_11,           31)
+ELF_RELOC(R_SPARC_64,           32)
+ELF_RELOC(R_SPARC_OLO10,        33)
+ELF_RELOC(R_SPARC_HH22,         34)
+ELF_RELOC(R_SPARC_HM10,         35)
+ELF_RELOC(R_SPARC_LM22,         36)
+ELF_RELOC(R_SPARC_PC_HH22,      37)
+ELF_RELOC(R_SPARC_PC_HM10,      38)
+ELF_RELOC(R_SPARC_PC_LM22,      39)
+ELF_RELOC(R_SPARC_WDISP16,      40)
+ELF_RELOC(R_SPARC_WDISP19,      41)
+ELF_RELOC(R_SPARC_7,            43)
+ELF_RELOC(R_SPARC_5,            44)
+ELF_RELOC(R_SPARC_6,            45)
+ELF_RELOC(R_SPARC_DISP64,       46)
+ELF_RELOC(R_SPARC_PLT64,        47)
+ELF_RELOC(R_SPARC_HIX22,        48)
+ELF_RELOC(R_SPARC_LOX10,        49)
+ELF_RELOC(R_SPARC_H44,          50)
+ELF_RELOC(R_SPARC_M44,          51)
+ELF_RELOC(R_SPARC_L44,          52)
+ELF_RELOC(R_SPARC_REGISTER,     53)
+ELF_RELOC(R_SPARC_UA64,         54)
+ELF_RELOC(R_SPARC_UA16,         55)
+ELF_RELOC(R_SPARC_TLS_GD_HI22,    56)
+ELF_RELOC(R_SPARC_TLS_GD_LO10,    57)
+ELF_RELOC(R_SPARC_TLS_GD_ADD,     58)
+ELF_RELOC(R_SPARC_TLS_GD_CALL,    59)
+ELF_RELOC(R_SPARC_TLS_LDM_HI22,   60)
+ELF_RELOC(R_SPARC_TLS_LDM_LO10,   61)
+ELF_RELOC(R_SPARC_TLS_LDM_ADD,    62)
+ELF_RELOC(R_SPARC_TLS_LDM_CALL,   63)
+ELF_RELOC(R_SPARC_TLS_LDO_HIX22,  64)
+ELF_RELOC(R_SPARC_TLS_LDO_LOX10,  65)
+ELF_RELOC(R_SPARC_TLS_LDO_ADD,    66)
+ELF_RELOC(R_SPARC_TLS_IE_HI22,    67)
+ELF_RELOC(R_SPARC_TLS_IE_LO10,    68)
+ELF_RELOC(R_SPARC_TLS_IE_LD,      69)
+ELF_RELOC(R_SPARC_TLS_IE_LDX,     70)
+ELF_RELOC(R_SPARC_TLS_IE_ADD,     71)
+ELF_RELOC(R_SPARC_TLS_LE_HIX22,   72)
+ELF_RELOC(R_SPARC_TLS_LE_LOX10,   73)
+ELF_RELOC(R_SPARC_TLS_DTPMOD32,   74)
+ELF_RELOC(R_SPARC_TLS_DTPMOD64,   75)
+ELF_RELOC(R_SPARC_TLS_DTPOFF32,   76)
+ELF_RELOC(R_SPARC_TLS_DTPOFF64,   77)
+ELF_RELOC(R_SPARC_TLS_TPOFF32,    78)
+ELF_RELOC(R_SPARC_TLS_TPOFF64,    79)
+ELF_RELOC(R_SPARC_GOTDATA_HIX22,  80)
+ELF_RELOC(R_SPARC_GOTDATA_LOX22,  81)
+ELF_RELOC(R_SPARC_GOTDATA_OP_HIX22,  82)
+ELF_RELOC(R_SPARC_GOTDATA_OP_LOX22,  83)
+ELF_RELOC(R_SPARC_GOTDATA_OP,     84)
diff --git a/include/llvm/Support/ELFRelocs/SystemZ.def b/include/llvm/Support/ELFRelocs/SystemZ.def
new file mode 100644
index 0000000..711f940
--- /dev/null
+++ b/include/llvm/Support/ELFRelocs/SystemZ.def
@@ -0,0 +1,67 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+ELF_RELOC(R_390_NONE,          0)
+ELF_RELOC(R_390_8,             1)
+ELF_RELOC(R_390_12,            2)
+ELF_RELOC(R_390_16,            3)
+ELF_RELOC(R_390_32,            4)
+ELF_RELOC(R_390_PC32,          5)
+ELF_RELOC(R_390_GOT12,         6)
+ELF_RELOC(R_390_GOT32,         7)
+ELF_RELOC(R_390_PLT32,         8)
+ELF_RELOC(R_390_COPY,          9)
+ELF_RELOC(R_390_GLOB_DAT,     10)
+ELF_RELOC(R_390_JMP_SLOT,     11)
+ELF_RELOC(R_390_RELATIVE,     12)
+ELF_RELOC(R_390_GOTOFF,       13)
+ELF_RELOC(R_390_GOTPC,        14)
+ELF_RELOC(R_390_GOT16,        15)
+ELF_RELOC(R_390_PC16,         16)
+ELF_RELOC(R_390_PC16DBL,      17)
+ELF_RELOC(R_390_PLT16DBL,     18)
+ELF_RELOC(R_390_PC32DBL,      19)
+ELF_RELOC(R_390_PLT32DBL,     20)
+ELF_RELOC(R_390_GOTPCDBL,     21)
+ELF_RELOC(R_390_64,           22)
+ELF_RELOC(R_390_PC64,         23)
+ELF_RELOC(R_390_GOT64,        24)
+ELF_RELOC(R_390_PLT64,        25)
+ELF_RELOC(R_390_GOTENT,       26)
+ELF_RELOC(R_390_GOTOFF16,     27)
+ELF_RELOC(R_390_GOTOFF64,     28)
+ELF_RELOC(R_390_GOTPLT12,     29)
+ELF_RELOC(R_390_GOTPLT16,     30)
+ELF_RELOC(R_390_GOTPLT32,     31)
+ELF_RELOC(R_390_GOTPLT64,     32)
+ELF_RELOC(R_390_GOTPLTENT,    33)
+ELF_RELOC(R_390_PLTOFF16,     34)
+ELF_RELOC(R_390_PLTOFF32,     35)
+ELF_RELOC(R_390_PLTOFF64,     36)
+ELF_RELOC(R_390_TLS_LOAD,     37)
+ELF_RELOC(R_390_TLS_GDCALL,   38)
+ELF_RELOC(R_390_TLS_LDCALL,   39)
+ELF_RELOC(R_390_TLS_GD32,     40)
+ELF_RELOC(R_390_TLS_GD64,     41)
+ELF_RELOC(R_390_TLS_GOTIE12,  42)
+ELF_RELOC(R_390_TLS_GOTIE32,  43)
+ELF_RELOC(R_390_TLS_GOTIE64,  44)
+ELF_RELOC(R_390_TLS_LDM32,    45)
+ELF_RELOC(R_390_TLS_LDM64,    46)
+ELF_RELOC(R_390_TLS_IE32,     47)
+ELF_RELOC(R_390_TLS_IE64,     48)
+ELF_RELOC(R_390_TLS_IEENT,    49)
+ELF_RELOC(R_390_TLS_LE32,     50)
+ELF_RELOC(R_390_TLS_LE64,     51)
+ELF_RELOC(R_390_TLS_LDO32,    52)
+ELF_RELOC(R_390_TLS_LDO64,    53)
+ELF_RELOC(R_390_TLS_DTPMOD,   54)
+ELF_RELOC(R_390_TLS_DTPOFF,   55)
+ELF_RELOC(R_390_TLS_TPOFF,    56)
+ELF_RELOC(R_390_20,           57)
+ELF_RELOC(R_390_GOT20,        58)
+ELF_RELOC(R_390_GOTPLT20,     59)
+ELF_RELOC(R_390_TLS_GOTIE20,  60)
+ELF_RELOC(R_390_IRELATIVE,    61)
diff --git a/include/llvm/Support/ELFRelocs/i386.def b/include/llvm/Support/ELFRelocs/i386.def
new file mode 100644
index 0000000..45eae7f
--- /dev/null
+++ b/include/llvm/Support/ELFRelocs/i386.def
@@ -0,0 +1,47 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+// TODO: this is just a subset
+ELF_RELOC(R_386_NONE,           0)
+ELF_RELOC(R_386_32,             1)
+ELF_RELOC(R_386_PC32,           2)
+ELF_RELOC(R_386_GOT32,          3)
+ELF_RELOC(R_386_PLT32,          4)
+ELF_RELOC(R_386_COPY,           5)
+ELF_RELOC(R_386_GLOB_DAT,       6)
+ELF_RELOC(R_386_JUMP_SLOT,      7)
+ELF_RELOC(R_386_RELATIVE,       8)
+ELF_RELOC(R_386_GOTOFF,         9)
+ELF_RELOC(R_386_GOTPC,          10)
+ELF_RELOC(R_386_32PLT,          11)
+ELF_RELOC(R_386_TLS_TPOFF,      14)
+ELF_RELOC(R_386_TLS_IE,         15)
+ELF_RELOC(R_386_TLS_GOTIE,      16)
+ELF_RELOC(R_386_TLS_LE,         17)
+ELF_RELOC(R_386_TLS_GD,         18)
+ELF_RELOC(R_386_TLS_LDM,        19)
+ELF_RELOC(R_386_16,             20)
+ELF_RELOC(R_386_PC16,           21)
+ELF_RELOC(R_386_8,              22)
+ELF_RELOC(R_386_PC8,            23)
+ELF_RELOC(R_386_TLS_GD_32,      24)
+ELF_RELOC(R_386_TLS_GD_PUSH,    25)
+ELF_RELOC(R_386_TLS_GD_CALL,    26)
+ELF_RELOC(R_386_TLS_GD_POP,     27)
+ELF_RELOC(R_386_TLS_LDM_32,     28)
+ELF_RELOC(R_386_TLS_LDM_PUSH,   29)
+ELF_RELOC(R_386_TLS_LDM_CALL,   30)
+ELF_RELOC(R_386_TLS_LDM_POP,    31)
+ELF_RELOC(R_386_TLS_LDO_32,     32)
+ELF_RELOC(R_386_TLS_IE_32,      33)
+ELF_RELOC(R_386_TLS_LE_32,      34)
+ELF_RELOC(R_386_TLS_DTPMOD32,   35)
+ELF_RELOC(R_386_TLS_DTPOFF32,   36)
+ELF_RELOC(R_386_TLS_TPOFF32,    37)
+ELF_RELOC(R_386_TLS_GOTDESC,    39)
+ELF_RELOC(R_386_TLS_DESC_CALL,  40)
+ELF_RELOC(R_386_TLS_DESC,       41)
+ELF_RELOC(R_386_IRELATIVE,      42)
+ELF_RELOC(R_386_NUM,            43)
diff --git a/include/llvm/Support/ELFRelocs/x86_64.def b/include/llvm/Support/ELFRelocs/x86_64.def
new file mode 100644
index 0000000..36ad061
--- /dev/null
+++ b/include/llvm/Support/ELFRelocs/x86_64.def
@@ -0,0 +1,44 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+ELF_RELOC(R_X86_64_NONE,        0)
+ELF_RELOC(R_X86_64_64,          1)
+ELF_RELOC(R_X86_64_PC32,        2)
+ELF_RELOC(R_X86_64_GOT32,       3)
+ELF_RELOC(R_X86_64_PLT32,       4)
+ELF_RELOC(R_X86_64_COPY,        5)
+ELF_RELOC(R_X86_64_GLOB_DAT,    6)
+ELF_RELOC(R_X86_64_JUMP_SLOT,   7)
+ELF_RELOC(R_X86_64_RELATIVE,    8)
+ELF_RELOC(R_X86_64_GOTPCREL,    9)
+ELF_RELOC(R_X86_64_32,          10)
+ELF_RELOC(R_X86_64_32S,         11)
+ELF_RELOC(R_X86_64_16,          12)
+ELF_RELOC(R_X86_64_PC16,        13)
+ELF_RELOC(R_X86_64_8,           14)
+ELF_RELOC(R_X86_64_PC8,         15)
+ELF_RELOC(R_X86_64_DTPMOD64,    16)
+ELF_RELOC(R_X86_64_DTPOFF64,    17)
+ELF_RELOC(R_X86_64_TPOFF64,     18)
+ELF_RELOC(R_X86_64_TLSGD,       19)
+ELF_RELOC(R_X86_64_TLSLD,       20)
+ELF_RELOC(R_X86_64_DTPOFF32,    21)
+ELF_RELOC(R_X86_64_GOTTPOFF,    22)
+ELF_RELOC(R_X86_64_TPOFF32,     23)
+ELF_RELOC(R_X86_64_PC64,        24)
+ELF_RELOC(R_X86_64_GOTOFF64,    25)
+ELF_RELOC(R_X86_64_GOTPC32,     26)
+ELF_RELOC(R_X86_64_GOT64,       27)
+ELF_RELOC(R_X86_64_GOTPCREL64,  28)
+ELF_RELOC(R_X86_64_GOTPC64,     29)
+ELF_RELOC(R_X86_64_GOTPLT64,    30)
+ELF_RELOC(R_X86_64_PLTOFF64,    31)
+ELF_RELOC(R_X86_64_SIZE32,      32)
+ELF_RELOC(R_X86_64_SIZE64,      33)
+ELF_RELOC(R_X86_64_GOTPC32_TLSDESC,  34)
+ELF_RELOC(R_X86_64_TLSDESC_CALL,     35)
+ELF_RELOC(R_X86_64_TLSDESC,     36)
+ELF_RELOC(R_X86_64_IRELATIVE,   37)
+
diff --git a/include/llvm/Support/EndianStream.h b/include/llvm/Support/EndianStream.h
index 94f372f..d44a9b3 100644
--- a/include/llvm/Support/EndianStream.h
+++ b/include/llvm/Support/EndianStream.h
@@ -31,6 +31,31 @@ template <endianness endian> struct Writer {
     OS.write((const char *)&Val, sizeof(value_type));
   }
 };
+
+template <>
+template <>
+inline void Writer<little>::write<float>(float Val) {
+  write(FloatToBits(Val));
+}
+
+template <>
+template <>
+inline void Writer<little>::write<double>(double Val) {
+  write(DoubleToBits(Val));
+}
+
+template <>
+template <>
+inline void Writer<big>::write<float>(float Val) {
+  write(FloatToBits(Val));
+}
+
+template <>
+template <>
+inline void Writer<big>::write<double>(double Val) {
+  write(DoubleToBits(Val));
+}
+
 } // end namespace endian
 
 } // end namespace support
diff --git a/include/llvm/Support/ErrorOr.h b/include/llvm/Support/ErrorOr.h
index 84763de..3577a12 100644
--- a/include/llvm/Support/ErrorOr.h
+++ b/include/llvm/Support/ErrorOr.h
@@ -168,7 +168,7 @@ public:
   }
 
   /// \brief Return false if there is an error.
-  LLVM_EXPLICIT operator bool() const {
+  explicit operator bool() const {
     return !HasError;
   }
 
diff --git a/include/llvm/Support/FileOutputBuffer.h b/include/llvm/Support/FileOutputBuffer.h
index a7cfacd..fd8879c 100644
--- a/include/llvm/Support/FileOutputBuffer.h
+++ b/include/llvm/Support/FileOutputBuffer.h
@@ -66,7 +66,7 @@ public:
   /// is called, the file is deleted in the destructor. The optional parameter
   /// is used if it turns out you want the file size to be smaller than
   /// initially requested.
-  std::error_code commit(int64_t NewSmallerSize = -1);
+  std::error_code commit();
 
   /// If this object was previously committed, the destructor just deletes
   /// this object.  If this object was not committed, the destructor
@@ -74,8 +74,8 @@ public:
   ~FileOutputBuffer();
 
 private:
-  FileOutputBuffer(const FileOutputBuffer &) LLVM_DELETED_FUNCTION;
-  FileOutputBuffer &operator=(const FileOutputBuffer &) LLVM_DELETED_FUNCTION;
+  FileOutputBuffer(const FileOutputBuffer &) = delete;
+  FileOutputBuffer &operator=(const FileOutputBuffer &) = delete;
 
   FileOutputBuffer(std::unique_ptr<llvm::sys::fs::mapped_file_region> R,
                    StringRef Path, StringRef TempPath);
diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index 63c9ed5..b3b44c4 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h
@@ -241,6 +241,7 @@ struct file_magic {
     macho_bundle,             ///< Mach-O Bundle file
     macho_dynamically_linked_shared_lib_stub, ///< Mach-O Shared lib stub
     macho_dsym_companion,     ///< Mach-O dSYM companion file
+    macho_kext_bundle,        ///< Mach-O kext bundle file
     macho_universal_binary,   ///< Mach-O universal binary
     coff_object,              ///< COFF object file
     coff_import_library,      ///< COFF import library
@@ -336,11 +337,11 @@ std::error_code copy_file(const Twine &From, const Twine &To);
 
 /// @brief Resize path to size. File is resized as if by POSIX truncate().
 ///
-/// @param path Input path.
-/// @param size Size to resize to.
+/// @param FD Input file descriptor.
+/// @param Size Size to resize to.
 /// @returns errc::success if \a path has been resized to \a size, otherwise a
 ///          platform-specific error_code.
-std::error_code resize_file(const Twine &path, uint64_t size);
+std::error_code resize_file(int FD, uint64_t Size);
 
 /// @}
 /// @name Physical Observers
@@ -624,9 +625,9 @@ std::error_code getUniqueID(const Twine Path, UniqueID &Result);
 /// This class represents a memory mapped file. It is based on
 /// boost::iostreams::mapped_file.
 class mapped_file_region {
-  mapped_file_region() LLVM_DELETED_FUNCTION;
-  mapped_file_region(mapped_file_region&) LLVM_DELETED_FUNCTION;
-  mapped_file_region &operator =(mapped_file_region&) LLVM_DELETED_FUNCTION;
+  mapped_file_region() = delete;
+  mapped_file_region(mapped_file_region&) = delete;
+  mapped_file_region &operator =(mapped_file_region&) = delete;
 
 public:
   enum mapmode {
@@ -637,49 +638,20 @@ public:
 
 private:
   /// Platform-specific mapping state.
-  mapmode Mode;
   uint64_t Size;
   void *Mapping;
-#ifdef LLVM_ON_WIN32
-  int FileDescriptor;
-  void *FileHandle;
-  void *FileMappingHandle;
-#endif
 
-  std::error_code init(int FD, bool CloseFD, uint64_t Offset);
+  std::error_code init(int FD, uint64_t Offset, mapmode Mode);
 
 public:
-  typedef char char_type;
-
-  mapped_file_region(mapped_file_region&&);
-  mapped_file_region &operator =(mapped_file_region&&);
-
-  /// Construct a mapped_file_region at \a path starting at \a offset of length
-  /// \a length and with access \a mode.
-  ///
-  /// \param path Path to the file to map. If it does not exist it will be
-  ///             created.
-  /// \param mode How to map the memory.
-  /// \param length Number of bytes to map in starting at \a offset. If the file
-  ///               is shorter than this, it will be extended. If \a length is
-  ///               0, the entire file will be mapped.
-  /// \param offset Byte offset from the beginning of the file where the map
-  ///               should begin. Must be a multiple of
-  ///               mapped_file_region::alignment().
-  /// \param ec This is set to errc::success if the map was constructed
-  ///           successfully. Otherwise it is set to a platform dependent error.
-  mapped_file_region(const Twine &path, mapmode mode, uint64_t length,
-                     uint64_t offset, std::error_code &ec);
-
   /// \param fd An open file descriptor to map. mapped_file_region takes
   ///   ownership if closefd is true. It must have been opended in the correct
   ///   mode.
-  mapped_file_region(int fd, bool closefd, mapmode mode, uint64_t length,
-                     uint64_t offset, std::error_code &ec);
+  mapped_file_region(int fd, mapmode mode, uint64_t length, uint64_t offset,
+                     std::error_code &ec);
 
   ~mapped_file_region();
 
-  mapmode flags() const;
   uint64_t size() const;
   char *data() const;
 
diff --git a/include/llvm/Support/Format.h b/include/llvm/Support/Format.h
index 8e163dd..682c5a9 100644
--- a/include/llvm/Support/Format.h
+++ b/include/llvm/Support/Format.h
@@ -23,19 +23,12 @@
 #ifndef LLVM_SUPPORT_FORMAT_H
 #define LLVM_SUPPORT_FORMAT_H
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DataTypes.h"
-
 #include <cassert>
 #include <cstdio>
-#ifdef _MSC_VER
-// FIXME: This define is wrong:
-//  - _snprintf does not guarantee that trailing null is always added - if
-//    there is no space for null, it does not report any error.
-//  - According to C++ standard, snprintf should be visible in the 'std' 
-//    namespace - this define makes this impossible.
-#define snprintf _snprintf
-#endif
+#include <tuple>
 
 namespace llvm {
 
@@ -81,101 +74,26 @@ public:
 /// printed, this synthesizes the string into a temporary buffer provided and
 /// returns whether or not it is big enough.
 
-template <typename T>
-class format_object1 final : public format_object_base {
-  T Val;
-public:
-  format_object1(const char *fmt, const T &val)
-    : format_object_base(fmt), Val(val) {
-  }
-
-  int snprint(char *Buffer, unsigned BufferSize) const override {
-    return snprintf(Buffer, BufferSize, Fmt, Val);
-  }
-};
-
-template <typename T1, typename T2>
-class format_object2 final : public format_object_base {
-  T1 Val1;
-  T2 Val2;
-public:
-  format_object2(const char *fmt, const T1 &val1, const T2 &val2)
-  : format_object_base(fmt), Val1(val1), Val2(val2) {
-  }
-
-  int snprint(char *Buffer, unsigned BufferSize) const override {
-    return snprintf(Buffer, BufferSize, Fmt, Val1, Val2);
-  }
-};
-
-template <typename T1, typename T2, typename T3>
-class format_object3 final : public format_object_base {
-  T1 Val1;
-  T2 Val2;
-  T3 Val3;
-public:
-  format_object3(const char *fmt, const T1 &val1, const T2 &val2,const T3 &val3)
-    : format_object_base(fmt), Val1(val1), Val2(val2), Val3(val3) {
-  }
-
-  int snprint(char *Buffer, unsigned BufferSize) const override {
-    return snprintf(Buffer, BufferSize, Fmt, Val1, Val2, Val3);
-  }
-};
-
-template <typename T1, typename T2, typename T3, typename T4>
-class format_object4 final : public format_object_base {
-  T1 Val1;
-  T2 Val2;
-  T3 Val3;
-  T4 Val4;
-public:
-  format_object4(const char *fmt, const T1 &val1, const T2 &val2,
-                 const T3 &val3, const T4 &val4)
-    : format_object_base(fmt), Val1(val1), Val2(val2), Val3(val3), Val4(val4) {
-  }
-
-  int snprint(char *Buffer, unsigned BufferSize) const override {
-    return snprintf(Buffer, BufferSize, Fmt, Val1, Val2, Val3, Val4);
-  }
-};
+template <typename... Ts>
+class format_object final : public format_object_base {
+  std::tuple<Ts...> Vals;
 
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class format_object5 final : public format_object_base {
-  T1 Val1;
-  T2 Val2;
-  T3 Val3;
-  T4 Val4;
-  T5 Val5;
-public:
-  format_object5(const char *fmt, const T1 &val1, const T2 &val2,
-                 const T3 &val3, const T4 &val4, const T5 &val5)
-    : format_object_base(fmt), Val1(val1), Val2(val2), Val3(val3), Val4(val4),
-      Val5(val5) {
-  }
-
-  int snprint(char *Buffer, unsigned BufferSize) const override {
-    return snprintf(Buffer, BufferSize, Fmt, Val1, Val2, Val3, Val4, Val5);
+  template <std::size_t... Is>
+  int snprint_tuple(char *Buffer, unsigned BufferSize,
+                    index_sequence<Is...>) const {
+#ifdef _MSC_VER
+    return _snprintf(Buffer, BufferSize, Fmt, std::get<Is>(Vals)...);
+#else
+    return snprintf(Buffer, BufferSize, Fmt, std::get<Is>(Vals)...);
+#endif
   }
-};
 
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6>
-class format_object6 final : public format_object_base {
-  T1 Val1;
-  T2 Val2;
-  T3 Val3;
-  T4 Val4;
-  T5 Val5;
-  T6 Val6;
 public:
-  format_object6(const char *Fmt, const T1 &Val1, const T2 &Val2,
-                 const T3 &Val3, const T4 &Val4, const T5 &Val5, const T6 &Val6)
-    : format_object_base(Fmt), Val1(Val1), Val2(Val2), Val3(Val3), Val4(Val4),
-      Val5(Val5), Val6(Val6) { }
+  format_object(const char *fmt, const Ts &... vals)
+      : format_object_base(fmt), Vals(vals...) {}
 
   int snprint(char *Buffer, unsigned BufferSize) const override {
-    return snprintf(Buffer, BufferSize, Fmt, Val1, Val2, Val3, Val4, Val5, Val6);
+    return snprint_tuple(Buffer, BufferSize, index_sequence_for<Ts...>());
   }
 };
 
@@ -188,44 +106,9 @@ public:
 ///   OS << format("%0.4f", myfloat) << '\n';
 /// \endcode
 
-template <typename T>
-inline format_object1<T> format(const char *Fmt, const T &Val) {
-  return format_object1<T>(Fmt, Val);
-}
-
-template <typename T1, typename T2>
-inline format_object2<T1, T2> format(const char *Fmt, const T1 &Val1,
-                                     const T2 &Val2) {
-  return format_object2<T1, T2>(Fmt, Val1, Val2);
-}
-
-template <typename T1, typename T2, typename T3>
-  inline format_object3<T1, T2, T3> format(const char *Fmt, const T1 &Val1,
-                                           const T2 &Val2, const T3 &Val3) {
-  return format_object3<T1, T2, T3>(Fmt, Val1, Val2, Val3);
-}
-
-template <typename T1, typename T2, typename T3, typename T4>
-inline format_object4<T1, T2, T3, T4> format(const char *Fmt, const T1 &Val1,
-                                             const T2 &Val2, const T3 &Val3,
-                                             const T4 &Val4) {
-  return format_object4<T1, T2, T3, T4>(Fmt, Val1, Val2, Val3, Val4);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-inline format_object5<T1, T2, T3, T4, T5> format(const char *Fmt,const T1 &Val1,
-                                             const T2 &Val2, const T3 &Val3,
-                                             const T4 &Val4, const T5 &Val5) {
-  return format_object5<T1, T2, T3, T4, T5>(Fmt, Val1, Val2, Val3, Val4, Val5);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6>
-inline format_object6<T1, T2, T3, T4, T5, T6>
-format(const char *Fmt, const T1 &Val1, const T2 &Val2, const T3 &Val3,
-       const T4 &Val4, const T5 &Val5, const T6 &Val6) {
-  return format_object6<T1, T2, T3, T4, T5, T6>(Fmt, Val1, Val2, Val3, Val4,
-                                                Val5, Val6);
+template <typename... Ts>
+inline format_object<Ts...> format(const char *Fmt, const Ts &... Vals) {
+  return format_object<Ts...>(Fmt, Vals...);
 }
 
 /// This is a helper class used for left_justify() and right_justify().
@@ -260,21 +143,38 @@ class FormattedNumber {
   unsigned Width;
   bool Hex;
   bool Upper;
+  bool HexPrefix;
   friend class raw_ostream;
 public:
-    FormattedNumber(uint64_t HV, int64_t DV, unsigned W, bool H, bool U)
-      : HexValue(HV), DecValue(DV), Width(W), Hex(H), Upper(U) { }
+  FormattedNumber(uint64_t HV, int64_t DV, unsigned W, bool H, bool U,
+                  bool Prefix)
+      : HexValue(HV), DecValue(DV), Width(W), Hex(H), Upper(U),
+        HexPrefix(Prefix) {}
 };
 
 /// format_hex - Output \p N as a fixed width hexadecimal. If number will not
 /// fit in width, full number is still printed.  Examples:
-///   OS << format_hex(255, 4)        => 0xff
-///   OS << format_hex(255, 4, true)  => 0xFF
-///   OS << format_hex(255, 6)        => 0x00ff
-///   OS << format_hex(255, 2)        => 0xff
-inline FormattedNumber format_hex(uint64_t N, unsigned Width, bool Upper=false) {
+///   OS << format_hex(255, 4)              => 0xff
+///   OS << format_hex(255, 4, true)        => 0xFF
+///   OS << format_hex(255, 6)              => 0x00ff
+///   OS << format_hex(255, 2)              => 0xff
+inline FormattedNumber format_hex(uint64_t N, unsigned Width,
+                                  bool Upper = false) {
+  assert(Width <= 18 && "hex width must be <= 18");
+  return FormattedNumber(N, 0, Width, true, Upper, true);
+}
+
+/// format_hex_no_prefix - Output \p N as a fixed width hexadecimal. Does not
+/// prepend '0x' to the outputted string.  If number will not fit in width,
+/// full number is still printed.  Examples:
+///   OS << format_hex_no_prefix(255, 4)              => ff
+///   OS << format_hex_no_prefix(255, 4, true)        => FF
+///   OS << format_hex_no_prefix(255, 6)              => 00ff
+///   OS << format_hex_no_prefix(255, 2)              => ff
+inline FormattedNumber format_hex_no_prefix(uint64_t N, unsigned Width,
+                                            bool Upper = false) {
   assert(Width <= 18 && "hex width must be <= 18");
-  return FormattedNumber(N, 0, Width, true, Upper);
+  return FormattedNumber(N, 0, Width, true, Upper, false);
 }
 
 /// format_decimal - Output \p N as a right justified, fixed-width decimal. If 
@@ -284,7 +184,7 @@ inline FormattedNumber format_hex(uint64_t N, unsigned Width, bool Upper=false)
 ///   OS << format_decimal(-1, 3)    => " -1"
 ///   OS << format_decimal(12345, 3) => "12345"
 inline FormattedNumber format_decimal(int64_t N, unsigned Width) {
-  return FormattedNumber(0, N, Width, false, false);
+  return FormattedNumber(0, N, Width, false, false, false);
 }
 
 
diff --git a/include/llvm/Support/GCOV.h b/include/llvm/Support/GCOV.h
index e378602..c2e34bd 100644
--- a/include/llvm/Support/GCOV.h
+++ b/include/llvm/Support/GCOV.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/iterator.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -29,10 +30,7 @@ class GCOVBlock;
 class FileInfo;
 
 namespace GCOV {
-  enum GCOVVersion {
-    V402,
-    V404
-  };
+enum GCOVVersion { V402, V404 };
 } // end GCOV namespace
 
 /// GCOVOptions - A struct for passing gcov options between functions.
@@ -56,7 +54,7 @@ struct GCOVOptions {
 class GCOVBuffer {
 public:
   GCOVBuffer(MemoryBuffer *B) : Buffer(B), Cursor(0) {}
-  
+
   /// readGCNOFormat - Check GCNO signature is valid at the beginning of buffer.
   bool readGCNOFormat() {
     StringRef File = Buffer->getBuffer().slice(0, 4);
@@ -81,7 +79,7 @@ public:
 
   /// readGCOVVersion - Read GCOV version.
   bool readGCOVVersion(GCOV::GCOVVersion &Version) {
-    StringRef VersionStr = Buffer->getBuffer().slice(Cursor, Cursor+4);
+    StringRef VersionStr = Buffer->getBuffer().slice(Cursor, Cursor + 4);
     if (VersionStr == "*204") {
       Cursor += 4;
       Version = GCOV::V402;
@@ -99,10 +97,9 @@ public:
   /// readFunctionTag - If cursor points to a function tag then increment the
   /// cursor and return true otherwise return false.
   bool readFunctionTag() {
-    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() ||
-        Tag[0] != '\0' || Tag[1] != '\0' ||
-        Tag[2] != '\0' || Tag[3] != '\1') {
+    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4);
+    if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\0' ||
+        Tag[3] != '\1') {
       return false;
     }
     Cursor += 4;
@@ -112,10 +109,9 @@ public:
   /// readBlockTag - If cursor points to a block tag then increment the
   /// cursor and return true otherwise return false.
   bool readBlockTag() {
-    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() ||
-        Tag[0] != '\0' || Tag[1] != '\0' ||
-        Tag[2] != '\x41' || Tag[3] != '\x01') {
+    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4);
+    if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\x41' ||
+        Tag[3] != '\x01') {
       return false;
     }
     Cursor += 4;
@@ -125,10 +121,9 @@ public:
   /// readEdgeTag - If cursor points to an edge tag then increment the
   /// cursor and return true otherwise return false.
   bool readEdgeTag() {
-    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() ||
-        Tag[0] != '\0' || Tag[1] != '\0' ||
-        Tag[2] != '\x43' || Tag[3] != '\x01') {
+    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4);
+    if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\x43' ||
+        Tag[3] != '\x01') {
       return false;
     }
     Cursor += 4;
@@ -138,10 +133,9 @@ public:
   /// readLineTag - If cursor points to a line tag then increment the
   /// cursor and return true otherwise return false.
   bool readLineTag() {
-    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() ||
-        Tag[0] != '\0' || Tag[1] != '\0' ||
-        Tag[2] != '\x45' || Tag[3] != '\x01') {
+    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4);
+    if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\x45' ||
+        Tag[3] != '\x01') {
       return false;
     }
     Cursor += 4;
@@ -151,10 +145,9 @@ public:
   /// readArcTag - If cursor points to an gcda arc tag then increment the
   /// cursor and return true otherwise return false.
   bool readArcTag() {
-    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() ||
-        Tag[0] != '\0' || Tag[1] != '\0' ||
-        Tag[2] != '\xa1' || Tag[3] != '\1') {
+    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4);
+    if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\xa1' ||
+        Tag[3] != '\1') {
       return false;
     }
     Cursor += 4;
@@ -164,10 +157,9 @@ public:
   /// readObjectTag - If cursor points to an object summary tag then increment
   /// the cursor and return true otherwise return false.
   bool readObjectTag() {
-    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() ||
-        Tag[0] != '\0' || Tag[1] != '\0' ||
-        Tag[2] != '\0' || Tag[3] != '\xa1') {
+    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4);
+    if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\0' ||
+        Tag[3] != '\xa1') {
       return false;
     }
     Cursor += 4;
@@ -177,10 +169,9 @@ public:
   /// readProgramTag - If cursor points to a program summary tag then increment
   /// the cursor and return true otherwise return false.
   bool readProgramTag() {
-    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() ||
-        Tag[0] != '\0' || Tag[1] != '\0' ||
-        Tag[2] != '\0' || Tag[3] != '\xa3') {
+    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4);
+    if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\0' ||
+        Tag[3] != '\xa3') {
       return false;
     }
     Cursor += 4;
@@ -188,11 +179,11 @@ public:
   }
 
   bool readInt(uint32_t &Val) {
-    if (Buffer->getBuffer().size() < Cursor+4) {
-      errs() << "Unexpected end of memory buffer: " << Cursor+4 << ".\n";
+    if (Buffer->getBuffer().size() < Cursor + 4) {
+      errs() << "Unexpected end of memory buffer: " << Cursor + 4 << ".\n";
       return false;
     }
-    StringRef Str = Buffer->getBuffer().slice(Cursor, Cursor+4);
+    StringRef Str = Buffer->getBuffer().slice(Cursor, Cursor + 4);
     Cursor += 4;
     Val = *(const uint32_t *)(Str.data());
     return true;
@@ -200,7 +191,8 @@ public:
 
   bool readInt64(uint64_t &Val) {
     uint32_t Lo, Hi;
-    if (!readInt(Lo) || !readInt(Hi)) return false;
+    if (!readInt(Lo) || !readInt(Hi))
+      return false;
     Val = ((uint64_t)Hi << 32) | Lo;
     return true;
   }
@@ -210,19 +202,21 @@ public:
     // Keep reading until we find a non-zero length. This emulates gcov's
     // behaviour, which appears to do the same.
     while (Len == 0)
-      if (!readInt(Len)) return false;
+      if (!readInt(Len))
+        return false;
     Len *= 4;
-    if (Buffer->getBuffer().size() < Cursor+Len) {
-      errs() << "Unexpected end of memory buffer: " << Cursor+Len << ".\n";
+    if (Buffer->getBuffer().size() < Cursor + Len) {
+      errs() << "Unexpected end of memory buffer: " << Cursor + Len << ".\n";
       return false;
     }
-    Str = Buffer->getBuffer().slice(Cursor, Cursor+Len).split('\0').first;
+    Str = Buffer->getBuffer().slice(Cursor, Cursor + Len).split('\0').first;
     Cursor += Len;
     return true;
   }
 
   uint64_t getCursor() const { return Cursor; }
-  void advanceCursor(uint32_t n) { Cursor += n*4; }
+  void advanceCursor(uint32_t n) { Cursor += n * 4; }
+
 private:
   MemoryBuffer *Buffer;
   uint64_t Cursor;
@@ -232,13 +226,15 @@ private:
 /// (.gcno and .gcda).
 class GCOVFile {
 public:
-  GCOVFile() : GCNOInitialized(false), Checksum(0), Functions(), RunCount(0),
-               ProgramCount(0) {}
+  GCOVFile()
+      : GCNOInitialized(false), Checksum(0), Functions(), RunCount(0),
+        ProgramCount(0) {}
   bool readGCNO(GCOVBuffer &Buffer);
   bool readGCDA(GCOVBuffer &Buffer);
   uint32_t getChecksum() const { return Checksum; }
   void dump() const;
   void collectLineCounts(FileInfo &FI);
+
 private:
   bool GCNOInitialized;
   GCOV::GCOVVersion Version;
@@ -260,8 +256,8 @@ struct GCOVEdge {
 /// GCOVFunction - Collects function information.
 class GCOVFunction {
 public:
-  typedef SmallVectorImpl<std::unique_ptr<GCOVBlock>>::const_iterator
-  BlockIterator;
+  typedef pointee_iterator<SmallVectorImpl<
+      std::unique_ptr<GCOVBlock>>::const_iterator> BlockIterator;
 
   GCOVFunction(GCOVFile &P) : Parent(P), Ident(0), LineNumber(0) {}
   bool readGCNO(GCOVBuffer &Buffer, GCOV::GCOVVersion Version);
@@ -274,9 +270,13 @@ public:
 
   BlockIterator block_begin() const { return Blocks.begin(); }
   BlockIterator block_end() const { return Blocks.end(); }
+  iterator_range<BlockIterator> blocks() const {
+    return make_range(block_begin(), block_end());
+  }
 
   void dump() const;
   void collectLineCounts(FileInfo &FI);
+
 private:
   GCOVFile &Parent;
   uint32_t Ident;
@@ -291,7 +291,7 @@ private:
 /// GCOVBlock - Collects block information.
 class GCOVBlock {
   struct EdgeWeight {
-    EdgeWeight(GCOVBlock *D): Dst(D), Count(0) {}
+    EdgeWeight(GCOVBlock *D) : Dst(D), Count(0) {}
 
     GCOVBlock *Dst;
     uint64_t Count;
@@ -302,11 +302,13 @@ class GCOVBlock {
       return E1->Dst.Number < E2->Dst.Number;
     }
   };
+
 public:
   typedef SmallVectorImpl<GCOVEdge *>::const_iterator EdgeIterator;
 
-  GCOVBlock(GCOVFunction &P, uint32_t N) : Parent(P), Number(N), Counter(0),
-    DstEdgesAreSorted(true), SrcEdges(), DstEdges(), Lines() {}
+  GCOVBlock(GCOVFunction &P, uint32_t N)
+      : Parent(P), Number(N), Counter(0), DstEdgesAreSorted(true), SrcEdges(),
+        DstEdges(), Lines() {}
   ~GCOVBlock();
   const GCOVFunction &getParent() const { return Parent; }
   void addLine(uint32_t N) { Lines.push_back(N); }
@@ -331,11 +333,19 @@ public:
 
   EdgeIterator src_begin() const { return SrcEdges.begin(); }
   EdgeIterator src_end() const { return SrcEdges.end(); }
+  iterator_range<EdgeIterator> srcs() const {
+    return make_range(src_begin(), src_end());
+  }
+
   EdgeIterator dst_begin() const { return DstEdges.begin(); }
   EdgeIterator dst_end() const { return DstEdges.end(); }
+  iterator_range<EdgeIterator> dsts() const {
+    return make_range(dst_begin(), dst_end());
+  }
 
   void dump() const;
   void collectLineCounts(FileInfo &FI);
+
 private:
   GCOVFunction &Parent;
   uint32_t Number;
@@ -347,8 +357,10 @@ private:
 };
 
 class FileInfo {
-  // It is unlikely--but possible--for multiple functions to be on the same line.
-  // Therefore this typedef allows LineData.Functions to store multiple functions
+  // It is unlikely--but possible--for multiple functions to be on the same
+  // line.
+  // Therefore this typedef allows LineData.Functions to store multiple
+  // functions
   // per instance. This is rare, however, so optimize for the common case.
   typedef SmallVector<const GCOVFunction *, 1> FunctionVector;
   typedef DenseMap<uint32_t, FunctionVector> FunctionLines;
@@ -363,9 +375,9 @@ class FileInfo {
   };
 
   struct GCOVCoverage {
-    GCOVCoverage(StringRef Name) :
-      Name(Name), LogicalLines(0), LinesExec(0), Branches(0), BranchesExec(0),
-      BranchesTaken(0) {}
+    GCOVCoverage(StringRef Name)
+        : Name(Name), LogicalLines(0), LinesExec(0), Branches(0),
+          BranchesExec(0), BranchesTaken(0) {}
 
     StringRef Name;
 
@@ -376,30 +388,31 @@ class FileInfo {
     uint32_t BranchesExec;
     uint32_t BranchesTaken;
   };
+
 public:
-  FileInfo(const GCOVOptions &Options) :
-    Options(Options), LineInfo(), RunCount(0), ProgramCount(0) {}
+  FileInfo(const GCOVOptions &Options)
+      : Options(Options), LineInfo(), RunCount(0), ProgramCount(0) {}
 
   void addBlockLine(StringRef Filename, uint32_t Line, const GCOVBlock *Block) {
     if (Line > LineInfo[Filename].LastLine)
       LineInfo[Filename].LastLine = Line;
-    LineInfo[Filename].Blocks[Line-1].push_back(Block);
+    LineInfo[Filename].Blocks[Line - 1].push_back(Block);
   }
   void addFunctionLine(StringRef Filename, uint32_t Line,
                        const GCOVFunction *Function) {
     if (Line > LineInfo[Filename].LastLine)
       LineInfo[Filename].LastLine = Line;
-    LineInfo[Filename].Functions[Line-1].push_back(Function);
+    LineInfo[Filename].Functions[Line - 1].push_back(Function);
   }
   void setRunCount(uint32_t Runs) { RunCount = Runs; }
   void setProgramCount(uint32_t Programs) { ProgramCount = Programs; }
-  void print(StringRef MainFilename, StringRef GCNOFile, StringRef GCDAFile);
+  void print(raw_ostream &OS, StringRef MainFilename, StringRef GCNOFile,
+             StringRef GCDAFile);
 
 private:
   std::string getCoveragePath(StringRef Filename, StringRef MainFilename);
   std::unique_ptr<raw_ostream> openCoveragePath(StringRef CoveragePath);
-  void printFunctionSummary(raw_ostream &OS,
-                            const FunctionVector &Funcs) const;
+  void printFunctionSummary(raw_ostream &OS, const FunctionVector &Funcs) const;
   void printBlockInfo(raw_ostream &OS, const GCOVBlock &Block,
                       uint32_t LineIndex, uint32_t &BlockNo) const;
   void printBranchInfo(raw_ostream &OS, const GCOVBlock &Block,
@@ -407,23 +420,21 @@ private:
   void printUncondBranchInfo(raw_ostream &OS, uint32_t &EdgeNo,
                              uint64_t Count) const;
 
-  void printCoverage(const GCOVCoverage &Coverage) const;
-  void printFuncCoverage() const;
-  void printFileCoverage() const;
+  void printCoverage(raw_ostream &OS, const GCOVCoverage &Coverage) const;
+  void printFuncCoverage(raw_ostream &OS) const;
+  void printFileCoverage(raw_ostream &OS) const;
 
   const GCOVOptions &Options;
   StringMap<LineData> LineInfo;
   uint32_t RunCount;
   uint32_t ProgramCount;
 
-  typedef SmallVector<std::pair<std::string, GCOVCoverage>, 4>
-      FileCoverageList;
+  typedef SmallVector<std::pair<std::string, GCOVCoverage>, 4> FileCoverageList;
   typedef MapVector<const GCOVFunction *, GCOVCoverage> FuncCoverageMap;
 
   FileCoverageList FileCoverages;
   FuncCoverageMap FuncCoverages;
 };
-
 }
 
 #endif
diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h
index 6bc4b44..0998eb9 100644
--- a/include/llvm/Support/GenericDomTree.h
+++ b/include/llvm/Support/GenericDomTree.h
@@ -21,6 +21,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Compiler.h"
@@ -29,76 +30,79 @@
 
 namespace llvm {
 
-//===----------------------------------------------------------------------===//
-/// DominatorBase - Base class that other, more interesting dominator analyses
+/// \brief Base class that other, more interesting dominator analyses
 /// inherit from.
-///
-template <class NodeT>
-class DominatorBase {
+template <class NodeT> class DominatorBase {
 protected:
-  std::vector<NodeT*> Roots;
-  const bool IsPostDominators;
-  inline explicit DominatorBase(bool isPostDom) :
-    Roots(), IsPostDominators(isPostDom) {}
-public:
+  std::vector<NodeT *> Roots;
+  bool IsPostDominators;
+  explicit DominatorBase(bool isPostDom)
+      : Roots(), IsPostDominators(isPostDom) {}
+  DominatorBase(DominatorBase &&Arg)
+      : Roots(std::move(Arg.Roots)),
+        IsPostDominators(std::move(Arg.IsPostDominators)) {
+    Arg.Roots.clear();
+  }
+  DominatorBase &operator=(DominatorBase &&RHS) {
+    Roots = std::move(RHS.Roots);
+    IsPostDominators = std::move(RHS.IsPostDominators);
+    RHS.Roots.clear();
+    return *this;
+  }
 
+public:
   /// getRoots - Return the root blocks of the current CFG.  This may include
   /// multiple blocks if we are computing post dominators.  For forward
   /// dominators, this will always be a single block (the entry node).
   ///
-  inline const std::vector<NodeT*> &getRoots() const { return Roots; }
+  const std::vector<NodeT *> &getRoots() const { return Roots; }
 
   /// isPostDominator - Returns true if analysis based of postdoms
   ///
   bool isPostDominator() const { return IsPostDominators; }
 };
 
-
-//===----------------------------------------------------------------------===//
-// DomTreeNodeBase - Dominator Tree Node
-template<class NodeT> class DominatorTreeBase;
+template <class NodeT> class DominatorTreeBase;
 struct PostDominatorTree;
 
-template <class NodeT>
-class DomTreeNodeBase {
+/// \brief Base class for the actual dominator tree node.
+template <class NodeT> class DomTreeNodeBase {
   NodeT *TheBB;
   DomTreeNodeBase<NodeT> *IDom;
   std::vector<DomTreeNodeBase<NodeT> *> Children;
   mutable int DFSNumIn, DFSNumOut;
 
-  template<class N> friend class DominatorTreeBase;
+  template <class N> friend class DominatorTreeBase;
   friend struct PostDominatorTree;
+
 public:
   typedef typename std::vector<DomTreeNodeBase<NodeT> *>::iterator iterator;
   typedef typename std::vector<DomTreeNodeBase<NodeT> *>::const_iterator
-                   const_iterator;
+      const_iterator;
 
-  iterator begin()             { return Children.begin(); }
-  iterator end()               { return Children.end(); }
+  iterator begin() { return Children.begin(); }
+  iterator end() { return Children.end(); }
   const_iterator begin() const { return Children.begin(); }
-  const_iterator end()   const { return Children.end(); }
+  const_iterator end() const { return Children.end(); }
 
   NodeT *getBlock() const { return TheBB; }
   DomTreeNodeBase<NodeT> *getIDom() const { return IDom; }
-  const std::vector<DomTreeNodeBase<NodeT>*> &getChildren() const {
+  const std::vector<DomTreeNodeBase<NodeT> *> &getChildren() const {
     return Children;
   }
 
   DomTreeNodeBase(NodeT *BB, DomTreeNodeBase<NodeT> *iDom)
-    : TheBB(BB), IDom(iDom), DFSNumIn(-1), DFSNumOut(-1) { }
+      : TheBB(BB), IDom(iDom), DFSNumIn(-1), DFSNumOut(-1) {}
 
-  DomTreeNodeBase<NodeT> *addChild(DomTreeNodeBase<NodeT> *C) {
-    Children.push_back(C);
+  std::unique_ptr<DomTreeNodeBase<NodeT>>
+  addChild(std::unique_ptr<DomTreeNodeBase<NodeT>> C) {
+    Children.push_back(C.get());
     return C;
   }
 
-  size_t getNumChildren() const {
-    return Children.size();
-  }
+  size_t getNumChildren() const { return Children.size(); }
 
-  void clearAllChildren() {
-    Children.clear();
-  }
+  void clearAllChildren() { Children.clear(); }
 
   bool compare(const DomTreeNodeBase<NodeT> *Other) const {
     if (getNumChildren() != Other->getNumChildren())
@@ -121,8 +125,8 @@ public:
   void setIDom(DomTreeNodeBase<NodeT> *NewIDom) {
     assert(IDom && "No immediate dominator?");
     if (IDom != NewIDom) {
-      typename std::vector<DomTreeNodeBase<NodeT>*>::iterator I =
-                  std::find(IDom->Children.begin(), IDom->Children.end(), this);
+      typename std::vector<DomTreeNodeBase<NodeT> *>::iterator I =
+          std::find(IDom->Children.begin(), IDom->Children.end(), this);
       assert(I != IDom->Children.end() &&
              "Not in immediate dominator children set!");
       // I am no longer your child...
@@ -138,18 +142,18 @@ public:
   /// not call them.
   unsigned getDFSNumIn() const { return DFSNumIn; }
   unsigned getDFSNumOut() const { return DFSNumOut; }
+
 private:
   // Return true if this node is dominated by other. Use this only if DFS info
   // is valid.
   bool DominatedBy(const DomTreeNodeBase<NodeT> *other) const {
     return this->DFSNumIn >= other->DFSNumIn &&
-      this->DFSNumOut <= other->DFSNumOut;
+           this->DFSNumOut <= other->DFSNumOut;
   }
 };
 
-template<class NodeT>
-inline raw_ostream &operator<<(raw_ostream &o,
-                               const DomTreeNodeBase<NodeT> *Node) {
+template <class NodeT>
+raw_ostream &operator<<(raw_ostream &o, const DomTreeNodeBase<NodeT> *Node) {
   if (Node->getBlock())
     Node->getBlock()->printAsOperand(o, false);
   else
@@ -160,25 +164,29 @@ inline raw_ostream &operator<<(raw_ostream &o,
   return o << "\n";
 }
 
-template<class NodeT>
-inline void PrintDomTree(const DomTreeNodeBase<NodeT> *N, raw_ostream &o,
-                         unsigned Lev) {
-  o.indent(2*Lev) << "[" << Lev << "] " << N;
+template <class NodeT>
+void PrintDomTree(const DomTreeNodeBase<NodeT> *N, raw_ostream &o,
+                  unsigned Lev) {
+  o.indent(2 * Lev) << "[" << Lev << "] " << N;
   for (typename DomTreeNodeBase<NodeT>::const_iterator I = N->begin(),
-       E = N->end(); I != E; ++I)
-    PrintDomTree<NodeT>(*I, o, Lev+1);
+                                                       E = N->end();
+       I != E; ++I)
+    PrintDomTree<NodeT>(*I, o, Lev + 1);
 }
 
-//===----------------------------------------------------------------------===//
-/// DominatorTree - Calculate the immediate dominator tree for a function.
-///
+// The calculate routine is provided in a separate header but referenced here.
+template <class FuncT, class N>
+void Calculate(DominatorTreeBase<typename GraphTraits<N>::NodeType> &DT,
+               FuncT &F);
 
-template<class FuncT, class N>
-void Calculate(DominatorTreeBase<typename GraphTraits<N>::NodeType>& DT,
-               FuncT& F);
+/// \brief Core dominator tree base class.
+///
+/// This class is a generic template over graph nodes. It is instantiated for
+/// various graphs in the LLVM IR or in the code generator.
+template <class NodeT> class DominatorTreeBase : public DominatorBase<NodeT> {
+  DominatorTreeBase(const DominatorTreeBase &) = delete;
+  DominatorTreeBase &operator=(const DominatorTreeBase &) = delete;
 
-template<class NodeT>
-class DominatorTreeBase : public DominatorBase<NodeT> {
   bool dominatedBySlowTreeWalk(const DomTreeNodeBase<NodeT> *A,
                                const DomTreeNodeBase<NodeT> *B) const {
     assert(A != B);
@@ -187,12 +195,25 @@ class DominatorTreeBase : public DominatorBase<NodeT> {
 
     const DomTreeNodeBase<NodeT> *IDom;
     while ((IDom = B->getIDom()) != nullptr && IDom != A && IDom != B)
-      B = IDom;   // Walk up the tree
+      B = IDom; // Walk up the tree
     return IDom != nullptr;
   }
 
+  /// \brief Wipe this tree's state without releasing any resources.
+  ///
+  /// This is essentially a post-move helper only. It leaves the object in an
+  /// assignable and destroyable state, but otherwise invalid.
+  void wipe() {
+    DomTreeNodes.clear();
+    IDoms.clear();
+    Vertex.clear();
+    Info.clear();
+    RootNode = nullptr;
+  }
+
 protected:
-  typedef DenseMap<NodeT*, DomTreeNodeBase<NodeT>*> DomTreeNodeMapType;
+  typedef DenseMap<NodeT *, std::unique_ptr<DomTreeNodeBase<NodeT>>>
+      DomTreeNodeMapType;
   DomTreeNodeMapType DomTreeNodes;
   DomTreeNodeBase<NodeT> *RootNode;
 
@@ -208,18 +229,15 @@ protected:
     InfoRec() : DFSNum(0), Parent(0), Semi(0), Label(nullptr) {}
   };
 
-  DenseMap<NodeT*, NodeT*> IDoms;
+  DenseMap<NodeT *, NodeT *> IDoms;
 
   // Vertex - Map the DFS number to the NodeT*
-  std::vector<NodeT*> Vertex;
+  std::vector<NodeT *> Vertex;
 
   // Info - Collection of information used during the computation of idoms.
-  DenseMap<NodeT*, InfoRec> Info;
+  DenseMap<NodeT *, InfoRec> Info;
 
   void reset() {
-    for (typename DomTreeNodeMapType::iterator I = this->DomTreeNodes.begin(),
-           E = DomTreeNodes.end(); I != E; ++I)
-      delete I->second;
     DomTreeNodes.clear();
     IDoms.clear();
     this->Roots.clear();
@@ -229,27 +247,29 @@ protected:
 
   // NewBB is split and now it has one successor. Update dominator tree to
   // reflect this change.
-  template<class N, class GraphT>
-  void Split(DominatorTreeBase<typename GraphT::NodeType>& DT,
-             typename GraphT::NodeType* NewBB) {
+  template <class N, class GraphT>
+  void Split(DominatorTreeBase<typename GraphT::NodeType> &DT,
+             typename GraphT::NodeType *NewBB) {
     assert(std::distance(GraphT::child_begin(NewBB),
                          GraphT::child_end(NewBB)) == 1 &&
            "NewBB should have a single successor!");
-    typename GraphT::NodeType* NewBBSucc = *GraphT::child_begin(NewBB);
-
-    std::vector<typename GraphT::NodeType*> PredBlocks;
-    typedef GraphTraits<Inverse<N> > InvTraits;
-    for (typename InvTraits::ChildIteratorType PI =
-         InvTraits::child_begin(NewBB),
-         PE = InvTraits::child_end(NewBB); PI != PE; ++PI)
+    typename GraphT::NodeType *NewBBSucc = *GraphT::child_begin(NewBB);
+
+    std::vector<typename GraphT::NodeType *> PredBlocks;
+    typedef GraphTraits<Inverse<N>> InvTraits;
+    for (typename InvTraits::ChildIteratorType
+             PI = InvTraits::child_begin(NewBB),
+             PE = InvTraits::child_end(NewBB);
+         PI != PE; ++PI)
       PredBlocks.push_back(*PI);
 
     assert(!PredBlocks.empty() && "No predblocks?");
 
     bool NewBBDominatesNewBBSucc = true;
-    for (typename InvTraits::ChildIteratorType PI =
-         InvTraits::child_begin(NewBBSucc),
-         E = InvTraits::child_end(NewBBSucc); PI != E; ++PI) {
+    for (typename InvTraits::ChildIteratorType
+             PI = InvTraits::child_begin(NewBBSucc),
+             E = InvTraits::child_end(NewBBSucc);
+         PI != E; ++PI) {
       typename InvTraits::NodeType *ND = *PI;
       if (ND != NewBB && !DT.dominates(NewBBSucc, ND) &&
           DT.isReachableFromEntry(ND)) {
@@ -292,8 +312,31 @@ protected:
 
 public:
   explicit DominatorTreeBase(bool isPostDom)
-    : DominatorBase<NodeT>(isPostDom), DFSInfoValid(false), SlowQueries(0) {}
-  virtual ~DominatorTreeBase() { reset(); }
+      : DominatorBase<NodeT>(isPostDom), DFSInfoValid(false), SlowQueries(0) {}
+
+  DominatorTreeBase(DominatorTreeBase &&Arg)
+      : DominatorBase<NodeT>(
+            std::move(static_cast<DominatorBase<NodeT> &>(Arg))),
+        DomTreeNodes(std::move(Arg.DomTreeNodes)),
+        RootNode(std::move(Arg.RootNode)),
+        DFSInfoValid(std::move(Arg.DFSInfoValid)),
+        SlowQueries(std::move(Arg.SlowQueries)), IDoms(std::move(Arg.IDoms)),
+        Vertex(std::move(Arg.Vertex)), Info(std::move(Arg.Info)) {
+    Arg.wipe();
+  }
+  DominatorTreeBase &operator=(DominatorTreeBase &&RHS) {
+    DominatorBase<NodeT>::operator=(
+        std::move(static_cast<DominatorBase<NodeT> &>(RHS)));
+    DomTreeNodes = std::move(RHS.DomTreeNodes);
+    RootNode = std::move(RHS.RootNode);
+    DFSInfoValid = std::move(RHS.DFSInfoValid);
+    SlowQueries = std::move(RHS.SlowQueries);
+    IDoms = std::move(RHS.IDoms);
+    Vertex = std::move(RHS.Vertex);
+    Info = std::move(RHS.Info);
+    RHS.wipe();
+    return *this;
+  }
 
   /// compare - Return false if the other dominator tree base matches this
   /// dominator tree base. Otherwise return true.
@@ -304,35 +347,38 @@ public:
       return true;
 
     for (typename DomTreeNodeMapType::const_iterator
-           I = this->DomTreeNodes.begin(),
-           E = this->DomTreeNodes.end(); I != E; ++I) {
+             I = this->DomTreeNodes.begin(),
+             E = this->DomTreeNodes.end();
+         I != E; ++I) {
       NodeT *BB = I->first;
-      typename DomTreeNodeMapType::const_iterator OI = OtherDomTreeNodes.find(BB);
+      typename DomTreeNodeMapType::const_iterator OI =
+          OtherDomTreeNodes.find(BB);
       if (OI == OtherDomTreeNodes.end())
         return true;
 
-      DomTreeNodeBase<NodeT>* MyNd = I->second;
-      DomTreeNodeBase<NodeT>* OtherNd = OI->second;
+      DomTreeNodeBase<NodeT> &MyNd = *I->second;
+      DomTreeNodeBase<NodeT> &OtherNd = *OI->second;
 
-      if (MyNd->compare(OtherNd))
+      if (MyNd.compare(&OtherNd))
         return true;
     }
 
     return false;
   }
 
-  virtual void releaseMemory() { reset(); }
+  void releaseMemory() { reset(); }
 
   /// getNode - return the (Post)DominatorTree node for the specified basic
   /// block.  This is the same as using operator[] on this class.
   ///
-  inline DomTreeNodeBase<NodeT> *getNode(NodeT *BB) const {
-    return DomTreeNodes.lookup(BB);
+  DomTreeNodeBase<NodeT> *getNode(NodeT *BB) const {
+    auto I = DomTreeNodes.find(BB);
+    if (I != DomTreeNodes.end())
+      return I->second.get();
+    return nullptr;
   }
 
-  inline DomTreeNodeBase<NodeT> *operator[](NodeT *BB) const {
-    return getNode(BB);
-  }
+  DomTreeNodeBase<NodeT> *operator[](NodeT *BB) const { return getNode(BB); }
 
   /// getRootNode - This returns the entry node for the CFG of the function.  If
   /// this tree represents the post-dominance relations for a function, however,
@@ -376,21 +422,19 @@ public:
 
   /// isReachableFromEntry - Return true if A is dominated by the entry
   /// block of the function containing it.
-  bool isReachableFromEntry(const NodeT* A) const {
+  bool isReachableFromEntry(const NodeT *A) const {
     assert(!this->isPostDominator() &&
            "This is not implemented for post dominators");
     return isReachableFromEntry(getNode(const_cast<NodeT *>(A)));
   }
 
-  inline bool isReachableFromEntry(const DomTreeNodeBase<NodeT> *A) const {
-    return A;
-  }
+  bool isReachableFromEntry(const DomTreeNodeBase<NodeT> *A) const { return A; }
 
   /// dominates - Returns true iff A dominates B.  Note that this is not a
   /// constant time operation!
   ///
-  inline bool dominates(const DomTreeNodeBase<NodeT> *A,
-                        const DomTreeNodeBase<NodeT> *B) const {
+  bool dominates(const DomTreeNodeBase<NodeT> *A,
+                 const DomTreeNodeBase<NodeT> *B) const {
     // A node trivially dominates itself.
     if (B == A)
       return true;
@@ -473,7 +517,7 @@ public:
     }
 
     // Collect NodeA dominators set.
-    SmallPtrSet<DomTreeNodeBase<NodeT>*, 16> NodeADoms;
+    SmallPtrSet<DomTreeNodeBase<NodeT> *, 16> NodeADoms;
     NodeADoms.insert(NodeA);
     DomTreeNodeBase<NodeT> *IDomA = NodeA->getIDom();
     while (IDomA) {
@@ -512,8 +556,8 @@ public:
     DomTreeNodeBase<NodeT> *IDomNode = getNode(DomBB);
     assert(IDomNode && "Not immediate dominator specified for block!");
     DFSInfoValid = false;
-    return DomTreeNodes[BB] =
-      IDomNode->addChild(new DomTreeNodeBase<NodeT>(BB, IDomNode));
+    return (DomTreeNodes[BB] = IDomNode->addChild(
+                llvm::make_unique<DomTreeNodeBase<NodeT>>(BB, IDomNode))).get();
   }
 
   /// changeImmediateDominator - This method is used to update the dominator
@@ -538,11 +582,11 @@ public:
     assert(Node && "Removing node that isn't in dominator tree.");
     assert(Node->getChildren().empty() && "Node is not a leaf node.");
 
-      // Remove node from immediate dominator's children list.
+    // Remove node from immediate dominator's children list.
     DomTreeNodeBase<NodeT> *IDom = Node->getIDom();
     if (IDom) {
-      typename std::vector<DomTreeNodeBase<NodeT>*>::iterator I =
-        std::find(IDom->Children.begin(), IDom->Children.end(), Node);
+      typename std::vector<DomTreeNodeBase<NodeT> *>::iterator I =
+          std::find(IDom->Children.begin(), IDom->Children.end(), Node);
       assert(I != IDom->Children.end() &&
              "Not in immediate dominator children set!");
       // I am no longer your child...
@@ -550,24 +594,16 @@ public:
     }
 
     DomTreeNodes.erase(BB);
-    delete Node;
-  }
-
-  /// removeNode - Removes a node from the dominator tree.  Block must not
-  /// dominate any other blocks.  Invalidates any node pointing to removed
-  /// block.
-  void removeNode(NodeT *BB) {
-    assert(getNode(BB) && "Removing node that isn't in dominator tree.");
-    DomTreeNodes.erase(BB);
   }
 
   /// splitBlock - BB is split and now it has one successor. Update dominator
   /// tree to reflect this change.
-  void splitBlock(NodeT* NewBB) {
+  void splitBlock(NodeT *NewBB) {
     if (this->IsPostDominators)
-      this->Split<Inverse<NodeT*>, GraphTraits<Inverse<NodeT*> > >(*this, NewBB);
+      this->Split<Inverse<NodeT *>, GraphTraits<Inverse<NodeT *>>>(*this,
+                                                                   NewBB);
     else
-      this->Split<NodeT*, GraphTraits<NodeT*> >(*this, NewBB);
+      this->Split<NodeT *, GraphTraits<NodeT *>>(*this, NewBB);
   }
 
   /// print - Convert to human readable form
@@ -588,28 +624,27 @@ public:
   }
 
 protected:
-  template<class GraphT>
-  friend typename GraphT::NodeType* Eval(
-                               DominatorTreeBase<typename GraphT::NodeType>& DT,
-                                         typename GraphT::NodeType* V,
-                                         unsigned LastLinked);
+  template <class GraphT>
+  friend typename GraphT::NodeType *
+  Eval(DominatorTreeBase<typename GraphT::NodeType> &DT,
+       typename GraphT::NodeType *V, unsigned LastLinked);
 
-  template<class GraphT>
-  friend unsigned DFSPass(DominatorTreeBase<typename GraphT::NodeType>& DT,
-                          typename GraphT::NodeType* V,
-                          unsigned N);
+  template <class GraphT>
+  friend unsigned DFSPass(DominatorTreeBase<typename GraphT::NodeType> &DT,
+                          typename GraphT::NodeType *V, unsigned N);
 
-  template<class FuncT, class N>
-  friend void Calculate(DominatorTreeBase<typename GraphTraits<N>::NodeType>& DT,
-                        FuncT& F);
+  template <class FuncT, class N>
+  friend void
+  Calculate(DominatorTreeBase<typename GraphTraits<N>::NodeType> &DT, FuncT &F);
 
   /// updateDFSNumbers - Assign In and Out numbers to the nodes while walking
   /// dominator tree in dfs order.
   void updateDFSNumbers() const {
     unsigned DFSNum = 0;
 
-    SmallVector<std::pair<const DomTreeNodeBase<NodeT>*,
-                typename DomTreeNodeBase<NodeT>::const_iterator>, 32> WorkStack;
+    SmallVector<std::pair<const DomTreeNodeBase<NodeT> *,
+                          typename DomTreeNodeBase<NodeT>::const_iterator>,
+                32> WorkStack;
 
     const DomTreeNodeBase<NodeT> *ThisRoot = getRootNode();
 
@@ -626,7 +661,7 @@ protected:
     while (!WorkStack.empty()) {
       const DomTreeNodeBase<NodeT> *Node = WorkStack.back().first;
       typename DomTreeNodeBase<NodeT>::const_iterator ChildIt =
-        WorkStack.back().second;
+          WorkStack.back().second;
 
       // If we visited all of the children of this node, "recurse" back up the
       // stack setting the DFOutNum.
@@ -660,23 +695,18 @@ protected:
 
     // Add a new tree node for this NodeT, and link it as a child of
     // IDomNode
-    DomTreeNodeBase<NodeT> *C = new DomTreeNodeBase<NodeT>(BB, IDomNode);
-    return this->DomTreeNodes[BB] = IDomNode->addChild(C);
+    return (this->DomTreeNodes[BB] = IDomNode->addChild(
+                llvm::make_unique<DomTreeNodeBase<NodeT>>(BB, IDomNode))).get();
   }
 
-  inline NodeT *getIDom(NodeT *BB) const {
-    return IDoms.lookup(BB);
-  }
+  NodeT *getIDom(NodeT *BB) const { return IDoms.lookup(BB); }
 
-  inline void addRoot(NodeT* BB) {
-    this->Roots.push_back(BB);
-  }
+  void addRoot(NodeT *BB) { this->Roots.push_back(BB); }
 
 public:
   /// recalculate - compute a dominator tree for the given function
-  template<class FT>
-  void recalculate(FT& F) {
-    typedef GraphTraits<FT*> TraitsTy;
+  template <class FT> void recalculate(FT &F) {
+    typedef GraphTraits<FT *> TraitsTy;
     reset();
     this->Vertex.push_back(nullptr);
 
@@ -687,27 +717,29 @@ public:
       this->IDoms[entry] = nullptr;
       this->DomTreeNodes[entry] = nullptr;
 
-      Calculate<FT, NodeT*>(*this, F);
+      Calculate<FT, NodeT *>(*this, F);
     } else {
       // Initialize the roots list
       for (typename TraitsTy::nodes_iterator I = TraitsTy::nodes_begin(&F),
-                                        E = TraitsTy::nodes_end(&F); I != E; ++I) {
+                                             E = TraitsTy::nodes_end(&F);
+           I != E; ++I) {
         if (TraitsTy::child_begin(I) == TraitsTy::child_end(I))
           addRoot(I);
 
-        // Prepopulate maps so that we don't get iterator invalidation issues later.
+        // Prepopulate maps so that we don't get iterator invalidation issues
+        // later.
         this->IDoms[I] = nullptr;
         this->DomTreeNodes[I] = nullptr;
       }
 
-      Calculate<FT, Inverse<NodeT*> >(*this, F);
+      Calculate<FT, Inverse<NodeT *>>(*this, F);
     }
   }
 };
 
 // These two functions are declared out of line as a workaround for building
 // with old (< r147295) versions of clang because of pr11642.
-template<class NodeT>
+template <class NodeT>
 bool DominatorTreeBase<NodeT>::dominates(const NodeT *A, const NodeT *B) const {
   if (A == B)
     return true;
@@ -718,9 +750,9 @@ bool DominatorTreeBase<NodeT>::dominates(const NodeT *A, const NodeT *B) const {
   return dominates(getNode(const_cast<NodeT *>(A)),
                    getNode(const_cast<NodeT *>(B)));
 }
-template<class NodeT>
-bool
-DominatorTreeBase<NodeT>::properlyDominates(const NodeT *A, const NodeT *B) const {
+template <class NodeT>
+bool DominatorTreeBase<NodeT>::properlyDominates(const NodeT *A,
+                                                 const NodeT *B) const {
   if (A == B)
     return false;
 
diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h
index ad4f8a9..7c065f9 100644
--- a/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/include/llvm/Support/GenericDomTreeConstruction.h
@@ -251,15 +251,18 @@ void Calculate(DominatorTreeBase<typename GraphTraits<NodeT>::NodeType>& DT,
   // an infinite loop.
   typename GraphT::NodeType* Root = !MultipleRoots ? DT.Roots[0] : nullptr;
 
-  DT.DomTreeNodes[Root] = DT.RootNode =
-                  new DomTreeNodeBase<typename GraphT::NodeType>(Root, nullptr);
+  DT.RootNode =
+      (DT.DomTreeNodes[Root] =
+           llvm::make_unique<DomTreeNodeBase<typename GraphT::NodeType>>(
+               Root, nullptr)).get();
 
   // Loop over all of the reachable blocks in the function...
   for (unsigned i = 2; i <= N; ++i) {
     typename GraphT::NodeType* W = DT.Vertex[i];
 
-    DomTreeNodeBase<typename GraphT::NodeType> *BBNode = DT.DomTreeNodes[W];
-    if (BBNode) continue;  // Haven't calculated this node yet?
+    // Don't replace this with 'count', the insertion side effect is important
+    if (DT.DomTreeNodes[W])
+      continue; // Haven't calculated this node yet?
 
     typename GraphT::NodeType* ImmDom = DT.getIDom(W);
 
@@ -271,15 +274,16 @@ void Calculate(DominatorTreeBase<typename GraphTraits<NodeT>::NodeType>& DT,
 
     // Add a new tree node for this BasicBlock, and link it as a child of
     // IDomNode
-    DomTreeNodeBase<typename GraphT::NodeType> *C =
-                    new DomTreeNodeBase<typename GraphT::NodeType>(W, IDomNode);
-    DT.DomTreeNodes[W] = IDomNode->addChild(C);
+    DT.DomTreeNodes[W] = IDomNode->addChild(
+        llvm::make_unique<DomTreeNodeBase<typename GraphT::NodeType>>(
+            W, IDomNode));
   }
 
   // Free temporary memory used to construct idom's
   DT.IDoms.clear();
   DT.Info.clear();
-  std::vector<typename GraphT::NodeType*>().swap(DT.Vertex);
+  DT.Vertex.clear();
+  DT.Vertex.shrink_to_fit();
 
   DT.updateDFSNumbers();
 }
diff --git a/include/llvm/Support/LockFileManager.h b/include/llvm/Support/LockFileManager.h
index 61c65da..8e88d42 100644
--- a/include/llvm/Support/LockFileManager.h
+++ b/include/llvm/Support/LockFileManager.h
@@ -57,8 +57,8 @@ private:
   Optional<std::pair<std::string, int> > Owner;
   Optional<std::error_code> Error;
 
-  LockFileManager(const LockFileManager &) LLVM_DELETED_FUNCTION;
-  LockFileManager &operator=(const LockFileManager &) LLVM_DELETED_FUNCTION;
+  LockFileManager(const LockFileManager &) = delete;
+  LockFileManager &operator=(const LockFileManager &) = delete;
 
   static Optional<std::pair<std::string, int> >
   readLockFile(StringRef LockFileName);
@@ -77,6 +77,10 @@ public:
 
   /// \brief For a shared lock, wait until the owner releases the lock.
   WaitForUnlockResult waitForUnlock();
+
+  /// \brief Remove the lock file.  This may delete a different lock file than
+  /// the one previously read if there is a race.
+  std::error_code unsafeRemoveLockFile();
 };
 
 } // end namespace llvm
diff --git a/include/llvm/Support/MachO.h b/include/llvm/Support/MachO.h
index c07bd88..7751275 100644
--- a/include/llvm/Support/MachO.h
+++ b/include/llvm/Support/MachO.h
@@ -130,8 +130,8 @@ namespace llvm {
       LC_DATA_IN_CODE         = 0x00000029u,
       LC_SOURCE_VERSION       = 0x0000002Au,
       LC_DYLIB_CODE_SIGN_DRS  = 0x0000002Bu,
-      //                        0x0000002Cu,
-      LC_LINKER_OPTIONS       = 0x0000002Du,
+      LC_ENCRYPTION_INFO_64   = 0x0000002Cu,
+      LC_LINKER_OPTION        = 0x0000002Du,
       LC_LINKER_OPTIMIZATION_HINT = 0x0000002Eu
     };
 
@@ -842,6 +842,15 @@ namespace llvm {
       uint32_t cryptid;
     };
 
+    struct encryption_info_command_64 {
+      uint32_t cmd;
+      uint32_t cmdsize;
+      uint32_t cryptoff;
+      uint32_t cryptsize;
+      uint32_t cryptid;
+      uint32_t pad;
+    };
+
     struct version_min_command {
       uint32_t cmd;       // LC_VERSION_MIN_MACOSX or
                           // LC_VERSION_MIN_IPHONEOS
@@ -865,7 +874,7 @@ namespace llvm {
       uint32_t export_size;
     };
 
-    struct linker_options_command {
+    struct linker_option_command {
       uint32_t cmd;
       uint32_t cmdsize;
       uint32_t count;
@@ -1098,6 +1107,61 @@ namespace llvm {
       sys::swapByteOrder(d.dylib.compatibility_version);
     }
 
+    inline void swapStruct(sub_framework_command &s) {
+      sys::swapByteOrder(s.cmd);
+      sys::swapByteOrder(s.cmdsize);
+      sys::swapByteOrder(s.umbrella);
+    }
+
+    inline void swapStruct(sub_umbrella_command &s) {
+      sys::swapByteOrder(s.cmd);
+      sys::swapByteOrder(s.cmdsize);
+      sys::swapByteOrder(s.sub_umbrella);
+    }
+
+    inline void swapStruct(sub_library_command &s) {
+      sys::swapByteOrder(s.cmd);
+      sys::swapByteOrder(s.cmdsize);
+      sys::swapByteOrder(s.sub_library);
+    }
+
+    inline void swapStruct(sub_client_command &s) {
+      sys::swapByteOrder(s.cmd);
+      sys::swapByteOrder(s.cmdsize);
+      sys::swapByteOrder(s.client);
+    }
+
+    inline void swapStruct(routines_command &r) {
+      sys::swapByteOrder(r.cmd);
+      sys::swapByteOrder(r.cmdsize);
+      sys::swapByteOrder(r.init_address);
+      sys::swapByteOrder(r.init_module);
+      sys::swapByteOrder(r.reserved1);
+      sys::swapByteOrder(r.reserved2);
+      sys::swapByteOrder(r.reserved3);
+      sys::swapByteOrder(r.reserved4);
+      sys::swapByteOrder(r.reserved5);
+      sys::swapByteOrder(r.reserved6);
+    }
+
+    inline void swapStruct(routines_command_64 &r) {
+      sys::swapByteOrder(r.cmd);
+      sys::swapByteOrder(r.cmdsize);
+      sys::swapByteOrder(r.init_address);
+      sys::swapByteOrder(r.init_module);
+      sys::swapByteOrder(r.reserved1);
+      sys::swapByteOrder(r.reserved2);
+      sys::swapByteOrder(r.reserved3);
+      sys::swapByteOrder(r.reserved4);
+      sys::swapByteOrder(r.reserved5);
+      sys::swapByteOrder(r.reserved6);
+    }
+
+    inline void swapStruct(thread_command &t) {
+      sys::swapByteOrder(t.cmd);
+      sys::swapByteOrder(t.cmdsize);
+    }
+
     inline void swapStruct(dylinker_command &d) {
       sys::swapByteOrder(d.cmd);
       sys::swapByteOrder(d.cmdsize);
@@ -1109,6 +1173,12 @@ namespace llvm {
       sys::swapByteOrder(u.cmdsize);
     }
 
+    inline void swapStruct(rpath_command &r) {
+      sys::swapByteOrder(r.cmd);
+      sys::swapByteOrder(r.cmdsize);
+      sys::swapByteOrder(r.path);
+    }
+
     inline void swapStruct(source_version_command &s) {
       sys::swapByteOrder(s.cmd);
       sys::swapByteOrder(s.cmdsize);
@@ -1122,6 +1192,23 @@ namespace llvm {
       sys::swapByteOrder(e.stacksize);
     }
 
+    inline void swapStruct(encryption_info_command &e) {
+      sys::swapByteOrder(e.cmd);
+      sys::swapByteOrder(e.cmdsize);
+      sys::swapByteOrder(e.cryptoff);
+      sys::swapByteOrder(e.cryptsize);
+      sys::swapByteOrder(e.cryptid);
+    }
+
+    inline void swapStruct(encryption_info_command_64 &e) {
+      sys::swapByteOrder(e.cmd);
+      sys::swapByteOrder(e.cmdsize);
+      sys::swapByteOrder(e.cryptoff);
+      sys::swapByteOrder(e.cryptsize);
+      sys::swapByteOrder(e.cryptid);
+      sys::swapByteOrder(e.pad);
+    }
+
     inline void swapStruct(dysymtab_command &dst) {
       sys::swapByteOrder(dst.cmd);
       sys::swapByteOrder(dst.cmdsize);
@@ -1174,7 +1261,7 @@ namespace llvm {
       sys::swapByteOrder(C.datasize);
     }
 
-    inline void swapStruct(linker_options_command &C) {
+    inline void swapStruct(linker_option_command &C) {
       sys::swapByteOrder(C.cmd);
       sys::swapByteOrder(C.cmdsize);
       sys::swapByteOrder(C.count);
@@ -1331,6 +1418,262 @@ namespace llvm {
       CPU_SUBTYPE_MC980000_ALL  = CPU_SUBTYPE_POWERPC_ALL,
       CPU_SUBTYPE_MC98601       = CPU_SUBTYPE_POWERPC_601
     };
+
+    struct x86_thread_state64_t {
+      uint64_t rax;
+      uint64_t rbx;
+      uint64_t rcx;
+      uint64_t rdx;
+      uint64_t rdi;
+      uint64_t rsi;
+      uint64_t rbp;
+      uint64_t rsp;
+      uint64_t r8;
+      uint64_t r9;
+      uint64_t r10;
+      uint64_t r11;
+      uint64_t r12;
+      uint64_t r13;
+      uint64_t r14;
+      uint64_t r15;
+      uint64_t rip;
+      uint64_t rflags;
+      uint64_t cs;
+      uint64_t fs;
+      uint64_t gs;
+    };
+
+    enum x86_fp_control_precis {
+      x86_FP_PREC_24B = 0,
+      x86_FP_PREC_53B = 2,
+      x86_FP_PREC_64B = 3
+    };
+
+    enum x86_fp_control_rc {
+      x86_FP_RND_NEAR = 0,
+      x86_FP_RND_DOWN = 1,
+      x86_FP_RND_UP = 2,
+      x86_FP_CHOP = 3
+    };
+
+    struct fp_control_t {
+      unsigned short
+       invalid :1,
+       denorm  :1,
+       zdiv    :1,
+       ovrfl   :1,
+       undfl   :1,
+       precis  :1,
+               :2,
+       pc      :2,
+       rc      :2,
+               :1,
+               :3;
+    };
+
+    struct fp_status_t {
+      unsigned short
+        invalid :1,
+        denorm  :1,
+        zdiv    :1,
+        ovrfl   :1,
+        undfl   :1,
+        precis  :1,
+        stkflt  :1,
+        errsumm :1,
+        c0      :1,
+        c1      :1,
+        c2      :1,
+        tos     :3,
+        c3      :1,
+        busy    :1;
+    };
+
+    struct mmst_reg_t {
+      char mmst_reg[10];
+      char mmst_rsrv[6];
+    };
+
+    struct xmm_reg_t {
+      char xmm_reg[16];
+    };
+
+    struct x86_float_state64_t {
+      int32_t fpu_reserved[2];
+      fp_control_t fpu_fcw;
+      fp_status_t fpu_fsw;
+      uint8_t fpu_ftw;
+      uint8_t fpu_rsrv1;
+      uint16_t fpu_fop;
+      uint32_t fpu_ip;
+      uint16_t fpu_cs;
+      uint16_t fpu_rsrv2;
+      uint32_t fpu_dp;
+      uint16_t fpu_ds;
+      uint16_t fpu_rsrv3;
+      uint32_t fpu_mxcsr;
+      uint32_t fpu_mxcsrmask;
+      mmst_reg_t fpu_stmm0;
+      mmst_reg_t fpu_stmm1;
+      mmst_reg_t fpu_stmm2;
+      mmst_reg_t fpu_stmm3;
+      mmst_reg_t fpu_stmm4;
+      mmst_reg_t fpu_stmm5;
+      mmst_reg_t fpu_stmm6;
+      mmst_reg_t fpu_stmm7;
+      xmm_reg_t fpu_xmm0;
+      xmm_reg_t fpu_xmm1;
+      xmm_reg_t fpu_xmm2;
+      xmm_reg_t fpu_xmm3;
+      xmm_reg_t fpu_xmm4;
+      xmm_reg_t fpu_xmm5;
+      xmm_reg_t fpu_xmm6;
+      xmm_reg_t fpu_xmm7;
+      xmm_reg_t fpu_xmm8;
+      xmm_reg_t fpu_xmm9;
+      xmm_reg_t fpu_xmm10;
+      xmm_reg_t fpu_xmm11;
+      xmm_reg_t fpu_xmm12;
+      xmm_reg_t fpu_xmm13;
+      xmm_reg_t fpu_xmm14;
+      xmm_reg_t fpu_xmm15;
+      char fpu_rsrv4[6*16];
+      uint32_t fpu_reserved1;
+    };
+
+    struct x86_exception_state64_t {
+      uint16_t trapno;
+      uint16_t cpu;
+      uint32_t err;
+      uint64_t faultvaddr;
+    };
+
+    inline void swapStruct(x86_thread_state64_t &x) {
+      sys::swapByteOrder(x.rax);
+      sys::swapByteOrder(x.rbx);
+      sys::swapByteOrder(x.rcx);
+      sys::swapByteOrder(x.rdx);
+      sys::swapByteOrder(x.rdi);
+      sys::swapByteOrder(x.rsi);
+      sys::swapByteOrder(x.rbp);
+      sys::swapByteOrder(x.rsp);
+      sys::swapByteOrder(x.r8);
+      sys::swapByteOrder(x.r9);
+      sys::swapByteOrder(x.r10);
+      sys::swapByteOrder(x.r11);
+      sys::swapByteOrder(x.r12);
+      sys::swapByteOrder(x.r13);
+      sys::swapByteOrder(x.r14);
+      sys::swapByteOrder(x.r15);
+      sys::swapByteOrder(x.rip);
+      sys::swapByteOrder(x.rflags);
+      sys::swapByteOrder(x.cs);
+      sys::swapByteOrder(x.fs);
+      sys::swapByteOrder(x.gs);
+    }
+
+    inline void swapStruct(x86_float_state64_t &x) {
+      sys::swapByteOrder(x.fpu_reserved[0]);
+      sys::swapByteOrder(x.fpu_reserved[1]);
+      // TODO swap: fp_control_t fpu_fcw;
+      // TODO swap: fp_status_t fpu_fsw;
+      sys::swapByteOrder(x.fpu_fop);
+      sys::swapByteOrder(x.fpu_ip);
+      sys::swapByteOrder(x.fpu_cs);
+      sys::swapByteOrder(x.fpu_rsrv2);
+      sys::swapByteOrder(x.fpu_dp);
+      sys::swapByteOrder(x.fpu_ds);
+      sys::swapByteOrder(x.fpu_rsrv3);
+      sys::swapByteOrder(x.fpu_mxcsr);
+      sys::swapByteOrder(x.fpu_mxcsrmask);
+      sys::swapByteOrder(x.fpu_reserved1);
+    }
+
+    inline void swapStruct(x86_exception_state64_t &x) {
+      sys::swapByteOrder(x.trapno);
+      sys::swapByteOrder(x.cpu);
+      sys::swapByteOrder(x.err);
+      sys::swapByteOrder(x.faultvaddr);
+    }
+
+    struct x86_state_hdr_t {
+      uint32_t flavor;
+      uint32_t count;
+    };
+
+    struct x86_thread_state_t {
+      x86_state_hdr_t tsh;
+      union {
+        x86_thread_state64_t ts64;
+      } uts;
+    };
+
+    struct x86_float_state_t {
+      x86_state_hdr_t fsh;
+      union {
+        x86_float_state64_t fs64;
+      } ufs;
+    };
+
+    struct x86_exception_state_t {
+      x86_state_hdr_t esh;
+      union {
+        x86_exception_state64_t es64;
+      } ues;
+    };
+
+    inline void swapStruct(x86_state_hdr_t &x) {
+      sys::swapByteOrder(x.flavor);
+      sys::swapByteOrder(x.count);
+    }
+
+    enum X86ThreadFlavors {
+      x86_THREAD_STATE32    = 1,
+      x86_FLOAT_STATE32     = 2,
+      x86_EXCEPTION_STATE32 = 3,
+      x86_THREAD_STATE64    = 4,
+      x86_FLOAT_STATE64     = 5,
+      x86_EXCEPTION_STATE64 = 6,
+      x86_THREAD_STATE      = 7,
+      x86_FLOAT_STATE       = 8,
+      x86_EXCEPTION_STATE   = 9,
+      x86_DEBUG_STATE32     = 10,
+      x86_DEBUG_STATE64     = 11,
+      x86_DEBUG_STATE       = 12
+    };
+
+    inline void swapStruct(x86_thread_state_t &x) {
+      swapStruct(x.tsh);
+      if (x.tsh.flavor == x86_THREAD_STATE64)
+        swapStruct(x.uts.ts64);
+    }
+
+    inline void swapStruct(x86_float_state_t &x) {
+      swapStruct(x.fsh);
+      if (x.fsh.flavor == x86_FLOAT_STATE64)
+        swapStruct(x.ufs.fs64);
+    }
+
+    inline void swapStruct(x86_exception_state_t &x) {
+      swapStruct(x.esh);
+      if (x.esh.flavor == x86_EXCEPTION_STATE64)
+        swapStruct(x.ues.es64);
+    }
+
+    const uint32_t x86_THREAD_STATE64_COUNT =
+      sizeof(x86_thread_state64_t) / sizeof(uint32_t);
+    const uint32_t x86_FLOAT_STATE64_COUNT =
+      sizeof(x86_float_state64_t) / sizeof(uint32_t);
+    const uint32_t x86_EXCEPTION_STATE64_COUNT =
+      sizeof(x86_exception_state64_t) / sizeof(uint32_t);
+
+    const uint32_t x86_THREAD_STATE_COUNT =
+      sizeof(x86_thread_state_t) / sizeof(uint32_t);
+    const uint32_t x86_FLOAT_STATE_COUNT =
+      sizeof(x86_float_state_t) / sizeof(uint32_t);
+    const uint32_t x86_EXCEPTION_STATE_COUNT =
+      sizeof(x86_exception_state_t) / sizeof(uint32_t);
+
   } // end namespace MachO
 } // end namespace llvm
 
diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h
index 9d16182..acffb46 100644
--- a/include/llvm/Support/MathExtras.h
+++ b/include/llvm/Support/MathExtras.h
@@ -35,78 +35,66 @@ enum ZeroBehavior {
   ZB_Width
 };
 
-/// \brief Count number of 0's from the least significant bit to the most
-///   stopping at the first 1.
-///
-/// Only unsigned integral types are allowed.
-///
-/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
-///   valid arguments.
-template <typename T>
-typename std::enable_if<std::numeric_limits<T>::is_integer &&
-                        !std::numeric_limits<T>::is_signed, std::size_t>::type
-countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
-  (void)ZB;
-
-  if (!Val)
-    return std::numeric_limits<T>::digits;
-  if (Val & 0x1)
-    return 0;
-
-  // Bisection method.
-  std::size_t ZeroBits = 0;
-  T Shift = std::numeric_limits<T>::digits >> 1;
-  T Mask = std::numeric_limits<T>::max() >> Shift;
-  while (Shift) {
-    if ((Val & Mask) == 0) {
-      Val >>= Shift;
-      ZeroBits |= Shift;
+namespace detail {
+template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
+  static std::size_t count(T Val, ZeroBehavior) {
+    if (!Val)
+      return std::numeric_limits<T>::digits;
+    if (Val & 0x1)
+      return 0;
+
+    // Bisection method.
+    std::size_t ZeroBits = 0;
+    T Shift = std::numeric_limits<T>::digits >> 1;
+    T Mask = std::numeric_limits<T>::max() >> Shift;
+    while (Shift) {
+      if ((Val & Mask) == 0) {
+        Val >>= Shift;
+        ZeroBits |= Shift;
+      }
+      Shift >>= 1;
+      Mask >>= Shift;
     }
-    Shift >>= 1;
-    Mask >>= Shift;
+    return ZeroBits;
   }
-  return ZeroBits;
-}
-
-// Disable signed.
-template <typename T>
-typename std::enable_if<std::numeric_limits<T>::is_integer &&
-                        std::numeric_limits<T>::is_signed, std::size_t>::type
-countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) LLVM_DELETED_FUNCTION;
+};
 
 #if __GNUC__ >= 4 || _MSC_VER
-template <>
-inline std::size_t countTrailingZeros<uint32_t>(uint32_t Val, ZeroBehavior ZB) {
-  if (ZB != ZB_Undefined && Val == 0)
-    return 32;
+template <typename T> struct TrailingZerosCounter<T, 4> {
+  static std::size_t count(T Val, ZeroBehavior ZB) {
+    if (ZB != ZB_Undefined && Val == 0)
+      return 32;
 
 #if __has_builtin(__builtin_ctz) || LLVM_GNUC_PREREQ(4, 0, 0)
-  return __builtin_ctz(Val);
+    return __builtin_ctz(Val);
 #elif _MSC_VER
-  unsigned long Index;
-  _BitScanForward(&Index, Val);
-  return Index;
+    unsigned long Index;
+    _BitScanForward(&Index, Val);
+    return Index;
 #endif
-}
+  }
+};
 
 #if !defined(_MSC_VER) || defined(_M_X64)
-template <>
-inline std::size_t countTrailingZeros<uint64_t>(uint64_t Val, ZeroBehavior ZB) {
-  if (ZB != ZB_Undefined && Val == 0)
-    return 64;
+template <typename T> struct TrailingZerosCounter<T, 8> {
+  static std::size_t count(T Val, ZeroBehavior ZB) {
+    if (ZB != ZB_Undefined && Val == 0)
+      return 64;
 
 #if __has_builtin(__builtin_ctzll) || LLVM_GNUC_PREREQ(4, 0, 0)
-  return __builtin_ctzll(Val);
+    return __builtin_ctzll(Val);
 #elif _MSC_VER
-  unsigned long Index;
-  _BitScanForward64(&Index, Val);
-  return Index;
+    unsigned long Index;
+    _BitScanForward64(&Index, Val);
+    return Index;
 #endif
-}
+  }
+};
 #endif
 #endif
+} // namespace detail
 
-/// \brief Count number of 0's from the most significant bit to the least
+/// \brief Count number of 0's from the least significant bit to the most
 ///   stopping at the first 1.
 ///
 /// Only unsigned integral types are allowed.
@@ -114,63 +102,81 @@ inline std::size_t countTrailingZeros<uint64_t>(uint64_t Val, ZeroBehavior ZB) {
 /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
 ///   valid arguments.
 template <typename T>
-typename std::enable_if<std::numeric_limits<T>::is_integer &&
-                        !std::numeric_limits<T>::is_signed, std::size_t>::type
-countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
-  (void)ZB;
-
-  if (!Val)
-    return std::numeric_limits<T>::digits;
-
-  // Bisection method.
-  std::size_t ZeroBits = 0;
-  for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
-    T Tmp = Val >> Shift;
-    if (Tmp)
-      Val = Tmp;
-    else
-      ZeroBits |= Shift;
+std::size_t countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
+  static_assert(std::numeric_limits<T>::is_integer &&
+                    !std::numeric_limits<T>::is_signed,
+                "Only unsigned integral types are allowed.");
+  return detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
+}
+
+namespace detail {
+template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
+  static std::size_t count(T Val, ZeroBehavior) {
+    if (!Val)
+      return std::numeric_limits<T>::digits;
+
+    // Bisection method.
+    std::size_t ZeroBits = 0;
+    for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
+      T Tmp = Val >> Shift;
+      if (Tmp)
+        Val = Tmp;
+      else
+        ZeroBits |= Shift;
+    }
+    return ZeroBits;
   }
-  return ZeroBits;
-}
-
-// Disable signed.
-template <typename T>
-typename std::enable_if<std::numeric_limits<T>::is_integer &&
-                        std::numeric_limits<T>::is_signed, std::size_t>::type
-countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) LLVM_DELETED_FUNCTION;
+};
 
 #if __GNUC__ >= 4 || _MSC_VER
-template <>
-inline std::size_t countLeadingZeros<uint32_t>(uint32_t Val, ZeroBehavior ZB) {
-  if (ZB != ZB_Undefined && Val == 0)
-    return 32;
+template <typename T> struct LeadingZerosCounter<T, 4> {
+  static std::size_t count(T Val, ZeroBehavior ZB) {
+    if (ZB != ZB_Undefined && Val == 0)
+      return 32;
 
 #if __has_builtin(__builtin_clz) || LLVM_GNUC_PREREQ(4, 0, 0)
-  return __builtin_clz(Val);
+    return __builtin_clz(Val);
 #elif _MSC_VER
-  unsigned long Index;
-  _BitScanReverse(&Index, Val);
-  return Index ^ 31;
+    unsigned long Index;
+    _BitScanReverse(&Index, Val);
+    return Index ^ 31;
 #endif
-}
+  }
+};
 
 #if !defined(_MSC_VER) || defined(_M_X64)
-template <>
-inline std::size_t countLeadingZeros<uint64_t>(uint64_t Val, ZeroBehavior ZB) {
-  if (ZB != ZB_Undefined && Val == 0)
-    return 64;
+template <typename T> struct LeadingZerosCounter<T, 8> {
+  static std::size_t count(T Val, ZeroBehavior ZB) {
+    if (ZB != ZB_Undefined && Val == 0)
+      return 64;
 
 #if __has_builtin(__builtin_clzll) || LLVM_GNUC_PREREQ(4, 0, 0)
-  return __builtin_clzll(Val);
+    return __builtin_clzll(Val);
 #elif _MSC_VER
-  unsigned long Index;
-  _BitScanReverse64(&Index, Val);
-  return Index ^ 63;
+    unsigned long Index;
+    _BitScanReverse64(&Index, Val);
+    return Index ^ 63;
 #endif
-}
+  }
+};
 #endif
 #endif
+} // namespace detail
+
+/// \brief Count number of 0's from the most significant bit to the least
+///   stopping at the first 1.
+///
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
+///   valid arguments.
+template <typename T>
+std::size_t countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
+  static_assert(std::numeric_limits<T>::is_integer &&
+                    !std::numeric_limits<T>::is_signed,
+                "Only unsigned integral types are allowed.");
+  return detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
+}
 
 /// \brief Get the index of the first set bit starting from the least
 ///   significant bit.
@@ -179,22 +185,13 @@ inline std::size_t countLeadingZeros<uint64_t>(uint64_t Val, ZeroBehavior ZB) {
 ///
 /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
 ///   valid arguments.
-template <typename T>
-typename std::enable_if<std::numeric_limits<T>::is_integer &&
-                       !std::numeric_limits<T>::is_signed, T>::type
-findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
+template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
   if (ZB == ZB_Max && Val == 0)
     return std::numeric_limits<T>::max();
 
   return countTrailingZeros(Val, ZB_Undefined);
 }
 
-// Disable signed.
-template <typename T>
-typename std::enable_if<std::numeric_limits<T>::is_integer &&
-                        std::numeric_limits<T>::is_signed, T>::type
-findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) LLVM_DELETED_FUNCTION;
-
 /// \brief Get the index of the last set bit starting from the least
 ///   significant bit.
 ///
@@ -202,10 +199,7 @@ findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) LLVM_DELETED_FUNCTION;
 ///
 /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
 ///   valid arguments.
-template <typename T>
-typename std::enable_if<std::numeric_limits<T>::is_integer &&
-                        !std::numeric_limits<T>::is_signed, T>::type
-findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
+template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
   if (ZB == ZB_Max && Val == 0)
     return std::numeric_limits<T>::max();
 
@@ -215,12 +209,6 @@ findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
          (std::numeric_limits<T>::digits - 1);
 }
 
-// Disable signed.
-template <typename T>
-typename std::enable_if<std::numeric_limits<T>::is_integer &&
-                        std::numeric_limits<T>::is_signed, T>::type
-findLastSet(T Val, ZeroBehavior ZB = ZB_Max) LLVM_DELETED_FUNCTION;
-
 /// \brief Macro compressed bit reversal table for 256 bits.
 ///
 /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
@@ -387,62 +375,78 @@ inline uint64_t ByteSwap_64(uint64_t Value) {
   return sys::SwapByteOrder_64(Value);
 }
 
-/// CountLeadingOnes_32 - this function performs the operation of
-/// counting the number of ones from the most significant bit to the first zero
-/// bit.  Ex. CountLeadingOnes_32(0xFF0FFF00) == 8.
-/// Returns 32 if the word is all ones.
-inline unsigned CountLeadingOnes_32(uint32_t Value) {
-  return countLeadingZeros(~Value);
-}
-
-/// CountLeadingOnes_64 - This function performs the operation
-/// of counting the number of ones from the most significant bit to the first
-/// zero bit (64 bit edition.)
-/// Returns 64 if the word is all ones.
-inline unsigned CountLeadingOnes_64(uint64_t Value) {
-  return countLeadingZeros(~Value);
-}
-
-/// CountTrailingOnes_32 - this function performs the operation of
-/// counting the number of ones from the least significant bit to the first zero
-/// bit.  Ex. CountTrailingOnes_32(0x00FF00FF) == 8.
-/// Returns 32 if the word is all ones.
-inline unsigned CountTrailingOnes_32(uint32_t Value) {
-  return countTrailingZeros(~Value);
-}
-
-/// CountTrailingOnes_64 - This function performs the operation
-/// of counting the number of ones from the least significant bit to the first
-/// zero bit (64 bit edition.)
-/// Returns 64 if the word is all ones.
-inline unsigned CountTrailingOnes_64(uint64_t Value) {
-  return countTrailingZeros(~Value);
+/// \brief Count the number of ones from the most significant bit to the first
+/// zero bit.
+///
+/// Ex. CountLeadingOnes(0xFF0FFF00) == 8.
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of all ones. Only ZB_Width and
+/// ZB_Undefined are valid arguments.
+template <typename T>
+std::size_t countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
+  static_assert(std::numeric_limits<T>::is_integer &&
+                    !std::numeric_limits<T>::is_signed,
+                "Only unsigned integral types are allowed.");
+  return countLeadingZeros(~Value, ZB);
 }
 
-/// CountPopulation_32 - this function counts the number of set bits in a value.
-/// Ex. CountPopulation(0xF000F000) = 8
-/// Returns 0 if the word is zero.
-inline unsigned CountPopulation_32(uint32_t Value) {
+/// \brief Count the number of ones from the least significant bit to the first
+/// zero bit.
+///
+/// Ex. countTrailingOnes(0x00FF00FF) == 8.
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of all ones. Only ZB_Width and
+/// ZB_Undefined are valid arguments.
+template <typename T>
+std::size_t countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
+  static_assert(std::numeric_limits<T>::is_integer &&
+                    !std::numeric_limits<T>::is_signed,
+                "Only unsigned integral types are allowed.");
+  return countTrailingZeros(~Value, ZB);
+}
+
+namespace detail {
+template <typename T, std::size_t SizeOfT> struct PopulationCounter {
+  static unsigned count(T Value) {
+    // Generic version, forward to 32 bits.
+    static_assert(SizeOfT <= 4, "Not implemented!");
 #if __GNUC__ >= 4
-  return __builtin_popcount(Value);
+    return __builtin_popcount(Value);
 #else
-  uint32_t v = Value - ((Value >> 1) & 0x55555555);
-  v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
-  return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
+    uint32_t v = Value;
+    v = v - ((v >> 1) & 0x55555555);
+    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+    return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
 #endif
-}
+  }
+};
 
-/// CountPopulation_64 - this function counts the number of set bits in a value,
-/// (64 bit edition.)
-inline unsigned CountPopulation_64(uint64_t Value) {
+template <typename T> struct PopulationCounter<T, 8> {
+  static unsigned count(T Value) {
 #if __GNUC__ >= 4
-  return __builtin_popcountll(Value);
+    return __builtin_popcountll(Value);
 #else
-  uint64_t v = Value - ((Value >> 1) & 0x5555555555555555ULL);
-  v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
-  v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
-  return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
+    uint64_t v = Value;
+    v = v - ((v >> 1) & 0x5555555555555555ULL);
+    v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
+    v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
+    return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
 #endif
+  }
+};
+} // namespace detail
+
+/// \brief Count the number of set bits in a value.
+/// Ex. countPopulation(0xF000F000) = 8
+/// Returns 0 if the word is zero.
+template <typename T>
+inline unsigned countPopulation(T Value) {
+  static_assert(std::numeric_limits<T>::is_integer &&
+                    !std::numeric_limits<T>::is_signed,
+                "Only unsigned integral types are allowed.");
+  return detail::PopulationCounter<T, sizeof(T)>::count(Value);
 }
 
 /// Log2_32 - This function returns the floor log base 2 of the specified value,
diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index e2f8d7e..cc65ca5 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h
@@ -40,8 +40,8 @@ class MemoryBuffer {
   const char *BufferStart; // Start of the buffer.
   const char *BufferEnd;   // End of the buffer.
 
-  MemoryBuffer(const MemoryBuffer &) LLVM_DELETED_FUNCTION;
-  MemoryBuffer &operator=(const MemoryBuffer &) LLVM_DELETED_FUNCTION;
+  MemoryBuffer(const MemoryBuffer &) = delete;
+  MemoryBuffer &operator=(const MemoryBuffer &) = delete;
 protected:
   MemoryBuffer() {}
   void init(const char *BufStart, const char *BufEnd,
diff --git a/include/llvm/Support/Mutex.h b/include/llvm/Support/Mutex.h
index 97dd501..ae69634 100644
--- a/include/llvm/Support/Mutex.h
+++ b/include/llvm/Support/Mutex.h
@@ -76,8 +76,8 @@ namespace llvm
     /// @name Do Not Implement
     /// @{
     private:
-      MutexImpl(const MutexImpl &) LLVM_DELETED_FUNCTION;
-      void operator=(const MutexImpl &) LLVM_DELETED_FUNCTION;
+      MutexImpl(const MutexImpl &) = delete;
+      void operator=(const MutexImpl &) = delete;
     /// @}
     };
 
diff --git a/include/llvm/Support/MutexGuard.h b/include/llvm/Support/MutexGuard.h
index b9f941d..07b64b6 100644
--- a/include/llvm/Support/MutexGuard.h
+++ b/include/llvm/Support/MutexGuard.h
@@ -26,8 +26,8 @@ namespace llvm {
   /// @brief Guard a section of code with a Mutex.
   class MutexGuard {
     sys::Mutex &M;
-    MutexGuard(const MutexGuard &) LLVM_DELETED_FUNCTION;
-    void operator=(const MutexGuard &) LLVM_DELETED_FUNCTION;
+    MutexGuard(const MutexGuard &) = delete;
+    void operator=(const MutexGuard &) = delete;
   public:
     MutexGuard(sys::Mutex &m) : M(m) { M.lock(); }
     ~MutexGuard() { M.unlock(); }
diff --git a/include/llvm/Support/OnDiskHashTable.h b/include/llvm/Support/OnDiskHashTable.h
index b039fae..52f133c 100644
--- a/include/llvm/Support/OnDiskHashTable.h
+++ b/include/llvm/Support/OnDiskHashTable.h
@@ -14,8 +14,8 @@
 #ifndef LLVM_SUPPORT_ONDISKHASHTABLE_H
 #define LLVM_SUPPORT_ONDISKHASHTABLE_H
 
-#include "llvm/Support/Allocator.h"
 #include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Allocator.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/Host.h"
diff --git a/include/llvm/Support/PrettyStackTrace.h b/include/llvm/Support/PrettyStackTrace.h
index 914141a..96afb60 100644
--- a/include/llvm/Support/PrettyStackTrace.h
+++ b/include/llvm/Support/PrettyStackTrace.h
@@ -30,8 +30,8 @@ namespace llvm {
   /// virtual stack trace.  This gets dumped out if the program crashes.
   class PrettyStackTraceEntry {
     const PrettyStackTraceEntry *NextEntry;
-    PrettyStackTraceEntry(const PrettyStackTraceEntry &) LLVM_DELETED_FUNCTION;
-    void operator=(const PrettyStackTraceEntry&) LLVM_DELETED_FUNCTION;
+    PrettyStackTraceEntry(const PrettyStackTraceEntry &) = delete;
+    void operator=(const PrettyStackTraceEntry&) = delete;
   public:
     PrettyStackTraceEntry();
     virtual ~PrettyStackTraceEntry();
diff --git a/include/llvm/Support/Process.h b/include/llvm/Support/Process.h
index 8616679..cfdd06c 100644
--- a/include/llvm/Support/Process.h
+++ b/include/llvm/Support/Process.h
@@ -38,111 +38,13 @@ class StringRef;
 
 namespace sys {
 
-class self_process;
-
-/// \brief Generic base class which exposes information about an operating
-/// system process.
-///
-/// This base class is the core interface behind any OS process. It exposes
-/// methods to query for generic information about a particular process.
-///
-/// Subclasses implement this interface based on the mechanisms available, and
-/// can optionally expose more interfaces unique to certain process kinds.
-class process {
-protected:
-  /// \brief Only specific subclasses of process objects can be destroyed.
-  virtual ~process();
-
-public:
-  /// \brief Operating system specific type to identify a process.
-  ///
-  /// Note that the windows one is defined to 'unsigned long' as this is the
-  /// documented type for DWORD on windows, and we don't want to pull in the
-  /// Windows headers here.
-#if defined(LLVM_ON_UNIX)
-  typedef pid_t id_type;
-#elif defined(LLVM_ON_WIN32)
-  typedef unsigned long id_type; // Must match the type of DWORD.
-#else
-#error Unsupported operating system.
-#endif
-
-  /// \brief Get the operating system specific identifier for this process.
-  virtual id_type get_id() = 0;
-
-  /// \brief Get the user time consumed by this process.
-  ///
-  /// Note that this is often an approximation and may be zero on platforms
-  /// where we don't have good support for the functionality.
-  virtual TimeValue get_user_time() const = 0;
-
-  /// \brief Get the system time consumed by this process.
-  ///
-  /// Note that this is often an approximation and may be zero on platforms
-  /// where we don't have good support for the functionality.
-  virtual TimeValue get_system_time() const = 0;
-
-  /// \brief Get the wall time consumed by this process.
-  ///
-  /// Note that this is often an approximation and may be zero on platforms
-  /// where we don't have good support for the functionality.
-  virtual TimeValue get_wall_time() const = 0;
-
-  /// \name Static factory routines for processes.
-  /// @{
-
-  /// \brief Get the process object for the current process.
-  static self_process *get_self();
-
-  /// @}
-
-};
-
-/// \brief The specific class representing the current process.
-///
-/// The current process can both specialize the implementation of the routines
-/// and can expose certain information not available for other OS processes.
-class self_process : public process {
-  friend class process;
-
-  /// \brief Private destructor, as users shouldn't create objects of this
-  /// type.
-  virtual ~self_process();
-
-public:
-  id_type get_id() override;
-  TimeValue get_user_time() const override;
-  TimeValue get_system_time() const override;
-  TimeValue get_wall_time() const override;
-
-  /// \name Process configuration (sysconf on POSIX)
-  /// @{
-
-  /// \brief Get the virtual memory page size.
-  ///
-  /// Query the operating system for this process's page size.
-  size_t page_size() const { return PageSize; };
-
-  /// @}
-
-private:
-  /// \name Cached process state.
-  /// @{
-
-  /// \brief Cached page size, this cannot vary during the life of the process.
-  size_t PageSize;
-
-  /// @}
-
-  /// \brief Constructor, used by \c process::get_self() only.
-  self_process();
-};
-
 
 /// \brief A collection of legacy interfaces for querying information about the
 /// current executing process.
 class Process {
 public:
+  static unsigned getPageSize();
+
   /// \brief Return process memory usage.
   /// This static function will return the total amount of memory allocated
   /// by the process. This only counts the memory allocated via the malloc,
diff --git a/include/llvm/Support/RWMutex.h b/include/llvm/Support/RWMutex.h
index b80b855..9a5e421 100644
--- a/include/llvm/Support/RWMutex.h
+++ b/include/llvm/Support/RWMutex.h
@@ -76,8 +76,8 @@ namespace llvm
     /// @name Do Not Implement
     /// @{
     private:
-      RWMutexImpl(const RWMutexImpl & original) LLVM_DELETED_FUNCTION;
-      void operator=(const RWMutexImpl &) LLVM_DELETED_FUNCTION;
+      RWMutexImpl(const RWMutexImpl & original) = delete;
+      void operator=(const RWMutexImpl &) = delete;
     /// @}
     };
 
diff --git a/include/llvm/Support/RandomNumberGenerator.h b/include/llvm/Support/RandomNumberGenerator.h
index cadc713..7446558 100644
--- a/include/llvm/Support/RandomNumberGenerator.h
+++ b/include/llvm/Support/RandomNumberGenerator.h
@@ -7,9 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines an abstraction for random number generation (RNG).
-// Note that the current implementation is not cryptographically secure
-// as it uses the C++11 <random> facilities.
+// This file defines an abstraction for deterministic random number
+// generation (RNG).  Note that the current implementation is not
+// cryptographically secure as it uses the C++11 <random> facilities.
 //
 //===----------------------------------------------------------------------===//
 
@@ -24,33 +24,34 @@
 namespace llvm {
 
 /// A random number generator.
-/// Instances of this class should not be shared across threads.
+///
+/// Instances of this class should not be shared across threads. The
+/// seed should be set by passing the -rng-seed=<uint64> option. Use
+/// Module::createRNG to create a new RNG instance for use with that
+/// module.
 class RandomNumberGenerator {
 public:
-  /// Seeds and salts the underlying RNG engine. The salt of type StringRef
-  /// is passed into the constructor. The seed can be set on the command
-  /// line via -rng-seed=<uint64>.
-  /// The reason for the salt is to ensure different random streams even if
-  /// the same seed is used for multiple invocations of the compiler.
-  /// A good salt value should add additional entropy and be constant across
-  /// different machines (i.e., no paths) to allow for reproducible builds.
-  /// An instance of this class can be retrieved from the current Module.
-  /// \see Module::getRNG
-  RandomNumberGenerator(StringRef Salt);
-
   /// Returns a random number in the range [0, Max).
-  uint64_t next(uint64_t Max);
+  uint_fast64_t operator()();
 
 private:
+  /// Seeds and salts the underlying RNG engine.
+  ///
+  /// This constructor should not be used directly. Instead use
+  /// Module::createRNG to create a new RNG salted with the Module ID.
+  RandomNumberGenerator(StringRef Salt);
+
   // 64-bit Mersenne Twister by Matsumoto and Nishimura, 2000
   // http://en.cppreference.com/w/cpp/numeric/random/mersenne_twister_engine
+  // This RNG is deterministically portable across C++11
+  // implementations.
   std::mt19937_64 Generator;
 
   // Noncopyable.
-  RandomNumberGenerator(const RandomNumberGenerator &other)
-      LLVM_DELETED_FUNCTION;
-  RandomNumberGenerator &
-  operator=(const RandomNumberGenerator &other) LLVM_DELETED_FUNCTION;
+  RandomNumberGenerator(const RandomNumberGenerator &other) = delete;
+  RandomNumberGenerator &operator=(const RandomNumberGenerator &other) = delete;
+
+  friend class Module;
 };
 }
 
diff --git a/include/llvm/Support/Regex.h b/include/llvm/Support/Regex.h
index bf533ca..5e5a5a3 100644
--- a/include/llvm/Support/Regex.h
+++ b/include/llvm/Support/Regex.h
@@ -46,7 +46,7 @@ namespace llvm {
 
     /// Compiles the given regular expression \p Regex.
     Regex(StringRef Regex, unsigned Flags = NoFlags);
-    Regex(const Regex &) LLVM_DELETED_FUNCTION;
+    Regex(const Regex &) = delete;
     Regex &operator=(Regex regex) {
       std::swap(preg, regex.preg);
       std::swap(error, regex.error);
diff --git a/include/llvm/Support/Registry.h b/include/llvm/Support/Registry.h
index b0c2e89..1f81d07 100644
--- a/include/llvm/Support/Registry.h
+++ b/include/llvm/Support/Registry.h
@@ -16,7 +16,6 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Compiler.h"
-
 #include <memory>
 
 namespace llvm {
@@ -42,7 +41,7 @@ namespace llvm {
   /// is necessary to define an alternate traits class.
   template <typename T>
   class RegistryTraits {
-    RegistryTraits() LLVM_DELETED_FUNCTION;
+    RegistryTraits() = delete;
 
   public:
     typedef SimpleRegistryEntry<T> entry;
@@ -68,7 +67,7 @@ namespace llvm {
     class iterator;
 
   private:
-    Registry() LLVM_DELETED_FUNCTION;
+    Registry() = delete;
 
     static void Announce(const entry &E) {
       for (listener *Cur = ListenerHead; Cur; Cur = Cur->Next)
@@ -121,6 +120,10 @@ namespace llvm {
     static iterator begin() { return iterator(Head); }
     static iterator end()   { return iterator(nullptr); }
 
+    static iterator_range<iterator> entries() {
+      return iterator_range<iterator>(begin(), end());
+    }
+
 
     /// Abstract base class for registry listeners, which are informed when new
     /// entries are added to the registry. Simply subclass and instantiate:
diff --git a/include/llvm/Support/ScaledNumber.h b/include/llvm/Support/ScaledNumber.h
index 2bd7e74..a1c4c80 100644
--- a/include/llvm/Support/ScaledNumber.h
+++ b/include/llvm/Support/ScaledNumber.h
@@ -23,7 +23,6 @@
 #define LLVM_SUPPORT_SCALEDNUMBER_H
 
 #include "llvm/Support/MathExtras.h"
-
 #include <algorithm>
 #include <cstdint>
 #include <limits>
diff --git a/include/llvm/Support/Signals.h b/include/llvm/Support/Signals.h
index 6cbc1f6..6b1da2a 100644
--- a/include/llvm/Support/Signals.h
+++ b/include/llvm/Support/Signals.h
@@ -39,6 +39,9 @@ namespace sys {
   /// @brief Print a stack trace if a fatal signal occurs.
   void PrintStackTraceOnErrorSignal();
 
+  /// Disable all system dialog boxes that appear when the process crashes.
+  void DisableSystemDialogsOnCrash();
+
   /// \brief Print the stack trace using the given \c FILE object.
   void PrintStackTrace(FILE *);
 
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index f9e114b..d492748 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -73,8 +73,8 @@ private:
 
   bool isValidBufferID(unsigned i) const { return i && i <= Buffers.size(); }
 
-  SourceMgr(const SourceMgr&) LLVM_DELETED_FUNCTION;
-  void operator=(const SourceMgr&) LLVM_DELETED_FUNCTION;
+  SourceMgr(const SourceMgr&) = delete;
+  void operator=(const SourceMgr&) = delete;
 public:
   SourceMgr()
     : LineNoCache(nullptr), DiagHandler(nullptr), DiagContext(nullptr) {}
diff --git a/include/llvm/Support/SpecialCaseList.h b/include/llvm/Support/SpecialCaseList.h
index 313212e..ce693c5 100644
--- a/include/llvm/Support/SpecialCaseList.h
+++ b/include/llvm/Support/SpecialCaseList.h
@@ -49,6 +49,8 @@
 #define LLVM_SUPPORT_SPECIALCASELIST_H
 
 #include "llvm/ADT/StringMap.h"
+#include <string>
+#include <vector>
 
 namespace llvm {
 class MemoryBuffer;
@@ -57,18 +59,18 @@ class StringRef;
 
 class SpecialCaseList {
 public:
-  /// Parses the special case list from a file. If Path is empty, returns
-  /// an empty special case list. On failure, returns 0 and writes an error
-  /// message to string.
-  static std::unique_ptr<SpecialCaseList> create(StringRef Path,
-                                                  std::string &Error);
+  /// Parses the special case list entries from files. On failure, returns
+  /// 0 and writes an error message to string.
+  static std::unique_ptr<SpecialCaseList>
+  create(const std::vector<std::string> &Paths, std::string &Error);
   /// Parses the special case list from a memory buffer. On failure, returns
   /// 0 and writes an error message to string.
   static std::unique_ptr<SpecialCaseList> create(const MemoryBuffer *MB,
-                                                  std::string &Error);
-  /// Parses the special case list from a file. On failure, reports a fatal
-  /// error.
-  static std::unique_ptr<SpecialCaseList> createOrDie(StringRef Path);
+                                                 std::string &Error);
+  /// Parses the special case list entries from files. On failure, reports a
+  /// fatal error.
+  static std::unique_ptr<SpecialCaseList>
+  createOrDie(const std::vector<std::string> &Paths);
 
   ~SpecialCaseList();
 
@@ -81,15 +83,19 @@ public:
                  StringRef Category = StringRef()) const;
 
 private:
-  SpecialCaseList(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
-  SpecialCaseList &operator=(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
+  SpecialCaseList(SpecialCaseList const &) = delete;
+  SpecialCaseList &operator=(SpecialCaseList const &) = delete;
 
   struct Entry;
-  StringMap<StringMap<Entry> > Entries;
+  StringMap<StringMap<Entry>> Entries;
+  StringMap<StringMap<std::string>> Regexps;
+  bool IsCompiled;
 
   SpecialCaseList();
   /// Parses just-constructed SpecialCaseList entries from a memory buffer.
   bool parse(const MemoryBuffer *MB, std::string &Error);
+  /// compile() should be called once, after parsing all the memory buffers.
+  void compile();
 };
 
 }  // namespace llvm
diff --git a/include/llvm/Support/StreamingMemoryObject.h b/include/llvm/Support/StreamingMemoryObject.h
index 6957c6e..f914817 100644
--- a/include/llvm/Support/StreamingMemoryObject.h
+++ b/include/llvm/Support/StreamingMemoryObject.h
@@ -65,23 +65,24 @@ private:
   // Most of the requests will be small, but we fetch at kChunkSize bytes
   // at a time to avoid making too many potentially expensive GetBytes calls
   bool fetchToPos(size_t Pos) const {
-    if (EOFReached) return Pos < ObjectSize;
+    if (EOFReached)
+      return Pos < ObjectSize;
     while (Pos >= BytesRead) {
       Bytes.resize(BytesRead + BytesSkipped + kChunkSize);
       size_t bytes = Streamer->GetBytes(&Bytes[BytesRead + BytesSkipped],
                                         kChunkSize);
       BytesRead += bytes;
-      if (BytesRead <= Pos) { // reached EOF/ran out of bytes
+      if (bytes != kChunkSize) { // reached EOF/ran out of bytes
         ObjectSize = BytesRead;
         EOFReached = true;
-        return false;
+        break;
       }
     }
-    return true;
+    return Pos < BytesRead;
   }
 
-  StreamingMemoryObject(const StreamingMemoryObject&) LLVM_DELETED_FUNCTION;
-  void operator=(const StreamingMemoryObject&) LLVM_DELETED_FUNCTION;
+  StreamingMemoryObject(const StreamingMemoryObject&) = delete;
+  void operator=(const StreamingMemoryObject&) = delete;
 };
 
 MemoryObject *getNonStreamedMemoryObject(
diff --git a/include/llvm/Support/StringPool.h b/include/llvm/Support/StringPool.h
index 3e04653..675adde 100644
--- a/include/llvm/Support/StringPool.h
+++ b/include/llvm/Support/StringPool.h
@@ -29,8 +29,8 @@
 #ifndef LLVM_SUPPORT_STRINGPOOL_H
 #define LLVM_SUPPORT_STRINGPOOL_H
 
-#include "llvm/Support/Compiler.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Compiler.h"
 #include <cassert>
 #include <new>
 
@@ -129,7 +129,7 @@ namespace llvm {
     }
 
     inline const char *operator*() const { return begin(); }
-    inline LLVM_EXPLICIT operator bool() const { return S != nullptr; }
+    inline explicit operator bool() const { return S != nullptr; }
 
     inline bool operator==(const PooledStringPtr &That) const { return S == That.S; }
     inline bool operator!=(const PooledStringPtr &That) const { return S != That.S; }
diff --git a/include/llvm/Support/SwapByteOrder.h b/include/llvm/Support/SwapByteOrder.h
index 9c5a3c5..70564be 100644
--- a/include/llvm/Support/SwapByteOrder.h
+++ b/include/llvm/Support/SwapByteOrder.h
@@ -94,6 +94,26 @@ inline signed long long getSwappedBytes(signed long long C) {
   return SwapByteOrder_64(C);
 }
 
+inline float getSwappedBytes(float C) {
+  union {
+    uint32_t i;
+    float f;
+  } in, out;
+  in.f = C;
+  out.i = SwapByteOrder_32(in.i);
+  return out.f;
+}
+
+inline float getSwappedBytes(double C) {
+  union {
+    uint64_t i;
+    double d;
+  } in, out;
+  in.d = C;
+  out.i = SwapByteOrder_64(in.i);
+  return out.d;
+}
+
 template<typename T>
 inline void swapByteOrder(T &Value) {
   Value = getSwappedBytes(Value);
diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h
index 8ac4b90..7a71f6d 100644
--- a/include/llvm/Support/TargetRegistry.h
+++ b/include/llvm/Support/TargetRegistry.h
@@ -23,6 +23,7 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/CodeGen.h"
 #include <cassert>
+#include <memory>
 #include <string>
 
 namespace llvm {
@@ -46,6 +47,7 @@ namespace llvm {
   class MCRelocationInfo;
   class MCTargetAsmParser;
   class MCTargetOptions;
+  class MCTargetStreamer;
   class TargetMachine;
   class TargetOptions;
   class raw_ostream;
@@ -61,9 +63,8 @@ namespace llvm {
 
   MCSymbolizer *createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
                                    LLVMSymbolLookupCallback SymbolLookUp,
-                                   void *DisInfo,
-                                   MCContext *Ctx,
-                                   MCRelocationInfo *RelInfo);
+                                   void *DisInfo, MCContext *Ctx,
+                                   std::unique_ptr<MCRelocationInfo> &&RelInfo);
 
   /// Target - Wrapper for Target specific information.
   ///
@@ -99,8 +100,11 @@ namespace llvm {
                                                   Reloc::Model RM,
                                                   CodeModel::Model CM,
                                                   CodeGenOpt::Level OL);
-    typedef AsmPrinter *(*AsmPrinterCtorTy)(TargetMachine &TM,
-                                            MCStreamer &Streamer);
+    // If it weren't for layering issues (this header is in llvm/Support, but
+    // depends on MC?) this should take the Streamer by value rather than rvalue
+    // reference.
+    typedef AsmPrinter *(*AsmPrinterCtorTy)(
+        TargetMachine &TM, std::unique_ptr<MCStreamer> &&Streamer);
     typedef MCAsmBackend *(*MCAsmBackendCtorTy)(const Target &T,
                                                 const MCRegisterInfo &MRI,
                                                 StringRef TT,
@@ -135,15 +139,13 @@ namespace llvm {
                                              MCCodeEmitter *CE,
                                              MCAsmBackend *TAB,
                                              bool ShowInst);
-    typedef MCStreamer *(*NullStreamerCtorTy)(MCContext &Ctx);
+    typedef MCTargetStreamer *(*NullTargetStreamerCtorTy)(MCStreamer &S);
     typedef MCRelocationInfo *(*MCRelocationInfoCtorTy)(StringRef TT,
                                                         MCContext &Ctx);
-    typedef MCSymbolizer *(*MCSymbolizerCtorTy)(StringRef TT,
-                                   LLVMOpInfoCallback GetOpInfo,
-                                   LLVMSymbolLookupCallback SymbolLookUp,
-                                   void *DisInfo,
-                                   MCContext *Ctx,
-                                   MCRelocationInfo *RelInfo);
+    typedef MCSymbolizer *(*MCSymbolizerCtorTy)(
+        StringRef TT, LLVMOpInfoCallback GetOpInfo,
+        LLVMSymbolLookupCallback SymbolLookUp, void *DisInfo, MCContext *Ctx,
+        std::unique_ptr<MCRelocationInfo> &&RelInfo);
 
   private:
     /// Next - The next registered target in the linked list, maintained by the
@@ -222,9 +224,9 @@ namespace llvm {
     /// AsmStreamer, if registered (default = llvm::createAsmStreamer).
     AsmStreamerCtorTy AsmStreamerCtorFn;
 
-    /// Construction function for this target's NullStreamer, if registered
-    /// (default = llvm::createNullStreamer).
-    NullStreamerCtorTy NullStreamerCtorFn;
+    /// Construction function for this target's null TargetStreamer, if
+    /// registered (default = nullptr).
+    NullTargetStreamerCtorTy NullTargetStreamerCtorFn;
 
     /// MCRelocationInfoCtorFn - Construction function for this target's
     /// MCRelocationInfo, if registered (default = llvm::createMCRelocationInfo)
@@ -236,8 +238,8 @@ namespace llvm {
 
   public:
     Target()
-        : AsmStreamerCtorFn(nullptr), NullStreamerCtorFn(nullptr),
-          MCRelocationInfoCtorFn(nullptr), MCSymbolizerCtorFn(nullptr) {}
+        : AsmStreamerCtorFn(nullptr), MCRelocationInfoCtorFn(nullptr),
+          MCSymbolizerCtorFn(nullptr) {}
 
     /// @name Target Information
     /// @{
@@ -376,10 +378,11 @@ namespace llvm {
 
     /// createAsmPrinter - Create a target specific assembly printer pass.  This
     /// takes ownership of the MCStreamer object.
-    AsmPrinter *createAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) const{
+    AsmPrinter *createAsmPrinter(TargetMachine &TM,
+                                 std::unique_ptr<MCStreamer> &&Streamer) const {
       if (!AsmPrinterCtorFn)
         return nullptr;
-      return AsmPrinterCtorFn(TM, Streamer);
+      return AsmPrinterCtorFn(TM, std::move(Streamer));
     }
 
     MCDisassembler *createMCDisassembler(const MCSubtargetInfo &STI,
@@ -446,9 +449,15 @@ namespace llvm {
     }
 
     MCStreamer *createNullStreamer(MCContext &Ctx) const {
-      if (NullStreamerCtorFn)
-        return NullStreamerCtorFn(Ctx);
-      return llvm::createNullStreamer(Ctx);
+      MCStreamer *S = llvm::createNullStreamer(Ctx);
+      createNullTargetStreamer(*S);
+      return S;
+    }
+
+    MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) const {
+      if (NullTargetStreamerCtorFn)
+        return NullTargetStreamerCtorFn(S);
+      return nullptr;
     }
 
     /// createMCRelocationInfo - Create a target specific MCRelocationInfo.
@@ -474,12 +483,12 @@ namespace llvm {
     /// \param RelInfo The relocation information for this target. Takes ownership.
     MCSymbolizer *
     createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
-                       LLVMSymbolLookupCallback SymbolLookUp,
-                       void *DisInfo,
-                       MCContext *Ctx, MCRelocationInfo *RelInfo) const {
+                       LLVMSymbolLookupCallback SymbolLookUp, void *DisInfo,
+                       MCContext *Ctx,
+                       std::unique_ptr<MCRelocationInfo> &&RelInfo) const {
       MCSymbolizerCtorTy Fn =
           MCSymbolizerCtorFn ? MCSymbolizerCtorFn : llvm::createMCSymbolizer;
-      return Fn(TT, GetOpInfo, SymbolLookUp, DisInfo, Ctx, RelInfo);
+      return Fn(TT, GetOpInfo, SymbolLookUp, DisInfo, Ctx, std::move(RelInfo));
     }
 
     /// @}
@@ -777,8 +786,9 @@ namespace llvm {
       T.AsmStreamerCtorFn = Fn;
     }
 
-    static void RegisterNullStreamer(Target &T, Target::NullStreamerCtorTy Fn) {
-      T.NullStreamerCtorFn = Fn;
+    static void
+    RegisterNullTargetStreamer(Target &T, Target::NullTargetStreamerCtorTy Fn) {
+      T.NullTargetStreamerCtorFn = Fn;
     }
 
     /// RegisterMCRelocationInfo - Register an MCRelocationInfo
@@ -1121,8 +1131,9 @@ namespace llvm {
     }
 
   private:
-    static AsmPrinter *Allocator(TargetMachine &TM, MCStreamer &Streamer) {
-      return new AsmPrinterImpl(TM, Streamer);
+    static AsmPrinter *Allocator(TargetMachine &TM,
+                                 std::unique_ptr<MCStreamer> &&Streamer) {
+      return new AsmPrinterImpl(TM, std::move(Streamer));
     }
   };
 
diff --git a/include/llvm/Support/ThreadLocal.h b/include/llvm/Support/ThreadLocal.h
index 7518626..427a67e 100644
--- a/include/llvm/Support/ThreadLocal.h
+++ b/include/llvm/Support/ThreadLocal.h
@@ -36,7 +36,7 @@ namespace llvm {
       ThreadLocalImpl();
       virtual ~ThreadLocalImpl();
       void setInstance(const void* d);
-      const void* getInstance();
+      void *getInstance();
       void removeInstance();
     };
 
diff --git a/include/llvm/Support/Threading.h b/include/llvm/Support/Threading.h
index 7e87584..3cca1d6 100644
--- a/include/llvm/Support/Threading.h
+++ b/include/llvm/Support/Threading.h
@@ -21,7 +21,8 @@ namespace llvm {
   bool llvm_is_multithreaded();
 
   /// llvm_execute_on_thread - Execute the given \p UserFn on a separate
-  /// thread, passing it the provided \p UserData.
+  /// thread, passing it the provided \p UserData and waits for thread 
+  /// completion.
   ///
   /// This function does not guarantee that the code will actually be executed
   /// on a separate thread or honoring the requested stack size, but tries to do
diff --git a/include/llvm/Support/Timer.h b/include/llvm/Support/Timer.h
index 45c1828..442b361 100644
--- a/include/llvm/Support/Timer.h
+++ b/include/llvm/Support/Timer.h
@@ -126,7 +126,7 @@ private:
 ///
 class TimeRegion {
   Timer *T;
-  TimeRegion(const TimeRegion &) LLVM_DELETED_FUNCTION;
+  TimeRegion(const TimeRegion &) = delete;
 public:
   explicit TimeRegion(Timer &t) : T(&t) {
     T->startTimer();
@@ -164,8 +164,8 @@ class TimerGroup {
   std::vector<std::pair<TimeRecord, std::string> > TimersToPrint;
   
   TimerGroup **Prev, *Next; // Doubly linked list of TimerGroup's.
-  TimerGroup(const TimerGroup &TG) LLVM_DELETED_FUNCTION;
-  void operator=(const TimerGroup &TG) LLVM_DELETED_FUNCTION;
+  TimerGroup(const TimerGroup &TG) = delete;
+  void operator=(const TimerGroup &TG) = delete;
 public:
   explicit TimerGroup(StringRef name);
   ~TimerGroup();
diff --git a/include/llvm/Support/UniqueLock.h b/include/llvm/Support/UniqueLock.h
index 5a4c273..529284d 100644
--- a/include/llvm/Support/UniqueLock.h
+++ b/include/llvm/Support/UniqueLock.h
@@ -29,8 +29,8 @@ namespace llvm {
     MutexT *M;
     bool locked;
 
-    unique_lock(const unique_lock &) LLVM_DELETED_FUNCTION;
-    void operator=(const unique_lock &) LLVM_DELETED_FUNCTION;
+    unique_lock(const unique_lock &) = delete;
+    void operator=(const unique_lock &) = delete;
   public:
     unique_lock() : M(nullptr), locked(false) {}
     explicit unique_lock(MutexT &m) : M(&m), locked(true) { M->lock(); }
diff --git a/include/llvm/Support/Watchdog.h b/include/llvm/Support/Watchdog.h
index b58496b..01e1d92 100644
--- a/include/llvm/Support/Watchdog.h
+++ b/include/llvm/Support/Watchdog.h
@@ -29,8 +29,8 @@ namespace llvm {
       ~Watchdog();
     private:
       // Noncopyable.
-      Watchdog(const Watchdog &other) LLVM_DELETED_FUNCTION;
-      Watchdog &operator=(const Watchdog &other) LLVM_DELETED_FUNCTION;
+      Watchdog(const Watchdog &other) = delete;
+      Watchdog &operator=(const Watchdog &other) = delete;
     };
   }
 }
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 023dcee7..78829f8 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -447,6 +447,7 @@ public:
 
   virtual void beginEnumScalar() = 0;
   virtual bool matchEnumScalar(const char*, bool) = 0;
+  virtual bool matchEnumFallback() = 0;
   virtual void endEnumScalar() = 0;
 
   virtual bool beginBitSetScalar(bool &) = 0;
@@ -472,6 +473,16 @@ public:
     }
   }
 
+  template <typename FBT, typename T>
+  void enumFallback(T &Val) {
+    if ( matchEnumFallback() ) {
+      // FIXME: Force integral conversion to allow strong typedefs to convert.
+      FBT Res = (uint64_t)Val;
+      yamlize(*this, Res, true);
+      Val = (uint64_t)Res;
+    }
+  }
+
   template <typename T>
   void bitSetCase(T &Val, const char* Str, const T ConstVal) {
     if ( bitSetMatch(Str, outputting() && (Val & ConstVal) == ConstVal) ) {
@@ -532,7 +543,7 @@ public:
   void mapOptional(const char* Key, T& Val, const T& Default) {
     this->processKeyWithDefault(Key, Val, Default, false);
   }
-  
+
 private:
   template <typename T>
   void processKeyWithDefault(const char *Key, Optional<T> &Val,
@@ -706,7 +717,7 @@ struct ScalarTraits<StringRef> {
   static StringRef input(StringRef, void*, StringRef &);
   static bool mustQuote(StringRef S) { return needsQuotes(S); }
 };
- 
+
 template<>
 struct ScalarTraits<std::string> {
   static void output(const std::string &, void*, llvm::raw_ostream &);
@@ -899,6 +910,7 @@ private:
   void endFlowSequence() override;
   void beginEnumScalar() override;
   bool matchEnumScalar(const char*, bool) override;
+  bool matchEnumFallback() override;
   void endEnumScalar() override;
   bool beginBitSetScalar(bool &) override;
   bool bitSetMatch(const char *, bool ) override;
@@ -1026,6 +1038,7 @@ public:
   void endFlowSequence() override;
   void beginEnumScalar() override;
   bool matchEnumScalar(const char*, bool) override;
+  bool matchEnumFallback() override;
   void endEnumScalar() override;
   bool beginBitSetScalar(bool &) override;
   bool bitSetMatch(const char *, bool ) override;
diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h
index c9ef637..94686d9 100644
--- a/include/llvm/Support/raw_ostream.h
+++ b/include/llvm/Support/raw_ostream.h
@@ -38,8 +38,8 @@ namespace llvm {
 /// a chunk at a time.
 class raw_ostream {
 private:
-  void operator=(const raw_ostream &) LLVM_DELETED_FUNCTION;
-  raw_ostream(const raw_ostream &) LLVM_DELETED_FUNCTION;
+  void operator=(const raw_ostream &) = delete;
+  raw_ostream(const raw_ostream &) = delete;
 
   /// The buffer is handled in such a way that the buffer is
   /// uninitialized, unbuffered, or out of space when OutBufCur >=
diff --git a/include/llvm/Support/type_traits.h b/include/llvm/Support/type_traits.h
index 70953a9..45465ae 100644
--- a/include/llvm/Support/type_traits.h
+++ b/include/llvm/Support/type_traits.h
@@ -28,9 +28,17 @@ namespace llvm {
 /// type can be copied around with memcpy instead of running ctors etc.
 template <typename T>
 struct isPodLike {
-#if __has_feature(is_trivially_copyable)
+  // std::is_trivially_copyable is available in libc++ with clang, libstdc++
+  // that comes with GCC 5.
+#if (__has_feature(is_trivially_copyable) && defined(_LIBCPP_VERSION)) ||      \
+    (defined(__GNUC__) && __GNUC__ >= 5)
   // If the compiler supports the is_trivially_copyable trait use it, as it
   // matches the definition of isPodLike closely.
+  static const bool value = std::is_trivially_copyable<T>::value;
+#elif __has_feature(is_trivially_copyable)
+  // Use the internal name if the compiler supports is_trivially_copyable but we
+  // don't know if the standard library does. This is the case for clang in
+  // conjunction with libstdc++ from GCC 4.x.
   static const bool value = __is_trivially_copyable(T);
 #else
   // If we don't know anything else, we can (at least) assume that all non-class
diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index 8c5452e..e8d22d9 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@@ -454,8 +454,8 @@ protected:
 
 private:
   const InitKind Kind;
-  Init(const Init &) LLVM_DELETED_FUNCTION;
-  Init &operator=(const Init &) LLVM_DELETED_FUNCTION;
+  Init(const Init &) = delete;
+  Init &operator=(const Init &) = delete;
   virtual void anchor();
 
 public:
@@ -561,8 +561,8 @@ inline raw_ostream &operator<<(raw_ostream &OS, const Init &I) {
 class TypedInit : public Init {
   RecTy *Ty;
 
-  TypedInit(const TypedInit &Other) LLVM_DELETED_FUNCTION;
-  TypedInit &operator=(const TypedInit &Other) LLVM_DELETED_FUNCTION;
+  TypedInit(const TypedInit &Other) = delete;
+  TypedInit &operator=(const TypedInit &Other) = delete;
 
 protected:
   explicit TypedInit(InitKind K, RecTy *T) : Init(K), Ty(T) {}
@@ -596,8 +596,8 @@ public:
 ///
 class UnsetInit : public Init {
   UnsetInit() : Init(IK_UnsetInit) {}
-  UnsetInit(const UnsetInit &) LLVM_DELETED_FUNCTION;
-  UnsetInit &operator=(const UnsetInit &Other) LLVM_DELETED_FUNCTION;
+  UnsetInit(const UnsetInit &) = delete;
+  UnsetInit &operator=(const UnsetInit &Other) = delete;
   void anchor() override;
 
 public:
@@ -624,8 +624,8 @@ class BitInit : public Init {
   bool Value;
 
   explicit BitInit(bool V) : Init(IK_BitInit), Value(V) {}
-  BitInit(const BitInit &Other) LLVM_DELETED_FUNCTION;
-  BitInit &operator=(BitInit &Other) LLVM_DELETED_FUNCTION;
+  BitInit(const BitInit &Other) = delete;
+  BitInit &operator=(BitInit &Other) = delete;
   void anchor() override;
 
 public:
@@ -658,8 +658,8 @@ class BitsInit : public TypedInit, public FoldingSetNode {
     : TypedInit(IK_BitsInit, BitsRecTy::get(Range.size())),
       Bits(Range.begin(), Range.end()) {}
 
-  BitsInit(const BitsInit &Other) LLVM_DELETED_FUNCTION;
-  BitsInit &operator=(const BitsInit &Other) LLVM_DELETED_FUNCTION;
+  BitsInit(const BitsInit &Other) = delete;
+  BitsInit &operator=(const BitsInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -713,8 +713,8 @@ class IntInit : public TypedInit {
   explicit IntInit(int64_t V)
     : TypedInit(IK_IntInit, IntRecTy::get()), Value(V) {}
 
-  IntInit(const IntInit &Other) LLVM_DELETED_FUNCTION;
-  IntInit &operator=(const IntInit &Other) LLVM_DELETED_FUNCTION;
+  IntInit(const IntInit &Other) = delete;
+  IntInit &operator=(const IntInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -753,8 +753,8 @@ class StringInit : public TypedInit {
   explicit StringInit(const std::string &V)
     : TypedInit(IK_StringInit, StringRecTy::get()), Value(V) {}
 
-  StringInit(const StringInit &Other) LLVM_DELETED_FUNCTION;
-  StringInit &operator=(const StringInit &Other) LLVM_DELETED_FUNCTION;
+  StringInit(const StringInit &Other) = delete;
+  StringInit &operator=(const StringInit &Other) = delete;
   void anchor() override;
 
 public:
@@ -798,8 +798,8 @@ private:
     : TypedInit(IK_ListInit, ListRecTy::get(EltTy)),
       Values(Range.begin(), Range.end()) {}
 
-  ListInit(const ListInit &Other) LLVM_DELETED_FUNCTION;
-  ListInit &operator=(const ListInit &Other) LLVM_DELETED_FUNCTION;
+  ListInit(const ListInit &Other) = delete;
+  ListInit &operator=(const ListInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -855,8 +855,8 @@ public:
 /// OpInit - Base class for operators
 ///
 class OpInit : public TypedInit {
-  OpInit(const OpInit &Other) LLVM_DELETED_FUNCTION;
-  OpInit &operator=(OpInit &Other) LLVM_DELETED_FUNCTION;
+  OpInit(const OpInit &Other) = delete;
+  OpInit &operator=(OpInit &Other) = delete;
 
 protected:
   explicit OpInit(InitKind K, RecTy *Type) : TypedInit(K, Type) {}
@@ -899,8 +899,8 @@ private:
   UnOpInit(UnaryOp opc, Init *lhs, RecTy *Type)
     : OpInit(IK_UnOpInit, Type), Opc(opc), LHS(lhs) {}
 
-  UnOpInit(const UnOpInit &Other) LLVM_DELETED_FUNCTION;
-  UnOpInit &operator=(const UnOpInit &Other) LLVM_DELETED_FUNCTION;
+  UnOpInit(const UnOpInit &Other) = delete;
+  UnOpInit &operator=(const UnOpInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -946,8 +946,8 @@ private:
   BinOpInit(BinaryOp opc, Init *lhs, Init *rhs, RecTy *Type) :
       OpInit(IK_BinOpInit, Type), Opc(opc), LHS(lhs), RHS(rhs) {}
 
-  BinOpInit(const BinOpInit &Other) LLVM_DELETED_FUNCTION;
-  BinOpInit &operator=(const BinOpInit &Other) LLVM_DELETED_FUNCTION;
+  BinOpInit(const BinOpInit &Other) = delete;
+  BinOpInit &operator=(const BinOpInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -1000,8 +1000,8 @@ private:
              RecTy *Type) :
       OpInit(IK_TernOpInit, Type), Opc(opc), LHS(lhs), MHS(mhs), RHS(rhs) {}
 
-  TernOpInit(const TernOpInit &Other) LLVM_DELETED_FUNCTION;
-  TernOpInit &operator=(const TernOpInit &Other) LLVM_DELETED_FUNCTION;
+  TernOpInit(const TernOpInit &Other) = delete;
+  TernOpInit &operator=(const TernOpInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -1058,8 +1058,8 @@ class VarInit : public TypedInit {
   explicit VarInit(Init *VN, RecTy *T)
       : TypedInit(IK_VarInit, T), VarName(VN) {}
 
-  VarInit(const VarInit &Other) LLVM_DELETED_FUNCTION;
-  VarInit &operator=(const VarInit &Other) LLVM_DELETED_FUNCTION;
+  VarInit(const VarInit &Other) = delete;
+  VarInit &operator=(const VarInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -1111,8 +1111,8 @@ class VarBitInit : public Init {
            "Illegal VarBitInit expression!");
   }
 
-  VarBitInit(const VarBitInit &Other) LLVM_DELETED_FUNCTION;
-  VarBitInit &operator=(const VarBitInit &Other) LLVM_DELETED_FUNCTION;
+  VarBitInit(const VarBitInit &Other) = delete;
+  VarBitInit &operator=(const VarBitInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -1150,8 +1150,8 @@ class VarListElementInit : public TypedInit {
            "Illegal VarBitInit expression!");
   }
 
-  VarListElementInit(const VarListElementInit &Other) LLVM_DELETED_FUNCTION;
-  void operator=(const VarListElementInit &Other) LLVM_DELETED_FUNCTION;
+  VarListElementInit(const VarListElementInit &Other) = delete;
+  void operator=(const VarListElementInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -1186,8 +1186,8 @@ class DefInit : public TypedInit {
   DefInit(Record *D, RecordRecTy *T) : TypedInit(IK_DefInit, T), Def(D) {}
   friend class Record;
 
-  DefInit(const DefInit &Other) LLVM_DELETED_FUNCTION;
-  DefInit &operator=(const DefInit &Other) LLVM_DELETED_FUNCTION;
+  DefInit(const DefInit &Other) = delete;
+  DefInit &operator=(const DefInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -1233,8 +1233,8 @@ class FieldInit : public TypedInit {
     assert(getType() && "FieldInit with non-record type!");
   }
 
-  FieldInit(const FieldInit &Other) LLVM_DELETED_FUNCTION;
-  FieldInit &operator=(const FieldInit &Other) LLVM_DELETED_FUNCTION;
+  FieldInit(const FieldInit &Other) = delete;
+  FieldInit &operator=(const FieldInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -1276,8 +1276,8 @@ class DagInit : public TypedInit, public FoldingSetNode {
           Args(ArgRange.begin(), ArgRange.end()),
           ArgNames(NameRange.begin(), NameRange.end()) {}
 
-  DagInit(const DagInit &Other) LLVM_DELETED_FUNCTION;
-  DagInit &operator=(const DagInit &Other) LLVM_DELETED_FUNCTION;
+  DagInit(const DagInit &Other) = delete;
+  DagInit &operator=(const DagInit &Other) = delete;
 
 public:
   static bool classof(const Init *I) {
@@ -1663,7 +1663,7 @@ raw_ostream &operator<<(raw_ostream &OS, const Record &R);
 
 struct MultiClass {
   Record Rec;  // Placeholder for template args and Name.
-  typedef std::vector<Record*> RecordVector;
+  typedef std::vector<std::unique_ptr<Record>> RecordVector;
   RecordVector DefPrototypes;
 
   void dump() const;
@@ -1688,15 +1688,13 @@ public:
     auto I = Defs.find(Name);
     return I == Defs.end() ? nullptr : I->second.get();
   }
-  void addClass(Record *_R) {
-    std::unique_ptr<Record> R(_R);
+  void addClass(std::unique_ptr<Record> R) {
     bool Ins = Classes.insert(std::make_pair(R->getName(),
                                              std::move(R))).second;
     (void)Ins;
     assert(Ins && "Class already exists");
   }
-  void addDef(Record *_R) {
-    std::unique_ptr<Record> R(_R);
+  void addDef(std::unique_ptr<Record> R) {
     bool Ins = Defs.insert(std::make_pair(R->getName(),
                                           std::move(R))).second;
     (void)Ins;
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 902647e..3e65a5d 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -396,12 +396,9 @@ class Instruction {
   //  hasSideEffects - The instruction has side effects that are not
   //    captured by any operands of the instruction or other flags.
   //
-  //  neverHasSideEffects (deprecated) - Set on an instruction with no pattern
-  //    if it has no side effects. This is now equivalent to setting
-  //    "hasSideEffects = 0".
   bit hasSideEffects = ?;
-  bit neverHasSideEffects = 0;
-
+  bit hasTwoExplicitDefs = 0;  // Does this instruction have 2 explicit
+                               // destinations?
   // Is this instruction a "real" instruction (with a distinct machine
   // encoding), or is it a pseudo instruction used for codegen modeling
   // purposes.
@@ -628,6 +625,9 @@ class RegisterOperand<RegisterClass regclass, string pm = "printOperand">
   // can match a subset of some other class, in which case the AsmOperandClass
   // should declare the other operand as one of its super classes.
   AsmOperandClass ParserMatchClass;
+
+  string OperandNamespace = "MCOI";
+  string OperandType = "OPERAND_REGISTER";
 }
 
 let OperandType = "OPERAND_IMMEDIATE" in {
@@ -734,7 +734,7 @@ def INLINEASM : Instruction {
   let OutOperandList = (outs);
   let InOperandList = (ins variable_ops);
   let AsmString = "";
-  let neverHasSideEffects = 1;  // Note side effect is encoded in an operand.
+  let hasSideEffects = 0;  // Note side effect is encoded in an operand.
 }
 def CFI_INSTRUCTION : Instruction {
   let OutOperandList = (outs);
@@ -761,26 +761,26 @@ def KILL : Instruction {
   let OutOperandList = (outs);
   let InOperandList = (ins variable_ops);
   let AsmString = "";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 def EXTRACT_SUBREG : Instruction {
   let OutOperandList = (outs unknown:$dst);
   let InOperandList = (ins unknown:$supersrc, i32imm:$subidx);
   let AsmString = "";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 def INSERT_SUBREG : Instruction {
   let OutOperandList = (outs unknown:$dst);
   let InOperandList = (ins unknown:$supersrc, unknown:$subsrc, i32imm:$subidx);
   let AsmString = "";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let Constraints = "$supersrc = $dst";
 }
 def IMPLICIT_DEF : Instruction {
   let OutOperandList = (outs unknown:$dst);
   let InOperandList = (ins);
   let AsmString = "";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let isReMaterializable = 1;
   let isAsCheapAsAMove = 1;
 }
@@ -788,33 +788,33 @@ def SUBREG_TO_REG : Instruction {
   let OutOperandList = (outs unknown:$dst);
   let InOperandList = (ins unknown:$implsrc, unknown:$subsrc, i32imm:$subidx);
   let AsmString = "";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 def COPY_TO_REGCLASS : Instruction {
   let OutOperandList = (outs unknown:$dst);
   let InOperandList = (ins unknown:$src, i32imm:$regclass);
   let AsmString = "";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let isAsCheapAsAMove = 1;
 }
 def DBG_VALUE : Instruction {
   let OutOperandList = (outs);
   let InOperandList = (ins variable_ops);
   let AsmString = "DBG_VALUE";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 def REG_SEQUENCE : Instruction {
   let OutOperandList = (outs unknown:$dst);
   let InOperandList = (ins unknown:$supersrc, variable_ops);
   let AsmString = "";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let isAsCheapAsAMove = 1;
 }
 def COPY : Instruction {
   let OutOperandList = (outs unknown:$dst);
   let InOperandList = (ins unknown:$src);
   let AsmString = "";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let isAsCheapAsAMove = 1;
 }
 def BUNDLE : Instruction {
@@ -826,13 +826,13 @@ def LIFETIME_START : Instruction {
   let OutOperandList = (outs);
   let InOperandList = (ins i32imm:$id);
   let AsmString = "LIFETIME_START";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 def LIFETIME_END : Instruction {
   let OutOperandList = (outs);
   let InOperandList = (ins i32imm:$id);
   let AsmString = "LIFETIME_END";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 def STACKMAP : Instruction {
   let OutOperandList = (outs);
@@ -849,6 +849,15 @@ def PATCHPOINT : Instruction {
   let mayLoad = 1;
   let usesCustomInserter = 1;
 }
+def STATEPOINT : Instruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins variable_ops);
+  let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let hasSideEffects = 1;
+  let isCall = 1;
+}
 def LOAD_STACK_GUARD : Instruction {
   let OutOperandList = (outs ptr_rc:$dst);
   let InOperandList = (ins);
@@ -857,6 +866,15 @@ def LOAD_STACK_GUARD : Instruction {
   let hasSideEffects = 0;
   bit isPseudo = 1;
 }
+def FRAME_ALLOC : Instruction {
+  // This instruction is really just a label. It has to be part of the chain so
+  // that it doesn't get dropped from the DAG, but it produces nothing and has
+  // no side effects.
+  let OutOperandList = (outs);
+  let InOperandList = (ins ptr_rc:$symbol, i32imm:$id);
+  let hasSideEffects = 0;
+  let hasCtrlDep = 1;
+}
 }
 
 //===----------------------------------------------------------------------===//
@@ -1005,9 +1023,6 @@ class AsmWriter {
   // will specify which alternative to use.  For example "{x|y|z}" with Variant
   // == 1, will expand to "y".
   int Variant = 0;
-
-  // OperandSpacing - Space between operand columns.
-  int OperandSpacing = -1;
 }
 def DefaultAsmWriter : AsmWriter;
 
diff --git a/include/llvm/Target/TargetCallingConv.h b/include/llvm/Target/TargetCallingConv.h
index a0f2674..9071bfe 100644
--- a/include/llvm/Target/TargetCallingConv.h
+++ b/include/llvm/Target/TargetCallingConv.h
@@ -134,6 +134,8 @@ namespace ISD {
 
     /// Index original Function's argument.
     unsigned OrigArgIndex;
+    /// Sentinel value for implicit machine-level input arguments.
+    static const unsigned NoArgIndex = UINT_MAX;
 
     /// Offset in bytes of current input value relative to the beginning of
     /// original argument. E.g. if argument was splitted into four 32 bit
@@ -147,6 +149,15 @@ namespace ISD {
       VT = vt.getSimpleVT();
       ArgVT = argvt;
     }
+
+    bool isOrigArg() const {
+      return OrigArgIndex != NoArgIndex;
+    }
+
+    unsigned getOrigArgIndex() const {
+      assert(OrigArgIndex != NoArgIndex && "Implicit machine-level argument");
+      return OrigArgIndex;
+    }
   };
 
   /// OutputArg - This struct carries flags and a value for a
diff --git a/include/llvm/Target/TargetFrameLowering.h b/include/llvm/Target/TargetFrameLowering.h
index bfddd06..f17640f 100644
--- a/include/llvm/Target/TargetFrameLowering.h
+++ b/include/llvm/Target/TargetFrameLowering.h
@@ -142,6 +142,10 @@ public:
   /// the assembly prologue to explicitly handle the stack.
   virtual void adjustForHiPEPrologue(MachineFunction &MF) const { }
 
+  /// Adjust the prologue to add an allocation at a fixed offset from the frame
+  /// pointer.
+  virtual void adjustForFrameAllocatePrologue(MachineFunction &MF) const { }
+
   /// spillCalleeSavedRegisters - Issues instruction(s) to spill all callee
   /// saved registers and returns true if it isn't possible / profitable to do
   /// so by issuing a series of store instructions via
@@ -189,6 +193,11 @@ public:
     return hasReservedCallFrame(MF) || hasFP(MF);
   }
 
+  // needsFrameIndexResolution - Do we need to perform FI resolution for
+  // this function. Normally, this is required only when the function
+  // has any stack objects. However, targets may want to override this.
+  virtual bool needsFrameIndexResolution(const MachineFunction &MF) const;
+
   /// getFrameIndexOffset - Returns the displacement from the frame register to
   /// the stack frame of the specified index.
   virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
@@ -199,6 +208,16 @@ public:
   virtual int getFrameIndexReference(const MachineFunction &MF, int FI,
                                      unsigned &FrameReg) const;
 
+  /// Same as above, except that the 'base register' will always be RSP, not
+  /// RBP on x86.  This is used exclusively for lowering STATEPOINT nodes.
+  /// TODO: This should really be a parameterizable choice.
+  virtual int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
+                                          unsigned &FrameReg) const {
+    // default to calling normal version, we override this on x86 only
+    llvm_unreachable("unimplemented for non-x86");
+    return 0;
+  }
+
   /// processFunctionBeforeCalleeSavedScan - This method is called immediately
   /// before PrologEpilogInserter scans the physical registers used to determine
   /// what callee saved registers should be spilled. This method is optional.
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index a37a7f9..247f9d8 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -14,10 +14,10 @@
 #ifndef LLVM_TARGET_TARGETINSTRINFO_H
 #define LLVM_TARGET_TARGETINSTRINFO_H
 
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineCombinerPattern.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
@@ -50,8 +50,8 @@ template<class T> class SmallVectorImpl;
 /// TargetInstrInfo - Interface to description of machine instruction set
 ///
 class TargetInstrInfo : public MCInstrInfo {
-  TargetInstrInfo(const TargetInstrInfo &) LLVM_DELETED_FUNCTION;
-  void operator=(const TargetInstrInfo &) LLVM_DELETED_FUNCTION;
+  TargetInstrInfo(const TargetInstrInfo &) = delete;
+  void operator=(const TargetInstrInfo &) = delete;
 public:
   TargetInstrInfo(int CFSetupOpcode = -1, int CFDestroyOpcode = -1)
     : CallFrameSetupOpcode(CFSetupOpcode),
@@ -109,6 +109,12 @@ public:
   int getCallFrameSetupOpcode() const { return CallFrameSetupOpcode; }
   int getCallFrameDestroyOpcode() const { return CallFrameDestroyOpcode; }
 
+  /// Returns the actual stack pointer adjustment made by an instruction
+  /// as part of a call sequence. By default, only call frame setup/destroy
+  /// instructions adjust the stack, but targets may want to override this
+  /// to enable more fine-grained adjustment, or adjust by a different value.
+  virtual int getSPAdjust(const MachineInstr *MI) const;
+
   /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
   /// extension instruction. That is, it's like a copy where it's legal for the
   /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
@@ -597,9 +603,12 @@ public:
   /// a side.
   ///
   /// @param MI          Optimizable select instruction.
+  /// @param NewMIs     Set that record all MIs in the basic block up to \p
+  /// MI. Has to be updated with any newly created MI or deleted ones.
   /// @param PreferFalse Try to optimize FalseOp instead of TrueOp.
   /// @returns Optimized instruction or NULL.
   virtual MachineInstr *optimizeSelect(MachineInstr *MI,
+                                       SmallPtrSetImpl<MachineInstr *> &NewMIs,
                                        bool PreferFalse = false) const {
     // This function must be implemented if Optimizable is ever set.
     llvm_unreachable("Target must implement TargetInstrInfo::optimizeSelect!");
diff --git a/include/llvm/Target/TargetIntrinsicInfo.h b/include/llvm/Target/TargetIntrinsicInfo.h
index 71c0166..c630f5b 100644
--- a/include/llvm/Target/TargetIntrinsicInfo.h
+++ b/include/llvm/Target/TargetIntrinsicInfo.h
@@ -28,8 +28,8 @@ class Type;
 /// TargetIntrinsicInfo - Interface to description of machine instruction set
 ///
 class TargetIntrinsicInfo {
-  TargetIntrinsicInfo(const TargetIntrinsicInfo &) LLVM_DELETED_FUNCTION;
-  void operator=(const TargetIntrinsicInfo &) LLVM_DELETED_FUNCTION;
+  TargetIntrinsicInfo(const TargetIntrinsicInfo &) = delete;
+  void operator=(const TargetIntrinsicInfo &) = delete;
 public:
   TargetIntrinsicInfo();
   virtual ~TargetIntrinsicInfo();
diff --git a/include/llvm/Target/TargetLibraryInfo.h b/include/llvm/Target/TargetLibraryInfo.h
deleted file mode 100644
index 249849b..0000000
--- a/include/llvm/Target/TargetLibraryInfo.h
+++ /dev/null
@@ -1,813 +0,0 @@
-//===-- llvm/Target/TargetLibraryInfo.h - Library information ---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TARGET_TARGETLIBRARYINFO_H
-#define LLVM_TARGET_TARGETLIBRARYINFO_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Pass.h"
-
-// BEGIN ANDROID-SPECIFIC
-#ifdef WIN32
-#ifdef fseeko
-#undef fseeko
-#endif
-#ifdef ftello
-#undef ftello
-#endif
-#endif  // WIN32
-// END ANDROID-SPECIFIC
-
-namespace llvm {
-  class Triple;
-
-  namespace LibFunc {
-    enum Func {
-      /// int _IO_getc(_IO_FILE * __fp);
-      under_IO_getc,
-      /// int _IO_putc(int __c, _IO_FILE * __fp);
-      under_IO_putc,
-      /// void operator delete[](void*);
-      ZdaPv,
-      /// void operator delete[](void*, nothrow);
-      ZdaPvRKSt9nothrow_t,
-      /// void operator delete[](void*, unsigned int);
-      ZdaPvj,
-      /// void operator delete[](void*, unsigned long);
-      ZdaPvm,
-      /// void operator delete(void*);
-      ZdlPv,
-      /// void operator delete(void*, nothrow);
-      ZdlPvRKSt9nothrow_t,
-      /// void operator delete(void*, unsigned int);
-      ZdlPvj,
-      /// void operator delete(void*, unsigned long);
-      ZdlPvm,
-      /// void *new[](unsigned int);
-      Znaj,
-      /// void *new[](unsigned int, nothrow);
-      ZnajRKSt9nothrow_t,
-      /// void *new[](unsigned long);
-      Znam,
-      /// void *new[](unsigned long, nothrow);
-      ZnamRKSt9nothrow_t,
-      /// void *new(unsigned int);
-      Znwj,
-      /// void *new(unsigned int, nothrow);
-      ZnwjRKSt9nothrow_t,
-      /// void *new(unsigned long);
-      Znwm,
-      /// void *new(unsigned long, nothrow);
-      ZnwmRKSt9nothrow_t,
-      /// double __cospi(double x);
-      cospi,
-      /// float __cospif(float x);
-      cospif,
-      /// int __cxa_atexit(void (*f)(void *), void *p, void *d);
-      cxa_atexit,
-      /// void __cxa_guard_abort(guard_t *guard);
-      /// guard_t is int64_t in Itanium ABI or int32_t on ARM eabi.
-      cxa_guard_abort,
-      /// int __cxa_guard_acquire(guard_t *guard);
-      cxa_guard_acquire,
-      /// void __cxa_guard_release(guard_t *guard);
-      cxa_guard_release,
-      /// int __isoc99_scanf (const char *format, ...)
-      dunder_isoc99_scanf,
-      /// int __isoc99_sscanf(const char *s, const char *format, ...)
-      dunder_isoc99_sscanf,
-      /// void *__memcpy_chk(void *s1, const void *s2, size_t n, size_t s1size);
-      memcpy_chk,
-      /// void *__memmove_chk(void *s1, const void *s2, size_t n,
-      ///                     size_t s1size);
-      memmove_chk,
-      /// void *__memset_chk(void *s, char v, size_t n, size_t s1size);
-      memset_chk,
-      /// double __sincospi_stret(double x);
-      sincospi_stret,
-      /// float __sincospif_stret(float x);
-      sincospif_stret,
-      /// double __sinpi(double x);
-      sinpi,
-      /// float __sinpif(float x);
-      sinpif,
-      /// double __sqrt_finite(double x);
-      sqrt_finite,
-      /// float __sqrt_finite(float x);
-      sqrtf_finite,
-      /// long double __sqrt_finite(long double x);
-      sqrtl_finite,
-      /// char *__stpcpy_chk(char *s1, const char *s2, size_t s1size);
-      stpcpy_chk,
-      /// char *__stpncpy_chk(char *s1, const char *s2, size_t n,
-      ///                     size_t s1size);
-      stpncpy_chk,
-      /// char *__strcpy_chk(char *s1, const char *s2, size_t s1size);
-      strcpy_chk,
-      /// char * __strdup(const char *s);
-      dunder_strdup,
-      /// char *__strncpy_chk(char *s1, const char *s2, size_t n,
-      ///                     size_t s1size);
-      strncpy_chk,
-      /// char *__strndup(const char *s, size_t n);
-      dunder_strndup,
-      /// char * __strtok_r(char *s, const char *delim, char **save_ptr);
-      dunder_strtok_r,
-      /// int abs(int j);
-      abs,
-      /// int access(const char *path, int amode);
-      access,
-      /// double acos(double x);
-      acos,
-      /// float acosf(float x);
-      acosf,
-      /// double acosh(double x);
-      acosh,
-      /// float acoshf(float x);
-      acoshf,
-      /// long double acoshl(long double x);
-      acoshl,
-      /// long double acosl(long double x);
-      acosl,
-      /// double asin(double x);
-      asin,
-      /// float asinf(float x);
-      asinf,
-      /// double asinh(double x);
-      asinh,
-      /// float asinhf(float x);
-      asinhf,
-      /// long double asinhl(long double x);
-      asinhl,
-      /// long double asinl(long double x);
-      asinl,
-      /// double atan(double x);
-      atan,
-      /// double atan2(double y, double x);
-      atan2,
-      /// float atan2f(float y, float x);
-      atan2f,
-      /// long double atan2l(long double y, long double x);
-      atan2l,
-      /// float atanf(float x);
-      atanf,
-      /// double atanh(double x);
-      atanh,
-      /// float atanhf(float x);
-      atanhf,
-      /// long double atanhl(long double x);
-      atanhl,
-      /// long double atanl(long double x);
-      atanl,
-      /// double atof(const char *str);
-      atof,
-      /// int atoi(const char *str);
-      atoi,
-      /// long atol(const char *str);
-      atol,
-      /// long long atoll(const char *nptr);
-      atoll,
-      /// int bcmp(const void *s1, const void *s2, size_t n);
-      bcmp,
-      /// void bcopy(const void *s1, void *s2, size_t n);
-      bcopy,
-      /// void bzero(void *s, size_t n);
-      bzero,
-      /// void *calloc(size_t count, size_t size);
-      calloc,
-      /// double cbrt(double x);
-      cbrt,
-      /// float cbrtf(float x);
-      cbrtf,
-      /// long double cbrtl(long double x);
-      cbrtl,
-      /// double ceil(double x);
-      ceil,
-      /// float ceilf(float x);
-      ceilf,
-      /// long double ceill(long double x);
-      ceill,
-      /// int chmod(const char *path, mode_t mode);
-      chmod,
-      /// int chown(const char *path, uid_t owner, gid_t group);
-      chown,
-      /// void clearerr(FILE *stream);
-      clearerr,
-      /// int closedir(DIR *dirp);
-      closedir,
-      /// double copysign(double x, double y);
-      copysign,
-      /// float copysignf(float x, float y);
-      copysignf,
-      /// long double copysignl(long double x, long double y);
-      copysignl,
-      /// double cos(double x);
-      cos,
-      /// float cosf(float x);
-      cosf,
-      /// double cosh(double x);
-      cosh,
-      /// float coshf(float x);
-      coshf,
-      /// long double coshl(long double x);
-      coshl,
-      /// long double cosl(long double x);
-      cosl,
-      /// char *ctermid(char *s);
-      ctermid,
-      /// double exp(double x);
-      exp,
-      /// double exp10(double x);
-      exp10,
-      /// float exp10f(float x);
-      exp10f,
-      /// long double exp10l(long double x);
-      exp10l,
-      /// double exp2(double x);
-      exp2,
-      /// float exp2f(float x);
-      exp2f,
-      /// long double exp2l(long double x);
-      exp2l,
-      /// float expf(float x);
-      expf,
-      /// long double expl(long double x);
-      expl,
-      /// double expm1(double x);
-      expm1,
-      /// float expm1f(float x);
-      expm1f,
-      /// long double expm1l(long double x);
-      expm1l,
-      /// double fabs(double x);
-      fabs,
-      /// float fabsf(float x);
-      fabsf,
-      /// long double fabsl(long double x);
-      fabsl,
-      /// int fclose(FILE *stream);
-      fclose,
-      /// FILE *fdopen(int fildes, const char *mode);
-      fdopen,
-      /// int feof(FILE *stream);
-      feof,
-      /// int ferror(FILE *stream);
-      ferror,
-      /// int fflush(FILE *stream);
-      fflush,
-      /// int ffs(int i);
-      ffs,
-      /// int ffsl(long int i);
-      ffsl,
-      /// int ffsll(long long int i);
-      ffsll,
-      /// int fgetc(FILE *stream);
-      fgetc,
-      /// int fgetpos(FILE *stream, fpos_t *pos);
-      fgetpos,
-      /// char *fgets(char *s, int n, FILE *stream);
-      fgets,
-      /// int fileno(FILE *stream);
-      fileno,
-      /// int fiprintf(FILE *stream, const char *format, ...);
-      fiprintf,
-      /// void flockfile(FILE *file);
-      flockfile,
-      /// double floor(double x);
-      floor,
-      /// float floorf(float x);
-      floorf,
-      /// long double floorl(long double x);
-      floorl,
-      /// double fmax(double x, double y);
-      fmax,
-      /// float fmaxf(float x, float y);
-      fmaxf,
-      /// long double fmaxl(long double x, long double y);
-      fmaxl,
-      /// double fmin(double x, double y);
-      fmin,
-      /// float fminf(float x, float y);
-      fminf,
-      /// long double fminl(long double x, long double y);
-      fminl,
-      /// double fmod(double x, double y);
-      fmod,
-      /// float fmodf(float x, float y);
-      fmodf,
-      /// long double fmodl(long double x, long double y);
-      fmodl,
-      /// FILE *fopen(const char *filename, const char *mode);
-      fopen,
-      /// FILE *fopen64(const char *filename, const char *opentype)
-      fopen64,
-      /// int fprintf(FILE *stream, const char *format, ...);
-      fprintf,
-      /// int fputc(int c, FILE *stream);
-      fputc,
-      /// int fputs(const char *s, FILE *stream);
-      fputs,
-      /// size_t fread(void *ptr, size_t size, size_t nitems, FILE *stream);
-      fread,
-      /// void free(void *ptr);
-      free,
-      /// double frexp(double num, int *exp);
-      frexp,
-      /// float frexpf(float num, int *exp);
-      frexpf,
-      /// long double frexpl(long double num, int *exp);
-      frexpl,
-      /// int fscanf(FILE *stream, const char *format, ... );
-      fscanf,
-      /// int fseek(FILE *stream, long offset, int whence);
-      fseek,
-      /// int fseeko(FILE *stream, off_t offset, int whence);
-      fseeko,
-      /// int fseeko64(FILE *stream, off64_t offset, int whence)
-      fseeko64,
-      /// int fsetpos(FILE *stream, const fpos_t *pos);
-      fsetpos,
-      /// int fstat(int fildes, struct stat *buf);
-      fstat,
-      /// int fstat64(int filedes, struct stat64 *buf)
-      fstat64,
-      /// int fstatvfs(int fildes, struct statvfs *buf);
-      fstatvfs,
-      /// int fstatvfs64(int fildes, struct statvfs64 *buf);
-      fstatvfs64,
-      /// long ftell(FILE *stream);
-      ftell,
-      /// off_t ftello(FILE *stream);
-      ftello,
-      /// off64_t ftello64(FILE *stream)
-      ftello64,
-      /// int ftrylockfile(FILE *file);
-      ftrylockfile,
-      /// void funlockfile(FILE *file);
-      funlockfile,
-      /// size_t fwrite(const void *ptr, size_t size, size_t nitems,
-      /// FILE *stream);
-      fwrite,
-      /// int getc(FILE *stream);
-      getc,
-      /// int getc_unlocked(FILE *stream);
-      getc_unlocked,
-      /// int getchar(void);
-      getchar,
-      /// char *getenv(const char *name);
-      getenv,
-      /// int getitimer(int which, struct itimerval *value);
-      getitimer,
-      /// int getlogin_r(char *name, size_t namesize);
-      getlogin_r,
-      /// struct passwd *getpwnam(const char *name);
-      getpwnam,
-      /// char *gets(char *s);
-      gets,
-      /// int gettimeofday(struct timeval *tp, void *tzp);
-      gettimeofday,
-      /// uint32_t htonl(uint32_t hostlong);
-      htonl,
-      /// uint16_t htons(uint16_t hostshort);
-      htons,
-      /// int iprintf(const char *format, ...);
-      iprintf,
-      /// int isascii(int c);
-      isascii,
-      /// int isdigit(int c);
-      isdigit,
-      /// long int labs(long int j);
-      labs,
-      /// int lchown(const char *path, uid_t owner, gid_t group);
-      lchown,
-      /// double ldexp(double x, int n);
-      ldexp,
-      /// float ldexpf(float x, int n);
-      ldexpf,
-      /// long double ldexpl(long double x, int n);
-      ldexpl,
-      /// long long int llabs(long long int j);
-      llabs,
-      /// double log(double x);
-      log,
-      /// double log10(double x);
-      log10,
-      /// float log10f(float x);
-      log10f,
-      /// long double log10l(long double x);
-      log10l,
-      /// double log1p(double x);
-      log1p,
-      /// float log1pf(float x);
-      log1pf,
-      /// long double log1pl(long double x);
-      log1pl,
-      /// double log2(double x);
-      log2,
-      /// float log2f(float x);
-      log2f,
-      /// double long double log2l(long double x);
-      log2l,
-      /// double logb(double x);
-      logb,
-      /// float logbf(float x);
-      logbf,
-      /// long double logbl(long double x);
-      logbl,
-      /// float logf(float x);
-      logf,
-      /// long double logl(long double x);
-      logl,
-      /// int lstat(const char *path, struct stat *buf);
-      lstat,
-      /// int lstat64(const char *path, struct stat64 *buf);
-      lstat64,
-      /// void *malloc(size_t size);
-      malloc,
-      /// void *memalign(size_t boundary, size_t size);
-      memalign,
-      /// void *memccpy(void *s1, const void *s2, int c, size_t n);
-      memccpy,
-      /// void *memchr(const void *s, int c, size_t n);
-      memchr,
-      /// int memcmp(const void *s1, const void *s2, size_t n);
-      memcmp,
-      /// void *memcpy(void *s1, const void *s2, size_t n);
-      memcpy,
-      /// void *memmove(void *s1, const void *s2, size_t n);
-      memmove,
-      // void *memrchr(const void *s, int c, size_t n);
-      memrchr,
-      /// void *memset(void *b, int c, size_t len);
-      memset,
-      /// void memset_pattern16(void *b, const void *pattern16, size_t len);
-      memset_pattern16,
-      /// int mkdir(const char *path, mode_t mode);
-      mkdir,
-      /// time_t mktime(struct tm *timeptr);
-      mktime,
-      /// double modf(double x, double *iptr);
-      modf,
-      /// float modff(float, float *iptr);
-      modff,
-      /// long double modfl(long double value, long double *iptr);
-      modfl,
-      /// double nearbyint(double x);
-      nearbyint,
-      /// float nearbyintf(float x);
-      nearbyintf,
-      /// long double nearbyintl(long double x);
-      nearbyintl,
-      /// uint32_t ntohl(uint32_t netlong);
-      ntohl,
-      /// uint16_t ntohs(uint16_t netshort);
-      ntohs,
-      /// int open(const char *path, int oflag, ... );
-      open,
-      /// int open64(const char *filename, int flags[, mode_t mode])
-      open64,
-      /// DIR *opendir(const char *dirname);
-      opendir,
-      /// int pclose(FILE *stream);
-      pclose,
-      /// void perror(const char *s);
-      perror,
-      /// FILE *popen(const char *command, const char *mode);
-      popen,
-      /// int posix_memalign(void **memptr, size_t alignment, size_t size);
-      posix_memalign,
-      /// double pow(double x, double y);
-      pow,
-      /// float powf(float x, float y);
-      powf,
-      /// long double powl(long double x, long double y);
-      powl,
-      /// ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset);
-      pread,
-      /// int printf(const char *format, ...);
-      printf,
-      /// int putc(int c, FILE *stream);
-      putc,
-      /// int putchar(int c);
-      putchar,
-      /// int puts(const char *s);
-      puts,
-      /// ssize_t pwrite(int fildes, const void *buf, size_t nbyte,
-      ///                off_t offset);
-      pwrite,
-      /// void qsort(void *base, size_t nel, size_t width,
-      ///            int (*compar)(const void *, const void *));
-      qsort,
-      /// ssize_t read(int fildes, void *buf, size_t nbyte);
-      read,
-      /// ssize_t readlink(const char *path, char *buf, size_t bufsize);
-      readlink,
-      /// void *realloc(void *ptr, size_t size);
-      realloc,
-      /// void *reallocf(void *ptr, size_t size);
-      reallocf,
-      /// char *realpath(const char *file_name, char *resolved_name);
-      realpath,
-      /// int remove(const char *path);
-      remove,
-      /// int rename(const char *old, const char *new);
-      rename,
-      /// void rewind(FILE *stream);
-      rewind,
-      /// double rint(double x);
-      rint,
-      /// float rintf(float x);
-      rintf,
-      /// long double rintl(long double x);
-      rintl,
-      /// int rmdir(const char *path);
-      rmdir,
-      /// double round(double x);
-      round,
-      /// float roundf(float x);
-      roundf,
-      /// long double roundl(long double x);
-      roundl,
-      /// int scanf(const char *restrict format, ... );
-      scanf,
-      /// void setbuf(FILE *stream, char *buf);
-      setbuf,
-      /// int setitimer(int which, const struct itimerval *value,
-      ///               struct itimerval *ovalue);
-      setitimer,
-      /// int setvbuf(FILE *stream, char *buf, int type, size_t size);
-      setvbuf,
-      /// double sin(double x);
-      sin,
-      /// float sinf(float x);
-      sinf,
-      /// double sinh(double x);
-      sinh,
-      /// float sinhf(float x);
-      sinhf,
-      /// long double sinhl(long double x);
-      sinhl,
-      /// long double sinl(long double x);
-      sinl,
-      /// int siprintf(char *str, const char *format, ...);
-      siprintf,
-      /// int snprintf(char *s, size_t n, const char *format, ...);
-      snprintf,
-      /// int sprintf(char *str, const char *format, ...);
-      sprintf,
-      /// double sqrt(double x);
-      sqrt,
-      /// float sqrtf(float x);
-      sqrtf,
-      /// long double sqrtl(long double x);
-      sqrtl,
-      /// int sscanf(const char *s, const char *format, ... );
-      sscanf,
-      /// int stat(const char *path, struct stat *buf);
-      stat,
-      /// int stat64(const char *path, struct stat64 *buf);
-      stat64,
-      /// int statvfs(const char *path, struct statvfs *buf);
-      statvfs,
-      /// int statvfs64(const char *path, struct statvfs64 *buf)
-      statvfs64,
-      /// char *stpcpy(char *s1, const char *s2);
-      stpcpy,
-      /// char *stpncpy(char *s1, const char *s2, size_t n);
-      stpncpy,
-      /// int strcasecmp(const char *s1, const char *s2);
-      strcasecmp,
-      /// char *strcat(char *s1, const char *s2);
-      strcat,
-      /// char *strchr(const char *s, int c);
-      strchr,
-      /// int strcmp(const char *s1, const char *s2);
-      strcmp,
-      /// int strcoll(const char *s1, const char *s2);
-      strcoll,
-      /// char *strcpy(char *s1, const char *s2);
-      strcpy,
-      /// size_t strcspn(const char *s1, const char *s2);
-      strcspn,
-      /// char *strdup(const char *s1);
-      strdup,
-      /// size_t strlen(const char *s);
-      strlen,
-      /// int strncasecmp(const char *s1, const char *s2, size_t n);
-      strncasecmp,
-      /// char *strncat(char *s1, const char *s2, size_t n);
-      strncat,
-      /// int strncmp(const char *s1, const char *s2, size_t n);
-      strncmp,
-      /// char *strncpy(char *s1, const char *s2, size_t n);
-      strncpy,
-      /// char *strndup(const char *s1, size_t n);
-      strndup,
-      /// size_t strnlen(const char *s, size_t maxlen);
-      strnlen,
-      /// char *strpbrk(const char *s1, const char *s2);
-      strpbrk,
-      /// char *strrchr(const char *s, int c);
-      strrchr,
-      /// size_t strspn(const char *s1, const char *s2);
-      strspn,
-      /// char *strstr(const char *s1, const char *s2);
-      strstr,
-      /// double strtod(const char *nptr, char **endptr);
-      strtod,
-      /// float strtof(const char *nptr, char **endptr);
-      strtof,
-      // char *strtok(char *s1, const char *s2);
-      strtok,
-      // char *strtok_r(char *s, const char *sep, char **lasts);
-      strtok_r,
-      /// long int strtol(const char *nptr, char **endptr, int base);
-      strtol,
-      /// long double strtold(const char *nptr, char **endptr);
-      strtold,
-      /// long long int strtoll(const char *nptr, char **endptr, int base);
-      strtoll,
-      /// unsigned long int strtoul(const char *nptr, char **endptr, int base);
-      strtoul,
-      /// unsigned long long int strtoull(const char *nptr, char **endptr,
-      ///                                 int base);
-      strtoull,
-      /// size_t strxfrm(char *s1, const char *s2, size_t n);
-      strxfrm,
-      /// int system(const char *command);
-      system,
-      /// double tan(double x);
-      tan,
-      /// float tanf(float x);
-      tanf,
-      /// double tanh(double x);
-      tanh,
-      /// float tanhf(float x);
-      tanhf,
-      /// long double tanhl(long double x);
-      tanhl,
-      /// long double tanl(long double x);
-      tanl,
-      /// clock_t times(struct tms *buffer);
-      times,
-      /// FILE *tmpfile(void);
-      tmpfile,
-      /// FILE *tmpfile64(void)
-      tmpfile64,
-      /// int toascii(int c);
-      toascii,
-      /// double trunc(double x);
-      trunc,
-      /// float truncf(float x);
-      truncf,
-      /// long double truncl(long double x);
-      truncl,
-      /// int uname(struct utsname *name);
-      uname,
-      /// int ungetc(int c, FILE *stream);
-      ungetc,
-      /// int unlink(const char *path);
-      unlink,
-      /// int unsetenv(const char *name);
-      unsetenv,
-      /// int utime(const char *path, const struct utimbuf *times);
-      utime,
-      /// int utimes(const char *path, const struct timeval times[2]);
-      utimes,
-      /// void *valloc(size_t size);
-      valloc,
-      /// int vfprintf(FILE *stream, const char *format, va_list ap);
-      vfprintf,
-      /// int vfscanf(FILE *stream, const char *format, va_list arg);
-      vfscanf,
-      /// int vprintf(const char *restrict format, va_list ap);
-      vprintf,
-      /// int vscanf(const char *format, va_list arg);
-      vscanf,
-      /// int vsnprintf(char *s, size_t n, const char *format, va_list ap);
-      vsnprintf,
-      /// int vsprintf(char *s, const char *format, va_list ap);
-      vsprintf,
-      /// int vsscanf(const char *s, const char *format, va_list arg);
-      vsscanf,
-      /// ssize_t write(int fildes, const void *buf, size_t nbyte);
-      write,
-
-      NumLibFuncs
-    };
-  }
-
-/// TargetLibraryInfo - This immutable pass captures information about what
-/// library functions are available for the current target, and allows a
-/// frontend to disable optimizations through -fno-builtin etc.
-class TargetLibraryInfo : public ImmutablePass {
-  virtual void anchor();
-  unsigned char AvailableArray[(LibFunc::NumLibFuncs+3)/4];
-  llvm::DenseMap<unsigned, std::string> CustomNames;
-  static const char* StandardNames[LibFunc::NumLibFuncs];
-
-  enum AvailabilityState {
-    StandardName = 3, // (memset to all ones)
-    CustomName = 1,
-    Unavailable = 0  // (memset to all zeros)
-  };
-  void setState(LibFunc::Func F, AvailabilityState State) {
-    AvailableArray[F/4] &= ~(3 << 2*(F&3));
-    AvailableArray[F/4] |= State << 2*(F&3);
-  }
-  AvailabilityState getState(LibFunc::Func F) const {
-    return static_cast<AvailabilityState>((AvailableArray[F/4] >> 2*(F&3)) & 3);
-  }
-
-public:
-  static char ID;
-  TargetLibraryInfo();
-  TargetLibraryInfo(const Triple &T);
-  explicit TargetLibraryInfo(const TargetLibraryInfo &TLI);
-
-  /// getLibFunc - Search for a particular function name.  If it is one of the
-  /// known library functions, return true and set F to the corresponding value.
-  bool getLibFunc(StringRef funcName, LibFunc::Func &F) const;
-
-  /// has - This function is used by optimizations that want to match on or form
-  /// a given library function.
-  bool has(LibFunc::Func F) const {
-    return getState(F) != Unavailable;
-  }
-
-  /// hasOptimizedCodeGen - Return true if the function is both available as
-  /// a builtin and a candidate for optimized code generation.
-  bool hasOptimizedCodeGen(LibFunc::Func F) const {
-    if (getState(F) == Unavailable)
-      return false;
-    switch (F) {
-    default: break;
-    case LibFunc::copysign:  case LibFunc::copysignf:  case LibFunc::copysignl:
-    case LibFunc::fabs:      case LibFunc::fabsf:      case LibFunc::fabsl:
-    case LibFunc::sin:       case LibFunc::sinf:       case LibFunc::sinl:
-    case LibFunc::cos:       case LibFunc::cosf:       case LibFunc::cosl:
-    case LibFunc::sqrt:      case LibFunc::sqrtf:      case LibFunc::sqrtl:
-    case LibFunc::sqrt_finite: case LibFunc::sqrtf_finite:
-                                                  case LibFunc::sqrtl_finite:
-    case LibFunc::fmax:      case LibFunc::fmaxf:      case LibFunc::fmaxl:
-    case LibFunc::fmin:      case LibFunc::fminf:      case LibFunc::fminl:
-    case LibFunc::floor:     case LibFunc::floorf:     case LibFunc::floorl:
-    case LibFunc::nearbyint: case LibFunc::nearbyintf: case LibFunc::nearbyintl:
-    case LibFunc::ceil:      case LibFunc::ceilf:      case LibFunc::ceill:
-    case LibFunc::rint:      case LibFunc::rintf:      case LibFunc::rintl:
-    case LibFunc::round:     case LibFunc::roundf:     case LibFunc::roundl:
-    case LibFunc::trunc:     case LibFunc::truncf:     case LibFunc::truncl:
-    case LibFunc::log2:      case LibFunc::log2f:      case LibFunc::log2l:
-    case LibFunc::exp2:      case LibFunc::exp2f:      case LibFunc::exp2l:
-    case LibFunc::memcmp:    case LibFunc::strcmp:     case LibFunc::strcpy:
-    case LibFunc::stpcpy:    case LibFunc::strlen:     case LibFunc::strnlen:
-    case LibFunc::memchr:
-      return true;
-    }
-    return false;
-  }
-
-  StringRef getName(LibFunc::Func F) const {
-    AvailabilityState State = getState(F);
-    if (State == Unavailable)
-      return StringRef();
-    if (State == StandardName)
-      return StandardNames[F];
-    assert(State == CustomName);
-    return CustomNames.find(F)->second;
-  }
-
-  /// setUnavailable - this can be used by whatever sets up TargetLibraryInfo to
-  /// ban use of specific library functions.
-  void setUnavailable(LibFunc::Func F) {
-    setState(F, Unavailable);
-  }
-
-  void setAvailable(LibFunc::Func F) {
-    setState(F, StandardName);
-  }
-
-  void setAvailableWithName(LibFunc::Func F, StringRef Name) {
-    if (StandardNames[F] != Name) {
-      setState(F, CustomName);
-      CustomNames[F] = Name;
-      assert(CustomNames.find(F) != CustomNames.end());
-    } else {
-      setState(F, StandardName);
-    }
-  }
-
-  /// disableAllFunctions - This disables all builtins, which is used for
-  /// options like -fno-builtin.
-  void disableAllFunctions();
-};
-
-} // end namespace llvm
-
-#endif
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 882dab4..4118917 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -30,9 +30,9 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Target/TargetCallingConv.h"
 #include "llvm/Target/TargetMachine.h"
@@ -51,6 +51,7 @@ namespace llvm {
   class MachineFunction;
   class MachineInstr;
   class MachineJumpTableInfo;
+  class MachineLoop;
   class Mangler;
   class MCContext;
   class MCExpr;
@@ -76,8 +77,8 @@ namespace llvm {
 /// This base class for TargetLowering contains the SelectionDAG-independent
 /// parts that can be used from the rest of CodeGen.
 class TargetLoweringBase {
-  TargetLoweringBase(const TargetLoweringBase&) LLVM_DELETED_FUNCTION;
-  void operator=(const TargetLoweringBase&) LLVM_DELETED_FUNCTION;
+  TargetLoweringBase(const TargetLoweringBase&) = delete;
+  void operator=(const TargetLoweringBase&) = delete;
 
 public:
   /// This enum indicates whether operations are valid for a target, and if not,
@@ -148,9 +149,6 @@ protected:
 public:
   const TargetMachine &getTargetMachine() const { return TM; }
   const DataLayout *getDataLayout() const { return DL; }
-  const TargetLoweringObjectFile &getObjFileLowering() const {
-    return *TM.getObjFileLowering();
-  }
 
   bool isBigEndian() const { return !IsLittleEndian; }
   bool isLittleEndian() const { return IsLittleEndian; }
@@ -216,6 +214,11 @@ public:
   /// several shifts, adds, and multiplies for this target.
   bool isIntDivCheap() const { return IntDivIsCheap; }
 
+  /// Return true if sqrt(x) is as cheap or cheaper than 1 / rsqrt(x)
+  bool isFsqrtCheap() const {
+    return FsqrtIsCheap;
+  }
+
   /// Returns true if target has indicated at least one type should be bypassed.
   bool isSlowDivBypassed() const { return !BypassSlowDivWidths.empty(); }
 
@@ -249,6 +252,16 @@ public:
     return true;
   }
 
+  /// \brief Return true if it is cheap to speculate a call to intrinsic cttz.
+  virtual bool isCheapToSpeculateCttz() const {
+    return false;
+  }
+  
+  /// \brief Return true if it is cheap to speculate a call to intrinsic ctlz.
+  virtual bool isCheapToSpeculateCtlz() const {
+    return false;
+  }
+
   /// \brief Return if the target supports combining a
   /// chain like:
   /// \code
@@ -264,6 +277,11 @@ public:
     return MaskAndBranchFoldingIsLegal;
   }
 
+  /// \brief Return true if the target wants to use the optimization that
+  /// turns ext(promotableInst1(...(promotableInstN(load)))) into
+  /// promotedInst1(...(promotedInstN(ext(load)))).
+  bool enableExtLdPromotion() const { return EnableExtLdPromotion; }
+
   /// Return true if the target can combine store(extractelement VectorTy,
   /// Idx).
   /// \p Cost[out] gives the cost of that transformation when this is true.
@@ -541,18 +559,27 @@ public:
   /// Return how this load with extension should be treated: either it is legal,
   /// needs to be promoted to a larger size, needs to be expanded to some other
   /// code sequence, or the target has a custom expander for it.
-  LegalizeAction getLoadExtAction(unsigned ExtType, EVT VT) const {
-    if (VT.isExtended()) return Expand;
-    unsigned I = (unsigned) VT.getSimpleVT().SimpleTy;
-    assert(ExtType < ISD::LAST_LOADEXT_TYPE && I < MVT::LAST_VALUETYPE &&
-           "Table isn't big enough!");
-    return (LegalizeAction)LoadExtActions[I][ExtType];
+  LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const {
+    if (ValVT.isExtended() || MemVT.isExtended()) return Expand;
+    unsigned ValI = (unsigned) ValVT.getSimpleVT().SimpleTy;
+    unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy;
+    assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValI < MVT::LAST_VALUETYPE &&
+           MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!");
+    return (LegalizeAction)LoadExtActions[ValI][MemI][ExtType];
   }
 
   /// Return true if the specified load with extension is legal on this target.
-  bool isLoadExtLegal(unsigned ExtType, EVT VT) const {
-    return VT.isSimple() &&
-      getLoadExtAction(ExtType, VT.getSimpleVT()) == Legal;
+  bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const {
+    return ValVT.isSimple() && MemVT.isSimple() &&
+      getLoadExtAction(ExtType, ValVT, MemVT) == Legal;
+  }
+
+  /// Return true if the specified load with extension is legal or custom
+  /// on this target.
+  bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const {
+    return ValVT.isSimple() && MemVT.isSimple() &&
+      (getLoadExtAction(ExtType, ValVT, MemVT) == Legal ||
+       getLoadExtAction(ExtType, ValVT, MemVT) == Custom);
   }
 
   /// Return how this store with truncation should be treated: either it is
@@ -579,7 +606,7 @@ public:
   /// sequence, or the target has a custom expander for it.
   LegalizeAction
   getIndexedLoadAction(unsigned IdxMode, MVT VT) const {
-    assert(IdxMode < ISD::LAST_INDEXED_MODE && VT < MVT::LAST_VALUETYPE &&
+    assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
            "Table isn't big enough!");
     unsigned Ty = (unsigned)VT.SimpleTy;
     return (LegalizeAction)((IndexedModeActions[Ty][IdxMode] & 0xf0) >> 4);
@@ -597,7 +624,7 @@ public:
   /// sequence, or the target has a custom expander for it.
   LegalizeAction
   getIndexedStoreAction(unsigned IdxMode, MVT VT) const {
-    assert(IdxMode < ISD::LAST_INDEXED_MODE && VT < MVT::LAST_VALUETYPE &&
+    assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
            "Table isn't big enough!");
     unsigned Ty = (unsigned)VT.SimpleTy;
     return (LegalizeAction)(IndexedModeActions[Ty][IdxMode] & 0x0f);
@@ -753,6 +780,16 @@ public:
   /// reduce runtime.
   virtual bool ShouldShrinkFPConstant(EVT) const { return true; }
 
+  // Return true if it is profitable to reduce the given load node to a smaller
+  // type.
+  //
+  // e.g. (i16 (trunc (i32 (load x))) -> i16 load x should be performed
+  virtual bool shouldReduceLoadWidth(SDNode *Load,
+                                     ISD::LoadExtType ExtTy,
+                                     EVT NewVT) const {
+    return true;
+  }
+
   /// When splitting a value of the specified type into parts, does the Lo
   /// or Hi part come first?  This usually follows the endianness, except
   /// for ppcf128, where the Hi part always comes first.
@@ -904,7 +941,7 @@ public:
   }
 
   /// Return the preferred loop alignment.
-  unsigned getPrefLoopAlignment() const {
+  virtual unsigned getPrefLoopAlignment(MachineLoop *ML = nullptr) const {
     return PrefLoopAlignment;
   }
 
@@ -922,12 +959,6 @@ public:
     return false;
   }
 
-  /// Returns the maximal possible offset which can be used for loads / stores
-  /// from the global.
-  virtual unsigned getMaximalGlobalOffset() const {
-    return 0;
-  }
-
   /// Returns true if a cast between SrcAS and DestAS is a noop.
   virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
     return false;
@@ -1057,10 +1088,6 @@ public:
   // TargetLowering Configuration Methods - These methods should be invoked by
   // the derived class constructor to configure this object for the target.
   //
-
-  /// \brief Reset the operation actions based on target options.
-  virtual void resetOperationActions() {}
-
 protected:
   /// Specify how the target extends the result of integer and floating point
   /// boolean values from i1 to a wider type.  See getBooleanContents.
@@ -1156,7 +1183,11 @@ protected:
   /// possible, should be replaced by an alternate sequence of instructions not
   /// containing an integer divide.
   void setIntDivIsCheap(bool isCheap = true) { IntDivIsCheap = isCheap; }
-  
+
+  /// Tells the code generator that fsqrt is cheap, and should not be replaced
+  /// with an alternative sequence of instructions.
+  void setFsqrtIsCheap(bool isCheap = true) { FsqrtIsCheap = isCheap; }
+
   /// Tells the code generator that this target supports floating point
   /// exceptions and cares about preserving floating point exception behavior.
   void setHasFloatingPointExceptions(bool FPExceptions = true) {
@@ -1194,12 +1225,12 @@ protected:
 
   /// Return the largest legal super-reg register class of the register class
   /// for the specified type and its associated "cost".
-  virtual std::pair<const TargetRegisterClass*, uint8_t>
-  findRepresentativeClass(MVT VT) const;
+  virtual std::pair<const TargetRegisterClass *, uint8_t>
+  findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const;
 
   /// Once all of the register classes are added, this allows us to compute
   /// derived properties we expose.
-  void computeRegisterProperties();
+  void computeRegisterProperties(const TargetRegisterInfo *TRI);
 
   /// Indicate that the specified operation does not work with the specified
   /// type and indicate what to do about it.
@@ -1211,19 +1242,18 @@ protected:
 
   /// Indicate that the specified load with extension does not work with the
   /// specified type and indicate what to do about it.
-  void setLoadExtAction(unsigned ExtType, MVT VT,
+  void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT,
                         LegalizeAction Action) {
-    assert(ExtType < ISD::LAST_LOADEXT_TYPE && VT < MVT::LAST_VALUETYPE &&
-           "Table isn't big enough!");
-    LoadExtActions[VT.SimpleTy][ExtType] = (uint8_t)Action;
+    assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValVT.isValid() &&
+           MemVT.isValid() && "Table isn't big enough!");
+    LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy][ExtType] = (uint8_t)Action;
   }
 
   /// Indicate that the specified truncating store does not work with the
   /// specified type and indicate what to do about it.
   void setTruncStoreAction(MVT ValVT, MVT MemVT,
                            LegalizeAction Action) {
-    assert(ValVT < MVT::LAST_VALUETYPE && MemVT < MVT::LAST_VALUETYPE &&
-           "Table isn't big enough!");
+    assert(ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!");
     TruncStoreActions[ValVT.SimpleTy][MemVT.SimpleTy] = (uint8_t)Action;
   }
 
@@ -1234,7 +1264,7 @@ protected:
   /// TargetLowering.cpp
   void setIndexedLoadAction(unsigned IdxMode, MVT VT,
                             LegalizeAction Action) {
-    assert(VT < MVT::LAST_VALUETYPE && IdxMode < ISD::LAST_INDEXED_MODE &&
+    assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
            (unsigned)Action < 0xf && "Table isn't big enough!");
     // Load action are kept in the upper half.
     IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0xf0;
@@ -1248,7 +1278,7 @@ protected:
   /// TargetLowering.cpp
   void setIndexedStoreAction(unsigned IdxMode, MVT VT,
                              LegalizeAction Action) {
-    assert(VT < MVT::LAST_VALUETYPE && IdxMode < ISD::LAST_INDEXED_MODE &&
+    assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
            (unsigned)Action < 0xf && "Table isn't big enough!");
     // Store action are kept in the lower half.
     IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0x0f;
@@ -1259,8 +1289,7 @@ protected:
   /// target and indicate what to do about it.
   void setCondCodeAction(ISD::CondCode CC, MVT VT,
                          LegalizeAction Action) {
-    assert(VT < MVT::LAST_VALUETYPE &&
-           (unsigned)CC < array_lengthof(CondCodeActions) &&
+    assert(VT.isValid() && (unsigned)CC < array_lengthof(CondCodeActions) &&
            "Table isn't big enough!");
     /// The lower 5 bits of the SimpleTy index into Nth 2bit set from the 32-bit
     /// value and the upper 27 bits index into the second dimension of the array
@@ -1311,7 +1340,8 @@ protected:
 
   /// Set the target's preferred loop alignment. Default alignment is zero, it
   /// means the target does not care about loop alignment.  The alignment is
-  /// specified in log2(bytes).
+  /// specified in log2(bytes). The target may also override
+  /// getPrefLoopAlignment to provide per-loop values.
   void setPrefLoopAlignment(unsigned Align) {
     PrefLoopAlignment = Align;
   }
@@ -1420,6 +1450,8 @@ public:
     return false;
   }
 
+  virtual bool isProfitableToHoist(Instruction *I) const { return true; }
+
   /// Return true if any actual instruction that defines a value of type Ty1
   /// implicitly zero-extends the value to Ty2 in the result register.
   ///
@@ -1474,6 +1506,18 @@ public:
     return isZExtFree(Val.getValueType(), VT2);
   }
 
+  /// Return true if an fpext operation is free (for instance, because
+  /// single-precision floating-point numbers are implicitly extended to
+  /// double-precision).
+  virtual bool isFPExtFree(EVT VT) const {
+    assert(VT.isFloatingPoint());
+    return false;
+  }
+
+  /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+  /// extend node) is profitable.
+  virtual bool isVectorLoadExtDesirable(SDValue ExtVal) const { return false; }
+
   /// Return true if an fneg operation is free to the point where it is never
   /// worthwhile to replace it with a bitwise operation.
   virtual bool isFNegFree(EVT VT) const {
@@ -1516,6 +1560,15 @@ public:
                                                  Type *Ty) const {
     return false;
   }
+
+  /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+  /// with this index. This is needed because EXTRACT_SUBVECTOR usually
+  /// has custom lowering that depends on the index of the first element,
+  /// and only the target knows which lowering is cheap.
+  virtual bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   // Runtime Library hooks
   //
@@ -1582,6 +1635,9 @@ private:
   /// unconditionally.
   bool IntDivIsCheap;
 
+  // Don't expand fsqrt with an approximation based on the inverse sqrt.
+  bool FsqrtIsCheap;
+
   /// Tells the code generator to bypass slow divide or remainder
   /// instructions. For example, BypassSlowDivWidths[32,8] tells the code
   /// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer
@@ -1703,7 +1759,8 @@ private:
   /// For each load extension type and each value type, keep a LegalizeAction
   /// that indicates how instruction selection should deal with a load of a
   /// specific value type and extension type.
-  uint8_t LoadExtActions[MVT::LAST_VALUETYPE][ISD::LAST_LOADEXT_TYPE];
+  uint8_t LoadExtActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE]
+                        [ISD::LAST_LOADEXT_TYPE];
 
   /// For each value type pair keep a LegalizeAction that indicates whether a
   /// truncating store of a specific value type and truncating type is legal.
@@ -1727,136 +1784,8 @@ private:
 
   ValueTypeActionImpl ValueTypeActions;
 
-public:
-  LegalizeKind
-  getTypeConversion(LLVMContext &Context, EVT VT) const {
-    // If this is a simple type, use the ComputeRegisterProp mechanism.
-    if (VT.isSimple()) {
-      MVT SVT = VT.getSimpleVT();
-      assert((unsigned)SVT.SimpleTy < array_lengthof(TransformToType));
-      MVT NVT = TransformToType[SVT.SimpleTy];
-      LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);
-
-      assert(
-        (LA == TypeLegal || LA == TypeSoftenFloat ||
-         ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger)
-         && "Promote may not follow Expand or Promote");
-
-      if (LA == TypeSplitVector)
-        return LegalizeKind(LA, EVT::getVectorVT(Context,
-                                                 SVT.getVectorElementType(),
-                                                 SVT.getVectorNumElements()/2));
-      if (LA == TypeScalarizeVector)
-        return LegalizeKind(LA, SVT.getVectorElementType());
-      return LegalizeKind(LA, NVT);
-    }
-
-    // Handle Extended Scalar Types.
-    if (!VT.isVector()) {
-      assert(VT.isInteger() && "Float types must be simple");
-      unsigned BitSize = VT.getSizeInBits();
-      // First promote to a power-of-two size, then expand if necessary.
-      if (BitSize < 8 || !isPowerOf2_32(BitSize)) {
-        EVT NVT = VT.getRoundIntegerType(Context);
-        assert(NVT != VT && "Unable to round integer VT");
-        LegalizeKind NextStep = getTypeConversion(Context, NVT);
-        // Avoid multi-step promotion.
-        if (NextStep.first == TypePromoteInteger) return NextStep;
-        // Return rounded integer type.
-        return LegalizeKind(TypePromoteInteger, NVT);
-      }
-
-      return LegalizeKind(TypeExpandInteger,
-                          EVT::getIntegerVT(Context, VT.getSizeInBits()/2));
-    }
-
-    // Handle vector types.
-    unsigned NumElts = VT.getVectorNumElements();
-    EVT EltVT = VT.getVectorElementType();
-
-    // Vectors with only one element are always scalarized.
-    if (NumElts == 1)
-      return LegalizeKind(TypeScalarizeVector, EltVT);
-
-    // Try to widen vector elements until the element type is a power of two and
-    // promote it to a legal type later on, for example:
-    // <3 x i8> -> <4 x i8> -> <4 x i32>
-    if (EltVT.isInteger()) {
-      // Vectors with a number of elements that is not a power of two are always
-      // widened, for example <3 x i8> -> <4 x i8>.
-      if (!VT.isPow2VectorType()) {
-        NumElts = (unsigned)NextPowerOf2(NumElts);
-        EVT NVT = EVT::getVectorVT(Context, EltVT, NumElts);
-        return LegalizeKind(TypeWidenVector, NVT);
-      }
-
-      // Examine the element type.
-      LegalizeKind LK = getTypeConversion(Context, EltVT);
-
-      // If type is to be expanded, split the vector.
-      //  <4 x i140> -> <2 x i140>
-      if (LK.first == TypeExpandInteger)
-        return LegalizeKind(TypeSplitVector,
-                            EVT::getVectorVT(Context, EltVT, NumElts / 2));
-
-      // Promote the integer element types until a legal vector type is found
-      // or until the element integer type is too big. If a legal type was not
-      // found, fallback to the usual mechanism of widening/splitting the
-      // vector.
-      EVT OldEltVT = EltVT;
-      while (1) {
-        // Increase the bitwidth of the element to the next pow-of-two
-        // (which is greater than 8 bits).
-        EltVT = EVT::getIntegerVT(Context, 1 + EltVT.getSizeInBits()
-                                 ).getRoundIntegerType(Context);
-
-        // Stop trying when getting a non-simple element type.
-        // Note that vector elements may be greater than legal vector element
-        // types. Example: X86 XMM registers hold 64bit element on 32bit
-        // systems.
-        if (!EltVT.isSimple()) break;
-
-        // Build a new vector type and check if it is legal.
-        MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
-        // Found a legal promoted vector type.
-        if (NVT != MVT() && ValueTypeActions.getTypeAction(NVT) == TypeLegal)
-          return LegalizeKind(TypePromoteInteger,
-                              EVT::getVectorVT(Context, EltVT, NumElts));
-      }
-
-      // Reset the type to the unexpanded type if we did not find a legal vector
-      // type with a promoted vector element type.
-      EltVT = OldEltVT;
-    }
-
-    // Try to widen the vector until a legal type is found.
-    // If there is no wider legal type, split the vector.
-    while (1) {
-      // Round up to the next power of 2.
-      NumElts = (unsigned)NextPowerOf2(NumElts);
-
-      // If there is no simple vector type with this many elements then there
-      // cannot be a larger legal vector type.  Note that this assumes that
-      // there are no skipped intermediate vector types in the simple types.
-      if (!EltVT.isSimple()) break;
-      MVT LargerVector = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
-      if (LargerVector == MVT()) break;
-
-      // If this type is legal then widen the vector.
-      if (ValueTypeActions.getTypeAction(LargerVector) == TypeLegal)
-        return LegalizeKind(TypeWidenVector, LargerVector);
-    }
-
-    // Widen odd vectors to next power of two.
-    if (!VT.isPow2VectorType()) {
-      EVT NVT = VT.getPow2VectorType(Context);
-      return LegalizeKind(TypeWidenVector, NVT);
-    }
-
-    // Vectors with illegal element types are expanded.
-    EVT NVT = EVT::getVectorVT(Context, EltVT, VT.getVectorNumElements() / 2);
-    return LegalizeKind(TypeSplitVector, NVT);
-  }
+private:
+  LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const;
 
 private:
   std::vector<std::pair<MVT, const TargetRegisterClass*> > AvailableRegClasses;
@@ -1944,6 +1873,9 @@ protected:
   /// a mask of a single bit, a compare, and a branch into a single instruction.
   bool MaskAndBranchFoldingIsLegal;
 
+  /// \see enableExtLdPromotion.
+  bool EnableExtLdPromotion;
+
 protected:
   /// Return true if the value types that can be represented by the specified
   /// register class are all legal.
@@ -1960,8 +1892,8 @@ protected:
 /// This class also defines callbacks that targets must implement to lower
 /// target-specific constructs to SelectionDAG operators.
 class TargetLowering : public TargetLoweringBase {
-  TargetLowering(const TargetLowering&) LLVM_DELETED_FUNCTION;
-  void operator=(const TargetLowering&) LLVM_DELETED_FUNCTION;
+  TargetLowering(const TargetLowering&) = delete;
+  void operator=(const TargetLowering&) = delete;
 
 public:
   /// NOTE: The TargetMachine owns TLOF.
@@ -2112,8 +2044,7 @@ public:
 
     void AddToWorklist(SDNode *N);
     void RemoveFromWorklist(SDNode *N);
-    SDValue CombineTo(SDNode *N, const std::vector<SDValue> &To,
-                      bool AddTo = true);
+    SDValue CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo = true);
     SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true);
     SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true);
 
@@ -2252,6 +2183,7 @@ public:
     SelectionDAG &DAG;
     SDLoc DL;
     ImmutableCallSite *CS;
+    bool IsPatchPoint;
     SmallVector<ISD::OutputArg, 32> Outs;
     SmallVector<SDValue, 32> OutVals;
     SmallVector<ISD::InputArg, 32> Ins;
@@ -2260,7 +2192,7 @@ public:
       : RetTy(nullptr), RetSExt(false), RetZExt(false), IsVarArg(false),
         IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true),
         IsTailCall(false), NumFixedArgs(-1), CallConv(CallingConv::C),
-        DAG(DAG), CS(nullptr) {}
+        DAG(DAG), CS(nullptr), IsPatchPoint(false) {}
 
     CallLoweringInfo &setDebugLoc(SDLoc dl) {
       DL = dl;
@@ -2342,6 +2274,11 @@ public:
       return *this;
     }
 
+    CallLoweringInfo &setIsPatchPoint(bool Value = true) {
+      IsPatchPoint = Value;
+      return *this;
+    }
+
     ArgListTy &getArgs() {
       return Args;
     }
@@ -2588,7 +2525,8 @@ public:
   /// specific constraints and their prefixes, and also tie in the associated
   /// operand values.  If this returns an empty vector, and if the constraint
   /// string itself isn't empty, there was an error parsing.
-  virtual AsmOperandInfoVector ParseConstraints(ImmutableCallSite CS) const;
+  virtual AsmOperandInfoVector ParseConstraints(const TargetRegisterInfo *TRI,
+                                                ImmutableCallSite CS) const;
 
   /// Examine constraint type and operand type and determine a weight value.
   /// The operand object must already have been set up with the operand type.
@@ -2619,10 +2557,10 @@ public:
   /// pointer.
   ///
   /// This should only be used for C_Register constraints.  On error, this
-  /// returns a register number of 0 and a null register class pointer..
-  virtual std::pair<unsigned, const TargetRegisterClass*>
-    getRegForInlineAsmConstraint(const std::string &Constraint,
-                                 MVT VT) const;
+  /// returns a register number of 0 and a null register class pointer.
+  virtual std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               const std::string &Constraint, MVT VT) const;
 
   /// Try to replace an X constraint, which matches anything, with another that
   /// has more specific requirements based on the type of the corresponding
@@ -2652,6 +2590,12 @@ public:
     return SDValue();
   }
 
+  /// Indicate whether this target prefers to combine the given number of FDIVs
+  /// with the same divisor.
+  virtual bool combineRepeatedFPDivisors(unsigned NumUsers) const {
+    return false;
+  }
+
   /// Hooks for building estimates in place of slower divisions and square
   /// roots.
   
diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h
index 7fcb171..57c2606 100644
--- a/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/include/llvm/Target/TargetLoweringObjectFile.h
@@ -38,13 +38,17 @@ class TargetLoweringObjectFile : public MCObjectFileInfo {
   const DataLayout *DL;
 
   TargetLoweringObjectFile(
-    const TargetLoweringObjectFile&) LLVM_DELETED_FUNCTION;
-  void operator=(const TargetLoweringObjectFile&) LLVM_DELETED_FUNCTION;
+    const TargetLoweringObjectFile&) = delete;
+  void operator=(const TargetLoweringObjectFile&) = delete;
+
+protected:
+  bool SupportIndirectSymViaGOTPCRel;
 
 public:
   MCContext &getContext() const { return *Ctx; }
 
-  TargetLoweringObjectFile() : MCObjectFileInfo(), Ctx(nullptr), DL(nullptr) {}
+  TargetLoweringObjectFile() : MCObjectFileInfo(), Ctx(nullptr), DL(nullptr),
+                               SupportIndirectSymViaGOTPCRel(false) {}
 
   virtual ~TargetLoweringObjectFile();
 
@@ -94,6 +98,13 @@ public:
     return SectionForGlobal(GV, getKindForGlobal(GV, TM), Mang, TM);
   }
 
+  virtual const MCSection *
+  getSectionForJumpTable(const Function &F, Mangler &Mang,
+                         const TargetMachine &TM) const;
+
+  virtual bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference,
+                                                   const Function &F) const;
+
   /// Targets should implement this method to assign a section to globals with
   /// an explicit section specfied. The implementation of this method can
   /// assume that GV->hasSection() is true.
@@ -151,11 +162,17 @@ public:
     return nullptr;
   }
 
-  /// \brief True if the section is atomized using the symbols in it.
-  /// This is false if the section is not atomized at all (most ELF sections) or
-  /// if it is atomized based on its contents (MachO' __TEXT,__cstring for
-  /// example).
-  virtual bool isSectionAtomizableBySymbols(const MCSection &Section) const;
+  /// \brief Target supports replacing a data "PC"-relative access to a symbol
+  /// through another symbol, by accessing the later via a GOT entry instead?
+  bool supportIndirectSymViaGOTPCRel() const {
+    return SupportIndirectSymViaGOTPCRel;
+  }
+
+  /// \brief Get the target specific PC relative GOT entry relocation
+  virtual const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+                                                  int64_t Offset) const {
+    return nullptr;
+  }
 
 protected:
   virtual const MCSection *
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index a4f95c0..cdf643d 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -34,14 +34,14 @@ class Target;
 class DataLayout;
 class TargetLibraryInfo;
 class TargetFrameLowering;
+class TargetIRAnalysis;
 class TargetIntrinsicInfo;
 class TargetLowering;
 class TargetPassConfig;
 class TargetRegisterInfo;
 class TargetSelectionDAGInfo;
 class TargetSubtargetInfo;
-class ScalarTargetTransformInfo;
-class VectorTargetTransformInfo;
+class TargetTransformInfo;
 class formatted_raw_ostream;
 class raw_ostream;
 class TargetLoweringObjectFile;
@@ -59,8 +59,8 @@ using legacy::PassManagerBase;
 /// through this interface.
 ///
 class TargetMachine {
-  TargetMachine(const TargetMachine &) LLVM_DELETED_FUNCTION;
-  void operator=(const TargetMachine &) LLVM_DELETED_FUNCTION;
+  TargetMachine(const TargetMachine &) = delete;
+  void operator=(const TargetMachine &) = delete;
 protected: // Can only create subclasses.
   TargetMachine(const Target &T, StringRef TargetTriple,
                 StringRef CPU, StringRef FS, const TargetOptions &Options);
@@ -113,10 +113,16 @@ public:
   template<typename STC> const STC &getSubtarget() const {
     return *static_cast<const STC*>(getSubtargetImpl());
   }
-  template <typename STC> const STC &getSubtarget(const Function *) const {
+  template <typename STC> const STC &getSubtarget(const Function &) const {
     return *static_cast<const STC*>(getSubtargetImpl());
   }
 
+  /// getDataLayout - This method returns a pointer to the DataLayout for
+  /// the target. It should be unchanging for every subtarget.
+  virtual const DataLayout *getDataLayout() const {
+    return nullptr;
+  }
+
   /// \brief Reset the target options based on the function's attributes.
   // FIXME: Remove TargetOptions that affect per-function code generation
   // from TargetMachine.
@@ -159,31 +165,32 @@ public:
 
   bool shouldPrintMachineCode() const { return Options.PrintMachineCode; }
 
-  /// getAsmVerbosityDefault - Returns the default value of asm verbosity.
+  /// Returns the default value of asm verbosity.
   ///
-  bool getAsmVerbosityDefault() const ;
-
-  /// setAsmVerbosityDefault - Set the default value of asm verbosity. Default
-  /// is false.
-  void setAsmVerbosityDefault(bool);
-
-  /// getDataSections - Return true if data objects should be emitted into their
-  /// own section, corresponds to -fdata-sections.
-  bool getDataSections() const;
+  bool getAsmVerbosityDefault() const {
+    return Options.MCOptions.AsmVerbose;
+  }
 
-  /// getFunctionSections - Return true if functions should be emitted into
-  /// their own section, corresponding to -ffunction-sections.
-  bool getFunctionSections() const;
+  bool getUniqueSectionNames() const { return Options.UniqueSectionNames; }
 
-  /// setDataSections - Set if the data are emit into separate sections.
-  void setDataSections(bool);
+  /// Return true if data objects should be emitted into their own section,
+  /// corresponds to -fdata-sections.
+  bool getDataSections() const {
+    return Options.DataSections;
+  }
 
-  /// setFunctionSections - Set if the functions are emit into separate
-  /// sections.
-  void setFunctionSections(bool);
+  /// Return true if functions should be emitted into their own section,
+  /// corresponding to -ffunction-sections.
+  bool getFunctionSections() const {
+    return Options.FunctionSections;
+  }
 
-  /// \brief Register analysis passes for this target with a pass manager.
-  virtual void addAnalysisPasses(PassManagerBase &) {}
+  /// \brief Get a \c TargetIRAnalysis appropriate for the target.
+  ///
+  /// This is used to construct the new pass manager's target IR analysis pass,
+  /// set up appropriately for this target machine. Even the old pass manager
+  /// uses this to answer queries about the IR.
+  virtual TargetIRAnalysis getTargetIRAnalysis();
 
   /// CodeGenFileType - These enums are meant to be passed into
   /// addPassesToEmitFile to indicate what type of file to emit, and returned by
@@ -236,10 +243,11 @@ protected: // Can only create subclasses.
 
   void initAsmInfo();
 public:
-  /// \brief Register analysis passes for this target with a pass manager.
+  /// \brief Get a TargetIRAnalysis implementation for the target.
   ///
-  /// This registers target independent analysis passes.
-  void addAnalysisPasses(PassManagerBase &PM) override;
+  /// This analysis will produce a TTI result which uses the common code
+  /// generator to answer queries about the IR.
+  TargetIRAnalysis getTargetIRAnalysis() override;
 
   /// createPassConfig - Create a pass configuration object to be used by
   /// addPassToEmitX methods for generating a pipeline of CodeGen passes.
diff --git a/include/llvm/Target/TargetOpcodes.h b/include/llvm/Target/TargetOpcodes.h
index 1fbd2ae..afc2236 100644
--- a/include/llvm/Target/TargetOpcodes.h
+++ b/include/llvm/Target/TargetOpcodes.h
@@ -110,7 +110,18 @@ enum {
   /// to prevent the stack guard value or address from being spilled to the
   /// stack should override TargetLowering::emitLoadStackGuardNode and
   /// additionally expand this pseudo after register allocation.
-  LOAD_STACK_GUARD = 19
+  LOAD_STACK_GUARD = 19,
+
+  /// Call instruction with associated vm state for deoptimization and list
+  /// of live pointers for relocation by the garbage collector.  It is
+  /// intended to support garbage collection with fully precise relocating
+  /// collectors and deoptimizations in either the callee or caller.
+  STATEPOINT = 20,
+
+  /// Instruction that records the offset of a function's frame allocation in a
+  /// label. Created by the llvm.frameallocate intrinsic. It has two arguments:
+  /// the symbol for the label and the frame index of the stack allocation.
+  FRAME_ALLOC = 21,
 };
 } // end namespace TargetOpcode
 } // end namespace llvm
diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h
index 73014d8..f447fd6 100644
--- a/include/llvm/Target/TargetOptions.h
+++ b/include/llvm/Target/TargetOptions.h
@@ -78,8 +78,8 @@ namespace llvm {
           EnableFastISel(false), PositionIndependentExecutable(false),
           UseInitArray(false), DisableIntegratedAS(false),
           CompressDebugSections(false), FunctionSections(false),
-          DataSections(false), TrapUnreachable(false), TrapFuncName(),
-          FloatABIType(FloatABI::Default),
+          DataSections(false), UniqueSectionNames(true), TrapUnreachable(false),
+          TrapFuncName(), FloatABIType(FloatABI::Default),
           AllowFPOpFusion(FPOpFusion::Standard), JTType(JumpTable::Single),
           FCFI(false), ThreadModel(ThreadModel::POSIX),
           CFIType(CFIntegrity::Sub), CFIEnforcing(false), CFIFuncName() {}
@@ -198,6 +198,8 @@ namespace llvm {
     /// Emit data into separate sections.
     unsigned DataSections : 1;
 
+    unsigned UniqueSectionNames : 1;
+
     /// Emit target-specific trap instruction for 'unreachable' IR instructions.
     unsigned TrapUnreachable : 1;
 
@@ -288,6 +290,12 @@ inline bool operator==(const TargetOptions &LHS,
     ARE_EQUAL(TrapFuncName) &&
     ARE_EQUAL(FloatABIType) &&
     ARE_EQUAL(AllowFPOpFusion) &&
+    ARE_EQUAL(JTType) &&
+    ARE_EQUAL(FCFI) &&
+    ARE_EQUAL(ThreadModel) &&
+    ARE_EQUAL(CFIType) &&
+    ARE_EQUAL(CFIEnforcing) &&
+    ARE_EQUAL(CFIFuncName) &&
     ARE_EQUAL(MCOptions);
 #undef ARE_EQUAL
 }
diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h
index 16b72a9..fc94a84 100644
--- a/include/llvm/Target/TargetRegisterInfo.h
+++ b/include/llvm/Target/TargetRegisterInfo.h
@@ -45,6 +45,7 @@ public:
   const vt_iterator VTs;
   const uint32_t *SubClassMask;
   const uint16_t *SuperRegIndices;
+  const unsigned LaneMask;
   const sc_iterator SuperClasses;
   ArrayRef<MCPhysReg> (*OrderFunc)(const MachineFunction&);
 
@@ -190,6 +191,13 @@ public:
   ArrayRef<MCPhysReg> getRawAllocationOrder(const MachineFunction &MF) const {
     return OrderFunc ? OrderFunc(MF) : makeArrayRef(begin(), getNumRegs());
   }
+
+  /// Returns the combination of all lane masks of register in this class.
+  /// The lane masks of the registers are the combination of all lane masks
+  /// of their subregisters.
+  unsigned getLaneMask() const {
+    return LaneMask;
+  }
 };
 
 /// TargetRegisterInfoDesc - Extra information, not in MCRegisterDesc, about
@@ -448,6 +456,11 @@ public:
   /// used by register scavenger to determine what registers are free.
   virtual BitVector getReservedRegs(const MachineFunction &MF) const = 0;
 
+  /// Prior to adding the live-out mask to a stackmap or patchpoint
+  /// instruction, provide the target the opportunity to adjust it (mainly to
+  /// remove pseudo-registers that should be ignored).
+  virtual void adjustStackMapLiveOutMask(uint32_t *Mask) const { }
+
   /// getMatchingSuperReg - Return a super-register of the specified register
   /// Reg so its sub-register of index SubIdx is Reg.
   unsigned getMatchingSuperReg(unsigned Reg, unsigned SubIdx,
@@ -502,6 +515,15 @@ public:
     return composeSubRegIndicesImpl(a, b);
   }
 
+  /// Transforms a LaneMask computed for one subregister to the lanemask that
+  /// would have been computed when composing the subsubregisters with IdxA
+  /// first. @sa composeSubRegIndices()
+  unsigned composeSubRegIndexLaneMask(unsigned IdxA, unsigned LaneMask) const {
+    if (!IdxA)
+      return LaneMask;
+    return composeSubRegIndexLaneMaskImpl(IdxA, LaneMask);
+  }
+
   /// Debugging helper: dump register in human readable form to dbgs() stream.
   static void dumpReg(unsigned Reg, unsigned SubRegIndex = 0,
                       const TargetRegisterInfo* TRI = nullptr);
@@ -512,6 +534,12 @@ protected:
     llvm_unreachable("Target has no sub-registers");
   }
 
+  /// Overridden by TableGen in targets that have sub-registers.
+  virtual unsigned
+  composeSubRegIndexLaneMaskImpl(unsigned, unsigned) const {
+    llvm_unreachable("Target has no sub-registers");
+  }
+
 public:
   /// getCommonSuperRegClass - Find a common super-register class if it exists.
   ///
@@ -666,13 +694,13 @@ public:
     return false;
   }
 
-  /// UpdateRegAllocHint - A callback to allow target a chance to update
+  /// updateRegAllocHint - A callback to allow target a chance to update
   /// register allocation hints when a register is "changed" (e.g. coalesced)
   /// to another register. e.g. On ARM, some virtual registers should target
   /// register pairs, if one of pair is coalesced to another register, the
   /// allocation hint of the other half of the pair should be changed to point
   /// to the new register.
-  virtual void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+  virtual void updateRegAllocHint(unsigned Reg, unsigned NewReg,
                                   MachineFunction &MF) const {
     // Do nothing.
   }
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index f63afd7..d297162 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -188,6 +188,22 @@ def SDTIStore : SDTypeProfile<1, 3, [       // indexed store
   SDTCisSameAs<0, 2>, SDTCisPtrTy<0>, SDTCisPtrTy<3>
 ]>;
 
+def SDTMaskedStore: SDTypeProfile<0, 3, [       // masked store
+  SDTCisPtrTy<0>, SDTCisVec<1>, SDTCisVec<2>
+]>;
+
+def SDTMaskedLoad: SDTypeProfile<1, 3, [       // masked load
+  SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>
+]>;
+
+def SDTMaskedGather: SDTypeProfile<1, 3, [       // masked gather
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVec<2>
+]>;
+
+def SDTMaskedScatter: SDTypeProfile<1, 3, [       // masked scatter
+  SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>
+]>;
+
 def SDTVecShuffle : SDTypeProfile<1, 2, [
   SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
 ]>;
@@ -363,6 +379,7 @@ def zext       : SDNode<"ISD::ZERO_EXTEND", SDTIntExtendOp>;
 def anyext     : SDNode<"ISD::ANY_EXTEND" , SDTIntExtendOp>;
 def trunc      : SDNode<"ISD::TRUNCATE"   , SDTIntTruncOp>;
 def bitconvert : SDNode<"ISD::BITCAST"    , SDTUnaryOp>;
+def addrspacecast : SDNode<"ISD::ADDRSPACECAST", SDTUnaryOp>;
 def extractelt : SDNode<"ISD::EXTRACT_VECTOR_ELT", SDTVecExtract>;
 def insertelt  : SDNode<"ISD::INSERT_VECTOR_ELT", SDTVecInsert>;
 
@@ -372,6 +389,7 @@ def fmul       : SDNode<"ISD::FMUL"       , SDTFPBinOp, [SDNPCommutative]>;
 def fdiv       : SDNode<"ISD::FDIV"       , SDTFPBinOp>;
 def frem       : SDNode<"ISD::FREM"       , SDTFPBinOp>;
 def fma        : SDNode<"ISD::FMA"        , SDTFPTernaryOp>;
+def fmad       : SDNode<"ISD::FMAD"       , SDTFPTernaryOp>;
 def fabs       : SDNode<"ISD::FABS"       , SDTFPUnaryOp>;
 def fminnum    : SDNode<"ISD::FMINNUM"    , SDTFPBinOp>;
 def fmaxnum    : SDNode<"ISD::FMAXNUM"    , SDTFPBinOp>;
@@ -454,6 +472,15 @@ def atomic_load      : SDNode<"ISD::ATOMIC_LOAD", SDTAtomicLoad,
 def atomic_store     : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore,
                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+def masked_store : SDNode<"ISD::MSTORE",  SDTMaskedStore,
+                       [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def masked_load  : SDNode<"ISD::MLOAD",  SDTMaskedLoad,
+                       [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def masked_scatter : SDNode<"ISD::MSCATTER",  SDTMaskedScatter,
+                       [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def masked_gather  : SDNode<"ISD::MGATHER",  SDTMaskedGather,
+                       [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
 // Do not use ld, st directly. Use load, extload, sextload, zextload, store,
 // and truncst (see below).
 def ld         : SDNode<"ISD::LOAD"       , SDTLoad,
diff --git a/include/llvm/Target/TargetSelectionDAGInfo.h b/include/llvm/Target/TargetSelectionDAGInfo.h
index d1a3fcf..bacdd95 100644
--- a/include/llvm/Target/TargetSelectionDAGInfo.h
+++ b/include/llvm/Target/TargetSelectionDAGInfo.h
@@ -21,15 +21,14 @@
 namespace llvm {
 
 class DataLayout;
-class TargetMachine;
 
 //===----------------------------------------------------------------------===//
 /// TargetSelectionDAGInfo - Targets can subclass this to parameterize the
 /// SelectionDAG lowering and instruction selection process.
 ///
 class TargetSelectionDAGInfo {
-  TargetSelectionDAGInfo(const TargetSelectionDAGInfo &) LLVM_DELETED_FUNCTION;
-  void operator=(const TargetSelectionDAGInfo &) LLVM_DELETED_FUNCTION;
+  TargetSelectionDAGInfo(const TargetSelectionDAGInfo &) = delete;
+  void operator=(const TargetSelectionDAGInfo &) = delete;
 
   const DataLayout *DL;
 
diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h
index 80ff9e3..83ab4ec 100644
--- a/include/llvm/Target/TargetSubtargetInfo.h
+++ b/include/llvm/Target/TargetSubtargetInfo.h
@@ -42,8 +42,8 @@ template <typename T> class SmallVectorImpl;
 /// be exposed through a TargetSubtargetInfo-derived class.
 ///
 class TargetSubtargetInfo : public MCSubtargetInfo {
-  TargetSubtargetInfo(const TargetSubtargetInfo&) LLVM_DELETED_FUNCTION;
-  void operator=(const TargetSubtargetInfo&) LLVM_DELETED_FUNCTION;
+  TargetSubtargetInfo(const TargetSubtargetInfo&) = delete;
+  void operator=(const TargetSubtargetInfo&) = delete;
 protected: // Can only create subclasses...
   TargetSubtargetInfo();
 public:
@@ -71,7 +71,6 @@ public:
   virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const {
     return nullptr;
   }
-  virtual const DataLayout *getDataLayout() const { return nullptr; }
 
   /// getRegisterInfo - If register information is available, return it.  If
   /// not, return null.  This is kept separate from RegInfo until RegInfo has
@@ -168,6 +167,11 @@ public:
   virtual std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const {
     return nullptr;
   }
+
+  /// Enable tracking of subregister liveness in register allocator.
+  virtual bool enableSubRegLiveness() const {
+    return false;
+  }
 };
 
 } // End llvm namespace
diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h
index ce1a7d6..fbd999c 100644
--- a/include/llvm/Transforms/IPO.h
+++ b/include/llvm/Transforms/IPO.h
@@ -199,6 +199,10 @@ ModulePass *createMetaRenamerPass();
 /// manager.
 ModulePass *createBarrierNoopPass();
 
+/// \brief This pass lowers bitset metadata and the llvm.bitset.test intrinsic
+/// to bitsets.
+ModulePass *createLowerBitSetsPass();
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/IPO/LowerBitSets.h b/include/llvm/Transforms/IPO/LowerBitSets.h
new file mode 100644
index 0000000..0f60617
--- /dev/null
+++ b/include/llvm/Transforms/IPO/LowerBitSets.h
@@ -0,0 +1,153 @@
+//===- LowerBitSets.h - Bitset lowering pass --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines parts of the bitset lowering pass implementation that may
+// be usefully unit tested.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_IPO_LOWERBITSETS_H
+#define LLVM_TRANSFORMS_IPO_LOWERBITSETS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+
+#include <stdint.h>
+#include <limits>
+#include <set>
+#include <vector>
+
+namespace llvm {
+
+class DataLayout;
+class GlobalVariable;
+class Value;
+
+struct BitSetInfo {
+  // The actual bitset.
+  std::vector<uint8_t> Bits;
+
+  // The byte offset into the combined global represented by the bitset.
+  uint64_t ByteOffset;
+
+  // The size of the bitset in bits.
+  uint64_t BitSize;
+
+  // Log2 alignment of the bit set relative to the combined global.
+  // For example, a log2 alignment of 3 means that bits in the bitset
+  // represent addresses 8 bytes apart.
+  unsigned AlignLog2;
+
+  bool isSingleOffset() const {
+    return Bits.size() == 1 && Bits[0] == 1;
+  }
+
+  bool isAllOnes() const {
+    for (unsigned I = 0; I != Bits.size() - 1; ++I)
+      if (Bits[I] != 0xFF)
+        return false;
+
+    if (BitSize % 8 == 0)
+      return Bits[Bits.size() - 1] == 0xFF;
+
+    return Bits[Bits.size() - 1] == (1 << (BitSize % 8)) - 1;
+  }
+
+  bool containsGlobalOffset(uint64_t Offset) const;
+
+  bool containsValue(const DataLayout *DL,
+                     const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout,
+                     Value *V, uint64_t COffset = 0) const;
+
+};
+
+struct BitSetBuilder {
+  SmallVector<uint64_t, 16> Offsets;
+  uint64_t Min, Max;
+
+  BitSetBuilder() : Min(std::numeric_limits<uint64_t>::max()), Max(0) {}
+
+  void addOffset(uint64_t Offset) {
+    if (Min > Offset)
+      Min = Offset;
+    if (Max < Offset)
+      Max = Offset;
+
+    Offsets.push_back(Offset);
+  }
+
+  BitSetInfo build();
+};
+
+/// This class implements a layout algorithm for globals referenced by bit sets
+/// that tries to keep members of small bit sets together. This can
+/// significantly reduce bit set sizes in many cases.
+///
+/// It works by assembling fragments of layout from sets of referenced globals.
+/// Each set of referenced globals causes the algorithm to create a new
+/// fragment, which is assembled by appending each referenced global in the set
+/// into the fragment. If a referenced global has already been referenced by an
+/// fragment created earlier, we instead delete that fragment and append its
+/// contents into the fragment we are assembling.
+///
+/// By starting with the smallest fragments, we minimize the size of the
+/// fragments that are copied into larger fragments. This is most intuitively
+/// thought about when considering the case where the globals are virtual tables
+/// and the bit sets represent their derived classes: in a single inheritance
+/// hierarchy, the optimum layout would involve a depth-first search of the
+/// class hierarchy (and in fact the computed layout ends up looking a lot like
+/// a DFS), but a naive DFS would not work well in the presence of multiple
+/// inheritance. This aspect of the algorithm ends up fitting smaller
+/// hierarchies inside larger ones where that would be beneficial.
+///
+/// For example, consider this class hierarchy:
+///
+/// A       B
+///   \   / | \
+///     C   D   E
+///
+/// We have five bit sets: bsA (A, C), bsB (B, C, D, E), bsC (C), bsD (D) and
+/// bsE (E). If we laid out our objects by DFS traversing B followed by A, our
+/// layout would be {B, C, D, E, A}. This is optimal for bsB as it needs to
+/// cover the only 4 objects in its hierarchy, but not for bsA as it needs to
+/// cover 5 objects, i.e. the entire layout. Our algorithm proceeds as follows:
+///
+/// Add bsC, fragments {{C}}
+/// Add bsD, fragments {{C}, {D}}
+/// Add bsE, fragments {{C}, {D}, {E}}
+/// Add bsA, fragments {{A, C}, {D}, {E}}
+/// Add bsB, fragments {{B, A, C, D, E}}
+///
+/// This layout is optimal for bsA, as it now only needs to cover two (i.e. 3
+/// fewer) objects, at the cost of bsB needing to cover 1 more object.
+///
+/// The bit set lowering pass assigns an object index to each object that needs
+/// to be laid out, and calls addFragment for each bit set passing the object
+/// indices of its referenced globals. It then assembles a layout from the
+/// computed layout in the Fragments field.
+struct GlobalLayoutBuilder {
+  /// The computed layout. Each element of this vector contains a fragment of
+  /// layout (which may be empty) consisting of object indices.
+  std::vector<std::vector<uint64_t>> Fragments;
+
+  /// Mapping from object index to fragment index.
+  std::vector<uint64_t> FragmentMap;
+
+  GlobalLayoutBuilder(uint64_t NumObjects)
+      : Fragments(1), FragmentMap(NumObjects) {}
+
+  /// Add F to the layout while trying to keep its indices contiguous.
+  /// If a previously seen fragment uses any of F's indices, that
+  /// fragment will be laid out inside F.
+  void addFragment(const std::set<uint64_t> &F);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h
index b1426b4..65f4712 100644
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -19,7 +19,7 @@
 
 namespace llvm {
 class Pass;
-class TargetLibraryInfo;
+class TargetLibraryInfoImpl;
 class TargetMachine;
 
 // The old pass manager infrastructure is hidden in a legacy namespace now.
@@ -27,8 +27,6 @@ namespace legacy {
 class FunctionPassManager;
 class PassManagerBase;
 }
-using legacy::FunctionPassManager;
-using legacy::PassManagerBase;
 
 /// PassManagerBuilder - This class is used to set up a standard optimization
 /// sequence for languages like C and C++, allowing some APIs to customize the
@@ -59,7 +57,7 @@ public:
   /// Extensions are passed the builder itself (so they can see how it is
   /// configured) as well as the pass manager to add stuff to.
   typedef void (*ExtensionFn)(const PassManagerBuilder &Builder,
-                              PassManagerBase &PM);
+                              legacy::PassManagerBase &PM);
   enum ExtensionPointTy {
     /// EP_EarlyAsPossible - This extension point allows adding passes before
     /// any other transformations, allowing them to see the code as it is coming
@@ -105,7 +103,7 @@ public:
   /// LibraryInfo - Specifies information about the runtime library for the
   /// optimizer.  If this is non-null, it is added to both the function and
   /// per-module pass pipeline.
-  TargetLibraryInfo *LibraryInfo;
+  TargetLibraryInfoImpl *LibraryInfo;
 
   /// Inliner - Specifies the inliner to use.  If this is non-null, it is
   /// added to the per-module passes.
@@ -139,19 +137,20 @@ public:
   void addExtension(ExtensionPointTy Ty, ExtensionFn Fn);
 
 private:
-  void addExtensionsToPM(ExtensionPointTy ETy, PassManagerBase &PM) const;
-  void addInitialAliasAnalysisPasses(PassManagerBase &PM) const;
-  void addLTOOptimizationPasses(PassManagerBase &PM);
+  void addExtensionsToPM(ExtensionPointTy ETy,
+                         legacy::PassManagerBase &PM) const;
+  void addInitialAliasAnalysisPasses(legacy::PassManagerBase &PM) const;
+  void addLTOOptimizationPasses(legacy::PassManagerBase &PM);
 
 public:
   /// populateFunctionPassManager - This fills in the function pass manager,
   /// which is expected to be run on each function immediately as it is
   /// generated.  The idea is to reduce the size of the IR in memory.
-  void populateFunctionPassManager(FunctionPassManager &FPM);
+  void populateFunctionPassManager(legacy::FunctionPassManager &FPM);
 
   /// populateModulePassManager - This sets up the primary pass manager.
-  void populateModulePassManager(PassManagerBase &MPM);
-  void populateLTOPassManager(PassManagerBase &PM, TargetMachine *TM = nullptr);
+  void populateModulePassManager(legacy::PassManagerBase &MPM);
+  void populateLTOPassManager(legacy::PassManagerBase &PM);
 };
 
 /// Registers a function for adding a standard set of passes.  This should be
diff --git a/include/llvm/Transforms/InstCombine/InstCombine.h b/include/llvm/Transforms/InstCombine/InstCombine.h
new file mode 100644
index 0000000..f48ec13
--- /dev/null
+++ b/include/llvm/Transforms/InstCombine/InstCombine.h
@@ -0,0 +1,46 @@
+//===- InstCombine.h - InstCombine pass -------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file provides the primary interface to the instcombine pass. This pass
+/// is suitable for use in the new pass manager. For a pass that works with the
+/// legacy pass manager, please look for \c createInstructionCombiningPass() in
+/// Scalar.h.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINE_H
+#define LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINE_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+
+namespace llvm {
+
+class InstCombinePass {
+  InstCombineWorklist Worklist;
+
+public:
+  static StringRef name() { return "InstCombinePass"; }
+
+  // Explicitly define constructors for MSVC.
+  InstCombinePass() {}
+  InstCombinePass(InstCombinePass &&Arg) : Worklist(std::move(Arg.Worklist)) {}
+  InstCombinePass &operator=(InstCombinePass &&RHS) {
+    Worklist = std::move(RHS.Worklist);
+    return *this;
+  }
+
+  PreservedAnalyses run(Function &F, AnalysisManager<Function> *AM);
+};
+
+}
+
+#endif
diff --git a/include/llvm/Transforms/InstCombine/InstCombineWorklist.h b/include/llvm/Transforms/InstCombine/InstCombineWorklist.h
new file mode 100644
index 0000000..a6bad34
--- /dev/null
+++ b/include/llvm/Transforms/InstCombine/InstCombineWorklist.h
@@ -0,0 +1,116 @@
+//===- InstCombineWorklist.h - Worklist for InstCombine pass ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H
+#define LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "instcombine"
+
+namespace llvm {
+
+/// InstCombineWorklist - This is the worklist management logic for
+/// InstCombine.
+class InstCombineWorklist {
+  SmallVector<Instruction*, 256> Worklist;
+  DenseMap<Instruction*, unsigned> WorklistMap;
+
+  void operator=(const InstCombineWorklist&RHS) = delete;
+  InstCombineWorklist(const InstCombineWorklist&) = delete;
+public:
+  InstCombineWorklist() {}
+
+  InstCombineWorklist(InstCombineWorklist &&Arg)
+      : Worklist(std::move(Arg.Worklist)),
+        WorklistMap(std::move(Arg.WorklistMap)) {}
+  InstCombineWorklist &operator=(InstCombineWorklist &&RHS) {
+    Worklist = std::move(RHS.Worklist);
+    WorklistMap = std::move(RHS.WorklistMap);
+    return *this;
+  }
+
+  bool isEmpty() const { return Worklist.empty(); }
+
+  /// Add - Add the specified instruction to the worklist if it isn't already
+  /// in it.
+  void Add(Instruction *I) {
+    if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) {
+      DEBUG(dbgs() << "IC: ADD: " << *I << '\n');
+      Worklist.push_back(I);
+    }
+  }
+
+  void AddValue(Value *V) {
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      Add(I);
+  }
+
+  /// AddInitialGroup - Add the specified batch of stuff in reverse order.
+  /// which should only be done when the worklist is empty and when the group
+  /// has no duplicates.
+  void AddInitialGroup(Instruction *const *List, unsigned NumEntries) {
+    assert(Worklist.empty() && "Worklist must be empty to add initial group");
+    Worklist.reserve(NumEntries+16);
+    WorklistMap.resize(NumEntries);
+    DEBUG(dbgs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n");
+    for (unsigned Idx = 0; NumEntries; --NumEntries) {
+      Instruction *I = List[NumEntries-1];
+      WorklistMap.insert(std::make_pair(I, Idx++));
+      Worklist.push_back(I);
+    }
+  }
+
+  // Remove - remove I from the worklist if it exists.
+  void Remove(Instruction *I) {
+    DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I);
+    if (It == WorklistMap.end()) return; // Not in worklist.
+
+    // Don't bother moving everything down, just null out the slot.
+    Worklist[It->second] = nullptr;
+
+    WorklistMap.erase(It);
+  }
+
+  Instruction *RemoveOne() {
+    Instruction *I = Worklist.pop_back_val();
+    WorklistMap.erase(I);
+    return I;
+  }
+
+  /// AddUsersToWorkList - When an instruction is simplified, add all users of
+  /// the instruction to the work lists because they might get more simplified
+  /// now.
+  ///
+  void AddUsersToWorkList(Instruction &I) {
+    for (User *U : I.users())
+      Add(cast<Instruction>(U));
+  }
+
+
+  /// Zap - check that the worklist is empty and nuke the backing store for
+  /// the map if it is large.
+  void Zap() {
+    assert(WorklistMap.empty() && "Worklist empty, but map not?");
+
+    // Do an explicit clear, this shrinks the map if needed.
+    WorklistMap.clear();
+  }
+};
+
+} // end namespace llvm.
+
+#undef DEBUG_TYPE
+
+#endif
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index 87422df..8fac6ca 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -15,6 +15,7 @@
 #define LLVM_TRANSFORMS_INSTRUMENTATION_H
 
 #include "llvm/ADT/StringRef.h"
+#include <vector>
 
 #if defined(__GNUC__) && defined(__linux__) && !defined(ANDROID)
 inline void *getDFSanArgTLSPtrForJIT() {
@@ -63,6 +64,18 @@ struct GCOVOptions {
 ModulePass *createGCOVProfilerPass(const GCOVOptions &Options =
                                    GCOVOptions::getDefault());
 
+/// Options for the frontend instrumentation based profiling pass.
+struct InstrProfOptions {
+  InstrProfOptions() : NoRedZone(false) {}
+
+  // Add the 'noredzone' attribute to added runtime library calls.
+  bool NoRedZone;
+};
+
+/// Insert frontend instrumentation based profiling.
+ModulePass *createInstrProfilingPass(
+    const InstrProfOptions &Options = InstrProfOptions());
+
 // Insert AddressSanitizer (address sanity checking) instrumentation
 FunctionPass *createAddressSanitizerFunctionPass();
 ModulePass *createAddressSanitizerModulePass();
@@ -74,17 +87,17 @@ FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0);
 FunctionPass *createThreadSanitizerPass();
 
 // Insert DataFlowSanitizer (dynamic data flow analysis) instrumentation
-ModulePass *createDataFlowSanitizerPass(StringRef ABIListFile = StringRef(),
-                                        void *(*getArgTLS)() = nullptr,
-                                        void *(*getRetValTLS)() = nullptr);
+ModulePass *createDataFlowSanitizerPass(
+    const std::vector<std::string> &ABIListFiles = std::vector<std::string>(),
+    void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr);
 
 // Insert SanitizerCoverage instrumentation.
 ModulePass *createSanitizerCoverageModulePass(int CoverageLevel);
 
 #if defined(__GNUC__) && defined(__linux__) && !defined(ANDROID)
-inline ModulePass *createDataFlowSanitizerPassForJIT(StringRef ABIListFile =
-                                                         StringRef()) {
-  return createDataFlowSanitizerPass(ABIListFile, getDFSanArgTLSPtrForJIT,
+inline ModulePass *createDataFlowSanitizerPassForJIT(
+    const std::vector<std::string> &ABIListFiles = std::vector<std::string>()) {
+  return createDataFlowSanitizerPass(ABIListFiles, getDFSanArgTLSPtrForJIT,
                                      getDFSanRetValTLSPtrForJIT);
 }
 #endif
@@ -93,37 +106,6 @@ inline ModulePass *createDataFlowSanitizerPassForJIT(StringRef ABIListFile =
 // checking on loads, stores, and other memory intrinsics.
 FunctionPass *createBoundsCheckingPass();
 
-/// createDebugIRPass - Enable interactive stepping through LLVM IR in LLDB (or
-///                     GDB) and generate a file with the LLVM IR to be
-///                     displayed in the debugger.
-///
-/// Existing debug metadata is preserved (but may be modified) in order to allow
-/// accessing variables in the original source. The line table and file
-/// information is modified to correspond to the lines in the LLVM IR. If
-/// Filename and Directory are empty, a file name is generated based on existing
-/// debug information. If no debug information is available, a temporary file
-/// name is generated.
-///
-/// @param HideDebugIntrinsics  Omit debug intrinsics in emitted IR source file.
-/// @param HideDebugMetadata    Omit debug metadata in emitted IR source file.
-/// @param Directory            Embed this directory in the debug information.
-/// @param Filename             Embed this file name in the debug information.
-ModulePass *createDebugIRPass(bool HideDebugIntrinsics,
-                              bool HideDebugMetadata,
-                              StringRef Directory = StringRef(),
-                              StringRef Filename = StringRef());
-
-/// createDebugIRPass - Enable interactive stepping through LLVM IR in LLDB
-///                     (or GDB) with an existing IR file on disk. When creating
-///                     a DebugIR pass with this function, no source file is
-///                     output to disk and the existing one is unmodified. Debug
-///                     metadata in the Module is created/updated to point to
-///                     the existing textual IR file on disk.
-/// NOTE: If the IR file to be debugged is not on disk, use the version of this
-///       function with parameters in order to generate the file that will be
-///       seen by the debugger.
-ModulePass *createDebugIRPass();
-
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 5dcd899..558b81e 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -21,6 +21,7 @@ namespace llvm {
 
 class BasicBlockPass;
 class FunctionPass;
+class ModulePass;
 class Pass;
 class GetElementPtrInst;
 class PassInfo;
@@ -81,6 +82,13 @@ FunctionPass *createAggressiveDCEPass();
 
 //===----------------------------------------------------------------------===//
 //
+// BitTrackingDCE - This pass uses a bit-tracking DCE algorithm in order to
+// remove computations of dead bits.
+//
+FunctionPass *createBitTrackingDCEPass();
+
+//===----------------------------------------------------------------------===//
+//
 // SROA - Replace aggregates or pieces of aggregates with scalar SSA values.
 //
 FunctionPass *createSROAPass(bool RequiresDomTree = true);
@@ -98,6 +106,13 @@ FunctionPass *createScalarReplAggregatesPass(signed Threshold = -1,
 
 //===----------------------------------------------------------------------===//
 //
+// InductiveRangeCheckElimination - Transform loops to elide range checks on
+// linear functions of the induction variable.
+//
+Pass *createInductiveRangeCheckEliminationPass();
+
+//===----------------------------------------------------------------------===//
+//
 // InductionVariableSimplify - Transform induction variables in a program to all
 // use a single canonical induction variable per loop.
 //
@@ -130,7 +145,7 @@ Pass *createLICMPass();
 //
 Pass *createLoopStrengthReducePass();
 
-Pass *createGlobalMergePass(const TargetMachine *TM = nullptr);
+Pass *createGlobalMergePass(const TargetMachine *TM, unsigned MaximalOffset);
 
 //===----------------------------------------------------------------------===//
 //
@@ -405,6 +420,26 @@ createSeparateConstOffsetFromGEPPass(const TargetMachine *TM = nullptr,
 //
 BasicBlockPass *createLoadCombinePass();
 
+FunctionPass *createStraightLineStrengthReducePass();
+
+
+//===----------------------------------------------------------------------===//
+//
+// PlaceSafepoints - Rewrite any IR calls to gc.statepoints and insert any
+// safepoint polls (method entry, backedge) that might be required.  This pass
+// does not generate explicit relocation sequences - that's handled by
+// RewriteStatepointsForGC which can be run at an arbitrary point in the pass
+// order following this pass.
+//
+ModulePass *createPlaceSafepointsPass();
+
+//===----------------------------------------------------------------------===//
+//
+// RewriteStatepointsForGC - Rewrite any gc.statepoints which do not yet have
+// explicit relocations to include explicit relocations.
+//
+FunctionPass *createRewriteStatepointsForGCPass();
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/Scalar/EarlyCSE.h b/include/llvm/Transforms/Scalar/EarlyCSE.h
new file mode 100644
index 0000000..e3dd3c0
--- /dev/null
+++ b/include/llvm/Transforms/Scalar/EarlyCSE.h
@@ -0,0 +1,39 @@
+//===- EarlyCSE.h - Simple and fast CSE pass --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides the interface for a simple, fast CSE pass.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_EARLYCSE_H
+#define LLVM_TRANSFORMS_SCALAR_EARLYCSE_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSEPass {
+public:
+  static StringRef name() { return "EarlyCSEPass"; }
+
+  /// \brief Run the pass over the function.
+  PreservedAnalyses run(Function &F, AnalysisManager<Function> *AM);
+};
+
+}
+
+#endif
diff --git a/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h b/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h
new file mode 100644
index 0000000..4028320
--- /dev/null
+++ b/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h
@@ -0,0 +1,40 @@
+//===- LowerExpectIntrinsic.h - LowerExpectIntrinsic pass -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// The header file for the LowerExpectIntrinsic pass as used by the new pass
+/// manager.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_LOWEREXPECTINTRINSIC_H
+#define LLVM_TRANSFORMS_SCALAR_LOWEREXPECTINTRINSIC_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class LowerExpectIntrinsicPass {
+public:
+  static StringRef name() { return "LowerExpectIntrinsicPass"; }
+
+  /// \brief Run the pass over the function.
+  ///
+  /// This will lower all of th expect intrinsic calls in this function into
+  /// branch weight metadata. That metadata will subsequently feed the analysis
+  /// of the probabilities and frequencies of the CFG. After running this pass,
+  /// no more expect intrinsics remain, allowing the rest of the optimizer to
+  /// ignore them.
+  PreservedAnalyses run(Function &F);
+};
+
+}
+
+#endif
diff --git a/include/llvm/Transforms/Scalar/SimplifyCFG.h b/include/llvm/Transforms/Scalar/SimplifyCFG.h
new file mode 100644
index 0000000..ef28e0f
--- /dev/null
+++ b/include/llvm/Transforms/Scalar/SimplifyCFG.h
@@ -0,0 +1,46 @@
+//===- SimplifyCFG.h - Simplify and canonicalize the CFG --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides the interface for the pass responsible for both
+/// simplifying and canonicalizing the CFG.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_SIMPLIFYCFG_H
+#define LLVM_TRANSFORMS_SCALAR_SIMPLIFYCFG_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+/// \brief A pass to simplify and canonicalize the CFG of a function.
+///
+/// This pass iteratively simplifies the entire CFG of a function, removing
+/// unnecessary control flows and bringing it into the canonical form expected
+/// by the rest of the mid-level optimizer.
+class SimplifyCFGPass {
+  int BonusInstThreshold;
+
+public:
+  static StringRef name() { return "SimplifyCFGPass"; }
+
+  /// \brief Construct a pass with the default thresholds.
+  SimplifyCFGPass();
+
+  /// \brief Construct a pass with a specific bonus threshold.
+  SimplifyCFGPass(int BonusInstThreshold);
+
+  /// \brief Run the pass over the function.
+  PreservedAnalyses run(Function &F, AnalysisManager<Function> *AM);
+};
+
+}
+
+#endif
diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 19acf5b..710db03 100644
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -23,10 +23,11 @@
 namespace llvm {
 
 class AliasAnalysis;
+class MemoryDependenceAnalysis;
 class DominatorTree;
+class LoopInfo;
 class Instruction;
 class MDNode;
-class Pass;
 class ReturnInst;
 class TargetLibraryInfo;
 class TerminatorInst;
@@ -39,7 +40,8 @@ void DeleteDeadBlock(BasicBlock *BB);
 /// any single-entry PHI nodes in it, fold them away.  This handles the case
 /// when all entries to the PHI nodes in a block are guaranteed equal, such as
 /// when the block has exactly one predecessor.
-void FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P = nullptr);
+void FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA = nullptr,
+                             MemoryDependenceAnalysis *MemDep = nullptr);
 
 /// DeleteDeadPHIs - Examine each PHI in the given block and delete it if it
 /// is dead. Also recursively delete any operands that become dead as
@@ -50,7 +52,10 @@ bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI = nullptr);
 
 /// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor,
 /// if possible.  The return value indicates success or failure.
-bool MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P = nullptr);
+bool MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT = nullptr,
+                               LoopInfo *LI = nullptr,
+                               AliasAnalysis *AA = nullptr,
+                               MemoryDependenceAnalysis *MemDep = nullptr);
 
 // ReplaceInstWithValue - Replace all uses of an instruction (specified by BI)
 // with a value, then remove and delete the original instruction.
@@ -70,18 +75,62 @@ void ReplaceInstWithInst(BasicBlock::InstListType &BIL,
 //
 void ReplaceInstWithInst(Instruction *From, Instruction *To);
 
+/// \brief Option class for critical edge splitting.
+///
+/// This provides a builder interface for overriding the default options used
+/// during critical edge splitting.
+struct CriticalEdgeSplittingOptions {
+  AliasAnalysis *AA;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  bool MergeIdenticalEdges;
+  bool DontDeleteUselessPHIs;
+  bool PreserveLCSSA;
+
+  CriticalEdgeSplittingOptions()
+      : AA(nullptr), DT(nullptr), LI(nullptr), MergeIdenticalEdges(false),
+        DontDeleteUselessPHIs(false), PreserveLCSSA(false) {}
+
+  /// \brief Basic case of setting up all the analysis.
+  CriticalEdgeSplittingOptions(AliasAnalysis *AA, DominatorTree *DT = nullptr,
+                               LoopInfo *LI = nullptr)
+      : AA(AA), DT(DT), LI(LI), MergeIdenticalEdges(false),
+        DontDeleteUselessPHIs(false), PreserveLCSSA(false) {}
+
+  /// \brief A common pattern is to preserve the dominator tree and loop
+  /// info but not care about AA.
+  CriticalEdgeSplittingOptions(DominatorTree *DT, LoopInfo *LI)
+      : AA(nullptr), DT(DT), LI(LI), MergeIdenticalEdges(false),
+        DontDeleteUselessPHIs(false), PreserveLCSSA(false) {}
+
+  CriticalEdgeSplittingOptions &setMergeIdenticalEdges() {
+    MergeIdenticalEdges = true;
+    return *this;
+  }
+
+  CriticalEdgeSplittingOptions &setDontDeleteUselessPHIs() {
+    DontDeleteUselessPHIs = true;
+    return *this;
+  }
+
+  CriticalEdgeSplittingOptions &setPreserveLCSSA() {
+    PreserveLCSSA = true;
+    return *this;
+  }
+};
+
 /// SplitCriticalEdge - If this edge is a critical edge, insert a new node to
-/// split the critical edge.  This will update DominatorTree and
-/// DominatorFrontier information if it is available, thus calling this pass
-/// will not invalidate either of them. This returns the new block if the edge
-/// was split, null otherwise.
+/// split the critical edge.  This will update the analyses passed in through
+/// the option struct. This returns the new block if the edge was split, null
+/// otherwise.
 ///
-/// If MergeIdenticalEdges is true (not the default), *all* edges from TI to the
-/// specified successor will be merged into the same critical edge block.
-/// This is most commonly interesting with switch instructions, which may
-/// have many edges to any one destination.  This ensures that all edges to that
-/// dest go to one block instead of each going to a different block, but isn't
-/// the standard definition of a "critical edge".
+/// If MergeIdenticalEdges in the options struct is true (not the default),
+/// *all* edges from TI to the specified successor will be merged into the same
+/// critical edge block. This is most commonly interesting with switch
+/// instructions, which may have many edges to any one destination.  This
+/// ensures that all edges to that dest go to one block instead of each going
+/// to a different block, but isn't the standard definition of a "critical
+/// edge".
 ///
 /// It is invalid to call this function on a critical edge that starts at an
 /// IndirectBrInst.  Splitting these edges will almost always create an invalid
@@ -89,14 +138,15 @@ void ReplaceInstWithInst(Instruction *From, Instruction *To);
 /// to.
 ///
 BasicBlock *SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
-                              Pass *P = nullptr,
-                              bool MergeIdenticalEdges = false,
-                              bool DontDeleteUselessPHIs = false,
-                              bool SplitLandingPads = false);
-
-inline BasicBlock *SplitCriticalEdge(BasicBlock *BB, succ_iterator SI,
-                                     Pass *P = nullptr) {
-  return SplitCriticalEdge(BB->getTerminator(), SI.getSuccessorIndex(), P);
+                              const CriticalEdgeSplittingOptions &Options =
+                                  CriticalEdgeSplittingOptions());
+
+inline BasicBlock *
+SplitCriticalEdge(BasicBlock *BB, succ_iterator SI,
+                  const CriticalEdgeSplittingOptions &Options =
+                      CriticalEdgeSplittingOptions()) {
+  return SplitCriticalEdge(BB->getTerminator(), SI.getSuccessorIndex(),
+                           Options);
 }
 
 /// SplitCriticalEdge - If the edge from *PI to BB is not critical, return
@@ -105,55 +155,62 @@ inline BasicBlock *SplitCriticalEdge(BasicBlock *BB, succ_iterator SI,
 /// function.  If P is specified, it updates the analyses
 /// described above.
 inline bool SplitCriticalEdge(BasicBlock *Succ, pred_iterator PI,
-                              Pass *P = nullptr) {
+                              const CriticalEdgeSplittingOptions &Options =
+                                  CriticalEdgeSplittingOptions()) {
   bool MadeChange = false;
   TerminatorInst *TI = (*PI)->getTerminator();
   for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
     if (TI->getSuccessor(i) == Succ)
-      MadeChange |= !!SplitCriticalEdge(TI, i, P);
+      MadeChange |= !!SplitCriticalEdge(TI, i, Options);
   return MadeChange;
 }
 
 /// SplitCriticalEdge - If an edge from Src to Dst is critical, split the edge
 /// and return true, otherwise return false.  This method requires that there be
-/// an edge between the two blocks.  If P is specified, it updates the analyses
-/// described above.
-inline BasicBlock *SplitCriticalEdge(BasicBlock *Src, BasicBlock *Dst,
-                                     Pass *P = nullptr,
-                                     bool MergeIdenticalEdges = false,
-                                     bool DontDeleteUselessPHIs = false) {
+/// an edge between the two blocks.  It updates the analyses
+/// passed in the options struct
+inline BasicBlock *
+SplitCriticalEdge(BasicBlock *Src, BasicBlock *Dst,
+                  const CriticalEdgeSplittingOptions &Options =
+                      CriticalEdgeSplittingOptions()) {
   TerminatorInst *TI = Src->getTerminator();
   unsigned i = 0;
   while (1) {
     assert(i != TI->getNumSuccessors() && "Edge doesn't exist!");
     if (TI->getSuccessor(i) == Dst)
-      return SplitCriticalEdge(TI, i, P, MergeIdenticalEdges,
-                               DontDeleteUselessPHIs);
+      return SplitCriticalEdge(TI, i, Options);
     ++i;
   }
 }
 
 // SplitAllCriticalEdges - Loop over all of the edges in the CFG,
-// breaking critical edges as they are found. Pass P must not be NULL.
+// breaking critical edges as they are found.
 // Returns the number of broken edges.
-unsigned SplitAllCriticalEdges(Function &F, Pass *P);
+unsigned SplitAllCriticalEdges(Function &F,
+                               const CriticalEdgeSplittingOptions &Options =
+                                   CriticalEdgeSplittingOptions());
 
-/// SplitEdge -  Split the edge connecting specified block. Pass P must
-/// not be NULL.
-BasicBlock *SplitEdge(BasicBlock *From, BasicBlock *To, Pass *P);
+/// SplitEdge -  Split the edge connecting specified block.
+BasicBlock *SplitEdge(BasicBlock *From, BasicBlock *To,
+                      DominatorTree *DT = nullptr, LoopInfo *LI = nullptr);
 
 /// SplitBlock - Split the specified block at the specified instruction - every
 /// thing before SplitPt stays in Old and everything starting with SplitPt moves
 /// to a new block.  The two blocks are joined by an unconditional branch and
 /// the loop info is updated.
 ///
-BasicBlock *SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P);
+BasicBlock *SplitBlock(BasicBlock *Old, Instruction *SplitPt,
+                       DominatorTree *DT = nullptr, LoopInfo *LI = nullptr);
 
-/// SplitBlockPredecessors - This method transforms BB by introducing a new
-/// basic block into the function, and moving some of the predecessors of BB to
-/// be predecessors of the new block.  The new predecessors are indicated by the
-/// Preds array, which has NumPreds elements in it.  The new block is given a
-/// suffix of 'Suffix'.  This function returns the new block.
+/// SplitBlockPredecessors - This method introduces at least one new basic block
+/// into the function and moves some of the predecessors of BB to be
+/// predecessors of the new block. The new predecessors are indicated by the
+/// Preds array. The new block is given a suffix of 'Suffix'. Returns new basic
+/// block to which predecessors from Preds are now pointing.
+///
+/// If BB is a landingpad block then additional basicblock might be introduced.
+/// It will have Suffix+".split_lp". See SplitLandingPadPredecessors for more
+/// details on this case.
 ///
 /// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
 /// DominanceFrontier, LoopInfo, and LCCSA but no other analyses.
@@ -161,8 +218,12 @@ BasicBlock *SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P);
 /// complicated to handle the case where one of the edges being split
 /// is an exit of a loop with other exits).
 ///
-BasicBlock *SplitBlockPredecessors(BasicBlock *BB, ArrayRef<BasicBlock*> Preds,
-                                   const char *Suffix, Pass *P = nullptr);
+BasicBlock *SplitBlockPredecessors(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
+                                   const char *Suffix,
+                                   AliasAnalysis *AA = nullptr,
+                                   DominatorTree *DT = nullptr,
+                                   LoopInfo *LI = nullptr,
+                                   bool PreserveLCSSA = false);
 
 /// SplitLandingPadPredecessors - This method transforms the landing pad,
 /// OrigBB, by introducing two new basic blocks into the function. One of those
@@ -177,9 +238,14 @@ BasicBlock *SplitBlockPredecessors(BasicBlock *BB, ArrayRef<BasicBlock*> Preds,
 /// case where one of the edges being split is an exit of a loop with other
 /// exits).
 ///
-void SplitLandingPadPredecessors(BasicBlock *OrigBB,ArrayRef<BasicBlock*> Preds,
+void SplitLandingPadPredecessors(BasicBlock *OrigBB,
+                                 ArrayRef<BasicBlock *> Preds,
                                  const char *Suffix, const char *Suffix2,
-                                 Pass *P, SmallVectorImpl<BasicBlock*> &NewBBs);
+                                 SmallVectorImpl<BasicBlock *> &NewBBs,
+                                 AliasAnalysis *AA = nullptr,
+                                 DominatorTree *DT = nullptr,
+                                 LoopInfo *LI = nullptr,
+                                 bool PreserveLCSSA = false);
 
 /// FoldReturnIntoUncondBranch - This method duplicates the specified return
 /// instruction into a predecessor which ends in an unconditional branch. If
diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h
index 1e407fb..6387c16 100644
--- a/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -115,20 +115,6 @@ namespace llvm {
   /// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
   Value *EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
                     const DataLayout *TD, const TargetLibraryInfo *TLI);
-
-  /// SimplifyFortifiedLibCalls - Helper class for folding checked library
-  /// calls (e.g. __strcpy_chk) into their unchecked counterparts.
-  class SimplifyFortifiedLibCalls {
-  protected:
-    CallInst *CI;
-    virtual void replaceCall(Value *With) = 0;
-    virtual bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp,
-                            bool isString) const = 0;
-
-  public:
-    virtual ~SimplifyFortifiedLibCalls();
-    bool fold(CallInst *CI, const DataLayout *TD, const TargetLibraryInfo *TLI);
-  };
 }
 
 #endif
diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index 740d725..d1563ef 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h
@@ -44,7 +44,7 @@ class Loop;
 class LoopInfo;
 class AllocaInst;
 class AliasAnalysis;
-class AssumptionTracker;
+class AssumptionCacheTracker;
 
 /// CloneModule - Return an exact copy of the specified module
 ///
@@ -95,8 +95,7 @@ struct ClonedCodeInfo {
 /// function, you can specify a ClonedCodeInfo object with the optional fifth
 /// parameter.
 ///
-BasicBlock *CloneBasicBlock(const BasicBlock *BB,
-                            ValueToValueMapTy &VMap,
+BasicBlock *CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
                             const Twine &NameSuffix = "", Function *F = nullptr,
                             ClonedCodeInfo *CodeInfo = nullptr);
 
@@ -112,8 +111,7 @@ BasicBlock *CloneBasicBlock(const BasicBlock *BB,
 /// If ModuleLevelChanges is false, VMap contains no non-identity GlobalValue
 /// mappings, and debug info metadata will not be cloned.
 ///
-Function *CloneFunction(const Function *F,
-                        ValueToValueMapTy &VMap,
+Function *CloneFunction(const Function *F, ValueToValueMapTy &VMap,
                         bool ModuleLevelChanges,
                         ClonedCodeInfo *CodeInfo = nullptr);
 
@@ -127,14 +125,49 @@ Function *CloneFunction(const Function *F,
 /// mappings.
 ///
 void CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
-                       ValueToValueMapTy &VMap,
-                       bool ModuleLevelChanges,
+                       ValueToValueMapTy &VMap, bool ModuleLevelChanges,
                        SmallVectorImpl<ReturnInst*> &Returns,
                        const char *NameSuffix = "",
                        ClonedCodeInfo *CodeInfo = nullptr,
                        ValueMapTypeRemapper *TypeMapper = nullptr,
                        ValueMaterializer *Materializer = nullptr);
 
+/// A helper class used with CloneAndPruneIntoFromInst to change the default
+/// behavior while instructions are being cloned.
+class CloningDirector {
+public:
+  /// This enumeration describes the way CloneAndPruneIntoFromInst should
+  /// proceed after the CloningDirector has examined an instruction.
+  enum CloningAction {
+    ///< Continue cloning the instruction (default behavior).
+    CloneInstruction,
+    ///< Skip this instruction but continue cloning the current basic block.
+    SkipInstruction,
+    ///< Skip this instruction and stop cloning the current basic block.
+    StopCloningBB
+  };
+
+  virtual ~CloningDirector() {}
+
+  /// Subclasses must override this function to customize cloning behavior.
+  virtual CloningAction handleInstruction(ValueToValueMapTy &VMap,
+                                          const Instruction *Inst,
+                                          BasicBlock *NewBB) = 0;
+
+  virtual ValueMapTypeRemapper *getTypeRemapper() { return nullptr; }
+  virtual ValueMaterializer *getValueMaterializer() { return nullptr; }
+};
+
+void CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
+                               const Instruction *StartingInst,
+                               ValueToValueMapTy &VMap, bool ModuleLevelChanges,
+                               SmallVectorImpl<ReturnInst*> &Returns,
+                               const char *NameSuffix = "", 
+                               ClonedCodeInfo *CodeInfo = nullptr,
+                               const DataLayout *DL = nullptr,
+                               CloningDirector *Director = nullptr);
+
+
 /// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
 /// except that it does some simple constant prop and DCE on the fly.  The
 /// effect of this is to copy significantly less code in cases where (for
@@ -147,8 +180,7 @@ void CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
 /// mappings.
 ///
 void CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
-                               ValueToValueMapTy &VMap,
-                               bool ModuleLevelChanges,
+                               ValueToValueMapTy &VMap, bool ModuleLevelChanges,
                                SmallVectorImpl<ReturnInst*> &Returns,
                                const char *NameSuffix = "",
                                ClonedCodeInfo *CodeInfo = nullptr,
@@ -162,19 +194,19 @@ public:
   explicit InlineFunctionInfo(CallGraph *cg = nullptr,
                               const DataLayout *DL = nullptr,
                               AliasAnalysis *AA = nullptr,
-                              AssumptionTracker *AT = nullptr)
-    : CG(cg), DL(DL), AA(AA), AT(AT) {}
+                              AssumptionCacheTracker *ACT = nullptr)
+      : CG(cg), DL(DL), AA(AA), ACT(ACT) {}
 
   /// CG - If non-null, InlineFunction will update the callgraph to reflect the
   /// changes it makes.
   CallGraph *CG;
   const DataLayout *DL;
   AliasAnalysis *AA;
-  AssumptionTracker *AT;
+  AssumptionCacheTracker *ACT;
 
   /// StaticAllocas - InlineFunction fills this in with all static allocas that
   /// get copied into the caller.
-  SmallVector<AllocaInst*, 4> StaticAllocas;
+  SmallVector<AllocaInst *, 4> StaticAllocas;
 
   /// InlinedCalls - InlineFunction fills this in with callsites that were
   /// inlined from the callee.  This is only filled in if CG is non-null.
@@ -196,9 +228,12 @@ public:
 /// exists in the instruction stream.  Similarly this will inline a recursive
 /// function by one level.
 ///
-bool InlineFunction(CallInst *C, InlineFunctionInfo &IFI, bool InsertLifetime = true);
-bool InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, bool InsertLifetime = true);
-bool InlineFunction(CallSite CS, InlineFunctionInfo &IFI, bool InsertLifetime = true);
+bool InlineFunction(CallInst *C, InlineFunctionInfo &IFI,
+                    bool InsertLifetime = true);
+bool InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI,
+                    bool InsertLifetime = true);
+bool InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
+                    bool InsertLifetime = true);
 
 } // End llvm namespace
 
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index e89e5e5..463ab96 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -31,10 +31,9 @@ class DbgDeclareInst;
 class StoreInst;
 class LoadInst;
 class Value;
-class Pass;
 class PHINode;
 class AllocaInst;
-class AssumptionTracker;
+class AssumptionCache;
 class ConstantExpr;
 class DataLayout;
 class TargetLibraryInfo;
@@ -115,7 +114,7 @@ void RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
 /// between them, moving the instructions in the predecessor into BB.  This
 /// deletes the predecessor block.
 ///
-void MergeBasicBlockIntoOnlyPred(BasicBlock *BB, Pass *P = nullptr);
+void MergeBasicBlockIntoOnlyPred(BasicBlock *BB, DominatorTree *DT = nullptr);
 
 /// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an
 /// unconditional branch, and contains no instructions other than PHI nodes,
@@ -138,9 +137,8 @@ bool EliminateDuplicatePHINodes(BasicBlock *BB);
 /// the basic block that was pointed to.
 ///
 bool SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
-                 unsigned BonusInstThreshold,
-                 const DataLayout *TD = nullptr,
-                 AssumptionTracker *AT = nullptr);
+                 unsigned BonusInstThreshold, const DataLayout *TD = nullptr,
+                 AssumptionCache *AC = nullptr);
 
 /// FlatternCFG - This function is used to flatten a CFG.  For
 /// example, it uses parallel-and and parallel-or mode to collapse
@@ -176,17 +174,17 @@ AllocaInst *DemotePHIToStack(PHINode *P, Instruction *AllocaPoint = nullptr);
 /// increase the alignment of the ultimate object, making this check succeed.
 unsigned getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
                                     const DataLayout *TD = nullptr,
-                                    AssumptionTracker *AT = nullptr,
+                                    AssumptionCache *AC = nullptr,
                                     const Instruction *CxtI = nullptr,
                                     const DominatorTree *DT = nullptr);
 
 /// getKnownAlignment - Try to infer an alignment for the specified pointer.
 static inline unsigned getKnownAlignment(Value *V,
                                          const DataLayout *TD = nullptr,
-                                         AssumptionTracker *AT = nullptr,
+                                         AssumptionCache *AC = nullptr,
                                          const Instruction *CxtI = nullptr,
                                          const DominatorTree *DT = nullptr) {
-  return getOrEnforceKnownAlignment(V, 0, TD, AT, CxtI, DT);
+  return getOrEnforceKnownAlignment(V, 0, TD, AC, CxtI, DT);
 }
 
 /// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the
@@ -276,10 +274,11 @@ bool LowerDbgDeclare(Function &F);
 /// an alloca, if any.
 DbgDeclareInst *FindAllocaDbgDeclare(Value *V);
 
-/// replaceDbgDeclareForAlloca - Replaces llvm.dbg.declare instruction when
-/// alloca is replaced with a new value.
+/// \brief Replaces llvm.dbg.declare instruction when an alloca is replaced with
+/// a new value.  If Deref is true, tan additional DW_OP_deref is prepended to
+/// the expression.
 bool replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
-                                DIBuilder &Builder);
+                                DIBuilder &Builder, bool Deref);
 
 /// \brief Remove all blocks that can not be reached from the function's entry.
 ///
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index fdae80d..bb80f20 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -14,16 +14,33 @@
 #ifndef LLVM_TRANSFORMS_UTILS_LOOPUTILS_H
 #define LLVM_TRANSFORMS_UTILS_LOOPUTILS_H
 
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Dominators.h"
+
 namespace llvm {
 class AliasAnalysis;
-class AssumptionTracker;
+class AliasSet;
+class AliasSetTracker;
+class AssumptionCache;
 class BasicBlock;
 class DataLayout;
 class DominatorTree;
 class Loop;
 class LoopInfo;
 class Pass;
+class PredIteratorCache;
 class ScalarEvolution;
+class TargetLibraryInfo;
+
+/// \brief Captures loop safety information.
+/// It keep information for loop & its header may throw exception.
+struct LICMSafetyInfo {
+  bool MayThrow;           // The current loop contains an instruction which
+                           // may throw.
+  bool HeaderMayThrow;     // Same as previous, but specific to loop header
+  LICMSafetyInfo() : MayThrow(false), HeaderMayThrow(false)
+  {}
+};
 
 BasicBlock *InsertPreheaderForLoop(Loop *L, Pass *P);
 
@@ -36,7 +53,7 @@ BasicBlock *InsertPreheaderForLoop(Loop *L, Pass *P);
 bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
                   AliasAnalysis *AA = nullptr, ScalarEvolution *SE = nullptr,
                   const DataLayout *DL = nullptr,
-                  AssumptionTracker *AT = nullptr);
+                  AssumptionCache *AC = nullptr);
 
 /// \brief Put loop into LCSSA form.
 ///
@@ -49,7 +66,8 @@ bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
 /// If ScalarEvolution is passed in, it will be preserved.
 ///
 /// Returns true if any modifications are made to the loop.
-bool formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE = nullptr);
+bool formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,
+               ScalarEvolution *SE = nullptr);
 
 /// \brief Put a loop nest into LCSSA form.
 ///
@@ -60,8 +78,51 @@ bool formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE = nullptr);
 /// If ScalarEvolution is passed in, it will be preserved.
 ///
 /// Returns true if any modifications are made to the loop.
-bool formLCSSARecursively(Loop &L, DominatorTree &DT,
+bool formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
                           ScalarEvolution *SE = nullptr);
+
+/// \brief Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in
+/// reverse depth first order w.r.t the DominatorTree. This allows us to visit
+/// uses before definitions, allowing us to sink a loop body in one pass without
+/// iteration. Takes DomTreeNode, AliasAnalysis, LoopInfo, DominatorTree, 
+/// DataLayout, TargetLibraryInfo, Loop, AliasSet information for all 
+/// instructions of the loop and loop safety information as arguments. 
+/// It returns changed status. 
+bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
+                const DataLayout *, TargetLibraryInfo *, Loop *,
+                AliasSetTracker *, LICMSafetyInfo *);
+
+/// \brief Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in depth
+/// first order w.r.t the DominatorTree.  This allows us to visit definitions
+/// before uses, allowing us to hoist a loop body in one pass without iteration.
+/// Takes DomTreeNode, AliasAnalysis, LoopInfo, DominatorTree, DataLayout,
+/// TargetLibraryInfo, Loop, AliasSet information for all instructions of the 
+/// loop and loop safety information as arguments. It returns changed status.
+bool hoistRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
+                 const DataLayout *, TargetLibraryInfo *, Loop *,
+                 AliasSetTracker *, LICMSafetyInfo *);
+
+/// \brief Try to promote memory values to scalars by sinking stores out of 
+/// the loop and moving loads to before the loop.  We do this by looping over
+/// the stores in the loop, looking for stores to Must pointers which are 
+/// loop invariant. It takes AliasSet, Loop exit blocks vector, loop exit blocks
+/// insertion point vector, PredIteratorCache, LoopInfo, DominatorTree, Loop,
+/// AliasSet information for all instructions of the loop and loop safety 
+/// information as arguments. It returns changed status.
+bool promoteLoopAccessesToScalars(AliasSet &, SmallVectorImpl<BasicBlock*> &,
+                                  SmallVectorImpl<Instruction*> &,
+                                  PredIteratorCache &, LoopInfo *,
+                                  DominatorTree *, Loop *, AliasSetTracker *,
+                                  LICMSafetyInfo *);
+
+/// \brief Computes safety information for a loop
+/// checks loop body & header for the possiblity of may throw
+/// exception, it takes LICMSafetyInfo and loop as argument.
+/// Updates safety information in LICMSafetyInfo argument.
+void computeLICMSafetyInfo(LICMSafetyInfo *, Loop *);
+
 }
 
 #endif
diff --git a/include/llvm/Transforms/Utils/PromoteMemToReg.h b/include/llvm/Transforms/Utils/PromoteMemToReg.h
index 3fdd5e9..d0602bf 100644
--- a/include/llvm/Transforms/Utils/PromoteMemToReg.h
+++ b/include/llvm/Transforms/Utils/PromoteMemToReg.h
@@ -22,7 +22,7 @@ namespace llvm {
 class AllocaInst;
 class DominatorTree;
 class AliasSetTracker;
-class AssumptionTracker;
+class AssumptionCache;
 
 /// \brief Return true if this alloca is legal for promotion.
 ///
@@ -43,7 +43,7 @@ bool isAllocaPromotable(const AllocaInst *AI);
 /// made to the IR.
 void PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
                      AliasSetTracker *AST = nullptr,
-                     AssumptionTracker *AT = nullptr);
+                     AssumptionCache *AC = nullptr);
 
 } // End llvm namespace
 
diff --git a/include/llvm/Transforms/Utils/SSAUpdater.h b/include/llvm/Transforms/Utils/SSAUpdater.h
index 7874a5f..19e2a4a 100644
--- a/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -118,8 +118,8 @@ public:
 private:
   Value *GetValueAtEndOfBlockInternal(BasicBlock *BB);
 
-  void operator=(const SSAUpdater&) LLVM_DELETED_FUNCTION;
-  SSAUpdater(const SSAUpdater&) LLVM_DELETED_FUNCTION;
+  void operator=(const SSAUpdater&) = delete;
+  SSAUpdater(const SSAUpdater&) = delete;
 };
 
 /// \brief Helper class for promoting a collection of loads and stores into SSA
diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index 6765ac1..08358e1 100644
--- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -16,6 +16,7 @@
 #define LLVM_TRANSFORMS_UTILS_SIMPLIFYLIBCALLS_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/IRBuilder.h"
 
 namespace llvm {
@@ -27,20 +28,67 @@ class TargetLibraryInfo;
 class BasicBlock;
 class Function;
 
+/// \brief This class implements simplifications for calls to fortified library
+/// functions (__st*cpy_chk, __memcpy_chk, __memmove_chk, __memset_chk), to,
+/// when possible, replace them with their non-checking counterparts.
+/// Other optimizations can also be done, but it's possible to disable them and
+/// only simplify needless use of the checking versions (when the object size
+/// is unknown) by passing true for OnlyLowerUnknownSize.
+class FortifiedLibCallSimplifier {
+private:
+  const DataLayout *DL;
+  const TargetLibraryInfo *TLI;
+  bool OnlyLowerUnknownSize;
+
+public:
+  FortifiedLibCallSimplifier(const DataLayout *DL, const TargetLibraryInfo *TLI,
+                             bool OnlyLowerUnknownSize = false);
+
+  /// \brief Take the given call instruction and return a more
+  /// optimal value to replace the instruction with or 0 if a more
+  /// optimal form can't be found.
+  /// The call must not be an indirect call.
+  Value *optimizeCall(CallInst *CI);
+
+private:
+  Value *optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeMemSetChk(CallInst *CI, IRBuilder<> &B);
+
+  // Str/Stp cpy are similar enough to be handled in the same functions.
+  Value *optimizeStrpCpyChk(CallInst *CI, IRBuilder<> &B, LibFunc::Func Func);
+  Value *optimizeStrpNCpyChk(CallInst *CI, IRBuilder<> &B, LibFunc::Func Func);
+
+  /// \brief Checks whether the call \p CI to a fortified libcall is foldable
+  /// to the non-fortified version.
+  bool isFortifiedCallFoldable(CallInst *CI, unsigned ObjSizeOp,
+                               unsigned SizeOp, bool isString);
+};
+
 /// LibCallSimplifier - This class implements a collection of optimizations
 /// that replace well formed calls to library functions with a more optimal
 /// form.  For example, replacing 'printf("Hello!")' with 'puts("Hello!")'.
 class LibCallSimplifier {
 private:
+  FortifiedLibCallSimplifier FortifiedSimplifier;
   const DataLayout *DL;
   const TargetLibraryInfo *TLI;
   bool UnsafeFPShrink;
+  function_ref<void(Instruction *, Value *)> Replacer;
+
+  /// \brief Internal wrapper for RAUW that is the default implementation.
+  ///
+  /// Other users may provide an alternate function with this signature instead
+  /// of this one.
+  static void replaceAllUsesWithDefault(Instruction *I, Value *With);
 
-protected:
-  ~LibCallSimplifier() {}
+  /// \brief Replace an instruction's uses with a value using our replacer.
+  void replaceAllUsesWith(Instruction *I, Value *With);
 
 public:
-  LibCallSimplifier(const DataLayout *TD, const TargetLibraryInfo *TLI);
+  LibCallSimplifier(const DataLayout *TD, const TargetLibraryInfo *TLI,
+                    function_ref<void(Instruction *, Value *)> Replacer =
+                        &replaceAllUsesWithDefault);
 
   /// optimizeCall - Take the given call instruction and return a more
   /// optimal value to replace the instruction with or 0 if a more
@@ -48,22 +96,10 @@ public:
   /// be equal to the instruction being optimized.  In this case all
   /// other instructions that use the given instruction were modified
   /// and the given instruction is dead.
+  /// The call must not be an indirect call.
   Value *optimizeCall(CallInst *CI);
 
-  /// replaceAllUsesWith - This method is used when the library call
-  /// simplifier needs to replace instructions other than the library
-  /// call being modified.
-  virtual void replaceAllUsesWith(Instruction *I, Value *With) const;
-
 private:
-  // Fortified Library Call Optimizations
-  Value *optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B);
-  Value *optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B);
-  Value *optimizeMemSetChk(CallInst *CI, IRBuilder<> &B);
-  Value *optimizeStrCpyChk(CallInst *CI, IRBuilder<> &B);
-  Value *optimizeStpCpyChk(CallInst *CI, IRBuilder<> &B);
-  Value *optimizeStrNCpyChk(CallInst *CI, IRBuilder<> &B);
-
   // String and Memory Library Call Optimizations
   Value *optimizeStrCat(CallInst *CI, IRBuilder<> &B);
   Value *optimizeStrNCat(CallInst *CI, IRBuilder<> &B);
@@ -84,6 +120,8 @@ private:
   Value *optimizeMemCpy(CallInst *CI, IRBuilder<> &B);
   Value *optimizeMemMove(CallInst *CI, IRBuilder<> &B);
   Value *optimizeMemSet(CallInst *CI, IRBuilder<> &B);
+  // Wrapper for all String/Memory Library Call Optimizations
+  Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B);
 
   // Math Library Optimizations
   Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, bool CheckRetType);
diff --git a/include/llvm/Transforms/Utils/SymbolRewriter.h b/include/llvm/Transforms/Utils/SymbolRewriter.h
index af79372..48dea04 100644
--- a/include/llvm/Transforms/Utils/SymbolRewriter.h
+++ b/include/llvm/Transforms/Utils/SymbolRewriter.h
@@ -60,10 +60,10 @@ namespace SymbolRewriter {
 /// select the symbols to rewrite.  This descriptor list is passed to the
 /// SymbolRewriter pass.
 class RewriteDescriptor : public ilist_node<RewriteDescriptor> {
-  RewriteDescriptor(const RewriteDescriptor &) LLVM_DELETED_FUNCTION;
+  RewriteDescriptor(const RewriteDescriptor &) = delete;
 
   const RewriteDescriptor &
-  operator=(const RewriteDescriptor &) LLVM_DELETED_FUNCTION;
+  operator=(const RewriteDescriptor &) = delete;
 
 public:
   enum class Type {
diff --git a/include/llvm/Transforms/Utils/UnrollLoop.h b/include/llvm/Transforms/Utils/UnrollLoop.h
index 0b88d25..04d9ee1 100644
--- a/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -16,20 +16,25 @@
 #ifndef LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H
 #define LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H
 
+#include "llvm/ADT/StringRef.h"
+
 namespace llvm {
 
-class AssumptionTracker;
+class AssumptionCache;
 class Loop;
 class LoopInfo;
 class LPPassManager;
+class MDNode;
 class Pass;
 
 bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime,
                 unsigned TripMultiple, LoopInfo *LI, Pass *PP,
-                LPPassManager *LPM, AssumptionTracker *AT);
+                LPPassManager *LPM, AssumptionCache *AC);
 
 bool UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
                              LPPassManager* LPM);
+
+MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
 }
 
 #endif
diff --git a/include/llvm/Transforms/Utils/ValueMapper.h b/include/llvm/Transforms/Utils/ValueMapper.h
index 5774763..047ab81 100644
--- a/include/llvm/Transforms/Utils/ValueMapper.h
+++ b/include/llvm/Transforms/Utils/ValueMapper.h
@@ -71,20 +71,23 @@ namespace llvm {
                   ValueMapTypeRemapper *TypeMapper = nullptr,
                   ValueMaterializer *Materializer = nullptr);
 
+  Metadata *MapMetadata(const Metadata *MD, ValueToValueMapTy &VM,
+                        RemapFlags Flags = RF_None,
+                        ValueMapTypeRemapper *TypeMapper = nullptr,
+                        ValueMaterializer *Materializer = nullptr);
+
+  /// MapMetadata - provide versions that preserve type safety for MDNodes.
+  MDNode *MapMetadata(const MDNode *MD, ValueToValueMapTy &VM,
+                      RemapFlags Flags = RF_None,
+                      ValueMapTypeRemapper *TypeMapper = nullptr,
+                      ValueMaterializer *Materializer = nullptr);
+
   void RemapInstruction(Instruction *I, ValueToValueMapTy &VM,
                         RemapFlags Flags = RF_None,
                         ValueMapTypeRemapper *TypeMapper = nullptr,
                         ValueMaterializer *Materializer = nullptr);
 
-  /// MapValue - provide versions that preserve type safety for MDNode and
-  /// Constants.
-  inline MDNode *MapValue(const MDNode *V, ValueToValueMapTy &VM,
-                          RemapFlags Flags = RF_None,
-                          ValueMapTypeRemapper *TypeMapper = nullptr,
-                          ValueMaterializer *Materializer = nullptr) {
-    return cast<MDNode>(MapValue((const Value*)V, VM, Flags, TypeMapper,
-                                 Materializer));
-  }
+  /// MapValue - provide versions that preserve type safety for Constants.
   inline Constant *MapValue(const Constant *V, ValueToValueMapTy &VM,
                             RemapFlags Flags = RF_None,
                             ValueMapTypeRemapper *TypeMapper = nullptr,
diff --git a/include/llvm/Transforms/Utils/VectorUtils.h b/include/llvm/Transforms/Utils/VectorUtils.h
index 83871fc..9f0fb19 100644
--- a/include/llvm/Transforms/Utils/VectorUtils.h
+++ b/include/llvm/Transforms/Utils/VectorUtils.h
@@ -14,9 +14,9 @@
 #ifndef LLVM_TRANSFORMS_UTILS_VECTORUTILS_H
 #define LLVM_TRANSFORMS_UTILS_VECTORUTILS_H
 
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/IR/Intrinsics.h"
 
 namespace llvm {
 
@@ -59,7 +59,7 @@ static inline bool isTriviallyVectorizable(Intrinsic::ID ID) {
   }
 }
 
-static bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID,
+static inline bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID,
                                          unsigned ScalarOpdIdx) {
   switch (ID) {
     case Intrinsic::ctlz:
diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap
index 46f6e40..0f9c22e 100644
--- a/include/llvm/module.modulemap
+++ b/include/llvm/module.modulemap
@@ -40,7 +40,43 @@ module LLVM_Backend {
 
 module LLVM_Bitcode { requires cplusplus umbrella "Bitcode" module * { export * } }
 module LLVM_Config { requires cplusplus umbrella "Config" module * { export * } }
-module LLVM_DebugInfo { requires cplusplus umbrella "DebugInfo" module * { export * } }
+
+module LLVM_DebugInfo_DWARF {
+  requires cplusplus
+
+  umbrella "DebugInfo/DWARF"
+  module * { export * }
+}
+
+module LLVM_DebugInfo_PDB {
+  requires cplusplus
+
+  umbrella "DebugInfo/PDB"
+  module * { export * }
+
+  // Separate out this subdirectory; it's an optional component that depends on
+  // a separate library which might not be available.
+  //
+  // FIXME: There should be a better way to specify this.
+  exclude header "DebugInfo/PDB/DIA/DIADataStream.h"
+  exclude header "DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
+  exclude header "DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
+  exclude header "DebugInfo/PDB/DIA/DIAEnumSourceFiles.h"
+  exclude header "DebugInfo/PDB/DIA/DIAEnumSymbols.h"
+  exclude header "DebugInfo/PDB/DIA/DIALineNumber.h"
+  exclude header "DebugInfo/PDB/DIA/DIARawSymbol.h"
+  exclude header "DebugInfo/PDB/DIA/DIASession.h"
+  exclude header "DebugInfo/PDB/DIA/DIASourceFile.h"
+  exclude header "DebugInfo/PDB/DIA/DIASupport.h"
+}
+
+module LLVM_DebugInfo_PDB_DIA {
+  requires cplusplus
+
+  umbrella "DebugInfo/PDB/DIA"
+  module * { export * }
+}
+
 module LLVM_ExecutionEngine {
   requires cplusplus
 
@@ -55,6 +91,7 @@ module LLVM_ExecutionEngine {
   exclude header "ExecutionEngine/JIT.h"
   exclude header "ExecutionEngine/MCJIT.h"
   exclude header "ExecutionEngine/Interpreter.h"
+  exclude header "ExecutionEngine/OrcMCJITReplacement.h"
 }
 
 module LLVM_IR {
@@ -70,26 +107,10 @@ module LLVM_IR {
   umbrella "IR"
   module * { export * }
 
-  // We cannot have llvm/PassManager.h and llvm/IR/PassManager.h in the same TU,
-  // so we can't include llvm/IR/PassManager.h in the IR module.
-  exclude header "IR/PassManager.h"
-  exclude header "IR/LegacyPassManager.h"
-
-  // Exclude this; it's intended for (repeated) textual inclusion.
-  exclude header "IR/Instruction.def"
-}
-
-module LLVM_LegacyPassManager {
-  requires cplusplus
-  module CompatInterface { header "PassManager.h" export * }
-  module Implementation { header "IR/LegacyPassManager.h" export * }
-}
-
-module LLVM_IR_PassManager {
-  requires cplusplus
-  // FIXME PR19358: This doesn't work! conflict LLVM_LegacyPassManager, "cannot use legacy pass manager and new pass manager in same file"
-  header "IR/PassManager.h"
-  export *
+  // These are intended for (repeated) textual inclusion.
+  textual header "IR/DebugInfoFlags.def"
+  textual header "IR/Instruction.def"
+  textual header "IR/Metadata.def"
 }
 
 module LLVM_IRReader { requires cplusplus umbrella "IRReader" module * { export * } }
@@ -161,6 +182,19 @@ module LLVM_Utils {
 
     // FIXME: Mislayered?
     exclude header "Support/TargetRegistry.h"
+
+    // These are intended for textual inclusion.
+    textual header "Support/Dwarf.def"
+    textual header "Support/ELFRelocs/AArch64.def"
+    textual header "Support/ELFRelocs/ARM.def"
+    textual header "Support/ELFRelocs/Hexagon.def"
+    textual header "Support/ELFRelocs/i386.def"
+    textual header "Support/ELFRelocs/Mips.def"
+    textual header "Support/ELFRelocs/PowerPC64.def"
+    textual header "Support/ELFRelocs/PowerPC.def"
+    textual header "Support/ELFRelocs/Sparc.def"
+    textual header "Support/ELFRelocs/SystemZ.def"
+    textual header "Support/ELFRelocs/x86_64.def"
   }
 }
 
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 5171a45..4e95aa0 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
@@ -37,7 +38,6 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 using namespace llvm;
 
 // Register the AliasAnalysis interface, providing a nice name to refer to.
@@ -465,7 +465,8 @@ AliasAnalysis::~AliasAnalysis() {}
 void AliasAnalysis::InitializeAliasAnalysis(Pass *P) {
   DataLayoutPass *DLP = P->getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = P->getAnalysisIfAvailable<TargetLibraryInfo>();
+  auto *TLIP = P->getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  TLI = TLIP ? &TLIP->getTLI() : nullptr;
   AA = &P->getAnalysis<AliasAnalysis>();
 }
 
@@ -483,21 +484,22 @@ uint64_t AliasAnalysis::getTypeStoreSize(Type *Ty) {
 }
 
 /// canBasicBlockModify - Return true if it is possible for execution of the
-/// specified basic block to modify the value pointed to by Ptr.
+/// specified basic block to modify the location Loc.
 ///
 bool AliasAnalysis::canBasicBlockModify(const BasicBlock &BB,
                                         const Location &Loc) {
-  return canInstructionRangeModify(BB.front(), BB.back(), Loc);
+  return canInstructionRangeModRef(BB.front(), BB.back(), Loc, Mod);
 }
 
-/// canInstructionRangeModify - Return true if it is possible for the execution
-/// of the specified instructions to modify the value pointed to by Ptr.  The
-/// instructions to consider are all of the instructions in the range of [I1,I2]
-/// INCLUSIVE.  I1 and I2 must be in the same basic block.
-///
-bool AliasAnalysis::canInstructionRangeModify(const Instruction &I1,
+/// canInstructionRangeModRef - Return true if it is possible for the
+/// execution of the specified instructions to mod\ref (according to the
+/// mode) the location Loc. The instructions to consider are all
+/// of the instructions in the range of [I1,I2] INCLUSIVE.
+/// I1 and I2 must be in the same basic block.  
+bool AliasAnalysis::canInstructionRangeModRef(const Instruction &I1,
                                               const Instruction &I2,
-                                              const Location &Loc) {
+                                              const Location &Loc,
+                                              const ModRefResult Mode) {
   assert(I1.getParent() == I2.getParent() &&
          "Instructions not in same basic block!");
   BasicBlock::const_iterator I = &I1;
@@ -505,7 +507,7 @@ bool AliasAnalysis::canInstructionRangeModify(const Instruction &I1,
   ++E;  // Convert from inclusive to exclusive range.
 
   for (; I != E; ++I) // Check every instruction in range
-    if (getModRefInfo(I, Loc) & Mod)
+    if (getModRefInfo(I, Loc) & Mode)
       return true;
   return false;
 }
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index f64bf0e..1bfb06d 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -53,8 +53,9 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeLazyValueInfoPass(Registry);
   initializeLibCallAliasAnalysisPass(Registry);
   initializeLintPass(Registry);
-  initializeLoopInfoPass(Registry);
+  initializeLoopInfoWrapperPassPass(Registry);
   initializeMemDepPrinterPass(Registry);
+  initializeMemDerefPrinterPass(Registry);
   initializeMemoryDependenceAnalysisPass(Registry);
   initializeModuleDebugInfoPrinterPass(Registry);
   initializePostDominatorTreePass(Registry);
@@ -65,7 +66,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeRegionOnlyPrinterPass(Registry);
   initializeScalarEvolutionPass(Registry);
   initializeScalarEvolutionAliasAnalysisPass(Registry);
-  initializeTargetTransformInfoAnalysisGroup(Registry);
+  initializeTargetTransformInfoWrapperPassPass(Registry);
   initializeTypeBasedAliasAnalysisPass(Registry);
   initializeScopedNoAliasAAPass(Registry);
 }
diff --git a/lib/Analysis/Android.mk b/lib/Analysis/Android.mk
index 8770fa7..e17b870 100644
--- a/lib/Analysis/Android.mk
+++ b/lib/Analysis/Android.mk
@@ -7,7 +7,7 @@ analysis_SRC_FILES := \
   AliasDebugger.cpp \
   AliasSetTracker.cpp \
   Analysis.cpp \
-  AssumptionTracker.cpp \
+  AssumptionCache.cpp \
   BasicAliasAnalysis.cpp \
   BlockFrequencyInfo.cpp \
   BlockFrequencyInfoImpl.cpp \
@@ -24,7 +24,6 @@ analysis_SRC_FILES := \
   DependenceAnalysis.cpp \
   DomPrinter.cpp \
   DominanceFrontier.cpp \
-  FunctionTargetTransformInfo.cpp \
   IVUsers.cpp \
   InstCount.cpp \
   InstructionSimplify.cpp \
@@ -37,9 +36,11 @@ analysis_SRC_FILES := \
   LibCallSemantics.cpp \
   Lint.cpp \
   Loads.cpp \
+  LoopAccessAnalysis.cpp \
   LoopInfo.cpp \
   LoopPass.cpp \
   MemDepPrinter.cpp \
+  MemDerefPrinter.cpp \
   MemoryBuiltins.cpp \
   MemoryDependenceAnalysis.cpp \
   ModuleDebugInfoPrinter.cpp \
@@ -56,6 +57,7 @@ analysis_SRC_FILES := \
   ScalarEvolutionNormalization.cpp \
   ScopedNoAliasAA.cpp \
   SparsePropagation.cpp \
+  TargetLibraryInfo.cpp \
   TargetTransformInfo.cpp \
   Trace.cpp \
   TypeBasedAliasAnalysis.cpp \
diff --git a/lib/Analysis/AssumptionCache.cpp b/lib/Analysis/AssumptionCache.cpp
new file mode 100644
index 0000000..f468a43
--- /dev/null
+++ b/lib/Analysis/AssumptionCache.cpp
@@ -0,0 +1,140 @@
+//===- AssumptionCache.cpp - Cache finding @llvm.assume calls -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that keeps track of @llvm.assume intrinsics in
+// the functions of a module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+void AssumptionCache::scanFunction() {
+  assert(!Scanned && "Tried to scan the function twice!");
+  assert(AssumeHandles.empty() && "Already have assumes when scanning!");
+
+  // Go through all instructions in all blocks, add all calls to @llvm.assume
+  // to this cache.
+  for (BasicBlock &B : F)
+    for (Instruction &II : B)
+      if (match(&II, m_Intrinsic<Intrinsic::assume>()))
+        AssumeHandles.push_back(&II);
+
+  // Mark the scan as complete.
+  Scanned = true;
+}
+
+void AssumptionCache::registerAssumption(CallInst *CI) {
+  assert(match(CI, m_Intrinsic<Intrinsic::assume>()) &&
+         "Registered call does not call @llvm.assume");
+
+  // If we haven't scanned the function yet, just drop this assumption. It will
+  // be found when we scan later.
+  if (!Scanned)
+    return;
+
+  AssumeHandles.push_back(CI);
+
+#ifndef NDEBUG
+  assert(CI->getParent() &&
+         "Cannot register @llvm.assume call not in a basic block");
+  assert(&F == CI->getParent()->getParent() &&
+         "Cannot register @llvm.assume call not in this function");
+
+  // We expect the number of assumptions to be small, so in an asserts build
+  // check that we don't accumulate duplicates and that all assumptions point
+  // to the same function.
+  SmallPtrSet<Value *, 16> AssumptionSet;
+  for (auto &VH : AssumeHandles) {
+    if (!VH)
+      continue;
+
+    assert(&F == cast<Instruction>(VH)->getParent()->getParent() &&
+           "Cached assumption not inside this function!");
+    assert(match(cast<CallInst>(VH), m_Intrinsic<Intrinsic::assume>()) &&
+           "Cached something other than a call to @llvm.assume!");
+    assert(AssumptionSet.insert(VH).second &&
+           "Cache contains multiple copies of a call!");
+  }
+#endif
+}
+
+char AssumptionAnalysis::PassID;
+
+PreservedAnalyses AssumptionPrinterPass::run(Function &F,
+                                             AnalysisManager<Function> *AM) {
+  AssumptionCache &AC = AM->getResult<AssumptionAnalysis>(F);
+
+  OS << "Cached assumptions for function: " << F.getName() << "\n";
+  for (auto &VH : AC.assumptions())
+    if (VH)
+      OS << "  " << *cast<CallInst>(VH)->getArgOperand(0) << "\n";
+
+  return PreservedAnalyses::all();
+}
+
+void AssumptionCacheTracker::FunctionCallbackVH::deleted() {
+  auto I = ACT->AssumptionCaches.find_as(cast<Function>(getValPtr()));
+  if (I != ACT->AssumptionCaches.end())
+    ACT->AssumptionCaches.erase(I);
+  // 'this' now dangles!
+}
+
+AssumptionCache &AssumptionCacheTracker::getAssumptionCache(Function &F) {
+  // We probe the function map twice to try and avoid creating a value handle
+  // around the function in common cases. This makes insertion a bit slower,
+  // but if we have to insert we're going to scan the whole function so that
+  // shouldn't matter.
+  auto I = AssumptionCaches.find_as(&F);
+  if (I != AssumptionCaches.end())
+    return *I->second;
+
+  // Ok, build a new cache by scanning the function, insert it and the value
+  // handle into our map, and return the newly populated cache.
+  auto IP = AssumptionCaches.insert(std::make_pair(
+      FunctionCallbackVH(&F, this), llvm::make_unique<AssumptionCache>(F)));
+  assert(IP.second && "Scanning function already in the map?");
+  return *IP.first->second;
+}
+
+void AssumptionCacheTracker::verifyAnalysis() const {
+#ifndef NDEBUG
+  SmallPtrSet<const CallInst *, 4> AssumptionSet;
+  for (const auto &I : AssumptionCaches) {
+    for (auto &VH : I.second->assumptions())
+      if (VH)
+        AssumptionSet.insert(cast<CallInst>(VH));
+
+    for (const BasicBlock &B : cast<Function>(*I.first))
+      for (const Instruction &II : B)
+        if (match(&II, m_Intrinsic<Intrinsic::assume>()))
+          assert(AssumptionSet.count(cast<CallInst>(&II)) &&
+                 "Assumption in scanned function not in cache");
+  }
+#endif
+}
+
+AssumptionCacheTracker::AssumptionCacheTracker() : ImmutablePass(ID) {
+  initializeAssumptionCacheTrackerPass(*PassRegistry::getPassRegistry());
+}
+
+AssumptionCacheTracker::~AssumptionCacheTracker() {}
+
+INITIALIZE_PASS(AssumptionCacheTracker, "assumption-cache-tracker",
+                "Assumption Cache Tracker", false, true)
+char AssumptionCacheTracker::ID = 0;
diff --git a/lib/Analysis/AssumptionTracker.cpp b/lib/Analysis/AssumptionTracker.cpp
deleted file mode 100644
index 775ce1d..0000000
--- a/lib/Analysis/AssumptionTracker.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-//===- AssumptionTracker.cpp - Track @llvm.assume -------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a pass that keeps track of @llvm.assume intrinsics in
-// the functions of a module.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/AssumptionTracker.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Debug.h"
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-void AssumptionTracker::FunctionCallbackVH::deleted() {
-  AT->forgetCachedAssumptions(cast<Function>(getValPtr()));
-  // 'this' now dangles!
-}
-
-void AssumptionTracker::forgetCachedAssumptions(Function *F) {
-  auto I = CachedAssumeCalls.find_as(F);
-  if (I != CachedAssumeCalls.end())
-    CachedAssumeCalls.erase(I);
-}
-
-void AssumptionTracker::CallCallbackVH::deleted() {
-  assert(F && "delete callback called on dummy handle");
-  FunctionCallsMap::iterator I = AT->CachedAssumeCalls.find_as(F);
-  assert(I != AT->CachedAssumeCalls.end() &&
-         "Function cleared from the map without removing the values?");
-
-  I->second->erase(*this);
-  // 'this' now dangles!
-}
-
-AssumptionTracker::FunctionCallsMap::iterator
-AssumptionTracker::scanFunction(Function *F) {
-  auto IP = CachedAssumeCalls.insert(std::make_pair(
-      FunctionCallbackVH(F, this), llvm::make_unique<CallHandleSet>()));
-  assert(IP.second && "Scanning function already in the map?");
-
-  FunctionCallsMap::iterator I = IP.first;
-
-  // Go through all instructions in all blocks, add all calls to @llvm.assume
-  // to our cache.
-  for (BasicBlock &B : *F)
-    for (Instruction &II : B)
-      if (match(&II, m_Intrinsic<Intrinsic::assume>()))
-        I->second->insert(CallCallbackVH(&II, this));
-
-  return I;
-}
-
-void AssumptionTracker::verifyAnalysis() const {
-#ifndef NDEBUG
-  for (const auto &I : CachedAssumeCalls) {
-    for (const BasicBlock &B : cast<Function>(*I.first))
-      for (const Instruction &II : B) {
-        if (match(&II, m_Intrinsic<Intrinsic::assume>())) {
-          assert(I.second->find_as(&II) != I.second->end() &&
-                 "Assumption in scanned function not in cache");
-        }
-    }
-  }
-#endif
-}
-
-void AssumptionTracker::registerAssumption(CallInst *CI) {
-  assert(match(CI, m_Intrinsic<Intrinsic::assume>()) &&
-         "Registered call does not call @llvm.assume");
-  assert(CI->getParent() &&
-         "Cannot register @llvm.assume call not in a basic block");
-
-  Function *F = CI->getParent()->getParent();
-  assert(F && "Cannot register @llvm.assume call not in a function");
-
-  FunctionCallsMap::iterator I = CachedAssumeCalls.find_as(F);
-  if (I == CachedAssumeCalls.end()) {
-    // If this function has not already been scanned, then don't do anything
-    // here. This intrinsic will be found, if it still exists, if the list of
-    // assumptions in this function is requested at some later point. This
-    // maintains the following invariant: if a function is present in the
-    // cache, then its list of assumption intrinsic calls is complete.
-    return;
-  }
-
-  I->second->insert(CallCallbackVH(CI, this));
-}
-
-AssumptionTracker::AssumptionTracker() : ImmutablePass(ID) {
-  initializeAssumptionTrackerPass(*PassRegistry::getPassRegistry());
-}
-
-AssumptionTracker::~AssumptionTracker() {}
-
-INITIALIZE_PASS(AssumptionTracker, "assumption-tracker", "Assumption Tracker",
-                false, true)
-char AssumptionTracker::ID = 0;
-
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 9aba0d3..46ca6ee 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -17,12 +17,13 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -38,7 +39,6 @@
 #include "llvm/IR/Operator.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -196,8 +196,7 @@ namespace {
 static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
                                   ExtensionKind &Extension,
                                   const DataLayout &DL, unsigned Depth,
-                                  AssumptionTracker *AT,
-                                  DominatorTree *DT) {
+                                  AssumptionCache *AC, DominatorTree *DT) {
   assert(V->getType()->isIntegerTy() && "Not an integer value");
 
   // Limit our recursion depth.
@@ -222,24 +221,24 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
       case Instruction::Or:
         // X|C == X+C if all the bits in C are unset in X.  Otherwise we can't
         // analyze it.
-        if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), &DL, 0,
-                               AT, BOp, DT))
+        if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), &DL, 0, AC,
+                               BOp, DT))
           break;
         // FALL THROUGH.
       case Instruction::Add:
         V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension,
-                                DL, Depth+1, AT, DT);
+                                DL, Depth + 1, AC, DT);
         Offset += RHSC->getValue();
         return V;
       case Instruction::Mul:
         V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension,
-                                DL, Depth+1, AT, DT);
+                                DL, Depth + 1, AC, DT);
         Offset *= RHSC->getValue();
         Scale *= RHSC->getValue();
         return V;
       case Instruction::Shl:
         V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension,
-                                DL, Depth+1, AT, DT);
+                                DL, Depth + 1, AC, DT);
         Offset <<= RHSC->getValue().getLimitedValue();
         Scale <<= RHSC->getValue().getLimitedValue();
         return V;
@@ -259,8 +258,8 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
     Offset = Offset.trunc(SmallWidth);
     Extension = isa<SExtInst>(V) ? EK_SignExt : EK_ZeroExt;
 
-    Value *Result = GetLinearExpression(CastOp, Scale, Offset, Extension,
-                                        DL, Depth+1, AT, DT);
+    Value *Result = GetLinearExpression(CastOp, Scale, Offset, Extension, DL,
+                                        Depth + 1, AC, DT);
     Scale = Scale.zext(OldWidth);
 
     // We have to sign-extend even if Extension == EK_ZeroExt as we can't
@@ -294,7 +293,7 @@ static const Value *
 DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
                        SmallVectorImpl<VariableGEPIndex> &VarIndices,
                        bool &MaxLookupReached, const DataLayout *DL,
-                       AssumptionTracker *AT, DominatorTree *DT) {
+                       AssumptionCache *AC, DominatorTree *DT) {
   // Limit recursion depth to limit compile time in crazy cases.
   unsigned MaxLookup = MaxLookupSearchDepth;
   MaxLookupReached = false;
@@ -325,7 +324,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
       // If it's not a GEP, hand it off to SimplifyInstruction to see if it
       // can come up with something. This matches what GetUnderlyingObject does.
       if (const Instruction *I = dyn_cast<Instruction>(V))
-        // TODO: Get a DominatorTree and AssumptionTracker and use them here
+        // TODO: Get a DominatorTree and AssumptionCache and use them here
         // (these are both now available in this function, but this should be
         // updated when GetUnderlyingObject is updated). TLI should be
         // provided also.
@@ -387,7 +386,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
       // Use GetLinearExpression to decompose the index into a C1*V+C2 form.
       APInt IndexScale(Width, 0), IndexOffset(Width, 0);
       Index = GetLinearExpression(Index, IndexScale, IndexOffset, Extension,
-                                  *DL, 0, AT, DT);
+                                  *DL, 0, AC, DT);
 
       // The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
       // This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
@@ -468,8 +467,8 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<AliasAnalysis>();
-      AU.addRequired<AssumptionTracker>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
 
     AliasResult alias(const Location &LocA, const Location &LocB) override {
@@ -591,8 +590,8 @@ char BasicAliasAnalysis::ID = 0;
 INITIALIZE_AG_PASS_BEGIN(BasicAliasAnalysis, AliasAnalysis, "basicaa",
                    "Basic Alias Analysis (stateless AA impl)",
                    false, true, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_AG_PASS_END(BasicAliasAnalysis, AliasAnalysis, "basicaa",
                    "Basic Alias Analysis (stateless AA impl)",
                    false, true, false)
@@ -719,7 +718,8 @@ BasicAliasAnalysis::getModRefBehavior(const Function *F) {
   if (F->onlyReadsMemory())
     Min = OnlyReadsMemory;
 
-  const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfo>();
+  const TargetLibraryInfo &TLI =
+      getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   if (isMemsetPattern16(F, TLI))
     Min = OnlyAccessesArgumentPointees;
 
@@ -731,7 +731,8 @@ AliasAnalysis::Location
 BasicAliasAnalysis::getArgLocation(ImmutableCallSite CS, unsigned ArgIdx,
                                    ModRefResult &Mask) {
   Location Loc = AliasAnalysis::getArgLocation(CS, ArgIdx, Mask);
-  const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfo>();
+  const TargetLibraryInfo &TLI =
+      getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction());
   if (II != nullptr)
     switch (II->getIntrinsicID()) {
@@ -889,6 +890,99 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS1,
   return AliasAnalysis::getModRefInfo(CS1, CS2);
 }
 
+/// \brief Provide ad-hoc rules to disambiguate accesses through two GEP
+/// operators, both having the exact same pointer operand.
+static AliasAnalysis::AliasResult
+aliasSameBasePointerGEPs(const GEPOperator *GEP1, uint64_t V1Size,
+                         const GEPOperator *GEP2, uint64_t V2Size,
+                         const DataLayout &DL) {
+
+  assert(GEP1->getPointerOperand() == GEP2->getPointerOperand() &&
+         "Expected GEPs with the same pointer operand");
+
+  // Try to determine whether GEP1 and GEP2 index through arrays, into structs,
+  // such that the struct field accesses provably cannot alias.
+  // We also need at least two indices (the pointer, and the struct field).
+  if (GEP1->getNumIndices() != GEP2->getNumIndices() ||
+      GEP1->getNumIndices() < 2)
+    return AliasAnalysis::MayAlias;
+
+  // If we don't know the size of the accesses through both GEPs, we can't
+  // determine whether the struct fields accessed can't alias.
+  if (V1Size == AliasAnalysis::UnknownSize ||
+      V2Size == AliasAnalysis::UnknownSize)
+    return AliasAnalysis::MayAlias;
+
+  ConstantInt *C1 =
+      dyn_cast<ConstantInt>(GEP1->getOperand(GEP1->getNumOperands() - 1));
+  ConstantInt *C2 =
+      dyn_cast<ConstantInt>(GEP2->getOperand(GEP2->getNumOperands() - 1));
+
+  // If the last (struct) indices aren't constants, we can't say anything.
+  // If they're identical, the other indices might be also be dynamically
+  // equal, so the GEPs can alias.
+  if (!C1 || !C2 || C1 == C2)
+    return AliasAnalysis::MayAlias;
+
+  // Find the last-indexed type of the GEP, i.e., the type you'd get if
+  // you stripped the last index.
+  // On the way, look at each indexed type.  If there's something other
+  // than an array, different indices can lead to different final types.
+  SmallVector<Value *, 8> IntermediateIndices;
+
+  // Insert the first index; we don't need to check the type indexed
+  // through it as it only drops the pointer indirection.
+  assert(GEP1->getNumIndices() > 1 && "Not enough GEP indices to examine");
+  IntermediateIndices.push_back(GEP1->getOperand(1));
+
+  // Insert all the remaining indices but the last one.
+  // Also, check that they all index through arrays.
+  for (unsigned i = 1, e = GEP1->getNumIndices() - 1; i != e; ++i) {
+    if (!isa<ArrayType>(GetElementPtrInst::getIndexedType(
+            GEP1->getPointerOperandType(), IntermediateIndices)))
+      return AliasAnalysis::MayAlias;
+    IntermediateIndices.push_back(GEP1->getOperand(i + 1));
+  }
+
+  StructType *LastIndexedStruct =
+      dyn_cast<StructType>(GetElementPtrInst::getIndexedType(
+          GEP1->getPointerOperandType(), IntermediateIndices));
+
+  if (!LastIndexedStruct)
+    return AliasAnalysis::MayAlias;
+
+  // We know that:
+  // - both GEPs begin indexing from the exact same pointer;
+  // - the last indices in both GEPs are constants, indexing into a struct;
+  // - said indices are different, hence, the pointed-to fields are different;
+  // - both GEPs only index through arrays prior to that.
+  //
+  // This lets us determine that the struct that GEP1 indexes into and the
+  // struct that GEP2 indexes into must either precisely overlap or be
+  // completely disjoint.  Because they cannot partially overlap, indexing into
+  // different non-overlapping fields of the struct will never alias.
+
+  // Therefore, the only remaining thing needed to show that both GEPs can't
+  // alias is that the fields are not overlapping.
+  const StructLayout *SL = DL.getStructLayout(LastIndexedStruct);
+  const uint64_t StructSize = SL->getSizeInBytes();
+  const uint64_t V1Off = SL->getElementOffset(C1->getZExtValue());
+  const uint64_t V2Off = SL->getElementOffset(C2->getZExtValue());
+
+  auto EltsDontOverlap = [StructSize](uint64_t V1Off, uint64_t V1Size,
+                                      uint64_t V2Off, uint64_t V2Size) {
+    return V1Off < V2Off && V1Off + V1Size <= V2Off &&
+           ((V2Off + V2Size <= StructSize) ||
+            (V2Off + V2Size - StructSize <= V1Off));
+  };
+
+  if (EltsDontOverlap(V1Off, V1Size, V2Off, V2Size) ||
+      EltsDontOverlap(V2Off, V2Size, V1Off, V1Size))
+    return AliasAnalysis::NoAlias;
+
+  return AliasAnalysis::MayAlias;
+}
+
 /// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction
 /// against another pointer.  We know that V1 is a GEP, but we don't know
 /// anything about V2.  UnderlyingV1 is GetUnderlyingObject(GEP1, DL),
@@ -905,7 +999,22 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
   bool GEP1MaxLookupReached;
   SmallVector<VariableGEPIndex, 4> GEP1VariableIndices;
 
-  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+  // We have to get two AssumptionCaches here because GEP1 and V2 may be from
+  // different functions.
+  // FIXME: This really doesn't make any sense. We get a dominator tree below
+  // that can only refer to a single function. But this function (aliasGEP) is
+  // a method on an immutable pass that can be called when there *isn't*
+  // a single function. The old pass management layer makes this "work", but
+  // this isn't really a clean solution.
+  AssumptionCacheTracker &ACT = getAnalysis<AssumptionCacheTracker>();
+  AssumptionCache *AC1 = nullptr, *AC2 = nullptr;
+  if (auto *GEP1I = dyn_cast<Instruction>(GEP1))
+    AC1 = &ACT.getAssumptionCache(
+        const_cast<Function &>(*GEP1I->getParent()->getParent()));
+  if (auto *I2 = dyn_cast<Instruction>(V2))
+    AC2 = &ACT.getAssumptionCache(
+        const_cast<Function &>(*I2->getParent()->getParent()));
+
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
@@ -932,11 +1041,11 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
         bool GEP2MaxLookupReached;
         SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
         const Value *GEP2BasePtr =
-          DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices,
-                                 GEP2MaxLookupReached, DL, AT, DT);
+            DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices,
+                                   GEP2MaxLookupReached, DL, AC2, DT);
         const Value *GEP1BasePtr =
-          DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
-                                 GEP1MaxLookupReached, DL, AT, DT);
+            DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
+                                   GEP1MaxLookupReached, DL, AC1, DT);
         // DecomposeGEPExpression and GetUnderlyingObject should return the
         // same result except when DecomposeGEPExpression has no DataLayout.
         if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
@@ -964,15 +1073,15 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
     // exactly, see if the computed offset from the common pointer tells us
     // about the relation of the resulting pointer.
     const Value *GEP1BasePtr =
-      DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
-                             GEP1MaxLookupReached, DL, AT, DT);
+        DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
+                               GEP1MaxLookupReached, DL, AC1, DT);
 
     int64_t GEP2BaseOffset;
     bool GEP2MaxLookupReached;
     SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
     const Value *GEP2BasePtr =
-      DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices,
-                             GEP2MaxLookupReached, DL, AT, DT);
+        DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices,
+                               GEP2MaxLookupReached, DL, AC2, DT);
 
     // DecomposeGEPExpression and GetUnderlyingObject should return the
     // same result except when DecomposeGEPExpression has no DataLayout.
@@ -981,6 +1090,17 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
              "DecomposeGEPExpression and GetUnderlyingObject disagree!");
       return MayAlias;
     }
+
+    // If we know the two GEPs are based off of the exact same pointer (and not
+    // just the same underlying object), see if that tells us anything about
+    // the resulting pointers.
+    if (DL && GEP1->getPointerOperand() == GEP2->getPointerOperand()) {
+      AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, *DL);
+      // If we couldn't find anything interesting, don't abandon just yet.
+      if (R != MayAlias)
+        return R;
+    }
+
     // If the max search depth is reached the result is undefined
     if (GEP2MaxLookupReached || GEP1MaxLookupReached)
       return MayAlias;
@@ -1010,8 +1130,8 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
       return R;
 
     const Value *GEP1BasePtr =
-      DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
-                             GEP1MaxLookupReached, DL, AT, DT);
+        DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
+                               GEP1MaxLookupReached, DL, AC1, DT);
 
     // DecomposeGEPExpression and GetUnderlyingObject should return the
     // same result except when DecomposeGEPExpression has no DataLayout.
@@ -1080,10 +1200,8 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
         const Value *V = GEP1VariableIndices[i].V;
 
         bool SignKnownZero, SignKnownOne;
-        ComputeSignBit(
-          const_cast<Value *>(V),
-          SignKnownZero, SignKnownOne,
-          DL, 0, AT, nullptr, DT);
+        ComputeSignBit(const_cast<Value *>(V), SignKnownZero, SignKnownOne, DL,
+                       0, AC1, nullptr, DT);
 
         // Zero-extension widens the variable, and so forces the sign
         // bit to zero.
@@ -1422,7 +1540,8 @@ bool BasicAliasAnalysis::isValueEqualInPotentialCycles(const Value *V,
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  LoopInfo *LI = getAnalysisIfAvailable<LoopInfo>();
+  auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+  LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
 
   // Make sure that the visited phis cannot reach the Value. This ensures that
   // the Values cannot come from different iterations of a potential cycle the
diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp
index 8ed8e3e..37f2fae 100644
--- a/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/lib/Analysis/BlockFrequencyInfo.cpp
@@ -108,7 +108,7 @@ struct DOTGraphTraits<BlockFrequencyInfo*> : public DefaultDOTGraphTraits {
 INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq",
                       "Block Frequency Analysis", true, true)
 INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq",
                     "Block Frequency Analysis", true, true)
 
@@ -123,13 +123,13 @@ BlockFrequencyInfo::~BlockFrequencyInfo() {}
 
 void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<BranchProbabilityInfo>();
-  AU.addRequired<LoopInfo>();
+  AU.addRequired<LoopInfoWrapperPass>();
   AU.setPreservesAll();
 }
 
 bool BlockFrequencyInfo::runOnFunction(Function &F) {
   BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>();
-  LoopInfo &LI = getAnalysis<LoopInfo>();
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   if (!BFI)
     BFI.reset(new ImplType);
   BFI->doFunction(&F, &BPI, &LI);
diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 06b8acd..278073c 100644
--- a/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/Support/raw_ostream.h"
+#include <numeric>
 
 using namespace llvm;
 using namespace llvm::bfi_detail;
@@ -122,8 +123,12 @@ static void combineWeight(Weight &W, const Weight &OtherW) {
   }
   assert(W.Type == OtherW.Type);
   assert(W.TargetNode == OtherW.TargetNode);
-  assert(W.Amount < W.Amount + OtherW.Amount && "Unexpected overflow");
-  W.Amount += OtherW.Amount;
+  assert(OtherW.Amount && "Expected non-zero weight");
+  if (W.Amount > W.Amount + OtherW.Amount)
+    // Saturate on overflow.
+    W.Amount = UINT64_MAX;
+  else
+    W.Amount += OtherW.Amount;
 }
 static void combineWeightsBySorting(WeightList &Weights) {
   // Sort so edges to the same node are adjacent.
@@ -206,11 +211,19 @@ void Distribution::normalize() {
     Shift = 33 - countLeadingZeros(Total);
 
   // Early exit if nothing needs to be scaled.
-  if (!Shift)
+  if (!Shift) {
+    // If we didn't overflow then combineWeights() shouldn't have changed the
+    // sum of the weights, but let's double-check.
+    assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0),
+                                    [](uint64_t Sum, const Weight &W) {
+                      return Sum + W.Amount;
+                    }) &&
+           "Expected total to be correct");
     return;
+  }
 
   // Recompute the total through accumulation (rather than shifting it) so that
-  // it's accurate after shifting.
+  // it's accurate after shifting and any changes combineWeights() made above.
   Total = 0;
 
   // Sum the weights to each node and shift right if necessary.
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index bbd8750..8cd6ea4 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
 
 INITIALIZE_PASS_BEGIN(BranchProbabilityInfo, "branch-prob",
                       "Branch Probability Analysis", false, true)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(BranchProbabilityInfo, "branch-prob",
                     "Branch Probability Analysis", false, true)
 
@@ -196,7 +196,8 @@ bool BranchProbabilityInfo::calcMetadataWeights(BasicBlock *BB) {
   SmallVector<uint32_t, 2> Weights;
   Weights.reserve(TI->getNumSuccessors());
   for (unsigned i = 1, e = WeightsNode->getNumOperands(); i != e; ++i) {
-    ConstantInt *Weight = dyn_cast<ConstantInt>(WeightsNode->getOperand(i));
+    ConstantInt *Weight =
+        mdconst::dyn_extract<ConstantInt>(WeightsNode->getOperand(i));
     if (!Weight)
       return false;
     Weights.push_back(
@@ -483,7 +484,7 @@ bool BranchProbabilityInfo::calcInvokeHeuristics(BasicBlock *BB) {
 }
 
 void BranchProbabilityInfo::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<LoopInfo>();
+  AU.addRequired<LoopInfoWrapperPass>();
   AU.setPreservesAll();
 }
 
@@ -491,7 +492,7 @@ bool BranchProbabilityInfo::runOnFunction(Function &F) {
   DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName()
                << " ----\n\n");
   LastF = &F; // Store the last function we ran on for printing.
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   assert(PostDominatedByUnreachable.empty());
   assert(PostDominatedByColdCall.empty());
 
diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp
index 25e7bc0..8ecd70b 100644
--- a/lib/Analysis/CFG.cpp
+++ b/lib/Analysis/CFG.cpp
@@ -27,7 +27,7 @@ using namespace llvm;
 void llvm::FindFunctionBackedges(const Function &F,
      SmallVectorImpl<std::pair<const BasicBlock*,const BasicBlock*> > &Result) {
   const BasicBlock *BB = &F.getEntryBlock();
-  if (succ_begin(BB) == succ_end(BB))
+  if (succ_empty(BB))
     return;
 
   SmallPtrSet<const BasicBlock*, 8> Visited;
diff --git a/lib/Analysis/CFLAliasAnalysis.cpp b/lib/Analysis/CFLAliasAnalysis.cpp
index 5f1b3d3..82fbfe0 100644
--- a/lib/Analysis/CFLAliasAnalysis.cpp
+++ b/lib/Analysis/CFLAliasAnalysis.cpp
@@ -29,20 +29,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "StratifiedSets.h"
-#include "llvm/Analysis/Passes.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <algorithm>
 #include <cassert>
@@ -51,6 +52,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "cfl-aa"
+
 // Try to go from a Value* to a Function*. Never returns nullptr.
 static Optional<Function *> parentFunctionOfValue(Value *);
 
@@ -227,10 +230,14 @@ public:
     // Comparisons between global variables and other constants should be
     // handled by BasicAA.
     if (isa<Constant>(LocA.Ptr) && isa<Constant>(LocB.Ptr)) {
-      return MayAlias;
+      return AliasAnalysis::alias(LocA, LocB);
     }
 
-    return query(LocA, LocB);
+    AliasResult QueryResult = query(LocA, LocB);
+    if (QueryResult == MayAlias)
+      return AliasAnalysis::alias(LocA, LocB);
+
+    return QueryResult;
   }
 
   void initializePass() override { InitializeAliasAnalysis(this); }
@@ -295,8 +302,11 @@ public:
   }
 
   void visitSelectInst(SelectInst &Inst) {
-    auto *Condition = Inst.getCondition();
-    Output.push_back(Edge(&Inst, Condition, EdgeType::Assign, AttrNone));
+    // Condition is not processed here (The actual statement producing
+    // the condition result is processed elsewhere). For select, the
+    // condition is evaluated, but not loaded, stored, or assigned
+    // simply as a result of being the condition of a select.
+
     auto *TrueVal = Inst.getTrueValue();
     Output.push_back(Edge(&Inst, TrueVal, EdgeType::Assign, AttrNone));
     auto *FalseVal = Inst.getFalseValue();
@@ -768,13 +778,16 @@ static Optional<StratifiedAttr> valueToAttrIndex(Value *Val) {
     return AttrGlobalIndex;
 
   if (auto *Arg = dyn_cast<Argument>(Val))
-    if (!Arg->hasNoAliasAttr())
+    // Only pointer arguments should have the argument attribute,
+    // because things can't escape through scalars without us seeing a
+    // cast, and thus, interaction with them doesn't matter.
+    if (!Arg->hasNoAliasAttr() && Arg->getType()->isPointerTy())
       return argNumberToAttrIndex(Arg->getArgNo());
   return NoneType();
 }
 
 static StratifiedAttr argNumberToAttrIndex(unsigned ArgNum) {
-  if (ArgNum > AttrMaxNumArgs)
+  if (ArgNum >= AttrMaxNumArgs)
     return AttrAllIndex;
   return ArgNum + AttrFirstArgIndex;
 }
@@ -964,8 +977,10 @@ CFLAliasAnalysis::query(const AliasAnalysis::Location &LocA,
   auto MaybeFnA = parentFunctionOfValue(ValA);
   auto MaybeFnB = parentFunctionOfValue(ValB);
   if (!MaybeFnA.hasValue() && !MaybeFnB.hasValue()) {
-    llvm_unreachable("Don't know how to extract the parent function "
-                     "from values A or B");
+    // The only times this is known to happen are when globals + InlineAsm
+    // are involved
+    DEBUG(dbgs() << "CFLAA: could not extract parent function information.\n");
+    return AliasAnalysis::MayAlias;
   }
 
   if (MaybeFnA.hasValue()) {
@@ -991,23 +1006,31 @@ CFLAliasAnalysis::query(const AliasAnalysis::Location &LocA,
 
   auto SetA = *MaybeA;
   auto SetB = *MaybeB;
-
-  if (SetA.Index == SetB.Index)
-    return AliasAnalysis::PartialAlias;
-
   auto AttrsA = Sets.getLink(SetA.Index).Attrs;
   auto AttrsB = Sets.getLink(SetB.Index).Attrs;
+
   // Stratified set attributes are used as markets to signify whether a member
-  // of a StratifiedSet (or a member of a set above the current set) has 
+  // of a StratifiedSet (or a member of a set above the current set) has
   // interacted with either arguments or globals. "Interacted with" meaning
-  // its value may be different depending on the value of an argument or 
+  // its value may be different depending on the value of an argument or
   // global. The thought behind this is that, because arguments and globals
   // may alias each other, if AttrsA and AttrsB have touched args/globals,
-  // we must conservatively say that they alias. However, if at least one of 
-  // the sets has no values that could legally be altered by changing the value 
+  // we must conservatively say that they alias. However, if at least one of
+  // the sets has no values that could legally be altered by changing the value
   // of an argument or global, then we don't have to be as conservative.
   if (AttrsA.any() && AttrsB.any())
     return AliasAnalysis::MayAlias;
 
+  // We currently unify things even if the accesses to them may not be in
+  // bounds, so we can't return partial alias here because we don't
+  // know whether the pointer is really within the object or not.
+  // IE Given an out of bounds GEP and an alloca'd pointer, we may
+  // unify the two. We can't return partial alias for this case.
+  // Since we do not currently track enough information to
+  // differentiate
+
+  if (SetA.Index == SetB.Index)
+    return AliasAnalysis::MayAlias;
+
   return AliasAnalysis::NoAlias;
 }
diff --git a/lib/Analysis/CGSCCPassManager.cpp b/lib/Analysis/CGSCCPassManager.cpp
index 5d1d8a9..4a03002 100644
--- a/lib/Analysis/CGSCCPassManager.cpp
+++ b/lib/Analysis/CGSCCPassManager.cpp
@@ -13,105 +13,10 @@
 
 using namespace llvm;
 
-static cl::opt<bool>
-DebugPM("debug-cgscc-pass-manager", cl::Hidden,
-        cl::desc("Print CGSCC pass management debugging information"));
-
-PreservedAnalyses CGSCCPassManager::run(LazyCallGraph::SCC *C,
-                                        CGSCCAnalysisManager *AM) {
-  PreservedAnalyses PA = PreservedAnalyses::all();
-
-  if (DebugPM)
-    dbgs() << "Starting CGSCC pass manager run.\n";
-
-  for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx) {
-    if (DebugPM)
-      dbgs() << "Running CGSCC pass: " << Passes[Idx]->name() << "\n";
-
-    PreservedAnalyses PassPA = Passes[Idx]->run(C, AM);
-    if (AM)
-      AM->invalidate(C, PassPA);
-    PA.intersect(std::move(PassPA));
-  }
-
-  if (DebugPM)
-    dbgs() << "Finished CGSCC pass manager run.\n";
-
-  return PA;
-}
-
-bool CGSCCAnalysisManager::empty() const {
-  assert(CGSCCAnalysisResults.empty() == CGSCCAnalysisResultLists.empty() &&
-         "The storage and index of analysis results disagree on how many there "
-         "are!");
-  return CGSCCAnalysisResults.empty();
-}
-
-void CGSCCAnalysisManager::clear() {
-  CGSCCAnalysisResults.clear();
-  CGSCCAnalysisResultLists.clear();
-}
-
-CGSCCAnalysisManager::ResultConceptT &
-CGSCCAnalysisManager::getResultImpl(void *PassID, LazyCallGraph::SCC *C) {
-  CGSCCAnalysisResultMapT::iterator RI;
-  bool Inserted;
-  std::tie(RI, Inserted) = CGSCCAnalysisResults.insert(std::make_pair(
-      std::make_pair(PassID, C), CGSCCAnalysisResultListT::iterator()));
-
-  // If we don't have a cached result for this function, look up the pass and
-  // run it to produce a result, which we then add to the cache.
-  if (Inserted) {
-    CGSCCAnalysisResultListT &ResultList = CGSCCAnalysisResultLists[C];
-    ResultList.emplace_back(PassID, lookupPass(PassID).run(C, this));
-    RI->second = std::prev(ResultList.end());
-  }
-
-  return *RI->second->second;
-}
-
-CGSCCAnalysisManager::ResultConceptT *
-CGSCCAnalysisManager::getCachedResultImpl(void *PassID,
-                                          LazyCallGraph::SCC *C) const {
-  CGSCCAnalysisResultMapT::const_iterator RI =
-      CGSCCAnalysisResults.find(std::make_pair(PassID, C));
-  return RI == CGSCCAnalysisResults.end() ? nullptr : &*RI->second->second;
-}
-
-void CGSCCAnalysisManager::invalidateImpl(void *PassID, LazyCallGraph::SCC *C) {
-  CGSCCAnalysisResultMapT::iterator RI =
-      CGSCCAnalysisResults.find(std::make_pair(PassID, C));
-  if (RI == CGSCCAnalysisResults.end())
-    return;
-
-  CGSCCAnalysisResultLists[C].erase(RI->second);
-}
-
-void CGSCCAnalysisManager::invalidateImpl(LazyCallGraph::SCC *C,
-                                          const PreservedAnalyses &PA) {
-  // Clear all the invalidated results associated specifically with this
-  // function.
-  SmallVector<void *, 8> InvalidatedPassIDs;
-  CGSCCAnalysisResultListT &ResultsList = CGSCCAnalysisResultLists[C];
-  for (CGSCCAnalysisResultListT::iterator I = ResultsList.begin(),
-                                          E = ResultsList.end();
-       I != E;)
-    if (I->second->invalidate(C, PA)) {
-      InvalidatedPassIDs.push_back(I->first);
-      I = ResultsList.erase(I);
-    } else {
-      ++I;
-    }
-  while (!InvalidatedPassIDs.empty())
-    CGSCCAnalysisResults.erase(
-        std::make_pair(InvalidatedPassIDs.pop_back_val(), C));
-  CGSCCAnalysisResultLists.erase(C);
-}
-
 char CGSCCAnalysisManagerModuleProxy::PassID;
 
 CGSCCAnalysisManagerModuleProxy::Result
-CGSCCAnalysisManagerModuleProxy::run(Module *M) {
+CGSCCAnalysisManagerModuleProxy::run(Module &M) {
   assert(CGAM->empty() && "CGSCC analyses ran prior to the module proxy!");
   return Result(*CGAM);
 }
@@ -123,7 +28,7 @@ CGSCCAnalysisManagerModuleProxy::Result::~Result() {
 }
 
 bool CGSCCAnalysisManagerModuleProxy::Result::invalidate(
-    Module *M, const PreservedAnalyses &PA) {
+    Module &M, const PreservedAnalyses &PA) {
   // If this proxy isn't marked as preserved, then we can't even invalidate
   // individual CGSCC analyses, there may be an invalid set of SCC objects in
   // the cache making it impossible to incrementally preserve them.
@@ -140,7 +45,7 @@ char ModuleAnalysisManagerCGSCCProxy::PassID;
 char FunctionAnalysisManagerCGSCCProxy::PassID;
 
 FunctionAnalysisManagerCGSCCProxy::Result
-FunctionAnalysisManagerCGSCCProxy::run(LazyCallGraph::SCC *C) {
+FunctionAnalysisManagerCGSCCProxy::run(LazyCallGraph::SCC &C) {
   assert(FAM->empty() && "Function analyses ran prior to the CGSCC proxy!");
   return Result(*FAM);
 }
@@ -152,7 +57,7 @@ FunctionAnalysisManagerCGSCCProxy::Result::~Result() {
 }
 
 bool FunctionAnalysisManagerCGSCCProxy::Result::invalidate(
-    LazyCallGraph::SCC *C, const PreservedAnalyses &PA) {
+    LazyCallGraph::SCC &C, const PreservedAnalyses &PA) {
   // If this proxy isn't marked as preserved, then we can't even invalidate
   // individual function analyses, there may be an invalid set of Function
   // objects in the cache making it impossible to incrementally preserve them.
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 4e9664f..d840037 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -5,7 +5,7 @@ add_llvm_library(LLVMAnalysis
   AliasDebugger.cpp
   AliasSetTracker.cpp
   Analysis.cpp
-  AssumptionTracker.cpp
+  AssumptionCache.cpp
   BasicAliasAnalysis.cpp
   BlockFrequencyInfo.cpp
   BlockFrequencyInfoImpl.cpp
@@ -22,7 +22,6 @@ add_llvm_library(LLVMAnalysis
   DependenceAnalysis.cpp
   DomPrinter.cpp
   DominanceFrontier.cpp
-  FunctionTargetTransformInfo.cpp
   IVUsers.cpp
   InstCount.cpp
   InstructionSimplify.cpp
@@ -35,9 +34,11 @@ add_llvm_library(LLVMAnalysis
   LibCallSemantics.cpp
   Lint.cpp
   Loads.cpp
+  LoopAccessAnalysis.cpp
   LoopInfo.cpp
   LoopPass.cpp
   MemDepPrinter.cpp
+  MemDerefPrinter.cpp
   MemoryBuiltins.cpp
   MemoryDependenceAnalysis.cpp
   ModuleDebugInfoPrinter.cpp
@@ -53,11 +54,15 @@ add_llvm_library(LLVMAnalysis
   ScalarEvolutionExpander.cpp
   ScalarEvolutionNormalization.cpp
   SparsePropagation.cpp
+  TargetLibraryInfo.cpp
   TargetTransformInfo.cpp
   Trace.cpp
   TypeBasedAliasAnalysis.cpp
   ScopedNoAliasAA.cpp
   ValueTracking.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Analysis
   )
 
 add_dependencies(LLVMAnalysis intrinsics_gen)
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index a271729..5a54754 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -19,8 +19,8 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
diff --git a/lib/Analysis/CodeMetrics.cpp b/lib/Analysis/CodeMetrics.cpp
index f29e4a2..fa5683c 100644
--- a/lib/Analysis/CodeMetrics.cpp
+++ b/lib/Analysis/CodeMetrics.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -66,11 +66,16 @@ static void completeEphemeralValues(SmallVector<const Value *, 16> &WorkSet,
 }
 
 // Find all ephemeral values.
-void CodeMetrics::collectEphemeralValues(const Loop *L, AssumptionTracker *AT,
-                                         SmallPtrSetImpl<const Value*> &EphValues) {
+void CodeMetrics::collectEphemeralValues(
+    const Loop *L, AssumptionCache *AC,
+    SmallPtrSetImpl<const Value *> &EphValues) {
   SmallVector<const Value *, 16> WorkSet;
 
-  for (auto &I : AT->assumptions(L->getHeader()->getParent())) {
+  for (auto &AssumeVH : AC->assumptions()) {
+    if (!AssumeVH)
+      continue;
+    Instruction *I = cast<Instruction>(AssumeVH);
+
     // Filter out call sites outside of the loop so we don't to a function's
     // worth of work for each of its loops (and, in the common case, ephemeral
     // values in the loop are likely due to @llvm.assume calls in the loop).
@@ -83,12 +88,19 @@ void CodeMetrics::collectEphemeralValues(const Loop *L, AssumptionTracker *AT,
   completeEphemeralValues(WorkSet, EphValues);
 }
 
-void CodeMetrics::collectEphemeralValues(const Function *F, AssumptionTracker *AT,
-                                         SmallPtrSetImpl<const Value*> &EphValues) {
+void CodeMetrics::collectEphemeralValues(
+    const Function *F, AssumptionCache *AC,
+    SmallPtrSetImpl<const Value *> &EphValues) {
   SmallVector<const Value *, 16> WorkSet;
 
-  for (auto &I : AT->assumptions(const_cast<Function*>(F)))
+  for (auto &AssumeVH : AC->assumptions()) {
+    if (!AssumeVH)
+      continue;
+    Instruction *I = cast<Instruction>(AssumeVH);
+    assert(I->getParent()->getParent() == F &&
+           "Found assumption for the wrong function!");
     WorkSet.push_back(I);
+  }
 
   completeEphemeralValues(WorkSet, EphValues);
 }
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index fd8f2ae..fcafb41 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Config/config.h"
 #include "llvm/IR/Constants.h"
@@ -33,7 +34,6 @@
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include <cerrno>
 #include <cmath>
 
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 1b74f8c..b529c1a 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -83,7 +83,8 @@ CostModelAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
 bool
 CostModelAnalysis::runOnFunction(Function &F) {
  this->F = &F;
- TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+ auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+ TTI = TTIWP ? &TTIWP->getTTI(F) : nullptr;
 
  return false;
 }
diff --git a/lib/Analysis/Delinearization.cpp b/lib/Analysis/Delinearization.cpp
index 9334ceb..d603b7b 100644
--- a/lib/Analysis/Delinearization.cpp
+++ b/lib/Analysis/Delinearization.cpp
@@ -59,14 +59,14 @@ public:
 
 void Delinearization::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequired<LoopInfo>();
+  AU.addRequired<LoopInfoWrapperPass>();
   AU.addRequired<ScalarEvolution>();
 }
 
 bool Delinearization::runOnFunction(Function &F) {
   this->F = &F;
   SE = &getAnalysis<ScalarEvolution>();
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   return false;
 }
 
@@ -141,7 +141,7 @@ char Delinearization::ID = 0;
 static const char delinearization_name[] = "Delinearization";
 INITIALIZE_PASS_BEGIN(Delinearization, DL_NAME, delinearization_name, true,
                       true)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(Delinearization, DL_NAME, delinearization_name, true, true)
 
 FunctionPass *llvm::createDelinearizationPass() { return new Delinearization; }
diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp
index 092df5c..fda664b 100644
--- a/lib/Analysis/DependenceAnalysis.cpp
+++ b/lib/Analysis/DependenceAnalysis.cpp
@@ -114,7 +114,7 @@ Delinearize("da-delinearize", cl::init(false), cl::Hidden, cl::ZeroOrMore,
 
 INITIALIZE_PASS_BEGIN(DependenceAnalysis, "da",
                       "Dependence Analysis", true, true)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(DependenceAnalysis, "da",
@@ -132,7 +132,7 @@ bool DependenceAnalysis::runOnFunction(Function &F) {
   this->F = &F;
   AA = &getAnalysis<AliasAnalysis>();
   SE = &getAnalysis<ScalarEvolution>();
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   return false;
 }
 
@@ -145,7 +145,7 @@ void DependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   AU.addRequiredTransitive<AliasAnalysis>();
   AU.addRequiredTransitive<ScalarEvolution>();
-  AU.addRequiredTransitive<LoopInfo>();
+  AU.addRequiredTransitive<LoopInfoWrapperPass>();
 }
 
 
diff --git a/lib/Analysis/FunctionTargetTransformInfo.cpp b/lib/Analysis/FunctionTargetTransformInfo.cpp
deleted file mode 100644
index a686bec..0000000
--- a/lib/Analysis/FunctionTargetTransformInfo.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//===- llvm/Analysis/FunctionTargetTransformInfo.h --------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass wraps a TargetTransformInfo in a FunctionPass so that it can
-// forward along the current Function so that we can make target specific
-// decisions based on the particular subtarget specified for each Function.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/InitializePasses.h"
-#include "llvm/Analysis/FunctionTargetTransformInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "function-tti"
-static const char ftti_name[] = "Function TargetTransformInfo";
-INITIALIZE_PASS_BEGIN(FunctionTargetTransformInfo, "function_tti", ftti_name, false, true)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_END(FunctionTargetTransformInfo, "function_tti", ftti_name, false, true)
-char FunctionTargetTransformInfo::ID = 0;
-
-namespace llvm {
-FunctionPass *createFunctionTargetTransformInfoPass() {
-  return new FunctionTargetTransformInfo();
-}
-}
-
-FunctionTargetTransformInfo::FunctionTargetTransformInfo()
-  : FunctionPass(ID), Fn(nullptr), TTI(nullptr) {
-  initializeFunctionTargetTransformInfoPass(*PassRegistry::getPassRegistry());
-}
-
-void FunctionTargetTransformInfo::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.setPreservesAll();
-  AU.addRequired<TargetTransformInfo>();
-}
-
-void FunctionTargetTransformInfo::releaseMemory() {}
-
-bool FunctionTargetTransformInfo::runOnFunction(Function &F) {
-  Fn = &F;
-  TTI = &getAnalysis<TargetTransformInfo>();
-  return false;
-}
diff --git a/lib/Analysis/IPA/Android.mk b/lib/Analysis/IPA/Android.mk
index d56d931..2e5e571 100644
--- a/lib/Analysis/IPA/Android.mk
+++ b/lib/Analysis/IPA/Android.mk
@@ -4,7 +4,6 @@ analysis_ipa_SRC_FILES := \
   CallGraph.cpp \
   CallGraphSCCPass.cpp \
   CallPrinter.cpp \
-  FindUsedTypes.cpp \
   GlobalsModRef.cpp \
   IPA.cpp \
   InlineCost.cpp
diff --git a/lib/Analysis/IPA/CMakeLists.txt b/lib/Analysis/IPA/CMakeLists.txt
index 67b4135..6095136 100644
--- a/lib/Analysis/IPA/CMakeLists.txt
+++ b/lib/Analysis/IPA/CMakeLists.txt
@@ -2,7 +2,6 @@ add_llvm_library(LLVMipa
   CallGraph.cpp
   CallGraphSCCPass.cpp
   CallPrinter.cpp
-  FindUsedTypes.cpp
   GlobalsModRef.cpp
   IPA.cpp
   InlineCost.cpp
diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
index 665aa7f..ded1de7 100644
--- a/lib/Analysis/IPA/CallGraphSCCPass.cpp
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -21,8 +21,8 @@
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LegacyPassManagers.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManagers.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Timer.h"
diff --git a/lib/Analysis/IPA/FindUsedTypes.cpp b/lib/Analysis/IPA/FindUsedTypes.cpp
deleted file mode 100644
index b37344b..0000000
--- a/lib/Analysis/IPA/FindUsedTypes.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-//===- FindUsedTypes.cpp - Find all Types used by a module ----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass is used to seek out all of the types in use by the program.  Note
-// that this analysis explicitly does not include types only used by the symbol
-// table.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/FindUsedTypes.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-char FindUsedTypes::ID = 0;
-INITIALIZE_PASS(FindUsedTypes, "print-used-types",
-                "Find Used Types", false, true)
-
-// IncorporateType - Incorporate one type and all of its subtypes into the
-// collection of used types.
-//
-void FindUsedTypes::IncorporateType(Type *Ty) {
-  // If ty doesn't already exist in the used types map, add it now, otherwise
-  // return.
-  if (!UsedTypes.insert(Ty)) return;  // Already contain Ty.
-
-  // Make sure to add any types this type references now.
-  //
-  for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
-       I != E; ++I)
-    IncorporateType(*I);
-}
-
-void FindUsedTypes::IncorporateValue(const Value *V) {
-  IncorporateType(V->getType());
-
-  // If this is a constant, it could be using other types...
-  if (const Constant *C = dyn_cast<Constant>(V)) {
-    if (!isa<GlobalValue>(C))
-      for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end();
-           OI != OE; ++OI)
-        IncorporateValue(*OI);
-  }
-}
-
-
-// run - This incorporates all types used by the specified module
-//
-bool FindUsedTypes::runOnModule(Module &m) {
-  UsedTypes.clear();  // reset if run multiple times...
-
-  // Loop over global variables, incorporating their types
-  for (Module::const_global_iterator I = m.global_begin(), E = m.global_end();
-       I != E; ++I) {
-    IncorporateType(I->getType());
-    if (I->hasInitializer())
-      IncorporateValue(I->getInitializer());
-  }
-
-  for (Module::iterator MI = m.begin(), ME = m.end(); MI != ME; ++MI) {
-    IncorporateType(MI->getType());
-    const Function &F = *MI;
-
-    // Loop over all of the instructions in the function, adding their return
-    // type as well as the types of their operands.
-    //
-    for (const_inst_iterator II = inst_begin(F), IE = inst_end(F);
-         II != IE; ++II) {
-      const Instruction &I = *II;
-
-      IncorporateType(I.getType());  // Incorporate the type of the instruction
-      for (User::const_op_iterator OI = I.op_begin(), OE = I.op_end();
-           OI != OE; ++OI)
-        IncorporateValue(*OI);  // Insert inst operand types as well
-    }
-  }
-
-  return false;
-}
-
-// Print the types found in the module.  If the optional Module parameter is
-// passed in, then the types are printed symbolically if possible, using the
-// symbol table from the module.
-//
-void FindUsedTypes::print(raw_ostream &OS, const Module *M) const {
-  OS << "Types in use by this module:\n";
-  for (SetVector<Type *>::const_iterator I = UsedTypes.begin(),
-       E = UsedTypes.end(); I != E; ++I) {
-    OS << "   " << **I << '\n';
-  }
-}
diff --git a/lib/Analysis/IPA/IPA.cpp b/lib/Analysis/IPA/IPA.cpp
index b26c052..806bfb8 100644
--- a/lib/Analysis/IPA/IPA.cpp
+++ b/lib/Analysis/IPA/IPA.cpp
@@ -22,7 +22,6 @@ void llvm::initializeIPA(PassRegistry &Registry) {
   initializeCallGraphWrapperPassPass(Registry);
   initializeCallGraphPrinterPass(Registry);
   initializeCallGraphViewerPass(Registry);
-  initializeFindUsedTypesPass(Registry);
   initializeGlobalsModRefPass(Registry);
 }
 
diff --git a/lib/Analysis/IPA/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp
index 85db278..cd494ba 100644
--- a/lib/Analysis/IPA/InlineCost.cpp
+++ b/lib/Analysis/IPA/InlineCost.cpp
@@ -17,9 +17,9 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
-#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CallSite.h"
@@ -52,7 +52,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   const TargetTransformInfo &TTI;
 
   /// The cache of @llvm.assume intrinsics.
-  AssumptionTracker *AT;
+  AssumptionCacheTracker *ACT;
 
   // The called function.
   Function &F;
@@ -146,8 +146,8 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
 
 public:
   CallAnalyzer(const DataLayout *DL, const TargetTransformInfo &TTI,
-               AssumptionTracker *AT, Function &Callee, int Threshold)
-      : DL(DL), TTI(TTI), AT(AT), F(Callee), Threshold(Threshold), Cost(0),
+               AssumptionCacheTracker *ACT, Function &Callee, int Threshold)
+      : DL(DL), TTI(TTI), ACT(ACT), F(Callee), Threshold(Threshold), Cost(0),
         IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
@@ -601,7 +601,13 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
   if (!isa<Constant>(RHS))
     if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
       RHS = SimpleRHS;
-  Value *SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+  Value *SimpleV = nullptr;
+  if (auto FI = dyn_cast<FPMathOperator>(&I))
+    SimpleV =
+        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
+  else
+    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+
   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
     SimplifiedValues[&I] = C;
     return true;
@@ -713,8 +719,7 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
 
 bool CallAnalyzer::visitCallSite(CallSite CS) {
   if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
-      !F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                      Attribute::ReturnsTwice)) {
+      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
     // This aborts the entire analysis.
     ExposesReturnsTwice = true;
     return false;
@@ -783,7 +788,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
   // during devirtualization and so we want to give it a hefty bonus for
   // inlining, but cap that bonus in the event that inlining wouldn't pan
   // out. Pretend to inline the function, with a custom threshold.
-  CallAnalyzer CA(DL, TTI, AT, *F, InlineConstants::IndirectCallThreshold);
+  CallAnalyzer CA(DL, TTI, ACT, *F, InlineConstants::IndirectCallThreshold);
   if (CA.analyzeCall(CS)) {
     // We were able to inline the indirect call! Subtract the cost from the
     // bonus we want to apply, but don't go below zero.
@@ -907,6 +912,25 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
       ++NumVectorInstructions;
 
+    // If the instruction is floating point, and the target says this operation is
+    // expensive or the function has the "use-soft-float" attribute, this may
+    // eventually become a library call.  Treat the cost as such.
+    if (I->getType()->isFloatingPointTy()) {
+      bool hasSoftFloatAttr = false;
+
+      // If the function has the "use-soft-float" attribute, mark it as expensive.
+      if (F.hasFnAttribute("use-soft-float")) {
+        Attribute Attr = F.getFnAttribute("use-soft-float");
+        StringRef Val = Attr.getValueAsString();
+        if (Val == "true")
+          hasSoftFloatAttr = true;
+      }
+
+      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
+          hasSoftFloatAttr)
+        Cost += InlineConstants::CallPenalty;
+    }
+
     // If the instruction simplified to a constant, there is no cost to this
     // instruction. Visit the instructions using our InstVisitor to account for
     // all of the per-instruction logic. The visit tree returns true if we
@@ -1110,7 +1134,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
   // the ephemeral values multiple times (and they're completely determined by
   // the callee, so this is purely duplicate work).
   SmallPtrSet<const Value *, 32> EphValues;
-  CodeMetrics::collectEphemeralValues(&F, AT, EphValues);
+  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F), EphValues);
 
   // The worklist of live basic blocks in the callee *after* inlining. We avoid
   // adding basic blocks of the callee which can be proven to be dead for this
@@ -1232,8 +1256,8 @@ void CallAnalyzer::dump() {
 
 INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                       true, true)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                     true, true)
 
@@ -1245,14 +1269,14 @@ InlineCostAnalysis::~InlineCostAnalysis() {}
 
 void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequired<AssumptionTracker>();
-  AU.addRequired<TargetTransformInfo>();
+  AU.addRequired<AssumptionCacheTracker>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
   CallGraphSCCPass::getAnalysisUsage(AU);
 }
 
 bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) {
-  TTI = &getAnalysis<TargetTransformInfo>();
-  AT = &getAnalysis<AssumptionTracker>();
+  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+  ACT = &getAnalysis<AssumptionCacheTracker>();
   return false;
 }
 
@@ -1309,7 +1333,8 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
         << "...\n");
 
-  CallAnalyzer CA(Callee->getDataLayout(), *TTI, AT, *Callee, Threshold);
+  CallAnalyzer CA(Callee->getDataLayout(), TTIWP->getTTI(*Callee),
+                  ACT, *Callee, Threshold);
   bool ShouldInline = CA.analyzeCall(CS);
 
   DEBUG(CA.dump());
@@ -1324,9 +1349,7 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
 }
 
 bool InlineCostAnalysis::isInlineViable(Function &F) {
-  bool ReturnsTwice =
-    F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                   Attribute::ReturnsTwice);
+  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
     // Disallow inlining of functions which contain indirect branches or
     // blockaddresses.
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index 6b5f370..140753c 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -33,7 +33,7 @@ using namespace llvm;
 char IVUsers::ID = 0;
 INITIALIZE_PASS_BEGIN(IVUsers, "iv-users",
                       "Induction Variable Users", false, true)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(IVUsers, "iv-users",
@@ -241,7 +241,7 @@ IVUsers::IVUsers()
 }
 
 void IVUsers::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<LoopInfo>();
+  AU.addRequired<LoopInfoWrapperPass>();
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addRequired<ScalarEvolution>();
   AU.setPreservesAll();
@@ -250,7 +250,7 @@ void IVUsers::getAnalysisUsage(AnalysisUsage &AU) const {
 bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
 
   L = l;
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   SE = &getAnalysis<ScalarEvolution>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index f151a3a..0cb0982 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -31,6 +32,7 @@
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
+#include <algorithm>
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
@@ -46,19 +48,21 @@ struct Query {
   const DataLayout *DL;
   const TargetLibraryInfo *TLI;
   const DominatorTree *DT;
-  AssumptionTracker *AT;
+  AssumptionCache *AC;
   const Instruction *CxtI;
 
   Query(const DataLayout *DL, const TargetLibraryInfo *tli,
-        const DominatorTree *dt, AssumptionTracker *at = nullptr,
+        const DominatorTree *dt, AssumptionCache *ac = nullptr,
         const Instruction *cxti = nullptr)
-    : DL(DL), TLI(tli), DT(dt), AT(at), CxtI(cxti) {}
+      : DL(DL), TLI(tli), DT(dt), AC(ac), CxtI(cxti) {}
 };
 } // end anonymous namespace
 
 static Value *SimplifyAndInst(Value *, Value *, const Query &, unsigned);
 static Value *SimplifyBinOp(unsigned, Value *, Value *, const Query &,
                             unsigned);
+static Value *SimplifyFPBinOp(unsigned, Value *, Value *, const FastMathFlags &,
+                              const Query &, unsigned);
 static Value *SimplifyCmpInst(unsigned, Value *, Value *, const Query &,
                               unsigned);
 static Value *SimplifyOrInst(Value *, Value *, const Query &, unsigned);
@@ -581,10 +585,10 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
 
 Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const DominatorTree *DT, AssumptionCache *AC,
                              const Instruction *CxtI) {
-  return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW,
-                           Query (DL, TLI, DT, AT, CxtI), RecursionLimit);
+  return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, Query(DL, TLI, DT, AC, CxtI),
+                           RecursionLimit);
 }
 
 /// \brief Compute the base pointer and cumulative constant offsets for V.
@@ -683,17 +687,9 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
   if (Op0 == Op1)
     return Constant::getNullValue(Op0->getType());
 
-  // X - (0 - Y) -> X if the second sub is NUW.
-  // If Y != 0, 0 - Y is a poison value.
-  // If Y == 0, 0 - Y simplifies to 0.
-  if (BinaryOperator::isNeg(Op1)) {
-    if (const auto *BO = dyn_cast<BinaryOperator>(Op1)) {
-      assert(BO->getOpcode() == Instruction::Sub &&
-             "Expected a subtraction operator!");
-      if (BO->hasNoUnsignedWrap())
-        return Op0;
-    }
-  }
+  // 0 - X -> 0 if the sub is NUW.
+  if (isNUW && match(Op0, m_Zero()))
+    return Op0;
 
   // (X + Y) - Z -> X + (Y - Z) or Y + (X - Z) if everything simplifies.
   // For example, (X + Y) - Y -> X; (Y + X) - Y -> X
@@ -788,10 +784,10 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
 
 Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const DominatorTree *DT, AssumptionCache *AC,
                              const Instruction *CxtI) {
-  return ::SimplifySubInst(Op0, Op1, isNSW, isNUW,
-                           Query (DL, TLI, DT, AT, CxtI), RecursionLimit);
+  return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Query(DL, TLI, DT, AC, CxtI),
+                           RecursionLimit);
 }
 
 /// Given operands for an FAdd, see if we can fold the result.  If not, this
@@ -966,37 +962,37 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q,
 }
 
 Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
-                             const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT, AssumptionTracker *AT,
-                             const Instruction *CxtI) {
-  return ::SimplifyFAddInst(Op0, Op1, FMF, Query (DL, TLI, DT, AT, CxtI),
+                              const DataLayout *DL,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT, AssumptionCache *AC,
+                              const Instruction *CxtI) {
+  return ::SimplifyFAddInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
 Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
-                             const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT, AssumptionTracker *AT,
-                             const Instruction *CxtI) {
-  return ::SimplifyFSubInst(Op0, Op1, FMF, Query (DL, TLI, DT, AT, CxtI),
+                              const DataLayout *DL,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT, AssumptionCache *AC,
+                              const Instruction *CxtI) {
+  return ::SimplifyFSubInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
-Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1,
-                              FastMathFlags FMF,
+Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF,
                               const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT,
-                              AssumptionTracker *AT,
+                              const DominatorTree *DT, AssumptionCache *AC,
                               const Instruction *CxtI) {
-  return ::SimplifyFMulInst(Op0, Op1, FMF, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyFMulInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
 Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const DataLayout *DL,
                              const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const DominatorTree *DT, AssumptionCache *AC,
                              const Instruction *CxtI) {
-  return ::SimplifyMulInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyMulInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
                            RecursionLimit);
 }
 
@@ -1017,6 +1013,10 @@ static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
   if (match(Op1, m_Undef()))
     return Op1;
 
+  // X / 0 -> undef, we don't need to preserve faults!
+  if (match(Op1, m_Zero()))
+    return UndefValue::get(Op1->getType());
+
   // undef / X -> 0
   if (match(Op0, m_Undef()))
     return Constant::getNullValue(Op0->getType());
@@ -1094,10 +1094,9 @@ static Value *SimplifySDivInst(Value *Op0, Value *Op1, const Query &Q,
 
 Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT,
-                              AssumptionTracker *AT,
+                              const DominatorTree *DT, AssumptionCache *AC,
                               const Instruction *CxtI) {
-  return ::SimplifySDivInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifySDivInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
@@ -1113,15 +1112,14 @@ static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q,
 
 Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT,
-                              AssumptionTracker *AT,
+                              const DominatorTree *DT, AssumptionCache *AC,
                               const Instruction *CxtI) {
-  return ::SimplifyUDivInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyUDivInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
-static Value *SimplifyFDivInst(Value *Op0, Value *Op1, const Query &Q,
-                               unsigned) {
+static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
+                               const Query &Q, unsigned) {
   // undef / X -> undef    (the undef could be a snan).
   if (match(Op0, m_Undef()))
     return Op0;
@@ -1130,15 +1128,21 @@ static Value *SimplifyFDivInst(Value *Op0, Value *Op1, const Query &Q,
   if (match(Op1, m_Undef()))
     return Op1;
 
+  // 0 / X -> 0
+  // Requires that NaNs are off (X could be zero) and signed zeroes are
+  // ignored (X could be positive or negative, so the output sign is unknown).
+  if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op0, m_AnyZero()))
+    return Op0;
+
   return nullptr;
 }
 
-Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, const DataLayout *DL,
+Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
+                              const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT,
-                              AssumptionTracker *AT,
+                              const DominatorTree *DT, AssumptionCache *AC,
                               const Instruction *CxtI) {
-  return ::SimplifyFDivInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyFDivInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
@@ -1215,10 +1219,9 @@ static Value *SimplifySRemInst(Value *Op0, Value *Op1, const Query &Q,
 
 Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT,
-                              AssumptionTracker *AT,
+                              const DominatorTree *DT, AssumptionCache *AC,
                               const Instruction *CxtI) {
-  return ::SimplifySRemInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifySRemInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
@@ -1234,15 +1237,14 @@ static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q,
 
 Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT,
-                              AssumptionTracker *AT,
+                              const DominatorTree *DT, AssumptionCache *AC,
                               const Instruction *CxtI) {
-  return ::SimplifyURemInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyURemInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
-static Value *SimplifyFRemInst(Value *Op0, Value *Op1, const Query &,
-                               unsigned) {
+static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF,
+                               const Query &, unsigned) {
   // undef % X -> undef    (the undef could be a snan).
   if (match(Op0, m_Undef()))
     return Op0;
@@ -1251,15 +1253,21 @@ static Value *SimplifyFRemInst(Value *Op0, Value *Op1, const Query &,
   if (match(Op1, m_Undef()))
     return Op1;
 
+  // 0 % X -> 0
+  // Requires that NaNs are off (X could be zero) and signed zeroes are
+  // ignored (X could be positive or negative, so the output sign is unknown).
+  if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op0, m_AnyZero()))
+    return Op0;
+
   return nullptr;
 }
 
-Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, const DataLayout *DL,
+Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF,
+                              const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT,
-                              AssumptionTracker *AT,
+                              const DominatorTree *DT, AssumptionCache *AC,
                               const Instruction *CxtI) {
-  return ::SimplifyFRemInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyFRemInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
@@ -1340,13 +1348,18 @@ static Value *SimplifyRightShift(unsigned Opcode, Value *Op0, Value *Op1,
   if (Op0 == Op1)
     return Constant::getNullValue(Op0->getType());
 
+  // undef >> X -> 0
+  // undef >> X -> undef (if it's exact)
+  if (match(Op0, m_Undef()))
+    return isExact ? Op0 : Constant::getNullValue(Op0->getType());
+
   // The low bit cannot be shifted out of an exact shift if it is set.
   if (isExact) {
     unsigned BitWidth = Op0->getType()->getScalarSizeInBits();
     APInt Op0KnownZero(BitWidth, 0);
     APInt Op0KnownOne(BitWidth, 0);
-    computeKnownBits(Op0, Op0KnownZero, Op0KnownOne, Q.DL, /*Depth=*/0, Q.AT, Q.CxtI,
-                     Q.DT);
+    computeKnownBits(Op0, Op0KnownZero, Op0KnownOne, Q.DL, /*Depth=*/0, Q.AC,
+                     Q.CxtI, Q.DT);
     if (Op0KnownOne[0])
       return Op0;
   }
@@ -1362,8 +1375,9 @@ static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
     return V;
 
   // undef << X -> 0
+  // undef << X -> undef if (if it's NSW/NUW)
   if (match(Op0, m_Undef()))
-    return Constant::getNullValue(Op0->getType());
+    return isNSW || isNUW ? Op0 : Constant::getNullValue(Op0->getType());
 
   // (X >> A) << A -> X
   Value *X;
@@ -1374,9 +1388,9 @@ static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
 
 Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const DominatorTree *DT, AssumptionCache *AC,
                              const Instruction *CxtI) {
-  return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Query(DL, TLI, DT, AC, CxtI),
                            RecursionLimit);
 }
 
@@ -1388,10 +1402,6 @@ static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
                                     MaxRecurse))
       return V;
 
-  // undef >>l X -> 0
-  if (match(Op0, m_Undef()))
-    return Constant::getNullValue(Op0->getType());
-
   // (X << A) >> A -> X
   Value *X;
   if (match(Op0, m_NUWShl(m_Value(X), m_Specific(Op1))))
@@ -1403,10 +1413,9 @@ static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
 Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
                               const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT,
-                              AssumptionTracker *AT,
+                              const DominatorTree *DT, AssumptionCache *AC,
                               const Instruction *CxtI) {
-  return ::SimplifyLShrInst(Op0, Op1, isExact, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyLShrInst(Op0, Op1, isExact, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
@@ -1422,17 +1431,13 @@ static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
   if (match(Op0, m_AllOnes()))
     return Op0;
 
-  // undef >>a X -> all ones
-  if (match(Op0, m_Undef()))
-    return Constant::getAllOnesValue(Op0->getType());
-
   // (X << A) >> A -> X
   Value *X;
   if (match(Op0, m_NSWShl(m_Value(X), m_Specific(Op1))))
     return X;
 
   // Arithmetic shifting an all-sign-bit value is a no-op.
-  unsigned NumSignBits = ComputeNumSignBits(Op0, Q.DL, 0, Q.AT, Q.CxtI, Q.DT);
+  unsigned NumSignBits = ComputeNumSignBits(Op0, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
   if (NumSignBits == Op0->getType()->getScalarSizeInBits())
     return Op0;
 
@@ -1442,19 +1447,63 @@ static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
 Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
                               const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT,
-                              AssumptionTracker *AT,
+                              const DominatorTree *DT, AssumptionCache *AC,
                               const Instruction *CxtI) {
-  return ::SimplifyAShrInst(Op0, Op1, isExact, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyAShrInst(Op0, Op1, isExact, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
+static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp,
+                                         ICmpInst *UnsignedICmp, bool IsAnd) {
+  Value *X, *Y;
+
+  ICmpInst::Predicate EqPred;
+  if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(Y), m_Zero())) ||
+      !ICmpInst::isEquality(EqPred))
+    return nullptr;
+
+  ICmpInst::Predicate UnsignedPred;
+  if (match(UnsignedICmp, m_ICmp(UnsignedPred, m_Value(X), m_Specific(Y))) &&
+      ICmpInst::isUnsigned(UnsignedPred))
+    ;
+  else if (match(UnsignedICmp,
+                 m_ICmp(UnsignedPred, m_Value(Y), m_Specific(X))) &&
+           ICmpInst::isUnsigned(UnsignedPred))
+    UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred);
+  else
+    return nullptr;
+
+  // X < Y && Y != 0  -->  X < Y
+  // X < Y || Y != 0  -->  Y != 0
+  if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE)
+    return IsAnd ? UnsignedICmp : ZeroICmp;
+
+  // X >= Y || Y != 0  -->  true
+  // X >= Y || Y == 0  -->  X >= Y
+  if (UnsignedPred == ICmpInst::ICMP_UGE && !IsAnd) {
+    if (EqPred == ICmpInst::ICMP_NE)
+      return getTrue(UnsignedICmp->getType());
+    return UnsignedICmp;
+  }
+
+  // X < Y && Y == 0  -->  false
+  if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_EQ &&
+      IsAnd)
+    return getFalse(UnsignedICmp->getType());
+
+  return nullptr;
+}
+
 // Simplify (and (icmp ...) (icmp ...)) to true when we can tell that the range
 // of possible values cannot be satisfied.
 static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
   ICmpInst::Predicate Pred0, Pred1;
   ConstantInt *CI1, *CI2;
   Value *V;
+
+  if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true))
+    return X;
+
   if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_ConstantInt(CI1)),
                          m_ConstantInt(CI2))))
    return nullptr;
@@ -1547,9 +1596,9 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
   // A & (-A) = A if A is a power of two or zero.
   if (match(Op0, m_Neg(m_Specific(Op1))) ||
       match(Op1, m_Neg(m_Specific(Op0)))) {
-    if (isKnownToBeAPowerOfTwo(Op0, /*OrZero*/true, 0, Q.AT, Q.CxtI, Q.DT))
+    if (isKnownToBeAPowerOfTwo(Op0, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT))
       return Op0;
-    if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true, 0, Q.AT, Q.CxtI, Q.DT))
+    if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT))
       return Op1;
   }
 
@@ -1596,9 +1645,9 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
 
 Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const DataLayout *DL,
                              const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const DominatorTree *DT, AssumptionCache *AC,
                              const Instruction *CxtI) {
-  return ::SimplifyAndInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyAndInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
                            RecursionLimit);
 }
 
@@ -1608,6 +1657,10 @@ static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
   ICmpInst::Predicate Pred0, Pred1;
   ConstantInt *CI1, *CI2;
   Value *V;
+
+  if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false))
+    return X;
+
   if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_ConstantInt(CI1)),
                          m_ConstantInt(CI2))))
    return nullptr;
@@ -1748,22 +1801,22 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
       if ((C2->getValue() & (C2->getValue() + 1)) == 0 && // C2 == 0+1+
           match(A, m_Add(m_Value(V1), m_Value(V2)))) {
         // Add commutes, try both ways.
-        if (V1 == B && MaskedValueIsZero(V2, C2->getValue(), Q.DL,
-                                         0, Q.AT, Q.CxtI, Q.DT))
+        if (V1 == B &&
+            MaskedValueIsZero(V2, C2->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
           return A;
-        if (V2 == B && MaskedValueIsZero(V1, C2->getValue(), Q.DL,
-                                         0, Q.AT, Q.CxtI, Q.DT))
+        if (V2 == B &&
+            MaskedValueIsZero(V1, C2->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
           return A;
       }
       // Or commutes, try both ways.
       if ((C1->getValue() & (C1->getValue() + 1)) == 0 &&
           match(B, m_Add(m_Value(V1), m_Value(V2)))) {
         // Add commutes, try both ways.
-        if (V1 == A && MaskedValueIsZero(V2, C1->getValue(), Q.DL,
-                                         0, Q.AT, Q.CxtI, Q.DT))
+        if (V1 == A &&
+            MaskedValueIsZero(V2, C1->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
           return B;
-        if (V2 == A && MaskedValueIsZero(V1, C1->getValue(), Q.DL,
-                                         0, Q.AT, Q.CxtI, Q.DT))
+        if (V2 == A &&
+            MaskedValueIsZero(V1, C1->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
           return B;
       }
     }
@@ -1780,9 +1833,9 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
 
 Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout *DL,
                             const TargetLibraryInfo *TLI,
-                            const DominatorTree *DT, AssumptionTracker *AT,
+                            const DominatorTree *DT, AssumptionCache *AC,
                             const Instruction *CxtI) {
-  return ::SimplifyOrInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyOrInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
                           RecursionLimit);
 }
 
@@ -1837,9 +1890,9 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q,
 
 Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const DataLayout *DL,
                              const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const DominatorTree *DT, AssumptionCache *AC,
                              const Instruction *CxtI) {
-  return ::SimplifyXorInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyXorInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
                            RecursionLimit);
 }
 
@@ -2015,6 +2068,50 @@ static Constant *computePointerICmp(const DataLayout *DL,
       return ConstantExpr::getICmp(Pred,
                                    ConstantExpr::getAdd(LHSOffset, LHSNoBound),
                                    ConstantExpr::getAdd(RHSOffset, RHSNoBound));
+
+    // If one side of the equality comparison must come from a noalias call
+    // (meaning a system memory allocation function), and the other side must
+    // come from a pointer that cannot overlap with dynamically-allocated
+    // memory within the lifetime of the current function (allocas, byval
+    // arguments, globals), then determine the comparison result here.
+    SmallVector<Value *, 8> LHSUObjs, RHSUObjs;
+    GetUnderlyingObjects(LHS, LHSUObjs, DL);
+    GetUnderlyingObjects(RHS, RHSUObjs, DL);
+
+    // Is the set of underlying objects all noalias calls?
+    auto IsNAC = [](SmallVectorImpl<Value *> &Objects) {
+      return std::all_of(Objects.begin(), Objects.end(),
+                         [](Value *V){ return isNoAliasCall(V); });
+    };
+
+    // Is the set of underlying objects all things which must be disjoint from
+    // noalias calls. For allocas, we consider only static ones (dynamic
+    // allocas might be transformed into calls to malloc not simultaneously
+    // live with the compared-to allocation). For globals, we exclude symbols
+    // that might be resolve lazily to symbols in another dynamically-loaded
+    // library (and, thus, could be malloc'ed by the implementation).
+    auto IsAllocDisjoint = [](SmallVectorImpl<Value *> &Objects) {
+      return std::all_of(Objects.begin(), Objects.end(),
+                         [](Value *V){
+                           if (const AllocaInst *AI = dyn_cast<AllocaInst>(V))
+                             return AI->getParent() && AI->getParent()->getParent() &&
+                                    AI->isStaticAlloca();
+                           if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
+                             return (GV->hasLocalLinkage() ||
+                                     GV->hasHiddenVisibility() ||
+                                     GV->hasProtectedVisibility() ||
+                                     GV->hasUnnamedAddr()) &&
+                                    !GV->isThreadLocal();
+                           if (const Argument *A = dyn_cast<Argument>(V))
+                             return A->hasByValAttr();
+                           return false;
+                         });
+    };
+
+    if ((IsNAC(LHSUObjs) && IsAllocDisjoint(RHSUObjs)) ||
+        (IsNAC(RHSUObjs) && IsAllocDisjoint(LHSUObjs)))
+        return ConstantInt::get(GetCompareTy(LHS),
+                                !CmpInst::isTrueWhenEqual(Pred));
   }
 
   // Otherwise, fail.
@@ -2094,46 +2191,46 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       return getTrue(ITy);
     case ICmpInst::ICMP_EQ:
     case ICmpInst::ICMP_ULE:
-      if (isKnownNonZero(LHS, Q.DL, 0, Q.AT, Q.CxtI, Q.DT))
+      if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
         return getFalse(ITy);
       break;
     case ICmpInst::ICMP_NE:
     case ICmpInst::ICMP_UGT:
-      if (isKnownNonZero(LHS, Q.DL, 0, Q.AT, Q.CxtI, Q.DT))
+      if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
         return getTrue(ITy);
       break;
     case ICmpInst::ICMP_SLT:
-      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL,
-                     0, Q.AT, Q.CxtI, Q.DT);
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
+                     Q.CxtI, Q.DT);
       if (LHSKnownNegative)
         return getTrue(ITy);
       if (LHSKnownNonNegative)
         return getFalse(ITy);
       break;
     case ICmpInst::ICMP_SLE:
-      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL,
-                     0, Q.AT, Q.CxtI, Q.DT);
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
+                     Q.CxtI, Q.DT);
       if (LHSKnownNegative)
         return getTrue(ITy);
-      if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL,
-                                                0, Q.AT, Q.CxtI, Q.DT))
+      if (LHSKnownNonNegative &&
+          isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
         return getFalse(ITy);
       break;
     case ICmpInst::ICMP_SGE:
-      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL,
-                     0, Q.AT, Q.CxtI, Q.DT);
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
+                     Q.CxtI, Q.DT);
       if (LHSKnownNegative)
         return getFalse(ITy);
       if (LHSKnownNonNegative)
         return getTrue(ITy);
       break;
     case ICmpInst::ICMP_SGT:
-      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL,
-                     0, Q.AT, Q.CxtI, Q.DT);
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
+                     Q.CxtI, Q.DT);
       if (LHSKnownNegative)
         return getFalse(ITy);
-      if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 
-                                                0, Q.AT, Q.CxtI, Q.DT))
+      if (LHSKnownNonNegative &&
+          isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
         return getTrue(ITy);
       break;
     }
@@ -2485,6 +2582,40 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     }
   }
 
+  // icmp pred (or X, Y), X
+  if (LBO && match(LBO, m_CombineOr(m_Or(m_Value(), m_Specific(RHS)),
+                                    m_Or(m_Specific(RHS), m_Value())))) {
+    if (Pred == ICmpInst::ICMP_ULT)
+      return getFalse(ITy);
+    if (Pred == ICmpInst::ICMP_UGE)
+      return getTrue(ITy);
+  }
+  // icmp pred X, (or X, Y)
+  if (RBO && match(RBO, m_CombineOr(m_Or(m_Value(), m_Specific(LHS)),
+                                    m_Or(m_Specific(LHS), m_Value())))) {
+    if (Pred == ICmpInst::ICMP_ULE)
+      return getTrue(ITy);
+    if (Pred == ICmpInst::ICMP_UGT)
+      return getFalse(ITy);
+  }
+
+  // icmp pred (and X, Y), X
+  if (LBO && match(LBO, m_CombineOr(m_And(m_Value(), m_Specific(RHS)),
+                                    m_And(m_Specific(RHS), m_Value())))) {
+    if (Pred == ICmpInst::ICMP_UGT)
+      return getFalse(ITy);
+    if (Pred == ICmpInst::ICMP_ULE)
+      return getTrue(ITy);
+  }
+  // icmp pred X, (and X, Y)
+  if (RBO && match(RBO, m_CombineOr(m_And(m_Value(), m_Specific(LHS)),
+                                    m_And(m_Specific(LHS), m_Value())))) {
+    if (Pred == ICmpInst::ICMP_UGE)
+      return getTrue(ITy);
+    if (Pred == ICmpInst::ICMP_ULT)
+      return getFalse(ITy);
+  }
+
   // 0 - (zext X) pred C
   if (!CmpInst::isUnsigned(Pred) && match(LHS, m_Neg(m_ZExt(m_Value())))) {
     if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
@@ -2515,8 +2646,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       break;
     case ICmpInst::ICMP_SGT:
     case ICmpInst::ICMP_SGE:
-      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL,
-                     0, Q.AT, Q.CxtI, Q.DT);
+      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
+                     Q.CxtI, Q.DT);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2526,8 +2657,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       return getFalse(ITy);
     case ICmpInst::ICMP_SLT:
     case ICmpInst::ICMP_SLE:
-      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL,
-                     0, Q.AT, Q.CxtI, Q.DT);
+      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
+                     Q.CxtI, Q.DT);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2546,8 +2677,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       break;
     case ICmpInst::ICMP_SGT:
     case ICmpInst::ICMP_SGE:
-      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL,
-                     0, Q.AT, Q.CxtI, Q.DT);
+      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
+                     Q.CxtI, Q.DT);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2557,8 +2688,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       return getTrue(ITy);
     case ICmpInst::ICMP_SLT:
     case ICmpInst::ICMP_SLE:
-      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL,
-                     0, Q.AT, Q.CxtI, Q.DT);
+      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
+                     Q.CxtI, Q.DT);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2867,7 +2998,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       uint32_t BitWidth = CI->getBitWidth();
       APInt LHSKnownZero(BitWidth, 0);
       APInt LHSKnownOne(BitWidth, 0);
-      computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, Q.DL, /*Depth=*/0, Q.AT,
+      computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, Q.DL, /*Depth=*/0, Q.AC,
                        Q.CxtI, Q.DT);
       const APInt &RHSVal = CI->getValue();
       if (((LHSKnownZero & RHSVal) != 0) || ((LHSKnownOne & ~RHSVal) != 0))
@@ -2895,10 +3026,9 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
 Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
                               const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT,
-                              AssumptionTracker *AT,
+                              const DominatorTree *DT, AssumptionCache *AC,
                               Instruction *CxtI) {
-  return ::SimplifyICmpInst(Predicate, LHS, RHS, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyICmpInst(Predicate, LHS, RHS, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
@@ -2936,44 +3066,57 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
   }
 
   // Handle fcmp with constant RHS
-  if (Constant *RHSC = dyn_cast<Constant>(RHS)) {
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) {
     // If the constant is a nan, see if we can fold the comparison based on it.
-    if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
-      if (CFP->getValueAPF().isNaN()) {
-        if (FCmpInst::isOrdered(Pred))   // True "if ordered and foo"
+    if (CFP->getValueAPF().isNaN()) {
+      if (FCmpInst::isOrdered(Pred)) // True "if ordered and foo"
+        return ConstantInt::getFalse(CFP->getContext());
+      assert(FCmpInst::isUnordered(Pred) &&
+             "Comparison must be either ordered or unordered!");
+      // True if unordered.
+      return ConstantInt::getTrue(CFP->getContext());
+    }
+    // Check whether the constant is an infinity.
+    if (CFP->getValueAPF().isInfinity()) {
+      if (CFP->getValueAPF().isNegative()) {
+        switch (Pred) {
+        case FCmpInst::FCMP_OLT:
+          // No value is ordered and less than negative infinity.
           return ConstantInt::getFalse(CFP->getContext());
-        assert(FCmpInst::isUnordered(Pred) &&
-               "Comparison must be either ordered or unordered!");
-        // True if unordered.
-        return ConstantInt::getTrue(CFP->getContext());
-      }
-      // Check whether the constant is an infinity.
-      if (CFP->getValueAPF().isInfinity()) {
-        if (CFP->getValueAPF().isNegative()) {
-          switch (Pred) {
-          case FCmpInst::FCMP_OLT:
-            // No value is ordered and less than negative infinity.
-            return ConstantInt::getFalse(CFP->getContext());
-          case FCmpInst::FCMP_UGE:
-            // All values are unordered with or at least negative infinity.
-            return ConstantInt::getTrue(CFP->getContext());
-          default:
-            break;
-          }
-        } else {
-          switch (Pred) {
-          case FCmpInst::FCMP_OGT:
-            // No value is ordered and greater than infinity.
-            return ConstantInt::getFalse(CFP->getContext());
-          case FCmpInst::FCMP_ULE:
-            // All values are unordered with and at most infinity.
-            return ConstantInt::getTrue(CFP->getContext());
-          default:
-            break;
-          }
+        case FCmpInst::FCMP_UGE:
+          // All values are unordered with or at least negative infinity.
+          return ConstantInt::getTrue(CFP->getContext());
+        default:
+          break;
+        }
+      } else {
+        switch (Pred) {
+        case FCmpInst::FCMP_OGT:
+          // No value is ordered and greater than infinity.
+          return ConstantInt::getFalse(CFP->getContext());
+        case FCmpInst::FCMP_ULE:
+          // All values are unordered with and at most infinity.
+          return ConstantInt::getTrue(CFP->getContext());
+        default:
+          break;
         }
       }
     }
+    if (CFP->getValueAPF().isZero()) {
+      switch (Pred) {
+      case FCmpInst::FCMP_UGE:
+        if (CannotBeOrderedLessThanZero(LHS))
+          return ConstantInt::getTrue(CFP->getContext());
+        break;
+      case FCmpInst::FCMP_OLT:
+        // X < 0
+        if (CannotBeOrderedLessThanZero(LHS))
+          return ConstantInt::getFalse(CFP->getContext());
+        break;
+      default:
+        break;
+      }
+    }
   }
 
   // If the comparison is with the result of a select instruction, check whether
@@ -2994,10 +3137,9 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
 Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
                               const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT,
-                              AssumptionTracker *AT,
+                              const DominatorTree *DT, AssumptionCache *AC,
                               const Instruction *CxtI) {
-  return ::SimplifyFCmpInst(Predicate, LHS, RHS, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyFCmpInst(Predicate, LHS, RHS, Query(DL, TLI, DT, AC, CxtI),
                             RecursionLimit);
 }
 
@@ -3029,17 +3171,71 @@ static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal,
   if (isa<UndefValue>(FalseVal))   // select C, X, undef -> X
     return TrueVal;
 
+  const auto *ICI = dyn_cast<ICmpInst>(CondVal);
+  unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits();
+  if (ICI && BitWidth) {
+    ICmpInst::Predicate Pred = ICI->getPredicate();
+    APInt MinSignedValue = APInt::getSignBit(BitWidth);
+    Value *X;
+    const APInt *Y;
+    bool TrueWhenUnset;
+    bool IsBitTest = false;
+    if (ICmpInst::isEquality(Pred) &&
+        match(ICI->getOperand(0), m_And(m_Value(X), m_APInt(Y))) &&
+        match(ICI->getOperand(1), m_Zero())) {
+      IsBitTest = true;
+      TrueWhenUnset = Pred == ICmpInst::ICMP_EQ;
+    } else if (Pred == ICmpInst::ICMP_SLT &&
+               match(ICI->getOperand(1), m_Zero())) {
+      X = ICI->getOperand(0);
+      Y = &MinSignedValue;
+      IsBitTest = true;
+      TrueWhenUnset = false;
+    } else if (Pred == ICmpInst::ICMP_SGT &&
+               match(ICI->getOperand(1), m_AllOnes())) {
+      X = ICI->getOperand(0);
+      Y = &MinSignedValue;
+      IsBitTest = true;
+      TrueWhenUnset = true;
+    }
+    if (IsBitTest) {
+      const APInt *C;
+      // (X & Y) == 0 ? X & ~Y : X  --> X
+      // (X & Y) != 0 ? X & ~Y : X  --> X & ~Y
+      if (FalseVal == X && match(TrueVal, m_And(m_Specific(X), m_APInt(C))) &&
+          *Y == ~*C)
+        return TrueWhenUnset ? FalseVal : TrueVal;
+      // (X & Y) == 0 ? X : X & ~Y  --> X & ~Y
+      // (X & Y) != 0 ? X : X & ~Y  --> X
+      if (TrueVal == X && match(FalseVal, m_And(m_Specific(X), m_APInt(C))) &&
+          *Y == ~*C)
+        return TrueWhenUnset ? FalseVal : TrueVal;
+
+      if (Y->isPowerOf2()) {
+        // (X & Y) == 0 ? X | Y : X  --> X | Y
+        // (X & Y) != 0 ? X | Y : X  --> X
+        if (FalseVal == X && match(TrueVal, m_Or(m_Specific(X), m_APInt(C))) &&
+            *Y == *C)
+          return TrueWhenUnset ? TrueVal : FalseVal;
+        // (X & Y) == 0 ? X : X | Y  --> X
+        // (X & Y) != 0 ? X : X | Y  --> X | Y
+        if (TrueVal == X && match(FalseVal, m_Or(m_Specific(X), m_APInt(C))) &&
+            *Y == *C)
+          return TrueWhenUnset ? TrueVal : FalseVal;
+      }
+    }
+  }
+
   return nullptr;
 }
 
 Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
                                 const DataLayout *DL,
                                 const TargetLibraryInfo *TLI,
-                                const DominatorTree *DT,
-                                AssumptionTracker *AT,
+                                const DominatorTree *DT, AssumptionCache *AC,
                                 const Instruction *CxtI) {
   return ::SimplifySelectInst(Cond, TrueVal, FalseVal,
-                              Query (DL, TLI, DT, AT, CxtI), RecursionLimit);
+                              Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
 }
 
 /// SimplifyGEPInst - Given operands for an GetElementPtrInst, see if we can
@@ -3126,9 +3322,9 @@ static Value *SimplifyGEPInst(ArrayRef<Value *> Ops, const Query &Q, unsigned) {
 
 Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const DataLayout *DL,
                              const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const DominatorTree *DT, AssumptionCache *AC,
                              const Instruction *CxtI) {
-  return ::SimplifyGEPInst(Ops, Query (DL, TLI, DT, AT, CxtI), RecursionLimit);
+  return ::SimplifyGEPInst(Ops, Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
 }
 
 /// SimplifyInsertValueInst - Given operands for an InsertValueInst, see if we
@@ -3160,15 +3356,11 @@ static Value *SimplifyInsertValueInst(Value *Agg, Value *Val,
   return nullptr;
 }
 
-Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val,
-                                     ArrayRef<unsigned> Idxs,
-                                     const DataLayout *DL,
-                                     const TargetLibraryInfo *TLI,
-                                     const DominatorTree *DT,
-                                     AssumptionTracker *AT,
-                                     const Instruction *CxtI) {
-  return ::SimplifyInsertValueInst(Agg, Val, Idxs,
-                                   Query (DL, TLI, DT, AT, CxtI),
+Value *llvm::SimplifyInsertValueInst(
+    Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, const DataLayout *DL,
+    const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC,
+    const Instruction *CxtI) {
+  return ::SimplifyInsertValueInst(Agg, Val, Idxs, Query(DL, TLI, DT, AC, CxtI),
                                    RecursionLimit);
 }
 
@@ -3215,10 +3407,9 @@ static Value *SimplifyTruncInst(Value *Op, Type *Ty, const Query &Q, unsigned) {
 
 Value *llvm::SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout *DL,
                                const TargetLibraryInfo *TLI,
-                               const DominatorTree *DT,
-                               AssumptionTracker *AT,
+                               const DominatorTree *DT, AssumptionCache *AC,
                                const Instruction *CxtI) {
-  return ::SimplifyTruncInst(Op, Ty, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyTruncInst(Op, Ty, Query(DL, TLI, DT, AC, CxtI),
                              RecursionLimit);
 }
 
@@ -3246,10 +3437,12 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
     return SimplifyFMulInst (LHS, RHS, FastMathFlags(), Q, MaxRecurse);
   case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, Q, MaxRecurse);
   case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse);
-  case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::FDiv:
+      return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
   case Instruction::SRem: return SimplifySRemInst(LHS, RHS, Q, MaxRecurse);
   case Instruction::URem: return SimplifyURemInst(LHS, RHS, Q, MaxRecurse);
-  case Instruction::FRem: return SimplifyFRemInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::FRem:
+      return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
   case Instruction::Shl:
     return SimplifyShlInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
                            Q, MaxRecurse);
@@ -3289,14 +3482,42 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
   }
 }
 
+/// SimplifyFPBinOp - Given operands for a BinaryOperator, see if we can
+/// fold the result.  If not, this returns null.
+/// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the
+/// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp.
+static Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS,
+                              const FastMathFlags &FMF, const Query &Q,
+                              unsigned MaxRecurse) {
+  switch (Opcode) {
+  case Instruction::FAdd:
+    return SimplifyFAddInst(LHS, RHS, FMF, Q, MaxRecurse);
+  case Instruction::FSub:
+    return SimplifyFSubInst(LHS, RHS, FMF, Q, MaxRecurse);
+  case Instruction::FMul:
+    return SimplifyFMulInst(LHS, RHS, FMF, Q, MaxRecurse);
+  default:
+    return SimplifyBinOp(Opcode, LHS, RHS, Q, MaxRecurse);
+  }
+}
+
 Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
                            const DataLayout *DL, const TargetLibraryInfo *TLI,
-                           const DominatorTree *DT, AssumptionTracker *AT,
+                           const DominatorTree *DT, AssumptionCache *AC,
                            const Instruction *CxtI) {
-  return ::SimplifyBinOp(Opcode, LHS, RHS, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyBinOp(Opcode, LHS, RHS, Query(DL, TLI, DT, AC, CxtI),
                          RecursionLimit);
 }
 
+Value *llvm::SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS,
+                             const FastMathFlags &FMF, const DataLayout *DL,
+                             const TargetLibraryInfo *TLI,
+                             const DominatorTree *DT, AssumptionCache *AC,
+                             const Instruction *CxtI) {
+  return ::SimplifyFPBinOp(Opcode, LHS, RHS, FMF, Query(DL, TLI, DT, AC, CxtI),
+                           RecursionLimit);
+}
+
 /// SimplifyCmpInst - Given operands for a CmpInst, see if we can
 /// fold the result.
 static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
@@ -3308,9 +3529,9 @@ static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
 
 Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const DominatorTree *DT, AssumptionCache *AC,
                              const Instruction *CxtI) {
-  return ::SimplifyCmpInst(Predicate, LHS, RHS, Query (DL, TLI, DT, AT, CxtI),
+  return ::SimplifyCmpInst(Predicate, LHS, RHS, Query(DL, TLI, DT, AC, CxtI),
                            RecursionLimit);
 }
 
@@ -3384,27 +3605,25 @@ static Value *SimplifyCall(Value *V, IterTy ArgBegin, IterTy ArgEnd,
 
 Value *llvm::SimplifyCall(Value *V, User::op_iterator ArgBegin,
                           User::op_iterator ArgEnd, const DataLayout *DL,
-                          const TargetLibraryInfo *TLI,
-                          const DominatorTree *DT, AssumptionTracker *AT,
-                          const Instruction *CxtI) {
-  return ::SimplifyCall(V, ArgBegin, ArgEnd, Query(DL, TLI, DT, AT, CxtI),
+                          const TargetLibraryInfo *TLI, const DominatorTree *DT,
+                          AssumptionCache *AC, const Instruction *CxtI) {
+  return ::SimplifyCall(V, ArgBegin, ArgEnd, Query(DL, TLI, DT, AC, CxtI),
                         RecursionLimit);
 }
 
 Value *llvm::SimplifyCall(Value *V, ArrayRef<Value *> Args,
                           const DataLayout *DL, const TargetLibraryInfo *TLI,
-                          const DominatorTree *DT, AssumptionTracker *AT,
+                          const DominatorTree *DT, AssumptionCache *AC,
                           const Instruction *CxtI) {
   return ::SimplifyCall(V, Args.begin(), Args.end(),
-                        Query(DL, TLI, DT, AT, CxtI), RecursionLimit);
+                        Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
 }
 
 /// SimplifyInstruction - See if we can compute a simplified version of this
 /// instruction.  If not, this returns null.
 Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout *DL,
                                  const TargetLibraryInfo *TLI,
-                                 const DominatorTree *DT,
-                                 AssumptionTracker *AT) {
+                                 const DominatorTree *DT, AssumptionCache *AC) {
   Value *Result;
 
   switch (I->getOpcode()) {
@@ -3413,122 +3632,122 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout *DL,
     break;
   case Instruction::FAdd:
     Result = SimplifyFAddInst(I->getOperand(0), I->getOperand(1),
-                              I->getFastMathFlags(), DL, TLI, DT, AT, I);
+                              I->getFastMathFlags(), DL, TLI, DT, AC, I);
     break;
   case Instruction::Add:
     Result = SimplifyAddInst(I->getOperand(0), I->getOperand(1),
                              cast<BinaryOperator>(I)->hasNoSignedWrap(),
-                             cast<BinaryOperator>(I)->hasNoUnsignedWrap(),
-                             DL, TLI, DT, AT, I);
+                             cast<BinaryOperator>(I)->hasNoUnsignedWrap(), DL,
+                             TLI, DT, AC, I);
     break;
   case Instruction::FSub:
     Result = SimplifyFSubInst(I->getOperand(0), I->getOperand(1),
-                              I->getFastMathFlags(), DL, TLI, DT, AT, I);
+                              I->getFastMathFlags(), DL, TLI, DT, AC, I);
     break;
   case Instruction::Sub:
     Result = SimplifySubInst(I->getOperand(0), I->getOperand(1),
                              cast<BinaryOperator>(I)->hasNoSignedWrap(),
-                             cast<BinaryOperator>(I)->hasNoUnsignedWrap(),
-                             DL, TLI, DT, AT, I);
+                             cast<BinaryOperator>(I)->hasNoUnsignedWrap(), DL,
+                             TLI, DT, AC, I);
     break;
   case Instruction::FMul:
     Result = SimplifyFMulInst(I->getOperand(0), I->getOperand(1),
-                              I->getFastMathFlags(), DL, TLI, DT, AT, I);
+                              I->getFastMathFlags(), DL, TLI, DT, AC, I);
     break;
   case Instruction::Mul:
-    Result = SimplifyMulInst(I->getOperand(0), I->getOperand(1),
-                             DL, TLI, DT, AT, I);
+    Result =
+        SimplifyMulInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I);
     break;
   case Instruction::SDiv:
-    Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1),
-                              DL, TLI, DT, AT, I);
+    Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT,
+                              AC, I);
     break;
   case Instruction::UDiv:
-    Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1),
-                              DL, TLI, DT, AT, I);
+    Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT,
+                              AC, I);
     break;
   case Instruction::FDiv:
     Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1),
-                              DL, TLI, DT, AT, I);
+                              I->getFastMathFlags(), DL, TLI, DT, AC, I);
     break;
   case Instruction::SRem:
-    Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1),
-                              DL, TLI, DT, AT, I);
+    Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT,
+                              AC, I);
     break;
   case Instruction::URem:
-    Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1),
-                              DL, TLI, DT, AT, I);
+    Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT,
+                              AC, I);
     break;
   case Instruction::FRem:
     Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1),
-                              DL, TLI, DT, AT, I);
+                              I->getFastMathFlags(), DL, TLI, DT, AC, I);
     break;
   case Instruction::Shl:
     Result = SimplifyShlInst(I->getOperand(0), I->getOperand(1),
                              cast<BinaryOperator>(I)->hasNoSignedWrap(),
-                             cast<BinaryOperator>(I)->hasNoUnsignedWrap(),
-                             DL, TLI, DT, AT, I);
+                             cast<BinaryOperator>(I)->hasNoUnsignedWrap(), DL,
+                             TLI, DT, AC, I);
     break;
   case Instruction::LShr:
     Result = SimplifyLShrInst(I->getOperand(0), I->getOperand(1),
-                              cast<BinaryOperator>(I)->isExact(),
-                              DL, TLI, DT, AT, I);
+                              cast<BinaryOperator>(I)->isExact(), DL, TLI, DT,
+                              AC, I);
     break;
   case Instruction::AShr:
     Result = SimplifyAShrInst(I->getOperand(0), I->getOperand(1),
-                              cast<BinaryOperator>(I)->isExact(),
-                              DL, TLI, DT, AT, I);
+                              cast<BinaryOperator>(I)->isExact(), DL, TLI, DT,
+                              AC, I);
     break;
   case Instruction::And:
-    Result = SimplifyAndInst(I->getOperand(0), I->getOperand(1),
-                             DL, TLI, DT, AT, I);
+    Result =
+        SimplifyAndInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I);
     break;
   case Instruction::Or:
-    Result = SimplifyOrInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT,
-                            AT, I);
+    Result =
+        SimplifyOrInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I);
     break;
   case Instruction::Xor:
-    Result = SimplifyXorInst(I->getOperand(0), I->getOperand(1),
-                             DL, TLI, DT, AT, I);
+    Result =
+        SimplifyXorInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I);
     break;
   case Instruction::ICmp:
-    Result = SimplifyICmpInst(cast<ICmpInst>(I)->getPredicate(),
-                              I->getOperand(0), I->getOperand(1),
-                              DL, TLI, DT, AT, I);
+    Result =
+        SimplifyICmpInst(cast<ICmpInst>(I)->getPredicate(), I->getOperand(0),
+                         I->getOperand(1), DL, TLI, DT, AC, I);
     break;
   case Instruction::FCmp:
-    Result = SimplifyFCmpInst(cast<FCmpInst>(I)->getPredicate(),
-                              I->getOperand(0), I->getOperand(1),
-                              DL, TLI, DT, AT, I);
+    Result =
+        SimplifyFCmpInst(cast<FCmpInst>(I)->getPredicate(), I->getOperand(0),
+                         I->getOperand(1), DL, TLI, DT, AC, I);
     break;
   case Instruction::Select:
     Result = SimplifySelectInst(I->getOperand(0), I->getOperand(1),
-                                I->getOperand(2), DL, TLI, DT, AT, I);
+                                I->getOperand(2), DL, TLI, DT, AC, I);
     break;
   case Instruction::GetElementPtr: {
     SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
-    Result = SimplifyGEPInst(Ops, DL, TLI, DT, AT, I);
+    Result = SimplifyGEPInst(Ops, DL, TLI, DT, AC, I);
     break;
   }
   case Instruction::InsertValue: {
     InsertValueInst *IV = cast<InsertValueInst>(I);
     Result = SimplifyInsertValueInst(IV->getAggregateOperand(),
                                      IV->getInsertedValueOperand(),
-                                     IV->getIndices(), DL, TLI, DT, AT, I);
+                                     IV->getIndices(), DL, TLI, DT, AC, I);
     break;
   }
   case Instruction::PHI:
-    Result = SimplifyPHINode(cast<PHINode>(I), Query (DL, TLI, DT, AT, I));
+    Result = SimplifyPHINode(cast<PHINode>(I), Query(DL, TLI, DT, AC, I));
     break;
   case Instruction::Call: {
     CallSite CS(cast<CallInst>(I));
-    Result = SimplifyCall(CS.getCalledValue(), CS.arg_begin(), CS.arg_end(),
-                          DL, TLI, DT, AT, I);
+    Result = SimplifyCall(CS.getCalledValue(), CS.arg_begin(), CS.arg_end(), DL,
+                          TLI, DT, AC, I);
     break;
   }
   case Instruction::Trunc:
-    Result = SimplifyTruncInst(I->getOperand(0), I->getType(), DL, TLI, DT,
-                               AT, I);
+    Result =
+        SimplifyTruncInst(I->getOperand(0), I->getType(), DL, TLI, DT, AC, I);
     break;
   }
 
@@ -3553,7 +3772,7 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
                                               const DataLayout *DL,
                                               const TargetLibraryInfo *TLI,
                                               const DominatorTree *DT,
-                                              AssumptionTracker *AT) {
+                                              AssumptionCache *AC) {
   bool Simplified = false;
   SmallSetVector<Instruction *, 8> Worklist;
 
@@ -3580,7 +3799,7 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
     I = Worklist[Idx];
 
     // See if this instruction simplifies.
-    SimpleV = SimplifyInstruction(I, DL, TLI, DT, AT);
+    SimpleV = SimplifyInstruction(I, DL, TLI, DT, AC);
     if (!SimpleV)
       continue;
 
@@ -3603,20 +3822,19 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
   return Simplified;
 }
 
-bool llvm::recursivelySimplifyInstruction(Instruction *I,
-                                          const DataLayout *DL,
+bool llvm::recursivelySimplifyInstruction(Instruction *I, const DataLayout *DL,
                                           const TargetLibraryInfo *TLI,
                                           const DominatorTree *DT,
-                                          AssumptionTracker *AT) {
-  return replaceAndRecursivelySimplifyImpl(I, nullptr, DL, TLI, DT, AT);
+                                          AssumptionCache *AC) {
+  return replaceAndRecursivelySimplifyImpl(I, nullptr, DL, TLI, DT, AC);
 }
 
 bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV,
                                          const DataLayout *DL,
                                          const TargetLibraryInfo *TLI,
                                          const DominatorTree *DT,
-                                         AssumptionTracker *AT) {
+                                         AssumptionCache *AC) {
   assert(I != SimpleV && "replaceAndRecursivelySimplify(X,X) is not valid!");
   assert(SimpleV && "Must provide a simplified value.");
-  return replaceAndRecursivelySimplifyImpl(I, SimpleV, DL, TLI, DT, AT);
+  return replaceAndRecursivelySimplifyImpl(I, SimpleV, DL, TLI, DT, AC);
 }
diff --git a/lib/Analysis/LLVMBuild.txt b/lib/Analysis/LLVMBuild.txt
index a8a8079..3039dde 100644
--- a/lib/Analysis/LLVMBuild.txt
+++ b/lib/Analysis/LLVMBuild.txt
@@ -22,4 +22,4 @@ subdirectories = IPA
 type = Library
 name = Analysis
 parent = Libraries
-required_libraries = Core Support Target
+required_libraries = Core Support
diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp
index 767da4e..c8d0410 100644
--- a/lib/Analysis/LazyCallGraph.cpp
+++ b/lib/Analysis/LazyCallGraph.cpp
@@ -708,11 +708,11 @@ static void printSCC(raw_ostream &OS, LazyCallGraph::SCC &SCC) {
   OS << "\n";
 }
 
-PreservedAnalyses LazyCallGraphPrinterPass::run(Module *M,
+PreservedAnalyses LazyCallGraphPrinterPass::run(Module &M,
                                                 ModuleAnalysisManager *AM) {
   LazyCallGraph &G = AM->getResult<LazyCallGraphAnalysis>(M);
 
-  OS << "Printing the call graph for module: " << M->getModuleIdentifier()
+  OS << "Printing the call graph for module: " << M.getModuleIdentifier()
      << "\n\n";
 
   SmallPtrSet<LazyCallGraph::Node *, 16> Printed;
@@ -724,5 +724,4 @@ PreservedAnalyses LazyCallGraphPrinterPass::run(Module *M,
     printSCC(OS, SCC);
 
   return PreservedAnalyses::all();
-
 }
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index c712c9f..87c31fd 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -1,4 +1,4 @@
-//===- LazyValueInfo.cpp - Value constraint analysis ----------------------===//
+//===- LazyValueInfo.cpp - Value constraint analysis ------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,8 +15,9 @@
 #include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/ConstantRange.h"
@@ -29,7 +30,6 @@
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include <map>
 #include <stack>
 using namespace llvm;
@@ -40,8 +40,8 @@ using namespace PatternMatch;
 char LazyValueInfo::ID = 0;
 INITIALIZE_PASS_BEGIN(LazyValueInfo, "lazy-value-info",
                 "Lazy Value Information Analysis", false, true)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(LazyValueInfo, "lazy-value-info",
                 "Lazy Value Information Analysis", false, true)
 
@@ -54,8 +54,7 @@ namespace llvm {
 //                               LVILatticeVal
 //===----------------------------------------------------------------------===//
 
-/// LVILatticeVal - This is the information tracked by LazyValueInfo for each
-/// value.
+/// This is the information tracked by LazyValueInfo for each value.
 ///
 /// FIXME: This is basically just for bringup, this can be made a lot more rich
 /// in the future.
@@ -63,19 +62,19 @@ namespace llvm {
 namespace {
 class LVILatticeVal {
   enum LatticeValueTy {
-    /// undefined - This Value has no known value yet.
+    /// This Value has no known value yet.
     undefined,
     
-    /// constant - This Value has a specific constant value.
+    /// This Value has a specific constant value.
     constant,
-    /// notconstant - This Value is known to not have the specified value.
+    
+    /// This Value is known to not have the specified value.
     notconstant,
 
-    /// constantrange - The Value falls within this range.
+    /// The Value falls within this range.
     constantrange,
 
-    /// overdefined - This value is not known to be constant, and we know that
-    /// it has a value.
+    /// This value is not known to be constant, and we know that it has a value.
     overdefined
   };
   
@@ -128,7 +127,7 @@ public:
     return Range;
   }
   
-  /// markOverdefined - Return true if this is a change in status.
+  /// Return true if this is a change in status.
   bool markOverdefined() {
     if (isOverdefined())
       return false;
@@ -136,7 +135,7 @@ public:
     return true;
   }
 
-  /// markConstant - Return true if this is a change in status.
+  /// Return true if this is a change in status.
   bool markConstant(Constant *V) {
     assert(V && "Marking constant with NULL");
     if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
@@ -152,7 +151,7 @@ public:
     return true;
   }
   
-  /// markNotConstant - Return true if this is a change in status.
+  /// Return true if this is a change in status.
   bool markNotConstant(Constant *V) {
     assert(V && "Marking constant with NULL");
     if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
@@ -170,7 +169,7 @@ public:
     return true;
   }
   
-  /// markConstantRange - Return true if this is a change in status.
+  /// Return true if this is a change in status.
   bool markConstantRange(const ConstantRange NewR) {
     if (isConstantRange()) {
       if (NewR.isEmptySet())
@@ -190,7 +189,7 @@ public:
     return true;
   }
   
-  /// mergeIn - Merge the specified lattice value into this one, updating this
+  /// Merge the specified lattice value into this one, updating this
   /// one and returning true if anything changed.
   bool mergeIn(const LVILatticeVal &RHS) {
     if (RHS.isUndefined() || isOverdefined()) return false;
@@ -298,8 +297,7 @@ raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val) {
 //===----------------------------------------------------------------------===//
 
 namespace {
-  /// LVIValueHandle - A callback value handle updates the cache when
-  /// values are erased.
+  /// A callback value handle updates the cache when values are erased.
   class LazyValueInfoCache;
   struct LVIValueHandle : public CallbackVH {
     LazyValueInfoCache *Parent;
@@ -315,62 +313,62 @@ namespace {
 }
 
 namespace { 
-  /// LazyValueInfoCache - This is the cache kept by LazyValueInfo which
+  /// This is the cache kept by LazyValueInfo which
   /// maintains information about queries across the clients' queries.
   class LazyValueInfoCache {
-    /// ValueCacheEntryTy - This is all of the cached block information for
-    /// exactly one Value*.  The entries are sorted by the BasicBlock* of the
+    /// This is all of the cached block information for exactly one Value*.
+    /// The entries are sorted by the BasicBlock* of the
     /// entries, allowing us to do a lookup with a binary search.
     typedef std::map<AssertingVH<BasicBlock>, LVILatticeVal> ValueCacheEntryTy;
 
-    /// ValueCache - This is all of the cached information for all values,
+    /// This is all of the cached information for all values,
     /// mapped from Value* to key information.
     std::map<LVIValueHandle, ValueCacheEntryTy> ValueCache;
     
-    /// OverDefinedCache - This tracks, on a per-block basis, the set of 
-    /// values that are over-defined at the end of that block.  This is required
+    /// This tracks, on a per-block basis, the set of values that are
+    /// over-defined at the end of that block.  This is required
     /// for cache updating.
     typedef std::pair<AssertingVH<BasicBlock>, Value*> OverDefinedPairTy;
     DenseSet<OverDefinedPairTy> OverDefinedCache;
 
-    /// SeenBlocks - Keep track of all blocks that we have ever seen, so we
+    /// Keep track of all blocks that we have ever seen, so we
     /// don't spend time removing unused blocks from our caches.
     DenseSet<AssertingVH<BasicBlock> > SeenBlocks;
 
-    /// BlockValueStack - This stack holds the state of the value solver
-    /// during a query.  It basically emulates the callstack of the naive
+    /// This stack holds the state of the value solver during a query.
+    /// It basically emulates the callstack of the naive
     /// recursive value lookup process.
     std::stack<std::pair<BasicBlock*, Value*> > BlockValueStack;
 
+    /// Keeps track of which block-value pairs are in BlockValueStack.
+    DenseSet<std::pair<BasicBlock*, Value*> > BlockValueSet;
+
+    /// Push BV onto BlockValueStack unless it's already in there.
+    /// Returns true on success.
+    bool pushBlockValue(const std::pair<BasicBlock *, Value *> &BV) {
+      if (BlockValueSet.count(BV))
+        return false;  // It's already in the stack.
+
+      BlockValueStack.push(BV);
+      BlockValueSet.insert(BV);
+      return true;
+    }
+
     /// A pointer to the cache of @llvm.assume calls.
-    AssumptionTracker *AT;
+    AssumptionCache *AC;
     /// An optional DL pointer.
     const DataLayout *DL;
     /// An optional DT pointer.
     DominatorTree *DT;
     
     friend struct LVIValueHandle;
-    
-    /// OverDefinedCacheUpdater - A helper object that ensures that the
-    /// OverDefinedCache is updated whenever solveBlockValue returns.
-    struct OverDefinedCacheUpdater {
-      LazyValueInfoCache *Parent;
-      Value *Val;
-      BasicBlock *BB;
-      LVILatticeVal &BBLV;
-      
-      OverDefinedCacheUpdater(Value *V, BasicBlock *B, LVILatticeVal &LV,
-                       LazyValueInfoCache *P)
-        : Parent(P), Val(V), BB(B), BBLV(LV) { }
-      
-      bool markResult(bool changed) { 
-        if (changed && BBLV.isOverdefined())
-          Parent->OverDefinedCache.insert(std::make_pair(BB, Val));
-        return changed;
-      }
-    };
-    
 
+    void insertResult(Value *Val, BasicBlock *BB, const LVILatticeVal &Result) {
+      SeenBlocks.insert(BB);
+      lookup(Val)[BB] = Result;
+      if (Result.isOverdefined())
+        OverDefinedCache.insert(std::make_pair(BB, Val));
+    }
 
     LVILatticeVal getBlockValue(Value *Val, BasicBlock *BB);
     bool getEdgeValue(Value *V, BasicBlock *F, BasicBlock *T,
@@ -398,27 +396,26 @@ namespace {
     }
 
   public:
-    /// getValueInBlock - This is the query interface to determine the lattice
+    /// This is the query interface to determine the lattice
     /// value for the specified Value* at the end of the specified block.
     LVILatticeVal getValueInBlock(Value *V, BasicBlock *BB,
                                   Instruction *CxtI = nullptr);
 
-    /// getValueAt - This is the query interface to determine the lattice
+    /// This is the query interface to determine the lattice
     /// value for the specified Value* at the specified instruction (generally
     /// from an assume intrinsic).
     LVILatticeVal getValueAt(Value *V, Instruction *CxtI);
 
-    /// getValueOnEdge - This is the query interface to determine the lattice
+    /// This is the query interface to determine the lattice
     /// value for the specified Value* that is true on the specified edge.
     LVILatticeVal getValueOnEdge(Value *V, BasicBlock *FromBB,BasicBlock *ToBB,
                                  Instruction *CxtI = nullptr);
     
-    /// threadEdge - This is the update interface to inform the cache that an
-    /// edge from PredBB to OldSucc has been threaded to be from PredBB to
-    /// NewSucc.
+    /// This is the update interface to inform the cache that an edge from
+    /// PredBB to OldSucc has been threaded to be from PredBB to NewSucc.
     void threadEdge(BasicBlock *PredBB,BasicBlock *OldSucc,BasicBlock *NewSucc);
     
-    /// eraseBlock - This is part of the update interface to inform the cache
+    /// This is part of the update interface to inform the cache
     /// that a block has been deleted.
     void eraseBlock(BasicBlock *BB);
     
@@ -429,9 +426,9 @@ namespace {
       OverDefinedCache.clear();
     }
 
-    LazyValueInfoCache(AssumptionTracker *AT,
-                       const DataLayout *DL = nullptr,
-                       DominatorTree *DT = nullptr) : AT(AT), DL(DL), DT(DT) {}
+    LazyValueInfoCache(AssumptionCache *AC, const DataLayout *DL = nullptr,
+                       DominatorTree *DT = nullptr)
+        : AC(AC), DL(DL), DT(DT) {}
   };
 } // end anonymous namespace
 
@@ -439,17 +436,11 @@ void LVIValueHandle::deleted() {
   typedef std::pair<AssertingVH<BasicBlock>, Value*> OverDefinedPairTy;
   
   SmallVector<OverDefinedPairTy, 4> ToErase;
-  for (DenseSet<OverDefinedPairTy>::iterator 
-       I = Parent->OverDefinedCache.begin(),
-       E = Parent->OverDefinedCache.end();
-       I != E; ++I) {
-    if (I->second == getValPtr())
-      ToErase.push_back(*I);
-  }
-
-  for (SmallVectorImpl<OverDefinedPairTy>::iterator I = ToErase.begin(),
-       E = ToErase.end(); I != E; ++I)
-    Parent->OverDefinedCache.erase(*I);
+  for (const OverDefinedPairTy &P : Parent->OverDefinedCache)
+    if (P.second == getValPtr())
+      ToErase.push_back(P);
+  for (const OverDefinedPairTy &P : ToErase)
+    Parent->OverDefinedCache.erase(P);
   
   // This erasure deallocates *this, so it MUST happen after we're done
   // using any and all members of *this.
@@ -464,15 +455,11 @@ void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
   SeenBlocks.erase(I);
 
   SmallVector<OverDefinedPairTy, 4> ToErase;
-  for (DenseSet<OverDefinedPairTy>::iterator  I = OverDefinedCache.begin(),
-       E = OverDefinedCache.end(); I != E; ++I) {
-    if (I->first == BB)
-      ToErase.push_back(*I);
-  }
-
-  for (SmallVectorImpl<OverDefinedPairTy>::iterator I = ToErase.begin(),
-       E = ToErase.end(); I != E; ++I)
-    OverDefinedCache.erase(*I);
+  for (const OverDefinedPairTy& P : OverDefinedCache)
+    if (P.first == BB)
+      ToErase.push_back(P);
+  for (const OverDefinedPairTy &P : ToErase)
+    OverDefinedCache.erase(P);
 
   for (std::map<LVIValueHandle, ValueCacheEntryTy>::iterator
        I = ValueCache.begin(), E = ValueCache.end(); I != E; ++I)
@@ -482,9 +469,18 @@ void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
 void LazyValueInfoCache::solve() {
   while (!BlockValueStack.empty()) {
     std::pair<BasicBlock*, Value*> &e = BlockValueStack.top();
+    assert(BlockValueSet.count(e) && "Stack value should be in BlockValueSet!");
+
     if (solveBlockValue(e.second, e.first)) {
-      assert(BlockValueStack.top() == e);
+      // The work item was completely processed.
+      assert(BlockValueStack.top() == e && "Nothing should have been pushed!");
+      assert(lookup(e.second).count(e.first) && "Result should be in cache!");
+
       BlockValueStack.pop();
+      BlockValueSet.erase(e);
+    } else {
+      // More work needs to be done before revisiting.
+      assert(BlockValueStack.top() != e && "Stack should have been pushed!");
     }
   }
 }
@@ -514,43 +510,40 @@ bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) {
   if (isa<Constant>(Val))
     return true;
 
-  ValueCacheEntryTy &Cache = lookup(Val);
-  SeenBlocks.insert(BB);
-  LVILatticeVal &BBLV = Cache[BB];
-  
-  // OverDefinedCacheUpdater is a helper object that will update
-  // the OverDefinedCache for us when this method exits.  Make sure to
-  // call markResult on it as we exist, passing a bool to indicate if the
-  // cache needs updating, i.e. if we have solve a new value or not.
-  OverDefinedCacheUpdater ODCacheUpdater(Val, BB, BBLV, this);
-
-  if (!BBLV.isUndefined()) {
-    DEBUG(dbgs() << "  reuse BB '" << BB->getName() << "' val=" << BBLV <<'\n');
-    
-    // Since we're reusing a cached value here, we don't need to update the 
-    // OverDefinedCahce.  The cache will have been properly updated 
-    // whenever the cached value was inserted.
-    ODCacheUpdater.markResult(false);
+  if (lookup(Val).count(BB)) {
+    // If we have a cached value, use that.
+    DEBUG(dbgs() << "  reuse BB '" << BB->getName()
+                 << "' val=" << lookup(Val)[BB] << '\n');
+
+    // Since we're reusing a cached value, we don't need to update the
+    // OverDefinedCache. The cache will have been properly updated whenever the
+    // cached value was inserted.
     return true;
   }
 
-  // Otherwise, this is the first time we're seeing this block.  Reset the
-  // lattice value to overdefined, so that cycles will terminate and be
-  // conservatively correct.
-  BBLV.markOverdefined();
+  // Hold off inserting this value into the Cache in case we have to return
+  // false and come back later.
+  LVILatticeVal Res;
   
   Instruction *BBI = dyn_cast<Instruction>(Val);
   if (!BBI || BBI->getParent() != BB) {
-    return ODCacheUpdater.markResult(solveBlockValueNonLocal(BBLV, Val, BB));
+    if (!solveBlockValueNonLocal(Res, Val, BB))
+      return false;
+   insertResult(Val, BB, Res);
+   return true;
   }
 
   if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
-    return ODCacheUpdater.markResult(solveBlockValuePHINode(BBLV, PN, BB));
+    if (!solveBlockValuePHINode(Res, PN, BB))
+      return false;
+    insertResult(Val, BB, Res);
+    return true;
   }
 
   if (AllocaInst *AI = dyn_cast<AllocaInst>(BBI)) {
-    BBLV = LVILatticeVal::getNot(ConstantPointerNull::get(AI->getType()));
-    return ODCacheUpdater.markResult(true);
+    Res = LVILatticeVal::getNot(ConstantPointerNull::get(AI->getType()));
+    insertResult(Val, BB, Res);
+    return true;
   }
 
   // We can only analyze the definitions of certain classes of instructions
@@ -560,8 +553,9 @@ bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) {
      !BBI->getType()->isIntegerTy()) {
     DEBUG(dbgs() << " compute BB '" << BB->getName()
                  << "' - overdefined because inst def found.\n");
-    BBLV.markOverdefined();
-    return ODCacheUpdater.markResult(true);
+    Res.markOverdefined();
+    insertResult(Val, BB, Res);
+    return true;
   }
 
   // FIXME: We're currently limited to binops with a constant RHS.  This should
@@ -571,11 +565,15 @@ bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) {
     DEBUG(dbgs() << " compute BB '" << BB->getName()
                  << "' - overdefined because inst def found.\n");
 
-    BBLV.markOverdefined();
-    return ODCacheUpdater.markResult(true);
+    Res.markOverdefined();
+    insertResult(Val, BB, Res);
+    return true;
   }
 
-  return ODCacheUpdater.markResult(solveBlockValueConstantRange(BBLV, BBI, BB));
+  if (!solveBlockValueConstantRange(Res, BBI, BB))
+    return false;
+  insertResult(Val, BB, Res);
+  return true;
 }
 
 static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) {
@@ -620,9 +618,8 @@ bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV,
       // If 'GetUnderlyingObject' didn't converge, skip it. It won't converge
       // inside InstructionDereferencesPointer either.
       if (UnderlyingVal == GetUnderlyingObject(UnderlyingVal, nullptr, 1)) {
-        for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
-             BI != BE; ++BI) {
-          if (InstructionDereferencesPointer(BI, UnderlyingVal)) {
+        for (Instruction &I : *BB) {
+          if (InstructionDereferencesPointer(&I, UnderlyingVal)) {
             NotNull = true;
             break;
           }
@@ -724,16 +721,20 @@ static bool getValueFromFromCondition(Value *Val, ICmpInst *ICI,
                                       LVILatticeVal &Result,
                                       bool isTrueDest = true);
 
-// If we can determine a constant range for the value Val at the context
+// If we can determine a constant range for the value Val in the context
 // provided by the instruction BBI, then merge it into BBLV. If we did find a
 // constant range, return true.
-void LazyValueInfoCache::mergeAssumeBlockValueConstantRange(
-  Value *Val, LVILatticeVal &BBLV, Instruction *BBI) {
+void LazyValueInfoCache::mergeAssumeBlockValueConstantRange(Value *Val,
+                                                            LVILatticeVal &BBLV,
+                                                            Instruction *BBI) {
   BBI = BBI ? BBI : dyn_cast<Instruction>(Val);
   if (!BBI)
     return;
 
-  for (auto &I : AT->assumptions(BBI->getParent()->getParent())) {
+  for (auto &AssumeVH : AC->assumptions()) {
+    if (!AssumeVH)
+      continue;
+    auto *I = cast<CallInst>(AssumeVH);
     if (!isValidAssumeForContext(I, BBI, DL, DT))
       continue;
 
@@ -755,8 +756,10 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV,
                                                       BasicBlock *BB) {
   // Figure out the range of the LHS.  If that fails, bail.
   if (!hasBlockValue(BBI->getOperand(0), BB)) {
-    BlockValueStack.push(std::make_pair(BB, BBI->getOperand(0)));
-    return false;
+    if (pushBlockValue(std::make_pair(BB, BBI->getOperand(0))))
+      return false;
+    BBLV.markOverdefined();
+    return true;
   }
 
   LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB);
@@ -881,7 +884,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
   // know that v != 0.
   if (BranchInst *BI = dyn_cast<BranchInst>(BBFrom->getTerminator())) {
     // If this is a conditional branch and only one successor goes to BBTo, then
-    // we maybe able to infer something from the condition. 
+    // we may be able to infer something from the condition.
     if (BI->isConditional() &&
         BI->getSuccessor(0) != BI->getSuccessor(1)) {
       bool isTrueDest = BI->getSuccessor(0) == BBTo;
@@ -898,9 +901,9 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
       
       // If the condition of the branch is an equality comparison, we may be
       // able to infer the value.
-      ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition());
-      if (getValueFromFromCondition(Val, ICI, Result, isTrueDest))
-        return true;
+      if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition()))
+        if (getValueFromFromCondition(Val, ICI, Result, isTrueDest))
+          return true;
     }
   }
 
@@ -914,8 +917,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
     unsigned BitWidth = Val->getType()->getIntegerBitWidth();
     ConstantRange EdgesVals(BitWidth, DefaultCase/*isFullSet*/);
 
-    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
-         i != e; ++i) {
+    for (SwitchInst::CaseIt i : SI->cases()) {
       ConstantRange EdgeVal(i.getCaseValue()->getValue());
       if (DefaultCase) {
         // It is possible that the default destination is the destination of
@@ -931,8 +933,8 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
   return false;
 }
 
-/// \brief Compute the value of Val on the edge BBFrom -> BBTo, or the value at
-/// the basic block if the edge does not constraint Val.
+/// \brief Compute the value of Val on the edge BBFrom -> BBTo or the value at
+/// the basic block if the edge does not constrain Val.
 bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
                                       BasicBlock *BBTo, LVILatticeVal &Result,
                                       Instruction *CxtI) {
@@ -944,15 +946,17 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
 
   if (getEdgeValueLocal(Val, BBFrom, BBTo, Result)) {
     if (!Result.isConstantRange() ||
-      Result.getConstantRange().getSingleElement())
+        Result.getConstantRange().getSingleElement())
       return true;
 
     // FIXME: this check should be moved to the beginning of the function when
     // LVI better supports recursive values. Even for the single value case, we
     // can intersect to detect dead code (an empty range).
     if (!hasBlockValue(Val, BBFrom)) {
-      BlockValueStack.push(std::make_pair(BBFrom, Val));
-      return false;
+      if (pushBlockValue(std::make_pair(BBFrom, Val)))
+        return false;
+      Result.markOverdefined();
+      return true;
     }
 
     // Try to intersect ranges of the BB and the constraint on the edge.
@@ -971,11 +975,13 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
   }
 
   if (!hasBlockValue(Val, BBFrom)) {
-    BlockValueStack.push(std::make_pair(BBFrom, Val));
-    return false;
+    if (pushBlockValue(std::make_pair(BBFrom, Val)))
+      return false;
+    Result.markOverdefined();
+    return true;
   }
 
-  // if we couldn't compute the value on the edge, use the value from the BB
+  // If we couldn't compute the value on the edge, use the value from the BB.
   Result = getBlockValue(Val, BBFrom);
   mergeAssumeBlockValueConstantRange(Val, Result, BBFrom->getTerminator());
   // We can use the context instruction (generically the ultimate instruction
@@ -995,7 +1001,9 @@ LVILatticeVal LazyValueInfoCache::getValueInBlock(Value *V, BasicBlock *BB,
   DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '"
         << BB->getName() << "'\n");
   
-  BlockValueStack.push(std::make_pair(BB, V));
+  assert(BlockValueStack.empty() && BlockValueSet.empty());
+  pushBlockValue(std::make_pair(BB, V));
+
   solve();
   LVILatticeVal Result = getBlockValue(V, BB);
   mergeAssumeBlockValueConstantRange(V, Result, CxtI);
@@ -1041,7 +1049,7 @@ void LazyValueInfoCache::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
   // we clear their entries from the cache, and allow lazy updating to recompute
   // them when needed.
   
-  // The updating process is fairly simple: we need to dropped cached info
+  // The updating process is fairly simple: we need to drop cached info
   // for all values that were marked overdefined in OldSucc, and for those same
   // values in any successor of OldSucc (except NewSucc) in which they were
   // also marked overdefined.
@@ -1049,11 +1057,9 @@ void LazyValueInfoCache::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
   worklist.push_back(OldSucc);
   
   DenseSet<Value*> ClearSet;
-  for (DenseSet<OverDefinedPairTy>::iterator I = OverDefinedCache.begin(),
-       E = OverDefinedCache.end(); I != E; ++I) {
-    if (I->first == OldSucc)
-      ClearSet.insert(I->second);
-  }
+  for (OverDefinedPairTy &P : OverDefinedCache)
+    if (P.first == OldSucc)
+      ClearSet.insert(P.second);
   
   // Use a worklist to perform a depth-first search of OldSucc's successors.
   // NOTE: We do not need a visited list since any blocks we have already
@@ -1067,15 +1073,14 @@ void LazyValueInfoCache::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
     if (ToUpdate == NewSucc) continue;
     
     bool changed = false;
-    for (DenseSet<Value*>::iterator I = ClearSet.begin(), E = ClearSet.end();
-         I != E; ++I) {
+    for (Value *V : ClearSet) {
       // If a value was marked overdefined in OldSucc, and is here too...
       DenseSet<OverDefinedPairTy>::iterator OI =
-        OverDefinedCache.find(std::make_pair(ToUpdate, *I));
+        OverDefinedCache.find(std::make_pair(ToUpdate, V));
       if (OI == OverDefinedCache.end()) continue;
 
       // Remove it from the caches.
-      ValueCacheEntryTy &Entry = ValueCache[LVIValueHandle(*I, this)];
+      ValueCacheEntryTy &Entry = ValueCache[LVIValueHandle(V, this)];
       ValueCacheEntryTy::iterator CI = Entry.find(ToUpdate);
 
       assert(CI != Entry.end() && "Couldn't find entry to update?");
@@ -1097,18 +1102,17 @@ void LazyValueInfoCache::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
 //                            LazyValueInfo Impl
 //===----------------------------------------------------------------------===//
 
-/// getCache - This lazily constructs the LazyValueInfoCache.
-static LazyValueInfoCache &getCache(void *&PImpl,
-                                    AssumptionTracker *AT,
+/// This lazily constructs the LazyValueInfoCache.
+static LazyValueInfoCache &getCache(void *&PImpl, AssumptionCache *AC,
                                     const DataLayout *DL = nullptr,
                                     DominatorTree *DT = nullptr) {
   if (!PImpl)
-    PImpl = new LazyValueInfoCache(AT, DL, DT);
+    PImpl = new LazyValueInfoCache(AC, DL, DT);
   return *static_cast<LazyValueInfoCache*>(PImpl);
 }
 
 bool LazyValueInfo::runOnFunction(Function &F) {
-  AT = &getAnalysis<AssumptionTracker>();
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
 
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
@@ -1116,10 +1120,11 @@ bool LazyValueInfo::runOnFunction(Function &F) {
 
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = &getAnalysis<TargetLibraryInfo>();
+
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   if (PImpl)
-    getCache(PImpl, AT, DL, DT).clear();
+    getCache(PImpl, AC, DL, DT).clear();
 
   // Fully lazy.
   return false;
@@ -1127,14 +1132,14 @@ bool LazyValueInfo::runOnFunction(Function &F) {
 
 void LazyValueInfo::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequired<AssumptionTracker>();
-  AU.addRequired<TargetLibraryInfo>();
+  AU.addRequired<AssumptionCacheTracker>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
 }
 
 void LazyValueInfo::releaseMemory() {
   // If the cache was allocated, free it.
   if (PImpl) {
-    delete &getCache(PImpl, AT);
+    delete &getCache(PImpl, AC);
     PImpl = nullptr;
   }
 }
@@ -1142,8 +1147,8 @@ void LazyValueInfo::releaseMemory() {
 Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB,
                                      Instruction *CxtI) {
   LVILatticeVal Result =
-    getCache(PImpl, AT, DL, DT).getValueInBlock(V, BB, CxtI);
-  
+      getCache(PImpl, AC, DL, DT).getValueInBlock(V, BB, CxtI);
+
   if (Result.isConstant())
     return Result.getConstant();
   if (Result.isConstantRange()) {
@@ -1154,14 +1159,14 @@ Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB,
   return nullptr;
 }
 
-/// getConstantOnEdge - Determine whether the specified value is known to be a
+/// Determine whether the specified value is known to be a
 /// constant on the specified edge.  Return null if not.
 Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB,
                                            BasicBlock *ToBB,
                                            Instruction *CxtI) {
   LVILatticeVal Result =
-    getCache(PImpl, AT, DL, DT).getValueOnEdge(V, FromBB, ToBB, CxtI);
-  
+      getCache(PImpl, AC, DL, DT).getValueOnEdge(V, FromBB, ToBB, CxtI);
+
   if (Result.isConstant())
     return Result.getConstant();
   if (Result.isConstantRange()) {
@@ -1239,15 +1244,14 @@ getPredicateResult(unsigned Pred, Constant *C, LVILatticeVal &Result,
   return LazyValueInfo::Unknown;
 }
 
-/// getPredicateOnEdge - Determine whether the specified value comparison
-/// with a constant is known to be true or false on the specified CFG edge.
-/// Pred is a CmpInst predicate.
+/// Determine whether the specified value comparison with a constant is known to
+/// be true or false on the specified CFG edge. Pred is a CmpInst predicate.
 LazyValueInfo::Tristate
 LazyValueInfo::getPredicateOnEdge(unsigned Pred, Value *V, Constant *C,
                                   BasicBlock *FromBB, BasicBlock *ToBB,
                                   Instruction *CxtI) {
   LVILatticeVal Result =
-    getCache(PImpl, AT, DL, DT).getValueOnEdge(V, FromBB, ToBB, CxtI);
+      getCache(PImpl, AC, DL, DT).getValueOnEdge(V, FromBB, ToBB, CxtI);
 
   return getPredicateResult(Pred, C, Result, DL, TLI);
 }
@@ -1255,17 +1259,18 @@ LazyValueInfo::getPredicateOnEdge(unsigned Pred, Value *V, Constant *C,
 LazyValueInfo::Tristate
 LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C,
                               Instruction *CxtI) {
-  LVILatticeVal Result =
-    getCache(PImpl, AT, DL, DT).getValueAt(V, CxtI);
+  LVILatticeVal Result = getCache(PImpl, AC, DL, DT).getValueAt(V, CxtI);
 
   return getPredicateResult(Pred, C, Result, DL, TLI);
 }
 
 void LazyValueInfo::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
                                BasicBlock *NewSucc) {
-  if (PImpl) getCache(PImpl, AT, DL, DT).threadEdge(PredBB, OldSucc, NewSucc);
+  if (PImpl)
+    getCache(PImpl, AC, DL, DT).threadEdge(PredBB, OldSucc, NewSucc);
 }
 
 void LazyValueInfo::eraseBlock(BasicBlock *BB) {
-  if (PImpl) getCache(PImpl, AT, DL, DT).eraseBlock(BB);
+  if (PImpl)
+    getCache(PImpl, AC, DL, DT).eraseBlock(BB);
 }
diff --git a/lib/Analysis/LibCallSemantics.cpp b/lib/Analysis/LibCallSemantics.cpp
index 23639e7..cf752dd 100644
--- a/lib/Analysis/LibCallSemantics.cpp
+++ b/lib/Analysis/LibCallSemantics.cpp
@@ -15,6 +15,7 @@
 
 #include "llvm/Analysis/LibCallSemantics.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/IR/Function.h"
 using namespace llvm;
 
@@ -61,3 +62,41 @@ LibCallInfo::getFunctionInfo(const Function *F) const {
   return Map->lookup(F->getName());
 }
 
+/// See if the given exception handling personality function is one that we
+/// understand.  If so, return a description of it; otherwise return Unknown.
+EHPersonality llvm::classifyEHPersonality(const Value *Pers) {
+  const Function *F = dyn_cast<Function>(Pers->stripPointerCasts());
+  if (!F)
+    return EHPersonality::Unknown;
+  return StringSwitch<EHPersonality>(F->getName())
+    .Case("__gnat_eh_personality", EHPersonality::GNU_Ada)
+    .Case("__gxx_personality_v0",  EHPersonality::GNU_CXX)
+    .Case("__gcc_personality_v0",  EHPersonality::GNU_C)
+    .Case("__objc_personality_v0", EHPersonality::GNU_ObjC)
+    .Case("__except_handler3",     EHPersonality::MSVC_X86SEH)
+    .Case("__except_handler4",     EHPersonality::MSVC_X86SEH)
+    .Case("__C_specific_handler",  EHPersonality::MSVC_Win64SEH)
+    .Case("__CxxFrameHandler3",    EHPersonality::MSVC_CXX)
+    .Default(EHPersonality::Unknown);
+}
+
+bool llvm::isAsynchronousEHPersonality(EHPersonality Pers) {
+  // The two SEH personality functions can catch asynch exceptions. We assume
+  // unknown personalities don't catch asynch exceptions.
+  switch (Pers) {
+  case EHPersonality::MSVC_X86SEH:
+  case EHPersonality::MSVC_Win64SEH:
+    return true;
+  default: return false;
+  }
+  llvm_unreachable("invalid enum");
+}
+
+bool llvm::canSimplifyInvokeNoUnwind(const InvokeInst *II) {
+  const LandingPadInst *LP = II->getLandingPadInst();
+  EHPersonality Personality = classifyEHPersonality(LP->getPersonalityFn());
+  // We can't simplify any invokes to nounwind functions if the personality
+  // function wants to catch asynch exceptions.  The nounwind attribute only
+  // implies that the function does not throw synchronous exceptions.
+  return !isAsynchronousEHPersonality(Personality);
+}
diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp
index 8ee9b8a..874ed0a 100644
--- a/lib/Analysis/Lint.cpp
+++ b/lib/Analysis/Lint.cpp
@@ -36,12 +36,14 @@
 
 #include "llvm/Analysis/Lint.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
@@ -49,11 +51,10 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Pass.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 using namespace llvm;
 
 namespace {
@@ -73,6 +74,8 @@ namespace {
     void visitMemoryReference(Instruction &I, Value *Ptr,
                               uint64_t Size, unsigned Align,
                               Type *Ty, unsigned Flags);
+    void visitEHBeginCatch(IntrinsicInst *II);
+    void visitEHEndCatch(IntrinsicInst *II);
 
     void visitCallInst(CallInst &I);
     void visitInvokeInst(InvokeInst &I);
@@ -102,7 +105,7 @@ namespace {
   public:
     Module *Mod;
     AliasAnalysis *AA;
-    AssumptionTracker *AT;
+    AssumptionCache *AC;
     DominatorTree *DT;
     const DataLayout *DL;
     TargetLibraryInfo *TLI;
@@ -120,8 +123,8 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesAll();
       AU.addRequired<AliasAnalysis>();
-      AU.addRequired<AssumptionTracker>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
       AU.addRequired<DominatorTreeWrapperPass>();
     }
     void print(raw_ostream &O, const Module *M) const override {}
@@ -154,8 +157,8 @@ namespace {
 char Lint::ID = 0;
 INITIALIZE_PASS_BEGIN(Lint, "lint", "Statically lint-checks LLVM IR",
                       false, true)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(Lint, "lint", "Statically lint-checks LLVM IR",
@@ -179,11 +182,11 @@ INITIALIZE_PASS_END(Lint, "lint", "Statically lint-checks LLVM IR",
 bool Lint::runOnFunction(Function &F) {
   Mod = F.getParent();
   AA = &getAnalysis<AliasAnalysis>();
-  AT = &getAnalysis<AssumptionTracker>();
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   visit(F);
   dbgs() << MessagesStr.str();
   Messages.clear();
@@ -346,6 +349,13 @@ void Lint::visitCallSite(CallSite CS) {
       visitMemoryReference(I, CS.getArgument(0), AliasAnalysis::UnknownSize,
                            0, nullptr, MemRef::Read | MemRef::Write);
       break;
+
+    case Intrinsic::eh_begincatch:
+      visitEHBeginCatch(II);
+      break;
+    case Intrinsic::eh_endcatch:
+      visitEHEndCatch(II);
+      break;
     }
 }
 
@@ -509,8 +519,190 @@ void Lint::visitShl(BinaryOperator &I) {
             "Undefined result: Shift count out of range", &I);
 }
 
+static bool
+allPredsCameFromLandingPad(BasicBlock *BB,
+                           SmallSet<BasicBlock *, 4> &VisitedBlocks) {
+  VisitedBlocks.insert(BB);
+  if (BB->isLandingPad())
+    return true;
+  // If we find a block with no predecessors, the search failed.
+  if (pred_empty(BB))
+    return false;
+  for (BasicBlock *Pred : predecessors(BB)) {
+    if (VisitedBlocks.count(Pred))
+      continue;
+    if (!allPredsCameFromLandingPad(Pred, VisitedBlocks))
+      return false;
+  }
+  return true;
+}
+
+static bool
+allSuccessorsReachEndCatch(BasicBlock *BB, BasicBlock::iterator InstBegin,
+                           IntrinsicInst **SecondBeginCatch,
+                           SmallSet<BasicBlock *, 4> &VisitedBlocks) {
+  VisitedBlocks.insert(BB);
+  for (BasicBlock::iterator I = InstBegin, E = BB->end(); I != E; ++I) {
+    IntrinsicInst *IC = dyn_cast<IntrinsicInst>(I);
+    if (IC && IC->getIntrinsicID() == Intrinsic::eh_endcatch)
+      return true;
+    // If we find another begincatch while looking for an endcatch,
+    // that's also an error.
+    if (IC && IC->getIntrinsicID() == Intrinsic::eh_begincatch) {
+      *SecondBeginCatch = IC;
+      return false;
+    }
+  }
+
+  // If we reach a block with no successors while searching, the
+  // search has failed.
+  if (succ_empty(BB))
+    return false;
+  // Otherwise, search all of the successors.
+  for (BasicBlock *Succ : successors(BB)) {
+    if (VisitedBlocks.count(Succ))
+      continue;
+    if (!allSuccessorsReachEndCatch(Succ, Succ->begin(), SecondBeginCatch,
+                                    VisitedBlocks))
+      return false;
+  }
+  return true;
+}
+
+void Lint::visitEHBeginCatch(IntrinsicInst *II) {
+  // The checks in this function make a potentially dubious assumption about
+  // the CFG, namely that any block involved in a catch is only used for the
+  // catch.  This will very likely be true of IR generated by a front end,
+  // but it may cease to be true, for example, if the IR is run through a
+  // pass which combines similar blocks.
+  //
+  // In general, if we encounter a block the isn't dominated by the catch
+  // block while we are searching the catch block's successors for a call
+  // to end catch intrinsic, then it is possible that it will be legal for
+  // a path through this block to never reach a call to llvm.eh.endcatch.
+  // An analogous statement could be made about our search for a landing
+  // pad among the catch block's predecessors.
+  //
+  // What is actually required is that no path is possible at runtime that
+  // reaches a call to llvm.eh.begincatch without having previously visited
+  // a landingpad instruction and that no path is possible at runtime that
+  // calls llvm.eh.begincatch and does not subsequently call llvm.eh.endcatch
+  // (mentally adjusting for the fact that in reality these calls will be
+  // removed before code generation).
+  //
+  // Because this is a lint check, we take a pessimistic approach and warn if
+  // the control flow is potentially incorrect.
+
+  SmallSet<BasicBlock *, 4> VisitedBlocks;
+  BasicBlock *CatchBB = II->getParent();
+
+  // The begin catch must occur in a landing pad block or all paths
+  // to it must have come from a landing pad.
+  Assert1(allPredsCameFromLandingPad(CatchBB, VisitedBlocks),
+          "llvm.eh.begincatch may be reachable without passing a landingpad", 
+          II);
+
+  // Reset the visited block list.
+  VisitedBlocks.clear();
+
+  IntrinsicInst *SecondBeginCatch = nullptr;
+
+  // This has to be called before it is asserted.  Otherwise, the first assert
+  // below can never be hit.
+  bool EndCatchFound = allSuccessorsReachEndCatch(
+      CatchBB, std::next(static_cast<BasicBlock::iterator>(II)),
+      &SecondBeginCatch, VisitedBlocks);
+  Assert2(
+      SecondBeginCatch == nullptr,
+      "llvm.eh.begincatch may be called a second time before llvm.eh.endcatch",
+      II, SecondBeginCatch);
+  Assert1(EndCatchFound,
+          "Some paths from llvm.eh.begincatch may not reach llvm.eh.endcatch",
+          II);
+}
+
+static bool allPredCameFromBeginCatch(
+    BasicBlock *BB, BasicBlock::reverse_iterator InstRbegin,
+    IntrinsicInst **SecondEndCatch, SmallSet<BasicBlock *, 4> &VisitedBlocks) {
+  VisitedBlocks.insert(BB);
+  // Look for a begincatch in this block.
+  for (BasicBlock::reverse_iterator RI = InstRbegin, RE = BB->rend(); RI != RE;
+       ++RI) {
+    IntrinsicInst *IC = dyn_cast<IntrinsicInst>(&*RI);
+    if (IC && IC->getIntrinsicID() == Intrinsic::eh_begincatch)
+      return true;
+    // If we find another end catch before we find a begin catch, that's
+    // an error.
+    if (IC && IC->getIntrinsicID() == Intrinsic::eh_endcatch) {
+      *SecondEndCatch = IC;
+      return false;
+    }
+    // If we encounter a landingpad instruction, the search failed.
+    if (isa<LandingPadInst>(*RI))
+      return false;
+  }
+  // If while searching we find a block with no predeccesors,
+  // the search failed.
+  if (pred_empty(BB))
+    return false;
+  // Search any predecessors we haven't seen before.
+  for (BasicBlock *Pred : predecessors(BB)) {
+    if (VisitedBlocks.count(Pred))
+      continue;
+    if (!allPredCameFromBeginCatch(Pred, Pred->rbegin(), SecondEndCatch,
+                                   VisitedBlocks))
+      return false;
+  }
+  return true;
+}
+
+void Lint::visitEHEndCatch(IntrinsicInst *II) {
+  // The check in this function makes a potentially dubious assumption about
+  // the CFG, namely that any block involved in a catch is only used for the
+  // catch.  This will very likely be true of IR generated by a front end,
+  // but it may cease to be true, for example, if the IR is run through a
+  // pass which combines similar blocks.
+  //
+  // In general, if we encounter a block the isn't post-dominated by the
+  // end catch block while we are searching the end catch block's predecessors
+  // for a call to the begin catch intrinsic, then it is possible that it will
+  // be legal for a path to reach the end catch block without ever having
+  // called llvm.eh.begincatch.
+  //
+  // What is actually required is that no path is possible at runtime that
+  // reaches a call to llvm.eh.endcatch without having previously visited
+  // a call to llvm.eh.begincatch (mentally adjusting for the fact that in
+  // reality these calls will be removed before code generation).
+  //
+  // Because this is a lint check, we take a pessimistic approach and warn if
+  // the control flow is potentially incorrect.
+
+  BasicBlock *EndCatchBB = II->getParent();
+
+  // Alls paths to the end catch call must pass through a begin catch call.
+
+  // If llvm.eh.begincatch wasn't called in the current block, we'll use this
+  // lambda to recursively look for it in predecessors.
+  SmallSet<BasicBlock *, 4> VisitedBlocks;
+  IntrinsicInst *SecondEndCatch = nullptr;
+
+  // This has to be called before it is asserted.  Otherwise, the first assert
+  // below can never be hit.
+  bool BeginCatchFound =
+      allPredCameFromBeginCatch(EndCatchBB, BasicBlock::reverse_iterator(II),
+                                &SecondEndCatch, VisitedBlocks);
+  Assert2(
+      SecondEndCatch == nullptr,
+      "llvm.eh.endcatch may be called a second time after llvm.eh.begincatch",
+      II, SecondEndCatch);
+  Assert1(
+      BeginCatchFound,
+      "llvm.eh.endcatch may be reachable without passing llvm.eh.begincatch",
+      II);
+}
+
 static bool isZero(Value *V, const DataLayout *DL, DominatorTree *DT,
-                   AssumptionTracker *AT) {
+                   AssumptionCache *AC) {
   // Assume undef could be zero.
   if (isa<UndefValue>(V))
     return true;
@@ -519,8 +711,8 @@ static bool isZero(Value *V, const DataLayout *DL, DominatorTree *DT,
   if (!VecTy) {
     unsigned BitWidth = V->getType()->getIntegerBitWidth();
     APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-    computeKnownBits(V, KnownZero, KnownOne, DL,
-                     0, AT, dyn_cast<Instruction>(V), DT);
+    computeKnownBits(V, KnownZero, KnownOne, DL, 0, AC,
+                     dyn_cast<Instruction>(V), DT);
     return KnownZero.isAllOnesValue();
   }
 
@@ -550,22 +742,22 @@ static bool isZero(Value *V, const DataLayout *DL, DominatorTree *DT,
 }
 
 void Lint::visitSDiv(BinaryOperator &I) {
-  Assert1(!isZero(I.getOperand(1), DL, DT, AT),
+  Assert1(!isZero(I.getOperand(1), DL, DT, AC),
           "Undefined behavior: Division by zero", &I);
 }
 
 void Lint::visitUDiv(BinaryOperator &I) {
-  Assert1(!isZero(I.getOperand(1), DL, DT, AT),
+  Assert1(!isZero(I.getOperand(1), DL, DT, AC),
           "Undefined behavior: Division by zero", &I);
 }
 
 void Lint::visitSRem(BinaryOperator &I) {
-  Assert1(!isZero(I.getOperand(1), DL, DT, AT),
+  Assert1(!isZero(I.getOperand(1), DL, DT, AC),
           "Undefined behavior: Division by zero", &I);
 }
 
 void Lint::visitURem(BinaryOperator &I) {
-  Assert1(!isZero(I.getOperand(1), DL, DT, AT),
+  Assert1(!isZero(I.getOperand(1), DL, DT, AC),
           "Undefined behavior: Division by zero", &I);
 }
 
@@ -686,7 +878,7 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk,
 
   // As a last resort, try SimplifyInstruction or constant folding.
   if (Instruction *Inst = dyn_cast<Instruction>(V)) {
-    if (Value *W = SimplifyInstruction(Inst, DL, TLI, DT, AT))
+    if (Value *W = SimplifyInstruction(Inst, DL, TLI, DT, AC))
       return findValueImpl(W, OffsetOk, Visited);
   } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
     if (Value *W = ConstantFoldConstantExpression(CE, DL, TLI))
@@ -711,7 +903,7 @@ void llvm::lintFunction(const Function &f) {
   Function &F = const_cast<Function&>(f);
   assert(!F.isDeclaration() && "Cannot lint external functions");
 
-  FunctionPassManager FPM(F.getParent());
+  legacy::FunctionPassManager FPM(F.getParent());
   Lint *V = new Lint();
   FPM.add(V);
   FPM.run(F);
@@ -720,7 +912,7 @@ void llvm::lintFunction(const Function &f) {
 /// lintModule - Check a module for errors, printing messages on stderr.
 ///
 void llvm::lintModule(const Module &M) {
-  PassManager PM;
+  legacy::PassManager PM;
   Lint *V = new Lint();
   PM.add(V);
   PM.run(const_cast<Module&>(M));
diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp
index bb0d60e..5042eb9 100644
--- a/lib/Analysis/Loads.cpp
+++ b/lib/Analysis/Loads.cpp
@@ -176,8 +176,13 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
 
   Type *AccessTy = cast<PointerType>(Ptr->getType())->getElementType();
 
-  // If we're using alias analysis to disambiguate get the size of *Ptr.
-  uint64_t AccessSize = AA ? AA->getTypeStoreSize(AccessTy) : 0;
+  // Try to get the DataLayout for this module. This may be null, in which case
+  // the optimizations will be limited.
+  const DataLayout *DL = ScanBB->getDataLayout();
+
+  // Try to get the store size for the type.
+  uint64_t AccessSize = DL ? DL->getTypeStoreSize(AccessTy)
+                           : AA ? AA->getTypeStoreSize(AccessTy) : 0;
 
   Value *StrippedPtr = Ptr->stripPointerCasts();
 
@@ -202,7 +207,7 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
     if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
       if (AreEquivalentAddressValues(
               LI->getPointerOperand()->stripPointerCasts(), StrippedPtr) &&
-          CastInst::isBitCastable(LI->getType(), AccessTy)) {
+          CastInst::isBitOrNoopPointerCastable(LI->getType(), AccessTy, DL)) {
         if (AATags)
           LI->getAAMetadata(*AATags);
         return LI;
@@ -214,7 +219,8 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
       // (This is true even if the store is volatile or atomic, although
       // those cases are unlikely.)
       if (AreEquivalentAddressValues(StorePtr, StrippedPtr) &&
-          CastInst::isBitCastable(SI->getValueOperand()->getType(), AccessTy)) {
+          CastInst::isBitOrNoopPointerCastable(SI->getValueOperand()->getType(),
+                                               AccessTy, DL)) {
         if (AATags)
           SI->getAAMetadata(*AATags);
         return SI->getOperand(0);
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
new file mode 100644
index 0000000..7bedd40
--- /dev/null
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -0,0 +1,1396 @@
+//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The implementation for the loop memory dependence that was originally
+// developed for the loop vectorizer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/VectorUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-accesses"
+
+static cl::opt<unsigned, true>
+VectorizationFactor("force-vector-width", cl::Hidden,
+                    cl::desc("Sets the SIMD width. Zero is autoselect."),
+                    cl::location(VectorizerParams::VectorizationFactor));
+unsigned VectorizerParams::VectorizationFactor;
+
+static cl::opt<unsigned, true>
+VectorizationInterleave("force-vector-interleave", cl::Hidden,
+                        cl::desc("Sets the vectorization interleave count. "
+                                 "Zero is autoselect."),
+                        cl::location(
+                            VectorizerParams::VectorizationInterleave));
+unsigned VectorizerParams::VectorizationInterleave;
+
+static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
+    "runtime-memory-check-threshold", cl::Hidden,
+    cl::desc("When performing memory disambiguation checks at runtime do not "
+             "generate more than this number of comparisons (default = 8)."),
+    cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
+unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
+
+/// Maximum SIMD width.
+const unsigned VectorizerParams::MaxVectorWidth = 64;
+
+bool VectorizerParams::isInterleaveForced() {
+  return ::VectorizationInterleave.getNumOccurrences() > 0;
+}
+
+void LoopAccessReport::emitAnalysis(const LoopAccessReport &Message,
+                                    const Function *TheFunction,
+                                    const Loop *TheLoop,
+                                    const char *PassName) {
+  DebugLoc DL = TheLoop->getStartLoc();
+  if (const Instruction *I = Message.getInstr())
+    DL = I->getDebugLoc();
+  emitOptimizationRemarkAnalysis(TheFunction->getContext(), PassName,
+                                 *TheFunction, DL, Message.str());
+}
+
+Value *llvm::stripIntegerCast(Value *V) {
+  if (CastInst *CI = dyn_cast<CastInst>(V))
+    if (CI->getOperand(0)->getType()->isIntegerTy())
+      return CI->getOperand(0);
+  return V;
+}
+
+const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE,
+                                            const ValueToValueMap &PtrToStride,
+                                            Value *Ptr, Value *OrigPtr) {
+
+  const SCEV *OrigSCEV = SE->getSCEV(Ptr);
+
+  // If there is an entry in the map return the SCEV of the pointer with the
+  // symbolic stride replaced by one.
+  ValueToValueMap::const_iterator SI =
+      PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
+  if (SI != PtrToStride.end()) {
+    Value *StrideVal = SI->second;
+
+    // Strip casts.
+    StrideVal = stripIntegerCast(StrideVal);
+
+    // Replace symbolic stride by one.
+    Value *One = ConstantInt::get(StrideVal->getType(), 1);
+    ValueToValueMap RewriteMap;
+    RewriteMap[StrideVal] = One;
+
+    const SCEV *ByOne =
+        SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true);
+    DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne
+                 << "\n");
+    return ByOne;
+  }
+
+  // Otherwise, just return the SCEV of the original pointer.
+  return SE->getSCEV(Ptr);
+}
+
+void LoopAccessInfo::RuntimePointerCheck::insert(
+    ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
+    unsigned ASId, const ValueToValueMap &Strides) {
+  // Get the stride replaced scev.
+  const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+  assert(AR && "Invalid addrec expression");
+  const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
+  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+  Pointers.push_back(Ptr);
+  Starts.push_back(AR->getStart());
+  Ends.push_back(ScEnd);
+  IsWritePtr.push_back(WritePtr);
+  DependencySetId.push_back(DepSetId);
+  AliasSetId.push_back(ASId);
+}
+
+bool LoopAccessInfo::RuntimePointerCheck::needsChecking(unsigned I,
+                                                        unsigned J) const {
+  // No need to check if two readonly pointers intersect.
+  if (!IsWritePtr[I] && !IsWritePtr[J])
+    return false;
+
+  // Only need to check pointers between two different dependency sets.
+  if (DependencySetId[I] == DependencySetId[J])
+    return false;
+
+  // Only need to check pointers in the same alias set.
+  if (AliasSetId[I] != AliasSetId[J])
+    return false;
+
+  return true;
+}
+
+void LoopAccessInfo::RuntimePointerCheck::print(raw_ostream &OS,
+                                                unsigned Depth) const {
+  unsigned NumPointers = Pointers.size();
+  if (NumPointers == 0)
+    return;
+
+  OS.indent(Depth) << "Run-time memory checks:\n";
+  unsigned N = 0;
+  for (unsigned I = 0; I < NumPointers; ++I)
+    for (unsigned J = I + 1; J < NumPointers; ++J)
+      if (needsChecking(I, J)) {
+        OS.indent(Depth) << N++ << ":\n";
+        OS.indent(Depth + 2) << *Pointers[I] << "\n";
+        OS.indent(Depth + 2) << *Pointers[J] << "\n";
+      }
+}
+
+namespace {
+/// \brief Analyses memory accesses in a loop.
+///
+/// Checks whether run time pointer checks are needed and builds sets for data
+/// dependence checking.
+class AccessAnalysis {
+public:
+  /// \brief Read or write access location.
+  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
+  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+
+  /// \brief Set of potential dependent memory accesses.
+  typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
+
+  AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
+    DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
+
+  /// \brief Register a load  and whether it is only read from.
+  void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
+    Value *Ptr = const_cast<Value*>(Loc.Ptr);
+    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
+    Accesses.insert(MemAccessInfo(Ptr, false));
+    if (IsReadOnly)
+      ReadOnlyPtr.insert(Ptr);
+  }
+
+  /// \brief Register a store.
+  void addStore(AliasAnalysis::Location &Loc) {
+    Value *Ptr = const_cast<Value*>(Loc.Ptr);
+    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
+    Accesses.insert(MemAccessInfo(Ptr, true));
+  }
+
+  /// \brief Check whether we can check the pointers at runtime for
+  /// non-intersection.
+  bool canCheckPtrAtRT(LoopAccessInfo::RuntimePointerCheck &RtCheck,
+                       unsigned &NumComparisons, ScalarEvolution *SE,
+                       Loop *TheLoop, const ValueToValueMap &Strides,
+                       bool ShouldCheckStride = false);
+
+  /// \brief Goes over all memory accesses, checks whether a RT check is needed
+  /// and builds sets of dependent accesses.
+  void buildDependenceSets() {
+    processMemAccesses();
+  }
+
+  bool isRTCheckNeeded() { return IsRTCheckNeeded; }
+
+  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
+  void resetDepChecks() { CheckDeps.clear(); }
+
+  MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
+
+private:
+  typedef SetVector<MemAccessInfo> PtrAccessSet;
+
+  /// \brief Go over all memory access and check whether runtime pointer checks
+  /// are needed /// and build sets of dependency check candidates.
+  void processMemAccesses();
+
+  /// Set of all accesses.
+  PtrAccessSet Accesses;
+
+  /// Set of accesses that need a further dependence check.
+  MemAccessInfoSet CheckDeps;
+
+  /// Set of pointers that are read only.
+  SmallPtrSet<Value*, 16> ReadOnlyPtr;
+
+  const DataLayout *DL;
+
+  /// An alias set tracker to partition the access set by underlying object and
+  //intrinsic property (such as TBAA metadata).
+  AliasSetTracker AST;
+
+  /// Sets of potentially dependent accesses - members of one set share an
+  /// underlying pointer. The set "CheckDeps" identfies which sets really need a
+  /// dependence check.
+  DepCandidates &DepCands;
+
+  bool IsRTCheckNeeded;
+};
+
+} // end anonymous namespace
+
+/// \brief Check whether a pointer can participate in a runtime bounds check.
+static bool hasComputableBounds(ScalarEvolution *SE,
+                                const ValueToValueMap &Strides, Value *Ptr) {
+  const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+  if (!AR)
+    return false;
+
+  return AR->isAffine();
+}
+
+/// \brief Check the stride of the pointer and ensure that it does not wrap in
+/// the address space.
+static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
+                        const Loop *Lp, const ValueToValueMap &StridesMap);
+
+bool AccessAnalysis::canCheckPtrAtRT(
+    LoopAccessInfo::RuntimePointerCheck &RtCheck, unsigned &NumComparisons,
+    ScalarEvolution *SE, Loop *TheLoop, const ValueToValueMap &StridesMap,
+    bool ShouldCheckStride) {
+  // Find pointers with computable bounds. We are going to use this information
+  // to place a runtime bound check.
+  bool CanDoRT = true;
+
+  bool IsDepCheckNeeded = isDependencyCheckNeeded();
+  NumComparisons = 0;
+
+  // We assign a consecutive id to access from different alias sets.
+  // Accesses between different groups doesn't need to be checked.
+  unsigned ASId = 1;
+  for (auto &AS : AST) {
+    unsigned NumReadPtrChecks = 0;
+    unsigned NumWritePtrChecks = 0;
+
+    // We assign consecutive id to access from different dependence sets.
+    // Accesses within the same set don't need a runtime check.
+    unsigned RunningDepId = 1;
+    DenseMap<Value *, unsigned> DepSetId;
+
+    for (auto A : AS) {
+      Value *Ptr = A.getValue();
+      bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
+      MemAccessInfo Access(Ptr, IsWrite);
+
+      if (IsWrite)
+        ++NumWritePtrChecks;
+      else
+        ++NumReadPtrChecks;
+
+      if (hasComputableBounds(SE, StridesMap, Ptr) &&
+          // When we run after a failing dependency check we have to make sure we
+          // don't have wrapping pointers.
+          (!ShouldCheckStride ||
+           isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
+        // The id of the dependence set.
+        unsigned DepId;
+
+        if (IsDepCheckNeeded) {
+          Value *Leader = DepCands.getLeaderValue(Access).getPointer();
+          unsigned &LeaderId = DepSetId[Leader];
+          if (!LeaderId)
+            LeaderId = RunningDepId++;
+          DepId = LeaderId;
+        } else
+          // Each access has its own dependence set.
+          DepId = RunningDepId++;
+
+        RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
+
+        DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
+      } else {
+        CanDoRT = false;
+      }
+    }
+
+    if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
+      NumComparisons += 0; // Only one dependence set.
+    else {
+      NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
+                                              NumWritePtrChecks - 1));
+    }
+
+    ++ASId;
+  }
+
+  // If the pointers that we would use for the bounds comparison have different
+  // address spaces, assume the values aren't directly comparable, so we can't
+  // use them for the runtime check. We also have to assume they could
+  // overlap. In the future there should be metadata for whether address spaces
+  // are disjoint.
+  unsigned NumPointers = RtCheck.Pointers.size();
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    for (unsigned j = i + 1; j < NumPointers; ++j) {
+      // Only need to check pointers between two different dependency sets.
+      if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
+       continue;
+      // Only need to check pointers in the same alias set.
+      if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
+        continue;
+
+      Value *PtrI = RtCheck.Pointers[i];
+      Value *PtrJ = RtCheck.Pointers[j];
+
+      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
+      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
+      if (ASi != ASj) {
+        DEBUG(dbgs() << "LAA: Runtime check would require comparison between"
+                       " different address spaces\n");
+        return false;
+      }
+    }
+  }
+
+  return CanDoRT;
+}
+
+void AccessAnalysis::processMemAccesses() {
+  // We process the set twice: first we process read-write pointers, last we
+  // process read-only pointers. This allows us to skip dependence tests for
+  // read-only pointers.
+
+  DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
+  DEBUG(dbgs() << "  AST: "; AST.dump());
+  DEBUG(dbgs() << "LAA:   Accesses:\n");
+  DEBUG({
+    for (auto A : Accesses)
+      dbgs() << "\t" << *A.getPointer() << " (" <<
+                (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
+                                         "read-only" : "read")) << ")\n";
+  });
+
+  // The AliasSetTracker has nicely partitioned our pointers by metadata
+  // compatibility and potential for underlying-object overlap. As a result, we
+  // only need to check for potential pointer dependencies within each alias
+  // set.
+  for (auto &AS : AST) {
+    // Note that both the alias-set tracker and the alias sets themselves used
+    // linked lists internally and so the iteration order here is deterministic
+    // (matching the original instruction order within each set).
+
+    bool SetHasWrite = false;
+
+    // Map of pointers to last access encountered.
+    typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
+    UnderlyingObjToAccessMap ObjToLastAccess;
+
+    // Set of access to check after all writes have been processed.
+    PtrAccessSet DeferredAccesses;
+
+    // Iterate over each alias set twice, once to process read/write pointers,
+    // and then to process read-only pointers.
+    for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
+      bool UseDeferred = SetIteration > 0;
+      PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
+
+      for (auto AV : AS) {
+        Value *Ptr = AV.getValue();
+
+        // For a single memory access in AliasSetTracker, Accesses may contain
+        // both read and write, and they both need to be handled for CheckDeps.
+        for (auto AC : S) {
+          if (AC.getPointer() != Ptr)
+            continue;
+
+          bool IsWrite = AC.getInt();
+
+          // If we're using the deferred access set, then it contains only
+          // reads.
+          bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
+          if (UseDeferred && !IsReadOnlyPtr)
+            continue;
+          // Otherwise, the pointer must be in the PtrAccessSet, either as a
+          // read or a write.
+          assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
+                  S.count(MemAccessInfo(Ptr, false))) &&
+                 "Alias-set pointer not in the access set?");
+
+          MemAccessInfo Access(Ptr, IsWrite);
+          DepCands.insert(Access);
+
+          // Memorize read-only pointers for later processing and skip them in
+          // the first round (they need to be checked after we have seen all
+          // write pointers). Note: we also mark pointer that are not
+          // consecutive as "read-only" pointers (so that we check
+          // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
+          if (!UseDeferred && IsReadOnlyPtr) {
+            DeferredAccesses.insert(Access);
+            continue;
+          }
+
+          // If this is a write - check other reads and writes for conflicts. If
+          // this is a read only check other writes for conflicts (but only if
+          // there is no other write to the ptr - this is an optimization to
+          // catch "a[i] = a[i] + " without having to do a dependence check).
+          if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
+            CheckDeps.insert(Access);
+            IsRTCheckNeeded = true;
+          }
+
+          if (IsWrite)
+            SetHasWrite = true;
+
+          // Create sets of pointers connected by a shared alias set and
+          // underlying object.
+          typedef SmallVector<Value *, 16> ValueVector;
+          ValueVector TempObjects;
+          GetUnderlyingObjects(Ptr, TempObjects, DL);
+          for (Value *UnderlyingObj : TempObjects) {
+            UnderlyingObjToAccessMap::iterator Prev =
+                ObjToLastAccess.find(UnderlyingObj);
+            if (Prev != ObjToLastAccess.end())
+              DepCands.unionSets(Access, Prev->second);
+
+            ObjToLastAccess[UnderlyingObj] = Access;
+          }
+        }
+      }
+    }
+  }
+}
+
+namespace {
+/// \brief Checks memory dependences among accesses to the same underlying
+/// object to determine whether there vectorization is legal or not (and at
+/// which vectorization factor).
+///
+/// This class works under the assumption that we already checked that memory
+/// locations with different underlying pointers are "must-not alias".
+/// We use the ScalarEvolution framework to symbolically evalutate access
+/// functions pairs. Since we currently don't restructure the loop we can rely
+/// on the program order of memory accesses to determine their safety.
+/// At the moment we will only deem accesses as safe for:
+///  * A negative constant distance assuming program order.
+///
+///      Safe: tmp = a[i + 1];     OR     a[i + 1] = x;
+///            a[i] = tmp;                y = a[i];
+///
+///   The latter case is safe because later checks guarantuee that there can't
+///   be a cycle through a phi node (that is, we check that "x" and "y" is not
+///   the same variable: a header phi can only be an induction or a reduction, a
+///   reduction can't have a memory sink, an induction can't have a memory
+///   source). This is important and must not be violated (or we have to
+///   resort to checking for cycles through memory).
+///
+///  * A positive constant distance assuming program order that is bigger
+///    than the biggest memory access.
+///
+///     tmp = a[i]        OR              b[i] = x
+///     a[i+2] = tmp                      y = b[i+2];
+///
+///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
+///
+///  * Zero distances and all accesses have the same size.
+///
+class MemoryDepChecker {
+public:
+  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
+  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+
+  MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
+      : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
+        ShouldRetryWithRuntimeCheck(false) {}
+
+  /// \brief Register the location (instructions are given increasing numbers)
+  /// of a write access.
+  void addAccess(StoreInst *SI) {
+    Value *Ptr = SI->getPointerOperand();
+    Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
+    InstMap.push_back(SI);
+    ++AccessIdx;
+  }
+
+  /// \brief Register the location (instructions are given increasing numbers)
+  /// of a write access.
+  void addAccess(LoadInst *LI) {
+    Value *Ptr = LI->getPointerOperand();
+    Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
+    InstMap.push_back(LI);
+    ++AccessIdx;
+  }
+
+  /// \brief Check whether the dependencies between the accesses are safe.
+  ///
+  /// Only checks sets with elements in \p CheckDeps.
+  bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+                   MemAccessInfoSet &CheckDeps, const ValueToValueMap &Strides);
+
+  /// \brief The maximum number of bytes of a vector register we can vectorize
+  /// the accesses safely with.
+  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+
+  /// \brief In same cases when the dependency check fails we can still
+  /// vectorize the loop with a dynamic array access check.
+  bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
+
+private:
+  ScalarEvolution *SE;
+  const DataLayout *DL;
+  const Loop *InnermostLoop;
+
+  /// \brief Maps access locations (ptr, read/write) to program order.
+  DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
+
+  /// \brief Memory access instructions in program order.
+  SmallVector<Instruction *, 16> InstMap;
+
+  /// \brief The program order index to be used for the next instruction.
+  unsigned AccessIdx;
+
+  // We can access this many bytes in parallel safely.
+  unsigned MaxSafeDepDistBytes;
+
+  /// \brief If we see a non-constant dependence distance we can still try to
+  /// vectorize this loop with runtime checks.
+  bool ShouldRetryWithRuntimeCheck;
+
+  /// \brief Check whether there is a plausible dependence between the two
+  /// accesses.
+  ///
+  /// Access \p A must happen before \p B in program order. The two indices
+  /// identify the index into the program order map.
+  ///
+  /// This function checks  whether there is a plausible dependence (or the
+  /// absence of such can't be proved) between the two accesses. If there is a
+  /// plausible dependence but the dependence distance is bigger than one
+  /// element access it records this distance in \p MaxSafeDepDistBytes (if this
+  /// distance is smaller than any other distance encountered so far).
+  /// Otherwise, this function returns true signaling a possible dependence.
+  bool isDependent(const MemAccessInfo &A, unsigned AIdx,
+                   const MemAccessInfo &B, unsigned BIdx,
+                   const ValueToValueMap &Strides);
+
+  /// \brief Check whether the data dependence could prevent store-load
+  /// forwarding.
+  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+};
+
+} // end anonymous namespace
+
+static bool isInBoundsGep(Value *Ptr) {
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+    return GEP->isInBounds();
+  return false;
+}
+
+/// \brief Check whether the access through \p Ptr has a constant stride.
+static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
+                        const Loop *Lp, const ValueToValueMap &StridesMap) {
+  const Type *Ty = Ptr->getType();
+  assert(Ty->isPointerTy() && "Unexpected non-ptr");
+
+  // Make sure that the pointer does not point to aggregate types.
+  const PointerType *PtrTy = cast<PointerType>(Ty);
+  if (PtrTy->getElementType()->isAggregateType()) {
+    DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type"
+          << *Ptr << "\n");
+    return 0;
+  }
+
+  const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr);
+
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+  if (!AR) {
+    DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer "
+          << *Ptr << " SCEV: " << *PtrScev << "\n");
+    return 0;
+  }
+
+  // The accesss function must stride over the innermost loop.
+  if (Lp != AR->getLoop()) {
+    DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " <<
+          *Ptr << " SCEV: " << *PtrScev << "\n");
+  }
+
+  // The address calculation must not wrap. Otherwise, a dependence could be
+  // inverted.
+  // An inbounds getelementptr that is a AddRec with a unit stride
+  // cannot wrap per definition. The unit stride requirement is checked later.
+  // An getelementptr without an inbounds attribute and unit stride would have
+  // to access the pointer value "0" which is undefined behavior in address
+  // space 0, therefore we can also vectorize this case.
+  bool IsInBoundsGEP = isInBoundsGep(Ptr);
+  bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
+  bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
+  if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
+    DEBUG(dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
+          << *Ptr << " SCEV: " << *PtrScev << "\n");
+    return 0;
+  }
+
+  // Check the step is constant.
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+
+  // Calculate the pointer stride and check if it is consecutive.
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+  if (!C) {
+    DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr <<
+          " SCEV: " << *PtrScev << "\n");
+    return 0;
+  }
+
+  int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
+  const APInt &APStepVal = C->getValue()->getValue();
+
+  // Huge step value - give up.
+  if (APStepVal.getBitWidth() > 64)
+    return 0;
+
+  int64_t StepVal = APStepVal.getSExtValue();
+
+  // Strided access.
+  int64_t Stride = StepVal / Size;
+  int64_t Rem = StepVal % Size;
+  if (Rem)
+    return 0;
+
+  // If the SCEV could wrap but we have an inbounds gep with a unit stride we
+  // know we can't "wrap around the address space". In case of address space
+  // zero we know that this won't happen without triggering undefined behavior.
+  if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) &&
+      Stride != 1 && Stride != -1)
+    return 0;
+
+  return Stride;
+}
+
+bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
+                                                    unsigned TypeByteSize) {
+  // If loads occur at a distance that is not a multiple of a feasible vector
+  // factor store-load forwarding does not take place.
+  // Positive dependences might cause troubles because vectorizing them might
+  // prevent store-load forwarding making vectorized code run a lot slower.
+  //   a[i] = a[i-3] ^ a[i-8];
+  //   The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
+  //   hence on your typical architecture store-load forwarding does not take
+  //   place. Vectorizing in such cases does not make sense.
+  // Store-load forwarding distance.
+  const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
+  // Maximum vector factor.
+  unsigned MaxVFWithoutSLForwardIssues =
+    VectorizerParams::MaxVectorWidth * TypeByteSize;
+  if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
+    MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
+
+  for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
+       vf *= 2) {
+    if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
+      MaxVFWithoutSLForwardIssues = (vf >>=1);
+      break;
+    }
+  }
+
+  if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) {
+    DEBUG(dbgs() << "LAA: Distance " << Distance <<
+          " that could cause a store-load forwarding conflict\n");
+    return true;
+  }
+
+  if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
+      MaxVFWithoutSLForwardIssues !=
+      VectorizerParams::MaxVectorWidth * TypeByteSize)
+    MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
+  return false;
+}
+
+bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
+                                   const MemAccessInfo &B, unsigned BIdx,
+                                   const ValueToValueMap &Strides) {
+  assert (AIdx < BIdx && "Must pass arguments in program order");
+
+  Value *APtr = A.getPointer();
+  Value *BPtr = B.getPointer();
+  bool AIsWrite = A.getInt();
+  bool BIsWrite = B.getInt();
+
+  // Two reads are independent.
+  if (!AIsWrite && !BIsWrite)
+    return false;
+
+  // We cannot check pointers in different address spaces.
+  if (APtr->getType()->getPointerAddressSpace() !=
+      BPtr->getType()->getPointerAddressSpace())
+    return true;
+
+  const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
+  const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);
+
+  int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides);
+  int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides);
+
+  const SCEV *Src = AScev;
+  const SCEV *Sink = BScev;
+
+  // If the induction step is negative we have to invert source and sink of the
+  // dependence.
+  if (StrideAPtr < 0) {
+    //Src = BScev;
+    //Sink = AScev;
+    std::swap(APtr, BPtr);
+    std::swap(Src, Sink);
+    std::swap(AIsWrite, BIsWrite);
+    std::swap(AIdx, BIdx);
+    std::swap(StrideAPtr, StrideBPtr);
+  }
+
+  const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
+
+  DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
+        << "(Induction step: " << StrideAPtr <<  ")\n");
+  DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
+        << *InstMap[BIdx] << ": " << *Dist << "\n");
+
+  // Need consecutive accesses. We don't want to vectorize
+  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
+  // the address space.
+  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
+    DEBUG(dbgs() << "Non-consecutive pointer access\n");
+    return true;
+  }
+
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+  if (!C) {
+    DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
+    ShouldRetryWithRuntimeCheck = true;
+    return true;
+  }
+
+  Type *ATy = APtr->getType()->getPointerElementType();
+  Type *BTy = BPtr->getType()->getPointerElementType();
+  unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
+
+  // Negative distances are not plausible dependencies.
+  const APInt &Val = C->getValue()->getValue();
+  if (Val.isNegative()) {
+    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
+    if (IsTrueDataDependence &&
+        (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
+         ATy != BTy))
+      return true;
+
+    DEBUG(dbgs() << "LAA: Dependence is negative: NoDep\n");
+    return false;
+  }
+
+  // Write to the same location with the same size.
+  // Could be improved to assert type sizes are the same (i32 == float, etc).
+  if (Val == 0) {
+    if (ATy == BTy)
+      return false;
+    DEBUG(dbgs() << "LAA: Zero dependence difference but different types\n");
+    return true;
+  }
+
+  assert(Val.isStrictlyPositive() && "Expect a positive value");
+
+  if (ATy != BTy) {
+    DEBUG(dbgs() <<
+          "LAA: ReadWrite-Write positive dependency with different types\n");
+    return true;
+  }
+
+  unsigned Distance = (unsigned) Val.getZExtValue();
+
+  // Bail out early if passed-in parameters make vectorization not feasible.
+  unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
+                           VectorizerParams::VectorizationFactor : 1);
+  unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
+                           VectorizerParams::VectorizationInterleave : 1);
+
+  // The distance must be bigger than the size needed for a vectorized version
+  // of the operation and the size of the vectorized operation must not be
+  // bigger than the currrent maximum size.
+  if (Distance < 2*TypeByteSize ||
+      2*TypeByteSize > MaxSafeDepDistBytes ||
+      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
+    DEBUG(dbgs() << "LAA: Failure because of Positive distance "
+        << Val.getSExtValue() << '\n');
+    return true;
+  }
+
+  // Positive distance bigger than max vectorization factor.
+  MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
+    Distance : MaxSafeDepDistBytes;
+
+  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
+  if (IsTrueDataDependence &&
+      couldPreventStoreLoadForward(Distance, TypeByteSize))
+     return true;
+
+  DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue() <<
+        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
+
+  return false;
+}
+
+bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+                                   MemAccessInfoSet &CheckDeps,
+                                   const ValueToValueMap &Strides) {
+
+  MaxSafeDepDistBytes = -1U;
+  while (!CheckDeps.empty()) {
+    MemAccessInfo CurAccess = *CheckDeps.begin();
+
+    // Get the relevant memory access set.
+    EquivalenceClasses<MemAccessInfo>::iterator I =
+      AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
+
+    // Check accesses within this set.
+    EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
+    AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
+
+    // Check every access pair.
+    while (AI != AE) {
+      CheckDeps.erase(*AI);
+      EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
+      while (OI != AE) {
+        // Check every accessing instruction pair in program order.
+        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
+             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
+          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
+               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
+            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides))
+              return false;
+            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides))
+              return false;
+          }
+        ++OI;
+      }
+      AI++;
+    }
+  }
+  return true;
+}
+
+bool LoopAccessInfo::canAnalyzeLoop() {
+    // We can only analyze innermost loops.
+  if (!TheLoop->empty()) {
+    emitAnalysis(LoopAccessReport() << "loop is not the innermost loop");
+    return false;
+  }
+
+  // We must have a single backedge.
+  if (TheLoop->getNumBackEdges() != 1) {
+    emitAnalysis(
+        LoopAccessReport() <<
+        "loop control flow is not understood by analyzer");
+    return false;
+  }
+
+  // We must have a single exiting block.
+  if (!TheLoop->getExitingBlock()) {
+    emitAnalysis(
+        LoopAccessReport() <<
+        "loop control flow is not understood by analyzer");
+    return false;
+  }
+
+  // We only handle bottom-tested loops, i.e. loop in which the condition is
+  // checked at the end of each iteration. With that we can assume that all
+  // instructions in the loop are executed the same number of times.
+  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+    emitAnalysis(
+        LoopAccessReport() <<
+        "loop control flow is not understood by analyzer");
+    return false;
+  }
+
+  // We need to have a loop header.
+  DEBUG(dbgs() << "LAA: Found a loop: " <<
+        TheLoop->getHeader()->getName() << '\n');
+
+  // ScalarEvolution needs to be able to find the exit count.
+  const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
+  if (ExitCount == SE->getCouldNotCompute()) {
+    emitAnalysis(LoopAccessReport() <<
+                 "could not determine number of loop iterations");
+    DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
+    return false;
+  }
+
+  return true;
+}
+
+void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
+
+  typedef SmallVector<Value*, 16> ValueVector;
+  typedef SmallPtrSet<Value*, 16> ValueSet;
+
+  // Holds the Load and Store *instructions*.
+  ValueVector Loads;
+  ValueVector Stores;
+
+  // Holds all the different accesses in the loop.
+  unsigned NumReads = 0;
+  unsigned NumReadWrites = 0;
+
+  PtrRtCheck.Pointers.clear();
+  PtrRtCheck.Need = false;
+
+  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+  MemoryDepChecker DepChecker(SE, DL, TheLoop);
+
+  // For each block.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+
+    // Scan the BB and collect legal loads and stores.
+    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+         ++it) {
+
+      // If this is a load, save it. If this instruction can read from memory
+      // but is not a load, then we quit. Notice that we don't handle function
+      // calls that read or write.
+      if (it->mayReadFromMemory()) {
+        // Many math library functions read the rounding mode. We will only
+        // vectorize a loop if it contains known function calls that don't set
+        // the flag. Therefore, it is safe to ignore this read from memory.
+        CallInst *Call = dyn_cast<CallInst>(it);
+        if (Call && getIntrinsicIDForCall(Call, TLI))
+          continue;
+
+        LoadInst *Ld = dyn_cast<LoadInst>(it);
+        if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+          emitAnalysis(LoopAccessReport(Ld)
+                       << "read with atomic ordering or volatile read");
+          DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
+          CanVecMem = false;
+          return;
+        }
+        NumLoads++;
+        Loads.push_back(Ld);
+        DepChecker.addAccess(Ld);
+        continue;
+      }
+
+      // Save 'store' instructions. Abort if other instructions write to memory.
+      if (it->mayWriteToMemory()) {
+        StoreInst *St = dyn_cast<StoreInst>(it);
+        if (!St) {
+          emitAnalysis(LoopAccessReport(it) <<
+                       "instruction cannot be vectorized");
+          CanVecMem = false;
+          return;
+        }
+        if (!St->isSimple() && !IsAnnotatedParallel) {
+          emitAnalysis(LoopAccessReport(St)
+                       << "write with atomic ordering or volatile write");
+          DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
+          CanVecMem = false;
+          return;
+        }
+        NumStores++;
+        Stores.push_back(St);
+        DepChecker.addAccess(St);
+      }
+    } // Next instr.
+  } // Next block.
+
+  // Now we have two lists that hold the loads and the stores.
+  // Next, we find the pointers that they use.
+
+  // Check if we see any stores. If there are no stores, then we don't
+  // care if the pointers are *restrict*.
+  if (!Stores.size()) {
+    DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
+    CanVecMem = true;
+    return;
+  }
+
+  AccessAnalysis::DepCandidates DependentAccesses;
+  AccessAnalysis Accesses(DL, AA, DependentAccesses);
+
+  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
+  // multiple times on the same object. If the ptr is accessed twice, once
+  // for read and once for write, it will only appear once (on the write
+  // list). This is okay, since we are going to check for conflicts between
+  // writes and between reads and writes, but not between reads and reads.
+  ValueSet Seen;
+
+  ValueVector::iterator I, IE;
+  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
+    StoreInst *ST = cast<StoreInst>(*I);
+    Value* Ptr = ST->getPointerOperand();
+
+    if (isUniform(Ptr)) {
+      emitAnalysis(
+          LoopAccessReport(ST)
+          << "write to a loop invariant address could not be vectorized");
+      DEBUG(dbgs() << "LAA: We don't allow storing to uniform addresses\n");
+      CanVecMem = false;
+      return;
+    }
+
+    // If we did *not* see this pointer before, insert it to  the read-write
+    // list. At this phase it is only a 'write' list.
+    if (Seen.insert(Ptr).second) {
+      ++NumReadWrites;
+
+      AliasAnalysis::Location Loc = AA->getLocation(ST);
+      // The TBAA metadata could have a control dependency on the predication
+      // condition, so we cannot rely on it when determining whether or not we
+      // need runtime pointer checks.
+      if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
+        Loc.AATags.TBAA = nullptr;
+
+      Accesses.addStore(Loc);
+    }
+  }
+
+  if (IsAnnotatedParallel) {
+    DEBUG(dbgs()
+          << "LAA: A loop annotated parallel, ignore memory dependency "
+          << "checks.\n");
+    CanVecMem = true;
+    return;
+  }
+
+  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
+    LoadInst *LD = cast<LoadInst>(*I);
+    Value* Ptr = LD->getPointerOperand();
+    // If we did *not* see this pointer before, insert it to the
+    // read list. If we *did* see it before, then it is already in
+    // the read-write list. This allows us to vectorize expressions
+    // such as A[i] += x;  Because the address of A[i] is a read-write
+    // pointer. This only works if the index of A[i] is consecutive.
+    // If the address of i is unknown (for example A[B[i]]) then we may
+    // read a few words, modify, and write a few words, and some of the
+    // words may be written to the same address.
+    bool IsReadOnlyPtr = false;
+    if (Seen.insert(Ptr).second ||
+        !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
+      ++NumReads;
+      IsReadOnlyPtr = true;
+    }
+
+    AliasAnalysis::Location Loc = AA->getLocation(LD);
+    // The TBAA metadata could have a control dependency on the predication
+    // condition, so we cannot rely on it when determining whether or not we
+    // need runtime pointer checks.
+    if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
+      Loc.AATags.TBAA = nullptr;
+
+    Accesses.addLoad(Loc, IsReadOnlyPtr);
+  }
+
+  // If we write (or read-write) to a single destination and there are no
+  // other reads in this loop then is it safe to vectorize.
+  if (NumReadWrites == 1 && NumReads == 0) {
+    DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
+    CanVecMem = true;
+    return;
+  }
+
+  // Build dependence sets and check whether we need a runtime pointer bounds
+  // check.
+  Accesses.buildDependenceSets();
+  bool NeedRTCheck = Accesses.isRTCheckNeeded();
+
+  // Find pointers with computable bounds. We are going to use this information
+  // to place a runtime bound check.
+  unsigned NumComparisons = 0;
+  bool CanDoRT = false;
+  if (NeedRTCheck)
+    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop,
+                                       Strides);
+
+  DEBUG(dbgs() << "LAA: We need to do " << NumComparisons <<
+        " pointer comparisons.\n");
+
+  // If we only have one set of dependences to check pointers among we don't
+  // need a runtime check.
+  if (NumComparisons == 0 && NeedRTCheck)
+    NeedRTCheck = false;
+
+  // Check that we did not collect too many pointers or found an unsizeable
+  // pointer.
+  if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+    PtrRtCheck.reset();
+    CanDoRT = false;
+  }
+
+  if (CanDoRT) {
+    DEBUG(dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
+  }
+
+  if (NeedRTCheck && !CanDoRT) {
+    emitAnalysis(LoopAccessReport() << "cannot identify array bounds");
+    DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " <<
+          "the array bounds.\n");
+    PtrRtCheck.reset();
+    CanVecMem = false;
+    return;
+  }
+
+  PtrRtCheck.Need = NeedRTCheck;
+
+  CanVecMem = true;
+  if (Accesses.isDependencyCheckNeeded()) {
+    DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
+    CanVecMem = DepChecker.areDepsSafe(
+        DependentAccesses, Accesses.getDependenciesToCheck(), Strides);
+    MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
+
+    if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
+      DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
+      NeedRTCheck = true;
+
+      // Clear the dependency checks. We assume they are not needed.
+      Accesses.resetDepChecks();
+
+      PtrRtCheck.reset();
+      PtrRtCheck.Need = true;
+
+      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
+                                         TheLoop, Strides, true);
+      // Check that we did not collect too many pointers or found an unsizeable
+      // pointer.
+      if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+        if (!CanDoRT && NumComparisons > 0)
+          emitAnalysis(LoopAccessReport()
+                       << "cannot check memory dependencies at runtime");
+        else
+          emitAnalysis(LoopAccessReport()
+                       << NumComparisons << " exceeds limit of "
+                       << RuntimeMemoryCheckThreshold
+                       << " dependent memory operations checked at runtime");
+        DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
+        PtrRtCheck.reset();
+        CanVecMem = false;
+        return;
+      }
+
+      CanVecMem = true;
+    }
+  }
+
+  if (!CanVecMem)
+    emitAnalysis(LoopAccessReport() <<
+                 "unsafe dependent memory operations in loop");
+
+  DEBUG(dbgs() << "LAA: We" << (NeedRTCheck ? "" : " don't") <<
+        " need a runtime memory check.\n");
+}
+
+bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
+                                           DominatorTree *DT)  {
+  assert(TheLoop->contains(BB) && "Unknown block used");
+
+  // Blocks that do not dominate the latch need predication.
+  BasicBlock* Latch = TheLoop->getLoopLatch();
+  return !DT->dominates(BB, Latch);
+}
+
+void LoopAccessInfo::emitAnalysis(LoopAccessReport &Message) {
+  assert(!Report && "Multiple reports generated");
+  Report = Message;
+}
+
+bool LoopAccessInfo::isUniform(Value *V) const {
+  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+}
+
+// FIXME: this function is currently a duplicate of the one in
+// LoopVectorize.cpp.
+static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
+                                 Instruction *Loc) {
+  if (FirstInst)
+    return FirstInst;
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    return I->getParent() == Loc->getParent() ? I : nullptr;
+  return nullptr;
+}
+
+std::pair<Instruction *, Instruction *>
+LoopAccessInfo::addRuntimeCheck(Instruction *Loc) const {
+  Instruction *tnullptr = nullptr;
+  if (!PtrRtCheck.Need)
+    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
+
+  unsigned NumPointers = PtrRtCheck.Pointers.size();
+  SmallVector<TrackingVH<Value> , 2> Starts;
+  SmallVector<TrackingVH<Value> , 2> Ends;
+
+  LLVMContext &Ctx = Loc->getContext();
+  SCEVExpander Exp(*SE, "induction");
+  Instruction *FirstInst = nullptr;
+
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    Value *Ptr = PtrRtCheck.Pointers[i];
+    const SCEV *Sc = SE->getSCEV(Ptr);
+
+    if (SE->isLoopInvariant(Sc, TheLoop)) {
+      DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" <<
+            *Ptr <<"\n");
+      Starts.push_back(Ptr);
+      Ends.push_back(Ptr);
+    } else {
+      DEBUG(dbgs() << "LAA: Adding RT check for range:" << *Ptr << '\n');
+      unsigned AS = Ptr->getType()->getPointerAddressSpace();
+
+      // Use this type for pointer arithmetic.
+      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
+
+      Value *Start = Exp.expandCodeFor(PtrRtCheck.Starts[i], PtrArithTy, Loc);
+      Value *End = Exp.expandCodeFor(PtrRtCheck.Ends[i], PtrArithTy, Loc);
+      Starts.push_back(Start);
+      Ends.push_back(End);
+    }
+  }
+
+  IRBuilder<> ChkBuilder(Loc);
+  // Our instructions might fold to a constant.
+  Value *MemoryRuntimeCheck = nullptr;
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    for (unsigned j = i+1; j < NumPointers; ++j) {
+      if (!PtrRtCheck.needsChecking(i, j))
+        continue;
+
+      unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
+      unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
+
+      assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
+             (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
+             "Trying to bounds check pointers with different address spaces");
+
+      Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+      Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
+      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
+      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy1, "bc");
+      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy0, "bc");
+
+      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
+      FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
+      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
+      FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
+      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
+      FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
+      if (MemoryRuntimeCheck) {
+        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
+                                         "conflict.rdx");
+        FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
+      }
+      MemoryRuntimeCheck = IsConflict;
+    }
+  }
+
+  // We have to do this trickery because the IRBuilder might fold the check to a
+  // constant expression in which case there is no Instruction anchored in a
+  // the block.
+  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
+                                                 ConstantInt::getTrue(Ctx));
+  ChkBuilder.Insert(Check, "memcheck.conflict");
+  FirstInst = getFirstInst(FirstInst, Check, Loc);
+  return std::make_pair(FirstInst, Check);
+}
+
+LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
+                               const DataLayout *DL,
+                               const TargetLibraryInfo *TLI, AliasAnalysis *AA,
+                               DominatorTree *DT,
+                               const ValueToValueMap &Strides)
+    : TheLoop(L), SE(SE), DL(DL), TLI(TLI), AA(AA), DT(DT), NumLoads(0),
+      NumStores(0), MaxSafeDepDistBytes(-1U), CanVecMem(false) {
+  if (canAnalyzeLoop())
+    analyzeLoop(Strides);
+}
+
+void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
+  if (CanVecMem) {
+    if (PtrRtCheck.empty())
+      OS.indent(Depth) << "Memory dependences are safe\n";
+    else
+      OS.indent(Depth) << "Memory dependences are safe with run-time checks\n";
+  }
+
+  if (Report)
+    OS.indent(Depth) << "Report: " << Report->str() << "\n";
+
+  // FIXME: Print unsafe dependences
+
+  // List the pair of accesses need run-time checks to prove independence.
+  PtrRtCheck.print(OS, Depth);
+  OS << "\n";
+}
+
+const LoopAccessInfo &
+LoopAccessAnalysis::getInfo(Loop *L, const ValueToValueMap &Strides) {
+  auto &LAI = LoopAccessInfoMap[L];
+
+#ifndef NDEBUG
+  assert((!LAI || LAI->NumSymbolicStrides == Strides.size()) &&
+         "Symbolic strides changed for loop");
+#endif
+
+  if (!LAI) {
+    LAI = llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, Strides);
+#ifndef NDEBUG
+    LAI->NumSymbolicStrides = Strides.size();
+#endif
+  }
+  return *LAI.get();
+}
+
+void LoopAccessAnalysis::print(raw_ostream &OS, const Module *M) const {
+  LoopAccessAnalysis &LAA = *const_cast<LoopAccessAnalysis *>(this);
+
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  ValueToValueMap NoSymbolicStrides;
+
+  for (Loop *TopLevelLoop : *LI)
+    for (Loop *L : depth_first(TopLevelLoop)) {
+      OS.indent(2) << L->getHeader()->getName() << ":\n";
+      auto &LAI = LAA.getInfo(L, NoSymbolicStrides);
+      LAI.print(OS, 4);
+    }
+}
+
+bool LoopAccessAnalysis::runOnFunction(Function &F) {
+  SE = &getAnalysis<ScalarEvolution>();
+  DL = F.getParent()->getDataLayout();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  TLI = TLIP ? &TLIP->getTLI() : nullptr;
+  AA = &getAnalysis<AliasAnalysis>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  return false;
+}
+
+void LoopAccessAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<AliasAnalysis>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+
+    AU.setPreservesAll();
+}
+
+char LoopAccessAnalysis::ID = 0;
+static const char laa_name[] = "Loop Access Analysis";
+#define LAA_NAME "loop-accesses"
+
+INITIALIZE_PASS_BEGIN(LoopAccessAnalysis, LAA_NAME, laa_name, false, true)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopAccessAnalysis, LAA_NAME, laa_name, false, true)
+
+namespace llvm {
+  Pass *createLAAPass() {
+    return new LoopAccessAnalysis();
+  }
+}
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index b1f62c4..95f6eb0 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include <algorithm>
@@ -45,11 +46,6 @@ static cl::opt<bool,true>
 VerifyLoopInfoX("verify-loop-info", cl::location(VerifyLoopInfo),
                 cl::desc("Verify loop info (time consuming)"));
 
-char LoopInfo::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopInfo, "loops", "Natural Loop Information", true, true)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(LoopInfo, "loops", "Natural Loop Information", true, true)
-
 // Loop identifier metadata name.
 static const char *const LoopMDName = "llvm.loop";
 
@@ -609,15 +605,6 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) {
   return NearLoop;
 }
 
-//===----------------------------------------------------------------------===//
-// LoopInfo implementation
-//
-bool LoopInfo::runOnFunction(Function &) {
-  releaseMemory();
-  LI.Analyze(getAnalysis<DominatorTreeWrapperPass>().getDomTree());
-  return false;
-}
-
 /// updateUnloop - The last backedge has been removed from a loop--now the
 /// "unloop". Find a new parent for the blocks contained within unloop and
 /// update the loop tree. We don't necessarily have valid dominators at this
@@ -631,7 +618,8 @@ void LoopInfo::updateUnloop(Loop *Unloop) {
   if (!Unloop->getParentLoop()) {
     // Since BBLoop had no parent, Unloop blocks are no longer in a loop.
     for (Loop::block_iterator I = Unloop->block_begin(),
-         E = Unloop->block_end(); I != E; ++I) {
+                              E = Unloop->block_end();
+         I != E; ++I) {
 
       // Don't reparent blocks in subloops.
       if (getLoopFor(*I) != Unloop)
@@ -639,21 +627,21 @@ void LoopInfo::updateUnloop(Loop *Unloop) {
 
       // Blocks no longer have a parent but are still referenced by Unloop until
       // the Unloop object is deleted.
-      LI.changeLoopFor(*I, nullptr);
+      changeLoopFor(*I, nullptr);
     }
 
     // Remove the loop from the top-level LoopInfo object.
-    for (LoopInfo::iterator I = LI.begin();; ++I) {
-      assert(I != LI.end() && "Couldn't find loop");
+    for (iterator I = begin();; ++I) {
+      assert(I != end() && "Couldn't find loop");
       if (*I == Unloop) {
-        LI.removeLoop(I);
+        removeLoop(I);
         break;
       }
     }
 
     // Move all of the subloops to the top-level.
     while (!Unloop->empty())
-      LI.addTopLevelLoop(Unloop->removeChildLoop(std::prev(Unloop->end())));
+      addTopLevelLoop(Unloop->removeChildLoop(std::prev(Unloop->end())));
 
     return;
   }
@@ -680,35 +668,59 @@ void LoopInfo::updateUnloop(Loop *Unloop) {
   }
 }
 
-void LoopInfo::verifyAnalysis() const {
-  // LoopInfo is a FunctionPass, but verifying every loop in the function
-  // each time verifyAnalysis is called is very expensive. The
-  // -verify-loop-info option can enable this. In order to perform some
-  // checking by default, LoopPass has been taught to call verifyLoop
-  // manually during loop pass sequences.
+char LoopAnalysis::PassID;
+
+LoopInfo LoopAnalysis::run(Function &F, AnalysisManager<Function> *AM) {
+  // FIXME: Currently we create a LoopInfo from scratch for every function.
+  // This may prove to be too wasteful due to deallocating and re-allocating
+  // memory each time for the underlying map and vector datastructures. At some
+  // point it may prove worthwhile to use a freelist and recycle LoopInfo
+  // objects. I don't want to add that kind of complexity until the scope of
+  // the problem is better understood.
+  LoopInfo LI;
+  LI.Analyze(AM->getResult<DominatorTreeAnalysis>(F));
+  return std::move(LI);
+}
+
+PreservedAnalyses LoopPrinterPass::run(Function &F,
+                                       AnalysisManager<Function> *AM) {
+  AM->getResult<LoopAnalysis>(F).print(OS);
+  return PreservedAnalyses::all();
+}
 
-  if (!VerifyLoopInfo) return;
+//===----------------------------------------------------------------------===//
+// LoopInfo implementation
+//
 
-  DenseSet<const Loop*> Loops;
-  for (iterator I = begin(), E = end(); I != E; ++I) {
-    assert(!(*I)->getParentLoop() && "Top-level loop has a parent!");
-    (*I)->verifyLoopNest(&Loops);
-  }
+char LoopInfoWrapperPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopInfoWrapperPass, "loops", "Natural Loop Information",
+                      true, true)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(LoopInfoWrapperPass, "loops", "Natural Loop Information",
+                    true, true)
 
-  // Verify that blocks are mapped to valid loops.
-  for (DenseMap<BasicBlock*, Loop*>::const_iterator I = LI.BBMap.begin(),
-         E = LI.BBMap.end(); I != E; ++I) {
-    assert(Loops.count(I->second) && "orphaned loop");
-    assert(I->second->contains(I->first) && "orphaned block");
-  }
+bool LoopInfoWrapperPass::runOnFunction(Function &) {
+  releaseMemory();
+  LI.Analyze(getAnalysis<DominatorTreeWrapperPass>().getDomTree());
+  return false;
+}
+
+void LoopInfoWrapperPass::verifyAnalysis() const {
+  // LoopInfoWrapperPass is a FunctionPass, but verifying every loop in the
+  // function each time verifyAnalysis is called is very expensive. The
+  // -verify-loop-info option can enable this. In order to perform some
+  // checking by default, LoopPass has been taught to call verifyLoop manually
+  // during loop pass sequences.
+  if (VerifyLoopInfo)
+    LI.verify();
 }
 
-void LoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+void LoopInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   AU.addRequired<DominatorTreeWrapperPass>();
 }
 
-void LoopInfo::print(raw_ostream &OS, const Module*) const {
+void LoopInfoWrapperPass::print(raw_ostream &OS, const Module *) const {
   LI.print(OS);
 }
 
diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
index 190abc7..a99c949 100644
--- a/lib/Analysis/LoopPass.cpp
+++ b/lib/Analysis/LoopPass.cpp
@@ -187,14 +187,15 @@ static void addLoopIntoQueue(Loop *L, std::deque<Loop *> &LQ) {
 void LPPassManager::getAnalysisUsage(AnalysisUsage &Info) const {
   // LPPassManager needs LoopInfo. In the long term LoopInfo class will
   // become part of LPPassManager.
-  Info.addRequired<LoopInfo>();
+  Info.addRequired<LoopInfoWrapperPass>();
   Info.setPreservesAll();
 }
 
 /// run - Execute all of the passes scheduled for execution.  Keep track of
 /// whether any of the passes modifies the function, and if so, return true.
 bool LPPassManager::runOnFunction(Function &F) {
-  LI = &getAnalysis<LoopInfo>();
+  auto &LIWP = getAnalysis<LoopInfoWrapperPass>();
+  LI = &LIWP.getLoopInfo();
   bool Changed = false;
 
   // Collect inherited analysis from Module level pass manager.
@@ -262,7 +263,7 @@ bool LPPassManager::runOnFunction(Function &F) {
         // loop in the function every time. That level of checking can be
         // enabled with the -verify-loop-info option.
         {
-          TimeRegion PassTimer(getPassTimer(LI));
+          TimeRegion PassTimer(getPassTimer(&LIWP));
           CurrentLoop->verifyLoop();
         }
 
diff --git a/lib/Analysis/MemDepPrinter.cpp b/lib/Analysis/MemDepPrinter.cpp
index 10da3d5..e1b7b4b 100644
--- a/lib/Analysis/MemDepPrinter.cpp
+++ b/lib/Analysis/MemDepPrinter.cpp
@@ -92,13 +92,12 @@ const char *const MemDepPrinter::DepTypeStr[]
 
 bool MemDepPrinter::runOnFunction(Function &F) {
   this->F = &F;
-  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
   MemoryDependenceAnalysis &MDA = getAnalysis<MemoryDependenceAnalysis>();
 
   // All this code uses non-const interfaces because MemDep is not
   // const-friendly, though nothing is actually modified.
-  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
-    Instruction *Inst = &*I;
+  for (auto &I : inst_range(F)) {
+    Instruction *Inst = &I;
 
     if (!Inst->mayReadFromMemory() && !Inst->mayWriteToMemory())
       continue;
@@ -119,30 +118,9 @@ bool MemDepPrinter::runOnFunction(Function &F) {
       }
     } else {
       SmallVector<NonLocalDepResult, 4> NLDI;
-      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
-        if (!LI->isUnordered()) {
-          // FIXME: Handle atomic/volatile loads.
-          Deps[Inst].insert(std::make_pair(getInstTypePair(nullptr, Unknown),
-                                           static_cast<BasicBlock *>(nullptr)));
-          continue;
-        }
-        AliasAnalysis::Location Loc = AA.getLocation(LI);
-        MDA.getNonLocalPointerDependency(Loc, true, LI->getParent(), NLDI);
-      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
-        if (!SI->isUnordered()) {
-          // FIXME: Handle atomic/volatile stores.
-          Deps[Inst].insert(std::make_pair(getInstTypePair(nullptr, Unknown),
-                                           static_cast<BasicBlock *>(nullptr)));
-          continue;
-        }
-        AliasAnalysis::Location Loc = AA.getLocation(SI);
-        MDA.getNonLocalPointerDependency(Loc, false, SI->getParent(), NLDI);
-      } else if (VAArgInst *VI = dyn_cast<VAArgInst>(Inst)) {
-        AliasAnalysis::Location Loc = AA.getLocation(VI);
-        MDA.getNonLocalPointerDependency(Loc, false, VI->getParent(), NLDI);
-      } else {
-        llvm_unreachable("Unknown memory instruction!");
-      }
+      assert( (isa<LoadInst>(Inst) || isa<StoreInst>(Inst) ||
+               isa<VAArgInst>(Inst)) && "Unknown memory instruction!"); 
+      MDA.getNonLocalPointerDependency(Inst, NLDI);
 
       DepSet &InstDeps = Deps[Inst];
       for (SmallVectorImpl<NonLocalDepResult>::const_iterator
@@ -157,8 +135,8 @@ bool MemDepPrinter::runOnFunction(Function &F) {
 }
 
 void MemDepPrinter::print(raw_ostream &OS, const Module *M) const {
-  for (const_inst_iterator I = inst_begin(*F), E = inst_end(*F); I != E; ++I) {
-    const Instruction *Inst = &*I;
+  for (const auto &I : inst_range(*F)) {
+    const Instruction *Inst = &I;
 
     DepSetMap::const_iterator DI = Deps.find(Inst);
     if (DI == Deps.end())
@@ -166,11 +144,10 @@ void MemDepPrinter::print(raw_ostream &OS, const Module *M) const {
 
     const DepSet &InstDeps = DI->second;
 
-    for (DepSet::const_iterator I = InstDeps.begin(), E = InstDeps.end();
-         I != E; ++I) {
-      const Instruction *DepInst = I->first.getPointer();
-      DepType type = I->first.getInt();
-      const BasicBlock *DepBB = I->second;
+    for (const auto &I : InstDeps) {
+      const Instruction *DepInst = I.first.getPointer();
+      DepType type = I.first.getInt();
+      const BasicBlock *DepBB = I.second;
 
       OS << "    ";
       OS << DepTypeStr[type];
diff --git a/lib/Analysis/MemDerefPrinter.cpp b/lib/Analysis/MemDerefPrinter.cpp
new file mode 100644
index 0000000..531d75e
--- /dev/null
+++ b/lib/Analysis/MemDerefPrinter.cpp
@@ -0,0 +1,70 @@
+//===- MemDerefPrinter.cpp - Printer for isDereferenceablePointer ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+  struct MemDerefPrinter : public FunctionPass {
+    SmallVector<Value *, 4> Vec;
+
+    static char ID; // Pass identifcation, replacement for typeid
+    MemDerefPrinter() : FunctionPass(ID) {
+      initializeMemDerefPrinterPass(*PassRegistry::getPassRegistry());
+    }
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<DataLayoutPass>();
+      AU.setPreservesAll();
+    }
+    bool runOnFunction(Function &F) override;
+    void print(raw_ostream &OS, const Module * = nullptr) const override;
+    void releaseMemory() override {
+      Vec.clear();
+    }
+  };
+}
+
+char MemDerefPrinter::ID = 0;
+INITIALIZE_PASS_BEGIN(MemDerefPrinter, "print-memderefs",
+                      "Memory Dereferenciblity of pointers in function", false, true)
+INITIALIZE_PASS_DEPENDENCY(DataLayoutPass)
+INITIALIZE_PASS_END(MemDerefPrinter, "print-memderefs",
+                    "Memory Dereferenciblity of pointers in function", false, true)
+
+FunctionPass *llvm::createMemDerefPrinter() {
+  return new MemDerefPrinter();
+}
+
+bool MemDerefPrinter::runOnFunction(Function &F) {
+  const DataLayout *DL = &getAnalysis<DataLayoutPass>().getDataLayout();
+  for (auto &I: inst_range(F)) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+      Value *PO = LI->getPointerOperand();
+      if (PO->isDereferenceablePointer(DL))
+        Vec.push_back(PO);
+    }
+  }
+  return false;
+}
+
+void MemDerefPrinter::print(raw_ostream &OS, const Module *M) const {
+  OS << "The following are dereferenceable:\n";
+  for (auto &V: Vec) {
+    V->print(OS);
+    OS << "\n\n";
+  }
+}
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index 08b41fe..6108af3 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -25,7 +26,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
@@ -319,7 +319,7 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
   if (!CI || isa<IntrinsicInst>(CI))
     return nullptr;
   Function *Callee = CI->getCalledFunction();
-  if (Callee == nullptr || !Callee->isDeclaration())
+  if (Callee == nullptr)
     return nullptr;
 
   StringRef FnName = Callee->getName();
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index 187eada..6d38863 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -18,7 +18,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/PHITransAddr.h"
@@ -59,7 +59,7 @@ char MemoryDependenceAnalysis::ID = 0;
 // Register this pass...
 INITIALIZE_PASS_BEGIN(MemoryDependenceAnalysis, "memdep",
                 "Memory Dependence Analysis", false, true)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(MemoryDependenceAnalysis, "memdep",
                       "Memory Dependence Analysis", false, true)
@@ -82,19 +82,17 @@ void MemoryDependenceAnalysis::releaseMemory() {
   PredCache->clear();
 }
 
-
-
 /// getAnalysisUsage - Does not modify anything.  It uses Alias Analysis.
 ///
 void MemoryDependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequired<AssumptionTracker>();
+  AU.addRequired<AssumptionCacheTracker>();
   AU.addRequiredTransitive<AliasAnalysis>();
 }
 
-bool MemoryDependenceAnalysis::runOnFunction(Function &) {
+bool MemoryDependenceAnalysis::runOnFunction(Function &F) {
   AA = &getAnalysis<AliasAnalysis>();
-  AT = &getAnalysis<AssumptionTracker>();
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
   DominatorTreeWrapperPass *DTWP =
@@ -300,8 +298,7 @@ getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs,
 
   // Load widening is hostile to ThreadSanitizer: it may cause false positives
   // or make the reports more cryptic (access sizes are wrong).
-  if (LI->getParent()->getParent()->getAttributes().
-      hasAttribute(AttributeSet::FunctionIndex, Attribute::SanitizeThread))
+  if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread))
     return 0;
 
   // Get the base of this load.
@@ -346,9 +343,9 @@ getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs,
         !DL.fitsInLegalInteger(NewLoadByteSize*8))
       return 0;
 
-    if (LIOffs+NewLoadByteSize > MemLocEnd &&
-        LI->getParent()->getParent()->getAttributes().
-          hasAttribute(AttributeSet::FunctionIndex, Attribute::SanitizeAddress))
+    if (LIOffs + NewLoadByteSize > MemLocEnd &&
+        LI->getParent()->getParent()->hasFnAttribute(
+            Attribute::SanitizeAddress))
       // We will be reading past the location accessed by the original program.
       // While this is safe in a regular build, Address Safety analysis tools
       // may start reporting false warnings. So, don't do widening.
@@ -362,6 +359,17 @@ getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs,
   }
 }
 
+static bool isVolatile(Instruction *Inst) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+    return LI->isVolatile();
+  else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+    return SI->isVolatile();
+  else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst))
+    return AI->isVolatile();
+  return false;
+}
+
+
 /// getPointerDependencyFrom - Return the instruction on which a memory
 /// location depends.  If isLoad is true, this routine ignores may-aliases with
 /// read-only operations.  If isLoad is false, this routine ignores may-aliases
@@ -448,12 +456,26 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
     // does not alias with when this atomic load indicates that another thread may
     // be accessing the location.
     if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+
+      // While volatile access cannot be eliminated, they do not have to clobber
+      // non-aliasing locations, as normal accesses, for example, can be safely
+      // reordered with volatile accesses.
+      if (LI->isVolatile()) {
+        if (!QueryInst)
+          // Original QueryInst *may* be volatile
+          return MemDepResult::getClobber(LI);
+        if (isVolatile(QueryInst))
+          // Ordering required if QueryInst is itself volatile
+          return MemDepResult::getClobber(LI);
+        // Otherwise, volatile doesn't imply any special ordering
+      }
+      
       // Atomic loads have complications involved.
       // A Monotonic (or higher) load is OK if the query inst is itself not atomic.
       // An Acquire (or higher) load sets the HasSeenAcquire flag, so that any
       //   release store will know to return getClobber.
       // FIXME: This is overly conservative.
-      if (!LI->isUnordered()) {
+      if (LI->isAtomic() && LI->getOrdering() > Unordered) {
         if (!QueryInst)
           return MemDepResult::getClobber(LI);
         if (auto *QueryLI = dyn_cast<LoadInst>(QueryInst)) {
@@ -470,13 +492,6 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
           HasSeenAcquire = true;
       }
 
-      // FIXME: this is overly conservative.
-      // While volatile access cannot be eliminated, they do not have to clobber
-      // non-aliasing locations, as normal accesses can for example be reordered
-      // with volatile accesses.
-      if (LI->isVolatile())
-        return MemDepResult::getClobber(LI);
-
       AliasAnalysis::Location LoadLoc = AA->getLocation(LI);
 
       // If we found a pointer, check if it could be the same as our pointer.
@@ -859,21 +874,65 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
 /// own block.
 ///
 void MemoryDependenceAnalysis::
-getNonLocalPointerDependency(const AliasAnalysis::Location &Loc, bool isLoad,
-                             BasicBlock *FromBB,
+getNonLocalPointerDependency(Instruction *QueryInst,
                              SmallVectorImpl<NonLocalDepResult> &Result) {
+
+  auto getLocation = [](AliasAnalysis *AA, Instruction *Inst) {
+    if (auto *I = dyn_cast<LoadInst>(Inst))
+      return AA->getLocation(I);
+    else if (auto *I = dyn_cast<StoreInst>(Inst))
+      return AA->getLocation(I);
+    else if (auto *I = dyn_cast<VAArgInst>(Inst))
+      return AA->getLocation(I);
+    else if (auto *I = dyn_cast<AtomicCmpXchgInst>(Inst))
+      return AA->getLocation(I);
+    else if (auto *I = dyn_cast<AtomicRMWInst>(Inst))
+      return AA->getLocation(I);
+    else
+      llvm_unreachable("unsupported memory instruction");
+  };
+   
+  const AliasAnalysis::Location Loc = getLocation(AA, QueryInst);
+  bool isLoad = isa<LoadInst>(QueryInst);
+  BasicBlock *FromBB = QueryInst->getParent();
+  assert(FromBB);
+
   assert(Loc.Ptr->getType()->isPointerTy() &&
          "Can't get pointer deps of a non-pointer!");
   Result.clear();
+  
+  // This routine does not expect to deal with volatile instructions.
+  // Doing so would require piping through the QueryInst all the way through.
+  // TODO: volatiles can't be elided, but they can be reordered with other
+  // non-volatile accesses.
+
+  // We currently give up on any instruction which is ordered, but we do handle
+  // atomic instructions which are unordered.
+  // TODO: Handle ordered instructions
+  auto isOrdered = [](Instruction *Inst) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+      return !LI->isUnordered();
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      return !SI->isUnordered();
+    }
+    return false;
+  };
+  if (isVolatile(QueryInst) || isOrdered(QueryInst)) {
+    Result.push_back(NonLocalDepResult(FromBB,
+                                       MemDepResult::getUnknown(),
+                                       const_cast<Value *>(Loc.Ptr)));
+    return;
+  }
+
 
-  PHITransAddr Address(const_cast<Value *>(Loc.Ptr), DL, AT);
+  PHITransAddr Address(const_cast<Value *>(Loc.Ptr), DL, AC);
 
   // This is the set of blocks we've inspected, and the pointer we consider in
   // each block.  Because of critical edges, we currently bail out if querying
   // a block with multiple different pointers.  This can happen during PHI
   // translation.
   DenseMap<BasicBlock*, Value*> Visited;
-  if (!getNonLocalPointerDepFromBB(Address, Loc, isLoad, FromBB,
+  if (!getNonLocalPointerDepFromBB(QueryInst, Address, Loc, isLoad, FromBB,
                                    Result, Visited, true))
     return;
   Result.clear();
@@ -887,7 +946,8 @@ getNonLocalPointerDependency(const AliasAnalysis::Location &Loc, bool isLoad,
 /// lookup (which may use dirty cache info if available).  If we do a lookup,
 /// add the result to the cache.
 MemDepResult MemoryDependenceAnalysis::
-GetNonLocalInfoForBlock(const AliasAnalysis::Location &Loc,
+GetNonLocalInfoForBlock(Instruction *QueryInst,
+                        const AliasAnalysis::Location &Loc,
                         bool isLoad, BasicBlock *BB,
                         NonLocalDepInfo *Cache, unsigned NumSortedEntries) {
 
@@ -928,7 +988,8 @@ GetNonLocalInfoForBlock(const AliasAnalysis::Location &Loc,
   }
 
   // Scan the block for the dependency.
-  MemDepResult Dep = getPointerDependencyFrom(Loc, isLoad, ScanPos, BB);
+  MemDepResult Dep = getPointerDependencyFrom(Loc, isLoad, ScanPos, BB,
+                                              QueryInst);
 
   // If we had a dirty entry for the block, update it.  Otherwise, just add
   // a new entry.
@@ -1001,7 +1062,8 @@ SortNonLocalDepInfoCache(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
 /// not compute dependence information for some reason.  This should be treated
 /// as a clobber dependence on the first instruction in the predecessor block.
 bool MemoryDependenceAnalysis::
-getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
+getNonLocalPointerDepFromBB(Instruction *QueryInst,
+                            const PHITransAddr &Pointer,
                             const AliasAnalysis::Location &Loc,
                             bool isLoad, BasicBlock *StartBB,
                             SmallVectorImpl<NonLocalDepResult> &Result,
@@ -1040,7 +1102,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
     } else if (CacheInfo->Size > Loc.Size) {
       // This query's Size is less than the cached one. Conservatively restart
       // the query using the greater size.
-      return getNonLocalPointerDepFromBB(Pointer,
+      return getNonLocalPointerDepFromBB(QueryInst, Pointer,
                                          Loc.getWithNewSize(CacheInfo->Size),
                                          isLoad, StartBB, Result, Visited,
                                          SkipFirstBlock);
@@ -1060,7 +1122,8 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
         CacheInfo->NonLocalDeps.clear();
       }
       if (Loc.AATags)
-        return getNonLocalPointerDepFromBB(Pointer, Loc.getWithoutAATags(),
+        return getNonLocalPointerDepFromBB(QueryInst,
+                                           Pointer, Loc.getWithoutAATags(),
                                            isLoad, StartBB, Result, Visited,
                                            SkipFirstBlock);
     }
@@ -1145,7 +1208,6 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
       // cache value will only see properly sorted cache arrays.
       if (Cache && NumSortedEntries != Cache->size()) {
         SortNonLocalDepInfoCache(*Cache, NumSortedEntries);
-        NumSortedEntries = Cache->size();
       }
       // Since we bail out, the "Cache" set won't contain all of the
       // results for the query.  This is ok (we can still use it to accelerate
@@ -1164,7 +1226,8 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
       // Get the dependency info for Pointer in BB.  If we have cached
       // information, we will use it, otherwise we compute it.
       DEBUG(AssertSorted(*Cache, NumSortedEntries));
-      MemDepResult Dep = GetNonLocalInfoForBlock(Loc, isLoad, BB, Cache,
+      MemDepResult Dep = GetNonLocalInfoForBlock(QueryInst,
+                                                 Loc, isLoad, BB, Cache,
                                                  NumSortedEntries);
 
       // If we got a Def or Clobber, add this to the list of results.
@@ -1298,7 +1361,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
       // result conflicted with the Visited list; we have to conservatively
       // assume it is unknown, but this also does not block PRE of the load.
       if (!CanTranslate ||
-          getNonLocalPointerDepFromBB(PredPointer,
+          getNonLocalPointerDepFromBB(QueryInst, PredPointer,
                                       Loc.getWithNewPtr(PredPtrVal),
                                       isLoad, Pred,
                                       Result, Visited)) {
@@ -1361,7 +1424,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
       if (I->getBB() != BB)
         continue;
 
-      assert(I->getResult().isNonLocal() &&
+      assert((I->getResult().isNonLocal() || !DT->isReachableFromEntry(BB)) &&
              "Should only be here with transparent block");
       I->setResult(MemDepResult::getUnknown());
       Result.push_back(NonLocalDepResult(I->getBB(), I->getResult(),
diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp
index b3d060a..a534418 100644
--- a/lib/Analysis/PHITransAddr.cpp
+++ b/lib/Analysis/PHITransAddr.cpp
@@ -228,7 +228,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
       return GEP;
 
     // Simplify the GEP to handle 'gep x, 0' -> x etc.
-    if (Value *V = SimplifyGEPInst(GEPOps, DL, TLI, DT, AT)) {
+    if (Value *V = SimplifyGEPInst(GEPOps, DL, TLI, DT, AC)) {
       for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
         RemoveInstInputs(GEPOps[i], InstInputs);
 
@@ -283,7 +283,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
         }
 
     // See if the add simplifies away.
-    if (Value *Res = SimplifyAddInst(LHS, RHS, isNSW, isNUW, DL, TLI, DT, AT)) {
+    if (Value *Res = SimplifyAddInst(LHS, RHS, isNSW, isNUW, DL, TLI, DT, AC)) {
       // If we simplified the operands, the LHS is no longer an input, but Res
       // is.
       RemoveInstInputs(LHS, InstInputs);
@@ -369,7 +369,7 @@ InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB,
                            SmallVectorImpl<Instruction*> &NewInsts) {
   // See if we have a version of this value already available and dominating
   // PredBB.  If so, there is no need to insert a new instance of it.
-  PHITransAddr Tmp(InVal, DL, AT);
+  PHITransAddr Tmp(InVal, DL, AC);
   if (!Tmp.PHITranslateValue(CurBB, PredBB, &DT))
     return Tmp.getAddr();
 
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index 08ebf0d..8cd8534 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -10,10 +10,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/RegionInfo.h"
-#include "llvm/Analysis/RegionInfoImpl.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionInfoImpl.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index de34b72..6fa7b2e 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -15,9 +15,8 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/Analysis/RegionPass.h"
 #include "llvm/Analysis/RegionIterator.h"
-#include "llvm/Support/Timer.h"
-
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Timer.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "regionpassmgr"
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 68549ef..9e4eb11 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -63,11 +63,12 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
@@ -87,7 +88,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -116,10 +116,10 @@ VerifySCEV("verify-scev",
 
 INITIALIZE_PASS_BEGIN(ScalarEvolution, "scalar-evolution",
                 "Scalar Evolution Analysis", false, true)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(ScalarEvolution, "scalar-evolution",
                 "Scalar Evolution Analysis", false, true)
 char ScalarEvolution::ID = 0;
@@ -675,62 +675,6 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
   }
 }
 
-static const APInt srem(const SCEVConstant *C1, const SCEVConstant *C2) {
-  APInt A = C1->getValue()->getValue();
-  APInt B = C2->getValue()->getValue();
-  uint32_t ABW = A.getBitWidth();
-  uint32_t BBW = B.getBitWidth();
-
-  if (ABW > BBW)
-    B = B.sext(ABW);
-  else if (ABW < BBW)
-    A = A.sext(BBW);
-
-  return APIntOps::srem(A, B);
-}
-
-static const APInt sdiv(const SCEVConstant *C1, const SCEVConstant *C2) {
-  APInt A = C1->getValue()->getValue();
-  APInt B = C2->getValue()->getValue();
-  uint32_t ABW = A.getBitWidth();
-  uint32_t BBW = B.getBitWidth();
-
-  if (ABW > BBW)
-    B = B.sext(ABW);
-  else if (ABW < BBW)
-    A = A.sext(BBW);
-
-  return APIntOps::sdiv(A, B);
-}
-
-static const APInt urem(const SCEVConstant *C1, const SCEVConstant *C2) {
-  APInt A = C1->getValue()->getValue();
-  APInt B = C2->getValue()->getValue();
-  uint32_t ABW = A.getBitWidth();
-  uint32_t BBW = B.getBitWidth();
-
-  if (ABW > BBW)
-    B = B.zext(ABW);
-  else if (ABW < BBW)
-    A = A.zext(BBW);
-
-  return APIntOps::urem(A, B);
-}
-
-static const APInt udiv(const SCEVConstant *C1, const SCEVConstant *C2) {
-  APInt A = C1->getValue()->getValue();
-  APInt B = C2->getValue()->getValue();
-  uint32_t ABW = A.getBitWidth();
-  uint32_t BBW = B.getBitWidth();
-
-  if (ABW > BBW)
-    B = B.zext(ABW);
-  else if (ABW < BBW)
-    A = A.zext(BBW);
-
-  return APIntOps::udiv(A, B);
-}
-
 namespace {
 struct FindSCEVSize {
   int Size;
@@ -757,8 +701,7 @@ static inline int sizeOfSCEV(const SCEV *S) {
 
 namespace {
 
-template <typename Derived>
-struct SCEVDivision : public SCEVVisitor<Derived, void> {
+struct SCEVDivision : public SCEVVisitor<SCEVDivision, void> {
 public:
   // Computes the Quotient and Remainder of the division of Numerator by
   // Denominator.
@@ -767,7 +710,7 @@ public:
                      const SCEV **Remainder) {
     assert(Numerator && Denominator && "Uninitialized SCEV");
 
-    Derived D(SE, Numerator, Denominator);
+    SCEVDivision D(SE, Numerator, Denominator);
 
     // Check for the trivial case here to avoid having to check for it in the
     // rest of the code.
@@ -819,6 +762,27 @@ public:
   void visitUnknown(const SCEVUnknown *Numerator) {}
   void visitCouldNotCompute(const SCEVCouldNotCompute *Numerator) {}
 
+  void visitConstant(const SCEVConstant *Numerator) {
+    if (const SCEVConstant *D = dyn_cast<SCEVConstant>(Denominator)) {
+      APInt NumeratorVal = Numerator->getValue()->getValue();
+      APInt DenominatorVal = D->getValue()->getValue();
+      uint32_t NumeratorBW = NumeratorVal.getBitWidth();
+      uint32_t DenominatorBW = DenominatorVal.getBitWidth();
+
+      if (NumeratorBW > DenominatorBW)
+        DenominatorVal = DenominatorVal.sext(NumeratorBW);
+      else if (NumeratorBW < DenominatorBW)
+        NumeratorVal = NumeratorVal.sext(DenominatorBW);
+
+      APInt QuotientVal(NumeratorVal.getBitWidth(), 0);
+      APInt RemainderVal(NumeratorVal.getBitWidth(), 0);
+      APInt::sdivrem(NumeratorVal, DenominatorVal, QuotientVal, RemainderVal);
+      Quotient = SE.getConstant(QuotientVal);
+      Remainder = SE.getConstant(RemainderVal);
+      return;
+    }
+  }
+
   void visitAddRecExpr(const SCEVAddRecExpr *Numerator) {
     const SCEV *StartQ, *StartR, *StepQ, *StepR;
     assert(Numerator->isAffine() && "Numerator should be affine");
@@ -956,37 +920,6 @@ private:
 
   ScalarEvolution &SE;
   const SCEV *Denominator, *Quotient, *Remainder, *Zero, *One;
-
-  friend struct SCEVSDivision;
-  friend struct SCEVUDivision;
-};
-
-struct SCEVSDivision : public SCEVDivision<SCEVSDivision> {
-  SCEVSDivision(ScalarEvolution &S, const SCEV *Numerator,
-                const SCEV *Denominator)
-      : SCEVDivision(S, Numerator, Denominator) {}
-
-  void visitConstant(const SCEVConstant *Numerator) {
-    if (const SCEVConstant *D = dyn_cast<SCEVConstant>(Denominator)) {
-      Quotient = SE.getConstant(sdiv(Numerator, D));
-      Remainder = SE.getConstant(srem(Numerator, D));
-      return;
-    }
-  }
-};
-
-struct SCEVUDivision : public SCEVDivision<SCEVUDivision> {
-  SCEVUDivision(ScalarEvolution &S, const SCEV *Numerator,
-                const SCEV *Denominator)
-      : SCEVDivision(S, Numerator, Denominator) {}
-
-  void visitConstant(const SCEVConstant *Numerator) {
-    if (const SCEVConstant *D = dyn_cast<SCEVConstant>(Denominator)) {
-      Quotient = SE.getConstant(udiv(Numerator, D));
-      Remainder = SE.getConstant(urem(Numerator, D));
-      return;
-    }
-  }
 };
 
 }
@@ -1215,6 +1148,183 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op,
   return S;
 }
 
+// Get the limit of a recurrence such that incrementing by Step cannot cause
+// signed overflow as long as the value of the recurrence within the
+// loop does not exceed this limit before incrementing.
+static const SCEV *getSignedOverflowLimitForStep(const SCEV *Step,
+                                                 ICmpInst::Predicate *Pred,
+                                                 ScalarEvolution *SE) {
+  unsigned BitWidth = SE->getTypeSizeInBits(Step->getType());
+  if (SE->isKnownPositive(Step)) {
+    *Pred = ICmpInst::ICMP_SLT;
+    return SE->getConstant(APInt::getSignedMinValue(BitWidth) -
+                           SE->getSignedRange(Step).getSignedMax());
+  }
+  if (SE->isKnownNegative(Step)) {
+    *Pred = ICmpInst::ICMP_SGT;
+    return SE->getConstant(APInt::getSignedMaxValue(BitWidth) -
+                           SE->getSignedRange(Step).getSignedMin());
+  }
+  return nullptr;
+}
+
+// Get the limit of a recurrence such that incrementing by Step cannot cause
+// unsigned overflow as long as the value of the recurrence within the loop does
+// not exceed this limit before incrementing.
+static const SCEV *getUnsignedOverflowLimitForStep(const SCEV *Step,
+                                                   ICmpInst::Predicate *Pred,
+                                                   ScalarEvolution *SE) {
+  unsigned BitWidth = SE->getTypeSizeInBits(Step->getType());
+  *Pred = ICmpInst::ICMP_ULT;
+
+  return SE->getConstant(APInt::getMinValue(BitWidth) -
+                         SE->getUnsignedRange(Step).getUnsignedMax());
+}
+
+namespace {
+
+struct ExtendOpTraitsBase {
+  typedef const SCEV *(ScalarEvolution::*GetExtendExprTy)(const SCEV *, Type *);
+};
+
+// Used to make code generic over signed and unsigned overflow.
+template <typename ExtendOp> struct ExtendOpTraits {
+  // Members present:
+  //
+  // static const SCEV::NoWrapFlags WrapType;
+  //
+  // static const ExtendOpTraitsBase::GetExtendExprTy GetExtendExpr;
+  //
+  // static const SCEV *getOverflowLimitForStep(const SCEV *Step,
+  //                                           ICmpInst::Predicate *Pred,
+  //                                           ScalarEvolution *SE);
+};
+
+template <>
+struct ExtendOpTraits<SCEVSignExtendExpr> : public ExtendOpTraitsBase {
+  static const SCEV::NoWrapFlags WrapType = SCEV::FlagNSW;
+
+  static const GetExtendExprTy GetExtendExpr;
+
+  static const SCEV *getOverflowLimitForStep(const SCEV *Step,
+                                             ICmpInst::Predicate *Pred,
+                                             ScalarEvolution *SE) {
+    return getSignedOverflowLimitForStep(Step, Pred, SE);
+  }
+};
+
+const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits<
+    SCEVSignExtendExpr>::GetExtendExpr = &ScalarEvolution::getSignExtendExpr;
+
+template <>
+struct ExtendOpTraits<SCEVZeroExtendExpr> : public ExtendOpTraitsBase {
+  static const SCEV::NoWrapFlags WrapType = SCEV::FlagNUW;
+
+  static const GetExtendExprTy GetExtendExpr;
+
+  static const SCEV *getOverflowLimitForStep(const SCEV *Step,
+                                             ICmpInst::Predicate *Pred,
+                                             ScalarEvolution *SE) {
+    return getUnsignedOverflowLimitForStep(Step, Pred, SE);
+  }
+};
+
+const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits<
+    SCEVZeroExtendExpr>::GetExtendExpr = &ScalarEvolution::getZeroExtendExpr;
+}
+
+// The recurrence AR has been shown to have no signed/unsigned wrap or something
+// close to it. Typically, if we can prove NSW/NUW for AR, then we can just as
+// easily prove NSW/NUW for its preincrement or postincrement sibling. This
+// allows normalizing a sign/zero extended AddRec as such: {sext/zext(Step +
+// Start),+,Step} => {(Step + sext/zext(Start),+,Step} As a result, the
+// expression "Step + sext/zext(PreIncAR)" is congruent with
+// "sext/zext(PostIncAR)"
+template <typename ExtendOpTy>
+static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty,
+                                        ScalarEvolution *SE) {
+  auto WrapType = ExtendOpTraits<ExtendOpTy>::WrapType;
+  auto GetExtendExpr = ExtendOpTraits<ExtendOpTy>::GetExtendExpr;
+
+  const Loop *L = AR->getLoop();
+  const SCEV *Start = AR->getStart();
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+
+  // Check for a simple looking step prior to loop entry.
+  const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Start);
+  if (!SA)
+    return nullptr;
+
+  // Create an AddExpr for "PreStart" after subtracting Step. Full SCEV
+  // subtraction is expensive. For this purpose, perform a quick and dirty
+  // difference, by checking for Step in the operand list.
+  SmallVector<const SCEV *, 4> DiffOps;
+  for (const SCEV *Op : SA->operands())
+    if (Op != Step)
+      DiffOps.push_back(Op);
+
+  if (DiffOps.size() == SA->getNumOperands())
+    return nullptr;
+
+  // Try to prove `WrapType` (SCEV::FlagNSW or SCEV::FlagNUW) on `PreStart` +
+  // `Step`:
+
+  // 1. NSW/NUW flags on the step increment.
+  const SCEV *PreStart = SE->getAddExpr(DiffOps, SA->getNoWrapFlags());
+  const SCEVAddRecExpr *PreAR = dyn_cast<SCEVAddRecExpr>(
+      SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap));
+
+  // "{S,+,X} is <nsw>/<nuw>" and "the backedge is taken at least once" implies
+  // "S+X does not sign/unsign-overflow".
+  //
+
+  const SCEV *BECount = SE->getBackedgeTakenCount(L);
+  if (PreAR && PreAR->getNoWrapFlags(WrapType) &&
+      !isa<SCEVCouldNotCompute>(BECount) && SE->isKnownPositive(BECount))
+    return PreStart;
+
+  // 2. Direct overflow check on the step operation's expression.
+  unsigned BitWidth = SE->getTypeSizeInBits(AR->getType());
+  Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2);
+  const SCEV *OperandExtendedStart =
+      SE->getAddExpr((SE->*GetExtendExpr)(PreStart, WideTy),
+                     (SE->*GetExtendExpr)(Step, WideTy));
+  if ((SE->*GetExtendExpr)(Start, WideTy) == OperandExtendedStart) {
+    if (PreAR && AR->getNoWrapFlags(WrapType)) {
+      // If we know `AR` == {`PreStart`+`Step`,+,`Step`} is `WrapType` (FlagNSW
+      // or FlagNUW) and that `PreStart` + `Step` is `WrapType` too, then
+      // `PreAR` == {`PreStart`,+,`Step`} is also `WrapType`.  Cache this fact.
+      const_cast<SCEVAddRecExpr *>(PreAR)->setNoWrapFlags(WrapType);
+    }
+    return PreStart;
+  }
+
+  // 3. Loop precondition.
+  ICmpInst::Predicate Pred;
+  const SCEV *OverflowLimit =
+      ExtendOpTraits<ExtendOpTy>::getOverflowLimitForStep(Step, &Pred, SE);
+
+  if (OverflowLimit &&
+      SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) {
+    return PreStart;
+  }
+  return nullptr;
+}
+
+// Get the normalized zero or sign extended expression for this AddRec's Start.
+template <typename ExtendOpTy>
+static const SCEV *getExtendAddRecStart(const SCEVAddRecExpr *AR, Type *Ty,
+                                        ScalarEvolution *SE) {
+  auto GetExtendExpr = ExtendOpTraits<ExtendOpTy>::GetExtendExpr;
+
+  const SCEV *PreStart = getPreStartForExtend<ExtendOpTy>(AR, Ty, SE);
+  if (!PreStart)
+    return (SE->*GetExtendExpr)(AR->getStart(), Ty);
+
+  return SE->getAddExpr((SE->*GetExtendExpr)(AR->getStepRecurrence(*SE), Ty),
+                        (SE->*GetExtendExpr)(PreStart, Ty));
+}
+
 const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
                                                Type *Ty) {
   assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
@@ -1268,9 +1378,9 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
       // If we have special knowledge that this addrec won't overflow,
       // we don't need to do any further analysis.
       if (AR->getNoWrapFlags(SCEV::FlagNUW))
-        return getAddRecExpr(getZeroExtendExpr(Start, Ty),
-                             getZeroExtendExpr(Step, Ty),
-                             L, AR->getNoWrapFlags());
+        return getAddRecExpr(
+            getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
+            getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
 
       // Check whether the backedge-taken count is SCEVCouldNotCompute.
       // Note that this serves two purposes: It filters out loops that are
@@ -1307,9 +1417,9 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
             // Cache knowledge of AR NUW, which is propagated to this AddRec.
             const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
             // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getZeroExtendExpr(Start, Ty),
-                                 getZeroExtendExpr(Step, Ty),
-                                 L, AR->getNoWrapFlags());
+            return getAddRecExpr(
+                getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
+                getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
           }
           // Similar to above, only this time treat the step value as signed.
           // This covers loops that count down.
@@ -1322,9 +1432,9 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
             // Negative step causes unsigned wrap, but it still can't self-wrap.
             const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
             // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getZeroExtendExpr(Start, Ty),
-                                 getSignExtendExpr(Step, Ty),
-                                 L, AR->getNoWrapFlags());
+            return getAddRecExpr(
+                getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
+                getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
           }
         }
 
@@ -1342,9 +1452,9 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
             // Cache knowledge of AR NUW, which is propagated to this AddRec.
             const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
             // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getZeroExtendExpr(Start, Ty),
-                                 getZeroExtendExpr(Step, Ty),
-                                 L, AR->getNoWrapFlags());
+            return getAddRecExpr(
+                getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
+                getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
           }
         } else if (isKnownNegative(Step)) {
           const SCEV *N = getConstant(APInt::getMaxValue(BitWidth) -
@@ -1357,9 +1467,9 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
             // Negative step causes unsigned wrap, but it still can't self-wrap.
             const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
             // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getZeroExtendExpr(Start, Ty),
-                                 getSignExtendExpr(Step, Ty),
-                                 L, AR->getNoWrapFlags());
+            return getAddRecExpr(
+                getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
+                getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
           }
         }
       }
@@ -1374,104 +1484,6 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
   return S;
 }
 
-// Get the limit of a recurrence such that incrementing by Step cannot cause
-// signed overflow as long as the value of the recurrence within the loop does
-// not exceed this limit before incrementing.
-static const SCEV *getOverflowLimitForStep(const SCEV *Step,
-                                           ICmpInst::Predicate *Pred,
-                                           ScalarEvolution *SE) {
-  unsigned BitWidth = SE->getTypeSizeInBits(Step->getType());
-  if (SE->isKnownPositive(Step)) {
-    *Pred = ICmpInst::ICMP_SLT;
-    return SE->getConstant(APInt::getSignedMinValue(BitWidth) -
-                           SE->getSignedRange(Step).getSignedMax());
-  }
-  if (SE->isKnownNegative(Step)) {
-    *Pred = ICmpInst::ICMP_SGT;
-    return SE->getConstant(APInt::getSignedMaxValue(BitWidth) -
-                       SE->getSignedRange(Step).getSignedMin());
-  }
-  return nullptr;
-}
-
-// The recurrence AR has been shown to have no signed wrap. Typically, if we can
-// prove NSW for AR, then we can just as easily prove NSW for its preincrement
-// or postincrement sibling. This allows normalizing a sign extended AddRec as
-// such: {sext(Step + Start),+,Step} => {(Step + sext(Start),+,Step} As a
-// result, the expression "Step + sext(PreIncAR)" is congruent with
-// "sext(PostIncAR)"
-static const SCEV *getPreStartForSignExtend(const SCEVAddRecExpr *AR,
-                                            Type *Ty,
-                                            ScalarEvolution *SE) {
-  const Loop *L = AR->getLoop();
-  const SCEV *Start = AR->getStart();
-  const SCEV *Step = AR->getStepRecurrence(*SE);
-
-  // Check for a simple looking step prior to loop entry.
-  const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Start);
-  if (!SA)
-    return nullptr;
-
-  // Create an AddExpr for "PreStart" after subtracting Step. Full SCEV
-  // subtraction is expensive. For this purpose, perform a quick and dirty
-  // difference, by checking for Step in the operand list.
-  SmallVector<const SCEV *, 4> DiffOps;
-  for (const SCEV *Op : SA->operands())
-    if (Op != Step)
-      DiffOps.push_back(Op);
-
-  if (DiffOps.size() == SA->getNumOperands())
-    return nullptr;
-
-  // This is a postinc AR. Check for overflow on the preinc recurrence using the
-  // same three conditions that getSignExtendedExpr checks.
-
-  // 1. NSW flags on the step increment.
-  const SCEV *PreStart = SE->getAddExpr(DiffOps, SA->getNoWrapFlags());
-  const SCEVAddRecExpr *PreAR = dyn_cast<SCEVAddRecExpr>(
-    SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap));
-
-  if (PreAR && PreAR->getNoWrapFlags(SCEV::FlagNSW))
-    return PreStart;
-
-  // 2. Direct overflow check on the step operation's expression.
-  unsigned BitWidth = SE->getTypeSizeInBits(AR->getType());
-  Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2);
-  const SCEV *OperandExtendedStart =
-    SE->getAddExpr(SE->getSignExtendExpr(PreStart, WideTy),
-                   SE->getSignExtendExpr(Step, WideTy));
-  if (SE->getSignExtendExpr(Start, WideTy) == OperandExtendedStart) {
-    // Cache knowledge of PreAR NSW.
-    if (PreAR)
-      const_cast<SCEVAddRecExpr *>(PreAR)->setNoWrapFlags(SCEV::FlagNSW);
-    // FIXME: this optimization needs a unit test
-    DEBUG(dbgs() << "SCEV: untested prestart overflow check\n");
-    return PreStart;
-  }
-
-  // 3. Loop precondition.
-  ICmpInst::Predicate Pred;
-  const SCEV *OverflowLimit = getOverflowLimitForStep(Step, &Pred, SE);
-
-  if (OverflowLimit &&
-      SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) {
-    return PreStart;
-  }
-  return nullptr;
-}
-
-// Get the normalized sign-extended expression for this AddRec's Start.
-static const SCEV *getSignExtendAddRecStart(const SCEVAddRecExpr *AR,
-                                            Type *Ty,
-                                            ScalarEvolution *SE) {
-  const SCEV *PreStart = getPreStartForSignExtend(AR, Ty, SE);
-  if (!PreStart)
-    return SE->getSignExtendExpr(AR->getStart(), Ty);
-
-  return SE->getAddExpr(SE->getSignExtendExpr(AR->getStepRecurrence(*SE), Ty),
-                        SE->getSignExtendExpr(PreStart, Ty));
-}
-
 const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
                                                Type *Ty) {
   assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
@@ -1550,9 +1562,9 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
       // If we have special knowledge that this addrec won't overflow,
       // we don't need to do any further analysis.
       if (AR->getNoWrapFlags(SCEV::FlagNSW))
-        return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
-                             getSignExtendExpr(Step, Ty),
-                             L, SCEV::FlagNSW);
+        return getAddRecExpr(
+            getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this),
+            getSignExtendExpr(Step, Ty), L, SCEV::FlagNSW);
 
       // Check whether the backedge-taken count is SCEVCouldNotCompute.
       // Note that this serves two purposes: It filters out loops that are
@@ -1589,9 +1601,9 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
             // Cache knowledge of AR NSW, which is propagated to this AddRec.
             const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
             // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
-                                 getSignExtendExpr(Step, Ty),
-                                 L, AR->getNoWrapFlags());
+            return getAddRecExpr(
+                getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this),
+                getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
           }
           // Similar to above, only this time treat the step value as unsigned.
           // This covers loops that count up with an unsigned step.
@@ -1600,12 +1612,20 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
                        getMulExpr(WideMaxBECount,
                                   getZeroExtendExpr(Step, WideTy)));
           if (SAdd == OperandExtendedAdd) {
-            // Cache knowledge of AR NSW, which is propagated to this AddRec.
-            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
+            // If AR wraps around then
+            //
+            //    abs(Step) * MaxBECount > unsigned-max(AR->getType())
+            // => SAdd != OperandExtendedAdd
+            //
+            // Thus (AR is not NW => SAdd != OperandExtendedAdd) <=>
+            // (SAdd == OperandExtendedAdd => AR is NW)
+
+            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
+
             // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
-                                 getZeroExtendExpr(Step, Ty),
-                                 L, AR->getNoWrapFlags());
+            return getAddRecExpr(
+                getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this),
+                getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
           }
         }
 
@@ -1614,7 +1634,8 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
         // with the start value and the backedge is guarded by a comparison
         // with the post-inc value, the addrec is safe.
         ICmpInst::Predicate Pred;
-        const SCEV *OverflowLimit = getOverflowLimitForStep(Step, &Pred, this);
+        const SCEV *OverflowLimit =
+            getSignedOverflowLimitForStep(Step, &Pred, this);
         if (OverflowLimit &&
             (isLoopBackedgeGuardedByCond(L, Pred, AR, OverflowLimit) ||
              (isLoopEntryGuardedByCond(L, Pred, Start, OverflowLimit) &&
@@ -1622,9 +1643,9 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
                                           OverflowLimit)))) {
           // Cache knowledge of AR NSW, then propagate NSW to the wide AddRec.
           const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
-          return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
-                               getSignExtendExpr(Step, Ty),
-                               L, AR->getNoWrapFlags());
+          return getAddRecExpr(
+              getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this),
+              getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
         }
       }
       // If Start and Step are constants, check if we can apply this
@@ -1804,6 +1825,36 @@ namespace {
   };
 }
 
+// We're trying to construct a SCEV of type `Type' with `Ops' as operands and
+// `OldFlags' as can't-wrap behavior.  Infer a more aggressive set of
+// can't-overflow flags for the operation if possible.
+static SCEV::NoWrapFlags
+StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
+                      const SmallVectorImpl<const SCEV *> &Ops,
+                      SCEV::NoWrapFlags OldFlags) {
+  using namespace std::placeholders;
+
+  bool CanAnalyze =
+      Type == scAddExpr || Type == scAddRecExpr || Type == scMulExpr;
+  (void)CanAnalyze;
+  assert(CanAnalyze && "don't call from other places!");
+
+  int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW;
+  SCEV::NoWrapFlags SignOrUnsignWrap =
+      ScalarEvolution::maskFlags(OldFlags, SignOrUnsignMask);
+
+  // If FlagNSW is true and all the operands are non-negative, infer FlagNUW.
+  auto IsKnownNonNegative =
+    std::bind(std::mem_fn(&ScalarEvolution::isKnownNonNegative), SE, _1);
+
+  if (SignOrUnsignWrap == SCEV::FlagNSW &&
+      std::all_of(Ops.begin(), Ops.end(), IsKnownNonNegative))
+    return ScalarEvolution::setFlags(OldFlags,
+                                     (SCEV::NoWrapFlags)SignOrUnsignMask);
+
+  return OldFlags;
+}
+
 /// getAddExpr - Get a canonical add expression, or something simpler if
 /// possible.
 const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
@@ -1819,20 +1870,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
            "SCEVAddExpr operand types don't match!");
 #endif
 
-  // If FlagNSW is true and all the operands are non-negative, infer FlagNUW.
-  // And vice-versa.
-  int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW;
-  SCEV::NoWrapFlags SignOrUnsignWrap = maskFlags(Flags, SignOrUnsignMask);
-  if (SignOrUnsignWrap && (SignOrUnsignWrap != SignOrUnsignMask)) {
-    bool All = true;
-    for (SmallVectorImpl<const SCEV *>::const_iterator I = Ops.begin(),
-         E = Ops.end(); I != E; ++I)
-      if (!isKnownNonNegative(*I)) {
-        All = false;
-        break;
-      }
-    if (All) Flags = setFlags(Flags, (SCEV::NoWrapFlags)SignOrUnsignMask);
-  }
+  Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags);
 
   // Sort by complexity, this groups all similar expression types together.
   GroupByComplexity(Ops, LI);
@@ -2207,6 +2245,24 @@ static uint64_t Choose(uint64_t n, uint64_t k, bool &Overflow) {
   return r;
 }
 
+/// Determine if any of the operands in this SCEV are a constant or if
+/// any of the add or multiply expressions in this SCEV contain a constant.
+static bool containsConstantSomewhere(const SCEV *StartExpr) {
+  SmallVector<const SCEV *, 4> Ops;
+  Ops.push_back(StartExpr);
+  while (!Ops.empty()) {
+    const SCEV *CurrentExpr = Ops.pop_back_val();
+    if (isa<SCEVConstant>(*CurrentExpr))
+      return true;
+
+    if (isa<SCEVAddExpr>(*CurrentExpr) || isa<SCEVMulExpr>(*CurrentExpr)) {
+      const auto *CurrentNAry = cast<SCEVNAryExpr>(CurrentExpr);
+      Ops.append(CurrentNAry->op_begin(), CurrentNAry->op_end());
+    }
+  }
+  return false;
+}
+
 /// getMulExpr - Get a canonical multiply expression, or something simpler if
 /// possible.
 const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
@@ -2222,20 +2278,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
            "SCEVMulExpr operand types don't match!");
 #endif
 
-  // If FlagNSW is true and all the operands are non-negative, infer FlagNUW.
-  // And vice-versa.
-  int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW;
-  SCEV::NoWrapFlags SignOrUnsignWrap = maskFlags(Flags, SignOrUnsignMask);
-  if (SignOrUnsignWrap && (SignOrUnsignWrap != SignOrUnsignMask)) {
-    bool All = true;
-    for (SmallVectorImpl<const SCEV *>::const_iterator I = Ops.begin(),
-         E = Ops.end(); I != E; ++I)
-      if (!isKnownNonNegative(*I)) {
-        All = false;
-        break;
-      }
-    if (All) Flags = setFlags(Flags, (SCEV::NoWrapFlags)SignOrUnsignMask);
-  }
+  Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags);
 
   // Sort by complexity, this groups all similar expression types together.
   GroupByComplexity(Ops, LI);
@@ -2246,11 +2289,13 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
 
     // C1*(C2+V) -> C1*C2 + C1*V
     if (Ops.size() == 2)
-      if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1]))
-        if (Add->getNumOperands() == 2 &&
-            isa<SCEVConstant>(Add->getOperand(0)))
-          return getAddExpr(getMulExpr(LHSC, Add->getOperand(0)),
-                            getMulExpr(LHSC, Add->getOperand(1)));
+        if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1]))
+          // If any of Add's ops are Adds or Muls with a constant,
+          // apply this transformation as well.
+          if (Add->getNumOperands() == 2)
+            if (containsConstantSomewhere(Add))
+              return getAddExpr(getMulExpr(LHSC, Add->getOperand(0)),
+                                getMulExpr(LHSC, Add->getOperand(1)));
 
     ++Idx;
     while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
@@ -2699,20 +2744,7 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
   // meaningful BE count at this point (and if we don't, we'd be stuck
   // with a SCEVCouldNotCompute as the cached BE count).
 
-  // If FlagNSW is true and all the operands are non-negative, infer FlagNUW.
-  // And vice-versa.
-  int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW;
-  SCEV::NoWrapFlags SignOrUnsignWrap = maskFlags(Flags, SignOrUnsignMask);
-  if (SignOrUnsignWrap && (SignOrUnsignWrap != SignOrUnsignMask)) {
-    bool All = true;
-    for (SmallVectorImpl<const SCEV *>::const_iterator I = Operands.begin(),
-         E = Operands.end(); I != E; ++I)
-      if (!isKnownNonNegative(*I)) {
-        All = false;
-        break;
-      }
-    if (All) Flags = setFlags(Flags, (SCEV::NoWrapFlags)SignOrUnsignMask);
-  }
+  Flags = StrengthenNoWrapFlags(this, scAddRecExpr, Operands, Flags);
 
   // Canonicalize nested AddRecs in by nesting them in order of loop depth.
   if (const SCEVAddRecExpr *NestedAR = dyn_cast<SCEVAddRecExpr>(Operands[0])) {
@@ -3209,8 +3241,9 @@ const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
   if (LHS == RHS)
     return getConstant(LHS->getType(), 0);
 
-  // X - Y --> X + -Y
-  return getAddExpr(LHS, getNegativeSCEV(RHS), Flags);
+  // X - Y --> X + -Y.
+  // X -(nsw || nuw) Y --> X + -Y.
+  return getAddExpr(LHS, getNegativeSCEV(RHS));
 }
 
 /// getTruncateOrZeroExtend - Return a SCEV corresponding to a conversion of the
@@ -3516,12 +3549,10 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
                   if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr)))
                     Flags = setFlags(Flags, SCEV::FlagNUW);
                 }
-              } else if (const SubOperator *OBO =
-                           dyn_cast<SubOperator>(BEValueV)) {
-                if (OBO->hasNoUnsignedWrap())
-                  Flags = setFlags(Flags, SCEV::FlagNUW);
-                if (OBO->hasNoSignedWrap())
-                  Flags = setFlags(Flags, SCEV::FlagNSW);
+
+                // We cannot transfer nuw and nsw flags from subtraction
+                // operations -- sub nuw X, Y is not the same as add nuw X, -Y
+                // for instance.
               }
 
               const SCEV *StartVal = getSCEV(StartValueV);
@@ -3577,7 +3608,7 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
   // PHI's incoming blocks are in a different loop, in which case doing so
   // risks breaking LCSSA form. Instcombine would normally zap these, but
   // it doesn't have DominatorTree information, so it may miss cases.
-  if (Value *V = SimplifyInstruction(PN, DL, TLI, DT, AT))
+  if (Value *V = SimplifyInstruction(PN, DL, TLI, DT, AC))
     if (LI->replacementPreservesLCSSAForm(PN, V))
       return getSCEV(V);
 
@@ -3709,7 +3740,7 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
     // For a SCEVUnknown, ask ValueTracking.
     unsigned BitWidth = getTypeSizeInBits(U->getType());
     APInt Zeros(BitWidth, 0), Ones(BitWidth, 0);
-    computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, AT, nullptr, DT);
+    computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, AC, nullptr, DT);
     return Zeros.countTrailingOnes();
   }
 
@@ -3729,8 +3760,10 @@ static Optional<ConstantRange> GetRangeFromMetadata(Value *V) {
       assert(NumRanges >= 1);
 
       for (unsigned i = 0; i < NumRanges; ++i) {
-        ConstantInt *Lower = cast<ConstantInt>(MD->getOperand(2*i + 0));
-        ConstantInt *Upper = cast<ConstantInt>(MD->getOperand(2*i + 1));
+        ConstantInt *Lower =
+            mdconst::extract<ConstantInt>(MD->getOperand(2 * i + 0));
+        ConstantInt *Upper =
+            mdconst::extract<ConstantInt>(MD->getOperand(2 * i + 1));
         ConstantRange Range(Lower->getValue(), Upper->getValue());
         TotalRange = TotalRange.unionWith(Range);
       }
@@ -3878,7 +3911,7 @@ ScalarEvolution::getUnsignedRange(const SCEV *S) {
 
     // For a SCEVUnknown, ask ValueTracking.
     APInt Zeros(BitWidth, 0), Ones(BitWidth, 0);
-    computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, AT, nullptr, DT);
+    computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, AC, nullptr, DT);
     if (Ones == ~Zeros + 1)
       return setUnsignedRange(U, ConservativeResult);
     return setUnsignedRange(U,
@@ -4035,7 +4068,7 @@ ScalarEvolution::getSignedRange(const SCEV *S) {
     // For a SCEVUnknown, ask ValueTracking.
     if (!U->getValue()->getType()->isIntegerTy() && !DL)
       return setSignedRange(U, ConservativeResult);
-    unsigned NS = ComputeNumSignBits(U->getValue(), DL, 0, AT, nullptr, DT);
+    unsigned NS = ComputeNumSignBits(U->getValue(), DL, 0, AC, nullptr, DT);
     if (NS <= 1)
       return setSignedRange(U, ConservativeResult);
     return setSignedRange(U, ConservativeResult.intersectWith(
@@ -4142,8 +4175,8 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
       unsigned TZ = A.countTrailingZeros();
       unsigned BitWidth = A.getBitWidth();
       APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-      computeKnownBits(U->getOperand(0), KnownZero, KnownOne, DL,
-                       0, AT, nullptr, DT);
+      computeKnownBits(U->getOperand(0), KnownZero, KnownOne, DL, 0, AC,
+                       nullptr, DT);
 
       APInt EffectiveMask =
           APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ);
@@ -4334,9 +4367,10 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
       case ICmpInst::ICMP_SGE:
         // a >s b ? a+x : b+x  ->  smax(a, b)+x
         // a >s b ? b+x : a+x  ->  smin(a, b)+x
-        if (LHS->getType() == U->getType()) {
-          const SCEV *LS = getSCEV(LHS);
-          const SCEV *RS = getSCEV(RHS);
+        if (getTypeSizeInBits(LHS->getType()) <=
+            getTypeSizeInBits(U->getType())) {
+          const SCEV *LS = getNoopOrSignExtend(getSCEV(LHS), U->getType());
+          const SCEV *RS = getNoopOrSignExtend(getSCEV(RHS), U->getType());
           const SCEV *LA = getSCEV(U->getOperand(1));
           const SCEV *RA = getSCEV(U->getOperand(2));
           const SCEV *LDiff = getMinusSCEV(LA, LS);
@@ -4357,9 +4391,10 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
       case ICmpInst::ICMP_UGE:
         // a >u b ? a+x : b+x  ->  umax(a, b)+x
         // a >u b ? b+x : a+x  ->  umin(a, b)+x
-        if (LHS->getType() == U->getType()) {
-          const SCEV *LS = getSCEV(LHS);
-          const SCEV *RS = getSCEV(RHS);
+        if (getTypeSizeInBits(LHS->getType()) <=
+            getTypeSizeInBits(U->getType())) {
+          const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), U->getType());
+          const SCEV *RS = getNoopOrZeroExtend(getSCEV(RHS), U->getType());
           const SCEV *LA = getSCEV(U->getOperand(1));
           const SCEV *RA = getSCEV(U->getOperand(2));
           const SCEV *LDiff = getMinusSCEV(LA, LS);
@@ -4374,11 +4409,11 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
         break;
       case ICmpInst::ICMP_NE:
         // n != 0 ? n+x : 1+x  ->  umax(n, 1)+x
-        if (LHS->getType() == U->getType() &&
-            isa<ConstantInt>(RHS) &&
-            cast<ConstantInt>(RHS)->isZero()) {
-          const SCEV *One = getConstant(LHS->getType(), 1);
-          const SCEV *LS = getSCEV(LHS);
+        if (getTypeSizeInBits(LHS->getType()) <=
+                getTypeSizeInBits(U->getType()) &&
+            isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) {
+          const SCEV *One = getConstant(U->getType(), 1);
+          const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), U->getType());
           const SCEV *LA = getSCEV(U->getOperand(1));
           const SCEV *RA = getSCEV(U->getOperand(2));
           const SCEV *LDiff = getMinusSCEV(LA, LS);
@@ -4389,11 +4424,11 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
         break;
       case ICmpInst::ICMP_EQ:
         // n == 0 ? 1+x : n+x  ->  umax(n, 1)+x
-        if (LHS->getType() == U->getType() &&
-            isa<ConstantInt>(RHS) &&
-            cast<ConstantInt>(RHS)->isZero()) {
-          const SCEV *One = getConstant(LHS->getType(), 1);
-          const SCEV *LS = getSCEV(LHS);
+        if (getTypeSizeInBits(LHS->getType()) <=
+                getTypeSizeInBits(U->getType()) &&
+            isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) {
+          const SCEV *One = getConstant(U->getType(), 1);
+          const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), U->getType());
           const SCEV *LA = getSCEV(U->getOperand(1));
           const SCEV *RA = getSCEV(U->getOperand(2));
           const SCEV *LDiff = getMinusSCEV(LA, One);
@@ -6138,15 +6173,18 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) {
     return ExitLimit(Distance, MaxBECount);
   }
 
-  // If the step exactly divides the distance then unsigned divide computes the
-  // backedge count.
-  const SCEV *Q, *R;
-  ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
-  SCEVUDivision::divide(SE, Distance, Step, &Q, &R);
-  if (R->isZero()) {
-    const SCEV *Exact =
-        getUDivExactExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step);
-    return ExitLimit(Exact, Exact);
+  // As a special case, handle the instance where Step is a positive power of
+  // two. In this case, determining whether Step divides Distance evenly can be
+  // done by counting and comparing the number of trailing zeros of Step and
+  // Distance.
+  if (!CountDown) {
+    const APInt &StepV = StepC->getValue()->getValue();
+    // StepV.isPowerOf2() returns true if StepV is an positive power of two.  It
+    // also returns true if StepV is maximally negative (eg, INT_MIN), but that
+    // case is not handled as this code is guarded by !CountDown.
+    if (StepV.isPowerOf2() &&
+        GetMinTrailingZeros(Distance) >= StepV.countTrailingZeros())
+      return getUDivExactExpr(Distance, Step);
   }
 
   // If the condition controls loop exit (the loop exits only if the expression
@@ -6671,7 +6709,10 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
     return true;
 
   // Check conditions due to any @llvm.assume intrinsics.
-  for (auto &CI : AT->assumptions(F)) {
+  for (auto &AssumeVH : AC->assumptions()) {
+    if (!AssumeVH)
+      continue;
+    auto *CI = cast<CallInst>(AssumeVH);
     if (!DT->dominates(CI, Latch->getTerminator()))
       continue;
 
@@ -6716,7 +6757,10 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
   }
 
   // Check conditions due to any @llvm.assume intrinsics.
-  for (auto &CI : AT->assumptions(F)) {
+  for (auto &AssumeVH : AC->assumptions()) {
+    if (!AssumeVH)
+      continue;
+    auto *CI = cast<CallInst>(AssumeVH);
     if (!DT->dominates(CI, L->getHeader()))
       continue;
 
@@ -6927,6 +6971,85 @@ bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred,
                                      getNotSCEV(FoundLHS));
 }
 
+
+/// If Expr computes ~A, return A else return nullptr
+static const SCEV *MatchNotExpr(const SCEV *Expr) {
+  const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Expr);
+  if (!Add || Add->getNumOperands() != 2) return nullptr;
+
+  const SCEVConstant *AddLHS = dyn_cast<SCEVConstant>(Add->getOperand(0));
+  if (!(AddLHS && AddLHS->getValue()->getValue().isAllOnesValue()))
+    return nullptr;
+
+  const SCEVMulExpr *AddRHS = dyn_cast<SCEVMulExpr>(Add->getOperand(1));
+  if (!AddRHS || AddRHS->getNumOperands() != 2) return nullptr;
+
+  const SCEVConstant *MulLHS = dyn_cast<SCEVConstant>(AddRHS->getOperand(0));
+  if (!(MulLHS && MulLHS->getValue()->getValue().isAllOnesValue()))
+    return nullptr;
+
+  return AddRHS->getOperand(1);
+}
+
+
+/// Is MaybeMaxExpr an SMax or UMax of Candidate and some other values?
+template<typename MaxExprType>
+static bool IsMaxConsistingOf(const SCEV *MaybeMaxExpr,
+                              const SCEV *Candidate) {
+  const MaxExprType *MaxExpr = dyn_cast<MaxExprType>(MaybeMaxExpr);
+  if (!MaxExpr) return false;
+
+  auto It = std::find(MaxExpr->op_begin(), MaxExpr->op_end(), Candidate);
+  return It != MaxExpr->op_end();
+}
+
+
+/// Is MaybeMinExpr an SMin or UMin of Candidate and some other values?
+template<typename MaxExprType>
+static bool IsMinConsistingOf(ScalarEvolution &SE,
+                              const SCEV *MaybeMinExpr,
+                              const SCEV *Candidate) {
+  const SCEV *MaybeMaxExpr = MatchNotExpr(MaybeMinExpr);
+  if (!MaybeMaxExpr)
+    return false;
+
+  return IsMaxConsistingOf<MaxExprType>(MaybeMaxExpr, SE.getNotSCEV(Candidate));
+}
+
+
+/// Is LHS `Pred` RHS true on the virtue of LHS or RHS being a Min or Max
+/// expression?
+static bool IsKnownPredicateViaMinOrMax(ScalarEvolution &SE,
+                                        ICmpInst::Predicate Pred,
+                                        const SCEV *LHS, const SCEV *RHS) {
+  switch (Pred) {
+  default:
+    return false;
+
+  case ICmpInst::ICMP_SGE:
+    std::swap(LHS, RHS);
+    // fall through
+  case ICmpInst::ICMP_SLE:
+    return
+      // min(A, ...) <= A
+      IsMinConsistingOf<SCEVSMaxExpr>(SE, LHS, RHS) ||
+      // A <= max(A, ...)
+      IsMaxConsistingOf<SCEVSMaxExpr>(RHS, LHS);
+
+  case ICmpInst::ICMP_UGE:
+    std::swap(LHS, RHS);
+    // fall through
+  case ICmpInst::ICMP_ULE:
+    return
+      // min(A, ...) <= A
+      IsMinConsistingOf<SCEVUMaxExpr>(SE, LHS, RHS) ||
+      // A <= max(A, ...)
+      IsMaxConsistingOf<SCEVUMaxExpr>(RHS, LHS);
+  }
+
+  llvm_unreachable("covered switch fell through?!");
+}
+
 /// isImpliedCondOperandsHelper - Test whether the condition described by
 /// Pred, LHS, and RHS is true whenever the condition described by Pred,
 /// FoundLHS, and FoundRHS is true.
@@ -6935,6 +7058,12 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
                                              const SCEV *LHS, const SCEV *RHS,
                                              const SCEV *FoundLHS,
                                              const SCEV *FoundRHS) {
+  auto IsKnownPredicateFull =
+      [this](ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) {
+    return isKnownPredicateWithRanges(Pred, LHS, RHS) ||
+        IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS);
+  };
+
   switch (Pred) {
   default: llvm_unreachable("Unexpected ICmpInst::Predicate value!");
   case ICmpInst::ICMP_EQ:
@@ -6944,26 +7073,26 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
     break;
   case ICmpInst::ICMP_SLT:
   case ICmpInst::ICMP_SLE:
-    if (isKnownPredicateWithRanges(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
-        isKnownPredicateWithRanges(ICmpInst::ICMP_SGE, RHS, FoundRHS))
+    if (IsKnownPredicateFull(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
+        IsKnownPredicateFull(ICmpInst::ICMP_SGE, RHS, FoundRHS))
       return true;
     break;
   case ICmpInst::ICMP_SGT:
   case ICmpInst::ICMP_SGE:
-    if (isKnownPredicateWithRanges(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
-        isKnownPredicateWithRanges(ICmpInst::ICMP_SLE, RHS, FoundRHS))
+    if (IsKnownPredicateFull(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
+        IsKnownPredicateFull(ICmpInst::ICMP_SLE, RHS, FoundRHS))
       return true;
     break;
   case ICmpInst::ICMP_ULT:
   case ICmpInst::ICMP_ULE:
-    if (isKnownPredicateWithRanges(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
-        isKnownPredicateWithRanges(ICmpInst::ICMP_UGE, RHS, FoundRHS))
+    if (IsKnownPredicateFull(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
+        IsKnownPredicateFull(ICmpInst::ICMP_UGE, RHS, FoundRHS))
       return true;
     break;
   case ICmpInst::ICMP_UGT:
   case ICmpInst::ICMP_UGE:
-    if (isKnownPredicateWithRanges(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
-        isKnownPredicateWithRanges(ICmpInst::ICMP_ULE, RHS, FoundRHS))
+    if (IsKnownPredicateFull(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
+        IsKnownPredicateFull(ICmpInst::ICMP_ULE, RHS, FoundRHS))
       return true;
     break;
   }
@@ -6971,8 +7100,8 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
   return false;
 }
 
-// Verify if an linear IV with positive stride can overflow when in a 
-// less-than comparison, knowing the invariant term of the comparison, the 
+// Verify if an linear IV with positive stride can overflow when in a
+// less-than comparison, knowing the invariant term of the comparison, the
 // stride and the knowledge of NSW/NUW flags on the recurrence.
 bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
                                          bool IsSigned, bool NoWrap) {
@@ -7000,7 +7129,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
   return (MaxValue - MaxStrideMinusOne).ult(MaxRHS);
 }
 
-// Verify if an linear IV with negative stride can overflow when in a 
+// Verify if an linear IV with negative stride can overflow when in a
 // greater-than comparison, knowing the invariant term of the comparison,
 // the stride and the knowledge of NSW/NUW flags on the recurrence.
 bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
@@ -7031,7 +7160,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
 
 // Compute the backedge taken count knowing the interval difference, the
 // stride and presence of the equality in the comparison.
-const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step, 
+const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step,
                                             bool Equality) {
   const SCEV *One = getConstant(Step->getType(), 1);
   Delta = Equality ? getAddExpr(Delta, Step)
@@ -7071,7 +7200,7 @@ ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
 
   // Avoid proven overflow cases: this will ensure that the backedge taken count
   // will not generate any unsigned overflow. Relaxed no-overflow conditions
-  // exploit NoWrapFlags, allowing to optimize in presence of undefined 
+  // exploit NoWrapFlags, allowing to optimize in presence of undefined
   // behaviors like the case of C language.
   if (!Stride->isOne() && doesIVOverflowOnLT(RHS, Stride, IsSigned, NoWrap))
     return getCouldNotCompute();
@@ -7151,7 +7280,7 @@ ScalarEvolution::HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
 
   // Avoid proven overflow cases: this will ensure that the backedge taken count
   // will not generate any unsigned overflow. Relaxed no-overflow conditions
-  // exploit NoWrapFlags, allowing to optimize in presence of undefined 
+  // exploit NoWrapFlags, allowing to optimize in presence of undefined
   // behaviors like the case of C language.
   if (!Stride->isOne() && doesIVOverflowOnGT(RHS, Stride, IsSigned, NoWrap))
     return getCouldNotCompute();
@@ -7199,7 +7328,7 @@ ScalarEvolution::HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
   if (isa<SCEVConstant>(BECount))
     MaxBECount = BECount;
   else
-    MaxBECount = computeBECount(getConstant(MaxStart - MinEnd), 
+    MaxBECount = computeBECount(getConstant(MaxStart - MinEnd),
                                 getConstant(MinStride), false);
 
   if (isa<SCEVCouldNotCompute>(MaxBECount))
@@ -7457,7 +7586,7 @@ static bool findArrayDimensionsRec(ScalarEvolution &SE,
   for (const SCEV *&Term : Terms) {
     // Normalize the terms before the next call to findArrayDimensionsRec.
     const SCEV *Q, *R;
-    SCEVSDivision::divide(SE, Term, Step, &Q, &R);
+    SCEVDivision::divide(SE, Term, Step, &Q, &R);
 
     // Bail out when GCD does not evenly divide one of the terms.
     if (!R->isZero())
@@ -7594,7 +7723,7 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
   // Divide all terms by the element size.
   for (const SCEV *&Term : Terms) {
     const SCEV *Q, *R;
-    SCEVSDivision::divide(SE, Term, ElementSize, &Q, &R);
+    SCEVDivision::divide(SE, Term, ElementSize, &Q, &R);
     Term = Q;
   }
 
@@ -7641,7 +7770,7 @@ void SCEVAddRecExpr::computeAccessFunctions(
   int Last = Sizes.size() - 1;
   for (int i = Last; i >= 0; i--) {
     const SCEV *Q, *R;
-    SCEVSDivision::divide(SE, Res, Sizes[i], &Q, &R);
+    SCEVDivision::divide(SE, Res, Sizes[i], &Q, &R);
 
     DEBUG({
         dbgs() << "Res: " << *Res << "\n";
@@ -7825,11 +7954,11 @@ ScalarEvolution::ScalarEvolution()
 
 bool ScalarEvolution::runOnFunction(Function &F) {
   this->F = &F;
-  AT = &getAnalysis<AssumptionTracker>();
-  LI = &getAnalysis<LoopInfo>();
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   return false;
 }
@@ -7866,10 +7995,10 @@ void ScalarEvolution::releaseMemory() {
 
 void ScalarEvolution::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequired<AssumptionTracker>();
-  AU.addRequiredTransitive<LoopInfo>();
+  AU.addRequired<AssumptionCacheTracker>();
+  AU.addRequiredTransitive<LoopInfoWrapperPass>();
   AU.addRequiredTransitive<DominatorTreeWrapperPass>();
-  AU.addRequired<TargetLibraryInfo>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
 }
 
 bool ScalarEvolution::hasLoopInvariantBackedgeTakenCount(const Loop *L) {
@@ -7960,17 +8089,17 @@ void ScalarEvolution::print(raw_ostream &OS, const Module *) const {
 
 ScalarEvolution::LoopDisposition
 ScalarEvolution::getLoopDisposition(const SCEV *S, const Loop *L) {
-  SmallVector<std::pair<const Loop *, LoopDisposition>, 2> &Values = LoopDispositions[S];
-  for (unsigned u = 0; u < Values.size(); u++) {
-    if (Values[u].first == L)
-      return Values[u].second;
+  auto &Values = LoopDispositions[S];
+  for (auto &V : Values) {
+    if (V.getPointer() == L)
+      return V.getInt();
   }
-  Values.push_back(std::make_pair(L, LoopVariant));
+  Values.emplace_back(L, LoopVariant);
   LoopDisposition D = computeLoopDisposition(S, L);
-  SmallVector<std::pair<const Loop *, LoopDisposition>, 2> &Values2 = LoopDispositions[S];
-  for (unsigned u = Values2.size(); u > 0; u--) {
-    if (Values2[u - 1].first == L) {
-      Values2[u - 1].second = D;
+  auto &Values2 = LoopDispositions[S];
+  for (auto &V : make_range(Values2.rbegin(), Values2.rend())) {
+    if (V.getPointer() == L) {
+      V.setInt(D);
       break;
     }
   }
@@ -8066,17 +8195,17 @@ bool ScalarEvolution::hasComputableLoopEvolution(const SCEV *S, const Loop *L) {
 
 ScalarEvolution::BlockDisposition
 ScalarEvolution::getBlockDisposition(const SCEV *S, const BasicBlock *BB) {
-  SmallVector<std::pair<const BasicBlock *, BlockDisposition>, 2> &Values = BlockDispositions[S];
-  for (unsigned u = 0; u < Values.size(); u++) {
-    if (Values[u].first == BB)
-      return Values[u].second;
+  auto &Values = BlockDispositions[S];
+  for (auto &V : Values) {
+    if (V.getPointer() == BB)
+      return V.getInt();
   }
-  Values.push_back(std::make_pair(BB, DoesNotDominateBlock));
+  Values.emplace_back(BB, DoesNotDominateBlock);
   BlockDisposition D = computeBlockDisposition(S, BB);
-  SmallVector<std::pair<const BasicBlock *, BlockDisposition>, 2> &Values2 = BlockDispositions[S];
-  for (unsigned u = Values2.size(); u > 0; u--) {
-    if (Values2[u - 1].first == BB) {
-      Values2[u - 1].second = D;
+  auto &Values2 = BlockDispositions[S];
+  for (auto &V : make_range(Values2.rbegin(), Values2.rend())) {
+    if (V.getPointer() == BB) {
+      V.setInt(D);
       break;
     }
   }
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index bee3685..2625cf3 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -1063,6 +1063,34 @@ static bool canBeCheaplyTransformed(ScalarEvolution &SE,
   return false;
 }
 
+static bool IsIncrementNSW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) {
+  if (!isa<IntegerType>(AR->getType()))
+    return false;
+
+  unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth();
+  Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2);
+  const SCEV *Step = AR->getStepRecurrence(SE);
+  const SCEV *OpAfterExtend = SE.getAddExpr(SE.getSignExtendExpr(Step, WideTy),
+                                            SE.getSignExtendExpr(AR, WideTy));
+  const SCEV *ExtendAfterOp =
+    SE.getSignExtendExpr(SE.getAddExpr(AR, Step), WideTy);
+  return ExtendAfterOp == OpAfterExtend;
+}
+
+static bool IsIncrementNUW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) {
+  if (!isa<IntegerType>(AR->getType()))
+    return false;
+
+  unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth();
+  Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2);
+  const SCEV *Step = AR->getStepRecurrence(SE);
+  const SCEV *OpAfterExtend = SE.getAddExpr(SE.getZeroExtendExpr(Step, WideTy),
+                                            SE.getZeroExtendExpr(AR, WideTy));
+  const SCEV *ExtendAfterOp =
+    SE.getZeroExtendExpr(SE.getAddExpr(AR, Step), WideTy);
+  return ExtendAfterOp == OpAfterExtend;
+}
+
 /// getAddRecExprPHILiterally - Helper for expandAddRecExprLiterally. Expand
 /// the base addrec, which is the addrec without any non-loop-dominating
 /// values, and return the PHI.
@@ -1188,6 +1216,12 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
   // Expand the step somewhere that dominates the loop header.
   Value *StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin());
 
+  // The no-wrap behavior proved by IsIncrement(NUW|NSW) is only applicable if
+  // we actually do emit an addition.  It does not apply if we emit a
+  // subtraction.
+  bool IncrementIsNUW = !useSubtract && IsIncrementNUW(SE, Normalized);
+  bool IncrementIsNSW = !useSubtract && IsIncrementNSW(SE, Normalized);
+
   // Create the PHI.
   BasicBlock *Header = L->getHeader();
   Builder.SetInsertPoint(Header, Header->begin());
@@ -1213,10 +1247,11 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
       IVIncInsertPos : Pred->getTerminator();
     Builder.SetInsertPoint(InsertPos);
     Value *IncV = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
+
     if (isa<OverflowingBinaryOperator>(IncV)) {
-      if (Normalized->getNoWrapFlags(SCEV::FlagNUW))
+      if (IncrementIsNUW)
         cast<BinaryOperator>(IncV)->setHasNoUnsignedWrap();
-      if (Normalized->getNoWrapFlags(SCEV::FlagNSW))
+      if (IncrementIsNSW)
         cast<BinaryOperator>(IncV)->setHasNoSignedWrap();
     }
     PN->addIncoming(IncV, Pred);
@@ -1711,7 +1746,7 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
 
     // Fold constant phis. They may be congruent to other constant phis and
     // would confuse the logic below that expects proper IVs.
-    if (Value *V = SimplifyInstruction(Phi, SE.DL, SE.TLI, SE.DT, SE.AT)) {
+    if (Value *V = SimplifyInstruction(Phi, SE.DL, SE.TLI, SE.DT, SE.AC)) {
       Phi->replaceAllUsesWith(V);
       DeadInsts.push_back(Phi);
       ++NumElim;
diff --git a/lib/Analysis/ScopedNoAliasAA.cpp b/lib/Analysis/ScopedNoAliasAA.cpp
index f6c300a..c6ea3af 100644
--- a/lib/Analysis/ScopedNoAliasAA.cpp
+++ b/lib/Analysis/ScopedNoAliasAA.cpp
@@ -33,8 +33,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
new file mode 100644
index 0000000..91041fc
--- /dev/null
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -0,0 +1,810 @@
+//===-- TargetLibraryInfo.cpp - Runtime library information ----------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetLibraryInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/ADT/Triple.h"
+using namespace llvm;
+
+const char* TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] =
+  {
+    "_IO_getc",
+    "_IO_putc",
+    "_ZdaPv",
+    "_ZdaPvRKSt9nothrow_t",
+    "_ZdaPvj",
+    "_ZdaPvm",
+    "_ZdlPv",
+    "_ZdlPvRKSt9nothrow_t",
+    "_ZdlPvj",
+    "_ZdlPvm",
+    "_Znaj",
+    "_ZnajRKSt9nothrow_t",
+    "_Znam",
+    "_ZnamRKSt9nothrow_t",
+    "_Znwj",
+    "_ZnwjRKSt9nothrow_t",
+    "_Znwm",
+    "_ZnwmRKSt9nothrow_t",
+    "__cospi",
+    "__cospif",
+    "__cxa_atexit",
+    "__cxa_guard_abort",
+    "__cxa_guard_acquire",
+    "__cxa_guard_release",
+    "__isoc99_scanf",
+    "__isoc99_sscanf",
+    "__memcpy_chk",
+    "__memmove_chk",
+    "__memset_chk",
+    "__sincospi_stret",
+    "__sincospif_stret",
+    "__sinpi",
+    "__sinpif",
+    "__sqrt_finite",
+    "__sqrtf_finite",
+    "__sqrtl_finite",
+    "__stpcpy_chk",
+    "__stpncpy_chk",
+    "__strcpy_chk",
+    "__strdup",
+    "__strncpy_chk",
+    "__strndup",
+    "__strtok_r",
+    "abs",
+    "access",
+    "acos",
+    "acosf",
+    "acosh",
+    "acoshf",
+    "acoshl",
+    "acosl",
+    "asin",
+    "asinf",
+    "asinh",
+    "asinhf",
+    "asinhl",
+    "asinl",
+    "atan",
+    "atan2",
+    "atan2f",
+    "atan2l",
+    "atanf",
+    "atanh",
+    "atanhf",
+    "atanhl",
+    "atanl",
+    "atof",
+    "atoi",
+    "atol",
+    "atoll",
+    "bcmp",
+    "bcopy",
+    "bzero",
+    "calloc",
+    "cbrt",
+    "cbrtf",
+    "cbrtl",
+    "ceil",
+    "ceilf",
+    "ceill",
+    "chmod",
+    "chown",
+    "clearerr",
+    "closedir",
+    "copysign",
+    "copysignf",
+    "copysignl",
+    "cos",
+    "cosf",
+    "cosh",
+    "coshf",
+    "coshl",
+    "cosl",
+    "ctermid",
+    "exp",
+    "exp10",
+    "exp10f",
+    "exp10l",
+    "exp2",
+    "exp2f",
+    "exp2l",
+    "expf",
+    "expl",
+    "expm1",
+    "expm1f",
+    "expm1l",
+    "fabs",
+    "fabsf",
+    "fabsl",
+    "fclose",
+    "fdopen",
+    "feof",
+    "ferror",
+    "fflush",
+    "ffs",
+    "ffsl",
+    "ffsll",
+    "fgetc",
+    "fgetpos",
+    "fgets",
+    "fileno",
+    "fiprintf",
+    "flockfile",
+    "floor",
+    "floorf",
+    "floorl",
+    "fmax",
+    "fmaxf",
+    "fmaxl",
+    "fmin",
+    "fminf",
+    "fminl",
+    "fmod",
+    "fmodf",
+    "fmodl",
+    "fopen",
+    "fopen64",
+    "fprintf",
+    "fputc",
+    "fputs",
+    "fread",
+    "free",
+    "frexp",
+    "frexpf",
+    "frexpl",
+    "fscanf",
+    "fseek",
+    "fseeko",
+    "fseeko64",
+    "fsetpos",
+    "fstat",
+    "fstat64",
+    "fstatvfs",
+    "fstatvfs64",
+    "ftell",
+    "ftello",
+    "ftello64",
+    "ftrylockfile",
+    "funlockfile",
+    "fwrite",
+    "getc",
+    "getc_unlocked",
+    "getchar",
+    "getenv",
+    "getitimer",
+    "getlogin_r",
+    "getpwnam",
+    "gets",
+    "gettimeofday",
+    "htonl",
+    "htons",
+    "iprintf",
+    "isascii",
+    "isdigit",
+    "labs",
+    "lchown",
+    "ldexp",
+    "ldexpf",
+    "ldexpl",
+    "llabs",
+    "log",
+    "log10",
+    "log10f",
+    "log10l",
+    "log1p",
+    "log1pf",
+    "log1pl",
+    "log2",
+    "log2f",
+    "log2l",
+    "logb",
+    "logbf",
+    "logbl",
+    "logf",
+    "logl",
+    "lstat",
+    "lstat64",
+    "malloc",
+    "memalign",
+    "memccpy",
+    "memchr",
+    "memcmp",
+    "memcpy",
+    "memmove",
+    "memrchr",
+    "memset",
+    "memset_pattern16",
+    "mkdir",
+    "mktime",
+    "modf",
+    "modff",
+    "modfl",
+    "nearbyint",
+    "nearbyintf",
+    "nearbyintl",
+    "ntohl",
+    "ntohs",
+    "open",
+    "open64",
+    "opendir",
+    "pclose",
+    "perror",
+    "popen",
+    "posix_memalign",
+    "pow",
+    "powf",
+    "powl",
+    "pread",
+    "printf",
+    "putc",
+    "putchar",
+    "puts",
+    "pwrite",
+    "qsort",
+    "read",
+    "readlink",
+    "realloc",
+    "reallocf",
+    "realpath",
+    "remove",
+    "rename",
+    "rewind",
+    "rint",
+    "rintf",
+    "rintl",
+    "rmdir",
+    "round",
+    "roundf",
+    "roundl",
+    "scanf",
+    "setbuf",
+    "setitimer",
+    "setvbuf",
+    "sin",
+    "sinf",
+    "sinh",
+    "sinhf",
+    "sinhl",
+    "sinl",
+    "siprintf",
+    "snprintf",
+    "sprintf",
+    "sqrt",
+    "sqrtf",
+    "sqrtl",
+    "sscanf",
+    "stat",
+    "stat64",
+    "statvfs",
+    "statvfs64",
+    "stpcpy",
+    "stpncpy",
+    "strcasecmp",
+    "strcat",
+    "strchr",
+    "strcmp",
+    "strcoll",
+    "strcpy",
+    "strcspn",
+    "strdup",
+    "strlen",
+    "strncasecmp",
+    "strncat",
+    "strncmp",
+    "strncpy",
+    "strndup",
+    "strnlen",
+    "strpbrk",
+    "strrchr",
+    "strspn",
+    "strstr",
+    "strtod",
+    "strtof",
+    "strtok",
+    "strtok_r",
+    "strtol",
+    "strtold",
+    "strtoll",
+    "strtoul",
+    "strtoull",
+    "strxfrm",
+    "system",
+    "tan",
+    "tanf",
+    "tanh",
+    "tanhf",
+    "tanhl",
+    "tanl",
+    "times",
+    "tmpfile",
+    "tmpfile64",
+    "toascii",
+    "trunc",
+    "truncf",
+    "truncl",
+    "uname",
+    "ungetc",
+    "unlink",
+    "unsetenv",
+    "utime",
+    "utimes",
+    "valloc",
+    "vfprintf",
+    "vfscanf",
+    "vprintf",
+    "vscanf",
+    "vsnprintf",
+    "vsprintf",
+    "vsscanf",
+    "write"
+  };
+
+static bool hasSinCosPiStret(const Triple &T) {
+  // Only Darwin variants have _stret versions of combined trig functions.
+  if (!T.isOSDarwin())
+    return false;
+
+  // The ABI is rather complicated on x86, so don't do anything special there.
+  if (T.getArch() == Triple::x86)
+    return false;
+
+  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 9))
+    return false;
+
+  if (T.isiOS() && T.isOSVersionLT(7, 0))
+    return false;
+
+  return true;
+}
+
+/// initialize - Initialize the set of available library functions based on the
+/// specified target triple.  This should be carefully written so that a missing
+/// target triple gets a sane set of defaults.
+static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
+                       const char **StandardNames) {
+#ifndef NDEBUG
+  // Verify that the StandardNames array is in alphabetical order.
+  for (unsigned F = 1; F < LibFunc::NumLibFuncs; ++F) {
+    if (strcmp(StandardNames[F-1], StandardNames[F]) >= 0)
+      llvm_unreachable("TargetLibraryInfoImpl function names must be sorted");
+  }
+#endif // !NDEBUG
+
+  // There are no library implementations of mempcy and memset for AMD gpus and
+  // these can be difficult to lower in the backend.
+  if (T.getArch() == Triple::r600 ||
+      T.getArch() == Triple::amdgcn) {
+    TLI.setUnavailable(LibFunc::memcpy);
+    TLI.setUnavailable(LibFunc::memset);
+    TLI.setUnavailable(LibFunc::memset_pattern16);
+    return;
+  }
+
+  // memset_pattern16 is only available on iOS 3.0 and Mac OS X 10.5 and later.
+  if (T.isMacOSX()) {
+    if (T.isMacOSXVersionLT(10, 5))
+      TLI.setUnavailable(LibFunc::memset_pattern16);
+  } else if (T.isiOS()) {
+    if (T.isOSVersionLT(3, 0))
+      TLI.setUnavailable(LibFunc::memset_pattern16);
+  } else {
+    TLI.setUnavailable(LibFunc::memset_pattern16);
+  }
+
+  if (!hasSinCosPiStret(T)) {
+    TLI.setUnavailable(LibFunc::sinpi);
+    TLI.setUnavailable(LibFunc::sinpif);
+    TLI.setUnavailable(LibFunc::cospi);
+    TLI.setUnavailable(LibFunc::cospif);
+    TLI.setUnavailable(LibFunc::sincospi_stret);
+    TLI.setUnavailable(LibFunc::sincospif_stret);
+  }
+
+  if (T.isMacOSX() && T.getArch() == Triple::x86 &&
+      !T.isMacOSXVersionLT(10, 7)) {
+    // x86-32 OSX has a scheme where fwrite and fputs (and some other functions
+    // we don't care about) have two versions; on recent OSX, the one we want
+    // has a $UNIX2003 suffix. The two implementations are identical except
+    // for the return value in some edge cases.  However, we don't want to
+    // generate code that depends on the old symbols.
+    TLI.setAvailableWithName(LibFunc::fwrite, "fwrite$UNIX2003");
+    TLI.setAvailableWithName(LibFunc::fputs, "fputs$UNIX2003");
+  }
+
+  // iprintf and friends are only available on XCore and TCE.
+  if (T.getArch() != Triple::xcore && T.getArch() != Triple::tce) {
+    TLI.setUnavailable(LibFunc::iprintf);
+    TLI.setUnavailable(LibFunc::siprintf);
+    TLI.setUnavailable(LibFunc::fiprintf);
+  }
+
+  if (T.isOSWindows() && !T.isOSCygMing()) {
+    // Win32 does not support long double
+    TLI.setUnavailable(LibFunc::acosl);
+    TLI.setUnavailable(LibFunc::asinl);
+    TLI.setUnavailable(LibFunc::atanl);
+    TLI.setUnavailable(LibFunc::atan2l);
+    TLI.setUnavailable(LibFunc::ceill);
+    TLI.setUnavailable(LibFunc::copysignl);
+    TLI.setUnavailable(LibFunc::cosl);
+    TLI.setUnavailable(LibFunc::coshl);
+    TLI.setUnavailable(LibFunc::expl);
+    TLI.setUnavailable(LibFunc::fabsf); // Win32 and Win64 both lack fabsf
+    TLI.setUnavailable(LibFunc::fabsl);
+    TLI.setUnavailable(LibFunc::floorl);
+    TLI.setUnavailable(LibFunc::fmaxl);
+    TLI.setUnavailable(LibFunc::fminl);
+    TLI.setUnavailable(LibFunc::fmodl);
+    TLI.setUnavailable(LibFunc::frexpl);
+    TLI.setUnavailable(LibFunc::ldexpf);
+    TLI.setUnavailable(LibFunc::ldexpl);
+    TLI.setUnavailable(LibFunc::logl);
+    TLI.setUnavailable(LibFunc::modfl);
+    TLI.setUnavailable(LibFunc::powl);
+    TLI.setUnavailable(LibFunc::sinl);
+    TLI.setUnavailable(LibFunc::sinhl);
+    TLI.setUnavailable(LibFunc::sqrtl);
+    TLI.setUnavailable(LibFunc::tanl);
+    TLI.setUnavailable(LibFunc::tanhl);
+
+    // Win32 only has C89 math
+    TLI.setUnavailable(LibFunc::acosh);
+    TLI.setUnavailable(LibFunc::acoshf);
+    TLI.setUnavailable(LibFunc::acoshl);
+    TLI.setUnavailable(LibFunc::asinh);
+    TLI.setUnavailable(LibFunc::asinhf);
+    TLI.setUnavailable(LibFunc::asinhl);
+    TLI.setUnavailable(LibFunc::atanh);
+    TLI.setUnavailable(LibFunc::atanhf);
+    TLI.setUnavailable(LibFunc::atanhl);
+    TLI.setUnavailable(LibFunc::cbrt);
+    TLI.setUnavailable(LibFunc::cbrtf);
+    TLI.setUnavailable(LibFunc::cbrtl);
+    TLI.setUnavailable(LibFunc::exp2);
+    TLI.setUnavailable(LibFunc::exp2f);
+    TLI.setUnavailable(LibFunc::exp2l);
+    TLI.setUnavailable(LibFunc::expm1);
+    TLI.setUnavailable(LibFunc::expm1f);
+    TLI.setUnavailable(LibFunc::expm1l);
+    TLI.setUnavailable(LibFunc::log2);
+    TLI.setUnavailable(LibFunc::log2f);
+    TLI.setUnavailable(LibFunc::log2l);
+    TLI.setUnavailable(LibFunc::log1p);
+    TLI.setUnavailable(LibFunc::log1pf);
+    TLI.setUnavailable(LibFunc::log1pl);
+    TLI.setUnavailable(LibFunc::logb);
+    TLI.setUnavailable(LibFunc::logbf);
+    TLI.setUnavailable(LibFunc::logbl);
+    TLI.setUnavailable(LibFunc::nearbyint);
+    TLI.setUnavailable(LibFunc::nearbyintf);
+    TLI.setUnavailable(LibFunc::nearbyintl);
+    TLI.setUnavailable(LibFunc::rint);
+    TLI.setUnavailable(LibFunc::rintf);
+    TLI.setUnavailable(LibFunc::rintl);
+    TLI.setUnavailable(LibFunc::round);
+    TLI.setUnavailable(LibFunc::roundf);
+    TLI.setUnavailable(LibFunc::roundl);
+    TLI.setUnavailable(LibFunc::trunc);
+    TLI.setUnavailable(LibFunc::truncf);
+    TLI.setUnavailable(LibFunc::truncl);
+
+    // Win32 provides some C99 math with mangled names
+    TLI.setAvailableWithName(LibFunc::copysign, "_copysign");
+
+    if (T.getArch() == Triple::x86) {
+      // Win32 on x86 implements single-precision math functions as macros
+      TLI.setUnavailable(LibFunc::acosf);
+      TLI.setUnavailable(LibFunc::asinf);
+      TLI.setUnavailable(LibFunc::atanf);
+      TLI.setUnavailable(LibFunc::atan2f);
+      TLI.setUnavailable(LibFunc::ceilf);
+      TLI.setUnavailable(LibFunc::copysignf);
+      TLI.setUnavailable(LibFunc::cosf);
+      TLI.setUnavailable(LibFunc::coshf);
+      TLI.setUnavailable(LibFunc::expf);
+      TLI.setUnavailable(LibFunc::floorf);
+      TLI.setUnavailable(LibFunc::fminf);
+      TLI.setUnavailable(LibFunc::fmaxf);
+      TLI.setUnavailable(LibFunc::fmodf);
+      TLI.setUnavailable(LibFunc::logf);
+      TLI.setUnavailable(LibFunc::powf);
+      TLI.setUnavailable(LibFunc::sinf);
+      TLI.setUnavailable(LibFunc::sinhf);
+      TLI.setUnavailable(LibFunc::sqrtf);
+      TLI.setUnavailable(LibFunc::tanf);
+      TLI.setUnavailable(LibFunc::tanhf);
+    }
+
+    // Win32 does *not* provide provide these functions, but they are
+    // generally available on POSIX-compliant systems:
+    TLI.setUnavailable(LibFunc::access);
+    TLI.setUnavailable(LibFunc::bcmp);
+    TLI.setUnavailable(LibFunc::bcopy);
+    TLI.setUnavailable(LibFunc::bzero);
+    TLI.setUnavailable(LibFunc::chmod);
+    TLI.setUnavailable(LibFunc::chown);
+    TLI.setUnavailable(LibFunc::closedir);
+    TLI.setUnavailable(LibFunc::ctermid);
+    TLI.setUnavailable(LibFunc::fdopen);
+    TLI.setUnavailable(LibFunc::ffs);
+    TLI.setUnavailable(LibFunc::fileno);
+    TLI.setUnavailable(LibFunc::flockfile);
+    TLI.setUnavailable(LibFunc::fseeko);
+    TLI.setUnavailable(LibFunc::fstat);
+    TLI.setUnavailable(LibFunc::fstatvfs);
+    TLI.setUnavailable(LibFunc::ftello);
+    TLI.setUnavailable(LibFunc::ftrylockfile);
+    TLI.setUnavailable(LibFunc::funlockfile);
+    TLI.setUnavailable(LibFunc::getc_unlocked);
+    TLI.setUnavailable(LibFunc::getitimer);
+    TLI.setUnavailable(LibFunc::getlogin_r);
+    TLI.setUnavailable(LibFunc::getpwnam);
+    TLI.setUnavailable(LibFunc::gettimeofday);
+    TLI.setUnavailable(LibFunc::htonl);
+    TLI.setUnavailable(LibFunc::htons);
+    TLI.setUnavailable(LibFunc::lchown);
+    TLI.setUnavailable(LibFunc::lstat);
+    TLI.setUnavailable(LibFunc::memccpy);
+    TLI.setUnavailable(LibFunc::mkdir);
+    TLI.setUnavailable(LibFunc::ntohl);
+    TLI.setUnavailable(LibFunc::ntohs);
+    TLI.setUnavailable(LibFunc::open);
+    TLI.setUnavailable(LibFunc::opendir);
+    TLI.setUnavailable(LibFunc::pclose);
+    TLI.setUnavailable(LibFunc::popen);
+    TLI.setUnavailable(LibFunc::pread);
+    TLI.setUnavailable(LibFunc::pwrite);
+    TLI.setUnavailable(LibFunc::read);
+    TLI.setUnavailable(LibFunc::readlink);
+    TLI.setUnavailable(LibFunc::realpath);
+    TLI.setUnavailable(LibFunc::rmdir);
+    TLI.setUnavailable(LibFunc::setitimer);
+    TLI.setUnavailable(LibFunc::stat);
+    TLI.setUnavailable(LibFunc::statvfs);
+    TLI.setUnavailable(LibFunc::stpcpy);
+    TLI.setUnavailable(LibFunc::stpncpy);
+    TLI.setUnavailable(LibFunc::strcasecmp);
+    TLI.setUnavailable(LibFunc::strncasecmp);
+    TLI.setUnavailable(LibFunc::times);
+    TLI.setUnavailable(LibFunc::uname);
+    TLI.setUnavailable(LibFunc::unlink);
+    TLI.setUnavailable(LibFunc::unsetenv);
+    TLI.setUnavailable(LibFunc::utime);
+    TLI.setUnavailable(LibFunc::utimes);
+    TLI.setUnavailable(LibFunc::write);
+
+    // Win32 does *not* provide provide these functions, but they are
+    // specified by C99:
+    TLI.setUnavailable(LibFunc::atoll);
+    TLI.setUnavailable(LibFunc::frexpf);
+    TLI.setUnavailable(LibFunc::llabs);
+  }
+
+  switch (T.getOS()) {
+  case Triple::MacOSX:
+    // exp10 and exp10f are not available on OS X until 10.9 and iOS until 7.0
+    // and their names are __exp10 and __exp10f. exp10l is not available on
+    // OS X or iOS.
+    TLI.setUnavailable(LibFunc::exp10l);
+    if (T.isMacOSXVersionLT(10, 9)) {
+      TLI.setUnavailable(LibFunc::exp10);
+      TLI.setUnavailable(LibFunc::exp10f);
+    } else {
+      TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
+      TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
+    }
+    break;
+  case Triple::IOS:
+    TLI.setUnavailable(LibFunc::exp10l);
+    if (T.isOSVersionLT(7, 0)) {
+      TLI.setUnavailable(LibFunc::exp10);
+      TLI.setUnavailable(LibFunc::exp10f);
+    } else {
+      TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
+      TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
+    }
+    break;
+  case Triple::Linux:
+    // exp10, exp10f, exp10l is available on Linux (GLIBC) but are extremely
+    // buggy prior to glibc version 2.18. Until this version is widely deployed
+    // or we have a reasonable detection strategy, we cannot use exp10 reliably
+    // on Linux.
+    //
+    // Fall through to disable all of them.
+  default:
+    TLI.setUnavailable(LibFunc::exp10);
+    TLI.setUnavailable(LibFunc::exp10f);
+    TLI.setUnavailable(LibFunc::exp10l);
+  }
+
+  // ffsl is available on at least Darwin, Mac OS X, iOS, FreeBSD, and
+  // Linux (GLIBC):
+  // http://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man3/ffsl.3.html
+  // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsl.c
+  // http://www.gnu.org/software/gnulib/manual/html_node/ffsl.html
+  switch (T.getOS()) {
+  case Triple::Darwin:
+  case Triple::MacOSX:
+  case Triple::IOS:
+  case Triple::FreeBSD:
+  case Triple::Linux:
+    break;
+  default:
+    TLI.setUnavailable(LibFunc::ffsl);
+  }
+
+  // ffsll is available on at least FreeBSD and Linux (GLIBC):
+  // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsll.c
+  // http://www.gnu.org/software/gnulib/manual/html_node/ffsll.html
+  switch (T.getOS()) {
+  case Triple::FreeBSD:
+  case Triple::Linux:
+    break;
+  default:
+    TLI.setUnavailable(LibFunc::ffsll);
+  }
+
+  // The following functions are available on at least Linux:
+  if (!T.isOSLinux()) {
+    TLI.setUnavailable(LibFunc::dunder_strdup);
+    TLI.setUnavailable(LibFunc::dunder_strtok_r);
+    TLI.setUnavailable(LibFunc::dunder_isoc99_scanf);
+    TLI.setUnavailable(LibFunc::dunder_isoc99_sscanf);
+    TLI.setUnavailable(LibFunc::under_IO_getc);
+    TLI.setUnavailable(LibFunc::under_IO_putc);
+    TLI.setUnavailable(LibFunc::memalign);
+    TLI.setUnavailable(LibFunc::fopen64);
+    TLI.setUnavailable(LibFunc::fseeko64);
+    TLI.setUnavailable(LibFunc::fstat64);
+    TLI.setUnavailable(LibFunc::fstatvfs64);
+    TLI.setUnavailable(LibFunc::ftello64);
+    TLI.setUnavailable(LibFunc::lstat64);
+    TLI.setUnavailable(LibFunc::open64);
+    TLI.setUnavailable(LibFunc::stat64);
+    TLI.setUnavailable(LibFunc::statvfs64);
+    TLI.setUnavailable(LibFunc::tmpfile64);
+  }
+}
+
+TargetLibraryInfoImpl::TargetLibraryInfoImpl() {
+  // Default to everything being available.
+  memset(AvailableArray, -1, sizeof(AvailableArray));
+
+  initialize(*this, Triple(), StandardNames);
+}
+
+TargetLibraryInfoImpl::TargetLibraryInfoImpl(const Triple &T) {
+  // Default to everything being available.
+  memset(AvailableArray, -1, sizeof(AvailableArray));
+
+  initialize(*this, T, StandardNames);
+}
+
+TargetLibraryInfoImpl::TargetLibraryInfoImpl(const TargetLibraryInfoImpl &TLI)
+    : CustomNames(TLI.CustomNames) {
+  memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray));
+}
+
+TargetLibraryInfoImpl::TargetLibraryInfoImpl(TargetLibraryInfoImpl &&TLI)
+    : CustomNames(std::move(TLI.CustomNames)) {
+  std::move(std::begin(TLI.AvailableArray), std::end(TLI.AvailableArray),
+            AvailableArray);
+}
+
+TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(const TargetLibraryInfoImpl &TLI) {
+  CustomNames = TLI.CustomNames;
+  memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray));
+  return *this;
+}
+
+TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(TargetLibraryInfoImpl &&TLI) {
+  CustomNames = std::move(TLI.CustomNames);
+  std::move(std::begin(TLI.AvailableArray), std::end(TLI.AvailableArray),
+            AvailableArray);
+  return *this;
+}
+
+namespace {
+struct StringComparator {
+  /// Compare two strings and return true if LHS is lexicographically less than
+  /// RHS. Requires that RHS doesn't contain any zero bytes.
+  bool operator()(const char *LHS, StringRef RHS) const {
+    // Compare prefixes with strncmp. If prefixes match we know that LHS is
+    // greater or equal to RHS as RHS can't contain any '\0'.
+    return std::strncmp(LHS, RHS.data(), RHS.size()) < 0;
+  }
+
+  // Provided for compatibility with MSVC's debug mode.
+  bool operator()(StringRef LHS, const char *RHS) const { return LHS < RHS; }
+  bool operator()(StringRef LHS, StringRef RHS) const { return LHS < RHS; }
+  bool operator()(const char *LHS, const char *RHS) const {
+    return std::strcmp(LHS, RHS) < 0;
+  }
+};
+}
+
+bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName,
+                                   LibFunc::Func &F) const {
+  const char **Start = &StandardNames[0];
+  const char **End = &StandardNames[LibFunc::NumLibFuncs];
+
+  // Filter out empty names and names containing null bytes, those can't be in
+  // our table.
+  if (funcName.empty() || funcName.find('\0') != StringRef::npos)
+    return false;
+
+  // Check for \01 prefix that is used to mangle __asm declarations and
+  // strip it if present.
+  if (funcName.front() == '\01')
+    funcName = funcName.substr(1);
+  const char **I = std::lower_bound(Start, End, funcName, StringComparator());
+  if (I != End && *I == funcName) {
+    F = (LibFunc::Func)(I - Start);
+    return true;
+  }
+  return false;
+}
+
+void TargetLibraryInfoImpl::disableAllFunctions() {
+  memset(AvailableArray, 0, sizeof(AvailableArray));
+}
+
+TargetLibraryInfo TargetLibraryAnalysis::run(Module &M) {
+  if (PresetInfoImpl)
+    return TargetLibraryInfo(*PresetInfoImpl);
+
+  return TargetLibraryInfo(lookupInfoImpl(Triple(M.getTargetTriple())));
+}
+
+TargetLibraryInfo TargetLibraryAnalysis::run(Function &F) {
+  if (PresetInfoImpl)
+    return TargetLibraryInfo(*PresetInfoImpl);
+
+  return TargetLibraryInfo(
+      lookupInfoImpl(Triple(F.getParent()->getTargetTriple())));
+}
+
+TargetLibraryInfoImpl &TargetLibraryAnalysis::lookupInfoImpl(Triple T) {
+  std::unique_ptr<TargetLibraryInfoImpl> &Impl =
+      Impls[T.normalize()];
+  if (!Impl)
+    Impl.reset(new TargetLibraryInfoImpl(T));
+
+  return *Impl;
+}
+
+
+TargetLibraryInfoWrapperPass::TargetLibraryInfoWrapperPass()
+    : ImmutablePass(ID), TLIImpl(), TLI(TLIImpl) {
+  initializeTargetLibraryInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+TargetLibraryInfoWrapperPass::TargetLibraryInfoWrapperPass(const Triple &T)
+    : ImmutablePass(ID), TLIImpl(T), TLI(TLIImpl) {
+  initializeTargetLibraryInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+TargetLibraryInfoWrapperPass::TargetLibraryInfoWrapperPass(
+    const TargetLibraryInfoImpl &TLIImpl)
+    : ImmutablePass(ID), TLIImpl(TLIImpl), TLI(this->TLIImpl) {
+  initializeTargetLibraryInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+char TargetLibraryAnalysis::PassID;
+
+// Register the basic pass.
+INITIALIZE_PASS(TargetLibraryInfoWrapperPass, "targetlibinfo",
+                "Target Library Information", false, true)
+char TargetLibraryInfoWrapperPass::ID = 0;
+
+void TargetLibraryInfoWrapperPass::anchor() {}
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index c1ffb9d..7ff29b0 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -8,11 +8,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfoImpl.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/ErrorHandling.h"
 
@@ -20,623 +22,290 @@ using namespace llvm;
 
 #define DEBUG_TYPE "tti"
 
-// Setup the analysis group to manage the TargetTransformInfo passes.
-INITIALIZE_ANALYSIS_GROUP(TargetTransformInfo, "Target Information", NoTTI)
-char TargetTransformInfo::ID = 0;
-
-TargetTransformInfo::~TargetTransformInfo() {
+namespace {
+/// \brief No-op implementation of the TTI interface using the utility base
+/// classes.
+///
+/// This is used when no target specific information is available.
+struct NoTTIImpl : TargetTransformInfoImplCRTPBase<NoTTIImpl> {
+  explicit NoTTIImpl(const DataLayout *DL)
+      : TargetTransformInfoImplCRTPBase<NoTTIImpl>(DL) {}
+};
 }
 
-void TargetTransformInfo::pushTTIStack(Pass *P) {
-  TopTTI = this;
-  PrevTTI = &P->getAnalysis<TargetTransformInfo>();
+TargetTransformInfo::TargetTransformInfo(const DataLayout *DL)
+    : TTIImpl(new Model<NoTTIImpl>(NoTTIImpl(DL))) {}
 
-  // Walk up the chain and update the top TTI pointer.
-  for (TargetTransformInfo *PTTI = PrevTTI; PTTI; PTTI = PTTI->PrevTTI)
-    PTTI->TopTTI = this;
-}
+TargetTransformInfo::~TargetTransformInfo() {}
+
+TargetTransformInfo::TargetTransformInfo(TargetTransformInfo &&Arg)
+    : TTIImpl(std::move(Arg.TTIImpl)) {}
 
-void TargetTransformInfo::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<TargetTransformInfo>();
+TargetTransformInfo &TargetTransformInfo::operator=(TargetTransformInfo &&RHS) {
+  TTIImpl = std::move(RHS.TTIImpl);
+  return *this;
 }
 
 unsigned TargetTransformInfo::getOperationCost(unsigned Opcode, Type *Ty,
                                                Type *OpTy) const {
-  return PrevTTI->getOperationCost(Opcode, Ty, OpTy);
-}
-
-unsigned TargetTransformInfo::getGEPCost(
-    const Value *Ptr, ArrayRef<const Value *> Operands) const {
-  return PrevTTI->getGEPCost(Ptr, Operands);
+  return TTIImpl->getOperationCost(Opcode, Ty, OpTy);
 }
 
 unsigned TargetTransformInfo::getCallCost(FunctionType *FTy,
                                           int NumArgs) const {
-  return PrevTTI->getCallCost(FTy, NumArgs);
-}
-
-unsigned TargetTransformInfo::getCallCost(const Function *F,
-                                          int NumArgs) const {
-  return PrevTTI->getCallCost(F, NumArgs);
-}
-
-unsigned TargetTransformInfo::getCallCost(
-    const Function *F, ArrayRef<const Value *> Arguments) const {
-  return PrevTTI->getCallCost(F, Arguments);
+  return TTIImpl->getCallCost(FTy, NumArgs);
 }
 
-unsigned TargetTransformInfo::getIntrinsicCost(
-    Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> ParamTys) const {
-  return PrevTTI->getIntrinsicCost(IID, RetTy, ParamTys);
+unsigned
+TargetTransformInfo::getCallCost(const Function *F,
+                                 ArrayRef<const Value *> Arguments) const {
+  return TTIImpl->getCallCost(F, Arguments);
 }
 
-unsigned TargetTransformInfo::getIntrinsicCost(
-    Intrinsic::ID IID, Type *RetTy, ArrayRef<const Value *> Arguments) const {
-  return PrevTTI->getIntrinsicCost(IID, RetTy, Arguments);
+unsigned
+TargetTransformInfo::getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                                      ArrayRef<const Value *> Arguments) const {
+  return TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);
 }
 
 unsigned TargetTransformInfo::getUserCost(const User *U) const {
-  return PrevTTI->getUserCost(U);
+  return TTIImpl->getUserCost(U);
 }
 
 bool TargetTransformInfo::hasBranchDivergence() const {
-  return PrevTTI->hasBranchDivergence();
+  return TTIImpl->hasBranchDivergence();
 }
 
 bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
-  return PrevTTI->isLoweredToCall(F);
+  return TTIImpl->isLoweredToCall(F);
 }
 
-void
-TargetTransformInfo::getUnrollingPreferences(const Function *F, Loop *L,
-                                             UnrollingPreferences &UP) const {
-  PrevTTI->getUnrollingPreferences(F, L, UP);
+void TargetTransformInfo::getUnrollingPreferences(
+    Loop *L, UnrollingPreferences &UP) const {
+  return TTIImpl->getUnrollingPreferences(L, UP);
 }
 
 bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const {
-  return PrevTTI->isLegalAddImmediate(Imm);
+  return TTIImpl->isLegalAddImmediate(Imm);
 }
 
 bool TargetTransformInfo::isLegalICmpImmediate(int64_t Imm) const {
-  return PrevTTI->isLegalICmpImmediate(Imm);
+  return TTIImpl->isLegalICmpImmediate(Imm);
 }
 
 bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                                 int64_t BaseOffset,
                                                 bool HasBaseReg,
                                                 int64_t Scale) const {
-  return PrevTTI->isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
+  return TTIImpl->isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
                                         Scale);
 }
 
+bool TargetTransformInfo::isLegalMaskedStore(Type *DataType,
+                                             int Consecutive) const {
+  return TTIImpl->isLegalMaskedStore(DataType, Consecutive);
+}
+
+bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType,
+                                            int Consecutive) const {
+  return TTIImpl->isLegalMaskedLoad(DataType, Consecutive);
+}
+
 int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                               int64_t BaseOffset,
                                               bool HasBaseReg,
                                               int64_t Scale) const {
-  return PrevTTI->getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
+  return TTIImpl->getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
                                        Scale);
 }
 
 bool TargetTransformInfo::isTruncateFree(Type *Ty1, Type *Ty2) const {
-  return PrevTTI->isTruncateFree(Ty1, Ty2);
+  return TTIImpl->isTruncateFree(Ty1, Ty2);
+}
+
+bool TargetTransformInfo::isProfitableToHoist(Instruction *I) const {
+  return TTIImpl->isProfitableToHoist(I);
 }
 
 bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
-  return PrevTTI->isTypeLegal(Ty);
+  return TTIImpl->isTypeLegal(Ty);
 }
 
 unsigned TargetTransformInfo::getJumpBufAlignment() const {
-  return PrevTTI->getJumpBufAlignment();
+  return TTIImpl->getJumpBufAlignment();
 }
 
 unsigned TargetTransformInfo::getJumpBufSize() const {
-  return PrevTTI->getJumpBufSize();
+  return TTIImpl->getJumpBufSize();
 }
 
 bool TargetTransformInfo::shouldBuildLookupTables() const {
-  return PrevTTI->shouldBuildLookupTables();
+  return TTIImpl->shouldBuildLookupTables();
 }
 
 TargetTransformInfo::PopcntSupportKind
 TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
-  return PrevTTI->getPopcntSupport(IntTyWidthInBit);
+  return TTIImpl->getPopcntSupport(IntTyWidthInBit);
 }
 
 bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
-  return PrevTTI->haveFastSqrt(Ty);
+  return TTIImpl->haveFastSqrt(Ty);
+}
+
+unsigned TargetTransformInfo::getFPOpCost(Type *Ty) const {
+  return TTIImpl->getFPOpCost(Ty);
 }
 
 unsigned TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const {
-  return PrevTTI->getIntImmCost(Imm, Ty);
+  return TTIImpl->getIntImmCost(Imm, Ty);
 }
 
-unsigned TargetTransformInfo::getIntImmCost(unsigned Opc, unsigned Idx,
+unsigned TargetTransformInfo::getIntImmCost(unsigned Opcode, unsigned Idx,
                                             const APInt &Imm, Type *Ty) const {
-  return PrevTTI->getIntImmCost(Opc, Idx, Imm, Ty);
+  return TTIImpl->getIntImmCost(Opcode, Idx, Imm, Ty);
 }
 
 unsigned TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                             const APInt &Imm, Type *Ty) const {
-  return PrevTTI->getIntImmCost(IID, Idx, Imm, Ty);
+  return TTIImpl->getIntImmCost(IID, Idx, Imm, Ty);
 }
 
 unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
-  return PrevTTI->getNumberOfRegisters(Vector);
+  return TTIImpl->getNumberOfRegisters(Vector);
 }
 
 unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
-  return PrevTTI->getRegisterBitWidth(Vector);
+  return TTIImpl->getRegisterBitWidth(Vector);
 }
 
 unsigned TargetTransformInfo::getMaxInterleaveFactor() const {
-  return PrevTTI->getMaxInterleaveFactor();
+  return TTIImpl->getMaxInterleaveFactor();
 }
 
 unsigned TargetTransformInfo::getArithmeticInstrCost(
-    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
-    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
+    unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
+    OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
     OperandValueProperties Opd2PropInfo) const {
-  return PrevTTI->getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+  return TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                          Opd1PropInfo, Opd2PropInfo);
 }
 
-unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Tp,
+unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty,
                                              int Index, Type *SubTp) const {
-  return PrevTTI->getShuffleCost(Kind, Tp, Index, SubTp);
+  return TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp);
 }
 
 unsigned TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                Type *Src) const {
-  return PrevTTI->getCastInstrCost(Opcode, Dst, Src);
+  return TTIImpl->getCastInstrCost(Opcode, Dst, Src);
 }
 
 unsigned TargetTransformInfo::getCFInstrCost(unsigned Opcode) const {
-  return PrevTTI->getCFInstrCost(Opcode);
+  return TTIImpl->getCFInstrCost(Opcode);
 }
 
 unsigned TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                  Type *CondTy) const {
-  return PrevTTI->getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
 unsigned TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                  unsigned Index) const {
-  return PrevTTI->getVectorInstrCost(Opcode, Val, Index);
+  return TTIImpl->getVectorInstrCost(Opcode, Val, Index);
 }
 
 unsigned TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
                                               unsigned Alignment,
                                               unsigned AddressSpace) const {
-  return PrevTTI->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
-  ;
+  return TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
 }
 
 unsigned
-TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID,
-                                           Type *RetTy,
+TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) const {
+  return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+}
+
+unsigned
+TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                            ArrayRef<Type *> Tys) const {
-  return PrevTTI->getIntrinsicInstrCost(ID, RetTy, Tys);
+  return TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys);
 }
 
 unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
-  return PrevTTI->getNumberOfParts(Tp);
+  return TTIImpl->getNumberOfParts(Tp);
 }
 
 unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp,
                                                         bool IsComplex) const {
-  return PrevTTI->getAddressComputationCost(Tp, IsComplex);
+  return TTIImpl->getAddressComputationCost(Tp, IsComplex);
 }
 
 unsigned TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty,
-                                               bool IsPairwise) const {
-  return PrevTTI->getReductionCost(Opcode, Ty, IsPairwise);
+                                               bool IsPairwiseForm) const {
+  return TTIImpl->getReductionCost(Opcode, Ty, IsPairwiseForm);
 }
 
-unsigned TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys)
-  const {
-  return PrevTTI->getCostOfKeepingLiveOverCall(Tys);
+unsigned
+TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
+  return TTIImpl->getCostOfKeepingLiveOverCall(Tys);
 }
 
-namespace {
+bool TargetTransformInfo::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                             MemIntrinsicInfo &Info) const {
+  return TTIImpl->getTgtMemIntrinsic(Inst, Info);
+}
 
-struct NoTTI final : ImmutablePass, TargetTransformInfo {
-  const DataLayout *DL;
-
-  NoTTI() : ImmutablePass(ID), DL(nullptr) {
-    initializeNoTTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override {
-    // Note that this subclass is special, and must *not* call initializeTTI as
-    // it does not chain.
-    TopTTI = this;
-    PrevTTI = nullptr;
-    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-    DL = DLP ? &DLP->getDataLayout() : nullptr;
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    // Note that this subclass is special, and must *not* call
-    // TTI::getAnalysisUsage as it breaks the recursion.
-  }
-
-  /// Pass identification.
-  static char ID;
-
-  /// Provide necessary pointer adjustments for the two base classes.
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo*)this;
-    return this;
-  }
-
-  unsigned getOperationCost(unsigned Opcode, Type *Ty,
-                            Type *OpTy) const override {
-    switch (Opcode) {
-    default:
-      // By default, just classify everything as 'basic'.
-      return TCC_Basic;
-
-    case Instruction::GetElementPtr:
-      llvm_unreachable("Use getGEPCost for GEP operations!");
-
-    case Instruction::BitCast:
-      assert(OpTy && "Cast instructions must provide the operand type");
-      if (Ty == OpTy || (Ty->isPointerTy() && OpTy->isPointerTy()))
-        // Identity and pointer-to-pointer casts are free.
-        return TCC_Free;
-
-      // Otherwise, the default basic cost is used.
-      return TCC_Basic;
-
-    case Instruction::IntToPtr: {
-      if (!DL)
-        return TCC_Basic;
-
-      // An inttoptr cast is free so long as the input is a legal integer type
-      // which doesn't contain values outside the range of a pointer.
-      unsigned OpSize = OpTy->getScalarSizeInBits();
-      if (DL->isLegalInteger(OpSize) &&
-          OpSize <= DL->getPointerTypeSizeInBits(Ty))
-        return TCC_Free;
-
-      // Otherwise it's not a no-op.
-      return TCC_Basic;
-    }
-    case Instruction::PtrToInt: {
-      if (!DL)
-        return TCC_Basic;
-
-      // A ptrtoint cast is free so long as the result is large enough to store
-      // the pointer, and a legal integer type.
-      unsigned DestSize = Ty->getScalarSizeInBits();
-      if (DL->isLegalInteger(DestSize) &&
-          DestSize >= DL->getPointerTypeSizeInBits(OpTy))
-        return TCC_Free;
-
-      // Otherwise it's not a no-op.
-      return TCC_Basic;
-    }
-    case Instruction::Trunc:
-      // trunc to a native type is free (assuming the target has compare and
-      // shift-right of the same width).
-      if (DL && DL->isLegalInteger(DL->getTypeSizeInBits(Ty)))
-        return TCC_Free;
-
-      return TCC_Basic;
-    }
-  }
-
-  unsigned getGEPCost(const Value *Ptr,
-                      ArrayRef<const Value *> Operands) const override {
-    // In the basic model, we just assume that all-constant GEPs will be folded
-    // into their uses via addressing modes.
-    for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx)
-      if (!isa<Constant>(Operands[Idx]))
-        return TCC_Basic;
-
-    return TCC_Free;
-  }
-
-  unsigned getCallCost(FunctionType *FTy, int NumArgs = -1) const override
-  {
-    assert(FTy && "FunctionType must be provided to this routine.");
-
-    // The target-independent implementation just measures the size of the
-    // function by approximating that each argument will take on average one
-    // instruction to prepare.
-
-    if (NumArgs < 0)
-      // Set the argument number to the number of explicit arguments in the
-      // function.
-      NumArgs = FTy->getNumParams();
-
-    return TCC_Basic * (NumArgs + 1);
-  }
-
-  unsigned getCallCost(const Function *F, int NumArgs = -1) const override
-  {
-    assert(F && "A concrete function must be provided to this routine.");
-
-    if (NumArgs < 0)
-      // Set the argument number to the number of explicit arguments in the
-      // function.
-      NumArgs = F->arg_size();
-
-    if (Intrinsic::ID IID = (Intrinsic::ID)F->getIntrinsicID()) {
-      FunctionType *FTy = F->getFunctionType();
-      SmallVector<Type *, 8> ParamTys(FTy->param_begin(), FTy->param_end());
-      return TopTTI->getIntrinsicCost(IID, FTy->getReturnType(), ParamTys);
-    }
-
-    if (!TopTTI->isLoweredToCall(F))
-      return TCC_Basic; // Give a basic cost if it will be lowered directly.
-
-    return TopTTI->getCallCost(F->getFunctionType(), NumArgs);
-  }
-
-  unsigned getCallCost(const Function *F,
-                       ArrayRef<const Value *> Arguments) const override {
-    // Simply delegate to generic handling of the call.
-    // FIXME: We should use instsimplify or something else to catch calls which
-    // will constant fold with these arguments.
-    return TopTTI->getCallCost(F, Arguments.size());
-  }
-
-  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<Type *> ParamTys) const override {
-    switch (IID) {
-    default:
-      // Intrinsics rarely (if ever) have normal argument setup constraints.
-      // Model them as having a basic instruction cost.
-      // FIXME: This is wrong for libc intrinsics.
-      return TCC_Basic;
-
-    case Intrinsic::annotation:
-    case Intrinsic::assume:
-    case Intrinsic::dbg_declare:
-    case Intrinsic::dbg_value:
-    case Intrinsic::invariant_start:
-    case Intrinsic::invariant_end:
-    case Intrinsic::lifetime_start:
-    case Intrinsic::lifetime_end:
-    case Intrinsic::objectsize:
-    case Intrinsic::ptr_annotation:
-    case Intrinsic::var_annotation:
-      // These intrinsics don't actually represent code after lowering.
-      return TCC_Free;
-    }
-  }
-
-  unsigned
-  getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                   ArrayRef<const Value *> Arguments) const override {
-    // Delegate to the generic intrinsic handling code. This mostly provides an
-    // opportunity for targets to (for example) special case the cost of
-    // certain intrinsics based on constants used as arguments.
-    SmallVector<Type *, 8> ParamTys;
-    ParamTys.reserve(Arguments.size());
-    for (unsigned Idx = 0, Size = Arguments.size(); Idx != Size; ++Idx)
-      ParamTys.push_back(Arguments[Idx]->getType());
-    return TopTTI->getIntrinsicCost(IID, RetTy, ParamTys);
-  }
-
-  unsigned getUserCost(const User *U) const override {
-    if (isa<PHINode>(U))
-      return TCC_Free; // Model all PHI nodes as free.
-
-    if (const GEPOperator *GEP = dyn_cast<GEPOperator>(U)) {
-      SmallVector<const Value *, 4> Indices(GEP->idx_begin(), GEP->idx_end());
-      return TopTTI->getGEPCost(GEP->getPointerOperand(), Indices);
-    }
-
-    if (ImmutableCallSite CS = U) {
-      const Function *F = CS.getCalledFunction();
-      if (!F) {
-        // Just use the called value type.
-        Type *FTy = CS.getCalledValue()->getType()->getPointerElementType();
-        return TopTTI->getCallCost(cast<FunctionType>(FTy), CS.arg_size());
-      }
-
-      SmallVector<const Value *, 8> Arguments(CS.arg_begin(), CS.arg_end());
-      return TopTTI->getCallCost(F, Arguments);
-    }
-
-    if (const CastInst *CI = dyn_cast<CastInst>(U)) {
-      // Result of a cmp instruction is often extended (to be used by other
-      // cmp instructions, logical or return instructions). These are usually
-      // nop on most sane targets.
-      if (isa<CmpInst>(CI->getOperand(0)))
-        return TCC_Free;
-    }
-
-    // Otherwise delegate to the fully generic implementations.
-    return getOperationCost(Operator::getOpcode(U), U->getType(),
-                            U->getNumOperands() == 1 ?
-                                U->getOperand(0)->getType() : nullptr);
-  }
-
-  bool hasBranchDivergence() const override { return false; }
-
-  bool isLoweredToCall(const Function *F) const override {
-    // FIXME: These should almost certainly not be handled here, and instead
-    // handled with the help of TLI or the target itself. This was largely
-    // ported from existing analysis heuristics here so that such refactorings
-    // can take place in the future.
-
-    if (F->isIntrinsic())
-      return false;
-
-    if (F->hasLocalLinkage() || !F->hasName())
-      return true;
-
-    StringRef Name = F->getName();
-
-    // These will all likely lower to a single selection DAG node.
-    if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" ||
-        Name == "fabs" || Name == "fabsf" || Name == "fabsl" || Name == "sin" ||
-        Name == "fmin" || Name == "fminf" || Name == "fminl" ||
-        Name == "fmax" || Name == "fmaxf" || Name == "fmaxl" ||
-        Name == "sinf" || Name == "sinl" || Name == "cos" || Name == "cosf" ||
-        Name == "cosl" || Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl")
-      return false;
-
-    // These are all likely to be optimized into something smaller.
-    if (Name == "pow" || Name == "powf" || Name == "powl" || Name == "exp2" ||
-        Name == "exp2l" || Name == "exp2f" || Name == "floor" || Name ==
-        "floorf" || Name == "ceil" || Name == "round" || Name == "ffs" ||
-        Name == "ffsl" || Name == "abs" || Name == "labs" || Name == "llabs")
-      return false;
-
-    return true;
-  }
-
-  void getUnrollingPreferences(const Function *, Loop *,
-                               UnrollingPreferences &) const override {}
-
-  bool isLegalAddImmediate(int64_t Imm) const override {
-    return false;
-  }
-
-  bool isLegalICmpImmediate(int64_t Imm) const override {
-    return false;
-  }
-
-  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
-                             bool HasBaseReg, int64_t Scale) const override
-  {
-    // Guess that reg+reg addressing is allowed. This heuristic is taken from
-    // the implementation of LSR.
-    return !BaseGV && BaseOffset == 0 && Scale <= 1;
-  }
-
-  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
-                           bool HasBaseReg, int64_t Scale) const override {
-    // Guess that all legal addressing mode are free.
-    if(isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale))
-      return 0;
-    return -1;
-  }
-
-  bool isTruncateFree(Type *Ty1, Type *Ty2) const override {
-    return false;
-  }
-
-  bool isTypeLegal(Type *Ty) const override {
-    return false;
-  }
-
-  unsigned getJumpBufAlignment() const override {
-    return 0;
-  }
-
-  unsigned getJumpBufSize() const override {
-    return 0;
-  }
-
-  bool shouldBuildLookupTables() const override {
-    return true;
-  }
-
-  PopcntSupportKind
-  getPopcntSupport(unsigned IntTyWidthInBit) const override {
-    return PSK_Software;
-  }
-
-  bool haveFastSqrt(Type *Ty) const override {
-    return false;
-  }
-
-  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override {
-    return TCC_Basic;
-  }
-
-  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
-                         Type *Ty) const override {
-    return TCC_Free;
-  }
-
-  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
-                         Type *Ty) const override {
-    return TCC_Free;
-  }
-
-  unsigned getNumberOfRegisters(bool Vector) const override {
-    return 8;
-  }
-
-  unsigned  getRegisterBitWidth(bool Vector) const override {
-    return 32;
-  }
-
-  unsigned getMaxInterleaveFactor() const override {
-    return 1;
-  }
-
-  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
-                                  OperandValueKind, OperandValueProperties,
-                                  OperandValueProperties) const override {
-    return 1;
-  }
-
-  unsigned getShuffleCost(ShuffleKind Kind, Type *Ty,
-                          int Index = 0, Type *SubTp = nullptr) const override {
-    return 1;
-  }
-
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
-                            Type *Src) const override {
-    return 1;
-  }
-
-  unsigned getCFInstrCost(unsigned Opcode) const override {
-    return 1;
-  }
-
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                              Type *CondTy = nullptr) const override {
-    return 1;
-  }
-
-  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
-                              unsigned Index = -1) const override {
-    return 1;
-  }
-
-  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) const override {
-    return 1;
-  }
-
-  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                 ArrayRef<Type*> Tys) const override {
-    return 1;
-  }
-
-  unsigned getNumberOfParts(Type *Tp) const override {
-    return 0;
-  }
-
-  unsigned getAddressComputationCost(Type *Tp, bool) const override {
-    return 0;
-  }
-
-  unsigned getReductionCost(unsigned, Type *, bool) const override {
-    return 1;
-  }
-
-  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const override {
-    return 0;
-  }
+Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic(
+    IntrinsicInst *Inst, Type *ExpectedType) const {
+  return TTIImpl->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
+}
 
-};
+TargetTransformInfo::Concept::~Concept() {}
+
+TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
 
-} // end anonymous namespace
+TargetIRAnalysis::TargetIRAnalysis(
+    std::function<Result(Function &)> TTICallback)
+    : TTICallback(TTICallback) {}
+
+TargetIRAnalysis::Result TargetIRAnalysis::run(Function &F) {
+  return TTICallback(F);
+}
 
-INITIALIZE_AG_PASS(NoTTI, TargetTransformInfo, "notti",
-                   "No target information", true, true, true)
-char NoTTI::ID = 0;
+char TargetIRAnalysis::PassID;
+
+TargetIRAnalysis::Result TargetIRAnalysis::getDefaultTTI(Function &F) {
+  return Result(F.getParent()->getDataLayout());
+}
+
+// Register the basic pass.
+INITIALIZE_PASS(TargetTransformInfoWrapperPass, "tti",
+                "Target Transform Information", false, true)
+char TargetTransformInfoWrapperPass::ID = 0;
+
+void TargetTransformInfoWrapperPass::anchor() {}
+
+TargetTransformInfoWrapperPass::TargetTransformInfoWrapperPass()
+    : ImmutablePass(ID) {
+  initializeTargetTransformInfoWrapperPassPass(
+      *PassRegistry::getPassRegistry());
+}
+
+TargetTransformInfoWrapperPass::TargetTransformInfoWrapperPass(
+    TargetIRAnalysis TIRA)
+    : ImmutablePass(ID), TIRA(std::move(TIRA)) {
+  initializeTargetTransformInfoWrapperPassPass(
+      *PassRegistry::getPassRegistry());
+}
+
+TargetTransformInfo &TargetTransformInfoWrapperPass::getTTI(Function &F) {
+  TTI = TIRA.run(F);
+  return *TTI;
+}
 
-ImmutablePass *llvm::createNoTargetTransformInfoPass() {
-  return new NoTTI();
+ImmutablePass *
+llvm::createTargetTransformInfoWrapperPass(TargetIRAnalysis TIRA) {
+  return new TargetTransformInfoWrapperPass(std::move(TIRA));
 }
diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp
index f347eb5..ff89558 100644
--- a/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -167,7 +167,7 @@ namespace {
     bool TypeIsImmutable() const {
       if (Node->getNumOperands() < 3)
         return false;
-      ConstantInt *CI = dyn_cast<ConstantInt>(Node->getOperand(2));
+      ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(2));
       if (!CI)
         return false;
       return CI->getValue()[0];
@@ -194,7 +194,7 @@ namespace {
       return dyn_cast_or_null<MDNode>(Node->getOperand(1));
     }
     uint64_t getOffset() const {
-      return cast<ConstantInt>(Node->getOperand(2))->getZExtValue();
+      return mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue();
     }
     /// TypeIsImmutable - Test if this TBAAStructTagNode represents a type for
     /// objects which are not modified (by any means) in the context where this
@@ -202,7 +202,7 @@ namespace {
     bool TypeIsImmutable() const {
       if (Node->getNumOperands() < 4)
         return false;
-      ConstantInt *CI = dyn_cast<ConstantInt>(Node->getOperand(3));
+      ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(3));
       if (!CI)
         return false;
       return CI->getValue()[0];
@@ -233,8 +233,10 @@ namespace {
       // Fast path for a scalar type node and a struct type node with a single
       // field.
       if (Node->getNumOperands() <= 3) {
-        uint64_t Cur = Node->getNumOperands() == 2 ? 0 :
-                       cast<ConstantInt>(Node->getOperand(2))->getZExtValue();
+        uint64_t Cur = Node->getNumOperands() == 2
+                           ? 0
+                           : mdconst::extract<ConstantInt>(Node->getOperand(2))
+                                 ->getZExtValue();
         Offset -= Cur;
         MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1));
         if (!P)
@@ -246,8 +248,8 @@ namespace {
       // the current offset is bigger than the given offset.
       unsigned TheIdx = 0;
       for (unsigned Idx = 1; Idx < Node->getNumOperands(); Idx += 2) {
-        uint64_t Cur = cast<ConstantInt>(Node->getOperand(Idx + 1))->
-                         getZExtValue();
+        uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(Idx + 1))
+                           ->getZExtValue();
         if (Cur > Offset) {
           assert(Idx >= 3 &&
                  "TBAAStructTypeNode::getParent should have an offset match!");
@@ -258,8 +260,8 @@ namespace {
       // Move along the last field.
       if (TheIdx == 0)
         TheIdx = Node->getNumOperands() - 2;
-      uint64_t Cur = cast<ConstantInt>(Node->getOperand(TheIdx + 1))->
-                       getZExtValue();
+      uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(TheIdx + 1))
+                         ->getZExtValue();
       Offset -= Cur;
       MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(TheIdx));
       if (!P)
@@ -608,7 +610,8 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
     return nullptr;
   // We need to convert from a type node to a tag node.
   Type *Int64 = IntegerType::get(A->getContext(), 64);
-  Value *Ops[3] = { Ret, Ret, ConstantInt::get(Int64, 0) };
+  Metadata *Ops[3] = {Ret, Ret,
+                      ConstantAsMetadata::get(ConstantInt::get(Int64, 0))};
   return MDNode::get(A->getContext(), Ops);
 }
 
@@ -620,8 +623,8 @@ void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const {
     N.TBAA = getMetadata(LLVMContext::MD_tbaa);
 
   if (Merge)
-    N.Scope =
-        MDNode::intersect(N.Scope, getMetadata(LLVMContext::MD_alias_scope));
+    N.Scope = MDNode::getMostGenericAliasScope(
+        N.Scope, getMetadata(LLVMContext::MD_alias_scope));
   else
     N.Scope = getMetadata(LLVMContext::MD_alias_scope);
 
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index e9bbf83..0458d28 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -13,8 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/IR/CallSite.h"
@@ -65,16 +65,16 @@ namespace {
 // figuring out if we can use it.
 struct Query {
   ExclInvsSet ExclInvs;
-  AssumptionTracker *AT;
+  AssumptionCache *AC;
   const Instruction *CxtI;
   const DominatorTree *DT;
 
-  Query(AssumptionTracker *AT = nullptr, const Instruction *CxtI = nullptr,
+  Query(AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr,
         const DominatorTree *DT = nullptr)
-    : AT(AT), CxtI(CxtI), DT(DT) {}
+      : AC(AC), CxtI(CxtI), DT(DT) {}
 
   Query(const Query &Q, const Value *NewExcl)
-    : ExclInvs(Q.ExclInvs), AT(Q.AT), CxtI(Q.CxtI), DT(Q.DT) {
+      : ExclInvs(Q.ExclInvs), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT) {
     ExclInvs.insert(NewExcl);
   }
 };
@@ -102,10 +102,10 @@ static void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
 
 void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
                             const DataLayout *TD, unsigned Depth,
-                            AssumptionTracker *AT, const Instruction *CxtI,
+                            AssumptionCache *AC, const Instruction *CxtI,
                             const DominatorTree *DT) {
   ::computeKnownBits(V, KnownZero, KnownOne, TD, Depth,
-                     Query(AT, safeCxtI(V, CxtI), DT));
+                     Query(AC, safeCxtI(V, CxtI), DT));
 }
 
 static void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
@@ -114,52 +114,50 @@ static void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
 
 void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
                           const DataLayout *TD, unsigned Depth,
-                          AssumptionTracker *AT, const Instruction *CxtI,
+                          AssumptionCache *AC, const Instruction *CxtI,
                           const DominatorTree *DT) {
   ::ComputeSignBit(V, KnownZero, KnownOne, TD, Depth,
-                   Query(AT, safeCxtI(V, CxtI), DT));
+                   Query(AC, safeCxtI(V, CxtI), DT));
 }
 
 static bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
                                    const Query &Q);
 
 bool llvm::isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
-                                  AssumptionTracker *AT,
-                                  const Instruction *CxtI,
+                                  AssumptionCache *AC, const Instruction *CxtI,
                                   const DominatorTree *DT) {
   return ::isKnownToBeAPowerOfTwo(V, OrZero, Depth,
-                                  Query(AT, safeCxtI(V, CxtI), DT));
+                                  Query(AC, safeCxtI(V, CxtI), DT));
 }
 
 static bool isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth,
                            const Query &Q);
 
 bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth,
-                          AssumptionTracker *AT, const Instruction *CxtI,
+                          AssumptionCache *AC, const Instruction *CxtI,
                           const DominatorTree *DT) {
-  return ::isKnownNonZero(V, TD, Depth, Query(AT, safeCxtI(V, CxtI), DT));
+  return ::isKnownNonZero(V, TD, Depth, Query(AC, safeCxtI(V, CxtI), DT));
 }
 
 static bool MaskedValueIsZero(Value *V, const APInt &Mask,
                               const DataLayout *TD, unsigned Depth,
                               const Query &Q);
 
-bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask,
-                             const DataLayout *TD, unsigned Depth,
-                             AssumptionTracker *AT, const Instruction *CxtI,
-                             const DominatorTree *DT) {
+bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask, const DataLayout *TD,
+                             unsigned Depth, AssumptionCache *AC,
+                             const Instruction *CxtI, const DominatorTree *DT) {
   return ::MaskedValueIsZero(V, Mask, TD, Depth,
-                             Query(AT, safeCxtI(V, CxtI), DT));
+                             Query(AC, safeCxtI(V, CxtI), DT));
 }
 
 static unsigned ComputeNumSignBits(Value *V, const DataLayout *TD,
                                    unsigned Depth, const Query &Q);
 
 unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout *TD,
-                                  unsigned Depth, AssumptionTracker *AT,
+                                  unsigned Depth, AssumptionCache *AC,
                                   const Instruction *CxtI,
                                   const DominatorTree *DT) {
-  return ::ComputeNumSignBits(V, TD, Depth, Query(AT, safeCxtI(V, CxtI), DT));
+  return ::ComputeNumSignBits(V, TD, Depth, Query(AC, safeCxtI(V, CxtI), DT));
 }
 
 static void computeKnownBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
@@ -312,8 +310,10 @@ void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges,
   // Use the high end of the ranges to find leading zeros.
   unsigned MinLeadingZeros = BitWidth;
   for (unsigned i = 0; i < NumRanges; ++i) {
-    ConstantInt *Lower = cast<ConstantInt>(Ranges.getOperand(2*i + 0));
-    ConstantInt *Upper = cast<ConstantInt>(Ranges.getOperand(2*i + 1));
+    ConstantInt *Lower =
+        mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 0));
+    ConstantInt *Upper =
+        mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 1));
     ConstantRange Range(Lower->getValue(), Upper->getValue());
     if (Range.isWrappedSet())
       MinLeadingZeros = 0; // -1 has no zeros
@@ -480,18 +480,31 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
                                        unsigned Depth, const Query &Q) {
   // Use of assumptions is context-sensitive. If we don't have a context, we
   // cannot use them!
-  if (!Q.AT || !Q.CxtI)
+  if (!Q.AC || !Q.CxtI)
     return;
 
   unsigned BitWidth = KnownZero.getBitWidth();
 
-  Function *F = const_cast<Function*>(Q.CxtI->getParent()->getParent());
-  for (auto &CI : Q.AT->assumptions(F)) {
-    CallInst *I = CI;
+  for (auto &AssumeVH : Q.AC->assumptions()) {
+    if (!AssumeVH)
+      continue;
+    CallInst *I = cast<CallInst>(AssumeVH);
+    assert(I->getParent()->getParent() == Q.CxtI->getParent()->getParent() &&
+           "Got assumption for the wrong function!");
     if (Q.ExclInvs.count(I))
       continue;
 
-    if (match(I, m_Intrinsic<Intrinsic::assume>(m_Specific(V))) &&
+    // Warning: This loop can end up being somewhat performance sensetive.
+    // We're running this loop for once for each value queried resulting in a
+    // runtime of ~O(#assumes * #values).
+
+    assert(isa<IntrinsicInst>(I) &&
+           dyn_cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::assume &&
+           "must be an assume intrinsic");
+    
+    Value *Arg = I->getArgOperand(0);
+
+    if (Arg == V &&
         isValidAssumeForContext(I, Q, DL)) {
       assert(BitWidth == 1 && "assume operand is not i1?");
       KnownZero.clearAllBits();
@@ -499,6 +512,10 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       return;
     }
 
+    // The remaining tests are all recursive, so bail out if we hit the limit.
+    if (Depth == MaxDepth)
+      continue;
+
     Value *A, *B;
     auto m_V = m_CombineOr(m_Specific(V),
                            m_CombineOr(m_PtrToInt(m_Specific(V)),
@@ -507,16 +524,15 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
     CmpInst::Predicate Pred;
     ConstantInt *C;
     // assume(v = a)
-    if (match(I, m_Intrinsic<Intrinsic::assume>(
-                   m_c_ICmp(Pred, m_V, m_Value(A)))) &&
+    if (match(Arg, m_c_ICmp(Pred, m_V, m_Value(A))) &&
         Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
       KnownZero |= RHSKnownZero;
       KnownOne  |= RHSKnownOne;
     // assume(v & b = a)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)), m_Value(A)))) &&
+    } else if (match(Arg, m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)),
+                                   m_Value(A))) &&
                Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
@@ -528,9 +544,8 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       KnownZero |= RHSKnownZero & MaskKnownOne;
       KnownOne  |= RHSKnownOne  & MaskKnownOne;
     // assume(~(v & b) = a)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))),
-                                m_Value(A)))) &&
+    } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))),
+                                   m_Value(A))) &&
                Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
@@ -542,8 +557,8 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       KnownZero |= RHSKnownOne  & MaskKnownOne;
       KnownOne  |= RHSKnownZero & MaskKnownOne;
     // assume(v | b = a)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)), m_Value(A)))) &&
+    } else if (match(Arg, m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)),
+                                   m_Value(A))) &&
                Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
@@ -555,9 +570,8 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       KnownZero |= RHSKnownZero & BKnownZero;
       KnownOne  |= RHSKnownOne  & BKnownZero;
     // assume(~(v | b) = a)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))),
-                                m_Value(A)))) &&
+    } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))),
+                                   m_Value(A))) &&
                Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
@@ -569,8 +583,8 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       KnownZero |= RHSKnownOne  & BKnownZero;
       KnownOne  |= RHSKnownZero & BKnownZero;
     // assume(v ^ b = a)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)), m_Value(A)))) &&
+    } else if (match(Arg, m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)),
+                                   m_Value(A))) &&
                Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
@@ -585,9 +599,8 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       KnownZero |= RHSKnownOne  & BKnownOne;
       KnownOne  |= RHSKnownZero & BKnownOne;
     // assume(~(v ^ b) = a)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))),
-                                m_Value(A)))) &&
+    } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))),
+                                   m_Value(A))) &&
                Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
@@ -602,9 +615,8 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       KnownZero |= RHSKnownZero & BKnownOne;
       KnownOne  |= RHSKnownOne  & BKnownOne;
     // assume(v << c = a)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)),
-                                      m_Value(A)))) &&
+    } else if (match(Arg, m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)),
+                                   m_Value(A))) &&
                Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
@@ -613,9 +625,8 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       KnownZero |= RHSKnownZero.lshr(C->getZExtValue());
       KnownOne  |= RHSKnownOne.lshr(C->getZExtValue());
     // assume(~(v << c) = a)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))),
-                                      m_Value(A)))) &&
+    } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))),
+                                   m_Value(A))) &&
                Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
@@ -624,11 +635,11 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       KnownZero |= RHSKnownOne.lshr(C->getZExtValue());
       KnownOne  |= RHSKnownZero.lshr(C->getZExtValue());
     // assume(v >> c = a)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_c_ICmp(Pred, m_CombineOr(m_LShr(m_V, m_ConstantInt(C)),
+    } else if (match(Arg,
+                     m_c_ICmp(Pred, m_CombineOr(m_LShr(m_V, m_ConstantInt(C)),
                                                   m_AShr(m_V,
                                                          m_ConstantInt(C))),
-                                     m_Value(A)))) &&
+                                     m_Value(A))) &&
                Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
@@ -637,11 +648,10 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       KnownZero |= RHSKnownZero << C->getZExtValue();
       KnownOne  |= RHSKnownOne  << C->getZExtValue();
     // assume(~(v >> c) = a)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_c_ICmp(Pred, m_Not(m_CombineOr(
+    } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_CombineOr(
                                               m_LShr(m_V, m_ConstantInt(C)),
                                               m_AShr(m_V, m_ConstantInt(C)))),
-                                     m_Value(A)))) &&
+                                   m_Value(A))) &&
                Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
@@ -650,8 +660,7 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       KnownZero |= RHSKnownOne  << C->getZExtValue();
       KnownOne  |= RHSKnownZero << C->getZExtValue();
     // assume(v >=_s c) where c is non-negative
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+    } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
                Pred == ICmpInst::ICMP_SGE &&
                isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
@@ -662,8 +671,7 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
         KnownZero |= APInt::getSignBit(BitWidth);
       }
     // assume(v >_s c) where c is at least -1.
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+    } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
                Pred == ICmpInst::ICMP_SGT &&
                isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
@@ -674,8 +682,7 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
         KnownZero |= APInt::getSignBit(BitWidth);
       }
     // assume(v <=_s c) where c is negative
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+    } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
                Pred == ICmpInst::ICMP_SLE &&
                isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
@@ -686,8 +693,7 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
         KnownOne |= APInt::getSignBit(BitWidth);
       }
     // assume(v <_s c) where c is non-positive
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+    } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
                Pred == ICmpInst::ICMP_SLT &&
                isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
@@ -698,8 +704,7 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
         KnownOne |= APInt::getSignBit(BitWidth);
       }
     // assume(v <=_u c)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+    } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
                Pred == ICmpInst::ICMP_ULE &&
                isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
@@ -709,8 +714,7 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
       KnownZero |=
         APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes());
     // assume(v <_u c)
-    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
-                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+    } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
                Pred == ICmpInst::ICMP_ULT &&
                isValidAssumeForContext(I, Q, DL)) {
       APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
@@ -790,22 +794,11 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
     return;
   }
 
-  // A weak GlobalAlias is totally unknown. A non-weak GlobalAlias has
-  // the bits of its aliasee.
-  if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
-    if (GA->mayBeOverridden()) {
-      KnownZero.clearAllBits(); KnownOne.clearAllBits();
-    } else {
-      computeKnownBits(GA->getAliasee(), KnownZero, KnownOne, TD, Depth+1, Q);
-    }
-    return;
-  }
-
   // The address of an aligned GlobalValue has trailing zeros.
-  if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
-    unsigned Align = GV->getAlignment();
+  if (auto *GO = dyn_cast<GlobalObject>(V)) {
+    unsigned Align = GO->getAlignment();
     if (Align == 0 && TD) {
-      if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) {
+      if (auto *GVar = dyn_cast<GlobalVariable>(GO)) {
         Type *ObjectType = GVar->getType()->getElementType();
         if (ObjectType->isSized()) {
           // If the object is defined in the current Module, we'll be giving
@@ -839,6 +832,9 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
 
     if (Align)
       KnownZero = APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
+    else
+      KnownZero.clearAllBits();
+    KnownOne.clearAllBits();
 
     // Don't give up yet... there might be an assumption that provides more
     // information...
@@ -849,8 +845,18 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
   // Start out not knowing anything.
   KnownZero.clearAllBits(); KnownOne.clearAllBits();
 
+  // Limit search depth.
+  // All recursive calls that increase depth must come after this.
   if (Depth == MaxDepth)
-    return;  // Limit search depth.
+    return;  
+
+  // A weak GlobalAlias is totally unknown. A non-weak GlobalAlias has
+  // the bits of its aliasee.
+  if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+    if (!GA->mayBeOverridden())
+      computeKnownBits(GA->getAliasee(), KnownZero, KnownOne, TD, Depth + 1, Q);
+    return;
+  }
 
   // Check whether a nearby assume intrinsic can determine some known bits.
   computeKnownBitsFromAssume(V, KnownZero, KnownOne, TD, Depth, Q);
@@ -1507,8 +1513,10 @@ static bool rangeMetadataExcludesValue(MDNode* Ranges,
   const unsigned NumRanges = Ranges->getNumOperands() / 2;
   assert(NumRanges >= 1);
   for (unsigned i = 0; i < NumRanges; ++i) {
-    ConstantInt *Lower = cast<ConstantInt>(Ranges->getOperand(2*i + 0));
-    ConstantInt *Upper = cast<ConstantInt>(Ranges->getOperand(2*i + 1));
+    ConstantInt *Lower =
+        mdconst::extract<ConstantInt>(Ranges->getOperand(2 * i + 0));
+    ConstantInt *Upper =
+        mdconst::extract<ConstantInt>(Ranges->getOperand(2 * i + 1));
     ConstantRange Range(Lower->getValue(), Upper->getValue());
     if (Range.contains(Value))
       return false;
@@ -1764,7 +1772,7 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout *TD,
     if (Tmp == 1) return 1;  // Early out.
 
     // Special case decrementing a value (ADD X, -1):
-    if (ConstantInt *CRHS = dyn_cast<ConstantInt>(U->getOperand(1)))
+    if (const auto *CRHS = dyn_cast<Constant>(U->getOperand(1)))
       if (CRHS->isAllOnesValue()) {
         APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
         computeKnownBits(U->getOperand(0), KnownZero, KnownOne, TD, Depth+1, Q);
@@ -1789,7 +1797,7 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout *TD,
     if (Tmp2 == 1) return 1;
 
     // Handle NEG.
-    if (ConstantInt *CLHS = dyn_cast<ConstantInt>(U->getOperand(0)))
+    if (const auto *CLHS = dyn_cast<Constant>(U->getOperand(0)))
       if (CLHS->isNullValue()) {
         APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
         computeKnownBits(U->getOperand(1), KnownZero, KnownOne, TD, Depth+1, Q);
@@ -1814,13 +1822,16 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout *TD,
 
   case Instruction::PHI: {
     PHINode *PN = cast<PHINode>(U);
+    unsigned NumIncomingValues = PN->getNumIncomingValues();
     // Don't analyze large in-degree PHIs.
-    if (PN->getNumIncomingValues() > 4) break;
+    if (NumIncomingValues > 4) break;
+    // Unreachable blocks may have zero-operand PHI nodes.
+    if (NumIncomingValues == 0) break;
 
     // Take the minimum of all incoming values.  This can't infinitely loop
     // because of our depth threshold.
     Tmp = ComputeNumSignBits(PN->getIncomingValue(0), TD, Depth+1, Q);
-    for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
+    for (unsigned i = 1, e = NumIncomingValues; i != e; ++i) {
       if (Tmp == 1) return Tmp;
       Tmp = std::min(Tmp,
                      ComputeNumSignBits(PN->getIncomingValue(i), TD,
@@ -1989,8 +2000,11 @@ bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) {
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
     return !CFP->getValueAPF().isNegZero();
 
+  // FIXME: Magic number! At the least, this should be given a name because it's
+  // used similarly in CannotBeOrderedLessThanZero(). A better fix may be to
+  // expose it as a parameter, so it can be used for testing / experimenting.
   if (Depth == 6)
-    return 1;  // Limit search depth.
+    return false;  // Limit search depth.
 
   const Operator *I = dyn_cast<Operator>(V);
   if (!I) return false;
@@ -2033,6 +2047,62 @@ bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) {
   return false;
 }
 
+bool llvm::CannotBeOrderedLessThanZero(const Value *V, unsigned Depth) {
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
+    return !CFP->getValueAPF().isNegative() || CFP->getValueAPF().isZero();
+
+  // FIXME: Magic number! At the least, this should be given a name because it's
+  // used similarly in CannotBeNegativeZero(). A better fix may be to
+  // expose it as a parameter, so it can be used for testing / experimenting.
+  if (Depth == 6)
+    return false;  // Limit search depth.
+
+  const Operator *I = dyn_cast<Operator>(V);
+  if (!I) return false;
+
+  switch (I->getOpcode()) {
+  default: break;
+  case Instruction::FMul:
+    // x*x is always non-negative or a NaN.
+    if (I->getOperand(0) == I->getOperand(1)) 
+      return true;
+    // Fall through
+  case Instruction::FAdd:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+    return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) &&
+           CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1);
+  case Instruction::FPExt:
+  case Instruction::FPTrunc:
+    // Widening/narrowing never change sign.
+    return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1);
+  case Instruction::Call: 
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) 
+      switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::exp:
+      case Intrinsic::exp2:
+      case Intrinsic::fabs:
+      case Intrinsic::sqrt:
+        return true;
+      case Intrinsic::powi: 
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+          // powi(x,n) is non-negative if n is even.
+          if (CI->getBitWidth() <= 64 && CI->getSExtValue() % 2u == 0)
+            return true;
+        }
+        return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1);
+      case Intrinsic::fma:
+      case Intrinsic::fmuladd:
+        // x*x+y is non-negative if y is non-negative.
+        return I->getOperand(0) == I->getOperand(1) && 
+               CannotBeOrderedLessThanZero(I->getOperand(2), Depth+1);
+      }
+    break;
+  }
+  return false; 
+}
+
 /// If the specified value can be set by repeating the same byte in memory,
 /// return the i8 value that it is represented with.  This is
 /// true for all i8 values obviously, but is also true for i32 0, i32 -1,
@@ -2057,26 +2127,16 @@ Value *llvm::isBytewiseValue(Value *V) {
     // Don't handle long double formats, which have strange constraints.
   }
 
-  // We can handle constant integers that are power of two in size and a
-  // multiple of 8 bits.
+  // We can handle constant integers that are multiple of 8 bits.
   if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
-    unsigned Width = CI->getBitWidth();
-    if (isPowerOf2_32(Width) && Width > 8) {
-      // We can handle this value if the recursive binary decomposition is the
-      // same at all levels.
-      APInt Val = CI->getValue();
-      APInt Val2;
-      while (Val.getBitWidth() != 8) {
-        unsigned NextWidth = Val.getBitWidth()/2;
-        Val2  = Val.lshr(NextWidth);
-        Val2 = Val2.trunc(Val.getBitWidth()/2);
-        Val = Val.trunc(Val.getBitWidth()/2);
-
-        // If the top/bottom halves aren't the same, reject it.
-        if (Val != Val2)
-          return nullptr;
-      }
-      return ConstantInt::get(V->getContext(), Val);
+    if (CI->getBitWidth() % 8 == 0) {
+      assert(CI->getBitWidth() > 8 && "8 bits should be handled above!");
+
+      // We can check that all bytes of an integer are equal by making use of a
+      // little trick: rotate by 8 and check if it's still the same value.
+      if (CI->getValue() != CI->getValue().rotl(8))
+        return nullptr;
+      return ConstantInt::get(V->getContext(), CI->getValue().trunc(8));
     }
   }
 
@@ -2474,7 +2534,7 @@ llvm::GetUnderlyingObject(Value *V, const DataLayout *TD, unsigned MaxLookup) {
     } else {
       // See if InstructionSimplify knows any relevant tricks.
       if (Instruction *I = dyn_cast<Instruction>(V))
-        // TODO: Acquire a DominatorTree and AssumptionTracker and use them.
+        // TODO: Acquire a DominatorTree and AssumptionCache and use them.
         if (Value *Simplified = SimplifyInstruction(I, TD, nullptr)) {
           V = Simplified;
           continue;
@@ -2556,20 +2616,20 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
   case Instruction::SDiv:
   case Instruction::SRem: {
     // x / y is undefined if y == 0 or x == INT_MIN and y == -1
-    const APInt *X, *Y;
-    if (match(Inst->getOperand(1), m_APInt(Y))) {
-      if (*Y != 0) {
-        if (*Y == -1) {
-          // The numerator can't be MinSignedValue if the denominator is -1.
-          if (match(Inst->getOperand(0), m_APInt(X)))
-            return !Y->isMinSignedValue();
-          // The numerator *might* be MinSignedValue.
-          return false;
-        }
-        // The denominator is not 0 or -1, it's safe to proceed.
-        return true;
-      }
-    }
+    const APInt *Numerator, *Denominator;
+    if (!match(Inst->getOperand(1), m_APInt(Denominator)))
+      return false;
+    // We cannot hoist this division if the denominator is 0.
+    if (*Denominator == 0)
+      return false;
+    // It's safe to hoist if the denominator is not 0 or -1.
+    if (*Denominator != -1)
+      return true;
+    // At this point we know that the denominator is -1.  It is safe to hoist as
+    // long we know that the numerator is not INT_MIN.
+    if (match(Inst->getOperand(0), m_APInt(Numerator)))
+      return !Numerator->isMinSignedValue();
+    // The numerator *might* be MinSignedValue.
     return false;
   }
   case Instruction::Load: {
@@ -2668,3 +2728,82 @@ bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) {
 
   return false;
 }
+
+OverflowResult llvm::computeOverflowForUnsignedMul(Value *LHS, Value *RHS,
+                                                   const DataLayout *DL,
+                                                   AssumptionCache *AC,
+                                                   const Instruction *CxtI,
+                                                   const DominatorTree *DT) {
+  // Multiplying n * m significant bits yields a result of n + m significant
+  // bits. If the total number of significant bits does not exceed the
+  // result bit width (minus 1), there is no overflow.
+  // This means if we have enough leading zero bits in the operands
+  // we can guarantee that the result does not overflow.
+  // Ref: "Hacker's Delight" by Henry Warren
+  unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
+  APInt LHSKnownZero(BitWidth, 0);
+  APInt LHSKnownOne(BitWidth, 0);
+  APInt RHSKnownZero(BitWidth, 0);
+  APInt RHSKnownOne(BitWidth, 0);
+  computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, DL, /*Depth=*/0, AC, CxtI,
+                   DT);
+  computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, DL, /*Depth=*/0, AC, CxtI,
+                   DT);
+  // Note that underestimating the number of zero bits gives a more
+  // conservative answer.
+  unsigned ZeroBits = LHSKnownZero.countLeadingOnes() +
+                      RHSKnownZero.countLeadingOnes();
+  // First handle the easy case: if we have enough zero bits there's
+  // definitely no overflow.
+  if (ZeroBits >= BitWidth)
+    return OverflowResult::NeverOverflows;
+
+  // Get the largest possible values for each operand.
+  APInt LHSMax = ~LHSKnownZero;
+  APInt RHSMax = ~RHSKnownZero;
+
+  // We know the multiply operation doesn't overflow if the maximum values for
+  // each operand will not overflow after we multiply them together.
+  bool MaxOverflow;
+  LHSMax.umul_ov(RHSMax, MaxOverflow);
+  if (!MaxOverflow)
+    return OverflowResult::NeverOverflows;
+
+  // We know it always overflows if multiplying the smallest possible values for
+  // the operands also results in overflow.
+  bool MinOverflow;
+  LHSKnownOne.umul_ov(RHSKnownOne, MinOverflow);
+  if (MinOverflow)
+    return OverflowResult::AlwaysOverflows;
+
+  return OverflowResult::MayOverflow;
+}
+
+OverflowResult llvm::computeOverflowForUnsignedAdd(Value *LHS, Value *RHS,
+                                                   const DataLayout *DL,
+                                                   AssumptionCache *AC,
+                                                   const Instruction *CxtI,
+                                                   const DominatorTree *DT) {
+  bool LHSKnownNonNegative, LHSKnownNegative;
+  ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
+                 AC, CxtI, DT);
+  if (LHSKnownNonNegative || LHSKnownNegative) {
+    bool RHSKnownNonNegative, RHSKnownNegative;
+    ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
+                   AC, CxtI, DT);
+
+    if (LHSKnownNegative && RHSKnownNegative) {
+      // The sign bit is set in both cases: this MUST overflow.
+      // Create a simple add instruction, and insert it into the struct.
+      return OverflowResult::AlwaysOverflows;
+    }
+
+    if (LHSKnownNonNegative && RHSKnownNonNegative) {
+      // The sign bit is clear in both cases: this CANNOT overflow.
+      // Create a simple add instruction, and insert it into the struct.
+      return OverflowResult::NeverOverflows;
+    }
+  }
+
+  return OverflowResult::MayOverflow;
+}
diff --git a/lib/AsmParser/CMakeLists.txt b/lib/AsmParser/CMakeLists.txt
index 985ebe2..7866837 100644
--- a/lib/AsmParser/CMakeLists.txt
+++ b/lib/AsmParser/CMakeLists.txt
@@ -3,4 +3,7 @@ add_llvm_library(LLVMAsmParser
   LLLexer.cpp
   LLParser.cpp
   Parser.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Analysis
   )
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 6523bce..3bf090a 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -78,13 +78,15 @@ uint64_t LLLexer::HexIntToVal(const char *Buffer, const char *End) {
 void LLLexer::HexToIntPair(const char *Buffer, const char *End,
                            uint64_t Pair[2]) {
   Pair[0] = 0;
-  for (int i=0; i<16; i++, Buffer++) {
-    assert(Buffer != End);
-    Pair[0] *= 16;
-    Pair[0] += hexDigitValue(*Buffer);
+  if (End - Buffer >= 16) {
+    for (int i = 0; i < 16; i++, Buffer++) {
+      assert(Buffer != End);
+      Pair[0] *= 16;
+      Pair[0] += hexDigitValue(*Buffer);
+    }
   }
   Pair[1] = 0;
-  for (int i=0; i<16 && Buffer != End; i++, Buffer++) {
+  for (int i = 0; i < 16 && Buffer != End; i++, Buffer++) {
     Pair[1] *= 16;
     Pair[1] += hexDigitValue(*Buffer);
   }
@@ -239,7 +241,7 @@ lltok::Kind LLLexer::LexToken() {
   case ')': return lltok::rparen;
   case ',': return lltok::comma;
   case '*': return lltok::star;
-  case '\\': return lltok::backslash;
+  case '|': return lltok::bar;
   }
 }
 
@@ -255,46 +257,7 @@ void LLLexer::SkipLineComment() {
 ///   GlobalVar   @[-a-zA-Z$._][-a-zA-Z$._0-9]*
 ///   GlobalVarID @[0-9]+
 lltok::Kind LLLexer::LexAt() {
-  // Handle AtStringConstant: @\"[^\"]*\"
-  if (CurPtr[0] == '"') {
-    ++CurPtr;
-
-    while (1) {
-      int CurChar = getNextChar();
-
-      if (CurChar == EOF) {
-        Error("end of file in global variable name");
-        return lltok::Error;
-      }
-      if (CurChar == '"') {
-        StrVal.assign(TokStart+2, CurPtr-1);
-        UnEscapeLexed(StrVal);
-        if (StringRef(StrVal).find_first_of(0) != StringRef::npos) {
-          Error("Null bytes are not allowed in names");
-          return lltok::Error;
-        }
-        return lltok::GlobalVar;
-      }
-    }
-  }
-
-  // Handle GlobalVarName: @[-a-zA-Z$._][-a-zA-Z$._0-9]*
-  if (ReadVarName())
-    return lltok::GlobalVar;
-
-  // Handle GlobalVarID: @[0-9]+
-  if (isdigit(static_cast<unsigned char>(CurPtr[0]))) {
-    for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
-      /*empty*/;
-
-    uint64_t Val = atoull(TokStart+1, CurPtr);
-    if ((unsigned)Val != Val)
-      Error("invalid value number (too large)!");
-    UIntVal = unsigned(Val);
-    return lltok::GlobalID;
-  }
-
-  return lltok::Error;
+  return LexVar(lltok::GlobalVar, lltok::GlobalID);
 }
 
 lltok::Kind LLLexer::LexDollar() {
@@ -370,22 +333,35 @@ bool LLLexer::ReadVarName() {
   return false;
 }
 
-/// LexPercent - Lex all tokens that start with a % character:
-///   LocalVar   ::= %\"[^\"]*\"
-///   LocalVar   ::= %[-a-zA-Z$._][-a-zA-Z$._0-9]*
-///   LocalVarID ::= %[0-9]+
-lltok::Kind LLLexer::LexPercent() {
-  // Handle LocalVarName: %\"[^\"]*\"
+lltok::Kind LLLexer::LexVar(lltok::Kind Var, lltok::Kind VarID) {
+  // Handle StringConstant: \"[^\"]*\"
   if (CurPtr[0] == '"') {
     ++CurPtr;
-    return ReadString(lltok::LocalVar);
+
+    while (1) {
+      int CurChar = getNextChar();
+
+      if (CurChar == EOF) {
+        Error("end of file in global variable name");
+        return lltok::Error;
+      }
+      if (CurChar == '"') {
+        StrVal.assign(TokStart+2, CurPtr-1);
+        UnEscapeLexed(StrVal);
+        if (StringRef(StrVal).find_first_of(0) != StringRef::npos) {
+          Error("Null bytes are not allowed in names");
+          return lltok::Error;
+        }
+        return Var;
+      }
+    }
   }
 
-  // Handle LocalVarName: %[-a-zA-Z$._][-a-zA-Z$._0-9]*
+  // Handle VarName: [-a-zA-Z$._][-a-zA-Z$._0-9]*
   if (ReadVarName())
-    return lltok::LocalVar;
+    return Var;
 
-  // Handle LocalVarID: %[0-9]+
+  // Handle VarID: [0-9]+
   if (isdigit(static_cast<unsigned char>(CurPtr[0]))) {
     for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
       /*empty*/;
@@ -394,12 +370,19 @@ lltok::Kind LLLexer::LexPercent() {
     if ((unsigned)Val != Val)
       Error("invalid value number (too large)!");
     UIntVal = unsigned(Val);
-    return lltok::LocalVarID;
+    return VarID;
   }
-
   return lltok::Error;
 }
 
+/// LexPercent - Lex all tokens that start with a % character:
+///   LocalVar   ::= %\"[^\"]*\"
+///   LocalVar   ::= %[-a-zA-Z$._][-a-zA-Z$._0-9]*
+///   LocalVarID ::= %[0-9]+
+lltok::Kind LLLexer::LexPercent() {
+  return LexVar(lltok::LocalVar, lltok::LocalVarID);
+}
+
 /// LexQuote - Lex all tokens that start with a " character:
 ///   QuoteLabel        "[^"]+":
 ///   StringConstant    "[^"]*"
@@ -410,7 +393,12 @@ lltok::Kind LLLexer::LexQuote() {
 
   if (CurPtr[0] == ':') {
     ++CurPtr;
-    kind = lltok::LabelStr;
+    if (StringRef(StrVal).find_first_of(0) != StringRef::npos) {
+      Error("Null bytes are not allowed in names");
+      kind = lltok::Error;
+    } else {
+      kind = lltok::LabelStr;
+    }
   }
 
   return kind;
@@ -499,11 +487,11 @@ lltok::Kind LLLexer::LexIdentifier() {
   if (!KeywordEnd) KeywordEnd = CurPtr;
   CurPtr = KeywordEnd;
   --StartChar;
-  unsigned Len = CurPtr-StartChar;
-#define KEYWORD(STR)                                                    \
-  do {                                                                  \
-    if (Len == strlen(#STR) && !memcmp(StartChar, #STR, strlen(#STR)))  \
-      return lltok::kw_##STR;                                           \
+  StringRef Keyword(StartChar, CurPtr - StartChar);
+#define KEYWORD(STR)                                                           \
+  do {                                                                         \
+    if (Keyword == #STR)                                                       \
+      return lltok::kw_##STR;                                                  \
   } while (0)
 
   KEYWORD(true);    KEYWORD(false);
@@ -573,6 +561,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(inteldialect);
   KEYWORD(gc);
   KEYWORD(prefix);
+  KEYWORD(prologue);
 
   KEYWORD(ccc);
   KEYWORD(fastcc);
@@ -596,6 +585,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(anyregcc);
   KEYWORD(preserve_mostcc);
   KEYWORD(preserve_allcc);
+  KEYWORD(ghccc);
 
   KEYWORD(cc);
   KEYWORD(c);
@@ -665,6 +655,9 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(x);
   KEYWORD(blockaddress);
 
+  // Metadata types.
+  KEYWORD(distinct);
+
   // Use-list order directives.
   KEYWORD(uselistorder);
   KEYWORD(uselistorder_bb);
@@ -676,9 +669,13 @@ lltok::Kind LLLexer::LexIdentifier() {
 #undef KEYWORD
 
   // Keywords for types.
-#define TYPEKEYWORD(STR, LLVMTY) \
-  if (Len == strlen(STR) && !memcmp(StartChar, STR, strlen(STR))) { \
-    TyVal = LLVMTY; return lltok::Type; }
+#define TYPEKEYWORD(STR, LLVMTY)                                               \
+  do {                                                                         \
+    if (Keyword == STR) {                                                      \
+      TyVal = LLVMTY;                                                          \
+      return lltok::Type;                                                      \
+    }                                                                          \
+  } while (false)
   TYPEKEYWORD("void",      Type::getVoidTy(Context));
   TYPEKEYWORD("half",      Type::getHalfTy(Context));
   TYPEKEYWORD("float",     Type::getFloatTy(Context));
@@ -692,9 +689,13 @@ lltok::Kind LLLexer::LexIdentifier() {
 #undef TYPEKEYWORD
 
   // Keywords for instructions.
-#define INSTKEYWORD(STR, Enum) \
-  if (Len == strlen(#STR) && !memcmp(StartChar, #STR, strlen(#STR))) { \
-    UIntVal = Instruction::Enum; return lltok::kw_##STR; }
+#define INSTKEYWORD(STR, Enum)                                                 \
+  do {                                                                         \
+    if (Keyword == #STR) {                                                     \
+      UIntVal = Instruction::Enum;                                             \
+      return lltok::kw_##STR;                                                  \
+    }                                                                          \
+  } while (false)
 
   INSTKEYWORD(add,   Add);  INSTKEYWORD(fadd,   FAdd);
   INSTKEYWORD(sub,   Sub);  INSTKEYWORD(fsub,   FSub);
@@ -746,6 +747,25 @@ lltok::Kind LLLexer::LexIdentifier() {
   INSTKEYWORD(landingpad,     LandingPad);
 #undef INSTKEYWORD
 
+#define DWKEYWORD(TYPE, TOKEN)                                                 \
+  do {                                                                         \
+    if (Keyword.startswith("DW_" #TYPE "_")) {                                 \
+      StrVal.assign(Keyword.begin(), Keyword.end());                           \
+      return lltok::TOKEN;                                                     \
+    }                                                                          \
+  } while (false)
+  DWKEYWORD(TAG, DwarfTag);
+  DWKEYWORD(ATE, DwarfAttEncoding);
+  DWKEYWORD(VIRTUALITY, DwarfVirtuality);
+  DWKEYWORD(LANG, DwarfLang);
+  DWKEYWORD(OP, DwarfOp);
+#undef DWKEYWORD
+
+  if (Keyword.startswith("DIFlag")) {
+    StrVal.assign(Keyword.begin(), Keyword.end());
+    return lltok::DIFlag;
+  }
+
   // Check for [us]0x[0-9A-Fa-f]+ which are Hexadecimal constant generated by
   // the CFE to avoid forcing it to deal with 64-bit numbers.
   if ((TokStart[0] == 'u' || TokStart[0] == 's') &&
@@ -753,7 +773,13 @@ lltok::Kind LLLexer::LexIdentifier() {
       isxdigit(static_cast<unsigned char>(TokStart[3]))) {
     int len = CurPtr-TokStart-3;
     uint32_t bits = len * 4;
-    APInt Tmp(bits, StringRef(TokStart+3, len), 16);
+    StringRef HexStr(TokStart + 3, len);
+    if (!std::all_of(HexStr.begin(), HexStr.end(), isxdigit)) {
+      // Bad token, return it as an error.
+      CurPtr = TokStart+3;
+      return lltok::Error;
+    }
+    APInt Tmp(bits, HexStr, 16);
     uint32_t activeBits = Tmp.getActiveBits();
     if (activeBits > 0 && activeBits < bits)
       Tmp = Tmp.trunc(activeBits);
diff --git a/lib/AsmParser/LLLexer.h b/lib/AsmParser/LLLexer.h
index 219827f..3343168 100644
--- a/lib/AsmParser/LLLexer.h
+++ b/lib/AsmParser/LLLexer.h
@@ -82,6 +82,7 @@ namespace llvm {
     lltok::Kind LexDollar();
     lltok::Kind LexExclaim();
     lltok::Kind LexPercent();
+    lltok::Kind LexVar(lltok::Kind Var, lltok::Kind VarID);
     lltok::Kind LexQuote();
     lltok::Kind Lex0x();
     lltok::Kind LexHash();
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 2c835f9..9e7354e 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -16,6 +16,8 @@
 #include "llvm/IR/AutoUpgrade.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
@@ -23,6 +25,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SaveAndRestore.h"
 #include "llvm/Support/raw_ostream.h"
@@ -47,27 +50,6 @@ bool LLParser::Run() {
 /// ValidateEndOfModule - Do final validity and sanity checks at the end of the
 /// module.
 bool LLParser::ValidateEndOfModule() {
-  // Handle any instruction metadata forward references.
-  if (!ForwardRefInstMetadata.empty()) {
-    for (DenseMap<Instruction*, std::vector<MDRef> >::iterator
-         I = ForwardRefInstMetadata.begin(), E = ForwardRefInstMetadata.end();
-         I != E; ++I) {
-      Instruction *Inst = I->first;
-      const std::vector<MDRef> &MDList = I->second;
-
-      for (unsigned i = 0, e = MDList.size(); i != e; ++i) {
-        unsigned SlotNo = MDList[i].MDSlot;
-
-        if (SlotNo >= NumberedMetadata.size() ||
-            NumberedMetadata[SlotNo] == nullptr)
-          return Error(MDList[i].Loc, "use of undefined metadata '!" +
-                       Twine(SlotNo) + "'");
-        Inst->setMetadata(MDList[i].MDKind, NumberedMetadata[SlotNo]);
-      }
-    }
-    ForwardRefInstMetadata.clear();
-  }
-
   for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++)
     UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
 
@@ -136,10 +118,10 @@ bool LLParser::ValidateEndOfModule() {
     return Error(ForwardRefBlockAddresses.begin()->first.Loc,
                  "expected function name in blockaddress");
 
-  for (unsigned i = 0, e = NumberedTypes.size(); i != e; ++i)
-    if (NumberedTypes[i].second.isValid())
-      return Error(NumberedTypes[i].second,
-                   "use of undefined type '%" + Twine(i) + "'");
+  for (const auto &NT : NumberedTypes)
+    if (NT.second.second.isValid())
+      return Error(NT.second.second,
+                   "use of undefined type '%" + Twine(NT.first) + "'");
 
   for (StringMap<std::pair<Type*, LocTy> >::iterator I =
        NamedTypes.begin(), E = NamedTypes.end(); I != E; ++I)
@@ -167,6 +149,11 @@ bool LLParser::ValidateEndOfModule() {
                  "use of undefined metadata '!" +
                  Twine(ForwardRefMDNodes.begin()->first) + "'");
 
+  // Resolve metadata cycles.
+  for (auto &N : NumberedMetadata) {
+    if (N.second && !N.second->isResolved())
+      N.second->resolveCycles();
+  }
 
   // Look for intrinsic functions and CallInst that need to be upgraded
   for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; )
@@ -319,9 +306,6 @@ bool LLParser::ParseUnnamedType() {
       ParseToken(lltok::kw_type, "expected 'type' after '='"))
     return true;
 
-  if (TypeID >= NumberedTypes.size())
-    NumberedTypes.resize(TypeID+1);
-
   Type *Result = nullptr;
   if (ParseStructDefinition(TypeLoc, "",
                             NumberedTypes[TypeID], Result)) return true;
@@ -535,37 +519,24 @@ bool LLParser::ParseMDString(MDString *&Result) {
 
 // MDNode:
 //   ::= '!' MDNodeNumber
-//
-/// This version of ParseMDNodeID returns the slot number and null in the case
-/// of a forward reference.
-bool LLParser::ParseMDNodeID(MDNode *&Result, unsigned &SlotNo) {
-  // !{ ..., !42, ... }
-  if (ParseUInt32(SlotNo)) return true;
-
-  // Check existing MDNode.
-  if (SlotNo < NumberedMetadata.size() && NumberedMetadata[SlotNo] != nullptr)
-    Result = NumberedMetadata[SlotNo];
-  else
-    Result = nullptr;
-  return false;
-}
-
 bool LLParser::ParseMDNodeID(MDNode *&Result) {
   // !{ ..., !42, ... }
   unsigned MID = 0;
-  if (ParseMDNodeID(Result, MID)) return true;
+  if (ParseUInt32(MID))
+    return true;
 
   // If not a forward reference, just return it now.
-  if (Result) return false;
+  if (NumberedMetadata.count(MID)) {
+    Result = NumberedMetadata[MID];
+    return false;
+  }
 
   // Otherwise, create MDNode forward reference.
-  MDNode *FwdNode = MDNode::getTemporary(Context, None);
-  ForwardRefMDNodes[MID] = std::make_pair(FwdNode, Lex.getLoc());
+  auto &FwdRef = ForwardRefMDNodes[MID];
+  FwdRef = std::make_pair(MDTuple::getTemporary(Context, None), Lex.getLoc());
 
-  if (NumberedMetadata.size() <= MID)
-    NumberedMetadata.resize(MID+1);
-  NumberedMetadata[MID] = FwdNode;
-  Result = FwdNode;
+  Result = FwdRef.first.get();
+  NumberedMetadata[MID].reset(Result);
   return false;
 }
 
@@ -605,37 +576,34 @@ bool LLParser::ParseStandaloneMetadata() {
   Lex.Lex();
   unsigned MetadataID = 0;
 
-  LocTy TyLoc;
-  Type *Ty = nullptr;
-  SmallVector<Value *, 16> Elts;
+  MDNode *Init;
   if (ParseUInt32(MetadataID) ||
-      ParseToken(lltok::equal, "expected '=' here") ||
-      ParseType(Ty, TyLoc) ||
-      ParseToken(lltok::exclaim, "Expected '!' here") ||
-      ParseToken(lltok::lbrace, "Expected '{' here") ||
-      ParseMDNodeVector(Elts, nullptr) ||
-      ParseToken(lltok::rbrace, "expected end of metadata node"))
+      ParseToken(lltok::equal, "expected '=' here"))
     return true;
 
-  MDNode *Init = MDNode::get(Context, Elts);
+  // Detect common error, from old metadata syntax.
+  if (Lex.getKind() == lltok::Type)
+    return TokError("unexpected type in metadata definition");
+
+  bool IsDistinct = EatIfPresent(lltok::kw_distinct);
+  if (Lex.getKind() == lltok::MetadataVar) {
+    if (ParseSpecializedMDNode(Init, IsDistinct))
+      return true;
+  } else if (ParseToken(lltok::exclaim, "Expected '!' here") ||
+             ParseMDTuple(Init, IsDistinct))
+    return true;
 
   // See if this was forward referenced, if so, handle it.
-  std::map<unsigned, std::pair<TrackingVH<MDNode>, LocTy> >::iterator
-    FI = ForwardRefMDNodes.find(MetadataID);
+  auto FI = ForwardRefMDNodes.find(MetadataID);
   if (FI != ForwardRefMDNodes.end()) {
-    MDNode *Temp = FI->second.first;
-    Temp->replaceAllUsesWith(Init);
-    MDNode::deleteTemporary(Temp);
+    FI->second.first->replaceAllUsesWith(Init);
     ForwardRefMDNodes.erase(FI);
 
     assert(NumberedMetadata[MetadataID] == Init && "Tracking VH didn't work");
   } else {
-    if (MetadataID >= NumberedMetadata.size())
-      NumberedMetadata.resize(MetadataID+1);
-
-    if (NumberedMetadata[MetadataID] != nullptr)
+    if (NumberedMetadata.count(MetadataID))
       return TokError("Metadata id is already used");
-    NumberedMetadata[MetadataID] = Init;
+    NumberedMetadata[MetadataID].reset(Init);
   }
 
   return false;
@@ -782,36 +750,39 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
       return true;
   }
 
-  if (Ty->isFunctionTy() || Ty->isLabelTy())
+  if (Ty->isFunctionTy() || !PointerType::isValidElementType(Ty))
     return Error(TyLoc, "invalid type for global variable");
 
-  GlobalVariable *GV = nullptr;
+  GlobalValue *GVal = nullptr;
 
   // See if the global was forward referenced, if so, use the global.
   if (!Name.empty()) {
-    if (GlobalValue *GVal = M->getNamedValue(Name)) {
+    GVal = M->getNamedValue(Name);
+    if (GVal) {
       if (!ForwardRefVals.erase(Name) || !isa<GlobalValue>(GVal))
         return Error(NameLoc, "redefinition of global '@" + Name + "'");
-      GV = cast<GlobalVariable>(GVal);
     }
   } else {
     std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator
       I = ForwardRefValIDs.find(NumberedVals.size());
     if (I != ForwardRefValIDs.end()) {
-      GV = cast<GlobalVariable>(I->second.first);
+      GVal = I->second.first;
       ForwardRefValIDs.erase(I);
     }
   }
 
-  if (!GV) {
+  GlobalVariable *GV;
+  if (!GVal) {
     GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, nullptr,
                             Name, nullptr, GlobalVariable::NotThreadLocal,
                             AddrSpace);
   } else {
-    if (GV->getType()->getElementType() != Ty)
+    if (GVal->getType()->getElementType() != Ty)
       return Error(TyLoc,
             "forward reference and definition of global have different types");
 
+    GV = cast<GlobalVariable>(GVal);
+
     // Move the forward-reference to the correct spot in the module.
     M->getGlobalList().splice(M->global_end(), M->getGlobalList(), GV);
   }
@@ -845,7 +816,7 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
       GV->setAlignment(Alignment);
     } else {
       Comdat *C;
-      if (parseOptionalComdat(C))
+      if (parseOptionalComdat(Name, C))
         return true;
       if (C)
         GV->setComdat(C);
@@ -864,7 +835,9 @@ bool LLParser::ParseUnnamedAttrGrp() {
   LocTy AttrGrpLoc = Lex.getLoc();
   Lex.Lex();
 
-  assert(Lex.getKind() == lltok::AttrGrpID);
+  if (Lex.getKind() != lltok::AttrGrpID)
+    return TokError("expected attribute group id");
+
   unsigned VarID = Lex.getUIntVal();
   std::vector<unsigned> unused;
   LocTy BuiltinLoc;
@@ -1443,7 +1416,7 @@ bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
 ///   ::= /*empty*/
 ///   ::= 'ccc'
 ///   ::= 'fastcc'
-///   ::= 'kw_intel_ocl_bicc'
+///   ::= 'intel_ocl_bicc'
 ///   ::= 'coldcc'
 ///   ::= 'x86_stdcallcc'
 ///   ::= 'x86_fastcallcc'
@@ -1463,6 +1436,7 @@ bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
 ///   ::= 'anyregcc'
 ///   ::= 'preserve_mostcc'
 ///   ::= 'preserve_allcc'
+///   ::= 'ghccc'
 ///   ::= 'cc' UINT
 ///
 bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
@@ -1490,6 +1464,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
   case lltok::kw_anyregcc:       CC = CallingConv::AnyReg; break;
   case lltok::kw_preserve_mostcc:CC = CallingConv::PreserveMost; break;
   case lltok::kw_preserve_allcc: CC = CallingConv::PreserveAll; break;
+  case lltok::kw_ghccc:          CC = CallingConv::GHC; break;
   case lltok::kw_cc: {
       Lex.Lex();
       return ParseUInt32(CC);
@@ -1512,36 +1487,11 @@ bool LLParser::ParseInstructionMetadata(Instruction *Inst,
     unsigned MDK = M->getMDKindID(Name);
     Lex.Lex();
 
-    MDNode *Node;
-    SMLoc Loc = Lex.getLoc();
-
-    if (ParseToken(lltok::exclaim, "expected '!' here"))
+    MDNode *N;
+    if (ParseMDNode(N))
       return true;
 
-    // This code is similar to that of ParseMetadataValue, however it needs to
-    // have special-case code for a forward reference; see the comments on
-    // ForwardRefInstMetadata for details. Also, MDStrings are not supported
-    // at the top level here.
-    if (Lex.getKind() == lltok::lbrace) {
-      ValID ID;
-      if (ParseMetadataListValue(ID, PFS))
-        return true;
-      assert(ID.Kind == ValID::t_MDNode);
-      Inst->setMetadata(MDK, ID.MDNodeVal);
-    } else {
-      unsigned NodeID = 0;
-      if (ParseMDNodeID(Node, NodeID))
-        return true;
-      if (Node) {
-        // If we got the node, add it to the instruction.
-        Inst->setMetadata(MDK, Node);
-      } else {
-        MDRef R = { Loc, MDK, NodeID };
-        // Otherwise, remember that this should be resolved later.
-        ForwardRefInstMetadata[Inst].push_back(R);
-      }
-    }
-
+    Inst->setMetadata(MDK, N);
     if (MDK == LLVMContext::MD_tbaa)
       InstsWithTBAATag.push_back(Inst);
 
@@ -1684,6 +1634,7 @@ bool LLParser::ParseIndexList(SmallVectorImpl<unsigned> &Indices,
 
   while (EatIfPresent(lltok::comma)) {
     if (Lex.getKind() == lltok::MetadataVar) {
+      if (Indices.empty()) return TokError("expected index");
       AteExtraComma = true;
       return false;
     }
@@ -1700,11 +1651,11 @@ bool LLParser::ParseIndexList(SmallVectorImpl<unsigned> &Indices,
 //===----------------------------------------------------------------------===//
 
 /// ParseType - Parse a type.
-bool LLParser::ParseType(Type *&Result, bool AllowVoid) {
+bool LLParser::ParseType(Type *&Result, const Twine &Msg, bool AllowVoid) {
   SMLoc TypeLoc = Lex.getLoc();
   switch (Lex.getKind()) {
   default:
-    return TokError("expected type");
+    return TokError(Msg);
   case lltok::Type:
     // Type ::= 'float' | 'void' (etc)
     Result = Lex.getTyVal();
@@ -1748,8 +1699,6 @@ bool LLParser::ParseType(Type *&Result, bool AllowVoid) {
 
   case lltok::LocalVarID: {
     // Type ::= %4
-    if (Lex.getUIntVal() >= NumberedTypes.size())
-      NumberedTypes.resize(Lex.getUIntVal()+1);
     std::pair<Type*, LocTy> &Entry = NumberedTypes[Lex.getUIntVal()];
 
     // If the type hasn't been defined yet, create a forward definition and
@@ -1848,9 +1797,14 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
     if (ParseType(ArgTy, ArgLoc))
       return true;
 
-    // Otherwise, handle normal operands.
-    if (ParseOptionalParamAttrs(ArgAttrs) || ParseValue(ArgTy, V, PFS))
-      return true;
+    if (ArgTy->isMetadataTy()) {
+      if (ParseMetadataAsValue(V, PFS))
+        return true;
+    } else {
+      // Otherwise, handle normal operands.
+      if (ParseOptionalParamAttrs(ArgAttrs) || ParseValue(ArgTy, V, PFS))
+        return true;
+    }
     ArgList.push_back(ParamInfo(ArgLoc, V, AttributeSet::get(V->getContext(),
                                                              AttrIndex++,
                                                              ArgAttrs)));
@@ -2383,8 +2337,6 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     ID.StrVal = Lex.getStrVal();
     ID.Kind = ValID::t_LocalName;
     break;
-  case lltok::exclaim:   // !42, !{...}, or !"foo"
-    return ParseMetadataValue(ID, PFS);
   case lltok::APSInt:
     ID.APSIntVal = Lex.getAPSIntVal();
     ID.Kind = ValID::t_APSInt;
@@ -2657,8 +2609,15 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
       return true;
     if (!Val0->getType()->isAggregateType())
       return Error(ID.Loc, "insertvalue operand must be aggregate type");
-    if (!ExtractValueInst::getIndexedType(Val0->getType(), Indices))
+    Type *IndexedType =
+        ExtractValueInst::getIndexedType(Val0->getType(), Indices);
+    if (!IndexedType)
       return Error(ID.Loc, "invalid indices for insertvalue");
+    if (IndexedType != Val1->getType())
+      return Error(ID.Loc, "insertvalue operand and field disagree in type: '" +
+                               getTypeString(Val1->getType()) +
+                               "' instead of '" + getTypeString(IndexedType) +
+                               "'");
     ID.ConstantVal = ConstantExpr::getInsertValue(Val0, Val1, Indices);
     ID.Kind = ValID::t_Constant;
     return false;
@@ -2824,11 +2783,33 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     if (Opc == Instruction::GetElementPtr) {
       if (Elts.size() == 0 ||
           !Elts[0]->getType()->getScalarType()->isPointerTy())
-        return Error(ID.Loc, "getelementptr requires pointer operand");
+        return Error(ID.Loc, "base of getelementptr must be a pointer");
+
+      Type *BaseType = Elts[0]->getType();
+      auto *BasePointerType = cast<PointerType>(BaseType->getScalarType());
 
       ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
+      for (Constant *Val : Indices) {
+        Type *ValTy = Val->getType();
+        if (!ValTy->getScalarType()->isIntegerTy())
+          return Error(ID.Loc, "getelementptr index must be an integer");
+        if (ValTy->isVectorTy() != BaseType->isVectorTy())
+          return Error(ID.Loc, "getelementptr index type missmatch");
+        if (ValTy->isVectorTy()) {
+          unsigned ValNumEl = cast<VectorType>(ValTy)->getNumElements();
+          unsigned PtrNumEl = cast<VectorType>(BaseType)->getNumElements();
+          if (ValNumEl != PtrNumEl)
+            return Error(
+                ID.Loc,
+                "getelementptr vector index has a wrong number of elements");
+        }
+      }
+
+      if (!Indices.empty() && !BasePointerType->getElementType()->isSized())
+        return Error(ID.Loc, "base element of getelementptr must be sized");
+
       if (!GetElementPtrInst::getIndexedType(Elts[0]->getType(), Indices))
-        return Error(ID.Loc, "invalid indices for getelementptr");
+        return Error(ID.Loc, "invalid getelementptr indices");
       ID.ConstantVal = ConstantExpr::getGetElementPtr(Elts[0], Indices,
                                                       InBounds);
     } else if (Opc == Instruction::Select) {
@@ -2888,16 +2869,26 @@ bool LLParser::ParseGlobalTypeAndValue(Constant *&V) {
          ParseGlobalValue(Ty, V);
 }
 
-bool LLParser::parseOptionalComdat(Comdat *&C) {
+bool LLParser::parseOptionalComdat(StringRef GlobalName, Comdat *&C) {
   C = nullptr;
+
+  LocTy KwLoc = Lex.getLoc();
   if (!EatIfPresent(lltok::kw_comdat))
     return false;
-  if (Lex.getKind() != lltok::ComdatVar)
-    return TokError("expected comdat variable");
-  LocTy Loc = Lex.getLoc();
-  StringRef Name = Lex.getStrVal();
-  C = getComdat(Name, Loc);
-  Lex.Lex();
+
+  if (EatIfPresent(lltok::lparen)) {
+    if (Lex.getKind() != lltok::ComdatVar)
+      return TokError("expected comdat variable");
+    C = getComdat(Lex.getStrVal(), Lex.getLoc());
+    Lex.Lex();
+    if (ParseToken(lltok::rparen, "expected ')' after comdat var"))
+      return true;
+  } else {
+    if (GlobalName.empty())
+      return TokError("comdat cannot be unnamed");
+    C = getComdat(GlobalName, KwLoc);
+  }
+
   return false;
 }
 
@@ -2924,45 +2915,921 @@ bool LLParser::ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts) {
   return false;
 }
 
-bool LLParser::ParseMetadataListValue(ValID &ID, PerFunctionState *PFS) {
-  assert(Lex.getKind() == lltok::lbrace);
-  Lex.Lex();
-
-  SmallVector<Value*, 16> Elts;
-  if (ParseMDNodeVector(Elts, PFS) ||
-      ParseToken(lltok::rbrace, "expected end of metadata node"))
+bool LLParser::ParseMDTuple(MDNode *&MD, bool IsDistinct) {
+  SmallVector<Metadata *, 16> Elts;
+  if (ParseMDNodeVector(Elts))
     return true;
 
-  ID.MDNodeVal = MDNode::get(Context, Elts);
-  ID.Kind = ValID::t_MDNode;
+  MD = (IsDistinct ? MDTuple::getDistinct : MDTuple::get)(Context, Elts);
   return false;
 }
 
-/// ParseMetadataValue
-///  ::= !42
-///  ::= !{...}
-///  ::= !"string"
-bool LLParser::ParseMetadataValue(ValID &ID, PerFunctionState *PFS) {
-  assert(Lex.getKind() == lltok::exclaim);
-  Lex.Lex();
+/// MDNode:
+///  ::= !{ ... }
+///  ::= !7
+///  ::= !MDLocation(...)
+bool LLParser::ParseMDNode(MDNode *&N) {
+  if (Lex.getKind() == lltok::MetadataVar)
+    return ParseSpecializedMDNode(N);
 
-  // MDNode:
+  return ParseToken(lltok::exclaim, "expected '!' here") ||
+         ParseMDNodeTail(N);
+}
+
+bool LLParser::ParseMDNodeTail(MDNode *&N) {
   // !{ ... }
   if (Lex.getKind() == lltok::lbrace)
-    return ParseMetadataListValue(ID, PFS);
+    return ParseMDTuple(N);
 
-  // Standalone metadata reference
   // !42
-  if (Lex.getKind() == lltok::APSInt) {
-    if (ParseMDNodeID(ID.MDNodeVal)) return true;
-    ID.Kind = ValID::t_MDNode;
+  return ParseMDNodeID(N);
+}
+
+namespace {
+
+/// Structure to represent an optional metadata field.
+template <class FieldTy> struct MDFieldImpl {
+  typedef MDFieldImpl ImplTy;
+  FieldTy Val;
+  bool Seen;
+
+  void assign(FieldTy Val) {
+    Seen = true;
+    this->Val = std::move(Val);
+  }
+
+  explicit MDFieldImpl(FieldTy Default)
+      : Val(std::move(Default)), Seen(false) {}
+};
+
+struct MDUnsignedField : public MDFieldImpl<uint64_t> {
+  uint64_t Max;
+
+  MDUnsignedField(uint64_t Default = 0, uint64_t Max = UINT64_MAX)
+      : ImplTy(Default), Max(Max) {}
+};
+struct LineField : public MDUnsignedField {
+  LineField() : MDUnsignedField(0, UINT32_MAX) {}
+};
+struct ColumnField : public MDUnsignedField {
+  ColumnField() : MDUnsignedField(0, UINT16_MAX) {}
+};
+struct DwarfTagField : public MDUnsignedField {
+  DwarfTagField() : MDUnsignedField(0, dwarf::DW_TAG_hi_user) {}
+};
+struct DwarfAttEncodingField : public MDUnsignedField {
+  DwarfAttEncodingField() : MDUnsignedField(0, dwarf::DW_ATE_hi_user) {}
+};
+struct DwarfVirtualityField : public MDUnsignedField {
+  DwarfVirtualityField() : MDUnsignedField(0, dwarf::DW_VIRTUALITY_max) {}
+};
+struct DwarfLangField : public MDUnsignedField {
+  DwarfLangField() : MDUnsignedField(0, dwarf::DW_LANG_hi_user) {}
+};
+
+struct DIFlagField : public MDUnsignedField {
+  DIFlagField() : MDUnsignedField(0, UINT32_MAX) {}
+};
+
+struct MDSignedField : public MDFieldImpl<int64_t> {
+  int64_t Min;
+  int64_t Max;
+
+  MDSignedField(int64_t Default = 0)
+      : ImplTy(Default), Min(INT64_MIN), Max(INT64_MAX) {}
+  MDSignedField(int64_t Default, int64_t Min, int64_t Max)
+      : ImplTy(Default), Min(Min), Max(Max) {}
+};
+
+struct MDBoolField : public MDFieldImpl<bool> {
+  MDBoolField(bool Default = false) : ImplTy(Default) {}
+};
+struct MDField : public MDFieldImpl<Metadata *> {
+  MDField() : ImplTy(nullptr) {}
+};
+struct MDConstant : public MDFieldImpl<ConstantAsMetadata *> {
+  MDConstant() : ImplTy(nullptr) {}
+};
+struct MDStringField : public MDFieldImpl<std::string> {
+  MDStringField() : ImplTy(std::string()) {}
+};
+struct MDFieldList : public MDFieldImpl<SmallVector<Metadata *, 4>> {
+  MDFieldList() : ImplTy(SmallVector<Metadata *, 4>()) {}
+};
+
+} // end namespace
+
+namespace llvm {
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
+                            MDUnsignedField &Result) {
+  if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned())
+    return TokError("expected unsigned integer");
+
+  auto &U = Lex.getAPSIntVal();
+  if (U.ugt(Result.Max))
+    return TokError("value for '" + Name + "' too large, limit is " +
+                    Twine(Result.Max));
+  Result.assign(U.getZExtValue());
+  assert(Result.Val <= Result.Max && "Expected value in range");
+  Lex.Lex();
+  return false;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, LineField &Result) {
+  return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result));
+}
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, ColumnField &Result) {
+  return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result));
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfTagField &Result) {
+  if (Lex.getKind() == lltok::APSInt)
+    return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result));
+
+  if (Lex.getKind() != lltok::DwarfTag)
+    return TokError("expected DWARF tag");
+
+  unsigned Tag = dwarf::getTag(Lex.getStrVal());
+  if (Tag == dwarf::DW_TAG_invalid)
+    return TokError("invalid DWARF tag" + Twine(" '") + Lex.getStrVal() + "'");
+  assert(Tag <= Result.Max && "Expected valid DWARF tag");
+
+  Result.assign(Tag);
+  Lex.Lex();
+  return false;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
+                            DwarfVirtualityField &Result) {
+  if (Lex.getKind() == lltok::APSInt)
+    return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result));
+
+  if (Lex.getKind() != lltok::DwarfVirtuality)
+    return TokError("expected DWARF virtuality code");
+
+  unsigned Virtuality = dwarf::getVirtuality(Lex.getStrVal());
+  if (!Virtuality)
+    return TokError("invalid DWARF virtuality code" + Twine(" '") +
+                    Lex.getStrVal() + "'");
+  assert(Virtuality <= Result.Max && "Expected valid DWARF virtuality code");
+  Result.assign(Virtuality);
+  Lex.Lex();
+  return false;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfLangField &Result) {
+  if (Lex.getKind() == lltok::APSInt)
+    return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result));
+
+  if (Lex.getKind() != lltok::DwarfLang)
+    return TokError("expected DWARF language");
+
+  unsigned Lang = dwarf::getLanguage(Lex.getStrVal());
+  if (!Lang)
+    return TokError("invalid DWARF language" + Twine(" '") + Lex.getStrVal() +
+                    "'");
+  assert(Lang <= Result.Max && "Expected valid DWARF language");
+  Result.assign(Lang);
+  Lex.Lex();
+  return false;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
+                            DwarfAttEncodingField &Result) {
+  if (Lex.getKind() == lltok::APSInt)
+    return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result));
+
+  if (Lex.getKind() != lltok::DwarfAttEncoding)
+    return TokError("expected DWARF type attribute encoding");
+
+  unsigned Encoding = dwarf::getAttributeEncoding(Lex.getStrVal());
+  if (!Encoding)
+    return TokError("invalid DWARF type attribute encoding" + Twine(" '") +
+                    Lex.getStrVal() + "'");
+  assert(Encoding <= Result.Max && "Expected valid DWARF language");
+  Result.assign(Encoding);
+  Lex.Lex();
+  return false;
+}
+
+/// DIFlagField
+///  ::= uint32
+///  ::= DIFlagVector
+///  ::= DIFlagVector '|' DIFlagFwdDecl '|' uint32 '|' DIFlagPublic
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DIFlagField &Result) {
+  assert(Result.Max == UINT32_MAX && "Expected only 32-bits");
+
+  // Parser for a single flag.
+  auto parseFlag = [&](unsigned &Val) {
+    if (Lex.getKind() == lltok::APSInt && !Lex.getAPSIntVal().isSigned())
+      return ParseUInt32(Val);
+
+    if (Lex.getKind() != lltok::DIFlag)
+      return TokError("expected debug info flag");
+
+    Val = DIDescriptor::getFlag(Lex.getStrVal());
+    if (!Val)
+      return TokError(Twine("invalid debug info flag flag '") +
+                      Lex.getStrVal() + "'");
+    Lex.Lex();
+    return false;
+  };
+
+  // Parse the flags and combine them together.
+  unsigned Combined = 0;
+  do {
+    unsigned Val;
+    if (parseFlag(Val))
+      return true;
+    Combined |= Val;
+  } while (EatIfPresent(lltok::bar));
+
+  Result.assign(Combined);
+  return false;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
+                            MDSignedField &Result) {
+  if (Lex.getKind() != lltok::APSInt)
+    return TokError("expected signed integer");
+
+  auto &S = Lex.getAPSIntVal();
+  if (S < Result.Min)
+    return TokError("value for '" + Name + "' too small, limit is " +
+                    Twine(Result.Min));
+  if (S > Result.Max)
+    return TokError("value for '" + Name + "' too large, limit is " +
+                    Twine(Result.Max));
+  Result.assign(S.getExtValue());
+  assert(Result.Val >= Result.Min && "Expected value in range");
+  assert(Result.Val <= Result.Max && "Expected value in range");
+  Lex.Lex();
+  return false;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDBoolField &Result) {
+  switch (Lex.getKind()) {
+  default:
+    return TokError("expected 'true' or 'false'");
+  case lltok::kw_true:
+    Result.assign(true);
+    break;
+  case lltok::kw_false:
+    Result.assign(false);
+    break;
+  }
+  Lex.Lex();
+  return false;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDField &Result) {
+  if (Lex.getKind() == lltok::kw_null) {
+    Lex.Lex();
+    Result.assign(nullptr);
     return false;
   }
 
+  Metadata *MD;
+  if (ParseMetadata(MD, nullptr))
+    return true;
+
+  Result.assign(MD);
+  return false;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDConstant &Result) {
+  Metadata *MD;
+  if (ParseValueAsMetadata(MD, "expected constant", nullptr))
+    return true;
+
+  Result.assign(cast<ConstantAsMetadata>(MD));
+  return false;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDStringField &Result) {
+  std::string S;
+  if (ParseStringConstant(S))
+    return true;
+
+  Result.assign(std::move(S));
+  return false;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDFieldList &Result) {
+  SmallVector<Metadata *, 4> MDs;
+  if (ParseMDNodeVector(MDs))
+    return true;
+
+  Result.assign(std::move(MDs));
+  return false;
+}
+
+} // end namespace llvm
+
+template <class ParserTy>
+bool LLParser::ParseMDFieldsImplBody(ParserTy parseField) {
+  do {
+    if (Lex.getKind() != lltok::LabelStr)
+      return TokError("expected field label here");
+
+    if (parseField())
+      return true;
+  } while (EatIfPresent(lltok::comma));
+
+  return false;
+}
+
+template <class ParserTy>
+bool LLParser::ParseMDFieldsImpl(ParserTy parseField, LocTy &ClosingLoc) {
+  assert(Lex.getKind() == lltok::MetadataVar && "Expected metadata type name");
+  Lex.Lex();
+
+  if (ParseToken(lltok::lparen, "expected '(' here"))
+    return true;
+  if (Lex.getKind() != lltok::rparen)
+    if (ParseMDFieldsImplBody(parseField))
+      return true;
+
+  ClosingLoc = Lex.getLoc();
+  return ParseToken(lltok::rparen, "expected ')' here");
+}
+
+template <class FieldTy>
+bool LLParser::ParseMDField(StringRef Name, FieldTy &Result) {
+  if (Result.Seen)
+    return TokError("field '" + Name + "' cannot be specified more than once");
+
+  LocTy Loc = Lex.getLoc();
+  Lex.Lex();
+  return ParseMDField(Loc, Name, Result);
+}
+
+bool LLParser::ParseSpecializedMDNode(MDNode *&N, bool IsDistinct) {
+  assert(Lex.getKind() == lltok::MetadataVar && "Expected metadata type name");
+
+#define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS)                                  \
+  if (Lex.getStrVal() == #CLASS)                                               \
+    return Parse##CLASS(N, IsDistinct);
+#include "llvm/IR/Metadata.def"
+
+  return TokError("expected metadata type");
+}
+
+#define DECLARE_FIELD(NAME, TYPE, INIT) TYPE NAME INIT
+#define NOP_FIELD(NAME, TYPE, INIT)
+#define REQUIRE_FIELD(NAME, TYPE, INIT)                                        \
+  if (!NAME.Seen)                                                              \
+    return Error(ClosingLoc, "missing required field '" #NAME "'");
+#define PARSE_MD_FIELD(NAME, TYPE, DEFAULT)                                    \
+  if (Lex.getStrVal() == #NAME)                                                \
+    return ParseMDField(#NAME, NAME);
+#define PARSE_MD_FIELDS()                                                      \
+  VISIT_MD_FIELDS(DECLARE_FIELD, DECLARE_FIELD)                                \
+  do {                                                                         \
+    LocTy ClosingLoc;                                                          \
+    if (ParseMDFieldsImpl([&]() -> bool {                                      \
+      VISIT_MD_FIELDS(PARSE_MD_FIELD, PARSE_MD_FIELD)                          \
+      return TokError(Twine("invalid field '") + Lex.getStrVal() + "'");       \
+    }, ClosingLoc))                                                            \
+      return true;                                                             \
+    VISIT_MD_FIELDS(NOP_FIELD, REQUIRE_FIELD)                                  \
+  } while (false)
+#define GET_OR_DISTINCT(CLASS, ARGS)                                           \
+  (IsDistinct ? CLASS::getDistinct ARGS : CLASS::get ARGS)
+
+/// ParseMDLocationFields:
+///   ::= !MDLocation(line: 43, column: 8, scope: !5, inlinedAt: !6)
+bool LLParser::ParseMDLocation(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  OPTIONAL(line, LineField, );                                                 \
+  OPTIONAL(column, ColumnField, );                                             \
+  REQUIRED(scope, MDField, );                                                  \
+  OPTIONAL(inlinedAt, MDField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  auto get = (IsDistinct ? MDLocation::getDistinct : MDLocation::get);
+  Result = get(Context, line.Val, column.Val, scope.Val, inlinedAt.Val);
+  return false;
+}
+
+/// ParseGenericDebugNode:
+///   ::= !GenericDebugNode(tag: 15, header: "...", operands: {...})
+bool LLParser::ParseGenericDebugNode(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(tag, DwarfTagField, );                                              \
+  OPTIONAL(header, MDStringField, );                                           \
+  OPTIONAL(operands, MDFieldList, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(GenericDebugNode,
+                           (Context, tag.Val, header.Val, operands.Val));
+  return false;
+}
+
+/// ParseMDSubrange:
+///   ::= !MDSubrange(count: 30, lowerBound: 2)
+bool LLParser::ParseMDSubrange(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(count, MDSignedField, (-1, -1, INT64_MAX));                         \
+  OPTIONAL(lowerBound, MDSignedField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDSubrange, (Context, count.Val, lowerBound.Val));
+  return false;
+}
+
+/// ParseMDEnumerator:
+///   ::= !MDEnumerator(value: 30, name: "SomeKind")
+bool LLParser::ParseMDEnumerator(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(name, MDStringField, );                                             \
+  REQUIRED(value, MDSignedField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDEnumerator, (Context, value.Val, name.Val));
+  return false;
+}
+
+/// ParseMDBasicType:
+///   ::= !MDBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32)
+bool LLParser::ParseMDBasicType(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(tag, DwarfTagField, );                                              \
+  OPTIONAL(name, MDStringField, );                                             \
+  OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX));                            \
+  OPTIONAL(align, MDUnsignedField, (0, UINT64_MAX));                           \
+  OPTIONAL(encoding, DwarfAttEncodingField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDBasicType, (Context, tag.Val, name.Val, size.Val,
+                                         align.Val, encoding.Val));
+  return false;
+}
+
+/// ParseMDDerivedType:
+///   ::= !MDDerivedType(tag: DW_TAG_pointer_type, name: "int", file: !0,
+///                      line: 7, scope: !1, baseType: !2, size: 32,
+///                      align: 32, offset: 0, flags: 0, extraData: !3)
+bool LLParser::ParseMDDerivedType(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(tag, DwarfTagField, );                                              \
+  OPTIONAL(name, MDStringField, );                                             \
+  OPTIONAL(file, MDField, );                                                   \
+  OPTIONAL(line, LineField, );                                                 \
+  OPTIONAL(scope, MDField, );                                                  \
+  REQUIRED(baseType, MDField, );                                               \
+  OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX));                            \
+  OPTIONAL(align, MDUnsignedField, (0, UINT64_MAX));                           \
+  OPTIONAL(offset, MDUnsignedField, (0, UINT64_MAX));                          \
+  OPTIONAL(flags, DIFlagField, );                                              \
+  OPTIONAL(extraData, MDField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDDerivedType,
+                           (Context, tag.Val, name.Val, file.Val, line.Val,
+                            scope.Val, baseType.Val, size.Val, align.Val,
+                            offset.Val, flags.Val, extraData.Val));
+  return false;
+}
+
+bool LLParser::ParseMDCompositeType(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(tag, DwarfTagField, );                                              \
+  OPTIONAL(name, MDStringField, );                                             \
+  OPTIONAL(file, MDField, );                                                   \
+  OPTIONAL(line, LineField, );                                                 \
+  OPTIONAL(scope, MDField, );                                                  \
+  OPTIONAL(baseType, MDField, );                                               \
+  OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX));                            \
+  OPTIONAL(align, MDUnsignedField, (0, UINT64_MAX));                           \
+  OPTIONAL(offset, MDUnsignedField, (0, UINT64_MAX));                          \
+  OPTIONAL(flags, DIFlagField, );                                              \
+  OPTIONAL(elements, MDField, );                                               \
+  OPTIONAL(runtimeLang, DwarfLangField, );                                     \
+  OPTIONAL(vtableHolder, MDField, );                                           \
+  OPTIONAL(templateParams, MDField, );                                         \
+  OPTIONAL(identifier, MDStringField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(
+      MDCompositeType,
+      (Context, tag.Val, name.Val, file.Val, line.Val, scope.Val, baseType.Val,
+       size.Val, align.Val, offset.Val, flags.Val, elements.Val,
+       runtimeLang.Val, vtableHolder.Val, templateParams.Val, identifier.Val));
+  return false;
+}
+
+bool LLParser::ParseMDSubroutineType(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  OPTIONAL(flags, DIFlagField, );                                              \
+  REQUIRED(types, MDField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDSubroutineType, (Context, flags.Val, types.Val));
+  return false;
+}
+
+/// ParseMDFileType:
+///   ::= !MDFileType(filename: "path/to/file", directory: "/path/to/dir")
+bool LLParser::ParseMDFile(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(filename, MDStringField, );                                         \
+  REQUIRED(directory, MDStringField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDFile, (Context, filename.Val, directory.Val));
+  return false;
+}
+
+/// ParseMDCompileUnit:
+///   ::= !MDCompileUnit(language: DW_LANG_C99, file: !0, producer: "clang",
+///                      isOptimized: true, flags: "-O2", runtimeVersion: 1,
+///                      splitDebugFilename: "abc.debug", emissionKind: 1,
+///                      enums: !1, retainedTypes: !2, subprograms: !3,
+///                      globals: !4, imports: !5)
+bool LLParser::ParseMDCompileUnit(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(language, DwarfLangField, );                                        \
+  REQUIRED(file, MDField, );                                                   \
+  OPTIONAL(producer, MDStringField, );                                         \
+  OPTIONAL(isOptimized, MDBoolField, );                                        \
+  OPTIONAL(flags, MDStringField, );                                            \
+  OPTIONAL(runtimeVersion, MDUnsignedField, (0, UINT32_MAX));                  \
+  OPTIONAL(splitDebugFilename, MDStringField, );                               \
+  OPTIONAL(emissionKind, MDUnsignedField, (0, UINT32_MAX));                    \
+  OPTIONAL(enums, MDField, );                                                  \
+  OPTIONAL(retainedTypes, MDField, );                                          \
+  OPTIONAL(subprograms, MDField, );                                            \
+  OPTIONAL(globals, MDField, );                                                \
+  OPTIONAL(imports, MDField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDCompileUnit,
+                           (Context, language.Val, file.Val, producer.Val,
+                            isOptimized.Val, flags.Val, runtimeVersion.Val,
+                            splitDebugFilename.Val, emissionKind.Val, enums.Val,
+                            retainedTypes.Val, subprograms.Val, globals.Val,
+                            imports.Val));
+  return false;
+}
+
+/// ParseMDSubprogram:
+///   ::= !MDSubprogram(scope: !0, name: "foo", linkageName: "_Zfoo",
+///                     file: !1, line: 7, type: !2, isLocal: false,
+///                     isDefinition: true, scopeLine: 8, containingType: !3,
+///                     virtuality: DW_VIRTUALTIY_pure_virtual,
+///                     virtualIndex: 10, flags: 11,
+///                     isOptimized: false, function: void ()* @_Z3foov,
+///                     templateParams: !4, declaration: !5, variables: !6)
+bool LLParser::ParseMDSubprogram(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  OPTIONAL(scope, MDField, );                                                  \
+  REQUIRED(name, MDStringField, );                                             \
+  OPTIONAL(linkageName, MDStringField, );                                      \
+  OPTIONAL(file, MDField, );                                                   \
+  OPTIONAL(line, LineField, );                                                 \
+  OPTIONAL(type, MDField, );                                                   \
+  OPTIONAL(isLocal, MDBoolField, );                                            \
+  OPTIONAL(isDefinition, MDBoolField, (true));                                 \
+  OPTIONAL(scopeLine, LineField, );                                            \
+  OPTIONAL(containingType, MDField, );                                         \
+  OPTIONAL(virtuality, DwarfVirtualityField, );                                \
+  OPTIONAL(virtualIndex, MDUnsignedField, (0, UINT32_MAX));                    \
+  OPTIONAL(flags, DIFlagField, );                                              \
+  OPTIONAL(isOptimized, MDBoolField, );                                        \
+  OPTIONAL(function, MDConstant, );                                            \
+  OPTIONAL(templateParams, MDField, );                                         \
+  OPTIONAL(declaration, MDField, );                                            \
+  OPTIONAL(variables, MDField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(
+      MDSubprogram, (Context, scope.Val, name.Val, linkageName.Val, file.Val,
+                     line.Val, type.Val, isLocal.Val, isDefinition.Val,
+                     scopeLine.Val, containingType.Val, virtuality.Val,
+                     virtualIndex.Val, flags.Val, isOptimized.Val, function.Val,
+                     templateParams.Val, declaration.Val, variables.Val));
+  return false;
+}
+
+/// ParseMDLexicalBlock:
+///   ::= !MDLexicalBlock(scope: !0, file: !2, line: 7, column: 9)
+bool LLParser::ParseMDLexicalBlock(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(scope, MDField, );                                                  \
+  OPTIONAL(file, MDField, );                                                   \
+  OPTIONAL(line, LineField, );                                                 \
+  OPTIONAL(column, ColumnField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(
+      MDLexicalBlock, (Context, scope.Val, file.Val, line.Val, column.Val));
+  return false;
+}
+
+/// ParseMDLexicalBlockFile:
+///   ::= !MDLexicalBlockFile(scope: !0, file: !2, discriminator: 9)
+bool LLParser::ParseMDLexicalBlockFile(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(scope, MDField, );                                                  \
+  OPTIONAL(file, MDField, );                                                   \
+  REQUIRED(discriminator, MDUnsignedField, (0, UINT32_MAX));
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDLexicalBlockFile,
+                           (Context, scope.Val, file.Val, discriminator.Val));
+  return false;
+}
+
+/// ParseMDNamespace:
+///   ::= !MDNamespace(scope: !0, file: !2, name: "SomeNamespace", line: 9)
+bool LLParser::ParseMDNamespace(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(scope, MDField, );                                                  \
+  OPTIONAL(file, MDField, );                                                   \
+  OPTIONAL(name, MDStringField, );                                             \
+  OPTIONAL(line, LineField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDNamespace,
+                           (Context, scope.Val, file.Val, name.Val, line.Val));
+  return false;
+}
+
+/// ParseMDTemplateTypeParameter:
+///   ::= !MDTemplateTypeParameter(name: "Ty", type: !1)
+bool LLParser::ParseMDTemplateTypeParameter(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  OPTIONAL(name, MDStringField, );                                             \
+  REQUIRED(type, MDField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result =
+      GET_OR_DISTINCT(MDTemplateTypeParameter, (Context, name.Val, type.Val));
+  return false;
+}
+
+/// ParseMDTemplateValueParameter:
+///   ::= !MDTemplateValueParameter(tag: DW_TAG_template_value_parameter,
+///                                 name: "V", type: !1, value: i32 7)
+bool LLParser::ParseMDTemplateValueParameter(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(tag, DwarfTagField, );                                              \
+  OPTIONAL(name, MDStringField, );                                             \
+  REQUIRED(type, MDField, );                                                   \
+  REQUIRED(value, MDField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDTemplateValueParameter,
+                           (Context, tag.Val, name.Val, type.Val, value.Val));
+  return false;
+}
+
+/// ParseMDGlobalVariable:
+///   ::= !MDGlobalVariable(scope: !0, name: "foo", linkageName: "foo",
+///                         file: !1, line: 7, type: !2, isLocal: false,
+///                         isDefinition: true, variable: i32* @foo,
+///                         declaration: !3)
+bool LLParser::ParseMDGlobalVariable(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  OPTIONAL(scope, MDField, );                                                  \
+  REQUIRED(name, MDStringField, );                                             \
+  OPTIONAL(linkageName, MDStringField, );                                      \
+  OPTIONAL(file, MDField, );                                                   \
+  OPTIONAL(line, LineField, );                                                 \
+  OPTIONAL(type, MDField, );                                                   \
+  OPTIONAL(isLocal, MDBoolField, );                                            \
+  OPTIONAL(isDefinition, MDBoolField, (true));                                 \
+  OPTIONAL(variable, MDConstant, );                                            \
+  OPTIONAL(declaration, MDField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDGlobalVariable,
+                           (Context, scope.Val, name.Val, linkageName.Val,
+                            file.Val, line.Val, type.Val, isLocal.Val,
+                            isDefinition.Val, variable.Val, declaration.Val));
+  return false;
+}
+
+/// ParseMDLocalVariable:
+///   ::= !MDLocalVariable(tag: DW_TAG_arg_variable, scope: !0, name: "foo",
+///                        file: !1, line: 7, type: !2, arg: 2, flags: 7,
+///                        inlinedAt: !3)
+bool LLParser::ParseMDLocalVariable(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(tag, DwarfTagField, );                                              \
+  OPTIONAL(scope, MDField, );                                                  \
+  OPTIONAL(name, MDStringField, );                                             \
+  OPTIONAL(file, MDField, );                                                   \
+  OPTIONAL(line, LineField, );                                                 \
+  OPTIONAL(type, MDField, );                                                   \
+  OPTIONAL(arg, MDUnsignedField, (0, UINT8_MAX));                              \
+  OPTIONAL(flags, DIFlagField, );                                              \
+  OPTIONAL(inlinedAt, MDField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(
+      MDLocalVariable, (Context, tag.Val, scope.Val, name.Val, file.Val,
+                        line.Val, type.Val, arg.Val, flags.Val, inlinedAt.Val));
+  return false;
+}
+
+/// ParseMDExpression:
+///   ::= !MDExpression(0, 7, -1)
+bool LLParser::ParseMDExpression(MDNode *&Result, bool IsDistinct) {
+  assert(Lex.getKind() == lltok::MetadataVar && "Expected metadata type name");
+  Lex.Lex();
+
+  if (ParseToken(lltok::lparen, "expected '(' here"))
+    return true;
+
+  SmallVector<uint64_t, 8> Elements;
+  if (Lex.getKind() != lltok::rparen)
+    do {
+      if (Lex.getKind() == lltok::DwarfOp) {
+        if (unsigned Op = dwarf::getOperationEncoding(Lex.getStrVal())) {
+          Lex.Lex();
+          Elements.push_back(Op);
+          continue;
+        }
+        return TokError(Twine("invalid DWARF op '") + Lex.getStrVal() + "'");
+      }
+
+      if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned())
+        return TokError("expected unsigned integer");
+
+      auto &U = Lex.getAPSIntVal();
+      if (U.ugt(UINT64_MAX))
+        return TokError("element too large, limit is " + Twine(UINT64_MAX));
+      Elements.push_back(U.getZExtValue());
+      Lex.Lex();
+    } while (EatIfPresent(lltok::comma));
+
+  if (ParseToken(lltok::rparen, "expected ')' here"))
+    return true;
+
+  Result = GET_OR_DISTINCT(MDExpression, (Context, Elements));
+  return false;
+}
+
+/// ParseMDObjCProperty:
+///   ::= !MDObjCProperty(name: "foo", file: !1, line: 7, setter: "setFoo",
+///                       getter: "getFoo", attributes: 7, type: !2)
+bool LLParser::ParseMDObjCProperty(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(name, MDStringField, );                                             \
+  OPTIONAL(file, MDField, );                                                   \
+  OPTIONAL(line, LineField, );                                                 \
+  OPTIONAL(setter, MDStringField, );                                           \
+  OPTIONAL(getter, MDStringField, );                                           \
+  OPTIONAL(attributes, MDUnsignedField, (0, UINT32_MAX));                      \
+  OPTIONAL(type, MDField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDObjCProperty,
+                           (Context, name.Val, file.Val, line.Val, setter.Val,
+                            getter.Val, attributes.Val, type.Val));
+  return false;
+}
+
+/// ParseMDImportedEntity:
+///   ::= !MDImportedEntity(tag: DW_TAG_imported_module, scope: !0, entity: !1,
+///                         line: 7, name: "foo")
+bool LLParser::ParseMDImportedEntity(MDNode *&Result, bool IsDistinct) {
+#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
+  REQUIRED(tag, DwarfTagField, );                                              \
+  REQUIRED(scope, MDField, );                                                  \
+  OPTIONAL(entity, MDField, );                                                 \
+  OPTIONAL(line, LineField, );                                                 \
+  OPTIONAL(name, MDStringField, );
+  PARSE_MD_FIELDS();
+#undef VISIT_MD_FIELDS
+
+  Result = GET_OR_DISTINCT(MDImportedEntity, (Context, tag.Val, scope.Val,
+                                              entity.Val, line.Val, name.Val));
+  return false;
+}
+
+#undef PARSE_MD_FIELD
+#undef NOP_FIELD
+#undef REQUIRE_FIELD
+#undef DECLARE_FIELD
+
+/// ParseMetadataAsValue
+///  ::= metadata i32 %local
+///  ::= metadata i32 @global
+///  ::= metadata i32 7
+///  ::= metadata !0
+///  ::= metadata !{...}
+///  ::= metadata !"string"
+bool LLParser::ParseMetadataAsValue(Value *&V, PerFunctionState &PFS) {
+  // Note: the type 'metadata' has already been parsed.
+  Metadata *MD;
+  if (ParseMetadata(MD, &PFS))
+    return true;
+
+  V = MetadataAsValue::get(Context, MD);
+  return false;
+}
+
+/// ParseValueAsMetadata
+///  ::= i32 %local
+///  ::= i32 @global
+///  ::= i32 7
+bool LLParser::ParseValueAsMetadata(Metadata *&MD, const Twine &TypeMsg,
+                                    PerFunctionState *PFS) {
+  Type *Ty;
+  LocTy Loc;
+  if (ParseType(Ty, TypeMsg, Loc))
+    return true;
+  if (Ty->isMetadataTy())
+    return Error(Loc, "invalid metadata-value-metadata roundtrip");
+
+  Value *V;
+  if (ParseValue(Ty, V, PFS))
+    return true;
+
+  MD = ValueAsMetadata::get(V);
+  return false;
+}
+
+/// ParseMetadata
+///  ::= i32 %local
+///  ::= i32 @global
+///  ::= i32 7
+///  ::= !42
+///  ::= !{...}
+///  ::= !"string"
+///  ::= !MDLocation(...)
+bool LLParser::ParseMetadata(Metadata *&MD, PerFunctionState *PFS) {
+  if (Lex.getKind() == lltok::MetadataVar) {
+    MDNode *N;
+    if (ParseSpecializedMDNode(N))
+      return true;
+    MD = N;
+    return false;
+  }
+
+  // ValueAsMetadata:
+  // <type> <value>
+  if (Lex.getKind() != lltok::exclaim)
+    return ParseValueAsMetadata(MD, "expected metadata operand", PFS);
+
+  // '!'.
+  assert(Lex.getKind() == lltok::exclaim && "Expected '!' here");
+  Lex.Lex();
+
   // MDString:
   //   ::= '!' STRINGCONSTANT
-  if (ParseMDString(ID.MDStringVal)) return true;
-  ID.Kind = ValID::t_MDString;
+  if (Lex.getKind() == lltok::StringConstant) {
+    MDString *S;
+    if (ParseMDString(S))
+      return true;
+    MD = S;
+    return false;
+  }
+
+  // MDNode:
+  // !{ ... }
+  // !7
+  MDNode *N;
+  if (ParseMDNodeTail(N))
+    return true;
+  MD = N;
   return false;
 }
 
@@ -2995,16 +3862,6 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
                        (ID.UIntVal>>1)&1, (InlineAsm::AsmDialect(ID.UIntVal>>2)));
     return false;
   }
-  case ValID::t_MDNode:
-    if (!Ty->isMetadataTy())
-      return Error(ID.Loc, "metadata value must have metadata type");
-    V = ID.MDNodeVal;
-    return false;
-  case ValID::t_MDString:
-    if (!Ty->isMetadataTy())
-      return Error(ID.Loc, "metadata value must have metadata type");
-    V = ID.MDStringVal;
-    return false;
   case ValID::t_GlobalName:
     V = GetGlobalVal(ID.StrVal, Ty, ID.Loc);
     return V == nullptr;
@@ -3120,7 +3977,7 @@ bool LLParser::ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc,
 /// FunctionHeader
 ///   ::= OptionalLinkage OptionalVisibility OptionalCallingConv OptRetAttrs
 ///       OptUnnamedAddr Type GlobalName '(' ArgList ')' OptFuncAttrs OptSection
-///       OptionalAlign OptGC OptionalPrefix
+///       OptionalAlign OptGC OptionalPrefix OptionalPrologue
 bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   // Parse the linkage.
   LocTy LinkageLoc = Lex.getLoc();
@@ -3201,6 +4058,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   bool UnnamedAddr;
   LocTy UnnamedAddrLoc;
   Constant *Prefix = nullptr;
+  Constant *Prologue = nullptr;
   Comdat *C;
 
   if (ParseArgumentList(ArgList, isVarArg) ||
@@ -3210,12 +4068,14 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
                                  BuiltinLoc) ||
       (EatIfPresent(lltok::kw_section) &&
        ParseStringConstant(Section)) ||
-      parseOptionalComdat(C) ||
+      parseOptionalComdat(FunctionName, C) ||
       ParseOptionalAlignment(Alignment) ||
       (EatIfPresent(lltok::kw_gc) &&
        ParseStringConstant(GC)) ||
       (EatIfPresent(lltok::kw_prefix) &&
-       ParseGlobalTypeAndValue(Prefix)))
+       ParseGlobalTypeAndValue(Prefix)) ||
+      (EatIfPresent(lltok::kw_prologue) &&
+       ParseGlobalTypeAndValue(Prologue)))
     return true;
 
   if (FuncAttrs.contains(Attribute::Builtin))
@@ -3316,6 +4176,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   Fn->setComdat(C);
   if (!GC.empty()) Fn->setGC(GC.c_str());
   Fn->setPrefixData(Prefix);
+  Fn->setPrologueData(Prologue);
   ForwardRefAttrGroups[Fn] = FwdRefAttrGrps;
 
   // Add all of the arguments we parsed to the function.
@@ -3878,10 +4739,14 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
   if (I != E)
     return Error(CallLoc, "not enough parameters specified for call");
 
-  if (FnAttrs.hasAttributes())
+  if (FnAttrs.hasAttributes()) {
+    if (FnAttrs.hasAlignmentAttr())
+      return Error(CallLoc, "invoke instructions may not have an alignment");
+
     Attrs.push_back(AttributeSet::get(RetType->getContext(),
                                       AttributeSet::FunctionIndex,
                                       FnAttrs));
+  }
 
   // Finish off the Attribute and check them
   AttributeSet PAL = AttributeSet::get(Context, Attrs);
@@ -4291,10 +5156,14 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
   if (I != E)
     return Error(CallLoc, "not enough parameters specified for call");
 
-  if (FnAttrs.hasAttributes())
+  if (FnAttrs.hasAttributes()) {
+    if (FnAttrs.hasAlignmentAttr())
+      return Error(CallLoc, "call instructions may not have an alignment");
+
     Attrs.push_back(AttributeSet::get(RetType->getContext(),
                                       AttributeSet::FunctionIndex,
                                       FnAttrs));
+  }
 
   // Finish off the Attribute and check them
   AttributeSet PAL = AttributeSet::get(Context, Attrs);
@@ -4316,13 +5185,16 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
 ///   ::= 'alloca' 'inalloca'? Type (',' TypeAndValue)? (',' 'align' i32)?
 int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
   Value *Size = nullptr;
-  LocTy SizeLoc;
+  LocTy SizeLoc, TyLoc;
   unsigned Alignment = 0;
   Type *Ty = nullptr;
 
   bool IsInAlloca = EatIfPresent(lltok::kw_inalloca);
 
-  if (ParseType(Ty)) return true;
+  if (ParseType(Ty, TyLoc)) return true;
+
+  if (Ty->isFunctionTy() || !PointerType::isValidElementType(Ty))
+    return Error(TyLoc, "invalid type for alloca");
 
   bool AteExtraComma = false;
   if (EatIfPresent(lltok::comma)) {
@@ -4642,8 +5514,13 @@ int LLParser::ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS) {
   if (!Val0->getType()->isAggregateType())
     return Error(Loc0, "insertvalue operand must be aggregate type");
 
-  if (!ExtractValueInst::getIndexedType(Val0->getType(), Indices))
+  Type *IndexedType = ExtractValueInst::getIndexedType(Val0->getType(), Indices);
+  if (!IndexedType)
     return Error(Loc0, "invalid indices for insertvalue");
+  if (IndexedType != Val1->getType())
+    return Error(Loc1, "insertvalue operand and field disagree in type: '" +
+                           getTypeString(Val1->getType()) + "' instead of '" +
+                           getTypeString(IndexedType) + "'");
   Inst = InsertValueInst::Create(Val0, Val1, Indices);
   return AteExtraComma ? InstExtraComma : InstNormal;
 }
@@ -4653,13 +5530,15 @@ int LLParser::ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS) {
 //===----------------------------------------------------------------------===//
 
 /// ParseMDNodeVector
-///   ::= Element (',' Element)*
+///   ::= { Element (',' Element)* }
 /// Element
 ///   ::= 'null' | TypeAndValue
-bool LLParser::ParseMDNodeVector(SmallVectorImpl<Value*> &Elts,
-                                 PerFunctionState *PFS) {
+bool LLParser::ParseMDNodeVector(SmallVectorImpl<Metadata *> &Elts) {
+  if (ParseToken(lltok::lbrace, "expected '{' here"))
+    return true;
+
   // Check for an empty list.
-  if (Lex.getKind() == lltok::rbrace)
+  if (EatIfPresent(lltok::rbrace))
     return false;
 
   do {
@@ -4669,12 +5548,13 @@ bool LLParser::ParseMDNodeVector(SmallVectorImpl<Value*> &Elts,
       continue;
     }
 
-    Value *V = nullptr;
-    if (ParseTypeAndValue(V, PFS)) return true;
-    Elts.push_back(V);
+    Metadata *MD;
+    if (ParseMetadata(MD, nullptr))
+      return true;
+    Elts.push_back(MD);
   } while (EatIfPresent(lltok::comma));
 
-  return false;
+  return ParseToken(lltok::rbrace, "expected end of metadata node");
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index aa62bcc..5e92e57 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -52,8 +52,6 @@ namespace llvm {
       t_EmptyArray,               // No value:  []
       t_Constant,                 // Value in ConstantVal.
       t_InlineAsm,                // Value in StrVal/StrVal2/UIntVal.
-      t_MDNode,                   // Value in MDNodeVal.
-      t_MDString,                 // Value in MDStringVal.
       t_ConstantStruct,           // Value in ConstantStructElts.
       t_PackedConstantStruct      // Value in ConstantStructElts.
     } Kind;
@@ -64,8 +62,6 @@ namespace llvm {
     APSInt APSIntVal;
     APFloat APFloatVal;
     Constant *ConstantVal;
-    MDNode *MDNodeVal;
-    MDString *MDStringVal;
     Constant **ConstantStructElts;
 
     ValID() : Kind(t_LocalID), APFloatVal(0.0) {}
@@ -106,17 +102,16 @@ namespace llvm {
       SMLoc Loc;
       unsigned MDKind, MDSlot;
     };
-    DenseMap<Instruction*, std::vector<MDRef> > ForwardRefInstMetadata;
 
     SmallVector<Instruction*, 64> InstsWithTBAATag;
 
     // Type resolution handling data structures.  The location is set when we
     // have processed a use of the type but not a definition yet.
     StringMap<std::pair<Type*, LocTy> > NamedTypes;
-    std::vector<std::pair<Type*, LocTy> > NumberedTypes;
+    std::map<unsigned, std::pair<Type*, LocTy> > NumberedTypes;
 
-    std::vector<TrackingVH<MDNode> > NumberedMetadata;
-    std::map<unsigned, std::pair<TrackingVH<MDNode>, LocTy> > ForwardRefMDNodes;
+    std::map<unsigned, TrackingMDNodeRef> NumberedMetadata;
+    std::map<unsigned, std::pair<TempMDTuple, LocTy>> ForwardRefMDNodes;
 
     // Global Value reference information.
     std::map<std::string, std::pair<GlobalValue*, LocTy> > ForwardRefVals;
@@ -270,14 +265,21 @@ namespace llvm {
     bool ParseNamedMetadata();
     bool ParseMDString(MDString *&Result);
     bool ParseMDNodeID(MDNode *&Result);
-    bool ParseMDNodeID(MDNode *&Result, unsigned &SlotNo);
     bool ParseUnnamedAttrGrp();
     bool ParseFnAttributeValuePairs(AttrBuilder &B,
                                     std::vector<unsigned> &FwdRefAttrGrps,
                                     bool inAttrGrp, LocTy &BuiltinLoc);
 
     // Type Parsing.
-    bool ParseType(Type *&Result, bool AllowVoid = false);
+    bool ParseType(Type *&Result, const Twine &Msg, bool AllowVoid = false);
+    bool ParseType(Type *&Result, bool AllowVoid = false) {
+      return ParseType(Result, "expected type", AllowVoid);
+    }
+    bool ParseType(Type *&Result, const Twine &Msg, LocTy &Loc,
+                   bool AllowVoid = false) {
+      Loc = Lex.getLoc();
+      return ParseType(Result, Msg, AllowVoid);
+    }
     bool ParseType(Type *&Result, LocTy &Loc, bool AllowVoid = false) {
       Loc = Lex.getLoc();
       return ParseType(Result, AllowVoid);
@@ -381,12 +383,30 @@ namespace llvm {
     bool ParseGlobalValue(Type *Ty, Constant *&V);
     bool ParseGlobalTypeAndValue(Constant *&V);
     bool ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts);
-    bool parseOptionalComdat(Comdat *&C);
-    bool ParseMetadataListValue(ValID &ID, PerFunctionState *PFS);
-    bool ParseMetadataValue(ValID &ID, PerFunctionState *PFS);
-    bool ParseMDNodeVector(SmallVectorImpl<Value*> &, PerFunctionState *PFS);
+    bool parseOptionalComdat(StringRef GlobalName, Comdat *&C);
+    bool ParseMetadataAsValue(Value *&V, PerFunctionState &PFS);
+    bool ParseValueAsMetadata(Metadata *&MD, const Twine &TypeMsg,
+                              PerFunctionState *PFS);
+    bool ParseMetadata(Metadata *&MD, PerFunctionState *PFS);
+    bool ParseMDTuple(MDNode *&MD, bool IsDistinct = false);
+    bool ParseMDNode(MDNode *&MD);
+    bool ParseMDNodeTail(MDNode *&MD);
+    bool ParseMDNodeVector(SmallVectorImpl<Metadata *> &MDs);
     bool ParseInstructionMetadata(Instruction *Inst, PerFunctionState *PFS);
 
+    template <class FieldTy>
+    bool ParseMDField(LocTy Loc, StringRef Name, FieldTy &Result);
+    template <class FieldTy> bool ParseMDField(StringRef Name, FieldTy &Result);
+    template <class ParserTy>
+    bool ParseMDFieldsImplBody(ParserTy parseField);
+    template <class ParserTy>
+    bool ParseMDFieldsImpl(ParserTy parseField, LocTy &ClosingLoc);
+    bool ParseSpecializedMDNode(MDNode *&N, bool IsDistinct = false);
+
+#define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS)                                  \
+  bool Parse##CLASS(MDNode *&Result, bool IsDistinct);
+#include "llvm/IR/Metadata.def"
+
     // Function Parsing.
     struct ArgInfo {
       LocTy Loc;
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index f9821f7..a7aa17c 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -28,9 +28,8 @@ namespace lltok {
     lbrace, rbrace,    // {  }
     less, greater,     // <  >
     lparen, rparen,    // (  )
-    backslash,         // \    (not /)
     exclaim,           // !
-    hash,              // #
+    bar,               // |
 
     kw_x,
     kw_true,    kw_false,
@@ -83,6 +82,7 @@ namespace lltok {
     kw_inteldialect,
     kw_gc,
     kw_prefix,
+    kw_prologue,
     kw_c,
 
     kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
@@ -95,6 +95,7 @@ namespace lltok {
     kw_x86_64_sysvcc, kw_x86_64_win64cc,
     kw_webkit_jscc, kw_anyregcc,
     kw_preserve_mostcc, kw_preserve_allcc,
+    kw_ghccc,
 
     // Attributes:
     kw_attributes,
@@ -180,6 +181,9 @@ namespace lltok {
     kw_extractelement, kw_insertelement, kw_shufflevector,
     kw_extractvalue, kw_insertvalue, kw_blockaddress,
 
+    // Metadata types.
+    kw_distinct,
+
     // Use-list order directives.
     kw_uselistorder, kw_uselistorder_bb,
 
@@ -195,6 +199,12 @@ namespace lltok {
     LocalVar,          // %foo %"foo"
     MetadataVar,       // !foo
     StringConstant,    // "foo"
+    DwarfTag,          // DW_TAG_foo
+    DwarfAttEncoding,  // DW_ATE_foo
+    DwarfVirtuality,   // DW_VIRTUALITY_foo
+    DwarfLang,         // DW_LANG_foo
+    DwarfOp,           // DW_OP_foo
+    DIFlag,            // DIFlagFoo
 
     // Type valued tokens (TyVal).
     Type,
diff --git a/lib/AsmParser/Parser.cpp b/lib/AsmParser/Parser.cpp
index 0815907..ed1a753 100644
--- a/lib/AsmParser/Parser.cpp
+++ b/lib/AsmParser/Parser.cpp
@@ -38,7 +38,7 @@ std::unique_ptr<Module> llvm::parseAssembly(MemoryBufferRef F,
   if (parseAssemblyInto(F, *M, Err))
     return nullptr;
 
-  return std::move(M);
+  return M;
 }
 
 std::unique_ptr<Module> llvm::parseAssemblyFile(StringRef Filename,
diff --git a/lib/Bitcode/Reader/BitReader.cpp b/lib/Bitcode/Reader/BitReader.cpp
index 9b3acb5..868fbf0 100644
--- a/lib/Bitcode/Reader/BitReader.cpp
+++ b/lib/Bitcode/Reader/BitReader.cpp
@@ -9,9 +9,11 @@
 
 #include "llvm-c/BitReader.h"
 #include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
 #include <cstring>
 #include <string>
 
@@ -30,11 +32,20 @@ LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef,
                                    LLVMMemoryBufferRef MemBuf,
                                    LLVMModuleRef *OutModule,
                                    char **OutMessage) {
-  ErrorOr<Module *> ModuleOrErr =
-      parseBitcodeFile(unwrap(MemBuf)->getMemBufferRef(), *unwrap(ContextRef));
-  if (std::error_code EC = ModuleOrErr.getError()) {
-    if (OutMessage)
-      *OutMessage = strdup(EC.message().c_str());
+  MemoryBufferRef Buf = unwrap(MemBuf)->getMemBufferRef();
+  LLVMContext &Ctx = *unwrap(ContextRef);
+
+  std::string Message;
+  raw_string_ostream Stream(Message);
+  DiagnosticPrinterRawOStream DP(Stream);
+
+  ErrorOr<Module *> ModuleOrErr = parseBitcodeFile(
+      Buf, Ctx, [&](const DiagnosticInfo &DI) { DI.print(DP); });
+  if (ModuleOrErr.getError()) {
+    if (OutMessage) {
+      Stream.flush();
+      *OutMessage = strdup(Message.c_str());
+    }
     *OutModule = wrap((Module*)nullptr);
     return 1;
   }
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index b2ca22c..92af0f8 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -11,10 +11,13 @@
 #include "BitcodeReader.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Bitcode/LLVMBitCodes.h"
 #include "llvm/IR/AutoUpgrade.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
@@ -22,10 +25,10 @@
 #include "llvm/IR/OperandTraits.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/DataStream.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/ManagedStatic.h"
 
 using namespace llvm;
 
@@ -33,6 +36,61 @@ enum {
   SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex
 };
 
+BitcodeDiagnosticInfo::BitcodeDiagnosticInfo(std::error_code EC,
+                                             DiagnosticSeverity Severity,
+                                             const Twine &Msg)
+    : DiagnosticInfo(DK_Bitcode, Severity), Msg(Msg), EC(EC) {}
+
+void BitcodeDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; }
+
+static std::error_code Error(DiagnosticHandlerFunction DiagnosticHandler,
+                             std::error_code EC, const Twine &Message) {
+  BitcodeDiagnosticInfo DI(EC, DS_Error, Message);
+  DiagnosticHandler(DI);
+  return EC;
+}
+
+static std::error_code Error(DiagnosticHandlerFunction DiagnosticHandler,
+                             std::error_code EC) {
+  return Error(DiagnosticHandler, EC, EC.message());
+}
+
+std::error_code BitcodeReader::Error(BitcodeError E, const Twine &Message) {
+  return ::Error(DiagnosticHandler, make_error_code(E), Message);
+}
+
+std::error_code BitcodeReader::Error(const Twine &Message) {
+  return ::Error(DiagnosticHandler,
+                 make_error_code(BitcodeError::CorruptedBitcode), Message);
+}
+
+std::error_code BitcodeReader::Error(BitcodeError E) {
+  return ::Error(DiagnosticHandler, make_error_code(E));
+}
+
+static DiagnosticHandlerFunction getDiagHandler(DiagnosticHandlerFunction F,
+                                                LLVMContext &C) {
+  if (F)
+    return F;
+  return [&C](const DiagnosticInfo &DI) { C.diagnose(DI); };
+}
+
+BitcodeReader::BitcodeReader(MemoryBuffer *buffer, LLVMContext &C,
+                             DiagnosticHandlerFunction DiagnosticHandler)
+    : Context(C), DiagnosticHandler(getDiagHandler(DiagnosticHandler, C)),
+      TheModule(nullptr), Buffer(buffer), LazyStreamer(nullptr),
+      NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C),
+      MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false),
+      WillMaterializeAllForwardRefs(false) {}
+
+BitcodeReader::BitcodeReader(DataStreamer *streamer, LLVMContext &C,
+                             DiagnosticHandlerFunction DiagnosticHandler)
+    : Context(C), DiagnosticHandler(getDiagHandler(DiagnosticHandler, C)),
+      TheModule(nullptr), Buffer(nullptr), LazyStreamer(streamer),
+      NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C),
+      MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false),
+      WillMaterializeAllForwardRefs(false) {}
+
 std::error_code BitcodeReader::materializeForwardReferencedFunctions() {
   if (WillMaterializeAllForwardRefs)
     return std::error_code();
@@ -53,7 +111,7 @@ std::error_code BitcodeReader::materializeForwardReferencedFunctions() {
     // isn't a trivial way to check if a function will have a body without a
     // linear search through FunctionsWithBodies, so just check it here.
     if (!F->isMaterializable())
-      return Error(BitcodeError::NeverResolvedFunctionFromBlockAddress);
+      return Error("Never resolved function from blockaddress");
 
     // Try to materialize F.
     if (std::error_code EC = materialize(F))
@@ -100,26 +158,57 @@ static bool ConvertToString(ArrayRef<uint64_t> Record, unsigned Idx,
   return false;
 }
 
-static GlobalValue::LinkageTypes GetDecodedLinkage(unsigned Val) {
+static bool hasImplicitComdat(size_t Val) {
+  switch (Val) {
+  default:
+    return false;
+  case 1:  // Old WeakAnyLinkage
+  case 4:  // Old LinkOnceAnyLinkage
+  case 10: // Old WeakODRLinkage
+  case 11: // Old LinkOnceODRLinkage
+    return true;
+  }
+}
+
+static GlobalValue::LinkageTypes getDecodedLinkage(unsigned Val) {
   switch (Val) {
   default: // Map unknown/new linkages to external
-  case 0:  return GlobalValue::ExternalLinkage;
-  case 1:  return GlobalValue::WeakAnyLinkage;
-  case 2:  return GlobalValue::AppendingLinkage;
-  case 3:  return GlobalValue::InternalLinkage;
-  case 4:  return GlobalValue::LinkOnceAnyLinkage;
-  case 5:  return GlobalValue::ExternalLinkage; // Obsolete DLLImportLinkage
-  case 6:  return GlobalValue::ExternalLinkage; // Obsolete DLLExportLinkage
-  case 7:  return GlobalValue::ExternalWeakLinkage;
-  case 8:  return GlobalValue::CommonLinkage;
-  case 9:  return GlobalValue::PrivateLinkage;
-  case 10: return GlobalValue::WeakODRLinkage;
-  case 11: return GlobalValue::LinkOnceODRLinkage;
-  case 12: return GlobalValue::AvailableExternallyLinkage;
+  case 0:
+    return GlobalValue::ExternalLinkage;
+  case 2:
+    return GlobalValue::AppendingLinkage;
+  case 3:
+    return GlobalValue::InternalLinkage;
+  case 5:
+    return GlobalValue::ExternalLinkage; // Obsolete DLLImportLinkage
+  case 6:
+    return GlobalValue::ExternalLinkage; // Obsolete DLLExportLinkage
+  case 7:
+    return GlobalValue::ExternalWeakLinkage;
+  case 8:
+    return GlobalValue::CommonLinkage;
+  case 9:
+    return GlobalValue::PrivateLinkage;
+  case 12:
+    return GlobalValue::AvailableExternallyLinkage;
   case 13:
     return GlobalValue::PrivateLinkage; // Obsolete LinkerPrivateLinkage
   case 14:
     return GlobalValue::PrivateLinkage; // Obsolete LinkerPrivateWeakLinkage
+  case 15:
+    return GlobalValue::ExternalLinkage; // Obsolete LinkOnceODRAutoHideLinkage
+  case 1: // Old value with implicit comdat.
+  case 16:
+    return GlobalValue::WeakAnyLinkage;
+  case 10: // Old value with implicit comdat.
+  case 17:
+    return GlobalValue::WeakODRLinkage;
+  case 4: // Old value with implicit comdat.
+  case 18:
+    return GlobalValue::LinkOnceAnyLinkage;
+  case 11: // Old value with implicit comdat.
+  case 19:
+    return GlobalValue::LinkOnceODRLinkage;
   }
 }
 
@@ -261,7 +350,7 @@ namespace {
   /// @brief A class for maintaining the slot number definition
   /// as a placeholder for the actual definition for forward constants defs.
   class ConstantPlaceHolder : public ConstantExpr {
-    void operator=(const ConstantPlaceHolder &) LLVM_DELETED_FUNCTION;
+    void operator=(const ConstantPlaceHolder &) = delete;
   public:
     // allocate space for exactly one operand
     void *operator new(size_t s) {
@@ -280,7 +369,7 @@ namespace {
 
 
     /// Provide fast operand accessors
-    //DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+    DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
   };
 }
 
@@ -289,6 +378,7 @@ template <>
 struct OperandTraits<ConstantPlaceHolder> :
   public FixedNumOperandTraits<ConstantPlaceHolder, 1> {
 };
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantPlaceHolder, Value)
 }
 
 
@@ -437,43 +527,72 @@ void BitcodeReaderValueList::ResolveConstantForwardRefs() {
   }
 }
 
-void BitcodeReaderMDValueList::AssignValue(Value *V, unsigned Idx) {
+void BitcodeReaderMDValueList::AssignValue(Metadata *MD, unsigned Idx) {
   if (Idx == size()) {
-    push_back(V);
+    push_back(MD);
     return;
   }
 
   if (Idx >= size())
     resize(Idx+1);
 
-  WeakVH &OldV = MDValuePtrs[Idx];
-  if (!OldV) {
-    OldV = V;
+  TrackingMDRef &OldMD = MDValuePtrs[Idx];
+  if (!OldMD) {
+    OldMD.reset(MD);
     return;
   }
 
   // If there was a forward reference to this value, replace it.
-  MDNode *PrevVal = cast<MDNode>(OldV);
-  OldV->replaceAllUsesWith(V);
-  MDNode::deleteTemporary(PrevVal);
-  // Deleting PrevVal sets Idx value in MDValuePtrs to null. Set new
-  // value for Idx.
-  MDValuePtrs[Idx] = V;
+  TempMDTuple PrevMD(cast<MDTuple>(OldMD.get()));
+  PrevMD->replaceAllUsesWith(MD);
+  --NumFwdRefs;
 }
 
-Value *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) {
+Metadata *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) {
   if (Idx >= size())
     resize(Idx + 1);
 
-  if (Value *V = MDValuePtrs[Idx]) {
-    assert(V->getType()->isMetadataTy() && "Type mismatch in value table!");
-    return V;
+  if (Metadata *MD = MDValuePtrs[Idx])
+    return MD;
+
+  // Track forward refs to be resolved later.
+  if (AnyFwdRefs) {
+    MinFwdRef = std::min(MinFwdRef, Idx);
+    MaxFwdRef = std::max(MaxFwdRef, Idx);
+  } else {
+    AnyFwdRefs = true;
+    MinFwdRef = MaxFwdRef = Idx;
   }
+  ++NumFwdRefs;
 
   // Create and return a placeholder, which will later be RAUW'd.
-  Value *V = MDNode::getTemporary(Context, None);
-  MDValuePtrs[Idx] = V;
-  return V;
+  Metadata *MD = MDNode::getTemporary(Context, None).release();
+  MDValuePtrs[Idx].reset(MD);
+  return MD;
+}
+
+void BitcodeReaderMDValueList::tryToResolveCycles() {
+  if (!AnyFwdRefs)
+    // Nothing to do.
+    return;
+
+  if (NumFwdRefs)
+    // Still forward references... can't resolve cycles.
+    return;
+
+  // Resolve any cycles.
+  for (unsigned I = MinFwdRef, E = MaxFwdRef + 1; I != E; ++I) {
+    auto &MD = MDValuePtrs[I];
+    auto *N = dyn_cast_or_null<MDNode>(MD);
+    if (!N)
+      continue;
+
+    assert(!N->isTemporary() && "Unexpected forward reference");
+    N->resolveCycles();
+  }
+
+  // Make sure we return early again until there's another forward ref.
+  AnyFwdRefs = false;
 }
 
 Type *BitcodeReader::getTypeByID(unsigned ID) {
@@ -486,7 +605,20 @@ Type *BitcodeReader::getTypeByID(unsigned ID) {
 
   // If we have a forward reference, the only possible case is when it is to a
   // named struct.  Just create a placeholder for now.
-  return TypeList[ID] = StructType::create(Context);
+  return TypeList[ID] = createIdentifiedStructType(Context);
+}
+
+StructType *BitcodeReader::createIdentifiedStructType(LLVMContext &Context,
+                                                      StringRef Name) {
+  auto *Ret = StructType::create(Context, Name);
+  IdentifiedStructTypes.push_back(Ret);
+  return Ret;
+}
+
+StructType *BitcodeReader::createIdentifiedStructType(LLVMContext &Context) {
+  auto *Ret = StructType::create(Context);
+  IdentifiedStructTypes.push_back(Ret);
+  return Ret;
 }
 
 
@@ -516,10 +648,10 @@ static void decodeLLVMAttributesForBitcode(AttrBuilder &B,
 
 std::error_code BitcodeReader::ParseAttributeBlock() {
   if (Stream.EnterSubBlock(bitc::PARAMATTR_BLOCK_ID))
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
 
   if (!MAttributes.empty())
-    return Error(BitcodeError::InvalidMultipleBlocks);
+    return Error("Invalid multiple blocks");
 
   SmallVector<uint64_t, 64> Record;
 
@@ -532,7 +664,7 @@ std::error_code BitcodeReader::ParseAttributeBlock() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       return std::error_code();
     case BitstreamEntry::Record:
@@ -548,7 +680,7 @@ std::error_code BitcodeReader::ParseAttributeBlock() {
     case bitc::PARAMATTR_CODE_ENTRY_OLD: { // ENTRY: [paramidx0, attr0, ...]
       // FIXME: Remove in 4.0.
       if (Record.size() & 1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
         AttrBuilder B;
@@ -662,20 +794,31 @@ static Attribute::AttrKind GetAttrFromCode(uint64_t Code) {
   }
 }
 
+std::error_code BitcodeReader::parseAlignmentValue(uint64_t Exponent,
+                                                   unsigned &Alignment) {
+  // Note: Alignment in bitcode files is incremented by 1, so that zero
+  // can be used for default alignment.
+  if (Exponent > Value::MaxAlignmentExponent + 1)
+    return Error("Invalid alignment value");
+  Alignment = (1 << static_cast<unsigned>(Exponent)) >> 1;
+  return std::error_code();
+}
+
 std::error_code BitcodeReader::ParseAttrKind(uint64_t Code,
                                              Attribute::AttrKind *Kind) {
   *Kind = GetAttrFromCode(Code);
   if (*Kind == Attribute::None)
-    return Error(BitcodeError::InvalidValue);
+    return Error(BitcodeError::CorruptedBitcode,
+                 "Unknown attribute kind (" + Twine(Code) + ")");
   return std::error_code();
 }
 
 std::error_code BitcodeReader::ParseAttributeGroupBlock() {
   if (Stream.EnterSubBlock(bitc::PARAMATTR_GROUP_BLOCK_ID))
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
 
   if (!MAttributeGroups.empty())
-    return Error(BitcodeError::InvalidMultipleBlocks);
+    return Error("Invalid multiple blocks");
 
   SmallVector<uint64_t, 64> Record;
 
@@ -686,7 +829,7 @@ std::error_code BitcodeReader::ParseAttributeGroupBlock() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       return std::error_code();
     case BitstreamEntry::Record:
@@ -701,7 +844,7 @@ std::error_code BitcodeReader::ParseAttributeGroupBlock() {
       break;
     case bitc::PARAMATTR_GRP_CODE_ENTRY: { // ENTRY: [grpid, idx, a0, a1, ...]
       if (Record.size() < 3)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       uint64_t GrpID = Record[0];
       uint64_t Idx = Record[1]; // Index of the object this attribute refers to.
@@ -756,14 +899,14 @@ std::error_code BitcodeReader::ParseAttributeGroupBlock() {
 
 std::error_code BitcodeReader::ParseTypeTable() {
   if (Stream.EnterSubBlock(bitc::TYPE_BLOCK_ID_NEW))
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
 
   return ParseTypeTableBody();
 }
 
 std::error_code BitcodeReader::ParseTypeTableBody() {
   if (!TypeList.empty())
-    return Error(BitcodeError::InvalidMultipleBlocks);
+    return Error("Invalid multiple blocks");
 
   SmallVector<uint64_t, 64> Record;
   unsigned NumRecords = 0;
@@ -777,10 +920,10 @@ std::error_code BitcodeReader::ParseTypeTableBody() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       if (NumRecords != TypeList.size())
-        return Error(BitcodeError::MalformedBlock);
+        return Error("Malformed block");
       return std::error_code();
     case BitstreamEntry::Record:
       // The interesting case.
@@ -792,12 +935,12 @@ std::error_code BitcodeReader::ParseTypeTableBody() {
     Type *ResultTy = nullptr;
     switch (Stream.readRecord(Entry.ID, Record)) {
     default:
-      return Error(BitcodeError::InvalidValue);
+      return Error("Invalid value");
     case bitc::TYPE_CODE_NUMENTRY: // TYPE_CODE_NUMENTRY: [numentries]
       // TYPE_CODE_NUMENTRY contains a count of the number of types in the
       // type list.  This allows us to reserve space.
       if (Record.size() < 1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       TypeList.resize(Record[0]);
       continue;
     case bitc::TYPE_CODE_VOID:      // VOID
@@ -830,22 +973,27 @@ std::error_code BitcodeReader::ParseTypeTableBody() {
     case bitc::TYPE_CODE_X86_MMX:   // X86_MMX
       ResultTy = Type::getX86_MMXTy(Context);
       break;
-    case bitc::TYPE_CODE_INTEGER:   // INTEGER: [width]
+    case bitc::TYPE_CODE_INTEGER: { // INTEGER: [width]
       if (Record.size() < 1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
-      ResultTy = IntegerType::get(Context, Record[0]);
+      uint64_t NumBits = Record[0];
+      if (NumBits < IntegerType::MIN_INT_BITS ||
+          NumBits > IntegerType::MAX_INT_BITS)
+        return Error("Bitwidth for integer type out of range");
+      ResultTy = IntegerType::get(Context, NumBits);
       break;
+    }
     case bitc::TYPE_CODE_POINTER: { // POINTER: [pointee type] or
                                     //          [pointee type, address space]
       if (Record.size() < 1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       unsigned AddressSpace = 0;
       if (Record.size() == 2)
         AddressSpace = Record[1];
       ResultTy = getTypeByID(Record[0]);
       if (!ResultTy)
-        return Error(BitcodeError::InvalidType);
+        return Error("Invalid type");
       ResultTy = PointerType::get(ResultTy, AddressSpace);
       break;
     }
@@ -853,7 +1001,7 @@ std::error_code BitcodeReader::ParseTypeTableBody() {
       // FIXME: attrid is dead, remove it in LLVM 4.0
       // FUNCTION: [vararg, attrid, retty, paramty x N]
       if (Record.size() < 3)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       SmallVector<Type*, 8> ArgTys;
       for (unsigned i = 3, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -864,7 +1012,7 @@ std::error_code BitcodeReader::ParseTypeTableBody() {
 
       ResultTy = getTypeByID(Record[2]);
       if (!ResultTy || ArgTys.size() < Record.size()-3)
-        return Error(BitcodeError::InvalidType);
+        return Error("Invalid type");
 
       ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
       break;
@@ -872,7 +1020,7 @@ std::error_code BitcodeReader::ParseTypeTableBody() {
     case bitc::TYPE_CODE_FUNCTION: {
       // FUNCTION: [vararg, retty, paramty x N]
       if (Record.size() < 2)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       SmallVector<Type*, 8> ArgTys;
       for (unsigned i = 2, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -883,14 +1031,14 @@ std::error_code BitcodeReader::ParseTypeTableBody() {
 
       ResultTy = getTypeByID(Record[1]);
       if (!ResultTy || ArgTys.size() < Record.size()-2)
-        return Error(BitcodeError::InvalidType);
+        return Error("Invalid type");
 
       ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
       break;
     }
     case bitc::TYPE_CODE_STRUCT_ANON: {  // STRUCT: [ispacked, eltty x N]
       if (Record.size() < 1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       SmallVector<Type*, 8> EltTys;
       for (unsigned i = 1, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -899,21 +1047,21 @@ std::error_code BitcodeReader::ParseTypeTableBody() {
           break;
       }
       if (EltTys.size() != Record.size()-1)
-        return Error(BitcodeError::InvalidType);
+        return Error("Invalid type");
       ResultTy = StructType::get(Context, EltTys, Record[0]);
       break;
     }
     case bitc::TYPE_CODE_STRUCT_NAME:   // STRUCT_NAME: [strchr x N]
       if (ConvertToString(Record, 0, TypeName))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       continue;
 
     case bitc::TYPE_CODE_STRUCT_NAMED: { // STRUCT: [ispacked, eltty x N]
       if (Record.size() < 1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       if (NumRecords >= TypeList.size())
-        return Error(BitcodeError::InvalidTYPETable);
+        return Error("Invalid TYPE table");
 
       // Check to see if this was forward referenced, if so fill in the temp.
       StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
@@ -921,7 +1069,7 @@ std::error_code BitcodeReader::ParseTypeTableBody() {
         Res->setName(TypeName);
         TypeList[NumRecords] = nullptr;
       } else  // Otherwise, create a new struct.
-        Res = StructType::create(Context, TypeName);
+        Res = createIdentifiedStructType(Context, TypeName);
       TypeName.clear();
 
       SmallVector<Type*, 8> EltTys;
@@ -932,17 +1080,17 @@ std::error_code BitcodeReader::ParseTypeTableBody() {
           break;
       }
       if (EltTys.size() != Record.size()-1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Res->setBody(EltTys, Record[0]);
       ResultTy = Res;
       break;
     }
     case bitc::TYPE_CODE_OPAQUE: {       // OPAQUE: []
       if (Record.size() != 1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       if (NumRecords >= TypeList.size())
-        return Error(BitcodeError::InvalidTYPETable);
+        return Error("Invalid TYPE table");
 
       // Check to see if this was forward referenced, if so fill in the temp.
       StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
@@ -950,43 +1098,47 @@ std::error_code BitcodeReader::ParseTypeTableBody() {
         Res->setName(TypeName);
         TypeList[NumRecords] = nullptr;
       } else  // Otherwise, create a new struct with no body.
-        Res = StructType::create(Context, TypeName);
+        Res = createIdentifiedStructType(Context, TypeName);
       TypeName.clear();
       ResultTy = Res;
       break;
     }
     case bitc::TYPE_CODE_ARRAY:     // ARRAY: [numelts, eltty]
       if (Record.size() < 2)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       if ((ResultTy = getTypeByID(Record[1])))
         ResultTy = ArrayType::get(ResultTy, Record[0]);
       else
-        return Error(BitcodeError::InvalidType);
+        return Error("Invalid type");
       break;
     case bitc::TYPE_CODE_VECTOR:    // VECTOR: [numelts, eltty]
       if (Record.size() < 2)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       if ((ResultTy = getTypeByID(Record[1])))
         ResultTy = VectorType::get(ResultTy, Record[0]);
       else
-        return Error(BitcodeError::InvalidType);
+        return Error("Invalid type");
       break;
     }
 
     if (NumRecords >= TypeList.size())
-      return Error(BitcodeError::InvalidTYPETable);
+      return Error("Invalid TYPE table");
+    if (TypeList[NumRecords])
+      return Error(
+          "Invalid TYPE table: Only named structs can be forward referenced");
     assert(ResultTy && "Didn't read a type?");
-    assert(!TypeList[NumRecords] && "Already read type?");
     TypeList[NumRecords++] = ResultTy;
   }
 }
 
 std::error_code BitcodeReader::ParseValueSymbolTable() {
   if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID))
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
 
   SmallVector<uint64_t, 64> Record;
 
+  Triple TT(TheModule->getTargetTriple());
+
   // Read all the records for this value table.
   SmallString<128> ValueName;
   while (1) {
@@ -995,7 +1147,7 @@ std::error_code BitcodeReader::ParseValueSymbolTable() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       return std::error_code();
     case BitstreamEntry::Record:
@@ -1010,22 +1162,30 @@ std::error_code BitcodeReader::ParseValueSymbolTable() {
       break;
     case bitc::VST_CODE_ENTRY: {  // VST_ENTRY: [valueid, namechar x N]
       if (ConvertToString(Record, 1, ValueName))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       unsigned ValueID = Record[0];
       if (ValueID >= ValueList.size() || !ValueList[ValueID])
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Value *V = ValueList[ValueID];
 
       V->setName(StringRef(ValueName.data(), ValueName.size()));
+      if (auto *GO = dyn_cast<GlobalObject>(V)) {
+        if (GO->getComdat() == reinterpret_cast<Comdat *>(1)) {
+          if (TT.isOSBinFormatMachO())
+            GO->setComdat(nullptr);
+          else
+            GO->setComdat(TheModule->getOrInsertComdat(V->getName()));
+        }
+      }
       ValueName.clear();
       break;
     }
     case bitc::VST_CODE_BBENTRY: {
       if (ConvertToString(Record, 1, ValueName))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       BasicBlock *BB = getBasicBlock(Record[0]);
       if (!BB)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       BB->setName(StringRef(ValueName.data(), ValueName.size()));
       ValueName.clear();
@@ -1035,14 +1195,32 @@ std::error_code BitcodeReader::ParseValueSymbolTable() {
   }
 }
 
+static int64_t unrotateSign(uint64_t U) { return U & 1 ? ~(U >> 1) : U >> 1; }
+
 std::error_code BitcodeReader::ParseMetadata() {
   unsigned NextMDValueNo = MDValueList.size();
 
   if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID))
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
 
   SmallVector<uint64_t, 64> Record;
 
+  auto getMD =
+      [&](unsigned ID) -> Metadata *{ return MDValueList.getValueFwdRef(ID); };
+  auto getMDOrNull = [&](unsigned ID) -> Metadata *{
+    if (ID)
+      return getMD(ID - 1);
+    return nullptr;
+  };
+  auto getMDString = [&](unsigned ID) -> MDString *{
+    // This requires that the ID is not really a forward reference.  In
+    // particular, the MDString must already have been resolved.
+    return cast_or_null<MDString>(getMDOrNull(ID));
+  };
+
+#define GET_OR_DISTINCT(CLASS, DISTINCT, ARGS)                                 \
+  (DISTINCT ? CLASS::getDistinct ARGS : CLASS::get ARGS)
+
   // Read all the records.
   while (1) {
     BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
@@ -1050,18 +1228,19 @@ std::error_code BitcodeReader::ParseMetadata() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
+      MDValueList.tryToResolveCycles();
       return std::error_code();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
     }
 
-    bool IsFunctionLocal = false;
     // Read a record.
     Record.clear();
     unsigned Code = Stream.readRecord(Entry.ID, Record);
+    bool IsDistinct = false;
     switch (Code) {
     default:  // Default behavior: ignore.
       break;
@@ -1081,57 +1260,377 @@ std::error_code BitcodeReader::ParseMetadata() {
       for (unsigned i = 0; i != Size; ++i) {
         MDNode *MD = dyn_cast_or_null<MDNode>(MDValueList.getValueFwdRef(Record[i]));
         if (!MD)
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         NMD->addOperand(MD);
       }
       break;
     }
-    case bitc::METADATA_FN_NODE:
-      IsFunctionLocal = true;
-      // fall-through
-    case bitc::METADATA_NODE: {
+    case bitc::METADATA_OLD_FN_NODE: {
+      // FIXME: Remove in 4.0.
+      // This is a LocalAsMetadata record, the only type of function-local
+      // metadata.
       if (Record.size() % 2 == 1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
+
+      // If this isn't a LocalAsMetadata record, we're dropping it.  This used
+      // to be legal, but there's no upgrade path.
+      auto dropRecord = [&] {
+        MDValueList.AssignValue(MDNode::get(Context, None), NextMDValueNo++);
+      };
+      if (Record.size() != 2) {
+        dropRecord();
+        break;
+      }
+
+      Type *Ty = getTypeByID(Record[0]);
+      if (Ty->isMetadataTy() || Ty->isVoidTy()) {
+        dropRecord();
+        break;
+      }
+
+      MDValueList.AssignValue(
+          LocalAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_OLD_NODE: {
+      // FIXME: Remove in 4.0.
+      if (Record.size() % 2 == 1)
+        return Error("Invalid record");
 
       unsigned Size = Record.size();
-      SmallVector<Value*, 8> Elts;
+      SmallVector<Metadata *, 8> Elts;
       for (unsigned i = 0; i != Size; i += 2) {
         Type *Ty = getTypeByID(Record[i]);
         if (!Ty)
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         if (Ty->isMetadataTy())
           Elts.push_back(MDValueList.getValueFwdRef(Record[i+1]));
-        else if (!Ty->isVoidTy())
-          Elts.push_back(ValueList.getValueFwdRef(Record[i+1], Ty));
-        else
+        else if (!Ty->isVoidTy()) {
+          auto *MD =
+              ValueAsMetadata::get(ValueList.getValueFwdRef(Record[i + 1], Ty));
+          assert(isa<ConstantAsMetadata>(MD) &&
+                 "Expected non-function-local metadata");
+          Elts.push_back(MD);
+        } else
           Elts.push_back(nullptr);
       }
-      Value *V = MDNode::getWhenValsUnresolved(Context, Elts, IsFunctionLocal);
-      IsFunctionLocal = false;
-      MDValueList.AssignValue(V, NextMDValueNo++);
+      MDValueList.AssignValue(MDNode::get(Context, Elts), NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_VALUE: {
+      if (Record.size() != 2)
+        return Error("Invalid record");
+
+      Type *Ty = getTypeByID(Record[0]);
+      if (Ty->isMetadataTy() || Ty->isVoidTy())
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          ValueAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_DISTINCT_NODE:
+      IsDistinct = true;
+      // fallthrough...
+    case bitc::METADATA_NODE: {
+      SmallVector<Metadata *, 8> Elts;
+      Elts.reserve(Record.size());
+      for (unsigned ID : Record)
+        Elts.push_back(ID ? MDValueList.getValueFwdRef(ID - 1) : nullptr);
+      MDValueList.AssignValue(IsDistinct ? MDNode::getDistinct(Context, Elts)
+                                         : MDNode::get(Context, Elts),
+                              NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_LOCATION: {
+      if (Record.size() != 5)
+        return Error("Invalid record");
+
+      auto get = Record[0] ? MDLocation::getDistinct : MDLocation::get;
+      unsigned Line = Record[1];
+      unsigned Column = Record[2];
+      MDNode *Scope = cast<MDNode>(MDValueList.getValueFwdRef(Record[3]));
+      Metadata *InlinedAt =
+          Record[4] ? MDValueList.getValueFwdRef(Record[4] - 1) : nullptr;
+      MDValueList.AssignValue(get(Context, Line, Column, Scope, InlinedAt),
+                              NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_GENERIC_DEBUG: {
+      if (Record.size() < 4)
+        return Error("Invalid record");
+
+      unsigned Tag = Record[1];
+      unsigned Version = Record[2];
+
+      if (Tag >= 1u << 16 || Version != 0)
+        return Error("Invalid record");
+
+      auto *Header = getMDString(Record[3]);
+      SmallVector<Metadata *, 8> DwarfOps;
+      for (unsigned I = 4, E = Record.size(); I != E; ++I)
+        DwarfOps.push_back(Record[I] ? MDValueList.getValueFwdRef(Record[I] - 1)
+                                     : nullptr);
+      MDValueList.AssignValue(GET_OR_DISTINCT(GenericDebugNode, Record[0],
+                                              (Context, Tag, Header, DwarfOps)),
+                              NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_SUBRANGE: {
+      if (Record.size() != 3)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDSubrange, Record[0],
+                          (Context, Record[1], unrotateSign(Record[2]))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_ENUMERATOR: {
+      if (Record.size() != 3)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(GET_OR_DISTINCT(MDEnumerator, Record[0],
+                                              (Context, unrotateSign(Record[1]),
+                                               getMDString(Record[2]))),
+                              NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_BASIC_TYPE: {
+      if (Record.size() != 6)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDBasicType, Record[0],
+                          (Context, Record[1], getMDString(Record[2]),
+                           Record[3], Record[4], Record[5])),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_DERIVED_TYPE: {
+      if (Record.size() != 12)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDDerivedType, Record[0],
+                          (Context, Record[1], getMDString(Record[2]),
+                           getMDOrNull(Record[3]), Record[4],
+                           getMDOrNull(Record[5]), getMDOrNull(Record[6]),
+                           Record[7], Record[8], Record[9], Record[10],
+                           getMDOrNull(Record[11]))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_COMPOSITE_TYPE: {
+      if (Record.size() != 16)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDCompositeType, Record[0],
+                          (Context, Record[1], getMDString(Record[2]),
+                           getMDOrNull(Record[3]), Record[4],
+                           getMDOrNull(Record[5]), getMDOrNull(Record[6]),
+                           Record[7], Record[8], Record[9], Record[10],
+                           getMDOrNull(Record[11]), Record[12],
+                           getMDOrNull(Record[13]), getMDOrNull(Record[14]),
+                           getMDString(Record[15]))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_SUBROUTINE_TYPE: {
+      if (Record.size() != 3)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDSubroutineType, Record[0],
+                          (Context, Record[1], getMDOrNull(Record[2]))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_FILE: {
+      if (Record.size() != 3)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDFile, Record[0], (Context, getMDString(Record[1]),
+                                              getMDString(Record[2]))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_COMPILE_UNIT: {
+      if (Record.size() != 14)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDCompileUnit, Record[0],
+                          (Context, Record[1], getMDOrNull(Record[2]),
+                           getMDString(Record[3]), Record[4],
+                           getMDString(Record[5]), Record[6],
+                           getMDString(Record[7]), Record[8],
+                           getMDOrNull(Record[9]), getMDOrNull(Record[10]),
+                           getMDOrNull(Record[11]), getMDOrNull(Record[12]),
+                           getMDOrNull(Record[13]))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_SUBPROGRAM: {
+      if (Record.size() != 19)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(
+              MDSubprogram, Record[0],
+              (Context, getMDOrNull(Record[1]), getMDString(Record[2]),
+               getMDString(Record[3]), getMDOrNull(Record[4]), Record[5],
+               getMDOrNull(Record[6]), Record[7], Record[8], Record[9],
+               getMDOrNull(Record[10]), Record[11], Record[12], Record[13],
+               Record[14], getMDOrNull(Record[15]), getMDOrNull(Record[16]),
+               getMDOrNull(Record[17]), getMDOrNull(Record[18]))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_LEXICAL_BLOCK: {
+      if (Record.size() != 5)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDLexicalBlock, Record[0],
+                          (Context, getMDOrNull(Record[1]),
+                           getMDOrNull(Record[2]), Record[3], Record[4])),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_LEXICAL_BLOCK_FILE: {
+      if (Record.size() != 4)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDLexicalBlockFile, Record[0],
+                          (Context, getMDOrNull(Record[1]),
+                           getMDOrNull(Record[2]), Record[3])),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_NAMESPACE: {
+      if (Record.size() != 5)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDNamespace, Record[0],
+                          (Context, getMDOrNull(Record[1]),
+                           getMDOrNull(Record[2]), getMDString(Record[3]),
+                           Record[4])),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_TEMPLATE_TYPE: {
+      if (Record.size() != 3)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(GET_OR_DISTINCT(MDTemplateTypeParameter,
+                                              Record[0],
+                                              (Context, getMDString(Record[1]),
+                                               getMDOrNull(Record[2]))),
+                              NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_TEMPLATE_VALUE: {
+      if (Record.size() != 5)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDTemplateValueParameter, Record[0],
+                          (Context, Record[1], getMDString(Record[2]),
+                           getMDOrNull(Record[3]), getMDOrNull(Record[4]))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_GLOBAL_VAR: {
+      if (Record.size() != 11)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDGlobalVariable, Record[0],
+                          (Context, getMDOrNull(Record[1]),
+                           getMDString(Record[2]), getMDString(Record[3]),
+                           getMDOrNull(Record[4]), Record[5],
+                           getMDOrNull(Record[6]), Record[7], Record[8],
+                           getMDOrNull(Record[9]), getMDOrNull(Record[10]))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_LOCAL_VAR: {
+      if (Record.size() != 10)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDLocalVariable, Record[0],
+                          (Context, Record[1], getMDOrNull(Record[2]),
+                           getMDString(Record[3]), getMDOrNull(Record[4]),
+                           Record[5], getMDOrNull(Record[6]), Record[7],
+                           Record[8], getMDOrNull(Record[9]))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_EXPRESSION: {
+      if (Record.size() < 1)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDExpression, Record[0],
+                          (Context, makeArrayRef(Record).slice(1))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_OBJC_PROPERTY: {
+      if (Record.size() != 8)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDObjCProperty, Record[0],
+                          (Context, getMDString(Record[1]),
+                           getMDOrNull(Record[2]), Record[3],
+                           getMDString(Record[4]), getMDString(Record[5]),
+                           Record[6], getMDOrNull(Record[7]))),
+          NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_IMPORTED_ENTITY: {
+      if (Record.size() != 6)
+        return Error("Invalid record");
+
+      MDValueList.AssignValue(
+          GET_OR_DISTINCT(MDImportedEntity, Record[0],
+                          (Context, Record[1], getMDOrNull(Record[2]),
+                           getMDOrNull(Record[3]), Record[4],
+                           getMDString(Record[5]))),
+          NextMDValueNo++);
       break;
     }
     case bitc::METADATA_STRING: {
       std::string String(Record.begin(), Record.end());
       llvm::UpgradeMDStringConstant(String);
-      Value *V = MDString::get(Context, String);
-      MDValueList.AssignValue(V, NextMDValueNo++);
+      Metadata *MD = MDString::get(Context, String);
+      MDValueList.AssignValue(MD, NextMDValueNo++);
       break;
     }
     case bitc::METADATA_KIND: {
       if (Record.size() < 2)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       unsigned Kind = Record[0];
       SmallString<8> Name(Record.begin()+1, Record.end());
 
       unsigned NewKind = TheModule->getMDKindID(Name.str());
       if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second)
-        return Error(BitcodeError::ConflictingMETADATA_KINDRecords);
+        return Error("Conflicting METADATA_KIND records");
       break;
     }
     }
   }
+#undef GET_OR_DISTINCT
 }
 
 /// decodeSignRotatedValue - Decode a signed value stored with the sign bit in
@@ -1151,10 +1650,12 @@ std::error_code BitcodeReader::ResolveGlobalAndAliasInits() {
   std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInitWorklist;
   std::vector<std::pair<GlobalAlias*, unsigned> > AliasInitWorklist;
   std::vector<std::pair<Function*, unsigned> > FunctionPrefixWorklist;
+  std::vector<std::pair<Function*, unsigned> > FunctionPrologueWorklist;
 
   GlobalInitWorklist.swap(GlobalInits);
   AliasInitWorklist.swap(AliasInits);
   FunctionPrefixWorklist.swap(FunctionPrefixes);
+  FunctionPrologueWorklist.swap(FunctionPrologues);
 
   while (!GlobalInitWorklist.empty()) {
     unsigned ValID = GlobalInitWorklist.back().second;
@@ -1165,7 +1666,7 @@ std::error_code BitcodeReader::ResolveGlobalAndAliasInits() {
       if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
         GlobalInitWorklist.back().first->setInitializer(C);
       else
-        return Error(BitcodeError::ExpectedConstant);
+        return Error("Expected a constant");
     }
     GlobalInitWorklist.pop_back();
   }
@@ -1178,7 +1679,7 @@ std::error_code BitcodeReader::ResolveGlobalAndAliasInits() {
       if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
         AliasInitWorklist.back().first->setAliasee(C);
       else
-        return Error(BitcodeError::ExpectedConstant);
+        return Error("Expected a constant");
     }
     AliasInitWorklist.pop_back();
   }
@@ -1191,11 +1692,24 @@ std::error_code BitcodeReader::ResolveGlobalAndAliasInits() {
       if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
         FunctionPrefixWorklist.back().first->setPrefixData(C);
       else
-        return Error(BitcodeError::ExpectedConstant);
+        return Error("Expected a constant");
     }
     FunctionPrefixWorklist.pop_back();
   }
 
+  while (!FunctionPrologueWorklist.empty()) {
+    unsigned ValID = FunctionPrologueWorklist.back().second;
+    if (ValID >= ValueList.size()) {
+      FunctionPrologues.push_back(FunctionPrologueWorklist.back());
+    } else {
+      if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
+        FunctionPrologueWorklist.back().first->setPrologueData(C);
+      else
+        return Error("Expected a constant");
+    }
+    FunctionPrologueWorklist.pop_back();
+  }
+
   return std::error_code();
 }
 
@@ -1209,7 +1723,7 @@ static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
 
 std::error_code BitcodeReader::ParseConstants() {
   if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID))
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
 
   SmallVector<uint64_t, 64> Record;
 
@@ -1222,10 +1736,10 @@ std::error_code BitcodeReader::ParseConstants() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       if (NextCstNo != ValueList.size())
-        return Error(BitcodeError::InvalidConstantReference);
+        return Error("Invalid ronstant reference");
 
       // Once all the constants have been read, go through and resolve forward
       // references.
@@ -1247,9 +1761,9 @@ std::error_code BitcodeReader::ParseConstants() {
       break;
     case bitc::CST_CODE_SETTYPE:   // SETTYPE: [typeid]
       if (Record.empty())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       if (Record[0] >= TypeList.size() || !TypeList[Record[0]])
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       CurTy = TypeList[Record[0]];
       continue;  // Skip the ValueList manipulation.
     case bitc::CST_CODE_NULL:      // NULL
@@ -1257,12 +1771,12 @@ std::error_code BitcodeReader::ParseConstants() {
       break;
     case bitc::CST_CODE_INTEGER:   // INTEGER: [intval]
       if (!CurTy->isIntegerTy() || Record.empty())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0]));
       break;
     case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval]
       if (!CurTy->isIntegerTy() || Record.empty())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       APInt VInt = ReadWideAPInt(Record,
                                  cast<IntegerType>(CurTy)->getBitWidth());
@@ -1272,7 +1786,7 @@ std::error_code BitcodeReader::ParseConstants() {
     }
     case bitc::CST_CODE_FLOAT: {    // FLOAT: [fpval]
       if (Record.empty())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       if (CurTy->isHalfTy())
         V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf,
                                              APInt(16, (uint16_t)Record[0])));
@@ -1302,7 +1816,7 @@ std::error_code BitcodeReader::ParseConstants() {
 
     case bitc::CST_CODE_AGGREGATE: {// AGGREGATE: [n x value number]
       if (Record.empty())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       unsigned Size = Record.size();
       SmallVector<Constant*, 16> Elts;
@@ -1330,7 +1844,7 @@ std::error_code BitcodeReader::ParseConstants() {
     case bitc::CST_CODE_STRING:    // STRING: [values]
     case bitc::CST_CODE_CSTRING: { // CSTRING: [values]
       if (Record.empty())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       SmallString<16> Elts(Record.begin(), Record.end());
       V = ConstantDataArray::getString(Context, Elts,
@@ -1339,7 +1853,7 @@ std::error_code BitcodeReader::ParseConstants() {
     }
     case bitc::CST_CODE_DATA: {// DATA: [n x value]
       if (Record.empty())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       Type *EltTy = cast<SequentialType>(CurTy)->getElementType();
       unsigned Size = Record.size();
@@ -1384,14 +1898,14 @@ std::error_code BitcodeReader::ParseConstants() {
         else
           V = ConstantDataArray::get(Context, Elts);
       } else {
-        return Error(BitcodeError::InvalidTypeForValue);
+        return Error("Invalid type for value");
       }
       break;
     }
 
     case bitc::CST_CODE_CE_BINOP: {  // CE_BINOP: [opcode, opval, opval]
       if (Record.size() < 3)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       int Opc = GetDecodedBinaryOpcode(Record[0], CurTy);
       if (Opc < 0) {
         V = UndefValue::get(CurTy);  // Unknown binop.
@@ -1422,14 +1936,14 @@ std::error_code BitcodeReader::ParseConstants() {
     }
     case bitc::CST_CODE_CE_CAST: {  // CE_CAST: [opcode, opty, opval]
       if (Record.size() < 3)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       int Opc = GetDecodedCastOpcode(Record[0]);
       if (Opc < 0) {
         V = UndefValue::get(CurTy);  // Unknown cast.
       } else {
         Type *OpTy = getTypeByID(Record[1]);
         if (!OpTy)
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy);
         V = UpgradeBitCastExpr(Opc, Op, CurTy);
         if (!V) V = ConstantExpr::getCast(Opc, Op, CurTy);
@@ -1439,12 +1953,12 @@ std::error_code BitcodeReader::ParseConstants() {
     case bitc::CST_CODE_CE_INBOUNDS_GEP:
     case bitc::CST_CODE_CE_GEP: {  // CE_GEP:        [n x operands]
       if (Record.size() & 1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       SmallVector<Constant*, 16> Elts;
       for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
         Type *ElTy = getTypeByID(Record[i]);
         if (!ElTy)
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         Elts.push_back(ValueList.getConstantFwdRef(Record[i+1], ElTy));
       }
       ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
@@ -1455,7 +1969,7 @@ std::error_code BitcodeReader::ParseConstants() {
     }
     case bitc::CST_CODE_CE_SELECT: {  // CE_SELECT: [opval#, opval#, opval#]
       if (Record.size() < 3)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       Type *SelectorTy = Type::getInt1Ty(Context);
 
@@ -1474,22 +1988,22 @@ std::error_code BitcodeReader::ParseConstants() {
     case bitc::CST_CODE_CE_EXTRACTELT
         : { // CE_EXTRACTELT: [opty, opval, opty, opval]
       if (Record.size() < 3)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       VectorType *OpTy =
         dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
       if (!OpTy)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = nullptr;
       if (Record.size() == 4) {
         Type *IdxTy = getTypeByID(Record[2]);
         if (!IdxTy)
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         Op1 = ValueList.getConstantFwdRef(Record[3], IdxTy);
       } else // TODO: Remove with llvm 4.0
         Op1 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context));
       if (!Op1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       V = ConstantExpr::getExtractElement(Op0, Op1);
       break;
     }
@@ -1497,7 +2011,7 @@ std::error_code BitcodeReader::ParseConstants() {
         : { // CE_INSERTELT: [opval, opval, opty, opval]
       VectorType *OpTy = dyn_cast<VectorType>(CurTy);
       if (Record.size() < 3 || !OpTy)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[1],
                                                   OpTy->getElementType());
@@ -1505,19 +2019,19 @@ std::error_code BitcodeReader::ParseConstants() {
       if (Record.size() == 4) {
         Type *IdxTy = getTypeByID(Record[2]);
         if (!IdxTy)
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         Op2 = ValueList.getConstantFwdRef(Record[3], IdxTy);
       } else // TODO: Remove with llvm 4.0
         Op2 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context));
       if (!Op2)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       V = ConstantExpr::getInsertElement(Op0, Op1, Op2);
       break;
     }
     case bitc::CST_CODE_CE_SHUFFLEVEC: { // CE_SHUFFLEVEC: [opval, opval, opval]
       VectorType *OpTy = dyn_cast<VectorType>(CurTy);
       if (Record.size() < 3 || !OpTy)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
@@ -1531,7 +2045,7 @@ std::error_code BitcodeReader::ParseConstants() {
       VectorType *OpTy =
         dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
       if (Record.size() < 4 || !RTy || !OpTy)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
       Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
@@ -1542,10 +2056,10 @@ std::error_code BitcodeReader::ParseConstants() {
     }
     case bitc::CST_CODE_CE_CMP: {     // CE_CMP: [opty, opval, opval, pred]
       if (Record.size() < 4)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Type *OpTy = getTypeByID(Record[0]);
       if (!OpTy)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
 
@@ -1559,16 +2073,16 @@ std::error_code BitcodeReader::ParseConstants() {
     // FIXME: Remove with the 4.0 release.
     case bitc::CST_CODE_INLINEASM_OLD: {
       if (Record.size() < 2)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       std::string AsmStr, ConstrStr;
       bool HasSideEffects = Record[0] & 1;
       bool IsAlignStack = Record[0] >> 1;
       unsigned AsmStrSize = Record[1];
       if (2+AsmStrSize >= Record.size())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       unsigned ConstStrSize = Record[2+AsmStrSize];
       if (3+AsmStrSize+ConstStrSize > Record.size())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       for (unsigned i = 0; i != AsmStrSize; ++i)
         AsmStr += (char)Record[2+i];
@@ -1583,17 +2097,17 @@ std::error_code BitcodeReader::ParseConstants() {
     // inteldialect).
     case bitc::CST_CODE_INLINEASM: {
       if (Record.size() < 2)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       std::string AsmStr, ConstrStr;
       bool HasSideEffects = Record[0] & 1;
       bool IsAlignStack = (Record[0] >> 1) & 1;
       unsigned AsmDialect = Record[0] >> 2;
       unsigned AsmStrSize = Record[1];
       if (2+AsmStrSize >= Record.size())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       unsigned ConstStrSize = Record[2+AsmStrSize];
       if (3+AsmStrSize+ConstStrSize > Record.size())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       for (unsigned i = 0; i != AsmStrSize; ++i)
         AsmStr += (char)Record[2+i];
@@ -1607,14 +2121,14 @@ std::error_code BitcodeReader::ParseConstants() {
     }
     case bitc::CST_CODE_BLOCKADDRESS:{
       if (Record.size() < 3)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Type *FnTy = getTypeByID(Record[0]);
       if (!FnTy)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Function *Fn =
         dyn_cast_or_null<Function>(ValueList.getConstantFwdRef(Record[1],FnTy));
       if (!Fn)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       // Don't let Fn get dematerialized.
       BlockAddressesTaken.insert(Fn);
@@ -1625,12 +2139,12 @@ std::error_code BitcodeReader::ParseConstants() {
       unsigned BBID = Record[2];
       if (!BBID)
         // Invalid reference to entry block.
-        return Error(BitcodeError::InvalidID);
+        return Error("Invalid ID");
       if (!Fn->empty()) {
         Function::iterator BBI = Fn->begin(), BBE = Fn->end();
         for (size_t I = 0, E = BBID; I != E; ++I) {
           if (BBI == BBE)
-            return Error(BitcodeError::InvalidID);
+            return Error("Invalid ID");
           ++BBI;
         }
         BB = BBI;
@@ -1658,7 +2172,7 @@ std::error_code BitcodeReader::ParseConstants() {
 
 std::error_code BitcodeReader::ParseUseLists() {
   if (Stream.EnterSubBlock(bitc::USELIST_BLOCK_ID))
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
 
   // Read all the records.
   SmallVector<uint64_t, 64> Record;
@@ -1668,7 +2182,7 @@ std::error_code BitcodeReader::ParseUseLists() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       return std::error_code();
     case BitstreamEntry::Record:
@@ -1689,7 +2203,7 @@ std::error_code BitcodeReader::ParseUseLists() {
       unsigned RecordLength = Record.size();
       if (RecordLength < 3)
         // Records should have at least an ID and two indexes.
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       unsigned ID = Record.back();
       Record.pop_back();
 
@@ -1726,7 +2240,7 @@ std::error_code BitcodeReader::ParseUseLists() {
 std::error_code BitcodeReader::RememberAndSkipFunctionBody() {
   // Get the function we are talking about.
   if (FunctionsWithBodies.empty())
-    return Error(BitcodeError::InsufficientFunctionProtos);
+    return Error("Insufficient function protos");
 
   Function *Fn = FunctionsWithBodies.back();
   FunctionsWithBodies.pop_back();
@@ -1737,7 +2251,7 @@ std::error_code BitcodeReader::RememberAndSkipFunctionBody() {
 
   // Skip over the function block for now.
   if (Stream.SkipBlock())
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
   return std::error_code();
 }
 
@@ -1745,7 +2259,7 @@ std::error_code BitcodeReader::GlobalCleanup() {
   // Patch the initializers for globals and aliases up.
   ResolveGlobalAndAliasInits();
   if (!GlobalInits.empty() || !AliasInits.empty())
-    return Error(BitcodeError::MalformedGlobalInitializerSet);
+    return Error("Malformed global initializer set");
 
   // Look for intrinsic functions which need to be upgraded at some point
   for (Module::iterator FI = TheModule->begin(), FE = TheModule->end();
@@ -1774,7 +2288,7 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
   if (Resume)
     Stream.JumpToBit(NextUnreadBit);
   else if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
 
   SmallVector<uint64_t, 64> Record;
   std::vector<std::string> SectionTable;
@@ -1786,7 +2300,7 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       return GlobalCleanup();
 
@@ -1794,11 +2308,11 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
       switch (Entry.ID) {
       default:  // Skip unknown content.
         if (Stream.SkipBlock())
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         break;
       case bitc::BLOCKINFO_BLOCK_ID:
         if (Stream.ReadBlockInfoBlock())
-          return Error(BitcodeError::MalformedBlock);
+          return Error("Malformed block");
         break;
       case bitc::PARAMATTR_BLOCK_ID:
         if (std::error_code EC = ParseAttributeBlock())
@@ -1868,12 +2382,12 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
     default: break;  // Default behavior, ignore unknown content.
     case bitc::MODULE_CODE_VERSION: {  // VERSION: [version#]
       if (Record.size() < 1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       // Only version #0 and #1 are supported so far.
       unsigned module_version = Record[0];
       switch (module_version) {
         default:
-          return Error(BitcodeError::InvalidValue);
+          return Error("Invalid value");
         case 0:
           UseRelativeIDs = false;
           break;
@@ -1886,21 +2400,21 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
     case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       TheModule->setTargetTriple(S);
       break;
     }
     case bitc::MODULE_CODE_DATALAYOUT: {  // DATALAYOUT: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       TheModule->setDataLayout(S);
       break;
     }
     case bitc::MODULE_CODE_ASM: {  // ASM: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       TheModule->setModuleInlineAsm(S);
       break;
     }
@@ -1908,27 +2422,27 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
       // FIXME: Remove in 4.0.
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       // Ignore value.
       break;
     }
     case bitc::MODULE_CODE_SECTIONNAME: {  // SECTIONNAME: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       SectionTable.push_back(S);
       break;
     }
     case bitc::MODULE_CODE_GCNAME: {  // SECTIONNAME: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       GCTable.push_back(S);
       break;
     }
     case bitc::MODULE_CODE_COMDAT: { // COMDAT: [selection_kind, name]
       if (Record.size() < 2)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Comdat::SelectionKind SK = getDecodedComdatSelectionKind(Record[0]);
       unsigned ComdatNameSize = Record[1];
       std::string ComdatName;
@@ -1942,25 +2456,29 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
     }
     // GLOBALVAR: [pointer type, isconst, initid,
     //             linkage, alignment, section, visibility, threadlocal,
-    //             unnamed_addr, dllstorageclass]
+    //             unnamed_addr, externally_initialized, dllstorageclass,
+    //             comdat]
     case bitc::MODULE_CODE_GLOBALVAR: {
       if (Record.size() < 6)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Type *Ty = getTypeByID(Record[0]);
       if (!Ty)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       if (!Ty->isPointerTy())
-        return Error(BitcodeError::InvalidTypeForValue);
+        return Error("Invalid type for value");
       unsigned AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
       Ty = cast<PointerType>(Ty)->getElementType();
 
       bool isConstant = Record[1];
-      GlobalValue::LinkageTypes Linkage = GetDecodedLinkage(Record[3]);
-      unsigned Alignment = (1 << Record[4]) >> 1;
+      uint64_t RawLinkage = Record[3];
+      GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
+      unsigned Alignment;
+      if (std::error_code EC = parseAlignmentValue(Record[4], Alignment))
+        return EC;
       std::string Section;
       if (Record[5]) {
         if (Record[5]-1 >= SectionTable.size())
-          return Error(BitcodeError::InvalidID);
+          return Error("Invalid ID");
         Section = SectionTable[Record[5]-1];
       }
       GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
@@ -1993,7 +2511,7 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
       if (Record.size() > 10)
         NewGV->setDLLStorageClass(GetDecodedDLLStorageClass(Record[10]));
       else
-        UpgradeDLLImportExportLinkage(NewGV, Record[3]);
+        UpgradeDLLImportExportLinkage(NewGV, RawLinkage);
 
       ValueList.push_back(NewGV);
 
@@ -2001,41 +2519,48 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
       if (unsigned InitID = Record[2])
         GlobalInits.push_back(std::make_pair(NewGV, InitID-1));
 
-      if (Record.size() > 11)
+      if (Record.size() > 11) {
         if (unsigned ComdatID = Record[11]) {
           assert(ComdatID <= ComdatList.size());
           NewGV->setComdat(ComdatList[ComdatID - 1]);
         }
+      } else if (hasImplicitComdat(RawLinkage)) {
+        NewGV->setComdat(reinterpret_cast<Comdat *>(1));
+      }
       break;
     }
     // FUNCTION:  [type, callingconv, isproto, linkage, paramattr,
     //             alignment, section, visibility, gc, unnamed_addr,
-    //             dllstorageclass]
+    //             prologuedata, dllstorageclass, comdat, prefixdata]
     case bitc::MODULE_CODE_FUNCTION: {
       if (Record.size() < 8)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Type *Ty = getTypeByID(Record[0]);
       if (!Ty)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       if (!Ty->isPointerTy())
-        return Error(BitcodeError::InvalidTypeForValue);
+        return Error("Invalid type for value");
       FunctionType *FTy =
         dyn_cast<FunctionType>(cast<PointerType>(Ty)->getElementType());
       if (!FTy)
-        return Error(BitcodeError::InvalidTypeForValue);
+        return Error("Invalid type for value");
 
       Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage,
                                         "", TheModule);
 
       Func->setCallingConv(static_cast<CallingConv::ID>(Record[1]));
       bool isProto = Record[2];
-      Func->setLinkage(GetDecodedLinkage(Record[3]));
+      uint64_t RawLinkage = Record[3];
+      Func->setLinkage(getDecodedLinkage(RawLinkage));
       Func->setAttributes(getAttributes(Record[4]));
 
-      Func->setAlignment((1 << Record[5]) >> 1);
+      unsigned Alignment;
+      if (std::error_code EC = parseAlignmentValue(Record[5], Alignment))
+        return EC;
+      Func->setAlignment(Alignment);
       if (Record[6]) {
         if (Record[6]-1 >= SectionTable.size())
-          return Error(BitcodeError::InvalidID);
+          return Error("Invalid ID");
         Func->setSection(SectionTable[Record[6]-1]);
       }
       // Local linkage must have default visibility.
@@ -2044,7 +2569,7 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
         Func->setVisibility(GetDecodedVisibility(Record[7]));
       if (Record.size() > 8 && Record[8]) {
         if (Record[8]-1 > GCTable.size())
-          return Error(BitcodeError::InvalidID);
+          return Error("Invalid ID");
         Func->setGC(GCTable[Record[8]-1].c_str());
       }
       bool UnnamedAddr = false;
@@ -2052,18 +2577,24 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
         UnnamedAddr = Record[9];
       Func->setUnnamedAddr(UnnamedAddr);
       if (Record.size() > 10 && Record[10] != 0)
-        FunctionPrefixes.push_back(std::make_pair(Func, Record[10]-1));
+        FunctionPrologues.push_back(std::make_pair(Func, Record[10]-1));
 
       if (Record.size() > 11)
         Func->setDLLStorageClass(GetDecodedDLLStorageClass(Record[11]));
       else
-        UpgradeDLLImportExportLinkage(Func, Record[3]);
+        UpgradeDLLImportExportLinkage(Func, RawLinkage);
 
-      if (Record.size() > 12)
+      if (Record.size() > 12) {
         if (unsigned ComdatID = Record[12]) {
           assert(ComdatID <= ComdatList.size());
           Func->setComdat(ComdatList[ComdatID - 1]);
         }
+      } else if (hasImplicitComdat(RawLinkage)) {
+        Func->setComdat(reinterpret_cast<Comdat *>(1));
+      }
+
+      if (Record.size() > 13 && Record[13] != 0)
+        FunctionPrefixes.push_back(std::make_pair(Func, Record[13]-1));
 
       ValueList.push_back(Func);
 
@@ -2081,17 +2612,17 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
     // ALIAS: [alias type, aliasee val#, linkage, visibility, dllstorageclass]
     case bitc::MODULE_CODE_ALIAS: {
       if (Record.size() < 3)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Type *Ty = getTypeByID(Record[0]);
       if (!Ty)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       auto *PTy = dyn_cast<PointerType>(Ty);
       if (!PTy)
-        return Error(BitcodeError::InvalidTypeForValue);
+        return Error("Invalid type for value");
 
       auto *NewGA =
           GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
-                              GetDecodedLinkage(Record[2]), "", TheModule);
+                              getDecodedLinkage(Record[2]), "", TheModule);
       // Old bitcode files didn't have visibility field.
       // Local linkage must have default visibility.
       if (Record.size() > 3 && !NewGA->hasLocalLinkage())
@@ -2113,7 +2644,7 @@ std::error_code BitcodeReader::ParseModule(bool Resume) {
     case bitc::MODULE_CODE_PURGEVALS:
       // Trim down the value list to the specified size.
       if (Record.size() < 1 || Record[0] > ValueList.size())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       ValueList.shrinkTo(Record[0]);
       break;
     }
@@ -2134,7 +2665,7 @@ std::error_code BitcodeReader::ParseBitcodeInto(Module *M) {
       Stream.Read(4) != 0xC ||
       Stream.Read(4) != 0xE ||
       Stream.Read(4) != 0xD)
-    return Error(BitcodeError::InvalidBitcodeSignature);
+    return Error("Invalid bitcode signature");
 
   // We expect a number of well-defined blocks, though we don't necessarily
   // need to understand them all.
@@ -2147,7 +2678,7 @@ std::error_code BitcodeReader::ParseBitcodeInto(Module *M) {
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       return std::error_code();
 
@@ -2155,12 +2686,12 @@ std::error_code BitcodeReader::ParseBitcodeInto(Module *M) {
       switch (Entry.ID) {
       case bitc::BLOCKINFO_BLOCK_ID:
         if (Stream.ReadBlockInfoBlock())
-          return Error(BitcodeError::MalformedBlock);
+          return Error("Malformed block");
         break;
       case bitc::MODULE_BLOCK_ID:
         // Reject multiple MODULE_BLOCK's in a single bitstream.
         if (TheModule)
-          return Error(BitcodeError::InvalidMultipleBlocks);
+          return Error("Invalid multiple blocks");
         TheModule = M;
         if (std::error_code EC = ParseModule(false))
           return EC;
@@ -2169,7 +2700,7 @@ std::error_code BitcodeReader::ParseBitcodeInto(Module *M) {
         break;
       default:
         if (Stream.SkipBlock())
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         break;
       }
       continue;
@@ -2184,14 +2715,14 @@ std::error_code BitcodeReader::ParseBitcodeInto(Module *M) {
           Stream.AtEndOfStream())
         return std::error_code();
 
-      return Error(BitcodeError::InvalidRecord);
+      return Error("Invalid record");
     }
   }
 }
 
 ErrorOr<std::string> BitcodeReader::parseModuleTriple() {
   if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
 
   SmallVector<uint64_t, 64> Record;
 
@@ -2203,7 +2734,7 @@ ErrorOr<std::string> BitcodeReader::parseModuleTriple() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       return Triple;
     case BitstreamEntry::Record:
@@ -2217,7 +2748,7 @@ ErrorOr<std::string> BitcodeReader::parseModuleTriple() {
     case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Triple = S;
       break;
     }
@@ -2238,7 +2769,7 @@ ErrorOr<std::string> BitcodeReader::parseTriple() {
       Stream.Read(4) != 0xC ||
       Stream.Read(4) != 0xE ||
       Stream.Read(4) != 0xD)
-    return Error(BitcodeError::InvalidBitcodeSignature);
+    return Error("Invalid bitcode signature");
 
   // We expect a number of well-defined blocks, though we don't necessarily
   // need to understand them all.
@@ -2247,7 +2778,7 @@ ErrorOr<std::string> BitcodeReader::parseTriple() {
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       return std::error_code();
 
@@ -2257,7 +2788,7 @@ ErrorOr<std::string> BitcodeReader::parseTriple() {
 
       // Ignore other sub-blocks.
       if (Stream.SkipBlock())
-        return Error(BitcodeError::MalformedBlock);
+        return Error("Malformed block");
       continue;
 
     case BitstreamEntry::Record:
@@ -2270,7 +2801,7 @@ ErrorOr<std::string> BitcodeReader::parseTriple() {
 /// ParseMetadataAttachment - Parse metadata attachments.
 std::error_code BitcodeReader::ParseMetadataAttachment() {
   if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID))
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
 
   SmallVector<uint64_t, 64> Record;
   while (1) {
@@ -2279,7 +2810,7 @@ std::error_code BitcodeReader::ParseMetadataAttachment() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       return std::error_code();
     case BitstreamEntry::Record:
@@ -2295,15 +2826,19 @@ std::error_code BitcodeReader::ParseMetadataAttachment() {
     case bitc::METADATA_ATTACHMENT: {
       unsigned RecordLength = Record.size();
       if (Record.empty() || (RecordLength - 1) % 2 == 1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Instruction *Inst = InstructionList[Record[0]];
       for (unsigned i = 1; i != RecordLength; i = i+2) {
         unsigned Kind = Record[i];
         DenseMap<unsigned, unsigned>::iterator I =
           MDKindMap.find(Kind);
         if (I == MDKindMap.end())
-          return Error(BitcodeError::InvalidID);
-        Value *Node = MDValueList.getValueFwdRef(Record[i+1]);
+          return Error("Invalid ID");
+        Metadata *Node = MDValueList.getValueFwdRef(Record[i + 1]);
+        if (isa<LocalAsMetadata>(Node))
+          // Drop the attachment.  This used to be legal, but there's no
+          // upgrade path.
+          break;
         Inst->setMetadata(I->second, cast<MDNode>(Node));
         if (I->second == LLVMContext::MD_tbaa)
           InstsWithTBAATag.push_back(Inst);
@@ -2317,7 +2852,7 @@ std::error_code BitcodeReader::ParseMetadataAttachment() {
 /// ParseFunctionBody - Lazily parse the specified function body block.
 std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
   if (Stream.EnterSubBlock(bitc::FUNCTION_BLOCK_ID))
-    return Error(BitcodeError::InvalidRecord);
+    return Error("Invalid record");
 
   InstructionList.clear();
   unsigned ModuleValueListSize = ValueList.size();
@@ -2332,6 +2867,14 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
   unsigned CurBBNo = 0;
 
   DebugLoc LastLoc;
+  auto getLastInstruction = [&]() -> Instruction * {
+    if (CurBB && !CurBB->empty())
+      return &CurBB->back();
+    else if (CurBBNo && FunctionBBs[CurBBNo - 1] &&
+             !FunctionBBs[CurBBNo - 1]->empty())
+      return &FunctionBBs[CurBBNo - 1]->back();
+    return nullptr;
+  };
 
   // Read all the records.
   SmallVector<uint64_t, 64> Record;
@@ -2340,7 +2883,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      return Error(BitcodeError::MalformedBlock);
+      return Error("Malformed block");
     case BitstreamEntry::EndBlock:
       goto OutOfRecordLoop;
 
@@ -2348,7 +2891,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       switch (Entry.ID) {
       default:  // Skip unknown content.
         if (Stream.SkipBlock())
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         break;
       case bitc::CONSTANTS_BLOCK_ID:
         if (std::error_code EC = ParseConstants())
@@ -2385,10 +2928,10 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
     unsigned BitCode = Stream.readRecord(Entry.ID, Record);
     switch (BitCode) {
     default: // Default behavior: reject
-      return Error(BitcodeError::InvalidValue);
+      return Error("Invalid value");
     case bitc::FUNC_CODE_DECLAREBLOCKS: {   // DECLAREBLOCKS: [nblocks]
       if (Record.size() < 1 || Record[0] == 0)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       // Create all the basic blocks for the function.
       FunctionBBs.resize(Record[0]);
 
@@ -2401,7 +2944,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
         auto &BBRefs = BBFRI->second;
         // Check for invalid basic block references.
         if (BBRefs.size() > FunctionBBs.size())
-          return Error(BitcodeError::InvalidID);
+          return Error("Invalid ID");
         assert(!BBRefs.empty() && "Unexpected empty array");
         assert(!BBRefs.front() && "Invalid reference to entry block");
         for (unsigned I = 0, E = FunctionBBs.size(), RE = BBRefs.size(); I != E;
@@ -2424,30 +2967,18 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
     case bitc::FUNC_CODE_DEBUG_LOC_AGAIN:  // DEBUG_LOC_AGAIN
       // This record indicates that the last instruction is at the same
       // location as the previous instruction with a location.
-      I = nullptr;
-
-      // Get the last instruction emitted.
-      if (CurBB && !CurBB->empty())
-        I = &CurBB->back();
-      else if (CurBBNo && FunctionBBs[CurBBNo-1] &&
-               !FunctionBBs[CurBBNo-1]->empty())
-        I = &FunctionBBs[CurBBNo-1]->back();
+      I = getLastInstruction();
 
       if (!I)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       I->setDebugLoc(LastLoc);
       I = nullptr;
       continue;
 
     case bitc::FUNC_CODE_DEBUG_LOC: {      // DEBUG_LOC: [line, col, scope, ia]
-      I = nullptr;     // Get the last instruction emitted.
-      if (CurBB && !CurBB->empty())
-        I = &CurBB->back();
-      else if (CurBBNo && FunctionBBs[CurBBNo-1] &&
-               !FunctionBBs[CurBBNo-1]->empty())
-        I = &FunctionBBs[CurBBNo-1]->back();
+      I = getLastInstruction();
       if (!I || Record.size() < 4)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       unsigned Line = Record[0], Col = Record[1];
       unsigned ScopeID = Record[2], IAID = Record[3];
@@ -2467,11 +2998,11 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
           popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
           OpNum+1 > Record.size())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       int Opc = GetDecodedBinaryOpcode(Record[OpNum++], LHS->getType());
       if (Opc == -1)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
       InstructionList.push_back(I);
       if (OpNum < Record.size()) {
@@ -2513,12 +3044,12 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
           OpNum+2 != Record.size())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       Type *ResTy = getTypeByID(Record[OpNum]);
       int Opc = GetDecodedCastOpcode(Record[OpNum+1]);
       if (Opc == -1 || !ResTy)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Instruction *Temp = nullptr;
       if ((I = UpgradeBitCastInst(Opc, Op, ResTy, Temp))) {
         if (Temp) {
@@ -2531,24 +3062,38 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       InstructionList.push_back(I);
       break;
     }
-    case bitc::FUNC_CODE_INST_INBOUNDS_GEP:
-    case bitc::FUNC_CODE_INST_GEP: { // GEP: [n x operands]
+    case bitc::FUNC_CODE_INST_INBOUNDS_GEP_OLD:
+    case bitc::FUNC_CODE_INST_GEP_OLD:
+    case bitc::FUNC_CODE_INST_GEP: { // GEP: type, [n x operands]
       unsigned OpNum = 0;
+
+      Type *Ty;
+      bool InBounds;
+
+      if (BitCode == bitc::FUNC_CODE_INST_GEP) {
+        InBounds = Record[OpNum++];
+        Ty = getTypeByID(Record[OpNum++]);
+      } else {
+        InBounds = BitCode == bitc::FUNC_CODE_INST_INBOUNDS_GEP_OLD;
+        Ty = nullptr;
+      }
+
       Value *BasePtr;
       if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       SmallVector<Value*, 16> GEPIdx;
       while (OpNum != Record.size()) {
         Value *Op;
         if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         GEPIdx.push_back(Op);
       }
 
       I = GetElementPtrInst::Create(BasePtr, GEPIdx);
+      assert(!Ty || Ty == cast<GetElementPtrInst>(I)->getSourceElementType());
       InstructionList.push_back(I);
-      if (BitCode == bitc::FUNC_CODE_INST_INBOUNDS_GEP)
+      if (InBounds)
         cast<GetElementPtrInst>(I)->setIsInBounds(true);
       break;
     }
@@ -2558,15 +3103,30 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 0;
       Value *Agg;
       if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       SmallVector<unsigned, 4> EXTRACTVALIdx;
+      Type *CurTy = Agg->getType();
       for (unsigned RecSize = Record.size();
            OpNum != RecSize; ++OpNum) {
+        bool IsArray = CurTy->isArrayTy();
+        bool IsStruct = CurTy->isStructTy();
         uint64_t Index = Record[OpNum];
+
+        if (!IsStruct && !IsArray)
+          return Error("EXTRACTVAL: Invalid type");
         if ((unsigned)Index != Index)
-          return Error(BitcodeError::InvalidValue);
+          return Error("Invalid value");
+        if (IsStruct && Index >= CurTy->subtypes().size())
+          return Error("EXTRACTVAL: Invalid struct index");
+        if (IsArray && Index >= CurTy->getArrayNumElements())
+          return Error("EXTRACTVAL: Invalid array index");
         EXTRACTVALIdx.push_back((unsigned)Index);
+
+        if (IsStruct)
+          CurTy = CurTy->subtypes()[Index];
+        else
+          CurTy = CurTy->subtypes()[0];
       }
 
       I = ExtractValueInst::Create(Agg, EXTRACTVALIdx);
@@ -2579,18 +3139,35 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 0;
       Value *Agg;
       if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Value *Val;
       if (getValueTypePair(Record, OpNum, NextValueNo, Val))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       SmallVector<unsigned, 4> INSERTVALIdx;
+      Type *CurTy = Agg->getType();
       for (unsigned RecSize = Record.size();
            OpNum != RecSize; ++OpNum) {
+        bool IsArray = CurTy->isArrayTy();
+        bool IsStruct = CurTy->isStructTy();
         uint64_t Index = Record[OpNum];
+
+        if (!IsStruct && !IsArray)
+          return Error("INSERTVAL: Invalid type");
+        if (!CurTy->isStructTy() && !CurTy->isArrayTy())
+          return Error("Invalid type");
         if ((unsigned)Index != Index)
-          return Error(BitcodeError::InvalidValue);
+          return Error("Invalid value");
+        if (IsStruct && Index >= CurTy->subtypes().size())
+          return Error("INSERTVAL: Invalid struct index");
+        if (IsArray && Index >= CurTy->getArrayNumElements())
+          return Error("INSERTVAL: Invalid array index");
+
         INSERTVALIdx.push_back((unsigned)Index);
+        if (IsStruct)
+          CurTy = CurTy->subtypes()[Index];
+        else
+          CurTy = CurTy->subtypes()[0];
       }
 
       I = InsertValueInst::Create(Agg, Val, INSERTVALIdx);
@@ -2606,7 +3183,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
           popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
           popValue(Record, OpNum, NextValueNo, Type::getInt1Ty(Context), Cond))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       I = SelectInst::Create(Cond, TrueVal, FalseVal);
       InstructionList.push_back(I);
@@ -2621,18 +3198,18 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
           popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
           getValueTypePair(Record, OpNum, NextValueNo, Cond))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       // select condition can be either i1 or [N x i1]
       if (VectorType* vector_type =
           dyn_cast<VectorType>(Cond->getType())) {
         // expect <n x i1>
         if (vector_type->getElementType() != Type::getInt1Ty(Context))
-          return Error(BitcodeError::InvalidTypeForValue);
+          return Error("Invalid type for value");
       } else {
         // expect i1
         if (Cond->getType() != Type::getInt1Ty(Context))
-          return Error(BitcodeError::InvalidTypeForValue);
+          return Error("Invalid type for value");
       }
 
       I = SelectInst::Create(Cond, TrueVal, FalseVal);
@@ -2645,7 +3222,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Vec, *Idx;
       if (getValueTypePair(Record, OpNum, NextValueNo, Vec) ||
           getValueTypePair(Record, OpNum, NextValueNo, Idx))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       I = ExtractElementInst::Create(Vec, Idx);
       InstructionList.push_back(I);
       break;
@@ -2658,7 +3235,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                    cast<VectorType>(Vec->getType())->getElementType(), Elt) ||
           getValueTypePair(Record, OpNum, NextValueNo, Idx))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       I = InsertElementInst::Create(Vec, Elt, Idx);
       InstructionList.push_back(I);
       break;
@@ -2669,10 +3246,10 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Vec1, *Vec2, *Mask;
       if (getValueTypePair(Record, OpNum, NextValueNo, Vec1) ||
           popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec2))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       if (getValueTypePair(Record, OpNum, NextValueNo, Mask))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       I = new ShuffleVectorInst(Vec1, Vec2, Mask);
       InstructionList.push_back(I);
       break;
@@ -2690,7 +3267,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
           popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
           OpNum+1 != Record.size())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       if (LHS->getType()->isFPOrFPVectorTy())
         I = new FCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS);
@@ -2712,9 +3289,9 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
         unsigned OpNum = 0;
         Value *Op = nullptr;
         if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         if (OpNum != Record.size())
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
 
         I = ReturnInst::Create(Context, Op);
         InstructionList.push_back(I);
@@ -2722,10 +3299,10 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       }
     case bitc::FUNC_CODE_INST_BR: { // BR: [bb#, bb#, opval] or [bb#]
       if (Record.size() != 1 && Record.size() != 3)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       BasicBlock *TrueDest = getBasicBlock(Record[0]);
       if (!TrueDest)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       if (Record.size() == 1) {
         I = BranchInst::Create(TrueDest);
@@ -2736,7 +3313,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
         Value *Cond = getValue(Record, 2, NextValueNo,
                                Type::getInt1Ty(Context));
         if (!FalseDest || !Cond)
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         I = BranchInst::Create(TrueDest, FalseDest, Cond);
         InstructionList.push_back(I);
       }
@@ -2756,7 +3333,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
         Value *Cond = getValue(Record, 2, NextValueNo, OpTy);
         BasicBlock *Default = getBasicBlock(Record[3]);
         if (!OpTy || !Cond || !Default)
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
 
         unsigned NumCases = Record[4];
 
@@ -2808,12 +3385,12 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       // Old SwitchInst format without case ranges.
 
       if (Record.size() < 3 || (Record.size() & 1) == 0)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Type *OpTy = getTypeByID(Record[0]);
       Value *Cond = getValue(Record, 1, NextValueNo, OpTy);
       BasicBlock *Default = getBasicBlock(Record[2]);
       if (!OpTy || !Cond || !Default)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       unsigned NumCases = (Record.size()-3)/2;
       SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases);
       InstructionList.push_back(SI);
@@ -2823,7 +3400,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
         BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]);
         if (!CaseVal || !DestBB) {
           delete SI;
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         }
         SI->addCase(CaseVal, DestBB);
       }
@@ -2832,11 +3409,11 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
     }
     case bitc::FUNC_CODE_INST_INDIRECTBR: { // INDIRECTBR: [opty, op0, op1, ...]
       if (Record.size() < 2)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Type *OpTy = getTypeByID(Record[0]);
       Value *Address = getValue(Record, 1, NextValueNo, OpTy);
       if (!OpTy || !Address)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       unsigned NumDests = Record.size()-2;
       IndirectBrInst *IBI = IndirectBrInst::Create(Address, NumDests);
       InstructionList.push_back(IBI);
@@ -2845,7 +3422,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
           IBI->addDestination(DestBB);
         } else {
           delete IBI;
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         }
       }
       I = IBI;
@@ -2855,7 +3432,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
     case bitc::FUNC_CODE_INST_INVOKE: {
       // INVOKE: [attrs, cc, normBB, unwindBB, fnty, op0,op1,op2, ...]
       if (Record.size() < 4)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       AttributeSet PAL = getAttributes(Record[0]);
       unsigned CCInfo = Record[1];
       BasicBlock *NormalBB = getBasicBlock(Record[2]);
@@ -2864,7 +3441,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 4;
       Value *Callee;
       if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       PointerType *CalleeTy = dyn_cast<PointerType>(Callee->getType());
       FunctionType *FTy = !CalleeTy ? nullptr :
@@ -2873,25 +3450,25 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       // Check that the right number of fixed parameters are here.
       if (!FTy || !NormalBB || !UnwindBB ||
           Record.size() < OpNum+FTy->getNumParams())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       SmallVector<Value*, 16> Ops;
       for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
         Ops.push_back(getValue(Record, OpNum, NextValueNo,
                                FTy->getParamType(i)));
         if (!Ops.back())
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
       }
 
       if (!FTy->isVarArg()) {
         if (Record.size() != OpNum)
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
       } else {
         // Read type/value pairs for varargs params.
         while (OpNum != Record.size()) {
           Value *Op;
           if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-            return Error(BitcodeError::InvalidRecord);
+            return Error("Invalid record");
           Ops.push_back(Op);
         }
       }
@@ -2907,7 +3484,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned Idx = 0;
       Value *Val = nullptr;
       if (getValueTypePair(Record, Idx, NextValueNo, Val))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       I = ResumeInst::Create(Val);
       InstructionList.push_back(I);
       break;
@@ -2918,10 +3495,10 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       break;
     case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...]
       if (Record.size() < 1 || ((Record.size()-1)&1))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Type *Ty = getTypeByID(Record[0]);
       if (!Ty)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2);
       InstructionList.push_back(PN);
@@ -2937,7 +3514,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
           V = getValue(Record, 1+i, NextValueNo, Ty);
         BasicBlock *BB = getBasicBlock(Record[2+i]);
         if (!V || !BB)
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         PN->addIncoming(V, BB);
       }
       I = PN;
@@ -2948,13 +3525,13 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       // LANDINGPAD: [ty, val, val, num, (id0,val0 ...)?]
       unsigned Idx = 0;
       if (Record.size() < 4)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Type *Ty = getTypeByID(Record[Idx++]);
       if (!Ty)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Value *PersFn = nullptr;
       if (getValueTypePair(Record, Idx, NextValueNo, PersFn))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       bool IsCleanup = !!Record[Idx++];
       unsigned NumClauses = Record[Idx++];
@@ -2967,7 +3544,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
 
         if (getValueTypePair(Record, Idx, NextValueNo, Val)) {
           delete LP;
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
         }
 
         assert((CT != LandingPadInst::Catch ||
@@ -2986,17 +3563,22 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
 
     case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align]
       if (Record.size() != 4)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       PointerType *Ty =
         dyn_cast_or_null<PointerType>(getTypeByID(Record[0]));
       Type *OpTy = getTypeByID(Record[1]);
       Value *Size = getFnValueByID(Record[2], OpTy);
-      unsigned AlignRecord = Record[3];
-      bool InAlloca = AlignRecord & (1 << 5);
-      unsigned Align = AlignRecord & ((1 << 5) - 1);
+      uint64_t AlignRecord = Record[3];
+      const uint64_t InAllocaMask = uint64_t(1) << 5;
+      bool InAlloca = AlignRecord & InAllocaMask;
+      unsigned Align;
+      if (std::error_code EC =
+          parseAlignmentValue(AlignRecord & ~InAllocaMask, Align)) {
+        return EC;
+      }
       if (!Ty || !Size)
-        return Error(BitcodeError::InvalidRecord);
-      AllocaInst *AI = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1);
+        return Error("Invalid record");
+      AllocaInst *AI = new AllocaInst(Ty->getElementType(), Size, Align);
       AI->setUsedWithInAlloca(InAlloca);
       I = AI;
       InstructionList.push_back(I);
@@ -3006,10 +3588,21 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 0;
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
-          OpNum+2 != Record.size())
-        return Error(BitcodeError::InvalidRecord);
+          (OpNum + 2 != Record.size() && OpNum + 3 != Record.size()))
+        return Error("Invalid record");
+
+      Type *Ty = nullptr;
+      if (OpNum + 3 == Record.size())
+        Ty = getTypeByID(Record[OpNum++]);
+
+      unsigned Align;
+      if (std::error_code EC = parseAlignmentValue(Record[OpNum], Align))
+        return EC;
+      I = new LoadInst(Op, "", Record[OpNum+1], Align);
+
+      assert((!Ty || Ty == I->getType()) &&
+             "Explicit type doesn't match pointee type of the first operand");
 
-      I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1);
       InstructionList.push_back(I);
       break;
     }
@@ -3018,19 +3611,29 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 0;
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
-          OpNum+4 != Record.size())
-        return Error(BitcodeError::InvalidRecord);
+          (OpNum + 4 != Record.size() && OpNum + 5 != Record.size()))
+        return Error("Invalid record");
+
+      Type *Ty = nullptr;
+      if (OpNum + 5 == Record.size())
+        Ty = getTypeByID(Record[OpNum++]);
 
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Release ||
           Ordering == AcquireRelease)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       if (Ordering != NotAtomic && Record[OpNum] == 0)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
 
-      I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1,
-                       Ordering, SynchScope);
+      unsigned Align;
+      if (std::error_code EC = parseAlignmentValue(Record[OpNum], Align))
+        return EC;
+      I = new LoadInst(Op, "", Record[OpNum+1], Align, Ordering, SynchScope);
+
+      assert((!Ty || Ty == I->getType()) &&
+             "Explicit type doesn't match pointee type of the first operand");
+
       InstructionList.push_back(I);
       break;
     }
@@ -3041,9 +3644,11 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+2 != Record.size())
-        return Error(BitcodeError::InvalidRecord);
-
-      I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1);
+        return Error("Invalid record");
+      unsigned Align;
+      if (std::error_code EC = parseAlignmentValue(Record[OpNum], Align))
+        return EC;
+      I = new StoreInst(Val, Ptr, Record[OpNum+1], Align);
       InstructionList.push_back(I);
       break;
     }
@@ -3055,18 +3660,20 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+4 != Record.size())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Acquire ||
           Ordering == AcquireRelease)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
       if (Ordering != NotAtomic && Record[OpNum] == 0)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
-      I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1,
-                        Ordering, SynchScope);
+      unsigned Align;
+      if (std::error_code EC = parseAlignmentValue(Record[OpNum], Align))
+        return EC;
+      I = new StoreInst(Val, Ptr, Record[OpNum+1], Align, Ordering, SynchScope);
       InstructionList.push_back(I);
       break;
     }
@@ -3081,10 +3688,10 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), New) ||
           (Record.size() < OpNum + 3 || Record.size() > OpNum + 5))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       AtomicOrdering SuccessOrdering = GetDecodedOrdering(Record[OpNum+1]);
       if (SuccessOrdering == NotAtomic || SuccessOrdering == Unordered)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+2]);
 
       AtomicOrdering FailureOrdering;
@@ -3119,14 +3726,14 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+4 != Record.size())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       AtomicRMWInst::BinOp Operation = GetDecodedRMWOperation(Record[OpNum]);
       if (Operation < AtomicRMWInst::FIRST_BINOP ||
           Operation > AtomicRMWInst::LAST_BINOP)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Unordered)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
       I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SynchScope);
       cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]);
@@ -3135,11 +3742,11 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
     }
     case bitc::FUNC_CODE_INST_FENCE: { // FENCE:[ordering, synchscope]
       if (2 != Record.size())
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       AtomicOrdering Ordering = GetDecodedOrdering(Record[0]);
       if (Ordering == NotAtomic || Ordering == Unordered ||
           Ordering == Monotonic)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[1]);
       I = new FenceInst(Context, Ordering, SynchScope);
       InstructionList.push_back(I);
@@ -3148,7 +3755,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
     case bitc::FUNC_CODE_INST_CALL: {
       // CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...]
       if (Record.size() < 3)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       AttributeSet PAL = getAttributes(Record[0]);
       unsigned CCInfo = Record[1];
@@ -3156,13 +3763,13 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 2;
       Value *Callee;
       if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       PointerType *OpTy = dyn_cast<PointerType>(Callee->getType());
       FunctionType *FTy = nullptr;
       if (OpTy) FTy = dyn_cast<FunctionType>(OpTy->getElementType());
       if (!FTy || Record.size() < FTy->getNumParams()+OpNum)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
 
       SmallVector<Value*, 16> Args;
       // Read the fixed params.
@@ -3173,18 +3780,18 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
           Args.push_back(getValue(Record, OpNum, NextValueNo,
                                   FTy->getParamType(i)));
         if (!Args.back())
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
       }
 
       // Read type/value pairs for varargs params.
       if (!FTy->isVarArg()) {
         if (OpNum != Record.size())
-          return Error(BitcodeError::InvalidRecord);
+          return Error("Invalid record");
       } else {
         while (OpNum != Record.size()) {
           Value *Op;
           if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-            return Error(BitcodeError::InvalidRecord);
+            return Error("Invalid record");
           Args.push_back(Op);
         }
       }
@@ -3204,12 +3811,12 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
     }
     case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty]
       if (Record.size() < 3)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       Type *OpTy = getTypeByID(Record[0]);
       Value *Op = getValue(Record, 1, NextValueNo, OpTy);
       Type *ResTy = getTypeByID(Record[2]);
       if (!OpTy || !Op || !ResTy)
-        return Error(BitcodeError::InvalidRecord);
+        return Error("Invalid record");
       I = new VAArgInst(Op, ResTy);
       InstructionList.push_back(I);
       break;
@@ -3220,7 +3827,7 @@ std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
     // this file.
     if (!CurBB) {
       delete I;
-      return Error(BitcodeError::InvalidInstructionWithNoBB);
+      return Error("Invalid instruction with no BB");
     }
     CurBB->getInstList().push_back(I);
 
@@ -3247,7 +3854,7 @@ OutOfRecordLoop:
           delete A;
         }
       }
-      return Error(BitcodeError::NeverResolvedValueFoundInFunction);
+      return Error("Never resolved value found in function");
     }
   }
 
@@ -3267,7 +3874,7 @@ std::error_code BitcodeReader::FindFunctionInStream(
     DenseMap<Function *, uint64_t>::iterator DeferredFunctionInfoIterator) {
   while (DeferredFunctionInfoIterator->second == 0) {
     if (Stream.AtEndOfStream())
-      return Error(BitcodeError::CouldNotFindFunctionInStream);
+      return Error("Could not find function in stream");
     // ParseModule will parse the next body in the stream and set its
     // position in the DeferredFunctionInfo map.
     if (std::error_code EC = ParseModule(true))
@@ -3369,7 +3976,7 @@ std::error_code BitcodeReader::MaterializeModule(Module *M) {
   // Check that all block address forward references got resolved (as we
   // promised above).
   if (!BasicBlockFwdRefs.empty())
-    return Error(BitcodeError::NeverResolvedFunctionFromBlockAddress);
+    return Error("Never resolved function from blockaddress");
 
   // Upgrade any intrinsic calls that slipped through (should not happen!) and
   // delete the old functions to clean up. We can't do this unless the entire
@@ -3397,6 +4004,10 @@ std::error_code BitcodeReader::MaterializeModule(Module *M) {
   return std::error_code();
 }
 
+std::vector<StructType *> BitcodeReader::getIdentifiedStructTypes() const {
+  return IdentifiedStructTypes;
+}
+
 std::error_code BitcodeReader::InitStream() {
   if (LazyStreamer)
     return InitLazyStream();
@@ -3408,13 +4019,13 @@ std::error_code BitcodeReader::InitStreamFromBuffer() {
   const unsigned char *BufEnd = BufPtr+Buffer->getBufferSize();
 
   if (Buffer->getBufferSize() & 3)
-    return Error(BitcodeError::InvalidBitcodeSignature);
+    return Error("Invalid bitcode signature");
 
   // If we have a wrapper header, parse it and ignore the non-bc file contents.
   // The magic number is 0x0B17C0DE stored in little endian.
   if (isBitcodeWrapper(BufPtr, BufEnd))
     if (SkipBitcodeWrapperHeader(BufPtr, BufEnd, true))
-      return Error(BitcodeError::InvalidBitcodeWrapperHeader);
+      return Error("Invalid bitcode wrapper header");
 
   StreamFile.reset(new BitstreamReader(BufPtr, BufEnd));
   Stream.init(&*StreamFile);
@@ -3425,23 +4036,24 @@ std::error_code BitcodeReader::InitStreamFromBuffer() {
 std::error_code BitcodeReader::InitLazyStream() {
   // Check and strip off the bitcode wrapper; BitstreamReader expects never to
   // see it.
-  StreamingMemoryObject *Bytes = new StreamingMemoryObject(LazyStreamer);
-  StreamFile.reset(new BitstreamReader(Bytes));
+  auto OwnedBytes = llvm::make_unique<StreamingMemoryObject>(LazyStreamer);
+  StreamingMemoryObject &Bytes = *OwnedBytes;
+  StreamFile = llvm::make_unique<BitstreamReader>(std::move(OwnedBytes));
   Stream.init(&*StreamFile);
 
   unsigned char buf[16];
-  if (Bytes->readBytes(buf, 16, 0) != 16)
-    return Error(BitcodeError::InvalidBitcodeSignature);
+  if (Bytes.readBytes(buf, 16, 0) != 16)
+    return Error("Invalid bitcode signature");
 
   if (!isBitcode(buf, buf + 16))
-    return Error(BitcodeError::InvalidBitcodeSignature);
+    return Error("Invalid bitcode signature");
 
   if (isBitcodeWrapper(buf, buf + 4)) {
     const unsigned char *bitcodeStart = buf;
     const unsigned char *bitcodeEnd = buf + 16;
     SkipBitcodeWrapperHeader(bitcodeStart, bitcodeEnd, false);
-    Bytes->dropLeadingBytes(bitcodeStart - buf);
-    Bytes->setKnownObjectSize(bitcodeEnd - bitcodeStart);
+    Bytes.dropLeadingBytes(bitcodeStart - buf);
+    Bytes.setKnownObjectSize(bitcodeEnd - bitcodeStart);
   }
   return std::error_code();
 }
@@ -3454,44 +4066,10 @@ class BitcodeErrorCategoryType : public std::error_category {
   std::string message(int IE) const override {
     BitcodeError E = static_cast<BitcodeError>(IE);
     switch (E) {
-    case BitcodeError::ConflictingMETADATA_KINDRecords:
-      return "Conflicting METADATA_KIND records";
-    case BitcodeError::CouldNotFindFunctionInStream:
-      return "Could not find function in stream";
-    case BitcodeError::ExpectedConstant:
-      return "Expected a constant";
-    case BitcodeError::InsufficientFunctionProtos:
-      return "Insufficient function protos";
     case BitcodeError::InvalidBitcodeSignature:
       return "Invalid bitcode signature";
-    case BitcodeError::InvalidBitcodeWrapperHeader:
-      return "Invalid bitcode wrapper header";
-    case BitcodeError::InvalidConstantReference:
-      return "Invalid ronstant reference";
-    case BitcodeError::InvalidID:
-      return "Invalid ID";
-    case BitcodeError::InvalidInstructionWithNoBB:
-      return "Invalid instruction with no BB";
-    case BitcodeError::InvalidRecord:
-      return "Invalid record";
-    case BitcodeError::InvalidTypeForValue:
-      return "Invalid type for value";
-    case BitcodeError::InvalidTYPETable:
-      return "Invalid TYPE table";
-    case BitcodeError::InvalidType:
-      return "Invalid type";
-    case BitcodeError::MalformedBlock:
-      return "Malformed block";
-    case BitcodeError::MalformedGlobalInitializerSet:
-      return "Malformed global initializer set";
-    case BitcodeError::InvalidMultipleBlocks:
-      return "Invalid multiple blocks";
-    case BitcodeError::NeverResolvedValueFoundInFunction:
-      return "Never resolved value found in function";
-    case BitcodeError::NeverResolvedFunctionFromBlockAddress:
-      return "Never resolved function from blockaddress";
-    case BitcodeError::InvalidValue:
-      return "Invalid value";
+    case BitcodeError::CorruptedBitcode:
+      return "Corrupted bitcode";
     }
     llvm_unreachable("Unknown error type!");
   }
@@ -3518,9 +4096,11 @@ const std::error_category &llvm::BitcodeErrorCategory() {
 /// materialize everything -- in particular, if this isn't truly lazy.
 static ErrorOr<Module *>
 getLazyBitcodeModuleImpl(std::unique_ptr<MemoryBuffer> &&Buffer,
-                         LLVMContext &Context, bool WillMaterializeAll) {
+                         LLVMContext &Context, bool WillMaterializeAll,
+                         DiagnosticHandlerFunction DiagnosticHandler) {
   Module *M = new Module(Buffer->getBufferIdentifier(), Context);
-  BitcodeReader *R = new BitcodeReader(Buffer.get(), Context);
+  BitcodeReader *R =
+      new BitcodeReader(Buffer.get(), Context, DiagnosticHandler);
   M->setMaterializer(R);
 
   auto cleanupOnError = [&](std::error_code EC) {
@@ -3543,31 +4123,30 @@ getLazyBitcodeModuleImpl(std::unique_ptr<MemoryBuffer> &&Buffer,
 
 ErrorOr<Module *>
 llvm::getLazyBitcodeModule(std::unique_ptr<MemoryBuffer> &&Buffer,
-                           LLVMContext &Context) {
-  return getLazyBitcodeModuleImpl(std::move(Buffer), Context, false);
+                           LLVMContext &Context,
+                           DiagnosticHandlerFunction DiagnosticHandler) {
+  return getLazyBitcodeModuleImpl(std::move(Buffer), Context, false,
+                                  DiagnosticHandler);
 }
 
-Module *llvm::getStreamedBitcodeModule(const std::string &name,
-                                       DataStreamer *streamer,
-                                       LLVMContext &Context,
-                                       std::string *ErrMsg) {
-  Module *M = new Module(name, Context);
-  BitcodeReader *R = new BitcodeReader(streamer, Context);
+ErrorOr<std::unique_ptr<Module>>
+llvm::getStreamedBitcodeModule(StringRef Name, DataStreamer *Streamer,
+                               LLVMContext &Context,
+                               DiagnosticHandlerFunction DiagnosticHandler) {
+  std::unique_ptr<Module> M = make_unique<Module>(Name, Context);
+  BitcodeReader *R = new BitcodeReader(Streamer, Context, DiagnosticHandler);
   M->setMaterializer(R);
-  if (std::error_code EC = R->ParseBitcodeInto(M)) {
-    if (ErrMsg)
-      *ErrMsg = EC.message();
-    delete M;  // Also deletes R.
-    return nullptr;
-  }
-  return M;
+  if (std::error_code EC = R->ParseBitcodeInto(M.get()))
+    return EC;
+  return std::move(M);
 }
 
-ErrorOr<Module *> llvm::parseBitcodeFile(MemoryBufferRef Buffer,
-                                         LLVMContext &Context) {
+ErrorOr<Module *>
+llvm::parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context,
+                       DiagnosticHandlerFunction DiagnosticHandler) {
   std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
-  ErrorOr<Module *> ModuleOrErr =
-      getLazyBitcodeModuleImpl(std::move(Buf), Context, true);
+  ErrorOr<Module *> ModuleOrErr = getLazyBitcodeModuleImpl(
+      std::move(Buf), Context, true, DiagnosticHandler);
   if (!ModuleOrErr)
     return ModuleOrErr;
   Module *M = ModuleOrErr.get();
@@ -3583,10 +4162,12 @@ ErrorOr<Module *> llvm::parseBitcodeFile(MemoryBufferRef Buffer,
   return M;
 }
 
-std::string llvm::getBitcodeTargetTriple(MemoryBufferRef Buffer,
-                                         LLVMContext &Context) {
+std::string
+llvm::getBitcodeTargetTriple(MemoryBufferRef Buffer, LLVMContext &Context,
+                             DiagnosticHandlerFunction DiagnosticHandler) {
   std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
-  auto R = llvm::make_unique<BitcodeReader>(Buf.release(), Context);
+  auto R = llvm::make_unique<BitcodeReader>(Buf.release(), Context,
+                                            DiagnosticHandler);
   ErrorOr<std::string> Triple = R->parseTriple();
   if (Triple.getError())
     return "";
diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h
index 047fef8..9803e78 100644
--- a/lib/Bitcode/Reader/BitcodeReader.h
+++ b/lib/Bitcode/Reader/BitcodeReader.h
@@ -19,7 +19,9 @@
 #include "llvm/Bitcode/LLVMBitCodes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/GVMaterializer.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/TrackingMDRef.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/ValueHandle.h"
 #include <deque>
@@ -95,22 +97,27 @@ public:
 //===----------------------------------------------------------------------===//
 
 class BitcodeReaderMDValueList {
-  std::vector<WeakVH> MDValuePtrs;
+  unsigned NumFwdRefs;
+  bool AnyFwdRefs;
+  unsigned MinFwdRef;
+  unsigned MaxFwdRef;
+  std::vector<TrackingMDRef> MDValuePtrs;
 
   LLVMContext &Context;
 public:
-  BitcodeReaderMDValueList(LLVMContext& C) : Context(C) {}
+  BitcodeReaderMDValueList(LLVMContext &C)
+      : NumFwdRefs(0), AnyFwdRefs(false), Context(C) {}
 
   // vector compatibility methods
   unsigned size() const       { return MDValuePtrs.size(); }
   void resize(unsigned N)     { MDValuePtrs.resize(N); }
-  void push_back(Value *V)    { MDValuePtrs.push_back(V);  }
+  void push_back(Metadata *MD) { MDValuePtrs.emplace_back(MD); }
   void clear()                { MDValuePtrs.clear();  }
-  Value *back() const         { return MDValuePtrs.back(); }
+  Metadata *back() const      { return MDValuePtrs.back(); }
   void pop_back()             { MDValuePtrs.pop_back(); }
   bool empty() const          { return MDValuePtrs.empty(); }
 
-  Value *operator[](unsigned i) const {
+  Metadata *operator[](unsigned i) const {
     assert(i < MDValuePtrs.size());
     return MDValuePtrs[i];
   }
@@ -120,12 +127,14 @@ public:
     MDValuePtrs.resize(N);
   }
 
-  Value *getValueFwdRef(unsigned Idx);
-  void AssignValue(Value *V, unsigned Idx);
+  Metadata *getValueFwdRef(unsigned Idx);
+  void AssignValue(Metadata *MD, unsigned Idx);
+  void tryToResolveCycles();
 };
 
 class BitcodeReader : public GVMaterializer {
   LLVMContext &Context;
+  DiagnosticHandlerFunction DiagnosticHandler;
   Module *TheModule;
   std::unique_ptr<MemoryBuffer> Buffer;
   std::unique_ptr<BitstreamReader> StreamFile;
@@ -143,6 +152,7 @@ class BitcodeReader : public GVMaterializer {
   std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInits;
   std::vector<std::pair<GlobalAlias*, unsigned> > AliasInits;
   std::vector<std::pair<Function*, unsigned> > FunctionPrefixes;
+  std::vector<std::pair<Function*, unsigned> > FunctionPrologues;
 
   SmallVector<Instruction*, 64> InstsWithTBAATag;
 
@@ -203,18 +213,14 @@ class BitcodeReader : public GVMaterializer {
   SmallPtrSet<const Function *, 4> BlockAddressesTaken;
 
 public:
-  std::error_code Error(BitcodeError E) { return make_error_code(E); }
-
-  explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C)
-      : Context(C), TheModule(nullptr), Buffer(buffer), LazyStreamer(nullptr),
-        NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C),
-        MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false),
-        WillMaterializeAllForwardRefs(false) {}
-  explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C)
-      : Context(C), TheModule(nullptr), Buffer(nullptr), LazyStreamer(streamer),
-        NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C),
-        MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false),
-        WillMaterializeAllForwardRefs(false) {}
+  std::error_code Error(BitcodeError E, const Twine &Message);
+  std::error_code Error(BitcodeError E);
+  std::error_code Error(const Twine &Message);
+
+  explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C,
+                         DiagnosticHandlerFunction DiagnosticHandler);
+  explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C,
+                         DiagnosticHandlerFunction DiagnosticHandler);
   ~BitcodeReader() { FreeState(); }
 
   std::error_code materializeForwardReferencedFunctions();
@@ -226,6 +232,7 @@ public:
   bool isDematerializable(const GlobalValue *GV) const override;
   std::error_code materialize(GlobalValue *GV) override;
   std::error_code MaterializeModule(Module *M) override;
+  std::vector<StructType *> getIdentifiedStructTypes() const override;
   void Dematerialize(GlobalValue *GV) override;
 
   /// @brief Main interface to parsing a bitcode buffer.
@@ -239,12 +246,19 @@ public:
   static uint64_t decodeSignRotatedValue(uint64_t V);
 
 private:
+  std::vector<StructType *> IdentifiedStructTypes;
+  StructType *createIdentifiedStructType(LLVMContext &Context, StringRef Name);
+  StructType *createIdentifiedStructType(LLVMContext &Context);
+
   Type *getTypeByID(unsigned ID);
   Value *getFnValueByID(unsigned ID, Type *Ty) {
     if (Ty && Ty->isMetadataTy())
-      return MDValueList.getValueFwdRef(ID);
+      return MetadataAsValue::get(Ty->getContext(), getFnMetadataByID(ID));
     return ValueList.getValueFwdRef(ID, Ty);
   }
+  Metadata *getFnMetadataByID(unsigned ID) {
+    return MDValueList.getValueFwdRef(ID);
+  }
   BasicBlock *getBasicBlock(unsigned ID) const {
     if (ID >= FunctionBBs.size()) return nullptr; // Invalid ID
     return FunctionBBs[ID];
@@ -321,6 +335,10 @@ private:
     return getFnValueByID(ValNo, Ty);
   }
 
+  /// Converts alignment exponent (i.e. power of two (or zero)) to the
+  /// corresponding alignment to use. If alignment is too large, returns
+  /// a corresponding error code.
+  std::error_code parseAlignmentValue(uint64_t Exponent, unsigned &Alignment);
   std::error_code ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind);
   std::error_code ParseModule(bool Resume);
   std::error_code ParseAttributeBlock();
diff --git a/lib/Bitcode/Reader/BitstreamReader.cpp b/lib/Bitcode/Reader/BitstreamReader.cpp
index 5e3232e..ca68257 100644
--- a/lib/Bitcode/Reader/BitstreamReader.cpp
+++ b/lib/Bitcode/Reader/BitstreamReader.cpp
@@ -170,8 +170,12 @@ unsigned BitstreamCursor::readRecord(unsigned AbbrevID,
   unsigned Code;
   if (CodeOp.isLiteral())
     Code = CodeOp.getLiteralValue();
-  else
+  else {
+    if (CodeOp.getEncoding() == BitCodeAbbrevOp::Array ||
+        CodeOp.getEncoding() == BitCodeAbbrevOp::Blob)
+      report_fatal_error("Abbreviation starts with an Array or a Blob");
     Code = readAbbreviatedField(*this, CodeOp);
+  }
 
   for (unsigned i = 1, e = Abbv->getNumOperandInfos(); i != e; ++i) {
     const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i);
@@ -249,7 +253,7 @@ void BitstreamCursor::ReadAbbrevRecord() {
 
     BitCodeAbbrevOp::Encoding E = (BitCodeAbbrevOp::Encoding)Read(3);
     if (BitCodeAbbrevOp::hasEncodingData(E)) {
-      unsigned Data = ReadVBR64(5);
+      uint64_t Data = ReadVBR64(5);
 
       // As a special case, handle fixed(0) (i.e., a fixed field with zero bits)
       // and vbr(0) as a literal zero.  This is decoded the same way, and avoids
diff --git a/lib/Bitcode/Reader/CMakeLists.txt b/lib/Bitcode/Reader/CMakeLists.txt
index f614c9f..62954f2 100644
--- a/lib/Bitcode/Reader/CMakeLists.txt
+++ b/lib/Bitcode/Reader/CMakeLists.txt
@@ -2,6 +2,9 @@ add_llvm_library(LLVMBitReader
   BitReader.cpp
   BitcodeReader.cpp
   BitstreamReader.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Bitcode
   )
 
 add_dependencies(LLVMBitReader intrinsics_gen)
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 6cfc357..ecb6f7c 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Bitcode/BitstreamWriter.h"
 #include "llvm/Bitcode/LLVMBitCodes.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
@@ -55,7 +56,8 @@ enum {
   FUNCTION_INST_CAST_ABBREV,
   FUNCTION_INST_RET_VOID_ABBREV,
   FUNCTION_INST_RET_VAL_ABBREV,
-  FUNCTION_INST_UNREACHABLE_ABBREV
+  FUNCTION_INST_UNREACHABLE_ABBREV,
+  FUNCTION_INST_GEP_ABBREV,
 };
 
 static unsigned GetEncodedCastOpcode(unsigned Opcode) {
@@ -322,7 +324,7 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
   Stream.EnterSubblock(bitc::TYPE_BLOCK_ID_NEW, 4 /*count from # abbrevs */);
   SmallVector<uint64_t, 64> TypeVals;
 
-  uint64_t NumBits = Log2_32_Ceil(VE.getTypes().size()+1);
+  uint64_t NumBits = VE.computeBitsRequiredForTypeIndicies();
 
   // Abbrev for TYPE_CODE_POINTER.
   BitCodeAbbrev *Abbv = new BitCodeAbbrev();
@@ -477,17 +479,28 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
 
 static unsigned getEncodedLinkage(const GlobalValue &GV) {
   switch (GV.getLinkage()) {
-  case GlobalValue::ExternalLinkage:                 return 0;
-  case GlobalValue::WeakAnyLinkage:                  return 1;
-  case GlobalValue::AppendingLinkage:                return 2;
-  case GlobalValue::InternalLinkage:                 return 3;
-  case GlobalValue::LinkOnceAnyLinkage:              return 4;
-  case GlobalValue::ExternalWeakLinkage:             return 7;
-  case GlobalValue::CommonLinkage:                   return 8;
-  case GlobalValue::PrivateLinkage:                  return 9;
-  case GlobalValue::WeakODRLinkage:                  return 10;
-  case GlobalValue::LinkOnceODRLinkage:              return 11;
-  case GlobalValue::AvailableExternallyLinkage:      return 12;
+  case GlobalValue::ExternalLinkage:
+    return 0;
+  case GlobalValue::WeakAnyLinkage:
+    return 16;
+  case GlobalValue::AppendingLinkage:
+    return 2;
+  case GlobalValue::InternalLinkage:
+    return 3;
+  case GlobalValue::LinkOnceAnyLinkage:
+    return 18;
+  case GlobalValue::ExternalWeakLinkage:
+    return 7;
+  case GlobalValue::CommonLinkage:
+    return 8;
+  case GlobalValue::PrivateLinkage:
+    return 9;
+  case GlobalValue::WeakODRLinkage:
+    return 17;
+  case GlobalValue::LinkOnceODRLinkage:
+    return 19;
+  case GlobalValue::AvailableExternallyLinkage:
+    return 12;
   }
   llvm_unreachable("Invalid linkage");
 }
@@ -538,11 +551,13 @@ static unsigned getEncodedComdatSelectionKind(const Comdat &C) {
 }
 
 static void writeComdats(const ValueEnumerator &VE, BitstreamWriter &Stream) {
-  SmallVector<uint8_t, 64> Vals;
+  SmallVector<uint16_t, 64> Vals;
   for (const Comdat *C : VE.getComdats()) {
     // COMDAT: [selection_kind, name]
     Vals.push_back(getEncodedComdatSelectionKind(*C));
-    Vals.push_back(C->getName().size());
+    size_t Size = C->getName().size();
+    assert(isUInt<16>(Size));
+    Vals.push_back(Size);
     for (char Chr : C->getName())
       Vals.push_back((unsigned char)Chr);
     Stream.EmitRecord(bitc::MODULE_CODE_COMDAT, Vals, /*AbbrevToUse=*/0);
@@ -616,7 +631,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
                               Log2_32_Ceil(MaxGlobalType+1)));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));      // Constant.
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));        // Initializer.
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4));      // Linkage.
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5));      // Linkage.
     if (MaxAlignment == 0)                                      // Alignment.
       Abbv->Add(BitCodeAbbrevOp(0));
     else {
@@ -640,7 +655,8 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
 
     // GLOBALVAR: [type, isconst, initid,
     //             linkage, alignment, section, visibility, threadlocal,
-    //             unnamed_addr, externally_initialized, dllstorageclass]
+    //             unnamed_addr, externally_initialized, dllstorageclass,
+    //             comdat]
     Vals.push_back(VE.getTypeID(GV.getType()));
     Vals.push_back(GV.isConstant());
     Vals.push_back(GV.isDeclaration() ? 0 :
@@ -670,7 +686,8 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
   // Emit the function proto information.
   for (const Function &F : *M) {
     // FUNCTION:  [type, callingconv, isproto, linkage, paramattrs, alignment,
-    //             section, visibility, gc, unnamed_addr, prefix]
+    //             section, visibility, gc, unnamed_addr, prologuedata,
+    //             dllstorageclass, comdat, prefixdata]
     Vals.push_back(VE.getTypeID(F.getType()));
     Vals.push_back(F.getCallingConv());
     Vals.push_back(F.isDeclaration());
@@ -681,10 +698,12 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
     Vals.push_back(getEncodedVisibility(F));
     Vals.push_back(F.hasGC() ? GCMap[F.getGC()] : 0);
     Vals.push_back(F.hasUnnamedAddr());
-    Vals.push_back(F.hasPrefixData() ? (VE.getValueID(F.getPrefixData()) + 1)
-                                      : 0);
+    Vals.push_back(F.hasPrologueData() ? (VE.getValueID(F.getPrologueData()) + 1)
+                                       : 0);
     Vals.push_back(getEncodedDLLStorageClass(F));
     Vals.push_back(F.hasComdat() ? VE.getComdatID(F.getComdat()) : 0);
+    Vals.push_back(F.hasPrefixData() ? (VE.getValueID(F.getPrefixData()) + 1)
+                                     : 0);
 
     unsigned AbbrevToUse = 0;
     Stream.EmitRecord(bitc::MODULE_CODE_FUNCTION, Vals, AbbrevToUse);
@@ -734,89 +753,497 @@ static uint64_t GetOptimizationFlags(const Value *V) {
   return Flags;
 }
 
-static void WriteMDNode(const MDNode *N,
-                        const ValueEnumerator &VE,
-                        BitstreamWriter &Stream,
-                        SmallVectorImpl<uint64_t> &Record) {
+static void WriteValueAsMetadata(const ValueAsMetadata *MD,
+                                 const ValueEnumerator &VE,
+                                 BitstreamWriter &Stream,
+                                 SmallVectorImpl<uint64_t> &Record) {
+  // Mimic an MDNode with a value as one operand.
+  Value *V = MD->getValue();
+  Record.push_back(VE.getTypeID(V->getType()));
+  Record.push_back(VE.getValueID(V));
+  Stream.EmitRecord(bitc::METADATA_VALUE, Record, 0);
+  Record.clear();
+}
+
+static void WriteMDTuple(const MDTuple *N, const ValueEnumerator &VE,
+                         BitstreamWriter &Stream,
+                         SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) {
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    if (N->getOperand(i)) {
-      Record.push_back(VE.getTypeID(N->getOperand(i)->getType()));
-      Record.push_back(VE.getValueID(N->getOperand(i)));
-    } else {
-      Record.push_back(VE.getTypeID(Type::getVoidTy(N->getContext())));
-      Record.push_back(0);
-    }
+    Metadata *MD = N->getOperand(i);
+    assert(!(MD && isa<LocalAsMetadata>(MD)) &&
+           "Unexpected function-local metadata");
+    Record.push_back(VE.getMetadataOrNullID(MD));
   }
-  unsigned MDCode = N->isFunctionLocal() ? bitc::METADATA_FN_NODE :
-                                           bitc::METADATA_NODE;
-  Stream.EmitRecord(MDCode, Record, 0);
+  Stream.EmitRecord(N->isDistinct() ? bitc::METADATA_DISTINCT_NODE
+                                    : bitc::METADATA_NODE,
+                    Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDLocation(const MDLocation *N, const ValueEnumerator &VE,
+                            BitstreamWriter &Stream,
+                            SmallVectorImpl<uint64_t> &Record,
+                            unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getLine());
+  Record.push_back(N->getColumn());
+  Record.push_back(VE.getMetadataID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getInlinedAt()));
+
+  Stream.EmitRecord(bitc::METADATA_LOCATION, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteGenericDebugNode(const GenericDebugNode *N,
+                                  const ValueEnumerator &VE,
+                                  BitstreamWriter &Stream,
+                                  SmallVectorImpl<uint64_t> &Record,
+                                  unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(0); // Per-tag version field; unused for now.
+
+  for (auto &I : N->operands())
+    Record.push_back(VE.getMetadataOrNullID(I));
+
+  Stream.EmitRecord(bitc::METADATA_GENERIC_DEBUG, Record, Abbrev);
+  Record.clear();
+}
+
+static uint64_t rotateSign(int64_t I) {
+  uint64_t U = I;
+  return I < 0 ? ~(U << 1) : U << 1;
+}
+
+static void WriteMDSubrange(const MDSubrange *N, const ValueEnumerator &,
+                            BitstreamWriter &Stream,
+                            SmallVectorImpl<uint64_t> &Record,
+                            unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getCount());
+  Record.push_back(rotateSign(N->getLo()));
+
+  Stream.EmitRecord(bitc::METADATA_SUBRANGE, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDEnumerator(const MDEnumerator *N, const ValueEnumerator &VE,
+                              BitstreamWriter &Stream,
+                              SmallVectorImpl<uint64_t> &Record,
+                              unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(rotateSign(N->getValue()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+
+  Stream.EmitRecord(bitc::METADATA_ENUMERATOR, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDBasicType(const MDBasicType *N, const ValueEnumerator &VE,
+                             BitstreamWriter &Stream,
+                             SmallVectorImpl<uint64_t> &Record,
+                             unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(N->getSizeInBits());
+  Record.push_back(N->getAlignInBits());
+  Record.push_back(N->getEncoding());
+
+  Stream.EmitRecord(bitc::METADATA_BASIC_TYPE, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDDerivedType(const MDDerivedType *N,
+                               const ValueEnumerator &VE,
+                               BitstreamWriter &Stream,
+                               SmallVectorImpl<uint64_t> &Record,
+                               unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getBaseType()));
+  Record.push_back(N->getSizeInBits());
+  Record.push_back(N->getAlignInBits());
+  Record.push_back(N->getOffsetInBits());
+  Record.push_back(N->getFlags());
+  Record.push_back(VE.getMetadataOrNullID(N->getExtraData()));
+
+  Stream.EmitRecord(bitc::METADATA_DERIVED_TYPE, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDCompositeType(const MDCompositeType *N,
+                                 const ValueEnumerator &VE,
+                                 BitstreamWriter &Stream,
+                                 SmallVectorImpl<uint64_t> &Record,
+                                 unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getBaseType()));
+  Record.push_back(N->getSizeInBits());
+  Record.push_back(N->getAlignInBits());
+  Record.push_back(N->getOffsetInBits());
+  Record.push_back(N->getFlags());
+  Record.push_back(VE.getMetadataOrNullID(N->getElements()));
+  Record.push_back(N->getRuntimeLang());
+  Record.push_back(VE.getMetadataOrNullID(N->getVTableHolder()));
+  Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawIdentifier()));
+
+  Stream.EmitRecord(bitc::METADATA_COMPOSITE_TYPE, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDSubroutineType(const MDSubroutineType *N,
+                                  const ValueEnumerator &VE,
+                                  BitstreamWriter &Stream,
+                                  SmallVectorImpl<uint64_t> &Record,
+                                  unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getFlags());
+  Record.push_back(VE.getMetadataOrNullID(N->getTypeArray()));
+
+  Stream.EmitRecord(bitc::METADATA_SUBROUTINE_TYPE, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDFile(const MDFile *N, const ValueEnumerator &VE,
+                        BitstreamWriter &Stream,
+                        SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawFilename()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawDirectory()));
+
+  Stream.EmitRecord(bitc::METADATA_FILE, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDCompileUnit(const MDCompileUnit *N,
+                               const ValueEnumerator &VE,
+                               BitstreamWriter &Stream,
+                               SmallVectorImpl<uint64_t> &Record,
+                               unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getSourceLanguage());
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawProducer()));
+  Record.push_back(N->isOptimized());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawFlags()));
+  Record.push_back(N->getRuntimeVersion());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawSplitDebugFilename()));
+  Record.push_back(N->getEmissionKind());
+  Record.push_back(VE.getMetadataOrNullID(N->getEnumTypes()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRetainedTypes()));
+  Record.push_back(VE.getMetadataOrNullID(N->getSubprograms()));
+  Record.push_back(VE.getMetadataOrNullID(N->getGlobalVariables()));
+  Record.push_back(VE.getMetadataOrNullID(N->getImportedEntities()));
+
+  Stream.EmitRecord(bitc::METADATA_COMPILE_UNIT, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDSubprogram(const MDSubprogram *N,
+                               const ValueEnumerator &VE,
+                               BitstreamWriter &Stream,
+                               SmallVectorImpl<uint64_t> &Record,
+                               unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawLinkageName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getType()));
+  Record.push_back(N->isLocalToUnit());
+  Record.push_back(N->isDefinition());
+  Record.push_back(N->getScopeLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getContainingType()));
+  Record.push_back(N->getVirtuality());
+  Record.push_back(N->getVirtualIndex());
+  Record.push_back(N->getFlags());
+  Record.push_back(N->isOptimized());
+  Record.push_back(VE.getMetadataOrNullID(N->getFunction()));
+  Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams()));
+  Record.push_back(VE.getMetadataOrNullID(N->getDeclaration()));
+  Record.push_back(VE.getMetadataOrNullID(N->getVariables()));
+
+  Stream.EmitRecord(bitc::METADATA_SUBPROGRAM, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDLexicalBlock(const MDLexicalBlock *N,
+                               const ValueEnumerator &VE,
+                               BitstreamWriter &Stream,
+                               SmallVectorImpl<uint64_t> &Record,
+                               unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(N->getColumn());
+
+  Stream.EmitRecord(bitc::METADATA_LEXICAL_BLOCK, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDLexicalBlockFile(const MDLexicalBlockFile *N,
+                                    const ValueEnumerator &VE,
+                                    BitstreamWriter &Stream,
+                                    SmallVectorImpl<uint64_t> &Record,
+                                    unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getDiscriminator());
+
+  Stream.EmitRecord(bitc::METADATA_LEXICAL_BLOCK_FILE, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDNamespace(const MDNamespace *N, const ValueEnumerator &VE,
+                             BitstreamWriter &Stream,
+                             SmallVectorImpl<uint64_t> &Record,
+                             unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(N->getLine());
+
+  Stream.EmitRecord(bitc::METADATA_NAMESPACE, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDTemplateTypeParameter(const MDTemplateTypeParameter *N,
+                                         const ValueEnumerator &VE,
+                                         BitstreamWriter &Stream,
+                                         SmallVectorImpl<uint64_t> &Record,
+                                         unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getType()));
+
+  Stream.EmitRecord(bitc::METADATA_TEMPLATE_TYPE, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDTemplateValueParameter(const MDTemplateValueParameter *N,
+                                          const ValueEnumerator &VE,
+                                          BitstreamWriter &Stream,
+                                          SmallVectorImpl<uint64_t> &Record,
+                                          unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getType()));
+  Record.push_back(VE.getMetadataOrNullID(N->getValue()));
+
+  Stream.EmitRecord(bitc::METADATA_TEMPLATE_VALUE, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDGlobalVariable(const MDGlobalVariable *N,
+                                  const ValueEnumerator &VE,
+                                  BitstreamWriter &Stream,
+                                  SmallVectorImpl<uint64_t> &Record,
+                                  unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawLinkageName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getType()));
+  Record.push_back(N->isLocalToUnit());
+  Record.push_back(N->isDefinition());
+  Record.push_back(VE.getMetadataOrNullID(N->getVariable()));
+  Record.push_back(VE.getMetadataOrNullID(N->getStaticDataMemberDeclaration()));
+
+  Stream.EmitRecord(bitc::METADATA_GLOBAL_VAR, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDLocalVariable(const MDLocalVariable *N,
+                                 const ValueEnumerator &VE,
+                                 BitstreamWriter &Stream,
+                                 SmallVectorImpl<uint64_t> &Record,
+                                 unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getType()));
+  Record.push_back(N->getArg());
+  Record.push_back(N->getFlags());
+  Record.push_back(VE.getMetadataOrNullID(N->getInlinedAt()));
+
+  Stream.EmitRecord(bitc::METADATA_LOCAL_VAR, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDExpression(const MDExpression *N, const ValueEnumerator &,
+                              BitstreamWriter &Stream,
+                              SmallVectorImpl<uint64_t> &Record,
+                              unsigned Abbrev) {
+  Record.reserve(N->getElements().size() + 1);
+
+  Record.push_back(N->isDistinct());
+  Record.append(N->elements_begin(), N->elements_end());
+
+  Stream.EmitRecord(bitc::METADATA_EXPRESSION, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDObjCProperty(const MDObjCProperty *N,
+                                 const ValueEnumerator &VE,
+                                 BitstreamWriter &Stream,
+                                 SmallVectorImpl<uint64_t> &Record,
+                                 unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getFile()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawSetterName()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawGetterName()));
+  Record.push_back(N->getAttributes());
+  Record.push_back(VE.getMetadataOrNullID(N->getType()));
+
+  Stream.EmitRecord(bitc::METADATA_OBJC_PROPERTY, Record, Abbrev);
+  Record.clear();
+}
+
+static void WriteMDImportedEntity(const MDImportedEntity *N,
+                                  const ValueEnumerator &VE,
+                                  BitstreamWriter &Stream,
+                                  SmallVectorImpl<uint64_t> &Record,
+                                  unsigned Abbrev) {
+  Record.push_back(N->isDistinct());
+  Record.push_back(N->getTag());
+  Record.push_back(VE.getMetadataOrNullID(N->getScope()));
+  Record.push_back(VE.getMetadataOrNullID(N->getEntity()));
+  Record.push_back(N->getLine());
+  Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+
+  Stream.EmitRecord(bitc::METADATA_IMPORTED_ENTITY, Record, Abbrev);
   Record.clear();
 }
 
 static void WriteModuleMetadata(const Module *M,
                                 const ValueEnumerator &VE,
                                 BitstreamWriter &Stream) {
-  const auto &Vals = VE.getMDValues();
-  bool StartedMetadataBlock = false;
+  const auto &MDs = VE.getMDs();
+  if (MDs.empty() && M->named_metadata_empty())
+    return;
+
+  Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+
   unsigned MDSAbbrev = 0;
-  SmallVector<uint64_t, 64> Record;
-  for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
+  if (VE.hasMDString()) {
+    // Abbrev for METADATA_STRING.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_STRING));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+    MDSAbbrev = Stream.EmitAbbrev(Abbv);
+  }
 
-    if (const MDNode *N = dyn_cast<MDNode>(Vals[i])) {
-      if (!N->isFunctionLocal() || !N->getFunction()) {
-        if (!StartedMetadataBlock) {
-          Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
-          StartedMetadataBlock = true;
-        }
-        WriteMDNode(N, VE, Stream, Record);
-      }
-    } else if (const MDString *MDS = dyn_cast<MDString>(Vals[i])) {
-      if (!StartedMetadataBlock)  {
-        Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
-
-        // Abbrev for METADATA_STRING.
-        BitCodeAbbrev *Abbv = new BitCodeAbbrev();
-        Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_STRING));
-        Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
-        Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
-        MDSAbbrev = Stream.EmitAbbrev(Abbv);
-        StartedMetadataBlock = true;
-      }
+  // Initialize MDNode abbreviations.
+#define HANDLE_MDNODE_LEAF(CLASS) unsigned CLASS##Abbrev = 0;
+#include "llvm/IR/Metadata.def"
 
-      // Code: [strchar x N]
-      Record.append(MDS->begin(), MDS->end());
+  if (VE.hasMDLocation()) {
+    // Abbrev for METADATA_LOCATION.
+    //
+    // Assume the column is usually under 128, and always output the inlined-at
+    // location (it's never more expensive than building an array size 1).
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_LOCATION));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+    MDLocationAbbrev = Stream.EmitAbbrev(Abbv);
+  }
 
-      // Emit the finished record.
-      Stream.EmitRecord(bitc::METADATA_STRING, Record, MDSAbbrev);
-      Record.clear();
-    }
+  if (VE.hasGenericDebugNode()) {
+    // Abbrev for METADATA_GENERIC_DEBUG.
+    //
+    // Assume the column is usually under 128, and always output the inlined-at
+    // location (it's never more expensive than building an array size 1).
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_GENERIC_DEBUG));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+    GenericDebugNodeAbbrev = Stream.EmitAbbrev(Abbv);
   }
 
-  // Write named metadata.
-  for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
-       E = M->named_metadata_end(); I != E; ++I) {
-    const NamedMDNode *NMD = I;
-    if (!StartedMetadataBlock)  {
-      Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
-      StartedMetadataBlock = true;
+  unsigned NameAbbrev = 0;
+  if (!M->named_metadata_empty()) {
+    // Abbrev for METADATA_NAME.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_NAME));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+    NameAbbrev = Stream.EmitAbbrev(Abbv);
+  }
+
+  SmallVector<uint64_t, 64> Record;
+  for (const Metadata *MD : MDs) {
+    if (const MDNode *N = dyn_cast<MDNode>(MD)) {
+      switch (N->getMetadataID()) {
+      default:
+        llvm_unreachable("Invalid MDNode subclass");
+#define HANDLE_MDNODE_LEAF(CLASS)                                              \
+  case Metadata::CLASS##Kind:                                                  \
+    Write##CLASS(cast<CLASS>(N), VE, Stream, Record, CLASS##Abbrev);           \
+    continue;
+#include "llvm/IR/Metadata.def"
+      }
     }
+    if (const auto *MDC = dyn_cast<ConstantAsMetadata>(MD)) {
+      WriteValueAsMetadata(MDC, VE, Stream, Record);
+      continue;
+    }
+    const MDString *MDS = cast<MDString>(MD);
+    // Code: [strchar x N]
+    Record.append(MDS->bytes_begin(), MDS->bytes_end());
 
+    // Emit the finished record.
+    Stream.EmitRecord(bitc::METADATA_STRING, Record, MDSAbbrev);
+    Record.clear();
+  }
+
+  // Write named metadata.
+  for (const NamedMDNode &NMD : M->named_metadata()) {
     // Write name.
-    StringRef Str = NMD->getName();
-    for (unsigned i = 0, e = Str.size(); i != e; ++i)
-      Record.push_back(Str[i]);
-    Stream.EmitRecord(bitc::METADATA_NAME, Record, 0/*TODO*/);
+    StringRef Str = NMD.getName();
+    Record.append(Str.bytes_begin(), Str.bytes_end());
+    Stream.EmitRecord(bitc::METADATA_NAME, Record, NameAbbrev);
     Record.clear();
 
     // Write named metadata operands.
-    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
-      Record.push_back(VE.getValueID(NMD->getOperand(i)));
+    for (const MDNode *N : NMD.operands())
+      Record.push_back(VE.getMetadataID(N));
     Stream.EmitRecord(bitc::METADATA_NAMED_NODE, Record, 0);
     Record.clear();
   }
 
-  if (StartedMetadataBlock)
-    Stream.ExitBlock();
+  Stream.ExitBlock();
 }
 
 static void WriteFunctionLocalMetadata(const Function &F,
@@ -824,16 +1251,16 @@ static void WriteFunctionLocalMetadata(const Function &F,
                                        BitstreamWriter &Stream) {
   bool StartedMetadataBlock = false;
   SmallVector<uint64_t, 64> Record;
-  const SmallVectorImpl<const MDNode *> &Vals = VE.getFunctionLocalMDValues();
-  for (unsigned i = 0, e = Vals.size(); i != e; ++i)
-    if (const MDNode *N = Vals[i])
-      if (N->isFunctionLocal() && N->getFunction() == &F) {
-        if (!StartedMetadataBlock) {
-          Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
-          StartedMetadataBlock = true;
-        }
-        WriteMDNode(N, VE, Stream, Record);
-      }
+  const SmallVectorImpl<const LocalAsMetadata *> &MDs =
+      VE.getFunctionLocalMDs();
+  for (unsigned i = 0, e = MDs.size(); i != e; ++i) {
+    assert(MDs[i] && "Expected valid function-local metadata");
+    if (!StartedMetadataBlock) {
+      Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+      StartedMetadataBlock = true;
+    }
+    WriteValueAsMetadata(MDs[i], VE, Stream, Record);
+  }
 
   if (StartedMetadataBlock)
     Stream.ExitBlock();
@@ -863,7 +1290,7 @@ static void WriteMetadataAttachment(const Function &F,
 
       for (unsigned i = 0, e = MDs.size(); i != e; ++i) {
         Record.push_back(MDs[i].first);
-        Record.push_back(VE.getValueID(MDs[i].second));
+        Record.push_back(VE.getMetadataID(MDs[i].second));
       }
       Stream.EmitRecord(bitc::METADATA_ATTACHMENT, Record, 0);
       Record.clear();
@@ -966,14 +1393,12 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
       // Add the asm string.
       const std::string &AsmStr = IA->getAsmString();
       Record.push_back(AsmStr.size());
-      for (unsigned i = 0, e = AsmStr.size(); i != e; ++i)
-        Record.push_back(AsmStr[i]);
+      Record.append(AsmStr.begin(), AsmStr.end());
 
       // Add the constraint string.
       const std::string &ConstraintStr = IA->getConstraintString();
       Record.push_back(ConstraintStr.size());
-      for (unsigned i = 0, e = ConstraintStr.size(); i != e; ++i)
-        Record.push_back(ConstraintStr[i]);
+      Record.append(ConstraintStr.begin(), ConstraintStr.end());
       Stream.EmitRecord(bitc::CST_CODE_INLINEASM, Record);
       Record.clear();
       continue;
@@ -1251,19 +1676,21 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
     }
     break;
 
-  case Instruction::GetElementPtr:
+  case Instruction::GetElementPtr: {
     Code = bitc::FUNC_CODE_INST_GEP;
-    if (cast<GEPOperator>(&I)->isInBounds())
-      Code = bitc::FUNC_CODE_INST_INBOUNDS_GEP;
+    AbbrevToUse = FUNCTION_INST_GEP_ABBREV;
+    auto &GEPInst = cast<GetElementPtrInst>(I);
+    Vals.push_back(GEPInst.isInBounds());
+    Vals.push_back(VE.getTypeID(GEPInst.getSourceElementType()));
     for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
       PushValueAndType(I.getOperand(i), InstID, Vals, VE);
     break;
+  }
   case Instruction::ExtractValue: {
     Code = bitc::FUNC_CODE_INST_EXTRACTVAL;
     PushValueAndType(I.getOperand(0), InstID, Vals, VE);
     const ExtractValueInst *EVI = cast<ExtractValueInst>(&I);
-    for (const unsigned *i = EVI->idx_begin(), *e = EVI->idx_end(); i != e; ++i)
-      Vals.push_back(*i);
+    Vals.append(EVI->idx_begin(), EVI->idx_end());
     break;
   }
   case Instruction::InsertValue: {
@@ -1271,8 +1698,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
     PushValueAndType(I.getOperand(0), InstID, Vals, VE);
     PushValueAndType(I.getOperand(1), InstID, Vals, VE);
     const InsertValueInst *IVI = cast<InsertValueInst>(&I);
-    for (const unsigned *i = IVI->idx_begin(), *e = IVI->idx_end(); i != e; ++i)
-      Vals.push_back(*i);
+    Vals.append(IVI->idx_begin(), IVI->idx_end());
     break;
   }
   case Instruction::Select:
@@ -1449,6 +1875,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
       if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))  // ptr
         AbbrevToUse = FUNCTION_INST_LOAD_ABBREV;
     }
+    Vals.push_back(VE.getTypeID(I.getType()));
     Vals.push_back(Log2_32(cast<LoadInst>(I).getAlignment())+1);
     Vals.push_back(cast<LoadInst>(I).isVolatile());
     if (cast<LoadInst>(I).isAtomic()) {
@@ -1608,9 +2035,7 @@ static void WriteUseList(ValueEnumerator &VE, UseListOrder &&Order,
   else
     Code = bitc::USELIST_CODE_DEFAULT;
 
-  SmallVector<uint64_t, 64> Record;
-  for (unsigned I : Order.Shuffle)
-    Record.push_back(I);
+  SmallVector<uint64_t, 64> Record(Order.Shuffle.begin(), Order.Shuffle.end());
   Record.push_back(VE.getValueID(Order.V));
   Stream.EmitRecord(Code, Record);
 }
@@ -1683,11 +2108,12 @@ static void WriteFunction(const Function &F, ValueEnumerator &VE,
       } else {
         MDNode *Scope, *IA;
         DL.getScopeAndInlinedAt(Scope, IA, I->getContext());
+        assert(Scope && "Expected valid scope");
 
         Vals.push_back(DL.getLine());
         Vals.push_back(DL.getCol());
-        Vals.push_back(Scope ? VE.getValueID(Scope)+1 : 0);
-        Vals.push_back(IA ? VE.getValueID(IA)+1 : 0);
+        Vals.push_back(VE.getMetadataOrNullID(Scope));
+        Vals.push_back(VE.getMetadataOrNullID(IA));
         Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC, Vals);
         Vals.clear();
 
@@ -1761,7 +2187,7 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
     BitCodeAbbrev *Abbv = new BitCodeAbbrev();
     Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
-                              Log2_32_Ceil(VE.getTypes().size()+1)));
+                              VE.computeBitsRequiredForTypeIndicies()));
     if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
                                    Abbv) != CONSTANTS_SETTYPE_ABBREV)
       llvm_unreachable("Unexpected abbrev ordering!");
@@ -1781,7 +2207,7 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
     Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4));  // cast opc
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,       // typeid
-                              Log2_32_Ceil(VE.getTypes().size()+1)));
+                              VE.computeBitsRequiredForTypeIndicies()));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));    // value id
 
     if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
@@ -1802,6 +2228,8 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
     BitCodeAbbrev *Abbv = new BitCodeAbbrev();
     Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_LOAD));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,    // dest ty
+                              VE.computeBitsRequiredForTypeIndicies()));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // Align
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // volatile
     if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
@@ -1834,7 +2262,7 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
     Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));    // OpVal
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,       // dest ty
-                              Log2_32_Ceil(VE.getTypes().size()+1)));
+                              VE.computeBitsRequiredForTypeIndicies()));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4));  // opc
     if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
                                    Abbv) != FUNCTION_INST_CAST_ABBREV)
@@ -1863,6 +2291,18 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
                                    Abbv) != FUNCTION_INST_UNREACHABLE_ABBREV)
       llvm_unreachable("Unexpected abbrev ordering!");
   }
+  {
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_GEP));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty
+                              Log2_32_Ceil(VE.getTypes().size() + 1)));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+        FUNCTION_INST_GEP_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
 
   Stream.ExitBlock();
 }
diff --git a/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/lib/Bitcode/Writer/BitcodeWriterPass.cpp
index 4167f6d..25456a4 100644
--- a/lib/Bitcode/Writer/BitcodeWriterPass.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -18,8 +18,8 @@
 #include "llvm/Pass.h"
 using namespace llvm;
 
-PreservedAnalyses BitcodeWriterPass::run(Module *M) {
-  WriteBitcodeToFile(M, OS);
+PreservedAnalyses BitcodeWriterPass::run(Module &M) {
+  WriteBitcodeToFile(&M, OS);
   return PreservedAnalyses::all();
 }
 
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index f065c83..549e94f 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -85,10 +86,14 @@ static OrderMap orderModule(const Module &M) {
   for (const GlobalAlias &A : M.aliases())
     if (!isa<GlobalValue>(A.getAliasee()))
       orderValue(A.getAliasee(), OM);
-  for (const Function &F : M)
+  for (const Function &F : M) {
     if (F.hasPrefixData())
       if (!isa<GlobalValue>(F.getPrefixData()))
         orderValue(F.getPrefixData(), OM);
+    if (F.hasPrologueData())
+      if (!isa<GlobalValue>(F.getPrologueData()))
+        orderValue(F.getPrologueData(), OM);
+  }
   OM.LastGlobalConstantID = OM.size();
 
   // Initializers of GlobalValues are processed in
@@ -264,9 +269,12 @@ static UseListOrderStack predictUseListOrder(const Module &M) {
       predictValueUseListOrder(G.getInitializer(), nullptr, OM, Stack);
   for (const GlobalAlias &A : M.aliases())
     predictValueUseListOrder(A.getAliasee(), nullptr, OM, Stack);
-  for (const Function &F : M)
+  for (const Function &F : M) {
     if (F.hasPrefixData())
       predictValueUseListOrder(F.getPrefixData(), nullptr, OM, Stack);
+    if (F.hasPrologueData())
+      predictValueUseListOrder(F.getPrologueData(), nullptr, OM, Stack);
+  }
 
   return Stack;
 }
@@ -275,7 +283,8 @@ static bool isIntOrIntVectorValue(const std::pair<const Value*, unsigned> &V) {
   return V.first->getType()->isIntOrIntVectorTy();
 }
 
-ValueEnumerator::ValueEnumerator(const Module &M) {
+ValueEnumerator::ValueEnumerator(const Module &M)
+    : HasMDString(false), HasMDLocation(false), HasGenericDebugNode(false) {
   if (shouldPreserveBitcodeUseListOrder())
     UseListOrders = predictUseListOrder(M);
 
@@ -314,6 +323,17 @@ ValueEnumerator::ValueEnumerator(const Module &M) {
     if (I->hasPrefixData())
       EnumerateValue(I->getPrefixData());
 
+  // Enumerate the prologue data constants.
+  for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (I->hasPrologueData())
+      EnumerateValue(I->getPrologueData());
+
+  // Enumerate the metadata type.
+  //
+  // TODO: Move this to ValueEnumerator::EnumerateOperandType() once bitcode
+  // only encodes the metadata type when it's used as a value.
+  EnumerateType(Type::getMetadataTy(M.getContext()));
+
   // Insert constants and metadata that are named at module level into the slot
   // pool so that the module symbol table can refer to them...
   EnumerateValueSymbolTable(M.getValueSymbolTable());
@@ -329,11 +349,17 @@ ValueEnumerator::ValueEnumerator(const Module &M) {
     for (const BasicBlock &BB : F)
       for (const Instruction &I : BB) {
         for (const Use &Op : I.operands()) {
-          if (MDNode *MD = dyn_cast<MDNode>(&Op))
-            if (MD->isFunctionLocal() && MD->getFunction())
-              // These will get enumerated during function-incorporation.
-              continue;
-          EnumerateOperandType(Op);
+          auto *MD = dyn_cast<MetadataAsValue>(&Op);
+          if (!MD) {
+            EnumerateOperandType(Op);
+            continue;
+          }
+
+          // Local metadata is enumerated during function-incorporation.
+          if (isa<LocalAsMetadata>(MD->getMetadata()))
+            continue;
+
+          EnumerateMetadata(MD->getMetadata());
         }
         EnumerateType(I.getType());
         if (const CallInst *CI = dyn_cast<CallInst>(&I))
@@ -377,11 +403,8 @@ void ValueEnumerator::setInstructionID(const Instruction *I) {
 }
 
 unsigned ValueEnumerator::getValueID(const Value *V) const {
-  if (isa<MDNode>(V) || isa<MDString>(V)) {
-    ValueMapType::const_iterator I = MDValueMap.find(V);
-    assert(I != MDValueMap.end() && "Value not in slotcalculator!");
-    return I->second-1;
-  }
+  if (auto *MD = dyn_cast<MetadataAsValue>(V))
+    return getMetadataID(MD->getMetadata());
 
   ValueMapType::const_iterator I = ValueMap.find(V);
   assert(I != ValueMap.end() && "Value not in slotcalculator!");
@@ -424,6 +447,18 @@ void ValueEnumerator::print(raw_ostream &OS, const ValueMapType &Map,
   }
 }
 
+void ValueEnumerator::print(raw_ostream &OS, const MetadataMapType &Map,
+                            const char *Name) const {
+
+  OS << "Map Name: " << Name << "\n";
+  OS << "Size: " << Map.size() << "\n";
+  for (auto I = Map.begin(), E = Map.end(); I != E; ++I) {
+    const Metadata *MD = I->first;
+    OS << "Metadata: slot = " << I->second << "\n";
+    MD->print(OS);
+  }
+}
+
 /// OptimizeConstants - Reorder constant pool for denser encoding.
 void ValueEnumerator::OptimizeConstants(unsigned CstStart, unsigned CstEnd) {
   if (CstStart == CstEnd || CstStart+1 == CstEnd) return;
@@ -481,25 +516,18 @@ void ValueEnumerator::EnumerateNamedMDNode(const NamedMDNode *MD) {
 /// and types referenced by the given MDNode.
 void ValueEnumerator::EnumerateMDNodeOperands(const MDNode *N) {
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    if (Value *V = N->getOperand(i)) {
-      if (isa<MDNode>(V) || isa<MDString>(V))
-        EnumerateMetadata(V);
-      else if (!isa<Instruction>(V) && !isa<Argument>(V))
-        EnumerateValue(V);
-    } else
-      EnumerateType(Type::getVoidTy(N->getContext()));
+    Metadata *MD = N->getOperand(i);
+    if (!MD)
+      continue;
+    assert(!isa<LocalAsMetadata>(MD) && "MDNodes cannot be function-local");
+    EnumerateMetadata(MD);
   }
 }
 
-void ValueEnumerator::EnumerateMetadata(const Value *MD) {
-  assert((isa<MDNode>(MD) || isa<MDString>(MD)) && "Invalid metadata kind");
-
-  // Skip function-local nodes themselves, but walk their operands.
-  const MDNode *N = dyn_cast<MDNode>(MD);
-  if (N && N->isFunctionLocal() && N->getFunction()) {
-    EnumerateMDNodeOperands(N);
-    return;
-  }
+void ValueEnumerator::EnumerateMetadata(const Metadata *MD) {
+  assert(
+      (isa<MDNode>(MD) || isa<MDString>(MD) || isa<ConstantAsMetadata>(MD)) &&
+      "Invalid metadata kind");
 
   // Insert a dummy ID to block the co-recursive call to
   // EnumerateMDNodeOperands() from re-visiting MD in a cyclic graph.
@@ -508,55 +536,43 @@ void ValueEnumerator::EnumerateMetadata(const Value *MD) {
   if (!MDValueMap.insert(std::make_pair(MD, 0)).second)
     return;
 
-  // Enumerate the type of this value.
-  EnumerateType(MD->getType());
-
   // Visit operands first to minimize RAUW.
-  if (N)
+  if (auto *N = dyn_cast<MDNode>(MD))
     EnumerateMDNodeOperands(N);
+  else if (auto *C = dyn_cast<ConstantAsMetadata>(MD))
+    EnumerateValue(C->getValue());
+
+  HasMDString |= isa<MDString>(MD);
+  HasMDLocation |= isa<MDLocation>(MD);
+  HasGenericDebugNode |= isa<GenericDebugNode>(MD);
 
   // Replace the dummy ID inserted above with the correct one.  MDValueMap may
   // have changed by inserting operands, so we need a fresh lookup here.
-  MDValues.push_back(MD);
-  MDValueMap[MD] = MDValues.size();
+  MDs.push_back(MD);
+  MDValueMap[MD] = MDs.size();
 }
 
 /// EnumerateFunctionLocalMetadataa - Incorporate function-local metadata
-/// information reachable from the given MDNode.
-void ValueEnumerator::EnumerateFunctionLocalMetadata(const MDNode *N) {
-  assert(N->isFunctionLocal() && N->getFunction() &&
-         "EnumerateFunctionLocalMetadata called on non-function-local mdnode!");
-
-  // Enumerate the type of this value.
-  EnumerateType(N->getType());
-
+/// information reachable from the metadata.
+void ValueEnumerator::EnumerateFunctionLocalMetadata(
+    const LocalAsMetadata *Local) {
   // Check to see if it's already in!
-  unsigned &MDValueID = MDValueMap[N];
+  unsigned &MDValueID = MDValueMap[Local];
   if (MDValueID)
     return;
 
-  MDValues.push_back(N);
-  MDValueID = MDValues.size();
-
-  // To incoroporate function-local information visit all function-local
-  // MDNodes and all function-local values they reference.
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-    if (Value *V = N->getOperand(i)) {
-      if (MDNode *O = dyn_cast<MDNode>(V)) {
-        if (O->isFunctionLocal() && O->getFunction())
-          EnumerateFunctionLocalMetadata(O);
-      } else if (isa<Instruction>(V) || isa<Argument>(V))
-        EnumerateValue(V);
-    }
+  MDs.push_back(Local);
+  MDValueID = MDs.size();
+
+  EnumerateValue(Local->getValue());
 
-  // Also, collect all function-local MDNodes for easy access.
-  FunctionLocalMDs.push_back(N);
+  // Also, collect all function-local metadata for easy access.
+  FunctionLocalMDs.push_back(Local);
 }
 
 void ValueEnumerator::EnumerateValue(const Value *V) {
   assert(!V->getType()->isVoidTy() && "Can't insert void values!");
-  assert(!isa<MDNode>(V) && !isa<MDString>(V) &&
-         "EnumerateValue doesn't handle Metadata!");
+  assert(!isa<MetadataAsValue>(V) && "EnumerateValue doesn't handle Metadata!");
 
   // Check to see if it's already in!
   unsigned &ValueID = ValueMap[V];
@@ -620,9 +636,8 @@ void ValueEnumerator::EnumerateType(Type *Ty) {
 
   // Enumerate all of the subtypes before we enumerate this type.  This ensures
   // that the type will be enumerated in an order that can be directly built.
-  for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
-       I != E; ++I)
-    EnumerateType(*I);
+  for (Type *SubTy : Ty->subtypes())
+    EnumerateType(SubTy);
 
   // Refresh the TypeID pointer in case the table rehashed.
   TypeID = &TypeMap[Ty];
@@ -646,30 +661,35 @@ void ValueEnumerator::EnumerateType(Type *Ty) {
 void ValueEnumerator::EnumerateOperandType(const Value *V) {
   EnumerateType(V->getType());
 
-  if (const Constant *C = dyn_cast<Constant>(V)) {
-    // If this constant is already enumerated, ignore it, we know its type must
-    // be enumerated.
-    if (ValueMap.count(V)) return;
+  if (auto *MD = dyn_cast<MetadataAsValue>(V)) {
+    assert(!isa<LocalAsMetadata>(MD->getMetadata()) &&
+           "Function-local metadata should be left for later");
+
+    EnumerateMetadata(MD->getMetadata());
+    return;
+  }
 
-    // This constant may have operands, make sure to enumerate the types in
-    // them.
-    for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) {
-      const Value *Op = C->getOperand(i);
+  const Constant *C = dyn_cast<Constant>(V);
+  if (!C)
+    return;
 
-      // Don't enumerate basic blocks here, this happens as operands to
-      // blockaddress.
-      if (isa<BasicBlock>(Op)) continue;
+  // If this constant is already enumerated, ignore it, we know its type must
+  // be enumerated.
+  if (ValueMap.count(C))
+    return;
 
-      EnumerateOperandType(Op);
-    }
+  // This constant may have operands, make sure to enumerate the types in
+  // them.
+  for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) {
+    const Value *Op = C->getOperand(i);
 
-    if (const MDNode *N = dyn_cast<MDNode>(V)) {
-      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-        if (Value *Elem = N->getOperand(i))
-          EnumerateOperandType(Elem);
-    }
-  } else if (isa<MDString>(V) || isa<MDNode>(V))
-    EnumerateMetadata(V);
+    // Don't enumerate basic blocks here, this happens as operands to
+    // blockaddress.
+    if (isa<BasicBlock>(Op))
+      continue;
+
+    EnumerateOperandType(Op);
+  }
 }
 
 void ValueEnumerator::EnumerateAttributes(AttributeSet PAL) {
@@ -697,7 +717,7 @@ void ValueEnumerator::EnumerateAttributes(AttributeSet PAL) {
 void ValueEnumerator::incorporateFunction(const Function &F) {
   InstructionCount = 0;
   NumModuleValues = Values.size();
-  NumModuleMDValues = MDValues.size();
+  NumModuleMDs = MDs.size();
 
   // Adding function arguments to the value table.
   for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
@@ -728,24 +748,16 @@ void ValueEnumerator::incorporateFunction(const Function &F) {
 
   FirstInstID = Values.size();
 
-  SmallVector<MDNode *, 8> FnLocalMDVector;
+  SmallVector<LocalAsMetadata *, 8> FnLocalMDVector;
   // Add all of the instructions.
   for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
     for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) {
       for (User::const_op_iterator OI = I->op_begin(), E = I->op_end();
            OI != E; ++OI) {
-        if (MDNode *MD = dyn_cast<MDNode>(*OI))
-          if (MD->isFunctionLocal() && MD->getFunction())
+        if (auto *MD = dyn_cast<MetadataAsValue>(&*OI))
+          if (auto *Local = dyn_cast<LocalAsMetadata>(MD->getMetadata()))
             // Enumerate metadata after the instructions they might refer to.
-            FnLocalMDVector.push_back(MD);
-      }
-
-      SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
-      I->getAllMetadataOtherThanDebugLoc(MDs);
-      for (unsigned i = 0, e = MDs.size(); i != e; ++i) {
-        MDNode *N = MDs[i].second;
-        if (N->isFunctionLocal() && N->getFunction())
-          FnLocalMDVector.push_back(N);
+            FnLocalMDVector.push_back(Local);
       }
 
       if (!I->getType()->isVoidTy())
@@ -762,13 +774,13 @@ void ValueEnumerator::purgeFunction() {
   /// Remove purged values from the ValueMap.
   for (unsigned i = NumModuleValues, e = Values.size(); i != e; ++i)
     ValueMap.erase(Values[i].first);
-  for (unsigned i = NumModuleMDValues, e = MDValues.size(); i != e; ++i)
-    MDValueMap.erase(MDValues[i]);
+  for (unsigned i = NumModuleMDs, e = MDs.size(); i != e; ++i)
+    MDValueMap.erase(MDs[i]);
   for (unsigned i = 0, e = BasicBlocks.size(); i != e; ++i)
     ValueMap.erase(BasicBlocks[i]);
 
   Values.resize(NumModuleValues);
-  MDValues.resize(NumModuleMDValues);
+  MDs.resize(NumModuleMDs);
   BasicBlocks.clear();
   FunctionLocalMDs.clear();
 }
@@ -792,3 +804,6 @@ unsigned ValueEnumerator::getGlobalBasicBlockID(const BasicBlock *BB) const {
   return getGlobalBasicBlockID(BB);
 }
 
+uint64_t ValueEnumerator::computeBitsRequiredForTypeIndicies() const {
+  return Log2_32_Ceil(getTypes().size() + 1);
+}
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index 563c214..b94c370 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -30,6 +30,8 @@ class BasicBlock;
 class Comdat;
 class Function;
 class Module;
+class Metadata;
+class LocalAsMetadata;
 class MDNode;
 class NamedMDNode;
 class AttributeSet;
@@ -58,9 +60,13 @@ private:
   typedef UniqueVector<const Comdat *> ComdatSetType;
   ComdatSetType Comdats;
 
-  std::vector<const Value *> MDValues;
-  SmallVector<const MDNode *, 8> FunctionLocalMDs;
-  ValueMapType MDValueMap;
+  std::vector<const Metadata *> MDs;
+  SmallVector<const LocalAsMetadata *, 8> FunctionLocalMDs;
+  typedef DenseMap<const Metadata *, unsigned> MetadataMapType;
+  MetadataMapType MDValueMap;
+  bool HasMDString;
+  bool HasMDLocation;
+  bool HasGenericDebugNode;
 
   typedef DenseMap<AttributeSet, unsigned> AttributeGroupMapType;
   AttributeGroupMapType AttributeGroupMap;
@@ -88,20 +94,34 @@ private:
 
   /// When a function is incorporated, this is the size of the MDValues list
   /// before incorporation.
-  unsigned NumModuleMDValues;
+  unsigned NumModuleMDs;
 
   unsigned FirstFuncConstantID;
   unsigned FirstInstID;
 
-  ValueEnumerator(const ValueEnumerator &) LLVM_DELETED_FUNCTION;
-  void operator=(const ValueEnumerator &) LLVM_DELETED_FUNCTION;
+  ValueEnumerator(const ValueEnumerator &) = delete;
+  void operator=(const ValueEnumerator &) = delete;
 public:
   ValueEnumerator(const Module &M);
 
   void dump() const;
   void print(raw_ostream &OS, const ValueMapType &Map, const char *Name) const;
+  void print(raw_ostream &OS, const MetadataMapType &Map,
+             const char *Name) const;
 
   unsigned getValueID(const Value *V) const;
+  unsigned getMetadataID(const Metadata *MD) const {
+    auto ID = getMetadataOrNullID(MD);
+    assert(ID != 0 && "Metadata not in slotcalculator!");
+    return ID - 1;
+  }
+  unsigned getMetadataOrNullID(const Metadata *MD) const {
+    return MDValueMap.lookup(MD);
+  }
+
+  bool hasMDString() const { return HasMDString; }
+  bool hasMDLocation() const { return HasMDLocation; }
+  bool hasGenericDebugNode() const { return HasGenericDebugNode; }
 
   unsigned getTypeID(Type *T) const {
     TypeMapType::const_iterator I = TypeMap.find(T);
@@ -134,8 +154,8 @@ public:
   }
 
   const ValueList &getValues() const { return Values; }
-  const std::vector<const Value *> &getMDValues() const { return MDValues; }
-  const SmallVectorImpl<const MDNode *> &getFunctionLocalMDValues() const {
+  const std::vector<const Metadata *> &getMDs() const { return MDs; }
+  const SmallVectorImpl<const LocalAsMetadata *> &getFunctionLocalMDs() const {
     return FunctionLocalMDs;
   }
   const TypeList &getTypes() const { return Types; }
@@ -162,13 +182,14 @@ public:
   ///
   void incorporateFunction(const Function &F);
   void purgeFunction();
+  uint64_t computeBitsRequiredForTypeIndicies() const;
 
 private:
   void OptimizeConstants(unsigned CstStart, unsigned CstEnd);
 
   void EnumerateMDNodeOperands(const MDNode *N);
-  void EnumerateMetadata(const Value *MD);
-  void EnumerateFunctionLocalMetadata(const MDNode *N);
+  void EnumerateMetadata(const Metadata *MD);
+  void EnumerateFunctionLocalMetadata(const LocalAsMetadata *Local);
   void EnumerateNamedMDNode(const NamedMDNode *NMD);
   void EnumerateValue(const Value *V);
   void EnumerateType(Type *T);
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index fab1c87..8ab2d6e 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -17,3 +17,4 @@ add_subdirectory(Target)
 add_subdirectory(AsmParser)
 add_subdirectory(LineEditor)
 add_subdirectory(ProfileData)
+add_subdirectory(Fuzzer)
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 91c1314..58b87e1 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -296,6 +296,16 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx,
   std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
     RegRefs = State->GetRegRefs();
 
+  // FIXME: We must leave subregisters of live super registers as live, so that
+  // we don't clear out the register tracking information for subregisters of
+  // super registers we're still tracking (and with which we're unioning
+  // subregister definitions).
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+    if (TRI->isSuperRegister(Reg, *AI) && State->IsLive(*AI)) {
+      DEBUG(if (!header && footer) dbgs() << footer);
+      return;
+    }
+
   if (!State->IsLive(Reg)) {
     KillIndices[Reg] = KillIdx;
     DefIndices[Reg] = ~0u;
@@ -673,6 +683,21 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
           goto next_super_reg;
       }
 
+      // We cannot rename 'Reg' to 'NewReg' if one of the uses of 'Reg' also
+      // defines 'NewReg' via an early-clobber operand.
+      auto Range = RegRefs.equal_range(Reg);
+      for (auto Q = Range.first, QE = Range.second; Q != QE; ++Q) {
+        auto UseMI = Q->second.Operand->getParent();
+        int Idx = UseMI->findRegisterDefOperandIdx(NewReg, false, true, TRI);
+        if (Idx == -1)
+          continue;
+
+        if (UseMI->getOperand(Idx).isEarlyClobber()) {
+          DEBUG(dbgs() << "(ec)");
+          goto next_super_reg;
+        }
+      }
+
       // Record that 'Reg' can be renamed to 'NewReg'.
       RenameMap.insert(std::pair<unsigned, unsigned>(Reg, NewReg));
     }
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 9a3b790..e50b846 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -30,10 +30,9 @@
 
 using namespace llvm;
 
-/// ComputeLinearIndex - Given an LLVM IR aggregate type and a sequence
-/// of insertvalue or extractvalue indices that identify a member, return
-/// the linearized index of the start of the member.
-///
+/// Compute the linearized index of a member in a nested aggregate/struct/array
+/// by recursing and accumulating CurIndex as long as there are indices in the
+/// index list.
 unsigned llvm::ComputeLinearIndex(Type *Ty,
                                   const unsigned *Indices,
                                   const unsigned *IndicesEnd,
@@ -52,16 +51,23 @@ unsigned llvm::ComputeLinearIndex(Type *Ty,
         return ComputeLinearIndex(*EI, Indices+1, IndicesEnd, CurIndex);
       CurIndex = ComputeLinearIndex(*EI, nullptr, nullptr, CurIndex);
     }
+    assert(!Indices && "Unexpected out of bound");
     return CurIndex;
   }
   // Given an array type, recursively traverse the elements.
   else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
     Type *EltTy = ATy->getElementType();
-    for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) {
-      if (Indices && *Indices == i)
-        return ComputeLinearIndex(EltTy, Indices+1, IndicesEnd, CurIndex);
-      CurIndex = ComputeLinearIndex(EltTy, nullptr, nullptr, CurIndex);
+    unsigned NumElts = ATy->getNumElements();
+    // Compute the Linear offset when jumping one element of the array
+    unsigned EltLinearOffset = ComputeLinearIndex(EltTy, nullptr, nullptr, 0);
+    if (Indices) {
+      assert(*Indices < NumElts && "Unexpected out of bound");
+      // If the indice is inside the array, compute the index to the requested
+      // elt and recurse inside the element with the end of the indices list
+      CurIndex += EltLinearOffset* *Indices;
+      return ComputeLinearIndex(EltTy, Indices+1, IndicesEnd, CurIndex);
     }
+    CurIndex += EltLinearOffset*NumElts;
     return CurIndex;
   }
   // We haven't found the type we're looking for, so keep searching.
@@ -512,8 +518,9 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
         return false;
     }
 
+  const Function *F = ExitBB->getParent();
   return returnTypeIsEligibleForTailCall(
-      ExitBB->getParent(), I, Ret, *TM.getSubtargetImpl()->getTargetLowering());
+      F, I, Ret, *TM.getSubtargetImpl(*F)->getTargetLowering());
 }
 
 bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
diff --git a/lib/CodeGen/Android.mk b/lib/CodeGen/Android.mk
index 5cb351d..ec3cd77 100644
--- a/lib/CodeGen/Android.mk
+++ b/lib/CodeGen/Android.mk
@@ -24,6 +24,7 @@ codegen_SRC_FILES := \
   ForwardControlFlowIntegrity.cpp \
   GCMetadata.cpp \
   GCMetadataPrinter.cpp \
+  GCRootLowering.cpp \
   GCStrategy.cpp \
   GlobalMerge.cpp \
   IfConversion.cpp \
@@ -95,6 +96,7 @@ codegen_SRC_FILES := \
   ScheduleDAGPrinter.cpp \
   ScoreboardHazardRecognizer.cpp \
   ShadowStackGC.cpp \
+  ShadowStackGCLowering.cpp \
   SjLjEHPrepare.cpp \
   SlotIndexes.cpp \
   SpillPlacement.cpp \
@@ -104,6 +106,7 @@ codegen_SRC_FILES := \
   StackMaps.cpp \
   StackProtector.cpp \
   StackSlotColoring.cpp \
+  StatepointExampleGC.cpp \
   TailDuplication.cpp \
   TargetFrameLoweringImpl.cpp \
   TargetInstrInfo.cpp \
@@ -114,7 +117,8 @@ codegen_SRC_FILES := \
   TargetSchedule.cpp \
   TwoAddressInstructionPass.cpp \
   UnreachableBlockElim.cpp \
-  VirtRegMap.cpp
+  VirtRegMap.cpp \
+  WinEHPrepare.cpp
 
 # For the host
 # =====================================================
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 66c6c63..6fe75ad 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -88,8 +88,7 @@ void ARMException::endFunction(const MachineFunction *) {
                                                   Asm->getFunctionNumber()));
     if (!MMI->getLandingPads().empty()) {
       // Emit references to personality.
-      if (const Function * Personality =
-          MMI->getPersonalities()[MMI->getPersonalityIndex()]) {
+      if (const Function *Personality = MMI->getPersonality()) {
         MCSymbol *PerSym = Asm->getSymbol(Personality);
         Asm->OutStreamer.EmitSymbolAttribute(PerSym, MCSA_Global);
         ATS.emitPersonality(PerSym);
diff --git a/lib/CodeGen/AsmPrinter/Android.mk b/lib/CodeGen/AsmPrinter/Android.mk
index cb8e96a..0ce457f 100644
--- a/lib/CodeGen/AsmPrinter/Android.mk
+++ b/lib/CodeGen/AsmPrinter/Android.mk
@@ -13,6 +13,7 @@ codegen_asmprinter_SRC_FILES := \
   DwarfCFIException.cpp \
   DwarfCompileUnit.cpp \
   DwarfDebug.cpp \
+  DwarfExpression.cpp \
   DwarfFile.cpp \
   DwarfStringPool.cpp \
   DwarfUnit.cpp \
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 8a32713..988381d 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -41,9 +41,11 @@
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -98,15 +100,17 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &TD,
   return NumBits;
 }
 
-AsmPrinter::AsmPrinter(TargetMachine &tm, MCStreamer &Streamer)
+AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer)
     : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()),
-      MII(tm.getSubtargetImpl()->getInstrInfo()),
-      OutContext(Streamer.getContext()), OutStreamer(Streamer), LastMI(nullptr),
-      LastFn(0), Counter(~0U), SetCounter(0) {
-  DD = nullptr; MMI = nullptr; LI = nullptr; MF = nullptr;
+      OutContext(Streamer->getContext()), OutStreamer(*Streamer.release()),
+      LastMI(nullptr), LastFn(0), Counter(~0U), SetCounter(0) {
+  DD = nullptr;
+  MMI = nullptr;
+  LI = nullptr;
+  MF = nullptr;
   CurrentFnSym = CurrentFnSymForSize = nullptr;
   GCMetadataPrinters = nullptr;
-  VerboseAsm = Streamer.isVerboseAsm();
+  VerboseAsm = OutStreamer.isVerboseAsm();
 }
 
 AsmPrinter::~AsmPrinter() {
@@ -129,16 +133,17 @@ unsigned AsmPrinter::getFunctionNumber() const {
 }
 
 const TargetLoweringObjectFile &AsmPrinter::getObjFileLowering() const {
-  return TM.getSubtargetImpl()->getTargetLowering()->getObjFileLowering();
+  return *TM.getObjFileLowering();
 }
 
 /// getDataLayout - Return information about data layout.
 const DataLayout &AsmPrinter::getDataLayout() const {
-  return *TM.getSubtargetImpl()->getDataLayout();
+  return *TM.getDataLayout();
 }
 
 const MCSubtargetInfo &AsmPrinter::getSubtargetInfo() const {
-  return TM.getSubtarget<MCSubtargetInfo>();
+  assert(MF && "getSubtargetInfo requires a valid MachineFunction!");
+  return MF->getSubtarget<MCSubtargetInfo>();
 }
 
 void AsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
@@ -175,7 +180,7 @@ bool AsmPrinter::doInitialization(Module &M) {
 
   OutStreamer.InitSections(false);
 
-  Mang = new Mangler(TM.getSubtargetImpl()->getDataLayout());
+  Mang = new Mangler(TM.getDataLayout());
 
   // Emit the version-min deplyment target directive if needed.
   //
@@ -210,7 +215,7 @@ bool AsmPrinter::doInitialization(Module &M) {
   assert(MI && "AsmPrinter didn't require GCModuleInfo?");
   for (auto &I : *MI)
     if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*I))
-      MP->beginAssembly(*this);
+      MP->beginAssembly(M, *MI, *this);
 
   // Emit module-level inline asm if it exists.
   if (!M.getModuleInlineAsm().empty()) {
@@ -222,12 +227,25 @@ bool AsmPrinter::doInitialization(Module &M) {
   }
 
   if (MAI->doesSupportDebugInformation()) {
-    if (Triple(TM.getTargetTriple()).isKnownWindowsMSVCEnvironment())
+    bool skip_dwarf = false;
+    if (Triple(TM.getTargetTriple()).isKnownWindowsMSVCEnvironment()) {
       Handlers.push_back(HandlerInfo(new WinCodeViewLineTables(this),
                                      DbgTimerName,
                                      CodeViewLineTablesGroupName));
-    DD = new DwarfDebug(this, &M);
-    Handlers.push_back(HandlerInfo(DD, DbgTimerName, DWARFGroupName));
+      // FIXME: Don't emit DWARF debug info if there's at least one function
+      // with AddressSanitizer instrumentation.
+      // This is a band-aid fix for PR22032.
+      for (auto &F : M.functions()) {
+        if (F.hasFnAttribute(Attribute::SanitizeAddress)) {
+          skip_dwarf = true;
+          break;
+        }
+      }
+    }
+    if (!skip_dwarf) {
+      DD = new DwarfDebug(this, &M);
+      Handlers.push_back(HandlerInfo(DD, DbgTimerName, DWARFGroupName));
+    }
   }
 
   EHStreamer *ES = nullptr;
@@ -241,7 +259,7 @@ bool AsmPrinter::doInitialization(Module &M) {
   case ExceptionHandling::ARM:
     ES = new ARMException(this);
     break;
-  case ExceptionHandling::ItaniumWinEH:
+  case ExceptionHandling::WinEH:
     switch (MAI->getWinEHEncodingType()) {
     default: llvm_unreachable("unsupported unwinding information encoding");
     case WinEH::EncodingType::Itanium:
@@ -323,6 +341,11 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
     if (EmitSpecialLLVMGlobal(GV))
       return;
 
+    // Skip the emission of global equivalents. The symbol can be emitted later
+    // on by emitGlobalGOTEquivs in case it turns out to be needed.
+    if (GlobalGOTEquivs.count(getSymbol(GV)))
+      return;
+
     if (isVerbose()) {
       GV->printAsOperand(OutStreamer.GetCommentOS(),
                      /*PrintType=*/false, GV->getParent());
@@ -336,12 +359,17 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
   if (!GV->hasInitializer())   // External globals require no extra code.
     return;
 
+  GVSym->redefineIfPossible();
+  if (GVSym->isDefined() || GVSym->isVariable())
+    report_fatal_error("symbol '" + Twine(GVSym->getName()) +
+                       "' is already defined");
+
   if (MAI->hasDotTypeDotSizeDirective())
     OutStreamer.EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
 
   SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
 
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   uint64_t Size = DL->getTypeAllocSize(GV->getType()->getElementType());
 
   // If the alignment is specified, we *must* obey it.  Overaligning a global
@@ -508,6 +536,10 @@ void AsmPrinter::EmitFunctionHeader() {
     OutStreamer.GetCommentOS() << '\n';
   }
 
+  // Emit the prefix data.
+  if (F->hasPrefixData())
+    EmitGlobalConstant(F->getPrefixData());
+
   // Emit the CurrentFnSym.  This is a virtual function to allow targets to
   // do their wild and crazy things as required.
   EmitFunctionEntryLabel();
@@ -528,27 +560,32 @@ void AsmPrinter::EmitFunctionHeader() {
     HI.Handler->beginFunction(MF);
   }
 
-  // Emit the prefix data.
-  if (F->hasPrefixData())
-    EmitGlobalConstant(F->getPrefixData());
+  // Emit the prologue data.
+  if (F->hasPrologueData())
+    EmitGlobalConstant(F->getPrologueData());
 }
 
 /// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the
 /// function.  This can be overridden by targets as required to do custom stuff.
 void AsmPrinter::EmitFunctionEntryLabel() {
+  CurrentFnSym->redefineIfPossible();
+
   // The function label could have already been emitted if two symbols end up
   // conflicting due to asm renaming.  Detect this and emit an error.
-  if (CurrentFnSym->isUndefined())
-    return OutStreamer.EmitLabel(CurrentFnSym);
+  if (CurrentFnSym->isVariable())
+    report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
+                       "' is a protected alias");
+  if (CurrentFnSym->isDefined())
+    report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
+                       "' label emitted multiple times to assembly file");
 
-  report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
-                     "' label emitted multiple times to assembly file");
+  return OutStreamer.EmitLabel(CurrentFnSym);
 }
 
 /// emitComments - Pretty-print comments for instructions.
 static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
   const MachineFunction *MF = MI.getParent()->getParent();
-  const TargetMachine &TM = MF->getTarget();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
   // Check for spills and reloads
   int FI;
@@ -558,24 +595,20 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
   // We assume a single instruction only has a spill or reload, not
   // both.
   const MachineMemOperand *MMO;
-  if (TM.getSubtargetImpl()->getInstrInfo()->isLoadFromStackSlotPostFE(&MI,
-                                                                       FI)) {
+  if (TII->isLoadFromStackSlotPostFE(&MI, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
       MMO = *MI.memoperands_begin();
       CommentOS << MMO->getSize() << "-byte Reload\n";
     }
-  } else if (TM.getSubtargetImpl()->getInstrInfo()->hasLoadFromStackSlot(
-                 &MI, MMO, FI)) {
+  } else if (TII->hasLoadFromStackSlot(&MI, MMO, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI))
       CommentOS << MMO->getSize() << "-byte Folded Reload\n";
-  } else if (TM.getSubtargetImpl()->getInstrInfo()->isStoreToStackSlotPostFE(
-                 &MI, FI)) {
+  } else if (TII->isStoreToStackSlotPostFE(&MI, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
       MMO = *MI.memoperands_begin();
       CommentOS << MMO->getSize() << "-byte Spill\n";
     }
-  } else if (TM.getSubtargetImpl()->getInstrInfo()->hasStoreToStackSlot(
-                 &MI, MMO, FI)) {
+  } else if (TII->hasStoreToStackSlot(&MI, MMO, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI))
       CommentOS << MMO->getSize() << "-byte Folded Spill\n";
   }
@@ -589,9 +622,8 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
 /// that is an implicit def.
 void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
   unsigned RegNo = MI->getOperand(0).getReg();
-  OutStreamer.AddComment(
-      Twine("implicit-def: ") +
-      TM.getSubtargetImpl()->getRegisterInfo()->getName(RegNo));
+  OutStreamer.AddComment(Twine("implicit-def: ") +
+                         MMI->getContext().getRegisterInfo()->getName(RegNo));
   OutStreamer.AddBlankLine();
 }
 
@@ -601,7 +633,7 @@ static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
     const MachineOperand &Op = MI->getOperand(i);
     assert(Op.isReg() && "KILL instruction must have only register operands");
     Str += ' ';
-    Str += AP.TM.getSubtargetImpl()->getRegisterInfo()->getName(Op.getReg());
+    Str += AP.MMI->getContext().getRegisterInfo()->getName(Op.getReg());
     Str += (Op.isDef() ? "<def>" : "<kill>");
   }
   AP.OutStreamer.AddComment(Str);
@@ -629,9 +661,9 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
   OS << V.getName();
 
   DIExpression Expr = MI->getDebugExpression();
-  if (Expr.isVariablePiece())
-    OS << " [piece offset=" << Expr.getPieceOffset()
-       << " size=" << Expr.getPieceSize() << "]";
+  if (Expr.isBitPiece())
+    OS << " [bit_piece offset=" << Expr.getBitPieceOffset()
+       << " size=" << Expr.getBitPieceSize() << "]";
   OS << " <- ";
 
   // The second operand is only an offset if it's an immediate.
@@ -663,8 +695,7 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
       Reg = MI->getOperand(0).getReg();
     } else {
       assert(MI->getOperand(0).isFI() && "Unknown operand type");
-      const TargetFrameLowering *TFI =
-          AP.TM.getSubtargetImpl()->getFrameLowering();
+      const TargetFrameLowering *TFI = AP.MF->getSubtarget().getFrameLowering();
       Offset += TFI->getFrameIndexReference(*AP.MF,
                                             MI->getOperand(0).getIndex(), Reg);
       Deref = true;
@@ -678,7 +709,7 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
     }
     if (Deref)
       OS << '[';
-    OS << AP.TM.getSubtargetImpl()->getRegisterInfo()->getName(Reg);
+    OS << AP.MMI->getContext().getRegisterInfo()->getName(Reg);
   }
 
   if (Deref)
@@ -701,8 +732,7 @@ AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() {
 }
 
 bool AsmPrinter::needsSEHMoves() {
-  return MAI->getExceptionHandlingType() == ExceptionHandling::ItaniumWinEH &&
-         MF->getFunction()->needsUnwindTableEntry();
+  return MAI->usesWindowsCFI() && MF->getFunction()->needsUnwindTableEntry();
 }
 
 void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
@@ -721,6 +751,16 @@ void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
   emitCFIInstruction(CFI);
 }
 
+void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) {
+  // The operands are the MCSymbol and the frame offset of the allocation.
+  MCSymbol *FrameAllocSym = MI.getOperand(0).getMCSymbol();
+  int FrameOffset = MI.getOperand(1).getImm();
+
+  // Emit a symbol assignment.
+  OutStreamer.EmitAssignment(FrameAllocSym,
+                             MCConstantExpr::Create(FrameOffset, OutContext));
+}
+
 /// EmitFunctionBody - This method emits the body and trailer for a
 /// function.
 void AsmPrinter::EmitFunctionBody() {
@@ -759,6 +799,10 @@ void AsmPrinter::EmitFunctionBody() {
         emitCFIInstruction(MI);
         break;
 
+      case TargetOpcode::FRAME_ALLOC:
+        emitFrameAlloc(MI);
+        break;
+
       case TargetOpcode::EH_LABEL:
       case TargetOpcode::GC_LABEL:
         OutStreamer.EmitLabel(MI.getOperand(0).getMCSymbol());
@@ -800,7 +844,7 @@ void AsmPrinter::EmitFunctionBody() {
   // labels from collapsing together.  Just emit a noop.
   if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode)) {
     MCInst Noop;
-    TM.getSubtargetImpl()->getInstrInfo()->getNoopForMachoTarget(Noop);
+    MF->getSubtarget().getInstrInfo()->getNoopForMachoTarget(Noop);
     OutStreamer.AddComment("avoids zero-length function");
 
     // Targets can opt-out of emitting the noop here by leaving the opcode
@@ -852,13 +896,95 @@ void AsmPrinter::EmitFunctionBody() {
   OutStreamer.AddBlankLine();
 }
 
-static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP);
+/// \brief Compute the number of Global Variables that uses a Constant.
+static unsigned getNumGlobalVariableUses(const Constant *C) {
+  if (!C)
+    return 0;
+
+  if (isa<GlobalVariable>(C))
+    return 1;
+
+  unsigned NumUses = 0;
+  for (auto *CU : C->users())
+    NumUses += getNumGlobalVariableUses(dyn_cast<Constant>(CU));
+
+  return NumUses;
+}
+
+/// \brief Only consider global GOT equivalents if at least one user is a
+/// cstexpr inside an initializer of another global variables. Also, don't
+/// handle cstexpr inside instructions. During global variable emission,
+/// candidates are skipped and are emitted later in case at least one cstexpr
+/// isn't replaced by a PC relative GOT entry access.
+static bool isGOTEquivalentCandidate(const GlobalVariable *GV,
+                                     unsigned &NumGOTEquivUsers) {
+  // Global GOT equivalents are unnamed private globals with a constant
+  // pointer initializer to another global symbol. They must point to a
+  // GlobalVariable or Function, i.e., as GlobalValue.
+  if (!GV->hasUnnamedAddr() || !GV->hasInitializer() || !GV->isConstant() ||
+      !GV->isDiscardableIfUnused() || !dyn_cast<GlobalValue>(GV->getOperand(0)))
+    return false;
+
+  // To be a got equivalent, at least one of its users need to be a constant
+  // expression used by another global variable.
+  for (auto *U : GV->users())
+    NumGOTEquivUsers += getNumGlobalVariableUses(cast<Constant>(U));
+
+  return NumGOTEquivUsers > 0;
+}
+
+/// \brief Unnamed constant global variables solely contaning a pointer to
+/// another globals variable is equivalent to a GOT table entry; it contains the
+/// the address of another symbol. Optimize it and replace accesses to these
+/// "GOT equivalents" by using the GOT entry for the final global instead.
+/// Compute GOT equivalent candidates among all global variables to avoid
+/// emitting them if possible later on, after it use is replaced by a GOT entry
+/// access.
+void AsmPrinter::computeGlobalGOTEquivs(Module &M) {
+  if (!getObjFileLowering().supportIndirectSymViaGOTPCRel())
+    return;
+
+  for (const auto &G : M.globals()) {
+    unsigned NumGOTEquivUsers = 0;
+    if (!isGOTEquivalentCandidate(&G, NumGOTEquivUsers))
+      continue;
+
+    const MCSymbol *GOTEquivSym = getSymbol(&G);
+    GlobalGOTEquivs[GOTEquivSym] = std::make_pair(&G, NumGOTEquivUsers);
+  }
+}
+
+/// \brief Constant expressions using GOT equivalent globals may not be eligible
+/// for PC relative GOT entry conversion, in such cases we need to emit such
+/// globals we previously omitted in EmitGlobalVariable.
+void AsmPrinter::emitGlobalGOTEquivs() {
+  if (!getObjFileLowering().supportIndirectSymViaGOTPCRel())
+    return;
+
+  while (!GlobalGOTEquivs.empty()) {
+    DenseMap<const MCSymbol *, GOTEquivUsePair>::iterator I =
+      GlobalGOTEquivs.begin();
+    const MCSymbol *S = I->first;
+    const GlobalVariable *GV = I->second.first;
+    GlobalGOTEquivs.erase(S);
+    EmitGlobalVariable(GV);
+  }
+}
 
 bool AsmPrinter::doFinalization(Module &M) {
+  // Gather all GOT equivalent globals in the module. We really need two
+  // passes over the globals: one to compute and another to avoid its emission
+  // in EmitGlobalVariable, otherwise we would not be able to handle cases
+  // where the got equivalent shows up before its use.
+  computeGlobalGOTEquivs(M);
+
   // Emit global variables.
   for (const auto &G : M.globals())
     EmitGlobalVariable(&G);
 
+  // Emit remaining GOT equivalent globals.
+  emitGlobalGOTEquivs();
+
   // Emit visibility info for declarations
   for (const Function &F : M) {
     if (!F.isDeclaration())
@@ -875,10 +1001,15 @@ bool AsmPrinter::doFinalization(Module &M) {
   JumpInstrTableInfo *JITI = getAnalysisIfAvailable<JumpInstrTableInfo>();
 
   if (JITI && !JITI->getTables().empty()) {
+    // Since we're at the module level we can't use a function specific
+    // MCSubtargetInfo - instead create one with the module defaults.
+    std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+        TM.getTargetTriple(), TM.getTargetCPU(), TM.getTargetFeatureString()));
     unsigned Arch = Triple(getTargetTriple()).getArch();
     bool IsThumb = (Arch == Triple::thumb || Arch == Triple::thumbeb);
+    const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo();
     MCInst TrapInst;
-    TM.getSubtargetImpl()->getInstrInfo()->getTrap(TrapInst);
+    TII->getTrap(TrapInst);
     unsigned LogAlignment = llvm::Log2_64(JITI->entryByteAlignment());
 
     // Emit the right section for these functions.
@@ -904,9 +1035,8 @@ bool AsmPrinter::doFinalization(Module &M) {
         const MCSymbolRefExpr *TargetSymRef =
           MCSymbolRefExpr::Create(TargetSymbol, MCSymbolRefExpr::VK_PLT,
                                   OutContext);
-        TM.getSubtargetImpl()->getInstrInfo()->getUnconditionalBranch(
-            JumpToFun, TargetSymRef);
-        OutStreamer.EmitInstruction(JumpToFun, getSubtargetInfo());
+        TII->getUnconditionalBranch(JumpToFun, TargetSymRef);
+        OutStreamer.EmitInstruction(JumpToFun, *STI);
         ++Count;
       }
 
@@ -914,7 +1044,7 @@ bool AsmPrinter::doFinalization(Module &M) {
       uint64_t Remaining = NextPowerOf2(Count) - Count;
       for (uint64_t C = 0; C < Remaining; ++C) {
         EmitAlignment(LogAlignment);
-        OutStreamer.EmitInstruction(TrapInst, getSubtargetInfo());
+        OutStreamer.EmitInstruction(TrapInst, *STI);
       }
 
     }
@@ -974,18 +1104,34 @@ bool AsmPrinter::doFinalization(Module &M) {
     EmitVisibility(Name, Alias.getVisibility());
 
     // Emit the directives as assignments aka .set:
-    OutStreamer.EmitAssignment(Name, lowerConstant(Alias.getAliasee(), *this));
+    OutStreamer.EmitAssignment(Name, lowerConstant(Alias.getAliasee()));
   }
 
   GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
   assert(MI && "AsmPrinter didn't require GCModuleInfo?");
   for (GCModuleInfo::iterator I = MI->end(), E = MI->begin(); I != E; )
     if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(**--I))
-      MP->finishAssembly(*this);
+      MP->finishAssembly(M, *MI, *this);
 
   // Emit llvm.ident metadata in an '.ident' directive.
   EmitModuleIdents(M);
 
+  // Emit __morestack address if needed for indirect calls.
+  if (MMI->usesMorestackAddr()) {
+    const MCSection *ReadOnlySection =
+        getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly(),
+                                                   /*C=*/nullptr);
+    OutStreamer.SwitchSection(ReadOnlySection);
+
+    MCSymbol *AddrSymbol =
+        OutContext.GetOrCreateSymbol(StringRef("__morestack_addr"));
+    OutStreamer.EmitLabel(AddrSymbol);
+
+    unsigned PtrSize = TM.getDataLayout()->getPointerSize(0);
+    OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("__morestack"),
+                                PtrSize);
+  }
+
   // If we don't have any trampolines, then we don't require stack memory
   // to be executable. Some targets have a directive to declare this.
   Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
@@ -1044,7 +1190,7 @@ void AsmPrinter::EmitConstantPool() {
     unsigned Align = CPE.getAlignment();
 
     SectionKind Kind =
-        CPE.getSectionKind(TM.getSubtargetImpl()->getDataLayout());
+        CPE.getSectionKind(TM.getDataLayout());
 
     const Constant *C = nullptr;
     if (!CPE.isMachineConstantPoolEntry())
@@ -1098,7 +1244,7 @@ void AsmPrinter::EmitConstantPool() {
 
       Type *Ty = CPE.getType();
       Offset = NewOffset +
-               TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(Ty);
+               TM.getDataLayout()->getTypeAllocSize(Ty);
 
       OutStreamer.EmitLabel(Sym);
       if (CPE.isMachineConstantPoolEntry())
@@ -1113,7 +1259,7 @@ void AsmPrinter::EmitConstantPool() {
 /// by the current function to the current output stream.
 ///
 void AsmPrinter::EmitJumpTableInfo() {
-  const DataLayout *DL = MF->getSubtarget().getDataLayout();
+  const DataLayout *DL = MF->getTarget().getDataLayout();
   const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
   if (!MJTI) return;
   if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline) return;
@@ -1123,29 +1269,21 @@ void AsmPrinter::EmitJumpTableInfo() {
   // Pick the directive to use to print the jump table entries, and switch to
   // the appropriate section.
   const Function *F = MF->getFunction();
-  bool JTInDiffSection = false;
-  if (// In PIC mode, we need to emit the jump table to the same section as the
-      // function body itself, otherwise the label differences won't make sense.
-      // FIXME: Need a better predicate for this: what about custom entries?
-      MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 ||
-      // We should also do if the section name is NULL or function is declared
-      // in discardable section
-      // FIXME: this isn't the right predicate, should be based on the MCSection
-      // for the function.
-      F->isWeakForLinker()) {
-    OutStreamer.SwitchSection(
-        getObjFileLowering().SectionForGlobal(F, *Mang, TM));
+  const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+  bool JTInDiffSection = !TLOF.shouldPutJumpTableInFunctionSection(
+      MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32,
+      *F);
+  if (!JTInDiffSection) {
+    OutStreamer.SwitchSection(TLOF.SectionForGlobal(F, *Mang, TM));
   } else {
     // Otherwise, drop it in the readonly section.
     const MCSection *ReadOnlySection =
-        getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly(),
-                                                   /*C=*/nullptr);
+        TLOF.getSectionForJumpTable(*F, *Mang, TM);
     OutStreamer.SwitchSection(ReadOnlySection);
-    JTInDiffSection = true;
   }
 
   EmitAlignment(Log2_32(
-      MJTI->getEntryAlignment(*TM.getSubtargetImpl()->getDataLayout())));
+      MJTI->getEntryAlignment(*TM.getDataLayout())));
 
   // Jump tables in code sections are marked with a data_region directive
   // where that's supported.
@@ -1163,7 +1301,7 @@ void AsmPrinter::EmitJumpTableInfo() {
     if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 &&
         MAI->doesSetDirectiveSuppressesReloc()) {
       SmallPtrSet<const MachineBasicBlock*, 16> EmittedSets;
-      const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
+      const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
       const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF,JTI,OutContext);
       for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii) {
         const MachineBasicBlock *MBB = JTBBs[ii];
@@ -1207,9 +1345,8 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
   case MachineJumpTableInfo::EK_Inline:
     llvm_unreachable("Cannot emit EK_Inline jump table entry");
   case MachineJumpTableInfo::EK_Custom32:
-    Value =
-        TM.getSubtargetImpl()->getTargetLowering()->LowerCustomJumpTableEntry(
-            MJTI, MBB, UID, OutContext);
+    Value = MF->getSubtarget().getTargetLowering()->LowerCustomJumpTableEntry(
+        MJTI, MBB, UID, OutContext);
     break;
   case MachineJumpTableInfo::EK_BlockAddress:
     // EK_BlockAddress - Each entry is a plain address of block, e.g.:
@@ -1248,7 +1385,7 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
       break;
     }
     Value = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext);
-    const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
+    const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
     const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, UID, OutContext);
     Value = MCBinaryExpr::CreateSub(Value, Base, OutContext);
     break;
@@ -1258,7 +1395,7 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
   assert(Value && "Unknown entry kind!");
 
   unsigned EntrySize =
-      MJTI->getEntrySize(*TM.getSubtargetImpl()->getDataLayout());
+      MJTI->getEntrySize(*TM.getDataLayout());
   OutStreamer.EmitValue(Value, EntrySize);
 }
 
@@ -1368,7 +1505,7 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
   }
 
   // Emit the function pointers in the target-specific order
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   unsigned Align = Log2_32(DL->getPointerPrefAlignment());
   std::stable_sort(Structors.begin(), Structors.end(),
                    [](const Structor &L,
@@ -1483,25 +1620,26 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
 //
 void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalObject *GV) const {
   if (GV)
-    NumBits = getGVAlignmentLog2(GV, *TM.getSubtargetImpl()->getDataLayout(),
+    NumBits = getGVAlignmentLog2(GV, *TM.getDataLayout(),
                                  NumBits);
 
   if (NumBits == 0) return;   // 1-byte aligned: no need to emit alignment.
 
+  assert(NumBits <
+             static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+         "undefined behavior");
   if (getCurrentSection()->getKind().isText())
-    OutStreamer.EmitCodeAlignment(1 << NumBits);
+    OutStreamer.EmitCodeAlignment(1u << NumBits);
   else
-    OutStreamer.EmitValueToAlignment(1 << NumBits);
+    OutStreamer.EmitValueToAlignment(1u << NumBits);
 }
 
 //===----------------------------------------------------------------------===//
 // Constant emission.
 //===----------------------------------------------------------------------===//
 
-/// lowerConstant - Lower the specified LLVM Constant to an MCExpr.
-///
-static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
-  MCContext &Ctx = AP.OutContext;
+const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
+  MCContext &Ctx = OutContext;
 
   if (CV->isNullValue() || isa<UndefValue>(CV))
     return MCConstantExpr::Create(0, Ctx);
@@ -1510,19 +1648,18 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
     return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
 
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
-    return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
+    return MCSymbolRefExpr::Create(getSymbol(GV), Ctx);
 
   if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
-    return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
+    return MCSymbolRefExpr::Create(GetBlockAddressSymbol(BA), Ctx);
 
   const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
   if (!CE) {
     llvm_unreachable("Unknown constant value to lower!");
   }
 
-  if (const MCExpr *RelocExpr =
-          AP.getObjFileLowering().getExecutableRelativeSymbol(CE, *AP.Mang,
-                                                              AP.TM))
+  if (const MCExpr *RelocExpr
+      = getObjFileLowering().getExecutableRelativeSymbol(CE, *Mang, TM))
     return RelocExpr;
 
   switch (CE->getOpcode()) {
@@ -1531,9 +1668,9 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
     // opportunities. Attempt to fold the expression using DataLayout as a
     // last resort before giving up.
     if (Constant *C = ConstantFoldConstantExpression(
-            CE, AP.TM.getSubtargetImpl()->getDataLayout()))
+            CE, TM.getDataLayout()))
       if (C != CE)
-        return lowerConstant(C, AP);
+        return lowerConstant(C);
 
     // Otherwise report the problem to the user.
     {
@@ -1541,16 +1678,17 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
       raw_string_ostream OS(S);
       OS << "Unsupported expression in static initializer: ";
       CE->printAsOperand(OS, /*PrintType=*/false,
-                     !AP.MF ? nullptr : AP.MF->getFunction()->getParent());
+                     !MF ? nullptr : MF->getFunction()->getParent());
       report_fatal_error(OS.str());
     }
   case Instruction::GetElementPtr: {
-    const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+    const DataLayout &DL = *TM.getDataLayout();
+
     // Generate a symbolic expression for the byte address
     APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0);
     cast<GEPOperator>(CE)->accumulateConstantOffset(DL, OffsetAI);
 
-    const MCExpr *Base = lowerConstant(CE->getOperand(0), AP);
+    const MCExpr *Base = lowerConstant(CE->getOperand(0));
     if (!OffsetAI)
       return Base;
 
@@ -1566,26 +1704,28 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
     // is reasonable to treat their delta as a 32-bit value.
     // FALL THROUGH.
   case Instruction::BitCast:
-    return lowerConstant(CE->getOperand(0), AP);
+    return lowerConstant(CE->getOperand(0));
 
   case Instruction::IntToPtr: {
-    const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+    const DataLayout &DL = *TM.getDataLayout();
+
     // Handle casts to pointers by changing them into casts to the appropriate
     // integer type.  This promotes constant folding and simplifies this code.
     Constant *Op = CE->getOperand(0);
     Op = ConstantExpr::getIntegerCast(Op, DL.getIntPtrType(CV->getType()),
                                       false/*ZExt*/);
-    return lowerConstant(Op, AP);
+    return lowerConstant(Op);
   }
 
   case Instruction::PtrToInt: {
-    const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+    const DataLayout &DL = *TM.getDataLayout();
+
     // Support only foldable casts to/from pointers that can be eliminated by
     // changing the pointer to the appropriately sized integer type.
     Constant *Op = CE->getOperand(0);
     Type *Ty = CE->getType();
 
-    const MCExpr *OpExpr = lowerConstant(Op, AP);
+    const MCExpr *OpExpr = lowerConstant(Op);
 
     // We can emit the pointer value into this slot if the slot is an
     // integer slot equal to the size of the pointer.
@@ -1611,8 +1751,8 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor: {
-    const MCExpr *LHS = lowerConstant(CE->getOperand(0), AP);
-    const MCExpr *RHS = lowerConstant(CE->getOperand(1), AP);
+    const MCExpr *LHS = lowerConstant(CE->getOperand(0));
+    const MCExpr *RHS = lowerConstant(CE->getOperand(1));
     switch (CE->getOpcode()) {
     default: llvm_unreachable("Unknown binary operator constant cast expr");
     case Instruction::Add: return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
@@ -1629,7 +1769,9 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
   }
 }
 
-static void emitGlobalConstantImpl(const Constant *C, AsmPrinter &AP);
+static void emitGlobalConstantImpl(const Constant *C, AsmPrinter &AP,
+                                   const Constant *BaseCV = nullptr,
+                                   uint64_t Offset = 0);
 
 /// isRepeatedByteSequence - Determine whether the given value is
 /// composed of a repeated sequence of identical bytes and return the
@@ -1653,7 +1795,7 @@ static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) {
     if (CI->getBitWidth() > 64) return -1;
 
     uint64_t Size =
-        TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(V->getType());
+        TM.getDataLayout()->getTypeAllocSize(V->getType());
     uint64_t Value = CI->getZExtValue();
 
     // Make sure the constant is at least 8 bits long and has a power
@@ -1698,7 +1840,7 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
   int Value = isRepeatedByteSequence(CDS, AP.TM);
   if (Value != -1) {
     uint64_t Bytes =
-        AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
+        AP.TM.getDataLayout()->getTypeAllocSize(
             CDS->getType());
     // Don't emit a 1-byte object as a .fill.
     if (Bytes > 1)
@@ -1749,7 +1891,7 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
     }
   }
 
-  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout &DL = *AP.TM.getDataLayout();
   unsigned Size = DL.getTypeAllocSize(CDS->getType());
   unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) *
                         CDS->getNumElements();
@@ -1758,20 +1900,22 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
 
 }
 
-static void emitGlobalConstantArray(const ConstantArray *CA, AsmPrinter &AP) {
+static void emitGlobalConstantArray(const ConstantArray *CA, AsmPrinter &AP,
+                                    const Constant *BaseCV, uint64_t Offset) {
   // See if we can aggregate some values.  Make sure it can be
   // represented as a series of bytes of the constant value.
   int Value = isRepeatedByteSequence(CA, AP.TM);
+  const DataLayout &DL = *AP.TM.getDataLayout();
 
   if (Value != -1) {
-    uint64_t Bytes =
-        AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
-            CA->getType());
+    uint64_t Bytes = DL.getTypeAllocSize(CA->getType());
     AP.OutStreamer.EmitFill(Bytes, Value);
   }
   else {
-    for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
-      emitGlobalConstantImpl(CA->getOperand(i), AP);
+    for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) {
+      emitGlobalConstantImpl(CA->getOperand(i), AP, BaseCV, Offset);
+      Offset += DL.getTypeAllocSize(CA->getOperand(i)->getType());
+    }
   }
 }
 
@@ -1779,7 +1923,7 @@ static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) {
   for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
     emitGlobalConstantImpl(CV->getOperand(i), AP);
 
-  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout &DL = *AP.TM.getDataLayout();
   unsigned Size = DL.getTypeAllocSize(CV->getType());
   unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) *
                          CV->getType()->getNumElements();
@@ -1787,24 +1931,25 @@ static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) {
     AP.OutStreamer.EmitZeros(Padding);
 }
 
-static void emitGlobalConstantStruct(const ConstantStruct *CS, AsmPrinter &AP) {
+static void emitGlobalConstantStruct(const ConstantStruct *CS, AsmPrinter &AP,
+                                     const Constant *BaseCV, uint64_t Offset) {
   // Print the fields in successive locations. Pad to align if needed!
-  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = AP.TM.getDataLayout();
   unsigned Size = DL->getTypeAllocSize(CS->getType());
   const StructLayout *Layout = DL->getStructLayout(CS->getType());
   uint64_t SizeSoFar = 0;
   for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) {
     const Constant *Field = CS->getOperand(i);
 
+    // Print the actual field value.
+    emitGlobalConstantImpl(Field, AP, BaseCV, Offset+SizeSoFar);
+
     // Check if padding is needed and insert one or more 0s.
     uint64_t FieldSize = DL->getTypeAllocSize(Field->getType());
     uint64_t PadSize = ((i == e-1 ? Size : Layout->getElementOffset(i+1))
                         - Layout->getElementOffset(i)) - FieldSize;
     SizeSoFar += FieldSize + PadSize;
 
-    // Now print the actual field value.
-    emitGlobalConstantImpl(Field, AP);
-
     // Insert padding - this may include padding to increase the size of the
     // current field up to the ABI size (if the struct is not packed) as well
     // as padding to ensure that the next field starts at the right offset.
@@ -1839,7 +1984,7 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
 
   // PPC's long double has odd notions of endianness compared to how LLVM
   // handles it: p[0] goes first for *big* endian on PPC.
-  if (AP.TM.getSubtargetImpl()->getDataLayout()->isBigEndian() &&
+  if (AP.TM.getDataLayout()->isBigEndian() &&
       !CFP->getType()->isPPC_FP128Ty()) {
     int Chunk = API.getNumWords() - 1;
 
@@ -1858,13 +2003,13 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
   }
 
   // Emit the tail padding for the long double.
-  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout &DL = *AP.TM.getDataLayout();
   AP.OutStreamer.EmitZeros(DL.getTypeAllocSize(CFP->getType()) -
                            DL.getTypeStoreSize(CFP->getType()));
 }
 
 static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
-  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = AP.TM.getDataLayout();
   unsigned BitWidth = CI->getBitWidth();
 
   // Copy the value as we may massage the layout for constants whose bit width
@@ -1910,7 +2055,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
     // Emit the extra bits after the 64-bits chunks.
 
     // Emit a directive that fills the expected size.
-    uint64_t Size = AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
+    uint64_t Size = AP.TM.getDataLayout()->getTypeAllocSize(
         CI->getType());
     Size -= (BitWidth / 64) * 8;
     assert(Size && Size * 8 >= ExtraBitsSize &&
@@ -1920,9 +2065,100 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
   }
 }
 
-static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
-  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
+/// \brief Transform a not absolute MCExpr containing a reference to a GOT
+/// equivalent global, by a target specific GOT pc relative access to the
+/// final symbol.
+static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME,
+                                         const Constant *BaseCst,
+                                         uint64_t Offset) {
+  // The global @foo below illustrates a global that uses a got equivalent.
+  //
+  //  @bar = global i32 42
+  //  @gotequiv = private unnamed_addr constant i32* @bar
+  //  @foo = i32 trunc (i64 sub (i64 ptrtoint (i32** @gotequiv to i64),
+  //                             i64 ptrtoint (i32* @foo to i64))
+  //                        to i32)
+  //
+  // The cstexpr in @foo is converted into the MCExpr `ME`, where we actually
+  // check whether @foo is suitable to use a GOTPCREL. `ME` is usually in the
+  // form:
+  //
+  //  foo = cstexpr, where
+  //    cstexpr := <gotequiv> - "." + <cst>
+  //    cstexpr := <gotequiv> - (<foo> - <offset from @foo base>) + <cst>
+  //
+  // After canonicalization by EvaluateAsRelocatable `ME` turns into:
+  //
+  //  cstexpr := <gotequiv> - <foo> + gotpcrelcst, where
+  //    gotpcrelcst := <offset from @foo base> + <cst>
+  //
+  MCValue MV;
+  if (!(*ME)->EvaluateAsRelocatable(MV, nullptr, nullptr) || MV.isAbsolute())
+    return;
+
+  const MCSymbol *GOTEquivSym = &MV.getSymA()->getSymbol();
+  if (!AP.GlobalGOTEquivs.count(GOTEquivSym))
+    return;
+
+  const GlobalValue *BaseGV = dyn_cast<GlobalValue>(BaseCst);
+  if (!BaseGV)
+    return;
+
+  const MCSymbol *BaseSym = AP.getSymbol(BaseGV);
+  if (BaseSym != &MV.getSymB()->getSymbol())
+    return;
+
+  // Make sure to match:
+  //
+  //    gotpcrelcst := <offset from @foo base> + <cst>
+  //
+  int64_t GOTPCRelCst = Offset + MV.getConstant();
+  if (GOTPCRelCst < 0)
+    return;
+
+  // Emit the GOT PC relative to replace the got equivalent global, i.e.:
+  //
+  //  bar:
+  //    .long 42
+  //  gotequiv:
+  //    .quad bar
+  //  foo:
+  //    .long gotequiv - "." + <cst>
+  //
+  // is replaced by the target specific equivalent to:
+  //
+  //  bar:
+  //    .long 42
+  //  foo:
+  //    .long bar@GOTPCREL+<gotpcrelcst>
+  //
+  AsmPrinter::GOTEquivUsePair Result = AP.GlobalGOTEquivs[GOTEquivSym];
+  const GlobalVariable *GV = Result.first;
+  unsigned NumUses = Result.second;
+  const GlobalValue *FinalGV = dyn_cast<GlobalValue>(GV->getOperand(0));
+  const MCSymbol *FinalSym = AP.getSymbol(FinalGV);
+  *ME = AP.getObjFileLowering().getIndirectSymViaGOTPCRel(FinalSym,
+                                                          GOTPCRelCst);
+
+  // Update GOT equivalent usage information
+  --NumUses;
+  if (NumUses)
+    AP.GlobalGOTEquivs[GOTEquivSym] = std::make_pair(GV, NumUses);
+  else
+    AP.GlobalGOTEquivs.erase(GOTEquivSym);
+}
+
+static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP,
+                                   const Constant *BaseCV, uint64_t Offset) {
+  const DataLayout *DL = AP.TM.getDataLayout();
   uint64_t Size = DL->getTypeAllocSize(CV->getType());
+
+  // Globals with sub-elements such as combinations of arrays and structs
+  // are handled recursively by emitGlobalConstantImpl. Keep track of the
+  // constant symbol base and the current position with BaseCV and Offset.
+  if (!BaseCV && CV->hasOneUse())
+    BaseCV = dyn_cast<Constant>(CV->user_back());
+
   if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
     return AP.OutStreamer.EmitZeros(Size);
 
@@ -1955,10 +2191,10 @@ static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
     return emitGlobalConstantDataSequential(CDS, AP);
 
   if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV))
-    return emitGlobalConstantArray(CVA, AP);
+    return emitGlobalConstantArray(CVA, AP, BaseCV, Offset);
 
   if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV))
-    return emitGlobalConstantStruct(CVS, AP);
+    return emitGlobalConstantStruct(CVS, AP, BaseCV, Offset);
 
   if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
     // Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of
@@ -1981,13 +2217,21 @@ static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
 
   // Otherwise, it must be a ConstantExpr.  Lower it to an MCExpr, then emit it
   // thread the streamer with EmitValue.
-  AP.OutStreamer.EmitValue(lowerConstant(CV, AP), Size);
+  const MCExpr *ME = AP.lowerConstant(CV);
+
+  // Since lowerConstant already folded and got rid of all IR pointer and
+  // integer casts, detect GOT equivalent accesses by looking into the MCExpr
+  // directly.
+  if (AP.getObjFileLowering().supportIndirectSymViaGOTPCRel())
+    handleIndirectSymViaGOTPCRel(AP, &ME, BaseCV, Offset);
+
+  AP.OutStreamer.EmitValue(ME, Size);
 }
 
 /// EmitGlobalConstant - Print a general LLVM constant to the .s file.
 void AsmPrinter::EmitGlobalConstant(const Constant *CV) {
   uint64_t Size =
-      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(CV->getType());
+      TM.getDataLayout()->getTypeAllocSize(CV->getType());
   if (Size)
     emitGlobalConstantImpl(CV, *this);
   else if (MAI->hasSubsectionsViaSymbols()) {
@@ -2015,16 +2259,16 @@ void AsmPrinter::printOffset(int64_t Offset, raw_ostream &OS) const {
 
 /// GetTempSymbol - Return the MCSymbol corresponding to the assembler
 /// temporary label with the specified stem and unique ID.
-MCSymbol *AsmPrinter::GetTempSymbol(Twine Name, unsigned ID) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+MCSymbol *AsmPrinter::GetTempSymbol(const Twine &Name, unsigned ID) const {
+  const DataLayout *DL = TM.getDataLayout();
   return OutContext.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix()) +
                                       Name + Twine(ID));
 }
 
 /// GetTempSymbol - Return an assembler temporary label with the specified
 /// stem.
-MCSymbol *AsmPrinter::GetTempSymbol(Twine Name) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+MCSymbol *AsmPrinter::GetTempSymbol(const Twine &Name) const {
+  const DataLayout *DL = TM.getDataLayout();
   return OutContext.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+
                                       Name);
 }
@@ -2040,7 +2284,7 @@ MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const {
 
 /// GetCPISymbol - Return the symbol for the specified constant pool entry.
 MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   return OutContext.GetOrCreateSymbol
     (Twine(DL->getPrivateGlobalPrefix()) + "CPI" + Twine(getFunctionNumber())
      + "_" + Twine(CPID));
@@ -2054,7 +2298,7 @@ MCSymbol *AsmPrinter::GetJTISymbol(unsigned JTID, bool isLinkerPrivate) const {
 /// GetJTSetSymbol - Return the symbol for the specified jump table .set
 /// FIXME: privatize to AsmPrinter.
 MCSymbol *AsmPrinter::GetJTSetSymbol(unsigned UID, unsigned MBBID) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   return OutContext.GetOrCreateSymbol
   (Twine(DL->getPrivateGlobalPrefix()) + Twine(getFunctionNumber()) + "_" +
    Twine(UID) + "_set_" + Twine(MBBID));
@@ -2252,6 +2496,11 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
   if (!S.usesMetadata())
     return nullptr;
 
+  assert(!S.useStatepoints() && "statepoints do not currently support custom"
+         " stackmap formats, please see the documentation for a description of"
+         " the default format.  If you really need a custom serialized format,"
+         " please file a bug");
+
   gcp_map_type &GCMap = getGCMap(GCMetadataPrinters);
   gcp_map_type::iterator GCPI = GCMap.find(&S);
   if (GCPI != GCMap.end())
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 05f6a68..d0958c1 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -12,26 +12,44 @@
 //===----------------------------------------------------------------------===//
 
 #include "ByteStreamer.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/ADT/SmallBitVector.h"
+#include "DwarfExpression.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
 
+void DebugLocDwarfExpression::EmitOp(uint8_t Op, const char *Comment) {
+  BS.EmitInt8(
+      Op, Comment ? Twine(Comment) + " " + dwarf::OperationEncodingString(Op)
+                  : dwarf::OperationEncodingString(Op));
+}
+
+void DebugLocDwarfExpression::EmitSigned(int Value) {
+  BS.EmitSLEB128(Value, Twine(Value));
+}
+
+void DebugLocDwarfExpression::EmitUnsigned(unsigned Value) {
+  BS.EmitULEB128(Value, Twine(Value));
+}
+
+bool DebugLocDwarfExpression::isFrameRegister(unsigned MachineReg) {
+  // This information is not available while emitting .debug_loc entries.
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Dwarf Emission Helper Routines
 //===----------------------------------------------------------------------===//
@@ -131,7 +149,7 @@ unsigned AsmPrinter::GetSizeOfEncodedValue(unsigned Encoding) const {
   default:
     llvm_unreachable("Invalid encoded value.");
   case dwarf::DW_EH_PE_absptr:
-    return TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
+    return TM.getDataLayout()->getPointerSize();
   case dwarf::DW_EH_PE_udata2:
     return 2;
   case dwarf::DW_EH_PE_udata4:
@@ -187,57 +205,6 @@ void AsmPrinter::EmitSectionOffset(const MCSymbol *Label,
   EmitLabelDifference(Label, SectionLabel, 4);
 }
 
-/// Emit a dwarf register operation.
-static void emitDwarfRegOp(ByteStreamer &Streamer, int Reg) {
-  assert(Reg >= 0);
-  if (Reg < 32) {
-    Streamer.EmitInt8(dwarf::DW_OP_reg0 + Reg,
-                      dwarf::OperationEncodingString(dwarf::DW_OP_reg0 + Reg));
-  } else {
-    Streamer.EmitInt8(dwarf::DW_OP_regx, "DW_OP_regx");
-    Streamer.EmitULEB128(Reg, Twine(Reg));
-  }
-}
-
-/// Emit an (double-)indirect dwarf register operation.
-static void emitDwarfRegOpIndirect(ByteStreamer &Streamer, int Reg, int Offset,
-                                   bool Deref) {
-  assert(Reg >= 0);
-  if (Reg < 32) {
-    Streamer.EmitInt8(dwarf::DW_OP_breg0 + Reg,
-                      dwarf::OperationEncodingString(dwarf::DW_OP_breg0 + Reg));
-  } else {
-    Streamer.EmitInt8(dwarf::DW_OP_bregx, "DW_OP_bregx");
-    Streamer.EmitULEB128(Reg, Twine(Reg));
-  }
-  Streamer.EmitSLEB128(Offset);
-  if (Deref)
-    Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref");
-}
-
-void AsmPrinter::EmitDwarfOpPiece(ByteStreamer &Streamer, unsigned SizeInBits,
-                                  unsigned OffsetInBits) const {
-  assert(SizeInBits > 0 && "piece has size zero");
-  const unsigned SizeOfByte = 8;
-  if (OffsetInBits > 0 || SizeInBits % SizeOfByte) {
-    Streamer.EmitInt8(dwarf::DW_OP_bit_piece, "DW_OP_bit_piece");
-    Streamer.EmitULEB128(SizeInBits, Twine(SizeInBits));
-    Streamer.EmitULEB128(OffsetInBits, Twine(OffsetInBits));
-  } else {
-    Streamer.EmitInt8(dwarf::DW_OP_piece, "DW_OP_piece");
-    unsigned ByteSize = SizeInBits / SizeOfByte;
-    Streamer.EmitULEB128(ByteSize, Twine(ByteSize));
-  }
-}
-
-/// Emit a shift-right dwarf expression.
-static void emitDwarfOpShr(ByteStreamer &Streamer,
-                           unsigned ShiftBy) {
-  Streamer.EmitInt8(dwarf::DW_OP_constu, "DW_OP_constu");
-  Streamer.EmitULEB128(ShiftBy);
-  Streamer.EmitInt8(dwarf::DW_OP_shr, "DW_OP_shr");
-}
-
 // Some targets do not provide a DWARF register number for every
 // register.  This function attempts to emit a DWARF register by
 // emitting a piece of a super-register or by piecing together
@@ -247,112 +214,44 @@ void AsmPrinter::EmitDwarfRegOpPiece(ByteStreamer &Streamer,
                                      unsigned PieceSizeInBits,
                                      unsigned PieceOffsetInBits) const {
   assert(MLoc.isReg() && "MLoc must be a register");
-  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
-  int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
-
-  // If this is a valid register number, emit it.
-  if (Reg >= 0) {
-    emitDwarfRegOp(Streamer, Reg);
-    EmitDwarfOpPiece(Streamer, PieceSizeInBits, PieceOffsetInBits);
-    return;
-  }
-
-  // Walk up the super-register chain until we find a valid number.
-  // For example, EAX on x86_64 is a 32-bit piece of RAX with offset 0.
-  for (MCSuperRegIterator SR(MLoc.getReg(), TRI); SR.isValid(); ++SR) {
-    Reg = TRI->getDwarfRegNum(*SR, false);
-    if (Reg >= 0) {
-      unsigned Idx = TRI->getSubRegIndex(*SR, MLoc.getReg());
-      unsigned Size = TRI->getSubRegIdxSize(Idx);
-      unsigned RegOffset = TRI->getSubRegIdxOffset(Idx);
-      OutStreamer.AddComment("super-register");
-      emitDwarfRegOp(Streamer, Reg);
-      if (PieceOffsetInBits == RegOffset) {
-        EmitDwarfOpPiece(Streamer, Size, RegOffset);
-      } else {
-        // If this is part of a variable in a sub-register at a
-        // non-zero offset, we need to manually shift the value into
-        // place, since the DW_OP_piece describes the part of the
-        // variable, not the position of the subregister.
-        if (RegOffset)
-          emitDwarfOpShr(Streamer, RegOffset);
-        EmitDwarfOpPiece(Streamer, Size, PieceOffsetInBits);
-      }
-      return;
-    }
-  }
-
-  // Otherwise, attempt to find a covering set of sub-register numbers.
-  // For example, Q0 on ARM is a composition of D0+D1.
-  //
-  // Keep track of the current position so we can emit the more
-  // efficient DW_OP_piece.
-  unsigned CurPos = PieceOffsetInBits;
-  // The size of the register in bits, assuming 8 bits per byte.
-  unsigned RegSize = TRI->getMinimalPhysRegClass(MLoc.getReg())->getSize() * 8;
-  // Keep track of the bits in the register we already emitted, so we
-  // can avoid emitting redundant aliasing subregs.
-  SmallBitVector Coverage(RegSize, false);
-  for (MCSubRegIterator SR(MLoc.getReg(), TRI); SR.isValid(); ++SR) {
-    unsigned Idx = TRI->getSubRegIndex(MLoc.getReg(), *SR);
-    unsigned Size = TRI->getSubRegIdxSize(Idx);
-    unsigned Offset = TRI->getSubRegIdxOffset(Idx);
-    Reg = TRI->getDwarfRegNum(*SR, false);
-
-    // Intersection between the bits we already emitted and the bits
-    // covered by this subregister.
-    SmallBitVector Intersection(RegSize, false);
-    Intersection.set(Offset, Offset + Size);
-    Intersection ^= Coverage;
-
-    // If this sub-register has a DWARF number and we haven't covered
-    // its range, emit a DWARF piece for it.
-    if (Reg >= 0 && Intersection.any()) {
-      OutStreamer.AddComment("sub-register");
-      emitDwarfRegOp(Streamer, Reg);
-      EmitDwarfOpPiece(Streamer, Size, Offset == CurPos ? 0 : Offset);
-      CurPos = Offset + Size;
-
-      // Mark it as emitted.
-      Coverage.set(Offset, Offset + Size);
-    }
-  }
+  DebugLocDwarfExpression Expr(*this, Streamer);
+  Expr.AddMachineRegPiece(MLoc.getReg(), PieceSizeInBits, PieceOffsetInBits);
+}
 
-  if (CurPos == PieceOffsetInBits) {
-    // FIXME: We have no reasonable way of handling errors in here.
-    Streamer.EmitInt8(dwarf::DW_OP_nop,
-                      "nop (could not find a dwarf register number)");
-  }
+void AsmPrinter::EmitDwarfOpPiece(ByteStreamer &Streamer,
+                                  unsigned PieceSizeInBits,
+                                  unsigned PieceOffsetInBits) const {
+  DebugLocDwarfExpression Expr(*this, Streamer);
+  Expr.AddOpPiece(PieceSizeInBits, PieceOffsetInBits);
 }
 
 /// EmitDwarfRegOp - Emit dwarf register operation.
 void AsmPrinter::EmitDwarfRegOp(ByteStreamer &Streamer,
-                                const MachineLocation &MLoc,
-                                bool Indirect) const {
-  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
-  int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
+                                const MachineLocation &MLoc) const {
+  DebugLocDwarfExpression Expr(*this, Streamer);
+  const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
+  int Reg = MRI->getDwarfRegNum(MLoc.getReg(), false);
   if (Reg < 0) {
     // We assume that pointers are always in an addressable register.
-    if (Indirect || MLoc.isIndirect()) {
+    if (MLoc.isIndirect())
       // FIXME: We have no reasonable way of handling errors in here. The
       // caller might be in the middle of a dwarf expression. We should
       // probably assert that Reg >= 0 once debug info generation is more
       // mature.
-      Streamer.EmitInt8(dwarf::DW_OP_nop,
-                        "nop (invalid dwarf register number for indirect loc)");
-      return;
-    }
+      return Expr.EmitOp(dwarf::DW_OP_nop,
+                         "nop (could not find a dwarf register number)");
 
     // Attempt to find a valid super- or sub-register.
-    return EmitDwarfRegOpPiece(Streamer, MLoc);
+    if (!Expr.AddMachineRegPiece(MLoc.getReg()))
+      Expr.EmitOp(dwarf::DW_OP_nop,
+                  "nop (could not find a dwarf register number)");
+    return;
   }
 
   if (MLoc.isIndirect())
-    emitDwarfRegOpIndirect(Streamer, Reg, MLoc.getOffset(), Indirect);
-  else if (Indirect)
-    emitDwarfRegOpIndirect(Streamer, Reg, 0, false);
+    Expr.AddRegIndirect(Reg, MLoc.getOffset());
   else
-    emitDwarfRegOp(Streamer, Reg);
+    Expr.AddReg(Reg);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index cca5f22..e6e7c97 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
@@ -64,7 +65,7 @@ static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
 
     if (LocInfo->getNumOperands() != 0)
       if (const ConstantInt *CI =
-          dyn_cast<ConstantInt>(LocInfo->getOperand(ErrorLine)))
+              mdconst::dyn_extract<ConstantInt>(LocInfo->getOperand(ErrorLine)))
         LocCookie = CI->getZExtValue();
   }
 
@@ -90,8 +91,19 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode,
   assert(MCAI && "No MCAsmInfo");
   if (!MCAI->useIntegratedAssembler() &&
       !OutStreamer.isIntegratedAssemblerRequired()) {
+    emitInlineAsmStart();
     OutStreamer.EmitRawText(Str);
-    emitInlineAsmEnd(TM.getSubtarget<MCSubtargetInfo>(), nullptr);
+    // If we have a machine function then grab the MCSubtarget off of that,
+    // otherwise we're at the module level and want to construct one from
+    // the default CPU and target triple.
+    if (MF) {
+      emitInlineAsmEnd(MF->getSubtarget<MCSubtargetInfo>(), nullptr);
+    } else {
+      std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+          TM.getTargetTriple(), TM.getTargetCPU(),
+          TM.getTargetFeatureString()));
+      emitInlineAsmEnd(*STI, nullptr);
+    }
     return;
   }
 
@@ -137,11 +149,13 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode,
   // emitInlineAsmEnd().
   MCSubtargetInfo STIOrig = *STI;
 
-  MCTargetOptions MCOptions;
-  if (MF)
-    MCOptions = MF->getTarget().Options.MCOptions;
-  std::unique_ptr<MCTargetAsmParser> TAP(
-      TM.getTarget().createMCAsmParser(*STI, *Parser, *MII, MCOptions));
+  // We create a new MCInstrInfo here since we might be at the module level
+  // and not have a MachineFunction to initialize the TargetInstrInfo from and
+  // we only need MCInstrInfo for asm parsing. We create one unconditionally
+  // because it's not subtarget dependent.
+  std::unique_ptr<MCInstrInfo> MII(TM.getTarget().createMCInstrInfo());
+  std::unique_ptr<MCTargetAsmParser> TAP(TM.getTarget().createMCAsmParser(
+      *STI, *Parser, *MII, TM.Options.MCOptions));
   if (!TAP)
     report_fatal_error("Inline asm not supported by this streamer because"
                        " we don't have an asm parser for this target\n");
@@ -152,6 +166,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode,
     TAP->SetFrameRegister(TRI->getFrameRegister(*MF));
   }
 
+  emitInlineAsmStart();
   // Don't implicitly switch to the text section before the asm.
   int Res = Parser->Run(/*NoInitialTextSection*/ true,
                         /*NoFinalize*/ true);
@@ -467,7 +482,8 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
     if (MI->getOperand(i-1).isMetadata() &&
         (LocMD = MI->getOperand(i-1).getMetadata()) &&
         LocMD->getNumOperands() != 0) {
-      if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) {
+      if (const ConstantInt *CI =
+              mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) {
         LocCookie = CI->getZExtValue();
         break;
       }
@@ -505,7 +521,7 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
 /// for their own strange codes.
 void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
                               const char *Code) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   if (!strcmp(Code, "private")) {
     OS << DL->getPrivateGlobalPrefix();
   } else if (!strcmp(Code, "comment")) {
@@ -566,5 +582,7 @@ bool AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
   return true;
 }
 
+void AsmPrinter::emitInlineAsmStart() const {}
+
 void AsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
                                   const MCSubtargetInfo *EndInfo) const {}
diff --git a/lib/CodeGen/AsmPrinter/ByteStreamer.h b/lib/CodeGen/AsmPrinter/ByteStreamer.h
index 0cc8353..42be114 100644
--- a/lib/CodeGen/AsmPrinter/ByteStreamer.h
+++ b/lib/CodeGen/AsmPrinter/ByteStreamer.h
@@ -15,10 +15,10 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H
 
+#include "DIEHash.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/MC/MCStreamer.h"
-#include "DIEHash.h"
 
 namespace llvm {
 class ByteStreamer {
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index e6b7d64..01d2c72 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_library(LLVMAsmPrinter
   DwarfCFIException.cpp
   DwarfCompileUnit.cpp
   DwarfDebug.cpp
+  DwarfExpression.cpp
   DwarfFile.cpp
   DwarfStringPool.cpp
   DwarfUnit.cpp
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 50ea369..64ba56b 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -11,8 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "DIE.h"
-
+#include "llvm/CodeGen/DIE.h"
 #include "DwarfCompileUnit.h"
 #include "DwarfDebug.h"
 #include "DwarfUnit.h"
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp
index b2a3ba8..1e2ba2c 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -13,11 +13,11 @@
 
 #include "ByteStreamer.h"
 #include "DIEHash.h"
-#include "DIE.h"
 #include "DwarfDebug.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Endian.h"
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.h b/lib/CodeGen/AsmPrinter/DIEHash.h
index 872aa0e..ac014b7 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.h
+++ b/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -14,8 +14,8 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H
 
-#include "DIE.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/Support/MD5.h"
 
 namespace llvm {
diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index 6cca985..6d55c03 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -11,8 +11,8 @@
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
-#include "llvm/MC/MachineLocation.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
 
 namespace llvm {
 class MDNode;
@@ -74,7 +74,7 @@ public:
     MachineLocation getLoc() const { return Loc; }
     const MDNode *getVariableNode() const { return Variable; }
     DIVariable getVariable() const { return DIVariable(Variable); }
-    bool isVariablePiece() const { return getExpression().isVariablePiece(); }
+    bool isBitPiece() const { return getExpression().isBitPiece(); }
     DIExpression getExpression() const { return DIExpression(Expression); }
     friend bool operator==(const Value &, const Value &);
     friend bool operator<(const Value &, const Value &);
@@ -101,8 +101,8 @@ public:
       DIVariable Var(Values[0].Variable);
       DIExpression NextExpr(Next.Values[0].Expression);
       DIVariable NextVar(Next.Values[0].Variable);
-      if (Var == NextVar && Expr.isVariablePiece() &&
-          NextExpr.isVariablePiece()) {
+      if (Var == NextVar && Expr.isBitPiece() &&
+          NextExpr.isBitPiece()) {
         addValues(Next.Values);
         End = Next.End;
         return true;
@@ -131,7 +131,7 @@ public:
     Values.append(Vals.begin(), Vals.end());
     sortUniqueValues();
     assert(std::all_of(Values.begin(), Values.end(), [](DebugLocEntry::Value V){
-          return V.isVariablePiece();
+          return V.isBitPiece();
         }) && "value must be a piece");
   }
 
@@ -176,8 +176,8 @@ inline bool operator==(const DebugLocEntry::Value &A,
 /// Compare two pieces based on their offset.
 inline bool operator<(const DebugLocEntry::Value &A,
                       const DebugLocEntry::Value &B) {
-  return A.getExpression().getPieceOffset() <
-         B.getExpression().getPieceOffset();
+  return A.getExpression().getBitPieceOffset() <
+         B.getExpression().getBitPieceOffset();
 }
 
 }
diff --git a/lib/CodeGen/AsmPrinter/DebugLocList.h b/lib/CodeGen/AsmPrinter/DebugLocList.h
index 2a4f58f..0f1d2ed 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocList.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocList.h
@@ -10,8 +10,8 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCLIST_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCLIST_H
 
-#include "llvm/ADT/SmallVector.h"
 #include "DebugLocEntry.h"
+#include "llvm/ADT/SmallVector.h"
 
 namespace llvm {
 class DwarfCompileUnit;
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
index 7e87566..a71f35e 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
@@ -12,12 +12,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "DwarfAccelTable.h"
-#include "DIE.h"
 #include "DwarfCompileUnit.h"
 #include "DwarfDebug.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
index 3cdf678..74963da 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
@@ -14,9 +14,9 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H
 
-#include "DIE.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
@@ -215,8 +215,8 @@ private:
 #endif
   };
 
-  DwarfAccelTable(const DwarfAccelTable &) LLVM_DELETED_FUNCTION;
-  void operator=(const DwarfAccelTable &) LLVM_DELETED_FUNCTION;
+  DwarfAccelTable(const DwarfAccelTable &) = delete;
+  void operator=(const DwarfAccelTable &) = delete;
 
   // Internal Functions
   void EmitHeader(AsmPrinter *);
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index 0dc52da..f45b24c 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -51,7 +51,8 @@ void DwarfCFIException::endModule() {
   if (moveTypeModule == AsmPrinter::CFI_M_Debug)
     Asm->OutStreamer.EmitCFISections(false, true);
 
-  if (!Asm->MAI->usesItaniumLSDAForExceptions())
+  // SjLj uses this pass and it doesn't need this info.
+  if (!Asm->MAI->usesCFIForEH())
     return;
 
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
@@ -90,7 +91,7 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) {
 
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
   unsigned PerEncoding = TLOF.getPersonalityEncoding();
-  const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
+  const Function *Per = MMI->getPersonality();
 
   shouldEmitPersonality = hasLandingPads &&
     PerEncoding != dwarf::DW_EH_PE_omit && Per;
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 2f1b0e5..dcc5fe4 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1,5 +1,5 @@
 #include "DwarfCompileUnit.h"
-
+#include "DwarfExpression.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GlobalValue.h"
@@ -10,8 +10,8 @@
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 namespace llvm {
 
@@ -103,7 +103,7 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(DIGlobalVariable GV) {
 
   assert(GV.isGlobalVariable());
 
-  DIScope GVContext = DD->resolve(GV.getContext());
+  DIScope GVContext = GV.getContext();
   DIType GTy = DD->resolve(GV.getType());
 
   // Construct the context before querying for the existence of the DIE in
@@ -122,7 +122,7 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(DIGlobalVariable GV) {
     DIE *VariableSpecDIE = getOrCreateStaticMemberDIE(SDMDecl);
     addDIEEntry(*VariableDIE, dwarf::DW_AT_specification, *VariableSpecDIE);
   } else {
-    DeclContext = resolve(GV.getContext());
+    DeclContext = GV.getContext();
     // Add name and type.
     addString(*VariableDIE, dwarf::DW_AT_name, GV.getDisplayName());
     addType(*VariableDIE, GTy);
@@ -292,10 +292,10 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(DISubprogram SP) {
 
   // Only include DW_AT_frame_base in full debug info
   if (!includeMinimalInlineScopes()) {
-    const TargetRegisterInfo *RI =
-        Asm->TM.getSubtargetImpl()->getRegisterInfo();
+    const TargetRegisterInfo *RI = Asm->MF->getSubtarget().getRegisterInfo();
     MachineLocation Location(RI->getFrameRegister(*Asm->MF));
-    addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
+    if (RI->isPhysicalRegister(Location.getReg()))
+      addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
   }
 
   // Add name to the name table, we do this here because we're guaranteed
@@ -515,15 +515,23 @@ DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
   }
 
   // .. else use frame index.
-  int FI = DV.getFrameIndex();
-  if (FI != ~0) {
+  if (DV.getFrameIndex().back() == ~0)
+    return VariableDie;
+
+  auto Expr = DV.getExpression().begin();
+  DIELoc *Loc = new (DIEValueAllocator) DIELoc();
+  DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+  for (auto FI : DV.getFrameIndex()) {
     unsigned FrameReg = 0;
-    const TargetFrameLowering *TFI =
-        Asm->TM.getSubtargetImpl()->getFrameLowering();
+    const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
     int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
-    MachineLocation Location(FrameReg, Offset);
-    addVariableAddress(DV, *VariableDie, Location);
+    assert(Expr != DV.getExpression().end() &&
+           "Wrong number of expressions");
+    DwarfExpr.AddMachineRegIndirect(FrameReg, Offset);
+    DwarfExpr.AddExpression(Expr->begin(), Expr->end());
+    ++Expr;
   }
+  addBlock(*VariableDie, dwarf::DW_AT_location, Loc);
 
   return VariableDie;
 }
@@ -694,7 +702,7 @@ void DwarfCompileUnit::collectDeadVariables(DISubprogram SP) {
   for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) {
     DIVariable DV(Variables.getElement(vi));
     assert(DV.isVariable());
-    DbgVariable NewVar(DV, DIExpression(nullptr), DD);
+    DbgVariable NewVar(DV, DIExpression(), DD);
     auto VariableDie = constructVariableDIE(NewVar);
     applyVariableAttributes(NewVar, *VariableDie);
     SPDIE->addChild(std::move(VariableDie));
@@ -736,24 +744,22 @@ void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
   else if (DV.isBlockByrefVariable())
     addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location);
   else
-    addAddress(Die, dwarf::DW_AT_location, Location,
-               DV.getVariable().isIndirect());
+    addAddress(Die, dwarf::DW_AT_location, Location);
 }
 
 /// Add an address attribute to a die based on the location provided.
 void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,
-                                  const MachineLocation &Location,
-                                  bool Indirect) {
+                                  const MachineLocation &Location) {
   DIELoc *Loc = new (DIEValueAllocator) DIELoc();
 
-  if (Location.isReg() && !Indirect)
-    addRegisterOpPiece(*Loc, Location.getReg());
-  else {
-    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
-    if (Indirect && !Location.isReg()) {
-      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    }
-  }
+  bool validReg;
+  if (Location.isReg())
+    validReg = addRegisterOpPiece(*Loc, Location.getReg());
+  else
+    validReg = addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
+
+  if (!validReg)
+    return;
 
   // Now attach the location information to the DIE.
   addBlock(Die, Attribute, Loc);
@@ -767,53 +773,21 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
                                          dwarf::Attribute Attribute,
                                          const MachineLocation &Location) {
   DIELoc *Loc = new (DIEValueAllocator) DIELoc();
-  unsigned N = DV.getNumAddrElements();
-  unsigned i = 0;
-  if (Location.isReg()) {
-    if (N >= 2 && DV.getAddrElement(0) == dwarf::DW_OP_plus) {
-      assert(!DV.getVariable().isIndirect() &&
-             "double indirection not handled");
-      // If first address element is OpPlus then emit
-      // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
-      addRegisterOffset(*Loc, Location.getReg(), DV.getAddrElement(1));
-      i = 2;
-    } else if (N >= 2 && DV.getAddrElement(0) == dwarf::DW_OP_deref) {
-      assert(!DV.getVariable().isIndirect() &&
-             "double indirection not handled");
-      addRegisterOpPiece(*Loc, Location.getReg(),
-                         DV.getExpression().getPieceSize(),
-                         DV.getExpression().getPieceOffset());
-      i = 3;
-    } else
-      addRegisterOpPiece(*Loc, Location.getReg());
+  DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+  assert(DV.getExpression().size() == 1);
+  DIExpression Expr = DV.getExpression().back();
+  bool ValidReg;
+  if (Location.getOffset()) {
+    ValidReg = DwarfExpr.AddMachineRegIndirect(Location.getReg(),
+                                               Location.getOffset());
+    if (ValidReg)
+      DwarfExpr.AddExpression(Expr.begin(), Expr.end());
   } else
-    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
-
-  for (; i < N; ++i) {
-    uint64_t Element = DV.getAddrElement(i);
-    if (Element == dwarf::DW_OP_plus) {
-      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-      addUInt(*Loc, dwarf::DW_FORM_udata, DV.getAddrElement(++i));
-
-    } else if (Element == dwarf::DW_OP_deref) {
-      if (!Location.isReg())
-        addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-
-    } else if (Element == dwarf::DW_OP_piece) {
-      const unsigned SizeOfByte = 8;
-      unsigned PieceOffsetInBits = DV.getAddrElement(++i) * SizeOfByte;
-      unsigned PieceSizeInBits = DV.getAddrElement(++i) * SizeOfByte;
-      // Emit DW_OP_bit_piece Size Offset.
-      assert(PieceSizeInBits > 0 && "piece has zero size");
-      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_bit_piece);
-      addUInt(*Loc, dwarf::DW_FORM_udata, PieceSizeInBits);
-      addUInt(*Loc, dwarf::DW_FORM_udata, PieceOffsetInBits);
-    } else
-      llvm_unreachable("unknown DIBuilder Opcode");
-  }
+    ValidReg = DwarfExpr.AddMachineRegExpression(Expr, Location.getReg());
 
   // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, Loc);
+  if (ValidReg)
+    addBlock(Die, Attribute, Loc);
 }
 
 /// Add a Dwarf loclistptr attribute data and value.
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index e521f39..c66af65 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -15,9 +15,9 @@
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
 
 #include "DwarfUnit.h"
-#include "llvm/Support/Dwarf.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/Dwarf.h"
 
 namespace llvm {
 
@@ -213,7 +213,7 @@ public:
                           MachineLocation Location);
   /// Add an address attribute to a die based on the location provided.
   void addAddress(DIE &Die, dwarf::Attribute Attribute,
-                  const MachineLocation &Location, bool Indirect = false);
+                  const MachineLocation &Location);
 
   /// Start with the address based on the location provided, and generate the
   /// DWARF information necessary to find the actual variable (navigating the
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 230ea46..aa1f79f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -12,16 +12,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "DwarfDebug.h"
-
 #include "ByteStreamer.h"
-#include "DwarfCompileUnit.h"
-#include "DIE.h"
 #include "DIEHash.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfExpression.h"
 #include "DwarfUnit.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/IR/Constants.h"
@@ -490,9 +490,6 @@ void DwarfDebug::beginModule() {
 
   // Tell MMI that we have debug info.
   MMI->setDebugInfoAvailability(true);
-
-  // Prime section data.
-  SectionMap[Asm->getObjFileLowering().getTextSection()];
 }
 
 void DwarfDebug::finishVariableDefinitions() {
@@ -608,53 +605,6 @@ void DwarfDebug::finalizeModuleInfo() {
     SkeletonHolder.computeSizeAndOffsets();
 }
 
-void DwarfDebug::endSections() {
-  // Filter labels by section.
-  for (const SymbolCU &SCU : ArangeLabels) {
-    if (SCU.Sym->isInSection()) {
-      // Make a note of this symbol and it's section.
-      const MCSection *Section = &SCU.Sym->getSection();
-      if (!Section->getKind().isMetadata())
-        SectionMap[Section].push_back(SCU);
-    } else {
-      // Some symbols (e.g. common/bss on mach-o) can have no section but still
-      // appear in the output. This sucks as we rely on sections to build
-      // arange spans. We can do it without, but it's icky.
-      SectionMap[nullptr].push_back(SCU);
-    }
-  }
-
-  // Build a list of sections used.
-  std::vector<const MCSection *> Sections;
-  for (const auto &it : SectionMap) {
-    const MCSection *Section = it.first;
-    Sections.push_back(Section);
-  }
-
-  // Sort the sections into order.
-  // This is only done to ensure consistent output order across different runs.
-  std::sort(Sections.begin(), Sections.end(), SectionSort);
-
-  // Add terminating symbols for each section.
-  for (unsigned ID = 0, E = Sections.size(); ID != E; ID++) {
-    const MCSection *Section = Sections[ID];
-    MCSymbol *Sym = nullptr;
-
-    if (Section) {
-      // We can't call MCSection::getLabelEndName, as it's only safe to do so
-      // if we know the section name up-front. For user-created sections, the
-      // resulting label may not be valid to use as a label. (section names can
-      // use a greater set of characters on some systems)
-      Sym = Asm->GetTempSymbol("debug_end", ID);
-      Asm->OutStreamer.SwitchSection(Section);
-      Asm->OutStreamer.EmitLabel(Sym);
-    }
-
-    // Insert a final terminator.
-    SectionMap[Section].push_back(SymbolCU(nullptr, Sym));
-  }
-}
-
 // Emit all Dwarf sections that should come after the content.
 void DwarfDebug::endModule() {
   assert(CurFn == nullptr);
@@ -666,10 +616,6 @@ void DwarfDebug::endModule() {
   if (!DwarfInfoSectionSym)
     return;
 
-  // End any existing sections.
-  // TODO: Does this need to happen?
-  endSections();
-
   // Finalize the debug info for the module.
   finalizeModuleInfo();
 
@@ -783,10 +729,9 @@ void DwarfDebug::collectVariableInfoFromMMITable(
     DIVariable DV(VI.Var);
     DIExpression Expr(VI.Expr);
     ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode());
-    ConcreteVariables.push_back(make_unique<DbgVariable>(DV, Expr, this));
-    DbgVariable *RegVar = ConcreteVariables.back().get();
-    RegVar->setFrameIndex(VI.Slot);
-    InfoHolder.addScopeVariable(Scope, RegVar);
+    auto RegVar = make_unique<DbgVariable>(DV, Expr, this, VI.Slot);
+    if (InfoHolder.addScopeVariable(Scope, RegVar.get()))
+      ConcreteVariables.push_back(std::move(RegVar));
   }
 }
 
@@ -818,12 +763,12 @@ static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) {
 
 /// Determine whether two variable pieces overlap.
 static bool piecesOverlap(DIExpression P1, DIExpression P2) {
-  if (!P1.isVariablePiece() || !P2.isVariablePiece())
+  if (!P1.isBitPiece() || !P2.isBitPiece())
     return true;
-  unsigned l1 = P1.getPieceOffset();
-  unsigned l2 = P2.getPieceOffset();
-  unsigned r1 = l1 + P1.getPieceSize();
-  unsigned r2 = l2 + P2.getPieceSize();
+  unsigned l1 = P1.getBitPieceOffset();
+  unsigned l2 = P2.getBitPieceOffset();
+  unsigned r1 = l1 + P1.getBitPieceSize();
+  unsigned r2 = l2 + P2.getBitPieceSize();
   // True where [l1,r1[ and [r1,r2[ overlap.
   return (l1 < r2) && (l2 < r1);
 }
@@ -842,7 +787,8 @@ static bool piecesOverlap(DIExpression P1, DIExpression P2) {
 // 1 | |    [x, (reg1, piece 32, 32)] <- IsPieceOfPrevEntry
 // 2 | |    ...
 // 3   |    [clobber reg0]
-// 4        [x, (mem, piece 0, 64)] <- overlapping with both previous pieces of x.
+// 4        [x, (mem, piece 0, 64)] <- overlapping with both previous pieces of
+//                                     x.
 //
 // Output:
 //
@@ -894,7 +840,7 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
     bool couldMerge = false;
 
     // If this is a piece, it may belong to the current DebugLocEntry.
-    if (DIExpr.isVariablePiece()) {
+    if (DIExpr.isBitPiece()) {
       // Add this value to the list of open ranges.
       OpenRanges.push_back(Value);
 
@@ -950,11 +896,9 @@ DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, DISubprogram SP,
       continue;
 
     LexicalScope *Scope = nullptr;
-    if (MDNode *IA = DV.getInlinedAt()) {
-      DebugLoc DL = DebugLoc::getFromDILocation(IA);
-      Scope = LScopes.findInlinedScope(DebugLoc::get(
-          DL.getLine(), DL.getCol(), DV.getContext(), IA));
-    } else
+    if (MDNode *IA = DV.getInlinedAt())
+      Scope = LScopes.findInlinedScope(DV.getContext(), IA);
+    else
       Scope = LScopes.findLexicalScope(DV.getContext());
     // If variable scope is not found then skip this variable.
     if (!Scope)
@@ -1026,8 +970,10 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
       if (DL == PrologEndLoc) {
         Flags |= DWARF2_FLAG_PROLOGUE_END;
         PrologEndLoc = DebugLoc();
+        Flags |= DWARF2_FLAG_IS_STMT;
       }
-      if (PrologEndLoc.isUnknown())
+      if (DL.getLine() !=
+          Asm->OutStreamer.getContext().getCurrentDwarfLoc().getLine())
         Flags |= DWARF2_FLAG_IS_STMT;
 
       if (!DL.isUnknown()) {
@@ -1117,8 +1063,12 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) {
   for (const auto &MBB : *MF)
     for (const auto &MI : MBB)
       if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) &&
-          !MI.getDebugLoc().isUnknown())
+          !MI.getDebugLoc().isUnknown()) {
+        // Did the target forget to set the FrameSetup flag for CFI insns?
+        assert(!MI.isCFIInstruction() &&
+               "First non-frame-setup instruction is a CFI instruction.");
         return MI.getDebugLoc();
+      }
   return DebugLoc();
 }
 
@@ -1172,7 +1122,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
   Asm->OutStreamer.EmitLabel(FunctionBeginSym);
 
   // Calculate history for local variables.
-  calculateDbgValueHistory(MF, Asm->TM.getSubtargetImpl()->getRegisterInfo(),
+  calculateDbgValueHistory(MF, Asm->MF->getSubtarget().getRegisterInfo(),
                            DbgValues);
 
   // Request labels for the full history.
@@ -1187,7 +1137,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
     if (DIVar.isVariable() && DIVar.getTag() == dwarf::DW_TAG_arg_variable &&
         getDISubprogram(DIVar.getContext()).describes(MF->getFunction())) {
       LabelsBeforeInsn[Ranges.front().first] = FunctionBeginSym;
-      if (Ranges.front().first->getDebugExpression().isVariablePiece()) {
+      if (Ranges.front().first->getDebugExpression().isBitPiece()) {
         // Mark all non-overlapping initial pieces.
         for (auto I = Ranges.begin(); I != Ranges.end(); ++I) {
           DIExpression Piece = I->first->getDebugExpression();
@@ -1217,12 +1167,12 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
   if (!PrologEndLoc.isUnknown()) {
     DebugLoc FnStartDL =
         PrologEndLoc.getFnDebugLoc(MF->getFunction()->getContext());
-    recordSourceLine(
-        FnStartDL.getLine(), FnStartDL.getCol(),
-        FnStartDL.getScope(MF->getFunction()->getContext()),
-        // We'd like to list the prologue as "not statements" but GDB behaves
-        // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
-        DWARF2_FLAG_IS_STMT);
+
+    // We'd like to list the prologue as "not statements" but GDB behaves
+    // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
+    recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(),
+                     FnStartDL.getScope(MF->getFunction()->getContext()),
+                     DWARF2_FLAG_IS_STMT);
   }
 }
 
@@ -1350,8 +1300,8 @@ void DwarfDebug::emitSectionLabels() {
   if (useSplitDwarf()) {
     DwarfInfoDWOSectionSym =
         emitSectionSym(Asm, TLOF.getDwarfInfoDWOSection(), "section_info_dwo");
-    DwarfTypesDWOSectionSym =
-        emitSectionSym(Asm, TLOF.getDwarfTypesDWOSection(), "section_types_dwo");
+    DwarfTypesDWOSectionSym = emitSectionSym(
+        Asm, TLOF.getDwarfTypesDWOSection(), "section_types_dwo");
   }
   DwarfAbbrevSectionSym =
       emitSectionSym(Asm, TLOF.getDwarfAbbrevSection(), "section_abbrev");
@@ -1553,7 +1503,6 @@ static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU,
     return dwarf::GIEK_TYPE;
   case dwarf::DW_TAG_subprogram:
     return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_FUNCTION, Linkage);
-  case dwarf::DW_TAG_constant:
   case dwarf::DW_TAG_variable:
     return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE, Linkage);
   case dwarf::DW_TAG_enumerator:
@@ -1656,7 +1605,7 @@ void DwarfDebug::emitLocPieces(ByteStreamer &Streamer,
                                const DITypeIdentifierMap &Map,
                                ArrayRef<DebugLocEntry::Value> Values) {
   assert(std::all_of(Values.begin(), Values.end(), [](DebugLocEntry::Value P) {
-        return P.isVariablePiece();
+        return P.isBitPiece();
       }) && "all values are expected to be pieces");
   assert(std::is_sorted(Values.begin(), Values.end()) &&
          "pieces are expected to be sorted");
@@ -1664,35 +1613,25 @@ void DwarfDebug::emitLocPieces(ByteStreamer &Streamer,
   unsigned Offset = 0;
   for (auto Piece : Values) {
     DIExpression Expr = Piece.getExpression();
-    unsigned PieceOffset = Expr.getPieceOffset();
-    unsigned PieceSize = Expr.getPieceSize();
+    unsigned PieceOffset = Expr.getBitPieceOffset();
+    unsigned PieceSize = Expr.getBitPieceSize();
     assert(Offset <= PieceOffset && "overlapping or duplicate pieces");
     if (Offset < PieceOffset) {
       // The DWARF spec seriously mandates pieces with no locations for gaps.
-      Asm->EmitDwarfOpPiece(Streamer, (PieceOffset-Offset)*8);
+      Asm->EmitDwarfOpPiece(Streamer, PieceOffset-Offset);
       Offset += PieceOffset-Offset;
     }
-
     Offset += PieceSize;
 
-    const unsigned SizeOfByte = 8;
 #ifndef NDEBUG
     DIVariable Var = Piece.getVariable();
-    assert(!Var.isIndirect() && "indirect address for piece");
     unsigned VarSize = Var.getSizeInBits(Map);
-    assert(PieceSize+PieceOffset <= VarSize/SizeOfByte
+    assert(PieceSize+PieceOffset <= VarSize
            && "piece is larger than or outside of variable");
-    assert(PieceSize*SizeOfByte != VarSize
+    assert(PieceSize != VarSize
            && "piece covers entire variable");
 #endif
-    if (Piece.isLocation() && Piece.getLoc().isReg())
-      Asm->EmitDwarfRegOpPiece(Streamer,
-                               Piece.getLoc(),
-                               PieceSize*SizeOfByte);
-    else {
-      emitDebugLocValue(Streamer, Piece);
-      Asm->EmitDwarfOpPiece(Streamer, PieceSize*SizeOfByte);
-    }
+    emitDebugLocValue(Streamer, Piece, PieceOffset);
   }
 }
 
@@ -1700,7 +1639,7 @@ void DwarfDebug::emitLocPieces(ByteStreamer &Streamer,
 void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
                                    const DebugLocEntry &Entry) {
   const DebugLocEntry::Value Value = Entry.getValues()[0];
-  if (Value.isVariablePiece())
+  if (Value.isBitPiece())
     // Emit all pieces that belong to the same variable and range.
     return emitLocPieces(Streamer, TypeIdentifierMap, Entry.getValues());
 
@@ -1709,62 +1648,33 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
 }
 
 void DwarfDebug::emitDebugLocValue(ByteStreamer &Streamer,
-                                   const DebugLocEntry::Value &Value) {
+                                   const DebugLocEntry::Value &Value,
+                                   unsigned PieceOffsetInBits) {
   DIVariable DV = Value.getVariable();
+  DebugLocDwarfExpression DwarfExpr(*Asm, Streamer);
+
   // Regular entry.
   if (Value.isInt()) {
     DIBasicType BTy(resolve(DV.getType()));
     if (BTy.Verify() && (BTy.getEncoding() == dwarf::DW_ATE_signed ||
-                         BTy.getEncoding() == dwarf::DW_ATE_signed_char)) {
-      Streamer.EmitInt8(dwarf::DW_OP_consts, "DW_OP_consts");
-      Streamer.EmitSLEB128(Value.getInt());
-    } else {
-      Streamer.EmitInt8(dwarf::DW_OP_constu, "DW_OP_constu");
-      Streamer.EmitULEB128(Value.getInt());
-    }
+                         BTy.getEncoding() == dwarf::DW_ATE_signed_char))
+      DwarfExpr.AddSignedConstant(Value.getInt());
+    else
+      DwarfExpr.AddUnsignedConstant(Value.getInt());
   } else if (Value.isLocation()) {
     MachineLocation Loc = Value.getLoc();
     DIExpression Expr = Value.getExpression();
-    if (!Expr)
+    if (!Expr || (Expr.getNumElements() == 0))
       // Regular entry.
-      Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect());
+      Asm->EmitDwarfRegOp(Streamer, Loc);
     else {
       // Complex address entry.
-      unsigned N = Expr.getNumElements();
-      unsigned i = 0;
-      if (N >= 2 && Expr.getElement(0) == dwarf::DW_OP_plus) {
-        if (Loc.getOffset()) {
-          i = 2;
-          Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect());
-          Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref");
-          Streamer.EmitInt8(dwarf::DW_OP_plus_uconst, "DW_OP_plus_uconst");
-          Streamer.EmitSLEB128(Expr.getElement(1));
-        } else {
-          // If first address element is OpPlus then emit
-          // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
-          MachineLocation TLoc(Loc.getReg(), Expr.getElement(1));
-          Asm->EmitDwarfRegOp(Streamer, TLoc, DV.isIndirect());
-          i = 2;
-        }
-      } else {
-        Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect());
-      }
-
-      // Emit remaining complex address elements.
-      for (; i < N; ++i) {
-        uint64_t Element = Expr.getElement(i);
-        if (Element == dwarf::DW_OP_plus) {
-          Streamer.EmitInt8(dwarf::DW_OP_plus_uconst, "DW_OP_plus_uconst");
-          Streamer.EmitULEB128(Expr.getElement(++i));
-        } else if (Element == dwarf::DW_OP_deref) {
-          if (!Loc.isReg())
-            Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref");
-        } else if (Element == dwarf::DW_OP_piece) {
-          i += 3;
-          // handled in emitDebugLocEntry.
-        } else
-          llvm_unreachable("unknown Opcode found in complex address");
-      }
+      if (Loc.getOffset()) {
+        DwarfExpr.AddMachineRegIndirect(Loc.getReg(), Loc.getOffset());
+        DwarfExpr.AddExpression(Expr.begin(), Expr.end(), PieceOffsetInBits);
+      } else
+        DwarfExpr.AddMachineRegExpression(Expr, Loc.getReg(),
+                                          PieceOffsetInBits);
     }
   }
   // else ... ignore constant fp. There is not any good way to
@@ -1841,13 +1751,26 @@ struct ArangeSpan {
 // Emit a debug aranges section, containing a CU lookup for any
 // address we can tie back to a CU.
 void DwarfDebug::emitDebugARanges() {
-  // Start the dwarf aranges section.
-  Asm->OutStreamer.SwitchSection(
-      Asm->getObjFileLowering().getDwarfARangesSection());
+  // Provides a unique id per text section.
+  DenseMap<const MCSection *, SmallVector<SymbolCU, 8>> SectionMap;
 
-  typedef DenseMap<DwarfCompileUnit *, std::vector<ArangeSpan>> SpansType;
+  // Prime section data.
+  SectionMap[Asm->getObjFileLowering().getTextSection()];
 
-  SpansType Spans;
+  // Filter labels by section.
+  for (const SymbolCU &SCU : ArangeLabels) {
+    if (SCU.Sym->isInSection()) {
+      // Make a note of this symbol and it's section.
+      const MCSection *Section = &SCU.Sym->getSection();
+      if (!Section->getKind().isMetadata())
+        SectionMap[Section].push_back(SCU);
+    } else {
+      // Some symbols (e.g. common/bss on mach-o) can have no section but still
+      // appear in the output. This sucks as we rely on sections to build
+      // arange spans. We can do it without, but it's icky.
+      SectionMap[nullptr].push_back(SCU);
+    }
+  }
 
   // Build a list of sections used.
   std::vector<const MCSection *> Sections;
@@ -1860,12 +1783,45 @@ void DwarfDebug::emitDebugARanges() {
   // This is only done to ensure consistent output order across different runs.
   std::sort(Sections.begin(), Sections.end(), SectionSort);
 
-  // Build a set of address spans, sorted by CU.
+  // Add terminating symbols for each section.
+  for (unsigned ID = 0, E = Sections.size(); ID != E; ID++) {
+    const MCSection *Section = Sections[ID];
+    MCSymbol *Sym = nullptr;
+
+    if (Section) {
+      // We can't call MCSection::getLabelEndName, as it's only safe to do so
+      // if we know the section name up-front. For user-created sections, the
+      // resulting label may not be valid to use as a label. (section names can
+      // use a greater set of characters on some systems)
+      Sym = Asm->GetTempSymbol("debug_end", ID);
+      Asm->OutStreamer.SwitchSection(Section);
+      Asm->OutStreamer.EmitLabel(Sym);
+    }
+
+    // Insert a final terminator.
+    SectionMap[Section].push_back(SymbolCU(nullptr, Sym));
+  }
+
+  DenseMap<DwarfCompileUnit *, std::vector<ArangeSpan>> Spans;
+
   for (const MCSection *Section : Sections) {
     SmallVector<SymbolCU, 8> &List = SectionMap[Section];
     if (List.size() < 2)
       continue;
 
+    // If we have no section (e.g. common), just write out
+    // individual spans for each symbol.
+    if (!Section) {
+      for (const SymbolCU &Cur : List) {
+        ArangeSpan Span;
+        Span.Start = Cur.Sym;
+        Span.End = nullptr;
+        if (Cur.CU)
+          Spans[Cur.CU].push_back(Span);
+      }
+      continue;
+    }
+
     // Sort the symbols by offset within the section.
     std::sort(List.begin(), List.end(),
               [&](const SymbolCU &A, const SymbolCU &B) {
@@ -1881,35 +1837,27 @@ void DwarfDebug::emitDebugARanges() {
       return IA < IB;
     });
 
-    // If we have no section (e.g. common), just write out
-    // individual spans for each symbol.
-    if (!Section) {
-      for (const SymbolCU &Cur : List) {
+    // Build spans between each label.
+    const MCSymbol *StartSym = List[0].Sym;
+    for (size_t n = 1, e = List.size(); n < e; n++) {
+      const SymbolCU &Prev = List[n - 1];
+      const SymbolCU &Cur = List[n];
+
+      // Try and build the longest span we can within the same CU.
+      if (Cur.CU != Prev.CU) {
         ArangeSpan Span;
-        Span.Start = Cur.Sym;
-        Span.End = nullptr;
-        if (Cur.CU)
-          Spans[Cur.CU].push_back(Span);
-      }
-    } else {
-      // Build spans between each label.
-      const MCSymbol *StartSym = List[0].Sym;
-      for (size_t n = 1, e = List.size(); n < e; n++) {
-        const SymbolCU &Prev = List[n - 1];
-        const SymbolCU &Cur = List[n];
-
-        // Try and build the longest span we can within the same CU.
-        if (Cur.CU != Prev.CU) {
-          ArangeSpan Span;
-          Span.Start = StartSym;
-          Span.End = Cur.Sym;
-          Spans[Prev.CU].push_back(Span);
-          StartSym = Cur.Sym;
-        }
+        Span.Start = StartSym;
+        Span.End = Cur.Sym;
+        Spans[Prev.CU].push_back(Span);
+        StartSym = Cur.Sym;
       }
     }
   }
 
+  // Start the dwarf aranges section.
+  Asm->OutStreamer.SwitchSection(
+      Asm->getObjFileLowering().getDwarfARangesSection());
+
   unsigned PtrSize = Asm->getDataLayout().getPointerSize();
 
   // Build a list of CUs used.
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 48c2809..1c0e163 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -14,26 +14,25 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
 
-#include "DwarfFile.h"
 #include "AsmPrinterHandler.h"
-#include "DIE.h"
 #include "DbgValueHistoryCalculator.h"
 #include "DebugLocEntry.h"
 #include "DebugLocList.h"
 #include "DwarfAccelTable.h"
+#include "DwarfFile.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/FoldingSet.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/CodeGen/LexicalScopes.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DebugLoc.h"
-#include "llvm/MC/MachineLocation.h"
 #include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/Allocator.h"
-
 #include <memory>
 
 namespace llvm {
@@ -68,41 +67,67 @@ public:
 
 //===----------------------------------------------------------------------===//
 /// \brief This class is used to track local variable information.
+///
+/// - Variables whose location changes over time have a DotDebugLocOffset and
+///   the other fields are not used.
+///
+/// - Variables that are described by multiple MMI table entries have multiple
+///   expressions and frame indices.
 class DbgVariable {
-  DIVariable Var;             // Variable Descriptor.
-  DIExpression Expr;          // Complex address location expression.
-  DIE *TheDIE;                // Variable DIE.
-  unsigned DotDebugLocOffset; // Offset in DotDebugLocEntries.
-  const MachineInstr *MInsn;  // DBG_VALUE instruction of the variable.
-  int FrameIndex;
+  DIVariable Var;             /// Variable Descriptor.
+  SmallVector<DIExpression, 1> Expr; /// Complex address location expression.
+  DIE *TheDIE;                /// Variable DIE.
+  unsigned DotDebugLocOffset; /// Offset in DotDebugLocEntries.
+  const MachineInstr *MInsn;  /// DBG_VALUE instruction of the variable.
+  SmallVector<int, 1> FrameIndex; /// Frame index of the variable.
   DwarfDebug *DD;
 
 public:
   /// Construct a DbgVariable from a DIVariable.
-  DbgVariable(DIVariable V, DIExpression E, DwarfDebug *DD)
-      : Var(V), Expr(E), TheDIE(nullptr), DotDebugLocOffset(~0U),
-        MInsn(nullptr), FrameIndex(~0), DD(DD) {
-    assert(Var.Verify() && Expr.Verify());
+    DbgVariable(DIVariable V, DIExpression E, DwarfDebug *DD, int FI = ~0)
+    : Var(V), Expr(1, E), TheDIE(nullptr), DotDebugLocOffset(~0U),
+      MInsn(nullptr), DD(DD) {
+    FrameIndex.push_back(FI);
+    assert(Var.Verify() && E.Verify());
   }
 
   /// Construct a DbgVariable from a DEBUG_VALUE.
   /// AbstractVar may be NULL.
   DbgVariable(const MachineInstr *DbgValue, DwarfDebug *DD)
-      : Var(DbgValue->getDebugVariable()), Expr(DbgValue->getDebugExpression()),
-        TheDIE(nullptr), DotDebugLocOffset(~0U), MInsn(DbgValue),
-        FrameIndex(~0), DD(DD) {}
+      : Var(DbgValue->getDebugVariable()),
+        Expr(1, DbgValue->getDebugExpression()), TheDIE(nullptr),
+        DotDebugLocOffset(~0U), MInsn(DbgValue), DD(DD) {
+    FrameIndex.push_back(~0);
+  }
 
   // Accessors.
   DIVariable getVariable() const { return Var; }
-  DIExpression getExpression() const { return Expr; }
+  const ArrayRef<DIExpression> getExpression() const { return Expr; }
   void setDIE(DIE &D) { TheDIE = &D; }
   DIE *getDIE() const { return TheDIE; }
   void setDotDebugLocOffset(unsigned O) { DotDebugLocOffset = O; }
   unsigned getDotDebugLocOffset() const { return DotDebugLocOffset; }
   StringRef getName() const { return Var.getName(); }
   const MachineInstr *getMInsn() const { return MInsn; }
-  int getFrameIndex() const { return FrameIndex; }
-  void setFrameIndex(int FI) { FrameIndex = FI; }
+  const ArrayRef<int> getFrameIndex() const { return FrameIndex; }
+
+  void addMMIEntry(const DbgVariable &V) {
+    assert(  DotDebugLocOffset == ~0U &&   !MInsn && "not an MMI entry");
+    assert(V.DotDebugLocOffset == ~0U && !V.MInsn && "not an MMI entry");
+    assert(V.Var == Var && "conflicting DIVariable");
+
+    if (V.getFrameIndex().back() != ~0) {
+      auto E = V.getExpression();
+      auto FI = V.getFrameIndex();
+      Expr.append(E.begin(), E.end());
+      FrameIndex.append(FI.begin(), FI.end());
+    }
+    assert(Expr.size() > 1
+           ? std::all_of(Expr.begin(), Expr.end(),
+                         [](DIExpression &E) { return E.isBitPiece(); })
+           : (true && "conflicting locations for variable"));
+  }
+
   // Translate tag to proper Dwarf tag.
   dwarf::Tag getTag() const {
     if (Var.getTag() == dwarf::DW_TAG_arg_variable)
@@ -129,14 +154,11 @@ public:
 
   bool variableHasComplexAddress() const {
     assert(Var.isVariable() && "Invalid complex DbgVariable!");
-    return Expr.getNumElements() > 0;
+    assert(Expr.size() == 1 &&
+           "variableHasComplexAddress() invoked on multi-FI variable");
+    return Expr.back().getNumElements() > 0;
   }
   bool isBlockByrefVariable() const;
-  unsigned getNumAddrElements() const {
-    assert(Var.isVariable() && "Invalid complex DbgVariable!");
-    return Expr.getNumElements();
-  }
-  uint64_t getAddrElement(unsigned i) const { return Expr.getElement(i); }
   DIType getType() const;
 
 private:
@@ -179,10 +201,6 @@ class DwarfDebug : public AsmPrinterHandler {
   // Size of each symbol emitted (for those symbols that have a specific size).
   DenseMap<const MCSymbol *, uint64_t> SymSize;
 
-  // Provides a unique id per text section.
-  typedef DenseMap<const MCSection *, SmallVector<SymbolCU, 8> > SectionMapType;
-  SectionMapType SectionMap;
-
   LexicalScopes LScopes;
 
   // Collection of abstract variables.
@@ -259,7 +277,8 @@ class DwarfDebug : public AsmPrinterHandler {
   // them.
   DenseMap<const MDNode *, const DwarfTypeUnit *> DwarfTypeUnits;
 
-  SmallVector<std::pair<std::unique_ptr<DwarfTypeUnit>, DICompositeType>, 1> TypeUnitsUnderConstruction;
+  SmallVector<std::pair<std::unique_ptr<DwarfTypeUnit>, DICompositeType>, 1>
+      TypeUnitsUnderConstruction;
 
   // Whether to emit the pubnames/pubtypes sections.
   bool HasDwarfPubSections;
@@ -348,10 +367,6 @@ class DwarfDebug : public AsmPrinterHandler {
   /// processed.
   void finalizeModuleInfo();
 
-  /// \brief Emit labels to close any remaining sections that have been left
-  /// open.
-  void endSections();
-
   /// \brief Emit the debug info section.
   void emitDebugInfo();
 
@@ -565,7 +580,8 @@ public:
   void emitDebugLocEntry(ByteStreamer &Streamer, const DebugLocEntry &Entry);
   /// \brief emit a single value for the debug loc section.
   void emitDebugLocValue(ByteStreamer &Streamer,
-                         const DebugLocEntry::Value &Value);
+                         const DebugLocEntry::Value &Value,
+                         unsigned PieceOffsetInBits = 0);
   /// Emits an optimal (=sorted) sequence of DW_OP_pieces.
   void emitLocPieces(ByteStreamer &Streamer,
                      const DITypeIdentifierMap &Map,
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
new file mode 100644
index 0000000..fcab067
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -0,0 +1,269 @@
+//===-- llvm/CodeGen/DwarfExpression.cpp - Dwarf Debug Framework ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfExpression.h"
+#include "DwarfDebug.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+const TargetRegisterInfo *DwarfExpression::getTRI() const {
+  return AP.TM.getSubtargetImpl()->getRegisterInfo();
+}
+
+unsigned DwarfExpression::getDwarfVersion() const {
+  return AP.getDwarfDebug()->getDwarfVersion();
+}
+
+void DwarfExpression::AddReg(int DwarfReg, const char *Comment) {
+  assert(DwarfReg >= 0 && "invalid negative dwarf register number");
+  if (DwarfReg < 32) {
+    EmitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
+  } else {
+    EmitOp(dwarf::DW_OP_regx, Comment);
+    EmitUnsigned(DwarfReg);
+  }
+}
+
+void DwarfExpression::AddRegIndirect(int DwarfReg, int Offset, bool Deref) {
+  assert(DwarfReg >= 0 && "invalid negative dwarf register number");
+  if (DwarfReg < 32) {
+    EmitOp(dwarf::DW_OP_breg0 + DwarfReg);
+  } else {
+    EmitOp(dwarf::DW_OP_bregx);
+    EmitUnsigned(DwarfReg);
+  }
+  EmitSigned(Offset);
+  if (Deref)
+    EmitOp(dwarf::DW_OP_deref);
+}
+
+void DwarfExpression::AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits) {
+  assert(SizeInBits > 0 && "piece has size zero");
+  const unsigned SizeOfByte = 8;
+  if (OffsetInBits > 0 || SizeInBits % SizeOfByte) {
+    EmitOp(dwarf::DW_OP_bit_piece);
+    EmitUnsigned(SizeInBits);
+    EmitUnsigned(OffsetInBits);
+  } else {
+    EmitOp(dwarf::DW_OP_piece);
+    unsigned ByteSize = SizeInBits / SizeOfByte;
+    EmitUnsigned(ByteSize);
+  }
+}
+
+void DwarfExpression::AddShr(unsigned ShiftBy) {
+  EmitOp(dwarf::DW_OP_constu);
+  EmitUnsigned(ShiftBy);
+  EmitOp(dwarf::DW_OP_shr);
+}
+
+bool DwarfExpression::AddMachineRegIndirect(unsigned MachineReg, int Offset) {
+  int DwarfReg = getTRI()->getDwarfRegNum(MachineReg, false);
+  if (DwarfReg < 0)
+    return false;
+
+  if (isFrameRegister(MachineReg)) {
+    // If variable offset is based in frame register then use fbreg.
+    EmitOp(dwarf::DW_OP_fbreg);
+    EmitSigned(Offset);
+  } else {
+    AddRegIndirect(DwarfReg, Offset);
+  }
+  return true;
+}
+
+bool DwarfExpression::AddMachineRegPiece(unsigned MachineReg,
+                                         unsigned PieceSizeInBits,
+                                         unsigned PieceOffsetInBits) {
+  const TargetRegisterInfo *TRI = getTRI();
+  if (!TRI->isPhysicalRegister(MachineReg))
+    return false;
+
+  int Reg = TRI->getDwarfRegNum(MachineReg, false);
+
+  // If this is a valid register number, emit it.
+  if (Reg >= 0) {
+    AddReg(Reg);
+    if (PieceSizeInBits)
+      AddOpPiece(PieceSizeInBits, PieceOffsetInBits);
+    return true;
+  }
+
+  // Walk up the super-register chain until we find a valid number.
+  // For example, EAX on x86_64 is a 32-bit piece of RAX with offset 0.
+  for (MCSuperRegIterator SR(MachineReg, TRI); SR.isValid(); ++SR) {
+    Reg = TRI->getDwarfRegNum(*SR, false);
+    if (Reg >= 0) {
+      unsigned Idx = TRI->getSubRegIndex(*SR, MachineReg);
+      unsigned Size = TRI->getSubRegIdxSize(Idx);
+      unsigned RegOffset = TRI->getSubRegIdxOffset(Idx);
+      AddReg(Reg, "super-register");
+      if (PieceOffsetInBits == RegOffset) {
+        AddOpPiece(Size, RegOffset);
+      } else {
+        // If this is part of a variable in a sub-register at a
+        // non-zero offset, we need to manually shift the value into
+        // place, since the DW_OP_piece describes the part of the
+        // variable, not the position of the subregister.
+        if (RegOffset)
+          AddShr(RegOffset);
+        AddOpPiece(Size, PieceOffsetInBits);
+      }
+      return true;
+    }
+  }
+
+  // Otherwise, attempt to find a covering set of sub-register numbers.
+  // For example, Q0 on ARM is a composition of D0+D1.
+  //
+  // Keep track of the current position so we can emit the more
+  // efficient DW_OP_piece.
+  unsigned CurPos = PieceOffsetInBits;
+  // The size of the register in bits, assuming 8 bits per byte.
+  unsigned RegSize = TRI->getMinimalPhysRegClass(MachineReg)->getSize() * 8;
+  // Keep track of the bits in the register we already emitted, so we
+  // can avoid emitting redundant aliasing subregs.
+  SmallBitVector Coverage(RegSize, false);
+  for (MCSubRegIterator SR(MachineReg, TRI); SR.isValid(); ++SR) {
+    unsigned Idx = TRI->getSubRegIndex(MachineReg, *SR);
+    unsigned Size = TRI->getSubRegIdxSize(Idx);
+    unsigned Offset = TRI->getSubRegIdxOffset(Idx);
+    Reg = TRI->getDwarfRegNum(*SR, false);
+
+    // Intersection between the bits we already emitted and the bits
+    // covered by this subregister.
+    SmallBitVector Intersection(RegSize, false);
+    Intersection.set(Offset, Offset + Size);
+    Intersection ^= Coverage;
+
+    // If this sub-register has a DWARF number and we haven't covered
+    // its range, emit a DWARF piece for it.
+    if (Reg >= 0 && Intersection.any()) {
+      AddReg(Reg, "sub-register");
+      AddOpPiece(Size, Offset == CurPos ? 0 : Offset);
+      CurPos = Offset + Size;
+
+      // Mark it as emitted.
+      Coverage.set(Offset, Offset + Size);
+    }
+  }
+
+  return CurPos > PieceOffsetInBits;
+}
+
+void DwarfExpression::AddSignedConstant(int Value) {
+  EmitOp(dwarf::DW_OP_consts);
+  EmitSigned(Value);
+  // The proper way to describe a constant value is
+  // DW_OP_constu <const>, DW_OP_stack_value.
+  // Unfortunately, DW_OP_stack_value was not available until DWARF-4,
+  // so we will continue to generate DW_OP_constu <const> for DWARF-2
+  // and DWARF-3. Technically, this is incorrect since DW_OP_const <const>
+  // actually describes a value at a constant addess, not a constant value.
+  // However, in the past there was no better way  to describe a constant
+  // value, so the producers and consumers started to rely on heuristics
+  // to disambiguate the value vs. location status of the expression.
+  // See PR21176 for more details.
+  if (getDwarfVersion() >= 4)
+    EmitOp(dwarf::DW_OP_stack_value);
+}
+
+void DwarfExpression::AddUnsignedConstant(unsigned Value) {
+  EmitOp(dwarf::DW_OP_constu);
+  EmitUnsigned(Value);
+  // cf. comment in DwarfExpression::AddSignedConstant().
+  if (getDwarfVersion() >= 4)
+    EmitOp(dwarf::DW_OP_stack_value);
+}
+
+static unsigned getOffsetOrZero(unsigned OffsetInBits,
+                                unsigned PieceOffsetInBits) {
+  if (OffsetInBits == PieceOffsetInBits)
+    return 0;
+  assert(OffsetInBits >= PieceOffsetInBits && "overlapping pieces");
+  return OffsetInBits;
+}
+
+bool DwarfExpression::AddMachineRegExpression(DIExpression Expr,
+                                              unsigned MachineReg,
+                                              unsigned PieceOffsetInBits) {
+  auto I = Expr.begin();
+  // Pattern-match combinations for which more efficient representations exist
+  // first.
+  if (I == Expr.end())
+    return AddMachineRegPiece(MachineReg);
+
+  bool ValidReg = false;
+  switch (*I) {
+  case dwarf::DW_OP_bit_piece: {
+    unsigned OffsetInBits = I->getArg(1);
+    unsigned SizeInBits   = I->getArg(2);
+    // Piece always comes at the end of the expression.
+    return AddMachineRegPiece(MachineReg, SizeInBits,
+               getOffsetOrZero(OffsetInBits, PieceOffsetInBits));
+  }
+  case dwarf::DW_OP_plus:
+    // [DW_OP_reg,Offset,DW_OP_plus,DW_OP_deref] --> [DW_OP_breg,Offset].
+    if (I->getNext() == dwarf::DW_OP_deref) {
+      unsigned Offset = I->getArg(1);
+      ValidReg = AddMachineRegIndirect(MachineReg, Offset);
+      std::advance(I, 2);
+      break;
+    } else
+      ValidReg = AddMachineRegPiece(MachineReg);
+  case dwarf::DW_OP_deref:
+    // [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg].
+    ValidReg = AddMachineRegIndirect(MachineReg);
+    ++I;
+    break;
+  default:
+    llvm_unreachable("unsupported operand");
+  }
+
+  if (!ValidReg)
+    return false;
+
+  // Emit remaining elements of the expression.
+  AddExpression(I, Expr.end(), PieceOffsetInBits);
+  return true;
+}
+
+void DwarfExpression::AddExpression(DIExpression::iterator I,
+                                    DIExpression::iterator E,
+                                    unsigned PieceOffsetInBits) {
+  for (; I != E; ++I) {
+    switch (*I) {
+    case dwarf::DW_OP_bit_piece: {
+      unsigned OffsetInBits = I->getArg(1);
+      unsigned SizeInBits   = I->getArg(2);
+      AddOpPiece(SizeInBits, getOffsetOrZero(OffsetInBits, PieceOffsetInBits));
+      break;
+    }
+    case dwarf::DW_OP_plus:
+      EmitOp(dwarf::DW_OP_plus_uconst);
+      EmitUnsigned(I->getArg(1));
+      break;
+    case dwarf::DW_OP_deref:
+      EmitOp(dwarf::DW_OP_deref);
+      break;
+    default:
+      llvm_unreachable("unhandled opcode found in DIExpression");
+    }
+  }
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.h b/lib/CodeGen/AsmPrinter/DwarfExpression.h
new file mode 100644
index 0000000..b90b7b6
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -0,0 +1,133 @@
+//===-- llvm/CodeGen/DwarfExpression.h - Dwarf Compile Unit ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
+
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class ByteStreamer;
+class TargetRegisterInfo;
+class DwarfUnit;
+class DIELoc;
+
+/// Base class containing the logic for constructing DWARF expressions
+/// independently of whether they are emitted into a DIE or into a .debug_loc
+/// entry.
+class DwarfExpression {
+protected:
+  const AsmPrinter &AP;
+  // Various convenience accessors that extract things out of AsmPrinter.
+  const TargetRegisterInfo *getTRI() const;
+  unsigned getDwarfVersion() const;
+
+public:
+  DwarfExpression(const AsmPrinter &AP) : AP(AP) {}
+  virtual ~DwarfExpression() {}
+
+  /// Output a dwarf operand and an optional assembler comment.
+  virtual void EmitOp(uint8_t Op, const char *Comment = nullptr) = 0;
+  /// Emit a raw signed value.
+  virtual void EmitSigned(int Value) = 0;
+  /// Emit a raw unsigned value.
+  virtual void EmitUnsigned(unsigned Value) = 0;
+  /// Return whether the given machine register is the frame register in the
+  /// current function.
+  virtual bool isFrameRegister(unsigned MachineReg) = 0;
+
+  /// Emit a dwarf register operation.
+  void AddReg(int DwarfReg, const char *Comment = nullptr);
+  /// Emit an (double-)indirect dwarf register operation.
+  void AddRegIndirect(int DwarfReg, int Offset, bool Deref = false);
+
+  /// Emit a dwarf register operation for describing
+  /// - a small value occupying only part of a register or
+  /// - a register representing only part of a value.
+  void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
+  /// Emit a shift-right dwarf expression.
+  void AddShr(unsigned ShiftBy);
+
+  /// Emit an indirect dwarf register operation for the given machine register.
+  /// \return false if no DWARF register exists for MachineReg.
+  bool AddMachineRegIndirect(unsigned MachineReg, int Offset = 0);
+
+  /// \brief Emit a partial DWARF register operation.
+  /// \param MachineReg        the register
+  /// \param PieceSizeInBits   size and
+  /// \param PieceOffsetInBits offset of the piece in bits, if this is one
+  ///                          piece of an aggregate value.
+  ///
+  /// If size and offset is zero an operation for the entire
+  /// register is emitted: Some targets do not provide a DWARF
+  /// register number for every register.  If this is the case, this
+  /// function will attempt to emit a DWARF register by emitting a
+  /// piece of a super-register or by piecing together multiple
+  /// subregisters that alias the register.
+  ///
+  /// \return false if no DWARF register exists for MachineReg.
+  bool AddMachineRegPiece(unsigned MachineReg, unsigned PieceSizeInBits = 0,
+                          unsigned PieceOffsetInBits = 0);
+
+  /// Emit a signed constant.
+  void AddSignedConstant(int Value);
+  /// Emit an unsigned constant.
+  void AddUnsignedConstant(unsigned Value);
+
+  /// Emit an entire DIExpression on top of a machine register location.
+  /// \param PieceOffsetInBits If this is one piece out of a fragmented
+  /// location, this is the offset of the piece inside the entire variable.
+  /// \return false if no DWARF register exists for MachineReg.
+  bool AddMachineRegExpression(DIExpression Expr, unsigned MachineReg,
+                               unsigned PieceOffsetInBits = 0);
+  /// Emit a the operations remaining the DIExpressionIterator I.
+  /// \param PieceOffsetInBits If this is one piece out of a fragmented
+  /// location, this is the offset of the piece inside the entire variable.
+  void AddExpression(DIExpression::iterator I, DIExpression::iterator E,
+                     unsigned PieceOffsetInBits = 0);
+};
+
+/// DwarfExpression implementation for .debug_loc entries.
+class DebugLocDwarfExpression : public DwarfExpression {
+  ByteStreamer &BS;
+
+public:
+  DebugLocDwarfExpression(const AsmPrinter &AP, ByteStreamer &BS)
+      : DwarfExpression(AP), BS(BS) {}
+
+  void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
+  void EmitSigned(int Value) override;
+  void EmitUnsigned(unsigned Value) override;
+  bool isFrameRegister(unsigned MachineReg) override;
+};
+
+/// DwarfExpression implementation for singular DW_AT_location.
+class DIEDwarfExpression : public DwarfExpression {
+  DwarfUnit &DU;
+  DIELoc &DIE;
+
+public:
+  DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE)
+      : DwarfExpression(AP), DU(DU), DIE(DIE) {}
+
+  void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
+  void EmitSigned(int Value) override;
+  void EmitUnsigned(unsigned Value) override;
+  bool isFrameRegister(unsigned MachineReg) override;
+};
+}
+
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 50180ea..3988f0d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -8,13 +8,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "DwarfFile.h"
-
 #include "DwarfDebug.h"
 #include "DwarfUnit.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/LEB128.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 
 namespace llvm {
@@ -147,7 +146,7 @@ void DwarfFile::emitStrings(const MCSection *StrSection,
   StrPool.emit(*Asm, StrSection, OffsetSection);
 }
 
-void DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
+bool DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
   SmallVectorImpl<DbgVariable *> &Vars = ScopeVariables[LS];
   DIVariable DV = Var->getVariable();
   // Variables with positive arg numbers are parameters.
@@ -169,18 +168,17 @@ void DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
       // A later indexed parameter has been found, insert immediately before it.
       if (CurNum > ArgNum)
         break;
-      // FIXME: There are still some cases where two inlined functions are
-      // conflated together (two calls to the same function at the same
-      // location (eg: via a macro, or without column info, etc)) and then
-      // their arguments are conflated as well.
-      assert((LS->getParent() || CurNum != ArgNum) &&
-             "Duplicate argument for top level (non-inlined) function");
+      if (CurNum == ArgNum) {
+        (*I)->addMMIEntry(*Var);
+        return false;
+      }
       ++I;
     }
     Vars.insert(I, Var);
-    return;
+    return true;
   }
 
   Vars.push_back(Var);
+  return true;
 }
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 9d64bfc..35bf33a 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -10,17 +10,16 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
 
+#include "AddressPool.h"
+#include "DwarfStringPool.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/Allocator.h"
-#include "AddressPool.h"
-#include "DwarfStringPool.h"
-
-#include <vector>
-#include <string>
 #include <memory>
+#include <string>
+#include <vector>
 
 namespace llvm {
 class AsmPrinter;
@@ -96,7 +95,8 @@ public:
   /// \brief Returns the string pool.
   DwarfStringPool &getStringPool() { return StrPool; }
 
-  void addScopeVariable(LexicalScope *LS, DbgVariable *Var);
+  /// \returns false if the variable was merged with a previous one.
+  bool addScopeVariable(LexicalScope *LS, DbgVariable *Var);
 
   DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8>> &getScopeVariables() {
     return ScopeVariables;
diff --git a/lib/CodeGen/AsmPrinter/DwarfStringPool.h b/lib/CodeGen/AsmPrinter/DwarfStringPool.h
index ab32c1b..63e3412 100644
--- a/lib/CodeGen/AsmPrinter/DwarfStringPool.h
+++ b/lib/CodeGen/AsmPrinter/DwarfStringPool.h
@@ -13,7 +13,6 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/Support/Allocator.h"
-
 #include <utility>
 
 namespace llvm {
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 919d9d2..b0c7d48 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -12,10 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "DwarfUnit.h"
-
 #include "DwarfAccelTable.h"
 #include "DwarfCompileUnit.h"
 #include "DwarfDebug.h"
+#include "DwarfExpression.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DIBuilder.h"
@@ -43,6 +43,20 @@ GenerateDwarfTypeUnits("generate-type-units", cl::Hidden,
                        cl::desc("Generate DWARF4 type units."),
                        cl::init(false));
 
+void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) {
+  DU.addUInt(DIE, dwarf::DW_FORM_data1, Op);
+}
+void DIEDwarfExpression::EmitSigned(int Value) {
+  DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value);
+}
+void DIEDwarfExpression::EmitUnsigned(unsigned Value) {
+  DU.addUInt(DIE, dwarf::DW_FORM_udata, Value);
+}
+bool DIEDwarfExpression::isFrameRegister(unsigned MachineReg) {
+  return MachineReg == getTRI()->getFrameRegister(*AP.MF);
+}
+
+
 /// Unit - Unit constructor.
 DwarfUnit::DwarfUnit(unsigned UID, dwarf::Tag UnitTag, DICompileUnit Node,
                      AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU)
@@ -116,6 +130,30 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
     if (dwarf::DWARF_VERSION >= 4)
       return 1;
     break;
+
+  // The languages below have valid values only if the DWARF version >= 5.
+  case dwarf::DW_LANG_OpenCL:
+  case dwarf::DW_LANG_Go:
+  case dwarf::DW_LANG_Haskell:
+  case dwarf::DW_LANG_C_plus_plus_03:
+  case dwarf::DW_LANG_C_plus_plus_11:
+  case dwarf::DW_LANG_OCaml:
+  case dwarf::DW_LANG_Rust:
+  case dwarf::DW_LANG_C11:
+  case dwarf::DW_LANG_Swift:
+  case dwarf::DW_LANG_Dylan:
+  case dwarf::DW_LANG_C_plus_plus_14:
+    if (dwarf::DWARF_VERSION >= 5)
+      return 0;
+    break;
+
+  case dwarf::DW_LANG_Modula3:
+  case dwarf::DW_LANG_Julia:
+  case dwarf::DW_LANG_Fortran03:
+  case dwarf::DW_LANG_Fortran08:
+    if (dwarf::DWARF_VERSION >= 5)
+      return 1;
+    break;
   }
 
   return -1;
@@ -399,85 +437,18 @@ void DwarfUnit::addSourceLine(DIE &Die, DINameSpace NS) {
 }
 
 /// addRegisterOp - Add register operand.
-// FIXME: Ideally, this would share the implementation with
-// AsmPrinter::EmitDwarfRegOpPiece.
-void DwarfUnit::addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
+bool DwarfUnit::addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
                                    unsigned SizeInBits, unsigned OffsetInBits) {
-  const TargetRegisterInfo *RI = Asm->TM.getSubtargetImpl()->getRegisterInfo();
-  int DWReg = RI->getDwarfRegNum(Reg, false);
-  bool isSubRegister = DWReg < 0;
-
-  unsigned Idx = 0;
-
-  // Go up the super-register chain until we hit a valid dwarf register number.
-  for (MCSuperRegIterator SR(Reg, RI); SR.isValid() && DWReg < 0; ++SR) {
-    DWReg = RI->getDwarfRegNum(*SR, false);
-    if (DWReg >= 0)
-      Idx = RI->getSubRegIndex(*SR, Reg);
-  }
-
-  if (DWReg < 0) {
-    DEBUG(dbgs() << "Invalid Dwarf register number.\n");
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_nop);
-    return;
-  }
-
-  // Emit register.
-  if (DWReg < 32)
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg);
-  else {
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
-    addUInt(TheDie, dwarf::DW_FORM_udata, DWReg);
-  }
-
-  // Emit mask.
-  bool isPiece = SizeInBits > 0;
-  if (isSubRegister || isPiece) {
-    const unsigned SizeOfByte = 8;
-    unsigned RegSizeInBits = RI->getSubRegIdxSize(Idx);
-    unsigned RegOffsetInBits = RI->getSubRegIdxOffset(Idx);
-    unsigned PieceSizeInBits = std::max(SizeInBits, RegSizeInBits);
-    unsigned PieceOffsetInBits = OffsetInBits ? OffsetInBits : RegOffsetInBits;
-    assert(RegSizeInBits >= SizeInBits && "register smaller than value");
-
-    if (RegOffsetInBits != PieceOffsetInBits) {
-      // Manually shift the value into place, since the DW_OP_piece
-      // describes the part of the variable, not the position of the
-      // subregister.
-      addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-      addUInt(TheDie, dwarf::DW_FORM_data1, RegOffsetInBits);
-      addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_shr);
-    }
-
-    if (PieceOffsetInBits > 0 || PieceSizeInBits % SizeOfByte) {
-      assert(PieceSizeInBits > 0 && "piece has zero size");
-      addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_bit_piece);
-      addUInt(TheDie, dwarf::DW_FORM_data1, PieceSizeInBits);
-      addUInt(TheDie, dwarf::DW_FORM_data1, PieceOffsetInBits);
-     } else {
-      assert(PieceSizeInBits > 0 && "piece has zero size");
-      addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_piece);
-      addUInt(TheDie, dwarf::DW_FORM_data1, PieceSizeInBits/SizeOfByte);
-    }
-  }
+  DIEDwarfExpression Expr(*Asm, *this, TheDie);
+  Expr.AddMachineRegPiece(Reg, SizeInBits, OffsetInBits);
+  return true;
 }
 
 /// addRegisterOffset - Add register offset.
-void DwarfUnit::addRegisterOffset(DIELoc &TheDie, unsigned Reg,
+bool DwarfUnit::addRegisterOffset(DIELoc &TheDie, unsigned Reg,
                                   int64_t Offset) {
-  const TargetRegisterInfo *RI = Asm->TM.getSubtargetImpl()->getRegisterInfo();
-  unsigned DWReg = RI->getDwarfRegNum(Reg, false);
-  const TargetRegisterInfo *TRI = Asm->TM.getSubtargetImpl()->getRegisterInfo();
-  if (Reg == TRI->getFrameRegister(*Asm->MF))
-    // If variable offset is based in frame register then use fbreg.
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
-  else if (DWReg < 32)
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg);
-  else {
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
-    addUInt(TheDie, dwarf::DW_FORM_udata, DWReg);
-  }
-  addSInt(TheDie, dwarf::DW_FORM_sdata, Offset);
+  DIEDwarfExpression Expr(*Asm, *this, TheDie);
+  return Expr.AddMachineRegIndirect(Reg, Offset);
 }
 
 /* Byref variables, in Blocks, are declared by the programmer as "SomeType
@@ -581,10 +552,14 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die,
   // variable's location.
   DIELoc *Loc = new (DIEValueAllocator) DIELoc();
 
+  bool validReg;
   if (Location.isReg())
-    addRegisterOpPiece(*Loc, Location.getReg());
+    validReg = addRegisterOpPiece(*Loc, Location.getReg());
   else
-    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
+    validReg = addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
+
+  if (!validReg)
+    return;
 
   // If we started with a pointer to the __Block_byref... struct, then
   // the first thing we need to do is dereference the pointer (DW_OP_deref).
@@ -622,13 +597,19 @@ static bool isUnsignedDIType(DwarfDebug *DD, DIType Ty) {
     dwarf::Tag T = (dwarf::Tag)Ty.getTag();
     // Encode pointer constants as unsigned bytes. This is used at least for
     // null pointer constant emission.
+    // (Pieces of) aggregate types that get hacked apart by SROA may also be
+    // represented by a constant. Encode them as unsigned bytes.
     // FIXME: reference and rvalue_reference /probably/ shouldn't be allowed
     // here, but accept them for now due to a bug in SROA producing bogus
     // dbg.values.
-    if (T == dwarf::DW_TAG_pointer_type ||
+    if (T == dwarf::DW_TAG_array_type ||
+        T == dwarf::DW_TAG_class_type ||
+        T == dwarf::DW_TAG_pointer_type ||
         T == dwarf::DW_TAG_ptr_to_member_type ||
         T == dwarf::DW_TAG_reference_type ||
-        T == dwarf::DW_TAG_rvalue_reference_type)
+        T == dwarf::DW_TAG_rvalue_reference_type ||
+        T == dwarf::DW_TAG_structure_type ||
+        T == dwarf::DW_TAG_union_type)
       return true;
     assert(T == dwarf::DW_TAG_typedef || T == dwarf::DW_TAG_const_type ||
            T == dwarf::DW_TAG_volatile_type ||
@@ -649,11 +630,15 @@ static bool isUnsignedDIType(DwarfDebug *DD, DIType Ty) {
           Encoding == dwarf::DW_ATE_unsigned_char ||
           Encoding == dwarf::DW_ATE_signed ||
           Encoding == dwarf::DW_ATE_signed_char ||
-          Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean) &&
+          Encoding == dwarf::DW_ATE_float ||
+          Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean ||
+          (Ty.getTag() == dwarf::DW_TAG_unspecified_type &&
+           Ty.getName() == "decltype(nullptr)")) &&
          "Unsupported encoding");
   return (Encoding == dwarf::DW_ATE_unsigned ||
           Encoding == dwarf::DW_ATE_unsigned_char ||
-          Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean);
+          Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean ||
+          Ty.getTag() == dwarf::DW_TAG_unspecified_type);
 }
 
 /// If this type is derived from a base type then return base type size.
@@ -667,10 +652,7 @@ static uint64_t getBaseTypeSize(DwarfDebug *DD, DIDerivedType Ty) {
 
   DIType BaseType = DD->resolve(Ty.getTypeDerivedFrom());
 
-  // If this type is not derived from any type or the type is a declaration then
-  // take conservative approach.
-  if (!BaseType.isValid() || BaseType.isForwardDecl())
-    return Ty.getSizeInBits();
+  assert(BaseType.isValid() && "Unexpected invalid base type");
 
   // If this is a derived type, go ahead and get the base type, unless it's a
   // reference then it's just the size of the field. Pointer types have no need
@@ -977,7 +959,8 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
     addString(Buffer, dwarf::DW_AT_name, Name);
 
   // Add size if non-zero (derived types might be zero-sized.)
-  if (Size && Tag != dwarf::DW_TAG_pointer_type)
+  if (Size && Tag != dwarf::DW_TAG_pointer_type
+           && Tag != dwarf::DW_TAG_ptr_to_member_type)
     addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size);
 
   if (Tag == dwarf::DW_TAG_ptr_to_member_type)
@@ -1110,6 +1093,8 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     if (CTy.isAppleBlockExtension())
       addFlag(Buffer, dwarf::DW_AT_APPLE_block);
 
+    // This is outside the DWARF spec, but GDB expects a DW_AT_containing_type
+    // inside C++ composite types to point to the base class with the vtable.
     DICompositeType ContainingType(resolve(CTy.getContainingType()));
     if (ContainingType)
       addDIEEntry(Buffer, dwarf::DW_AT_containing_type,
@@ -1187,10 +1172,10 @@ DwarfUnit::constructTemplateValueParameterDIE(DIE &Buffer,
     addType(ParamDIE, resolve(VP.getType()));
   if (!VP.getName().empty())
     addString(ParamDIE, dwarf::DW_AT_name, VP.getName());
-  if (Value *Val = VP.getValue()) {
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(Val))
+  if (Metadata *Val = VP.getValue()) {
+    if (ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Val))
       addConstantValue(ParamDIE, CI, resolve(VP.getType()));
-    else if (GlobalValue *GV = dyn_cast<GlobalValue>(Val)) {
+    else if (GlobalValue *GV = mdconst::dyn_extract<GlobalValue>(Val)) {
       // For declaration non-type template parameters (such as global values and
       // functions)
       DIELoc *Loc = new (DIEValueAllocator) DIELoc();
@@ -1359,7 +1344,7 @@ void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie,
   if (SP.isOptimized())
     addFlag(SPDie, dwarf::DW_AT_APPLE_optimized);
 
-  if (unsigned isa = Asm->getISAEncoding()) {
+  if (unsigned isa = Asm->getISAEncoding(SP.getFunction())) {
     addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
   }
 
@@ -1511,7 +1496,7 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) {
     uint64_t FieldSize = getBaseTypeSize(DD, DT);
     uint64_t OffsetInBytes;
 
-    if (Size != FieldSize) {
+    if (FieldSize && Size != FieldSize) {
       // Handle bitfield, assume bytes are 8 bits.
       addUInt(MemberDie, dwarf::DW_AT_byte_size, None, FieldSize/8);
       addUInt(MemberDie, dwarf::DW_AT_bit_size, None, Size);
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index f40c937..7a5e47d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -14,17 +14,17 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H
 
-#include "DIE.h"
 #include "DwarfDebug.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DebugInfo.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCDwarf.h"
 
 namespace llvm {
 
@@ -138,6 +138,7 @@ public:
   }
 
   // Accessors.
+  AsmPrinter* getAsmPrinter() const { return Asm; }
   unsigned getUniqueID() const { return UniqueID; }
   uint16_t getLanguage() const { return CUNode.getLanguage(); }
   DICompileUnit getCUNode() const { return CUNode; }
@@ -253,12 +254,16 @@ public:
   /// addTemplateParams - Add template parameters in buffer.
   void addTemplateParams(DIE &Buffer, DIArray TParams);
 
-  /// addRegisterOp - Add register operand.
-  void addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
+  /// \brief Add register operand.
+  /// \returns false if the register does not exist, e.g., because it was never
+  /// materialized.
+  bool addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
                           unsigned SizeInBits = 0, unsigned OffsetInBits = 0);
 
-  /// addRegisterOffset - Add register offset.
-  void addRegisterOffset(DIELoc &TheDie, unsigned Reg, int64_t Offset);
+  /// \brief Add register offset.
+  /// \returns false if the register does not exist, e.g., because it was never
+  /// materialized.
+  bool addRegisterOffset(DIELoc &TheDie, unsigned Reg, int64_t Offset);
 
   // FIXME: Should be reformulated in terms of addComplexAddress.
   /// addBlockByrefAddress - Start with the address based on the location
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index 2bbffb3..4841814 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -121,7 +121,8 @@ computeActionsTable(const SmallVectorImpl<const LandingPadInfo*> &LandingPads,
       for (unsigned J = NumShared, M = TypeIds.size(); J != M; ++J) {
         int TypeID = TypeIds[J];
         assert(-1 - TypeID < (int)FilterOffsets.size() && "Unknown filter id!");
-        int ValueForTypeID = TypeID < 0 ? FilterOffsets[-1 - TypeID] : TypeID;
+        int ValueForTypeID =
+            isFilterEHSelector(TypeID) ? FilterOffsets[-1 - TypeID] : TypeID;
         unsigned SizeTypeID = getSLEB128Size(ValueForTypeID);
 
         int NextAction = SizeAction ? -(SizeAction + SizeTypeID) : 0;
@@ -195,9 +196,22 @@ bool EHStreamer::callToNoUnwindFunction(const MachineInstr *MI) {
 /// table.  Entries must be ordered by try-range address.
 void EHStreamer::
 computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
-                     const RangeMapType &PadMap,
                      const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
                      const SmallVectorImpl<unsigned> &FirstActions) {
+  // Invokes and nounwind calls have entries in PadMap (due to being bracketed
+  // by try-range labels when lowered).  Ordinary calls do not, so appropriate
+  // try-ranges for them need be deduced so we can put them in the LSDA.
+  RangeMapType PadMap;
+  for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+    const LandingPadInfo *LandingPad = LandingPads[i];
+    for (unsigned j = 0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
+      MCSymbol *BeginLabel = LandingPad->BeginLabels[j];
+      assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
+      PadRange P = { i, j };
+      PadMap[BeginLabel] = P;
+    }
+  }
+
   // The end label of the previous invoke or nounwind try-range.
   MCSymbol *LastLabel = nullptr;
 
@@ -208,6 +222,8 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
   // Whether the last CallSite entry was for an invoke.
   bool PreviousIsInvoke = false;
 
+  bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
+
   // Visit all instructions in order of address.
   for (const auto &MBB : *Asm->MF) {
     for (const auto &MI : MBB) {
@@ -237,7 +253,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
       // instruction between the previous try-range and this one may throw,
       // create a call-site entry with no landing pad for the region between the
       // try-ranges.
-      if (SawPotentiallyThrowing && Asm->MAI->usesItaniumLSDAForExceptions()) {
+      if (SawPotentiallyThrowing && Asm->MAI->usesCFIForEH()) {
         CallSiteEntry Site = { LastLabel, BeginLabel, nullptr, 0 };
         CallSites.push_back(Site);
         PreviousIsInvoke = false;
@@ -254,14 +270,14 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
         CallSiteEntry Site = {
           BeginLabel,
           LastLabel,
-          LandingPad->LandingPadLabel,
+          LandingPad,
           FirstActions[P.PadIndex]
         };
 
         // Try to merge with the previous call-site. SJLJ doesn't do this
-        if (PreviousIsInvoke && Asm->MAI->usesItaniumLSDAForExceptions()) {
+        if (PreviousIsInvoke && !IsSJLJ) {
           CallSiteEntry &Prev = CallSites.back();
-          if (Site.PadLabel == Prev.PadLabel && Site.Action == Prev.Action) {
+          if (Site.LPad == Prev.LPad && Site.Action == Prev.Action) {
             // Extend the range of the previous entry.
             Prev.EndLabel = Site.EndLabel;
             continue;
@@ -269,7 +285,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
         }
 
         // Otherwise, create a new call-site.
-        if (Asm->MAI->usesItaniumLSDAForExceptions())
+        if (!IsSJLJ)
           CallSites.push_back(Site);
         else {
           // SjLj EH must maintain the call sites in the order assigned
@@ -287,7 +303,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
   // If some instruction between the previous try-range and the end of the
   // function may throw, create a call-site entry with no landing pad for the
   // region following the try-range.
-  if (SawPotentiallyThrowing && Asm->MAI->usesItaniumLSDAForExceptions()) {
+  if (SawPotentiallyThrowing && !IsSJLJ) {
     CallSiteEntry Site = { LastLabel, nullptr, nullptr, 0 };
     CallSites.push_back(Site);
   }
@@ -338,23 +354,9 @@ void EHStreamer::emitExceptionTable() {
   unsigned SizeActions =
     computeActionsTable(LandingPads, Actions, FirstActions);
 
-  // Invokes and nounwind calls have entries in PadMap (due to being bracketed
-  // by try-range labels when lowered).  Ordinary calls do not, so appropriate
-  // try-ranges for them need be deduced when using DWARF exception handling.
-  RangeMapType PadMap;
-  for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
-    const LandingPadInfo *LandingPad = LandingPads[i];
-    for (unsigned j = 0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
-      MCSymbol *BeginLabel = LandingPad->BeginLabels[j];
-      assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
-      PadRange P = { i, j };
-      PadMap[BeginLabel] = P;
-    }
-  }
-
   // Compute the call-site table.
   SmallVector<CallSiteEntry, 64> CallSites;
-  computeCallSiteTable(CallSites, PadMap, LandingPads, FirstActions);
+  computeCallSiteTable(CallSites, LandingPads, FirstActions);
 
   // Final tallies.
 
@@ -519,8 +521,7 @@ void EHStreamer::emitExceptionTable() {
       Asm->EmitULEB128(S.Action);
     }
   } else {
-    // DWARF Exception handling
-    assert(Asm->MAI->usesItaniumLSDAForExceptions());
+    // Itanium LSDA exception handling
 
     // The call-site table is a list of all call sites that may throw an
     // exception (including C++ 'throw' statements) in the procedure
@@ -576,15 +577,15 @@ void EHStreamer::emitExceptionTable() {
 
       // Offset of the landing pad, counted in 16-byte bundles relative to the
       // @LPStart address.
-      if (!S.PadLabel) {
+      if (!S.LPad) {
         if (VerboseAsm)
           Asm->OutStreamer.AddComment("    has no landing pad");
         Asm->OutStreamer.EmitIntValue(0, 4/*size*/);
       } else {
         if (VerboseAsm)
           Asm->OutStreamer.AddComment(Twine("    jumps to ") +
-                                      S.PadLabel->getName());
-        Asm->EmitLabelDifference(S.PadLabel, EHFuncBeginSym, 4);
+                                      S.LPad->LandingPadLabel->getName());
+        Asm->EmitLabelDifference(S.LPad->LandingPadLabel, EHFuncBeginSym, 4);
       }
 
       // Offset of the first associated action record, relative to the start of
@@ -681,7 +682,7 @@ void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) {
     unsigned TypeID = *I;
     if (VerboseAsm) {
       --Entry;
-      if (TypeID != 0)
+      if (isFilterEHSelector(TypeID))
         Asm->OutStreamer.AddComment("FilterInfo " + Twine(Entry));
     }
 
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h
index 7e9549d..9b316ff 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.h
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -23,6 +23,8 @@ class MachineModuleInfo;
 class MachineInstr;
 class MachineFunction;
 class AsmPrinter;
+class MCSymbol;
+class MCSymbolRefExpr;
 
 template <typename T>
 class SmallVectorImpl;
@@ -60,11 +62,11 @@ protected:
   /// Structure describing an entry in the call-site table.
   struct CallSiteEntry {
     // The 'try-range' is BeginLabel .. EndLabel.
-    MCSymbol *BeginLabel; // zero indicates the start of the function.
-    MCSymbol *EndLabel;   // zero indicates the end of the function.
+    MCSymbol *BeginLabel; // Null indicates the start of the function.
+    MCSymbol *EndLabel;   // Null indicates the end of the function.
 
-    // The landing pad starts at PadLabel.
-    MCSymbol *PadLabel;   // zero indicates that there is no landing pad.
+    // LPad contains the landing pad start labels.
+    const LandingPadInfo *LPad; // Null indicates that there is no landing pad.
     unsigned Action;
   };
 
@@ -86,7 +88,6 @@ protected:
   /// form gaps in the table.  Entries must be ordered by try-range address.
 
   void computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
-                            const RangeMapType &PadMap,
                             const SmallVectorImpl<const LandingPadInfo *> &LPs,
                             const SmallVectorImpl<unsigned> &FirstActions);
 
@@ -113,6 +114,13 @@ protected:
 
   virtual void emitTypeInfos(unsigned TTypeEncoding);
 
+  // Helpers for for identifying what kind of clause an EH typeid or selector
+  // corresponds to. Negative selectors are for filter clauses, the zero
+  // selector is for cleanups, and positive selectors are for catch clauses.
+  static bool isFilterEHSelector(int Selector) { return Selector < 0; }
+  static bool isCleanupEHSelector(int Selector) { return Selector == 0; }
+  static bool isCatchEHSelector(int Selector) { return Selector > 0; }
+
 public:
   EHStreamer(AsmPrinter *A);
   virtual ~EHStreamer();
diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
index 5bda5a9..97a3234 100644
--- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -34,35 +34,35 @@ using namespace llvm;
 
 namespace {
 
-  class ErlangGCPrinter : public GCMetadataPrinter {
-  public:
-    void beginAssembly(AsmPrinter &AP) override;
-    void finishAssembly(AsmPrinter &AP) override;
-  };
-
+class ErlangGCPrinter : public GCMetadataPrinter {
+public:
+  void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+};
 }
 
 static GCMetadataPrinterRegistry::Add<ErlangGCPrinter>
-X("erlang", "erlang-compatible garbage collector");
-
-void llvm::linkErlangGCPrinter() { }
+    X("erlang", "erlang-compatible garbage collector");
 
-void ErlangGCPrinter::beginAssembly(AsmPrinter &AP) { }
+void llvm::linkErlangGCPrinter() {}
 
-void ErlangGCPrinter::finishAssembly(AsmPrinter &AP) {
+void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
+                                     AsmPrinter &AP) {
   MCStreamer &OS = AP.OutStreamer;
-  unsigned IntPtrSize =
-      AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
+  unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
 
   // Put this in a custom .note section.
-  AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getContext()
-    .getELFSection(".note.gc", ELF::SHT_PROGBITS, 0,
-                   SectionKind::getDataRel()));
+  AP.OutStreamer.SwitchSection(
+      AP.getObjFileLowering().getContext().getELFSection(".note.gc",
+                                                         ELF::SHT_PROGBITS, 0));
 
   // For each function...
-  for (iterator FI = begin(), FE = end(); FI != FE; ++FI) {
+  for (GCModuleInfo::FuncInfoVec::iterator FI = Info.funcinfo_begin(),
+                                           IE = Info.funcinfo_end();
+       FI != IE; ++FI) {
     GCFunctionInfo &MD = **FI;
-
+    if (MD.getStrategy().getName() != getStrategy().getName())
+      // this function is managed by some other GC
+      continue;
     /** A compact GC layout. Emit this data structure:
      *
      * struct {
@@ -88,7 +88,7 @@ void ErlangGCPrinter::finishAssembly(AsmPrinter &AP) {
       // Emit the address of the safe point.
       OS.AddComment("safe point address");
       MCSymbol *Label = PI->Label;
-      AP.EmitLabelPlusOffset(Label/*Hi*/, 0/*Offset*/, 4/*Size*/);
+      AP.EmitLabelPlusOffset(Label /*Hi*/, 0 /*Offset*/, 4 /*Size*/);
     }
 
     // Stack information never change in safe points! Only print info from the
@@ -101,8 +101,9 @@ void ErlangGCPrinter::finishAssembly(AsmPrinter &AP) {
 
     // Emit stack arity, i.e. the number of stacked arguments.
     unsigned RegisteredArgs = IntPtrSize == 4 ? 5 : 6;
-    unsigned StackArity = MD.getFunction().arg_size() > RegisteredArgs ?
-                          MD.getFunction().arg_size() - RegisteredArgs : 0;
+    unsigned StackArity = MD.getFunction().arg_size() > RegisteredArgs
+                              ? MD.getFunction().arg_size() - RegisteredArgs
+                              : 0;
     OS.AddComment("stack arity");
     AP.EmitInt16(StackArity);
 
@@ -113,7 +114,7 @@ void ErlangGCPrinter::finishAssembly(AsmPrinter &AP) {
     // And for each live root...
     for (GCFunctionInfo::live_iterator LI = MD.live_begin(PI),
                                        LE = MD.live_end(PI);
-                                       LI != LE; ++LI) {
+         LI != LE; ++LI) {
       // Emit live root's offset within the stack frame.
       OS.AddComment("stack index (offset / wordsize)");
       AP.EmitInt16(LI->StackOffset / IntPtrSize);
diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
index 6480d048..76d6a06 100644
--- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -32,18 +32,17 @@ using namespace llvm;
 
 namespace {
 
-  class OcamlGCMetadataPrinter : public GCMetadataPrinter {
-  public:
-    void beginAssembly(AsmPrinter &AP) override;
-    void finishAssembly(AsmPrinter &AP) override;
-  };
-
+class OcamlGCMetadataPrinter : public GCMetadataPrinter {
+public:
+  void beginAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+  void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+};
 }
 
 static GCMetadataPrinterRegistry::Add<OcamlGCMetadataPrinter>
-Y("ocaml", "ocaml 3.10-compatible collector");
+    Y("ocaml", "ocaml 3.10-compatible collector");
 
-void llvm::linkOcamlGCPrinter() { }
+void llvm::linkOcamlGCPrinter() {}
 
 static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) {
   const std::string &MId = M.getModuleIdentifier();
@@ -67,12 +66,13 @@ static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) {
   AP.OutStreamer.EmitLabel(Sym);
 }
 
-void OcamlGCMetadataPrinter::beginAssembly(AsmPrinter &AP) {
+void OcamlGCMetadataPrinter::beginAssembly(Module &M, GCModuleInfo &Info,
+                                           AsmPrinter &AP) {
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getTextSection());
-  EmitCamlGlobal(getModule(), AP, "code_begin");
+  EmitCamlGlobal(M, AP, "code_begin");
 
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection());
-  EmitCamlGlobal(getModule(), AP, "data_begin");
+  EmitCamlGlobal(M, AP, "data_begin");
 }
 
 /// emitAssembly - Print the frametable. The ocaml frametable format is thus:
@@ -91,47 +91,59 @@ void OcamlGCMetadataPrinter::beginAssembly(AsmPrinter &AP) {
 /// (FrameSize and LiveOffsets would overflow). FrameTablePrinter will abort if
 /// either condition is detected in a function which uses the GC.
 ///
-void OcamlGCMetadataPrinter::finishAssembly(AsmPrinter &AP) {
-  unsigned IntPtrSize =
-      AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
+void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
+                                            AsmPrinter &AP) {
+  unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
 
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getTextSection());
-  EmitCamlGlobal(getModule(), AP, "code_end");
+  EmitCamlGlobal(M, AP, "code_end");
 
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection());
-  EmitCamlGlobal(getModule(), AP, "data_end");
+  EmitCamlGlobal(M, AP, "data_end");
 
   // FIXME: Why does ocaml emit this??
   AP.OutStreamer.EmitIntValue(0, IntPtrSize);
 
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection());
-  EmitCamlGlobal(getModule(), AP, "frametable");
+  EmitCamlGlobal(M, AP, "frametable");
 
   int NumDescriptors = 0;
-  for (iterator I = begin(), IE = end(); I != IE; ++I) {
+  for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(),
+                                           IE = Info.funcinfo_end();
+       I != IE; ++I) {
     GCFunctionInfo &FI = **I;
+    if (FI.getStrategy().getName() != getStrategy().getName())
+      // this function is managed by some other GC
+      continue;
     for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) {
       NumDescriptors++;
     }
   }
 
-  if (NumDescriptors >= 1<<16) {
+  if (NumDescriptors >= 1 << 16) {
     // Very rude!
     report_fatal_error(" Too much descriptor for ocaml GC");
   }
   AP.EmitInt16(NumDescriptors);
   AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
 
-  for (iterator I = begin(), IE = end(); I != IE; ++I) {
+  for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(),
+                                           IE = Info.funcinfo_end();
+       I != IE; ++I) {
     GCFunctionInfo &FI = **I;
+    if (FI.getStrategy().getName() != getStrategy().getName())
+      // this function is managed by some other GC
+      continue;
 
     uint64_t FrameSize = FI.getFrameSize();
-    if (FrameSize >= 1<<16) {
+    if (FrameSize >= 1 << 16) {
       // Very rude!
       report_fatal_error("Function '" + FI.getFunction().getName() +
                          "' is too large for the ocaml GC! "
-                         "Frame size " + Twine(FrameSize) + ">= 65536.\n"
-                         "(" + Twine(uintptr_t(&FI)) + ")");
+                         "Frame size " +
+                         Twine(FrameSize) + ">= 65536.\n"
+                                            "(" +
+                         Twine(uintptr_t(&FI)) + ")");
     }
 
     AP.OutStreamer.AddComment("live roots for " +
@@ -140,11 +152,12 @@ void OcamlGCMetadataPrinter::finishAssembly(AsmPrinter &AP) {
 
     for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) {
       size_t LiveCount = FI.live_size(J);
-      if (LiveCount >= 1<<16) {
+      if (LiveCount >= 1 << 16) {
         // Very rude!
         report_fatal_error("Function '" + FI.getFunction().getName() +
                            "' is too large for the ocaml GC! "
-                           "Live root count "+Twine(LiveCount)+" >= 65536.");
+                           "Live root count " +
+                           Twine(LiveCount) + " >= 65536.");
       }
 
       AP.OutStreamer.EmitSymbolValue(J->Label, IntPtrSize);
@@ -152,12 +165,13 @@ void OcamlGCMetadataPrinter::finishAssembly(AsmPrinter &AP) {
       AP.EmitInt16(LiveCount);
 
       for (GCFunctionInfo::live_iterator K = FI.live_begin(J),
-                                         KE = FI.live_end(J); K != KE; ++K) {
-        if (K->StackOffset >= 1<<16) {
+                                         KE = FI.live_end(J);
+           K != KE; ++K) {
+        if (K->StackOffset >= 1 << 16) {
           // Very rude!
           report_fatal_error(
-                 "GC root stack offset is outside of fixed stack frame and out "
-                 "of range for ocaml GC!");
+              "GC root stack offset is outside of fixed stack frame and out "
+              "of range for ocaml GC!");
         }
         AP.EmitInt16(K->StackOffset);
       }
diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
index 0f0ad75..2b03877 100644
--- a/lib/CodeGen/AsmPrinter/Win64Exception.cpp
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
@@ -60,7 +60,7 @@ void Win64Exception::beginFunction(const MachineFunction *MF) {
 
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
   unsigned PerEncoding = TLOF.getPersonalityEncoding();
-  const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
+  const Function *Per = MF->getMMI().getPersonality();
 
   shouldEmitPersonality = hasLandingPads &&
     PerEncoding != dwarf::DW_EH_PE_omit && Per;
@@ -99,9 +99,151 @@ void Win64Exception::endFunction(const MachineFunction *) {
 
   if (shouldEmitPersonality) {
     Asm->OutStreamer.PushSection();
+
+    // Emit an UNWIND_INFO struct describing the prologue.
     Asm->OutStreamer.EmitWinEHHandlerData();
-    emitExceptionTable();
+
+    // Emit the tables appropriate to the personality function in use. If we
+    // don't recognize the personality, assume it uses an Itanium-style LSDA.
+    EHPersonality Per = MMI->getPersonalityType();
+    if (Per == EHPersonality::MSVC_Win64SEH)
+      emitCSpecificHandlerTable();
+    else
+      emitExceptionTable();
+
     Asm->OutStreamer.PopSection();
   }
   Asm->OutStreamer.EmitWinCFIEndProc();
 }
+
+const MCSymbolRefExpr *Win64Exception::createImageRel32(const MCSymbol *Value) {
+  return MCSymbolRefExpr::Create(Value, MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                 Asm->OutContext);
+}
+
+/// Emit the language-specific data that __C_specific_handler expects.  This
+/// handler lives in the x64 Microsoft C runtime and allows catching or cleaning
+/// up after faults with __try, __except, and __finally.  The typeinfo values
+/// are not really RTTI data, but pointers to filter functions that return an
+/// integer (1, 0, or -1) indicating how to handle the exception. For __finally
+/// blocks and other cleanups, the landing pad label is zero, and the filter
+/// function is actually a cleanup handler with the same prototype.  A catch-all
+/// entry is modeled with a null filter function field and a non-zero landing
+/// pad label.
+///
+/// Possible filter function return values:
+///   EXCEPTION_EXECUTE_HANDLER (1):
+///     Jump to the landing pad label after cleanups.
+///   EXCEPTION_CONTINUE_SEARCH (0):
+///     Continue searching this table or continue unwinding.
+///   EXCEPTION_CONTINUE_EXECUTION (-1):
+///     Resume execution at the trapping PC.
+///
+/// Inferred table structure:
+///   struct Table {
+///     int NumEntries;
+///     struct Entry {
+///       imagerel32 LabelStart;
+///       imagerel32 LabelEnd;
+///       imagerel32 FilterOrFinally;  // One means catch-all.
+///       imagerel32 LabelLPad;        // Zero means __finally.
+///     } Entries[NumEntries];
+///   };
+void Win64Exception::emitCSpecificHandlerTable() {
+  const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
+
+  // Simplifying assumptions for first implementation:
+  // - Cleanups are not implemented.
+  // - Filters are not implemented.
+
+  // The Itanium LSDA table sorts similar landing pads together to simplify the
+  // actions table, but we don't need that.
+  SmallVector<const LandingPadInfo *, 64> LandingPads;
+  LandingPads.reserve(PadInfos.size());
+  for (const auto &LP : PadInfos)
+    LandingPads.push_back(&LP);
+
+  // Compute label ranges for call sites as we would for the Itanium LSDA, but
+  // use an all zero action table because we aren't using these actions.
+  SmallVector<unsigned, 64> FirstActions;
+  FirstActions.resize(LandingPads.size());
+  SmallVector<CallSiteEntry, 64> CallSites;
+  computeCallSiteTable(CallSites, LandingPads, FirstActions);
+
+  MCSymbol *EHFuncBeginSym =
+      Asm->GetTempSymbol("eh_func_begin", Asm->getFunctionNumber());
+  MCSymbol *EHFuncEndSym =
+      Asm->GetTempSymbol("eh_func_end", Asm->getFunctionNumber());
+
+  // Emit the number of table entries.
+  unsigned NumEntries = 0;
+  for (const CallSiteEntry &CSE : CallSites) {
+    if (!CSE.LPad)
+      continue; // Ignore gaps.
+    for (int Selector : CSE.LPad->TypeIds) {
+      // Ignore C++ filter clauses in SEH.
+      // FIXME: Implement cleanup clauses.
+      if (isCatchEHSelector(Selector))
+        ++NumEntries;
+    }
+  }
+  Asm->OutStreamer.EmitIntValue(NumEntries, 4);
+
+  // Emit the four-label records for each call site entry. The table has to be
+  // sorted in layout order, and the call sites should already be sorted.
+  for (const CallSiteEntry &CSE : CallSites) {
+    // Ignore gaps. Unlike the Itanium model, unwinding through a frame without
+    // an EH table entry will propagate the exception rather than terminating
+    // the program.
+    if (!CSE.LPad)
+      continue;
+    const LandingPadInfo *LPad = CSE.LPad;
+
+    // Compute the label range. We may reuse the function begin and end labels
+    // rather than forming new ones.
+    const MCExpr *Begin =
+        createImageRel32(CSE.BeginLabel ? CSE.BeginLabel : EHFuncBeginSym);
+    const MCExpr *End;
+    if (CSE.EndLabel) {
+      // The interval is half-open, so we have to add one to include the return
+      // address of the last invoke in the range.
+      End = MCBinaryExpr::CreateAdd(createImageRel32(CSE.EndLabel),
+                                    MCConstantExpr::Create(1, Asm->OutContext),
+                                    Asm->OutContext);
+    } else {
+      End = createImageRel32(EHFuncEndSym);
+    }
+
+    // These aren't really type info globals, they are actually pointers to
+    // filter functions ordered by selector. The zero selector is used for
+    // cleanups, so slot zero corresponds to selector 1.
+    const std::vector<const GlobalValue *> &SelectorToFilter = MMI->getTypeInfos();
+
+    // Do a parallel iteration across typeids and clause labels, skipping filter
+    // clauses.
+    size_t NextClauseLabel = 0;
+    for (size_t I = 0, E = LPad->TypeIds.size(); I < E; ++I) {
+      // AddLandingPadInfo stores the clauses in reverse, but there is a FIXME
+      // to change that.
+      int Selector = LPad->TypeIds[E - I - 1];
+
+      // Ignore C++ filter clauses in SEH.
+      // FIXME: Implement cleanup clauses.
+      if (!isCatchEHSelector(Selector))
+        continue;
+
+      Asm->OutStreamer.EmitValue(Begin, 4);
+      Asm->OutStreamer.EmitValue(End, 4);
+      if (isCatchEHSelector(Selector)) {
+        assert(unsigned(Selector - 1) < SelectorToFilter.size());
+        const GlobalValue *TI = SelectorToFilter[Selector - 1];
+        if (TI) // Emit the filter function pointer.
+          Asm->OutStreamer.EmitValue(createImageRel32(Asm->getSymbol(TI)), 4);
+        else  // Otherwise, this is a "catch i8* null", or catch all.
+          Asm->OutStreamer.EmitIntValue(1, 4);
+      }
+      MCSymbol *ClauseLabel = LPad->ClauseLabels[NextClauseLabel++];
+      Asm->OutStreamer.EmitValue(createImageRel32(ClauseLabel), 4);
+    }
+  }
+}
diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.h b/lib/CodeGen/AsmPrinter/Win64Exception.h
index 538e132..b2d5d1b 100644
--- a/lib/CodeGen/AsmPrinter/Win64Exception.h
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.h
@@ -29,6 +29,10 @@ class Win64Exception : public EHStreamer {
   /// Per-function flag to indicate if frame moves info should be emitted.
   bool shouldEmitMoves;
 
+  void emitCSpecificHandlerTable();
+
+  const MCSymbolRefExpr *createImageRel32(const MCSymbol *Value);
+
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index 12f6bd7..4b64be0 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -31,10 +31,11 @@ using namespace llvm;
 namespace {
   class AtomicExpand: public FunctionPass {
     const TargetMachine *TM;
+    const TargetLowering *TLI;
   public:
     static char ID; // Pass identification, replacement for typeid
     explicit AtomicExpand(const TargetMachine *TM = nullptr)
-      : FunctionPass(ID), TM(TM) {
+      : FunctionPass(ID), TM(TM), TLI(nullptr) {
       initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
     }
 
@@ -67,9 +68,9 @@ FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
 }
 
 bool AtomicExpand::runOnFunction(Function &F) {
-  if (!TM || !TM->getSubtargetImpl()->enableAtomicExpand())
+  if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
     return false;
-  auto TargetLowering = TM->getSubtargetImpl()->getTargetLowering();
+  TLI = TM->getSubtargetImpl(F)->getTargetLowering();
 
   SmallVector<Instruction *, 1> AtomicInsts;
 
@@ -91,7 +92,7 @@ bool AtomicExpand::runOnFunction(Function &F) {
 
     auto FenceOrdering = Monotonic;
     bool IsStore, IsLoad;
-    if (TargetLowering->getInsertFencesForAtomic()) {
+    if (TLI->getInsertFencesForAtomic()) {
       if (LI && isAtLeastAcquire(LI->getOrdering())) {
         FenceOrdering = LI->getOrdering();
         LI->setOrdering(Monotonic);
@@ -107,9 +108,9 @@ bool AtomicExpand::runOnFunction(Function &F) {
         FenceOrdering = RMWI->getOrdering();
         RMWI->setOrdering(Monotonic);
         IsStore = IsLoad = true;
-      } else if (CASI && !TargetLowering->hasLoadLinkedStoreConditional() &&
-                    (isAtLeastRelease(CASI->getSuccessOrdering()) ||
-                     isAtLeastAcquire(CASI->getSuccessOrdering()))) {
+      } else if (CASI && !TLI->hasLoadLinkedStoreConditional() &&
+                 (isAtLeastRelease(CASI->getSuccessOrdering()) ||
+                  isAtLeastAcquire(CASI->getSuccessOrdering()))) {
         // If a compare and swap is lowered to LL/SC, we can do smarter fence
         // insertion, with a stronger one on the success path than on the
         // failure path. As a result, fence insertion is directly done by
@@ -125,20 +126,19 @@ bool AtomicExpand::runOnFunction(Function &F) {
       }
     }
 
-    if (LI && TargetLowering->shouldExpandAtomicLoadInIR(LI)) {
+    if (LI && TLI->shouldExpandAtomicLoadInIR(LI)) {
       MadeChange |= expandAtomicLoad(LI);
-    } else if (SI && TargetLowering->shouldExpandAtomicStoreInIR(SI)) {
+    } else if (SI && TLI->shouldExpandAtomicStoreInIR(SI)) {
       MadeChange |= expandAtomicStore(SI);
     } else if (RMWI) {
       // There are two different ways of expanding RMW instructions:
       // - into a load if it is idempotent
       // - into a Cmpxchg/LL-SC loop otherwise
       // we try them in that order.
-      MadeChange |= (isIdempotentRMW(RMWI) &&
-                        simplifyIdempotentRMW(RMWI)) ||
-                    (TargetLowering->shouldExpandAtomicRMWInIR(RMWI) &&
-                        expandAtomicRMW(RMWI));
-    } else if (CASI && TargetLowering->hasLoadLinkedStoreConditional()) {
+      MadeChange |=
+          (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) ||
+          (TLI->shouldExpandAtomicRMWInIR(RMWI) && expandAtomicRMW(RMWI));
+    } else if (CASI && TLI->hasLoadLinkedStoreConditional()) {
       MadeChange |= expandAtomicCmpXchg(CASI);
     }
   }
@@ -149,13 +149,9 @@ bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                                          bool IsStore, bool IsLoad) {
   IRBuilder<> Builder(I);
 
-  auto LeadingFence =
-      TM->getSubtargetImpl()->getTargetLowering()->emitLeadingFence(
-      Builder, Order, IsStore, IsLoad);
+  auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);
 
-  auto TrailingFence =
-      TM->getSubtargetImpl()->getTargetLowering()->emitTrailingFence(
-      Builder, Order, IsStore, IsLoad);
+  auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
   // The trailing fence is emitted before the instruction instead of after
   // because there is no easy way of setting Builder insertion point after
   // an instruction. So we must erase it from the BB, and insert it back
@@ -171,16 +167,13 @@ bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
 }
 
 bool AtomicExpand::expandAtomicLoad(LoadInst *LI) {
-   if (TM->getSubtargetImpl()
-          ->getTargetLowering()
-          ->hasLoadLinkedStoreConditional())
+  if (TLI->hasLoadLinkedStoreConditional())
     return expandAtomicLoadToLL(LI);
   else
     return expandAtomicLoadToCmpXchg(LI);
 }
 
 bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
-  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
   IRBuilder<> Builder(LI);
 
   // On some architectures, load-linked instructions are atomic for larger
@@ -231,9 +224,7 @@ bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
 }
 
 bool AtomicExpand::expandAtomicRMW(AtomicRMWInst *AI) {
-  if (TM->getSubtargetImpl()
-          ->getTargetLowering()
-          ->hasLoadLinkedStoreConditional())
+  if (TLI->hasLoadLinkedStoreConditional())
     return expandAtomicRMWToLLSC(AI);
   else
     return expandAtomicRMWToCmpXchg(AI);
@@ -277,7 +268,6 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
 }
 
 bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) {
-  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
   AtomicOrdering MemOpOrder = AI->getOrdering();
   Value *Addr = AI->getPointerOperand();
   BasicBlock *BB = AI->getParent();
@@ -397,7 +387,6 @@ bool AtomicExpand::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI) {
 }
 
 bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
-  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
   AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
   AtomicOrdering FailureOrder = CI->getFailureOrdering();
   Value *Addr = CI->getPointerOperand();
@@ -551,13 +540,10 @@ bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) {
 }
 
 bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
-  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
-
   if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
     if (TLI->shouldExpandAtomicLoadInIR(ResultingLoad))
       expandAtomicLoad(ResultingLoad);
     return true;
   }
-
   return false;
 }
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index b9b1fd8..82f5c48 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -15,633 +15,23 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfoImpl.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include <utility>
 using namespace llvm;
 
-static cl::opt<unsigned>
-PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
-  cl::desc("Threshold for partial unrolling"), cl::Hidden);
-
 #define DEBUG_TYPE "basictti"
 
-namespace {
-
-class BasicTTI final : public ImmutablePass, public TargetTransformInfo {
-  const TargetMachine *TM;
-
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
-
-  /// Estimate the cost overhead of SK_Alternate shuffle.
-  unsigned getAltShuffleOverhead(Type *Ty) const;
-
-  const TargetLoweringBase *getTLI() const {
-    return TM->getSubtargetImpl()->getTargetLowering();
-  }
-
-public:
-  BasicTTI() : ImmutablePass(ID), TM(nullptr) {
-    llvm_unreachable("This pass cannot be directly constructed");
-  }
-
-  BasicTTI(const TargetMachine *TM) : ImmutablePass(ID), TM(TM) {
-    initializeBasicTTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override {
-    pushTTIStack(this);
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    TargetTransformInfo::getAnalysisUsage(AU);
-  }
-
-  /// Pass identification.
-  static char ID;
-
-  /// Provide necessary pointer adjustments for the two base classes.
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo*)this;
-    return this;
-  }
-
-  bool hasBranchDivergence() const override;
-
-  /// \name Scalar TTI Implementations
-  /// @{
-
-  bool isLegalAddImmediate(int64_t imm) const override;
-  bool isLegalICmpImmediate(int64_t imm) const override;
-  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
-                             int64_t BaseOffset, bool HasBaseReg,
-                             int64_t Scale) const override;
-  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                           int64_t BaseOffset, bool HasBaseReg,
-                           int64_t Scale) const override;
-  bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
-  bool isTypeLegal(Type *Ty) const override;
-  unsigned getJumpBufAlignment() const override;
-  unsigned getJumpBufSize() const override;
-  bool shouldBuildLookupTables() const override;
-  bool haveFastSqrt(Type *Ty) const override;
-  void getUnrollingPreferences(const Function *F, Loop *L,
-                               UnrollingPreferences &UP) const override;
-
-  /// @}
-
-  /// \name Vector TTI Implementations
-  /// @{
-
-  unsigned getNumberOfRegisters(bool Vector) const override;
-  unsigned getMaxInterleaveFactor() const override;
-  unsigned getRegisterBitWidth(bool Vector) const override;
-  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
-                                  OperandValueKind, OperandValueProperties,
-                                  OperandValueProperties) const override;
-  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
-                          int Index, Type *SubTp) const override;
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
-                            Type *Src) const override;
-  unsigned getCFInstrCost(unsigned Opcode) const override;
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                              Type *CondTy) const override;
-  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
-                              unsigned Index) const override;
-  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) const override;
-  unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
-                                 ArrayRef<Type*> Tys) const override;
-  unsigned getNumberOfParts(Type *Tp) const override;
-  unsigned getAddressComputationCost( Type *Ty, bool IsComplex) const override;
-  unsigned getReductionCost(unsigned Opcode, Type *Ty,
-                            bool IsPairwise) const override;
-
-  /// @}
-};
-
-}
-
-INITIALIZE_AG_PASS(BasicTTI, TargetTransformInfo, "basictti",
-                   "Target independent code generator's TTI", true, true, false)
-char BasicTTI::ID = 0;
-
-ImmutablePass *
-llvm::createBasicTargetTransformInfoPass(const TargetMachine *TM) {
-  return new BasicTTI(TM);
-}
-
-bool BasicTTI::hasBranchDivergence() const { return false; }
-
-bool BasicTTI::isLegalAddImmediate(int64_t imm) const {
-  return getTLI()->isLegalAddImmediate(imm);
-}
-
-bool BasicTTI::isLegalICmpImmediate(int64_t imm) const {
-  return getTLI()->isLegalICmpImmediate(imm);
-}
-
-bool BasicTTI::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
-                                     int64_t BaseOffset, bool HasBaseReg,
-                                     int64_t Scale) const {
-  TargetLoweringBase::AddrMode AM;
-  AM.BaseGV = BaseGV;
-  AM.BaseOffs = BaseOffset;
-  AM.HasBaseReg = HasBaseReg;
-  AM.Scale = Scale;
-  return getTLI()->isLegalAddressingMode(AM, Ty);
-}
-
-int BasicTTI::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                   int64_t BaseOffset, bool HasBaseReg,
-                                   int64_t Scale) const {
-  TargetLoweringBase::AddrMode AM;
-  AM.BaseGV = BaseGV;
-  AM.BaseOffs = BaseOffset;
-  AM.HasBaseReg = HasBaseReg;
-  AM.Scale = Scale;
-  return getTLI()->getScalingFactorCost(AM, Ty);
-}
-
-bool BasicTTI::isTruncateFree(Type *Ty1, Type *Ty2) const {
-  return getTLI()->isTruncateFree(Ty1, Ty2);
-}
-
-bool BasicTTI::isTypeLegal(Type *Ty) const {
-  EVT T = getTLI()->getValueType(Ty);
-  return getTLI()->isTypeLegal(T);
-}
-
-unsigned BasicTTI::getJumpBufAlignment() const {
-  return getTLI()->getJumpBufAlignment();
-}
-
-unsigned BasicTTI::getJumpBufSize() const {
-  return getTLI()->getJumpBufSize();
-}
-
-bool BasicTTI::shouldBuildLookupTables() const {
-  const TargetLoweringBase *TLI = getTLI();
-  return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
-         TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
-}
-
-bool BasicTTI::haveFastSqrt(Type *Ty) const {
-  const TargetLoweringBase *TLI = getTLI();
-  EVT VT = TLI->getValueType(Ty);
-  return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
-}
-
-void BasicTTI::getUnrollingPreferences(const Function *F, Loop *L,
-                                       UnrollingPreferences &UP) const {
-  // This unrolling functionality is target independent, but to provide some
-  // motivation for its intended use, for x86:
-
-  // According to the Intel 64 and IA-32 Architectures Optimization Reference
-  // Manual, Intel Core models and later have a loop stream detector
-  // (and associated uop queue) that can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
-  //    taken, and none of them may be calls.
-  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
-
-  // According to the Software Optimization Guide for AMD Family 15h Processors,
-  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
-  // buffer which can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have fewer than 16 branches
-  //  - The loop must have less than 40 uops in all executed loop branches
-
-  // The number of taken branches in a loop is hard to estimate here, and
-  // benchmarking has revealed that it is better not to be conservative when
-  // estimating the branch count. As a result, we'll ignore the branch limits
-  // until someone finds a case where it matters in practice.
-
-  unsigned MaxOps;
-  const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>(F);
-  if (PartialUnrollingThreshold.getNumOccurrences() > 0)
-    MaxOps = PartialUnrollingThreshold;
-  else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
-    MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
-  else
-    return;
-
-  // Scan the loop: don't unroll loops with calls.
-  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
-       I != E; ++I) {
-    BasicBlock *BB = *I;
-
-    for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
-      if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
-        ImmutableCallSite CS(J);
-        if (const Function *F = CS.getCalledFunction()) {
-          if (!TopTTI->isLoweredToCall(F))
-            continue;
-        }
-
-        return;
-      }
-  }
-
-  // Enable runtime and partial unrolling up to the specified size.
-  UP.Partial = UP.Runtime = true;
-  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
-}
-
-//===----------------------------------------------------------------------===//
-//
-// Calls used by the vectorizers.
-//
-//===----------------------------------------------------------------------===//
-
-unsigned BasicTTI::getScalarizationOverhead(Type *Ty, bool Insert,
-                                            bool Extract) const {
-  assert (Ty->isVectorTy() && "Can only scalarize vectors");
-  unsigned Cost = 0;
-
-  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
-    if (Insert)
-      Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
-    if (Extract)
-      Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
-  }
-
-  return Cost;
-}
-
-unsigned BasicTTI::getNumberOfRegisters(bool Vector) const {
-  return 1;
-}
-
-unsigned BasicTTI::getRegisterBitWidth(bool Vector) const {
-  return 32;
-}
-
-unsigned BasicTTI::getMaxInterleaveFactor() const {
-  return 1;
-}
-
-unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                          OperandValueKind, OperandValueKind,
-                                          OperandValueProperties,
-                                          OperandValueProperties) const {
-  // Check if any of the operands are vector operands.
-  const TargetLoweringBase *TLI = getTLI();
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  assert(ISD && "Invalid opcode");
-
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
-
-  bool IsFloat = Ty->getScalarType()->isFloatingPointTy();
-  // Assume that floating point arithmetic operations cost twice as much as
-  // integer operations.
-  unsigned OpCost = (IsFloat ? 2 : 1);
-
-  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
-    // The operation is legal. Assume it costs 1.
-    // If the type is split to multiple registers, assume that there is some
-    // overhead to this.
-    // TODO: Once we have extract/insert subvector cost we need to use them.
-    if (LT.first > 1)
-      return LT.first * 2 * OpCost;
-    return LT.first * 1 * OpCost;
-  }
-
-  if (!TLI->isOperationExpand(ISD, LT.second)) {
-    // If the operation is custom lowered then assume
-    // thare the code is twice as expensive.
-    return LT.first * 2 * OpCost;
-  }
-
-  // Else, assume that we need to scalarize this op.
-  if (Ty->isVectorTy()) {
-    unsigned Num = Ty->getVectorNumElements();
-    unsigned Cost = TopTTI->getArithmeticInstrCost(Opcode, Ty->getScalarType());
-    // return the cost of multiple scalar invocation plus the cost of inserting
-    // and extracting the values.
-    return getScalarizationOverhead(Ty, true, true) + Num * Cost;
-  }
-
-  // We don't know anything about this scalar instruction.
-  return OpCost;
-}
-
-unsigned BasicTTI::getAltShuffleOverhead(Type *Ty) const {
-  assert(Ty->isVectorTy() && "Can only shuffle vectors");
-  unsigned Cost = 0;
-  // Shuffle cost is equal to the cost of extracting element from its argument
-  // plus the cost of inserting them onto the result vector.
-
-  // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from index
-  // 0 of first vector, index 1 of second vector,index 2 of first vector and
-  // finally index 3 of second vector and insert them at index <0,1,2,3> of
-  // result vector.
-  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
-    Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
-    Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
-  }
-  return Cost;
-}
-
-unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
-                                  Type *SubTp) const {
-  if (Kind == SK_Alternate) {
-    return getAltShuffleOverhead(Tp);
-  }
-  return 1;
-}
-
-unsigned BasicTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
-                                    Type *Src) const {
-  const TargetLoweringBase *TLI = getTLI();
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  assert(ISD && "Invalid opcode");
-
-  std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(Src);
-  std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(Dst);
-
-  // Check for NOOP conversions.
-  if (SrcLT.first == DstLT.first &&
-      SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
-
-      // Bitcast between types that are legalized to the same type are free.
-      if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
-        return 0;
-  }
-
-  if (Opcode == Instruction::Trunc &&
-      TLI->isTruncateFree(SrcLT.second, DstLT.second))
-    return 0;
-
-  if (Opcode == Instruction::ZExt &&
-      TLI->isZExtFree(SrcLT.second, DstLT.second))
-    return 0;
-
-  // If the cast is marked as legal (or promote) then assume low cost.
-  if (SrcLT.first == DstLT.first &&
-      TLI->isOperationLegalOrPromote(ISD, DstLT.second))
-    return 1;
-
-  // Handle scalar conversions.
-  if (!Src->isVectorTy() && !Dst->isVectorTy()) {
-
-    // Scalar bitcasts are usually free.
-    if (Opcode == Instruction::BitCast)
-      return 0;
-
-    // Just check the op cost. If the operation is legal then assume it costs 1.
-    if (!TLI->isOperationExpand(ISD, DstLT.second))
-      return  1;
-
-    // Assume that illegal scalar instruction are expensive.
-    return 4;
-  }
-
-  // Check vector-to-vector casts.
-  if (Dst->isVectorTy() && Src->isVectorTy()) {
-
-    // If the cast is between same-sized registers, then the check is simple.
-    if (SrcLT.first == DstLT.first &&
-        SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
-
-      // Assume that Zext is done using AND.
-      if (Opcode == Instruction::ZExt)
-        return 1;
-
-      // Assume that sext is done using SHL and SRA.
-      if (Opcode == Instruction::SExt)
-        return 2;
-
-      // Just check the op cost. If the operation is legal then assume it costs
-      // 1 and multiply by the type-legalization overhead.
-      if (!TLI->isOperationExpand(ISD, DstLT.second))
-        return SrcLT.first * 1;
-    }
-
-    // If we are converting vectors and the operation is illegal, or
-    // if the vectors are legalized to different types, estimate the
-    // scalarization costs.
-    unsigned Num = Dst->getVectorNumElements();
-    unsigned Cost = TopTTI->getCastInstrCost(Opcode, Dst->getScalarType(),
-                                             Src->getScalarType());
-
-    // Return the cost of multiple scalar invocation plus the cost of
-    // inserting and extracting the values.
-    return getScalarizationOverhead(Dst, true, true) + Num * Cost;
-  }
-
-  // We already handled vector-to-vector and scalar-to-scalar conversions. This
-  // is where we handle bitcast between vectors and scalars. We need to assume
-  //  that the conversion is scalarized in one way or another.
-  if (Opcode == Instruction::BitCast)
-    // Illegal bitcasts are done by storing and loading from a stack slot.
-    return (Src->isVectorTy()? getScalarizationOverhead(Src, false, true):0) +
-           (Dst->isVectorTy()? getScalarizationOverhead(Dst, true, false):0);
-
-  llvm_unreachable("Unhandled cast");
- }
-
-unsigned BasicTTI::getCFInstrCost(unsigned Opcode) const {
-  // Branches are assumed to be predicted.
-  return 0;
-}
-
-unsigned BasicTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                      Type *CondTy) const {
-  const TargetLoweringBase *TLI = getTLI();
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  assert(ISD && "Invalid opcode");
-
-  // Selects on vectors are actually vector selects.
-  if (ISD == ISD::SELECT) {
-    assert(CondTy && "CondTy must exist");
-    if (CondTy->isVectorTy())
-      ISD = ISD::VSELECT;
-  }
-
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
-
-  if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
-      !TLI->isOperationExpand(ISD, LT.second)) {
-    // The operation is legal. Assume it costs 1. Multiply
-    // by the type-legalization overhead.
-    return LT.first * 1;
-  }
-
-  // Otherwise, assume that the cast is scalarized.
-  if (ValTy->isVectorTy()) {
-    unsigned Num = ValTy->getVectorNumElements();
-    if (CondTy)
-      CondTy = CondTy->getScalarType();
-    unsigned Cost = TopTTI->getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
-                                               CondTy);
-
-    // Return the cost of multiple scalar invocation plus the cost of inserting
-    // and extracting the values.
-    return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
-  }
-
-  // Unknown scalar opcode.
-  return 1;
-}
-
-unsigned BasicTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                      unsigned Index) const {
-  std::pair<unsigned, MVT> LT =  getTLI()->getTypeLegalizationCost(Val->getScalarType());
-
-  return LT.first;
-}
-
-unsigned BasicTTI::getMemoryOpCost(unsigned Opcode, Type *Src,
-                                   unsigned Alignment,
-                                   unsigned AddressSpace) const {
-  assert(!Src->isVoidTy() && "Invalid type");
-  std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Src);
-
-  // Assuming that all loads of legal types cost 1.
-  unsigned Cost = LT.first;
-
-  if (Src->isVectorTy() &&
-      Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
-    // This is a vector load that legalizes to a larger type than the vector
-    // itself. Unless the corresponding extending load or truncating store is
-    // legal, then this will scalarize.
-    TargetLowering::LegalizeAction LA = TargetLowering::Expand;
-    EVT MemVT = getTLI()->getValueType(Src, true);
-    if (MemVT.isSimple() && MemVT != MVT::Other) {
-      if (Opcode == Instruction::Store)
-        LA = getTLI()->getTruncStoreAction(LT.second, MemVT.getSimpleVT());
-      else
-        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, MemVT.getSimpleVT());
-    }
-
-    if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
-      // This is a vector load/store for some illegal type that is scalarized.
-      // We must account for the cost of building or decomposing the vector.
-      Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
-                                            Opcode == Instruction::Store);
-    }
-  }
-
-  return Cost;
-}
-
-unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                         ArrayRef<Type *> Tys) const {
-  unsigned ISD = 0;
-  switch (IID) {
-  default: {
-    // Assume that we need to scalarize this intrinsic.
-    unsigned ScalarizationCost = 0;
-    unsigned ScalarCalls = 1;
-    if (RetTy->isVectorTy()) {
-      ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
-      ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
-    }
-    for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
-      if (Tys[i]->isVectorTy()) {
-        ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
-        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
-      }
-    }
-
-    return ScalarCalls + ScalarizationCost;
-  }
-  // Look for intrinsics that can be lowered directly or turned into a scalar
-  // intrinsic call.
-  case Intrinsic::sqrt:    ISD = ISD::FSQRT;  break;
-  case Intrinsic::sin:     ISD = ISD::FSIN;   break;
-  case Intrinsic::cos:     ISD = ISD::FCOS;   break;
-  case Intrinsic::exp:     ISD = ISD::FEXP;   break;
-  case Intrinsic::exp2:    ISD = ISD::FEXP2;  break;
-  case Intrinsic::log:     ISD = ISD::FLOG;   break;
-  case Intrinsic::log10:   ISD = ISD::FLOG10; break;
-  case Intrinsic::log2:    ISD = ISD::FLOG2;  break;
-  case Intrinsic::fabs:    ISD = ISD::FABS;   break;
-  case Intrinsic::minnum:  ISD = ISD::FMINNUM; break;
-  case Intrinsic::maxnum:  ISD = ISD::FMAXNUM; break;
-  case Intrinsic::copysign: ISD = ISD::FCOPYSIGN; break;
-  case Intrinsic::floor:   ISD = ISD::FFLOOR; break;
-  case Intrinsic::ceil:    ISD = ISD::FCEIL;  break;
-  case Intrinsic::trunc:   ISD = ISD::FTRUNC; break;
-  case Intrinsic::nearbyint:
-                           ISD = ISD::FNEARBYINT; break;
-  case Intrinsic::rint:    ISD = ISD::FRINT;  break;
-  case Intrinsic::round:   ISD = ISD::FROUND; break;
-  case Intrinsic::pow:     ISD = ISD::FPOW;   break;
-  case Intrinsic::fma:     ISD = ISD::FMA;    break;
-  case Intrinsic::fmuladd: ISD = ISD::FMA;    break;
-  // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
-  case Intrinsic::lifetime_start:
-  case Intrinsic::lifetime_end:
-    return 0;
-  }
-
-  const TargetLoweringBase *TLI = getTLI();
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(RetTy);
-
-  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
-    // The operation is legal. Assume it costs 1.
-    // If the type is split to multiple registers, assume that there is some
-    // overhead to this.
-    // TODO: Once we have extract/insert subvector cost we need to use them.
-    if (LT.first > 1)
-      return LT.first * 2;
-    return LT.first * 1;
-  }
-
-  if (!TLI->isOperationExpand(ISD, LT.second)) {
-    // If the operation is custom lowered then assume
-    // thare the code is twice as expensive.
-    return LT.first * 2;
-  }
-
-  // If we can't lower fmuladd into an FMA estimate the cost as a floating
-  // point mul followed by an add.
-  if (IID == Intrinsic::fmuladd)
-    return TopTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
-           TopTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
-
-  // Else, assume that we need to scalarize this intrinsic. For math builtins
-  // this will emit a costly libcall, adding call overhead and spills. Make it
-  // very expensive.
-  if (RetTy->isVectorTy()) {
-    unsigned Num = RetTy->getVectorNumElements();
-    unsigned Cost = TopTTI->getIntrinsicInstrCost(IID, RetTy->getScalarType(),
-                                                  Tys);
-    return 10 * Cost * Num;
-  }
-
-  // This is going to be turned into a library call, make it expensive.
-  return 10;
-}
-
-unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
-  std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Tp);
-  return LT.first;
-}
-
-unsigned BasicTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
-  return 0;
-}
+// This flag is used by the template base class for BasicTTIImpl, and here to
+// provide a definition.
+cl::opt<unsigned>
+    llvm::PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
+                                    cl::desc("Threshold for partial unrolling"),
+                                    cl::Hidden);
 
-unsigned BasicTTI::getReductionCost(unsigned Opcode, Type *Ty,
-                                    bool IsPairwise) const {
-  assert(Ty->isVectorTy() && "Expect a vector type");
-  unsigned NumVecElts = Ty->getVectorNumElements();
-  unsigned NumReduxLevels = Log2_32(NumVecElts);
-  unsigned ArithCost = NumReduxLevels *
-    TopTTI->getArithmeticInstrCost(Opcode, Ty);
-  // Assume the pairwise shuffles add a cost.
-  unsigned ShuffleCost =
-      NumReduxLevels * (IsPairwise + 1) *
-      TopTTI->getShuffleCost(SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
-  return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
-}
+BasicTTIImpl::BasicTTIImpl(const TargetMachine *TM, Function &F)
+    : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 2128da1..b8f05cd 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -601,8 +601,7 @@ static bool ProfitableToMerge(MachineBasicBlock *MBB1,
   // instructions that would be deleted in the merge.
   MachineFunction *MF = MBB1->getParent();
   if (EffectiveTailLen >= 2 &&
-      MF->getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+      MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
       (I1 == MBB1->begin() || I2 == MBB2->begin()))
     return true;
 
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 092346b..f21d4d2 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -22,6 +22,7 @@ add_llvm_library(LLVMCodeGen
   ForwardControlFlowIntegrity.cpp
   GCMetadata.cpp
   GCMetadataPrinter.cpp
+  GCRootLowering.cpp
   GCStrategy.cpp
   GlobalMerge.cpp
   IfConversion.cpp
@@ -95,6 +96,7 @@ add_llvm_library(LLVMCodeGen
   ScheduleDAGPrinter.cpp
   ScoreboardHazardRecognizer.cpp
   ShadowStackGC.cpp
+  ShadowStackGCLowering.cpp
   SjLjEHPrepare.cpp
   SlotIndexes.cpp
   SpillPlacement.cpp
@@ -104,6 +106,7 @@ add_llvm_library(LLVMCodeGen
   StackSlotColoring.cpp
   StackMapLivenessAnalysis.cpp
   StackMaps.cpp
+  StatepointExampleGC.cpp
   TailDuplication.cpp
   TargetFrameLoweringImpl.cpp
   TargetInstrInfo.cpp
@@ -115,6 +118,11 @@ add_llvm_library(LLVMCodeGen
   TwoAddressInstructionPass.cpp
   UnreachableBlockElim.cpp
   VirtRegMap.cpp
+  WinEHPrepare.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen/PBQP
   )
 
 add_dependencies(LLVMCodeGen intrinsics_gen)
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index 56ecde0..034ffb3 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -14,9 +14,11 @@
 
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SaveAndRestore.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -178,3 +180,70 @@ void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) {
     llvm_unreachable(nullptr);
   }
 }
+
+static bool isValueTypeInRegForCC(CallingConv::ID CC, MVT VT) {
+  if (VT.isVector())
+    return true; // Assume -msse-regparm might be in effect.
+  if (!VT.isInteger())
+    return false;
+  if (CC == CallingConv::X86_VectorCall || CC == CallingConv::X86_FastCall)
+    return true;
+  return false;
+}
+
+void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs,
+                                          MVT VT, CCAssignFn Fn) {
+  unsigned SavedStackOffset = StackOffset;
+  unsigned NumLocs = Locs.size();
+
+  // Set the 'inreg' flag if it is used for this calling convention.
+  ISD::ArgFlagsTy Flags;
+  if (isValueTypeInRegForCC(CallingConv, VT))
+    Flags.setInReg();
+
+  // Allocate something of this value type repeatedly until we get assigned a
+  // location in memory.
+  bool HaveRegParm = true;
+  while (HaveRegParm) {
+    if (Fn(0, VT, VT, CCValAssign::Full, Flags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Call has unhandled type " << EVT(VT).getEVTString()
+             << " while computing remaining regparms\n";
+#endif
+      llvm_unreachable(nullptr);
+    }
+    HaveRegParm = Locs.back().isRegLoc();
+  }
+
+  // Copy all the registers from the value locations we added.
+  assert(NumLocs < Locs.size() && "CC assignment failed to add location");
+  for (unsigned I = NumLocs, E = Locs.size(); I != E; ++I)
+    if (Locs[I].isRegLoc())
+      Regs.push_back(MCPhysReg(Locs[I].getLocReg()));
+
+  // Clear the assigned values and stack memory. We leave the registers marked
+  // as allocated so that future queries don't return the same registers, i.e.
+  // when i64 and f64 are both passed in GPRs.
+  StackOffset = SavedStackOffset;
+  Locs.resize(NumLocs);
+}
+
+void CCState::analyzeMustTailForwardedRegisters(
+    SmallVectorImpl<ForwardedRegister> &Forwards, ArrayRef<MVT> RegParmTypes,
+    CCAssignFn Fn) {
+  // Oftentimes calling conventions will not user register parameters for
+  // variadic functions, so we need to assume we're not variadic so that we get
+  // all the registers that might be used in a non-variadic call.
+  SaveAndRestore<bool> SavedVarArg(IsVarArg, false);
+
+  for (MVT RegVT : RegParmTypes) {
+    SmallVector<MCPhysReg, 8> RemainingRegs;
+    getRemainingRegParmsForType(RemainingRegs, RegVT, Fn);
+    const TargetLowering *TL = MF.getSubtarget().getTargetLowering();
+    const TargetRegisterClass *RC = TL->getRegClassFor(RegVT);
+    for (MCPhysReg PReg : RemainingRegs) {
+      unsigned VReg = MF.addLiveIn(PReg, RC);
+      Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT));
+    }
+  }
+}
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 307dec5..7c0068e 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -21,7 +21,6 @@ using namespace llvm;
 /// initializeCodeGen - Initialize all passes linked into the CodeGen library.
 void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeAtomicExpandPass(Registry);
-  initializeBasicTTIPass(Registry);
   initializeBranchFolderPassPass(Registry);
   initializeCodeGenPreparePass(Registry);
   initializeDeadMachineInstructionElimPass(Registry);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 8d20848..c0d7dca 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
@@ -30,20 +31,22 @@
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/IR/ValueMap.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
@@ -70,6 +73,10 @@ static cl::opt<bool> DisableBranchOpts(
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
   cl::desc("Disable branch optimizations in CodeGenPrepare"));
 
+static cl::opt<bool>
+    DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
+                  cl::desc("Disable GC optimizations in CodeGenPrepare"));
+
 static cl::opt<bool> DisableSelectToBranch(
   "disable-cgp-select2branch", cl::Hidden, cl::init(false),
   cl::desc("Disable select to branch conversion."));
@@ -90,6 +97,16 @@ static cl::opt<bool> StressStoreExtract(
     "stress-cgp-store-extract", cl::Hidden, cl::init(false),
     cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
 
+static cl::opt<bool> DisableExtLdPromotion(
+    "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
+    cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
+             "CodeGenPrepare"));
+
+static cl::opt<bool> StressExtLdPromotion(
+    "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
+    cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
+             "optimization in CodeGenPrepare"));
+
 namespace {
 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
 struct TypeIsSExt {
@@ -98,6 +115,7 @@ struct TypeIsSExt {
   TypeIsSExt(Type *Ty, bool IsSExt) : Ty(Ty), IsSExt(IsSExt) {}
 };
 typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
+class TypePromotionTransaction;
 
   class CodeGenPrepare : public FunctionPass {
     /// TLI - Keep a pointer of a TargetLowering to consult for determining
@@ -143,8 +161,8 @@ typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addRequired<TargetLibraryInfo>();
-      AU.addRequired<TargetTransformInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
     }
 
   private:
@@ -152,12 +170,12 @@ typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
     bool EliminateMostlyEmptyBlocks(Function &F);
     bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
     void EliminateMostlyEmptyBlock(BasicBlock *BB);
-    bool OptimizeBlock(BasicBlock &BB);
-    bool OptimizeInst(Instruction *I);
+    bool OptimizeBlock(BasicBlock &BB, bool& ModifiedDT);
+    bool OptimizeInst(Instruction *I, bool& ModifiedDT);
     bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
     bool OptimizeInlineAsmInst(CallInst *CS);
-    bool OptimizeCallInst(CallInst *CI);
-    bool MoveExtToFormExtLoad(Instruction *I);
+    bool OptimizeCallInst(CallInst *CI, bool& ModifiedDT);
+    bool MoveExtToFormExtLoad(Instruction *&I);
     bool OptimizeExtUses(Instruction *I);
     bool OptimizeSelectInst(SelectInst *SI);
     bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI);
@@ -165,6 +183,12 @@ typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
     bool DupRetToEnableTailCallOpts(BasicBlock *BB);
     bool PlaceDbgValues(Function &F);
     bool sinkAndCmp(Function &F);
+    bool ExtLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
+                        Instruction *&Inst,
+                        const SmallVectorImpl<Instruction *> &Exts,
+                        unsigned CreatedInst);
+    bool splitBranchCondition(Function &F);
+    bool simplifyOffsetableRelocate(Instruction &I);
   };
 }
 
@@ -187,14 +211,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
 
   ModifiedDT = false;
   if (TM)
-    TLI = TM->getSubtargetImpl()->getTargetLowering();
-  TLInfo = &getAnalysis<TargetLibraryInfo>();
-  TTI = &getAnalysis<TargetTransformInfo>();
+    TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+  TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::OptimizeForSize);
+  OptSize = F.hasFnAttribute(Attribute::OptimizeForSize);
 
   /// This optimization identifies DIV instructions that can be
   /// profitably bypassed and carried out with a shorter, faster divide.
@@ -218,15 +241,23 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   // into a single target instruction, push the mask and compare into branch
   // users. Do this before OptimizeBlock -> OptimizeInst ->
   // OptimizeCmpExpression, which perturbs the pattern being searched for.
-  if (!DisableBranchOpts)
+  if (!DisableBranchOpts) {
     EverMadeChange |= sinkAndCmp(F);
+    EverMadeChange |= splitBranchCondition(F);
+  }
 
   bool MadeChange = true;
   while (MadeChange) {
     MadeChange = false;
     for (Function::iterator I = F.begin(); I != F.end(); ) {
       BasicBlock *BB = I++;
-      MadeChange |= OptimizeBlock(*BB);
+      bool ModifiedDTOnIteration = false;
+      MadeChange |= OptimizeBlock(*BB, ModifiedDTOnIteration);
+
+      // Restart BB iteration if the dominator tree of the Function was changed
+      ModifiedDT |= ModifiedDTOnIteration;
+      if (ModifiedDTOnIteration)
+        break;
     }
     EverMadeChange |= MadeChange;
   }
@@ -236,9 +267,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   if (!DisableBranchOpts) {
     MadeChange = false;
     SmallPtrSet<BasicBlock*, 8> WorkList;
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
-      MadeChange |= ConstantFoldTerminator(BB, true);
+    for (BasicBlock &BB : F) {
+      SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
+      MadeChange |= ConstantFoldTerminator(&BB, true);
       if (!MadeChange) continue;
 
       for (SmallVectorImpl<BasicBlock*>::iterator
@@ -272,6 +303,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
     EverMadeChange |= MadeChange;
   }
 
+  if (!DisableGCOpts) {
+    SmallVector<Instruction *, 2> Statepoints;
+    for (BasicBlock &BB : F)
+      for (Instruction &I : BB)
+        if (isStatepoint(I))
+          Statepoints.push_back(&I);
+    for (auto &I : Statepoints)
+      EverMadeChange |= simplifyOffsetableRelocate(*I);
+  }
+
   if (ModifiedDT && DT)
     DT->recalculate(F);
 
@@ -300,7 +341,7 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) {
       // Remember if SinglePred was the entry block of the function.
       // If so, we will need to move BB back to the entry position.
       bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
-      MergeBasicBlockIntoOnlyPred(BB, this);
+      MergeBasicBlockIntoOnlyPred(BB, DT);
 
       if (isEntry && BB != &BB->getParent()->getEntryBlock())
         BB->moveBefore(&BB->getParent()->getEntryBlock());
@@ -440,7 +481,7 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
       // Remember if SinglePred was the entry block of the function.  If so, we
       // will need to move BB back to the entry position.
       bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
-      MergeBasicBlockIntoOnlyPred(DestBB, this);
+      MergeBasicBlockIntoOnlyPred(DestBB, DT);
 
       if (isEntry && BB != &BB->getParent()->getEntryBlock())
         BB->moveBefore(&BB->getParent()->getEntryBlock());
@@ -495,6 +536,144 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
   DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
 }
 
+// Computes a map of base pointer relocation instructions to corresponding
+// derived pointer relocation instructions given a vector of all relocate calls
+static void computeBaseDerivedRelocateMap(
+    const SmallVectorImpl<User *> &AllRelocateCalls,
+    DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> &
+        RelocateInstMap) {
+  // Collect information in two maps: one primarily for locating the base object
+  // while filling the second map; the second map is the final structure holding
+  // a mapping between Base and corresponding Derived relocate calls
+  DenseMap<std::pair<unsigned, unsigned>, IntrinsicInst *> RelocateIdxMap;
+  for (auto &U : AllRelocateCalls) {
+    GCRelocateOperands ThisRelocate(U);
+    IntrinsicInst *I = cast<IntrinsicInst>(U);
+    auto K = std::make_pair(ThisRelocate.basePtrIndex(),
+                            ThisRelocate.derivedPtrIndex());
+    RelocateIdxMap.insert(std::make_pair(K, I));
+  }
+  for (auto &Item : RelocateIdxMap) {
+    std::pair<unsigned, unsigned> Key = Item.first;
+    if (Key.first == Key.second)
+      // Base relocation: nothing to insert
+      continue;
+
+    IntrinsicInst *I = Item.second;
+    auto BaseKey = std::make_pair(Key.first, Key.first);
+    IntrinsicInst *Base = RelocateIdxMap[BaseKey];
+    if (!Base)
+      // TODO: We might want to insert a new base object relocate and gep off
+      // that, if there are enough derived object relocates.
+      continue;
+    RelocateInstMap[Base].push_back(I);
+  }
+}
+
+// Accepts a GEP and extracts the operands into a vector provided they're all
+// small integer constants
+static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
+                                          SmallVectorImpl<Value *> &OffsetV) {
+  for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
+    // Only accept small constant integer operands
+    auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (!Op || Op->getZExtValue() > 20)
+      return false;
+  }
+
+  for (unsigned i = 1; i < GEP->getNumOperands(); i++)
+    OffsetV.push_back(GEP->getOperand(i));
+  return true;
+}
+
+// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
+// replace, computes a replacement, and affects it.
+static bool
+simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase,
+                          const SmallVectorImpl<IntrinsicInst *> &Targets) {
+  bool MadeChange = false;
+  for (auto &ToReplace : Targets) {
+    GCRelocateOperands MasterRelocate(RelocatedBase);
+    GCRelocateOperands ThisRelocate(ToReplace);
+
+    assert(ThisRelocate.basePtrIndex() == MasterRelocate.basePtrIndex() &&
+           "Not relocating a derived object of the original base object");
+    if (ThisRelocate.basePtrIndex() == ThisRelocate.derivedPtrIndex()) {
+      // A duplicate relocate call. TODO: coalesce duplicates.
+      continue;
+    }
+
+    Value *Base = ThisRelocate.basePtr();
+    auto Derived = dyn_cast<GetElementPtrInst>(ThisRelocate.derivedPtr());
+    if (!Derived || Derived->getPointerOperand() != Base)
+      continue;
+
+    SmallVector<Value *, 2> OffsetV;
+    if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
+      continue;
+
+    // Create a Builder and replace the target callsite with a gep
+    IRBuilder<> Builder(ToReplace);
+    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
+    Value *Replacement =
+        Builder.CreateGEP(RelocatedBase, makeArrayRef(OffsetV));
+    Instruction *ReplacementInst = cast<Instruction>(Replacement);
+    ReplacementInst->removeFromParent();
+    ReplacementInst->insertAfter(RelocatedBase);
+    Replacement->takeName(ToReplace);
+    ToReplace->replaceAllUsesWith(Replacement);
+    ToReplace->eraseFromParent();
+
+    MadeChange = true;
+  }
+  return MadeChange;
+}
+
+// Turns this:
+//
+// %base = ...
+// %ptr = gep %base + 15
+// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
+// %base' = relocate(%tok, i32 4, i32 4)
+// %ptr' = relocate(%tok, i32 4, i32 5)
+// %val = load %ptr'
+//
+// into this:
+//
+// %base = ...
+// %ptr = gep %base + 15
+// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
+// %base' = gc.relocate(%tok, i32 4, i32 4)
+// %ptr' = gep %base' + 15
+// %val = load %ptr'
+bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
+  bool MadeChange = false;
+  SmallVector<User *, 2> AllRelocateCalls;
+
+  for (auto *U : I.users())
+    if (isGCRelocate(dyn_cast<Instruction>(U)))
+      // Collect all the relocate calls associated with a statepoint
+      AllRelocateCalls.push_back(U);
+
+  // We need atleast one base pointer relocation + one derived pointer
+  // relocation to mangle
+  if (AllRelocateCalls.size() < 2)
+    return false;
+
+  // RelocateInstMap is a mapping from the base relocate instruction to the
+  // corresponding derived relocate instructions
+  DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> RelocateInstMap;
+  computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
+  if (RelocateInstMap.empty())
+    return false;
+
+  for (auto &Item : RelocateInstMap)
+    // Item.first is the RelocatedBase to offset against
+    // Item.second is the vector of Targets to replace
+    MadeChange = simplifyRelocatesOffABase(Item.first, Item.second);
+  return MadeChange;
+}
+
 /// SinkCast - Sink the specified cast instruction into its user blocks
 static bool SinkCast(CastInst *CI) {
   BasicBlock *DefBB = CI->getParent();
@@ -822,23 +1001,211 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
   return MadeChange;
 }
 
-namespace {
-class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
-protected:
-  void replaceCall(Value *With) override {
-    CI->replaceAllUsesWith(With);
-    CI->eraseFromParent();
+//  ScalarizeMaskedLoad() translates masked load intrinsic, like 
+// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
+//                               <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks, whith loading element one-by-one if
+// the appropriate mask bit is set
+// 
+//  %1 = bitcast i8* %addr to i32*
+//  %2 = extractelement <16 x i1> %mask, i32 0
+//  %3 = icmp eq i1 %2, true
+//  br i1 %3, label %cond.load, label %else
+//
+//cond.load:                                        ; preds = %0
+//  %4 = getelementptr i32* %1, i32 0
+//  %5 = load i32* %4
+//  %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+//  br label %else
+//
+//else:                                             ; preds = %0, %cond.load
+//  %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+//  %7 = extractelement <16 x i1> %mask, i32 1
+//  %8 = icmp eq i1 %7, true
+//  br i1 %8, label %cond.load1, label %else2
+//
+//cond.load1:                                       ; preds = %else
+//  %9 = getelementptr i32* %1, i32 1
+//  %10 = load i32* %9
+//  %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+//  br label %else2
+//
+//else2:                                            ; preds = %else, %cond.load1
+//  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+//  %12 = extractelement <16 x i1> %mask, i32 2
+//  %13 = icmp eq i1 %12, true
+//  br i1 %13, label %cond.load4, label %else5
+//
+static void ScalarizeMaskedLoad(CallInst *CI) {
+  Value *Ptr  = CI->getArgOperand(0);
+  Value *Src0 = CI->getArgOperand(3);
+  Value *Mask = CI->getArgOperand(2);
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+  Type *EltTy = VecType->getElementType();
+
+  assert(VecType && "Unexpected return type of masked load intrinsic");
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr fron i8* to EltTy*
+  Type *NewPtrType =
+    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_load = icmp eq i1 %mask_1, true
+    //  br i1 %to_load, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %Elt = load i32* %EltAddr
+    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+    
+    Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    LoadInst* Load = Builder.CreateLoad(Gep, false);
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
   }
-  bool isFoldable(unsigned SizeCIOp, unsigned, bool) const override {
-      if (ConstantInt *SizeCI =
-                             dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp)))
-        return SizeCI->isAllOnesValue();
-    return false;
+
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
+
+//  ScalarizeMaskedStore() translates masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+//                               <16 x i1> %mask)
+// to a chain of basic blocks, that stores element one-by-one if
+// the appropriate mask bit is set
+//
+//   %1 = bitcast i8* %addr to i32*
+//   %2 = extractelement <16 x i1> %mask, i32 0
+//   %3 = icmp eq i1 %2, true
+//   br i1 %3, label %cond.store, label %else
+//
+// cond.store:                                       ; preds = %0
+//   %4 = extractelement <16 x i32> %val, i32 0
+//   %5 = getelementptr i32* %1, i32 0
+//   store i32 %4, i32* %5
+//   br label %else
+// 
+// else:                                             ; preds = %0, %cond.store
+//   %6 = extractelement <16 x i1> %mask, i32 1
+//   %7 = icmp eq i1 %6, true
+//   br i1 %7, label %cond.store1, label %else2
+// 
+// cond.store1:                                      ; preds = %else
+//   %8 = extractelement <16 x i32> %val, i32 1
+//   %9 = getelementptr i32* %1, i32 1
+//   store i32 %8, i32* %9
+//   br label %else2
+//   . . .
+static void ScalarizeMaskedStore(CallInst *CI) {
+  Value *Ptr  = CI->getArgOperand(1);
+  Value *Src = CI->getArgOperand(0);
+  Value *Mask = CI->getArgOperand(3);
+
+  VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+  Type *EltTy = VecType->getElementType();
+
+  assert(VecType && "Unexpected data type in masked store intrinsic");
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr fron i8* to EltTy*
+  Type *NewPtrType =
+    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_store = icmp eq i1 %mask_1, true
+    //  br i1 %to_load, label %cond.store, label %else
+    //
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %OneElt = extractelement <16 x i32> %Src, i32 Idx
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %store i32 %OneElt, i32* %EltAddr
+    //
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+    
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+    Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    Builder.CreateStore(OneElt, Gep);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
   }
-};
-} // end anonymous namespace
+  CI->eraseFromParent();
+}
 
-bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
+bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
   // Lower inline assembly if we can.
@@ -858,38 +1225,60 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
       return true;
   }
 
-  // Lower all uses of llvm.objectsize.*
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
-  if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
-    bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
-    Type *ReturnTy = CI->getType();
-    Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
-
-    // Substituting this can cause recursive simplifications, which can
-    // invalidate our iterator.  Use a WeakVH to hold onto it in case this
-    // happens.
-    WeakVH IterHandle(CurInstIterator);
+  if (II) {
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::objectsize: {
+      // Lower all uses of llvm.objectsize.*
+      bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
+      Type *ReturnTy = CI->getType();
+      Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
+
+      // Substituting this can cause recursive simplifications, which can
+      // invalidate our iterator.  Use a WeakVH to hold onto it in case this
+      // happens.
+      WeakVH IterHandle(CurInstIterator);
+
+      replaceAndRecursivelySimplify(CI, RetVal,
+                                    TLI ? TLI->getDataLayout() : nullptr,
+                                    TLInfo, ModifiedDT ? nullptr : DT);
 
-    replaceAndRecursivelySimplify(CI, RetVal,
-                                  TLI ? TLI->getDataLayout() : nullptr,
-                                  TLInfo, ModifiedDT ? nullptr : DT);
-
-    // If the iterator instruction was recursively deleted, start over at the
-    // start of the block.
-    if (IterHandle != CurInstIterator) {
-      CurInstIterator = BB->begin();
-      SunkAddrs.clear();
+      // If the iterator instruction was recursively deleted, start over at the
+      // start of the block.
+      if (IterHandle != CurInstIterator) {
+        CurInstIterator = BB->begin();
+        SunkAddrs.clear();
+      }
+      return true;
+    }
+    case Intrinsic::masked_load: {
+      // Scalarize unsupported vector masked load
+      if (!TTI->isLegalMaskedLoad(CI->getType(), 1)) {
+        ScalarizeMaskedLoad(CI);
+        ModifiedDT = true;
+        return true;
+      }
+      return false;
+    }
+    case Intrinsic::masked_store: {
+      if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), 1)) {
+        ScalarizeMaskedStore(CI);
+        ModifiedDT = true;
+        return true;
+      }
+      return false;
+    }
     }
-    return true;
-  }
 
-  if (II && TLI) {
-    SmallVector<Value*, 2> PtrOps;
-    Type *AccessTy;
-    if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
-      while (!PtrOps.empty())
-        if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
-          return true;
+    if (TLI) {
+      SmallVector<Value*, 2> PtrOps;
+      Type *AccessTy;
+      if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
+        while (!PtrOps.empty())
+          if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
+            return true;
+    }
   }
 
   // From here on out we're working with named functions.
@@ -901,10 +1290,15 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
 
   // Lower all default uses of _chk calls.  This is very similar
   // to what InstCombineCalls does, but here we are only lowering calls
-  // that have the default "don't know" as the objectsize.  Anything else
-  // should be left alone.
-  CodeGenPrepareFortifiedLibCalls Simplifier;
-  return Simplifier.fold(CI, TD, TLInfo);
+  // to fortified library functions (e.g. __memcpy_chk) that have the default
+  // "don't know" as the objectsize.  Anything else should be left alone.
+  FortifiedLibCallSimplifier Simplifier(TD, TLInfo, true);
+  if (Value *V = Simplifier.optimizeCall(CI)) {
+    CI->replaceAllUsesWith(V);
+    CI->eraseFromParent();
+    return true;
+  }
+  return false;
 }
 
 /// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return
@@ -1561,6 +1955,7 @@ void TypePromotionTransaction::rollback(
 /// This encapsulates the logic for matching the target-legal addressing modes.
 class AddressingModeMatcher {
   SmallVectorImpl<Instruction*> &AddrModeInsts;
+  const TargetMachine &TM;
   const TargetLowering &TLI;
 
   /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
@@ -1584,13 +1979,15 @@ class AddressingModeMatcher {
   /// always returns true.
   bool IgnoreProfitability;
 
-  AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI,
-                        const TargetLowering &T, Type *AT,
-                        Instruction *MI, ExtAddrMode &AM,
-                        const SetOfInstrs &InsertedTruncs,
+  AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
+                        const TargetMachine &TM, Type *AT, Instruction *MI,
+                        ExtAddrMode &AM, const SetOfInstrs &InsertedTruncs,
                         InstrToOrigTy &PromotedInsts,
                         TypePromotionTransaction &TPT)
-      : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM),
+      : AddrModeInsts(AMI), TM(TM),
+        TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent())
+                 ->getTargetLowering()),
+        AccessTy(AT), MemoryInst(MI), AddrMode(AM),
         InsertedTruncs(InsertedTruncs), PromotedInsts(PromotedInsts), TPT(TPT) {
     IgnoreProfitability = false;
   }
@@ -1607,13 +2004,13 @@ public:
   static ExtAddrMode Match(Value *V, Type *AccessTy,
                            Instruction *MemoryInst,
                            SmallVectorImpl<Instruction*> &AddrModeInsts,
-                           const TargetLowering &TLI,
+                           const TargetMachine &TM,
                            const SetOfInstrs &InsertedTruncs,
                            InstrToOrigTy &PromotedInsts,
                            TypePromotionTransaction &TPT) {
     ExtAddrMode Result;
 
-    bool Success = AddressingModeMatcher(AddrModeInsts, TLI, AccessTy,
+    bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy,
                                          MemoryInst, Result, InsertedTruncs,
                                          PromotedInsts, TPT).MatchAddr(V, 0);
     (void)Success; assert(Success && "Couldn't select *anything*?");
@@ -1718,6 +2115,23 @@ static bool MightBeFoldableInst(Instruction *I) {
   }
 }
 
+/// \brief Check whether or not \p Val is a legal instruction for \p TLI.
+/// \note \p Val is assumed to be the product of some type promotion.
+/// Therefore if \p Val has an undefined state in \p TLI, this is assumed
+/// to be legal, as the non-promoted value would have had the same state.
+static bool isPromotedInstructionLegal(const TargetLowering &TLI, Value *Val) {
+  Instruction *PromotedInst = dyn_cast<Instruction>(Val);
+  if (!PromotedInst)
+    return false;
+  int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
+  // If the ISDOpcode is undefined, it was undefined before the promotion.
+  if (!ISDOpcode)
+    return true;
+  // Otherwise, check if the promoted instruction is legal or not.
+  return TLI.isOperationLegalOrCustom(
+      ISDOpcode, TLI.getValueType(PromotedInst->getType()));
+}
+
 /// \brief Hepler class to perform type promotion.
 class TypePromotionHelper {
   /// \brief Utility function to check whether or not a sign or zero extension
@@ -1747,46 +2161,59 @@ class TypePromotionHelper {
   /// \p PromotedInsts maps the instructions to their type before promotion.
   /// \p CreatedInsts[out] contains how many non-free instructions have been
   /// created to promote the operand of Ext.
+  /// Newly added extensions are inserted in \p Exts.
+  /// Newly added truncates are inserted in \p Truncs.
   /// Should never be called directly.
   /// \return The promoted value which is used instead of Ext.
-  static Value *promoteOperandForTruncAndAnyExt(Instruction *Ext,
-                                                TypePromotionTransaction &TPT,
-                                                InstrToOrigTy &PromotedInsts,
-                                                unsigned &CreatedInsts);
+  static Value *promoteOperandForTruncAndAnyExt(
+      Instruction *Ext, TypePromotionTransaction &TPT,
+      InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+      SmallVectorImpl<Instruction *> *Exts,
+      SmallVectorImpl<Instruction *> *Truncs);
 
   /// \brief Utility function to promote the operand of \p Ext when this
   /// operand is promotable and is not a supported trunc or sext.
   /// \p PromotedInsts maps the instructions to their type before promotion.
   /// \p CreatedInsts[out] contains how many non-free instructions have been
   /// created to promote the operand of Ext.
+  /// Newly added extensions are inserted in \p Exts.
+  /// Newly added truncates are inserted in \p Truncs.
   /// Should never be called directly.
   /// \return The promoted value which is used instead of Ext.
-  static Value *promoteOperandForOther(Instruction *Ext,
-                                       TypePromotionTransaction &TPT,
-                                       InstrToOrigTy &PromotedInsts,
-                                       unsigned &CreatedInsts, bool IsSExt);
+  static Value *
+  promoteOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT,
+                         InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+                         SmallVectorImpl<Instruction *> *Exts,
+                         SmallVectorImpl<Instruction *> *Truncs, bool IsSExt);
 
   /// \see promoteOperandForOther.
-  static Value *signExtendOperandForOther(Instruction *Ext,
-                                          TypePromotionTransaction &TPT,
-                                          InstrToOrigTy &PromotedInsts,
-                                          unsigned &CreatedInsts) {
-    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, true);
+  static Value *
+  signExtendOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT,
+                            InstrToOrigTy &PromotedInsts,
+                            unsigned &CreatedInsts,
+                            SmallVectorImpl<Instruction *> *Exts,
+                            SmallVectorImpl<Instruction *> *Truncs) {
+    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, Exts,
+                                  Truncs, true);
   }
 
   /// \see promoteOperandForOther.
-  static Value *zeroExtendOperandForOther(Instruction *Ext,
-                                          TypePromotionTransaction &TPT,
-                                          InstrToOrigTy &PromotedInsts,
-                                          unsigned &CreatedInsts) {
-    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, false);
+  static Value *
+  zeroExtendOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT,
+                            InstrToOrigTy &PromotedInsts,
+                            unsigned &CreatedInsts,
+                            SmallVectorImpl<Instruction *> *Exts,
+                            SmallVectorImpl<Instruction *> *Truncs) {
+    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, Exts,
+                                  Truncs, false);
   }
 
 public:
   /// Type for the utility function that promotes the operand of Ext.
   typedef Value *(*Action)(Instruction *Ext, TypePromotionTransaction &TPT,
-                           InstrToOrigTy &PromotedInsts,
-                           unsigned &CreatedInsts);
+                           InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+                           SmallVectorImpl<Instruction *> *Exts,
+                           SmallVectorImpl<Instruction *> *Truncs);
   /// \brief Given a sign/zero extend instruction \p Ext, return the approriate
   /// action to promote the operand of \p Ext instead of using Ext.
   /// \return NULL if no promotable action is possible with the current
@@ -1805,6 +2232,12 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
                                         Type *ConsideredExtType,
                                         const InstrToOrigTy &PromotedInsts,
                                         bool IsSExt) {
+  // The promotion helper does not know how to deal with vector types yet.
+  // To be able to fix that, we would need to fix the places where we
+  // statically extend, e.g., constants and such.
+  if (Inst->getType()->isVectorTy())
+    return false;
+
   // We can always get through zext.
   if (isa<ZExtInst>(Inst))
     return true;
@@ -1830,8 +2263,9 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
   // Check if we can use this operand in the extension.
   // If the type is larger than the result type of the extension,
   // we cannot.
-  if (OpndVal->getType()->getIntegerBitWidth() >
-      ConsideredExtType->getIntegerBitWidth())
+  if (!OpndVal->getType()->isIntegerTy() ||
+      OpndVal->getType()->getIntegerBitWidth() >
+          ConsideredExtType->getIntegerBitWidth())
     return false;
 
   // If the operand of the truncate is not an instruction, we will not have
@@ -1896,7 +2330,9 @@ TypePromotionHelper::Action TypePromotionHelper::getAction(
 
 Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
     llvm::Instruction *SExt, TypePromotionTransaction &TPT,
-    InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts) {
+    InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+    SmallVectorImpl<Instruction *> *Exts,
+    SmallVectorImpl<Instruction *> *Truncs) {
   // By construction, the operand of SExt is an instruction. Otherwise we cannot
   // get through it and this method should not be called.
   Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
@@ -1922,8 +2358,11 @@ Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
 
   // Check if the extension is still needed.
   Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
-  if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType())
+  if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
+    if (ExtInst && Exts)
+      Exts->push_back(ExtInst);
     return ExtVal;
+  }
 
   // At this point we have: ext ty opnd to ty.
   // Reassign the uses of ExtInst to the opnd and remove ExtInst.
@@ -1934,7 +2373,9 @@ Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
 
 Value *TypePromotionHelper::promoteOperandForOther(
     Instruction *Ext, TypePromotionTransaction &TPT,
-    InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, bool IsSExt) {
+    InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+    SmallVectorImpl<Instruction *> *Exts,
+    SmallVectorImpl<Instruction *> *Truncs, bool IsSExt) {
   // By construction, the operand of Ext is an instruction. Otherwise we cannot
   // get through it and this method should not be called.
   Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
@@ -1949,6 +2390,8 @@ Value *TypePromotionHelper::promoteOperandForOther(
       ITrunc->removeFromParent();
       // Insert it just after the definition.
       ITrunc->insertAfter(ExtOpnd);
+      if (Truncs)
+        Truncs->push_back(ITrunc);
     }
 
     TPT.replaceAllUsesWith(ExtOpnd, Trunc);
@@ -2004,12 +2447,17 @@ Value *TypePromotionHelper::promoteOperandForOther(
     if (!ExtForOpnd) {
       // If yes, create a new one.
       DEBUG(dbgs() << "More operands to ext\n");
-      ExtForOpnd =
-          cast<Instruction>(IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
-                                   : TPT.createZExt(Ext, Opnd, Ext->getType()));
+      Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
+        : TPT.createZExt(Ext, Opnd, Ext->getType());
+      if (!isa<Instruction>(ValForExtOpnd)) {
+        TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
+        continue;
+      }
+      ExtForOpnd = cast<Instruction>(ValForExtOpnd);
       ++CreatedInsts;
     }
-
+    if (Exts)
+      Exts->push_back(ExtForOpnd);
     TPT.setOperand(ExtForOpnd, 0, Opnd);
 
     // Move the sign extension before the insertion point.
@@ -2047,16 +2495,7 @@ AddressingModeMatcher::IsPromotionProfitable(unsigned MatchedSize,
   // The promotion is neutral but it may help folding the sign extension in
   // loads for instance.
   // Check that we did not create an illegal instruction.
-  Instruction *PromotedInst = dyn_cast<Instruction>(PromotedOperand);
-  if (!PromotedInst)
-    return false;
-  int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
-  // If the ISDOpcode is undefined, it was undefined before the promotion.
-  if (!ISDOpcode)
-    return true;
-  // Otherwise, check if the promoted instruction is legal or not.
-  return TLI.isOperationLegalOrCustom(
-      ISDOpcode, TLI.getValueType(PromotedInst->getType()));
+  return isPromotedInstructionLegal(TLI, PromotedOperand);
 }
 
 /// MatchOperationAddr - Given an instruction or constant expr, see if we can
@@ -2250,7 +2689,8 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
         TPT.getRestorationPoint();
     unsigned CreatedInsts = 0;
-    Value *PromotedOperand = TPH(Ext, TPT, PromotedInsts, CreatedInsts);
+    Value *PromotedOperand =
+        TPH(Ext, TPT, PromotedInsts, CreatedInsts, nullptr, nullptr);
     // SExt has been moved away.
     // Thus either it will be rematched later in the recursive calls or it is
     // gone. Anyway, we must not fold it into the addressing mode at this point.
@@ -2374,13 +2814,17 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
 /// inline asm call are due to memory operands.  If so, return true, otherwise
 /// return false.
 static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
-                                    const TargetLowering &TLI) {
-  TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI));
+                                    const TargetMachine &TM) {
+  const Function *F = CI->getParent()->getParent();
+  const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering();
+  const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo();
+  TargetLowering::AsmOperandInfoVector TargetConstraints =
+      TLI->ParseConstraints(TRI, ImmutableCallSite(CI));
   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
 
     // Compute the constraint code and ConstraintType to use.
-    TLI.ComputeConstraintToUse(OpInfo, SDValue());
+    TLI->ComputeConstraintToUse(OpInfo, SDValue());
 
     // If this asm operand is our Value*, and if it isn't an indirect memory
     // operand, we can't fold it!
@@ -2396,10 +2840,10 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
 /// FindAllMemoryUses - Recursively walk all the uses of I until we find a
 /// memory use.  If we find an obviously non-foldable instruction, return true.
 /// Add the ultimately found memory instructions to MemoryUses.
-static bool FindAllMemoryUses(Instruction *I,
-                SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
-                              SmallPtrSetImpl<Instruction*> &ConsideredInsts,
-                              const TargetLowering &TLI) {
+static bool FindAllMemoryUses(
+    Instruction *I,
+    SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
+    SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) {
   // If we already considered this instruction, we're done.
   if (!ConsideredInsts.insert(I).second)
     return false;
@@ -2429,12 +2873,12 @@ static bool FindAllMemoryUses(Instruction *I,
       if (!IA) return true;
 
       // If this is a memory operand, we're cool, otherwise bail out.
-      if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
+      if (!IsOperandAMemoryOperand(CI, IA, I, TM))
         return true;
       continue;
     }
 
-    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI))
+    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM))
       return true;
   }
 
@@ -2522,7 +2966,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
   // uses.
   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
   SmallPtrSet<Instruction*, 16> ConsideredInsts;
-  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
+  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
     return false;  // Has a non-memory, non-foldable use!
 
   // Now that we know that all uses of this instruction are part of a chain of
@@ -2547,7 +2991,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
     ExtAddrMode Result;
     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
         TPT.getRestorationPoint();
-    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy,
+    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy,
                                   MemoryInst, Result, InsertedTruncs,
                                   PromotedInsts, TPT);
     Matcher.IgnoreProfitability = true;
@@ -2630,7 +3074,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
     // For non-PHIs, determine the addressing mode being computed.
     SmallVector<Instruction*, 16> NewAddrModeInsts;
     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
-        V, AccessTy, MemoryInst, NewAddrModeInsts, *TLI, InsertedTruncsSet,
+        V, AccessTy, MemoryInst, NewAddrModeInsts, *TM, InsertedTruncsSet,
         PromotedInsts, TPT);
 
     // This check is broken into two cases with very similar code to avoid using
@@ -2705,8 +3149,10 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                  << *MemoryInst << "\n");
     if (SunkAddr->getType() != Addr->getType())
       SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
-  } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
-               TM && TM->getSubtarget<TargetSubtargetInfo>().useAA())) {
+  } else if (AddrSinkUsingGEPs ||
+             (!AddrSinkUsingGEPs.getNumOccurrences() && TM &&
+              TM->getSubtargetImpl(*MemoryInst->getParent()->getParent())
+                  ->useAA())) {
     // By default, we use the GEP-based method when AA is used later. This
     // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
     DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
@@ -2929,8 +3375,10 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
 bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
   bool MadeChange = false;
 
+  const TargetRegisterInfo *TRI =
+      TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo();
   TargetLowering::AsmOperandInfoVector
-    TargetConstraints = TLI->ParseConstraints(CS);
+    TargetConstraints = TLI->ParseConstraints(TRI, CS);
   unsigned ArgNo = 0;
   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
@@ -2949,26 +3397,186 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
   return MadeChange;
 }
 
+/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or
+/// sign extensions.
+static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
+  assert(!Inst->use_empty() && "Input must have at least one use");
+  const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin());
+  bool IsSExt = isa<SExtInst>(FirstUser);
+  Type *ExtTy = FirstUser->getType();
+  for (const User *U : Inst->users()) {
+    const Instruction *UI = cast<Instruction>(U);
+    if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
+      return false;
+    Type *CurTy = UI->getType();
+    // Same input and output types: Same instruction after CSE.
+    if (CurTy == ExtTy)
+      continue;
+
+    // If IsSExt is true, we are in this situation:
+    // a = Inst
+    // b = sext ty1 a to ty2
+    // c = sext ty1 a to ty3
+    // Assuming ty2 is shorter than ty3, this could be turned into:
+    // a = Inst
+    // b = sext ty1 a to ty2
+    // c = sext ty2 b to ty3
+    // However, the last sext is not free.
+    if (IsSExt)
+      return false;
+
+    // This is a ZExt, maybe this is free to extend from one type to another.
+    // In that case, we would not account for a different use.
+    Type *NarrowTy;
+    Type *LargeTy;
+    if (ExtTy->getScalarType()->getIntegerBitWidth() >
+        CurTy->getScalarType()->getIntegerBitWidth()) {
+      NarrowTy = CurTy;
+      LargeTy = ExtTy;
+    } else {
+      NarrowTy = ExtTy;
+      LargeTy = CurTy;
+    }
+
+    if (!TLI.isZExtFree(NarrowTy, LargeTy))
+      return false;
+  }
+  // All uses are the same or can be derived from one another for free.
+  return true;
+}
+
+/// \brief Try to form ExtLd by promoting \p Exts until they reach a
+/// load instruction.
+/// If an ext(load) can be formed, it is returned via \p LI for the load
+/// and \p Inst for the extension.
+/// Otherwise LI == nullptr and Inst == nullptr.
+/// When some promotion happened, \p TPT contains the proper state to
+/// revert them.
+///
+/// \return true when promoting was necessary to expose the ext(load)
+/// opportunity, false otherwise.
+///
+/// Example:
+/// \code
+/// %ld = load i32* %addr
+/// %add = add nuw i32 %ld, 4
+/// %zext = zext i32 %add to i64
+/// \endcode
+/// =>
+/// \code
+/// %ld = load i32* %addr
+/// %zext = zext i32 %ld to i64
+/// %add = add nuw i64 %zext, 4
+/// \encode
+/// Thanks to the promotion, we can match zext(load i32*) to i64.
+bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
+                                    LoadInst *&LI, Instruction *&Inst,
+                                    const SmallVectorImpl<Instruction *> &Exts,
+                                    unsigned CreatedInsts = 0) {
+  // Iterate over all the extensions to see if one form an ext(load).
+  for (auto I : Exts) {
+    // Check if we directly have ext(load).
+    if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) {
+      Inst = I;
+      // No promotion happened here.
+      return false;
+    }
+    // Check whether or not we want to do any promotion.
+    if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
+      continue;
+    // Get the action to perform the promotion.
+    TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(
+        I, InsertedTruncsSet, *TLI, PromotedInsts);
+    // Check if we can promote.
+    if (!TPH)
+      continue;
+    // Save the current state.
+    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+        TPT.getRestorationPoint();
+    SmallVector<Instruction *, 4> NewExts;
+    unsigned NewCreatedInsts = 0;
+    // Promote.
+    Value *PromotedVal =
+        TPH(I, TPT, PromotedInsts, NewCreatedInsts, &NewExts, nullptr);
+    assert(PromotedVal &&
+           "TypePromotionHelper should have filtered out those cases");
+
+    // We would be able to merge only one extension in a load.
+    // Therefore, if we have more than 1 new extension we heuristically
+    // cut this search path, because it means we degrade the code quality.
+    // With exactly 2, the transformation is neutral, because we will merge
+    // one extension but leave one. However, we optimistically keep going,
+    // because the new extension may be removed too.
+    unsigned TotalCreatedInsts = CreatedInsts + NewCreatedInsts;
+    if (!StressExtLdPromotion &&
+        (TotalCreatedInsts > 1 ||
+         !isPromotedInstructionLegal(*TLI, PromotedVal))) {
+      // The promotion is not profitable, rollback to the previous state.
+      TPT.rollback(LastKnownGood);
+      continue;
+    }
+    // The promotion is profitable.
+    // Check if it exposes an ext(load).
+    (void)ExtLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInsts);
+    if (LI && (StressExtLdPromotion || NewCreatedInsts == 0 ||
+               // If we have created a new extension, i.e., now we have two
+               // extensions. We must make sure one of them is merged with
+               // the load, otherwise we may degrade the code quality.
+               (LI->hasOneUse() || hasSameExtUse(LI, *TLI))))
+      // Promotion happened.
+      return true;
+    // If this does not help to expose an ext(load) then, rollback.
+    TPT.rollback(LastKnownGood);
+  }
+  // None of the extension can form an ext(load).
+  LI = nullptr;
+  Inst = nullptr;
+  return false;
+}
+
 /// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same
 /// basic block as the load, unless conditions are unfavorable. This allows
 /// SelectionDAG to fold the extend into the load.
+/// \p I[in/out] the extension may be modified during the process if some
+/// promotions apply.
 ///
-bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) {
+bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
+  // Try to promote a chain of computation if it allows to form
+  // an extended load.
+  TypePromotionTransaction TPT;
+  TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+    TPT.getRestorationPoint();
+  SmallVector<Instruction *, 1> Exts;
+  Exts.push_back(I);
   // Look for a load being extended.
-  LoadInst *LI = dyn_cast<LoadInst>(I->getOperand(0));
-  if (!LI) return false;
+  LoadInst *LI = nullptr;
+  Instruction *OldExt = I;
+  bool HasPromoted = ExtLdPromotion(TPT, LI, I, Exts);
+  if (!LI || !I) {
+    assert(!HasPromoted && !LI && "If we did not match any load instruction "
+                                  "the code must remain the same");
+    I = OldExt;
+    return false;
+  }
 
   // If they're already in the same block, there's nothing to do.
-  if (LI->getParent() == I->getParent())
+  // Make the cheap checks first if we did not promote.
+  // If we promoted, we need to check if it is indeed profitable.
+  if (!HasPromoted && LI->getParent() == I->getParent())
     return false;
 
+  EVT VT = TLI->getValueType(I->getType());
+  EVT LoadVT = TLI->getValueType(LI->getType());
+
   // If the load has other users and the truncate is not free, this probably
   // isn't worthwhile.
-  if (!LI->hasOneUse() &&
-      TLI && (TLI->isTypeLegal(TLI->getValueType(LI->getType())) ||
-              !TLI->isTypeLegal(TLI->getValueType(I->getType()))) &&
-      !TLI->isTruncateFree(I->getType(), LI->getType()))
+  if (!LI->hasOneUse() && TLI &&
+      (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) &&
+      !TLI->isTruncateFree(I->getType(), LI->getType())) {
+    I = OldExt;
+    TPT.rollback(LastKnownGood);
     return false;
+  }
 
   // Check whether the target supports casts folded into loads.
   unsigned LType;
@@ -2978,11 +3586,15 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) {
     assert(isa<SExtInst>(I) && "Unexpected ext type!");
     LType = ISD::SEXTLOAD;
   }
-  if (TLI && !TLI->isLoadExtLegal(LType, TLI->getValueType(LI->getType())))
+  if (TLI && !TLI->isLoadExtLegal(LType, VT, LoadVT)) {
+    I = OldExt;
+    TPT.rollback(LastKnownGood);
     return false;
+  }
 
   // Move the extend into the same block as the load, so that SelectionDAG
   // can fold it.
+  TPT.commit();
   I->removeFromParent();
   I->insertAfter(LI);
   ++NumExtsMoved;
@@ -3512,7 +4124,8 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
           isa<UndefValue>(Val) ||
               canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
     } else
-      assert(0 && "Did you modified shouldPromote and forgot to update this?");
+      llvm_unreachable("Did you modified shouldPromote and forgot to update "
+                       "this?");
     ToBePromoted->setOperand(U.getOperandNo(), NewVal);
   }
   Transition->removeFromParent();
@@ -3575,7 +4188,7 @@ bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
   return false;
 }
 
-bool CodeGenPrepare::OptimizeInst(Instruction *I) {
+bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
@@ -3654,14 +4267,14 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
       GEPI->replaceAllUsesWith(NC);
       GEPI->eraseFromParent();
       ++NumGEPsElim;
-      OptimizeInst(NC);
+      OptimizeInst(NC, ModifiedDT);
       return true;
     }
     return false;
   }
 
   if (CallInst *CI = dyn_cast<CallInst>(I))
-    return OptimizeCallInst(CI);
+    return OptimizeCallInst(CI, ModifiedDT);
 
   if (SelectInst *SI = dyn_cast<SelectInst>(I))
     return OptimizeSelectInst(SI);
@@ -3678,14 +4291,16 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
   SunkAddrs.clear();
   bool MadeChange = false;
 
   CurInstIterator = BB.begin();
-  while (CurInstIterator != BB.end())
-    MadeChange |= OptimizeInst(CurInstIterator++);
-
+  while (CurInstIterator != BB.end()) {
+    MadeChange |= OptimizeInst(CurInstIterator++, ModifiedDT);
+    if (ModifiedDT)
+      return true;
+  }
   MadeChange |= DupRetToEnableTailCallOpts(&BB);
 
   return MadeChange;
@@ -3696,10 +4311,10 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
 // find a node corresponding to the value.
 bool CodeGenPrepare::PlaceDbgValues(Function &F) {
   bool MadeChange = false;
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+  for (BasicBlock &BB : F) {
     Instruction *PrevNonDbgInst = nullptr;
-    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
-      Instruction *Insn = BI; ++BI;
+    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+      Instruction *Insn = BI++;
       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
       // Leave dbg.values that refer to an alloca alone. These
       // instrinsics describe the address of a variable (= the alloca)
@@ -3793,3 +4408,233 @@ bool CodeGenPrepare::sinkAndCmp(Function &F) {
   }
   return MadeChange;
 }
+
+/// \brief Retrieve the probabilities of a conditional branch. Returns true on
+/// success, or returns false if no or invalid metadata was found.
+static bool extractBranchMetadata(BranchInst *BI,
+                                  uint64_t &ProbTrue, uint64_t &ProbFalse) {
+  assert(BI->isConditional() &&
+         "Looking for probabilities on unconditional branch?");
+  auto *ProfileData = BI->getMetadata(LLVMContext::MD_prof);
+  if (!ProfileData || ProfileData->getNumOperands() != 3)
+    return false;
+
+  const auto *CITrue =
+      mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1));
+  const auto *CIFalse =
+      mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2));
+  if (!CITrue || !CIFalse)
+    return false;
+
+  ProbTrue = CITrue->getValue().getZExtValue();
+  ProbFalse = CIFalse->getValue().getZExtValue();
+
+  return true;
+}
+
+/// \brief Scale down both weights to fit into uint32_t.
+static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
+  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
+  uint32_t Scale = (NewMax / UINT32_MAX) + 1;
+  NewTrue = NewTrue / Scale;
+  NewFalse = NewFalse / Scale;
+}
+
+/// \brief Some targets prefer to split a conditional branch like:
+/// \code
+///   %0 = icmp ne i32 %a, 0
+///   %1 = icmp ne i32 %b, 0
+///   %or.cond = or i1 %0, %1
+///   br i1 %or.cond, label %TrueBB, label %FalseBB
+/// \endcode
+/// into multiple branch instructions like:
+/// \code
+///   bb1:
+///     %0 = icmp ne i32 %a, 0
+///     br i1 %0, label %TrueBB, label %bb2
+///   bb2:
+///     %1 = icmp ne i32 %b, 0
+///     br i1 %1, label %TrueBB, label %FalseBB
+/// \endcode
+/// This usually allows instruction selection to do even further optimizations
+/// and combine the compare with the branch instruction. Currently this is
+/// applied for targets which have "cheap" jump instructions.
+///
+/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
+///
+bool CodeGenPrepare::splitBranchCondition(Function &F) {
+  if (!TM || TM->Options.EnableFastISel != true ||
+      !TLI || TLI->isJumpExpensive())
+    return false;
+
+  bool MadeChange = false;
+  for (auto &BB : F) {
+    // Does this BB end with the following?
+    //   %cond1 = icmp|fcmp|binary instruction ...
+    //   %cond2 = icmp|fcmp|binary instruction ...
+    //   %cond.or = or|and i1 %cond1, cond2
+    //   br i1 %cond.or label %dest1, label %dest2"
+    BinaryOperator *LogicOp;
+    BasicBlock *TBB, *FBB;
+    if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
+      continue;
+
+    unsigned Opc;
+    Value *Cond1, *Cond2;
+    if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
+                             m_OneUse(m_Value(Cond2)))))
+      Opc = Instruction::And;
+    else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
+                                 m_OneUse(m_Value(Cond2)))))
+      Opc = Instruction::Or;
+    else
+      continue;
+
+    if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
+        !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp()))   )
+      continue;
+
+    DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
+
+    // Create a new BB.
+    auto *InsertBefore = std::next(Function::iterator(BB))
+        .getNodePtrUnchecked();
+    auto TmpBB = BasicBlock::Create(BB.getContext(),
+                                    BB.getName() + ".cond.split",
+                                    BB.getParent(), InsertBefore);
+
+    // Update original basic block by using the first condition directly by the
+    // branch instruction and removing the no longer needed and/or instruction.
+    auto *Br1 = cast<BranchInst>(BB.getTerminator());
+    Br1->setCondition(Cond1);
+    LogicOp->eraseFromParent();
+
+    // Depending on the conditon we have to either replace the true or the false
+    // successor of the original branch instruction.
+    if (Opc == Instruction::And)
+      Br1->setSuccessor(0, TmpBB);
+    else
+      Br1->setSuccessor(1, TmpBB);
+
+    // Fill in the new basic block.
+    auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
+    if (auto *I = dyn_cast<Instruction>(Cond2)) {
+      I->removeFromParent();
+      I->insertBefore(Br2);
+    }
+
+    // Update PHI nodes in both successors. The original BB needs to be
+    // replaced in one succesor's PHI nodes, because the branch comes now from
+    // the newly generated BB (NewBB). In the other successor we need to add one
+    // incoming edge to the PHI nodes, because both branch instructions target
+    // now the same successor. Depending on the original branch condition
+    // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
+    // we perfrom the correct update for the PHI nodes.
+    // This doesn't change the successor order of the just created branch
+    // instruction (or any other instruction).
+    if (Opc == Instruction::Or)
+      std::swap(TBB, FBB);
+
+    // Replace the old BB with the new BB.
+    for (auto &I : *TBB) {
+      PHINode *PN = dyn_cast<PHINode>(&I);
+      if (!PN)
+        break;
+      int i;
+      while ((i = PN->getBasicBlockIndex(&BB)) >= 0)
+        PN->setIncomingBlock(i, TmpBB);
+    }
+
+    // Add another incoming edge form the new BB.
+    for (auto &I : *FBB) {
+      PHINode *PN = dyn_cast<PHINode>(&I);
+      if (!PN)
+        break;
+      auto *Val = PN->getIncomingValueForBlock(&BB);
+      PN->addIncoming(Val, TmpBB);
+    }
+
+    // Update the branch weights (from SelectionDAGBuilder::
+    // FindMergedConditions).
+    if (Opc == Instruction::Or) {
+      // Codegen X | Y as:
+      // BB1:
+      //   jmp_if_X TBB
+      //   jmp TmpBB
+      // TmpBB:
+      //   jmp_if_Y TBB
+      //   jmp FBB
+      //
+
+      // We have flexibility in setting Prob for BB1 and Prob for NewBB.
+      // The requirement is that
+      //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
+      //     = TrueProb for orignal BB.
+      // Assuming the orignal weights are A and B, one choice is to set BB1's
+      // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
+      // assumes that
+      //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
+      // Another choice is to assume TrueProb for BB1 equals to TrueProb for
+      // TmpBB, but the math is more complicated.
+      uint64_t TrueWeight, FalseWeight;
+      if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) {
+        uint64_t NewTrueWeight = TrueWeight;
+        uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
+                         .createBranchWeights(TrueWeight, FalseWeight));
+
+        NewTrueWeight = TrueWeight;
+        NewFalseWeight = 2 * FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
+                         .createBranchWeights(TrueWeight, FalseWeight));
+      }
+    } else {
+      // Codegen X & Y as:
+      // BB1:
+      //   jmp_if_X TmpBB
+      //   jmp FBB
+      // TmpBB:
+      //   jmp_if_Y TBB
+      //   jmp FBB
+      //
+      //  This requires creation of TmpBB after CurBB.
+
+      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+      // The requirement is that
+      //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
+      //     = FalseProb for orignal BB.
+      // Assuming the orignal weights are A and B, one choice is to set BB1's
+      // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
+      // assumes that
+      //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
+      uint64_t TrueWeight, FalseWeight;
+      if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) {
+        uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
+        uint64_t NewFalseWeight = FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
+                         .createBranchWeights(TrueWeight, FalseWeight));
+
+        NewTrueWeight = 2 * TrueWeight;
+        NewFalseWeight = FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
+                         .createBranchWeights(TrueWeight, FalseWeight));
+      }
+    }
+
+    // Request DOM Tree update.
+    // Note: No point in getting fancy here, since the DT info is never
+    // available to CodeGenPrepare and the existing update code is broken
+    // anyways.
+    ModifiedDT = true;
+
+    MadeChange = true;
+
+    DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
+          TmpBB->dump());
+  }
+  return MadeChange;
+}
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index 48213c1..c17a35d 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -59,6 +59,10 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const {
   if (MI->isInlineAsm())
     return false;
 
+  // Don't delete frame allocation labels.
+  if (MI->getOpcode() == TargetOpcode::FRAME_ALLOC)
+    return false;
+
   // Don't delete instructions with side effects.
   bool SawStore = false;
   if (!MI->isSafeToMove(TII, nullptr, SawStore) && !MI->isPHI())
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp
index 75b74d9..7b47a48 100644
--- a/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/lib/CodeGen/DwarfEHPrepare.cpp
@@ -14,18 +14,12 @@
 
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "dwarfehprepare"
@@ -44,10 +38,13 @@ namespace {
 
   public:
     static char ID; // Pass identification, replacement for typeid.
+
+    // INITIALIZE_TM_PASS requires a default constructor, but it isn't used in
+    // practice.
+    DwarfEHPrepare() : FunctionPass(ID), TM(nullptr), RewindFunction(nullptr) {}
+
     DwarfEHPrepare(const TargetMachine *TM)
-        : FunctionPass(ID), TM(TM), RewindFunction(nullptr) {
-      initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
-    }
+        : FunctionPass(ID), TM(TM), RewindFunction(nullptr) {}
 
     bool runOnFunction(Function &Fn) override;
 
@@ -56,8 +53,6 @@ namespace {
       return false;
     }
 
-    void getAnalysisUsage(AnalysisUsage &AU) const override { }
-
     const char *getPassName() const override {
       return "Exception handling preparation";
     }
@@ -65,6 +60,8 @@ namespace {
 } // end anonymous namespace
 
 char DwarfEHPrepare::ID = 0;
+INITIALIZE_TM_PASS(DwarfEHPrepare, "dwarfehprepare", "Prepare DWARF exceptions",
+                   false, false)
 
 FunctionPass *llvm::createDwarfEHPass(const TargetMachine *TM) {
   return new DwarfEHPrepare(TM);
@@ -99,11 +96,11 @@ Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) {
   RI->eraseFromParent();
 
   if (EraseIVIs) {
-    if (SelIVI->getNumUses() == 0)
+    if (SelIVI->use_empty())
       SelIVI->eraseFromParent();
-    if (ExcIVI->getNumUses() == 0)
+    if (ExcIVI->use_empty())
       ExcIVI->eraseFromParent();
-    if (SelLoad && SelLoad->getNumUses() == 0)
+    if (SelLoad && SelLoad->use_empty())
       SelLoad->eraseFromParent();
   }
 
@@ -114,9 +111,8 @@ Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) {
 /// into calls to the appropriate _Unwind_Resume function.
 bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
   SmallVector<ResumeInst*, 16> Resumes;
-  for (Function::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
-    TerminatorInst *TI = I->getTerminator();
-    if (ResumeInst *RI = dyn_cast<ResumeInst>(TI))
+  for (BasicBlock &BB : Fn) {
+    if (auto *RI = dyn_cast<ResumeInst>(BB.getTerminator()))
       Resumes.push_back(RI);
   }
 
@@ -124,9 +120,9 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
     return false;
 
   // Find the rewind function if we didn't already.
-  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
+  const TargetLowering *TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
+  LLVMContext &Ctx = Fn.getContext();
   if (!RewindFunction) {
-    LLVMContext &Ctx = Resumes[0]->getContext();
     FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
                                           Type::getInt8PtrTy(Ctx), false);
     const char *RewindName = TLI->getLibcallName(RTLIB::UNWIND_RESUME);
@@ -134,7 +130,6 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
   }
 
   // Create the basic block where the _Unwind_Resume call will live.
-  LLVMContext &Ctx = Fn.getContext();
   unsigned ResumesSize = Resumes.size();
 
   if (ResumesSize == 1) {
@@ -159,9 +154,7 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
 
   // Extract the exception object from the ResumeInst and add it to the PHI node
   // that feeds the _Unwind_Resume call.
-  for (SmallVectorImpl<ResumeInst*>::iterator
-         I = Resumes.begin(), E = Resumes.end(); I != E; ++I) {
-    ResumeInst *RI = *I;
+  for (ResumeInst *RI : Resumes) {
     BasicBlock *Parent = RI->getParent();
     BranchInst::Create(UnwindBB, Parent);
 
@@ -181,6 +174,7 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
 }
 
 bool DwarfEHPrepare::runOnFunction(Function &Fn) {
+  assert(TM && "DWARF EH preparation requires a target machine");
   bool Changed = InsertUnwindResumeCalls(Fn);
   return Changed;
 }
diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
index 995606f..8f74271 100644
--- a/lib/CodeGen/EarlyIfConversion.cpp
+++ b/lib/CodeGen/EarlyIfConversion.cpp
@@ -777,15 +777,13 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
   DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
                << "********** Function: " << MF.getName() << '\n');
   // Only run if conversion if the target wants it.
-  if (!MF.getTarget()
-           .getSubtarget<TargetSubtargetInfo>()
-           .enableEarlyIfConversion())
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  if (!STI.enableEarlyIfConversion())
     return false;
 
-  TII = MF.getSubtarget().getInstrInfo();
-  TRI = MF.getSubtarget().getRegisterInfo();
-  SchedModel =
-    MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
+  TII = STI.getInstrInfo();
+  TRI = STI.getRegisterInfo();
+  SchedModel = STI.getSchedModel();
   MRI = &MF.getRegInfo();
   DomTree = &getAnalysis<MachineDominatorTree>();
   Loops = getAnalysisIfAvailable<MachineLoopInfo>();
diff --git a/lib/CodeGen/ErlangGC.cpp b/lib/CodeGen/ErlangGC.cpp
index 85b0893..024946d 100644
--- a/lib/CodeGen/ErlangGC.cpp
+++ b/lib/CodeGen/ErlangGC.cpp
@@ -27,56 +27,20 @@ using namespace llvm;
 
 namespace {
 
-  class ErlangGC : public GCStrategy {
-    MCSymbol *InsertLabel(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MI,
-                          DebugLoc DL) const;
-  public:
-    ErlangGC();
-    bool findCustomSafePoints(GCFunctionInfo &FI, MachineFunction &MF) override;
-  };
-
+class ErlangGC : public GCStrategy {
+public:
+  ErlangGC();
+};
 }
 
-static GCRegistry::Add<ErlangGC>
-X("erlang", "erlang-compatible garbage collector");
+static GCRegistry::Add<ErlangGC> X("erlang",
+                                   "erlang-compatible garbage collector");
 
-void llvm::linkErlangGC() { }
+void llvm::linkErlangGC() {}
 
 ErlangGC::ErlangGC() {
   InitRoots = false;
   NeededSafePoints = 1 << GC::PostCall;
   UsesMetadata = true;
   CustomRoots = false;
-  CustomSafePoints = true;
-}
-
-MCSymbol *ErlangGC::InsertLabel(MachineBasicBlock &MBB,
-                                MachineBasicBlock::iterator MI,
-                                DebugLoc DL) const {
-  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
-  MCSymbol *Label = MBB.getParent()->getContext().CreateTempSymbol();
-  BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
-  return Label;
-}
-
-bool ErlangGC::findCustomSafePoints(GCFunctionInfo &FI, MachineFunction &MF) {
-  for (MachineFunction::iterator BBI = MF.begin(), BBE = MF.end(); BBI != BBE;
-       ++BBI)
-    for (MachineBasicBlock::iterator MI = BBI->begin(), ME = BBI->end();
-         MI != ME; ++MI)
-
-      if (MI->getDesc().isCall()) {
-
-        // Do not treat tail call sites as safe points.
-        if (MI->getDesc().isTerminator())
-          continue;
-
-        /* Code copied from VisitCallPoint(...) */
-        MachineBasicBlock::iterator RAI = MI; ++RAI;
-        MCSymbol* Label = InsertLabel(*MI->getParent(), RAI, MI->getDebugLoc());
-        FI.addSafePoint(GC::PostCall, Label, MI->getDebugLoc());
-      }
-
-  return false;
 }
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 3680498..b3a22c8 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -22,6 +22,7 @@
 
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -74,6 +75,9 @@ struct DomainValue {
 
   // Is domain available?
   bool hasDomain(unsigned domain) const {
+    assert(domain <
+               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+           "undefined behavior");
     return AvailableDomains & (1u << domain);
   }
 
@@ -133,7 +137,7 @@ class ExeDepsFix : public MachineFunctionPass {
   MachineFunction *MF;
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
-  std::vector<int> AliasMap;
+  std::vector<SmallVector<int, 1>> AliasMap;
   const unsigned NumRegs;
   LiveReg *LiveRegs;
   typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap;
@@ -169,8 +173,8 @@ public:
   }
 
 private:
-  // Register mapping.
-  int regIndex(unsigned Reg);
+  iterator_range<SmallVectorImpl<int>::const_iterator>
+  regIndizes(unsigned Reg) const;
 
   // DomainValue allocation.
   DomainValue *alloc(int domain = -1);
@@ -201,11 +205,13 @@ private:
 
 char ExeDepsFix::ID = 0;
 
-/// Translate TRI register number to an index into our smaller tables of
-/// interesting registers. Return -1 for boring registers.
-int ExeDepsFix::regIndex(unsigned Reg) {
+/// Translate TRI register number to a list of indizes into our stmaller tables
+/// of interesting registers.
+iterator_range<SmallVectorImpl<int>::const_iterator>
+ExeDepsFix::regIndizes(unsigned Reg) const {
   assert(Reg < AliasMap.size() && "Invalid register");
-  return AliasMap[Reg];
+  const auto &Entry = AliasMap[Reg];
+  return make_range(Entry.begin(), Entry.end());
 }
 
 DomainValue *ExeDepsFix::alloc(int domain) {
@@ -338,9 +344,11 @@ bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {
   // All uses of B are referred to A.
   B->Next = retain(A);
 
-  for (unsigned rx = 0; rx != NumRegs; ++rx)
+  for (unsigned rx = 0; rx != NumRegs; ++rx) {
+    assert(LiveRegs && "no space allocated for live registers");
     if (LiveRegs[rx].Value == B)
       setLiveReg(rx, A);
+  }
   return true;
 }
 
@@ -370,13 +378,12 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
   if (MBB->pred_empty()) {
     for (MachineBasicBlock::livein_iterator i = MBB->livein_begin(),
          e = MBB->livein_end(); i != e; ++i) {
-      int rx = regIndex(*i);
-      if (rx < 0)
-        continue;
-      // Treat function live-ins as if they were defined just before the first
-      // instruction.  Usually, function arguments are set up immediately
-      // before the call.
-      LiveRegs[rx].Def = -1;
+      for (int rx : regIndizes(*i)) {
+        // Treat function live-ins as if they were defined just before the first
+        // instruction.  Usually, function arguments are set up immediately
+        // before the call.
+        LiveRegs[rx].Def = -1;
+      }
     }
     DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n");
     return;
@@ -467,26 +474,26 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) {
 /// or undef use.
 bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
                                        unsigned Pref) {
-  int rx = regIndex(MI->getOperand(OpIdx).getReg());
-  if (rx < 0)
-    return false;
-
-  unsigned Clearance = CurInstr - LiveRegs[rx].Def;
-  DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
+  unsigned reg = MI->getOperand(OpIdx).getReg();
+  for (int rx : regIndizes(reg)) {
+    unsigned Clearance = CurInstr - LiveRegs[rx].Def;
+    DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
 
-  if (Pref > Clearance) {
-    DEBUG(dbgs() << ": Break dependency.\n");
-    return true;
-  }
-  // The current clearance seems OK, but we may be ignoring a def from a
-  // back-edge.
-  if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
-    DEBUG(dbgs() << ": OK .\n");
+    if (Pref > Clearance) {
+      DEBUG(dbgs() << ": Break dependency.\n");
+      continue;
+    }
+    // The current clearance seems OK, but we may be ignoring a def from a
+    // back-edge.
+    if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
+      DEBUG(dbgs() << ": OK .\n");
+      return false;
+    }
+    // A def from an unprocessed back-edge may make us break this dependency.
+    DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
     return false;
   }
-  // A def from an unprocessed back-edge may make us break this dependency.
-  DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
-  return false;
+  return true;
 }
 
 // Update def-ages for registers defined by MI.
@@ -514,26 +521,24 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
       break;
     if (MO.isUse())
       continue;
-    int rx = regIndex(MO.getReg());
-    if (rx < 0)
-      continue;
-
-    // This instruction explicitly defines rx.
-    DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
-                 << '\t' << *MI);
-
-    // Check clearance before partial register updates.
-    // Call breakDependence before setting LiveRegs[rx].Def.
-    unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
-    if (Pref && shouldBreakDependence(MI, i, Pref))
-      TII->breakPartialRegDependency(MI, i, TRI);
-
-    // How many instructions since rx was last written?
-    LiveRegs[rx].Def = CurInstr;
-
-    // Kill off domains redefined by generic instructions.
-    if (Kill)
-      kill(rx);
+    for (int rx : regIndizes(MO.getReg())) {
+      // This instruction explicitly defines rx.
+      DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
+                   << '\t' << *MI);
+
+      // Check clearance before partial register updates.
+      // Call breakDependence before setting LiveRegs[rx].Def.
+      unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
+      if (Pref && shouldBreakDependence(MI, i, Pref))
+        TII->breakPartialRegDependency(MI, i, TRI);
+
+      // How many instructions since rx was last written?
+      LiveRegs[rx].Def = CurInstr;
+
+      // Kill off domains redefined by generic instructions.
+      if (Kill)
+        kill(rx);
+    }
   }
   ++CurInstr;
 }
@@ -582,19 +587,19 @@ void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
                 e = mi->getDesc().getNumOperands(); i != e; ++i) {
     MachineOperand &mo = mi->getOperand(i);
     if (!mo.isReg()) continue;
-    int rx = regIndex(mo.getReg());
-    if (rx < 0) continue;
-    force(rx, domain);
+    for (int rx : regIndizes(mo.getReg())) {
+      force(rx, domain);
+    }
   }
 
   // Kill all defs and force them.
   for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
     MachineOperand &mo = mi->getOperand(i);
     if (!mo.isReg()) continue;
-    int rx = regIndex(mo.getReg());
-    if (rx < 0) continue;
-    kill(rx);
-    force(rx, domain);
+    for (int rx : regIndizes(mo.getReg())) {
+      kill(rx);
+      force(rx, domain);
+    }
   }
 }
 
@@ -611,9 +616,10 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
                   e = mi->getDesc().getNumOperands(); i != e; ++i) {
       MachineOperand &mo = mi->getOperand(i);
       if (!mo.isReg()) continue;
-      int rx = regIndex(mo.getReg());
-      if (rx < 0) continue;
-      if (DomainValue *dv = LiveRegs[rx].Value) {
+      for (int rx : regIndizes(mo.getReg())) {
+        DomainValue *dv = LiveRegs[rx].Value;
+        if (dv == nullptr)
+          continue;
         // Bitmask of domains that dv and available have in common.
         unsigned common = dv->getCommonDomains(available);
         // Is it possible to use this collapsed register for free?
@@ -645,6 +651,7 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
   SmallVector<LiveReg, 4> Regs;
   for (SmallVectorImpl<int>::iterator i=used.begin(), e=used.end(); i!=e; ++i) {
     int rx = *i;
+    assert(LiveRegs && "no space allocated for live registers");
     const LiveReg &LR = LiveRegs[rx];
     // This useless DomainValue could have been missed above.
     if (!LR.Value->getCommonDomains(available)) {
@@ -684,9 +691,11 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
       continue;
 
     // If latest didn't merge, it is useless now. Kill all registers using it.
-    for (SmallVectorImpl<int>::iterator i=used.begin(), e=used.end(); i!=e; ++i)
-      if (LiveRegs[*i].Value == Latest)
-        kill(*i);
+    for (int i : used) {
+      assert(LiveRegs && "no space allocated for live registers");
+      if (LiveRegs[i].Value == Latest)
+        kill(i);
+    }
   }
 
   // dv is the DomainValue we are going to use for this instruction.
@@ -703,11 +712,11 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
                                   ii != ee; ++ii) {
     MachineOperand &mo = *ii;
     if (!mo.isReg()) continue;
-    int rx = regIndex(mo.getReg());
-    if (rx < 0) continue;
-    if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) {
-      kill(rx);
-      setLiveReg(rx, dv);
+    for (int rx : regIndizes(mo.getReg())) {
+      if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) {
+        kill(rx);
+        setLiveReg(rx, dv);
+      }
     }
   }
 }
@@ -735,13 +744,13 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
 
   // Initialize the AliasMap on the first use.
   if (AliasMap.empty()) {
-    // Given a PhysReg, AliasMap[PhysReg] is either the relevant index into RC,
-    // or -1.
-    AliasMap.resize(TRI->getNumRegs(), -1);
+    // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and
+    // therefore the LiveRegs array.
+    AliasMap.resize(TRI->getNumRegs());
     for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
       for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true);
            AI.isValid(); ++AI)
-        AliasMap[*AI] = i;
+        AliasMap[*AI].push_back(i);
   }
 
   MachineBasicBlock *Entry = MF->begin();
diff --git a/lib/CodeGen/ForwardControlFlowIntegrity.cpp b/lib/CodeGen/ForwardControlFlowIntegrity.cpp
index 5e7e853..63c3699 100644
--- a/lib/CodeGen/ForwardControlFlowIntegrity.cpp
+++ b/lib/CodeGen/ForwardControlFlowIntegrity.cpp
@@ -25,9 +25,9 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp
index ed40982..a2c5fce 100644
--- a/lib/CodeGen/GCMetadata.cpp
+++ b/lib/CodeGen/GCMetadata.cpp
@@ -24,22 +24,20 @@
 using namespace llvm;
 
 namespace {
-  
-  class Printer : public FunctionPass {
-    static char ID;
-    raw_ostream &OS;
-    
-  public:
-    explicit Printer(raw_ostream &OS) : FunctionPass(ID), OS(OS) {}
 
+class Printer : public FunctionPass {
+  static char ID;
+  raw_ostream &OS;
 
-    const char *getPassName() const override;
-    void getAnalysisUsage(AnalysisUsage &AU) const override;
+public:
+  explicit Printer(raw_ostream &OS) : FunctionPass(ID), OS(OS) {}
 
-    bool runOnFunction(Function &F) override;
-    bool doFinalization(Module &M) override;
-  };
+  const char *getPassName() const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
 
+  bool runOnFunction(Function &F) override;
+  bool doFinalization(Module &M) override;
+};
 }
 
 INITIALIZE_PASS(GCModuleInfo, "collector-metadata",
@@ -48,7 +46,7 @@ INITIALIZE_PASS(GCModuleInfo, "collector-metadata",
 // -----------------------------------------------------------------------------
 
 GCFunctionInfo::GCFunctionInfo(const Function &F, GCStrategy &S)
-  : F(F), S(S), FrameSize(~0LL) {}
+    : F(F), S(S), FrameSize(~0LL) {}
 
 GCFunctionInfo::~GCFunctionInfo() {}
 
@@ -56,51 +54,29 @@ GCFunctionInfo::~GCFunctionInfo() {}
 
 char GCModuleInfo::ID = 0;
 
-GCModuleInfo::GCModuleInfo()
-    : ImmutablePass(ID) {
+GCModuleInfo::GCModuleInfo() : ImmutablePass(ID) {
   initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
 }
 
-GCStrategy *GCModuleInfo::getOrCreateStrategy(const Module *M,
-                                              const std::string &Name) {
-  strategy_map_type::iterator NMI = StrategyMap.find(Name);
-  if (NMI != StrategyMap.end())
-    return NMI->getValue();
-  
-  for (GCRegistry::iterator I = GCRegistry::begin(),
-                            E = GCRegistry::end(); I != E; ++I) {
-    if (Name == I->getName()) {
-      std::unique_ptr<GCStrategy> S = I->instantiate();
-      S->M = M;
-      S->Name = Name;
-      StrategyMap[Name] = S.get();
-      StrategyList.push_back(std::move(S));
-      return StrategyList.back().get();
-    }
-  }
- 
-  dbgs() << "unsupported GC: " << Name << "\n";
-  llvm_unreachable(nullptr);
-}
-
 GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) {
   assert(!F.isDeclaration() && "Can only get GCFunctionInfo for a definition!");
   assert(F.hasGC());
-  
+
   finfo_map_type::iterator I = FInfoMap.find(&F);
   if (I != FInfoMap.end())
     return *I->second;
-  
-  GCStrategy *S = getOrCreateStrategy(F.getParent(), F.getGC());
-  GCFunctionInfo *GFI = S->insertFunctionInfo(F);
+
+  GCStrategy *S = getGCStrategy(F.getGC());
+  Functions.push_back(make_unique<GCFunctionInfo>(F, *S));
+  GCFunctionInfo *GFI = Functions.back().get();
   FInfoMap[&F] = GFI;
   return *GFI;
 }
 
 void GCModuleInfo::clear() {
+  Functions.clear();
   FInfoMap.clear();
-  StrategyMap.clear();
-  StrategyList.clear();
+  GCStrategyList.clear();
 }
 
 // -----------------------------------------------------------------------------
@@ -111,7 +87,6 @@ FunctionPass *llvm::createGCInfoPrinter(raw_ostream &OS) {
   return new Printer(OS);
 }
 
-
 const char *Printer::getPassName() const {
   return "Print Garbage Collector Information";
 }
@@ -124,42 +99,49 @@ void Printer::getAnalysisUsage(AnalysisUsage &AU) const {
 
 static const char *DescKind(GC::PointKind Kind) {
   switch (Kind) {
-    case GC::Loop:     return "loop";
-    case GC::Return:   return "return";
-    case GC::PreCall:  return "pre-call";
-    case GC::PostCall: return "post-call";
+  case GC::Loop:
+    return "loop";
+  case GC::Return:
+    return "return";
+  case GC::PreCall:
+    return "pre-call";
+  case GC::PostCall:
+    return "post-call";
   }
   llvm_unreachable("Invalid point kind");
 }
 
 bool Printer::runOnFunction(Function &F) {
-  if (F.hasGC()) return false;
-  
+  if (F.hasGC())
+    return false;
+
   GCFunctionInfo *FD = &getAnalysis<GCModuleInfo>().getFunctionInfo(F);
-  
+
   OS << "GC roots for " << FD->getFunction().getName() << ":\n";
   for (GCFunctionInfo::roots_iterator RI = FD->roots_begin(),
-                                      RE = FD->roots_end(); RI != RE; ++RI)
+                                      RE = FD->roots_end();
+       RI != RE; ++RI)
     OS << "\t" << RI->Num << "\t" << RI->StackOffset << "[sp]\n";
-  
+
   OS << "GC safe points for " << FD->getFunction().getName() << ":\n";
-  for (GCFunctionInfo::iterator PI = FD->begin(),
-                                PE = FD->end(); PI != PE; ++PI) {
-    
-    OS << "\t" << PI->Label->getName() << ": "
-       << DescKind(PI->Kind) << ", live = {";
-    
+  for (GCFunctionInfo::iterator PI = FD->begin(), PE = FD->end(); PI != PE;
+       ++PI) {
+
+    OS << "\t" << PI->Label->getName() << ": " << DescKind(PI->Kind)
+       << ", live = {";
+
     for (GCFunctionInfo::live_iterator RI = FD->live_begin(PI),
-                                       RE = FD->live_end(PI);;) {
+                                       RE = FD->live_end(PI);
+         ;) {
       OS << " " << RI->Num;
       if (++RI == RE)
         break;
       OS << ",";
     }
-    
+
     OS << " }\n";
   }
-  
+
   return false;
 }
 
@@ -169,3 +151,23 @@ bool Printer::doFinalization(Module &M) {
   GMI->clear();
   return false;
 }
+
+
+GCStrategy *GCModuleInfo::getGCStrategy(const StringRef Name) {
+  // TODO: Arguably, just doing a linear search would be faster for small N
+  auto NMI = GCStrategyMap.find(Name);
+  if (NMI != GCStrategyMap.end())
+    return NMI->getValue();
+  
+  for (auto& Entry : GCRegistry::entries()) {
+    if (Name == Entry.getName()) {
+      std::unique_ptr<GCStrategy> S = Entry.instantiate();
+      S->Name = Name;
+      GCStrategyMap[Name] = S.get();
+      GCStrategyList.push_back(std::move(S));
+      return GCStrategyList.back().get();
+    }
+  }
+
+  report_fatal_error(std::string("unsupported GC: ") + Name);
+}
diff --git a/lib/CodeGen/GCMetadataPrinter.cpp b/lib/CodeGen/GCMetadataPrinter.cpp
index f80e9ce..bb8cfa1 100644
--- a/lib/CodeGen/GCMetadataPrinter.cpp
+++ b/lib/CodeGen/GCMetadataPrinter.cpp
@@ -14,14 +14,6 @@
 #include "llvm/CodeGen/GCMetadataPrinter.h"
 using namespace llvm;
 
-GCMetadataPrinter::GCMetadataPrinter() { }
+GCMetadataPrinter::GCMetadataPrinter() {}
 
-GCMetadataPrinter::~GCMetadataPrinter() { }
-
-void GCMetadataPrinter::beginAssembly(AsmPrinter &AP) {
-  // Default is no action.
-}
-
-void GCMetadataPrinter::finishAssembly(AsmPrinter &AP) {
-  // Default is no action.
-}
+GCMetadataPrinter::~GCMetadataPrinter() {}
diff --git a/lib/CodeGen/GCRootLowering.cpp b/lib/CodeGen/GCRootLowering.cpp
new file mode 100644
index 0000000..9d38e4c
--- /dev/null
+++ b/lib/CodeGen/GCRootLowering.cpp
@@ -0,0 +1,351 @@
+//===-- GCRootLowering.cpp - Garbage collection infrastructure ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering for the gc.root mechanism.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// LowerIntrinsics - This pass rewrites calls to the llvm.gcread or
+/// llvm.gcwrite intrinsics, replacing them with simple loads and stores as
+/// directed by the GCStrategy. It also performs automatic root initialization
+/// and custom intrinsic lowering.
+class LowerIntrinsics : public FunctionPass {
+  bool PerformDefaultLowering(Function &F, GCStrategy &Coll);
+
+public:
+  static char ID;
+
+  LowerIntrinsics();
+  const char *getPassName() const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+};
+
+/// GCMachineCodeAnalysis - This is a target-independent pass over the machine
+/// function representation to identify safe points for the garbage collector
+/// in the machine code. It inserts labels at safe points and populates a
+/// GCMetadata record for each function.
+class GCMachineCodeAnalysis : public MachineFunctionPass {
+  GCFunctionInfo *FI;
+  MachineModuleInfo *MMI;
+  const TargetInstrInfo *TII;
+
+  void FindSafePoints(MachineFunction &MF);
+  void VisitCallPoint(MachineBasicBlock::iterator MI);
+  MCSymbol *InsertLabel(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                        DebugLoc DL) const;
+
+  void FindStackOffsets(MachineFunction &MF);
+
+public:
+  static char ID;
+
+  GCMachineCodeAnalysis();
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+}
+
+// -----------------------------------------------------------------------------
+
+INITIALIZE_PASS_BEGIN(LowerIntrinsics, "gc-lowering", "GC Lowering", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
+INITIALIZE_PASS_END(LowerIntrinsics, "gc-lowering", "GC Lowering", false, false)
+
+FunctionPass *llvm::createGCLoweringPass() { return new LowerIntrinsics(); }
+
+char LowerIntrinsics::ID = 0;
+
+LowerIntrinsics::LowerIntrinsics() : FunctionPass(ID) {
+  initializeLowerIntrinsicsPass(*PassRegistry::getPassRegistry());
+}
+
+const char *LowerIntrinsics::getPassName() const {
+  return "Lower Garbage Collection Instructions";
+}
+
+void LowerIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const {
+  FunctionPass::getAnalysisUsage(AU);
+  AU.addRequired<GCModuleInfo>();
+  AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+static bool NeedsDefaultLoweringPass(const GCStrategy &C) {
+  // Default lowering is necessary only if read or write barriers have a default
+  // action. The default for roots is no action.
+  return !C.customWriteBarrier() || !C.customReadBarrier() ||
+         C.initializeRoots();
+}
+
+/// doInitialization - If this module uses the GC intrinsics, find them now.
+bool LowerIntrinsics::doInitialization(Module &M) {
+  GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+  assert(MI && "LowerIntrinsics didn't require GCModuleInfo!?");
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (!I->isDeclaration() && I->hasGC())
+      MI->getFunctionInfo(*I); // Instantiate the GC strategy.
+
+  return false;
+}
+
+/// CouldBecomeSafePoint - Predicate to conservatively determine whether the
+/// instruction could introduce a safe point.
+static bool CouldBecomeSafePoint(Instruction *I) {
+  // The natural definition of instructions which could introduce safe points
+  // are:
+  //
+  //   - call, invoke (AfterCall, BeforeCall)
+  //   - phis (Loops)
+  //   - invoke, ret, unwind (Exit)
+  //
+  // However, instructions as seemingly inoccuous as arithmetic can become
+  // libcalls upon lowering (e.g., div i64 on a 32-bit platform), so instead
+  // it is necessary to take a conservative approach.
+
+  if (isa<AllocaInst>(I) || isa<GetElementPtrInst>(I) || isa<StoreInst>(I) ||
+      isa<LoadInst>(I))
+    return false;
+
+  // llvm.gcroot is safe because it doesn't do anything at runtime.
+  if (CallInst *CI = dyn_cast<CallInst>(I))
+    if (Function *F = CI->getCalledFunction())
+      if (unsigned IID = F->getIntrinsicID())
+        if (IID == Intrinsic::gcroot)
+          return false;
+
+  return true;
+}
+
+static bool InsertRootInitializers(Function &F, AllocaInst **Roots,
+                                   unsigned Count) {
+  // Scroll past alloca instructions.
+  BasicBlock::iterator IP = F.getEntryBlock().begin();
+  while (isa<AllocaInst>(IP))
+    ++IP;
+
+  // Search for initializers in the initial BB.
+  SmallPtrSet<AllocaInst *, 16> InitedRoots;
+  for (; !CouldBecomeSafePoint(IP); ++IP)
+    if (StoreInst *SI = dyn_cast<StoreInst>(IP))
+      if (AllocaInst *AI =
+              dyn_cast<AllocaInst>(SI->getOperand(1)->stripPointerCasts()))
+        InitedRoots.insert(AI);
+
+  // Add root initializers.
+  bool MadeChange = false;
+
+  for (AllocaInst **I = Roots, **E = Roots + Count; I != E; ++I)
+    if (!InitedRoots.count(*I)) {
+      StoreInst *SI = new StoreInst(
+          ConstantPointerNull::get(cast<PointerType>(
+              cast<PointerType>((*I)->getType())->getElementType())),
+          *I);
+      SI->insertAfter(*I);
+      MadeChange = true;
+    }
+
+  return MadeChange;
+}
+
+/// runOnFunction - Replace gcread/gcwrite intrinsics with loads and stores.
+/// Leave gcroot intrinsics; the code generator needs to see those.
+bool LowerIntrinsics::runOnFunction(Function &F) {
+  // Quick exit for functions that do not use GC.
+  if (!F.hasGC())
+    return false;
+
+  GCFunctionInfo &FI = getAnalysis<GCModuleInfo>().getFunctionInfo(F);
+  GCStrategy &S = FI.getStrategy();
+
+  bool MadeChange = false;
+
+  if (NeedsDefaultLoweringPass(S))
+    MadeChange |= PerformDefaultLowering(F, S);
+
+  return MadeChange;
+}
+
+bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) {
+  bool LowerWr = !S.customWriteBarrier();
+  bool LowerRd = !S.customReadBarrier();
+  bool InitRoots = S.initializeRoots();
+
+  SmallVector<AllocaInst *, 32> Roots;
+
+  bool MadeChange = false;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+      if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) {
+        Function *F = CI->getCalledFunction();
+        switch (F->getIntrinsicID()) {
+        case Intrinsic::gcwrite:
+          if (LowerWr) {
+            // Replace a write barrier with a simple store.
+            Value *St =
+                new StoreInst(CI->getArgOperand(0), CI->getArgOperand(2), CI);
+            CI->replaceAllUsesWith(St);
+            CI->eraseFromParent();
+          }
+          break;
+        case Intrinsic::gcread:
+          if (LowerRd) {
+            // Replace a read barrier with a simple load.
+            Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI);
+            Ld->takeName(CI);
+            CI->replaceAllUsesWith(Ld);
+            CI->eraseFromParent();
+          }
+          break;
+        case Intrinsic::gcroot:
+          if (InitRoots) {
+            // Initialize the GC root, but do not delete the intrinsic. The
+            // backend needs the intrinsic to flag the stack slot.
+            Roots.push_back(
+                cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
+          }
+          break;
+        default:
+          continue;
+        }
+
+        MadeChange = true;
+      }
+    }
+  }
+
+  if (Roots.size())
+    MadeChange |= InsertRootInitializers(F, Roots.begin(), Roots.size());
+
+  return MadeChange;
+}
+
+// -----------------------------------------------------------------------------
+
+char GCMachineCodeAnalysis::ID = 0;
+char &llvm::GCMachineCodeAnalysisID = GCMachineCodeAnalysis::ID;
+
+INITIALIZE_PASS(GCMachineCodeAnalysis, "gc-analysis",
+                "Analyze Machine Code For Garbage Collection", false, false)
+
+GCMachineCodeAnalysis::GCMachineCodeAnalysis() : MachineFunctionPass(ID) {}
+
+void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  MachineFunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+  AU.addRequired<MachineModuleInfo>();
+  AU.addRequired<GCModuleInfo>();
+}
+
+MCSymbol *GCMachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator MI,
+                                             DebugLoc DL) const {
+  MCSymbol *Label = MBB.getParent()->getContext().CreateTempSymbol();
+  BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
+  return Label;
+}
+
+void GCMachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) {
+  // Find the return address (next instruction), too, so as to bracket the call
+  // instruction.
+  MachineBasicBlock::iterator RAI = CI;
+  ++RAI;
+
+  if (FI->getStrategy().needsSafePoint(GC::PreCall)) {
+    MCSymbol *Label = InsertLabel(*CI->getParent(), CI, CI->getDebugLoc());
+    FI->addSafePoint(GC::PreCall, Label, CI->getDebugLoc());
+  }
+
+  if (FI->getStrategy().needsSafePoint(GC::PostCall)) {
+    MCSymbol *Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc());
+    FI->addSafePoint(GC::PostCall, Label, CI->getDebugLoc());
+  }
+}
+
+void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) {
+  for (MachineFunction::iterator BBI = MF.begin(), BBE = MF.end(); BBI != BBE;
+       ++BBI)
+    for (MachineBasicBlock::iterator MI = BBI->begin(), ME = BBI->end();
+         MI != ME; ++MI)
+      if (MI->isCall()) {
+        // Do not treat tail or sibling call sites as safe points.  This is
+        // legal since any arguments passed to the callee which live in the
+        // remnants of the callers frame will be owned and updated by the
+        // callee if required.
+        if (MI->isTerminator())
+          continue;
+        VisitCallPoint(MI);
+      }
+}
+
+void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) {
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  assert(TFI && "TargetRegisterInfo not available!");
+
+  for (GCFunctionInfo::roots_iterator RI = FI->roots_begin();
+       RI != FI->roots_end();) {
+    // If the root references a dead object, no need to keep it.
+    if (MF.getFrameInfo()->isDeadObjectIndex(RI->Num)) {
+      RI = FI->removeStackRoot(RI);
+    } else {
+      RI->StackOffset = TFI->getFrameIndexOffset(MF, RI->Num);
+      ++RI;
+    }
+  }
+}
+
+bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) {
+  // Quick exit for functions that do not use GC.
+  if (!MF.getFunction()->hasGC())
+    return false;
+
+  FI = &getAnalysis<GCModuleInfo>().getFunctionInfo(*MF.getFunction());
+  if (!FI->getStrategy().needsSafePoints())
+    return false;
+
+  MMI = &getAnalysis<MachineModuleInfo>();
+  TII = MF.getSubtarget().getInstrInfo();
+
+  // Find the size of the stack frame.
+  FI->setFrameSize(MF.getFrameInfo()->getStackSize());
+
+  // Find all safe points.
+  FindSafePoints(MF);
+
+  // Find the stack offsets for all roots.
+  FindStackOffsets(MF);
+
+  return false;
+}
diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp
index b346657..554d326 100644
--- a/lib/CodeGen/GCStrategy.cpp
+++ b/lib/CodeGen/GCStrategy.cpp
@@ -1,4 +1,4 @@
-//===-- GCStrategy.cpp - Garbage collection infrastructure -----------------===//
+//===-- GCStrategy.cpp - Garbage Collector Description --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,417 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements target- and collector-independent garbage collection
-// infrastructure.
-//
-// GCMachineCodeAnalysis identifies the GC safe points in the machine code.
-// Roots are identified in SelectionDAGISel.
+// This file implements the policy object GCStrategy which describes the
+// behavior of a given garbage collector.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/GCStrategy.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
-namespace {
-
-  /// LowerIntrinsics - This pass rewrites calls to the llvm.gcread or
-  /// llvm.gcwrite intrinsics, replacing them with simple loads and stores as
-  /// directed by the GCStrategy. It also performs automatic root initialization
-  /// and custom intrinsic lowering.
-  class LowerIntrinsics : public FunctionPass {
-    static bool NeedsDefaultLoweringPass(const GCStrategy &C);
-    static bool NeedsCustomLoweringPass(const GCStrategy &C);
-    static bool CouldBecomeSafePoint(Instruction *I);
-    bool PerformDefaultLowering(Function &F, GCStrategy &Coll);
-    static bool InsertRootInitializers(Function &F,
-                                       AllocaInst **Roots, unsigned Count);
-
-  public:
-    static char ID;
-
-    LowerIntrinsics();
-    const char *getPassName() const override;
-    void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-    bool doInitialization(Module &M) override;
-    bool runOnFunction(Function &F) override;
-  };
-
-
-  /// GCMachineCodeAnalysis - This is a target-independent pass over the machine
-  /// function representation to identify safe points for the garbage collector
-  /// in the machine code. It inserts labels at safe points and populates a
-  /// GCMetadata record for each function.
-  class GCMachineCodeAnalysis : public MachineFunctionPass {
-    const TargetMachine *TM;
-    GCFunctionInfo *FI;
-    MachineModuleInfo *MMI;
-    const TargetInstrInfo *TII;
-
-    void FindSafePoints(MachineFunction &MF);
-    void VisitCallPoint(MachineBasicBlock::iterator MI);
-    MCSymbol *InsertLabel(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MI,
-                          DebugLoc DL) const;
-
-    void FindStackOffsets(MachineFunction &MF);
-
-  public:
-    static char ID;
-
-    GCMachineCodeAnalysis();
-    void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-    bool runOnMachineFunction(MachineFunction &MF) override;
-  };
-
-}
-
-// -----------------------------------------------------------------------------
-
-GCStrategy::GCStrategy() :
-  NeededSafePoints(0),
-  CustomReadBarriers(false),
-  CustomWriteBarriers(false),
-  CustomRoots(false),
-  CustomSafePoints(false),
-  InitRoots(true),
-  UsesMetadata(false)
-{}
-
-bool GCStrategy::initializeCustomLowering(Module &M) { return false; }
-
-bool GCStrategy::performCustomLowering(Function &F) {
-  dbgs() << "gc " << getName() << " must override performCustomLowering.\n";
-  llvm_unreachable("must override performCustomLowering");
-}
-
-
-bool GCStrategy::findCustomSafePoints(GCFunctionInfo& FI, MachineFunction &F) {
-  dbgs() << "gc " << getName() << " must override findCustomSafePoints.\n";
-  llvm_unreachable(nullptr);
-}
-
-
-GCFunctionInfo *GCStrategy::insertFunctionInfo(const Function &F) {
-  Functions.push_back(make_unique<GCFunctionInfo>(F, *this));
-  return Functions.back().get();
-}
-
-// -----------------------------------------------------------------------------
-
-INITIALIZE_PASS_BEGIN(LowerIntrinsics, "gc-lowering", "GC Lowering",
-                      false, false)
-INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
-INITIALIZE_PASS_END(LowerIntrinsics, "gc-lowering", "GC Lowering", false, false)
-
-FunctionPass *llvm::createGCLoweringPass() {
-  return new LowerIntrinsics();
-}
-
-char LowerIntrinsics::ID = 0;
-
-LowerIntrinsics::LowerIntrinsics()
-  : FunctionPass(ID) {
-    initializeLowerIntrinsicsPass(*PassRegistry::getPassRegistry());
-  }
-
-const char *LowerIntrinsics::getPassName() const {
-  return "Lower Garbage Collection Instructions";
-}
-
-void LowerIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const {
-  FunctionPass::getAnalysisUsage(AU);
-  AU.addRequired<GCModuleInfo>();
-  AU.addPreserved<DominatorTreeWrapperPass>();
-}
-
-/// doInitialization - If this module uses the GC intrinsics, find them now.
-bool LowerIntrinsics::doInitialization(Module &M) {
-  // FIXME: This is rather antisocial in the context of a JIT since it performs
-  //        work against the entire module. But this cannot be done at
-  //        runFunction time (initializeCustomLowering likely needs to change
-  //        the module).
-  GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
-  assert(MI && "LowerIntrinsics didn't require GCModuleInfo!?");
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
-    if (!I->isDeclaration() && I->hasGC())
-      MI->getFunctionInfo(*I); // Instantiate the GC strategy.
-
-  bool MadeChange = false;
-  for (GCModuleInfo::iterator I = MI->begin(), E = MI->end(); I != E; ++I)
-    if (NeedsCustomLoweringPass(**I))
-      if ((*I)->initializeCustomLowering(M))
-        MadeChange = true;
-
-  return MadeChange;
-}
-
-bool LowerIntrinsics::InsertRootInitializers(Function &F, AllocaInst **Roots,
-                                                          unsigned Count) {
-  // Scroll past alloca instructions.
-  BasicBlock::iterator IP = F.getEntryBlock().begin();
-  while (isa<AllocaInst>(IP)) ++IP;
-
-  // Search for initializers in the initial BB.
-  SmallPtrSet<AllocaInst*,16> InitedRoots;
-  for (; !CouldBecomeSafePoint(IP); ++IP)
-    if (StoreInst *SI = dyn_cast<StoreInst>(IP))
-      if (AllocaInst *AI =
-          dyn_cast<AllocaInst>(SI->getOperand(1)->stripPointerCasts()))
-        InitedRoots.insert(AI);
-
-  // Add root initializers.
-  bool MadeChange = false;
-
-  for (AllocaInst **I = Roots, **E = Roots + Count; I != E; ++I)
-    if (!InitedRoots.count(*I)) {
-      StoreInst* SI = new StoreInst(ConstantPointerNull::get(cast<PointerType>(
-                        cast<PointerType>((*I)->getType())->getElementType())),
-                        *I);
-      SI->insertAfter(*I);
-      MadeChange = true;
-    }
-
-  return MadeChange;
-}
-
-bool LowerIntrinsics::NeedsDefaultLoweringPass(const GCStrategy &C) {
-  // Default lowering is necessary only if read or write barriers have a default
-  // action. The default for roots is no action.
-  return !C.customWriteBarrier()
-      || !C.customReadBarrier()
-      || C.initializeRoots();
-}
-
-bool LowerIntrinsics::NeedsCustomLoweringPass(const GCStrategy &C) {
-  // Custom lowering is only necessary if enabled for some action.
-  return C.customWriteBarrier()
-      || C.customReadBarrier()
-      || C.customRoots();
-}
-
-/// CouldBecomeSafePoint - Predicate to conservatively determine whether the
-/// instruction could introduce a safe point.
-bool LowerIntrinsics::CouldBecomeSafePoint(Instruction *I) {
-  // The natural definition of instructions which could introduce safe points
-  // are:
-  //
-  //   - call, invoke (AfterCall, BeforeCall)
-  //   - phis (Loops)
-  //   - invoke, ret, unwind (Exit)
-  //
-  // However, instructions as seemingly inoccuous as arithmetic can become
-  // libcalls upon lowering (e.g., div i64 on a 32-bit platform), so instead
-  // it is necessary to take a conservative approach.
-
-  if (isa<AllocaInst>(I) || isa<GetElementPtrInst>(I) ||
-      isa<StoreInst>(I) || isa<LoadInst>(I))
-    return false;
-
-  // llvm.gcroot is safe because it doesn't do anything at runtime.
-  if (CallInst *CI = dyn_cast<CallInst>(I))
-    if (Function *F = CI->getCalledFunction())
-      if (unsigned IID = F->getIntrinsicID())
-        if (IID == Intrinsic::gcroot)
-          return false;
-
-  return true;
-}
-
-/// runOnFunction - Replace gcread/gcwrite intrinsics with loads and stores.
-/// Leave gcroot intrinsics; the code generator needs to see those.
-bool LowerIntrinsics::runOnFunction(Function &F) {
-  // Quick exit for functions that do not use GC.
-  if (!F.hasGC())
-    return false;
-
-  GCFunctionInfo &FI = getAnalysis<GCModuleInfo>().getFunctionInfo(F);
-  GCStrategy &S = FI.getStrategy();
-
-  bool MadeChange = false;
-
-  if (NeedsDefaultLoweringPass(S))
-    MadeChange |= PerformDefaultLowering(F, S);
-
-  bool UseCustomLoweringPass = NeedsCustomLoweringPass(S);
-  if (UseCustomLoweringPass)
-    MadeChange |= S.performCustomLowering(F);
-
-  // Custom lowering may modify the CFG, so dominators must be recomputed.
-  if (UseCustomLoweringPass) {
-    if (DominatorTreeWrapperPass *DTWP =
-            getAnalysisIfAvailable<DominatorTreeWrapperPass>())
-      DTWP->getDomTree().recalculate(F);
-  }
-
-  return MadeChange;
-}
-
-bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) {
-  bool LowerWr = !S.customWriteBarrier();
-  bool LowerRd = !S.customReadBarrier();
-  bool InitRoots = S.initializeRoots();
-
-  SmallVector<AllocaInst*, 32> Roots;
-
-  bool MadeChange = false;
-  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
-      if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) {
-        Function *F = CI->getCalledFunction();
-        switch (F->getIntrinsicID()) {
-        case Intrinsic::gcwrite:
-          if (LowerWr) {
-            // Replace a write barrier with a simple store.
-            Value *St = new StoreInst(CI->getArgOperand(0),
-                                      CI->getArgOperand(2), CI);
-            CI->replaceAllUsesWith(St);
-            CI->eraseFromParent();
-          }
-          break;
-        case Intrinsic::gcread:
-          if (LowerRd) {
-            // Replace a read barrier with a simple load.
-            Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI);
-            Ld->takeName(CI);
-            CI->replaceAllUsesWith(Ld);
-            CI->eraseFromParent();
-          }
-          break;
-        case Intrinsic::gcroot:
-          if (InitRoots) {
-            // Initialize the GC root, but do not delete the intrinsic. The
-            // backend needs the intrinsic to flag the stack slot.
-            Roots.push_back(cast<AllocaInst>(
-                              CI->getArgOperand(0)->stripPointerCasts()));
-          }
-          break;
-        default:
-          continue;
-        }
-
-        MadeChange = true;
-      }
-    }
-  }
-
-  if (Roots.size())
-    MadeChange |= InsertRootInitializers(F, Roots.begin(), Roots.size());
-
-  return MadeChange;
-}
-
-// -----------------------------------------------------------------------------
-
-char GCMachineCodeAnalysis::ID = 0;
-char &llvm::GCMachineCodeAnalysisID = GCMachineCodeAnalysis::ID;
-
-INITIALIZE_PASS(GCMachineCodeAnalysis, "gc-analysis",
-                "Analyze Machine Code For Garbage Collection", false, false)
-
-GCMachineCodeAnalysis::GCMachineCodeAnalysis()
-  : MachineFunctionPass(ID) {}
-
-void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
-  MachineFunctionPass::getAnalysisUsage(AU);
-  AU.setPreservesAll();
-  AU.addRequired<MachineModuleInfo>();
-  AU.addRequired<GCModuleInfo>();
-}
-
-MCSymbol *GCMachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB,
-                                             MachineBasicBlock::iterator MI,
-                                             DebugLoc DL) const {
-  MCSymbol *Label = MBB.getParent()->getContext().CreateTempSymbol();
-  BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
-  return Label;
-}
-
-void GCMachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) {
-  // Find the return address (next instruction), too, so as to bracket the call
-  // instruction.
-  MachineBasicBlock::iterator RAI = CI;
-  ++RAI;
-
-  if (FI->getStrategy().needsSafePoint(GC::PreCall)) {
-    MCSymbol* Label = InsertLabel(*CI->getParent(), CI, CI->getDebugLoc());
-    FI->addSafePoint(GC::PreCall, Label, CI->getDebugLoc());
-  }
-
-  if (FI->getStrategy().needsSafePoint(GC::PostCall)) {
-    MCSymbol* Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc());
-    FI->addSafePoint(GC::PostCall, Label, CI->getDebugLoc());
-  }
-}
-
-void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) {
-  for (MachineFunction::iterator BBI = MF.begin(),
-                                 BBE = MF.end(); BBI != BBE; ++BBI)
-    for (MachineBasicBlock::iterator MI = BBI->begin(),
-                                     ME = BBI->end(); MI != ME; ++MI)
-      if (MI->isCall())
-        VisitCallPoint(MI);
-}
-
-void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) {
-  const TargetFrameLowering *TFI = TM->getSubtargetImpl()->getFrameLowering();
-  assert(TFI && "TargetRegisterInfo not available!");
-
-  for (GCFunctionInfo::roots_iterator RI = FI->roots_begin();
-       RI != FI->roots_end();) {
-    // If the root references a dead object, no need to keep it.
-    if (MF.getFrameInfo()->isDeadObjectIndex(RI->Num)) {
-      RI = FI->removeStackRoot(RI);
-    } else {
-      RI->StackOffset = TFI->getFrameIndexOffset(MF, RI->Num);
-      ++RI;
-    }
-  }
-}
-
-bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) {
-  // Quick exit for functions that do not use GC.
-  if (!MF.getFunction()->hasGC())
-    return false;
-
-  FI = &getAnalysis<GCModuleInfo>().getFunctionInfo(*MF.getFunction());
-  if (!FI->getStrategy().needsSafePoints())
-    return false;
-
-  TM = &MF.getTarget();
-  MMI = &getAnalysis<MachineModuleInfo>();
-  TII = TM->getSubtargetImpl()->getInstrInfo();
-
-  // Find the size of the stack frame.
-  FI->setFrameSize(MF.getFrameInfo()->getStackSize());
-
-  // Find all safe points.
-  if (FI->getStrategy().customSafePoints()) {
-    FI->getStrategy().findCustomSafePoints(*FI, MF);
-  } else {
-    FindSafePoints(MF);
-  }
-
-  // Find the stack offsets for all roots.
-  FindStackOffsets(MF);
-
-  return false;
-}
+GCStrategy::GCStrategy()
+    : UseStatepoints(false), NeededSafePoints(0), CustomReadBarriers(false),
+      CustomWriteBarriers(false), CustomRoots(false), InitRoots(true),
+      UsesMetadata(false) {}
diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp
index 457d7d6..4188e5d 100644
--- a/lib/CodeGen/GlobalMerge.cpp
+++ b/lib/CodeGen/GlobalMerge.cpp
@@ -54,6 +54,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -64,7 +65,6 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -90,10 +90,16 @@ EnableGlobalMergeOnExternal("global-merge-on-external", cl::Hidden,
      cl::desc("Enable global merge pass on external linkage"),
      cl::init(false));
 
-STATISTIC(NumMerged      , "Number of globals merged");
+STATISTIC(NumMerged, "Number of globals merged");
 namespace {
   class GlobalMerge : public FunctionPass {
     const TargetMachine *TM;
+    const DataLayout *DL;
+    // FIXME: Infer the maximum possible offset depending on the actual users
+    // (these max offsets are different for the users inside Thumb or ARM
+    // functions), see the code that passes in the offset in the ARM backend
+    // for more information.
+    unsigned MaxOffset;
 
     bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
                  Module &M, bool isConst, unsigned AddrSpace) const;
@@ -117,8 +123,10 @@ namespace {
 
   public:
     static char ID;             // Pass identification, replacement for typeid.
-    explicit GlobalMerge(const TargetMachine *TM = nullptr)
-      : FunctionPass(ID), TM(TM) {
+    explicit GlobalMerge(const TargetMachine *TM = nullptr,
+                         unsigned MaximalOffset = 0)
+        : FunctionPass(ID), TM(TM), DL(TM->getDataLayout()),
+          MaxOffset(MaximalOffset) {
       initializeGlobalMergePass(*PassRegistry::getPassRegistry());
     }
 
@@ -138,22 +146,16 @@ namespace {
 } // end anonymous namespace
 
 char GlobalMerge::ID = 0;
-INITIALIZE_TM_PASS(GlobalMerge, "global-merge", "Merge global variables",
-                   false, false)
+INITIALIZE_PASS_BEGIN(GlobalMerge, "global-merge", "Merge global variables",
+                      false, false)
+INITIALIZE_PASS_END(GlobalMerge, "global-merge", "Merge global variables",
+                    false, false)
 
 bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
                           Module &M, bool isConst, unsigned AddrSpace) const {
-  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
-  const DataLayout *DL = TLI->getDataLayout();
-
-  // FIXME: Infer the maximum possible offset depending on the actual users
-  // (these max offsets are different for the users inside Thumb or ARM
-  // functions)
-  unsigned MaxOffset = TLI->getMaximalGlobalOffset();
-
   // FIXME: Find better heuristics
   std::stable_sort(Globals.begin(), Globals.end(),
-                   [DL](const GlobalVariable *GV1, const GlobalVariable *GV2) {
+                   [this](const GlobalVariable *GV1, const GlobalVariable *GV2) {
     Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
     Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
 
@@ -282,9 +284,6 @@ bool GlobalMerge::doInitialization(Module &M) {
 
   DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals,
                                                         BSSGlobals;
-  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
-  const DataLayout *DL = TLI->getDataLayout();
-  unsigned MaxOffset = TLI->getMaximalGlobalOffset();
   bool Changed = false;
   setMustKeepGlobalVariables(M);
 
@@ -357,6 +356,6 @@ bool GlobalMerge::doFinalization(Module &M) {
   return false;
 }
 
-Pass *llvm::createGlobalMergePass(const TargetMachine *TM) {
-  return new GlobalMerge(TM);
+Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset) {
+  return new GlobalMerge(TM, Offset);
 }
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index e84d25d..7a29569 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -271,15 +271,13 @@ INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
 INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false)
 
 bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
-  TLI = MF.getSubtarget().getTargetLowering();
-  TII = MF.getSubtarget().getInstrInfo();
-  TRI = MF.getSubtarget().getRegisterInfo();
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
+  TLI = ST.getTargetLowering();
+  TII = ST.getInstrInfo();
+  TRI = ST.getRegisterInfo();
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
   MRI = &MF.getRegInfo();
-
-  const TargetSubtargetInfo &ST =
-    MF.getTarget().getSubtarget<TargetSubtargetInfo>();
   SchedModel.init(ST.getSchedModel(), &ST, TII);
 
   if (!TII) return false;
@@ -290,7 +288,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
   if (!PreRegAlloc) {
     // Tail merge tend to expose more if-conversion opportunities.
     BranchFolder BF(true, false, *MBFI, *MBPI);
-    BFChange = BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
+    BFChange = BF.OptimizeFunction(MF, TII, ST.getRegisterInfo(),
                                    getAnalysisIfAvailable<MachineModuleInfo>());
   }
 
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 6a6e15d..f0d407f 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -508,6 +508,7 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
   SmallVector<std::pair<unsigned, VNInfo*>, 8> WorkList;
   WorkList.push_back(std::make_pair(UseReg, UseVNI));
 
+  LiveInterval &OrigLI = LIS.getInterval(Original);
   do {
     unsigned Reg;
     VNInfo *VNI;
@@ -521,8 +522,11 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
 
     // Trace through PHI-defs created by live range splitting.
     if (VNI->isPHIDef()) {
-      // Stop at original PHIs.  We don't know the value at the predecessors.
-      if (VNI->def == OrigVNI->def) {
+      // Stop at original PHIs.  We don't know the value at the
+      // predecessors. Look up the VNInfo for the current definition
+      // in OrigLI, to properly determine whether or not this phi was
+      // added by splitting.
+      if (VNI->def == OrigLI.getVNInfoAt(VNI->def)->def) {
         DEBUG(dbgs() << "orig phi value\n");
         SVI->second.DefByOrigPHI = true;
         SVI->second.AllDefsAreReloads = false;
@@ -542,7 +546,6 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
       // Separate all values dominated by OrigVNI into PHIs and non-PHIs.
       SmallVector<VNInfo*, 8> PHIs, NonPHIs;
       LiveInterval &LI = LIS.getInterval(Reg);
-      LiveInterval &OrigLI = LIS.getInterval(Original);
 
       for (LiveInterval::vni_iterator VI = LI.vni_begin(), VE = LI.vni_end();
            VI != VE; ++VI) {
@@ -573,8 +576,8 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
         std::tie(SVI, Inserted) =
           SibValues.insert(std::make_pair(NonPHI, SibValueInfo(Reg, NonPHI)));
         // Add all the PHIs as dependents of NonPHI.
-        for (unsigned pi = 0, pe = PHIs.size(); pi != pe; ++pi)
-          SVI->second.Deps.push_back(PHIs[pi]);
+        SVI->second.Deps.insert(SVI->second.Deps.end(), PHIs.begin(),
+                                PHIs.end());
         // This is the first time we see NonPHI, add it to the worklist.
         if (Inserted)
           WorkList.push_back(std::make_pair(Reg, NonPHI));
@@ -1088,7 +1091,8 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
   bool WasCopy = MI->isCopy();
   unsigned ImpReg = 0;
 
-  bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::PATCHPOINT ||
+  bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::STATEPOINT ||
+                       MI->getOpcode() == TargetOpcode::PATCHPOINT ||
                        MI->getOpcode() == TargetOpcode::STACKMAP);
 
   // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
@@ -1138,13 +1142,8 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
       continue;
     // FoldMI does not define this physreg. Remove the LI segment.
     assert(MO->isDead() && "Cannot fold physreg def");
-    for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) {
-      if (LiveRange *LR = LIS.getCachedRegUnit(*Units)) {
-        SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
-        if (VNInfo *VNI = LR->getVNInfoAt(Idx))
-          LR->removeValNo(VNI);
-      }
-    }
+    SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
+    LIS.removePhysRegDefAt(Reg, Idx);
   }
 
   LIS.ReplaceMachineInstrInMaps(MI, FoldMI);
diff --git a/lib/CodeGen/JumpInstrTables.cpp b/lib/CodeGen/JumpInstrTables.cpp
index 20f775c..75fa261 100644
--- a/lib/CodeGen/JumpInstrTables.cpp
+++ b/lib/CodeGen/JumpInstrTables.cpp
@@ -13,7 +13,6 @@
 #define DEBUG_TYPE "jt"
 
 #include "llvm/CodeGen/JumpInstrTables.h"
-
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/JumpInstrTableInfo.h"
 #include "llvm/CodeGen/Passes.h"
@@ -30,7 +29,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-
 #include <vector>
 
 using namespace llvm;
@@ -117,8 +115,8 @@ bool replaceGlobalValueIndirectUse(GlobalValue *GV, Value *V, Use *U) {
     if (!isa<GlobalAlias>(C))
       C->replaceUsesOfWithOnConstant(GV, V, U);
   } else {
-    assert(false && "The Use of a Function symbol is neither an instruction nor"
-                    " a constant");
+    llvm_unreachable("The Use of a Function symbol is neither an instruction "
+                     "nor a constant");
   }
 
   return true;
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 61face2..9c23368 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -12,23 +12,23 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Target/TargetMachine.h"
-
 #include "llvm/Analysis/JumpInstrTableInfo.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/ForwardControlFlowIntegrity.h"
 #include "llvm/CodeGen/JumpInstrTables.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
@@ -78,8 +78,10 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef Triple,
   CodeGenInfo = T.createMCCodeGenInfo(Triple, RM, CM, OL);
 }
 
-void LLVMTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  PM.add(createBasicTargetTransformInfoPass(this));
+TargetIRAnalysis LLVMTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](Function &F) {
+    return TargetTransformInfo(BasicTTIImpl(this, F));
+  });
 }
 
 /// addPassesToX helper drives creation and initialization of TargetPassConfig.
@@ -90,7 +92,7 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
                                           AnalysisID StopAfter) {
 
   // Add internal analysis passes from the target machine.
-  TM->addAnalysisPasses(PM);
+  PM.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
 
   // Targets may override createPassConfig to provide a target-specific
   // subclass.
@@ -114,7 +116,7 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
   // all the per-module stuff we're generating, including MCContext.
   MachineModuleInfo *MMI = new MachineModuleInfo(
       *TM->getMCAsmInfo(), *TM->getSubtargetImpl()->getRegisterInfo(),
-      &TM->getSubtargetImpl()->getTargetLowering()->getObjFileLowering());
+      TM->getObjFileLowering());
   PM.add(MMI);
 
   // Set up a MachineFunction for the rest of CodeGen to work on.
@@ -222,13 +224,11 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
   }
 
   // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
-  FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer);
+  FunctionPass *Printer =
+      getTarget().createAsmPrinter(*this, std::move(AsmStreamer));
   if (!Printer)
     return true;
 
-  // If successful, createAsmPrinter took ownership of AsmStreamer.
-  AsmStreamer.release();
-
   PM.add(Printer);
 
   return false;
@@ -262,20 +262,16 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
   if (!MCE || !MAB)
     return true;
 
-  std::unique_ptr<MCStreamer> AsmStreamer;
-  AsmStreamer.reset(getTarget()
-                        .createMCObjectStreamer(getTargetTriple(), *Ctx, *MAB,
-                                                Out, MCE, STI,
-                                                Options.MCOptions.MCRelaxAll));
+  std::unique_ptr<MCStreamer> AsmStreamer(getTarget().createMCObjectStreamer(
+      getTargetTriple(), *Ctx, *MAB, Out, MCE, STI,
+      Options.MCOptions.MCRelaxAll));
 
   // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
-  FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer);
+  FunctionPass *Printer =
+      getTarget().createAsmPrinter(*this, std::move(AsmStreamer));
   if (!Printer)
     return true;
 
-  // If successful, createAsmPrinter took ownership of AsmStreamer.
-  AsmStreamer.release();
-
   PM.add(Printer);
 
   return false; // success!
diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp
index b621e3b..9eaf7da 100644
--- a/lib/CodeGen/LexicalScopes.cpp
+++ b/lib/CodeGen/LexicalScopes.cpp
@@ -104,14 +104,6 @@ void LexicalScopes::extractLexicalScopes(
   }
 }
 
-LexicalScope *LexicalScopes::findInlinedScope(DebugLoc DL) {
-  MDNode *Scope = nullptr;
-  MDNode *IA = nullptr;
-  DL.getScopeAndInlinedAt(Scope, IA, MF->getFunction()->getContext());
-  auto I = InlinedLexicalScopeMap.find(std::make_pair(Scope, IA));
-  return I != InlinedLexicalScopeMap.end() ? &I->second : nullptr;
-}
-
 /// findLexicalScope - Find lexical scope, either regular or inlined, for the
 /// given DebugLoc. Return NULL if not found.
 LexicalScope *LexicalScopes::findLexicalScope(DebugLoc DL) {
@@ -168,11 +160,10 @@ LexicalScope *LexicalScopes::getOrCreateRegularScope(MDNode *Scope) {
   LexicalScope *Parent = nullptr;
   if (D.isLexicalBlock())
     Parent = getOrCreateLexicalScope(DebugLoc::getFromDILexicalBlock(Scope));
-  // FIXME: Use forward_as_tuple instead of make_tuple, once MSVC2012
-  // compatibility is no longer required.
-  I = LexicalScopeMap.emplace(std::piecewise_construct, std::make_tuple(Scope),
-                              std::make_tuple(Parent, DIDescriptor(Scope),
-                                              nullptr, false)).first;
+  I = LexicalScopeMap.emplace(std::piecewise_construct,
+                              std::forward_as_tuple(Scope),
+                              std::forward_as_tuple(Parent, DIDescriptor(Scope),
+                                                    nullptr, false)).first;
 
   if (!Parent) {
     assert(DIDescriptor(Scope).isSubprogram());
@@ -199,12 +190,11 @@ LexicalScope *LexicalScopes::getOrCreateInlinedScope(MDNode *ScopeNode,
   else
     Parent = getOrCreateInlinedScope(Scope.getContext(), InlinedAt);
 
-  // FIXME: Use forward_as_tuple instead of make_tuple, once MSVC2012
-  // compatibility is no longer required.
   I = InlinedLexicalScopeMap.emplace(std::piecewise_construct,
-                                     std::make_tuple(P),
-                                     std::make_tuple(Parent, Scope, InlinedAt,
-                                                     false)).first;
+                                     std::forward_as_tuple(P),
+                                     std::forward_as_tuple(Parent, Scope,
+                                                           InlinedAt, false))
+          .first;
   return &I->second;
 }
 
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 1624851..dc936a3 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -40,7 +40,6 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
-
 #include <memory>
 
 using namespace llvm;
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index 7e3b361..9748329 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -22,8 +22,8 @@
 #define LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/IR/DebugInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/IR/DebugInfo.h"
 
 namespace llvm {
 
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index ddb0032..fd7516d 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -26,11 +26,278 @@
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <algorithm>
 using namespace llvm;
 
+//===----------------------------------------------------------------------===//
+// Implementation of various methods necessary for calculation of live ranges.
+// The implementation of the methods abstracts from the concrete type of the
+// segment collection.
+//
+// Implementation of the class follows the Template design pattern. The base
+// class contains generic algorithms that call collection-specific methods,
+// which are provided in concrete subclasses. In order to avoid virtual calls
+// these methods are provided by means of C++ template instantiation.
+// The base class calls the methods of the subclass through method impl(),
+// which casts 'this' pointer to the type of the subclass.
+//
+//===----------------------------------------------------------------------===//
+
+template <typename ImplT, typename IteratorT, typename CollectionT>
+class CalcLiveRangeUtilBase {
+protected:
+  LiveRange *LR;
+
+protected:
+  CalcLiveRangeUtilBase(LiveRange *LR) : LR(LR) {}
+
+public:
+  typedef LiveRange::Segment Segment;
+  typedef IteratorT iterator;
+
+  VNInfo *createDeadDef(SlotIndex Def, VNInfo::Allocator &VNInfoAllocator) {
+    assert(!Def.isDead() && "Cannot define a value at the dead slot");
+
+    iterator I = impl().find(Def);
+    if (I == segments().end()) {
+      VNInfo *VNI = LR->getNextValue(Def, VNInfoAllocator);
+      impl().insertAtEnd(Segment(Def, Def.getDeadSlot(), VNI));
+      return VNI;
+    }
+
+    Segment *S = segmentAt(I);
+    if (SlotIndex::isSameInstr(Def, S->start)) {
+      assert(S->valno->def == S->start && "Inconsistent existing value def");
+
+      // It is possible to have both normal and early-clobber defs of the same
+      // register on an instruction. It doesn't make a lot of sense, but it is
+      // possible to specify in inline assembly.
+      //
+      // Just convert everything to early-clobber.
+      Def = std::min(Def, S->start);
+      if (Def != S->start)
+        S->start = S->valno->def = Def;
+      return S->valno;
+    }
+    assert(SlotIndex::isEarlierInstr(Def, S->start) && "Already live at def");
+    VNInfo *VNI = LR->getNextValue(Def, VNInfoAllocator);
+    segments().insert(I, Segment(Def, Def.getDeadSlot(), VNI));
+    return VNI;
+  }
+
+  VNInfo *extendInBlock(SlotIndex StartIdx, SlotIndex Use) {
+    if (segments().empty())
+      return nullptr;
+    iterator I =
+        impl().findInsertPos(Segment(Use.getPrevSlot(), Use, nullptr));
+    if (I == segments().begin())
+      return nullptr;
+    --I;
+    if (I->end <= StartIdx)
+      return nullptr;
+    if (I->end < Use)
+      extendSegmentEndTo(I, Use);
+    return I->valno;
+  }
+
+  /// This method is used when we want to extend the segment specified
+  /// by I to end at the specified endpoint. To do this, we should
+  /// merge and eliminate all segments that this will overlap
+  /// with. The iterator is not invalidated.
+  void extendSegmentEndTo(iterator I, SlotIndex NewEnd) {
+    assert(I != segments().end() && "Not a valid segment!");
+    Segment *S = segmentAt(I);
+    VNInfo *ValNo = I->valno;
+
+    // Search for the first segment that we can't merge with.
+    iterator MergeTo = std::next(I);
+    for (; MergeTo != segments().end() && NewEnd >= MergeTo->end; ++MergeTo)
+      assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+
+    // If NewEnd was in the middle of a segment, make sure to get its endpoint.
+    S->end = std::max(NewEnd, std::prev(MergeTo)->end);
+
+    // If the newly formed segment now touches the segment after it and if they
+    // have the same value number, merge the two segments into one segment.
+    if (MergeTo != segments().end() && MergeTo->start <= I->end &&
+        MergeTo->valno == ValNo) {
+      S->end = MergeTo->end;
+      ++MergeTo;
+    }
+
+    // Erase any dead segments.
+    segments().erase(std::next(I), MergeTo);
+  }
+
+  /// This method is used when we want to extend the segment specified
+  /// by I to start at the specified endpoint.  To do this, we should
+  /// merge and eliminate all segments that this will overlap with.
+  iterator extendSegmentStartTo(iterator I, SlotIndex NewStart) {
+    assert(I != segments().end() && "Not a valid segment!");
+    Segment *S = segmentAt(I);
+    VNInfo *ValNo = I->valno;
+
+    // Search for the first segment that we can't merge with.
+    iterator MergeTo = I;
+    do {
+      if (MergeTo == segments().begin()) {
+        S->start = NewStart;
+        segments().erase(MergeTo, I);
+        return I;
+      }
+      assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+      --MergeTo;
+    } while (NewStart <= MergeTo->start);
+
+    // If we start in the middle of another segment, just delete a range and
+    // extend that segment.
+    if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
+      segmentAt(MergeTo)->end = S->end;
+    } else {
+      // Otherwise, extend the segment right after.
+      ++MergeTo;
+      Segment *MergeToSeg = segmentAt(MergeTo);
+      MergeToSeg->start = NewStart;
+      MergeToSeg->end = S->end;
+    }
+
+    segments().erase(std::next(MergeTo), std::next(I));
+    return MergeTo;
+  }
+
+  iterator addSegment(Segment S) {
+    SlotIndex Start = S.start, End = S.end;
+    iterator I = impl().findInsertPos(S);
+
+    // If the inserted segment starts in the middle or right at the end of
+    // another segment, just extend that segment to contain the segment of S.
+    if (I != segments().begin()) {
+      iterator B = std::prev(I);
+      if (S.valno == B->valno) {
+        if (B->start <= Start && B->end >= Start) {
+          extendSegmentEndTo(B, End);
+          return B;
+        }
+      } else {
+        // Check to make sure that we are not overlapping two live segments with
+        // different valno's.
+        assert(B->end <= Start &&
+               "Cannot overlap two segments with differing ValID's"
+               " (did you def the same reg twice in a MachineInstr?)");
+      }
+    }
+
+    // Otherwise, if this segment ends in the middle of, or right next
+    // to, another segment, merge it into that segment.
+    if (I != segments().end()) {
+      if (S.valno == I->valno) {
+        if (I->start <= End) {
+          I = extendSegmentStartTo(I, Start);
+
+          // If S is a complete superset of a segment, we may need to grow its
+          // endpoint as well.
+          if (End > I->end)
+            extendSegmentEndTo(I, End);
+          return I;
+        }
+      } else {
+        // Check to make sure that we are not overlapping two live segments with
+        // different valno's.
+        assert(I->start >= End &&
+               "Cannot overlap two segments with differing ValID's");
+      }
+    }
+
+    // Otherwise, this is just a new segment that doesn't interact with
+    // anything.
+    // Insert it.
+    return segments().insert(I, S);
+  }
+
+private:
+  ImplT &impl() { return *static_cast<ImplT *>(this); }
+
+  CollectionT &segments() { return impl().segmentsColl(); }
+
+  Segment *segmentAt(iterator I) { return const_cast<Segment *>(&(*I)); }
+};
+
+//===----------------------------------------------------------------------===//
+//   Instantiation of the methods for calculation of live ranges
+//   based on a segment vector.
+//===----------------------------------------------------------------------===//
+
+class CalcLiveRangeUtilVector;
+typedef CalcLiveRangeUtilBase<CalcLiveRangeUtilVector, LiveRange::iterator,
+                              LiveRange::Segments> CalcLiveRangeUtilVectorBase;
+
+class CalcLiveRangeUtilVector : public CalcLiveRangeUtilVectorBase {
+public:
+  CalcLiveRangeUtilVector(LiveRange *LR) : CalcLiveRangeUtilVectorBase(LR) {}
+
+private:
+  friend CalcLiveRangeUtilVectorBase;
+
+  LiveRange::Segments &segmentsColl() { return LR->segments; }
+
+  void insertAtEnd(const Segment &S) { LR->segments.push_back(S); }
+
+  iterator find(SlotIndex Pos) { return LR->find(Pos); }
+
+  iterator findInsertPos(Segment S) {
+    return std::upper_bound(LR->begin(), LR->end(), S.start);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+//   Instantiation of the methods for calculation of live ranges
+//   based on a segment set.
+//===----------------------------------------------------------------------===//
+
+class CalcLiveRangeUtilSet;
+typedef CalcLiveRangeUtilBase<CalcLiveRangeUtilSet,
+                              LiveRange::SegmentSet::iterator,
+                              LiveRange::SegmentSet> CalcLiveRangeUtilSetBase;
+
+class CalcLiveRangeUtilSet : public CalcLiveRangeUtilSetBase {
+public:
+  CalcLiveRangeUtilSet(LiveRange *LR) : CalcLiveRangeUtilSetBase(LR) {}
+
+private:
+  friend CalcLiveRangeUtilSetBase;
+
+  LiveRange::SegmentSet &segmentsColl() { return *LR->segmentSet; }
+
+  void insertAtEnd(const Segment &S) {
+    LR->segmentSet->insert(LR->segmentSet->end(), S);
+  }
+
+  iterator find(SlotIndex Pos) {
+    iterator I =
+        LR->segmentSet->upper_bound(Segment(Pos, Pos.getNextSlot(), nullptr));
+    if (I == LR->segmentSet->begin())
+      return I;
+    iterator PrevI = std::prev(I);
+    if (Pos < (*PrevI).end)
+      return PrevI;
+    return I;
+  }
+
+  iterator findInsertPos(Segment S) {
+    iterator I = LR->segmentSet->upper_bound(S);
+    if (I != LR->segmentSet->end() && !(S.start < *I))
+      ++I;
+    return I;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+//   LiveRange methods
+//===----------------------------------------------------------------------===//
+
 LiveRange::iterator LiveRange::find(SlotIndex Pos) {
   // This algorithm is basically std::upper_bound.
   // Unfortunately, std::upper_bound cannot be used with mixed types until we
@@ -51,30 +318,11 @@ LiveRange::iterator LiveRange::find(SlotIndex Pos) {
 
 VNInfo *LiveRange::createDeadDef(SlotIndex Def,
                                   VNInfo::Allocator &VNInfoAllocator) {
-  assert(!Def.isDead() && "Cannot define a value at the dead slot");
-  iterator I = find(Def);
-  if (I == end()) {
-    VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
-    segments.push_back(Segment(Def, Def.getDeadSlot(), VNI));
-    return VNI;
-  }
-  if (SlotIndex::isSameInstr(Def, I->start)) {
-    assert(I->valno->def == I->start && "Inconsistent existing value def");
-
-    // It is possible to have both normal and early-clobber defs of the same
-    // register on an instruction. It doesn't make a lot of sense, but it is
-    // possible to specify in inline assembly.
-    //
-    // Just convert everything to early-clobber.
-    Def = std::min(Def, I->start);
-    if (Def != I->start)
-      I->start = I->valno->def = Def;
-    return I->valno;
-  }
-  assert(SlotIndex::isEarlierInstr(Def, I->start) && "Already live at def");
-  VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
-  segments.insert(I, Segment(Def, Def.getDeadSlot(), VNI));
-  return VNI;
+  // Use the segment set, if it is available.
+  if (segmentSet != nullptr)
+    return CalcLiveRangeUtilSet(this).createDeadDef(Def, VNInfoAllocator);
+  // Otherwise use the segment vector.
+  return CalcLiveRangeUtilVector(this).createDeadDef(Def, VNInfoAllocator);
 }
 
 // overlaps - Return true if the intersection of the two live ranges is
@@ -185,6 +433,27 @@ bool LiveRange::overlaps(SlotIndex Start, SlotIndex End) const {
   return I != begin() && (--I)->end > Start;
 }
 
+bool LiveRange::covers(const LiveRange &Other) const {
+  if (empty())
+    return Other.empty();
+
+  const_iterator I = begin();
+  for (const Segment &O : Other.segments) {
+    I = advanceTo(I, O.start);
+    if (I == end() || I->start > O.start)
+      return false;
+
+    // Check adjacent live segments and see if we can get behind O.end.
+    while (I->end < O.end) {
+      const_iterator Last = I;
+      // Get next segment and abort if it was not adjacent.
+      ++I;
+      if (I == end() || Last->end != I->start)
+        return false;
+    }
+  }
+  return true;
+}
 
 /// ValNo is dead, remove it.  If it is the largest value number, just nuke it
 /// (and any other deleted values neighboring it), otherwise mark it as ~1U so
@@ -204,8 +473,8 @@ void LiveRange::markValNoForDeletion(VNInfo *ValNo) {
 void LiveRange::RenumberValues() {
   SmallPtrSet<VNInfo*, 8> Seen;
   valnos.clear();
-  for (const_iterator I = begin(), E = end(); I != E; ++I) {
-    VNInfo *VNI = I->valno;
+  for (const Segment &S : segments) {
+    VNInfo *VNI = S.valno;
     if (!Seen.insert(VNI).second)
       continue;
     assert(!VNI->isUnused() && "Unused valno used by live segment");
@@ -214,133 +483,35 @@ void LiveRange::RenumberValues() {
   }
 }
 
-/// This method is used when we want to extend the segment specified by I to end
-/// at the specified endpoint.  To do this, we should merge and eliminate all
-/// segments that this will overlap with.  The iterator is not invalidated.
-void LiveRange::extendSegmentEndTo(iterator I, SlotIndex NewEnd) {
-  assert(I != end() && "Not a valid segment!");
-  VNInfo *ValNo = I->valno;
-
-  // Search for the first segment that we can't merge with.
-  iterator MergeTo = std::next(I);
-  for (; MergeTo != end() && NewEnd >= MergeTo->end; ++MergeTo) {
-    assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
-  }
-
-  // If NewEnd was in the middle of a segment, make sure to get its endpoint.
-  I->end = std::max(NewEnd, std::prev(MergeTo)->end);
-
-  // If the newly formed segment now touches the segment after it and if they
-  // have the same value number, merge the two segments into one segment.
-  if (MergeTo != end() && MergeTo->start <= I->end &&
-      MergeTo->valno == ValNo) {
-    I->end = MergeTo->end;
-    ++MergeTo;
-  }
-
-  // Erase any dead segments.
-  segments.erase(std::next(I), MergeTo);
+void LiveRange::addSegmentToSet(Segment S) {
+  CalcLiveRangeUtilSet(this).addSegment(S);
 }
 
-
-/// This method is used when we want to extend the segment specified by I to
-/// start at the specified endpoint.  To do this, we should merge and eliminate
-/// all segments that this will overlap with.
-LiveRange::iterator
-LiveRange::extendSegmentStartTo(iterator I, SlotIndex NewStart) {
-  assert(I != end() && "Not a valid segment!");
-  VNInfo *ValNo = I->valno;
-
-  // Search for the first segment that we can't merge with.
-  iterator MergeTo = I;
-  do {
-    if (MergeTo == begin()) {
-      I->start = NewStart;
-      segments.erase(MergeTo, I);
-      return I;
-    }
-    assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
-    --MergeTo;
-  } while (NewStart <= MergeTo->start);
-
-  // If we start in the middle of another segment, just delete a range and
-  // extend that segment.
-  if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
-    MergeTo->end = I->end;
-  } else {
-    // Otherwise, extend the segment right after.
-    ++MergeTo;
-    MergeTo->start = NewStart;
-    MergeTo->end = I->end;
+LiveRange::iterator LiveRange::addSegment(Segment S) {
+  // Use the segment set, if it is available.
+  if (segmentSet != nullptr) {
+    addSegmentToSet(S);
+    return end();
   }
-
-  segments.erase(std::next(MergeTo), std::next(I));
-  return MergeTo;
+  // Otherwise use the segment vector.
+  return CalcLiveRangeUtilVector(this).addSegment(S);
 }
 
-LiveRange::iterator LiveRange::addSegmentFrom(Segment S, iterator From) {
-  SlotIndex Start = S.start, End = S.end;
-  iterator it = std::upper_bound(From, end(), Start);
-
-  // If the inserted segment starts in the middle or right at the end of
-  // another segment, just extend that segment to contain the segment of S.
-  if (it != begin()) {
-    iterator B = std::prev(it);
-    if (S.valno == B->valno) {
-      if (B->start <= Start && B->end >= Start) {
-        extendSegmentEndTo(B, End);
-        return B;
-      }
-    } else {
-      // Check to make sure that we are not overlapping two live segments with
-      // different valno's.
-      assert(B->end <= Start &&
-             "Cannot overlap two segments with differing ValID's"
-             " (did you def the same reg twice in a MachineInstr?)");
-    }
-  }
-
-  // Otherwise, if this segment ends in the middle of, or right next to, another
-  // segment, merge it into that segment.
-  if (it != end()) {
-    if (S.valno == it->valno) {
-      if (it->start <= End) {
-        it = extendSegmentStartTo(it, Start);
-
-        // If S is a complete superset of a segment, we may need to grow its
-        // endpoint as well.
-        if (End > it->end)
-          extendSegmentEndTo(it, End);
-        return it;
-      }
-    } else {
-      // Check to make sure that we are not overlapping two live segments with
-      // different valno's.
-      assert(it->start >= End &&
-             "Cannot overlap two segments with differing ValID's");
-    }
-  }
-
-  // Otherwise, this is just a new segment that doesn't interact with anything.
-  // Insert it.
-  return segments.insert(it, S);
+void LiveRange::append(const Segment S) {
+  // Check that the segment belongs to the back of the list.
+  assert(segments.empty() || segments.back().end <= S.start);
+  segments.push_back(S);
 }
 
 /// extendInBlock - If this range is live before Kill in the basic
 /// block that starts at StartIdx, extend it to be live up to Kill and return
 /// the value. If there is no live range before Kill, return NULL.
 VNInfo *LiveRange::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
-  if (empty())
-    return nullptr;
-  iterator I = std::upper_bound(begin(), end(), Kill.getPrevSlot());
-  if (I == begin())
-    return nullptr;
-  --I;
-  if (I->end <= StartIdx)
-    return nullptr;
-  if (I->end < Kill)
-    extendSegmentEndTo(I, Kill);
-  return I->valno;
+  // Use the segment set, if it is available.
+  if (segmentSet != nullptr)
+    return CalcLiveRangeUtilSet(this).extendInBlock(StartIdx, Kill);
+  // Otherwise use the segment vector.
+  return CalcLiveRangeUtilVector(this).extendInBlock(StartIdx, Kill);
 }
 
 /// Remove the specified segment from this range.  Note that the segment must
@@ -461,8 +632,8 @@ void LiveRange::join(LiveRange &Other,
   // This can leave Other in an invalid state because we're not coalescing
   // touching segments that now have identical values. That's OK since Other is
   // not supposed to be valid after calling join();
-  for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
-    I->valno = NewVNInfo[RHSValNoAssignments[I->valno->id]];
+  for (Segment &S : Other.segments)
+    S.valno = NewVNInfo[RHSValNoAssignments[S.valno->id]];
 
   // Update val# info. Renumber them and make sure they all belong to this
   // LiveRange now. Also remove dead val#'s.
@@ -482,8 +653,8 @@ void LiveRange::join(LiveRange &Other,
 
   // Okay, now insert the RHS live segments into the LHS.
   LiveRangeUpdater Updater(this);
-  for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
-    Updater.add(*I);
+  for (Segment &S : Other.segments)
+    Updater.add(S);
 }
 
 /// Merge all of the segments in RHS into this live range as the specified
@@ -493,8 +664,8 @@ void LiveRange::join(LiveRange &Other,
 void LiveRange::MergeSegmentsInAsValue(const LiveRange &RHS,
                                        VNInfo *LHSValNo) {
   LiveRangeUpdater Updater(this);
-  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
-    Updater.add(I->start, I->end, LHSValNo);
+  for (const Segment &S : RHS.segments)
+    Updater.add(S.start, S.end, LHSValNo);
 }
 
 /// MergeValueInAsValue - Merge all of the live segments of a specific val#
@@ -506,9 +677,9 @@ void LiveRange::MergeValueInAsValue(const LiveRange &RHS,
                                     const VNInfo *RHSValNo,
                                     VNInfo *LHSValNo) {
   LiveRangeUpdater Updater(this);
-  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
-    if (I->valno == RHSValNo)
-      Updater.add(I->start, I->end, LHSValNo);
+  for (const Segment &S : RHS.segments)
+    if (S.valno == RHSValNo)
+      Updater.add(S.start, S.end, LHSValNo);
 }
 
 /// MergeValueNumberInto - This method is called when two value nubmers
@@ -570,10 +741,258 @@ VNInfo *LiveRange::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
   return V2;
 }
 
+void LiveRange::flushSegmentSet() {
+  assert(segmentSet != nullptr && "segment set must have been created");
+  assert(
+      segments.empty() &&
+      "segment set can be used only initially before switching to the array");
+  segments.append(segmentSet->begin(), segmentSet->end());
+  delete segmentSet;
+  segmentSet = nullptr;
+  verify();
+}
+
+void LiveInterval::freeSubRange(SubRange *S) {
+  S->~SubRange();
+  // Memory was allocated with BumpPtr allocator and is not freed here.
+}
+
+void LiveInterval::removeEmptySubRanges() {
+  SubRange **NextPtr = &SubRanges;
+  SubRange *I = *NextPtr;
+  while (I != nullptr) {
+    if (!I->empty()) {
+      NextPtr = &I->Next;
+      I = *NextPtr;
+      continue;
+    }
+    // Skip empty subranges until we find the first nonempty one.
+    do {
+      SubRange *Next = I->Next;
+      freeSubRange(I);
+      I = Next;
+    } while (I != nullptr && I->empty());
+    *NextPtr = I;
+  }
+}
+
+void LiveInterval::clearSubRanges() {
+  for (SubRange *I = SubRanges, *Next; I != nullptr; I = Next) {
+    Next = I->Next;
+    freeSubRange(I);
+  }
+  SubRanges = nullptr;
+}
+
+/// Helper function for constructMainRangeFromSubranges(): Search the CFG
+/// backwards until we find a place covered by a LiveRange segment that actually
+/// has a valno set.
+static VNInfo *searchForVNI(const SlotIndexes &Indexes, LiveRange &LR,
+    const MachineBasicBlock *MBB,
+    SmallPtrSetImpl<const MachineBasicBlock*> &Visited) {
+  // We start the search at the end of MBB.
+  SlotIndex EndIdx = Indexes.getMBBEndIdx(MBB);
+  // In our use case we can't live the area covered by the live segments without
+  // finding an actual VNI def.
+  LiveRange::iterator I = LR.find(EndIdx.getPrevSlot());
+  assert(I != LR.end());
+  LiveRange::Segment &S = *I;
+  if (S.valno != nullptr)
+    return S.valno;
+
+  VNInfo *VNI = nullptr;
+  // Continue at predecessors (we could even go to idom with domtree available).
+  for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+    // Avoid going in circles.
+    if (!Visited.insert(Pred).second)
+      continue;
+
+    VNI = searchForVNI(Indexes, LR, Pred, Visited);
+    if (VNI != nullptr) {
+      S.valno = VNI;
+      break;
+    }
+  }
+
+  return VNI;
+}
+
+static void determineMissingVNIs(const SlotIndexes &Indexes, LiveInterval &LI) {
+  SmallPtrSet<const MachineBasicBlock*, 5> Visited;
+  for (LiveRange::Segment &S : LI.segments) {
+    if (S.valno != nullptr)
+      continue;
+    // This can only happen at the begin of a basic block.
+    assert(S.start.isBlock() && "valno should only be missing at block begin");
+
+    Visited.clear();
+    const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(S.start);
+    for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+      VNInfo *VNI = searchForVNI(Indexes, LI, Pred, Visited);
+      if (VNI != nullptr) {
+        S.valno = VNI;
+        break;
+      }
+    }
+    assert(S.valno != nullptr && "could not determine valno");
+  }
+}
+
+void LiveInterval::constructMainRangeFromSubranges(
+    const SlotIndexes &Indexes, VNInfo::Allocator &VNIAllocator) {
+  // The basic observations on which this algorithm is based:
+  // - Each Def/ValNo in a subrange must have a corresponding def on the main
+  //   range, but not further defs/valnos are necessary.
+  // - If any of the subranges is live at a point the main liverange has to be
+  //   live too, conversily if no subrange is live the main range mustn't be
+  //   live either.
+  // We do this by scannig through all the subranges simultaneously creating new
+  // segments in the main range as segments start/ends come up in the subranges.
+  assert(hasSubRanges() && "expected subranges to be present");
+  assert(segments.empty() && valnos.empty() && "expected empty main range");
+
+  // Collect subrange, iterator pairs for the walk and determine first and last
+  // SlotIndex involved.
+  SmallVector<std::pair<const SubRange*, const_iterator>, 4> SRs;
+  SlotIndex First;
+  SlotIndex Last;
+  for (const SubRange &SR : subranges()) {
+    if (SR.empty())
+      continue;
+    SRs.push_back(std::make_pair(&SR, SR.begin()));
+    if (!First.isValid() || SR.segments.front().start < First)
+      First = SR.segments.front().start;
+    if (!Last.isValid() || SR.segments.back().end > Last)
+      Last = SR.segments.back().end;
+  }
+
+  // Walk over all subranges simultaneously.
+  Segment CurrentSegment;
+  bool ConstructingSegment = false;
+  bool NeedVNIFixup = false;
+  unsigned ActiveMask = 0;
+  SlotIndex Pos = First;
+  while (true) {
+    SlotIndex NextPos = Last;
+    enum {
+      NOTHING,
+      BEGIN_SEGMENT,
+      END_SEGMENT,
+    } Event = NOTHING;
+    // Which subregister lanes are affected by the current event.
+    unsigned EventMask = 0;
+    // Whether a BEGIN_SEGMENT is also a valno definition point.
+    bool IsDef = false;
+    // Find the next begin or end of a subrange segment. Combine masks if we
+    // have multiple begins/ends at the same position. Ends take precedence over
+    // Begins.
+    for (auto &SRP : SRs) {
+      const SubRange &SR = *SRP.first;
+      const_iterator &I = SRP.second;
+      // Advance iterator of subrange to a segment involving Pos; the earlier
+      // segments are already merged at this point.
+      while (I != SR.end() &&
+             (I->end < Pos ||
+              (I->end == Pos && (ActiveMask & SR.LaneMask) == 0)))
+        ++I;
+      if (I == SR.end())
+        continue;
+      if ((ActiveMask & SR.LaneMask) == 0 &&
+          Pos <= I->start && I->start <= NextPos) {
+        // Merge multiple begins at the same position.
+        if (I->start == NextPos && Event == BEGIN_SEGMENT) {
+          EventMask |= SR.LaneMask;
+          IsDef |= I->valno->def == I->start;
+        } else if (I->start < NextPos || Event != END_SEGMENT) {
+          Event = BEGIN_SEGMENT;
+          NextPos = I->start;
+          EventMask = SR.LaneMask;
+          IsDef = I->valno->def == I->start;
+        }
+      }
+      if ((ActiveMask & SR.LaneMask) != 0 &&
+          Pos <= I->end && I->end <= NextPos) {
+        // Merge multiple ends at the same position.
+        if (I->end == NextPos && Event == END_SEGMENT)
+          EventMask |= SR.LaneMask;
+        else {
+          Event = END_SEGMENT;
+          NextPos = I->end;
+          EventMask = SR.LaneMask;
+        }
+      }
+    }
+
+    // Advance scan position.
+    Pos = NextPos;
+    if (Event == BEGIN_SEGMENT) {
+      if (ConstructingSegment && IsDef) {
+        // Finish previous segment because we have to start a new one.
+        CurrentSegment.end = Pos;
+        append(CurrentSegment);
+        ConstructingSegment = false;
+      }
+
+      // Start a new segment if necessary.
+      if (!ConstructingSegment) {
+        // Determine value number for the segment.
+        VNInfo *VNI;
+        if (IsDef) {
+          VNI = getNextValue(Pos, VNIAllocator);
+        } else {
+          // We have to reuse an existing value number, if we are lucky
+          // then we already passed one of the predecessor blocks and determined
+          // its value number (with blocks in reverse postorder this would be
+          // always true but we have no such guarantee).
+          assert(Pos.isBlock());
+          const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(Pos);
+          // See if any of the predecessor blocks has a lower number and a VNI
+          for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+            SlotIndex PredEnd = Indexes.getMBBEndIdx(Pred);
+            VNI = getVNInfoBefore(PredEnd);
+            if (VNI != nullptr)
+              break;
+          }
+          // Def will come later: We have to do an extra fixup pass.
+          if (VNI == nullptr)
+            NeedVNIFixup = true;
+        }
+
+        CurrentSegment.start = Pos;
+        CurrentSegment.valno = VNI;
+        ConstructingSegment = true;
+      }
+      ActiveMask |= EventMask;
+    } else if (Event == END_SEGMENT) {
+      assert(ConstructingSegment);
+      // Finish segment if no lane is active anymore.
+      ActiveMask &= ~EventMask;
+      if (ActiveMask == 0) {
+        CurrentSegment.end = Pos;
+        append(CurrentSegment);
+        ConstructingSegment = false;
+      }
+    } else {
+      // We reached the end of the last subranges and can stop.
+      assert(Event == NOTHING);
+      break;
+    }
+  }
+
+  // We might not be able to assign new valnos for all segments if the basic
+  // block containing the definition comes after a segment using the valno.
+  // Do a fixup pass for this uncommon case.
+  if (NeedVNIFixup)
+    determineMissingVNIs(Indexes, *this);
+
+  assert(ActiveMask == 0 && !ConstructingSegment && "all segments ended");
+  verify();
+}
+
 unsigned LiveInterval::getSize() const {
   unsigned Sum = 0;
-  for (const_iterator I = begin(), E = end(); I != E; ++I)
-    Sum += I->start.distance(I->end);
+  for (const Segment &S : segments)
+    Sum += S.start.distance(S.end);
   return Sum;
 }
 
@@ -591,9 +1010,9 @@ void LiveRange::print(raw_ostream &OS) const {
   if (empty())
     OS << "EMPTY";
   else {
-    for (const_iterator I = begin(), E = end(); I != E; ++I) {
-      OS << *I;
-      assert(I->valno == getValNumInfo(I->valno->id) && "Bad VNInfo");
+    for (const Segment &S : segments) {
+      OS << S;
+      assert(S.valno == getValNumInfo(S.valno->id) && "Bad VNInfo");
     }
   }
 
@@ -620,6 +1039,10 @@ void LiveRange::print(raw_ostream &OS) const {
 void LiveInterval::print(raw_ostream &OS) const {
   OS << PrintReg(reg) << ' ';
   super::print(OS);
+  // Print subranges
+  for (const SubRange &SR : subranges()) {
+    OS << format(" L%04X ", SR.LaneMask) << SR;
+  }
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -648,6 +1071,26 @@ void LiveRange::verify() const {
     }
   }
 }
+
+void LiveInterval::verify(const MachineRegisterInfo *MRI) const {
+  super::verify();
+
+  // Make sure SubRanges are fine and LaneMasks are disjunct.
+  unsigned Mask = 0;
+  unsigned MaxMask = MRI != nullptr ? MRI->getMaxLaneMaskForVReg(reg) : ~0u;
+  for (const SubRange &SR : subranges()) {
+    // Subrange lanemask should be disjunct to any previous subrange masks.
+    assert((Mask & SR.LaneMask) == 0);
+    Mask |= SR.LaneMask;
+
+    // subrange mask should not contained in maximum lane mask for the vreg.
+    assert((Mask & ~MaxMask) == 0);
+
+    SR.verify();
+    // Main liverange should cover subrange.
+    assert(covers(SR));
+  }
+}
 #endif
 
 
@@ -692,14 +1135,14 @@ void LiveRangeUpdater::print(raw_ostream &OS) const {
   OS << " updater with gap = " << (ReadI - WriteI)
      << ", last start = " << LastStart
      << ":\n  Area 1:";
-  for (LiveRange::const_iterator I = LR->begin(); I != WriteI; ++I)
-    OS << ' ' << *I;
+  for (const auto &S : make_range(LR->begin(), WriteI))
+    OS << ' ' << S;
   OS << "\n  Spills:";
   for (unsigned I = 0, E = Spills.size(); I != E; ++I)
     OS << ' ' << Spills[I];
   OS << "\n  Area 2:";
-  for (LiveRange::const_iterator I = ReadI, E = LR->end(); I != E; ++I)
-    OS << ' ' << *I;
+  for (const auto &S : make_range(ReadI, LR->end()))
+    OS << ' ' << S;
   OS << '\n';
 }
 
@@ -723,6 +1166,13 @@ static inline bool coalescable(const LiveRange::Segment &A,
 void LiveRangeUpdater::add(LiveRange::Segment Seg) {
   assert(LR && "Cannot add to a null destination");
 
+  // Fall back to the regular add method if the live range
+  // is using the segment set instead of the segment vector.
+  if (LR->segmentSet != nullptr) {
+    LR->addSegmentToSet(Seg);
+    return;
+  }
+
   // Flush the state if Start moves backwards.
   if (!LastStart.isValid() || LastStart > Seg.start) {
     if (isDirty())
@@ -860,9 +1310,7 @@ unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) {
   const VNInfo *used = nullptr, *unused = nullptr;
 
   // Determine connections.
-  for (LiveInterval::const_vni_iterator I = LI->vni_begin(), E = LI->vni_end();
-       I != E; ++I) {
-    const VNInfo *VNI = *I;
+  for (const VNInfo *VNI : LI->valnos) {
     // Group all unused values into one class.
     if (VNI->isUnused()) {
       if (unused)
@@ -938,6 +1386,8 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[],
     } else
       *J++ = *I;
   }
+  // TODO: do not cheat anymore by simply cleaning all subranges
+  LI.clearSubRanges();
   LI.segments.erase(J, E);
 
   // Transfer VNInfos to their new owners and renumber them.
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 1742e63..cc08045 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -62,6 +63,17 @@ static cl::opt<bool> EnablePrecomputePhysRegs(
 static bool EnablePrecomputePhysRegs = false;
 #endif // NDEBUG
 
+static cl::opt<bool> EnableSubRegLiveness(
+  "enable-subreg-liveness", cl::Hidden, cl::init(true),
+  cl::desc("Enable subregister liveness tracking."));
+
+namespace llvm {
+cl::opt<bool> UseSegmentSetForPhysRegs(
+    "use-segment-set-for-physregs", cl::Hidden, cl::init(true),
+    cl::desc(
+        "Use segment set for the computation of the live ranges of physregs."));
+}
+
 void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
   AU.addRequired<AliasAnalysis>();
@@ -115,6 +127,10 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
   AA = &getAnalysis<AliasAnalysis>();
   Indexes = &getAnalysis<SlotIndexes>();
   DomTree = &getAnalysis<MachineDominatorTree>();
+
+  if (EnableSubRegLiveness && MF->getSubtarget().enableSubRegLiveness())
+    MRI->enableSubRegLiveness(true);
+
   if (!LRCalc)
     LRCalc = new LiveRangeCalc();
 
@@ -183,9 +199,8 @@ void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
   assert(LRCalc && "LRCalc not initialized.");
   assert(LI.empty() && "Should only compute empty intervals.");
   LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
-  LRCalc->createDeadDefs(LI);
-  LRCalc->extendToUses(LI);
-  computeDeadValues(&LI, LI, nullptr, nullptr);
+  LRCalc->calculate(LI);
+  computeDeadValues(LI, nullptr);
 }
 
 void LiveIntervals::computeVirtRegs() {
@@ -260,6 +275,10 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
         LRCalc->extendToUses(LR, Reg);
     }
   }
+
+  // Flush the segment set to the segment vector.
+  if (UseSegmentSetForPhysRegs)
+    LR.flushSegmentSet();
 }
 
 
@@ -292,7 +311,8 @@ void LiveIntervals::computeLiveInRegUnits() {
         unsigned Unit = *Units;
         LiveRange *LR = RegUnitRanges[Unit];
         if (!LR) {
-          LR = RegUnitRanges[Unit] = new LiveRange();
+          // Use segment set to speed-up initial computation of the live range.
+          LR = RegUnitRanges[Unit] = new LiveRange(UseSegmentSetForPhysRegs);
           NewRanges.push_back(Unit);
         }
         VNInfo *VNI = LR->createDeadDef(Begin, getVNInfoAllocator());
@@ -312,6 +332,70 @@ void LiveIntervals::computeLiveInRegUnits() {
 }
 
 
+static void createSegmentsForValues(LiveRange &LR,
+      iterator_range<LiveInterval::vni_iterator> VNIs) {
+  for (auto VNI : VNIs) {
+    if (VNI->isUnused())
+      continue;
+    SlotIndex Def = VNI->def;
+    LR.addSegment(LiveRange::Segment(Def, Def.getDeadSlot(), VNI));
+  }
+}
+
+typedef SmallVector<std::pair<SlotIndex, VNInfo*>, 16> ShrinkToUsesWorkList;
+
+static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
+                                 ShrinkToUsesWorkList &WorkList,
+                                 const LiveRange &OldRange) {
+  // Keep track of the PHIs that are in use.
+  SmallPtrSet<VNInfo*, 8> UsedPHIs;
+  // Blocks that have already been added to WorkList as live-out.
+  SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
+
+  // Extend intervals to reach all uses in WorkList.
+  while (!WorkList.empty()) {
+    SlotIndex Idx = WorkList.back().first;
+    VNInfo *VNI = WorkList.back().second;
+    WorkList.pop_back();
+    const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(Idx.getPrevSlot());
+    SlotIndex BlockStart = Indexes.getMBBStartIdx(MBB);
+
+    // Extend the live range for VNI to be live at Idx.
+    if (VNInfo *ExtVNI = LR.extendInBlock(BlockStart, Idx)) {
+      assert(ExtVNI == VNI && "Unexpected existing value number");
+      (void)ExtVNI;
+      // Is this a PHIDef we haven't seen before?
+      if (!VNI->isPHIDef() || VNI->def != BlockStart ||
+          !UsedPHIs.insert(VNI).second)
+        continue;
+      // The PHI is live, make sure the predecessors are live-out.
+      for (auto &Pred : MBB->predecessors()) {
+        if (!LiveOut.insert(Pred).second)
+          continue;
+        SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
+        // A predecessor is not required to have a live-out value for a PHI.
+        if (VNInfo *PVNI = OldRange.getVNInfoBefore(Stop))
+          WorkList.push_back(std::make_pair(Stop, PVNI));
+      }
+      continue;
+    }
+
+    // VNI is live-in to MBB.
+    DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
+    LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
+
+    // Make sure VNI is live-out from the predecessors.
+    for (auto &Pred : MBB->predecessors()) {
+      if (!LiveOut.insert(Pred).second)
+        continue;
+      SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
+      assert(OldRange.getVNInfoBefore(Stop) == VNI &&
+             "Wrong value out of predecessor");
+      WorkList.push_back(std::make_pair(Stop, VNI));
+    }
+  }
+}
+
 /// shrinkToUses - After removing some uses of a register, shrink its live
 /// range to just the remaining uses. This method does not compute reaching
 /// defs for new uses, and it doesn't remove dead defs.
@@ -320,11 +404,14 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
   DEBUG(dbgs() << "Shrink: " << *li << '\n');
   assert(TargetRegisterInfo::isVirtualRegister(li->reg)
          && "Can only shrink virtual registers");
-  // Find all the values used, including PHI kills.
-  SmallVector<std::pair<SlotIndex, VNInfo*>, 16> WorkList;
 
-  // Blocks that have already been added to WorkList as live-out.
-  SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
+  // Shrink subregister live ranges.
+  for (LiveInterval::SubRange &S : li->subranges()) {
+    shrinkToUses(S, li->reg);
+  }
+
+  // Find all the values used, including PHI kills.
+  ShrinkToUsesWorkList WorkList;
 
   // Visit all instructions reading li->reg.
   for (MachineRegisterInfo::reg_instr_iterator
@@ -355,103 +442,126 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
 
   // Create new live ranges with only minimal live segments per def.
   LiveRange NewLR;
-  for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end();
-       I != E; ++I) {
-    VNInfo *VNI = *I;
-    if (VNI->isUnused())
-      continue;
-    NewLR.addSegment(LiveRange::Segment(VNI->def, VNI->def.getDeadSlot(), VNI));
-  }
+  createSegmentsForValues(NewLR, make_range(li->vni_begin(), li->vni_end()));
+  extendSegmentsToUses(NewLR, *Indexes, WorkList, *li);
 
-  // Keep track of the PHIs that are in use.
-  SmallPtrSet<VNInfo*, 8> UsedPHIs;
+  // Move the trimmed segments back.
+  li->segments.swap(NewLR.segments);
 
-  // Extend intervals to reach all uses in WorkList.
-  while (!WorkList.empty()) {
-    SlotIndex Idx = WorkList.back().first;
-    VNInfo *VNI = WorkList.back().second;
-    WorkList.pop_back();
-    const MachineBasicBlock *MBB = getMBBFromIndex(Idx.getPrevSlot());
-    SlotIndex BlockStart = getMBBStartIdx(MBB);
+  // Handle dead values.
+  bool CanSeparate = computeDeadValues(*li, dead);
+  DEBUG(dbgs() << "Shrunk: " << *li << '\n');
+  return CanSeparate;
+}
 
-    // Extend the live range for VNI to be live at Idx.
-    if (VNInfo *ExtVNI = NewLR.extendInBlock(BlockStart, Idx)) {
-      (void)ExtVNI;
-      assert(ExtVNI == VNI && "Unexpected existing value number");
-      // Is this a PHIDef we haven't seen before?
-      if (!VNI->isPHIDef() || VNI->def != BlockStart ||
-          !UsedPHIs.insert(VNI).second)
-        continue;
-      // The PHI is live, make sure the predecessors are live-out.
-      for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
-           PE = MBB->pred_end(); PI != PE; ++PI) {
-        if (!LiveOut.insert(*PI).second)
-          continue;
-        SlotIndex Stop = getMBBEndIdx(*PI);
-        // A predecessor is not required to have a live-out value for a PHI.
-        if (VNInfo *PVNI = li->getVNInfoBefore(Stop))
-          WorkList.push_back(std::make_pair(Stop, PVNI));
+bool LiveIntervals::computeDeadValues(LiveInterval &LI,
+                                      SmallVectorImpl<MachineInstr*> *dead) {
+  bool PHIRemoved = false;
+  for (auto VNI : LI.valnos) {
+    if (VNI->isUnused())
+      continue;
+    SlotIndex Def = VNI->def;
+    LiveRange::iterator I = LI.FindSegmentContaining(Def);
+    assert(I != LI.end() && "Missing segment for VNI");
+
+    // Is the register live before? Otherwise we may have to add a read-undef
+    // flag for subregister defs.
+    if (MRI->tracksSubRegLiveness()) {
+      if ((I == LI.begin() || std::prev(I)->end < Def) && !VNI->isPHIDef()) {
+        MachineInstr *MI = getInstructionFromIndex(Def);
+        MI->addRegisterDefReadUndef(LI.reg);
       }
+    }
+
+    if (I->end != Def.getDeadSlot())
       continue;
+    if (VNI->isPHIDef()) {
+      // This is a dead PHI. Remove it.
+      VNI->markUnused();
+      LI.removeSegment(I);
+      DEBUG(dbgs() << "Dead PHI at " << Def << " may separate interval\n");
+      PHIRemoved = true;
+    } else {
+      // This is a dead def. Make sure the instruction knows.
+      MachineInstr *MI = getInstructionFromIndex(Def);
+      assert(MI && "No instruction defining live value");
+      MI->addRegisterDead(LI.reg, TRI);
+      if (dead && MI->allDefsAreDead()) {
+        DEBUG(dbgs() << "All defs dead: " << Def << '\t' << *MI);
+        dead->push_back(MI);
+      }
     }
+  }
+  return PHIRemoved;
+}
 
-    // VNI is live-in to MBB.
-    DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
-    NewLR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
+void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg)
+{
+  DEBUG(dbgs() << "Shrink: " << SR << '\n');
+  assert(TargetRegisterInfo::isVirtualRegister(Reg)
+         && "Can only shrink virtual registers");
+  // Find all the values used, including PHI kills.
+  ShrinkToUsesWorkList WorkList;
 
-    // Make sure VNI is live-out from the predecessors.
-    for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
-         PE = MBB->pred_end(); PI != PE; ++PI) {
-      if (!LiveOut.insert(*PI).second)
+  // Visit all instructions reading Reg.
+  SlotIndex LastIdx;
+  for (MachineOperand &MO : MRI->reg_operands(Reg)) {
+    MachineInstr *UseMI = MO.getParent();
+    if (UseMI->isDebugValue())
+      continue;
+    // Maybe the operand is for a subregister we don't care about.
+    unsigned SubReg = MO.getSubReg();
+    if (SubReg != 0) {
+      unsigned SubRegMask = TRI->getSubRegIndexLaneMask(SubReg);
+      if ((SubRegMask & SR.LaneMask) == 0)
         continue;
-      SlotIndex Stop = getMBBEndIdx(*PI);
-      assert(li->getVNInfoBefore(Stop) == VNI &&
-             "Wrong value out of predecessor");
-      WorkList.push_back(std::make_pair(Stop, VNI));
     }
+    // We only need to visit each instruction once.
+    SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
+    if (Idx == LastIdx)
+      continue;
+    LastIdx = Idx;
+
+    LiveQueryResult LRQ = SR.Query(Idx);
+    VNInfo *VNI = LRQ.valueIn();
+    // For Subranges it is possible that only undef values are left in that
+    // part of the subregister, so there is no real liverange at the use
+    if (!VNI)
+      continue;
+
+    // Special case: An early-clobber tied operand reads and writes the
+    // register one slot early.
+    if (VNInfo *DefVNI = LRQ.valueDefined())
+      Idx = DefVNI->def;
+
+    WorkList.push_back(std::make_pair(Idx, VNI));
   }
 
-  // Handle dead values.
-  bool CanSeparate = false;
-  computeDeadValues(li, NewLR, &CanSeparate, dead);
+  // Create a new live ranges with only minimal live segments per def.
+  LiveRange NewLR;
+  createSegmentsForValues(NewLR, make_range(SR.vni_begin(), SR.vni_end()));
+  extendSegmentsToUses(NewLR, *Indexes, WorkList, SR);
 
-  // Move the trimmed segments back.
-  li->segments.swap(NewLR.segments);
-  DEBUG(dbgs() << "Shrunk: " << *li << '\n');
-  return CanSeparate;
-}
+  // Move the trimmed ranges back.
+  SR.segments.swap(NewLR.segments);
 
-void LiveIntervals::computeDeadValues(LiveInterval *li,
-                                      LiveRange &LR,
-                                      bool *CanSeparate,
-                                      SmallVectorImpl<MachineInstr*> *dead) {
-  for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end();
-       I != E; ++I) {
-    VNInfo *VNI = *I;
+  // Remove dead PHI value numbers
+  for (auto VNI : SR.valnos) {
     if (VNI->isUnused())
       continue;
-    LiveRange::iterator LRI = LR.FindSegmentContaining(VNI->def);
-    assert(LRI != LR.end() && "Missing segment for PHI");
-    if (LRI->end != VNI->def.getDeadSlot())
+    const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def);
+    assert(Segment != nullptr && "Missing segment for VNI");
+    if (Segment->end != VNI->def.getDeadSlot())
       continue;
     if (VNI->isPHIDef()) {
       // This is a dead PHI. Remove it.
       VNI->markUnused();
-      LR.removeSegment(LRI->start, LRI->end);
+      SR.removeSegment(*Segment);
       DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
-      if (CanSeparate)
-        *CanSeparate = true;
-    } else {
-      // This is a dead def. Make sure the instruction knows.
-      MachineInstr *MI = getInstructionFromIndex(VNI->def);
-      assert(MI && "No instruction defining live value");
-      MI->addRegisterDead(li->reg, TRI);
-      if (dead && MI->allDefsAreDead()) {
-        DEBUG(dbgs() << "All defs dead: " << VNI->def << '\t' << *MI);
-        dead->push_back(MI);
-      }
     }
   }
+
+  DEBUG(dbgs() << "Shrunk: " << SR << '\n');
 }
 
 void LiveIntervals::extendToIndices(LiveRange &LR,
@@ -462,26 +572,25 @@ void LiveIntervals::extendToIndices(LiveRange &LR,
     LRCalc->extend(LR, Indices[i]);
 }
 
-void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
+void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
                                SmallVectorImpl<SlotIndex> *EndPoints) {
-  LiveQueryResult LRQ = LI->Query(Kill);
-  VNInfo *VNI = LRQ.valueOut();
+  LiveQueryResult LRQ = LR.Query(Kill);
+  VNInfo *VNI = LRQ.valueOutOrDead();
   if (!VNI)
     return;
 
   MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill);
-  SlotIndex MBBStart, MBBEnd;
-  std::tie(MBBStart, MBBEnd) = Indexes->getMBBRange(KillMBB);
+  SlotIndex MBBEnd = Indexes->getMBBEndIdx(KillMBB);
 
   // If VNI isn't live out from KillMBB, the value is trivially pruned.
   if (LRQ.endPoint() < MBBEnd) {
-    LI->removeSegment(Kill, LRQ.endPoint());
+    LR.removeSegment(Kill, LRQ.endPoint());
     if (EndPoints) EndPoints->push_back(LRQ.endPoint());
     return;
   }
 
   // VNI is live out of KillMBB.
-  LI->removeSegment(Kill, MBBEnd);
+  LR.removeSegment(Kill, MBBEnd);
   if (EndPoints) EndPoints->push_back(MBBEnd);
 
   // Find all blocks that are reachable from KillMBB without leaving VNI's live
@@ -498,8 +607,9 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
       MachineBasicBlock *MBB = *I;
 
       // Check if VNI is live in to MBB.
+      SlotIndex MBBStart, MBBEnd;
       std::tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB);
-      LiveQueryResult LRQ = LI->Query(MBBStart);
+      LiveQueryResult LRQ = LR.Query(MBBStart);
       if (LRQ.valueIn() != VNI) {
         // This block isn't part of the VNI segment. Prune the search.
         I.skipChildren();
@@ -508,14 +618,14 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
 
       // Prune the search if VNI is killed in MBB.
       if (LRQ.endPoint() < MBBEnd) {
-        LI->removeSegment(MBBStart, LRQ.endPoint());
+        LR.removeSegment(MBBStart, LRQ.endPoint());
         if (EndPoints) EndPoints->push_back(LRQ.endPoint());
         I.skipChildren();
         continue;
       }
 
       // VNI is live through MBB.
-      LI->removeSegment(MBBStart, MBBEnd);
+      LR.removeSegment(MBBStart, MBBEnd);
       if (EndPoints) EndPoints->push_back(MBBEnd);
       ++I;
     }
@@ -528,14 +638,17 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
 
 void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
   // Keep track of regunit ranges.
-  SmallVector<std::pair<LiveRange*, LiveRange::iterator>, 8> RU;
+  SmallVector<std::pair<const LiveRange*, LiveRange::const_iterator>, 8> RU;
+  // Keep track of subregister ranges.
+  SmallVector<std::pair<const LiveInterval::SubRange*,
+                        LiveRange::const_iterator>, 4> SRs;
 
   for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
     if (MRI->reg_nodbg_empty(Reg))
       continue;
-    LiveInterval *LI = &getInterval(Reg);
-    if (LI->empty())
+    const LiveInterval &LI = getInterval(Reg);
+    if (LI.empty())
       continue;
 
     // Find the regunit intervals for the assigned register. They may overlap
@@ -543,15 +656,22 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
     RU.clear();
     for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid();
          ++Units) {
-      LiveRange &RURanges = getRegUnit(*Units);
-      if (RURanges.empty())
+      const LiveRange &RURange = getRegUnit(*Units);
+      if (RURange.empty())
         continue;
-      RU.push_back(std::make_pair(&RURanges, RURanges.find(LI->begin()->end)));
+      RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end)));
+    }
+
+    if (MRI->tracksSubRegLiveness()) {
+      SRs.clear();
+      for (const LiveInterval::SubRange &SR : LI.subranges()) {
+        SRs.push_back(std::make_pair(&SR, SR.find(LI.begin()->end)));
+      }
     }
 
     // Every instruction that kills Reg corresponds to a segment range end
     // point.
-    for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE;
+    for (LiveInterval::const_iterator RI = LI.begin(), RE = LI.end(); RI != RE;
          ++RI) {
       // A block index indicates an MBB edge.
       if (RI->end.isBlock())
@@ -568,23 +688,80 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
       //   BAR %EAX<kill>
       //
       // There should be no kill flag on FOO when %vreg5 is rewritten as %EAX.
-      bool CancelKill = false;
-      for (unsigned u = 0, e = RU.size(); u != e; ++u) {
-        LiveRange &RRanges = *RU[u].first;
-        LiveRange::iterator &I = RU[u].second;
-        if (I == RRanges.end())
+      for (auto &RUP : RU) {
+        const LiveRange &RURange = *RUP.first;
+        LiveRange::const_iterator &I = RUP.second;
+        if (I == RURange.end())
           continue;
-        I = RRanges.advanceTo(I, RI->end);
-        if (I == RRanges.end() || I->start >= RI->end)
+        I = RURange.advanceTo(I, RI->end);
+        if (I == RURange.end() || I->start >= RI->end)
           continue;
         // I is overlapping RI.
-        CancelKill = true;
-        break;
+        goto CancelKill;
+      }
+
+      if (MRI->tracksSubRegLiveness()) {
+        // When reading a partial undefined value we must not add a kill flag.
+        // The regalloc might have used the undef lane for something else.
+        // Example:
+        //     %vreg1 = ...              ; R32: %vreg1
+        //     %vreg2:high16 = ...       ; R64: %vreg2
+        //        = read %vreg2<kill>    ; R64: %vreg2
+        //        = read %vreg1          ; R32: %vreg1
+        // The <kill> flag is correct for %vreg2, but the register allocator may
+        // assign R0L to %vreg1, and R0 to %vreg2 because the low 32bits of R0
+        // are actually never written by %vreg2. After assignment the <kill>
+        // flag at the read instruction is invalid.
+        unsigned DefinedLanesMask;
+        if (!SRs.empty()) {
+          // Compute a mask of lanes that are defined.
+          DefinedLanesMask = 0;
+          for (auto &SRP : SRs) {
+            const LiveInterval::SubRange &SR = *SRP.first;
+            LiveRange::const_iterator &I = SRP.second;
+            if (I == SR.end())
+              continue;
+            I = SR.advanceTo(I, RI->end);
+            if (I == SR.end() || I->start >= RI->end)
+              continue;
+            // I is overlapping RI
+            DefinedLanesMask |= SR.LaneMask;
+          }
+        } else
+          DefinedLanesMask = ~0u;
+
+        bool IsFullWrite = false;
+        for (const MachineOperand &MO : MI->operands()) {
+          if (!MO.isReg() || MO.getReg() != Reg)
+            continue;
+          if (MO.isUse()) {
+            // Reading any undefined lanes?
+            unsigned UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
+            if ((UseMask & ~DefinedLanesMask) != 0)
+              goto CancelKill;
+          } else if (MO.getSubReg() == 0) {
+            // Writing to the full register?
+            assert(MO.isDef());
+            IsFullWrite = true;
+          }
+        }
+
+        // If an instruction writes to a subregister, a new segment starts in
+        // the LiveInterval. But as this is only overriding part of the register
+        // adding kill-flags is not correct here after registers have been
+        // assigned.
+        if (!IsFullWrite) {
+          // Next segment has to be adjacent in the subregister write case.
+          LiveRange::const_iterator N = std::next(RI);
+          if (N != LI.end() && N->start == RI->end)
+            goto CancelKill;
+        }
       }
-      if (CancelKill)
-        MI->clearRegisterKills(Reg, nullptr);
-      else
-        MI->addRegisterKilled(Reg, nullptr);
+
+      MI->addRegisterKilled(Reg, nullptr);
+      continue;
+CancelKill:
+      MI->clearRegisterKills(Reg, nullptr);
     }
   }
 }
@@ -615,9 +792,7 @@ LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const {
 
 bool
 LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
-  for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
-       I != E; ++I) {
-    const VNInfo *PHI = *I;
+  for (const VNInfo *PHI : LI.valnos) {
     if (PHI->isUnused() || !PHI->isPHIDef())
       continue;
     const MachineBasicBlock *PHIMBB = getMBBFromIndex(PHI->def);
@@ -767,7 +942,16 @@ public:
         continue;
       if (TargetRegisterInfo::isVirtualRegister(Reg)) {
         LiveInterval &LI = LIS.getInterval(Reg);
-        updateRange(LI, Reg);
+        if (LI.hasSubRanges()) {
+          unsigned SubReg = MO->getSubReg();
+          unsigned LaneMask = TRI.getSubRegIndexLaneMask(SubReg);
+          for (LiveInterval::SubRange &S : LI.subranges()) {
+            if ((S.LaneMask & LaneMask) == 0)
+              continue;
+            updateRange(S, Reg, S.LaneMask);
+          }
+        }
+        updateRange(LI, Reg, 0);
         continue;
       }
 
@@ -775,7 +959,7 @@ public:
       // precomputed live range.
       for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
         if (LiveRange *LR = getRegUnitLI(*Units))
-          updateRange(*LR, *Units);
+          updateRange(*LR, *Units, 0);
     }
     if (hasRegMask)
       updateRegMaskSlots();
@@ -784,21 +968,24 @@ public:
 private:
   /// Update a single live range, assuming an instruction has been moved from
   /// OldIdx to NewIdx.
-  void updateRange(LiveRange &LR, unsigned Reg) {
+  void updateRange(LiveRange &LR, unsigned Reg, unsigned LaneMask) {
     if (!Updated.insert(&LR).second)
       return;
     DEBUG({
       dbgs() << "     ";
-      if (TargetRegisterInfo::isVirtualRegister(Reg))
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
         dbgs() << PrintReg(Reg);
-      else
+        if (LaneMask != 0)
+          dbgs() << format(" L%04X", LaneMask);
+      } else {
         dbgs() << PrintRegUnit(Reg, &TRI);
+      }
       dbgs() << ":\t" << LR << '\n';
     });
     if (SlotIndex::isEarlierInstr(OldIdx, NewIdx))
       handleMoveDown(LR);
     else
-      handleMoveUp(LR, Reg);
+      handleMoveUp(LR, Reg, LaneMask);
     DEBUG(dbgs() << "        -->\t" << LR << '\n');
     LR.verify();
   }
@@ -911,7 +1098,7 @@ private:
   ///    Hoist kill to NewIdx, then scan for last kill between NewIdx and
   ///    OldIdx.
   ///
-  void handleMoveUp(LiveRange &LR, unsigned Reg) {
+  void handleMoveUp(LiveRange &LR, unsigned Reg, unsigned LaneMask) {
     // First look for a kill at OldIdx.
     LiveRange::iterator I = LR.find(OldIdx.getBaseIndex());
     LiveRange::iterator E = LR.end();
@@ -932,7 +1119,7 @@ private:
       if (I == E || !SlotIndex::isSameInstr(I->start, OldIdx)) {
         // No def, search for the new kill.
         // This can never be an early clobber kill since there is no def.
-        std::prev(I)->end = findLastUseBefore(Reg).getRegSlot();
+        std::prev(I)->end = findLastUseBefore(Reg, LaneMask).getRegSlot();
         return;
       }
     }
@@ -988,15 +1175,17 @@ private:
   }
 
   // Return the last use of reg between NewIdx and OldIdx.
-  SlotIndex findLastUseBefore(unsigned Reg) {
+  SlotIndex findLastUseBefore(unsigned Reg, unsigned LaneMask) {
 
     if (TargetRegisterInfo::isVirtualRegister(Reg)) {
       SlotIndex LastUse = NewIdx;
-      for (MachineRegisterInfo::use_instr_nodbg_iterator
-             UI = MRI.use_instr_nodbg_begin(Reg),
-             UE = MRI.use_instr_nodbg_end();
-           UI != UE; ++UI) {
-        const MachineInstr* MI = &*UI;
+      for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+        unsigned SubReg = MO.getSubReg();
+        if (SubReg != 0 && LaneMask != 0
+            && (TRI.getSubRegIndexLaneMask(SubReg) & LaneMask) == 0)
+          continue;
+
+        const MachineInstr *MI = MO.getParent();
         SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
         if (InstSlot > LastUse && InstSlot < OldIdx)
           LastUse = InstSlot;
@@ -1062,6 +1251,94 @@ void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI,
   HME.updateAllRanges(MI);
 }
 
+void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
+                                        const MachineBasicBlock::iterator End,
+                                        const SlotIndex endIdx,
+                                        LiveRange &LR, const unsigned Reg,
+                                        const unsigned LaneMask) {
+  LiveInterval::iterator LII = LR.find(endIdx);
+  SlotIndex lastUseIdx;
+  if (LII != LR.end() && LII->start < endIdx)
+    lastUseIdx = LII->end;
+  else
+    --LII;
+
+  for (MachineBasicBlock::iterator I = End; I != Begin;) {
+    --I;
+    MachineInstr *MI = I;
+    if (MI->isDebugValue())
+      continue;
+
+    SlotIndex instrIdx = getInstructionIndex(MI);
+    bool isStartValid = getInstructionFromIndex(LII->start);
+    bool isEndValid = getInstructionFromIndex(LII->end);
+
+    // FIXME: This doesn't currently handle early-clobber or multiple removed
+    // defs inside of the region to repair.
+    for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+         OE = MI->operands_end(); OI != OE; ++OI) {
+      const MachineOperand &MO = *OI;
+      if (!MO.isReg() || MO.getReg() != Reg)
+        continue;
+
+      unsigned SubReg = MO.getSubReg();
+      unsigned Mask = TRI->getSubRegIndexLaneMask(SubReg);
+      if ((Mask & LaneMask) == 0)
+        continue;
+
+      if (MO.isDef()) {
+        if (!isStartValid) {
+          if (LII->end.isDead()) {
+            SlotIndex prevStart;
+            if (LII != LR.begin())
+              prevStart = std::prev(LII)->start;
+
+            // FIXME: This could be more efficient if there was a
+            // removeSegment method that returned an iterator.
+            LR.removeSegment(*LII, true);
+            if (prevStart.isValid())
+              LII = LR.find(prevStart);
+            else
+              LII = LR.begin();
+          } else {
+            LII->start = instrIdx.getRegSlot();
+            LII->valno->def = instrIdx.getRegSlot();
+            if (MO.getSubReg() && !MO.isUndef())
+              lastUseIdx = instrIdx.getRegSlot();
+            else
+              lastUseIdx = SlotIndex();
+            continue;
+          }
+        }
+
+        if (!lastUseIdx.isValid()) {
+          VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator);
+          LiveRange::Segment S(instrIdx.getRegSlot(),
+                               instrIdx.getDeadSlot(), VNI);
+          LII = LR.addSegment(S);
+        } else if (LII->start != instrIdx.getRegSlot()) {
+          VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator);
+          LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI);
+          LII = LR.addSegment(S);
+        }
+
+        if (MO.getSubReg() && !MO.isUndef())
+          lastUseIdx = instrIdx.getRegSlot();
+        else
+          lastUseIdx = SlotIndex();
+      } else if (MO.isUse()) {
+        // FIXME: This should probably be handled outside of this branch,
+        // either as part of the def case (for defs inside of the region) or
+        // after the loop over the region.
+        if (!isEndValid && !LII->end.isBlock())
+          LII->end = instrIdx.getRegSlot();
+        if (!lastUseIdx.isValid())
+          lastUseIdx = instrIdx.getRegSlot();
+      }
+    }
+  }
+}
+
 void
 LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
                                       MachineBasicBlock::iterator Begin,
@@ -1107,83 +1384,31 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
     if (!LI.hasAtLeastOneValue())
       continue;
 
-    LiveInterval::iterator LII = LI.find(endIdx);
-    SlotIndex lastUseIdx;
-    if (LII != LI.end() && LII->start < endIdx)
-      lastUseIdx = LII->end;
-    else
-      --LII;
-
-    for (MachineBasicBlock::iterator I = End; I != Begin;) {
-      --I;
-      MachineInstr *MI = I;
-      if (MI->isDebugValue())
-        continue;
-
-      SlotIndex instrIdx = getInstructionIndex(MI);
-      bool isStartValid = getInstructionFromIndex(LII->start);
-      bool isEndValid = getInstructionFromIndex(LII->end);
-
-      // FIXME: This doesn't currently handle early-clobber or multiple removed
-      // defs inside of the region to repair.
-      for (MachineInstr::mop_iterator OI = MI->operands_begin(),
-           OE = MI->operands_end(); OI != OE; ++OI) {
-        const MachineOperand &MO = *OI;
-        if (!MO.isReg() || MO.getReg() != Reg)
-          continue;
+    for (LiveInterval::SubRange &S : LI.subranges()) {
+      repairOldRegInRange(Begin, End, endIdx, S, Reg, S.LaneMask);
+    }
+    repairOldRegInRange(Begin, End, endIdx, LI, Reg);
+  }
+}
 
-        if (MO.isDef()) {
-          if (!isStartValid) {
-            if (LII->end.isDead()) {
-              SlotIndex prevStart;
-              if (LII != LI.begin())
-                prevStart = std::prev(LII)->start;
-
-              // FIXME: This could be more efficient if there was a
-              // removeSegment method that returned an iterator.
-              LI.removeSegment(*LII, true);
-              if (prevStart.isValid())
-                LII = LI.find(prevStart);
-              else
-                LII = LI.begin();
-            } else {
-              LII->start = instrIdx.getRegSlot();
-              LII->valno->def = instrIdx.getRegSlot();
-              if (MO.getSubReg() && !MO.isUndef())
-                lastUseIdx = instrIdx.getRegSlot();
-              else
-                lastUseIdx = SlotIndex();
-              continue;
-            }
-          }
+void LiveIntervals::removePhysRegDefAt(unsigned Reg, SlotIndex Pos) {
+  for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+    if (LiveRange *LR = getCachedRegUnit(*Units))
+      if (VNInfo *VNI = LR->getVNInfoAt(Pos))
+        LR->removeValNo(VNI);
+  }
+}
 
-          if (!lastUseIdx.isValid()) {
-            VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
-                                          VNInfoAllocator);
-            LiveRange::Segment S(instrIdx.getRegSlot(),
-                                 instrIdx.getDeadSlot(), VNI);
-            LII = LI.addSegment(S);
-          } else if (LII->start != instrIdx.getRegSlot()) {
-            VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
-                                          VNInfoAllocator);
-            LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI);
-            LII = LI.addSegment(S);
-          }
+void LiveIntervals::removeVRegDefAt(LiveInterval &LI, SlotIndex Pos) {
+  VNInfo *VNI = LI.getVNInfoAt(Pos);
+  if (VNI == nullptr)
+    return;
+  LI.removeValNo(VNI);
 
-          if (MO.getSubReg() && !MO.isUndef())
-            lastUseIdx = instrIdx.getRegSlot();
-          else
-            lastUseIdx = SlotIndex();
-        } else if (MO.isUse()) {
-          // FIXME: This should probably be handled outside of this branch,
-          // either as part of the def case (for defs inside of the region) or
-          // after the loop over the region.
-          if (!isEndValid && !LII->end.isBlock())
-            LII->end = instrIdx.getRegSlot();
-          if (!lastUseIdx.isValid())
-            lastUseIdx = instrIdx.getRegSlot();
-        }
-      }
-    }
+  // Also remove the value in subranges.
+  for (LiveInterval::SubRange &S : LI.subranges()) {
+    if (VNInfo *SVNI = S.getVNInfoAt(Pos))
+      S.removeValNo(SVNI);
   }
+  LI.removeEmptySubRanges();
 }
diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp
index d81221b..025d99c 100644
--- a/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/lib/CodeGen/LiveIntervalUnion.cpp
@@ -26,14 +26,14 @@ using namespace llvm;
 
 
 // Merge a LiveInterval's segments. Guarantee no overlaps.
-void LiveIntervalUnion::unify(LiveInterval &VirtReg) {
-  if (VirtReg.empty())
+void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) {
+  if (Range.empty())
     return;
   ++Tag;
 
   // Insert each of the virtual register's live segments into the map.
-  LiveInterval::iterator RegPos = VirtReg.begin();
-  LiveInterval::iterator RegEnd = VirtReg.end();
+  LiveRange::const_iterator RegPos = Range.begin();
+  LiveRange::const_iterator RegEnd = Range.end();
   SegmentIter SegPos = Segments.find(RegPos->start);
 
   while (SegPos.valid()) {
@@ -53,14 +53,14 @@ void LiveIntervalUnion::unify(LiveInterval &VirtReg) {
 }
 
 // Remove a live virtual register's segments from this union.
-void LiveIntervalUnion::extract(LiveInterval &VirtReg) {
-  if (VirtReg.empty())
+void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) {
+  if (Range.empty())
     return;
   ++Tag;
 
   // Remove each of the virtual register's live segments from the map.
-  LiveInterval::iterator RegPos = VirtReg.begin();
-  LiveInterval::iterator RegEnd = VirtReg.end();
+  LiveRange::const_iterator RegPos = Range.begin();
+  LiveRange::const_iterator RegEnd = Range.end();
   SegmentIter SegPos = Segments.find(RegPos->start);
 
   for (;;) {
@@ -70,7 +70,7 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg) {
       return;
 
     // Skip all segments that may have been coalesced.
-    RegPos = VirtReg.advanceTo(RegPos, SegPos.start());
+    RegPos = Range.advanceTo(RegPos, SegPos.start());
     if (RegPos == RegEnd)
       return;
 
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index a558e14..d804b39 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -19,6 +19,13 @@ using namespace llvm;
 
 #define DEBUG_TYPE "regalloc"
 
+void LiveRangeCalc::resetLiveOutMap() {
+  unsigned NumBlocks = MF->getNumBlockIDs();
+  Seen.clear();
+  Seen.resize(NumBlocks);
+  Map.resize(NumBlocks);
+}
+
 void LiveRangeCalc::reset(const MachineFunction *mf,
                           SlotIndexes *SI,
                           MachineDominatorTree *MDT,
@@ -28,126 +35,207 @@ void LiveRangeCalc::reset(const MachineFunction *mf,
   Indexes = SI;
   DomTree = MDT;
   Alloc = VNIA;
-
-  unsigned N = MF->getNumBlockIDs();
-  Seen.clear();
-  Seen.resize(N);
-  LiveOut.resize(N);
+  resetLiveOutMap();
   LiveIn.clear();
 }
 
 
-void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) {
+static void createDeadDef(SlotIndexes &Indexes, VNInfo::Allocator &Alloc,
+                          LiveRange &LR, const MachineOperand &MO) {
+    const MachineInstr *MI = MO.getParent();
+    SlotIndex DefIdx =
+        Indexes.getInstructionIndex(MI).getRegSlot(MO.isEarlyClobber());
+
+    // Create the def in LR. This may find an existing def.
+    LR.createDeadDef(DefIdx, Alloc);
+}
+
+void LiveRangeCalc::calculate(LiveInterval &LI) {
   assert(MRI && Indexes && "call reset() first");
 
+  // Step 1: Create minimal live segments for every definition of Reg.
   // Visit all def operands. If the same instruction has multiple defs of Reg,
-  // LR.createDeadDef() will deduplicate.
-  for (MachineOperand &MO : MRI->def_operands(Reg)) {
-    const MachineInstr *MI = MO.getParent();
-    // Find the corresponding slot index.
-    SlotIndex Idx;
-    if (MI->isPHI())
-      // PHI defs begin at the basic block start index.
-      Idx = Indexes->getMBBStartIdx(MI->getParent());
-    else
-      // Instructions are either normal 'r', or early clobber 'e'.
-      Idx = Indexes->getInstructionIndex(MI)
-        .getRegSlot(MO.isEarlyClobber());
+  // createDeadDef() will deduplicate.
+  const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+  unsigned Reg = LI.reg;
+  for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+    if (!MO.isDef() && !MO.readsReg())
+      continue;
 
-    // Create the def in LR. This may find an existing def.
-    LR.createDeadDef(Idx, *Alloc);
+    unsigned SubReg = MO.getSubReg();
+    if (LI.hasSubRanges() || (SubReg != 0 && MRI->tracksSubRegLiveness())) {
+      unsigned Mask = SubReg != 0 ? TRI.getSubRegIndexLaneMask(SubReg)
+                                  : MRI->getMaxLaneMaskForVReg(Reg);
+
+      // If this is the first time we see a subregister def, initialize
+      // subranges by creating a copy of the main range.
+      if (!LI.hasSubRanges() && !LI.empty()) {
+        unsigned ClassMask = MRI->getMaxLaneMaskForVReg(Reg);
+        LI.createSubRangeFrom(*Alloc, ClassMask, LI);
+      }
+
+      for (LiveInterval::SubRange &S : LI.subranges()) {
+        // A Mask for subregs common to the existing subrange and current def.
+        unsigned Common = S.LaneMask & Mask;
+        if (Common == 0)
+          continue;
+        // A Mask for subregs covered by the subrange but not the current def.
+        unsigned LRest = S.LaneMask & ~Mask;
+        LiveInterval::SubRange *CommonRange;
+        if (LRest != 0) {
+          // Split current subrange into Common and LRest ranges.
+          S.LaneMask = LRest;
+          CommonRange = LI.createSubRangeFrom(*Alloc, Common, S);
+        } else {
+          assert(Common == S.LaneMask);
+          CommonRange = &S;
+        }
+        if (MO.isDef())
+          createDeadDef(*Indexes, *Alloc, *CommonRange, MO);
+        Mask &= ~Common;
+      }
+      // Create a new SubRange for subregs we did not cover yet.
+      if (Mask != 0) {
+        LiveInterval::SubRange *NewRange = LI.createSubRange(*Alloc, Mask);
+        if (MO.isDef())
+          createDeadDef(*Indexes, *Alloc, *NewRange, MO);
+      }
+    }
+
+    // Create the def in the main liverange. We do not have to do this if
+    // subranges are tracked as we recreate the main range later in this case.
+    if (MO.isDef() && !LI.hasSubRanges())
+      createDeadDef(*Indexes, *Alloc, LI, MO);
+  }
+
+  // We may have created empty live ranges for partially undefined uses, we
+  // can't keep them because we won't find defs in them later.
+  LI.removeEmptySubRanges();
+
+  // Step 2: Extend live segments to all uses, constructing SSA form as
+  // necessary.
+  if (LI.hasSubRanges()) {
+    for (LiveInterval::SubRange &S : LI.subranges()) {
+      resetLiveOutMap();
+      extendToUses(S, Reg, S.LaneMask);
+    }
+    LI.clear();
+    LI.constructMainRangeFromSubranges(*Indexes, *Alloc);
+  } else {
+    resetLiveOutMap();
+    extendToUses(LI, Reg, ~0u);
   }
 }
 
 
-void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg) {
+void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) {
   assert(MRI && Indexes && "call reset() first");
 
+  // Visit all def operands. If the same instruction has multiple defs of Reg,
+  // LR.createDeadDef() will deduplicate.
+  for (MachineOperand &MO : MRI->def_operands(Reg))
+    createDeadDef(*Indexes, *Alloc, LR, MO);
+}
+
+
+void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg, unsigned Mask) {
   // Visit all operands that read Reg. This may include partial defs.
+  const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
   for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
     // Clear all kill flags. They will be reinserted after register allocation
     // by LiveIntervalAnalysis::addKillFlags().
     if (MO.isUse())
       MO.setIsKill(false);
+    else {
+      // We only care about uses, but on the main range (mask ~0u) this includes
+      // the "virtual" reads happening for subregister defs.
+      if (Mask != ~0u)
+        continue;
+    }
+
     if (!MO.readsReg())
       continue;
-    // MI is reading Reg. We may have visited MI before if it happens to be
-    // reading Reg multiple times. That is OK, extend() is idempotent.
+    unsigned SubReg = MO.getSubReg();
+    if (SubReg != 0) {
+      unsigned SubRegMask = TRI.getSubRegIndexLaneMask(SubReg);
+      // Ignore uses not covering the current subrange.
+      if ((SubRegMask & Mask) == 0)
+        continue;
+    }
+
+    // Determine the actual place of the use.
     const MachineInstr *MI = MO.getParent();
     unsigned OpNo = (&MO - &MI->getOperand(0));
-
-    // Find the SlotIndex being read.
-    SlotIndex Idx;
+    SlotIndex UseIdx;
     if (MI->isPHI()) {
       assert(!MO.isDef() && "Cannot handle PHI def of partial register.");
-      // PHI operands are paired: (Reg, PredMBB).
-      // Extend the live range to be live-out from PredMBB.
-      Idx = Indexes->getMBBEndIdx(MI->getOperand(OpNo+1).getMBB());
+      // The actual place where a phi operand is used is the end of the pred
+      // MBB. PHI operands are paired: (Reg, PredMBB).
+      UseIdx = Indexes->getMBBEndIdx(MI->getOperand(OpNo+1).getMBB());
     } else {
-      // This is a normal instruction.
-      Idx = Indexes->getInstructionIndex(MI).getRegSlot();
       // Check for early-clobber redefs.
+      bool isEarlyClobber = false;
       unsigned DefIdx;
-      if (MO.isDef()) {
-        if (MO.isEarlyClobber())
-          Idx = Idx.getRegSlot(true);
-      } else if (MI->isRegTiedToDefOperand(OpNo, &DefIdx)) {
+      if (MO.isDef())
+        isEarlyClobber = MO.isEarlyClobber();
+      else if (MI->isRegTiedToDefOperand(OpNo, &DefIdx)) {
         // FIXME: This would be a lot easier if tied early-clobber uses also
         // had an early-clobber flag.
-        if (MI->getOperand(DefIdx).isEarlyClobber())
-          Idx = Idx.getRegSlot(true);
+        isEarlyClobber = MI->getOperand(DefIdx).isEarlyClobber();
       }
+      UseIdx = Indexes->getInstructionIndex(MI).getRegSlot(isEarlyClobber);
     }
-    extend(LR, Idx, Reg);
+
+    // MI is reading Reg. We may have visited MI before if it happens to be
+    // reading Reg multiple times. That is OK, extend() is idempotent.
+    extend(LR, UseIdx, Reg);
   }
 }
 
 
-// Transfer information from the LiveIn vector to the live ranges.
-void LiveRangeCalc::updateLiveIns() {
+void LiveRangeCalc::updateFromLiveIns() {
   LiveRangeUpdater Updater;
-  for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(),
-         E = LiveIn.end(); I != E; ++I) {
-    if (!I->DomNode)
+  for (const LiveInBlock &I : LiveIn) {
+    if (!I.DomNode)
       continue;
-    MachineBasicBlock *MBB = I->DomNode->getBlock();
-    assert(I->Value && "No live-in value found");
+    MachineBasicBlock *MBB = I.DomNode->getBlock();
+    assert(I.Value && "No live-in value found");
     SlotIndex Start, End;
     std::tie(Start, End) = Indexes->getMBBRange(MBB);
 
-    if (I->Kill.isValid())
+    if (I.Kill.isValid())
       // Value is killed inside this block.
-      End = I->Kill;
+      End = I.Kill;
     else {
       // The value is live-through, update LiveOut as well.
       // Defer the Domtree lookup until it is needed.
       assert(Seen.test(MBB->getNumber()));
-      LiveOut[MBB] = LiveOutPair(I->Value, (MachineDomTreeNode *)nullptr);
+      Map[MBB] = LiveOutPair(I.Value, nullptr);
     }
-    Updater.setDest(&I->LR);
-    Updater.add(Start, End, I->Value);
+    Updater.setDest(&I.LR);
+    Updater.add(Start, End, I.Value);
   }
   LiveIn.clear();
 }
 
 
-void LiveRangeCalc::extend(LiveRange &LR, SlotIndex Kill, unsigned PhysReg) {
-  assert(Kill.isValid() && "Invalid SlotIndex");
+void LiveRangeCalc::extend(LiveRange &LR, SlotIndex Use, unsigned PhysReg) {
+  assert(Use.isValid() && "Invalid SlotIndex");
   assert(Indexes && "Missing SlotIndexes");
   assert(DomTree && "Missing dominator tree");
 
-  MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill.getPrevSlot());
-  assert(KillMBB && "No MBB at Kill");
+  MachineBasicBlock *UseMBB = Indexes->getMBBFromIndex(Use.getPrevSlot());
+  assert(UseMBB && "No MBB at Use");
 
   // Is there a def in the same MBB we can extend?
-  if (LR.extendInBlock(Indexes->getMBBStartIdx(KillMBB), Kill))
+  if (LR.extendInBlock(Indexes->getMBBStartIdx(UseMBB), Use))
     return;
 
-  // Find the single reaching def, or determine if Kill is jointly dominated by
+  // Find the single reaching def, or determine if Use is jointly dominated by
   // multiple values, and we may need to create even more phi-defs to preserve
   // VNInfo SSA form.  Perform a search for all predecessor blocks where we
   // know the dominating VNInfo.
-  if (findReachingDefs(LR, *KillMBB, Kill, PhysReg))
+  if (findReachingDefs(LR, *UseMBB, Use, PhysReg))
     return;
 
   // When there were multiple different values, we may need new PHIs.
@@ -162,16 +250,16 @@ void LiveRangeCalc::calculateValues() {
   assert(Indexes && "Missing SlotIndexes");
   assert(DomTree && "Missing dominator tree");
   updateSSA();
-  updateLiveIns();
+  updateFromLiveIns();
 }
 
 
-bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
-                                     SlotIndex Kill, unsigned PhysReg) {
-  unsigned KillMBBNum = KillMBB.getNumber();
+bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
+                                     SlotIndex Use, unsigned PhysReg) {
+  unsigned UseMBBNum = UseMBB.getNumber();
 
   // Block numbers where LR should be live-in.
-  SmallVector<unsigned, 16> WorkList(1, KillMBBNum);
+  SmallVector<unsigned, 16> WorkList(1, UseMBBNum);
 
   // Remember if we have seen more than one value.
   bool UniqueVNI = true;
@@ -202,7 +290,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
 
        // Is this a known live-out block?
        if (Seen.test(Pred->getNumber())) {
-         if (VNInfo *VNI = LiveOut[Pred].first) {
+         if (VNInfo *VNI = Map[Pred].first) {
            if (TheVNI && TheVNI != VNI)
              UniqueVNI = false;
            TheVNI = VNI;
@@ -225,11 +313,11 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
        }
 
        // No, we need a live-in value for Pred as well
-       if (Pred != &KillMBB)
+       if (Pred != &UseMBB)
           WorkList.push_back(Pred->getNumber());
        else
-          // Loopback to KillMBB, so value is really live through.
-         Kill = SlotIndex();
+          // Loopback to UseMBB, so value is really live through.
+         Use = SlotIndex();
     }
   }
 
@@ -247,12 +335,11 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
          E = WorkList.end(); I != E; ++I) {
        SlotIndex Start, End;
        std::tie(Start, End) = Indexes->getMBBRange(*I);
-       // Trim the live range in KillMBB.
-       if (*I == KillMBBNum && Kill.isValid())
-         End = Kill;
+       // Trim the live range in UseMBB.
+       if (*I == UseMBBNum && Use.isValid())
+         End = Use;
        else
-         LiveOut[MF->getBlockNumbered(*I)] =
-           LiveOutPair(TheVNI, nullptr);
+         Map[MF->getBlockNumbered(*I)] = LiveOutPair(TheVNI, nullptr);
        Updater.add(Start, End, TheVNI);
     }
     return true;
@@ -265,8 +352,8 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
        I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
     MachineBasicBlock *MBB = MF->getBlockNumbered(*I);
     addLiveInBlock(LR, DomTree->getNode(MBB));
-    if (MBB == &KillMBB)
-      LiveIn.back().Kill = Kill;
+    if (MBB == &UseMBB)
+      LiveIn.back().Kill = Use;
   }
 
   return false;
@@ -285,9 +372,8 @@ void LiveRangeCalc::updateSSA() {
     Changes = 0;
     // Propagate live-out values down the dominator tree, inserting phi-defs
     // when necessary.
-    for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(),
-           E = LiveIn.end(); I != E; ++I) {
-      MachineDomTreeNode *Node = I->DomNode;
+    for (LiveInBlock &I : LiveIn) {
+      MachineDomTreeNode *Node = I.DomNode;
       // Skip block if the live-in value has already been determined.
       if (!Node)
         continue;
@@ -303,16 +389,16 @@ void LiveRangeCalc::updateSSA() {
       // immediate dominator. Check if any of them have live-out values that are
       // properly dominated by IDom. If so, we need a phi-def here.
       if (!needPHI) {
-        IDomValue = LiveOut[IDom->getBlock()];
+        IDomValue = Map[IDom->getBlock()];
 
         // Cache the DomTree node that defined the value.
         if (IDomValue.first && !IDomValue.second)
-          LiveOut[IDom->getBlock()].second = IDomValue.second =
+          Map[IDom->getBlock()].second = IDomValue.second =
             DomTree->getNode(Indexes->getMBBFromIndex(IDomValue.first->def));
 
         for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
                PE = MBB->pred_end(); PI != PE; ++PI) {
-          LiveOutPair &Value = LiveOut[*PI];
+          LiveOutPair &Value = Map[*PI];
           if (!Value.first || Value.first == IDomValue.first)
             continue;
 
@@ -334,7 +420,7 @@ void LiveRangeCalc::updateSSA() {
       // The value may be live-through even if Kill is set, as can happen when
       // we are called from extendRange. In that case LiveOutSeen is true, and
       // LiveOut indicates a foreign or missing value.
-      LiveOutPair &LOP = LiveOut[MBB];
+      LiveOutPair &LOP = Map[MBB];
 
       // Create a phi-def if required.
       if (needPHI) {
@@ -342,25 +428,25 @@ void LiveRangeCalc::updateSSA() {
         assert(Alloc && "Need VNInfo allocator to create PHI-defs");
         SlotIndex Start, End;
         std::tie(Start, End) = Indexes->getMBBRange(MBB);
-        LiveRange &LR = I->LR;
+        LiveRange &LR = I.LR;
         VNInfo *VNI = LR.getNextValue(Start, *Alloc);
-        I->Value = VNI;
+        I.Value = VNI;
         // This block is done, we know the final value.
-        I->DomNode = nullptr;
+        I.DomNode = nullptr;
 
-        // Add liveness since updateLiveIns now skips this node.
-        if (I->Kill.isValid())
-          LR.addSegment(LiveInterval::Segment(Start, I->Kill, VNI));
+        // Add liveness since updateFromLiveIns now skips this node.
+        if (I.Kill.isValid())
+          LR.addSegment(LiveInterval::Segment(Start, I.Kill, VNI));
         else {
           LR.addSegment(LiveInterval::Segment(Start, End, VNI));
           LOP = LiveOutPair(VNI, Node);
         }
       } else if (IDomValue.first) {
         // No phi-def here. Remember incoming value.
-        I->Value = IDomValue.first;
+        I.Value = IDomValue.first;
 
         // If the IDomValue is killed in the block, don't propagate through.
-        if (I->Kill.isValid())
+        if (I.Kill.isValid())
           continue;
 
         // Propagate IDomValue if it isn't killed:
diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
index 345d6c4..90bf971 100644
--- a/lib/CodeGen/LiveRangeCalc.h
+++ b/lib/CodeGen/LiveRangeCalc.h
@@ -40,12 +40,6 @@ class LiveRangeCalc {
   MachineDominatorTree *DomTree;
   VNInfo::Allocator *Alloc;
 
-  /// Seen - Bit vector of active entries in LiveOut, also used as a visited
-  /// set by findReachingDefs.  One entry per basic block, indexed by block
-  /// number.  This is kept as a separate bit vector because it can be cleared
-  /// quickly when switching live ranges.
-  BitVector Seen;
-
   /// LiveOutPair - A value and the block that defined it.  The domtree node is
   /// redundant, it can be computed as: MDT[Indexes.getMBBFromIndex(VNI->def)].
   typedef std::pair<VNInfo*, MachineDomTreeNode*> LiveOutPair;
@@ -53,8 +47,14 @@ class LiveRangeCalc {
   /// LiveOutMap - Map basic blocks to the value leaving the block.
   typedef IndexedMap<LiveOutPair, MBB2NumberFunctor> LiveOutMap;
 
-  /// LiveOut - Map each basic block where a live range is live out to the
-  /// live-out value and its defining block.
+  /// Bit vector of active entries in LiveOut, also used as a visited set by
+  /// findReachingDefs.  One entry per basic block, indexed by block number.
+  /// This is kept as a separate bit vector because it can be cleared quickly
+  /// when switching live ranges.
+  BitVector Seen;
+
+  /// Map each basic block where a live range is live out to the live-out value
+  /// and its defining block.
   ///
   /// For every basic block, MBB, one of these conditions shall be true:
   ///
@@ -70,7 +70,7 @@ class LiveRangeCalc {
   ///
   /// The map can be shared by multiple live ranges as long as no two are
   /// live-out of the same block.
-  LiveOutMap LiveOut;
+  LiveOutMap Map;
 
   /// LiveInBlock - Information about a basic block where a live range is known
   /// to be live-in, but the value has not yet been determined.
@@ -101,17 +101,17 @@ class LiveRangeCalc {
   /// used to add entries directly.
   SmallVector<LiveInBlock, 16> LiveIn;
 
-  /// Assuming that LI is live-in to KillMBB and killed at Kill, find the set
-  /// of defs that can reach it.
+  /// Assuming that @p LR is live-in to @p UseMBB, find the set of defs that can
+  /// reach it.
   ///
-  /// If only one def can reach Kill, all paths from the def to kill are added
-  /// to LI, and the function returns true.
+  /// If only one def can reach @p UseMBB, all paths from the def to @p UseMBB
+  /// are added to @p LR, and the function returns true.
   ///
-  /// If multiple values can reach Kill, the blocks that need LI to be live in
-  /// are added to the LiveIn array, and the function returns false.
+  /// If multiple values can reach @p UseMBB, the blocks that need @p LR to be
+  /// live in are added to the LiveIn array, and the function returns false.
   ///
   /// PhysReg, when set, is used to verify live-in lists on basic blocks.
-  bool findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
+  bool findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
                         SlotIndex Kill, unsigned PhysReg);
 
   /// updateSSA - Compute the values that will be live in to all requested
@@ -121,8 +121,18 @@ class LiveRangeCalc {
   /// blocks.  No values are read from the live ranges.
   void updateSSA();
 
-  /// Add liveness as specified in the LiveIn vector.
-  void updateLiveIns();
+  /// Transfer information from the LiveIn vector to the live ranges and update
+  /// the given @p LiveOuts.
+  void updateFromLiveIns();
+
+  /// Extend the live range of @p LR to reach all uses of Reg.
+  ///
+  /// All uses must be jointly dominated by existing liveness.  PHI-defs are
+  /// inserted as needed to preserve SSA form.
+  void extendToUses(LiveRange &LR, unsigned Reg, unsigned LaneMask);
+
+  /// Reset Map and Seen fields.
+  void resetLiveOutMap();
 
 public:
   LiveRangeCalc() : MF(nullptr), MRI(nullptr), Indexes(nullptr),
@@ -152,37 +162,33 @@ public:
   // Modify existing live ranges.
   //
 
-  /// extend - Extend the live range of LI to reach Kill.
+  /// Extend the live range of @p LR to reach @p Use.
   ///
-  /// The existing values in LI must be live so they jointly dominate Kill.  If
-  /// Kill is not dominated by a single existing value, PHI-defs are inserted
-  /// as required to preserve SSA form.  If Kill is known to be dominated by a
-  /// single existing value, Alloc may be null.
+  /// The existing values in @p LR must be live so they jointly dominate @p Use.
+  /// If @p Use is not dominated by a single existing value, PHI-defs are
+  /// inserted as required to preserve SSA form.
   ///
   /// PhysReg, when set, is used to verify live-in lists on basic blocks.
-  void extend(LiveRange &LR, SlotIndex Kill, unsigned PhysReg = 0);
+  void extend(LiveRange &LR, SlotIndex Use, unsigned PhysReg = 0);
 
   /// createDeadDefs - Create a dead def in LI for every def operand of Reg.
   /// Each instruction defining Reg gets a new VNInfo with a corresponding
   /// minimal live range.
   void createDeadDefs(LiveRange &LR, unsigned Reg);
 
-  /// createDeadDefs - Create a dead def in LI for every def of LI->reg.
-  void createDeadDefs(LiveInterval &LI) {
-    createDeadDefs(LI, LI.reg);
-  }
-
-  /// extendToUses - Extend the live range of LI to reach all uses of Reg.
+  /// Extend the live range of @p LR to reach all uses of Reg.
   ///
   /// All uses must be jointly dominated by existing liveness.  PHI-defs are
   /// inserted as needed to preserve SSA form.
-  void extendToUses(LiveRange &LR, unsigned Reg);
-
-  /// extendToUses - Extend the live range of LI to reach all uses of LI->reg.
-  void extendToUses(LiveInterval &LI) {
-    extendToUses(LI, LI.reg);
+  void extendToUses(LiveRange &LR, unsigned PhysReg) {
+    extendToUses(LR, PhysReg, ~0u);
   }
 
+  /// Calculates liveness for the register specified in live interval @p LI.
+  /// Creates subregister live ranges as needed if subreg liveness tracking is
+  /// enabled.
+  void calculate(LiveInterval &LI);
+
   //===--------------------------------------------------------------------===//
   // Low-level interface.
   //===--------------------------------------------------------------------===//
@@ -204,7 +210,7 @@ public:
   /// addLiveInBlock().
   void setLiveOutValue(MachineBasicBlock *MBB, VNInfo *VNI) {
     Seen.set(MBB->getNumber());
-    LiveOut[MBB] = LiveOutPair(VNI, nullptr);
+    Map[MBB] = LiveOutPair(VNI, nullptr);
   }
 
   /// addLiveInBlock - Add a block with an unknown live-in value.  This
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index a0fb712..0edc897 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -60,9 +60,7 @@ bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
 }
 
 void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
-  for (LiveInterval::vni_iterator I = getParent().vni_begin(),
-       E = getParent().vni_end(); I != E; ++I) {
-    VNInfo *VNI = *I;
+  for (VNInfo *VNI : getParent().valnos) {
     if (VNI->isUnused())
       continue;
     MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
@@ -258,15 +256,8 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
       // Check if MI reads any unreserved physregs.
       if (Reg && MOI->readsReg() && !MRI.isReserved(Reg))
         ReadsPhysRegs = true;
-      else if (MOI->isDef()) {
-        for (MCRegUnitIterator Units(Reg, MRI.getTargetRegisterInfo());
-             Units.isValid(); ++Units) {
-          if (LiveRange *LR = LIS.getCachedRegUnit(*Units)) {
-            if (VNInfo *VNI = LR->getVNInfoAt(Idx))
-              LR->removeValNo(VNI);
-          }
-        }
-      }
+      else if (MOI->isDef())
+        LIS.removePhysRegDefAt(Reg, Idx);
       continue;
     }
     LiveInterval &LI = LIS.getInterval(Reg);
@@ -282,13 +273,11 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
 
     // Remove defined value.
     if (MOI->isDef()) {
-      if (VNInfo *VNI = LI.getVNInfoAt(Idx)) {
-        if (TheDelegate)
-          TheDelegate->LRE_WillShrinkVirtReg(LI.reg);
-        LI.removeValNo(VNI);
-        if (LI.empty())
-          RegsToErase.push_back(Reg);
-      }
+      if (TheDelegate && LI.getVNInfoAt(Idx) != nullptr)
+        TheDelegate->LRE_WillShrinkVirtReg(LI.reg);
+      LIS.removeVRegDefAt(LI, Idx);
+      if (LI.empty())
+        RegsToErase.push_back(Reg);
     }
   }
 
@@ -410,7 +399,7 @@ LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
   VirtRegAuxInfo VRAI(MF, LIS, Loops, MBFI);
   for (unsigned I = 0, Size = size(); I < Size; ++I) {
     LiveInterval &LI = LIS.getInterval(get(I));
-    if (MRI.recomputeRegClass(LI.reg, MF.getTarget()))
+    if (MRI.recomputeRegClass(LI.reg))
       DEBUG({
         const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
         dbgs() << "Inflated " << PrintReg(LI.reg) << " to "
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index a8cae08..154ce6f 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
@@ -71,16 +72,44 @@ void LiveRegMatrix::releaseMemory() {
   }
 }
 
+template<typename Callable>
+bool foreachUnit(const TargetRegisterInfo *TRI, LiveInterval &VRegInterval,
+                 unsigned PhysReg, Callable Func) {
+  if (VRegInterval.hasSubRanges()) {
+    for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+      unsigned Unit = (*Units).first;
+      unsigned Mask = (*Units).second;
+      for (LiveInterval::SubRange &S : VRegInterval.subranges()) {
+        if (S.LaneMask & Mask) {
+          if (Func(Unit, S))
+            return true;
+          break;
+        }
+      }
+    }
+  } else {
+    for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+      if (Func(*Units, VRegInterval))
+        return true;
+    }
+  }
+  return false;
+}
+
 void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) {
   DEBUG(dbgs() << "assigning " << PrintReg(VirtReg.reg, TRI)
                << " to " << PrintReg(PhysReg, TRI) << ':');
   assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment");
   VRM->assignVirt2Phys(VirtReg.reg, PhysReg);
   MRI->setPhysRegUsed(PhysReg);
-  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
-    DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI));
-    Matrix[*Units].unify(VirtReg);
-  }
+
+  foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
+                                         const LiveRange &Range) {
+    DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << ' ' << Range);
+    Matrix[Unit].unify(VirtReg, Range);
+    return false;
+  });
+
   ++NumAssigned;
   DEBUG(dbgs() << '\n');
 }
@@ -90,10 +119,14 @@ void LiveRegMatrix::unassign(LiveInterval &VirtReg) {
   DEBUG(dbgs() << "unassigning " << PrintReg(VirtReg.reg, TRI)
                << " from " << PrintReg(PhysReg, TRI) << ':');
   VRM->clearVirt(VirtReg.reg);
-  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
-    DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI));
-    Matrix[*Units].extract(VirtReg);
-  }
+
+  foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
+                                         const LiveRange &Range) {
+    DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI));
+    Matrix[Unit].extract(VirtReg, Range);
+    return false;
+  });
+
   ++NumUnassigned;
   DEBUG(dbgs() << '\n');
 }
@@ -121,12 +154,13 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
   if (VirtReg.empty())
     return false;
   CoalescerPair CP(VirtReg.reg, PhysReg, *TRI);
-  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
-    const LiveRange &UnitRange = LIS->getRegUnit(*Units);
-    if (VirtReg.overlaps(UnitRange, CP, *LIS->getSlotIndexes()))
-      return true;
-  }
-  return false;
+
+  bool Result = foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
+                                                       const LiveRange &Range) {
+    const LiveRange &UnitRange = LIS->getRegUnit(Unit);
+    return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes());
+  });
+  return Result;
 }
 
 LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg,
diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp
index 5c5712f..e8bf687 100644
--- a/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -291,6 +291,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
       // Debug value, stackmap and patchpoint instructions can't be out of
       // range, so they don't need any updates.
       if (MI->isDebugValue() ||
+          MI->getOpcode() == TargetOpcode::STATEPOINT ||
           MI->getOpcode() == TargetOpcode::STACKMAP ||
           MI->getOpcode() == TargetOpcode::PATCHPOINT)
         continue;
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 3058b1a..3c73905 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -24,7 +24,6 @@
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/LeakDetector.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/Support/Debug.h"
@@ -45,7 +44,6 @@ MachineBasicBlock::MachineBasicBlock(MachineFunction &mf, const BasicBlock *bb)
 }
 
 MachineBasicBlock::~MachineBasicBlock() {
-  LeakDetector::removeGarbageObject(this);
 }
 
 /// getSymbol - Return the MCSymbol for this basic block.
@@ -54,9 +52,7 @@ MCSymbol *MachineBasicBlock::getSymbol() const {
   if (!CachedMCSymbol) {
     const MachineFunction *MF = getParent();
     MCContext &Ctx = MF->getContext();
-    const TargetMachine &TM = MF->getTarget();
-    const char *Prefix =
-        TM.getSubtargetImpl()->getDataLayout()->getPrivateGlobalPrefix();
+    const char *Prefix = Ctx.getAsmInfo()->getPrivateLabelPrefix();
     CachedMCSymbol = Ctx.GetOrCreateSymbol(Twine(Prefix) + "BB" +
                                            Twine(MF->getFunctionNumber()) +
                                            "_" + Twine(getNumber()));
@@ -87,14 +83,11 @@ void ilist_traits<MachineBasicBlock>::addNodeToList(MachineBasicBlock *N) {
   for (MachineBasicBlock::instr_iterator
          I = N->instr_begin(), E = N->instr_end(); I != E; ++I)
     I->AddRegOperandsToUseLists(RegInfo);
-
-  LeakDetector::removeGarbageObject(N);
 }
 
 void ilist_traits<MachineBasicBlock>::removeNodeFromList(MachineBasicBlock *N) {
   N->getParent()->removeFromMBBNumbering(N->Number);
   N->Number = -1;
-  LeakDetector::addGarbageObject(N);
 }
 
 
@@ -109,8 +102,6 @@ void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) {
   // use/def lists.
   MachineFunction *MF = Parent->getParent();
   N->AddRegOperandsToUseLists(MF->getRegInfo());
-
-  LeakDetector::removeGarbageObject(N);
 }
 
 /// removeNodeFromList (MI) - When we remove an instruction from a basic block
@@ -124,8 +115,6 @@ void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) {
     N->RemoveRegOperandsFromUseLists(MF->getRegInfo());
 
   N->setParent(nullptr);
-
-  LeakDetector::addGarbageObject(N);
 }
 
 /// transferNodesFromList (MI) - When moving a range of instructions from one
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 08fd200..1b5c1f1 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -347,65 +347,61 @@ MachineBasicBlock *MachineBlockPlacement::selectBestSuccessor(
   uint32_t WeightScale = 0;
   uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale);
   DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
-  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
-                                        SE = BB->succ_end();
-       SI != SE; ++SI) {
-    if (BlockFilter && !BlockFilter->count(*SI))
+  for (MachineBasicBlock *Succ : BB->successors()) {
+    if (BlockFilter && !BlockFilter->count(Succ))
       continue;
-    BlockChain &SuccChain = *BlockToChain[*SI];
+    BlockChain &SuccChain = *BlockToChain[Succ];
     if (&SuccChain == &Chain) {
-      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> Already merged!\n");
+      DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> Already merged!\n");
       continue;
     }
-    if (*SI != *SuccChain.begin()) {
-      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> Mid chain!\n");
+    if (Succ != *SuccChain.begin()) {
+      DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> Mid chain!\n");
       continue;
     }
 
-    uint32_t SuccWeight = MBPI->getEdgeWeight(BB, *SI);
+    uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ);
     BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
 
     // Only consider successors which are either "hot", or wouldn't violate
     // any CFG constraints.
     if (SuccChain.LoopPredecessors != 0) {
       if (SuccProb < HotProb) {
-        DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> " << SuccProb
+        DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> " << SuccProb
                      << " (prob) (CFG conflict)\n");
         continue;
       }
 
-      // Make sure that a hot successor doesn't have a globally more important
-      // predecessor.
-      BlockFrequency CandidateEdgeFreq
-        = MBFI->getBlockFreq(BB) * SuccProb * HotProb.getCompl();
+      // Make sure that a hot successor doesn't have a globally more
+      // important predecessor.
+      BlockFrequency CandidateEdgeFreq =
+          MBFI->getBlockFreq(BB) * SuccProb * HotProb.getCompl();
       bool BadCFGConflict = false;
-      for (MachineBasicBlock::pred_iterator PI = (*SI)->pred_begin(),
-                                            PE = (*SI)->pred_end();
-           PI != PE; ++PI) {
-        if (*PI == *SI || (BlockFilter && !BlockFilter->count(*PI)) ||
-            BlockToChain[*PI] == &Chain)
+      for (MachineBasicBlock *Pred : Succ->predecessors()) {
+        if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) ||
+            BlockToChain[Pred] == &Chain)
           continue;
-        BlockFrequency PredEdgeFreq
-          = MBFI->getBlockFreq(*PI) * MBPI->getEdgeProbability(*PI, *SI);
+        BlockFrequency PredEdgeFreq =
+            MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
         if (PredEdgeFreq >= CandidateEdgeFreq) {
           BadCFGConflict = true;
           break;
         }
       }
       if (BadCFGConflict) {
-        DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> " << SuccProb
+        DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> " << SuccProb
                      << " (prob) (non-cold CFG conflict)\n");
         continue;
       }
     }
 
-    DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> " << SuccProb
+    DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> " << SuccProb
                  << " (prob)"
                  << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "")
                  << "\n");
     if (BestSucc && BestWeight >= SuccWeight)
       continue;
-    BestSucc = *SI;
+    BestSucc = Succ;
     BestWeight = SuccWeight;
   }
   return BestSucc;
@@ -1043,12 +1039,8 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
   // exclusively on the loop info here so that we can align backedges in
   // unnatural CFGs and backedges that were introduced purely because of the
   // loop rotations done during this layout pass.
-  if (F.getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize))
+  if (F.getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
     return;
-  unsigned Align = TLI->getPrefLoopAlignment();
-  if (!Align)
-    return;  // Don't care about loop alignment.
   if (FunctionChain.begin() == FunctionChain.end())
     return;  // Empty chain.
 
@@ -1066,6 +1058,10 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
     if (!L)
       continue;
 
+    unsigned Align = TLI->getPrefLoopAlignment(L);
+    if (!Align)
+      continue;  // Don't care about loop alignment.
+
     // If the block is cold relative to the function entry don't waste space
     // aligning it.
     BlockFrequency Freq = MBFI->getBlockFreq(*BI);
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index ae26967..21b9c5a 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -451,6 +451,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
 
   SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
   SmallVector<unsigned, 2> ImplicitDefsToUpdate;
+  SmallVector<unsigned, 2> ImplicitDefs;
   for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) {
     MachineInstr *MI = &*I;
     ++I;
@@ -542,6 +543,12 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
       // we should make sure it is not dead at CSMI.
       if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead())
         ImplicitDefsToUpdate.push_back(i);
+
+      // Keep track of implicit defs of CSMI and MI, to clear possibly
+      // made-redundant kill flags.
+      if (MO.isImplicit() && !MO.isDead() && OldReg == NewReg)
+        ImplicitDefs.push_back(OldReg);
+
       if (OldReg == NewReg) {
         --NumDefs;
         continue;
@@ -573,8 +580,15 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
     // Actually perform the elimination.
     if (DoCSE) {
       for (unsigned i = 0, e = CSEPairs.size(); i != e; ++i) {
-        MRI->replaceRegWith(CSEPairs[i].first, CSEPairs[i].second);
-        MRI->clearKillFlags(CSEPairs[i].second);
+        unsigned OldReg = CSEPairs[i].first;
+        unsigned NewReg = CSEPairs[i].second;
+        // OldReg may have been unused but is used now, clear the Dead flag
+        MachineInstr *Def = MRI->getUniqueVRegDef(NewReg);
+        assert(Def != nullptr && "CSEd register has no unique definition?");
+        Def->clearRegisterDeads(NewReg);
+        // Replace with NewReg and clear kill flags which may be wrong now.
+        MRI->replaceRegWith(OldReg, NewReg);
+        MRI->clearKillFlags(NewReg);
       }
 
       // Go through implicit defs of CSMI and MI, if a def is not dead at MI,
@@ -582,6 +596,29 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
       for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i)
         CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false);
 
+      // Go through implicit defs of CSMI and MI, and clear the kill flags on
+      // their uses in all the instructions between CSMI and MI.
+      // We might have made some of the kill flags redundant, consider:
+      //   subs  ... %NZCV<imp-def>        <- CSMI
+      //   csinc ... %NZCV<imp-use,kill>   <- this kill flag isn't valid anymore
+      //   subs  ... %NZCV<imp-def>        <- MI, to be eliminated
+      //   csinc ... %NZCV<imp-use,kill>
+      // Since we eliminated MI, and reused a register imp-def'd by CSMI
+      // (here %NZCV), that register, if it was killed before MI, should have
+      // that kill flag removed, because it's lifetime was extended.
+      if (CSMI->getParent() == MI->getParent()) {
+        for (MachineBasicBlock::iterator II = CSMI, IE = MI; II != IE; ++II)
+          for (auto ImplicitDef : ImplicitDefs)
+            if (MachineOperand *MO = II->findRegisterUseOperand(
+                    ImplicitDef, /*isKill=*/true, TRI))
+              MO->setIsKill(false);
+      } else {
+        // If the instructions aren't in the same BB, bail out and clear the
+        // kill flag on all uses of the imp-def'd register.
+        for (auto ImplicitDef : ImplicitDefs)
+          MRI->clearKillFlags(ImplicitDef);
+      }
+
       if (CrossMBBPhysDef) {
         // Add physical register defs now coming in from a predecessor to MBB
         // livein list.
@@ -606,6 +643,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
     }
     CSEPairs.clear();
     ImplicitDefsToUpdate.clear();
+    ImplicitDefs.clear();
   }
 
   return Changed;
diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp
index 2931258..41045ac 100644
--- a/lib/CodeGen/MachineCombiner.cpp
+++ b/lib/CodeGen/MachineCombiner.cpp
@@ -45,7 +45,7 @@ class MachineCombiner : public MachineFunctionPass {
 
   TargetSchedModel TSchedModel;
 
-  /// OptSize - True if optimizing for code size.
+  /// True if optimizing for code size.
   bool OptSize;
 
 public:
@@ -109,7 +109,7 @@ MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
   return DefInstr;
 }
 
-/// getDepth - Computes depth of instructions in vector \InsInstr.
+/// Computes depth of instructions in vector \InsInstr.
 ///
 /// \param InsInstrs is a vector of machine instructions
 /// \param InstrIdxForVirtReg is a dense map of virtual register to index
@@ -125,7 +125,7 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
   SmallVector<unsigned, 16> InstrDepth;
   assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
 
-  // Foreach instruction in in the new sequence compute the depth based on the
+  // For each instruction in the new sequence compute the depth based on the
   // operands. Use the trace information when possible. For new operands which
   // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
   for (auto *InstrPtr : InsInstrs) { // for each Use
@@ -169,8 +169,7 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
   return InstrDepth[NewRootIdx];
 }
 
-/// getLatency - Computes instruction latency as max of latency of defined
-/// operands
+/// Computes instruction latency as max of latency of defined operands.
 ///
 /// \param Root is a machine instruction that could be replaced by NewRoot.
 /// It is used to compute a more accurate latency information for NewRoot in
@@ -211,12 +210,12 @@ unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
   return NewRootLatency;
 }
 
-/// preservesCriticalPathlen - True when the new instruction sequence does not
+/// True when the new instruction sequence does not
 /// lengthen the critical path. The DAGCombine code sequence ends in MI
 /// (Machine Instruction) Root. The new code sequence ends in MI NewRoot. A
 /// necessary condition for the new sequence to replace the old sequence is that
-/// is cannot lengthen the critical path. This is decided by the formula
-/// (NewRootDepth + NewRootLatency) <=  (RootDepth + RootLatency + RootSlack)).
+/// it cannot lengthen the critical path. This is decided by the formula
+/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)).
 /// The slack is the number of cycles Root can be delayed before the critical
 /// patch becomes longer.
 bool MachineCombiner::preservesCriticalPathLen(
@@ -264,8 +263,7 @@ void MachineCombiner::instr2instrSC(
     InstrsSC.push_back(SC);
   }
 }
-/// preservesResourceLen - True when the new instructions do not increase
-/// resource length
+/// True when the new instructions do not increase resource length
 bool MachineCombiner::preservesResourceLen(
     MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
     SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -300,7 +298,7 @@ bool MachineCombiner::preservesResourceLen(
 }
 
 /// \returns true when new instruction sequence should be generated
-/// independent if it lenghtens critical path or not
+/// independent if it lengthens critical path or not
 bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
   if (OptSize && (NewSize < OldSize))
     return true;
@@ -309,7 +307,7 @@ bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
   return false;
 }
 
-/// combineInstructions - substitute a slow code sequence with a faster one by
+/// Substitute a slow code sequence with a faster one by
 /// evaluating instruction combining pattern.
 /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
 /// combining based on machine trace metrics. Only combine a sequence of
@@ -406,8 +404,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
 }
 
 bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
-  const TargetSubtargetInfo &STI =
-      MF.getTarget().getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
   TII = STI.getInstrInfo();
   TRI = STI.getRegisterInfo();
   SchedModel = STI.getSchedModel();
@@ -416,8 +413,7 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
   Traces = &getAnalysis<MachineTraceMetrics>();
   MinInstr = 0;
 
-  OptSize = MF.getFunction()->getAttributes().hasAttribute(
-      AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
 
   DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
   if (!TII->useMachineCombiner()) {
diff --git a/lib/CodeGen/MachineDominanceFrontier.cpp b/lib/CodeGen/MachineDominanceFrontier.cpp
index 0bee846..acb7c48 100644
--- a/lib/CodeGen/MachineDominanceFrontier.cpp
+++ b/lib/CodeGen/MachineDominanceFrontier.cpp
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/MachineDominanceFrontier.h"
-#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/Analysis/DominanceFrontierImpl.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/Passes.h"
 
 
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 8a2b610..151a260 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -67,17 +67,14 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
                        STI->getFrameLowering()->isStackRealignable(),
                        !F->hasFnAttribute("no-realign-stack"));
 
-  if (Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::StackAlignment))
-    FrameInfo->ensureMaxAlignment(Fn->getAttributes().
-                                getStackAlignment(AttributeSet::FunctionIndex));
+  if (Fn->hasFnAttribute(Attribute::StackAlignment))
+    FrameInfo->ensureMaxAlignment(Fn->getFnStackAlignment());
 
   ConstantPool = new (Allocator) MachineConstantPool(TM);
   Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
 
   // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
-  if (!Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                        Attribute::OptimizeForSize))
+  if (!Fn->hasFnAttribute(Attribute::OptimizeForSize))
     Alignment = std::max(Alignment,
                          STI->getTargetLowering()->getPrefFunctionAlignment());
 
@@ -462,7 +459,7 @@ unsigned MachineFunction::addLiveIn(unsigned PReg,
 /// normal 'L' label is returned.
 MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
                                         bool isLinkerPrivate) const {
-  const DataLayout *DL = getSubtarget().getDataLayout();
+  const DataLayout *DL = getTarget().getDataLayout();
   assert(JumpTableInfo && "No jump tables");
   assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!");
 
@@ -477,7 +474,7 @@ MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
 /// getPICBaseSymbol - Return a function-local symbol to represent the PIC
 /// base.
 MCSymbol *MachineFunction::getPICBaseSymbol() const {
-  const DataLayout *DL = getSubtarget().getDataLayout();
+  const DataLayout *DL = getTarget().getDataLayout();
   return Ctx.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+
                                Twine(getFunctionNumber())+"$pb");
 }
@@ -587,13 +584,20 @@ int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size,
   return -++NumFixedObjects;
 }
 
+int MachineFrameInfo::CreateFrameAllocation(uint64_t Size) {
+  // Force the use of a frame pointer. The intention is that this intrinsic be
+  // used in conjunction with unwind mechanisms that leak the frame pointer.
+  setFrameAddressIsTaken(true);
+  Size = RoundUpToAlignment(Size, StackAlignment);
+  return CreateStackObject(Size, StackAlignment, false);
+}
+
 BitVector
 MachineFrameInfo::getPristineRegs(const MachineBasicBlock *MBB) const {
   assert(MBB && "MBB must be valid");
   const MachineFunction *MF = MBB->getParent();
   assert(MF && "MBB must be part of a MachineFunction");
-  const TargetMachine &TM = MF->getTarget();
-  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
   BitVector BV(TRI->getNumRegs());
 
   // Before CSI is calculated, no registers are considered pristine. They can be
@@ -813,7 +817,7 @@ void MachineJumpTableInfo::dump() const { print(dbgs()); }
 void MachineConstantPoolValue::anchor() { }
 
 const DataLayout *MachineConstantPool::getDataLayout() const {
-  return TM.getSubtargetImpl()->getDataLayout();
+  return TM.getDataLayout();
 }
 
 Type *MachineConstantPoolEntry::getType() const {
@@ -835,13 +839,13 @@ MachineConstantPoolEntry::getSectionKind(const DataLayout *DL) const {
   switch (getRelocationInfo()) {
   default:
     llvm_unreachable("Unknown section kind");
-  case 2:
+  case Constant::GlobalRelocations:
     Kind = SectionKind::getReadOnlyWithRel();
     break;
-  case 1:
+  case Constant::LocalRelocation:
     Kind = SectionKind::getReadOnlyWithRelLocal();
     break;
-  case 0:
+  case Constant::NoRelocation:
     switch (DL->getTypeAllocSize(getType())) {
     case 4:
       Kind = SectionKind::getMergeableConst4();
@@ -853,7 +857,7 @@ MachineConstantPoolEntry::getSectionKind(const DataLayout *DL) const {
       Kind = SectionKind::getMergeableConst16();
       break;
     default:
-      Kind = SectionKind::getMergeableConst();
+      Kind = SectionKind::getReadOnly();
       break;
     }
   }
diff --git a/lib/CodeGen/MachineFunctionPass.cpp b/lib/CodeGen/MachineFunctionPass.cpp
index 789f204..aaf06a7 100644
--- a/lib/CodeGen/MachineFunctionPass.cpp
+++ b/lib/CodeGen/MachineFunctionPass.cpp
@@ -11,11 +11,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DominanceFrontier.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
 using namespace llvm;
 
 Pass *MachineFunctionPass::createPrinterPass(raw_ostream &O,
@@ -43,15 +50,13 @@ void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const {
   // because CodeGen overloads that to mean preserving the MachineBasicBlock
   // CFG in addition to the LLVM IR CFG.
   AU.addPreserved<AliasAnalysis>();
-  AU.addPreserved("scalar-evolution");
-  AU.addPreserved("iv-users");
-  AU.addPreserved("memdep");
-  AU.addPreserved("live-values");
-  AU.addPreserved("domtree");
-  AU.addPreserved("domfrontier");
-  AU.addPreserved("loops");
-  AU.addPreserved("lda");
-  AU.addPreserved("stack-protector");
+  AU.addPreserved<DominanceFrontier>();
+  AU.addPreserved<DominatorTreeWrapperPass>();
+  AU.addPreserved<IVUsers>();
+  AU.addPreserved<LoopInfoWrapperPass>();
+  AU.addPreserved<MemoryDependenceAnalysis>();
+  AU.addPreserved<ScalarEvolution>();
+  AU.addPreserved<StackProtector>();
 
   FunctionPass::getAnalysisUsage(AU);
 }
diff --git a/lib/CodeGen/MachineFunctionPrinterPass.cpp b/lib/CodeGen/MachineFunctionPrinterPass.cpp
index dee3977..790f5ac 100644
--- a/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -52,7 +52,7 @@ char MachineFunctionPrinterPass::ID = 0;
 }
 
 char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
-INITIALIZE_PASS(MachineFunctionPrinterPass, "print-machineinstrs",
+INITIALIZE_PASS(MachineFunctionPrinterPass, "machineinstr-printer",
                 "Machine Function Printer", false, false)
 
 namespace llvm {
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 7ad0d94..981e4a3 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -397,7 +397,7 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
     break;
   case MachineOperand::MO_Metadata:
     OS << '<';
-    getMetadata()->printAsOperand(OS, /*PrintType=*/false);
+    getMetadata()->printAsOperand(OS);
     OS << '>';
     break;
   case MachineOperand::MO_MCSymbol:
@@ -537,7 +537,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) {
   if (const MDNode *TBAAInfo = MMO.getAAInfo().TBAA) {
     OS << "(tbaa=";
     if (TBAAInfo->getNumOperands() > 0)
-      TBAAInfo->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
+      TBAAInfo->getOperand(0)->printAsOperand(OS);
     else
       OS << "<unknown>";
     OS << ")";
@@ -548,7 +548,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) {
     OS << "(alias.scope=";
     if (ScopeInfo->getNumOperands() > 0)
       for (unsigned i = 0, ie = ScopeInfo->getNumOperands(); i != ie; ++i) {
-        ScopeInfo->getOperand(i)->printAsOperand(OS, /*PrintType=*/false);
+        ScopeInfo->getOperand(i)->printAsOperand(OS);
         if (i != ie-1)
           OS << ",";
       }
@@ -562,7 +562,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) {
     OS << "(noalias=";
     if (NoAliasInfo->getNumOperands() > 0)
       for (unsigned i = 0, ie = NoAliasInfo->getNumOperands(); i != ie; ++i) {
-        NoAliasInfo->getOperand(i)->printAsOperand(OS, /*PrintType=*/false);
+        NoAliasInfo->getOperand(i)->printAsOperand(OS);
         if (i != ie-1)
           OS << ",";
       }
@@ -595,10 +595,12 @@ void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) {
 /// implicit operands. It reserves space for the number of operands specified by
 /// the MCInstrDesc.
 MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid,
-                           const DebugLoc dl, bool NoImp)
-  : MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0),
-    Flags(0), AsmPrinterFlags(0),
-    NumMemRefs(0), MemRefs(nullptr), debugLoc(dl) {
+                           DebugLoc dl, bool NoImp)
+    : MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0), Flags(0),
+      AsmPrinterFlags(0), NumMemRefs(0), MemRefs(nullptr),
+      debugLoc(std::move(dl)) {
+  assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
+
   // Reserve space for the expected number of operands.
   if (unsigned NumOps = MCID->getNumOperands() +
     MCID->getNumImplicitDefs() + MCID->getNumImplicitUses()) {
@@ -617,12 +619,14 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
     Flags(0), AsmPrinterFlags(0),
     NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs),
     debugLoc(MI.getDebugLoc()) {
+  assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
+
   CapOperands = OperandCapacity::get(MI.getNumOperands());
   Operands = MF.allocateOperandArray(CapOperands);
 
   // Copy operands.
-  for (unsigned i = 0; i != MI.getNumOperands(); ++i)
-    addOperand(MF, MI.getOperand(i));
+  for (const MachineOperand &MO : MI.operands())
+    addOperand(MF, MO);
 
   // Copy all the sensible flags.
   setFlags(MI.Flags);
@@ -641,18 +645,18 @@ MachineRegisterInfo *MachineInstr::getRegInfo() {
 /// this instruction from their respective use lists.  This requires that the
 /// operands already be on their use lists.
 void MachineInstr::RemoveRegOperandsFromUseLists(MachineRegisterInfo &MRI) {
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
-    if (Operands[i].isReg())
-      MRI.removeRegOperandFromUseList(&Operands[i]);
+  for (MachineOperand &MO : operands())
+    if (MO.isReg())
+      MRI.removeRegOperandFromUseList(&MO);
 }
 
 /// AddRegOperandsToUseLists - Add all of the register operands in
 /// this instruction from their respective use lists.  This requires that the
 /// operands not be on their use lists yet.
 void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &MRI) {
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
-    if (Operands[i].isReg())
-      MRI.addRegOperandToUseList(&Operands[i]);
+  for (MachineOperand &MO : operands())
+    if (MO.isReg())
+      MRI.addRegOperandToUseList(&MO);
 }
 
 void MachineInstr::addOperand(const MachineOperand &Op) {
@@ -670,14 +674,8 @@ static void moveOperands(MachineOperand *Dst, MachineOperand *Src,
   if (MRI)
     return MRI->moveOperands(Dst, Src, NumOps);
 
-  // Here it would be convenient to call memmove, so that isn't allowed because
-  // MachineOperand has a constructor and so isn't a POD type.
-  if (Dst < Src)
-    for (unsigned i = 0; i != NumOps; ++i)
-      new (Dst + i) MachineOperand(Src[i]);
-  else
-    for (unsigned i = NumOps; i ; --i)
-      new (Dst + i - 1) MachineOperand(Src[i - 1]);
+  // MachineOperand is a trivially copyable type so we can just use memmove.
+  std::memmove(Dst, Src, NumOps * sizeof(MachineOperand));
 }
 
 /// addOperand - Add the specified operand to the instruction.  If it is an
@@ -922,8 +920,7 @@ void MachineInstr::eraseFromParentAndMarkDBGValuesForRemoval() {
   MachineInstr *MI = (MachineInstr *)this;
   MachineRegisterInfo &MRI = MF->getRegInfo();
 
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
+  for (const MachineOperand &MO : MI->operands()) {
     if (!MO.isReg() || !MO.isDef())
       continue;
     unsigned Reg = MO.getReg();
@@ -1326,8 +1323,7 @@ unsigned MachineInstr::findTiedOperandIdx(unsigned OpIdx) const {
 /// clearKillInfo - Clears kill flags on all operands.
 ///
 void MachineInstr::clearKillInfo() {
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = getOperand(i);
+  for (MachineOperand &MO : operands()) {
     if (MO.isReg() && MO.isUse())
       MO.setIsKill(false);
   }
@@ -1340,15 +1336,13 @@ void MachineInstr::substituteRegister(unsigned FromReg,
   if (TargetRegisterInfo::isPhysicalRegister(ToReg)) {
     if (SubIdx)
       ToReg = RegInfo.getSubReg(ToReg, SubIdx);
-    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-      MachineOperand &MO = getOperand(i);
+    for (MachineOperand &MO : operands()) {
       if (!MO.isReg() || MO.getReg() != FromReg)
         continue;
       MO.substPhysReg(ToReg, RegInfo);
     }
   } else {
-    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-      MachineOperand &MO = getOperand(i);
+    for (MachineOperand &MO : operands()) {
       if (!MO.isReg() || MO.getReg() != FromReg)
         continue;
       MO.substVirtReg(ToReg, SubIdx, RegInfo);
@@ -1491,8 +1485,7 @@ bool MachineInstr::hasUnmodeledSideEffects() const {
 /// allDefsAreDead - Return true if all the defs of this instruction are dead.
 ///
 bool MachineInstr::allDefsAreDead() const {
-  for (unsigned i = 0, e = getNumOperands(); i < e; ++i) {
-    const MachineOperand &MO = getOperand(i);
+  for (const MachineOperand &MO : operands()) {
     if (!MO.isReg() || MO.isUse())
       continue;
     if (!MO.isDead())
@@ -1823,8 +1816,7 @@ void MachineInstr::clearRegisterKills(unsigned Reg,
                                       const TargetRegisterInfo *RegInfo) {
   if (!TargetRegisterInfo::isPhysicalRegister(Reg))
     RegInfo = nullptr;
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = getOperand(i);
+  for (MachineOperand &MO : operands()) {
     if (!MO.isReg() || !MO.isUse() || !MO.isKill())
       continue;
     unsigned OpReg = MO.getReg();
@@ -1885,6 +1877,22 @@ bool MachineInstr::addRegisterDead(unsigned Reg,
   return true;
 }
 
+void MachineInstr::clearRegisterDeads(unsigned Reg) {
+  for (MachineOperand &MO : operands()) {
+    if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
+      continue;
+    MO.setIsDead(false);
+  }
+}
+
+void MachineInstr::addRegisterDefReadUndef(unsigned Reg) {
+  for (MachineOperand &MO : operands()) {
+    if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg || MO.getSubReg() == 0)
+      continue;
+    MO.setIsUndef();
+  }
+}
+
 void MachineInstr::addRegisterDefined(unsigned Reg,
                                       const TargetRegisterInfo *RegInfo) {
   if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
@@ -1892,8 +1900,7 @@ void MachineInstr::addRegisterDefined(unsigned Reg,
     if (MO)
       return;
   } else {
-    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-      const MachineOperand &MO = getOperand(i);
+    for (const MachineOperand &MO : operands()) {
       if (MO.isReg() && MO.getReg() == Reg && MO.isDef() &&
           MO.getSubReg() == 0)
         return;
@@ -1907,8 +1914,7 @@ void MachineInstr::addRegisterDefined(unsigned Reg,
 void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs,
                                          const TargetRegisterInfo &TRI) {
   bool HasRegMask = false;
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = getOperand(i);
+  for (MachineOperand &MO : operands()) {
     if (MO.isRegMask()) {
       HasRegMask = true;
       continue;
@@ -1916,15 +1922,10 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs,
     if (!MO.isReg() || !MO.isDef()) continue;
     unsigned Reg = MO.getReg();
     if (!TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
-    bool Dead = true;
-    for (ArrayRef<unsigned>::iterator I = UsedRegs.begin(), E = UsedRegs.end();
-         I != E; ++I)
-      if (TRI.regsOverlap(*I, Reg)) {
-        Dead = false;
-        break;
-      }
     // If there are no uses, including partial uses, the def is dead.
-    if (Dead) MO.setIsDead();
+    if (std::none_of(UsedRegs.begin(), UsedRegs.end(),
+                     [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); }))
+      MO.setIsDead();
   }
 
   // This is a call with a register mask operand.
@@ -1941,8 +1942,7 @@ MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) {
   SmallVector<size_t, 8> HashComponents;
   HashComponents.reserve(MI->getNumOperands() + 1);
   HashComponents.push_back(MI->getOpcode());
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
+  for (const MachineOperand &MO : MI->operands()) {
     if (MO.isReg() && MO.isDef() &&
         TargetRegisterInfo::isVirtualRegister(MO.getReg()))
       continue;  // Skip virtual register defs.
@@ -1960,7 +1960,8 @@ void MachineInstr::emitError(StringRef Msg) const {
     if (getOperand(i-1).isMetadata() &&
         (LocMD = getOperand(i-1).getMetadata()) &&
         LocMD->getNumOperands() != 0) {
-      if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) {
+      if (const ConstantInt *CI =
+              mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) {
         LocCookie = CI->getZExtValue();
         break;
       }
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 2ab0467..64d0932 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -49,6 +49,11 @@ AvoidSpeculation("avoid-speculation",
                  cl::desc("MachineLICM should avoid speculation"),
                  cl::init(true), cl::Hidden);
 
+static cl::opt<bool>
+HoistCheapInsts("hoist-cheap-insts",
+                cl::desc("MachineLICM should hoist even cheap instructions"),
+                cl::init(false), cl::Hidden);
+
 STATISTIC(NumHoisted,
           "Number of machine instructions hoisted out of loops");
 STATISTIC(NumLowRP,
@@ -688,6 +693,10 @@ void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node,
 /// one pass without iteration.
 ///
 void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
+  MachineBasicBlock *Preheader = getCurPreheader();
+  if (!Preheader)
+    return;
+
   SmallVector<MachineDomTreeNode*, 32> Scopes;
   SmallVector<MachineDomTreeNode*, 8> WorkList;
   DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> ParentMap;
@@ -695,7 +704,7 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
 
   // Perform a DFS walk to determine the order of visit.
   WorkList.push_back(HeaderN);
-  do {
+  while (!WorkList.empty()) {
     MachineDomTreeNode *Node = WorkList.pop_back_val();
     assert(Node && "Null dominator tree node?");
     MachineBasicBlock *BB = Node->getBlock();
@@ -729,28 +738,21 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
       ParentMap[Child] = Node;
       WorkList.push_back(Child);
     }
-  } while (!WorkList.empty());
+  }
 
-  if (Scopes.size() != 0) {
-    MachineBasicBlock *Preheader = getCurPreheader();
-    if (!Preheader)
-      return;
+  if (Scopes.size() == 0)
+    return;
 
-    // Compute registers which are livein into the loop headers.
-    RegSeen.clear();
-    BackTrace.clear();
-    InitRegPressure(Preheader);
-  }
+  // Compute registers which are livein into the loop headers.
+  RegSeen.clear();
+  BackTrace.clear();
+  InitRegPressure(Preheader);
 
   // Now perform LICM.
   for (unsigned i = 0, e = Scopes.size(); i != e; ++i) {
     MachineDomTreeNode *Node = Scopes[i];
     MachineBasicBlock *MBB = Node->getBlock();
 
-    MachineBasicBlock *Preheader = getCurPreheader();
-    if (!Preheader)
-      continue;
-
     EnterScope(MBB);
 
     // Process the block
@@ -1075,7 +1077,7 @@ bool MachineLICM::CanCauseHighRegPressure(DenseMap<unsigned, int> &Cost,
 
     // Don't hoist cheap instructions if they would increase register pressure,
     // even if we're under the limit.
-    if (CheapInstr)
+    if (CheapInstr && !HoistCheapInsts)
       return true;
 
     for (unsigned i = BackTrace.size(); i != 0; --i) {
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index eb3c0bf..fca7df0 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -9,6 +9,7 @@
 
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/ADT/PointerUnion.h"
+#include "llvm/Analysis/LibCallSemantics.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -273,9 +274,10 @@ bool MachineModuleInfo::doInitialization(Module &M) {
   CurCallSite = 0;
   CallsEHReturn = 0;
   CallsUnwindInit = 0;
-  DbgInfoAvailable = UsesVAFloatArgument = false; 
+  DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false;
   // Always emit some info, by default "no personality" info.
   Personalities.push_back(nullptr);
+  PersonalityTypeCache = EHPersonality::Unknown;
   AddrLabelSymbols = nullptr;
   TheModule = nullptr;
 
@@ -452,6 +454,14 @@ void MachineModuleInfo::addCleanup(MachineBasicBlock *LandingPad) {
   LP.TypeIds.push_back(0);
 }
 
+MCSymbol *
+MachineModuleInfo::addClauseForLandingPad(MachineBasicBlock *LandingPad) {
+  MCSymbol *ClauseLabel = Context.CreateTempSymbol();
+  LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+  LP.ClauseLabels.push_back(ClauseLabel);
+  return ClauseLabel;
+}
+
 /// TidyLandingPads - Remap landing pad labels and remove any deleted landing
 /// pads.
 void MachineModuleInfo::TidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
@@ -546,11 +556,17 @@ try_next:;
 
 /// getPersonality - Return the personality function for the current function.
 const Function *MachineModuleInfo::getPersonality() const {
-  // FIXME: Until PR1414 will be fixed, we're using 1 personality function per
-  // function
-  return !LandingPads.empty() ? LandingPads[0].Personality : nullptr;
+  for (const LandingPadInfo &LPI : LandingPads)
+    if (LPI.Personality)
+      return LPI.Personality;
+  return nullptr;
 }
 
+EHPersonality MachineModuleInfo::getPersonalityType() {
+  if (PersonalityTypeCache == EHPersonality::Unknown)
+    PersonalityTypeCache = classifyEHPersonality(getPersonality());
+  return PersonalityTypeCache;
+}
 /// getPersonalityIndex - Return unique index for current personality
 /// function. NULL/first personality function should always get zero index.
 unsigned MachineModuleInfo::getPersonalityIndex() const {
diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp
index 5a5035e..01d2c2e 100644
--- a/lib/CodeGen/MachineRegionInfo.cpp
+++ b/lib/CodeGen/MachineRegionInfo.cpp
@@ -1,8 +1,8 @@
 
 #include "llvm/CodeGen/MachineRegionInfo.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/RegionInfoImpl.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 
 #define DEBUG_TYPE "region"
 
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index e9612f3..32b7db1 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -24,7 +24,8 @@ using namespace llvm;
 void MachineRegisterInfo::Delegate::anchor() {}
 
 MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF)
-  : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true) {
+  : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true),
+    TracksSubRegLiveness(false) {
   VRegInfo.reserve(256);
   RegAllocHints.reserve(256);
   UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits());
@@ -60,8 +61,8 @@ MachineRegisterInfo::constrainRegClass(unsigned Reg,
 }
 
 bool
-MachineRegisterInfo::recomputeRegClass(unsigned Reg, const TargetMachine &TM) {
-  const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo();
+MachineRegisterInfo::recomputeRegClass(unsigned Reg) {
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   const TargetRegisterClass *OldRC = getRegClass(Reg);
   const TargetRegisterClass *NewRC =
     getTargetRegisterInfo()->getLargestLegalSuperClass(OldRC);
@@ -128,6 +129,7 @@ void MachineRegisterInfo::verifyUseList(unsigned Reg) const {
              << " use list MachineOperand " << MO
              << " has no parent instruction.\n";
       Valid = false;
+      continue;
     }
     MachineOperand *MO0 = &MI->getOperand(0);
     unsigned NumOps = MI->getNumOperands();
@@ -391,6 +393,14 @@ MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB,
     }
 }
 
+unsigned MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const
+{
+  // Lane masks are only defined for vregs.
+  assert(TargetRegisterInfo::isVirtualRegister(Reg));
+  const TargetRegisterClass &TRC = *getRegClass(Reg);
+  return TRC.getLaneMask();
+}
+
 #ifndef NDEBUG
 void MachineRegisterInfo::dumpUses(unsigned Reg) const {
   for (MachineInstr &I : use_instructions(Reg))
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 261942f..89ac6a8 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -144,12 +144,12 @@ char MachineScheduler::ID = 0;
 
 char &llvm::MachineSchedulerID = MachineScheduler::ID;
 
-INITIALIZE_PASS_BEGIN(MachineScheduler, "misched",
+INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler",
                       "Machine Instruction Scheduler", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(MachineScheduler, "misched",
+INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler",
                     "Machine Instruction Scheduler", false, false)
 
 MachineScheduler::MachineScheduler()
@@ -336,9 +336,7 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
   if (skipOptnoneFunction(*mf.getFunction()))
     return false;
 
-  const TargetSubtargetInfo &ST =
-    mf.getTarget().getSubtarget<TargetSubtargetInfo>();
-  if (!ST.enablePostMachineScheduler()) {
+  if (!mf.getSubtarget().enablePostMachineScheduler()) {
     DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
     return false;
   }
@@ -430,9 +428,11 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) {
       // instruction stream until we find the nearest boundary.
       unsigned NumRegionInstrs = 0;
       MachineBasicBlock::iterator I = RegionEnd;
-      for(;I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) {
+      for(;I != MBB->begin(); --I, --RemainingInstrs) {
         if (isSchedBoundary(std::prev(I), MBB, MF, TII, IsPostRA))
           break;
+        if (!I->isDebugValue())
+          ++NumRegionInstrs;
       }
       // Notify the scheduler of the region, even if we may skip scheduling
       // it. Perhaps it still needs to be bundled.
@@ -1432,12 +1432,15 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
   // Check if either the dest or source is local. If it's live across a back
   // edge, it's not local. Note that if both vregs are live across the back
   // edge, we cannot successfully contrain the copy without cyclic scheduling.
-  unsigned LocalReg = DstReg;
-  unsigned GlobalReg = SrcReg;
+  // If both the copy's source and dest are local live intervals, then we
+  // should treat the dest as the global for the purpose of adding
+  // constraints. This adds edges from source's other uses to the copy.
+  unsigned LocalReg = SrcReg;
+  unsigned GlobalReg = DstReg;
   LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
   if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
-    LocalReg = SrcReg;
-    GlobalReg = DstReg;
+    LocalReg = DstReg;
+    GlobalReg = SrcReg;
     LocalLI = &LIS->getInterval(LocalReg);
     if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
       return;
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index ba25bca..8337793 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -112,7 +112,7 @@ namespace {
     /// for the lifetime of an iteration.
     ///
     /// \return True if the edge is marked as toSplit, false otherwise.
-    /// False can be retruned if, for instance, this is not profitable.
+    /// False can be returned if, for instance, this is not profitable.
     bool PostponeSplitCriticalEdge(MachineInstr *MI,
                                    MachineBasicBlock *From,
                                    MachineBasicBlock *To,
@@ -504,7 +504,7 @@ bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr *MI,
   // If SuccToSinkTo post dominates then also it may be profitable if MI
   // can further profitably sinked into another block in next round.
   bool BreakPHIEdge = false;
-  // FIXME - If finding successor is compile time expensive then catch results.
+  // FIXME - If finding successor is compile time expensive then cache results.
   if (MachineBasicBlock *MBB2 = FindSuccToSinkTo(MI, SuccToSinkTo, BreakPHIEdge))
     return isProfitableToSinkTo(Reg, MI, SuccToSinkTo, MBB2);
 
@@ -553,19 +553,6 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI,
       if (!TII->isSafeToMoveRegClassDefs(MRI->getRegClass(Reg)))
         return nullptr;
 
-      // FIXME: This picks a successor to sink into based on having one
-      // successor that dominates all the uses.  However, there are cases where
-      // sinking can happen but where the sink point isn't a successor.  For
-      // example:
-      //
-      //   x = computation
-      //   if () {} else {}
-      //   use x
-      //
-      // the instruction could be sunk over the whole diamond for the
-      // if/then/else (or loop, etc), allowing it to be sunk into other blocks
-      // after that.
-
       // Virtual register defs can only be sunk if all their uses are in blocks
       // dominated by one of the successors.
       if (SuccToSinkTo) {
@@ -585,6 +572,23 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI,
       // higher priority, otherwise prioritize smaller loop depths.
       SmallVector<MachineBasicBlock*, 4> Succs(MBB->succ_begin(),
                                                MBB->succ_end());
+
+      // Handle cases where sinking can happen but where the sink point isn't a
+      // successor. For example:
+      //
+      //   x = computation
+      //   if () {} else {}
+      //   use x
+      //
+      const std::vector<MachineDomTreeNode *> &Children =
+        DT->getNode(MBB)->getChildren();
+      for (const auto &DTChild : Children)
+        // DomTree children of MBB that have MBB as immediate dominator are added.
+        if (DTChild->getIDom()->getBlock() == MI->getParent() &&
+            // Skip MBBs already added to the Succs vector above.
+            !MBB->isSuccessor(DTChild->getBlock()))
+          Succs.push_back(DTChild->getBlock());
+
       // Sort Successors according to their loop depth or block frequency info.
       std::stable_sort(
           Succs.begin(), Succs.end(),
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index 2cf87eb..8aacd1f 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -52,12 +52,11 @@ void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const {
 
 bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
-  TII = MF->getSubtarget().getInstrInfo();
-  TRI = MF->getSubtarget().getRegisterInfo();
+  const TargetSubtargetInfo &ST = MF->getSubtarget();
+  TII = ST.getInstrInfo();
+  TRI = ST.getRegisterInfo();
   MRI = &MF->getRegInfo();
   Loops = &getAnalysis<MachineLoopInfo>();
-  const TargetSubtargetInfo &ST =
-    MF->getTarget().getSubtarget<TargetSubtargetInfo>();
   SchedModel.init(ST.getSchedModel(), &ST, TII);
   BlockInfo.resize(MF->getNumBlockIDs());
   ProcResourceCycles.resize(MF->getNumBlockIDs() *
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 99f0583..bdb094f 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -42,6 +42,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
@@ -54,16 +55,13 @@ namespace {
 
     MachineVerifier(Pass *pass, const char *b) :
       PASS(pass),
-      Banner(b),
-      OutFileName(getenv("LLVM_VERIFY_MACHINEINSTRS"))
+      Banner(b)
       {}
 
     bool runOnMachineFunction(MachineFunction &MF);
 
     Pass *const PASS;
     const char *Banner;
-    const char *const OutFileName;
-    raw_ostream *OS;
     const MachineFunction *MF;
     const TargetMachine *TM;
     const TargetInstrInfo *TII;
@@ -215,9 +213,9 @@ namespace {
     void report(const char *msg, const MachineBasicBlock *MBB,
                 const LiveInterval &LI);
     void report(const char *msg, const MachineFunction *MF,
-                const LiveRange &LR, unsigned Reg);
+                const LiveRange &LR, unsigned Reg, unsigned LaneMask);
     void report(const char *msg, const MachineBasicBlock *MBB,
-                const LiveRange &LR, unsigned Reg);
+                const LiveRange &LR, unsigned Reg, unsigned LaneMask);
 
     void verifyInlineAsm(const MachineInstr *MI);
 
@@ -230,20 +228,22 @@ namespace {
     void verifyLiveVariables();
     void verifyLiveIntervals();
     void verifyLiveInterval(const LiveInterval&);
-    void verifyLiveRangeValue(const LiveRange&, const VNInfo*, unsigned);
+    void verifyLiveRangeValue(const LiveRange&, const VNInfo*, unsigned,
+                              unsigned);
     void verifyLiveRangeSegment(const LiveRange&,
-                                const LiveRange::const_iterator I, unsigned);
-    void verifyLiveRange(const LiveRange&, unsigned);
+                                const LiveRange::const_iterator I, unsigned,
+                                unsigned);
+    void verifyLiveRange(const LiveRange&, unsigned, unsigned LaneMask = 0);
 
     void verifyStackFrame();
   };
 
   struct MachineVerifierPass : public MachineFunctionPass {
     static char ID; // Pass ID, replacement for typeid
-    const char *const Banner;
+    const std::string Banner;
 
-    MachineVerifierPass(const char *b = nullptr)
-      : MachineFunctionPass(ID), Banner(b) {
+    MachineVerifierPass(const std::string &banner = nullptr)
+      : MachineFunctionPass(ID), Banner(banner) {
         initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry());
       }
 
@@ -253,7 +253,7 @@ namespace {
     }
 
     bool runOnMachineFunction(MachineFunction &MF) override {
-      MF.verify(this, Banner);
+      MF.verify(this, Banner.c_str());
       return false;
     }
   };
@@ -264,7 +264,7 @@ char MachineVerifierPass::ID = 0;
 INITIALIZE_PASS(MachineVerifierPass, "machineverifier",
                 "Verify generated machine code", false, false)
 
-FunctionPass *llvm::createMachineVerifierPass(const char *Banner) {
+FunctionPass *llvm::createMachineVerifierPass(const std::string &Banner) {
   return new MachineVerifierPass(Banner);
 }
 
@@ -274,22 +274,6 @@ void MachineFunction::verify(Pass *p, const char *Banner) const {
 }
 
 bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
-  raw_ostream *OutFile = nullptr;
-  if (OutFileName) {
-    std::error_code EC;
-    OutFile = new raw_fd_ostream(OutFileName, EC,
-                                 sys::fs::F_Append | sys::fs::F_Text);
-    if (EC) {
-      errs() << "Error opening '" << OutFileName << "': " << EC.message()
-             << '\n';
-      exit(1);
-    }
-
-    OS = OutFile;
-  } else {
-    OS = &errs();
-  }
-
   foundErrors = 0;
 
   this->MF = &MF;
@@ -324,7 +308,7 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
            MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) {
       if (MBBI->getParent() != MFI) {
         report("Bad instruction parent pointer", MFI);
-        *OS << "Instruction: " << *MBBI;
+        errs() << "Instruction: " << *MBBI;
         continue;
       }
 
@@ -360,9 +344,7 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
   }
   visitMachineFunctionAfter();
 
-  if (OutFile)
-    delete OutFile;
-  else if (foundErrors)
+  if (foundErrors)
     report_fatal_error("Found "+Twine(foundErrors)+" machine code errors.");
 
   // Clean up.
@@ -379,70 +361,76 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
 
 void MachineVerifier::report(const char *msg, const MachineFunction *MF) {
   assert(MF);
-  *OS << '\n';
+  errs() << '\n';
   if (!foundErrors++) {
     if (Banner)
-      *OS << "# " << Banner << '\n';
-    MF->print(*OS, Indexes);
+      errs() << "# " << Banner << '\n';
+    MF->print(errs(), Indexes);
   }
-  *OS << "*** Bad machine code: " << msg << " ***\n"
+  errs() << "*** Bad machine code: " << msg << " ***\n"
       << "- function:    " << MF->getName() << "\n";
 }
 
 void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) {
   assert(MBB);
   report(msg, MBB->getParent());
-  *OS << "- basic block: BB#" << MBB->getNumber()
+  errs() << "- basic block: BB#" << MBB->getNumber()
       << ' ' << MBB->getName()
       << " (" << (const void*)MBB << ')';
   if (Indexes)
-    *OS << " [" << Indexes->getMBBStartIdx(MBB)
+    errs() << " [" << Indexes->getMBBStartIdx(MBB)
         << ';' <<  Indexes->getMBBEndIdx(MBB) << ')';
-  *OS << '\n';
+  errs() << '\n';
 }
 
 void MachineVerifier::report(const char *msg, const MachineInstr *MI) {
   assert(MI);
   report(msg, MI->getParent());
-  *OS << "- instruction: ";
+  errs() << "- instruction: ";
   if (Indexes && Indexes->hasIndex(MI))
-    *OS << Indexes->getInstructionIndex(MI) << '\t';
-  MI->print(*OS, TM);
+    errs() << Indexes->getInstructionIndex(MI) << '\t';
+  MI->print(errs(), TM);
 }
 
 void MachineVerifier::report(const char *msg,
                              const MachineOperand *MO, unsigned MONum) {
   assert(MO);
   report(msg, MO->getParent());
-  *OS << "- operand " << MONum << ":   ";
-  MO->print(*OS, TM);
-  *OS << "\n";
+  errs() << "- operand " << MONum << ":   ";
+  MO->print(errs(), TM);
+  errs() << "\n";
 }
 
 void MachineVerifier::report(const char *msg, const MachineFunction *MF,
                              const LiveInterval &LI) {
   report(msg, MF);
-  *OS << "- interval:    " << LI << '\n';
+  errs() << "- interval:    " << LI << '\n';
 }
 
 void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
                              const LiveInterval &LI) {
   report(msg, MBB);
-  *OS << "- interval:    " << LI << '\n';
+  errs() << "- interval:    " << LI << '\n';
 }
 
 void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
-                             const LiveRange &LR, unsigned Reg) {
+                             const LiveRange &LR, unsigned Reg,
+                             unsigned LaneMask) {
   report(msg, MBB);
-  *OS << "- liverange:   " << LR << '\n';
-  *OS << "- register:    " << PrintReg(Reg, TRI) << '\n';
+  errs() << "- liverange:   " << LR << '\n';
+  errs() << "- register:    " << PrintReg(Reg, TRI) << '\n';
+  if (LaneMask != 0)
+    errs() << "- lanemask:    " << format("%04X\n", LaneMask);
 }
 
 void MachineVerifier::report(const char *msg, const MachineFunction *MF,
-                             const LiveRange &LR, unsigned Reg) {
+                             const LiveRange &LR, unsigned Reg,
+                             unsigned LaneMask) {
   report(msg, MF);
-  *OS << "- liverange:   " << LR << '\n';
-  *OS << "- register:    " << PrintReg(Reg, TRI) << '\n';
+  errs() << "- liverange:   " << LR << '\n';
+  errs() << "- register:    " << PrintReg(Reg, TRI) << '\n';
+  if (LaneMask != 0)
+    errs() << "- lanemask:    " << format("%04X\n", LaneMask);
 }
 
 void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
@@ -530,7 +518,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
       report("MBB has successor that isn't part of the function.", MBB);
     if (!MBBInfoMap[*I].Preds.count(MBB)) {
       report("Inconsistent CFG", MBB);
-      *OS << "MBB is not in the predecessor list of the successor BB#"
+      errs() << "MBB is not in the predecessor list of the successor BB#"
           << (*I)->getNumber() << ".\n";
     }
   }
@@ -542,7 +530,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
       report("MBB has predecessor that isn't part of the function.", MBB);
     if (!MBBInfoMap[*I].Succs.count(MBB)) {
       report("Inconsistent CFG", MBB);
-      *OS << "MBB is not in the successor list of the predecessor BB#"
+      errs() << "MBB is not in the successor list of the predecessor BB#"
           << (*I)->getNumber() << ".\n";
     }
   }
@@ -592,7 +580,11 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
       }
     } else if (TBB && !FBB && Cond.empty()) {
       // Block unconditionally branches somewhere.
-      if (MBB->succ_size() != 1+LandingPadSuccs.size()) {
+      // If the block has exactly one successor, that happens to be a
+      // landingpad, accept it as valid control flow.
+      if (MBB->succ_size() != 1+LandingPadSuccs.size() &&
+          (MBB->succ_size() != 1 || LandingPadSuccs.size() != 1 ||
+           *MBB->succ_begin() != *LandingPadSuccs.begin())) {
         report("MBB exits via unconditional branch but doesn't have "
                "exactly one CFG successor!", MBB);
       } else if (!MBB->isSuccessor(TBB)) {
@@ -713,7 +705,7 @@ void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) {
     SlotIndex idx = Indexes->getInstructionIndex(MI);
     if (!(idx > lastIndex)) {
       report("Instruction index out of order", MI);
-      *OS << "Last instruction was at " << lastIndex << '\n';
+      errs() << "Last instruction was at " << lastIndex << '\n';
     }
     lastIndex = idx;
   }
@@ -726,7 +718,7 @@ void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) {
       FirstTerminator = MI;
   } else if (FirstTerminator) {
     report("Non-terminator instruction after the first terminator", MI);
-    *OS << "First terminator was:\t" << *FirstTerminator;
+    errs() << "First terminator was:\t" << *FirstTerminator;
   }
 }
 
@@ -778,7 +770,7 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
   const MCInstrDesc &MCID = MI->getDesc();
   if (MI->getNumOperands() < MCID.getNumOperands()) {
     report("Too few operands", MI);
-    *OS << MCID.getNumOperands() << " operands expected, but "
+    errs() << MCID.getNumOperands() << " operands expected, but "
         << MI->getNumOperands() << " given.\n";
   }
 
@@ -908,7 +900,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
               TII->getRegClass(MCID, MONum, TRI, *MF)) {
           if (!DRC->contains(Reg)) {
             report("Illegal physical register for instruction", MO, MONum);
-            *OS << TRI->getName(Reg) << " is not a "
+            errs() << TRI->getName(Reg) << " is not a "
                 << TRI->getRegClassName(DRC) << " register.\n";
           }
         }
@@ -920,13 +912,13 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
             TRI->getSubClassWithSubReg(RC, SubIdx);
           if (!SRC) {
             report("Invalid subregister index for virtual register", MO, MONum);
-            *OS << "Register class " << TRI->getRegClassName(RC)
+            errs() << "Register class " << TRI->getRegClassName(RC)
                 << " does not support subreg index " << SubIdx << "\n";
             return;
           }
           if (RC != SRC) {
             report("Invalid register class for subregister index", MO, MONum);
-            *OS << "Register class " << TRI->getRegClassName(RC)
+            errs() << "Register class " << TRI->getRegClassName(RC)
                 << " does not fully support subreg index " << SubIdx << "\n";
             return;
           }
@@ -948,7 +940,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
           }
           if (!RC->hasSuperClassEq(DRC)) {
             report("Illegal virtual register for instruction", MO, MONum);
-            *OS << "Expected a " << TRI->getRegClassName(DRC)
+            errs() << "Expected a " << TRI->getRegClassName(DRC)
                 << " register, but got a " << TRI->getRegClassName(RC)
                 << " register\n";
           }
@@ -974,11 +966,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
       SlotIndex Idx = LiveInts->getInstructionIndex(MI);
       if (MI->mayLoad() && !LI.liveAt(Idx.getRegSlot(true))) {
         report("Instruction loads from dead spill slot", MO, MONum);
-        *OS << "Live stack: " << LI << '\n';
+        errs() << "Live stack: " << LI << '\n';
       }
       if (MI->mayStore() && !LI.liveAt(Idx.getRegSlot())) {
         report("Instruction stores to dead spill slot", MO, MONum);
-        *OS << "Live stack: " << LI << '\n';
+        errs() << "Live stack: " << LI << '\n';
       }
     }
     break;
@@ -1017,12 +1009,12 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
             LiveQueryResult LRQ = LR->Query(UseIdx);
             if (!LRQ.valueIn()) {
               report("No live segment at use", MO, MONum);
-              *OS << UseIdx << " is not live in " << PrintRegUnit(*Units, TRI)
+              errs() << UseIdx << " is not live in " << PrintRegUnit(*Units, TRI)
                   << ' ' << *LR << '\n';
             }
             if (MO->isKill() && !LRQ.isKill()) {
               report("Live range continues after kill flag", MO, MONum);
-              *OS << PrintRegUnit(*Units, TRI) << ' ' << *LR << '\n';
+              errs() << PrintRegUnit(*Units, TRI) << ' ' << *LR << '\n';
             }
           }
         }
@@ -1035,13 +1027,13 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
           LiveQueryResult LRQ = LI.Query(UseIdx);
           if (!LRQ.valueIn()) {
             report("No live segment at use", MO, MONum);
-            *OS << UseIdx << " is not live in " << LI << '\n';
+            errs() << UseIdx << " is not live in " << LI << '\n';
           }
           // Check for extra kill flags.
           // Note that we allow missing kill flags for now.
           if (MO->isKill() && !LRQ.isKill()) {
             report("Live range continues after kill flag", MO, MONum);
-            *OS << "Live range: " << LI << '\n';
+            errs() << "Live range: " << LI << '\n';
           }
         } else {
           report("Virtual register has no live interval", MO, MONum);
@@ -1053,7 +1045,37 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
     if (!regsLive.count(Reg)) {
       if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
         // Reserved registers may be used even when 'dead'.
-        if (!isReserved(Reg))
+        bool Bad = !isReserved(Reg);
+        // We are fine if just any subregister has a defined value.
+        if (Bad) {
+          for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid();
+               ++SubRegs) {
+            if (regsLive.count(*SubRegs)) {
+              Bad = false;
+              break;
+            }
+          }
+        }
+        // If there is an additional implicit-use of a super register we stop
+        // here. By definition we are fine if the super register is not
+        // (completely) dead, if the complete super register is dead we will
+        // get a report for its operand.
+        if (Bad) {
+          for (const MachineOperand &MOP : MI->uses()) {
+            if (!MOP.isReg())
+              continue;
+            if (!MOP.isImplicit())
+              continue;
+            for (MCSubRegIterator SubRegs(MOP.getReg(), TRI); SubRegs.isValid();
+                 ++SubRegs) {
+              if (*SubRegs == Reg) {
+                Bad = false;
+                break;
+              }
+            }
+          }
+        }
+        if (Bad)
           report("Using an undefined physical register", MO, MONum);
       } else if (MRI->def_empty(Reg)) {
         report("Reading virtual register without a def", MO, MONum);
@@ -1094,19 +1116,19 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
           assert(VNI && "NULL valno is not allowed");
           if (VNI->def != DefIdx) {
             report("Inconsistent valno->def", MO, MONum);
-            *OS << "Valno " << VNI->id << " is not defined at "
+            errs() << "Valno " << VNI->id << " is not defined at "
               << DefIdx << " in " << LI << '\n';
           }
         } else {
           report("No live segment at def", MO, MONum);
-          *OS << DefIdx << " is not live in " << LI << '\n';
+          errs() << DefIdx << " is not live in " << LI << '\n';
         }
         // Check that, if the dead def flag is present, LiveInts agree.
         if (MO->isDead()) {
           LiveQueryResult LRQ = LI.Query(DefIdx);
           if (!LRQ.isDeadDef()) {
             report("Live range continues after dead def flag", MO, MONum);
-            *OS << "Live range: " << LI << '\n';
+            errs() << "Live range: " << LI << '\n';
           }
         }
       } else {
@@ -1148,7 +1170,7 @@ MachineVerifier::visitMachineBasicBlockAfter(const MachineBasicBlock *MBB) {
     SlotIndex stop = Indexes->getMBBEndIdx(MBB);
     if (!(stop > lastIndex)) {
       report("Block ends before last instruction index", MBB);
-      *OS << "Block ends at " << stop
+      errs() << "Block ends at " << stop
           << " last instruction was at " << lastIndex << '\n';
     }
     lastIndex = stop;
@@ -1250,7 +1272,7 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock *MBB) {
            PrE = MBB->pred_end(); PrI != PrE; ++PrI) {
       if (!seen.count(*PrI)) {
         report("Missing PHI operand", &BBI);
-        *OS << "BB#" << (*PrI)->getNumber()
+        errs() << "BB#" << (*PrI)->getNumber()
             << " is a predecessor according to the CFG.\n";
       }
     }
@@ -1281,7 +1303,7 @@ void MachineVerifier::visitMachineFunctionAfter() {
          ++I)
       if (MInfo.regsKilled.count(*I)) {
         report("Virtual register killed in block, but needed live out.", &MBB);
-        *OS << "Virtual register " << PrintReg(*I)
+        errs() << "Virtual register " << PrintReg(*I)
             << " is used after the block.\n";
       }
   }
@@ -1313,13 +1335,13 @@ void MachineVerifier::verifyLiveVariables() {
       if (MInfo.vregsRequired.count(Reg)) {
         if (!VI.AliveBlocks.test(MBB.getNumber())) {
           report("LiveVariables: Block missing from AliveBlocks", &MBB);
-          *OS << "Virtual register " << PrintReg(Reg)
+          errs() << "Virtual register " << PrintReg(Reg)
               << " must be live through the block.\n";
         }
       } else {
         if (VI.AliveBlocks.test(MBB.getNumber())) {
           report("LiveVariables: Block should not be in AliveBlocks", &MBB);
-          *OS << "Virtual register " << PrintReg(Reg)
+          errs() << "Virtual register " << PrintReg(Reg)
               << " is not needed live through the block.\n";
         }
       }
@@ -1338,7 +1360,7 @@ void MachineVerifier::verifyLiveIntervals() {
 
     if (!LiveInts->hasInterval(Reg)) {
       report("Missing live interval for virtual register", MF);
-      *OS << PrintReg(Reg, TRI) << " still has defs or uses\n";
+      errs() << PrintReg(Reg, TRI) << " still has defs or uses\n";
       continue;
     }
 
@@ -1354,38 +1376,40 @@ void MachineVerifier::verifyLiveIntervals() {
 }
 
 void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
-                                           const VNInfo *VNI,
-                                           unsigned Reg) {
+                                           const VNInfo *VNI, unsigned Reg,
+                                           unsigned LaneMask) {
   if (VNI->isUnused())
     return;
 
   const VNInfo *DefVNI = LR.getVNInfoAt(VNI->def);
 
   if (!DefVNI) {
-    report("Valno not live at def and not marked unused", MF, LR, Reg);
-    *OS << "Valno #" << VNI->id << '\n';
+    report("Valno not live at def and not marked unused", MF, LR, Reg,
+           LaneMask);
+    errs() << "Valno #" << VNI->id << '\n';
     return;
   }
 
   if (DefVNI != VNI) {
-    report("Live segment at def has different valno", MF, LR, Reg);
-    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+    report("Live segment at def has different valno", MF, LR, Reg, LaneMask);
+    errs() << "Valno #" << VNI->id << " is defined at " << VNI->def
         << " where valno #" << DefVNI->id << " is live\n";
     return;
   }
 
   const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def);
   if (!MBB) {
-    report("Invalid definition index", MF, LR, Reg);
-    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+    report("Invalid definition index", MF, LR, Reg, LaneMask);
+    errs() << "Valno #" << VNI->id << " is defined at " << VNI->def
         << " in " << LR << '\n';
     return;
   }
 
   if (VNI->isPHIDef()) {
     if (VNI->def != LiveInts->getMBBStartIdx(MBB)) {
-      report("PHIDef value is not defined at MBB start", MBB, LR, Reg);
-      *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+      report("PHIDef value is not defined at MBB start", MBB, LR, Reg,
+             LaneMask);
+      errs() << "Valno #" << VNI->id << " is defined at " << VNI->def
           << ", not at the beginning of BB#" << MBB->getNumber() << '\n';
     }
     return;
@@ -1394,8 +1418,8 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
   // Non-PHI def.
   const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def);
   if (!MI) {
-    report("No instruction at def index", MBB, LR, Reg);
-    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+    report("No instruction at def index", MBB, LR, Reg, LaneMask);
+    errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
     return;
   }
 
@@ -1413,6 +1437,9 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
             !TRI->hasRegUnit(MOI->getReg(), Reg))
           continue;
       }
+      if (LaneMask != 0 &&
+          (TRI->getSubRegIndexLaneMask(MOI->getSubReg()) & LaneMask) == 0)
+        continue;
       hasDef = true;
       if (MOI->isEarlyClobber())
         isEarlyClobber = true;
@@ -1420,7 +1447,7 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
 
     if (!hasDef) {
       report("Defining instruction does not modify register", MI);
-      *OS << "Valno #" << VNI->id << " in " << LR << '\n';
+      errs() << "Valno #" << VNI->id << " in " << LR << '\n';
     }
 
     // Early clobber defs begin at USE slots, but other defs must begin at
@@ -1428,51 +1455,52 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
     if (isEarlyClobber) {
       if (!VNI->def.isEarlyClobber()) {
         report("Early clobber def must be at an early-clobber slot", MBB, LR,
-               Reg);
-        *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+               Reg, LaneMask);
+        errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
       }
     } else if (!VNI->def.isRegister()) {
       report("Non-PHI, non-early clobber def must be at a register slot",
-             MBB, LR, Reg);
-      *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+             MBB, LR, Reg, LaneMask);
+      errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
     }
   }
 }
 
 void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
                                              const LiveRange::const_iterator I,
-                                             unsigned Reg) {
+                                             unsigned Reg, unsigned LaneMask) {
   const LiveRange::Segment &S = *I;
   const VNInfo *VNI = S.valno;
   assert(VNI && "Live segment has no valno");
 
   if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) {
-    report("Foreign valno in live segment", MF, LR, Reg);
-    *OS << S << " has a bad valno\n";
+    report("Foreign valno in live segment", MF, LR, Reg, LaneMask);
+    errs() << S << " has a bad valno\n";
   }
 
   if (VNI->isUnused()) {
-    report("Live segment valno is marked unused", MF, LR, Reg);
-    *OS << S << '\n';
+    report("Live segment valno is marked unused", MF, LR, Reg, LaneMask);
+    errs() << S << '\n';
   }
 
   const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start);
   if (!MBB) {
-    report("Bad start of live segment, no basic block", MF, LR, Reg);
-    *OS << S << '\n';
+    report("Bad start of live segment, no basic block", MF, LR, Reg, LaneMask);
+    errs() << S << '\n';
     return;
   }
   SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB);
   if (S.start != MBBStartIdx && S.start != VNI->def) {
-    report("Live segment must begin at MBB entry or valno def", MBB, LR, Reg);
-    *OS << S << '\n';
+    report("Live segment must begin at MBB entry or valno def", MBB, LR, Reg,
+           LaneMask);
+    errs() << S << '\n';
   }
 
   const MachineBasicBlock *EndMBB =
     LiveInts->getMBBFromIndex(S.end.getPrevSlot());
   if (!EndMBB) {
-    report("Bad end of live segment, no basic block", MF, LR, Reg);
-    *OS << S << '\n';
+    report("Bad end of live segment, no basic block", MF, LR, Reg, LaneMask);
+    errs() << S << '\n';
     return;
   }
 
@@ -1489,15 +1517,17 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
   const MachineInstr *MI =
     LiveInts->getInstructionFromIndex(S.end.getPrevSlot());
   if (!MI) {
-    report("Live segment doesn't end at a valid instruction", EndMBB, LR, Reg);
-    *OS << S << '\n';
+    report("Live segment doesn't end at a valid instruction", EndMBB, LR, Reg,
+           LaneMask);
+    errs() << S << '\n';
     return;
   }
 
   // The block slot must refer to a basic block boundary.
   if (S.end.isBlock()) {
-    report("Live segment ends at B slot of an instruction", EndMBB, LR, Reg);
-    *OS << S << '\n';
+    report("Live segment ends at B slot of an instruction", EndMBB, LR, Reg,
+           LaneMask);
+    errs() << S << '\n';
   }
 
   if (S.end.isDead()) {
@@ -1505,8 +1535,8 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
     // That means there must be a dead def.
     if (!SlotIndex::isSameInstr(S.start, S.end)) {
       report("Live segment ending at dead slot spans instructions", EndMBB, LR,
-             Reg);
-      *OS << S << '\n';
+             Reg, LaneMask);
+      errs() << S << '\n';
     }
   }
 
@@ -1515,8 +1545,9 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
   if (S.end.isEarlyClobber()) {
     if (I+1 == LR.end() || (I+1)->start != S.end) {
       report("Live segment ending at early clobber slot must be "
-             "redefined by an EC def in the same instruction", EndMBB, LR, Reg);
-      *OS << S << '\n';
+             "redefined by an EC def in the same instruction", EndMBB, LR, Reg,
+             LaneMask);
+      errs() << S << '\n';
     }
   }
 
@@ -1526,16 +1557,27 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
     // A live segment can end with either a redefinition, a kill flag on a
     // use, or a dead flag on a def.
     bool hasRead = false;
+    bool hasSubRegDef = false;
     for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
       if (!MOI->isReg() || MOI->getReg() != Reg)
         continue;
+      if (LaneMask != 0 &&
+          (LaneMask & TRI->getSubRegIndexLaneMask(MOI->getSubReg())) == 0)
+        continue;
+      if (MOI->isDef() && MOI->getSubReg() != 0)
+        hasSubRegDef = true;
       if (MOI->readsReg())
         hasRead = true;
     }
     if (!S.end.isDead()) {
       if (!hasRead) {
-        report("Instruction ending live segment doesn't read the register", MI);
-        *OS << S << " in " << LR << '\n';
+        // When tracking subregister liveness, the main range must start new
+        // values on partial register writes, even if there is no read.
+        if (!MRI->tracksSubRegLiveness() || LaneMask != 0 || !hasSubRegDef) {
+          report("Instruction ending live segment doesn't read the register",
+                 MI);
+          errs() << S << " in " << LR << '\n';
+        }
       }
     }
   }
@@ -1573,8 +1615,9 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
 
       // All predecessors must have a live-out value.
       if (!PVNI) {
-        report("Register not marked live out of predecessor", *PI, LR, Reg);
-        *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber()
+        report("Register not marked live out of predecessor", *PI, LR, Reg,
+               LaneMask);
+        errs() << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber()
             << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before "
             << PEnd << '\n';
         continue;
@@ -1582,8 +1625,9 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
 
       // Only PHI-defs can take different predecessor values.
       if (!IsPHI && PVNI != VNI) {
-        report("Different value live out of predecessor", *PI, LR, Reg);
-        *OS << "Valno #" << PVNI->id << " live out of BB#"
+        report("Different value live out of predecessor", *PI, LR, Reg,
+               LaneMask);
+        errs() << "Valno #" << PVNI->id << " live out of BB#"
             << (*PI)->getNumber() << '@' << PEnd
             << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber()
             << '@' << LiveInts->getMBBStartIdx(MFI) << '\n';
@@ -1595,18 +1639,36 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
   }
 }
 
-void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg) {
-  for (LiveRange::const_vni_iterator I = LR.vni_begin(), E = LR.vni_end();
-       I != E; ++I)
-    verifyLiveRangeValue(LR, *I, Reg);
+void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg,
+                                      unsigned LaneMask) {
+  for (const VNInfo *VNI : LR.valnos)
+    verifyLiveRangeValue(LR, VNI, Reg, LaneMask);
 
   for (LiveRange::const_iterator I = LR.begin(), E = LR.end(); I != E; ++I)
-    verifyLiveRangeSegment(LR, I, Reg);
+    verifyLiveRangeSegment(LR, I, Reg, LaneMask);
 }
 
 void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
   verifyLiveRange(LI, LI.reg);
 
+  unsigned Reg = LI.reg;
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    unsigned Mask = 0;
+    unsigned MaxMask = MRI->getMaxLaneMaskForVReg(Reg);
+    for (const LiveInterval::SubRange &SR : LI.subranges()) {
+      if ((Mask & SR.LaneMask) != 0)
+        report("Lane masks of sub ranges overlap in live interval", MF, LI);
+      if ((SR.LaneMask & ~MaxMask) != 0)
+        report("Subrange lanemask is invalid", MF, LI);
+      Mask |= SR.LaneMask;
+      verifyLiveRange(SR, LI.reg, SR.LaneMask);
+      if (!LI.covers(SR))
+        report("A Subrange is not covered by the main range", MF, LI);
+    }
+  } else if (LI.hasSubRanges()) {
+    report("subregister liveness only allowed for virtual registers", MF, LI);
+  }
+
   // Check the LI only has one connected component.
   if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
     ConnectedVNInfoEqClasses ConEQ(*LiveInts);
@@ -1614,12 +1676,12 @@ void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
     if (NumComp > 1) {
       report("Multiple connected components in live interval", MF, LI);
       for (unsigned comp = 0; comp != NumComp; ++comp) {
-        *OS << comp << ": valnos";
+        errs() << comp << ": valnos";
         for (LiveInterval::const_vni_iterator I = LI.vni_begin(),
              E = LI.vni_end(); I!=E; ++I)
           if (comp == ConEQ.getEqClass(*I))
-            *OS << ' ' << (*I)->id;
-        *OS << '\n';
+            errs() << ' ' << (*I)->id;
+        errs() << '\n';
       }
     }
   }
@@ -1700,7 +1762,7 @@ void MachineVerifier::verifyStackFrame() {
                                                BBState.ExitValue;
         if (BBState.ExitIsSetup && AbsSPAdj != Size) {
           report("FrameDestroy <n> is after FrameSetup <m>", &I);
-          *OS << "FrameDestroy <" << Size << "> is after FrameSetup <"
+          errs() << "FrameDestroy <" << Size << "> is after FrameSetup <"
               << AbsSPAdj << ">.\n";
         }
         BBState.ExitValue += Size;
@@ -1717,7 +1779,7 @@ void MachineVerifier::verifyStackFrame() {
           (SPState[(*I)->getNumber()].ExitValue != BBState.EntryValue ||
            SPState[(*I)->getNumber()].ExitIsSetup != BBState.EntryIsSetup)) {
         report("The exit stack state of a predecessor is inconsistent.", MBB);
-        *OS << "Predecessor BB#" << (*I)->getNumber() << " has exit state ("
+        errs() << "Predecessor BB#" << (*I)->getNumber() << " has exit state ("
             << SPState[(*I)->getNumber()].ExitValue << ", "
             << SPState[(*I)->getNumber()].ExitIsSetup
             << "), while BB#" << MBB->getNumber() << " has entry state ("
@@ -1733,7 +1795,7 @@ void MachineVerifier::verifyStackFrame() {
           (SPState[(*I)->getNumber()].EntryValue != BBState.ExitValue ||
            SPState[(*I)->getNumber()].EntryIsSetup != BBState.ExitIsSetup)) {
         report("The entry stack state of a successor is inconsistent.", MBB);
-        *OS << "Successor BB#" << (*I)->getNumber() << " has entry state ("
+        errs() << "Successor BB#" << (*I)->getNumber() << " has entry state ("
             << SPState[(*I)->getNumber()].EntryValue << ", "
             << SPState[(*I)->getNumber()].EntryIsSetup
             << "), while BB#" << MBB->getNumber() << " has exit state ("
diff --git a/lib/CodeGen/OcamlGC.cpp b/lib/CodeGen/OcamlGC.cpp
index 48db200..17654a6 100644
--- a/lib/CodeGen/OcamlGC.cpp
+++ b/lib/CodeGen/OcamlGC.cpp
@@ -20,16 +20,15 @@
 using namespace llvm;
 
 namespace {
-  class OcamlGC : public GCStrategy {
-  public:
-    OcamlGC();
-  };
+class OcamlGC : public GCStrategy {
+public:
+  OcamlGC();
+};
 }
 
-static GCRegistry::Add<OcamlGC>
-X("ocaml", "ocaml 3.10-compatible GC");
+static GCRegistry::Add<OcamlGC> X("ocaml", "ocaml 3.10-compatible GC");
 
-void llvm::linkOcamlGC() { }
+void llvm::linkOcamlGC() {}
 
 OcamlGC::OcamlGC() {
   NeededSafePoints = 1 << GC::PostCall;
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index ec71d86..272d068 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -14,13 +14,12 @@
 
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Analysis/Passes.h"
-#include "llvm/CodeGen/GCStrategy.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -82,7 +81,9 @@ static cl::opt<bool> PrintGCInfo("print-gc", cl::Hidden,
     cl::desc("Dump garbage collector data"));
 static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
     cl::desc("Verify generated machine code"),
-    cl::init(getenv("LLVM_VERIFY_MACHINEINSTRS")!=nullptr));
+    cl::init(false),
+    cl::ZeroOrMore);
+
 static cl::opt<std::string>
 PrintMachineInstrs("print-machineinstrs", cl::ValueOptional,
                    cl::desc("Print machine instrs"),
@@ -235,8 +236,8 @@ TargetPassConfig::~TargetPassConfig() {
 // registers all common codegen passes.
 TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
   : ImmutablePass(ID), PM(&pm), StartAfter(nullptr), StopAfter(nullptr),
-    Started(true), Stopped(false), TM(tm), Impl(nullptr), Initialized(false),
-    DisableVerify(false),
+    Started(true), Stopped(false), AddingMachinePasses(false), TM(tm),
+    Impl(nullptr), Initialized(false), DisableVerify(false),
     EnableTailMerge(true) {
 
   Impl = new PassConfigImpl();
@@ -250,7 +251,7 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
   substitutePass(&PostRAMachineLICMID, &MachineLICMID);
 
   // Temporarily disable experimental passes.
-  const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo &ST = *TM->getSubtargetImpl();
   if (!ST.useMachineScheduler())
     disablePass(&MachineSchedulerID);
 }
@@ -304,7 +305,7 @@ IdentifyingPassPtr TargetPassConfig::getPassSubstitution(AnalysisID ID) const {
 /// a later pass or that it should stop after an earlier pass, then do not add
 /// the pass.  Finally, compare the current pass against the StartAfter
 /// and StopAfter options and change the Started/Stopped flags accordingly.
-void TargetPassConfig::addPass(Pass *P) {
+void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) {
   assert(!Initialized && "PassConfig is immutable");
 
   // Cache the Pass ID here in case the pass manager finds this pass is
@@ -313,10 +314,21 @@ void TargetPassConfig::addPass(Pass *P) {
   // and shouldn't reference it.
   AnalysisID PassID = P->getPassID();
 
-  if (Started && !Stopped)
+  if (Started && !Stopped) {
+    std::string Banner;
+    // Construct banner message before PM->add() as that may delete the pass.
+    if (AddingMachinePasses && (printAfter || verifyAfter))
+      Banner = std::string("After ") + std::string(P->getPassName());
     PM->add(P);
-  else
+    if (AddingMachinePasses) {
+      if (printAfter)
+        addPrintPass(Banner);
+      if (verifyAfter)
+        addVerifyPass(Banner);
+    }
+  } else {
     delete P;
+  }
   if (StopAfter == PassID)
     Stopped = true;
   if (StartAfter == PassID)
@@ -330,7 +342,8 @@ void TargetPassConfig::addPass(Pass *P) {
 ///
 /// addPass cannot return a pointer to the pass instance because is internal the
 /// PassManager and the instance we create here may already be freed.
-AnalysisID TargetPassConfig::addPass(AnalysisID PassID) {
+AnalysisID TargetPassConfig::addPass(AnalysisID PassID, bool verifyAfter,
+                                     bool printAfter) {
   IdentifyingPassPtr TargetID = getPassSubstitution(PassID);
   IdentifyingPassPtr FinalPtr = overridePass(PassID, TargetID);
   if (!FinalPtr.isValid())
@@ -345,7 +358,7 @@ AnalysisID TargetPassConfig::addPass(AnalysisID PassID) {
       llvm_unreachable("Pass ID not registered");
   }
   AnalysisID FinalID = P->getPassID();
-  addPass(P); // Ends the lifetime of P.
+  addPass(P, verifyAfter, printAfter); // Ends the lifetime of P.
 
   // Add the passes after the pass P if there is any.
   for (SmallVectorImpl<std::pair<AnalysisID, IdentifyingPassPtr> >::iterator
@@ -360,18 +373,25 @@ AnalysisID TargetPassConfig::addPass(AnalysisID PassID) {
         NP = Pass::createPass((*I).second.getID());
         assert(NP && "Pass ID not registered");
       }
-      addPass(NP);
+      addPass(NP, false, false);
     }
   }
   return FinalID;
 }
 
-void TargetPassConfig::printAndVerify(const char *Banner) {
+void TargetPassConfig::printAndVerify(const std::string &Banner) {
+  addPrintPass(Banner);
+  addVerifyPass(Banner);
+}
+
+void TargetPassConfig::addPrintPass(const std::string &Banner) {
   if (TM->shouldPrintMachineCode())
-    addPass(createMachineFunctionPrinterPass(dbgs(), Banner));
+    PM->add(createMachineFunctionPrinterPass(dbgs(), Banner));
+}
 
+void TargetPassConfig::addVerifyPass(const std::string &Banner) {
   if (VerifyMachineCode)
-    addPass(createMachineVerifierPass(Banner));
+    PM->add(createMachineVerifierPass(Banner));
 }
 
 /// Add common target configurable passes that perform LLVM IR to IR transforms
@@ -401,7 +421,10 @@ void TargetPassConfig::addIRPasses() {
       addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
+  // Run GC lowering passes for builtin collectors
+  // TODO: add a pass insertion point here
   addPass(createGCLoweringPass());
+  addPass(createShadowStackGCLoweringPass());
 
   // Make sure that no unreachable blocks are instruction selected.
   addPass(createUnreachableBlockEliminationPass());
@@ -429,9 +452,11 @@ void TargetPassConfig::addPassesToHandleExceptions() {
     // FALLTHROUGH
   case ExceptionHandling::DwarfCFI:
   case ExceptionHandling::ARM:
-  case ExceptionHandling::ItaniumWinEH:
     addPass(createDwarfEHPass(TM));
     break;
+  case ExceptionHandling::WinEH:
+    addPass(createWinEHPass(TM));
+    break;
   case ExceptionHandling::None:
     addPass(createLowerInvokePass());
 
@@ -491,6 +516,8 @@ void TargetPassConfig::addISelPrepare() {
 /// TODO: We could use a single addPre/Post(ID) hook to allow pass injection
 /// before/after any target-independent pass. But it's currently overkill.
 void TargetPassConfig::addMachinePasses() {
+  AddingMachinePasses = true;
+
   // Insert a machine instr printer pass after the specified pass.
   // If -print-machineinstrs specified, print machineinstrs after all passes.
   if (StringRef(PrintMachineInstrs.getValue()).equals(""))
@@ -499,7 +526,7 @@ void TargetPassConfig::addMachinePasses() {
            .equals("option-unspecified")) {
     const PassRegistry *PR = PassRegistry::getPassRegistry();
     const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
-    const PassInfo *IPI = PR->getPassInfo(StringRef("print-machineinstrs"));
+    const PassInfo *IPI = PR->getPassInfo(StringRef("machineinstr-printer"));
     assert (TPI && IPI && "Pass ID not registered!");
     const char *TID = (const char *)(TPI->getTypeInfo());
     const char *IID = (const char *)(IPI->getTypeInfo());
@@ -510,8 +537,7 @@ void TargetPassConfig::addMachinePasses() {
   printAndVerify("After Instruction Selection");
 
   // Expand pseudo-instructions emitted by ISel.
-  if (addPass(&ExpandISelPseudosID))
-    printAndVerify("After ExpandISelPseudos");
+  addPass(&ExpandISelPseudosID);
 
   // Add passes that optimize machine instructions in SSA form.
   if (getOptLevel() != CodeGenOpt::None) {
@@ -519,12 +545,11 @@ void TargetPassConfig::addMachinePasses() {
   } else {
     // If the target requests it, assign local variables to stack slots relative
     // to one another and simplify frame index references where possible.
-    addPass(&LocalStackSlotAllocationID);
+    addPass(&LocalStackSlotAllocationID, false);
   }
 
   // Run pre-ra passes.
-  if (addPreRegAlloc())
-    printAndVerify("After PreRegAlloc passes");
+  addPreRegAlloc();
 
   // Run register allocation and passes that are tightly coupled with it,
   // including phi elimination and scheduling.
@@ -534,12 +559,10 @@ void TargetPassConfig::addMachinePasses() {
     addFastRegAlloc(createRegAllocPass(false));
 
   // Run post-ra passes.
-  if (addPostRegAlloc())
-    printAndVerify("After PostRegAlloc passes");
+  addPostRegAlloc();
 
   // Insert prolog/epilog code.  Eliminate abstract frame index references...
   addPass(&PrologEpilogCodeInserterID);
-  printAndVerify("After PrologEpilogCodeInserter");
 
   /// Add passes that optimize machine instructions after register allocation.
   if (getOptLevel() != CodeGenOpt::None)
@@ -547,11 +570,9 @@ void TargetPassConfig::addMachinePasses() {
 
   // Expand pseudo instructions before second scheduling pass.
   addPass(&ExpandPostRAPseudosID);
-  printAndVerify("After ExpandPostRAPseudos");
 
   // Run pre-sched2 passes.
-  if (addPreSched2())
-    printAndVerify("After PreSched2 passes");
+  addPreSched2();
 
   // Second pass scheduler.
   if (getOptLevel() != CodeGenOpt::None) {
@@ -559,66 +580,61 @@ void TargetPassConfig::addMachinePasses() {
       addPass(&PostMachineSchedulerID);
     else
       addPass(&PostRASchedulerID);
-    printAndVerify("After PostRAScheduler");
   }
 
   // GC
   if (addGCPasses()) {
     if (PrintGCInfo)
-      addPass(createGCInfoPrinter(dbgs()));
+      addPass(createGCInfoPrinter(dbgs()), false, false);
   }
 
   // Basic block placement.
   if (getOptLevel() != CodeGenOpt::None)
     addBlockPlacement();
 
-  if (addPreEmitPass())
-    printAndVerify("After PreEmit passes");
+  addPreEmitPass();
 
-  addPass(&StackMapLivenessID);
+  addPass(&StackMapLivenessID, false);
+
+  AddingMachinePasses = false;
 }
 
 /// Add passes that optimize machine instructions in SSA form.
 void TargetPassConfig::addMachineSSAOptimization() {
   // Pre-ra tail duplication.
-  if (addPass(&EarlyTailDuplicateID))
-    printAndVerify("After Pre-RegAlloc TailDuplicate");
+  addPass(&EarlyTailDuplicateID);
 
   // Optimize PHIs before DCE: removing dead PHI cycles may make more
   // instructions dead.
-  addPass(&OptimizePHIsID);
+  addPass(&OptimizePHIsID, false);
 
   // This pass merges large allocas. StackSlotColoring is a different pass
   // which merges spill slots.
-  addPass(&StackColoringID);
+  addPass(&StackColoringID, false);
 
   // If the target requests it, assign local variables to stack slots relative
   // to one another and simplify frame index references where possible.
-  addPass(&LocalStackSlotAllocationID);
+  addPass(&LocalStackSlotAllocationID, false);
 
   // With optimization, dead code should already be eliminated. However
   // there is one known exception: lowered code for arguments that are only
   // used by tail calls, where the tail calls reuse the incoming stack
   // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
   addPass(&DeadMachineInstructionElimID);
-  printAndVerify("After codegen DCE pass");
 
   // Allow targets to insert passes that improve instruction level parallelism,
   // like if-conversion. Such passes will typically need dominator trees and
   // loop info, just like LICM and CSE below.
-  if (addILPOpts())
-    printAndVerify("After ILP optimizations");
+  addILPOpts();
 
-  addPass(&MachineLICMID);
-  addPass(&MachineCSEID);
+  addPass(&MachineLICMID, false);
+  addPass(&MachineCSEID, false);
   addPass(&MachineSinkingID);
-  printAndVerify("After Machine LICM, CSE and Sinking passes");
 
-  addPass(&PeepholeOptimizerID);
+  addPass(&PeepholeOptimizerID, false);
   // Clean-up the dead code that may have been generated by peephole
   // rewriting.
   addPass(&DeadMachineInstructionElimID);
-  printAndVerify("After codegen peephole optimization pass");
 }
 
 //===---------------------------------------------------------------------===//
@@ -701,18 +717,17 @@ bool TargetPassConfig::usingDefaultRegAlloc() const {
 /// Add the minimum set of target-independent passes that are required for
 /// register allocation. No coalescing or scheduling.
 void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
-  addPass(&PHIEliminationID);
-  addPass(&TwoAddressInstructionPassID);
+  addPass(&PHIEliminationID, false);
+  addPass(&TwoAddressInstructionPassID, false);
 
   addPass(RegAllocPass);
-  printAndVerify("After Register Allocation");
 }
 
 /// Add standard target-independent passes that are tightly coupled with
 /// optimized register allocation, including coalescing, machine instruction
 /// scheduling, and register allocation itself.
 void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
-  addPass(&ProcessImplicitDefsID);
+  addPass(&ProcessImplicitDefsID, false);
 
   // LiveVariables currently requires pure SSA form.
   //
@@ -720,35 +735,30 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
   // LiveVariables can be removed completely, and LiveIntervals can be directly
   // computed. (We still either need to regenerate kill flags after regalloc, or
   // preferably fix the scavenger to not depend on them).
-  addPass(&LiveVariablesID);
+  addPass(&LiveVariablesID, false);
 
   // Edge splitting is smarter with machine loop info.
-  addPass(&MachineLoopInfoID);
-  addPass(&PHIEliminationID);
+  addPass(&MachineLoopInfoID, false);
+  addPass(&PHIEliminationID, false);
 
   // Eventually, we want to run LiveIntervals before PHI elimination.
   if (EarlyLiveIntervals)
-    addPass(&LiveIntervalsID);
+    addPass(&LiveIntervalsID, false);
 
-  addPass(&TwoAddressInstructionPassID);
+  addPass(&TwoAddressInstructionPassID, false);
   addPass(&RegisterCoalescerID);
-  printAndVerify("After Register Coalescing");
 
   // PreRA instruction scheduling.
-  if (addPass(&MachineSchedulerID))
-    printAndVerify("After Machine Scheduling");
+  addPass(&MachineSchedulerID);
 
   // Add the selected register allocation pass.
   addPass(RegAllocPass);
-  printAndVerify("After Register Allocation, before rewriter");
 
   // Allow targets to change the register assignments before rewriting.
-  if (addPreRewrite())
-    printAndVerify("After pre-rewrite passes");
+  addPreRewrite();
 
   // Finally rewrite virtual registers.
   addPass(&VirtRegRewriterID);
-  printAndVerify("After Virtual Register Rewriter");
 
   // Perform stack slot coloring and post-ra machine LICM.
   //
@@ -760,8 +770,6 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
   //
   // FIXME: can this move into MachineLateOptimization?
   addPass(&PostRAMachineLICMID);
-
-  printAndVerify("After StackSlotColoring and postra Machine LICM");
 }
 
 //===---------------------------------------------------------------------===//
@@ -771,34 +779,30 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
 /// Add passes that optimize machine instructions after register allocation.
 void TargetPassConfig::addMachineLateOptimization() {
   // Branch folding must be run after regalloc and prolog/epilog insertion.
-  if (addPass(&BranchFolderPassID))
-    printAndVerify("After BranchFolding");
+  addPass(&BranchFolderPassID);
 
   // Tail duplication.
   // Note that duplicating tail just increases code size and degrades
   // performance for targets that require Structured Control Flow.
   // In addition it can also make CFG irreducible. Thus we disable it.
-  if (!TM->requiresStructuredCFG() && addPass(&TailDuplicateID))
-    printAndVerify("After TailDuplicate");
+  if (!TM->requiresStructuredCFG())
+    addPass(&TailDuplicateID);
 
   // Copy propagation.
-  if (addPass(&MachineCopyPropagationID))
-    printAndVerify("After copy propagation pass");
+  addPass(&MachineCopyPropagationID);
 }
 
 /// Add standard GC passes.
 bool TargetPassConfig::addGCPasses() {
-  addPass(&GCMachineCodeAnalysisID);
+  addPass(&GCMachineCodeAnalysisID, false);
   return true;
 }
 
 /// Add standard basic block placement passes.
 void TargetPassConfig::addBlockPlacement() {
-  if (addPass(&MachineBlockPlacementID)) {
+  if (addPass(&MachineBlockPlacementID, false)) {
     // Run a separate pass to collect block placement statistics.
     if (EnableBlockPlacementStats)
       addPass(&MachineBlockPlacementStatsID);
-
-    printAndVerify("After machine block placement.");
   }
 }
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index a296aea..283d1f2 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -133,7 +133,8 @@ namespace {
     bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
     bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
                           SmallPtrSetImpl<MachineInstr*> &LocalMIs);
-    bool optimizeSelect(MachineInstr *MI);
+    bool optimizeSelect(MachineInstr *MI,
+                        SmallPtrSetImpl<MachineInstr *> &LocalMIs);
     bool optimizeCondBranch(MachineInstr *MI);
     bool optimizeCopyOrBitcast(MachineInstr *MI);
     bool optimizeCoalescableCopy(MachineInstr *MI);
@@ -482,7 +483,8 @@ bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI,
 }
 
 /// Optimize a select instruction.
-bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI) {
+bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI,
+                            SmallPtrSetImpl<MachineInstr *> &LocalMIs) {
   unsigned TrueOp = 0;
   unsigned FalseOp = 0;
   bool Optimizable = false;
@@ -491,7 +493,7 @@ bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI) {
     return false;
   if (!Optimizable)
     return false;
-  if (!TII->optimizeSelect(MI))
+  if (!TII->optimizeSelect(MI, LocalMIs))
     return false;
   MI->eraseFromParent();
   ++NumSelects;
@@ -1072,6 +1074,13 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
     MachineBasicBlock *MBB = &*I;
 
     bool SeenMoveImm = false;
+
+    // During this forward scan, at some point it needs to answer the question
+    // "given a pointer to an MI in the current BB, is it located before or
+    // after the current instruction".
+    // To perform this, the following set keeps track of the MIs already seen
+    // during the scan, if a MI is not in the set, it is assumed to be located
+    // after. Newly created MIs have to be inserted in the set as well.
     SmallPtrSet<MachineInstr*, 16> LocalMIs;
     SmallSet<unsigned, 4> ImmDefRegs;
     DenseMap<unsigned, MachineInstr*> ImmDefMIs;
@@ -1102,7 +1111,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
       if ((isUncoalescableCopy(*MI) &&
            optimizeUncoalescableCopy(MI, LocalMIs)) ||
           (MI->isCompare() && optimizeCmpInstr(MI, MBB)) ||
-          (MI->isSelect() && optimizeSelect(MI))) {
+          (MI->isSelect() && optimizeSelect(MI, LocalMIs))) {
         // MI is deleted.
         LocalMIs.erase(MI);
         Changed = true;
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 89e1d11..ad59fc9 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -282,9 +282,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
   } else {
     // Check that post-RA scheduling is enabled for this target.
     // This may upgrade the AntiDepMode.
-    const TargetSubtargetInfo &ST =
-        Fn.getTarget().getSubtarget<TargetSubtargetInfo>();
-    if (!enablePostRAScheduler(ST, PassConfig->getOptLevel(),
+    if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(),
                                AntiDepMode, CriticalPathRCs))
       return false;
   }
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 06530b9..6d29b98 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -495,7 +495,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
 
       unsigned Align = MFI->getObjectAlignment(i);
       // Adjust to alignment boundary
-      Offset = (Offset+Align-1)/Align*Align;
+      Offset = RoundUpToAlignment(Offset, Align);
 
       MFI->setObjectOffset(i, -Offset);        // Set the computed offset
     }
@@ -504,7 +504,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
     for (int i = MaxCSFI; i >= MinCSFI ; --i) {
       unsigned Align = MFI->getObjectAlignment(i);
       // Adjust to alignment boundary
-      Offset = (Offset+Align-1)/Align*Align;
+      Offset = RoundUpToAlignment(Offset, Align);
 
       MFI->setObjectOffset(i, Offset);
       Offset += MFI->getObjectSize(i);
@@ -537,7 +537,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
     unsigned Align = MFI->getLocalFrameMaxAlign();
 
     // Adjust to alignment boundary.
-    Offset = (Offset + Align - 1) / Align * Align;
+    Offset = RoundUpToAlignment(Offset, Align);
 
     DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
 
@@ -656,8 +656,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
     // If the frame pointer is eliminated, all frame offsets will be relative to
     // SP not FP. Align to MaxAlign so this works.
     StackAlign = std::max(StackAlign, MaxAlign);
-    unsigned AlignMask = StackAlign - 1;
-    Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+    Offset = RoundUpToAlignment(Offset, StackAlign);
   }
 
   // Update frame info to pretend that this is part of the stack...
@@ -703,7 +702,8 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
 /// register references and actual offsets.
 ///
 void PEI::replaceFrameIndices(MachineFunction &Fn) {
-  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+  if (!TFI.needsFrameIndexResolution(Fn)) return;
 
   // Store SPAdj at exit of a basic block.
   SmallVector<int, 8> SPState;
@@ -743,26 +743,19 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
   const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
   const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();
   const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
-  bool StackGrowsDown =
-    TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
   int FrameSetupOpcode   = TII.getCallFrameSetupOpcode();
   int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
 
   if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);
 
+  bool InsideCallSequence = false;
+
   for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
 
     if (I->getOpcode() == FrameSetupOpcode ||
         I->getOpcode() == FrameDestroyOpcode) {
-      // Remember how much SP has been adjusted to create the call
-      // frame.
-      int Size = I->getOperand(0).getImm();
-
-      if ((!StackGrowsDown && I->getOpcode() == FrameSetupOpcode) ||
-          (StackGrowsDown && I->getOpcode() == FrameDestroyOpcode))
-        Size = -Size;
-
-      SPAdj += Size;
+      InsideCallSequence = (I->getOpcode() == FrameSetupOpcode);
+      SPAdj += TII.getSPAdjust(I);
 
       MachineBasicBlock::iterator PrevI = BB->end();
       if (I != BB->begin()) PrevI = std::prev(I);
@@ -797,6 +790,37 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
         continue;
       }
 
+      // TODO: This code should be commoned with the code for
+      // PATCHPOINT. There's no good reason for the difference in
+      // implementation other than historical accident.  The only
+      // remaining difference is the unconditional use of the stack
+      // pointer as the base register.
+      if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
+        assert((!MI->isDebugValue() || i == 0) &&
+               "Frame indicies can only appear as the first operand of a "
+               "DBG_VALUE machine instruction");
+        unsigned Reg;
+        MachineOperand &Offset = MI->getOperand(i + 1);
+        const unsigned refOffset =
+          TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(),
+                                            Reg);
+
+        Offset.setImm(Offset.getImm() + refOffset);
+        MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
+        continue;
+      }
+
+      // Frame allocations are target independent. Simply swap the index with
+      // the offset.
+      if (MI->getOpcode() == TargetOpcode::FRAME_ALLOC) {
+        assert(TFI->hasFP(Fn) && "frame alloc requires FP");
+        MachineOperand &FI = MI->getOperand(i);
+        unsigned Reg;
+        int FrameOffset = TFI->getFrameIndexReference(Fn, FI.getIndex(), Reg);
+        FI.ChangeToImmediate(FrameOffset);
+        continue;
+      }
+
       // Some instructions (e.g. inline asm instructions) can have
       // multiple frame indices and/or cause eliminateFrameIndex
       // to insert more than one instruction. We need the register
@@ -823,6 +847,16 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
       break;
     }
 
+    // If we are looking at a call sequence, we need to keep track of
+    // the SP adjustment made by each instruction in the sequence.
+    // This includes both the frame setup/destroy pseudos (handled above),
+    // as well as other instructions that have side effects w.r.t the SP.
+    // Note that this must come after eliminateFrameIndex, because 
+    // if I itself referred to a frame index, we shouldn't count its own
+    // adjustment.
+    if (MI && InsideCallSequence)
+      SPAdj += TII.getSPAdjust(MI);
+
     if (DoIncr && I != BB->end()) ++I;
 
     // Update register states.
diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp
index 122afd1..6b346f4 100644
--- a/lib/CodeGen/RegAllocBase.cpp
+++ b/lib/CodeGen/RegAllocBase.cpp
@@ -90,6 +90,7 @@ void RegAllocBase::allocatePhysRegs() {
     // Unused registers can appear when the spiller coalesces snippets.
     if (MRI->reg_nodbg_empty(VirtReg->reg)) {
       DEBUG(dbgs() << "Dropping unused " << *VirtReg << '\n');
+      aboutToRemoveInterval(*VirtReg);
       LIS->removeInterval(VirtReg->reg);
       continue;
     }
@@ -139,6 +140,7 @@ void RegAllocBase::allocatePhysRegs() {
       assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned");
       if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) {
         DEBUG(dbgs() << "not queueing unused  " << *SplitVirtReg << '\n');
+        aboutToRemoveInterval(*SplitVirtReg);
         LIS->removeInterval(SplitVirtReg->reg);
         continue;
       }
diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h
index bbd79cd..659b8f5 100644
--- a/lib/CodeGen/RegAllocBase.h
+++ b/lib/CodeGen/RegAllocBase.h
@@ -96,6 +96,9 @@ protected:
   // Use this group name for NamedRegionTimer.
   static const char TimerGroupName[];
 
+  /// Method called when the allocator is about to remove a LiveInterval.
+  virtual void aboutToRemoveInterval(LiveInterval &LI) {}
+
 public:
   /// VerifyEnabled - True when -verify-regalloc is given.
   static bool VerifyEnabled;
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 8fc10b4..c621414 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -372,15 +372,23 @@ void RAFast::usePhysReg(MachineOperand &MO) {
     case regDisabled:
       break;
     case regReserved:
-      assert(TRI->isSuperRegister(PhysReg, Alias) &&
+      // Either PhysReg is a subregister of Alias and we mark the
+      // whole register as free, or PhysReg is the superregister of
+      // Alias and we mark all the aliases as disabled before freeing
+      // PhysReg.
+      // In the latter case, since PhysReg was disabled, this means that
+      // its value is defined only by physical sub-registers. This check
+      // is performed by the assert of the default case in this loop.
+      // Note: The value of the superregister may only be partial
+      // defined, that is why regDisabled is a valid state for aliases.
+      assert((TRI->isSuperRegister(PhysReg, Alias) ||
+              TRI->isSuperRegister(Alias, PhysReg)) &&
              "Instruction is not using a subregister of a reserved register");
-      // Leave the superregister in the working set.
-      PhysRegState[Alias] = regFree;
-      MO.getParent()->addRegisterKilled(Alias, TRI, true);
-      return;
+      // Fall through.
     case regFree:
       if (TRI->isSuperRegister(PhysReg, Alias)) {
         // Leave the superregister in the working set.
+        PhysRegState[Alias] = regFree;
         MO.getParent()->addRegisterKilled(Alias, TRI, true);
         return;
       }
@@ -1023,8 +1031,7 @@ void RAFast::AllocateBasicBlock() {
 
       if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
         if (!MRI->isAllocatable(Reg)) continue;
-        definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ?
-                               regFree : regReserved);
+        definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved);
         continue;
       }
       LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, CopySrc);
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 8ef5dcd..edc3294 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -296,6 +296,9 @@ class RAGreedy : public MachineFunctionPass,
   /// obtained from the TargetSubtargetInfo.
   bool EnableLocalReassign;
 
+  /// Set of broken hints that may be reconciled later because of eviction.
+  SmallSetVector<LiveInterval *, 8> SetOfBrokenHints;
+
 public:
   RAGreedy();
 
@@ -311,6 +314,7 @@ public:
   void enqueue(LiveInterval *LI) override;
   LiveInterval *dequeue() override;
   unsigned selectOrSplit(LiveInterval&, SmallVectorImpl<unsigned>&) override;
+  void aboutToRemoveInterval(LiveInterval &) override;
 
   /// Perform register allocation.
   bool runOnMachineFunction(MachineFunction &mf) override;
@@ -378,6 +382,24 @@ private:
                                    SmallVirtRegSet &, unsigned);
   bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<unsigned> &,
                                SmallVirtRegSet &, unsigned);
+  void tryHintRecoloring(LiveInterval &);
+  void tryHintsRecoloring();
+
+  /// Model the information carried by one end of a copy.
+  struct HintInfo {
+    /// The frequency of the copy.
+    BlockFrequency Freq;
+    /// The virtual register or physical register.
+    unsigned Reg;
+    /// Its currently assigned register.
+    /// In case of a physical register Reg == PhysReg.
+    unsigned PhysReg;
+    HintInfo(BlockFrequency Freq, unsigned Reg, unsigned PhysReg)
+        : Freq(Freq), Reg(Reg), PhysReg(PhysReg) {}
+  };
+  typedef SmallVector<HintInfo, 4> HintsInfo;
+  BlockFrequency getBrokenHintFreq(const HintsInfo &, unsigned);
+  void collectHintInfo(unsigned, HintsInfo &);
 };
 } // end anonymous namespace
 
@@ -453,7 +475,9 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
 
 bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) {
   if (VRM->hasPhys(VirtReg)) {
-    Matrix->unassign(LIS->getInterval(VirtReg));
+    LiveInterval &LI = LIS->getInterval(VirtReg);
+    Matrix->unassign(LI);
+    aboutToRemoveInterval(LI);
     return true;
   }
   // Unassigned virtreg is probably in the priority queue.
@@ -2213,6 +2237,11 @@ unsigned RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg,
   return PhysReg;
 }
 
+void RAGreedy::aboutToRemoveInterval(LiveInterval &LI) {
+  // Do not keep invalid information around.
+  SetOfBrokenHints.remove(&LI);
+}
+
 void RAGreedy::initializeCSRCost() {
   // We use the larger one out of the command-line option and the value report
   // by TRI.
@@ -2238,6 +2267,170 @@ void RAGreedy::initializeCSRCost() {
     CSRCost = CSRCost.getFrequency() * (ActualEntry / FixedEntry);
 }
 
+/// \brief Collect the hint info for \p Reg.
+/// The results are stored into \p Out.
+/// \p Out is not cleared before being populated.
+void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) {
+  for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
+    if (!Instr.isFullCopy())
+      continue;
+    // Look for the other end of the copy.
+    unsigned OtherReg = Instr.getOperand(0).getReg();
+    if (OtherReg == Reg) {
+      OtherReg = Instr.getOperand(1).getReg();
+      if (OtherReg == Reg)
+        continue;
+    }
+    // Get the current assignment.
+    unsigned OtherPhysReg = TargetRegisterInfo::isPhysicalRegister(OtherReg)
+                                ? OtherReg
+                                : VRM->getPhys(OtherReg);
+    // Push the collected information.
+    Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
+                           OtherPhysReg));
+  }
+}
+
+/// \brief Using the given \p List, compute the cost of the broken hints if
+/// \p PhysReg was used.
+/// \return The cost of \p List for \p PhysReg.
+BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List,
+                                           unsigned PhysReg) {
+  BlockFrequency Cost = 0;
+  for (const HintInfo &Info : List) {
+    if (Info.PhysReg != PhysReg)
+      Cost += Info.Freq;
+  }
+  return Cost;
+}
+
+/// \brief Using the register assigned to \p VirtReg, try to recolor
+/// all the live ranges that are copy-related with \p VirtReg.
+/// The recoloring is then propagated to all the live-ranges that have
+/// been recolored and so on, until no more copies can be coalesced or
+/// it is not profitable.
+/// For a given live range, profitability is determined by the sum of the
+/// frequencies of the non-identity copies it would introduce with the old
+/// and new register.
+void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
+  // We have a broken hint, check if it is possible to fix it by
+  // reusing PhysReg for the copy-related live-ranges. Indeed, we evicted
+  // some register and PhysReg may be available for the other live-ranges.
+  SmallSet<unsigned, 4> Visited;
+  SmallVector<unsigned, 2> RecoloringCandidates;
+  HintsInfo Info;
+  unsigned Reg = VirtReg.reg;
+  unsigned PhysReg = VRM->getPhys(Reg);
+  // Start the recoloring algorithm from the input live-interval, then
+  // it will propagate to the ones that are copy-related with it.
+  Visited.insert(Reg);
+  RecoloringCandidates.push_back(Reg);
+
+  DEBUG(dbgs() << "Trying to reconcile hints for: " << PrintReg(Reg, TRI) << '('
+               << PrintReg(PhysReg, TRI) << ")\n");
+
+  do {
+    Reg = RecoloringCandidates.pop_back_val();
+
+    // We cannot recolor physcal register.
+    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+
+    assert(VRM->hasPhys(Reg) && "We have unallocated variable!!");
+
+    // Get the live interval mapped with this virtual register to be able
+    // to check for the interference with the new color.
+    LiveInterval &LI = LIS->getInterval(Reg);
+    unsigned CurrPhys = VRM->getPhys(Reg);
+    // Check that the new color matches the register class constraints and
+    // that it is free for this live range.
+    if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) ||
+                                Matrix->checkInterference(LI, PhysReg)))
+      continue;
+
+    DEBUG(dbgs() << PrintReg(Reg, TRI) << '(' << PrintReg(CurrPhys, TRI)
+                 << ") is recolorable.\n");
+
+    // Gather the hint info.
+    Info.clear();
+    collectHintInfo(Reg, Info);
+    // Check if recoloring the live-range will increase the cost of the
+    // non-identity copies.
+    if (CurrPhys != PhysReg) {
+      DEBUG(dbgs() << "Checking profitability:\n");
+      BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys);
+      BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg);
+      DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
+                   << "\nNew Cost: " << NewCopiesCost.getFrequency() << '\n');
+      if (OldCopiesCost < NewCopiesCost) {
+        DEBUG(dbgs() << "=> Not profitable.\n");
+        continue;
+      }
+      // At this point, the cost is either cheaper or equal. If it is
+      // equal, we consider this is profitable because it may expose
+      // more recoloring opportunities.
+      DEBUG(dbgs() << "=> Profitable.\n");
+      // Recolor the live-range.
+      Matrix->unassign(LI);
+      Matrix->assign(LI, PhysReg);
+    }
+    // Push all copy-related live-ranges to keep reconciling the broken
+    // hints.
+    for (const HintInfo &HI : Info) {
+      if (Visited.insert(HI.Reg).second)
+        RecoloringCandidates.push_back(HI.Reg);
+    }
+  } while (!RecoloringCandidates.empty());
+}
+
+/// \brief Try to recolor broken hints.
+/// Broken hints may be repaired by recoloring when an evicted variable
+/// freed up a register for a larger live-range.
+/// Consider the following example:
+/// BB1:
+///   a =
+///   b =
+/// BB2:
+///   ...
+///   = b
+///   = a
+/// Let us assume b gets split:
+/// BB1:
+///   a =
+///   b =
+/// BB2:
+///   c = b
+///   ...
+///   d = c
+///   = d
+///   = a
+/// Because of how the allocation work, b, c, and d may be assigned different
+/// colors. Now, if a gets evicted later:
+/// BB1:
+///   a =
+///   st a, SpillSlot
+///   b =
+/// BB2:
+///   c = b
+///   ...
+///   d = c
+///   = d
+///   e = ld SpillSlot
+///   = e
+/// This is likely that we can assign the same register for b, c, and d,
+/// getting rid of 2 copies.
+void RAGreedy::tryHintsRecoloring() {
+  for (LiveInterval *LI : SetOfBrokenHints) {
+    assert(TargetRegisterInfo::isVirtualRegister(LI->reg) &&
+           "Recoloring is possible only for virtual registers");
+    // Some dead defs may be around (e.g., because of debug uses).
+    // Ignore those.
+    if (!VRM->hasPhys(LI->reg))
+      continue;
+    tryHintRecoloring(*LI);
+  }
+}
+
 unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
                                      SmallVectorImpl<unsigned> &NewVRegs,
                                      SmallVirtRegSet &FixedRegisters,
@@ -2274,8 +2467,18 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
   // queue. The RS_Split ranges already failed to do this, and they should not
   // get a second chance until they have been split.
   if (Stage != RS_Split)
-    if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit))
+    if (unsigned PhysReg =
+            tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit)) {
+      unsigned Hint = MRI->getSimpleHint(VirtReg.reg);
+      // If VirtReg has a hint and that hint is broken record this
+      // virtual register as a recoloring candidate for broken hint.
+      // Indeed, since we evicted a variable in its neighborhood it is
+      // likely we can at least partially recolor some of the
+      // copy-related live-ranges.
+      if (Hint && Hint != PhysReg)
+        SetOfBrokenHints.insert(&VirtReg);
       return PhysReg;
+    }
 
   assert(NewVRegs.empty() && "Cannot append to existing NewVRegs");
 
@@ -2355,8 +2558,10 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   NextCascade = 1;
   IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
   GlobalCand.resize(32);  // This will grow as needed.
+  SetOfBrokenHints.clear();
 
   allocatePhysRegs();
+  tryHintsRecoloring();
   releaseMemory();
   return true;
 }
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index eb7e5633..77a42b3 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -126,7 +126,12 @@ private:
   void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS);
 
   /// \brief Constructs an initial graph.
-  void initializeGraph(PBQPRAGraph &G);
+  void initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM, Spiller &VRegSpiller);
+
+  /// \brief Spill the given VReg.
+  void spillVReg(unsigned VReg, SmallVectorImpl<unsigned> &NewIntervals,
+                 MachineFunction &MF, LiveIntervals &LIS, VirtRegMap &VRM,
+                 Spiller &VRegSpiller);
 
   /// \brief Given a solved PBQP problem maps this solution back to a register
   /// assignment.
@@ -172,8 +177,6 @@ public:
 class Interference : public PBQPRAConstraint {
 private:
 
-private:
-
   typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr;
   typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IMatrixKey;
   typedef DenseMap<IMatrixKey, PBQPRAGraph::MatrixPtr> IMatrixCache;
@@ -308,7 +311,7 @@ private:
                               PBQPRAGraph::NodeId MId, IMatrixCache &C) {
 
     const TargetRegisterInfo &TRI =
-      *G.getMetadata().MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+        *G.getMetadata().MF.getSubtarget().getRegisterInfo();
 
     const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs();
     const auto &MRegs = G.getNodeMetadata(MId).getAllowedRegs();
@@ -342,7 +345,7 @@ public:
   void apply(PBQPRAGraph &G) override {
     MachineFunction &MF = G.getMetadata().MF;
     MachineBlockFrequencyInfo &MBFI = G.getMetadata().MBFI;
-    CoalescerPair CP(*MF.getTarget().getSubtargetImpl()->getRegisterInfo());
+    CoalescerPair CP(*MF.getSubtarget().getRegisterInfo());
 
     // Scan the machine function and add a coalescing cost whenever CoalescerPair
     // gives the Ok.
@@ -398,7 +401,7 @@ public:
             }
             PBQPRAGraph::RawMatrix Costs(G.getEdgeCosts(EId));
             addVirtRegCoalesce(Costs, *Allowed1, *Allowed2, CBenefit);
-            G.setEdgeCosts(EId, std::move(Costs));
+            G.updateEdgeCosts(EId, std::move(Costs));
           }
         }
       }
@@ -488,15 +491,21 @@ static bool isACalleeSavedRegister(unsigned reg, const TargetRegisterInfo &TRI,
   return false;
 }
 
-void RegAllocPBQP::initializeGraph(PBQPRAGraph &G) {
+void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM,
+                                   Spiller &VRegSpiller) {
   MachineFunction &MF = G.getMetadata().MF;
 
   LiveIntervals &LIS = G.getMetadata().LIS;
   const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo();
   const TargetRegisterInfo &TRI =
-    *G.getMetadata().MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+      *G.getMetadata().MF.getSubtarget().getRegisterInfo();
+
+  std::vector<unsigned> Worklist(VRegsToAlloc.begin(), VRegsToAlloc.end());
+
+  while (!Worklist.empty()) {
+    unsigned VReg = Worklist.back();
+    Worklist.pop_back();
 
-  for (auto VReg : VRegsToAlloc) {
     const TargetRegisterClass *TRC = MRI.getRegClass(VReg);
     LiveInterval &VRegLI = LIS.getInterval(VReg);
 
@@ -531,6 +540,15 @@ void RegAllocPBQP::initializeGraph(PBQPRAGraph &G) {
       VRegAllowed.push_back(PReg);
     }
 
+    // Check for vregs that have no allowed registers. These should be
+    // pre-spilled and the new vregs added to the worklist.
+    if (VRegAllowed.empty()) {
+      SmallVector<unsigned, 8> NewVRegs;
+      spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
+      Worklist.insert(Worklist.end(), NewVRegs.begin(), NewVRegs.end());
+      continue;
+    }
+
     PBQPRAGraph::RawVector NodeCosts(VRegAllowed.size() + 1, 0);
 
     // Tweak cost of callee saved registers, as using then force spilling and
@@ -547,14 +565,40 @@ void RegAllocPBQP::initializeGraph(PBQPRAGraph &G) {
   }
 }
 
+void RegAllocPBQP::spillVReg(unsigned VReg,
+                             SmallVectorImpl<unsigned> &NewIntervals,
+                             MachineFunction &MF, LiveIntervals &LIS,
+                             VirtRegMap &VRM, Spiller &VRegSpiller) {
+
+  VRegsToAlloc.erase(VReg);
+  LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM);
+  VRegSpiller.spill(LRE);
+
+  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+  (void)TRI;
+  DEBUG(dbgs() << "VREG " << PrintReg(VReg, &TRI) << " -> SPILLED (Cost: "
+               << LRE.getParent().weight << ", New vregs: ");
+
+  // Copy any newly inserted live intervals into the list of regs to
+  // allocate.
+  for (LiveRangeEdit::iterator I = LRE.begin(), E = LRE.end();
+       I != E; ++I) {
+    const LiveInterval &LI = LIS.getInterval(*I);
+    assert(!LI.empty() && "Empty spill range.");
+    DEBUG(dbgs() << PrintReg(LI.reg, &TRI) << " ");
+    VRegsToAlloc.insert(LI.reg);
+  }
+
+  DEBUG(dbgs() << ")\n");
+}
+
 bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAGraph &G,
                                      const PBQP::Solution &Solution,
                                      VirtRegMap &VRM,
                                      Spiller &VRegSpiller) {
   MachineFunction &MF = G.getMetadata().MF;
   LiveIntervals &LIS = G.getMetadata().LIS;
-  const TargetRegisterInfo &TRI =
-    *MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
   (void)TRI;
 
   // Set to true if we have any spills
@@ -576,28 +620,11 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAGraph &G,
       assert(PReg != 0 && "Invalid preg selected.");
       VRM.assignVirt2Phys(VReg, PReg);
     } else {
-      VRegsToAlloc.erase(VReg);
-      SmallVector<unsigned, 8> NewSpills;
-      LiveRangeEdit LRE(&LIS.getInterval(VReg), NewSpills, MF, LIS, &VRM);
-      VRegSpiller.spill(LRE);
-
-      DEBUG(dbgs() << "VREG " << PrintReg(VReg, &TRI) << " -> SPILLED (Cost: "
-                   << LRE.getParent().weight << ", New vregs: ");
-
-      // Copy any newly inserted live intervals into the list of regs to
-      // allocate.
-      for (LiveRangeEdit::iterator I = LRE.begin(), E = LRE.end();
-           I != E; ++I) {
-        LiveInterval &LI = LIS.getInterval(*I);
-        assert(!LI.empty() && "Empty spill range.");
-        DEBUG(dbgs() << PrintReg(LI.reg, &TRI) << " ");
-        VRegsToAlloc.insert(LI.reg);
-      }
-
-      DEBUG(dbgs() << ")\n");
-
-      // We need another round if spill intervals were added.
-      AnotherRoundNeeded |= !LRE.empty();
+      // Spill VReg. If this introduces new intervals we'll need another round
+      // of allocation.
+      SmallVector<unsigned, 8> NewVRegs;
+      spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
+      AnotherRoundNeeded |= !NewVRegs.empty();
     }
   }
 
@@ -670,7 +697,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
   // If there are non-empty intervals allocate them using pbqp.
   if (!VRegsToAlloc.empty()) {
 
-    const TargetSubtargetInfo &Subtarget = *MF.getTarget().getSubtargetImpl();
+    const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
     std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot =
       llvm::make_unique<PBQPRAConstraintList>();
     ConstraintsRoot->addConstraint(llvm::make_unique<SpillCosts>());
@@ -686,7 +713,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
       DEBUG(dbgs() << "  PBQP Regalloc round " << Round << ":\n");
 
       PBQPRAGraph G(PBQPRAGraph::GraphMetadata(MF, LIS, MBFI));
-      initializeGraph(G);
+      initializeGraph(G, VRM, *VRegSpiller);
       ConstraintsRoot->apply(G);
 
 #ifndef NDEBUG
@@ -699,7 +726,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
         raw_fd_ostream OS(GraphFileName, EC, sys::fs::F_Text);
         DEBUG(dbgs() << "Dumping graph for round " << Round << " to \""
               << GraphFileName << "\"\n");
-        G.dumpToStream(OS);
+        G.dump(OS);
       }
 #endif
 
@@ -719,6 +746,79 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
   return true;
 }
 
+namespace {
+// A helper class for printing node and register info in a consistent way
+class PrintNodeInfo {
+public:
+  typedef PBQP::RegAlloc::PBQPRAGraph Graph;
+  typedef PBQP::RegAlloc::PBQPRAGraph::NodeId NodeId;
+
+  PrintNodeInfo(NodeId NId, const Graph &G) : G(G), NId(NId) {}
+
+  void print(raw_ostream &OS) const {
+    const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo();
+    const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+    unsigned VReg = G.getNodeMetadata(NId).getVReg();
+    const char *RegClassName = TRI->getRegClassName(MRI.getRegClass(VReg));
+    OS << NId << " (" << RegClassName << ':' << PrintReg(VReg, TRI) << ')';
+  }
+
+private:
+  const Graph &G;
+  NodeId NId;
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeInfo &PR) {
+  PR.print(OS);
+  return OS;
+}
+} // anonymous namespace
+
+void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
+  for (auto NId : nodeIds()) {
+    const Vector &Costs = getNodeCosts(NId);
+    assert(Costs.getLength() != 0 && "Empty vector in graph.");
+    OS << PrintNodeInfo(NId, *this) << ": " << Costs << '\n';
+  }
+  OS << '\n';
+
+  for (auto EId : edgeIds()) {
+    NodeId N1Id = getEdgeNode1Id(EId);
+    NodeId N2Id = getEdgeNode2Id(EId);
+    assert(N1Id != N2Id && "PBQP graphs should not have self-edges.");
+    const Matrix &M = getEdgeCosts(EId);
+    assert(M.getRows() != 0 && "No rows in matrix.");
+    assert(M.getCols() != 0 && "No cols in matrix.");
+    OS << PrintNodeInfo(N1Id, *this) << ' ' << M.getRows() << " rows / ";
+    OS << PrintNodeInfo(N2Id, *this) << ' ' << M.getCols() << " cols:\n";
+    OS << M << '\n';
+  }
+}
+
+void PBQP::RegAlloc::PBQPRAGraph::dump() const { dump(dbgs()); }
+
+void PBQP::RegAlloc::PBQPRAGraph::printDot(raw_ostream &OS) const {
+  OS << "graph {\n";
+  for (auto NId : nodeIds()) {
+    OS << "  node" << NId << " [ label=\""
+       << PrintNodeInfo(NId, *this) << "\\n"
+       << getNodeCosts(NId) << "\" ]\n";
+  }
+
+  OS << "  edge [ len=" << nodeIds().size() << " ]\n";
+  for (auto EId : edgeIds()) {
+    OS << "  node" << getEdgeNode1Id(EId)
+       << " -- node" << getEdgeNode2Id(EId)
+       << " [ label=\"";
+    const Matrix &EdgeCosts = getEdgeCosts(EId);
+    for (unsigned i = 0; i < EdgeCosts.getRows(); ++i) {
+      OS << EdgeCosts.getRowAsVector(i) << "\\n";
+    }
+    OS << "\" ]\n";
+  }
+  OS << "}\n";
+}
+
 FunctionPass *llvm::createPBQPRegisterAllocator(char *customPassID) {
   return new RegAllocPBQP(customPassID);
 }
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
index e0d1aa2..ab33672 100644
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -47,6 +47,7 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
   }
 
   // Does this MF have different CSRs?
+  assert(TRI && "no register info set");
   const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
   if (Update || CSR != CalleeSaved) {
     // Build a CSRNum map. Every CSR alias gets an entry pointing to the last
@@ -76,6 +77,7 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
 /// registers filtered out. Volatile registers come first followed by CSR
 /// aliases ordered according to the CSR order specified by the target.
 void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
+  assert(RC && "no register class given");
   RCInfo &RCI = RegClass[RC->getID()];
 
   // Raw register count, including all reserved regs.
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 2d2dc92..1e4cfe8 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
@@ -57,12 +58,12 @@ EnableJoining("join-liveintervals",
               cl::desc("Coalesce copies (default=true)"),
               cl::init(true));
 
-// Temporary flag to test critical edge unsplitting.
+/// Temporary flag to test critical edge unsplitting.
 static cl::opt<bool>
 EnableJoinSplits("join-splitedges",
   cl::desc("Coalesce copies on split edges (default=subtarget)"), cl::Hidden);
 
-// Temporary flag to test global copy optimization.
+/// Temporary flag to test global copy optimization.
 static cl::opt<cl::boolOrDefault>
 EnableGlobalCopies("join-globalcopies",
   cl::desc("Coalesce copies that span blocks (default=subtarget)"),
@@ -86,6 +87,14 @@ namespace {
     AliasAnalysis *AA;
     RegisterClassInfo RegClassInfo;
 
+    /// A LaneMask to remember on which subregister live ranges we need to call
+    /// shrinkToUses() later.
+    unsigned ShrinkMask;
+
+    /// True if the main range of the currently coalesced intervals should be
+    /// checked for smaller live intervals.
+    bool ShrinkMainRange;
+
     /// \brief True if the coalescer should aggressively coalesce global copies
     /// in favor of keeping local copies.
     bool JoinGlobalCopies;
@@ -111,7 +120,7 @@ namespace {
     /// Recursively eliminate dead defs in DeadDefs.
     void eliminateDeadDefs();
 
-    /// LiveRangeEdit callback.
+    /// LiveRangeEdit callback for eliminateDeadDefs().
     void LRE_WillEraseInstruction(MachineInstr *MI) override;
 
     /// Coalesce the LocalWorkList.
@@ -124,16 +133,15 @@ namespace {
     /// copies that cannot yet be coalesced into WorkList.
     void copyCoalesceInMBB(MachineBasicBlock *MBB);
 
-    /// Try to coalesce all copies in CurrList. Return
-    /// true if any progress was made.
+    /// Tries to coalesce all copies in CurrList. Returns true if any progress
+    /// was made.
     bool copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList);
 
-    /// Attempt to join intervals corresponding to SrcReg/DstReg,
-    /// which are the src/dst of the copy instruction CopyMI.  This returns
-    /// true if the copy was successfully coalesced away. If it is not
-    /// currently possible to coalesce this interval, but it may be possible if
-    /// other things get coalesced, then it returns true by reference in
-    /// 'Again'.
+    /// Attempt to join intervals corresponding to SrcReg/DstReg, which are the
+    /// src/dst of the copy instruction CopyMI.  This returns true if the copy
+    /// was successfully coalesced away. If it is not currently possible to
+    /// coalesce this interval, but it may be possible if other things get
+    /// coalesced, then it returns true by reference in 'Again'.
     bool joinCopy(MachineInstr *TheCopy, bool &Again);
 
     /// Attempt to join these two intervals.  On failure, this
@@ -147,10 +155,23 @@ namespace {
     /// Attempt joining with a reserved physreg.
     bool joinReservedPhysReg(CoalescerPair &CP);
 
-    /// We found a non-trivially-coalescable copy. If
-    /// the source value number is defined by a copy from the destination reg
-    /// see if we can merge these two destination reg valno# into a single
-    /// value number, eliminating a copy.
+    /// Add the LiveRange @p ToMerge as a subregister liverange of @p LI.
+    /// Subranges in @p LI which only partially interfere with the desired
+    /// LaneMask are split as necessary. @p LaneMask are the lanes that
+    /// @p ToMerge will occupy in the coalescer register. @p LI has its subrange
+    /// lanemasks already adjusted to the coalesced register.
+    void mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge,
+                           unsigned LaneMask, CoalescerPair &CP);
+
+    /// Join the liveranges of two subregisters. Joins @p RRange into
+    /// @p LRange, @p RRange may be invalid afterwards.
+    void joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
+                          unsigned LaneMask, const CoalescerPair &CP);
+
+    /// We found a non-trivially-coalescable copy. If the source value number is
+    /// defined by a copy from the destination reg see if we can merge these two
+    /// destination reg valno# into a single value number, eliminating a copy.
+    /// This returns true if an interval was modified.
     bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
 
     /// Return true if there are definitions of IntB
@@ -162,6 +183,7 @@ namespace {
     /// If the source value number is defined by a commutable instruction and
     /// its other operand is coalesced to the copy dest register, see if we
     /// can transform the copy into a noop by commuting the definition.
+    /// This returns true if an interval was modified.
     bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
 
     /// If the source of a copy is defined by a
@@ -169,21 +191,21 @@ namespace {
     bool reMaterializeTrivialDef(CoalescerPair &CP, MachineInstr *CopyMI,
                                  bool &IsDefCopy);
 
-    /// Return true if a physreg copy should be joined.
+    /// Return true if a copy involving a physreg should be joined.
     bool canJoinPhys(const CoalescerPair &CP);
 
-    /// Replace all defs and uses of SrcReg to DstReg and
-    /// update the subregister number if it is not zero. If DstReg is a
-    /// physical register and the existing subregister number of the def / use
-    /// being updated is not zero, make sure to set it to the correct physical
-    /// subregister.
+    /// Replace all defs and uses of SrcReg to DstReg and update the subregister
+    /// number if it is not zero. If DstReg is a physical register and the
+    /// existing subregister number of the def / use being updated is not zero,
+    /// make sure to set it to the correct physical subregister.
     void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx);
 
     /// Handle copies of undef values.
-    bool eliminateUndefCopy(MachineInstr *CopyMI, const CoalescerPair &CP);
+    /// Returns true if @p CopyMI was a copy of an undef value and eliminated.
+    bool eliminateUndefCopy(MachineInstr *CopyMI);
 
   public:
-    static char ID; // Class identification, replacement for typeinfo
+    static char ID; ///< Class identification, replacement for typeinfo
     RegisterCoalescer() : MachineFunctionPass(ID) {
       initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
     }
@@ -198,7 +220,7 @@ namespace {
     /// Implement the dump method.
     void print(raw_ostream &O, const Module* = nullptr) const override;
   };
-} /// end anonymous namespace
+} // end anonymous namespace
 
 char &llvm::RegisterCoalescerID = RegisterCoalescer::ID;
 
@@ -232,11 +254,11 @@ static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI,
   return true;
 }
 
-// Return true if this block should be vacated by the coalescer to eliminate
-// branches. The important cases to handle in the coalescer are critical edges
-// split during phi elimination which contain only copies. Simple blocks that
-// contain non-branches should also be vacated, but this can be handled by an
-// earlier pass similar to early if-conversion.
+/// Return true if this block should be vacated by the coalescer to eliminate
+/// branches. The important cases to handle in the coalescer are critical edges
+/// split during phi elimination which contain only copies. Simple blocks that
+/// contain non-branches should also be vacated, but this can be handled by an
+/// earlier pass similar to early if-conversion.
 static bool isSplitEdge(const MachineBasicBlock *MBB) {
   if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
     return false;
@@ -401,27 +423,11 @@ void RegisterCoalescer::eliminateDeadDefs() {
                 nullptr, this).eliminateDeadDefs(DeadDefs);
 }
 
-// Callback from eliminateDeadDefs().
 void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
   // MI may be in WorkList. Make sure we don't visit it.
   ErasedInstrs.insert(MI);
 }
 
-/// We found a non-trivially-coalescable copy with IntA
-/// being the source and IntB being the dest, thus this defines a value number
-/// in IntB.  If the source value number (in IntA) is defined by a copy from B,
-/// see if we can merge these two pieces of B into a single value number,
-/// eliminating a copy.  For example:
-///
-///  A3 = B0
-///    ...
-///  B1 = A3      <- this copy
-///
-/// In this case, B0 can be extended to where the B1 copy lives, allowing the B1
-/// value number to be replaced with B0 (which simplifies the B liveinterval).
-///
-/// This returns true if an interval was modified.
-///
 bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
                                              MachineInstr *CopyMI) {
   assert(!CP.isPartial() && "This doesn't work for partial copies.");
@@ -433,6 +439,20 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
     LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
   SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
 
+  // We have a non-trivially-coalescable copy with IntA being the source and
+  // IntB being the dest, thus this defines a value number in IntB.  If the
+  // source value number (in IntA) is defined by a copy from B, see if we can
+  // merge these two pieces of B into a single value number, eliminating a copy.
+  // For example:
+  //
+  //  A3 = B0
+  //    ...
+  //  B1 = A3      <- this copy
+  //
+  // In this case, B0 can be extended to where the B1 copy lives, allowing the
+  // B1 value number to be replaced with B0 (which simplifies the B
+  // liveinterval).
+
   // BValNo is a value number in B that is defined by a copy from A.  'B1' in
   // the example above.
   LiveInterval::iterator BS = IntB.FindSegmentContaining(CopyIdx);
@@ -492,6 +512,16 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
   // Okay, merge "B1" into the same value number as "B0".
   if (BValNo != ValS->valno)
     IntB.MergeValueNumberInto(BValNo, ValS->valno);
+
+  // Do the same for the subregister segments.
+  for (LiveInterval::SubRange &S : IntB.subranges()) {
+    VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
+    S.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, SubBValNo));
+    VNInfo *SubValSNo = S.getVNInfoAt(AValNo->def.getPrevSlot());
+    if (SubBValNo != SubValSNo)
+      S.MergeValueNumberInto(SubBValNo, SubValSNo);
+  }
+
   DEBUG(dbgs() << "   result = " << IntB << '\n');
 
   // If the source instruction was killing the source register before the
@@ -512,8 +542,6 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
   return true;
 }
 
-/// Return true if there are definitions of IntB
-/// other than BValNo val# that can reach uses of AValno val# of IntA.
 bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
                                              LiveInterval &IntB,
                                              VNInfo *AValNo,
@@ -523,69 +551,75 @@ bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
   if (LIS->hasPHIKill(IntA, AValNo))
     return true;
 
-  for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
-       AI != AE; ++AI) {
-    if (AI->valno != AValNo) continue;
+  for (LiveRange::Segment &ASeg : IntA.segments) {
+    if (ASeg.valno != AValNo) continue;
     LiveInterval::iterator BI =
-      std::upper_bound(IntB.begin(), IntB.end(), AI->start);
+      std::upper_bound(IntB.begin(), IntB.end(), ASeg.start);
     if (BI != IntB.begin())
       --BI;
-    for (; BI != IntB.end() && AI->end >= BI->start; ++BI) {
+    for (; BI != IntB.end() && ASeg.end >= BI->start; ++BI) {
       if (BI->valno == BValNo)
         continue;
-      if (BI->start <= AI->start && BI->end > AI->start)
+      if (BI->start <= ASeg.start && BI->end > ASeg.start)
         return true;
-      if (BI->start > AI->start && BI->start < AI->end)
+      if (BI->start > ASeg.start && BI->start < ASeg.end)
         return true;
     }
   }
   return false;
 }
 
-/// We found a non-trivially-coalescable copy with
-/// IntA being the source and IntB being the dest, thus this defines a value
-/// number in IntB.  If the source value number (in IntA) is defined by a
-/// commutable instruction and its other operand is coalesced to the copy dest
-/// register, see if we can transform the copy into a noop by commuting the
-/// definition. For example,
-///
-///  A3 = op A2 B0<kill>
-///    ...
-///  B1 = A3      <- this copy
-///    ...
-///     = op A3   <- more uses
-///
-/// ==>
-///
-///  B2 = op B0 A2<kill>
-///    ...
-///  B1 = B2      <- now an identify copy
-///    ...
-///     = op B2   <- more uses
-///
-/// This returns true if an interval was modified.
-///
+/// Copy segements with value number @p SrcValNo from liverange @p Src to live
+/// range @Dst and use value number @p DstValNo there.
+static void addSegmentsWithValNo(LiveRange &Dst, VNInfo *DstValNo,
+                                 const LiveRange &Src, const VNInfo *SrcValNo)
+{
+  for (const LiveRange::Segment &S : Src.segments) {
+    if (S.valno != SrcValNo)
+      continue;
+    Dst.addSegment(LiveRange::Segment(S.start, S.end, DstValNo));
+  }
+}
+
 bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
                                                  MachineInstr *CopyMI) {
-  assert (!CP.isPhys());
-
-  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
+  assert(!CP.isPhys());
 
   LiveInterval &IntA =
-    LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+      LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
   LiveInterval &IntB =
-    LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+      LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+
+  // We found a non-trivially-coalescable copy with IntA being the source and
+  // IntB being the dest, thus this defines a value number in IntB.  If the
+  // source value number (in IntA) is defined by a commutable instruction and
+  // its other operand is coalesced to the copy dest register, see if we can
+  // transform the copy into a noop by commuting the definition. For example,
+  //
+  //  A3 = op A2 B0<kill>
+  //    ...
+  //  B1 = A3      <- this copy
+  //    ...
+  //     = op A3   <- more uses
+  //
+  // ==>
+  //
+  //  B2 = op B0 A2<kill>
+  //    ...
+  //  B1 = B2      <- now an identity copy
+  //    ...
+  //     = op B2   <- more uses
 
   // BValNo is a value number in B that is defined by a copy from A. 'B1' in
   // the example above.
+  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
   VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx);
-  if (!BValNo || BValNo->def != CopyIdx)
-    return false;
+  assert(BValNo != nullptr && BValNo->def == CopyIdx);
 
   // AValNo is the value number in A that defines the copy, A3 in the example.
   VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true));
-  assert(AValNo && "COPY source not live");
-  if (AValNo->isPHIDef() || AValNo->isUnused())
+  assert(AValNo && !AValNo->isUnused() && "COPY source not live");
+  if (AValNo->isPHIDef())
     return false;
   MachineInstr *DefMI = LIS->getInstructionFromIndex(AValNo->def);
   if (!DefMI)
@@ -652,8 +686,6 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
     MBB->insert(Pos, NewMI);
     MBB->erase(DefMI);
   }
-  unsigned OpIdx = NewMI->findRegisterUseOperandIdx(IntA.reg, false);
-  NewMI->getOperand(OpIdx).setIsKill();
 
   // If ALR and BLR overlaps and end of BLR extends beyond end of ALR, e.g.
   // A = or A, B
@@ -666,10 +698,13 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
 
   // Update uses of IntA of the specific Val# with IntB.
   for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(IntA.reg),
-         UE = MRI->use_end(); UI != UE;) {
+                                         UE = MRI->use_end();
+       UI != UE; /* ++UI is below because of possible MI removal */) {
     MachineOperand &UseMO = *UI;
-    MachineInstr *UseMI = UseMO.getParent();
     ++UI;
+    if (UseMO.isUndef())
+      continue;
+    MachineInstr *UseMI = UseMO.getParent();
     if (UseMI->isDebugValue()) {
       // FIXME These don't have an instruction index.  Not clear we have enough
       // info to decide whether to do this replacement or not.  For now do it.
@@ -678,7 +713,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
     }
     SlotIndex UseIdx = LIS->getInstructionIndex(UseMI).getRegSlot(true);
     LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
-    if (US == IntA.end() || US->valno != AValNo)
+    assert(US != IntA.end() && "Use must be live");
+    if (US->valno != AValNo)
       continue;
     // Kill flags are no longer accurate. They are recomputed after RA.
     UseMO.setIsKill(false);
@@ -702,7 +738,16 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
       continue;
     DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI);
     assert(DVNI->def == DefIdx);
-    BValNo = IntB.MergeValueNumberInto(BValNo, DVNI);
+    BValNo = IntB.MergeValueNumberInto(DVNI, BValNo);
+    for (LiveInterval::SubRange &S : IntB.subranges()) {
+      VNInfo *SubDVNI = S.getVNInfoAt(DefIdx);
+      if (!SubDVNI)
+        continue;
+      VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
+      assert(SubBValNo->def == CopyIdx);
+      S.MergeValueNumberInto(SubDVNI, SubBValNo);
+    }
+
     ErasedInstrs.insert(UseMI);
     LIS->RemoveMachineInstrFromMaps(UseMI);
     UseMI->eraseFromParent();
@@ -710,23 +755,82 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
 
   // Extend BValNo by merging in IntA live segments of AValNo. Val# definition
   // is updated.
-  VNInfo *ValNo = BValNo;
-  ValNo->def = AValNo->def;
-  for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
-       AI != AE; ++AI) {
-    if (AI->valno != AValNo) continue;
-    IntB.addSegment(LiveInterval::Segment(AI->start, AI->end, ValNo));
+  BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+  if (IntB.hasSubRanges()) {
+    if (!IntA.hasSubRanges()) {
+      unsigned Mask = MRI->getMaxLaneMaskForVReg(IntA.reg);
+      IntA.createSubRangeFrom(Allocator, Mask, IntA);
+    }
+    SlotIndex AIdx = CopyIdx.getRegSlot(true);
+    for (LiveInterval::SubRange &SA : IntA.subranges()) {
+      VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);
+      assert(ASubValNo != nullptr);
+
+      unsigned AMask = SA.LaneMask;
+      for (LiveInterval::SubRange &SB : IntB.subranges()) {
+        unsigned BMask = SB.LaneMask;
+        unsigned Common = BMask & AMask;
+        if (Common == 0)
+          continue;
+
+        DEBUG(
+            dbgs() << format("\t\tCopy+Merge %04X into %04X\n", BMask, Common));
+        unsigned BRest = BMask & ~AMask;
+        LiveInterval::SubRange *CommonRange;
+        if (BRest != 0) {
+          SB.LaneMask = BRest;
+          DEBUG(dbgs() << format("\t\tReduce Lane to %04X\n", BRest));
+          // Duplicate SubRange for newly merged common stuff.
+          CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB);
+        } else {
+          // We van reuse the L SubRange.
+          SB.LaneMask = Common;
+          CommonRange = &SB;
+        }
+        LiveRange RangeCopy(SB, Allocator);
+
+        VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx);
+        assert(BSubValNo->def == CopyIdx);
+        BSubValNo->def = ASubValNo->def;
+        addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo);
+        AMask &= ~BMask;
+      }
+      if (AMask != 0) {
+        DEBUG(dbgs() << format("\t\tNew Lane %04X\n", AMask));
+        LiveRange *NewRange = IntB.createSubRange(Allocator, AMask);
+        VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator);
+        addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo);
+      }
+    }
   }
+
+  BValNo->def = AValNo->def;
+  addSegmentsWithValNo(IntB, BValNo, IntA, AValNo);
   DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
 
-  IntA.removeValNo(AValNo);
+  LIS->removeVRegDefAt(IntA, AValNo->def);
+
   DEBUG(dbgs() << "\t\ttrimmed:  " << IntA << '\n');
   ++numCommutes;
   return true;
 }
 
-/// If the source of a copy is defined by a trivial
-/// computation, replace the copy by rematerialize the definition.
+/// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
+/// defining a subregister.
+static bool definesFullReg(const MachineInstr &MI, unsigned Reg) {
+  assert(!TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         "This code cannot handle physreg aliasing");
+  for (const MachineOperand &Op : MI.operands()) {
+    if (!Op.isReg() || !Op.isDef() || Op.getReg() != Reg)
+      continue;
+    // Return true if we define the full register or don't care about the value
+    // inside other subregisters.
+    if (Op.getSubReg() == 0 || Op.isUndef())
+      return true;
+  }
+  return false;
+}
+
 bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
                                                 MachineInstr *CopyMI,
                                                 bool &IsDefCopy) {
@@ -755,6 +859,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
     return false;
   if (!TII->isTriviallyReMaterializable(DefMI, AA))
     return false;
+  if (!definesFullReg(*DefMI, SrcReg))
+    return false;
   bool SawStore = false;
   if (!DefMI->isSafeToMove(TII, AA, SawStore))
     return false;
@@ -825,12 +931,13 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
     const TargetRegisterClass *NewRC = CP.getNewRC();
     unsigned NewIdx = NewMI->getOperand(0).getSubReg();
 
-    if (NewIdx)
-      NewRC = TRI->getMatchingSuperRegClass(NewRC, DefRC, NewIdx);
-    else
-      NewRC = TRI->getCommonSubClass(NewRC, DefRC);
-
-    assert(NewRC && "subreg chosen for remat incompatible with instruction");
+    if (DefRC != nullptr) {
+      if (NewIdx)
+        NewRC = TRI->getMatchingSuperRegClass(NewRC, DefRC, NewIdx);
+      else
+        NewRC = TRI->getCommonSubClass(NewRC, DefRC);
+      assert(NewRC && "subreg chosen for remat incompatible with instruction");
+    }
     MRI->setRegClass(DstReg, NewRC);
 
     updateRegDefsUses(DstReg, DstReg, DstIdx);
@@ -898,56 +1005,103 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
 
   // The source interval can become smaller because we removed a use.
   LIS->shrinkToUses(&SrcInt, &DeadDefs);
-  if (!DeadDefs.empty())
+  if (!DeadDefs.empty()) {
+    // If the virtual SrcReg is completely eliminated, update all DBG_VALUEs
+    // to describe DstReg instead.
+    for (MachineOperand &UseMO : MRI->use_operands(SrcReg)) {
+      MachineInstr *UseMI = UseMO.getParent();
+      if (UseMI->isDebugValue()) {
+        UseMO.setReg(DstReg);
+        DEBUG(dbgs() << "\t\tupdated: " << *UseMI);
+      }
+    }
     eliminateDeadDefs();
+  }
 
   return true;
 }
 
-/// ProcessImpicitDefs may leave some copies of <undef>
-/// values, it only removes local variables. When we have a copy like:
-///
-///   %vreg1 = COPY %vreg2<undef>
-///
-/// We delete the copy and remove the corresponding value number from %vreg1.
-/// Any uses of that value number are marked as <undef>.
-bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI,
-                                           const CoalescerPair &CP) {
+bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
+  // ProcessImpicitDefs may leave some copies of <undef> values, it only removes
+  // local variables. When we have a copy like:
+  //
+  //   %vreg1 = COPY %vreg2<undef>
+  //
+  // We delete the copy and remove the corresponding value number from %vreg1.
+  // Any uses of that value number are marked as <undef>.
+
+  // Note that we do not query CoalescerPair here but redo isMoveInstr as the
+  // CoalescerPair may have a new register class with adjusted subreg indices
+  // at this point.
+  unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+  isMoveInstr(*TRI, CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx);
+
   SlotIndex Idx = LIS->getInstructionIndex(CopyMI);
-  LiveInterval *SrcInt = &LIS->getInterval(CP.getSrcReg());
-  if (SrcInt->liveAt(Idx))
-    return false;
-  LiveInterval *DstInt = &LIS->getInterval(CP.getDstReg());
-  if (DstInt->liveAt(Idx))
+  const LiveInterval &SrcLI = LIS->getInterval(SrcReg);
+  // CopyMI is undef iff SrcReg is not live before the instruction.
+  if (SrcSubIdx != 0 && SrcLI.hasSubRanges()) {
+    unsigned SrcMask = TRI->getSubRegIndexLaneMask(SrcSubIdx);
+    for (const LiveInterval::SubRange &SR : SrcLI.subranges()) {
+      if ((SR.LaneMask & SrcMask) == 0)
+        continue;
+      if (SR.liveAt(Idx))
+        return false;
+    }
+  } else if (SrcLI.liveAt(Idx))
     return false;
 
-  // No intervals are live-in to CopyMI - it is undef.
-  if (CP.isFlipped())
-    DstInt = SrcInt;
-  SrcInt = nullptr;
+  DEBUG(dbgs() << "\tEliminating copy of <undef> value\n");
 
-  VNInfo *DeadVNI = DstInt->getVNInfoAt(Idx.getRegSlot());
-  assert(DeadVNI && "No value defined in DstInt");
-  DstInt->removeValNo(DeadVNI);
+  // Remove any DstReg segments starting at the instruction.
+  LiveInterval &DstLI = LIS->getInterval(DstReg);
+  SlotIndex RegIndex = Idx.getRegSlot();
+  // Remove value or merge with previous one in case of a subregister def.
+  if (VNInfo *PrevVNI = DstLI.getVNInfoAt(Idx)) {
+    VNInfo *VNI = DstLI.getVNInfoAt(RegIndex);
+    DstLI.MergeValueNumberInto(VNI, PrevVNI);
 
-  // Find new undef uses.
-  for (MachineOperand &MO : MRI->reg_nodbg_operands(DstInt->reg)) {
-    if (MO.isDef() || MO.isUndef())
+    // The affected subregister segments can be removed.
+    unsigned DstMask = TRI->getSubRegIndexLaneMask(DstSubIdx);
+    for (LiveInterval::SubRange &SR : DstLI.subranges()) {
+      if ((SR.LaneMask & DstMask) == 0)
+        continue;
+
+      VNInfo *SVNI = SR.getVNInfoAt(RegIndex);
+      assert(SVNI != nullptr && SlotIndex::isSameInstr(SVNI->def, RegIndex));
+      SR.removeValNo(SVNI);
+    }
+    DstLI.removeEmptySubRanges();
+  } else
+    LIS->removeVRegDefAt(DstLI, RegIndex);
+
+  // Mark uses as undef.
+  for (MachineOperand &MO : MRI->reg_nodbg_operands(DstReg)) {
+    if (MO.isDef() /*|| MO.isUndef()*/)
       continue;
-    MachineInstr *MI = MO.getParent();
-    SlotIndex Idx = LIS->getInstructionIndex(MI);
-    if (DstInt->liveAt(Idx))
+    const MachineInstr &MI = *MO.getParent();
+    SlotIndex UseIdx = LIS->getInstructionIndex(&MI);
+    unsigned UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
+    bool isLive;
+    if (UseMask != ~0u && DstLI.hasSubRanges()) {
+      isLive = false;
+      for (const LiveInterval::SubRange &SR : DstLI.subranges()) {
+        if ((SR.LaneMask & UseMask) == 0)
+          continue;
+        if (SR.liveAt(UseIdx)) {
+          isLive = true;
+          break;
+        }
+      }
+    } else
+      isLive = DstLI.liveAt(UseIdx);
+    if (isLive)
       continue;
     MO.setIsUndef(true);
-    DEBUG(dbgs() << "\tnew undef: " << Idx << '\t' << *MI);
+    DEBUG(dbgs() << "\tnew undef: " << UseIdx << '\t' << MI);
   }
   return true;
 }
 
-/// Replace all defs and uses of SrcReg to DstReg and update the subregister
-/// number if it is not zero. If DstReg is a physical register and the existing
-/// subregister number of the def / use being updated is not zero, make sure to
-/// set it to the correct physical subregister.
 void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
                                           unsigned DstReg,
                                           unsigned SubIdx) {
@@ -987,6 +1141,40 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
       if (SubIdx && MO.isDef())
         MO.setIsUndef(!Reads);
 
+      // A subreg use of a partially undef (super) register may be a complete
+      // undef use now and then has to be marked that way.
+      if (SubIdx != 0 && MO.isUse() && MRI->tracksSubRegLiveness()) {
+        if (!DstInt->hasSubRanges()) {
+          BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+          unsigned Mask = MRI->getMaxLaneMaskForVReg(DstInt->reg);
+          DstInt->createSubRangeFrom(Allocator, Mask, *DstInt);
+        }
+        unsigned Mask = TRI->getSubRegIndexLaneMask(SubIdx);
+        bool IsUndef = true;
+        SlotIndex MIIdx = UseMI->isDebugValue()
+          ? LIS->getSlotIndexes()->getIndexBefore(UseMI)
+          : LIS->getInstructionIndex(UseMI);
+        SlotIndex UseIdx = MIIdx.getRegSlot(true);
+        for (LiveInterval::SubRange &S : DstInt->subranges()) {
+          if ((S.LaneMask & Mask) == 0)
+            continue;
+          if (S.liveAt(UseIdx)) {
+            IsUndef = false;
+            break;
+          }
+        }
+        if (IsUndef) {
+          MO.setIsUndef(true);
+          // We found out some subregister use is actually reading an undefined
+          // value. In some cases the whole vreg has become undefined at this
+          // point so we have to potentially shrink the main range if the
+          // use was ending a live segment there.
+          LiveQueryResult Q = DstInt->Query(MIIdx);
+          if (Q.valueOut() == nullptr)
+            ShrinkMainRange = true;
+        }
+      }
+
       if (DstIsPhys)
         MO.substPhysReg(DstReg, *TRI);
       else
@@ -1002,29 +1190,23 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
   }
 }
 
-/// Return true if a copy involving a physreg should be joined.
 bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) {
-  /// Always join simple intervals that are defined by a single copy from a
-  /// reserved register. This doesn't increase register pressure, so it is
-  /// always beneficial.
+  // Always join simple intervals that are defined by a single copy from a
+  // reserved register. This doesn't increase register pressure, so it is
+  // always beneficial.
   if (!MRI->isReserved(CP.getDstReg())) {
     DEBUG(dbgs() << "\tCan only merge into reserved registers.\n");
     return false;
   }
 
   LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg());
-  if (CP.isFlipped() && JoinVInt.containsOneValue())
+  if (JoinVInt.containsOneValue())
     return true;
 
-  DEBUG(dbgs() << "\tCannot join defs into reserved register.\n");
+  DEBUG(dbgs() << "\tCannot join complex intervals into reserved register.\n");
   return false;
 }
 
-/// Attempt to join intervals corresponding to SrcReg/DstReg,
-/// which are the src/dst of the copy instruction CopyMI.  This returns true
-/// if the copy was successfully coalesced away. If it is not currently
-/// possible to coalesce this interval, but it may be possible if other
-/// things get coalesced, then it returns true by reference in 'Again'.
 bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
 
   Again = false;
@@ -1063,8 +1245,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
   }
 
   // Eliminate undefs.
-  if (!CP.isPhys() && eliminateUndefCopy(CopyMI, CP)) {
-    DEBUG(dbgs() << "\tEliminated copy of <undef> value.\n");
+  if (!CP.isPhys() && eliminateUndefCopy(CopyMI)) {
     LIS->RemoveMachineInstrFromMaps(CopyMI);
     CopyMI->eraseFromParent();
     return false;  // Not coalescable.
@@ -1076,12 +1257,22 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
   if (CP.getSrcReg() == CP.getDstReg()) {
     LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
     DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
-    LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(CopyMI));
+    const SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI);
+    LiveQueryResult LRQ = LI.Query(CopyIdx);
     if (VNInfo *DefVNI = LRQ.valueDefined()) {
       VNInfo *ReadVNI = LRQ.valueIn();
       assert(ReadVNI && "No value before copy and no <undef> flag.");
       assert(ReadVNI != DefVNI && "Cannot read and define the same value.");
       LI.MergeValueNumberInto(DefVNI, ReadVNI);
+
+      // Process subregister liveranges.
+      for (LiveInterval::SubRange &S : LI.subranges()) {
+        LiveQueryResult SLRQ = S.Query(CopyIdx);
+        if (VNInfo *SDefVNI = SLRQ.valueDefined()) {
+          VNInfo *SReadVNI = SLRQ.valueIn();
+          S.MergeValueNumberInto(SDefVNI, SReadVNI);
+        }
+      }
       DEBUG(dbgs() << "\tMerged values:          " << LI << '\n');
     }
     LIS->RemoveMachineInstrFromMaps(CopyMI);
@@ -1124,6 +1315,9 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
     });
   }
 
+  ShrinkMask = 0;
+  ShrinkMainRange = false;
+
   // Okay, attempt to join these two intervals.  On failure, this returns false.
   // Otherwise, if one of the intervals being joined is a physreg, this method
   // always canonicalizes DstInt to be it.  The output "SrcInt" will not have
@@ -1178,12 +1372,28 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
     updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
   updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
 
+  // Shrink subregister ranges if necessary.
+  if (ShrinkMask != 0) {
+    LiveInterval &LI = LIS->getInterval(CP.getDstReg());
+    for (LiveInterval::SubRange &S : LI.subranges()) {
+      if ((S.LaneMask & ShrinkMask) == 0)
+        continue;
+      DEBUG(dbgs() << "Shrink LaneUses (Lane "
+                   << format("%04X", S.LaneMask) << ")\n");
+      LIS->shrinkToUses(S, LI.reg);
+    }
+  }
+  if (ShrinkMainRange) {
+    LiveInterval &LI = LIS->getInterval(CP.getDstReg());
+    LIS->shrinkToUses(&LI);
+  }
+
   // SrcReg is guaranteed to be the register whose live interval that is
   // being merged.
   LIS->removeInterval(CP.getSrcReg());
 
   // Update regalloc hint.
-  TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
+  TRI->updateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
 
   DEBUG({
     dbgs() << "\tSuccess: " << PrintReg(CP.getSrcReg(), TRI, CP.getSrcIdx())
@@ -1200,24 +1410,23 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
   return true;
 }
 
-/// Attempt joining with a reserved physreg.
 bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
+  unsigned DstReg = CP.getDstReg();
   assert(CP.isPhys() && "Must be a physreg copy");
-  assert(MRI->isReserved(CP.getDstReg()) && "Not a reserved register");
+  assert(MRI->isReserved(DstReg) && "Not a reserved register");
   LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
   DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');
 
-  assert(CP.isFlipped() && RHS.containsOneValue() &&
-         "Invalid join with reserved register");
+  assert(RHS.containsOneValue() && "Invalid join with reserved register");
 
   // Optimization for reserved registers like ESP. We can only merge with a
-  // reserved physreg if RHS has a single value that is a copy of CP.DstReg().
+  // reserved physreg if RHS has a single value that is a copy of DstReg.
   // The live range of the reserved register will look like a set of dead defs
   // - we don't properly track the live range of reserved registers.
 
   // Deny any overlapping intervals.  This depends on all the reserved
   // register live ranges to look like dead defs.
-  for (MCRegUnitIterator UI(CP.getDstReg(), TRI); UI.isValid(); ++UI)
+  for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI)
     if (RHS.overlaps(LIS->getRegUnit(*UI))) {
       DEBUG(dbgs() << "\t\tInterference: " << PrintRegUnit(*UI, TRI) << '\n');
       return false;
@@ -1229,7 +1438,46 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
   // defs are there.
 
   // Delete the identity copy.
-  MachineInstr *CopyMI = MRI->getVRegDef(RHS.reg);
+  MachineInstr *CopyMI;
+  if (CP.isFlipped()) {
+    CopyMI = MRI->getVRegDef(RHS.reg);
+  } else {
+    if (!MRI->hasOneNonDBGUse(RHS.reg)) {
+      DEBUG(dbgs() << "\t\tMultiple vreg uses!\n");
+      return false;
+    }
+
+    MachineInstr *DestMI = MRI->getVRegDef(RHS.reg);
+    CopyMI = &*MRI->use_instr_nodbg_begin(RHS.reg);
+    const SlotIndex CopyRegIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
+    const SlotIndex DestRegIdx = LIS->getInstructionIndex(DestMI).getRegSlot();
+
+    // We checked above that there are no interfering defs of the physical
+    // register. However, for this case, where we intent to move up the def of
+    // the physical register, we also need to check for interfering uses.
+    SlotIndexes *Indexes = LIS->getSlotIndexes();
+    for (SlotIndex SI = Indexes->getNextNonNullIndex(DestRegIdx);
+         SI != CopyRegIdx; SI = Indexes->getNextNonNullIndex(SI)) {
+      MachineInstr *MI = LIS->getInstructionFromIndex(SI);
+      if (MI->readsRegister(DstReg, TRI)) {
+        DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
+        return false;
+      }
+    }
+
+    // We're going to remove the copy which defines a physical reserved
+    // register, so remove its valno, etc.
+    DEBUG(dbgs() << "\t\tRemoving phys reg def of " << DstReg << " at "
+          << CopyRegIdx << "\n");
+
+    LIS->removePhysRegDefAt(DstReg, CopyRegIdx);
+    // Create a new dead def at the new def location.
+    for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI) {
+      LiveRange &LR = LIS->getRegUnit(*UI);
+      LR.createDeadDef(DestRegIdx, LIS->getVNInfoAllocator());
+    }
+  }
+
   LIS->RemoveMachineInstrFromMaps(CopyMI);
   CopyMI->eraseFromParent();
 
@@ -1306,15 +1554,29 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
 namespace {
 /// Track information about values in a single virtual register about to be
 /// joined. Objects of this class are always created in pairs - one for each
-/// side of the CoalescerPair.
+/// side of the CoalescerPair (or one for each lane of a side of the coalescer
+/// pair)
 class JoinVals {
-  LiveInterval &LI;
-
-  // Location of this register in the final joined register.
-  // Either CP.DstIdx or CP.SrcIdx.
-  unsigned SubIdx;
-
-  // Values that will be present in the final live range.
+  /// Live range we work on.
+  LiveRange &LR;
+  /// (Main) register we work on.
+  const unsigned Reg;
+
+  /// Reg (and therefore the values in this liverange) will end up as
+  /// subregister SubIdx in the coalesced register. Either CP.DstIdx or
+  /// CP.SrcIdx.
+  const unsigned SubIdx;
+  /// The LaneMask that this liverange will occupy the coalesced register. May
+  /// be smaller than the lanemask produced by SubIdx when merging subranges.
+  const unsigned LaneMask;
+
+  /// This is true when joining sub register ranges, false when joining main
+  /// ranges.
+  const bool SubRangeJoin;
+  /// Whether the current LiveInterval tracks subregister liveness.
+  const bool TrackSubRegLiveness;
+
+  /// Values that will be present in the final live range.
   SmallVectorImpl<VNInfo*> &NewVNInfo;
 
   const CoalescerPair &CP;
@@ -1322,75 +1584,75 @@ class JoinVals {
   SlotIndexes *Indexes;
   const TargetRegisterInfo *TRI;
 
-  // Value number assignments. Maps value numbers in LI to entries in NewVNInfo.
-  // This is suitable for passing to LiveInterval::join().
+  /// Value number assignments. Maps value numbers in LI to entries in
+  /// NewVNInfo. This is suitable for passing to LiveInterval::join().
   SmallVector<int, 8> Assignments;
 
-  // Conflict resolution for overlapping values.
+  /// Conflict resolution for overlapping values.
   enum ConflictResolution {
-    // No overlap, simply keep this value.
+    /// No overlap, simply keep this value.
     CR_Keep,
 
-    // Merge this value into OtherVNI and erase the defining instruction.
-    // Used for IMPLICIT_DEF, coalescable copies, and copies from external
-    // values.
+    /// Merge this value into OtherVNI and erase the defining instruction.
+    /// Used for IMPLICIT_DEF, coalescable copies, and copies from external
+    /// values.
     CR_Erase,
 
-    // Merge this value into OtherVNI but keep the defining instruction.
-    // This is for the special case where OtherVNI is defined by the same
-    // instruction.
+    /// Merge this value into OtherVNI but keep the defining instruction.
+    /// This is for the special case where OtherVNI is defined by the same
+    /// instruction.
     CR_Merge,
 
-    // Keep this value, and have it replace OtherVNI where possible. This
-    // complicates value mapping since OtherVNI maps to two different values
-    // before and after this def.
-    // Used when clobbering undefined or dead lanes.
+    /// Keep this value, and have it replace OtherVNI where possible. This
+    /// complicates value mapping since OtherVNI maps to two different values
+    /// before and after this def.
+    /// Used when clobbering undefined or dead lanes.
     CR_Replace,
 
-    // Unresolved conflict. Visit later when all values have been mapped.
+    /// Unresolved conflict. Visit later when all values have been mapped.
     CR_Unresolved,
 
-    // Unresolvable conflict. Abort the join.
+    /// Unresolvable conflict. Abort the join.
     CR_Impossible
   };
 
-  // Per-value info for LI. The lane bit masks are all relative to the final
-  // joined register, so they can be compared directly between SrcReg and
-  // DstReg.
+  /// Per-value info for LI. The lane bit masks are all relative to the final
+  /// joined register, so they can be compared directly between SrcReg and
+  /// DstReg.
   struct Val {
     ConflictResolution Resolution;
 
-    // Lanes written by this def, 0 for unanalyzed values.
+    /// Lanes written by this def, 0 for unanalyzed values.
     unsigned WriteLanes;
 
-    // Lanes with defined values in this register. Other lanes are undef and
-    // safe to clobber.
+    /// Lanes with defined values in this register. Other lanes are undef and
+    /// safe to clobber.
     unsigned ValidLanes;
 
-    // Value in LI being redefined by this def.
+    /// Value in LI being redefined by this def.
     VNInfo *RedefVNI;
 
-    // Value in the other live range that overlaps this def, if any.
+    /// Value in the other live range that overlaps this def, if any.
     VNInfo *OtherVNI;
 
-    // Is this value an IMPLICIT_DEF that can be erased?
-    //
-    // IMPLICIT_DEF values should only exist at the end of a basic block that
-    // is a predecessor to a phi-value. These IMPLICIT_DEF instructions can be
-    // safely erased if they are overlapping a live value in the other live
-    // interval.
-    //
-    // Weird control flow graphs and incomplete PHI handling in
-    // ProcessImplicitDefs can very rarely create IMPLICIT_DEF values with
-    // longer live ranges. Such IMPLICIT_DEF values should be treated like
-    // normal values.
+    /// Is this value an IMPLICIT_DEF that can be erased?
+    ///
+    /// IMPLICIT_DEF values should only exist at the end of a basic block that
+    /// is a predecessor to a phi-value. These IMPLICIT_DEF instructions can be
+    /// safely erased if they are overlapping a live value in the other live
+    /// interval.
+    ///
+    /// Weird control flow graphs and incomplete PHI handling in
+    /// ProcessImplicitDefs can very rarely create IMPLICIT_DEF values with
+    /// longer live ranges. Such IMPLICIT_DEF values should be treated like
+    /// normal values.
     bool ErasableImplicitDef;
 
-    // True when the live range of this value will be pruned because of an
-    // overlapping CR_Replace value in the other live range.
+    /// True when the live range of this value will be pruned because of an
+    /// overlapping CR_Replace value in the other live range.
     bool Pruned;
 
-    // True once Pruned above has been computed.
+    /// True once Pruned above has been computed.
     bool PrunedComputed;
 
     Val() : Resolution(CR_Keep), WriteLanes(0), ValidLanes(0),
@@ -1400,30 +1662,75 @@ class JoinVals {
     bool isAnalyzed() const { return WriteLanes != 0; }
   };
 
-  // One entry per value number in LI.
+  /// One entry per value number in LI.
   SmallVector<Val, 8> Vals;
 
-  unsigned computeWriteLanes(const MachineInstr *DefMI, bool &Redef);
-  VNInfo *stripCopies(VNInfo *VNI);
+  /// Compute the bitmask of lanes actually written by DefMI.
+  /// Set Redef if there are any partial register definitions that depend on the
+  /// previous value of the register.
+  unsigned computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const;
+
+  /// Find the ultimate value that VNI was copied from.
+  std::pair<const VNInfo*,unsigned> followCopyChain(const VNInfo *VNI) const;
+
+  bool valuesIdentical(VNInfo *Val0, VNInfo *Val1, const JoinVals &Other) const;
+
+  /// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
+  /// Return a conflict resolution when possible, but leave the hard cases as
+  /// CR_Unresolved.
+  /// Recursively calls computeAssignment() on this and Other, guaranteeing that
+  /// both OtherVNI and RedefVNI have been analyzed and mapped before returning.
+  /// The recursion always goes upwards in the dominator tree, making loops
+  /// impossible.
   ConflictResolution analyzeValue(unsigned ValNo, JoinVals &Other);
+
+  /// Compute the value assignment for ValNo in RI.
+  /// This may be called recursively by analyzeValue(), but never for a ValNo on
+  /// the stack.
   void computeAssignment(unsigned ValNo, JoinVals &Other);
+
+  /// Assuming ValNo is going to clobber some valid lanes in Other.LR, compute
+  /// the extent of the tainted lanes in the block.
+  ///
+  /// Multiple values in Other.LR can be affected since partial redefinitions
+  /// can preserve previously tainted lanes.
+  ///
+  ///   1 %dst = VLOAD           <-- Define all lanes in %dst
+  ///   2 %src = FOO             <-- ValNo to be joined with %dst:ssub0
+  ///   3 %dst:ssub1 = BAR       <-- Partial redef doesn't clear taint in ssub0
+  ///   4 %dst:ssub0 = COPY %src <-- Conflict resolved, ssub0 wasn't read
+  ///
+  /// For each ValNo in Other that is affected, add an (EndIndex, TaintedLanes)
+  /// entry to TaintedVals.
+  ///
+  /// Returns false if the tainted lanes extend beyond the basic block.
   bool taintExtent(unsigned, unsigned, JoinVals&,
                    SmallVectorImpl<std::pair<SlotIndex, unsigned> >&);
-  bool usesLanes(MachineInstr *MI, unsigned, unsigned, unsigned);
+
+  /// Return true if MI uses any of the given Lanes from Reg.
+  /// This does not include partial redefinitions of Reg.
+  bool usesLanes(const MachineInstr *MI, unsigned, unsigned, unsigned) const;
+
+  /// Determine if ValNo is a copy of a value number in LR or Other.LR that will
+  /// be pruned:
+  ///
+  ///   %dst = COPY %src
+  ///   %src = COPY %dst  <-- This value to be pruned.
+  ///   %dst = COPY %src  <-- This value is a copy of a pruned value.
   bool isPrunedValue(unsigned ValNo, JoinVals &Other);
 
 public:
-  JoinVals(LiveInterval &li, unsigned subIdx,
-           SmallVectorImpl<VNInfo*> &newVNInfo,
-           const CoalescerPair &cp,
-           LiveIntervals *lis,
-           const TargetRegisterInfo *tri)
-    : LI(li), SubIdx(subIdx), NewVNInfo(newVNInfo), CP(cp), LIS(lis),
-      Indexes(LIS->getSlotIndexes()), TRI(tri),
-      Assignments(LI.getNumValNums(), -1), Vals(LI.getNumValNums())
+  JoinVals(LiveRange &LR, unsigned Reg, unsigned SubIdx, unsigned LaneMask,
+           SmallVectorImpl<VNInfo*> &newVNInfo, const CoalescerPair &cp,
+           LiveIntervals *lis, const TargetRegisterInfo *TRI, bool SubRangeJoin,
+           bool TrackSubRegLiveness)
+    : LR(LR), Reg(Reg), SubIdx(SubIdx), LaneMask(LaneMask),
+      SubRangeJoin(SubRangeJoin), TrackSubRegLiveness(TrackSubRegLiveness),
+      NewVNInfo(newVNInfo), CP(cp), LIS(lis), Indexes(LIS->getSlotIndexes()),
+      TRI(TRI), Assignments(LR.getNumValNums(), -1), Vals(LR.getNumValNums())
   {}
 
-  /// Analyze defs in LI and compute a value mapping in NewVNInfo.
+  /// Analyze defs in LR and compute a value mapping in NewVNInfo.
   /// Returns false if any conflicts were impossible to resolve.
   bool mapValues(JoinVals &Other);
 
@@ -1431,10 +1738,16 @@ public:
   /// Returns false if any conflicts were impossible to resolve.
   bool resolveConflicts(JoinVals &Other);
 
-  /// Prune the live range of values in Other.LI where they would conflict with
-  /// CR_Replace values in LI. Collect end points for restoring the live range
+  /// Prune the live range of values in Other.LR where they would conflict with
+  /// CR_Replace values in LR. Collect end points for restoring the live range
   /// after joining.
-  void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints);
+  void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints,
+                   bool changeInstrs);
+
+  /// Removes subranges starting at copies that get removed. This sometimes
+  /// happens when undefined subranges are copied around. These ranges contain
+  /// no usefull information and can be removed.
+  void pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask);
 
   /// Erase any machine instructions that have been coalesced away.
   /// Add erased instructions to ErasedInstrs.
@@ -1448,13 +1761,11 @@ public:
 };
 } // end anonymous namespace
 
-/// Compute the bitmask of lanes actually written by DefMI.
-/// Set Redef if there are any partial register definitions that depend on the
-/// previous value of the register.
-unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) {
+unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef)
+  const {
   unsigned L = 0;
   for (ConstMIOperands MO(DefMI); MO.isValid(); ++MO) {
-    if (!MO->isReg() || MO->getReg() != LI.reg || !MO->isDef())
+    if (!MO->isReg() || MO->getReg() != Reg || !MO->isDef())
       continue;
     L |= TRI->getSubRegIndexLaneMask(
            TRI->composeSubRegIndices(SubIdx, MO->getSubReg()));
@@ -1464,36 +1775,71 @@ unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) {
   return L;
 }
 
-/// Find the ultimate value that VNI was copied from.
-VNInfo *JoinVals::stripCopies(VNInfo *VNI) {
+std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain(
+    const VNInfo *VNI) const {
+  unsigned Reg = this->Reg;
+
   while (!VNI->isPHIDef()) {
-    MachineInstr *MI = Indexes->getInstructionFromIndex(VNI->def);
+    SlotIndex Def = VNI->def;
+    MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
     assert(MI && "No defining instruction");
     if (!MI->isFullCopy())
+      return std::make_pair(VNI, Reg);
+    unsigned SrcReg = MI->getOperand(1).getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+      return std::make_pair(VNI, Reg);
+
+    const LiveInterval &LI = LIS->getInterval(SrcReg);
+    const VNInfo *ValueIn;
+    // No subrange involved.
+    if (!SubRangeJoin || !LI.hasSubRanges()) {
+      LiveQueryResult LRQ = LI.Query(Def);
+      ValueIn = LRQ.valueIn();
+    } else {
+      // Query subranges. Pick the first matching one.
+      ValueIn = nullptr;
+      for (const LiveInterval::SubRange &S : LI.subranges()) {
+        // Transform lanemask to a mask in the joined live interval.
+        unsigned SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask);
+        if ((SMask & LaneMask) == 0)
+          continue;
+        LiveQueryResult LRQ = S.Query(Def);
+        ValueIn = LRQ.valueIn();
+        break;
+      }
+    }
+    if (ValueIn == nullptr)
       break;
-    unsigned Reg = MI->getOperand(1).getReg();
-    if (!TargetRegisterInfo::isVirtualRegister(Reg))
-      break;
-    LiveQueryResult LRQ = LIS->getInterval(Reg).Query(VNI->def);
-    if (!LRQ.valueIn())
-      break;
-    VNI = LRQ.valueIn();
+    VNI = ValueIn;
+    Reg = SrcReg;
   }
-  return VNI;
+  return std::make_pair(VNI, Reg);
+}
+
+bool JoinVals::valuesIdentical(VNInfo *Value0, VNInfo *Value1,
+                               const JoinVals &Other) const {
+  const VNInfo *Orig0;
+  unsigned Reg0;
+  std::tie(Orig0, Reg0) = followCopyChain(Value0);
+  if (Orig0 == Value1)
+    return true;
+
+  const VNInfo *Orig1;
+  unsigned Reg1;
+  std::tie(Orig1, Reg1) = Other.followCopyChain(Value1);
+
+  // The values are equal if they are defined at the same place and use the
+  // same register. Note that we cannot compare VNInfos directly as some of
+  // them might be from a copy created in mergeSubRangeInto()  while the other
+  // is from the original LiveInterval.
+  return Orig0->def == Orig1->def && Reg0 == Reg1;
 }
 
-/// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
-/// Return a conflict resolution when possible, but leave the hard cases as
-/// CR_Unresolved.
-/// Recursively calls computeAssignment() on this and Other, guaranteeing that
-/// both OtherVNI and RedefVNI have been analyzed and mapped before returning.
-/// The recursion always goes upwards in the dominator tree, making loops
-/// impossible.
 JoinVals::ConflictResolution
 JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   Val &V = Vals[ValNo];
   assert(!V.isAnalyzed() && "Value has already been analyzed!");
-  VNInfo *VNI = LI.getValNumInfo(ValNo);
+  VNInfo *VNI = LR.getValNumInfo(ValNo);
   if (VNI->isUnused()) {
     V.WriteLanes = ~0u;
     return CR_Keep;
@@ -1503,46 +1849,56 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   const MachineInstr *DefMI = nullptr;
   if (VNI->isPHIDef()) {
     // Conservatively assume that all lanes in a PHI are valid.
-    V.ValidLanes = V.WriteLanes = TRI->getSubRegIndexLaneMask(SubIdx);
+    unsigned Lanes = SubRangeJoin ? 1 : TRI->getSubRegIndexLaneMask(SubIdx);
+    V.ValidLanes = V.WriteLanes = Lanes;
   } else {
     DefMI = Indexes->getInstructionFromIndex(VNI->def);
-    bool Redef = false;
-    V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef);
-
-    // If this is a read-modify-write instruction, there may be more valid
-    // lanes than the ones written by this instruction.
-    // This only covers partial redef operands. DefMI may have normal use
-    // operands reading the register. They don't contribute valid lanes.
-    //
-    // This adds ssub1 to the set of valid lanes in %src:
-    //
-    //   %src:ssub1<def> = FOO
-    //
-    // This leaves only ssub1 valid, making any other lanes undef:
-    //
-    //   %src:ssub1<def,read-undef> = FOO %src:ssub2
-    //
-    // The <read-undef> flag on the def operand means that old lane values are
-    // not important.
-    if (Redef) {
-      V.RedefVNI = LI.Query(VNI->def).valueIn();
-      assert(V.RedefVNI && "Instruction is reading nonexistent value");
-      computeAssignment(V.RedefVNI->id, Other);
-      V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
-    }
+    assert(DefMI != nullptr);
+    if (SubRangeJoin) {
+      // We don't care about the lanes when joining subregister ranges.
+      V.ValidLanes = V.WriteLanes = 1;
+    } else {
+      bool Redef = false;
+      V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef);
+
+      // If this is a read-modify-write instruction, there may be more valid
+      // lanes than the ones written by this instruction.
+      // This only covers partial redef operands. DefMI may have normal use
+      // operands reading the register. They don't contribute valid lanes.
+      //
+      // This adds ssub1 to the set of valid lanes in %src:
+      //
+      //   %src:ssub1<def> = FOO
+      //
+      // This leaves only ssub1 valid, making any other lanes undef:
+      //
+      //   %src:ssub1<def,read-undef> = FOO %src:ssub2
+      //
+      // The <read-undef> flag on the def operand means that old lane values are
+      // not important.
+      if (Redef) {
+        V.RedefVNI = LR.Query(VNI->def).valueIn();
+        assert((TrackSubRegLiveness || V.RedefVNI) &&
+               "Instruction is reading nonexistent value");
+        if (V.RedefVNI != nullptr) {
+          computeAssignment(V.RedefVNI->id, Other);
+          V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
+        }
+      }
 
-    // An IMPLICIT_DEF writes undef values.
-    if (DefMI->isImplicitDef()) {
-      // We normally expect IMPLICIT_DEF values to be live only until the end
-      // of their block. If the value is really live longer and gets pruned in
-      // another block, this flag is cleared again.
-      V.ErasableImplicitDef = true;
-      V.ValidLanes &= ~V.WriteLanes;
+      // An IMPLICIT_DEF writes undef values.
+      if (DefMI->isImplicitDef()) {
+        // We normally expect IMPLICIT_DEF values to be live only until the end
+        // of their block. If the value is really live longer and gets pruned in
+        // another block, this flag is cleared again.
+        V.ErasableImplicitDef = true;
+        V.ValidLanes &= ~V.WriteLanes;
+      }
     }
   }
 
   // Find the value in Other that overlaps VNI->def, if any.
-  LiveQueryResult OtherLRQ = Other.LI.Query(VNI->def);
+  LiveQueryResult OtherLRQ = Other.LR.Query(VNI->def);
 
   // It is possible that both values are defined by the same instruction, or
   // the values are PHIs defined in the same block. When that happens, the two
@@ -1612,8 +1968,14 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
     return CR_Replace;
 
   // Check for simple erasable conflicts.
-  if (DefMI->isImplicitDef())
+  if (DefMI->isImplicitDef()) {
+    // We need the def for the subregister if there is nothing else live at the
+    // subrange at this point.
+    if (TrackSubRegLiveness
+        && (V.WriteLanes & (OtherV.ValidLanes | OtherV.WriteLanes)) == 0)
+      return CR_Replace;
     return CR_Erase;
+  }
 
   // Include the non-conflict where DefMI is a coalescable copy that kills
   // OtherVNI. We still want the copy erased and value numbers merged.
@@ -1634,8 +1996,8 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   //   %other = COPY %ext
   //   %this  = COPY %ext <-- Erase this copy
   //
-  if (DefMI->isFullCopy() && !CP.isPartial() &&
-      stripCopies(VNI) == stripCopies(V.OtherVNI))
+  if (DefMI->isFullCopy() && !CP.isPartial()
+      && valuesIdentical(VNI, V.OtherVNI, Other))
     return CR_Erase;
 
   // If the lanes written by this instruction were all undef in OtherVNI, it is
@@ -1670,7 +2032,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   // VNI is clobbering live lanes in OtherVNI, but there is still the
   // possibility that no instructions actually read the clobbered lanes.
   // If we're clobbering all the lanes in OtherVNI, at least one must be read.
-  // Otherwise Other.LI wouldn't be live here.
+  // Otherwise Other.RI wouldn't be live here.
   if ((TRI->getSubRegIndexLaneMask(Other.SubIdx) & ~V.WriteLanes) == 0)
     return CR_Impossible;
 
@@ -1691,9 +2053,6 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   return CR_Unresolved;
 }
 
-/// Compute the value assignment for ValNo in LI.
-/// This may be called recursively by analyzeValue(), but never for a ValNo on
-/// the stack.
 void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
   Val &V = Vals[ValNo];
   if (V.isAnalyzed()) {
@@ -1709,73 +2068,64 @@ void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
     assert(V.OtherVNI && "OtherVNI not assigned, can't merge.");
     assert(Other.Vals[V.OtherVNI->id].isAnalyzed() && "Missing recursion");
     Assignments[ValNo] = Other.Assignments[V.OtherVNI->id];
-    DEBUG(dbgs() << "\t\tmerge " << PrintReg(LI.reg) << ':' << ValNo << '@'
-                 << LI.getValNumInfo(ValNo)->def << " into "
-                 << PrintReg(Other.LI.reg) << ':' << V.OtherVNI->id << '@'
+    DEBUG(dbgs() << "\t\tmerge " << PrintReg(Reg) << ':' << ValNo << '@'
+                 << LR.getValNumInfo(ValNo)->def << " into "
+                 << PrintReg(Other.Reg) << ':' << V.OtherVNI->id << '@'
                  << V.OtherVNI->def << " --> @"
                  << NewVNInfo[Assignments[ValNo]]->def << '\n');
     break;
   case CR_Replace:
-  case CR_Unresolved:
+  case CR_Unresolved: {
     // The other value is going to be pruned if this join is successful.
     assert(V.OtherVNI && "OtherVNI not assigned, can't prune");
-    Other.Vals[V.OtherVNI->id].Pruned = true;
+    Val &OtherV = Other.Vals[V.OtherVNI->id];
+    // We cannot erase an IMPLICIT_DEF if we don't have valid values for all
+    // its lanes.
+    if ((OtherV.WriteLanes & ~V.ValidLanes) != 0 && TrackSubRegLiveness)
+      OtherV.ErasableImplicitDef = false;
+    OtherV.Pruned = true;
+  }
     // Fall through.
   default:
     // This value number needs to go in the final joined live range.
     Assignments[ValNo] = NewVNInfo.size();
-    NewVNInfo.push_back(LI.getValNumInfo(ValNo));
+    NewVNInfo.push_back(LR.getValNumInfo(ValNo));
     break;
   }
 }
 
 bool JoinVals::mapValues(JoinVals &Other) {
-  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
     computeAssignment(i, Other);
     if (Vals[i].Resolution == CR_Impossible) {
-      DEBUG(dbgs() << "\t\tinterference at " << PrintReg(LI.reg) << ':' << i
-                   << '@' << LI.getValNumInfo(i)->def << '\n');
+      DEBUG(dbgs() << "\t\tinterference at " << PrintReg(Reg) << ':' << i
+                   << '@' << LR.getValNumInfo(i)->def << '\n');
       return false;
     }
   }
   return true;
 }
 
-/// Assuming ValNo is going to clobber some valid lanes in Other.LI, compute
-/// the extent of the tainted lanes in the block.
-///
-/// Multiple values in Other.LI can be affected since partial redefinitions can
-/// preserve previously tainted lanes.
-///
-///   1 %dst = VLOAD           <-- Define all lanes in %dst
-///   2 %src = FOO             <-- ValNo to be joined with %dst:ssub0
-///   3 %dst:ssub1 = BAR       <-- Partial redef doesn't clear taint in ssub0
-///   4 %dst:ssub0 = COPY %src <-- Conflict resolved, ssub0 wasn't read
-///
-/// For each ValNo in Other that is affected, add an (EndIndex, TaintedLanes)
-/// entry to TaintedVals.
-///
-/// Returns false if the tainted lanes extend beyond the basic block.
 bool JoinVals::
 taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other,
             SmallVectorImpl<std::pair<SlotIndex, unsigned> > &TaintExtent) {
-  VNInfo *VNI = LI.getValNumInfo(ValNo);
+  VNInfo *VNI = LR.getValNumInfo(ValNo);
   MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
   SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB);
 
-  // Scan Other.LI from VNI.def to MBBEnd.
-  LiveInterval::iterator OtherI = Other.LI.find(VNI->def);
-  assert(OtherI != Other.LI.end() && "No conflict?");
+  // Scan Other.LR from VNI.def to MBBEnd.
+  LiveInterval::iterator OtherI = Other.LR.find(VNI->def);
+  assert(OtherI != Other.LR.end() && "No conflict?");
   do {
     // OtherI is pointing to a tainted value. Abort the join if the tainted
     // lanes escape the block.
     SlotIndex End = OtherI->end;
     if (End >= MBBEnd) {
-      DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.LI.reg) << ':'
+      DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.Reg) << ':'
                    << OtherI->valno->id << '@' << OtherI->start << '\n');
       return false;
     }
-    DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.LI.reg) << ':'
+    DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.Reg) << ':'
                  << OtherI->valno->id << '@' << OtherI->start
                  << " to " << End << '\n');
     // A dead def is not a problem.
@@ -1784,7 +2134,7 @@ taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other,
     TaintExtent.push_back(std::make_pair(End, TaintedLanes));
 
     // Check for another def in the MBB.
-    if (++OtherI == Other.LI.end() || OtherI->start >= MBBEnd)
+    if (++OtherI == Other.LR.end() || OtherI->start >= MBBEnd)
       break;
 
     // Lanes written by the new def are no longer tainted.
@@ -1796,10 +2146,8 @@ taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other,
   return true;
 }
 
-/// Return true if MI uses any of the given Lanes from Reg.
-/// This does not include partial redefinitions of Reg.
-bool JoinVals::usesLanes(MachineInstr *MI, unsigned Reg, unsigned SubIdx,
-                         unsigned Lanes) {
+bool JoinVals::usesLanes(const MachineInstr *MI, unsigned Reg, unsigned SubIdx,
+                         unsigned Lanes) const {
   if (MI->isDebugValue())
     return false;
   for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
@@ -1815,16 +2163,19 @@ bool JoinVals::usesLanes(MachineInstr *MI, unsigned Reg, unsigned SubIdx,
 }
 
 bool JoinVals::resolveConflicts(JoinVals &Other) {
-  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
     Val &V = Vals[i];
     assert (V.Resolution != CR_Impossible && "Unresolvable conflict");
     if (V.Resolution != CR_Unresolved)
       continue;
-    DEBUG(dbgs() << "\t\tconflict at " << PrintReg(LI.reg) << ':' << i
-                 << '@' << LI.getValNumInfo(i)->def << '\n');
+    DEBUG(dbgs() << "\t\tconflict at " << PrintReg(Reg) << ':' << i
+                 << '@' << LR.getValNumInfo(i)->def << '\n');
+    if (SubRangeJoin)
+      return false;
+
     ++NumLaneConflicts;
     assert(V.OtherVNI && "Inconsistent conflict resolution.");
-    VNInfo *VNI = LI.getValNumInfo(i);
+    VNInfo *VNI = LR.getValNumInfo(i);
     const Val &OtherV = Other.Vals[V.OtherVNI->id];
 
     // VNI is known to clobber some lanes in OtherVNI. If we go ahead with the
@@ -1854,7 +2205,7 @@ bool JoinVals::resolveConflicts(JoinVals &Other) {
     unsigned TaintNum = 0;
     for(;;) {
       assert(MI != MBB->end() && "Bad LastMI");
-      if (usesLanes(MI, Other.LI.reg, Other.SubIdx, TaintedLanes)) {
+      if (usesLanes(MI, Other.Reg, Other.SubIdx, TaintedLanes)) {
         DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI);
         return false;
       }
@@ -1876,13 +2227,6 @@ bool JoinVals::resolveConflicts(JoinVals &Other) {
   return true;
 }
 
-// Determine if ValNo is a copy of a value number in LI or Other.LI that will
-// be pruned:
-//
-//   %dst = COPY %src
-//   %src = COPY %dst  <-- This value to be pruned.
-//   %dst = COPY %src  <-- This value is a copy of a pruned value.
-//
 bool JoinVals::isPrunedValue(unsigned ValNo, JoinVals &Other) {
   Val &V = Vals[ValNo];
   if (V.Pruned || V.PrunedComputed)
@@ -1899,15 +2243,16 @@ bool JoinVals::isPrunedValue(unsigned ValNo, JoinVals &Other) {
 }
 
 void JoinVals::pruneValues(JoinVals &Other,
-                           SmallVectorImpl<SlotIndex> &EndPoints) {
-  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
-    SlotIndex Def = LI.getValNumInfo(i)->def;
+                           SmallVectorImpl<SlotIndex> &EndPoints,
+                           bool changeInstrs) {
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+    SlotIndex Def = LR.getValNumInfo(i)->def;
     switch (Vals[i].Resolution) {
     case CR_Keep:
       break;
     case CR_Replace: {
-      // This value takes precedence over the value in Other.LI.
-      LIS->pruneValue(&Other.LI, Def, &EndPoints);
+      // This value takes precedence over the value in Other.LR.
+      LIS->pruneValue(Other.LR, Def, &EndPoints);
       // Check if we're replacing an IMPLICIT_DEF value. The IMPLICIT_DEF
       // instructions are only inserted to provide a live-out value for PHI
       // predecessors, so the instruction should simply go away once its value
@@ -1916,34 +2261,37 @@ void JoinVals::pruneValues(JoinVals &Other,
       bool EraseImpDef = OtherV.ErasableImplicitDef &&
                          OtherV.Resolution == CR_Keep;
       if (!Def.isBlock()) {
-        // Remove <def,read-undef> flags. This def is now a partial redef.
-        // Also remove <def,dead> flags since the joined live range will
-        // continue past this instruction.
-        for (MIOperands MO(Indexes->getInstructionFromIndex(Def));
-             MO.isValid(); ++MO)
-          if (MO->isReg() && MO->isDef() && MO->getReg() == LI.reg) {
-            MO->setIsUndef(EraseImpDef);
-            MO->setIsDead(false);
+        if (changeInstrs) {
+          // Remove <def,read-undef> flags. This def is now a partial redef.
+          // Also remove <def,dead> flags since the joined live range will
+          // continue past this instruction.
+          for (MIOperands MO(Indexes->getInstructionFromIndex(Def));
+               MO.isValid(); ++MO) {
+            if (MO->isReg() && MO->isDef() && MO->getReg() == Reg) {
+              MO->setIsUndef(EraseImpDef);
+              MO->setIsDead(false);
+            }
           }
+        }
         // This value will reach instructions below, but we need to make sure
         // the live range also reaches the instruction at Def.
         if (!EraseImpDef)
           EndPoints.push_back(Def);
       }
-      DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.LI.reg) << " at " << Def
-                   << ": " << Other.LI << '\n');
+      DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.Reg) << " at " << Def
+                   << ": " << Other.LR << '\n');
       break;
     }
     case CR_Erase:
     case CR_Merge:
       if (isPrunedValue(i, Other)) {
-        // This value is ultimately a copy of a pruned value in LI or Other.LI.
+        // This value is ultimately a copy of a pruned value in LR or Other.LR.
         // We can no longer trust the value mapping computed by
         // computeAssignment(), the value that was originally copied could have
         // been replaced.
-        LIS->pruneValue(&LI, Def, &EndPoints);
-        DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(LI.reg) << " at "
-                     << Def << ": " << LI << '\n');
+        LIS->pruneValue(LR, Def, &EndPoints);
+        DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(Reg) << " at "
+                     << Def << ": " << LR << '\n');
       }
       break;
     case CR_Unresolved:
@@ -1953,25 +2301,65 @@ void JoinVals::pruneValues(JoinVals &Other,
   }
 }
 
+void JoinVals::pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask)
+{
+  // Look for values being erased.
+  bool DidPrune = false;
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+    if (Vals[i].Resolution != CR_Erase)
+      continue;
+
+    // Check subranges at the point where the copy will be removed.
+    SlotIndex Def = LR.getValNumInfo(i)->def;
+    for (LiveInterval::SubRange &S : LI.subranges()) {
+      LiveQueryResult Q = S.Query(Def);
+
+      // If a subrange starts at the copy then an undefined value has been
+      // copied and we must remove that subrange value as well.
+      VNInfo *ValueOut = Q.valueOutOrDead();
+      if (ValueOut != nullptr && Q.valueIn() == nullptr) {
+        DEBUG(dbgs() << "\t\tPrune sublane " << format("%04X", S.LaneMask)
+                     << " at " << Def << "\n");
+        LIS->pruneValue(S, Def, nullptr);
+        DidPrune = true;
+        // Mark value number as unused.
+        ValueOut->markUnused();
+        continue;
+      }
+      // If a subrange ends at the copy, then a value was copied but only
+      // partially used later. Shrink the subregister range apropriately.
+      if (Q.valueIn() != nullptr && Q.valueOut() == nullptr) {
+        DEBUG(dbgs() << "\t\tDead uses at sublane "
+                     << format("%04X", S.LaneMask) << " at " << Def << "\n");
+        ShrinkMask |= S.LaneMask;
+      }
+    }
+  }
+  if (DidPrune)
+    LI.removeEmptySubRanges();
+}
+
 void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
                            SmallVectorImpl<unsigned> &ShrinkRegs) {
-  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
     // Get the def location before markUnused() below invalidates it.
-    SlotIndex Def = LI.getValNumInfo(i)->def;
+    SlotIndex Def = LR.getValNumInfo(i)->def;
     switch (Vals[i].Resolution) {
-    case CR_Keep:
+    case CR_Keep: {
       // If an IMPLICIT_DEF value is pruned, it doesn't serve a purpose any
       // longer. The IMPLICIT_DEF instructions are only inserted by
       // PHIElimination to guarantee that all PHI predecessors have a value.
       if (!Vals[i].ErasableImplicitDef || !Vals[i].Pruned)
         break;
-      // Remove value number i from LI. Note that this VNInfo is still present
-      // in NewVNInfo, so it will appear as an unused value number in the final
-      // joined interval.
-      LI.getValNumInfo(i)->markUnused();
-      LI.removeValNo(LI.getValNumInfo(i));
-      DEBUG(dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LI << '\n');
+      // Remove value number i from LR.
+      VNInfo *VNI = LR.getValNumInfo(i);
+      LR.removeValNo(VNI);
+      // Note that this VNInfo is reused and still referenced in NewVNInfo,
+      // make it appear like an unused value number.
+      VNI->markUnused();
+      DEBUG(dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LR << '\n');
       // FALL THROUGH.
+    }
 
     case CR_Erase: {
       MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
@@ -1994,12 +2382,96 @@ void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
   }
 }
 
+void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
+                                         unsigned LaneMask,
+                                         const CoalescerPair &CP) {
+  SmallVector<VNInfo*, 16> NewVNInfo;
+  JoinVals RHSVals(RRange, CP.getSrcReg(), CP.getSrcIdx(), LaneMask,
+                   NewVNInfo, CP, LIS, TRI, true, true);
+  JoinVals LHSVals(LRange, CP.getDstReg(), CP.getDstIdx(), LaneMask,
+                   NewVNInfo, CP, LIS, TRI, true, true);
+
+  // Compute NewVNInfo and resolve conflicts (see also joinVirtRegs())
+  // Conflicts should already be resolved so the mapping/resolution should
+  // always succeed.
+  if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals))
+    llvm_unreachable("Can't join subrange although main ranges are compatible");
+  if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals))
+    llvm_unreachable("Can't join subrange although main ranges are compatible");
+
+  // The merging algorithm in LiveInterval::join() can't handle conflicting
+  // value mappings, so we need to remove any live ranges that overlap a
+  // CR_Replace resolution. Collect a set of end points that can be used to
+  // restore the live range after joining.
+  SmallVector<SlotIndex, 8> EndPoints;
+  LHSVals.pruneValues(RHSVals, EndPoints, false);
+  RHSVals.pruneValues(LHSVals, EndPoints, false);
+
+  LRange.verify();
+  RRange.verify();
+
+  // Join RRange into LHS.
+  LRange.join(RRange, LHSVals.getAssignments(), RHSVals.getAssignments(),
+              NewVNInfo);
+
+  DEBUG(dbgs() << "\t\tjoined lanes: " << LRange << "\n");
+  if (EndPoints.empty())
+    return;
+
+  // Recompute the parts of the live range we had to remove because of
+  // CR_Replace conflicts.
+  DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
+               << " points: " << LRange << '\n');
+  LIS->extendToIndices(LRange, EndPoints);
+}
+
+void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,
+                                          const LiveRange &ToMerge,
+                                          unsigned LaneMask, CoalescerPair &CP) {
+  BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+  for (LiveInterval::SubRange &R : LI.subranges()) {
+    unsigned RMask = R.LaneMask;
+    // LaneMask of subregisters common to subrange R and ToMerge.
+    unsigned Common = RMask & LaneMask;
+    // There is nothing to do without common subregs.
+    if (Common == 0)
+      continue;
+
+    DEBUG(dbgs() << format("\t\tCopy+Merge %04X into %04X\n", RMask, Common));
+    // LaneMask of subregisters contained in the R range but not in ToMerge,
+    // they have to split into their own subrange.
+    unsigned LRest = RMask & ~LaneMask;
+    LiveInterval::SubRange *CommonRange;
+    if (LRest != 0) {
+      R.LaneMask = LRest;
+      DEBUG(dbgs() << format("\t\tReduce Lane to %04X\n", LRest));
+      // Duplicate SubRange for newly merged common stuff.
+      CommonRange = LI.createSubRangeFrom(Allocator, Common, R);
+    } else {
+      // Reuse the existing range.
+      R.LaneMask = Common;
+      CommonRange = &R;
+    }
+    LiveRange RangeCopy(ToMerge, Allocator);
+    joinSubRegRanges(*CommonRange, RangeCopy, Common, CP);
+    LaneMask &= ~RMask;
+  }
+
+  if (LaneMask != 0) {
+    DEBUG(dbgs() << format("\t\tNew Lane %04X\n", LaneMask));
+    LI.createSubRangeFrom(Allocator, LaneMask, ToMerge);
+  }
+}
+
 bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
   SmallVector<VNInfo*, 16> NewVNInfo;
   LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
   LiveInterval &LHS = LIS->getInterval(CP.getDstReg());
-  JoinVals RHSVals(RHS, CP.getSrcIdx(), NewVNInfo, CP, LIS, TRI);
-  JoinVals LHSVals(LHS, CP.getDstIdx(), NewVNInfo, CP, LIS, TRI);
+  bool TrackSubRegLiveness = MRI->tracksSubRegLiveness();
+  JoinVals RHSVals(RHS, CP.getSrcReg(), CP.getSrcIdx(), 0, NewVNInfo, CP, LIS,
+                   TRI, false, TrackSubRegLiveness);
+  JoinVals LHSVals(LHS, CP.getDstReg(), CP.getDstIdx(), 0, NewVNInfo, CP, LIS,
+                   TRI, false, TrackSubRegLiveness);
 
   DEBUG(dbgs() << "\t\tRHS = " << RHS
                << "\n\t\tLHS = " << LHS
@@ -2015,14 +2487,55 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
     return false;
 
   // All clear, the live ranges can be merged.
+  if (RHS.hasSubRanges() || LHS.hasSubRanges()) {
+    BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+
+    // Transform lanemasks from the LHS to masks in the coalesced register and
+    // create initial subranges if necessary.
+    unsigned DstIdx = CP.getDstIdx();
+    if (!LHS.hasSubRanges()) {
+      unsigned Mask = DstIdx == 0 ? CP.getNewRC()->getLaneMask()
+                                  : TRI->getSubRegIndexLaneMask(DstIdx);
+      // LHS must support subregs or we wouldn't be in this codepath.
+      assert(Mask != 0);
+      LHS.createSubRangeFrom(Allocator, Mask, LHS);
+    } else if (DstIdx != 0) {
+      // Transform LHS lanemasks to new register class if necessary.
+      for (LiveInterval::SubRange &R : LHS.subranges()) {
+        unsigned Mask = TRI->composeSubRegIndexLaneMask(DstIdx, R.LaneMask);
+        R.LaneMask = Mask;
+      }
+    }
+    DEBUG(dbgs() << "\t\tLHST = " << PrintReg(CP.getDstReg())
+                 << ' ' << LHS << '\n');
+
+    // Determine lanemasks of RHS in the coalesced register and merge subranges.
+    unsigned SrcIdx = CP.getSrcIdx();
+    if (!RHS.hasSubRanges()) {
+      unsigned Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask()
+                                  : TRI->getSubRegIndexLaneMask(SrcIdx);
+      mergeSubRangeInto(LHS, RHS, Mask, CP);
+    } else {
+      // Pair up subranges and merge.
+      for (LiveInterval::SubRange &R : RHS.subranges()) {
+        unsigned Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask);
+        mergeSubRangeInto(LHS, R, Mask, CP);
+      }
+    }
+
+    DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n");
+
+    LHSVals.pruneSubRegValues(LHS, ShrinkMask);
+    RHSVals.pruneSubRegValues(LHS, ShrinkMask);
+  }
 
   // The merging algorithm in LiveInterval::join() can't handle conflicting
   // value mappings, so we need to remove any live ranges that overlap a
   // CR_Replace resolution. Collect a set of end points that can be used to
   // restore the live range after joining.
   SmallVector<SlotIndex, 8> EndPoints;
-  LHSVals.pruneValues(RHSVals, EndPoints);
-  RHSVals.pruneValues(LHSVals, EndPoints);
+  LHSVals.pruneValues(RHSVals, EndPoints, true);
+  RHSVals.pruneValues(LHSVals, EndPoints, true);
 
   // Erase COPY and IMPLICIT_DEF instructions. This may cause some external
   // registers to require trimming.
@@ -2041,24 +2554,23 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
   MRI->clearKillFlags(LHS.reg);
   MRI->clearKillFlags(RHS.reg);
 
-  if (EndPoints.empty())
-    return true;
+  if (!EndPoints.empty()) {
+    // Recompute the parts of the live range we had to remove because of
+    // CR_Replace conflicts.
+    DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
+                 << " points: " << LHS << '\n');
+    LIS->extendToIndices((LiveRange&)LHS, EndPoints);
+  }
 
-  // Recompute the parts of the live range we had to remove because of
-  // CR_Replace conflicts.
-  DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
-               << " points: " << LHS << '\n');
-  LIS->extendToIndices(LHS, EndPoints);
   return true;
 }
 
-/// Attempt to join these two intervals.  On failure, this returns false.
 bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
   return CP.isPhys() ? joinReservedPhysReg(CP) : joinVirtRegs(CP);
 }
 
 namespace {
-// Information concerning MBB coalescing priority.
+/// Information concerning MBB coalescing priority.
 struct MBBPriorityInfo {
   MachineBasicBlock *MBB;
   unsigned Depth;
@@ -2069,10 +2581,10 @@ struct MBBPriorityInfo {
 };
 }
 
-// C-style comparator that sorts first based on the loop depth of the basic
-// block (the unsigned), and then on the MBB number.
-//
-// EnableGlobalCopies assumes that the primary sort key is loop depth.
+/// C-style comparator that sorts first based on the loop depth of the basic
+/// block (the unsigned), and then on the MBB number.
+///
+/// EnableGlobalCopies assumes that the primary sort key is loop depth.
 static int compareMBBPriority(const MBBPriorityInfo *LHS,
                               const MBBPriorityInfo *RHS) {
   // Deeper loops first
@@ -2112,8 +2624,6 @@ static bool isLocalCopy(MachineInstr *Copy, const LiveIntervals *LIS) {
     || LIS->intervalIsInOneMBB(LIS->getInterval(DstReg));
 }
 
-// Try joining WorkList copies starting from index From.
-// Null out any successful joins.
 bool RegisterCoalescer::
 copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) {
   bool Progress = false;
@@ -2224,15 +2734,14 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
   MF = &fn;
   MRI = &fn.getRegInfo();
   TM = &fn.getTarget();
-  TRI = TM->getSubtargetImpl()->getRegisterInfo();
-  TII = TM->getSubtargetImpl()->getInstrInfo();
+  const TargetSubtargetInfo &STI = fn.getSubtarget();
+  TRI = STI.getRegisterInfo();
+  TII = STI.getInstrInfo();
   LIS = &getAnalysis<LiveIntervals>();
   AA = &getAnalysis<AliasAnalysis>();
   Loops = &getAnalysis<MachineLoopInfo>();
-
-  const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
   if (EnableGlobalCopies == cl::BOU_UNSET)
-    JoinGlobalCopies = ST.useMachineScheduler();
+    JoinGlobalCopies = STI.useMachineScheduler();
   else
     JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
 
@@ -2264,9 +2773,24 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
     unsigned Reg = InflateRegs[i];
     if (MRI->reg_nodbg_empty(Reg))
       continue;
-    if (MRI->recomputeRegClass(Reg, *TM)) {
+    if (MRI->recomputeRegClass(Reg)) {
       DEBUG(dbgs() << PrintReg(Reg) << " inflated to "
                    << TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n');
+      LiveInterval &LI = LIS->getInterval(Reg);
+      unsigned MaxMask = MRI->getMaxLaneMaskForVReg(Reg);
+      if (MaxMask == 0) {
+        // If the inflated register class does not support subregisters anymore
+        // remove the subranges.
+        LI.clearSubRanges();
+      } else {
+#ifndef NDEBUG
+        // If subranges are still supported, then the same subregs should still
+        // be supported.
+        for (LiveInterval::SubRange &S : LI.subranges()) {
+          assert ((S.LaneMask & ~MaxMask) == 0);
+        }
+#endif
+      }
       ++NumInflated;
     }
   }
@@ -2277,7 +2801,6 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
   return true;
 }
 
-/// Implement the dump method.
 void RegisterCoalescer::print(raw_ostream &O, const Module* m) const {
    LIS->print(O, m);
 }
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 6f8b337..76a7fef 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -36,8 +36,8 @@ static cl::opt<bool> StressSchedOpt(
 void SchedulingPriorityQueue::anchor() { }
 
 ScheduleDAG::ScheduleDAG(MachineFunction &mf)
-    : TM(mf.getTarget()), TII(TM.getSubtargetImpl()->getInstrInfo()),
-      TRI(TM.getSubtargetImpl()->getRegisterInfo()), MF(mf),
+    : TM(mf.getTarget()), TII(mf.getSubtarget().getInstrInfo()),
+      TRI(mf.getSubtarget().getRegisterInfo()), MF(mf),
       MRI(mf.getRegInfo()), EntrySU(), ExitSU() {
 #ifndef NDEBUG
   StressSched = StressSchedOpt;
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index dece643..78bfd23 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -44,25 +44,24 @@ using namespace llvm;
 
 static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
     cl::ZeroOrMore, cl::init(false),
-    cl::desc("Enable use of AA during MI GAD construction"));
+    cl::desc("Enable use of AA during MI DAG construction"));
 
 static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
-    cl::init(true), cl::desc("Enable use of TBAA during MI GAD construction"));
+    cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));
 
 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      const MachineLoopInfo *mli,
-                                     bool IsPostRAFlag,
-                                     bool RemoveKillFlags,
+                                     bool IsPostRAFlag, bool RemoveKillFlags,
                                      LiveIntervals *lis)
-  : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(lis),
-    IsPostRA(IsPostRAFlag), RemoveKillFlags(RemoveKillFlags),
-    CanHandleTerminators(false), FirstDbgValue(nullptr) {
+    : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(lis),
+      IsPostRA(IsPostRAFlag), RemoveKillFlags(RemoveKillFlags),
+      CanHandleTerminators(false), FirstDbgValue(nullptr) {
   assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals");
   DbgValues.clear();
   assert(!(IsPostRA && MRI.getNumVirtRegs()) &&
          "Virtual registers must be removed prior to PostRA scheduling");
 
-  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo &ST = mf.getSubtarget();
   SchedModel.init(ST.getSchedModel(), &ST, TII);
 }
 
@@ -253,7 +252,7 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
   assert(MO.isDef() && "expect physreg def");
 
   // Ask the target if address-backscheduling is desirable, and if so how much.
-  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
 
   for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
        Alias.isValid(); ++Alias) {
@@ -444,7 +443,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
       int DefOp = Def->findRegisterDefOperandIdx(Reg);
       dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx));
 
-      const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+      const TargetSubtargetInfo &ST = MF.getSubtarget();
       ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
       SU->addPred(dep);
     }
@@ -614,10 +613,10 @@ iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
   }
   // Track current depth.
   (*Depth)++;
-  // Iterate over chain dependencies only.
+  // Iterate over memory dependencies only.
   for (SUnit::const_succ_iterator I = SUb->Succs.begin(), E = SUb->Succs.end();
        I != E; ++I)
-    if (I->isCtrl())
+    if (I->isNormalMemoryOrBarrier())
       iterateChainSucc (AA, MFI, SUa, I->getSUnit(), ExitSU, Depth, Visited);
   return *Depth;
 }
@@ -644,11 +643,12 @@ static void adjustChainDeps(AliasAnalysis *AA, const MachineFrameInfo *MFI,
       Dep.setLatency(((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0);
       (*I)->addPred(Dep);
     }
-    // Now go through all the chain successors and iterate from them.
-    // Keep track of visited nodes.
+
+    // Iterate recursively over all previously added memory chain
+    // successors. Keep track of visited nodes.
     for (SUnit::const_succ_iterator J = (*I)->Succs.begin(),
          JE = (*I)->Succs.end(); J != JE; ++J)
-      if (J->isCtrl())
+      if (J->isNormalMemoryOrBarrier())
         iterateChainSucc (AA, MFI, SU, J->getSUnit(),
                           ExitSU, &Depth, Visited);
   }
@@ -742,7 +742,7 @@ void ScheduleDAGInstrs::initSUnits() {
 void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
                                         RegPressureTracker *RPTracker,
                                         PressureDiffs *PDiffs) {
-  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
   bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
                                                        : ST.useAA();
   AliasAnalysis *AAForDep = UseAA ? AA : nullptr;
@@ -891,7 +891,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
 
       // fall-through
     new_alias_chain:
-      // Chain all possibly aliasing memory references though SU.
+      // Chain all possibly aliasing memory references through SU.
       if (AliasChain) {
         unsigned ChainLatency = 0;
         if (AliasChain->getInstr()->mayLoad())
@@ -991,11 +991,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
         // Add dependence on alias chain, if needed.
         if (AliasChain)
           addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes);
-        // But we also should check dependent instructions for the
-        // SU in question.
-        adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
-                        TrueMemOrderLatency);
       }
+      adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+                      TrueMemOrderLatency);
     } else if (MI->mayLoad()) {
       bool MayAlias = true;
       if (MI->isInvariantLoad(AA)) {
diff --git a/lib/CodeGen/SelectionDAG/Android.mk b/lib/CodeGen/SelectionDAG/Android.mk
index 0e52ee3..9501ad9 100644
--- a/lib/CodeGen/SelectionDAG/Android.mk
+++ b/lib/CodeGen/SelectionDAG/Android.mk
@@ -22,6 +22,7 @@ codegen_selectiondag_SRC_FILES := \
   SelectionDAGDumper.cpp \
   SelectionDAGISel.cpp \
   SelectionDAGPrinter.cpp \
+  StatepointLowering.cpp \
   TargetLowering.cpp \
   TargetSelectionDAGInfo.cpp
 
diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt
index 75e8167..fbedf2c 100644
--- a/lib/CodeGen/SelectionDAG/CMakeLists.txt
+++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt
@@ -19,6 +19,7 @@ add_llvm_library(LLVMSelectionDAG
   SelectionDAGDumper.cpp
   SelectionDAGISel.cpp
   SelectionDAGPrinter.cpp
+  StatepointLowering.cpp
   ScheduleDAGVLIW.cpp
   TargetLowering.cpp
   TargetSelectionDAGInfo.cpp
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a1291ed..6129401 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17,9 +17,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -303,6 +303,8 @@ namespace {
     SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
     SDValue visitVECTOR_SHUFFLE(SDNode *N);
     SDValue visitINSERT_SUBVECTOR(SDNode *N);
+    SDValue visitMLOAD(SDNode *N);
+    SDValue visitMSTORE(SDNode *N);
 
     SDValue XformToShuffleWithZero(SDNode *N);
     SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS);
@@ -325,6 +327,7 @@ namespace {
     SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                          unsigned HiOp);
     SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
+    SDValue CombineExtLoad(SDNode *N);
     SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
@@ -361,6 +364,28 @@ namespace {
     /// chain (aliasing node.)
     SDValue FindBetterChain(SDNode *N, SDValue Chain);
 
+    /// Holds a pointer to an LSBaseSDNode as well as information on where it
+    /// is located in a sequence of memory operations connected by a chain.
+    struct MemOpLink {
+      MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
+      MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
+      // Ptr to the mem node.
+      LSBaseSDNode *MemNode;
+      // Offset from the base ptr.
+      int64_t OffsetFromBase;
+      // What is the sequence number of this mem node.
+      // Lowest mem operand in the DAG starts at zero.
+      unsigned SequenceNum;
+    };
+
+    /// This is a helper function for MergeConsecutiveStores. When the source
+    /// elements of the consecutive stores are all constants or all extracted
+    /// vector elements, try to merge them into one larger store.
+    /// \return True if a merged store was created.
+    bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
+                                         EVT MemVT, unsigned NumElem,
+                                         bool IsConstantSrc, bool UseVector);
+
     /// Merge consecutive store operations into a wide store.
     /// This optimization uses wide integers or vectors when possible.
     /// \return True if some memory operations were changed.
@@ -378,12 +403,9 @@ namespace {
     DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
         : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
           OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
-      AttributeSet FnAttrs =
-          DAG.getMachineFunction().getFunction()->getAttributes();
-      ForCodeSize =
-          FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                               Attribute::OptimizeForSize) ||
-          FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+      auto *F = DAG.getMachineFunction().getFunction();
+      ForCodeSize = F->hasFnAttribute(Attribute::OptimizeForSize) ||
+                    F->hasFnAttribute(Attribute::MinSize);
     }
 
     /// Runs the dag combiner on all nodes in the work list
@@ -444,7 +466,7 @@ void TargetLowering::DAGCombinerInfo::RemoveFromWorklist(SDNode *N) {
 }
 
 SDValue TargetLowering::DAGCombinerInfo::
-CombineTo(SDNode *N, const std::vector<SDValue> &To, bool AddTo) {
+CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) {
   return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo);
 }
 
@@ -736,10 +758,9 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL,
     if (SDNode *L = isConstantBuildVectorOrConstantInt(N0.getOperand(1))) {
       if (SDNode *R = isConstantBuildVectorOrConstantInt(N1)) {
         // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
-        SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, L, R);
-        if (!OpNode.getNode())
-          return SDValue();
-        return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+        if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, L, R))
+          return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+        return SDValue();
       }
       if (N0.hasOneUse()) {
         // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one
@@ -757,10 +778,9 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL,
     if (SDNode *R = isConstantBuildVectorOrConstantInt(N1.getOperand(1))) {
       if (SDNode *L = isConstantBuildVectorOrConstantInt(N0)) {
         // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
-        SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, R, L);
-        if (!OpNode.getNode())
-          return SDValue();
-        return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
+        if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, R, L))
+          return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
+        return SDValue();
       }
       if (N1.hasOneUse()) {
         // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one
@@ -785,11 +805,12 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
         N->dump(&DAG);
         dbgs() << "\nWith: ";
         To[0].getNode()->dump(&DAG);
-        dbgs() << " and " << NumTo-1 << " other values\n";
-        for (unsigned i = 0, e = NumTo; i != e; ++i)
-          assert((!To[i].getNode() ||
-                  N->getValueType(i) == To[i].getValueType()) &&
-                 "Cannot combine value to value of different type!"));
+        dbgs() << " and " << NumTo-1 << " other values\n");
+  for (unsigned i = 0, e = NumTo; i != e; ++i)
+    assert((!To[i].getNode() ||
+            N->getValueType(i) == To[i].getValueType()) &&
+           "Cannot combine value to value of different type!");
+
   WorklistRemover DeadNodes(*this);
   DAG.ReplaceAllUsesWith(N, To);
   if (AddTo) {
@@ -874,8 +895,8 @@ SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
     EVT MemVT = LD->getMemoryVT();
     ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
-      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD
-                                                  : ISD::EXTLOAD)
+      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
+                                                       : ISD::EXTLOAD)
       : LD->getExtensionType();
     Replace = true;
     return DAG.getExtLoad(ExtType, dl, PVT,
@@ -1096,8 +1117,8 @@ bool DAGCombiner::PromoteLoad(SDValue Op) {
     LoadSDNode *LD = cast<LoadSDNode>(N);
     EVT MemVT = LD->getMemoryVT();
     ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
-      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD
-                                                  : ISD::EXTLOAD)
+      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
+                                                       : ISD::EXTLOAD)
       : LD->getExtensionType();
     SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT,
                                    LD->getChain(), LD->getBasePtr(),
@@ -1160,10 +1181,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
   LegalTypes = Level >= AfterLegalizeTypes;
 
   // Early exit if this basic block is in an optnone function.
-  AttributeSet FnAttrs =
-    DAG.getMachineFunction().getFunction()->getAttributes();
-  if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                           Attribute::OptimizeNone))
+  if (DAG.getMachineFunction().getFunction()->hasFnAttribute(
+          Attribute::OptimizeNone))
     return;
 
   // Add all the dag nodes to the worklist.
@@ -1351,6 +1370,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::EXTRACT_SUBVECTOR:  return visitEXTRACT_SUBVECTOR(N);
   case ISD::VECTOR_SHUFFLE:     return visitVECTOR_SHUFFLE(N);
   case ISD::INSERT_SUBVECTOR:   return visitINSERT_SUBVECTOR(N);
+  case ISD::MLOAD:              return visitMLOAD(N);
+  case ISD::MSTORE:             return visitMSTORE(N);
   }
   return SDValue();
 }
@@ -1475,7 +1496,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
       switch (Op.getOpcode()) {
       case ISD::EntryToken:
         // Entry tokens don't need to be added to the list. They are
-        // rededundant.
+        // redundant.
         Changed = true;
         break;
 
@@ -1504,7 +1525,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
 
   SDValue Result;
 
-  // If we've change things around then replace token factor.
+  // If we've changed things around then replace token factor.
   if (Changed) {
     if (Ops.empty()) {
       // The entry token is the only possible outcome.
@@ -1514,8 +1535,11 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
       Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
     }
 
-    // Don't add users to work list.
-    return CombineTo(N, Result, false);
+    // Add users to worklist if AA is enabled, since it may introduce
+    // a lot of new chained token factors while removing memory deps.
+    bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+      : DAG.getSubtarget().useAA();
+    return CombineTo(N, Result, UseAA /*add to worklist*/);
   }
 
   return Result;
@@ -1541,8 +1565,6 @@ SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
 SDValue DAGCombiner::visitADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
 
   // fold vector ops
@@ -1563,6 +1585,8 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
   if (N1.getOpcode() == ISD::UNDEF)
     return N1;
   // fold (add c1, c2) -> c1+c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::ADD, VT, N0C, N1C);
   // canonicalize constant to RHS
@@ -1714,8 +1738,6 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
 SDValue DAGCombiner::visitADDC(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
 
   // If the flag result is dead, turn this into an ADD.
@@ -1725,6 +1747,8 @@ SDValue DAGCombiner::visitADDC(SDNode *N) {
                                  SDLoc(N), MVT::Glue));
 
   // canonicalize constant to RHS.
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && !N1C)
     return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0);
 
@@ -1756,10 +1780,10 @@ SDValue DAGCombiner::visitADDE(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue CarryIn = N->getOperand(2);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
 
   // canonicalize constant to RHS
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && !N1C)
     return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(),
                        N1, N0, CarryIn);
@@ -1786,10 +1810,6 @@ static SDValue tryFoldToZero(SDLoc DL, const TargetLowering &TLI, EVT VT,
 SDValue DAGCombiner::visitSUB(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
-  ConstantSDNode *N1C1 = N1.getOpcode() != ISD::ADD ? nullptr :
-    dyn_cast<ConstantSDNode>(N1.getOperand(1).getNode());
   EVT VT = N0.getValueType();
 
   // fold vector ops
@@ -1807,6 +1827,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   if (N0 == N1)
     return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
   // fold (sub c1, c2) -> c1-c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SUB, VT, N0C, N1C);
   // fold (sub x, c) -> (add x, -c)
@@ -1826,6 +1848,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
     return N0.getOperand(0);
   // fold C2-(A+C1) -> (C2-C1)-A
+  ConstantSDNode *N1C1 = N1.getOpcode() != ISD::ADD ? nullptr :
+    dyn_cast<ConstantSDNode>(N1.getOperand(1).getNode());
   if (N1.getOpcode() == ISD::ADD && N0C && N1C1) {
     SDValue NewC = DAG.getConstant(N0C->getAPIntValue() - N1C1->getAPIntValue(),
                                    VT);
@@ -1890,8 +1914,6 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
 SDValue DAGCombiner::visitSUBC(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
 
   // If the flag result is dead, turn this into an SUB.
@@ -1907,6 +1929,8 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) {
                                  MVT::Glue));
 
   // fold (subc x, 0) -> x + no borrow
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N1C && N1C->isNullValue())
     return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, SDLoc(N),
                                         MVT::Glue));
@@ -2055,8 +2079,6 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
 SDValue DAGCombiner::visitSDIV(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = isConstOrConstSplat(N0);
-  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   EVT VT = N->getValueType(0);
 
   // fold vector ops
@@ -2066,6 +2088,8 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
   }
 
   // fold (sdiv c1, c2) -> c1/c2
+  ConstantSDNode *N0C = isConstOrConstSplat(N0);
+  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N0C && N1C && !N1C->isNullValue())
     return DAG.FoldConstantArithmetic(ISD::SDIV, VT, N0C, N1C);
   // fold (sdiv X, 1) -> X
@@ -2145,8 +2169,6 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
 SDValue DAGCombiner::visitUDIV(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = isConstOrConstSplat(N0);
-  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   EVT VT = N->getValueType(0);
 
   // fold vector ops
@@ -2156,6 +2178,8 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
   }
 
   // fold (udiv c1, c2) -> c1/c2
+  ConstantSDNode *N0C = isConstOrConstSplat(N0);
+  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N0C && N1C && !N1C->isNullValue())
     return DAG.FoldConstantArithmetic(ISD::UDIV, VT, N0C, N1C);
   // fold (udiv x, (1 << c)) -> x >>u c
@@ -2197,11 +2221,11 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
 SDValue DAGCombiner::visitSREM(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = isConstOrConstSplat(N0);
-  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   EVT VT = N->getValueType(0);
 
   // fold (srem c1, c2) -> c1%c2
+  ConstantSDNode *N0C = isConstOrConstSplat(N0);
+  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N0C && N1C && !N1C->isNullValue())
     return DAG.FoldConstantArithmetic(ISD::SREM, VT, N0C, N1C);
   // If we know the sign bits of both operands are zero, strength reduce to a
@@ -2239,11 +2263,11 @@ SDValue DAGCombiner::visitSREM(SDNode *N) {
 SDValue DAGCombiner::visitUREM(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = isConstOrConstSplat(N0);
-  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   EVT VT = N->getValueType(0);
 
   // fold (urem c1, c2) -> c1%c2
+  ConstantSDNode *N0C = isConstOrConstSplat(N0);
+  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N0C && N1C && !N1C->isNullValue())
     return DAG.FoldConstantArithmetic(ISD::UREM, VT, N0C, N1C);
   // fold (urem x, pow2) -> (and x, pow2-1)
@@ -2522,6 +2546,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
   // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
   // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
+  // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y))
   // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
   //
   // do not sink logical op inside of a vector extend, since it may combine
@@ -2529,6 +2554,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   EVT Op0VT = N0.getOperand(0).getValueType();
   if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
        N0.getOpcode() == ISD::SIGN_EXTEND ||
+       N0.getOpcode() == ISD::BSWAP ||
        // Avoid infinite looping with PromoteIntBinOp.
        (N0.getOpcode() == ISD::ANY_EXTEND &&
         (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) ||
@@ -2662,11 +2688,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  SDValue LL, LR, RL, RR, CC0, CC1;
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N1.getValueType();
-  unsigned BitWidth = VT.getScalarType().getSizeInBits();
 
   // fold vector ops
   if (VT.isVector()) {
@@ -2698,6 +2720,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
     return DAG.getConstant(0, VT);
   // fold (and c1, c2) -> c1&c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::AND, VT, N0C, N1C);
   // canonicalize constant to RHS
@@ -2707,6 +2731,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   if (N1C && N1C->isAllOnesValue())
     return N0;
   // if (and x, c) is known to be zero, return 0
+  unsigned BitWidth = VT.getScalarType().getSizeInBits();
   if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
                                    APInt::getAllOnesValue(BitWidth)))
     return DAG.getConstant(0, VT);
@@ -2793,6 +2818,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     // actually legal and isn't going to get expanded, else this is a false
     // optimisation.
     bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
+                                                    Load->getValueType(0),
                                                     Load->getMemoryVT());
 
     // Resize the constant to the same size as the original memory access before
@@ -2838,6 +2864,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     }
   }
   // fold (and (setcc x), (setcc y)) -> (setcc (and x, y))
+  SDValue LL, LR, RL, RR, CC0, CC1;
   if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
     ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
     ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
@@ -2919,7 +2946,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
                            BitWidth - MemVT.getScalarType().getSizeInBits())) &&
         ((!LegalOperations && !LN0->isVolatile()) ||
-         TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
+         TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
                                        LN0->getChain(), LN0->getBasePtr(),
                                        MemVT, LN0->getMemOperand());
@@ -2939,7 +2966,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
                            BitWidth - MemVT.getScalarType().getSizeInBits())) &&
         ((!LegalOperations && !LN0->isVolatile()) ||
-         TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
+         TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
                                        LN0->getChain(), LN0->getBasePtr(),
                                        MemVT, LN0->getMemOperand());
@@ -2965,10 +2992,11 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
       if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue())){
         EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
         EVT LoadedVT = LN0->getMemoryVT();
+        EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
 
         if (ExtVT == LoadedVT &&
-            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) {
-          EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
+            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy,
+                                                    ExtVT))) {
 
           SDValue NewLoad =
             DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
@@ -2983,7 +3011,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         // Do not generate loads of non-round integer types since these can
         // be expensive (and would be wrong if the type is not byte sized).
         if (!LN0->isVolatile() && LoadedVT.bitsGT(ExtVT) && ExtVT.isRound() &&
-            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) {
+            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy,
+                                                    ExtVT))) {
           EVT PtrType = LN0->getOperand(1).getValueType();
 
           unsigned Alignment = LN0->getAlignment();
@@ -3003,7 +3032,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
 
           AddToWorklist(NewPtr.getNode());
 
-          EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
           SDValue Load =
             DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
                            LN0->getChain(), NewPtr,
@@ -3313,9 +3341,6 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
 SDValue DAGCombiner::visitOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  SDValue LL, LR, RL, RR, CC0, CC1;
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N1.getValueType();
 
   // fold vector ops
@@ -3407,6 +3432,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT);
   }
   // fold (or c1, c2) -> c1|c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::OR, VT, N0C, N1C);
   // canonicalize constant to RHS
@@ -3440,15 +3467,15 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
              isa<ConstantSDNode>(N0.getOperand(1))) {
     ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
     if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) {
-      SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1);
-      if (!COR.getNode())
-        return SDValue();
-      return DAG.getNode(ISD::AND, SDLoc(N), VT,
-                         DAG.getNode(ISD::OR, SDLoc(N0), VT,
-                                     N0.getOperand(0), N1), COR);
+      if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1))
+        return DAG.getNode(
+            ISD::AND, SDLoc(N), VT,
+            DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR);
+      return SDValue();
     }
   }
   // fold (or (setcc x), (setcc y)) -> (setcc (or x, y))
+  SDValue LL, LR, RL, RR, CC0, CC1;
   if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
     ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
     ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
@@ -3521,6 +3548,17 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     }
   }
 
+  // (or (and X, M), (and X, N)) -> (and X, (or M, N))
+  if (N0.getOpcode() == ISD::AND &&
+      N1.getOpcode() == ISD::AND &&
+      N0.getOperand(0) == N1.getOperand(0) &&
+      // Don't increase # computations.
+      (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
+    SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
+                            N0.getOperand(1), N1.getOperand(1));
+    return DAG.getNode(ISD::AND, SDLoc(N), VT, N0.getOperand(0), X);
+  }
+
   // See if this is some rotate idiom.
   if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
     return SDValue(Rot, 0);
@@ -3790,9 +3828,6 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) {
 SDValue DAGCombiner::visitXOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  SDValue LHS, RHS, CC;
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
 
   // fold vector ops
@@ -3816,6 +3851,8 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
   if (N1.getOpcode() == ISD::UNDEF)
     return N1;
   // fold (xor c1, c2) -> c1^c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::XOR, VT, N0C, N1C);
   // canonicalize constant to RHS
@@ -3830,6 +3867,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
     return RXOR;
 
   // fold !(x cc y) -> (x !cc y)
+  SDValue LHS, RHS, CC;
   if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) {
     bool isInt = LHS.getValueType().isInteger();
     ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
@@ -4039,12 +4077,11 @@ SDValue DAGCombiner::visitRotate(SDNode *N) {
 SDValue DAGCombiner::visitSHL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
   // fold vector ops
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (VT.isVector()) {
     SDValue FoldedVOp = SimplifyVBinOp(N);
     if (FoldedVOp.getNode()) return FoldedVOp;
@@ -4061,8 +4098,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
         if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
             TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
                 TargetLowering::ZeroOrNegativeOneBooleanContent) {
-          SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, VT, N01CV, N1CV);
-          if (C.getNode())
+          if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, VT, N01CV, N1CV))
             return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
         }
       } else {
@@ -4072,6 +4108,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   }
 
   // fold (shl c1, c2) -> c1<<c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SHL, VT, N0C, N1C);
   // fold (shl 0, x) -> 0
@@ -4220,12 +4257,11 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
 SDValue DAGCombiner::visitSRA(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
   // fold vector ops
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (VT.isVector()) {
     SDValue FoldedVOp = SimplifyVBinOp(N);
     if (FoldedVOp.getNode()) return FoldedVOp;
@@ -4234,6 +4270,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   }
 
   // fold (sra c1, c2) -> (sra c1, c2)
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SRA, VT, N0C, N1C);
   // fold (sra 0, x) -> 0
@@ -4366,12 +4403,11 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
 SDValue DAGCombiner::visitSRL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
   // fold vector ops
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (VT.isVector()) {
     SDValue FoldedVOp = SimplifyVBinOp(N);
     if (FoldedVOp.getNode()) return FoldedVOp;
@@ -4380,6 +4416,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   }
 
   // fold (srl c1, c2) -> c1 >>u c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SRL, VT, N0C, N1C);
   // fold (srl 0, x) -> 0
@@ -4608,13 +4645,47 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) {
   return SDValue();
 }
 
+
+/// \brief Generate Min/Max node
+static SDValue combineMinNumMaxNum(SDLoc DL, EVT VT, SDValue LHS, SDValue RHS,
+                                   SDValue True, SDValue False,
+                                   ISD::CondCode CC, const TargetLowering &TLI,
+                                   SelectionDAG &DAG) {
+  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
+    return SDValue();
+
+  switch (CC) {
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETLT:
+  case ISD::SETLE:
+  case ISD::SETULT:
+  case ISD::SETULE: {
+    unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
+    if (TLI.isOperationLegal(Opcode, VT))
+      return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+    return SDValue();
+  }
+  case ISD::SETOGT:
+  case ISD::SETOGE:
+  case ISD::SETGT:
+  case ISD::SETGE:
+  case ISD::SETUGT:
+  case ISD::SETUGE: {
+    unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
+    if (TLI.isOperationLegal(Opcode, VT))
+      return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+    return SDValue();
+  }
+  default:
+    return SDValue();
+  }
+}
+
 SDValue DAGCombiner::visitSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue N2 = N->getOperand(2);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
-  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
   EVT VT = N->getValueType(0);
   EVT VT0 = N0.getValueType();
 
@@ -4622,12 +4693,14 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   if (N1 == N2)
     return N1;
   // fold (select true, X, Y) -> X
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C && !N0C->isNullValue())
     return N1;
   // fold (select false, X, Y) -> Y
   if (N0C && N0C->isNullValue())
     return N2;
   // fold (select C, 1, X) -> (or C, X)
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (VT == MVT::i1 && N1C && N1C->getAPIntValue() == 1)
     return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2);
   // fold (select C, 0, 1) -> (xor C, 1)
@@ -4639,6 +4712,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   // undiscoverable (or not reasonably discoverable). For example, it could be
   // in another basic block or it could require searching a complicated
   // expression.
+  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
   if (VT.isInteger() &&
       (VT0 == MVT::i1 || (VT0.isInteger() &&
                           TLI.getBooleanContents(false, false) ==
@@ -4687,6 +4761,28 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
 
   // fold selects based on a setcc into other things, such as min/max/abs
   if (N0.getOpcode() == ISD::SETCC) {
+    // select x, y (fcmp lt x, y) -> fminnum x, y
+    // select x, y (fcmp gt x, y) -> fmaxnum x, y
+    //
+    // This is OK if we don't care about what happens if either operand is a
+    // NaN.
+    //
+
+    // FIXME: Instead of testing for UnsafeFPMath, this should be checking for
+    // no signed zeros as well as no nans.
+    const TargetOptions &Options = DAG.getTarget().Options;
+    if (Options.UnsafeFPMath &&
+        VT.isFloatingPoint() && N0.hasOneUse() &&
+        DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) {
+      ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+
+      SDValue FMinMax =
+          combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1),
+                              N1, N2, CC, TLI, DAG);
+      if (FMinMax)
+        return FMinMax;
+    }
+
     if ((!LegalOperations &&
          TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) ||
         TLI.isOperationLegal(ISD::SELECT_CC, VT))
@@ -4771,6 +4867,166 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
       TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
 }
 
+SDValue DAGCombiner::visitMSTORE(SDNode *N) {
+
+  if (Level >= AfterLegalizeTypes)
+    return SDValue();
+
+  MaskedStoreSDNode *MST = dyn_cast<MaskedStoreSDNode>(N);
+  SDValue Mask = MST->getMask();
+  SDValue Data  = MST->getValue();
+  SDLoc DL(N);
+
+  // If the MSTORE data type requires splitting and the mask is provided by a
+  // SETCC, then split both nodes and its operands before legalization. This
+  // prevents the type legalizer from unrolling SETCC into scalar comparisons
+  // and enables future optimizations (e.g. min/max pattern matching on X86).
+  if (Mask.getOpcode() == ISD::SETCC) {
+
+    // Check if any splitting is required.
+    if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
+        TargetLowering::TypeSplitVector)
+      return SDValue();
+
+    SDValue MaskLo, MaskHi, Lo, Hi;
+    std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
+
+    EVT LoVT, HiVT;
+    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MST->getValueType(0));
+
+    SDValue Chain = MST->getChain();
+    SDValue Ptr   = MST->getBasePtr();
+
+    EVT MemoryVT = MST->getMemoryVT();
+    unsigned Alignment = MST->getOriginalAlignment();
+
+    // if Alignment is equal to the vector size,
+    // take the half of it for the second part
+    unsigned SecondHalfAlignment =
+      (Alignment == Data->getValueType(0).getSizeInBits()/8) ?
+         Alignment/2 : Alignment;
+
+    EVT LoMemVT, HiMemVT;
+    std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+    SDValue DataLo, DataHi;
+    std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+
+    MachineMemOperand *MMO = DAG.getMachineFunction().
+      getMachineMemOperand(MST->getPointerInfo(),
+                           MachineMemOperand::MOStore,  LoMemVT.getStoreSize(),
+                           Alignment, MST->getAAInfo(), MST->getRanges());
+
+    Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
+                            MST->isTruncatingStore());
+
+    unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+    MMO = DAG.getMachineFunction().
+      getMachineMemOperand(MST->getPointerInfo(),
+                           MachineMemOperand::MOStore,  HiMemVT.getStoreSize(),
+                           SecondHalfAlignment, MST->getAAInfo(),
+                           MST->getRanges());
+
+    Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
+                            MST->isTruncatingStore());
+
+    AddToWorklist(Lo.getNode());
+    AddToWorklist(Hi.getNode());
+
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitMLOAD(SDNode *N) {
+
+  if (Level >= AfterLegalizeTypes)
+    return SDValue();
+
+  MaskedLoadSDNode *MLD = dyn_cast<MaskedLoadSDNode>(N);
+  SDValue Mask = MLD->getMask();
+  SDLoc DL(N);
+
+  // If the MLOAD result requires splitting and the mask is provided by a
+  // SETCC, then split both nodes and its operands before legalization. This
+  // prevents the type legalizer from unrolling SETCC into scalar comparisons
+  // and enables future optimizations (e.g. min/max pattern matching on X86).
+
+  if (Mask.getOpcode() == ISD::SETCC) {
+    EVT VT = N->getValueType(0);
+
+    // Check if any splitting is required.
+    if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+        TargetLowering::TypeSplitVector)
+      return SDValue();
+
+    SDValue MaskLo, MaskHi, Lo, Hi;
+    std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
+
+    SDValue Src0 = MLD->getSrc0();
+    SDValue Src0Lo, Src0Hi;
+    std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL);
+
+    EVT LoVT, HiVT;
+    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
+
+    SDValue Chain = MLD->getChain();
+    SDValue Ptr   = MLD->getBasePtr();
+    EVT MemoryVT = MLD->getMemoryVT();
+    unsigned Alignment = MLD->getOriginalAlignment();
+
+    // if Alignment is equal to the vector size,
+    // take the half of it for the second part
+    unsigned SecondHalfAlignment =
+      (Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
+         Alignment/2 : Alignment;
+
+    EVT LoMemVT, HiMemVT;
+    std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+    MachineMemOperand *MMO = DAG.getMachineFunction().
+    getMachineMemOperand(MLD->getPointerInfo(),
+                         MachineMemOperand::MOLoad,  LoMemVT.getStoreSize(),
+                         Alignment, MLD->getAAInfo(), MLD->getRanges());
+
+    Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
+                           ISD::NON_EXTLOAD);
+
+    unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+    MMO = DAG.getMachineFunction().
+    getMachineMemOperand(MLD->getPointerInfo(),
+                         MachineMemOperand::MOLoad,  HiMemVT.getStoreSize(),
+                         SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
+
+    Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
+                           ISD::NON_EXTLOAD);
+
+    AddToWorklist(Lo.getNode());
+    AddToWorklist(Hi.getNode());
+
+    // Build a factor node to remember that this load is independent of the
+    // other one.
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
+                        Hi.getValue(1));
+
+    // Legalized the chain result - switch anything that used the old chain to
+    // use the new one.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);
+
+    SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+
+    SDValue RetOps[] = { LoadRes, Chain };
+    return DAG.getMergeValues(RetOps, DL);
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitVSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -4880,13 +5136,16 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
         return N2;    // cond always true -> true val
       else
         return N3;    // cond always false -> false val
-    }
-
-    // Fold to a simpler select_cc
-    if (SCC.getOpcode() == ISD::SETCC)
+    } else if (SCC->getOpcode() == ISD::UNDEF) {
+      // When the condition is UNDEF, just return the first operand. This is
+      // coherent the DAG creation, no setcc node is created in this case
+      return N2;
+    } else if (SCC.getOpcode() == ISD::SETCC) {
+      // Fold to a simpler select_cc
       return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N2.getValueType(),
                          SCC.getOperand(0), SCC.getOperand(1), N2, N3,
                          SCC.getOperand(2));
+    }
   }
 
   // If we can fold this based on the true/false value, do so.
@@ -5047,6 +5306,102 @@ void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
   }
 }
 
+// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
+SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT DstVT = N->getValueType(0);
+  EVT SrcVT = N0.getValueType();
+
+  assert((N->getOpcode() == ISD::SIGN_EXTEND ||
+          N->getOpcode() == ISD::ZERO_EXTEND) &&
+         "Unexpected node type (not an extend)!");
+
+  // fold (sext (load x)) to multiple smaller sextloads; same for zext.
+  // For example, on a target with legal v4i32, but illegal v8i32, turn:
+  //   (v8i32 (sext (v8i16 (load x))))
+  // into:
+  //   (v8i32 (concat_vectors (v4i32 (sextload x)),
+  //                          (v4i32 (sextload (x + 16)))))
+  // Where uses of the original load, i.e.:
+  //   (v8i16 (load x))
+  // are replaced with:
+  //   (v8i16 (truncate
+  //     (v8i32 (concat_vectors (v4i32 (sextload x)),
+  //                            (v4i32 (sextload (x + 16)))))))
+  //
+  // This combine is only applicable to illegal, but splittable, vectors.
+  // All legal types, and illegal non-vector types, are handled elsewhere.
+  // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
+  //
+  if (N0->getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+
+  if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
+      !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() ||
+      !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+    return SDValue();
+
+  SmallVector<SDNode *, 4> SetCCs;
+  if (!ExtendUsesToFormExtLoad(N, N0, N->getOpcode(), SetCCs, TLI))
+    return SDValue();
+
+  ISD::LoadExtType ExtType =
+      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+
+  // Try to split the vector types to get down to legal types.
+  EVT SplitSrcVT = SrcVT;
+  EVT SplitDstVT = DstVT;
+  while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
+         SplitSrcVT.getVectorNumElements() > 1) {
+    SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
+    SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
+  }
+
+  if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
+    return SDValue();
+
+  SDLoc DL(N);
+  const unsigned NumSplits =
+      DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
+  const unsigned Stride = SplitSrcVT.getStoreSize();
+  SmallVector<SDValue, 4> Loads;
+  SmallVector<SDValue, 4> Chains;
+
+  SDValue BasePtr = LN0->getBasePtr();
+  for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
+    const unsigned Offset = Idx * Stride;
+    const unsigned Align = MinAlign(LN0->getAlignment(), Offset);
+
+    SDValue SplitLoad = DAG.getExtLoad(
+        ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr,
+        LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT,
+        LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(),
+        Align, LN0->getAAInfo());
+
+    BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+                          DAG.getConstant(Stride, BasePtr.getValueType()));
+
+    Loads.push_back(SplitLoad.getValue(0));
+    Chains.push_back(SplitLoad.getValue(1));
+  }
+
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
+
+  CombineTo(N, NewValue);
+
+  // Replace uses of the original load (before extension)
+  // with a truncate of the concatenated sextloaded vectors.
+  SDValue Trunc =
+      DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
+  CombineTo(N0.getNode(), Trunc, NewChain);
+  ExtendSetCCUses(SetCCs, Trunc, NewValue, DL,
+                  (ISD::NodeType)N->getOpcode());
+  return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
 SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -5113,17 +5468,18 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   }
 
   // fold (sext (load x)) -> (sext (truncate (sextload x)))
-  // None of the supported targets knows how to perform load and sign extend
-  // on vectors in one instruction.  We only perform this transformation on
-  // scalars.
-  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()))) {
+  // Only generate vector extloads when 1) they're legal, and 2) they are
+  // deemed desirable by the target.
+  if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      ((!LegalOperations && !VT.isVector() &&
+        !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) {
     bool DoXform = true;
     SmallVector<SDNode*, 4> SetCCs;
     if (!N0.hasOneUse())
       DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
+    if (VT.isVector())
+      DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
@@ -5140,6 +5496,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
     }
   }
 
+  // fold (sext (load x)) to multiple smaller sextloads.
+  // Only on illegal but splittable vectors.
+  if (SDValue ExtLoad = CombineExtLoad(N))
+    return ExtLoad;
+
   // fold (sext (sextload x)) -> (sext (truncate (sextload x)))
   // fold (sext ( extload x)) -> (sext (truncate (sextload x)))
   if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
@@ -5147,7 +5508,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     EVT MemVT = LN0->getMemoryVT();
     if ((!LegalOperations && !LN0->isVolatile()) ||
-        TLI.isLoadExtLegal(ISD::SEXTLOAD, MemVT)) {
+        TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
                                        LN0->getBasePtr(), MemVT,
@@ -5167,7 +5528,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
        N0.getOpcode() == ISD::XOR) &&
       isa<LoadSDNode>(N0.getOperand(0)) &&
       N0.getOperand(1).getOpcode() == ISD::Constant &&
-      TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()) &&
+      TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()) &&
       (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
     if (LN0->getExtensionType() != ISD::ZEXTLOAD && LN0->isUnindexed()) {
@@ -5403,17 +5764,18 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   }
 
   // fold (zext (load x)) -> (zext (truncate (zextload x)))
-  // None of the supported targets knows how to perform load and vector_zext
-  // on vectors in one instruction.  We only perform this transformation on
-  // scalars.
-  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()))) {
+  // Only generate vector extloads when 1) they're legal, and 2) they are
+  // deemed desirable by the target.
+  if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      ((!LegalOperations && !VT.isVector() &&
+        !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) {
     bool DoXform = true;
     SmallVector<SDNode*, 4> SetCCs;
     if (!N0.hasOneUse())
       DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
+    if (VT.isVector())
+      DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
@@ -5431,13 +5793,18 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
     }
   }
 
+  // fold (zext (load x)) to multiple smaller zextloads.
+  // Only on illegal but splittable vectors.
+  if (SDValue ExtLoad = CombineExtLoad(N))
+    return ExtLoad;
+
   // fold (zext (and/or/xor (load x), cst)) ->
   //      (and/or/xor (zextload x), (zext cst))
   if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
        N0.getOpcode() == ISD::XOR) &&
       isa<LoadSDNode>(N0.getOperand(0)) &&
       N0.getOperand(1).getOpcode() == ISD::Constant &&
-      TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()) &&
+      TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()) &&
       (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
     if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) {
@@ -5474,7 +5841,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     EVT MemVT = LN0->getMemoryVT();
     if ((!LegalOperations && !LN0->isVolatile()) ||
-        TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT)) {
+        TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT)) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
                                        LN0->getBasePtr(), MemVT,
@@ -5636,7 +6003,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
   // scalars.
   if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
       ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) {
+      TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
     bool DoXform = true;
     SmallVector<SDNode*, 4> SetCCs;
     if (!N0.hasOneUse())
@@ -5666,7 +6033,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     ISD::LoadExtType ExtType = LN0->getExtensionType();
     EVT MemVT = LN0->getMemoryVT();
-    if (!LegalOperations || TLI.isLoadExtLegal(ExtType, MemVT)) {
+    if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
       SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
                                        VT, LN0->getChain(), LN0->getBasePtr(),
                                        MemVT, LN0->getMemOperand());
@@ -5795,7 +6162,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
     ExtVT = EVT::getIntegerVT(*DAG.getContext(),
                               VT.getSizeInBits() - N01->getZExtValue());
   }
-  if (LegalOperations && !TLI.isLoadExtLegal(ExtType, ExtVT))
+  if (LegalOperations && !TLI.isLoadExtLegal(ExtType, VT, ExtVT))
     return SDValue();
 
   unsigned EVTBits = ExtVT.getSizeInBits();
@@ -5874,6 +6241,9 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
       LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
     return SDValue();
 
+  if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT))
+    return SDValue();
+
   EVT PtrType = N0.getOperand(1).getValueType();
 
   if (PtrType == MVT::Untyped || PtrType.isExtended())
@@ -5999,7 +6369,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
       ISD::isUNINDEXEDLoad(N0.getNode()) &&
       EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
@@ -6015,7 +6385,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
       N0.hasOneUse() &&
       EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
@@ -6318,19 +6688,15 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
 
   // If the input is a constant, let getNode fold it.
   if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
-    SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), VT, N0);
-    if (Res.getNode() != N) {
-      if (!LegalOperations ||
-          TLI.isOperationLegal(Res.getNode()->getOpcode(), VT))
-        return Res;
-
-      // Folding it resulted in an illegal node, and it's too late to
-      // do that. Clean up the old node and forego the transformation.
-      // Ideally this won't happen very often, because instcombine
-      // and the earlier dagcombine runs (where illegal nodes are
-      // permitted) should have folded most of them already.
-      deleteAndRecombine(Res.getNode());
-    }
+    // If we can't allow illegal operations, we need to check that this is just
+    // a fp -> int or int -> conversion and that the resulting operation will
+    // be legal.
+    if (!LegalOperations ||
+        (isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
+         TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
+        (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
+         TLI.isOperationLegal(ISD::Constant, VT)))
+      return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, N0);
   }
 
   // (conv (conv x, t1), t2) -> (conv x, t2)
@@ -6489,7 +6855,6 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
   if (SrcEltVT.isFloatingPoint()) {
     // Convert the input float vector to a int vector where the elements are the
     // same sizes.
-    assert((SrcEltVT == MVT::f32 || SrcEltVT == MVT::f64) && "Unknown FP VT!");
     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
     BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
     SrcEltVT = IntVT;
@@ -6498,7 +6863,6 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
   // Now we know the input is an integer vector.  If the output is a FP type,
   // convert to integer first, then to FP of the right size.
   if (DstEltVT.isFloatingPoint()) {
-    assert((DstEltVT == MVT::f32 || DstEltVT == MVT::f64) && "Unknown FP VT!");
     EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
     SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();
 
@@ -6549,8 +6913,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
 
   for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
     if (BV->getOperand(i).getOpcode() == ISD::UNDEF) {
-      for (unsigned j = 0; j != NumOutputsPerInput; ++j)
-        Ops.push_back(DAG.getUNDEF(DstEltVT));
+      Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT));
       continue;
     }
 
@@ -6575,6 +6938,133 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops);
 }
 
+// Attempt different variants of (fadd (fmul a, b), c) -> fma or fmad
+static SDValue performFaddFmulCombines(unsigned FusedOpcode,
+                                       bool Aggressive,
+                                       SDNode *N,
+                                       const TargetLowering &TLI,
+                                       SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  // fold (fadd (fmul x, y), z) -> (fma x, y, z)
+  if (N0.getOpcode() == ISD::FMUL &&
+      (Aggressive || N0->hasOneUse())) {
+    return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                       N0.getOperand(0), N0.getOperand(1), N1);
+  }
+
+  // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
+  // Note: Commutes FADD operands.
+  if (N1.getOpcode() == ISD::FMUL &&
+      (Aggressive || N1->hasOneUse())) {
+    return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                       N1.getOperand(0), N1.getOperand(1), N0);
+  }
+
+  // More folding opportunities when target permits.
+  if (Aggressive) {
+    // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z))
+    if (N0.getOpcode() == ISD::FMA &&
+        N0.getOperand(2).getOpcode() == ISD::FMUL) {
+      return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                         N0.getOperand(0), N0.getOperand(1),
+                         DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                                     N0.getOperand(2).getOperand(0),
+                                     N0.getOperand(2).getOperand(1),
+                                     N1));
+    }
+
+    // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x))
+    if (N1->getOpcode() == ISD::FMA &&
+        N1.getOperand(2).getOpcode() == ISD::FMUL) {
+      return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                         N1.getOperand(0), N1.getOperand(1),
+                         DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                                     N1.getOperand(2).getOperand(0),
+                                     N1.getOperand(2).getOperand(1),
+                                     N0));
+    }
+  }
+
+  return SDValue();
+}
+
+static SDValue performFsubFmulCombines(unsigned FusedOpcode,
+                                       bool Aggressive,
+                                       SDNode *N,
+                                       const TargetLowering &TLI,
+                                       SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  SDLoc SL(N);
+
+  // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+  if (N0.getOpcode() == ISD::FMUL &&
+      (Aggressive || N0->hasOneUse())) {
+    return DAG.getNode(FusedOpcode, SL, VT,
+                       N0.getOperand(0), N0.getOperand(1),
+                       DAG.getNode(ISD::FNEG, SL, VT, N1));
+  }
+
+  // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+  // Note: Commutes FSUB operands.
+  if (N1.getOpcode() == ISD::FMUL &&
+      (Aggressive || N1->hasOneUse()))
+    return DAG.getNode(FusedOpcode, SL, VT,
+                       DAG.getNode(ISD::FNEG, SL, VT,
+                                   N1.getOperand(0)),
+                       N1.getOperand(1), N0);
+
+  // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
+  if (N0.getOpcode() == ISD::FNEG &&
+      N0.getOperand(0).getOpcode() == ISD::FMUL &&
+      (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
+    SDValue N00 = N0.getOperand(0).getOperand(0);
+    SDValue N01 = N0.getOperand(0).getOperand(1);
+    return DAG.getNode(FusedOpcode, SL, VT,
+                       DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
+                       DAG.getNode(ISD::FNEG, SL, VT, N1));
+  }
+
+  // More folding opportunities when target permits.
+  if (Aggressive) {
+    // fold (fsub (fma x, y, (fmul u, v)), z)
+    //   -> (fma x, y (fma u, v, (fneg z)))
+    if (N0.getOpcode() == FusedOpcode &&
+        N0.getOperand(2).getOpcode() == ISD::FMUL) {
+      return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                         N0.getOperand(0), N0.getOperand(1),
+                         DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                                     N0.getOperand(2).getOperand(0),
+                                     N0.getOperand(2).getOperand(1),
+                                     DAG.getNode(ISD::FNEG, SDLoc(N), VT,
+                                                 N1)));
+    }
+
+    // fold (fsub x, (fma y, z, (fmul u, v)))
+    //   -> (fma (fneg y), z, (fma (fneg u), v, x))
+    if (N1.getOpcode() == FusedOpcode &&
+        N1.getOperand(2).getOpcode() == ISD::FMUL) {
+      SDValue N20 = N1.getOperand(2).getOperand(0);
+      SDValue N21 = N1.getOperand(2).getOperand(1);
+      return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                         DAG.getNode(ISD::FNEG, SDLoc(N), VT,
+                                     N1.getOperand(0)),
+                         N1.getOperand(1),
+                         DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                                     DAG.getNode(ISD::FNEG, SDLoc(N),  VT,
+                                                 N20),
+                                     N21, N0));
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitFADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -6714,23 +7204,55 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
     }
   } // enable-unsafe-fp-math
 
+  if (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)) {
+    // Assume if there is an fmad instruction that it should be aggressively
+    // used.
+    if (SDValue Fused = performFaddFmulCombines(ISD::FMAD, true, N, TLI, DAG))
+      return Fused;
+  }
+
   // FADD -> FMA combines:
   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
       TLI.isFMAFasterThanFMulAndFAdd(VT) &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
-    // fold (fadd (fmul x, y), z) -> (fma x, y, z)
-    if (N0.getOpcode() == ISD::FMUL &&
-        (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
-      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
-                         N0.getOperand(0), N0.getOperand(1), N1);
+    if (!TLI.isOperationLegal(ISD::FMAD, VT)) {
+      // Don't form FMA if we are preferring FMAD.
+      if (SDValue Fused
+          = performFaddFmulCombines(ISD::FMA,
+                                    TLI.enableAggressiveFMAFusion(VT),
+                                    N, TLI, DAG)) {
+        return Fused;
+      }
+    }
 
-    // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
-    // Note: Commutes FADD operands.
-    if (N1.getOpcode() == ISD::FMUL &&
-        (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
-      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
-                         N1.getOperand(0), N1.getOperand(1), N0);
+    // When FP_EXTEND nodes are free on the target, and there is an opportunity
+    // to combine into FMA, arrange such nodes accordingly.
+    if (TLI.isFPExtFree(VT)) {
+
+      // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
+      if (N0.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N00 = N0.getOperand(0);
+        if (N00.getOpcode() == ISD::FMUL)
+          return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N00.getOperand(0)),
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N00.getOperand(1)), N1);
+      }
+
+      // fold (fadd x, (fpext (fmul y, z)), z) -> (fma (fpext y), (fpext z), x)
+      // Note: Commutes FADD operands.
+      if (N1.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N10 = N1.getOperand(0);
+        if (N10.getOpcode() == ISD::FMUL)
+          return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N10.getOperand(0)),
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N10.getOperand(1)), N0);
+      }
+    }
   }
 
   return SDValue();
@@ -6792,37 +7314,95 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
     }
   }
 
+  if (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)) {
+    // Assume if there is an fmad instruction that it should be aggressively
+    // used.
+    if (SDValue Fused = performFsubFmulCombines(ISD::FMAD, true, N, TLI, DAG))
+      return Fused;
+  }
+
   // FSUB -> FMA combines:
   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
       TLI.isFMAFasterThanFMulAndFAdd(VT) &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
-    // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
-    if (N0.getOpcode() == ISD::FMUL &&
-        (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
-      return DAG.getNode(ISD::FMA, dl, VT,
-                         N0.getOperand(0), N0.getOperand(1),
-                         DAG.getNode(ISD::FNEG, dl, VT, N1));
-
-    // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
-    // Note: Commutes FSUB operands.
-    if (N1.getOpcode() == ISD::FMUL &&
-        (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
-      return DAG.getNode(ISD::FMA, dl, VT,
-                         DAG.getNode(ISD::FNEG, dl, VT,
-                         N1.getOperand(0)),
-                         N1.getOperand(1), N0);
-
-    // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
-    if (N0.getOpcode() == ISD::FNEG &&
-        N0.getOperand(0).getOpcode() == ISD::FMUL &&
-        ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) ||
-            TLI.enableAggressiveFMAFusion(VT))) {
-      SDValue N00 = N0.getOperand(0).getOperand(0);
-      SDValue N01 = N0.getOperand(0).getOperand(1);
-      return DAG.getNode(ISD::FMA, dl, VT,
-                         DAG.getNode(ISD::FNEG, dl, VT, N00), N01,
-                         DAG.getNode(ISD::FNEG, dl, VT, N1));
+    if (!TLI.isOperationLegal(ISD::FMAD, VT)) {
+      // Don't form FMA if we are preferring FMAD.
+
+      if (SDValue Fused
+          = performFsubFmulCombines(ISD::FMA,
+                                    TLI.enableAggressiveFMAFusion(VT),
+                                    N, TLI, DAG)) {
+        return Fused;
+      }
+    }
+
+    // When FP_EXTEND nodes are free on the target, and there is an opportunity
+    // to combine into FMA, arrange such nodes accordingly.
+    if (TLI.isFPExtFree(VT)) {
+      // fold (fsub (fpext (fmul x, y)), z)
+      //   -> (fma (fpext x), (fpext y), (fneg z))
+      if (N0.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N00 = N0.getOperand(0);
+        if (N00.getOpcode() == ISD::FMUL)
+          return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N00.getOperand(0)),
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N00.getOperand(1)),
+                             DAG.getNode(ISD::FNEG, SDLoc(N), VT, N1));
+      }
+
+      // fold (fsub x, (fpext (fmul y, z)))
+      //   -> (fma (fneg (fpext y)), (fpext z), x)
+      // Note: Commutes FSUB operands.
+      if (N1.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N10 = N1.getOperand(0);
+        if (N10.getOpcode() == ISD::FMUL)
+          return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                             DAG.getNode(ISD::FNEG, SDLoc(N), VT,
+                                         DAG.getNode(ISD::FP_EXTEND, SDLoc(N),
+                                                     VT, N10.getOperand(0))),
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N10.getOperand(1)),
+                             N0);
+      }
+
+      // fold (fsub (fpext (fneg (fmul, x, y))), z)
+      //   -> (fma (fneg (fpext x)), (fpext y), (fneg z))
+      if (N0.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N00 = N0.getOperand(0);
+        if (N00.getOpcode() == ISD::FNEG) {
+          SDValue N000 = N00.getOperand(0);
+          if (N000.getOpcode() == ISD::FMUL) {
+            return DAG.getNode(ISD::FMA, dl, VT,
+                               DAG.getNode(ISD::FNEG, dl, VT,
+                                           DAG.getNode(ISD::FP_EXTEND, SDLoc(N),
+                                                       VT, N000.getOperand(0))),
+                               DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                           N000.getOperand(1)),
+                               DAG.getNode(ISD::FNEG, dl, VT, N1));
+          }
+        }
+      }
+
+      // fold (fsub (fneg (fpext (fmul, x, y))), z)
+      //   -> (fma (fneg (fpext x)), (fpext y), (fneg z))
+      if (N0.getOpcode() == ISD::FNEG) {
+        SDValue N00 = N0.getOperand(0);
+        if (N00.getOpcode() == ISD::FP_EXTEND) {
+          SDValue N000 = N00.getOperand(0);
+          if (N000.getOpcode() == ISD::FMUL) {
+            return DAG.getNode(ISD::FMA, dl, VT,
+                               DAG.getNode(ISD::FNEG, dl, VT,
+                                           DAG.getNode(ISD::FP_EXTEND, SDLoc(N),
+                                           VT, N000.getOperand(0))),
+                               DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                           N000.getOperand(1)),
+                               DAG.getNode(ISD::FNEG, dl, VT, N1));
+          }
+        }
+      }
     }
   }
 
@@ -7104,6 +7684,44 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
     }
   }
 
+  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+  // reciprocal.
+  // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
+  // Notice that this is not always beneficial. One reason is different target
+  // may have different costs for FDIV and FMUL, so sometimes the cost of two
+  // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
+  // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
+  if (Options.UnsafeFPMath) {
+    // Skip if current node is a reciprocal.
+    if (N0CFP && N0CFP->isExactlyValue(1.0))
+      return SDValue();
+
+    SmallVector<SDNode *, 4> Users;
+    // Find all FDIV users of the same divisor.
+    for (SDNode::use_iterator UI = N1.getNode()->use_begin(),
+                              UE = N1.getNode()->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = UI.getUse().getUser();
+      if (User->getOpcode() == ISD::FDIV && User->getOperand(1) == N1)
+        Users.push_back(User);
+    }
+
+    if (TLI.combineRepeatedFPDivisors(Users.size())) {
+      SDValue FPOne = DAG.getConstantFP(1.0, VT); // floating point 1.0
+      SDValue Reciprocal = DAG.getNode(ISD::FDIV, SDLoc(N), VT, FPOne, N1);
+
+      // Dividend / Divisor -> Dividend * Reciprocal
+      for (auto I = Users.begin(), E = Users.end(); I != E; ++I) {
+        if ((*I)->getOperand(0) != FPOne) {
+          SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(*I), VT,
+                                        (*I)->getOperand(0), Reciprocal);
+          DAG.ReplaceAllUsesWith(*I, NewNode.getNode());
+        }
+      }
+      return SDValue();
+    }
+  }
+
   return SDValue();
 }
 
@@ -7122,7 +7740,8 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
 }
 
 SDValue DAGCombiner::visitFSQRT(SDNode *N) {
-  if (DAG.getTarget().Options.UnsafeFPMath) {
+  if (DAG.getTarget().Options.UnsafeFPMath &&
+      !TLI.isFsqrtCheap()) {
     // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
     if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) {
       EVT VT = RV.getValueType();
@@ -7198,11 +7817,11 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
 
 SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
   SDValue N0 = N->getOperand(0);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   EVT VT = N->getValueType(0);
   EVT OpVT = N0.getValueType();
 
   // fold (sint_to_fp c1) -> c1fp
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C &&
       // ...but only if the target supports immediate floating-point values
       (!LegalOperations ||
@@ -7251,11 +7870,11 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
 
 SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
   SDValue N0 = N->getOperand(0);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   EVT VT = N->getValueType(0);
   EVT OpVT = N0.getValueType();
 
   // fold (uint_to_fp c1) -> c1fp
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C &&
       // ...but only if the target supports immediate floating-point values
       (!LegalOperations ||
@@ -7289,6 +7908,50 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
   return SDValue();
 }
 
+// Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x
+static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP)
+    return SDValue();
+
+  SDValue Src = N0.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP;
+  bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+  // We can safely assume the conversion won't overflow the output range,
+  // because (for example) (uint8_t)18293.f is undefined behavior.
+
+  // Since we can assume the conversion won't overflow, our decision as to
+  // whether the input will fit in the float should depend on the minimum
+  // of the input range and output range.
+
+  // This means this is also safe for a signed input and unsigned output, since
+  // a negative input would lead to undefined behavior.
+  unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned;
+  unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned;
+  unsigned ActualSize = std::min(InputSize, OutputSize);
+  const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType());
+
+  // We can only fold away the float conversion if the input range can be
+  // represented exactly in the float range.
+  if (APFloat::semanticsPrecision(sem) >= ActualSize) {
+    if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) {
+      unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND
+                                                       : ISD::ZERO_EXTEND;
+      return DAG.getNode(ExtOp, SDLoc(N), VT, Src);
+    }
+    if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits())
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src);
+    if (SrcVT == VT)
+      return Src;
+    return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Src);
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
@@ -7298,7 +7961,7 @@ SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
   if (N0CFP)
     return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0);
 
-  return SDValue();
+  return FoldIntToFPToInt(N, DAG);
 }
 
 SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
@@ -7310,7 +7973,7 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
   if (N0CFP)
     return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0);
 
-  return SDValue();
+  return FoldIntToFPToInt(N, DAG);
 }
 
 SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
@@ -7329,11 +7992,16 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
 
   // fold (fp_round (fp_round x)) -> (fp_round x)
   if (N0.getOpcode() == ISD::FP_ROUND) {
-    // This is a value preserving truncation if both round's are.
-    bool IsTrunc = N->getConstantOperandVal(1) == 1 &&
-                   N0.getNode()->getConstantOperandVal(1) == 1;
-    return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0.getOperand(0),
-                       DAG.getIntPtrConstant(IsTrunc));
+    const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
+    const bool N0IsTrunc = N0.getNode()->getConstantOperandVal(1) == 1;
+    // If the first fp_round isn't a value preserving truncation, it might
+    // introduce a tie in the second fp_round, that wouldn't occur in the
+    // single-step fp_round we want to fold to.
+    // In other words, double rounding isn't the same as rounding.
+    // Also, this is a value preserving truncation iff both fp_round's are.
+    if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc)
+      return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0.getOperand(0),
+                         DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc));
   }
 
   // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
@@ -7391,7 +8059,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
 
   // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
-       TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) {
+       TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
@@ -8923,7 +9591,7 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
   if (NotMaskLZ == 64) return Result;  // All zero mask.
 
   // See if we have a continuous run of bits.  If so, we have 0*1+0*
-  if (CountTrailingOnes_64(NotMask >> NotMaskTZ)+NotMaskTZ+NotMaskLZ != 64)
+  if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
     return Result;
 
   // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
@@ -9070,9 +9738,12 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
     unsigned NewBW = NextPowerOf2(MSB - ShAmt);
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
+    // The narrowing should be profitable, the load/store operation should be
+    // legal (or custom) and the store size should be equal to the NewVT width.
     while (NewBW < BitWidth &&
-           !(TLI.isOperationLegalOrCustom(Opc, NewVT) &&
-             TLI.isNarrowingProfitable(VT, NewVT))) {
+           (NewVT.getStoreSizeInBits() != NewBW ||
+            !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+            !TLI.isNarrowingProfitable(VT, NewVT))) {
       NewBW = NextPowerOf2(NewBW);
       NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     }
@@ -9272,36 +9943,139 @@ struct BaseIndexOffset {
   }
 };
 
-/// Holds a pointer to an LSBaseSDNode as well as information on where it
-/// is located in a sequence of memory operations connected by a chain.
-struct MemOpLink {
-  MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
-    MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
-  // Ptr to the mem node.
-  LSBaseSDNode *MemNode;
-  // Offset from the base ptr.
-  int64_t OffsetFromBase;
-  // What is the sequence number of this mem node.
-  // Lowest mem operand in the DAG starts at zero.
-  unsigned SequenceNum;
-};
+bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
+                  SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
+                  unsigned NumElem, bool IsConstantSrc, bool UseVector) {
+  // Make sure we have something to merge.
+  if (NumElem < 2)
+    return false;
+
+  int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
+  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+  unsigned EarliestNodeUsed = 0;
+
+  for (unsigned i=0; i < NumElem; ++i) {
+    // Find a chain for the new wide-store operand. Notice that some
+    // of the store nodes that we found may not be selected for inclusion
+    // in the wide store. The chain we use needs to be the chain of the
+    // earliest store node which is *used* and replaced by the wide store.
+    if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
+      EarliestNodeUsed = i;
+  }
+
+  // The earliest Node in the DAG.
+  LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
+  SDLoc DL(StoreNodes[0].MemNode);
+
+  SDValue StoredVal;
+  if (UseVector) {
+    // Find a legal type for the vector store.
+    EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+    assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
+    if (IsConstantSrc) {
+      // A vector store with a constant source implies that the constant is
+      // zero; we only handle merging stores of constant zeros because the zero
+      // can be materialized without a load.
+      // It may be beneficial to loosen this restriction to allow non-zero
+      // store merging.
+      StoredVal = DAG.getConstant(0, Ty);
+    } else {
+      SmallVector<SDValue, 8> Ops;
+      for (unsigned i = 0; i < NumElem ; ++i) {
+        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+        SDValue Val = St->getValue();
+        // All of the operands of a BUILD_VECTOR must have the same type.
+        if (Val.getValueType() != MemVT)
+          return false;
+        Ops.push_back(Val);
+      }
+
+      // Build the extracted vector elements back into a vector.
+      StoredVal = DAG.getNode(ISD::BUILD_VECTOR, DL, Ty, Ops);
+    }
+  } else {
+    // We should always use a vector store when merging extracted vector
+    // elements, so this path implies a store of constants.
+    assert(IsConstantSrc && "Merged vector elements should use vector store");
+
+    unsigned StoreBW = NumElem * ElementSizeBytes * 8;
+    APInt StoreInt(StoreBW, 0);
+
+    // Construct a single integer constant which is made of the smaller
+    // constant inputs.
+    bool IsLE = TLI.isLittleEndian();
+    for (unsigned i = 0; i < NumElem ; ++i) {
+      unsigned Idx = IsLE ? (NumElem - 1 - i) : i;
+      StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
+      SDValue Val = St->getValue();
+      StoreInt <<= ElementSizeBytes*8;
+      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
+        StoreInt |= C->getAPIntValue().zext(StoreBW);
+      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
+        StoreInt |= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
+      } else {
+        llvm_unreachable("Invalid constant element type");
+      }
+    }
+
+    // Create the new Load and Store operations.
+    EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+    StoredVal = DAG.getConstant(StoreInt, StoreTy);
+  }
+
+  SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
+                                  FirstInChain->getBasePtr(),
+                                  FirstInChain->getPointerInfo(),
+                                  false, false,
+                                  FirstInChain->getAlignment());
+
+  // Replace the first store with the new store
+  CombineTo(EarliestOp, NewStore);
+  // Erase all other stores.
+  for (unsigned i = 0; i < NumElem ; ++i) {
+    if (StoreNodes[i].MemNode == EarliestOp)
+      continue;
+    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+    // ReplaceAllUsesWith will replace all uses that existed when it was
+    // called, but graph optimizations may cause new ones to appear. For
+    // example, the case in pr14333 looks like
+    //
+    //  St's chain -> St -> another store -> X
+    //
+    // And the only difference from St to the other store is the chain.
+    // When we change it's chain to be St's chain they become identical,
+    // get CSEed and the net result is that X is now a use of St.
+    // Since we know that St is redundant, just iterate.
+    while (!St->use_empty())
+      DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
+    deleteAndRecombine(St);
+  }
+
+  return true;
+}
 
 bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
+  if (OptLevel == CodeGenOpt::None)
+    return false;
+
   EVT MemVT = St->getMemoryVT();
   int64_t ElementSizeBytes = MemVT.getSizeInBits()/8;
-  bool NoVectors = DAG.getMachineFunction().getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute(
+      Attribute::NoImplicitFloat);
 
   // Don't merge vectors into wider inputs.
   if (MemVT.isVector() || !MemVT.isSimple())
     return false;
 
   // Perform an early exit check. Do not bother looking at stored values that
-  // are not constants or loads.
+  // are not constants, loads, or extracted vector elements.
   SDValue StoredVal = St->getValue();
   bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
-  if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) &&
-      !IsLoadSrc)
+  bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
+                       isa<ConstantFPSDNode>(StoredVal);
+  bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+
+  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc)
     return false;
 
   // Only look at ends of store sequences.
@@ -9443,7 +10217,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
   LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
 
   // Store the constants into memory as one consecutive store.
-  if (!IsLoadSrc) {
+  if (IsConstantSrc) {
     unsigned LastLegalType = 0;
     unsigned LastLegalVectorType = 0;
     bool NonZero = false;
@@ -9492,85 +10266,33 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
     bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
     unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
 
-    // Make sure we have something to merge.
-    if (NumElem < 2)
-      return false;
-
-    unsigned EarliestNodeUsed = 0;
-    for (unsigned i=0; i < NumElem; ++i) {
-      // Find a chain for the new wide-store operand. Notice that some
-      // of the store nodes that we found may not be selected for inclusion
-      // in the wide store. The chain we use needs to be the chain of the
-      // earliest store node which is *used* and replaced by the wide store.
-      if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
-        EarliestNodeUsed = i;
-    }
+    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+                                           true, UseVector);
+  }
 
-    // The earliest Node in the DAG.
-    LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
-    SDLoc DL(StoreNodes[0].MemNode);
+  // When extracting multiple vector elements, try to store them
+  // in one vector store rather than a sequence of scalar stores.
+  if (IsExtractVecEltSrc) {
+    unsigned NumElem = 0;
+    for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
+      StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);
+      SDValue StoredVal = St->getValue();
+      // This restriction could be loosened.
+      // Bail out if any stored values are not elements extracted from a vector.
+      // It should be possible to handle mixed sources, but load sources need
+      // more careful handling (see the block of code below that handles
+      // consecutive loads).
+      if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+        return false;
 
-    SDValue StoredVal;
-    if (UseVector) {
       // Find a legal type for the vector store.
-      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
-      assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
-      StoredVal = DAG.getConstant(0, Ty);
-    } else {
-      unsigned StoreBW = NumElem * ElementSizeBytes * 8;
-      APInt StoreInt(StoreBW, 0);
-
-      // Construct a single integer constant which is made of the smaller
-      // constant inputs.
-      bool IsLE = TLI.isLittleEndian();
-      for (unsigned i = 0; i < NumElem ; ++i) {
-        unsigned Idx = IsLE ?(NumElem - 1 - i) : i;
-        StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
-        SDValue Val = St->getValue();
-        StoreInt<<=ElementSizeBytes*8;
-        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
-          StoreInt|=C->getAPIntValue().zext(StoreBW);
-        } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
-          StoreInt|= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
-        } else {
-          assert(false && "Invalid constant element type");
-        }
-      }
-
-      // Create the new Load and Store operations.
-      EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
-      StoredVal = DAG.getConstant(StoreInt, StoreTy);
-    }
-
-    SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
-                                    FirstInChain->getBasePtr(),
-                                    FirstInChain->getPointerInfo(),
-                                    false, false,
-                                    FirstInChain->getAlignment());
-
-    // Replace the first store with the new store
-    CombineTo(EarliestOp, NewStore);
-    // Erase all other stores.
-    for (unsigned i = 0; i < NumElem ; ++i) {
-      if (StoreNodes[i].MemNode == EarliestOp)
-        continue;
-      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
-      // ReplaceAllUsesWith will replace all uses that existed when it was
-      // called, but graph optimizations may cause new ones to appear. For
-      // example, the case in pr14333 looks like
-      //
-      //  St's chain -> St -> another store -> X
-      //
-      // And the only difference from St to the other store is the chain.
-      // When we change it's chain to be St's chain they become identical,
-      // get CSEed and the net result is that X is now a use of St.
-      // Since we know that St is redundant, just iterate.
-      while (!St->use_empty())
-        DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
-      deleteAndRecombine(St);
+      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+      if (TLI.isTypeLegal(Ty))
+        NumElem = i + 1;
     }
 
-    return true;
+    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+                                           false, true);
   }
 
   // Below we handle the case of multiple consecutive stores that
@@ -9668,9 +10390,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
       EVT LegalizedStoredValueTy =
         TLI.getTypeToTransformTo(*DAG.getContext(), StoreTy);
       if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
-          TLI.isLoadExtLegal(ISD::ZEXTLOAD, StoreTy) &&
-          TLI.isLoadExtLegal(ISD::SEXTLOAD, StoreTy) &&
-          TLI.isLoadExtLegal(ISD::EXTLOAD, StoreTy))
+          TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
+          TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
+          TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy))
         LastLegalIntegerType = i+1;
     }
   }
@@ -10108,7 +10830,8 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
   if (ResultVT.bitsGT(VecEltVT)) {
     // If the result type of vextract is wider than the load, then issue an
     // extending load instead.
-    ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, VecEltVT)
+    ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT,
+                                                  VecEltVT)
                                    ? ISD::ZEXTLOAD
                                    : ISD::EXTLOAD;
     Load = DAG.getExtLoad(
@@ -10474,6 +11197,11 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
   if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
     return SDValue();
 
+  // Just because the floating-point vector type is legal does not necessarily
+  // mean that the corresponding integer vector type is.
+  if (!isTypeLegal(NVT))
+    return SDValue();
+
   SmallVector<SDValue, 8> Opnds;
   for (unsigned i = 0; i != NumInScalars; ++i) {
     SDValue In = N->getOperand(i);
@@ -10519,26 +11247,37 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
     return SDValue();
 
   SDValue VecIn1, VecIn2;
+  bool UsesZeroVector = false;
   for (unsigned i = 0; i != NumInScalars; ++i) {
+    SDValue Op = N->getOperand(i);
     // Ignore undef inputs.
-    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+    if (Op.getOpcode() == ISD::UNDEF) continue;
+
+    // See if we can combine this build_vector into a blend with a zero vector.
+    if (!VecIn2.getNode() && ((Op.getOpcode() == ISD::Constant &&
+        cast<ConstantSDNode>(Op.getNode())->isNullValue()) ||
+        (Op.getOpcode() == ISD::ConstantFP &&
+        cast<ConstantFPSDNode>(Op.getNode())->getValueAPF().isZero()))) {
+      UsesZeroVector = true;
+      continue;
+    }
 
     // If this input is something other than a EXTRACT_VECTOR_ELT with a
     // constant index, bail out.
-    if (N->getOperand(i).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        !isa<ConstantSDNode>(N->getOperand(i).getOperand(1))) {
+    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Op.getOperand(1))) {
       VecIn1 = VecIn2 = SDValue(nullptr, 0);
       break;
     }
 
     // We allow up to two distinct input vectors.
-    SDValue ExtractedFromVec = N->getOperand(i).getOperand(0);
+    SDValue ExtractedFromVec = Op.getOperand(0);
     if (ExtractedFromVec == VecIn1 || ExtractedFromVec == VecIn2)
       continue;
 
     if (!VecIn1.getNode()) {
       VecIn1 = ExtractedFromVec;
-    } else if (!VecIn2.getNode()) {
+    } else if (!VecIn2.getNode() && !UsesZeroVector) {
       VecIn2 = ExtractedFromVec;
     } else {
       // Too many inputs.
@@ -10549,55 +11288,93 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
 
   // If everything is good, we can make a shuffle operation.
   if (VecIn1.getNode()) {
+    unsigned InNumElements = VecIn1.getValueType().getVectorNumElements();
     SmallVector<int, 8> Mask;
     for (unsigned i = 0; i != NumInScalars; ++i) {
-      if (N->getOperand(i).getOpcode() == ISD::UNDEF) {
+      unsigned Opcode = N->getOperand(i).getOpcode();
+      if (Opcode == ISD::UNDEF) {
         Mask.push_back(-1);
         continue;
       }
 
+      // Operands can also be zero.
+      if (Opcode != ISD::EXTRACT_VECTOR_ELT) {
+        assert(UsesZeroVector &&
+               (Opcode == ISD::Constant || Opcode == ISD::ConstantFP) &&
+               "Unexpected node found!");
+        Mask.push_back(NumInScalars+i);
+        continue;
+      }
+
       // If extracting from the first vector, just use the index directly.
       SDValue Extract = N->getOperand(i);
       SDValue ExtVal = Extract.getOperand(1);
+      unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
       if (Extract.getOperand(0) == VecIn1) {
-        unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
-        if (ExtIndex > VT.getVectorNumElements())
-          return SDValue();
-
         Mask.push_back(ExtIndex);
         continue;
       }
 
-      // Otherwise, use InIdx + VecSize
-      unsigned Idx = cast<ConstantSDNode>(ExtVal)->getZExtValue();
-      Mask.push_back(Idx+NumInScalars);
+      // Otherwise, use InIdx + InputVecSize
+      Mask.push_back(InNumElements + ExtIndex);
     }
 
+    // Avoid introducing illegal shuffles with zero.
+    if (UsesZeroVector && !TLI.isVectorClearMaskLegal(Mask, VT))
+      return SDValue();
+
     // We can't generate a shuffle node with mismatched input and output types.
     // Attempt to transform a single input vector to the correct type.
     if ((VT != VecIn1.getValueType())) {
-      // We don't support shuffeling between TWO values of different types.
-      if (VecIn2.getNode())
+      // If the input vector type has a different base type to the output
+      // vector type, bail out.
+      EVT VTElemType = VT.getVectorElementType();
+      if ((VecIn1.getValueType().getVectorElementType() != VTElemType) ||
+          (VecIn2.getNode() &&
+           (VecIn2.getValueType().getVectorElementType() != VTElemType)))
         return SDValue();
 
+      // If the input vector is too small, widen it.
       // We only support widening of vectors which are half the size of the
       // output registers. For example XMM->YMM widening on X86 with AVX.
-      if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits())
-        return SDValue();
+      EVT VecInT = VecIn1.getValueType();
+      if (VecInT.getSizeInBits() * 2 == VT.getSizeInBits()) {
+        // If we only have one small input, widen it by adding undef values.
+        if (!VecIn2.getNode())
+          VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1,
+                               DAG.getUNDEF(VecIn1.getValueType()));
+        else if (VecIn1.getValueType() == VecIn2.getValueType()) {
+          // If we have two small inputs of the same type, try to concat them.
+          VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1, VecIn2);
+          VecIn2 = SDValue(nullptr, 0);
+        } else
+          return SDValue();
+      } else if (VecInT.getSizeInBits() == VT.getSizeInBits() * 2) {
+        // If the input vector is too large, try to split it.
+        // We don't support having two input vectors that are too large.
+        if (VecIn2.getNode())
+          return SDValue();
 
-      // If the input vector type has a different base type to the output
-      // vector type, bail out.
-      if (VecIn1.getValueType().getVectorElementType() !=
-          VT.getVectorElementType())
-        return SDValue();
+        if (!TLI.isExtractSubvectorCheap(VT, VT.getVectorNumElements()))
+          return SDValue();
 
-      // Widen the input vector by adding undef values.
-      VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
-                           VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
+        // Try to replace VecIn1 with two extract_subvectors
+        // No need to update the masks, they should still be correct.
+        VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+          DAG.getConstant(VT.getVectorNumElements(), TLI.getVectorIdxTy()));
+        VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+          DAG.getConstant(0, TLI.getVectorIdxTy()));
+        UsesZeroVector = false;
+      } else
+        return SDValue();
     }
 
-    // If VecIn2 is unused then change it to undef.
-    VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+    if (UsesZeroVector)
+      VecIn2 = VT.isInteger() ? DAG.getConstant(0, VT) :
+                                DAG.getConstantFP(0.0, VT);
+    else
+      // If VecIn2 is unused then change it to undef.
+      VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
 
     // Check that we were able to transform all incoming values to the same
     // type.
@@ -10656,36 +11433,56 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
     }
   }
 
+  // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
+  // We have already tested above for an UNDEF only concatenation.
   // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
   // -> (BUILD_VECTOR A, B, ..., C, D, ...)
-  if (N->getNumOperands() == 2 &&
-      N->getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
-      N->getOperand(1).getOpcode() == ISD::BUILD_VECTOR) {
-    EVT VT = N->getValueType(0);
-    SDValue N0 = N->getOperand(0);
-    SDValue N1 = N->getOperand(1);
+  auto IsBuildVectorOrUndef = [](const SDValue &Op) {
+    return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
+  };
+  bool AllBuildVectorsOrUndefs =
+      std::all_of(N->op_begin(), N->op_end(), IsBuildVectorOrUndef);
+  if (AllBuildVectorsOrUndefs) {
     SmallVector<SDValue, 8> Opnds;
-    unsigned BuildVecNumElts =  N0.getNumOperands();
-
-    EVT SclTy0 = N0.getOperand(0)->getValueType(0);
-    EVT SclTy1 = N1.getOperand(0)->getValueType(0);
-    if (SclTy0.isFloatingPoint()) {
-      for (unsigned i = 0; i != BuildVecNumElts; ++i)
-        Opnds.push_back(N0.getOperand(i));
-      for (unsigned i = 0; i != BuildVecNumElts; ++i)
-        Opnds.push_back(N1.getOperand(i));
-    } else {
+    EVT SVT = VT.getScalarType();
+
+    EVT MinVT = SVT;
+    if (!SVT.isFloatingPoint()) {
       // If BUILD_VECTOR are from built from integer, they may have different
-      // operand types. Get the smaller type and truncate all operands to it.
-      EVT MinTy = SclTy0.bitsLE(SclTy1) ? SclTy0 : SclTy1;
-      for (unsigned i = 0; i != BuildVecNumElts; ++i)
-        Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy,
-                        N0.getOperand(i)));
-      for (unsigned i = 0; i != BuildVecNumElts; ++i)
-        Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy,
-                        N1.getOperand(i)));
+      // operand types. Get the smallest type and truncate all operands to it.
+      bool FoundMinVT = false;
+      for (const SDValue &Op : N->ops())
+        if (ISD::BUILD_VECTOR == Op.getOpcode()) {
+          EVT OpSVT = Op.getOperand(0)->getValueType(0);
+          MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
+          FoundMinVT = true;
+        }
+      assert(FoundMinVT && "Concat vector type mismatch");
+    }
+
+    for (const SDValue &Op : N->ops()) {
+      EVT OpVT = Op.getValueType();
+      unsigned NumElts = OpVT.getVectorNumElements();
+
+      if (ISD::UNDEF == Op.getOpcode())
+        for (unsigned i = 0; i != NumElts; ++i)
+          Opnds.push_back(DAG.getUNDEF(MinVT));
+
+      if (ISD::BUILD_VECTOR == Op.getOpcode()) {
+        if (SVT.isFloatingPoint()) {
+          assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
+          for (unsigned i = 0; i != NumElts; ++i)
+            Opnds.push_back(Op.getOperand(i));
+        } else {
+          for (unsigned i = 0; i != NumElts; ++i)
+            Opnds.push_back(
+                DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
+        }
+      }
     }
 
+    assert(VT.getVectorNumElements() == Opnds.size() &&
+           "Concat vector type mismatch");
     return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds);
   }
 
@@ -10881,7 +11678,8 @@ static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
   return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask());
 }
 
-// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat.
+// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
+// or turn a shuffle of a single concat into simpler shuffle then concat.
 static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
   unsigned NumElts = VT.getVectorNumElements();
@@ -10895,6 +11693,18 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
   unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
   unsigned NumConcats = NumElts / NumElemsPerConcat;
 
+  // Special case: shuffle(concat(A,B)) can be more efficiently represented
+  // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high
+  // half vector elements.
+  if (NumElemsPerConcat * 2 == NumElts && N1.getOpcode() == ISD::UNDEF &&
+      std::all_of(SVN->getMask().begin() + NumElemsPerConcat,
+                  SVN->getMask().end(), [](int i) { return i == -1; })) {
+    N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1),
+                              ArrayRef<int>(SVN->getMask().begin(), NumElemsPerConcat));
+    N1 = DAG.getUNDEF(ConcatVT);
+    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1);
+  }
+
   // Look at every vector that's inserted. We're looking for exact
   // subvector-sized copies from a concatenated vector
   for (unsigned I = 0; I != NumConcats; ++I) {
@@ -10993,7 +11803,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
   }
 
   // If it is a splat, check if the argument vector is another splat or a
-  // build_vector with all scalar elements the same.
+  // build_vector.
   if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
     SDNode *V = N0.getNode();
 
@@ -11030,6 +11840,18 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
       // Splat of <x, x, x, x>, return <x, x, x, x>
       if (AllSame)
         return N0;
+
+      // Canonicalize any other splat as a build_vector.
+      const SDValue &Splatted = V->getOperand(SVN->getSplatIndex());
+      SmallVector<SDValue, 8> Ops(NumElts, Splatted);
+      SDValue NewBV = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
+                                  V->getValueType(0), Ops);
+
+      // We may have jumped through bitcasts, so the type of the
+      // BUILD_VECTOR may not match the type of the shuffle.
+      if (V->getValueType(0) != VT)
+          NewBV = DAG.getNode(ISD::BITCAST, SDLoc(N), VT, NewBV);
+      return NewBV;
     }
   }
 
@@ -11050,121 +11872,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
       return V;
   }
 
-  // If this shuffle node is simply a swizzle of another shuffle node,
-  // then try to simplify it.
-  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
-      N1.getOpcode() == ISD::UNDEF) {
-
-    ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
-
-    // The incoming shuffle must be of the same type as the result of the
-    // current shuffle.
-    assert(OtherSV->getOperand(0).getValueType() == VT &&
-           "Shuffle types don't match");
-
-    SmallVector<int, 4> Mask;
-    // Compute the combined shuffle mask.
-    for (unsigned i = 0; i != NumElts; ++i) {
-      int Idx = SVN->getMaskElt(i);
-      assert(Idx < (int)NumElts && "Index references undef operand");
-      // Next, this index comes from the first value, which is the incoming
-      // shuffle. Adopt the incoming index.
-      if (Idx >= 0)
-        Idx = OtherSV->getMaskElt(Idx);
-      Mask.push_back(Idx);
-    }
-
-    // Check if all indices in Mask are Undef. In case, propagate Undef.
-    bool isUndefMask = true;
-    for (unsigned i = 0; i != NumElts && isUndefMask; ++i)
-      isUndefMask &= Mask[i] < 0;
-
-    if (isUndefMask)
-      return DAG.getUNDEF(VT);
-
-    bool CommuteOperands = false;
-    if (N0.getOperand(1).getOpcode() != ISD::UNDEF) {
-      // To be valid, the combine shuffle mask should only reference elements
-      // from one of the two vectors in input to the inner shufflevector.
-      bool IsValidMask = true;
-      for (unsigned i = 0; i != NumElts && IsValidMask; ++i)
-        // See if the combined mask only reference undefs or elements coming
-        // from the first shufflevector operand.
-        IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] < NumElts;
-
-      if (!IsValidMask) {
-        IsValidMask = true;
-        for (unsigned i = 0; i != NumElts && IsValidMask; ++i)
-          // Check that all the elements come from the second shuffle operand.
-          IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] >= NumElts;
-        CommuteOperands = IsValidMask;
-      }
-
-      // Early exit if the combined shuffle mask is not valid.
-      if (!IsValidMask)
-        return SDValue();
-    }
-
-    // See if this pair of shuffles can be safely folded according to either
-    // of the following rules:
-    //   shuffle(shuffle(x, y), undef) -> x
-    //   shuffle(shuffle(x, undef), undef) -> x
-    //   shuffle(shuffle(x, y), undef) -> y
-    bool IsIdentityMask = true;
-    unsigned BaseMaskIndex = CommuteOperands ? NumElts : 0;
-    for (unsigned i = 0; i != NumElts && IsIdentityMask; ++i) {
-      // Skip Undefs.
-      if (Mask[i] < 0)
-        continue;
-
-      // The combined shuffle must map each index to itself.
-      IsIdentityMask = (unsigned)Mask[i] == i + BaseMaskIndex;
-    }
-
-    if (IsIdentityMask) {
-      if (CommuteOperands)
-        // optimize shuffle(shuffle(x, y), undef) -> y.
-        return OtherSV->getOperand(1);
-
-      // optimize shuffle(shuffle(x, undef), undef) -> x
-      // optimize shuffle(shuffle(x, y), undef) -> x
-      return OtherSV->getOperand(0);
-    }
-
-    // It may still be beneficial to combine the two shuffles if the
-    // resulting shuffle is legal.
-    if (TLI.isTypeLegal(VT)) {
-      if (!CommuteOperands) {
-        if (TLI.isShuffleMaskLegal(Mask, VT))
-          // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3).
-          // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(x, undef, M3)
-          return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1,
-                                      &Mask[0]);
-      } else {
-        // Compute the commuted shuffle mask.
-        for (unsigned i = 0; i != NumElts; ++i) {
-          int idx = Mask[i];
-          if (idx < 0)
-            continue;
-          else if (idx < (int)NumElts)
-            Mask[i] = idx + NumElts;
-          else
-            Mask[i] = idx - NumElts;
-        }
-
-        if (TLI.isShuffleMaskLegal(Mask, VT))
-          //   shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(y, undef, M3)
-          return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(1), N1,
-                                      &Mask[0]);
-      }
-    }
-  }
-
   // Canonicalize shuffles according to rules:
   //  shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
   //  shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
   //  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
-  if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && N0.getOpcode() != ISD::UNDEF &&
+  if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
       N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
       TLI.isTypeLegal(VT)) {
     // The incoming shuffle must be of the same type as the result of the
@@ -11183,13 +11895,13 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
   }
 
   // Try to fold according to rules:
-  //   shuffle(shuffle(A, B, M0), B, M1) -> shuffle(A, B, M2)
-  //   shuffle(shuffle(A, B, M0), A, M1) -> shuffle(A, B, M2)
-  //   shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2)
-  //   shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2)
+  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
+  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
+  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
   // Don't try to fold shuffles with illegal type.
-  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
-      N1.getOpcode() != ISD::UNDEF && TLI.isTypeLegal(VT)) {
+  // Only fold if this shuffle is the only user of the other shuffle.
+  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) &&
+      Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
     ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
 
     // The incoming shuffle must be of the same type as the result of the
@@ -11197,14 +11909,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
     assert(OtherSV->getOperand(0).getValueType() == VT &&
            "Shuffle types don't match");
 
-    SDValue SV0 = OtherSV->getOperand(0);
-    SDValue SV1 = OtherSV->getOperand(1);
-    bool HasSameOp0 = N1 == SV0;
-    bool IsSV1Undef = SV1.getOpcode() == ISD::UNDEF;
-    if (!HasSameOp0 && !IsSV1Undef && N1 != SV1)
-      // Early exit.
-      return SDValue();
-
+    SDValue SV0, SV1;
     SmallVector<int, 4> Mask;
     // Compute the combined shuffle mask for a shuffle with SV0 as the first
     // operand, and SV1 as the second operand.
@@ -11216,14 +11921,49 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
         continue;
       }
 
+      SDValue CurrentVec;
       if (Idx < (int)NumElts) {
+        // This shuffle index refers to the inner shuffle N0. Lookup the inner
+        // shuffle mask to identify which vector is actually referenced.
         Idx = OtherSV->getMaskElt(Idx);
-        if (IsSV1Undef && Idx >= (int) NumElts)
-          Idx = -1;  // Propagate Undef.
-      } else
-        Idx = HasSameOp0 ? Idx - NumElts : Idx;
+        if (Idx < 0) {
+          // Propagate Undef.
+          Mask.push_back(Idx);
+          continue;
+        }
+
+        CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0)
+                                           : OtherSV->getOperand(1);
+      } else {
+        // This shuffle index references an element within N1.
+        CurrentVec = N1;
+      }
+
+      // Simple case where 'CurrentVec' is UNDEF.
+      if (CurrentVec.getOpcode() == ISD::UNDEF) {
+        Mask.push_back(-1);
+        continue;
+      }
+
+      // Canonicalize the shuffle index. We don't know yet if CurrentVec
+      // will be the first or second operand of the combined shuffle.
+      Idx = Idx % NumElts;
+      if (!SV0.getNode() || SV0 == CurrentVec) {
+        // Ok. CurrentVec is the left hand side.
+        // Update the mask accordingly.
+        SV0 = CurrentVec;
+        Mask.push_back(Idx);
+        continue;
+      }
+
+      // Bail out if we cannot convert the shuffle pair into a single shuffle.
+      if (SV1.getNode() && SV1 != CurrentVec)
+        return SDValue();
 
-      Mask.push_back(Idx);
+      // Ok. CurrentVec is the right hand side.
+      // Update the mask accordingly.
+      SV1 = CurrentVec;
+      Mask.push_back(Idx + NumElts);
     }
 
     // Check if all indices in Mask are Undef. In case, propagate Undef.
@@ -11234,34 +11974,37 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
     if (isUndefMask)
       return DAG.getUNDEF(VT);
 
+    if (!SV0.getNode())
+      SV0 = DAG.getUNDEF(VT);
+    if (!SV1.getNode())
+      SV1 = DAG.getUNDEF(VT);
+
     // Avoid introducing shuffles with illegal mask.
-    if (TLI.isShuffleMaskLegal(Mask, VT)) {
-      if (IsSV1Undef)
-        //   shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2)
-        //   shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2)
-        return DAG.getVectorShuffle(VT, SDLoc(N), SV0, N1, &Mask[0]);
-      return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]);
-    }
+    if (!TLI.isShuffleMaskLegal(Mask, VT)) {
+      // Compute the commuted shuffle mask and test again.
+      for (unsigned i = 0; i != NumElts; ++i) {
+        int idx = Mask[i];
+        if (idx < 0)
+          continue;
+        else if (idx < (int)NumElts)
+          Mask[i] = idx + NumElts;
+        else
+          Mask[i] = idx - NumElts;
+      }
 
-    // Compute the commuted shuffle mask.
-    for (unsigned i = 0; i != NumElts; ++i) {
-      int idx = Mask[i];
-      if (idx < 0)
-        continue;
-      else if (idx < (int)NumElts)
-        Mask[i] = idx + NumElts;
-      else
-        Mask[i] = idx - NumElts;
-    }
+      if (!TLI.isShuffleMaskLegal(Mask, VT))
+        return SDValue();
 
-    if (TLI.isShuffleMaskLegal(Mask, VT)) {
-      if (IsSV1Undef)
-        //   shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(B, A, M2)
-        return DAG.getVectorShuffle(VT, SDLoc(N), N1, SV0, &Mask[0]);
-      //   shuffle(shuffle(A, B, M0), B, M1) -> shuffle(B, A, M2)
-      //   shuffle(shuffle(A, B, M0), A, M1) -> shuffle(B, A, M2)
-      return DAG.getVectorShuffle(VT, SDLoc(N), SV1, SV0, &Mask[0]);
+      //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
+      //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
+      //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
+      std::swap(SV0, SV1);
     }
+
+    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
+    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
+    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
+    return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]);
   }
 
   return SDValue();
@@ -11322,9 +12065,11 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
           return SDValue();
       }
 
-      // Let's see if the target supports this vector_shuffle.
+      // Let's see if the target supports this vector_shuffle and make sure
+      // we're not running after operation legalization where it may have
+      // custom lowered the vector shuffles.
       EVT RVT = RHS.getValueType();
-      if (!TLI.isVectorClearMaskLegal(Indices, RVT))
+      if (LegalOperations || !TLI.isVectorClearMaskLegal(Indices, RVT))
         return SDValue();
 
       // Return the new VECTOR_SHUFFLE node.
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 8facbc2..1df4a1d 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -40,12 +40,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/FastISel.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -62,7 +63,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
@@ -728,6 +728,7 @@ bool FastISel::selectPatchpoint(const CallInst *I) {
   // For AnyRegCC the arguments are lowered later on manually.
   unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
   CallLoweringInfo CLI;
+  CLI.setIsPatchPoint();
   if (!lowerCallOperands(I, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC, CLI))
     return false;
 
@@ -1579,7 +1580,7 @@ FastISel::FastISel(FunctionLoweringInfo &FuncInfo,
                    bool SkipTargetIndependentISel)
     : FuncInfo(FuncInfo), MF(FuncInfo.MF), MRI(FuncInfo.MF->getRegInfo()),
       MFI(*FuncInfo.MF->getFrameInfo()), MCP(*FuncInfo.MF->getConstantPool()),
-      TM(FuncInfo.MF->getTarget()), DL(*MF->getSubtarget().getDataLayout()),
+      TM(FuncInfo.MF->getTarget()), DL(*TM.getDataLayout()),
       TII(*MF->getSubtarget().getInstrInfo()),
       TLI(*MF->getSubtarget().getTargetLowering()),
       TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo),
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 86b9542..7e72dc6 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -133,16 +133,17 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
         ImmutableCallSite CS(I);
         if (isa<InlineAsm>(CS.getCalledValue())) {
           unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
+          const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
           std::vector<TargetLowering::AsmOperandInfo> Ops =
-            TLI->ParseConstraints(CS);
+              TLI->ParseConstraints(TRI, CS);
           for (size_t I = 0, E = Ops.size(); I != E; ++I) {
             TargetLowering::AsmOperandInfo &Op = Ops[I];
             if (Op.Type == InlineAsm::isClobber) {
               // Clobbers don't have SDValue operands, hence SDValue().
               TLI->ComputeConstraintToUse(Op, SDValue(), DAG);
               std::pair<unsigned, const TargetRegisterClass *> PhysReg =
-                  TLI->getRegForInlineAsmConstraint(Op.ConstraintCode,
-                                                   Op.ConstraintVT);
+                  TLI->getRegForInlineAsmConstraint(TRI, Op.ConstraintCode,
+                                                    Op.ConstraintVT);
               if (PhysReg.first == SP)
                 MF->getFrameInfo()->setHasInlineAsmWithSPAdjust(true);
             }
@@ -273,6 +274,7 @@ void FunctionLoweringInfo::clear() {
   ArgDbgValues.clear();
   ByValArgFrameIndexMap.clear();
   RegFixups.clear();
+  StatepointStackSlots.clear();
   PreferredExtendType.clear();
 }
 
@@ -470,60 +472,6 @@ void llvm::ComputeUsesVAFloatArgument(const CallInst &I,
   }
 }
 
-/// AddCatchInfo - Extract the personality and type infos from an eh.selector
-/// call, and add them to the specified machine basic block.
-void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI,
-                        MachineBasicBlock *MBB) {
-  // Inform the MachineModuleInfo of the personality for this landing pad.
-  const ConstantExpr *CE = cast<ConstantExpr>(I.getArgOperand(1));
-  assert(CE->getOpcode() == Instruction::BitCast &&
-         isa<Function>(CE->getOperand(0)) &&
-         "Personality should be a function");
-  MMI->addPersonality(MBB, cast<Function>(CE->getOperand(0)));
-
-  // Gather all the type infos for this landing pad and pass them along to
-  // MachineModuleInfo.
-  std::vector<const GlobalValue *> TyInfo;
-  unsigned N = I.getNumArgOperands();
-
-  for (unsigned i = N - 1; i > 1; --i) {
-    if (const ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(i))) {
-      unsigned FilterLength = CI->getZExtValue();
-      unsigned FirstCatch = i + FilterLength + !FilterLength;
-      assert(FirstCatch <= N && "Invalid filter length");
-
-      if (FirstCatch < N) {
-        TyInfo.reserve(N - FirstCatch);
-        for (unsigned j = FirstCatch; j < N; ++j)
-          TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j)));
-        MMI->addCatchTypeInfo(MBB, TyInfo);
-        TyInfo.clear();
-      }
-
-      if (!FilterLength) {
-        // Cleanup.
-        MMI->addCleanup(MBB);
-      } else {
-        // Filter.
-        TyInfo.reserve(FilterLength - 1);
-        for (unsigned j = i + 1; j < FirstCatch; ++j)
-          TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j)));
-        MMI->addFilterTypeInfo(MBB, TyInfo);
-        TyInfo.clear();
-      }
-
-      N = i;
-    }
-  }
-
-  if (N > 2) {
-    TyInfo.reserve(N - 2);
-    for (unsigned j = 2; j < N; ++j)
-      TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j)));
-    MMI->addCatchTypeInfo(MBB, TyInfo);
-  }
-}
-
 /// AddLandingPadInfo - Extract the exception handling information from the
 /// landingpad instruction and add them to the specified machine module info.
 void llvm::AddLandingPadInfo(const LandingPadInst &I, MachineModuleInfo &MMI,
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index a65f33e..93699a7 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -406,10 +406,10 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
     Type *Type = CP->getType();
     // MachineConstantPool wants an explicit alignment.
     if (Align == 0) {
-      Align = MF->getSubtarget().getDataLayout()->getPrefTypeAlignment(Type);
+      Align = MF->getTarget().getDataLayout()->getPrefTypeAlignment(Type);
       if (Align == 0) {
         // Alignment of vector types.  FIXME!
-        Align = MF->getSubtarget().getDataLayout()->getTypeAllocSize(Type);
+        Align = MF->getTarget().getDataLayout()->getTypeAllocSize(Type);
       }
     }
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 5d17a5f..61c0a6f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -40,7 +40,7 @@ using namespace llvm;
 #define DEBUG_TYPE "legalizedag"
 
 //===----------------------------------------------------------------------===//
-/// SelectionDAGLegalize - This takes an arbitrary SelectionDAG as input and
+/// This takes an arbitrary SelectionDAG as input and
 /// hacks on it until the target machine can handle it.  This involves
 /// eliminating value sizes the machine cannot handle (promoting small sizes to
 /// large sizes or splitting up large values into small values) as well as
@@ -86,7 +86,7 @@ private:
   void LegalizeLoadOps(SDNode *Node);
   void LegalizeStoreOps(SDNode *Node);
 
-  /// PerformInsertVectorEltInMemory - Some target cannot handle a variable
+  /// Some targets cannot handle a variable
   /// insertion index for the INSERT_VECTOR_ELT instruction.  In this case, it
   /// is necessary to spill the vector being inserted into to memory, perform
   /// the insert there, and then read the result back.
@@ -95,7 +95,7 @@ private:
   SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
                                   SDValue Idx, SDLoc dl);
 
-  /// ShuffleWithNarrowerEltType - Return a vector shuffle operation which
+  /// Return a vector shuffle operation which
   /// performs the same shuffe in terms of order or result bytes, but on a type
   /// whose vector element type is narrower than the original shuffle type.
   /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
@@ -200,7 +200,7 @@ public:
 };
 }
 
-/// ShuffleWithNarrowerEltType - Return a vector shuffle operation which
+/// Return a vector shuffle operation which
 /// performs the same shuffe in terms of order or result bytes, but on a type
 /// whose vector element type is narrower than the original shuffle type.
 /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
@@ -232,7 +232,7 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT,  SDLoc dl,
   return DAG.getVectorShuffle(NVT, dl, N1, N2, &NewMask[0]);
 }
 
-/// ExpandConstantFP - Expands the ConstantFP node to an integer constant or
+/// Expands the ConstantFP node to an integer constant or
 /// a load from the constant pool.
 SDValue
 SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
@@ -260,7 +260,7 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
     if (ConstantFPSDNode::isValueValidForType(SVT, CFP->getValueAPF()) &&
         // Only do this if the target has a native EXTLOAD instruction from
         // smaller type.
-        TLI.isLoadExtLegal(ISD::EXTLOAD, SVT) &&
+        TLI.isLoadExtLegal(ISD::EXTLOAD, OrigVT, SVT) &&
         TLI.ShouldShrinkFPConstant(OrigVT)) {
       Type *SType = SVT.getTypeForEVT(*DAG.getContext());
       LLVMC = cast<ConstantFP>(ConstantExpr::getFPTrunc(LLVMC, SType));
@@ -286,7 +286,7 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
   return Result;
 }
 
-/// ExpandUnalignedStore - Expands an unaligned store to 2 half-size stores.
+/// Expands an unaligned store to 2 half-size stores.
 static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
                                  const TargetLowering &TLI,
                                  SelectionDAGLegalize *DAGLegalize) {
@@ -409,7 +409,7 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
   DAGLegalize->ReplaceNode(SDValue(ST, 0), Result);
 }
 
-/// ExpandUnalignedLoad - Expands an unaligned load to 2 half-size loads.
+/// Expands an unaligned load to 2 half-size loads.
 static void
 ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
                     const TargetLowering &TLI,
@@ -561,8 +561,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
   ChainResult = TF;
 }
 
-/// PerformInsertVectorEltInMemory - Some target cannot handle a variable
-/// insertion index for the INSERT_VECTOR_ELT instruction.  In this case, it
+/// Some target cannot handle a variable insertion index for the
+/// INSERT_VECTOR_ELT instruction.  In this case, it
 /// is necessary to spill the vector being inserted into to memory, perform
 /// the insert there, and then read the result back.
 SDValue SelectionDAGLegalize::
@@ -725,14 +725,13 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
             Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
             unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
             if (Align < ABIAlignment)
-              ExpandUnalignedStore(cast<StoreSDNode>(Node),
-                                   DAG, TLI, this);
+              ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
           }
           break;
         }
         case TargetLowering::Custom: {
           SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
-          if (Res.getNode())
+          if (Res && Res != SDValue(Node, 0))
             ReplaceNode(SDValue(Node, 0), Res);
           return;
         }
@@ -766,8 +765,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
         Value = DAG.getZeroExtendInReg(Value, dl, StVT);
         SDValue Result =
           DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
-                            NVT, isVolatile, isNonTemporal, Alignment,
-                            AAInfo);
+                            NVT, isVolatile, isNonTemporal, Alignment, AAInfo);
         ReplaceNode(SDValue(Node, 0), Result);
       } else if (StWidth & (StWidth - 1)) {
         // If not storing a power-of-2 number of bits, expand as two stores.
@@ -845,7 +843,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
         }
         case TargetLowering::Custom: {
           SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
-          if (Res.getNode())
+          if (Res && Res != SDValue(Node, 0))
             ReplaceNode(SDValue(Node, 0), Res);
           return;
         }
@@ -946,7 +944,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
       // nice to have an effective generic way of getting these benefits...
       // Until such a way is found, don't insist on promoting i1 here.
       (SrcVT != MVT::i1 ||
-       TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) {
+       TLI.getLoadExtAction(ExtType, Node->getValueType(0), MVT::i1) ==
+         TargetLowering::Promote)) {
     // Promote to a byte-sized load if not loading an integral number of
     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
     unsigned NewWidth = SrcVT.getStoreSizeInBits();
@@ -1058,7 +1057,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
     Chain = Ch;
   } else {
     bool isCustom = false;
-    switch (TLI.getLoadExtAction(ExtType, SrcVT.getSimpleVT())) {
+    switch (TLI.getLoadExtAction(ExtType, Node->getValueType(0),
+                                 SrcVT.getSimpleVT())) {
     default: llvm_unreachable("This action is not supported yet!");
     case TargetLowering::Custom:
       isCustom = true;
@@ -1080,36 +1080,35 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
         unsigned AS = LD->getAddressSpace();
         unsigned Align = LD->getAlignment();
         if (!TLI.allowsMisalignedMemoryAccesses(MemVT, AS, Align)) {
-          Type *Ty =
-            LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
-          unsigned ABIAlignment =
-            TLI.getDataLayout()->getABITypeAlignment(Ty);
+          Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+          unsigned ABIAlignment = TLI.getDataLayout()->getABITypeAlignment(Ty);
           if (Align < ABIAlignment){
-            ExpandUnalignedLoad(cast<LoadSDNode>(Node),
-                                DAG, TLI, Value, Chain);
+            ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, Value, Chain);
           }
         }
       }
       break;
     }
     case TargetLowering::Expand:
-      if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) &&
-          TLI.isTypeLegal(SrcVT)) {
-        SDValue Load = DAG.getLoad(SrcVT, dl, Chain, Ptr,
-                                   LD->getMemOperand());
-        unsigned ExtendOp;
-        switch (ExtType) {
-        case ISD::EXTLOAD:
-          ExtendOp = (SrcVT.isFloatingPoint() ?
-                      ISD::FP_EXTEND : ISD::ANY_EXTEND);
+      if (!TLI.isLoadExtLegal(ISD::EXTLOAD, Node->getValueType(0), SrcVT)) {
+        // If the source type is not legal, see if there is a legal extload to
+        // an intermediate type that we can then extend further.
+        EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT());
+        if (TLI.isTypeLegal(SrcVT) || // Same as SrcVT == LoadVT?
+            TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT)) {
+          // If we are loading a legal type, this is a non-extload followed by a
+          // full extend.
+          ISD::LoadExtType MidExtType =
+              (LoadVT == SrcVT) ? ISD::NON_EXTLOAD : ExtType;
+
+          SDValue Load = DAG.getExtLoad(MidExtType, dl, LoadVT, Chain, Ptr,
+                                        SrcVT, LD->getMemOperand());
+          unsigned ExtendOp =
+              ISD::getExtForLoadExtType(SrcVT.isFloatingPoint(), ExtType);
+          Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load);
+          Chain = Load.getValue(1);
           break;
-        case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break;
-        case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break;
-        default: llvm_unreachable("Unexpected extend load type!");
         }
-        Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load);
-        Chain = Load.getValue(1);
-        break;
       }
 
       assert(!SrcVT.isVector() &&
@@ -1133,8 +1132,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
                              Result.getValueType(),
                              Result, DAG.getValueType(SrcVT));
       else
-        ValRes = DAG.getZeroExtendInReg(Result, dl,
-                                        SrcVT.getScalarType());
+        ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType());
       Value = ValRes;
       Chain = Result.getValue(1);
       break;
@@ -1155,8 +1153,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
   }
 }
 
-/// LegalizeOp - Return a legal replacement for the given operation, with
-/// all legal operands.
+/// Return a legal replacement for the given operation, with all legal operands.
 void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
   DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG));
 
@@ -1642,8 +1639,8 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
   Results.push_back(Tmp2);
 }
 
-/// LegalizeSetCCCondCode - Legalize a SETCC with given LHS and RHS and
-/// condition code CC on the current target.
+/// Legalize a SETCC with given LHS and RHS and condition code CC on the current
+/// target.
 ///
 /// If the SETCC has been legalized using AND / OR, then the legalized node
 /// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert
@@ -1757,7 +1754,7 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
   return false;
 }
 
-/// EmitStackConvert - Emit a store/load combination to the stack.  This stores
+/// Emit a store/load combination to the stack.  This stores
 /// SrcOp to a stack slot of type SlotVT, truncating it if needed.  It then does
 /// a load from the stack slot to DestVT, extending it if needed.
 /// The resultant code need not be legal.
@@ -1917,7 +1914,7 @@ ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG,
   return true;
 }
 
-/// ExpandBUILD_VECTOR - Expand a BUILD_VECTOR node on targets that don't
+/// Expand a BUILD_VECTOR node on targets that don't
 /// support the operation, but do support the resultant vector type.
 SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
   unsigned NumElems = Node->getNumOperands();
@@ -2029,7 +2026,7 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
   return ExpandVectorBuildThroughStack(Node);
 }
 
-// ExpandLibCall - Expand a node into a call to a libcall.  If the result value
+// Expand a node into a call to a libcall.  If the result value
 // does not fit into a register, return the lo part and set the hi part to the
 // by-reg argument.  If it does fit into a single register, return the result
 // and leave the Hi part unset.
@@ -2077,7 +2074,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
   return CallInfo.first;
 }
 
-/// ExpandLibCall - Generate a libcall taking the given operands as arguments
+/// Generate a libcall taking the given operands as arguments
 /// and returning a result of type RetVT.
 SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
                                             const SDValue *Ops, unsigned NumOps,
@@ -2108,7 +2105,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
   return CallInfo.first;
 }
 
-// ExpandChainLibCall - Expand a node into a call to a libcall. Similar to
+// Expand a node into a call to a libcall. Similar to
 // ExpandLibCall except that the first operand is the in-chain.
 std::pair<SDValue, SDValue>
 SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
@@ -2178,7 +2175,7 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
   return ExpandLibCall(LC, Node, isSigned);
 }
 
-/// isDivRemLibcallAvailable - Return true if divmod libcall is available.
+/// Return true if divmod libcall is available.
 static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
                                      const TargetLowering &TLI) {
   RTLIB::Libcall LC;
@@ -2194,8 +2191,7 @@ static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
   return TLI.getLibcallName(LC) != nullptr;
 }
 
-/// useDivRem - Only issue divrem libcall if both quotient and remainder are
-/// needed.
+/// Only issue divrem libcall if both quotient and remainder are needed.
 static bool useDivRem(SDNode *Node, bool isSigned, bool isDIV) {
   // The other use might have been replaced with a divrem already.
   unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
@@ -2220,8 +2216,7 @@ static bool useDivRem(SDNode *Node, bool isSigned, bool isDIV) {
   return false;
 }
 
-/// ExpandDivRemLibCall - Issue libcalls to __{u}divmod to compute div / rem
-/// pairs.
+/// Issue libcalls to __{u}divmod to compute div / rem pairs.
 void
 SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
                                           SmallVectorImpl<SDValue> &Results) {
@@ -2283,7 +2278,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
   Results.push_back(Rem);
 }
 
-/// isSinCosLibcallAvailable - Return true if sincos libcall is available.
+/// Return true if sincos libcall is available.
 static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
   RTLIB::Libcall LC;
   switch (Node->getSimpleValueType(0).SimpleTy) {
@@ -2297,8 +2292,8 @@ static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
   return TLI.getLibcallName(LC) != nullptr;
 }
 
-/// canCombineSinCosLibcall - Return true if sincos libcall is available and
-/// can be used to combine sin and cos.
+/// Return true if sincos libcall is available and can be used to combine sin
+/// and cos.
 static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI,
                                     const TargetMachine &TM) {
   if (!isSinCosLibcallAvailable(Node, TLI))
@@ -2311,8 +2306,7 @@ static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI,
   return true;
 }
 
-/// useSinCos - Only issue sincos libcall if both sin and cos are
-/// needed.
+/// Only issue sincos libcall if both sin and cos are needed.
 static bool useSinCos(SDNode *Node) {
   unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN
     ? ISD::FCOS : ISD::FSIN;
@@ -2330,8 +2324,7 @@ static bool useSinCos(SDNode *Node) {
   return false;
 }
 
-/// ExpandSinCosLibCall - Issue libcalls to sincos to compute sin / cos
-/// pairs.
+/// Issue libcalls to sincos to compute sin / cos pairs.
 void
 SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
                                           SmallVectorImpl<SDValue> &Results) {
@@ -2396,7 +2389,7 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
                                 MachinePointerInfo(), false, false, false, 0));
 }
 
-/// ExpandLegalINT_TO_FP - This function is responsible for legalizing a
+/// This function is responsible for legalizing a
 /// INT_TO_FP operation of the specified operand when the target requests that
 /// we expand it.  At this point, we know that the result and operand types are
 /// legal for the target.
@@ -2594,7 +2587,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
   return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg);
 }
 
-/// PromoteLegalINT_TO_FP - This function is responsible for legalizing a
+/// This function is responsible for legalizing a
 /// *INT_TO_FP operation of the specified operand when the target requests that
 /// we promote it.  At this point, we know that the result and operand types are
 /// legal for the target, and that there is a legal UINT_TO_FP or SINT_TO_FP
@@ -2636,7 +2629,7 @@ SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp,
                                  dl, NewInTy, LegalOp));
 }
 
-/// PromoteLegalFP_TO_INT - This function is responsible for legalizing a
+/// This function is responsible for legalizing a
 /// FP_TO_*INT operation of the specified operand when the target requests that
 /// we promote it.  At this point, we know that the result and operand types are
 /// legal for the target, and that there is a legal FP_TO_UINT or FP_TO_SINT
@@ -2680,8 +2673,7 @@ SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp,
   return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation);
 }
 
-/// ExpandBSWAP - Open code the operations for BSWAP of the specified operation.
-///
+/// Open code the operations for BSWAP of the specified operation.
 SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, SDLoc dl) {
   EVT VT = Op.getValueType();
   EVT SHVT = TLI.getShiftAmountTy(VT);
@@ -2727,8 +2719,7 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, SDLoc dl) {
   }
 }
 
-/// ExpandBitCount - Expand the specified bitcount instruction into operations.
-///
+/// Expand the specified bitcount instruction into operations.
 SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
                                              SDLoc dl) {
   switch (Opc) {
@@ -3528,6 +3519,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
                                       RTLIB::FMA_F80, RTLIB::FMA_F128,
                                       RTLIB::FMA_PPCF128));
     break;
+  case ISD::FMAD:
+    llvm_unreachable("Illegal fmad should never be formed");
+
   case ISD::FADD:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64,
                                       RTLIB::ADD_F80, RTLIB::ADD_F128,
@@ -3554,6 +3548,21 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     break;
   }
   case ISD::FP_TO_FP16: {
+    if (!TM.Options.UseSoftFloat && TM.Options.UnsafeFPMath) {
+      SDValue Op = Node->getOperand(0);
+      MVT SVT = Op.getSimpleValueType();
+      if ((SVT == MVT::f64 || SVT == MVT::f80) &&
+          TLI.isOperationLegalOrCustom(ISD::FP_TO_FP16, MVT::f32)) {
+        // Under fastmath, we can expand this node into a fround followed by
+        // a float-half conversion.
+        SDValue FloatVal = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Op,
+                                       DAG.getIntPtrConstant(0));
+        Results.push_back(
+            DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, FloatVal));
+        break;
+      }
+    }
+
     RTLIB::Libcall LC =
         RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16);
     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16");
@@ -4319,8 +4328,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     ReplaceNode(Node, Results.data());
 }
 
-// SelectionDAG::Legalize - This is the entry point for the file.
-//
+/// This is the entry point for the file.
 void SelectionDAG::Legalize() {
   AssignTopologicalOrder();
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 4591e79..b596715 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -658,7 +658,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
                            NVT, N->getOperand(0));
   return TLI.makeLibCall(DAG, LC,
                          TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
-                         &Op, 1, false, dl).first;
+                         &Op, 1, Signed, dl).first;
 }
 
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index b73bb0a..5507c70 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -66,6 +66,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::EXTRACT_VECTOR_ELT:
                          Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
   case ISD::LOAD:        Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break;
+  case ISD::MLOAD:       Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));break;
   case ISD::SELECT:      Res = PromoteIntRes_SELECT(N); break;
   case ISD::VSELECT:     Res = PromoteIntRes_VSELECT(N); break;
   case ISD::SELECT_CC:   Res = PromoteIntRes_SELECT_CC(N); break;
@@ -454,6 +455,24 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
   return Res;
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0());
+
+  SDValue Mask = N->getMask();
+  EVT NewMaskVT = getSetCCResultType(NVT);
+  if (NewMaskVT != N->getMask().getValueType())
+    Mask = PromoteTargetBoolean(Mask, NewMaskVT);
+  SDLoc dl(N);
+
+  SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(),
+                                  Mask, ExtSrc0, N->getMemoryVT(),
+                                  N->getMemOperand(), ISD::SEXTLOAD);
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
 /// Promote the overflow flag of an overflowing arithmetic node.
 SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
   // Simply change the return type of the boolean result.
@@ -825,6 +844,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
   case ISD::SINT_TO_FP:   Res = PromoteIntOp_SINT_TO_FP(N); break;
   case ISD::STORE:        Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
                                                    OpNo); break;
+  case ISD::MSTORE:       Res = PromoteIntOp_MSTORE(cast<MaskedStoreSDNode>(N),
+                                                    OpNo); break;
+  case ISD::MLOAD:        Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
+                                                    OpNo); break;
   case ISD::TRUNCATE:     Res = PromoteIntOp_TRUNCATE(N); break;
   case ISD::FP16_TO_FP:
   case ISD::UINT_TO_FP:   Res = PromoteIntOp_UINT_TO_FP(N); break;
@@ -1091,6 +1114,64 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
                            N->getMemoryVT(), N->getMemOperand());
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){
+
+  assert(OpNo == 2 && "Only know how to promote the mask!");
+  SDValue DataOp = N->getValue();
+  EVT DataVT = DataOp.getValueType();
+  SDValue Mask = N->getMask();
+  EVT MaskVT = Mask.getValueType();
+  SDLoc dl(N);
+
+  bool TruncateStore = false;
+  if (!TLI.isTypeLegal(DataVT)) {
+    if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) {
+      DataOp = GetPromotedInteger(DataOp);
+      Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
+      TruncateStore = true;
+    }
+    else {
+      assert(getTypeAction(DataVT) == TargetLowering::TypeWidenVector &&
+             "Unexpected data legalization in MSTORE");
+      DataOp = GetWidenedVector(DataOp);
+
+      if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+        Mask = GetWidenedVector(Mask);
+      else {
+        EVT BoolVT = getSetCCResultType(DataOp.getValueType());
+
+        // We can't use ModifyToType() because we should fill the mask with
+        // zeroes
+        unsigned WidenNumElts = BoolVT.getVectorNumElements();
+        unsigned MaskNumElts = MaskVT.getVectorNumElements();
+
+        unsigned NumConcat = WidenNumElts / MaskNumElts;
+        SmallVector<SDValue, 16> Ops(NumConcat);
+        SDValue ZeroVal = DAG.getConstant(0, MaskVT);
+        Ops[0] = Mask;
+        for (unsigned i = 1; i != NumConcat; ++i)
+          Ops[i] = ZeroVal;
+
+        Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
+      }
+    }
+  }
+  else
+    Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType());
+  return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask,
+                            N->getMemoryVT(), N->getMemOperand(),
+                            TruncateStore);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){
+  assert(OpNo == 2 && "Only know how to promote the mask!");
+  EVT DataVT = N->getValueType(0);
+  SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+  SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+  NewOps[OpNo] = Mask;
+  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
   return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op);
@@ -2936,17 +3017,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) {
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
 
-  unsigned NumElts = VT.getVectorNumElements();
-  SmallVector<int, 8> NewMask;
-  for (unsigned i = 0; i != NumElts; ++i) {
-    NewMask.push_back(SV->getMaskElt(i));
-  }
+  ArrayRef<int> NewMask = SV->getMask().slice(0, VT.getVectorNumElements());
 
   SDValue V0 = GetPromotedInteger(N->getOperand(0));
   SDValue V1 = GetPromotedInteger(N->getOperand(1));
   EVT OutVT = V0.getValueType();
 
-  return DAG.getVectorShuffle(OutVT, dl, V0, V1, &NewMask[0]);
+  return DAG.getVectorShuffle(OutVT, dl, V0, V1, NewMask);
 }
 
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index bd7dacf..ebf6b28 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -921,6 +921,17 @@ bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
     // The target didn't want to custom lower it after all.
     return false;
 
+  // When called from DAGTypeLegalizer::ExpandIntegerResult, we might need to
+  // provide the same kind of custom splitting behavior.
+  if (Results.size() == N->getNumValues() + 1 && LegalizeResult) {
+    // We've legalized a return type by splitting it. If there is a chain,
+    // replace that too.
+    SetExpandedInteger(SDValue(N, 0), Results[0], Results[1]);
+    if (N->getNumValues() > 1)
+      ReplaceValueWith(SDValue(N, 1), Results[2]);
+    return true;
+  }
+
   // Make everything that once used N's values now use those in Results instead.
   assert(Results.size() == N->getNumValues() &&
          "Custom lowering returned the wrong number of results!");
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 30f412b..cef3fc9 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -240,6 +240,7 @@ private:
   SDValue PromoteIntRes_FP_TO_FP16(SDNode *N);
   SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
   SDValue PromoteIntRes_LOAD(LoadSDNode *N);
+  SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
   SDValue PromoteIntRes_Overflow(SDNode *N);
   SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
   SDValue PromoteIntRes_SDIV(SDNode *N);
@@ -285,6 +286,8 @@ private:
   SDValue PromoteIntOp_TRUNCATE(SDNode *N);
   SDValue PromoteIntOp_UINT_TO_FP(SDNode *N);
   SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
+  SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
 
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
 
@@ -578,6 +581,7 @@ private:
   void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -594,6 +598,7 @@ private:
   SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+  SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
   SDValue SplitVecOp_TRUNCATE(SDNode *N);
   SDValue SplitVecOp_VSETCC(SDNode *N);
@@ -627,6 +632,7 @@ private:
   SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
   SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
   SDValue WidenVecRes_LOAD(SDNode* N);
+  SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
   SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
   SDValue WidenVecRes_SIGN_EXTEND_INREG(SDNode* N);
   SDValue WidenVecRes_SELECT(SDNode* N);
@@ -653,6 +659,7 @@ private:
   SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue WidenVecOp_STORE(SDNode* N);
+  SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
   SDValue WidenVecOp_SETCC(SDNode* N);
 
   SDValue WidenVecOp_Convert(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index b5af7b7..03c2734 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -200,12 +200,15 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
     LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
     ISD::LoadExtType ExtType = LD->getExtensionType();
     if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD)
-      switch (TLI.getLoadExtAction(LD->getExtensionType(), LD->getMemoryVT())) {
+      switch (TLI.getLoadExtAction(LD->getExtensionType(), LD->getValueType(0),
+                                   LD->getMemoryVT())) {
       default: llvm_unreachable("This action is not supported yet!");
       case TargetLowering::Legal:
         return TranslateLegalizeResults(Op, Result);
       case TargetLowering::Custom:
         if (SDValue Lowered = TLI.LowerOperation(Result, DAG)) {
+          if (Lowered == Result)
+            return TranslateLegalizeResults(Op, Lowered);
           Changed = true;
           if (Lowered->getNumValues() != Op->getNumValues()) {
             // This expanded to something other than the load. Assume the
@@ -231,9 +234,11 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
       default: llvm_unreachable("This action is not supported yet!");
       case TargetLowering::Legal:
         return TranslateLegalizeResults(Op, Result);
-      case TargetLowering::Custom:
-        Changed = true;
-        return TranslateLegalizeResults(Op, TLI.LowerOperation(Result, DAG));
+      case TargetLowering::Custom: {
+        SDValue Lowered = TLI.LowerOperation(Result, DAG);
+        Changed = Lowered != Result;
+        return TranslateLegalizeResults(Op, Lowered);
+      }
       case TargetLowering::Expand:
         Changed = true;
         return LegalizeOp(ExpandStore(Op));
@@ -389,7 +394,8 @@ SDValue VectorLegalizer::Promote(SDValue Op) {
       if (Op.getOperand(j)
               .getValueType()
               .getVectorElementType()
-              .isFloatingPoint())
+              .isFloatingPoint() &&
+          NVT.isVector() && NVT.getVectorElementType().isFloatingPoint())
         Operands[j] = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op.getOperand(j));
       else
         Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Op.getOperand(j));
@@ -398,8 +404,9 @@ SDValue VectorLegalizer::Promote(SDValue Op) {
   }
 
   Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands);
-  if (VT.isFloatingPoint() ||
-      (VT.isVector() && VT.getVectorElementType().isFloatingPoint()))
+  if ((VT.isFloatingPoint() && NVT.isFloatingPoint()) ||
+      (VT.isVector() && VT.getVectorElementType().isFloatingPoint() &&
+       NVT.isVector() && NVT.getVectorElementType().isFloatingPoint()))
     return DAG.getNode(ISD::FP_ROUND, dl, VT, Op, DAG.getIntPtrConstant(0));
   else
     return DAG.getNode(ISD::BITCAST, dl, VT, Op);
@@ -509,7 +516,8 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
         ScalarLoad = DAG.getLoad(WideVT, dl, Chain, BasePTR,
                                  LD->getPointerInfo().getWithOffset(Offset),
                                  LD->isVolatile(), LD->isNonTemporal(),
-                                 LD->isInvariant(), LD->getAlignment(),
+                                 LD->isInvariant(),
+                                 MinAlign(LD->getAlignment(), Offset),
                                  LD->getAAInfo());
       } else {
         EVT LoadVT = WideVT;
@@ -521,7 +529,8 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
                                     LD->getPointerInfo().getWithOffset(Offset),
                                     LoadVT, LD->isVolatile(),
                                     LD->isNonTemporal(), LD->isInvariant(),
-                                    LD->getAlignment(), LD->getAAInfo());
+                                    MinAlign(LD->getAlignment(), Offset),
+                                    LD->getAAInfo());
       }
 
       RemainingBytes -= LoadBytes;
@@ -553,9 +562,9 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
       BitOffset += SrcEltBits;
       if (BitOffset >= WideBits) {
         WideIdx++;
-        Offset -= WideBits;
-        if (Offset > 0) {
-          ShAmt = DAG.getConstant(SrcEltBits - Offset,
+        BitOffset -= WideBits;
+        if (BitOffset > 0) {
+          ShAmt = DAG.getConstant(SrcEltBits - BitOffset,
                                   TLI.getShiftAmountTy(WideVT));
           Hi = DAG.getNode(ISD::SHL, dl, WideVT, LoadVals[WideIdx], ShAmt);
           Hi = DAG.getNode(ISD::AND, dl, WideVT, Hi, SrcEltBitMask);
@@ -592,7 +601,7 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
                 Chain, BasePTR, LD->getPointerInfo().getWithOffset(Idx * Stride),
                 SrcVT.getScalarType(),
                 LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(),
-                LD->getAlignment(), LD->getAAInfo());
+                MinAlign(LD->getAlignment(), Idx * Stride), LD->getAAInfo());
 
       BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
                          DAG.getConstant(Stride, BasePTR.getValueType()));
@@ -651,7 +660,8 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
     // This scalar TruncStore may be illegal, but we legalize it later.
     SDValue Store = DAG.getTruncStore(Chain, dl, Ex, BasePTR,
                ST->getPointerInfo().getWithOffset(Idx*Stride), MemSclVT,
-               isVolatile, isNonTemporal, Alignment, AAInfo);
+               isVolatile, isNonTemporal, MinAlign(Alignment, Idx*Stride),
+               AAInfo);
 
     BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
                                DAG.getConstant(Stride, BasePTR.getValueType()));
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 27f63d2..63671f7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -597,6 +597,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::LOAD:
     SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
     break;
+  case ISD::MLOAD:
+    SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
+    break;
   case ISD::SETCC:
     SplitVecRes_SETCC(N, Lo, Hi);
     break;
@@ -979,6 +982,67 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
   ReplaceValueWith(SDValue(LD, 1), Ch);
 }
 
+void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
+                                         SDValue &Lo, SDValue &Hi) {
+  EVT LoVT, HiVT;
+  SDLoc dl(MLD);
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
+
+  SDValue Ch = MLD->getChain();
+  SDValue Ptr = MLD->getBasePtr();
+  SDValue Mask = MLD->getMask();
+  unsigned Alignment = MLD->getOriginalAlignment();
+  ISD::LoadExtType ExtType = MLD->getExtensionType();
+
+  // if Alignment is equal to the vector size,
+  // take the half of it for the second part
+  unsigned SecondHalfAlignment =
+    (Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
+     Alignment/2 : Alignment;
+
+  SDValue MaskLo, MaskHi;
+  std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+
+  EVT MemoryVT = MLD->getMemoryVT();
+  EVT LoMemVT, HiMemVT;
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+  SDValue Src0 = MLD->getSrc0();
+  SDValue Src0Lo, Src0Hi;
+  std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
+
+  MachineMemOperand *MMO = DAG.getMachineFunction().
+    getMachineMemOperand(MLD->getPointerInfo(), 
+                         MachineMemOperand::MOLoad,  LoMemVT.getStoreSize(),
+                         Alignment, MLD->getAAInfo(), MLD->getRanges());
+
+  Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
+                         ExtType);
+
+  unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+  MMO = DAG.getMachineFunction().
+    getMachineMemOperand(MLD->getPointerInfo(), 
+                         MachineMemOperand::MOLoad,  HiMemVT.getStoreSize(),
+                         SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
+
+  Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
+                         ExtType);
+
+
+  // Build a factor node to remember that this load is independent of the
+  // other one.
+  Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                   Hi.getValue(1));
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(MLD, 1), Ch);
+
+}
+
 void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
   assert(N->getValueType(0).isVector() &&
          N->getOperand(0).getValueType().isVector() &&
@@ -1234,6 +1298,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
     case ISD::STORE:
       Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
       break;
+    case ISD::MSTORE:
+      Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
+      break;
     case ISD::VSELECT:
       Res = SplitVecOp_VSELECT(N, OpNo);
       break;
@@ -1395,6 +1462,58 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
                         MachinePointerInfo(), EltVT, false, false, false, 0);
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
+                                            unsigned OpNo) {
+  SDValue Ch  = N->getChain();
+  SDValue Ptr = N->getBasePtr();
+  SDValue Mask = N->getMask();
+  SDValue Data = N->getValue();
+  EVT MemoryVT = N->getMemoryVT();
+  unsigned Alignment = N->getOriginalAlignment();
+  SDLoc DL(N);
+  
+  EVT LoMemVT, HiMemVT;
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+  SDValue DataLo, DataHi;
+  GetSplitVector(Data, DataLo, DataHi);
+  SDValue MaskLo, MaskHi;
+  GetSplitVector(Mask, MaskLo, MaskHi);
+
+  // if Alignment is equal to the vector size,
+  // take the half of it for the second part
+  unsigned SecondHalfAlignment =
+    (Alignment == Data->getValueType(0).getSizeInBits()/8) ?
+       Alignment/2 : Alignment;
+
+  SDValue Lo, Hi;
+  MachineMemOperand *MMO = DAG.getMachineFunction().
+    getMachineMemOperand(N->getPointerInfo(), 
+                         MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
+                         Alignment, N->getAAInfo(), N->getRanges());
+
+  Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
+                          N->isTruncatingStore());
+
+  unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+  Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+  MMO = DAG.getMachineFunction().
+    getMachineMemOperand(N->getPointerInfo(), 
+                         MachineMemOperand::MOStore,  HiMemVT.getStoreSize(),
+                         SecondHalfAlignment, N->getAAInfo(), N->getRanges());
+
+  Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
+                          N->isTruncatingStore());
+
+
+  // Build a factor node to remember that this store is independent of the
+  // other one.
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
   assert(N->isUnindexed() && "Indexed store of vector?");
   assert(OpNo == 1 && "Can only split the stored value");
@@ -1599,6 +1718,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::VECTOR_SHUFFLE:
     Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
     break;
+  case ISD::MLOAD:
+    Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
+    break;
 
   case ISD::ADD:
   case ISD::AND:
@@ -2289,6 +2411,44 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
   return Result;
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
+  
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));
+  SDValue Mask = N->getMask();
+  EVT MaskVT = Mask.getValueType();
+  SDValue Src0 = GetWidenedVector(N->getSrc0());
+  ISD::LoadExtType ExtType = N->getExtensionType();
+  SDLoc dl(N);
+
+  if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+    Mask = GetWidenedVector(Mask);
+  else {
+    EVT BoolVT = getSetCCResultType(WidenVT);
+
+    // We can't use ModifyToType() because we should fill the mask with
+    // zeroes
+    unsigned WidenNumElts = BoolVT.getVectorNumElements();
+    unsigned MaskNumElts = MaskVT.getVectorNumElements();
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, MaskVT);
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;
+
+    Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
+  }
+
+  SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
+                                  Mask, Src0, N->getMemoryVT(),
+                                  N->getMemOperand(), ExtType);
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N),
@@ -2434,6 +2594,7 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::EXTRACT_SUBVECTOR:  Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
   case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::STORE:              Res = WidenVecOp_STORE(N); break;
+  case ISD::MSTORE:             Res = WidenVecOp_MSTORE(N, OpNo); break;
   case ISD::SETCC:              Res = WidenVecOp_SETCC(N); break;
 
   case ISD::ANY_EXTEND:
@@ -2632,6 +2793,42 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
     return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
 }
 
+SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
+  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+  SDValue Mask = MST->getMask();
+  EVT MaskVT = Mask.getValueType();
+  SDValue StVal = MST->getValue();
+  // Widen the value
+  SDValue WideVal = GetWidenedVector(StVal);
+  SDLoc dl(N);
+
+  if (OpNo == 2 || getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+    Mask = GetWidenedVector(Mask);
+  else {
+    // The mask should be widened as well
+    EVT BoolVT = getSetCCResultType(WideVal.getValueType());
+    // We can't use ModifyToType() because we should fill the mask with
+    // zeroes
+    unsigned WidenNumElts = BoolVT.getVectorNumElements();
+    unsigned MaskNumElts = MaskVT.getVectorNumElements();
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, MaskVT);
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;
+
+    Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
+  }
+  assert(Mask.getValueType().getVectorNumElements() ==
+         WideVal.getValueType().getVectorNumElements() &&
+         "Mask and data vectors should have the same number of elements");
+  return DAG.getMaskedStore(MST->getChain(), dl, WideVal, MST->getBasePtr(),
+                            Mask, MST->getMemoryVT(), MST->getMemOperand(),
+                            false);
+}
+
 SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
   SDValue InOp0 = GetWidenedVector(N->getOperand(0));
   SDValue InOp1 = GetWidenedVector(N->getOperand(1));
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 8b9f618..3853ada 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -137,13 +137,9 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
 }
 
 // Helper for AddGlue to clone node operands.
-static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG,
-                                SmallVectorImpl<EVT> &VTs,
+static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG, ArrayRef<EVT> VTs,
                                 SDValue ExtraOper = SDValue()) {
-  SmallVector<SDValue, 8> Ops;
-  for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I)
-    Ops.push_back(N->getOperand(I));
-
+  SmallVector<SDValue, 8> Ops(N->op_begin(), N->op_end());
   if (ExtraOper.getNode())
     Ops.push_back(ExtraOper);
 
@@ -165,7 +161,6 @@ static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG,
 }
 
 static bool AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) {
-  SmallVector<EVT, 4> VTs;
   SDNode *GlueDestNode = Glue.getNode();
 
   // Don't add glue from a node to itself.
@@ -179,9 +174,7 @@ static bool AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) {
   // Don't add glue to something that already has a glue value.
   if (N->getValueType(N->getNumValues() - 1) == MVT::Glue) return false;
 
-  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
-    VTs.push_back(N->getValueType(I));
-
+  SmallVector<EVT, 4> VTs(N->value_begin(), N->value_end());
   if (AddGlue)
     VTs.push_back(MVT::Glue);
 
@@ -197,11 +190,8 @@ static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) {
           !N->hasAnyUseOfValue(N->getNumValues() - 1)) &&
          "expected an unused glue value");
 
-  SmallVector<EVT, 4> VTs;
-  for (unsigned I = 0, E = N->getNumValues()-1; I != E; ++I)
-    VTs.push_back(N->getValueType(I));
-
-  CloneNodeWithValues(N, DAG, VTs);
+  CloneNodeWithValues(N, DAG,
+                      makeArrayRef(N->value_begin(), N->getNumValues() - 1));
 }
 
 /// ClusterNeighboringLoads - Force nearby loads together by "gluing" them.
@@ -551,6 +541,14 @@ void ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs() {
     NodeNumDefs = 0;
     return;
   }
+  if (POpc == TargetOpcode::PATCHPOINT &&
+      Node->getValueType(0) == MVT::Other) {
+    // PATCHPOINT is defined to have one result, but it might really have none
+    // if we're not using CallingConv::AnyReg. Don't mistake the chain for a
+    // real definition.
+    NodeNumDefs = 0;
+    return;
+  }
   unsigned NRegDefs = SchedDAG->TII->get(Node->getMachineOpcode()).getNumDefs();
   // Some instructions define regs that are not represented in the selection DAG
   // (e.g. unused flags). See tMOVi8. Make sure we don't access past NumValues.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 7961e66..9466f4d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -234,10 +234,10 @@ bool ISD::allOperandsUndef(const SDNode *N) {
   return true;
 }
 
-ISD::NodeType ISD::getExtForLoadExtType(ISD::LoadExtType ExtType) {
+ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
   switch (ExtType) {
   case ISD::EXTLOAD:
-    return ISD::ANY_EXTEND;
+    return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND;
   case ISD::SEXTLOAD:
     return ISD::SIGN_EXTEND;
   case ISD::ZEXTLOAD:
@@ -1484,6 +1484,34 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
   if (N1.getOpcode() == ISD::UNDEF)
     commuteShuffle(N1, N2, MaskVec);
 
+  // If shuffling a splat, try to blend the splat instead. We do this here so
+  // that even when this arises during lowering we don't have to re-handle it.
+  auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
+    BitVector UndefElements;
+    SDValue Splat = BV->getSplatValue(&UndefElements);
+    if (!Splat)
+      return;
+
+    for (int i = 0; i < (int)NElts; ++i) {
+      if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + (int)NElts))
+        continue;
+
+      // If this input comes from undef, mark it as such.
+      if (UndefElements[MaskVec[i] - Offset]) {
+        MaskVec[i] = -1;
+        continue;
+      }
+
+      // If we can blend a non-undef lane, use that instead.
+      if (!UndefElements[i])
+        MaskVec[i] = i + Offset;
+    }
+  };
+  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+    BlendSplat(N1BV, 0);
+  if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
+    BlendSplat(N2BV, NElts);
+
   // Canonicalize all index into lhs, -> shuffle lhs, undef
   // Canonicalize all index into rhs, -> shuffle rhs, undef
   bool AllLHS = true, AllRHS = true;
@@ -1513,9 +1541,10 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
     return getUNDEF(VT);
 
   // If Identity shuffle return that node.
-  bool Identity = true;
+  bool Identity = true, AllSame = true;
   for (unsigned i = 0; i != NElts; ++i) {
     if (MaskVec[i] >= 0 && MaskVec[i] != (int)i) Identity = false;
+    if (MaskVec[i] != MaskVec[0]) AllSame = false;
   }
   if (Identity && NElts)
     return N1;
@@ -1537,18 +1566,35 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
       if (Splat && Splat.getOpcode() == ISD::UNDEF)
         return getUNDEF(VT);
 
+      bool SameNumElts =
+          V.getValueType().getVectorNumElements() == VT.getVectorNumElements();
+
       // We only have a splat which can skip shuffles if there is a splatted
       // value and no undef lanes rearranged by the shuffle.
       if (Splat && UndefElements.none()) {
         // Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
         // number of elements match or the value splatted is a zero constant.
-        if (V.getValueType().getVectorNumElements() ==
-            VT.getVectorNumElements())
+        if (SameNumElts)
           return N1;
         if (auto *C = dyn_cast<ConstantSDNode>(Splat))
           if (C->isNullValue())
             return N1;
       }
+
+      // If the shuffle itself creates a splat, build the vector directly.
+      if (AllSame && SameNumElts) {
+        const SDValue &Splatted = BV->getOperand(MaskVec[0]);
+        SmallVector<SDValue, 8> Ops(NElts, Splatted);
+
+        EVT BuildVT = BV->getValueType(0);
+        SDValue NewBV = getNode(ISD::BUILD_VECTOR, dl, BuildVT, Ops);
+
+        // We may have jumped through bitcasts, so the type of the
+        // BUILD_VECTOR may not match the type of the shuffle.
+        if (BuildVT != VT)
+          NewBV = getNode(ISD::BITCAST, dl, VT, NewBV);
+        return NewBV;
+      }
     }
   }
 
@@ -2323,6 +2369,21 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
     KnownZero = APInt::getHighBitsSet(BitWidth, Leaders);
     break;
   }
+  case ISD::EXTRACT_ELEMENT: {
+    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    const unsigned Index =
+      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    const unsigned BitWidth = Op.getValueType().getSizeInBits();
+
+    // Remove low part of known bits mask
+    KnownZero = KnownZero.getHiBits(KnownZero.getBitWidth() - Index * BitWidth);
+    KnownOne = KnownOne.getHiBits(KnownOne.getBitWidth() - Index * BitWidth);
+
+    // Remove high part of known bit mask
+    KnownZero = KnownZero.trunc(BitWidth);
+    KnownOne = KnownOne.trunc(BitWidth);
+    break;
+  }
   case ISD::FrameIndex:
   case ISD::TargetFrameIndex:
     if (unsigned Align = InferPtrAlignment(Op)) {
@@ -2522,6 +2583,21 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
     // FIXME: it's tricky to do anything useful for this, but it is an important
     // case for targets like X86.
     break;
+  case ISD::EXTRACT_ELEMENT: {
+    const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    const int BitWidth = Op.getValueType().getSizeInBits();
+    const int Items =
+      Op.getOperand(0).getValueType().getSizeInBits() / BitWidth;
+
+    // Get reverse index (starting from 1), Op1 value indexes elements from
+    // little end. Sign starts at big end.
+    const int rIndex = Items - 1 -
+      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+    // If the sign portion ends in our element the substraction gives correct
+    // result. Otherwise it gives either negative or > bitwidth result
+    return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
+  }
   }
 
   // If we are looking at the loaded value of the SDNode.
@@ -2683,6 +2759,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
       return getConstantFP(apf, VT);
     }
     case ISD::BITCAST:
+      if (VT == MVT::f16 && C->getValueType(0) == MVT::i16)
+        return getConstantFP(APFloat(APFloat::IEEEhalf, Val), VT);
       if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
         return getConstantFP(APFloat(APFloat::IEEEsingle, Val), VT);
       else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
@@ -2756,7 +2834,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
       return getConstant(api, VT);
     }
     case ISD::BITCAST:
-      if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
+      if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
+        return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), VT);
+      else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
         return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), VT);
       else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
         return getConstant(V.bitcastToAPInt().getZExtValue(), VT);
@@ -3379,8 +3459,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
   }
 
   // Perform trivial constant folding.
-  SDValue SV = FoldConstantArithmetic(Opcode, VT, N1.getNode(), N2.getNode());
-  if (SV.getNode()) return SV;
+  if (SDValue SV =
+          FoldConstantArithmetic(Opcode, VT, N1.getNode(), N2.getNode()))
+    return SV;
 
   // Canonicalize constant to RHS if commutative.
   if (N1C && !N2C && isCommutativeBinOp(Opcode)) {
@@ -3564,7 +3645,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
       const APFloat &V3 = N3CFP->getValueAPF();
       APFloat::opStatus s =
         V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven);
-      if (s != APFloat::opInvalidOp)
+      if (!TLI->hasFloatingPointExceptions() || s != APFloat::opInvalidOp)
         return getConstantFP(V1, VT);
     }
     break;
@@ -3913,9 +3994,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
   bool DstAlignCanChange = false;
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  bool OptSize =
-    MF.getFunction()->getAttributes().
-      hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
   if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
     DstAlignCanChange = true;
@@ -4028,8 +4107,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
   bool DstAlignCanChange = false;
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  bool OptSize = MF.getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
   if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
     DstAlignCanChange = true;
@@ -4123,8 +4201,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl,
   bool DstAlignCanChange = false;
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  bool OptSize = MF.getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
   if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
     DstAlignCanChange = true;
@@ -4214,11 +4291,13 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst,
 
   // Then check to see if we should lower the memcpy with target-specific
   // code. If the target chooses to do this, this is the next best.
-  SDValue Result =
-      TSI->EmitTargetCodeForMemcpy(*this, dl, Chain, Dst, Src, Size, Align,
-                                   isVol, AlwaysInline, DstPtrInfo, SrcPtrInfo);
-  if (Result.getNode())
-    return Result;
+  if (TSI) {
+    SDValue Result = TSI->EmitTargetCodeForMemcpy(
+        *this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline,
+        DstPtrInfo, SrcPtrInfo);
+    if (Result.getNode())
+      return Result;
+  }
 
   // If we really need inline code and the target declined to provide it,
   // use a (potentially long) sequence of loads and stores.
@@ -4280,10 +4359,12 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst,
 
   // Then check to see if we should lower the memmove with target-specific
   // code. If the target chooses to do this, this is the next best.
-  SDValue Result = TSI->EmitTargetCodeForMemmove(
-      *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo);
-  if (Result.getNode())
-    return Result;
+  if (TSI) {
+    SDValue Result = TSI->EmitTargetCodeForMemmove(
+        *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo);
+    if (Result.getNode())
+      return Result;
+  }
 
   // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
   // not be safe.  See memcpy above for more details.
@@ -4332,10 +4413,12 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst,
 
   // Then check to see if we should lower the memset with target-specific
   // code. If the target chooses to do this, this is the next best.
-  SDValue Result = TSI->EmitTargetCodeForMemset(*this, dl, Chain, Dst, Src,
-                                                Size, Align, isVol, DstPtrInfo);
-  if (Result.getNode())
-    return Result;
+  if (TSI) {
+    SDValue Result = TSI->EmitTargetCodeForMemset(
+        *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo);
+    if (Result.getNode())
+      return Result;
+  }
 
   // Emit a library call.
   Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(*getContext());
@@ -4680,10 +4763,10 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
     assert(VT.isInteger() == MemVT.isInteger() &&
            "Cannot convert from FP to Int or Int -> FP!");
     assert(VT.isVector() == MemVT.isVector() &&
-           "Cannot use trunc store to convert to or from a vector!");
+           "Cannot use an ext load to convert to or from a vector!");
     assert((!VT.isVector() ||
             VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
-           "Cannot use trunc store to change the number of vector elements!");
+           "Cannot use an ext load to change the number of vector elements!");
   }
 
   bool Indexed = AM != ISD::UNINDEXED;
@@ -4917,6 +5000,61 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base,
   return SDValue(N, 0);
 }
 
+SDValue
+SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain,
+                            SDValue Ptr, SDValue Mask, SDValue Src0, EVT MemVT,
+                            MachineMemOperand *MMO, ISD::LoadExtType ExtTy) {
+
+  SDVTList VTs = getVTList(VT, MVT::Other);
+  SDValue Ops[] = { Chain, Ptr, Mask, Src0 };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops);
+  ID.AddInteger(VT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(ExtTy, ISD::UNINDEXED,
+                                     MMO->isVolatile(),
+                                     MMO->isNonTemporal(),
+                                     MMO->isInvariant()));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void *IP = nullptr;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<MaskedLoadSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) MaskedLoadSDNode(dl.getIROrder(),
+                                             dl.getDebugLoc(), Ops, 4, VTs,
+                                             ExtTy, MemVT, MMO);
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
+                                     SDValue Ptr, SDValue Mask, EVT MemVT,
+                                     MachineMemOperand *MMO, bool isTrunc) {
+  assert(Chain.getValueType() == MVT::Other &&
+        "Invalid chain type");
+  EVT VT = Val.getValueType();
+  SDVTList VTs = getVTList(MVT::Other);
+  SDValue Ops[] = { Chain, Ptr, Mask, Val };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops);
+  ID.AddInteger(VT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(),
+                                     MMO->isNonTemporal(), MMO->isInvariant()));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void *IP = nullptr;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<MaskedStoreSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) MaskedStoreSDNode(dl.getIROrder(),
+                                                    dl.getDebugLoc(), Ops, 4,
+                                                    VTs, isTrunc, MemVT, MMO);
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+  return SDValue(N, 0);
+}
+
 SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl,
                                SDValue Chain, SDValue Ptr,
                                SDValue SV,
@@ -6495,11 +6633,25 @@ bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base,
     return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
   }
 
-  // Handle X+C
-  if (isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
-      cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
-    return true;
-
+  // Handle X + C.
+  if (isBaseWithConstantOffset(Loc)) {
+    int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+    if (Loc.getOperand(0) == BaseLoc) {
+      // If the base location is a simple address with no offset itself, then
+      // the second load's first add operand should be the base address.
+      if (LocOffset == Dist * (int)Bytes)
+        return true;
+    } else if (isBaseWithConstantOffset(BaseLoc)) {
+      // The base location itself has an offset, so subtract that value from the
+      // second load's offset before comparing to distance * size.
+      int64_t BOffset =
+        cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue();
+      if (Loc.getOperand(0) == BaseLoc.getOperand(0)) {
+        if ((LocOffset - BOffset) == Dist * (int)Bytes)
+          return true;
+      }
+    }
+  }
   const GlobalValue *GV1 = nullptr;
   const GlobalValue *GV2 = nullptr;
   int64_t Offset1 = 0;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 8f582f1..097b618 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -16,9 +16,11 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
@@ -46,6 +48,8 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -54,7 +58,6 @@
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetSelectionDAGInfo.h"
@@ -564,6 +567,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL,
   } else if (NumParts > 0) {
     // If the intermediate type was expanded, split each the value into
     // legal parts.
+    assert(NumIntermediates != 0 && "division by zero");
     assert(NumParts % NumIntermediates == 0 &&
            "Must expand into a divisible number of parts!");
     unsigned Factor = NumParts / NumIntermediates;
@@ -865,7 +869,7 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
   AA = &aa;
   GFI = gfi;
   LibInfo = li;
-  DL = DAG.getSubtarget().getDataLayout();
+  DL = DAG.getTarget().getDataLayout();
   Context = DAG.getContext();
   LPadToCallSiteMap.clear();
 }
@@ -884,6 +888,7 @@ void SelectionDAGBuilder::clear() {
   CurInst = nullptr;
   HasTailCall = false;
   SDNodeOrder = LowestSDNodeOrder;
+  StatepointLowering.clear();
 }
 
 /// clearDanglingDebugInfo - Clear the dangling debug information
@@ -1234,24 +1239,29 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
     unsigned NumValues = ValueVTs.size();
     if (NumValues) {
       SDValue RetOp = getValue(I.getOperand(0));
-      for (unsigned j = 0, f = NumValues; j != f; ++j) {
-        EVT VT = ValueVTs[j];
 
-        ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+      const Function *F = I.getParent()->getParent();
+
+      ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+      if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                          Attribute::SExt))
+        ExtendKind = ISD::SIGN_EXTEND;
+      else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                               Attribute::ZExt))
+        ExtendKind = ISD::ZERO_EXTEND;
+
+      LLVMContext &Context = F->getContext();
+      bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                                      Attribute::InReg);
 
-        const Function *F = I.getParent()->getParent();
-        if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
-                                            Attribute::SExt))
-          ExtendKind = ISD::SIGN_EXTEND;
-        else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
-                                                 Attribute::ZExt))
-          ExtendKind = ISD::ZERO_EXTEND;
+      for (unsigned j = 0; j != NumValues; ++j) {
+        EVT VT = ValueVTs[j];
 
         if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
-          VT = TLI.getTypeForExtArgOrReturn(*DAG.getContext(), VT, ExtendKind);
+          VT = TLI.getTypeForExtArgOrReturn(Context, VT, ExtendKind);
 
-        unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), VT);
-        MVT PartVT = TLI.getRegisterType(*DAG.getContext(), VT);
+        unsigned NumParts = TLI.getNumRegisters(Context, VT);
+        MVT PartVT = TLI.getRegisterType(Context, VT);
         SmallVector<SDValue, 4> Parts(NumParts);
         getCopyToParts(DAG, getCurSDLoc(),
                        SDValue(RetOp.getNode(), RetOp.getResNo() + j),
@@ -1259,8 +1269,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
 
         // 'inreg' on function refers to return value
         ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
-        if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
-                                            Attribute::InReg))
+        if (RetInReg)
           Flags.setInReg();
 
         // Propagate extension type if any
@@ -1405,7 +1414,7 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
         if (TM.Options.NoNaNsFPMath)
           Condition = getFCmpCodeWithoutNaN(Condition);
       } else {
-        Condition = ISD::SETEQ; // silence warning.
+        (void)Condition; // silence warning.
         llvm_unreachable("Unknown compare instruction");
       }
 
@@ -1947,7 +1956,7 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
   SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(),
                                        Reg, VT);
   SDValue Cmp;
-  unsigned PopCount = CountPopulation_64(B.Mask);
+  unsigned PopCount = countPopulation(B.Mask);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (PopCount == 1) {
     // Testing for a single bit; just compare the shift count with what it
@@ -1959,7 +1968,7 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
     // There is only one zero bit in the range, test for it directly.
     Cmp = DAG.getSetCC(
         getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), VT), ShiftOp,
-        DAG.getConstant(CountTrailingOnes_64(B.Mask), VT), ISD::SETNE);
+        DAG.getConstant(countTrailingOnes(B.Mask), VT), ISD::SETNE);
   } else {
     // Make desired shift
     SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurSDLoc(), VT,
@@ -2062,10 +2071,14 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
   // Get the two live-in registers as SDValues. The physregs have already been
   // copied into virtual registers.
   SDValue Ops[2];
-  Ops[0] = DAG.getZExtOrTrunc(
-      DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
-                         FuncInfo.ExceptionPointerVirtReg, TLI.getPointerTy()),
-      getCurSDLoc(), ValueVTs[0]);
+  if (FuncInfo.ExceptionPointerVirtReg) {
+    Ops[0] = DAG.getZExtOrTrunc(
+        DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
+                           FuncInfo.ExceptionPointerVirtReg, TLI.getPointerTy()),
+        getCurSDLoc(), ValueVTs[0]);
+  } else {
+    Ops[0] = DAG.getConstant(0, TLI.getPointerTy());
+  }
   Ops[1] = DAG.getZExtOrTrunc(
       DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
                          FuncInfo.ExceptionSelectorVirtReg, TLI.getPointerTy()),
@@ -2077,6 +2090,27 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
   setValue(&LP, Res);
 }
 
+unsigned
+SelectionDAGBuilder::visitLandingPadClauseBB(GlobalValue *ClauseGV,
+                                             MachineBasicBlock *LPadBB) {
+  SDValue Chain = getControlRoot();
+
+  // Get the typeid that we will dispatch on later.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy());
+  unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC);
+  unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(ClauseGV);
+  SDValue Sel = DAG.getConstant(TypeID, TLI.getPointerTy());
+  Chain = DAG.getCopyToReg(Chain, getCurSDLoc(), VReg, Sel);
+
+  // Branch to the main landing pad block.
+  MachineBasicBlock *ClauseMBB = FuncInfo.MBB;
+  ClauseMBB->addSuccessor(LPadBB);
+  DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, Chain,
+                          DAG.getBasicBlock(LPadBB)));
+  return VReg;
+}
+
 /// handleSmallSwitchCaseRange - Emit a series of specific tests (suitable for
 /// small case ranges).
 bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
@@ -2363,17 +2397,8 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
                                                   CaseRecVector& WorkList,
                                                   const Value* SV,
                                                   MachineBasicBlock* SwitchBB) {
-  // Get the MachineFunction which holds the current MBB.  This is used when
-  // inserting any additional MBBs necessary to represent the switch.
-  MachineFunction *CurMF = FuncInfo.MF;
-
-  // Figure out which block is immediately after the current one.
-  MachineFunction::iterator BBI = CR.CaseBB;
-  ++BBI;
-
   Case& FrontCase = *CR.Range.first;
   Case& BackCase  = *(CR.Range.second-1);
-  const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
 
   // Size is the number of Cases represented by this range.
   unsigned Size = CR.Range.second - CR.Range.first;
@@ -2395,6 +2420,7 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
   DEBUG(dbgs() << "Selecting best pivot: \n"
                << "First: " << First << ", Last: " << Last <<'\n'
                << "LSize: " << LSize << ", RSize: " << RSize << '\n');
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   for (CaseItr I = CR.Range.first, J=I+1, E = CR.Range.second;
        J!=E; ++I, ++J) {
     const APInt &LEnd = cast<ConstantInt>(I->High)->getValue();
@@ -2404,13 +2430,17 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
            "Invalid case distance");
     // Use volatile double here to avoid excess precision issues on some hosts,
     // e.g. that use 80-bit X87 registers.
+    // Only consider the density of sub-ranges that actually have sufficient
+    // entries to be lowered as a jump table.
     volatile double LDensity =
-       (double)LSize.roundToDouble() /
-                           (LEnd - First + 1ULL).roundToDouble();
+        LSize.ult(TLI.getMinimumJumpTableEntries())
+            ? 0.0
+            : LSize.roundToDouble() / (LEnd - First + 1ULL).roundToDouble();
     volatile double RDensity =
-      (double)RSize.roundToDouble() /
-                           (Last - RBegin + 1ULL).roundToDouble();
-    volatile double Metric = Range.logBase2()*(LDensity+RDensity);
+        RSize.ult(TLI.getMinimumJumpTableEntries())
+            ? 0.0
+            : RSize.roundToDouble() / (Last - RBegin + 1ULL).roundToDouble();
+    volatile double Metric = Range.logBase2() * (LDensity + RDensity);
     // Should always split in some non-trivial place
     DEBUG(dbgs() <<"=>Step\n"
                  << "LEnd: " << LEnd << ", RBegin: " << RBegin << '\n'
@@ -2427,13 +2457,25 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
     RSize -= J->size();
   }
 
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (areJTsAllowed(TLI)) {
-    // If our case is dense we *really* should handle it earlier!
-    assert((FMetric > 0) && "Should handle dense range earlier!");
-  } else {
+  if (FMetric == 0 || !areJTsAllowed(TLI))
     Pivot = CR.Range.first + Size/2;
-  }
+  splitSwitchCase(CR, Pivot, WorkList, SV, SwitchBB);
+  return true;
+}
+
+void SelectionDAGBuilder::splitSwitchCase(CaseRec &CR, CaseItr Pivot,
+                                          CaseRecVector &WorkList,
+                                          const Value *SV,
+                                          MachineBasicBlock *SwitchBB) {
+  // Get the MachineFunction which holds the current MBB.  This is used when
+  // inserting any additional MBBs necessary to represent the switch.
+  MachineFunction *CurMF = FuncInfo.MF;
+
+  // Figure out which block is immediately after the current one.
+  MachineFunction::iterator BBI = CR.CaseBB;
+  ++BBI;
+
+  const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
 
   CaseRange LHSR(CR.Range.first, Pivot);
   CaseRange RHSR(Pivot, CR.Range.second);
@@ -2446,10 +2488,9 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
   // LHS's Case Value, and that Case Value is exactly one less than the
   // Pivot's Value, then we can branch directly to the LHS's Target,
   // rather than creating a leaf node for it.
-  if ((LHSR.second - LHSR.first) == 1 &&
-      LHSR.first->High == CR.GE &&
+  if ((LHSR.second - LHSR.first) == 1 && LHSR.first->High == CR.GE &&
       cast<ConstantInt>(C)->getValue() ==
-      (cast<ConstantInt>(CR.GE)->getValue() + 1LL)) {
+          (cast<ConstantInt>(CR.GE)->getValue() + 1LL)) {
     TrueBB = LHSR.first->BB;
   } else {
     TrueBB = CurMF->CreateMachineBasicBlock(LLVMBB);
@@ -2466,12 +2507,12 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
   // the current Case Value, rather than emitting a RHS leaf node for it.
   if ((RHSR.second - RHSR.first) == 1 && CR.LT &&
       cast<ConstantInt>(RHSR.first->Low)->getValue() ==
-      (cast<ConstantInt>(CR.LT)->getValue() - 1LL)) {
+          (cast<ConstantInt>(CR.LT)->getValue() - 1LL)) {
     FalseBB = RHSR.first->BB;
   } else {
     FalseBB = CurMF->CreateMachineBasicBlock(LLVMBB);
     CurMF->insert(BBI, FalseBB);
-    WorkList.push_back(CaseRec(FalseBB,CR.LT,C,RHSR));
+    WorkList.push_back(CaseRec(FalseBB, CR.LT, C, RHSR));
 
     // Put SV in a virtual register to make it available from the new blocks.
     ExportFromCurrentBlock(SV);
@@ -2486,8 +2527,6 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
     visitSwitchCase(CB, SwitchBB);
   else
     SwitchCases.push_back(CB);
-
-  return true;
 }
 
 /// handleBitTestsSwitchCase - if current case range has few destination and
@@ -2514,15 +2553,14 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
     return false;
 
   size_t numCmps = 0;
-  for (CaseItr I = CR.Range.first, E = CR.Range.second;
-       I!=E; ++I) {
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) {
     // Single case counts one, case range - two.
     numCmps += (I->Low == I->High ? 1 : 2);
   }
 
   // Count unique destinations
   SmallSet<MachineBasicBlock*, 4> Dests;
-  for (CaseItr I = CR.Range.first, E = CR.Range.second; I!=E; ++I) {
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) {
     Dests.insert(I->BB);
     if (Dests.size() > 3)
       // Don't bother the code below, if there are too much unique destinations
@@ -2629,9 +2667,8 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
 void SelectionDAGBuilder::Clusterify(CaseVector& Cases,
                                      const SwitchInst& SI) {
   BranchProbabilityInfo *BPI = FuncInfo.BPI;
-  // Start with "simple" cases
-  for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
-       i != e; ++i) {
+  // Start with "simple" cases.
+  for (SwitchInst::ConstCaseIt i : SI.cases()) {
     const BasicBlock *SuccBB = i.getCaseSuccessor();
     MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
 
@@ -2694,32 +2731,58 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
 
   // Figure out which block is immediately after the current one.
   MachineBasicBlock *NextBlock = nullptr;
+  if (SwitchMBB + 1 != FuncInfo.MF->end())
+    NextBlock = SwitchMBB + 1;
+
+
+  // Create a vector of Cases, sorted so that we can efficiently create a binary
+  // search tree from them.
+  CaseVector Cases;
+  Clusterify(Cases, SI);
+
+  // Get the default destination MBB.
   MachineBasicBlock *Default = FuncInfo.MBBMap[SI.getDefaultDest()];
 
-  // If there is only the default destination, branch to it if it is not the
-  // next basic block.  Otherwise, just fall through.
-  if (!SI.getNumCases()) {
-    // Update machine-CFG edges.
+  if (isa<UnreachableInst>(SI.getDefaultDest()->getFirstNonPHIOrDbg()) &&
+      !Cases.empty()) {
+    // Replace an unreachable default destination with the most popular case
+    // destination.
+    DenseMap<const BasicBlock *, unsigned> Popularity;
+    unsigned MaxPop = 0;
+    const BasicBlock *MaxBB = nullptr;
+    for (auto I : SI.cases()) {
+      const BasicBlock *BB = I.getCaseSuccessor();
+      if (++Popularity[BB] > MaxPop) {
+        MaxPop = Popularity[BB];
+        MaxBB = BB;
+      }
+    }
 
-    // If this is not a fall-through branch, emit the branch.
+    // Set new default.
+    assert(MaxPop > 0);
+    assert(MaxBB);
+    Default = FuncInfo.MBBMap[MaxBB];
+
+    // Remove cases that were pointing to the destination that is now the default.
+    Cases.erase(std::remove_if(Cases.begin(), Cases.end(),
+                               [&](const Case &C) { return C.BB == Default; }),
+                Cases.end());
+  }
+
+  // If there is only the default destination, go there directly.
+  if (Cases.empty()) {
+    // Update machine-CFG edges.
     SwitchMBB->addSuccessor(Default);
-    if (Default != NextBlock)
-      DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
-                              MVT::Other, getControlRoot(),
-                              DAG.getBasicBlock(Default)));
 
+    // If this is not a fall-through branch, emit the branch.
+    if (Default != NextBlock) {
+      DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
+                              getControlRoot(), DAG.getBasicBlock(Default)));
+    }
     return;
   }
 
-  // If there are any non-default case statements, create a vector of Cases
-  // representing each one, and sort the vector so that we can efficiently
-  // create a binary search tree from them.
-  CaseVector Cases;
-  Clusterify(Cases, SI);
-
-  // Get the Value to be switched on and default basic blocks, which will be
-  // inserted into CaseBlock records, representing basic blocks in the binary
-  // search tree.
+  // Get the Value to be switched on.
   const Value *SV = SI.getCondition();
 
   // Push the initial CaseRec onto the worklist
@@ -3613,6 +3676,74 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
   DAG.setRoot(StoreNode);
 }
 
+void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
+  SDLoc sdl = getCurSDLoc();
+
+  // llvm.masked.store.*(Src0, Ptr, alignemt, Mask)
+  Value  *PtrOperand = I.getArgOperand(1);
+  SDValue Ptr = getValue(PtrOperand);
+  SDValue Src0 = getValue(I.getArgOperand(0));
+  SDValue Mask = getValue(I.getArgOperand(3));
+  EVT VT = Src0.getValueType();
+  unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
+  if (!Alignment)
+    Alignment = DAG.getEVTAlignment(VT);
+
+  AAMDNodes AAInfo;
+  I.getAAMetadata(AAInfo);
+
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction().
+    getMachineMemOperand(MachinePointerInfo(PtrOperand),
+                          MachineMemOperand::MOStore,  VT.getStoreSize(),
+                          Alignment, AAInfo);
+  SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
+                                         MMO, false);
+  DAG.setRoot(StoreNode);
+  setValue(&I, StoreNode);
+}
+
+void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
+  SDLoc sdl = getCurSDLoc();
+
+  // @llvm.masked.load.*(Ptr, alignment, Mask, Src0)
+  Value  *PtrOperand = I.getArgOperand(0);
+  SDValue Ptr = getValue(PtrOperand);
+  SDValue Src0 = getValue(I.getArgOperand(3));
+  SDValue Mask = getValue(I.getArgOperand(2));
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = TLI.getValueType(I.getType());
+  unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue();
+  if (!Alignment)
+    Alignment = DAG.getEVTAlignment(VT);
+
+  AAMDNodes AAInfo;
+  I.getAAMetadata(AAInfo);
+  const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
+
+  SDValue InChain = DAG.getRoot();
+  if (AA->pointsToConstantMemory(
+      AliasAnalysis::Location(PtrOperand,
+                              AA->getTypeStoreSize(I.getType()),
+                              AAInfo))) {
+    // Do not serialize (non-volatile) loads of constant memory with anything.
+    InChain = DAG.getEntryNode();
+  }
+
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction().
+    getMachineMemOperand(MachinePointerInfo(PtrOperand),
+                          MachineMemOperand::MOLoad,  VT.getStoreSize(),
+                          Alignment, AAInfo, Ranges);
+
+  SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
+                                   ISD::NON_EXTLOAD);
+  SDValue OutChain = Load.getValue(1);
+  DAG.setRoot(OutChain);
+  setValue(&I, Load);
+}
+
 void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
   SDLoc dl = getCurSDLoc();
   AtomicOrdering SuccessOrder = I.getSuccessOrdering();
@@ -4460,11 +4591,10 @@ static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS,
       return DAG.getConstantFP(1.0, LHS.getValueType());
 
     const Function *F = DAG.getMachineFunction().getFunction();
-    if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                         Attribute::OptimizeForSize) ||
+    if (!F->hasFnAttribute(Attribute::OptimizeForSize) ||
         // If optimizing for size, don't insert too many multiplies.  This
         // inserts up to 5 multiplies.
-        CountPopulation_32(Val)+Log2_32(Val) < 7) {
+        countPopulation(Val) + Log2_32(Val) < 7) {
       // We use the simple binary decomposition method to generate the multiply
       // sequence.  There are more optimal ways to do this (for example,
       // powi(x,15) generates one more multiply than it should), but this has
@@ -4623,7 +4753,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return nullptr;
   case Intrinsic::read_register: {
     Value *Reg = I.getArgOperand(0);
-    SDValue RegName = DAG.getMDNode(cast<MDNode>(Reg));
+    SDValue RegName =
+        DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
     EVT VT = TLI.getValueType(I.getType());
     setValue(&I, DAG.getNode(ISD::READ_REGISTER, sdl, VT, RegName));
     return nullptr;
@@ -4632,7 +4763,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     Value *Reg = I.getArgOperand(0);
     Value *RegValue = I.getArgOperand(1);
     SDValue Chain = getValue(RegValue).getOperand(0);
-    SDValue RegName = DAG.getMDNode(cast<MDNode>(Reg));
+    SDValue RegName =
+        DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
     DAG.setRoot(DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other, Chain,
                             RegName, getValue(RegValue)));
     return nullptr;
@@ -4642,6 +4774,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::longjmp:
     return &"_longjmp"[!TLI.usesUnderscoreLongJmp()];
   case Intrinsic::memcpy: {
+    // FIXME: this definition of "user defined address space" is x86-specific
     // Assert for address < 256 since we support only user defined address
     // spaces.
     assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
@@ -4662,6 +4795,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return nullptr;
   }
   case Intrinsic::memset: {
+    // FIXME: this definition of "user defined address space" is x86-specific
     // Assert for address < 256 since we support only user defined address
     // spaces.
     assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
@@ -4679,6 +4813,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return nullptr;
   }
   case Intrinsic::memmove: {
+    // FIXME: this definition of "user defined address space" is x86-specific
     // Assert for address < 256 since we support only user defined address
     // spaces.
     assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
@@ -4914,6 +5049,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return nullptr;
   }
 
+  case Intrinsic::masked_load:
+    visitMaskedLoad(I);
+    return nullptr;
+  case Intrinsic::masked_store:
+    visitMaskedStore(I);
+    return nullptr;
   case Intrinsic::x86_mmx_pslli_w:
   case Intrinsic::x86_mmx_pslli_d:
   case Intrinsic::x86_mmx_pslli_q:
@@ -5459,6 +5600,78 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     visitPatchpoint(&I);
     return nullptr;
   }
+  case Intrinsic::experimental_gc_statepoint: {
+    visitStatepoint(I);
+    return nullptr;
+  }
+  case Intrinsic::experimental_gc_result_int:
+  case Intrinsic::experimental_gc_result_float:
+  case Intrinsic::experimental_gc_result_ptr:
+  case Intrinsic::experimental_gc_result: {
+    visitGCResult(I);
+    return nullptr;
+  }
+  case Intrinsic::experimental_gc_relocate: {
+    visitGCRelocate(I);
+    return nullptr;
+  }
+  case Intrinsic::instrprof_increment:
+    llvm_unreachable("instrprof failed to lower an increment");
+
+  case Intrinsic::frameallocate: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
+
+    // Do the allocation and map it as a normal value.
+    // FIXME: Maybe we should add this to the alloca map so that we don't have
+    // to register allocate it?
+    uint64_t Size = cast<ConstantInt>(I.getArgOperand(0))->getZExtValue();
+    int Alloc = MF.getFrameInfo()->CreateFrameAllocation(Size);
+    MVT PtrVT = TLI.getPointerTy(0);
+    SDValue FIVal = DAG.getFrameIndex(Alloc, PtrVT);
+    setValue(&I, FIVal);
+
+    // Directly emit a FRAME_ALLOC machine instr. Label assignment emission is
+    // the same on all targets.
+    MCSymbol *FrameAllocSym =
+        MF.getMMI().getContext().getOrCreateFrameAllocSymbol(MF.getName());
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
+            TII->get(TargetOpcode::FRAME_ALLOC))
+        .addSym(FrameAllocSym)
+        .addFrameIndex(Alloc);
+
+    return nullptr;
+  }
+
+  case Intrinsic::framerecover: {
+    // i8* @llvm.framerecover(i8* %fn, i8* %fp)
+    MachineFunction &MF = DAG.getMachineFunction();
+    MVT PtrVT = TLI.getPointerTy(0);
+
+    // Get the symbol that defines the frame offset.
+    Function *Fn = cast<Function>(I.getArgOperand(0)->stripPointerCasts());
+    MCSymbol *FrameAllocSym =
+        MF.getMMI().getContext().getOrCreateFrameAllocSymbol(Fn->getName());
+
+    // Create a TargetExternalSymbol for the label to avoid any target lowering
+    // that would make this PC relative.
+    StringRef Name = FrameAllocSym->getName();
+    assert(Name.size() == strlen(Name.data()) && "not null terminated");
+    SDValue OffsetSym = DAG.getTargetExternalSymbol(Name.data(), PtrVT);
+    SDValue OffsetVal =
+        DAG.getNode(ISD::FRAME_ALLOC_RECOVER, sdl, PtrVT, OffsetSym);
+
+    // Add the offset to the FP.
+    Value *FP = I.getArgOperand(1);
+    SDValue FPVal = getValue(FP);
+    SDValue Add = DAG.getNode(ISD::ADD, sdl, PtrVT, FPVal, OffsetVal);
+    setValue(&I, Add);
+
+    return nullptr;
+  }
+  case Intrinsic::eh_begincatch:
+  case Intrinsic::eh_endcatch:
+    llvm_unreachable("begin/end catch intrinsics not lowered in codegen");
   }
 }
 
@@ -5491,9 +5704,8 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
 
     CLI.setChain(getRoot());
   }
-
-  const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
-  std::pair<SDValue, SDValue> Result = TLI->LowerCallTo(CLI);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
 
   assert((CLI.IsTailCall || Result.second.getNode()) &&
          "Non-null chain expected with non-tail call!");
@@ -6191,9 +6403,10 @@ static void GetRegistersForValue(SelectionDAG &DAG,
 
   // If this is a constraint for a single physreg, or a constraint for a
   // register class, find it.
-  std::pair<unsigned, const TargetRegisterClass*> PhysReg =
-    TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
-                                     OpInfo.ConstraintVT);
+  std::pair<unsigned, const TargetRegisterClass *> PhysReg =
+      TLI.getRegForInlineAsmConstraint(MF.getSubtarget().getRegisterInfo(),
+                                       OpInfo.ConstraintCode,
+                                       OpInfo.ConstraintVT);
 
   unsigned NumRegs = 1;
   if (OpInfo.ConstraintVT != MVT::Other) {
@@ -6289,8 +6502,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
   SDISelAsmOperandInfoVector ConstraintOperands;
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  TargetLowering::AsmOperandInfoVector
-    TargetConstraints = TLI.ParseConstraints(CS);
+  TargetLowering::AsmOperandInfoVector TargetConstraints =
+      TLI.ParseConstraints(DAG.getSubtarget().getRegisterInfo(), CS);
 
   bool hasMemory = false;
 
@@ -6382,12 +6595,13 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
       SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
 
       if (OpInfo.ConstraintVT != Input.ConstraintVT) {
-        std::pair<unsigned, const TargetRegisterClass*> MatchRC =
-          TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
-                                            OpInfo.ConstraintVT);
-        std::pair<unsigned, const TargetRegisterClass*> InputRC =
-          TLI.getRegForInlineAsmConstraint(Input.ConstraintCode,
-                                            Input.ConstraintVT);
+	const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+        std::pair<unsigned, const TargetRegisterClass *> MatchRC =
+            TLI.getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
+                                             OpInfo.ConstraintVT);
+        std::pair<unsigned, const TargetRegisterClass *> InputRC =
+            TLI.getRegForInlineAsmConstraint(TRI, Input.ConstraintCode,
+                                             Input.ConstraintVT);
         if ((OpInfo.ConstraintVT.isInteger() !=
              Input.ConstraintVT.isInteger()) ||
             (MatchRC.second != InputRC.second)) {
@@ -6848,7 +7062,8 @@ std::pair<SDValue, SDValue>
 SelectionDAGBuilder::lowerCallOperands(ImmutableCallSite CS, unsigned ArgIdx,
                                        unsigned NumArgs, SDValue Callee,
                                        bool UseVoidTy,
-                                       MachineBasicBlock *LandingPad) {
+                                       MachineBasicBlock *LandingPad,
+                                       bool IsPatchPoint) {
   TargetLowering::ArgListTy Args;
   Args.reserve(NumArgs);
 
@@ -6871,7 +7086,7 @@ SelectionDAGBuilder::lowerCallOperands(ImmutableCallSite CS, unsigned ArgIdx,
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot())
     .setCallee(CS.getCallingConv(), retTy, Callee, std::move(Args), NumArgs)
-    .setDiscardResult(CS->use_empty());
+    .setDiscardResult(CS->use_empty()).setIsPatchPoint(IsPatchPoint);
 
   return lowerInvokable(CLI, LandingPad);
 }
@@ -7003,7 +7218,7 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
   unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
   std::pair<SDValue, SDValue> Result =
     lowerCallOperands(CS, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC,
-                      LandingPad);
+                      LandingPad, true);
 
   SDNode *CallEnd = Result.second.getNode();
   if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
@@ -7051,8 +7266,7 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
 
   // Push the arguments from the call instruction up to the register mask.
   SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1;
-  for (SDNode::op_iterator i = Call->op_begin()+2; i != e; ++i)
-    Ops.push_back(*i);
+  Ops.append(Call->op_begin() + 2, e);
 
   // Push live variables for the stack map.
   addStackMapLiveVars(CS, NumMetaOpers + NumArgs, Ops, *this);
@@ -7251,11 +7465,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
       }
       if (Args[i].isNest)
         Flags.setNest();
-      if (NeedsRegBlock) {
+      if (NeedsRegBlock)
         Flags.setInConsecutiveRegs();
-        if (Value == NumValues - 1)
-          Flags.setInConsecutiveRegsLast();
-      }
       Flags.setOrigAlign(OriginalAlignment);
 
       MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT);
@@ -7304,6 +7515,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
         CLI.Outs.push_back(MyFlags);
         CLI.OutVals.push_back(Parts[j]);
       }
+
+      if (NeedsRegBlock && Value == NumValues - 1)
+        CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast();
     }
   }
 
@@ -7460,7 +7674,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
     ISD::ArgFlagsTy Flags;
     Flags.setSRet();
     MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
-    ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true, 0, 0);
+    ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true,
+                         ISD::InputArg::NoArgIndex, 0);
     Ins.push_back(RetArg);
   }
 
@@ -7518,11 +7733,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       }
       if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
         Flags.setNest();
-      if (NeedsRegBlock) {
+      if (NeedsRegBlock)
         Flags.setInConsecutiveRegs();
-        if (Value == NumValues - 1)
-          Flags.setInConsecutiveRegsLast();
-      }
       Flags.setOrigAlign(OriginalAlignment);
 
       MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
@@ -7537,6 +7749,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
           MyFlags.Flags.setOrigAlign(1);
         Ins.push_back(MyFlags);
       }
+      if (NeedsRegBlock && Value == NumValues - 1)
+        Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast();
       PartBase += VT.getStoreSize();
     }
   }
@@ -7671,7 +7885,6 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
   assert(i == InVals.size() && "Argument register count mismatch!");
 
   // Finally, if the target has anything special to do, allow it to do so.
-  // FIXME: this should insert code into the DAG!
   EmitFunctionEntryCode();
 }
 
@@ -7762,6 +7975,7 @@ MachineBasicBlock *
 SelectionDAGBuilder::StackProtectorDescriptor::
 AddSuccessorMBB(const BasicBlock *BB,
                 MachineBasicBlock *ParentMBB,
+                bool IsLikely,
                 MachineBasicBlock *SuccMBB) {
   // If SuccBB has not been created yet, create it.
   if (!SuccMBB) {
@@ -7771,6 +7985,7 @@ AddSuccessorMBB(const BasicBlock *BB,
     MF->insert(++BBI, SuccMBB);
   }
   // Add it as a successor of ParentMBB.
-  ParentMBB->addSuccessor(SuccMBB);
+  ParentMBB->addSuccessor(
+      SuccMBB, BranchProbabilityInfo::getBranchWeightStackProtector(IsLikely));
   return SuccMBB;
 }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index f74e652..ad7411f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -14,11 +14,13 @@
 #ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
 #define LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
 
+#include "StatepointLowering.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/IR/CallSite.h"
+#include "llvm/IR/Statepoint.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetLowering.h"
@@ -115,6 +117,10 @@ public:
   /// get simple disambiguation between loads without worrying about alias
   /// analysis.
   SmallVector<SDValue, 8> PendingLoads;
+
+  /// State used while lowering a statepoint sequence (gc_statepoint,
+  /// gc_relocate, and gc_result).  See StatepointLowering.hpp/cpp for details.
+  StatepointLoweringState StatepointLowering;
 private:
 
   /// PendingExports - CopyToReg nodes that copy values to virtual registers
@@ -417,8 +423,8 @@ private:
       assert(!shouldEmitStackProtector() && "Stack Protector Descriptor is "
              "already initialized!");
       ParentMBB = MBB;
-      SuccessMBB = AddSuccessorMBB(BB, MBB);
-      FailureMBB = AddSuccessorMBB(BB, MBB, FailureMBB);
+      SuccessMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ true);
+      FailureMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ false, FailureMBB);
       if (!Guard)
         Guard = StackProtCheckCall.getArgOperand(0);
     }
@@ -487,9 +493,10 @@ private:
 
     /// Add a successor machine basic block to ParentMBB. If the successor mbb
     /// has not been created yet (i.e. if SuccMBB = 0), then the machine basic
-    /// block will be created.
+    /// block will be created. Assign a large weight if IsLikely is true.
     MachineBasicBlock *AddSuccessorMBB(const BasicBlock *BB,
                                        MachineBasicBlock *ParentMBB,
+                                       bool IsLikely,
                                        MachineBasicBlock *SuccMBB = nullptr);
   };
 
@@ -612,6 +619,13 @@ public:
     N = NewN;
   }
 
+  void removeValue(const Value *V) {
+    // This is to support hack in lowerCallFromStatepoint
+    // Should be removed when hack is resolved
+    if (NodeMap.count(V))
+      NodeMap.erase(V);
+  }
+
   void setUnusedArgValue(const Value *V, SDValue NewN) {
     SDValue &N = UnusedArgNodeMap[V];
     assert(!N.getNode() && "Already set a value for this node!");
@@ -640,12 +654,15 @@ public:
           unsigned NumArgs,
           SDValue Callee,
           bool UseVoidTy = false,
-          MachineBasicBlock *LandingPad = nullptr);
+          MachineBasicBlock *LandingPad = nullptr,
+          bool IsPatchPoint = false);
 
   /// UpdateSplitBlock - When an MBB was split during scheduling, update the
   /// references that need to refer to the last resulting block.
   void UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last);
 
+  // This function is responsible for the whole statepoint lowering process.
+  void LowerStatepoint(ImmutableStatepoint Statepoint);
 private:
   std::pair<SDValue, SDValue> lowerInvokable(
           TargetLowering::CallLoweringInfo &CLI,
@@ -673,6 +690,8 @@ private:
                                CaseRecVector& WorkList,
                                const Value* SV,
                                MachineBasicBlock *SwitchBB);
+  void splitSwitchCase(CaseRec &CR, CaseItr Pivot, CaseRecVector &WorkList,
+                       const Value *SV, MachineBasicBlock *SwitchBB);
   bool handleBitTestsSwitchCase(CaseRec& CR,
                                 CaseRecVector& WorkList,
                                 const Value* SV,
@@ -699,6 +718,8 @@ public:
   void visitJumpTable(JumpTable &JT);
   void visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH,
                             MachineBasicBlock *SwitchBB);
+  unsigned visitLandingPadClauseBB(GlobalValue *ClauseGV,
+                                   MachineBasicBlock *LPadMBB);
 
 private:
   // These all get lowered before this pass.
@@ -756,6 +777,8 @@ private:
   void visitAlloca(const AllocaInst &I);
   void visitLoad(const LoadInst &I);
   void visitStore(const StoreInst &I);
+  void visitMaskedLoad(const CallInst &I);
+  void visitMaskedStore(const CallInst &I);
   void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
   void visitAtomicRMW(const AtomicRMWInst &I);
   void visitFence(const FenceInst &I);
@@ -784,6 +807,11 @@ private:
   void visitPatchpoint(ImmutableCallSite CS,
                        MachineBasicBlock *LandingPad = nullptr);
 
+  // These three are implemented in StatepointLowering.cpp
+  void visitStatepoint(const CallInst &I);
+  void visitGCRelocate(const CallInst &I);
+  void visitGCResult(const CallInst &I);
+
   void visitUserOp1(const Instruction &I) {
     llvm_unreachable("UserOp1 should not exist at instruction selection time!");
   }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index c9f6cff..17eff94 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -187,6 +187,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FMUL:                       return "fmul";
   case ISD::FDIV:                       return "fdiv";
   case ISD::FMA:                        return "fma";
+  case ISD::FMAD:                       return "fmad";
   case ISD::FREM:                       return "frem";
   case ISD::FCOPYSIGN:                  return "fcopysign";
   case ISD::FGETSIGN:                   return "fgetsign";
@@ -269,6 +270,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
     // Other operators
   case ISD::LOAD:                       return "load";
   case ISD::STORE:                      return "store";
+  case ISD::MLOAD:                      return "masked_load";
+  case ISD::MSTORE:                     return "masked_store";
   case ISD::VAARG:                      return "vaarg";
   case ISD::VACOPY:                     return "vacopy";
   case ISD::VAEND:                      return "vaend";
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 79109b7..5e867cf 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/GCStrategy.h"
 #include "ScheduleDAGSDNodes.h"
 #include "SelectionDAGBuilder.h"
 #include "llvm/ADT/PostOrderIterator.h"
@@ -19,10 +19,11 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/GCMetadata.h"
-#include "llvm/CodeGen/GCStrategy.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -31,6 +32,7 @@
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Function.h"
@@ -40,6 +42,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -47,7 +50,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -181,6 +183,10 @@ UseMBPI("use-mbpi",
         cl::init(true), cl::Hidden);
 
 #ifndef NDEBUG
+static cl::opt<std::string>
+FilterDAGBasicBlockName("filter-view-dags", cl::Hidden,
+                        cl::desc("Only display the basic block whose name "
+                                 "matches this for all view-*-dags options"));
 static cl::opt<bool>
 ViewDAGCombine1("view-dag-combine1-dags", cl::Hidden,
           cl::desc("Pop up a window to show dags before the first "
@@ -345,7 +351,8 @@ SelectionDAGISel::SelectionDAGISel(TargetMachine &tm,
     initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
     initializeAliasAnalysisAnalysisGroup(*PassRegistry::getPassRegistry());
     initializeBranchProbabilityInfoPass(*PassRegistry::getPassRegistry());
-    initializeTargetLibraryInfoPass(*PassRegistry::getPassRegistry());
+    initializeTargetLibraryInfoWrapperPassPass(
+        *PassRegistry::getPassRegistry());
   }
 
 SelectionDAGISel::~SelectionDAGISel() {
@@ -359,7 +366,7 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<AliasAnalysis>();
   AU.addRequired<GCModuleInfo>();
   AU.addPreserved<GCModuleInfo>();
-  AU.addRequired<TargetLibraryInfo>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
   if (UseMBPI && OptLevel != CodeGenOpt::None)
     AU.addRequired<BranchProbabilityInfo>();
   MachineFunctionPass::getAnalysisUsage(AU);
@@ -372,7 +379,7 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
 ///
 /// This is required for correctness, so it must be done at -O0.
 ///
-static void SplitCriticalSideEffectEdges(Function &Fn, Pass *SDISel) {
+static void SplitCriticalSideEffectEdges(Function &Fn, AliasAnalysis *AA) {
   // Loop for blocks with phi nodes.
   for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
     PHINode *PN = dyn_cast<PHINode>(BB->begin());
@@ -396,8 +403,9 @@ static void SplitCriticalSideEffectEdges(Function &Fn, Pass *SDISel) {
           continue;
 
         // Okay, we have to split this edge.
-        SplitCriticalEdge(Pred->getTerminator(),
-                          GetSuccessorNumber(Pred, BB), SDISel, true);
+        SplitCriticalEdge(
+            Pred->getTerminator(), GetSuccessorNumber(Pred, BB),
+            CriticalEdgeSplittingOptions(AA).setMergeIdenticalEdges());
         goto ReprocessBlock;
       }
   }
@@ -429,12 +437,12 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   TLI = MF->getSubtarget().getTargetLowering();
   RegInfo = &MF->getRegInfo();
   AA = &getAnalysis<AliasAnalysis>();
-  LibInfo = &getAnalysis<TargetLibraryInfo>();
+  LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
 
   DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
 
-  SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), this);
+  SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), AA);
 
   CurDAG->init(*MF);
   FuncInfo->set(Fn, *MF, CurDAG);
@@ -650,6 +658,12 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   std::string BlockName;
   int BlockNumber = -1;
   (void)BlockNumber;
+  bool MatchFilterBB = false; (void)MatchFilterBB;
+#ifndef NDEBUG
+  MatchFilterBB = (FilterDAGBasicBlockName.empty() ||
+                   FilterDAGBasicBlockName ==
+                       FuncInfo->MBB->getBasicBlock()->getName().str());
+#endif
 #ifdef NDEBUG
   if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewLegalizeDAGs ||
       ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs ||
@@ -663,7 +677,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   DEBUG(dbgs() << "Initial selection DAG: BB#" << BlockNumber
         << " '" << BlockName << "'\n"; CurDAG->dump());
 
-  if (ViewDAGCombine1) CurDAG->viewGraph("dag-combine1 input for " + BlockName);
+  if (ViewDAGCombine1 && MatchFilterBB)
+    CurDAG->viewGraph("dag-combine1 input for " + BlockName);
 
   // Run the DAG combiner in pre-legalize mode.
   {
@@ -676,8 +691,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
 
   // Second step, hack on the DAG until it only uses operations and types that
   // the target supports.
-  if (ViewLegalizeTypesDAGs) CurDAG->viewGraph("legalize-types input for " +
-                                               BlockName);
+  if (ViewLegalizeTypesDAGs && MatchFilterBB)
+    CurDAG->viewGraph("legalize-types input for " + BlockName);
 
   bool Changed;
   {
@@ -691,7 +706,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   CurDAG->NewNodesMustHaveLegalTypes = true;
 
   if (Changed) {
-    if (ViewDAGCombineLT)
+    if (ViewDAGCombineLT && MatchFilterBB)
       CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
 
     // Run the DAG combiner in post-type-legalize mode.
@@ -717,7 +732,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
       CurDAG->LegalizeTypes();
     }
 
-    if (ViewDAGCombineLT)
+    if (ViewDAGCombineLT && MatchFilterBB)
       CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
 
     // Run the DAG combiner in post-type-legalize mode.
@@ -731,7 +746,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
           << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump());
   }
 
-  if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName);
+  if (ViewLegalizeDAGs && MatchFilterBB)
+    CurDAG->viewGraph("legalize input for " + BlockName);
 
   {
     NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled);
@@ -741,7 +757,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   DEBUG(dbgs() << "Legalized selection DAG: BB#" << BlockNumber
         << " '" << BlockName << "'\n"; CurDAG->dump());
 
-  if (ViewDAGCombine2) CurDAG->viewGraph("dag-combine2 input for " + BlockName);
+  if (ViewDAGCombine2 && MatchFilterBB)
+    CurDAG->viewGraph("dag-combine2 input for " + BlockName);
 
   // Run the DAG combiner in post-legalize mode.
   {
@@ -755,7 +772,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   if (OptLevel != CodeGenOpt::None)
     ComputeLiveOutVRegInfo();
 
-  if (ViewISelDAGs) CurDAG->viewGraph("isel input for " + BlockName);
+  if (ViewISelDAGs && MatchFilterBB)
+    CurDAG->viewGraph("isel input for " + BlockName);
 
   // Third, instruction select all of the operations to machine code, adding the
   // code to the MachineBasicBlock.
@@ -767,7 +785,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   DEBUG(dbgs() << "Selected selection DAG: BB#" << BlockNumber
         << " '" << BlockName << "'\n"; CurDAG->dump());
 
-  if (ViewSchedDAGs) CurDAG->viewGraph("scheduler input for " + BlockName);
+  if (ViewSchedDAGs && MatchFilterBB)
+    CurDAG->viewGraph("scheduler input for " + BlockName);
 
   // Schedule machine code.
   ScheduleDAGSDNodes *Scheduler = CreateScheduler();
@@ -777,7 +796,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
     Scheduler->Run(CurDAG, FuncInfo->MBB);
   }
 
-  if (ViewSUnitDAGs) Scheduler->viewGraph();
+  if (ViewSUnitDAGs && MatchFilterBB) Scheduler->viewGraph();
 
   // Emit machine code to BB.  This can change 'BB' to the last block being
   // inserted into.
@@ -892,6 +911,8 @@ void SelectionDAGISel::DoInstructionSelection() {
 void SelectionDAGISel::PrepareEHLandingPad() {
   MachineBasicBlock *MBB = FuncInfo->MBB;
 
+  const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy());
+
   // Add a label to mark the beginning of the landing pad.  Deletion of the
   // landing pad can thus be detected via the MachineModuleInfo.
   MCSymbol *Label = MF->getMMI().addLandingPad(MBB);
@@ -903,8 +924,73 @@ void SelectionDAGISel::PrepareEHLandingPad() {
   BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
     .addSym(Label);
 
+  // If this is an MSVC-style personality function, we need to split the landing
+  // pad into several BBs.
+  const BasicBlock *LLVMBB = MBB->getBasicBlock();
+  const LandingPadInst *LPadInst = LLVMBB->getLandingPadInst();
+  MF->getMMI().addPersonality(
+      MBB, cast<Function>(LPadInst->getPersonalityFn()->stripPointerCasts()));
+  if (MF->getMMI().getPersonalityType() == EHPersonality::MSVC_Win64SEH) {
+    // Make virtual registers and a series of labels that fill in values for the
+    // clauses.
+    auto &RI = MF->getRegInfo();
+    FuncInfo->ExceptionSelectorVirtReg = RI.createVirtualRegister(PtrRC);
+
+    // Get all invoke BBs that will unwind into the clause BBs.
+    SmallVector<MachineBasicBlock *, 4> InvokeBBs(MBB->pred_begin(),
+                                                  MBB->pred_end());
+
+    // Emit separate machine basic blocks with separate labels for each clause
+    // before the main landing pad block.
+    MachineInstrBuilder SelectorPHI = BuildMI(
+        *MBB, MBB->begin(), SDB->getCurDebugLoc(), TII->get(TargetOpcode::PHI),
+        FuncInfo->ExceptionSelectorVirtReg);
+    for (unsigned I = 0, E = LPadInst->getNumClauses(); I != E; ++I) {
+      // Skip filter clauses, we can't implement them yet.
+      if (LPadInst->isFilter(I))
+        continue;
+
+      MachineBasicBlock *ClauseBB = MF->CreateMachineBasicBlock(LLVMBB);
+      MF->insert(MBB, ClauseBB);
+
+      // Add the edge from the invoke to the clause.
+      for (MachineBasicBlock *InvokeBB : InvokeBBs)
+        InvokeBB->addSuccessor(ClauseBB);
+
+      // Mark the clause as a landing pad or MI passes will delete it.
+      ClauseBB->setIsLandingPad();
+
+      GlobalValue *ClauseGV = ExtractTypeInfo(LPadInst->getClause(I));
+
+      // Start the BB with a label.
+      MCSymbol *ClauseLabel = MF->getMMI().addClauseForLandingPad(MBB);
+      BuildMI(*ClauseBB, ClauseBB->begin(), SDB->getCurDebugLoc(), II)
+          .addSym(ClauseLabel);
+
+      // Construct a simple BB that defines a register with the typeid constant.
+      FuncInfo->MBB = ClauseBB;
+      FuncInfo->InsertPt = ClauseBB->end();
+      unsigned VReg = SDB->visitLandingPadClauseBB(ClauseGV, MBB);
+      CurDAG->setRoot(SDB->getRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+
+      // Add the typeid virtual register to the phi in the main landing pad.
+      SelectorPHI.addReg(VReg).addMBB(ClauseBB);
+    }
+
+    // Remove the edge from the invoke to the lpad.
+    for (MachineBasicBlock *InvokeBB : InvokeBBs)
+      InvokeBB->removeSuccessor(MBB);
+
+    // Restore FuncInfo back to its previous state and select the main landing
+    // pad block.
+    FuncInfo->MBB = MBB;
+    FuncInfo->InsertPt = MBB->end();
+    return;
+  }
+
   // Mark exception register as live in.
-  const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy());
   if (unsigned Reg = TLI->getExceptionPointerRegister())
     FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
 
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
new file mode 100644
index 0000000..1271f6b
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -0,0 +1,679 @@
+//===-- StatepointLowering.cpp - SDAGBuilder's statepoint code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes support code use by SelectionDAGBuilder when lowering a
+// statepoint sequence in SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "StatepointLowering.h"
+#include "SelectionDAGBuilder.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "statepoint-lowering"
+
+STATISTIC(NumSlotsAllocatedForStatepoints,
+          "Number of stack slots allocated for statepoints");
+STATISTIC(NumOfStatepoints, "Number of statepoint nodes encountered");
+STATISTIC(StatepointMaxSlotsRequired,
+          "Maximum number of stack slots required for a singe statepoint");
+
+void
+StatepointLoweringState::startNewStatepoint(SelectionDAGBuilder &Builder) {
+  // Consistency check
+  assert(PendingGCRelocateCalls.empty() &&
+         "Trying to visit statepoint before finished processing previous one");
+  Locations.clear();
+  RelocLocations.clear();
+  NextSlotToAllocate = 0;
+  // Need to resize this on each safepoint - we need the two to stay in
+  // sync and the clear patterns of a SelectionDAGBuilder have no relation
+  // to FunctionLoweringInfo.
+  AllocatedStackSlots.resize(Builder.FuncInfo.StatepointStackSlots.size());
+  for (size_t i = 0; i < AllocatedStackSlots.size(); i++) {
+    AllocatedStackSlots[i] = false;
+  }
+}
+void StatepointLoweringState::clear() {
+  Locations.clear();
+  RelocLocations.clear();
+  AllocatedStackSlots.clear();
+  assert(PendingGCRelocateCalls.empty() &&
+         "cleared before statepoint sequence completed");
+}
+
+SDValue
+StatepointLoweringState::allocateStackSlot(EVT ValueType,
+                                           SelectionDAGBuilder &Builder) {
+
+  NumSlotsAllocatedForStatepoints++;
+
+  // The basic scheme here is to first look for a previously created stack slot
+  // which is not in use (accounting for the fact arbitrary slots may already
+  // be reserved), or to create a new stack slot and use it.
+
+  // If this doesn't succeed in 40000 iterations, something is seriously wrong
+  for (int i = 0; i < 40000; i++) {
+    assert(Builder.FuncInfo.StatepointStackSlots.size() ==
+               AllocatedStackSlots.size() &&
+           "broken invariant");
+    const size_t NumSlots = AllocatedStackSlots.size();
+    assert(NextSlotToAllocate <= NumSlots && "broken invariant");
+
+    if (NextSlotToAllocate >= NumSlots) {
+      assert(NextSlotToAllocate == NumSlots);
+      // record stats
+      if (NumSlots + 1 > StatepointMaxSlotsRequired) {
+        StatepointMaxSlotsRequired = NumSlots + 1;
+      }
+
+      SDValue SpillSlot = Builder.DAG.CreateStackTemporary(ValueType);
+      const unsigned FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+      Builder.FuncInfo.StatepointStackSlots.push_back(FI);
+      AllocatedStackSlots.push_back(true);
+      return SpillSlot;
+    }
+    if (!AllocatedStackSlots[NextSlotToAllocate]) {
+      const int FI = Builder.FuncInfo.StatepointStackSlots[NextSlotToAllocate];
+      AllocatedStackSlots[NextSlotToAllocate] = true;
+      return Builder.DAG.getFrameIndex(FI, ValueType);
+    }
+    // Note: We deliberately choose to advance this only on the failing path.
+    // Doing so on the suceeding path involes a bit of complexity that caused a
+    // minor bug previously.  Unless performance shows this matters, please
+    // keep this code as simple as possible.
+    NextSlotToAllocate++;
+  }
+  llvm_unreachable("infinite loop?");
+}
+
+/// Try to find existing copies of the incoming values in stack slots used for
+/// statepoint spilling.  If we can find a spill slot for the incoming value,
+/// mark that slot as allocated, and reuse the same slot for this safepoint.
+/// This helps to avoid series of loads and stores that only serve to resuffle
+/// values on the stack between calls.
+static void reservePreviousStackSlotForValue(SDValue Incoming,
+                                             SelectionDAGBuilder &Builder) {
+
+  if (isa<ConstantSDNode>(Incoming) || isa<FrameIndexSDNode>(Incoming)) {
+    // We won't need to spill this, so no need to check for previously
+    // allocated stack slots
+    return;
+  }
+
+  SDValue Loc = Builder.StatepointLowering.getLocation(Incoming);
+  if (Loc.getNode()) {
+    // duplicates in input
+    return;
+  }
+
+  // Search back for the load from a stack slot pattern to find the original
+  // slot we allocated for this value.  We could extend this to deal with
+  // simple modification patterns, but simple dealing with trivial load/store
+  // sequences helps a lot already.
+  if (LoadSDNode *Load = dyn_cast<LoadSDNode>(Incoming)) {
+    if (auto *FI = dyn_cast<FrameIndexSDNode>(Load->getBasePtr())) {
+      const int Index = FI->getIndex();
+      auto Itr = std::find(Builder.FuncInfo.StatepointStackSlots.begin(),
+                           Builder.FuncInfo.StatepointStackSlots.end(), Index);
+      if (Itr == Builder.FuncInfo.StatepointStackSlots.end()) {
+        // not one of the lowering stack slots, can't reuse!
+        // TODO: Actually, we probably could reuse the stack slot if the value
+        // hasn't changed at all, but we'd need to look for intervening writes
+        return;
+      } else {
+        // This is one of our dedicated lowering slots
+        const int Offset =
+            std::distance(Builder.FuncInfo.StatepointStackSlots.begin(), Itr);
+        if (Builder.StatepointLowering.isStackSlotAllocated(Offset)) {
+          // stack slot already assigned to someone else, can't use it!
+          // TODO: currently we reserve space for gc arguments after doing
+          // normal allocation for deopt arguments.  We should reserve for
+          // _all_ deopt and gc arguments, then start allocating.  This
+          // will prevent some moves being inserted when vm state changes,
+          // but gc state doesn't between two calls.
+          return;
+        }
+        // Reserve this stack slot
+        Builder.StatepointLowering.reserveStackSlot(Offset);
+      }
+
+      // Cache this slot so we find it when going through the normal
+      // assignment loop.
+      SDValue Loc =
+          Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
+
+      Builder.StatepointLowering.setLocation(Incoming, Loc);
+    }
+  }
+
+  // TODO: handle case where a reloaded value flows through a phi to
+  // another safepoint.  e.g.
+  // bb1:
+  //  a' = relocated...
+  // bb2: % pred: bb1, bb3, bb4, etc.
+  //  a_phi = phi(a', ...)
+  // statepoint ... a_phi
+  // NOTE: This will require reasoning about cross basic block values.  This is
+  // decidedly non trivial and this might not be the right place to do it.  We
+  // don't really have the information we need here...
+
+  // TODO: handle simple updates.  If a value is modified and the original
+  // value is no longer live, it would be nice to put the modified value in the
+  // same slot.  This allows folding of the memory accesses for some
+  // instructions types (like an increment).
+  // statepoint (i)
+  // i1 = i+1
+  // statepoint (i1)
+}
+
+/// Remove any duplicate (as SDValues) from the derived pointer pairs.  This
+/// is not required for correctness.  It's purpose is to reduce the size of
+/// StackMap section.  It has no effect on the number of spill slots required
+/// or the actual lowering.
+static void removeDuplicatesGCPtrs(SmallVectorImpl<const Value *> &Bases,
+                                   SmallVectorImpl<const Value *> &Ptrs,
+                                   SmallVectorImpl<const Value *> &Relocs,
+                                   SelectionDAGBuilder &Builder) {
+
+  // This is horribly ineffecient, but I don't care right now
+  SmallSet<SDValue, 64> Seen;
+
+  SmallVector<const Value *, 64> NewBases, NewPtrs, NewRelocs;
+  for (size_t i = 0; i < Ptrs.size(); i++) {
+    SDValue SD = Builder.getValue(Ptrs[i]);
+    // Only add non-duplicates
+    if (Seen.count(SD) == 0) {
+      NewBases.push_back(Bases[i]);
+      NewPtrs.push_back(Ptrs[i]);
+      NewRelocs.push_back(Relocs[i]);
+    }
+    Seen.insert(SD);
+  }
+  assert(Bases.size() >= NewBases.size());
+  assert(Ptrs.size() >= NewPtrs.size());
+  assert(Relocs.size() >= NewRelocs.size());
+  Bases = NewBases;
+  Ptrs = NewPtrs;
+  Relocs = NewRelocs;
+  assert(Ptrs.size() == Bases.size());
+  assert(Ptrs.size() == Relocs.size());
+}
+
+/// Extract call from statepoint, lower it and return pointer to the
+/// call node. Also update NodeMap so that getValue(statepoint) will
+/// reference lowered call result
+static SDNode *lowerCallFromStatepoint(ImmutableStatepoint StatepointSite,
+                                       SelectionDAGBuilder &Builder) {
+
+  ImmutableCallSite CS(StatepointSite.getCallSite());
+
+  // Lower the actual call itself - This is a bit of a hack, but we want to
+  // avoid modifying the actual lowering code.  This is similiar in intent to
+  // the LowerCallOperands mechanism used by PATCHPOINT, but is structured
+  // differently.  Hopefully, this is slightly more robust w.r.t. calling
+  // convention, return values, and other function attributes.
+  Value *ActualCallee = const_cast<Value *>(StatepointSite.actualCallee());
+
+  std::vector<Value *> Args;
+  CallInst::const_op_iterator arg_begin = StatepointSite.call_args_begin();
+  CallInst::const_op_iterator arg_end = StatepointSite.call_args_end();
+  Args.insert(Args.end(), arg_begin, arg_end);
+  // TODO: remove the creation of a new instruction!  We should not be
+  // modifying the IR (even temporarily) at this point.
+  CallInst *Tmp = CallInst::Create(ActualCallee, Args);
+  Tmp->setTailCall(CS.isTailCall());
+  Tmp->setCallingConv(CS.getCallingConv());
+  Tmp->setAttributes(CS.getAttributes());
+  Builder.LowerCallTo(Tmp, Builder.getValue(ActualCallee), false);
+
+  // Handle the return value of the call iff any.
+  const bool HasDef = !Tmp->getType()->isVoidTy();
+  if (HasDef) {
+    // The value of the statepoint itself will be the value of call itself.
+    // We'll replace the actually call node shortly.  gc_result will grab
+    // this value.
+    Builder.setValue(CS.getInstruction(), Builder.getValue(Tmp));
+  } else {
+    // The token value is never used from here on, just generate a poison value
+    Builder.setValue(CS.getInstruction(), Builder.DAG.getIntPtrConstant(-1));
+  }
+  // Remove the fake entry we created so we don't have a hanging reference
+  // after we delete this node.
+  Builder.removeValue(Tmp);
+  delete Tmp;
+  Tmp = nullptr;
+
+  // Search for the call node
+  // The following code is essentially reverse engineering X86's
+  // LowerCallTo.
+  SDNode *CallNode = nullptr;
+
+  // We just emitted a call, so it should be last thing generated
+  SDValue Chain = Builder.DAG.getRoot();
+
+  // Find closest CALLSEQ_END walking back through lowered nodes if needed
+  SDNode *CallEnd = Chain.getNode();
+  int Sanity = 0;
+  while (CallEnd->getOpcode() != ISD::CALLSEQ_END) {
+    CallEnd = CallEnd->getGluedNode();
+    assert(CallEnd && "Can not find call node");
+    assert(Sanity < 20 && "should have found call end already");
+    Sanity++;
+  }
+  assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
+         "Expected a callseq node.");
+  assert(CallEnd->getGluedNode());
+
+  // Step back inside the CALLSEQ
+  CallNode = CallEnd->getGluedNode();
+  return CallNode;
+}
+
+/// Callect all gc pointers coming into statepoint intrinsic, clean them up,
+/// and return two arrays:
+///   Bases - base pointers incoming to this statepoint
+///   Ptrs - derived pointers incoming to this statepoint
+///   Relocs - the gc_relocate corresponding to each base/ptr pair
+/// Elements of this arrays should be in one-to-one correspondence with each
+/// other i.e Bases[i], Ptrs[i] are from the same gcrelocate call
+static void
+getIncomingStatepointGCValues(SmallVectorImpl<const Value *> &Bases,
+                              SmallVectorImpl<const Value *> &Ptrs,
+                              SmallVectorImpl<const Value *> &Relocs,
+                              ImmutableStatepoint StatepointSite,
+                              SelectionDAGBuilder &Builder) {
+  for (GCRelocateOperands relocateOpers :
+         StatepointSite.getRelocates(StatepointSite)) {
+    Relocs.push_back(relocateOpers.getUnderlyingCallSite().getInstruction());
+    Bases.push_back(relocateOpers.basePtr());
+    Ptrs.push_back(relocateOpers.derivedPtr());
+  }
+
+  // Remove any redundant llvm::Values which map to the same SDValue as another
+  // input.  Also has the effect of removing duplicates in the original
+  // llvm::Value input list as well.  This is a useful optimization for
+  // reducing the size of the StackMap section.  It has no other impact.
+  removeDuplicatesGCPtrs(Bases, Ptrs, Relocs, Builder);
+
+  assert(Bases.size() == Ptrs.size() && Ptrs.size() == Relocs.size());
+}
+
+/// Spill a value incoming to the statepoint. It might be either part of
+/// vmstate
+/// or gcstate. In both cases unconditionally spill it on the stack unless it
+/// is a null constant. Return pair with first element being frame index
+/// containing saved value and second element with outgoing chain from the
+/// emitted store
+static std::pair<SDValue, SDValue>
+spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
+                             SelectionDAGBuilder &Builder) {
+  SDValue Loc = Builder.StatepointLowering.getLocation(Incoming);
+
+  // Emit new store if we didn't do it for this ptr before
+  if (!Loc.getNode()) {
+    Loc = Builder.StatepointLowering.allocateStackSlot(Incoming.getValueType(),
+                                                       Builder);
+    assert(isa<FrameIndexSDNode>(Loc));
+    int Index = cast<FrameIndexSDNode>(Loc)->getIndex();
+    // We use TargetFrameIndex so that isel will not select it into LEA
+    Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
+
+    // TODO: We can create TokenFactor node instead of
+    //       chaining stores one after another, this may allow
+    //       a bit more optimal scheduling for them
+    Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc,
+                                 MachinePointerInfo::getFixedStack(Index),
+                                 false, false, 0);
+
+    Builder.StatepointLowering.setLocation(Incoming, Loc);
+  }
+
+  assert(Loc.getNode());
+  return std::make_pair(Loc, Chain);
+}
+
+/// Lower a single value incoming to a statepoint node.  This value can be
+/// either a deopt value or a gc value, the handling is the same.  We special
+/// case constants and allocas, then fall back to spilling if required.
+static void lowerIncomingStatepointValue(SDValue Incoming,
+                                         SmallVectorImpl<SDValue> &Ops,
+                                         SelectionDAGBuilder &Builder) {
+  SDValue Chain = Builder.getRoot();
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Incoming)) {
+    // If the original value was a constant, make sure it gets recorded as
+    // such in the stackmap.  This is required so that the consumer can
+    // parse any internal format to the deopt state.  It also handles null
+    // pointers and other constant pointers in GC states
+    Ops.push_back(
+        Builder.DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
+    Ops.push_back(Builder.DAG.getTargetConstant(C->getSExtValue(), MVT::i64));
+  } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) {
+    // This handles allocas as arguments to the statepoint
+    const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo();
+    Ops.push_back(
+        Builder.DAG.getTargetFrameIndex(FI->getIndex(), TLI.getPointerTy()));
+  } else {
+    // Otherwise, locate a spill slot and explicitly spill it so it
+    // can be found by the runtime later.  We currently do not support
+    // tracking values through callee saved registers to their eventual
+    // spill location.  This would be a useful optimization, but would
+    // need to be optional since it requires a lot of complexity on the
+    // runtime side which not all would support.
+    std::pair<SDValue, SDValue> Res =
+        spillIncomingStatepointValue(Incoming, Chain, Builder);
+    Ops.push_back(Res.first);
+    Chain = Res.second;
+  }
+
+  Builder.DAG.setRoot(Chain);
+}
+
+/// Lower deopt state and gc pointer arguments of the statepoint.  The actual
+/// lowering is described in lowerIncomingStatepointValue.  This function is
+/// responsible for lowering everything in the right position and playing some
+/// tricks to avoid redundant stack manipulation where possible.  On
+/// completion, 'Ops' will contain ready to use operands for machine code
+/// statepoint. The chain nodes will have already been created and the DAG root
+/// will be set to the last value spilled (if any were).
+static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
+                                    ImmutableStatepoint StatepointSite,
+                                    SelectionDAGBuilder &Builder) {
+
+  // Lower the deopt and gc arguments for this statepoint.  Layout will
+  // be: deopt argument length, deopt arguments.., gc arguments...
+
+  SmallVector<const Value *, 64> Bases, Ptrs, Relocations;
+  getIncomingStatepointGCValues(Bases, Ptrs, Relocations,
+                                StatepointSite, Builder);
+
+#ifndef NDEBUG
+  // Check that each of the gc pointer and bases we've gotten out of the
+  // safepoint is something the strategy thinks might be a pointer into the GC
+  // heap.  This is basically just here to help catch errors during statepoint
+  // insertion. TODO: This should actually be in the Verifier, but we can't get
+  // to the GCStrategy from there (yet).
+  if (Builder.GFI) {
+    GCStrategy &S = Builder.GFI->getStrategy();
+    for (const Value *V : Bases) {
+      auto Opt = S.isGCManagedPointer(V);
+      if (Opt.hasValue()) {
+        assert(Opt.getValue() &&
+               "non gc managed base pointer found in statepoint");
+      }
+    }
+    for (const Value *V : Ptrs) {
+      auto Opt = S.isGCManagedPointer(V);
+      if (Opt.hasValue()) {
+        assert(Opt.getValue() &&
+               "non gc managed derived pointer found in statepoint");
+      }
+    }
+    for (const Value *V : Relocations) {
+      auto Opt = S.isGCManagedPointer(V);
+      if (Opt.hasValue()) {
+        assert(Opt.getValue() && "non gc managed pointer relocated");
+      }
+    }
+  }
+#endif
+
+
+
+  // Before we actually start lowering (and allocating spill slots for values),
+  // reserve any stack slots which we judge to be profitable to reuse for a
+  // particular value.  This is purely an optimization over the code below and
+  // doesn't change semantics at all.  It is important for performance that we
+  // reserve slots for both deopt and gc values before lowering either.
+  for (auto I = StatepointSite.vm_state_begin() + 1,
+            E = StatepointSite.vm_state_end();
+       I != E; ++I) {
+    Value *V = *I;
+    SDValue Incoming = Builder.getValue(V);
+    reservePreviousStackSlotForValue(Incoming, Builder);
+  }
+  for (unsigned i = 0; i < Bases.size() * 2; ++i) {
+    // Even elements will contain base, odd elements - derived ptr
+    const Value *V = i % 2 ? Bases[i / 2] : Ptrs[i / 2];
+    SDValue Incoming = Builder.getValue(V);
+    reservePreviousStackSlotForValue(Incoming, Builder);
+  }
+
+  // First, prefix the list with the number of unique values to be
+  // lowered.  Note that this is the number of *Values* not the
+  // number of SDValues required to lower them.
+  const int NumVMSArgs = StatepointSite.numTotalVMSArgs();
+  Ops.push_back(
+      Builder.DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
+  Ops.push_back(Builder.DAG.getTargetConstant(NumVMSArgs, MVT::i64));
+
+  assert(NumVMSArgs + 1 == std::distance(StatepointSite.vm_state_begin(),
+                                         StatepointSite.vm_state_end()));
+
+  // The vm state arguments are lowered in an opaque manner.  We do
+  // not know what type of values are contained within.  We skip the
+  // first one since that happens to be the total number we lowered
+  // explicitly just above.  We could have left it in the loop and
+  // not done it explicitly, but it's far easier to understand this
+  // way.
+  for (auto I = StatepointSite.vm_state_begin() + 1,
+            E = StatepointSite.vm_state_end();
+       I != E; ++I) {
+    const Value *V = *I;
+    SDValue Incoming = Builder.getValue(V);
+    lowerIncomingStatepointValue(Incoming, Ops, Builder);
+  }
+
+  // Finally, go ahead and lower all the gc arguments.  There's no prefixed
+  // length for this one.  After lowering, we'll have the base and pointer
+  // arrays interwoven with each (lowered) base pointer immediately followed by
+  // it's (lowered) derived pointer.  i.e
+  // (base[0], ptr[0], base[1], ptr[1], ...)
+  for (unsigned i = 0; i < Bases.size() * 2; ++i) {
+    // Even elements will contain base, odd elements - derived ptr
+    const Value *V = i % 2 ? Bases[i / 2] : Ptrs[i / 2];
+    SDValue Incoming = Builder.getValue(V);
+    lowerIncomingStatepointValue(Incoming, Ops, Builder);
+  }
+}
+
+void SelectionDAGBuilder::visitStatepoint(const CallInst &CI) {
+  // Check some preconditions for sanity
+  assert(isStatepoint(&CI) &&
+         "function called must be the statepoint function");
+
+  LowerStatepoint(ImmutableStatepoint(&CI));
+}
+
+void SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP) {
+  // The basic scheme here is that information about both the original call and
+  // the safepoint is encoded in the CallInst.  We create a temporary call and
+  // lower it, then reverse engineer the calling sequence.
+
+  NumOfStatepoints++;
+  // Clear state
+  StatepointLowering.startNewStatepoint(*this);
+
+  ImmutableCallSite CS(ISP.getCallSite());
+
+#ifndef NDEBUG
+  // Consistency check
+  for (const User *U : CS->users()) {
+    const CallInst *Call = cast<CallInst>(U);
+    if (isGCRelocate(Call))
+      StatepointLowering.scheduleRelocCall(*Call);
+  }
+#endif
+
+#ifndef NDEBUG
+  // If this is a malformed statepoint, report it early to simplify debugging.
+  // This should catch any IR level mistake that's made when constructing or
+  // transforming statepoints.
+  ISP.verify();
+
+  // Check that the associated GCStrategy expects to encounter statepoints.
+  // TODO: This if should become an assert.  For now, we allow the GCStrategy
+  // to be optional for backwards compatibility.  This will only last a short
+  // period (i.e. a couple of weeks).
+  if (GFI) {
+    assert(GFI->getStrategy().useStatepoints() &&
+           "GCStrategy does not expect to encounter statepoints");
+  }
+#endif
+
+
+  // Lower statepoint vmstate and gcstate arguments
+  SmallVector<SDValue, 10> LoweredArgs;
+  lowerStatepointMetaArgs(LoweredArgs, ISP, *this);
+
+  // Get call node, we will replace it later with statepoint
+  SDNode *CallNode = lowerCallFromStatepoint(ISP, *this);
+
+  // Construct the actual STATEPOINT node with all the appropriate arguments
+  // and return values.
+
+  // TODO: Currently, all of these operands are being marked as read/write in
+  // PrologEpilougeInserter.cpp, we should special case the VMState arguments
+  // and flags to be read-only.
+  SmallVector<SDValue, 40> Ops;
+
+  // Calculate and push starting position of vmstate arguments
+  // Call Node: Chain, Target, {Args}, RegMask, [Glue]
+  SDValue Glue;
+  if (CallNode->getGluedNode()) {
+    // Glue is always last operand
+    Glue = CallNode->getOperand(CallNode->getNumOperands() - 1);
+  }
+  // Get number of arguments incoming directly into call node
+  unsigned NumCallRegArgs =
+      CallNode->getNumOperands() - (Glue.getNode() ? 4 : 3);
+  Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, MVT::i32));
+
+  // Add call target
+  SDValue CallTarget = SDValue(CallNode->getOperand(1).getNode(), 0);
+  Ops.push_back(CallTarget);
+
+  // Add call arguments
+  // Get position of register mask in the call
+  SDNode::op_iterator RegMaskIt;
+  if (Glue.getNode())
+    RegMaskIt = CallNode->op_end() - 2;
+  else
+    RegMaskIt = CallNode->op_end() - 1;
+  Ops.insert(Ops.end(), CallNode->op_begin() + 2, RegMaskIt);
+
+  // Add a leading constant argument with the Flags and the calling convention
+  // masked together
+  CallingConv::ID CallConv = CS.getCallingConv();
+  int Flags = dyn_cast<ConstantInt>(CS.getArgument(2))->getZExtValue();
+  assert(Flags == 0 && "not expected to be used");
+  Ops.push_back(DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
+  Ops.push_back(
+      DAG.getTargetConstant(Flags | ((unsigned)CallConv << 1), MVT::i64));
+
+  // Insert all vmstate and gcstate arguments
+  Ops.insert(Ops.end(), LoweredArgs.begin(), LoweredArgs.end());
+
+  // Add register mask from call node
+  Ops.push_back(*RegMaskIt);
+
+  // Add chain
+  Ops.push_back(CallNode->getOperand(0));
+
+  // Same for the glue, but we add it only if original call had it
+  if (Glue.getNode())
+    Ops.push_back(Glue);
+
+  // Compute return values.  Provide a glue output since we consume one as
+  // input.  This allows someone else to chain off us as needed.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  SDNode *StatepointMCNode = DAG.getMachineNode(TargetOpcode::STATEPOINT,
+                                                getCurSDLoc(), NodeTys, Ops);
+
+  // Replace original call
+  DAG.ReplaceAllUsesWith(CallNode, StatepointMCNode); // This may update Root
+  // Remove originall call node
+  DAG.DeleteNode(CallNode);
+
+  // DON'T set the root - under the assumption that it's already set past the
+  // inserted node we created.
+
+  // TODO: A better future implementation would be to emit a single variable
+  // argument, variable return value STATEPOINT node here and then hookup the
+  // return value of each gc.relocate to the respective output of the
+  // previously emitted STATEPOINT value.  Unfortunately, this doesn't appear
+  // to actually be possible today.
+}
+
+void SelectionDAGBuilder::visitGCResult(const CallInst &CI) {
+  // The result value of the gc_result is simply the result of the actual
+  // call.  We've already emitted this, so just grab the value.
+  Instruction *I = cast<Instruction>(CI.getArgOperand(0));
+  assert(isStatepoint(I) &&
+         "first argument must be a statepoint token");
+
+  setValue(&CI, getValue(I));
+}
+
+void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) {
+#ifndef NDEBUG
+  // Consistency check
+  StatepointLowering.relocCallVisited(CI);
+#endif
+
+  GCRelocateOperands relocateOpers(&CI);
+  SDValue SD = getValue(relocateOpers.derivedPtr());
+
+  if (isa<ConstantSDNode>(SD) || isa<FrameIndexSDNode>(SD)) {
+    // We didn't need to spill these special cases (constants and allocas).
+    // See the handling in spillIncomingValueForStatepoint for detail.
+    setValue(&CI, SD);
+    return;
+  }
+
+  SDValue Loc = StatepointLowering.getRelocLocation(SD);
+  // Emit new load if we did not emit it before
+  if (!Loc.getNode()) {
+    SDValue SpillSlot = StatepointLowering.getLocation(SD);
+    int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+
+    // Be conservative: flush all pending loads
+    // TODO: Probably we can be less restrictive on this,
+    // it may allow more scheduling opprtunities
+    SDValue Chain = getRoot();
+
+    Loc = DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain,
+                      SpillSlot, MachinePointerInfo::getFixedStack(FI), false,
+                      false, false, 0);
+
+    StatepointLowering.setRelocLocation(SD, Loc);
+
+    // Again, be conservative, don't emit pending loads
+    DAG.setRoot(Loc.getValue(1));
+  }
+
+  assert(Loc.getNode());
+  setValue(&CI, Loc);
+}
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.h b/lib/CodeGen/SelectionDAG/StatepointLowering.h
new file mode 100644
index 0000000..673112c
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.h
@@ -0,0 +1,138 @@
+//===-- StatepointLowering.h - SDAGBuilder's statepoint code -*- C++ -*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes support code use by SelectionDAGBuilder when lowering a
+// statepoint sequence in SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include <vector>
+
+namespace llvm {
+class SelectionDAGBuilder;
+
+/// This class tracks both per-statepoint and per-selectiondag information.
+/// For each statepoint it tracks locations of it's gc valuess (incoming and
+/// relocated) and list of gcreloc calls scheduled for visiting (this is
+/// used for a debug mode consistency check only).  The spill slot tracking
+/// works in concert with information in FunctionLoweringInfo.
+class StatepointLoweringState {
+public:
+  StatepointLoweringState() : NextSlotToAllocate(0) {
+  }
+
+  /// Reset all state tracking for a newly encountered safepoint.  Also
+  /// performs some consistency checking.
+  void startNewStatepoint(SelectionDAGBuilder &Builder);
+
+  /// Clear the memory usage of this object.  This is called from
+  /// SelectionDAGBuilder::clear.  We require this is never called in the
+  /// midst of processing a statepoint sequence.
+  void clear();
+
+  /// Returns the spill location of a value incoming to the current
+  /// statepoint.  Will return SDValue() if this value hasn't been
+  /// spilled.  Otherwise, the value has already been spilled and no
+  /// further action is required by the caller.
+  SDValue getLocation(SDValue val) {
+    if (!Locations.count(val))
+      return SDValue();
+    return Locations[val];
+  }
+  void setLocation(SDValue val, SDValue Location) {
+    assert(!Locations.count(val) &&
+           "Trying to allocate already allocated location");
+    Locations[val] = Location;
+  }
+
+  /// Returns the relocated value for a given input pointer. Will
+  /// return SDValue() if this value hasn't yet been reloaded from
+  /// it's stack slot after the statepoint.  Otherwise, the value
+  /// has already been reloaded and the SDValue of that reload will
+  /// be returned. Note that VMState values are spilled but not
+  /// reloaded (since they don't change at the safepoint unless
+  /// also listed in the GC pointer section) and will thus never
+  /// be in this map
+  SDValue getRelocLocation(SDValue val) {
+    if (!RelocLocations.count(val))
+      return SDValue();
+    return RelocLocations[val];
+  }
+  void setRelocLocation(SDValue val, SDValue Location) {
+    assert(!RelocLocations.count(val) &&
+           "Trying to allocate already allocated location");
+    RelocLocations[val] = Location;
+  }
+
+  /// Record the fact that we expect to encounter a given gc_relocate
+  /// before the next statepoint.  If we don't see it, we'll report
+  /// an assertion.
+  void scheduleRelocCall(const CallInst &RelocCall) {
+    PendingGCRelocateCalls.push_back(&RelocCall);
+  }
+  /// Remove this gc_relocate from the list we're expecting to see
+  /// before the next statepoint.  If we weren't expecting to see
+  /// it, we'll report an assertion.
+  void relocCallVisited(const CallInst &RelocCall) {
+    SmallVectorImpl<const CallInst *>::iterator itr =
+        std::find(PendingGCRelocateCalls.begin(), PendingGCRelocateCalls.end(),
+                  &RelocCall);
+    assert(itr != PendingGCRelocateCalls.end() &&
+           "Visited unexpected gcrelocate call");
+    PendingGCRelocateCalls.erase(itr);
+  }
+
+  // TODO: Should add consistency tracking to ensure we encounter
+  // expected gc_result calls too.
+
+  /// Get a stack slot we can use to store an value of type ValueType.  This
+  /// will hopefully be a recylced slot from another statepoint.
+  SDValue allocateStackSlot(EVT ValueType, SelectionDAGBuilder &Builder);
+
+  void reserveStackSlot(int Offset) {
+    assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() &&
+           "out of bounds");
+    assert(!AllocatedStackSlots[Offset] && "already reserved!");
+    assert(NextSlotToAllocate <= (unsigned)Offset && "consistency!");
+    AllocatedStackSlots[Offset] = true;
+  }
+  bool isStackSlotAllocated(int Offset) {
+    assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() &&
+           "out of bounds");
+    return AllocatedStackSlots[Offset];
+  }
+
+private:
+  /// Maps pre-relocation value (gc pointer directly incoming into statepoint)
+  /// into it's location (currently only stack slots)
+  DenseMap<SDValue, SDValue> Locations;
+  /// Map pre-relocated value into it's new relocated location
+  DenseMap<SDValue, SDValue> RelocLocations;
+
+  /// A boolean indicator for each slot listed in the FunctionInfo as to
+  /// whether it has been used in the current statepoint.  Since we try to
+  /// preserve stack slots across safepoints, there can be gaps in which
+  /// slots have been allocated.
+  SmallVector<bool, 50> AllocatedStackSlots;
+
+  /// Points just beyond the last slot known to have been allocated
+  unsigned NextSlotToAllocate;
+
+  /// Keep track of pending gcrelocate calls for consistency check
+  SmallVector<const CallInst *, 10> PendingGCRelocateCalls;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9aef5ed..0a3c926 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -793,19 +793,26 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
 
     APInt MsbMask = APInt::getHighBitsSet(BitWidth, 1);
     // If we only care about the highest bit, don't bother shifting right.
-    if (MsbMask == DemandedMask) {
+    if (MsbMask == NewMask) {
       unsigned ShAmt = ExVT.getScalarType().getSizeInBits();
       SDValue InOp = Op.getOperand(0);
-
-      // Compute the correct shift amount type, which must be getShiftAmountTy
-      // for scalar types after legalization.
-      EVT ShiftAmtTy = Op.getValueType();
-      if (TLO.LegalTypes() && !ShiftAmtTy.isVector())
-        ShiftAmtTy = getShiftAmountTy(ShiftAmtTy);
-
-      SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ShAmt, ShiftAmtTy);
-      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
-                                            Op.getValueType(), InOp, ShiftAmt));
+      unsigned VTBits = Op->getValueType(0).getScalarType().getSizeInBits();
+      bool AlreadySignExtended =
+        TLO.DAG.ComputeNumSignBits(InOp) >= VTBits-ShAmt+1;
+      // However if the input is already sign extended we expect the sign
+      // extension to be dropped altogether later and do not simplify.
+      if (!AlreadySignExtended) {
+        // Compute the correct shift amount type, which must be getShiftAmountTy
+        // for scalar types after legalization.
+        EVT ShiftAmtTy = Op.getValueType();
+        if (TLO.LegalTypes() && !ShiftAmtTy.isVector())
+          ShiftAmtTy = getShiftAmountTy(ShiftAmtTy);
+
+        SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ShAmt, ShiftAmtTy);
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
+                                                 Op.getValueType(), InOp,
+                                                 ShiftAmt));
+      }
     }
 
     // Sign extension.  Compute the demanded bits in the result that are not
@@ -1283,36 +1290,53 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
     }
 
     // (zext x) == C --> x == (trunc C)
-    if (DCI.isBeforeLegalize() && N0->hasOneUse() &&
-        (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+    // (sext x) == C --> x == (trunc C)
+    if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+        DCI.isBeforeLegalize() && N0->hasOneUse()) {
       unsigned MinBits = N0.getValueSizeInBits();
-      SDValue PreZExt;
+      SDValue PreExt;
+      bool Signed = false;
       if (N0->getOpcode() == ISD::ZERO_EXTEND) {
         // ZExt
         MinBits = N0->getOperand(0).getValueSizeInBits();
-        PreZExt = N0->getOperand(0);
+        PreExt = N0->getOperand(0);
       } else if (N0->getOpcode() == ISD::AND) {
         // DAGCombine turns costly ZExts into ANDs
         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0->getOperand(1)))
           if ((C->getAPIntValue()+1).isPowerOf2()) {
             MinBits = C->getAPIntValue().countTrailingOnes();
-            PreZExt = N0->getOperand(0);
+            PreExt = N0->getOperand(0);
           }
+      } else if (N0->getOpcode() == ISD::SIGN_EXTEND) {
+        // SExt
+        MinBits = N0->getOperand(0).getValueSizeInBits();
+        PreExt = N0->getOperand(0);
+        Signed = true;
       } else if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(N0)) {
-        // ZEXTLOAD
+        // ZEXTLOAD / SEXTLOAD
         if (LN0->getExtensionType() == ISD::ZEXTLOAD) {
           MinBits = LN0->getMemoryVT().getSizeInBits();
-          PreZExt = N0;
+          PreExt = N0;
+        } else if (LN0->getExtensionType() == ISD::SEXTLOAD) {
+          Signed = true;
+          MinBits = LN0->getMemoryVT().getSizeInBits();
+          PreExt = N0;
         }
       }
 
+      // Figure out how many bits we need to preserve this constant.
+      unsigned ReqdBits = Signed ?
+        C1.getBitWidth() - C1.getNumSignBits() + 1 :
+        C1.getActiveBits();
+
       // Make sure we're not losing bits from the constant.
       if (MinBits > 0 &&
-          MinBits < C1.getBitWidth() && MinBits >= C1.getActiveBits()) {
+          MinBits < C1.getBitWidth() &&
+          MinBits >= ReqdBits) {
         EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits);
         if (isTypeDesirableForOp(ISD::SETCC, MinVT)) {
           // Will get folded away.
-          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreZExt);
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreExt);
           SDValue C = DAG.getConstant(C1.trunc(MinBits), MinVT);
           return DAG.getSetCC(dl, VT, Trunc, C, Cond);
         }
@@ -2163,9 +2187,10 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   }
 }
 
-std::pair<unsigned, const TargetRegisterClass*> TargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint,
-                             MVT VT) const {
+std::pair<unsigned, const TargetRegisterClass *>
+TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
+                                             const std::string &Constraint,
+                                             MVT VT) const {
   if (Constraint.empty() || Constraint[0] != '{')
     return std::make_pair(0u, static_cast<TargetRegisterClass*>(nullptr));
   assert(*(Constraint.end()-1) == '}' && "Not a brace enclosed constraint?");
@@ -2177,8 +2202,6 @@ getRegForInlineAsmConstraint(const std::string &Constraint,
     std::make_pair(0u, static_cast<const TargetRegisterClass*>(nullptr));
 
   // Figure out which register class contains this reg.
-  const TargetRegisterInfo *RI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(),
        E = RI->regclass_end(); RCI != E; ++RCI) {
     const TargetRegisterClass *RC = *RCI;
@@ -2231,8 +2254,9 @@ unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const {
 /// and also tie in the associated operand values.
 /// If this returns an empty vector, and if the constraint string itself
 /// isn't empty, there was an error parsing.
-TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
-    ImmutableCallSite CS) const {
+TargetLowering::AsmOperandInfoVector
+TargetLowering::ParseConstraints(const TargetRegisterInfo *TRI,
+                                 ImmutableCallSite CS) const {
   /// ConstraintOperands - Information about all of the constraints.
   AsmOperandInfoVector ConstraintOperands;
   const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
@@ -2323,7 +2347,7 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
   }
 
   // If we have multiple alternative constraints, select the best alternative.
-  if (ConstraintOperands.size()) {
+  if (!ConstraintOperands.empty()) {
     if (maCount) {
       unsigned bestMAIndex = 0;
       int bestWeight = -1;
@@ -2394,12 +2418,12 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
       AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
 
       if (OpInfo.ConstraintVT != Input.ConstraintVT) {
-        std::pair<unsigned, const TargetRegisterClass*> MatchRC =
-          getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
-                                       OpInfo.ConstraintVT);
-        std::pair<unsigned, const TargetRegisterClass*> InputRC =
-          getRegForInlineAsmConstraint(Input.ConstraintCode,
-                                       Input.ConstraintVT);
+        std::pair<unsigned, const TargetRegisterClass *> MatchRC =
+            getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
+                                         OpInfo.ConstraintVT);
+        std::pair<unsigned, const TargetRegisterClass *> InputRC =
+            getRegForInlineAsmConstraint(TRI, Input.ConstraintCode,
+                                         Input.ConstraintVT);
         if ((OpInfo.ConstraintVT.isInteger() !=
              Input.ConstraintVT.isInteger()) ||
             (MatchRC.second != InputRC.second)) {
diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp
index 0be00f0..b12e943 100644
--- a/lib/CodeGen/ShadowStackGC.cpp
+++ b/lib/CodeGen/ShadowStackGC.cpp
@@ -38,416 +38,18 @@ using namespace llvm;
 #define DEBUG_TYPE "shadowstackgc"
 
 namespace {
-
-  class ShadowStackGC : public GCStrategy {
-    /// RootChain - This is the global linked-list that contains the chain of GC
-    /// roots.
-    GlobalVariable *Head;
-
-    /// StackEntryTy - Abstract type of a link in the shadow stack.
-    ///
-    StructType *StackEntryTy;
-    StructType *FrameMapTy;
-
-    /// Roots - GC roots in the current function. Each is a pair of the
-    /// intrinsic call and its corresponding alloca.
-    std::vector<std::pair<CallInst*,AllocaInst*> > Roots;
-
-  public:
-    ShadowStackGC();
-
-    bool initializeCustomLowering(Module &M) override;
-    bool performCustomLowering(Function &F) override;
-
-  private:
-    bool IsNullValue(Value *V);
-    Constant *GetFrameMap(Function &F);
-    Type* GetConcreteStackEntryType(Function &F);
-    void CollectRoots(Function &F);
-    static GetElementPtrInst *CreateGEP(LLVMContext &Context, 
-                                        IRBuilder<> &B, Value *BasePtr,
-                                        int Idx1, const char *Name);
-    static GetElementPtrInst *CreateGEP(LLVMContext &Context,
-                                        IRBuilder<> &B, Value *BasePtr,
-                                        int Idx1, int Idx2, const char *Name);
-  };
-
+class ShadowStackGC : public GCStrategy {
+public:
+  ShadowStackGC();
+};
 }
 
 static GCRegistry::Add<ShadowStackGC>
-X("shadow-stack", "Very portable GC for uncooperative code generators");
-
-namespace {
-  /// EscapeEnumerator - This is a little algorithm to find all escape points
-  /// from a function so that "finally"-style code can be inserted. In addition
-  /// to finding the existing return and unwind instructions, it also (if
-  /// necessary) transforms any call instructions into invokes and sends them to
-  /// a landing pad.
-  ///
-  /// It's wrapped up in a state machine using the same transform C# uses for
-  /// 'yield return' enumerators, This transform allows it to be non-allocating.
-  class EscapeEnumerator {
-    Function &F;
-    const char *CleanupBBName;
-
-    // State.
-    int State;
-    Function::iterator StateBB, StateE;
-    IRBuilder<> Builder;
-
-  public:
-    EscapeEnumerator(Function &F, const char *N = "cleanup")
-      : F(F), CleanupBBName(N), State(0), Builder(F.getContext()) {}
-
-    IRBuilder<> *Next() {
-      switch (State) {
-      default:
-        return nullptr;
-
-      case 0:
-        StateBB = F.begin();
-        StateE = F.end();
-        State = 1;
-
-      case 1:
-        // Find all 'return', 'resume', and 'unwind' instructions.
-        while (StateBB != StateE) {
-          BasicBlock *CurBB = StateBB++;
-
-          // Branches and invokes do not escape, only unwind, resume, and return
-          // do.
-          TerminatorInst *TI = CurBB->getTerminator();
-          if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
-            continue;
-
-          Builder.SetInsertPoint(TI->getParent(), TI);
-          return &Builder;
-        }
-
-        State = 2;
-
-        // Find all 'call' instructions.
-        SmallVector<Instruction*,16> Calls;
-        for (Function::iterator BB = F.begin(),
-                                E = F.end(); BB != E; ++BB)
-          for (BasicBlock::iterator II = BB->begin(),
-                                    EE = BB->end(); II != EE; ++II)
-            if (CallInst *CI = dyn_cast<CallInst>(II))
-              if (!CI->getCalledFunction() ||
-                  !CI->getCalledFunction()->getIntrinsicID())
-                Calls.push_back(CI);
-
-        if (Calls.empty())
-          return nullptr;
+    X("shadow-stack", "Very portable GC for uncooperative code generators");
 
-        // Create a cleanup block.
-        LLVMContext &C = F.getContext();
-        BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
-        Type *ExnTy = StructType::get(Type::getInt8PtrTy(C),
-                                      Type::getInt32Ty(C), nullptr);
-        Constant *PersFn =
-          F.getParent()->
-          getOrInsertFunction("__gcc_personality_v0",
-                              FunctionType::get(Type::getInt32Ty(C), true));
-        LandingPadInst *LPad = LandingPadInst::Create(ExnTy, PersFn, 1,
-                                                      "cleanup.lpad",
-                                                      CleanupBB);
-        LPad->setCleanup(true);
-        ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
+void llvm::linkShadowStackGC() {}
 
-        // Transform the 'call' instructions into 'invoke's branching to the
-        // cleanup block. Go in reverse order to make prettier BB names.
-        SmallVector<Value*,16> Args;
-        for (unsigned I = Calls.size(); I != 0; ) {
-          CallInst *CI = cast<CallInst>(Calls[--I]);
-
-          // Split the basic block containing the function call.
-          BasicBlock *CallBB = CI->getParent();
-          BasicBlock *NewBB =
-            CallBB->splitBasicBlock(CI, CallBB->getName() + ".cont");
-
-          // Remove the unconditional branch inserted at the end of CallBB.
-          CallBB->getInstList().pop_back();
-          NewBB->getInstList().remove(CI);
-
-          // Create a new invoke instruction.
-          Args.clear();
-          CallSite CS(CI);
-          Args.append(CS.arg_begin(), CS.arg_end());
-
-          InvokeInst *II = InvokeInst::Create(CI->getCalledValue(),
-                                              NewBB, CleanupBB,
-                                              Args, CI->getName(), CallBB);
-          II->setCallingConv(CI->getCallingConv());
-          II->setAttributes(CI->getAttributes());
-          CI->replaceAllUsesWith(II);
-          delete CI;
-        }
-
-        Builder.SetInsertPoint(RI->getParent(), RI);
-        return &Builder;
-      }
-    }
-  };
-}
-
-// -----------------------------------------------------------------------------
-
-void llvm::linkShadowStackGC() { }
-
-ShadowStackGC::ShadowStackGC() : Head(nullptr), StackEntryTy(nullptr) {
+ShadowStackGC::ShadowStackGC() {
   InitRoots = true;
   CustomRoots = true;
 }
-
-Constant *ShadowStackGC::GetFrameMap(Function &F) {
-  // doInitialization creates the abstract type of this value.
-  Type *VoidPtr = Type::getInt8PtrTy(F.getContext());
-
-  // Truncate the ShadowStackDescriptor if some metadata is null.
-  unsigned NumMeta = 0;
-  SmallVector<Constant*, 16> Metadata;
-  for (unsigned I = 0; I != Roots.size(); ++I) {
-    Constant *C = cast<Constant>(Roots[I].first->getArgOperand(1));
-    if (!C->isNullValue())
-      NumMeta = I + 1;
-    Metadata.push_back(ConstantExpr::getBitCast(C, VoidPtr));
-  }
-  Metadata.resize(NumMeta);
-
-  Type *Int32Ty = Type::getInt32Ty(F.getContext());
-  
-  Constant *BaseElts[] = {
-    ConstantInt::get(Int32Ty, Roots.size(), false),
-    ConstantInt::get(Int32Ty, NumMeta, false),
-  };
-
-  Constant *DescriptorElts[] = {
-    ConstantStruct::get(FrameMapTy, BaseElts),
-    ConstantArray::get(ArrayType::get(VoidPtr, NumMeta), Metadata)
-  };
-
-  Type *EltTys[] = { DescriptorElts[0]->getType(),DescriptorElts[1]->getType()};
-  StructType *STy = StructType::create(EltTys, "gc_map."+utostr(NumMeta));
-  
-  Constant *FrameMap = ConstantStruct::get(STy, DescriptorElts);
-
-  // FIXME: Is this actually dangerous as WritingAnLLVMPass.html claims? Seems
-  //        that, short of multithreaded LLVM, it should be safe; all that is
-  //        necessary is that a simple Module::iterator loop not be invalidated.
-  //        Appending to the GlobalVariable list is safe in that sense.
-  //
-  //        All of the output passes emit globals last. The ExecutionEngine
-  //        explicitly supports adding globals to the module after
-  //        initialization.
-  //
-  //        Still, if it isn't deemed acceptable, then this transformation needs
-  //        to be a ModulePass (which means it cannot be in the 'llc' pipeline
-  //        (which uses a FunctionPassManager (which segfaults (not asserts) if
-  //        provided a ModulePass))).
-  Constant *GV = new GlobalVariable(*F.getParent(), FrameMap->getType(), true,
-                                    GlobalVariable::InternalLinkage,
-                                    FrameMap, "__gc_" + F.getName());
-
-  Constant *GEPIndices[2] = {
-                          ConstantInt::get(Type::getInt32Ty(F.getContext()), 0),
-                          ConstantInt::get(Type::getInt32Ty(F.getContext()), 0)
-                          };
-  return ConstantExpr::getGetElementPtr(GV, GEPIndices);
-}
-
-Type* ShadowStackGC::GetConcreteStackEntryType(Function &F) {
-  // doInitialization creates the generic version of this type.
-  std::vector<Type*> EltTys;
-  EltTys.push_back(StackEntryTy);
-  for (size_t I = 0; I != Roots.size(); I++)
-    EltTys.push_back(Roots[I].second->getAllocatedType());
-  
-  return StructType::create(EltTys, "gc_stackentry."+F.getName().str());
-}
-
-/// doInitialization - If this module uses the GC intrinsics, find them now. If
-/// not, exit fast.
-bool ShadowStackGC::initializeCustomLowering(Module &M) {
-  // struct FrameMap {
-  //   int32_t NumRoots; // Number of roots in stack frame.
-  //   int32_t NumMeta;  // Number of metadata descriptors. May be < NumRoots.
-  //   void *Meta[];     // May be absent for roots without metadata.
-  // };
-  std::vector<Type*> EltTys;
-  // 32 bits is ok up to a 32GB stack frame. :)
-  EltTys.push_back(Type::getInt32Ty(M.getContext()));
-  // Specifies length of variable length array. 
-  EltTys.push_back(Type::getInt32Ty(M.getContext()));
-  FrameMapTy = StructType::create(EltTys, "gc_map");
-  PointerType *FrameMapPtrTy = PointerType::getUnqual(FrameMapTy);
-
-  // struct StackEntry {
-  //   ShadowStackEntry *Next; // Caller's stack entry.
-  //   FrameMap *Map;          // Pointer to constant FrameMap.
-  //   void *Roots[];          // Stack roots (in-place array, so we pretend).
-  // };
-  
-  StackEntryTy = StructType::create(M.getContext(), "gc_stackentry");
-  
-  EltTys.clear();
-  EltTys.push_back(PointerType::getUnqual(StackEntryTy));
-  EltTys.push_back(FrameMapPtrTy);
-  StackEntryTy->setBody(EltTys);
-  PointerType *StackEntryPtrTy = PointerType::getUnqual(StackEntryTy);
-
-  // Get the root chain if it already exists.
-  Head = M.getGlobalVariable("llvm_gc_root_chain");
-  if (!Head) {
-    // If the root chain does not exist, insert a new one with linkonce
-    // linkage!
-    Head = new GlobalVariable(M, StackEntryPtrTy, false,
-                              GlobalValue::LinkOnceAnyLinkage,
-                              Constant::getNullValue(StackEntryPtrTy),
-                              "llvm_gc_root_chain");
-  } else if (Head->hasExternalLinkage() && Head->isDeclaration()) {
-    Head->setInitializer(Constant::getNullValue(StackEntryPtrTy));
-    Head->setLinkage(GlobalValue::LinkOnceAnyLinkage);
-  }
-
-  return true;
-}
-
-bool ShadowStackGC::IsNullValue(Value *V) {
-  if (Constant *C = dyn_cast<Constant>(V))
-    return C->isNullValue();
-  return false;
-}
-
-void ShadowStackGC::CollectRoots(Function &F) {
-  // FIXME: Account for original alignment. Could fragment the root array.
-  //   Approach 1: Null initialize empty slots at runtime. Yuck.
-  //   Approach 2: Emit a map of the array instead of just a count.
-
-  assert(Roots.empty() && "Not cleaned up?");
-
-  SmallVector<std::pair<CallInst*, AllocaInst*>, 16> MetaRoots;
-
-  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
-    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;)
-      if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++))
-        if (Function *F = CI->getCalledFunction())
-          if (F->getIntrinsicID() == Intrinsic::gcroot) {
-            std::pair<CallInst*, AllocaInst*> Pair = std::make_pair(
-              CI, cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
-            if (IsNullValue(CI->getArgOperand(1)))
-              Roots.push_back(Pair);
-            else
-              MetaRoots.push_back(Pair);
-          }
-
-  // Number roots with metadata (usually empty) at the beginning, so that the
-  // FrameMap::Meta array can be elided.
-  Roots.insert(Roots.begin(), MetaRoots.begin(), MetaRoots.end());
-}
-
-GetElementPtrInst *
-ShadowStackGC::CreateGEP(LLVMContext &Context, IRBuilder<> &B, Value *BasePtr,
-                         int Idx, int Idx2, const char *Name) {
-  Value *Indices[] = { ConstantInt::get(Type::getInt32Ty(Context), 0),
-                       ConstantInt::get(Type::getInt32Ty(Context), Idx),
-                       ConstantInt::get(Type::getInt32Ty(Context), Idx2) };
-  Value* Val = B.CreateGEP(BasePtr, Indices, Name);
-
-  assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
-
-  return dyn_cast<GetElementPtrInst>(Val);
-}
-
-GetElementPtrInst *
-ShadowStackGC::CreateGEP(LLVMContext &Context, IRBuilder<> &B, Value *BasePtr,
-                         int Idx, const char *Name) {
-  Value *Indices[] = { ConstantInt::get(Type::getInt32Ty(Context), 0),
-                       ConstantInt::get(Type::getInt32Ty(Context), Idx) };
-  Value *Val = B.CreateGEP(BasePtr, Indices, Name);
-
-  assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
-
-  return dyn_cast<GetElementPtrInst>(Val);
-}
-
-/// runOnFunction - Insert code to maintain the shadow stack.
-bool ShadowStackGC::performCustomLowering(Function &F) {
-  LLVMContext &Context = F.getContext();
-  
-  // Find calls to llvm.gcroot.
-  CollectRoots(F);
-
-  // If there are no roots in this function, then there is no need to add a
-  // stack map entry for it.
-  if (Roots.empty())
-    return false;
-
-  // Build the constant map and figure the type of the shadow stack entry.
-  Value *FrameMap = GetFrameMap(F);
-  Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F);
-
-  // Build the shadow stack entry at the very start of the function.
-  BasicBlock::iterator IP = F.getEntryBlock().begin();
-  IRBuilder<> AtEntry(IP->getParent(), IP);
-
-  Instruction *StackEntry = AtEntry.CreateAlloca(ConcreteStackEntryTy, nullptr,
-                                                 "gc_frame");
-
-  while (isa<AllocaInst>(IP)) ++IP;
-  AtEntry.SetInsertPoint(IP->getParent(), IP);
-
-  // Initialize the map pointer and load the current head of the shadow stack.
-  Instruction *CurrentHead  = AtEntry.CreateLoad(Head, "gc_currhead");
-  Instruction *EntryMapPtr  = CreateGEP(Context, AtEntry, StackEntry,
-                                        0,1,"gc_frame.map");
-  AtEntry.CreateStore(FrameMap, EntryMapPtr);
-
-  // After all the allocas...
-  for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
-    // For each root, find the corresponding slot in the aggregate...
-    Value *SlotPtr = CreateGEP(Context, AtEntry, StackEntry, 1 + I, "gc_root");
-
-    // And use it in lieu of the alloca.
-    AllocaInst *OriginalAlloca = Roots[I].second;
-    SlotPtr->takeName(OriginalAlloca);
-    OriginalAlloca->replaceAllUsesWith(SlotPtr);
-  }
-
-  // Move past the original stores inserted by GCStrategy::InitRoots. This isn't
-  // really necessary (the collector would never see the intermediate state at
-  // runtime), but it's nicer not to push the half-initialized entry onto the
-  // shadow stack.
-  while (isa<StoreInst>(IP)) ++IP;
-  AtEntry.SetInsertPoint(IP->getParent(), IP);
-
-  // Push the entry onto the shadow stack.
-  Instruction *EntryNextPtr = CreateGEP(Context, AtEntry,
-                                        StackEntry,0,0,"gc_frame.next");
-  Instruction *NewHeadVal   = CreateGEP(Context, AtEntry, 
-                                        StackEntry, 0, "gc_newhead");
-  AtEntry.CreateStore(CurrentHead, EntryNextPtr);
-  AtEntry.CreateStore(NewHeadVal, Head);
-
-  // For each instruction that escapes...
-  EscapeEnumerator EE(F, "gc_cleanup");
-  while (IRBuilder<> *AtExit = EE.Next()) {
-    // Pop the entry from the shadow stack. Don't reuse CurrentHead from
-    // AtEntry, since that would make the value live for the entire function.
-    Instruction *EntryNextPtr2 = CreateGEP(Context, *AtExit, StackEntry, 0, 0,
-                                           "gc_frame.next");
-    Value *SavedHead = AtExit->CreateLoad(EntryNextPtr2, "gc_savedhead");
-                       AtExit->CreateStore(SavedHead, Head);
-  }
-
-  // Delete the original allocas (which are no longer used) and the intrinsic
-  // calls (which are no longer valid). Doing this last avoids invalidating
-  // iterators.
-  for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
-    Roots[I].first->eraseFromParent();
-    Roots[I].second->eraseFromParent();
-  }
-
-  Roots.clear();
-  return true;
-}
diff --git a/lib/CodeGen/ShadowStackGCLowering.cpp b/lib/CodeGen/ShadowStackGCLowering.cpp
new file mode 100644
index 0000000..f6393a5
--- /dev/null
+++ b/lib/CodeGen/ShadowStackGCLowering.cpp
@@ -0,0 +1,457 @@
+//===-- ShadowStackGCLowering.cpp - Custom lowering for shadow-stack gc ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom lowering code required by the shadow-stack GC
+// strategy.  
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "shadowstackgclowering"
+
+namespace {
+
+class ShadowStackGCLowering : public FunctionPass {
+  /// RootChain - This is the global linked-list that contains the chain of GC
+  /// roots.
+  GlobalVariable *Head;
+
+  /// StackEntryTy - Abstract type of a link in the shadow stack.
+  ///
+  StructType *StackEntryTy;
+  StructType *FrameMapTy;
+
+  /// Roots - GC roots in the current function. Each is a pair of the
+  /// intrinsic call and its corresponding alloca.
+  std::vector<std::pair<CallInst *, AllocaInst *>> Roots;
+
+public:
+  static char ID;
+  ShadowStackGCLowering();
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+private:
+  bool IsNullValue(Value *V);
+  Constant *GetFrameMap(Function &F);
+  Type *GetConcreteStackEntryType(Function &F);
+  void CollectRoots(Function &F);
+  static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B,
+                                      Value *BasePtr, int Idx1,
+                                      const char *Name);
+  static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B,
+                                      Value *BasePtr, int Idx1, int Idx2,
+                                      const char *Name);
+};
+}
+
+INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, "shadow-stack-gc-lowering",
+                      "Shadow Stack GC Lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
+INITIALIZE_PASS_END(ShadowStackGCLowering, "shadow-stack-gc-lowering",
+                    "Shadow Stack GC Lowering", false, false)
+
+FunctionPass *llvm::createShadowStackGCLoweringPass() { return new ShadowStackGCLowering(); }
+
+char ShadowStackGCLowering::ID = 0;
+
+ShadowStackGCLowering::ShadowStackGCLowering()
+  : FunctionPass(ID), Head(nullptr), StackEntryTy(nullptr),
+    FrameMapTy(nullptr) {
+  initializeShadowStackGCLoweringPass(*PassRegistry::getPassRegistry());
+}
+
+namespace {
+/// EscapeEnumerator - This is a little algorithm to find all escape points
+/// from a function so that "finally"-style code can be inserted. In addition
+/// to finding the existing return and unwind instructions, it also (if
+/// necessary) transforms any call instructions into invokes and sends them to
+/// a landing pad.
+///
+/// It's wrapped up in a state machine using the same transform C# uses for
+/// 'yield return' enumerators, This transform allows it to be non-allocating.
+class EscapeEnumerator {
+  Function &F;
+  const char *CleanupBBName;
+
+  // State.
+  int State;
+  Function::iterator StateBB, StateE;
+  IRBuilder<> Builder;
+
+public:
+  EscapeEnumerator(Function &F, const char *N = "cleanup")
+      : F(F), CleanupBBName(N), State(0), Builder(F.getContext()) {}
+
+  IRBuilder<> *Next() {
+    switch (State) {
+    default:
+      return nullptr;
+
+    case 0:
+      StateBB = F.begin();
+      StateE = F.end();
+      State = 1;
+
+    case 1:
+      // Find all 'return', 'resume', and 'unwind' instructions.
+      while (StateBB != StateE) {
+        BasicBlock *CurBB = StateBB++;
+
+        // Branches and invokes do not escape, only unwind, resume, and return
+        // do.
+        TerminatorInst *TI = CurBB->getTerminator();
+        if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
+          continue;
+
+        Builder.SetInsertPoint(TI->getParent(), TI);
+        return &Builder;
+      }
+
+      State = 2;
+
+      // Find all 'call' instructions.
+      SmallVector<Instruction *, 16> Calls;
+      for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+        for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); II != EE;
+             ++II)
+          if (CallInst *CI = dyn_cast<CallInst>(II))
+            if (!CI->getCalledFunction() ||
+                !CI->getCalledFunction()->getIntrinsicID())
+              Calls.push_back(CI);
+
+      if (Calls.empty())
+        return nullptr;
+
+      // Create a cleanup block.
+      LLVMContext &C = F.getContext();
+      BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
+      Type *ExnTy =
+          StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
+      Constant *PersFn = F.getParent()->getOrInsertFunction(
+          "__gcc_personality_v0", FunctionType::get(Type::getInt32Ty(C), true));
+      LandingPadInst *LPad =
+          LandingPadInst::Create(ExnTy, PersFn, 1, "cleanup.lpad", CleanupBB);
+      LPad->setCleanup(true);
+      ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
+
+      // Transform the 'call' instructions into 'invoke's branching to the
+      // cleanup block. Go in reverse order to make prettier BB names.
+      SmallVector<Value *, 16> Args;
+      for (unsigned I = Calls.size(); I != 0;) {
+        CallInst *CI = cast<CallInst>(Calls[--I]);
+
+        // Split the basic block containing the function call.
+        BasicBlock *CallBB = CI->getParent();
+        BasicBlock *NewBB =
+            CallBB->splitBasicBlock(CI, CallBB->getName() + ".cont");
+
+        // Remove the unconditional branch inserted at the end of CallBB.
+        CallBB->getInstList().pop_back();
+        NewBB->getInstList().remove(CI);
+
+        // Create a new invoke instruction.
+        Args.clear();
+        CallSite CS(CI);
+        Args.append(CS.arg_begin(), CS.arg_end());
+
+        InvokeInst *II =
+            InvokeInst::Create(CI->getCalledValue(), NewBB, CleanupBB, Args,
+                               CI->getName(), CallBB);
+        II->setCallingConv(CI->getCallingConv());
+        II->setAttributes(CI->getAttributes());
+        CI->replaceAllUsesWith(II);
+        delete CI;
+      }
+
+      Builder.SetInsertPoint(RI->getParent(), RI);
+      return &Builder;
+    }
+  }
+};
+}
+
+
+Constant *ShadowStackGCLowering::GetFrameMap(Function &F) {
+  // doInitialization creates the abstract type of this value.
+  Type *VoidPtr = Type::getInt8PtrTy(F.getContext());
+
+  // Truncate the ShadowStackDescriptor if some metadata is null.
+  unsigned NumMeta = 0;
+  SmallVector<Constant *, 16> Metadata;
+  for (unsigned I = 0; I != Roots.size(); ++I) {
+    Constant *C = cast<Constant>(Roots[I].first->getArgOperand(1));
+    if (!C->isNullValue())
+      NumMeta = I + 1;
+    Metadata.push_back(ConstantExpr::getBitCast(C, VoidPtr));
+  }
+  Metadata.resize(NumMeta);
+
+  Type *Int32Ty = Type::getInt32Ty(F.getContext());
+
+  Constant *BaseElts[] = {
+      ConstantInt::get(Int32Ty, Roots.size(), false),
+      ConstantInt::get(Int32Ty, NumMeta, false),
+  };
+
+  Constant *DescriptorElts[] = {
+      ConstantStruct::get(FrameMapTy, BaseElts),
+      ConstantArray::get(ArrayType::get(VoidPtr, NumMeta), Metadata)};
+
+  Type *EltTys[] = {DescriptorElts[0]->getType(), DescriptorElts[1]->getType()};
+  StructType *STy = StructType::create(EltTys, "gc_map." + utostr(NumMeta));
+
+  Constant *FrameMap = ConstantStruct::get(STy, DescriptorElts);
+
+  // FIXME: Is this actually dangerous as WritingAnLLVMPass.html claims? Seems
+  //        that, short of multithreaded LLVM, it should be safe; all that is
+  //        necessary is that a simple Module::iterator loop not be invalidated.
+  //        Appending to the GlobalVariable list is safe in that sense.
+  //
+  //        All of the output passes emit globals last. The ExecutionEngine
+  //        explicitly supports adding globals to the module after
+  //        initialization.
+  //
+  //        Still, if it isn't deemed acceptable, then this transformation needs
+  //        to be a ModulePass (which means it cannot be in the 'llc' pipeline
+  //        (which uses a FunctionPassManager (which segfaults (not asserts) if
+  //        provided a ModulePass))).
+  Constant *GV = new GlobalVariable(*F.getParent(), FrameMap->getType(), true,
+                                    GlobalVariable::InternalLinkage, FrameMap,
+                                    "__gc_" + F.getName());
+
+  Constant *GEPIndices[2] = {
+      ConstantInt::get(Type::getInt32Ty(F.getContext()), 0),
+      ConstantInt::get(Type::getInt32Ty(F.getContext()), 0)};
+  return ConstantExpr::getGetElementPtr(GV, GEPIndices);
+}
+
+Type *ShadowStackGCLowering::GetConcreteStackEntryType(Function &F) {
+  // doInitialization creates the generic version of this type.
+  std::vector<Type *> EltTys;
+  EltTys.push_back(StackEntryTy);
+  for (size_t I = 0; I != Roots.size(); I++)
+    EltTys.push_back(Roots[I].second->getAllocatedType());
+
+  return StructType::create(EltTys, "gc_stackentry." + F.getName().str());
+}
+
+/// doInitialization - If this module uses the GC intrinsics, find them now. If
+/// not, exit fast.
+bool ShadowStackGCLowering::doInitialization(Module &M) {
+  bool Active = false;
+  for (Function &F : M) {
+    if (F.hasGC() && F.getGC() == std::string("shadow-stack")) {
+      Active = true;
+      break;
+    }
+  }
+  if (!Active)
+    return false;
+  
+  // struct FrameMap {
+  //   int32_t NumRoots; // Number of roots in stack frame.
+  //   int32_t NumMeta;  // Number of metadata descriptors. May be < NumRoots.
+  //   void *Meta[];     // May be absent for roots without metadata.
+  // };
+  std::vector<Type *> EltTys;
+  // 32 bits is ok up to a 32GB stack frame. :)
+  EltTys.push_back(Type::getInt32Ty(M.getContext()));
+  // Specifies length of variable length array.
+  EltTys.push_back(Type::getInt32Ty(M.getContext()));
+  FrameMapTy = StructType::create(EltTys, "gc_map");
+  PointerType *FrameMapPtrTy = PointerType::getUnqual(FrameMapTy);
+
+  // struct StackEntry {
+  //   ShadowStackEntry *Next; // Caller's stack entry.
+  //   FrameMap *Map;          // Pointer to constant FrameMap.
+  //   void *Roots[];          // Stack roots (in-place array, so we pretend).
+  // };
+
+  StackEntryTy = StructType::create(M.getContext(), "gc_stackentry");
+
+  EltTys.clear();
+  EltTys.push_back(PointerType::getUnqual(StackEntryTy));
+  EltTys.push_back(FrameMapPtrTy);
+  StackEntryTy->setBody(EltTys);
+  PointerType *StackEntryPtrTy = PointerType::getUnqual(StackEntryTy);
+
+  // Get the root chain if it already exists.
+  Head = M.getGlobalVariable("llvm_gc_root_chain");
+  if (!Head) {
+    // If the root chain does not exist, insert a new one with linkonce
+    // linkage!
+    Head = new GlobalVariable(
+        M, StackEntryPtrTy, false, GlobalValue::LinkOnceAnyLinkage,
+        Constant::getNullValue(StackEntryPtrTy), "llvm_gc_root_chain");
+  } else if (Head->hasExternalLinkage() && Head->isDeclaration()) {
+    Head->setInitializer(Constant::getNullValue(StackEntryPtrTy));
+    Head->setLinkage(GlobalValue::LinkOnceAnyLinkage);
+  }
+
+  return true;
+}
+
+bool ShadowStackGCLowering::IsNullValue(Value *V) {
+  if (Constant *C = dyn_cast<Constant>(V))
+    return C->isNullValue();
+  return false;
+}
+
+void ShadowStackGCLowering::CollectRoots(Function &F) {
+  // FIXME: Account for original alignment. Could fragment the root array.
+  //   Approach 1: Null initialize empty slots at runtime. Yuck.
+  //   Approach 2: Emit a map of the array instead of just a count.
+
+  assert(Roots.empty() && "Not cleaned up?");
+
+  SmallVector<std::pair<CallInst *, AllocaInst *>, 16> MetaRoots;
+
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;)
+      if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++))
+        if (Function *F = CI->getCalledFunction())
+          if (F->getIntrinsicID() == Intrinsic::gcroot) {
+            std::pair<CallInst *, AllocaInst *> Pair = std::make_pair(
+                CI,
+                cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
+            if (IsNullValue(CI->getArgOperand(1)))
+              Roots.push_back(Pair);
+            else
+              MetaRoots.push_back(Pair);
+          }
+
+  // Number roots with metadata (usually empty) at the beginning, so that the
+  // FrameMap::Meta array can be elided.
+  Roots.insert(Roots.begin(), MetaRoots.begin(), MetaRoots.end());
+}
+
+GetElementPtrInst *ShadowStackGCLowering::CreateGEP(LLVMContext &Context,
+                                            IRBuilder<> &B, Value *BasePtr,
+                                            int Idx, int Idx2,
+                                            const char *Name) {
+  Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0),
+                      ConstantInt::get(Type::getInt32Ty(Context), Idx),
+                      ConstantInt::get(Type::getInt32Ty(Context), Idx2)};
+  Value *Val = B.CreateGEP(BasePtr, Indices, Name);
+
+  assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+  return dyn_cast<GetElementPtrInst>(Val);
+}
+
+GetElementPtrInst *ShadowStackGCLowering::CreateGEP(LLVMContext &Context,
+                                            IRBuilder<> &B, Value *BasePtr,
+                                            int Idx, const char *Name) {
+  Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0),
+                      ConstantInt::get(Type::getInt32Ty(Context), Idx)};
+  Value *Val = B.CreateGEP(BasePtr, Indices, Name);
+
+  assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+  return dyn_cast<GetElementPtrInst>(Val);
+}
+
+/// runOnFunction - Insert code to maintain the shadow stack.
+bool ShadowStackGCLowering::runOnFunction(Function &F) {
+  // Quick exit for functions that do not use the shadow stack GC.
+  if (!F.hasGC() ||
+      F.getGC() != std::string("shadow-stack"))
+    return false;
+  
+  LLVMContext &Context = F.getContext();
+
+  // Find calls to llvm.gcroot.
+  CollectRoots(F);
+
+  // If there are no roots in this function, then there is no need to add a
+  // stack map entry for it.
+  if (Roots.empty())
+    return false;
+
+  // Build the constant map and figure the type of the shadow stack entry.
+  Value *FrameMap = GetFrameMap(F);
+  Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F);
+
+  // Build the shadow stack entry at the very start of the function.
+  BasicBlock::iterator IP = F.getEntryBlock().begin();
+  IRBuilder<> AtEntry(IP->getParent(), IP);
+
+  Instruction *StackEntry =
+      AtEntry.CreateAlloca(ConcreteStackEntryTy, nullptr, "gc_frame");
+
+  while (isa<AllocaInst>(IP))
+    ++IP;
+  AtEntry.SetInsertPoint(IP->getParent(), IP);
+
+  // Initialize the map pointer and load the current head of the shadow stack.
+  Instruction *CurrentHead = AtEntry.CreateLoad(Head, "gc_currhead");
+  Instruction *EntryMapPtr =
+      CreateGEP(Context, AtEntry, StackEntry, 0, 1, "gc_frame.map");
+  AtEntry.CreateStore(FrameMap, EntryMapPtr);
+
+  // After all the allocas...
+  for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
+    // For each root, find the corresponding slot in the aggregate...
+    Value *SlotPtr = CreateGEP(Context, AtEntry, StackEntry, 1 + I, "gc_root");
+
+    // And use it in lieu of the alloca.
+    AllocaInst *OriginalAlloca = Roots[I].second;
+    SlotPtr->takeName(OriginalAlloca);
+    OriginalAlloca->replaceAllUsesWith(SlotPtr);
+  }
+
+  // Move past the original stores inserted by GCStrategy::InitRoots. This isn't
+  // really necessary (the collector would never see the intermediate state at
+  // runtime), but it's nicer not to push the half-initialized entry onto the
+  // shadow stack.
+  while (isa<StoreInst>(IP))
+    ++IP;
+  AtEntry.SetInsertPoint(IP->getParent(), IP);
+
+  // Push the entry onto the shadow stack.
+  Instruction *EntryNextPtr =
+      CreateGEP(Context, AtEntry, StackEntry, 0, 0, "gc_frame.next");
+  Instruction *NewHeadVal =
+      CreateGEP(Context, AtEntry, StackEntry, 0, "gc_newhead");
+  AtEntry.CreateStore(CurrentHead, EntryNextPtr);
+  AtEntry.CreateStore(NewHeadVal, Head);
+
+  // For each instruction that escapes...
+  EscapeEnumerator EE(F, "gc_cleanup");
+  while (IRBuilder<> *AtExit = EE.Next()) {
+    // Pop the entry from the shadow stack. Don't reuse CurrentHead from
+    // AtEntry, since that would make the value live for the entire function.
+    Instruction *EntryNextPtr2 =
+        CreateGEP(Context, *AtExit, StackEntry, 0, 0, "gc_frame.next");
+    Value *SavedHead = AtExit->CreateLoad(EntryNextPtr2, "gc_savedhead");
+    AtExit->CreateStore(SavedHead, Head);
+  }
+
+  // Delete the original allocas (which are no longer used) and the intrinsic
+  // calls (which are no longer valid). Doing this last avoids invalidating
+  // iterators.
+  for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
+    Roots[I].first->eraseFromParent();
+    Roots[I].second->eraseFromParent();
+  }
+
+  Roots.clear();
+  return true;
+}
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 7fd8107..35e4292 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -191,7 +191,7 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F,
   // Create an alloca for the incoming jump buffer ptr and the new jump buffer
   // that needs to be restored on all exits from the function. This is an alloca
   // because the value needs to be added to the global context list.
-  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
+  const TargetLowering *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
   unsigned Align =
       TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
   FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context",
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index ea7b914..dab1dfe 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -120,10 +120,9 @@ void SplitAnalysis::analyzeUses() {
 
   // First get all the defs from the interval values. This provides the correct
   // slots for early clobbers.
-  for (LiveInterval::const_vni_iterator I = CurLI->vni_begin(),
-       E = CurLI->vni_end(); I != E; ++I)
-    if (!(*I)->isPHIDef() && !(*I)->isUnused())
-      UseSlots.push_back((*I)->def);
+  for (const VNInfo *VNI : CurLI->valnos)
+    if (!VNI->isPHIDef() && !VNI->isUnused())
+      UseSlots.push_back(VNI->def);
 
   // Get use slots form the use-def chain.
   const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -624,8 +623,7 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
   AssignI.setMap(RegAssign);
 
   for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
-    VNInfo *VNI = Copies[i];
-    SlotIndex Def = VNI->def;
+    SlotIndex Def = Copies[i]->def;
     MachineInstr *MI = LIS.getInstructionFromIndex(Def);
     assert(MI && "No instruction for back-copy");
 
@@ -636,13 +634,12 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
     while (!AtBegin && (--MBBI)->isDebugValue());
 
     DEBUG(dbgs() << "Removing " << Def << '\t' << *MI);
-    LI->removeValNo(VNI);
+    LIS.removeVRegDefAt(*LI, Def);
     LIS.RemoveMachineInstrFromMaps(MI);
     MI->eraseFromParent();
 
-    // Adjust RegAssign if a register assignment is killed at VNI->def.  We
-    // want to avoid calculating the live range of the source register if
-    // possible.
+    // Adjust RegAssign if a register assignment is killed at Def. We want to
+    // avoid calculating the live range of the source register if possible.
     AssignI.find(Def.getPrevSlot());
     if (!AssignI.valid() || AssignI.start() >= Def)
       continue;
@@ -727,9 +724,7 @@ void SplitEditor::hoistCopiesForSize() {
 
   // Find the nearest common dominator for parent values with multiple
   // back-copies.  If a single back-copy dominates, put it in DomPair.second.
-  for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end();
-       VI != VE; ++VI) {
-    VNInfo *VNI = *VI;
+  for (VNInfo *VNI : LI->valnos) {
     if (VNI->isUnused())
       continue;
     VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
@@ -802,9 +797,7 @@ void SplitEditor::hoistCopiesForSize() {
   // Remove redundant back-copies that are now known to be dominated by another
   // def with the same value.
   SmallVector<VNInfo*, 8> BackCopies;
-  for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end();
-       VI != VE; ++VI) {
-    VNInfo *VNI = *VI;
+  for (VNInfo *VNI : LI->valnos) {
     if (VNI->isUnused())
       continue;
     VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
@@ -823,16 +816,15 @@ void SplitEditor::hoistCopiesForSize() {
 bool SplitEditor::transferValues() {
   bool Skipped = false;
   RegAssignMap::const_iterator AssignI = RegAssign.begin();
-  for (LiveInterval::const_iterator ParentI = Edit->getParent().begin(),
-         ParentE = Edit->getParent().end(); ParentI != ParentE; ++ParentI) {
-    DEBUG(dbgs() << "  blit " << *ParentI << ':');
-    VNInfo *ParentVNI = ParentI->valno;
+  for (const LiveRange::Segment &S : Edit->getParent()) {
+    DEBUG(dbgs() << "  blit " << S << ':');
+    VNInfo *ParentVNI = S.valno;
     // RegAssign has holes where RegIdx 0 should be used.
-    SlotIndex Start = ParentI->start;
+    SlotIndex Start = S.start;
     AssignI.advanceTo(Start);
     do {
       unsigned RegIdx;
-      SlotIndex End = ParentI->end;
+      SlotIndex End = S.end;
       if (!AssignI.valid()) {
         RegIdx = 0;
       } else if (AssignI.start() <= Start) {
@@ -917,7 +909,7 @@ bool SplitEditor::transferValues() {
         ++MBB;
       }
       Start = End;
-    } while (Start != ParentI->end);
+    } while (Start != S.end);
     DEBUG(dbgs() << '\n');
   }
 
@@ -930,9 +922,7 @@ bool SplitEditor::transferValues() {
 
 void SplitEditor::extendPHIKillRanges() {
     // Extend live ranges to be live-out for successor PHI values.
-  for (LiveInterval::const_vni_iterator I = Edit->getParent().vni_begin(),
-       E = Edit->getParent().vni_end(); I != E; ++I) {
-    const VNInfo *PHIVNI = *I;
+  for (const VNInfo *PHIVNI : Edit->getParent().valnos) {
     if (PHIVNI->isUnused() || !PHIVNI->isPHIDef())
       continue;
     unsigned RegIdx = RegAssign.lookup(PHIVNI->def);
@@ -1006,12 +996,11 @@ void SplitEditor::deleteRematVictims() {
   SmallVector<MachineInstr*, 8> Dead;
   for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I){
     LiveInterval *LI = &LIS.getInterval(*I);
-    for (LiveInterval::const_iterator LII = LI->begin(), LIE = LI->end();
-           LII != LIE; ++LII) {
+    for (const LiveRange::Segment &S : LI->segments) {
       // Dead defs end at the dead slot.
-      if (LII->end != LII->valno->def.getDeadSlot())
+      if (S.end != S.valno->def.getDeadSlot())
         continue;
-      MachineInstr *MI = LIS.getInstructionFromIndex(LII->valno->def);
+      MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def);
       assert(MI && "Missing instruction for dead def");
       MI->addRegisterDead(LI->reg, &TRI);
 
@@ -1036,9 +1025,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
   // the inserted copies.
 
   // Add the original defs from the parent interval.
-  for (LiveInterval::const_vni_iterator I = Edit->getParent().vni_begin(),
-         E = Edit->getParent().vni_end(); I != E; ++I) {
-    const VNInfo *ParentVNI = *I;
+  for (const VNInfo *ParentVNI : Edit->getParent().valnos) {
     if (ParentVNI->isUnused())
       continue;
     unsigned RegIdx = RegAssign.lookup(ParentVNI->def);
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index dcf1b44..faf94b6 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp
@@ -463,7 +463,8 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
     if (!VI.Var)
       continue;
     if (SlotRemap.count(VI.Slot)) {
-      DEBUG(dbgs()<<"Remapping debug info for ["<<VI.Var->getName()<<"].\n");
+      DEBUG(dbgs() << "Remapping debug info for ["
+                   << DIVariable(VI.Var).getName() << "].\n");
       VI.Slot = SlotRemap[VI.Slot];
       FixedDbg++;
     }
diff --git a/lib/CodeGen/StackMapLivenessAnalysis.cpp b/lib/CodeGen/StackMapLivenessAnalysis.cpp
index c2ee87a..767f43a 100644
--- a/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -123,5 +123,7 @@ uint32_t *StackMapLiveness::createRegisterMask() const {
   for (LivePhysRegs::const_iterator RI = LiveRegs.begin(), RE = LiveRegs.end();
        RI != RE; ++RI)
     Mask[*RI / 32] |= 1U << (*RI % 32);
+
+  TRI->adjustStackMapLiveOutMask(Mask);
   return Mask;
 }
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
index d3791c3..5d46419 100644
--- a/lib/CodeGen/StackMaps.cpp
+++ b/lib/CodeGen/StackMaps.cpp
@@ -84,8 +84,7 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI,
     switch (MOI->getImm()) {
     default: llvm_unreachable("Unrecognized operand type.");
     case StackMaps::DirectMemRefOp: {
-      unsigned Size =
-          AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSizeInBits();
+      unsigned Size = AP.TM.getDataLayout()->getPointerSizeInBits();
       assert((Size % 8) == 0 && "Need pointer size in bytes.");
       Size /= 8;
       unsigned Reg = (++MOI)->getReg();
@@ -241,7 +240,7 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID,
   // entry.
   const MCExpr *CSOffsetExpr = MCBinaryExpr::CreateSub(
     MCSymbolRefExpr::Create(MILabel, OutContext),
-    MCSymbolRefExpr::Create(AP.CurrentFnSym, OutContext),
+    MCSymbolRefExpr::Create(AP.CurrentFnSymForSize, OutContext),
     OutContext);
 
   CSInfos.emplace_back(CSOffsetExpr, ID, std::move(Locations),
@@ -286,6 +285,18 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) {
   }
 #endif
 }
+void StackMaps::recordStatepoint(const MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::STATEPOINT &&
+         "expected statepoint");
+
+  StatepointOpers opers(&MI);
+  // Record all the deopt and gc operands (they're contiguous and run from the
+  // initial index to the end of the operand list)
+  const unsigned StartIdx = opers.getVarIdx();
+  recordStackMapOpers(MI, 0xABCDEF00,
+                      MI.operands_begin() + StartIdx, MI.operands_end(),
+                      false);
+}
 
 /// Emit the stackmap header.
 ///
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index 45f97ac..d1fae95 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -17,6 +17,7 @@
 #include "llvm/CodeGen/StackProtector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/Passes.h"
@@ -31,6 +32,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
@@ -86,10 +88,9 @@ bool StackProtector::runOnFunction(Function &Fn) {
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  TLI = TM->getSubtargetImpl()->getTargetLowering();
+  TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
 
-  Attribute Attr = Fn.getAttributes().getAttribute(
-      AttributeSet::FunctionIndex, "stack-protector-buffer-size");
+  Attribute Attr = Fn.getFnAttribute("stack-protector-buffer-size");
   if (Attr.isStringAttribute() &&
       Attr.getValueAsString().getAsInteger(10, SSPBufferSize))
       return false; // Invalid integer string
@@ -199,31 +200,24 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) {
 bool StackProtector::RequiresStackProtector() {
   bool Strong = false;
   bool NeedsProtector = false;
-  if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                      Attribute::StackProtectReq)) {
+  if (F->hasFnAttribute(Attribute::StackProtectReq)) {
     NeedsProtector = true;
     Strong = true; // Use the same heuristic as strong to determine SSPLayout
-  } else if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                             Attribute::StackProtectStrong))
+  } else if (F->hasFnAttribute(Attribute::StackProtectStrong))
     Strong = true;
-  else if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                            Attribute::StackProtect))
+  else if (!F->hasFnAttribute(Attribute::StackProtect))
     return false;
 
-  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
-    BasicBlock *BB = I;
-
-    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;
-         ++II) {
-      if (AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+  for (const BasicBlock &BB : *F) {
+    for (const Instruction &I : BB) {
+      if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
         if (AI->isArrayAllocation()) {
           // SSP-Strong: Enable protectors for any call to alloca, regardless
           // of size.
           if (Strong)
             return true;
 
-          if (const ConstantInt *CI =
-                  dyn_cast<ConstantInt>(AI->getArraySize())) {
+          if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {
             if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {
               // A call to alloca with size >= SSPBufferSize requires
               // stack protectors.
@@ -335,7 +329,7 @@ static CallInst *FindPotentialTailCall(BasicBlock *BB, ReturnInst *RI,
 /// Returns true if the platform/triple supports the stackprotectorcreate pseudo
 /// node.
 static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI,
-                           const TargetLoweringBase *TLI, const Triple &Trip,
+                           const TargetLoweringBase *TLI, const Triple &TT,
                            AllocaInst *&AI, Value *&StackGuardVar) {
   bool SupportsSelectionDAGSP = false;
   PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
@@ -344,9 +338,10 @@ static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI,
     Constant *OffsetVal =
         ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
 
-    StackGuardVar = ConstantExpr::getIntToPtr(
-        OffsetVal, PointerType::get(PtrTy, AddressSpace));
-  } else if (Trip.getOS() == llvm::Triple::OpenBSD) {
+    StackGuardVar =
+        ConstantExpr::getIntToPtr(OffsetVal, PointerType::get(PtrTy,
+                                                              AddressSpace));
+  } else if (TT.isOSOpenBSD()) {
     StackGuardVar = M->getOrInsertGlobal("__guard_local", PtrTy);
     cast<GlobalValue>(StackGuardVar)
         ->setVisibility(GlobalValue::HiddenVisibility);
@@ -399,14 +394,13 @@ bool StackProtector::InsertStackProtectors() {
         InsertionPt = RI;
         // At this point we know that BB has a return statement so it *DOES*
         // have a terminator.
-        assert(InsertionPt != nullptr && "BB must have a terminator instruction at "
-                                   "this point.");
+        assert(InsertionPt != nullptr &&
+               "BB must have a terminator instruction at this point.");
       }
 
       Function *Intrinsic =
           Intrinsic::getDeclaration(M, Intrinsic::stackprotectorcheck);
       CallInst::Create(Intrinsic, StackGuardVar, "", InsertionPt);
-
     } else {
       // If we do not support SelectionDAG based tail calls, generate IR level
       // tail calls.
@@ -459,11 +453,17 @@ bool StackProtector::InsertStackProtectors() {
       LoadInst *LI1 = B.CreateLoad(StackGuardVar);
       LoadInst *LI2 = B.CreateLoad(AI);
       Value *Cmp = B.CreateICmpEQ(LI1, LI2);
-      B.CreateCondBr(Cmp, NewBB, FailBB);
+      unsigned SuccessWeight =
+          BranchProbabilityInfo::getBranchWeightStackProtector(true);
+      unsigned FailureWeight =
+          BranchProbabilityInfo::getBranchWeightStackProtector(false);
+      MDNode *Weights = MDBuilder(F->getContext())
+                            .createBranchWeights(SuccessWeight, FailureWeight);
+      B.CreateCondBr(Cmp, NewBB, FailBB, Weights);
     }
   }
 
-  // Return if we didn't modify any basic blocks. I.e., there are no return
+  // Return if we didn't modify any basic blocks. i.e., there are no return
   // statements in the function.
   if (!HasPrologue)
     return false;
@@ -477,15 +477,17 @@ BasicBlock *StackProtector::CreateFailBB() {
   LLVMContext &Context = F->getContext();
   BasicBlock *FailBB = BasicBlock::Create(Context, "CallStackCheckFailBlk", F);
   IRBuilder<> B(FailBB);
-  if (Trip.getOS() == llvm::Triple::OpenBSD) {
-    Constant *StackChkFail = M->getOrInsertFunction(
-        "__stack_smash_handler", Type::getVoidTy(Context),
-        Type::getInt8PtrTy(Context), nullptr);
+  if (Trip.isOSOpenBSD()) {
+    Constant *StackChkFail =
+        M->getOrInsertFunction("__stack_smash_handler",
+                               Type::getVoidTy(Context),
+                               Type::getInt8PtrTy(Context), nullptr);
 
     B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH"));
   } else {
-    Constant *StackChkFail = M->getOrInsertFunction(
-        "__stack_chk_fail", Type::getVoidTy(Context), nullptr);
+    Constant *StackChkFail =
+        M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context),
+                               nullptr);
     B.CreateCall(StackChkFail);
   }
   B.CreateUnreachable();
diff --git a/lib/CodeGen/StatepointExampleGC.cpp b/lib/CodeGen/StatepointExampleGC.cpp
new file mode 100644
index 0000000..95dfd75
--- /dev/null
+++ b/lib/CodeGen/StatepointExampleGC.cpp
@@ -0,0 +1,55 @@
+//===-- StatepointDefaultGC.cpp - The default statepoint GC strategy ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a GCStrategy which serves as an example for the usage
+// of a statepoint based lowering strategy.  This GCStrategy is intended to
+// suitable as a default implementation usable with any collector which can
+// consume the standard stackmap format generated by statepoints, uses the
+// default addrespace to distinguish between gc managed and non-gc managed
+// pointers, and has reasonable relocation semantics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Value.h"
+
+using namespace llvm;
+
+namespace {
+class StatepointGC : public GCStrategy {
+public:
+  StatepointGC() {
+    UseStatepoints = true;
+    // These options are all gc.root specific, we specify them so that the
+    // gc.root lowering code doesn't run.
+    InitRoots = false;
+    NeededSafePoints = 0;
+    UsesMetadata = false;
+    CustomRoots = false;
+  }
+  Optional<bool> isGCManagedPointer(const Value *V) const override {
+    // Method is only valid on pointer typed values.
+    PointerType *PT = cast<PointerType>(V->getType());
+    // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+    // GC managed heap.  We know that a pointer into this heap needs to be
+    // updated and that no other pointer does.  Note that addrspace(1) is used
+    // only as an example, it has no special meaning, and is not reserved for
+    // GC usage.
+    return (1 == PT->getAddressSpace());
+  }
+};
+}
+
+static GCRegistry::Add<StatepointGC> X("statepoint-example",
+                                       "an example strategy for statepoint");
+
+namespace llvm {
+void linkStatepointExampleGC() {}
+}
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 4377236..04b3992 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -560,8 +560,7 @@ TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF,
   // compensate for the duplication.
   unsigned MaxDuplicateCount;
   if (TailDuplicateSize.getNumOccurrences() == 0 &&
-      MF.getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize))
+      MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
     MaxDuplicateCount = 1;
   else
     MaxDuplicateCount = TailDuplicateSize;
diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 1557d10..e3f0191 100644
--- a/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -42,3 +42,8 @@ int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF,
   FrameReg = RI->getFrameRegister(MF);
   return getFrameIndexOffset(MF, FI);
 }
+
+bool TargetFrameLowering::needsFrameIndexResolution(
+    const MachineFunction &MF) const {
+  return MF.getFrameInfo()->hasStackObjects();
+}
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index ab45f89..2566c1f 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -307,7 +308,7 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
 
   assert(RC->getSize() >= (Offset + Size) && "bad subregister range");
 
-  if (!TM->getSubtargetImpl()->getDataLayout()->isLittleEndian()) {
+  if (!TM->getDataLayout()->isLittleEndian()) {
     Offset = RC->getSize() - (Offset + Size);
   }
   return true;
@@ -644,6 +645,28 @@ isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI,
   return true;
 }
 
+int TargetInstrInfo::getSPAdjust(const MachineInstr *MI) const {
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+  bool StackGrowsDown =
+    TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+  int FrameSetupOpcode = getCallFrameSetupOpcode();
+  int FrameDestroyOpcode = getCallFrameDestroyOpcode();
+
+  if (MI->getOpcode() != FrameSetupOpcode &&
+      MI->getOpcode() != FrameDestroyOpcode)
+    return 0;
+ 
+  int SPAdj = MI->getOperand(0).getImm();
+
+  if ((!StackGrowsDown && MI->getOpcode() == FrameSetupOpcode) ||
+       (StackGrowsDown && MI->getOpcode() == FrameDestroyOpcode))
+    SPAdj = -SPAdj;
+
+  return SPAdj;
+}
+
 /// isSchedulingBoundary - Test if the given instruction should be
 /// considered a scheduling boundary. This primarily includes labels
 /// and terminators.
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index e833fd3..9048a44 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -414,7 +414,7 @@ static void InitLibcallNames(const char **Names, const Triple &TT) {
     Names[RTLIB::SINCOS_PPCF128] = nullptr;
   }
 
-  if (TT.getOS() != Triple::OpenBSD) {
+  if (!TT.isOSOpenBSD()) {
     Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = "__stack_chk_fail";
   } else {
     // These are generally not available.
@@ -696,7 +696,7 @@ static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
 
 /// NOTE: The TargetMachine owns TLOF.
 TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
-    : TM(tm), DL(TM.getSubtargetImpl()->getDataLayout()) {
+    : TM(tm), DL(TM.getDataLayout()) {
   initActions();
 
   // Perform these initializations only once.
@@ -710,10 +710,12 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
   HasMultipleConditionRegisters = false;
   HasExtractBitsInsn = false;
   IntDivIsCheap = false;
+  FsqrtIsCheap = false;
   Pow2SDivIsCheap = false;
   JumpIsExpensive = false;
   PredictableSelectIsExpensive = false;
   MaskAndBranchFoldingIsLegal = false;
+  EnableExtLdPromotion = false;
   HasFloatingPointExceptions = true;
   StackPointerRegisterToSaveRestore = 0;
   ExceptionPointerRegister = 0;
@@ -747,37 +749,33 @@ void TargetLoweringBase::initActions() {
   memset(TargetDAGCombineArray, 0, array_lengthof(TargetDAGCombineArray));
 
   // Set default actions for various operations.
-  for (unsigned VT = 0; VT != (unsigned)MVT::LAST_VALUETYPE; ++VT) {
+  for (MVT VT : MVT::all_valuetypes()) {
     // Default all indexed load / store to expand.
     for (unsigned IM = (unsigned)ISD::PRE_INC;
          IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) {
-      setIndexedLoadAction(IM, (MVT::SimpleValueType)VT, Expand);
-      setIndexedStoreAction(IM, (MVT::SimpleValueType)VT, Expand);
+      setIndexedLoadAction(IM, VT, Expand);
+      setIndexedStoreAction(IM, VT, Expand);
     }
 
     // Most backends expect to see the node which just returns the value loaded.
-    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
-                       (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
 
     // These operations default to expand.
-    setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::FMINNUM, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::FMAXNUM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FGETSIGN, VT, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS, VT, Expand);
+    setOperationAction(ISD::FMINNUM, VT, Expand);
+    setOperationAction(ISD::FMAXNUM, VT, Expand);
+    setOperationAction(ISD::FMAD, VT, Expand);
 
     // These library functions default to expand.
-    setOperationAction(ISD::FROUND, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FROUND, VT, Expand);
 
     // These operations default to expand for vector types.
-    if (VT >= MVT::FIRST_VECTOR_VALUETYPE &&
-        VT <= MVT::LAST_VECTOR_VALUETYPE) {
-      setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG,
-                         (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG,
-                         (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG,
-                         (MVT::SimpleValueType)VT, Expand);
+    if (VT.isVector()) {
+      setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+      setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, VT, Expand);
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand);
+      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand);
     }
   }
 
@@ -897,6 +895,138 @@ bool TargetLoweringBase::canOpTrap(unsigned Op, EVT VT) const {
   }
 }
 
+TargetLoweringBase::LegalizeKind
+TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
+  // If this is a simple type, use the ComputeRegisterProp mechanism.
+  if (VT.isSimple()) {
+    MVT SVT = VT.getSimpleVT();
+    assert((unsigned)SVT.SimpleTy < array_lengthof(TransformToType));
+    MVT NVT = TransformToType[SVT.SimpleTy];
+    LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);
+
+    assert((LA == TypeLegal || LA == TypeSoftenFloat ||
+            ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger) &&
+           "Promote may not follow Expand or Promote");
+
+    if (LA == TypeSplitVector)
+      return LegalizeKind(LA,
+                          EVT::getVectorVT(Context, SVT.getVectorElementType(),
+                                           SVT.getVectorNumElements() / 2));
+    if (LA == TypeScalarizeVector)
+      return LegalizeKind(LA, SVT.getVectorElementType());
+    return LegalizeKind(LA, NVT);
+  }
+
+  // Handle Extended Scalar Types.
+  if (!VT.isVector()) {
+    assert(VT.isInteger() && "Float types must be simple");
+    unsigned BitSize = VT.getSizeInBits();
+    // First promote to a power-of-two size, then expand if necessary.
+    if (BitSize < 8 || !isPowerOf2_32(BitSize)) {
+      EVT NVT = VT.getRoundIntegerType(Context);
+      assert(NVT != VT && "Unable to round integer VT");
+      LegalizeKind NextStep = getTypeConversion(Context, NVT);
+      // Avoid multi-step promotion.
+      if (NextStep.first == TypePromoteInteger)
+        return NextStep;
+      // Return rounded integer type.
+      return LegalizeKind(TypePromoteInteger, NVT);
+    }
+
+    return LegalizeKind(TypeExpandInteger,
+                        EVT::getIntegerVT(Context, VT.getSizeInBits() / 2));
+  }
+
+  // Handle vector types.
+  unsigned NumElts = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+
+  // Vectors with only one element are always scalarized.
+  if (NumElts == 1)
+    return LegalizeKind(TypeScalarizeVector, EltVT);
+
+  // Try to widen vector elements until the element type is a power of two and
+  // promote it to a legal type later on, for example:
+  // <3 x i8> -> <4 x i8> -> <4 x i32>
+  if (EltVT.isInteger()) {
+    // Vectors with a number of elements that is not a power of two are always
+    // widened, for example <3 x i8> -> <4 x i8>.
+    if (!VT.isPow2VectorType()) {
+      NumElts = (unsigned)NextPowerOf2(NumElts);
+      EVT NVT = EVT::getVectorVT(Context, EltVT, NumElts);
+      return LegalizeKind(TypeWidenVector, NVT);
+    }
+
+    // Examine the element type.
+    LegalizeKind LK = getTypeConversion(Context, EltVT);
+
+    // If type is to be expanded, split the vector.
+    //  <4 x i140> -> <2 x i140>
+    if (LK.first == TypeExpandInteger)
+      return LegalizeKind(TypeSplitVector,
+                          EVT::getVectorVT(Context, EltVT, NumElts / 2));
+
+    // Promote the integer element types until a legal vector type is found
+    // or until the element integer type is too big. If a legal type was not
+    // found, fallback to the usual mechanism of widening/splitting the
+    // vector.
+    EVT OldEltVT = EltVT;
+    while (1) {
+      // Increase the bitwidth of the element to the next pow-of-two
+      // (which is greater than 8 bits).
+      EltVT = EVT::getIntegerVT(Context, 1 + EltVT.getSizeInBits())
+                  .getRoundIntegerType(Context);
+
+      // Stop trying when getting a non-simple element type.
+      // Note that vector elements may be greater than legal vector element
+      // types. Example: X86 XMM registers hold 64bit element on 32bit
+      // systems.
+      if (!EltVT.isSimple())
+        break;
+
+      // Build a new vector type and check if it is legal.
+      MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+      // Found a legal promoted vector type.
+      if (NVT != MVT() && ValueTypeActions.getTypeAction(NVT) == TypeLegal)
+        return LegalizeKind(TypePromoteInteger,
+                            EVT::getVectorVT(Context, EltVT, NumElts));
+    }
+
+    // Reset the type to the unexpanded type if we did not find a legal vector
+    // type with a promoted vector element type.
+    EltVT = OldEltVT;
+  }
+
+  // Try to widen the vector until a legal type is found.
+  // If there is no wider legal type, split the vector.
+  while (1) {
+    // Round up to the next power of 2.
+    NumElts = (unsigned)NextPowerOf2(NumElts);
+
+    // If there is no simple vector type with this many elements then there
+    // cannot be a larger legal vector type.  Note that this assumes that
+    // there are no skipped intermediate vector types in the simple types.
+    if (!EltVT.isSimple())
+      break;
+    MVT LargerVector = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+    if (LargerVector == MVT())
+      break;
+
+    // If this type is legal then widen the vector.
+    if (ValueTypeActions.getTypeAction(LargerVector) == TypeLegal)
+      return LegalizeKind(TypeWidenVector, LargerVector);
+  }
+
+  // Widen odd vectors to next power of two.
+  if (!VT.isPow2VectorType()) {
+    EVT NVT = VT.getPow2VectorType(Context);
+    return LegalizeKind(TypeWidenVector, NVT);
+  }
+
+  // Vectors with illegal element types are expanded.
+  EVT NVT = EVT::getVectorVT(Context, EltVT, VT.getVectorNumElements() / 2);
+  return LegalizeKind(TypeSplitVector, NVT);
+}
 
 static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT,
                                           unsigned &NumIntermediates,
@@ -992,10 +1122,15 @@ TargetLoweringBase::emitPatchPoint(MachineInstr *MI,
     // Add a new memory operand for this FI.
     const MachineFrameInfo &MFI = *MF.getFrameInfo();
     assert(MFI.getObjectOffset(FI) != -1);
+
+    unsigned Flags = MachineMemOperand::MOLoad;
+    if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
+      Flags |= MachineMemOperand::MOStore;
+      Flags |= MachineMemOperand::MOVolatile;
+    }
     MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad,
-        TM.getSubtargetImpl()->getDataLayout()->getPointerSize(),
-        MFI.getObjectAlignment(FI));
+        MachinePointerInfo::getFixedStack(FI), Flags,
+        TM.getDataLayout()->getPointerSize(), MFI.getObjectAlignment(FI));
     MIB->addMemOperand(MF, MMO);
 
     // Replace the instruction and update the operand index.
@@ -1009,10 +1144,9 @@ TargetLoweringBase::emitPatchPoint(MachineInstr *MI,
 
 /// findRepresentativeClass - Return the largest legal super-reg register class
 /// of the register class for the specified type and its associated "cost".
-std::pair<const TargetRegisterClass*, uint8_t>
-TargetLoweringBase::findRepresentativeClass(MVT VT) const {
-  const TargetRegisterInfo *TRI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
+std::pair<const TargetRegisterClass *, uint8_t>
+TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                            MVT VT) const {
   const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy];
   if (!RC)
     return std::make_pair(RC, 0);
@@ -1038,7 +1172,8 @@ TargetLoweringBase::findRepresentativeClass(MVT VT) const {
 
 /// computeRegisterProperties - Once all of the register classes are added,
 /// this allows us to compute derived properties we expose.
-void TargetLoweringBase::computeRegisterProperties() {
+void TargetLoweringBase::computeRegisterProperties(
+    const TargetRegisterInfo *TRI) {
   static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE,
                 "Too many value types for ValueTypeActions to hold!");
 
@@ -1220,7 +1355,7 @@ void TargetLoweringBase::computeRegisterProperties() {
   for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
     const TargetRegisterClass* RRC;
     uint8_t Cost;
-    std::tie(RRC, Cost) = findRepresentativeClass((MVT::SimpleValueType)i);
+    std::tie(RRC, Cost) = findRepresentativeClass(TRI, (MVT::SimpleValueType)i);
     RepRegClassForVT[i] = RRC;
     RepRegClassCostForVT[i] = Cost;
   }
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index efd15e1..c1b34f7 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -71,12 +71,10 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer,
   const MCSection *Sec = getContext().getELFSection(NameData,
                                                     ELF::SHT_PROGBITS,
                                                     Flags,
-                                                    SectionKind::getDataRel(),
                                                     0, Label->getName());
-  unsigned Size = TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
+  unsigned Size = TM.getDataLayout()->getPointerSize();
   Streamer.SwitchSection(Sec);
-  Streamer.EmitValueToAlignment(
-      TM.getSubtargetImpl()->getDataLayout()->getPointerABIAlignment());
+  Streamer.EmitValueToAlignment(TM.getDataLayout()->getPointerABIAlignment());
   Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject);
   const MCExpr *E = MCConstantExpr::Create(Size, getContext());
   Streamer.EmitELFSize(Label, E);
@@ -166,9 +164,7 @@ static unsigned getELFSectionType(StringRef Name, SectionKind K) {
   return ELF::SHT_PROGBITS;
 }
 
-
-static unsigned
-getELFSectionFlags(SectionKind K) {
+static unsigned getELFSectionFlags(SectionKind K) {
   unsigned Flags = 0;
 
   if (!K.isMetadata())
@@ -183,9 +179,7 @@ getELFSectionFlags(SectionKind K) {
   if (K.isThreadLocal())
     Flags |= ELF::SHF_TLS;
 
-  // K.isMergeableConst() is left out to honour PR4650
-  if (K.isMergeableCString() || K.isMergeableConst4() ||
-      K.isMergeableConst8() || K.isMergeableConst16())
+  if (K.isMergeableCString() || K.isMergeableConst())
     Flags |= ELF::SHF_MERGE;
 
   if (K.isMergeableCString())
@@ -222,120 +216,121 @@ const MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
   }
   return getContext().getELFSection(SectionName,
                                     getELFSectionType(SectionName, Kind), Flags,
-                                    Kind, /*EntrySize=*/0, Group);
+                                    /*EntrySize=*/0, Group);
 }
 
-/// getSectionPrefixForGlobal - Return the section prefix name used by options
-/// FunctionsSections and DataSections.
+/// Return the section prefix name used by options FunctionsSections and
+/// DataSections.
 static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
-  if (Kind.isText())                 return ".text.";
-  if (Kind.isReadOnly())             return ".rodata.";
-  if (Kind.isBSS())                  return ".bss.";
-
-  if (Kind.isThreadData())           return ".tdata.";
-  if (Kind.isThreadBSS())            return ".tbss.";
-
-  if (Kind.isDataNoRel())            return ".data.";
-  if (Kind.isDataRelLocal())         return ".data.rel.local.";
-  if (Kind.isDataRel())              return ".data.rel.";
-  if (Kind.isReadOnlyWithRelLocal()) return ".data.rel.ro.local.";
-
+  if (Kind.isText())
+    return ".text";
+  if (Kind.isReadOnly())
+    return ".rodata";
+  if (Kind.isBSS())
+    return ".bss";
+  if (Kind.isThreadData())
+    return ".tdata";
+  if (Kind.isThreadBSS())
+    return ".tbss";
+  if (Kind.isDataNoRel())
+    return ".data";
+  if (Kind.isDataRelLocal())
+    return ".data.rel.local";
+  if (Kind.isDataRel())
+    return ".data.rel";
+  if (Kind.isReadOnlyWithRelLocal())
+    return ".data.rel.ro.local";
   assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
-  return ".data.rel.ro.";
+  return ".data.rel.ro";
 }
 
 const MCSection *TargetLoweringObjectFileELF::
 SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
                        Mangler &Mang, const TargetMachine &TM) const {
+  unsigned Flags = getELFSectionFlags(Kind);
+
   // If we have -ffunction-section or -fdata-section then we should emit the
   // global value to a uniqued section specifically for it.
-  bool EmitUniquedSection;
-  if (Kind.isText())
-    EmitUniquedSection = TM.getFunctionSections();
-  else
-    EmitUniquedSection = TM.getDataSections();
-
-  // If this global is linkonce/weak and the target handles this by emitting it
-  // into a 'uniqued' section name, create and return the section now.
-  if ((GV->isWeakForLinker() || EmitUniquedSection || GV->hasComdat()) &&
-      !Kind.isCommon()) {
-    StringRef Prefix = getSectionPrefixForGlobal(Kind);
-
-    SmallString<128> Name(Prefix);
-    TM.getNameWithPrefix(Name, GV, Mang, true);
-
-    StringRef Group = "";
-    unsigned Flags = getELFSectionFlags(Kind);
-    if (GV->isWeakForLinker() || GV->hasComdat()) {
-      if (const Comdat *C = getELFComdat(GV))
-        Group = C->getName();
-      else
-        Group = Name.substr(Prefix.size());
-      Flags |= ELF::SHF_GROUP;
+  bool EmitUniqueSection = false;
+  if (!(Flags & ELF::SHF_MERGE) && !Kind.isCommon()) {
+    if (Kind.isText())
+      EmitUniqueSection = TM.getFunctionSections();
+    else
+      EmitUniqueSection = TM.getDataSections();
+  }
+  EmitUniqueSection |= GV->hasComdat();
+
+  unsigned EntrySize = 0;
+  if (Kind.isMergeableCString()) {
+    if (Kind.isMergeable2ByteCString()) {
+      EntrySize = 2;
+    } else if (Kind.isMergeable4ByteCString()) {
+      EntrySize = 4;
+    } else {
+      EntrySize = 1;
+      assert(Kind.isMergeable1ByteCString() && "unknown string width");
+    }
+  } else if (Kind.isMergeableConst()) {
+    if (Kind.isMergeableConst4()) {
+      EntrySize = 4;
+    } else if (Kind.isMergeableConst8()) {
+      EntrySize = 8;
+    } else {
+      assert(Kind.isMergeableConst16() && "unknown data width");
+      EntrySize = 16;
     }
-
-    return getContext().getELFSection(Name.str(),
-                                      getELFSectionType(Name.str(), Kind),
-                                      Flags, Kind, 0, Group);
   }
 
-  if (Kind.isText()) return TextSection;
-
-  if (Kind.isMergeable1ByteCString() ||
-      Kind.isMergeable2ByteCString() ||
-      Kind.isMergeable4ByteCString()) {
+  StringRef Group = "";
+  if (const Comdat *C = getELFComdat(GV)) {
+    Flags |= ELF::SHF_GROUP;
+    Group = C->getName();
+  }
 
+  bool UniqueSectionNames = TM.getUniqueSectionNames();
+  SmallString<128> Name;
+  if (Kind.isMergeableCString()) {
     // We also need alignment here.
     // FIXME: this is getting the alignment of the character, not the
     // alignment of the global!
     unsigned Align =
-        TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment(
-            cast<GlobalVariable>(GV));
-
-    const char *SizeSpec = ".rodata.str1.";
-    if (Kind.isMergeable2ByteCString())
-      SizeSpec = ".rodata.str2.";
-    else if (Kind.isMergeable4ByteCString())
-      SizeSpec = ".rodata.str4.";
-    else
-      assert(Kind.isMergeable1ByteCString() && "unknown string width");
+        TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV));
 
-
-    std::string Name = SizeSpec + utostr(Align);
-    return getContext().getELFSection(Name, ELF::SHT_PROGBITS,
-                                      ELF::SHF_ALLOC |
-                                      ELF::SHF_MERGE |
-                                      ELF::SHF_STRINGS,
-                                      Kind);
+    std::string SizeSpec = ".rodata.str" + utostr(EntrySize) + ".";
+    Name = SizeSpec + utostr(Align);
+  } else if (Kind.isMergeableConst()) {
+    Name = ".rodata.cst";
+    Name += utostr(EntrySize);
+  } else {
+    Name = getSectionPrefixForGlobal(Kind);
   }
 
-  if (Kind.isMergeableConst()) {
-    if (Kind.isMergeableConst4() && MergeableConst4Section)
-      return MergeableConst4Section;
-    if (Kind.isMergeableConst8() && MergeableConst8Section)
-      return MergeableConst8Section;
-    if (Kind.isMergeableConst16() && MergeableConst16Section)
-      return MergeableConst16Section;
-    return ReadOnlySection;  // .const
+  if (EmitUniqueSection && UniqueSectionNames) {
+    Name.push_back('.');
+    TM.getNameWithPrefix(Name, GV, Mang, true);
   }
+  return getContext().getELFSection(Name, getELFSectionType(Name, Kind), Flags,
+                                    EntrySize, Group,
+                                    EmitUniqueSection && !UniqueSectionNames);
+}
+
+const MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable(
+    const Function &F, Mangler &Mang, const TargetMachine &TM) const {
+  // If the function can be removed, produce a unique section so that
+  // the table doesn't prevent the removal.
+  const Comdat *C = F.getComdat();
+  bool EmitUniqueSection = TM.getFunctionSections() || C;
+  if (!EmitUniqueSection)
+    return ReadOnlySection;
 
-  if (Kind.isReadOnly())             return ReadOnlySection;
-
-  if (Kind.isThreadData())           return TLSDataSection;
-  if (Kind.isThreadBSS())            return TLSBSSSection;
-
-  // Note: we claim that common symbols are put in BSSSection, but they are
-  // really emitted with the magic .comm directive, which creates a symbol table
-  // entry but not a section.
-  if (Kind.isBSS() || Kind.isCommon()) return BSSSection;
-
-  if (Kind.isDataNoRel())            return DataSection;
-  if (Kind.isDataRelLocal())         return DataRelLocalSection;
-  if (Kind.isDataRel())              return DataRelSection;
-  if (Kind.isReadOnlyWithRelLocal()) return DataRelROLocalSection;
+  return SelectSectionForGlobal(&F, SectionKind::getReadOnly(), Mang, TM);
+}
 
-  assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
-  return DataRelROSection;
+bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection(
+    bool UsesLabelDifference, const Function &F) const {
+  // We can always create relative relocations, so use another section
+  // that can be marked non-executable.
+  return false;
 }
 
 /// getSectionForConstant - Given a mergeable constant with the
@@ -366,7 +361,6 @@ static const MCSectionELF *getStaticStructorSection(MCContext &Ctx,
   std::string Name;
   unsigned Type;
   unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE;
-  SectionKind Kind = SectionKind::getDataRel();
   StringRef COMDAT = KeySym ? KeySym->getName() : "";
 
   if (KeySym)
@@ -398,7 +392,7 @@ static const MCSectionELF *getStaticStructorSection(MCContext &Ctx,
     Type = ELF::SHT_PROGBITS;
   }
 
-  return Ctx.getELFSection(Name, Type, Flags, Kind, 0, COMDAT);
+  return Ctx.getELFSection(Name, Type, Flags, 0, COMDAT);
 }
 
 const MCSection *TargetLoweringObjectFileELF::getStaticCtorSection(
@@ -419,16 +413,10 @@ TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) {
   if (!UseInitArray)
     return;
 
-  StaticCtorSection =
-    getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY,
-                               ELF::SHF_WRITE |
-                               ELF::SHF_ALLOC,
-                               SectionKind::getDataRel());
-  StaticDtorSection =
-    getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY,
-                               ELF::SHF_WRITE |
-                               ELF::SHF_ALLOC,
-                               SectionKind::getDataRel());
+  StaticCtorSection = getContext().getELFSection(
+      ".init_array", ELF::SHT_INIT_ARRAY, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+  StaticDtorSection = getContext().getELFSection(
+      ".fini_array", ELF::SHT_FINI_ARRAY, ELF::SHF_WRITE | ELF::SHF_ALLOC);
 }
 
 //===----------------------------------------------------------------------===//
@@ -464,14 +452,15 @@ emitModuleFlags(MCStreamer &Streamer,
       continue;
 
     StringRef Key = MFE.Key->getString();
-    Value *Val = MFE.Val;
+    Metadata *Val = MFE.Val;
 
     if (Key == "Objective-C Image Info Version") {
-      VersionVal = cast<ConstantInt>(Val)->getZExtValue();
+      VersionVal = mdconst::extract<ConstantInt>(Val)->getZExtValue();
     } else if (Key == "Objective-C Garbage Collection" ||
                Key == "Objective-C GC Only" ||
-               Key == "Objective-C Is Simulated") {
-      ImageInfoFlags |= cast<ConstantInt>(Val)->getZExtValue();
+               Key == "Objective-C Is Simulated" ||
+               Key == "Objective-C Image Swift Version") {
+      ImageInfoFlags |= mdconst::extract<ConstantInt>(Val)->getZExtValue();
     } else if (Key == "Objective-C Image Info Section") {
       SectionVal = cast<MDString>(Val)->getString();
     } else if (Key == "Linker Options") {
@@ -572,60 +561,6 @@ const MCSection *TargetLoweringObjectFileMachO::getExplicitSectionGlobal(
   return S;
 }
 
-bool TargetLoweringObjectFileMachO::isSectionAtomizableBySymbols(
-    const MCSection &Section) const {
-    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
-
-    // Sections holding 1 byte strings are atomized based on the data
-    // they contain.
-    // Sections holding 2 byte strings require symbols in order to be
-    // atomized.
-    // There is no dedicated section for 4 byte strings.
-    if (SMO.getKind().isMergeable1ByteCString())
-      return false;
-
-    if (SMO.getSegmentName() == "__TEXT" &&
-        SMO.getSectionName() == "__objc_classname" &&
-        SMO.getType() == MachO::S_CSTRING_LITERALS)
-      return false;
-
-    if (SMO.getSegmentName() == "__TEXT" &&
-        SMO.getSectionName() == "__objc_methname" &&
-        SMO.getType() == MachO::S_CSTRING_LITERALS)
-      return false;
-
-    if (SMO.getSegmentName() == "__TEXT" &&
-        SMO.getSectionName() == "__objc_methtype" &&
-        SMO.getType() == MachO::S_CSTRING_LITERALS)
-      return false;
-
-    if (SMO.getSegmentName() == "__DATA" &&
-        SMO.getSectionName() == "__cfstring")
-      return false;
-
-    // no_dead_strip sections are not atomized in practice.
-    if (SMO.hasAttribute(MachO::S_ATTR_NO_DEAD_STRIP))
-      return false;
-
-    switch (SMO.getType()) {
-    default:
-      return true;
-
-      // These sections are atomized at the element boundaries without using
-      // symbols.
-    case MachO::S_4BYTE_LITERALS:
-    case MachO::S_8BYTE_LITERALS:
-    case MachO::S_16BYTE_LITERALS:
-    case MachO::S_LITERAL_POINTERS:
-    case MachO::S_NON_LAZY_SYMBOL_POINTERS:
-    case MachO::S_LAZY_SYMBOL_POINTERS:
-    case MachO::S_MOD_INIT_FUNC_POINTERS:
-    case MachO::S_MOD_TERM_FUNC_POINTERS:
-    case MachO::S_INTERPOSING:
-      return false;
-    }
-}
-
 const MCSection *TargetLoweringObjectFileMachO::
 SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
                        Mangler &Mang, const TargetMachine &TM) const {
@@ -648,16 +583,14 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
 
   // FIXME: Alignment check should be handled by section classifier.
   if (Kind.isMergeable1ByteCString() &&
-      TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment(
-          cast<GlobalVariable>(GV)) < 32)
+      TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
     return CStringSection;
 
   // Do not put 16-bit arrays in the UString section if they have an
   // externally visible label, this runs into issues with certain linker
   // versions.
   if (Kind.isMergeable2ByteCString() && !GV->hasExternalLinkage() &&
-      TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment(
-          cast<GlobalVariable>(GV)) < 32)
+      TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
     return UStringSection;
 
   // With MachO only variables whose corresponding symbol starts with 'l' or
@@ -854,7 +787,7 @@ const MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
   unsigned Characteristics = getCOFFSectionFlags(Kind);
   StringRef Name = GV->getSection();
   StringRef COMDATSymName = "";
-  if ((GV->isWeakForLinker() || GV->hasComdat()) && !Kind.isCommon()) {
+  if (GV->hasComdat()) {
     Selection = getSelectionForCOFF(GV);
     const GlobalValue *ComdatGV;
     if (Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
@@ -901,12 +834,7 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
   else
     EmitUniquedSection = TM.getDataSections();
 
-  // If this global is linkonce/weak and the target handles this by emitting it
-  // into a 'uniqued' section name, create and return the section now.
-  // Section names depend on the name of the symbol which is not feasible if the
-  // symbol has private linkage.
-  if ((GV->isWeakForLinker() || EmitUniquedSection || GV->hasComdat()) &&
-      !Kind.isCommon()) {
+  if ((EmitUniquedSection && !Kind.isCommon()) || GV->hasComdat()) {
     const char *Name = getCOFFSectionNameForUniqueGlobal(Kind);
     unsigned Characteristics = getCOFFSectionFlags(Kind);
 
@@ -965,7 +893,7 @@ emitModuleFlags(MCStreamer &Streamer,
        i = ModuleFlags.begin(), e = ModuleFlags.end(); i != e; ++i) {
     const Module::ModuleFlagEntry &MFE = *i;
     StringRef Key = MFE.Key->getString();
-    Value *Val = MFE.Val;
+    Metadata *Val = MFE.Val;
     if (Key == "Linker Options") {
       LinkerOptions = cast<MDNode>(Val);
       break;
@@ -982,21 +910,10 @@ emitModuleFlags(MCStreamer &Streamer,
     MDNode *MDOptions = cast<MDNode>(LinkerOptions->getOperand(i));
     for (unsigned ii = 0, ie = MDOptions->getNumOperands(); ii != ie; ++ii) {
       MDString *MDOption = cast<MDString>(MDOptions->getOperand(ii));
-      StringRef Op = MDOption->getString();
       // Lead with a space for consistency with our dllexport implementation.
-      std::string Escaped(" ");
-      if (Op.find(" ") != StringRef::npos) {
-        // The PE-COFF spec says args with spaces must be quoted.  It doesn't say
-        // how to escape quotes, but it probably uses this algorithm:
-        // http://msdn.microsoft.com/en-us/library/17w5ykft(v=vs.85).aspx
-        // FIXME: Reuse escaping code from Support/Windows/Program.inc
-        Escaped.push_back('\"');
-        Escaped.append(Op);
-        Escaped.push_back('\"');
-      } else {
-        Escaped.append(Op);
-      }
-      Streamer.EmitBytes(Escaped);
+      std::string Directive(" ");
+      Directive.append(MDOption->getString());
+      Streamer.EmitBytes(Directive);
     }
   }
 }
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index e218a83..1bbe6e1 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1515,9 +1515,9 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
   const TargetMachine &TM = MF->getTarget();
   MRI = &MF->getRegInfo();
-  TII = TM.getSubtargetImpl()->getInstrInfo();
-  TRI = TM.getSubtargetImpl()->getRegisterInfo();
-  InstrItins = TM.getSubtargetImpl()->getInstrItineraryData();
+  TII = MF->getSubtarget().getInstrInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
+  InstrItins = MF->getSubtarget().getInstrItineraryData();
   LV = getAnalysisIfAvailable<LiveVariables>();
   LIS = getAnalysisIfAvailable<LiveIntervals>();
   AA = &getAnalysis<AliasAnalysis>();
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index 7824f92..d393e10 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -88,7 +88,7 @@ bool UnreachableBlockElim::runOnFunction(Function &F) {
     DeadBlocks[i]->eraseFromParent();
   }
 
-  return DeadBlocks.size();
+  return !DeadBlocks.empty();
 }
 
 
@@ -204,5 +204,5 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
 
   F.RenumberBlocks();
 
-  return (DeadBlocks.size() || ModifiedPHI);
+  return (!DeadBlocks.empty() || ModifiedPHI);
 }
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 0d17d43..7d3b0ce 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -252,20 +252,41 @@ void VirtRegRewriter::addMBBLiveIns() {
     unsigned PhysReg = VRM->getPhys(VirtReg);
     assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register.");
 
-    // Scan the segments of LI.
-    for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I != E;
-         ++I) {
-      if (!Indexes->findLiveInMBBs(I->start, I->end, LiveIn))
-        continue;
-      for (unsigned i = 0, e = LiveIn.size(); i != e; ++i)
-        if (!LiveIn[i]->isLiveIn(PhysReg))
-          LiveIn[i]->addLiveIn(PhysReg);
-      LiveIn.clear();
+    if (LI.hasSubRanges()) {
+      for (LiveInterval::SubRange &S : LI.subranges()) {
+        for (const auto &Seg : S.segments) {
+          if (!Indexes->findLiveInMBBs(Seg.start, Seg.end, LiveIn))
+            continue;
+          for (MCSubRegIndexIterator SR(PhysReg, TRI); SR.isValid(); ++SR) {
+            unsigned SubReg = SR.getSubReg();
+            unsigned SubRegIndex = SR.getSubRegIndex();
+            unsigned SubRegLaneMask = TRI->getSubRegIndexLaneMask(SubRegIndex);
+            if ((SubRegLaneMask & S.LaneMask) == 0)
+              continue;
+            for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) {
+              if (!LiveIn[i]->isLiveIn(SubReg))
+                LiveIn[i]->addLiveIn(SubReg);
+            }
+          }
+          LiveIn.clear();
+        }
+      }
+    } else {
+      // Scan the segments of LI.
+      for (const auto &Seg : LI.segments) {
+        if (!Indexes->findLiveInMBBs(Seg.start, Seg.end, LiveIn))
+          continue;
+        for (unsigned i = 0, e = LiveIn.size(); i != e; ++i)
+          if (!LiveIn[i]->isLiveIn(PhysReg))
+            LiveIn[i]->addLiveIn(PhysReg);
+        LiveIn.clear();
+      }
     }
   }
 }
 
 void VirtRegRewriter::rewrite() {
+  bool NoSubRegLiveness = !MRI->tracksSubRegLiveness();
   SmallVector<unsigned, 8> SuperDeads;
   SmallVector<unsigned, 8> SuperDefs;
   SmallVector<unsigned, 8> SuperKills;
@@ -347,7 +368,8 @@ void VirtRegRewriter::rewrite() {
           // A virtual register kill refers to the whole register, so we may
           // have to add <imp-use,kill> operands for the super-register.  A
           // partial redef always kills and redefines the super-register.
-          if (MO.readsReg() && (MO.isDef() || MO.isKill()))
+          if (NoSubRegLiveness && MO.readsReg()
+              && (MO.isDef() || MO.isKill()))
             SuperKills.push_back(PhysReg);
 
           if (MO.isDef()) {
@@ -358,10 +380,12 @@ void VirtRegRewriter::rewrite() {
             MO.setIsUndef(false);
 
             // Also add implicit defs for the super-register.
-            if (MO.isDead())
-              SuperDeads.push_back(PhysReg);
-            else
-              SuperDefs.push_back(PhysReg);
+            if (NoSubRegLiveness) {
+              if (MO.isDead())
+                SuperDeads.push_back(PhysReg);
+              else
+                SuperDefs.push_back(PhysReg);
+            }
           }
 
           // PhysReg operands cannot have subregister indexes.
diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp
new file mode 100644
index 0000000..6f712a9
--- /dev/null
+++ b/lib/CodeGen/WinEHPrepare.cpp
@@ -0,0 +1,626 @@
+//===-- WinEHPrepare - Prepare exception handling for code generation ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers LLVM IR exception handling into something closer to what the
+// backend wants. It snifs the personality function to see which kind of
+// preparation is necessary. If the personality function uses the Itanium LSDA,
+// this pass delegates to the DWARF EH preparation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <memory>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "winehprepare"
+
+namespace {
+
+struct HandlerAllocas {
+  TinyPtrVector<AllocaInst *> Allocas;
+  int ParentFrameAllocationIndex;
+};
+
+// This map is used to model frame variable usage during outlining, to
+// construct a structure type to hold the frame variables in a frame
+// allocation block, and to remap the frame variable allocas (including
+// spill locations as needed) to GEPs that get the variable from the
+// frame allocation structure.
+typedef MapVector<AllocaInst *, HandlerAllocas> FrameVarInfoMap;
+
+class WinEHPrepare : public FunctionPass {
+  std::unique_ptr<FunctionPass> DwarfPrepare;
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  WinEHPrepare(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), DwarfPrepare(createDwarfEHPass(TM)) {}
+
+  bool runOnFunction(Function &Fn) override;
+
+  bool doFinalization(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  const char *getPassName() const override {
+    return "Windows exception handling preparation";
+  }
+
+private:
+  bool prepareCPPEHHandlers(Function &F,
+                            SmallVectorImpl<LandingPadInst *> &LPads);
+  bool outlineCatchHandler(Function *SrcFn, Constant *SelectorType,
+                           LandingPadInst *LPad, CallInst *&EHAlloc,
+                           AllocaInst *&EHObjPtr, FrameVarInfoMap &VarInfo);
+};
+
+class WinEHFrameVariableMaterializer : public ValueMaterializer {
+public:
+  WinEHFrameVariableMaterializer(Function *OutlinedFn,
+                                 FrameVarInfoMap &FrameVarInfo);
+  ~WinEHFrameVariableMaterializer() {}
+
+  virtual Value *materializeValueFor(Value *V) override;
+
+private:
+  FrameVarInfoMap &FrameVarInfo;
+  IRBuilder<> Builder;
+};
+
+class WinEHCatchDirector : public CloningDirector {
+public:
+  WinEHCatchDirector(LandingPadInst *LPI, Function *CatchFn, Value *Selector,
+                     Value *EHObj, FrameVarInfoMap &VarInfo)
+      : LPI(LPI), CurrentSelector(Selector->stripPointerCasts()), EHObj(EHObj),
+        Materializer(CatchFn, VarInfo),
+        SelectorIDType(Type::getInt32Ty(LPI->getContext())),
+        Int8PtrType(Type::getInt8PtrTy(LPI->getContext())) {}
+
+  CloningAction handleInstruction(ValueToValueMapTy &VMap,
+                                  const Instruction *Inst,
+                                  BasicBlock *NewBB) override;
+
+  ValueMaterializer *getValueMaterializer() override { return &Materializer; }
+
+private:
+  LandingPadInst *LPI;
+  Value *CurrentSelector;
+  Value *EHObj;
+  WinEHFrameVariableMaterializer Materializer;
+  Type *SelectorIDType;
+  Type *Int8PtrType;
+
+  const Value *ExtractedEHPtr;
+  const Value *ExtractedSelector;
+  const Value *EHPtrStoreAddr;
+  const Value *SelectorStoreAddr;
+};
+} // end anonymous namespace
+
+char WinEHPrepare::ID = 0;
+INITIALIZE_TM_PASS(WinEHPrepare, "winehprepare", "Prepare Windows exceptions",
+                   false, false)
+
+FunctionPass *llvm::createWinEHPass(const TargetMachine *TM) {
+  return new WinEHPrepare(TM);
+}
+
+static bool isMSVCPersonality(EHPersonality Pers) {
+  return Pers == EHPersonality::MSVC_Win64SEH ||
+         Pers == EHPersonality::MSVC_CXX;
+}
+
+bool WinEHPrepare::runOnFunction(Function &Fn) {
+  SmallVector<LandingPadInst *, 4> LPads;
+  SmallVector<ResumeInst *, 4> Resumes;
+  for (BasicBlock &BB : Fn) {
+    if (auto *LP = BB.getLandingPadInst())
+      LPads.push_back(LP);
+    if (auto *Resume = dyn_cast<ResumeInst>(BB.getTerminator()))
+      Resumes.push_back(Resume);
+  }
+
+  // No need to prepare functions that lack landing pads.
+  if (LPads.empty())
+    return false;
+
+  // Classify the personality to see what kind of preparation we need.
+  EHPersonality Pers = classifyEHPersonality(LPads.back()->getPersonalityFn());
+
+  // Delegate through to the DWARF pass if this is unrecognized.
+  if (!isMSVCPersonality(Pers))
+    return DwarfPrepare->runOnFunction(Fn);
+
+  // FIXME: This only returns true if the C++ EH handlers were outlined.
+  //        When that code is complete, it should always return whatever
+  //        prepareCPPEHHandlers returns.
+  if (Pers == EHPersonality::MSVC_CXX && prepareCPPEHHandlers(Fn, LPads))
+    return true;
+
+  // FIXME: SEH Cleanups are unimplemented. Replace them with unreachable.
+  if (Resumes.empty())
+    return false;
+
+  for (ResumeInst *Resume : Resumes) {
+    IRBuilder<>(Resume).CreateUnreachable();
+    Resume->eraseFromParent();
+  }
+
+  return true;
+}
+
+bool WinEHPrepare::doFinalization(Module &M) {
+  return DwarfPrepare->doFinalization(M);
+}
+
+void WinEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+  DwarfPrepare->getAnalysisUsage(AU);
+}
+
+bool WinEHPrepare::prepareCPPEHHandlers(
+    Function &F, SmallVectorImpl<LandingPadInst *> &LPads) {
+  // These containers are used to re-map frame variables that are used in
+  // outlined catch and cleanup handlers.  They will be populated as the
+  // handlers are outlined.
+  FrameVarInfoMap FrameVarInfo;
+  SmallVector<CallInst *, 4> HandlerAllocs;
+  SmallVector<AllocaInst *, 4> HandlerEHObjPtrs;
+
+  bool HandlersOutlined = false;
+
+  for (LandingPadInst *LPad : LPads) {
+    // Look for evidence that this landingpad has already been processed.
+    bool LPadHasActionList = false;
+    BasicBlock *LPadBB = LPad->getParent();
+    for (Instruction &Inst : LPadBB->getInstList()) {
+      // FIXME: Make this an intrinsic.
+      if (auto *Call = dyn_cast<CallInst>(&Inst))
+        if (Call->getCalledFunction()->getName() == "llvm.eh.actions") {
+          LPadHasActionList = true;
+          break;
+        }
+    }
+
+    // If we've already outlined the handlers for this landingpad,
+    // there's nothing more to do here.
+    if (LPadHasActionList)
+      continue;
+
+    for (unsigned Idx = 0, NumClauses = LPad->getNumClauses(); Idx < NumClauses;
+         ++Idx) {
+      if (LPad->isCatch(Idx)) {
+        // Create a new instance of the handler data structure in the
+        // HandlerData vector.
+        CallInst *EHAlloc = nullptr;
+        AllocaInst *EHObjPtr = nullptr;
+        bool Outlined = outlineCatchHandler(&F, LPad->getClause(Idx), LPad,
+                                            EHAlloc, EHObjPtr, FrameVarInfo);
+        if (Outlined) {
+          HandlersOutlined = true;
+          // These values must be resolved after all handlers have been
+          // outlined.
+          if (EHAlloc)
+            HandlerAllocs.push_back(EHAlloc);
+          if (EHObjPtr)
+            HandlerEHObjPtrs.push_back(EHObjPtr);
+        }
+      } // End if (isCatch)
+    }   // End for each clause
+  }     // End for each landingpad
+
+  // If nothing got outlined, there is no more processing to be done.
+  if (!HandlersOutlined)
+    return false;
+
+  // FIXME: We will replace the landingpad bodies with llvm.eh.actions
+  //        calls and indirect branches here and then delete blocks
+  //        which are no longer reachable.  That will get rid of the
+  //        handlers that we have outlined.  There is code below
+  //        that looks for allocas with no uses in the parent function.
+  //        That will only happen after the pruning is implemented.
+
+  // Remap the frame variables.
+  SmallVector<Type *, 2> StructTys;
+  StructTys.push_back(Type::getInt32Ty(F.getContext()));   // EH state
+  StructTys.push_back(Type::getInt8PtrTy(F.getContext())); // EH object
+
+  // Start the index at two since we always have the above fields at 0 and 1.
+  int Idx = 2;
+
+  // FIXME: Sort the FrameVarInfo vector by the ParentAlloca size and alignment
+  //        and add padding as necessary to provide the proper alignment.
+
+  // Map the alloca instructions to the corresponding index in the
+  // frame allocation structure.  If any alloca is used only in a single
+  // handler and is not used in the parent frame after outlining, it will
+  // be assigned an index of -1, meaning the handler can keep its
+  // "temporary" alloca and the original alloca can be erased from the
+  // parent function.  If we later encounter this alloca in a second
+  // handler, we will assign it a place in the frame allocation structure
+  // at that time.  Since the instruction replacement doesn't happen until
+  // all the entries in the HandlerData have been processed this isn't a
+  // problem.
+  for (auto &VarInfoEntry : FrameVarInfo) {
+    AllocaInst *ParentAlloca = VarInfoEntry.first;
+    HandlerAllocas &AllocaInfo = VarInfoEntry.second;
+
+    // If the instruction still has uses in the parent function or if it is
+    // referenced by more than one handler, add it to the frame allocation
+    // structure.
+    if (ParentAlloca->getNumUses() != 0 || AllocaInfo.Allocas.size() > 1) {
+      Type *VarTy = ParentAlloca->getAllocatedType();
+      StructTys.push_back(VarTy);
+      AllocaInfo.ParentFrameAllocationIndex = Idx++;
+    } else {
+      // If the variable is not used in the parent frame and it is only used
+      // in one handler, the alloca can be removed from the parent frame
+      // and the handler will keep its "temporary" alloca to define the value.
+      // An element index of -1 is used to indicate this condition.
+      AllocaInfo.ParentFrameAllocationIndex = -1;
+    }
+  }
+
+  // Having filled the StructTys vector and assigned an index to each element,
+  // we can now create the structure.
+  StructType *EHDataStructTy = StructType::create(
+      F.getContext(), StructTys, "struct." + F.getName().str() + ".ehdata");
+  IRBuilder<> Builder(F.getParent()->getContext());
+
+  // Create a frame allocation.
+  Module *M = F.getParent();
+  LLVMContext &Context = M->getContext();
+  BasicBlock *Entry = &F.getEntryBlock();
+  Builder.SetInsertPoint(Entry->getFirstInsertionPt());
+  Function *FrameAllocFn =
+      Intrinsic::getDeclaration(M, Intrinsic::frameallocate);
+  uint64_t EHAllocSize = M->getDataLayout()->getTypeAllocSize(EHDataStructTy);
+  Value *FrameAllocArgs[] = {
+      ConstantInt::get(Type::getInt32Ty(Context), EHAllocSize)};
+  CallInst *FrameAlloc =
+      Builder.CreateCall(FrameAllocFn, FrameAllocArgs, "frame.alloc");
+
+  Value *FrameEHData = Builder.CreateBitCast(
+      FrameAlloc, EHDataStructTy->getPointerTo(), "eh.data");
+
+  // Now visit each handler that is using the structure and bitcast its EHAlloc
+  // value to be a pointer to the frame alloc structure.
+  DenseMap<Function *, Value *> EHDataMap;
+  for (CallInst *EHAlloc : HandlerAllocs) {
+    // The EHAlloc has no uses at this time, so we need to just insert the
+    // cast before the next instruction. There is always a next instruction.
+    BasicBlock::iterator II = EHAlloc;
+    ++II;
+    Builder.SetInsertPoint(cast<Instruction>(II));
+    Value *EHData = Builder.CreateBitCast(
+        EHAlloc, EHDataStructTy->getPointerTo(), "eh.data");
+    EHDataMap[EHAlloc->getParent()->getParent()] = EHData;
+  }
+
+  // Next, replace the place-holder EHObjPtr allocas with GEP instructions
+  // that pull the EHObjPtr from the frame alloc structure
+  for (AllocaInst *EHObjPtr : HandlerEHObjPtrs) {
+    Value *EHData = EHDataMap[EHObjPtr->getParent()->getParent()];
+    Builder.SetInsertPoint(EHObjPtr);
+    Value *ElementPtr = Builder.CreateConstInBoundsGEP2_32(EHData, 0, 1);
+    EHObjPtr->replaceAllUsesWith(ElementPtr);
+    EHObjPtr->removeFromParent();
+    ElementPtr->takeName(EHObjPtr);
+    delete EHObjPtr;
+  }
+
+  // Finally, replace all of the temporary allocas for frame variables used in
+  // the outlined handlers and the original frame allocas with GEP instructions
+  // that get the equivalent pointer from the frame allocation struct.
+  for (auto &VarInfoEntry : FrameVarInfo) {
+    AllocaInst *ParentAlloca = VarInfoEntry.first;
+    HandlerAllocas &AllocaInfo = VarInfoEntry.second;
+    int Idx = AllocaInfo.ParentFrameAllocationIndex;
+
+    // If we have an index of -1 for this instruction, it means it isn't used
+    // outside of this handler.  In that case, we just keep the "temporary"
+    // alloca in the handler and erase the original alloca from the parent.
+    if (Idx == -1) {
+      ParentAlloca->eraseFromParent();
+    } else {
+      // Otherwise, we replace the parent alloca and all outlined allocas
+      // which map to it with GEP instructions.
+
+      // First replace the original alloca.
+      Builder.SetInsertPoint(ParentAlloca);
+      Builder.SetCurrentDebugLocation(ParentAlloca->getDebugLoc());
+      Value *ElementPtr =
+          Builder.CreateConstInBoundsGEP2_32(FrameEHData, 0, Idx);
+      ParentAlloca->replaceAllUsesWith(ElementPtr);
+      ParentAlloca->removeFromParent();
+      ElementPtr->takeName(ParentAlloca);
+      delete ParentAlloca;
+
+      // Next replace all outlined allocas that are mapped to it.
+      for (AllocaInst *TempAlloca : AllocaInfo.Allocas) {
+        Value *EHData = EHDataMap[TempAlloca->getParent()->getParent()];
+        // FIXME: Sink this GEP into the blocks where it is used.
+        Builder.SetInsertPoint(TempAlloca);
+        Builder.SetCurrentDebugLocation(TempAlloca->getDebugLoc());
+        ElementPtr = Builder.CreateConstInBoundsGEP2_32(EHData, 0, Idx);
+        TempAlloca->replaceAllUsesWith(ElementPtr);
+        TempAlloca->removeFromParent();
+        ElementPtr->takeName(TempAlloca);
+        delete TempAlloca;
+      }
+    } // end else of if (Idx == -1)
+  }   // End for each FrameVarInfo entry.
+
+  return HandlersOutlined;
+}
+
+bool WinEHPrepare::outlineCatchHandler(Function *SrcFn, Constant *SelectorType,
+                                       LandingPadInst *LPad, CallInst *&EHAlloc,
+                                       AllocaInst *&EHObjPtr,
+                                       FrameVarInfoMap &VarInfo) {
+  Module *M = SrcFn->getParent();
+  LLVMContext &Context = M->getContext();
+
+  // Create a new function to receive the handler contents.
+  Type *Int8PtrType = Type::getInt8PtrTy(Context);
+  std::vector<Type *> ArgTys;
+  ArgTys.push_back(Int8PtrType);
+  ArgTys.push_back(Int8PtrType);
+  FunctionType *FnType = FunctionType::get(Int8PtrType, ArgTys, false);
+  Function *CatchHandler = Function::Create(
+      FnType, GlobalVariable::ExternalLinkage, SrcFn->getName() + ".catch", M);
+
+  // Generate a standard prolog to setup the frame recovery structure.
+  IRBuilder<> Builder(Context);
+  BasicBlock *Entry = BasicBlock::Create(Context, "catch.entry");
+  CatchHandler->getBasicBlockList().push_front(Entry);
+  Builder.SetInsertPoint(Entry);
+  Builder.SetCurrentDebugLocation(LPad->getDebugLoc());
+
+  // The outlined handler will be called with the parent's frame pointer as
+  // its second argument. To enable the handler to access variables from
+  // the parent frame, we use that pointer to get locate a special block
+  // of memory that was allocated using llvm.eh.allocateframe for this
+  // purpose.  During the outlining process we will determine which frame
+  // variables are used in handlers and create a structure that maps these
+  // variables into the frame allocation block.
+  //
+  // The frame allocation block also contains an exception state variable
+  // used by the runtime and a pointer to the exception object pointer
+  // which will be filled in by the runtime for use in the handler.
+  Function *RecoverFrameFn =
+      Intrinsic::getDeclaration(M, Intrinsic::framerecover);
+  Value *RecoverArgs[] = {Builder.CreateBitCast(SrcFn, Int8PtrType, ""),
+                          &(CatchHandler->getArgumentList().back())};
+  EHAlloc = Builder.CreateCall(RecoverFrameFn, RecoverArgs, "eh.alloc");
+
+  // This alloca is only temporary.  We'll be replacing it once we know all the
+  // frame variables that need to go in the frame allocation structure.
+  EHObjPtr = Builder.CreateAlloca(Int8PtrType, 0, "eh.obj.ptr");
+
+  // This will give us a raw pointer to the exception object, which
+  // corresponds to the formal parameter of the catch statement.  If the
+  // handler uses this object, we will generate code during the outlining
+  // process to cast the pointer to the appropriate type and deference it
+  // as necessary.  The un-outlined landing pad code represents the
+  // exception object as the result of the llvm.eh.begincatch call.
+  Value *EHObj = Builder.CreateLoad(EHObjPtr, false, "eh.obj");
+
+  ValueToValueMapTy VMap;
+
+  // FIXME: Map other values referenced in the filter handler.
+
+  WinEHCatchDirector Director(LPad, CatchHandler, SelectorType, EHObj, VarInfo);
+
+  SmallVector<ReturnInst *, 8> Returns;
+  ClonedCodeInfo InlinedFunctionInfo;
+
+  BasicBlock::iterator II = LPad;
+
+  CloneAndPruneIntoFromInst(CatchHandler, SrcFn, ++II, VMap,
+                            /*ModuleLevelChanges=*/false, Returns, "",
+                            &InlinedFunctionInfo,
+                            SrcFn->getParent()->getDataLayout(), &Director);
+
+  // Move all the instructions in the first cloned block into our entry block.
+  BasicBlock *FirstClonedBB = std::next(Function::iterator(Entry));
+  Entry->getInstList().splice(Entry->end(), FirstClonedBB->getInstList());
+  FirstClonedBB->eraseFromParent();
+
+  return true;
+}
+
+CloningDirector::CloningAction WinEHCatchDirector::handleInstruction(
+    ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) {
+  // Intercept instructions which extract values from the landing pad aggregate.
+  if (auto *Extract = dyn_cast<ExtractValueInst>(Inst)) {
+    if (Extract->getAggregateOperand() == LPI) {
+      assert(Extract->getNumIndices() == 1 &&
+             "Unexpected operation: extracting both landing pad values");
+      assert((*(Extract->idx_begin()) == 0 || *(Extract->idx_begin()) == 1) &&
+             "Unexpected operation: extracting an unknown landing pad element");
+
+      if (*(Extract->idx_begin()) == 0) {
+        // Element 0 doesn't directly corresponds to anything in the WinEH
+        // scheme.
+        // It will be stored to a memory location, then later loaded and finally
+        // the loaded value will be used as the argument to an
+        // llvm.eh.begincatch
+        // call.  We're tracking it here so that we can skip the store and load.
+        ExtractedEHPtr = Inst;
+      } else {
+        // Element 1 corresponds to the filter selector.  We'll map it to 1 for
+        // matching purposes, but it will also probably be stored to memory and
+        // reloaded, so we need to track the instuction so that we can map the
+        // loaded value too.
+        VMap[Inst] = ConstantInt::get(SelectorIDType, 1);
+        ExtractedSelector = Inst;
+      }
+
+      // Tell the caller not to clone this instruction.
+      return CloningDirector::SkipInstruction;
+    }
+    // Other extract value instructions just get cloned.
+    return CloningDirector::CloneInstruction;
+  }
+
+  if (auto *Store = dyn_cast<StoreInst>(Inst)) {
+    // Look for and suppress stores of the extracted landingpad values.
+    const Value *StoredValue = Store->getValueOperand();
+    if (StoredValue == ExtractedEHPtr) {
+      EHPtrStoreAddr = Store->getPointerOperand();
+      return CloningDirector::SkipInstruction;
+    }
+    if (StoredValue == ExtractedSelector) {
+      SelectorStoreAddr = Store->getPointerOperand();
+      return CloningDirector::SkipInstruction;
+    }
+
+    // Any other store just gets cloned.
+    return CloningDirector::CloneInstruction;
+  }
+
+  if (auto *Load = dyn_cast<LoadInst>(Inst)) {
+    // Look for loads of (previously suppressed) landingpad values.
+    // The EHPtr load can be ignored (it should only be used as
+    // an argument to llvm.eh.begincatch), but the selector value
+    // needs to be mapped to a constant value of 1 to be used to
+    // simplify the branching to always flow to the current handler.
+    const Value *LoadAddr = Load->getPointerOperand();
+    if (LoadAddr == EHPtrStoreAddr) {
+      VMap[Inst] = UndefValue::get(Int8PtrType);
+      return CloningDirector::SkipInstruction;
+    }
+    if (LoadAddr == SelectorStoreAddr) {
+      VMap[Inst] = ConstantInt::get(SelectorIDType, 1);
+      return CloningDirector::SkipInstruction;
+    }
+
+    // Any other loads just get cloned.
+    return CloningDirector::CloneInstruction;
+  }
+
+  if (match(Inst, m_Intrinsic<Intrinsic::eh_begincatch>())) {
+    // The argument to the call is some form of the first element of the
+    // landingpad aggregate value, but that doesn't matter.  It isn't used
+    // here.
+    // The return value of this instruction, however, is used to access the
+    // EH object pointer.  We have generated an instruction to get that value
+    // from the EH alloc block, so we can just map to that here.
+    VMap[Inst] = EHObj;
+    return CloningDirector::SkipInstruction;
+  }
+  if (match(Inst, m_Intrinsic<Intrinsic::eh_endcatch>())) {
+    auto *IntrinCall = dyn_cast<IntrinsicInst>(Inst);
+    // It might be interesting to track whether or not we are inside a catch
+    // function, but that might make the algorithm more brittle than it needs
+    // to be.
+
+    // The end catch call can occur in one of two places: either in a
+    // landingpad
+    // block that is part of the catch handlers exception mechanism, or at the
+    // end of the catch block.  If it occurs in a landing pad, we must skip it
+    // and continue so that the landing pad gets cloned.
+    // FIXME: This case isn't fully supported yet and shouldn't turn up in any
+    //        of the test cases until it is.
+    if (IntrinCall->getParent()->isLandingPad())
+      return CloningDirector::SkipInstruction;
+
+    // If an end catch occurs anywhere else the next instruction should be an
+    // unconditional branch instruction that we want to replace with a return
+    // to the the address of the branch target.
+    const BasicBlock *EndCatchBB = IntrinCall->getParent();
+    const TerminatorInst *Terminator = EndCatchBB->getTerminator();
+    const BranchInst *Branch = dyn_cast<BranchInst>(Terminator);
+    assert(Branch && Branch->isUnconditional());
+    assert(std::next(BasicBlock::const_iterator(IntrinCall)) ==
+           BasicBlock::const_iterator(Branch));
+
+    ReturnInst::Create(NewBB->getContext(),
+                       BlockAddress::get(Branch->getSuccessor(0)), NewBB);
+
+    // We just added a terminator to the cloned block.
+    // Tell the caller to stop processing the current basic block so that
+    // the branch instruction will be skipped.
+    return CloningDirector::StopCloningBB;
+  }
+  if (match(Inst, m_Intrinsic<Intrinsic::eh_typeid_for>())) {
+    auto *IntrinCall = dyn_cast<IntrinsicInst>(Inst);
+    Value *Selector = IntrinCall->getArgOperand(0)->stripPointerCasts();
+    // This causes a replacement that will collapse the landing pad CFG based
+    // on the filter function we intend to match.
+    if (Selector == CurrentSelector)
+      VMap[Inst] = ConstantInt::get(SelectorIDType, 1);
+    else
+      VMap[Inst] = ConstantInt::get(SelectorIDType, 0);
+    // Tell the caller not to clone this instruction.
+    return CloningDirector::SkipInstruction;
+  }
+
+  // Continue with the default cloning behavior.
+  return CloningDirector::CloneInstruction;
+}
+
+WinEHFrameVariableMaterializer::WinEHFrameVariableMaterializer(
+    Function *OutlinedFn, FrameVarInfoMap &FrameVarInfo)
+    : FrameVarInfo(FrameVarInfo), Builder(OutlinedFn->getContext()) {
+  Builder.SetInsertPoint(&OutlinedFn->getEntryBlock());
+  // FIXME: Do something with the FrameVarMapped so that it is shared across the
+  // function.
+}
+
+Value *WinEHFrameVariableMaterializer::materializeValueFor(Value *V) {
+  // If we're asked to materialize an alloca variable, we temporarily
+  // create a matching alloca in the outlined function.  When all the
+  // outlining is complete, we'll collect these into a structure and
+  // replace these temporary allocas with GEPs referencing the frame
+  // allocation block.
+  if (auto *AV = dyn_cast<AllocaInst>(V)) {
+    AllocaInst *NewAlloca = Builder.CreateAlloca(
+        AV->getAllocatedType(), AV->getArraySize(), AV->getName());
+    FrameVarInfo[AV].Allocas.push_back(NewAlloca);
+    return NewAlloca;
+  }
+
+// FIXME: Do PHI nodes need special handling?
+
+// FIXME: Are there other cases we can handle better?  GEP, ExtractValue, etc.
+
+// FIXME: This doesn't work during cloning because it finds an instruction
+//        in the use list that isn't yet part of a basic block.
+#if 0
+  // If we're asked to remap some other instruction, we'll need to
+  // spill it to an alloca variable in the parent function and add a
+  // temporary alloca in the outlined function to be processed as
+  // described above.
+  Instruction *Inst = dyn_cast<Instruction>(V);
+  if (Inst) {
+    AllocaInst *Spill = DemoteRegToStack(*Inst, true);
+    AllocaInst *NewAlloca = Builder.CreateAlloca(Spill->getAllocatedType(),
+                                                 Spill->getArraySize());
+    FrameVarMap[AV] = NewAlloca;
+    return NewAlloca;
+  }
+#endif
+
+  return nullptr;
+}
diff --git a/lib/DebugInfo/Android.mk b/lib/DebugInfo/Android.mk
deleted file mode 100644
index e777e9c..0000000
--- a/lib/DebugInfo/Android.mk
+++ /dev/null
@@ -1,53 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-
-debuginfo_SRC_FILES := \
-  DIContext.cpp \
-  DWARFAbbreviationDeclaration.cpp \
-  DWARFAcceleratorTable.cpp \
-  DWARFCompileUnit.cpp \
-  DWARFContext.cpp \
-  DWARFDebugAbbrev.cpp \
-  DWARFDebugArangeSet.cpp \
-  DWARFDebugAranges.cpp \
-  DWARFDebugFrame.cpp \
-  DWARFDebugInfoEntry.cpp \
-  DWARFDebugLine.cpp \
-  DWARFDebugLoc.cpp \
-  DWARFDebugRangeList.cpp \
-  DWARFFormValue.cpp \
-  DWARFTypeUnit.cpp \
-  DWARFUnit.cpp \
-
-# For the host
-# =====================================================
-include $(CLEAR_VARS)
-
-REQUIRES_RTTI := 1
-
-LOCAL_SRC_FILES := $(debuginfo_SRC_FILES)
-
-LOCAL_MODULE:= libLLVMDebugInfo
-
-LOCAL_MODULE_TAGS := optional
-
-include $(LLVM_HOST_BUILD_MK)
-include $(LLVM_GEN_INTRINSICS_MK)
-include $(BUILD_HOST_STATIC_LIBRARY)
-
-# For the device
-# =====================================================
-ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
-include $(CLEAR_VARS)
-
-REQUIRES_RTTI := 1
-
-LOCAL_SRC_FILES := $(debuginfo_SRC_FILES)
-
-LOCAL_MODULE:= libLLVMDebugInfo
-
-LOCAL_MODULE_TAGS := optional
-
-include $(LLVM_DEVICE_BUILD_MK)
-include $(LLVM_GEN_INTRINSICS_MK)
-include $(BUILD_STATIC_LIBRARY)
-endif
diff --git a/lib/DebugInfo/CMakeLists.txt b/lib/DebugInfo/CMakeLists.txt
index 81fc84d..645d92f 100644
--- a/lib/DebugInfo/CMakeLists.txt
+++ b/lib/DebugInfo/CMakeLists.txt
@@ -1,18 +1,4 @@
-add_llvm_library(LLVMDebugInfo
-  DIContext.cpp
-  DWARFAbbreviationDeclaration.cpp
-  DWARFAcceleratorTable.cpp
-  DWARFCompileUnit.cpp
-  DWARFContext.cpp
-  DWARFDebugAbbrev.cpp
-  DWARFDebugArangeSet.cpp
-  DWARFDebugAranges.cpp
-  DWARFDebugFrame.cpp
-  DWARFDebugInfoEntry.cpp
-  DWARFDebugLine.cpp
-  DWARFDebugLoc.cpp
-  DWARFDebugRangeList.cpp
-  DWARFFormValue.cpp
-  DWARFTypeUnit.cpp
-  DWARFUnit.cpp
-  )
+
+add_subdirectory(DWARF)
+add_subdirectory(PDB)
+
diff --git a/lib/DebugInfo/DIContext.cpp b/lib/DebugInfo/DIContext.cpp
deleted file mode 100644
index 01aecf8..0000000
--- a/lib/DebugInfo/DIContext.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===-- DIContext.cpp -----------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/DIContext.h"
-#include "DWARFContext.h"
-using namespace llvm;
-
-DIContext::~DIContext() {}
-
-DIContext *DIContext::getDWARFContext(const object::ObjectFile &Obj) {
-  return new DWARFContextInMemory(Obj);
-}
diff --git a/lib/DebugInfo/DWARF/Android.mk b/lib/DebugInfo/DWARF/Android.mk
new file mode 100644
index 0000000..3c8222f
--- /dev/null
+++ b/lib/DebugInfo/DWARF/Android.mk
@@ -0,0 +1,54 @@
+LOCAL_PATH:= $(call my-dir)
+
+debuginfo_dwarf_SRC_FILES := \
+  DIContext.cpp \
+  DWARFAbbreviationDeclaration.cpp \
+  DWARFAcceleratorTable.cpp \
+  DWARFCompileUnit.cpp \
+  DWARFContext.cpp \
+  DWARFDebugAbbrev.cpp \
+  DWARFDebugArangeSet.cpp \
+  DWARFDebugAranges.cpp \
+  DWARFDebugFrame.cpp \
+  DWARFDebugInfoEntry.cpp \
+  DWARFDebugLine.cpp \
+  DWARFDebugLoc.cpp \
+  DWARFDebugRangeList.cpp \
+  DWARFFormValue.cpp \
+  DWARFTypeUnit.cpp \
+  DWARFUnit.cpp \
+  SyntaxHighlighting.cpp
+
+# For the host
+# =====================================================
+include $(CLEAR_VARS)
+
+REQUIRES_RTTI := 1
+
+LOCAL_SRC_FILES := $(debuginfo_dwarf_SRC_FILES)
+
+LOCAL_MODULE:= libLLVMDebugInfoDWARF
+
+LOCAL_MODULE_TAGS := optional
+
+include $(LLVM_HOST_BUILD_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
+include $(BUILD_HOST_STATIC_LIBRARY)
+
+# For the device
+# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
+include $(CLEAR_VARS)
+
+REQUIRES_RTTI := 1
+
+LOCAL_SRC_FILES := $(debuginfo_dwarf_SRC_FILES)
+
+LOCAL_MODULE:= libLLVMDebugInfoDWARF
+
+LOCAL_MODULE_TAGS := optional
+
+include $(LLVM_DEVICE_BUILD_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
+include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/DebugInfo/DWARF/CMakeLists.txt b/lib/DebugInfo/DWARF/CMakeLists.txt
new file mode 100644
index 0000000..8c6d495
--- /dev/null
+++ b/lib/DebugInfo/DWARF/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_llvm_library(LLVMDebugInfoDWARF
+  DIContext.cpp
+  DWARFAbbreviationDeclaration.cpp
+  DWARFAcceleratorTable.cpp
+  DWARFCompileUnit.cpp
+  DWARFContext.cpp
+  DWARFDebugAbbrev.cpp
+  DWARFDebugArangeSet.cpp
+  DWARFDebugAranges.cpp
+  DWARFDebugFrame.cpp
+  DWARFDebugInfoEntry.cpp
+  DWARFDebugLine.cpp
+  DWARFDebugLoc.cpp
+  DWARFDebugRangeList.cpp
+  DWARFFormValue.cpp
+  DWARFTypeUnit.cpp
+  DWARFUnit.cpp
+  SyntaxHighlighting.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/DWARF
+  )
diff --git a/lib/DebugInfo/DWARF/DIContext.cpp b/lib/DebugInfo/DWARF/DIContext.cpp
new file mode 100644
index 0000000..a1c6ca4
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DIContext.cpp
@@ -0,0 +1,18 @@
+//===-- DIContext.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+using namespace llvm;
+
+DIContext::~DIContext() {}
+
+DIContext *DIContext::getDWARFContext(const object::ObjectFile &Obj) {
+  return new DWARFContextInMemory(Obj);
+}
diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
new file mode 100644
index 0000000..9314c9e
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
@@ -0,0 +1,97 @@
+//===-- DWARFAbbreviationDeclaration.cpp ----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace dwarf;
+
+void DWARFAbbreviationDeclaration::clear() {
+  Code = 0;
+  Tag = 0;
+  HasChildren = false;
+  AttributeSpecs.clear();
+}
+
+DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() {
+  clear();
+}
+
+bool
+DWARFAbbreviationDeclaration::extract(DataExtractor Data, uint32_t* OffsetPtr) {
+  clear();
+  Code = Data.getULEB128(OffsetPtr);
+  if (Code == 0) {
+    return false;
+  }
+  Tag = Data.getULEB128(OffsetPtr);
+  uint8_t ChildrenByte = Data.getU8(OffsetPtr);
+  HasChildren = (ChildrenByte == DW_CHILDREN_yes);
+
+  while (true) {
+    uint32_t CurOffset = *OffsetPtr;
+    uint16_t Attr = Data.getULEB128(OffsetPtr);
+    if (CurOffset == *OffsetPtr) {
+      clear();
+      return false;
+    }
+    CurOffset = *OffsetPtr;
+    uint16_t Form = Data.getULEB128(OffsetPtr);
+    if (CurOffset == *OffsetPtr) {
+      clear();
+      return false;
+    }
+    if (Attr == 0 && Form == 0)
+      break;
+    AttributeSpecs.push_back(AttributeSpec(Attr, Form));
+  }
+
+  if (Tag == 0) {
+    clear();
+    return false;
+  }
+  return true;
+}
+
+void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
+  const char *tagString = TagString(getTag());
+  OS << '[' << getCode() << "] ";
+  if (tagString)
+    OS << tagString;
+  else
+    OS << format("DW_TAG_Unknown_%x", getTag());
+  OS << "\tDW_CHILDREN_" << (hasChildren() ? "yes" : "no") << '\n';
+  for (const AttributeSpec &Spec : AttributeSpecs) {
+    OS << '\t';
+    const char *attrString = AttributeString(Spec.Attr);
+    if (attrString)
+      OS << attrString;
+    else
+      OS << format("DW_AT_Unknown_%x", Spec.Attr);
+    OS << '\t';
+    const char *formString = FormEncodingString(Spec.Form);
+    if (formString)
+      OS << formString;
+    else
+      OS << format("DW_FORM_Unknown_%x", Spec.Form);
+    OS << '\n';
+  }
+  OS << '\n';
+}
+
+uint32_t
+DWARFAbbreviationDeclaration::findAttributeIndex(uint16_t attr) const {
+  for (uint32_t i = 0, e = AttributeSpecs.size(); i != e; ++i) {
+    if (AttributeSpecs[i].Attr == attr)
+      return i;
+  }
+  return -1U;
+}
diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
new file mode 100644
index 0000000..8ae0543
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -0,0 +1,132 @@
+//===--- DWARFAcceleratorTable.cpp ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+bool DWARFAcceleratorTable::extract() {
+  uint32_t Offset = 0;
+
+  // Check that we can at least read the header.
+  if (!AccelSection.isValidOffset(offsetof(Header, HeaderDataLength)+4))
+    return false;
+
+  Hdr.Magic = AccelSection.getU32(&Offset);
+  Hdr.Version = AccelSection.getU16(&Offset);
+  Hdr.HashFunction = AccelSection.getU16(&Offset);
+  Hdr.NumBuckets = AccelSection.getU32(&Offset);
+  Hdr.NumHashes = AccelSection.getU32(&Offset);
+  Hdr.HeaderDataLength = AccelSection.getU32(&Offset);
+
+  // Check that we can read all the hashes and offsets from the
+  // section (see SourceLevelDebugging.rst for the structure of the index).
+  if (!AccelSection.isValidOffset(sizeof(Hdr) + Hdr.HeaderDataLength +
+                                  Hdr.NumBuckets*4 + Hdr.NumHashes*8))
+    return false;
+
+  HdrData.DIEOffsetBase = AccelSection.getU32(&Offset);
+  uint32_t NumAtoms = AccelSection.getU32(&Offset);
+
+  for (unsigned i = 0; i < NumAtoms; ++i) {
+    uint16_t AtomType = AccelSection.getU16(&Offset);
+    uint16_t AtomForm = AccelSection.getU16(&Offset);
+    HdrData.Atoms.push_back(std::make_pair(AtomType, AtomForm));
+  }
+
+  return true;
+}
+
+void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
+  // Dump the header.
+  OS << "Magic = " << format("0x%08x", Hdr.Magic) << '\n'
+     << "Version = " << format("0x%04x", Hdr.Version) << '\n'
+     << "Hash function = " << format("0x%08x", Hdr.HashFunction) << '\n'
+     << "Bucket count = " << Hdr.NumBuckets << '\n'
+     << "Hashes count = " << Hdr.NumHashes << '\n'
+     << "HeaderData length = " << Hdr.HeaderDataLength << '\n'
+     << "DIE offset base = " << HdrData.DIEOffsetBase << '\n'
+     << "Number of atoms = " << HdrData.Atoms.size() << '\n';
+
+  unsigned i = 0;
+  SmallVector<DWARFFormValue, 3> AtomForms;
+  for (const auto &Atom: HdrData.Atoms) {
+    OS << format("Atom[%d] Type: ", i++);
+    if (const char *TypeString = dwarf::AtomTypeString(Atom.first))
+      OS << TypeString;
+    else
+      OS << format("DW_ATOM_Unknown_0x%x", Atom.first);
+    OS << " Form: ";
+    if (const char *FormString = dwarf::FormEncodingString(Atom.second))
+      OS << FormString;
+    else
+      OS << format("DW_FORM_Unknown_0x%x", Atom.second);
+    OS << '\n';
+    AtomForms.push_back(DWARFFormValue(Atom.second));
+  }
+
+  // Now go through the actual tables and dump them.
+  uint32_t Offset = sizeof(Hdr) + Hdr.HeaderDataLength;
+  unsigned HashesBase = Offset + Hdr.NumBuckets * 4;
+  unsigned OffsetsBase = HashesBase + Hdr.NumHashes * 4;
+
+  for (unsigned Bucket = 0; Bucket < Hdr.NumBuckets; ++Bucket) {
+    unsigned Index = AccelSection.getU32(&Offset);
+
+    OS << format("Bucket[%d]\n", Bucket);
+    if (Index == UINT32_MAX) {
+      OS << "  EMPTY\n";
+      continue;
+    }
+
+    for (unsigned HashIdx = Index; HashIdx < Hdr.NumHashes; ++HashIdx) {
+      unsigned HashOffset = HashesBase + HashIdx*4;
+      unsigned OffsetsOffset = OffsetsBase + HashIdx*4;
+      uint32_t Hash = AccelSection.getU32(&HashOffset);
+
+      if (Hash % Hdr.NumBuckets != Bucket)
+        break;
+
+      unsigned DataOffset = AccelSection.getU32(&OffsetsOffset);
+      OS << format("  Hash = 0x%08x Offset = 0x%08x\n", Hash, DataOffset);
+      if (!AccelSection.isValidOffset(DataOffset)) {
+        OS << "    Invalid section offset\n";
+        continue;
+      }
+      while (AccelSection.isValidOffsetForDataOfSize(DataOffset, 4)) {
+        unsigned StringOffset = AccelSection.getU32(&DataOffset);
+        RelocAddrMap::const_iterator Reloc = Relocs.find(DataOffset-4);
+        if (Reloc != Relocs.end())
+          StringOffset += Reloc->second.second;
+        if (!StringOffset)
+          break;
+        OS << format("    Name: %08x \"%s\"\n", StringOffset,
+                     StringSection.getCStr(&StringOffset));
+        unsigned NumData = AccelSection.getU32(&DataOffset);
+        for (unsigned Data = 0; Data < NumData; ++Data) {
+          OS << format("    Data[%d] => ", Data);
+          unsigned i = 0;
+          for (auto &Atom : AtomForms) {
+            OS << format("{Atom[%d]: ", i++);
+            if (Atom.extractValue(AccelSection, &DataOffset, nullptr))
+              Atom.dump(OS, nullptr);
+            else
+              OS << "Error extracting the value";
+            OS << "} ";
+          }
+          OS << '\n';
+        }
+      }
+    }
+  }
+}
+}
diff --git a/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
new file mode 100644
index 0000000..01e7247
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
@@ -0,0 +1,32 @@
+//===-- DWARFCompileUnit.cpp ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void DWARFCompileUnit::dump(raw_ostream &OS) {
+  OS << format("0x%08x", getOffset()) << ": Compile Unit:"
+     << " length = " << format("0x%08x", getLength())
+     << " version = " << format("0x%04x", getVersion())
+     << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+     << " addr_size = " << format("0x%02x", getAddressByteSize())
+     << " (next unit at " << format("0x%08x", getNextUnitOffset())
+     << ")\n";
+
+  const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
+  assert(CU && "Null Compile Unit?");
+  CU->dump(OS, this, -1U);
+}
+
+// VTable anchor.
+DWARFCompileUnit::~DWARFCompileUnit() {
+}
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
new file mode 100644
index 0000000..3b42700
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -0,0 +1,696 @@
+//===-- DWARFContext.cpp --------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
+#include "llvm/Support/Compression.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+using namespace dwarf;
+using namespace object;
+
+#define DEBUG_TYPE "dwarf"
+
+typedef DWARFDebugLine::LineTable DWARFLineTable;
+typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
+typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind;
+
+static void dumpPubSection(raw_ostream &OS, StringRef Name, StringRef Data,
+                           bool LittleEndian, bool GnuStyle) {
+  OS << "\n." << Name << " contents:\n";
+  DataExtractor pubNames(Data, LittleEndian, 0);
+  uint32_t offset = 0;
+  while (pubNames.isValidOffset(offset)) {
+    OS << "length = " << format("0x%08x", pubNames.getU32(&offset));
+    OS << " version = " << format("0x%04x", pubNames.getU16(&offset));
+    OS << " unit_offset = " << format("0x%08x", pubNames.getU32(&offset));
+    OS << " unit_size = " << format("0x%08x", pubNames.getU32(&offset)) << '\n';
+    if (GnuStyle)
+      OS << "Offset     Linkage  Kind     Name\n";
+    else
+      OS << "Offset     Name\n";
+
+    while (offset < Data.size()) {
+      uint32_t dieRef = pubNames.getU32(&offset);
+      if (dieRef == 0)
+        break;
+      OS << format("0x%8.8x ", dieRef);
+      if (GnuStyle) {
+        PubIndexEntryDescriptor desc(pubNames.getU8(&offset));
+        OS << format("%-8s", dwarf::GDBIndexEntryLinkageString(desc.Linkage))
+           << ' ' << format("%-8s", dwarf::GDBIndexEntryKindString(desc.Kind))
+           << ' ';
+      }
+      OS << '\"' << pubNames.getCStr(&offset) << "\"\n";
+    }
+  }
+}
+
+static void dumpAccelSection(raw_ostream &OS, StringRef Name,
+                             const DWARFSection& Section, StringRef StringSection,
+                             bool LittleEndian) {
+  DataExtractor AccelSection(Section.Data, LittleEndian, 0);
+  DataExtractor StrData(StringSection, LittleEndian, 0);
+  OS << "\n." << Name << " contents:\n";
+  DWARFAcceleratorTable Accel(AccelSection, StrData, Section.Relocs);
+  if (!Accel.extract())
+    return;
+  Accel.dump(OS);
+}
+
+void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
+  if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) {
+    OS << ".debug_abbrev contents:\n";
+    getDebugAbbrev()->dump(OS);
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_AbbrevDwo)
+    if (const DWARFDebugAbbrev *D = getDebugAbbrevDWO()) {
+      OS << "\n.debug_abbrev.dwo contents:\n";
+      D->dump(OS);
+    }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Info) {
+    OS << "\n.debug_info contents:\n";
+    for (const auto &CU : compile_units())
+      CU->dump(OS);
+  }
+
+  if ((DumpType == DIDT_All || DumpType == DIDT_InfoDwo) &&
+      getNumDWOCompileUnits()) {
+    OS << "\n.debug_info.dwo contents:\n";
+    for (const auto &DWOCU : dwo_compile_units())
+      DWOCU->dump(OS);
+  }
+
+  if ((DumpType == DIDT_All || DumpType == DIDT_Types) && getNumTypeUnits()) {
+    OS << "\n.debug_types contents:\n";
+    for (const auto &TUS : type_unit_sections())
+      for (const auto &TU : TUS)
+        TU->dump(OS);
+  }
+
+  if ((DumpType == DIDT_All || DumpType == DIDT_TypesDwo) &&
+      getNumDWOTypeUnits()) {
+    OS << "\n.debug_types.dwo contents:\n";
+    for (const auto &DWOTUS : dwo_type_unit_sections())
+      for (const auto &DWOTU : DWOTUS)
+        DWOTU->dump(OS);
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Loc) {
+    OS << "\n.debug_loc contents:\n";
+    getDebugLoc()->dump(OS);
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_LocDwo) {
+    OS << "\n.debug_loc.dwo contents:\n";
+    getDebugLocDWO()->dump(OS);
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Frames) {
+    OS << "\n.debug_frame contents:\n";
+    getDebugFrame()->dump(OS);
+  }
+
+  uint32_t offset = 0;
+  if (DumpType == DIDT_All || DumpType == DIDT_Aranges) {
+    OS << "\n.debug_aranges contents:\n";
+    DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0);
+    DWARFDebugArangeSet set;
+    while (set.extract(arangesData, &offset))
+      set.dump(OS);
+  }
+
+  uint8_t savedAddressByteSize = 0;
+  if (DumpType == DIDT_All || DumpType == DIDT_Line) {
+    OS << "\n.debug_line contents:\n";
+    for (const auto &CU : compile_units()) {
+      savedAddressByteSize = CU->getAddressByteSize();
+      unsigned stmtOffset =
+          CU->getCompileUnitDIE()->getAttributeValueAsSectionOffset(
+              CU.get(), DW_AT_stmt_list, -1U);
+      if (stmtOffset != -1U) {
+        DataExtractor lineData(getLineSection().Data, isLittleEndian(),
+                               savedAddressByteSize);
+        DWARFDebugLine::LineTable LineTable;
+        LineTable.parse(lineData, &getLineSection().Relocs, &stmtOffset);
+        LineTable.dump(OS);
+      }
+    }
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_LineDwo) {
+    OS << "\n.debug_line.dwo contents:\n";
+    unsigned stmtOffset = 0;
+    DataExtractor lineData(getLineDWOSection().Data, isLittleEndian(),
+                           savedAddressByteSize);
+    DWARFDebugLine::LineTable LineTable;
+    while (LineTable.Prologue.parse(lineData, &stmtOffset)) {
+      LineTable.dump(OS);
+      LineTable.clear();
+    }
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Str) {
+    OS << "\n.debug_str contents:\n";
+    DataExtractor strData(getStringSection(), isLittleEndian(), 0);
+    offset = 0;
+    uint32_t strOffset = 0;
+    while (const char *s = strData.getCStr(&offset)) {
+      OS << format("0x%8.8x: \"%s\"\n", strOffset, s);
+      strOffset = offset;
+    }
+  }
+
+  if ((DumpType == DIDT_All || DumpType == DIDT_StrDwo) &&
+      !getStringDWOSection().empty()) {
+    OS << "\n.debug_str.dwo contents:\n";
+    DataExtractor strDWOData(getStringDWOSection(), isLittleEndian(), 0);
+    offset = 0;
+    uint32_t strDWOOffset = 0;
+    while (const char *s = strDWOData.getCStr(&offset)) {
+      OS << format("0x%8.8x: \"%s\"\n", strDWOOffset, s);
+      strDWOOffset = offset;
+    }
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Ranges) {
+    OS << "\n.debug_ranges contents:\n";
+    // In fact, different compile units may have different address byte
+    // sizes, but for simplicity we just use the address byte size of the last
+    // compile unit (there is no easy and fast way to associate address range
+    // list and the compile unit it describes).
+    DataExtractor rangesData(getRangeSection(), isLittleEndian(),
+                             savedAddressByteSize);
+    offset = 0;
+    DWARFDebugRangeList rangeList;
+    while (rangeList.extract(rangesData, &offset))
+      rangeList.dump(OS);
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Pubnames)
+    dumpPubSection(OS, "debug_pubnames", getPubNamesSection(),
+                   isLittleEndian(), false);
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Pubtypes)
+    dumpPubSection(OS, "debug_pubtypes", getPubTypesSection(),
+                   isLittleEndian(), false);
+
+  if (DumpType == DIDT_All || DumpType == DIDT_GnuPubnames)
+    dumpPubSection(OS, "debug_gnu_pubnames", getGnuPubNamesSection(),
+                   isLittleEndian(), true /* GnuStyle */);
+
+  if (DumpType == DIDT_All || DumpType == DIDT_GnuPubtypes)
+    dumpPubSection(OS, "debug_gnu_pubtypes", getGnuPubTypesSection(),
+                   isLittleEndian(), true /* GnuStyle */);
+
+  if ((DumpType == DIDT_All || DumpType == DIDT_StrOffsetsDwo) &&
+      !getStringOffsetDWOSection().empty()) {
+    OS << "\n.debug_str_offsets.dwo contents:\n";
+    DataExtractor strOffsetExt(getStringOffsetDWOSection(), isLittleEndian(),
+                               0);
+    offset = 0;
+    uint64_t size = getStringOffsetDWOSection().size();
+    while (offset < size) {
+      OS << format("0x%8.8x: ", offset);
+      OS << format("%8.8x\n", strOffsetExt.getU32(&offset));
+    }
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_AppleNames)
+    dumpAccelSection(OS, "apple_names", getAppleNamesSection(),
+                     getStringSection(), isLittleEndian());
+
+  if (DumpType == DIDT_All || DumpType == DIDT_AppleTypes)
+    dumpAccelSection(OS, "apple_types", getAppleTypesSection(),
+                     getStringSection(), isLittleEndian());
+
+  if (DumpType == DIDT_All || DumpType == DIDT_AppleNamespaces)
+    dumpAccelSection(OS, "apple_namespaces", getAppleNamespacesSection(),
+                     getStringSection(), isLittleEndian());
+
+  if (DumpType == DIDT_All || DumpType == DIDT_AppleObjC)
+    dumpAccelSection(OS, "apple_objc", getAppleObjCSection(),
+                     getStringSection(), isLittleEndian());
+}
+
+const DWARFDebugAbbrev *DWARFContext::getDebugAbbrev() {
+  if (Abbrev)
+    return Abbrev.get();
+
+  DataExtractor abbrData(getAbbrevSection(), isLittleEndian(), 0);
+
+  Abbrev.reset(new DWARFDebugAbbrev());
+  Abbrev->extract(abbrData);
+  return Abbrev.get();
+}
+
+const DWARFDebugAbbrev *DWARFContext::getDebugAbbrevDWO() {
+  if (AbbrevDWO)
+    return AbbrevDWO.get();
+
+  DataExtractor abbrData(getAbbrevDWOSection(), isLittleEndian(), 0);
+  AbbrevDWO.reset(new DWARFDebugAbbrev());
+  AbbrevDWO->extract(abbrData);
+  return AbbrevDWO.get();
+}
+
+const DWARFDebugLoc *DWARFContext::getDebugLoc() {
+  if (Loc)
+    return Loc.get();
+
+  DataExtractor LocData(getLocSection().Data, isLittleEndian(), 0);
+  Loc.reset(new DWARFDebugLoc(getLocSection().Relocs));
+  // assume all compile units have the same address byte size
+  if (getNumCompileUnits())
+    Loc->parse(LocData, getCompileUnitAtIndex(0)->getAddressByteSize());
+  return Loc.get();
+}
+
+const DWARFDebugLocDWO *DWARFContext::getDebugLocDWO() {
+  if (LocDWO)
+    return LocDWO.get();
+
+  DataExtractor LocData(getLocDWOSection().Data, isLittleEndian(), 0);
+  LocDWO.reset(new DWARFDebugLocDWO());
+  LocDWO->parse(LocData);
+  return LocDWO.get();
+}
+
+const DWARFDebugAranges *DWARFContext::getDebugAranges() {
+  if (Aranges)
+    return Aranges.get();
+
+  Aranges.reset(new DWARFDebugAranges());
+  Aranges->generate(this);
+  return Aranges.get();
+}
+
+const DWARFDebugFrame *DWARFContext::getDebugFrame() {
+  if (DebugFrame)
+    return DebugFrame.get();
+
+  // There's a "bug" in the DWARFv3 standard with respect to the target address
+  // size within debug frame sections. While DWARF is supposed to be independent
+  // of its container, FDEs have fields with size being "target address size",
+  // which isn't specified in DWARF in general. It's only specified for CUs, but
+  // .eh_frame can appear without a .debug_info section. Follow the example of
+  // other tools (libdwarf) and extract this from the container (ObjectFile
+  // provides this information). This problem is fixed in DWARFv4
+  // See this dwarf-discuss discussion for more details:
+  // http://lists.dwarfstd.org/htdig.cgi/dwarf-discuss-dwarfstd.org/2011-December/001173.html
+  DataExtractor debugFrameData(getDebugFrameSection(), isLittleEndian(),
+                               getAddressSize());
+  DebugFrame.reset(new DWARFDebugFrame());
+  DebugFrame->parse(debugFrameData);
+  return DebugFrame.get();
+}
+
+const DWARFLineTable *
+DWARFContext::getLineTableForUnit(DWARFUnit *cu) {
+  if (!Line)
+    Line.reset(new DWARFDebugLine(&getLineSection().Relocs));
+
+  unsigned stmtOffset =
+      cu->getCompileUnitDIE()->getAttributeValueAsSectionOffset(
+          cu, DW_AT_stmt_list, -1U);
+  if (stmtOffset == -1U)
+    return nullptr; // No line table for this compile unit.
+
+  // See if the line table is cached.
+  if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset))
+    return lt;
+
+  // We have to parse it first.
+  DataExtractor lineData(getLineSection().Data, isLittleEndian(),
+                         cu->getAddressByteSize());
+  return Line->getOrParseLineTable(lineData, stmtOffset);
+}
+
+void DWARFContext::parseCompileUnits() {
+  CUs.parse(*this, getInfoSection());
+}
+
+void DWARFContext::parseTypeUnits() {
+  if (!TUs.empty())
+    return;
+  for (const auto &I : getTypesSections()) {
+    TUs.push_back(DWARFUnitSection<DWARFTypeUnit>());
+    TUs.back().parse(*this, I.second);
+  }
+}
+
+void DWARFContext::parseDWOCompileUnits() {
+  DWOCUs.parseDWO(*this, getInfoDWOSection());
+}
+
+void DWARFContext::parseDWOTypeUnits() {
+  if (!DWOTUs.empty())
+    return;
+  for (const auto &I : getTypesDWOSections()) {
+    DWOTUs.push_back(DWARFUnitSection<DWARFTypeUnit>());
+    DWOTUs.back().parseDWO(*this, I.second);
+  }
+}
+
+DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
+  parseCompileUnits();
+  return CUs.getUnitForOffset(Offset);
+}
+
+DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
+  // First, get the offset of the compile unit.
+  uint32_t CUOffset = getDebugAranges()->findAddress(Address);
+  // Retrieve the compile unit.
+  return getCompileUnitForOffset(CUOffset);
+}
+
+static bool getFunctionNameForAddress(DWARFCompileUnit *CU, uint64_t Address,
+                                      FunctionNameKind Kind,
+                                      std::string &FunctionName) {
+  if (Kind == FunctionNameKind::None)
+    return false;
+  // The address may correspond to instruction in some inlined function,
+  // so we have to build the chain of inlined functions and take the
+  // name of the topmost function in it.
+  const DWARFDebugInfoEntryInlinedChain &InlinedChain =
+      CU->getInlinedChainForAddress(Address);
+  if (InlinedChain.DIEs.size() == 0)
+    return false;
+  const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain.DIEs[0];
+  if (const char *Name =
+          TopFunctionDIE.getSubroutineName(InlinedChain.U, Kind)) {
+    FunctionName = Name;
+    return true;
+  }
+  return false;
+}
+
+DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
+                                               DILineInfoSpecifier Spec) {
+  DILineInfo Result;
+
+  DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
+  if (!CU)
+    return Result;
+  getFunctionNameForAddress(CU, Address, Spec.FNKind, Result.FunctionName);
+  if (Spec.FLIKind != FileLineInfoKind::None) {
+    if (const DWARFLineTable *LineTable = getLineTableForUnit(CU))
+      LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
+                                           Spec.FLIKind, Result);
+  }
+  return Result;
+}
+
+DILineInfoTable
+DWARFContext::getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
+                                         DILineInfoSpecifier Spec) {
+  DILineInfoTable  Lines;
+  DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
+  if (!CU)
+    return Lines;
+
+  std::string FunctionName = "<invalid>";
+  getFunctionNameForAddress(CU, Address, Spec.FNKind, FunctionName);
+
+  // If the Specifier says we don't need FileLineInfo, just
+  // return the top-most function at the starting address.
+  if (Spec.FLIKind == FileLineInfoKind::None) {
+    DILineInfo Result;
+    Result.FunctionName = FunctionName;
+    Lines.push_back(std::make_pair(Address, Result));
+    return Lines;
+  }
+
+  const DWARFLineTable *LineTable = getLineTableForUnit(CU);
+
+  // Get the index of row we're looking for in the line table.
+  std::vector<uint32_t> RowVector;
+  if (!LineTable->lookupAddressRange(Address, Size, RowVector))
+    return Lines;
+
+  for (uint32_t RowIndex : RowVector) {
+    // Take file number and line/column from the row.
+    const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
+    DILineInfo Result;
+    LineTable->getFileNameByIndex(Row.File, CU->getCompilationDir(),
+                                  Spec.FLIKind, Result.FileName);
+    Result.FunctionName = FunctionName;
+    Result.Line = Row.Line;
+    Result.Column = Row.Column;
+    Lines.push_back(std::make_pair(Row.Address, Result));
+  }
+
+  return Lines;
+}
+
+DIInliningInfo
+DWARFContext::getInliningInfoForAddress(uint64_t Address,
+                                        DILineInfoSpecifier Spec) {
+  DIInliningInfo InliningInfo;
+
+  DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
+  if (!CU)
+    return InliningInfo;
+
+  const DWARFLineTable *LineTable = nullptr;
+  const DWARFDebugInfoEntryInlinedChain &InlinedChain =
+      CU->getInlinedChainForAddress(Address);
+  if (InlinedChain.DIEs.size() == 0) {
+    // If there is no DIE for address (e.g. it is in unavailable .dwo file),
+    // try to at least get file/line info from symbol table.
+    if (Spec.FLIKind != FileLineInfoKind::None) {
+      DILineInfo Frame;
+      LineTable = getLineTableForUnit(CU);
+      if (LineTable &&
+          LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
+                                               Spec.FLIKind, Frame))
+        InliningInfo.addFrame(Frame);
+    }
+    return InliningInfo;
+  }
+
+  uint32_t CallFile = 0, CallLine = 0, CallColumn = 0;
+  for (uint32_t i = 0, n = InlinedChain.DIEs.size(); i != n; i++) {
+    const DWARFDebugInfoEntryMinimal &FunctionDIE = InlinedChain.DIEs[i];
+    DILineInfo Frame;
+    // Get function name if necessary.
+    if (const char *Name =
+            FunctionDIE.getSubroutineName(InlinedChain.U, Spec.FNKind))
+      Frame.FunctionName = Name;
+    if (Spec.FLIKind != FileLineInfoKind::None) {
+      if (i == 0) {
+        // For the topmost frame, initialize the line table of this
+        // compile unit and fetch file/line info from it.
+        LineTable = getLineTableForUnit(CU);
+        // For the topmost routine, get file/line info from line table.
+        if (LineTable)
+          LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
+                                               Spec.FLIKind, Frame);
+      } else {
+        // Otherwise, use call file, call line and call column from
+        // previous DIE in inlined chain.
+        if (LineTable)
+          LineTable->getFileNameByIndex(CallFile, CU->getCompilationDir(),
+                                        Spec.FLIKind, Frame.FileName);
+        Frame.Line = CallLine;
+        Frame.Column = CallColumn;
+      }
+      // Get call file/line/column of a current DIE.
+      if (i + 1 < n) {
+        FunctionDIE.getCallerFrame(InlinedChain.U, CallFile, CallLine,
+                                   CallColumn);
+      }
+    }
+    InliningInfo.addFrame(Frame);
+  }
+  return InliningInfo;
+}
+
+static bool consumeCompressedDebugSectionHeader(StringRef &data,
+                                                uint64_t &OriginalSize) {
+  // Consume "ZLIB" prefix.
+  if (!data.startswith("ZLIB"))
+    return false;
+  data = data.substr(4);
+  // Consume uncompressed section size (big-endian 8 bytes).
+  DataExtractor extractor(data, false, 8);
+  uint32_t Offset = 0;
+  OriginalSize = extractor.getU64(&Offset);
+  if (Offset == 0)
+    return false;
+  data = data.substr(Offset);
+  return true;
+}
+
+DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj)
+    : IsLittleEndian(Obj.isLittleEndian()),
+      AddressSize(Obj.getBytesInAddress()) {
+  for (const SectionRef &Section : Obj.sections()) {
+    StringRef name;
+    Section.getName(name);
+    // Skip BSS and Virtual sections, they aren't interesting.
+    bool IsBSS = Section.isBSS();
+    if (IsBSS)
+      continue;
+    bool IsVirtual = Section.isVirtual();
+    if (IsVirtual)
+      continue;
+    StringRef data;
+    Section.getContents(data);
+
+    name = name.substr(name.find_first_not_of("._")); // Skip . and _ prefixes.
+
+    // Check if debug info section is compressed with zlib.
+    if (name.startswith("zdebug_")) {
+      uint64_t OriginalSize;
+      if (!zlib::isAvailable() ||
+          !consumeCompressedDebugSectionHeader(data, OriginalSize))
+        continue;
+      UncompressedSections.resize(UncompressedSections.size() + 1);
+      if (zlib::uncompress(data, UncompressedSections.back(), OriginalSize) !=
+          zlib::StatusOK) {
+        UncompressedSections.pop_back();
+        continue;
+      }
+      // Make data point to uncompressed section contents and save its contents.
+      name = name.substr(1);
+      data = UncompressedSections.back();
+    }
+
+    StringRef *SectionData =
+        StringSwitch<StringRef *>(name)
+            .Case("debug_info", &InfoSection.Data)
+            .Case("debug_abbrev", &AbbrevSection)
+            .Case("debug_loc", &LocSection.Data)
+            .Case("debug_line", &LineSection.Data)
+            .Case("debug_aranges", &ARangeSection)
+            .Case("debug_frame", &DebugFrameSection)
+            .Case("debug_str", &StringSection)
+            .Case("debug_ranges", &RangeSection)
+            .Case("debug_pubnames", &PubNamesSection)
+            .Case("debug_pubtypes", &PubTypesSection)
+            .Case("debug_gnu_pubnames", &GnuPubNamesSection)
+            .Case("debug_gnu_pubtypes", &GnuPubTypesSection)
+            .Case("debug_info.dwo", &InfoDWOSection.Data)
+            .Case("debug_abbrev.dwo", &AbbrevDWOSection)
+            .Case("debug_loc.dwo", &LocDWOSection.Data)
+            .Case("debug_line.dwo", &LineDWOSection.Data)
+            .Case("debug_str.dwo", &StringDWOSection)
+            .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
+            .Case("debug_addr", &AddrSection)
+            .Case("apple_names", &AppleNamesSection.Data)
+            .Case("apple_types", &AppleTypesSection.Data)
+            .Case("apple_namespaces", &AppleNamespacesSection.Data)
+            .Case("apple_namespac", &AppleNamespacesSection.Data)
+            .Case("apple_objc", &AppleObjCSection.Data)
+            // Any more debug info sections go here.
+            .Default(nullptr);
+    if (SectionData) {
+      *SectionData = data;
+      if (name == "debug_ranges") {
+        // FIXME: Use the other dwo range section when we emit it.
+        RangeDWOSection = data;
+      }
+    } else if (name == "debug_types") {
+      // Find debug_types data by section rather than name as there are
+      // multiple, comdat grouped, debug_types sections.
+      TypesSections[Section].Data = data;
+    } else if (name == "debug_types.dwo") {
+      TypesDWOSections[Section].Data = data;
+    }
+
+    section_iterator RelocatedSection = Section.getRelocatedSection();
+    if (RelocatedSection == Obj.section_end())
+      continue;
+
+    StringRef RelSecName;
+    RelocatedSection->getName(RelSecName);
+    RelSecName = RelSecName.substr(
+        RelSecName.find_first_not_of("._")); // Skip . and _ prefixes.
+
+    // TODO: Add support for relocations in other sections as needed.
+    // Record relocations for the debug_info and debug_line sections.
+    RelocAddrMap *Map = StringSwitch<RelocAddrMap*>(RelSecName)
+        .Case("debug_info", &InfoSection.Relocs)
+        .Case("debug_loc", &LocSection.Relocs)
+        .Case("debug_info.dwo", &InfoDWOSection.Relocs)
+        .Case("debug_line", &LineSection.Relocs)
+        .Case("apple_names", &AppleNamesSection.Relocs)
+        .Case("apple_types", &AppleTypesSection.Relocs)
+        .Case("apple_namespaces", &AppleNamespacesSection.Relocs)
+        .Case("apple_namespac", &AppleNamespacesSection.Relocs)
+        .Case("apple_objc", &AppleObjCSection.Relocs)
+        .Default(nullptr);
+    if (!Map) {
+      // Find debug_types relocs by section rather than name as there are
+      // multiple, comdat grouped, debug_types sections.
+      if (RelSecName == "debug_types")
+        Map = &TypesSections[*RelocatedSection].Relocs;
+      else if (RelSecName == "debug_types.dwo")
+        Map = &TypesDWOSections[*RelocatedSection].Relocs;
+      else
+        continue;
+    }
+
+    if (Section.relocation_begin() != Section.relocation_end()) {
+      uint64_t SectionSize = RelocatedSection->getSize();
+      for (const RelocationRef &Reloc : Section.relocations()) {
+        uint64_t Address;
+        Reloc.getOffset(Address);
+        uint64_t Type;
+        Reloc.getType(Type);
+        uint64_t SymAddr = 0;
+        object::symbol_iterator Sym = Reloc.getSymbol();
+        if (Sym != Obj.symbol_end())
+          Sym->getAddress(SymAddr);
+
+        object::RelocVisitor V(Obj);
+        object::RelocToApply R(V.visit(Type, Reloc, SymAddr));
+        if (V.error()) {
+          SmallString<32> Name;
+          std::error_code ec(Reloc.getTypeName(Name));
+          if (ec) {
+            errs() << "Aaaaaa! Nameless relocation! Aaaaaa!\n";
+          }
+          errs() << "error: failed to compute relocation: "
+                 << Name << "\n";
+          continue;
+        }
+
+        if (Address + R.Width > SectionSize) {
+          errs() << "error: " << R.Width << "-byte relocation starting "
+                 << Address << " bytes into section " << name << " which is "
+                 << SectionSize << " bytes long.\n";
+          continue;
+        }
+        if (R.Width > 8) {
+          errs() << "error: can't handle a relocation of more than 8 bytes at "
+                    "a time.\n";
+          continue;
+        }
+        DEBUG(dbgs() << "Writing " << format("%p", R.Value)
+                     << " at " << format("%p", Address)
+                     << " with width " << format("%d", R.Width)
+                     << "\n");
+        Map->insert(std::make_pair(Address, std::make_pair(R.Width, R.Value)));
+      }
+    }
+  }
+}
+
+void DWARFContextInMemory::anchor() { }
diff --git a/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp b/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
new file mode 100644
index 0000000..e63e289
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
@@ -0,0 +1,115 @@
+//===-- DWARFDebugAbbrev.cpp ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+DWARFAbbreviationDeclarationSet::DWARFAbbreviationDeclarationSet() {
+  clear();
+}
+
+void DWARFAbbreviationDeclarationSet::clear() {
+  Offset = 0;
+  FirstAbbrCode = 0;
+  Decls.clear();
+}
+
+bool DWARFAbbreviationDeclarationSet::extract(DataExtractor Data,
+                                              uint32_t *OffsetPtr) {
+  clear();
+  const uint32_t BeginOffset = *OffsetPtr;
+  Offset = BeginOffset;
+  DWARFAbbreviationDeclaration AbbrDecl;
+  uint32_t PrevAbbrCode = 0;
+  while (AbbrDecl.extract(Data, OffsetPtr)) {
+    if (FirstAbbrCode == 0) {
+      FirstAbbrCode = AbbrDecl.getCode();
+    } else {
+      if (PrevAbbrCode + 1 != AbbrDecl.getCode()) {
+        // Codes are not consecutive, can't do O(1) lookups.
+        FirstAbbrCode = UINT32_MAX;
+      }
+    }
+    PrevAbbrCode = AbbrDecl.getCode();
+    Decls.push_back(std::move(AbbrDecl));
+  }
+  return BeginOffset != *OffsetPtr;
+}
+
+void DWARFAbbreviationDeclarationSet::dump(raw_ostream &OS) const {
+  for (const auto &Decl : Decls)
+    Decl.dump(OS);
+}
+
+const DWARFAbbreviationDeclaration *
+DWARFAbbreviationDeclarationSet::getAbbreviationDeclaration(
+    uint32_t AbbrCode) const {
+  if (FirstAbbrCode == UINT32_MAX) {
+    for (const auto &Decl : Decls) {
+      if (Decl.getCode() == AbbrCode)
+        return &Decl;
+    }
+    return nullptr;
+  }
+  if (AbbrCode < FirstAbbrCode || AbbrCode >= FirstAbbrCode + Decls.size())
+    return nullptr;
+  return &Decls[AbbrCode - FirstAbbrCode];
+}
+
+DWARFDebugAbbrev::DWARFDebugAbbrev() {
+  clear();
+}
+
+void DWARFDebugAbbrev::clear() {
+  AbbrDeclSets.clear();
+  PrevAbbrOffsetPos = AbbrDeclSets.end();
+}
+
+void DWARFDebugAbbrev::extract(DataExtractor Data) {
+  clear();
+
+  uint32_t Offset = 0;
+  DWARFAbbreviationDeclarationSet AbbrDecls;
+  while (Data.isValidOffset(Offset)) {
+    uint32_t CUAbbrOffset = Offset;
+    if (!AbbrDecls.extract(Data, &Offset))
+      break;
+    AbbrDeclSets[CUAbbrOffset] = std::move(AbbrDecls);
+  }
+}
+
+void DWARFDebugAbbrev::dump(raw_ostream &OS) const {
+  if (AbbrDeclSets.empty()) {
+    OS << "< EMPTY >\n";
+    return;
+  }
+
+  for (const auto &I : AbbrDeclSets) {
+    OS << format("Abbrev table for offset: 0x%8.8" PRIx64 "\n", I.first);
+    I.second.dump(OS);
+  }
+}
+
+const DWARFAbbreviationDeclarationSet*
+DWARFDebugAbbrev::getAbbreviationDeclarationSet(uint64_t CUAbbrOffset) const {
+  const auto End = AbbrDeclSets.end();
+  if (PrevAbbrOffsetPos != End && PrevAbbrOffsetPos->first == CUAbbrOffset) {
+    return &(PrevAbbrOffsetPos->second);
+  }
+
+  const auto Pos = AbbrDeclSets.find(CUAbbrOffset);
+  if (Pos != End) {
+    PrevAbbrOffsetPos = Pos;
+    return &(Pos->second);
+  }
+
+  return nullptr;
+}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp b/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
new file mode 100644
index 0000000..67589cd
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
@@ -0,0 +1,104 @@
+//===-- DWARFDebugArangeSet.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+using namespace llvm;
+
+void DWARFDebugArangeSet::clear() {
+  Offset = -1U;
+  std::memset(&HeaderData, 0, sizeof(Header));
+  ArangeDescriptors.clear();
+}
+
+bool
+DWARFDebugArangeSet::extract(DataExtractor data, uint32_t *offset_ptr) {
+  if (data.isValidOffset(*offset_ptr)) {
+    ArangeDescriptors.clear();
+    Offset = *offset_ptr;
+
+    // 7.20 Address Range Table
+    //
+    // Each set of entries in the table of address ranges contained in
+    // the .debug_aranges section begins with a header consisting of: a
+    // 4-byte length containing the length of the set of entries for this
+    // compilation unit, not including the length field itself; a 2-byte
+    // version identifier containing the value 2 for DWARF Version 2; a
+    // 4-byte offset into the.debug_infosection; a 1-byte unsigned integer
+    // containing the size in bytes of an address (or the offset portion of
+    // an address for segmented addressing) on the target system; and a
+    // 1-byte unsigned integer containing the size in bytes of a segment
+    // descriptor on the target system. This header is followed by a series
+    // of tuples. Each tuple consists of an address and a length, each in
+    // the size appropriate for an address on the target architecture.
+    HeaderData.Length = data.getU32(offset_ptr);
+    HeaderData.Version = data.getU16(offset_ptr);
+    HeaderData.CuOffset = data.getU32(offset_ptr);
+    HeaderData.AddrSize = data.getU8(offset_ptr);
+    HeaderData.SegSize = data.getU8(offset_ptr);
+
+    // Perform basic validation of the header fields.
+    if (!data.isValidOffsetForDataOfSize(Offset, HeaderData.Length) ||
+        (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8)) {
+      clear();
+      return false;
+    }
+
+    // The first tuple following the header in each set begins at an offset
+    // that is a multiple of the size of a single tuple (that is, twice the
+    // size of an address). The header is padded, if necessary, to the
+    // appropriate boundary.
+    const uint32_t header_size = *offset_ptr - Offset;
+    const uint32_t tuple_size = HeaderData.AddrSize * 2;
+    uint32_t first_tuple_offset = 0;
+    while (first_tuple_offset < header_size)
+      first_tuple_offset += tuple_size;
+
+    *offset_ptr = Offset + first_tuple_offset;
+
+    Descriptor arangeDescriptor;
+
+    static_assert(sizeof(arangeDescriptor.Address) ==
+                      sizeof(arangeDescriptor.Length),
+                  "Different datatypes for addresses and sizes!");
+    assert(sizeof(arangeDescriptor.Address) >= HeaderData.AddrSize);
+
+    while (data.isValidOffset(*offset_ptr)) {
+      arangeDescriptor.Address = data.getUnsigned(offset_ptr, HeaderData.AddrSize);
+      arangeDescriptor.Length = data.getUnsigned(offset_ptr, HeaderData.AddrSize);
+
+      // Each set of tuples is terminated by a 0 for the address and 0
+      // for the length.
+      if (arangeDescriptor.Address || arangeDescriptor.Length)
+        ArangeDescriptors.push_back(arangeDescriptor);
+      else
+        break; // We are done if we get a zero address and length
+    }
+
+    return !ArangeDescriptors.empty();
+  }
+  return false;
+}
+
+void DWARFDebugArangeSet::dump(raw_ostream &OS) const {
+  OS << format("Address Range Header: length = 0x%8.8x, version = 0x%4.4x, ",
+               HeaderData.Length, HeaderData.Version)
+     << format("cu_offset = 0x%8.8x, addr_size = 0x%2.2x, seg_size = 0x%2.2x\n",
+               HeaderData.CuOffset, HeaderData.AddrSize, HeaderData.SegSize);
+
+  const uint32_t hex_width = HeaderData.AddrSize * 2;
+  for (const auto &Desc : ArangeDescriptors) {
+    OS << format("[0x%*.*" PRIx64 " -", hex_width, hex_width, Desc.Address)
+       << format(" 0x%*.*" PRIx64 ")\n",
+                 hex_width, hex_width, Desc.getEndAddress());
+  }
+}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
new file mode 100644
index 0000000..27a02c4
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
@@ -0,0 +1,129 @@
+//===-- DWARFDebugAranges.cpp -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFDebugAranges.h"
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <set>
+using namespace llvm;
+
+void DWARFDebugAranges::extract(DataExtractor DebugArangesData) {
+  if (!DebugArangesData.isValidOffset(0))
+    return;
+  uint32_t Offset = 0;
+  DWARFDebugArangeSet Set;
+
+  while (Set.extract(DebugArangesData, &Offset)) {
+    uint32_t CUOffset = Set.getCompileUnitDIEOffset();
+    for (const auto &Desc : Set.descriptors()) {
+      uint64_t LowPC = Desc.Address;
+      uint64_t HighPC = Desc.getEndAddress();
+      appendRange(CUOffset, LowPC, HighPC);
+    }
+    ParsedCUOffsets.insert(CUOffset);
+  }
+}
+
+void DWARFDebugAranges::generate(DWARFContext *CTX) {
+  clear();
+  if (!CTX)
+    return;
+
+  // Extract aranges from .debug_aranges section.
+  DataExtractor ArangesData(CTX->getARangeSection(), CTX->isLittleEndian(), 0);
+  extract(ArangesData);
+
+  // Generate aranges from DIEs: even if .debug_aranges section is present,
+  // it may describe only a small subset of compilation units, so we need to
+  // manually build aranges for the rest of them.
+  for (const auto &CU : CTX->compile_units()) {
+    uint32_t CUOffset = CU->getOffset();
+    if (ParsedCUOffsets.insert(CUOffset).second) {
+      DWARFAddressRangesVector CURanges;
+      CU->collectAddressRanges(CURanges);
+      for (const auto &R : CURanges) {
+        appendRange(CUOffset, R.first, R.second);
+      }
+    }
+  }
+
+  construct();
+}
+
+void DWARFDebugAranges::clear() {
+  Endpoints.clear();
+  Aranges.clear();
+  ParsedCUOffsets.clear();
+}
+
+void DWARFDebugAranges::appendRange(uint32_t CUOffset, uint64_t LowPC,
+                                    uint64_t HighPC) {
+  if (LowPC >= HighPC)
+    return;
+  Endpoints.emplace_back(LowPC, CUOffset, true);
+  Endpoints.emplace_back(HighPC, CUOffset, false);
+}
+
+void DWARFDebugAranges::construct() {
+  std::multiset<uint32_t> ValidCUs;  // Maintain the set of CUs describing
+                                     // a current address range.
+  std::sort(Endpoints.begin(), Endpoints.end());
+  uint64_t PrevAddress = -1ULL;
+  for (const auto &E : Endpoints) {
+    if (PrevAddress < E.Address && ValidCUs.size() > 0) {
+      // If the address range between two endpoints is described by some
+      // CU, first try to extend the last range in Aranges. If we can't
+      // do it, start a new range.
+      if (!Aranges.empty() && Aranges.back().HighPC() == PrevAddress &&
+          ValidCUs.find(Aranges.back().CUOffset) != ValidCUs.end()) {
+        Aranges.back().setHighPC(E.Address);
+      } else {
+        Aranges.emplace_back(PrevAddress, E.Address, *ValidCUs.begin());
+      }
+    }
+    // Update the set of valid CUs.
+    if (E.IsRangeStart) {
+      ValidCUs.insert(E.CUOffset);
+    } else {
+      auto CUPos = ValidCUs.find(E.CUOffset);
+      assert(CUPos != ValidCUs.end());
+      ValidCUs.erase(CUPos);
+    }
+    PrevAddress = E.Address;
+  }
+  assert(ValidCUs.empty());
+
+  // Endpoints are not needed now.
+  std::vector<RangeEndpoint> EmptyEndpoints;
+  EmptyEndpoints.swap(Endpoints);
+}
+
+uint32_t DWARFDebugAranges::findAddress(uint64_t Address) const {
+  if (!Aranges.empty()) {
+    Range range(Address);
+    RangeCollIterator begin = Aranges.begin();
+    RangeCollIterator end = Aranges.end();
+    RangeCollIterator pos =
+        std::lower_bound(begin, end, range);
+
+    if (pos != end && pos->containsAddress(Address)) {
+      return pos->CUOffset;
+    } else if (pos != begin) {
+      --pos;
+      if (pos->containsAddress(Address))
+        return pos->CUOffset;
+    }
+  }
+  return -1U;
+}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
new file mode 100644
index 0000000..7d77290
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -0,0 +1,511 @@
+//===-- DWARFDebugFrame.h - Parsing of .debug_frame -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace dwarf;
+
+
+/// \brief Abstract frame entry defining the common interface concrete
+/// entries implement.
+class llvm::FrameEntry {
+public:
+  enum FrameKind {FK_CIE, FK_FDE};
+  FrameEntry(FrameKind K, uint64_t Offset, uint64_t Length)
+      : Kind(K), Offset(Offset), Length(Length) {}
+
+  virtual ~FrameEntry() {
+  }
+
+  FrameKind getKind() const { return Kind; }
+  virtual uint64_t getOffset() const { return Offset; }
+
+  /// \brief Parse and store a sequence of CFI instructions from Data,
+  /// starting at *Offset and ending at EndOffset. If everything
+  /// goes well, *Offset should be equal to EndOffset when this method
+  /// returns. Otherwise, an error occurred.
+  virtual void parseInstructions(DataExtractor Data, uint32_t *Offset,
+                                 uint32_t EndOffset);
+
+  /// \brief Dump the entry header to the given output stream.
+  virtual void dumpHeader(raw_ostream &OS) const = 0;
+
+  /// \brief Dump the entry's instructions to the given output stream.
+  virtual void dumpInstructions(raw_ostream &OS) const;
+
+protected:
+  const FrameKind Kind;
+
+  /// \brief Offset of this entry in the section.
+  uint64_t Offset;
+
+  /// \brief Entry length as specified in DWARF.
+  uint64_t Length;
+
+  /// An entry may contain CFI instructions. An instruction consists of an
+  /// opcode and an optional sequence of operands.
+  typedef std::vector<uint64_t> Operands;
+  struct Instruction {
+    Instruction(uint8_t Opcode)
+      : Opcode(Opcode)
+    {}
+
+    uint8_t Opcode;
+    Operands Ops;
+  };
+
+  std::vector<Instruction> Instructions;
+
+  /// Convenience methods to add a new instruction with the given opcode and
+  /// operands to the Instructions vector.
+  void addInstruction(uint8_t Opcode) {
+    Instructions.push_back(Instruction(Opcode));
+  }
+
+  void addInstruction(uint8_t Opcode, uint64_t Operand1) {
+    Instructions.push_back(Instruction(Opcode));
+    Instructions.back().Ops.push_back(Operand1);
+  }
+
+  void addInstruction(uint8_t Opcode, uint64_t Operand1, uint64_t Operand2) {
+    Instructions.push_back(Instruction(Opcode));
+    Instructions.back().Ops.push_back(Operand1);
+    Instructions.back().Ops.push_back(Operand2);
+  }
+};
+
+
+// See DWARF standard v3, section 7.23
+const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0;
+const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f;
+
+void FrameEntry::parseInstructions(DataExtractor Data, uint32_t *Offset,
+                                   uint32_t EndOffset) {
+  while (*Offset < EndOffset) {
+    uint8_t Opcode = Data.getU8(Offset);
+    // Some instructions have a primary opcode encoded in the top bits.
+    uint8_t Primary = Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK;
+
+    if (Primary) {
+      // If it's a primary opcode, the first operand is encoded in the bottom
+      // bits of the opcode itself.
+      uint64_t Op1 = Opcode & DWARF_CFI_PRIMARY_OPERAND_MASK;
+      switch (Primary) {
+        default: llvm_unreachable("Impossible primary CFI opcode");
+        case DW_CFA_advance_loc:
+        case DW_CFA_restore:
+          addInstruction(Primary, Op1);
+          break;
+        case DW_CFA_offset:
+          addInstruction(Primary, Op1, Data.getULEB128(Offset));
+          break;
+      }
+    } else {
+      // Extended opcode - its value is Opcode itself.
+      switch (Opcode) {
+        default: llvm_unreachable("Invalid extended CFI opcode");
+        case DW_CFA_nop:
+        case DW_CFA_remember_state:
+        case DW_CFA_restore_state:
+        case DW_CFA_GNU_window_save:
+          // No operands
+          addInstruction(Opcode);
+          break;
+        case DW_CFA_set_loc:
+          // Operands: Address
+          addInstruction(Opcode, Data.getAddress(Offset));
+          break;
+        case DW_CFA_advance_loc1:
+          // Operands: 1-byte delta
+          addInstruction(Opcode, Data.getU8(Offset));
+          break;
+        case DW_CFA_advance_loc2:
+          // Operands: 2-byte delta
+          addInstruction(Opcode, Data.getU16(Offset));
+          break;
+        case DW_CFA_advance_loc4:
+          // Operands: 4-byte delta
+          addInstruction(Opcode, Data.getU32(Offset));
+          break;
+        case DW_CFA_restore_extended:
+        case DW_CFA_undefined:
+        case DW_CFA_same_value:
+        case DW_CFA_def_cfa_register:
+        case DW_CFA_def_cfa_offset:
+          // Operands: ULEB128
+          addInstruction(Opcode, Data.getULEB128(Offset));
+          break;
+        case DW_CFA_def_cfa_offset_sf:
+          // Operands: SLEB128
+          addInstruction(Opcode, Data.getSLEB128(Offset));
+          break;
+        case DW_CFA_offset_extended:
+        case DW_CFA_register:
+        case DW_CFA_def_cfa:
+        case DW_CFA_val_offset:
+          // Operands: ULEB128, ULEB128
+          addInstruction(Opcode, Data.getULEB128(Offset),
+                                 Data.getULEB128(Offset));
+          break;
+        case DW_CFA_offset_extended_sf:
+        case DW_CFA_def_cfa_sf:
+        case DW_CFA_val_offset_sf:
+          // Operands: ULEB128, SLEB128
+          addInstruction(Opcode, Data.getULEB128(Offset),
+                                 Data.getSLEB128(Offset));
+          break;
+        case DW_CFA_def_cfa_expression:
+        case DW_CFA_expression:
+        case DW_CFA_val_expression:
+          // TODO: implement this
+          report_fatal_error("Values with expressions not implemented yet!");
+      }
+    }
+  }
+}
+
+namespace {
+/// \brief DWARF Common Information Entry (CIE)
+class CIE : public FrameEntry {
+public:
+  // CIEs (and FDEs) are simply container classes, so the only sensible way to
+  // create them is by providing the full parsed contents in the constructor.
+  CIE(uint64_t Offset, uint64_t Length, uint8_t Version,
+      SmallString<8> Augmentation, uint64_t CodeAlignmentFactor,
+      int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister)
+      : FrameEntry(FK_CIE, Offset, Length), Version(Version),
+        Augmentation(std::move(Augmentation)),
+        CodeAlignmentFactor(CodeAlignmentFactor),
+        DataAlignmentFactor(DataAlignmentFactor),
+        ReturnAddressRegister(ReturnAddressRegister) {}
+
+  ~CIE() {
+  }
+
+  uint64_t getCodeAlignmentFactor() const { return CodeAlignmentFactor; }
+  int64_t getDataAlignmentFactor() const { return DataAlignmentFactor; }
+
+  void dumpHeader(raw_ostream &OS) const override {
+    OS << format("%08x %08x %08x CIE",
+                 (uint32_t)Offset, (uint32_t)Length, DW_CIE_ID)
+       << "\n";
+    OS << format("  Version:               %d\n", Version);
+    OS << "  Augmentation:          \"" << Augmentation << "\"\n";
+    OS << format("  Code alignment factor: %u\n",
+                 (uint32_t)CodeAlignmentFactor);
+    OS << format("  Data alignment factor: %d\n",
+                 (int32_t)DataAlignmentFactor);
+    OS << format("  Return address column: %d\n",
+                 (int32_t)ReturnAddressRegister);
+    OS << "\n";
+  }
+
+  static bool classof(const FrameEntry *FE) {
+    return FE->getKind() == FK_CIE;
+  }
+
+private:
+  /// The following fields are defined in section 6.4.1 of the DWARF standard v3
+  uint8_t Version;
+  SmallString<8> Augmentation;
+  uint64_t CodeAlignmentFactor;
+  int64_t DataAlignmentFactor;
+  uint64_t ReturnAddressRegister;
+};
+
+
+/// \brief DWARF Frame Description Entry (FDE)
+class FDE : public FrameEntry {
+public:
+  // Each FDE has a CIE it's "linked to". Our FDE contains is constructed with
+  // an offset to the CIE (provided by parsing the FDE header). The CIE itself
+  // is obtained lazily once it's actually required.
+  FDE(uint64_t Offset, uint64_t Length, int64_t LinkedCIEOffset,
+      uint64_t InitialLocation, uint64_t AddressRange,
+      CIE *Cie)
+      : FrameEntry(FK_FDE, Offset, Length), LinkedCIEOffset(LinkedCIEOffset),
+        InitialLocation(InitialLocation), AddressRange(AddressRange),
+        LinkedCIE(Cie) {}
+
+  ~FDE() {
+  }
+
+  CIE *getLinkedCIE() const { return LinkedCIE; }
+
+  void dumpHeader(raw_ostream &OS) const override {
+    OS << format("%08x %08x %08x FDE ",
+                 (uint32_t)Offset, (uint32_t)Length, (int32_t)LinkedCIEOffset);
+    OS << format("cie=%08x pc=%08x...%08x\n",
+                 (int32_t)LinkedCIEOffset,
+                 (uint32_t)InitialLocation,
+                 (uint32_t)InitialLocation + (uint32_t)AddressRange);
+  }
+
+  static bool classof(const FrameEntry *FE) {
+    return FE->getKind() == FK_FDE;
+  }
+
+private:
+  /// The following fields are defined in section 6.4.1 of the DWARF standard v3
+  uint64_t LinkedCIEOffset;
+  uint64_t InitialLocation;
+  uint64_t AddressRange;
+  CIE *LinkedCIE;
+};
+
+/// \brief Types of operands to CF instructions.
+enum OperandType {
+  OT_Unset,
+  OT_None,
+  OT_Address,
+  OT_Offset,
+  OT_FactoredCodeOffset,
+  OT_SignedFactDataOffset,
+  OT_UnsignedFactDataOffset,
+  OT_Register,
+  OT_Expression
+};
+
+} // end anonymous namespace
+
+/// \brief Initialize the array describing the types of operands.
+static ArrayRef<OperandType[2]> getOperandTypes() {
+  static OperandType OpTypes[DW_CFA_restore+1][2];
+
+#define DECLARE_OP2(OP, OPTYPE0, OPTYPE1)       \
+  do {                                          \
+    OpTypes[OP][0] = OPTYPE0;                   \
+    OpTypes[OP][1] = OPTYPE1;                   \
+  } while (0)
+#define DECLARE_OP1(OP, OPTYPE0) DECLARE_OP2(OP, OPTYPE0, OT_None)
+#define DECLARE_OP0(OP) DECLARE_OP1(OP, OT_None)
+
+  DECLARE_OP1(DW_CFA_set_loc, OT_Address);
+  DECLARE_OP1(DW_CFA_advance_loc, OT_FactoredCodeOffset);
+  DECLARE_OP1(DW_CFA_advance_loc1, OT_FactoredCodeOffset);
+  DECLARE_OP1(DW_CFA_advance_loc2, OT_FactoredCodeOffset);
+  DECLARE_OP1(DW_CFA_advance_loc4, OT_FactoredCodeOffset);
+  DECLARE_OP1(DW_CFA_MIPS_advance_loc8, OT_FactoredCodeOffset);
+  DECLARE_OP2(DW_CFA_def_cfa, OT_Register, OT_Offset);
+  DECLARE_OP2(DW_CFA_def_cfa_sf, OT_Register, OT_SignedFactDataOffset);
+  DECLARE_OP1(DW_CFA_def_cfa_register, OT_Register);
+  DECLARE_OP1(DW_CFA_def_cfa_offset, OT_Offset);
+  DECLARE_OP1(DW_CFA_def_cfa_offset_sf, OT_SignedFactDataOffset);
+  DECLARE_OP1(DW_CFA_def_cfa_expression, OT_Expression);
+  DECLARE_OP1(DW_CFA_undefined, OT_Register);
+  DECLARE_OP1(DW_CFA_same_value, OT_Register);
+  DECLARE_OP2(DW_CFA_offset, OT_Register, OT_UnsignedFactDataOffset);
+  DECLARE_OP2(DW_CFA_offset_extended, OT_Register, OT_UnsignedFactDataOffset);
+  DECLARE_OP2(DW_CFA_offset_extended_sf, OT_Register, OT_SignedFactDataOffset);
+  DECLARE_OP2(DW_CFA_val_offset, OT_Register, OT_UnsignedFactDataOffset);
+  DECLARE_OP2(DW_CFA_val_offset_sf, OT_Register, OT_SignedFactDataOffset);
+  DECLARE_OP2(DW_CFA_register, OT_Register, OT_Register);
+  DECLARE_OP2(DW_CFA_expression, OT_Register, OT_Expression);
+  DECLARE_OP2(DW_CFA_val_expression, OT_Register, OT_Expression);
+  DECLARE_OP1(DW_CFA_restore, OT_Register);
+  DECLARE_OP1(DW_CFA_restore_extended, OT_Register);
+  DECLARE_OP0(DW_CFA_remember_state);
+  DECLARE_OP0(DW_CFA_restore_state);
+  DECLARE_OP0(DW_CFA_GNU_window_save);
+  DECLARE_OP1(DW_CFA_GNU_args_size, OT_Offset);
+  DECLARE_OP0(DW_CFA_nop);
+
+#undef DECLARE_OP0
+#undef DECLARE_OP1
+#undef DECLARE_OP2
+  return ArrayRef<OperandType[2]>(&OpTypes[0], DW_CFA_restore+1);
+}
+
+static ArrayRef<OperandType[2]> OpTypes = getOperandTypes();
+
+/// \brief Print \p Opcode's operand number \p OperandIdx which has
+/// value \p Operand.
+static void printOperand(raw_ostream &OS, uint8_t Opcode, unsigned OperandIdx,
+                         uint64_t Operand, uint64_t CodeAlignmentFactor,
+                         int64_t DataAlignmentFactor) {
+  assert(OperandIdx < 2);
+  OperandType Type = OpTypes[Opcode][OperandIdx];
+
+  switch (Type) {
+  case OT_Unset:
+    OS << " Unsupported " << (OperandIdx ? "second" : "first") << " operand to";
+    if (const char *OpcodeName = CallFrameString(Opcode))
+      OS << " " << OpcodeName;
+    else
+      OS << format(" Opcode %x",  Opcode);
+    break;
+  case OT_None:
+    break;
+  case OT_Address:
+    OS << format(" %" PRIx64, Operand);
+    break;
+  case OT_Offset:
+    // The offsets are all encoded in a unsigned form, but in practice
+    // consumers use them signed. It's most certainly legacy due to
+    // the lack of signed variants in the first Dwarf standards.
+    OS << format(" %+" PRId64, int64_t(Operand));
+    break;
+  case OT_FactoredCodeOffset: // Always Unsigned
+    if (CodeAlignmentFactor)
+      OS << format(" %" PRId64, Operand * CodeAlignmentFactor);
+    else
+      OS << format(" %" PRId64 "*code_alignment_factor" , Operand);
+    break;
+  case OT_SignedFactDataOffset:
+    if (DataAlignmentFactor)
+      OS << format(" %" PRId64, int64_t(Operand) * DataAlignmentFactor);
+    else
+      OS << format(" %" PRId64 "*data_alignment_factor" , int64_t(Operand));
+    break;
+  case OT_UnsignedFactDataOffset:
+    if (DataAlignmentFactor)
+      OS << format(" %" PRId64, Operand * DataAlignmentFactor);
+    else
+      OS << format(" %" PRId64 "*data_alignment_factor" , Operand);
+    break;
+  case OT_Register:
+    OS << format(" reg%" PRId64, Operand);
+    break;
+  case OT_Expression:
+    OS << " expression";
+    break;
+  }
+}
+
+void FrameEntry::dumpInstructions(raw_ostream &OS) const {
+  uint64_t CodeAlignmentFactor = 0;
+  int64_t DataAlignmentFactor = 0;
+  const CIE *Cie = dyn_cast<CIE>(this);
+
+  if (!Cie)
+    Cie = cast<FDE>(this)->getLinkedCIE();
+  if (Cie) {
+    CodeAlignmentFactor = Cie->getCodeAlignmentFactor();
+    DataAlignmentFactor = Cie->getDataAlignmentFactor();
+  }
+
+  for (const auto &Instr : Instructions) {
+    uint8_t Opcode = Instr.Opcode;
+    if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK)
+      Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK;
+    OS << "  " << CallFrameString(Opcode) << ":";
+    for (unsigned i = 0; i < Instr.Ops.size(); ++i)
+      printOperand(OS, Opcode, i, Instr.Ops[i], CodeAlignmentFactor,
+                   DataAlignmentFactor);
+    OS << '\n';
+  }
+}
+
+DWARFDebugFrame::DWARFDebugFrame() {
+}
+
+DWARFDebugFrame::~DWARFDebugFrame() {
+}
+
+static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data,
+                                              uint32_t Offset, int Length) {
+  errs() << "DUMP: ";
+  for (int i = 0; i < Length; ++i) {
+    uint8_t c = Data.getU8(&Offset);
+    errs().write_hex(c); errs() << " ";
+  }
+  errs() << "\n";
+}
+
+
+void DWARFDebugFrame::parse(DataExtractor Data) {
+  uint32_t Offset = 0;
+  DenseMap<uint32_t, CIE *> CIEs;
+
+  while (Data.isValidOffset(Offset)) {
+    uint32_t StartOffset = Offset;
+
+    bool IsDWARF64 = false;
+    uint64_t Length = Data.getU32(&Offset);
+    uint64_t Id;
+
+    if (Length == UINT32_MAX) {
+      // DWARF-64 is distinguished by the first 32 bits of the initial length
+      // field being 0xffffffff. Then, the next 64 bits are the actual entry
+      // length.
+      IsDWARF64 = true;
+      Length = Data.getU64(&Offset);
+    }
+
+    // At this point, Offset points to the next field after Length.
+    // Length is the structure size excluding itself. Compute an offset one
+    // past the end of the structure (needed to know how many instructions to
+    // read).
+    // TODO: For honest DWARF64 support, DataExtractor will have to treat
+    //       offset_ptr as uint64_t*
+    uint32_t EndStructureOffset = Offset + static_cast<uint32_t>(Length);
+
+    // The Id field's size depends on the DWARF format
+    Id = Data.getUnsigned(&Offset, IsDWARF64 ? 8 : 4);
+    bool IsCIE = ((IsDWARF64 && Id == DW64_CIE_ID) || Id == DW_CIE_ID);
+
+    if (IsCIE) {
+      // Note: this is specifically DWARFv3 CIE header structure. It was
+      // changed in DWARFv4. We currently don't support reading DWARFv4
+      // here because LLVM itself does not emit it (and LLDB doesn't
+      // support it either).
+      uint8_t Version = Data.getU8(&Offset);
+      const char *Augmentation = Data.getCStr(&Offset);
+      uint64_t CodeAlignmentFactor = Data.getULEB128(&Offset);
+      int64_t DataAlignmentFactor = Data.getSLEB128(&Offset);
+      uint64_t ReturnAddressRegister = Data.getULEB128(&Offset);
+
+      auto Cie = make_unique<CIE>(StartOffset, Length, Version,
+                                  StringRef(Augmentation), CodeAlignmentFactor,
+                                  DataAlignmentFactor, ReturnAddressRegister);
+      CIEs[StartOffset] = Cie.get();
+      Entries.emplace_back(std::move(Cie));
+    } else {
+      // FDE
+      uint64_t CIEPointer = Id;
+      uint64_t InitialLocation = Data.getAddress(&Offset);
+      uint64_t AddressRange = Data.getAddress(&Offset);
+
+      Entries.emplace_back(new FDE(StartOffset, Length, CIEPointer,
+                                   InitialLocation, AddressRange,
+                                   CIEs[CIEPointer]));
+    }
+
+    Entries.back()->parseInstructions(Data, &Offset, EndStructureOffset);
+
+    if (Offset != EndStructureOffset) {
+      std::string Str;
+      raw_string_ostream OS(Str);
+      OS << format("Parsing entry instructions at %lx failed", StartOffset);
+      report_fatal_error(Str);
+    }
+  }
+}
+
+
+void DWARFDebugFrame::dump(raw_ostream &OS) const {
+  OS << "\n";
+  for (const auto &Entry : Entries) {
+    Entry->dumpHeader(OS);
+    Entry->dumpInstructions(OS);
+    OS << "\n";
+  }
+}
+
diff --git a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
new file mode 100644
index 0000000..e963b7c
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
@@ -0,0 +1,459 @@
+//===-- DWARFDebugInfoEntry.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SyntaxHighlighting.h"
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace dwarf;
+using namespace syntax;
+
+// Small helper to extract a DIE pointed by a reference
+// attribute. It looks up the Unit containing the DIE and calls
+// DIE.extractFast with the right unit. Returns new unit on success,
+// nullptr otherwise.
+static const DWARFUnit *findUnitAndExtractFast(DWARFDebugInfoEntryMinimal &DIE,
+                                               const DWARFUnit *Unit,
+                                               uint32_t *Offset) {
+  Unit = Unit->getUnitSection().getUnitForOffset(*Offset);
+  return (Unit && DIE.extractFast(Unit, Offset)) ? Unit : nullptr;
+}
+
+void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS, DWARFUnit *u,
+                                      unsigned recurseDepth,
+                                      unsigned indent) const {
+  DataExtractor debug_info_data = u->getDebugInfoExtractor();
+  uint32_t offset = Offset;
+
+  if (debug_info_data.isValidOffset(offset)) {
+    uint32_t abbrCode = debug_info_data.getULEB128(&offset);
+    WithColor(OS, syntax::Address).get() << format("\n0x%8.8x: ", Offset);
+
+    if (abbrCode) {
+      if (AbbrevDecl) {
+          const char *tagString = TagString(getTag());
+          if (tagString)
+            WithColor(OS, syntax::Tag).get().indent(indent) << tagString;
+          else
+            WithColor(OS, syntax::Tag).get().indent(indent) <<
+              format("DW_TAG_Unknown_%x", getTag());
+
+        OS << format(" [%u] %c\n", abbrCode,
+                     AbbrevDecl->hasChildren() ? '*' : ' ');
+
+        // Dump all data in the DIE for the attributes.
+        for (const auto &AttrSpec : AbbrevDecl->attributes()) {
+          dumpAttribute(OS, u, &offset, AttrSpec.Attr, AttrSpec.Form, indent);
+        }
+
+        const DWARFDebugInfoEntryMinimal *child = getFirstChild();
+        if (recurseDepth > 0 && child) {
+          while (child) {
+            child->dump(OS, u, recurseDepth-1, indent+2);
+            child = child->getSibling();
+          }
+        }
+      } else {
+        OS << "Abbreviation code not found in 'debug_abbrev' class for code: "
+           << abbrCode << '\n';
+      }
+    } else {
+      OS.indent(indent) << "NULL\n";
+    }
+  }
+}
+
+static void dumpApplePropertyAttribute(raw_ostream &OS, uint64_t Val) {
+  OS << " (";
+  do {
+    uint64_t Shift = countTrailingZeros(Val);
+    assert(Shift < 64 && "undefined behavior");
+    uint64_t Bit = 1ULL << Shift;
+    if (const char *PropName = ApplePropertyString(Bit))
+      OS << PropName;
+    else
+      OS << format("DW_APPLE_PROPERTY_0x%" PRIx64, Bit);
+    if (!(Val ^= Bit))
+      break;
+    OS << ", ";
+  } while (true);
+  OS << ")";
+}
+
+static void dumpRanges(raw_ostream &OS, const DWARFAddressRangesVector& Ranges,
+                       unsigned AddressSize, unsigned Indent) {
+  if (Ranges.empty())
+    return;
+
+  for (const auto &Range: Ranges) {
+    OS << '\n';
+    OS.indent(Indent);
+    OS << format("[0x%0*" PRIx64 " - 0x%0*" PRIx64 ")",
+                 AddressSize*2, Range.first,
+                 AddressSize*2, Range.second);
+  }
+}
+
+void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS,
+                                               DWARFUnit *u,
+                                               uint32_t *offset_ptr,
+                                               uint16_t attr, uint16_t form,
+                                               unsigned indent) const {
+  const char BaseIndent[] = "            ";
+  OS << BaseIndent;
+  OS.indent(indent+2);
+  const char *attrString = AttributeString(attr);
+  if (attrString)
+    WithColor(OS, syntax::Attribute) << attrString;
+  else
+    WithColor(OS, syntax::Attribute).get() << format("DW_AT_Unknown_%x", attr);
+
+  const char *formString = FormEncodingString(form);
+  if (formString)
+    OS << " [" << formString << ']';
+  else
+    OS << format(" [DW_FORM_Unknown_%x]", form);
+
+  DWARFFormValue formValue(form);
+
+  if (!formValue.extractValue(u->getDebugInfoExtractor(), offset_ptr, u))
+    return;
+
+  OS << "\t(";
+  
+  const char *Name = nullptr;
+  std::string File;
+  auto Color = syntax::Enumerator;
+  if (attr == DW_AT_decl_file || attr == DW_AT_call_file) {
+  Color = syntax::String;
+    if (const auto *LT = u->getContext().getLineTableForUnit(u))
+      if (LT->getFileNameByIndex(
+             formValue.getAsUnsignedConstant().getValue(),
+             u->getCompilationDir(),
+             DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, File)) {
+        File = '"' + File + '"';
+        Name = File.c_str();
+      }
+  } else if (Optional<uint64_t> Val = formValue.getAsUnsignedConstant())
+    Name = AttributeValueString(attr, *Val);
+
+  if (Name)
+    WithColor(OS, Color) << Name;
+  else if (attr == DW_AT_decl_line || attr == DW_AT_call_line)
+    OS << *formValue.getAsUnsignedConstant();
+  else
+    formValue.dump(OS, u);
+
+  // We have dumped the attribute raw value. For some attributes
+  // having both the raw value and the pretty-printed value is
+  // interesting. These attributes are handled below.
+  if ((attr == DW_AT_specification || attr == DW_AT_abstract_origin) &&
+      // The signature references aren't handled.
+      formValue.getForm() != DW_FORM_ref_sig8) {
+    uint32_t Ref = formValue.getAsReference(u).getValue();
+    DWARFDebugInfoEntryMinimal DIE;
+    if (const DWARFUnit *RefU = findUnitAndExtractFast(DIE, u, &Ref))
+      if (const char *Ref = DIE.getName(RefU, DINameKind::LinkageName))
+        OS << " \"" << Ref << '\"';
+  } else if (attr == DW_AT_APPLE_property_attribute) {
+    if (Optional<uint64_t> OptVal = formValue.getAsUnsignedConstant())
+      dumpApplePropertyAttribute(OS, *OptVal);
+  } else if (attr == DW_AT_ranges) {
+    dumpRanges(OS, getAddressRanges(u), u->getAddressByteSize(),
+               sizeof(BaseIndent)+indent+4);
+  }
+
+  OS << ")\n";
+}
+
+bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFUnit *U,
+                                             uint32_t *OffsetPtr) {
+  Offset = *OffsetPtr;
+  DataExtractor DebugInfoData = U->getDebugInfoExtractor();
+  uint32_t UEndOffset = U->getNextUnitOffset();
+  if (Offset >= UEndOffset || !DebugInfoData.isValidOffset(Offset))
+    return false;
+  uint64_t AbbrCode = DebugInfoData.getULEB128(OffsetPtr);
+  if (0 == AbbrCode) {
+    // NULL debug tag entry.
+    AbbrevDecl = nullptr;
+    return true;
+  }
+  AbbrevDecl = U->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
+  if (nullptr == AbbrevDecl) {
+    // Restore the original offset.
+    *OffsetPtr = Offset;
+    return false;
+  }
+  ArrayRef<uint8_t> FixedFormSizes = DWARFFormValue::getFixedFormSizes(
+      U->getAddressByteSize(), U->getVersion());
+  assert(FixedFormSizes.size() > 0);
+
+  // Skip all data in the .debug_info for the attributes
+  for (const auto &AttrSpec : AbbrevDecl->attributes()) {
+    uint16_t Form = AttrSpec.Form;
+
+    uint8_t FixedFormSize =
+        (Form < FixedFormSizes.size()) ? FixedFormSizes[Form] : 0;
+    if (FixedFormSize)
+      *OffsetPtr += FixedFormSize;
+    else if (!DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr, U)) {
+      // Restore the original offset.
+      *OffsetPtr = Offset;
+      return false;
+    }
+  }
+  return true;
+}
+
+bool DWARFDebugInfoEntryMinimal::isSubprogramDIE() const {
+  return getTag() == DW_TAG_subprogram;
+}
+
+bool DWARFDebugInfoEntryMinimal::isSubroutineDIE() const {
+  uint32_t Tag = getTag();
+  return Tag == DW_TAG_subprogram ||
+         Tag == DW_TAG_inlined_subroutine;
+}
+
+bool DWARFDebugInfoEntryMinimal::getAttributeValue(
+    const DWARFUnit *U, const uint16_t Attr, DWARFFormValue &FormValue) const {
+  if (!AbbrevDecl)
+    return false;
+
+  uint32_t AttrIdx = AbbrevDecl->findAttributeIndex(Attr);
+  if (AttrIdx == -1U)
+    return false;
+
+  DataExtractor DebugInfoData = U->getDebugInfoExtractor();
+  uint32_t DebugInfoOffset = getOffset();
+
+  // Skip the abbreviation code so we are at the data for the attributes
+  DebugInfoData.getULEB128(&DebugInfoOffset);
+
+  // Skip preceding attribute values.
+  for (uint32_t i = 0; i < AttrIdx; ++i) {
+    DWARFFormValue::skipValue(AbbrevDecl->getFormByIndex(i),
+                              DebugInfoData, &DebugInfoOffset, U);
+  }
+
+  FormValue = DWARFFormValue(AbbrevDecl->getFormByIndex(AttrIdx));
+  return FormValue.extractValue(DebugInfoData, &DebugInfoOffset, U);
+}
+
+const char *DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
+    const DWARFUnit *U, const uint16_t Attr, const char *FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<const char *> Result = FormValue.getAsCString(U);
+  return Result.hasValue() ? Result.getValue() : FailValue;
+}
+
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsAddress(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsAddress(U);
+  return Result.hasValue() ? Result.getValue() : FailValue;
+}
+
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsignedConstant(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsUnsignedConstant();
+  return Result.hasValue() ? Result.getValue() : FailValue;
+}
+
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsReference(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsReference(U);
+  return Result.hasValue() ? Result.getValue() : FailValue;
+}
+
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsSectionOffset(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsSectionOffset();
+  return Result.hasValue() ? Result.getValue() : FailValue;
+}
+
+uint64_t
+DWARFDebugInfoEntryMinimal::getRangesBaseAttribute(const DWARFUnit *U,
+                                                   uint64_t FailValue) const {
+  uint64_t Result =
+      getAttributeValueAsSectionOffset(U, DW_AT_ranges_base, -1ULL);
+  if (Result != -1ULL)
+    return Result;
+  return getAttributeValueAsSectionOffset(U, DW_AT_GNU_ranges_base, FailValue);
+}
+
+bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFUnit *U,
+                                                 uint64_t &LowPC,
+                                                 uint64_t &HighPC) const {
+  LowPC = getAttributeValueAsAddress(U, DW_AT_low_pc, -1ULL);
+  if (LowPC == -1ULL)
+    return false;
+  HighPC = getAttributeValueAsAddress(U, DW_AT_high_pc, -1ULL);
+  if (HighPC == -1ULL) {
+    // Since DWARF4, DW_AT_high_pc may also be of class constant, in which case
+    // it represents function size.
+    HighPC = getAttributeValueAsUnsignedConstant(U, DW_AT_high_pc, -1ULL);
+    if (HighPC != -1ULL)
+      HighPC += LowPC;
+  }
+  return (HighPC != -1ULL);
+}
+
+DWARFAddressRangesVector
+DWARFDebugInfoEntryMinimal::getAddressRanges(const DWARFUnit *U) const {
+  if (isNULL())
+    return DWARFAddressRangesVector();
+  // Single range specified by low/high PC.
+  uint64_t LowPC, HighPC;
+  if (getLowAndHighPC(U, LowPC, HighPC)) {
+    return DWARFAddressRangesVector(1, std::make_pair(LowPC, HighPC));
+  }
+  // Multiple ranges from .debug_ranges section.
+  uint32_t RangesOffset =
+      getAttributeValueAsSectionOffset(U, DW_AT_ranges, -1U);
+  if (RangesOffset != -1U) {
+    DWARFDebugRangeList RangeList;
+    if (U->extractRangeList(RangesOffset, RangeList))
+      return RangeList.getAbsoluteRanges(U->getBaseAddress());
+  }
+  return DWARFAddressRangesVector();
+}
+
+void DWARFDebugInfoEntryMinimal::collectChildrenAddressRanges(
+    const DWARFUnit *U, DWARFAddressRangesVector& Ranges) const {
+  if (isNULL())
+    return;
+  if (isSubprogramDIE()) {
+    const auto &DIERanges = getAddressRanges(U);
+    Ranges.insert(Ranges.end(), DIERanges.begin(), DIERanges.end());
+  }
+
+  const DWARFDebugInfoEntryMinimal *Child = getFirstChild();
+  while (Child) {
+    Child->collectChildrenAddressRanges(U, Ranges);
+    Child = Child->getSibling();
+  }
+}
+
+bool DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
+    const DWARFUnit *U, const uint64_t Address) const {
+  for (const auto& R : getAddressRanges(U)) {
+    if (R.first <= Address && Address < R.second)
+      return true;
+  }
+  return false;
+}
+
+const char *
+DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFUnit *U,
+                                              DINameKind Kind) const {
+  if (!isSubroutineDIE())
+    return nullptr;
+  return getName(U, Kind);
+}
+
+const char *
+DWARFDebugInfoEntryMinimal::getName(const DWARFUnit *U,
+                                    DINameKind Kind) const {
+  if (Kind == DINameKind::None)
+    return nullptr;
+  // Try to get mangled name only if it was asked for.
+  if (Kind == DINameKind::LinkageName) {
+    if (const char *name =
+            getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, nullptr))
+      return name;
+    if (const char *name =
+            getAttributeValueAsString(U, DW_AT_linkage_name, nullptr))
+      return name;
+  }
+  if (const char *name = getAttributeValueAsString(U, DW_AT_name, nullptr))
+    return name;
+  // Try to get name from specification DIE.
+  uint32_t spec_ref =
+      getAttributeValueAsReference(U, DW_AT_specification, -1U);
+  if (spec_ref != -1U) {
+    DWARFDebugInfoEntryMinimal spec_die;
+    if (const DWARFUnit *RefU = findUnitAndExtractFast(spec_die, U, &spec_ref)) {
+      if (const char *name = spec_die.getName(RefU, Kind))
+        return name;
+    }
+  }
+  // Try to get name from abstract origin DIE.
+  uint32_t abs_origin_ref =
+      getAttributeValueAsReference(U, DW_AT_abstract_origin, -1U);
+  if (abs_origin_ref != -1U) {
+    DWARFDebugInfoEntryMinimal abs_origin_die;
+    if (const DWARFUnit *RefU = findUnitAndExtractFast(abs_origin_die, U,
+                                                       &abs_origin_ref)) {
+      if (const char *name = abs_origin_die.getName(RefU, Kind))
+        return name;
+    }
+  }
+  return nullptr;
+}
+
+void DWARFDebugInfoEntryMinimal::getCallerFrame(const DWARFUnit *U,
+                                                uint32_t &CallFile,
+                                                uint32_t &CallLine,
+                                                uint32_t &CallColumn) const {
+  CallFile = getAttributeValueAsUnsignedConstant(U, DW_AT_call_file, 0);
+  CallLine = getAttributeValueAsUnsignedConstant(U, DW_AT_call_line, 0);
+  CallColumn = getAttributeValueAsUnsignedConstant(U, DW_AT_call_column, 0);
+}
+
+DWARFDebugInfoEntryInlinedChain
+DWARFDebugInfoEntryMinimal::getInlinedChainForAddress(
+    const DWARFUnit *U, const uint64_t Address) const {
+  DWARFDebugInfoEntryInlinedChain InlinedChain;
+  InlinedChain.U = U;
+  if (isNULL())
+    return InlinedChain;
+  for (const DWARFDebugInfoEntryMinimal *DIE = this; DIE; ) {
+    // Append current DIE to inlined chain only if it has correct tag
+    // (e.g. it is not a lexical block).
+    if (DIE->isSubroutineDIE()) {
+      InlinedChain.DIEs.push_back(*DIE);
+    }
+    // Try to get child which also contains provided address.
+    const DWARFDebugInfoEntryMinimal *Child = DIE->getFirstChild();
+    while (Child) {
+      if (Child->addressRangeContainsAddress(U, Address)) {
+        // Assume there is only one such child.
+        break;
+      }
+      Child = Child->getSibling();
+    }
+    DIE = Child;
+  }
+  // Reverse the obtained chain to make the root of inlined chain last.
+  std::reverse(InlinedChain.DIEs.begin(), InlinedChain.DIEs.end());
+  return InlinedChain;
+}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
new file mode 100644
index 0000000..b63af6a
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -0,0 +1,698 @@
+//===-- DWARFDebugLine.cpp ------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+using namespace dwarf;
+typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
+
+DWARFDebugLine::Prologue::Prologue() {
+  clear();
+}
+
+void DWARFDebugLine::Prologue::clear() {
+  TotalLength = Version = PrologueLength = 0;
+  MinInstLength = MaxOpsPerInst = DefaultIsStmt = LineBase = LineRange = 0;
+  OpcodeBase = 0;
+  StandardOpcodeLengths.clear();
+  IncludeDirectories.clear();
+  FileNames.clear();
+}
+
+void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const {
+  OS << "Line table prologue:\n"
+     << format("    total_length: 0x%8.8x\n", TotalLength)
+     << format("         version: %u\n", Version)
+     << format(" prologue_length: 0x%8.8x\n", PrologueLength)
+     << format(" min_inst_length: %u\n", MinInstLength)
+     << format(Version >= 4 ? "max_ops_per_inst: %u\n" : "", MaxOpsPerInst)
+     << format(" default_is_stmt: %u\n", DefaultIsStmt)
+     << format("       line_base: %i\n", LineBase)
+     << format("      line_range: %u\n", LineRange)
+     << format("     opcode_base: %u\n", OpcodeBase);
+
+  for (uint32_t i = 0; i < StandardOpcodeLengths.size(); ++i)
+    OS << format("standard_opcode_lengths[%s] = %u\n", LNStandardString(i+1),
+                 StandardOpcodeLengths[i]);
+
+  if (!IncludeDirectories.empty())
+    for (uint32_t i = 0; i < IncludeDirectories.size(); ++i)
+      OS << format("include_directories[%3u] = '", i+1)
+         << IncludeDirectories[i] << "'\n";
+
+  if (!FileNames.empty()) {
+    OS << "                Dir  Mod Time   File Len   File Name\n"
+       << "                ---- ---------- ---------- -----------"
+          "----------------\n";
+    for (uint32_t i = 0; i < FileNames.size(); ++i) {
+      const FileNameEntry& fileEntry = FileNames[i];
+      OS << format("file_names[%3u] %4" PRIu64 " ", i+1, fileEntry.DirIdx)
+         << format("0x%8.8" PRIx64 " 0x%8.8" PRIx64 " ",
+                   fileEntry.ModTime, fileEntry.Length)
+         << fileEntry.Name << '\n';
+    }
+  }
+}
+
+bool DWARFDebugLine::Prologue::parse(DataExtractor debug_line_data,
+                                     uint32_t *offset_ptr) {
+  const uint32_t prologue_offset = *offset_ptr;
+
+  clear();
+  TotalLength = debug_line_data.getU32(offset_ptr);
+  Version = debug_line_data.getU16(offset_ptr);
+  if (Version < 2)
+    return false;
+
+  PrologueLength = debug_line_data.getU32(offset_ptr);
+  const uint32_t end_prologue_offset = PrologueLength + *offset_ptr;
+  MinInstLength = debug_line_data.getU8(offset_ptr);
+  if (Version >= 4)
+    MaxOpsPerInst = debug_line_data.getU8(offset_ptr);
+  DefaultIsStmt = debug_line_data.getU8(offset_ptr);
+  LineBase = debug_line_data.getU8(offset_ptr);
+  LineRange = debug_line_data.getU8(offset_ptr);
+  OpcodeBase = debug_line_data.getU8(offset_ptr);
+
+  StandardOpcodeLengths.reserve(OpcodeBase - 1);
+  for (uint32_t i = 1; i < OpcodeBase; ++i) {
+    uint8_t op_len = debug_line_data.getU8(offset_ptr);
+    StandardOpcodeLengths.push_back(op_len);
+  }
+
+  while (*offset_ptr < end_prologue_offset) {
+    const char *s = debug_line_data.getCStr(offset_ptr);
+    if (s && s[0])
+      IncludeDirectories.push_back(s);
+    else
+      break;
+  }
+
+  while (*offset_ptr < end_prologue_offset) {
+    const char *name = debug_line_data.getCStr(offset_ptr);
+    if (name && name[0]) {
+      FileNameEntry fileEntry;
+      fileEntry.Name = name;
+      fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr);
+      fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr);
+      fileEntry.Length = debug_line_data.getULEB128(offset_ptr);
+      FileNames.push_back(fileEntry);
+    } else {
+      break;
+    }
+  }
+
+  if (*offset_ptr != end_prologue_offset) {
+    fprintf(stderr, "warning: parsing line table prologue at 0x%8.8x should"
+                    " have ended at 0x%8.8x but it ended at 0x%8.8x\n",
+            prologue_offset, end_prologue_offset, *offset_ptr);
+    return false;
+  }
+  return true;
+}
+
+DWARFDebugLine::Row::Row(bool default_is_stmt) {
+  reset(default_is_stmt);
+}
+
+void DWARFDebugLine::Row::postAppend() {
+  BasicBlock = false;
+  PrologueEnd = false;
+  EpilogueBegin = false;
+}
+
+void DWARFDebugLine::Row::reset(bool default_is_stmt) {
+  Address = 0;
+  Line = 1;
+  Column = 0;
+  File = 1;
+  Isa = 0;
+  Discriminator = 0;
+  IsStmt = default_is_stmt;
+  BasicBlock = false;
+  EndSequence = false;
+  PrologueEnd = false;
+  EpilogueBegin = false;
+}
+
+void DWARFDebugLine::Row::dump(raw_ostream &OS) const {
+  OS << format("0x%16.16" PRIx64 " %6u %6u", Address, Line, Column)
+     << format(" %6u %3u %13u ", File, Isa, Discriminator)
+     << (IsStmt ? " is_stmt" : "")
+     << (BasicBlock ? " basic_block" : "")
+     << (PrologueEnd ? " prologue_end" : "")
+     << (EpilogueBegin ? " epilogue_begin" : "")
+     << (EndSequence ? " end_sequence" : "")
+     << '\n';
+}
+
+DWARFDebugLine::Sequence::Sequence() {
+  reset();
+}
+
+void DWARFDebugLine::Sequence::reset() {
+  LowPC = 0;
+  HighPC = 0;
+  FirstRowIndex = 0;
+  LastRowIndex = 0;
+  Empty = true;
+}
+
+DWARFDebugLine::LineTable::LineTable() {
+  clear();
+}
+
+void DWARFDebugLine::LineTable::dump(raw_ostream &OS) const {
+  Prologue.dump(OS);
+  OS << '\n';
+
+  if (!Rows.empty()) {
+    OS << "Address            Line   Column File   ISA Discriminator Flags\n"
+       << "------------------ ------ ------ ------ --- ------------- "
+          "-------------\n";
+    for (const Row &R : Rows) {
+      R.dump(OS);
+    }
+  }
+}
+
+void DWARFDebugLine::LineTable::clear() {
+  Prologue.clear();
+  Rows.clear();
+  Sequences.clear();
+}
+
+DWARFDebugLine::ParsingState::ParsingState(struct LineTable *LT)
+    : LineTable(LT), RowNumber(0) {
+  resetRowAndSequence();
+}
+
+void DWARFDebugLine::ParsingState::resetRowAndSequence() {
+  Row.reset(LineTable->Prologue.DefaultIsStmt);
+  Sequence.reset();
+}
+
+void DWARFDebugLine::ParsingState::appendRowToMatrix(uint32_t offset) {
+  if (Sequence.Empty) {
+    // Record the beginning of instruction sequence.
+    Sequence.Empty = false;
+    Sequence.LowPC = Row.Address;
+    Sequence.FirstRowIndex = RowNumber;
+  }
+  ++RowNumber;
+  LineTable->appendRow(Row);
+  if (Row.EndSequence) {
+    // Record the end of instruction sequence.
+    Sequence.HighPC = Row.Address;
+    Sequence.LastRowIndex = RowNumber;
+    if (Sequence.isValid())
+      LineTable->appendSequence(Sequence);
+    Sequence.reset();
+  }
+  Row.postAppend();
+}
+
+const DWARFDebugLine::LineTable *
+DWARFDebugLine::getLineTable(uint32_t offset) const {
+  LineTableConstIter pos = LineTableMap.find(offset);
+  if (pos != LineTableMap.end())
+    return &pos->second;
+  return nullptr;
+}
+
+const DWARFDebugLine::LineTable *
+DWARFDebugLine::getOrParseLineTable(DataExtractor debug_line_data,
+                                    uint32_t offset) {
+  std::pair<LineTableIter, bool> pos =
+    LineTableMap.insert(LineTableMapTy::value_type(offset, LineTable()));
+  LineTable *LT = &pos.first->second;
+  if (pos.second) {
+    if (!LT->parse(debug_line_data, RelocMap, &offset))
+      return nullptr;
+  }
+  return LT;
+}
+
+bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
+                                      const RelocAddrMap *RMap,
+                                      uint32_t *offset_ptr) {
+  const uint32_t debug_line_offset = *offset_ptr;
+
+  clear();
+
+  if (!Prologue.parse(debug_line_data, offset_ptr)) {
+    // Restore our offset and return false to indicate failure!
+    *offset_ptr = debug_line_offset;
+    return false;
+  }
+
+  const uint32_t end_offset = debug_line_offset + Prologue.TotalLength +
+                              sizeof(Prologue.TotalLength);
+
+  ParsingState State(this);
+
+  while (*offset_ptr < end_offset) {
+    uint8_t opcode = debug_line_data.getU8(offset_ptr);
+
+    if (opcode == 0) {
+      // Extended Opcodes always start with a zero opcode followed by
+      // a uleb128 length so you can skip ones you don't know about
+      uint32_t ext_offset = *offset_ptr;
+      uint64_t len = debug_line_data.getULEB128(offset_ptr);
+      uint32_t arg_size = len - (*offset_ptr - ext_offset);
+
+      uint8_t sub_opcode = debug_line_data.getU8(offset_ptr);
+      switch (sub_opcode) {
+      case DW_LNE_end_sequence:
+        // Set the end_sequence register of the state machine to true and
+        // append a row to the matrix using the current values of the
+        // state-machine registers. Then reset the registers to the initial
+        // values specified above. Every statement program sequence must end
+        // with a DW_LNE_end_sequence instruction which creates a row whose
+        // address is that of the byte after the last target machine instruction
+        // of the sequence.
+        State.Row.EndSequence = true;
+        State.appendRowToMatrix(*offset_ptr);
+        State.resetRowAndSequence();
+        break;
+
+      case DW_LNE_set_address:
+        // Takes a single relocatable address as an operand. The size of the
+        // operand is the size appropriate to hold an address on the target
+        // machine. Set the address register to the value given by the
+        // relocatable address. All of the other statement program opcodes
+        // that affect the address register add a delta to it. This instruction
+        // stores a relocatable value into it instead.
+        {
+          // If this address is in our relocation map, apply the relocation.
+          RelocAddrMap::const_iterator AI = RMap->find(*offset_ptr);
+          if (AI != RMap->end()) {
+             const std::pair<uint8_t, int64_t> &R = AI->second;
+             State.Row.Address =
+                 debug_line_data.getAddress(offset_ptr) + R.second;
+          } else
+            State.Row.Address = debug_line_data.getAddress(offset_ptr);
+        }
+        break;
+
+      case DW_LNE_define_file:
+        // Takes 4 arguments. The first is a null terminated string containing
+        // a source file name. The second is an unsigned LEB128 number
+        // representing the directory index of the directory in which the file
+        // was found. The third is an unsigned LEB128 number representing the
+        // time of last modification of the file. The fourth is an unsigned
+        // LEB128 number representing the length in bytes of the file. The time
+        // and length fields may contain LEB128(0) if the information is not
+        // available.
+        //
+        // The directory index represents an entry in the include_directories
+        // section of the statement program prologue. The index is LEB128(0)
+        // if the file was found in the current directory of the compilation,
+        // LEB128(1) if it was found in the first directory in the
+        // include_directories section, and so on. The directory index is
+        // ignored for file names that represent full path names.
+        //
+        // The files are numbered, starting at 1, in the order in which they
+        // appear; the names in the prologue come before names defined by
+        // the DW_LNE_define_file instruction. These numbers are used in the
+        // the file register of the state machine.
+        {
+          FileNameEntry fileEntry;
+          fileEntry.Name = debug_line_data.getCStr(offset_ptr);
+          fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr);
+          fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr);
+          fileEntry.Length = debug_line_data.getULEB128(offset_ptr);
+          Prologue.FileNames.push_back(fileEntry);
+        }
+        break;
+
+      case DW_LNE_set_discriminator:
+        State.Row.Discriminator = debug_line_data.getULEB128(offset_ptr);
+        break;
+
+      default:
+        // Length doesn't include the zero opcode byte or the length itself, but
+        // it does include the sub_opcode, so we have to adjust for that below
+        (*offset_ptr) += arg_size;
+        break;
+      }
+    } else if (opcode < Prologue.OpcodeBase) {
+      switch (opcode) {
+      // Standard Opcodes
+      case DW_LNS_copy:
+        // Takes no arguments. Append a row to the matrix using the
+        // current values of the state-machine registers. Then set
+        // the basic_block register to false.
+        State.appendRowToMatrix(*offset_ptr);
+        break;
+
+      case DW_LNS_advance_pc:
+        // Takes a single unsigned LEB128 operand, multiplies it by the
+        // min_inst_length field of the prologue, and adds the
+        // result to the address register of the state machine.
+        State.Row.Address +=
+            debug_line_data.getULEB128(offset_ptr) * Prologue.MinInstLength;
+        break;
+
+      case DW_LNS_advance_line:
+        // Takes a single signed LEB128 operand and adds that value to
+        // the line register of the state machine.
+        State.Row.Line += debug_line_data.getSLEB128(offset_ptr);
+        break;
+
+      case DW_LNS_set_file:
+        // Takes a single unsigned LEB128 operand and stores it in the file
+        // register of the state machine.
+        State.Row.File = debug_line_data.getULEB128(offset_ptr);
+        break;
+
+      case DW_LNS_set_column:
+        // Takes a single unsigned LEB128 operand and stores it in the
+        // column register of the state machine.
+        State.Row.Column = debug_line_data.getULEB128(offset_ptr);
+        break;
+
+      case DW_LNS_negate_stmt:
+        // Takes no arguments. Set the is_stmt register of the state
+        // machine to the logical negation of its current value.
+        State.Row.IsStmt = !State.Row.IsStmt;
+        break;
+
+      case DW_LNS_set_basic_block:
+        // Takes no arguments. Set the basic_block register of the
+        // state machine to true
+        State.Row.BasicBlock = true;
+        break;
+
+      case DW_LNS_const_add_pc:
+        // Takes no arguments. Add to the address register of the state
+        // machine the address increment value corresponding to special
+        // opcode 255. The motivation for DW_LNS_const_add_pc is this:
+        // when the statement program needs to advance the address by a
+        // small amount, it can use a single special opcode, which occupies
+        // a single byte. When it needs to advance the address by up to
+        // twice the range of the last special opcode, it can use
+        // DW_LNS_const_add_pc followed by a special opcode, for a total
+        // of two bytes. Only if it needs to advance the address by more
+        // than twice that range will it need to use both DW_LNS_advance_pc
+        // and a special opcode, requiring three or more bytes.
+        {
+          uint8_t adjust_opcode = 255 - Prologue.OpcodeBase;
+          uint64_t addr_offset =
+              (adjust_opcode / Prologue.LineRange) * Prologue.MinInstLength;
+          State.Row.Address += addr_offset;
+        }
+        break;
+
+      case DW_LNS_fixed_advance_pc:
+        // Takes a single uhalf operand. Add to the address register of
+        // the state machine the value of the (unencoded) operand. This
+        // is the only extended opcode that takes an argument that is not
+        // a variable length number. The motivation for DW_LNS_fixed_advance_pc
+        // is this: existing assemblers cannot emit DW_LNS_advance_pc or
+        // special opcodes because they cannot encode LEB128 numbers or
+        // judge when the computation of a special opcode overflows and
+        // requires the use of DW_LNS_advance_pc. Such assemblers, however,
+        // can use DW_LNS_fixed_advance_pc instead, sacrificing compression.
+        State.Row.Address += debug_line_data.getU16(offset_ptr);
+        break;
+
+      case DW_LNS_set_prologue_end:
+        // Takes no arguments. Set the prologue_end register of the
+        // state machine to true
+        State.Row.PrologueEnd = true;
+        break;
+
+      case DW_LNS_set_epilogue_begin:
+        // Takes no arguments. Set the basic_block register of the
+        // state machine to true
+        State.Row.EpilogueBegin = true;
+        break;
+
+      case DW_LNS_set_isa:
+        // Takes a single unsigned LEB128 operand and stores it in the
+        // column register of the state machine.
+        State.Row.Isa = debug_line_data.getULEB128(offset_ptr);
+        break;
+
+      default:
+        // Handle any unknown standard opcodes here. We know the lengths
+        // of such opcodes because they are specified in the prologue
+        // as a multiple of LEB128 operands for each opcode.
+        {
+          assert(opcode - 1U < Prologue.StandardOpcodeLengths.size());
+          uint8_t opcode_length = Prologue.StandardOpcodeLengths[opcode - 1];
+          for (uint8_t i = 0; i < opcode_length; ++i)
+            debug_line_data.getULEB128(offset_ptr);
+        }
+        break;
+      }
+    } else {
+      // Special Opcodes
+
+      // A special opcode value is chosen based on the amount that needs
+      // to be added to the line and address registers. The maximum line
+      // increment for a special opcode is the value of the line_base
+      // field in the header, plus the value of the line_range field,
+      // minus 1 (line base + line range - 1). If the desired line
+      // increment is greater than the maximum line increment, a standard
+      // opcode must be used instead of a special opcode. The "address
+      // advance" is calculated by dividing the desired address increment
+      // by the minimum_instruction_length field from the header. The
+      // special opcode is then calculated using the following formula:
+      //
+      //  opcode = (desired line increment - line_base) +
+      //           (line_range * address advance) + opcode_base
+      //
+      // If the resulting opcode is greater than 255, a standard opcode
+      // must be used instead.
+      //
+      // To decode a special opcode, subtract the opcode_base from the
+      // opcode itself to give the adjusted opcode. The amount to
+      // increment the address register is the result of the adjusted
+      // opcode divided by the line_range multiplied by the
+      // minimum_instruction_length field from the header. That is:
+      //
+      //  address increment = (adjusted opcode / line_range) *
+      //                      minimum_instruction_length
+      //
+      // The amount to increment the line register is the line_base plus
+      // the result of the adjusted opcode modulo the line_range. That is:
+      //
+      // line increment = line_base + (adjusted opcode % line_range)
+
+      uint8_t adjust_opcode = opcode - Prologue.OpcodeBase;
+      uint64_t addr_offset =
+          (adjust_opcode / Prologue.LineRange) * Prologue.MinInstLength;
+      int32_t line_offset =
+          Prologue.LineBase + (adjust_opcode % Prologue.LineRange);
+      State.Row.Line += line_offset;
+      State.Row.Address += addr_offset;
+      State.appendRowToMatrix(*offset_ptr);
+    }
+  }
+
+  if (!State.Sequence.Empty) {
+    fprintf(stderr, "warning: last sequence in debug line table is not"
+                    "terminated!\n");
+  }
+
+  // Sort all sequences so that address lookup will work faster.
+  if (!Sequences.empty()) {
+    std::sort(Sequences.begin(), Sequences.end(), Sequence::orderByLowPC);
+    // Note: actually, instruction address ranges of sequences should not
+    // overlap (in shared objects and executables). If they do, the address
+    // lookup would still work, though, but result would be ambiguous.
+    // We don't report warning in this case. For example,
+    // sometimes .so compiled from multiple object files contains a few
+    // rudimentary sequences for address ranges [0x0, 0xsomething).
+  }
+
+  return end_offset;
+}
+
+uint32_t DWARFDebugLine::LineTable::lookupAddress(uint64_t address) const {
+  uint32_t unknown_index = UINT32_MAX;
+  if (Sequences.empty())
+    return unknown_index;
+  // First, find an instruction sequence containing the given address.
+  DWARFDebugLine::Sequence sequence;
+  sequence.LowPC = address;
+  SequenceIter first_seq = Sequences.begin();
+  SequenceIter last_seq = Sequences.end();
+  SequenceIter seq_pos = std::lower_bound(first_seq, last_seq, sequence,
+      DWARFDebugLine::Sequence::orderByLowPC);
+  DWARFDebugLine::Sequence found_seq;
+  if (seq_pos == last_seq) {
+    found_seq = Sequences.back();
+  } else if (seq_pos->LowPC == address) {
+    found_seq = *seq_pos;
+  } else {
+    if (seq_pos == first_seq)
+      return unknown_index;
+    found_seq = *(seq_pos - 1);
+  }
+  if (!found_seq.containsPC(address))
+    return unknown_index;
+  // Search for instruction address in the rows describing the sequence.
+  // Rows are stored in a vector, so we may use arithmetical operations with
+  // iterators.
+  DWARFDebugLine::Row row;
+  row.Address = address;
+  RowIter first_row = Rows.begin() + found_seq.FirstRowIndex;
+  RowIter last_row = Rows.begin() + found_seq.LastRowIndex;
+  RowIter row_pos = std::lower_bound(first_row, last_row, row,
+      DWARFDebugLine::Row::orderByAddress);
+  if (row_pos == last_row) {
+    return found_seq.LastRowIndex - 1;
+  }
+  uint32_t index = found_seq.FirstRowIndex + (row_pos - first_row);
+  if (row_pos->Address > address) {
+    if (row_pos == first_row)
+      return unknown_index;
+    else
+      index--;
+  }
+  return index;
+}
+
+bool DWARFDebugLine::LineTable::lookupAddressRange(
+    uint64_t address, uint64_t size, std::vector<uint32_t> &result) const {
+  if (Sequences.empty())
+    return false;
+  uint64_t end_addr = address + size;
+  // First, find an instruction sequence containing the given address.
+  DWARFDebugLine::Sequence sequence;
+  sequence.LowPC = address;
+  SequenceIter first_seq = Sequences.begin();
+  SequenceIter last_seq = Sequences.end();
+  SequenceIter seq_pos = std::lower_bound(first_seq, last_seq, sequence,
+      DWARFDebugLine::Sequence::orderByLowPC);
+  if (seq_pos == last_seq || seq_pos->LowPC != address) {
+    if (seq_pos == first_seq)
+      return false;
+    seq_pos--;
+  }
+  if (!seq_pos->containsPC(address))
+    return false;
+
+  SequenceIter start_pos = seq_pos;
+
+  // Add the rows from the first sequence to the vector, starting with the
+  // index we just calculated
+
+  while (seq_pos != last_seq && seq_pos->LowPC < end_addr) {
+    DWARFDebugLine::Sequence cur_seq = *seq_pos;
+    uint32_t first_row_index;
+    uint32_t last_row_index;
+    if (seq_pos == start_pos) {
+      // For the first sequence, we need to find which row in the sequence is the
+      // first in our range. Rows are stored in a vector, so we may use
+      // arithmetical operations with iterators.
+      DWARFDebugLine::Row row;
+      row.Address = address;
+      RowIter first_row = Rows.begin() + cur_seq.FirstRowIndex;
+      RowIter last_row = Rows.begin() + cur_seq.LastRowIndex;
+      RowIter row_pos = std::upper_bound(first_row, last_row, row,
+                                         DWARFDebugLine::Row::orderByAddress);
+      // The 'row_pos' iterator references the first row that is greater than
+      // our start address. Unless that's the first row, we want to start at
+      // the row before that.
+      first_row_index = cur_seq.FirstRowIndex + (row_pos - first_row);
+      if (row_pos != first_row)
+        --first_row_index;
+    } else
+      first_row_index = cur_seq.FirstRowIndex;
+
+    // For the last sequence in our range, we need to figure out the last row in
+    // range.  For all other sequences we can go to the end of the sequence.
+    if (cur_seq.HighPC > end_addr) {
+      DWARFDebugLine::Row row;
+      row.Address = end_addr;
+      RowIter first_row = Rows.begin() + cur_seq.FirstRowIndex;
+      RowIter last_row = Rows.begin() + cur_seq.LastRowIndex;
+      RowIter row_pos = std::upper_bound(first_row, last_row, row,
+                                         DWARFDebugLine::Row::orderByAddress);
+      // The 'row_pos' iterator references the first row that is greater than
+      // our end address.  The row before that is the last row we want.
+      last_row_index = cur_seq.FirstRowIndex + (row_pos - first_row) - 1;
+    } else
+      // Contrary to what you might expect, DWARFDebugLine::SequenceLastRowIndex
+      // isn't a valid index within the current sequence.  It's that plus one.
+      last_row_index = cur_seq.LastRowIndex - 1;
+
+    for (uint32_t i = first_row_index; i <= last_row_index; ++i) {
+      result.push_back(i);
+    }
+
+    ++seq_pos;
+  }
+
+  return true;
+}
+
+bool
+DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
+                                              const char *CompDir,
+                                              FileLineInfoKind Kind,
+                                              std::string &Result) const {
+  if (FileIndex == 0 || FileIndex > Prologue.FileNames.size() ||
+      Kind == FileLineInfoKind::None)
+    return false;
+  const FileNameEntry &Entry = Prologue.FileNames[FileIndex - 1];
+  const char *FileName = Entry.Name;
+  if (Kind != FileLineInfoKind::AbsoluteFilePath ||
+      sys::path::is_absolute(FileName)) {
+    Result = FileName;
+    return true;
+  }
+
+  SmallString<16> FilePath;
+  uint64_t IncludeDirIndex = Entry.DirIdx;
+  const char *IncludeDir = "";
+  // Be defensive about the contents of Entry.
+  if (IncludeDirIndex > 0 &&
+      IncludeDirIndex <= Prologue.IncludeDirectories.size())
+    IncludeDir = Prologue.IncludeDirectories[IncludeDirIndex - 1];
+
+  // We may still need to append compilation directory of compile unit.
+  // We know that FileName is not absolute, the only way to have an
+  // absolute path at this point would be if IncludeDir is absolute.
+  if (CompDir && Kind == FileLineInfoKind::AbsoluteFilePath &&
+      sys::path::is_relative(IncludeDir))
+    sys::path::append(FilePath, CompDir);
+
+  // sys::path::append skips empty strings.
+  sys::path::append(FilePath, IncludeDir, FileName);
+  Result = FilePath.str();
+  return true;
+}
+
+bool
+DWARFDebugLine::LineTable::getFileLineInfoForAddress(uint64_t Address,
+                                                     const char *CompDir,
+                                                     FileLineInfoKind Kind,
+                                                     DILineInfo &Result) const {
+  // Get the index of row we're looking for in the line table.
+  uint32_t RowIndex = lookupAddress(Address);
+  if (RowIndex == -1U)
+    return false;
+  // Take file number and line/column from the row.
+  const auto &Row = Rows[RowIndex];
+  if (!getFileNameByIndex(Row.File, CompDir, Kind, Result.FileName))
+    return false;
+  Result.Line = Row.Line;
+  Result.Column = Row.Column;
+  return true;
+}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
new file mode 100644
index 0000000..fdb6dd2
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -0,0 +1,128 @@
+//===-- DWARFDebugLoc.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void DWARFDebugLoc::dump(raw_ostream &OS) const {
+  for (const LocationList &L : Locations) {
+    OS << format("0x%8.8x: ", L.Offset);
+    const unsigned Indent = 12;
+    for (const Entry &E : L.Entries) {
+      if (&E != L.Entries.begin())
+        OS.indent(Indent);
+      OS << "Beginning address offset: " << format("0x%016" PRIx64, E.Begin)
+         << '\n';
+      OS.indent(Indent) << "   Ending address offset: "
+                        << format("0x%016" PRIx64, E.End) << '\n';
+      OS.indent(Indent) << "    Location description: ";
+      for (unsigned char Loc : E.Loc) {
+        OS << format("%2.2x ", Loc);
+      }
+      OS << "\n\n";
+    }
+  }
+}
+
+void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) {
+  uint32_t Offset = 0;
+  while (data.isValidOffset(Offset+AddressSize-1)) {
+    Locations.resize(Locations.size() + 1);
+    LocationList &Loc = Locations.back();
+    Loc.Offset = Offset;
+    // 2.6.2 Location Lists
+    // A location list entry consists of:
+    while (true) {
+      Entry E;
+      RelocAddrMap::const_iterator AI = RelocMap.find(Offset);
+      // 1. A beginning address offset. ...
+      E.Begin = data.getUnsigned(&Offset, AddressSize);
+      if (AI != RelocMap.end())
+        E.Begin += AI->second.second;
+
+      AI = RelocMap.find(Offset);
+      // 2. An ending address offset. ...
+      E.End = data.getUnsigned(&Offset, AddressSize);
+      if (AI != RelocMap.end())
+        E.End += AI->second.second;
+
+      // The end of any given location list is marked by an end of list entry,
+      // which consists of a 0 for the beginning address offset and a 0 for the
+      // ending address offset.
+      if (E.Begin == 0 && E.End == 0)
+        break;
+
+      unsigned Bytes = data.getU16(&Offset);
+      // A single location description describing the location of the object...
+      StringRef str = data.getData().substr(Offset, Bytes);
+      Offset += Bytes;
+      E.Loc.reserve(str.size());
+      std::copy(str.begin(), str.end(), std::back_inserter(E.Loc));
+      Loc.Entries.push_back(std::move(E));
+    }
+  }
+  if (data.isValidOffset(Offset))
+    llvm::errs() << "error: failed to consume entire .debug_loc section\n";
+}
+
+void DWARFDebugLocDWO::parse(DataExtractor data) {
+  uint32_t Offset = 0;
+  while (data.isValidOffset(Offset)) {
+    Locations.resize(Locations.size() + 1);
+    LocationList &Loc = Locations.back();
+    Loc.Offset = Offset;
+    dwarf::LocationListEntry Kind;
+    while ((Kind = static_cast<dwarf::LocationListEntry>(
+                data.getU8(&Offset))) != dwarf::DW_LLE_end_of_list_entry) {
+
+      if (Kind != dwarf::DW_LLE_start_length_entry) {
+        llvm::errs() << "error: dumping support for LLE of kind " << (int)Kind
+                     << " not implemented\n";
+        return;
+      }
+
+      Entry E;
+
+      E.Start = data.getULEB128(&Offset);
+      E.Length = data.getU32(&Offset);
+
+      unsigned Bytes = data.getU16(&Offset);
+      // A single location description describing the location of the object...
+      StringRef str = data.getData().substr(Offset, Bytes);
+      Offset += Bytes;
+      E.Loc.resize(str.size());
+      std::copy(str.begin(), str.end(), E.Loc.begin());
+
+      Loc.Entries.push_back(std::move(E));
+    }
+  }
+}
+
+void DWARFDebugLocDWO::dump(raw_ostream &OS) const {
+  for (const LocationList &L : Locations) {
+    OS << format("0x%8.8x: ", L.Offset);
+    const unsigned Indent = 12;
+    for (const Entry &E : L.Entries) {
+      if (&E != L.Entries.begin())
+        OS.indent(Indent);
+      OS << "Beginning address index: " << E.Start << '\n';
+      OS.indent(Indent) << "                 Length: " << E.Length << '\n';
+      OS.indent(Indent) << "   Location description: ";
+      for (unsigned char Loc : E.Loc)
+        OS << format("%2.2x ", Loc);
+      OS << "\n\n";
+    }
+  }
+}
+
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
new file mode 100644
index 0000000..d5df688
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -0,0 +1,69 @@
+//===-- DWARFDebugRangesList.cpp ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void DWARFDebugRangeList::clear() {
+  Offset = -1U;
+  AddressSize = 0;
+  Entries.clear();
+}
+
+bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr) {
+  clear();
+  if (!data.isValidOffset(*offset_ptr))
+    return false;
+  AddressSize = data.getAddressSize();
+  if (AddressSize != 4 && AddressSize != 8)
+    return false;
+  Offset = *offset_ptr;
+  while (true) {
+    RangeListEntry entry;
+    uint32_t prev_offset = *offset_ptr;
+    entry.StartAddress = data.getAddress(offset_ptr);
+    entry.EndAddress = data.getAddress(offset_ptr);
+    // Check that both values were extracted correctly.
+    if (*offset_ptr != prev_offset + 2 * AddressSize) {
+      clear();
+      return false;
+    }
+    if (entry.isEndOfListEntry())
+      break;
+    Entries.push_back(entry);
+  }
+  return true;
+}
+
+void DWARFDebugRangeList::dump(raw_ostream &OS) const {
+  for (const RangeListEntry &RLE : Entries) {
+    const char *format_str = (AddressSize == 4
+                              ? "%08x %08"  PRIx64 " %08"  PRIx64 "\n"
+                              : "%08x %016" PRIx64 " %016" PRIx64 "\n");
+    OS << format(format_str, Offset, RLE.StartAddress, RLE.EndAddress);
+  }
+  OS << format("%08x <End of list>\n", Offset);
+}
+
+DWARFAddressRangesVector
+DWARFDebugRangeList::getAbsoluteRanges(uint64_t BaseAddress) const {
+  DWARFAddressRangesVector Res;
+  for (const RangeListEntry &RLE : Entries) {
+    if (RLE.isBaseAddressSelectionEntry(AddressSize)) {
+      BaseAddress = RLE.EndAddress;
+    } else {
+      Res.push_back(std::make_pair(BaseAddress + RLE.StartAddress,
+                                   BaseAddress + RLE.EndAddress));
+    }
+  }
+  return Res;
+}
diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
new file mode 100644
index 0000000..45bd197
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -0,0 +1,565 @@
+//===-- DWARFFormValue.cpp ------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SyntaxHighlighting.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+using namespace llvm;
+using namespace dwarf;
+using namespace syntax;
+
+namespace {
+uint8_t getRefAddrSize(uint8_t AddrSize, uint16_t Version) {
+  // FIXME: Support DWARF64.
+  return (Version == 2) ? AddrSize : 4;
+}
+
+template <uint8_t AddrSize, uint8_t RefAddrSize>
+ArrayRef<uint8_t> makeFixedFormSizesArrayRef() {
+  static const uint8_t sizes[] = {
+    0,           // 0x00 unused
+    AddrSize,    // 0x01 DW_FORM_addr
+    0,           // 0x02 unused
+    0,           // 0x03 DW_FORM_block2
+    0,           // 0x04 DW_FORM_block4
+    2,           // 0x05 DW_FORM_data2
+    4,           // 0x06 DW_FORM_data4
+    8,           // 0x07 DW_FORM_data8
+    0,           // 0x08 DW_FORM_string
+    0,           // 0x09 DW_FORM_block
+    0,           // 0x0a DW_FORM_block1
+    1,           // 0x0b DW_FORM_data1
+    1,           // 0x0c DW_FORM_flag
+    0,           // 0x0d DW_FORM_sdata
+    4,           // 0x0e DW_FORM_strp
+    0,           // 0x0f DW_FORM_udata
+    RefAddrSize, // 0x10 DW_FORM_ref_addr
+    1,           // 0x11 DW_FORM_ref1
+    2,           // 0x12 DW_FORM_ref2
+    4,           // 0x13 DW_FORM_ref4
+    8,           // 0x14 DW_FORM_ref8
+    0,           // 0x15 DW_FORM_ref_udata
+    0,           // 0x16 DW_FORM_indirect
+    4,           // 0x17 DW_FORM_sec_offset
+    0,           // 0x18 DW_FORM_exprloc
+    0,           // 0x19 DW_FORM_flag_present
+  };
+  return makeArrayRef(sizes);
+}
+}
+
+ArrayRef<uint8_t> DWARFFormValue::getFixedFormSizes(uint8_t AddrSize,
+                                                    uint16_t Version) {
+  uint8_t RefAddrSize = getRefAddrSize(AddrSize, Version);
+  if (AddrSize == 4 && RefAddrSize == 4)
+    return makeFixedFormSizesArrayRef<4, 4>();
+  if (AddrSize == 4 && RefAddrSize == 8)
+    return makeFixedFormSizesArrayRef<4, 8>();
+  if (AddrSize == 8 && RefAddrSize == 4)
+    return makeFixedFormSizesArrayRef<8, 4>();
+  if (AddrSize == 8 && RefAddrSize == 8)
+    return makeFixedFormSizesArrayRef<8, 8>();
+  return None;
+}
+
+static const DWARFFormValue::FormClass DWARF4FormClasses[] = {
+  DWARFFormValue::FC_Unknown,       // 0x0
+  DWARFFormValue::FC_Address,       // 0x01 DW_FORM_addr
+  DWARFFormValue::FC_Unknown,       // 0x02 unused
+  DWARFFormValue::FC_Block,         // 0x03 DW_FORM_block2
+  DWARFFormValue::FC_Block,         // 0x04 DW_FORM_block4
+  DWARFFormValue::FC_Constant,      // 0x05 DW_FORM_data2
+  // --- These can be FC_SectionOffset in DWARF3 and below:
+  DWARFFormValue::FC_Constant,      // 0x06 DW_FORM_data4
+  DWARFFormValue::FC_Constant,      // 0x07 DW_FORM_data8
+  // ---
+  DWARFFormValue::FC_String,        // 0x08 DW_FORM_string
+  DWARFFormValue::FC_Block,         // 0x09 DW_FORM_block
+  DWARFFormValue::FC_Block,         // 0x0a DW_FORM_block1
+  DWARFFormValue::FC_Constant,      // 0x0b DW_FORM_data1
+  DWARFFormValue::FC_Flag,          // 0x0c DW_FORM_flag
+  DWARFFormValue::FC_Constant,      // 0x0d DW_FORM_sdata
+  DWARFFormValue::FC_String,        // 0x0e DW_FORM_strp
+  DWARFFormValue::FC_Constant,      // 0x0f DW_FORM_udata
+  DWARFFormValue::FC_Reference,     // 0x10 DW_FORM_ref_addr
+  DWARFFormValue::FC_Reference,     // 0x11 DW_FORM_ref1
+  DWARFFormValue::FC_Reference,     // 0x12 DW_FORM_ref2
+  DWARFFormValue::FC_Reference,     // 0x13 DW_FORM_ref4
+  DWARFFormValue::FC_Reference,     // 0x14 DW_FORM_ref8
+  DWARFFormValue::FC_Reference,     // 0x15 DW_FORM_ref_udata
+  DWARFFormValue::FC_Indirect,      // 0x16 DW_FORM_indirect
+  DWARFFormValue::FC_SectionOffset, // 0x17 DW_FORM_sec_offset
+  DWARFFormValue::FC_Exprloc,       // 0x18 DW_FORM_exprloc
+  DWARFFormValue::FC_Flag,          // 0x19 DW_FORM_flag_present
+};
+
+bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const {
+  // First, check DWARF4 form classes.
+  if (Form < ArrayRef<FormClass>(DWARF4FormClasses).size() &&
+      DWARF4FormClasses[Form] == FC)
+    return true;
+  // Check DW_FORM_ref_sig8 from DWARF4.
+  if (Form == DW_FORM_ref_sig8)
+    return (FC == FC_Reference);
+  // Check for some DWARF5 forms.
+  if (Form == DW_FORM_GNU_addr_index)
+    return (FC == FC_Address);
+  if (Form == DW_FORM_GNU_str_index)
+    return (FC == FC_String);
+  // In DWARF3 DW_FORM_data4 and DW_FORM_data8 served also as a section offset.
+  // Don't check for DWARF version here, as some producers may still do this
+  // by mistake.
+  if ((Form == DW_FORM_data4 || Form == DW_FORM_data8) &&
+      FC == FC_SectionOffset)
+    return true;
+  return false;
+}
+
+bool DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
+                                  const DWARFUnit *cu) {
+  bool indirect = false;
+  bool is_block = false;
+  Value.data = nullptr;
+  // Read the value for the form into value and follow and DW_FORM_indirect
+  // instances we run into
+  do {
+    indirect = false;
+    switch (Form) {
+    case DW_FORM_addr:
+    case DW_FORM_ref_addr: {
+      if (!cu)
+        return false;
+      uint16_t AddrSize =
+          (Form == DW_FORM_addr)
+              ? cu->getAddressByteSize()
+              : getRefAddrSize(cu->getAddressByteSize(), cu->getVersion());
+      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr);
+      if (AI != cu->getRelocMap()->end()) {
+        const std::pair<uint8_t, int64_t> &R = AI->second;
+        Value.uval = data.getUnsigned(offset_ptr, AddrSize) + R.second;
+      } else
+        Value.uval = data.getUnsigned(offset_ptr, AddrSize);
+      break;
+    }
+    case DW_FORM_exprloc:
+    case DW_FORM_block:
+      Value.uval = data.getULEB128(offset_ptr);
+      is_block = true;
+      break;
+    case DW_FORM_block1:
+      Value.uval = data.getU8(offset_ptr);
+      is_block = true;
+      break;
+    case DW_FORM_block2:
+      Value.uval = data.getU16(offset_ptr);
+      is_block = true;
+      break;
+    case DW_FORM_block4:
+      Value.uval = data.getU32(offset_ptr);
+      is_block = true;
+      break;
+    case DW_FORM_data1:
+    case DW_FORM_ref1:
+    case DW_FORM_flag:
+      Value.uval = data.getU8(offset_ptr);
+      break;
+    case DW_FORM_data2:
+    case DW_FORM_ref2:
+      Value.uval = data.getU16(offset_ptr);
+      break;
+    case DW_FORM_data4:
+    case DW_FORM_ref4: {
+      Value.uval = data.getU32(offset_ptr);
+      if (!cu)
+        break;
+      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr-4);
+      if (AI != cu->getRelocMap()->end())
+        Value.uval += AI->second.second;
+      break;
+    }
+    case DW_FORM_data8:
+    case DW_FORM_ref8:
+      Value.uval = data.getU64(offset_ptr);
+      break;
+    case DW_FORM_sdata:
+      Value.sval = data.getSLEB128(offset_ptr);
+      break;
+    case DW_FORM_strp: {
+      Value.uval = data.getU32(offset_ptr);
+      if (!cu)
+        break;
+      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr-4);
+      if (AI != cu->getRelocMap()->end())
+        Value.uval += AI->second.second;
+      break;
+    }
+    case DW_FORM_udata:
+    case DW_FORM_ref_udata:
+      Value.uval = data.getULEB128(offset_ptr);
+      break;
+    case DW_FORM_string:
+      Value.cstr = data.getCStr(offset_ptr);
+      break;
+    case DW_FORM_indirect:
+      Form = data.getULEB128(offset_ptr);
+      indirect = true;
+      break;
+    case DW_FORM_sec_offset: {
+      // FIXME: This is 64-bit for DWARF64.
+      Value.uval = data.getU32(offset_ptr);
+      if (!cu)
+        break;
+      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr-4);
+      if (AI != cu->getRelocMap()->end())
+        Value.uval +=  AI->second.second;
+      break;
+    }
+    case DW_FORM_flag_present:
+      Value.uval = 1;
+      break;
+    case DW_FORM_ref_sig8:
+      Value.uval = data.getU64(offset_ptr);
+      break;
+    case DW_FORM_GNU_addr_index:
+    case DW_FORM_GNU_str_index:
+      Value.uval = data.getULEB128(offset_ptr);
+      break;
+    default:
+      return false;
+    }
+  } while (indirect);
+
+  if (is_block) {
+    StringRef str = data.getData().substr(*offset_ptr, Value.uval);
+    Value.data = nullptr;
+    if (!str.empty()) {
+      Value.data = reinterpret_cast<const uint8_t *>(str.data());
+      *offset_ptr += Value.uval;
+    }
+  }
+
+  return true;
+}
+
+bool
+DWARFFormValue::skipValue(DataExtractor debug_info_data, uint32_t* offset_ptr,
+                          const DWARFUnit *cu) const {
+  return DWARFFormValue::skipValue(Form, debug_info_data, offset_ptr, cu);
+}
+
+bool
+DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
+                          uint32_t *offset_ptr, const DWARFUnit *cu) {
+  bool indirect = false;
+  do {
+    switch (form) {
+    // Blocks if inlined data that have a length field and the data bytes
+    // inlined in the .debug_info
+    case DW_FORM_exprloc:
+    case DW_FORM_block: {
+      uint64_t size = debug_info_data.getULEB128(offset_ptr);
+      *offset_ptr += size;
+      return true;
+    }
+    case DW_FORM_block1: {
+      uint8_t size = debug_info_data.getU8(offset_ptr);
+      *offset_ptr += size;
+      return true;
+    }
+    case DW_FORM_block2: {
+      uint16_t size = debug_info_data.getU16(offset_ptr);
+      *offset_ptr += size;
+      return true;
+    }
+    case DW_FORM_block4: {
+      uint32_t size = debug_info_data.getU32(offset_ptr);
+      *offset_ptr += size;
+      return true;
+    }
+
+    // Inlined NULL terminated C-strings
+    case DW_FORM_string:
+      debug_info_data.getCStr(offset_ptr);
+      return true;
+
+    // Compile unit address sized values
+    case DW_FORM_addr:
+      *offset_ptr += cu->getAddressByteSize();
+      return true;
+    case DW_FORM_ref_addr:
+      *offset_ptr += getRefAddrSize(cu->getAddressByteSize(), cu->getVersion());
+      return true;
+
+    // 0 byte values - implied from the form.
+    case DW_FORM_flag_present:
+      return true;
+
+    // 1 byte values
+    case DW_FORM_data1:
+    case DW_FORM_flag:
+    case DW_FORM_ref1:
+      *offset_ptr += 1;
+      return true;
+
+    // 2 byte values
+    case DW_FORM_data2:
+    case DW_FORM_ref2:
+      *offset_ptr += 2;
+      return true;
+
+    // 4 byte values
+    case DW_FORM_strp:
+    case DW_FORM_data4:
+    case DW_FORM_ref4:
+      *offset_ptr += 4;
+      return true;
+
+    // 8 byte values
+    case DW_FORM_data8:
+    case DW_FORM_ref8:
+    case DW_FORM_ref_sig8:
+      *offset_ptr += 8;
+      return true;
+
+    // signed or unsigned LEB 128 values
+    //  case DW_FORM_APPLE_db_str:
+    case DW_FORM_sdata:
+    case DW_FORM_udata:
+    case DW_FORM_ref_udata:
+    case DW_FORM_GNU_str_index:
+    case DW_FORM_GNU_addr_index:
+      debug_info_data.getULEB128(offset_ptr);
+      return true;
+
+    case DW_FORM_indirect:
+      indirect = true;
+      form = debug_info_data.getULEB128(offset_ptr);
+      break;
+
+    // FIXME: 4 for DWARF32, 8 for DWARF64.
+    case DW_FORM_sec_offset:
+      *offset_ptr += 4;
+      return true;
+
+    default:
+      return false;
+    }
+  } while (indirect);
+  return true;
+}
+
+void
+DWARFFormValue::dump(raw_ostream &OS, const DWARFUnit *cu) const {
+  uint64_t uvalue = Value.uval;
+  bool cu_relative_offset = false;
+
+  switch (Form) {
+  case DW_FORM_addr:      OS << format("0x%016" PRIx64, uvalue); break;
+  case DW_FORM_GNU_addr_index: {
+    OS << format(" indexed (%8.8x) address = ", (uint32_t)uvalue);
+    uint64_t Address;
+    if (cu->getAddrOffsetSectionItem(uvalue, Address))
+      OS << format("0x%016" PRIx64, Address);
+    else
+      OS << "<no .debug_addr section>";
+    break;
+  }
+  case DW_FORM_flag_present: OS << "true"; break;
+  case DW_FORM_flag:
+  case DW_FORM_data1:     OS << format("0x%02x", (uint8_t)uvalue); break;
+  case DW_FORM_data2:     OS << format("0x%04x", (uint16_t)uvalue); break;
+  case DW_FORM_data4:     OS << format("0x%08x", (uint32_t)uvalue); break;
+  case DW_FORM_ref_sig8:
+  case DW_FORM_data8:     OS << format("0x%016" PRIx64, uvalue); break;
+  case DW_FORM_string:
+    OS << '"';
+    OS.write_escaped(Value.cstr);
+    OS << '"';
+    break;
+  case DW_FORM_exprloc:
+  case DW_FORM_block:
+  case DW_FORM_block1:
+  case DW_FORM_block2:
+  case DW_FORM_block4:
+    if (uvalue > 0) {
+      switch (Form) {
+      case DW_FORM_exprloc:
+      case DW_FORM_block:  OS << format("<0x%" PRIx64 "> ", uvalue);     break;
+      case DW_FORM_block1: OS << format("<0x%2.2x> ", (uint8_t)uvalue);  break;
+      case DW_FORM_block2: OS << format("<0x%4.4x> ", (uint16_t)uvalue); break;
+      case DW_FORM_block4: OS << format("<0x%8.8x> ", (uint32_t)uvalue); break;
+      default: break;
+      }
+
+      const uint8_t* data_ptr = Value.data;
+      if (data_ptr) {
+        // uvalue contains size of block
+        const uint8_t* end_data_ptr = data_ptr + uvalue;
+        while (data_ptr < end_data_ptr) {
+          OS << format("%2.2x ", *data_ptr);
+          ++data_ptr;
+        }
+      }
+      else
+        OS << "NULL";
+    }
+    break;
+
+  case DW_FORM_sdata:     OS << Value.sval; break;
+  case DW_FORM_udata:     OS << Value.uval; break;
+  case DW_FORM_strp: {
+    OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)uvalue);
+    Optional<const char *> DbgStr = getAsCString(cu);
+    if (DbgStr.hasValue()) {
+      raw_ostream &COS = WithColor(OS, syntax::String);
+      COS << '"';
+      COS.write_escaped(DbgStr.getValue());
+      COS << '"';
+    }
+    break;
+  }
+  case DW_FORM_GNU_str_index: {
+    OS << format(" indexed (%8.8x) string = ", (uint32_t)uvalue);
+    Optional<const char *> DbgStr = getAsCString(cu);
+    if (DbgStr.hasValue()) {
+      raw_ostream &COS = WithColor(OS, syntax::String);
+      COS << '"';
+      COS.write_escaped(DbgStr.getValue());
+      COS << '"';
+    }
+    break;
+  }
+  case DW_FORM_ref_addr:
+    OS << format("0x%016" PRIx64, uvalue);
+    break;
+  case DW_FORM_ref1:
+    cu_relative_offset = true;
+    OS << format("cu + 0x%2.2x", (uint8_t)uvalue);
+    break;
+  case DW_FORM_ref2:
+    cu_relative_offset = true;
+    OS << format("cu + 0x%4.4x", (uint16_t)uvalue);
+    break;
+  case DW_FORM_ref4:
+    cu_relative_offset = true;
+    OS << format("cu + 0x%4.4x", (uint32_t)uvalue);
+    break;
+  case DW_FORM_ref8:
+    cu_relative_offset = true;
+    OS << format("cu + 0x%8.8" PRIx64, uvalue);
+    break;
+  case DW_FORM_ref_udata:
+    cu_relative_offset = true;
+    OS << format("cu + 0x%" PRIx64, uvalue);
+    break;
+
+    // All DW_FORM_indirect attributes should be resolved prior to calling
+    // this function
+  case DW_FORM_indirect:
+    OS << "DW_FORM_indirect";
+    break;
+
+    // Should be formatted to 64-bit for DWARF64.
+  case DW_FORM_sec_offset:
+    OS << format("0x%08x", (uint32_t)uvalue);
+    break;
+
+  default:
+    OS << format("DW_FORM(0x%4.4x)", Form);
+    break;
+  }
+
+  if (cu_relative_offset) {
+    OS << " => {";
+    WithColor(OS, syntax::Address).get()
+      << format("0x%8.8" PRIx64, uvalue + (cu ? cu->getOffset() : 0));
+    OS << "}";
+  }
+}
+
+Optional<const char *> DWARFFormValue::getAsCString(const DWARFUnit *U) const {
+  if (!isFormClass(FC_String))
+    return None;
+  if (Form == DW_FORM_string)
+    return Value.cstr;
+  if (!U)
+    return None;
+  uint32_t Offset = Value.uval;
+  if (Form == DW_FORM_GNU_str_index) {
+    uint32_t StrOffset;
+    if (!U->getStringOffsetSectionItem(Offset, StrOffset))
+      return None;
+    Offset = StrOffset;
+  }
+  if (const char *Str = U->getStringExtractor().getCStr(&Offset)) {
+    return Str;
+  }
+  return None;
+}
+
+Optional<uint64_t> DWARFFormValue::getAsAddress(const DWARFUnit *U) const {
+  if (!isFormClass(FC_Address))
+    return None;
+  if (Form == DW_FORM_GNU_addr_index) {
+    uint32_t Index = Value.uval;
+    uint64_t Result;
+    if (!U || !U->getAddrOffsetSectionItem(Index, Result))
+      return None;
+    return Result;
+  }
+  return Value.uval;
+}
+
+Optional<uint64_t> DWARFFormValue::getAsReference(const DWARFUnit *U) const {
+  if (!isFormClass(FC_Reference))
+    return None;
+  switch (Form) {
+  case DW_FORM_ref1:
+  case DW_FORM_ref2:
+  case DW_FORM_ref4:
+  case DW_FORM_ref8:
+  case DW_FORM_ref_udata:
+    if (!U)
+      return None;
+    return Value.uval + U->getOffset();
+  case DW_FORM_ref_addr:
+    return Value.uval;
+  // FIXME: Add proper support for DW_FORM_ref_sig8
+  default:
+    return Value.uval;
+  }
+}
+
+Optional<uint64_t> DWARFFormValue::getAsSectionOffset() const {
+  if (!isFormClass(FC_SectionOffset))
+    return None;
+  return Value.uval;
+}
+
+Optional<uint64_t> DWARFFormValue::getAsUnsignedConstant() const {
+  if ((!isFormClass(FC_Constant) && !isFormClass(FC_Flag))
+      || Form == DW_FORM_sdata)
+    return None;
+  return Value.uval;
+}
+
+Optional<ArrayRef<uint8_t>> DWARFFormValue::getAsBlock() const {
+  if (!isFormClass(FC_Block) && !isFormClass(FC_Exprloc))
+    return None;
+  return ArrayRef<uint8_t>(Value.data, Value.uval);
+}
+
diff --git a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
new file mode 100644
index 0000000..65c7bff
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
@@ -0,0 +1,39 @@
+//===-- DWARFTypeUnit.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+bool DWARFTypeUnit::extractImpl(DataExtractor debug_info,
+                                uint32_t *offset_ptr) {
+  if (!DWARFUnit::extractImpl(debug_info, offset_ptr))
+    return false;
+  TypeHash = debug_info.getU64(offset_ptr);
+  TypeOffset = debug_info.getU32(offset_ptr);
+  return TypeOffset < getLength();
+}
+
+void DWARFTypeUnit::dump(raw_ostream &OS) {
+  OS << format("0x%08x", getOffset()) << ": Type Unit:"
+     << " length = " << format("0x%08x", getLength())
+     << " version = " << format("0x%04x", getVersion())
+     << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+     << " addr_size = " << format("0x%02x", getAddressByteSize())
+     << " type_signature = " << format("0x%16" PRIx64, TypeHash)
+     << " type_offset = " << format("0x%04x", TypeOffset)
+     << " (next unit at " << format("0x%08x", getNextUnitOffset())
+     << ")\n";
+
+  const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
+  assert(CU && "Null Compile Unit?");
+  CU->dump(OS, this, -1U);
+}
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
new file mode 100644
index 0000000..d4ecd69
--- /dev/null
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -0,0 +1,377 @@
+//===-- DWARFUnit.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Path.h"
+#include <cstdio>
+
+using namespace llvm;
+using namespace dwarf;
+
+void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) {
+  parseImpl(C, Section, C.getDebugAbbrev(), C.getRangeSection(),
+            C.getStringSection(), StringRef(), C.getAddrSection(),
+            C.isLittleEndian());
+}
+
+void DWARFUnitSectionBase::parseDWO(DWARFContext &C,
+                                    const DWARFSection &DWOSection) {
+  parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), C.getRangeDWOSection(),
+            C.getStringDWOSection(), C.getStringOffsetDWOSection(),
+            C.getAddrSection(), C.isLittleEndian());
+}
+
+DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
+                     const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+                     StringRef SOS, StringRef AOS, bool LE,
+                     const DWARFUnitSectionBase &UnitSection)
+    : Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS),
+      StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS),
+      isLittleEndian(LE), UnitSection(UnitSection) {
+  clear();
+}
+
+DWARFUnit::~DWARFUnit() {
+}
+
+bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index,
+                                                uint64_t &Result) const {
+  uint32_t Offset = AddrOffsetSectionBase + Index * AddrSize;
+  if (AddrOffsetSection.size() < Offset + AddrSize)
+    return false;
+  DataExtractor DA(AddrOffsetSection, isLittleEndian, AddrSize);
+  Result = DA.getAddress(&Offset);
+  return true;
+}
+
+bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
+                                                  uint32_t &Result) const {
+  // FIXME: string offset section entries are 8-byte for DWARF64.
+  const uint32_t ItemSize = 4;
+  uint32_t Offset = Index * ItemSize;
+  if (StringOffsetSection.size() < Offset + ItemSize)
+    return false;
+  DataExtractor DA(StringOffsetSection, isLittleEndian, 0);
+  Result = DA.getU32(&Offset);
+  return true;
+}
+
+bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
+  Length = debug_info.getU32(offset_ptr);
+  Version = debug_info.getU16(offset_ptr);
+  uint64_t AbbrOffset = debug_info.getU32(offset_ptr);
+  AddrSize = debug_info.getU8(offset_ptr);
+
+  bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
+  bool VersionOK = DWARFContext::isSupportedVersion(Version);
+  bool AddrSizeOK = AddrSize == 4 || AddrSize == 8;
+
+  if (!LengthOK || !VersionOK || !AddrSizeOK)
+    return false;
+
+  Abbrevs = Abbrev->getAbbreviationDeclarationSet(AbbrOffset);
+  if (Abbrevs == nullptr)
+    return false;
+
+  return true;
+}
+
+bool DWARFUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
+  clear();
+
+  Offset = *offset_ptr;
+
+  if (debug_info.isValidOffset(*offset_ptr)) {
+    if (extractImpl(debug_info, offset_ptr))
+      return true;
+
+    // reset the offset to where we tried to parse from if anything went wrong
+    *offset_ptr = Offset;
+  }
+
+  return false;
+}
+
+bool DWARFUnit::extractRangeList(uint32_t RangeListOffset,
+                                        DWARFDebugRangeList &RangeList) const {
+  // Require that compile unit is extracted.
+  assert(DieArray.size() > 0);
+  DataExtractor RangesData(RangeSection, isLittleEndian, AddrSize);
+  uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset;
+  return RangeList.extract(RangesData, &ActualRangeListOffset);
+}
+
+void DWARFUnit::clear() {
+  Offset = 0;
+  Length = 0;
+  Version = 0;
+  Abbrevs = nullptr;
+  AddrSize = 0;
+  BaseAddr = 0;
+  RangeSectionBase = 0;
+  AddrOffsetSectionBase = 0;
+  clearDIEs(false);
+  DWO.reset();
+}
+
+const char *DWARFUnit::getCompilationDir() {
+  extractDIEsIfNeeded(true);
+  if (DieArray.empty())
+    return nullptr;
+  return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, nullptr);
+}
+
+uint64_t DWARFUnit::getDWOId() {
+  extractDIEsIfNeeded(true);
+  const uint64_t FailValue = -1ULL;
+  if (DieArray.empty())
+    return FailValue;
+  return DieArray[0]
+      .getAttributeValueAsUnsignedConstant(this, DW_AT_GNU_dwo_id, FailValue);
+}
+
+void DWARFUnit::setDIERelations() {
+  if (DieArray.size() <= 1)
+    return;
+
+  std::vector<DWARFDebugInfoEntryMinimal *> ParentChain;
+  DWARFDebugInfoEntryMinimal *SiblingChain = nullptr;
+  for (auto &DIE : DieArray) {
+    if (SiblingChain) {
+      SiblingChain->setSibling(&DIE);
+    }
+    if (const DWARFAbbreviationDeclaration *AbbrDecl =
+            DIE.getAbbreviationDeclarationPtr()) {
+      // Normal DIE.
+      if (AbbrDecl->hasChildren()) {
+        ParentChain.push_back(&DIE);
+        SiblingChain = nullptr;
+      } else {
+        SiblingChain = &DIE;
+      }
+    } else {
+      // NULL entry terminates the sibling chain.
+      SiblingChain = ParentChain.back();
+      ParentChain.pop_back();
+    }
+  }
+  assert(SiblingChain == nullptr || SiblingChain == &DieArray[0]);
+  assert(ParentChain.empty());
+}
+
+void DWARFUnit::extractDIEsToVector(
+    bool AppendCUDie, bool AppendNonCUDies,
+    std::vector<DWARFDebugInfoEntryMinimal> &Dies) const {
+  if (!AppendCUDie && !AppendNonCUDies)
+    return;
+
+  // Set the offset to that of the first DIE and calculate the start of the
+  // next compilation unit header.
+  uint32_t DIEOffset = Offset + getHeaderSize();
+  uint32_t NextCUOffset = getNextUnitOffset();
+  DWARFDebugInfoEntryMinimal DIE;
+  uint32_t Depth = 0;
+  bool IsCUDie = true;
+
+  while (DIEOffset < NextCUOffset && DIE.extractFast(this, &DIEOffset)) {
+    if (IsCUDie) {
+      if (AppendCUDie)
+        Dies.push_back(DIE);
+      if (!AppendNonCUDies)
+        break;
+      // The average bytes per DIE entry has been seen to be
+      // around 14-20 so let's pre-reserve the needed memory for
+      // our DIE entries accordingly.
+      Dies.reserve(Dies.size() + getDebugInfoSize() / 14);
+      IsCUDie = false;
+    } else {
+      Dies.push_back(DIE);
+    }
+
+    if (const DWARFAbbreviationDeclaration *AbbrDecl =
+            DIE.getAbbreviationDeclarationPtr()) {
+      // Normal DIE
+      if (AbbrDecl->hasChildren())
+        ++Depth;
+    } else {
+      // NULL DIE.
+      if (Depth > 0)
+        --Depth;
+      if (Depth == 0)
+        break;  // We are done with this compile unit!
+    }
+  }
+
+  // Give a little bit of info if we encounter corrupt DWARF (our offset
+  // should always terminate at or before the start of the next compilation
+  // unit header).
+  if (DIEOffset > NextCUOffset)
+    fprintf(stderr, "warning: DWARF compile unit extends beyond its "
+                    "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), DIEOffset);
+}
+
+size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
+  if ((CUDieOnly && DieArray.size() > 0) ||
+      DieArray.size() > 1)
+    return 0; // Already parsed.
+
+  bool HasCUDie = DieArray.size() > 0;
+  extractDIEsToVector(!HasCUDie, !CUDieOnly, DieArray);
+
+  if (DieArray.empty())
+    return 0;
+
+  // If CU DIE was just parsed, copy several attribute values from it.
+  if (!HasCUDie) {
+    uint64_t BaseAddr =
+        DieArray[0].getAttributeValueAsAddress(this, DW_AT_low_pc, -1ULL);
+    if (BaseAddr == -1ULL)
+      BaseAddr = DieArray[0].getAttributeValueAsAddress(this, DW_AT_entry_pc, 0);
+    setBaseAddress(BaseAddr);
+    AddrOffsetSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
+        this, DW_AT_GNU_addr_base, 0);
+    RangeSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
+        this, DW_AT_ranges_base, 0);
+    // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for
+    // skeleton CU DIE, so that DWARF users not aware of it are not broken.
+  }
+
+  setDIERelations();
+  return DieArray.size();
+}
+
+DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath)
+    : DWOFile(), DWOContext(), DWOU(nullptr) {
+  auto Obj = object::ObjectFile::createObjectFile(DWOPath);
+  if (!Obj)
+    return;
+  DWOFile = std::move(Obj.get());
+  DWOContext.reset(
+      cast<DWARFContext>(DIContext::getDWARFContext(*DWOFile.getBinary())));
+  if (DWOContext->getNumDWOCompileUnits() > 0)
+    DWOU = DWOContext->getDWOCompileUnitAtIndex(0);
+}
+
+bool DWARFUnit::parseDWO() {
+  if (DWO.get())
+    return false;
+  extractDIEsIfNeeded(true);
+  if (DieArray.empty())
+    return false;
+  const char *DWOFileName =
+      DieArray[0].getAttributeValueAsString(this, DW_AT_GNU_dwo_name, nullptr);
+  if (!DWOFileName)
+    return false;
+  const char *CompilationDir =
+      DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, nullptr);
+  SmallString<16> AbsolutePath;
+  if (sys::path::is_relative(DWOFileName) && CompilationDir != nullptr) {
+    sys::path::append(AbsolutePath, CompilationDir);
+  }
+  sys::path::append(AbsolutePath, DWOFileName);
+  DWO = llvm::make_unique<DWOHolder>(AbsolutePath);
+  DWARFUnit *DWOCU = DWO->getUnit();
+  // Verify that compile unit in .dwo file is valid.
+  if (!DWOCU || DWOCU->getDWOId() != getDWOId()) {
+    DWO.reset();
+    return false;
+  }
+  // Share .debug_addr and .debug_ranges section with compile unit in .dwo
+  DWOCU->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase);
+  uint32_t DWORangesBase = DieArray[0].getRangesBaseAttribute(this, 0);
+  DWOCU->setRangesSection(RangeSection, DWORangesBase);
+  return true;
+}
+
+void DWARFUnit::clearDIEs(bool KeepCUDie) {
+  if (DieArray.size() > (unsigned)KeepCUDie) {
+    // std::vectors never get any smaller when resized to a smaller size,
+    // or when clear() or erase() are called, the size will report that it
+    // is smaller, but the memory allocated remains intact (call capacity()
+    // to see this). So we need to create a temporary vector and swap the
+    // contents which will cause just the internal pointers to be swapped
+    // so that when temporary vector goes out of scope, it will destroy the
+    // contents.
+    std::vector<DWARFDebugInfoEntryMinimal> TmpArray;
+    DieArray.swap(TmpArray);
+    // Save at least the compile unit DIE
+    if (KeepCUDie)
+      DieArray.push_back(TmpArray.front());
+  }
+}
+
+void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) {
+  // First, check if CU DIE describes address ranges for the unit.
+  const auto &CUDIERanges = getCompileUnitDIE()->getAddressRanges(this);
+  if (!CUDIERanges.empty()) {
+    CURanges.insert(CURanges.end(), CUDIERanges.begin(), CUDIERanges.end());
+    return;
+  }
+
+  // This function is usually called if there in no .debug_aranges section
+  // in order to produce a compile unit level set of address ranges that
+  // is accurate. If the DIEs weren't parsed, then we don't want all dies for
+  // all compile units to stay loaded when they weren't needed. So we can end
+  // up parsing the DWARF and then throwing them all away to keep memory usage
+  // down.
+  const bool ClearDIEs = extractDIEsIfNeeded(false) > 1;
+  DieArray[0].collectChildrenAddressRanges(this, CURanges);
+
+  // Collect address ranges from DIEs in .dwo if necessary.
+  bool DWOCreated = parseDWO();
+  if (DWO.get())
+    DWO->getUnit()->collectAddressRanges(CURanges);
+  if (DWOCreated)
+    DWO.reset();
+
+  // Keep memory down by clearing DIEs if this generate function
+  // caused them to be parsed.
+  if (ClearDIEs)
+    clearDIEs(true);
+}
+
+const DWARFDebugInfoEntryMinimal *
+DWARFUnit::getSubprogramForAddress(uint64_t Address) {
+  extractDIEsIfNeeded(false);
+  for (const DWARFDebugInfoEntryMinimal &DIE : DieArray) {
+    if (DIE.isSubprogramDIE() &&
+        DIE.addressRangeContainsAddress(this, Address)) {
+      return &DIE;
+    }
+  }
+  return nullptr;
+}
+
+DWARFDebugInfoEntryInlinedChain
+DWARFUnit::getInlinedChainForAddress(uint64_t Address) {
+  // First, find a subprogram that contains the given address (the root
+  // of inlined chain).
+  const DWARFUnit *ChainCU = nullptr;
+  const DWARFDebugInfoEntryMinimal *SubprogramDIE =
+      getSubprogramForAddress(Address);
+  if (SubprogramDIE) {
+    ChainCU = this;
+  } else {
+    // Try to look for subprogram DIEs in the DWO file.
+    parseDWO();
+    if (DWO.get()) {
+      SubprogramDIE = DWO->getUnit()->getSubprogramForAddress(Address);
+      if (SubprogramDIE)
+        ChainCU = DWO->getUnit();
+    }
+  }
+
+  // Get inlined chain rooted at this subprogram DIE.
+  if (!SubprogramDIE)
+    return DWARFDebugInfoEntryInlinedChain();
+  return SubprogramDIE->getInlinedChainForAddress(ChainCU, Address);
+}
diff --git a/lib/DebugInfo/DWARF/LLVMBuild.txt b/lib/DebugInfo/DWARF/LLVMBuild.txt
new file mode 100644
index 0000000..9f8b104
--- /dev/null
+++ b/lib/DebugInfo/DWARF/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/DebugInfo/DWARF/LLVMBuild.txt ----------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DebugInfoDWARF
+parent = DebugInfo
+required_libraries = Object Support
diff --git a/lib/DebugInfo/DWARF/Makefile b/lib/DebugInfo/DWARF/Makefile
new file mode 100644
index 0000000..8633373
--- /dev/null
+++ b/lib/DebugInfo/DWARF/Makefile
@@ -0,0 +1,14 @@
+##===- lib/DebugInfo/DWARF/Makefile ------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMDebugInfoDWARF
+BUILD_ARCHIVE := 1
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp b/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
new file mode 100644
index 0000000..a6b4c65
--- /dev/null
+++ b/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
@@ -0,0 +1,37 @@
+//===-- SyntaxHighlighting.cpp ----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SyntaxHighlighting.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+using namespace dwarf;
+using namespace syntax;
+
+static cl::opt<cl::boolOrDefault>
+    UseColor("color",
+             cl::desc("use colored syntax highlighting (default=autodetect)"),
+             cl::init(cl::BOU_UNSET));
+
+WithColor::WithColor(llvm::raw_ostream &OS, enum HighlightColor Type) : OS(OS) {
+  // Detect color from terminal type unless the user passed the --color option.
+  if (UseColor == cl::BOU_UNSET ? OS.has_colors() : UseColor == cl::BOU_TRUE) {
+    switch (Type) {
+    case Address:    OS.changeColor(llvm::raw_ostream::YELLOW);  break;
+    case String:     OS.changeColor(llvm::raw_ostream::GREEN);   break;
+    case Tag:        OS.changeColor(llvm::raw_ostream::BLUE);    break;
+    case Attribute:  OS.changeColor(llvm::raw_ostream::CYAN);    break;
+    case Enumerator: OS.changeColor(llvm::raw_ostream::MAGENTA); break;
+    }
+  }
+}
+
+WithColor::~WithColor() {
+  if (UseColor == cl::BOU_UNSET ? OS.has_colors() : UseColor == cl::BOU_TRUE)
+    OS.resetColor();
+}
diff --git a/lib/DebugInfo/DWARF/SyntaxHighlighting.h b/lib/DebugInfo/DWARF/SyntaxHighlighting.h
new file mode 100644
index 0000000..946a313
--- /dev/null
+++ b/lib/DebugInfo/DWARF/SyntaxHighlighting.h
@@ -0,0 +1,39 @@
+//===-- SyntaxHighlighting.h ------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
+#define LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
+
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace dwarf {
+namespace syntax {
+
+// Symbolic names for various syntax elements.
+enum HighlightColor { Address, String, Tag, Attribute, Enumerator };
+
+/// An RAII object that temporarily switches an output stream to a
+/// specific color.
+class WithColor {
+  llvm::raw_ostream &OS;
+
+public:
+  /// To be used like this: WithColor(OS, syntax::String) << "text";
+  WithColor(llvm::raw_ostream &OS, enum HighlightColor Type);
+  ~WithColor();
+
+  llvm::raw_ostream& get() { return OS; }
+  operator llvm::raw_ostream& () { return OS; }
+};
+}
+}
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARF/module.modulemap b/lib/DebugInfo/DWARF/module.modulemap
new file mode 100644
index 0000000..c2f624f
--- /dev/null
+++ b/lib/DebugInfo/DWARF/module.modulemap
@@ -0,0 +1 @@
+module DebugInfoDWARF { requires cplusplus umbrella "." module * { export * } }
diff --git a/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
deleted file mode 100644
index c3e570e..0000000
--- a/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-//===-- DWARFAbbreviationDeclaration.cpp ----------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFAbbreviationDeclaration.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-using namespace dwarf;
-
-void DWARFAbbreviationDeclaration::clear() {
-  Code = 0;
-  Tag = 0;
-  HasChildren = false;
-  AttributeSpecs.clear();
-}
-
-DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() {
-  clear();
-}
-
-bool
-DWARFAbbreviationDeclaration::extract(DataExtractor Data, uint32_t* OffsetPtr) {
-  clear();
-  Code = Data.getULEB128(OffsetPtr);
-  if (Code == 0) {
-    return false;
-  }
-  Tag = Data.getULEB128(OffsetPtr);
-  uint8_t ChildrenByte = Data.getU8(OffsetPtr);
-  HasChildren = (ChildrenByte == DW_CHILDREN_yes);
-
-  while (true) {
-    uint32_t CurOffset = *OffsetPtr;
-    uint16_t Attr = Data.getULEB128(OffsetPtr);
-    if (CurOffset == *OffsetPtr) {
-      clear();
-      return false;
-    }
-    CurOffset = *OffsetPtr;
-    uint16_t Form = Data.getULEB128(OffsetPtr);
-    if (CurOffset == *OffsetPtr) {
-      clear();
-      return false;
-    }
-    if (Attr == 0 && Form == 0)
-      break;
-    AttributeSpecs.push_back(AttributeSpec(Attr, Form));
-  }
-
-  if (Tag == 0) {
-    clear();
-    return false;
-  }
-  return true;
-}
-
-void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
-  const char *tagString = TagString(getTag());
-  OS << '[' << getCode() << "] ";
-  if (tagString)
-    OS << tagString;
-  else
-    OS << format("DW_TAG_Unknown_%x", getTag());
-  OS << "\tDW_CHILDREN_" << (hasChildren() ? "yes" : "no") << '\n';
-  for (const AttributeSpec &Spec : AttributeSpecs) {
-    OS << '\t';
-    const char *attrString = AttributeString(Spec.Attr);
-    if (attrString)
-      OS << attrString;
-    else
-      OS << format("DW_AT_Unknown_%x", Spec.Attr);
-    OS << '\t';
-    const char *formString = FormEncodingString(Spec.Form);
-    if (formString)
-      OS << formString;
-    else
-      OS << format("DW_FORM_Unknown_%x", Spec.Form);
-    OS << '\n';
-  }
-  OS << '\n';
-}
-
-uint32_t
-DWARFAbbreviationDeclaration::findAttributeIndex(uint16_t attr) const {
-  for (uint32_t i = 0, e = AttributeSpecs.size(); i != e; ++i) {
-    if (AttributeSpecs[i].Attr == attr)
-      return i;
-  }
-  return -1U;
-}
diff --git a/lib/DebugInfo/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARFAcceleratorTable.cpp
deleted file mode 100644
index 703274d..0000000
--- a/lib/DebugInfo/DWARFAcceleratorTable.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-//===--- DWARFAcceleratorTable.cpp ----------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFAcceleratorTable.h"
-
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace llvm {
-
-bool DWARFAcceleratorTable::extract() {
-  uint32_t Offset = 0;
-
-  // Check that we can at least read the header.
-  if (!AccelSection.isValidOffset(offsetof(Header, HeaderDataLength)+4))
-    return false;
-
-  Hdr.Magic = AccelSection.getU32(&Offset);
-  Hdr.Version = AccelSection.getU16(&Offset);
-  Hdr.HashFunction = AccelSection.getU16(&Offset);
-  Hdr.NumBuckets = AccelSection.getU32(&Offset);
-  Hdr.NumHashes = AccelSection.getU32(&Offset);
-  Hdr.HeaderDataLength = AccelSection.getU32(&Offset);
-
-  // Check that we can read all the hashes and offsets from the
-  // section (see SourceLevelDebugging.rst for the structure of the index).
-  if (!AccelSection.isValidOffset(sizeof(Hdr) + Hdr.HeaderDataLength +
-                                  Hdr.NumBuckets*4 + Hdr.NumHashes*8))
-    return false;
-
-  HdrData.DIEOffsetBase = AccelSection.getU32(&Offset);
-  uint32_t NumAtoms = AccelSection.getU32(&Offset);
-
-  for (unsigned i = 0; i < NumAtoms; ++i) {
-    uint16_t AtomType = AccelSection.getU16(&Offset);
-    uint16_t AtomForm = AccelSection.getU16(&Offset);
-    HdrData.Atoms.push_back(std::make_pair(AtomType, AtomForm));
-  }
-
-  return true;
-}
-
-void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
-  // Dump the header.
-  OS << "Magic = " << format("0x%08x", Hdr.Magic) << '\n'
-     << "Version = " << format("0x%04x", Hdr.Version) << '\n'
-     << "Hash function = " << format("0x%08x", Hdr.HashFunction) << '\n'
-     << "Bucket count = " << Hdr.NumBuckets << '\n'
-     << "Hashes count = " << Hdr.NumHashes << '\n'
-     << "HeaderData length = " << Hdr.HeaderDataLength << '\n'
-     << "DIE offset base = " << HdrData.DIEOffsetBase << '\n'
-     << "Number of atoms = " << HdrData.Atoms.size() << '\n';
-
-  unsigned i = 0;
-  SmallVector<DWARFFormValue, 3> AtomForms;
-  for (const auto &Atom: HdrData.Atoms) {
-    OS << format("Atom[%d] Type: ", i++);
-    if (const char *TypeString = dwarf::AtomTypeString(Atom.first))
-      OS << TypeString;
-    else
-      OS << format("DW_ATOM_Unknown_0x%x", Atom.first);
-    OS << " Form: ";
-    if (const char *FormString = dwarf::FormEncodingString(Atom.second))
-      OS << FormString;
-    else
-      OS << format("DW_FORM_Unknown_0x%x", Atom.second);
-    OS << '\n';
-    AtomForms.push_back(DWARFFormValue(Atom.second));
-  }
-
-  // Now go through the actual tables and dump them.
-  uint32_t Offset = sizeof(Hdr) + Hdr.HeaderDataLength;
-  unsigned HashesBase = Offset + Hdr.NumBuckets * 4;
-  unsigned OffsetsBase = HashesBase + Hdr.NumHashes * 4;
-
-  for (unsigned Bucket = 0; Bucket < Hdr.NumBuckets; ++Bucket) {
-    unsigned Index = AccelSection.getU32(&Offset);
-
-    OS << format("Bucket[%d]\n", Bucket);
-    if (Index == UINT32_MAX) {
-      OS << "  EMPTY\n";
-      continue;
-    }
-
-    for (unsigned HashIdx = Index; HashIdx < Hdr.NumHashes; ++HashIdx) {
-      unsigned HashOffset = HashesBase + HashIdx*4;
-      unsigned OffsetsOffset = OffsetsBase + HashIdx*4;
-      uint32_t Hash = AccelSection.getU32(&HashOffset);
-
-      if (Hash % Hdr.NumBuckets != Bucket)
-        break;
-
-      unsigned DataOffset = AccelSection.getU32(&OffsetsOffset);
-      OS << format("  Hash = 0x%08x Offset = 0x%08x\n", Hash, DataOffset);
-      if (!AccelSection.isValidOffset(DataOffset)) {
-        OS << "    Invalid section offset\n";
-        continue;
-      }
-      while (AccelSection.isValidOffsetForDataOfSize(DataOffset, 4)) {
-        unsigned StringOffset = AccelSection.getU32(&DataOffset);
-        RelocAddrMap::const_iterator Reloc = Relocs.find(DataOffset-4);
-        if (Reloc != Relocs.end())
-          StringOffset += Reloc->second.second;
-        if (!StringOffset)
-          break;
-        OS << format("    Name: %08x \"%s\"\n", StringOffset,
-                     StringSection.getCStr(&StringOffset));
-        unsigned NumData = AccelSection.getU32(&DataOffset);
-        for (unsigned Data = 0; Data < NumData; ++Data) {
-          OS << format("    Data[%d] => ", Data);
-          unsigned i = 0;
-          for (auto &Atom : AtomForms) {
-            OS << format("{Atom[%d]: ", i++);
-            if (Atom.extractValue(AccelSection, &DataOffset, nullptr))
-              Atom.dump(OS, nullptr);
-            else
-              OS << "Error extracting the value";
-            OS << "} ";
-          }
-          OS << '\n';
-        }
-      }
-    }
-  }
-}
-}
diff --git a/lib/DebugInfo/DWARFAcceleratorTable.h b/lib/DebugInfo/DWARFAcceleratorTable.h
deleted file mode 100644
index 7dc9591..0000000
--- a/lib/DebugInfo/DWARFAcceleratorTable.h
+++ /dev/null
@@ -1,51 +0,0 @@
-//===--- DWARFAcceleratorTable.h --------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFRelocMap.h"
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/DWARFFormValue.h"
-
-#include <cstdint>
-
-namespace llvm {
-
-class DWARFAcceleratorTable {
-
-  struct Header {
-    uint32_t Magic;
-    uint16_t Version;
-    uint16_t HashFunction;
-    uint32_t NumBuckets;
-    uint32_t NumHashes;
-    uint32_t HeaderDataLength;
-  };
-
-  struct HeaderData {
-    typedef uint16_t AtomType;
-    typedef uint16_t Form;
-    uint32_t DIEOffsetBase;
-    SmallVector<std::pair<AtomType, Form>, 3> Atoms;
-  };
-
-  struct Header Hdr;
-  struct HeaderData HdrData;
-  DataExtractor AccelSection;
-  DataExtractor StringSection;
-  const RelocAddrMap& Relocs;
-public:
-  DWARFAcceleratorTable(DataExtractor AccelSection, DataExtractor StringSection,
-                        const RelocAddrMap &Relocs)
-    : AccelSection(AccelSection), StringSection(StringSection), Relocs(Relocs) {}
-
-  bool extract();
-  void dump(raw_ostream &OS) const;
-};
-
-}
diff --git a/lib/DebugInfo/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARFCompileUnit.cpp
deleted file mode 100644
index 33869d8..0000000
--- a/lib/DebugInfo/DWARFCompileUnit.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-//===-- DWARFCompileUnit.cpp ----------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFCompileUnit.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-void DWARFCompileUnit::dump(raw_ostream &OS) {
-  OS << format("0x%08x", getOffset()) << ": Compile Unit:"
-     << " length = " << format("0x%08x", getLength())
-     << " version = " << format("0x%04x", getVersion())
-     << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
-     << " addr_size = " << format("0x%02x", getAddressByteSize())
-     << " (next unit at " << format("0x%08x", getNextUnitOffset())
-     << ")\n";
-
-  const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
-  assert(CU && "Null Compile Unit?");
-  CU->dump(OS, this, -1U);
-}
-
-// VTable anchor.
-DWARFCompileUnit::~DWARFCompileUnit() {
-}
diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h
deleted file mode 100644
index b3190b18..0000000
--- a/lib/DebugInfo/DWARFCompileUnit.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===-- DWARFCompileUnit.h --------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_DEBUGINFO_DWARFCOMPILEUNIT_H
-#define LLVM_LIB_DEBUGINFO_DWARFCOMPILEUNIT_H
-
-#include "DWARFUnit.h"
-
-namespace llvm {
-
-class DWARFCompileUnit : public DWARFUnit {
-public:
-  DWARFCompileUnit(DWARFContext &Context, const DWARFSection &Section,
-                   const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
-                   StringRef SOS, StringRef AOS, bool LE,
-                   const DWARFUnitSectionBase &UnitSection)
-      : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LE, UnitSection) {}
-  void dump(raw_ostream &OS);
-  // VTable anchor.
-  ~DWARFCompileUnit() override;
-};
-
-}
-
-#endif
diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp
deleted file mode 100644
index 9a2c7cc..0000000
--- a/lib/DebugInfo/DWARFContext.cpp
+++ /dev/null
@@ -1,697 +0,0 @@
-//===-- DWARFContext.cpp --------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFContext.h"
-#include "DWARFDebugArangeSet.h"
-#include "DWARFAcceleratorTable.h"
-
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/Compression.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-using namespace llvm;
-using namespace dwarf;
-using namespace object;
-
-#define DEBUG_TYPE "dwarf"
-
-typedef DWARFDebugLine::LineTable DWARFLineTable;
-typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
-typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind;
-
-static void dumpPubSection(raw_ostream &OS, StringRef Name, StringRef Data,
-                           bool LittleEndian, bool GnuStyle) {
-  OS << "\n." << Name << " contents:\n";
-  DataExtractor pubNames(Data, LittleEndian, 0);
-  uint32_t offset = 0;
-  while (pubNames.isValidOffset(offset)) {
-    OS << "length = " << format("0x%08x", pubNames.getU32(&offset));
-    OS << " version = " << format("0x%04x", pubNames.getU16(&offset));
-    OS << " unit_offset = " << format("0x%08x", pubNames.getU32(&offset));
-    OS << " unit_size = " << format("0x%08x", pubNames.getU32(&offset)) << '\n';
-    if (GnuStyle)
-      OS << "Offset     Linkage  Kind     Name\n";
-    else
-      OS << "Offset     Name\n";
-
-    while (offset < Data.size()) {
-      uint32_t dieRef = pubNames.getU32(&offset);
-      if (dieRef == 0)
-        break;
-      OS << format("0x%8.8x ", dieRef);
-      if (GnuStyle) {
-        PubIndexEntryDescriptor desc(pubNames.getU8(&offset));
-        OS << format("%-8s", dwarf::GDBIndexEntryLinkageString(desc.Linkage))
-           << ' ' << format("%-8s", dwarf::GDBIndexEntryKindString(desc.Kind))
-           << ' ';
-      }
-      OS << '\"' << pubNames.getCStr(&offset) << "\"\n";
-    }
-  }
-}
-
-static void dumpAccelSection(raw_ostream &OS, StringRef Name,
-                             const DWARFSection& Section, StringRef StringSection,
-                             bool LittleEndian) {
-  DataExtractor AccelSection(Section.Data, LittleEndian, 0);
-  DataExtractor StrData(StringSection, LittleEndian, 0);
-  OS << "\n." << Name << " contents:\n";
-  DWARFAcceleratorTable Accel(AccelSection, StrData, Section.Relocs);
-  if (!Accel.extract())
-    return;
-  Accel.dump(OS);
-}
-
-void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
-  if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) {
-    OS << ".debug_abbrev contents:\n";
-    getDebugAbbrev()->dump(OS);
-  }
-
-  if (DumpType == DIDT_All || DumpType == DIDT_AbbrevDwo)
-    if (const DWARFDebugAbbrev *D = getDebugAbbrevDWO()) {
-      OS << "\n.debug_abbrev.dwo contents:\n";
-      D->dump(OS);
-    }
-
-  if (DumpType == DIDT_All || DumpType == DIDT_Info) {
-    OS << "\n.debug_info contents:\n";
-    for (const auto &CU : compile_units())
-      CU->dump(OS);
-  }
-
-  if ((DumpType == DIDT_All || DumpType == DIDT_InfoDwo) &&
-      getNumDWOCompileUnits()) {
-    OS << "\n.debug_info.dwo contents:\n";
-    for (const auto &DWOCU : dwo_compile_units())
-      DWOCU->dump(OS);
-  }
-
-  if ((DumpType == DIDT_All || DumpType == DIDT_Types) && getNumTypeUnits()) {
-    OS << "\n.debug_types contents:\n";
-    for (const auto &TUS : type_unit_sections())
-      for (const auto &TU : TUS)
-        TU->dump(OS);
-  }
-
-  if ((DumpType == DIDT_All || DumpType == DIDT_TypesDwo) &&
-      getNumDWOTypeUnits()) {
-    OS << "\n.debug_types.dwo contents:\n";
-    for (const auto &DWOTUS : dwo_type_unit_sections())
-      for (const auto &DWOTU : DWOTUS)
-        DWOTU->dump(OS);
-  }
-
-  if (DumpType == DIDT_All || DumpType == DIDT_Loc) {
-    OS << "\n.debug_loc contents:\n";
-    getDebugLoc()->dump(OS);
-  }
-
-  if (DumpType == DIDT_All || DumpType == DIDT_LocDwo) {
-    OS << "\n.debug_loc.dwo contents:\n";
-    getDebugLocDWO()->dump(OS);
-  }
-
-  if (DumpType == DIDT_All || DumpType == DIDT_Frames) {
-    OS << "\n.debug_frame contents:\n";
-    getDebugFrame()->dump(OS);
-  }
-
-  uint32_t offset = 0;
-  if (DumpType == DIDT_All || DumpType == DIDT_Aranges) {
-    OS << "\n.debug_aranges contents:\n";
-    DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0);
-    DWARFDebugArangeSet set;
-    while (set.extract(arangesData, &offset))
-      set.dump(OS);
-  }
-
-  uint8_t savedAddressByteSize = 0;
-  if (DumpType == DIDT_All || DumpType == DIDT_Line) {
-    OS << "\n.debug_line contents:\n";
-    for (const auto &CU : compile_units()) {
-      savedAddressByteSize = CU->getAddressByteSize();
-      unsigned stmtOffset =
-          CU->getCompileUnitDIE()->getAttributeValueAsSectionOffset(
-              CU.get(), DW_AT_stmt_list, -1U);
-      if (stmtOffset != -1U) {
-        DataExtractor lineData(getLineSection().Data, isLittleEndian(),
-                               savedAddressByteSize);
-        DWARFDebugLine::LineTable LineTable;
-        LineTable.parse(lineData, &getLineSection().Relocs, &stmtOffset);
-        LineTable.dump(OS);
-      }
-    }
-  }
-
-  if (DumpType == DIDT_All || DumpType == DIDT_LineDwo) {
-    OS << "\n.debug_line.dwo contents:\n";
-    unsigned stmtOffset = 0;
-    DataExtractor lineData(getLineDWOSection().Data, isLittleEndian(),
-                           savedAddressByteSize);
-    DWARFDebugLine::LineTable LineTable;
-    while (LineTable.Prologue.parse(lineData, &stmtOffset)) {
-      LineTable.dump(OS);
-      LineTable.clear();
-    }
-  }
-
-  if (DumpType == DIDT_All || DumpType == DIDT_Str) {
-    OS << "\n.debug_str contents:\n";
-    DataExtractor strData(getStringSection(), isLittleEndian(), 0);
-    offset = 0;
-    uint32_t strOffset = 0;
-    while (const char *s = strData.getCStr(&offset)) {
-      OS << format("0x%8.8x: \"%s\"\n", strOffset, s);
-      strOffset = offset;
-    }
-  }
-
-  if ((DumpType == DIDT_All || DumpType == DIDT_StrDwo) &&
-      !getStringDWOSection().empty()) {
-    OS << "\n.debug_str.dwo contents:\n";
-    DataExtractor strDWOData(getStringDWOSection(), isLittleEndian(), 0);
-    offset = 0;
-    uint32_t strDWOOffset = 0;
-    while (const char *s = strDWOData.getCStr(&offset)) {
-      OS << format("0x%8.8x: \"%s\"\n", strDWOOffset, s);
-      strDWOOffset = offset;
-    }
-  }
-
-  if (DumpType == DIDT_All || DumpType == DIDT_Ranges) {
-    OS << "\n.debug_ranges contents:\n";
-    // In fact, different compile units may have different address byte
-    // sizes, but for simplicity we just use the address byte size of the last
-    // compile unit (there is no easy and fast way to associate address range
-    // list and the compile unit it describes).
-    DataExtractor rangesData(getRangeSection(), isLittleEndian(),
-                             savedAddressByteSize);
-    offset = 0;
-    DWARFDebugRangeList rangeList;
-    while (rangeList.extract(rangesData, &offset))
-      rangeList.dump(OS);
-  }
-
-  if (DumpType == DIDT_All || DumpType == DIDT_Pubnames)
-    dumpPubSection(OS, "debug_pubnames", getPubNamesSection(),
-                   isLittleEndian(), false);
-
-  if (DumpType == DIDT_All || DumpType == DIDT_Pubtypes)
-    dumpPubSection(OS, "debug_pubtypes", getPubTypesSection(),
-                   isLittleEndian(), false);
-
-  if (DumpType == DIDT_All || DumpType == DIDT_GnuPubnames)
-    dumpPubSection(OS, "debug_gnu_pubnames", getGnuPubNamesSection(),
-                   isLittleEndian(), true /* GnuStyle */);
-
-  if (DumpType == DIDT_All || DumpType == DIDT_GnuPubtypes)
-    dumpPubSection(OS, "debug_gnu_pubtypes", getGnuPubTypesSection(),
-                   isLittleEndian(), true /* GnuStyle */);
-
-  if ((DumpType == DIDT_All || DumpType == DIDT_StrOffsetsDwo) &&
-      !getStringOffsetDWOSection().empty()) {
-    OS << "\n.debug_str_offsets.dwo contents:\n";
-    DataExtractor strOffsetExt(getStringOffsetDWOSection(), isLittleEndian(),
-                               0);
-    offset = 0;
-    uint64_t size = getStringOffsetDWOSection().size();
-    while (offset < size) {
-      OS << format("0x%8.8x: ", offset);
-      OS << format("%8.8x\n", strOffsetExt.getU32(&offset));
-    }
-  }
-
-  if (DumpType == DIDT_All || DumpType == DIDT_AppleNames)
-    dumpAccelSection(OS, "apple_names", getAppleNamesSection(),
-                     getStringSection(), isLittleEndian());
-
-  if (DumpType == DIDT_All || DumpType == DIDT_AppleTypes)
-    dumpAccelSection(OS, "apple_types", getAppleTypesSection(),
-                     getStringSection(), isLittleEndian());
-
-  if (DumpType == DIDT_All || DumpType == DIDT_AppleNamespaces)
-    dumpAccelSection(OS, "apple_namespaces", getAppleNamespacesSection(),
-                     getStringSection(), isLittleEndian());
-
-  if (DumpType == DIDT_All || DumpType == DIDT_AppleObjC)
-    dumpAccelSection(OS, "apple_objc", getAppleObjCSection(),
-                     getStringSection(), isLittleEndian());
-}
-
-const DWARFDebugAbbrev *DWARFContext::getDebugAbbrev() {
-  if (Abbrev)
-    return Abbrev.get();
-
-  DataExtractor abbrData(getAbbrevSection(), isLittleEndian(), 0);
-
-  Abbrev.reset(new DWARFDebugAbbrev());
-  Abbrev->extract(abbrData);
-  return Abbrev.get();
-}
-
-const DWARFDebugAbbrev *DWARFContext::getDebugAbbrevDWO() {
-  if (AbbrevDWO)
-    return AbbrevDWO.get();
-
-  DataExtractor abbrData(getAbbrevDWOSection(), isLittleEndian(), 0);
-  AbbrevDWO.reset(new DWARFDebugAbbrev());
-  AbbrevDWO->extract(abbrData);
-  return AbbrevDWO.get();
-}
-
-const DWARFDebugLoc *DWARFContext::getDebugLoc() {
-  if (Loc)
-    return Loc.get();
-
-  DataExtractor LocData(getLocSection().Data, isLittleEndian(), 0);
-  Loc.reset(new DWARFDebugLoc(getLocSection().Relocs));
-  // assume all compile units have the same address byte size
-  if (getNumCompileUnits())
-    Loc->parse(LocData, getCompileUnitAtIndex(0)->getAddressByteSize());
-  return Loc.get();
-}
-
-const DWARFDebugLocDWO *DWARFContext::getDebugLocDWO() {
-  if (LocDWO)
-    return LocDWO.get();
-
-  DataExtractor LocData(getLocDWOSection().Data, isLittleEndian(), 0);
-  LocDWO.reset(new DWARFDebugLocDWO());
-  LocDWO->parse(LocData);
-  return LocDWO.get();
-}
-
-const DWARFDebugAranges *DWARFContext::getDebugAranges() {
-  if (Aranges)
-    return Aranges.get();
-
-  Aranges.reset(new DWARFDebugAranges());
-  Aranges->generate(this);
-  return Aranges.get();
-}
-
-const DWARFDebugFrame *DWARFContext::getDebugFrame() {
-  if (DebugFrame)
-    return DebugFrame.get();
-
-  // There's a "bug" in the DWARFv3 standard with respect to the target address
-  // size within debug frame sections. While DWARF is supposed to be independent
-  // of its container, FDEs have fields with size being "target address size",
-  // which isn't specified in DWARF in general. It's only specified for CUs, but
-  // .eh_frame can appear without a .debug_info section. Follow the example of
-  // other tools (libdwarf) and extract this from the container (ObjectFile
-  // provides this information). This problem is fixed in DWARFv4
-  // See this dwarf-discuss discussion for more details:
-  // http://lists.dwarfstd.org/htdig.cgi/dwarf-discuss-dwarfstd.org/2011-December/001173.html
-  DataExtractor debugFrameData(getDebugFrameSection(), isLittleEndian(),
-                               getAddressSize());
-  DebugFrame.reset(new DWARFDebugFrame());
-  DebugFrame->parse(debugFrameData);
-  return DebugFrame.get();
-}
-
-const DWARFLineTable *
-DWARFContext::getLineTableForUnit(DWARFUnit *cu) {
-  if (!Line)
-    Line.reset(new DWARFDebugLine(&getLineSection().Relocs));
-
-  unsigned stmtOffset =
-      cu->getCompileUnitDIE()->getAttributeValueAsSectionOffset(
-          cu, DW_AT_stmt_list, -1U);
-  if (stmtOffset == -1U)
-    return nullptr; // No line table for this compile unit.
-
-  // See if the line table is cached.
-  if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset))
-    return lt;
-
-  // We have to parse it first.
-  DataExtractor lineData(getLineSection().Data, isLittleEndian(),
-                         cu->getAddressByteSize());
-  return Line->getOrParseLineTable(lineData, stmtOffset);
-}
-
-void DWARFContext::parseCompileUnits() {
-  CUs.parse(*this, getInfoSection());
-}
-
-void DWARFContext::parseTypeUnits() {
-  if (!TUs.empty())
-    return;
-  for (const auto &I : getTypesSections()) {
-    TUs.push_back(DWARFUnitSection<DWARFTypeUnit>());
-    TUs.back().parse(*this, I.second);
-  }
-}
-
-void DWARFContext::parseDWOCompileUnits() {
-  DWOCUs.parseDWO(*this, getInfoDWOSection());
-}
-
-void DWARFContext::parseDWOTypeUnits() {
-  if (!DWOTUs.empty())
-    return;
-  for (const auto &I : getTypesDWOSections()) {
-    DWOTUs.push_back(DWARFUnitSection<DWARFTypeUnit>());
-    DWOTUs.back().parseDWO(*this, I.second);
-  }
-}
-
-DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
-  parseCompileUnits();
-  return CUs.getUnitForOffset(Offset);
-}
-
-DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
-  // First, get the offset of the compile unit.
-  uint32_t CUOffset = getDebugAranges()->findAddress(Address);
-  // Retrieve the compile unit.
-  return getCompileUnitForOffset(CUOffset);
-}
-
-static bool getFunctionNameForAddress(DWARFCompileUnit *CU, uint64_t Address,
-                                      FunctionNameKind Kind,
-                                      std::string &FunctionName) {
-  if (Kind == FunctionNameKind::None)
-    return false;
-  // The address may correspond to instruction in some inlined function,
-  // so we have to build the chain of inlined functions and take the
-  // name of the topmost function in it.
-  const DWARFDebugInfoEntryInlinedChain &InlinedChain =
-      CU->getInlinedChainForAddress(Address);
-  if (InlinedChain.DIEs.size() == 0)
-    return false;
-  const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain.DIEs[0];
-  if (const char *Name =
-          TopFunctionDIE.getSubroutineName(InlinedChain.U, Kind)) {
-    FunctionName = Name;
-    return true;
-  }
-  return false;
-}
-
-DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
-                                               DILineInfoSpecifier Spec) {
-  DILineInfo Result;
-
-  DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
-  if (!CU)
-    return Result;
-  getFunctionNameForAddress(CU, Address, Spec.FNKind, Result.FunctionName);
-  if (Spec.FLIKind != FileLineInfoKind::None) {
-    if (const DWARFLineTable *LineTable = getLineTableForUnit(CU))
-      LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
-                                           Spec.FLIKind, Result);
-  }
-  return Result;
-}
-
-DILineInfoTable
-DWARFContext::getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
-                                         DILineInfoSpecifier Spec) {
-  DILineInfoTable  Lines;
-  DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
-  if (!CU)
-    return Lines;
-
-  std::string FunctionName = "<invalid>";
-  getFunctionNameForAddress(CU, Address, Spec.FNKind, FunctionName);
-
-  // If the Specifier says we don't need FileLineInfo, just
-  // return the top-most function at the starting address.
-  if (Spec.FLIKind == FileLineInfoKind::None) {
-    DILineInfo Result;
-    Result.FunctionName = FunctionName;
-    Lines.push_back(std::make_pair(Address, Result));
-    return Lines;
-  }
-
-  const DWARFLineTable *LineTable = getLineTableForUnit(CU);
-
-  // Get the index of row we're looking for in the line table.
-  std::vector<uint32_t> RowVector;
-  if (!LineTable->lookupAddressRange(Address, Size, RowVector))
-    return Lines;
-
-  for (uint32_t RowIndex : RowVector) {
-    // Take file number and line/column from the row.
-    const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
-    DILineInfo Result;
-    LineTable->getFileNameByIndex(Row.File, CU->getCompilationDir(),
-                                  Spec.FLIKind, Result.FileName);
-    Result.FunctionName = FunctionName;
-    Result.Line = Row.Line;
-    Result.Column = Row.Column;
-    Lines.push_back(std::make_pair(Row.Address, Result));
-  }
-
-  return Lines;
-}
-
-DIInliningInfo
-DWARFContext::getInliningInfoForAddress(uint64_t Address,
-                                        DILineInfoSpecifier Spec) {
-  DIInliningInfo InliningInfo;
-
-  DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
-  if (!CU)
-    return InliningInfo;
-
-  const DWARFLineTable *LineTable = nullptr;
-  const DWARFDebugInfoEntryInlinedChain &InlinedChain =
-      CU->getInlinedChainForAddress(Address);
-  if (InlinedChain.DIEs.size() == 0) {
-    // If there is no DIE for address (e.g. it is in unavailable .dwo file),
-    // try to at least get file/line info from symbol table.
-    if (Spec.FLIKind != FileLineInfoKind::None) {
-      DILineInfo Frame;
-      LineTable = getLineTableForUnit(CU);
-      if (LineTable &&
-          LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
-                                               Spec.FLIKind, Frame))
-        InliningInfo.addFrame(Frame);
-    }
-    return InliningInfo;
-  }
-
-  uint32_t CallFile = 0, CallLine = 0, CallColumn = 0;
-  for (uint32_t i = 0, n = InlinedChain.DIEs.size(); i != n; i++) {
-    const DWARFDebugInfoEntryMinimal &FunctionDIE = InlinedChain.DIEs[i];
-    DILineInfo Frame;
-    // Get function name if necessary.
-    if (const char *Name =
-            FunctionDIE.getSubroutineName(InlinedChain.U, Spec.FNKind))
-      Frame.FunctionName = Name;
-    if (Spec.FLIKind != FileLineInfoKind::None) {
-      if (i == 0) {
-        // For the topmost frame, initialize the line table of this
-        // compile unit and fetch file/line info from it.
-        LineTable = getLineTableForUnit(CU);
-        // For the topmost routine, get file/line info from line table.
-        if (LineTable)
-          LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
-                                               Spec.FLIKind, Frame);
-      } else {
-        // Otherwise, use call file, call line and call column from
-        // previous DIE in inlined chain.
-        if (LineTable)
-          LineTable->getFileNameByIndex(CallFile, CU->getCompilationDir(),
-                                        Spec.FLIKind, Frame.FileName);
-        Frame.Line = CallLine;
-        Frame.Column = CallColumn;
-      }
-      // Get call file/line/column of a current DIE.
-      if (i + 1 < n) {
-        FunctionDIE.getCallerFrame(InlinedChain.U, CallFile, CallLine,
-                                   CallColumn);
-      }
-    }
-    InliningInfo.addFrame(Frame);
-  }
-  return InliningInfo;
-}
-
-static bool consumeCompressedDebugSectionHeader(StringRef &data,
-                                                uint64_t &OriginalSize) {
-  // Consume "ZLIB" prefix.
-  if (!data.startswith("ZLIB"))
-    return false;
-  data = data.substr(4);
-  // Consume uncompressed section size (big-endian 8 bytes).
-  DataExtractor extractor(data, false, 8);
-  uint32_t Offset = 0;
-  OriginalSize = extractor.getU64(&Offset);
-  if (Offset == 0)
-    return false;
-  data = data.substr(Offset);
-  return true;
-}
-
-DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj)
-    : IsLittleEndian(Obj.isLittleEndian()),
-      AddressSize(Obj.getBytesInAddress()) {
-  for (const SectionRef &Section : Obj.sections()) {
-    StringRef name;
-    Section.getName(name);
-    // Skip BSS and Virtual sections, they aren't interesting.
-    bool IsBSS = Section.isBSS();
-    if (IsBSS)
-      continue;
-    bool IsVirtual = Section.isVirtual();
-    if (IsVirtual)
-      continue;
-    StringRef data;
-    Section.getContents(data);
-
-    name = name.substr(name.find_first_not_of("._")); // Skip . and _ prefixes.
-
-    // Check if debug info section is compressed with zlib.
-    if (name.startswith("zdebug_")) {
-      uint64_t OriginalSize;
-      if (!zlib::isAvailable() ||
-          !consumeCompressedDebugSectionHeader(data, OriginalSize))
-        continue;
-      UncompressedSections.resize(UncompressedSections.size() + 1);
-      if (zlib::uncompress(data, UncompressedSections.back(), OriginalSize) !=
-          zlib::StatusOK) {
-        UncompressedSections.pop_back();
-        continue;
-      }
-      // Make data point to uncompressed section contents and save its contents.
-      name = name.substr(1);
-      data = UncompressedSections.back();
-    }
-
-    StringRef *SectionData =
-        StringSwitch<StringRef *>(name)
-            .Case("debug_info", &InfoSection.Data)
-            .Case("debug_abbrev", &AbbrevSection)
-            .Case("debug_loc", &LocSection.Data)
-            .Case("debug_line", &LineSection.Data)
-            .Case("debug_aranges", &ARangeSection)
-            .Case("debug_frame", &DebugFrameSection)
-            .Case("debug_str", &StringSection)
-            .Case("debug_ranges", &RangeSection)
-            .Case("debug_pubnames", &PubNamesSection)
-            .Case("debug_pubtypes", &PubTypesSection)
-            .Case("debug_gnu_pubnames", &GnuPubNamesSection)
-            .Case("debug_gnu_pubtypes", &GnuPubTypesSection)
-            .Case("debug_info.dwo", &InfoDWOSection.Data)
-            .Case("debug_abbrev.dwo", &AbbrevDWOSection)
-            .Case("debug_loc.dwo", &LocDWOSection.Data)
-            .Case("debug_line.dwo", &LineDWOSection.Data)
-            .Case("debug_str.dwo", &StringDWOSection)
-            .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
-            .Case("debug_addr", &AddrSection)
-            .Case("apple_names", &AppleNamesSection.Data)
-            .Case("apple_types", &AppleTypesSection.Data)
-            .Case("apple_namespaces", &AppleNamespacesSection.Data)
-            .Case("apple_namespac", &AppleNamespacesSection.Data)
-            .Case("apple_objc", &AppleObjCSection.Data)
-            // Any more debug info sections go here.
-            .Default(nullptr);
-    if (SectionData) {
-      *SectionData = data;
-      if (name == "debug_ranges") {
-        // FIXME: Use the other dwo range section when we emit it.
-        RangeDWOSection = data;
-      }
-    } else if (name == "debug_types") {
-      // Find debug_types data by section rather than name as there are
-      // multiple, comdat grouped, debug_types sections.
-      TypesSections[Section].Data = data;
-    } else if (name == "debug_types.dwo") {
-      TypesDWOSections[Section].Data = data;
-    }
-
-    section_iterator RelocatedSection = Section.getRelocatedSection();
-    if (RelocatedSection == Obj.section_end())
-      continue;
-
-    StringRef RelSecName;
-    RelocatedSection->getName(RelSecName);
-    RelSecName = RelSecName.substr(
-        RelSecName.find_first_not_of("._")); // Skip . and _ prefixes.
-
-    // TODO: Add support for relocations in other sections as needed.
-    // Record relocations for the debug_info and debug_line sections.
-    RelocAddrMap *Map = StringSwitch<RelocAddrMap*>(RelSecName)
-        .Case("debug_info", &InfoSection.Relocs)
-        .Case("debug_loc", &LocSection.Relocs)
-        .Case("debug_info.dwo", &InfoDWOSection.Relocs)
-        .Case("debug_line", &LineSection.Relocs)
-        .Case("apple_names", &AppleNamesSection.Relocs)
-        .Case("apple_types", &AppleTypesSection.Relocs)
-        .Case("apple_namespaces", &AppleNamespacesSection.Relocs)
-        .Case("apple_namespac", &AppleNamespacesSection.Relocs)
-        .Case("apple_objc", &AppleObjCSection.Relocs)
-        .Default(nullptr);
-    if (!Map) {
-      // Find debug_types relocs by section rather than name as there are
-      // multiple, comdat grouped, debug_types sections.
-      if (RelSecName == "debug_types")
-        Map = &TypesSections[*RelocatedSection].Relocs;
-      else if (RelSecName == "debug_types.dwo")
-        Map = &TypesDWOSections[*RelocatedSection].Relocs;
-      else
-        continue;
-    }
-
-    if (Section.relocation_begin() != Section.relocation_end()) {
-      uint64_t SectionSize = RelocatedSection->getSize();
-      for (const RelocationRef &Reloc : Section.relocations()) {
-        uint64_t Address;
-        Reloc.getOffset(Address);
-        uint64_t Type;
-        Reloc.getType(Type);
-        uint64_t SymAddr = 0;
-        object::symbol_iterator Sym = Reloc.getSymbol();
-        if (Sym != Obj.symbol_end())
-          Sym->getAddress(SymAddr);
-
-        object::RelocVisitor V(Obj);
-        object::RelocToApply R(V.visit(Type, Reloc, SymAddr));
-        if (V.error()) {
-          SmallString<32> Name;
-          std::error_code ec(Reloc.getTypeName(Name));
-          if (ec) {
-            errs() << "Aaaaaa! Nameless relocation! Aaaaaa!\n";
-          }
-          errs() << "error: failed to compute relocation: "
-                 << Name << "\n";
-          continue;
-        }
-
-        if (Address + R.Width > SectionSize) {
-          errs() << "error: " << R.Width << "-byte relocation starting "
-                 << Address << " bytes into section " << name << " which is "
-                 << SectionSize << " bytes long.\n";
-          continue;
-        }
-        if (R.Width > 8) {
-          errs() << "error: can't handle a relocation of more than 8 bytes at "
-                    "a time.\n";
-          continue;
-        }
-        DEBUG(dbgs() << "Writing " << format("%p", R.Value)
-                     << " at " << format("%p", Address)
-                     << " with width " << format("%d", R.Width)
-                     << "\n");
-        Map->insert(std::make_pair(Address, std::make_pair(R.Width, R.Value)));
-      }
-    }
-  }
-}
-
-void DWARFContextInMemory::anchor() { }
diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h
deleted file mode 100644
index dd3fcc7..0000000
--- a/lib/DebugInfo/DWARFContext.h
+++ /dev/null
@@ -1,292 +0,0 @@
-//===-- DWARFContext.h ------------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===/
-
-#ifndef LLVM_LIB_DEBUGINFO_DWARFCONTEXT_H
-#define LLVM_LIB_DEBUGINFO_DWARFCONTEXT_H
-
-#include "DWARFCompileUnit.h"
-#include "DWARFDebugAranges.h"
-#include "DWARFDebugFrame.h"
-#include "DWARFDebugLine.h"
-#include "DWARFDebugLoc.h"
-#include "DWARFDebugRangeList.h"
-#include "DWARFSection.h"
-#include "DWARFTypeUnit.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/DIContext.h"
-#include <vector>
-
-namespace llvm {
-
-/// DWARFContext
-/// This data structure is the top level entity that deals with dwarf debug
-/// information parsing. The actual data is supplied through pure virtual
-/// methods that a concrete implementation provides.
-class DWARFContext : public DIContext {
-
-  DWARFUnitSection<DWARFCompileUnit> CUs;
-  std::vector<DWARFUnitSection<DWARFTypeUnit>> TUs;
-  std::unique_ptr<DWARFDebugAbbrev> Abbrev;
-  std::unique_ptr<DWARFDebugLoc> Loc;
-  std::unique_ptr<DWARFDebugAranges> Aranges;
-  std::unique_ptr<DWARFDebugLine> Line;
-  std::unique_ptr<DWARFDebugFrame> DebugFrame;
-
-  DWARFUnitSection<DWARFCompileUnit> DWOCUs;
-  std::vector<DWARFUnitSection<DWARFTypeUnit>> DWOTUs;
-  std::unique_ptr<DWARFDebugAbbrev> AbbrevDWO;
-  std::unique_ptr<DWARFDebugLocDWO> LocDWO;
-
-  DWARFContext(DWARFContext &) LLVM_DELETED_FUNCTION;
-  DWARFContext &operator=(DWARFContext &) LLVM_DELETED_FUNCTION;
-
-  /// Read compile units from the debug_info section (if necessary)
-  /// and store them in CUs.
-  void parseCompileUnits();
-
-  /// Read type units from the debug_types sections (if necessary)
-  /// and store them in TUs.
-  void parseTypeUnits();
-
-  /// Read compile units from the debug_info.dwo section (if necessary)
-  /// and store them in DWOCUs.
-  void parseDWOCompileUnits();
-
-  /// Read type units from the debug_types.dwo section (if necessary)
-  /// and store them in DWOTUs.
-  void parseDWOTypeUnits();
-
-public:
-  DWARFContext() : DIContext(CK_DWARF) {}
-
-  static bool classof(const DIContext *DICtx) {
-    return DICtx->getKind() == CK_DWARF;
-  }
-
-  void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All) override;
-
-  typedef DWARFUnitSection<DWARFCompileUnit>::iterator_range cu_iterator_range;
-  typedef DWARFUnitSection<DWARFTypeUnit>::iterator_range tu_iterator_range;
-  typedef iterator_range<std::vector<DWARFUnitSection<DWARFTypeUnit>>::iterator> tu_section_iterator_range;
-
-  /// Get compile units in this context.
-  cu_iterator_range compile_units() {
-    parseCompileUnits();
-    return cu_iterator_range(CUs.begin(), CUs.end());
-  }
-
-  /// Get type units in this context.
-  tu_section_iterator_range type_unit_sections() {
-    parseTypeUnits();
-    return tu_section_iterator_range(TUs.begin(), TUs.end());
-  }
-
-  /// Get compile units in the DWO context.
-  cu_iterator_range dwo_compile_units() {
-    parseDWOCompileUnits();
-    return cu_iterator_range(DWOCUs.begin(), DWOCUs.end());
-  }
-
-  /// Get type units in the DWO context.
-  tu_section_iterator_range dwo_type_unit_sections() {
-    parseDWOTypeUnits();
-    return tu_section_iterator_range(DWOTUs.begin(), DWOTUs.end());
-  }
-
-  /// Get the number of compile units in this context.
-  unsigned getNumCompileUnits() {
-    parseCompileUnits();
-    return CUs.size();
-  }
-
-  /// Get the number of compile units in this context.
-  unsigned getNumTypeUnits() {
-    parseTypeUnits();
-    return TUs.size();
-  }
-
-  /// Get the number of compile units in the DWO context.
-  unsigned getNumDWOCompileUnits() {
-    parseDWOCompileUnits();
-    return DWOCUs.size();
-  }
-
-  /// Get the number of compile units in the DWO context.
-  unsigned getNumDWOTypeUnits() {
-    parseDWOTypeUnits();
-    return DWOTUs.size();
-  }
-
-  /// Get the compile unit at the specified index for this compile unit.
-  DWARFCompileUnit *getCompileUnitAtIndex(unsigned index) {
-    parseCompileUnits();
-    return CUs[index].get();
-  }
-
-  /// Get the compile unit at the specified index for the DWO compile units.
-  DWARFCompileUnit *getDWOCompileUnitAtIndex(unsigned index) {
-    parseDWOCompileUnits();
-    return DWOCUs[index].get();
-  }
-
-  /// Get a pointer to the parsed DebugAbbrev object.
-  const DWARFDebugAbbrev *getDebugAbbrev();
-
-  /// Get a pointer to the parsed DebugLoc object.
-  const DWARFDebugLoc *getDebugLoc();
-
-  /// Get a pointer to the parsed dwo abbreviations object.
-  const DWARFDebugAbbrev *getDebugAbbrevDWO();
-
-  /// Get a pointer to the parsed DebugLoc object.
-  const DWARFDebugLocDWO *getDebugLocDWO();
-
-  /// Get a pointer to the parsed DebugAranges object.
-  const DWARFDebugAranges *getDebugAranges();
-
-  /// Get a pointer to the parsed frame information object.
-  const DWARFDebugFrame *getDebugFrame();
-
-  /// Get a pointer to a parsed line table corresponding to a compile unit.
-  const DWARFDebugLine::LineTable *getLineTableForUnit(DWARFUnit *cu);
-
-  DILineInfo getLineInfoForAddress(uint64_t Address,
-      DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override;
-  DILineInfoTable getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
-      DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override;
-  DIInliningInfo getInliningInfoForAddress(uint64_t Address,
-      DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override;
-
-  virtual bool isLittleEndian() const = 0;
-  virtual uint8_t getAddressSize() const = 0;
-  virtual const DWARFSection &getInfoSection() = 0;
-  typedef MapVector<object::SectionRef, DWARFSection,
-                    std::map<object::SectionRef, unsigned>> TypeSectionMap;
-  virtual const TypeSectionMap &getTypesSections() = 0;
-  virtual StringRef getAbbrevSection() = 0;
-  virtual const DWARFSection &getLocSection() = 0;
-  virtual StringRef getARangeSection() = 0;
-  virtual StringRef getDebugFrameSection() = 0;
-  virtual const DWARFSection &getLineSection() = 0;
-  virtual StringRef getStringSection() = 0;
-  virtual StringRef getRangeSection() = 0;
-  virtual StringRef getPubNamesSection() = 0;
-  virtual StringRef getPubTypesSection() = 0;
-  virtual StringRef getGnuPubNamesSection() = 0;
-  virtual StringRef getGnuPubTypesSection() = 0;
-
-  // Sections for DWARF5 split dwarf proposal.
-  virtual const DWARFSection &getInfoDWOSection() = 0;
-  virtual const TypeSectionMap &getTypesDWOSections() = 0;
-  virtual StringRef getAbbrevDWOSection() = 0;
-  virtual const DWARFSection &getLineDWOSection() = 0;
-  virtual const DWARFSection &getLocDWOSection() = 0;
-  virtual StringRef getStringDWOSection() = 0;
-  virtual StringRef getStringOffsetDWOSection() = 0;
-  virtual StringRef getRangeDWOSection() = 0;
-  virtual StringRef getAddrSection() = 0;
-  virtual const DWARFSection& getAppleNamesSection() = 0;
-  virtual const DWARFSection& getAppleTypesSection() = 0;
-  virtual const DWARFSection& getAppleNamespacesSection() = 0;
-  virtual const DWARFSection& getAppleObjCSection() = 0;
-
-  static bool isSupportedVersion(unsigned version) {
-    return version == 2 || version == 3 || version == 4;
-  }
-private:
-  /// Return the compile unit that includes an offset (relative to .debug_info).
-  DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset);
-
-  /// Return the compile unit which contains instruction with provided
-  /// address.
-  DWARFCompileUnit *getCompileUnitForAddress(uint64_t Address);
-};
-
-/// DWARFContextInMemory is the simplest possible implementation of a
-/// DWARFContext. It assumes all content is available in memory and stores
-/// pointers to it.
-class DWARFContextInMemory : public DWARFContext {
-  virtual void anchor();
-  bool IsLittleEndian;
-  uint8_t AddressSize;
-  DWARFSection InfoSection;
-  TypeSectionMap TypesSections;
-  StringRef AbbrevSection;
-  DWARFSection LocSection;
-  StringRef ARangeSection;
-  StringRef DebugFrameSection;
-  DWARFSection LineSection;
-  StringRef StringSection;
-  StringRef RangeSection;
-  StringRef PubNamesSection;
-  StringRef PubTypesSection;
-  StringRef GnuPubNamesSection;
-  StringRef GnuPubTypesSection;
-
-  // Sections for DWARF5 split dwarf proposal.
-  DWARFSection InfoDWOSection;
-  TypeSectionMap TypesDWOSections;
-  StringRef AbbrevDWOSection;
-  DWARFSection LineDWOSection;
-  DWARFSection LocDWOSection;
-  StringRef StringDWOSection;
-  StringRef StringOffsetDWOSection;
-  StringRef RangeDWOSection;
-  StringRef AddrSection;
-  DWARFSection AppleNamesSection;
-  DWARFSection AppleTypesSection;
-  DWARFSection AppleNamespacesSection;
-  DWARFSection AppleObjCSection;
-
-  SmallVector<SmallString<32>, 4> UncompressedSections;
-
-public:
-  DWARFContextInMemory(const object::ObjectFile &Obj);
-  bool isLittleEndian() const override { return IsLittleEndian; }
-  uint8_t getAddressSize() const override { return AddressSize; }
-  const DWARFSection &getInfoSection() override { return InfoSection; }
-  const TypeSectionMap &getTypesSections() override { return TypesSections; }
-  StringRef getAbbrevSection() override { return AbbrevSection; }
-  const DWARFSection &getLocSection() override { return LocSection; }
-  StringRef getARangeSection() override { return ARangeSection; }
-  StringRef getDebugFrameSection() override { return DebugFrameSection; }
-  const DWARFSection &getLineSection() override { return LineSection; }
-  StringRef getStringSection() override { return StringSection; }
-  StringRef getRangeSection() override { return RangeSection; }
-  StringRef getPubNamesSection() override { return PubNamesSection; }
-  StringRef getPubTypesSection() override { return PubTypesSection; }
-  StringRef getGnuPubNamesSection() override { return GnuPubNamesSection; }
-  StringRef getGnuPubTypesSection() override { return GnuPubTypesSection; }
-  const DWARFSection& getAppleNamesSection() override { return AppleNamesSection; }
-  const DWARFSection& getAppleTypesSection() override { return AppleTypesSection; }
-  const DWARFSection& getAppleNamespacesSection() override { return AppleNamespacesSection; }
-  const DWARFSection& getAppleObjCSection() override { return AppleObjCSection; }
-
-  // Sections for DWARF5 split dwarf proposal.
-  const DWARFSection &getInfoDWOSection() override { return InfoDWOSection; }
-  const TypeSectionMap &getTypesDWOSections() override {
-    return TypesDWOSections;
-  }
-  StringRef getAbbrevDWOSection() override { return AbbrevDWOSection; }
-  const DWARFSection &getLineDWOSection() override { return LineDWOSection; }
-  const DWARFSection &getLocDWOSection() override { return LocDWOSection; }
-  StringRef getStringDWOSection() override { return StringDWOSection; }
-  StringRef getStringOffsetDWOSection() override {
-    return StringOffsetDWOSection;
-  }
-  StringRef getRangeDWOSection() override { return RangeDWOSection; }
-  StringRef getAddrSection() override {
-    return AddrSection;
-  }
-};
-
-}
-
-#endif
diff --git a/lib/DebugInfo/DWARFDebugAbbrev.cpp b/lib/DebugInfo/DWARFDebugAbbrev.cpp
deleted file mode 100644
index c1a088e..0000000
--- a/lib/DebugInfo/DWARFDebugAbbrev.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-//===-- DWARFDebugAbbrev.cpp ----------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFDebugAbbrev.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-DWARFAbbreviationDeclarationSet::DWARFAbbreviationDeclarationSet() {
-  clear();
-}
-
-void DWARFAbbreviationDeclarationSet::clear() {
-  Offset = 0;
-  FirstAbbrCode = 0;
-  Decls.clear();
-}
-
-bool DWARFAbbreviationDeclarationSet::extract(DataExtractor Data,
-                                              uint32_t *OffsetPtr) {
-  clear();
-  const uint32_t BeginOffset = *OffsetPtr;
-  Offset = BeginOffset;
-  DWARFAbbreviationDeclaration AbbrDecl;
-  uint32_t PrevAbbrCode = 0;
-  while (AbbrDecl.extract(Data, OffsetPtr)) {
-    if (FirstAbbrCode == 0) {
-      FirstAbbrCode = AbbrDecl.getCode();
-    } else {
-      if (PrevAbbrCode + 1 != AbbrDecl.getCode()) {
-        // Codes are not consecutive, can't do O(1) lookups.
-        FirstAbbrCode = UINT32_MAX;
-      }
-    }
-    PrevAbbrCode = AbbrDecl.getCode();
-    Decls.push_back(std::move(AbbrDecl));
-  }
-  return BeginOffset != *OffsetPtr;
-}
-
-void DWARFAbbreviationDeclarationSet::dump(raw_ostream &OS) const {
-  for (const auto &Decl : Decls)
-    Decl.dump(OS);
-}
-
-const DWARFAbbreviationDeclaration *
-DWARFAbbreviationDeclarationSet::getAbbreviationDeclaration(
-    uint32_t AbbrCode) const {
-  if (FirstAbbrCode == UINT32_MAX) {
-    for (const auto &Decl : Decls) {
-      if (Decl.getCode() == AbbrCode)
-        return &Decl;
-    }
-    return nullptr;
-  }
-  if (AbbrCode < FirstAbbrCode || AbbrCode >= FirstAbbrCode + Decls.size())
-    return nullptr;
-  return &Decls[AbbrCode - FirstAbbrCode];
-}
-
-DWARFDebugAbbrev::DWARFDebugAbbrev() {
-  clear();
-}
-
-void DWARFDebugAbbrev::clear() {
-  AbbrDeclSets.clear();
-  PrevAbbrOffsetPos = AbbrDeclSets.end();
-}
-
-void DWARFDebugAbbrev::extract(DataExtractor Data) {
-  clear();
-
-  uint32_t Offset = 0;
-  DWARFAbbreviationDeclarationSet AbbrDecls;
-  while (Data.isValidOffset(Offset)) {
-    uint32_t CUAbbrOffset = Offset;
-    if (!AbbrDecls.extract(Data, &Offset))
-      break;
-    AbbrDeclSets[CUAbbrOffset] = std::move(AbbrDecls);
-  }
-}
-
-void DWARFDebugAbbrev::dump(raw_ostream &OS) const {
-  if (AbbrDeclSets.empty()) {
-    OS << "< EMPTY >\n";
-    return;
-  }
-
-  for (const auto &I : AbbrDeclSets) {
-    OS << format("Abbrev table for offset: 0x%8.8" PRIx64 "\n", I.first);
-    I.second.dump(OS);
-  }
-}
-
-const DWARFAbbreviationDeclarationSet*
-DWARFDebugAbbrev::getAbbreviationDeclarationSet(uint64_t CUAbbrOffset) const {
-  const auto End = AbbrDeclSets.end();
-  if (PrevAbbrOffsetPos != End && PrevAbbrOffsetPos->first == CUAbbrOffset) {
-    return &(PrevAbbrOffsetPos->second);
-  }
-
-  const auto Pos = AbbrDeclSets.find(CUAbbrOffset);
-  if (Pos != End) {
-    PrevAbbrOffsetPos = Pos;
-    return &(Pos->second);
-  }
-
-  return nullptr;
-}
diff --git a/lib/DebugInfo/DWARFDebugAbbrev.h b/lib/DebugInfo/DWARFDebugAbbrev.h
deleted file mode 100644
index 4b3b814..0000000
--- a/lib/DebugInfo/DWARFDebugAbbrev.h
+++ /dev/null
@@ -1,63 +0,0 @@
-//===-- DWARFDebugAbbrev.h --------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGABBREV_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGABBREV_H
-
-#include "DWARFAbbreviationDeclaration.h"
-#include <list>
-#include <map>
-#include <vector>
-
-namespace llvm {
-
-class DWARFAbbreviationDeclarationSet {
-  uint32_t Offset;
-  /// Code of the first abbreviation, if all abbreviations in the set have
-  /// consecutive codes. UINT32_MAX otherwise.
-  uint32_t FirstAbbrCode;
-  std::vector<DWARFAbbreviationDeclaration> Decls;
-
-public:
-  DWARFAbbreviationDeclarationSet();
-
-  uint32_t getOffset() const { return Offset; }
-  void dump(raw_ostream &OS) const;
-  bool extract(DataExtractor Data, uint32_t *OffsetPtr);
-
-  const DWARFAbbreviationDeclaration *
-  getAbbreviationDeclaration(uint32_t AbbrCode) const;
-
-private:
-  void clear();
-};
-
-class DWARFDebugAbbrev {
-  typedef std::map<uint64_t, DWARFAbbreviationDeclarationSet>
-    DWARFAbbreviationDeclarationSetMap;
-
-  DWARFAbbreviationDeclarationSetMap AbbrDeclSets;
-  mutable DWARFAbbreviationDeclarationSetMap::const_iterator PrevAbbrOffsetPos;
-
-public:
-  DWARFDebugAbbrev();
-
-  const DWARFAbbreviationDeclarationSet *
-  getAbbreviationDeclarationSet(uint64_t CUAbbrOffset) const;
-
-  void dump(raw_ostream &OS) const;
-  void extract(DataExtractor Data);
-
-private:
-  void clear();
-};
-
-}
-
-#endif
diff --git a/lib/DebugInfo/DWARFDebugArangeSet.cpp b/lib/DebugInfo/DWARFDebugArangeSet.cpp
deleted file mode 100644
index c0a33ce..0000000
--- a/lib/DebugInfo/DWARFDebugArangeSet.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-//===-- DWARFDebugArangeSet.cpp -------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFDebugArangeSet.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-using namespace llvm;
-
-void DWARFDebugArangeSet::clear() {
-  Offset = -1U;
-  std::memset(&HeaderData, 0, sizeof(Header));
-  ArangeDescriptors.clear();
-}
-
-bool
-DWARFDebugArangeSet::extract(DataExtractor data, uint32_t *offset_ptr) {
-  if (data.isValidOffset(*offset_ptr)) {
-    ArangeDescriptors.clear();
-    Offset = *offset_ptr;
-
-    // 7.20 Address Range Table
-    //
-    // Each set of entries in the table of address ranges contained in
-    // the .debug_aranges section begins with a header consisting of: a
-    // 4-byte length containing the length of the set of entries for this
-    // compilation unit, not including the length field itself; a 2-byte
-    // version identifier containing the value 2 for DWARF Version 2; a
-    // 4-byte offset into the.debug_infosection; a 1-byte unsigned integer
-    // containing the size in bytes of an address (or the offset portion of
-    // an address for segmented addressing) on the target system; and a
-    // 1-byte unsigned integer containing the size in bytes of a segment
-    // descriptor on the target system. This header is followed by a series
-    // of tuples. Each tuple consists of an address and a length, each in
-    // the size appropriate for an address on the target architecture.
-    HeaderData.Length = data.getU32(offset_ptr);
-    HeaderData.Version = data.getU16(offset_ptr);
-    HeaderData.CuOffset = data.getU32(offset_ptr);
-    HeaderData.AddrSize = data.getU8(offset_ptr);
-    HeaderData.SegSize = data.getU8(offset_ptr);
-
-    // Perform basic validation of the header fields.
-    if (!data.isValidOffsetForDataOfSize(Offset, HeaderData.Length) ||
-        (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8)) {
-      clear();
-      return false;
-    }
-
-    // The first tuple following the header in each set begins at an offset
-    // that is a multiple of the size of a single tuple (that is, twice the
-    // size of an address). The header is padded, if necessary, to the
-    // appropriate boundary.
-    const uint32_t header_size = *offset_ptr - Offset;
-    const uint32_t tuple_size = HeaderData.AddrSize * 2;
-    uint32_t first_tuple_offset = 0;
-    while (first_tuple_offset < header_size)
-      first_tuple_offset += tuple_size;
-
-    *offset_ptr = Offset + first_tuple_offset;
-
-    Descriptor arangeDescriptor;
-
-    static_assert(sizeof(arangeDescriptor.Address) ==
-                      sizeof(arangeDescriptor.Length),
-                  "Different datatypes for addresses and sizes!");
-    assert(sizeof(arangeDescriptor.Address) >= HeaderData.AddrSize);
-
-    while (data.isValidOffset(*offset_ptr)) {
-      arangeDescriptor.Address = data.getUnsigned(offset_ptr, HeaderData.AddrSize);
-      arangeDescriptor.Length = data.getUnsigned(offset_ptr, HeaderData.AddrSize);
-
-      // Each set of tuples is terminated by a 0 for the address and 0
-      // for the length.
-      if (arangeDescriptor.Address || arangeDescriptor.Length)
-        ArangeDescriptors.push_back(arangeDescriptor);
-      else
-        break; // We are done if we get a zero address and length
-    }
-
-    return !ArangeDescriptors.empty();
-  }
-  return false;
-}
-
-void DWARFDebugArangeSet::dump(raw_ostream &OS) const {
-  OS << format("Address Range Header: length = 0x%8.8x, version = 0x%4.4x, ",
-               HeaderData.Length, HeaderData.Version)
-     << format("cu_offset = 0x%8.8x, addr_size = 0x%2.2x, seg_size = 0x%2.2x\n",
-               HeaderData.CuOffset, HeaderData.AddrSize, HeaderData.SegSize);
-
-  const uint32_t hex_width = HeaderData.AddrSize * 2;
-  for (const auto &Desc : ArangeDescriptors) {
-    OS << format("[0x%*.*" PRIx64 " -", hex_width, hex_width, Desc.Address)
-       << format(" 0x%*.*" PRIx64 ")\n",
-                 hex_width, hex_width, Desc.getEndAddress());
-  }
-}
diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp
deleted file mode 100644
index fe7e46d..0000000
--- a/lib/DebugInfo/DWARFDebugAranges.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-//===-- DWARFDebugAranges.cpp -----------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFDebugAranges.h"
-#include "DWARFCompileUnit.h"
-#include "DWARFContext.h"
-#include "DWARFDebugArangeSet.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <set>
-using namespace llvm;
-
-void DWARFDebugAranges::extract(DataExtractor DebugArangesData) {
-  if (!DebugArangesData.isValidOffset(0))
-    return;
-  uint32_t Offset = 0;
-  DWARFDebugArangeSet Set;
-
-  while (Set.extract(DebugArangesData, &Offset)) {
-    uint32_t CUOffset = Set.getCompileUnitDIEOffset();
-    for (const auto &Desc : Set.descriptors()) {
-      uint64_t LowPC = Desc.Address;
-      uint64_t HighPC = Desc.getEndAddress();
-      appendRange(CUOffset, LowPC, HighPC);
-    }
-    ParsedCUOffsets.insert(CUOffset);
-  }
-}
-
-void DWARFDebugAranges::generate(DWARFContext *CTX) {
-  clear();
-  if (!CTX)
-    return;
-
-  // Extract aranges from .debug_aranges section.
-  DataExtractor ArangesData(CTX->getARangeSection(), CTX->isLittleEndian(), 0);
-  extract(ArangesData);
-
-  // Generate aranges from DIEs: even if .debug_aranges section is present,
-  // it may describe only a small subset of compilation units, so we need to
-  // manually build aranges for the rest of them.
-  for (const auto &CU : CTX->compile_units()) {
-    uint32_t CUOffset = CU->getOffset();
-    if (ParsedCUOffsets.insert(CUOffset).second) {
-      DWARFAddressRangesVector CURanges;
-      CU->collectAddressRanges(CURanges);
-      for (const auto &R : CURanges) {
-        appendRange(CUOffset, R.first, R.second);
-      }
-    }
-  }
-
-  construct();
-}
-
-void DWARFDebugAranges::clear() {
-  Endpoints.clear();
-  Aranges.clear();
-  ParsedCUOffsets.clear();
-}
-
-void DWARFDebugAranges::appendRange(uint32_t CUOffset, uint64_t LowPC,
-                                    uint64_t HighPC) {
-  if (LowPC >= HighPC)
-    return;
-  Endpoints.emplace_back(LowPC, CUOffset, true);
-  Endpoints.emplace_back(HighPC, CUOffset, false);
-}
-
-void DWARFDebugAranges::construct() {
-  std::multiset<uint32_t> ValidCUs;  // Maintain the set of CUs describing
-                                     // a current address range.
-  std::sort(Endpoints.begin(), Endpoints.end());
-  uint64_t PrevAddress = -1ULL;
-  for (const auto &E : Endpoints) {
-    if (PrevAddress < E.Address && ValidCUs.size() > 0) {
-      // If the address range between two endpoints is described by some
-      // CU, first try to extend the last range in Aranges. If we can't
-      // do it, start a new range.
-      if (!Aranges.empty() && Aranges.back().HighPC() == PrevAddress &&
-          ValidCUs.find(Aranges.back().CUOffset) != ValidCUs.end()) {
-        Aranges.back().setHighPC(E.Address);
-      } else {
-        Aranges.emplace_back(PrevAddress, E.Address, *ValidCUs.begin());
-      }
-    }
-    // Update the set of valid CUs.
-    if (E.IsRangeStart) {
-      ValidCUs.insert(E.CUOffset);
-    } else {
-      auto CUPos = ValidCUs.find(E.CUOffset);
-      assert(CUPos != ValidCUs.end());
-      ValidCUs.erase(CUPos);
-    }
-    PrevAddress = E.Address;
-  }
-  assert(ValidCUs.empty());
-
-  // Endpoints are not needed now.
-  std::vector<RangeEndpoint> EmptyEndpoints;
-  EmptyEndpoints.swap(Endpoints);
-}
-
-uint32_t DWARFDebugAranges::findAddress(uint64_t Address) const {
-  if (!Aranges.empty()) {
-    Range range(Address);
-    RangeCollIterator begin = Aranges.begin();
-    RangeCollIterator end = Aranges.end();
-    RangeCollIterator pos =
-        std::lower_bound(begin, end, range);
-
-    if (pos != end && pos->containsAddress(Address)) {
-      return pos->CUOffset;
-    } else if (pos != begin) {
-      --pos;
-      if (pos->containsAddress(Address))
-        return pos->CUOffset;
-    }
-  }
-  return -1U;
-}
diff --git a/lib/DebugInfo/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARFDebugFrame.cpp
deleted file mode 100644
index dfa7e82..0000000
--- a/lib/DebugInfo/DWARFDebugFrame.cpp
+++ /dev/null
@@ -1,374 +0,0 @@
-//===-- DWARFDebugFrame.h - Parsing of .debug_frame -------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFDebugFrame.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <string>
-#include <vector>
-
-using namespace llvm;
-using namespace dwarf;
-
-
-/// \brief Abstract frame entry defining the common interface concrete
-/// entries implement.
-class llvm::FrameEntry {
-public:
-  enum FrameKind {FK_CIE, FK_FDE};
-  FrameEntry(FrameKind K, uint64_t Offset, uint64_t Length)
-      : Kind(K), Offset(Offset), Length(Length) {}
-
-  virtual ~FrameEntry() {
-  }
-
-  FrameKind getKind() const { return Kind; }
-  virtual uint64_t getOffset() const { return Offset; }
-
-  /// \brief Parse and store a sequence of CFI instructions from Data,
-  /// starting at *Offset and ending at EndOffset. If everything
-  /// goes well, *Offset should be equal to EndOffset when this method
-  /// returns. Otherwise, an error occurred.
-  virtual void parseInstructions(DataExtractor Data, uint32_t *Offset,
-                                 uint32_t EndOffset);
-
-  /// \brief Dump the entry header to the given output stream.
-  virtual void dumpHeader(raw_ostream &OS) const = 0;
-
-  /// \brief Dump the entry's instructions to the given output stream.
-  virtual void dumpInstructions(raw_ostream &OS) const;
-
-protected:
-  const FrameKind Kind;
-
-  /// \brief Offset of this entry in the section.
-  uint64_t Offset;
-
-  /// \brief Entry length as specified in DWARF.
-  uint64_t Length;
-
-  /// An entry may contain CFI instructions. An instruction consists of an
-  /// opcode and an optional sequence of operands.
-  typedef std::vector<uint64_t> Operands;
-  struct Instruction {
-    Instruction(uint8_t Opcode)
-      : Opcode(Opcode)
-    {}
-
-    uint8_t Opcode;
-    Operands Ops;
-  };
-
-  std::vector<Instruction> Instructions;
-
-  /// Convenience methods to add a new instruction with the given opcode and
-  /// operands to the Instructions vector.
-  void addInstruction(uint8_t Opcode) {
-    Instructions.push_back(Instruction(Opcode));
-  }
-
-  void addInstruction(uint8_t Opcode, uint64_t Operand1) {
-    Instructions.push_back(Instruction(Opcode));
-    Instructions.back().Ops.push_back(Operand1);
-  }
-
-  void addInstruction(uint8_t Opcode, uint64_t Operand1, uint64_t Operand2) {
-    Instructions.push_back(Instruction(Opcode));
-    Instructions.back().Ops.push_back(Operand1);
-    Instructions.back().Ops.push_back(Operand2);
-  }
-};
-
-
-// See DWARF standard v3, section 7.23
-const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0;
-const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f;
-
-void FrameEntry::parseInstructions(DataExtractor Data, uint32_t *Offset,
-                                   uint32_t EndOffset) {
-  while (*Offset < EndOffset) {
-    uint8_t Opcode = Data.getU8(Offset);
-    // Some instructions have a primary opcode encoded in the top bits.
-    uint8_t Primary = Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK;
-
-    if (Primary) {
-      // If it's a primary opcode, the first operand is encoded in the bottom
-      // bits of the opcode itself.
-      uint64_t Op1 = Opcode & DWARF_CFI_PRIMARY_OPERAND_MASK;
-      switch (Primary) {
-        default: llvm_unreachable("Impossible primary CFI opcode");
-        case DW_CFA_advance_loc:
-        case DW_CFA_restore:
-          addInstruction(Primary, Op1);
-          break;
-        case DW_CFA_offset:
-          addInstruction(Primary, Op1, Data.getULEB128(Offset));
-          break;
-      }
-    } else {
-      // Extended opcode - its value is Opcode itself.
-      switch (Opcode) {
-        default: llvm_unreachable("Invalid extended CFI opcode");
-        case DW_CFA_nop:
-        case DW_CFA_remember_state:
-        case DW_CFA_restore_state:
-        case DW_CFA_GNU_window_save:
-          // No operands
-          addInstruction(Opcode);
-          break;
-        case DW_CFA_set_loc:
-          // Operands: Address
-          addInstruction(Opcode, Data.getAddress(Offset));
-          break;
-        case DW_CFA_advance_loc1:
-          // Operands: 1-byte delta
-          addInstruction(Opcode, Data.getU8(Offset));
-          break;
-        case DW_CFA_advance_loc2:
-          // Operands: 2-byte delta
-          addInstruction(Opcode, Data.getU16(Offset));
-          break;
-        case DW_CFA_advance_loc4:
-          // Operands: 4-byte delta
-          addInstruction(Opcode, Data.getU32(Offset));
-          break;
-        case DW_CFA_restore_extended:
-        case DW_CFA_undefined:
-        case DW_CFA_same_value:
-        case DW_CFA_def_cfa_register:
-        case DW_CFA_def_cfa_offset:
-          // Operands: ULEB128
-          addInstruction(Opcode, Data.getULEB128(Offset));
-          break;
-        case DW_CFA_def_cfa_offset_sf:
-          // Operands: SLEB128
-          addInstruction(Opcode, Data.getSLEB128(Offset));
-          break;
-        case DW_CFA_offset_extended:
-        case DW_CFA_register:
-        case DW_CFA_def_cfa:
-        case DW_CFA_val_offset:
-          // Operands: ULEB128, ULEB128
-          addInstruction(Opcode, Data.getULEB128(Offset),
-                                 Data.getULEB128(Offset));
-          break;
-        case DW_CFA_offset_extended_sf:
-        case DW_CFA_def_cfa_sf:
-        case DW_CFA_val_offset_sf:
-          // Operands: ULEB128, SLEB128
-          addInstruction(Opcode, Data.getULEB128(Offset),
-                                 Data.getSLEB128(Offset));
-          break;
-        case DW_CFA_def_cfa_expression:
-        case DW_CFA_expression:
-        case DW_CFA_val_expression:
-          // TODO: implement this
-          report_fatal_error("Values with expressions not implemented yet!");
-      }
-    }
-  }
-}
-
-
-void FrameEntry::dumpInstructions(raw_ostream &OS) const {
-  // TODO: at the moment only instruction names are dumped. Expand this to
-  // dump operands as well.
-  for (const auto &Instr : Instructions) {
-    uint8_t Opcode = Instr.Opcode;
-    if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK)
-      Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK;
-    OS << "  " << CallFrameString(Opcode) << ":\n";
-  }
-}
-
-
-namespace {
-/// \brief DWARF Common Information Entry (CIE)
-class CIE : public FrameEntry {
-public:
-  // CIEs (and FDEs) are simply container classes, so the only sensible way to
-  // create them is by providing the full parsed contents in the constructor.
-  CIE(uint64_t Offset, uint64_t Length, uint8_t Version,
-      SmallString<8> Augmentation, uint64_t CodeAlignmentFactor,
-      int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister)
-      : FrameEntry(FK_CIE, Offset, Length), Version(Version),
-        Augmentation(std::move(Augmentation)),
-        CodeAlignmentFactor(CodeAlignmentFactor),
-        DataAlignmentFactor(DataAlignmentFactor),
-        ReturnAddressRegister(ReturnAddressRegister) {}
-
-  ~CIE() {
-  }
-
-  void dumpHeader(raw_ostream &OS) const override {
-    OS << format("%08x %08x %08x CIE",
-                 (uint32_t)Offset, (uint32_t)Length, DW_CIE_ID)
-       << "\n";
-    OS << format("  Version:               %d\n", Version);
-    OS << "  Augmentation:          \"" << Augmentation << "\"\n";
-    OS << format("  Code alignment factor: %u\n",
-                 (uint32_t)CodeAlignmentFactor);
-    OS << format("  Data alignment factor: %d\n",
-                 (int32_t)DataAlignmentFactor);
-    OS << format("  Return address column: %d\n",
-                 (int32_t)ReturnAddressRegister);
-    OS << "\n";
-  }
-
-  static bool classof(const FrameEntry *FE) {
-    return FE->getKind() == FK_CIE;
-  }
-
-private:
-  /// The following fields are defined in section 6.4.1 of the DWARF standard v3
-  uint8_t Version;
-  SmallString<8> Augmentation;
-  uint64_t CodeAlignmentFactor;
-  int64_t DataAlignmentFactor;
-  uint64_t ReturnAddressRegister;
-};
-
-
-/// \brief DWARF Frame Description Entry (FDE)
-class FDE : public FrameEntry {
-public:
-  // Each FDE has a CIE it's "linked to". Our FDE contains is constructed with
-  // an offset to the CIE (provided by parsing the FDE header). The CIE itself
-  // is obtained lazily once it's actually required.
-  FDE(uint64_t Offset, uint64_t Length, int64_t LinkedCIEOffset,
-      uint64_t InitialLocation, uint64_t AddressRange)
-      : FrameEntry(FK_FDE, Offset, Length), LinkedCIEOffset(LinkedCIEOffset),
-        InitialLocation(InitialLocation), AddressRange(AddressRange),
-        LinkedCIE(nullptr) {}
-
-  ~FDE() {
-  }
-
-  void dumpHeader(raw_ostream &OS) const override {
-    OS << format("%08x %08x %08x FDE ",
-                 (uint32_t)Offset, (uint32_t)Length, (int32_t)LinkedCIEOffset);
-    OS << format("cie=%08x pc=%08x...%08x\n",
-                 (int32_t)LinkedCIEOffset,
-                 (uint32_t)InitialLocation,
-                 (uint32_t)InitialLocation + (uint32_t)AddressRange);
-    if (LinkedCIE) {
-      OS << format("%p\n", LinkedCIE);
-    }
-  }
-
-  static bool classof(const FrameEntry *FE) {
-    return FE->getKind() == FK_FDE;
-  }
-
-private:
-  /// The following fields are defined in section 6.4.1 of the DWARF standard v3
-  uint64_t LinkedCIEOffset;
-  uint64_t InitialLocation;
-  uint64_t AddressRange;
-  CIE *LinkedCIE;
-};
-} // end anonymous namespace
-
-
-DWARFDebugFrame::DWARFDebugFrame() {
-}
-
-DWARFDebugFrame::~DWARFDebugFrame() {
-}
-
-static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data,
-                                              uint32_t Offset, int Length) {
-  errs() << "DUMP: ";
-  for (int i = 0; i < Length; ++i) {
-    uint8_t c = Data.getU8(&Offset);
-    errs().write_hex(c); errs() << " ";
-  }
-  errs() << "\n";
-}
-
-
-void DWARFDebugFrame::parse(DataExtractor Data) {
-  uint32_t Offset = 0;
-
-  while (Data.isValidOffset(Offset)) {
-    uint32_t StartOffset = Offset;
-
-    bool IsDWARF64 = false;
-    uint64_t Length = Data.getU32(&Offset);
-    uint64_t Id;
-
-    if (Length == UINT32_MAX) {
-      // DWARF-64 is distinguished by the first 32 bits of the initial length
-      // field being 0xffffffff. Then, the next 64 bits are the actual entry
-      // length.
-      IsDWARF64 = true;
-      Length = Data.getU64(&Offset);
-    }
-
-    // At this point, Offset points to the next field after Length.
-    // Length is the structure size excluding itself. Compute an offset one
-    // past the end of the structure (needed to know how many instructions to
-    // read).
-    // TODO: For honest DWARF64 support, DataExtractor will have to treat
-    //       offset_ptr as uint64_t*
-    uint32_t EndStructureOffset = Offset + static_cast<uint32_t>(Length);
-
-    // The Id field's size depends on the DWARF format
-    Id = Data.getUnsigned(&Offset, IsDWARF64 ? 8 : 4);
-    bool IsCIE = ((IsDWARF64 && Id == DW64_CIE_ID) || Id == DW_CIE_ID);
-
-    if (IsCIE) {
-      // Note: this is specifically DWARFv3 CIE header structure. It was
-      // changed in DWARFv4. We currently don't support reading DWARFv4
-      // here because LLVM itself does not emit it (and LLDB doesn't
-      // support it either).
-      uint8_t Version = Data.getU8(&Offset);
-      const char *Augmentation = Data.getCStr(&Offset);
-      uint64_t CodeAlignmentFactor = Data.getULEB128(&Offset);
-      int64_t DataAlignmentFactor = Data.getSLEB128(&Offset);
-      uint64_t ReturnAddressRegister = Data.getULEB128(&Offset);
-
-      Entries.emplace_back(new CIE(StartOffset, Length, Version,
-                                   StringRef(Augmentation), CodeAlignmentFactor,
-                                   DataAlignmentFactor, ReturnAddressRegister));
-    } else {
-      // FDE
-      uint64_t CIEPointer = Id;
-      uint64_t InitialLocation = Data.getAddress(&Offset);
-      uint64_t AddressRange = Data.getAddress(&Offset);
-
-      Entries.emplace_back(new FDE(StartOffset, Length, CIEPointer,
-                                   InitialLocation, AddressRange));
-    }
-
-    Entries.back()->parseInstructions(Data, &Offset, EndStructureOffset);
-
-    if (Offset != EndStructureOffset) {
-      std::string Str;
-      raw_string_ostream OS(Str);
-      OS << format("Parsing entry instructions at %lx failed", StartOffset);
-      report_fatal_error(Str);
-    }
-  }
-}
-
-
-void DWARFDebugFrame::dump(raw_ostream &OS) const {
-  OS << "\n";
-  for (const auto &Entry : Entries) {
-    Entry->dumpHeader(OS);
-    Entry->dumpInstructions(OS);
-    OS << "\n";
-  }
-}
-
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
deleted file mode 100644
index 583e700..0000000
--- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ /dev/null
@@ -1,451 +0,0 @@
-//===-- DWARFDebugInfoEntry.cpp -------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFDebugInfoEntry.h"
-#include "DWARFCompileUnit.h"
-#include "DWARFContext.h"
-#include "DWARFDebugAbbrev.h"
-#include "llvm/DebugInfo/DWARFFormValue.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-using namespace dwarf;
-
-// Small helper to extract a DIE pointed by a reference
-// attribute. It looks up the Unit containing the DIE and calls
-// DIE.extractFast with the right unit. Returns new unit on success,
-// nullptr otherwise.
-static const DWARFUnit *findUnitAndExtractFast(DWARFDebugInfoEntryMinimal &DIE,
-                                               const DWARFUnit *Unit,
-                                               uint32_t *Offset) {
-  Unit = Unit->getUnitSection().getUnitForOffset(*Offset);
-  return (Unit && DIE.extractFast(Unit, Offset)) ? Unit : nullptr;
-}
-
-void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS, DWARFUnit *u,
-                                      unsigned recurseDepth,
-                                      unsigned indent) const {
-  DataExtractor debug_info_data = u->getDebugInfoExtractor();
-  uint32_t offset = Offset;
-
-  if (debug_info_data.isValidOffset(offset)) {
-    uint32_t abbrCode = debug_info_data.getULEB128(&offset);
-
-    OS << format("\n0x%8.8x: ", Offset);
-    if (abbrCode) {
-      if (AbbrevDecl) {
-        const char *tagString = TagString(getTag());
-        if (tagString)
-          OS.indent(indent) << tagString;
-        else
-          OS.indent(indent) << format("DW_TAG_Unknown_%x", getTag());
-        OS << format(" [%u] %c\n", abbrCode,
-                     AbbrevDecl->hasChildren() ? '*' : ' ');
-
-        // Dump all data in the DIE for the attributes.
-        for (const auto &AttrSpec : AbbrevDecl->attributes()) {
-          dumpAttribute(OS, u, &offset, AttrSpec.Attr, AttrSpec.Form, indent);
-        }
-
-        const DWARFDebugInfoEntryMinimal *child = getFirstChild();
-        if (recurseDepth > 0 && child) {
-          while (child) {
-            child->dump(OS, u, recurseDepth-1, indent+2);
-            child = child->getSibling();
-          }
-        }
-      } else {
-        OS << "Abbreviation code not found in 'debug_abbrev' class for code: "
-           << abbrCode << '\n';
-      }
-    } else {
-      OS.indent(indent) << "NULL\n";
-    }
-  }
-}
-
-static void dumpApplePropertyAttribute(raw_ostream &OS, uint64_t Val) {
-  OS << " (";
-  do {
-    uint64_t Bit = 1ULL << countTrailingZeros(Val);
-    if (const char *PropName = ApplePropertyString(Bit))
-      OS << PropName;
-    else
-      OS << format("DW_APPLE_PROPERTY_0x%" PRIx64, Bit);
-    if (!(Val ^= Bit))
-      break;
-    OS << ", ";
-  } while (true);
-  OS << ")";
-}
-
-static void dumpRanges(raw_ostream &OS, const DWARFAddressRangesVector& Ranges,
-                       unsigned AddressSize, unsigned Indent) {
-  if (Ranges.empty())
-    return;
-
-  for (const auto &Range: Ranges) {
-    OS << '\n';
-    OS.indent(Indent);
-    OS << format("[0x%0*" PRIx64 " - 0x%0*" PRIx64 ")",
-                 AddressSize*2, Range.first,
-                 AddressSize*2, Range.second);
-  }
-}
-
-void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS,
-                                               DWARFUnit *u,
-                                               uint32_t *offset_ptr,
-                                               uint16_t attr, uint16_t form,
-                                               unsigned indent) const {
-  const char BaseIndent[] = "            ";
-  OS << BaseIndent;
-  OS.indent(indent+2);
-  const char *attrString = AttributeString(attr);
-  if (attrString)
-    OS << attrString;
-  else
-    OS << format("DW_AT_Unknown_%x", attr);
-  const char *formString = FormEncodingString(form);
-  if (formString)
-    OS << " [" << formString << ']';
-  else
-    OS << format(" [DW_FORM_Unknown_%x]", form);
-
-  DWARFFormValue formValue(form);
-
-  if (!formValue.extractValue(u->getDebugInfoExtractor(), offset_ptr, u))
-    return;
-
-  OS << "\t(";
-  
-  const char *Name = nullptr;
-  std::string File;
-  if (attr == DW_AT_decl_file || attr == DW_AT_call_file) {
-    if (const auto *LT = u->getContext().getLineTableForUnit(u))
-      if (LT->getFileNameByIndex(
-             formValue.getAsUnsignedConstant().getValue(),
-             u->getCompilationDir(),
-             DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, File)) {
-        File = '"' + File + '"';
-        Name = File.c_str();
-      }
-  } else if (Optional<uint64_t> Val = formValue.getAsUnsignedConstant())
-    Name = AttributeValueString(attr, *Val);
-
-  if (Name) {
-    OS << Name;
-  } else if (attr == DW_AT_decl_line || attr == DW_AT_call_line) {
-    OS << *formValue.getAsUnsignedConstant();
-  } else {
-    formValue.dump(OS, u);
-  }
-
-  // We have dumped the attribute raw value. For some attributes
-  // having both the raw value and the pretty-printed value is
-  // interesting. These attributes are handled below.
-  if ((attr == DW_AT_specification || attr == DW_AT_abstract_origin) &&
-      // The signature references aren't handled.
-      formValue.getForm() != DW_FORM_ref_sig8) {
-    uint32_t Ref = formValue.getAsReference(u).getValue();
-    DWARFDebugInfoEntryMinimal DIE;
-    if (const DWARFUnit *RefU = findUnitAndExtractFast(DIE, u, &Ref))
-      if (const char *Ref = DIE.getName(RefU, DINameKind::LinkageName))
-        OS << " \"" << Ref << '\"';
-  } else if (attr == DW_AT_APPLE_property_attribute) {
-    if (Optional<uint64_t> OptVal = formValue.getAsUnsignedConstant())
-      dumpApplePropertyAttribute(OS, *OptVal);
-  } else if (attr == DW_AT_ranges) {
-    dumpRanges(OS, getAddressRanges(u), u->getAddressByteSize(),
-               sizeof(BaseIndent)+indent+4);
-  }
-
-  OS << ")\n";
-}
-
-bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFUnit *U,
-                                             uint32_t *OffsetPtr) {
-  Offset = *OffsetPtr;
-  DataExtractor DebugInfoData = U->getDebugInfoExtractor();
-  uint32_t UEndOffset = U->getNextUnitOffset();
-  if (Offset >= UEndOffset || !DebugInfoData.isValidOffset(Offset))
-    return false;
-  uint64_t AbbrCode = DebugInfoData.getULEB128(OffsetPtr);
-  if (0 == AbbrCode) {
-    // NULL debug tag entry.
-    AbbrevDecl = nullptr;
-    return true;
-  }
-  AbbrevDecl = U->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
-  if (nullptr == AbbrevDecl) {
-    // Restore the original offset.
-    *OffsetPtr = Offset;
-    return false;
-  }
-  ArrayRef<uint8_t> FixedFormSizes = DWARFFormValue::getFixedFormSizes(
-      U->getAddressByteSize(), U->getVersion());
-  assert(FixedFormSizes.size() > 0);
-
-  // Skip all data in the .debug_info for the attributes
-  for (const auto &AttrSpec : AbbrevDecl->attributes()) {
-    uint16_t Form = AttrSpec.Form;
-
-    uint8_t FixedFormSize =
-        (Form < FixedFormSizes.size()) ? FixedFormSizes[Form] : 0;
-    if (FixedFormSize)
-      *OffsetPtr += FixedFormSize;
-    else if (!DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr, U)) {
-      // Restore the original offset.
-      *OffsetPtr = Offset;
-      return false;
-    }
-  }
-  return true;
-}
-
-bool DWARFDebugInfoEntryMinimal::isSubprogramDIE() const {
-  return getTag() == DW_TAG_subprogram;
-}
-
-bool DWARFDebugInfoEntryMinimal::isSubroutineDIE() const {
-  uint32_t Tag = getTag();
-  return Tag == DW_TAG_subprogram ||
-         Tag == DW_TAG_inlined_subroutine;
-}
-
-bool DWARFDebugInfoEntryMinimal::getAttributeValue(
-    const DWARFUnit *U, const uint16_t Attr, DWARFFormValue &FormValue) const {
-  if (!AbbrevDecl)
-    return false;
-
-  uint32_t AttrIdx = AbbrevDecl->findAttributeIndex(Attr);
-  if (AttrIdx == -1U)
-    return false;
-
-  DataExtractor DebugInfoData = U->getDebugInfoExtractor();
-  uint32_t DebugInfoOffset = getOffset();
-
-  // Skip the abbreviation code so we are at the data for the attributes
-  DebugInfoData.getULEB128(&DebugInfoOffset);
-
-  // Skip preceding attribute values.
-  for (uint32_t i = 0; i < AttrIdx; ++i) {
-    DWARFFormValue::skipValue(AbbrevDecl->getFormByIndex(i),
-                              DebugInfoData, &DebugInfoOffset, U);
-  }
-
-  FormValue = DWARFFormValue(AbbrevDecl->getFormByIndex(AttrIdx));
-  return FormValue.extractValue(DebugInfoData, &DebugInfoOffset, U);
-}
-
-const char *DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
-    const DWARFUnit *U, const uint16_t Attr, const char *FailValue) const {
-  DWARFFormValue FormValue;
-  if (!getAttributeValue(U, Attr, FormValue))
-    return FailValue;
-  Optional<const char *> Result = FormValue.getAsCString(U);
-  return Result.hasValue() ? Result.getValue() : FailValue;
-}
-
-uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsAddress(
-    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
-  DWARFFormValue FormValue;
-  if (!getAttributeValue(U, Attr, FormValue))
-    return FailValue;
-  Optional<uint64_t> Result = FormValue.getAsAddress(U);
-  return Result.hasValue() ? Result.getValue() : FailValue;
-}
-
-uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsignedConstant(
-    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
-  DWARFFormValue FormValue;
-  if (!getAttributeValue(U, Attr, FormValue))
-    return FailValue;
-  Optional<uint64_t> Result = FormValue.getAsUnsignedConstant();
-  return Result.hasValue() ? Result.getValue() : FailValue;
-}
-
-uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsReference(
-    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
-  DWARFFormValue FormValue;
-  if (!getAttributeValue(U, Attr, FormValue))
-    return FailValue;
-  Optional<uint64_t> Result = FormValue.getAsReference(U);
-  return Result.hasValue() ? Result.getValue() : FailValue;
-}
-
-uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsSectionOffset(
-    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
-  DWARFFormValue FormValue;
-  if (!getAttributeValue(U, Attr, FormValue))
-    return FailValue;
-  Optional<uint64_t> Result = FormValue.getAsSectionOffset();
-  return Result.hasValue() ? Result.getValue() : FailValue;
-}
-
-uint64_t
-DWARFDebugInfoEntryMinimal::getRangesBaseAttribute(const DWARFUnit *U,
-                                                   uint64_t FailValue) const {
-  uint64_t Result =
-      getAttributeValueAsSectionOffset(U, DW_AT_ranges_base, -1ULL);
-  if (Result != -1ULL)
-    return Result;
-  return getAttributeValueAsSectionOffset(U, DW_AT_GNU_ranges_base, FailValue);
-}
-
-bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFUnit *U,
-                                                 uint64_t &LowPC,
-                                                 uint64_t &HighPC) const {
-  LowPC = getAttributeValueAsAddress(U, DW_AT_low_pc, -1ULL);
-  if (LowPC == -1ULL)
-    return false;
-  HighPC = getAttributeValueAsAddress(U, DW_AT_high_pc, -1ULL);
-  if (HighPC == -1ULL) {
-    // Since DWARF4, DW_AT_high_pc may also be of class constant, in which case
-    // it represents function size.
-    HighPC = getAttributeValueAsUnsignedConstant(U, DW_AT_high_pc, -1ULL);
-    if (HighPC != -1ULL)
-      HighPC += LowPC;
-  }
-  return (HighPC != -1ULL);
-}
-
-DWARFAddressRangesVector
-DWARFDebugInfoEntryMinimal::getAddressRanges(const DWARFUnit *U) const {
-  if (isNULL())
-    return DWARFAddressRangesVector();
-  // Single range specified by low/high PC.
-  uint64_t LowPC, HighPC;
-  if (getLowAndHighPC(U, LowPC, HighPC)) {
-    return DWARFAddressRangesVector(1, std::make_pair(LowPC, HighPC));
-  }
-  // Multiple ranges from .debug_ranges section.
-  uint32_t RangesOffset =
-      getAttributeValueAsSectionOffset(U, DW_AT_ranges, -1U);
-  if (RangesOffset != -1U) {
-    DWARFDebugRangeList RangeList;
-    if (U->extractRangeList(RangesOffset, RangeList))
-      return RangeList.getAbsoluteRanges(U->getBaseAddress());
-  }
-  return DWARFAddressRangesVector();
-}
-
-void DWARFDebugInfoEntryMinimal::collectChildrenAddressRanges(
-    const DWARFUnit *U, DWARFAddressRangesVector& Ranges) const {
-  if (isNULL())
-    return;
-  if (isSubprogramDIE()) {
-    const auto &DIERanges = getAddressRanges(U);
-    Ranges.insert(Ranges.end(), DIERanges.begin(), DIERanges.end());
-  }
-
-  const DWARFDebugInfoEntryMinimal *Child = getFirstChild();
-  while (Child) {
-    Child->collectChildrenAddressRanges(U, Ranges);
-    Child = Child->getSibling();
-  }
-}
-
-bool DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
-    const DWARFUnit *U, const uint64_t Address) const {
-  for (const auto& R : getAddressRanges(U)) {
-    if (R.first <= Address && Address < R.second)
-      return true;
-  }
-  return false;
-}
-
-const char *
-DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFUnit *U,
-                                              DINameKind Kind) const {
-  if (!isSubroutineDIE())
-    return nullptr;
-  return getName(U, Kind);
-}
-
-const char *
-DWARFDebugInfoEntryMinimal::getName(const DWARFUnit *U,
-                                    DINameKind Kind) const {
-  if (Kind == DINameKind::None)
-    return nullptr;
-  // Try to get mangled name only if it was asked for.
-  if (Kind == DINameKind::LinkageName) {
-    if (const char *name =
-            getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, nullptr))
-      return name;
-    if (const char *name =
-            getAttributeValueAsString(U, DW_AT_linkage_name, nullptr))
-      return name;
-  }
-  if (const char *name = getAttributeValueAsString(U, DW_AT_name, nullptr))
-    return name;
-  // Try to get name from specification DIE.
-  uint32_t spec_ref =
-      getAttributeValueAsReference(U, DW_AT_specification, -1U);
-  if (spec_ref != -1U) {
-    DWARFDebugInfoEntryMinimal spec_die;
-    if (const DWARFUnit *RefU = findUnitAndExtractFast(spec_die, U, &spec_ref)) {
-      if (const char *name = spec_die.getName(RefU, Kind))
-        return name;
-    }
-  }
-  // Try to get name from abstract origin DIE.
-  uint32_t abs_origin_ref =
-      getAttributeValueAsReference(U, DW_AT_abstract_origin, -1U);
-  if (abs_origin_ref != -1U) {
-    DWARFDebugInfoEntryMinimal abs_origin_die;
-    if (const DWARFUnit *RefU = findUnitAndExtractFast(abs_origin_die, U,
-                                                       &abs_origin_ref)) {
-      if (const char *name = abs_origin_die.getName(RefU, Kind))
-        return name;
-    }
-  }
-  return nullptr;
-}
-
-void DWARFDebugInfoEntryMinimal::getCallerFrame(const DWARFUnit *U,
-                                                uint32_t &CallFile,
-                                                uint32_t &CallLine,
-                                                uint32_t &CallColumn) const {
-  CallFile = getAttributeValueAsUnsignedConstant(U, DW_AT_call_file, 0);
-  CallLine = getAttributeValueAsUnsignedConstant(U, DW_AT_call_line, 0);
-  CallColumn = getAttributeValueAsUnsignedConstant(U, DW_AT_call_column, 0);
-}
-
-DWARFDebugInfoEntryInlinedChain
-DWARFDebugInfoEntryMinimal::getInlinedChainForAddress(
-    const DWARFUnit *U, const uint64_t Address) const {
-  DWARFDebugInfoEntryInlinedChain InlinedChain;
-  InlinedChain.U = U;
-  if (isNULL())
-    return InlinedChain;
-  for (const DWARFDebugInfoEntryMinimal *DIE = this; DIE; ) {
-    // Append current DIE to inlined chain only if it has correct tag
-    // (e.g. it is not a lexical block).
-    if (DIE->isSubroutineDIE()) {
-      InlinedChain.DIEs.push_back(*DIE);
-    }
-    // Try to get child which also contains provided address.
-    const DWARFDebugInfoEntryMinimal *Child = DIE->getFirstChild();
-    while (Child) {
-      if (Child->addressRangeContainsAddress(U, Address)) {
-        // Assume there is only one such child.
-        break;
-      }
-      Child = Child->getSibling();
-    }
-    DIE = Child;
-  }
-  // Reverse the obtained chain to make the root of inlined chain last.
-  std::reverse(InlinedChain.DIEs.begin(), InlinedChain.DIEs.end());
-  return InlinedChain;
-}
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h
deleted file mode 100644
index 7e7efb9..0000000
--- a/lib/DebugInfo/DWARFDebugInfoEntry.h
+++ /dev/null
@@ -1,160 +0,0 @@
-//===-- DWARFDebugInfoEntry.h -----------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGINFOENTRY_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGINFOENTRY_H
-
-#include "DWARFAbbreviationDeclaration.h"
-#include "DWARFDebugRangeList.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/DIContext.h"
-#include "llvm/Support/DataTypes.h"
-
-namespace llvm {
-
-class DWARFDebugAranges;
-class DWARFCompileUnit;
-class DWARFUnit;
-class DWARFContext;
-class DWARFFormValue;
-struct DWARFDebugInfoEntryInlinedChain;
-
-/// DWARFDebugInfoEntryMinimal - A DIE with only the minimum required data.
-class DWARFDebugInfoEntryMinimal {
-  /// Offset within the .debug_info of the start of this entry.
-  uint32_t Offset;
-
-  /// How many to add to "this" to get the sibling.
-  uint32_t SiblingIdx;
-
-  const DWARFAbbreviationDeclaration *AbbrevDecl;
-public:
-  DWARFDebugInfoEntryMinimal()
-    : Offset(0), SiblingIdx(0), AbbrevDecl(nullptr) {}
-
-  void dump(raw_ostream &OS, DWARFUnit *u, unsigned recurseDepth,
-            unsigned indent = 0) const;
-  void dumpAttribute(raw_ostream &OS, DWARFUnit *u, uint32_t *offset_ptr,
-                     uint16_t attr, uint16_t form, unsigned indent = 0) const;
-
-  /// Extracts a debug info entry, which is a child of a given unit,
-  /// starting at a given offset. If DIE can't be extracted, returns false and
-  /// doesn't change OffsetPtr.
-  bool extractFast(const DWARFUnit *U, uint32_t *OffsetPtr);
-
-  uint32_t getTag() const { return AbbrevDecl ? AbbrevDecl->getTag() : 0; }
-  bool isNULL() const { return AbbrevDecl == nullptr; }
-
-  /// Returns true if DIE represents a subprogram (not inlined).
-  bool isSubprogramDIE() const;
-  /// Returns true if DIE represents a subprogram or an inlined
-  /// subroutine.
-  bool isSubroutineDIE() const;
-
-  uint32_t getOffset() const { return Offset; }
-  bool hasChildren() const { return !isNULL() && AbbrevDecl->hasChildren(); }
-
-  // We know we are kept in a vector of contiguous entries, so we know
-  // our sibling will be some index after "this".
-  const DWARFDebugInfoEntryMinimal *getSibling() const {
-    return SiblingIdx > 0 ? this + SiblingIdx : nullptr;
-  }
-
-  // We know we are kept in a vector of contiguous entries, so we know
-  // we don't need to store our child pointer, if we have a child it will
-  // be the next entry in the list...
-  const DWARFDebugInfoEntryMinimal *getFirstChild() const {
-    return hasChildren() ? this + 1 : nullptr;
-  }
-
-  void setSibling(const DWARFDebugInfoEntryMinimal *Sibling) {
-    if (Sibling) {
-      // We know we are kept in a vector of contiguous entries, so we know
-      // our sibling will be some index after "this".
-      SiblingIdx = Sibling - this;
-    } else
-      SiblingIdx = 0;
-  }
-
-  const DWARFAbbreviationDeclaration *getAbbreviationDeclarationPtr() const {
-    return AbbrevDecl;
-  }
-
-  bool getAttributeValue(const DWARFUnit *U, const uint16_t Attr,
-                         DWARFFormValue &FormValue) const;
-
-  const char *getAttributeValueAsString(const DWARFUnit *U, const uint16_t Attr,
-                                        const char *FailValue) const;
-
-  uint64_t getAttributeValueAsAddress(const DWARFUnit *U, const uint16_t Attr,
-                                      uint64_t FailValue) const;
-
-  uint64_t getAttributeValueAsUnsignedConstant(const DWARFUnit *U,
-                                               const uint16_t Attr,
-                                               uint64_t FailValue) const;
-
-  uint64_t getAttributeValueAsReference(const DWARFUnit *U, const uint16_t Attr,
-                                        uint64_t FailValue) const;
-
-  uint64_t getAttributeValueAsSectionOffset(const DWARFUnit *U,
-                                            const uint16_t Attr,
-                                            uint64_t FailValue) const;
-
-  uint64_t getRangesBaseAttribute(const DWARFUnit *U, uint64_t FailValue) const;
-
-  /// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU.
-  /// Returns true if both attributes are present.
-  bool getLowAndHighPC(const DWARFUnit *U, uint64_t &LowPC,
-                       uint64_t &HighPC) const;
-
-  DWARFAddressRangesVector getAddressRanges(const DWARFUnit *U) const;
-
-  void collectChildrenAddressRanges(const DWARFUnit *U,
-                                    DWARFAddressRangesVector &Ranges) const;
-
-  bool addressRangeContainsAddress(const DWARFUnit *U,
-                                   const uint64_t Address) const;
-
-  /// If a DIE represents a subprogram (or inlined subroutine),
-  /// returns its mangled name (or short name, if mangled is missing).
-  /// This name may be fetched from specification or abstract origin
-  /// for this subprogram. Returns null if no name is found.
-  const char *getSubroutineName(const DWARFUnit *U, DINameKind Kind) const;
-
-  /// Return the DIE name resolving DW_AT_sepcification or
-  /// DW_AT_abstract_origin references if necessary.
-  /// Returns null if no name is found.
-  const char *getName(const DWARFUnit *U, DINameKind Kind) const;
-
-  /// Retrieves values of DW_AT_call_file, DW_AT_call_line and
-  /// DW_AT_call_column from DIE (or zeroes if they are missing).
-  void getCallerFrame(const DWARFUnit *U, uint32_t &CallFile,
-                      uint32_t &CallLine, uint32_t &CallColumn) const;
-
-  /// Get inlined chain for a given address, rooted at the current DIE.
-  /// Returns empty chain if address is not contained in address range
-  /// of current DIE.
-  DWARFDebugInfoEntryInlinedChain
-  getInlinedChainForAddress(const DWARFUnit *U, const uint64_t Address) const;
-};
-
-/// DWARFDebugInfoEntryInlinedChain - represents a chain of inlined_subroutine
-/// DIEs, (possibly ending with subprogram DIE), all of which are contained
-/// in some concrete inlined instance tree. Address range for each DIE
-/// (except the last DIE) in this chain is contained in address
-/// range for next DIE in the chain.
-struct DWARFDebugInfoEntryInlinedChain {
-  DWARFDebugInfoEntryInlinedChain() : U(nullptr) {}
-  SmallVector<DWARFDebugInfoEntryMinimal, 4> DIEs;
-  const DWARFUnit *U;
-};
-
-}
-
-#endif
diff --git a/lib/DebugInfo/DWARFDebugLine.cpp b/lib/DebugInfo/DWARFDebugLine.cpp
deleted file mode 100644
index a6ee461..0000000
--- a/lib/DebugInfo/DWARFDebugLine.cpp
+++ /dev/null
@@ -1,698 +0,0 @@
-//===-- DWARFDebugLine.cpp ------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFDebugLine.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-using namespace llvm;
-using namespace dwarf;
-typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
-
-DWARFDebugLine::Prologue::Prologue() {
-  clear();
-}
-
-void DWARFDebugLine::Prologue::clear() {
-  TotalLength = Version = PrologueLength = 0;
-  MinInstLength = MaxOpsPerInst = DefaultIsStmt = LineBase = LineRange = 0;
-  OpcodeBase = 0;
-  StandardOpcodeLengths.clear();
-  IncludeDirectories.clear();
-  FileNames.clear();
-}
-
-void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const {
-  OS << "Line table prologue:\n"
-     << format("    total_length: 0x%8.8x\n", TotalLength)
-     << format("         version: %u\n", Version)
-     << format(" prologue_length: 0x%8.8x\n", PrologueLength)
-     << format(" min_inst_length: %u\n", MinInstLength)
-     << format(Version >= 4 ? "max_ops_per_inst: %u\n" : "", MaxOpsPerInst)
-     << format(" default_is_stmt: %u\n", DefaultIsStmt)
-     << format("       line_base: %i\n", LineBase)
-     << format("      line_range: %u\n", LineRange)
-     << format("     opcode_base: %u\n", OpcodeBase);
-
-  for (uint32_t i = 0; i < StandardOpcodeLengths.size(); ++i)
-    OS << format("standard_opcode_lengths[%s] = %u\n", LNStandardString(i+1),
-                 StandardOpcodeLengths[i]);
-
-  if (!IncludeDirectories.empty())
-    for (uint32_t i = 0; i < IncludeDirectories.size(); ++i)
-      OS << format("include_directories[%3u] = '", i+1)
-         << IncludeDirectories[i] << "'\n";
-
-  if (!FileNames.empty()) {
-    OS << "                Dir  Mod Time   File Len   File Name\n"
-       << "                ---- ---------- ---------- -----------"
-          "----------------\n";
-    for (uint32_t i = 0; i < FileNames.size(); ++i) {
-      const FileNameEntry& fileEntry = FileNames[i];
-      OS << format("file_names[%3u] %4" PRIu64 " ", i+1, fileEntry.DirIdx)
-         << format("0x%8.8" PRIx64 " 0x%8.8" PRIx64 " ",
-                   fileEntry.ModTime, fileEntry.Length)
-         << fileEntry.Name << '\n';
-    }
-  }
-}
-
-bool DWARFDebugLine::Prologue::parse(DataExtractor debug_line_data,
-                                     uint32_t *offset_ptr) {
-  const uint32_t prologue_offset = *offset_ptr;
-
-  clear();
-  TotalLength = debug_line_data.getU32(offset_ptr);
-  Version = debug_line_data.getU16(offset_ptr);
-  if (Version < 2)
-    return false;
-
-  PrologueLength = debug_line_data.getU32(offset_ptr);
-  const uint32_t end_prologue_offset = PrologueLength + *offset_ptr;
-  MinInstLength = debug_line_data.getU8(offset_ptr);
-  if (Version >= 4)
-    MaxOpsPerInst = debug_line_data.getU8(offset_ptr);
-  DefaultIsStmt = debug_line_data.getU8(offset_ptr);
-  LineBase = debug_line_data.getU8(offset_ptr);
-  LineRange = debug_line_data.getU8(offset_ptr);
-  OpcodeBase = debug_line_data.getU8(offset_ptr);
-
-  StandardOpcodeLengths.reserve(OpcodeBase - 1);
-  for (uint32_t i = 1; i < OpcodeBase; ++i) {
-    uint8_t op_len = debug_line_data.getU8(offset_ptr);
-    StandardOpcodeLengths.push_back(op_len);
-  }
-
-  while (*offset_ptr < end_prologue_offset) {
-    const char *s = debug_line_data.getCStr(offset_ptr);
-    if (s && s[0])
-      IncludeDirectories.push_back(s);
-    else
-      break;
-  }
-
-  while (*offset_ptr < end_prologue_offset) {
-    const char *name = debug_line_data.getCStr(offset_ptr);
-    if (name && name[0]) {
-      FileNameEntry fileEntry;
-      fileEntry.Name = name;
-      fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr);
-      fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr);
-      fileEntry.Length = debug_line_data.getULEB128(offset_ptr);
-      FileNames.push_back(fileEntry);
-    } else {
-      break;
-    }
-  }
-
-  if (*offset_ptr != end_prologue_offset) {
-    fprintf(stderr, "warning: parsing line table prologue at 0x%8.8x should"
-                    " have ended at 0x%8.8x but it ended at 0x%8.8x\n",
-            prologue_offset, end_prologue_offset, *offset_ptr);
-    return false;
-  }
-  return true;
-}
-
-DWARFDebugLine::Row::Row(bool default_is_stmt) {
-  reset(default_is_stmt);
-}
-
-void DWARFDebugLine::Row::postAppend() {
-  BasicBlock = false;
-  PrologueEnd = false;
-  EpilogueBegin = false;
-}
-
-void DWARFDebugLine::Row::reset(bool default_is_stmt) {
-  Address = 0;
-  Line = 1;
-  Column = 0;
-  File = 1;
-  Isa = 0;
-  Discriminator = 0;
-  IsStmt = default_is_stmt;
-  BasicBlock = false;
-  EndSequence = false;
-  PrologueEnd = false;
-  EpilogueBegin = false;
-}
-
-void DWARFDebugLine::Row::dump(raw_ostream &OS) const {
-  OS << format("0x%16.16" PRIx64 " %6u %6u", Address, Line, Column)
-     << format(" %6u %3u %13u ", File, Isa, Discriminator)
-     << (IsStmt ? " is_stmt" : "")
-     << (BasicBlock ? " basic_block" : "")
-     << (PrologueEnd ? " prologue_end" : "")
-     << (EpilogueBegin ? " epilogue_begin" : "")
-     << (EndSequence ? " end_sequence" : "")
-     << '\n';
-}
-
-DWARFDebugLine::Sequence::Sequence() {
-  reset();
-}
-
-void DWARFDebugLine::Sequence::reset() {
-  LowPC = 0;
-  HighPC = 0;
-  FirstRowIndex = 0;
-  LastRowIndex = 0;
-  Empty = true;
-}
-
-DWARFDebugLine::LineTable::LineTable() {
-  clear();
-}
-
-void DWARFDebugLine::LineTable::dump(raw_ostream &OS) const {
-  Prologue.dump(OS);
-  OS << '\n';
-
-  if (!Rows.empty()) {
-    OS << "Address            Line   Column File   ISA Discriminator Flags\n"
-       << "------------------ ------ ------ ------ --- ------------- "
-          "-------------\n";
-    for (const Row &R : Rows) {
-      R.dump(OS);
-    }
-  }
-}
-
-void DWARFDebugLine::LineTable::clear() {
-  Prologue.clear();
-  Rows.clear();
-  Sequences.clear();
-}
-
-DWARFDebugLine::ParsingState::ParsingState(struct LineTable *LT)
-    : LineTable(LT), RowNumber(0) {
-  resetRowAndSequence();
-}
-
-void DWARFDebugLine::ParsingState::resetRowAndSequence() {
-  Row.reset(LineTable->Prologue.DefaultIsStmt);
-  Sequence.reset();
-}
-
-void DWARFDebugLine::ParsingState::appendRowToMatrix(uint32_t offset) {
-  if (Sequence.Empty) {
-    // Record the beginning of instruction sequence.
-    Sequence.Empty = false;
-    Sequence.LowPC = Row.Address;
-    Sequence.FirstRowIndex = RowNumber;
-  }
-  ++RowNumber;
-  LineTable->appendRow(Row);
-  if (Row.EndSequence) {
-    // Record the end of instruction sequence.
-    Sequence.HighPC = Row.Address;
-    Sequence.LastRowIndex = RowNumber;
-    if (Sequence.isValid())
-      LineTable->appendSequence(Sequence);
-    Sequence.reset();
-  }
-  Row.postAppend();
-}
-
-const DWARFDebugLine::LineTable *
-DWARFDebugLine::getLineTable(uint32_t offset) const {
-  LineTableConstIter pos = LineTableMap.find(offset);
-  if (pos != LineTableMap.end())
-    return &pos->second;
-  return nullptr;
-}
-
-const DWARFDebugLine::LineTable *
-DWARFDebugLine::getOrParseLineTable(DataExtractor debug_line_data,
-                                    uint32_t offset) {
-  std::pair<LineTableIter, bool> pos =
-    LineTableMap.insert(LineTableMapTy::value_type(offset, LineTable()));
-  LineTable *LT = &pos.first->second;
-  if (pos.second) {
-    if (!LT->parse(debug_line_data, RelocMap, &offset))
-      return nullptr;
-  }
-  return LT;
-}
-
-bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
-                                      const RelocAddrMap *RMap,
-                                      uint32_t *offset_ptr) {
-  const uint32_t debug_line_offset = *offset_ptr;
-
-  clear();
-
-  if (!Prologue.parse(debug_line_data, offset_ptr)) {
-    // Restore our offset and return false to indicate failure!
-    *offset_ptr = debug_line_offset;
-    return false;
-  }
-
-  const uint32_t end_offset = debug_line_offset + Prologue.TotalLength +
-                              sizeof(Prologue.TotalLength);
-
-  ParsingState State(this);
-
-  while (*offset_ptr < end_offset) {
-    uint8_t opcode = debug_line_data.getU8(offset_ptr);
-
-    if (opcode == 0) {
-      // Extended Opcodes always start with a zero opcode followed by
-      // a uleb128 length so you can skip ones you don't know about
-      uint32_t ext_offset = *offset_ptr;
-      uint64_t len = debug_line_data.getULEB128(offset_ptr);
-      uint32_t arg_size = len - (*offset_ptr - ext_offset);
-
-      uint8_t sub_opcode = debug_line_data.getU8(offset_ptr);
-      switch (sub_opcode) {
-      case DW_LNE_end_sequence:
-        // Set the end_sequence register of the state machine to true and
-        // append a row to the matrix using the current values of the
-        // state-machine registers. Then reset the registers to the initial
-        // values specified above. Every statement program sequence must end
-        // with a DW_LNE_end_sequence instruction which creates a row whose
-        // address is that of the byte after the last target machine instruction
-        // of the sequence.
-        State.Row.EndSequence = true;
-        State.appendRowToMatrix(*offset_ptr);
-        State.resetRowAndSequence();
-        break;
-
-      case DW_LNE_set_address:
-        // Takes a single relocatable address as an operand. The size of the
-        // operand is the size appropriate to hold an address on the target
-        // machine. Set the address register to the value given by the
-        // relocatable address. All of the other statement program opcodes
-        // that affect the address register add a delta to it. This instruction
-        // stores a relocatable value into it instead.
-        {
-          // If this address is in our relocation map, apply the relocation.
-          RelocAddrMap::const_iterator AI = RMap->find(*offset_ptr);
-          if (AI != RMap->end()) {
-             const std::pair<uint8_t, int64_t> &R = AI->second;
-             State.Row.Address =
-                 debug_line_data.getAddress(offset_ptr) + R.second;
-          } else
-            State.Row.Address = debug_line_data.getAddress(offset_ptr);
-        }
-        break;
-
-      case DW_LNE_define_file:
-        // Takes 4 arguments. The first is a null terminated string containing
-        // a source file name. The second is an unsigned LEB128 number
-        // representing the directory index of the directory in which the file
-        // was found. The third is an unsigned LEB128 number representing the
-        // time of last modification of the file. The fourth is an unsigned
-        // LEB128 number representing the length in bytes of the file. The time
-        // and length fields may contain LEB128(0) if the information is not
-        // available.
-        //
-        // The directory index represents an entry in the include_directories
-        // section of the statement program prologue. The index is LEB128(0)
-        // if the file was found in the current directory of the compilation,
-        // LEB128(1) if it was found in the first directory in the
-        // include_directories section, and so on. The directory index is
-        // ignored for file names that represent full path names.
-        //
-        // The files are numbered, starting at 1, in the order in which they
-        // appear; the names in the prologue come before names defined by
-        // the DW_LNE_define_file instruction. These numbers are used in the
-        // the file register of the state machine.
-        {
-          FileNameEntry fileEntry;
-          fileEntry.Name = debug_line_data.getCStr(offset_ptr);
-          fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr);
-          fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr);
-          fileEntry.Length = debug_line_data.getULEB128(offset_ptr);
-          Prologue.FileNames.push_back(fileEntry);
-        }
-        break;
-
-      case DW_LNE_set_discriminator:
-        State.Row.Discriminator = debug_line_data.getULEB128(offset_ptr);
-        break;
-
-      default:
-        // Length doesn't include the zero opcode byte or the length itself, but
-        // it does include the sub_opcode, so we have to adjust for that below
-        (*offset_ptr) += arg_size;
-        break;
-      }
-    } else if (opcode < Prologue.OpcodeBase) {
-      switch (opcode) {
-      // Standard Opcodes
-      case DW_LNS_copy:
-        // Takes no arguments. Append a row to the matrix using the
-        // current values of the state-machine registers. Then set
-        // the basic_block register to false.
-        State.appendRowToMatrix(*offset_ptr);
-        break;
-
-      case DW_LNS_advance_pc:
-        // Takes a single unsigned LEB128 operand, multiplies it by the
-        // min_inst_length field of the prologue, and adds the
-        // result to the address register of the state machine.
-        State.Row.Address +=
-            debug_line_data.getULEB128(offset_ptr) * Prologue.MinInstLength;
-        break;
-
-      case DW_LNS_advance_line:
-        // Takes a single signed LEB128 operand and adds that value to
-        // the line register of the state machine.
-        State.Row.Line += debug_line_data.getSLEB128(offset_ptr);
-        break;
-
-      case DW_LNS_set_file:
-        // Takes a single unsigned LEB128 operand and stores it in the file
-        // register of the state machine.
-        State.Row.File = debug_line_data.getULEB128(offset_ptr);
-        break;
-
-      case DW_LNS_set_column:
-        // Takes a single unsigned LEB128 operand and stores it in the
-        // column register of the state machine.
-        State.Row.Column = debug_line_data.getULEB128(offset_ptr);
-        break;
-
-      case DW_LNS_negate_stmt:
-        // Takes no arguments. Set the is_stmt register of the state
-        // machine to the logical negation of its current value.
-        State.Row.IsStmt = !State.Row.IsStmt;
-        break;
-
-      case DW_LNS_set_basic_block:
-        // Takes no arguments. Set the basic_block register of the
-        // state machine to true
-        State.Row.BasicBlock = true;
-        break;
-
-      case DW_LNS_const_add_pc:
-        // Takes no arguments. Add to the address register of the state
-        // machine the address increment value corresponding to special
-        // opcode 255. The motivation for DW_LNS_const_add_pc is this:
-        // when the statement program needs to advance the address by a
-        // small amount, it can use a single special opcode, which occupies
-        // a single byte. When it needs to advance the address by up to
-        // twice the range of the last special opcode, it can use
-        // DW_LNS_const_add_pc followed by a special opcode, for a total
-        // of two bytes. Only if it needs to advance the address by more
-        // than twice that range will it need to use both DW_LNS_advance_pc
-        // and a special opcode, requiring three or more bytes.
-        {
-          uint8_t adjust_opcode = 255 - Prologue.OpcodeBase;
-          uint64_t addr_offset =
-              (adjust_opcode / Prologue.LineRange) * Prologue.MinInstLength;
-          State.Row.Address += addr_offset;
-        }
-        break;
-
-      case DW_LNS_fixed_advance_pc:
-        // Takes a single uhalf operand. Add to the address register of
-        // the state machine the value of the (unencoded) operand. This
-        // is the only extended opcode that takes an argument that is not
-        // a variable length number. The motivation for DW_LNS_fixed_advance_pc
-        // is this: existing assemblers cannot emit DW_LNS_advance_pc or
-        // special opcodes because they cannot encode LEB128 numbers or
-        // judge when the computation of a special opcode overflows and
-        // requires the use of DW_LNS_advance_pc. Such assemblers, however,
-        // can use DW_LNS_fixed_advance_pc instead, sacrificing compression.
-        State.Row.Address += debug_line_data.getU16(offset_ptr);
-        break;
-
-      case DW_LNS_set_prologue_end:
-        // Takes no arguments. Set the prologue_end register of the
-        // state machine to true
-        State.Row.PrologueEnd = true;
-        break;
-
-      case DW_LNS_set_epilogue_begin:
-        // Takes no arguments. Set the basic_block register of the
-        // state machine to true
-        State.Row.EpilogueBegin = true;
-        break;
-
-      case DW_LNS_set_isa:
-        // Takes a single unsigned LEB128 operand and stores it in the
-        // column register of the state machine.
-        State.Row.Isa = debug_line_data.getULEB128(offset_ptr);
-        break;
-
-      default:
-        // Handle any unknown standard opcodes here. We know the lengths
-        // of such opcodes because they are specified in the prologue
-        // as a multiple of LEB128 operands for each opcode.
-        {
-          assert(opcode - 1U < Prologue.StandardOpcodeLengths.size());
-          uint8_t opcode_length = Prologue.StandardOpcodeLengths[opcode - 1];
-          for (uint8_t i = 0; i < opcode_length; ++i)
-            debug_line_data.getULEB128(offset_ptr);
-        }
-        break;
-      }
-    } else {
-      // Special Opcodes
-
-      // A special opcode value is chosen based on the amount that needs
-      // to be added to the line and address registers. The maximum line
-      // increment for a special opcode is the value of the line_base
-      // field in the header, plus the value of the line_range field,
-      // minus 1 (line base + line range - 1). If the desired line
-      // increment is greater than the maximum line increment, a standard
-      // opcode must be used instead of a special opcode. The "address
-      // advance" is calculated by dividing the desired address increment
-      // by the minimum_instruction_length field from the header. The
-      // special opcode is then calculated using the following formula:
-      //
-      //  opcode = (desired line increment - line_base) +
-      //           (line_range * address advance) + opcode_base
-      //
-      // If the resulting opcode is greater than 255, a standard opcode
-      // must be used instead.
-      //
-      // To decode a special opcode, subtract the opcode_base from the
-      // opcode itself to give the adjusted opcode. The amount to
-      // increment the address register is the result of the adjusted
-      // opcode divided by the line_range multiplied by the
-      // minimum_instruction_length field from the header. That is:
-      //
-      //  address increment = (adjusted opcode / line_range) *
-      //                      minimum_instruction_length
-      //
-      // The amount to increment the line register is the line_base plus
-      // the result of the adjusted opcode modulo the line_range. That is:
-      //
-      // line increment = line_base + (adjusted opcode % line_range)
-
-      uint8_t adjust_opcode = opcode - Prologue.OpcodeBase;
-      uint64_t addr_offset =
-          (adjust_opcode / Prologue.LineRange) * Prologue.MinInstLength;
-      int32_t line_offset =
-          Prologue.LineBase + (adjust_opcode % Prologue.LineRange);
-      State.Row.Line += line_offset;
-      State.Row.Address += addr_offset;
-      State.appendRowToMatrix(*offset_ptr);
-    }
-  }
-
-  if (!State.Sequence.Empty) {
-    fprintf(stderr, "warning: last sequence in debug line table is not"
-                    "terminated!\n");
-  }
-
-  // Sort all sequences so that address lookup will work faster.
-  if (!Sequences.empty()) {
-    std::sort(Sequences.begin(), Sequences.end(), Sequence::orderByLowPC);
-    // Note: actually, instruction address ranges of sequences should not
-    // overlap (in shared objects and executables). If they do, the address
-    // lookup would still work, though, but result would be ambiguous.
-    // We don't report warning in this case. For example,
-    // sometimes .so compiled from multiple object files contains a few
-    // rudimentary sequences for address ranges [0x0, 0xsomething).
-  }
-
-  return end_offset;
-}
-
-uint32_t DWARFDebugLine::LineTable::lookupAddress(uint64_t address) const {
-  uint32_t unknown_index = UINT32_MAX;
-  if (Sequences.empty())
-    return unknown_index;
-  // First, find an instruction sequence containing the given address.
-  DWARFDebugLine::Sequence sequence;
-  sequence.LowPC = address;
-  SequenceIter first_seq = Sequences.begin();
-  SequenceIter last_seq = Sequences.end();
-  SequenceIter seq_pos = std::lower_bound(first_seq, last_seq, sequence,
-      DWARFDebugLine::Sequence::orderByLowPC);
-  DWARFDebugLine::Sequence found_seq;
-  if (seq_pos == last_seq) {
-    found_seq = Sequences.back();
-  } else if (seq_pos->LowPC == address) {
-    found_seq = *seq_pos;
-  } else {
-    if (seq_pos == first_seq)
-      return unknown_index;
-    found_seq = *(seq_pos - 1);
-  }
-  if (!found_seq.containsPC(address))
-    return unknown_index;
-  // Search for instruction address in the rows describing the sequence.
-  // Rows are stored in a vector, so we may use arithmetical operations with
-  // iterators.
-  DWARFDebugLine::Row row;
-  row.Address = address;
-  RowIter first_row = Rows.begin() + found_seq.FirstRowIndex;
-  RowIter last_row = Rows.begin() + found_seq.LastRowIndex;
-  RowIter row_pos = std::lower_bound(first_row, last_row, row,
-      DWARFDebugLine::Row::orderByAddress);
-  if (row_pos == last_row) {
-    return found_seq.LastRowIndex - 1;
-  }
-  uint32_t index = found_seq.FirstRowIndex + (row_pos - first_row);
-  if (row_pos->Address > address) {
-    if (row_pos == first_row)
-      return unknown_index;
-    else
-      index--;
-  }
-  return index;
-}
-
-bool DWARFDebugLine::LineTable::lookupAddressRange(
-    uint64_t address, uint64_t size, std::vector<uint32_t> &result) const {
-  if (Sequences.empty())
-    return false;
-  uint64_t end_addr = address + size;
-  // First, find an instruction sequence containing the given address.
-  DWARFDebugLine::Sequence sequence;
-  sequence.LowPC = address;
-  SequenceIter first_seq = Sequences.begin();
-  SequenceIter last_seq = Sequences.end();
-  SequenceIter seq_pos = std::lower_bound(first_seq, last_seq, sequence,
-      DWARFDebugLine::Sequence::orderByLowPC);
-  if (seq_pos == last_seq || seq_pos->LowPC != address) {
-    if (seq_pos == first_seq)
-      return false;
-    seq_pos--;
-  }
-  if (!seq_pos->containsPC(address))
-    return false;
-
-  SequenceIter start_pos = seq_pos;
-
-  // Add the rows from the first sequence to the vector, starting with the
-  // index we just calculated
-
-  while (seq_pos != last_seq && seq_pos->LowPC < end_addr) {
-    DWARFDebugLine::Sequence cur_seq = *seq_pos;
-    uint32_t first_row_index;
-    uint32_t last_row_index;
-    if (seq_pos == start_pos) {
-      // For the first sequence, we need to find which row in the sequence is the
-      // first in our range. Rows are stored in a vector, so we may use
-      // arithmetical operations with iterators.
-      DWARFDebugLine::Row row;
-      row.Address = address;
-      RowIter first_row = Rows.begin() + cur_seq.FirstRowIndex;
-      RowIter last_row = Rows.begin() + cur_seq.LastRowIndex;
-      RowIter row_pos = std::upper_bound(first_row, last_row, row,
-                                         DWARFDebugLine::Row::orderByAddress);
-      // The 'row_pos' iterator references the first row that is greater than
-      // our start address. Unless that's the first row, we want to start at
-      // the row before that.
-      first_row_index = cur_seq.FirstRowIndex + (row_pos - first_row);
-      if (row_pos != first_row)
-        --first_row_index;
-    } else
-      first_row_index = cur_seq.FirstRowIndex;
-
-    // For the last sequence in our range, we need to figure out the last row in
-    // range.  For all other sequences we can go to the end of the sequence.
-    if (cur_seq.HighPC > end_addr) {
-      DWARFDebugLine::Row row;
-      row.Address = end_addr;
-      RowIter first_row = Rows.begin() + cur_seq.FirstRowIndex;
-      RowIter last_row = Rows.begin() + cur_seq.LastRowIndex;
-      RowIter row_pos = std::upper_bound(first_row, last_row, row,
-                                         DWARFDebugLine::Row::orderByAddress);
-      // The 'row_pos' iterator references the first row that is greater than
-      // our end address.  The row before that is the last row we want.
-      last_row_index = cur_seq.FirstRowIndex + (row_pos - first_row) - 1;
-    } else
-      // Contrary to what you might expect, DWARFDebugLine::SequenceLastRowIndex
-      // isn't a valid index within the current sequence.  It's that plus one.
-      last_row_index = cur_seq.LastRowIndex - 1;
-
-    for (uint32_t i = first_row_index; i <= last_row_index; ++i) {
-      result.push_back(i);
-    }
-
-    ++seq_pos;
-  }
-
-  return true;
-}
-
-bool
-DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
-                                              const char *CompDir,
-                                              FileLineInfoKind Kind,
-                                              std::string &Result) const {
-  if (FileIndex == 0 || FileIndex > Prologue.FileNames.size() ||
-      Kind == FileLineInfoKind::None)
-    return false;
-  const FileNameEntry &Entry = Prologue.FileNames[FileIndex - 1];
-  const char *FileName = Entry.Name;
-  if (Kind != FileLineInfoKind::AbsoluteFilePath ||
-      sys::path::is_absolute(FileName)) {
-    Result = FileName;
-    return true;
-  }
-
-  SmallString<16> FilePath;
-  uint64_t IncludeDirIndex = Entry.DirIdx;
-  const char *IncludeDir = "";
-  // Be defensive about the contents of Entry.
-  if (IncludeDirIndex > 0 &&
-      IncludeDirIndex <= Prologue.IncludeDirectories.size())
-    IncludeDir = Prologue.IncludeDirectories[IncludeDirIndex - 1];
-
-  // We may still need to append compilation directory of compile unit.
-  // We know that FileName is not absolute, the only way to have an
-  // absolute path at this point would be if IncludeDir is absolute.
-  if (CompDir && Kind == FileLineInfoKind::AbsoluteFilePath &&
-      sys::path::is_relative(IncludeDir))
-    sys::path::append(FilePath, CompDir);
-
-  // sys::path::append skips empty strings.
-  sys::path::append(FilePath, IncludeDir, FileName);
-  Result = FilePath.str();
-  return true;
-}
-
-bool
-DWARFDebugLine::LineTable::getFileLineInfoForAddress(uint64_t Address,
-                                                     const char *CompDir,
-                                                     FileLineInfoKind Kind,
-                                                     DILineInfo &Result) const {
-  // Get the index of row we're looking for in the line table.
-  uint32_t RowIndex = lookupAddress(Address);
-  if (RowIndex == -1U)
-    return false;
-  // Take file number and line/column from the row.
-  const auto &Row = Rows[RowIndex];
-  if (!getFileNameByIndex(Row.File, CompDir, Kind, Result.FileName))
-    return false;
-  Result.Line = Row.Line;
-  Result.Column = Row.Column;
-  return true;
-}
diff --git a/lib/DebugInfo/DWARFDebugLine.h b/lib/DebugInfo/DWARFDebugLine.h
deleted file mode 100644
index 7a6f1bd..0000000
--- a/lib/DebugInfo/DWARFDebugLine.h
+++ /dev/null
@@ -1,238 +0,0 @@
-//===-- DWARFDebugLine.h ----------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGLINE_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGLINE_H
-
-#include "DWARFRelocMap.h"
-#include "llvm/DebugInfo/DIContext.h"
-#include "llvm/Support/DataExtractor.h"
-#include <map>
-#include <string>
-#include <vector>
-
-namespace llvm {
-
-class raw_ostream;
-
-class DWARFDebugLine {
-public:
-  DWARFDebugLine(const RelocAddrMap* LineInfoRelocMap) : RelocMap(LineInfoRelocMap) {}
-  struct FileNameEntry {
-    FileNameEntry() : Name(nullptr), DirIdx(0), ModTime(0), Length(0) {}
-
-    const char *Name;
-    uint64_t DirIdx;
-    uint64_t ModTime;
-    uint64_t Length;
-  };
-
-  struct Prologue {
-    Prologue();
-
-    // The size in bytes of the statement information for this compilation unit
-    // (not including the total_length field itself).
-    uint32_t TotalLength;
-    // Version identifier for the statement information format.
-    uint16_t Version;
-    // The number of bytes following the prologue_length field to the beginning
-    // of the first byte of the statement program itself.
-    uint32_t PrologueLength;
-    // The size in bytes of the smallest target machine instruction. Statement
-    // program opcodes that alter the address register first multiply their
-    // operands by this value.
-    uint8_t MinInstLength;
-    // The maximum number of individual operations that may be encoded in an
-    // instruction.
-    uint8_t MaxOpsPerInst;
-    // The initial value of theis_stmtregister.
-    uint8_t DefaultIsStmt;
-    // This parameter affects the meaning of the special opcodes. See below.
-    int8_t LineBase;
-    // This parameter affects the meaning of the special opcodes. See below.
-    uint8_t LineRange;
-    // The number assigned to the first special opcode.
-    uint8_t OpcodeBase;
-    std::vector<uint8_t> StandardOpcodeLengths;
-    std::vector<const char*> IncludeDirectories;
-    std::vector<FileNameEntry> FileNames;
-
-    // Length of the prologue in bytes.
-    uint32_t getLength() const {
-      return PrologueLength + sizeof(TotalLength) + sizeof(Version) +
-             sizeof(PrologueLength);
-    }
-    // Length of the line table data in bytes (not including the prologue).
-    uint32_t getStatementTableLength() const {
-      return TotalLength + sizeof(TotalLength) - getLength();
-    }
-    int32_t getMaxLineIncrementForSpecialOpcode() const {
-      return LineBase + (int8_t)LineRange - 1;
-    }
-
-    void clear();
-    void dump(raw_ostream &OS) const;
-    bool parse(DataExtractor debug_line_data, uint32_t *offset_ptr);
-  };
-
-  // Standard .debug_line state machine structure.
-  struct Row {
-    explicit Row(bool default_is_stmt = false);
-
-    /// Called after a row is appended to the matrix.
-    void postAppend();
-    void reset(bool default_is_stmt);
-    void dump(raw_ostream &OS) const;
-
-    static bool orderByAddress(const Row& LHS, const Row& RHS) {
-      return LHS.Address < RHS.Address;
-    }
-
-    // The program-counter value corresponding to a machine instruction
-    // generated by the compiler.
-    uint64_t Address;
-    // An unsigned integer indicating a source line number. Lines are numbered
-    // beginning at 1. The compiler may emit the value 0 in cases where an
-    // instruction cannot be attributed to any source line.
-    uint32_t Line;
-    // An unsigned integer indicating a column number within a source line.
-    // Columns are numbered beginning at 1. The value 0 is reserved to indicate
-    // that a statement begins at the 'left edge' of the line.
-    uint16_t Column;
-    // An unsigned integer indicating the identity of the source file
-    // corresponding to a machine instruction.
-    uint16_t File;
-    // An unsigned integer whose value encodes the applicable instruction set
-    // architecture for the current instruction.
-    uint8_t Isa;
-    // An unsigned integer representing the DWARF path discriminator value
-    // for this location.
-    uint32_t Discriminator;
-    // A boolean indicating that the current instruction is the beginning of a
-    // statement.
-    uint8_t IsStmt:1,
-            // A boolean indicating that the current instruction is the
-            // beginning of a basic block.
-            BasicBlock:1,
-            // A boolean indicating that the current address is that of the
-            // first byte after the end of a sequence of target machine
-            // instructions.
-            EndSequence:1,
-            // A boolean indicating that the current address is one (of possibly
-            // many) where execution should be suspended for an entry breakpoint
-            // of a function.
-            PrologueEnd:1,
-            // A boolean indicating that the current address is one (of possibly
-            // many) where execution should be suspended for an exit breakpoint
-            // of a function.
-            EpilogueBegin:1;
-  };
-
-  // Represents a series of contiguous machine instructions. Line table for each
-  // compilation unit may consist of multiple sequences, which are not
-  // guaranteed to be in the order of ascending instruction address.
-  struct Sequence {
-    // Sequence describes instructions at address range [LowPC, HighPC)
-    // and is described by line table rows [FirstRowIndex, LastRowIndex).
-    uint64_t LowPC;
-    uint64_t HighPC;
-    unsigned FirstRowIndex;
-    unsigned LastRowIndex;
-    bool Empty;
-
-    Sequence();
-    void reset();
-
-    static bool orderByLowPC(const Sequence& LHS, const Sequence& RHS) {
-      return LHS.LowPC < RHS.LowPC;
-    }
-    bool isValid() const {
-      return !Empty && (LowPC < HighPC) && (FirstRowIndex < LastRowIndex);
-    }
-    bool containsPC(uint64_t pc) const {
-      return (LowPC <= pc && pc < HighPC);
-    }
-  };
-
-  struct LineTable {
-    LineTable();
-
-    void appendRow(const DWARFDebugLine::Row &R) {
-      Rows.push_back(R);
-    }
-    void appendSequence(const DWARFDebugLine::Sequence &S) {
-      Sequences.push_back(S);
-    }
-
-    // Returns the index of the row with file/line info for a given address,
-    // or -1 if there is no such row.
-    uint32_t lookupAddress(uint64_t address) const;
-
-    bool lookupAddressRange(uint64_t address, uint64_t size,
-                            std::vector<uint32_t> &result) const;
-
-    // Extracts filename by its index in filename table in prologue.
-    // Returns true on success.
-    bool getFileNameByIndex(uint64_t FileIndex, const char *CompDir,
-                            DILineInfoSpecifier::FileLineInfoKind Kind,
-                            std::string &Result) const;
-
-    // Fills the Result argument with the file and line information
-    // corresponding to Address. Returns true on success.
-    bool getFileLineInfoForAddress(uint64_t Address, const char *CompDir, 
-                                   DILineInfoSpecifier::FileLineInfoKind Kind,
-                                   DILineInfo &Result) const;
-
-    void dump(raw_ostream &OS) const;
-    void clear();
-
-    /// Parse prologue and all rows.
-    bool parse(DataExtractor debug_line_data, const RelocAddrMap *RMap,
-               uint32_t *offset_ptr);
-
-    struct Prologue Prologue;
-    typedef std::vector<Row> RowVector;
-    typedef RowVector::const_iterator RowIter;
-    typedef std::vector<Sequence> SequenceVector;
-    typedef SequenceVector::const_iterator SequenceIter;
-    RowVector Rows;
-    SequenceVector Sequences;
-  };
-
-  const LineTable *getLineTable(uint32_t offset) const;
-  const LineTable *getOrParseLineTable(DataExtractor debug_line_data,
-                                       uint32_t offset);
-
-private:
-  struct ParsingState {
-    ParsingState(struct LineTable *LT);
-
-    void resetRowAndSequence();
-    void appendRowToMatrix(uint32_t offset);
-
-    // Line table we're currently parsing.
-    struct LineTable *LineTable;
-    // The row number that starts at zero for the prologue, and increases for
-    // each row added to the matrix.
-    unsigned RowNumber;
-    struct Row Row;
-    struct Sequence Sequence;
-  };
-
-  typedef std::map<uint32_t, LineTable> LineTableMapTy;
-  typedef LineTableMapTy::iterator LineTableIter;
-  typedef LineTableMapTy::const_iterator LineTableConstIter;
-
-  const RelocAddrMap *RelocMap;
-  LineTableMapTy LineTableMap;
-};
-
-}
-
-#endif
diff --git a/lib/DebugInfo/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARFDebugLoc.cpp
deleted file mode 100644
index e4aa5dc..0000000
--- a/lib/DebugInfo/DWARFDebugLoc.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-//===-- DWARFDebugLoc.cpp -------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFDebugLoc.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Dwarf.h"
-
-using namespace llvm;
-
-void DWARFDebugLoc::dump(raw_ostream &OS) const {
-  for (const LocationList &L : Locations) {
-    OS << format("0x%8.8x: ", L.Offset);
-    const unsigned Indent = 12;
-    for (const Entry &E : L.Entries) {
-      if (&E != L.Entries.begin())
-        OS.indent(Indent);
-      OS << "Beginning address offset: " << format("0x%016" PRIx64, E.Begin)
-         << '\n';
-      OS.indent(Indent) << "   Ending address offset: "
-                        << format("0x%016" PRIx64, E.End) << '\n';
-      OS.indent(Indent) << "    Location description: ";
-      for (unsigned char Loc : E.Loc) {
-        OS << format("%2.2x ", Loc);
-      }
-      OS << "\n\n";
-    }
-  }
-}
-
-void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) {
-  uint32_t Offset = 0;
-  while (data.isValidOffset(Offset+AddressSize-1)) {
-    Locations.resize(Locations.size() + 1);
-    LocationList &Loc = Locations.back();
-    Loc.Offset = Offset;
-    // 2.6.2 Location Lists
-    // A location list entry consists of:
-    while (true) {
-      Entry E;
-      RelocAddrMap::const_iterator AI = RelocMap.find(Offset);
-      // 1. A beginning address offset. ...
-      E.Begin = data.getUnsigned(&Offset, AddressSize);
-      if (AI != RelocMap.end())
-        E.Begin += AI->second.second;
-
-      AI = RelocMap.find(Offset);
-      // 2. An ending address offset. ...
-      E.End = data.getUnsigned(&Offset, AddressSize);
-      if (AI != RelocMap.end())
-        E.End += AI->second.second;
-
-      // The end of any given location list is marked by an end of list entry,
-      // which consists of a 0 for the beginning address offset and a 0 for the
-      // ending address offset.
-      if (E.Begin == 0 && E.End == 0)
-        break;
-
-      unsigned Bytes = data.getU16(&Offset);
-      // A single location description describing the location of the object...
-      StringRef str = data.getData().substr(Offset, Bytes);
-      Offset += Bytes;
-      E.Loc.reserve(str.size());
-      std::copy(str.begin(), str.end(), std::back_inserter(E.Loc));
-      Loc.Entries.push_back(std::move(E));
-    }
-  }
-  if (data.isValidOffset(Offset))
-    llvm::errs() << "error: failed to consume entire .debug_loc section\n";
-}
-
-void DWARFDebugLocDWO::parse(DataExtractor data) {
-  uint32_t Offset = 0;
-  while (data.isValidOffset(Offset)) {
-    Locations.resize(Locations.size() + 1);
-    LocationList &Loc = Locations.back();
-    Loc.Offset = Offset;
-    dwarf::LocationListEntry Kind;
-    while ((Kind = static_cast<dwarf::LocationListEntry>(
-                data.getU8(&Offset))) != dwarf::DW_LLE_end_of_list_entry) {
-
-      if (Kind != dwarf::DW_LLE_start_length_entry) {
-        llvm::errs() << "error: dumping support for LLE of kind " << (int)Kind
-                     << " not implemented\n";
-        return;
-      }
-
-      Entry E;
-
-      E.Start = data.getULEB128(&Offset);
-      E.Length = data.getU32(&Offset);
-
-      unsigned Bytes = data.getU16(&Offset);
-      // A single location description describing the location of the object...
-      StringRef str = data.getData().substr(Offset, Bytes);
-      Offset += Bytes;
-      E.Loc.resize(str.size());
-      std::copy(str.begin(), str.end(), E.Loc.begin());
-
-      Loc.Entries.push_back(std::move(E));
-    }
-  }
-}
-
-void DWARFDebugLocDWO::dump(raw_ostream &OS) const {
-  for (const LocationList &L : Locations) {
-    OS << format("0x%8.8x: ", L.Offset);
-    const unsigned Indent = 12;
-    for (const Entry &E : L.Entries) {
-      if (&E != L.Entries.begin())
-        OS.indent(Indent);
-      OS << "Beginning address index: " << E.Start << '\n';
-      OS.indent(Indent) << "                 Length: " << E.Length << '\n';
-      OS.indent(Indent) << "   Location description: ";
-      for (unsigned char Loc : E.Loc)
-        OS << format("%2.2x ", Loc);
-      OS << "\n\n";
-    }
-  }
-}
-
diff --git a/lib/DebugInfo/DWARFDebugLoc.h b/lib/DebugInfo/DWARFDebugLoc.h
deleted file mode 100644
index 50110b3..0000000
--- a/lib/DebugInfo/DWARFDebugLoc.h
+++ /dev/null
@@ -1,81 +0,0 @@
-//===-- DWARFDebugLoc.h -----------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGLOC_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGLOC_H
-
-#include "DWARFRelocMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/DataExtractor.h"
-
-namespace llvm {
-
-class raw_ostream;
-
-class DWARFDebugLoc {
-  /// A single location within a location list.
-  struct Entry {
-    /// The beginning address of the instruction range.
-    uint64_t Begin;
-    /// The ending address of the instruction range.
-    uint64_t End;
-    /// The location of the variable within the specified range.
-    SmallVector<unsigned char, 4> Loc;
-  };
-
-  /// A list of locations that contain one variable.
-  struct LocationList {
-    /// The beginning offset where this location list is stored in the debug_loc
-    /// section.
-    unsigned Offset;
-    /// All the locations in which the variable is stored.
-    SmallVector<Entry, 2> Entries;
-  };
-
-  typedef SmallVector<LocationList, 4> LocationLists;
-
-  /// A list of all the variables in the debug_loc section, each one describing
-  /// the locations in which the variable is stored.
-  LocationLists Locations;
-
-  /// A map used to resolve binary relocations.
-  const RelocAddrMap &RelocMap;
-
-public:
-  DWARFDebugLoc(const RelocAddrMap &LocRelocMap) : RelocMap(LocRelocMap) {}
-  /// Print the location lists found within the debug_loc section.
-  void dump(raw_ostream &OS) const;
-  /// Parse the debug_loc section accessible via the 'data' parameter using the
-  /// specified address size to interpret the address ranges.
-  void parse(DataExtractor data, unsigned AddressSize);
-};
-
-class DWARFDebugLocDWO {
-  struct Entry {
-    uint64_t Start;
-    uint32_t Length;
-    SmallVector<unsigned char, 4> Loc;
-  };
-
-  struct LocationList {
-    unsigned Offset;
-    SmallVector<Entry, 2> Entries;
-  };
-
-  typedef SmallVector<LocationList, 4> LocationLists;
-
-  LocationLists Locations;
-
-public:
-  void parse(DataExtractor data);
-  void dump(raw_ostream &OS) const;
-};
-}
-
-#endif
diff --git a/lib/DebugInfo/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARFDebugRangeList.cpp
deleted file mode 100644
index 07b23b3..0000000
--- a/lib/DebugInfo/DWARFDebugRangeList.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-//===-- DWARFDebugRangesList.cpp ------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFDebugRangeList.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-void DWARFDebugRangeList::clear() {
-  Offset = -1U;
-  AddressSize = 0;
-  Entries.clear();
-}
-
-bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr) {
-  clear();
-  if (!data.isValidOffset(*offset_ptr))
-    return false;
-  AddressSize = data.getAddressSize();
-  if (AddressSize != 4 && AddressSize != 8)
-    return false;
-  Offset = *offset_ptr;
-  while (true) {
-    RangeListEntry entry;
-    uint32_t prev_offset = *offset_ptr;
-    entry.StartAddress = data.getAddress(offset_ptr);
-    entry.EndAddress = data.getAddress(offset_ptr);
-    // Check that both values were extracted correctly.
-    if (*offset_ptr != prev_offset + 2 * AddressSize) {
-      clear();
-      return false;
-    }
-    if (entry.isEndOfListEntry())
-      break;
-    Entries.push_back(entry);
-  }
-  return true;
-}
-
-void DWARFDebugRangeList::dump(raw_ostream &OS) const {
-  for (const RangeListEntry &RLE : Entries) {
-    const char *format_str = (AddressSize == 4
-                              ? "%08x %08"  PRIx64 " %08"  PRIx64 "\n"
-                              : "%08x %016" PRIx64 " %016" PRIx64 "\n");
-    OS << format(format_str, Offset, RLE.StartAddress, RLE.EndAddress);
-  }
-  OS << format("%08x <End of list>\n", Offset);
-}
-
-DWARFAddressRangesVector
-DWARFDebugRangeList::getAbsoluteRanges(uint64_t BaseAddress) const {
-  DWARFAddressRangesVector Res;
-  for (const RangeListEntry &RLE : Entries) {
-    if (RLE.isBaseAddressSelectionEntry(AddressSize)) {
-      BaseAddress = RLE.EndAddress;
-    } else {
-      Res.push_back(std::make_pair(BaseAddress + RLE.StartAddress,
-                                   BaseAddress + RLE.EndAddress));
-    }
-  }
-  return Res;
-}
diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp
deleted file mode 100644
index 69b9771..0000000
--- a/lib/DebugInfo/DWARFFormValue.cpp
+++ /dev/null
@@ -1,557 +0,0 @@
-//===-- DWARFFormValue.cpp ------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/DWARFFormValue.h"
-#include "DWARFCompileUnit.h"
-#include "DWARFContext.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-using namespace llvm;
-using namespace dwarf;
-
-namespace {
-uint8_t getRefAddrSize(uint8_t AddrSize, uint16_t Version) {
-  // FIXME: Support DWARF64.
-  return (Version == 2) ? AddrSize : 4;
-}
-
-template <uint8_t AddrSize, uint8_t RefAddrSize>
-ArrayRef<uint8_t> makeFixedFormSizesArrayRef() {
-  static const uint8_t sizes[] = {
-    0,           // 0x00 unused
-    AddrSize,    // 0x01 DW_FORM_addr
-    0,           // 0x02 unused
-    0,           // 0x03 DW_FORM_block2
-    0,           // 0x04 DW_FORM_block4
-    2,           // 0x05 DW_FORM_data2
-    4,           // 0x06 DW_FORM_data4
-    8,           // 0x07 DW_FORM_data8
-    0,           // 0x08 DW_FORM_string
-    0,           // 0x09 DW_FORM_block
-    0,           // 0x0a DW_FORM_block1
-    1,           // 0x0b DW_FORM_data1
-    1,           // 0x0c DW_FORM_flag
-    0,           // 0x0d DW_FORM_sdata
-    4,           // 0x0e DW_FORM_strp
-    0,           // 0x0f DW_FORM_udata
-    RefAddrSize, // 0x10 DW_FORM_ref_addr
-    1,           // 0x11 DW_FORM_ref1
-    2,           // 0x12 DW_FORM_ref2
-    4,           // 0x13 DW_FORM_ref4
-    8,           // 0x14 DW_FORM_ref8
-    0,           // 0x15 DW_FORM_ref_udata
-    0,           // 0x16 DW_FORM_indirect
-    4,           // 0x17 DW_FORM_sec_offset
-    0,           // 0x18 DW_FORM_exprloc
-    0,           // 0x19 DW_FORM_flag_present
-  };
-  return makeArrayRef(sizes);
-}
-}
-
-ArrayRef<uint8_t> DWARFFormValue::getFixedFormSizes(uint8_t AddrSize,
-                                                    uint16_t Version) {
-  uint8_t RefAddrSize = getRefAddrSize(AddrSize, Version);
-  if (AddrSize == 4 && RefAddrSize == 4)
-    return makeFixedFormSizesArrayRef<4, 4>();
-  if (AddrSize == 4 && RefAddrSize == 8)
-    return makeFixedFormSizesArrayRef<4, 8>();
-  if (AddrSize == 8 && RefAddrSize == 4)
-    return makeFixedFormSizesArrayRef<8, 4>();
-  if (AddrSize == 8 && RefAddrSize == 8)
-    return makeFixedFormSizesArrayRef<8, 8>();
-  return None;
-}
-
-static const DWARFFormValue::FormClass DWARF4FormClasses[] = {
-  DWARFFormValue::FC_Unknown,       // 0x0
-  DWARFFormValue::FC_Address,       // 0x01 DW_FORM_addr
-  DWARFFormValue::FC_Unknown,       // 0x02 unused
-  DWARFFormValue::FC_Block,         // 0x03 DW_FORM_block2
-  DWARFFormValue::FC_Block,         // 0x04 DW_FORM_block4
-  DWARFFormValue::FC_Constant,      // 0x05 DW_FORM_data2
-  // --- These can be FC_SectionOffset in DWARF3 and below:
-  DWARFFormValue::FC_Constant,      // 0x06 DW_FORM_data4
-  DWARFFormValue::FC_Constant,      // 0x07 DW_FORM_data8
-  // ---
-  DWARFFormValue::FC_String,        // 0x08 DW_FORM_string
-  DWARFFormValue::FC_Block,         // 0x09 DW_FORM_block
-  DWARFFormValue::FC_Block,         // 0x0a DW_FORM_block1
-  DWARFFormValue::FC_Constant,      // 0x0b DW_FORM_data1
-  DWARFFormValue::FC_Flag,          // 0x0c DW_FORM_flag
-  DWARFFormValue::FC_Constant,      // 0x0d DW_FORM_sdata
-  DWARFFormValue::FC_String,        // 0x0e DW_FORM_strp
-  DWARFFormValue::FC_Constant,      // 0x0f DW_FORM_udata
-  DWARFFormValue::FC_Reference,     // 0x10 DW_FORM_ref_addr
-  DWARFFormValue::FC_Reference,     // 0x11 DW_FORM_ref1
-  DWARFFormValue::FC_Reference,     // 0x12 DW_FORM_ref2
-  DWARFFormValue::FC_Reference,     // 0x13 DW_FORM_ref4
-  DWARFFormValue::FC_Reference,     // 0x14 DW_FORM_ref8
-  DWARFFormValue::FC_Reference,     // 0x15 DW_FORM_ref_udata
-  DWARFFormValue::FC_Indirect,      // 0x16 DW_FORM_indirect
-  DWARFFormValue::FC_SectionOffset, // 0x17 DW_FORM_sec_offset
-  DWARFFormValue::FC_Exprloc,       // 0x18 DW_FORM_exprloc
-  DWARFFormValue::FC_Flag,          // 0x19 DW_FORM_flag_present
-};
-
-bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const {
-  // First, check DWARF4 form classes.
-  if (Form < ArrayRef<FormClass>(DWARF4FormClasses).size() &&
-      DWARF4FormClasses[Form] == FC)
-    return true;
-  // Check DW_FORM_ref_sig8 from DWARF4.
-  if (Form == DW_FORM_ref_sig8)
-    return (FC == FC_Reference);
-  // Check for some DWARF5 forms.
-  if (Form == DW_FORM_GNU_addr_index)
-    return (FC == FC_Address);
-  if (Form == DW_FORM_GNU_str_index)
-    return (FC == FC_String);
-  // In DWARF3 DW_FORM_data4 and DW_FORM_data8 served also as a section offset.
-  // Don't check for DWARF version here, as some producers may still do this
-  // by mistake.
-  if ((Form == DW_FORM_data4 || Form == DW_FORM_data8) &&
-      FC == FC_SectionOffset)
-    return true;
-  return false;
-}
-
-bool DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
-                                  const DWARFUnit *cu) {
-  bool indirect = false;
-  bool is_block = false;
-  Value.data = nullptr;
-  // Read the value for the form into value and follow and DW_FORM_indirect
-  // instances we run into
-  do {
-    indirect = false;
-    switch (Form) {
-    case DW_FORM_addr:
-    case DW_FORM_ref_addr: {
-      if (!cu)
-        return false;
-      uint16_t AddrSize =
-          (Form == DW_FORM_addr)
-              ? cu->getAddressByteSize()
-              : getRefAddrSize(cu->getAddressByteSize(), cu->getVersion());
-      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr);
-      if (AI != cu->getRelocMap()->end()) {
-        const std::pair<uint8_t, int64_t> &R = AI->second;
-        Value.uval = data.getUnsigned(offset_ptr, AddrSize) + R.second;
-      } else
-        Value.uval = data.getUnsigned(offset_ptr, AddrSize);
-      break;
-    }
-    case DW_FORM_exprloc:
-    case DW_FORM_block:
-      Value.uval = data.getULEB128(offset_ptr);
-      is_block = true;
-      break;
-    case DW_FORM_block1:
-      Value.uval = data.getU8(offset_ptr);
-      is_block = true;
-      break;
-    case DW_FORM_block2:
-      Value.uval = data.getU16(offset_ptr);
-      is_block = true;
-      break;
-    case DW_FORM_block4:
-      Value.uval = data.getU32(offset_ptr);
-      is_block = true;
-      break;
-    case DW_FORM_data1:
-    case DW_FORM_ref1:
-    case DW_FORM_flag:
-      Value.uval = data.getU8(offset_ptr);
-      break;
-    case DW_FORM_data2:
-    case DW_FORM_ref2:
-      Value.uval = data.getU16(offset_ptr);
-      break;
-    case DW_FORM_data4:
-    case DW_FORM_ref4: {
-      Value.uval = data.getU32(offset_ptr);
-      if (!cu)
-        break;
-      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr-4);
-      if (AI != cu->getRelocMap()->end())
-        Value.uval += AI->second.second;
-      break;
-    }
-    case DW_FORM_data8:
-    case DW_FORM_ref8:
-      Value.uval = data.getU64(offset_ptr);
-      break;
-    case DW_FORM_sdata:
-      Value.sval = data.getSLEB128(offset_ptr);
-      break;
-    case DW_FORM_strp: {
-      Value.uval = data.getU32(offset_ptr);
-      if (!cu)
-        break;
-      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr-4);
-      if (AI != cu->getRelocMap()->end())
-        Value.uval += AI->second.second;
-      break;
-    }
-    case DW_FORM_udata:
-    case DW_FORM_ref_udata:
-      Value.uval = data.getULEB128(offset_ptr);
-      break;
-    case DW_FORM_string:
-      Value.cstr = data.getCStr(offset_ptr);
-      break;
-    case DW_FORM_indirect:
-      Form = data.getULEB128(offset_ptr);
-      indirect = true;
-      break;
-    case DW_FORM_sec_offset: {
-      // FIXME: This is 64-bit for DWARF64.
-      Value.uval = data.getU32(offset_ptr);
-      if (!cu)
-        break;
-      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr-4);
-      if (AI != cu->getRelocMap()->end())
-        Value.uval +=  AI->second.second;
-      break;
-    }
-    case DW_FORM_flag_present:
-      Value.uval = 1;
-      break;
-    case DW_FORM_ref_sig8:
-      Value.uval = data.getU64(offset_ptr);
-      break;
-    case DW_FORM_GNU_addr_index:
-    case DW_FORM_GNU_str_index:
-      Value.uval = data.getULEB128(offset_ptr);
-      break;
-    default:
-      return false;
-    }
-  } while (indirect);
-
-  if (is_block) {
-    StringRef str = data.getData().substr(*offset_ptr, Value.uval);
-    Value.data = nullptr;
-    if (!str.empty()) {
-      Value.data = reinterpret_cast<const uint8_t *>(str.data());
-      *offset_ptr += Value.uval;
-    }
-  }
-
-  return true;
-}
-
-bool
-DWARFFormValue::skipValue(DataExtractor debug_info_data, uint32_t* offset_ptr,
-                          const DWARFUnit *cu) const {
-  return DWARFFormValue::skipValue(Form, debug_info_data, offset_ptr, cu);
-}
-
-bool
-DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
-                          uint32_t *offset_ptr, const DWARFUnit *cu) {
-  bool indirect = false;
-  do {
-    switch (form) {
-    // Blocks if inlined data that have a length field and the data bytes
-    // inlined in the .debug_info
-    case DW_FORM_exprloc:
-    case DW_FORM_block: {
-      uint64_t size = debug_info_data.getULEB128(offset_ptr);
-      *offset_ptr += size;
-      return true;
-    }
-    case DW_FORM_block1: {
-      uint8_t size = debug_info_data.getU8(offset_ptr);
-      *offset_ptr += size;
-      return true;
-    }
-    case DW_FORM_block2: {
-      uint16_t size = debug_info_data.getU16(offset_ptr);
-      *offset_ptr += size;
-      return true;
-    }
-    case DW_FORM_block4: {
-      uint32_t size = debug_info_data.getU32(offset_ptr);
-      *offset_ptr += size;
-      return true;
-    }
-
-    // Inlined NULL terminated C-strings
-    case DW_FORM_string:
-      debug_info_data.getCStr(offset_ptr);
-      return true;
-
-    // Compile unit address sized values
-    case DW_FORM_addr:
-      *offset_ptr += cu->getAddressByteSize();
-      return true;
-    case DW_FORM_ref_addr:
-      *offset_ptr += getRefAddrSize(cu->getAddressByteSize(), cu->getVersion());
-      return true;
-
-    // 0 byte values - implied from the form.
-    case DW_FORM_flag_present:
-      return true;
-
-    // 1 byte values
-    case DW_FORM_data1:
-    case DW_FORM_flag:
-    case DW_FORM_ref1:
-      *offset_ptr += 1;
-      return true;
-
-    // 2 byte values
-    case DW_FORM_data2:
-    case DW_FORM_ref2:
-      *offset_ptr += 2;
-      return true;
-
-    // 4 byte values
-    case DW_FORM_strp:
-    case DW_FORM_data4:
-    case DW_FORM_ref4:
-      *offset_ptr += 4;
-      return true;
-
-    // 8 byte values
-    case DW_FORM_data8:
-    case DW_FORM_ref8:
-    case DW_FORM_ref_sig8:
-      *offset_ptr += 8;
-      return true;
-
-    // signed or unsigned LEB 128 values
-    //  case DW_FORM_APPLE_db_str:
-    case DW_FORM_sdata:
-    case DW_FORM_udata:
-    case DW_FORM_ref_udata:
-    case DW_FORM_GNU_str_index:
-    case DW_FORM_GNU_addr_index:
-      debug_info_data.getULEB128(offset_ptr);
-      return true;
-
-    case DW_FORM_indirect:
-      indirect = true;
-      form = debug_info_data.getULEB128(offset_ptr);
-      break;
-
-    // FIXME: 4 for DWARF32, 8 for DWARF64.
-    case DW_FORM_sec_offset:
-      *offset_ptr += 4;
-      return true;
-
-    default:
-      return false;
-    }
-  } while (indirect);
-  return true;
-}
-
-void
-DWARFFormValue::dump(raw_ostream &OS, const DWARFUnit *cu) const {
-  uint64_t uvalue = Value.uval;
-  bool cu_relative_offset = false;
-
-  switch (Form) {
-  case DW_FORM_addr:      OS << format("0x%016" PRIx64, uvalue); break;
-  case DW_FORM_GNU_addr_index: {
-    OS << format(" indexed (%8.8x) address = ", (uint32_t)uvalue);
-    uint64_t Address;
-    if (cu->getAddrOffsetSectionItem(uvalue, Address))
-      OS << format("0x%016" PRIx64, Address);
-    else
-      OS << "<no .debug_addr section>";
-    break;
-  }
-  case DW_FORM_flag_present: OS << "true"; break;
-  case DW_FORM_flag:
-  case DW_FORM_data1:     OS << format("0x%02x", (uint8_t)uvalue); break;
-  case DW_FORM_data2:     OS << format("0x%04x", (uint16_t)uvalue); break;
-  case DW_FORM_data4:     OS << format("0x%08x", (uint32_t)uvalue); break;
-  case DW_FORM_ref_sig8:
-  case DW_FORM_data8:     OS << format("0x%016" PRIx64, uvalue); break;
-  case DW_FORM_string:
-    OS << '"';
-    OS.write_escaped(Value.cstr);
-    OS << '"';
-    break;
-  case DW_FORM_exprloc:
-  case DW_FORM_block:
-  case DW_FORM_block1:
-  case DW_FORM_block2:
-  case DW_FORM_block4:
-    if (uvalue > 0) {
-      switch (Form) {
-      case DW_FORM_exprloc:
-      case DW_FORM_block:  OS << format("<0x%" PRIx64 "> ", uvalue);     break;
-      case DW_FORM_block1: OS << format("<0x%2.2x> ", (uint8_t)uvalue);  break;
-      case DW_FORM_block2: OS << format("<0x%4.4x> ", (uint16_t)uvalue); break;
-      case DW_FORM_block4: OS << format("<0x%8.8x> ", (uint32_t)uvalue); break;
-      default: break;
-      }
-
-      const uint8_t* data_ptr = Value.data;
-      if (data_ptr) {
-        // uvalue contains size of block
-        const uint8_t* end_data_ptr = data_ptr + uvalue;
-        while (data_ptr < end_data_ptr) {
-          OS << format("%2.2x ", *data_ptr);
-          ++data_ptr;
-        }
-      }
-      else
-        OS << "NULL";
-    }
-    break;
-
-  case DW_FORM_sdata:     OS << Value.sval; break;
-  case DW_FORM_udata:     OS << Value.uval; break;
-  case DW_FORM_strp: {
-    OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)uvalue);
-    Optional<const char *> DbgStr = getAsCString(cu);
-    if (DbgStr.hasValue()) {
-      OS << '"';
-      OS.write_escaped(DbgStr.getValue());
-      OS << '"';
-    }
-    break;
-  }
-  case DW_FORM_GNU_str_index: {
-    OS << format(" indexed (%8.8x) string = ", (uint32_t)uvalue);
-    Optional<const char *> DbgStr = getAsCString(cu);
-    if (DbgStr.hasValue()) {
-      OS << '"';
-      OS.write_escaped(DbgStr.getValue());
-      OS << '"';
-    }
-    break;
-  }
-  case DW_FORM_ref_addr:
-    OS << format("0x%016" PRIx64, uvalue);
-    break;
-  case DW_FORM_ref1:
-    cu_relative_offset = true;
-    OS << format("cu + 0x%2.2x", (uint8_t)uvalue);
-    break;
-  case DW_FORM_ref2:
-    cu_relative_offset = true;
-    OS << format("cu + 0x%4.4x", (uint16_t)uvalue);
-    break;
-  case DW_FORM_ref4:
-    cu_relative_offset = true;
-    OS << format("cu + 0x%4.4x", (uint32_t)uvalue);
-    break;
-  case DW_FORM_ref8:
-    cu_relative_offset = true;
-    OS << format("cu + 0x%8.8" PRIx64, uvalue);
-    break;
-  case DW_FORM_ref_udata:
-    cu_relative_offset = true;
-    OS << format("cu + 0x%" PRIx64, uvalue);
-    break;
-
-    // All DW_FORM_indirect attributes should be resolved prior to calling
-    // this function
-  case DW_FORM_indirect:
-    OS << "DW_FORM_indirect";
-    break;
-
-    // Should be formatted to 64-bit for DWARF64.
-  case DW_FORM_sec_offset:
-    OS << format("0x%08x", (uint32_t)uvalue);
-    break;
-
-  default:
-    OS << format("DW_FORM(0x%4.4x)", Form);
-    break;
-  }
-
-  if (cu_relative_offset)
-    OS << format(" => {0x%8.8" PRIx64 "}", uvalue + (cu ? cu->getOffset() : 0));
-}
-
-Optional<const char *> DWARFFormValue::getAsCString(const DWARFUnit *U) const {
-  if (!isFormClass(FC_String))
-    return None;
-  if (Form == DW_FORM_string)
-    return Value.cstr;
-  if (!U)
-    return None;
-  uint32_t Offset = Value.uval;
-  if (Form == DW_FORM_GNU_str_index) {
-    uint32_t StrOffset;
-    if (!U->getStringOffsetSectionItem(Offset, StrOffset))
-      return None;
-    Offset = StrOffset;
-  }
-  if (const char *Str = U->getStringExtractor().getCStr(&Offset)) {
-    return Str;
-  }
-  return None;
-}
-
-Optional<uint64_t> DWARFFormValue::getAsAddress(const DWARFUnit *U) const {
-  if (!isFormClass(FC_Address))
-    return None;
-  if (Form == DW_FORM_GNU_addr_index) {
-    uint32_t Index = Value.uval;
-    uint64_t Result;
-    if (!U || !U->getAddrOffsetSectionItem(Index, Result))
-      return None;
-    return Result;
-  }
-  return Value.uval;
-}
-
-Optional<uint64_t> DWARFFormValue::getAsReference(const DWARFUnit *U) const {
-  if (!isFormClass(FC_Reference))
-    return None;
-  switch (Form) {
-  case DW_FORM_ref1:
-  case DW_FORM_ref2:
-  case DW_FORM_ref4:
-  case DW_FORM_ref8:
-  case DW_FORM_ref_udata:
-    if (!U)
-      return None;
-    return Value.uval + U->getOffset();
-  case DW_FORM_ref_addr:
-    return Value.uval;
-  // FIXME: Add proper support for DW_FORM_ref_sig8
-  default:
-    return Value.uval;
-  }
-}
-
-Optional<uint64_t> DWARFFormValue::getAsSectionOffset() const {
-  if (!isFormClass(FC_SectionOffset))
-    return None;
-  return Value.uval;
-}
-
-Optional<uint64_t> DWARFFormValue::getAsUnsignedConstant() const {
-  if ((!isFormClass(FC_Constant) && !isFormClass(FC_Flag))
-      || Form == DW_FORM_sdata)
-    return None;
-  return Value.uval;
-}
-
-Optional<ArrayRef<uint8_t>> DWARFFormValue::getAsBlock() const {
-  if (!isFormClass(FC_Block) && !isFormClass(FC_Exprloc))
-    return None;
-  return ArrayRef<uint8_t>(Value.data, Value.uval);
-}
-
diff --git a/lib/DebugInfo/DWARFSection.h b/lib/DebugInfo/DWARFSection.h
deleted file mode 100644
index 3aaf0ff..0000000
--- a/lib/DebugInfo/DWARFSection.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===-- DWARFSection.h ------------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_DEBUGINFO_DWARFSECTION_H
-#define LLVM_LIB_DEBUGINFO_DWARFSECTION_H
-
-#include "DWARFRelocMap.h"
-
-namespace llvm {
-
-struct DWARFSection {
-  StringRef Data;
-  RelocAddrMap Relocs;
-};
-
-}
-
-#endif
diff --git a/lib/DebugInfo/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARFTypeUnit.cpp
deleted file mode 100644
index 303bf70..0000000
--- a/lib/DebugInfo/DWARFTypeUnit.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===-- DWARFTypeUnit.cpp -------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFTypeUnit.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-bool DWARFTypeUnit::extractImpl(DataExtractor debug_info,
-                                uint32_t *offset_ptr) {
-  if (!DWARFUnit::extractImpl(debug_info, offset_ptr))
-    return false;
-  TypeHash = debug_info.getU64(offset_ptr);
-  TypeOffset = debug_info.getU32(offset_ptr);
-  return TypeOffset < getLength();
-}
-
-void DWARFTypeUnit::dump(raw_ostream &OS) {
-  OS << format("0x%08x", getOffset()) << ": Type Unit:"
-     << " length = " << format("0x%08x", getLength())
-     << " version = " << format("0x%04x", getVersion())
-     << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
-     << " addr_size = " << format("0x%02x", getAddressByteSize())
-     << " type_signature = " << format("0x%16" PRIx64, TypeHash)
-     << " type_offset = " << format("0x%04x", TypeOffset)
-     << " (next unit at " << format("0x%08x", getNextUnitOffset())
-     << ")\n";
-
-  const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
-  assert(CU && "Null Compile Unit?");
-  CU->dump(OS, this, -1U);
-}
diff --git a/lib/DebugInfo/DWARFTypeUnit.h b/lib/DebugInfo/DWARFTypeUnit.h
deleted file mode 100644
index 7471b5a..0000000
--- a/lib/DebugInfo/DWARFTypeUnit.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===-- DWARFTypeUnit.h -----------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_DEBUGINFO_DWARFTYPEUNIT_H
-#define LLVM_LIB_DEBUGINFO_DWARFTYPEUNIT_H
-
-#include "DWARFUnit.h"
-
-namespace llvm {
-
-class DWARFTypeUnit : public DWARFUnit {
-private:
-  uint64_t TypeHash;
-  uint32_t TypeOffset;
-public:
-  DWARFTypeUnit(DWARFContext &Context, const DWARFSection &Section,
-                const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
-                StringRef SOS, StringRef AOS, bool LE,
-                const DWARFUnitSectionBase &UnitSection)
-      : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LE, UnitSection) {}
-  uint32_t getHeaderSize() const override {
-    return DWARFUnit::getHeaderSize() + 12;
-  }
-  void dump(raw_ostream &OS);
-protected:
-  bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) override;
-};
-
-}
-
-#endif
-
diff --git a/lib/DebugInfo/DWARFUnit.cpp b/lib/DebugInfo/DWARFUnit.cpp
deleted file mode 100644
index 82c4529..0000000
--- a/lib/DebugInfo/DWARFUnit.cpp
+++ /dev/null
@@ -1,377 +0,0 @@
-//===-- DWARFUnit.cpp -----------------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFUnit.h"
-#include "DWARFContext.h"
-#include "llvm/DebugInfo/DWARFFormValue.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Path.h"
-#include <cstdio>
-
-using namespace llvm;
-using namespace dwarf;
-
-void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) {
-  parseImpl(C, Section, C.getDebugAbbrev(), C.getRangeSection(),
-            C.getStringSection(), StringRef(), C.getAddrSection(),
-            C.isLittleEndian());
-}
-
-void DWARFUnitSectionBase::parseDWO(DWARFContext &C,
-                                    const DWARFSection &DWOSection) {
-  parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), C.getRangeDWOSection(),
-            C.getStringDWOSection(), C.getStringOffsetDWOSection(),
-            C.getAddrSection(), C.isLittleEndian());
-}
-
-DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
-                     const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
-                     StringRef SOS, StringRef AOS, bool LE,
-                     const DWARFUnitSectionBase &UnitSection)
-    : Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS),
-      StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS),
-      isLittleEndian(LE), UnitSection(UnitSection) {
-  clear();
-}
-
-DWARFUnit::~DWARFUnit() {
-}
-
-bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index,
-                                                uint64_t &Result) const {
-  uint32_t Offset = AddrOffsetSectionBase + Index * AddrSize;
-  if (AddrOffsetSection.size() < Offset + AddrSize)
-    return false;
-  DataExtractor DA(AddrOffsetSection, isLittleEndian, AddrSize);
-  Result = DA.getAddress(&Offset);
-  return true;
-}
-
-bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
-                                                  uint32_t &Result) const {
-  // FIXME: string offset section entries are 8-byte for DWARF64.
-  const uint32_t ItemSize = 4;
-  uint32_t Offset = Index * ItemSize;
-  if (StringOffsetSection.size() < Offset + ItemSize)
-    return false;
-  DataExtractor DA(StringOffsetSection, isLittleEndian, 0);
-  Result = DA.getU32(&Offset);
-  return true;
-}
-
-bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
-  Length = debug_info.getU32(offset_ptr);
-  Version = debug_info.getU16(offset_ptr);
-  uint64_t AbbrOffset = debug_info.getU32(offset_ptr);
-  AddrSize = debug_info.getU8(offset_ptr);
-
-  bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
-  bool VersionOK = DWARFContext::isSupportedVersion(Version);
-  bool AddrSizeOK = AddrSize == 4 || AddrSize == 8;
-
-  if (!LengthOK || !VersionOK || !AddrSizeOK)
-    return false;
-
-  Abbrevs = Abbrev->getAbbreviationDeclarationSet(AbbrOffset);
-  if (Abbrevs == nullptr)
-    return false;
-
-  return true;
-}
-
-bool DWARFUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
-  clear();
-
-  Offset = *offset_ptr;
-
-  if (debug_info.isValidOffset(*offset_ptr)) {
-    if (extractImpl(debug_info, offset_ptr))
-      return true;
-
-    // reset the offset to where we tried to parse from if anything went wrong
-    *offset_ptr = Offset;
-  }
-
-  return false;
-}
-
-bool DWARFUnit::extractRangeList(uint32_t RangeListOffset,
-                                        DWARFDebugRangeList &RangeList) const {
-  // Require that compile unit is extracted.
-  assert(DieArray.size() > 0);
-  DataExtractor RangesData(RangeSection, isLittleEndian, AddrSize);
-  uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset;
-  return RangeList.extract(RangesData, &ActualRangeListOffset);
-}
-
-void DWARFUnit::clear() {
-  Offset = 0;
-  Length = 0;
-  Version = 0;
-  Abbrevs = nullptr;
-  AddrSize = 0;
-  BaseAddr = 0;
-  RangeSectionBase = 0;
-  AddrOffsetSectionBase = 0;
-  clearDIEs(false);
-  DWO.reset();
-}
-
-const char *DWARFUnit::getCompilationDir() {
-  extractDIEsIfNeeded(true);
-  if (DieArray.empty())
-    return nullptr;
-  return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, nullptr);
-}
-
-uint64_t DWARFUnit::getDWOId() {
-  extractDIEsIfNeeded(true);
-  const uint64_t FailValue = -1ULL;
-  if (DieArray.empty())
-    return FailValue;
-  return DieArray[0]
-      .getAttributeValueAsUnsignedConstant(this, DW_AT_GNU_dwo_id, FailValue);
-}
-
-void DWARFUnit::setDIERelations() {
-  if (DieArray.size() <= 1)
-    return;
-
-  std::vector<DWARFDebugInfoEntryMinimal *> ParentChain;
-  DWARFDebugInfoEntryMinimal *SiblingChain = nullptr;
-  for (auto &DIE : DieArray) {
-    if (SiblingChain) {
-      SiblingChain->setSibling(&DIE);
-    }
-    if (const DWARFAbbreviationDeclaration *AbbrDecl =
-            DIE.getAbbreviationDeclarationPtr()) {
-      // Normal DIE.
-      if (AbbrDecl->hasChildren()) {
-        ParentChain.push_back(&DIE);
-        SiblingChain = nullptr;
-      } else {
-        SiblingChain = &DIE;
-      }
-    } else {
-      // NULL entry terminates the sibling chain.
-      SiblingChain = ParentChain.back();
-      ParentChain.pop_back();
-    }
-  }
-  assert(SiblingChain == nullptr || SiblingChain == &DieArray[0]);
-  assert(ParentChain.empty());
-}
-
-void DWARFUnit::extractDIEsToVector(
-    bool AppendCUDie, bool AppendNonCUDies,
-    std::vector<DWARFDebugInfoEntryMinimal> &Dies) const {
-  if (!AppendCUDie && !AppendNonCUDies)
-    return;
-
-  // Set the offset to that of the first DIE and calculate the start of the
-  // next compilation unit header.
-  uint32_t DIEOffset = Offset + getHeaderSize();
-  uint32_t NextCUOffset = getNextUnitOffset();
-  DWARFDebugInfoEntryMinimal DIE;
-  uint32_t Depth = 0;
-  bool IsCUDie = true;
-
-  while (DIEOffset < NextCUOffset && DIE.extractFast(this, &DIEOffset)) {
-    if (IsCUDie) {
-      if (AppendCUDie)
-        Dies.push_back(DIE);
-      if (!AppendNonCUDies)
-        break;
-      // The average bytes per DIE entry has been seen to be
-      // around 14-20 so let's pre-reserve the needed memory for
-      // our DIE entries accordingly.
-      Dies.reserve(Dies.size() + getDebugInfoSize() / 14);
-      IsCUDie = false;
-    } else {
-      Dies.push_back(DIE);
-    }
-
-    if (const DWARFAbbreviationDeclaration *AbbrDecl =
-            DIE.getAbbreviationDeclarationPtr()) {
-      // Normal DIE
-      if (AbbrDecl->hasChildren())
-        ++Depth;
-    } else {
-      // NULL DIE.
-      if (Depth > 0)
-        --Depth;
-      if (Depth == 0)
-        break;  // We are done with this compile unit!
-    }
-  }
-
-  // Give a little bit of info if we encounter corrupt DWARF (our offset
-  // should always terminate at or before the start of the next compilation
-  // unit header).
-  if (DIEOffset > NextCUOffset)
-    fprintf(stderr, "warning: DWARF compile unit extends beyond its "
-                    "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), DIEOffset);
-}
-
-size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
-  if ((CUDieOnly && DieArray.size() > 0) ||
-      DieArray.size() > 1)
-    return 0; // Already parsed.
-
-  bool HasCUDie = DieArray.size() > 0;
-  extractDIEsToVector(!HasCUDie, !CUDieOnly, DieArray);
-
-  if (DieArray.empty())
-    return 0;
-
-  // If CU DIE was just parsed, copy several attribute values from it.
-  if (!HasCUDie) {
-    uint64_t BaseAddr =
-        DieArray[0].getAttributeValueAsAddress(this, DW_AT_low_pc, -1ULL);
-    if (BaseAddr == -1ULL)
-      BaseAddr = DieArray[0].getAttributeValueAsAddress(this, DW_AT_entry_pc, 0);
-    setBaseAddress(BaseAddr);
-    AddrOffsetSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
-        this, DW_AT_GNU_addr_base, 0);
-    RangeSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
-        this, DW_AT_ranges_base, 0);
-    // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for
-    // skeleton CU DIE, so that DWARF users not aware of it are not broken.
-  }
-
-  setDIERelations();
-  return DieArray.size();
-}
-
-DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath)
-    : DWOFile(), DWOContext(), DWOU(nullptr) {
-  auto Obj = object::ObjectFile::createObjectFile(DWOPath);
-  if (!Obj)
-    return;
-  DWOFile = std::move(Obj.get());
-  DWOContext.reset(
-      cast<DWARFContext>(DIContext::getDWARFContext(*DWOFile.getBinary())));
-  if (DWOContext->getNumDWOCompileUnits() > 0)
-    DWOU = DWOContext->getDWOCompileUnitAtIndex(0);
-}
-
-bool DWARFUnit::parseDWO() {
-  if (DWO.get())
-    return false;
-  extractDIEsIfNeeded(true);
-  if (DieArray.empty())
-    return false;
-  const char *DWOFileName =
-      DieArray[0].getAttributeValueAsString(this, DW_AT_GNU_dwo_name, nullptr);
-  if (!DWOFileName)
-    return false;
-  const char *CompilationDir =
-      DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, nullptr);
-  SmallString<16> AbsolutePath;
-  if (sys::path::is_relative(DWOFileName) && CompilationDir != nullptr) {
-    sys::path::append(AbsolutePath, CompilationDir);
-  }
-  sys::path::append(AbsolutePath, DWOFileName);
-  DWO = llvm::make_unique<DWOHolder>(AbsolutePath);
-  DWARFUnit *DWOCU = DWO->getUnit();
-  // Verify that compile unit in .dwo file is valid.
-  if (!DWOCU || DWOCU->getDWOId() != getDWOId()) {
-    DWO.reset();
-    return false;
-  }
-  // Share .debug_addr and .debug_ranges section with compile unit in .dwo
-  DWOCU->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase);
-  uint32_t DWORangesBase = DieArray[0].getRangesBaseAttribute(this, 0);
-  DWOCU->setRangesSection(RangeSection, DWORangesBase);
-  return true;
-}
-
-void DWARFUnit::clearDIEs(bool KeepCUDie) {
-  if (DieArray.size() > (unsigned)KeepCUDie) {
-    // std::vectors never get any smaller when resized to a smaller size,
-    // or when clear() or erase() are called, the size will report that it
-    // is smaller, but the memory allocated remains intact (call capacity()
-    // to see this). So we need to create a temporary vector and swap the
-    // contents which will cause just the internal pointers to be swapped
-    // so that when temporary vector goes out of scope, it will destroy the
-    // contents.
-    std::vector<DWARFDebugInfoEntryMinimal> TmpArray;
-    DieArray.swap(TmpArray);
-    // Save at least the compile unit DIE
-    if (KeepCUDie)
-      DieArray.push_back(TmpArray.front());
-  }
-}
-
-void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) {
-  // First, check if CU DIE describes address ranges for the unit.
-  const auto &CUDIERanges = getCompileUnitDIE()->getAddressRanges(this);
-  if (!CUDIERanges.empty()) {
-    CURanges.insert(CURanges.end(), CUDIERanges.begin(), CUDIERanges.end());
-    return;
-  }
-
-  // This function is usually called if there in no .debug_aranges section
-  // in order to produce a compile unit level set of address ranges that
-  // is accurate. If the DIEs weren't parsed, then we don't want all dies for
-  // all compile units to stay loaded when they weren't needed. So we can end
-  // up parsing the DWARF and then throwing them all away to keep memory usage
-  // down.
-  const bool ClearDIEs = extractDIEsIfNeeded(false) > 1;
-  DieArray[0].collectChildrenAddressRanges(this, CURanges);
-
-  // Collect address ranges from DIEs in .dwo if necessary.
-  bool DWOCreated = parseDWO();
-  if (DWO.get())
-    DWO->getUnit()->collectAddressRanges(CURanges);
-  if (DWOCreated)
-    DWO.reset();
-
-  // Keep memory down by clearing DIEs if this generate function
-  // caused them to be parsed.
-  if (ClearDIEs)
-    clearDIEs(true);
-}
-
-const DWARFDebugInfoEntryMinimal *
-DWARFUnit::getSubprogramForAddress(uint64_t Address) {
-  extractDIEsIfNeeded(false);
-  for (const DWARFDebugInfoEntryMinimal &DIE : DieArray) {
-    if (DIE.isSubprogramDIE() &&
-        DIE.addressRangeContainsAddress(this, Address)) {
-      return &DIE;
-    }
-  }
-  return nullptr;
-}
-
-DWARFDebugInfoEntryInlinedChain
-DWARFUnit::getInlinedChainForAddress(uint64_t Address) {
-  // First, find a subprogram that contains the given address (the root
-  // of inlined chain).
-  const DWARFUnit *ChainCU = nullptr;
-  const DWARFDebugInfoEntryMinimal *SubprogramDIE =
-      getSubprogramForAddress(Address);
-  if (SubprogramDIE) {
-    ChainCU = this;
-  } else {
-    // Try to look for subprogram DIEs in the DWO file.
-    parseDWO();
-    if (DWO.get()) {
-      SubprogramDIE = DWO->getUnit()->getSubprogramForAddress(Address);
-      if (SubprogramDIE)
-        ChainCU = DWO->getUnit();
-    }
-  }
-
-  // Get inlined chain rooted at this subprogram DIE.
-  if (!SubprogramDIE)
-    return DWARFDebugInfoEntryInlinedChain();
-  return SubprogramDIE->getInlinedChainForAddress(ChainCU, Address);
-}
diff --git a/lib/DebugInfo/DWARFUnit.h b/lib/DebugInfo/DWARFUnit.h
deleted file mode 100644
index 786f00f..0000000
--- a/lib/DebugInfo/DWARFUnit.h
+++ /dev/null
@@ -1,245 +0,0 @@
-//===-- DWARFUnit.h ---------------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_DEBUGINFO_DWARFUNIT_H
-#define LLVM_LIB_DEBUGINFO_DWARFUNIT_H
-
-#include "DWARFDebugAbbrev.h"
-#include "DWARFDebugInfoEntry.h"
-#include "DWARFDebugRangeList.h"
-#include "DWARFRelocMap.h"
-#include "DWARFSection.h"
-#include <vector>
-
-namespace llvm {
-
-namespace object {
-class ObjectFile;
-}
-
-class DWARFContext;
-class DWARFDebugAbbrev;
-class DWARFUnit;
-class StringRef;
-class raw_ostream;
-
-/// Base class for all DWARFUnitSection classes. This provides the
-/// functionality common to all unit types.
-class DWARFUnitSectionBase {
-public:
-  /// Returns the Unit that contains the given section offset in the
-  /// same section this Unit originated from.
-  virtual DWARFUnit *getUnitForOffset(uint32_t Offset) const = 0;
-
-  void parse(DWARFContext &C, const DWARFSection &Section);
-  void parseDWO(DWARFContext &C, const DWARFSection &DWOSection);
-
-protected:
-  virtual void parseImpl(DWARFContext &Context, const DWARFSection &Section,
-                         const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
-                         StringRef SOS, StringRef AOS, bool isLittleEndian) = 0;
-
-  ~DWARFUnitSectionBase() {}
-};
-
-/// Concrete instance of DWARFUnitSection, specialized for one Unit type.
-template<typename UnitType>
-class DWARFUnitSection final : public SmallVector<std::unique_ptr<UnitType>, 1>,
-                               public DWARFUnitSectionBase {
-
-  struct UnitOffsetComparator {
-    bool operator()(uint32_t LHS,
-                    const std::unique_ptr<UnitType> &RHS) const {
-      return LHS < RHS->getNextUnitOffset();
-    }
-  };
-
-  bool Parsed;
-
-public:
-  DWARFUnitSection() : Parsed(false) {}
-  DWARFUnitSection(DWARFUnitSection &&DUS) :
-    SmallVector<std::unique_ptr<UnitType>, 1>(std::move(DUS)), Parsed(DUS.Parsed) {}
-
-  typedef llvm::SmallVectorImpl<std::unique_ptr<UnitType>> UnitVector;
-  typedef typename UnitVector::iterator iterator;
-  typedef llvm::iterator_range<typename UnitVector::iterator> iterator_range;
-
-  UnitType *getUnitForOffset(uint32_t Offset) const override {
-    auto *CU = std::upper_bound(this->begin(), this->end(), Offset,
-                                UnitOffsetComparator());
-    if (CU != this->end())
-      return CU->get();
-    return nullptr;
-  }
-
-private:
-  void parseImpl(DWARFContext &Context, const DWARFSection &Section,
-                 const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
-                 StringRef SOS, StringRef AOS, bool LE) override {
-    if (Parsed)
-      return;
-    DataExtractor Data(Section.Data, LE, 0);
-    uint32_t Offset = 0;
-    while (Data.isValidOffset(Offset)) {
-      auto U = llvm::make_unique<UnitType>(Context, Section, DA, RS, SS, SOS,
-                                           AOS, LE, *this);
-      if (!U->extract(Data, &Offset))
-        break;
-      this->push_back(std::move(U));
-      Offset = this->back()->getNextUnitOffset();
-    }
-    Parsed = true;
-  }
-};
-
-class DWARFUnit {
-  DWARFContext &Context;
-  // Section containing this DWARFUnit.
-  const DWARFSection &InfoSection;
-
-  const DWARFDebugAbbrev *Abbrev;
-  StringRef RangeSection;
-  uint32_t RangeSectionBase;
-  StringRef StringSection;
-  StringRef StringOffsetSection;
-  StringRef AddrOffsetSection;
-  uint32_t AddrOffsetSectionBase;
-  bool isLittleEndian;
-  const DWARFUnitSectionBase &UnitSection;
-
-  uint32_t Offset;
-  uint32_t Length;
-  uint16_t Version;
-  const DWARFAbbreviationDeclarationSet *Abbrevs;
-  uint8_t AddrSize;
-  uint64_t BaseAddr;
-  // The compile unit debug information entry items.
-  std::vector<DWARFDebugInfoEntryMinimal> DieArray;
-
-  class DWOHolder {
-    object::OwningBinary<object::ObjectFile> DWOFile;
-    std::unique_ptr<DWARFContext> DWOContext;
-    DWARFUnit *DWOU;
-  public:
-    DWOHolder(StringRef DWOPath);
-    DWARFUnit *getUnit() const { return DWOU; }
-  };
-  std::unique_ptr<DWOHolder> DWO;
-
-protected:
-  virtual bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr);
-  /// Size in bytes of the unit header.
-  virtual uint32_t getHeaderSize() const { return 11; }
-
-public:
-  DWARFUnit(DWARFContext &Context, const DWARFSection &Section,
-            const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
-            StringRef SOS, StringRef AOS, bool LE,
-            const DWARFUnitSectionBase &UnitSection);
-
-  virtual ~DWARFUnit();
-
-  DWARFContext& getContext() const { return Context; }
-
-  StringRef getStringSection() const { return StringSection; }
-  StringRef getStringOffsetSection() const { return StringOffsetSection; }
-  void setAddrOffsetSection(StringRef AOS, uint32_t Base) {
-    AddrOffsetSection = AOS;
-    AddrOffsetSectionBase = Base;
-  }
-  void setRangesSection(StringRef RS, uint32_t Base) {
-    RangeSection = RS;
-    RangeSectionBase = Base;
-  }
-
-  bool getAddrOffsetSectionItem(uint32_t Index, uint64_t &Result) const;
-  // FIXME: Result should be uint64_t in DWARF64.
-  bool getStringOffsetSectionItem(uint32_t Index, uint32_t &Result) const;
-
-  DataExtractor getDebugInfoExtractor() const {
-    return DataExtractor(InfoSection.Data, isLittleEndian, AddrSize);
-  }
-  DataExtractor getStringExtractor() const {
-    return DataExtractor(StringSection, false, 0);
-  }
-
-  const RelocAddrMap *getRelocMap() const { return &InfoSection.Relocs; }
-
-  bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
-
-  /// extractRangeList - extracts the range list referenced by this compile
-  /// unit from .debug_ranges section. Returns true on success.
-  /// Requires that compile unit is already extracted.
-  bool extractRangeList(uint32_t RangeListOffset,
-                        DWARFDebugRangeList &RangeList) const;
-  void clear();
-  uint32_t getOffset() const { return Offset; }
-  uint32_t getNextUnitOffset() const { return Offset + Length + 4; }
-  uint32_t getLength() const { return Length; }
-  uint16_t getVersion() const { return Version; }
-  const DWARFAbbreviationDeclarationSet *getAbbreviations() const {
-    return Abbrevs;
-  }
-  uint8_t getAddressByteSize() const { return AddrSize; }
-  uint64_t getBaseAddress() const { return BaseAddr; }
-
-  void setBaseAddress(uint64_t base_addr) {
-    BaseAddr = base_addr;
-  }
-
-  const DWARFDebugInfoEntryMinimal *
-  getCompileUnitDIE(bool extract_cu_die_only = true) {
-    extractDIEsIfNeeded(extract_cu_die_only);
-    return DieArray.empty() ? nullptr : &DieArray[0];
-  }
-
-  const char *getCompilationDir();
-  uint64_t getDWOId();
-
-  void collectAddressRanges(DWARFAddressRangesVector &CURanges);
-
-  /// getInlinedChainForAddress - fetches inlined chain for a given address.
-  /// Returns empty chain if there is no subprogram containing address. The
-  /// chain is valid as long as parsed compile unit DIEs are not cleared.
-  DWARFDebugInfoEntryInlinedChain getInlinedChainForAddress(uint64_t Address);
-
-  /// getUnitSection - Return the DWARFUnitSection containing this unit.
-  const DWARFUnitSectionBase &getUnitSection() const { return UnitSection; }
-
-private:
-  /// Size in bytes of the .debug_info data associated with this compile unit.
-  size_t getDebugInfoSize() const { return Length + 4 - getHeaderSize(); }
-
-  /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
-  /// hasn't already been done. Returns the number of DIEs parsed at this call.
-  size_t extractDIEsIfNeeded(bool CUDieOnly);
-  /// extractDIEsToVector - Appends all parsed DIEs to a vector.
-  void extractDIEsToVector(bool AppendCUDie, bool AppendNonCUDIEs,
-                           std::vector<DWARFDebugInfoEntryMinimal> &DIEs) const;
-  /// setDIERelations - We read in all of the DIE entries into our flat list
-  /// of DIE entries and now we need to go back through all of them and set the
-  /// parent, sibling and child pointers for quick DIE navigation.
-  void setDIERelations();
-  /// clearDIEs - Clear parsed DIEs to keep memory usage low.
-  void clearDIEs(bool KeepCUDie);
-
-  /// parseDWO - Parses .dwo file for current compile unit. Returns true if
-  /// it was actually constructed.
-  bool parseDWO();
-
-  /// getSubprogramForAddress - Returns subprogram DIE with address range
-  /// encompassing the provided address. The pointer is alive as long as parsed
-  /// compile unit DIEs are not cleared.
-  const DWARFDebugInfoEntryMinimal *getSubprogramForAddress(uint64_t Address);
-};
-
-}
-
-#endif
diff --git a/lib/DebugInfo/LLVMBuild.txt b/lib/DebugInfo/LLVMBuild.txt
index f347d5e..7a8e8ba 100644
--- a/lib/DebugInfo/LLVMBuild.txt
+++ b/lib/DebugInfo/LLVMBuild.txt
@@ -15,8 +15,10 @@
 ;
 ;===------------------------------------------------------------------------===;
 
+[common]
+subdirectories = DWARF PDB
+
 [component_0]
-type = Library
+type = Group
 name = DebugInfo
-parent = Libraries
-required_libraries = Object Support
+parent = $ROOT
diff --git a/lib/DebugInfo/Makefile b/lib/DebugInfo/Makefile
index 1292b57..27a5e1f 100644
--- a/lib/DebugInfo/Makefile
+++ b/lib/DebugInfo/Makefile
@@ -6,9 +6,10 @@
 # License. See LICENSE.TXT for details.
 #
 ##===----------------------------------------------------------------------===##
-
 LEVEL = ../..
-LIBRARYNAME = LLVMDebugInfo
-BUILD_ARCHIVE := 1
 
-include $(LEVEL)/Makefile.common
+include $(LEVEL)/Makefile.config
+
+PARALLEL_DIRS := DWARF PDB
+
+include $(LEVEL)/Makefile.common
+\ No newline at end of file
diff --git a/lib/DebugInfo/PDB/Android.mk b/lib/DebugInfo/PDB/Android.mk
new file mode 100644
index 0000000..c8d4fd1
--- /dev/null
+++ b/lib/DebugInfo/PDB/Android.mk
@@ -0,0 +1,75 @@
+LOCAL_PATH:= $(call my-dir)
+
+# No dia support
+debuginfo_pdb_SRC_FILES := \
+  IPDBSourceFile.cpp \
+  PDB.cpp \
+  PDBExtras.cpp \
+  PDBInterfaceAnchors.cpp \
+  PDBSymbolAnnotation.cpp \
+  PDBSymbolBlock.cpp \
+  PDBSymbolCompiland.cpp \
+  PDBSymbolCompilandDetails.cpp \
+  PDBSymbolCompilandEnv.cpp \
+  PDBSymbol.cpp \
+  PDBSymbolCustom.cpp \
+  PDBSymbolData.cpp \
+  PDBSymbolExe.cpp \
+  PDBSymbolFunc.cpp \
+  PDBSymbolFuncDebugEnd.cpp \
+  PDBSymbolFuncDebugStart.cpp \
+  PDBSymbolLabel.cpp \
+  PDBSymbolPublicSymbol.cpp \
+  PDBSymbolThunk.cpp \
+  PDBSymbolTypeArray.cpp \
+  PDBSymbolTypeBaseClass.cpp \
+  PDBSymbolTypeBuiltin.cpp \
+  PDBSymbolTypeCustom.cpp \
+  PDBSymbolTypeDimension.cpp \
+  PDBSymbolTypeEnum.cpp \
+  PDBSymbolTypeFriend.cpp \
+  PDBSymbolTypeFunctionArg.cpp \
+  PDBSymbolTypeFunctionSig.cpp \
+  PDBSymbolTypeManaged.cpp \
+  PDBSymbolTypePointer.cpp \
+  PDBSymbolTypeTypedef.cpp \
+  PDBSymbolTypeUDT.cpp \
+  PDBSymbolTypeVTable.cpp \
+  PDBSymbolTypeVTableShape.cpp \
+  PDBSymbolUnknown.cpp \
+  PDBSymbolUsingNamespace.cpp \
+  PDBSymDumper.cpp
+
+# For the host
+# =====================================================
+include $(CLEAR_VARS)
+
+REQUIRES_RTTI := 1
+
+LOCAL_SRC_FILES := $(debuginfo_pdb_SRC_FILES)
+
+LOCAL_MODULE:= libLLVMDebugInfoPDB
+
+LOCAL_MODULE_TAGS := optional
+
+include $(LLVM_HOST_BUILD_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
+include $(BUILD_HOST_STATIC_LIBRARY)
+
+# For the device
+# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
+include $(CLEAR_VARS)
+
+REQUIRES_RTTI := 1
+
+LOCAL_SRC_FILES := $(debuginfo_pdb_SRC_FILES)
+
+LOCAL_MODULE:= libLLVMDebugInfoPDB
+
+LOCAL_MODULE_TAGS := optional
+
+include $(LLVM_DEVICE_BUILD_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
+include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/DebugInfo/PDB/CMakeLists.txt b/lib/DebugInfo/PDB/CMakeLists.txt
new file mode 100644
index 0000000..87e357e
--- /dev/null
+++ b/lib/DebugInfo/PDB/CMakeLists.txt
@@ -0,0 +1,76 @@
+macro(add_pdb_impl_folder group)
+  list(APPEND PDB_IMPL_SOURCES ${ARGN})
+  source_group(${group} FILES ${ARGN})
+endmacro()
+
+if(HAVE_DIA_SDK)
+  include_directories(${MSVC_DIA_SDK_DIR}/include)
+  set(LIBPDB_LINK_FOLDERS "${MSVC_DIA_SDK_DIR}\\lib")
+  if (CMAKE_SIZEOF_VOID_P EQUAL 8)
+    set(LIBPDB_LINK_FOLDERS "${LIBPDB_LINK_FOLDERS}\\amd64")
+  endif()
+  set(LIBPDB_ADDITIONAL_LIBRARIES "${LIBPDB_LINK_FOLDERS}\\diaguids.lib")
+
+  add_pdb_impl_folder(DIA
+    DIA/DIADataStream.cpp
+    DIA/DIAEnumDebugStreams.cpp
+    DIA/DIAEnumLineNumbers.cpp
+    DIA/DIAEnumSourceFiles.cpp
+    DIA/DIAEnumSymbols.cpp
+    DIA/DIALineNumber.cpp
+    DIA/DIARawSymbol.cpp
+    DIA/DIASession.cpp
+    DIA/DIASourceFile.cpp
+    )
+
+    set(LIBPDB_ADDITIONAL_HEADER_DIRS "${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/PDB/DIA")
+
+endif()
+
+list(APPEND LIBPDB_ADDITIONAL_HEADER_DIRS "${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/PDB")
+
+add_llvm_library(LLVMDebugInfoPDB
+  IPDBSourceFile.cpp
+  PDB.cpp
+  PDBExtras.cpp
+  PDBInterfaceAnchors.cpp
+  PDBSymbol.cpp
+  PDBSymbolAnnotation.cpp
+  PDBSymbolBlock.cpp
+  PDBSymbolCompiland.cpp
+  PDBSymbolCompilandDetails.cpp
+  PDBSymbolCompilandEnv.cpp
+  PDBSymbolCustom.cpp
+  PDBSymbolData.cpp
+  PDBSymbolExe.cpp
+  PDBSymbolFunc.cpp
+  PDBSymbolFuncDebugEnd.cpp
+  PDBSymbolFuncDebugStart.cpp
+  PDBSymbolLabel.cpp
+  PDBSymbolPublicSymbol.cpp
+  PDBSymbolThunk.cpp
+  PDBSymbolTypeArray.cpp
+  PDBSymbolTypeBaseClass.cpp
+  PDBSymbolTypeBuiltin.cpp
+  PDBSymbolTypeCustom.cpp
+  PDBSymbolTypeDimension.cpp
+  PDBSymbolTypeEnum.cpp
+  PDBSymbolTypeFriend.cpp
+  PDBSymbolTypeFunctionArg.cpp
+  PDBSymbolTypeFunctionSig.cpp
+  PDBSymbolTypeManaged.cpp
+  PDBSymbolTypePointer.cpp
+  PDBSymbolTypeTypedef.cpp
+  PDBSymbolTypeUDT.cpp
+  PDBSymbolTypeVTable.cpp
+  PDBSymbolTypeVTableShape.cpp
+  PDBSymbolUnknown.cpp
+  PDBSymbolUsingNamespace.cpp
+  PDBSymDumper.cpp
+  ${PDB_IMPL_SOURCES}
+
+  ADDITIONAL_HEADER_DIRS
+  ${LIBPDB_ADDITIONAL_HEADER_DIRS}
+  )
+
+target_link_libraries(LLVMDebugInfoPDB ${cmake_2_8_12_INTERFACE} "${LIBPDB_ADDITIONAL_LIBRARIES}")
diff --git a/lib/DebugInfo/PDB/DIA/DIADataStream.cpp b/lib/DebugInfo/PDB/DIA/DIADataStream.cpp
new file mode 100644
index 0000000..e0e1b27
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIADataStream.cpp
@@ -0,0 +1,73 @@
+//===- DIADataStream.cpp - DIA implementation of IPDBDataStream -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIADataStream.h"
+#include "llvm/Support/ConvertUTF.h"
+
+using namespace llvm;
+
+DIADataStream::DIADataStream(CComPtr<IDiaEnumDebugStreamData> DiaStreamData)
+    : StreamData(DiaStreamData) {}
+
+uint32_t DIADataStream::getRecordCount() const {
+  LONG Count = 0;
+  return (S_OK == StreamData->get_Count(&Count)) ? Count : 0;
+}
+
+std::string DIADataStream::getName() const {
+  CComBSTR Name16;
+  if (S_OK != StreamData->get_name(&Name16))
+    return std::string();
+
+  std::string Name8;
+  llvm::ArrayRef<char> Name16Bytes(reinterpret_cast<char *>(Name16.m_str),
+                                   Name16.ByteLength());
+  if (!llvm::convertUTF16ToUTF8String(Name16Bytes, Name8))
+    return std::string();
+  return Name8;
+}
+
+llvm::Optional<DIADataStream::RecordType>
+DIADataStream::getItemAtIndex(uint32_t Index) const {
+  RecordType Record;
+  DWORD RecordSize = 0;
+  StreamData->Item(Index, 0, &RecordSize, nullptr);
+  if (RecordSize == 0)
+    return llvm::Optional<RecordType>();
+
+  Record.resize(RecordSize);
+  if (S_OK != StreamData->Item(Index, RecordSize, &RecordSize, &Record[0]))
+    return llvm::Optional<RecordType>();
+  return Record;
+}
+
+bool DIADataStream::getNext(RecordType &Record) {
+  Record.clear();
+  DWORD RecordSize = 0;
+  ULONG CountFetched = 0;
+  StreamData->Next(1, 0, &RecordSize, nullptr, &CountFetched);
+  if (RecordSize == 0)
+    return false;
+
+  Record.resize(RecordSize);
+  if (S_OK ==
+      StreamData->Next(1, RecordSize, &RecordSize, &Record[0], &CountFetched))
+    return false;
+  return true;
+}
+
+void DIADataStream::reset() { StreamData->Reset(); }
+
+DIADataStream *DIADataStream::clone() const {
+  CComPtr<IDiaEnumDebugStreamData> EnumeratorClone;
+  if (S_OK != StreamData->Clone(&EnumeratorClone))
+    return nullptr;
+
+  return new DIADataStream(EnumeratorClone);
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp
new file mode 100644
index 0000000..23c6489
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp
@@ -0,0 +1,53 @@
+//==- DIAEnumDebugStreams.cpp - DIA Debug Stream Enumerator impl -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/DIA/DIADataStream.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
+
+using namespace llvm;
+
+DIAEnumDebugStreams::DIAEnumDebugStreams(
+    CComPtr<IDiaEnumDebugStreams> DiaEnumerator)
+    : Enumerator(DiaEnumerator) {}
+
+uint32_t DIAEnumDebugStreams::getChildCount() const {
+  LONG Count = 0;
+  return (S_OK == Enumerator->get_Count(&Count)) ? Count : 0;
+}
+
+std::unique_ptr<IPDBDataStream>
+DIAEnumDebugStreams::getChildAtIndex(uint32_t Index) const {
+  CComPtr<IDiaEnumDebugStreamData> Item;
+  VARIANT VarIndex;
+  VarIndex.vt = VT_I4;
+  VarIndex.lVal = Index;
+  if (S_OK != Enumerator->Item(VarIndex, &Item))
+    return nullptr;
+
+  return std::unique_ptr<IPDBDataStream>(new DIADataStream(Item));
+}
+
+std::unique_ptr<IPDBDataStream> DIAEnumDebugStreams::getNext() {
+  CComPtr<IDiaEnumDebugStreamData> Item;
+  ULONG NumFetched = 0;
+  if (S_OK != Enumerator->Next(1, &Item, &NumFetched))
+    return nullptr;
+
+  return std::unique_ptr<IPDBDataStream>(new DIADataStream(Item));
+}
+
+void DIAEnumDebugStreams::reset() { Enumerator->Reset(); }
+
+DIAEnumDebugStreams *DIAEnumDebugStreams::clone() const {
+  CComPtr<IDiaEnumDebugStreams> EnumeratorClone;
+  if (S_OK != Enumerator->Clone(&EnumeratorClone))
+    return nullptr;
+  return new DIAEnumDebugStreams(EnumeratorClone);
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp
new file mode 100644
index 0000000..32a9af2
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp
@@ -0,0 +1,50 @@
+//==- DIAEnumLineNumbers.cpp - DIA Line Number Enumerator impl ---*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
+#include "llvm/DebugInfo/PDB/DIA/DIALineNumber.h"
+
+using namespace llvm;
+
+DIAEnumLineNumbers::DIAEnumLineNumbers(
+    CComPtr<IDiaEnumLineNumbers> DiaEnumerator)
+    : Enumerator(DiaEnumerator) {}
+
+uint32_t DIAEnumLineNumbers::getChildCount() const {
+  LONG Count = 0;
+  return (S_OK == Enumerator->get_Count(&Count)) ? Count : 0;
+}
+
+std::unique_ptr<IPDBLineNumber>
+DIAEnumLineNumbers::getChildAtIndex(uint32_t Index) const {
+  CComPtr<IDiaLineNumber> Item;
+  if (S_OK != Enumerator->Item(Index, &Item))
+    return nullptr;
+
+  return std::unique_ptr<IPDBLineNumber>(new DIALineNumber(Item));
+}
+
+std::unique_ptr<IPDBLineNumber> DIAEnumLineNumbers::getNext() {
+  CComPtr<IDiaLineNumber> Item;
+  ULONG NumFetched = 0;
+  if (S_OK != Enumerator->Next(1, &Item, &NumFetched))
+    return nullptr;
+
+  return std::unique_ptr<IPDBLineNumber>(new DIALineNumber(Item));
+}
+
+void DIAEnumLineNumbers::reset() { Enumerator->Reset(); }
+
+DIAEnumLineNumbers *DIAEnumLineNumbers::clone() const {
+  CComPtr<IDiaEnumLineNumbers> EnumeratorClone;
+  if (S_OK != Enumerator->Clone(&EnumeratorClone))
+    return nullptr;
+  return new DIAEnumLineNumbers(EnumeratorClone);
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp
new file mode 100644
index 0000000..1a94610
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp
@@ -0,0 +1,50 @@
+//==- DIAEnumSourceFiles.cpp - DIA Source File Enumerator impl ---*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumSourceFiles.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASourceFile.h"
+
+using namespace llvm;
+
+DIAEnumSourceFiles::DIAEnumSourceFiles(
+    const DIASession &PDBSession, CComPtr<IDiaEnumSourceFiles> DiaEnumerator)
+    : Session(PDBSession), Enumerator(DiaEnumerator) {}
+
+uint32_t DIAEnumSourceFiles::getChildCount() const {
+  LONG Count = 0;
+  return (S_OK == Enumerator->get_Count(&Count)) ? Count : 0;
+}
+
+std::unique_ptr<IPDBSourceFile>
+DIAEnumSourceFiles::getChildAtIndex(uint32_t Index) const {
+  CComPtr<IDiaSourceFile> Item;
+  if (S_OK != Enumerator->Item(Index, &Item))
+    return nullptr;
+
+  return std::unique_ptr<IPDBSourceFile>(new DIASourceFile(Session, Item));
+}
+
+std::unique_ptr<IPDBSourceFile> DIAEnumSourceFiles::getNext() {
+  CComPtr<IDiaSourceFile> Item;
+  ULONG NumFetched = 0;
+  if (S_OK != Enumerator->Next(1, &Item, &NumFetched))
+    return nullptr;
+
+  return std::unique_ptr<IPDBSourceFile>(new DIASourceFile(Session, Item));
+}
+
+void DIAEnumSourceFiles::reset() { Enumerator->Reset(); }
+
+DIAEnumSourceFiles *DIAEnumSourceFiles::clone() const {
+  CComPtr<IDiaEnumSourceFiles> EnumeratorClone;
+  if (S_OK != Enumerator->Clone(&EnumeratorClone))
+    return nullptr;
+  return new DIAEnumSourceFiles(Session, EnumeratorClone);
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp
new file mode 100644
index 0000000..6754d9a
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp
@@ -0,0 +1,54 @@
+//==- DIAEnumSymbols.cpp - DIA Symbol Enumerator impl ------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h"
+#include "llvm/DebugInfo/PDB/DIA/DIARawSymbol.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+
+using namespace llvm;
+
+DIAEnumSymbols::DIAEnumSymbols(const DIASession &PDBSession,
+                               CComPtr<IDiaEnumSymbols> DiaEnumerator)
+    : Session(PDBSession), Enumerator(DiaEnumerator) {}
+
+uint32_t DIAEnumSymbols::getChildCount() const {
+  LONG Count = 0;
+  return (S_OK == Enumerator->get_Count(&Count)) ? Count : 0;
+}
+
+std::unique_ptr<PDBSymbol>
+DIAEnumSymbols::getChildAtIndex(uint32_t Index) const {
+  CComPtr<IDiaSymbol> Item;
+  if (S_OK != Enumerator->Item(Index, &Item))
+    return nullptr;
+
+  std::unique_ptr<DIARawSymbol> RawSymbol(new DIARawSymbol(Session, Item));
+  return std::unique_ptr<PDBSymbol>(PDBSymbol::create(Session, std::move(RawSymbol)));
+}
+
+std::unique_ptr<PDBSymbol> DIAEnumSymbols::getNext() {
+  CComPtr<IDiaSymbol> Item;
+  ULONG NumFetched = 0;
+  if (S_OK != Enumerator->Next(1, &Item, &NumFetched))
+    return nullptr;
+
+  std::unique_ptr<DIARawSymbol> RawSymbol(new DIARawSymbol(Session, Item));
+  return std::unique_ptr<PDBSymbol>(
+      PDBSymbol::create(Session, std::move(RawSymbol)));
+}
+
+void DIAEnumSymbols::reset() { Enumerator->Reset(); }
+
+DIAEnumSymbols *DIAEnumSymbols::clone() const {
+  CComPtr<IDiaEnumSymbols> EnumeratorClone;
+  if (S_OK != Enumerator->Clone(&EnumeratorClone))
+    return nullptr;
+  return new DIAEnumSymbols(Session, EnumeratorClone);
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIALineNumber.cpp b/lib/DebugInfo/PDB/DIA/DIALineNumber.cpp
new file mode 100644
index 0000000..c5577f1
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIALineNumber.cpp
@@ -0,0 +1,75 @@
+//===- DIALineNumber.cpp - DIA implementation of IPDBLineNumber -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIALineNumber.h"
+
+using namespace llvm;
+
+DIALineNumber::DIALineNumber(CComPtr<IDiaLineNumber> DiaLineNumber)
+    : LineNumber(DiaLineNumber) {}
+
+uint32_t DIALineNumber::getLineNumber() const {
+  DWORD Line = 0;
+  return (S_OK == LineNumber->get_lineNumber(&Line)) ? Line : 0;
+}
+
+uint32_t DIALineNumber::getLineNumberEnd() const {
+  DWORD LineEnd = 0;
+  return (S_OK == LineNumber->get_lineNumberEnd(&LineEnd)) ? LineEnd : 0;
+}
+
+uint32_t DIALineNumber::getColumnNumber() const {
+  DWORD Column = 0;
+  return (S_OK == LineNumber->get_columnNumber(&Column)) ? Column : 0;
+}
+
+uint32_t DIALineNumber::getColumnNumberEnd() const {
+  DWORD ColumnEnd = 0;
+  return (S_OK == LineNumber->get_columnNumberEnd(&ColumnEnd)) ? ColumnEnd : 0;
+}
+
+uint32_t DIALineNumber::getAddressSection() const {
+  DWORD Section = 0;
+  return (S_OK == LineNumber->get_addressSection(&Section)) ? Section : 0;
+}
+
+uint32_t DIALineNumber::getAddressOffset() const {
+  DWORD Offset = 0;
+  return (S_OK == LineNumber->get_addressOffset(&Offset)) ? Offset : 0;
+}
+
+uint32_t DIALineNumber::getRelativeVirtualAddress() const {
+  DWORD RVA = 0;
+  return (S_OK == LineNumber->get_relativeVirtualAddress(&RVA)) ? RVA : 0;
+}
+
+uint64_t DIALineNumber::getVirtualAddress() const {
+  ULONGLONG Addr = 0;
+  return (S_OK == LineNumber->get_virtualAddress(&Addr)) ? Addr : 0;
+}
+
+uint32_t DIALineNumber::getLength() const {
+  DWORD Length = 0;
+  return (S_OK == LineNumber->get_length(&Length)) ? Length : 0;
+}
+
+uint32_t DIALineNumber::getSourceFileId() const {
+  DWORD Id = 0;
+  return (S_OK == LineNumber->get_sourceFileId(&Id)) ? Id : 0;
+}
+
+uint32_t DIALineNumber::getCompilandId() const {
+  DWORD Id = 0;
+  return (S_OK == LineNumber->get_compilandId(&Id)) ? Id : 0;
+}
+
+bool DIALineNumber::isStatement() const {
+  BOOL Statement = 0;
+  return (S_OK == LineNumber->get_statement(&Statement)) ? Statement : false;
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
new file mode 100644
index 0000000..abe0ab5
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
@@ -0,0 +1,1095 @@
+//===- DIARawSymbol.cpp - DIA implementation of IPDBRawSymbol ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h"
+#include "llvm/DebugInfo/PDB/DIA/DIARawSymbol.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+Variant VariantFromVARIANT(const VARIANT &V) {
+  Variant Result;
+  switch (V.vt) {
+  case VT_I1:
+    Result.Int8 = V.cVal;
+    Result.Type = PDB_VariantType::Int8;
+    break;
+  case VT_I2:
+    Result.Int16 = V.iVal;
+    Result.Type = PDB_VariantType::Int16;
+    break;
+  case VT_I4:
+    Result.Int32 = V.intVal;
+    Result.Type = PDB_VariantType::Int32;
+    break;
+  case VT_I8:
+    Result.Int64 = V.llVal;
+    Result.Type = PDB_VariantType::Int64;
+    break;
+  case VT_UI1:
+    Result.UInt8 = V.bVal;
+    Result.Type = PDB_VariantType::UInt8;
+    break;
+  case VT_UI2:
+    Result.UInt16 = V.uiVal;
+    Result.Type = PDB_VariantType::UInt16;
+    break;
+  case VT_UI4:
+    Result.UInt32 = V.uintVal;
+    Result.Type = PDB_VariantType::UInt32;
+    break;
+  case VT_UI8:
+    Result.UInt64 = V.ullVal;
+    Result.Type = PDB_VariantType::UInt64;
+    break;
+  case VT_BOOL:
+    Result.Bool = (V.boolVal == VARIANT_TRUE) ? true : false;
+    Result.Type = PDB_VariantType::Bool;
+    break;
+  case VT_R4:
+    Result.Single = V.fltVal;
+    Result.Type = PDB_VariantType::Single;
+    break;
+  case VT_R8:
+    Result.Double = V.dblVal;
+    Result.Type = PDB_VariantType::Double;
+    break;
+  default:
+    Result.Type = PDB_VariantType::Unknown;
+    break;
+  }
+  return Result;
+}
+
+template <typename ArgType>
+ArgType PrivateGetDIAValue(IDiaSymbol *Symbol,
+                           HRESULT (__stdcall IDiaSymbol::*Method)(ArgType *)) {
+  ArgType Value;
+  if (S_OK == (Symbol->*Method)(&Value))
+    return static_cast<ArgType>(Value);
+
+  return ArgType();
+}
+
+template <typename ArgType, typename RetType>
+RetType PrivateGetDIAValue(IDiaSymbol *Symbol,
+                           HRESULT (__stdcall IDiaSymbol::*Method)(ArgType *)) {
+  ArgType Value;
+  if (S_OK == (Symbol->*Method)(&Value))
+    return static_cast<RetType>(Value);
+
+  return RetType();
+}
+
+std::string
+PrivateGetDIAValue(IDiaSymbol *Symbol,
+                   HRESULT (__stdcall IDiaSymbol::*Method)(BSTR *)) {
+  CComBSTR Result16;
+  if (S_OK != (Symbol->*Method)(&Result16))
+    return std::string();
+
+  const char *SrcBytes = reinterpret_cast<const char *>(Result16.m_str);
+  llvm::ArrayRef<char> SrcByteArray(SrcBytes, Result16.ByteLength());
+  std::string Result8;
+  if (!llvm::convertUTF16ToUTF8String(SrcByteArray, Result8))
+    return std::string();
+  return Result8;
+}
+
+PDB_UniqueId
+PrivateGetDIAValue(IDiaSymbol *Symbol,
+                   HRESULT (__stdcall IDiaSymbol::*Method)(GUID *)) {
+  GUID Result;
+  if (S_OK != (Symbol->*Method)(&Result))
+    return PDB_UniqueId();
+
+  static_assert(sizeof(PDB_UniqueId) == sizeof(GUID),
+                "PDB_UniqueId is the wrong size!");
+  PDB_UniqueId IdResult;
+  ::memcpy(&IdResult, &Result, sizeof(GUID));
+  return IdResult;
+}
+
+template <typename ArgType>
+void DumpDIAValue(llvm::raw_ostream &OS, int Indent, StringRef Name,
+                  IDiaSymbol *Symbol,
+                  HRESULT (__stdcall IDiaSymbol::*Method)(ArgType *)) {
+  ArgType Value;
+  if (S_OK == (Symbol->*Method)(&Value)) {
+    OS << "\n";
+    OS.indent(Indent);
+    OS << Name << ": " << Value;
+  }
+}
+
+void DumpDIAValue(llvm::raw_ostream &OS, int Indent, StringRef Name,
+                  IDiaSymbol *Symbol,
+                  HRESULT (__stdcall IDiaSymbol::*Method)(BSTR *)) {
+  BSTR Value = nullptr;
+  if (S_OK != (Symbol->*Method)(&Value))
+    return;
+  const char *Bytes = reinterpret_cast<const char *>(Value);
+  ArrayRef<char> ByteArray(Bytes, ::SysStringByteLen(Value));
+  std::string Result;
+  if (llvm::convertUTF16ToUTF8String(ByteArray, Result)) {
+    OS << "\n";
+    OS.indent(Indent);
+    OS << Name << ": " << Result;
+  }
+  ::SysFreeString(Value);
+}
+
+void DumpDIAValue(llvm::raw_ostream &OS, int Indent, StringRef Name,
+                  IDiaSymbol *Symbol,
+                  HRESULT (__stdcall IDiaSymbol::*Method)(VARIANT *)) {
+  VARIANT Value;
+  Value.vt = VT_EMPTY;
+  if (S_OK != (Symbol->*Method)(&Value))
+    return;
+  OS << "\n";
+  OS.indent(Indent);
+  Variant V = VariantFromVARIANT(Value);
+  OS << V;
+}
+}
+
+namespace llvm {
+raw_ostream &operator<<(raw_ostream &OS, const GUID &Guid) {
+  const PDB_UniqueId *Id = reinterpret_cast<const PDB_UniqueId *>(&Guid);
+  OS << *Id;
+  return OS;
+}
+}
+
+DIARawSymbol::DIARawSymbol(const DIASession &PDBSession,
+                           CComPtr<IDiaSymbol> DiaSymbol)
+    : Session(PDBSession), Symbol(DiaSymbol) {}
+
+#define RAW_METHOD_DUMP(Stream, Method)                                        \
+  DumpDIAValue(Stream, Indent, StringRef(#Method), Symbol, &IDiaSymbol::Method);
+
+void DIARawSymbol::dump(raw_ostream &OS, int Indent) const {
+  RAW_METHOD_DUMP(OS, get_access)
+  RAW_METHOD_DUMP(OS, get_addressOffset)
+  RAW_METHOD_DUMP(OS, get_addressSection)
+  RAW_METHOD_DUMP(OS, get_age)
+  RAW_METHOD_DUMP(OS, get_arrayIndexTypeId)
+  RAW_METHOD_DUMP(OS, get_backEndMajor)
+  RAW_METHOD_DUMP(OS, get_backEndMinor)
+  RAW_METHOD_DUMP(OS, get_backEndBuild)
+  RAW_METHOD_DUMP(OS, get_backEndQFE)
+  RAW_METHOD_DUMP(OS, get_baseDataOffset)
+  RAW_METHOD_DUMP(OS, get_baseDataSlot)
+  RAW_METHOD_DUMP(OS, get_baseSymbolId)
+  RAW_METHOD_DUMP(OS, get_baseType)
+  RAW_METHOD_DUMP(OS, get_bitPosition)
+  RAW_METHOD_DUMP(OS, get_callingConvention)
+  RAW_METHOD_DUMP(OS, get_classParentId)
+  RAW_METHOD_DUMP(OS, get_compilerName)
+  RAW_METHOD_DUMP(OS, get_count)
+  RAW_METHOD_DUMP(OS, get_countLiveRanges)
+  RAW_METHOD_DUMP(OS, get_frontEndMajor)
+  RAW_METHOD_DUMP(OS, get_frontEndMinor)
+  RAW_METHOD_DUMP(OS, get_frontEndBuild)
+  RAW_METHOD_DUMP(OS, get_frontEndQFE)
+  RAW_METHOD_DUMP(OS, get_lexicalParentId)
+  RAW_METHOD_DUMP(OS, get_libraryName)
+  RAW_METHOD_DUMP(OS, get_liveRangeStartAddressOffset)
+  RAW_METHOD_DUMP(OS, get_liveRangeStartAddressSection)
+  RAW_METHOD_DUMP(OS, get_liveRangeStartRelativeVirtualAddress)
+  RAW_METHOD_DUMP(OS, get_localBasePointerRegisterId)
+  RAW_METHOD_DUMP(OS, get_lowerBoundId)
+  RAW_METHOD_DUMP(OS, get_memorySpaceKind)
+  RAW_METHOD_DUMP(OS, get_name)
+  RAW_METHOD_DUMP(OS, get_numberOfAcceleratorPointerTags)
+  RAW_METHOD_DUMP(OS, get_numberOfColumns)
+  RAW_METHOD_DUMP(OS, get_numberOfModifiers)
+  RAW_METHOD_DUMP(OS, get_numberOfRegisterIndices)
+  RAW_METHOD_DUMP(OS, get_numberOfRows)
+  RAW_METHOD_DUMP(OS, get_objectFileName)
+  RAW_METHOD_DUMP(OS, get_oemId)
+  RAW_METHOD_DUMP(OS, get_oemSymbolId)
+  RAW_METHOD_DUMP(OS, get_offsetInUdt)
+  RAW_METHOD_DUMP(OS, get_platform)
+  RAW_METHOD_DUMP(OS, get_rank)
+  RAW_METHOD_DUMP(OS, get_registerId)
+  RAW_METHOD_DUMP(OS, get_registerType)
+  RAW_METHOD_DUMP(OS, get_relativeVirtualAddress)
+  RAW_METHOD_DUMP(OS, get_samplerSlot)
+  RAW_METHOD_DUMP(OS, get_signature)
+  RAW_METHOD_DUMP(OS, get_sizeInUdt)
+  RAW_METHOD_DUMP(OS, get_slot)
+  RAW_METHOD_DUMP(OS, get_sourceFileName)
+  RAW_METHOD_DUMP(OS, get_stride)
+  RAW_METHOD_DUMP(OS, get_subTypeId)
+  RAW_METHOD_DUMP(OS, get_symbolsFileName)
+  RAW_METHOD_DUMP(OS, get_symIndexId)
+  RAW_METHOD_DUMP(OS, get_targetOffset)
+  RAW_METHOD_DUMP(OS, get_targetRelativeVirtualAddress)
+  RAW_METHOD_DUMP(OS, get_targetVirtualAddress)
+  RAW_METHOD_DUMP(OS, get_targetSection)
+  RAW_METHOD_DUMP(OS, get_textureSlot)
+  RAW_METHOD_DUMP(OS, get_timeStamp)
+  RAW_METHOD_DUMP(OS, get_token)
+  RAW_METHOD_DUMP(OS, get_typeId)
+  RAW_METHOD_DUMP(OS, get_uavSlot)
+  RAW_METHOD_DUMP(OS, get_undecoratedName)
+  RAW_METHOD_DUMP(OS, get_unmodifiedTypeId)
+  RAW_METHOD_DUMP(OS, get_upperBoundId)
+  RAW_METHOD_DUMP(OS, get_virtualBaseDispIndex)
+  RAW_METHOD_DUMP(OS, get_virtualBaseOffset)
+  RAW_METHOD_DUMP(OS, get_virtualTableShapeId)
+  RAW_METHOD_DUMP(OS, get_dataKind)
+  RAW_METHOD_DUMP(OS, get_symTag)
+  RAW_METHOD_DUMP(OS, get_guid)
+  RAW_METHOD_DUMP(OS, get_offset)
+  RAW_METHOD_DUMP(OS, get_thisAdjust)
+  RAW_METHOD_DUMP(OS, get_virtualBasePointerOffset)
+  RAW_METHOD_DUMP(OS, get_locationType)
+  RAW_METHOD_DUMP(OS, get_machineType)
+  RAW_METHOD_DUMP(OS, get_thunkOrdinal)
+  RAW_METHOD_DUMP(OS, get_length)
+  RAW_METHOD_DUMP(OS, get_liveRangeLength)
+  RAW_METHOD_DUMP(OS, get_virtualAddress)
+  RAW_METHOD_DUMP(OS, get_udtKind)
+  RAW_METHOD_DUMP(OS, get_constructor)
+  RAW_METHOD_DUMP(OS, get_customCallingConvention)
+  RAW_METHOD_DUMP(OS, get_farReturn)
+  RAW_METHOD_DUMP(OS, get_code)
+  RAW_METHOD_DUMP(OS, get_compilerGenerated)
+  RAW_METHOD_DUMP(OS, get_constType)
+  RAW_METHOD_DUMP(OS, get_editAndContinueEnabled)
+  RAW_METHOD_DUMP(OS, get_function)
+  RAW_METHOD_DUMP(OS, get_stride)
+  RAW_METHOD_DUMP(OS, get_noStackOrdering)
+  RAW_METHOD_DUMP(OS, get_hasAlloca)
+  RAW_METHOD_DUMP(OS, get_hasAssignmentOperator)
+  RAW_METHOD_DUMP(OS, get_isCTypes)
+  RAW_METHOD_DUMP(OS, get_hasCastOperator)
+  RAW_METHOD_DUMP(OS, get_hasDebugInfo)
+  RAW_METHOD_DUMP(OS, get_hasEH)
+  RAW_METHOD_DUMP(OS, get_hasEHa)
+  RAW_METHOD_DUMP(OS, get_hasInlAsm)
+  RAW_METHOD_DUMP(OS, get_framePointerPresent)
+  RAW_METHOD_DUMP(OS, get_inlSpec)
+  RAW_METHOD_DUMP(OS, get_interruptReturn)
+  RAW_METHOD_DUMP(OS, get_hasLongJump)
+  RAW_METHOD_DUMP(OS, get_hasManagedCode)
+  RAW_METHOD_DUMP(OS, get_hasNestedTypes)
+  RAW_METHOD_DUMP(OS, get_noInline)
+  RAW_METHOD_DUMP(OS, get_noReturn)
+  RAW_METHOD_DUMP(OS, get_optimizedCodeDebugInfo)
+  RAW_METHOD_DUMP(OS, get_overloadedOperator)
+  RAW_METHOD_DUMP(OS, get_hasSEH)
+  RAW_METHOD_DUMP(OS, get_hasSecurityChecks)
+  RAW_METHOD_DUMP(OS, get_hasSetJump)
+  RAW_METHOD_DUMP(OS, get_strictGSCheck)
+  RAW_METHOD_DUMP(OS, get_isAcceleratorGroupSharedLocal)
+  RAW_METHOD_DUMP(OS, get_isAcceleratorPointerTagLiveRange)
+  RAW_METHOD_DUMP(OS, get_isAcceleratorStubFunction)
+  RAW_METHOD_DUMP(OS, get_isAggregated)
+  RAW_METHOD_DUMP(OS, get_intro)
+  RAW_METHOD_DUMP(OS, get_isCVTCIL)
+  RAW_METHOD_DUMP(OS, get_isConstructorVirtualBase)
+  RAW_METHOD_DUMP(OS, get_isCxxReturnUdt)
+  RAW_METHOD_DUMP(OS, get_isDataAligned)
+  RAW_METHOD_DUMP(OS, get_isHLSLData)
+  RAW_METHOD_DUMP(OS, get_isHotpatchable)
+  RAW_METHOD_DUMP(OS, get_indirectVirtualBaseClass)
+  RAW_METHOD_DUMP(OS, get_isInterfaceUdt)
+  RAW_METHOD_DUMP(OS, get_intrinsic)
+  RAW_METHOD_DUMP(OS, get_isLTCG)
+  RAW_METHOD_DUMP(OS, get_isLocationControlFlowDependent)
+  RAW_METHOD_DUMP(OS, get_isMSILNetmodule)
+  RAW_METHOD_DUMP(OS, get_isMatrixRowMajor)
+  RAW_METHOD_DUMP(OS, get_managed)
+  RAW_METHOD_DUMP(OS, get_msil)
+  RAW_METHOD_DUMP(OS, get_isMultipleInheritance)
+  RAW_METHOD_DUMP(OS, get_isNaked)
+  RAW_METHOD_DUMP(OS, get_nested)
+  RAW_METHOD_DUMP(OS, get_isOptimizedAway)
+  RAW_METHOD_DUMP(OS, get_packed)
+  RAW_METHOD_DUMP(OS, get_isPointerBasedOnSymbolValue)
+  RAW_METHOD_DUMP(OS, get_isPointerToDataMember)
+  RAW_METHOD_DUMP(OS, get_isPointerToMemberFunction)
+  RAW_METHOD_DUMP(OS, get_pure)
+  RAW_METHOD_DUMP(OS, get_RValueReference)
+  RAW_METHOD_DUMP(OS, get_isRefUdt)
+  RAW_METHOD_DUMP(OS, get_reference)
+  RAW_METHOD_DUMP(OS, get_restrictedType)
+  RAW_METHOD_DUMP(OS, get_isReturnValue)
+  RAW_METHOD_DUMP(OS, get_isSafeBuffers)
+  RAW_METHOD_DUMP(OS, get_scoped)
+  RAW_METHOD_DUMP(OS, get_isSdl)
+  RAW_METHOD_DUMP(OS, get_isSingleInheritance)
+  RAW_METHOD_DUMP(OS, get_isSplitted)
+  RAW_METHOD_DUMP(OS, get_isStatic)
+  RAW_METHOD_DUMP(OS, get_isStripped)
+  RAW_METHOD_DUMP(OS, get_unalignedType)
+  RAW_METHOD_DUMP(OS, get_notReached)
+  RAW_METHOD_DUMP(OS, get_isValueUdt)
+  RAW_METHOD_DUMP(OS, get_virtual)
+  RAW_METHOD_DUMP(OS, get_virtualBaseClass)
+  RAW_METHOD_DUMP(OS, get_isVirtualInheritance)
+  RAW_METHOD_DUMP(OS, get_volatileType)
+  RAW_METHOD_DUMP(OS, get_wasInlined)
+  RAW_METHOD_DUMP(OS, get_unused)
+  RAW_METHOD_DUMP(OS, get_value)
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+DIARawSymbol::findChildren(PDB_SymType Type) const {
+  enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
+
+  CComPtr<IDiaEnumSymbols> DiaEnumerator;
+  if (S_OK != Symbol->findChildrenEx(EnumVal, nullptr, nsNone, &DiaEnumerator))
+    return nullptr;
+
+  return llvm::make_unique<DIAEnumSymbols>(Session, DiaEnumerator);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+DIARawSymbol::findChildren(PDB_SymType Type, StringRef Name,
+                           PDB_NameSearchFlags Flags) const {
+  llvm::SmallVector<UTF16, 32> Name16;
+  llvm::convertUTF8ToUTF16String(Name, Name16);
+
+  enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
+  DWORD CompareFlags = static_cast<DWORD>(Flags);
+  wchar_t *Name16Str = reinterpret_cast<wchar_t *>(Name16.data());
+
+  CComPtr<IDiaEnumSymbols> DiaEnumerator;
+  if (S_OK !=
+      Symbol->findChildrenEx(EnumVal, Name16Str, CompareFlags, &DiaEnumerator))
+    return nullptr;
+
+  return llvm::make_unique<DIAEnumSymbols>(Session, DiaEnumerator);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+DIARawSymbol::findChildrenByRVA(PDB_SymType Type, StringRef Name,
+                                PDB_NameSearchFlags Flags, uint32_t RVA) const {
+  llvm::SmallVector<UTF16, 32> Name16;
+  llvm::convertUTF8ToUTF16String(Name, Name16);
+
+  enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
+  DWORD CompareFlags = static_cast<DWORD>(Flags);
+  wchar_t *Name16Str = reinterpret_cast<wchar_t *>(Name16.data());
+
+  CComPtr<IDiaEnumSymbols> DiaEnumerator;
+  if (S_OK !=
+      Symbol->findChildrenExByRVA(EnumVal, Name16Str, CompareFlags, RVA,
+                                  &DiaEnumerator))
+    return nullptr;
+
+  return llvm::make_unique<DIAEnumSymbols>(Session, DiaEnumerator);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+DIARawSymbol::findInlineFramesByRVA(uint32_t RVA) const {
+  CComPtr<IDiaEnumSymbols> DiaEnumerator;
+  if (S_OK != Symbol->findInlineFramesByRVA(RVA, &DiaEnumerator))
+    return nullptr;
+
+  return llvm::make_unique<DIAEnumSymbols>(Session, DiaEnumerator);
+}
+
+void DIARawSymbol::getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) const {
+  bytes.clear();
+
+  DWORD DataSize = 0;
+  Symbol->get_dataBytes(0, &DataSize, nullptr);
+  if (DataSize == 0)
+    return;
+
+  bytes.resize(DataSize);
+  Symbol->get_dataBytes(DataSize, &DataSize, bytes.data());
+}
+
+PDB_MemberAccess DIARawSymbol::getAccess() const {
+  return PrivateGetDIAValue<DWORD, PDB_MemberAccess>(Symbol,
+                                                     &IDiaSymbol::get_access);
+}
+
+uint32_t DIARawSymbol::getAddressOffset() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_addressOffset);
+}
+
+uint32_t DIARawSymbol::getAddressSection() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_addressSection);
+}
+
+uint32_t DIARawSymbol::getAge() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_age);
+}
+
+uint32_t DIARawSymbol::getArrayIndexTypeId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_arrayIndexTypeId);
+}
+
+void DIARawSymbol::getBackEndVersion(VersionInfo &Version) const {
+  Version.Major = PrivateGetDIAValue(Symbol, &IDiaSymbol::get_backEndMajor);
+  Version.Minor = PrivateGetDIAValue(Symbol, &IDiaSymbol::get_backEndMinor);
+  Version.Build = PrivateGetDIAValue(Symbol, &IDiaSymbol::get_backEndBuild);
+  Version.QFE = PrivateGetDIAValue(Symbol, &IDiaSymbol::get_backEndQFE);
+}
+
+uint32_t DIARawSymbol::getBaseDataOffset() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_baseDataOffset);
+}
+
+uint32_t DIARawSymbol::getBaseDataSlot() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_baseDataSlot);
+}
+
+uint32_t DIARawSymbol::getBaseSymbolId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_baseSymbolId);
+}
+
+PDB_BuiltinType DIARawSymbol::getBuiltinType() const {
+  return PrivateGetDIAValue<DWORD, PDB_BuiltinType>(Symbol,
+                                                    &IDiaSymbol::get_baseType);
+}
+
+uint32_t DIARawSymbol::getBitPosition() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_bitPosition);
+}
+
+PDB_CallingConv DIARawSymbol::getCallingConvention() const {
+  return PrivateGetDIAValue<DWORD, PDB_CallingConv>(
+      Symbol, &IDiaSymbol::get_callingConvention);
+}
+
+uint32_t DIARawSymbol::getClassParentId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_classParentId);
+}
+
+std::string DIARawSymbol::getCompilerName() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_compilerName);
+}
+
+uint32_t DIARawSymbol::getCount() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_count);
+}
+
+uint32_t DIARawSymbol::getCountLiveRanges() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_countLiveRanges);
+}
+
+void DIARawSymbol::getFrontEndVersion(VersionInfo &Version) const {
+  Version.Major = PrivateGetDIAValue(Symbol, &IDiaSymbol::get_frontEndMajor);
+  Version.Minor = PrivateGetDIAValue(Symbol, &IDiaSymbol::get_frontEndMinor);
+  Version.Build = PrivateGetDIAValue(Symbol, &IDiaSymbol::get_frontEndBuild);
+  Version.QFE = PrivateGetDIAValue(Symbol, &IDiaSymbol::get_frontEndQFE);
+}
+
+PDB_Lang DIARawSymbol::getLanguage() const {
+  return PrivateGetDIAValue<DWORD, PDB_Lang>(Symbol, &IDiaSymbol::get_language);
+}
+
+uint32_t DIARawSymbol::getLexicalParentId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_lexicalParentId);
+}
+
+std::string DIARawSymbol::getLibraryName() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_libraryName);
+}
+
+uint32_t DIARawSymbol::getLiveRangeStartAddressOffset() const {
+  return PrivateGetDIAValue(Symbol,
+                            &IDiaSymbol::get_liveRangeStartAddressOffset);
+}
+
+uint32_t DIARawSymbol::getLiveRangeStartAddressSection() const {
+  return PrivateGetDIAValue(Symbol,
+                            &IDiaSymbol::get_liveRangeStartAddressSection);
+}
+
+uint32_t DIARawSymbol::getLiveRangeStartRelativeVirtualAddress() const {
+  return PrivateGetDIAValue(
+      Symbol, &IDiaSymbol::get_liveRangeStartRelativeVirtualAddress);
+}
+
+PDB_RegisterId DIARawSymbol::getLocalBasePointerRegisterId() const {
+  return PrivateGetDIAValue<DWORD, PDB_RegisterId>(
+      Symbol, &IDiaSymbol::get_localBasePointerRegisterId);
+}
+
+uint32_t DIARawSymbol::getLowerBoundId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_lowerBoundId);
+}
+
+uint32_t DIARawSymbol::getMemorySpaceKind() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_memorySpaceKind);
+}
+
+std::string DIARawSymbol::getName() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_name);
+}
+
+uint32_t DIARawSymbol::getNumberOfAcceleratorPointerTags() const {
+  return PrivateGetDIAValue(Symbol,
+                            &IDiaSymbol::get_numberOfAcceleratorPointerTags);
+}
+
+uint32_t DIARawSymbol::getNumberOfColumns() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_numberOfColumns);
+}
+
+uint32_t DIARawSymbol::getNumberOfModifiers() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_numberOfModifiers);
+}
+
+uint32_t DIARawSymbol::getNumberOfRegisterIndices() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_numberOfRegisterIndices);
+}
+
+uint32_t DIARawSymbol::getNumberOfRows() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_numberOfRows);
+}
+
+std::string DIARawSymbol::getObjectFileName() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_objectFileName);
+}
+
+uint32_t DIARawSymbol::getOemId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_oemId);
+}
+
+uint32_t DIARawSymbol::getOemSymbolId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_oemSymbolId);
+}
+
+uint32_t DIARawSymbol::getOffsetInUdt() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_offsetInUdt);
+}
+
+PDB_Cpu DIARawSymbol::getPlatform() const {
+  return PrivateGetDIAValue<DWORD, PDB_Cpu>(Symbol, &IDiaSymbol::get_platform);
+}
+
+uint32_t DIARawSymbol::getRank() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_rank);
+}
+
+PDB_RegisterId DIARawSymbol::getRegisterId() const {
+  return PrivateGetDIAValue<DWORD, PDB_RegisterId>(Symbol,
+                                                   &IDiaSymbol::get_registerId);
+}
+
+uint32_t DIARawSymbol::getRegisterType() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_registerType);
+}
+
+uint32_t DIARawSymbol::getRelativeVirtualAddress() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_relativeVirtualAddress);
+}
+
+uint32_t DIARawSymbol::getSamplerSlot() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_samplerSlot);
+}
+
+uint32_t DIARawSymbol::getSignature() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_signature);
+}
+
+uint32_t DIARawSymbol::getSizeInUdt() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_sizeInUdt);
+}
+
+uint32_t DIARawSymbol::getSlot() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_slot);
+}
+
+std::string DIARawSymbol::getSourceFileName() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_sourceFileName);
+}
+
+uint32_t DIARawSymbol::getStride() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_stride);
+}
+
+uint32_t DIARawSymbol::getSubTypeId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_subTypeId);
+}
+
+std::string DIARawSymbol::getSymbolsFileName() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_symbolsFileName);
+}
+
+uint32_t DIARawSymbol::getSymIndexId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_symIndexId);
+}
+
+uint32_t DIARawSymbol::getTargetOffset() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_targetOffset);
+}
+
+uint32_t DIARawSymbol::getTargetRelativeVirtualAddress() const {
+  return PrivateGetDIAValue(Symbol,
+                            &IDiaSymbol::get_targetRelativeVirtualAddress);
+}
+
+uint64_t DIARawSymbol::getTargetVirtualAddress() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_targetVirtualAddress);
+}
+
+uint32_t DIARawSymbol::getTargetSection() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_targetSection);
+}
+
+uint32_t DIARawSymbol::getTextureSlot() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_textureSlot);
+}
+
+uint32_t DIARawSymbol::getTimeStamp() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_timeStamp);
+}
+
+uint32_t DIARawSymbol::getToken() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_token);
+}
+
+uint32_t DIARawSymbol::getTypeId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_typeId);
+}
+
+uint32_t DIARawSymbol::getUavSlot() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_uavSlot);
+}
+
+std::string DIARawSymbol::getUndecoratedName() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_undecoratedName);
+}
+
+uint32_t DIARawSymbol::getUnmodifiedTypeId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_unmodifiedTypeId);
+}
+
+uint32_t DIARawSymbol::getUpperBoundId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_upperBoundId);
+}
+
+Variant DIARawSymbol::getValue() const {
+  VARIANT Value;
+  Value.vt = VT_EMPTY;
+  if (S_OK != Symbol->get_value(&Value))
+    return Variant();
+
+  return VariantFromVARIANT(Value);
+}
+
+uint32_t DIARawSymbol::getVirtualBaseDispIndex() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_virtualBaseDispIndex);
+}
+
+uint32_t DIARawSymbol::getVirtualBaseOffset() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_virtualBaseOffset);
+}
+
+uint32_t DIARawSymbol::getVirtualTableShapeId() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_virtualTableShapeId);
+}
+
+PDB_DataKind DIARawSymbol::getDataKind() const {
+  return PrivateGetDIAValue<DWORD, PDB_DataKind>(Symbol,
+                                                 &IDiaSymbol::get_dataKind);
+}
+
+PDB_SymType DIARawSymbol::getSymTag() const {
+  return PrivateGetDIAValue<DWORD, PDB_SymType>(Symbol,
+                                                &IDiaSymbol::get_symTag);
+}
+
+PDB_UniqueId DIARawSymbol::getGuid() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_guid);
+}
+
+int32_t DIARawSymbol::getOffset() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_offset);
+}
+
+int32_t DIARawSymbol::getThisAdjust() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_thisAdjust);
+}
+
+int32_t DIARawSymbol::getVirtualBasePointerOffset() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_virtualBasePointerOffset);
+}
+
+PDB_LocType DIARawSymbol::getLocationType() const {
+  return PrivateGetDIAValue<DWORD, PDB_LocType>(Symbol,
+                                                &IDiaSymbol::get_locationType);
+}
+
+PDB_Machine DIARawSymbol::getMachineType() const {
+  return PrivateGetDIAValue<DWORD, PDB_Machine>(Symbol,
+                                                &IDiaSymbol::get_machineType);
+}
+
+PDB_ThunkOrdinal DIARawSymbol::getThunkOrdinal() const {
+  return PrivateGetDIAValue<DWORD, PDB_ThunkOrdinal>(
+      Symbol, &IDiaSymbol::get_thunkOrdinal);
+}
+
+uint64_t DIARawSymbol::getLength() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_length);
+}
+
+uint64_t DIARawSymbol::getLiveRangeLength() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_liveRangeLength);
+}
+
+uint64_t DIARawSymbol::getVirtualAddress() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_virtualAddress);
+}
+
+PDB_UdtType DIARawSymbol::getUdtKind() const {
+  return PrivateGetDIAValue<DWORD, PDB_UdtType>(Symbol,
+                                                &IDiaSymbol::get_udtKind);
+}
+
+bool DIARawSymbol::hasConstructor() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_constructor);
+}
+
+bool DIARawSymbol::hasCustomCallingConvention() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_customCallingConvention);
+}
+
+bool DIARawSymbol::hasFarReturn() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_farReturn);
+}
+
+bool DIARawSymbol::isCode() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_code);
+}
+
+bool DIARawSymbol::isCompilerGenerated() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_compilerGenerated);
+}
+
+bool DIARawSymbol::isConstType() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_constType);
+}
+
+bool DIARawSymbol::isEditAndContinueEnabled() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_editAndContinueEnabled);
+}
+
+bool DIARawSymbol::isFunction() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_function);
+}
+
+bool DIARawSymbol::getAddressTaken() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_addressTaken);
+}
+
+bool DIARawSymbol::getNoStackOrdering() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_noStackOrdering);
+}
+
+bool DIARawSymbol::hasAlloca() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasAlloca);
+}
+
+bool DIARawSymbol::hasAssignmentOperator() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasAssignmentOperator);
+}
+
+bool DIARawSymbol::hasCTypes() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isCTypes);
+}
+
+bool DIARawSymbol::hasCastOperator() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasCastOperator);
+}
+
+bool DIARawSymbol::hasDebugInfo() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasDebugInfo);
+}
+
+bool DIARawSymbol::hasEH() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasEH);
+}
+
+bool DIARawSymbol::hasEHa() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasEHa);
+}
+
+bool DIARawSymbol::hasInlAsm() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasInlAsm);
+}
+
+bool DIARawSymbol::hasInlineAttribute() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_inlSpec);
+}
+
+bool DIARawSymbol::hasInterruptReturn() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_interruptReturn);
+}
+
+bool DIARawSymbol::hasFramePointer() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_framePointerPresent);
+}
+
+bool DIARawSymbol::hasLongJump() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasLongJump);
+}
+
+bool DIARawSymbol::hasManagedCode() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasManagedCode);
+}
+
+bool DIARawSymbol::hasNestedTypes() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasNestedTypes);
+}
+
+bool DIARawSymbol::hasNoInlineAttribute() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_noInline);
+}
+
+bool DIARawSymbol::hasNoReturnAttribute() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_noReturn);
+}
+
+bool DIARawSymbol::hasOptimizedCodeDebugInfo() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_optimizedCodeDebugInfo);
+}
+
+bool DIARawSymbol::hasOverloadedOperator() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_overloadedOperator);
+}
+
+bool DIARawSymbol::hasSEH() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasSEH);
+}
+
+bool DIARawSymbol::hasSecurityChecks() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasSecurityChecks);
+}
+
+bool DIARawSymbol::hasSetJump() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_hasSetJump);
+}
+
+bool DIARawSymbol::hasStrictGSCheck() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_strictGSCheck);
+}
+
+bool DIARawSymbol::isAcceleratorGroupSharedLocal() const {
+  return PrivateGetDIAValue(Symbol,
+                            &IDiaSymbol::get_isAcceleratorGroupSharedLocal);
+}
+
+bool DIARawSymbol::isAcceleratorPointerTagLiveRange() const {
+  return PrivateGetDIAValue(Symbol,
+                            &IDiaSymbol::get_isAcceleratorPointerTagLiveRange);
+}
+
+bool DIARawSymbol::isAcceleratorStubFunction() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isAcceleratorStubFunction);
+}
+
+bool DIARawSymbol::isAggregated() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isAggregated);
+}
+
+bool DIARawSymbol::isIntroVirtualFunction() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_intro);
+}
+
+bool DIARawSymbol::isCVTCIL() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isCVTCIL);
+}
+
+bool DIARawSymbol::isConstructorVirtualBase() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isConstructorVirtualBase);
+}
+
+bool DIARawSymbol::isCxxReturnUdt() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isCxxReturnUdt);
+}
+
+bool DIARawSymbol::isDataAligned() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isDataAligned);
+}
+
+bool DIARawSymbol::isHLSLData() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isHLSLData);
+}
+
+bool DIARawSymbol::isHotpatchable() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isHotpatchable);
+}
+
+bool DIARawSymbol::isIndirectVirtualBaseClass() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_indirectVirtualBaseClass);
+}
+
+bool DIARawSymbol::isInterfaceUdt() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isInterfaceUdt);
+}
+
+bool DIARawSymbol::isIntrinsic() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_intrinsic);
+}
+
+bool DIARawSymbol::isLTCG() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isLTCG);
+}
+
+bool DIARawSymbol::isLocationControlFlowDependent() const {
+  return PrivateGetDIAValue(Symbol,
+                            &IDiaSymbol::get_isLocationControlFlowDependent);
+}
+
+bool DIARawSymbol::isMSILNetmodule() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isMSILNetmodule);
+}
+
+bool DIARawSymbol::isMatrixRowMajor() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isMatrixRowMajor);
+}
+
+bool DIARawSymbol::isManagedCode() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_managed);
+}
+
+bool DIARawSymbol::isMSILCode() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_msil);
+}
+
+bool DIARawSymbol::isMultipleInheritance() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isMultipleInheritance);
+}
+
+bool DIARawSymbol::isNaked() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isNaked);
+}
+
+bool DIARawSymbol::isNested() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_nested);
+}
+
+bool DIARawSymbol::isOptimizedAway() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isOptimizedAway);
+}
+
+bool DIARawSymbol::isPacked() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_packed);
+}
+
+bool DIARawSymbol::isPointerBasedOnSymbolValue() const {
+  return PrivateGetDIAValue(Symbol,
+                            &IDiaSymbol::get_isPointerBasedOnSymbolValue);
+}
+
+bool DIARawSymbol::isPointerToDataMember() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isPointerToDataMember);
+}
+
+bool DIARawSymbol::isPointerToMemberFunction() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isPointerToMemberFunction);
+}
+
+bool DIARawSymbol::isPureVirtual() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_pure);
+}
+
+bool DIARawSymbol::isRValueReference() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_RValueReference);
+}
+
+bool DIARawSymbol::isRefUdt() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isRefUdt);
+}
+
+bool DIARawSymbol::isReference() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_reference);
+}
+
+bool DIARawSymbol::isRestrictedType() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_restrictedType);
+}
+
+bool DIARawSymbol::isReturnValue() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isReturnValue);
+}
+
+bool DIARawSymbol::isSafeBuffers() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isSafeBuffers);
+}
+
+bool DIARawSymbol::isScoped() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_scoped);
+}
+
+bool DIARawSymbol::isSdl() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isSdl);
+}
+
+bool DIARawSymbol::isSingleInheritance() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isSingleInheritance);
+}
+
+bool DIARawSymbol::isSplitted() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isSplitted);
+}
+
+bool DIARawSymbol::isStatic() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isStatic);
+}
+
+bool DIARawSymbol::hasPrivateSymbols() const {
+  // hasPrivateSymbols is the opposite of isStripped, but we expose
+  // hasPrivateSymbols as a more intuitive interface.
+  return !PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isStripped);
+}
+
+bool DIARawSymbol::isUnalignedType() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_unalignedType);
+}
+
+bool DIARawSymbol::isUnreached() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_notReached);
+}
+
+bool DIARawSymbol::isValueUdt() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isValueUdt);
+}
+
+bool DIARawSymbol::isVirtual() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_virtual);
+}
+
+bool DIARawSymbol::isVirtualBaseClass() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_virtualBaseClass);
+}
+
+bool DIARawSymbol::isVirtualInheritance() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_isVirtualInheritance);
+}
+
+bool DIARawSymbol::isVolatileType() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_volatileType);
+}
+
+bool DIARawSymbol::wasInlined() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_wasInlined);
+}
+
+std::string DIARawSymbol::getUnused() const {
+  return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_unused);
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIASession.cpp b/lib/DebugInfo/PDB/DIA/DIASession.cpp
new file mode 100644
index 0000000..24791f2
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIASession.cpp
@@ -0,0 +1,117 @@
+//===- DIASession.cpp - DIA implementation of IPDBSession -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumSourceFiles.h"
+#include "llvm/DebugInfo/PDB/DIA/DIARawSymbol.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASourceFile.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/Support/ConvertUTF.h"
+
+using namespace llvm;
+
+namespace {}
+
+DIASession::DIASession(CComPtr<IDiaSession> DiaSession) : Session(DiaSession) {}
+
+DIASession *DIASession::createFromPdb(StringRef Path) {
+  CComPtr<IDiaDataSource> DataSource;
+  CComPtr<IDiaSession> Session;
+
+  // We assume that CoInitializeEx has already been called by the executable.
+  HRESULT Result = ::CoCreateInstance(CLSID_DiaSource, nullptr,
+                                      CLSCTX_INPROC_SERVER, IID_IDiaDataSource,
+                                      reinterpret_cast<LPVOID *>(&DataSource));
+  if (FAILED(Result))
+    return nullptr;
+
+  llvm::SmallVector<UTF16, 128> Path16;
+  if (!llvm::convertUTF8ToUTF16String(Path, Path16))
+    return nullptr;
+
+  const wchar_t *Path16Str = reinterpret_cast<const wchar_t*>(Path16.data());
+  if (FAILED(DataSource->loadDataFromPdb(Path16Str)))
+    return nullptr;
+
+  if (FAILED(DataSource->openSession(&Session)))
+    return nullptr;
+  return new DIASession(Session);
+}
+
+uint64_t DIASession::getLoadAddress() const {
+  uint64_t LoadAddress;
+  bool success = (S_OK == Session->get_loadAddress(&LoadAddress));
+  return (success) ? LoadAddress : 0;
+}
+
+void DIASession::setLoadAddress(uint64_t Address) {
+  Session->put_loadAddress(Address);
+}
+
+std::unique_ptr<PDBSymbolExe> DIASession::getGlobalScope() const {
+  CComPtr<IDiaSymbol> GlobalScope;
+  if (S_OK != Session->get_globalScope(&GlobalScope))
+    return nullptr;
+
+  auto RawSymbol = llvm::make_unique<DIARawSymbol>(*this, GlobalScope);
+  auto PdbSymbol(PDBSymbol::create(*this, std::move(RawSymbol)));
+  std::unique_ptr<PDBSymbolExe> ExeSymbol(
+      static_cast<PDBSymbolExe *>(PdbSymbol.release()));
+  return ExeSymbol;
+}
+
+std::unique_ptr<PDBSymbol> DIASession::getSymbolById(uint32_t SymbolId) const {
+  CComPtr<IDiaSymbol> LocatedSymbol;
+  if (S_OK != Session->symbolById(SymbolId, &LocatedSymbol))
+    return nullptr;
+
+  auto RawSymbol = llvm::make_unique<DIARawSymbol>(*this, LocatedSymbol);
+  return PDBSymbol::create(*this, std::move(RawSymbol));
+}
+
+std::unique_ptr<IPDBEnumSourceFiles> DIASession::getAllSourceFiles() const {
+  CComPtr<IDiaEnumSourceFiles> Files;
+  if (S_OK != Session->findFile(nullptr, nullptr, nsNone, &Files))
+    return nullptr;
+
+  return llvm::make_unique<DIAEnumSourceFiles>(*this, Files);
+}
+
+std::unique_ptr<IPDBEnumSourceFiles> DIASession::getSourceFilesForCompiland(
+    const PDBSymbolCompiland &Compiland) const {
+  CComPtr<IDiaEnumSourceFiles> Files;
+
+  const DIARawSymbol &RawSymbol =
+      static_cast<const DIARawSymbol &>(Compiland.getRawSymbol());
+  if (S_OK !=
+      Session->findFile(RawSymbol.getDiaSymbol(), nullptr, nsNone, &Files))
+    return nullptr;
+
+  return llvm::make_unique<DIAEnumSourceFiles>(*this, Files);
+}
+
+std::unique_ptr<IPDBSourceFile>
+DIASession::getSourceFileById(uint32_t FileId) const {
+  CComPtr<IDiaSourceFile> LocatedFile;
+  if (S_OK != Session->findFileById(FileId, &LocatedFile))
+    return nullptr;
+
+  return llvm::make_unique<DIASourceFile>(*this, LocatedFile);
+}
+
+std::unique_ptr<IPDBEnumDataStreams> DIASession::getDebugStreams() const {
+  CComPtr<IDiaEnumDebugStreams> DiaEnumerator;
+  if (S_OK != Session->getEnumDebugStreams(&DiaEnumerator))
+    return nullptr;
+
+  return llvm::make_unique<DIAEnumDebugStreams>(DiaEnumerator);
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIASourceFile.cpp b/lib/DebugInfo/PDB/DIA/DIASourceFile.cpp
new file mode 100644
index 0000000..0a9c444
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIASourceFile.cpp
@@ -0,0 +1,67 @@
+//===- DIASourceFile.cpp - DIA implementation of IPDBSourceFile -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASourceFile.h"
+#include "llvm/Support/ConvertUTF.h"
+
+using namespace llvm;
+
+DIASourceFile::DIASourceFile(const DIASession &PDBSession,
+                             CComPtr<IDiaSourceFile> DiaSourceFile)
+    : Session(PDBSession), SourceFile(DiaSourceFile) {}
+
+std::string DIASourceFile::getFileName() const {
+  CComBSTR FileName16;
+  HRESULT Result = SourceFile->get_fileName(&FileName16);
+  if (S_OK != Result)
+    return std::string();
+
+  std::string FileName8;
+  llvm::ArrayRef<char> FileNameBytes(reinterpret_cast<char *>(FileName16.m_str),
+                                     FileName16.ByteLength());
+  llvm::convertUTF16ToUTF8String(FileNameBytes, FileName8);
+  return FileName8;
+}
+
+uint32_t DIASourceFile::getUniqueId() const {
+  DWORD Id;
+  return (S_OK == SourceFile->get_uniqueId(&Id)) ? Id : 0;
+}
+
+std::string DIASourceFile::getChecksum() const {
+  DWORD ByteSize = 0;
+  HRESULT Result = SourceFile->get_checksum(0, &ByteSize, nullptr);
+  if (ByteSize == 0)
+    return std::string();
+  std::vector<BYTE> ChecksumBytes(ByteSize);
+  Result = SourceFile->get_checksum(ByteSize, &ByteSize, &ChecksumBytes[0]);
+  if (S_OK != Result)
+    return std::string();
+  return std::string(ChecksumBytes.begin(), ChecksumBytes.end());
+}
+
+PDB_Checksum DIASourceFile::getChecksumType() const {
+  DWORD Type;
+  HRESULT Result = SourceFile->get_checksumType(&Type);
+  if (S_OK != Result)
+    return PDB_Checksum::None;
+  return static_cast<PDB_Checksum>(Type);
+}
+
+std::unique_ptr<IPDBEnumSymbols> DIASourceFile::getCompilands() const {
+  CComPtr<IDiaEnumSymbols> DiaEnumerator;
+  HRESULT Result = SourceFile->get_compilands(&DiaEnumerator);
+  if (S_OK != Result)
+    return nullptr;
+
+  return std::unique_ptr<IPDBEnumSymbols>(
+      new DIAEnumSymbols(Session, DiaEnumerator));
+}
diff --git a/lib/DebugInfo/PDB/IPDBSourceFile.cpp b/lib/DebugInfo/PDB/IPDBSourceFile.cpp
new file mode 100644
index 0000000..3abe59d
--- /dev/null
+++ b/lib/DebugInfo/PDB/IPDBSourceFile.cpp
@@ -0,0 +1,32 @@
+//===- IPDBSourceFile.cpp - base interface for a PDB source file *- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+IPDBSourceFile::~IPDBSourceFile() {}
+
+void IPDBSourceFile::dump(raw_ostream &OS, int Indent) const {
+  OS.indent(Indent);
+  PDB_Checksum ChecksumType = getChecksumType();
+  OS << "[";
+  if (ChecksumType != PDB_Checksum::None) {
+    OS << ChecksumType << ": ";
+    std::string Checksum = getChecksum();
+    for (uint8_t c : Checksum)
+      OS << format_hex_no_prefix(c, 2, true);
+  } else
+    OS << "No checksum";
+  OS << "] " << getFileName() << "\n";
+}
diff --git a/lib/DebugInfo/PDB/LLVMBuild.txt b/lib/DebugInfo/PDB/LLVMBuild.txt
new file mode 100644
index 0000000..690598a
--- /dev/null
+++ b/lib/DebugInfo/PDB/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/DebugInfo/PDB/LLVMBuild.txt ------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DebugInfoPDB
+parent = DebugInfo
+required_libraries = Support
+
diff --git a/lib/DebugInfo/PDB/Makefile b/lib/DebugInfo/PDB/Makefile
new file mode 100644
index 0000000..444019e
--- /dev/null
+++ b/lib/DebugInfo/PDB/Makefile
@@ -0,0 +1,14 @@
+##===- lib/DebugInfo/PDB/Makefile --------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMDebugInfoPDB
+BUILD_ARCHIVE := 1
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/DebugInfo/PDB/PDB.cpp b/lib/DebugInfo/PDB/PDB.cpp
new file mode 100644
index 0000000..aa84c28
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDB.cpp
@@ -0,0 +1,30 @@
+//===- PDB.cpp - base header file for creating a PDB reader -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDB.h"
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDB.h"
+
+#if HAVE_DIA_SDK
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+#endif
+
+using namespace llvm;
+
+std::unique_ptr<IPDBSession> llvm::createPDBReader(PDB_ReaderType Type,
+                                                   StringRef Path) {
+  // Create the correct concrete instance type based on the value of Type.
+#if HAVE_DIA_SDK
+  return std::unique_ptr<DIASession>(DIASession::createFromPdb(Path));
+#endif
+  return nullptr;
+}
diff --git a/lib/DebugInfo/PDB/PDBExtras.cpp b/lib/DebugInfo/PDB/PDBExtras.cpp
new file mode 100644
index 0000000..1002b2e
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBExtras.cpp
@@ -0,0 +1,346 @@
+//===- PDBExtras.cpp - helper functions and classes for PDBs -----*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
+
+#include "llvm/ADT/ArrayRef.h"
+
+using namespace llvm;
+
+#define CASE_OUTPUT_ENUM_CLASS_STR(Class, Value, Str, Stream)                  \
+  case Class::Value:                                                           \
+    Stream << Str;                                                             \
+    break;
+
+#define CASE_OUTPUT_ENUM_CLASS_NAME(Class, Value, Stream)                      \
+  CASE_OUTPUT_ENUM_CLASS_STR(Class, Value, #Value, Stream)
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_VariantType &Type) {
+  switch (Type) {
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, Bool, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, Single, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, Double, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, Int8, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, Int16, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, Int32, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, Int64, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, UInt8, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, UInt16, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, UInt32, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, UInt64, OS)
+    default:
+      OS << "Unknown";
+  }
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_CallingConv &Conv) {
+  OS << "__";
+  switch (Conv) {
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearCdecl, "cdecl", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarCdecl, "cdecl", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearPascal, "pascal", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarPascal, "pascal", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearFastcall, "fastcall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarFastcall, "fastcall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Skipped, "skippedcall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearStdcall, "stdcall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarStdcall, "stdcall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearSyscall, "syscall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarSyscall, "syscall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Thiscall, "thiscall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, MipsCall, "mipscall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Generic, "genericcall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Alphacall, "alphacall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Ppccall, "ppccall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, SuperHCall, "superhcall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Armcall, "armcall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, AM33call, "am33call", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Tricall, "tricall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Sh5call, "sh5call", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, M32R, "m32rcall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Clrcall, "clrcall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Inline, "inlinecall", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearVectorcall, "vectorcall",
+                               OS)
+  default:
+    OS << "unknowncall";
+  }
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_DataKind &Data) {
+  switch (Data) {
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, Unknown, "unknown", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, Local, "local", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, StaticLocal, "static local", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, Param, "param", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, ObjectPtr, "this ptr", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, FileStatic, "static global", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, Global, "global", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, Member, "member", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, StaticMember, "static member", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, Constant, "const", OS)
+  }
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_RegisterId &Reg) {
+  switch (Reg) {
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, AL, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, CL, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, DL, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, BL, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, AH, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, CH, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, DH, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, BH, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, AX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, CX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, DX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, BX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, SP, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, BP, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, SI, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, DI, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, EAX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, ECX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, EDX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, EBX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, ESP, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, EBP, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, ESI, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, EDI, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, ES, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, CS, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, SS, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, DS, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, FS, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, GS, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, IP, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RAX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RBX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RCX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RDX, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RSI, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RDI, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RBP, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RSP, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R8, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R9, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R10, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R11, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R12, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R13, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R14, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R15, OS)
+  default:
+    OS << static_cast<int>(Reg);
+  }
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_LocType &Loc) {
+  switch (Loc) {
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, Static, "static", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, TLS, "tls", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, RegRel, "regrel", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, ThisRel, "thisrel", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, Enregistered, "register", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, BitField, "bitfield", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, Slot, "slot", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, IlRel, "IL rel", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, MetaData, "metadata", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, Constant, "constant", OS)
+  default:
+    OS << "Unknown";
+  }
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_ThunkOrdinal &Thunk) {
+  switch (Thunk) {
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, BranchIsland, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, Pcode, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, Standard, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, ThisAdjustor, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, TrampIncremental, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, UnknownLoad, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, Vcall, OS)
+  }
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_Checksum &Checksum) {
+  switch (Checksum) {
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Checksum, None, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Checksum, MD5, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Checksum, SHA1, OS)
+  }
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_Lang &Lang) {
+  switch (Lang) {
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, C, OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_Lang, Cpp, "C++", OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Fortran, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Masm, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Pascal, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Basic, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Cobol, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Link, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Cvtres, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Cvtpgd, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, CSharp, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, VB, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, ILAsm, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Java, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, JScript, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, MSIL, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, HLSL, OS)
+  }
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_SymType &Tag) {
+  switch (Tag) {
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Exe, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Compiland, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, CompilandDetails, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, CompilandEnv, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Function, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Block, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Data, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Annotation, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Label, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, PublicSymbol, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, UDT, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Enum, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, FunctionSig, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, PointerType, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, ArrayType, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, BuiltinType, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Typedef, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, BaseClass, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Friend, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, FunctionArg, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, FuncDebugStart, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, FuncDebugEnd, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, UsingNamespace, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, VTableShape, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, VTable, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Custom, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Thunk, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, CustomType, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, ManagedType, OS)
+    CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Dimension, OS)
+  default:
+    OS << "Unknown";
+  }
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_BuiltinType &Type) {
+  switch (Type) {
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, Void, "void", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, Char, "char", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, WCharT, "wchar_t", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, Int, "int", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, UInt, "uint", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, Float, "float", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, BCD, "BCD", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, Bool, "bool", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, Long, "long", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, ULong, "ulong", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, Currency, "CURRENCY", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, Date, "DATE", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, Variant, "VARIANT", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, Complex, "complex", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, Bitfield, "bitfield", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, BSTR, "BSTR", OS)
+    CASE_OUTPUT_ENUM_CLASS_STR(PDB_BuiltinType, HResult, "HRESULT", OS)
+  default:
+    break;
+  }
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_UniqueId &Id) {
+  static const char *Lookup = "0123456789ABCDEF";
+
+  static_assert(sizeof(PDB_UniqueId) == 16, "Expected 16-byte GUID");
+  ArrayRef<uint8_t> GuidBytes(reinterpret_cast<const uint8_t*>(&Id), 16);
+  OS << "{";
+  for (int i=0; i < 16;) {
+    uint8_t Byte = GuidBytes[i];
+    uint8_t HighNibble = (Byte >> 4) & 0xF;
+    uint8_t LowNibble = Byte & 0xF;
+    OS << Lookup[HighNibble] << Lookup[LowNibble];
+    ++i;
+    if (i>=4 && i<=10 && i%2==0)
+      OS << "-";
+  }
+  OS << "}";
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const Variant &Value) {
+  switch (Value.Type) {
+    case PDB_VariantType::Bool:
+      OS << (Value.Bool ? "true" : "false");
+      break;
+    case PDB_VariantType::Double:
+      OS << Value.Double;
+      break;
+    case PDB_VariantType::Int16:
+      OS << Value.Int16;
+      break;
+    case PDB_VariantType::Int32:
+      OS << Value.Int32;
+      break;
+    case PDB_VariantType::Int64:
+      OS << Value.Int64;
+      break;
+    case PDB_VariantType::Int8:
+      OS << Value.Int8;
+      break;
+    case PDB_VariantType::Single:
+      OS << Value.Single;
+      break;
+    case PDB_VariantType::UInt16:
+      OS << Value.Double;
+      break;
+    case PDB_VariantType::UInt32:
+      OS << Value.UInt32;
+      break;
+    case PDB_VariantType::UInt64:
+      OS << Value.UInt64;
+      break;
+    case PDB_VariantType::UInt8:
+      OS << Value.UInt8;
+      break;
+    default:
+      OS << Value.Type;
+  }
+  OS << " {" << Value.Type << "}";
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const VersionInfo &Version) {
+  OS << Version.Major << "." << Version.Minor << "." << Version.Build;
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const TagStats &Stats) {
+  for (auto Tag : Stats) {
+    OS << Tag.first << ":" << Tag.second << " ";
+  }
+  return OS;
+}
diff --git a/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
new file mode 100644
index 0000000..7b6268d
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
@@ -0,0 +1,28 @@
+//===- PDBInterfaceAnchors.h - defines class anchor funcions ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Class anchors are necessary per the LLVM Coding style guide, to ensure that
+// the vtable is only generated in this object file, and not in every object
+// file that incldues the corresponding header.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/IPDBDataStream.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+
+using namespace llvm;
+
+IPDBSession::~IPDBSession() {}
+
+IPDBDataStream::~IPDBDataStream() {}
+
+IPDBRawSymbol::~IPDBRawSymbol() {}
+
+IPDBLineNumber::~IPDBLineNumber() {}
diff --git a/lib/DebugInfo/PDB/PDBSymDumper.cpp b/lib/DebugInfo/PDB/PDBSymDumper.cpp
new file mode 100644
index 0000000..0f29c74
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymDumper.cpp
@@ -0,0 +1,177 @@
+//===- PDBSymDumper.cpp - ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define PDB_SYMDUMP_UNREACHABLE(Type)                                          \
+  if (RequireImpl)                                                             \
+    llvm_unreachable("Attempt to dump " #Type " with no dump implementation");
+
+PDBSymDumper::PDBSymDumper(bool ShouldRequireImpl)
+    : RequireImpl(ShouldRequireImpl) {}
+
+PDBSymDumper::~PDBSymDumper() {}
+
+void PDBSymDumper::dump(const PDBSymbolAnnotation &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolAnnotation)
+}
+
+void PDBSymDumper::dump(const PDBSymbolBlock &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolBlock)
+}
+
+void PDBSymDumper::dump(const PDBSymbolCompiland &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolCompiland)
+}
+
+void PDBSymDumper::dump(const PDBSymbolCompilandDetails &Symbol,
+                        raw_ostream &OS, int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolCompilandDetails)
+}
+
+void PDBSymDumper::dump(const PDBSymbolCompilandEnv &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolCompilandEnv)
+}
+
+void PDBSymDumper::dump(const PDBSymbolCustom &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolCustom)
+}
+
+void PDBSymDumper::dump(const PDBSymbolData &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolData)
+}
+
+void PDBSymDumper::dump(const PDBSymbolExe &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolExe)
+}
+
+void PDBSymDumper::dump(const PDBSymbolFunc &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolFunc)
+}
+
+void PDBSymDumper::dump(const PDBSymbolFuncDebugEnd &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolFuncDebugEnd)
+}
+
+void PDBSymDumper::dump(const PDBSymbolFuncDebugStart &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolFuncDebugStart)
+}
+
+void PDBSymDumper::dump(const PDBSymbolLabel &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolLabel)
+}
+
+void PDBSymDumper::dump(const PDBSymbolPublicSymbol &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolPublicSymbol)
+}
+
+void PDBSymDumper::dump(const PDBSymbolThunk &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolThunk)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeArray &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeArray)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeBaseClass &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeBaseClass)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeBuiltin &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeBuiltin)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeCustom &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeCustom)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeDimension &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeDimension)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeEnum &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeEnum)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeFriend &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeFriend)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeFunctionArg &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeFunctionArg)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeFunctionSig &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeFunctionSig)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeManaged &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeManaged)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypePointer &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypePointer)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeTypedef)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeUDT &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeUDT)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeVTable &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeVTable)
+}
+
+void PDBSymDumper::dump(const PDBSymbolTypeVTableShape &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolTypeVTableShape)
+}
+
+void PDBSymDumper::dump(const PDBSymbolUnknown &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolUnknown)
+}
+
+void PDBSymDumper::dump(const PDBSymbolUsingNamespace &Symbol, raw_ostream &OS,
+                        int Indent) {
+  PDB_SYMDUMP_UNREACHABLE(PDBSymbolUsingNamespace)
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbol.cpp b/lib/DebugInfo/PDB/PDBSymbol.cpp
new file mode 100644
index 0000000..f9aaf3a
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbol.cpp
@@ -0,0 +1,151 @@
+//===- PDBSymbol.cpp - base class for user-facing symbol types --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolAnnotation.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolBlock.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCustom.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolLabel.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolThunk.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include <memory>
+#include <utility>
+
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbol::PDBSymbol(const IPDBSession &PDBSession,
+                     std::unique_ptr<IPDBRawSymbol> Symbol)
+    : Session(PDBSession), RawSymbol(std::move(Symbol)) {}
+
+PDBSymbol::~PDBSymbol() {}
+
+#define FACTORY_SYMTAG_CASE(Tag, Type)                                         \
+  case PDB_SymType::Tag:                                                       \
+    return std::unique_ptr<PDBSymbol>(new Type(PDBSession, std::move(Symbol)));
+
+std::unique_ptr<PDBSymbol>
+PDBSymbol::create(const IPDBSession &PDBSession,
+                  std::unique_ptr<IPDBRawSymbol> Symbol) {
+  switch (Symbol->getSymTag()) {
+    FACTORY_SYMTAG_CASE(Exe, PDBSymbolExe)
+    FACTORY_SYMTAG_CASE(Compiland, PDBSymbolCompiland)
+    FACTORY_SYMTAG_CASE(CompilandDetails, PDBSymbolCompilandDetails)
+    FACTORY_SYMTAG_CASE(CompilandEnv, PDBSymbolCompilandEnv)
+    FACTORY_SYMTAG_CASE(Function, PDBSymbolFunc)
+    FACTORY_SYMTAG_CASE(Block, PDBSymbolBlock)
+    FACTORY_SYMTAG_CASE(Data, PDBSymbolData)
+    FACTORY_SYMTAG_CASE(Annotation, PDBSymbolAnnotation)
+    FACTORY_SYMTAG_CASE(Label, PDBSymbolLabel)
+    FACTORY_SYMTAG_CASE(PublicSymbol, PDBSymbolPublicSymbol)
+    FACTORY_SYMTAG_CASE(UDT, PDBSymbolTypeUDT)
+    FACTORY_SYMTAG_CASE(Enum, PDBSymbolTypeEnum)
+    FACTORY_SYMTAG_CASE(FunctionSig, PDBSymbolTypeFunctionSig)
+    FACTORY_SYMTAG_CASE(PointerType, PDBSymbolTypePointer)
+    FACTORY_SYMTAG_CASE(ArrayType, PDBSymbolTypeArray)
+    FACTORY_SYMTAG_CASE(BuiltinType, PDBSymbolTypeBuiltin)
+    FACTORY_SYMTAG_CASE(Typedef, PDBSymbolTypeTypedef)
+    FACTORY_SYMTAG_CASE(BaseClass, PDBSymbolTypeBaseClass)
+    FACTORY_SYMTAG_CASE(Friend, PDBSymbolTypeFriend)
+    FACTORY_SYMTAG_CASE(FunctionArg, PDBSymbolTypeFunctionArg)
+    FACTORY_SYMTAG_CASE(FuncDebugStart, PDBSymbolFuncDebugStart)
+    FACTORY_SYMTAG_CASE(FuncDebugEnd, PDBSymbolFuncDebugEnd)
+    FACTORY_SYMTAG_CASE(UsingNamespace, PDBSymbolUsingNamespace)
+    FACTORY_SYMTAG_CASE(VTableShape, PDBSymbolTypeVTableShape)
+    FACTORY_SYMTAG_CASE(VTable, PDBSymbolTypeVTable)
+    FACTORY_SYMTAG_CASE(Custom, PDBSymbolCustom)
+    FACTORY_SYMTAG_CASE(Thunk, PDBSymbolThunk)
+    FACTORY_SYMTAG_CASE(CustomType, PDBSymbolTypeCustom)
+    FACTORY_SYMTAG_CASE(ManagedType, PDBSymbolTypeManaged)
+    FACTORY_SYMTAG_CASE(Dimension, PDBSymbolTypeDimension)
+  default:
+    return std::unique_ptr<PDBSymbol>(
+        new PDBSymbolUnknown(PDBSession, std::move(Symbol)));
+  }
+}
+
+#define TRY_DUMP_TYPE(Type)                                                    \
+  if (const Type *DerivedThis = dyn_cast<Type>(this))                          \
+    Dumper.dump(OS, Indent, *DerivedThis);
+
+#define ELSE_TRY_DUMP_TYPE(Type, Dumper) else TRY_DUMP_TYPE(Type, Dumper)
+
+void PDBSymbol::defaultDump(raw_ostream &OS, int Indent) const {
+  RawSymbol->dump(OS, Indent);
+}
+
+PDB_SymType PDBSymbol::getSymTag() const { return RawSymbol->getSymTag(); }
+
+std::unique_ptr<IPDBEnumSymbols> PDBSymbol::findAllChildren() const {
+  return findAllChildren(PDB_SymType::None);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+PDBSymbol::findAllChildren(PDB_SymType Type) const {
+  return RawSymbol->findChildren(Type);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+PDBSymbol::findChildren(PDB_SymType Type, StringRef Name,
+                        PDB_NameSearchFlags Flags) const {
+  return RawSymbol->findChildren(Type, Name, Flags);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+PDBSymbol::findChildrenByRVA(PDB_SymType Type, StringRef Name,
+                             PDB_NameSearchFlags Flags, uint32_t RVA) const {
+  return RawSymbol->findChildrenByRVA(Type, Name, Flags, RVA);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+PDBSymbol::findInlineFramesByRVA(uint32_t RVA) const {
+  return RawSymbol->findInlineFramesByRVA(RVA);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+PDBSymbol::getChildStats(TagStats &Stats) const {
+  std::unique_ptr<IPDBEnumSymbols> Result(findAllChildren());
+  Stats.clear();
+  while (auto Child = Result->getNext()) {
+    ++Stats[Child->getSymTag()];
+  }
+  Result->reset();
+  return Result;
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp b/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
new file mode 100644
index 0000000..4c76e3b
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
@@ -0,0 +1,25 @@
+//===- PDBSymbolAnnotation.cpp - --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolAnnotation.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolAnnotation::PDBSymbolAnnotation(const IPDBSession &PDBSession,
+                                         std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolAnnotation::dump(raw_ostream &OS, int Indent,
+                               PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolBlock.cpp b/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
new file mode 100644
index 0000000..bb159d5
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymbolBlock.cpp - -------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolBlock.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolBlock::PDBSymbolBlock(const IPDBSession &PDBSession,
+                               std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolBlock::dump(raw_ostream &OS, int Indent,
+                          PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp b/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
new file mode 100644
index 0000000..0c9b190
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
@@ -0,0 +1,25 @@
+//===- PDBSymbolCompiland.cpp - compiland details --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolCompiland::PDBSymbolCompiland(const IPDBSession &PDBSession,
+                                       std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolCompiland::dump(raw_ostream &OS, int Indent,
+                              PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp b/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
new file mode 100644
index 0000000..208d68f
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymbolCompilandDetails.cpp - compiland details --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolCompilandDetails::PDBSymbolCompilandDetails(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolCompilandDetails::dump(raw_ostream &OS, int Indent,
+                                     PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp b/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
new file mode 100644
index 0000000..c54b8fb
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
@@ -0,0 +1,32 @@
+//===- PDBSymbolCompilandEnv.cpp - compiland env variables ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h"
+
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolCompilandEnv::PDBSymbolCompilandEnv(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+std::string PDBSymbolCompilandEnv::getValue() const {
+  // call RawSymbol->getValue() and convert the result to an std::string.
+  return std::string();
+}
+
+void PDBSymbolCompilandEnv::dump(raw_ostream &OS, int Indent,
+                                 PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolCustom.cpp b/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
new file mode 100644
index 0000000..1b6b50b
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
@@ -0,0 +1,31 @@
+//===- PDBSymbolCustom.cpp - compiler-specific types ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolCustom.h"
+
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolCustom::PDBSymbolCustom(const IPDBSession &PDBSession,
+                                 std::unique_ptr<IPDBRawSymbol> CustomSymbol)
+    : PDBSymbol(PDBSession, std::move(CustomSymbol)) {}
+
+void PDBSymbolCustom::getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) {
+  RawSymbol->getDataBytes(bytes);
+}
+
+void PDBSymbolCustom::dump(raw_ostream &OS, int Indent,
+                           PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
+\ No newline at end of file
diff --git a/lib/DebugInfo/PDB/PDBSymbolData.cpp b/lib/DebugInfo/PDB/PDBSymbolData.cpp
new file mode 100644
index 0000000..6bf7e0f
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolData.cpp
@@ -0,0 +1,30 @@
+//===- PDBSymbolData.cpp - PDB data (e.g. variable) accessors ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolData::PDBSymbolData(const IPDBSession &PDBSession,
+                             std::unique_ptr<IPDBRawSymbol> DataSymbol)
+    : PDBSymbol(PDBSession, std::move(DataSymbol)) {}
+
+std::unique_ptr<PDBSymbol> PDBSymbolData::getType() const {
+  return Session.getSymbolById(getTypeId());
+}
+
+void PDBSymbolData::dump(raw_ostream &OS, int Indent,
+                         PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
+\ No newline at end of file
diff --git a/lib/DebugInfo/PDB/PDBSymbolExe.cpp b/lib/DebugInfo/PDB/PDBSymbolExe.cpp
new file mode 100644
index 0000000..ef09193
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolExe.cpp
@@ -0,0 +1,25 @@
+//===- PDBSymbolExe.cpp - ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolExe::PDBSymbolExe(const IPDBSession &PDBSession,
+                           std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolExe::dump(raw_ostream &OS, int Indent,
+                        PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolFunc.cpp b/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
new file mode 100644
index 0000000..e2d859f
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
@@ -0,0 +1,104 @@
+//===- PDBSymbolFunc.cpp - --------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
+
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
+
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+class FunctionArgEnumerator : public IPDBEnumChildren<PDBSymbolData> {
+public:
+  typedef ConcreteSymbolEnumerator<PDBSymbolData> ArgEnumeratorType;
+
+  FunctionArgEnumerator(const IPDBSession &PDBSession,
+                        const PDBSymbolFunc &PDBFunc)
+      : Session(PDBSession), Func(PDBFunc) {
+    // Arguments can appear multiple times if they have live range
+    // information, so we only take the first occurrence.
+    std::unordered_set<std::string> SeenNames;
+    auto DataChildren = Func.findAllChildren<PDBSymbolData>();
+    while (auto Child = DataChildren->getNext()) {
+      if (Child->getDataKind() == PDB_DataKind::Param) {
+        std::string Name = Child->getName();
+        if (SeenNames.find(Name) != SeenNames.end())
+          continue;
+        Args.push_back(std::move(Child));
+        SeenNames.insert(Name);
+      }
+    }
+    reset();
+  }
+
+  uint32_t getChildCount() const { return Args.size(); }
+
+  std::unique_ptr<PDBSymbolData> getChildAtIndex(uint32_t Index) const {
+    if (Index >= Args.size())
+      return nullptr;
+
+    return Session.getConcreteSymbolById<PDBSymbolData>(
+        Args[Index]->getSymIndexId());
+  }
+
+  std::unique_ptr<PDBSymbolData> getNext() {
+    if (CurIter == Args.end())
+      return nullptr;
+    const auto &Result = **CurIter;
+    ++CurIter;
+    return Session.getConcreteSymbolById<PDBSymbolData>(Result.getSymIndexId());
+  }
+
+  void reset() { CurIter = Args.empty() ? Args.end() : Args.begin(); }
+
+  FunctionArgEnumerator *clone() const {
+    return new FunctionArgEnumerator(Session, Func);
+  }
+
+private:
+  typedef std::vector<std::unique_ptr<PDBSymbolData>> ArgListType;
+  const IPDBSession &Session;
+  const PDBSymbolFunc &Func;
+  ArgListType Args;
+  ArgListType::const_iterator CurIter;
+};
+}
+
+PDBSymbolFunc::PDBSymbolFunc(const IPDBSession &PDBSession,
+                             std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+std::unique_ptr<PDBSymbolTypeFunctionSig> PDBSymbolFunc::getSignature() const {
+  return Session.getConcreteSymbolById<PDBSymbolTypeFunctionSig>(getTypeId());
+}
+
+std::unique_ptr<IPDBEnumChildren<PDBSymbolData>>
+PDBSymbolFunc::getArguments() const {
+  return llvm::make_unique<FunctionArgEnumerator>(Session, *this);
+}
+
+std::unique_ptr<PDBSymbolTypeUDT> PDBSymbolFunc::getClassParent() const {
+  return Session.getConcreteSymbolById<PDBSymbolTypeUDT>(getClassParentId());
+}
+
+void PDBSymbolFunc::dump(raw_ostream &OS, int Indent,
+                         PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp b/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
new file mode 100644
index 0000000..c207488
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymbolFuncDebugEnd.cpp - ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolFuncDebugEnd::PDBSymbolFuncDebugEnd(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolFuncDebugEnd::dump(raw_ostream &OS, int Indent,
+                                 PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp b/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
new file mode 100644
index 0000000..83df22e
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymbolFuncDebugStart.cpp - ----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolFuncDebugStart::PDBSymbolFuncDebugStart(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolFuncDebugStart::dump(raw_ostream &OS, int Indent,
+                                   PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolLabel.cpp b/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
new file mode 100644
index 0000000..ce569e2
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
@@ -0,0 +1,25 @@
+//===- PDBSymbolLabel.cpp - -------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolLabel.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolLabel::PDBSymbolLabel(const IPDBSession &PDBSession,
+                               std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolLabel::dump(raw_ostream &OS, int Indent,
+                          PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp b/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
new file mode 100644
index 0000000..a7f156c
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymbolPublicSymbol.cpp - ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolPublicSymbol::PDBSymbolPublicSymbol(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolPublicSymbol::dump(raw_ostream &OS, int Indent,
+                                 PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolThunk.cpp b/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
new file mode 100644
index 0000000..edade83
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
@@ -0,0 +1,25 @@
+//===- PDBSymbolThunk.cpp - -------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolThunk.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolThunk::PDBSymbolThunk(const IPDBSession &PDBSession,
+                               std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolThunk::dump(raw_ostream &OS, int Indent,
+                          PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
new file mode 100644
index 0000000..ffe6c80
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
@@ -0,0 +1,30 @@
+//===- PDBSymbolTypeArray.cpp - ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h"
+
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeArray::PDBSymbolTypeArray(const IPDBSession &PDBSession,
+                                       std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+std::unique_ptr<PDBSymbol> PDBSymbolTypeArray::getElementType() const {
+  return Session.getSymbolById(getTypeId());
+}
+
+void PDBSymbolTypeArray::dump(raw_ostream &OS, int Indent,
+                              PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
new file mode 100644
index 0000000..c44cc52
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymbolTypeBaseClass.cpp - -----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeBaseClass::PDBSymbolTypeBaseClass(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeBaseClass::dump(raw_ostream &OS, int Indent,
+                                  PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
new file mode 100644
index 0000000..f0c94c7
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
@@ -0,0 +1,25 @@
+//===- PDBSymbolTypeBuiltin.cpp - ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeBuiltin::PDBSymbolTypeBuiltin(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeBuiltin::dump(raw_ostream &OS, int Indent,
+                                PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
new file mode 100644
index 0000000..0fa8f45
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymbolTypeCustom.cpp - --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeCustom::PDBSymbolTypeCustom(const IPDBSession &PDBSession,
+                                         std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeCustom::dump(raw_ostream &OS, int Indent,
+                               PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
new file mode 100644
index 0000000..47fb08d
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
@@ -0,0 +1,27 @@
+//===- PDBSymbolTypeDimension.cpp - --------------------------------*- C++
+//-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeDimension::PDBSymbolTypeDimension(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeDimension::dump(raw_ostream &OS, int Indent,
+                                  PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
new file mode 100644
index 0000000..121d41e
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
@@ -0,0 +1,25 @@
+//===- PDBSymbolTypeEnum.cpp - --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeEnum::PDBSymbolTypeEnum(const IPDBSession &PDBSession,
+                                     std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeEnum::dump(raw_ostream &OS, int Indent,
+                             PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
new file mode 100644
index 0000000..b2bf72e
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymbolTypeFriend.cpp - --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeFriend::PDBSymbolTypeFriend(const IPDBSession &PDBSession,
+                                         std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeFriend::dump(raw_ostream &OS, int Indent,
+                               PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
new file mode 100644
index 0000000..f394c04
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
@@ -0,0 +1,25 @@
+//===- PDBSymbolTypeFunctionArg.cpp - --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeFunctionArg::PDBSymbolTypeFunctionArg(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeFunctionArg::dump(raw_ostream &OS, int Indent,
+                                    PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
new file mode 100644
index 0000000..1ba397b
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
@@ -0,0 +1,89 @@
+//===- PDBSymbolTypeFunctionSig.cpp - --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
+
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+class FunctionArgEnumerator : public IPDBEnumSymbols {
+public:
+  typedef ConcreteSymbolEnumerator<PDBSymbolTypeFunctionArg> ArgEnumeratorType;
+
+  FunctionArgEnumerator(const IPDBSession &PDBSession,
+                        const PDBSymbolTypeFunctionSig &Sig)
+      : Session(PDBSession),
+        Enumerator(Sig.findAllChildren<PDBSymbolTypeFunctionArg>()) {}
+
+  FunctionArgEnumerator(const IPDBSession &PDBSession,
+                        std::unique_ptr<ArgEnumeratorType> ArgEnumerator)
+      : Session(PDBSession), Enumerator(std::move(ArgEnumerator)) {}
+
+  uint32_t getChildCount() const { return Enumerator->getChildCount(); }
+
+  std::unique_ptr<PDBSymbol> getChildAtIndex(uint32_t Index) const {
+    auto FunctionArgSymbol = Enumerator->getChildAtIndex(Index);
+    if (!FunctionArgSymbol)
+      return nullptr;
+    return Session.getSymbolById(FunctionArgSymbol->getTypeId());
+  }
+
+  std::unique_ptr<PDBSymbol> getNext() {
+    auto FunctionArgSymbol = Enumerator->getNext();
+    if (!FunctionArgSymbol)
+      return nullptr;
+    return Session.getSymbolById(FunctionArgSymbol->getTypeId());
+  }
+
+  void reset() { Enumerator->reset(); }
+
+  MyType *clone() const {
+    std::unique_ptr<ArgEnumeratorType> Clone(Enumerator->clone());
+    return new FunctionArgEnumerator(Session, std::move(Clone));
+  }
+
+private:
+  const IPDBSession &Session;
+  std::unique_ptr<ArgEnumeratorType> Enumerator;
+};
+}
+
+PDBSymbolTypeFunctionSig::PDBSymbolTypeFunctionSig(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+std::unique_ptr<PDBSymbol> PDBSymbolTypeFunctionSig::getReturnType() const {
+  return Session.getSymbolById(getTypeId());
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+PDBSymbolTypeFunctionSig::getArguments() const {
+  return llvm::make_unique<FunctionArgEnumerator>(Session, *this);
+}
+
+std::unique_ptr<PDBSymbol> PDBSymbolTypeFunctionSig::getClassParent() const {
+  uint32_t ClassId = getClassParentId();
+  if (ClassId == 0)
+    return nullptr;
+  return Session.getSymbolById(ClassId);
+}
+
+void PDBSymbolTypeFunctionSig::dump(raw_ostream &OS, int Indent,
+                                    PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
new file mode 100644
index 0000000..e04fb66
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymboTypelManaged.cpp - ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeManaged::PDBSymbolTypeManaged(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeManaged::dump(raw_ostream &OS, int Indent,
+                                PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp b/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
new file mode 100644
index 0000000..d274bf5
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
@@ -0,0 +1,30 @@
+//===- PDBSymbolTypePointer.cpp -----------------------------------*- C++ -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
+
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypePointer::PDBSymbolTypePointer(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+std::unique_ptr<PDBSymbol> PDBSymbolTypePointer::getPointeeType() const {
+  return Session.getSymbolById(getTypeId());
+}
+
+void PDBSymbolTypePointer::dump(raw_ostream &OS, int Indent,
+                                PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
new file mode 100644
index 0000000..12e3ead
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
@@ -0,0 +1,25 @@
+//===- PDBSymbolTypeTypedef.cpp ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeTypedef::PDBSymbolTypeTypedef(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeTypedef::dump(raw_ostream &OS, int Indent,
+                                PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
new file mode 100644
index 0000000..8a72368
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
@@ -0,0 +1,25 @@
+//===- PDBSymbolTypeUDT.cpp - --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeUDT::PDBSymbolTypeUDT(const IPDBSession &PDBSession,
+                                   std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeUDT::dump(raw_ostream &OS, int Indent,
+                            PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
new file mode 100644
index 0000000..a100526
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
@@ -0,0 +1,25 @@
+//===- PDBSymbolTypeVTable.cpp - --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeVTable::PDBSymbolTypeVTable(const IPDBSession &PDBSession,
+                                         std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeVTable::dump(raw_ostream &OS, int Indent,
+                               PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
new file mode 100644
index 0000000..6aaa668
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymbolTypeVTableShape.cpp - ---------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolTypeVTableShape::PDBSymbolTypeVTableShape(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolTypeVTableShape::dump(raw_ostream &OS, int Indent,
+                                    PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp b/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp
new file mode 100644
index 0000000..9cfb88a
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymbolUnknown.cpp - -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolUnknown::PDBSymbolUnknown(const IPDBSession &PDBSession,
+                                   std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolUnknown::dump(raw_ostream &OS, int Indent,
+                            PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp b/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
new file mode 100644
index 0000000..9176dfb
--- /dev/null
+++ b/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
@@ -0,0 +1,26 @@
+//===- PDBSymbolUsingNamespace.cpp - ------------------- --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+#include <utility>
+
+using namespace llvm;
+
+PDBSymbolUsingNamespace::PDBSymbolUsingNamespace(
+    const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
+    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+
+void PDBSymbolUsingNamespace::dump(raw_ostream &OS, int Indent,
+                                   PDBSymDumper &Dumper) const {
+  Dumper.dump(*this, OS, Indent);
+}
diff --git a/lib/DebugInfo/module.modulemap b/lib/DebugInfo/module.modulemap
deleted file mode 100644
index 1fe5ab1..0000000
--- a/lib/DebugInfo/module.modulemap
+++ /dev/null
@@ -1 +0,0 @@
-module DebugInfo { requires cplusplus umbrella "." module * { export * } }
diff --git a/lib/ExecutionEngine/Android.mk b/lib/ExecutionEngine/Android.mk
index 9f1befd..6578e2b 100644
--- a/lib/ExecutionEngine/Android.mk
+++ b/lib/ExecutionEngine/Android.mk
@@ -7,7 +7,8 @@ include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
 	ExecutionEngineBindings.cpp \
 	ExecutionEngine.cpp \
-	RTDyldMemoryManager.cpp \
+	GDBRegistrationListener.cpp \
+	SectionMemoryManager.cpp \
 	TargetSelect.cpp
 
 LOCAL_MODULE:= libLLVMExecutionEngine
diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt
index fae5bb9..e8a18d3 100644
--- a/lib/ExecutionEngine/CMakeLists.txt
+++ b/lib/ExecutionEngine/CMakeLists.txt
@@ -3,13 +3,17 @@
 add_llvm_library(LLVMExecutionEngine
   ExecutionEngine.cpp
   ExecutionEngineBindings.cpp
-  JITEventListener.cpp
-  RTDyldMemoryManager.cpp
+  GDBRegistrationListener.cpp
+  SectionMemoryManager.cpp
   TargetSelect.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/ExecutionEngine
   )
 
 add_subdirectory(Interpreter)
 add_subdirectory(MCJIT)
+add_subdirectory(Orc)
 add_subdirectory(RuntimeDyld)
 
 if( LLVM_USE_OPROFILE )
diff --git a/lib/ExecutionEngine/EventListenerCommon.h b/lib/ExecutionEngine/EventListenerCommon.h
index 66645d7..6453099 100644
--- a/lib/ExecutionEngine/EventListenerCommon.h
+++ b/lib/ExecutionEngine/EventListenerCommon.h
@@ -26,13 +26,13 @@ namespace jitprofiling {
 
 class FilenameCache {
   // Holds the filename of each Scope, so that we can pass a null-terminated
-  // string into oprofile.  Use an AssertingVH rather than a ValueMap because we
-  // shouldn't be modifying any MDNodes while this map is alive.
-  DenseMap<AssertingVH<MDNode>, std::string> Filenames;
-  DenseMap<AssertingVH<MDNode>, std::string> Paths;
+  // string into oprofile.  
+  DenseMap<const MDNode *, std::string> Filenames;
+  DenseMap<const MDNode *, std::string> Paths;
 
  public:
   const char *getFilename(MDNode *Scope) {
+    assert(Scope->isResolved() && "Expected Scope to be resolved");
     std::string &Filename = Filenames[Scope];
     if (Filename.empty()) {
       DIScope DIScope(Scope);
@@ -42,6 +42,7 @@ class FilenameCache {
   }
 
   const char *getFullPath(MDNode *Scope) {
+    assert(Scope->isResolved() && "Expected Scope to be resolved");
     std::string &P = Paths[Scope];
     if (P.empty()) {
       DIScope DIScope(Scope);
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index 5a6d656..12e0e6a 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -16,8 +16,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
-#include "llvm/ExecutionEngine/ObjectBuffer.h"
-#include "llvm/ExecutionEngine/ObjectCache.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -43,17 +42,20 @@ using namespace llvm;
 STATISTIC(NumInitBytes, "Number of bytes of global vars initialized");
 STATISTIC(NumGlobals  , "Number of global vars initialized");
 
-// Pin the vtable to this file.
-void ObjectCache::anchor() {}
-void ObjectBuffer::anchor() {}
-void ObjectBufferStream::anchor() {}
-
 ExecutionEngine *(*ExecutionEngine::MCJITCtor)(
     std::unique_ptr<Module> M, std::string *ErrorStr,
-    RTDyldMemoryManager *MCJMM, std::unique_ptr<TargetMachine> TM) = nullptr;
+    std::unique_ptr<RTDyldMemoryManager> MCJMM,
+    std::unique_ptr<TargetMachine> TM) = nullptr;
+
+ExecutionEngine *(*ExecutionEngine::OrcMCJITReplacementCtor)(
+  std::string *ErrorStr, std::unique_ptr<RTDyldMemoryManager> OrcJMM,
+  std::unique_ptr<TargetMachine> TM) = nullptr;
+
 ExecutionEngine *(*ExecutionEngine::InterpCtor)(std::unique_ptr<Module> M,
                                                 std::string *ErrorStr) =nullptr;
 
+void JITEventListener::anchor() {}
+
 ExecutionEngine::ExecutionEngine(std::unique_ptr<Module> M)
   : EEState(*this),
     LazyFunctionCreator(nullptr) {
@@ -140,7 +142,8 @@ bool ExecutionEngine::removeModule(Module *M) {
 
 Function *ExecutionEngine::FindFunctionNamed(const char *FnName) {
   for (unsigned i = 0, e = Modules.size(); i != e; ++i) {
-    if (Function *F = Modules[i]->getFunction(FnName))
+    Function *F = Modules[i]->getFunction(FnName);
+    if (F && !F->isDeclaration())
       return F;
   }
   return nullptr;
@@ -396,6 +399,23 @@ int ExecutionEngine::runFunctionAsMain(Function *Fn,
   return runFunction(Fn, GVArgs).IntVal.getZExtValue();
 }
 
+EngineBuilder::EngineBuilder() {
+  InitEngine();
+}
+
+EngineBuilder::EngineBuilder(std::unique_ptr<Module> M)
+  : M(std::move(M)), MCJMM(nullptr) {
+  InitEngine();
+}
+
+EngineBuilder::~EngineBuilder() {}
+
+EngineBuilder &EngineBuilder::setMCJITMemoryManager(
+                                   std::unique_ptr<RTDyldMemoryManager> mcjmm) {
+  MCJMM = std::move(mcjmm);
+  return *this;
+}
+
 void EngineBuilder::InitEngine() {
   WhichEngine = EngineKind::Either;
   ErrorStr = nullptr;
@@ -404,6 +424,7 @@ void EngineBuilder::InitEngine() {
   Options = TargetOptions();
   RelocModel = Reloc::Default;
   CMModel = CodeModel::JITDefault;
+  UseOrcMCJITReplacement = false;
 
 // IR module verification is enabled by default in debug builds, and disabled
 // by default in release builds.
@@ -446,9 +467,14 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
     }
 
     ExecutionEngine *EE = nullptr;
-    if (ExecutionEngine::MCJITCtor)
-      EE = ExecutionEngine::MCJITCtor(std::move(M), ErrorStr, MCJMM,
+    if (ExecutionEngine::OrcMCJITReplacementCtor && UseOrcMCJITReplacement) {
+      EE = ExecutionEngine::OrcMCJITReplacementCtor(ErrorStr, std::move(MCJMM),
+                                                    std::move(TheTM));
+      EE->addModule(std::move(M));
+    } else if (ExecutionEngine::MCJITCtor)
+      EE = ExecutionEngine::MCJITCtor(std::move(M), ErrorStr, std::move(MCJMM),
                                       std::move(TheTM));
+
     if (EE) {
       EE->setVerifyModules(VerifyModules);
       return EE;
diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index 58271df..aaa53f0 100644
--- a/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -188,7 +188,8 @@ LLVMBool LLVMCreateMCJITCompilerForModule(
          .setCodeModel(unwrap(options.CodeModel))
          .setTargetOptions(targetOptions);
   if (options.MCJMM)
-    builder.setMCJITMemoryManager(unwrap(options.MCJMM));
+    builder.setMCJITMemoryManager(
+      std::unique_ptr<RTDyldMemoryManager>(unwrap(options.MCJMM)));
   if (ExecutionEngine *JIT = builder.create()) {
     *OutJIT = wrap(JIT);
     return 0;
@@ -327,6 +328,14 @@ void *LLVMGetPointerToGlobal(LLVMExecutionEngineRef EE, LLVMValueRef Global) {
   return unwrap(EE)->getPointerToGlobal(unwrap<GlobalValue>(Global));
 }
 
+uint64_t LLVMGetGlobalValueAddress(LLVMExecutionEngineRef EE, const char *Name) {
+  return unwrap(EE)->getGlobalValueAddress(Name);
+}
+
+uint64_t LLVMGetFunctionAddress(LLVMExecutionEngineRef EE, const char *Name) {
+  return unwrap(EE)->getFunctionAddress(Name);
+}
+
 /*===-- Operations on memory managers -------------------------------------===*/
 
 namespace {
diff --git a/lib/ExecutionEngine/GDBRegistrationListener.cpp b/lib/ExecutionEngine/GDBRegistrationListener.cpp
new file mode 100644
index 0000000..8ef878c
--- /dev/null
+++ b/lib/ExecutionEngine/GDBRegistrationListener.cpp
@@ -0,0 +1,247 @@
+//===----- GDBRegistrationListener.cpp - Registers objects with GDB -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/MutexGuard.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+// This must be kept in sync with gdb/gdb/jit.h .
+extern "C" {
+
+  typedef enum {
+    JIT_NOACTION = 0,
+    JIT_REGISTER_FN,
+    JIT_UNREGISTER_FN
+  } jit_actions_t;
+
+  struct jit_code_entry {
+    struct jit_code_entry *next_entry;
+    struct jit_code_entry *prev_entry;
+    const char *symfile_addr;
+    uint64_t symfile_size;
+  };
+
+  struct jit_descriptor {
+    uint32_t version;
+    // This should be jit_actions_t, but we want to be specific about the
+    // bit-width.
+    uint32_t action_flag;
+    struct jit_code_entry *relevant_entry;
+    struct jit_code_entry *first_entry;
+  };
+
+  // We put information about the JITed function in this global, which the
+  // debugger reads.  Make sure to specify the version statically, because the
+  // debugger checks the version before we can set it during runtime.
+  struct jit_descriptor __jit_debug_descriptor = { 1, 0, nullptr, nullptr };
+
+  // Debuggers puts a breakpoint in this function.
+  LLVM_ATTRIBUTE_NOINLINE void __jit_debug_register_code() {
+    // The noinline and the asm prevent calls to this function from being
+    // optimized out.
+#if !defined(_MSC_VER)
+    asm volatile("":::"memory");
+#endif
+  }
+
+}
+
+namespace {
+
+struct RegisteredObjectInfo {
+  RegisteredObjectInfo() {}
+
+  RegisteredObjectInfo(std::size_t Size, jit_code_entry *Entry,
+                       OwningBinary<ObjectFile> Obj)
+    : Size(Size), Entry(Entry), Obj(std::move(Obj)) {}
+
+  RegisteredObjectInfo(RegisteredObjectInfo &&Other)
+    : Size(Other.Size), Entry(Other.Entry), Obj(std::move(Other.Obj)) {}
+
+  RegisteredObjectInfo& operator=(RegisteredObjectInfo &&Other) {
+    Size = Other.Size;
+    Entry = Other.Entry;
+    Obj = std::move(Other.Obj);
+    return *this;
+  }
+
+  std::size_t Size;
+  jit_code_entry *Entry;
+  OwningBinary<ObjectFile> Obj;
+};
+
+// Buffer for an in-memory object file in executable memory
+typedef llvm::DenseMap< const char*, RegisteredObjectInfo>
+  RegisteredObjectBufferMap;
+
+/// Global access point for the JIT debugging interface designed for use with a
+/// singleton toolbox. Handles thread-safe registration and deregistration of
+/// object files that are in executable memory managed by the client of this
+/// class.
+class GDBJITRegistrationListener : public JITEventListener {
+  /// A map of in-memory object files that have been registered with the
+  /// JIT interface.
+  RegisteredObjectBufferMap ObjectBufferMap;
+
+public:
+  /// Instantiates the JIT service.
+  GDBJITRegistrationListener() : ObjectBufferMap() {}
+
+  /// Unregisters each object that was previously registered and releases all
+  /// internal resources.
+  virtual ~GDBJITRegistrationListener();
+
+  /// Creates an entry in the JIT registry for the buffer @p Object,
+  /// which must contain an object file in executable memory with any
+  /// debug information for the debugger.
+  void NotifyObjectEmitted(const ObjectFile &Object,
+                           const RuntimeDyld::LoadedObjectInfo &L) override;
+
+  /// Removes the internal registration of @p Object, and
+  /// frees associated resources.
+  /// Returns true if @p Object was found in ObjectBufferMap.
+  void NotifyFreeingObject(const ObjectFile &Object) override;
+
+private:
+  /// Deregister the debug info for the given object file from the debugger
+  /// and delete any temporary copies.  This private method does not remove
+  /// the function from Map so that it can be called while iterating over Map.
+  void deregisterObjectInternal(RegisteredObjectBufferMap::iterator I);
+};
+
+/// Lock used to serialize all jit registration events, since they
+/// modify global variables.
+ManagedStatic<sys::Mutex> JITDebugLock;
+
+/// Do the registration.
+void NotifyDebugger(jit_code_entry* JITCodeEntry) {
+  __jit_debug_descriptor.action_flag = JIT_REGISTER_FN;
+
+  // Insert this entry at the head of the list.
+  JITCodeEntry->prev_entry = nullptr;
+  jit_code_entry* NextEntry = __jit_debug_descriptor.first_entry;
+  JITCodeEntry->next_entry = NextEntry;
+  if (NextEntry) {
+    NextEntry->prev_entry = JITCodeEntry;
+  }
+  __jit_debug_descriptor.first_entry = JITCodeEntry;
+  __jit_debug_descriptor.relevant_entry = JITCodeEntry;
+  __jit_debug_register_code();
+}
+
+GDBJITRegistrationListener::~GDBJITRegistrationListener() {
+  // Free all registered object files.
+  llvm::MutexGuard locked(*JITDebugLock);
+  for (RegisteredObjectBufferMap::iterator I = ObjectBufferMap.begin(),
+                                           E = ObjectBufferMap.end();
+       I != E; ++I) {
+    // Call the private method that doesn't update the map so our iterator
+    // doesn't break.
+    deregisterObjectInternal(I);
+  }
+  ObjectBufferMap.clear();
+}
+
+void GDBJITRegistrationListener::NotifyObjectEmitted(
+                                       const ObjectFile &Object,
+                                       const RuntimeDyld::LoadedObjectInfo &L) {
+
+  OwningBinary<ObjectFile> DebugObj = L.getObjectForDebug(Object);
+
+  // Bail out if debug objects aren't supported.
+  if (!DebugObj.getBinary())
+    return;
+
+  const char *Buffer = DebugObj.getBinary()->getMemoryBufferRef().getBufferStart();
+  size_t      Size = DebugObj.getBinary()->getMemoryBufferRef().getBufferSize();
+
+  const char *Key = Object.getMemoryBufferRef().getBufferStart();
+
+  assert(Key && "Attempt to register a null object with a debugger.");
+  llvm::MutexGuard locked(*JITDebugLock);
+  assert(ObjectBufferMap.find(Key) == ObjectBufferMap.end() &&
+         "Second attempt to perform debug registration.");
+  jit_code_entry* JITCodeEntry = new jit_code_entry();
+
+  if (!JITCodeEntry) {
+    llvm::report_fatal_error(
+      "Allocation failed when registering a JIT entry!\n");
+  } else {
+    JITCodeEntry->symfile_addr = Buffer;
+    JITCodeEntry->symfile_size = Size;
+
+    ObjectBufferMap[Key] = RegisteredObjectInfo(Size, JITCodeEntry,
+                                                std::move(DebugObj));
+    NotifyDebugger(JITCodeEntry);
+  }
+}
+
+void GDBJITRegistrationListener::NotifyFreeingObject(const ObjectFile& Object) {
+  const char *Key = Object.getMemoryBufferRef().getBufferStart();
+  llvm::MutexGuard locked(*JITDebugLock);
+  RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(Key);
+
+  if (I != ObjectBufferMap.end()) {
+    deregisterObjectInternal(I);
+    ObjectBufferMap.erase(I);
+  }
+}
+
+void GDBJITRegistrationListener::deregisterObjectInternal(
+    RegisteredObjectBufferMap::iterator I) {
+
+  jit_code_entry*& JITCodeEntry = I->second.Entry;
+
+  // Do the unregistration.
+  {
+    __jit_debug_descriptor.action_flag = JIT_UNREGISTER_FN;
+
+    // Remove the jit_code_entry from the linked list.
+    jit_code_entry* PrevEntry = JITCodeEntry->prev_entry;
+    jit_code_entry* NextEntry = JITCodeEntry->next_entry;
+
+    if (NextEntry) {
+      NextEntry->prev_entry = PrevEntry;
+    }
+    if (PrevEntry) {
+      PrevEntry->next_entry = NextEntry;
+    }
+    else {
+      assert(__jit_debug_descriptor.first_entry == JITCodeEntry);
+      __jit_debug_descriptor.first_entry = NextEntry;
+    }
+
+    // Tell the debugger which entry we removed, and unregister the code.
+    __jit_debug_descriptor.relevant_entry = JITCodeEntry;
+    __jit_debug_register_code();
+  }
+
+  delete JITCodeEntry;
+  JITCodeEntry = nullptr;
+}
+
+llvm::ManagedStatic<GDBJITRegistrationListener> GDBRegListener;
+
+} // end namespace
+
+namespace llvm {
+
+JITEventListener* JITEventListener::createGDBRegistrationListener() {
+  return &*GDBRegListener;
+}
+
+} // namespace llvm
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index b23ca88..aa32452 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -13,25 +13,24 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Config/config.h"
+#include "EventListenerCommon.h"
+#include "IntelJITEventsWrapper.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/DebugInfo/DWARF/DIContext.h"
 #include "llvm/ExecutionEngine/JITEventListener.h"
-
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Metadata.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/DebugInfo/DIContext.h"
-#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Errno.h"
-#include "llvm/IR/ValueHandle.h"
-#include "EventListenerCommon.h"
-#include "IntelJITEventsWrapper.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 using namespace llvm::jitprofiling;
+using namespace llvm::object;
 
 #define DEBUG_TYPE "amplifier-jit-event-listener"
 
@@ -48,6 +47,7 @@ class IntelJITEventListener : public JITEventListener {
   typedef DenseMap<const void *, MethodAddressVector>  ObjectMap;
 
   ObjectMap  LoadedObjectMap;
+  std::map<const char*, OwningBinary<ObjectFile>> DebugObjects;
 
 public:
   IntelJITEventListener(IntelJITEventsWrapper* libraryWrapper) {
@@ -57,9 +57,10 @@ public:
   ~IntelJITEventListener() {
   }
 
-  virtual void NotifyObjectEmitted(const ObjectImage &Obj);
+  void NotifyObjectEmitted(const ObjectFile &Obj,
+                           const RuntimeDyld::LoadedObjectInfo &L) override;
 
-  virtual void NotifyFreeingObject(const ObjectImage &Obj);
+  void NotifyFreeingObject(const ObjectFile &Obj) override;
 };
 
 static LineNumberInfo DILineInfoToIntelJITFormat(uintptr_t StartAddress,
@@ -95,23 +96,29 @@ static iJIT_Method_Load FunctionDescToIntelJITFormat(
   return Result;
 }
 
-void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
+void IntelJITEventListener::NotifyObjectEmitted(
+                                       const ObjectFile &Obj,
+                                       const RuntimeDyld::LoadedObjectInfo &L) {
+
+  OwningBinary<ObjectFile> DebugObjOwner = L.getObjectForDebug(Obj);
+  const ObjectFile &DebugObj = *DebugObjOwner.getBinary();
+
   // Get the address of the object image for use as a unique identifier
-  const void* ObjData = Obj.getData().data();
-  DIContext* Context = DIContext::getDWARFContext(*Obj.getObjectFile());
+  const void* ObjData = DebugObj.getData().data();
+  DIContext* Context = DIContext::getDWARFContext(DebugObj);
   MethodAddressVector Functions;
 
   // Use symbol info to iterate functions in the object.
-  for (object::symbol_iterator I = Obj.begin_symbols(),
-                               E = Obj.end_symbols();
+  for (symbol_iterator I = DebugObj.symbol_begin(),
+                       E = DebugObj.symbol_end();
                         I != E;
                         ++I) {
     std::vector<LineNumberInfo> LineInfo;
     std::string SourceFileName;
 
-    object::SymbolRef::Type SymType;
+    SymbolRef::Type SymType;
     if (I->getType(SymType)) continue;
-    if (SymType == object::SymbolRef::ST_Function) {
+    if (SymType == SymbolRef::ST_Function) {
       StringRef  Name;
       uint64_t   Addr;
       uint64_t   Size;
@@ -141,6 +148,18 @@ void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
           FunctionMessage.line_number_size = 0;
           FunctionMessage.line_number_table = 0;
         } else {
+          // Source line information for the address range is provided as 
+          // a code offset for the start of the corresponding sub-range and
+          // a source line. JIT API treats offsets in LineNumberInfo structures
+          // as the end of the corresponding code region. The start of the code
+          // is taken from the previous element. Need to shift the elements.
+
+          LineNumberInfo last = LineInfo.back();
+          last.Offset = FunctionMessage.method_size;
+          LineInfo.push_back(last);
+          for (size_t i = LineInfo.size() - 2; i > 0; --i)
+            LineInfo[i].LineNumber = LineInfo[i - 1].LineNumber;
+
           SourceFileName = Lines.front().second.FileName;
           FunctionMessage.source_file_name = const_cast<char *>(SourceFileName.c_str());
           FunctionMessage.line_number_size = LineInfo.size();
@@ -162,11 +181,18 @@ void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
   // registered function addresses for each loaded object.  We will
   // use the MethodIDs map to get the registered ID for each function.
   LoadedObjectMap[ObjData] = Functions;
+  DebugObjects[Obj.getData().data()] = std::move(DebugObjOwner);
 }
 
-void IntelJITEventListener::NotifyFreeingObject(const ObjectImage &Obj) {
+void IntelJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
+  // This object may not have been registered with the listener. If it wasn't,
+  // bail out.
+  if (DebugObjects.find(Obj.getData().data()) == DebugObjects.end())
+    return;
+
   // Get the address of the object image for use as a unique identifier
-  const void* ObjData = Obj.getData().data();
+  const ObjectFile &DebugObj = *DebugObjects[Obj.getData().data()].getBinary();
+  const void* ObjData = DebugObj.getData().data();
 
   // Get the object's function list from LoadedObjectMap
   ObjectMap::iterator OI = LoadedObjectMap.find(ObjData);
@@ -190,6 +216,7 @@ void IntelJITEventListener::NotifyFreeingObject(const ObjectImage &Obj) {
 
   // Erase the object from LoadedObjectMap
   LoadedObjectMap.erase(OI);
+  DebugObjects.erase(Obj.getData().data());
 }
 
 }  // anonymous namespace.
diff --git a/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt
index e36493e..1247cbd 100644
--- a/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt
+++ b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt
@@ -21,4 +21,4 @@
 type = OptionalLibrary
 name = IntelJITEvents
 parent = ExecutionEngine
-required_libraries = Core DebugInfo Support
+required_libraries = Core DebugInfoDWARF Support
diff --git a/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c b/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
index 7b507de..e966889 100644
--- a/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
+++ b/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
@@ -24,6 +24,7 @@
 #else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
 #include <pthread.h>
 #include <dlfcn.h>
+#include <stdint.h>
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
 #include <malloc.h>
 #include <stdlib.h>
@@ -371,7 +372,7 @@ static int loadiJIT_Funcs()
 #if ITT_PLATFORM==ITT_PLATFORM_WIN
     FUNC_NotifyEvent = (TPNotify)GetProcAddress(m_libHandle, "NotifyEvent");
 #else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-    FUNC_NotifyEvent = (TPNotify)dlsym(m_libHandle, "NotifyEvent");
+    FUNC_NotifyEvent = (TPNotify)(intptr_t)dlsym(m_libHandle, "NotifyEvent");
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
     if (!FUNC_NotifyEvent) 
     {
@@ -382,7 +383,7 @@ static int loadiJIT_Funcs()
 #if ITT_PLATFORM==ITT_PLATFORM_WIN
     FUNC_Initialize = (TPInitialize)GetProcAddress(m_libHandle, "Initialize");
 #else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-    FUNC_Initialize = (TPInitialize)dlsym(m_libHandle, "Initialize");
+    FUNC_Initialize = (TPInitialize)(intptr_t)dlsym(m_libHandle, "Initialize");
 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
     if (!FUNC_Initialize) 
     {
diff --git a/lib/ExecutionEngine/JITEventListener.cpp b/lib/ExecutionEngine/JITEventListener.cpp
deleted file mode 100644
index 2a6a007..0000000
--- a/lib/ExecutionEngine/JITEventListener.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-//===-- JITEventListener.cpp ----------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ExecutionEngine/JITEventListener.h"
-
-using namespace llvm;
-
-// Out-of-line definition of the virtual destructor as this is the key function.
-JITEventListener::~JITEventListener() {}
diff --git a/lib/ExecutionEngine/LLVMBuild.txt b/lib/ExecutionEngine/LLVMBuild.txt
index ecae078..8fdda9a 100644
--- a/lib/ExecutionEngine/LLVMBuild.txt
+++ b/lib/ExecutionEngine/LLVMBuild.txt
@@ -16,10 +16,10 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = Interpreter MCJIT RuntimeDyld IntelJITEvents OProfileJIT
+subdirectories = Interpreter MCJIT RuntimeDyld IntelJITEvents OProfileJIT Orc
 
 [component_0]
 type = Library
 name = ExecutionEngine
 parent = Libraries
-required_libraries = Core MC Support
+required_libraries = Core MC Object Support RuntimeDyld
diff --git a/lib/ExecutionEngine/MCJIT/Android.mk b/lib/ExecutionEngine/MCJIT/Android.mk
index 0314958..5827212 100644
--- a/lib/ExecutionEngine/MCJIT/Android.mk
+++ b/lib/ExecutionEngine/MCJIT/Android.mk
@@ -4,9 +4,8 @@ LOCAL_PATH:= $(call my-dir)
 # =====================================================
 include $(CLEAR_VARS)
 
-LOCAL_SRC_FILES :=	\
-	MCJIT.cpp \
-	SectionMemoryManager.cpp
+LOCAL_SRC_FILES := \
+  MCJIT.cpp
 
 LOCAL_MODULE:= libLLVMMCJIT
 
diff --git a/lib/ExecutionEngine/MCJIT/CMakeLists.txt b/lib/ExecutionEngine/MCJIT/CMakeLists.txt
index 088635a..2911a50 100644
--- a/lib/ExecutionEngine/MCJIT/CMakeLists.txt
+++ b/lib/ExecutionEngine/MCJIT/CMakeLists.txt
@@ -1,4 +1,3 @@
 add_llvm_library(LLVMMCJIT
   MCJIT.cpp
-  SectionMemoryManager.cpp
   )
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index da5f037..e500d3d 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -11,26 +11,25 @@
 #include "llvm/ExecutionEngine/GenericValue.h"
 #include "llvm/ExecutionEngine/JITEventListener.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
-#include "llvm/ExecutionEngine/ObjectBuffer.h"
-#include "llvm/ExecutionEngine/ObjectImage.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Object/Archive.h"
-#include "llvm/PassManager.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/MutexGuard.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
+void ObjectCache::anchor() {}
+
 namespace {
 
 static struct RegisterJIT {
@@ -44,21 +43,24 @@ extern "C" void LLVMLinkInMCJIT() {
 
 ExecutionEngine *MCJIT::createJIT(std::unique_ptr<Module> M,
                                   std::string *ErrorStr,
-                                  RTDyldMemoryManager *MemMgr,
+                                  std::unique_ptr<RTDyldMemoryManager> MemMgr,
                                   std::unique_ptr<TargetMachine> TM) {
   // Try to register the program as a source of symbols to resolve against.
   //
   // FIXME: Don't do this here.
   sys::DynamicLibrary::LoadLibraryPermanently(nullptr, nullptr);
 
-  return new MCJIT(std::move(M), std::move(TM),
-                   MemMgr ? MemMgr : new SectionMemoryManager());
+  std::unique_ptr<RTDyldMemoryManager> MM = std::move(MemMgr);
+  if (!MM)
+    MM = std::unique_ptr<SectionMemoryManager>(new SectionMemoryManager());
+
+  return new MCJIT(std::move(M), std::move(TM), std::move(MM));
 }
 
 MCJIT::MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> tm,
-             RTDyldMemoryManager *MM)
+             std::unique_ptr<RTDyldMemoryManager> MM)
     : ExecutionEngine(std::move(M)), TM(std::move(tm)), Ctx(nullptr),
-      MemMgr(this, MM), Dyld(&MemMgr), ObjCache(nullptr) {
+      MemMgr(this, std::move(MM)), Dyld(&MemMgr), ObjCache(nullptr) {
   // FIXME: We are managing our modules, so we do not want the base class
   // ExecutionEngine to manage them as well. To avoid double destruction
   // of the first (and only) module added in ExecutionEngine constructor
@@ -73,7 +75,8 @@ MCJIT::MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> tm,
   Modules.clear();
 
   OwnedModules.addModule(std::move(First));
-  setDataLayout(TM->getSubtargetImpl()->getDataLayout());
+  setDataLayout(TM->getDataLayout());
+  RegisterJITEventListener(JITEventListener::createGDBRegistrationListener());
 }
 
 MCJIT::~MCJIT() {
@@ -99,13 +102,13 @@ bool MCJIT::removeModule(Module *M) {
 }
 
 void MCJIT::addObjectFile(std::unique_ptr<object::ObjectFile> Obj) {
-  std::unique_ptr<ObjectImage> LoadedObject = Dyld.loadObject(std::move(Obj));
-  if (!LoadedObject || Dyld.hasError())
+  std::unique_ptr<RuntimeDyld::LoadedObjectInfo> L = Dyld.loadObject(*Obj);
+  if (Dyld.hasError())
     report_fatal_error(Dyld.getErrorString());
 
-  NotifyObjectEmitted(*LoadedObject);
+  NotifyObjectEmitted(*Obj, *L);
 
-  LoadedObjects.push_back(std::move(LoadedObject));
+  LoadedObjects.push_back(std::move(Obj));
 }
 
 void MCJIT::addObjectFile(object::OwningBinary<object::ObjectFile> Obj) {
@@ -125,43 +128,45 @@ void MCJIT::setObjectCache(ObjectCache* NewCache) {
   ObjCache = NewCache;
 }
 
-std::unique_ptr<ObjectBufferStream> MCJIT::emitObject(Module *M) {
+std::unique_ptr<MemoryBuffer> MCJIT::emitObject(Module *M) {
   MutexGuard locked(lock);
 
   // This must be a module which has already been added but not loaded to this
   // MCJIT instance, since these conditions are tested by our caller,
   // generateCodeForModule.
 
-  PassManager PM;
+  legacy::PassManager PM;
 
-  M->setDataLayout(TM->getSubtargetImpl()->getDataLayout());
+  M->setDataLayout(TM->getDataLayout());
   PM.add(new DataLayoutPass());
 
   // The RuntimeDyld will take ownership of this shortly
-  std::unique_ptr<ObjectBufferStream> CompiledObject(new ObjectBufferStream());
+  SmallVector<char, 4096> ObjBufferSV;
+  raw_svector_ostream ObjStream(ObjBufferSV);
 
   // Turn the machine code intermediate representation into bytes in memory
   // that may be executed.
-  if (TM->addPassesToEmitMC(PM, Ctx, CompiledObject->getOStream(),
-                            !getVerifyModules())) {
+  if (TM->addPassesToEmitMC(PM, Ctx, ObjStream, !getVerifyModules()))
     report_fatal_error("Target does not support MC emission!");
-  }
 
   // Initialize passes.
   PM.run(*M);
   // Flush the output buffer to get the generated code into memory
-  CompiledObject->flush();
+  ObjStream.flush();
+
+  std::unique_ptr<MemoryBuffer> CompiledObjBuffer(
+                                new ObjectMemoryBuffer(std::move(ObjBufferSV)));
 
   // If we have an object cache, tell it about the new object.
   // Note that we're using the compiled image, not the loaded image (as below).
   if (ObjCache) {
     // MemoryBuffer is a thin wrapper around the actual memory, so it's OK
     // to create a temporary object here and delete it after the call.
-    MemoryBufferRef MB = CompiledObject->getMemBuffer();
+    MemoryBufferRef MB = CompiledObjBuffer->getMemBufferRef();
     ObjCache->notifyObjectCompiled(M, MB);
   }
 
-  return CompiledObject;
+  return CompiledObjBuffer;
 }
 
 void MCJIT::generateCodeForModule(Module *M) {
@@ -176,14 +181,10 @@ void MCJIT::generateCodeForModule(Module *M) {
   if (OwnedModules.hasModuleBeenLoaded(M))
     return;
 
-  std::unique_ptr<ObjectBuffer> ObjectToLoad;
+  std::unique_ptr<MemoryBuffer> ObjectToLoad;
   // Try to load the pre-compiled object from cache if possible
-  if (ObjCache) {
-    if (std::unique_ptr<MemoryBuffer> PreCompiledObject =
-            ObjCache->getObject(M))
-      ObjectToLoad =
-          llvm::make_unique<ObjectBuffer>(std::move(PreCompiledObject));
-  }
+  if (ObjCache)
+    ObjectToLoad = ObjCache->getObject(M);
 
   // If the cache did not contain a suitable object, compile the object
   if (!ObjectToLoad) {
@@ -193,17 +194,18 @@ void MCJIT::generateCodeForModule(Module *M) {
 
   // Load the object into the dynamic linker.
   // MCJIT now owns the ObjectImage pointer (via its LoadedObjects list).
-  std::unique_ptr<ObjectImage> LoadedObject =
-      Dyld.loadObject(std::move(ObjectToLoad));
-  if (!LoadedObject)
-    report_fatal_error(Dyld.getErrorString());
+  ErrorOr<std::unique_ptr<object::ObjectFile>> LoadedObject =
+    object::ObjectFile::createObjectFile(ObjectToLoad->getMemBufferRef());
+  std::unique_ptr<RuntimeDyld::LoadedObjectInfo> L =
+    Dyld.loadObject(*LoadedObject.get());
 
-  // FIXME: Make this optional, maybe even move it to a JIT event listener
-  LoadedObject->registerWithDebugger();
+  if (Dyld.hasError())
+    report_fatal_error(Dyld.getErrorString());
 
-  NotifyObjectEmitted(*LoadedObject);
+  NotifyObjectEmitted(*LoadedObject.get(), *L);
 
-  LoadedObjects.push_back(std::move(LoadedObject));
+  Buffers.push_back(std::move(ObjectToLoad));
+  LoadedObjects.push_back(std::move(*LoadedObject));
 
   OwnedModules.markModuleAsLoaded(M);
 }
@@ -253,7 +255,7 @@ void MCJIT::finalizeModule(Module *M) {
 }
 
 uint64_t MCJIT::getExistingSymbolAddress(const std::string &Name) {
-  Mangler Mang(TM->getSubtargetImpl()->getDataLayout());
+  Mangler Mang(TM->getDataLayout());
   SmallString<128> FullName;
   Mang.getNameWithPrefix(FullName, Name);
   return Dyld.getSymbolLoadAddress(FullName);
@@ -353,7 +355,7 @@ uint64_t MCJIT::getFunctionAddress(const std::string &Name) {
 void *MCJIT::getPointerToFunction(Function *F) {
   MutexGuard locked(lock);
 
-  Mangler Mang(TM->getSubtargetImpl()->getDataLayout());
+  Mangler Mang(TM->getDataLayout());
   SmallString<128> Name;
   TM->getNameWithPrefix(Name, F, Mang);
 
@@ -406,7 +408,8 @@ Function *MCJIT::FindFunctionNamedInModulePtrSet(const char *FnName,
                                                  ModulePtrSet::iterator I,
                                                  ModulePtrSet::iterator E) {
   for (; I != E; ++I) {
-    if (Function *F = (*I)->getFunction(FnName))
+    Function *F = (*I)->getFunction(FnName);
+    if (F && !F->isDeclaration())
       return F;
   }
   return nullptr;
@@ -549,6 +552,7 @@ void MCJIT::RegisterJITEventListener(JITEventListener *L) {
   MutexGuard locked(lock);
   EventListeners.push_back(L);
 }
+
 void MCJIT::UnregisterJITEventListener(JITEventListener *L) {
   if (!L)
     return;
@@ -559,14 +563,17 @@ void MCJIT::UnregisterJITEventListener(JITEventListener *L) {
     EventListeners.pop_back();
   }
 }
-void MCJIT::NotifyObjectEmitted(const ObjectImage& Obj) {
+
+void MCJIT::NotifyObjectEmitted(const object::ObjectFile& Obj,
+                                const RuntimeDyld::LoadedObjectInfo &L) {
   MutexGuard locked(lock);
-  MemMgr.notifyObjectLoaded(this, &Obj);
+  MemMgr.notifyObjectLoaded(this, Obj);
   for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
-    EventListeners[I]->NotifyObjectEmitted(Obj);
+    EventListeners[I]->NotifyObjectEmitted(Obj, L);
   }
 }
-void MCJIT::NotifyFreeingObject(const ObjectImage& Obj) {
+
+void MCJIT::NotifyFreeingObject(const object::ObjectFile& Obj) {
   MutexGuard locked(lock);
   for (JITEventListener *L : EventListeners)
     L->NotifyFreeingObject(Obj);
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index bc943b9..de4a8f6 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -15,7 +15,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/ObjectCache.h"
-#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
 #include "llvm/ExecutionEngine/RuntimeDyld.h"
 #include "llvm/IR/Module.h"
 
@@ -28,8 +28,9 @@ class MCJIT;
 // to that object.
 class LinkingMemoryManager : public RTDyldMemoryManager {
 public:
-  LinkingMemoryManager(MCJIT *Parent, RTDyldMemoryManager *MM)
-    : ParentEngine(Parent), ClientMM(MM) {}
+  LinkingMemoryManager(MCJIT *Parent,
+                       std::unique_ptr<RTDyldMemoryManager> MM)
+    : ParentEngine(Parent), ClientMM(std::move(MM)) {}
 
   uint64_t getSymbolAddress(const std::string &Name) override;
 
@@ -57,7 +58,7 @@ public:
   }
 
   void notifyObjectLoaded(ExecutionEngine *EE,
-                          const ObjectImage *Obj) override {
+                          const object::ObjectFile &Obj) override {
     ClientMM->notifyObjectLoaded(EE, Obj);
   }
 
@@ -102,7 +103,7 @@ private:
 
 class MCJIT : public ExecutionEngine {
   MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> tm,
-        RTDyldMemoryManager *MemMgr);
+        std::unique_ptr<RTDyldMemoryManager> MemMgr);
 
   typedef llvm::SmallPtrSet<Module *, 4> ModulePtrSet;
 
@@ -222,7 +223,7 @@ class MCJIT : public ExecutionEngine {
   SmallVector<object::OwningBinary<object::Archive>, 2> Archives;
   SmallVector<std::unique_ptr<MemoryBuffer>, 2> Buffers;
 
-  SmallVector<std::unique_ptr<ObjectImage>, 2> LoadedObjects;
+  SmallVector<std::unique_ptr<object::ObjectFile>, 2> LoadedObjects;
 
   // An optional ObjectCache to be notified of compiled objects and used to
   // perform lookup of pre-compiled code to avoid re-compilation.
@@ -325,7 +326,7 @@ public:
 
   static ExecutionEngine *createJIT(std::unique_ptr<Module> M,
                                     std::string *ErrorStr,
-                                    RTDyldMemoryManager *MemMgr,
+                                    std::unique_ptr<RTDyldMemoryManager> MemMgr,
                                     std::unique_ptr<TargetMachine> TM);
 
   // @}
@@ -341,10 +342,11 @@ protected:
   /// this function call is expected to be the contained module.  The module
   /// is passed as a parameter here to prepare for multiple module support in
   /// the future.
-  std::unique_ptr<ObjectBufferStream> emitObject(Module *M);
+  std::unique_ptr<MemoryBuffer> emitObject(Module *M);
 
-  void NotifyObjectEmitted(const ObjectImage& Obj);
-  void NotifyFreeingObject(const ObjectImage& Obj);
+  void NotifyObjectEmitted(const object::ObjectFile& Obj,
+                           const RuntimeDyld::LoadedObjectInfo &L);
+  void NotifyFreeingObject(const object::ObjectFile& Obj);
 
   uint64_t getExistingSymbolAddress(const std::string &Name);
   Module *findModuleForSymbol(const std::string &Name,
diff --git a/lib/ExecutionEngine/MCJIT/ObjectBuffer.h b/lib/ExecutionEngine/MCJIT/ObjectBuffer.h
new file mode 100644
index 0000000..92310f3
--- /dev/null
+++ b/lib/ExecutionEngine/MCJIT/ObjectBuffer.h
@@ -0,0 +1,48 @@
+//===--- ObjectBuffer.h - Utility class to wrap object memory ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a wrapper class to hold the memory into which an
+// object will be generated.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_OBJECTBUFFER_H
+#define LLVM_EXECUTIONENGINE_OBJECTBUFFER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class ObjectMemoryBuffer : public MemoryBuffer {
+public:
+  template <unsigned N>
+  ObjectMemoryBuffer(SmallVector<char, N> SV)
+    : SV(SV), BufferName("<in-memory object>") {
+    init(this->SV.begin(), this->SV.end(), false);
+  }
+
+  template <unsigned N>
+  ObjectMemoryBuffer(SmallVector<char, N> SV, StringRef Name)
+    : SV(SV), BufferName(Name) {
+    init(this->SV.begin(), this->SV.end(), false);
+  }
+  const char* getBufferIdentifier() const override { return BufferName.c_str(); }
+
+  BufferKind getBufferKind() const override { return MemoryBuffer_Malloc; }
+
+private:
+  SmallVector<char, 4096> SV;
+  std::string BufferName;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/ExecutionEngine/Makefile b/lib/ExecutionEngine/Makefile
index cf71432..e9a5b79 100644
--- a/lib/ExecutionEngine/Makefile
+++ b/lib/ExecutionEngine/Makefile
@@ -11,7 +11,7 @@ LIBRARYNAME = LLVMExecutionEngine
 
 include $(LEVEL)/Makefile.config
 
-PARALLEL_DIRS = Interpreter MCJIT RuntimeDyld
+PARALLEL_DIRS = Interpreter MCJIT Orc RuntimeDyld
 
 ifeq ($(USE_INTEL_JITEVENTS), 1)
 PARALLEL_DIRS += IntelJITEvents
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
index 5a8ccb6..9ab4003 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
@@ -13,49 +13,50 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Config/config.h"
+#include "EventListenerCommon.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/ExecutionEngine/JITEventListener.h"
-
+#include "llvm/ExecutionEngine/OProfileWrapper.h"
+#include "llvm/ExecutionEngine/RuntimeDyld.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Function.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/ExecutionEngine/ObjectImage.h"
-#include "llvm/ExecutionEngine/OProfileWrapper.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Errno.h"
-#include "EventListenerCommon.h"
-
+#include "llvm/Support/raw_ostream.h"
 #include <dirent.h>
 #include <fcntl.h>
 
 using namespace llvm;
 using namespace llvm::jitprofiling;
+using namespace llvm::object;
 
 #define DEBUG_TYPE "oprofile-jit-event-listener"
 
 namespace {
 
 class OProfileJITEventListener : public JITEventListener {
-  OProfileWrapper& Wrapper;
+  std::unique_ptr<OProfileWrapper> Wrapper;
 
   void initialize();
+  std::map<const char*, OwningBinary<ObjectFile>> DebugObjects;
 
 public:
-  OProfileJITEventListener(OProfileWrapper& LibraryWrapper)
-  : Wrapper(LibraryWrapper) {
+  OProfileJITEventListener(std::unique_ptr<OProfileWrapper> LibraryWrapper)
+    : Wrapper(std::move(LibraryWrapper)) {
     initialize();
   }
 
   ~OProfileJITEventListener();
 
-  virtual void NotifyObjectEmitted(const ObjectImage &Obj);
+  void NotifyObjectEmitted(const ObjectFile &Obj,
+                           const RuntimeDyld::LoadedObjectInfo &L) override;
 
-  virtual void NotifyFreeingObject(const ObjectImage &Obj);
+  void NotifyFreeingObject(const ObjectFile &Obj) override;
 };
 
 void OProfileJITEventListener::initialize() {
-  if (!Wrapper.op_open_agent()) {
+  if (!Wrapper->op_open_agent()) {
     const std::string err_str = sys::StrError();
     DEBUG(dbgs() << "Failed to connect to OProfile agent: " << err_str << "\n");
   } else {
@@ -64,8 +65,8 @@ void OProfileJITEventListener::initialize() {
 }
 
 OProfileJITEventListener::~OProfileJITEventListener() {
-  if (Wrapper.isAgentAvailable()) {
-    if (Wrapper.op_close_agent() == -1) {
+  if (Wrapper->isAgentAvailable()) {
+    if (Wrapper->op_close_agent() == -1) {
       const std::string err_str = sys::StrError();
       DEBUG(dbgs() << "Failed to disconnect from OProfile agent: "
                    << err_str << "\n");
@@ -75,17 +76,22 @@ OProfileJITEventListener::~OProfileJITEventListener() {
   }
 }
 
-void OProfileJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
-  if (!Wrapper.isAgentAvailable()) {
+void OProfileJITEventListener::NotifyObjectEmitted(
+                                       const ObjectFile &Obj,
+                                       const RuntimeDyld::LoadedObjectInfo &L) {
+  if (!Wrapper->isAgentAvailable()) {
     return;
   }
 
+  OwningBinary<ObjectFile> DebugObjOwner = L.getObjectForDebug(Obj);
+  const ObjectFile &DebugObj = *DebugObjOwner.getBinary();
+
   // Use symbol info to iterate functions in the object.
-  for (object::symbol_iterator I = Obj.begin_symbols(), E = Obj.end_symbols();
+  for (symbol_iterator I = DebugObj.symbol_begin(), E = DebugObj.symbol_end();
        I != E; ++I) {
-    object::SymbolRef::Type SymType;
+    SymbolRef::Type SymType;
     if (I->getType(SymType)) continue;
-    if (SymType == object::SymbolRef::ST_Function) {
+    if (SymType == SymbolRef::ST_Function) {
       StringRef  Name;
       uint64_t   Addr;
       uint64_t   Size;
@@ -93,7 +99,7 @@ void OProfileJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
       if (I->getAddress(Addr)) continue;
       if (I->getSize(Size)) continue;
 
-      if (Wrapper.op_write_native_code(Name.data(), Addr, (void*)Addr, Size)
+      if (Wrapper->op_write_native_code(Name.data(), Addr, (void*)Addr, Size)
                         == -1) {
         DEBUG(dbgs() << "Failed to tell OProfile about native function "
           << Name << " at ["
@@ -103,45 +109,48 @@ void OProfileJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
       // TODO: support line number info (similar to IntelJITEventListener.cpp)
     }
   }
-}
 
-void OProfileJITEventListener::NotifyFreeingObject(const ObjectImage &Obj) {
-  if (!Wrapper.isAgentAvailable()) {
-    return;
-  }
-
-  // Use symbol info to iterate functions in the object.
-  for (object::symbol_iterator I = Obj.begin_symbols(), E = Obj.end_symbols();
-       I != E; ++I) {
-    object::SymbolRef::Type SymType;
-    if (I->getType(SymType)) continue;
-    if (SymType == object::SymbolRef::ST_Function) {
-      uint64_t   Addr;
-      if (I->getAddress(Addr)) continue;
+  DebugObjects[Obj.getData().data()] = std::move(DebugObjOwner);
+}
 
-      if (Wrapper.op_unload_native_code(Addr) == -1) {
-        DEBUG(dbgs()
-          << "Failed to tell OProfile about unload of native function at "
-          << (void*)Addr << "\n");
-        continue;
+void OProfileJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
+  if (Wrapper->isAgentAvailable()) {
+
+    // If there was no agent registered when the original object was loaded then
+    // we won't have created a debug object for it, so bail out.
+    if (DebugObjects.find(Obj.getData().data()) == DebugObjects.end())
+      return;
+
+    const ObjectFile &DebugObj = *DebugObjects[Obj.getData().data()].getBinary();
+
+    // Use symbol info to iterate functions in the object.
+    for (symbol_iterator I = DebugObj.symbol_begin(),
+                         E = DebugObj.symbol_end();
+         I != E; ++I) {
+      SymbolRef::Type SymType;
+      if (I->getType(SymType)) continue;
+      if (SymType == SymbolRef::ST_Function) {
+        uint64_t   Addr;
+        if (I->getAddress(Addr)) continue;
+
+        if (Wrapper->op_unload_native_code(Addr) == -1) {
+          DEBUG(dbgs()
+                << "Failed to tell OProfile about unload of native function at "
+                << (void*)Addr << "\n");
+          continue;
+        }
       }
     }
   }
+
+  DebugObjects.erase(Obj.getData().data());
 }
 
 }  // anonymous namespace.
 
 namespace llvm {
 JITEventListener *JITEventListener::createOProfileJITEventListener() {
-  static std::unique_ptr<OProfileWrapper> JITProfilingWrapper(
-      new OProfileWrapper);
-  return new OProfileJITEventListener(*JITProfilingWrapper);
-}
-
-// for testing
-JITEventListener *JITEventListener::createOProfileJITEventListener(
-                                      OProfileWrapper* TestImpl) {
-  return new OProfileJITEventListener(*TestImpl);
+  return new OProfileJITEventListener(llvm::make_unique<OProfileWrapper>());
 }
 
 } // namespace llvm
diff --git a/lib/ExecutionEngine/Orc/Android.mk b/lib/ExecutionEngine/Orc/Android.mk
new file mode 100644
index 0000000..61c1daf
--- /dev/null
+++ b/lib/ExecutionEngine/Orc/Android.mk
@@ -0,0 +1,18 @@
+LOCAL_PATH:= $(call my-dir)
+
+# For the host
+# =====================================================
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+  CloneSubModule.cpp \
+  IndirectionUtils.cpp \
+  OrcMCJITReplacement.cpp \
+  OrcTargetSupport.cpp
+
+LOCAL_MODULE:= libLLVMOrcJIT
+
+LOCAL_MODULE_TAGS := optional
+
+include $(LLVM_HOST_BUILD_MK)
+include $(BUILD_HOST_STATIC_LIBRARY)
diff --git a/lib/ExecutionEngine/Orc/CMakeLists.txt b/lib/ExecutionEngine/Orc/CMakeLists.txt
new file mode 100644
index 0000000..b0a8445
--- /dev/null
+++ b/lib/ExecutionEngine/Orc/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_llvm_library(LLVMOrcJIT
+  CloneSubModule.cpp
+  IndirectionUtils.cpp
+  OrcMCJITReplacement.cpp
+  OrcTargetSupport.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/ExecutionEngine/Orc
+  )
diff --git a/lib/ExecutionEngine/Orc/CloneSubModule.cpp b/lib/ExecutionEngine/Orc/CloneSubModule.cpp
new file mode 100644
index 0000000..a3196ad
--- /dev/null
+++ b/lib/ExecutionEngine/Orc/CloneSubModule.cpp
@@ -0,0 +1,108 @@
+#include "llvm/ExecutionEngine/Orc/CloneSubModule.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+namespace llvm {
+namespace orc {
+
+void copyGVInitializer(GlobalVariable &New, const GlobalVariable &Orig,
+                             ValueToValueMapTy &VMap) {
+  if (Orig.hasInitializer())
+    New.setInitializer(MapValue(Orig.getInitializer(), VMap));
+}
+
+void copyFunctionBody(Function &New, const Function &Orig,
+                            ValueToValueMapTy &VMap) {
+  if (!Orig.isDeclaration()) {
+    Function::arg_iterator DestI = New.arg_begin();
+    for (Function::const_arg_iterator J = Orig.arg_begin(); J != Orig.arg_end();
+         ++J) {
+      DestI->setName(J->getName());
+      VMap[J] = DestI++;
+    }
+
+    SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned.
+    CloneFunctionInto(&New, &Orig, VMap, /*ModuleLevelChanges=*/true, Returns);
+  }
+}
+
+void CloneSubModule(llvm::Module &Dst, const Module &Src,
+                    HandleGlobalVariableFtor HandleGlobalVariable,
+                    HandleFunctionFtor HandleFunction, bool CloneInlineAsm) {
+
+  ValueToValueMapTy VMap;
+
+  if (CloneInlineAsm)
+    Dst.appendModuleInlineAsm(Src.getModuleInlineAsm());
+
+  // Copy global variables (but not initializers, yet).
+  for (Module::const_global_iterator I = Src.global_begin(), E = Src.global_end();
+       I != E; ++I) {
+    GlobalVariable *GV = new GlobalVariable(
+        Dst, I->getType()->getElementType(), I->isConstant(), I->getLinkage(),
+        (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr,
+        I->getThreadLocalMode(), I->getType()->getAddressSpace());
+    GV->copyAttributesFrom(I);
+    VMap[I] = GV;
+  }
+
+  // Loop over the functions in the module, making external functions as before
+  for (Module::const_iterator I = Src.begin(), E = Src.end(); I != E; ++I) {
+    Function *NF =
+        Function::Create(cast<FunctionType>(I->getType()->getElementType()),
+                         I->getLinkage(), I->getName(), &Dst);
+    NF->copyAttributesFrom(I);
+    VMap[I] = NF;
+  }
+
+  // Loop over the aliases in the module
+  for (Module::const_alias_iterator I = Src.alias_begin(), E = Src.alias_end();
+       I != E; ++I) {
+    auto *PTy = cast<PointerType>(I->getType());
+    auto *GA =
+        GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
+                            I->getLinkage(), I->getName(), &Dst);
+    GA->copyAttributesFrom(I);
+    VMap[I] = GA;
+  }
+
+  // Now that all of the things that global variable initializer can refer to
+  // have been created, loop through and copy the global variable referrers
+  // over...  We also set the attributes on the global now.
+  for (Module::const_global_iterator I = Src.global_begin(), E = Src.global_end();
+       I != E; ++I) {
+    GlobalVariable &GV = *cast<GlobalVariable>(VMap[I]);
+    HandleGlobalVariable(GV, *I, VMap);
+  }
+
+  // Similarly, copy over function bodies now...
+  //
+  for (Module::const_iterator I = Src.begin(), E = Src.end(); I != E; ++I) {
+    Function &F = *cast<Function>(VMap[I]);
+    HandleFunction(F, *I, VMap);
+  }
+
+  // And aliases
+  for (Module::const_alias_iterator I = Src.alias_begin(), E = Src.alias_end();
+       I != E; ++I) {
+    GlobalAlias *GA = cast<GlobalAlias>(VMap[I]);
+    if (const Constant *C = I->getAliasee())
+      GA->setAliasee(MapValue(C, VMap));
+  }
+
+  // And named metadata....
+  for (Module::const_named_metadata_iterator I = Src.named_metadata_begin(),
+                                             E = Src.named_metadata_end();
+       I != E; ++I) {
+    const NamedMDNode &NMD = *I;
+    NamedMDNode *NewNMD = Dst.getOrInsertNamedMetadata(NMD.getName());
+    for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
+      NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap));
+  }
+
+}
+
+} // End namespace orc.
+} // End namespace llvm.
diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
new file mode 100644
index 0000000..61c947f
--- /dev/null
+++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -0,0 +1,118 @@
+//===---- IndirectionUtils.cpp - Utilities for call indirection in Orc ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/Orc/CloneSubModule.h"
+#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/IRBuilder.h"
+#include <set>
+
+namespace llvm {
+namespace orc {
+
+GlobalVariable* createImplPointer(Function &F, const Twine &Name,
+                                  Constant *Initializer) {
+  assert(F.getParent() && "Function isn't in a module.");
+  if (!Initializer)
+    Initializer = Constant::getNullValue(F.getType());
+  Module &M = *F.getParent();
+  return new GlobalVariable(M, F.getType(), false, GlobalValue::ExternalLinkage,
+                            Initializer, Name, nullptr,
+                            GlobalValue::NotThreadLocal, 0, true);
+}
+
+void makeStub(Function &F, GlobalVariable &ImplPointer) {
+  assert(F.isDeclaration() && "Can't turn a definition into a stub.");
+  assert(F.getParent() && "Function isn't in a module.");
+  Module &M = *F.getParent();
+  BasicBlock *EntryBlock = BasicBlock::Create(M.getContext(), "entry", &F);
+  IRBuilder<> Builder(EntryBlock);
+  LoadInst *ImplAddr = Builder.CreateLoad(&ImplPointer);
+  std::vector<Value*> CallArgs;
+  for (auto &A : F.args())
+    CallArgs.push_back(&A);
+  CallInst *Call = Builder.CreateCall(ImplAddr, CallArgs);
+  Call->setTailCall();
+  Builder.CreateRet(Call);
+}
+
+void partition(Module &M, const ModulePartitionMap &PMap) {
+
+  for (auto &KVPair : PMap) {
+
+    auto ExtractGlobalVars =
+      [&](GlobalVariable &New, const GlobalVariable &Orig,
+          ValueToValueMapTy &VMap) {
+        if (KVPair.second.count(&Orig)) {
+          copyGVInitializer(New, Orig, VMap);
+        }
+        if (New.getLinkage() == GlobalValue::PrivateLinkage) {
+          New.setLinkage(GlobalValue::ExternalLinkage);
+          New.setVisibility(GlobalValue::HiddenVisibility);
+        }
+      };
+
+    auto ExtractFunctions =
+      [&](Function &New, const Function &Orig, ValueToValueMapTy &VMap) {
+        if (KVPair.second.count(&Orig))
+          copyFunctionBody(New, Orig, VMap);
+        if (New.getLinkage() == GlobalValue::InternalLinkage) {
+          New.setLinkage(GlobalValue::ExternalLinkage);
+          New.setVisibility(GlobalValue::HiddenVisibility);
+        }
+      };
+
+    CloneSubModule(*KVPair.first, M, ExtractGlobalVars, ExtractFunctions,
+                   false);
+  }
+}
+
+FullyPartitionedModule fullyPartition(Module &M) {
+  FullyPartitionedModule MP;
+
+  ModulePartitionMap PMap;
+
+  for (auto &F : M) {
+
+    if (F.isDeclaration())
+      continue;
+
+    std::string NewModuleName = (M.getName() + "." + F.getName()).str();
+    MP.Functions.push_back(
+      llvm::make_unique<Module>(NewModuleName, M.getContext()));
+    MP.Functions.back()->setDataLayout(M.getDataLayout());
+    PMap[MP.Functions.back().get()].insert(&F);
+  }
+
+  MP.GlobalVars =
+    llvm::make_unique<Module>((M.getName() + ".globals_and_stubs").str(),
+                              M.getContext());
+  MP.GlobalVars->setDataLayout(M.getDataLayout());
+
+  MP.Commons =
+    llvm::make_unique<Module>((M.getName() + ".commons").str(), M.getContext());
+  MP.Commons->setDataLayout(M.getDataLayout());
+
+  // Make sure there's at least an empty set for the stubs map or we'll fail
+  // to clone anything for it (including the decls).
+  PMap[MP.GlobalVars.get()] = ModulePartitionMap::mapped_type();
+  for (auto &GV : M.globals())
+    if (GV.getLinkage() == GlobalValue::CommonLinkage)
+      PMap[MP.Commons.get()].insert(&GV);
+    else
+      PMap[MP.GlobalVars.get()].insert(&GV);
+
+  partition(M, PMap);
+
+  return MP;
+}
+
+} // End namespace orc.
+} // End namespace llvm.
diff --git a/lib/ExecutionEngine/Orc/LLVMBuild.txt b/lib/ExecutionEngine/Orc/LLVMBuild.txt
new file mode 100644
index 0000000..8f05172
--- /dev/null
+++ b/lib/ExecutionEngine/Orc/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/ExecutionEngine/MCJIT/LLVMBuild.txt ----------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = OrcJIT
+parent = ExecutionEngine
+required_libraries = Core ExecutionEngine Object RuntimeDyld Support TransformUtils
diff --git a/lib/ExecutionEngine/Orc/Makefile b/lib/ExecutionEngine/Orc/Makefile
new file mode 100644
index 0000000..ac30234
--- /dev/null
+++ b/lib/ExecutionEngine/Orc/Makefile
@@ -0,0 +1,13 @@
+##===- lib/ExecutionEngine/OrcJIT/Makefile -----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMOrcJIT
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
new file mode 100644
index 0000000..48fd31e
--- /dev/null
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
@@ -0,0 +1,128 @@
+//===-------- OrcMCJITReplacement.cpp - Orc-based MCJIT replacement -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "OrcMCJITReplacement.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+
+namespace {
+
+static struct RegisterJIT {
+  RegisterJIT() { llvm::orc::OrcMCJITReplacement::Register(); }
+} JITRegistrator;
+
+}
+
+extern "C" void LLVMLinkInOrcMCJITReplacement() {}
+
+namespace llvm {
+namespace orc {
+
+GenericValue
+OrcMCJITReplacement::runFunction(Function *F,
+                                 const std::vector<GenericValue> &ArgValues) {
+  assert(F && "Function *F was null at entry to run()");
+
+  void *FPtr = getPointerToFunction(F);
+  assert(FPtr && "Pointer to fn's code was null after getPointerToFunction");
+  FunctionType *FTy = F->getFunctionType();
+  Type *RetTy = FTy->getReturnType();
+
+  assert((FTy->getNumParams() == ArgValues.size() ||
+          (FTy->isVarArg() && FTy->getNumParams() <= ArgValues.size())) &&
+         "Wrong number of arguments passed into function!");
+  assert(FTy->getNumParams() == ArgValues.size() &&
+         "This doesn't support passing arguments through varargs (yet)!");
+
+  // Handle some common cases first.  These cases correspond to common `main'
+  // prototypes.
+  if (RetTy->isIntegerTy(32) || RetTy->isVoidTy()) {
+    switch (ArgValues.size()) {
+    case 3:
+      if (FTy->getParamType(0)->isIntegerTy(32) &&
+          FTy->getParamType(1)->isPointerTy() &&
+          FTy->getParamType(2)->isPointerTy()) {
+        int (*PF)(int, char **, const char **) =
+            (int (*)(int, char **, const char **))(intptr_t)FPtr;
+
+        // Call the function.
+        GenericValue rv;
+        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue(),
+                                 (char **)GVTOP(ArgValues[1]),
+                                 (const char **)GVTOP(ArgValues[2])));
+        return rv;
+      }
+      break;
+    case 2:
+      if (FTy->getParamType(0)->isIntegerTy(32) &&
+          FTy->getParamType(1)->isPointerTy()) {
+        int (*PF)(int, char **) = (int (*)(int, char **))(intptr_t)FPtr;
+
+        // Call the function.
+        GenericValue rv;
+        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue(),
+                                 (char **)GVTOP(ArgValues[1])));
+        return rv;
+      }
+      break;
+    case 1:
+      if (FTy->getNumParams() == 1 && FTy->getParamType(0)->isIntegerTy(32)) {
+        GenericValue rv;
+        int (*PF)(int) = (int (*)(int))(intptr_t)FPtr;
+        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue()));
+        return rv;
+      }
+      break;
+    }
+  }
+
+  // Handle cases where no arguments are passed first.
+  if (ArgValues.empty()) {
+    GenericValue rv;
+    switch (RetTy->getTypeID()) {
+    default:
+      llvm_unreachable("Unknown return type for function call!");
+    case Type::IntegerTyID: {
+      unsigned BitWidth = cast<IntegerType>(RetTy)->getBitWidth();
+      if (BitWidth == 1)
+        rv.IntVal = APInt(BitWidth, ((bool (*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 8)
+        rv.IntVal = APInt(BitWidth, ((char (*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 16)
+        rv.IntVal = APInt(BitWidth, ((short (*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 32)
+        rv.IntVal = APInt(BitWidth, ((int (*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 64)
+        rv.IntVal = APInt(BitWidth, ((int64_t (*)())(intptr_t)FPtr)());
+      else
+        llvm_unreachable("Integer types > 64 bits not supported");
+      return rv;
+    }
+    case Type::VoidTyID:
+      rv.IntVal = APInt(32, ((int (*)())(intptr_t)FPtr)());
+      return rv;
+    case Type::FloatTyID:
+      rv.FloatVal = ((float (*)())(intptr_t)FPtr)();
+      return rv;
+    case Type::DoubleTyID:
+      rv.DoubleVal = ((double (*)())(intptr_t)FPtr)();
+      return rv;
+    case Type::X86_FP80TyID:
+    case Type::FP128TyID:
+    case Type::PPC_FP128TyID:
+      llvm_unreachable("long double not supported yet");
+    case Type::PointerTyID:
+      return PTOGV(((void *(*)())(intptr_t)FPtr)());
+    }
+  }
+
+  llvm_unreachable("Full-featured argument passing not supported yet!");
+}
+
+} // End namespace orc.
+} // End namespace llvm.
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
new file mode 100644
index 0000000..1b7b161
--- /dev/null
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -0,0 +1,332 @@
+//===---- OrcMCJITReplacement.h - Orc based MCJIT replacement ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Orc based MCJIT replacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_EXECUTIONENGINE_ORC_ORCMCJITREPLACEMENT_H
+#define LLVM_LIB_EXECUTIONENGINE_ORC_ORCMCJITREPLACEMENT_H
+
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"
+#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/Object/Archive.h"
+
+namespace llvm {
+namespace orc {
+
+class OrcMCJITReplacement : public ExecutionEngine {
+
+  class ForwardingRTDyldMM : public RTDyldMemoryManager {
+  public:
+    ForwardingRTDyldMM(OrcMCJITReplacement &M) : M(M) {}
+
+    uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+                                 unsigned SectionID,
+                                 StringRef SectionName) override {
+      uint8_t *Addr =
+          M.MM->allocateCodeSection(Size, Alignment, SectionID, SectionName);
+      M.SectionsAllocatedSinceLastLoad.insert(Addr);
+      return Addr;
+    }
+
+    uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+                                 unsigned SectionID, StringRef SectionName,
+                                 bool IsReadOnly) override {
+      uint8_t *Addr = M.MM->allocateDataSection(Size, Alignment, SectionID,
+                                                SectionName, IsReadOnly);
+      M.SectionsAllocatedSinceLastLoad.insert(Addr);
+      return Addr;
+    }
+
+    void reserveAllocationSpace(uintptr_t CodeSize, uintptr_t DataSizeRO,
+                                uintptr_t DataSizeRW) override {
+      return M.MM->reserveAllocationSpace(CodeSize, DataSizeRO, DataSizeRW);
+    }
+
+    bool needsToReserveAllocationSpace() override {
+      return M.MM->needsToReserveAllocationSpace();
+    }
+
+    void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
+                          size_t Size) override {
+      return M.MM->registerEHFrames(Addr, LoadAddr, Size);
+    }
+
+    void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr,
+                            size_t Size) override {
+      return M.MM->deregisterEHFrames(Addr, LoadAddr, Size);
+    }
+
+    uint64_t getSymbolAddress(const std::string &Name) override {
+      return M.getSymbolAddressWithoutMangling(Name);
+    }
+
+    void *getPointerToNamedFunction(const std::string &Name,
+                                    bool AbortOnFailure = true) override {
+      return M.MM->getPointerToNamedFunction(Name, AbortOnFailure);
+    }
+
+    void notifyObjectLoaded(ExecutionEngine *EE,
+                            const object::ObjectFile &O) override {
+      return M.MM->notifyObjectLoaded(EE, O);
+    }
+
+    bool finalizeMemory(std::string *ErrMsg = nullptr) override {
+      // Each set of objects loaded will be finalized exactly once, but since
+      // symbol lookup during relocation may recursively trigger the
+      // loading/relocation of other modules, and since we're forwarding all
+      // finalizeMemory calls to a single underlying memory manager, we need to
+      // defer forwarding the call on until all necessary objects have been
+      // loaded. Otherwise, during the relocation of a leaf object, we will end
+      // up finalizing memory, causing a crash further up the stack when we
+      // attempt to apply relocations to finalized memory.
+      // To avoid finalizing too early, look at how many objects have been
+      // loaded but not yet finalized. This is a bit of a hack that relies on
+      // the fact that we're lazily emitting object files: The only way you can
+      // get more than one set of objects loaded but not yet finalized is if
+      // they were loaded during relocation of another set.
+      if (M.UnfinalizedSections.size() == 1)
+        return M.MM->finalizeMemory(ErrMsg);
+      return false;
+    }
+
+  private:
+    OrcMCJITReplacement &M;
+  };
+
+private:
+
+  static ExecutionEngine *
+  createOrcMCJITReplacement(std::string *ErrorMsg,
+                            std::unique_ptr<RTDyldMemoryManager> OrcJMM,
+                            std::unique_ptr<TargetMachine> TM) {
+    return new OrcMCJITReplacement(std::move(OrcJMM), std::move(TM));
+  }
+
+public:
+  static void Register() {
+    OrcMCJITReplacementCtor = createOrcMCJITReplacement;
+  }
+
+  OrcMCJITReplacement(std::unique_ptr<RTDyldMemoryManager> MM,
+                      std::unique_ptr<TargetMachine> TM)
+      : TM(std::move(TM)), MM(std::move(MM)), Mang(this->TM->getDataLayout()),
+        NotifyObjectLoaded(*this), NotifyFinalized(*this),
+        ObjectLayer(ObjectLayerT::CreateRTDyldMMFtor(), NotifyObjectLoaded,
+                    NotifyFinalized),
+        CompileLayer(ObjectLayer, SimpleCompiler(*this->TM)),
+        LazyEmitLayer(CompileLayer) {
+    setDataLayout(this->TM->getDataLayout());
+  }
+
+  void addModule(std::unique_ptr<Module> M) override {
+
+    // If this module doesn't have a DataLayout attached then attach the
+    // default.
+    if (!M->getDataLayout())
+      M->setDataLayout(getDataLayout());
+
+    Modules.push_back(std::move(M));
+    std::vector<Module *> Ms;
+    Ms.push_back(&*Modules.back());
+    LazyEmitLayer.addModuleSet(std::move(Ms),
+                               llvm::make_unique<ForwardingRTDyldMM>(*this));
+  }
+
+  void addObjectFile(std::unique_ptr<object::ObjectFile> O) override {
+    std::vector<std::unique_ptr<object::ObjectFile>> Objs;
+    Objs.push_back(std::move(O));
+    ObjectLayer.addObjectSet(std::move(Objs),
+                             llvm::make_unique<ForwardingRTDyldMM>(*this));
+  }
+
+  void addObjectFile(object::OwningBinary<object::ObjectFile> O) override {
+    std::unique_ptr<object::ObjectFile> Obj;
+    std::unique_ptr<MemoryBuffer> Buf;
+    std::tie(Obj, Buf) = O.takeBinary();
+    std::vector<std::unique_ptr<object::ObjectFile>> Objs;
+    Objs.push_back(std::move(Obj));
+    auto H =
+      ObjectLayer.addObjectSet(std::move(Objs),
+                               llvm::make_unique<ForwardingRTDyldMM>(*this));
+
+    std::vector<std::unique_ptr<MemoryBuffer>> Bufs;
+    Bufs.push_back(std::move(Buf));
+    ObjectLayer.takeOwnershipOfBuffers(H, std::move(Bufs));
+  }
+
+  void addArchive(object::OwningBinary<object::Archive> A) override {
+    Archives.push_back(std::move(A));
+  }
+
+  uint64_t getSymbolAddress(StringRef Name) {
+    return getSymbolAddressWithoutMangling(Mangle(Name));
+  }
+
+  void finalizeObject() override {
+    // This is deprecated - Aim to remove in ExecutionEngine.
+    // REMOVE IF POSSIBLE - Doesn't make sense for New JIT.
+  }
+
+  void mapSectionAddress(const void *LocalAddress,
+                         uint64_t TargetAddress) override {
+    for (auto &P : UnfinalizedSections)
+      if (P.second.count(LocalAddress))
+        ObjectLayer.mapSectionAddress(P.first, LocalAddress, TargetAddress);
+  }
+
+  uint64_t getGlobalValueAddress(const std::string &Name) override {
+    return getSymbolAddress(Name);
+  }
+
+  uint64_t getFunctionAddress(const std::string &Name) override {
+    return getSymbolAddress(Name);
+  }
+
+  void *getPointerToFunction(Function *F) override {
+    uint64_t FAddr = getSymbolAddress(F->getName());
+    return reinterpret_cast<void *>(static_cast<uintptr_t>(FAddr));
+  }
+
+  void *getPointerToNamedFunction(StringRef Name,
+                                  bool AbortOnFailure = true) override {
+    uint64_t Addr = getSymbolAddress(Name);
+    if (!Addr && AbortOnFailure)
+      llvm_unreachable("Missing symbol!");
+    return reinterpret_cast<void *>(static_cast<uintptr_t>(Addr));
+  }
+
+  GenericValue runFunction(Function *F,
+                           const std::vector<GenericValue> &ArgValues) override;
+
+  void setObjectCache(ObjectCache *NewCache) override {
+    CompileLayer.setObjectCache(NewCache);
+  }
+
+private:
+  uint64_t getSymbolAddressWithoutMangling(StringRef Name) {
+    if (uint64_t Addr = LazyEmitLayer.findSymbol(Name, false).getAddress())
+      return Addr;
+    if (uint64_t Addr = MM->getSymbolAddress(Name))
+      return Addr;
+    if (uint64_t Addr = scanArchives(Name))
+      return Addr;
+
+    return 0;
+  }
+
+  uint64_t scanArchives(StringRef Name) {
+    for (object::OwningBinary<object::Archive> &OB : Archives) {
+      object::Archive *A = OB.getBinary();
+      // Look for our symbols in each Archive
+      object::Archive::child_iterator ChildIt = A->findSym(Name);
+      if (ChildIt != A->child_end()) {
+        // FIXME: Support nested archives?
+        ErrorOr<std::unique_ptr<object::Binary>> ChildBinOrErr =
+            ChildIt->getAsBinary();
+        if (ChildBinOrErr.getError())
+          continue;
+        std::unique_ptr<object::Binary> &ChildBin = ChildBinOrErr.get();
+        if (ChildBin->isObject()) {
+          std::vector<std::unique_ptr<object::ObjectFile>> ObjSet;
+          ObjSet.push_back(std::unique_ptr<object::ObjectFile>(
+              static_cast<object::ObjectFile *>(ChildBin.release())));
+          ObjectLayer.addObjectSet(
+              std::move(ObjSet), llvm::make_unique<ForwardingRTDyldMM>(*this));
+          if (uint64_t Addr = ObjectLayer.findSymbol(Name, true).getAddress())
+            return Addr;
+        }
+      }
+    }
+    return 0;
+  }
+
+  class NotifyObjectLoadedT {
+  public:
+    typedef std::vector<std::unique_ptr<object::ObjectFile>> ObjListT;
+    typedef std::vector<std::unique_ptr<RuntimeDyld::LoadedObjectInfo>>
+        LoadedObjInfoListT;
+
+    NotifyObjectLoadedT(OrcMCJITReplacement &M) : M(M) {}
+
+    void operator()(ObjectLinkingLayerBase::ObjSetHandleT H,
+                    const ObjListT &Objects,
+                    const LoadedObjInfoListT &Infos) const {
+      M.UnfinalizedSections[H] = std::move(M.SectionsAllocatedSinceLastLoad);
+      M.SectionsAllocatedSinceLastLoad = SectionAddrSet();
+      assert(Objects.size() == Infos.size() &&
+             "Incorrect number of Infos for Objects.");
+      for (unsigned I = 0; I < Objects.size(); ++I)
+        M.MM->notifyObjectLoaded(&M, *Objects[I]);
+    };
+
+  private:
+    OrcMCJITReplacement &M;
+  };
+
+  class NotifyFinalizedT {
+  public:
+    NotifyFinalizedT(OrcMCJITReplacement &M) : M(M) {}
+    void operator()(ObjectLinkingLayerBase::ObjSetHandleT H) {
+      M.UnfinalizedSections.erase(H);
+    }
+
+  private:
+    OrcMCJITReplacement &M;
+  };
+
+  std::string Mangle(StringRef Name) {
+    std::string MangledName;
+    {
+      raw_string_ostream MangledNameStream(MangledName);
+      Mang.getNameWithPrefix(MangledNameStream, Name);
+    }
+    return MangledName;
+  }
+
+  typedef ObjectLinkingLayer<NotifyObjectLoadedT> ObjectLayerT;
+  typedef IRCompileLayer<ObjectLayerT> CompileLayerT;
+  typedef LazyEmittingLayer<CompileLayerT> LazyEmitLayerT;
+
+  std::unique_ptr<TargetMachine> TM;
+  std::unique_ptr<RTDyldMemoryManager> MM;
+  Mangler Mang;
+
+  NotifyObjectLoadedT NotifyObjectLoaded;
+  NotifyFinalizedT NotifyFinalized;
+
+  ObjectLayerT ObjectLayer;
+  CompileLayerT CompileLayer;
+  LazyEmitLayerT LazyEmitLayer;
+
+  // We need to store ObjLayerT::ObjSetHandles for each of the object sets
+  // that have been emitted but not yet finalized so that we can forward the
+  // mapSectionAddress calls appropriately.
+  typedef std::set<const void *> SectionAddrSet;
+  struct ObjSetHandleCompare {
+    bool operator()(ObjectLayerT::ObjSetHandleT H1,
+                    ObjectLayerT::ObjSetHandleT H2) const {
+      return &*H1 < &*H2;
+    }
+  };
+  SectionAddrSet SectionsAllocatedSinceLastLoad;
+  std::map<ObjectLayerT::ObjSetHandleT, SectionAddrSet, ObjSetHandleCompare>
+      UnfinalizedSections;
+
+  std::vector<object::OwningBinary<object::Archive>> Archives;
+};
+
+} // End namespace orc.
+} // End namespace llvm.
+
+#endif // LLVM_LIB_EXECUTIONENGINE_ORC_MCJITREPLACEMENT_H
diff --git a/lib/ExecutionEngine/Orc/OrcTargetSupport.cpp b/lib/ExecutionEngine/Orc/OrcTargetSupport.cpp
new file mode 100644
index 0000000..b5dda8e
--- /dev/null
+++ b/lib/ExecutionEngine/Orc/OrcTargetSupport.cpp
@@ -0,0 +1,128 @@
+#include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/Orc/OrcTargetSupport.h"
+#include <array>
+
+using namespace llvm::orc;
+
+namespace {
+
+std::array<const char *, 12> X86GPRsToSave = {{
+    "rbp", "rbx", "r12", "r13", "r14", "r15", // Callee saved.
+    "rdi", "rsi", "rdx", "rcx", "r8", "r9",   // Int args.
+}};
+
+std::array<const char *, 8> X86XMMsToSave = {{
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" // FP args
+}};
+
+template <typename OStream> unsigned saveX86Regs(OStream &OS) {
+  for (const auto &GPR : X86GPRsToSave)
+    OS << "  pushq   %" << GPR << "\n";
+
+  OS << "  subq    $" << (16 * X86XMMsToSave.size()) << ", %rsp\n";
+
+  for (unsigned i = 0; i < X86XMMsToSave.size(); ++i)
+    OS << "  movdqu  %" << X86XMMsToSave[i] << ", "
+       << (16 * (X86XMMsToSave.size() - i - 1)) << "(%rsp)\n";
+
+  return (8 * X86GPRsToSave.size()) + (16 * X86XMMsToSave.size());
+}
+
+template <typename OStream> void restoreX86Regs(OStream &OS) {
+  for (unsigned i = 0; i < X86XMMsToSave.size(); ++i)
+    OS << "  movdqu  " << (16 * i) << "(%rsp), %"
+       << X86XMMsToSave[(X86XMMsToSave.size() - i - 1)] << "\n";
+  OS << "  addq    $" << (16 * X86XMMsToSave.size()) << ", %rsp\n";
+
+  for (unsigned i = 0; i < X86GPRsToSave.size(); ++i)
+    OS << "  popq    %" << X86GPRsToSave[X86GPRsToSave.size() - i - 1] << "\n";
+}
+
+template <typename TargetT>
+uint64_t executeCompileCallback(JITCompileCallbackManagerBase<TargetT> *JCBM,
+                                TargetAddress CallbackID) {
+  return JCBM->executeCompileCallback(CallbackID);
+}
+
+}
+
+namespace llvm {
+namespace orc {
+
+const char* OrcX86_64::ResolverBlockName = "orc_resolver_block";
+
+void OrcX86_64::insertResolverBlock(
+    Module &M, JITCompileCallbackManagerBase<OrcX86_64> &JCBM) {
+  auto CallbackPtr = executeCompileCallback<OrcX86_64>;
+  uint64_t CallbackAddr =
+      static_cast<uint64_t>(reinterpret_cast<uintptr_t>(CallbackPtr));
+
+  std::ostringstream AsmStream;
+  Triple TT(M.getTargetTriple());
+
+  if (TT.getOS() == Triple::Darwin)
+    AsmStream << ".section __TEXT,__text,regular,pure_instructions\n"
+              << ".align 4, 0x90\n";
+  else
+    AsmStream << ".text\n"
+              << ".align 16, 0x90\n";
+
+  AsmStream << "jit_callback_manager_addr:\n"
+            << "  .quad " << &JCBM << "\n"
+            << ResolverBlockName << ":\n";
+
+  uint64_t ReturnAddrOffset = saveX86Regs(AsmStream);
+
+  // Compute index, load object address, and call JIT.
+  AsmStream << "  leaq    jit_callback_manager_addr(%rip), %rdi\n"
+            << "  movq    (%rdi), %rdi\n"
+            << "  movq    " << ReturnAddrOffset << "(%rsp), %rsi\n"
+            << "  movabsq $" << CallbackAddr << ", %rax\n"
+            << "  callq   *%rax\n"
+            << "  movq    %rax, " << ReturnAddrOffset << "(%rsp)\n";
+
+  restoreX86Regs(AsmStream);
+
+  AsmStream << "  retq\n";
+
+  M.appendModuleInlineAsm(AsmStream.str());
+}
+
+OrcX86_64::LabelNameFtor
+OrcX86_64::insertCompileCallbackTrampolines(Module &M,
+                                            TargetAddress ResolverBlockAddr,
+                                            unsigned NumCalls,
+                                            unsigned StartIndex) {
+  const char *ResolverBlockPtrName = "Lorc_resolve_block_addr";
+
+  std::ostringstream AsmStream;
+  Triple TT(M.getTargetTriple());
+
+  if (TT.getOS() == Triple::Darwin)
+    AsmStream << ".section __TEXT,__text,regular,pure_instructions\n"
+              << ".align 4, 0x90\n";
+  else
+    AsmStream << ".text\n"
+              << ".align 16, 0x90\n";
+
+  AsmStream << ResolverBlockPtrName << ":\n"
+            << "  .quad " << ResolverBlockAddr << "\n";
+
+  auto GetLabelName =
+    [=](unsigned I) {
+      std::ostringstream LabelStream;
+      LabelStream << "orc_jcc_" << (StartIndex + I);
+      return LabelStream.str();
+  };
+
+  for (unsigned I = 0; I < NumCalls; ++I)
+    AsmStream << GetLabelName(I) << ":\n"
+              << "  callq *" << ResolverBlockPtrName << "(%rip)\n";
+
+  M.appendModuleInlineAsm(AsmStream.str());
+
+  return GetLabelName;
+}
+
+} // End namespace orc.
+} // End namespace llvm.
diff --git a/lib/ExecutionEngine/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RTDyldMemoryManager.cpp
deleted file mode 100644
index 51b2d0f..0000000
--- a/lib/ExecutionEngine/RTDyldMemoryManager.cpp
+++ /dev/null
@@ -1,282 +0,0 @@
-//===-- RTDyldMemoryManager.cpp - Memory manager for MC-JIT -----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Implementation of the runtime dynamic memory manager base class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Config/config.h"
-#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cstdlib>
-
-#ifdef __linux__
-  // These includes used by RTDyldMemoryManager::getPointerToNamedFunction()
-  // for Glibc trickery. See comments in this function for more information.
-  #ifdef HAVE_SYS_STAT_H
-    #include <sys/stat.h>
-  #endif
-  #include <fcntl.h>
-  #include <unistd.h>
-#endif
-
-namespace llvm {
-
-RTDyldMemoryManager::~RTDyldMemoryManager() {}
-
-// Determine whether we can register EH tables.
-#if (defined(__GNUC__) && !defined(__ARM_EABI__) && !defined(__ia64__) && \
-     !defined(__SEH__) && !defined(__USING_SJLJ_EXCEPTIONS__))
-#define HAVE_EHTABLE_SUPPORT 1
-#else
-#define HAVE_EHTABLE_SUPPORT 0
-#endif
-
-#if HAVE_EHTABLE_SUPPORT
-extern "C" void __register_frame(void*);
-extern "C" void __deregister_frame(void*);
-#else
-// The building compiler does not have __(de)register_frame but
-// it may be found at runtime in a dynamically-loaded library.
-// For example, this happens when building LLVM with Visual C++
-// but using the MingW runtime.
-void __register_frame(void *p) {
-  static bool Searched = false;
-  static void *rf = 0;
-
-  if (!Searched) {
-    Searched = true;
-    rf = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(
-                                                      "__register_frame");
-  }
-  if (rf)
-    ((void (*)(void *))rf)(p);
-}
-
-void __deregister_frame(void *p) {
-  static bool Searched = false;
-  static void *df = 0;
-
-  if (!Searched) {
-    Searched = true;
-    df = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(
-                                                      "__deregister_frame");
-  }
-  if (df)
-    ((void (*)(void *))df)(p);
-}
-#endif
-
-#ifdef __APPLE__
-
-static const char *processFDE(const char *Entry, bool isDeregister) {
-  const char *P = Entry;
-  uint32_t Length = *((const uint32_t *)P);
-  P += 4;
-  uint32_t Offset = *((const uint32_t *)P);
-  if (Offset != 0) {
-    if (isDeregister)
-      __deregister_frame(const_cast<char *>(Entry));
-    else
-      __register_frame(const_cast<char *>(Entry));
-  }
-  return P + Length;
-}
-
-// This implementation handles frame registration for local targets.
-// Memory managers for remote targets should re-implement this function
-// and use the LoadAddr parameter.
-void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
-                                           uint64_t LoadAddr,
-                                           size_t Size) {
-  // On OS X OS X __register_frame takes a single FDE as an argument.
-  // See http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-April/061768.html
-  const char *P = (const char *)Addr;
-  const char *End = P + Size;
-  do  {
-    P = processFDE(P, false);
-  } while(P != End);
-}
-
-void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
-                                           uint64_t LoadAddr,
-                                           size_t Size) {
-  const char *P = (const char *)Addr;
-  const char *End = P + Size;
-  do  {
-    P = processFDE(P, true);
-  } while(P != End);
-}
-
-#else
-
-void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
-                                           uint64_t LoadAddr,
-                                           size_t Size) {
-  // On Linux __register_frame takes a single argument: 
-  // a pointer to the start of the .eh_frame section.
-
-  // How can it find the end? Because crtendS.o is linked 
-  // in and it has an .eh_frame section with four zero chars.
-  __register_frame(Addr);
-}
-
-void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
-                                           uint64_t LoadAddr,
-                                           size_t Size) {
-  __deregister_frame(Addr);
-}
-
-#endif
-
-static int jit_noop() {
-  return 0;
-}
-
-// ARM math functions are statically linked on Android from libgcc.a, but not
-// available at runtime for dynamic linking. On Linux these are usually placed
-// in libgcc_s.so so can be found by normal dynamic lookup.
-#if defined(__BIONIC__) && defined(__arm__)
-// List of functions which are statically linked on Android and can be generated
-// by LLVM. This is done as a nested macro which is used once to declare the
-// imported functions with ARM_MATH_DECL and once to compare them to the
-// user-requested symbol in getSymbolAddress with ARM_MATH_CHECK. The test
-// assumes that all functions start with __aeabi_ and getSymbolAddress must be
-// modified if that changes.
-#define ARM_MATH_IMPORTS(PP) \
-  PP(__aeabi_d2f) \
-  PP(__aeabi_d2iz) \
-  PP(__aeabi_d2lz) \
-  PP(__aeabi_d2uiz) \
-  PP(__aeabi_d2ulz) \
-  PP(__aeabi_dadd) \
-  PP(__aeabi_dcmpeq) \
-  PP(__aeabi_dcmpge) \
-  PP(__aeabi_dcmpgt) \
-  PP(__aeabi_dcmple) \
-  PP(__aeabi_dcmplt) \
-  PP(__aeabi_dcmpun) \
-  PP(__aeabi_ddiv) \
-  PP(__aeabi_dmul) \
-  PP(__aeabi_dsub) \
-  PP(__aeabi_f2d) \
-  PP(__aeabi_f2iz) \
-  PP(__aeabi_f2lz) \
-  PP(__aeabi_f2uiz) \
-  PP(__aeabi_f2ulz) \
-  PP(__aeabi_fadd) \
-  PP(__aeabi_fcmpeq) \
-  PP(__aeabi_fcmpge) \
-  PP(__aeabi_fcmpgt) \
-  PP(__aeabi_fcmple) \
-  PP(__aeabi_fcmplt) \
-  PP(__aeabi_fcmpun) \
-  PP(__aeabi_fdiv) \
-  PP(__aeabi_fmul) \
-  PP(__aeabi_fsub) \
-  PP(__aeabi_i2d) \
-  PP(__aeabi_i2f) \
-  PP(__aeabi_idiv) \
-  PP(__aeabi_idivmod) \
-  PP(__aeabi_l2d) \
-  PP(__aeabi_l2f) \
-  PP(__aeabi_lasr) \
-  PP(__aeabi_ldivmod) \
-  PP(__aeabi_llsl) \
-  PP(__aeabi_llsr) \
-  PP(__aeabi_lmul) \
-  PP(__aeabi_ui2d) \
-  PP(__aeabi_ui2f) \
-  PP(__aeabi_uidiv) \
-  PP(__aeabi_uidivmod) \
-  PP(__aeabi_ul2d) \
-  PP(__aeabi_ul2f) \
-  PP(__aeabi_uldivmod)
-
-// Declare statically linked math functions on ARM. The function declarations
-// here do not have the correct prototypes for each function in
-// ARM_MATH_IMPORTS, but it doesn't matter because only the symbol addresses are
-// needed. In particular the __aeabi_*divmod functions do not have calling
-// conventions which match any C prototype.
-#define ARM_MATH_DECL(name) extern "C" void name();
-ARM_MATH_IMPORTS(ARM_MATH_DECL)
-#undef ARM_MATH_DECL
-#endif
-
-uint64_t
-RTDyldMemoryManager::getSymbolAddressInProcess(const std::string &Name) {
-  // This implementation assumes that the host program is the target.
-  // Clients generating code for a remote target should implement their own
-  // memory manager.
-#if defined(__linux__) && defined(__GLIBC__)
-  //===--------------------------------------------------------------------===//
-  // Function stubs that are invoked instead of certain library calls
-  //
-  // Force the following functions to be linked in to anything that uses the
-  // JIT. This is a hack designed to work around the all-too-clever Glibc
-  // strategy of making these functions work differently when inlined vs. when
-  // not inlined, and hiding their real definitions in a separate archive file
-  // that the dynamic linker can't see. For more info, search for
-  // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
-  if (Name == "stat") return (uint64_t)&stat;
-  if (Name == "fstat") return (uint64_t)&fstat;
-  if (Name == "lstat") return (uint64_t)&lstat;
-  if (Name == "stat64") return (uint64_t)&stat64;
-  if (Name == "fstat64") return (uint64_t)&fstat64;
-  if (Name == "lstat64") return (uint64_t)&lstat64;
-  if (Name == "atexit") return (uint64_t)&atexit;
-  if (Name == "mknod") return (uint64_t)&mknod;
-#endif // __linux__ && __GLIBC__
-  
-  // See ARM_MATH_IMPORTS definition for explanation
-#if defined(__BIONIC__) && defined(__arm__)
-  if (Name.compare(0, 8, "__aeabi_") == 0) {
-    // Check if the user has requested any of the functions listed in
-    // ARM_MATH_IMPORTS, and if so redirect to the statically linked symbol.
-#define ARM_MATH_CHECK(fn) if (Name == #fn) return (uint64_t)&fn;
-    ARM_MATH_IMPORTS(ARM_MATH_CHECK)
-#undef ARM_MATH_CHECK
-  }
-#endif
-
-  // We should not invoke parent's ctors/dtors from generated main()!
-  // On Mingw and Cygwin, the symbol __main is resolved to
-  // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors
-  // (and register wrong callee's dtors with atexit(3)).
-  // We expect ExecutionEngine::runStaticConstructorsDestructors()
-  // is called before ExecutionEngine::runFunctionAsMain() is called.
-  if (Name == "__main") return (uint64_t)&jit_noop;
-
-  // Try to demangle Name before looking it up in the process, otherwise symbol
-  // '_<Name>' (if present) will shadow '<Name>', and there will be no way to
-  // refer to the latter.
-
-  const char *NameStr = Name.c_str();
-
-  if (NameStr[0] == '_')
-    if (void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr + 1))
-      return (uint64_t)Ptr;
-
-  // If we Name did not require demangling, or we failed to find the demangled
-  // name, try again without demangling.
-  return (uint64_t)sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
-}
-
-void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
-                                                     bool AbortOnFailure) {
-  uint64_t Addr = getSymbolAddress(Name);
-
-  if (!Addr && AbortOnFailure)
-    report_fatal_error("Program used external function '" + Name +
-                       "' which could not be resolved!");
-  return (void*)Addr;
-}
-
-} // namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/Android.mk b/lib/ExecutionEngine/RuntimeDyld/Android.mk
index eb2e438..76aae67 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Android.mk
+++ b/lib/ExecutionEngine/RuntimeDyld/Android.mk
@@ -4,12 +4,12 @@ LOCAL_PATH:= $(call my-dir)
 # =====================================================
 include $(CLEAR_VARS)
 
-LOCAL_SRC_FILES :=	\
-	GDBRegistrar.cpp \
-	RuntimeDyld.cpp \
-	RuntimeDyldChecker.cpp \
-	RuntimeDyldELF.cpp \
-	RuntimeDyldMachO.cpp
+LOCAL_SRC_FILES := \
+  RTDyldMemoryManager.cpp \
+  RuntimeDyldChecker.cpp \
+  RuntimeDyld.cpp \
+  RuntimeDyldELF.cpp \
+  RuntimeDyldMachO.cpp
 
 LOCAL_MODULE:= libLLVMRuntimeDyld
 
diff --git a/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt b/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt
index eb1a60b..12bbcc6 100644
--- a/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt
+++ b/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_llvm_library(LLVMRuntimeDyld
-  GDBRegistrar.cpp
+  RTDyldMemoryManager.cpp
   RuntimeDyld.cpp
   RuntimeDyldChecker.cpp
   RuntimeDyldELF.cpp
diff --git a/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp b/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp
deleted file mode 100644
index dfa3a20..0000000
--- a/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-//===-- GDBRegistrar.cpp - Registers objects with GDB ---------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "JITRegistrar.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/MutexGuard.h"
-#include "llvm/Support/ManagedStatic.h"
-
-using namespace llvm;
-
-// This must be kept in sync with gdb/gdb/jit.h .
-extern "C" {
-
-  typedef enum {
-    JIT_NOACTION = 0,
-    JIT_REGISTER_FN,
-    JIT_UNREGISTER_FN
-  } jit_actions_t;
-
-  struct jit_code_entry {
-    struct jit_code_entry *next_entry;
-    struct jit_code_entry *prev_entry;
-    const char *symfile_addr;
-    uint64_t symfile_size;
-  };
-
-  struct jit_descriptor {
-    uint32_t version;
-    // This should be jit_actions_t, but we want to be specific about the
-    // bit-width.
-    uint32_t action_flag;
-    struct jit_code_entry *relevant_entry;
-    struct jit_code_entry *first_entry;
-  };
-
-  // We put information about the JITed function in this global, which the
-  // debugger reads.  Make sure to specify the version statically, because the
-  // debugger checks the version before we can set it during runtime.
-  struct jit_descriptor __jit_debug_descriptor = { 1, 0, nullptr, nullptr };
-
-  // Debuggers puts a breakpoint in this function.
-  LLVM_ATTRIBUTE_NOINLINE void __jit_debug_register_code() {
-    // The noinline and the asm prevent calls to this function from being
-    // optimized out.
-#if !defined(_MSC_VER)
-    asm volatile("":::"memory");
-#endif
-  }
-
-}
-
-namespace {
-
-// Buffer for an in-memory object file in executable memory
-typedef llvm::DenseMap< const char*,
-                        std::pair<std::size_t, jit_code_entry*> >
-  RegisteredObjectBufferMap;
-
-/// Global access point for the JIT debugging interface designed for use with a
-/// singleton toolbox. Handles thread-safe registration and deregistration of
-/// object files that are in executable memory managed by the client of this
-/// class.
-class GDBJITRegistrar : public JITRegistrar {
-  /// A map of in-memory object files that have been registered with the
-  /// JIT interface.
-  RegisteredObjectBufferMap ObjectBufferMap;
-
-public:
-  /// Instantiates the JIT service.
-  GDBJITRegistrar() : ObjectBufferMap() {}
-
-  /// Unregisters each object that was previously registered and releases all
-  /// internal resources.
-  virtual ~GDBJITRegistrar();
-
-  /// Creates an entry in the JIT registry for the buffer @p Object,
-  /// which must contain an object file in executable memory with any
-  /// debug information for the debugger.
-  void registerObject(const ObjectBuffer &Object) override;
-
-  /// Removes the internal registration of @p Object, and
-  /// frees associated resources.
-  /// Returns true if @p Object was found in ObjectBufferMap.
-  bool deregisterObject(const ObjectBuffer &Object) override;
-
-private:
-  /// Deregister the debug info for the given object file from the debugger
-  /// and delete any temporary copies.  This private method does not remove
-  /// the function from Map so that it can be called while iterating over Map.
-  void deregisterObjectInternal(RegisteredObjectBufferMap::iterator I);
-};
-
-/// Lock used to serialize all jit registration events, since they
-/// modify global variables.
-ManagedStatic<sys::Mutex> JITDebugLock;
-
-/// Do the registration.
-void NotifyDebugger(jit_code_entry* JITCodeEntry) {
-  __jit_debug_descriptor.action_flag = JIT_REGISTER_FN;
-
-  // Insert this entry at the head of the list.
-  JITCodeEntry->prev_entry = nullptr;
-  jit_code_entry* NextEntry = __jit_debug_descriptor.first_entry;
-  JITCodeEntry->next_entry = NextEntry;
-  if (NextEntry) {
-    NextEntry->prev_entry = JITCodeEntry;
-  }
-  __jit_debug_descriptor.first_entry = JITCodeEntry;
-  __jit_debug_descriptor.relevant_entry = JITCodeEntry;
-  __jit_debug_register_code();
-}
-
-GDBJITRegistrar::~GDBJITRegistrar() {
-  // Free all registered object files.
-  llvm::MutexGuard locked(*JITDebugLock);
-  for (RegisteredObjectBufferMap::iterator I = ObjectBufferMap.begin(), E = ObjectBufferMap.end();
-       I != E; ++I) {
-    // Call the private method that doesn't update the map so our iterator
-    // doesn't break.
-    deregisterObjectInternal(I);
-  }
-  ObjectBufferMap.clear();
-}
-
-void GDBJITRegistrar::registerObject(const ObjectBuffer &Object) {
-
-  const char *Buffer = Object.getBufferStart();
-  size_t      Size = Object.getBufferSize();
-
-  assert(Buffer && "Attempt to register a null object with a debugger.");
-  llvm::MutexGuard locked(*JITDebugLock);
-  assert(ObjectBufferMap.find(Buffer) == ObjectBufferMap.end() &&
-         "Second attempt to perform debug registration.");
-  jit_code_entry* JITCodeEntry = new jit_code_entry();
-
-  if (!JITCodeEntry) {
-    llvm::report_fatal_error(
-      "Allocation failed when registering a JIT entry!\n");
-  } else {
-    JITCodeEntry->symfile_addr = Buffer;
-    JITCodeEntry->symfile_size = Size;
-
-    ObjectBufferMap[Buffer] = std::make_pair(Size, JITCodeEntry);
-    NotifyDebugger(JITCodeEntry);
-  }
-}
-
-bool GDBJITRegistrar::deregisterObject(const ObjectBuffer& Object) {
-  const char *Buffer = Object.getBufferStart();
-  llvm::MutexGuard locked(*JITDebugLock);
-  RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(Buffer);
-
-  if (I != ObjectBufferMap.end()) {
-    deregisterObjectInternal(I);
-    ObjectBufferMap.erase(I);
-    return true;
-  }
-  return false;
-}
-
-void GDBJITRegistrar::deregisterObjectInternal(
-    RegisteredObjectBufferMap::iterator I) {
-
-  jit_code_entry*& JITCodeEntry = I->second.second;
-
-  // Do the unregistration.
-  {
-    __jit_debug_descriptor.action_flag = JIT_UNREGISTER_FN;
-
-    // Remove the jit_code_entry from the linked list.
-    jit_code_entry* PrevEntry = JITCodeEntry->prev_entry;
-    jit_code_entry* NextEntry = JITCodeEntry->next_entry;
-
-    if (NextEntry) {
-      NextEntry->prev_entry = PrevEntry;
-    }
-    if (PrevEntry) {
-      PrevEntry->next_entry = NextEntry;
-    }
-    else {
-      assert(__jit_debug_descriptor.first_entry == JITCodeEntry);
-      __jit_debug_descriptor.first_entry = NextEntry;
-    }
-
-    // Tell the debugger which entry we removed, and unregister the code.
-    __jit_debug_descriptor.relevant_entry = JITCodeEntry;
-    __jit_debug_register_code();
-  }
-
-  delete JITCodeEntry;
-  JITCodeEntry = nullptr;
-}
-
-llvm::ManagedStatic<GDBJITRegistrar> TheRegistrar;
-
-} // end namespace
-
-namespace llvm {
-
-JITRegistrar& JITRegistrar::getGDBRegistrar() {
-  return *TheRegistrar;
-}
-
-} // namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
deleted file mode 100644
index 636011f..0000000
--- a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
+++ /dev/null
@@ -1,44 +0,0 @@
-//===-- JITRegistrar.h - Registers objects with a debugger ----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_JITREGISTRAR_H
-#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_JITREGISTRAR_H
-
-#include "llvm/ExecutionEngine/ObjectBuffer.h"
-
-namespace llvm {
-
-/// Global access point for the JIT debugging interface.
-class JITRegistrar {
-  virtual void anchor();
-public:
-  /// Instantiates the JIT service.
-  JITRegistrar() {}
-
-  /// Unregisters each object that was previously registered and releases all
-  /// internal resources.
-  virtual ~JITRegistrar() {}
-
-  /// Creates an entry in the JIT registry for the buffer @p Object,
-  /// which must contain an object file in executable memory with any
-  /// debug information for the debugger.
-  virtual void registerObject(const ObjectBuffer &Object) = 0;
-
-  /// Removes the internal registration of @p Object, and
-  /// frees associated resources.
-  /// Returns true if @p Object was previously registered.
-  virtual bool deregisterObject(const ObjectBuffer &Object) = 0;
-
-  /// Returns a reference to a GDB JIT registrar singleton
-  static JITRegistrar& getGDBRegistrar();
-};
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
deleted file mode 100644
index 9bbf6a0d..0000000
--- a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
+++ /dev/null
@@ -1,86 +0,0 @@
-//===-- ObjectImageCommon.h - Format independent executuable object image -===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares a file format independent ObjectImage class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_OBJECTIMAGECOMMON_H
-#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_OBJECTIMAGECOMMON_H
-
-#include "llvm/ExecutionEngine/ObjectBuffer.h"
-#include "llvm/ExecutionEngine/ObjectImage.h"
-#include "llvm/Object/ObjectFile.h"
-
-#include <memory>
-
-namespace llvm {
-
-namespace object {
-  class ObjectFile;
-}
-
-class ObjectImageCommon : public ObjectImage {
-  ObjectImageCommon(); // = delete
-  ObjectImageCommon(const ObjectImageCommon &other); // = delete
-  void anchor() override;
-
-protected:
-  std::unique_ptr<object::ObjectFile> ObjFile;
-
-  // This form of the constructor allows subclasses to use
-  // format-specific subclasses of ObjectFile directly
-  ObjectImageCommon(std::unique_ptr<ObjectBuffer> Input,
-                    std::unique_ptr<object::ObjectFile> Obj)
-      : ObjectImage(std::move(Input)), ObjFile(std::move(Obj)) {}
-
-public:
-  ObjectImageCommon(std::unique_ptr<ObjectBuffer> Input)
-      : ObjectImage(std::move(Input)) {
-    // FIXME: error checking? createObjectFile returns an ErrorOr<ObjectFile*>
-    // and should probably be checked for failure.
-    MemoryBufferRef Buf = Buffer->getMemBuffer();
-    ObjFile = std::move(object::ObjectFile::createObjectFile(Buf).get());
-  }
-  ObjectImageCommon(std::unique_ptr<object::ObjectFile> Input)
-  : ObjectImage(nullptr), ObjFile(std::move(Input))  {}
-  virtual ~ObjectImageCommon() { }
-
-  object::symbol_iterator begin_symbols() const override
-      { return ObjFile->symbol_begin(); }
-  object::symbol_iterator end_symbols() const override
-      { return ObjFile->symbol_end(); }
-
-  object::section_iterator begin_sections() const override
-      { return ObjFile->section_begin(); }
-  object::section_iterator end_sections() const override
-      { return ObjFile->section_end(); }
-
-  /* Triple::ArchType */ unsigned getArch() const override
-      { return ObjFile->getArch(); }
-
-  StringRef getData() const override { return ObjFile->getData(); }
-
-  object::ObjectFile* getObjectFile() const override { return ObjFile.get(); }
-
-  // Subclasses can override these methods to update the image with loaded
-  // addresses for sections and common symbols
-  void updateSectionAddress(const object::SectionRef &Sec,
-                            uint64_t Addr) override {}
-  void updateSymbolAddress(const object::SymbolRef &Sym,
-                           uint64_t Addr) override {}
-
-  // Subclasses can override these methods to provide JIT debugging support
-  void registerWithDebugger() override {}
-  void deregisterWithDebugger() override {}
-};
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
new file mode 100644
index 0000000..2a5e4f8
--- /dev/null
+++ b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
@@ -0,0 +1,294 @@
+//===-- RTDyldMemoryManager.cpp - Memory manager for MC-JIT -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the runtime dynamic memory manager base class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstdlib>
+
+#ifdef __linux__
+  // These includes used by RTDyldMemoryManager::getPointerToNamedFunction()
+  // for Glibc trickery. See comments in this function for more information.
+  #ifdef HAVE_SYS_STAT_H
+    #include <sys/stat.h>
+  #endif
+  #include <fcntl.h>
+  #include <unistd.h>
+#endif
+
+namespace llvm {
+
+RTDyldMemoryManager::~RTDyldMemoryManager() {}
+
+// Determine whether we can register EH tables.
+#if (defined(__GNUC__) && !defined(__ARM_EABI__) && !defined(__ia64__) && \
+     !defined(__SEH__) && !defined(__USING_SJLJ_EXCEPTIONS__))
+#define HAVE_EHTABLE_SUPPORT 1
+#else
+#define HAVE_EHTABLE_SUPPORT 0
+#endif
+
+#if HAVE_EHTABLE_SUPPORT
+extern "C" void __register_frame(void*);
+extern "C" void __deregister_frame(void*);
+#else
+// The building compiler does not have __(de)register_frame but
+// it may be found at runtime in a dynamically-loaded library.
+// For example, this happens when building LLVM with Visual C++
+// but using the MingW runtime.
+void __register_frame(void *p) {
+  static bool Searched = false;
+  static void *rf = 0;
+
+  if (!Searched) {
+    Searched = true;
+    rf = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(
+                                                      "__register_frame");
+  }
+  if (rf)
+    ((void (*)(void *))rf)(p);
+}
+
+void __deregister_frame(void *p) {
+  static bool Searched = false;
+  static void *df = 0;
+
+  if (!Searched) {
+    Searched = true;
+    df = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(
+                                                      "__deregister_frame");
+  }
+  if (df)
+    ((void (*)(void *))df)(p);
+}
+#endif
+
+#ifdef __APPLE__
+
+static const char *processFDE(const char *Entry, bool isDeregister) {
+  const char *P = Entry;
+  uint32_t Length = *((const uint32_t *)P);
+  P += 4;
+  uint32_t Offset = *((const uint32_t *)P);
+  if (Offset != 0) {
+    if (isDeregister)
+      __deregister_frame(const_cast<char *>(Entry));
+    else
+      __register_frame(const_cast<char *>(Entry));
+  }
+  return P + Length;
+}
+
+// This implementation handles frame registration for local targets.
+// Memory managers for remote targets should re-implement this function
+// and use the LoadAddr parameter.
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  // On OS X OS X __register_frame takes a single FDE as an argument.
+  // See http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-April/061768.html
+  const char *P = (const char *)Addr;
+  const char *End = P + Size;
+  do  {
+    P = processFDE(P, false);
+  } while(P != End);
+}
+
+void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  const char *P = (const char *)Addr;
+  const char *End = P + Size;
+  do  {
+    P = processFDE(P, true);
+  } while(P != End);
+}
+
+#else
+
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  // On Linux __register_frame takes a single argument: 
+  // a pointer to the start of the .eh_frame section.
+
+  // How can it find the end? Because crtendS.o is linked 
+  // in and it has an .eh_frame section with four zero chars.
+  __register_frame(Addr);
+}
+
+void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  __deregister_frame(Addr);
+}
+
+#endif
+
+static int jit_noop() {
+  return 0;
+}
+
+// ARM math functions are statically linked on Android from libgcc.a, but not
+// available at runtime for dynamic linking. On Linux these are usually placed
+// in libgcc_s.so so can be found by normal dynamic lookup.
+#if defined(__BIONIC__) && defined(__arm__)
+// List of functions which are statically linked on Android and can be generated
+// by LLVM. This is done as a nested macro which is used once to declare the
+// imported functions with ARM_MATH_DECL and once to compare them to the
+// user-requested symbol in getSymbolAddress with ARM_MATH_CHECK. The test
+// assumes that all functions start with __aeabi_ and getSymbolAddress must be
+// modified if that changes.
+#define ARM_MATH_IMPORTS(PP) \
+  PP(__aeabi_d2f) \
+  PP(__aeabi_d2iz) \
+  PP(__aeabi_d2lz) \
+  PP(__aeabi_d2uiz) \
+  PP(__aeabi_d2ulz) \
+  PP(__aeabi_dadd) \
+  PP(__aeabi_dcmpeq) \
+  PP(__aeabi_dcmpge) \
+  PP(__aeabi_dcmpgt) \
+  PP(__aeabi_dcmple) \
+  PP(__aeabi_dcmplt) \
+  PP(__aeabi_dcmpun) \
+  PP(__aeabi_ddiv) \
+  PP(__aeabi_dmul) \
+  PP(__aeabi_dsub) \
+  PP(__aeabi_f2d) \
+  PP(__aeabi_f2iz) \
+  PP(__aeabi_f2lz) \
+  PP(__aeabi_f2uiz) \
+  PP(__aeabi_f2ulz) \
+  PP(__aeabi_fadd) \
+  PP(__aeabi_fcmpeq) \
+  PP(__aeabi_fcmpge) \
+  PP(__aeabi_fcmpgt) \
+  PP(__aeabi_fcmple) \
+  PP(__aeabi_fcmplt) \
+  PP(__aeabi_fcmpun) \
+  PP(__aeabi_fdiv) \
+  PP(__aeabi_fmul) \
+  PP(__aeabi_fsub) \
+  PP(__aeabi_i2d) \
+  PP(__aeabi_i2f) \
+  PP(__aeabi_idiv) \
+  PP(__aeabi_idivmod) \
+  PP(__aeabi_l2d) \
+  PP(__aeabi_l2f) \
+  PP(__aeabi_lasr) \
+  PP(__aeabi_ldivmod) \
+  PP(__aeabi_llsl) \
+  PP(__aeabi_llsr) \
+  PP(__aeabi_lmul) \
+  PP(__aeabi_ui2d) \
+  PP(__aeabi_ui2f) \
+  PP(__aeabi_uidiv) \
+  PP(__aeabi_uidivmod) \
+  PP(__aeabi_ul2d) \
+  PP(__aeabi_ul2f) \
+  PP(__aeabi_uldivmod)
+
+// Declare statically linked math functions on ARM. The function declarations
+// here do not have the correct prototypes for each function in
+// ARM_MATH_IMPORTS, but it doesn't matter because only the symbol addresses are
+// needed. In particular the __aeabi_*divmod functions do not have calling
+// conventions which match any C prototype.
+#define ARM_MATH_DECL(name) extern "C" void name();
+ARM_MATH_IMPORTS(ARM_MATH_DECL)
+#undef ARM_MATH_DECL
+#endif
+
+#if defined(__linux__) && defined(__GLIBC__) && \
+      (defined(__i386__) || defined(__x86_64__))
+extern "C" LLVM_ATTRIBUTE_WEAK void __morestack();
+#endif
+
+uint64_t
+RTDyldMemoryManager::getSymbolAddressInProcess(const std::string &Name) {
+  // This implementation assumes that the host program is the target.
+  // Clients generating code for a remote target should implement their own
+  // memory manager.
+#if defined(__linux__) && defined(__GLIBC__)
+  //===--------------------------------------------------------------------===//
+  // Function stubs that are invoked instead of certain library calls
+  //
+  // Force the following functions to be linked in to anything that uses the
+  // JIT. This is a hack designed to work around the all-too-clever Glibc
+  // strategy of making these functions work differently when inlined vs. when
+  // not inlined, and hiding their real definitions in a separate archive file
+  // that the dynamic linker can't see. For more info, search for
+  // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
+  if (Name == "stat") return (uint64_t)&stat;
+  if (Name == "fstat") return (uint64_t)&fstat;
+  if (Name == "lstat") return (uint64_t)&lstat;
+  if (Name == "stat64") return (uint64_t)&stat64;
+  if (Name == "fstat64") return (uint64_t)&fstat64;
+  if (Name == "lstat64") return (uint64_t)&lstat64;
+  if (Name == "atexit") return (uint64_t)&atexit;
+  if (Name == "mknod") return (uint64_t)&mknod;
+
+#if defined(__i386__) || defined(__x86_64__)
+  // __morestack lives in libgcc, a static library.
+  if (&__morestack && Name == "__morestack")
+    return (uint64_t)&__morestack;
+#endif
+#endif // __linux__ && __GLIBC__
+  
+  // See ARM_MATH_IMPORTS definition for explanation
+#if defined(__BIONIC__) && defined(__arm__)
+  if (Name.compare(0, 8, "__aeabi_") == 0) {
+    // Check if the user has requested any of the functions listed in
+    // ARM_MATH_IMPORTS, and if so redirect to the statically linked symbol.
+#define ARM_MATH_CHECK(fn) if (Name == #fn) return (uint64_t)&fn;
+    ARM_MATH_IMPORTS(ARM_MATH_CHECK)
+#undef ARM_MATH_CHECK
+  }
+#endif
+
+  // We should not invoke parent's ctors/dtors from generated main()!
+  // On Mingw and Cygwin, the symbol __main is resolved to
+  // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors
+  // (and register wrong callee's dtors with atexit(3)).
+  // We expect ExecutionEngine::runStaticConstructorsDestructors()
+  // is called before ExecutionEngine::runFunctionAsMain() is called.
+  if (Name == "__main") return (uint64_t)&jit_noop;
+
+  // Try to demangle Name before looking it up in the process, otherwise symbol
+  // '_<Name>' (if present) will shadow '<Name>', and there will be no way to
+  // refer to the latter.
+
+  const char *NameStr = Name.c_str();
+
+  if (NameStr[0] == '_')
+    if (void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr + 1))
+      return (uint64_t)Ptr;
+
+  // If we Name did not require demangling, or we failed to find the demangled
+  // name, try again without demangling.
+  return (uint64_t)sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
+}
+
+void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
+                                                     bool AbortOnFailure) {
+  uint64_t Addr = getSymbolAddress(Name);
+
+  if (!Addr && AbortOnFailure)
+    report_fatal_error("Program used external function '" + Name +
+                       "' which could not be resolved!");
+  return (void*)Addr;
+}
+
+} // namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index c7c67f6..54f1a1c 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -12,13 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ExecutionEngine/RuntimeDyld.h"
-#include "JITRegistrar.h"
-#include "ObjectImageCommon.h"
 #include "RuntimeDyldCheckerImpl.h"
 #include "RuntimeDyldELF.h"
 #include "RuntimeDyldImpl.h"
 #include "RuntimeDyldMachO.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MutexGuard.h"
 
@@ -30,10 +28,8 @@ using namespace llvm::object;
 // Empty out-of-line virtual destructor as the key function.
 RuntimeDyldImpl::~RuntimeDyldImpl() {}
 
-// Pin the JITRegistrar's and ObjectImage*'s vtables to this file.
-void JITRegistrar::anchor() {}
-void ObjectImage::anchor() {}
-void ObjectImageCommon::anchor() {}
+// Pin LoadedObjectInfo's vtables to this file.
+void RuntimeDyld::LoadedObjectInfo::anchor() {}
 
 namespace llvm {
 
@@ -139,93 +135,88 @@ static std::error_code getOffset(const SymbolRef &Sym, uint64_t &Result) {
   return object_error::success;
 }
 
-std::unique_ptr<ObjectImage>
-RuntimeDyldImpl::loadObject(std::unique_ptr<ObjectImage> Obj) {
+std::pair<unsigned, unsigned>
+RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
   MutexGuard locked(lock);
 
-  if (!Obj)
-    return nullptr;
+  // Grab the first Section ID. We'll use this later to construct the underlying
+  // range for the returned LoadedObjectInfo.
+  unsigned SectionsAddedBeginIdx = Sections.size();
 
   // Save information about our target
-  Arch = (Triple::ArchType)Obj->getArch();
-  IsTargetLittleEndian = Obj->getObjectFile()->isLittleEndian();
+  Arch = (Triple::ArchType)Obj.getArch();
+  IsTargetLittleEndian = Obj.isLittleEndian();
 
   // Compute the memory size required to load all sections to be loaded
   // and pass this information to the memory manager
   if (MemMgr->needsToReserveAllocationSpace()) {
     uint64_t CodeSize = 0, DataSizeRO = 0, DataSizeRW = 0;
-    computeTotalAllocSize(*Obj, CodeSize, DataSizeRO, DataSizeRW);
+    computeTotalAllocSize(Obj, CodeSize, DataSizeRO, DataSizeRW);
     MemMgr->reserveAllocationSpace(CodeSize, DataSizeRO, DataSizeRW);
   }
 
-  // Symbols found in this object
-  StringMap<SymbolLoc> LocalSymbols;
   // Used sections from the object file
   ObjSectionToIDMap LocalSections;
 
   // Common symbols requiring allocation, with their sizes and alignments
-  CommonSymbolMap CommonSymbols;
-  // Maximum required total memory to allocate all common symbols
-  uint64_t CommonSize = 0;
+  CommonSymbolList CommonSymbols;
 
   // Parse symbols
   DEBUG(dbgs() << "Parse symbols:\n");
-  for (symbol_iterator I = Obj->begin_symbols(), E = Obj->end_symbols(); I != E;
+  for (symbol_iterator I = Obj.symbol_begin(), E = Obj.symbol_end(); I != E;
        ++I) {
-    object::SymbolRef::Type SymType;
-    StringRef Name;
-    Check(I->getType(SymType));
-    Check(I->getName(Name));
-
     uint32_t Flags = I->getFlags();
 
     bool IsCommon = Flags & SymbolRef::SF_Common;
-    if (IsCommon) {
-      // Add the common symbols to a list.  We'll allocate them all below.
-      if (!GlobalSymbolTable.count(Name)) {
-        uint32_t Align;
-        Check(I->getAlignment(Align));
-        uint64_t Size = 0;
-        Check(I->getSize(Size));
-        CommonSize += Size + Align;
-        CommonSymbols[*I] = CommonSymbolInfo(Size, Align);
-      }
-    } else {
+    if (IsCommon)
+      CommonSymbols.push_back(*I);
+    else {
+      object::SymbolRef::Type SymType;
+      Check(I->getType(SymType));
+
       if (SymType == object::SymbolRef::ST_Function ||
           SymType == object::SymbolRef::ST_Data ||
           SymType == object::SymbolRef::ST_Unknown) {
+
+        StringRef Name;
         uint64_t SectOffset;
-        StringRef SectionData;
-        section_iterator SI = Obj->end_sections();
+        Check(I->getName(Name));
         Check(getOffset(*I, SectOffset));
+        section_iterator SI = Obj.section_end();
         Check(I->getSection(SI));
-        if (SI == Obj->end_sections())
+        if (SI == Obj.section_end())
           continue;
+        StringRef SectionData;
         Check(SI->getContents(SectionData));
         bool IsCode = SI->isText();
         unsigned SectionID =
-            findOrEmitSection(*Obj, *SI, IsCode, LocalSections);
-        LocalSymbols[Name.data()] = SymbolLoc(SectionID, SectOffset);
-        DEBUG(dbgs() << "\tOffset: " << format("%p", (uintptr_t)SectOffset)
-                     << " flags: " << Flags << " SID: " << SectionID);
-        GlobalSymbolTable[Name] = SymbolLoc(SectionID, SectOffset);
+            findOrEmitSection(Obj, *SI, IsCode, LocalSections);
+        DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name
+                     << " SID: " << SectionID << " Offset: "
+                     << format("%p", (uintptr_t)SectOffset)
+                     << " flags: " << Flags << "\n");
+        SymbolInfo::Visibility Vis =
+          (Flags & SymbolRef::SF_Exported) ?
+            SymbolInfo::Default : SymbolInfo::Hidden;
+        GlobalSymbolTable[Name] = SymbolInfo(SectionID, SectOffset, Vis);
       }
     }
-    DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name << "\n");
   }
 
   // Allocate common symbols
-  if (CommonSize != 0)
-    emitCommonSymbols(*Obj, CommonSymbols, CommonSize, GlobalSymbolTable);
+  emitCommonSymbols(Obj, CommonSymbols);
 
   // Parse and process relocations
   DEBUG(dbgs() << "Parse relocations:\n");
-  for (section_iterator SI = Obj->begin_sections(), SE = Obj->end_sections();
+  for (section_iterator SI = Obj.section_begin(), SE = Obj.section_end();
        SI != SE; ++SI) {
     unsigned SectionID = 0;
     StubMap Stubs;
     section_iterator RelocatedSection = SI->getRelocatedSection();
 
+    if (RelocatedSection == SE)
+      continue;
+
     relocation_iterator I = SI->relocation_begin();
     relocation_iterator E = SI->relocation_end();
 
@@ -234,23 +225,24 @@ RuntimeDyldImpl::loadObject(std::unique_ptr<ObjectImage> Obj) {
 
     bool IsCode = RelocatedSection->isText();
     SectionID =
-        findOrEmitSection(*Obj, *RelocatedSection, IsCode, LocalSections);
+        findOrEmitSection(Obj, *RelocatedSection, IsCode, LocalSections);
     DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n");
 
     for (; I != E;)
-      I = processRelocationRef(SectionID, I, *Obj, LocalSections, LocalSymbols,
-                               Stubs);
+      I = processRelocationRef(SectionID, I, Obj, LocalSections, Stubs);
 
     // If there is an attached checker, notify it about the stubs for this
     // section so that they can be verified.
     if (Checker)
-      Checker->registerStubMap(Obj->getImageName(), SectionID, Stubs);
+      Checker->registerStubMap(Obj.getFileName(), SectionID, Stubs);
   }
 
   // Give the subclasses a chance to tie-up any loose ends.
-  finalizeLoad(*Obj, LocalSections);
+  finalizeLoad(Obj, LocalSections);
+
+  unsigned SectionsAddedEndIdx = Sections.size();
 
-  return Obj;
+  return std::make_pair(SectionsAddedBeginIdx, SectionsAddedEndIdx);
 }
 
 // A helper method for computeTotalAllocSize.
@@ -268,9 +260,37 @@ computeAllocationSizeForSections(std::vector<uint64_t> &SectionSizes,
   return TotalSize;
 }
 
+static bool isRequiredForExecution(const SectionRef &Section) {
+  const ObjectFile *Obj = Section.getObject();
+  if (auto *ELFObj = dyn_cast<object::ELFObjectFileBase>(Obj))
+    return ELFObj->getSectionFlags(Section) & ELF::SHF_ALLOC;
+  assert(isa<MachOObjectFile>(Obj));
+  return true;
+ }
+
+static bool isReadOnlyData(const SectionRef &Section) {
+  const ObjectFile *Obj = Section.getObject();
+  if (auto *ELFObj = dyn_cast<object::ELFObjectFileBase>(Obj))
+    return !(ELFObj->getSectionFlags(Section) &
+             (ELF::SHF_WRITE | ELF::SHF_EXECINSTR));
+  assert(isa<MachOObjectFile>(Obj));
+  return false;
+}
+
+static bool isZeroInit(const SectionRef &Section) {
+  const ObjectFile *Obj = Section.getObject();
+  if (auto *ELFObj = dyn_cast<object::ELFObjectFileBase>(Obj))
+    return ELFObj->getSectionType(Section) == ELF::SHT_NOBITS;
+
+  auto *MachO = cast<MachOObjectFile>(Obj);
+  unsigned SectionType = MachO->getSectionType(Section);
+  return SectionType == MachO::S_ZEROFILL ||
+         SectionType == MachO::S_GB_ZEROFILL;
+}
+
 // Compute an upper bound of the memory size that is required to load all
 // sections
-void RuntimeDyldImpl::computeTotalAllocSize(ObjectImage &Obj,
+void RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
                                             uint64_t &CodeSize,
                                             uint64_t &DataSizeRO,
                                             uint64_t &DataSizeRW) {
@@ -282,11 +302,11 @@ void RuntimeDyldImpl::computeTotalAllocSize(ObjectImage &Obj,
 
   // Collect sizes of all sections to be loaded;
   // also determine the max alignment of all sections
-  for (section_iterator SI = Obj.begin_sections(), SE = Obj.end_sections();
+  for (section_iterator SI = Obj.section_begin(), SE = Obj.section_end();
        SI != SE; ++SI) {
     const SectionRef &Section = *SI;
 
-    bool IsRequired = Section.isRequiredForExecution();
+    bool IsRequired = isRequiredForExecution(Section);
 
     // Consider only the sections that are required to be loaded for execution
     if (IsRequired) {
@@ -294,7 +314,7 @@ void RuntimeDyldImpl::computeTotalAllocSize(ObjectImage &Obj,
       uint64_t DataSize = Section.getSize();
       uint64_t Alignment64 = Section.getAlignment();
       bool IsCode = Section.isText();
-      bool IsReadOnly = Section.isReadOnlyData();
+      bool IsReadOnly = isReadOnlyData(Section);
       Check(Section.getName(Name));
       unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
 
@@ -328,7 +348,7 @@ void RuntimeDyldImpl::computeTotalAllocSize(ObjectImage &Obj,
 
   // Compute the size of all common symbols
   uint64_t CommonSize = 0;
-  for (symbol_iterator I = Obj.begin_symbols(), E = Obj.end_symbols(); I != E;
+  for (symbol_iterator I = Obj.symbol_begin(), E = Obj.symbol_end(); I != E;
        ++I) {
     uint32_t Flags = I->getFlags();
     if (Flags & SymbolRef::SF_Common) {
@@ -353,7 +373,7 @@ void RuntimeDyldImpl::computeTotalAllocSize(ObjectImage &Obj,
 }
 
 // compute stub buffer size for the given section
-unsigned RuntimeDyldImpl::computeSectionStubBufSize(ObjectImage &Obj,
+unsigned RuntimeDyldImpl::computeSectionStubBufSize(const ObjectFile &Obj,
                                                     const SectionRef &Section) {
   unsigned StubSize = getMaxStubSize();
   if (StubSize == 0) {
@@ -363,7 +383,7 @@ unsigned RuntimeDyldImpl::computeSectionStubBufSize(ObjectImage &Obj,
   // necessary section allocation size in loadObject by walking all the sections
   // once.
   unsigned StubBufSize = 0;
-  for (section_iterator SI = Obj.begin_sections(), SE = Obj.end_sections();
+  for (section_iterator SI = Obj.section_begin(), SE = Obj.section_end();
        SI != SE; ++SI) {
     section_iterator RelSecI = SI->getRelocatedSection();
     if (!(RelSecI == Section))
@@ -418,46 +438,77 @@ void RuntimeDyldImpl::writeBytesUnaligned(uint64_t Value, uint8_t *Dst,
   }
 }
 
-void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
-                                        const CommonSymbolMap &CommonSymbols,
-                                        uint64_t TotalSize,
-                                        SymbolTableMap &SymbolTable) {
+void RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj,
+                                        CommonSymbolList &CommonSymbols) {
+  if (CommonSymbols.empty())
+    return;
+
+  uint64_t CommonSize = 0;
+  CommonSymbolList SymbolsToAllocate;
+
+  DEBUG(dbgs() << "Processing common symbols...\n");
+
+  for (const auto &Sym : CommonSymbols) {
+    StringRef Name;
+    Check(Sym.getName(Name));
+
+    // Skip common symbols already elsewhere.
+    if (GlobalSymbolTable.count(Name) ||
+        MemMgr->getSymbolAddressInLogicalDylib(Name)) {
+      DEBUG(dbgs() << "\tSkipping already emitted common symbol '" << Name
+                   << "'\n");
+      continue;
+    }
+
+    uint32_t Align = 0;
+    uint64_t Size = 0;
+    Check(Sym.getAlignment(Align));
+    Check(Sym.getSize(Size));
+
+    CommonSize += Align + Size;
+    SymbolsToAllocate.push_back(Sym);
+  }
+
   // Allocate memory for the section
   unsigned SectionID = Sections.size();
-  uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, sizeof(void *),
+  uint8_t *Addr = MemMgr->allocateDataSection(CommonSize, sizeof(void *),
                                               SectionID, StringRef(), false);
   if (!Addr)
     report_fatal_error("Unable to allocate memory for common symbols!");
   uint64_t Offset = 0;
-  Sections.push_back(SectionEntry("<common symbols>", Addr, TotalSize, 0));
-  memset(Addr, 0, TotalSize);
+  Sections.push_back(SectionEntry("<common symbols>", Addr, CommonSize, 0));
+  memset(Addr, 0, CommonSize);
 
   DEBUG(dbgs() << "emitCommonSection SectionID: " << SectionID << " new addr: "
-               << format("%p", Addr) << " DataSize: " << TotalSize << "\n");
+               << format("%p", Addr) << " DataSize: " << CommonSize << "\n");
 
   // Assign the address of each symbol
-  for (CommonSymbolMap::const_iterator it = CommonSymbols.begin(),
-       itEnd = CommonSymbols.end(); it != itEnd; ++it) {
-    uint64_t Size = it->second.first;
-    uint64_t Align = it->second.second;
+  for (auto &Sym : SymbolsToAllocate) {
+    uint32_t Align;
+    uint64_t Size;
     StringRef Name;
-    it->first.getName(Name);
+    Check(Sym.getAlignment(Align));
+    Check(Sym.getSize(Size));
+    Check(Sym.getName(Name));
     if (Align) {
       // This symbol has an alignment requirement.
       uint64_t AlignOffset = OffsetToAlignment((uint64_t)Addr, Align);
       Addr += AlignOffset;
       Offset += AlignOffset;
-      DEBUG(dbgs() << "Allocating common symbol " << Name << " address "
-                   << format("%p\n", Addr));
     }
-    Obj.updateSymbolAddress(it->first, (uint64_t)Addr);
-    SymbolTable[Name.data()] = SymbolLoc(SectionID, Offset);
+    uint32_t Flags = Sym.getFlags();
+    SymbolInfo::Visibility Vis =
+      (Flags & SymbolRef::SF_Exported) ?
+        SymbolInfo::Default : SymbolInfo::Hidden;
+    DEBUG(dbgs() << "Allocating common symbol " << Name << " address "
+                 << format("%p", Addr) << "\n");
+    GlobalSymbolTable[Name] = SymbolInfo(SectionID, Offset, Vis);
     Offset += Size;
     Addr += Size;
   }
 }
 
-unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
+unsigned RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
                                       const SectionRef &Section, bool IsCode) {
 
   StringRef data;
@@ -468,10 +519,10 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
   unsigned PaddingSize = 0;
   unsigned StubBufSize = 0;
   StringRef Name;
-  bool IsRequired = Section.isRequiredForExecution();
+  bool IsRequired = isRequiredForExecution(Section);
   bool IsVirtual = Section.isVirtual();
-  bool IsZeroInit = Section.isZeroInit();
-  bool IsReadOnly = Section.isReadOnlyData();
+  bool IsZeroInit = isZeroInit(Section);
+  bool IsReadOnly = isReadOnlyData(Section);
   uint64_t DataSize = Section.getSize();
   Check(Section.getName(Name));
 
@@ -521,7 +572,6 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
                  << " new addr: " << format("%p", Addr)
                  << " DataSize: " << DataSize << " StubBufSize: " << StubBufSize
                  << " Allocate: " << Allocate << "\n");
-    Obj.updateSectionAddress(Section, (uint64_t)Addr);
   } else {
     // Even if we didn't load the section, we need to record an entry for it
     // to handle later processing (and by 'handle' I mean don't do anything
@@ -537,12 +587,12 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
   Sections.push_back(SectionEntry(Name, Addr, DataSize, (uintptr_t)pData));
 
   if (Checker)
-    Checker->registerSection(Obj.getImageName(), SectionID);
+    Checker->registerSection(Obj.getFileName(), SectionID);
 
   return SectionID;
 }
 
-unsigned RuntimeDyldImpl::findOrEmitSection(ObjectImage &Obj,
+unsigned RuntimeDyldImpl::findOrEmitSection(const ObjectFile &Obj,
                                             const SectionRef &Section,
                                             bool IsCode,
                                             ObjSectionToIDMap &LocalSections) {
@@ -568,14 +618,15 @@ void RuntimeDyldImpl::addRelocationForSymbol(const RelocationEntry &RE,
   // Relocation by symbol.  If the symbol is found in the global symbol table,
   // create an appropriate section relocation.  Otherwise, add it to
   // ExternalSymbolRelocations.
-  SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(SymbolName);
+  RTDyldSymbolTable::const_iterator Loc = GlobalSymbolTable.find(SymbolName);
   if (Loc == GlobalSymbolTable.end()) {
     ExternalSymbolRelocations[SymbolName].push_back(RE);
   } else {
     // Copy the RE since we want to modify its addend.
     RelocationEntry RECopy = RE;
-    RECopy.Addend += Loc->second.second;
-    Relocations[Loc->second.first].push_back(RECopy);
+    const auto &SymInfo = Loc->second;
+    RECopy.Addend += SymInfo.getOffset();
+    Relocations[SymInfo.getSectionID()].push_back(RECopy);
   }
 }
 
@@ -700,7 +751,7 @@ void RuntimeDyldImpl::resolveExternalSymbols() {
       resolveRelocationList(Relocs, 0);
     } else {
       uint64_t Addr = 0;
-      SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name);
+      RTDyldSymbolTable::const_iterator Loc = GlobalSymbolTable.find(Name);
       if (Loc == GlobalSymbolTable.end()) {
         // This is an external symbol, try to get its address from
         // MemoryManager.
@@ -715,8 +766,9 @@ void RuntimeDyldImpl::resolveExternalSymbols() {
       } else {
         // We found the symbol in our global table.  It was probably in a
         // Module that we loaded previously.
-        SymbolLoc SymLoc = Loc->second;
-        Addr = getSectionLoadAddress(SymLoc.first) + SymLoc.second;
+        const auto &SymInfo = Loc->second;
+        Addr = getSectionLoadAddress(SymInfo.getSectionID()) +
+               SymInfo.getOffset();
       }
 
       // FIXME: Implement error handling that doesn't kill the host program!
@@ -739,6 +791,16 @@ void RuntimeDyldImpl::resolveExternalSymbols() {
 
 //===----------------------------------------------------------------------===//
 // RuntimeDyld class implementation
+
+uint64_t RuntimeDyld::LoadedObjectInfo::getSectionLoadAddress(
+                                                  StringRef SectionName) const {
+  for (unsigned I = BeginIdx; I != EndIdx; ++I)
+    if (RTDyld.Sections[I].Name == SectionName)
+      return RTDyld.Sections[I].LoadAddress;
+
+  return 0;
+}
+
 RuntimeDyld::RuntimeDyld(RTDyldMemoryManager *mm) {
   // FIXME: There's a potential issue lurking here if a single instance of
   // RuntimeDyld is used to load multiple objects.  The current implementation
@@ -772,78 +834,23 @@ createRuntimeDyldMachO(Triple::ArchType Arch, RTDyldMemoryManager *MM,
   return Dyld;
 }
 
-std::unique_ptr<ObjectImage>
-RuntimeDyld::loadObject(std::unique_ptr<ObjectFile> InputObject) {
-  std::unique_ptr<ObjectImage> InputImage;
-
-  ObjectFile &Obj = *InputObject;
-
-  if (InputObject->isELF()) {
-    InputImage.reset(RuntimeDyldELF::createObjectImageFromFile(std::move(InputObject)));
-    if (!Dyld)
+std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
+RuntimeDyld::loadObject(const ObjectFile &Obj) {
+  if (!Dyld) {
+    if (Obj.isELF())
       Dyld = createRuntimeDyldELF(MM, ProcessAllSections, Checker);
-  } else if (InputObject->isMachO()) {
-    InputImage.reset(RuntimeDyldMachO::createObjectImageFromFile(std::move(InputObject)));
-    if (!Dyld)
+    else if (Obj.isMachO())
       Dyld = createRuntimeDyldMachO(
-          static_cast<Triple::ArchType>(InputImage->getArch()), MM,
-          ProcessAllSections, Checker);
-  } else
-    report_fatal_error("Incompatible object format!");
-
-  if (!Dyld->isCompatibleFile(&Obj))
-    report_fatal_error("Incompatible object format!");
-
-  return Dyld->loadObject(std::move(InputImage));
-}
-
-std::unique_ptr<ObjectImage>
-RuntimeDyld::loadObject(std::unique_ptr<ObjectBuffer> InputBuffer) {
-  std::unique_ptr<ObjectImage> InputImage;
-  sys::fs::file_magic Type = sys::fs::identify_magic(InputBuffer->getBuffer());
-  auto *InputBufferPtr = InputBuffer.get();
-
-  switch (Type) {
-  case sys::fs::file_magic::elf:
-  case sys::fs::file_magic::elf_relocatable:
-  case sys::fs::file_magic::elf_executable:
-  case sys::fs::file_magic::elf_shared_object:
-  case sys::fs::file_magic::elf_core:
-    InputImage = RuntimeDyldELF::createObjectImage(std::move(InputBuffer));
-    if (!Dyld)
-      Dyld = createRuntimeDyldELF(MM, ProcessAllSections, Checker);
-    break;
-  case sys::fs::file_magic::macho_object:
-  case sys::fs::file_magic::macho_executable:
-  case sys::fs::file_magic::macho_fixed_virtual_memory_shared_lib:
-  case sys::fs::file_magic::macho_core:
-  case sys::fs::file_magic::macho_preload_executable:
-  case sys::fs::file_magic::macho_dynamically_linked_shared_lib:
-  case sys::fs::file_magic::macho_dynamic_linker:
-  case sys::fs::file_magic::macho_bundle:
-  case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
-  case sys::fs::file_magic::macho_dsym_companion:
-    InputImage = RuntimeDyldMachO::createObjectImage(std::move(InputBuffer));
-    if (!Dyld)
-      Dyld = createRuntimeDyldMachO(
-          static_cast<Triple::ArchType>(InputImage->getArch()), MM,
-          ProcessAllSections, Checker);
-    break;
-  case sys::fs::file_magic::unknown:
-  case sys::fs::file_magic::bitcode:
-  case sys::fs::file_magic::archive:
-  case sys::fs::file_magic::coff_object:
-  case sys::fs::file_magic::coff_import_library:
-  case sys::fs::file_magic::pecoff_executable:
-  case sys::fs::file_magic::macho_universal_binary:
-  case sys::fs::file_magic::windows_resource:
-    report_fatal_error("Incompatible object format!");
+               static_cast<Triple::ArchType>(Obj.getArch()), MM,
+               ProcessAllSections, Checker);
+    else
+      report_fatal_error("Incompatible object format!");
   }
 
-  if (!Dyld->isCompatibleFormat(InputBufferPtr))
+  if (!Dyld->isCompatibleFile(Obj))
     report_fatal_error("Incompatible object format!");
 
-  return Dyld->loadObject(std::move(InputImage));
+  return Dyld->loadObject(Obj);
 }
 
 void *RuntimeDyld::getSymbolAddress(StringRef Name) const {
@@ -858,6 +865,12 @@ uint64_t RuntimeDyld::getSymbolLoadAddress(StringRef Name) const {
   return Dyld->getSymbolLoadAddress(Name);
 }
 
+uint64_t RuntimeDyld::getExportedSymbolLoadAddress(StringRef Name) const {
+  if (!Dyld)
+    return 0;
+  return Dyld->getExportedSymbolLoadAddress(Name);
+}
+
 void RuntimeDyld::resolveRelocations() { Dyld->resolveRelocations(); }
 
 void RuntimeDyld::reassignSectionAddress(unsigned SectionID, uint64_t Addr) {
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
index 8818349..976a434 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
@@ -8,13 +8,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/STLExtras.h"
+#include "RuntimeDyldCheckerImpl.h"
+#include "RuntimeDyldImpl.h"
 #include "llvm/ExecutionEngine/RuntimeDyldChecker.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/Path.h"
-#include "RuntimeDyldCheckerImpl.h"
-#include "RuntimeDyldImpl.h"
 #include <cctype>
 #include <memory>
 
@@ -260,9 +260,7 @@ private:
                    << "'. Instruction has only "
                    << format("%i", Inst.getNumOperands())
                    << " operands.\nInstruction is:\n  ";
-      Inst.dump_pretty(ErrMsgStream,
-                       Checker.Disassembler->getContext().getAsmInfo(),
-                       Checker.InstPrinter);
+      Inst.dump_pretty(ErrMsgStream, Checker.InstPrinter);
       return std::make_pair(EvalResult(ErrMsgStream.str()), "");
     }
 
@@ -272,9 +270,7 @@ private:
       raw_string_ostream ErrMsgStream(ErrMsg);
       ErrMsgStream << "Operand '" << format("%i", OpIdx) << "' of instruction '"
                    << Symbol << "' is not an immediate.\nInstruction is:\n  ";
-      Inst.dump_pretty(ErrMsgStream,
-                       Checker.Disassembler->getContext().getAsmInfo(),
-                       Checker.InstPrinter);
+      Inst.dump_pretty(ErrMsgStream, Checker.InstPrinter);
 
       return std::make_pair(EvalResult(ErrMsgStream.str()), "");
     }
@@ -740,7 +736,9 @@ uint64_t RuntimeDyldCheckerImpl::getSymbolLinkerAddr(StringRef Symbol) const {
 }
 
 uint64_t RuntimeDyldCheckerImpl::getSymbolRemoteAddr(StringRef Symbol) const {
-  return getRTDyld().getAnySymbolRemoteAddress(Symbol);
+  if (uint64_t InternalSymbolAddr = getRTDyld().getSymbolLoadAddress(Symbol))
+      return InternalSymbolAddr;
+  return getRTDyld().MemMgr->getSymbolAddress(Symbol);
 }
 
 uint64_t RuntimeDyldCheckerImpl::readMemoryAtAddr(uint64_t SrcAddr,
@@ -848,14 +846,16 @@ std::pair<uint64_t, std::string> RuntimeDyldCheckerImpl::getStubAddrFor(
 
 StringRef
 RuntimeDyldCheckerImpl::getSubsectionStartingAt(StringRef Name) const {
-  RuntimeDyldImpl::SymbolTableMap::const_iterator pos =
+  RTDyldSymbolTable::const_iterator pos =
       getRTDyld().GlobalSymbolTable.find(Name);
   if (pos == getRTDyld().GlobalSymbolTable.end())
     return StringRef();
-  RuntimeDyldImpl::SymbolLoc Loc = pos->second;
-  uint8_t *SectionAddr = getRTDyld().getSectionAddress(Loc.first);
-  return StringRef(reinterpret_cast<const char *>(SectionAddr) + Loc.second,
-                   getRTDyld().Sections[Loc.first].Size - Loc.second);
+  const auto &SymInfo = pos->second;
+  uint8_t *SectionAddr = getRTDyld().getSectionAddress(SymInfo.getSectionID());
+  return StringRef(reinterpret_cast<const char *>(SectionAddr) +
+                     SymInfo.getOffset(),
+                   getRTDyld().Sections[SymInfo.getSectionID()].Size -
+                     SymInfo.getOffset());
 }
 
 void RuntimeDyldCheckerImpl::registerSection(
@@ -885,9 +885,10 @@ void RuntimeDyldCheckerImpl::registerStubMap(
       // If this is a (Section, Offset) pair, do a reverse lookup in the
       // global symbol table to find the name.
       for (auto &GSTEntry : getRTDyld().GlobalSymbolTable) {
-        if (GSTEntry.second.first == StubMapEntry.first.SectionID &&
-            GSTEntry.second.second ==
-                static_cast<uint64_t>(StubMapEntry.first.Offset)) {
+        const auto &SymInfo = GSTEntry.second;
+        if (SymInfo.getSectionID() == StubMapEntry.first.SectionID &&
+            SymInfo.getOffset() ==
+              static_cast<uint64_t>(StubMapEntry.first.Offset)) {
           SymbolName = GSTEntry.first();
           break;
         }
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index d95cffe..0f3ca0f 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -12,27 +12,23 @@
 //===----------------------------------------------------------------------===//
 
 #include "RuntimeDyldELF.h"
-#include "JITRegistrar.h"
-#include "ObjectImageCommon.h"
 #include "llvm/ADT/IntervalMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/ExecutionEngine/ObjectBuffer.h"
-#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
 using namespace llvm::object;
 
 #define DEBUG_TYPE "dyld"
 
-namespace {
-
 static inline std::error_code check(std::error_code Err) {
   if (Err) {
     report_fatal_error(Err.message());
@@ -40,6 +36,8 @@ static inline std::error_code check(std::error_code Err) {
   return Err;
 }
 
+namespace {
+
 template <class ELFT> class DyldELFObject : public ELFObjectFile<ELFT> {
   LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
 
@@ -52,16 +50,12 @@ template <class ELFT> class DyldELFObject : public ELFObjectFile<ELFT> {
 
   typedef typename ELFDataTypeTypedefHelper<ELFT>::value_type addr_type;
 
-  std::unique_ptr<ObjectFile> UnderlyingFile;
-
 public:
-  DyldELFObject(std::unique_ptr<ObjectFile> UnderlyingFile,
-                MemoryBufferRef Wrapper, std::error_code &ec);
-
   DyldELFObject(MemoryBufferRef Wrapper, std::error_code &ec);
 
   void updateSectionAddress(const SectionRef &Sec, uint64_t Addr);
-  void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr);
+
+  void updateSymbolAddress(const SymbolRef &SymRef, uint64_t Addr);
 
   // Methods for type inquiry through isa, cast and dyn_cast
   static inline bool classof(const Binary *v) {
@@ -71,42 +65,10 @@ public:
   static inline bool classof(const ELFObjectFile<ELFT> *v) {
     return v->isDyldType();
   }
-};
-
-template <class ELFT> class ELFObjectImage : public ObjectImageCommon {
-  bool Registered;
 
-public:
-  ELFObjectImage(std::unique_ptr<ObjectBuffer> Input,
-                 std::unique_ptr<DyldELFObject<ELFT>> Obj)
-      : ObjectImageCommon(std::move(Input), std::move(Obj)), Registered(false) {
-  }
-
-  virtual ~ELFObjectImage() {
-    if (Registered)
-      deregisterWithDebugger();
-  }
-
-  // Subclasses can override these methods to update the image with loaded
-  // addresses for sections and common symbols
-  void updateSectionAddress(const SectionRef &Sec, uint64_t Addr) override {
-    static_cast<DyldELFObject<ELFT>*>(getObjectFile())
-        ->updateSectionAddress(Sec, Addr);
-  }
+};
 
-  void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr) override {
-    static_cast<DyldELFObject<ELFT>*>(getObjectFile())
-        ->updateSymbolAddress(Sym, Addr);
-  }
 
-  void registerWithDebugger() override {
-    JITRegistrar::getGDBRegistrar().registerObject(*Buffer);
-    Registered = true;
-  }
-  void deregisterWithDebugger() override {
-    JITRegistrar::getGDBRegistrar().deregisterObject(*Buffer);
-  }
-};
 
 // The MemoryBuffer passed into this constructor is just a wrapper around the
 // actual memory.  Ultimately, the Binary parent class will take ownership of
@@ -118,14 +80,6 @@ DyldELFObject<ELFT>::DyldELFObject(MemoryBufferRef Wrapper, std::error_code &EC)
 }
 
 template <class ELFT>
-DyldELFObject<ELFT>::DyldELFObject(std::unique_ptr<ObjectFile> UnderlyingFile,
-                                   MemoryBufferRef Wrapper, std::error_code &EC)
-    : ELFObjectFile<ELFT>(Wrapper, EC),
-      UnderlyingFile(std::move(UnderlyingFile)) {
-  this->isDyldELFObject = true;
-}
-
-template <class ELFT>
 void DyldELFObject<ELFT>::updateSectionAddress(const SectionRef &Sec,
                                                uint64_t Addr) {
   DataRefImpl ShdrRef = Sec.getRawDataRefImpl();
@@ -149,10 +103,89 @@ void DyldELFObject<ELFT>::updateSymbolAddress(const SymbolRef &SymRef,
   sym->st_value = static_cast<addr_type>(Addr);
 }
 
+class LoadedELFObjectInfo : public RuntimeDyld::LoadedObjectInfo {
+public:
+  LoadedELFObjectInfo(RuntimeDyldImpl &RTDyld, unsigned BeginIdx,
+                      unsigned EndIdx)
+    : RuntimeDyld::LoadedObjectInfo(RTDyld, BeginIdx, EndIdx) {}
+
+  OwningBinary<ObjectFile>
+  getObjectForDebug(const ObjectFile &Obj) const override;
+};
+
+template <typename ELFT>
+std::unique_ptr<DyldELFObject<ELFT>>
+createRTDyldELFObject(MemoryBufferRef Buffer,
+                      const LoadedELFObjectInfo &L,
+                      std::error_code &ec) {
+  typedef typename ELFFile<ELFT>::Elf_Shdr Elf_Shdr;
+  typedef typename ELFDataTypeTypedefHelper<ELFT>::value_type addr_type;
+
+  std::unique_ptr<DyldELFObject<ELFT>> Obj =
+    llvm::make_unique<DyldELFObject<ELFT>>(Buffer, ec);
+
+  // Iterate over all sections in the object.
+  for (const auto &Sec : Obj->sections()) {
+    StringRef SectionName;
+    Sec.getName(SectionName);
+    if (SectionName != "") {
+      DataRefImpl ShdrRef = Sec.getRawDataRefImpl();
+      Elf_Shdr *shdr = const_cast<Elf_Shdr *>(
+          reinterpret_cast<const Elf_Shdr *>(ShdrRef.p));
+
+      if (uint64_t SecLoadAddr = L.getSectionLoadAddress(SectionName)) {
+        // This assumes that the address passed in matches the target address
+        // bitness. The template-based type cast handles everything else.
+        shdr->sh_addr = static_cast<addr_type>(SecLoadAddr);
+      }
+    }
+  }
+
+  return Obj;
+}
+
+OwningBinary<ObjectFile> createELFDebugObject(const ObjectFile &Obj,
+                                              const LoadedELFObjectInfo &L) {
+  assert(Obj.isELF() && "Not an ELF object file.");
+
+  std::unique_ptr<MemoryBuffer> Buffer =
+    MemoryBuffer::getMemBufferCopy(Obj.getData(), Obj.getFileName());
+
+  std::error_code ec;
+
+  std::unique_ptr<ObjectFile> DebugObj;
+  if (Obj.getBytesInAddress() == 4 && Obj.isLittleEndian()) {
+    typedef ELFType<support::little, 2, false> ELF32LE;
+    DebugObj = createRTDyldELFObject<ELF32LE>(Buffer->getMemBufferRef(), L, ec);
+  } else if (Obj.getBytesInAddress() == 4 && !Obj.isLittleEndian()) {
+    typedef ELFType<support::big, 2, false> ELF32BE;
+    DebugObj = createRTDyldELFObject<ELF32BE>(Buffer->getMemBufferRef(), L, ec);
+  } else if (Obj.getBytesInAddress() == 8 && !Obj.isLittleEndian()) {
+    typedef ELFType<support::big, 2, true> ELF64BE;
+    DebugObj = createRTDyldELFObject<ELF64BE>(Buffer->getMemBufferRef(), L, ec);
+  } else if (Obj.getBytesInAddress() == 8 && Obj.isLittleEndian()) {
+    typedef ELFType<support::little, 2, true> ELF64LE;
+    DebugObj = createRTDyldELFObject<ELF64LE>(Buffer->getMemBufferRef(), L, ec);
+  } else
+    llvm_unreachable("Unexpected ELF format");
+
+  assert(!ec && "Could not construct copy ELF object file");
+
+  return OwningBinary<ObjectFile>(std::move(DebugObj), std::move(Buffer));
+}
+
+OwningBinary<ObjectFile>
+LoadedELFObjectInfo::getObjectForDebug(const ObjectFile &Obj) const {
+  return createELFDebugObject(Obj, *this);
+}
+
 } // namespace
 
 namespace llvm {
 
+RuntimeDyldELF::RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {}
+RuntimeDyldELF::~RuntimeDyldELF() {}
+
 void RuntimeDyldELF::registerEHFrames() {
   if (!MemMgr)
     return;
@@ -180,83 +213,14 @@ void RuntimeDyldELF::deregisterEHFrames() {
   RegisteredEHFrameSections.clear();
 }
 
-ObjectImage *
-RuntimeDyldELF::createObjectImageFromFile(std::unique_ptr<object::ObjectFile> ObjFile) {
-  if (!ObjFile)
-    return nullptr;
-
-  std::error_code ec;
-  MemoryBufferRef Buffer = ObjFile->getMemoryBufferRef();
-
-  if (ObjFile->getBytesInAddress() == 4 && ObjFile->isLittleEndian()) {
-    auto Obj =
-        llvm::make_unique<DyldELFObject<ELFType<support::little, 2, false>>>(
-            std::move(ObjFile), Buffer, ec);
-    return new ELFObjectImage<ELFType<support::little, 2, false>>(
-        nullptr, std::move(Obj));
-  } else if (ObjFile->getBytesInAddress() == 4 && !ObjFile->isLittleEndian()) {
-    auto Obj =
-        llvm::make_unique<DyldELFObject<ELFType<support::big, 2, false>>>(
-            std::move(ObjFile), Buffer, ec);
-    return new ELFObjectImage<ELFType<support::big, 2, false>>(nullptr, std::move(Obj));
-  } else if (ObjFile->getBytesInAddress() == 8 && !ObjFile->isLittleEndian()) {
-    auto Obj = llvm::make_unique<DyldELFObject<ELFType<support::big, 2, true>>>(
-        std::move(ObjFile), Buffer, ec);
-    return new ELFObjectImage<ELFType<support::big, 2, true>>(nullptr,
-                                                              std::move(Obj));
-  } else if (ObjFile->getBytesInAddress() == 8 && ObjFile->isLittleEndian()) {
-    auto Obj =
-        llvm::make_unique<DyldELFObject<ELFType<support::little, 2, true>>>(
-            std::move(ObjFile), Buffer, ec);
-    return new ELFObjectImage<ELFType<support::little, 2, true>>(
-        nullptr, std::move(Obj));
-  } else
-    llvm_unreachable("Unexpected ELF format");
+std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
+RuntimeDyldELF::loadObject(const object::ObjectFile &O) {
+  unsigned SectionStartIdx, SectionEndIdx;
+  std::tie(SectionStartIdx, SectionEndIdx) = loadObjectImpl(O);
+  return llvm::make_unique<LoadedELFObjectInfo>(*this, SectionStartIdx,
+                                                SectionEndIdx);
 }
 
-std::unique_ptr<ObjectImage>
-RuntimeDyldELF::createObjectImage(std::unique_ptr<ObjectBuffer> Buffer) {
-  if (Buffer->getBufferSize() < ELF::EI_NIDENT)
-    llvm_unreachable("Unexpected ELF object size");
-  std::pair<unsigned char, unsigned char> Ident =
-      std::make_pair((uint8_t)Buffer->getBufferStart()[ELF::EI_CLASS],
-                     (uint8_t)Buffer->getBufferStart()[ELF::EI_DATA]);
-  std::error_code ec;
-
-  MemoryBufferRef Buf = Buffer->getMemBuffer();
-
-  if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) {
-    auto Obj =
-        llvm::make_unique<DyldELFObject<ELFType<support::little, 4, false>>>(
-            Buf, ec);
-    return llvm::make_unique<
-        ELFObjectImage<ELFType<support::little, 4, false>>>(std::move(Buffer),
-                                                            std::move(Obj));
-  }
-  if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB) {
-    auto Obj =
-        llvm::make_unique<DyldELFObject<ELFType<support::big, 4, false>>>(Buf,
-                                                                          ec);
-    return llvm::make_unique<ELFObjectImage<ELFType<support::big, 4, false>>>(
-        std::move(Buffer), std::move(Obj));
-  }
-  if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB) {
-    auto Obj = llvm::make_unique<DyldELFObject<ELFType<support::big, 8, true>>>(
-        Buf, ec);
-    return llvm::make_unique<ELFObjectImage<ELFType<support::big, 8, true>>>(
-        std::move(Buffer), std::move(Obj));
-  }
-  assert(Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB &&
-         "Unexpected ELF format");
-  auto Obj =
-      llvm::make_unique<DyldELFObject<ELFType<support::little, 8, true>>>(Buf,
-                                                                          ec);
-  return llvm::make_unique<ELFObjectImage<ELFType<support::little, 8, true>>>(
-      std::move(Buffer), std::move(Obj));
-}
-
-RuntimeDyldELF::~RuntimeDyldELF() {}
-
 void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
                                              uint64_t Offset, uint64_t Value,
                                              uint32_t Type, int64_t Addend,
@@ -615,7 +579,7 @@ void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section,
 }
 
 // Return the .TOC. section and offset.
-void RuntimeDyldELF::findPPC64TOCSection(ObjectImage &Obj,
+void RuntimeDyldELF::findPPC64TOCSection(const ObjectFile &Obj,
                                          ObjSectionToIDMap &LocalSections,
                                          RelocationValueRef &Rel) {
   // Set a default SectionID in case we do not find a TOC section below.
@@ -628,7 +592,7 @@ void RuntimeDyldELF::findPPC64TOCSection(ObjectImage &Obj,
 
   // The TOC consists of sections .got, .toc, .tocbss, .plt in that
   // order. The TOC starts where the first of these sections starts.
-  for (section_iterator si = Obj.begin_sections(), se = Obj.end_sections();
+  for (section_iterator si = Obj.section_begin(), se = Obj.section_end();
        si != se; ++si) {
 
     StringRef SectionName;
@@ -650,15 +614,15 @@ void RuntimeDyldELF::findPPC64TOCSection(ObjectImage &Obj,
 
 // Returns the sections and offset associated with the ODP entry referenced
 // by Symbol.
-void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj,
+void RuntimeDyldELF::findOPDEntrySection(const ObjectFile &Obj,
                                          ObjSectionToIDMap &LocalSections,
                                          RelocationValueRef &Rel) {
   // Get the ELF symbol value (st_value) to compare with Relocation offset in
   // .opd entries
-  for (section_iterator si = Obj.begin_sections(), se = Obj.end_sections();
+  for (section_iterator si = Obj.section_begin(), se = Obj.section_end();
        si != se; ++si) {
     section_iterator RelSecI = si->getRelocatedSection();
-    if (RelSecI == Obj.end_sections())
+    if (RelSecI == Obj.section_end())
       continue;
 
     StringRef RelSectionName;
@@ -700,7 +664,7 @@ void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj,
       if (Rel.Addend != (int64_t)TargetSymbolOffset)
         continue;
 
-      section_iterator tsi(Obj.end_sections());
+      section_iterator tsi(Obj.section_end());
       check(TargetSymbol->getSection(tsi));
       bool IsCode = tsi->isText();
       Rel.SectionID = findOrEmitSection(Obj, (*tsi), IsCode, LocalSections);
@@ -935,8 +899,9 @@ void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section,
 }
 
 relocation_iterator RuntimeDyldELF::processRelocationRef(
-    unsigned SectionID, relocation_iterator RelI, ObjectImage &Obj,
-    ObjSectionToIDMap &ObjSectionToID, const SymbolTableMap &Symbols,
+    unsigned SectionID, relocation_iterator RelI,
+    const ObjectFile &Obj,
+    ObjSectionToIDMap &ObjSectionToID,
     StubMap &Stubs) {
   uint64_t RelType;
   Check(RelI->getType(RelType));
@@ -946,66 +911,60 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
 
   // Obtain the symbol name which is referenced in the relocation
   StringRef TargetName;
-  if (Symbol != Obj.end_symbols())
+  if (Symbol != Obj.symbol_end())
     Symbol->getName(TargetName);
   DEBUG(dbgs() << "\t\tRelType: " << RelType << " Addend: " << Addend
                << " TargetName: " << TargetName << "\n");
   RelocationValueRef Value;
   // First search for the symbol in the local symbol table
-  SymbolTableMap::const_iterator lsi = Symbols.end();
   SymbolRef::Type SymType = SymbolRef::ST_Unknown;
-  if (Symbol != Obj.end_symbols()) {
-    lsi = Symbols.find(TargetName.data());
+
+  // Search for the symbol in the global symbol table
+  RTDyldSymbolTable::const_iterator gsi = GlobalSymbolTable.end();
+  if (Symbol != Obj.symbol_end()) {
+    gsi = GlobalSymbolTable.find(TargetName.data());
     Symbol->getType(SymType);
   }
-  if (lsi != Symbols.end()) {
-    Value.SectionID = lsi->second.first;
-    Value.Offset = lsi->second.second;
-    Value.Addend = lsi->second.second + Addend;
+  if (gsi != GlobalSymbolTable.end()) {
+    const auto &SymInfo = gsi->second;
+    Value.SectionID = SymInfo.getSectionID();
+    Value.Offset = SymInfo.getOffset();
+    Value.Addend = SymInfo.getOffset() + Addend;
   } else {
-    // Search for the symbol in the global symbol table
-    SymbolTableMap::const_iterator gsi = GlobalSymbolTable.end();
-    if (Symbol != Obj.end_symbols())
-      gsi = GlobalSymbolTable.find(TargetName.data());
-    if (gsi != GlobalSymbolTable.end()) {
-      Value.SectionID = gsi->second.first;
-      Value.Offset = gsi->second.second;
-      Value.Addend = gsi->second.second + Addend;
-    } else {
-      switch (SymType) {
-      case SymbolRef::ST_Debug: {
-        // TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION, it's not obviously
-        // and can be changed by another developers. Maybe best way is add
-        // a new symbol type ST_Section to SymbolRef and use it.
-        section_iterator si(Obj.end_sections());
-        Symbol->getSection(si);
-        if (si == Obj.end_sections())
-          llvm_unreachable("Symbol section not found, bad object file format!");
-        DEBUG(dbgs() << "\t\tThis is section symbol\n");
-        bool isCode = si->isText();
-        Value.SectionID = findOrEmitSection(Obj, (*si), isCode, ObjSectionToID);
-        Value.Addend = Addend;
-        break;
-      }
-      case SymbolRef::ST_Data:
-      case SymbolRef::ST_Unknown: {
-        Value.SymbolName = TargetName.data();
-        Value.Addend = Addend;
-
-        // Absolute relocations will have a zero symbol ID (STN_UNDEF), which
-        // will manifest here as a NULL symbol name.
-        // We can set this as a valid (but empty) symbol name, and rely
-        // on addRelocationForSymbol to handle this.
-        if (!Value.SymbolName)
-          Value.SymbolName = "";
-        break;
-      }
-      default:
-        llvm_unreachable("Unresolved symbol type!");
-        break;
-      }
+    switch (SymType) {
+    case SymbolRef::ST_Debug: {
+      // TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION, it's not obviously
+      // and can be changed by another developers. Maybe best way is add
+      // a new symbol type ST_Section to SymbolRef and use it.
+      section_iterator si(Obj.section_end());
+      Symbol->getSection(si);
+      if (si == Obj.section_end())
+        llvm_unreachable("Symbol section not found, bad object file format!");
+      DEBUG(dbgs() << "\t\tThis is section symbol\n");
+      bool isCode = si->isText();
+      Value.SectionID = findOrEmitSection(Obj, (*si), isCode, ObjSectionToID);
+      Value.Addend = Addend;
+      break;
+    }
+    case SymbolRef::ST_Data:
+    case SymbolRef::ST_Unknown: {
+      Value.SymbolName = TargetName.data();
+      Value.Addend = Addend;
+
+      // Absolute relocations will have a zero symbol ID (STN_UNDEF), which
+      // will manifest here as a NULL symbol name.
+      // We can set this as a valid (but empty) symbol name, and rely
+      // on addRelocationForSymbol to handle this.
+      if (!Value.SymbolName)
+        Value.SymbolName = "";
+      break;
+    }
+    default:
+      llvm_unreachable("Unresolved symbol type!");
+      break;
     }
   }
+
   uint64_t Offset;
   Check(RelI->getOffset(Offset));
 
@@ -1135,7 +1094,7 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
     if (RelType == ELF::R_PPC64_REL24) {
       // Determine ABI variant in use for this object.
       unsigned AbiVariant;
-      Obj.getObjectFile()->getPlatformFlags(AbiVariant);
+      Obj.getPlatformFlags(AbiVariant);
       AbiVariant &= ELF::EF_PPC64_ABI;
       // A PPC branch relocation will need a stub function if the target is
       // an external symbol (Symbol::ST_Unknown) or if the target address
@@ -1495,7 +1454,7 @@ uint64_t RuntimeDyldELF::findGOTEntry(uint64_t LoadAddress, uint64_t Offset) {
   return 0;
 }
 
-void RuntimeDyldELF::finalizeLoad(ObjectImage &ObjImg,
+void RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj,
                                   ObjSectionToIDMap &SectionMap) {
   // If necessary, allocate the global offset table
   if (MemMgr) {
@@ -1533,15 +1492,8 @@ void RuntimeDyldELF::finalizeLoad(ObjectImage &ObjImg,
   }
 }
 
-bool RuntimeDyldELF::isCompatibleFormat(const ObjectBuffer *Buffer) const {
-  if (Buffer->getBufferSize() < strlen(ELF::ElfMagic))
-    return false;
-  return (memcmp(Buffer->getBufferStart(), ELF::ElfMagic,
-                 strlen(ELF::ElfMagic))) == 0;
-}
-
-bool RuntimeDyldELF::isCompatibleFile(const object::ObjectFile *Obj) const {
-  return Obj->isELF();
+bool RuntimeDyldELF::isCompatibleFile(const object::ObjectFile &Obj) const {
+  return Obj.isELF();
 }
 
 } // namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 4aeab81..b4414b0 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -28,9 +28,11 @@ std::error_code Check(std::error_code Err) {
   }
   return Err;
 }
+
 } // end anonymous namespace
 
 class RuntimeDyldELF : public RuntimeDyldImpl {
+
   void resolveRelocation(const SectionEntry &Section, uint64_t Offset,
                          uint64_t Value, uint32_t Type, int64_t Addend,
                          uint64_t SymOffset = 0);
@@ -81,9 +83,11 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
       return 1;
   }
 
-  void findPPC64TOCSection(ObjectImage &Obj, ObjSectionToIDMap &LocalSections,
+  void findPPC64TOCSection(const ObjectFile &Obj,
+                           ObjSectionToIDMap &LocalSections,
                            RelocationValueRef &Rel);
-  void findOPDEntrySection(ObjectImage &Obj, ObjSectionToIDMap &LocalSections,
+  void findOPDEntrySection(const ObjectFile &Obj,
+                           ObjSectionToIDMap &LocalSections,
                            RelocationValueRef &Rel);
 
   uint64_t findGOTEntry(uint64_t LoadAddr, uint64_t Offset);
@@ -104,24 +108,23 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
   SmallVector<SID, 2> RegisteredEHFrameSections;
 
 public:
-  RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {}
+  RuntimeDyldELF(RTDyldMemoryManager *mm);
+  virtual ~RuntimeDyldELF();
+
+  std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
+  loadObject(const object::ObjectFile &O) override;
 
   void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override;
   relocation_iterator
   processRelocationRef(unsigned SectionID, relocation_iterator RelI,
-                       ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID,
-                       const SymbolTableMap &Symbols, StubMap &Stubs) override;
-  bool isCompatibleFormat(const ObjectBuffer *Buffer) const override;
-  bool isCompatibleFile(const object::ObjectFile *Buffer) const override;
+                       const ObjectFile &Obj,
+                       ObjSectionToIDMap &ObjSectionToID,
+                       StubMap &Stubs) override;
+  bool isCompatibleFile(const object::ObjectFile &Obj) const override;
   void registerEHFrames() override;
   void deregisterEHFrames() override;
-  void finalizeLoad(ObjectImage &ObjImg,
+  void finalizeLoad(const ObjectFile &Obj,
                     ObjSectionToIDMap &SectionMap) override;
-  virtual ~RuntimeDyldELF();
-
-  static std::unique_ptr<ObjectImage>
-  createObjectImage(std::unique_ptr<ObjectBuffer> InputBuffer);
-  static ObjectImage *createObjectImageFromFile(std::unique_ptr<object::ObjectFile> Obj);
 };
 
 } // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 69ea3b4..f37a9a7 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -18,7 +18,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/ExecutionEngine/ObjectImage.h"
 #include "llvm/ExecutionEngine/RuntimeDyld.h"
 #include "llvm/ExecutionEngine/RuntimeDyldChecker.h"
 #include "llvm/Object/ObjectFile.h"
@@ -37,7 +36,6 @@ using namespace llvm::object;
 
 namespace llvm {
 
-class ObjectBuffer;
 class Twine;
 
 /// SectionEntry - represents a section emitted into memory by the dynamic
@@ -158,16 +156,31 @@ public:
   }
 };
 
-class RuntimeDyldImpl {
-  friend class RuntimeDyldCheckerImpl;
+/// @brief Symbol info for RuntimeDyld.
+class SymbolInfo {
+public:
+  typedef enum { Hidden = 0, Default = 1 } Visibility;
+
+  SymbolInfo() : Offset(0), SectionID(0), Vis(Hidden) {}
+
+  SymbolInfo(unsigned SectionID, uint64_t Offset, Visibility Vis)
+    : Offset(Offset), SectionID(SectionID), Vis(Vis) {}
+
+  unsigned getSectionID() const { return SectionID; }
+  uint64_t getOffset() const { return Offset; }
+  Visibility getVisibility() const { return Vis; }
+
 private:
+  uint64_t Offset;
+  unsigned SectionID : 31;
+  Visibility Vis : 1;
+};
 
-  uint64_t getAnySymbolRemoteAddress(StringRef Symbol) {
-    if (uint64_t InternalSymbolAddr = getSymbolLoadAddress(Symbol))
-      return InternalSymbolAddr;
-    return MemMgr->getSymbolAddress(Symbol);
-  }
+typedef StringMap<SymbolInfo> RTDyldSymbolTable;
 
+class RuntimeDyldImpl {
+  friend class RuntimeDyld::LoadedObjectInfo;
+  friend class RuntimeDyldCheckerImpl;
 protected:
   // The MemoryManager to load objects into.
   RTDyldMemoryManager *MemMgr;
@@ -187,16 +200,11 @@ protected:
   // references it.
   typedef std::map<SectionRef, unsigned> ObjSectionToIDMap;
 
-  // A global symbol table for symbols from all loaded modules.  Maps the
-  // symbol name to a (SectionID, offset in section) pair.
-  typedef std::pair<unsigned, uintptr_t> SymbolLoc;
-  typedef StringMap<SymbolLoc> SymbolTableMap;
-  SymbolTableMap GlobalSymbolTable;
+  // A global symbol table for symbols from all loaded modules.
+  RTDyldSymbolTable GlobalSymbolTable;
 
-  // Pair representing the size and alignment requirement for a common symbol.
-  typedef std::pair<unsigned, unsigned> CommonSymbolInfo;
   // Keep a map of common symbols to their info pairs
-  typedef std::map<SymbolRef, CommonSymbolInfo> CommonSymbolMap;
+  typedef std::vector<SymbolRef> CommonSymbolList;
 
   // For each symbol, keep a list of relocations based on it. Anytime
   // its address is reassigned (the JIT re-compiled the function, e.g.),
@@ -296,14 +304,13 @@ protected:
   /// \brief Given the common symbols discovered in the object file, emit a
   /// new section for them and update the symbol mappings in the object and
   /// symbol table.
-  void emitCommonSymbols(ObjectImage &Obj, const CommonSymbolMap &CommonSymbols,
-                         uint64_t TotalSize, SymbolTableMap &SymbolTable);
+  void emitCommonSymbols(const ObjectFile &Obj, CommonSymbolList &CommonSymbols);
 
   /// \brief Emits section data from the object file to the MemoryManager.
   /// \param IsCode if it's true then allocateCodeSection() will be
   ///        used for emits, else allocateDataSection() will be used.
   /// \return SectionID.
-  unsigned emitSection(ObjectImage &Obj, const SectionRef &Section,
+  unsigned emitSection(const ObjectFile &Obj, const SectionRef &Section,
                        bool IsCode);
 
   /// \brief Find Section in LocalSections. If the secton is not found - emit
@@ -311,7 +318,7 @@ protected:
   /// \param IsCode if it's true then allocateCodeSection() will be
   ///        used for emmits, else allocateDataSection() will be used.
   /// \return SectionID.
-  unsigned findOrEmitSection(ObjectImage &Obj, const SectionRef &Section,
+  unsigned findOrEmitSection(const ObjectFile &Obj, const SectionRef &Section,
                              bool IsCode, ObjSectionToIDMap &LocalSections);
 
   // \brief Add a relocation entry that uses the given section.
@@ -339,8 +346,8 @@ protected:
   /// \return Iterator to the next relocation that needs to be parsed.
   virtual relocation_iterator
   processRelocationRef(unsigned SectionID, relocation_iterator RelI,
-                       ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID,
-                       const SymbolTableMap &Symbols, StubMap &Stubs) = 0;
+                       const ObjectFile &Obj, ObjSectionToIDMap &ObjSectionToID,
+                       StubMap &Stubs) = 0;
 
   /// \brief Resolve relocations to external symbols.
   void resolveExternalSymbols();
@@ -351,13 +358,16 @@ protected:
 
   // \brief Compute an upper bound of the memory that is required to load all
   // sections
-  void computeTotalAllocSize(ObjectImage &Obj, uint64_t &CodeSize,
+  void computeTotalAllocSize(const ObjectFile &Obj, uint64_t &CodeSize,
                              uint64_t &DataSizeRO, uint64_t &DataSizeRW);
 
   // \brief Compute the stub buffer size required for a section
-  unsigned computeSectionStubBufSize(ObjectImage &Obj,
+  unsigned computeSectionStubBufSize(const ObjectFile &Obj,
                                      const SectionRef &Section);
 
+  // \brief Implementation of the generic part of the loadObject algorithm.
+  std::pair<unsigned, unsigned> loadObjectImpl(const object::ObjectFile &Obj);
+
 public:
   RuntimeDyldImpl(RTDyldMemoryManager *mm)
     : MemMgr(mm), Checker(nullptr), ProcessAllSections(false), HasError(false) {
@@ -373,27 +383,37 @@ public:
     this->Checker = Checker;
   }
 
-  std::unique_ptr<ObjectImage>
-  loadObject(std::unique_ptr<ObjectImage> InputObject);
+  virtual std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
+  loadObject(const object::ObjectFile &Obj) = 0;
 
   uint8_t* getSymbolAddress(StringRef Name) const {
     // FIXME: Just look up as a function for now. Overly simple of course.
     // Work in progress.
-    SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name);
+    RTDyldSymbolTable::const_iterator pos = GlobalSymbolTable.find(Name);
     if (pos == GlobalSymbolTable.end())
       return nullptr;
-    SymbolLoc Loc = pos->second;
-    return getSectionAddress(Loc.first) + Loc.second;
+    const auto &SymInfo = pos->second;
+    return getSectionAddress(SymInfo.getSectionID()) + SymInfo.getOffset();
   }
 
   uint64_t getSymbolLoadAddress(StringRef Name) const {
     // FIXME: Just look up as a function for now. Overly simple of course.
     // Work in progress.
-    SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name);
+    RTDyldSymbolTable::const_iterator pos = GlobalSymbolTable.find(Name);
     if (pos == GlobalSymbolTable.end())
       return 0;
-    SymbolLoc Loc = pos->second;
-    return getSectionLoadAddress(Loc.first) + Loc.second;
+    const auto &SymInfo = pos->second;
+    return getSectionLoadAddress(SymInfo.getSectionID()) + SymInfo.getOffset();
+  }
+
+  uint64_t getExportedSymbolLoadAddress(StringRef Name) const {
+    RTDyldSymbolTable::const_iterator pos = GlobalSymbolTable.find(Name);
+    if (pos == GlobalSymbolTable.end())
+      return 0;
+    const auto &SymInfo = pos->second;
+    if (SymInfo.getVisibility() == SymbolInfo::Hidden)
+      return 0;
+    return getSectionLoadAddress(SymInfo.getSectionID()) + SymInfo.getOffset();
   }
 
   void resolveRelocations();
@@ -411,14 +431,14 @@ public:
   // Get the error message.
   StringRef getErrorString() { return ErrorStr; }
 
-  virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const = 0;
-  virtual bool isCompatibleFile(const ObjectFile *Obj) const = 0;
+  virtual bool isCompatibleFile(const ObjectFile &Obj) const = 0;
 
   virtual void registerEHFrames();
 
   virtual void deregisterEHFrames();
 
-  virtual void finalizeLoad(ObjectImage &ObjImg, ObjSectionToIDMap &SectionMap) {}
+  virtual void finalizeLoad(const ObjectFile &ObjImg,
+                            ObjSectionToIDMap &SectionMap) {}
 };
 
 } // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index d3d6f5d..2d39662 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -12,19 +12,34 @@
 //===----------------------------------------------------------------------===//
 
 #include "RuntimeDyldMachO.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-
-#include "Targets/RuntimeDyldMachOARM.h"
 #include "Targets/RuntimeDyldMachOAArch64.h"
+#include "Targets/RuntimeDyldMachOARM.h"
 #include "Targets/RuntimeDyldMachOI386.h"
 #include "Targets/RuntimeDyldMachOX86_64.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 
 using namespace llvm;
 using namespace llvm::object;
 
 #define DEBUG_TYPE "dyld"
 
+namespace {
+
+class LoadedMachOObjectInfo : public RuntimeDyld::LoadedObjectInfo {
+public:
+  LoadedMachOObjectInfo(RuntimeDyldImpl &RTDyld, unsigned BeginIdx,
+                        unsigned EndIdx)
+    : RuntimeDyld::LoadedObjectInfo(RTDyld, BeginIdx, EndIdx) {}
+
+  OwningBinary<ObjectFile>
+  getObjectForDebug(const ObjectFile &Obj) const override {
+    return OwningBinary<ObjectFile>();
+  }
+};
+
+}
+
 namespace llvm {
 
 int64_t RuntimeDyldMachO::memcpyAddend(const RelocationEntry &RE) const {
@@ -35,12 +50,11 @@ int64_t RuntimeDyldMachO::memcpyAddend(const RelocationEntry &RE) const {
 }
 
 RelocationValueRef RuntimeDyldMachO::getRelocationValueRef(
-    ObjectImage &ObjImg, const relocation_iterator &RI,
-    const RelocationEntry &RE, ObjSectionToIDMap &ObjSectionToID,
-    const SymbolTableMap &Symbols) {
+    const ObjectFile &BaseTObj, const relocation_iterator &RI,
+    const RelocationEntry &RE, ObjSectionToIDMap &ObjSectionToID) {
 
   const MachOObjectFile &Obj =
-      static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+      static_cast<const MachOObjectFile &>(BaseTObj);
   MachO::any_relocation_info RelInfo =
       Obj.getRelocation(RI->getRawDataRefImpl());
   RelocationValueRef Value;
@@ -50,24 +64,20 @@ RelocationValueRef RuntimeDyldMachO::getRelocationValueRef(
     symbol_iterator Symbol = RI->getSymbol();
     StringRef TargetName;
     Symbol->getName(TargetName);
-    SymbolTableMap::const_iterator SI = Symbols.find(TargetName.data());
-    if (SI != Symbols.end()) {
-      Value.SectionID = SI->second.first;
-      Value.Offset = SI->second.second + RE.Addend;
+    RTDyldSymbolTable::const_iterator SI =
+      GlobalSymbolTable.find(TargetName.data());
+    if (SI != GlobalSymbolTable.end()) {
+      const auto &SymInfo = SI->second;
+      Value.SectionID = SymInfo.getSectionID();
+      Value.Offset = SymInfo.getOffset() + RE.Addend;
     } else {
-      SI = GlobalSymbolTable.find(TargetName.data());
-      if (SI != GlobalSymbolTable.end()) {
-        Value.SectionID = SI->second.first;
-        Value.Offset = SI->second.second + RE.Addend;
-      } else {
-        Value.SymbolName = TargetName.data();
-        Value.Offset = RE.Addend;
-      }
+      Value.SymbolName = TargetName.data();
+      Value.Offset = RE.Addend;
     }
   } else {
     SectionRef Sec = Obj.getRelocationSection(RelInfo);
     bool IsCode = Sec.isText();
-    Value.SectionID = findOrEmitSection(ObjImg, Sec, IsCode, ObjSectionToID);
+    Value.SectionID = findOrEmitSection(Obj, Sec, IsCode, ObjSectionToID);
     uint64_t Addr = Sec.getAddress();
     Value.Offset = RE.Addend - Addr;
   }
@@ -76,11 +86,11 @@ RelocationValueRef RuntimeDyldMachO::getRelocationValueRef(
 }
 
 void RuntimeDyldMachO::makeValueAddendPCRel(RelocationValueRef &Value,
-                                            ObjectImage &ObjImg,
+                                            const ObjectFile &BaseTObj,
                                             const relocation_iterator &RI,
                                             unsigned OffsetToNextPC) {
   const MachOObjectFile &Obj =
-      static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+      static_cast<const MachOObjectFile &>(BaseTObj);
   MachO::any_relocation_info RelInfo =
       Obj.getRelocation(RI->getRawDataRefImpl());
 
@@ -125,7 +135,7 @@ RuntimeDyldMachO::getSectionByAddress(const MachOObjectFile &Obj,
 
 // Populate __pointers section.
 void RuntimeDyldMachO::populateIndirectSymbolPointersSection(
-                                                    MachOObjectFile &Obj,
+                                                    const MachOObjectFile &Obj,
                                                     const SectionRef &PTSection,
                                                     unsigned PTSectionID) {
   assert(!Obj.is64Bit() &&
@@ -163,28 +173,12 @@ void RuntimeDyldMachO::populateIndirectSymbolPointersSection(
   }
 }
 
-bool
-RuntimeDyldMachO::isCompatibleFormat(const ObjectBuffer *InputBuffer) const {
-  if (InputBuffer->getBufferSize() < 4)
-    return false;
-  StringRef Magic(InputBuffer->getBufferStart(), 4);
-  if (Magic == "\xFE\xED\xFA\xCE")
-    return true;
-  if (Magic == "\xCE\xFA\xED\xFE")
-    return true;
-  if (Magic == "\xFE\xED\xFA\xCF")
-    return true;
-  if (Magic == "\xCF\xFA\xED\xFE")
-    return true;
-  return false;
-}
-
-bool RuntimeDyldMachO::isCompatibleFile(const object::ObjectFile *Obj) const {
-  return Obj->isMachO();
+bool RuntimeDyldMachO::isCompatibleFile(const object::ObjectFile &Obj) const {
+  return Obj.isMachO();
 }
 
 template <typename Impl>
-void RuntimeDyldMachOCRTPBase<Impl>::finalizeLoad(ObjectImage &ObjImg,
+void RuntimeDyldMachOCRTPBase<Impl>::finalizeLoad(const ObjectFile &ObjImg,
                                                   ObjSectionToIDMap &SectionMap) {
   unsigned EHFrameSID = RTDYLD_INVALID_SECTION_ID;
   unsigned TextSID = RTDYLD_INVALID_SECTION_ID;
@@ -284,7 +278,7 @@ void RuntimeDyldMachOCRTPBase<Impl>::registerEHFrames() {
 }
 
 std::unique_ptr<RuntimeDyldMachO>
-llvm::RuntimeDyldMachO::create(Triple::ArchType Arch, RTDyldMemoryManager *MM) {
+RuntimeDyldMachO::create(Triple::ArchType Arch, RTDyldMemoryManager *MM) {
   switch (Arch) {
   default:
     llvm_unreachable("Unsupported target for RuntimeDyldMachO.");
@@ -296,4 +290,12 @@ llvm::RuntimeDyldMachO::create(Triple::ArchType Arch, RTDyldMemoryManager *MM) {
   }
 }
 
+std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
+RuntimeDyldMachO::loadObject(const object::ObjectFile &O) {
+  unsigned SectionStartIdx, SectionEndIdx;
+  std::tie(SectionStartIdx, SectionEndIdx) = loadObjectImpl(O);
+  return llvm::make_unique<LoadedMachOObjectInfo>(*this, SectionStartIdx,
+                                                  SectionEndIdx);
+}
+
 } // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
index 7583474..f8bfc03 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
@@ -14,7 +14,6 @@
 #ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDMACHO_H
 #define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDMACHO_H
 
-#include "ObjectImageCommon.h"
 #include "RuntimeDyldImpl.h"
 #include "llvm/Object/MachO.h"
 #include "llvm/Support/Format.h"
@@ -61,10 +60,11 @@ protected:
   /// filled in, since immediate encodings are highly target/opcode specific.
   /// For targets/opcodes with simple, contiguous immediates (e.g. X86) the
   /// memcpyAddend method can be used to read the immediate.
-  RelocationEntry getRelocationEntry(unsigned SectionID, ObjectImage &ObjImg,
+  RelocationEntry getRelocationEntry(unsigned SectionID,
+                                     const ObjectFile &BaseTObj,
                                      const relocation_iterator &RI) const {
     const MachOObjectFile &Obj =
-      static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+      static_cast<const MachOObjectFile &>(BaseTObj);
     MachO::any_relocation_info RelInfo =
       Obj.getRelocation(RI->getRawDataRefImpl());
 
@@ -87,14 +87,14 @@ protected:
   /// In both cases the Addend field is *NOT* fixed up to be PC-relative. That
   /// should be done by the caller where appropriate by calling makePCRel on
   /// the RelocationValueRef.
-  RelocationValueRef getRelocationValueRef(ObjectImage &ObjImg,
+  RelocationValueRef getRelocationValueRef(const ObjectFile &BaseTObj,
                                            const relocation_iterator &RI,
                                            const RelocationEntry &RE,
-                                           ObjSectionToIDMap &ObjSectionToID,
-                                           const SymbolTableMap &Symbols);
+                                           ObjSectionToIDMap &ObjSectionToID);
 
   /// Make the RelocationValueRef addend PC-relative.
-  void makeValueAddendPCRel(RelocationValueRef &Value, ObjectImage &ObjImg,
+  void makeValueAddendPCRel(RelocationValueRef &Value,
+                            const ObjectFile &BaseTObj,
                             const relocation_iterator &RI,
                             unsigned OffsetToNextPC);
 
@@ -107,31 +107,22 @@ protected:
 
 
   // Populate __pointers section.
-  void populateIndirectSymbolPointersSection(MachOObjectFile &Obj,
+  void populateIndirectSymbolPointersSection(const MachOObjectFile &Obj,
                                              const SectionRef &PTSection,
                                              unsigned PTSectionID);
 
 public:
-  /// Create an ObjectImage from the given ObjectBuffer.
-  static std::unique_ptr<ObjectImage>
-  createObjectImage(std::unique_ptr<ObjectBuffer> InputBuffer) {
-    return llvm::make_unique<ObjectImageCommon>(std::move(InputBuffer));
-  }
-
-  /// Create an ObjectImage from the given ObjectFile.
-  static ObjectImage *
-  createObjectImageFromFile(std::unique_ptr<object::ObjectFile> InputObject) {
-    return new ObjectImageCommon(std::move(InputObject));
-  }
 
   /// Create a RuntimeDyldMachO instance for the given target architecture.
   static std::unique_ptr<RuntimeDyldMachO> create(Triple::ArchType Arch,
                                                   RTDyldMemoryManager *mm);
 
+  std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
+  loadObject(const object::ObjectFile &O) override;
+
   SectionEntry &getSection(unsigned SectionID) { return Sections[SectionID]; }
 
-  bool isCompatibleFormat(const ObjectBuffer *Buffer) const override;
-  bool isCompatibleFile(const object::ObjectFile *Obj) const override;
+  bool isCompatibleFile(const object::ObjectFile &Obj) const override;
 };
 
 /// RuntimeDyldMachOTarget - Templated base class for generic MachO linker
@@ -153,7 +144,7 @@ private:
 public:
   RuntimeDyldMachOCRTPBase(RTDyldMemoryManager *mm) : RuntimeDyldMachO(mm) {}
 
-  void finalizeLoad(ObjectImage &ObjImg,
+  void finalizeLoad(const ObjectFile &Obj,
                     ObjSectionToIDMap &SectionMap) override;
   void registerEHFrames() override;
 };
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
index f5cf9ac..196fa62 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
@@ -183,8 +183,8 @@ public:
       assert(isInt<33>(Addend) && "Invalid page reloc value.");
 
       // Encode the addend into the instruction.
-      uint32_t ImmLoValue = (uint32_t)(Addend << 17) & 0x60000000;
-      uint32_t ImmHiValue = (uint32_t)(Addend >> 9) & 0x00FFFFE0;
+      uint32_t ImmLoValue = ((uint64_t)Addend << 17) & 0x60000000;
+      uint32_t ImmHiValue = ((uint64_t)Addend >> 9) & 0x00FFFFE0;
       *p = (*p & 0x9F00001F) | ImmHiValue | ImmLoValue;
       break;
     }
@@ -243,10 +243,11 @@ public:
 
   relocation_iterator
   processRelocationRef(unsigned SectionID, relocation_iterator RelI,
-                       ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID,
-                       const SymbolTableMap &Symbols, StubMap &Stubs) override {
+                       const ObjectFile &BaseObjT,
+                       ObjSectionToIDMap &ObjSectionToID,
+                       StubMap &Stubs) override {
     const MachOObjectFile &Obj =
-        static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+      static_cast<const MachOObjectFile &>(BaseObjT);
     MachO::any_relocation_info RelInfo =
         Obj.getRelocation(RelI->getRawDataRefImpl());
 
@@ -268,10 +269,10 @@ public:
       RelInfo = Obj.getRelocation(RelI->getRawDataRefImpl());
     }
 
-    RelocationEntry RE(getRelocationEntry(SectionID, ObjImg, RelI));
+    RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI));
     RE.Addend = decodeAddend(RE);
     RelocationValueRef Value(
-        getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols));
+        getRelocationValueRef(Obj, RelI, RE, ObjSectionToID));
 
     assert((ExplicitAddend == 0 || RE.Addend == 0) && "Relocation has "\
       "ARM64_RELOC_ADDEND and embedded addend in the instruction.");
@@ -282,7 +283,7 @@ public:
 
     bool IsExtern = Obj.getPlainRelocationExternal(RelInfo);
     if (!IsExtern && RE.IsPCRel)
-      makeValueAddendPCRel(Value, ObjImg, RelI, 1 << RE.Size);
+      makeValueAddendPCRel(Value, Obj, RelI, 1 << RE.Size);
 
     RE.Addend = Value.Offset;
 
@@ -359,7 +360,7 @@ public:
     }
   }
 
-  void finalizeSection(ObjectImage &ObjImg, unsigned SectionID,
+  void finalizeSection(const ObjectFile &Obj, unsigned SectionID,
                        const SectionRef &Section) {}
 
 private:
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
index 9766751..09e430e 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
@@ -49,29 +49,30 @@ public:
 
   relocation_iterator
   processRelocationRef(unsigned SectionID, relocation_iterator RelI,
-                       ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID,
-                       const SymbolTableMap &Symbols, StubMap &Stubs) override {
+                       const ObjectFile &BaseObjT,
+                       ObjSectionToIDMap &ObjSectionToID,
+                       StubMap &Stubs) override {
     const MachOObjectFile &Obj =
-        static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+        static_cast<const MachOObjectFile &>(BaseObjT);
     MachO::any_relocation_info RelInfo =
         Obj.getRelocation(RelI->getRawDataRefImpl());
     uint32_t RelType = Obj.getAnyRelocationType(RelInfo);
 
     if (Obj.isRelocationScattered(RelInfo)) {
       if (RelType == MachO::ARM_RELOC_HALF_SECTDIFF)
-        return processHALFSECTDIFFRelocation(SectionID, RelI, ObjImg,
+        return processHALFSECTDIFFRelocation(SectionID, RelI, Obj,
                                              ObjSectionToID);
       else
         return ++++RelI;
     }
 
-    RelocationEntry RE(getRelocationEntry(SectionID, ObjImg, RelI));
+    RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI));
     RE.Addend = decodeAddend(RE);
     RelocationValueRef Value(
-        getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols));
+        getRelocationValueRef(Obj, RelI, RE, ObjSectionToID));
 
     if (RE.IsPCRel)
-      makeValueAddendPCRel(Value, ObjImg, RelI, 8);
+      makeValueAddendPCRel(Value, Obj, RelI, 8);
 
     if ((RE.RelType & 0xf) == MachO::ARM_RELOC_BR24)
       processBranchRelocation(RE, Value, Stubs);
@@ -154,15 +155,14 @@ public:
     }
   }
 
-  void finalizeSection(ObjectImage &ObjImg, unsigned SectionID,
+  void finalizeSection(const ObjectFile &Obj, unsigned SectionID,
                        const SectionRef &Section) {
     StringRef Name;
     Section.getName(Name);
 
     if (Name == "__nl_symbol_ptr")
-      populateIndirectSymbolPointersSection(
-                                 cast<MachOObjectFile>(*ObjImg.getObjectFile()),
-                                 Section, SectionID);
+      populateIndirectSymbolPointersSection(cast<MachOObjectFile>(Obj),
+                                            Section, SectionID);
   }
 
 private:
@@ -199,25 +199,25 @@ private:
 
   relocation_iterator
   processHALFSECTDIFFRelocation(unsigned SectionID, relocation_iterator RelI,
-                                ObjectImage &Obj,
+                                const ObjectFile &BaseTObj,
                                 ObjSectionToIDMap &ObjSectionToID) {
-    const MachOObjectFile *MachO =
-        static_cast<const MachOObjectFile *>(Obj.getObjectFile());
+    const MachOObjectFile &MachO =
+        static_cast<const MachOObjectFile&>(BaseTObj);
     MachO::any_relocation_info RE =
-        MachO->getRelocation(RelI->getRawDataRefImpl());
+        MachO.getRelocation(RelI->getRawDataRefImpl());
 
 
     // For a half-diff relocation the length bits actually record whether this
     // is a movw/movt, and whether this is arm or thumb.
     // Bit 0 indicates movw (b0 == 0) or movt (b0 == 1).
     // Bit 1 indicates arm (b1 == 0) or thumb (b1 == 1).
-    unsigned HalfDiffKindBits = MachO->getAnyRelocationLength(RE);
+    unsigned HalfDiffKindBits = MachO.getAnyRelocationLength(RE);
     if (HalfDiffKindBits & 0x2)
       llvm_unreachable("Thumb not yet supported.");
 
     SectionEntry &Section = Sections[SectionID];
-    uint32_t RelocType = MachO->getAnyRelocationType(RE);
-    bool IsPCRel = MachO->getAnyRelocationPCRel(RE);
+    uint32_t RelocType = MachO.getAnyRelocationType(RE);
+    bool IsPCRel = MachO.getAnyRelocationPCRel(RE);
     uint64_t Offset;
     RelI->getOffset(Offset);
     uint8_t *LocalAddress = Section.Address + Offset;
@@ -226,27 +226,27 @@ private:
 
     ++RelI;
     MachO::any_relocation_info RE2 =
-        MachO->getRelocation(RelI->getRawDataRefImpl());
-    uint32_t AddrA = MachO->getScatteredRelocationValue(RE);
-    section_iterator SAI = getSectionByAddress(*MachO, AddrA);
-    assert(SAI != MachO->section_end() && "Can't find section for address A");
+      MachO.getRelocation(RelI->getRawDataRefImpl());
+    uint32_t AddrA = MachO.getScatteredRelocationValue(RE);
+    section_iterator SAI = getSectionByAddress(MachO, AddrA);
+    assert(SAI != MachO.section_end() && "Can't find section for address A");
     uint64_t SectionABase = SAI->getAddress();
     uint64_t SectionAOffset = AddrA - SectionABase;
     SectionRef SectionA = *SAI;
     bool IsCode = SectionA.isText();
     uint32_t SectionAID =
-        findOrEmitSection(Obj, SectionA, IsCode, ObjSectionToID);
+        findOrEmitSection(MachO, SectionA, IsCode, ObjSectionToID);
 
-    uint32_t AddrB = MachO->getScatteredRelocationValue(RE2);
-    section_iterator SBI = getSectionByAddress(*MachO, AddrB);
-    assert(SBI != MachO->section_end() && "Can't find section for address B");
+    uint32_t AddrB = MachO.getScatteredRelocationValue(RE2);
+    section_iterator SBI = getSectionByAddress(MachO, AddrB);
+    assert(SBI != MachO.section_end() && "Can't find section for address B");
     uint64_t SectionBBase = SBI->getAddress();
     uint64_t SectionBOffset = AddrB - SectionBBase;
     SectionRef SectionB = *SBI;
     uint32_t SectionBID =
-        findOrEmitSection(Obj, SectionB, IsCode, ObjSectionToID);
+        findOrEmitSection(MachO, SectionB, IsCode, ObjSectionToID);
 
-    uint32_t OtherHalf = MachO->getAnyRelocationAddress(RE2) & 0xffff;
+    uint32_t OtherHalf = MachO.getAnyRelocationAddress(RE2) & 0xffff;
     unsigned Shift = (HalfDiffKindBits & 0x1) ? 16 : 0;
     uint32_t FullImmVal = (Immediate << Shift) | (OtherHalf << (16 - Shift));
     int64_t Addend = FullImmVal - (AddrA - AddrB);
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
index 258b847..67d7027 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
@@ -31,10 +31,11 @@ public:
 
   relocation_iterator
   processRelocationRef(unsigned SectionID, relocation_iterator RelI,
-                       ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID,
-                       const SymbolTableMap &Symbols, StubMap &Stubs) override {
+                       const ObjectFile &BaseObjT,
+                       ObjSectionToIDMap &ObjSectionToID,
+                       StubMap &Stubs) override {
     const MachOObjectFile &Obj =
-        static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+        static_cast<const MachOObjectFile &>(BaseObjT);
     MachO::any_relocation_info RelInfo =
         Obj.getRelocation(RelI->getRawDataRefImpl());
     uint32_t RelType = Obj.getAnyRelocationType(RelInfo);
@@ -42,18 +43,18 @@ public:
     if (Obj.isRelocationScattered(RelInfo)) {
       if (RelType == MachO::GENERIC_RELOC_SECTDIFF ||
           RelType == MachO::GENERIC_RELOC_LOCAL_SECTDIFF)
-        return processSECTDIFFRelocation(SectionID, RelI, ObjImg,
+        return processSECTDIFFRelocation(SectionID, RelI, Obj,
                                          ObjSectionToID);
       else if (RelType == MachO::GENERIC_RELOC_VANILLA)
-        return processI386ScatteredVANILLA(SectionID, RelI, ObjImg,
+        return processI386ScatteredVANILLA(SectionID, RelI, Obj,
                                            ObjSectionToID);
       llvm_unreachable("Unhandled scattered relocation.");
     }
 
-    RelocationEntry RE(getRelocationEntry(SectionID, ObjImg, RelI));
+    RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI));
     RE.Addend = memcpyAddend(RE);
     RelocationValueRef Value(
-        getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols));
+        getRelocationValueRef(Obj, RelI, RE, ObjSectionToID));
 
     // Addends for external, PC-rel relocations on i386 point back to the zero
     // offset. Calculate the final offset from the relocation target instead.
@@ -66,7 +67,7 @@ public:
     //   Value.Addend += RelocAddr + 4;
     // }
     if (RE.IsPCRel)
-      makeValueAddendPCRel(Value, ObjImg, RelI, 1 << RE.Size);
+      makeValueAddendPCRel(Value, Obj, RelI, 1 << RE.Size);
 
     RE.Addend = Value.Offset;
 
@@ -110,34 +111,32 @@ public:
     }
   }
 
-  void finalizeSection(ObjectImage &ObjImg, unsigned SectionID,
+  void finalizeSection(const ObjectFile &Obj, unsigned SectionID,
                        const SectionRef &Section) {
     StringRef Name;
     Section.getName(Name);
 
     if (Name == "__jump_table")
-      populateJumpTable(cast<MachOObjectFile>(*ObjImg.getObjectFile()), Section,
-                        SectionID);
+      populateJumpTable(cast<MachOObjectFile>(Obj), Section, SectionID);
     else if (Name == "__pointers")
-      populateIndirectSymbolPointersSection(
-                                 cast<MachOObjectFile>(*ObjImg.getObjectFile()),
-                                 Section, SectionID);
+      populateIndirectSymbolPointersSection(cast<MachOObjectFile>(Obj),
+                                            Section, SectionID);
   }
 
 private:
   relocation_iterator
   processSECTDIFFRelocation(unsigned SectionID, relocation_iterator RelI,
-                            ObjectImage &Obj,
+                            const ObjectFile &BaseObjT,
                             ObjSectionToIDMap &ObjSectionToID) {
-    const MachOObjectFile *MachO =
-        static_cast<const MachOObjectFile *>(Obj.getObjectFile());
+    const MachOObjectFile &Obj =
+        static_cast<const MachOObjectFile&>(BaseObjT);
     MachO::any_relocation_info RE =
-        MachO->getRelocation(RelI->getRawDataRefImpl());
+        Obj.getRelocation(RelI->getRawDataRefImpl());
 
     SectionEntry &Section = Sections[SectionID];
-    uint32_t RelocType = MachO->getAnyRelocationType(RE);
-    bool IsPCRel = MachO->getAnyRelocationPCRel(RE);
-    unsigned Size = MachO->getAnyRelocationLength(RE);
+    uint32_t RelocType = Obj.getAnyRelocationType(RE);
+    bool IsPCRel = Obj.getAnyRelocationPCRel(RE);
+    unsigned Size = Obj.getAnyRelocationLength(RE);
     uint64_t Offset;
     RelI->getOffset(Offset);
     uint8_t *LocalAddress = Section.Address + Offset;
@@ -146,11 +145,11 @@ private:
 
     ++RelI;
     MachO::any_relocation_info RE2 =
-        MachO->getRelocation(RelI->getRawDataRefImpl());
+        Obj.getRelocation(RelI->getRawDataRefImpl());
 
-    uint32_t AddrA = MachO->getScatteredRelocationValue(RE);
-    section_iterator SAI = getSectionByAddress(*MachO, AddrA);
-    assert(SAI != MachO->section_end() && "Can't find section for address A");
+    uint32_t AddrA = Obj.getScatteredRelocationValue(RE);
+    section_iterator SAI = getSectionByAddress(Obj, AddrA);
+    assert(SAI != Obj.section_end() && "Can't find section for address A");
     uint64_t SectionABase = SAI->getAddress();
     uint64_t SectionAOffset = AddrA - SectionABase;
     SectionRef SectionA = *SAI;
@@ -158,9 +157,9 @@ private:
     uint32_t SectionAID =
         findOrEmitSection(Obj, SectionA, IsCode, ObjSectionToID);
 
-    uint32_t AddrB = MachO->getScatteredRelocationValue(RE2);
-    section_iterator SBI = getSectionByAddress(*MachO, AddrB);
-    assert(SBI != MachO->section_end() && "Can't find section for address B");
+    uint32_t AddrB = Obj.getScatteredRelocationValue(RE2);
+    section_iterator SBI = getSectionByAddress(Obj, AddrB);
+    assert(SBI != Obj.section_end() && "Can't find section for address B");
     uint64_t SectionBBase = SBI->getAddress();
     uint64_t SectionBOffset = AddrB - SectionBBase;
     SectionRef SectionB = *SBI;
@@ -186,26 +185,27 @@ private:
   }
 
   relocation_iterator processI386ScatteredVANILLA(
-      unsigned SectionID, relocation_iterator RelI, ObjectImage &Obj,
+      unsigned SectionID, relocation_iterator RelI,
+      const ObjectFile &BaseObjT,
       RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID) {
-    const MachOObjectFile *MachO =
-        static_cast<const MachOObjectFile *>(Obj.getObjectFile());
+    const MachOObjectFile &Obj =
+        static_cast<const MachOObjectFile&>(BaseObjT);
     MachO::any_relocation_info RE =
-        MachO->getRelocation(RelI->getRawDataRefImpl());
+        Obj.getRelocation(RelI->getRawDataRefImpl());
 
     SectionEntry &Section = Sections[SectionID];
-    uint32_t RelocType = MachO->getAnyRelocationType(RE);
-    bool IsPCRel = MachO->getAnyRelocationPCRel(RE);
-    unsigned Size = MachO->getAnyRelocationLength(RE);
+    uint32_t RelocType = Obj.getAnyRelocationType(RE);
+    bool IsPCRel = Obj.getAnyRelocationPCRel(RE);
+    unsigned Size = Obj.getAnyRelocationLength(RE);
     uint64_t Offset;
     RelI->getOffset(Offset);
     uint8_t *LocalAddress = Section.Address + Offset;
     unsigned NumBytes = 1 << Size;
     int64_t Addend = readBytesUnaligned(LocalAddress, NumBytes);
 
-    unsigned SymbolBaseAddr = MachO->getScatteredRelocationValue(RE);
-    section_iterator TargetSI = getSectionByAddress(*MachO, SymbolBaseAddr);
-    assert(TargetSI != MachO->section_end() && "Can't find section for symbol");
+    unsigned SymbolBaseAddr = Obj.getScatteredRelocationValue(RE);
+    section_iterator TargetSI = getSectionByAddress(Obj, SymbolBaseAddr);
+    assert(TargetSI != Obj.section_end() && "Can't find section for symbol");
     uint64_t SectionBaseAddr = TargetSI->getAddress();
     SectionRef TargetSection = *TargetSI;
     bool IsCode = TargetSection.isText();
@@ -221,7 +221,7 @@ private:
   }
 
   // Populate stubs in __jump_table section.
-  void populateJumpTable(MachOObjectFile &Obj, const SectionRef &JTSection,
+  void populateJumpTable(const MachOObjectFile &Obj, const SectionRef &JTSection,
                          unsigned JTSectionID) {
     assert(!Obj.is64Bit() &&
            "__jump_table section not supported in 64-bit MachO.");
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
index 84d9e80..0734017 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
@@ -31,24 +31,25 @@ public:
 
   relocation_iterator
   processRelocationRef(unsigned SectionID, relocation_iterator RelI,
-                       ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID,
-                       const SymbolTableMap &Symbols, StubMap &Stubs) override {
+                       const ObjectFile &BaseObjT,
+                       ObjSectionToIDMap &ObjSectionToID,
+                       StubMap &Stubs) override {
     const MachOObjectFile &Obj =
-        static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+      static_cast<const MachOObjectFile &>(BaseObjT);
     MachO::any_relocation_info RelInfo =
         Obj.getRelocation(RelI->getRawDataRefImpl());
 
     assert(!Obj.isRelocationScattered(RelInfo) &&
            "Scattered relocations not supported on X86_64");
 
-    RelocationEntry RE(getRelocationEntry(SectionID, ObjImg, RelI));
+    RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI));
     RE.Addend = memcpyAddend(RE);
     RelocationValueRef Value(
-        getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols));
+        getRelocationValueRef(Obj, RelI, RE, ObjSectionToID));
 
     bool IsExtern = Obj.getPlainRelocationExternal(RelInfo);
     if (!IsExtern && RE.IsPCRel)
-      makeValueAddendPCRel(Value, ObjImg, RelI, 1 << RE.Size);
+      makeValueAddendPCRel(Value, Obj, RelI, 1 << RE.Size);
 
     if (RE.RelType == MachO::X86_64_RELOC_GOT ||
         RE.RelType == MachO::X86_64_RELOC_GOT_LOAD)
@@ -97,7 +98,7 @@ public:
     }
   }
 
-  void finalizeSection(ObjectImage &ObjImg, unsigned SectionID,
+  void finalizeSection(const ObjectFile &Obj, unsigned SectionID,
                        const SectionRef &Section) {}
 
 private:
diff --git a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp b/lib/ExecutionEngine/SectionMemoryManager.cpp
index 5986084..5986084 100644
--- a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
+++ b/lib/ExecutionEngine/SectionMemoryManager.cpp
diff --git a/lib/Fuzzer/CMakeLists.txt b/lib/Fuzzer/CMakeLists.txt
new file mode 100644
index 0000000..81e51d1
--- /dev/null
+++ b/lib/Fuzzer/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Disable the coverage instrumentation for the fuzzer itself.
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -fsanitize-coverage=0")
+if( LLVM_USE_SANITIZE_COVERAGE  )
+  add_library(LLVMFuzzerNoMain OBJECT
+    FuzzerCrossOver.cpp
+    FuzzerDriver.cpp
+    FuzzerIO.cpp
+    FuzzerLoop.cpp
+    FuzzerMutate.cpp
+    FuzzerSanitizerOptions.cpp
+    FuzzerUtil.cpp
+    )
+  add_library(LLVMFuzzer STATIC
+    FuzzerMain.cpp
+    $<TARGET_OBJECTS:LLVMFuzzerNoMain>
+    )
+
+  if( LLVM_INCLUDE_TESTS )
+    add_subdirectory(test)
+  endif()
+endif()
diff --git a/lib/Fuzzer/FuzzerCrossOver.cpp b/lib/Fuzzer/FuzzerCrossOver.cpp
new file mode 100644
index 0000000..94af6d5
--- /dev/null
+++ b/lib/Fuzzer/FuzzerCrossOver.cpp
@@ -0,0 +1,47 @@
+//===- FuzzerCrossOver.cpp - Cross over two test inputs -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Cross over test inputs.
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerInternal.h"
+#include <algorithm>
+
+namespace fuzzer {
+
+// Cross A and B, store the result (ap to MaxLen bytes) in U.
+void CrossOver(const Unit &A, const Unit &B, Unit *U, size_t MaxLen) {
+  size_t Size = rand() % MaxLen + 1;
+  U->clear();
+  const Unit *V = &A;
+  size_t PosA = 0;
+  size_t PosB = 0;
+  size_t *Pos = &PosA;
+  while (U->size() < Size && (PosA < A.size() || PosB < B.size())) {
+    // Merge a part of V into U.
+    size_t SizeLeftU = Size - U->size();
+    if (*Pos < V->size()) {
+      size_t SizeLeftV = V->size() - *Pos;
+      size_t MaxExtraSize = std::min(SizeLeftU, SizeLeftV);
+      size_t ExtraSize = rand() % MaxExtraSize + 1;
+      U->insert(U->end(), V->begin() + *Pos, V->begin() + *Pos + ExtraSize);
+      (*Pos) += ExtraSize;
+    }
+
+    // Use the other Unit on the next iteration.
+    if (Pos == &PosA) {
+      Pos = &PosB;
+      V = &B;
+    } else {
+      Pos = &PosA;
+      V = &A;
+    }
+  }
+}
+
+}  // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp
new file mode 100644
index 0000000..1746afd
--- /dev/null
+++ b/lib/Fuzzer/FuzzerDriver.cpp
@@ -0,0 +1,199 @@
+//===- FuzzerDriver.cpp - FuzzerDriver function and flags -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// FuzzerDriver and flag parsing.
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerInterface.h"
+#include "FuzzerInternal.h"
+
+#include <cstring>
+#include <unistd.h>
+#include <iostream>
+#include <thread>
+#include <atomic>
+#include <mutex>
+
+namespace fuzzer {
+
+// Program arguments.
+struct FlagDescription {
+  const char *Name;
+  const char *Description;
+  int   Default;
+  int   *Flag;
+};
+
+struct {
+#define FUZZER_FLAG(Type, Name, Default, Description) Type Name;
+#include "FuzzerFlags.def"
+#undef FUZZER_FLAG
+} Flags;
+
+static FlagDescription FlagDescriptions [] {
+#define FUZZER_FLAG(Type, Name, Default, Description) {#Name, Description, Default, &Flags.Name},
+#include "FuzzerFlags.def"
+#undef FUZZER_FLAG
+};
+
+static const size_t kNumFlags =
+    sizeof(FlagDescriptions) / sizeof(FlagDescriptions[0]);
+
+static std::vector<std::string> inputs;
+static const char *ProgName;
+
+static void PrintHelp() {
+  std::cerr << "Usage: " << ProgName
+            << " [-flag1=val1 [-flag2=val2 ...] ] [dir1 [dir2 ...] ]\n";
+  std::cerr << "\nFlags: (strictly in form -flag=value)\n";
+  size_t MaxFlagLen = 0;
+  for (size_t F = 0; F < kNumFlags; F++)
+    MaxFlagLen = std::max(strlen(FlagDescriptions[F].Name), MaxFlagLen);
+
+  for (size_t F = 0; F < kNumFlags; F++) {
+    const auto &D = FlagDescriptions[F];
+    std::cerr << "  " << D.Name;
+    for (size_t i = 0, n = MaxFlagLen - strlen(D.Name); i < n; i++)
+      std::cerr << " ";
+    std::cerr << "\t";
+    std::cerr << D.Default << "\t" << D.Description << "\n";
+  }
+}
+
+static const char *FlagValue(const char *Param, const char *Name) {
+  size_t Len = strlen(Name);
+  if (Param[0] == '-' && strstr(Param + 1, Name) == Param + 1 &&
+      Param[Len + 1] == '=')
+      return &Param[Len + 2];
+  return nullptr;
+}
+
+static bool ParseOneFlag(const char *Param) {
+  if (Param[0] != '-') return false;
+  for (size_t F = 0; F < kNumFlags; F++) {
+    const char *Name = FlagDescriptions[F].Name;
+    const char *Str = FlagValue(Param, Name);
+    if (Str)  {
+      int Val = std::stol(Str);
+      *FlagDescriptions[F].Flag = Val;
+      if (Flags.verbosity >= 2)
+        std::cerr << "Flag: " << Name << " " << Val << "\n";
+      return true;
+    }
+  }
+  PrintHelp();
+  exit(1);
+}
+
+// We don't use any library to minimize dependencies.
+static void ParseFlags(int argc, char **argv) {
+  for (size_t F = 0; F < kNumFlags; F++)
+    *FlagDescriptions[F].Flag = FlagDescriptions[F].Default;
+  for (int A = 1; A < argc; A++) {
+    if (ParseOneFlag(argv[A])) continue;
+    inputs.push_back(argv[A]);
+  }
+}
+
+static void WorkerThread(const std::string &Cmd, std::atomic<int> *Counter,
+                        int NumJobs, std::atomic<bool> *HasErrors) {
+  static std::mutex CerrMutex;
+  while (true) {
+    int C = (*Counter)++;
+    if (C >= NumJobs) break;
+    std::string Log = "fuzz-" + std::to_string(C) + ".log";
+    std::string ToRun = Cmd + " > " + Log + " 2>&1\n";
+    if (Flags.verbosity)
+      std::cerr << ToRun;
+    int ExitCode = system(ToRun.c_str());
+    if (ExitCode != 0)
+      *HasErrors = true;
+    std::lock_guard<std::mutex> Lock(CerrMutex);
+    std::cerr << "================== Job " << C
+              << " exited with exit code " << ExitCode
+              << " =================\n";
+    fuzzer::CopyFileToErr(Log);
+  }
+}
+
+static int RunInMultipleProcesses(int argc, char **argv, int NumWorkers,
+                                  int NumJobs) {
+  std::atomic<int> Counter(0);
+  std::atomic<bool> HasErrors(false);
+  std::string Cmd;
+  for (int i = 0; i < argc; i++) {
+    if (FlagValue(argv[i], "jobs") || FlagValue(argv[i], "workers")) continue;
+    Cmd += argv[i];
+    Cmd += " ";
+  }
+  std::vector<std::thread> V;
+  for (int i = 0; i < NumWorkers; i++)
+    V.push_back(std::thread(WorkerThread, Cmd, &Counter, NumJobs, &HasErrors));
+  for (auto &T : V)
+    T.join();
+  return HasErrors ? 1 : 0;
+}
+
+int FuzzerDriver(int argc, char **argv, UserCallback Callback) {
+  using namespace fuzzer;
+
+  ProgName = argv[0];
+  ParseFlags(argc, argv);
+  if (Flags.help) {
+    PrintHelp();
+    return 0;
+  }
+
+  if (Flags.workers > 0 && Flags.jobs > 0)
+    return RunInMultipleProcesses(argc, argv, Flags.workers, Flags.jobs);
+
+  Fuzzer::FuzzingOptions Options;
+  Options.Verbosity = Flags.verbosity;
+  Options.MaxLen = Flags.max_len;
+  Options.DoCrossOver = Flags.cross_over;
+  Options.MutateDepth = Flags.mutate_depth;
+  Options.ExitOnFirst = Flags.exit_on_first;
+  Options.UseFullCoverageSet = Flags.use_full_coverage_set;
+  Options.UseCoveragePairs = Flags.use_coverage_pairs;
+  Options.PreferSmallDuringInitialShuffle =
+      Flags.prefer_small_during_initial_shuffle;
+  if (Flags.runs >= 0)
+    Options.MaxNumberOfRuns = Flags.runs;
+  if (!inputs.empty())
+    Options.OutputCorpus = inputs[0];
+  Fuzzer F(Callback, Options);
+
+  unsigned seed = Flags.seed;
+  // Initialize seed.
+  if (seed == 0)
+    seed = time(0) * 10000 + getpid();
+  if (Flags.verbosity)
+    std::cerr << "Seed: " << seed << "\n";
+  srand(seed);
+
+  // Timer
+  if (Flags.timeout > 0)
+    SetTimer(Flags.timeout);
+
+  for (auto &inp : inputs)
+    F.ReadDir(inp);
+
+  if (F.CorpusSize() == 0)
+    F.AddToCorpus(Unit());  // Can't fuzz empty corpus, so add an empty input.
+  F.ShuffleAndMinimize();
+  if (Flags.save_minimized_corpus)
+    F.SaveCorpus();
+  F.Loop(Flags.iterations < 0 ? INT_MAX : Flags.iterations);
+  if (Flags.verbosity)
+    std::cerr << "Done " << F.getTotalNumberOfRuns()
+              << " runs in " << F.secondsSinceProcessStartUp()
+              << " seconds\n";
+  return 0;
+}
+
+}  // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def
new file mode 100644
index 0000000..068f245
--- /dev/null
+++ b/lib/Fuzzer/FuzzerFlags.def
@@ -0,0 +1,45 @@
+//===- FuzzerFlags.def - Run-time flags -------------------------*- C++ -* ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Flags. FUZZER_FLAG macro should be defined at the point of inclusion.
+// We are not using any flag parsing library for better portability and
+// independence.
+//===----------------------------------------------------------------------===//
+FUZZER_FLAG(int, verbosity, 1, "Verbosity level.")
+FUZZER_FLAG(int, seed, 0, "Random seed. If 0, seed is generated.")
+FUZZER_FLAG(int, iterations, -1,
+            "Number of iterations of the fuzzer internal loop"
+            " (-1 for infinite iterations).")
+FUZZER_FLAG(int, runs, -1,
+            "Number of individual test runs (-1 for infinite runs).")
+FUZZER_FLAG(int, max_len, 64, "Maximal length of the test input.")
+FUZZER_FLAG(int, cross_over, 1, "If 1, cross over inputs.")
+FUZZER_FLAG(int, mutate_depth, 5,
+            "Apply this number of consecutive mutations to each input.")
+FUZZER_FLAG(
+    int, prefer_small_during_initial_shuffle, -1,
+    "If 1, always prefer smaller inputs during the initial corpus shuffle."
+    " If 0, never do that. If -1, do it sometimes.")
+FUZZER_FLAG(int, exit_on_first, 0,
+            "If 1, exit after the first new interesting input is found.")
+FUZZER_FLAG(int, timeout, -1, "Timeout in seconds (if positive).")
+FUZZER_FLAG(int, help, 0, "Print help.")
+FUZZER_FLAG(
+    int, save_minimized_corpus, 0,
+    "If 1, the minimized corpus is saved into the first input directory")
+FUZZER_FLAG(int, use_full_coverage_set, 0,
+            "Experimental: Maximize the number of different full"
+            " coverage sets as opposed to maximizing the total coverage."
+            " This is potentially MUCH slower, but may discover more paths.")
+FUZZER_FLAG(int, use_coverage_pairs, 0,
+            "Experimental: Maximize the number of different coverage pairs.")
+FUZZER_FLAG(int, jobs, 0, "Number of jobs to run. If jobs >= 1 we spawn"
+                          " this number of jobs in separate worker processes"
+                          " with stdout/stderr redirected to fuzz-JOB.log.")
+FUZZER_FLAG(int, workers, 0,
+            "Number of simultaneous worker processes to run the jobs.")
diff --git a/lib/Fuzzer/FuzzerIO.cpp b/lib/Fuzzer/FuzzerIO.cpp
new file mode 100644
index 0000000..224808c
--- /dev/null
+++ b/lib/Fuzzer/FuzzerIO.cpp
@@ -0,0 +1,57 @@
+//===- FuzzerIO.cpp - IO utils. -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// IO functions.
+//===----------------------------------------------------------------------===//
+#include "FuzzerInternal.h"
+#include <iostream>
+#include <iterator>
+#include <fstream>
+#include <dirent.h>
+namespace fuzzer {
+
+static std::vector<std::string> ListFilesInDir(const std::string &Dir) {
+  std::vector<std::string> V;
+  DIR *D = opendir(Dir.c_str());
+  if (!D) return V;
+  while (auto E = readdir(D)) {
+    if (E->d_type == DT_REG || E->d_type == DT_LNK)
+      V.push_back(E->d_name);
+  }
+  closedir(D);
+  return V;
+}
+
+Unit FileToVector(const std::string &Path) {
+  std::ifstream T(Path);
+  return Unit((std::istreambuf_iterator<char>(T)),
+              std::istreambuf_iterator<char>());
+}
+
+void CopyFileToErr(const std::string &Path) {
+  std::ifstream T(Path);
+  std::copy(std::istreambuf_iterator<char>(T), std::istreambuf_iterator<char>(),
+            std::ostream_iterator<char>(std::cerr, ""));
+}
+
+void WriteToFile(const Unit &U, const std::string &Path) {
+  std::ofstream OF(Path);
+  OF.write((const char*)U.data(), U.size());
+}
+
+void ReadDirToVectorOfUnits(const char *Path, std::vector<Unit> *V) {
+  for (auto &X : ListFilesInDir(Path))
+    V->push_back(FileToVector(DirPlusFile(Path, X)));
+}
+
+std::string DirPlusFile(const std::string &DirPath,
+                        const std::string &FileName) {
+  return DirPath + "/" + FileName;
+}
+
+}  // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerInterface.h b/lib/Fuzzer/FuzzerInterface.h
new file mode 100644
index 0000000..49d8c0f
--- /dev/null
+++ b/lib/Fuzzer/FuzzerInterface.h
@@ -0,0 +1,25 @@
+//===- FuzzerInterface.h - Interface header for the Fuzzer ------*- C++ -* ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Define the interface between the Fuzzer and the library being tested.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_FUZZER_INTERFACE_H
+#define LLVM_FUZZER_INTERFACE_H
+
+#include <cstddef>
+#include <cstdint>
+
+namespace fuzzer {
+
+typedef void (*UserCallback)(const uint8_t *data, size_t size);
+int FuzzerDriver(int argc, char **argv, UserCallback Callback);
+
+}  // namespace fuzzer
+
+#endif  // LLVM_FUZZER_INTERFACE_H
diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h
new file mode 100644
index 0000000..980b00e
--- /dev/null
+++ b/lib/Fuzzer/FuzzerInternal.h
@@ -0,0 +1,104 @@
+//===- FuzzerInternal.h - Internal header for the Fuzzer --------*- C++ -* ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Define the main class fuzzer::Fuzzer and most functions.
+//===----------------------------------------------------------------------===//
+#include <cassert>
+#include <climits>
+#include <chrono>
+#include <cstddef>
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include <unordered_set>
+
+#include "FuzzerInterface.h"
+
+namespace fuzzer {
+typedef std::vector<uint8_t> Unit;
+using namespace std::chrono;
+
+Unit ReadFile(const char *Path);
+void ReadDirToVectorOfUnits(const char *Path, std::vector<Unit> *V);
+void WriteToFile(const Unit &U, const std::string &Path);
+void CopyFileToErr(const std::string &Path);
+// Returns "Dir/FileName" or equivalent for the current OS.
+std::string DirPlusFile(const std::string &DirPath,
+                        const std::string &FileName);
+
+void Mutate(Unit *U, size_t MaxLen);
+
+void CrossOver(const Unit &A, const Unit &B, Unit *U, size_t MaxLen);
+
+void Print(const Unit &U, const char *PrintAfter = "");
+void PrintASCII(const Unit &U, const char *PrintAfter = "");
+std::string Hash(const Unit &U);
+void SetTimer(int Seconds);
+
+class Fuzzer {
+ public:
+  struct FuzzingOptions {
+    int Verbosity = 1;
+    int MaxLen = 0;
+    bool DoCrossOver = true;
+    int  MutateDepth = 5;
+    bool ExitOnFirst = false;
+    bool UseFullCoverageSet  = false;
+    bool UseCoveragePairs = false;
+    int PreferSmallDuringInitialShuffle = -1;
+    size_t MaxNumberOfRuns = ULONG_MAX;
+    std::string OutputCorpus;
+  };
+  Fuzzer(UserCallback Callback, FuzzingOptions Options)
+      : Callback(Callback), Options(Options) {
+    SetDeathCallback();
+  }
+  void AddToCorpus(const Unit &U) { Corpus.push_back(U); }
+  size_t Loop(size_t NumIterations);
+  void ShuffleAndMinimize();
+  size_t CorpusSize() const { return Corpus.size(); }
+  void ReadDir(const std::string &Path) {
+    ReadDirToVectorOfUnits(Path.c_str(), &Corpus);
+  }
+  // Save the current corpus to OutputCorpus.
+  void SaveCorpus();
+
+  size_t secondsSinceProcessStartUp() {
+    return duration_cast<seconds>(system_clock::now() - ProcessStartTime)
+        .count();
+  }
+
+  size_t getTotalNumberOfRuns() { return TotalNumberOfRuns; }
+
+  static void AlarmCallback();
+
+ private:
+  size_t MutateAndTestOne(Unit *U);
+  size_t RunOne(const Unit &U);
+  size_t RunOneMaximizeTotalCoverage(const Unit &U);
+  size_t RunOneMaximizeFullCoverageSet(const Unit &U);
+  size_t RunOneMaximizeCoveragePairs(const Unit &U);
+  void WriteToOutputCorpus(const Unit &U);
+  static void WriteToCrash(const Unit &U, const char *Prefix);
+
+  void SetDeathCallback();
+  static void DeathCallback();
+  static Unit CurrentUnit;
+
+  size_t TotalNumberOfRuns = 0;
+
+  std::vector<Unit> Corpus;
+  std::unordered_set<uintptr_t> FullCoverageSets;
+  std::unordered_set<uint64_t>  CoveragePairs;
+  UserCallback Callback;
+  FuzzingOptions Options;
+  system_clock::time_point ProcessStartTime = system_clock::now();
+  static system_clock::time_point UnitStartTime;
+};
+
+};  // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp
new file mode 100644
index 0000000..70b63eb
--- /dev/null
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@@ -0,0 +1,233 @@
+//===- FuzzerLoop.cpp - Fuzzer's main loop --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Fuzzer's main loop.
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerInternal.h"
+#include <sanitizer/coverage_interface.h>
+#include <algorithm>
+#include <iostream>
+
+namespace fuzzer {
+
+// static
+Unit Fuzzer::CurrentUnit;
+system_clock::time_point Fuzzer::UnitStartTime;
+
+void Fuzzer::SetDeathCallback() {
+  __sanitizer_set_death_callback(DeathCallback);
+}
+
+void Fuzzer::DeathCallback() {
+  std::cerr << "DEATH: " <<  std::endl;
+  Print(CurrentUnit, "\n");
+  PrintASCII(CurrentUnit, "\n");
+  WriteToCrash(CurrentUnit, "crash-");
+}
+
+void Fuzzer::AlarmCallback() {
+  size_t Seconds =
+      duration_cast<seconds>(system_clock::now() - UnitStartTime).count();
+  std::cerr << "ALARM: working on the last Unit for " << Seconds << " seconds"
+            << std::endl;
+  if (Seconds >= 3) {
+    Print(CurrentUnit, "\n");
+    PrintASCII(CurrentUnit, "\n");
+    WriteToCrash(CurrentUnit, "timeout-");
+  }
+  exit(1);
+}
+
+void Fuzzer::ShuffleAndMinimize() {
+  bool PreferSmall =
+      (Options.PreferSmallDuringInitialShuffle == 1 ||
+       (Options.PreferSmallDuringInitialShuffle == -1 && rand() % 2));
+  if (Options.Verbosity)
+    std::cerr << "Shuffle: Size: " << Corpus.size()
+              << " prefer small: " << PreferSmall
+              << "\n";
+  std::vector<Unit> NewCorpus;
+  std::random_shuffle(Corpus.begin(), Corpus.end());
+  if (PreferSmall)
+    std::stable_sort(
+        Corpus.begin(), Corpus.end(),
+        [](const Unit &A, const Unit &B) { return A.size() < B.size(); });
+  size_t MaxCov = 0;
+  Unit &U = CurrentUnit;
+  for (const auto &C : Corpus) {
+    for (size_t First = 0; First < 1; First++) {
+      U.clear();
+      size_t Last = std::min(First + Options.MaxLen, C.size());
+      U.insert(U.begin(), C.begin() + First, C.begin() + Last);
+      size_t NewCoverage = RunOne(U);
+      if (NewCoverage) {
+        MaxCov = NewCoverage;
+        NewCorpus.push_back(U);
+        if (Options.Verbosity >= 2)
+          std::cerr << "NEW0: " << NewCoverage
+                    << " L " << U.size()
+                    << "\n";
+      }
+    }
+  }
+  Corpus = NewCorpus;
+  if (Options.Verbosity)
+    std::cerr << "Shuffle done: " << Corpus.size() << " IC: " << MaxCov << "\n";
+}
+
+size_t Fuzzer::RunOne(const Unit &U) {
+  UnitStartTime = system_clock::now();
+  TotalNumberOfRuns++;
+  if (Options.UseFullCoverageSet)
+    return RunOneMaximizeFullCoverageSet(U);
+  if (Options.UseCoveragePairs)
+    return RunOneMaximizeCoveragePairs(U);
+  return RunOneMaximizeTotalCoverage(U);
+}
+
+static uintptr_t HashOfArrayOfPCs(uintptr_t *PCs, uintptr_t NumPCs) {
+  uintptr_t Res = 0;
+  for (uintptr_t i = 0; i < NumPCs; i++) {
+    Res = (Res + PCs[i]) * 7;
+  }
+  return Res;
+}
+
+// Experimental. Does not yet scale.
+// Fuly reset the current coverage state, run a single unit,
+// collect all coverage pairs and return non-zero if a new pair is observed.
+size_t Fuzzer::RunOneMaximizeCoveragePairs(const Unit &U) {
+  __sanitizer_reset_coverage();
+  Callback(U.data(), U.size());
+  uintptr_t *PCs;
+  uintptr_t NumPCs = __sanitizer_get_coverage_guards(&PCs);
+  bool HasNewPairs = false;
+  for (uintptr_t i = 0; i < NumPCs; i++) {
+    if (!PCs[i]) continue;
+    for (uintptr_t j = 0; j < NumPCs; j++) {
+      if (!PCs[j]) continue;
+      uint64_t Pair = (i << 32) | j;
+      HasNewPairs |= CoveragePairs.insert(Pair).second;
+    }
+  }
+  if (HasNewPairs)
+    return CoveragePairs.size();
+  return 0;
+}
+
+// Experimental.
+// Fuly reset the current coverage state, run a single unit,
+// compute a hash function from the full coverage set,
+// return non-zero if the hash value is new.
+// This produces tons of new units and as is it's only suitable for small tests,
+// e.g. test/FullCoverageSetTest.cpp. FIXME: make it scale.
+size_t Fuzzer::RunOneMaximizeFullCoverageSet(const Unit &U) {
+  __sanitizer_reset_coverage();
+  Callback(U.data(), U.size());
+  uintptr_t *PCs;
+  uintptr_t NumPCs =__sanitizer_get_coverage_guards(&PCs);
+  if (FullCoverageSets.insert(HashOfArrayOfPCs(PCs, NumPCs)).second)
+    return FullCoverageSets.size();
+  return 0;
+}
+
+size_t Fuzzer::RunOneMaximizeTotalCoverage(const Unit &U) {
+  size_t OldCoverage = __sanitizer_get_total_unique_coverage();
+  Callback(U.data(), U.size());
+  size_t NewCoverage = __sanitizer_get_total_unique_coverage();
+  if (!(TotalNumberOfRuns & (TotalNumberOfRuns - 1)) && Options.Verbosity) {
+    size_t Seconds = secondsSinceProcessStartUp();
+    std::cerr
+        << "#" << TotalNumberOfRuns
+        << "\tcov: " << NewCoverage
+        << "\texec/s: " << (Seconds ? TotalNumberOfRuns / Seconds : 0) << "\n";
+  }
+  if (NewCoverage > OldCoverage)
+    return NewCoverage;
+  return 0;
+}
+
+void Fuzzer::WriteToOutputCorpus(const Unit &U) {
+  if (Options.OutputCorpus.empty()) return;
+  std::string Path = DirPlusFile(Options.OutputCorpus, Hash(U));
+  WriteToFile(U, Path);
+  if (Options.Verbosity >= 2)
+    std::cerr << "Written to " << Path << std::endl;
+}
+
+void Fuzzer::WriteToCrash(const Unit &U, const char *Prefix) {
+  std::string Path = Prefix + Hash(U);
+  WriteToFile(U, Path);
+  std::cerr << "CRASHED; file written to " << Path << std::endl;
+}
+
+void Fuzzer::SaveCorpus() {
+  if (Options.OutputCorpus.empty()) return;
+  for (const auto &U : Corpus)
+    WriteToFile(U, DirPlusFile(Options.OutputCorpus, Hash(U)));
+  if (Options.Verbosity)
+    std::cerr << "Written corpus of " << Corpus.size() << " files to "
+              << Options.OutputCorpus << "\n";
+}
+
+size_t Fuzzer::MutateAndTestOne(Unit *U) {
+  size_t NewUnits = 0;
+  for (int i = 0; i < Options.MutateDepth; i++) {
+    if (TotalNumberOfRuns >= Options.MaxNumberOfRuns)
+      return NewUnits;
+    Mutate(U, Options.MaxLen);
+    size_t NewCoverage = RunOne(*U);
+    if (NewCoverage) {
+      Corpus.push_back(*U);
+      NewUnits++;
+      if (Options.Verbosity) {
+        std::cerr << "#" << TotalNumberOfRuns
+                  << "\tNEW: " << NewCoverage
+                  << " L: " << U->size()
+                  << " S: " << Corpus.size()
+                  << " I: " << i
+                  << "\t";
+        if (U->size() < 30) {
+          PrintASCII(*U);
+          std::cerr << "\t";
+          Print(*U);
+        }
+        std::cerr << "\n";
+      }
+      WriteToOutputCorpus(*U);
+      if (Options.ExitOnFirst)
+        exit(0);
+    }
+  }
+  return NewUnits;
+}
+
+size_t Fuzzer::Loop(size_t NumIterations) {
+  size_t NewUnits = 0;
+  for (size_t i = 1; i <= NumIterations; i++) {
+    for (size_t J1 = 0; J1 < Corpus.size(); J1++) {
+      if (TotalNumberOfRuns >= Options.MaxNumberOfRuns)
+        return NewUnits;
+      // First, simply mutate the unit w/o doing crosses.
+      CurrentUnit = Corpus[J1];
+      NewUnits += MutateAndTestOne(&CurrentUnit);
+      // Now, cross with others.
+      if (Options.DoCrossOver) {
+        for (size_t J2 = 0; J2 < Corpus.size(); J2++) {
+          CurrentUnit.clear();
+          CrossOver(Corpus[J1], Corpus[J2], &CurrentUnit, Options.MaxLen);
+          NewUnits += MutateAndTestOne(&CurrentUnit);
+        }
+      }
+    }
+  }
+  return NewUnits;
+}
+
+}  // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerMain.cpp b/lib/Fuzzer/FuzzerMain.cpp
new file mode 100644
index 0000000..d0c3df3
--- /dev/null
+++ b/lib/Fuzzer/FuzzerMain.cpp
@@ -0,0 +1,20 @@
+//===- FuzzerMain.cpp - main() function and flags -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// main() and flags.
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerInterface.h"
+#include "FuzzerInternal.h"
+
+// This function should be defined by the user.
+extern "C" void TestOneInput(const uint8_t *Data, size_t Size);
+
+int main(int argc, char **argv) {
+  return fuzzer::FuzzerDriver(argc, argv, TestOneInput);
+}
diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp
new file mode 100644
index 0000000..b28264a
--- /dev/null
+++ b/lib/Fuzzer/FuzzerMutate.cpp
@@ -0,0 +1,70 @@
+//===- FuzzerMutate.cpp - Mutate a test input -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Mutate a test input.
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerInternal.h"
+
+namespace fuzzer {
+
+static char FlipRandomBit(char X) {
+  int Bit = rand() % 8;
+  char Mask = 1 << Bit;
+  char R;
+  if (X & (1 << Bit))
+    R = X & ~Mask;
+  else
+    R = X | Mask;
+  assert(R != X);
+  return R;
+}
+
+static char RandCh() {
+  if (rand() % 2) return rand();
+  const char *Special = "!*'();:@&=+$,/?%#[]123ABCxyz-`~.";
+  return Special[rand() % (sizeof(Special) - 1)];
+}
+
+// Mutate U in place.
+void Mutate(Unit *U, size_t MaxLen) {
+  assert(MaxLen > 0);
+  assert(U->size() <= MaxLen);
+  if (U->empty()) {
+    for (size_t i = 0; i < MaxLen; i++)
+      U->push_back(RandCh());
+    return;
+  }
+  assert(!U->empty());
+  switch (rand() % 3) {
+  case 0:
+    if (U->size() > 1) {
+      U->erase(U->begin() + rand() % U->size());
+      break;
+    }
+    [[clang::fallthrough]];
+  case 1:
+    if (U->size() < MaxLen) {
+      U->insert(U->begin() + rand() % U->size(), RandCh());
+    } else { // At MaxLen.
+      uint8_t Ch = RandCh();
+      size_t Idx = rand() % U->size();
+      (*U)[Idx] = Ch;
+    }
+    break;
+  default:
+    {
+      size_t Idx = rand() % U->size();
+      (*U)[Idx] = FlipRandomBit((*U)[Idx]);
+    }
+    break;
+  }
+  assert(!U->empty());
+}
+
+}  // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerSanitizerOptions.cpp b/lib/Fuzzer/FuzzerSanitizerOptions.cpp
new file mode 100644
index 0000000..1c58f3a
--- /dev/null
+++ b/lib/Fuzzer/FuzzerSanitizerOptions.cpp
@@ -0,0 +1,18 @@
+//===- FuzzerSanitizerOptions.cpp - default flags for sanitizers ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Set default options for sanitizers while running the fuzzer.
+// Options reside in a separate file, so if we don't want to set the default
+// options we simply do not link this file in.
+// ASAN options:
+//   * don't dump the coverage to disk.
+//   * enable coverage by default.
+//===----------------------------------------------------------------------===//
+extern "C" const char *__asan_default_options() {
+  return "coverage_pcs=0:coverage=1";
+}
diff --git a/lib/Fuzzer/FuzzerUtil.cpp b/lib/Fuzzer/FuzzerUtil.cpp
new file mode 100644
index 0000000..679f289
--- /dev/null
+++ b/lib/Fuzzer/FuzzerUtil.cpp
@@ -0,0 +1,61 @@
+//===- FuzzerUtil.cpp - Misc utils ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Misc utils.
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerInternal.h"
+#include <iostream>
+#include <sys/time.h>
+#include <cassert>
+#include <cstring>
+#include <signal.h>
+
+namespace fuzzer {
+
+void Print(const Unit &v, const char *PrintAfter) {
+  std::cerr << v.size() << ": ";
+  for (auto x : v)
+    std::cerr << (unsigned) x << " ";
+  std::cerr << PrintAfter;
+}
+
+void PrintASCII(const Unit &U, const char *PrintAfter) {
+  for (auto X : U)
+    std::cerr << (char)((isascii(X) && X >= ' ') ? X : '?');
+  std::cerr << PrintAfter;
+}
+
+std::string Hash(const Unit &in) {
+  size_t h1 = 0, h2 = 0;
+  for (auto x : in) {
+    h1 += x;
+    h1 *= 5;
+    h2 += x;
+    h2 *= 7;
+  }
+  return std::to_string(h1) + std::to_string(h2);
+}
+
+static void AlarmHandler(int, siginfo_t *, void *) {
+  Fuzzer::AlarmCallback();
+}
+
+void SetTimer(int Seconds) {
+  struct itimerval T {{Seconds, 0}, {Seconds, 0}};
+  std::cerr << "SetTimer " << Seconds << "\n";
+  int Res = setitimer(ITIMER_REAL, &T, nullptr);
+  assert(Res == 0);
+  struct sigaction sigact;
+  memset(&sigact, 0, sizeof(sigact));
+  sigact.sa_sigaction = AlarmHandler;
+  Res = sigaction(SIGALRM, &sigact, 0);
+  assert(Res == 0);
+}
+
+}  // namespace fuzzer
diff --git a/lib/Fuzzer/README.txt b/lib/Fuzzer/README.txt
new file mode 100644
index 0000000..e4d6b4f
--- /dev/null
+++ b/lib/Fuzzer/README.txt
@@ -0,0 +1,112 @@
+===============================
+Fuzzer -- a library for coverage-guided fuzz testing.
+===============================
+
+This library is intended primarily for in-process coverage-guided fuzz testing
+(fuzzing) of other libraries. The typical workflow looks like this:
+
+  * Build the Fuzzer library as a static archive (or just a set of .o files).
+    Note that the Fuzzer contains the main() function.
+    Preferably do *not* use sanitizers while building the Fuzzer.
+  * Build the library you are going to test with -fsanitize-coverage=[234]
+    and one of the sanitizers. We recommend to build the library in several
+    different modes (e.g. asan, msan, lsan, ubsan, etc) and even using different
+    optimizations options (e.g. -O0, -O1, -O2) to diversify testing.
+  * Build a test driver using the same options as the library.
+    The test driver is a C/C++ file containing interesting calls to the library
+    inside a single function:
+    extern "C" void TestOneInput(const uint8_t *Data, size_t Size);
+  * Link the Fuzzer, the library and the driver together into an executable
+    using the same sanitizer options as for the library.
+  * Collect the initial corpus of inputs for the
+    fuzzer (a directory with test inputs, one file per input).
+    The better your inputs are the faster you will find something interesting.
+    Also try to keep your inputs small, otherwise the Fuzzer will run too slow.
+  * Run the fuzzer with the test corpus. As new interesting test cases are
+    discovered they will be added to the corpus. If a bug is discovered by
+    the sanitizer (asan, etc) it will be reported as usual and the reproducer
+    will be written to disk.
+    Each Fuzzer process is single-threaded (unless the library starts its own
+    threads). You can run the Fuzzer on the same corpus in multiple processes.
+    in parallel. For run-time options run the Fuzzer binary with '-help=1'.
+
+
+The Fuzzer is similar in concept to AFL (http://lcamtuf.coredump.cx/afl/),
+but uses in-process Fuzzing, which is more fragile, more restrictive, but
+potentially much faster as it has no overhead for process start-up.
+It uses LLVM's "Sanitizer Coverage" instrumentation to get in-process
+coverage-feedback https://code.google.com/p/address-sanitizer/wiki/AsanCoverage
+
+The code resides in the LLVM repository and is (or will be) used by various
+parts of LLVM, but the Fuzzer itself does not (and should not) depend on any
+part of LLVM and can be used for other projects. Ideally, the Fuzzer's code
+should not have any external dependencies. Right now it uses STL, which may need
+to be fixed later. See also F.A.Q. below.
+
+Examples of usage in LLVM:
+  * clang-format-fuzzer. The inputs are random pieces of C++-like text.
+  * Build (make sure to use fresh clang as the host compiler):
+    cmake -GNinja  -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
+    -DLLVM_USE_SANITIZER=Address -DLLVM_USE_SANITIZE_COVERAGE=YES \
+    /path/to/llvm -DCMAKE_BUILD_TYPE=Release
+    ninja clang-format-fuzzer
+  * Optionally build other kinds of binaries (asan+Debug, msan, ubsan, etc)
+  * TODO: commit the pre-fuzzed corpus to svn (?).
+  * Run:
+      clang-format-fuzzer CORPUS_DIR
+
+Toy example (see SimpleTest.cpp):
+a simple function that does something interesting if it receives bytes "Hi!".
+  # Build the Fuzzer with asan:
+  % clang++ -std=c++11 -fsanitize=address -fsanitize-coverage=3 -O1 -g \
+     Fuzzer*.cpp test/SimpleTest.cpp
+  # Run the fuzzer with no corpus (assuming on empty input)
+  % ./a.out
+
+===============================================================================
+F.A.Q.
+
+Q. Why Fuzzer does not use any of the LLVM support?
+A. There are two reasons.
+First, we want this library to be used outside of the LLVM w/o users having to
+build the rest of LLVM. This may sound unconvincing for many LLVM folks,
+but in practice the need for building the whole LLVM frightens many potential
+users -- and we want more users to use this code.
+Second, there is a subtle technical reason not to rely on the rest of LLVM, or
+any other large body of code (maybe not even STL). When coverage instrumentation
+is enabled, it will also instrument the LLVM support code which will blow up the
+coverage set of the process (since the fuzzer is in-process). In other words, by
+using more external dependencies we will slow down the fuzzer while the main
+reason for it to exist is extreme speed.
+
+Q. What about Windows then? The Fuzzer contains code that does not build on
+Windows.
+A. The sanitizer coverage support does not work on Windows either as of 01/2015.
+Once it's there, we'll need to re-implement OS-specific parts (I/O, signals).
+
+Q. When this Fuzzer is not a good solution for a problem?
+A.
+  * If the test inputs are validated by the target library and the validator
+    asserts/crashes on invalid inputs, the in-process fuzzer is not applicable
+    (we could use fork() w/o exec, but it comes with extra overhead).
+  * Bugs in the target library may accumulate w/o being detected. E.g. a memory
+    corruption that goes undetected at first and then leads to a crash while
+    testing another input. This is why it is highly recommended to run this
+    in-process fuzzer with all sanitizers to detect most bugs on the spot.
+  * It is harder to protect the in-process fuzzer from excessive memory
+    consumption and infinite loops in the target library (still possible).
+  * The target library should not have significant global state that is not
+    reset between the runs.
+  * Many interesting target libs are not designed in a way that supports
+    the in-process fuzzer interface (e.g. require a file path instead of a
+    byte array).
+  * If a single test run takes a considerable fraction of a second (or
+    more) the speed benefit from the in-process fuzzer is negligible.
+  * If the target library runs persistent threads (that outlive
+    execution of one test) the fuzzing results will be unreliable.
+
+Q. So, what exactly this Fuzzer is good for?
+A. This Fuzzer might be a good choice for testing libraries that have relatively
+small inputs, each input takes < 1ms to run, and the library code is not expected
+to crash on invalid inputs.
+Examples: regular expression matchers, text or binary format parsers.
diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt
new file mode 100644
index 0000000..bed9cd8
--- /dev/null
+++ b/lib/Fuzzer/test/CMakeLists.txt
@@ -0,0 +1,61 @@
+# Build all these tests with -O0, otherwise optimizations may merge some
+# basic blocks and we'll fail to discover the targets.
+# Also enable the coverage instrumentation back (it is disabled
+# for the Fuzzer lib)
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O0 -fsanitize-coverage=4")
+
+set(Tests
+  FourIndependentBranchesTest
+  FullCoverageSetTest
+  InfiniteTest
+  NullDerefTest
+  SimpleTest
+  TimeoutTest
+  )
+
+set(TestBinaries)
+
+foreach(Test ${Tests})
+  add_executable(LLVMFuzzer-${Test}
+    EXCLUDE_FROM_ALL
+    ${Test}.cpp
+    )
+  target_link_libraries(LLVMFuzzer-${Test}
+    LLVMFuzzer
+    )
+  set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test})
+endforeach()
+
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
+  )
+
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/unit/lit.site.cfg.in
+  ${CMAKE_CURRENT_BINARY_DIR}/unit/lit.site.cfg
+  )
+
+include_directories(..)
+include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include)
+
+add_executable(LLVMFuzzer-Unittest
+  FuzzerUnittest.cpp
+  $<TARGET_OBJECTS:LLVMFuzzerNoMain>
+  )
+
+target_link_libraries(LLVMFuzzer-Unittest
+  gtest
+  gtest_main
+  )
+
+set(TestBinaries ${TestBinaries} LLVMFuzzer-Unittest)
+
+set_target_properties(${TestBinaries}
+  PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+
+add_lit_testsuite(check-fuzzer "Running Fuzzer tests"
+    ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS ${TestBinaries} FileCheck not
+    )
diff --git a/lib/Fuzzer/test/FourIndependentBranchesTest.cpp b/lib/Fuzzer/test/FourIndependentBranchesTest.cpp
new file mode 100644
index 0000000..171668b
--- /dev/null
+++ b/lib/Fuzzer/test/FourIndependentBranchesTest.cpp
@@ -0,0 +1,18 @@
+// Simple test for a fuzzer. The fuzzer must find the string "FUZZ".
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+
+extern "C" void TestOneInput(const uint8_t *Data, size_t Size) {
+  int bits = 0;
+  if (Size > 0 && Data[0] == 'F') bits |= 1;
+  if (Size > 1 && Data[1] == 'U') bits |= 2;
+  if (Size > 2 && Data[2] == 'Z') bits |= 4;
+  if (Size > 3 && Data[3] == 'Z') bits |= 8;
+  if (bits == 15) {
+    std::cerr <<  "BINGO!\n";
+    exit(1);
+  }
+}
+
diff --git a/lib/Fuzzer/test/FullCoverageSetTest.cpp b/lib/Fuzzer/test/FullCoverageSetTest.cpp
new file mode 100644
index 0000000..d4f8c11
--- /dev/null
+++ b/lib/Fuzzer/test/FullCoverageSetTest.cpp
@@ -0,0 +1,20 @@
+// Simple test for a fuzzer. The fuzzer must find the string "FUZZER".
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+
+extern "C" void TestOneInput(const uint8_t *Data, size_t Size) {
+  int bits = 0;
+  if (Size > 0 && Data[0] == 'F') bits |= 1;
+  if (Size > 1 && Data[1] == 'U') bits |= 2;
+  if (Size > 2 && Data[2] == 'Z') bits |= 4;
+  if (Size > 3 && Data[3] == 'Z') bits |= 8;
+  if (Size > 4 && Data[4] == 'E') bits |= 16;
+  if (Size > 5 && Data[5] == 'R') bits |= 32;
+  if (bits == 63) {
+    std::cerr <<  "BINGO!\n";
+    exit(1);
+  }
+}
+
diff --git a/lib/Fuzzer/test/FuzzerUnittest.cpp b/lib/Fuzzer/test/FuzzerUnittest.cpp
new file mode 100644
index 0000000..368a0f2
--- /dev/null
+++ b/lib/Fuzzer/test/FuzzerUnittest.cpp
@@ -0,0 +1,62 @@
+#include "FuzzerInternal.h"
+#include "gtest/gtest.h"
+#include <set>
+
+// For now, have TestOneInput just to make it link.
+// Later we may want to make unittests that actually call TestOneInput.
+extern "C" void TestOneInput(const uint8_t *Data, size_t Size) {
+  abort();
+}
+
+TEST(Fuzzer, CrossOver) {
+  using namespace fuzzer;
+  Unit A({0, 1, 2}), B({5, 6, 7});
+  Unit C;
+  Unit Expected[] = {
+       { 0 },
+       { 0, 1 },
+       { 0, 5 },
+       { 0, 1, 2 },
+       { 0, 1, 5 },
+       { 0, 5, 1 },
+       { 0, 5, 6 },
+       { 0, 1, 2, 5 },
+       { 0, 1, 5, 2 },
+       { 0, 1, 5, 6 },
+       { 0, 5, 1, 2 },
+       { 0, 5, 1, 6 },
+       { 0, 5, 6, 1 },
+       { 0, 5, 6, 7 },
+       { 0, 1, 2, 5, 6 },
+       { 0, 1, 5, 2, 6 },
+       { 0, 1, 5, 6, 2 },
+       { 0, 1, 5, 6, 7 },
+       { 0, 5, 1, 2, 6 },
+       { 0, 5, 1, 6, 2 },
+       { 0, 5, 1, 6, 7 },
+       { 0, 5, 6, 1, 2 },
+       { 0, 5, 6, 1, 7 },
+       { 0, 5, 6, 7, 1 },
+       { 0, 1, 2, 5, 6, 7 },
+       { 0, 1, 5, 2, 6, 7 },
+       { 0, 1, 5, 6, 2, 7 },
+       { 0, 1, 5, 6, 7, 2 },
+       { 0, 5, 1, 2, 6, 7 },
+       { 0, 5, 1, 6, 2, 7 },
+       { 0, 5, 1, 6, 7, 2 },
+       { 0, 5, 6, 1, 2, 7 },
+       { 0, 5, 6, 1, 7, 2 },
+       { 0, 5, 6, 7, 1, 2 }
+  };
+  for (size_t Len = 1; Len < 8; Len++) {
+    std::set<Unit> FoundUnits, ExpectedUnitsWitThisLength;
+    for (int Iter = 0; Iter < 3000; Iter++) {
+      CrossOver(A, B, &C, Len);
+      FoundUnits.insert(C);
+    }
+    for (const Unit &U : Expected)
+      if (U.size() <= Len)
+        ExpectedUnitsWitThisLength.insert(U);
+    EXPECT_EQ(ExpectedUnitsWitThisLength, FoundUnits);
+  }
+}
diff --git a/lib/Fuzzer/test/InfiniteTest.cpp b/lib/Fuzzer/test/InfiniteTest.cpp
new file mode 100644
index 0000000..dcb3030
--- /dev/null
+++ b/lib/Fuzzer/test/InfiniteTest.cpp
@@ -0,0 +1,20 @@
+// Simple test for a fuzzer. The fuzzer must find the string "Hi!".
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+
+static volatile int Sink;
+
+extern "C" void TestOneInput(const uint8_t *Data, size_t Size) {
+  if (Size > 0 && Data[0] == 'H') {
+    Sink = 1;
+    if (Size > 1 && Data[1] == 'i') {
+      Sink = 2;
+      if (Size > 2 && Data[2] == '!') {
+        Sink = 2;
+      }
+    }
+  }
+}
+
diff --git a/lib/Fuzzer/test/NullDerefTest.cpp b/lib/Fuzzer/test/NullDerefTest.cpp
new file mode 100644
index 0000000..8811e38
--- /dev/null
+++ b/lib/Fuzzer/test/NullDerefTest.cpp
@@ -0,0 +1,22 @@
+// Simple test for a fuzzer. The fuzzer must find the string "Hi!".
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+
+static volatile int Sink;
+static volatile int *Null = 0;
+
+extern "C" void TestOneInput(const uint8_t *Data, size_t Size) {
+  if (Size > 0 && Data[0] == 'H') {
+    Sink = 1;
+    if (Size > 1 && Data[1] == 'i') {
+      Sink = 2;
+      if (Size > 2 && Data[2] == '!') {
+        std::cout << "Found the target, dereferencing NULL\n";
+        *Null = 1;
+      }
+    }
+  }
+}
+
diff --git a/lib/Fuzzer/test/SimpleTest.cpp b/lib/Fuzzer/test/SimpleTest.cpp
new file mode 100644
index 0000000..adb90ce
--- /dev/null
+++ b/lib/Fuzzer/test/SimpleTest.cpp
@@ -0,0 +1,21 @@
+// Simple test for a fuzzer. The fuzzer must find the string "Hi!".
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+
+static volatile int Sink;
+
+extern "C" void TestOneInput(const uint8_t *Data, size_t Size) {
+  if (Size > 0 && Data[0] == 'H') {
+    Sink = 1;
+    if (Size > 1 && Data[1] == 'i') {
+      Sink = 2;
+      if (Size > 2 && Data[2] == '!') {
+        std::cout << "Found the target, exiting\n";
+        exit(0);
+      }
+    }
+  }
+}
+
diff --git a/lib/Fuzzer/test/TimeoutTest.cpp b/lib/Fuzzer/test/TimeoutTest.cpp
new file mode 100644
index 0000000..23683ce
--- /dev/null
+++ b/lib/Fuzzer/test/TimeoutTest.cpp
@@ -0,0 +1,22 @@
+// Simple test for a fuzzer. The fuzzer must find the string "Hi!".
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+
+static volatile int Sink;
+
+extern "C" void TestOneInput(const uint8_t *Data, size_t Size) {
+  if (Size > 0 && Data[0] == 'H') {
+    Sink = 1;
+    if (Size > 1 && Data[1] == 'i') {
+      Sink = 2;
+      if (Size > 2 && Data[2] == '!') {
+        Sink = 2;
+        while (Sink)
+          ;
+      }
+    }
+  }
+}
+
diff --git a/lib/Fuzzer/test/fuzzer.test b/lib/Fuzzer/test/fuzzer.test
new file mode 100644
index 0000000..1e42e72
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer.test
@@ -0,0 +1,19 @@
+RUN: ./LLVMFuzzer-SimpleTest 2>&1 | FileCheck %s --check-prefix=SimpleTest
+SimpleTest: Found the target, exiting
+
+RUN: not ./LLVMFuzzer-InfiniteTest -timeout=2 2>&1 | FileCheck %s --check-prefix=InfiniteTest
+InfiniteTest: ALARM: working on the last Unit for
+InfiniteTest-NOT: CRASHED; file written to timeout
+
+RUN: not ./LLVMFuzzer-TimeoutTest -timeout=5 2>&1 | FileCheck %s --check-prefix=TimeoutTest
+TimeoutTest: ALARM: working on the last Unit for
+TimeoutTest: CRASHED; file written to timeout
+
+RUN: not ./LLVMFuzzer-NullDerefTest 2>&1 | FileCheck %s --check-prefix=NullDerefTest
+NullDerefTest: CRASHED; file written to crash-
+
+RUN: not ./LLVMFuzzer-FullCoverageSetTest -timeout=15 -seed=1 -mutate_depth=2 -use_full_coverage_set=1 2>&1 | FileCheck %s --check-prefix=FullCoverageSetTest
+FullCoverageSetTest: BINGO
+
+RUN: not ./LLVMFuzzer-FourIndependentBranchesTest -timeout=15 -seed=1 -use_coverage_pairs=1 2>&1 | FileCheck %s --check-prefix=FourIndependentBranchesTest
+FourIndependentBranchesTest: BINGO
diff --git a/lib/Fuzzer/test/lit.cfg b/lib/Fuzzer/test/lit.cfg
new file mode 100644
index 0000000..834a16ae
--- /dev/null
+++ b/lib/Fuzzer/test/lit.cfg
@@ -0,0 +1,14 @@
+import lit.formats
+
+config.name = "LLVMFuzzer"
+config.test_format = lit.formats.ShTest(True)
+config.suffixes = ['.test']
+config.test_source_root = os.path.dirname(__file__)
+
+# Tweak PATH to include llvm tools dir.
+llvm_tools_dir = getattr(config, 'llvm_tools_dir', None)
+if (not llvm_tools_dir) or (not os.path.exists(llvm_tools_dir)):
+  lit_config.fatal("Invalid llvm_tools_dir config attribute: %r" % llvm_tools_dir)
+path = os.path.pathsep.join((llvm_tools_dir, config.environment['PATH']))
+config.environment['PATH'] = path
+
diff --git a/lib/Fuzzer/test/lit.site.cfg.in b/lib/Fuzzer/test/lit.site.cfg.in
new file mode 100644
index 0000000..e520db8
--- /dev/null
+++ b/lib/Fuzzer/test/lit.site.cfg.in
@@ -0,0 +1,3 @@
+config.test_exec_root = "@CMAKE_CURRENT_BINARY_DIR@"
+config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")
diff --git a/lib/Fuzzer/test/unit/lit.cfg b/lib/Fuzzer/test/unit/lit.cfg
new file mode 100644
index 0000000..0cc3193
--- /dev/null
+++ b/lib/Fuzzer/test/unit/lit.cfg
@@ -0,0 +1,7 @@
+import lit.formats
+
+config.name = "LLVMFuzzer-Unittest"
+print config.test_exec_root
+config.test_format = lit.formats.GoogleTest(".", "Unittest")
+config.suffixes = []
+config.test_source_root = config.test_exec_root
diff --git a/lib/Fuzzer/test/unit/lit.site.cfg.in b/lib/Fuzzer/test/unit/lit.site.cfg.in
new file mode 100644
index 0000000..114daf4
--- /dev/null
+++ b/lib/Fuzzer/test/unit/lit.site.cfg.in
@@ -0,0 +1,2 @@
+config.test_exec_root = "@CMAKE_CURRENT_BINARY_DIR@"
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/unit/lit.cfg")
diff --git a/lib/IR/Android.mk b/lib/IR/Android.mk
index a3632cf..2ca02f7 100644
--- a/lib/IR/Android.mk
+++ b/lib/IR/Android.mk
@@ -12,6 +12,7 @@ vmcore_SRC_FILES := \
   Core.cpp \
   DataLayout.cpp \
   DebugInfo.cpp \
+  DebugInfoMetadata.cpp \
   DebugLoc.cpp \
   DiagnosticInfo.cpp \
   DiagnosticPrinter.cpp \
@@ -29,15 +30,16 @@ vmcore_SRC_FILES := \
   IntrinsicInst.cpp \
   LLVMContext.cpp \
   LLVMContextImpl.cpp \
-  LeakDetector.cpp \
   LegacyPassManager.cpp \
   Mangler.cpp \
   MDBuilder.cpp \
   Metadata.cpp \
+  MetadataTracking.cpp \
   Module.cpp \
   Pass.cpp \
   PassManager.cpp \
   PassRegistry.cpp \
+  Statepoint.cpp \
   Type.cpp \
   TypeFinder.cpp \
   Use.cpp \
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 1961a20..de0e614 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -101,6 +101,11 @@ static OrderMap orderModule(const Module *M) {
     if (F.hasPrefixData())
       if (!isa<GlobalValue>(F.getPrefixData()))
         orderValue(F.getPrefixData(), OM);
+
+    if (F.hasPrologueData())
+      if (!isa<GlobalValue>(F.getPrologueData()))
+        orderValue(F.getPrologueData(), OM);
+
     orderValue(&F, OM);
 
     if (F.isDeclaration())
@@ -282,6 +287,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::AnyReg:        Out << "anyregcc"; break;
   case CallingConv::PreserveMost:  Out << "preserve_mostcc"; break;
   case CallingConv::PreserveAll:   Out << "preserve_allcc"; break;
+  case CallingConv::GHC:           Out << "ghccc"; break;
   case CallingConv::X86_StdCall:   Out << "x86_stdcallcc"; break;
   case CallingConv::X86_FastCall:  Out << "x86_fastcallcc"; break;
   case CallingConv::X86_ThisCall:  Out << "x86_thiscallcc"; break;
@@ -600,8 +606,8 @@ private:
   /// Add all of the functions arguments, basic blocks, and instructions.
   void processFunction();
 
-  SlotTracker(const SlotTracker &) LLVM_DELETED_FUNCTION;
-  void operator=(const SlotTracker &) LLVM_DELETED_FUNCTION;
+  SlotTracker(const SlotTracker &) = delete;
+  void operator=(const SlotTracker &) = delete;
 };
 
 SlotTracker *createSlotTracker(const Module *M) {
@@ -628,13 +634,6 @@ static SlotTracker *createSlotTracker(const Value *V) {
   if (const Function *Func = dyn_cast<Function>(V))
     return new SlotTracker(Func);
 
-  if (const MDNode *MD = dyn_cast<MDNode>(V)) {
-    if (!MD->isFunctionLocal())
-      return new SlotTracker(MD->getFunction());
-
-    return new SlotTracker((Function *)nullptr);
-  }
-
   return nullptr;
 }
 
@@ -647,16 +646,14 @@ static SlotTracker *createSlotTracker(const Value *V) {
 // Module level constructor. Causes the contents of the Module (sans functions)
 // to be added to the slot table.
 SlotTracker::SlotTracker(const Module *M)
-  : TheModule(M), TheFunction(nullptr), FunctionProcessed(false),
-    mNext(0), fNext(0),  mdnNext(0), asNext(0) {
-}
+    : TheModule(M), TheFunction(nullptr), FunctionProcessed(false), mNext(0),
+      fNext(0), mdnNext(0), asNext(0) {}
 
 // Function level constructor. Causes the contents of the Module and the one
 // function provided to be added to the slot table.
 SlotTracker::SlotTracker(const Function *F)
-  : TheModule(F ? F->getParent() : nullptr), TheFunction(F),
-    FunctionProcessed(false), mNext(0), fNext(0), mdnNext(0), asNext(0) {
-}
+    : TheModule(F ? F->getParent() : nullptr), TheFunction(F),
+      FunctionProcessed(false), mNext(0), fNext(0), mdnNext(0), asNext(0) {}
 
 inline void SlotTracker::initialize() {
   if (TheModule) {
@@ -738,8 +735,9 @@ void SlotTracker::processFunction() {
         if (Function *F = CI->getCalledFunction())
           if (F->isIntrinsic())
             for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
-              if (MDNode *N = dyn_cast_or_null<MDNode>(I->getOperand(i)))
-                CreateMetadataSlot(N);
+              if (auto *V = dyn_cast_or_null<MetadataAsValue>(I->getOperand(i)))
+                if (MDNode *N = dyn_cast<MDNode>(V->getMetadata()))
+                  CreateMetadataSlot(N);
 
         // Add all the call attributes to the table.
         AttributeSet Attrs = CI->getAttributes().getFnAttributes();
@@ -850,16 +848,10 @@ void SlotTracker::CreateFunctionSlot(const Value *V) {
 void SlotTracker::CreateMetadataSlot(const MDNode *N) {
   assert(N && "Can't insert a null Value into SlotTracker!");
 
-  // Don't insert if N is a function-local metadata, these are always printed
-  // inline.
-  if (!N->isFunctionLocal()) {
-    mdn_iterator I = mdnMap.find(N);
-    if (I != mdnMap.end())
-      return;
-
-    unsigned DestSlot = mdnNext++;
-    mdnMap[N] = DestSlot;
-  }
+  unsigned DestSlot = mdnNext;
+  if (!mdnMap.insert(std::make_pair(N, DestSlot)).second)
+    return;
+  ++mdnNext;
 
   // Recursively add any MDNodes referenced by operands.
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
@@ -888,6 +880,11 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
                                    SlotTracker *Machine,
                                    const Module *Context);
 
+static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD,
+                                   TypePrinting *TypePrinter,
+                                   SlotTracker *Machine, const Module *Context,
+                                   bool FromValue = false);
+
 static const char *getPredicateText(unsigned predicate) {
   const char * pred = "unknown";
   switch (predicate) {
@@ -1252,20 +1249,21 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
   Out << "<placeholder or erroneous Constant>";
 }
 
-static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
-                                    TypePrinting *TypePrinter,
-                                    SlotTracker *Machine,
-                                    const Module *Context) {
+static void writeMDTuple(raw_ostream &Out, const MDTuple *Node,
+                         TypePrinting *TypePrinter, SlotTracker *Machine,
+                         const Module *Context) {
   Out << "!{";
   for (unsigned mi = 0, me = Node->getNumOperands(); mi != me; ++mi) {
-    const Value *V = Node->getOperand(mi);
-    if (!V)
+    const Metadata *MD = Node->getOperand(mi);
+    if (!MD)
       Out << "null";
-    else {
+    else if (auto *MDV = dyn_cast<ValueAsMetadata>(MD)) {
+      Value *V = MDV->getValue();
       TypePrinter->print(V->getType(), Out);
       Out << ' ';
-      WriteAsOperandInternal(Out, Node->getOperand(mi),
-                             TypePrinter, Machine, Context);
+      WriteAsOperandInternal(Out, V, TypePrinter, Machine, Context);
+    } else {
+      WriteAsOperandInternal(Out, MD, TypePrinter, Machine, Context);
     }
     if (mi + 1 != me)
       Out << ", ";
@@ -1274,6 +1272,618 @@ static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
   Out << "}";
 }
 
+namespace {
+struct FieldSeparator {
+  bool Skip;
+  const char *Sep;
+  FieldSeparator(const char *Sep = ", ") : Skip(true), Sep(Sep) {}
+};
+raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
+  if (FS.Skip) {
+    FS.Skip = false;
+    return OS;
+  }
+  return OS << FS.Sep;
+}
+} // end namespace
+
+static void writeMetadataAsOperand(raw_ostream &Out, const Metadata *MD,
+                                   TypePrinting *TypePrinter,
+                                   SlotTracker *Machine,
+                                   const Module *Context) {
+  if (!MD) {
+    Out << "null";
+    return;
+  }
+  WriteAsOperandInternal(Out, MD, TypePrinter, Machine, Context);
+}
+
+static void writeTag(raw_ostream &Out, FieldSeparator &FS, const DebugNode *N) {
+  Out << FS << "tag: ";
+  if (const char *Tag = dwarf::TagString(N->getTag()))
+    Out << Tag;
+  else
+    Out << N->getTag();
+}
+
+static void writeGenericDebugNode(raw_ostream &Out, const GenericDebugNode *N,
+                                  TypePrinting *TypePrinter,
+                                  SlotTracker *Machine, const Module *Context) {
+  Out << "!GenericDebugNode(";
+  FieldSeparator FS;
+  writeTag(Out, FS, N);
+  if (!N->getHeader().empty()) {
+    Out << FS << "header: \"";
+    PrintEscapedString(N->getHeader(), Out);
+    Out << "\"";
+  }
+  if (N->getNumDwarfOperands()) {
+    Out << FS << "operands: {";
+    FieldSeparator IFS;
+    for (auto &I : N->dwarf_operands()) {
+      Out << IFS;
+      writeMetadataAsOperand(Out, I, TypePrinter, Machine, Context);
+    }
+    Out << "}";
+  }
+  Out << ")";
+}
+
+static void writeMDLocation(raw_ostream &Out, const MDLocation *DL,
+                            TypePrinting *TypePrinter, SlotTracker *Machine,
+                            const Module *Context) {
+  Out << "!MDLocation(";
+  FieldSeparator FS;
+  // Always output the line, since 0 is a relevant and important value for it.
+  Out << FS << "line: " << DL->getLine();
+  if (DL->getColumn())
+    Out << FS << "column: " << DL->getColumn();
+  Out << FS << "scope: ";
+  WriteAsOperandInternal(Out, DL->getScope(), TypePrinter, Machine, Context);
+  if (DL->getInlinedAt()) {
+    Out << FS << "inlinedAt: ";
+    WriteAsOperandInternal(Out, DL->getInlinedAt(), TypePrinter, Machine,
+                           Context);
+  }
+  Out << ")";
+}
+
+static void writeMDSubrange(raw_ostream &Out, const MDSubrange *N,
+                            TypePrinting *, SlotTracker *, const Module *) {
+  Out << "!MDSubrange(";
+  FieldSeparator FS;
+  Out << FS << "count: " << N->getCount();
+  if (N->getLo())
+    Out << FS << "lowerBound: " << N->getLo();
+  Out << ")";
+}
+
+static void writeMDEnumerator(raw_ostream &Out, const MDEnumerator *N,
+                              TypePrinting *, SlotTracker *, const Module *) {
+  Out << "!MDEnumerator(";
+  FieldSeparator FS;
+  Out << FS << "name: \"" << N->getName() << "\"";
+  Out << FS << "value: " << N->getValue();
+  Out << ")";
+}
+
+static void writeMDBasicType(raw_ostream &Out, const MDBasicType *N,
+                             TypePrinting *, SlotTracker *, const Module *) {
+  Out << "!MDBasicType(";
+  FieldSeparator FS;
+  writeTag(Out, FS, N);
+  if (!N->getName().empty())
+    Out << FS << "name: \"" << N->getName() << "\"";
+  if (N->getSizeInBits())
+    Out << FS << "size: " << N->getSizeInBits();
+  if (N->getAlignInBits())
+    Out << FS << "align: " << N->getAlignInBits();
+  if (unsigned Encoding = N->getEncoding()) {
+    Out << FS << "encoding: ";
+    if (const char *S = dwarf::AttributeEncodingString(Encoding))
+      Out << S;
+    else
+      Out << Encoding;
+  }
+  Out << ")";
+}
+
+static void writeDIFlags(raw_ostream &Out, unsigned Flags) {
+  SmallVector<unsigned, 8> SplitFlags;
+  unsigned Extra = DIDescriptor::splitFlags(Flags, SplitFlags);
+
+  FieldSeparator FS(" | ");
+  for (unsigned F : SplitFlags) {
+    const char *StringF = DIDescriptor::getFlagString(F);
+    assert(StringF && "Expected valid flag");
+    Out << FS << StringF;
+  }
+  if (Extra || SplitFlags.empty())
+    Out << FS << Extra;
+}
+
+static void writeMDDerivedType(raw_ostream &Out, const MDDerivedType *N,
+                               TypePrinting *TypePrinter, SlotTracker *Machine,
+                               const Module *Context) {
+  Out << "!MDDerivedType(";
+  FieldSeparator FS;
+  writeTag(Out, FS, N);
+  if (!N->getName().empty())
+    Out << FS << "name: \"" << N->getName() << "\"";
+  if (N->getFile()) {
+    Out << FS << "file: ";
+    writeMetadataAsOperand(Out, N->getFile(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getLine())
+    Out << FS << "line: " << N->getLine();
+  if (N->getScope()) {
+    Out << FS << "scope: ";
+    writeMetadataAsOperand(Out, N->getScope(), TypePrinter, Machine, Context);
+  }
+  Out << FS << "baseType: ";
+  writeMetadataAsOperand(Out, N->getBaseType(), TypePrinter, Machine, Context);
+  if (N->getSizeInBits())
+    Out << FS << "size: " << N->getSizeInBits();
+  if (N->getAlignInBits())
+    Out << FS << "align: " << N->getAlignInBits();
+  if (N->getOffsetInBits())
+    Out << FS << "offset: " << N->getOffsetInBits();
+  if (auto Flags = N->getFlags()) {
+    Out << FS << "flags: ";
+    writeDIFlags(Out, Flags);
+  }
+  if (N->getExtraData()) {
+    Out << FS << "extraData: ";
+    writeMetadataAsOperand(Out, N->getExtraData(), TypePrinter, Machine,
+                           Context);
+  }
+  Out << ")";
+}
+
+static void writeMDCompositeType(raw_ostream &Out, const MDCompositeType *N,
+                                 TypePrinting *TypePrinter,
+                                 SlotTracker *Machine, const Module *Context) {
+  Out << "!MDCompositeType(";
+  FieldSeparator FS;
+  writeTag(Out, FS, N);
+  if (!N->getName().empty())
+    Out << FS << "name: \"" << N->getName() << "\"";
+  if (N->getFile()) {
+    Out << FS << "file: ";
+    writeMetadataAsOperand(Out, N->getFile(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getLine())
+    Out << FS << "line: " << N->getLine();
+  if (N->getScope()) {
+    Out << FS << "scope: ";
+    writeMetadataAsOperand(Out, N->getScope(), TypePrinter, Machine, Context);
+  }
+  if (N->getBaseType()) {
+    Out << FS << "baseType: ";
+    writeMetadataAsOperand(Out, N->getBaseType(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getSizeInBits())
+    Out << FS << "size: " << N->getSizeInBits();
+  if (N->getAlignInBits())
+    Out << FS << "align: " << N->getAlignInBits();
+  if (N->getOffsetInBits())
+    Out << FS << "offset: " << N->getOffsetInBits();
+  if (auto Flags = N->getFlags()) {
+    Out << FS << "flags: ";
+    writeDIFlags(Out, Flags);
+  }
+  if (N->getElements()) {
+    Out << FS << "elements: ";
+    writeMetadataAsOperand(Out, N->getElements(), TypePrinter, Machine,
+                           Context);
+  }
+  if (unsigned Lang = N->getRuntimeLang()) {
+    Out << FS << "runtimeLang: ";
+    if (const char *S = dwarf::LanguageString(Lang))
+      Out << S;
+    else
+      Out << Lang;
+  }
+
+  if (N->getVTableHolder()) {
+    Out << FS << "vtableHolder: ";
+    writeMetadataAsOperand(Out, N->getVTableHolder(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getTemplateParams()) {
+    Out << FS << "templateParams: ";
+    writeMetadataAsOperand(Out, N->getTemplateParams(), TypePrinter, Machine,
+                           Context);
+  }
+  if (!N->getIdentifier().empty())
+    Out << FS << "identifier: \"" << N->getIdentifier() << "\"";
+  Out << ")";
+}
+
+static void writeMDSubroutineType(raw_ostream &Out, const MDSubroutineType *N,
+                                  TypePrinting *TypePrinter,
+                                  SlotTracker *Machine, const Module *Context) {
+  Out << "!MDSubroutineType(";
+  FieldSeparator FS;
+  if (auto Flags = N->getFlags()) {
+    Out << FS << "flags: ";
+    writeDIFlags(Out, Flags);
+  }
+  Out << FS << "types: ";
+  writeMetadataAsOperand(Out, N->getTypeArray(), TypePrinter, Machine, Context);
+  Out << ")";
+}
+
+static void writeMDFile(raw_ostream &Out, const MDFile *N, TypePrinting *,
+                        SlotTracker *, const Module *) {
+  Out << "!MDFile(";
+  FieldSeparator FS;
+  Out << FS << "filename: \"" << N->getFilename() << "\"";
+  Out << FS << "directory: \"" << N->getDirectory() << "\"";
+  Out << ")";
+}
+
+static void writeMDCompileUnit(raw_ostream &Out, const MDCompileUnit *N,
+                               TypePrinting *TypePrinter, SlotTracker *Machine,
+                               const Module *Context) {
+  Out << "!MDCompileUnit(";
+  FieldSeparator FS;
+  Out << FS << "language: ";
+  if (const char *Lang = dwarf::LanguageString(N->getSourceLanguage()))
+    Out << Lang;
+  else
+    Out << N->getSourceLanguage();
+  Out << FS << "file: ";
+  writeMetadataAsOperand(Out, N->getFile(), TypePrinter, Machine, Context);
+  if (!N->getProducer().empty())
+    Out << FS << "producer: \"" << N->getProducer() << "\"";
+  Out << FS << "isOptimized: " << (N->isOptimized() ? "true" : "false");
+  if (!N->getFlags().empty())
+    Out << FS << "flags: \"" << N->getFlags() << "\"";
+  Out << FS << "runtimeVersion: " << N->getRuntimeVersion();
+  if (!N->getSplitDebugFilename().empty())
+    Out << FS << "splitDebugFilename: \"" << N->getSplitDebugFilename() << "\"";
+  Out << FS << "emissionKind: " << N->getEmissionKind();
+  if (N->getEnumTypes()) {
+    Out << FS << "enums: ";
+    writeMetadataAsOperand(Out, N->getEnumTypes(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getRetainedTypes()) {
+    Out << FS << "retainedTypes: ";
+    writeMetadataAsOperand(Out, N->getRetainedTypes(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getSubprograms()) {
+    Out << FS << "subprograms: ";
+    writeMetadataAsOperand(Out, N->getSubprograms(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getGlobalVariables()) {
+    Out << FS << "globals: ";
+    writeMetadataAsOperand(Out, N->getGlobalVariables(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getImportedEntities()) {
+    Out << FS << "imports: ";
+    writeMetadataAsOperand(Out, N->getImportedEntities(), TypePrinter, Machine,
+                           Context);
+  }
+  Out << ")";
+}
+
+static void writeMDSubprogram(raw_ostream &Out, const MDSubprogram *N,
+                              TypePrinting *TypePrinter, SlotTracker *Machine,
+                              const Module *Context) {
+  Out << "!MDSubprogram(";
+  FieldSeparator FS;
+  Out << FS << "scope: ";
+  writeMetadataAsOperand(Out, N->getScope(), TypePrinter, Machine, Context);
+  Out << FS << "name: \"" << N->getName() << "\"";
+  if (!N->getLinkageName().empty())
+    Out << FS << "linkageName: \"" << N->getLinkageName() << "\"";
+  if (N->getFile()) {
+    Out << FS << "file: ";
+    writeMetadataAsOperand(Out, N->getFile(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getLine())
+    Out << FS << "line: " << N->getLine();
+  if (N->getType()) {
+    Out << FS << "type: ";
+    writeMetadataAsOperand(Out, N->getType(), TypePrinter, Machine,
+                           Context);
+  }
+  Out << FS << "isLocal: " << (N->isLocalToUnit() ? "true" : "false");
+  Out << FS << "isDefinition: " << (N->isDefinition() ? "true" : "false");
+  if (N->getScopeLine())
+    Out << FS << "scopeLine: " << N->getScopeLine();
+  if (N->getContainingType()) {
+    Out << FS << "containingType: ";
+    writeMetadataAsOperand(Out, N->getContainingType(), TypePrinter, Machine,
+                           Context);
+  }
+  if (unsigned V = N->getVirtuality()) {
+    Out << FS << "virtuality: ";
+    if (const char *S = dwarf::VirtualityString(V))
+      Out << S;
+    else
+      Out << V;
+  }
+  if (N->getVirtualIndex())
+    Out << FS << "virtualIndex: " << N->getVirtualIndex();
+  if (auto Flags = N->getFlags()) {
+    Out << FS << "flags: ";
+    writeDIFlags(Out, Flags);
+  }
+  Out << FS << "isOptimized: " << (N->isOptimized() ? "true" : "false");
+  if (N->getFunction()) {
+    Out << FS << "function: ";
+    writeMetadataAsOperand(Out, N->getFunction(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getTemplateParams()) {
+    Out << FS << "templateParams: ";
+    writeMetadataAsOperand(Out, N->getTemplateParams(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getDeclaration()) {
+    Out << FS << "declaration: ";
+    writeMetadataAsOperand(Out, N->getDeclaration(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getVariables()) {
+    Out << FS << "variables: ";
+    writeMetadataAsOperand(Out, N->getVariables(), TypePrinter, Machine,
+                           Context);
+  }
+  Out << ")";
+}
+
+static void writeMDLexicalBlock(raw_ostream &Out, const MDLexicalBlock *N,
+                              TypePrinting *TypePrinter, SlotTracker *Machine,
+                              const Module *Context) {
+  Out << "!MDLexicalBlock(";
+  FieldSeparator FS;
+  Out << FS << "scope: ";
+  writeMetadataAsOperand(Out, N->getScope(), TypePrinter, Machine, Context);
+  if (N->getFile()) {
+    Out << FS << "file: ";
+    writeMetadataAsOperand(Out, N->getFile(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getLine())
+    Out << FS << "line: " << N->getLine();
+  if (N->getColumn())
+    Out << FS << "column: " << N->getColumn();
+  Out << ")";
+}
+
+static void writeMDLexicalBlockFile(raw_ostream &Out,
+                                    const MDLexicalBlockFile *N,
+                                    TypePrinting *TypePrinter,
+                                    SlotTracker *Machine,
+                                    const Module *Context) {
+  Out << "!MDLexicalBlockFile(";
+  FieldSeparator FS;
+  Out << FS << "scope: ";
+  writeMetadataAsOperand(Out, N->getScope(), TypePrinter, Machine, Context);
+  if (N->getFile()) {
+    Out << FS << "file: ";
+    writeMetadataAsOperand(Out, N->getFile(), TypePrinter, Machine,
+                           Context);
+  }
+  Out << FS << "discriminator: " << N->getDiscriminator();
+  Out << ")";
+}
+
+static void writeMDNamespace(raw_ostream &Out, const MDNamespace *N,
+                             TypePrinting *TypePrinter, SlotTracker *Machine,
+                             const Module *Context) {
+  Out << "!MDNamespace(";
+  FieldSeparator FS;
+  Out << FS << "scope: ";
+  writeMetadataAsOperand(Out, N->getScope(), TypePrinter, Machine, Context);
+  if (N->getFile()) {
+    Out << FS << "file: ";
+    writeMetadataAsOperand(Out, N->getFile(), TypePrinter, Machine, Context);
+  }
+  if (!N->getName().empty())
+    Out << FS << "name: \"" << N->getName() << "\"";
+  if (N->getLine())
+    Out << FS << "line: " << N->getLine();
+  Out << ")";
+}
+
+static void writeMDTemplateTypeParameter(raw_ostream &Out,
+                                         const MDTemplateTypeParameter *N,
+                                         TypePrinting *TypePrinter,
+                                         SlotTracker *Machine,
+                                         const Module *Context) {
+  Out << "!MDTemplateTypeParameter(";
+  FieldSeparator FS;
+  Out << FS << "name: \"" << N->getName() << "\"";
+  Out << FS << "type: ";
+  writeMetadataAsOperand(Out, N->getType(), TypePrinter, Machine, Context);
+  Out << ")";
+}
+
+static void writeMDTemplateValueParameter(raw_ostream &Out,
+                                          const MDTemplateValueParameter *N,
+                                          TypePrinting *TypePrinter,
+                                          SlotTracker *Machine,
+                                          const Module *Context) {
+  Out << "!MDTemplateValueParameter(";
+  FieldSeparator FS;
+  writeTag(Out, FS, N);
+  Out << FS << "name: \"" << N->getName() << "\"";
+  Out << FS << "type: ";
+  writeMetadataAsOperand(Out, N->getType(), TypePrinter, Machine, Context);
+  Out << FS << "value: ";
+  writeMetadataAsOperand(Out, N->getValue(), TypePrinter, Machine, Context);
+  Out << ")";
+}
+
+static void writeMDGlobalVariable(raw_ostream &Out, const MDGlobalVariable *N,
+                                  TypePrinting *TypePrinter,
+                                  SlotTracker *Machine, const Module *Context) {
+  Out << "!MDGlobalVariable(";
+  FieldSeparator FS;
+  Out << FS << "scope: ";
+  writeMetadataAsOperand(Out, N->getScope(), TypePrinter, Machine, Context);
+  Out << FS << "name: \"" << N->getName() << "\"";
+  if (!N->getLinkageName().empty())
+    Out << FS << "linkageName: \"" << N->getLinkageName() << "\"";
+  if (N->getFile()) {
+    Out << FS << "file: ";
+    writeMetadataAsOperand(Out, N->getFile(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getLine())
+    Out << FS << "line: " << N->getLine();
+  if (N->getType()) {
+    Out << FS << "type: ";
+    writeMetadataAsOperand(Out, N->getType(), TypePrinter, Machine,
+                           Context);
+  }
+  Out << FS << "isLocal: " << (N->isLocalToUnit() ? "true" : "false");
+  Out << FS << "isDefinition: " << (N->isDefinition() ? "true" : "false");
+  if (N->getVariable()) {
+    Out << FS << "variable: ";
+    writeMetadataAsOperand(Out, N->getVariable(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getStaticDataMemberDeclaration()) {
+    Out << FS << "declaration: ";
+    writeMetadataAsOperand(Out, N->getStaticDataMemberDeclaration(),
+                           TypePrinter, Machine, Context);
+  }
+  Out << ")";
+}
+
+static void writeMDLocalVariable(raw_ostream &Out, const MDLocalVariable *N,
+                                 TypePrinting *TypePrinter,
+                                 SlotTracker *Machine, const Module *Context) {
+  Out << "!MDLocalVariable(";
+  FieldSeparator FS;
+  writeTag(Out, FS, N);
+  Out << FS << "scope: ";
+  writeMetadataAsOperand(Out, N->getScope(), TypePrinter, Machine, Context);
+  Out << FS << "name: \"" << N->getName() << "\"";
+  if (N->getFile()) {
+    Out << FS << "file: ";
+    writeMetadataAsOperand(Out, N->getFile(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getLine())
+    Out << FS << "line: " << N->getLine();
+  if (N->getType()) {
+    Out << FS << "type: ";
+    writeMetadataAsOperand(Out, N->getType(), TypePrinter, Machine,
+                           Context);
+  }
+  if (N->getTag() == dwarf::DW_TAG_arg_variable || N->getArg())
+    Out << FS << "arg: " << N->getArg();
+  if (auto Flags = N->getFlags()) {
+    Out << FS << "flags: ";
+    writeDIFlags(Out, Flags);
+  }
+  if (N->getInlinedAt()) {
+    Out << FS << "inlinedAt: ";
+    writeMetadataAsOperand(Out, N->getInlinedAt(), TypePrinter, Machine,
+                           Context);
+  }
+  Out << ")";
+}
+
+static void writeMDExpression(raw_ostream &Out, const MDExpression *N,
+                              TypePrinting *TypePrinter, SlotTracker *Machine,
+                              const Module *Context) {
+  Out << "!MDExpression(";
+  FieldSeparator FS;
+  if (N->isValid()) {
+    for (auto I = N->expr_op_begin(), E = N->expr_op_end(); I != E; ++I) {
+      const char *OpStr = dwarf::OperationEncodingString(I->getOp());
+      assert(OpStr && "Expected valid opcode");
+
+      Out << FS << OpStr;
+      for (unsigned A = 0, AE = I->getNumArgs(); A != AE; ++A)
+        Out << FS << I->getArg(A);
+    }
+  } else {
+    for (const auto &I : N->getElements())
+      Out << FS << I;
+  }
+  Out << ")";
+}
+
+static void writeMDObjCProperty(raw_ostream &Out, const MDObjCProperty *N,
+                                TypePrinting *TypePrinter, SlotTracker *Machine,
+                                const Module *Context) {
+  Out << "!MDObjCProperty(";
+  FieldSeparator FS;
+  Out << FS << "name: \"" << N->getName() << "\"";
+  if (N->getFile()) {
+    Out << FS << "file: ";
+    writeMetadataAsOperand(Out, N->getFile(), TypePrinter, Machine, Context);
+  }
+  if (N->getLine())
+    Out << FS << "line: " << N->getLine();
+  if (!N->getSetterName().empty())
+    Out << FS << "setter: \"" << N->getSetterName() << "\"";
+  if (!N->getGetterName().empty())
+    Out << FS << "getter: \"" << N->getGetterName() << "\"";
+  if (N->getAttributes())
+    Out << FS << "attributes: " << N->getAttributes();
+  if (N->getType()) {
+    Out << FS << "type: ";
+    writeMetadataAsOperand(Out, N->getType(), TypePrinter, Machine, Context);
+  }
+  Out << ")";
+}
+
+static void writeMDImportedEntity(raw_ostream &Out, const MDImportedEntity *N,
+                                  TypePrinting *TypePrinter,
+                                  SlotTracker *Machine, const Module *Context) {
+  Out << "!MDImportedEntity(";
+  FieldSeparator FS;
+  writeTag(Out, FS, N);
+  Out << FS << "scope: ";
+  writeMetadataAsOperand(Out, N->getScope(), TypePrinter, Machine, Context);
+  if (N->getEntity()) {
+    Out << FS << "entity: ";
+    writeMetadataAsOperand(Out, N->getEntity(), TypePrinter, Machine, Context);
+  }
+  if (N->getLine())
+    Out << FS << "line: " << N->getLine();
+  Out << FS << "name: \"" << N->getName() << "\"";
+  Out << ")";
+}
+
+
+static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
+                                    TypePrinting *TypePrinter,
+                                    SlotTracker *Machine,
+                                    const Module *Context) {
+  assert(!Node->isTemporary() && "Unexpected forward declaration");
+
+  if (Node->isDistinct())
+    Out << "distinct ";
+
+  switch (Node->getMetadataID()) {
+  default:
+    llvm_unreachable("Expected uniquable MDNode");
+#define HANDLE_MDNODE_LEAF(CLASS)                                              \
+  case Metadata::CLASS##Kind:                                                  \
+    write##CLASS(Out, cast<CLASS>(Node), TypePrinter, Machine, Context);       \
+    break;
+#include "llvm/IR/Metadata.def"
+  }
+}
+
 // Full implementation of printing a Value as an operand with support for
 // TypePrinting, etc.
 static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
@@ -1309,31 +1919,9 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
     return;
   }
 
-  if (const MDNode *N = dyn_cast<MDNode>(V)) {
-    if (N->isFunctionLocal()) {
-      // Print metadata inline, not via slot reference number.
-      WriteMDNodeBodyInternal(Out, N, TypePrinter, Machine, Context);
-      return;
-    }
-
-    if (!Machine) {
-      if (N->isFunctionLocal())
-        Machine = new SlotTracker(N->getFunction());
-      else
-        Machine = new SlotTracker(Context);
-    }
-    int Slot = Machine->getMetadataSlot(N);
-    if (Slot == -1)
-      Out << "<badref>";
-    else
-      Out << '!' << Slot;
-    return;
-  }
-
-  if (const MDString *MDS = dyn_cast<MDString>(V)) {
-    Out << "!\"";
-    PrintEscapedString(MDS->getString(), Out);
-    Out << '"';
+  if (auto *MD = dyn_cast<MetadataAsValue>(V)) {
+    WriteAsOperandInternal(Out, MD->getMetadata(), TypePrinter, Machine,
+                           Context, /* FromValue */ true);
     return;
   }
 
@@ -1376,6 +1964,40 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
     Out << "<badref>";
 }
 
+static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD,
+                                   TypePrinting *TypePrinter,
+                                   SlotTracker *Machine, const Module *Context,
+                                   bool FromValue) {
+  if (const MDNode *N = dyn_cast<MDNode>(MD)) {
+    if (!Machine)
+      Machine = new SlotTracker(Context);
+    int Slot = Machine->getMetadataSlot(N);
+    if (Slot == -1)
+      // Give the pointer value instead of "badref", since this comes up all
+      // the time when debugging.
+      Out << "<" << N << ">";
+    else
+      Out << '!' << Slot;
+    return;
+  }
+
+  if (const MDString *MDS = dyn_cast<MDString>(MD)) {
+    Out << "!\"";
+    PrintEscapedString(MDS->getString(), Out);
+    Out << '"';
+    return;
+  }
+
+  auto *V = cast<ValueAsMetadata>(MD);
+  assert(TypePrinter && "TypePrinter required for metadata values");
+  assert((FromValue || !isa<LocalAsMetadata>(V)) &&
+         "Unexpected function-local metadata outside of value argument");
+
+  TypePrinter->print(V->getValue()->getType(), Out);
+  Out << ' ';
+  WriteAsOperandInternal(Out, V->getValue(), TypePrinter, Machine, Context);
+}
+
 void AssemblyWriter::init() {
   if (!TheModule)
     return;
@@ -1672,6 +2294,24 @@ static void PrintThreadLocalModel(GlobalVariable::ThreadLocalMode TLM,
   }
 }
 
+static void maybePrintComdat(formatted_raw_ostream &Out,
+                             const GlobalObject &GO) {
+  const Comdat *C = GO.getComdat();
+  if (!C)
+    return;
+
+  if (isa<GlobalVariable>(GO))
+    Out << ',';
+  Out << " comdat";
+
+  if (GO.getName() == C->getName())
+    return;
+
+  Out << '(';
+  PrintLLVMName(Out, C->getName(), ComdatPrefix);
+  Out << ')';
+}
+
 void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
   if (GV->isMaterializable())
     Out << "; Materializable\n";
@@ -1705,10 +2345,7 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
     PrintEscapedString(GV->getSection(), Out);
     Out << '"';
   }
-  if (GV->hasComdat()) {
-    Out << ", comdat ";
-    PrintLLVMName(Out, GV->getComdat()->getName(), ComdatPrefix);
-  }
+  maybePrintComdat(Out, *GV);
   if (GV->getAlignment())
     Out << ", align " << GV->getAlignment();
 
@@ -1889,10 +2526,7 @@ void AssemblyWriter::printFunction(const Function *F) {
     PrintEscapedString(F->getSection(), Out);
     Out << '"';
   }
-  if (F->hasComdat()) {
-    Out << " comdat ";
-    PrintLLVMName(Out, F->getComdat()->getName(), ComdatPrefix);
-  }
+  maybePrintComdat(Out, *F);
   if (F->getAlignment())
     Out << " align " << F->getAlignment();
   if (F->hasGC())
@@ -1901,6 +2535,11 @@ void AssemblyWriter::printFunction(const Function *F) {
     Out << " prefix ";
     writeOperand(F->getPrefixData(), true);
   }
+  if (F->hasPrologueData()) {
+    Out << " prologue ";
+    writeOperand(F->getPrologueData(), true);
+  }
+
   if (F->isDeclaration()) {
     Out << '\n';
   } else {
@@ -2340,7 +2979,7 @@ static void WriteMDNodeComment(const MDNode *Node,
   if (Node->getNumOperands() < 1)
     return;
 
-  Value *Op = Node->getOperand(0);
+  Metadata *Op = Node->getOperand(0);
   if (!Op || !isa<MDString>(Op))
     return;
 
@@ -2359,8 +2998,9 @@ static void WriteMDNodeComment(const MDNode *Node,
 }
 
 void AssemblyWriter::writeMDNode(unsigned Slot, const MDNode *Node) {
-  Out << '!' << Slot << " = metadata ";
+  Out << '!' << Slot << " = ";
   printMDNodeBody(Node);
+  Out << "\n";
 }
 
 void AssemblyWriter::writeAllMDNodes() {
@@ -2378,7 +3018,6 @@ void AssemblyWriter::writeAllMDNodes() {
 void AssemblyWriter::printMDNodeBody(const MDNode *Node) {
   WriteMDNodeBodyInternal(Out, Node, &TypePrinter, &Machine, TheModule);
   WriteMDNodeComment(Node, Out);
-  Out << "\n";
 }
 
 void AssemblyWriter::writeAllAttributeGroups() {
@@ -2511,18 +3150,14 @@ void Value::print(raw_ostream &ROS) const {
       W.printFunction(F);
     else
       W.printAlias(cast<GlobalAlias>(GV));
-  } else if (const MDNode *N = dyn_cast<MDNode>(this)) {
-    const Function *F = N->getFunction();
-    SlotTracker SlotTable(F);
-    AssemblyWriter W(OS, SlotTable, F ? F->getParent() : nullptr, nullptr);
-    W.printMDNodeBody(N);
+  } else if (const MetadataAsValue *V = dyn_cast<MetadataAsValue>(this)) {
+    V->getMetadata()->print(ROS);
   } else if (const Constant *C = dyn_cast<Constant>(this)) {
     TypePrinting TypePrinter;
     TypePrinter.print(C->getType(), OS);
     OS << ' ';
     WriteConstantInternal(OS, C, TypePrinter, nullptr, nullptr);
-  } else if (isa<InlineAsm>(this) || isa<MDString>(this) ||
-             isa<Argument>(this)) {
+  } else if (isa<InlineAsm>(this) || isa<Argument>(this)) {
     this->printAsOperand(OS);
   } else {
     llvm_unreachable("Unknown value to print out!");
@@ -2532,9 +3167,8 @@ void Value::print(raw_ostream &ROS) const {
 void Value::printAsOperand(raw_ostream &O, bool PrintType, const Module *M) const {
   // Fast path: Don't construct and populate a TypePrinting object if we
   // won't be needing any types printed.
-  if (!PrintType &&
-      ((!isa<Constant>(this) && !isa<MDNode>(this)) ||
-       hasName() || isa<GlobalValue>(this))) {
+  if (!PrintType && ((!isa<Constant>(this) && !isa<MetadataAsValue>(this)) ||
+                     hasName() || isa<GlobalValue>(this))) {
     WriteAsOperandInternal(O, this, nullptr, nullptr, M);
     return;
   }
@@ -2553,17 +3187,54 @@ void Value::printAsOperand(raw_ostream &O, bool PrintType, const Module *M) cons
   WriteAsOperandInternal(O, this, &TypePrinter, nullptr, M);
 }
 
+void Metadata::print(raw_ostream &ROS) const {
+  formatted_raw_ostream OS(ROS);
+  if (auto *N = dyn_cast<MDNode>(this)) {
+    SlotTracker SlotTable(static_cast<Function *>(nullptr));
+    AssemblyWriter W(OS, SlotTable, nullptr, nullptr);
+    W.printMDNodeBody(N);
+
+    return;
+  }
+  printAsOperand(OS);
+}
+
+void Metadata::printAsOperand(raw_ostream &ROS, bool PrintType,
+                              const Module *M) const {
+  formatted_raw_ostream OS(ROS);
+
+  std::unique_ptr<TypePrinting> TypePrinter;
+  if (PrintType) {
+    TypePrinter.reset(new TypePrinting);
+    if (M)
+      TypePrinter->incorporateTypes(*M);
+  }
+  WriteAsOperandInternal(OS, this, TypePrinter.get(), nullptr, M,
+                         /* FromValue */ true);
+}
+
 // Value::dump - allow easy printing of Values from the debugger.
+LLVM_DUMP_METHOD
 void Value::dump() const { print(dbgs()); dbgs() << '\n'; }
 
 // Type::dump - allow easy printing of Types from the debugger.
+LLVM_DUMP_METHOD
 void Type::dump() const { print(dbgs()); dbgs() << '\n'; }
 
 // Module::dump() - Allow printing of Modules from the debugger.
+LLVM_DUMP_METHOD
 void Module::dump() const { print(dbgs(), nullptr); }
 
 // \brief Allow printing of Comdats from the debugger.
+LLVM_DUMP_METHOD
 void Comdat::dump() const { print(dbgs()); }
 
 // NamedMDNode::dump() - Allow printing of NamedMDNodes from the debugger.
+LLVM_DUMP_METHOD
 void NamedMDNode::dump() const { print(dbgs()); }
+
+LLVM_DUMP_METHOD
+void Metadata::dump() const {
+  print(dbgs());
+  dbgs() << '\n';
+}
diff --git a/lib/IR/AsmWriter.h b/lib/IR/AsmWriter.h
index 60da5ad..7716fa6 100644
--- a/lib/IR/AsmWriter.h
+++ b/lib/IR/AsmWriter.h
@@ -42,8 +42,8 @@ SlotTracker *createSlotTracker(const Module *M);
 //===----------------------------------------------------------------------===//
 
 class TypePrinting {
-  TypePrinting(const TypePrinting &) LLVM_DELETED_FUNCTION;
-  void operator=(const TypePrinting&) LLVM_DELETED_FUNCTION;
+  TypePrinting(const TypePrinting &) = delete;
+  void operator=(const TypePrinting&) = delete;
 public:
 
   /// NamedTypes - The named types that are used by the current module.
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index 0448dc1..199c318 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -33,8 +33,8 @@ class AttributeImpl : public FoldingSetNode {
   unsigned char KindID; ///< Holds the AttrEntryKind of the attribute
 
   // AttributesImpl is uniqued, these should not be publicly available.
-  void operator=(const AttributeImpl &) LLVM_DELETED_FUNCTION;
-  AttributeImpl(const AttributeImpl &) LLVM_DELETED_FUNCTION;
+  void operator=(const AttributeImpl &) = delete;
+  AttributeImpl(const AttributeImpl &) = delete;
 
 protected:
   enum AttrEntryKind {
@@ -151,8 +151,8 @@ class AttributeSetNode : public FoldingSetNode {
   }
 
   // AttributesSetNode is uniqued, these should not be publicly available.
-  void operator=(const AttributeSetNode &) LLVM_DELETED_FUNCTION;
-  AttributeSetNode(const AttributeSetNode &) LLVM_DELETED_FUNCTION;
+  void operator=(const AttributeSetNode &) = delete;
+  AttributeSetNode(const AttributeSetNode &) = delete;
 public:
   static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
 
@@ -199,8 +199,8 @@ class AttributeSetImpl : public FoldingSetNode {
   }
 
   // AttributesSet is uniqued, these should not be publicly available.
-  void operator=(const AttributeSetImpl &) LLVM_DELETED_FUNCTION;
-  AttributeSetImpl(const AttributeSetImpl &) LLVM_DELETED_FUNCTION;
+  void operator=(const AttributeSetImpl &) = delete;
+  AttributeSetImpl(const AttributeSetImpl &) = delete;
 public:
   AttributeSetImpl(LLVMContext &C,
                    ArrayRef<std::pair<unsigned, AttributeSetNode *> > Attrs)
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index 04545ea..daac6b5 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -835,6 +835,13 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
   return get(C, AttrSet);
 }
 
+AttributeSet AttributeSet::addDereferenceableAttr(LLVMContext &C, unsigned Index,
+                                                  uint64_t Bytes) const {
+  llvm::AttrBuilder B;
+  B.addDereferenceableAttr(Bytes);
+  return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+}
+
 //===----------------------------------------------------------------------===//
 // AttributeSet Accessor Methods
 //===----------------------------------------------------------------------===//
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index c24dfea..0da7784 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -15,9 +15,9 @@
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
@@ -60,6 +60,21 @@ static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID,
   return true;
 }
 
+// Upgrade the declarations of AVX-512 cmp intrinsic functions whose 8-bit
+// immediates have changed their type from i32 to i8.
+static bool UpgradeAVX512CmpIntrinsic(Function *F, Intrinsic::ID IID,
+                                      Function *&NewFn) {
+  // Check that the last argument is an i32.
+  Type *LastArgType = F->getFunctionType()->getParamType(2);
+  if (!LastArgType->isIntegerTy(32))
+    return false;
+
+  // Move this function aside and map down.
+  F->setName(F->getName() + ".old");
+  NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
+  return true;
+}
+
 static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
   assert(F && "Illegal to upgrade a non-existent Function.");
 
@@ -148,6 +163,14 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name == "x86.avx.vbroadcast.ss" ||
         Name == "x86.avx.vbroadcast.ss.256" ||
         Name == "x86.avx.vbroadcast.sd.256" ||
+        Name == "x86.sse2.psll.dq" ||
+        Name == "x86.sse2.psrl.dq" ||
+        Name == "x86.avx2.psll.dq" ||
+        Name == "x86.avx2.psrl.dq" ||
+        Name == "x86.sse2.psll.dq.bs" ||
+        Name == "x86.sse2.psrl.dq.bs" ||
+        Name == "x86.avx2.psll.dq.bs" ||
+        Name == "x86.avx2.psrl.dq.bs" ||
         (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) {
       NewFn = nullptr;
       return true;
@@ -206,6 +229,88 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
       return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
                                               NewFn);
 
+    if (Name == "x86.avx512.mask.cmp.ps.512")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_ps_512,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.cmp.pd.512")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_pd_512,
+                                       NewFn);
+
+    if (Name == "x86.avx512.mask.cmp.b.512")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_b_512,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.cmp.w.512")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_w_512,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.cmp.d.512")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_d_512,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.cmp.q.512")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_q_512,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.b.512")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_b_512,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.w.512")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_w_512,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.d.512")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_d_512,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.q.512")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_q_512,
+                                       NewFn);
+
+    if (Name == "x86.avx512.mask.cmp.b.256")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_b_256,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.cmp.w.256")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_w_256,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.cmp.d.256")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_d_256,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.cmp.q.256")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_q_256,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.b.256")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_b_256,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.w.256")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_w_256,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.d.256")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_d_256,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.q.256")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_q_256,
+                                       NewFn);
+
+    if (Name == "x86.avx512.mask.cmp.b.128")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_b_128,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.cmp.w.128")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_w_128,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.cmp.d.128")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_d_128,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.cmp.q.128")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_cmp_q_128,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.b.128")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_b_128,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.w.128")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_w_128,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.d.128")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_d_128,
+                                       NewFn);
+    if (Name == "x86.avx512.mask.ucmp.q.128")
+      return UpgradeAVX512CmpIntrinsic(F, Intrinsic::x86_avx512_mask_ucmp_q_128,
+                                       NewFn);
+
     // frcz.ss/sd may need to have an argument dropped
     if (Name.startswith("x86.xop.vfrcz.ss") && F->arg_size() == 2) {
       F->setName(Name + ".old");
@@ -260,14 +365,89 @@ static MDNode *getNodeField(const MDNode *DbgNode, unsigned Elt) {
   return dyn_cast_or_null<MDNode>(DbgNode->getOperand(Elt));
 }
 
-static DIExpression getExpression(Value *VarOperand, Function *F) {
+static MetadataAsValue *getExpression(Value *VarOperand, Function *F) {
   // Old-style DIVariables have an optional expression as the 8th element.
-  DIExpression Expr(getNodeField(cast<MDNode>(VarOperand), 8));
+  DIExpression Expr(getNodeField(
+      cast<MDNode>(cast<MetadataAsValue>(VarOperand)->getMetadata()), 8));
   if (!Expr) {
-    DIBuilder DIB(*F->getParent());
+    DIBuilder DIB(*F->getParent(), /*AllowUnresolved*/ false);
     Expr = DIB.createExpression();
   }
-  return Expr;
+  return MetadataAsValue::get(F->getContext(), Expr);
+}
+
+// Handles upgrading SSE2 and AVX2 PSLLDQ intrinsics by converting them
+// to byte shuffles.
+static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder, LLVMContext &C,
+                                         Value *Op, unsigned NumLanes,
+                                         unsigned Shift) {
+  // Each lane is 16 bytes.
+  unsigned NumElts = NumLanes * 16;
+
+  // Bitcast from a 64-bit element type to a byte element type.
+  Op = Builder.CreateBitCast(Op,
+                             VectorType::get(Type::getInt8Ty(C), NumElts),
+                             "cast");
+  // We'll be shuffling in zeroes.
+  Value *Res = ConstantVector::getSplat(NumElts, Builder.getInt8(0));
+
+  // If shift is less than 16, emit a shuffle to move the bytes. Otherwise,
+  // we'll just return the zero vector.
+  if (Shift < 16) {
+    SmallVector<Constant*, 32> Idxs;
+    // 256-bit version is split into two 16-byte lanes.
+    for (unsigned l = 0; l != NumElts; l += 16)
+      for (unsigned i = 0; i != 16; ++i) {
+        unsigned Idx = NumElts + i - Shift;
+        if (Idx < NumElts)
+          Idx -= NumElts - 16; // end of lane, switch operand.
+        Idxs.push_back(Builder.getInt32(Idx + l));
+      }
+
+    Res = Builder.CreateShuffleVector(Res, Op, ConstantVector::get(Idxs));
+  }
+
+  // Bitcast back to a 64-bit element type.
+  return Builder.CreateBitCast(Res,
+                               VectorType::get(Type::getInt64Ty(C), 2*NumLanes),
+                               "cast");
+}
+
+// Handles upgrading SSE2 and AVX2 PSRLDQ intrinsics by converting them
+// to byte shuffles.
+static Value *UpgradeX86PSRLDQIntrinsics(IRBuilder<> &Builder, LLVMContext &C,
+                                         Value *Op, unsigned NumLanes,
+                                         unsigned Shift) {
+  // Each lane is 16 bytes.
+  unsigned NumElts = NumLanes * 16;
+
+  // Bitcast from a 64-bit element type to a byte element type.
+  Op = Builder.CreateBitCast(Op,
+                             VectorType::get(Type::getInt8Ty(C), NumElts),
+                             "cast");
+  // We'll be shuffling in zeroes.
+  Value *Res = ConstantVector::getSplat(NumElts, Builder.getInt8(0));
+
+  // If shift is less than 16, emit a shuffle to move the bytes. Otherwise,
+  // we'll just return the zero vector.
+  if (Shift < 16) {
+    SmallVector<Constant*, 32> Idxs;
+    // 256-bit version is split into two 16-byte lanes.
+    for (unsigned l = 0; l != NumElts; l += 16)
+      for (unsigned i = 0; i != 16; ++i) {
+        unsigned Idx = i + Shift;
+        if (Idx >= 16)
+          Idx += NumElts - 16; // end of lane, switch operand.
+        Idxs.push_back(Builder.getInt32(Idx + l));
+      }
+
+    Res = Builder.CreateShuffleVector(Op, Res, ConstantVector::get(Idxs));
+  }
+
+  // Bitcast back to a 64-bit element type.
+  return Builder.CreateBitCast(Res,
+                               VectorType::get(Type::getInt64Ty(C), 2*NumLanes),
+                               "cast");
 }
 
 // UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the
@@ -306,8 +486,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Builder.SetInsertPoint(CI->getParent(), CI);
 
       Module *M = F->getParent();
-      SmallVector<Value *, 1> Elts;
-      Elts.push_back(ConstantInt::get(Type::getInt32Ty(C), 1));
+      SmallVector<Metadata *, 1> Elts;
+      Elts.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
       MDNode *Node = MDNode::get(C, Elts);
 
       Value *Arg0 = CI->getArgOperand(0);
@@ -359,9 +540,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
         Imm = 4;
       else if (Name.startswith("ne"))
         Imm = 5;
-      else if (Name.startswith("true"))
-        Imm = 6;
       else if (Name.startswith("false"))
+        Imm = 6;
+      else if (Name.startswith("true"))
         Imm = 7;
       else
         llvm_unreachable("Unknown condition");
@@ -388,6 +569,46 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       for (unsigned I = 0; I < EltNum; ++I)
         Rep = Builder.CreateInsertElement(Rep, Load,
                                           ConstantInt::get(I32Ty, I));
+    } else if (Name == "llvm.x86.sse2.psll.dq") {
+      // 128-bit shift left specified in bits.
+      unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+      Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
+                                       Shift / 8); // Shift is in bits.
+    } else if (Name == "llvm.x86.sse2.psrl.dq") {
+      // 128-bit shift right specified in bits.
+      unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+      Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
+                                       Shift / 8); // Shift is in bits.
+    } else if (Name == "llvm.x86.avx2.psll.dq") {
+      // 256-bit shift left specified in bits.
+      unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+      Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
+                                       Shift / 8); // Shift is in bits.
+    } else if (Name == "llvm.x86.avx2.psrl.dq") {
+      // 256-bit shift right specified in bits.
+      unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+      Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
+                                       Shift / 8); // Shift is in bits.
+    } else if (Name == "llvm.x86.sse2.psll.dq.bs") {
+      // 128-bit shift left specified in bytes.
+      unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+      Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
+                                       Shift);
+    } else if (Name == "llvm.x86.sse2.psrl.dq.bs") {
+      // 128-bit shift right specified in bytes.
+      unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+      Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
+                                       Shift);
+    } else if (Name == "llvm.x86.avx2.psll.dq.bs") {
+      // 256-bit shift left specified in bytes.
+      unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+      Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
+                                       Shift);
+    } else if (Name == "llvm.x86.avx2.psrl.dq.bs") {
+      // 256-bit shift right specified in bytes.
+      unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+      Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
+                                       Shift);
     } else {
       bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
       if (Name == "llvm.x86.avx.vpermil.pd.256")
@@ -545,6 +766,21 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     CI->eraseFromParent();
     return;
   }
+  case Intrinsic::x86_avx512_mask_cmp_ps_512:
+  case Intrinsic::x86_avx512_mask_cmp_pd_512: {
+    // Need to truncate the last argument from i32 to i8 -- this argument models
+    // an inherently 8-bit immediate operand to these x86 instructions.
+    SmallVector<Value *, 5> Args(CI->arg_operands().begin(),
+                                 CI->arg_operands().end());
+
+    // Replace the last argument with a trunc.
+    Args[2] = Builder.CreateTrunc(Args[2], Type::getInt8Ty(C), "trunc");
+
+    CallInst *NewCall = Builder.CreateCall(NewFn, Args);
+    CI->replaceAllUsesWith(NewCall);
+    CI->eraseFromParent();
+    return;
+  }
   }
 }
 
@@ -578,22 +814,18 @@ void llvm::UpgradeInstWithTBAATag(Instruction *I) {
     return;
 
   if (MD->getNumOperands() == 3) {
-    Value *Elts[] = {
-      MD->getOperand(0),
-      MD->getOperand(1)
-    };
+    Metadata *Elts[] = {MD->getOperand(0), MD->getOperand(1)};
     MDNode *ScalarType = MDNode::get(I->getContext(), Elts);
     // Create a MDNode <ScalarType, ScalarType, offset 0, const>
-    Value *Elts2[] = {
-      ScalarType, ScalarType,
-      Constant::getNullValue(Type::getInt64Ty(I->getContext())),
-      MD->getOperand(2)
-    };
+    Metadata *Elts2[] = {ScalarType, ScalarType,
+                         ConstantAsMetadata::get(Constant::getNullValue(
+                             Type::getInt64Ty(I->getContext()))),
+                         MD->getOperand(2)};
     I->setMetadata(LLVMContext::MD_tbaa, MDNode::get(I->getContext(), Elts2));
   } else {
     // Create a MDNode <MD, MD, offset 0>
-    Value *Elts[] = {MD, MD,
-      Constant::getNullValue(Type::getInt64Ty(I->getContext()))};
+    Metadata *Elts[] = {MD, MD, ConstantAsMetadata::get(Constant::getNullValue(
+                                    Type::getInt64Ty(I->getContext())))};
     I->setMetadata(LLVMContext::MD_tbaa, MDNode::get(I->getContext(), Elts));
   }
 }
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index 5ed9bed..b3b3cbf 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -19,7 +19,6 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LeakDetector.h"
 #include "llvm/IR/Type.h"
 #include <algorithm>
 using namespace llvm;
@@ -47,9 +46,6 @@ BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent,
                        BasicBlock *InsertBefore)
   : Value(Type::getLabelTy(C), Value::BasicBlockVal), Parent(nullptr) {
 
-  // Make sure that we get added to a function
-  LeakDetector::addGarbageObject(this);
-
   if (NewParent)
     insertInto(NewParent, InsertBefore);
   else
@@ -94,14 +90,8 @@ BasicBlock::~BasicBlock() {
 }
 
 void BasicBlock::setParent(Function *parent) {
-  if (getParent())
-    LeakDetector::addGarbageObject(this);
-
   // Set Parent=parent, updating instruction symtab entries as appropriate.
   InstList.setSymTabObject(&Parent, parent);
-
-  if (getParent())
-    LeakDetector::removeGarbageObject(this);
 }
 
 void BasicBlock::removeFromParent() {
@@ -249,6 +239,20 @@ BasicBlock *BasicBlock::getUniquePredecessor() {
   return PredBB;
 }
 
+BasicBlock *BasicBlock::getUniqueSuccessor() {
+  succ_iterator SI = succ_begin(this), E = succ_end(this);
+  if (SI == E) return NULL; // No successors
+  BasicBlock *SuccBB = *SI;
+  ++SI;
+  for (;SI != E; ++SI) {
+    if (*SI != SuccBB)
+      return NULL;
+    // The same successor appears multiple times in the successor list.
+    // This is OK.
+  }
+  return SuccBB;
+}
+
 /// removePredecessor - This method is used to notify a BasicBlock that the
 /// specified Predecessor of the block is no longer able to reach it.  This is
 /// actually not used to update the Predecessor list, but is actually used to
diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt
index b3889e6..9fef0b2 100644
--- a/lib/IR/CMakeLists.txt
+++ b/lib/IR/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_library(LLVMCore
   DIBuilder.cpp
   DataLayout.cpp
   DebugInfo.cpp
+  DebugInfoMetadata.cpp
   DebugLoc.cpp
   DiagnosticInfo.cpp
   DiagnosticPrinter.cpp
@@ -27,15 +28,16 @@ add_llvm_library(LLVMCore
   IntrinsicInst.cpp
   LLVMContext.cpp
   LLVMContextImpl.cpp
-  LeakDetector.cpp
   LegacyPassManager.cpp
   MDBuilder.cpp
   Mangler.cpp
   Metadata.cpp
+  MetadataTracking.cpp
   Module.cpp
   Pass.cpp
   PassManager.cpp
   PassRegistry.cpp
+  Statepoint.cpp
   Type.cpp
   TypeFinder.cpp
   Use.cpp
@@ -45,6 +47,9 @@ add_llvm_library(LLVMCore
   ValueSymbolTable.cpp
   ValueTypes.cpp
   Verifier.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/IR
   )
 
 add_dependencies(LLVMCore intrinsics_gen)
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index cdfb41f..a915d28 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -27,12 +27,14 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MathExtras.h"
 #include <limits>
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 //===----------------------------------------------------------------------===//
 //                ConstantFold*Instruction Implementations
@@ -913,49 +915,70 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
         return C1;
       return Constant::getNullValue(C1->getType());   // undef & X -> 0
     case Instruction::Mul: {
-      ConstantInt *CI;
-      // X * undef -> undef   if X is odd or undef
-      if (((CI = dyn_cast<ConstantInt>(C1)) && CI->getValue()[0]) ||
-          ((CI = dyn_cast<ConstantInt>(C2)) && CI->getValue()[0]) ||
-          (isa<UndefValue>(C1) && isa<UndefValue>(C2)))
-        return UndefValue::get(C1->getType());
+      // undef * undef -> undef
+      if (isa<UndefValue>(C1) && isa<UndefValue>(C2))
+        return C1;
+      const APInt *CV;
+      // X * undef -> undef   if X is odd
+      if (match(C1, m_APInt(CV)) || match(C2, m_APInt(CV)))
+        if ((*CV)[0])
+          return UndefValue::get(C1->getType());
 
       // X * undef -> 0       otherwise
       return Constant::getNullValue(C1->getType());
     }
-    case Instruction::UDiv:
     case Instruction::SDiv:
+    case Instruction::UDiv:
+      // X / undef -> undef
+      if (match(C1, m_Zero()))
+        return C2;
+      // undef / 0 -> undef
       // undef / 1 -> undef
-      if (Opcode == Instruction::UDiv || Opcode == Instruction::SDiv)
-        if (ConstantInt *CI2 = dyn_cast<ConstantInt>(C2))
-          if (CI2->isOne())
-            return C1;
-      // FALL THROUGH
+      if (match(C2, m_Zero()) || match(C2, m_One()))
+        return C1;
+      // undef / X -> 0       otherwise
+      return Constant::getNullValue(C1->getType());
     case Instruction::URem:
     case Instruction::SRem:
-      if (!isa<UndefValue>(C2))                    // undef / X -> 0
-        return Constant::getNullValue(C1->getType());
-      return C2;                                   // X / undef -> undef
+      // X % undef -> undef
+      if (match(C2, m_Undef()))
+        return C2;
+      // undef % 0 -> undef
+      if (match(C2, m_Zero()))
+        return C1;
+      // undef % X -> 0       otherwise
+      return Constant::getNullValue(C1->getType());
     case Instruction::Or:                          // X | undef -> -1
       if (isa<UndefValue>(C1) && isa<UndefValue>(C2)) // undef | undef -> undef
         return C1;
       return Constant::getAllOnesValue(C1->getType()); // undef | X -> ~0
     case Instruction::LShr:
-      if (isa<UndefValue>(C2) && isa<UndefValue>(C1))
-        return C1;                                  // undef lshr undef -> undef
-      return Constant::getNullValue(C1->getType()); // X lshr undef -> 0
-                                                    // undef lshr X -> 0
+      // X >>l undef -> undef
+      if (isa<UndefValue>(C2))
+        return C2;
+      // undef >>l 0 -> undef
+      if (match(C2, m_Zero()))
+        return C1;
+      // undef >>l X -> 0
+      return Constant::getNullValue(C1->getType());
     case Instruction::AShr:
-      if (!isa<UndefValue>(C2))                     // undef ashr X --> all ones
-        return Constant::getAllOnesValue(C1->getType());
-      else if (isa<UndefValue>(C1)) 
-        return C1;                                  // undef ashr undef -> undef
-      else
-        return C1;                                  // X ashr undef --> X
+      // X >>a undef -> undef
+      if (isa<UndefValue>(C2))
+        return C2;
+      // undef >>a 0 -> undef
+      if (match(C2, m_Zero()))
+        return C1;
+      // TODO: undef >>a X -> undef if the shift is exact
+      // undef >>a X -> 0
+      return Constant::getNullValue(C1->getType());
     case Instruction::Shl:
-      if (isa<UndefValue>(C2) && isa<UndefValue>(C1))
-        return C1;                                  // undef shl undef -> undef
-      // undef << X -> 0   or   X << undef -> 0
+      // X << undef -> undef
+      if (isa<UndefValue>(C2))
+        return C2;
+      // undef << 0 -> undef
+      if (match(C2, m_Zero()))
+        return C1;
+      // undef << X -> 0
       return Constant::getNullValue(C1->getType());
     }
   }
@@ -1259,15 +1282,17 @@ static int IdxCompare(Constant *C1, Constant *C2, Type *ElTy) {
   if (!isa<ConstantInt>(C1) || !isa<ConstantInt>(C2))
     return -2; // don't know!
 
-  // Ok, we have two differing integer indices.  Sign extend them to be the same
-  // type.  Long is always big enough, so we use it.
-  if (!C1->getType()->isIntegerTy(64))
-    C1 = ConstantExpr::getSExt(C1, Type::getInt64Ty(C1->getContext()));
+  // We cannot compare the indices if they don't fit in an int64_t.
+  if (cast<ConstantInt>(C1)->getValue().getActiveBits() > 64 ||
+      cast<ConstantInt>(C2)->getValue().getActiveBits() > 64)
+    return -2; // don't know!
 
-  if (!C2->getType()->isIntegerTy(64))
-    C2 = ConstantExpr::getSExt(C2, Type::getInt64Ty(C1->getContext()));
+  // Ok, we have two differing integer indices.  Sign extend them to be the same
+  // type.
+  int64_t C1Val = cast<ConstantInt>(C1)->getSExtValue();
+  int64_t C2Val = cast<ConstantInt>(C2)->getSExtValue();
 
-  if (C1 == C2) return 0;  // They are equal
+  if (C1Val == C2Val) return 0;  // They are equal
 
   // If the type being indexed over is really just a zero sized type, there is
   // no pointer difference being made here.
@@ -1276,8 +1301,7 @@ static int IdxCompare(Constant *C1, Constant *C2, Type *ElTy) {
 
   // If they are really different, now that they are the same type, then we
   // found a difference!
-  if (cast<ConstantInt>(C1)->getSExtValue() < 
-      cast<ConstantInt>(C2)->getSExtValue())
+  if (C1Val < C2Val)
     return -1;
   else
     return 1;
@@ -1348,9 +1372,24 @@ static FCmpInst::Predicate evaluateFCmpRelation(Constant *V1, Constant *V2) {
 
 static ICmpInst::Predicate areGlobalsPotentiallyEqual(const GlobalValue *GV1,
                                                       const GlobalValue *GV2) {
+  auto isGlobalUnsafeForEquality = [](const GlobalValue *GV) {
+    if (GV->hasExternalWeakLinkage() || GV->hasWeakAnyLinkage())
+      return true;
+    if (const auto *GVar = dyn_cast<GlobalVariable>(GV)) {
+      Type *Ty = GVar->getType()->getPointerElementType();
+      // A global with opaque type might end up being zero sized.
+      if (!Ty->isSized())
+        return true;
+      // A global with an empty type might lie at the address of any other
+      // global.
+      if (Ty->isEmptyTy())
+        return true;
+    }
+    return false;
+  };
   // Don't try to decide equality of aliases.
   if (!isa<GlobalAlias>(GV1) && !isa<GlobalAlias>(GV2))
-    if (!GV1->hasExternalWeakLinkage() || !GV2->hasExternalWeakLinkage())
+    if (!isGlobalUnsafeForEquality(GV1) && !isGlobalUnsafeForEquality(GV2))
       return ICmpInst::ICMP_NE;
   return ICmpInst::BAD_ICMP_PREDICATE;
 }
@@ -2040,8 +2079,7 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
       if (PerformFold) {
         SmallVector<Value*, 16> NewIndices;
         NewIndices.reserve(Idxs.size() + CE->getNumOperands());
-        for (unsigned i = 1, e = CE->getNumOperands()-1; i != e; ++i)
-          NewIndices.push_back(CE->getOperand(i));
+        NewIndices.append(CE->op_begin() + 1, CE->op_end() - 1);
 
         // Add the last index of the source with the first index of the new GEP.
         // Make sure to handle the case when they are actually different types.
@@ -2050,9 +2088,15 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
         if (!Idx0->isNullValue()) {
           Type *IdxTy = Combined->getType();
           if (IdxTy != Idx0->getType()) {
-            Type *Int64Ty = Type::getInt64Ty(IdxTy->getContext());
-            Constant *C1 = ConstantExpr::getSExtOrBitCast(Idx0, Int64Ty);
-            Constant *C2 = ConstantExpr::getSExtOrBitCast(Combined, Int64Ty);
+            unsigned CommonExtendedWidth =
+                std::max(IdxTy->getIntegerBitWidth(),
+                         Idx0->getType()->getIntegerBitWidth());
+            CommonExtendedWidth = std::max(CommonExtendedWidth, 64U);
+
+            Type *CommonTy =
+                Type::getIntNTy(IdxTy->getContext(), CommonExtendedWidth);
+            Constant *C1 = ConstantExpr::getSExtOrBitCast(Idx0, CommonTy);
+            Constant *C2 = ConstantExpr::getSExtOrBitCast(Combined, CommonTy);
             Combined = ConstantExpr::get(Instruction::Add, C1, C2);
           } else {
             Combined =
@@ -2125,14 +2169,20 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
             Constant *PrevIdx = cast<Constant>(Idxs[i-1]);
             Constant *Div = ConstantExpr::getSDiv(CI, Factor);
 
+            unsigned CommonExtendedWidth =
+                std::max(PrevIdx->getType()->getIntegerBitWidth(),
+                         Div->getType()->getIntegerBitWidth());
+            CommonExtendedWidth = std::max(CommonExtendedWidth, 64U);
+
             // Before adding, extend both operands to i64 to avoid
             // overflow trouble.
-            if (!PrevIdx->getType()->isIntegerTy(64))
-              PrevIdx = ConstantExpr::getSExt(PrevIdx,
-                                           Type::getInt64Ty(Div->getContext()));
-            if (!Div->getType()->isIntegerTy(64))
-              Div = ConstantExpr::getSExt(Div,
-                                          Type::getInt64Ty(Div->getContext()));
+            if (!PrevIdx->getType()->isIntegerTy(CommonExtendedWidth))
+              PrevIdx = ConstantExpr::getSExt(
+                  PrevIdx,
+                  Type::getIntNTy(Div->getContext(), CommonExtendedWidth));
+            if (!Div->getType()->isIntegerTy(CommonExtendedWidth))
+              Div = ConstantExpr::getSExt(
+                  Div, Type::getIntNTy(Div->getContext(), CommonExtendedWidth));
 
             NewIdxs[i-1] = ConstantExpr::getAdd(PrevIdx, Div);
           } else {
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index e0cb835..0bf61a7 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -257,11 +257,11 @@ Constant *Constant::getAggregateElement(unsigned Elt) const {
   if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
     return Elt < CV->getNumOperands() ? CV->getOperand(Elt) : nullptr;
 
-  if (const ConstantAggregateZero *CAZ =dyn_cast<ConstantAggregateZero>(this))
-    return CAZ->getElementValue(Elt);
+  if (const ConstantAggregateZero *CAZ = dyn_cast<ConstantAggregateZero>(this))
+    return Elt < CAZ->getNumElements() ? CAZ->getElementValue(Elt) : nullptr;
 
   if (const UndefValue *UV = dyn_cast<UndefValue>(this))
-    return UV->getElementValue(Elt);
+    return Elt < UV->getNumElements() ? UV->getElementValue(Elt) : nullptr;
 
   if (const ConstantDataSequential *CDS =dyn_cast<ConstantDataSequential>(this))
     return Elt < CDS->getNumElements() ? CDS->getElementAsConstant(Elt)
@@ -554,19 +554,17 @@ Constant *ConstantInt::getFalse(Type *Ty) {
                                   ConstantInt::getFalse(Ty->getContext()));
 }
 
-
-// Get a ConstantInt from an APInt. Note that the value stored in the DenseMap 
-// as the key, is a DenseMapAPIntKeyInfo::KeyTy which has provided the
-// operator== and operator!= to ensure that the DenseMap doesn't attempt to
-// compare APInt's of different widths, which would violate an APInt class
-// invariant which generates an assertion.
+// Get a ConstantInt from an APInt.
 ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt &V) {
-  // Get the corresponding integer type for the bit width of the value.
-  IntegerType *ITy = IntegerType::get(Context, V.getBitWidth());
   // get an existing value or the insertion position
   LLVMContextImpl *pImpl = Context.pImpl;
-  ConstantInt *&Slot = pImpl->IntConstants[DenseMapAPIntKeyInfo::KeyTy(V, ITy)];
-  if (!Slot) Slot = new ConstantInt(ITy, V);
+  ConstantInt *&Slot = pImpl->IntConstants[V];
+  if (!Slot) {
+    // Get the corresponding integer type for the bit width of the value.
+    IntegerType *ITy = IntegerType::get(Context, V.getBitWidth());
+    Slot = new ConstantInt(ITy, V);
+  }
+  assert(Slot->getType() == IntegerType::get(Context, V.getBitWidth()));
   return Slot;
 }
 
@@ -689,7 +687,7 @@ Constant *ConstantFP::getZeroValueForNegation(Type *Ty) {
 ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
   LLVMContextImpl* pImpl = Context.pImpl;
 
-  ConstantFP *&Slot = pImpl->FPConstants[DenseMapAPFloatKeyInfo::KeyTy(V)];
+  ConstantFP *&Slot = pImpl->FPConstants[V];
 
   if (!Slot) {
     Type *Ty;
@@ -766,6 +764,14 @@ Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const {
   return getStructElement(Idx);
 }
 
+unsigned ConstantAggregateZero::getNumElements() const {
+  const Type *Ty = getType();
+  if (const auto *AT = dyn_cast<ArrayType>(Ty))
+    return AT->getNumElements();
+  if (const auto *VT = dyn_cast<VectorType>(Ty))
+    return VT->getNumElements();
+  return Ty->getStructNumElements();
+}
 
 //===----------------------------------------------------------------------===//
 //                         UndefValue Implementation
@@ -799,7 +805,14 @@ UndefValue *UndefValue::getElementValue(unsigned Idx) const {
   return getStructElement(Idx);
 }
 
-
+unsigned UndefValue::getNumElements() const {
+  const Type *Ty = getType();
+  if (const auto *AT = dyn_cast<ArrayType>(Ty))
+    return AT->getNumElements();
+  if (const auto *VT = dyn_cast<VectorType>(Ty))
+    return VT->getNumElements();
+  return Ty->getStructNumElements();
+}
 
 //===----------------------------------------------------------------------===//
 //                            ConstantXXX Classes
@@ -898,23 +911,25 @@ Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) {
 
     if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
       if (CFP->getType()->isFloatTy()) {
-        SmallVector<float, 16> Elts;
+        SmallVector<uint32_t, 16> Elts;
         for (unsigned i = 0, e = V.size(); i != e; ++i)
           if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
-            Elts.push_back(CFP->getValueAPF().convertToFloat());
+            Elts.push_back(
+                CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
           else
             break;
         if (Elts.size() == V.size())
-          return ConstantDataArray::get(C->getContext(), Elts);
+          return ConstantDataArray::getFP(C->getContext(), Elts);
       } else if (CFP->getType()->isDoubleTy()) {
-        SmallVector<double, 16> Elts;
+        SmallVector<uint64_t, 16> Elts;
         for (unsigned i = 0, e = V.size(); i != e; ++i)
           if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
-            Elts.push_back(CFP->getValueAPF().convertToDouble());
+            Elts.push_back(
+                CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
           else
             break;
         if (Elts.size() == V.size())
-          return ConstantDataArray::get(C->getContext(), Elts);
+          return ConstantDataArray::getFP(C->getContext(), Elts);
       }
     }
   }
@@ -1084,23 +1099,25 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
 
     if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
       if (CFP->getType()->isFloatTy()) {
-        SmallVector<float, 16> Elts;
+        SmallVector<uint32_t, 16> Elts;
         for (unsigned i = 0, e = V.size(); i != e; ++i)
           if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
-            Elts.push_back(CFP->getValueAPF().convertToFloat());
+            Elts.push_back(
+                CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
           else
             break;
         if (Elts.size() == V.size())
-          return ConstantDataVector::get(C->getContext(), Elts);
+          return ConstantDataVector::getFP(C->getContext(), Elts);
       } else if (CFP->getType()->isDoubleTy()) {
-        SmallVector<double, 16> Elts;
+        SmallVector<uint64_t, 16> Elts;
         for (unsigned i = 0, e = V.size(); i != e; ++i)
           if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
-            Elts.push_back(CFP->getValueAPF().convertToDouble());
+            Elts.push_back(
+                CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
           else
             break;
         if (Elts.size() == V.size())
-          return ConstantDataVector::get(C->getContext(), Elts);
+          return ConstantDataVector::getFP(C->getContext(), Elts);
       }
     }
   }
@@ -2531,7 +2548,31 @@ Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<float> Elts) {
 Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) {
   Type *Ty = ArrayType::get(Type::getDoubleTy(Context), Elts.size());
   const char *Data = reinterpret_cast<const char *>(Elts.data());
-  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 8), Ty);
+}
+
+/// getFP() constructors - Return a constant with array type with an element
+/// count and element type of float with precision matching the number of
+/// bits in the ArrayRef passed in. (i.e. half for 16bits, float for 32bits,
+/// double for 64bits) Note that this can return a ConstantAggregateZero
+/// object.
+Constant *ConstantDataArray::getFP(LLVMContext &Context,
+                                   ArrayRef<uint16_t> Elts) {
+  Type *Ty = VectorType::get(Type::getHalfTy(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 2), Ty);
+}
+Constant *ConstantDataArray::getFP(LLVMContext &Context,
+                                   ArrayRef<uint32_t> Elts) {
+  Type *Ty = ArrayType::get(Type::getFloatTy(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 4), Ty);
+}
+Constant *ConstantDataArray::getFP(LLVMContext &Context,
+                                   ArrayRef<uint64_t> Elts) {
+  Type *Ty = ArrayType::get(Type::getDoubleTy(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 8), Ty);
 }
 
 /// getString - This method constructs a CDS and initializes it with a text
@@ -2584,7 +2625,31 @@ Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<float> Elts) {
 Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<double> Elts) {
   Type *Ty = VectorType::get(Type::getDoubleTy(Context), Elts.size());
   const char *Data = reinterpret_cast<const char *>(Elts.data());
-  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 8), Ty);
+}
+
+/// getFP() constructors - Return a constant with vector type with an element
+/// count and element type of float with the precision matching the number of
+/// bits in the ArrayRef passed in.  (i.e. half for 16bits, float for 32bits,
+/// double for 64bits) Note that this can return a ConstantAggregateZero
+/// object.
+Constant *ConstantDataVector::getFP(LLVMContext &Context,
+                                    ArrayRef<uint16_t> Elts) {
+  Type *Ty = VectorType::get(Type::getHalfTy(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 2), Ty);
+}
+Constant *ConstantDataVector::getFP(LLVMContext &Context,
+                                    ArrayRef<uint32_t> Elts) {
+  Type *Ty = VectorType::get(Type::getFloatTy(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 4), Ty);
+}
+Constant *ConstantDataVector::getFP(LLVMContext &Context,
+                                    ArrayRef<uint64_t> Elts) {
+  Type *Ty = VectorType::get(Type::getDoubleTy(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 8), Ty);
 }
 
 Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) {
@@ -2610,13 +2675,14 @@ Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) {
 
   if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
     if (CFP->getType()->isFloatTy()) {
-      SmallVector<float, 16> Elts(NumElts, CFP->getValueAPF().convertToFloat());
-      return get(V->getContext(), Elts);
+      SmallVector<uint32_t, 16> Elts(
+          NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
+      return getFP(V->getContext(), Elts);
     }
     if (CFP->getType()->isDoubleTy()) {
-      SmallVector<double, 16> Elts(NumElts,
-                                   CFP->getValueAPF().convertToDouble());
-      return get(V->getContext(), Elts);
+      SmallVector<uint64_t, 16> Elts(
+          NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
+      return getFP(V->getContext(), Elts);
     }
   }
   return ConstantVector::getSplat(NumElts, V);
@@ -2654,13 +2720,13 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const {
   default:
     llvm_unreachable("Accessor can only be used when element is float/double!");
   case Type::FloatTyID: {
-      const float *FloatPrt = reinterpret_cast<const float *>(EltPtr);
-      return APFloat(*const_cast<float *>(FloatPrt));
-    }
+    auto EltVal = *reinterpret_cast<const uint32_t *>(EltPtr);
+    return APFloat(APFloat::IEEEsingle, APInt(32, EltVal));
+  }
   case Type::DoubleTyID: {
-      const double *DoublePtr = reinterpret_cast<const double *>(EltPtr);
-      return APFloat(*const_cast<double *>(DoublePtr));
-    }
+    auto EltVal = *reinterpret_cast<const uint64_t *>(EltPtr);
+    return APFloat(APFloat::IEEEdouble, APInt(64, EltVal));
+  }
   }
 }
 
diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h
index 571dec2..c1dfcf1 100644
--- a/lib/IR/ConstantsContext.h
+++ b/lib/IR/ConstantsContext.h
@@ -34,7 +34,7 @@ namespace llvm {
 /// behind the scenes to implement unary constant exprs.
 class UnaryConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 public:
   // allocate space for exactly one operand
   void *operator new(size_t s) {
@@ -51,7 +51,7 @@ public:
 /// behind the scenes to implement binary constant exprs.
 class BinaryConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 public:
   // allocate space for exactly two operands
   void *operator new(size_t s) {
@@ -72,7 +72,7 @@ public:
 /// behind the scenes to implement select constant exprs.
 class SelectConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 public:
   // allocate space for exactly three operands
   void *operator new(size_t s) {
@@ -93,7 +93,7 @@ public:
 /// extractelement constant exprs.
 class ExtractElementConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 public:
   // allocate space for exactly two operands
   void *operator new(size_t s) {
@@ -114,7 +114,7 @@ public:
 /// insertelement constant exprs.
 class InsertElementConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 public:
   // allocate space for exactly three operands
   void *operator new(size_t s) {
@@ -136,7 +136,7 @@ public:
 /// shufflevector constant exprs.
 class ShuffleVectorConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 public:
   // allocate space for exactly three operands
   void *operator new(size_t s) {
@@ -161,7 +161,7 @@ public:
 /// extractvalue constant exprs.
 class ExtractValueConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 public:
   // allocate space for exactly one operand
   void *operator new(size_t s) {
@@ -186,7 +186,7 @@ public:
 /// insertvalue constant exprs.
 class InsertValueConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 public:
   // allocate space for exactly one operand
   void *operator new(size_t s) {
@@ -233,7 +233,7 @@ public:
 // needed in order to store the predicate value for these instructions.
 class CompareConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+  void *operator new(size_t, unsigned) = delete;
 public:
   // allocate space for exactly two operands
   void *operator new(size_t s) {
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 3576137..f007616 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -26,8 +26,8 @@
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
@@ -556,12 +556,31 @@ int LLVMHasMetadata(LLVMValueRef Inst) {
 }
 
 LLVMValueRef LLVMGetMetadata(LLVMValueRef Inst, unsigned KindID) {
-  return wrap(unwrap<Instruction>(Inst)->getMetadata(KindID));
+  auto *I = unwrap<Instruction>(Inst);
+  assert(I && "Expected instruction");
+  if (auto *MD = I->getMetadata(KindID))
+    return wrap(MetadataAsValue::get(I->getContext(), MD));
+  return nullptr;
+}
+
+// MetadataAsValue uses a canonical format which strips the actual MDNode for
+// MDNode with just a single constant value, storing just a ConstantAsMetadata
+// This undoes this canonicalization, reconstructing the MDNode.
+static MDNode *extractMDNode(MetadataAsValue *MAV) {
+  Metadata *MD = MAV->getMetadata();
+  assert((isa<MDNode>(MD) || isa<ConstantAsMetadata>(MD)) &&
+      "Expected a metadata node or a canonicalized constant");
+
+  if (MDNode *N = dyn_cast<MDNode>(MD))
+    return N;
+
+  return MDNode::get(MAV->getContext(), MD);
 }
 
-void LLVMSetMetadata(LLVMValueRef Inst, unsigned KindID, LLVMValueRef MD) {
-  unwrap<Instruction>(Inst)
-      ->setMetadata(KindID, MD ? unwrap<MDNode>(MD) : nullptr);
+void LLVMSetMetadata(LLVMValueRef Inst, unsigned KindID, LLVMValueRef Val) {
+  MDNode *N = Val ? extractMDNode(unwrap<MetadataAsValue>(Val)) : nullptr;
+
+  unwrap<Instruction>(Inst)->setMetadata(KindID, N);
 }
 
 /*--.. Conversion functions ................................................--*/
@@ -573,6 +592,21 @@ void LLVMSetMetadata(LLVMValueRef Inst, unsigned KindID, LLVMValueRef MD) {
 
 LLVM_FOR_EACH_VALUE_SUBCLASS(LLVM_DEFINE_VALUE_CAST)
 
+LLVMValueRef LLVMIsAMDNode(LLVMValueRef Val) {
+  if (auto *MD = dyn_cast_or_null<MetadataAsValue>(unwrap(Val)))
+    if (isa<MDNode>(MD->getMetadata()) ||
+        isa<ValueAsMetadata>(MD->getMetadata()))
+      return Val;
+  return nullptr;
+}
+
+LLVMValueRef LLVMIsAMDString(LLVMValueRef Val) {
+  if (auto *MD = dyn_cast_or_null<MetadataAsValue>(unwrap(Val)))
+    if (isa<MDString>(MD->getMetadata()))
+      return Val;
+  return nullptr;
+}
+
 /*--.. Operations on Uses ..................................................--*/
 LLVMUseRef LLVMGetFirstUse(LLVMValueRef Val) {
   Value *V = unwrap(Val);
@@ -598,10 +632,28 @@ LLVMValueRef LLVMGetUsedValue(LLVMUseRef U) {
 }
 
 /*--.. Operations on Users .................................................--*/
+
+static LLVMValueRef getMDNodeOperandImpl(LLVMContext &Context, const MDNode *N,
+                                         unsigned Index) {
+  Metadata *Op = N->getOperand(Index);
+  if (!Op)
+    return nullptr;
+  if (auto *C = dyn_cast<ConstantAsMetadata>(Op))
+    return wrap(C->getValue());
+  return wrap(MetadataAsValue::get(Context, Op));
+}
+
 LLVMValueRef LLVMGetOperand(LLVMValueRef Val, unsigned Index) {
   Value *V = unwrap(Val);
-  if (MDNode *MD = dyn_cast<MDNode>(V))
-      return wrap(MD->getOperand(Index));
+  if (auto *MD = dyn_cast<MetadataAsValue>(V)) {
+    if (auto *L = dyn_cast<ValueAsMetadata>(MD->getMetadata())) {
+      assert(Index == 0 && "Function-local metadata can only have one operand");
+      return wrap(L->getValue());
+    }
+    return getMDNodeOperandImpl(V->getContext(),
+                                cast<MDNode>(MD->getMetadata()), Index);
+  }
+
   return wrap(cast<User>(V)->getOperand(Index));
 }
 
@@ -616,8 +668,9 @@ void LLVMSetOperand(LLVMValueRef Val, unsigned Index, LLVMValueRef Op) {
 
 int LLVMGetNumOperands(LLVMValueRef Val) {
   Value *V = unwrap(Val);
-  if (MDNode *MD = dyn_cast<MDNode>(V))
-      return MD->getNumOperands();
+  if (isa<MetadataAsValue>(V))
+    return LLVMGetMDNodeNumOperands(Val);
+
   return cast<User>(V)->getNumOperands();
 }
 
@@ -658,7 +711,9 @@ LLVMValueRef LLVMConstPointerNull(LLVMTypeRef Ty) {
 
 LLVMValueRef LLVMMDStringInContext(LLVMContextRef C, const char *Str,
                                    unsigned SLen) {
-  return wrap(MDString::get(*unwrap(C), StringRef(Str, SLen)));
+  LLVMContext &Context = *unwrap(C);
+  return wrap(MetadataAsValue::get(
+      Context, MDString::get(Context, StringRef(Str, SLen))));
 }
 
 LLVMValueRef LLVMMDString(const char *Str, unsigned SLen) {
@@ -667,8 +722,29 @@ LLVMValueRef LLVMMDString(const char *Str, unsigned SLen) {
 
 LLVMValueRef LLVMMDNodeInContext(LLVMContextRef C, LLVMValueRef *Vals,
                                  unsigned Count) {
-  return wrap(MDNode::get(*unwrap(C),
-                          makeArrayRef(unwrap<Value>(Vals, Count), Count)));
+  LLVMContext &Context = *unwrap(C);
+  SmallVector<Metadata *, 8> MDs;
+  for (auto *OV : makeArrayRef(Vals, Count)) {
+    Value *V = unwrap(OV);
+    Metadata *MD;
+    if (!V)
+      MD = nullptr;
+    else if (auto *C = dyn_cast<Constant>(V))
+      MD = ConstantAsMetadata::get(C);
+    else if (auto *MDV = dyn_cast<MetadataAsValue>(V)) {
+      MD = MDV->getMetadata();
+      assert(!isa<LocalAsMetadata>(MD) && "Unexpected function-local metadata "
+                                          "outside of direct argument to call");
+    } else {
+      // This is function-local metadata.  Pretend to make an MDNode.
+      assert(Count == 1 &&
+             "Expected only one operand to function-local metadata");
+      return wrap(MetadataAsValue::get(Context, LocalAsMetadata::get(V)));
+    }
+
+    MDs.push_back(MD);
+  }
+  return wrap(MetadataAsValue::get(Context, MDNode::get(Context, MDs)));
 }
 
 LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count) {
@@ -676,25 +752,35 @@ LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count) {
 }
 
 const char *LLVMGetMDString(LLVMValueRef V, unsigned* Len) {
-  if (const MDString *S = dyn_cast<MDString>(unwrap(V))) {
-    *Len = S->getString().size();
-    return S->getString().data();
-  }
+  if (const auto *MD = dyn_cast<MetadataAsValue>(unwrap(V)))
+    if (const MDString *S = dyn_cast<MDString>(MD->getMetadata())) {
+      *Len = S->getString().size();
+      return S->getString().data();
+    }
   *Len = 0;
   return nullptr;
 }
 
 unsigned LLVMGetMDNodeNumOperands(LLVMValueRef V)
 {
-  return cast<MDNode>(unwrap(V))->getNumOperands();
+  auto *MD = cast<MetadataAsValue>(unwrap(V));
+  if (isa<ValueAsMetadata>(MD->getMetadata()))
+    return 1;
+  return cast<MDNode>(MD->getMetadata())->getNumOperands();
 }
 
 void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest)
 {
-  const MDNode *N = cast<MDNode>(unwrap(V));
+  auto *MD = cast<MetadataAsValue>(unwrap(V));
+  if (auto *MDV = dyn_cast<ValueAsMetadata>(MD->getMetadata())) {
+    *Dest = wrap(MDV->getValue());
+    return;
+  }
+  const auto *N = cast<MDNode>(MD->getMetadata());
   const unsigned numOperands = N->getNumOperands();
+  LLVMContext &Context = unwrap(V)->getContext();
   for (unsigned i = 0; i < numOperands; i++)
-    Dest[i] = wrap(N->getOperand(i));
+    Dest[i] = getMDNodeOperandImpl(Context, N, i);
 }
 
 unsigned LLVMGetNamedMetadataNumOperands(LLVMModuleRef M, const char* name)
@@ -710,8 +796,9 @@ void LLVMGetNamedMetadataOperands(LLVMModuleRef M, const char* name, LLVMValueRe
   NamedMDNode *N = unwrap(M)->getNamedMetadata(name);
   if (!N)
     return;
+  LLVMContext &Context = unwrap(M)->getContext();
   for (unsigned i=0;i<N->getNumOperands();i++)
-    Dest[i] = wrap(N->getOperand(i));
+    Dest[i] = wrap(MetadataAsValue::get(Context, N->getOperand(i)));
 }
 
 void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char* name,
@@ -720,9 +807,9 @@ void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char* name,
   NamedMDNode *N = unwrap(M)->getOrInsertNamedMetadata(name);
   if (!N)
     return;
-  MDNode *Op = Val ? unwrap<MDNode>(Val) : nullptr;
-  if (Op)
-    N->addOperand(Op);
+  if (!Val)
+    return;
+  N->addOperand(extractMDNode(unwrap<MetadataAsValue>(Val)));
 }
 
 /*--.. Operations on scalar constants ......................................--*/
@@ -1543,7 +1630,7 @@ LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee,
   auto *PTy = cast<PointerType>(unwrap(Ty));
   return wrap(GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
                                   GlobalValue::ExternalLinkage, Name,
-                                  unwrap<GlobalObject>(Aliasee), unwrap(M)));
+                                  unwrap<Constant>(Aliasee), unwrap(M)));
 }
 
 /*--.. Operations on functions .............................................--*/
@@ -2092,13 +2179,16 @@ void LLVMDisposeBuilder(LLVMBuilderRef Builder) {
 /*--.. Metadata builders ...................................................--*/
 
 void LLVMSetCurrentDebugLocation(LLVMBuilderRef Builder, LLVMValueRef L) {
-  MDNode *Loc = L ? unwrap<MDNode>(L) : nullptr;
+  MDNode *Loc =
+      L ? cast<MDNode>(unwrap<MetadataAsValue>(L)->getMetadata()) : nullptr;
   unwrap(Builder)->SetCurrentDebugLocation(DebugLoc::getFromDILocation(Loc));
 }
 
 LLVMValueRef LLVMGetCurrentDebugLocation(LLVMBuilderRef Builder) {
-  return wrap(unwrap(Builder)->getCurrentDebugLocation()
-              .getAsMDNode(unwrap(Builder)->getContext()));
+  LLVMContext &Context = unwrap(Builder)->getContext();
+  return wrap(MetadataAsValue::get(
+      Context,
+      unwrap(Builder)->getCurrentDebugLocation().getAsMDNode(Context)));
 }
 
 void LLVMSetInstDebugLocation(LLVMBuilderRef Builder, LLVMValueRef Inst) {
@@ -2755,11 +2845,11 @@ LLVMPassRegistryRef LLVMGetGlobalPassRegistry(void) {
 /*===-- Pass Manager ------------------------------------------------------===*/
 
 LLVMPassManagerRef LLVMCreatePassManager() {
-  return wrap(new PassManager());
+  return wrap(new legacy::PassManager());
 }
 
 LLVMPassManagerRef LLVMCreateFunctionPassManagerForModule(LLVMModuleRef M) {
-  return wrap(new FunctionPassManager(unwrap(M)));
+  return wrap(new legacy::FunctionPassManager(unwrap(M)));
 }
 
 LLVMPassManagerRef LLVMCreateFunctionPassManager(LLVMModuleProviderRef P) {
@@ -2768,19 +2858,19 @@ LLVMPassManagerRef LLVMCreateFunctionPassManager(LLVMModuleProviderRef P) {
 }
 
 LLVMBool LLVMRunPassManager(LLVMPassManagerRef PM, LLVMModuleRef M) {
-  return unwrap<PassManager>(PM)->run(*unwrap(M));
+  return unwrap<legacy::PassManager>(PM)->run(*unwrap(M));
 }
 
 LLVMBool LLVMInitializeFunctionPassManager(LLVMPassManagerRef FPM) {
-  return unwrap<FunctionPassManager>(FPM)->doInitialization();
+  return unwrap<legacy::FunctionPassManager>(FPM)->doInitialization();
 }
 
 LLVMBool LLVMRunFunctionPassManager(LLVMPassManagerRef FPM, LLVMValueRef F) {
-  return unwrap<FunctionPassManager>(FPM)->run(*unwrap<Function>(F));
+  return unwrap<legacy::FunctionPassManager>(FPM)->run(*unwrap<Function>(F));
 }
 
 LLVMBool LLVMFinalizeFunctionPassManager(LLVMPassManagerRef FPM) {
-  return unwrap<FunctionPassManager>(FPM)->doFinalization();
+  return unwrap<legacy::FunctionPassManager>(FPM)->doFinalization();
 }
 
 void LLVMDisposePassManager(LLVMPassManagerRef PM) {
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index 4fe2be6..2cb27ca 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -25,15 +25,24 @@ using namespace llvm::dwarf;
 
 namespace {
 class HeaderBuilder {
+  /// \brief Whether there are any fields yet.
+  ///
+  /// Note that this is not equivalent to \c Chars.empty(), since \a concat()
+  /// may have been called already with an empty string.
+  bool IsEmpty;
   SmallVector<char, 256> Chars;
 
 public:
-  explicit HeaderBuilder(Twine T) { T.toVector(Chars); }
-  HeaderBuilder(const HeaderBuilder &X) : Chars(X.Chars) {}
-  HeaderBuilder(HeaderBuilder &&X) : Chars(std::move(X.Chars)) {}
+  HeaderBuilder() : IsEmpty(true) {}
+  HeaderBuilder(const HeaderBuilder &X) : IsEmpty(X.IsEmpty), Chars(X.Chars) {}
+  HeaderBuilder(HeaderBuilder &&X)
+      : IsEmpty(X.IsEmpty), Chars(std::move(X.Chars)) {}
 
   template <class Twineable> HeaderBuilder &concat(Twineable &&X) {
-    Chars.push_back(0);
+    if (IsEmpty)
+      IsEmpty = false;
+    else
+      Chars.push_back(0);
     Twine(X).toVector(Chars);
     return *this;
   }
@@ -43,26 +52,37 @@ public:
   }
 
   static HeaderBuilder get(unsigned Tag) {
-    return HeaderBuilder("0x" + Twine::utohexstr(Tag));
+    return HeaderBuilder().concat("0x" + Twine::utohexstr(Tag));
   }
 };
 }
 
-DIBuilder::DIBuilder(Module &m)
+DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes)
     : M(m), VMContext(M.getContext()), TempEnumTypes(nullptr),
       TempRetainTypes(nullptr), TempSubprograms(nullptr), TempGVs(nullptr),
-      DeclareFn(nullptr), ValueFn(nullptr) {}
+      DeclareFn(nullptr), ValueFn(nullptr),
+      AllowUnresolvedNodes(AllowUnresolvedNodes) {}
+
+void DIBuilder::trackIfUnresolved(MDNode *N) {
+  if (!N)
+    return;
+  if (N->isResolved())
+    return;
+
+  assert(AllowUnresolvedNodes && "Cannot handle unresolved nodes");
+  UnresolvedNodes.emplace_back(N);
+}
 
 void DIBuilder::finalize() {
   DIArray Enums = getOrCreateArray(AllEnumTypes);
   DIType(TempEnumTypes).replaceAllUsesWith(Enums);
 
-  SmallVector<Value *, 16> RetainValues;
+  SmallVector<Metadata *, 16> RetainValues;
   // Declarations and definitions of the same type may be retained. Some
   // clients RAUW these pairs, leaving duplicates in the retained types
   // list. Use a set to remove the duplicates while we transform the
   // TrackingVHs back into Values.
-  SmallPtrSet<Value *, 16> RetainSet;
+  SmallPtrSet<Metadata *, 16> RetainSet;
   for (unsigned I = 0, E = AllRetainTypes.size(); I < E; I++)
     if (RetainSet.insert(AllRetainTypes[I]).second)
       RetainValues.push_back(AllRetainTypes[I]);
@@ -74,9 +94,8 @@ void DIBuilder::finalize() {
   for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
     DISubprogram SP(SPs.getElement(i));
     if (MDNode *Temp = SP.getVariablesNodes()) {
-      SmallVector<Value *, 4> Variables;
-      for (Value *V : PreservedVariables.lookup(SP))
-        Variables.push_back(V);
+      const auto &PV = PreservedVariables.lookup(SP);
+      SmallVector<Metadata *, 4> Variables(PV.begin(), PV.end());
       DIArray AV = getOrCreateArray(Variables);
       DIType(Temp).replaceAllUsesWith(AV);
     }
@@ -85,11 +104,20 @@ void DIBuilder::finalize() {
   DIArray GVs = getOrCreateArray(AllGVs);
   DIType(TempGVs).replaceAllUsesWith(GVs);
 
-  SmallVector<Value *, 16> RetainValuesI;
-  for (unsigned I = 0, E = AllImportedModules.size(); I < E; I++)
-    RetainValuesI.push_back(AllImportedModules[I]);
+  SmallVector<Metadata *, 16> RetainValuesI(AllImportedModules.begin(),
+                                            AllImportedModules.end());
   DIArray IMs = getOrCreateArray(RetainValuesI);
   DIType(TempImportedModules).replaceAllUsesWith(IMs);
+
+  // Now that all temp nodes have been replaced or deleted, resolve remaining
+  // cycles.
+  for (const auto &N : UnresolvedNodes)
+    if (N && !N->isResolved())
+      N->resolveCycles();
+  UnresolvedNodes.clear();
+
+  // Can't handle unresolved nodes anymore.
+  AllowUnresolvedNodes = false;
 }
 
 /// If N is compile unit return NULL otherwise return N.
@@ -102,10 +130,8 @@ static MDNode *getNonCompileUnitScope(MDNode *N) {
 static MDNode *createFilePathPair(LLVMContext &VMContext, StringRef Filename,
                                   StringRef Directory) {
   assert(!Filename.empty() && "Unable to create file without name");
-  Value *Pair[] = {
-    MDString::get(VMContext, Filename),
-    MDString::get(VMContext, Directory)
-  };
+  Metadata *Pair[] = {MDString::get(VMContext, Filename),
+                      MDString::get(VMContext, Directory)};
   return MDNode::get(VMContext, Pair);
 }
 
@@ -117,35 +143,35 @@ DICompileUnit DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
                                            DebugEmissionKind Kind,
                                            bool EmitDebugInfo) {
 
-  assert(((Lang <= dwarf::DW_LANG_OCaml && Lang >= dwarf::DW_LANG_C89) ||
+  assert(((Lang <= dwarf::DW_LANG_Fortran08 && Lang >= dwarf::DW_LANG_C89) ||
           (Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) &&
          "Invalid Language tag");
   assert(!Filename.empty() &&
          "Unable to create compile unit without filename");
-  Value *TElts[] = {HeaderBuilder::get(DW_TAG_base_type).get(VMContext)};
-  TempEnumTypes = MDNode::getTemporary(VMContext, TElts);
-
-  TempRetainTypes = MDNode::getTemporary(VMContext, TElts);
-
-  TempSubprograms = MDNode::getTemporary(VMContext, TElts);
-
-  TempGVs = MDNode::getTemporary(VMContext, TElts);
-
-  TempImportedModules = MDNode::getTemporary(VMContext, TElts);
-
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_compile_unit)
-                       .concat(Lang)
-                       .concat(Producer)
-                       .concat(isOptimized)
-                       .concat(Flags)
-                       .concat(RunTimeVer)
-                       .concat(SplitName)
-                       .concat(Kind)
-                       .get(VMContext),
-                   createFilePathPair(VMContext, Filename, Directory),
-                   TempEnumTypes, TempRetainTypes, TempSubprograms, TempGVs,
-                   TempImportedModules};
 
+  // TODO: Once we make MDCompileUnit distinct, stop using temporaries here
+  // (just start with operands assigned to nullptr).
+  TempEnumTypes = MDTuple::getTemporary(VMContext, None).release();
+  TempRetainTypes = MDTuple::getTemporary(VMContext, None).release();
+  TempSubprograms = MDTuple::getTemporary(VMContext, None).release();
+  TempGVs = MDTuple::getTemporary(VMContext, None).release();
+  TempImportedModules = MDTuple::getTemporary(VMContext, None).release();
+
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_compile_unit)
+                          .concat(Lang)
+                          .concat(Producer)
+                          .concat(isOptimized)
+                          .concat(Flags)
+                          .concat(RunTimeVer)
+                          .concat(SplitName)
+                          .concat(Kind)
+                          .get(VMContext),
+                      createFilePathPair(VMContext, Filename, Directory),
+                      TempEnumTypes, TempRetainTypes, TempSubprograms, TempGVs,
+                      TempImportedModules};
+
+  // TODO: Switch to getDistinct().  We never want to merge compile units based
+  // on contents.
   MDNode *CUNode = MDNode::get(VMContext, Elts);
 
   // Create a named metadata so that it is easier to find cu in a module.
@@ -158,20 +184,21 @@ DICompileUnit DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
     NMD->addOperand(CUNode);
   }
 
+  trackIfUnresolved(CUNode);
   return DICompileUnit(CUNode);
 }
 
 static DIImportedEntity
 createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope Context,
-                     Value *NS, unsigned Line, StringRef Name,
-                     SmallVectorImpl<TrackingVH<MDNode>> &AllImportedModules) {
+                     Metadata *NS, unsigned Line, StringRef Name,
+                     SmallVectorImpl<TrackingMDNodeRef> &AllImportedModules) {
   const MDNode *R;
-  Value *Elts[] = {HeaderBuilder::get(Tag).concat(Line).concat(Name).get(C),
-                   Context, NS};
+  Metadata *Elts[] = {HeaderBuilder::get(Tag).concat(Line).concat(Name).get(C),
+                      Context, NS};
   R = MDNode::get(C, Elts);
   DIImportedEntity M(R);
   assert(M.Verify() && "Imported module should be valid");
-  AllImportedModules.push_back(TrackingVH<MDNode>(M));
+  AllImportedModules.emplace_back(M.get());
   return M;
 }
 
@@ -194,7 +221,8 @@ DIImportedEntity DIBuilder::createImportedDeclaration(DIScope Context,
                                                       unsigned Line, StringRef Name) {
   // Make sure to use the unique identifier based metadata reference for
   // types that have one.
-  Value *V = Decl.isType() ? static_cast<Value*>(DIType(Decl).getRef()) : Decl;
+  Metadata *V =
+      Decl.isType() ? static_cast<Metadata *>(DIType(Decl).getRef()) : Decl;
   return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_declaration,
                                 Context, V, Line, Name,
                                 AllImportedModules);
@@ -208,16 +236,18 @@ DIImportedEntity DIBuilder::createImportedDeclaration(DIScope Context,
 }
 
 DIFile DIBuilder::createFile(StringRef Filename, StringRef Directory) {
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_file_type).get(VMContext),
-                   createFilePathPair(VMContext, Filename, Directory)};
+  Metadata *Elts[] = {
+      HeaderBuilder::get(dwarf::DW_TAG_file_type).get(VMContext),
+      createFilePathPair(VMContext, Filename, Directory)};
   return DIFile(MDNode::get(VMContext, Elts));
 }
 
 DIEnumerator DIBuilder::createEnumerator(StringRef Name, int64_t Val) {
   assert(!Name.empty() && "Unable to create enumerator without name");
-  Value *Elts[] = {
-      HeaderBuilder::get(dwarf::DW_TAG_enumerator).concat(Name).concat(Val).get(
-          VMContext)};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_enumerator)
+                          .concat(Name)
+                          .concat(Val)
+                          .get(VMContext)};
   return DIEnumerator(MDNode::get(VMContext, Elts));
 }
 
@@ -225,7 +255,7 @@ DIBasicType DIBuilder::createUnspecifiedType(StringRef Name) {
   assert(!Name.empty() && "Unable to create type without name");
   // Unspecified types are encoded in DIBasicType format. Line number, filename,
   // size, alignment, offset and flags are always empty here.
-  Value *Elts[] = {
+  Metadata *Elts[] = {
       HeaderBuilder::get(dwarf::DW_TAG_unspecified_type)
           .concat(Name)
           .concat(0)
@@ -251,7 +281,7 @@ DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
   assert(!Name.empty() && "Unable to create type without name");
   // Basic types are encoded in DIBasicType format. Line number, filename,
   // offset and flags are always empty here.
-  Value *Elts[] = {
+  Metadata *Elts[] = {
       HeaderBuilder::get(dwarf::DW_TAG_base_type)
           .concat(Name)
           .concat(0) // Line
@@ -269,17 +299,17 @@ DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
 
 DIDerivedType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) {
   // Qualified types are encoded in DIDerivedType format.
-  Value *Elts[] = {HeaderBuilder::get(Tag)
-                       .concat(StringRef()) // Name
-                       .concat(0)           // Line
-                       .concat(0)           // Size
-                       .concat(0)           // Align
-                       .concat(0)           // Offset
-                       .concat(0)           // Flags
-                       .get(VMContext),
-                   nullptr, // Filename
-                   nullptr, // Unused
-                   FromTy.getRef()};
+  Metadata *Elts[] = {HeaderBuilder::get(Tag)
+                          .concat(StringRef()) // Name
+                          .concat(0)           // Line
+                          .concat(0)           // Size
+                          .concat(0)           // Align
+                          .concat(0)           // Offset
+                          .concat(0)           // Flags
+                          .get(VMContext),
+                      nullptr, // Filename
+                      nullptr, // Unused
+                      FromTy.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
@@ -287,68 +317,69 @@ DIDerivedType
 DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits,
                              uint64_t AlignInBits, StringRef Name) {
   // Pointer types are encoded in DIDerivedType format.
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_pointer_type)
-                       .concat(Name)
-                       .concat(0) // Line
-                       .concat(SizeInBits)
-                       .concat(AlignInBits)
-                       .concat(0) // Offset
-                       .concat(0) // Flags
-                       .get(VMContext),
-                   nullptr, // Filename
-                   nullptr, // Unused
-                   PointeeTy.getRef()};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_pointer_type)
+                          .concat(Name)
+                          .concat(0) // Line
+                          .concat(SizeInBits)
+                          .concat(AlignInBits)
+                          .concat(0) // Offset
+                          .concat(0) // Flags
+                          .get(VMContext),
+                      nullptr, // Filename
+                      nullptr, // Unused
+                      PointeeTy.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
-DIDerivedType DIBuilder::createMemberPointerType(DIType PointeeTy,
-                                                 DIType Base) {
+DIDerivedType
+DIBuilder::createMemberPointerType(DIType PointeeTy, DIType Base,
+                                   uint64_t SizeInBits, uint64_t AlignInBits) {
   // Pointer types are encoded in DIDerivedType format.
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_ptr_to_member_type)
-                       .concat(StringRef())
-                       .concat(0) // Line
-                       .concat(0) // Size
-                       .concat(0) // Align
-                       .concat(0) // Offset
-                       .concat(0) // Flags
-                       .get(VMContext),
-                   nullptr, // Filename
-                   nullptr, // Unused
-                   PointeeTy.getRef(), Base.getRef()};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_ptr_to_member_type)
+                          .concat(StringRef())
+                          .concat(0) // Line
+                          .concat(SizeInBits) // Size
+                          .concat(AlignInBits) // Align
+                          .concat(0) // Offset
+                          .concat(0) // Flags
+                          .get(VMContext),
+                      nullptr, // Filename
+                      nullptr, // Unused
+                      PointeeTy.getRef(), Base.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 DIDerivedType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) {
   assert(RTy.isType() && "Unable to create reference type");
   // References are encoded in DIDerivedType format.
-  Value *Elts[] = {HeaderBuilder::get(Tag)
-                       .concat(StringRef()) // Name
-                       .concat(0)           // Line
-                       .concat(0)           // Size
-                       .concat(0)           // Align
-                       .concat(0)           // Offset
-                       .concat(0)           // Flags
-                       .get(VMContext),
-                   nullptr, // Filename
-                   nullptr, // TheCU,
-                   RTy.getRef()};
+  Metadata *Elts[] = {HeaderBuilder::get(Tag)
+                          .concat(StringRef()) // Name
+                          .concat(0)           // Line
+                          .concat(0)           // Size
+                          .concat(0)           // Align
+                          .concat(0)           // Offset
+                          .concat(0)           // Flags
+                          .get(VMContext),
+                      nullptr, // Filename
+                      nullptr, // TheCU,
+                      RTy.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 DIDerivedType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File,
                                        unsigned LineNo, DIDescriptor Context) {
   // typedefs are encoded in DIDerivedType format.
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_typedef)
-                       .concat(Name)
-                       .concat(LineNo)
-                       .concat(0) // Size
-                       .concat(0) // Align
-                       .concat(0) // Offset
-                       .concat(0) // Flags
-                       .get(VMContext),
-                   File.getFileNode(),
-                   DIScope(getNonCompileUnitScope(Context)).getRef(),
-                   Ty.getRef()};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_typedef)
+                          .concat(Name)
+                          .concat(LineNo)
+                          .concat(0) // Size
+                          .concat(0) // Align
+                          .concat(0) // Offset
+                          .concat(0) // Flags
+                          .get(VMContext),
+                      File.getFileNode(),
+                      DIScope(getNonCompileUnitScope(Context)).getRef(),
+                      Ty.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
@@ -356,15 +387,15 @@ DIDerivedType DIBuilder::createFriend(DIType Ty, DIType FriendTy) {
   // typedefs are encoded in DIDerivedType format.
   assert(Ty.isType() && "Invalid type!");
   assert(FriendTy.isType() && "Invalid friend type!");
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_friend)
-                       .concat(StringRef()) // Name
-                       .concat(0)           // Line
-                       .concat(0)           // Size
-                       .concat(0)           // Align
-                       .concat(0)           // Offset
-                       .concat(0)           // Flags
-                       .get(VMContext),
-                   nullptr, Ty.getRef(), FriendTy.getRef()};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_friend)
+                          .concat(StringRef()) // Name
+                          .concat(0)           // Line
+                          .concat(0)           // Size
+                          .concat(0)           // Align
+                          .concat(0)           // Offset
+                          .concat(0)           // Flags
+                          .get(VMContext),
+                      nullptr, Ty.getRef(), FriendTy.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
@@ -373,16 +404,17 @@ DIDerivedType DIBuilder::createInheritance(DIType Ty, DIType BaseTy,
                                            unsigned Flags) {
   assert(Ty.isType() && "Unable to create inheritance");
   // TAG_inheritance is encoded in DIDerivedType format.
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_inheritance)
-                       .concat(StringRef()) // Name
-                       .concat(0)           // Line
-                       .concat(0)           // Size
-                       .concat(0)           // Align
-                       .concat(BaseOffset)
-                       .concat(Flags)
-                       .get(VMContext),
-                   nullptr, Ty.getRef(), BaseTy.getRef()};
-  return DIDerivedType(MDNode::get(VMContext, Elts));
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_inheritance)
+                          .concat(StringRef()) // Name
+                          .concat(0)           // Line
+                          .concat(0)           // Size
+                          .concat(0)           // Align
+                          .concat(BaseOffset)
+                          .concat(Flags)
+                          .get(VMContext),
+                      nullptr, Ty.getRef(), BaseTy.getRef()};
+  auto R = DIDerivedType(MDNode::get(VMContext, Elts));
+  return R;
 }
 
 DIDerivedType DIBuilder::createMemberType(DIDescriptor Scope, StringRef Name,
@@ -392,20 +424,26 @@ DIDerivedType DIBuilder::createMemberType(DIDescriptor Scope, StringRef Name,
                                           uint64_t OffsetInBits, unsigned Flags,
                                           DIType Ty) {
   // TAG_member is encoded in DIDerivedType format.
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_member)
-                       .concat(Name)
-                       .concat(LineNumber)
-                       .concat(SizeInBits)
-                       .concat(AlignInBits)
-                       .concat(OffsetInBits)
-                       .concat(Flags)
-                       .get(VMContext),
-                   File.getFileNode(),
-                   DIScope(getNonCompileUnitScope(Scope)).getRef(),
-                   Ty.getRef()};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_member)
+                          .concat(Name)
+                          .concat(LineNumber)
+                          .concat(SizeInBits)
+                          .concat(AlignInBits)
+                          .concat(OffsetInBits)
+                          .concat(Flags)
+                          .get(VMContext),
+                      File.getFileNode(),
+                      DIScope(getNonCompileUnitScope(Scope)).getRef(),
+                      Ty.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
+static Metadata *getConstantOrNull(Constant *C) {
+  if (C)
+    return ConstantAsMetadata::get(C);
+  return nullptr;
+}
+
 DIDerivedType DIBuilder::createStaticMemberType(DIDescriptor Scope,
                                                 StringRef Name, DIFile File,
                                                 unsigned LineNumber, DIType Ty,
@@ -413,17 +451,17 @@ DIDerivedType DIBuilder::createStaticMemberType(DIDescriptor Scope,
                                                 llvm::Constant *Val) {
   // TAG_member is encoded in DIDerivedType format.
   Flags |= DIDescriptor::FlagStaticMember;
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_member)
-                       .concat(Name)
-                       .concat(LineNumber)
-                       .concat(0) // Size
-                       .concat(0) // Align
-                       .concat(0) // Offset
-                       .concat(Flags)
-                       .get(VMContext),
-                   File.getFileNode(),
-                   DIScope(getNonCompileUnitScope(Scope)).getRef(), Ty.getRef(),
-                   Val};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_member)
+                          .concat(Name)
+                          .concat(LineNumber)
+                          .concat(0) // Size
+                          .concat(0) // Align
+                          .concat(0) // Offset
+                          .concat(Flags)
+                          .get(VMContext),
+                      File.getFileNode(),
+                      DIScope(getNonCompileUnitScope(Scope)).getRef(),
+                      Ty.getRef(), getConstantOrNull(Val)};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
@@ -434,16 +472,16 @@ DIDerivedType DIBuilder::createObjCIVar(StringRef Name, DIFile File,
                                         uint64_t OffsetInBits, unsigned Flags,
                                         DIType Ty, MDNode *PropertyNode) {
   // TAG_member is encoded in DIDerivedType format.
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_member)
-                       .concat(Name)
-                       .concat(LineNumber)
-                       .concat(SizeInBits)
-                       .concat(AlignInBits)
-                       .concat(OffsetInBits)
-                       .concat(Flags)
-                       .get(VMContext),
-                   File.getFileNode(), getNonCompileUnitScope(File), Ty,
-                   PropertyNode};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_member)
+                          .concat(Name)
+                          .concat(LineNumber)
+                          .concat(SizeInBits)
+                          .concat(AlignInBits)
+                          .concat(OffsetInBits)
+                          .concat(Flags)
+                          .get(VMContext),
+                      File.getFileNode(), getNonCompileUnitScope(File), Ty,
+                      PropertyNode};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
@@ -451,69 +489,65 @@ DIObjCProperty
 DIBuilder::createObjCProperty(StringRef Name, DIFile File, unsigned LineNumber,
                               StringRef GetterName, StringRef SetterName,
                               unsigned PropertyAttributes, DIType Ty) {
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_APPLE_property)
-                       .concat(Name)
-                       .concat(LineNumber)
-                       .concat(GetterName)
-                       .concat(SetterName)
-                       .concat(PropertyAttributes)
-                       .get(VMContext),
-                   File, Ty};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_APPLE_property)
+                          .concat(Name)
+                          .concat(LineNumber)
+                          .concat(GetterName)
+                          .concat(SetterName)
+                          .concat(PropertyAttributes)
+                          .get(VMContext),
+                      File, Ty};
   return DIObjCProperty(MDNode::get(VMContext, Elts));
 }
 
 DITemplateTypeParameter
 DIBuilder::createTemplateTypeParameter(DIDescriptor Context, StringRef Name,
-                                       DIType Ty, MDNode *File, unsigned LineNo,
-                                       unsigned ColumnNo) {
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_template_type_parameter)
-                       .concat(Name)
-                       .concat(LineNo)
-                       .concat(ColumnNo)
-                       .get(VMContext),
-                   DIScope(getNonCompileUnitScope(Context)).getRef(),
-                   Ty.getRef(), File};
+                                       DIType Ty) {
+  assert(!DIScope(getNonCompileUnitScope(Context)).getRef() &&
+         "Expected compile unit");
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_template_type_parameter)
+                          .concat(Name)
+                          .concat(0)
+                          .concat(0)
+                          .get(VMContext),
+                      nullptr, Ty.getRef(), nullptr};
   return DITemplateTypeParameter(MDNode::get(VMContext, Elts));
 }
 
-static DITemplateValueParameter createTemplateValueParameterHelper(
-    LLVMContext &VMContext, unsigned Tag, DIDescriptor Context, StringRef Name,
-    DIType Ty, Value *Val, MDNode *File, unsigned LineNo, unsigned ColumnNo) {
-  Value *Elts[] = {
-      HeaderBuilder::get(Tag).concat(Name).concat(LineNo).concat(ColumnNo).get(
-          VMContext),
-      DIScope(getNonCompileUnitScope(Context)).getRef(), Ty.getRef(), Val,
-      File};
+static DITemplateValueParameter
+createTemplateValueParameterHelper(LLVMContext &VMContext, unsigned Tag,
+                                   DIDescriptor Context, StringRef Name,
+                                   DIType Ty, Metadata *MD) {
+  assert(!DIScope(getNonCompileUnitScope(Context)).getRef() &&
+         "Expected compile unit");
+  Metadata *Elts[] = {
+      HeaderBuilder::get(Tag).concat(Name).concat(0).concat(0).get(VMContext),
+      nullptr, Ty.getRef(), MD, nullptr};
   return DITemplateValueParameter(MDNode::get(VMContext, Elts));
 }
 
 DITemplateValueParameter
 DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name,
-                                        DIType Ty, Constant *Val, MDNode *File,
-                                        unsigned LineNo, unsigned ColumnNo) {
+                                        DIType Ty, Constant *Val) {
   return createTemplateValueParameterHelper(
-      VMContext, dwarf::DW_TAG_template_value_parameter, Context, Name, Ty, Val,
-      File, LineNo, ColumnNo);
+      VMContext, dwarf::DW_TAG_template_value_parameter, Context, Name, Ty,
+      getConstantOrNull(Val));
 }
 
 DITemplateValueParameter
 DIBuilder::createTemplateTemplateParameter(DIDescriptor Context, StringRef Name,
-                                           DIType Ty, StringRef Val,
-                                           MDNode *File, unsigned LineNo,
-                                           unsigned ColumnNo) {
+                                           DIType Ty, StringRef Val) {
   return createTemplateValueParameterHelper(
       VMContext, dwarf::DW_TAG_GNU_template_template_param, Context, Name, Ty,
-      MDString::get(VMContext, Val), File, LineNo, ColumnNo);
+      MDString::get(VMContext, Val));
 }
 
 DITemplateValueParameter
 DIBuilder::createTemplateParameterPack(DIDescriptor Context, StringRef Name,
-                                       DIType Ty, DIArray Val,
-                                       MDNode *File, unsigned LineNo,
-                                       unsigned ColumnNo) {
+                                       DIType Ty, DIArray Val) {
   return createTemplateValueParameterHelper(
       VMContext, dwarf::DW_TAG_GNU_template_parameter_pack, Context, Name, Ty,
-      Val, File, LineNo, ColumnNo);
+      Val);
 }
 
 DICompositeType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
@@ -529,7 +563,7 @@ DICompositeType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
   assert((!Context || Context.isScope() || Context.isType()) &&
          "createClassType should be called with a valid Context");
   // TAG_class_type is encoded in DICompositeType format.
-  Value *Elts[] = {
+  Metadata *Elts[] = {
       HeaderBuilder::get(dwarf::DW_TAG_class_type)
           .concat(Name)
           .concat(LineNumber)
@@ -548,6 +582,7 @@ DICompositeType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
          "createClassType should return a DICompositeType");
   if (!UniqueIdentifier.empty())
     retainType(R);
+  trackIfUnresolved(R);
   return R;
 }
 
@@ -562,7 +597,7 @@ DICompositeType DIBuilder::createStructType(DIDescriptor Context,
                                             DIType VTableHolder,
                                             StringRef UniqueIdentifier) {
  // TAG_structure_type is encoded in DICompositeType format.
-  Value *Elts[] = {
+  Metadata *Elts[] = {
       HeaderBuilder::get(dwarf::DW_TAG_structure_type)
           .concat(Name)
           .concat(LineNumber)
@@ -581,6 +616,7 @@ DICompositeType DIBuilder::createStructType(DIDescriptor Context,
          "createStructType should return a DICompositeType");
   if (!UniqueIdentifier.empty())
     retainType(R);
+  trackIfUnresolved(R);
   return R;
 }
 
@@ -592,7 +628,7 @@ DICompositeType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
                                            unsigned RunTimeLang,
                                            StringRef UniqueIdentifier) {
   // TAG_union_type is encoded in DICompositeType format.
-  Value *Elts[] = {
+  Metadata *Elts[] = {
       HeaderBuilder::get(dwarf::DW_TAG_union_type)
           .concat(Name)
           .concat(LineNumber)
@@ -609,6 +645,7 @@ DICompositeType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
   DICompositeType R(MDNode::get(VMContext, Elts));
   if (!UniqueIdentifier.empty())
     retainType(R);
+  trackIfUnresolved(R);
   return R;
 }
 
@@ -616,7 +653,7 @@ DISubroutineType DIBuilder::createSubroutineType(DIFile File,
                                                  DITypeArray ParameterTypes,
                                                  unsigned Flags) {
   // TAG_subroutine_type is encoded in DICompositeType format.
-  Value *Elts[] = {
+  Metadata *Elts[] = {
       HeaderBuilder::get(dwarf::DW_TAG_subroutine_type)
           .concat(StringRef())
           .concat(0)     // Line
@@ -637,7 +674,7 @@ DICompositeType DIBuilder::createEnumerationType(
     uint64_t SizeInBits, uint64_t AlignInBits, DIArray Elements,
     DIType UnderlyingType, StringRef UniqueIdentifier) {
   // TAG_enumeration_type is encoded in DICompositeType format.
-  Value *Elts[] = {
+  Metadata *Elts[] = {
       HeaderBuilder::get(dwarf::DW_TAG_enumeration_type)
           .concat(Name)
           .concat(LineNumber)
@@ -655,13 +692,14 @@ DICompositeType DIBuilder::createEnumerationType(
   AllEnumTypes.push_back(CTy);
   if (!UniqueIdentifier.empty())
     retainType(CTy);
+  trackIfUnresolved(CTy);
   return CTy;
 }
 
 DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
                                            DIType Ty, DIArray Subscripts) {
   // TAG_array_type is encoded in DICompositeType format.
-  Value *Elts[] = {
+  Metadata *Elts[] = {
       HeaderBuilder::get(dwarf::DW_TAG_array_type)
           .concat(StringRef())
           .concat(0) // Line
@@ -676,13 +714,15 @@ DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
       Ty.getRef(), Subscripts, nullptr, nullptr,
       nullptr // Type Identifer
   };
-  return DICompositeType(MDNode::get(VMContext, Elts));
+  DICompositeType R(MDNode::get(VMContext, Elts));
+  trackIfUnresolved(R);
+  return R;
 }
 
 DICompositeType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
                                             DIType Ty, DIArray Subscripts) {
   // A vector is an array type with the FlagVector flag applied.
-  Value *Elts[] = {
+  Metadata *Elts[] = {
       HeaderBuilder::get(dwarf::DW_TAG_array_type)
           .concat("")
           .concat(0) // Line
@@ -697,7 +737,9 @@ DICompositeType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
       Ty.getRef(), Subscripts, nullptr, nullptr,
       nullptr // Type Identifer
   };
-  return DICompositeType(MDNode::get(VMContext, Elts));
+  DICompositeType R(MDNode::get(VMContext, Elts));
+  trackIfUnresolved(R);
+  return R;
 }
 
 static HeaderBuilder setTypeFlagsInHeader(StringRef Header,
@@ -710,19 +752,20 @@ static HeaderBuilder setTypeFlagsInHeader(StringRef Header,
     Flags = 0;
   Flags |= FlagsToSet;
 
-  return HeaderBuilder(Twine(I.getPrefix())).concat(Flags).concat(
-      I.getSuffix());
+  return HeaderBuilder()
+      .concat(I.getPrefix())
+      .concat(Flags)
+      .concat(I.getSuffix());
 }
 
 static DIType createTypeWithFlags(LLVMContext &Context, DIType Ty,
                                   unsigned FlagsToSet) {
-  SmallVector<Value *, 9> Elts;
+  SmallVector<Metadata *, 9> Elts;
   MDNode *N = Ty;
   assert(N && "Unexpected input DIType!");
   // Update header field.
   Elts.push_back(setTypeFlagsInHeader(Ty.getHeader(), FlagsToSet).get(Context));
-  for (unsigned I = 1, E = N->getNumOperands(); I != E; ++I)
-    Elts.push_back(N->getOperand(I));
+  Elts.append(N->op_begin() + 1, N->op_end());
 
   return DIType(MDNode::get(Context, Elts));
 }
@@ -740,9 +783,7 @@ DIType DIBuilder::createObjectPointerType(DIType Ty) {
   return createTypeWithFlags(VMContext, Ty, Flags);
 }
 
-void DIBuilder::retainType(DIType T) {
-  AllRetainTypes.push_back(TrackingVH<MDNode>(T));
-}
+void DIBuilder::retainType(DIType T) { AllRetainTypes.emplace_back(T); }
 
 DIBasicType DIBuilder::createUnspecifiedParameter() {
   return DIBasicType();
@@ -754,7 +795,7 @@ DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIDescriptor Scope,
                              uint64_t SizeInBits, uint64_t AlignInBits,
                              StringRef UniqueIdentifier) {
   // Create a temporary MDNode.
-  Value *Elts[] = {
+  Metadata *Elts[] = {
       HeaderBuilder::get(Tag)
           .concat(Name)
           .concat(Line)
@@ -775,22 +816,23 @@ DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIDescriptor Scope,
          "createForwardDecl result should be a DIType");
   if (!UniqueIdentifier.empty())
     retainType(RetTy);
+  trackIfUnresolved(RetTy);
   return RetTy;
 }
 
-DICompositeType DIBuilder::createReplaceableForwardDecl(
+DICompositeType DIBuilder::createReplaceableCompositeType(
     unsigned Tag, StringRef Name, DIDescriptor Scope, DIFile F, unsigned Line,
     unsigned RuntimeLang, uint64_t SizeInBits, uint64_t AlignInBits,
-    StringRef UniqueIdentifier) {
+    unsigned Flags, StringRef UniqueIdentifier) {
   // Create a temporary MDNode.
-  Value *Elts[] = {
+  Metadata *Elts[] = {
       HeaderBuilder::get(Tag)
           .concat(Name)
           .concat(Line)
           .concat(SizeInBits)
           .concat(AlignInBits)
           .concat(0) // Offset
-          .concat(DIDescriptor::FlagFwdDecl)
+          .concat(Flags)
           .concat(RuntimeLang)
           .get(VMContext),
       F.getFileNode(), DIScope(getNonCompileUnitScope(Scope)).getRef(), nullptr,
@@ -798,21 +840,21 @@ DICompositeType DIBuilder::createReplaceableForwardDecl(
       nullptr, // TemplateParams
       UniqueIdentifier.empty() ? nullptr
                                : MDString::get(VMContext, UniqueIdentifier)};
-  MDNode *Node = MDNode::getTemporary(VMContext, Elts);
-  DICompositeType RetTy(Node);
+  DICompositeType RetTy(MDNode::getTemporary(VMContext, Elts).release());
   assert(RetTy.isCompositeType() &&
          "createReplaceableForwardDecl result should be a DIType");
   if (!UniqueIdentifier.empty())
     retainType(RetTy);
+  trackIfUnresolved(RetTy);
   return RetTy;
 }
 
-DIArray DIBuilder::getOrCreateArray(ArrayRef<Value *> Elements) {
+DIArray DIBuilder::getOrCreateArray(ArrayRef<Metadata *> Elements) {
   return DIArray(MDNode::get(VMContext, Elements));
 }
 
-DITypeArray DIBuilder::getOrCreateTypeArray(ArrayRef<Value *> Elements) {
-  SmallVector<llvm::Value *, 16> Elts; 
+DITypeArray DIBuilder::getOrCreateTypeArray(ArrayRef<Metadata *> Elements) {
+  SmallVector<llvm::Metadata *, 16> Elts;
   for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
     if (Elements[i] && isa<MDNode>(Elements[i]))
       Elts.push_back(DIType(cast<MDNode>(Elements[i])).getRef());
@@ -823,10 +865,10 @@ DITypeArray DIBuilder::getOrCreateTypeArray(ArrayRef<Value *> Elements) {
 }
 
 DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Count) {
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_subrange_type)
-                       .concat(Lo)
-                       .concat(Count)
-                       .get(VMContext)};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_subrange_type)
+                          .concat(Lo)
+                          .concat(Count)
+                          .get(VMContext)};
 
   return DISubrange(MDNode::get(VMContext, Elts));
 }
@@ -835,17 +877,24 @@ static DIGlobalVariable createGlobalVariableHelper(
     LLVMContext &VMContext, DIDescriptor Context, StringRef Name,
     StringRef LinkageName, DIFile F, unsigned LineNumber, DITypeRef Ty,
     bool isLocalToUnit, Constant *Val, MDNode *Decl, bool isDefinition,
-    std::function<MDNode *(ArrayRef<Value *>)> CreateFunc) {
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_variable)
-                       .concat(Name)
-                       .concat(Name)
-                       .concat(LinkageName)
-                       .concat(LineNumber)
-                       .concat(isLocalToUnit)
-                       .concat(isDefinition)
-                       .get(VMContext),
-                   DIScope(getNonCompileUnitScope(Context)).getRef(), F, Ty, Val,
-                   DIDescriptor(Decl)};
+    std::function<MDNode *(ArrayRef<Metadata *>)> CreateFunc) {
+
+  MDNode *TheCtx = getNonCompileUnitScope(Context);
+  if (DIScope(TheCtx).isCompositeType()) {
+    assert(!DICompositeType(TheCtx).getIdentifier() &&
+           "Context of a global variable should not be a type with identifier");
+  }
+
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_variable)
+                          .concat(Name)
+                          .concat(Name)
+                          .concat(LinkageName)
+                          .concat(LineNumber)
+                          .concat(isLocalToUnit)
+                          .concat(isDefinition)
+                          .get(VMContext),
+                      TheCtx, F, Ty, getConstantOrNull(Val),
+                      DIDescriptor(Decl)};
 
   return DIGlobalVariable(CreateFunc(Elts));
 }
@@ -854,13 +903,13 @@ DIGlobalVariable DIBuilder::createGlobalVariable(
     DIDescriptor Context, StringRef Name, StringRef LinkageName, DIFile F,
     unsigned LineNumber, DITypeRef Ty, bool isLocalToUnit, Constant *Val,
     MDNode *Decl) {
-  return createGlobalVariableHelper(VMContext, Context, Name, LinkageName, F,
-                                    LineNumber, Ty, isLocalToUnit, Val, Decl, true,
-                                    [&] (ArrayRef<Value *> Elts) -> MDNode * {
-                                      MDNode *Node = MDNode::get(VMContext, Elts);
-                                      AllGVs.push_back(Node);
-                                      return Node;
-                                    });
+  return createGlobalVariableHelper(
+      VMContext, Context, Name, LinkageName, F, LineNumber, Ty, isLocalToUnit,
+      Val, Decl, true, [&](ArrayRef<Metadata *> Elts) -> MDNode *{
+        MDNode *Node = MDNode::get(VMContext, Elts);
+        AllGVs.push_back(Node);
+        return Node;
+      });
 }
 
 DIGlobalVariable DIBuilder::createTempGlobalVariableFwdDecl(
@@ -868,10 +917,10 @@ DIGlobalVariable DIBuilder::createTempGlobalVariableFwdDecl(
     unsigned LineNumber, DITypeRef Ty, bool isLocalToUnit, Constant *Val,
     MDNode *Decl) {
   return createGlobalVariableHelper(VMContext, Context, Name, LinkageName, F,
-                                    LineNumber, Ty, isLocalToUnit, Val, Decl, false,
-                                    [&] (ArrayRef<Value *> Elts) {
-                                      return MDNode::getTemporary(VMContext, Elts);
-                                    });
+                                    LineNumber, Ty, isLocalToUnit, Val, Decl,
+                                    false, [&](ArrayRef<Metadata *> Elts) {
+    return MDNode::getTemporary(VMContext, Elts).release();
+  });
 }
 
 DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
@@ -882,12 +931,12 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
   DIDescriptor Context(getNonCompileUnitScope(Scope));
   assert((!Context || Context.isScope()) &&
          "createLocalVariable should be called with a valid Context");
-  Value *Elts[] = {HeaderBuilder::get(Tag)
-                       .concat(Name)
-                       .concat(LineNo | (ArgNo << 24))
-                       .concat(Flags)
-                       .get(VMContext),
-                   getNonCompileUnitScope(Scope), File, Ty};
+  Metadata *Elts[] = {HeaderBuilder::get(Tag)
+                          .concat(Name)
+                          .concat(LineNo | (ArgNo << 24))
+                          .concat(Flags)
+                          .get(VMContext),
+                      getNonCompileUnitScope(Scope), File, Ty};
   MDNode *Node = MDNode::get(VMContext, Elts);
   if (AlwaysPreserve) {
     // The optimizer may remove local variable. If there is an interest
@@ -895,7 +944,7 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
     // named mdnode.
     DISubprogram Fn(getDISubprogram(Scope));
     assert(Fn && "Missing subprogram for local variable");
-    PreservedVariables[Fn].push_back(Node);
+    PreservedVariables[Fn].emplace_back(Node);
   }
   DIVariable RetVar(Node);
   assert(RetVar.isVariable() &&
@@ -903,17 +952,23 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
   return RetVar;
 }
 
-DIExpression DIBuilder::createExpression(ArrayRef<int64_t> Addr) {
+DIExpression DIBuilder::createExpression(ArrayRef<uint64_t> Addr) {
   auto Header = HeaderBuilder::get(DW_TAG_expression);
-  for (int64_t I : Addr)
+  for (uint64_t I : Addr)
     Header.concat(I);
-  Value *Elts[] = {Header.get(VMContext)};
+  Metadata *Elts[] = {Header.get(VMContext)};
   return DIExpression(MDNode::get(VMContext, Elts));
 }
 
-DIExpression DIBuilder::createPieceExpression(unsigned OffsetInBytes,
-                                              unsigned SizeInBytes) {
-  int64_t Addr[] = {dwarf::DW_OP_piece, OffsetInBytes, SizeInBytes};
+DIExpression DIBuilder::createExpression(ArrayRef<int64_t> Signed) {
+  // TODO: Remove the callers of this signed version and delete.
+  SmallVector<uint64_t, 8> Addr(Signed.begin(), Signed.end());
+  return createExpression(Addr);
+}
+
+DIExpression DIBuilder::createBitPieceExpression(unsigned OffsetInBits,
+                                                 unsigned SizeInBits) {
+  int64_t Addr[] = {dwarf::DW_OP_bit_piece, OffsetInBits, SizeInBits};
   return createExpression(Addr);
 }
 
@@ -932,31 +987,30 @@ DISubprogram DIBuilder::createFunction(DIScopeRef Context, StringRef Name,
                         Flags, isOptimized, Fn, TParams, Decl);
 }
 
-static DISubprogram
-createFunctionHelper(LLVMContext &VMContext, DIDescriptor Context, StringRef Name,
-                     StringRef LinkageName, DIFile File, unsigned LineNo,
-                     DICompositeType Ty, bool isLocalToUnit, bool isDefinition,
-                     unsigned ScopeLine, unsigned Flags, bool isOptimized,
-                     Function *Fn, MDNode *TParams, MDNode *Decl, MDNode *Vars,
-                     std::function<MDNode *(ArrayRef<Value *>)> CreateFunc) {
+static DISubprogram createFunctionHelper(
+    LLVMContext &VMContext, DIDescriptor Context, StringRef Name,
+    StringRef LinkageName, DIFile File, unsigned LineNo, DICompositeType Ty,
+    bool isLocalToUnit, bool isDefinition, unsigned ScopeLine, unsigned Flags,
+    bool isOptimized, Function *Fn, MDNode *TParams, MDNode *Decl, MDNode *Vars,
+    std::function<MDNode *(ArrayRef<Metadata *>)> CreateFunc) {
   assert(Ty.getTag() == dwarf::DW_TAG_subroutine_type &&
          "function types should be subroutines");
-  Value *Elts[] = {
-      HeaderBuilder::get(dwarf::DW_TAG_subprogram)
-          .concat(Name)
-          .concat(Name)
-          .concat(LinkageName)
-          .concat(LineNo)
-          .concat(isLocalToUnit)
-          .concat(isDefinition)
-          .concat(0)
-          .concat(0)
-          .concat(Flags)
-          .concat(isOptimized)
-          .concat(ScopeLine)
-          .get(VMContext),
-      File.getFileNode(), DIScope(getNonCompileUnitScope(Context)).getRef(), Ty,
-      nullptr, Fn, TParams, Decl, Vars};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_subprogram)
+                          .concat(Name)
+                          .concat(Name)
+                          .concat(LinkageName)
+                          .concat(LineNo)
+                          .concat(isLocalToUnit)
+                          .concat(isDefinition)
+                          .concat(0)
+                          .concat(0)
+                          .concat(Flags)
+                          .concat(isOptimized)
+                          .concat(ScopeLine)
+                          .get(VMContext),
+                      File.getFileNode(),
+                      DIScope(getNonCompileUnitScope(Context)).getRef(), Ty,
+                      nullptr, getConstantOrNull(Fn), TParams, Decl, Vars};
 
   DISubprogram S(CreateFunc(Elts));
   assert(S.isSubprogram() &&
@@ -973,17 +1027,18 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context, StringRef Name,
                                        bool isOptimized, Function *Fn,
                                        MDNode *TParams, MDNode *Decl) {
   return createFunctionHelper(VMContext, Context, Name, LinkageName, File,
-                              LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine,
-                              Flags, isOptimized, Fn, TParams, Decl,
-                              MDNode::getTemporary(VMContext, None),
-                              [&] (ArrayRef<Value *> Elts) -> MDNode *{
-                                MDNode *Node = MDNode::get(VMContext, Elts);
-                                // Create a named metadata so that we
-                                // do not lose this mdnode.
-                                if (isDefinition)
-                                  AllSubprograms.push_back(Node);
-                                return Node;
-                              });
+                              LineNo, Ty, isLocalToUnit, isDefinition,
+                              ScopeLine, Flags, isOptimized, Fn, TParams, Decl,
+                              MDNode::getTemporary(VMContext, None).release(),
+                              [&](ArrayRef<Metadata *> Elts) -> MDNode *{
+    MDNode *Node = MDNode::get(VMContext, Elts);
+    // Create a named metadata so that we
+    // do not lose this mdnode.
+    if (isDefinition)
+      AllSubprograms.push_back(Node);
+    trackIfUnresolved(Node);
+    return Node;
+  });
 }
 
 DISubprogram
@@ -995,11 +1050,11 @@ DIBuilder::createTempFunctionFwdDecl(DIDescriptor Context, StringRef Name,
                                      bool isOptimized, Function *Fn,
                                      MDNode *TParams, MDNode *Decl) {
   return createFunctionHelper(VMContext, Context, Name, LinkageName, File,
-                              LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine,
-                              Flags, isOptimized, Fn, TParams, Decl, nullptr,
-                              [&] (ArrayRef<Value *> Elts) {
-                                return MDNode::getTemporary(VMContext, Elts);
-                              });
+                              LineNo, Ty, isLocalToUnit, isDefinition,
+                              ScopeLine, Flags, isOptimized, Fn, TParams, Decl,
+                              nullptr, [&](ArrayRef<Metadata *> Elts) {
+    return MDNode::getTemporary(VMContext, Elts).release();
+  });
 }
 
 DISubprogram DIBuilder::createMethod(DIDescriptor Context, StringRef Name,
@@ -1015,37 +1070,39 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context, StringRef Name,
   assert(getNonCompileUnitScope(Context) &&
          "Methods should have both a Context and a context that isn't "
          "the compile unit.");
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_subprogram)
-                       .concat(Name)
-                       .concat(Name)
-                       .concat(LinkageName)
-                       .concat(LineNo)
-                       .concat(isLocalToUnit)
-                       .concat(isDefinition)
-                       .concat(VK)
-                       .concat(VIndex)
-                       .concat(Flags)
-                       .concat(isOptimized)
-                       .concat(LineNo)
-                       // FIXME: Do we want to use different scope/lines?
-                       .get(VMContext),
-                   F.getFileNode(), DIScope(Context).getRef(), Ty,
-                   VTableHolder.getRef(), Fn, TParam, nullptr, nullptr};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_subprogram)
+                          .concat(Name)
+                          .concat(Name)
+                          .concat(LinkageName)
+                          .concat(LineNo)
+                          .concat(isLocalToUnit)
+                          .concat(isDefinition)
+                          .concat(VK)
+                          .concat(VIndex)
+                          .concat(Flags)
+                          .concat(isOptimized)
+                          .concat(LineNo)
+                          // FIXME: Do we want to use different scope/lines?
+                          .get(VMContext),
+                      F.getFileNode(), DIScope(Context).getRef(), Ty,
+                      VTableHolder.getRef(), getConstantOrNull(Fn), TParam,
+                      nullptr, nullptr};
   MDNode *Node = MDNode::get(VMContext, Elts);
   if (isDefinition)
     AllSubprograms.push_back(Node);
   DISubprogram S(Node);
   assert(S.isSubprogram() && "createMethod should return a valid DISubprogram");
+  trackIfUnresolved(S);
   return S;
 }
 
 DINameSpace DIBuilder::createNameSpace(DIDescriptor Scope, StringRef Name,
                                        DIFile File, unsigned LineNo) {
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_namespace)
-                       .concat(Name)
-                       .concat(LineNo)
-                       .get(VMContext),
-                   File.getFileNode(), getNonCompileUnitScope(Scope)};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_namespace)
+                          .concat(Name)
+                          .concat(LineNo)
+                          .get(VMContext),
+                      File.getFileNode(), getNonCompileUnitScope(Scope)};
   DINameSpace R(MDNode::get(VMContext, Elts));
   assert(R.Verify() &&
          "createNameSpace should return a verifiable DINameSpace");
@@ -1055,10 +1112,10 @@ DINameSpace DIBuilder::createNameSpace(DIDescriptor Scope, StringRef Name,
 DILexicalBlockFile DIBuilder::createLexicalBlockFile(DIDescriptor Scope,
                                                      DIFile File,
                                                      unsigned Discriminator) {
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_lexical_block)
-                       .concat(Discriminator)
-                       .get(VMContext),
-                   File.getFileNode(), Scope};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_lexical_block)
+                          .concat(Discriminator)
+                          .get(VMContext),
+                      File.getFileNode(), Scope};
   DILexicalBlockFile R(MDNode::get(VMContext, Elts));
   assert(
       R.Verify() &&
@@ -1077,41 +1134,52 @@ DILexicalBlock DIBuilder::createLexicalBlock(DIDescriptor Scope, DIFile File,
 
   // Defeat MDNode uniquing for lexical blocks by using unique id.
   static unsigned int unique_id = 0;
-  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_lexical_block)
-                       .concat(Line)
-                       .concat(Col)
-                       .concat(unique_id++)
-                       .get(VMContext),
-                   File.getFileNode(), getNonCompileUnitScope(Scope)};
+  Metadata *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_lexical_block)
+                          .concat(Line)
+                          .concat(Col)
+                          .concat(unique_id++)
+                          .get(VMContext),
+                      File.getFileNode(), getNonCompileUnitScope(Scope)};
   DILexicalBlock R(MDNode::get(VMContext, Elts));
   assert(R.Verify() &&
          "createLexicalBlock should return a verifiable DILexicalBlock");
   return R;
 }
 
+static Value *getDbgIntrinsicValueImpl(LLVMContext &VMContext, Value *V) {
+  assert(V && "no value passed to dbg intrinsic");
+  return MetadataAsValue::get(VMContext, ValueAsMetadata::get(V));
+}
+
 Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo,
                                       DIExpression Expr,
                                       Instruction *InsertBefore) {
-  assert(Storage && "no storage passed to dbg.declare");
   assert(VarInfo.isVariable() &&
          "empty or invalid DIVariable passed to dbg.declare");
   if (!DeclareFn)
     DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
 
-  Value *Args[] = {MDNode::get(Storage->getContext(), Storage), VarInfo, Expr};
+  trackIfUnresolved(VarInfo);
+  trackIfUnresolved(Expr);
+  Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, Storage),
+                   MetadataAsValue::get(VMContext, VarInfo),
+                   MetadataAsValue::get(VMContext, Expr)};
   return CallInst::Create(DeclareFn, Args, "", InsertBefore);
 }
 
 Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo,
                                       DIExpression Expr,
                                       BasicBlock *InsertAtEnd) {
-  assert(Storage && "no storage passed to dbg.declare");
   assert(VarInfo.isVariable() &&
          "empty or invalid DIVariable passed to dbg.declare");
   if (!DeclareFn)
     DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
 
-  Value *Args[] = {MDNode::get(Storage->getContext(), Storage), VarInfo, Expr};
+  trackIfUnresolved(VarInfo);
+  trackIfUnresolved(Expr);
+  Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, Storage),
+                   MetadataAsValue::get(VMContext, VarInfo),
+                   MetadataAsValue::get(VMContext, Expr)};
 
   // If this block already has a terminator then insert this intrinsic
   // before the terminator.
@@ -1131,9 +1199,12 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset,
   if (!ValueFn)
     ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
 
-  Value *Args[] = {MDNode::get(V->getContext(), V),
-                   ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset),
-                   VarInfo, Expr};
+  trackIfUnresolved(VarInfo);
+  trackIfUnresolved(Expr);
+  Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, V),
+                   ConstantInt::get(Type::getInt64Ty(VMContext), Offset),
+                   MetadataAsValue::get(VMContext, VarInfo),
+                   MetadataAsValue::get(VMContext, Expr)};
   return CallInst::Create(ValueFn, Args, "", InsertBefore);
 }
 
@@ -1147,8 +1218,43 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset,
   if (!ValueFn)
     ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
 
-  Value *Args[] = {MDNode::get(V->getContext(), V),
-                   ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset),
-                   VarInfo, Expr};
+  trackIfUnresolved(VarInfo);
+  trackIfUnresolved(Expr);
+  Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, V),
+                   ConstantInt::get(Type::getInt64Ty(VMContext), Offset),
+                   MetadataAsValue::get(VMContext, VarInfo),
+                   MetadataAsValue::get(VMContext, Expr)};
   return CallInst::Create(ValueFn, Args, "", InsertAtEnd);
 }
+
+void DIBuilder::replaceVTableHolder(DICompositeType &T, DICompositeType VTableHolder) {
+  T.setContainingType(VTableHolder);
+
+  // If this didn't create a self-reference, just return.
+  if (T != VTableHolder)
+    return;
+
+  // Look for unresolved operands.  T will drop RAUW support, orphaning any
+  // cycles underneath it.
+  if (T->isResolved())
+    for (const MDOperand &O : T->operands())
+      if (auto *N = dyn_cast_or_null<MDNode>(O))
+        trackIfUnresolved(N);
+}
+
+void DIBuilder::replaceArrays(DICompositeType &T, DIArray Elements,
+                              DIArray TParams) {
+  T.setArrays(Elements, TParams);
+
+  // If T isn't resolved, there's no problem.
+  if (!T->isResolved())
+    return;
+
+  // If "T" is resolved, it may be due to a self-reference cycle.  Track the
+  // arrays explicitly if they're unresolved, or else the cycles will be
+  // orphaned.
+  if (Elements)
+    trackIfUnresolved(Elements);
+  if (TParams)
+    trackIfUnresolved(TParams);
+}
diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp
index 8a057f5..9c1dee0 100644
--- a/lib/IR/DataLayout.cpp
+++ b/lib/IR/DataLayout.cpp
@@ -197,8 +197,10 @@ void DataLayout::reset(StringRef Desc) {
 static std::pair<StringRef, StringRef> split(StringRef Str, char Separator) {
   assert(!Str.empty() && "parse error, string can't be empty here");
   std::pair<StringRef, StringRef> Split = Str.split(Separator);
-  assert((!Split.second.empty() || Split.first == Str) &&
-         "a trailing separator is not allowed");
+  if (Split.second.empty() && Split.first != Str)
+    report_fatal_error("Trailing separator in datalayout string");
+  if (!Split.second.empty() && Split.first.empty())
+    report_fatal_error("Expected token before separator in datalayout string");
   return Split;
 }
 
@@ -213,7 +215,8 @@ static unsigned getInt(StringRef R) {
 
 /// Convert bits into bytes. Assert if not a byte width multiple.
 static unsigned inBytes(unsigned Bits) {
-  assert(Bits % 8 == 0 && "number of bits must be a byte width multiple");
+  if (Bits % 8)
+    report_fatal_error("number of bits must be a byte width multiple");
   return Bits / 8;
 }
 
@@ -247,14 +250,20 @@ void DataLayout::parseSpecifier(StringRef Desc) {
     case 'p': {
       // Address space.
       unsigned AddrSpace = Tok.empty() ? 0 : getInt(Tok);
-      assert(AddrSpace < 1 << 24 &&
-             "Invalid address space, must be a 24bit integer");
+      if (!isUInt<24>(AddrSpace))
+        report_fatal_error("Invalid address space, must be a 24bit integer");
 
       // Size.
+      if (Rest.empty())
+        report_fatal_error(
+            "Missing size specification for pointer in datalayout string");
       Split = split(Rest, ':');
       unsigned PointerMemSize = inBytes(getInt(Tok));
 
       // ABI alignment.
+      if (Rest.empty())
+        report_fatal_error(
+            "Missing alignment specification for pointer in datalayout string");
       Split = split(Rest, ':');
       unsigned PointerABIAlign = inBytes(getInt(Tok));
 
@@ -285,10 +294,14 @@ void DataLayout::parseSpecifier(StringRef Desc) {
       // Bit size.
       unsigned Size = Tok.empty() ? 0 : getInt(Tok);
 
-      assert((AlignType != AGGREGATE_ALIGN || Size == 0) &&
-             "These specifications don't have a size");
+      if (AlignType == AGGREGATE_ALIGN && Size != 0)
+        report_fatal_error(
+            "Sized aggregate specification in datalayout string");
 
       // ABI alignment.
+      if (Rest.empty())
+        report_fatal_error(
+            "Missing alignment specification in datalayout string");
       Split = split(Rest, ':');
       unsigned ABIAlign = inBytes(getInt(Tok));
 
@@ -306,7 +319,9 @@ void DataLayout::parseSpecifier(StringRef Desc) {
     case 'n':  // Native integer types.
       for (;;) {
         unsigned Width = getInt(Tok);
-        assert(Width != 0 && "width must be non-zero");
+        if (Width == 0)
+          report_fatal_error(
+              "Zero width native integer type in datalayout string");
         LegalIntWidths.push_back(Width);
         if (Rest.empty())
           break;
@@ -318,11 +333,15 @@ void DataLayout::parseSpecifier(StringRef Desc) {
       break;
     }
     case 'm':
-      assert(Tok.empty());
-      assert(Rest.size() == 1);
+      if (!Tok.empty())
+        report_fatal_error("Unexpected trailing characters after mangling specifier in datalayout string");
+      if (Rest.empty())
+        report_fatal_error("Expected mangling specifier in datalayout string");
+      if (Rest.size() > 1)
+        report_fatal_error("Unknown mangling specifier in datalayout string");
       switch(Rest[0]) {
       default:
-        llvm_unreachable("Unknown mangling in datalayout string");
+        report_fatal_error("Unknown mangling in datalayout string");
       case 'e':
         ManglingMode = MM_ELF;
         break;
@@ -338,7 +357,7 @@ void DataLayout::parseSpecifier(StringRef Desc) {
       }
       break;
     default:
-      llvm_unreachable("Unknown specifier in datalayout string");
+      report_fatal_error("Unknown specifier in datalayout string");
       break;
     }
   }
@@ -369,9 +388,17 @@ bool DataLayout::operator==(const DataLayout &Other) const {
 void
 DataLayout::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
                          unsigned pref_align, uint32_t bit_width) {
-  assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
-  assert(pref_align < (1 << 16) && "Alignment doesn't fit in bitfield");
-  assert(bit_width < (1 << 24) && "Bit width doesn't fit in bitfield");
+  if (!isUInt<24>(bit_width))
+    report_fatal_error("Invalid bit width, must be a 24bit integer");
+  if (!isUInt<16>(abi_align))
+    report_fatal_error("Invalid ABI alignment, must be a 16bit integer");
+  if (!isUInt<16>(pref_align))
+    report_fatal_error("Invalid preferred alignment, must be a 16bit integer");
+
+  if (pref_align < abi_align)
+    report_fatal_error(
+        "Preferred alignment cannot be less than the ABI alignment");
+
   for (LayoutAlignElem &Elem : Alignments) {
     if (Elem.AlignType == (unsigned)align_type &&
         Elem.TypeBitWidth == bit_width) {
@@ -397,7 +424,10 @@ DataLayout::findPointerLowerBound(uint32_t AddressSpace) {
 void DataLayout::setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign,
                                      unsigned PrefAlign,
                                      uint32_t TypeByteWidth) {
-  assert(ABIAlign <= PrefAlign && "Preferred alignment worse than ABI!");
+  if (PrefAlign < ABIAlign)
+    report_fatal_error(
+        "Preferred alignment cannot be less than the ABI alignment");
+
   PointersTy::iterator I = findPointerLowerBound(AddrSpace);
   if (I == Pointers.end() || I->AddressSpace != AddrSpace) {
     Pointers.insert(I, PointerAlignElem::get(AddrSpace, ABIAlign, PrefAlign,
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index bb5161d..6590661 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DIBuilder.h"
@@ -36,6 +37,48 @@ using namespace llvm::dwarf;
 // DIDescriptor
 //===----------------------------------------------------------------------===//
 
+unsigned DIDescriptor::getFlag(StringRef Flag) {
+  return StringSwitch<unsigned>(Flag)
+#define HANDLE_DI_FLAG(ID, NAME) .Case("DIFlag" #NAME, Flag##NAME)
+#include "llvm/IR/DebugInfoFlags.def"
+      .Default(0);
+}
+
+const char *DIDescriptor::getFlagString(unsigned Flag) {
+  switch (Flag) {
+  default:
+    return "";
+#define HANDLE_DI_FLAG(ID, NAME)                                               \
+  case Flag##NAME:                                                             \
+    return "DIFlag" #NAME;
+#include "llvm/IR/DebugInfoFlags.def"
+  }
+}
+
+unsigned DIDescriptor::splitFlags(unsigned Flags,
+                                  SmallVectorImpl<unsigned> &SplitFlags) {
+  // Accessibility flags need to be specially handled, since they're packed
+  // together.
+  if (unsigned A = Flags & FlagAccessibility) {
+    if (A == FlagPrivate)
+      SplitFlags.push_back(FlagPrivate);
+    else if (A == FlagProtected)
+      SplitFlags.push_back(FlagProtected);
+    else
+      SplitFlags.push_back(FlagPublic);
+    Flags &= ~A;
+  }
+
+#define HANDLE_DI_FLAG(ID, NAME)                                               \
+  if (unsigned Bit = Flags & ID) {                                             \
+    SplitFlags.push_back(Bit);                                                 \
+    Flags &= ~Bit;                                                             \
+  }
+#include "llvm/IR/DebugInfoFlags.def"
+
+  return Flags;
+}
+
 bool DIDescriptor::Verify() const {
   return DbgNode &&
          (DIDerivedType(DbgNode).Verify() ||
@@ -52,7 +95,7 @@ bool DIDescriptor::Verify() const {
           DIImportedEntity(DbgNode).Verify() || DIExpression(DbgNode).Verify());
 }
 
-static Value *getField(const MDNode *DbgNode, unsigned Elt) {
+static Metadata *getField(const MDNode *DbgNode, unsigned Elt) {
   if (!DbgNode || Elt >= DbgNode->getNumOperands())
     return nullptr;
   return DbgNode->getOperand(Elt);
@@ -73,25 +116,17 @@ StringRef DIDescriptor::getStringField(unsigned Elt) const {
 }
 
 uint64_t DIDescriptor::getUInt64Field(unsigned Elt) const {
-  if (!DbgNode)
-    return 0;
-
-  if (Elt < DbgNode->getNumOperands())
-    if (ConstantInt *CI =
-            dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
+  if (auto *C = getConstantField(Elt))
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(C))
       return CI->getZExtValue();
 
   return 0;
 }
 
 int64_t DIDescriptor::getInt64Field(unsigned Elt) const {
-  if (!DbgNode)
-    return 0;
-
-  if (Elt < DbgNode->getNumOperands())
-    if (ConstantInt *CI =
-            dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
-      return CI->getSExtValue();
+  if (auto *C = getConstantField(Elt))
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(C))
+      return CI->getZExtValue();
 
   return 0;
 }
@@ -102,12 +137,7 @@ DIDescriptor DIDescriptor::getDescriptorField(unsigned Elt) const {
 }
 
 GlobalVariable *DIDescriptor::getGlobalVariableField(unsigned Elt) const {
-  if (!DbgNode)
-    return nullptr;
-
-  if (Elt < DbgNode->getNumOperands())
-    return dyn_cast_or_null<GlobalVariable>(DbgNode->getOperand(Elt));
-  return nullptr;
+  return dyn_cast_or_null<GlobalVariable>(getConstantField(Elt));
 }
 
 Constant *DIDescriptor::getConstantField(unsigned Elt) const {
@@ -115,17 +145,14 @@ Constant *DIDescriptor::getConstantField(unsigned Elt) const {
     return nullptr;
 
   if (Elt < DbgNode->getNumOperands())
-    return dyn_cast_or_null<Constant>(DbgNode->getOperand(Elt));
+    if (auto *C =
+            dyn_cast_or_null<ConstantAsMetadata>(DbgNode->getOperand(Elt)))
+      return C->getValue();
   return nullptr;
 }
 
 Function *DIDescriptor::getFunctionField(unsigned Elt) const {
-  if (!DbgNode)
-    return nullptr;
-
-  if (Elt < DbgNode->getNumOperands())
-    return dyn_cast_or_null<Function>(DbgNode->getOperand(Elt));
-  return nullptr;
+  return dyn_cast_or_null<Function>(getConstantField(Elt));
 }
 
 void DIDescriptor::replaceFunctionField(unsigned Elt, Function *F) {
@@ -134,7 +161,7 @@ void DIDescriptor::replaceFunctionField(unsigned Elt, Function *F) {
 
   if (Elt < DbgNode->getNumOperands()) {
     MDNode *Node = const_cast<MDNode *>(DbgNode);
-    Node->replaceOperandWith(Elt, F);
+    Node->replaceOperandWith(Elt, F ? ConstantAsMetadata::get(F) : nullptr);
   }
 }
 
@@ -163,18 +190,32 @@ uint64_t DIExpression::getElement(unsigned Idx) const {
   return getHeaderFieldAs<int64_t>(I);
 }
 
-bool DIExpression::isVariablePiece() const {
-  return getNumElements() && getElement(0) == dwarf::DW_OP_piece;
+bool DIExpression::isBitPiece() const {
+  unsigned N = getNumElements();
+  return N >=3 && getElement(N-3) == dwarf::DW_OP_bit_piece;
+}
+
+uint64_t DIExpression::getBitPieceOffset() const {
+  assert(isBitPiece() && "not a piece");
+  return getElement(getNumElements()-2);
+}
+
+uint64_t DIExpression::getBitPieceSize() const {
+  assert(isBitPiece() && "not a piece");
+  return getElement(getNumElements()-1);
 }
 
-uint64_t DIExpression::getPieceOffset() const {
-  assert(isVariablePiece());
-  return getElement(1);
+DIExpression::iterator DIExpression::begin() const {
+ return DIExpression::iterator(*this);
 }
 
-uint64_t DIExpression::getPieceSize() const {
-  assert(isVariablePiece());
-  return getElement(2);
+DIExpression::iterator DIExpression::end() const {
+ return DIExpression::iterator();
+}
+
+DIExpression::Operand DIExpression::Operand::getNext() const {
+  iterator it(I);
+  return *(++it);
 }
 
 //===----------------------------------------------------------------------===//
@@ -182,7 +223,7 @@ uint64_t DIExpression::getPieceSize() const {
 //===----------------------------------------------------------------------===//
 
 bool DIDescriptor::isSubroutineType() const {
-  return isCompositeType() && getTag() == dwarf::DW_TAG_subroutine_type;
+  return DbgNode && getTag() == dwarf::DW_TAG_subroutine_type;
 }
 
 bool DIDescriptor::isBasicType() const {
@@ -256,8 +297,7 @@ bool DIDescriptor::isSubprogram() const {
 }
 
 bool DIDescriptor::isGlobalVariable() const {
-  return DbgNode && (getTag() == dwarf::DW_TAG_variable ||
-                     getTag() == dwarf::DW_TAG_constant);
+  return DbgNode && getTag() == dwarf::DW_TAG_variable;
 }
 
 bool DIDescriptor::isScope() const {
@@ -347,27 +387,23 @@ void DIDescriptor::replaceAllUsesWith(LLVMContext &VMContext, DIDescriptor D) {
   // itself.
   const MDNode *DN = D;
   if (DbgNode == DN) {
-    SmallVector<Value*, 10> Ops(DbgNode->getNumOperands());
-    for (size_t i = 0; i != Ops.size(); ++i)
-      Ops[i] = DbgNode->getOperand(i);
+    SmallVector<Metadata *, 10> Ops(DbgNode->op_begin(), DbgNode->op_end());
     DN = MDNode::get(VMContext, Ops);
   }
 
-  MDNode *Node = const_cast<MDNode *>(DbgNode);
-  const Value *V = cast_or_null<Value>(DN);
-  Node->replaceAllUsesWith(const_cast<Value *>(V));
+  assert(DbgNode->isTemporary() && "Expected temporary node");
+  auto *Node = const_cast<MDNode *>(DbgNode);
+  Node->replaceAllUsesWith(const_cast<MDNode *>(DN));
   MDNode::deleteTemporary(Node);
   DbgNode = DN;
 }
 
 void DIDescriptor::replaceAllUsesWith(MDNode *D) {
-
   assert(DbgNode && "Trying to replace an unverified type!");
   assert(DbgNode != D && "This replacement should always happen");
-  MDNode *Node = const_cast<MDNode *>(DbgNode);
-  const MDNode *DN = D;
-  const Value *V = cast_or_null<Value>(DN);
-  Node->replaceAllUsesWith(const_cast<Value *>(V));
+  assert(DbgNode->isTemporary() && "Expected temporary node");
+  auto *Node = const_cast<MDNode *>(DbgNode);
+  Node->replaceAllUsesWith(D);
   MDNode::deleteTemporary(Node);
 }
 
@@ -392,21 +428,14 @@ bool DIObjCProperty::Verify() const {
 }
 
 /// \brief Check if a field at position Elt of a MDNode is a MDNode.
-///
-/// We currently allow an empty string and an integer.
-/// But we don't allow a non-empty string in a MDNode field.
 static bool fieldIsMDNode(const MDNode *DbgNode, unsigned Elt) {
-  // FIXME: This function should return true, if the field is null or the field
-  // is indeed a MDNode: return !Fld || isa<MDNode>(Fld).
-  Value *Fld = getField(DbgNode, Elt);
-  if (Fld && isa<MDString>(Fld) && !cast<MDString>(Fld)->getString().empty())
-    return false;
-  return true;
+  Metadata *Fld = getField(DbgNode, Elt);
+  return !Fld || isa<MDNode>(Fld);
 }
 
 /// \brief Check if a field at position Elt of a MDNode is a MDString.
 static bool fieldIsMDString(const MDNode *DbgNode, unsigned Elt) {
-  Value *Fld = getField(DbgNode, Elt);
+  Metadata *Fld = getField(DbgNode, Elt);
   return !Fld || isa<MDString>(Fld);
 }
 
@@ -432,7 +461,9 @@ static bool isScopeRef(const Metadata *MD) {
     return true;
   if (auto *S = dyn_cast<MDString>(MD))
     return !S->getString().empty();
-  return isa<MDNode>(MD);
+  if (auto *N = dyn_cast<MDNode>(MD))
+    return DIScope(N).isScope();
+  return false;
 }
 
 /// \brief Check if a field at position Elt of a MDNode can be a ScopeRef.
@@ -440,6 +471,17 @@ static bool fieldIsScopeRef(const MDNode *DbgNode, unsigned Elt) {
   return isScopeRef(dyn_cast_or_null<Metadata>(getField(DbgNode, Elt)));
 }
 
+#ifndef NDEBUG
+/// \brief Check if a value can be a DescriptorRef.
+static bool isDescriptorRef(const Metadata *MD) {
+  if (!MD)
+    return true;
+  if (auto *S = dyn_cast<MDString>(MD))
+    return !S->getString().empty();
+  return isa<MDNode>(MD);
+}
+#endif
+
 bool DIType::Verify() const {
   if (!isType())
     return false;
@@ -533,7 +575,6 @@ bool DISubprogram::Verify() const {
   // If a DISubprogram has an llvm::Function*, then scope chains from all
   // instructions within the function should lead to this DISubprogram.
   if (auto *F = getFunction()) {
-    LLVMContext &Ctxt = F->getContext();
     for (auto &BB : *F) {
       for (auto &I : BB) {
         DebugLoc DL = I.getDebugLoc();
@@ -543,15 +584,19 @@ bool DISubprogram::Verify() const {
         MDNode *Scope = nullptr;
         MDNode *IA = nullptr;
         // walk the inlined-at scopes
-        while (DL.getScopeAndInlinedAt(Scope, IA, F->getContext()), IA)
+        while ((IA = DL.getInlinedAt()))
           DL = DebugLoc::getFromDILocation(IA);
-        DL.getScopeAndInlinedAt(Scope, IA, Ctxt);
+        DL.getScopeAndInlinedAt(Scope, IA);
+        if (!Scope)
+          return false;
         assert(!IA);
         while (!DIDescriptor(Scope).isSubprogram()) {
           DILexicalBlockFile D(Scope);
           Scope = D.isLexicalBlockFile()
                       ? D.getScope()
-                      : DebugLoc::getFromDILexicalBlock(Scope).getScope(Ctxt);
+                      : DebugLoc::getFromDILexicalBlock(Scope).getScope();
+          if (!Scope)
+            return false;
         }
         if (!DISubprogram(Scope).describes(F))
           return false;
@@ -567,8 +612,8 @@ bool DIGlobalVariable::Verify() const {
 
   if (getDisplayName().empty())
     return false;
-  // Make sure context @ field 1 is a ScopeRef.
-  if (!fieldIsScopeRef(DbgNode, 1))
+  // Make sure context @ field 1 is an MDNode.
+  if (!fieldIsMDNode(DbgNode, 1))
     return false;
   // Make sure that type @ field 3 is a DITypeRef.
   if (!fieldIsTypeRef(DbgNode, 3))
@@ -609,14 +654,29 @@ bool DIExpression::Verify() const {
   if (!DbgNode)
     return true;
 
-  return isExpression() && DbgNode->getNumOperands() == 1;
+  if (!(isExpression() && DbgNode->getNumOperands() == 1))
+    return false;
+
+  for (auto Op : *this)
+    switch (Op) {
+    case DW_OP_bit_piece:
+      // Must be the last element of the expression.
+      return std::distance(Op.getBase(), DIHeaderFieldIterator()) == 3;
+    case DW_OP_plus:
+      if (std::distance(Op.getBase(), DIHeaderFieldIterator()) < 2)
+        return false;
+      break;
+    case DW_OP_deref:
+      break;
+    default:
+      // Other operators are not yet supported by the backend.
+      return false;
+    }
+  return true;
 }
 
 bool DILocation::Verify() const {
-  if (!DbgNode)
-    return false;
-
-  return DbgNode->getNumOperands() == 4;
+  return DbgNode && isa<MDLocation>(DbgNode);
 }
 
 bool DINameSpace::Verify() const {
@@ -678,19 +738,19 @@ MDString *DICompositeType::getIdentifier() const {
 static void VerifySubsetOf(const MDNode *LHS, const MDNode *RHS) {
   for (unsigned i = 0; i != LHS->getNumOperands(); ++i) {
     // Skip the 'empty' list (that's a single i32 0, rather than truly empty).
-    if (i == 0 && isa<ConstantInt>(LHS->getOperand(i)))
+    if (i == 0 && mdconst::hasa<ConstantInt>(LHS->getOperand(i)))
       continue;
     const MDNode *E = cast<MDNode>(LHS->getOperand(i));
     bool found = false;
     for (unsigned j = 0; !found && j != RHS->getNumOperands(); ++j)
-      found = E == RHS->getOperand(j);
+      found = (E == cast<MDNode>(RHS->getOperand(j)));
     assert(found && "Losing a member during member list replacement");
   }
 }
 #endif
 
 void DICompositeType::setArraysHelper(MDNode *Elements, MDNode *TParams) {
-  TrackingVH<MDNode> N(*this);
+  TrackingMDNodeRef N(*this);
   if (Elements) {
 #ifndef NDEBUG
     // Check that the new list of members contains all the old members as well.
@@ -714,7 +774,7 @@ DIScopeRef DIScope::getRef() const {
 }
 
 void DICompositeType::setContainingType(DICompositeType ContainingType) {
-  TrackingVH<MDNode> N(*this);
+  TrackingMDNodeRef N(*this);
   N->replaceOperandWith(5, ContainingType.getRef());
   DbgNode = N;
 }
@@ -748,8 +808,8 @@ DIArray DISubprogram::getVariables() const {
   return DIArray(getNodeField(DbgNode, 8));
 }
 
-Value *DITemplateValueParameter::getValue() const {
-  return getField(DbgNode, 3);
+Metadata *DITemplateValueParameter::getValue() const {
+  return DbgNode->getOperand(3);
 }
 
 DIScopeRef DIScope::getContext() const {
@@ -851,16 +911,12 @@ void DICompileUnit::replaceGlobalVariables(DIArray GlobalVariables) {
 
 DILocation DILocation::copyWithNewScope(LLVMContext &Ctx,
                                         DILexicalBlockFile NewScope) {
-  SmallVector<Value *, 10> Elts;
   assert(Verify());
-  for (unsigned I = 0; I < DbgNode->getNumOperands(); ++I) {
-    if (I != 2)
-      Elts.push_back(DbgNode->getOperand(I));
-    else
-      Elts.push_back(NewScope);
-  }
-  MDNode *NewDIL = MDNode::get(Ctx, Elts);
-  return DILocation(NewDIL);
+  assert(NewScope && "Expected valid scope");
+
+  const auto *Old = cast<MDLocation>(DbgNode);
+  return DILocation(MDLocation::get(Ctx, Old->getLine(), Old->getColumn(),
+                                    NewScope, Old->getInlinedAt()));
 }
 
 unsigned DILocation::computeNewDiscriminator(LLVMContext &Ctx) {
@@ -875,9 +931,8 @@ DIVariable llvm::createInlinedVariable(MDNode *DV, MDNode *InlinedScope,
     return cleanseInlinedVariable(DV, VMContext);
 
   // Insert inlined scope.
-  SmallVector<Value *, 8> Elts;
-  for (unsigned I = 0, E = DIVariableInlinedAtIndex; I != E; ++I)
-    Elts.push_back(DV->getOperand(I));
+  SmallVector<Metadata *, 8> Elts(DV->op_begin(),
+                                  DV->op_begin() + DIVariableInlinedAtIndex);
   Elts.push_back(InlinedScope);
 
   DIVariable Inlined(MDNode::get(VMContext, Elts));
@@ -891,9 +946,8 @@ DIVariable llvm::cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext) {
     return DIVariable(DV);
 
   // Remove inlined scope.
-  SmallVector<Value *, 8> Elts;
-  for (unsigned I = 0, E = DIVariableInlinedAtIndex; I != E; ++I)
-    Elts.push_back(DV->getOperand(I));
+  SmallVector<Metadata *, 8> Elts(DV->op_begin(),
+                                  DV->op_begin() + DIVariableInlinedAtIndex);
 
   DIVariable Cleansed(MDNode::get(VMContext, Elts));
   assert(Cleansed.Verify() && "Expected to create a DIVariable");
@@ -923,7 +977,7 @@ DISubprogram llvm::getDISubprogram(const Function *F) {
     if (Inst == BB.end())
       continue;
     DebugLoc DLoc = Inst->getDebugLoc();
-    const MDNode *Scope = DLoc.getScopeNode(F->getParent()->getContext());
+    const MDNode *Scope = DLoc.getScopeNode();
     DISubprogram Subprogram = getDISubprogram(Scope);
     return Subprogram.describes(F) ? Subprogram : DISubprogram();
   }
@@ -1005,7 +1059,7 @@ void DebugInfoFinder::processModule(const Module &M) {
       for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) {
         DIGlobalVariable DIG(GVs.getElement(i));
         if (addGlobalVariable(DIG)) {
-          processScope(DIG.getContext().resolve(TypeIdentifierMap));
+          processScope(DIG.getContext());
           processType(DIG.getType().resolve(TypeIdentifierMap));
         }
       }
@@ -1106,11 +1160,9 @@ void DebugInfoFinder::processSubprogram(DISubprogram SP) {
     DIDescriptor Element = TParams.getElement(I);
     if (Element.isTemplateTypeParameter()) {
       DITemplateTypeParameter TType(Element);
-      processScope(TType.getContext().resolve(TypeIdentifierMap));
       processType(TType.getType().resolve(TypeIdentifierMap));
     } else if (Element.isTemplateValueParameter()) {
       DITemplateValueParameter TVal(Element);
-      processScope(TVal.getContext().resolve(TypeIdentifierMap));
       processType(TVal.getType().resolve(TypeIdentifierMap));
     }
   }
@@ -1401,24 +1453,22 @@ void DIVariable::printInternal(raw_ostream &OS) const {
 }
 
 void DIExpression::printInternal(raw_ostream &OS) const {
-  for (unsigned I = 0; I < getNumElements(); ++I) {
-    uint64_t OpCode = getElement(I);
-    OS << " [" << OperationEncodingString(OpCode);
-    switch (OpCode) {
+  for (auto Op : *this) {
+    OS << " [" << OperationEncodingString(Op);
+    switch (Op) {
     case DW_OP_plus: {
-      OS << " " << getElement(++I);
+      OS << " " << Op.getArg(1);
       break;
     }
-    case DW_OP_piece: {
-      unsigned Offset = getElement(++I);
-      unsigned Size = getElement(++I);
-      OS << " offset=" << Offset << ", size=" << Size;
+    case DW_OP_bit_piece: {
+      OS << " offset=" << Op.getArg(1) << ", size=" << Op.getArg(2);
       break;
     }
+    case DW_OP_deref:
+      // No arguments.
+      break;
     default:
-      // Else bail out early. This may be a line table entry.
-      OS << "Unknown]";
-      return;
+      llvm_unreachable("unhandled operation");
     }
     OS << "]";
   }
@@ -1467,6 +1517,10 @@ void DIVariable::printExtendedName(raw_ostream &OS) const {
   }
 }
 
+template <> DIRef<DIDescriptor>::DIRef(const Metadata *V) : Val(V) {
+  assert(isDescriptorRef(V) &&
+         "DIDescriptorRef should be a MDString or MDNode");
+}
 template <> DIRef<DIScope>::DIRef(const Metadata *V) : Val(V) {
   assert(isScopeRef(V) && "DIScopeRef should be a MDString or MDNode");
 }
@@ -1475,6 +1529,10 @@ template <> DIRef<DIType>::DIRef(const Metadata *V) : Val(V) {
 }
 
 template <>
+DIDescriptorRef DIDescriptor::getFieldAs<DIDescriptorRef>(unsigned Elt) const {
+  return DIDescriptorRef(cast_or_null<Metadata>(getField(DbgNode, Elt)));
+}
+template <>
 DIScopeRef DIDescriptor::getFieldAs<DIScopeRef>(unsigned Elt) const {
   return DIScopeRef(cast_or_null<Metadata>(getField(DbgNode, Elt)));
 }
@@ -1530,10 +1588,10 @@ bool llvm::StripDebugInfo(Module &M) {
 }
 
 unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) {
-  Value *Val = M.getModuleFlag("Debug Info Version");
-  if (!Val)
-    return 0;
-  return cast<ConstantInt>(Val)->getZExtValue();
+  if (auto *Val = mdconst::dyn_extract_or_null<ConstantInt>(
+          M.getModuleFlag("Debug Info Version")))
+    return Val->getZExtValue();
+  return 0;
 }
 
 llvm::DenseMap<const llvm::Function *, llvm::DISubprogram>
diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp
new file mode 100644
index 0000000..89ec1bc
--- /dev/null
+++ b/lib/IR/DebugInfoMetadata.cpp
@@ -0,0 +1,418 @@
+//===- DebugInfoMetadata.cpp - Implement debug info metadata --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the debug info Metadata classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "LLVMContextImpl.h"
+#include "MetadataImpl.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+MDLocation::MDLocation(LLVMContext &C, StorageType Storage, unsigned Line,
+                       unsigned Column, ArrayRef<Metadata *> MDs)
+    : MDNode(C, MDLocationKind, Storage, MDs) {
+  assert((MDs.size() == 1 || MDs.size() == 2) &&
+         "Expected a scope and optional inlined-at");
+
+  // Set line and column.
+  assert(Column < (1u << 16) && "Expected 16-bit column");
+
+  SubclassData32 = Line;
+  SubclassData16 = Column;
+}
+
+static void adjustColumn(unsigned &Column) {
+  // Set to unknown on overflow.  We only have 16 bits to play with here.
+  if (Column >= (1u << 16))
+    Column = 0;
+}
+
+MDLocation *MDLocation::getImpl(LLVMContext &Context, unsigned Line,
+                                unsigned Column, Metadata *Scope,
+                                Metadata *InlinedAt, StorageType Storage,
+                                bool ShouldCreate) {
+  // Fixup column.
+  adjustColumn(Column);
+
+  if (Storage == Uniqued) {
+    if (auto *N =
+            getUniqued(Context.pImpl->MDLocations,
+                       MDLocationInfo::KeyTy(Line, Column, Scope, InlinedAt)))
+      return N;
+    if (!ShouldCreate)
+      return nullptr;
+  } else {
+    assert(ShouldCreate && "Expected non-uniqued nodes to always be created");
+  }
+
+  SmallVector<Metadata *, 2> Ops;
+  Ops.push_back(Scope);
+  if (InlinedAt)
+    Ops.push_back(InlinedAt);
+  return storeImpl(new (Ops.size())
+                       MDLocation(Context, Storage, Line, Column, Ops),
+                   Storage, Context.pImpl->MDLocations);
+}
+
+static StringRef getString(const MDString *S) {
+  if (S)
+    return S->getString();
+  return StringRef();
+}
+
+#ifndef NDEBUG
+static bool isCanonical(const MDString *S) {
+  return !S || !S->getString().empty();
+}
+#endif
+
+GenericDebugNode *GenericDebugNode::getImpl(LLVMContext &Context, unsigned Tag,
+                                            MDString *Header,
+                                            ArrayRef<Metadata *> DwarfOps,
+                                            StorageType Storage,
+                                            bool ShouldCreate) {
+  unsigned Hash = 0;
+  if (Storage == Uniqued) {
+    GenericDebugNodeInfo::KeyTy Key(Tag, getString(Header), DwarfOps);
+    if (auto *N = getUniqued(Context.pImpl->GenericDebugNodes, Key))
+      return N;
+    if (!ShouldCreate)
+      return nullptr;
+    Hash = Key.getHash();
+  } else {
+    assert(ShouldCreate && "Expected non-uniqued nodes to always be created");
+  }
+
+  // Use a nullptr for empty headers.
+  assert(isCanonical(Header) && "Expected canonical MDString");
+  Metadata *PreOps[] = {Header};
+  return storeImpl(new (DwarfOps.size() + 1) GenericDebugNode(
+                       Context, Storage, Hash, Tag, PreOps, DwarfOps),
+                   Storage, Context.pImpl->GenericDebugNodes);
+}
+
+void GenericDebugNode::recalculateHash() {
+  setHash(GenericDebugNodeInfo::KeyTy::calculateHash(this));
+}
+
+#define UNWRAP_ARGS_IMPL(...) __VA_ARGS__
+#define UNWRAP_ARGS(ARGS) UNWRAP_ARGS_IMPL ARGS
+#define DEFINE_GETIMPL_LOOKUP(CLASS, ARGS)                                     \
+  do {                                                                         \
+    if (Storage == Uniqued) {                                                  \
+      if (auto *N = getUniqued(Context.pImpl->CLASS##s,                        \
+                               CLASS##Info::KeyTy(UNWRAP_ARGS(ARGS))))         \
+        return N;                                                              \
+      if (!ShouldCreate)                                                       \
+        return nullptr;                                                        \
+    } else {                                                                   \
+      assert(ShouldCreate &&                                                   \
+             "Expected non-uniqued nodes to always be created");               \
+    }                                                                          \
+  } while (false)
+#define DEFINE_GETIMPL_STORE(CLASS, ARGS, OPS)                                 \
+  return storeImpl(new (ArrayRef<Metadata *>(OPS).size())                      \
+                       CLASS(Context, Storage, UNWRAP_ARGS(ARGS), OPS),        \
+                   Storage, Context.pImpl->CLASS##s)
+#define DEFINE_GETIMPL_STORE_NO_OPS(CLASS, ARGS)                               \
+  return storeImpl(new (0u) CLASS(Context, Storage, UNWRAP_ARGS(ARGS)),        \
+                   Storage, Context.pImpl->CLASS##s)
+#define DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(CLASS, OPS)                   \
+  return storeImpl(new (ArrayRef<Metadata *>(OPS).size())                      \
+                       CLASS(Context, Storage, OPS),                           \
+                   Storage, Context.pImpl->CLASS##s)
+
+MDSubrange *MDSubrange::getImpl(LLVMContext &Context, int64_t Count, int64_t Lo,
+                                StorageType Storage, bool ShouldCreate) {
+  DEFINE_GETIMPL_LOOKUP(MDSubrange, (Count, Lo));
+  DEFINE_GETIMPL_STORE_NO_OPS(MDSubrange, (Count, Lo));
+}
+
+MDEnumerator *MDEnumerator::getImpl(LLVMContext &Context, int64_t Value,
+                                    MDString *Name, StorageType Storage,
+                                    bool ShouldCreate) {
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDEnumerator, (Value, getString(Name)));
+  Metadata *Ops[] = {Name};
+  DEFINE_GETIMPL_STORE(MDEnumerator, (Value), Ops);
+}
+
+MDBasicType *MDBasicType::getImpl(LLVMContext &Context, unsigned Tag,
+                                  MDString *Name, uint64_t SizeInBits,
+                                  uint64_t AlignInBits, unsigned Encoding,
+                                  StorageType Storage, bool ShouldCreate) {
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(
+      MDBasicType, (Tag, getString(Name), SizeInBits, AlignInBits, Encoding));
+  Metadata *Ops[] = {nullptr, nullptr, Name};
+  DEFINE_GETIMPL_STORE(MDBasicType, (Tag, SizeInBits, AlignInBits, Encoding),
+                       Ops);
+}
+
+MDDerivedType *MDDerivedType::getImpl(
+    LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
+    unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
+    uint64_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
+    Metadata *ExtraData, StorageType Storage, bool ShouldCreate) {
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDDerivedType, (Tag, getString(Name), File, Line, Scope,
+                                        BaseType, SizeInBits, AlignInBits,
+                                        OffsetInBits, Flags, ExtraData));
+  Metadata *Ops[] = {File, Scope, Name, BaseType, ExtraData};
+  DEFINE_GETIMPL_STORE(
+      MDDerivedType, (Tag, Line, SizeInBits, AlignInBits, OffsetInBits, Flags),
+      Ops);
+}
+
+MDCompositeType *MDCompositeType::getImpl(
+    LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
+    unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
+    uint64_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
+    Metadata *Elements, unsigned RuntimeLang, Metadata *VTableHolder,
+    Metadata *TemplateParams, MDString *Identifier, StorageType Storage,
+    bool ShouldCreate) {
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDCompositeType,
+                        (Tag, getString(Name), File, Line, Scope, BaseType,
+                         SizeInBits, AlignInBits, OffsetInBits, Flags, Elements,
+                         RuntimeLang, VTableHolder, TemplateParams,
+                         getString(Identifier)));
+  Metadata *Ops[] = {File,     Scope,        Name,           BaseType,
+                     Elements, VTableHolder, TemplateParams, Identifier};
+  DEFINE_GETIMPL_STORE(MDCompositeType, (Tag, Line, RuntimeLang, SizeInBits,
+                                         AlignInBits, OffsetInBits, Flags),
+                       Ops);
+}
+
+MDSubroutineType *MDSubroutineType::getImpl(LLVMContext &Context,
+                                            unsigned Flags, Metadata *TypeArray,
+                                            StorageType Storage,
+                                            bool ShouldCreate) {
+  DEFINE_GETIMPL_LOOKUP(MDSubroutineType, (Flags, TypeArray));
+  Metadata *Ops[] = {nullptr,   nullptr, nullptr, nullptr,
+                     TypeArray, nullptr, nullptr, nullptr};
+  DEFINE_GETIMPL_STORE(MDSubroutineType, (Flags), Ops);
+}
+
+MDFile *MDFile::getImpl(LLVMContext &Context, MDString *Filename,
+                        MDString *Directory, StorageType Storage,
+                        bool ShouldCreate) {
+  assert(isCanonical(Filename) && "Expected canonical MDString");
+  assert(isCanonical(Directory) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDFile, (getString(Filename), getString(Directory)));
+  Metadata *Ops[] = {Filename, Directory};
+  DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(MDFile, Ops);
+}
+
+MDCompileUnit *MDCompileUnit::getImpl(
+    LLVMContext &Context, unsigned SourceLanguage, Metadata *File,
+    MDString *Producer, bool IsOptimized, MDString *Flags,
+    unsigned RuntimeVersion, MDString *SplitDebugFilename,
+    unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
+    Metadata *Subprograms, Metadata *GlobalVariables,
+    Metadata *ImportedEntities, StorageType Storage, bool ShouldCreate) {
+  assert(isCanonical(Producer) && "Expected canonical MDString");
+  assert(isCanonical(Flags) && "Expected canonical MDString");
+  assert(isCanonical(SplitDebugFilename) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(
+      MDCompileUnit,
+      (SourceLanguage, File, getString(Producer), IsOptimized, getString(Flags),
+       RuntimeVersion, getString(SplitDebugFilename), EmissionKind, EnumTypes,
+       RetainedTypes, Subprograms, GlobalVariables, ImportedEntities));
+  Metadata *Ops[] = {File, Producer, Flags, SplitDebugFilename, EnumTypes,
+                     RetainedTypes, Subprograms, GlobalVariables,
+                     ImportedEntities};
+  DEFINE_GETIMPL_STORE(
+      MDCompileUnit,
+      (SourceLanguage, IsOptimized, RuntimeVersion, EmissionKind), Ops);
+}
+
+MDSubprogram *MDSubprogram::getImpl(
+    LLVMContext &Context, Metadata *Scope, MDString *Name,
+    MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type,
+    bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
+    Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex,
+    unsigned Flags, bool IsOptimized, Metadata *Function,
+    Metadata *TemplateParams, Metadata *Declaration, Metadata *Variables,
+    StorageType Storage, bool ShouldCreate) {
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  assert(isCanonical(LinkageName) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDSubprogram,
+                        (Scope, getString(Name), getString(LinkageName), File,
+                         Line, Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                         ContainingType, Virtuality, VirtualIndex, Flags,
+                         IsOptimized, Function, TemplateParams, Declaration,
+                         Variables));
+  Metadata *Ops[] = {File,           Scope,       Name,           Name,
+                     LinkageName,    Type,        ContainingType, Function,
+                     TemplateParams, Declaration, Variables};
+  DEFINE_GETIMPL_STORE(MDSubprogram,
+                       (Line, ScopeLine, Virtuality, VirtualIndex, Flags,
+                        IsLocalToUnit, IsDefinition, IsOptimized),
+                       Ops);
+}
+
+void MDSubprogram::replaceFunction(Function *F) {
+  replaceFunction(F ? ConstantAsMetadata::get(F)
+                    : static_cast<ConstantAsMetadata *>(nullptr));
+}
+
+MDLexicalBlock *MDLexicalBlock::getImpl(LLVMContext &Context, Metadata *Scope,
+                                        Metadata *File, unsigned Line,
+                                        unsigned Column, StorageType Storage,
+                                        bool ShouldCreate) {
+  DEFINE_GETIMPL_LOOKUP(MDLexicalBlock, (Scope, File, Line, Column));
+  Metadata *Ops[] = {File, Scope};
+  DEFINE_GETIMPL_STORE(MDLexicalBlock, (Line, Column), Ops);
+}
+
+MDLexicalBlockFile *MDLexicalBlockFile::getImpl(LLVMContext &Context,
+                                                Metadata *Scope, Metadata *File,
+                                                unsigned Discriminator,
+                                                StorageType Storage,
+                                                bool ShouldCreate) {
+  DEFINE_GETIMPL_LOOKUP(MDLexicalBlockFile, (Scope, File, Discriminator));
+  Metadata *Ops[] = {File, Scope};
+  DEFINE_GETIMPL_STORE(MDLexicalBlockFile, (Discriminator), Ops);
+}
+
+MDNamespace *MDNamespace::getImpl(LLVMContext &Context, Metadata *Scope,
+                                  Metadata *File, MDString *Name, unsigned Line,
+                                  StorageType Storage, bool ShouldCreate) {
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDNamespace, (Scope, File, getString(Name), Line));
+  Metadata *Ops[] = {File, Scope, Name};
+  DEFINE_GETIMPL_STORE(MDNamespace, (Line), Ops);
+}
+
+MDTemplateTypeParameter *MDTemplateTypeParameter::getImpl(LLVMContext &Context,
+                                                          MDString *Name,
+                                                          Metadata *Type,
+                                                          StorageType Storage,
+                                                          bool ShouldCreate) {
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDTemplateTypeParameter, (getString(Name), Type));
+  Metadata *Ops[] = {Name, Type};
+  DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(MDTemplateTypeParameter, Ops);
+}
+
+MDTemplateValueParameter *MDTemplateValueParameter::getImpl(
+    LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *Type,
+    Metadata *Value, StorageType Storage, bool ShouldCreate) {
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDTemplateValueParameter,
+                        (Tag, getString(Name), Type, Value));
+  Metadata *Ops[] = {Name, Type, Value};
+  DEFINE_GETIMPL_STORE(MDTemplateValueParameter, (Tag), Ops);
+}
+
+MDGlobalVariable *
+MDGlobalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
+                          MDString *LinkageName, Metadata *File, unsigned Line,
+                          Metadata *Type, bool IsLocalToUnit, bool IsDefinition,
+                          Metadata *Variable,
+                          Metadata *StaticDataMemberDeclaration,
+                          StorageType Storage, bool ShouldCreate) {
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  assert(isCanonical(LinkageName) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDGlobalVariable,
+                        (Scope, getString(Name), getString(LinkageName), File,
+                         Line, Type, IsLocalToUnit, IsDefinition, Variable,
+                         StaticDataMemberDeclaration));
+  Metadata *Ops[] = {Scope, Name,        File,     Type,
+                     Name,  LinkageName, Variable, StaticDataMemberDeclaration};
+  DEFINE_GETIMPL_STORE(MDGlobalVariable, (Line, IsLocalToUnit, IsDefinition),
+                       Ops);
+}
+
+MDLocalVariable *MDLocalVariable::getImpl(
+    LLVMContext &Context, unsigned Tag, Metadata *Scope, MDString *Name,
+    Metadata *File, unsigned Line, Metadata *Type, unsigned Arg, unsigned Flags,
+    Metadata *InlinedAt, StorageType Storage, bool ShouldCreate) {
+  // Truncate Arg to 8 bits.
+  //
+  // FIXME: This is gross (and should be changed to an assert or removed), but
+  // it matches historical behaviour for now.
+  Arg &= (1u << 8) - 1;
+
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDLocalVariable, (Tag, Scope, getString(Name), File,
+                                          Line, Type, Arg, Flags, InlinedAt));
+  Metadata *Ops[] = {Scope, Name, File, Type, InlinedAt};
+  DEFINE_GETIMPL_STORE(MDLocalVariable, (Tag, Line, Arg, Flags), Ops);
+}
+
+MDExpression *MDExpression::getImpl(LLVMContext &Context,
+                                    ArrayRef<uint64_t> Elements,
+                                    StorageType Storage, bool ShouldCreate) {
+  DEFINE_GETIMPL_LOOKUP(MDExpression, (Elements));
+  DEFINE_GETIMPL_STORE_NO_OPS(MDExpression, (Elements));
+}
+
+unsigned MDExpression::ExprOperand::getSize() const {
+  switch (getOp()) {
+  case dwarf::DW_OP_bit_piece:
+    return 3;
+  case dwarf::DW_OP_plus:
+    return 2;
+  default:
+    return 1;
+  }
+}
+
+bool MDExpression::isValid() const {
+  for (auto I = expr_op_begin(), E = expr_op_end(); I != E; ++I) {
+    // Check that there's space for the operand.
+    if (I->get() + I->getSize() > E->get())
+      return false;
+
+    // Check that the operand is valid.
+    switch (I->getOp()) {
+    default:
+      return false;
+    case dwarf::DW_OP_bit_piece:
+      // Piece expressions must be at the end.
+      return I->get() + I->getSize() == E->get();
+    case dwarf::DW_OP_plus:
+    case dwarf::DW_OP_deref:
+      break;
+    }
+  }
+  return true;
+}
+
+MDObjCProperty *MDObjCProperty::getImpl(
+    LLVMContext &Context, MDString *Name, Metadata *File, unsigned Line,
+    MDString *GetterName, MDString *SetterName, unsigned Attributes,
+    Metadata *Type, StorageType Storage, bool ShouldCreate) {
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  assert(isCanonical(GetterName) && "Expected canonical MDString");
+  assert(isCanonical(SetterName) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDObjCProperty,
+                        (getString(Name), File, Line, getString(GetterName),
+                         getString(SetterName), Attributes, Type));
+  Metadata *Ops[] = {Name, File, GetterName, SetterName, Type};
+  DEFINE_GETIMPL_STORE(MDObjCProperty, (Line, Attributes), Ops);
+}
+
+MDImportedEntity *MDImportedEntity::getImpl(LLVMContext &Context, unsigned Tag,
+                                            Metadata *Scope, Metadata *Entity,
+                                            unsigned Line, MDString *Name,
+                                            StorageType Storage,
+                                            bool ShouldCreate) {
+  assert(isCanonical(Name) && "Expected canonical MDString");
+  DEFINE_GETIMPL_LOOKUP(MDImportedEntity,
+                        (Tag, Scope, Entity, Line, getString(Name)));
+  Metadata *Ops[] = {Scope, Entity, Name};
+  DEFINE_GETIMPL_STORE(MDImportedEntity, (Tag, Line), Ops);
+}
diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp
index 718da85..e1bf795 100644
--- a/lib/IR/DebugLoc.cpp
+++ b/lib/IR/DebugLoc.cpp
@@ -17,67 +17,29 @@ using namespace llvm;
 // DebugLoc Implementation
 //===----------------------------------------------------------------------===//
 
-MDNode *DebugLoc::getScope(const LLVMContext &Ctx) const {
-  if (ScopeIdx == 0) return nullptr;
-  
-  if (ScopeIdx > 0) {
-    // Positive ScopeIdx is an index into ScopeRecords, which has no inlined-at
-    // position specified.
-    assert(unsigned(ScopeIdx) <= Ctx.pImpl->ScopeRecords.size() &&
-           "Invalid ScopeIdx!");
-    return Ctx.pImpl->ScopeRecords[ScopeIdx-1].get();
-  }
-  
-  // Otherwise, the index is in the ScopeInlinedAtRecords array.
-  assert(unsigned(-ScopeIdx) <= Ctx.pImpl->ScopeInlinedAtRecords.size() &&
-         "Invalid ScopeIdx");
-  return Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].first.get();
-}
+unsigned DebugLoc::getLine() const { return DILocation(Loc).getLineNumber(); }
+unsigned DebugLoc::getCol() const { return DILocation(Loc).getColumnNumber(); }
+
+MDNode *DebugLoc::getScope() const { return DILocation(Loc).getScope(); }
 
-MDNode *DebugLoc::getInlinedAt(const LLVMContext &Ctx) const {
-  // Positive ScopeIdx is an index into ScopeRecords, which has no inlined-at
-  // position specified.  Zero is invalid.
-  if (ScopeIdx >= 0) return nullptr;
-  
-  // Otherwise, the index is in the ScopeInlinedAtRecords array.
-  assert(unsigned(-ScopeIdx) <= Ctx.pImpl->ScopeInlinedAtRecords.size() &&
-         "Invalid ScopeIdx");
-  return Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].second.get();
+MDNode *DebugLoc::getInlinedAt() const {
+  return DILocation(Loc).getOrigLocation();
 }
 
 /// Return both the Scope and the InlinedAt values.
-void DebugLoc::getScopeAndInlinedAt(MDNode *&Scope, MDNode *&IA,
-                                    const LLVMContext &Ctx) const {
-  if (ScopeIdx == 0) {
-    Scope = IA = nullptr;
-    return;
-  }
-  
-  if (ScopeIdx > 0) {
-    // Positive ScopeIdx is an index into ScopeRecords, which has no inlined-at
-    // position specified.
-    assert(unsigned(ScopeIdx) <= Ctx.pImpl->ScopeRecords.size() &&
-           "Invalid ScopeIdx!");
-    Scope = Ctx.pImpl->ScopeRecords[ScopeIdx-1].get();
-    IA = nullptr;
-    return;
-  }
-  
-  // Otherwise, the index is in the ScopeInlinedAtRecords array.
-  assert(unsigned(-ScopeIdx) <= Ctx.pImpl->ScopeInlinedAtRecords.size() &&
-         "Invalid ScopeIdx");
-  Scope = Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].first.get();
-  IA    = Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].second.get();
+void DebugLoc::getScopeAndInlinedAt(MDNode *&Scope, MDNode *&IA) const {
+  Scope = getScope();
+  IA = getInlinedAt();
 }
 
-MDNode *DebugLoc::getScopeNode(const LLVMContext &Ctx) const {
-  if (MDNode *InlinedAt = getInlinedAt(Ctx))
-    return DebugLoc::getFromDILocation(InlinedAt).getScopeNode(Ctx);
-  return getScope(Ctx);
+MDNode *DebugLoc::getScopeNode() const {
+  if (MDNode *InlinedAt = getInlinedAt())
+    return DebugLoc::getFromDILocation(InlinedAt).getScopeNode();
+  return getScope();
 }
 
-DebugLoc DebugLoc::getFnDebugLoc(const LLVMContext &Ctx) const {
-  const MDNode *Scope = getScopeNode(Ctx);
+DebugLoc DebugLoc::getFnDebugLoc() const {
+  const MDNode *Scope = getScopeNode();
   DISubprogram SP = getDISubprogram(Scope);
   if (SP.isSubprogram())
     return DebugLoc::get(SP.getScopeLineNumber(), 0, SP);
@@ -87,53 +49,23 @@ DebugLoc DebugLoc::getFnDebugLoc(const LLVMContext &Ctx) const {
 
 DebugLoc DebugLoc::get(unsigned Line, unsigned Col,
                        MDNode *Scope, MDNode *InlinedAt) {
-  DebugLoc Result;
-  
   // If no scope is available, this is an unknown location.
-  if (!Scope) return Result;
+  if (!Scope)
+    return DebugLoc();
 
-  // Saturate line and col to "unknown".
-  if (Col > 255) Col = 0;
-  if (Line >= (1 << 24)) Line = 0;
-  Result.LineCol = Line | (Col << 24);
-  
-  LLVMContext &Ctx = Scope->getContext();
-  
-  // If there is no inlined-at location, use the ScopeRecords array.
-  if (!InlinedAt)
-    Result.ScopeIdx = Ctx.pImpl->getOrAddScopeRecordIdxEntry(Scope, 0);
-  else
-    Result.ScopeIdx = Ctx.pImpl->getOrAddScopeInlinedAtIdxEntry(Scope,
-                                                                InlinedAt, 0);
-
-  return Result;
+  return getFromDILocation(
+      MDLocation::get(Scope->getContext(), Line, Col, Scope, InlinedAt));
 }
 
 /// getAsMDNode - This method converts the compressed DebugLoc node into a
 /// DILocation-compatible MDNode.
-MDNode *DebugLoc::getAsMDNode(const LLVMContext &Ctx) const {
-  if (isUnknown()) return nullptr;
-  
-  MDNode *Scope, *IA;
-  getScopeAndInlinedAt(Scope, IA, Ctx);
-  assert(Scope && "If scope is null, this should be isUnknown()");
-  
-  LLVMContext &Ctx2 = Scope->getContext();
-  Type *Int32 = Type::getInt32Ty(Ctx2);
-  Value *Elts[] = {
-    ConstantInt::get(Int32, getLine()), ConstantInt::get(Int32, getCol()),
-    Scope, IA
-  };
-  return MDNode::get(Ctx2, Elts);
-}
+MDNode *DebugLoc::getAsMDNode() const { return Loc; }
 
 /// getFromDILocation - Translate the DILocation quad into a DebugLoc.
 DebugLoc DebugLoc::getFromDILocation(MDNode *N) {
-  DILocation Loc(N);
-  MDNode *Scope = Loc.getScope();
-  if (!Scope) return DebugLoc();
-  return get(Loc.getLineNumber(), Loc.getColumnNumber(), Scope,
-             Loc.getOrigLocation());
+  DebugLoc Loc;
+  Loc.Loc.reset(N);
+  return Loc;
 }
 
 /// getFromDILexicalBlock - Translate the DILexicalBlock into a DebugLoc.
@@ -145,26 +77,26 @@ DebugLoc DebugLoc::getFromDILexicalBlock(MDNode *N) {
              nullptr);
 }
 
-void DebugLoc::dump(const LLVMContext &Ctx) const {
+void DebugLoc::dump() const {
 #ifndef NDEBUG
   if (!isUnknown()) {
     dbgs() << getLine();
     if (getCol() != 0)
       dbgs() << ',' << getCol();
-    DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(getInlinedAt(Ctx));
+    DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(getInlinedAt());
     if (!InlinedAtDL.isUnknown()) {
       dbgs() << " @ ";
-      InlinedAtDL.dump(Ctx);
+      InlinedAtDL.dump();
     } else
       dbgs() << "\n";
   }
 #endif
 }
 
-void DebugLoc::print(const LLVMContext &Ctx, raw_ostream &OS) const {
+void DebugLoc::print(raw_ostream &OS) const {
   if (!isUnknown()) {
     // Print source line info.
-    DIScope Scope(getScope(Ctx));
+    DIScope Scope(getScope());
     assert((!Scope || Scope.isScope()) &&
            "Scope of a DebugLoc should be null or a DIScope.");
     if (Scope)
@@ -174,179 +106,11 @@ void DebugLoc::print(const LLVMContext &Ctx, raw_ostream &OS) const {
     OS << ':' << getLine();
     if (getCol() != 0)
       OS << ':' << getCol();
-    DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(getInlinedAt(Ctx));
+    DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(getInlinedAt());
     if (!InlinedAtDL.isUnknown()) {
       OS << " @[ ";
-      InlinedAtDL.print(Ctx, OS);
+      InlinedAtDL.print(OS);
       OS << " ]";
     }
   }
 }
-
-//===----------------------------------------------------------------------===//
-// DenseMap specialization
-//===----------------------------------------------------------------------===//
-
-unsigned DenseMapInfo<DebugLoc>::getHashValue(const DebugLoc &Key) {
-  return static_cast<unsigned>(hash_combine(Key.LineCol, Key.ScopeIdx));
-}
-
-//===----------------------------------------------------------------------===//
-// LLVMContextImpl Implementation
-//===----------------------------------------------------------------------===//
-
-int LLVMContextImpl::getOrAddScopeRecordIdxEntry(MDNode *Scope,
-                                                 int ExistingIdx) {
-  // If we already have an entry for this scope, return it.
-  int &Idx = ScopeRecordIdx[Scope];
-  if (Idx) return Idx;
-  
-  // If we don't have an entry, but ExistingIdx is specified, use it.
-  if (ExistingIdx)
-    return Idx = ExistingIdx;
-  
-  // Otherwise add a new entry.
-  
-  // Start out ScopeRecords with a minimal reasonable size to avoid
-  // excessive reallocation starting out.
-  if (ScopeRecords.empty())
-    ScopeRecords.reserve(128);
-  
-  // Index is biased by 1 for index.
-  Idx = ScopeRecords.size()+1;
-  ScopeRecords.push_back(DebugRecVH(Scope, this, Idx));
-  return Idx;
-}
-
-int LLVMContextImpl::getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,
-                                                    int ExistingIdx) {
-  // If we already have an entry, return it.
-  int &Idx = ScopeInlinedAtIdx[std::make_pair(Scope, IA)];
-  if (Idx) return Idx;
-  
-  // If we don't have an entry, but ExistingIdx is specified, use it.
-  if (ExistingIdx)
-    return Idx = ExistingIdx;
-  
-  // Start out ScopeInlinedAtRecords with a minimal reasonable size to avoid
-  // excessive reallocation starting out.
-  if (ScopeInlinedAtRecords.empty())
-    ScopeInlinedAtRecords.reserve(128);
-    
-  // Index is biased by 1 and negated.
-  Idx = -ScopeInlinedAtRecords.size()-1;
-  ScopeInlinedAtRecords.push_back(std::make_pair(DebugRecVH(Scope, this, Idx),
-                                                 DebugRecVH(IA, this, Idx)));
-  return Idx;
-}
-
-
-//===----------------------------------------------------------------------===//
-// DebugRecVH Implementation
-//===----------------------------------------------------------------------===//
-
-/// deleted - The MDNode this is pointing to got deleted, so this pointer needs
-/// to drop to null and we need remove our entry from the DenseMap.
-void DebugRecVH::deleted() {
-  // If this is a non-canonical reference, just drop the value to null, we know
-  // it doesn't have a map entry.
-  if (Idx == 0) {
-    setValPtr(nullptr);
-    return;
-  }
-    
-  MDNode *Cur = get();
-  
-  // If the index is positive, it is an entry in ScopeRecords.
-  if (Idx > 0) {
-    assert(Ctx->ScopeRecordIdx[Cur] == Idx && "Mapping out of date!");
-    Ctx->ScopeRecordIdx.erase(Cur);
-    // Reset this VH to null and we're done.
-    setValPtr(nullptr);
-    Idx = 0;
-    return;
-  }
-  
-  // Otherwise, it is an entry in ScopeInlinedAtRecords, we don't know if it
-  // is the scope or the inlined-at record entry.
-  assert(unsigned(-Idx-1) < Ctx->ScopeInlinedAtRecords.size());
-  std::pair<DebugRecVH, DebugRecVH> &Entry = Ctx->ScopeInlinedAtRecords[-Idx-1];
-  assert((this == &Entry.first || this == &Entry.second) &&
-         "Mapping out of date!");
-  
-  MDNode *OldScope = Entry.first.get();
-  MDNode *OldInlinedAt = Entry.second.get();
-  assert(OldScope && OldInlinedAt &&
-         "Entry should be non-canonical if either val dropped to null");
-
-  // Otherwise, we do have an entry in it, nuke it and we're done.
-  assert(Ctx->ScopeInlinedAtIdx[std::make_pair(OldScope, OldInlinedAt)] == Idx&&
-         "Mapping out of date");
-  Ctx->ScopeInlinedAtIdx.erase(std::make_pair(OldScope, OldInlinedAt));
-  
-  // Reset this VH to null.  Drop both 'Idx' values to null to indicate that
-  // we're in non-canonical form now.
-  setValPtr(nullptr);
-  Entry.first.Idx = Entry.second.Idx = 0;
-}
-
-void DebugRecVH::allUsesReplacedWith(Value *NewVa) {
-  // If being replaced with a non-mdnode value (e.g. undef) handle this as if
-  // the mdnode got deleted.
-  MDNode *NewVal = dyn_cast<MDNode>(NewVa);
-  if (!NewVal) return deleted();
-
-  // If this is a non-canonical reference, just change it, we know it already
-  // doesn't have a map entry.
-  if (Idx == 0) {
-    setValPtr(NewVa);
-    return;
-  }
-  
-  MDNode *OldVal = get();
-  assert(OldVal != NewVa && "Node replaced with self?");
-  
-  // If the index is positive, it is an entry in ScopeRecords.
-  if (Idx > 0) {
-    assert(Ctx->ScopeRecordIdx[OldVal] == Idx && "Mapping out of date!");
-    Ctx->ScopeRecordIdx.erase(OldVal);
-    setValPtr(NewVal);
-
-    int NewEntry = Ctx->getOrAddScopeRecordIdxEntry(NewVal, Idx);
-    
-    // If NewVal already has an entry, this becomes a non-canonical reference,
-    // just drop Idx to 0 to signify this.
-    if (NewEntry != Idx)
-      Idx = 0;
-    return;
-  }
-  
-  // Otherwise, it is an entry in ScopeInlinedAtRecords, we don't know if it
-  // is the scope or the inlined-at record entry.
-  assert(unsigned(-Idx-1) < Ctx->ScopeInlinedAtRecords.size());
-  std::pair<DebugRecVH, DebugRecVH> &Entry = Ctx->ScopeInlinedAtRecords[-Idx-1];
-  assert((this == &Entry.first || this == &Entry.second) &&
-         "Mapping out of date!");
-  
-  MDNode *OldScope = Entry.first.get();
-  MDNode *OldInlinedAt = Entry.second.get();
-  assert(OldScope && OldInlinedAt &&
-         "Entry should be non-canonical if either val dropped to null");
-  
-  // Otherwise, we do have an entry in it, nuke it and we're done.
-  assert(Ctx->ScopeInlinedAtIdx[std::make_pair(OldScope, OldInlinedAt)] == Idx&&
-         "Mapping out of date");
-  Ctx->ScopeInlinedAtIdx.erase(std::make_pair(OldScope, OldInlinedAt));
-  
-  // Reset this VH to the new value.
-  setValPtr(NewVal);
-
-  int NewIdx = Ctx->getOrAddScopeInlinedAtIdxEntry(Entry.first.get(),
-                                                   Entry.second.get(), Idx);
-  // If NewVal already has an entry, this becomes a non-canonical reference,
-  // just drop Idx to 0 to signify this.
-  if (NewIdx != Idx) {
-    std::pair<DebugRecVH, DebugRecVH> &Entry=Ctx->ScopeInlinedAtRecords[-Idx-1];
-    Entry.first.Idx = Entry.second.Idx = 0;
-  }
-}
diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp
index 37cce2b..cfb699a 100644
--- a/lib/IR/DiagnosticInfo.cpp
+++ b/lib/IR/DiagnosticInfo.cpp
@@ -98,7 +98,8 @@ DiagnosticInfoInlineAsm::DiagnosticInfoInlineAsm(const Instruction &I,
       Instr(&I) {
   if (const MDNode *SrcLoc = I.getMetadata("srcloc")) {
     if (SrcLoc->getNumOperands() != 0)
-      if (const ConstantInt *CI = dyn_cast<ConstantInt>(SrcLoc->getOperand(0)))
+      if (const auto *CI =
+              mdconst::dyn_extract<ConstantInt>(SrcLoc->getOperand(0)))
         LocCookie = CI->getZExtValue();
   }
 }
diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp
index d6649d6..9b6ff1e 100644
--- a/lib/IR/Dominators.cpp
+++ b/lib/IR/Dominators.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
@@ -298,10 +299,45 @@ void DominatorTree::verifyDomTree() const {
 }
 
 //===----------------------------------------------------------------------===//
+//  DominatorTreeAnalysis and related pass implementations
+//===----------------------------------------------------------------------===//
+//
+// This implements the DominatorTreeAnalysis which is used with the new pass
+// manager. It also implements some methods from utility passes.
+//
+//===----------------------------------------------------------------------===//
+
+DominatorTree DominatorTreeAnalysis::run(Function &F) {
+  DominatorTree DT;
+  DT.recalculate(F);
+  return DT;
+}
+
+char DominatorTreeAnalysis::PassID;
+
+DominatorTreePrinterPass::DominatorTreePrinterPass(raw_ostream &OS) : OS(OS) {}
+
+PreservedAnalyses DominatorTreePrinterPass::run(Function &F,
+                                                FunctionAnalysisManager *AM) {
+  OS << "DominatorTree for function: " << F.getName() << "\n";
+  AM->getResult<DominatorTreeAnalysis>(F).print(OS);
+
+  return PreservedAnalyses::all();
+}
+
+PreservedAnalyses DominatorTreeVerifierPass::run(Function &F,
+                                                 FunctionAnalysisManager *AM) {
+  AM->getResult<DominatorTreeAnalysis>(F).verifyDomTree();
+
+  return PreservedAnalyses::all();
+}
+
+//===----------------------------------------------------------------------===//
 //  DominatorTreeWrapperPass Implementation
 //===----------------------------------------------------------------------===//
 //
-// The implementation details of the wrapper pass that holds a DominatorTree.
+// The implementation details of the wrapper pass that holds a DominatorTree
+// suitable for use with the legacy pass manager.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 32b2ec5..33e1526 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -23,7 +23,6 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LeakDetector.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/RWMutex.h"
@@ -46,20 +45,13 @@ Argument::Argument(Type *Ty, const Twine &Name, Function *Par)
   : Value(Ty, Value::ArgumentVal) {
   Parent = nullptr;
 
-  // Make sure that we get added to a function
-  LeakDetector::addGarbageObject(this);
-
   if (Par)
     Par->getArgumentList().push_back(this);
   setName(Name);
 }
 
 void Argument::setParent(Function *parent) {
-  if (getParent())
-    LeakDetector::addGarbageObject(this);
   Parent = parent;
-  if (getParent())
-    LeakDetector::removeGarbageObject(this);
 }
 
 /// getArgNo - Return the index of this formal argument in its containing
@@ -260,9 +252,6 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name,
   if (Ty->getNumParams())
     setValueSubclassData(1);   // Set the "has lazy arguments" bit.
 
-  // Make sure that we get added to a function
-  LeakDetector::addGarbageObject(this);
-
   if (ParentModule)
     ParentModule->getFunctionList().push_back(this);
 
@@ -298,7 +287,7 @@ void Function::BuildLazyArguments() const {
 
   // Clear the lazy arguments bit.
   unsigned SDC = getSubclassDataFromValue();
-  const_cast<Function*>(this)->setValueSubclassData(SDC &= ~1);
+  const_cast<Function*>(this)->setValueSubclassData(SDC &= ~(1<<0));
 }
 
 size_t Function::arg_size() const {
@@ -309,11 +298,7 @@ bool Function::arg_empty() const {
 }
 
 void Function::setParent(Module *parent) {
-  if (getParent())
-    LeakDetector::addGarbageObject(this);
   Parent = parent;
-  if (getParent())
-    LeakDetector::removeGarbageObject(this);
 }
 
 // dropAllReferences() - This function causes all the subinstructions to "let
@@ -335,8 +320,9 @@ void Function::dropAllReferences() {
   while (!BasicBlocks.empty())
     BasicBlocks.begin()->eraseFromParent();
 
-  // Prefix data is stored in a side table.
+  // Prefix and prologue data are stored in a side table.
   setPrefixData(nullptr);
+  setPrologueData(nullptr);
 }
 
 void Function::addAttribute(unsigned i, Attribute::AttrKind attr) {
@@ -357,6 +343,12 @@ void Function::removeAttributes(unsigned i, AttributeSet attrs) {
   setAttributes(PAL);
 }
 
+void Function::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
+  AttributeSet PAL = getAttributes();
+  PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
+  setAttributes(PAL);
+}
+
 // Maintain the GC name for each function in an on-the-side table. This saves
 // allocating an additional word in Function for programs which do not use GC
 // (i.e., most programs) at the cost of increased overhead for clients which do
@@ -416,6 +408,10 @@ void Function::copyAttributesFrom(const GlobalValue *Src) {
     setPrefixData(SrcF->getPrefixData());
   else
     setPrefixData(nullptr);
+  if (SrcF->hasPrologueData())
+    setPrologueData(SrcF->getPrologueData());
+  else
+    setPrologueData(nullptr);
 }
 
 /// getIntrinsicID - This method returns the ID number of the specified
@@ -456,7 +452,19 @@ unsigned Function::lookupIntrinsicID() const {
 }
 
 /// Returns a stable mangling for the type specified for use in the name
-/// mangling scheme used by 'any' types in intrinsic signatures.
+/// mangling scheme used by 'any' types in intrinsic signatures.  The mangling
+/// of named types is simply their name.  Manglings for unnamed types consist
+/// of a prefix ('p' for pointers, 'a' for arrays, 'f_' for functions)
+/// combined with the mangling of their component types.  A vararg function
+/// type will have a suffix of 'vararg'.  Since function types can contain
+/// other function types, we close a function type mangling with suffix 'f'
+/// which can't be confused with it's prefix.  This ensures we don't have
+/// collisions between two unrelated function types. Otherwise, you might
+/// parse ffXX as f(fXX) or f(fX)X.  (X is a placeholder for any other type.)
+/// Manglings of integers, floats, and vectors ('i', 'f', and 'v' prefix in most
+/// cases) fall back to the MVT codepath, where they could be mangled to
+/// 'x86mmx', for example; matching on derived types is not sufficient to mangle
+/// everything.
 static std::string getMangledTypeStr(Type* Ty) {
   std::string Result;
   if (PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
@@ -476,7 +484,8 @@ static std::string getMangledTypeStr(Type* Ty) {
       Result += getMangledTypeStr(FT->getParamType(i));
     if (FT->isVarArg())
       Result += "vararg";
-    Result += "f"; //ensure distinguishable
+    // Ensure nested function types are distinguishable.
+    Result += "f"; 
   } else if (Ty)
     Result += EVT::getEVT(Ty).getEVTString();
   return Result;
@@ -537,7 +546,10 @@ enum IIT_Info {
   IIT_ANYPTR = 26,
   IIT_V1   = 27,
   IIT_VARARG = 28,
-  IIT_HALF_VEC_ARG = 29
+  IIT_HALF_VEC_ARG = 29,
+  IIT_SAME_VEC_WIDTH_ARG = 30,
+  IIT_PTR_TO_ARG = 31,
+  IIT_VEC_OF_PTRS_TO_ELT = 32
 };
 
 
@@ -645,6 +657,24 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
                                              ArgInfo));
     return;
   }
+  case IIT_SAME_VEC_WIDTH_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
+                                             ArgInfo));
+    return;
+  }
+  case IIT_PTR_TO_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::PtrToArgument,
+                                             ArgInfo));
+    return;
+  }
+  case IIT_VEC_OF_PTRS_TO_ELT: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::VecOfPtrsToElt,
+                                             ArgInfo));
+    return;
+  }
   case IIT_EMPTYSTRUCT:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0));
     return;
@@ -752,7 +782,28 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
   case IITDescriptor::HalfVecArgument:
     return VectorType::getHalfElementsVectorType(cast<VectorType>(
                                                   Tys[D.getArgumentNumber()]));
+  case IITDescriptor::SameVecWidthArgument: {
+    Type *EltTy = DecodeFixedType(Infos, Tys, Context);
+    Type *Ty = Tys[D.getArgumentNumber()];
+    if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+      return VectorType::get(EltTy, VTy->getNumElements());
+    }
+    llvm_unreachable("unhandled");
+  }
+  case IITDescriptor::PtrToArgument: {
+    Type *Ty = Tys[D.getArgumentNumber()];
+    return PointerType::getUnqual(Ty);
+  }
+  case IITDescriptor::VecOfPtrsToElt: {
+    Type *Ty = Tys[D.getArgumentNumber()];
+    VectorType *VTy = dyn_cast<VectorType>(Ty);
+    if (!VTy)
+      llvm_unreachable("Expected an argument of Vector Type");
+    Type *EltTy = VTy->getVectorElementType();
+    return VectorType::get(PointerType::getUnqual(EltTy),
+                           VTy->getNumElements());
   }
+ }
   llvm_unreachable("unhandled");
 }
 
@@ -871,11 +922,40 @@ void Function::setPrefixData(Constant *PrefixData) {
       PDHolder->setOperand(0, PrefixData);
     else
       PDHolder = ReturnInst::Create(getContext(), PrefixData);
-    SCData |= 2;
+    SCData |= (1<<1);
   } else {
     delete PDHolder;
     PDMap.erase(this);
-    SCData &= ~2;
+    SCData &= ~(1<<1);
   }
   setValueSubclassData(SCData);
 }
+
+Constant *Function::getPrologueData() const {
+  assert(hasPrologueData());
+  const LLVMContextImpl::PrologueDataMapTy &SOMap =
+      getContext().pImpl->PrologueDataMap;
+  assert(SOMap.find(this) != SOMap.end());
+  return cast<Constant>(SOMap.find(this)->second->getReturnValue());
+}
+
+void Function::setPrologueData(Constant *PrologueData) {
+  if (!PrologueData && !hasPrologueData())
+    return;
+
+  unsigned PDData = getSubclassDataFromValue();
+  LLVMContextImpl::PrologueDataMapTy &PDMap = getContext().pImpl->PrologueDataMap;
+  ReturnInst *&PDHolder = PDMap[this];
+  if (PrologueData) {
+    if (PDHolder)
+      PDHolder->setOperand(0, PrologueData);
+    else
+      PDHolder = ReturnInst::Create(getContext(), PrologueData);
+    PDData |= (1<<2);
+  } else {
+    delete PDHolder;
+    PDMap.erase(this);
+    PDData &= ~(1<<2);
+  }
+  setValueSubclassData(PDData);
+}
diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp
index 245c500..08f44e0 100644
--- a/lib/IR/GCOV.cpp
+++ b/lib/IR/GCOV.cpp
@@ -28,12 +28,16 @@ using namespace llvm;
 
 /// readGCNO - Read GCNO buffer.
 bool GCOVFile::readGCNO(GCOVBuffer &Buffer) {
-  if (!Buffer.readGCNOFormat()) return false;
-  if (!Buffer.readGCOVVersion(Version)) return false;
+  if (!Buffer.readGCNOFormat())
+    return false;
+  if (!Buffer.readGCOVVersion(Version))
+    return false;
 
-  if (!Buffer.readInt(Checksum)) return false;
+  if (!Buffer.readInt(Checksum))
+    return false;
   while (true) {
-    if (!Buffer.readFunctionTag()) break;
+    if (!Buffer.readFunctionTag())
+      break;
     auto GFun = make_unique<GCOVFunction>(*this);
     if (!GFun->readGCNO(Buffer, Version))
       return false;
@@ -48,19 +52,22 @@ bool GCOVFile::readGCNO(GCOVBuffer &Buffer) {
 /// called after readGCNO().
 bool GCOVFile::readGCDA(GCOVBuffer &Buffer) {
   assert(GCNOInitialized && "readGCDA() can only be called after readGCNO()");
-  if (!Buffer.readGCDAFormat()) return false;
+  if (!Buffer.readGCDAFormat())
+    return false;
   GCOV::GCOVVersion GCDAVersion;
-  if (!Buffer.readGCOVVersion(GCDAVersion)) return false;
+  if (!Buffer.readGCOVVersion(GCDAVersion))
+    return false;
   if (Version != GCDAVersion) {
     errs() << "GCOV versions do not match.\n";
     return false;
   }
 
   uint32_t GCDAChecksum;
-  if (!Buffer.readInt(GCDAChecksum)) return false;
+  if (!Buffer.readInt(GCDAChecksum))
+    return false;
   if (Checksum != GCDAChecksum) {
-    errs() << "File checksums do not match: " << Checksum << " != "
-           << GCDAChecksum << ".\n";
+    errs() << "File checksums do not match: " << Checksum
+           << " != " << GCDAChecksum << ".\n";
     return false;
   }
   for (size_t i = 0, e = Functions.size(); i < e; ++i) {
@@ -74,15 +81,20 @@ bool GCOVFile::readGCDA(GCOVBuffer &Buffer) {
   if (Buffer.readObjectTag()) {
     uint32_t Length;
     uint32_t Dummy;
-    if (!Buffer.readInt(Length)) return false;
-    if (!Buffer.readInt(Dummy)) return false; // checksum
-    if (!Buffer.readInt(Dummy)) return false; // num
-    if (!Buffer.readInt(RunCount)) return false;
-    Buffer.advanceCursor(Length-3);
+    if (!Buffer.readInt(Length))
+      return false;
+    if (!Buffer.readInt(Dummy))
+      return false; // checksum
+    if (!Buffer.readInt(Dummy))
+      return false; // num
+    if (!Buffer.readInt(RunCount))
+      return false;
+    Buffer.advanceCursor(Length - 3);
   }
   while (Buffer.readProgramTag()) {
     uint32_t Length;
-    if (!Buffer.readInt(Length)) return false;
+    if (!Buffer.readInt(Length))
+      return false;
     Buffer.advanceCursor(Length);
     ++ProgramCount;
   }
@@ -112,21 +124,28 @@ void GCOVFile::collectLineCounts(FileInfo &FI) {
 /// occurs.
 bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
   uint32_t Dummy;
-  if (!Buff.readInt(Dummy)) return false; // Function header length
-  if (!Buff.readInt(Ident)) return false;
-  if (!Buff.readInt(Checksum)) return false;
+  if (!Buff.readInt(Dummy))
+    return false; // Function header length
+  if (!Buff.readInt(Ident))
+    return false;
+  if (!Buff.readInt(Checksum))
+    return false;
   if (Version != GCOV::V402) {
     uint32_t CfgChecksum;
-    if (!Buff.readInt(CfgChecksum)) return false;
+    if (!Buff.readInt(CfgChecksum))
+      return false;
     if (Parent.getChecksum() != CfgChecksum) {
       errs() << "File checksums do not match: " << Parent.getChecksum()
              << " != " << CfgChecksum << " in (" << Name << ").\n";
       return false;
     }
   }
-  if (!Buff.readString(Name)) return false;
-  if (!Buff.readString(Filename)) return false;
-  if (!Buff.readInt(LineNumber)) return false;
+  if (!Buff.readString(Name))
+    return false;
+  if (!Buff.readString(Filename))
+    return false;
+  if (!Buff.readInt(LineNumber))
+    return false;
 
   // read blocks.
   if (!Buff.readBlockTag()) {
@@ -134,19 +153,23 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
     return false;
   }
   uint32_t BlockCount;
-  if (!Buff.readInt(BlockCount)) return false;
+  if (!Buff.readInt(BlockCount))
+    return false;
   for (uint32_t i = 0, e = BlockCount; i != e; ++i) {
-    if (!Buff.readInt(Dummy)) return false; // Block flags;
+    if (!Buff.readInt(Dummy))
+      return false; // Block flags;
     Blocks.push_back(make_unique<GCOVBlock>(*this, i));
   }
 
   // read edges.
   while (Buff.readEdgeTag()) {
     uint32_t EdgeCount;
-    if (!Buff.readInt(EdgeCount)) return false;
+    if (!Buff.readInt(EdgeCount))
+      return false;
     EdgeCount = (EdgeCount - 1) / 2;
     uint32_t BlockNo;
-    if (!Buff.readInt(BlockNo)) return false;
+    if (!Buff.readInt(BlockNo))
+      return false;
     if (BlockNo >= BlockCount) {
       errs() << "Unexpected block number: " << BlockNo << " (in " << Name
              << ").\n";
@@ -154,12 +177,14 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
     }
     for (uint32_t i = 0, e = EdgeCount; i != e; ++i) {
       uint32_t Dst;
-      if (!Buff.readInt(Dst)) return false;
+      if (!Buff.readInt(Dst))
+        return false;
       Edges.push_back(make_unique<GCOVEdge>(*Blocks[BlockNo], *Blocks[Dst]));
       GCOVEdge *Edge = Edges.back().get();
       Blocks[BlockNo]->addDstEdge(Edge);
       Blocks[Dst]->addSrcEdge(Edge);
-      if (!Buff.readInt(Dummy)) return false; // Edge flag
+      if (!Buff.readInt(Dummy))
+        return false; // Edge flag
     }
   }
 
@@ -167,11 +192,13 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
   while (Buff.readLineTag()) {
     uint32_t LineTableLength;
     // Read the length of this line table.
-    if (!Buff.readInt(LineTableLength)) return false;
-    uint32_t EndPos = Buff.getCursor() + LineTableLength*4;
+    if (!Buff.readInt(LineTableLength))
+      return false;
+    uint32_t EndPos = Buff.getCursor() + LineTableLength * 4;
     uint32_t BlockNo;
     // Read the block number this table is associated with.
-    if (!Buff.readInt(BlockNo)) return false;
+    if (!Buff.readInt(BlockNo))
+      return false;
     if (BlockNo >= BlockCount) {
       errs() << "Unexpected block number: " << BlockNo << " (in " << Name
              << ").\n";
@@ -180,13 +207,15 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
     GCOVBlock &Block = *Blocks[BlockNo];
     // Read the word that pads the beginning of the line table. This may be a
     // flag of some sort, but seems to always be zero.
-    if (!Buff.readInt(Dummy)) return false;
+    if (!Buff.readInt(Dummy))
+      return false;
 
     // Line information starts here and continues up until the last word.
     if (Buff.getCursor() != (EndPos - sizeof(uint32_t))) {
       StringRef F;
       // Read the source file name.
-      if (!Buff.readString(F)) return false;
+      if (!Buff.readString(F))
+        return false;
       if (Filename != F) {
         errs() << "Multiple sources for a single basic block: " << Filename
                << " != " << F << " (in " << Name << ").\n";
@@ -195,17 +224,21 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
       // Read lines up to, but not including, the null terminator.
       while (Buff.getCursor() < (EndPos - 2 * sizeof(uint32_t))) {
         uint32_t Line;
-        if (!Buff.readInt(Line)) return false;
+        if (!Buff.readInt(Line))
+          return false;
         // Line 0 means this instruction was injected by the compiler. Skip it.
-        if (!Line) continue;
+        if (!Line)
+          continue;
         Block.addLine(Line);
       }
       // Read the null terminator.
-      if (!Buff.readInt(Dummy)) return false;
+      if (!Buff.readInt(Dummy))
+        return false;
     }
     // The last word is either a flag or padding, it isn't clear which. Skip
     // over it.
-    if (!Buff.readInt(Dummy)) return false;
+    if (!Buff.readInt(Dummy))
+      return false;
   }
   return true;
 }
@@ -214,27 +247,31 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
 /// occurs.
 bool GCOVFunction::readGCDA(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
   uint32_t Dummy;
-  if (!Buff.readInt(Dummy)) return false; // Function header length
+  if (!Buff.readInt(Dummy))
+    return false; // Function header length
 
   uint32_t GCDAIdent;
-  if (!Buff.readInt(GCDAIdent)) return false;
+  if (!Buff.readInt(GCDAIdent))
+    return false;
   if (Ident != GCDAIdent) {
-    errs() << "Function identifiers do not match: " << Ident << " != "
-           << GCDAIdent << " (in " << Name << ").\n";
+    errs() << "Function identifiers do not match: " << Ident
+           << " != " << GCDAIdent << " (in " << Name << ").\n";
     return false;
   }
 
   uint32_t GCDAChecksum;
-  if (!Buff.readInt(GCDAChecksum)) return false;
+  if (!Buff.readInt(GCDAChecksum))
+    return false;
   if (Checksum != GCDAChecksum) {
-    errs() << "Function checksums do not match: " << Checksum << " != "
-           << GCDAChecksum << " (in " << Name << ").\n";
+    errs() << "Function checksums do not match: " << Checksum
+           << " != " << GCDAChecksum << " (in " << Name << ").\n";
     return false;
   }
 
   uint32_t CfgChecksum;
   if (Version != GCOV::V402) {
-    if (!Buff.readInt(CfgChecksum)) return false;
+    if (!Buff.readInt(CfgChecksum))
+      return false;
     if (Parent.getChecksum() != CfgChecksum) {
       errs() << "File checksums do not match: " << Parent.getChecksum()
              << " != " << CfgChecksum << " (in " << Name << ").\n";
@@ -243,7 +280,8 @@ bool GCOVFunction::readGCDA(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
   }
 
   StringRef GCDAName;
-  if (!Buff.readString(GCDAName)) return false;
+  if (!Buff.readString(GCDAName))
+    return false;
   if (Name != GCDAName) {
     errs() << "Function names do not match: " << Name << " != " << GCDAName
            << ".\n";
@@ -256,26 +294,28 @@ bool GCOVFunction::readGCDA(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
   }
 
   uint32_t Count;
-  if (!Buff.readInt(Count)) return false;
+  if (!Buff.readInt(Count))
+    return false;
   Count /= 2;
 
   // This for loop adds the counts for each block. A second nested loop is
   // required to combine the edge counts that are contained in the GCDA file.
   for (uint32_t BlockNo = 0; Count > 0; ++BlockNo) {
     // The last block is always reserved for exit block
-    if (BlockNo >= Blocks.size()-1) {
+    if (BlockNo >= Blocks.size() - 1) {
       errs() << "Unexpected number of edges (in " << Name << ").\n";
       return false;
     }
     GCOVBlock &Block = *Blocks[BlockNo];
     for (size_t EdgeNo = 0, End = Block.getNumDstEdges(); EdgeNo < End;
-           ++EdgeNo) {
+         ++EdgeNo) {
       if (Count == 0) {
         errs() << "Unexpected number of edges (in " << Name << ").\n";
         return false;
       }
       uint64_t ArcCount;
-      if (!Buff.readInt64(ArcCount)) return false;
+      if (!Buff.readInt64(ArcCount))
+        return false;
       Block.addCount(EdgeNo, ArcCount);
       --Count;
     }
@@ -349,9 +389,8 @@ void GCOVBlock::sortDstEdges() {
 /// collectLineCounts - Collect line counts. This must be used after
 /// reading .gcno and .gcda files.
 void GCOVBlock::collectLineCounts(FileInfo &FI) {
-  for (SmallVectorImpl<uint32_t>::iterator I = Lines.begin(),
-         E = Lines.end(); I != E; ++I)
-    FI.addBlockLine(Parent.getFilename(), *I, this);
+  for (uint32_t N : Lines)
+    FI.addBlockLine(Parent.getFilename(), N, this);
 }
 
 /// dump - Dump GCOVBlock content to dbgs() for debugging purposes.
@@ -359,25 +398,20 @@ void GCOVBlock::dump() const {
   dbgs() << "Block : " << Number << " Counter : " << Counter << "\n";
   if (!SrcEdges.empty()) {
     dbgs() << "\tSource Edges : ";
-    for (EdgeIterator I = SrcEdges.begin(), E = SrcEdges.end(); I != E; ++I) {
-      const GCOVEdge *Edge = *I;
+    for (const GCOVEdge *Edge : SrcEdges)
       dbgs() << Edge->Src.Number << " (" << Edge->Count << "), ";
-    }
     dbgs() << "\n";
   }
   if (!DstEdges.empty()) {
     dbgs() << "\tDestination Edges : ";
-    for (EdgeIterator I = DstEdges.begin(), E = DstEdges.end(); I != E; ++I) {
-      const GCOVEdge *Edge = *I;
+    for (const GCOVEdge *Edge : DstEdges)
       dbgs() << Edge->Dst.Number << " (" << Edge->Count << "), ";
-    }
     dbgs() << "\n";
   }
   if (!Lines.empty()) {
     dbgs() << "\tLines : ";
-    for (SmallVectorImpl<uint32_t>::const_iterator I = Lines.begin(),
-           E = Lines.end(); I != E; ++I)
-      dbgs() << (*I) << ",";
+    for (uint32_t N : Lines)
+      dbgs() << (N) << ",";
     dbgs() << "\n";
   }
 }
@@ -389,7 +423,7 @@ void GCOVBlock::dump() const {
 static uint32_t safeDiv(uint64_t Numerator, uint64_t Divisor) {
   if (!Numerator)
     return 0;
-  return Numerator/Divisor;
+  return Numerator / Divisor;
 }
 
 // This custom division function mimics gcov's branch ouputs:
@@ -401,7 +435,7 @@ static uint32_t branchDiv(uint64_t Numerator, uint64_t Divisor) {
   if (Numerator == Divisor)
     return 100;
 
-  uint8_t Res = (Numerator*100+Divisor/2) / Divisor;
+  uint8_t Res = (Numerator * 100 + Divisor / 2) / Divisor;
   if (Res == 0)
     return 1;
   if (Res == 100)
@@ -410,9 +444,8 @@ static uint32_t branchDiv(uint64_t Numerator, uint64_t Divisor) {
 }
 
 struct formatBranchInfo {
-  formatBranchInfo(const GCOVOptions &Options, uint64_t Count,
-                   uint64_t Total) :
-    Options(Options), Count(Count), Total(Total) {}
+  formatBranchInfo(const GCOVOptions &Options, uint64_t Count, uint64_t Total)
+      : Options(Options), Count(Count), Total(Total) {}
 
   void print(raw_ostream &OS) const {
     if (!Total)
@@ -437,6 +470,7 @@ namespace {
 class LineConsumer {
   std::unique_ptr<MemoryBuffer> Buffer;
   StringRef Remaining;
+
 public:
   LineConsumer(StringRef Filename) {
     ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
@@ -508,8 +542,7 @@ std::string FileInfo::getCoveragePath(StringRef Filename,
   if (Options.LongFileNames && !Filename.equals(MainFilename))
     CoveragePath =
         mangleCoveragePath(MainFilename, Options.PreservePaths) + "##";
-  CoveragePath +=
-      mangleCoveragePath(Filename, Options.PreservePaths) + ".gcov";
+  CoveragePath += mangleCoveragePath(Filename, Options.PreservePaths) + ".gcov";
   return CoveragePath;
 }
 
@@ -529,47 +562,44 @@ FileInfo::openCoveragePath(StringRef CoveragePath) {
 }
 
 /// print -  Print source files with collected line count information.
-void FileInfo::print(StringRef MainFilename, StringRef GCNOFile,
-                     StringRef GCDAFile) {
-  for (StringMap<LineData>::const_iterator I = LineInfo.begin(),
-         E = LineInfo.end(); I != E; ++I) {
-    StringRef Filename = I->first();
+void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename,
+                     StringRef GCNOFile, StringRef GCDAFile) {
+  for (const auto &LI : LineInfo) {
+    StringRef Filename = LI.first();
     auto AllLines = LineConsumer(Filename);
 
     std::string CoveragePath = getCoveragePath(Filename, MainFilename);
-    std::unique_ptr<raw_ostream> S = openCoveragePath(CoveragePath);
-    raw_ostream &OS = *S;
+    std::unique_ptr<raw_ostream> CovStream = openCoveragePath(CoveragePath);
+    raw_ostream &CovOS = *CovStream;
 
-    OS << "        -:    0:Source:" << Filename << "\n";
-    OS << "        -:    0:Graph:" << GCNOFile << "\n";
-    OS << "        -:    0:Data:" << GCDAFile << "\n";
-    OS << "        -:    0:Runs:" << RunCount << "\n";
-    OS << "        -:    0:Programs:" << ProgramCount << "\n";
+    CovOS << "        -:    0:Source:" << Filename << "\n";
+    CovOS << "        -:    0:Graph:" << GCNOFile << "\n";
+    CovOS << "        -:    0:Data:" << GCDAFile << "\n";
+    CovOS << "        -:    0:Runs:" << RunCount << "\n";
+    CovOS << "        -:    0:Programs:" << ProgramCount << "\n";
 
-    const LineData &Line = I->second;
+    const LineData &Line = LI.second;
     GCOVCoverage FileCoverage(Filename);
-    for (uint32_t LineIndex = 0;
-         LineIndex < Line.LastLine || !AllLines.empty(); ++LineIndex) {
+    for (uint32_t LineIndex = 0; LineIndex < Line.LastLine || !AllLines.empty();
+         ++LineIndex) {
       if (Options.BranchInfo) {
         FunctionLines::const_iterator FuncsIt = Line.Functions.find(LineIndex);
         if (FuncsIt != Line.Functions.end())
-          printFunctionSummary(OS, FuncsIt->second);
+          printFunctionSummary(CovOS, FuncsIt->second);
       }
 
       BlockLines::const_iterator BlocksIt = Line.Blocks.find(LineIndex);
       if (BlocksIt == Line.Blocks.end()) {
         // No basic blocks are on this line. Not an executable line of code.
-        OS << "        -:";
-        AllLines.printNext(OS, LineIndex + 1);
+        CovOS << "        -:";
+        AllLines.printNext(CovOS, LineIndex + 1);
       } else {
         const BlockVector &Blocks = BlocksIt->second;
 
         // Add up the block counts to form line counts.
         DenseMap<const GCOVFunction *, bool> LineExecs;
         uint64_t LineCount = 0;
-        for (BlockVector::const_iterator I = Blocks.begin(), E = Blocks.end();
-               I != E; ++I) {
-          const GCOVBlock *Block = *I;
+        for (const GCOVBlock *Block : Blocks) {
           if (Options.AllBlocks) {
             // Only take the highest block count for that line.
             uint64_t BlockCount = Block->getCount();
@@ -593,8 +623,8 @@ void FileInfo::print(StringRef MainFilename, StringRef GCNOFile,
             //    one of the blocks are executed.
             const GCOVFunction *Function = &Block->getParent();
             if (FuncCoverages.find(Function) == FuncCoverages.end()) {
-              std::pair<const GCOVFunction *, GCOVCoverage>
-                KeyValue(Function, GCOVCoverage(Function->getName()));
+              std::pair<const GCOVFunction *, GCOVCoverage> KeyValue(
+                  Function, GCOVCoverage(Function->getName()));
               FuncCoverages.insert(KeyValue);
             }
             GCOVCoverage &FuncCoverage = FuncCoverages.find(Function)->second;
@@ -615,32 +645,30 @@ void FileInfo::print(StringRef MainFilename, StringRef GCNOFile,
         }
 
         if (LineCount == 0)
-          OS << "    #####:";
+          CovOS << "    #####:";
         else {
-          OS << format("%9" PRIu64 ":", LineCount);
+          CovOS << format("%9" PRIu64 ":", LineCount);
           ++FileCoverage.LinesExec;
         }
         ++FileCoverage.LogicalLines;
 
-        AllLines.printNext(OS, LineIndex + 1);
+        AllLines.printNext(CovOS, LineIndex + 1);
 
         uint32_t BlockNo = 0;
         uint32_t EdgeNo = 0;
-        for (BlockVector::const_iterator I = Blocks.begin(), E = Blocks.end();
-               I != E; ++I) {
-          const GCOVBlock *Block = *I;
-
+        for (const GCOVBlock *Block : Blocks) {
           // Only print block and branch information at the end of the block.
-          if (Block->getLastLine() != LineIndex+1)
+          if (Block->getLastLine() != LineIndex + 1)
             continue;
           if (Options.AllBlocks)
-            printBlockInfo(OS, *Block, LineIndex, BlockNo);
+            printBlockInfo(CovOS, *Block, LineIndex, BlockNo);
           if (Options.BranchInfo) {
             size_t NumEdges = Block->getNumDstEdges();
             if (NumEdges > 1)
-              printBranchInfo(OS, *Block, FileCoverage, EdgeNo);
+              printBranchInfo(CovOS, *Block, FileCoverage, EdgeNo);
             else if (Options.UncondBranch && NumEdges == 1)
-              printUncondBranchInfo(OS, EdgeNo, (*Block->dst_begin())->Count);
+              printUncondBranchInfo(CovOS, EdgeNo,
+                                    (*Block->dst_begin())->Count);
           }
         }
       }
@@ -650,30 +678,25 @@ void FileInfo::print(StringRef MainFilename, StringRef GCNOFile,
 
   // FIXME: There is no way to detect calls given current instrumentation.
   if (Options.FuncCoverage)
-    printFuncCoverage();
-  printFileCoverage();
+    printFuncCoverage(InfoOS);
+  printFileCoverage(InfoOS);
   return;
 }
 
 /// printFunctionSummary - Print function and block summary.
 void FileInfo::printFunctionSummary(raw_ostream &OS,
                                     const FunctionVector &Funcs) const {
-  for (FunctionVector::const_iterator I = Funcs.begin(), E = Funcs.end();
-         I != E; ++I) {
-    const GCOVFunction *Func = *I;
+  for (const GCOVFunction *Func : Funcs) {
     uint64_t EntryCount = Func->getEntryCount();
     uint32_t BlocksExec = 0;
-    for (GCOVFunction::BlockIterator I = Func->block_begin(),
-           E = Func->block_end(); I != E; ++I) {
-      const GCOVBlock &Block = **I;
+    for (const GCOVBlock &Block : Func->blocks())
       if (Block.getNumDstEdges() && Block.getCount())
-          ++BlocksExec;
-    }
+        ++BlocksExec;
 
     OS << "function " << Func->getName() << " called " << EntryCount
-       << " returned " << safeDiv(Func->getExitCount()*100, EntryCount)
+       << " returned " << safeDiv(Func->getExitCount() * 100, EntryCount)
        << "% blocks executed "
-       << safeDiv(BlocksExec*100, Func->getNumBlocks()-1) << "%\n";
+       << safeDiv(BlocksExec * 100, Func->getNumBlocks() - 1) << "%\n";
   }
 }
 
@@ -684,7 +707,7 @@ void FileInfo::printBlockInfo(raw_ostream &OS, const GCOVBlock &Block,
     OS << "    $$$$$:";
   else
     OS << format("%9" PRIu64 ":", Block.getCount());
-  OS << format("%5u-block %2u\n", LineIndex+1, BlockNo++);
+  OS << format("%5u-block %2u\n", LineIndex + 1, BlockNo++);
 }
 
 /// printBranchInfo - Print conditional branch probabilities.
@@ -692,29 +715,29 @@ void FileInfo::printBranchInfo(raw_ostream &OS, const GCOVBlock &Block,
                                GCOVCoverage &Coverage, uint32_t &EdgeNo) {
   SmallVector<uint64_t, 16> BranchCounts;
   uint64_t TotalCounts = 0;
-  for (GCOVBlock::EdgeIterator I = Block.dst_begin(), E = Block.dst_end();
-         I != E; ++I) {
-    const GCOVEdge *Edge = *I;
+  for (const GCOVEdge *Edge : Block.dsts()) {
     BranchCounts.push_back(Edge->Count);
     TotalCounts += Edge->Count;
-    if (Block.getCount()) ++Coverage.BranchesExec;
-    if (Edge->Count) ++Coverage.BranchesTaken;
+    if (Block.getCount())
+      ++Coverage.BranchesExec;
+    if (Edge->Count)
+      ++Coverage.BranchesTaken;
     ++Coverage.Branches;
 
     if (Options.FuncCoverage) {
       const GCOVFunction *Function = &Block.getParent();
       GCOVCoverage &FuncCoverage = FuncCoverages.find(Function)->second;
-      if (Block.getCount()) ++FuncCoverage.BranchesExec;
-      if (Edge->Count) ++FuncCoverage.BranchesTaken;
+      if (Block.getCount())
+        ++FuncCoverage.BranchesExec;
+      if (Edge->Count)
+        ++FuncCoverage.BranchesTaken;
       ++FuncCoverage.Branches;
     }
   }
 
-  for (SmallVectorImpl<uint64_t>::const_iterator I = BranchCounts.begin(),
-         E = BranchCounts.end(); I != E; ++I) {
+  for (uint64_t N : BranchCounts)
     OS << format("branch %2u ", EdgeNo++)
-       << formatBranchInfo(Options, *I, TotalCounts) << "\n";
-  }
+       << formatBranchInfo(Options, N, TotalCounts) << "\n";
 }
 
 /// printUncondBranchInfo - Print unconditional branch probabilities.
@@ -726,46 +749,45 @@ void FileInfo::printUncondBranchInfo(raw_ostream &OS, uint32_t &EdgeNo,
 
 // printCoverage - Print generic coverage info used by both printFuncCoverage
 // and printFileCoverage.
-void FileInfo::printCoverage(const GCOVCoverage &Coverage) const {
-  outs() << format("Lines executed:%.2f%% of %u\n",
-                   double(Coverage.LinesExec)*100/Coverage.LogicalLines,
-                   Coverage.LogicalLines);
+void FileInfo::printCoverage(raw_ostream &OS,
+                             const GCOVCoverage &Coverage) const {
+  OS << format("Lines executed:%.2f%% of %u\n",
+               double(Coverage.LinesExec) * 100 / Coverage.LogicalLines,
+               Coverage.LogicalLines);
   if (Options.BranchInfo) {
     if (Coverage.Branches) {
-      outs() << format("Branches executed:%.2f%% of %u\n",
-                       double(Coverage.BranchesExec)*100/Coverage.Branches,
-                       Coverage.Branches);
-      outs() << format("Taken at least once:%.2f%% of %u\n",
-                       double(Coverage.BranchesTaken)*100/Coverage.Branches,
-                       Coverage.Branches);
+      OS << format("Branches executed:%.2f%% of %u\n",
+                   double(Coverage.BranchesExec) * 100 / Coverage.Branches,
+                   Coverage.Branches);
+      OS << format("Taken at least once:%.2f%% of %u\n",
+                   double(Coverage.BranchesTaken) * 100 / Coverage.Branches,
+                   Coverage.Branches);
     } else {
-      outs() << "No branches\n";
+      OS << "No branches\n";
     }
-    outs() << "No calls\n"; // to be consistent with gcov
+    OS << "No calls\n"; // to be consistent with gcov
   }
 }
 
 // printFuncCoverage - Print per-function coverage info.
-void FileInfo::printFuncCoverage() const {
-  for (FuncCoverageMap::const_iterator I = FuncCoverages.begin(),
-                                       E = FuncCoverages.end(); I != E; ++I) {
-    const GCOVCoverage &Coverage = I->second;
-    outs() << "Function '" << Coverage.Name << "'\n";
-    printCoverage(Coverage);
-    outs() << "\n";
+void FileInfo::printFuncCoverage(raw_ostream &OS) const {
+  for (const auto &FC : FuncCoverages) {
+    const GCOVCoverage &Coverage = FC.second;
+    OS << "Function '" << Coverage.Name << "'\n";
+    printCoverage(OS, Coverage);
+    OS << "\n";
   }
 }
 
 // printFileCoverage - Print per-file coverage info.
-void FileInfo::printFileCoverage() const {
-  for (FileCoverageList::const_iterator I = FileCoverages.begin(),
-                                        E = FileCoverages.end(); I != E; ++I) {
-    const std::string &Filename = I->first;
-    const GCOVCoverage &Coverage = I->second;
-    outs() << "File '" << Coverage.Name << "'\n";
-    printCoverage(Coverage);
+void FileInfo::printFileCoverage(raw_ostream &OS) const {
+  for (const auto &FC : FileCoverages) {
+    const std::string &Filename = FC.first;
+    const GCOVCoverage &Coverage = FC.second;
+    OS << "File '" << Coverage.Name << "'\n";
+    printCoverage(OS, Coverage);
     if (!Options.NoOutput)
-      outs() << Coverage.Name << ":creating '" << Filename << "'\n";
-    outs() << "\n";
+      OS << Coverage.Name << ":creating '" << Filename << "'\n";
+    OS << "\n";
   }
 }
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index e181d62..54197d9 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -18,7 +18,6 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/LeakDetector.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -159,8 +158,6 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link,
            "Initializer should be the same type as the GlobalVariable!");
     Op<0>() = InitVal;
   }
-
-  LeakDetector::addGarbageObject(this);
 }
 
 GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant,
@@ -180,8 +177,6 @@ GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant,
     Op<0>() = InitVal;
   }
 
-  LeakDetector::addGarbageObject(this);
-
   if (Before)
     Before->getParent()->getGlobalList().insert(Before, this);
   else
@@ -189,11 +184,7 @@ GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant,
 }
 
 void GlobalVariable::setParent(Module *parent) {
-  if (getParent())
-    LeakDetector::addGarbageObject(this);
   Parent = parent;
-  if (getParent())
-    LeakDetector::removeGarbageObject(this);
 }
 
 void GlobalVariable::removeFromParent() {
@@ -259,7 +250,6 @@ GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link,
                          Module *ParentModule)
     : GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalAliasVal,
                   &Op<0>(), 1, Link, Name) {
-  LeakDetector::addGarbageObject(this);
   Op<0>() = Aliasee;
 
   if (ParentModule)
@@ -296,11 +286,7 @@ GlobalAlias *GlobalAlias::create(const Twine &Name, GlobalValue *Aliasee) {
 }
 
 void GlobalAlias::setParent(Module *parent) {
-  if (getParent())
-    LeakDetector::addGarbageObject(this);
   Parent = parent;
-  if (getParent())
-    LeakDetector::removeGarbageObject(this);
 }
 
 void GlobalAlias::removeFromParent() {
diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp
index a4c5d97..90303b2 100644
--- a/lib/IR/IRBuilder.cpp
+++ b/lib/IR/IRBuilder.cpp
@@ -53,8 +53,9 @@ Value *IRBuilderBase::getCastedInt8PtrValue(Value *Ptr) {
 }
 
 static CallInst *createCallHelper(Value *Callee, ArrayRef<Value *> Ops,
-                                  IRBuilderBase *Builder) {
-  CallInst *CI = CallInst::Create(Callee, Ops, "");
+                                  IRBuilderBase *Builder,
+                                  const Twine& Name="") {
+  CallInst *CI = CallInst::Create(Callee, Ops, Name);
   Builder->GetInsertBlock()->getInstList().insert(Builder->GetInsertPoint(),CI);
   Builder->SetInstDebugLocation(CI);
   return CI;  
@@ -183,3 +184,117 @@ CallInst *IRBuilderBase::CreateAssumption(Value *Cond) {
   return createCallHelper(FnAssume, Ops, this);
 }
 
+/// Create a call to a Masked Load intrinsic.
+/// Ptr      - the base pointer for the load
+/// Align    - alignment of the source location
+/// Mask     - an vector of booleans which indicates what vector lanes should
+///            be accessed in memory
+/// PassThru - a pass-through value that is used to fill the masked-off lanes
+///            of the result
+/// Name     - name of the result variable
+CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
+                                          Value *Mask, Value *PassThru,
+                                          const Twine &Name) {
+  assert(Ptr->getType()->isPointerTy() && "Ptr must be of pointer type");
+  // DataTy is the overloaded type
+  Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
+  assert(DataTy->isVectorTy() && "Ptr should point to a vector");
+  if (!PassThru)
+    PassThru = UndefValue::get(DataTy);
+  Value *Ops[] = { Ptr, getInt32(Align), Mask,  PassThru};
+  return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy, Name);
+}
+
+/// Create a call to a Masked Store intrinsic.
+/// Val   - the data to be stored,
+/// Ptr   - the base pointer for the store
+/// Align - alignment of the destination location
+/// Mask  - an vector of booleans which indicates what vector lanes should
+///         be accessed in memory
+CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr,
+                                           unsigned Align, Value *Mask) {
+  Value *Ops[] = { Val, Ptr, getInt32(Align), Mask };
+  // Type of the data to be stored - the only one overloaded type
+  return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, Val->getType());
+}
+
+/// Create a call to a Masked intrinsic, with given intrinsic Id,
+/// an array of operands - Ops, and one overloaded type - DataTy
+CallInst *IRBuilderBase::CreateMaskedIntrinsic(unsigned Id,
+                                               ArrayRef<Value *> Ops,
+                                               Type *DataTy,
+                                               const Twine &Name) {
+  Module *M = BB->getParent()->getParent();
+  Type *OverloadedTypes[] = { DataTy };
+  Value *TheFn = Intrinsic::getDeclaration(M, (Intrinsic::ID)Id, OverloadedTypes);
+  return createCallHelper(TheFn, Ops, this, Name);
+}
+
+CallInst *IRBuilderBase::CreateGCStatepoint(Value *ActualCallee,
+                                            ArrayRef<Value *> CallArgs,
+                                            ArrayRef<Value *> DeoptArgs,
+                                            ArrayRef<Value *> GCArgs,
+                                            const Twine &Name) {
+ // Extract out the type of the callee.
+ PointerType *FuncPtrType = cast<PointerType>(ActualCallee->getType());
+ assert(isa<FunctionType>(FuncPtrType->getElementType()) &&
+        "actual callee must be a callable value");
+
+ 
+ Module *M = BB->getParent()->getParent();
+ // Fill in the one generic type'd argument (the function is also vararg)
+ Type *ArgTypes[] = { FuncPtrType };
+ Function *FnStatepoint =
+   Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint,
+                             ArgTypes);
+
+ std::vector<llvm::Value *> args;
+ args.push_back(ActualCallee);
+ args.push_back(getInt32(CallArgs.size()));
+ args.push_back(getInt32(0 /*unused*/));
+ args.insert(args.end(), CallArgs.begin(), CallArgs.end());
+ args.push_back(getInt32(DeoptArgs.size()));
+ args.insert(args.end(), DeoptArgs.begin(), DeoptArgs.end());
+ args.insert(args.end(), GCArgs.begin(), GCArgs.end());
+
+ return createCallHelper(FnStatepoint, args, this, Name);
+}
+
+CallInst *IRBuilderBase::CreateGCStatepoint(Value *ActualCallee,
+                                            ArrayRef<Use> CallArgs,
+                                            ArrayRef<Value *> DeoptArgs,
+                                            ArrayRef<Value *> GCArgs,
+                                            const Twine &Name) {
+  std::vector<Value *> VCallArgs;
+  for (auto &U : CallArgs)
+    VCallArgs.push_back(U.get());
+  return CreateGCStatepoint(ActualCallee, VCallArgs, DeoptArgs, GCArgs, Name);
+}
+
+CallInst *IRBuilderBase::CreateGCResult(Instruction *Statepoint,
+                                       Type *ResultType,
+                                       const Twine &Name) {
+ Intrinsic::ID ID = Intrinsic::experimental_gc_result;
+ Module *M = BB->getParent()->getParent();
+ Type *Types[] = {ResultType};
+ Value *FnGCResult = Intrinsic::getDeclaration(M, ID, Types);
+
+ Value *Args[] = {Statepoint};
+ return createCallHelper(FnGCResult, Args, this, Name);
+}
+
+CallInst *IRBuilderBase::CreateGCRelocate(Instruction *Statepoint,
+                                         int BaseOffset,
+                                         int DerivedOffset,
+                                         Type *ResultType,
+                                         const Twine &Name) {
+ Module *M = BB->getParent()->getParent();
+ Type *Types[] = {ResultType};
+ Value *FnGCRelocate =
+   Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, Types);
+
+ Value *Args[] = {Statepoint,
+                  getInt32(BaseOffset),
+                  getInt32(DerivedOffset)};
+ return createCallHelper(FnGCRelocate, Args, this, Name);
+}
diff --git a/lib/IR/IRPrintingPasses.cpp b/lib/IR/IRPrintingPasses.cpp
index c8a1747..91ccfbb 100644
--- a/lib/IR/IRPrintingPasses.cpp
+++ b/lib/IR/IRPrintingPasses.cpp
@@ -24,8 +24,8 @@ PrintModulePass::PrintModulePass() : OS(dbgs()) {}
 PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner)
     : OS(OS), Banner(Banner) {}
 
-PreservedAnalyses PrintModulePass::run(Module *M) {
-  OS << Banner << *M;
+PreservedAnalyses PrintModulePass::run(Module &M) {
+  OS << Banner << M;
   return PreservedAnalyses::all();
 }
 
@@ -33,8 +33,8 @@ PrintFunctionPass::PrintFunctionPass() : OS(dbgs()) {}
 PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner)
     : OS(OS), Banner(Banner) {}
 
-PreservedAnalyses PrintFunctionPass::run(Function *F) {
-  OS << Banner << static_cast<Value &>(*F);
+PreservedAnalyses PrintFunctionPass::run(Function &F) {
+  OS << Banner << static_cast<Value &>(F);
   return PreservedAnalyses::all();
 }
 
@@ -50,7 +50,7 @@ public:
       : ModulePass(ID), P(OS, Banner) {}
 
   bool runOnModule(Module &M) override {
-    P.run(&M);
+    P.run(M);
     return false;
   }
 
@@ -70,7 +70,7 @@ public:
 
   // This pass just prints a banner followed by the function as it's processed.
   bool runOnFunction(Function &F) override {
-    P.run(&F);
+    P.run(F);
     return false;
   }
 
diff --git a/lib/IR/InlineAsm.cpp b/lib/IR/InlineAsm.cpp
index 16d874f..5b73561 100644
--- a/lib/IR/InlineAsm.cpp
+++ b/lib/IR/InlineAsm.cpp
@@ -73,7 +73,7 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str,
   unsigned multipleAlternativeCount = Str.count('|') + 1;
   unsigned multipleAlternativeIndex = 0;
   ConstraintCodeVector *pCodes = &Codes;
-  
+
   // Initialize
   isMultipleAlternative = (multipleAlternativeCount > 1 ? true : false);
   if (isMultipleAlternative) {
@@ -99,12 +99,12 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str,
     ++I;
     Type = isOutput;
   }
-  
+
   if (*I == '*') {
     isIndirect = true;
     ++I;
   }
-  
+
   if (I == E) return true;  // Just a prefix, like "==" or "~".
   
   // Parse the modifiers.
@@ -228,7 +228,10 @@ InlineAsm::ParseConstraints(StringRef Constraints) {
     I = ConstraintEnd;
     if (I != E) {
       ++I;
-      if (I == E) { Result.clear(); break; }    // don't allow "xyz,"
+      if (I == E) {
+        Result.clear();
+        break;
+      } // don't allow "xyz,"
     }
   }
   
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 3ee66f5..92c6e9f 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -15,7 +15,6 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/LeakDetector.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/Type.h"
@@ -24,8 +23,6 @@ using namespace llvm;
 Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
                          Instruction *InsertBefore)
   : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(nullptr) {
-  // Make sure that we get added to a basicblock
-  LeakDetector::addGarbageObject(this);
 
   // If requested, insert this instruction into a basic block...
   if (InsertBefore) {
@@ -42,8 +39,6 @@ const DataLayout *Instruction::getDataLayout() const {
 Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
                          BasicBlock *InsertAtEnd)
   : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(nullptr) {
-  // Make sure that we get added to a basicblock
-  LeakDetector::addGarbageObject(this);
 
   // append this instruction into the basic block
   assert(InsertAtEnd && "Basic block to append to may not be NULL!");
@@ -60,12 +55,6 @@ Instruction::~Instruction() {
 
 
 void Instruction::setParent(BasicBlock *P) {
-  if (getParent()) {
-    if (!P) LeakDetector::addGarbageObject(this);
-  } else {
-    if (P) LeakDetector::removeGarbageObject(this);
-  }
-
   Parent = P;
 }
 
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 57a4f0b..7136923 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -346,6 +346,12 @@ void CallInst::removeAttribute(unsigned i, Attribute attr) {
   setAttributes(PAL);
 }
 
+void CallInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
+  AttributeSet PAL = getAttributes();
+  PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
+  setAttributes(PAL);
+}
+
 bool CallInst::hasFnAttrImpl(Attribute::AttrKind A) const {
   if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, A))
     return true;
@@ -605,6 +611,12 @@ void InvokeInst::removeAttribute(unsigned i, Attribute attr) {
   setAttributes(PAL);
 }
 
+void InvokeInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
+  AttributeSet PAL = getAttributes();
+  PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
+  setAttributes(PAL);
+}
+
 LandingPadInst *InvokeInst::getLandingPadInst() const {
   return cast<LandingPadInst>(getUnwindDest()->getFirstNonPHI());
 }
@@ -796,11 +808,8 @@ void BranchInst::swapSuccessors() {
     return;
 
   // The first operand is the name. Fetch them backwards and build a new one.
-  Value *Ops[] = {
-    ProfileData->getOperand(0),
-    ProfileData->getOperand(2),
-    ProfileData->getOperand(1)
-  };
+  Metadata *Ops[] = {ProfileData->getOperand(0), ProfileData->getOperand(2),
+                     ProfileData->getOperand(1)};
   setMetadata(LLVMContext::MD_prof,
               MDNode::get(ProfileData->getContext(), Ops));
 }
@@ -2076,7 +2085,7 @@ float FPMathOperator::getFPAccuracy() const {
       cast<Instruction>(this)->getMetadata(LLVMContext::MD_fpmath);
   if (!MD)
     return 0.0;
-  ConstantFP *Accuracy = cast<ConstantFP>(MD->getOperand(0));
+  ConstantFP *Accuracy = mdconst::extract<ConstantFP>(MD->getOperand(0));
   return Accuracy->getValueAPF().convertToFloat();
 }
 
@@ -2559,6 +2568,17 @@ CastInst *CastInst::CreatePointerBitCastOrAddrSpaceCast(
   return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
 }
 
+CastInst *CastInst::CreateBitOrPointerCast(Value *S, Type *Ty,
+                                           const Twine &Name,
+                                           Instruction *InsertBefore) {
+  if (S->getType()->isPointerTy() && Ty->isIntegerTy())
+    return Create(Instruction::PtrToInt, S, Ty, Name, InsertBefore);
+  if (S->getType()->isIntegerTy() && Ty->isPointerTy())
+    return Create(Instruction::IntToPtr, S, Ty, Name, InsertBefore);
+
+  return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
+}
+
 CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty,
                                       bool isSigned, const Twine &Name,
                                       Instruction *InsertBefore) {
@@ -2716,6 +2736,18 @@ bool CastInst::isBitCastable(Type *SrcTy, Type *DestTy) {
   return true;
 }
 
+bool CastInst::isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy,
+                                          const DataLayout *DL) {
+  if (auto *PtrTy = dyn_cast<PointerType>(SrcTy))
+    if (auto *IntTy = dyn_cast<IntegerType>(DestTy))
+      return DL && IntTy->getBitWidth() == DL->getPointerTypeSizeInBits(PtrTy);
+  if (auto *PtrTy = dyn_cast<PointerType>(DestTy))
+    if (auto *IntTy = dyn_cast<IntegerType>(SrcTy))
+      return DL && IntTy->getBitWidth() == DL->getPointerTypeSizeInBits(PtrTy);
+
+  return isBitCastable(SrcTy, DestTy);
+}
+
 // Provide a way to get a "cast" where the cast opcode is inferred from the
 // types and size of the operand. This, basically, is a parallel of the
 // logic in the castIsValid function below.  This axiom should hold:
@@ -2832,10 +2864,6 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
   // Check for type sanity on the arguments
   Type *SrcTy = S->getType();
 
-  // If this is a cast to the same type then it's trivially true.
-  if (SrcTy == DstTy)
-    return true;
-
   if (!SrcTy->isFirstClassType() || !DstTy->isFirstClassType() ||
       SrcTy->isAggregateType() || DstTy->isAggregateType())
     return false;
diff --git a/lib/IR/IntrinsicInst.cpp b/lib/IR/IntrinsicInst.cpp
index 5725284..b9b5a29 100644
--- a/lib/IR/IntrinsicInst.cpp
+++ b/lib/IR/IntrinsicInst.cpp
@@ -49,15 +49,25 @@ Value *DbgInfoIntrinsic::StripCast(Value *C) {
   return dyn_cast<GlobalVariable>(C);
 }
 
+static Value *getValueImpl(Value *Op) {
+  auto *MD = cast<MetadataAsValue>(Op)->getMetadata();
+  if (auto *V = dyn_cast<ValueAsMetadata>(MD))
+    return V->getValue();
+
+  // When the value goes to null, it gets replaced by an empty MDNode.
+  assert(!cast<MDNode>(MD)->getNumOperands() && "Expected an empty MDNode");
+  return nullptr;
+}
+
 //===----------------------------------------------------------------------===//
 /// DbgDeclareInst - This represents the llvm.dbg.declare instruction.
 ///
 
 Value *DbgDeclareInst::getAddress() const {
-  if (MDNode* MD = cast_or_null<MDNode>(getArgOperand(0)))
-    return MD->getOperand(0);
-  else
+  if (!getArgOperand(0))
     return nullptr;
+
+  return getValueImpl(getArgOperand(0));
 }
 
 //===----------------------------------------------------------------------===//
@@ -65,9 +75,7 @@ Value *DbgDeclareInst::getAddress() const {
 ///
 
 const Value *DbgValueInst::getValue() const {
-  return cast<MDNode>(getArgOperand(0))->getOperand(0);
+  return const_cast<DbgValueInst *>(this)->getValue();
 }
 
-Value *DbgValueInst::getValue() {
-  return cast<MDNode>(getArgOperand(0))->getOperand(0);
-}
+Value *DbgValueInst::getValue() { return getValueImpl(getArgOperand(0)); }
diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp
index c62bc09..b6d95c4 100644
--- a/lib/IR/LLVMContext.cpp
+++ b/lib/IR/LLVMContext.cpp
@@ -229,28 +229,10 @@ void LLVMContext::emitError(unsigned LocCookie, const Twine &ErrorStr) {
 // Metadata Kind Uniquing
 //===----------------------------------------------------------------------===//
 
-#ifndef NDEBUG
-/// isValidName - Return true if Name is a valid custom metadata handler name.
-static bool isValidName(StringRef MDName) {
-  if (MDName.empty())
-    return false;
-
-  if (!std::isalpha(static_cast<unsigned char>(MDName[0])))
-    return false;
-
-  for (StringRef::iterator I = MDName.begin() + 1, E = MDName.end(); I != E;
-       ++I) {
-    if (!std::isalnum(static_cast<unsigned char>(*I)) && *I != '_' &&
-        *I != '-' && *I != '.')
-      return false;
-  }
-  return true;
-}
-#endif
-
 /// getMDKindID - Return a unique non-zero ID for the specified metadata kind.
 unsigned LLVMContext::getMDKindID(StringRef Name) const {
-  assert(isValidName(Name) && "Invalid MDNode name");
+  assert(!std::isdigit(Name.front()) &&
+         "Named metadata may not start with a digit");
 
   // If this is new, assign it its ID.
   return pImpl->CustomMDKindNames.insert(std::make_pair(
diff --git a/lib/IR/LLVMContextImpl.cpp b/lib/IR/LLVMContextImpl.cpp
index 3fd0bb3..d717b92 100644
--- a/lib/IR/LLVMContextImpl.cpp
+++ b/lib/IR/LLVMContextImpl.cpp
@@ -72,9 +72,31 @@ LLVMContextImpl::~LLVMContextImpl() {
   // the container. Avoid iterators during this operation:
   while (!OwnedModules.empty())
     delete *OwnedModules.begin();
-  
-  // Free the constants.  This is important to do here to ensure that they are
-  // freed before the LeakDetector is torn down.
+
+  // Drop references for MDNodes.  Do this before Values get deleted to avoid
+  // unnecessary RAUW when nodes are still unresolved.
+  for (auto *I : DistinctMDNodes)
+    I->dropAllReferences();
+#define HANDLE_MDNODE_LEAF(CLASS)                                              \
+  for (auto *I : CLASS##s)                                                     \
+    I->dropAllReferences();
+#include "llvm/IR/Metadata.def"
+
+  // Also drop references that come from the Value bridges.
+  for (auto &Pair : ValuesAsMetadata)
+    Pair.second->dropUsers();
+  for (auto &Pair : MetadataAsValues)
+    Pair.second->dropUse();
+
+  // Destroy MDNodes.
+  for (MDNode *I : DistinctMDNodes)
+    I->deleteAsSubclass();
+#define HANDLE_MDNODE_LEAF(CLASS)                                              \
+  for (CLASS *I : CLASS##s)                                                    \
+    delete I;
+#include "llvm/IR/Metadata.def"
+
+  // Free the constants.
   std::for_each(ExprConstants.map_begin(), ExprConstants.map_end(),
                 DropFirst());
   std::for_each(ArrayConstants.map_begin(), ArrayConstants.map_end(),
@@ -120,23 +142,81 @@ LLVMContextImpl::~LLVMContextImpl() {
     delete &*Elem;
   }
 
-  // Destroy MDNodes.  ~MDNode can move and remove nodes between the MDNodeSet
-  // and the NonUniquedMDNodes sets, so copy the values out first.
-  SmallVector<GenericMDNode *, 8> MDNodes;
-  MDNodes.reserve(MDNodeSet.size() + NonUniquedMDNodes.size());
-  MDNodes.append(MDNodeSet.begin(), MDNodeSet.end());
-  MDNodes.append(NonUniquedMDNodes.begin(), NonUniquedMDNodes.end());
-  for (GenericMDNode *I : MDNodes)
-    I->dropAllReferences();
-  for (GenericMDNode *I : MDNodes)
-    delete I;
-  assert(MDNodeSet.empty() && NonUniquedMDNodes.empty() &&
-         "Destroying all MDNodes didn't empty the Context's sets.");
+  // Destroy MetadataAsValues.
+  {
+    SmallVector<MetadataAsValue *, 8> MDVs;
+    MDVs.reserve(MetadataAsValues.size());
+    for (auto &Pair : MetadataAsValues)
+      MDVs.push_back(Pair.second);
+    MetadataAsValues.clear();
+    for (auto *V : MDVs)
+      delete V;
+  }
+
+  // Destroy ValuesAsMetadata.
+  for (auto &Pair : ValuesAsMetadata)
+    delete Pair.second;
 
   // Destroy MDStrings.
   MDStringCache.clear();
 }
 
+void LLVMContextImpl::dropTriviallyDeadConstantArrays() {
+  bool Changed;
+  do {
+    Changed = false;
+
+    for (auto I = ArrayConstants.map_begin(), E = ArrayConstants.map_end();
+         I != E; ) {
+      auto *C = I->first;
+      I++;
+      if (C->use_empty()) {
+        Changed = true;
+        C->destroyConstant();
+      }
+    }
+
+  } while (Changed);
+}
+
+void Module::dropTriviallyDeadConstantArrays() {
+  Context.pImpl->dropTriviallyDeadConstantArrays();
+}
+
+namespace llvm {
+/// \brief Make MDOperand transparent for hashing.
+///
+/// This overload of an implementation detail of the hashing library makes
+/// MDOperand hash to the same value as a \a Metadata pointer.
+///
+/// Note that overloading \a hash_value() as follows:
+///
+/// \code
+///     size_t hash_value(const MDOperand &X) { return hash_value(X.get()); }
+/// \endcode
+///
+/// does not cause MDOperand to be transparent.  In particular, a bare pointer
+/// doesn't get hashed before it's combined, whereas \a MDOperand would.
+static const Metadata *get_hashable_data(const MDOperand &X) { return X.get(); }
+}
+
+unsigned MDNodeOpsKey::calculateHash(MDNode *N, unsigned Offset) {
+  unsigned Hash = hash_combine_range(N->op_begin() + Offset, N->op_end());
+#ifndef NDEBUG
+  {
+    SmallVector<Metadata *, 8> MDs(N->op_begin() + Offset, N->op_end());
+    unsigned RawHash = calculateHash(MDs);
+    assert(Hash == RawHash &&
+           "Expected hash of MDOperand to equal hash of Metadata*");
+  }
+#endif
+  return Hash;
+}
+
+unsigned MDNodeOpsKey::calculateHash(ArrayRef<Metadata *> Ops) {
+  return hash_combine_range(Ops.begin(), Ops.end());
+}
+
 // ConstantsContext anchors
 void UnaryConstantExpr::anchor() { }
 
@@ -157,3 +237,4 @@ void InsertValueConstantExpr::anchor() { }
 void GetElementPtrConstantExpr::anchor() { }
 
 void CompareConstantExpr::anchor() { }
+
diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h
index e743ec3..4631246 100644
--- a/lib/IR/LLVMContextImpl.h
+++ b/lib/IR/LLVMContextImpl.h
@@ -17,7 +17,6 @@
 
 #include "AttributeImpl.h"
 #include "ConstantsContext.h"
-#include "LeaksContext.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -28,6 +27,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
@@ -41,60 +41,38 @@ class ConstantFP;
 class DiagnosticInfoOptimizationRemark;
 class DiagnosticInfoOptimizationRemarkMissed;
 class DiagnosticInfoOptimizationRemarkAnalysis;
+class GCStrategy;
 class LLVMContext;
 class Type;
 class Value;
 
 struct DenseMapAPIntKeyInfo {
-  struct KeyTy {
-    APInt val;
-    Type* type;
-    KeyTy(const APInt& V, Type* Ty) : val(V), type(Ty) {}
-    bool operator==(const KeyTy& that) const {
-      return type == that.type && this->val == that.val;
-    }
-    bool operator!=(const KeyTy& that) const {
-      return !this->operator==(that);
-    }
-    friend hash_code hash_value(const KeyTy &Key) {
-      return hash_combine(Key.type, Key.val);
-    }
-  };
-  static inline KeyTy getEmptyKey() { return KeyTy(APInt(1,0), nullptr); }
-  static inline KeyTy getTombstoneKey() { return KeyTy(APInt(1,1), nullptr); }
-  static unsigned getHashValue(const KeyTy &Key) {
+  static inline APInt getEmptyKey() {
+    APInt V(nullptr, 0);
+    V.VAL = 0;
+    return V;
+  }
+  static inline APInt getTombstoneKey() {
+    APInt V(nullptr, 0);
+    V.VAL = 1;
+    return V;
+  }
+  static unsigned getHashValue(const APInt &Key) {
     return static_cast<unsigned>(hash_value(Key));
   }
-  static bool isEqual(const KeyTy &LHS, const KeyTy &RHS) {
-    return LHS == RHS;
+  static bool isEqual(const APInt &LHS, const APInt &RHS) {
+    return LHS.getBitWidth() == RHS.getBitWidth() && LHS == RHS;
   }
 };
 
 struct DenseMapAPFloatKeyInfo {
-  struct KeyTy {
-    APFloat val;
-    KeyTy(const APFloat& V) : val(V){}
-    bool operator==(const KeyTy& that) const {
-      return this->val.bitwiseIsEqual(that.val);
-    }
-    bool operator!=(const KeyTy& that) const {
-      return !this->operator==(that);
-    }
-    friend hash_code hash_value(const KeyTy &Key) {
-      return hash_combine(Key.val);
-    }
-  };
-  static inline KeyTy getEmptyKey() { 
-    return KeyTy(APFloat(APFloat::Bogus,1));
-  }
-  static inline KeyTy getTombstoneKey() { 
-    return KeyTy(APFloat(APFloat::Bogus,2)); 
-  }
-  static unsigned getHashValue(const KeyTy &Key) {
+  static inline APFloat getEmptyKey() { return APFloat(APFloat::Bogus, 1); }
+  static inline APFloat getTombstoneKey() { return APFloat(APFloat::Bogus, 2); }
+  static unsigned getHashValue(const APFloat &Key) {
     return static_cast<unsigned>(hash_value(Key));
   }
-  static bool isEqual(const KeyTy &LHS, const KeyTy &RHS) {
-    return LHS == RHS;
+  static bool isEqual(const APFloat &LHS, const APFloat &RHS) {
+    return LHS.bitwiseIsEqual(RHS);
   }
 };
 
@@ -104,9 +82,8 @@ struct AnonStructTypeKeyInfo {
     bool isPacked;
     KeyTy(const ArrayRef<Type*>& E, bool P) :
       ETypes(E), isPacked(P) {}
-    KeyTy(const StructType* ST) :
-      ETypes(ArrayRef<Type*>(ST->element_begin(), ST->element_end())),
-      isPacked(ST->isPacked()) {}
+    KeyTy(const StructType *ST)
+        : ETypes(ST->elements()), isPacked(ST->isPacked()) {}
     bool operator==(const KeyTy& that) const {
       if (isPacked != that.isPacked)
         return false;
@@ -149,10 +126,9 @@ struct FunctionTypeKeyInfo {
     bool isVarArg;
     KeyTy(const Type* R, const ArrayRef<Type*>& P, bool V) :
       ReturnType(R), Params(P), isVarArg(V) {}
-    KeyTy(const FunctionType* FT) :
-      ReturnType(FT->getReturnType()),
-      Params(makeArrayRef(FT->param_begin(), FT->param_end())),
-      isVarArg(FT->isVarArg()) {}
+    KeyTy(const FunctionType *FT)
+        : ReturnType(FT->getReturnType()), Params(FT->params()),
+          isVarArg(FT->isVarArg()) {}
     bool operator==(const KeyTy& that) const {
       if (ReturnType != that.ReturnType)
         return false;
@@ -191,78 +167,690 @@ struct FunctionTypeKeyInfo {
   }
 };
 
-/// \brief DenseMapInfo for GenericMDNode.
+/// \brief Structure for hashing arbitrary MDNode operands.
+class MDNodeOpsKey {
+  ArrayRef<Metadata *> RawOps;
+  ArrayRef<MDOperand> Ops;
+
+  unsigned Hash;
+
+protected:
+  MDNodeOpsKey(ArrayRef<Metadata *> Ops)
+      : RawOps(Ops), Hash(calculateHash(Ops)) {}
+
+  template <class NodeTy>
+  MDNodeOpsKey(const NodeTy *N, unsigned Offset = 0)
+      : Ops(N->op_begin() + Offset, N->op_end()), Hash(N->getHash()) {}
+
+  template <class NodeTy>
+  bool compareOps(const NodeTy *RHS, unsigned Offset = 0) const {
+    if (getHash() != RHS->getHash())
+      return false;
+
+    assert((RawOps.empty() || Ops.empty()) && "Two sets of operands?");
+    return RawOps.empty() ? compareOps(Ops, RHS, Offset)
+                          : compareOps(RawOps, RHS, Offset);
+  }
+
+  static unsigned calculateHash(MDNode *N, unsigned Offset = 0);
+
+private:
+  template <class T>
+  static bool compareOps(ArrayRef<T> Ops, const MDNode *RHS, unsigned Offset) {
+    if (Ops.size() != RHS->getNumOperands() - Offset)
+      return false;
+    return std::equal(Ops.begin(), Ops.end(), RHS->op_begin() + Offset);
+  }
+
+  static unsigned calculateHash(ArrayRef<Metadata *> Ops);
+
+public:
+  unsigned getHash() const { return Hash; }
+};
+
+template <class NodeTy> struct MDNodeKeyImpl;
+template <class NodeTy> struct MDNodeInfo;
+
+/// \brief DenseMapInfo for MDTuple.
 ///
 /// Note that we don't need the is-function-local bit, since that's implicit in
 /// the operands.
-struct GenericMDNodeInfo {
-  struct KeyTy {
-    ArrayRef<Value *> Ops;
-    unsigned Hash;
-
-    KeyTy(ArrayRef<Value *> Ops)
-        : Ops(Ops), Hash(hash_combine_range(Ops.begin(), Ops.end())) {}
-
-    KeyTy(GenericMDNode *N, SmallVectorImpl<Value *> &Storage) {
-      Storage.resize(N->getNumOperands());
-      for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I)
-        Storage[I] = N->getOperand(I);
-      Ops = Storage;
-      Hash = hash_combine_range(Ops.begin(), Ops.end());
-    }
+template <> struct MDNodeKeyImpl<MDTuple> : MDNodeOpsKey {
+  MDNodeKeyImpl(ArrayRef<Metadata *> Ops) : MDNodeOpsKey(Ops) {}
+  MDNodeKeyImpl(const MDTuple *N) : MDNodeOpsKey(N) {}
 
-    bool operator==(const GenericMDNode *RHS) const {
-      if (RHS == getEmptyKey() || RHS == getTombstoneKey())
-        return false;
-      if (Hash != RHS->getHash() || Ops.size() != RHS->getNumOperands())
-        return false;
-      for (unsigned I = 0, E = Ops.size(); I != E; ++I)
-        if (Ops[I] != RHS->getOperand(I))
-          return false;
-      return true;
-    }
-  };
-  static inline GenericMDNode *getEmptyKey() {
-    return DenseMapInfo<GenericMDNode *>::getEmptyKey();
+  bool isKeyOf(const MDTuple *RHS) const { return compareOps(RHS); }
+
+  unsigned getHashValue() const { return getHash(); }
+
+  static unsigned calculateHash(MDTuple *N) {
+    return MDNodeOpsKey::calculateHash(N);
   }
-  static inline GenericMDNode *getTombstoneKey() {
-    return DenseMapInfo<GenericMDNode *>::getTombstoneKey();
+};
+
+/// \brief DenseMapInfo for MDLocation.
+template <> struct MDNodeKeyImpl<MDLocation> {
+  unsigned Line;
+  unsigned Column;
+  Metadata *Scope;
+  Metadata *InlinedAt;
+
+  MDNodeKeyImpl(unsigned Line, unsigned Column, Metadata *Scope,
+                Metadata *InlinedAt)
+      : Line(Line), Column(Column), Scope(Scope), InlinedAt(InlinedAt) {}
+
+  MDNodeKeyImpl(const MDLocation *L)
+      : Line(L->getLine()), Column(L->getColumn()), Scope(L->getScope()),
+        InlinedAt(L->getInlinedAt()) {}
+
+  bool isKeyOf(const MDLocation *RHS) const {
+    return Line == RHS->getLine() && Column == RHS->getColumn() &&
+           Scope == RHS->getScope() && InlinedAt == RHS->getInlinedAt();
   }
-  static unsigned getHashValue(const KeyTy &Key) { return Key.Hash; }
-  static unsigned getHashValue(const GenericMDNode *U) {
-    return U->getHash();
+  unsigned getHashValue() const {
+    return hash_combine(Line, Column, Scope, InlinedAt);
   }
-  static bool isEqual(const KeyTy &LHS, const GenericMDNode *RHS) {
-    return LHS == RHS;
+};
+
+/// \brief DenseMapInfo for GenericDebugNode.
+template <> struct MDNodeKeyImpl<GenericDebugNode> : MDNodeOpsKey {
+  unsigned Tag;
+  StringRef Header;
+  MDNodeKeyImpl(unsigned Tag, StringRef Header, ArrayRef<Metadata *> DwarfOps)
+      : MDNodeOpsKey(DwarfOps), Tag(Tag), Header(Header) {}
+  MDNodeKeyImpl(const GenericDebugNode *N)
+      : MDNodeOpsKey(N, 1), Tag(N->getTag()), Header(N->getHeader()) {}
+
+  bool isKeyOf(const GenericDebugNode *RHS) const {
+    return Tag == RHS->getTag() && Header == RHS->getHeader() &&
+           compareOps(RHS, 1);
   }
-  static bool isEqual(const GenericMDNode *LHS, const GenericMDNode *RHS) {
-    return LHS == RHS;
+
+  unsigned getHashValue() const { return hash_combine(getHash(), Tag, Header); }
+
+  static unsigned calculateHash(GenericDebugNode *N) {
+    return MDNodeOpsKey::calculateHash(N, 1);
   }
 };
 
-/// DebugRecVH - This is a CallbackVH used to keep the Scope -> index maps
-/// up to date as MDNodes mutate.  This class is implemented in DebugLoc.cpp.
-class DebugRecVH : public CallbackVH {
-  /// Ctx - This is the LLVM Context being referenced.
-  LLVMContextImpl *Ctx;
-  
-  /// Idx - The index into either ScopeRecordIdx or ScopeInlinedAtRecords that
-  /// this reference lives in.  If this is zero, then it represents a
-  /// non-canonical entry that has no DenseMap value.  This can happen due to
-  /// RAUW.
-  int Idx;
-public:
-  DebugRecVH(MDNode *n, LLVMContextImpl *ctx, int idx)
-    : CallbackVH(n), Ctx(ctx), Idx(idx) {}
-  
-  MDNode *get() const {
-    return cast_or_null<MDNode>(getValPtr());
+template <> struct MDNodeKeyImpl<MDSubrange> {
+  int64_t Count;
+  int64_t Lo;
+
+  MDNodeKeyImpl(int64_t Count, int64_t Lo) : Count(Count), Lo(Lo) {}
+  MDNodeKeyImpl(const MDSubrange *N) : Count(N->getCount()), Lo(N->getLo()) {}
+
+  bool isKeyOf(const MDSubrange *RHS) const {
+    return Count == RHS->getCount() && Lo == RHS->getLo();
   }
+  unsigned getHashValue() const { return hash_combine(Count, Lo); }
+};
+
+template <> struct MDNodeKeyImpl<MDEnumerator> {
+  int64_t Value;
+  StringRef Name;
 
-  void deleted() override;
-  void allUsesReplacedWith(Value *VNew) override;
+  MDNodeKeyImpl(int64_t Value, StringRef Name) : Value(Value), Name(Name) {}
+  MDNodeKeyImpl(const MDEnumerator *N)
+      : Value(N->getValue()), Name(N->getName()) {}
+
+  bool isKeyOf(const MDEnumerator *RHS) const {
+    return Value == RHS->getValue() && Name == RHS->getName();
+  }
+  unsigned getHashValue() const { return hash_combine(Value, Name); }
 };
-  
+
+template <> struct MDNodeKeyImpl<MDBasicType> {
+  unsigned Tag;
+  StringRef Name;
+  uint64_t SizeInBits;
+  uint64_t AlignInBits;
+  unsigned Encoding;
+
+  MDNodeKeyImpl(unsigned Tag, StringRef Name, uint64_t SizeInBits,
+                uint64_t AlignInBits, unsigned Encoding)
+      : Tag(Tag), Name(Name), SizeInBits(SizeInBits), AlignInBits(AlignInBits),
+        Encoding(Encoding) {}
+  MDNodeKeyImpl(const MDBasicType *N)
+      : Tag(N->getTag()), Name(N->getName()), SizeInBits(N->getSizeInBits()),
+        AlignInBits(N->getAlignInBits()), Encoding(N->getEncoding()) {}
+
+  bool isKeyOf(const MDBasicType *RHS) const {
+    return Tag == RHS->getTag() && Name == RHS->getName() &&
+           SizeInBits == RHS->getSizeInBits() &&
+           AlignInBits == RHS->getAlignInBits() &&
+           Encoding == RHS->getEncoding();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(Tag, Name, SizeInBits, AlignInBits, Encoding);
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDDerivedType> {
+  unsigned Tag;
+  StringRef Name;
+  Metadata *File;
+  unsigned Line;
+  Metadata *Scope;
+  Metadata *BaseType;
+  uint64_t SizeInBits;
+  uint64_t AlignInBits;
+  uint64_t OffsetInBits;
+  unsigned Flags;
+  Metadata *ExtraData;
+
+  MDNodeKeyImpl(unsigned Tag, StringRef Name, Metadata *File, unsigned Line,
+                Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
+                uint64_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
+                Metadata *ExtraData)
+      : Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
+        BaseType(BaseType), SizeInBits(SizeInBits), AlignInBits(AlignInBits),
+        OffsetInBits(OffsetInBits), Flags(Flags), ExtraData(ExtraData) {}
+  MDNodeKeyImpl(const MDDerivedType *N)
+      : Tag(N->getTag()), Name(N->getName()), File(N->getFile()),
+        Line(N->getLine()), Scope(N->getScope()), BaseType(N->getBaseType()),
+        SizeInBits(N->getSizeInBits()), AlignInBits(N->getAlignInBits()),
+        OffsetInBits(N->getOffsetInBits()), Flags(N->getFlags()),
+        ExtraData(N->getExtraData()) {}
+
+  bool isKeyOf(const MDDerivedType *RHS) const {
+    return Tag == RHS->getTag() && Name == RHS->getName() &&
+           File == RHS->getFile() && Line == RHS->getLine() &&
+           Scope == RHS->getScope() && BaseType == RHS->getBaseType() &&
+           SizeInBits == RHS->getSizeInBits() &&
+           AlignInBits == RHS->getAlignInBits() &&
+           OffsetInBits == RHS->getOffsetInBits() && Flags == RHS->getFlags() &&
+           ExtraData == RHS->getExtraData();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                        AlignInBits, OffsetInBits, Flags, ExtraData);
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDCompositeType> {
+  unsigned Tag;
+  StringRef Name;
+  Metadata *File;
+  unsigned Line;
+  Metadata *Scope;
+  Metadata *BaseType;
+  uint64_t SizeInBits;
+  uint64_t AlignInBits;
+  uint64_t OffsetInBits;
+  unsigned Flags;
+  Metadata *Elements;
+  unsigned RuntimeLang;
+  Metadata *VTableHolder;
+  Metadata *TemplateParams;
+  StringRef Identifier;
+
+  MDNodeKeyImpl(unsigned Tag, StringRef Name, Metadata *File, unsigned Line,
+                Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
+                uint64_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
+                Metadata *Elements, unsigned RuntimeLang,
+                Metadata *VTableHolder, Metadata *TemplateParams,
+                StringRef Identifier)
+      : Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
+        BaseType(BaseType), SizeInBits(SizeInBits), AlignInBits(AlignInBits),
+        OffsetInBits(OffsetInBits), Flags(Flags), Elements(Elements),
+        RuntimeLang(RuntimeLang), VTableHolder(VTableHolder),
+        TemplateParams(TemplateParams), Identifier(Identifier) {}
+  MDNodeKeyImpl(const MDCompositeType *N)
+      : Tag(N->getTag()), Name(N->getName()), File(N->getFile()),
+        Line(N->getLine()), Scope(N->getScope()), BaseType(N->getBaseType()),
+        SizeInBits(N->getSizeInBits()), AlignInBits(N->getAlignInBits()),
+        OffsetInBits(N->getOffsetInBits()), Flags(N->getFlags()),
+        Elements(N->getElements()), RuntimeLang(N->getRuntimeLang()),
+        VTableHolder(N->getVTableHolder()),
+        TemplateParams(N->getTemplateParams()), Identifier(N->getIdentifier()) {
+  }
+
+  bool isKeyOf(const MDCompositeType *RHS) const {
+    return Tag == RHS->getTag() && Name == RHS->getName() &&
+           File == RHS->getFile() && Line == RHS->getLine() &&
+           Scope == RHS->getScope() && BaseType == RHS->getBaseType() &&
+           SizeInBits == RHS->getSizeInBits() &&
+           AlignInBits == RHS->getAlignInBits() &&
+           OffsetInBits == RHS->getOffsetInBits() && Flags == RHS->getFlags() &&
+           Elements == RHS->getElements() &&
+           RuntimeLang == RHS->getRuntimeLang() &&
+           VTableHolder == RHS->getVTableHolder() &&
+           TemplateParams == RHS->getTemplateParams() &&
+           Identifier == RHS->getIdentifier();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                        AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
+                        VTableHolder, TemplateParams, Identifier);
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDSubroutineType> {
+  unsigned Flags;
+  Metadata *TypeArray;
+
+  MDNodeKeyImpl(int64_t Flags, Metadata *TypeArray)
+      : Flags(Flags), TypeArray(TypeArray) {}
+  MDNodeKeyImpl(const MDSubroutineType *N)
+      : Flags(N->getFlags()), TypeArray(N->getTypeArray()) {}
+
+  bool isKeyOf(const MDSubroutineType *RHS) const {
+    return Flags == RHS->getFlags() && TypeArray == RHS->getTypeArray();
+  }
+  unsigned getHashValue() const { return hash_combine(Flags, TypeArray); }
+};
+
+template <> struct MDNodeKeyImpl<MDFile> {
+  StringRef Filename;
+  StringRef Directory;
+
+  MDNodeKeyImpl(StringRef Filename, StringRef Directory)
+      : Filename(Filename), Directory(Directory) {}
+  MDNodeKeyImpl(const MDFile *N)
+      : Filename(N->getFilename()), Directory(N->getDirectory()) {}
+
+  bool isKeyOf(const MDFile *RHS) const {
+    return Filename == RHS->getFilename() && Directory == RHS->getDirectory();
+  }
+  unsigned getHashValue() const { return hash_combine(Filename, Directory); }
+};
+
+template <> struct MDNodeKeyImpl<MDCompileUnit> {
+  unsigned SourceLanguage;
+  Metadata *File;
+  StringRef Producer;
+  bool IsOptimized;
+  StringRef Flags;
+  unsigned RuntimeVersion;
+  StringRef SplitDebugFilename;
+  unsigned EmissionKind;
+  Metadata *EnumTypes;
+  Metadata *RetainedTypes;
+  Metadata *Subprograms;
+  Metadata *GlobalVariables;
+  Metadata *ImportedEntities;
+
+  MDNodeKeyImpl(unsigned SourceLanguage, Metadata *File, StringRef Producer,
+                bool IsOptimized, StringRef Flags, unsigned RuntimeVersion,
+                StringRef SplitDebugFilename, unsigned EmissionKind,
+                Metadata *EnumTypes, Metadata *RetainedTypes,
+                Metadata *Subprograms, Metadata *GlobalVariables,
+                Metadata *ImportedEntities)
+      : SourceLanguage(SourceLanguage), File(File), Producer(Producer),
+        IsOptimized(IsOptimized), Flags(Flags), RuntimeVersion(RuntimeVersion),
+        SplitDebugFilename(SplitDebugFilename), EmissionKind(EmissionKind),
+        EnumTypes(EnumTypes), RetainedTypes(RetainedTypes),
+        Subprograms(Subprograms), GlobalVariables(GlobalVariables),
+        ImportedEntities(ImportedEntities) {}
+  MDNodeKeyImpl(const MDCompileUnit *N)
+      : SourceLanguage(N->getSourceLanguage()), File(N->getFile()),
+        Producer(N->getProducer()), IsOptimized(N->isOptimized()),
+        Flags(N->getFlags()), RuntimeVersion(N->getRuntimeVersion()),
+        SplitDebugFilename(N->getSplitDebugFilename()),
+        EmissionKind(N->getEmissionKind()), EnumTypes(N->getEnumTypes()),
+        RetainedTypes(N->getRetainedTypes()), Subprograms(N->getSubprograms()),
+        GlobalVariables(N->getGlobalVariables()),
+        ImportedEntities(N->getImportedEntities()) {}
+
+  bool isKeyOf(const MDCompileUnit *RHS) const {
+    return SourceLanguage == RHS->getSourceLanguage() &&
+           File == RHS->getFile() && Producer == RHS->getProducer() &&
+           IsOptimized == RHS->isOptimized() && Flags == RHS->getFlags() &&
+           RuntimeVersion == RHS->getRuntimeVersion() &&
+           SplitDebugFilename == RHS->getSplitDebugFilename() &&
+           EmissionKind == RHS->getEmissionKind() &&
+           EnumTypes == RHS->getEnumTypes() &&
+           RetainedTypes == RHS->getRetainedTypes() &&
+           Subprograms == RHS->getSubprograms() &&
+           GlobalVariables == RHS->getGlobalVariables() &&
+           ImportedEntities == RHS->getImportedEntities();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(SourceLanguage, File, Producer, IsOptimized, Flags,
+                        RuntimeVersion, SplitDebugFilename, EmissionKind,
+                        EnumTypes, RetainedTypes, Subprograms, GlobalVariables,
+                        ImportedEntities);
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDSubprogram> {
+  Metadata *Scope;
+  StringRef Name;
+  StringRef LinkageName;
+  Metadata *File;
+  unsigned Line;
+  Metadata *Type;
+  bool IsLocalToUnit;
+  bool IsDefinition;
+  unsigned ScopeLine;
+  Metadata *ContainingType;
+  unsigned Virtuality;
+  unsigned VirtualIndex;
+  unsigned Flags;
+  bool IsOptimized;
+  Metadata *Function;
+  Metadata *TemplateParams;
+  Metadata *Declaration;
+  Metadata *Variables;
+
+  MDNodeKeyImpl(Metadata *Scope, StringRef Name, StringRef LinkageName,
+                Metadata *File, unsigned Line, Metadata *Type,
+                bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
+                Metadata *ContainingType, unsigned Virtuality,
+                unsigned VirtualIndex, unsigned Flags, bool IsOptimized,
+                Metadata *Function, Metadata *TemplateParams,
+                Metadata *Declaration, Metadata *Variables)
+      : Scope(Scope), Name(Name), LinkageName(LinkageName), File(File),
+        Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit),
+        IsDefinition(IsDefinition), ScopeLine(ScopeLine),
+        ContainingType(ContainingType), Virtuality(Virtuality),
+        VirtualIndex(VirtualIndex), Flags(Flags), IsOptimized(IsOptimized),
+        Function(Function), TemplateParams(TemplateParams),
+        Declaration(Declaration), Variables(Variables) {}
+  MDNodeKeyImpl(const MDSubprogram *N)
+      : Scope(N->getScope()), Name(N->getName()),
+        LinkageName(N->getLinkageName()), File(N->getFile()),
+        Line(N->getLine()), Type(N->getType()),
+        IsLocalToUnit(N->isLocalToUnit()), IsDefinition(N->isDefinition()),
+        ScopeLine(N->getScopeLine()), ContainingType(N->getContainingType()),
+        Virtuality(N->getVirtuality()), VirtualIndex(N->getVirtualIndex()),
+        Flags(N->getFlags()), IsOptimized(N->isOptimized()),
+        Function(N->getFunction()), TemplateParams(N->getTemplateParams()),
+        Declaration(N->getDeclaration()), Variables(N->getVariables()) {}
+
+  bool isKeyOf(const MDSubprogram *RHS) const {
+    return Scope == RHS->getScope() && Name == RHS->getName() &&
+           LinkageName == RHS->getLinkageName() && File == RHS->getFile() &&
+           Line == RHS->getLine() && Type == RHS->getType() &&
+           IsLocalToUnit == RHS->isLocalToUnit() &&
+           IsDefinition == RHS->isDefinition() &&
+           ScopeLine == RHS->getScopeLine() &&
+           ContainingType == RHS->getContainingType() &&
+           Virtuality == RHS->getVirtuality() &&
+           VirtualIndex == RHS->getVirtualIndex() && Flags == RHS->getFlags() &&
+           IsOptimized == RHS->isOptimized() &&
+           Function == RHS->getFunction() &&
+           TemplateParams == RHS->getTemplateParams() &&
+           Declaration == RHS->getDeclaration() &&
+           Variables == RHS->getVariables();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(Scope, Name, LinkageName, File, Line, Type,
+                        IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
+                        Virtuality, VirtualIndex, Flags, IsOptimized, Function,
+                        TemplateParams, Declaration, Variables);
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDLexicalBlock> {
+  Metadata *Scope;
+  Metadata *File;
+  unsigned Line;
+  unsigned Column;
+
+  MDNodeKeyImpl(Metadata *Scope, Metadata *File, unsigned Line, unsigned Column)
+      : Scope(Scope), File(File), Line(Line), Column(Column) {}
+  MDNodeKeyImpl(const MDLexicalBlock *N)
+      : Scope(N->getScope()), File(N->getFile()), Line(N->getLine()),
+        Column(N->getColumn()) {}
+
+  bool isKeyOf(const MDLexicalBlock *RHS) const {
+    return Scope == RHS->getScope() && File == RHS->getFile() &&
+           Line == RHS->getLine() && Column == RHS->getColumn();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(Scope, File, Line, Column);
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDLexicalBlockFile> {
+  Metadata *Scope;
+  Metadata *File;
+  unsigned Discriminator;
+
+  MDNodeKeyImpl(Metadata *Scope, Metadata *File, unsigned Discriminator)
+      : Scope(Scope), File(File), Discriminator(Discriminator) {}
+  MDNodeKeyImpl(const MDLexicalBlockFile *N)
+      : Scope(N->getScope()), File(N->getFile()),
+        Discriminator(N->getDiscriminator()) {}
+
+  bool isKeyOf(const MDLexicalBlockFile *RHS) const {
+    return Scope == RHS->getScope() && File == RHS->getFile() &&
+           Discriminator == RHS->getDiscriminator();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(Scope, File, Discriminator);
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDNamespace> {
+  Metadata *Scope;
+  Metadata *File;
+  StringRef Name;
+  unsigned Line;
+
+  MDNodeKeyImpl(Metadata *Scope, Metadata *File, StringRef Name, unsigned Line)
+      : Scope(Scope), File(File), Name(Name), Line(Line) {}
+  MDNodeKeyImpl(const MDNamespace *N)
+      : Scope(N->getScope()), File(N->getFile()), Name(N->getName()),
+        Line(N->getLine()) {}
+
+  bool isKeyOf(const MDNamespace *RHS) const {
+    return Scope == RHS->getScope() && File == RHS->getFile() &&
+           Name == RHS->getName() && Line == RHS->getLine();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(Scope, File, Name, Line);
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDTemplateTypeParameter> {
+  StringRef Name;
+  Metadata *Type;
+
+  MDNodeKeyImpl(StringRef Name, Metadata *Type) : Name(Name), Type(Type) {}
+  MDNodeKeyImpl(const MDTemplateTypeParameter *N)
+      : Name(N->getName()), Type(N->getType()) {}
+
+  bool isKeyOf(const MDTemplateTypeParameter *RHS) const {
+    return Name == RHS->getName() && Type == RHS->getType();
+  }
+  unsigned getHashValue() const { return hash_combine(Name, Type); }
+};
+
+template <> struct MDNodeKeyImpl<MDTemplateValueParameter> {
+  unsigned Tag;
+  StringRef Name;
+  Metadata *Type;
+  Metadata *Value;
+
+  MDNodeKeyImpl(unsigned Tag, StringRef Name, Metadata *Type, Metadata *Value)
+      : Tag(Tag), Name(Name), Type(Type), Value(Value) {}
+  MDNodeKeyImpl(const MDTemplateValueParameter *N)
+      : Tag(N->getTag()), Name(N->getName()), Type(N->getType()),
+        Value(N->getValue()) {}
+
+  bool isKeyOf(const MDTemplateValueParameter *RHS) const {
+    return Tag == RHS->getTag() && Name == RHS->getName() &&
+           Type == RHS->getType() && Value == RHS->getValue();
+  }
+  unsigned getHashValue() const { return hash_combine(Tag, Name, Type, Value); }
+};
+
+template <> struct MDNodeKeyImpl<MDGlobalVariable> {
+  Metadata *Scope;
+  StringRef Name;
+  StringRef LinkageName;
+  Metadata *File;
+  unsigned Line;
+  Metadata *Type;
+  bool IsLocalToUnit;
+  bool IsDefinition;
+  Metadata *Variable;
+  Metadata *StaticDataMemberDeclaration;
+
+  MDNodeKeyImpl(Metadata *Scope, StringRef Name, StringRef LinkageName,
+                Metadata *File, unsigned Line, Metadata *Type,
+                bool IsLocalToUnit, bool IsDefinition, Metadata *Variable,
+                Metadata *StaticDataMemberDeclaration)
+      : Scope(Scope), Name(Name), LinkageName(LinkageName), File(File),
+        Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit),
+        IsDefinition(IsDefinition), Variable(Variable),
+        StaticDataMemberDeclaration(StaticDataMemberDeclaration) {}
+  MDNodeKeyImpl(const MDGlobalVariable *N)
+      : Scope(N->getScope()), Name(N->getName()),
+        LinkageName(N->getLinkageName()), File(N->getFile()),
+        Line(N->getLine()), Type(N->getType()),
+        IsLocalToUnit(N->isLocalToUnit()), IsDefinition(N->isDefinition()),
+        Variable(N->getVariable()),
+        StaticDataMemberDeclaration(N->getStaticDataMemberDeclaration()) {}
+
+  bool isKeyOf(const MDGlobalVariable *RHS) const {
+    return Scope == RHS->getScope() && Name == RHS->getName() &&
+           LinkageName == RHS->getLinkageName() && File == RHS->getFile() &&
+           Line == RHS->getLine() && Type == RHS->getType() &&
+           IsLocalToUnit == RHS->isLocalToUnit() &&
+           IsDefinition == RHS->isDefinition() &&
+           Variable == RHS->getVariable() &&
+           StaticDataMemberDeclaration == RHS->getStaticDataMemberDeclaration();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(Scope, Name, LinkageName, File, Line, Type,
+                        IsLocalToUnit, IsDefinition, Variable,
+                        StaticDataMemberDeclaration);
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDLocalVariable> {
+  unsigned Tag;
+  Metadata *Scope;
+  StringRef Name;
+  Metadata *File;
+  unsigned Line;
+  Metadata *Type;
+  unsigned Arg;
+  unsigned Flags;
+  Metadata *InlinedAt;
+
+  MDNodeKeyImpl(unsigned Tag, Metadata *Scope, StringRef Name, Metadata *File,
+                unsigned Line, Metadata *Type, unsigned Arg, unsigned Flags,
+                Metadata *InlinedAt)
+      : Tag(Tag), Scope(Scope), Name(Name), File(File), Line(Line), Type(Type),
+        Arg(Arg), Flags(Flags), InlinedAt(InlinedAt) {}
+  MDNodeKeyImpl(const MDLocalVariable *N)
+      : Tag(N->getTag()), Scope(N->getScope()), Name(N->getName()),
+        File(N->getFile()), Line(N->getLine()), Type(N->getType()),
+        Arg(N->getArg()), Flags(N->getFlags()), InlinedAt(N->getInlinedAt()) {}
+
+  bool isKeyOf(const MDLocalVariable *RHS) const {
+    return Tag == RHS->getTag() && Scope == RHS->getScope() &&
+           Name == RHS->getName() && File == RHS->getFile() &&
+           Line == RHS->getLine() && Type == RHS->getType() &&
+           Arg == RHS->getArg() && Flags == RHS->getFlags() &&
+           InlinedAt == RHS->getInlinedAt();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(Tag, Scope, Name, File, Line, Type, Arg, Flags,
+                        InlinedAt);
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDExpression> {
+  ArrayRef<uint64_t> Elements;
+
+  MDNodeKeyImpl(ArrayRef<uint64_t> Elements) : Elements(Elements) {}
+  MDNodeKeyImpl(const MDExpression *N) : Elements(N->getElements()) {}
+
+  bool isKeyOf(const MDExpression *RHS) const {
+    return Elements == RHS->getElements();
+  }
+  unsigned getHashValue() const {
+    return hash_combine_range(Elements.begin(), Elements.end());
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDObjCProperty> {
+  StringRef Name;
+  Metadata *File;
+  unsigned Line;
+  StringRef GetterName;
+  StringRef SetterName;
+  unsigned Attributes;
+  Metadata *Type;
+
+  MDNodeKeyImpl(StringRef Name, Metadata *File, unsigned Line,
+                StringRef GetterName, StringRef SetterName, unsigned Attributes,
+                Metadata *Type)
+      : Name(Name), File(File), Line(Line), GetterName(GetterName),
+        SetterName(SetterName), Attributes(Attributes), Type(Type) {}
+  MDNodeKeyImpl(const MDObjCProperty *N)
+      : Name(N->getName()), File(N->getFile()), Line(N->getLine()),
+        GetterName(N->getGetterName()), SetterName(N->getSetterName()),
+        Attributes(N->getAttributes()), Type(N->getType()) {}
+
+  bool isKeyOf(const MDObjCProperty *RHS) const {
+    return Name == RHS->getName() && File == RHS->getFile() &&
+           Line == RHS->getLine() && GetterName == RHS->getGetterName() &&
+           SetterName == RHS->getSetterName() &&
+           Attributes == RHS->getAttributes() && Type == RHS->getType();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(Name, File, Line, GetterName, SetterName, Attributes,
+                        Type);
+  }
+};
+
+template <> struct MDNodeKeyImpl<MDImportedEntity> {
+  unsigned Tag;
+  Metadata *Scope;
+  Metadata *Entity;
+  unsigned Line;
+  StringRef Name;
+
+  MDNodeKeyImpl(unsigned Tag, Metadata *Scope, Metadata *Entity, unsigned Line,
+                StringRef Name)
+      : Tag(Tag), Scope(Scope), Entity(Entity), Line(Line), Name(Name) {}
+  MDNodeKeyImpl(const MDImportedEntity *N)
+      : Tag(N->getTag()), Scope(N->getScope()), Entity(N->getEntity()),
+        Line(N->getLine()), Name(N->getName()) {}
+
+  bool isKeyOf(const MDImportedEntity *RHS) const {
+    return Tag == RHS->getTag() && Scope == RHS->getScope() &&
+           Entity == RHS->getEntity() && Line == RHS->getLine() &&
+           Name == RHS->getName();
+  }
+  unsigned getHashValue() const {
+    return hash_combine(Tag, Scope, Entity, Line, Name);
+  }
+};
+
+/// \brief DenseMapInfo for MDNode subclasses.
+template <class NodeTy> struct MDNodeInfo {
+  typedef MDNodeKeyImpl<NodeTy> KeyTy;
+  static inline NodeTy *getEmptyKey() {
+    return DenseMapInfo<NodeTy *>::getEmptyKey();
+  }
+  static inline NodeTy *getTombstoneKey() {
+    return DenseMapInfo<NodeTy *>::getTombstoneKey();
+  }
+  static unsigned getHashValue(const KeyTy &Key) { return Key.getHashValue(); }
+  static unsigned getHashValue(const NodeTy *N) {
+    return KeyTy(N).getHashValue();
+  }
+  static bool isEqual(const KeyTy &LHS, const NodeTy *RHS) {
+    if (RHS == getEmptyKey() || RHS == getTombstoneKey())
+      return false;
+    return LHS.isKeyOf(RHS);
+  }
+  static bool isEqual(const NodeTy *LHS, const NodeTy *RHS) {
+    return LHS == RHS;
+  }
+};
+
+#define HANDLE_MDNODE_LEAF(CLASS) typedef MDNodeInfo<CLASS> CLASS##Info;
+#include "llvm/IR/Metadata.def"
+
 class LLVMContextImpl {
 public:
   /// OwnedModules - The set of modules instantiated in this context, and which
@@ -279,12 +867,10 @@ public:
   LLVMContext::YieldCallbackTy YieldCallback;
   void *YieldOpaqueHandle;
 
-  typedef DenseMap<DenseMapAPIntKeyInfo::KeyTy, ConstantInt *,
-                   DenseMapAPIntKeyInfo> IntMapTy;
+  typedef DenseMap<APInt, ConstantInt *, DenseMapAPIntKeyInfo> IntMapTy;
   IntMapTy IntConstants;
-  
-  typedef DenseMap<DenseMapAPFloatKeyInfo::KeyTy, ConstantFP*, 
-                         DenseMapAPFloatKeyInfo> FPMapTy;
+
+  typedef DenseMap<APFloat, ConstantFP *, DenseMapAPFloatKeyInfo> FPMapTy;
   FPMapTy FPConstants;
 
   FoldingSet<AttributeImpl> AttrsSet;
@@ -292,14 +878,17 @@ public:
   FoldingSet<AttributeSetNode> AttrsSetNodes;
 
   StringMap<MDString> MDStringCache;
+  DenseMap<Value *, ValueAsMetadata *> ValuesAsMetadata;
+  DenseMap<Metadata *, MetadataAsValue *> MetadataAsValues;
 
-  DenseSet<GenericMDNode *, GenericMDNodeInfo> MDNodeSet;
+#define HANDLE_MDNODE_LEAF(CLASS) DenseSet<CLASS *, CLASS##Info> CLASS##s;
+#include "llvm/IR/Metadata.def"
 
   // MDNodes may be uniqued or not uniqued.  When they're not uniqued, they
   // aren't in the MDNodeSet, but they're still shared between objects, so no
   // one object can destroy them.  This set allows us to at least destroy them
   // on Context destruction.
-  SmallPtrSet<GenericMDNode *, 1> NonUniquedMDNodes;
+  SmallPtrSet<MDNode *, 1> DistinctMDNodes;
 
   DenseMap<Type*, ConstantAggregateZero*> CAZConstants;
 
@@ -326,9 +915,7 @@ public:
 
   ConstantInt *TheTrueVal;
   ConstantInt *TheFalseVal;
-  
-  LeakDetectorImpl<Value> LLVMObjects;
-  
+
   // Basic type instances.
   Type VoidTy, LabelTy, HalfTy, FloatTy, DoubleTy, MetadataTy;
   Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy;
@@ -340,11 +927,11 @@ public:
   BumpPtrAllocator TypeAllocator;
   
   DenseMap<unsigned, IntegerType*> IntegerTypes;
-  
-  typedef DenseMap<FunctionType*, bool, FunctionTypeKeyInfo> FunctionTypeMap;
-  FunctionTypeMap FunctionTypes;
-  typedef DenseMap<StructType*, bool, AnonStructTypeKeyInfo> StructTypeMap;
-  StructTypeMap AnonStructTypes;
+
+  typedef DenseSet<FunctionType *, FunctionTypeKeyInfo> FunctionTypeSet;
+  FunctionTypeSet FunctionTypes;
+  typedef DenseSet<StructType *, AnonStructTypeKeyInfo> StructTypeSet;
+  StructTypeSet AnonStructTypes;
   StringMap<StructType*> NamedStructTypes;
   unsigned NamedStructTypesUniqueID;
     
@@ -362,32 +949,14 @@ public:
   
   /// CustomMDKindNames - Map to hold the metadata string to ID mapping.
   StringMap<unsigned> CustomMDKindNames;
-  
-  typedef std::pair<unsigned, TrackingVH<MDNode> > MDPairTy;
+
+  typedef std::pair<unsigned, TrackingMDNodeRef> MDPairTy;
   typedef SmallVector<MDPairTy, 2> MDMapTy;
 
   /// MetadataStore - Collection of per-instruction metadata used in this
   /// context.
   DenseMap<const Instruction *, MDMapTy> MetadataStore;
   
-  /// ScopeRecordIdx - This is the index in ScopeRecords for an MDNode scope
-  /// entry with no "inlined at" element.
-  DenseMap<MDNode*, int> ScopeRecordIdx;
-  
-  /// ScopeRecords - These are the actual mdnodes (in a value handle) for an
-  /// index.  The ValueHandle ensures that ScopeRecordIdx stays up to date if
-  /// the MDNode is RAUW'd.
-  std::vector<DebugRecVH> ScopeRecords;
-  
-  /// ScopeInlinedAtIdx - This is the index in ScopeInlinedAtRecords for an
-  /// scope/inlined-at pair.
-  DenseMap<std::pair<MDNode*, MDNode*>, int> ScopeInlinedAtIdx;
-  
-  /// ScopeInlinedAtRecords - These are the actual mdnodes (in value handles)
-  /// for an index.  The ValueHandle ensures that ScopeINlinedAtIdx stays up
-  /// to date.
-  std::vector<std::pair<DebugRecVH, DebugRecVH> > ScopeInlinedAtRecords;
-
   /// DiscriminatorTable - This table maps file:line locations to an
   /// integer representing the next DWARF path discriminator to assign to
   /// instructions in different blocks at the same location.
@@ -403,11 +972,20 @@ public:
   typedef DenseMap<const Function *, ReturnInst *> PrefixDataMapTy;
   PrefixDataMapTy PrefixDataMap;
 
+  /// \brief Mapping from a function to its prologue data, which is stored as
+  /// the operand of an unparented ReturnInst so that the prologue data has a
+  /// Use.
+  typedef DenseMap<const Function *, ReturnInst *> PrologueDataMapTy;
+  PrologueDataMapTy PrologueDataMap;
+
   int getOrAddScopeRecordIdxEntry(MDNode *N, int ExistingIdx);
   int getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,int ExistingIdx);
-  
+
   LLVMContextImpl(LLVMContext &C);
   ~LLVMContextImpl();
+
+  /// Destroy the ConstantArrays if they are not used.
+  void dropTriviallyDeadConstantArrays();
 };
 
 }
diff --git a/lib/IR/LeakDetector.cpp b/lib/IR/LeakDetector.cpp
deleted file mode 100644
index 6f71627..0000000
--- a/lib/IR/LeakDetector.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-//===-- LeakDetector.cpp - Implement LeakDetector interface ---------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the LeakDetector class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/IR/LeakDetector.h"
-#include "LLVMContextImpl.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/Threading.h"
-using namespace llvm;
-
-static ManagedStatic<sys::SmartMutex<true> > ObjectsLock;
-static ManagedStatic<LeakDetectorImpl<void> > Objects;
-
-static void clearGarbage(LLVMContext &Context) {
-  Objects->clear();
-  Context.pImpl->LLVMObjects.clear();
-}
-
-void LeakDetector::addGarbageObjectImpl(void *Object) {
-  sys::SmartScopedLock<true> Lock(*ObjectsLock);
-  Objects->addGarbage(Object);
-}
-
-void LeakDetector::addGarbageObjectImpl(const Value *Object) {
-  LLVMContextImpl *pImpl = Object->getContext().pImpl;
-  pImpl->LLVMObjects.addGarbage(Object);
-}
-
-void LeakDetector::removeGarbageObjectImpl(void *Object) {
-  sys::SmartScopedLock<true> Lock(*ObjectsLock);
-  Objects->removeGarbage(Object);
-}
-
-void LeakDetector::removeGarbageObjectImpl(const Value *Object) {
-  LLVMContextImpl *pImpl = Object->getContext().pImpl;
-  pImpl->LLVMObjects.removeGarbage(Object);
-}
-
-void LeakDetector::checkForGarbageImpl(LLVMContext &Context, 
-                                       const std::string &Message) {
-  LLVMContextImpl *pImpl = Context.pImpl;
-  sys::SmartScopedLock<true> Lock(*ObjectsLock);
-  
-  Objects->setName("GENERIC");
-  pImpl->LLVMObjects.setName("LLVM");
-  
-  // use non-short-circuit version so that both checks are performed
-  if (Objects->hasGarbage(Message) |
-      pImpl->LLVMObjects.hasGarbage(Message))
-    errs() << "\nThis is probably because you removed an object, but didn't "
-           << "delete it.  Please check your code for memory leaks.\n";
-
-  // Clear out results so we don't get duplicate warnings on
-  // next call...
-  clearGarbage(Context);
-}
diff --git a/lib/IR/LeaksContext.h b/lib/IR/LeaksContext.h
deleted file mode 100644
index 3e485ab..0000000
--- a/lib/IR/LeaksContext.h
+++ /dev/null
@@ -1,98 +0,0 @@
-//===- LeaksContext.h - LeadDetector Implementation ------------*- C++ -*--===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//  This file defines various helper methods and classes used by
-// LLVMContextImpl for leaks detectors.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_IR_LEAKSCONTEXT_H
-#define LLVM_LIB_IR_LEAKSCONTEXT_H
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace llvm {
-
-template <class T>
-struct PrinterTrait {
-  static void print(const T* P) { errs() << P; }
-};
-
-template<>
-struct PrinterTrait<Value> {
-  static void print(const Value* P) { errs() << *P; }
-};
-
-template <typename T>
-struct LeakDetectorImpl {
-  explicit LeakDetectorImpl(const char* const name = "") : 
-    Cache(nullptr), Name(name) { }
-
-  void clear() {
-    Cache = nullptr;
-    Ts.clear();
-  }
-    
-  void setName(const char* n) { 
-    Name = n;
-  }
-    
-  // Because the most common usage pattern, by far, is to add a
-  // garbage object, then remove it immediately, we optimize this
-  // case.  When an object is added, it is not added to the set
-  // immediately, it is added to the CachedValue Value.  If it is
-  // immediately removed, no set search need be performed.
-  void addGarbage(const T* o) {
-    assert(Ts.count(o) == 0 && "Object already in set!");
-    if (Cache) {
-      assert(Cache != o && "Object already in set!");
-      Ts.insert(Cache);
-    }
-    Cache = o;
-  }
-
-  void removeGarbage(const T* o) {
-    if (o == Cache)
-      Cache = nullptr; // Cache hit
-    else
-      Ts.erase(o);
-  }
-
-  bool hasGarbage(const std::string& Message) {
-    addGarbage(nullptr); // Flush the Cache
-
-    assert(!Cache && "No value should be cached anymore!");
-
-    if (!Ts.empty()) {
-      errs() << "Leaked " << Name << " objects found: " << Message << ":\n";
-      for (typename SmallPtrSet<const T*, 8>::iterator I = Ts.begin(),
-           E = Ts.end(); I != E; ++I) {
-        errs() << '\t';
-        PrinterTrait<T>::print(*I);
-        errs() << '\n';
-      }
-      errs() << '\n';
-
-      return true;
-    }
-    
-    return false;
-  }
-
-private:
-  SmallPtrSet<const T*, 8> Ts;
-  const T* Cache;
-  const char* Name;
-};
-
-}
-
-#endif
diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp
index 28fa74c..fa8d50e 100644
--- a/lib/IR/LegacyPassManager.cpp
+++ b/lib/IR/LegacyPassManager.cpp
@@ -227,10 +227,7 @@ public:
     Pass(PT_PassManager, ID), PMDataManager(),
     PMTopLevelManager(new FPPassManager()), wasRun(false) {}
 
-  /// add - Add a pass to the queue of passes to run.  This passes ownership of
-  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-  /// will be destroyed as well, so there is no need to delete the pass.  This
-  /// implies that all passes MUST be allocated with 'new'.
+  /// \copydoc FunctionPassManager::add()
   void add(Pass *P) {
     schedulePass(P);
   }
@@ -398,10 +395,7 @@ public:
     Pass(PT_PassManager, ID), PMDataManager(),
                               PMTopLevelManager(new MPPassManager()) {}
 
-  /// add - Add a pass to the queue of passes to run.  This passes ownership of
-  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-  /// will be destroyed as well, so there is no need to delete the pass.  This
-  /// implies that all passes MUST be allocated with 'new'.
+  /// \copydoc PassManager::add()
   void add(Pass *P) {
     schedulePass(P);
   }
@@ -606,8 +600,7 @@ void PMTopLevelManager::schedulePass(Pass *P) {
   // If P is an analysis pass and it is available then do not
   // generate the analysis again. Stale analysis info should not be
   // available at this point.
-  const PassInfo *PI =
-    PassRegistry::getPassRegistry()->getPassInfo(P->getPassID());
+  const PassInfo *PI = findAnalysisPassInfo(P->getPassID());
   if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) {
     delete P;
     return;
@@ -625,7 +618,7 @@ void PMTopLevelManager::schedulePass(Pass *P) {
 
       Pass *AnalysisPass = findAnalysisPass(*I);
       if (!AnalysisPass) {
-        const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+        const PassInfo *PI = findAnalysisPassInfo(*I);
 
         if (!PI) {
           // Pass P is not in the global PassRegistry
@@ -722,8 +715,7 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
       return *I;
 
     // If Pass not found then check the interfaces implemented by Immutable Pass
-    const PassInfo *PassInf =
-      PassRegistry::getPassRegistry()->getPassInfo(PI);
+    const PassInfo *PassInf = findAnalysisPassInfo(PI);
     assert(PassInf && "Expected all immutable passes to be initialized");
     const std::vector<const PassInfo*> &ImmPI =
       PassInf->getInterfacesImplemented();
@@ -737,6 +729,17 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
   return nullptr;
 }
 
+const PassInfo *PMTopLevelManager::findAnalysisPassInfo(AnalysisID AID) const {
+  const PassInfo *&PI = AnalysisPassInfos[AID];
+  if (!PI)
+    PI = PassRegistry::getPassRegistry()->getPassInfo(AID);
+  else
+    assert(PI == PassRegistry::getPassRegistry()->getPassInfo(AID) &&
+           "The pass info pointer changed for an analysis ID!");
+
+  return PI;
+}
+
 // Print passes managed by this top level manager.
 void PMTopLevelManager::dumpPasses() const {
 
@@ -765,8 +768,7 @@ void PMTopLevelManager::dumpArguments() const {
   dbgs() << "Pass Arguments: ";
   for (SmallVectorImpl<ImmutablePass *>::const_iterator I =
        ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
-    if (const PassInfo *PI =
-        PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) {
+    if (const PassInfo *PI = findAnalysisPassInfo((*I)->getPassID())) {
       assert(PI && "Expected all immutable passes to be initialized");
       if (!PI->isAnalysisGroup())
         dbgs() << " -" << PI->getPassArgument();
@@ -830,7 +832,7 @@ void PMDataManager::recordAvailableAnalysis(Pass *P) {
 
   // This pass is the current implementation of all of the interfaces it
   // implements as well.
-  const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI);
+  const PassInfo *PInf = TPM->findAnalysisPassInfo(PI);
   if (!PInf) return;
   const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
   for (unsigned i = 0, e = II.size(); i != e; ++i)
@@ -963,7 +965,7 @@ void PMDataManager::freePass(Pass *P, StringRef Msg,
   }
 
   AnalysisID PI = P->getPassID();
-  if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) {
+  if (const PassInfo *PInf = TPM->findAnalysisPassInfo(PI)) {
     // Remove the pass itself (if it is not already removed).
     AvailableAnalysis.erase(PI);
 
@@ -1043,7 +1045,7 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
   for (SmallVectorImpl<AnalysisID>::iterator
          I = ReqAnalysisNotAvailable.begin(),
          E = ReqAnalysisNotAvailable.end() ;I != E; ++I) {
-    const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+    const PassInfo *PI = TPM->findAnalysisPassInfo(*I);
     Pass *AnalysisPass = PI->createPass();
     this->addLowerLevelRequiredPass(P, AnalysisPass);
   }
@@ -1148,7 +1150,7 @@ void PMDataManager::dumpPassArguments() const {
       PMD->dumpPassArguments();
     else
       if (const PassInfo *PI =
-            PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
+            TPM->findAnalysisPassInfo((*I)->getPassID()))
         if (!PI->isAnalysisGroup())
           dbgs() << " -" << PI->getPassArgument();
   }
@@ -1224,7 +1226,7 @@ void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
   dbgs() << (const void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
   for (unsigned i = 0; i != Set.size(); ++i) {
     if (i) dbgs() << ',';
-    const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
+    const PassInfo *PInf = TPM->findAnalysisPassInfo(Set[i]);
     if (!PInf) {
       // Some preserved passes, such as AliasAnalysis, may not be initialized by
       // all drivers.
@@ -1389,11 +1391,6 @@ FunctionPassManager::~FunctionPassManager() {
   delete FPM;
 }
 
-/// add - Add a pass to the queue of passes to run.  This passes
-/// ownership of the Pass to the PassManager.  When the
-/// PassManager_X is destroyed, the pass will be destroyed as well, so
-/// there is no need to delete the pass. (TODO delete passes.)
-/// This implies that all passes MUST be allocated with 'new'.
 void FunctionPassManager::add(Pass *P) {
   FPM->add(P);
 }
@@ -1669,8 +1666,8 @@ void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
 
     OnTheFlyManagers[P] = FPP;
   }
-  const PassInfo * RequiredPassPI =
-    PassRegistry::getPassRegistry()->getPassInfo(RequiredPass->getPassID());
+  const PassInfo *RequiredPassPI =
+      TPM->findAnalysisPassInfo(RequiredPass->getPassID());
 
   Pass *FoundPass = nullptr;
   if (RequiredPassPI && RequiredPassPI->isAnalysis()) {
@@ -1749,10 +1746,6 @@ PassManager::~PassManager() {
   delete PM;
 }
 
-/// add - Add a pass to the queue of passes to run.  This passes ownership of
-/// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-/// will be destroyed as well, so there is no need to delete the pass.  This
-/// implies that all passes MUST be allocated with 'new'.
 void PassManager::add(Pass *P) {
   PM->add(P);
 }
diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp
index 3ec613c..a901011 100644
--- a/lib/IR/MDBuilder.cpp
+++ b/lib/IR/MDBuilder.cpp
@@ -21,11 +21,16 @@ MDString *MDBuilder::createString(StringRef Str) {
   return MDString::get(Context, Str);
 }
 
+ConstantAsMetadata *MDBuilder::createConstant(Constant *C) {
+  return ConstantAsMetadata::get(C);
+}
+
 MDNode *MDBuilder::createFPMath(float Accuracy) {
   if (Accuracy == 0.0)
     return nullptr;
   assert(Accuracy > 0.0 && "Invalid fpmath accuracy!");
-  Value *Op = ConstantFP::get(Type::getFloatTy(Context), Accuracy);
+  auto *Op =
+      createConstant(ConstantFP::get(Type::getFloatTy(Context), Accuracy));
   return MDNode::get(Context, Op);
 }
 
@@ -38,33 +43,38 @@ MDNode *MDBuilder::createBranchWeights(uint32_t TrueWeight,
 MDNode *MDBuilder::createBranchWeights(ArrayRef<uint32_t> Weights) {
   assert(Weights.size() >= 2 && "Need at least two branch weights!");
 
-  SmallVector<Value *, 4> Vals(Weights.size() + 1);
+  SmallVector<Metadata *, 4> Vals(Weights.size() + 1);
   Vals[0] = createString("branch_weights");
 
   Type *Int32Ty = Type::getInt32Ty(Context);
   for (unsigned i = 0, e = Weights.size(); i != e; ++i)
-    Vals[i + 1] = ConstantInt::get(Int32Ty, Weights[i]);
+    Vals[i + 1] = createConstant(ConstantInt::get(Int32Ty, Weights[i]));
 
   return MDNode::get(Context, Vals);
 }
 
 MDNode *MDBuilder::createRange(const APInt &Lo, const APInt &Hi) {
   assert(Lo.getBitWidth() == Hi.getBitWidth() && "Mismatched bitwidths!");
+
+  Type *Ty = IntegerType::get(Context, Lo.getBitWidth());
+  return createRange(ConstantInt::get(Ty, Lo), ConstantInt::get(Ty, Hi));
+}
+
+MDNode *MDBuilder::createRange(Constant *Lo, Constant *Hi) {
   // If the range is everything then it is useless.
   if (Hi == Lo)
     return nullptr;
 
   // Return the range [Lo, Hi).
-  Type *Ty = IntegerType::get(Context, Lo.getBitWidth());
-  Value *Range[2] = {ConstantInt::get(Ty, Lo), ConstantInt::get(Ty, Hi)};
+  Metadata *Range[2] = {createConstant(Lo), createConstant(Hi)};
   return MDNode::get(Context, Range);
 }
 
 MDNode *MDBuilder::createAnonymousAARoot(StringRef Name, MDNode *Extra) {
   // To ensure uniqueness the root node is self-referential.
-  MDNode *Dummy = MDNode::getTemporary(Context, None);
+  auto Dummy = MDNode::getTemporary(Context, None);
 
-  SmallVector<Value *, 3> Args(1, Dummy);
+  SmallVector<Metadata *, 3> Args(1, Dummy.get());
   if (Extra)
     Args.push_back(Extra);
   if (!Name.empty())
@@ -76,7 +86,7 @@ MDNode *MDBuilder::createAnonymousAARoot(StringRef Name, MDNode *Extra) {
   //   !1 = metadata !{metadata !0} <- root
   // Replace the dummy operand with the root node itself and delete the dummy.
   Root->replaceOperandWith(0, Root);
-  MDNode::deleteTemporary(Dummy);
+
   // We now have
   //   !1 = metadata !{metadata !1} <- self-referential root
   return Root;
@@ -92,10 +102,10 @@ MDNode *MDBuilder::createTBAANode(StringRef Name, MDNode *Parent,
                                   bool isConstant) {
   if (isConstant) {
     Constant *Flags = ConstantInt::get(Type::getInt64Ty(Context), 1);
-    Value *Ops[3] = {createString(Name), Parent, Flags};
+    Metadata *Ops[3] = {createString(Name), Parent, createConstant(Flags)};
     return MDNode::get(Context, Ops);
   } else {
-    Value *Ops[2] = {createString(Name), Parent};
+    Metadata *Ops[2] = {createString(Name), Parent};
     return MDNode::get(Context, Ops);
   }
 }
@@ -105,18 +115,18 @@ MDNode *MDBuilder::createAliasScopeDomain(StringRef Name) {
 }
 
 MDNode *MDBuilder::createAliasScope(StringRef Name, MDNode *Domain) {
-  Value *Ops[2] = { createString(Name), Domain };
+  Metadata *Ops[2] = {createString(Name), Domain};
   return MDNode::get(Context, Ops);
 }
 
 /// \brief Return metadata for a tbaa.struct node with the given
 /// struct field descriptions.
 MDNode *MDBuilder::createTBAAStructNode(ArrayRef<TBAAStructField> Fields) {
-  SmallVector<Value *, 4> Vals(Fields.size() * 3);
+  SmallVector<Metadata *, 4> Vals(Fields.size() * 3);
   Type *Int64 = Type::getInt64Ty(Context);
   for (unsigned i = 0, e = Fields.size(); i != e; ++i) {
-    Vals[i * 3 + 0] = ConstantInt::get(Int64, Fields[i].Offset);
-    Vals[i * 3 + 1] = ConstantInt::get(Int64, Fields[i].Size);
+    Vals[i * 3 + 0] = createConstant(ConstantInt::get(Int64, Fields[i].Offset));
+    Vals[i * 3 + 1] = createConstant(ConstantInt::get(Int64, Fields[i].Size));
     Vals[i * 3 + 2] = Fields[i].TBAA;
   }
   return MDNode::get(Context, Vals);
@@ -126,12 +136,12 @@ MDNode *MDBuilder::createTBAAStructNode(ArrayRef<TBAAStructField> Fields) {
 /// with the given name, a list of pairs (offset, field type in the type DAG).
 MDNode *MDBuilder::createTBAAStructTypeNode(
     StringRef Name, ArrayRef<std::pair<MDNode *, uint64_t>> Fields) {
-  SmallVector<Value *, 4> Ops(Fields.size() * 2 + 1);
+  SmallVector<Metadata *, 4> Ops(Fields.size() * 2 + 1);
   Type *Int64 = Type::getInt64Ty(Context);
   Ops[0] = createString(Name);
   for (unsigned i = 0, e = Fields.size(); i != e; ++i) {
     Ops[i * 2 + 1] = Fields[i].first;
-    Ops[i * 2 + 2] = ConstantInt::get(Int64, Fields[i].second);
+    Ops[i * 2 + 2] = createConstant(ConstantInt::get(Int64, Fields[i].second));
   }
   return MDNode::get(Context, Ops);
 }
@@ -141,7 +151,7 @@ MDNode *MDBuilder::createTBAAStructTypeNode(
 MDNode *MDBuilder::createTBAAScalarTypeNode(StringRef Name, MDNode *Parent,
                                             uint64_t Offset) {
   ConstantInt *Off = ConstantInt::get(Type::getInt64Ty(Context), Offset);
-  Value *Ops[3] = {createString(Name), Parent, Off};
+  Metadata *Ops[3] = {createString(Name), Parent, createConstant(Off)};
   return MDNode::get(Context, Ops);
 }
 
@@ -150,6 +160,7 @@ MDNode *MDBuilder::createTBAAScalarTypeNode(StringRef Name, MDNode *Parent,
 MDNode *MDBuilder::createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType,
                                            uint64_t Offset) {
   Type *Int64 = Type::getInt64Ty(Context);
-  Value *Ops[3] = {BaseType, AccessType, ConstantInt::get(Int64, Offset)};
+  Metadata *Ops[3] = {BaseType, AccessType,
+                      createConstant(ConstantInt::get(Int64, Offset))};
   return MDNode::get(Context, Ops);
 }
diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index 27ba9f7..0ad3c5c 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@@ -1,4 +1,4 @@
-//===-- Metadata.cpp - Implement Metadata classes -------------------------===//
+//===- Metadata.cpp - Implement Metadata classes --------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,6 +13,7 @@
 
 #include "llvm/IR/Metadata.h"
 #include "LLVMContextImpl.h"
+#include "MetadataImpl.h"
 #include "SymbolTableListTraitsImpl.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
@@ -20,23 +21,342 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LeakDetector.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueHandle.h"
 
 using namespace llvm;
 
-Metadata::Metadata(LLVMContext &Context, unsigned ID)
-    : Value(Type::getMetadataTy(Context), ID) {}
+MetadataAsValue::MetadataAsValue(Type *Ty, Metadata *MD)
+    : Value(Ty, MetadataAsValueVal), MD(MD) {
+  track();
+}
+
+MetadataAsValue::~MetadataAsValue() {
+  getType()->getContext().pImpl->MetadataAsValues.erase(MD);
+  untrack();
+}
+
+/// \brief Canonicalize metadata arguments to intrinsics.
+///
+/// To support bitcode upgrades (and assembly semantic sugar) for \a
+/// MetadataAsValue, we need to canonicalize certain metadata.
+///
+///   - nullptr is replaced by an empty MDNode.
+///   - An MDNode with a single null operand is replaced by an empty MDNode.
+///   - An MDNode whose only operand is a \a ConstantAsMetadata gets skipped.
+///
+/// This maintains readability of bitcode from when metadata was a type of
+/// value, and these bridges were unnecessary.
+static Metadata *canonicalizeMetadataForValue(LLVMContext &Context,
+                                              Metadata *MD) {
+  if (!MD)
+    // !{}
+    return MDNode::get(Context, None);
+
+  // Return early if this isn't a single-operand MDNode.
+  auto *N = dyn_cast<MDNode>(MD);
+  if (!N || N->getNumOperands() != 1)
+    return MD;
+
+  if (!N->getOperand(0))
+    // !{}
+    return MDNode::get(Context, None);
+
+  if (auto *C = dyn_cast<ConstantAsMetadata>(N->getOperand(0)))
+    // Look through the MDNode.
+    return C;
+
+  return MD;
+}
+
+MetadataAsValue *MetadataAsValue::get(LLVMContext &Context, Metadata *MD) {
+  MD = canonicalizeMetadataForValue(Context, MD);
+  auto *&Entry = Context.pImpl->MetadataAsValues[MD];
+  if (!Entry)
+    Entry = new MetadataAsValue(Type::getMetadataTy(Context), MD);
+  return Entry;
+}
+
+MetadataAsValue *MetadataAsValue::getIfExists(LLVMContext &Context,
+                                              Metadata *MD) {
+  MD = canonicalizeMetadataForValue(Context, MD);
+  auto &Store = Context.pImpl->MetadataAsValues;
+  return Store.lookup(MD);
+}
+
+void MetadataAsValue::handleChangedMetadata(Metadata *MD) {
+  LLVMContext &Context = getContext();
+  MD = canonicalizeMetadataForValue(Context, MD);
+  auto &Store = Context.pImpl->MetadataAsValues;
+
+  // Stop tracking the old metadata.
+  Store.erase(this->MD);
+  untrack();
+  this->MD = nullptr;
+
+  // Start tracking MD, or RAUW if necessary.
+  auto *&Entry = Store[MD];
+  if (Entry) {
+    replaceAllUsesWith(Entry);
+    delete this;
+    return;
+  }
+
+  this->MD = MD;
+  track();
+  Entry = this;
+}
+
+void MetadataAsValue::track() {
+  if (MD)
+    MetadataTracking::track(&MD, *MD, *this);
+}
+
+void MetadataAsValue::untrack() {
+  if (MD)
+    MetadataTracking::untrack(MD);
+}
+
+void ReplaceableMetadataImpl::addRef(void *Ref, OwnerTy Owner) {
+  bool WasInserted =
+      UseMap.insert(std::make_pair(Ref, std::make_pair(Owner, NextIndex)))
+          .second;
+  (void)WasInserted;
+  assert(WasInserted && "Expected to add a reference");
+
+  ++NextIndex;
+  assert(NextIndex != 0 && "Unexpected overflow");
+}
+
+void ReplaceableMetadataImpl::dropRef(void *Ref) {
+  bool WasErased = UseMap.erase(Ref);
+  (void)WasErased;
+  assert(WasErased && "Expected to drop a reference");
+}
+
+void ReplaceableMetadataImpl::moveRef(void *Ref, void *New,
+                                      const Metadata &MD) {
+  auto I = UseMap.find(Ref);
+  assert(I != UseMap.end() && "Expected to move a reference");
+  auto OwnerAndIndex = I->second;
+  UseMap.erase(I);
+  bool WasInserted = UseMap.insert(std::make_pair(New, OwnerAndIndex)).second;
+  (void)WasInserted;
+  assert(WasInserted && "Expected to add a reference");
+
+  // Check that the references are direct if there's no owner.
+  (void)MD;
+  assert((OwnerAndIndex.first || *static_cast<Metadata **>(Ref) == &MD) &&
+         "Reference without owner must be direct");
+  assert((OwnerAndIndex.first || *static_cast<Metadata **>(New) == &MD) &&
+         "Reference without owner must be direct");
+}
+
+void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) {
+  assert(!(MD && isa<MDNode>(MD) && cast<MDNode>(MD)->isTemporary()) &&
+         "Expected non-temp node");
+
+  if (UseMap.empty())
+    return;
+
+  // Copy out uses since UseMap will get touched below.
+  typedef std::pair<void *, std::pair<OwnerTy, uint64_t>> UseTy;
+  SmallVector<UseTy, 8> Uses(UseMap.begin(), UseMap.end());
+  std::sort(Uses.begin(), Uses.end(), [](const UseTy &L, const UseTy &R) {
+    return L.second.second < R.second.second;
+  });
+  for (const auto &Pair : Uses) {
+    // Check that this Ref hasn't disappeared after RAUW (when updating a
+    // previous Ref).
+    if (!UseMap.count(Pair.first))
+      continue;
+
+    OwnerTy Owner = Pair.second.first;
+    if (!Owner) {
+      // Update unowned tracking references directly.
+      Metadata *&Ref = *static_cast<Metadata **>(Pair.first);
+      Ref = MD;
+      if (MD)
+        MetadataTracking::track(Ref);
+      UseMap.erase(Pair.first);
+      continue;
+    }
+
+    // Check for MetadataAsValue.
+    if (Owner.is<MetadataAsValue *>()) {
+      Owner.get<MetadataAsValue *>()->handleChangedMetadata(MD);
+      continue;
+    }
+
+    // There's a Metadata owner -- dispatch.
+    Metadata *OwnerMD = Owner.get<Metadata *>();
+    switch (OwnerMD->getMetadataID()) {
+#define HANDLE_METADATA_LEAF(CLASS)                                            \
+  case Metadata::CLASS##Kind:                                                  \
+    cast<CLASS>(OwnerMD)->handleChangedOperand(Pair.first, MD);                \
+    continue;
+#include "llvm/IR/Metadata.def"
+    default:
+      llvm_unreachable("Invalid metadata subclass");
+    }
+  }
+  assert(UseMap.empty() && "Expected all uses to be replaced");
+}
+
+void ReplaceableMetadataImpl::resolveAllUses(bool ResolveUsers) {
+  if (UseMap.empty())
+    return;
+
+  if (!ResolveUsers) {
+    UseMap.clear();
+    return;
+  }
+
+  // Copy out uses since UseMap could get touched below.
+  typedef std::pair<void *, std::pair<OwnerTy, uint64_t>> UseTy;
+  SmallVector<UseTy, 8> Uses(UseMap.begin(), UseMap.end());
+  std::sort(Uses.begin(), Uses.end(), [](const UseTy &L, const UseTy &R) {
+    return L.second.second < R.second.second;
+  });
+  UseMap.clear();
+  for (const auto &Pair : Uses) {
+    auto Owner = Pair.second.first;
+    if (!Owner)
+      continue;
+    if (Owner.is<MetadataAsValue *>())
+      continue;
+
+    // Resolve MDNodes that point at this.
+    auto *OwnerMD = dyn_cast<MDNode>(Owner.get<Metadata *>());
+    if (!OwnerMD)
+      continue;
+    if (OwnerMD->isResolved())
+      continue;
+    OwnerMD->decrementUnresolvedOperandCount();
+  }
+}
+
+static Function *getLocalFunction(Value *V) {
+  assert(V && "Expected value");
+  if (auto *A = dyn_cast<Argument>(V))
+    return A->getParent();
+  if (BasicBlock *BB = cast<Instruction>(V)->getParent())
+    return BB->getParent();
+  return nullptr;
+}
+
+ValueAsMetadata *ValueAsMetadata::get(Value *V) {
+  assert(V && "Unexpected null Value");
+
+  auto &Context = V->getContext();
+  auto *&Entry = Context.pImpl->ValuesAsMetadata[V];
+  if (!Entry) {
+    assert((isa<Constant>(V) || isa<Argument>(V) || isa<Instruction>(V)) &&
+           "Expected constant or function-local value");
+    assert(!V->NameAndIsUsedByMD.getInt() &&
+           "Expected this to be the only metadata use");
+    V->NameAndIsUsedByMD.setInt(true);
+    if (auto *C = dyn_cast<Constant>(V))
+      Entry = new ConstantAsMetadata(C);
+    else
+      Entry = new LocalAsMetadata(V);
+  }
+
+  return Entry;
+}
+
+ValueAsMetadata *ValueAsMetadata::getIfExists(Value *V) {
+  assert(V && "Unexpected null Value");
+  return V->getContext().pImpl->ValuesAsMetadata.lookup(V);
+}
+
+void ValueAsMetadata::handleDeletion(Value *V) {
+  assert(V && "Expected valid value");
+
+  auto &Store = V->getType()->getContext().pImpl->ValuesAsMetadata;
+  auto I = Store.find(V);
+  if (I == Store.end())
+    return;
+
+  // Remove old entry from the map.
+  ValueAsMetadata *MD = I->second;
+  assert(MD && "Expected valid metadata");
+  assert(MD->getValue() == V && "Expected valid mapping");
+  Store.erase(I);
+
+  // Delete the metadata.
+  MD->replaceAllUsesWith(nullptr);
+  delete MD;
+}
+
+void ValueAsMetadata::handleRAUW(Value *From, Value *To) {
+  assert(From && "Expected valid value");
+  assert(To && "Expected valid value");
+  assert(From != To && "Expected changed value");
+  assert(From->getType() == To->getType() && "Unexpected type change");
+
+  LLVMContext &Context = From->getType()->getContext();
+  auto &Store = Context.pImpl->ValuesAsMetadata;
+  auto I = Store.find(From);
+  if (I == Store.end()) {
+    assert(!From->NameAndIsUsedByMD.getInt() &&
+           "Expected From not to be used by metadata");
+    return;
+  }
+
+  // Remove old entry from the map.
+  assert(From->NameAndIsUsedByMD.getInt() &&
+         "Expected From to be used by metadata");
+  From->NameAndIsUsedByMD.setInt(false);
+  ValueAsMetadata *MD = I->second;
+  assert(MD && "Expected valid metadata");
+  assert(MD->getValue() == From && "Expected valid mapping");
+  Store.erase(I);
+
+  if (isa<LocalAsMetadata>(MD)) {
+    if (auto *C = dyn_cast<Constant>(To)) {
+      // Local became a constant.
+      MD->replaceAllUsesWith(ConstantAsMetadata::get(C));
+      delete MD;
+      return;
+    }
+    if (getLocalFunction(From) && getLocalFunction(To) &&
+        getLocalFunction(From) != getLocalFunction(To)) {
+      // Function changed.
+      MD->replaceAllUsesWith(nullptr);
+      delete MD;
+      return;
+    }
+  } else if (!isa<Constant>(To)) {
+    // Changed to function-local value.
+    MD->replaceAllUsesWith(nullptr);
+    delete MD;
+    return;
+  }
+
+  auto *&Entry = Store[To];
+  if (Entry) {
+    // The target already exists.
+    MD->replaceAllUsesWith(Entry);
+    delete MD;
+    return;
+  }
+
+  // Update MD in place (and update the map entry).
+  assert(!To->NameAndIsUsedByMD.getInt() &&
+         "Expected this to be the only metadata use");
+  To->NameAndIsUsedByMD.setInt(true);
+  MD->V = To;
+  Entry = MD;
+}
 
 //===----------------------------------------------------------------------===//
 // MDString implementation.
 //
 
-void MDString::anchor() { }
-
 MDString *MDString::get(LLVMContext &Context, StringRef Str) {
   auto &Store = Context.pImpl->MDStringCache;
   auto I = Store.find(Str);
@@ -44,353 +364,398 @@ MDString *MDString::get(LLVMContext &Context, StringRef Str) {
     return &I->second;
 
   auto *Entry =
-      StringMapEntry<MDString>::Create(Str, Store.getAllocator(), Context);
+      StringMapEntry<MDString>::Create(Str, Store.getAllocator(), MDString());
   bool WasInserted = Store.insert(Entry);
   (void)WasInserted;
   assert(WasInserted && "Expected entry to be inserted");
+  Entry->second.Entry = Entry;
   return &Entry->second;
 }
 
 StringRef MDString::getString() const {
-  return StringMapEntry<MDString>::GetStringMapEntryFromValue(*this).first();
+  assert(Entry && "Expected to find string map entry");
+  return Entry->first();
 }
 
 //===----------------------------------------------------------------------===//
-// MDNodeOperand implementation.
+// MDNode implementation.
 //
 
-// Use CallbackVH to hold MDNode operands.
-namespace llvm {
-class MDNodeOperand : public CallbackVH {
-  MDNode *getParent() {
-    MDNodeOperand *Cur = this;
+void *MDNode::operator new(size_t Size, unsigned NumOps) {
+  void *Ptr = ::operator new(Size + NumOps * sizeof(MDOperand));
+  MDOperand *O = static_cast<MDOperand *>(Ptr);
+  for (MDOperand *E = O + NumOps; O != E; ++O)
+    (void)new (O) MDOperand;
+  return O;
+}
+
+void MDNode::operator delete(void *Mem) {
+  MDNode *N = static_cast<MDNode *>(Mem);
+  MDOperand *O = static_cast<MDOperand *>(Mem);
+  for (MDOperand *E = O - N->NumOperands; O != E; --O)
+    (O - 1)->~MDOperand();
+  ::operator delete(O);
+}
 
-    while (Cur->getValPtrInt() != 1)
-      ++Cur;
+MDNode::MDNode(LLVMContext &Context, unsigned ID, StorageType Storage,
+               ArrayRef<Metadata *> Ops1, ArrayRef<Metadata *> Ops2)
+    : Metadata(ID, Storage), NumOperands(Ops1.size() + Ops2.size()),
+      NumUnresolved(0), Context(Context) {
+  unsigned Op = 0;
+  for (Metadata *MD : Ops1)
+    setOperand(Op++, MD);
+  for (Metadata *MD : Ops2)
+    setOperand(Op++, MD);
+
+  if (isDistinct())
+    return;
 
-    assert(Cur->getValPtrInt() == 1 &&
-           "Couldn't find the end of the operand list!");
-    return reinterpret_cast<MDNode *>(Cur + 1);
-  }
+  if (isUniqued())
+    // Check whether any operands are unresolved, requiring re-uniquing.  If
+    // not, don't support RAUW.
+    if (!countUnresolvedOperands())
+      return;
 
-public:
-  MDNodeOperand() {}
-  virtual ~MDNodeOperand();
+  this->Context.makeReplaceable(make_unique<ReplaceableMetadataImpl>(Context));
+}
 
-  void set(Value *V) {
-    unsigned IsLast = this->getValPtrInt();
-    this->setValPtr(V);
-    this->setAsLastOperand(IsLast);
+TempMDNode MDNode::clone() const {
+  switch (getMetadataID()) {
+  default:
+    llvm_unreachable("Invalid MDNode subclass");
+#define HANDLE_MDNODE_LEAF(CLASS)                                              \
+  case CLASS##Kind:                                                            \
+    return cast<CLASS>(this)->cloneImpl();
+#include "llvm/IR/Metadata.def"
   }
+}
 
-  /// \brief Accessor method to mark the operand as the first in the list.
-  void setAsLastOperand(unsigned I) { this->setValPtrInt(I); }
+static bool isOperandUnresolved(Metadata *Op) {
+  if (auto *N = dyn_cast_or_null<MDNode>(Op))
+    return !N->isResolved();
+  return false;
+}
 
-  void deleted() override;
-  void allUsesReplacedWith(Value *NV) override;
-};
-} // end namespace llvm.
+unsigned MDNode::countUnresolvedOperands() {
+  assert(NumUnresolved == 0 && "Expected unresolved ops to be uncounted");
+  NumUnresolved = std::count_if(op_begin(), op_end(), isOperandUnresolved);
+  return NumUnresolved;
+}
 
-// Provide out-of-line definition to prevent weak vtable.
-MDNodeOperand::~MDNodeOperand() {}
+void MDNode::makeUniqued() {
+  assert(isTemporary() && "Expected this to be temporary");
+  assert(!isResolved() && "Expected this to be unresolved");
 
-void MDNodeOperand::deleted() {
-  getParent()->replaceOperand(this, nullptr);
-}
+  // Make this 'uniqued'.
+  Storage = Uniqued;
+  if (!countUnresolvedOperands())
+    resolve();
 
-void MDNodeOperand::allUsesReplacedWith(Value *NV) {
-  getParent()->replaceOperand(this, NV);
+  assert(isUniqued() && "Expected this to be uniqued");
 }
 
-//===----------------------------------------------------------------------===//
-// MDNode implementation.
-//
+void MDNode::makeDistinct() {
+  assert(isTemporary() && "Expected this to be temporary");
+  assert(!isResolved() && "Expected this to be unresolved");
+
+  // Pretend to be uniqued, resolve the node, and then store in distinct table.
+  Storage = Uniqued;
+  resolve();
+  storeDistinctInContext();
 
-/// \brief Get the MDNodeOperand's coallocated on the end of the MDNode.
-static MDNodeOperand *getOperandPtr(MDNode *N, unsigned Op) {
-  // Use <= instead of < to permit a one-past-the-end address.
-  assert(Op <= N->getNumOperands() && "Invalid operand number");
-  return reinterpret_cast<MDNodeOperand *>(N) - N->getNumOperands() + Op;
+  assert(isDistinct() && "Expected this to be distinct");
+  assert(isResolved() && "Expected this to be resolved");
 }
 
-void MDNode::replaceOperandWith(unsigned i, Value *Val) {
-  MDNodeOperand *Op = getOperandPtr(this, i);
-  replaceOperand(Op, Val);
+void MDNode::resolve() {
+  assert(isUniqued() && "Expected this to be uniqued");
+  assert(!isResolved() && "Expected this to be unresolved");
+
+  // Move the map, so that this immediately looks resolved.
+  auto Uses = Context.takeReplaceableUses();
+  NumUnresolved = 0;
+  assert(isResolved() && "Expected this to be resolved");
+
+  // Drop RAUW support.
+  Uses->resolveAllUses();
 }
 
-void *MDNode::operator new(size_t Size, unsigned NumOps) {
-  void *Ptr = ::operator new(Size + NumOps * sizeof(MDNodeOperand));
-  MDNodeOperand *Op = static_cast<MDNodeOperand *>(Ptr);
-  if (NumOps) {
-    MDNodeOperand *Last = Op + NumOps;
-    for (; Op != Last; ++Op)
-      new (Op) MDNodeOperand();
-    (Op - 1)->setAsLastOperand(1);
-  }
-  return Op;
+void MDNode::resolveAfterOperandChange(Metadata *Old, Metadata *New) {
+  assert(NumUnresolved != 0 && "Expected unresolved operands");
+
+  // Check if an operand was resolved.
+  if (!isOperandUnresolved(Old)) {
+    if (isOperandUnresolved(New))
+      // An operand was un-resolved!
+      ++NumUnresolved;
+  } else if (!isOperandUnresolved(New))
+    decrementUnresolvedOperandCount();
 }
 
-void MDNode::operator delete(void *Mem) {
-  MDNode *N = static_cast<MDNode *>(Mem);
-  MDNodeOperand *Op = static_cast<MDNodeOperand *>(Mem);
-  for (unsigned I = 0, E = N->NumOperands; I != E; ++I)
-    (--Op)->~MDNodeOperand();
-  ::operator delete(Op);
+void MDNode::decrementUnresolvedOperandCount() {
+  if (!--NumUnresolved)
+    // Last unresolved operand has just been resolved.
+    resolve();
 }
 
-MDNode::MDNode(LLVMContext &C, unsigned ID, ArrayRef<Value *> Vals,
-               bool isFunctionLocal)
-    : Metadata(C, ID) {
-  NumOperands = Vals.size();
+void MDNode::resolveCycles() {
+  if (isResolved())
+    return;
 
-  if (isFunctionLocal)
-    setValueSubclassData(getSubclassDataFromValue() | FunctionLocalBit);
+  // Resolve this node immediately.
+  resolve();
 
-  // Initialize the operand list.
-  unsigned i = 0;
-  for (MDNodeOperand *Op = getOperandPtr(this, 0), *E = Op + NumOperands;
-       Op != E; ++Op, ++i)
-    Op->set(Vals[i]);
-}
+  // Resolve all operands.
+  for (const auto &Op : operands()) {
+    auto *N = dyn_cast_or_null<MDNode>(Op);
+    if (!N)
+      continue;
 
-GenericMDNode::~GenericMDNode() {
-  LLVMContextImpl *pImpl = getType()->getContext().pImpl;
-  if (isNotUniqued()) {
-    pImpl->NonUniquedMDNodes.erase(this);
-  } else {
-    pImpl->MDNodeSet.erase(this);
+    assert(!N->isTemporary() &&
+           "Expected all forward declarations to be resolved");
+    if (!N->isResolved())
+      N->resolveCycles();
   }
 }
 
-void GenericMDNode::dropAllReferences() {
-  for (MDNodeOperand *Op = getOperandPtr(this, 0), *E = Op + NumOperands;
-       Op != E; ++Op)
-    Op->set(nullptr);
+static bool hasSelfReference(MDNode *N) {
+  for (Metadata *MD : N->operands())
+    if (MD == N)
+      return true;
+  return false;
+}
+
+MDNode *MDNode::replaceWithPermanentImpl() {
+  if (hasSelfReference(this))
+    return replaceWithDistinctImpl();
+  return replaceWithUniquedImpl();
 }
 
-static const Function *getFunctionForValue(Value *V) {
-  if (!V) return nullptr;
-  if (Instruction *I = dyn_cast<Instruction>(V)) {
-    BasicBlock *BB = I->getParent();
-    return BB ? BB->getParent() : nullptr;
+MDNode *MDNode::replaceWithUniquedImpl() {
+  // Try to uniquify in place.
+  MDNode *UniquedNode = uniquify();
+
+  if (UniquedNode == this) {
+    makeUniqued();
+    return this;
   }
-  if (Argument *A = dyn_cast<Argument>(V))
-    return A->getParent();
-  if (BasicBlock *BB = dyn_cast<BasicBlock>(V))
-    return BB->getParent();
-  if (MDNode *MD = dyn_cast<MDNode>(V))
-    return MD->getFunction();
-  return nullptr;
+
+  // Collision, so RAUW instead.
+  replaceAllUsesWith(UniquedNode);
+  deleteAsSubclass();
+  return UniquedNode;
 }
 
-#ifndef NDEBUG
-static const Function *assertLocalFunction(const MDNode *N) {
-  if (!N->isFunctionLocal()) return nullptr;
+MDNode *MDNode::replaceWithDistinctImpl() {
+  makeDistinct();
+  return this;
+}
 
-  // FIXME: This does not handle cyclic function local metadata.
-  const Function *F = nullptr, *NewF = nullptr;
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    if (Value *V = N->getOperand(i)) {
-      if (MDNode *MD = dyn_cast<MDNode>(V))
-        NewF = assertLocalFunction(MD);
-      else
-        NewF = getFunctionForValue(V);
-    }
-    if (!F)
-      F = NewF;
-    else
-      assert((NewF == nullptr || F == NewF) &&
-             "inconsistent function-local metadata");
-  }
-  return F;
-}
-#endif
-
-// getFunction - If this metadata is function-local and recursively has a
-// function-local operand, return the first such operand's parent function.
-// Otherwise, return null. getFunction() should not be used for performance-
-// critical code because it recursively visits all the MDNode's operands.  
-const Function *MDNode::getFunction() const {
-#ifndef NDEBUG
-  return assertLocalFunction(this);
-#else
-  if (!isFunctionLocal()) return nullptr;
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
-    if (const Function *F = getFunctionForValue(getOperand(i)))
-      return F;
-  return nullptr;
-#endif
+void MDTuple::recalculateHash() {
+  setHash(MDTupleInfo::KeyTy::calculateHash(this));
 }
 
-/// \brief Check if the Value  would require a function-local MDNode.
-static bool isFunctionLocalValue(Value *V) {
-  return isa<Instruction>(V) || isa<Argument>(V) || isa<BasicBlock>(V) ||
-         (isa<MDNode>(V) && cast<MDNode>(V)->isFunctionLocal());
+void MDNode::dropAllReferences() {
+  for (unsigned I = 0, E = NumOperands; I != E; ++I)
+    setOperand(I, nullptr);
+  if (!isResolved()) {
+    Context.getReplaceableUses()->resolveAllUses(/* ResolveUsers */ false);
+    (void)Context.takeReplaceableUses();
+  }
 }
 
-MDNode *MDNode::getMDNode(LLVMContext &Context, ArrayRef<Value*> Vals,
-                          FunctionLocalness FL, bool Insert) {
-  auto &Store = Context.pImpl->MDNodeSet;
+void MDNode::handleChangedOperand(void *Ref, Metadata *New) {
+  unsigned Op = static_cast<MDOperand *>(Ref) - op_begin();
+  assert(Op < getNumOperands() && "Expected valid operand");
 
-  GenericMDNodeInfo::KeyTy Key(Vals);
-  auto I = Store.find_as(Key);
-  if (I != Store.end())
-    return *I;
-  if (!Insert)
-    return nullptr;
+  if (!isUniqued()) {
+    // This node is not uniqued.  Just set the operand and be done with it.
+    setOperand(Op, New);
+    return;
+  }
 
-  bool isFunctionLocal = false;
-  switch (FL) {
-  case FL_Unknown:
-    for (Value *V : Vals) {
-      if (!V) continue;
-      if (isFunctionLocalValue(V)) {
-        isFunctionLocal = true;
-        break;
-      }
-    }
-    break;
-  case FL_No:
-    isFunctionLocal = false;
-    break;
-  case FL_Yes:
-    isFunctionLocal = true;
+  // This node is uniqued.
+  eraseFromStore();
+
+  Metadata *Old = getOperand(Op);
+  setOperand(Op, New);
+
+  // Drop uniquing for self-reference cycles.
+  if (New == this) {
+    if (!isResolved())
+      resolve();
+    storeDistinctInContext();
+    return;
+  }
+
+  // Re-unique the node.
+  auto *Uniqued = uniquify();
+  if (Uniqued == this) {
+    if (!isResolved())
+      resolveAfterOperandChange(Old, New);
+    return;
+  }
+
+  // Collision.
+  if (!isResolved()) {
+    // Still unresolved, so RAUW.
+    //
+    // First, clear out all operands to prevent any recursion (similar to
+    // dropAllReferences(), but we still need the use-list).
+    for (unsigned O = 0, E = getNumOperands(); O != E; ++O)
+      setOperand(O, nullptr);
+    Context.getReplaceableUses()->replaceAllUsesWith(Uniqued);
+    deleteAsSubclass();
+    return;
+  }
+
+  // Store in non-uniqued form if RAUW isn't possible.
+  storeDistinctInContext();
+}
+
+void MDNode::deleteAsSubclass() {
+  switch (getMetadataID()) {
+  default:
+    llvm_unreachable("Invalid subclass of MDNode");
+#define HANDLE_MDNODE_LEAF(CLASS)                                              \
+  case CLASS##Kind:                                                            \
+    delete cast<CLASS>(this);                                                  \
     break;
+#include "llvm/IR/Metadata.def"
   }
+}
 
-  // Coallocate space for the node and Operands together, then placement new.
-  GenericMDNode *N =
-      new (Vals.size()) GenericMDNode(Context, Vals, isFunctionLocal);
+template <class T, class InfoT>
+static T *uniquifyImpl(T *N, DenseSet<T *, InfoT> &Store) {
+  if (T *U = getUniqued(Store, N))
+    return U;
 
-  N->Hash = Key.Hash;
   Store.insert(N);
   return N;
 }
 
-MDNode *MDNode::get(LLVMContext &Context, ArrayRef<Value*> Vals) {
-  return getMDNode(Context, Vals, FL_Unknown);
-}
+template <class NodeTy> struct MDNode::HasCachedHash {
+  typedef char Yes[1];
+  typedef char No[2];
+  template <class U, U Val> struct SFINAE {};
 
-MDNode *MDNode::getWhenValsUnresolved(LLVMContext &Context,
-                                      ArrayRef<Value*> Vals,
-                                      bool isFunctionLocal) {
-  return getMDNode(Context, Vals, isFunctionLocal ? FL_Yes : FL_No);
-}
+  template <class U>
+  static Yes &check(SFINAE<void (U::*)(unsigned), &U::setHash> *);
+  template <class U> static No &check(...);
 
-MDNode *MDNode::getIfExists(LLVMContext &Context, ArrayRef<Value*> Vals) {
-  return getMDNode(Context, Vals, FL_Unknown, false);
-}
+  static const bool value = sizeof(check<NodeTy>(nullptr)) == sizeof(Yes);
+};
 
-MDNode *MDNode::getTemporary(LLVMContext &Context, ArrayRef<Value*> Vals) {
-  MDNode *N = new (Vals.size()) MDNodeFwdDecl(Context, Vals, FL_No);
-  N->setValueSubclassData(N->getSubclassDataFromValue() | NotUniquedBit);
-  LeakDetector::addGarbageObject(N);
-  return N;
+MDNode *MDNode::uniquify() {
+  assert(!hasSelfReference(this) && "Cannot uniquify a self-referencing node");
+
+  // Try to insert into uniquing store.
+  switch (getMetadataID()) {
+  default:
+    llvm_unreachable("Invalid subclass of MDNode");
+#define HANDLE_MDNODE_LEAF(CLASS)                                              \
+  case CLASS##Kind: {                                                          \
+    CLASS *SubclassThis = cast<CLASS>(this);                                   \
+    std::integral_constant<bool, HasCachedHash<CLASS>::value>                  \
+        ShouldRecalculateHash;                                                 \
+    dispatchRecalculateHash(SubclassThis, ShouldRecalculateHash);              \
+    return uniquifyImpl(SubclassThis, getContext().pImpl->CLASS##s);           \
+  }
+#include "llvm/IR/Metadata.def"
+  }
 }
 
-void MDNode::deleteTemporary(MDNode *N) {
-  assert(N->use_empty() && "Temporary MDNode has uses!");
-  assert(isa<MDNodeFwdDecl>(N) && "Expected forward declaration");
-  assert((N->getSubclassDataFromValue() & NotUniquedBit) &&
-         "Temporary MDNode does not have NotUniquedBit set!");
-  LeakDetector::removeGarbageObject(N);
-  delete cast<MDNodeFwdDecl>(N);
-}
-
-/// \brief Return specified operand.
-Value *MDNode::getOperand(unsigned i) const {
-  assert(i < getNumOperands() && "Invalid operand number");
-  return *getOperandPtr(const_cast<MDNode*>(this), i);
-}
-
-void MDNode::setIsNotUniqued() {
-  setValueSubclassData(getSubclassDataFromValue() | NotUniquedBit);
-  LLVMContextImpl *pImpl = getType()->getContext().pImpl;
-  auto *G = cast<GenericMDNode>(this);
-  G->Hash = 0;
-  pImpl->NonUniquedMDNodes.insert(G);
-}
-
-// Replace value from this node's operand list.
-void MDNode::replaceOperand(MDNodeOperand *Op, Value *To) {
-  Value *From = *Op;
-
-  // If is possible that someone did GV->RAUW(inst), replacing a global variable
-  // with an instruction or some other function-local object.  If this is a
-  // non-function-local MDNode, it can't point to a function-local object.
-  // Handle this case by implicitly dropping the MDNode reference to null.
-  // Likewise if the MDNode is function-local but for a different function.
-  if (To && isFunctionLocalValue(To)) {
-    if (!isFunctionLocal())
-      To = nullptr;
-    else {
-      const Function *F = getFunction();
-      const Function *FV = getFunctionForValue(To);
-      // Metadata can be function-local without having an associated function.
-      // So only consider functions to have changed if non-null.
-      if (F && FV && F != FV)
-        To = nullptr;
-    }
+void MDNode::eraseFromStore() {
+  switch (getMetadataID()) {
+  default:
+    llvm_unreachable("Invalid subclass of MDNode");
+#define HANDLE_MDNODE_LEAF(CLASS)                                              \
+  case CLASS##Kind:                                                            \
+    getContext().pImpl->CLASS##s.erase(cast<CLASS>(this));                     \
+    break;
+#include "llvm/IR/Metadata.def"
   }
-  
-  if (From == To)
-    return;
+}
 
-  // If this node is already not being uniqued (because one of the operands
-  // already went to null), then there is nothing else to do here.
-  if (isNotUniqued()) {
-    Op->set(To);
-    return;
+MDTuple *MDTuple::getImpl(LLVMContext &Context, ArrayRef<Metadata *> MDs,
+                          StorageType Storage, bool ShouldCreate) {
+  unsigned Hash = 0;
+  if (Storage == Uniqued) {
+    MDTupleInfo::KeyTy Key(MDs);
+    if (auto *N = getUniqued(Context.pImpl->MDTuples, Key))
+      return N;
+    if (!ShouldCreate)
+      return nullptr;
+    Hash = Key.getHash();
+  } else {
+    assert(ShouldCreate && "Expected non-uniqued nodes to always be created");
   }
 
-  auto &Store = getContext().pImpl->MDNodeSet;
-  auto *N = cast<GenericMDNode>(this);
+  return storeImpl(new (MDs.size()) MDTuple(Context, Storage, Hash, MDs),
+                   Storage, Context.pImpl->MDTuples);
+}
+
+void MDNode::deleteTemporary(MDNode *N) {
+  assert(N->isTemporary() && "Expected temporary node");
+  N->replaceAllUsesWith(nullptr);
+  N->deleteAsSubclass();
+}
 
-  // Remove "this" from the context map.
-  Store.erase(N);
+void MDNode::storeDistinctInContext() {
+  assert(isResolved() && "Expected resolved nodes");
+  Storage = Distinct;
+
+  // Reset the hash.
+  switch (getMetadataID()) {
+  default:
+    llvm_unreachable("Invalid subclass of MDNode");
+#define HANDLE_MDNODE_LEAF(CLASS)                                              \
+  case CLASS##Kind: {                                                          \
+    std::integral_constant<bool, HasCachedHash<CLASS>::value> ShouldResetHash; \
+    dispatchResetHash(cast<CLASS>(this), ShouldResetHash);                     \
+    break;                                                                     \
+  }
+#include "llvm/IR/Metadata.def"
+  }
 
-  // Update the operand.
-  Op->set(To);
+  getContext().pImpl->DistinctMDNodes.insert(this);
+}
 
-  // If we are dropping an argument to null, we choose to not unique the MDNode
-  // anymore.  This commonly occurs during destruction, and uniquing these
-  // brings little reuse.  Also, this means we don't need to include
-  // isFunctionLocal bits in the hash for MDNodes.
-  if (!To) {
-    setIsNotUniqued();
+void MDNode::replaceOperandWith(unsigned I, Metadata *New) {
+  if (getOperand(I) == New)
     return;
-  }
 
-  // Now that the node is out of the table, get ready to reinsert it.  First,
-  // check to see if another node with the same operands already exists in the
-  // set.  If so, then this node is redundant.
-  SmallVector<Value *, 8> Vals;
-  GenericMDNodeInfo::KeyTy Key(N, Vals);
-  auto I = Store.find_as(Key);
-  if (I != Store.end()) {
-    N->replaceAllUsesWith(*I);
-    delete N;
+  if (!isUniqued()) {
+    setOperand(I, New);
     return;
   }
 
-  N->Hash = Key.Hash;
-  Store.insert(N);
+  handleChangedOperand(mutable_begin() + I, New);
+}
 
-  // If this MDValue was previously function-local but no longer is, clear
-  // its function-local flag.
-  if (isFunctionLocal() && !isFunctionLocalValue(To)) {
-    bool isStillFunctionLocal = false;
-    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-      Value *V = getOperand(i);
-      if (!V) continue;
-      if (isFunctionLocalValue(V)) {
-        isStillFunctionLocal = true;
-        break;
+void MDNode::setOperand(unsigned I, Metadata *New) {
+  assert(I < NumOperands);
+  mutable_begin()[I].reset(New, isUniqued() ? this : nullptr);
+}
+
+/// \brief Get a node, or a self-reference that looks like it.
+///
+/// Special handling for finding self-references, for use by \a
+/// MDNode::concatenate() and \a MDNode::intersect() to maintain behaviour from
+/// when self-referencing nodes were still uniqued.  If the first operand has
+/// the same operands as \c Ops, return the first operand instead.
+static MDNode *getOrSelfReference(LLVMContext &Context,
+                                  ArrayRef<Metadata *> Ops) {
+  if (!Ops.empty())
+    if (MDNode *N = dyn_cast_or_null<MDNode>(Ops[0]))
+      if (N->getNumOperands() == Ops.size() && N == N->getOperand(0)) {
+        for (unsigned I = 1, E = Ops.size(); I != E; ++I)
+          if (Ops[I] != N->getOperand(I))
+            return MDNode::get(Context, Ops);
+        return N;
       }
-    }
-    if (!isStillFunctionLocal)
-      setValueSubclassData(getSubclassDataFromValue() & ~FunctionLocalBit);
-  }
+
+  return MDNode::get(Context, Ops);
 }
 
 MDNode *MDNode::concatenate(MDNode *A, MDNode *B) {
@@ -399,41 +764,50 @@ MDNode *MDNode::concatenate(MDNode *A, MDNode *B) {
   if (!B)
     return A;
 
-  SmallVector<Value *, 4> Vals(A->getNumOperands() +
-                               B->getNumOperands());
-
-  unsigned j = 0;
-  for (unsigned i = 0, ie = A->getNumOperands(); i != ie; ++i)
-    Vals[j++] = A->getOperand(i);
-  for (unsigned i = 0, ie = B->getNumOperands(); i != ie; ++i)
-    Vals[j++] = B->getOperand(i);
+  SmallVector<Metadata *, 4> MDs;
+  MDs.reserve(A->getNumOperands() + B->getNumOperands());
+  MDs.append(A->op_begin(), A->op_end());
+  MDs.append(B->op_begin(), B->op_end());
 
-  return MDNode::get(A->getContext(), Vals);
+  // FIXME: This preserves long-standing behaviour, but is it really the right
+  // behaviour?  Or was that an unintended side-effect of node uniquing?
+  return getOrSelfReference(A->getContext(), MDs);
 }
 
 MDNode *MDNode::intersect(MDNode *A, MDNode *B) {
   if (!A || !B)
     return nullptr;
 
-  SmallVector<Value *, 4> Vals;
-  for (unsigned i = 0, ie = A->getNumOperands(); i != ie; ++i) {
-    Value *V = A->getOperand(i);
-    for (unsigned j = 0, je = B->getNumOperands(); j != je; ++j)
-      if (V == B->getOperand(j)) {
-        Vals.push_back(V);
-        break;
-      }
-  }
+  SmallVector<Metadata *, 4> MDs;
+  for (Metadata *MD : A->operands())
+    if (std::find(B->op_begin(), B->op_end(), MD) != B->op_end())
+      MDs.push_back(MD);
+
+  // FIXME: This preserves long-standing behaviour, but is it really the right
+  // behaviour?  Or was that an unintended side-effect of node uniquing?
+  return getOrSelfReference(A->getContext(), MDs);
+}
+
+MDNode *MDNode::getMostGenericAliasScope(MDNode *A, MDNode *B) {
+  if (!A || !B)
+    return nullptr;
+
+  SmallVector<Metadata *, 4> MDs(B->op_begin(), B->op_end());
+  for (Metadata *MD : A->operands())
+    if (std::find(B->op_begin(), B->op_end(), MD) == B->op_end())
+      MDs.push_back(MD);
 
-  return MDNode::get(A->getContext(), Vals);
+  // FIXME: This preserves long-standing behaviour, but is it really the right
+  // behaviour?  Or was that an unintended side-effect of node uniquing?
+  return getOrSelfReference(A->getContext(), MDs);
 }
 
 MDNode *MDNode::getMostGenericFPMath(MDNode *A, MDNode *B) {
   if (!A || !B)
     return nullptr;
 
-  APFloat AVal = cast<ConstantFP>(A->getOperand(0))->getValueAPF();
-  APFloat BVal = cast<ConstantFP>(B->getOperand(0))->getValueAPF();
+  APFloat AVal = mdconst::extract<ConstantFP>(A->getOperand(0))->getValueAPF();
+  APFloat BVal = mdconst::extract<ConstantFP>(B->getOperand(0))->getValueAPF();
   if (AVal.compare(BVal) == APFloat::cmpLessThan)
     return A;
   return B;
@@ -447,25 +821,27 @@ static bool canBeMerged(const ConstantRange &A, const ConstantRange &B) {
   return !A.intersectWith(B).isEmptySet() || isContiguous(A, B);
 }
 
-static bool tryMergeRange(SmallVectorImpl<Value *> &EndPoints, ConstantInt *Low,
-                          ConstantInt *High) {
+static bool tryMergeRange(SmallVectorImpl<ConstantInt *> &EndPoints,
+                          ConstantInt *Low, ConstantInt *High) {
   ConstantRange NewRange(Low->getValue(), High->getValue());
   unsigned Size = EndPoints.size();
-  APInt LB = cast<ConstantInt>(EndPoints[Size - 2])->getValue();
-  APInt LE = cast<ConstantInt>(EndPoints[Size - 1])->getValue();
+  APInt LB = EndPoints[Size - 2]->getValue();
+  APInt LE = EndPoints[Size - 1]->getValue();
   ConstantRange LastRange(LB, LE);
   if (canBeMerged(NewRange, LastRange)) {
     ConstantRange Union = LastRange.unionWith(NewRange);
     Type *Ty = High->getType();
-    EndPoints[Size - 2] = ConstantInt::get(Ty, Union.getLower());
-    EndPoints[Size - 1] = ConstantInt::get(Ty, Union.getUpper());
+    EndPoints[Size - 2] =
+        cast<ConstantInt>(ConstantInt::get(Ty, Union.getLower()));
+    EndPoints[Size - 1] =
+        cast<ConstantInt>(ConstantInt::get(Ty, Union.getUpper()));
     return true;
   }
   return false;
 }
 
-static void addRange(SmallVectorImpl<Value *> &EndPoints, ConstantInt *Low,
-                     ConstantInt *High) {
+static void addRange(SmallVectorImpl<ConstantInt *> &EndPoints,
+                     ConstantInt *Low, ConstantInt *High) {
   if (!EndPoints.empty())
     if (tryMergeRange(EndPoints, Low, High))
       return;
@@ -487,31 +863,33 @@ MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
 
   // First, walk both lists in older of the lower boundary of each interval.
   // At each step, try to merge the new interval to the last one we adedd.
-  SmallVector<Value*, 4> EndPoints;
+  SmallVector<ConstantInt *, 4> EndPoints;
   int AI = 0;
   int BI = 0;
   int AN = A->getNumOperands() / 2;
   int BN = B->getNumOperands() / 2;
   while (AI < AN && BI < BN) {
-    ConstantInt *ALow = cast<ConstantInt>(A->getOperand(2 * AI));
-    ConstantInt *BLow = cast<ConstantInt>(B->getOperand(2 * BI));
+    ConstantInt *ALow = mdconst::extract<ConstantInt>(A->getOperand(2 * AI));
+    ConstantInt *BLow = mdconst::extract<ConstantInt>(B->getOperand(2 * BI));
 
     if (ALow->getValue().slt(BLow->getValue())) {
-      addRange(EndPoints, ALow, cast<ConstantInt>(A->getOperand(2 * AI + 1)));
+      addRange(EndPoints, ALow,
+               mdconst::extract<ConstantInt>(A->getOperand(2 * AI + 1)));
       ++AI;
     } else {
-      addRange(EndPoints, BLow, cast<ConstantInt>(B->getOperand(2 * BI + 1)));
+      addRange(EndPoints, BLow,
+               mdconst::extract<ConstantInt>(B->getOperand(2 * BI + 1)));
       ++BI;
     }
   }
   while (AI < AN) {
-    addRange(EndPoints, cast<ConstantInt>(A->getOperand(2 * AI)),
-             cast<ConstantInt>(A->getOperand(2 * AI + 1)));
+    addRange(EndPoints, mdconst::extract<ConstantInt>(A->getOperand(2 * AI)),
+             mdconst::extract<ConstantInt>(A->getOperand(2 * AI + 1)));
     ++AI;
   }
   while (BI < BN) {
-    addRange(EndPoints, cast<ConstantInt>(B->getOperand(2 * BI)),
-             cast<ConstantInt>(B->getOperand(2 * BI + 1)));
+    addRange(EndPoints, mdconst::extract<ConstantInt>(B->getOperand(2 * BI)),
+             mdconst::extract<ConstantInt>(B->getOperand(2 * BI + 1)));
     ++BI;
   }
 
@@ -519,8 +897,8 @@ MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
   // the last and first ones.
   unsigned Size = EndPoints.size();
   if (Size > 4) {
-    ConstantInt *FB = cast<ConstantInt>(EndPoints[0]);
-    ConstantInt *FE = cast<ConstantInt>(EndPoints[1]);
+    ConstantInt *FB = EndPoints[0];
+    ConstantInt *FE = EndPoints[1];
     if (tryMergeRange(EndPoints, FB, FE)) {
       for (unsigned i = 0; i < Size - 2; ++i) {
         EndPoints[i] = EndPoints[i + 2];
@@ -532,26 +910,29 @@ MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
   // If in the end we have a single range, it is possible that it is now the
   // full range. Just drop the metadata in that case.
   if (EndPoints.size() == 2) {
-    ConstantRange Range(cast<ConstantInt>(EndPoints[0])->getValue(),
-                        cast<ConstantInt>(EndPoints[1])->getValue());
+    ConstantRange Range(EndPoints[0]->getValue(), EndPoints[1]->getValue());
     if (Range.isFullSet())
       return nullptr;
   }
 
-  return MDNode::get(A->getContext(), EndPoints);
+  SmallVector<Metadata *, 4> MDs;
+  MDs.reserve(EndPoints.size());
+  for (auto *I : EndPoints)
+    MDs.push_back(ConstantAsMetadata::get(I));
+  return MDNode::get(A->getContext(), MDs);
 }
 
 //===----------------------------------------------------------------------===//
 // NamedMDNode implementation.
 //
 
-static SmallVector<TrackingVH<MDNode>, 4> &getNMDOps(void *Operands) {
-  return *(SmallVector<TrackingVH<MDNode>, 4> *)Operands;
+static SmallVector<TrackingMDRef, 4> &getNMDOps(void *Operands) {
+  return *(SmallVector<TrackingMDRef, 4> *)Operands;
 }
 
 NamedMDNode::NamedMDNode(const Twine &N)
     : Name(N.str()), Parent(nullptr),
-      Operands(new SmallVector<TrackingVH<MDNode>, 4>()) {}
+      Operands(new SmallVector<TrackingMDRef, 4>()) {}
 
 NamedMDNode::~NamedMDNode() {
   dropAllReferences();
@@ -564,13 +945,15 @@ unsigned NamedMDNode::getNumOperands() const {
 
 MDNode *NamedMDNode::getOperand(unsigned i) const {
   assert(i < getNumOperands() && "Invalid Operand number!");
-  return &*getNMDOps(Operands)[i];
+  auto *N = getNMDOps(Operands)[i].get();
+  return cast_or_null<MDNode>(N);
 }
 
-void NamedMDNode::addOperand(MDNode *M) {
-  assert(!M->isFunctionLocal() &&
-         "NamedMDNode operands must not be function-local!");
-  getNMDOps(Operands).push_back(TrackingVH<MDNode>(M));
+void NamedMDNode::addOperand(MDNode *M) { getNMDOps(Operands).emplace_back(M); }
+
+void NamedMDNode::setOperand(unsigned I, MDNode *New) {
+  assert(I < getNumOperands() && "Invalid operand number");
+  getNMDOps(Operands)[I].reset(New);
 }
 
 void NamedMDNode::eraseFromParent() {
@@ -630,7 +1013,7 @@ void Instruction::dropUnknownMetadata(ArrayRef<unsigned> KnownIDs) {
       continue;
     }
 
-    Info[I] = Info.back();
+    Info[I] = std::move(Info.back());
     Info.pop_back();
     --E;
   }
@@ -667,13 +1050,14 @@ void Instruction::setMetadata(unsigned KindID, MDNode *Node) {
       // Handle replacement of an existing value.
       for (auto &P : Info)
         if (P.first == KindID) {
-          P.second = Node;
+          P.second.reset(Node);
           return;
         }
     }
 
     // No replacement, just add it to the list.
-    Info.push_back(std::make_pair(KindID, Node));
+    Info.emplace_back(std::piecewise_construct, std::make_tuple(KindID),
+                      std::make_tuple(Node));
     return;
   }
 
@@ -695,7 +1079,7 @@ void Instruction::setMetadata(unsigned KindID, MDNode *Node) {
   // Handle removal of an existing value.
   for (unsigned i = 0, e = Info.size(); i != e; ++i)
     if (Info[i].first == KindID) {
-      Info[i] = Info.back();
+      Info[i] = std::move(Info.back());
       Info.pop_back();
       assert(!Info.empty() && "Removing last entry should be handled above");
       return;
@@ -712,8 +1096,8 @@ void Instruction::setAAMetadata(const AAMDNodes &N) {
 MDNode *Instruction::getMetadataImpl(unsigned KindID) const {
   // Handle 'dbg' as a special case since it is not stored in the hash table.
   if (KindID == LLVMContext::MD_dbg)
-    return DbgLoc.getAsMDNode(getContext());
-  
+    return DbgLoc.getAsMDNode();
+
   if (!hasMetadataHashEntry()) return nullptr;
   
   LLVMContextImpl::MDMapTy &Info = getContext().pImpl->MetadataStore[this];
@@ -731,8 +1115,8 @@ void Instruction::getAllMetadataImpl(
   
   // Handle 'dbg' as a special case since it is not stored in the hash table.
   if (!DbgLoc.isUnknown()) {
-    Result.push_back(std::make_pair((unsigned)LLVMContext::MD_dbg,
-                                    DbgLoc.getAsMDNode(getContext())));
+    Result.push_back(
+        std::make_pair((unsigned)LLVMContext::MD_dbg, DbgLoc.getAsMDNode()));
     if (!hasMetadataHashEntry()) return;
   }
   
@@ -743,7 +1127,9 @@ void Instruction::getAllMetadataImpl(
     getContext().pImpl->MetadataStore.find(this)->second;
   assert(!Info.empty() && "Shouldn't have called this");
 
-  Result.append(Info.begin(), Info.end());
+  Result.reserve(Result.size() + Info.size());
+  for (auto &I : Info)
+    Result.push_back(std::make_pair(I.first, cast<MDNode>(I.second.get())));
 
   // Sort the resulting array so it is stable.
   if (Result.size() > 1)
@@ -759,7 +1145,9 @@ void Instruction::getAllMetadataOtherThanDebugLocImpl(
   const LLVMContextImpl::MDMapTy &Info =
     getContext().pImpl->MetadataStore.find(this)->second;
   assert(!Info.empty() && "Shouldn't have called this");
-  Result.append(Info.begin(), Info.end());
+  Result.reserve(Result.size() + Info.size());
+  for (auto &I : Info)
+    Result.push_back(std::make_pair(I.first, cast<MDNode>(I.second.get())));
 
   // Sort the resulting array so it is stable.
   if (Result.size() > 1)
@@ -773,4 +1161,3 @@ void Instruction::clearMetadataHashEntries() {
   getContext().pImpl->MetadataStore.erase(this);
   setHasMetadataHashEntry(false);
 }
-
diff --git a/lib/IR/MetadataImpl.h b/lib/IR/MetadataImpl.h
new file mode 100644
index 0000000..662a50e
--- /dev/null
+++ b/lib/IR/MetadataImpl.h
@@ -0,0 +1,46 @@
+//===- MetadataImpl.h - Helpers for implementing metadata -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file has private helpers for implementing metadata types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_METADATAIMPL_H
+#define LLVM_IR_METADATAIMPL_H
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/Metadata.h"
+
+namespace llvm {
+
+template <class T, class InfoT>
+static T *getUniqued(DenseSet<T *, InfoT> &Store,
+                     const typename InfoT::KeyTy &Key) {
+  auto I = Store.find_as(Key);
+  return I == Store.end() ? nullptr : *I;
+}
+
+template <class T, class StoreT>
+T *MDNode::storeImpl(T *N, StorageType Storage, StoreT &Store) {
+  switch (Storage) {
+  case Uniqued:
+    Store.insert(N);
+    break;
+  case Distinct:
+    N->storeDistinctInContext();
+    break;
+  case Temporary:
+    break;
+  }
+  return N;
+}
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/IR/MetadataTracking.cpp b/lib/IR/MetadataTracking.cpp
new file mode 100644
index 0000000..47f0b93
--- /dev/null
+++ b/lib/IR/MetadataTracking.cpp
@@ -0,0 +1,55 @@
+//===- MetadataTracking.cpp - Implement metadata tracking -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Metadata tracking.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/MetadataTracking.h"
+#include "llvm/IR/Metadata.h"
+
+using namespace llvm;
+
+ReplaceableMetadataImpl *ReplaceableMetadataImpl::get(Metadata &MD) {
+  if (auto *N = dyn_cast<MDNode>(&MD))
+    return N->Context.getReplaceableUses();
+  return dyn_cast<ValueAsMetadata>(&MD);
+}
+
+bool MetadataTracking::track(void *Ref, Metadata &MD, OwnerTy Owner) {
+  assert(Ref && "Expected live reference");
+  assert((Owner || *static_cast<Metadata **>(Ref) == &MD) &&
+         "Reference without owner must be direct");
+  if (auto *R = ReplaceableMetadataImpl::get(MD)) {
+    R->addRef(Ref, Owner);
+    return true;
+  }
+  return false;
+}
+
+void MetadataTracking::untrack(void *Ref, Metadata &MD) {
+  assert(Ref && "Expected live reference");
+  if (auto *R = ReplaceableMetadataImpl::get(MD))
+    R->dropRef(Ref);
+}
+
+bool MetadataTracking::retrack(void *Ref, Metadata &MD, void *New) {
+  assert(Ref && "Expected live reference");
+  assert(New && "Expected live reference");
+  assert(Ref != New && "Expected change");
+  if (auto *R = ReplaceableMetadataImpl::get(MD)) {
+    R->moveRef(Ref, New, MD);
+    return true;
+  }
+  return false;
+}
+
+bool MetadataTracking::isReplaceable(const Metadata &MD) {
+  return ReplaceableMetadataImpl::get(const_cast<Metadata &>(MD));
+}
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index 14e534b..b0abe8c 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -22,7 +22,7 @@
 #include "llvm/IR/GVMaterializer.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LeakDetector.h"
+#include "llvm/IR/TypeFinder.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/RandomNumberGenerator.h"
@@ -46,7 +46,7 @@ template class llvm::SymbolTableListTraits<GlobalAlias, Module>;
 //
 
 Module::Module(StringRef MID, LLVMContext &C)
-    : Context(C), Materializer(), ModuleID(MID), RNG(nullptr), DL("") {
+    : Context(C), Materializer(), ModuleID(MID), DL("") {
   ValSymTab = new ValueSymbolTable();
   NamedMDSymTab = new StringMap<NamedMDNode *>();
   Context.addModule(this);
@@ -61,9 +61,27 @@ Module::~Module() {
   NamedMDList.clear();
   delete ValSymTab;
   delete static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab);
-  delete RNG;
 }
 
+RandomNumberGenerator *Module::createRNG(const Pass* P) const {
+  SmallString<32> Salt(P->getPassName());
+
+  // This RNG is guaranteed to produce the same random stream only
+  // when the Module ID and thus the input filename is the same. This
+  // might be problematic if the input filename extension changes
+  // (e.g. from .c to .bc or .ll).
+  //
+  // We could store this salt in NamedMetadata, but this would make
+  // the parameter non-const. This would unfortunately make this
+  // interface unusable by any Machine passes, since they only have a
+  // const reference to their IR Module. Alternatively we can always
+  // store salt metadata from the Module constructor.
+  Salt += sys::path::filename(getModuleIdentifier());
+
+  return new RandomNumberGenerator(Salt);
+}
+
+
 /// getNamedValue - Return the first global value in the module with
 /// the specified name, of arbitrary type.  This method returns null
 /// if a global with the specified name is not found.
@@ -259,8 +277,8 @@ void Module::eraseNamedMetadata(NamedMDNode *NMD) {
   NamedMDList.erase(NMD);
 }
 
-bool Module::isValidModFlagBehavior(Value *V, ModFlagBehavior &MFB) {
-  if (ConstantInt *Behavior = dyn_cast<ConstantInt>(V)) {
+bool Module::isValidModFlagBehavior(Metadata *MD, ModFlagBehavior &MFB) {
+  if (ConstantInt *Behavior = mdconst::dyn_extract_or_null<ConstantInt>(MD)) {
     uint64_t Val = Behavior->getLimitedValue();
     if (Val >= ModFlagBehaviorFirstVal && Val <= ModFlagBehaviorLastVal) {
       MFB = static_cast<ModFlagBehavior>(Val);
@@ -280,11 +298,11 @@ getModuleFlagsMetadata(SmallVectorImpl<ModuleFlagEntry> &Flags) const {
     ModFlagBehavior MFB;
     if (Flag->getNumOperands() >= 3 &&
         isValidModFlagBehavior(Flag->getOperand(0), MFB) &&
-        isa<MDString>(Flag->getOperand(1))) {
+        dyn_cast_or_null<MDString>(Flag->getOperand(1))) {
       // Check the operands of the MDNode before accessing the operands.
       // The verifier will actually catch these failures.
       MDString *Key = cast<MDString>(Flag->getOperand(1));
-      Value *Val = Flag->getOperand(2);
+      Metadata *Val = Flag->getOperand(2);
       Flags.push_back(ModuleFlagEntry(MFB, Key, Val));
     }
   }
@@ -292,7 +310,7 @@ getModuleFlagsMetadata(SmallVectorImpl<ModuleFlagEntry> &Flags) const {
 
 /// Return the corresponding value if Key appears in module flags, otherwise
 /// return null.
-Value *Module::getModuleFlag(StringRef Key) const {
+Metadata *Module::getModuleFlag(StringRef Key) const {
   SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
   getModuleFlagsMetadata(ModuleFlags);
   for (const ModuleFlagEntry &MFE : ModuleFlags) {
@@ -320,14 +338,18 @@ NamedMDNode *Module::getOrInsertModuleFlagsMetadata() {
 /// metadata. It will create the module-level flags named metadata if it doesn't
 /// already exist.
 void Module::addModuleFlag(ModFlagBehavior Behavior, StringRef Key,
-                           Value *Val) {
+                           Metadata *Val) {
   Type *Int32Ty = Type::getInt32Ty(Context);
-  Value *Ops[3] = {
-    ConstantInt::get(Int32Ty, Behavior), MDString::get(Context, Key), Val
-  };
+  Metadata *Ops[3] = {
+      ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Behavior)),
+      MDString::get(Context, Key), Val};
   getOrInsertModuleFlagsMetadata()->addOperand(MDNode::get(Context, Ops));
 }
 void Module::addModuleFlag(ModFlagBehavior Behavior, StringRef Key,
+                           Constant *Val) {
+  addModuleFlag(Behavior, Key, ConstantAsMetadata::get(Val));
+}
+void Module::addModuleFlag(ModFlagBehavior Behavior, StringRef Key,
                            uint32_t Val) {
   Type *Int32Ty = Type::getInt32Ty(Context);
   addModuleFlag(Behavior, Key, ConstantInt::get(Int32Ty, Val));
@@ -335,7 +357,7 @@ void Module::addModuleFlag(ModFlagBehavior Behavior, StringRef Key,
 void Module::addModuleFlag(MDNode *Node) {
   assert(Node->getNumOperands() == 3 &&
          "Invalid number of operands for module flag!");
-  assert(isa<ConstantInt>(Node->getOperand(0)) &&
+  assert(mdconst::hasa<ConstantInt>(Node->getOperand(0)) &&
          isa<MDString>(Node->getOperand(1)) &&
          "Invalid operand types for module flag!");
   getOrInsertModuleFlagsMetadata()->addOperand(Node);
@@ -369,16 +391,6 @@ const DataLayout *Module::getDataLayout() const {
   return &DL;
 }
 
-// We want reproducible builds, but ModuleID may be a full path so we just use
-// the filename to salt the RNG (although it is not guaranteed to be unique).
-RandomNumberGenerator &Module::getRNG() const {
-  if (RNG == nullptr) {
-    StringRef Salt = sys::path::filename(ModuleID);
-    RNG = new RandomNumberGenerator(Salt);
-  }
-  return *RNG;
-}
-
 //===----------------------------------------------------------------------===//
 // Methods to control the materialization of GlobalValues in the Module.
 //
@@ -425,6 +437,19 @@ std::error_code Module::materializeAllPermanently() {
 // Other module related stuff.
 //
 
+std::vector<StructType *> Module::getIdentifiedStructTypes() const {
+  // If we have a materializer, it is possible that some unread function
+  // uses a type that is currently not visible to a TypeFinder, so ask
+  // the materializer which types it created.
+  if (Materializer)
+    return Materializer->getIdentifiedStructTypes();
+
+  std::vector<StructType *> Ret;
+  TypeFinder SrcStructTypes;
+  SrcStructTypes.run(*this, true);
+  Ret.assign(SrcStructTypes.begin(), SrcStructTypes.end());
+  return Ret;
+}
 
 // dropAllReferences() - This function causes all the subelements to "let go"
 // of all references that they are maintaining.  This allows one to 'delete' a
@@ -445,10 +470,10 @@ void Module::dropAllReferences() {
 }
 
 unsigned Module::getDwarfVersion() const {
-  Value *Val = getModuleFlag("Dwarf Version");
+  auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("Dwarf Version"));
   if (!Val)
     return dwarf::DWARF_VERSION;
-  return cast<ConstantInt>(Val)->getZExtValue();
+  return cast<ConstantInt>(Val->getValue())->getZExtValue();
 }
 
 Comdat *Module::getOrInsertComdat(StringRef Name) {
@@ -458,12 +483,13 @@ Comdat *Module::getOrInsertComdat(StringRef Name) {
 }
 
 PICLevel::Level Module::getPICLevel() const {
-  Value *Val = getModuleFlag("PIC Level");
+  auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("PIC Level"));
 
   if (Val == NULL)
     return PICLevel::Default;
 
-  return static_cast<PICLevel::Level>(cast<ConstantInt>(Val)->getZExtValue());
+  return static_cast<PICLevel::Level>(
+      cast<ConstantInt>(Val->getValue())->getZExtValue());
 }
 
 void Module::setPICLevel(PICLevel::Level PL) {
diff --git a/lib/IR/Pass.cpp b/lib/IR/Pass.cpp
index 91d86ae..df45460 100644
--- a/lib/IR/Pass.cpp
+++ b/lib/IR/Pass.cpp
@@ -223,8 +223,8 @@ void PassRegistrationListener::enumeratePasses() {
   PassRegistry::getPassRegistry()->enumerateWith(this);
 }
 
-PassNameParser::PassNameParser()
-    : Opt(nullptr) {
+PassNameParser::PassNameParser(cl::Option &O)
+    : cl::parser<const PassInfo *>(O) {
   PassRegistry::getPassRegistry()->addRegistrationListener(this);
 }
 
diff --git a/lib/IR/PassManager.cpp b/lib/IR/PassManager.cpp
index 2e2a7cb..a5f407c 100644
--- a/lib/IR/PassManager.cpp
+++ b/lib/IR/PassManager.cpp
@@ -10,174 +10,13 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
 
 using namespace llvm;
 
-static cl::opt<bool>
-DebugPM("debug-pass-manager", cl::Hidden,
-        cl::desc("Print pass management debugging information"));
-
-PreservedAnalyses ModulePassManager::run(Module *M, ModuleAnalysisManager *AM) {
-  PreservedAnalyses PA = PreservedAnalyses::all();
-
-  if (DebugPM)
-    dbgs() << "Starting module pass manager run.\n";
-
-  for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx) {
-    if (DebugPM)
-      dbgs() << "Running module pass: " << Passes[Idx]->name() << "\n";
-
-    PreservedAnalyses PassPA = Passes[Idx]->run(M, AM);
-    if (AM)
-      AM->invalidate(M, PassPA);
-    PA.intersect(std::move(PassPA));
-
-    M->getContext().yield();
-  }
-
-  if (DebugPM)
-    dbgs() << "Finished module pass manager run.\n";
-
-  return PA;
-}
-
-ModuleAnalysisManager::ResultConceptT &
-ModuleAnalysisManager::getResultImpl(void *PassID, Module *M) {
-  ModuleAnalysisResultMapT::iterator RI;
-  bool Inserted;
-  std::tie(RI, Inserted) = ModuleAnalysisResults.insert(std::make_pair(
-      PassID, std::unique_ptr<detail::AnalysisResultConcept<Module *>>()));
-
-  // If we don't have a cached result for this module, look up the pass and run
-  // it to produce a result, which we then add to the cache.
-  if (Inserted)
-    RI->second = lookupPass(PassID).run(M, this);
-
-  return *RI->second;
-}
-
-ModuleAnalysisManager::ResultConceptT *
-ModuleAnalysisManager::getCachedResultImpl(void *PassID, Module *M) const {
-  ModuleAnalysisResultMapT::const_iterator RI =
-      ModuleAnalysisResults.find(PassID);
-  return RI == ModuleAnalysisResults.end() ? nullptr : &*RI->second;
-}
-
-void ModuleAnalysisManager::invalidateImpl(void *PassID, Module *M) {
-  ModuleAnalysisResults.erase(PassID);
-}
-
-void ModuleAnalysisManager::invalidateImpl(Module *M,
-                                           const PreservedAnalyses &PA) {
-  // FIXME: This is a total hack based on the fact that erasure doesn't
-  // invalidate iteration for DenseMap.
-  for (ModuleAnalysisResultMapT::iterator I = ModuleAnalysisResults.begin(),
-                                          E = ModuleAnalysisResults.end();
-       I != E; ++I)
-    if (I->second->invalidate(M, PA))
-      ModuleAnalysisResults.erase(I);
-}
-
-PreservedAnalyses FunctionPassManager::run(Function *F,
-                                           FunctionAnalysisManager *AM) {
-  PreservedAnalyses PA = PreservedAnalyses::all();
-
-  if (DebugPM)
-    dbgs() << "Starting function pass manager run.\n";
-
-  for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx) {
-    if (DebugPM)
-      dbgs() << "Running function pass: " << Passes[Idx]->name() << "\n";
-
-    PreservedAnalyses PassPA = Passes[Idx]->run(F, AM);
-    if (AM)
-      AM->invalidate(F, PassPA);
-    PA.intersect(std::move(PassPA));
-
-    F->getContext().yield();
-  }
-
-  if (DebugPM)
-    dbgs() << "Finished function pass manager run.\n";
-
-  return PA;
-}
-
-bool FunctionAnalysisManager::empty() const {
-  assert(FunctionAnalysisResults.empty() ==
-             FunctionAnalysisResultLists.empty() &&
-         "The storage and index of analysis results disagree on how many there "
-         "are!");
-  return FunctionAnalysisResults.empty();
-}
-
-void FunctionAnalysisManager::clear() {
-  FunctionAnalysisResults.clear();
-  FunctionAnalysisResultLists.clear();
-}
-
-FunctionAnalysisManager::ResultConceptT &
-FunctionAnalysisManager::getResultImpl(void *PassID, Function *F) {
-  FunctionAnalysisResultMapT::iterator RI;
-  bool Inserted;
-  std::tie(RI, Inserted) = FunctionAnalysisResults.insert(std::make_pair(
-      std::make_pair(PassID, F), FunctionAnalysisResultListT::iterator()));
-
-  // If we don't have a cached result for this function, look up the pass and
-  // run it to produce a result, which we then add to the cache.
-  if (Inserted) {
-    FunctionAnalysisResultListT &ResultList = FunctionAnalysisResultLists[F];
-    ResultList.emplace_back(PassID, lookupPass(PassID).run(F, this));
-    RI->second = std::prev(ResultList.end());
-  }
-
-  return *RI->second->second;
-}
-
-FunctionAnalysisManager::ResultConceptT *
-FunctionAnalysisManager::getCachedResultImpl(void *PassID, Function *F) const {
-  FunctionAnalysisResultMapT::const_iterator RI =
-      FunctionAnalysisResults.find(std::make_pair(PassID, F));
-  return RI == FunctionAnalysisResults.end() ? nullptr : &*RI->second->second;
-}
-
-void FunctionAnalysisManager::invalidateImpl(void *PassID, Function *F) {
-  FunctionAnalysisResultMapT::iterator RI =
-      FunctionAnalysisResults.find(std::make_pair(PassID, F));
-  if (RI == FunctionAnalysisResults.end())
-    return;
-
-  FunctionAnalysisResultLists[F].erase(RI->second);
-}
-
-void FunctionAnalysisManager::invalidateImpl(Function *F,
-                                             const PreservedAnalyses &PA) {
-  // Clear all the invalidated results associated specifically with this
-  // function.
-  SmallVector<void *, 8> InvalidatedPassIDs;
-  FunctionAnalysisResultListT &ResultsList = FunctionAnalysisResultLists[F];
-  for (FunctionAnalysisResultListT::iterator I = ResultsList.begin(),
-                                             E = ResultsList.end();
-       I != E;)
-    if (I->second->invalidate(F, PA)) {
-      InvalidatedPassIDs.push_back(I->first);
-      I = ResultsList.erase(I);
-    } else {
-      ++I;
-    }
-  while (!InvalidatedPassIDs.empty())
-    FunctionAnalysisResults.erase(
-        std::make_pair(InvalidatedPassIDs.pop_back_val(), F));
-  if (ResultsList.empty())
-    FunctionAnalysisResultLists.erase(F);
-}
-
 char FunctionAnalysisManagerModuleProxy::PassID;
 
 FunctionAnalysisManagerModuleProxy::Result
-FunctionAnalysisManagerModuleProxy::run(Module *M) {
+FunctionAnalysisManagerModuleProxy::run(Module &M) {
   assert(FAM->empty() && "Function analyses ran prior to the module proxy!");
   return Result(*FAM);
 }
@@ -189,7 +28,7 @@ FunctionAnalysisManagerModuleProxy::Result::~Result() {
 }
 
 bool FunctionAnalysisManagerModuleProxy::Result::invalidate(
-    Module *M, const PreservedAnalyses &PA) {
+    Module &M, const PreservedAnalyses &PA) {
   // If this proxy isn't marked as preserved, then we can't even invalidate
   // individual function analyses, there may be an invalid set of Function
   // objects in the cache making it impossible to incrementally preserve them.
diff --git a/lib/IR/Statepoint.cpp b/lib/IR/Statepoint.cpp
new file mode 100644
index 0000000..83ee611
--- /dev/null
+++ b/lib/IR/Statepoint.cpp
@@ -0,0 +1,77 @@
+//===-- IR/Statepoint.cpp -- gc.statepoint utilities ---  -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// 
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace std;
+using namespace llvm;
+
+bool llvm::isStatepoint(const ImmutableCallSite &CS) {
+  if (!CS.getInstruction()) {
+    // This is not a call site
+    return false;
+  }
+
+  const Function *F = CS.getCalledFunction();
+  return (F && F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint);
+}
+bool llvm::isStatepoint(const Value *inst) {
+  if (isa<InvokeInst>(inst) || isa<CallInst>(inst)) {
+    ImmutableCallSite CS(inst);
+    return isStatepoint(CS);
+  }
+  return false;
+}
+bool llvm::isStatepoint(const Value &inst) {
+  return isStatepoint(&inst);
+}
+
+bool llvm::isGCRelocate(const ImmutableCallSite &CS) {
+  if (!CS.getInstruction()) {
+    // This is not a call site
+    return false;
+  }
+
+  return isGCRelocate(CS.getInstruction());
+}
+bool llvm::isGCRelocate(const Value *inst) {
+  if (const CallInst *call = dyn_cast<CallInst>(inst)) {
+    if (const Function *F = call->getCalledFunction()) {
+      return F->getIntrinsicID() == Intrinsic::experimental_gc_relocate;
+    }
+  }
+  return false;
+}
+
+bool llvm::isGCResult(const ImmutableCallSite &CS) {
+  if (!CS.getInstruction()) {
+    // This is not a call site
+    return false;
+  }
+
+  return isGCResult(CS.getInstruction());
+}
+bool llvm::isGCResult(const Value *inst) {
+  if (const CallInst *call = dyn_cast<CallInst>(inst)) {
+    if (Function *F = call->getCalledFunction()) {
+      return (F->getIntrinsicID() == Intrinsic::experimental_gc_result_int ||
+              F->getIntrinsicID() == Intrinsic::experimental_gc_result_float ||
+              F->getIntrinsicID() == Intrinsic::experimental_gc_result_ptr ||
+              F->getIntrinsicID() == Intrinsic::experimental_gc_result);
+    }
+  }
+  return false;
+}
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index 0458b5f..65060dc 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -360,8 +360,7 @@ FunctionType *FunctionType::get(Type *ReturnType,
                                 ArrayRef<Type*> Params, bool isVarArg) {
   LLVMContextImpl *pImpl = ReturnType->getContext().pImpl;
   FunctionTypeKeyInfo::KeyTy Key(ReturnType, Params, isVarArg);
-  LLVMContextImpl::FunctionTypeMap::iterator I =
-    pImpl->FunctionTypes.find_as(Key);
+  auto I = pImpl->FunctionTypes.find_as(Key);
   FunctionType *FT;
 
   if (I == pImpl->FunctionTypes.end()) {
@@ -369,9 +368,9 @@ FunctionType *FunctionType::get(Type *ReturnType,
       Allocate(sizeof(FunctionType) + sizeof(Type*) * (Params.size() + 1),
                AlignOf<FunctionType>::Alignment);
     new (FT) FunctionType(ReturnType, Params, isVarArg);
-    pImpl->FunctionTypes[FT] = true;
+    pImpl->FunctionTypes.insert(FT);
   } else {
-    FT = I->first;
+    FT = *I;
   }
 
   return FT;
@@ -404,8 +403,7 @@ StructType *StructType::get(LLVMContext &Context, ArrayRef<Type*> ETypes,
                             bool isPacked) {
   LLVMContextImpl *pImpl = Context.pImpl;
   AnonStructTypeKeyInfo::KeyTy Key(ETypes, isPacked);
-  LLVMContextImpl::StructTypeMap::iterator I =
-    pImpl->AnonStructTypes.find_as(Key);
+  auto I = pImpl->AnonStructTypes.find_as(Key);
   StructType *ST;
 
   if (I == pImpl->AnonStructTypes.end()) {
@@ -413,9 +411,9 @@ StructType *StructType::get(LLVMContext &Context, ArrayRef<Type*> ETypes,
     ST = new (Context.pImpl->TypeAllocator) StructType(Context);
     ST->setSubclassData(SCDB_IsLiteral);  // Literal struct.
     ST->setBody(ETypes, isPacked);
-    Context.pImpl->AnonStructTypes[ST] = true;
+    Context.pImpl->AnonStructTypes.insert(ST);
   } else {
-    ST = I->first;
+    ST = *I;
   }
 
   return ST;
@@ -710,9 +708,10 @@ VectorType::VectorType(Type *ElType, unsigned NumEl)
 VectorType *VectorType::get(Type *elementType, unsigned NumElements) {
   Type *ElementType = const_cast<Type*>(elementType);
   assert(NumElements > 0 && "#Elements of a VectorType must be greater than 0");
-  assert(isValidElementType(ElementType) &&
-         "Elements of a VectorType must be a primitive type");
-  
+  assert(isValidElementType(ElementType) && "Element type of a VectorType must "
+                                            "be an integer, floating point, or "
+                                            "pointer type.");
+
   LLVMContextImpl *pImpl = ElementType->getContext().pImpl;
   VectorType *&Entry = ElementType->getContext().pImpl
     ->VectorTypes[std::make_pair(ElementType, NumElements)];
diff --git a/lib/IR/TypeFinder.cpp b/lib/IR/TypeFinder.cpp
index 6796075..e2fb8f8 100644
--- a/lib/IR/TypeFinder.cpp
+++ b/lib/IR/TypeFinder.cpp
@@ -47,6 +47,9 @@ void TypeFinder::run(const Module &M, bool onlyNamed) {
     if (FI->hasPrefixData())
       incorporateValue(FI->getPrefixData());
 
+    if (FI->hasPrologueData())
+      incorporateValue(FI->getPrologueData());
+
     // First incorporate the arguments.
     for (Function::const_arg_iterator AI = FI->arg_begin(),
            AE = FI->arg_end(); AI != AE; ++AI)
@@ -122,8 +125,13 @@ void TypeFinder::incorporateType(Type *Ty) {
 /// other ways.  GlobalValues, basic blocks, instructions, and inst operands are
 /// all explicitly enumerated.
 void TypeFinder::incorporateValue(const Value *V) {
-  if (const MDNode *M = dyn_cast<MDNode>(V))
-    return incorporateMDNode(M);
+  if (const auto *M = dyn_cast<MetadataAsValue>(V)) {
+    if (const auto *N = dyn_cast<MDNode>(M->getMetadata()))
+      return incorporateMDNode(N);
+    if (const auto *MDV = dyn_cast<ValueAsMetadata>(M->getMetadata()))
+      return incorporateValue(MDV->getValue());
+    return;
+  }
 
   if (!isa<Constant>(V) || isa<GlobalValue>(V)) return;
 
@@ -149,11 +157,21 @@ void TypeFinder::incorporateValue(const Value *V) {
 /// find types hiding within.
 void TypeFinder::incorporateMDNode(const MDNode *V) {
   // Already visited?
-  if (!VisitedConstants.insert(V).second)
+  if (!VisitedMetadata.insert(V).second)
     return;
 
   // Look in operands for types.
-  for (unsigned i = 0, e = V->getNumOperands(); i != e; ++i)
-    if (Value *Op = V->getOperand(i))
-      incorporateValue(Op);
+  for (unsigned i = 0, e = V->getNumOperands(); i != e; ++i) {
+    Metadata *Op = V->getOperand(i);
+    if (!Op)
+      continue;
+    if (auto *N = dyn_cast<MDNode>(Op)) {
+      incorporateMDNode(N);
+      continue;
+    }
+    if (auto *C = dyn_cast<ConstantAsMetadata>(Op)) {
+      incorporateValue(C->getValue());
+      continue;
+    }
+  }
 }
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 4e0c11f1..7d205f9 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -23,9 +23,10 @@
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/LeakDetector.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/Statepoint.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Support/Debug.h"
@@ -44,9 +45,8 @@ static inline Type *checkType(Type *Ty) {
 }
 
 Value::Value(Type *ty, unsigned scid)
-    : VTy(checkType(ty)), UseList(nullptr), Name(nullptr), SubclassID(scid),
-      HasValueHandle(0), SubclassOptionalData(0), SubclassData(0),
-      NumOperands(0) {
+    : VTy(checkType(ty)), UseList(nullptr), SubclassID(scid), HasValueHandle(0),
+      SubclassOptionalData(0), SubclassData(0), NumOperands(0) {
   // FIXME: Why isn't this in the subclass gunk??
   // Note, we cannot call isa<CallInst> before the CallInst has been
   // constructed.
@@ -63,6 +63,8 @@ Value::~Value() {
   // Notify all ValueHandles (if present) that this value is going away.
   if (HasValueHandle)
     ValueHandleBase::ValueIsDeleted(this);
+  if (isUsedByMetadata())
+    ValueAsMetadata::handleDeletion(this);
 
 #ifndef NDEBUG      // Only in -g mode...
   // Check to make sure that there are no uses of this value that are still
@@ -82,11 +84,14 @@ Value::~Value() {
 
   // If this value is named, destroy the name.  This should not be in a symtab
   // at this point.
-  if (Name && SubclassID != MDStringVal)
-    Name->Destroy();
+  destroyValueName();
+}
 
-  // There should be no uses of this object anymore, remove it.
-  LeakDetector::removeGarbageObject(this);
+void Value::destroyValueName() {
+  ValueName *Name = getValueName();
+  if (Name)
+    Name->Destroy();
+  setValueName(nullptr);
 }
 
 bool Value::hasNUses(unsigned N) const {
@@ -146,9 +151,7 @@ static bool getSymTab(Value *V, ValueSymbolTable *&ST) {
   } else if (Argument *A = dyn_cast<Argument>(V)) {
     if (Function *P = A->getParent())
       ST = &P->getValueSymbolTable();
-  } else if (isa<MDString>(V))
-    return true;
-  else {
+  } else {
     assert(isa<Constant>(V) && "Unknown value type!");
     return true;  // no name is setable for this.
   }
@@ -159,14 +162,12 @@ StringRef Value::getName() const {
   // Make sure the empty string is still a C string. For historical reasons,
   // some clients want to call .data() on the result and expect it to be null
   // terminated.
-  if (!Name) return StringRef("", 0);
-  return Name->getKey();
+  if (!getValueName())
+    return StringRef("", 0);
+  return getValueName()->getKey();
 }
 
 void Value::setName(const Twine &NewName) {
-  assert(SubclassID != MDStringVal &&
-         "Cannot set the name of MDString with this method!");
-
   // Fast path for common IRBuilder case of setName("") when there is no name.
   if (NewName.isTriviallyEmpty() && !hasName())
     return;
@@ -193,20 +194,17 @@ void Value::setName(const Twine &NewName) {
   if (!ST) { // No symbol table to update?  Just do the change.
     if (NameRef.empty()) {
       // Free the name for this value.
-      Name->Destroy();
-      Name = nullptr;
+      destroyValueName();
       return;
     }
 
-    if (Name)
-      Name->Destroy();
-
     // NOTE: Could optimize for the case the name is shrinking to not deallocate
     // then reallocated.
+    destroyValueName();
 
     // Create the new name.
-    Name = ValueName::Create(NameRef);
-    Name->setValue(this);
+    setValueName(ValueName::Create(NameRef));
+    getValueName()->setValue(this);
     return;
   }
 
@@ -214,21 +212,18 @@ void Value::setName(const Twine &NewName) {
   // then reallocated.
   if (hasName()) {
     // Remove old name.
-    ST->removeValueName(Name);
-    Name->Destroy();
-    Name = nullptr;
+    ST->removeValueName(getValueName());
+    destroyValueName();
 
     if (NameRef.empty())
       return;
   }
 
   // Name is changing to something new.
-  Name = ST->createValueName(NameRef, this);
+  setValueName(ST->createValueName(NameRef, this));
 }
 
 void Value::takeName(Value *V) {
-  assert(SubclassID != MDStringVal && "Cannot take the name of an MDString!");
-
   ValueSymbolTable *ST = nullptr;
   // If this value has a name, drop it.
   if (hasName()) {
@@ -242,9 +237,8 @@ void Value::takeName(Value *V) {
 
     // Remove old name.
     if (ST)
-      ST->removeValueName(Name);
-    Name->Destroy();
-    Name = nullptr;
+      ST->removeValueName(getValueName());
+    destroyValueName();
   }
 
   // Now we know that this has no name.
@@ -270,9 +264,9 @@ void Value::takeName(Value *V) {
   // This works even if both values have no symtab yet.
   if (ST == VST) {
     // Take the name!
-    Name = V->Name;
-    V->Name = nullptr;
-    Name->setValue(this);
+    setValueName(V->getValueName());
+    V->setValueName(nullptr);
+    getValueName()->setValue(this);
     return;
   }
 
@@ -280,10 +274,10 @@ void Value::takeName(Value *V) {
   // then reinsert it into ST.
 
   if (VST)
-    VST->removeValueName(V->Name);
-  Name = V->Name;
-  V->Name = nullptr;
-  Name->setValue(this);
+    VST->removeValueName(V->getValueName());
+  setValueName(V->getValueName());
+  V->setValueName(nullptr);
+  getValueName()->setValue(this);
 
   if (ST)
     ST->reinsertValue(this);
@@ -334,6 +328,8 @@ void Value::replaceAllUsesWith(Value *New) {
   // Notify all ValueHandles (if present) that this value is going away.
   if (HasValueHandle)
     ValueHandleBase::ValueIsRAUWd(this, New);
+  if (isUsedByMetadata())
+    ValueAsMetadata::handleRAUW(this, New);
 
   while (!use_empty()) {
     Use &U = *UseList;
@@ -353,6 +349,28 @@ void Value::replaceAllUsesWith(Value *New) {
     BB->replaceSuccessorsPhiUsesWith(cast<BasicBlock>(New));
 }
 
+// Like replaceAllUsesWith except it does not handle constants or basic blocks.
+// This routine leaves uses within BB.
+void Value::replaceUsesOutsideBlock(Value *New, BasicBlock *BB) {
+  assert(New && "Value::replaceUsesOutsideBlock(<null>, BB) is invalid!");
+  assert(!contains(New, this) &&
+         "this->replaceUsesOutsideBlock(expr(this), BB) is NOT valid!");
+  assert(New->getType() == getType() &&
+         "replaceUses of value with new value of different type!");
+  assert(BB && "Basic block that may contain a use of 'New' must be defined\n");
+
+  use_iterator UI = use_begin(), E = use_end();
+  for (; UI != E;) {
+    Use &U = *UI;
+    ++UI;
+    auto *Usr = dyn_cast<Instruction>(U.getUser());
+    if (Usr && Usr->getParent() == BB)
+      continue;
+    U.set(New);
+  }
+  return;
+}
+
 namespace {
 // Various metrics for how much to strip off of pointers.
 enum PointerStripKind {
@@ -480,7 +498,7 @@ static bool isDereferenceablePointer(const Value *V, const DataLayout *DL,
   // is at least as large as for the resulting pointer type, then
   // we can look through the bitcast.
   if (DL)
-    if (const BitCastInst* BC = dyn_cast<BitCastInst>(V)) {
+    if (const BitCastOperator *BC = dyn_cast<BitCastOperator>(V)) {
       Type *STy = BC->getSrcTy()->getPointerElementType(),
            *DTy = BC->getDestTy()->getPointerElementType();
       if (STy->isSized() && DTy->isSized() &&
@@ -554,6 +572,13 @@ static bool isDereferenceablePointer(const Value *V, const DataLayout *DL,
     return true;
   }
 
+  // For gc.relocate, look through relocations
+  if (const IntrinsicInst *I = dyn_cast<IntrinsicInst>(V))
+    if (I->getIntrinsicID() == Intrinsic::experimental_gc_relocate) {
+      GCRelocateOperands RelocateInst(I);
+      return isDereferenceablePointer(RelocateInst.derivedPtr(), DL, Visited);
+    }
+
   if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(V))
     return isDereferenceablePointer(ASC->getOperand(0), DL, Visited);
 
@@ -629,7 +654,7 @@ void ValueHandleBase::AddToExistingUseList(ValueHandleBase **List) {
   setPrevPtr(List);
   if (Next) {
     Next->setPrevPtr(&Next);
-    assert(VP.getPointer() == Next->VP.getPointer() && "Added to wrong list?");
+    assert(V == Next->V && "Added to wrong list?");
   }
 }
 
@@ -644,14 +669,14 @@ void ValueHandleBase::AddToExistingUseListAfter(ValueHandleBase *List) {
 }
 
 void ValueHandleBase::AddToUseList() {
-  assert(VP.getPointer() && "Null pointer doesn't have a use list!");
+  assert(V && "Null pointer doesn't have a use list!");
 
-  LLVMContextImpl *pImpl = VP.getPointer()->getContext().pImpl;
+  LLVMContextImpl *pImpl = V->getContext().pImpl;
 
-  if (VP.getPointer()->HasValueHandle) {
+  if (V->HasValueHandle) {
     // If this value already has a ValueHandle, then it must be in the
     // ValueHandles map already.
-    ValueHandleBase *&Entry = pImpl->ValueHandles[VP.getPointer()];
+    ValueHandleBase *&Entry = pImpl->ValueHandles[V];
     assert(Entry && "Value doesn't have any handles?");
     AddToExistingUseList(&Entry);
     return;
@@ -665,10 +690,10 @@ void ValueHandleBase::AddToUseList() {
   DenseMap<Value*, ValueHandleBase*> &Handles = pImpl->ValueHandles;
   const void *OldBucketPtr = Handles.getPointerIntoBucketsArray();
 
-  ValueHandleBase *&Entry = Handles[VP.getPointer()];
+  ValueHandleBase *&Entry = Handles[V];
   assert(!Entry && "Value really did already have handles?");
   AddToExistingUseList(&Entry);
-  VP.getPointer()->HasValueHandle = true;
+  V->HasValueHandle = true;
 
   // If reallocation didn't happen or if this was the first insertion, don't
   // walk the table.
@@ -680,14 +705,14 @@ void ValueHandleBase::AddToUseList() {
   // Okay, reallocation did happen.  Fix the Prev Pointers.
   for (DenseMap<Value*, ValueHandleBase*>::iterator I = Handles.begin(),
        E = Handles.end(); I != E; ++I) {
-    assert(I->second && I->first == I->second->VP.getPointer() &&
+    assert(I->second && I->first == I->second->V &&
            "List invariant broken!");
     I->second->setPrevPtr(&I->second);
   }
 }
 
 void ValueHandleBase::RemoveFromUseList() {
-  assert(VP.getPointer() && VP.getPointer()->HasValueHandle &&
+  assert(V && V->HasValueHandle &&
          "Pointer doesn't have a use list!");
 
   // Unlink this from its use list.
@@ -704,11 +729,11 @@ void ValueHandleBase::RemoveFromUseList() {
   // If the Next pointer was null, then it is possible that this was the last
   // ValueHandle watching VP.  If so, delete its entry from the ValueHandles
   // map.
-  LLVMContextImpl *pImpl = VP.getPointer()->getContext().pImpl;
+  LLVMContextImpl *pImpl = V->getContext().pImpl;
   DenseMap<Value*, ValueHandleBase*> &Handles = pImpl->ValueHandles;
   if (Handles.isPointerIntoBucketsArray(PrevPtr)) {
-    Handles.erase(VP.getPointer());
-    VP.getPointer()->HasValueHandle = false;
+    Handles.erase(V);
+    V->HasValueHandle = false;
   }
 }
 
diff --git a/lib/IR/ValueSymbolTable.cpp b/lib/IR/ValueSymbolTable.cpp
index 2b23f6d..4f078f0 100644
--- a/lib/IR/ValueSymbolTable.cpp
+++ b/lib/IR/ValueSymbolTable.cpp
@@ -38,8 +38,8 @@ void ValueSymbolTable::reinsertValue(Value* V) {
   assert(V->hasName() && "Can't insert nameless Value into symbol table");
 
   // Try inserting the name, assuming it won't conflict.
-  if (vmap.insert(V->Name)) {
-    //DEBUG(dbgs() << " Inserted value: " << V->Name << ": " << *V << "\n");
+  if (vmap.insert(V->getValueName())) {
+    //DEBUG(dbgs() << " Inserted value: " << V->getValueName() << ": " << *V << "\n");
     return;
   }
   
@@ -47,8 +47,8 @@ void ValueSymbolTable::reinsertValue(Value* V) {
   SmallString<256> UniqueName(V->getName().begin(), V->getName().end());
 
   // The name is too already used, just free it so we can allocate a new name.
-  V->Name->Destroy();
-  
+  V->getValueName()->Destroy();
+
   unsigned BaseSize = UniqueName.size();
   while (1) {
     // Trim any suffix off and append the next number.
@@ -59,7 +59,7 @@ void ValueSymbolTable::reinsertValue(Value* V) {
     auto IterBool = vmap.insert(std::make_pair(UniqueName, V));
     if (IterBool.second) {
       // Newly inserted name.  Success!
-      V->Name = &*IterBool.first;
+      V->setValueName(&*IterBool.first);
      //DEBUG(dbgs() << " Inserted value: " << UniqueName << ": " << *V << "\n");
       return;
     }
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 9698dbd..d01e138 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -68,6 +68,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/Statepoint.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -101,6 +102,13 @@ struct VerifierSupport {
     }
   }
 
+  void WriteMetadata(const Metadata *MD) {
+    if (!MD)
+      return;
+    MD->printAsOperand(OS, true, M);
+    OS << '\n';
+  }
+
   void WriteType(Type *T) {
     if (!T)
       return;
@@ -127,6 +135,24 @@ struct VerifierSupport {
     Broken = true;
   }
 
+  void CheckFailed(const Twine &Message, const Metadata *V1, const Metadata *V2,
+                   const Metadata *V3 = nullptr, const Metadata *V4 = nullptr) {
+    OS << Message.str() << "\n";
+    WriteMetadata(V1);
+    WriteMetadata(V2);
+    WriteMetadata(V3);
+    WriteMetadata(V4);
+    Broken = true;
+  }
+
+  void CheckFailed(const Twine &Message, const Metadata *V1,
+                   const Value *V2 = nullptr) {
+    OS << Message.str() << "\n";
+    WriteMetadata(V1);
+    WriteValue(V2);
+    Broken = true;
+  }
+
   void CheckFailed(const Twine &Message, const Value *V1, Type *T2,
                    const Value *V3 = nullptr) {
     OS << Message.str() << "\n";
@@ -155,7 +181,6 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
   friend class InstVisitor<Verifier>;
 
   LLVMContext *Context;
-  const DataLayout *DL;
   DominatorTree DT;
 
   /// \brief When verifying a basic block, keep track of all of the
@@ -166,17 +191,21 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
   SmallPtrSet<Instruction *, 16> InstsInThisBlock;
 
   /// \brief Keep track of the metadata nodes that have been checked already.
-  SmallPtrSet<MDNode *, 32> MDNodes;
+  SmallPtrSet<const Metadata *, 32> MDNodes;
 
   /// \brief The personality function referenced by the LandingPadInsts.
   /// All LandingPadInsts within the same function must use the same
   /// personality function.
   const Value *PersonalityFn;
 
+  /// \brief Whether we've seen a call to @llvm.frameallocate in this function
+  /// already.
+  bool SawFrameAllocate;
+
 public:
   explicit Verifier(raw_ostream &OS = dbgs())
-      : VerifierSupport(OS), Context(nullptr), DL(nullptr),
-        PersonalityFn(nullptr) {}
+      : VerifierSupport(OS), Context(nullptr), PersonalityFn(nullptr),
+        SawFrameAllocate(false) {}
 
   bool verify(const Function &F) {
     M = F.getParent();
@@ -211,6 +240,7 @@ public:
     visit(const_cast<Function &>(F));
     InstsInThisBlock.clear();
     PersonalityFn = nullptr;
+    SawFrameAllocate = false;
 
     return !Broken;
   }
@@ -260,7 +290,9 @@ private:
   void visitAliaseeSubExpr(SmallPtrSetImpl<const GlobalAlias *> &Visited,
                            const GlobalAlias &A, const Constant &C);
   void visitNamedMDNode(const NamedMDNode &NMD);
-  void visitMDNode(MDNode &MD, Function *F);
+  void visitMDNode(const MDNode &MD);
+  void visitMetadataAsValue(const MetadataAsValue &MD, Function *F);
+  void visitValueAsMetadata(const ValueAsMetadata &MD, Function *F);
   void visitComdat(const Comdat &C);
   void visitModuleIdents(const Module &M);
   void visitModuleFlags(const Module &M);
@@ -271,6 +303,8 @@ private:
   void visitBasicBlock(BasicBlock &BB);
   void visitRangeMetadata(Instruction& I, MDNode* Range, Type* Ty);
 
+#define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N);
+#include "llvm/IR/Metadata.def"
 
   // InstVisitor overrides...
   using InstVisitor<Verifier>::visit;
@@ -337,8 +371,8 @@ private:
   void VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
                            const Value *V);
 
-  void VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy);
   void VerifyConstantExprBitcastType(const ConstantExpr *CE);
+  void VerifyStatepoint(ImmutableCallSite CS);
 };
 class DebugInfoVerifier : public VerifierSupport {
 public:
@@ -484,8 +518,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
       continue;
 
     if (const User *U = dyn_cast<User>(V)) {
-      for (unsigned I = 0, N = U->getNumOperands(); I != N; ++I)
-        WorkStack.push_back(U->getOperand(I));
+      WorkStack.append(U->op_begin(), U->op_end());
     }
 
     if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
@@ -559,59 +592,210 @@ void Verifier::visitNamedMDNode(const NamedMDNode &NMD) {
     if (!MD)
       continue;
 
-    Assert1(!MD->isFunctionLocal(),
-            "Named metadata operand cannot be function local!", MD);
-    visitMDNode(*MD, nullptr);
+    visitMDNode(*MD);
   }
 }
 
-void Verifier::visitMDNode(MDNode &MD, Function *F) {
+void Verifier::visitMDNode(const MDNode &MD) {
   // Only visit each node once.  Metadata can be mutually recursive, so this
   // avoids infinite recursion here, as well as being an optimization.
   if (!MDNodes.insert(&MD).second)
     return;
 
+  switch (MD.getMetadataID()) {
+  default:
+    llvm_unreachable("Invalid MDNode subclass");
+  case Metadata::MDTupleKind:
+    break;
+#define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS)                                  \
+  case Metadata::CLASS##Kind:                                                  \
+    visit##CLASS(cast<CLASS>(MD));                                             \
+    break;
+#include "llvm/IR/Metadata.def"
+  }
+
   for (unsigned i = 0, e = MD.getNumOperands(); i != e; ++i) {
-    Value *Op = MD.getOperand(i);
+    Metadata *Op = MD.getOperand(i);
     if (!Op)
       continue;
-    if (isa<Constant>(Op) || isa<MDString>(Op))
+    Assert2(!isa<LocalAsMetadata>(Op), "Invalid operand for global metadata!",
+            &MD, Op);
+    if (auto *N = dyn_cast<MDNode>(Op)) {
+      visitMDNode(*N);
       continue;
-    if (MDNode *N = dyn_cast<MDNode>(Op)) {
-      Assert2(MD.isFunctionLocal() || !N->isFunctionLocal(),
-              "Global metadata operand cannot be function local!", &MD, N);
-      visitMDNode(*N, F);
+    }
+    if (auto *V = dyn_cast<ValueAsMetadata>(Op)) {
+      visitValueAsMetadata(*V, nullptr);
       continue;
     }
-    Assert2(MD.isFunctionLocal(), "Invalid operand for global metadata!", &MD, Op);
-
-    // If this was an instruction, bb, or argument, verify that it is in the
-    // function that we expect.
-    Function *ActualF = nullptr;
-    if (Instruction *I = dyn_cast<Instruction>(Op))
-      ActualF = I->getParent()->getParent();
-    else if (BasicBlock *BB = dyn_cast<BasicBlock>(Op))
-      ActualF = BB->getParent();
-    else if (Argument *A = dyn_cast<Argument>(Op))
-      ActualF = A->getParent();
-    assert(ActualF && "Unimplemented function local metadata case!");
-
-    Assert2(ActualF == F, "function-local metadata used in wrong function",
-            &MD, Op);
   }
+
+  // Check these last, so we diagnose problems in operands first.
+  Assert1(!MD.isTemporary(), "Expected no forward declarations!", &MD);
+  Assert1(MD.isResolved(), "All nodes should be resolved!", &MD);
+}
+
+void Verifier::visitValueAsMetadata(const ValueAsMetadata &MD, Function *F) {
+  Assert1(MD.getValue(), "Expected valid value", &MD);
+  Assert2(!MD.getValue()->getType()->isMetadataTy(),
+          "Unexpected metadata round-trip through values", &MD, MD.getValue());
+
+  auto *L = dyn_cast<LocalAsMetadata>(&MD);
+  if (!L)
+    return;
+
+  Assert1(F, "function-local metadata used outside a function", L);
+
+  // If this was an instruction, bb, or argument, verify that it is in the
+  // function that we expect.
+  Function *ActualF = nullptr;
+  if (Instruction *I = dyn_cast<Instruction>(L->getValue())) {
+    Assert2(I->getParent(), "function-local metadata not in basic block", L, I);
+    ActualF = I->getParent()->getParent();
+  } else if (BasicBlock *BB = dyn_cast<BasicBlock>(L->getValue()))
+    ActualF = BB->getParent();
+  else if (Argument *A = dyn_cast<Argument>(L->getValue()))
+    ActualF = A->getParent();
+  assert(ActualF && "Unimplemented function local metadata case!");
+
+  Assert1(ActualF == F, "function-local metadata used in wrong function", L);
+}
+
+void Verifier::visitMetadataAsValue(const MetadataAsValue &MDV, Function *F) {
+  Metadata *MD = MDV.getMetadata();
+  if (auto *N = dyn_cast<MDNode>(MD)) {
+    visitMDNode(*N);
+    return;
+  }
+
+  // Only visit each node once.  Metadata can be mutually recursive, so this
+  // avoids infinite recursion here, as well as being an optimization.
+  if (!MDNodes.insert(MD).second)
+    return;
+
+  if (auto *V = dyn_cast<ValueAsMetadata>(MD))
+    visitValueAsMetadata(*V, F);
+}
+
+void Verifier::visitMDLocation(const MDLocation &N) {
+  Assert1(N.getScope(), "location requires a valid scope", &N);
+  if (auto *IA = N.getInlinedAt())
+    Assert2(isa<MDLocation>(IA), "inlined-at should be a location", &N, IA);
+}
+
+void Verifier::visitGenericDebugNode(const GenericDebugNode &N) {
+  Assert1(N.getTag(), "invalid tag", &N);
+}
+
+void Verifier::visitMDSubrange(const MDSubrange &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_subrange_type, "invalid tag", &N);
+}
+
+void Verifier::visitMDEnumerator(const MDEnumerator &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_enumerator, "invalid tag", &N);
+}
+
+void Verifier::visitMDBasicType(const MDBasicType &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_base_type ||
+              N.getTag() == dwarf::DW_TAG_unspecified_type,
+          "invalid tag", &N);
+}
+
+void Verifier::visitMDDerivedType(const MDDerivedType &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_typedef ||
+              N.getTag() == dwarf::DW_TAG_pointer_type ||
+              N.getTag() == dwarf::DW_TAG_ptr_to_member_type ||
+              N.getTag() == dwarf::DW_TAG_reference_type ||
+              N.getTag() == dwarf::DW_TAG_rvalue_reference_type ||
+              N.getTag() == dwarf::DW_TAG_const_type ||
+              N.getTag() == dwarf::DW_TAG_volatile_type ||
+              N.getTag() == dwarf::DW_TAG_restrict_type ||
+              N.getTag() == dwarf::DW_TAG_member ||
+              N.getTag() == dwarf::DW_TAG_inheritance ||
+              N.getTag() == dwarf::DW_TAG_friend,
+          "invalid tag", &N);
+}
+
+void Verifier::visitMDCompositeType(const MDCompositeType &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_array_type ||
+              N.getTag() == dwarf::DW_TAG_structure_type ||
+              N.getTag() == dwarf::DW_TAG_union_type ||
+              N.getTag() == dwarf::DW_TAG_enumeration_type ||
+              N.getTag() == dwarf::DW_TAG_subroutine_type ||
+              N.getTag() == dwarf::DW_TAG_class_type,
+          "invalid tag", &N);
+}
+
+void Verifier::visitMDSubroutineType(const MDSubroutineType &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_subroutine_type, "invalid tag", &N);
+}
+
+void Verifier::visitMDFile(const MDFile &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_file_type, "invalid tag", &N);
+}
+
+void Verifier::visitMDCompileUnit(const MDCompileUnit &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_compile_unit, "invalid tag", &N);
+}
+
+void Verifier::visitMDSubprogram(const MDSubprogram &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_subprogram, "invalid tag", &N);
+}
+
+void Verifier::visitMDLexicalBlock(const MDLexicalBlock &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_lexical_block, "invalid tag", &N);
+}
+
+void Verifier::visitMDLexicalBlockFile(const MDLexicalBlockFile &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_lexical_block, "invalid tag", &N);
+}
+
+void Verifier::visitMDNamespace(const MDNamespace &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_namespace, "invalid tag", &N);
+}
+
+void Verifier::visitMDTemplateTypeParameter(const MDTemplateTypeParameter &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_template_type_parameter, "invalid tag",
+          &N);
+}
+
+void Verifier::visitMDTemplateValueParameter(
+    const MDTemplateValueParameter &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_template_value_parameter ||
+              N.getTag() == dwarf::DW_TAG_GNU_template_template_param ||
+              N.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack,
+          "invalid tag", &N);
+}
+
+void Verifier::visitMDGlobalVariable(const MDGlobalVariable &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N);
+}
+
+void Verifier::visitMDLocalVariable(const MDLocalVariable &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_auto_variable ||
+              N.getTag() == dwarf::DW_TAG_arg_variable,
+          "invalid tag", &N);
+}
+
+void Verifier::visitMDExpression(const MDExpression &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_expression, "invalid tag", &N);
+  Assert1(N.isValid(), "invalid expression", &N);
+}
+
+void Verifier::visitMDObjCProperty(const MDObjCProperty &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_APPLE_property, "invalid tag", &N);
+}
+
+void Verifier::visitMDImportedEntity(const MDImportedEntity &N) {
+  Assert1(N.getTag() == dwarf::DW_TAG_imported_module ||
+              N.getTag() == dwarf::DW_TAG_imported_declaration,
+          "invalid tag", &N);
 }
 
 void Verifier::visitComdat(const Comdat &C) {
-  // All Comdat::SelectionKind values other than Comdat::Any require a
-  // GlobalValue with the same name as the Comdat.
-  const GlobalValue *GV = M->getNamedValue(C.getName());
-  if (C.getSelectionKind() != Comdat::Any)
-    Assert1(GV,
-            "comdat selection kind requires a global value with the same name",
-            &C);
   // The Module is invalid if the GlobalValue has private linkage.  Entities
   // with private linkage don't have entries in the symbol table.
-  if (GV)
+  if (const GlobalValue *GV = M->getNamedValue(C.getName()))
     Assert1(!GV->hasPrivateLinkage(), "comdat global value has private linkage",
             GV);
 }
@@ -627,7 +811,7 @@ void Verifier::visitModuleIdents(const Module &M) {
     const MDNode *N = Idents->getOperand(i);
     Assert1(N->getNumOperands() == 1,
             "incorrect number of operands in llvm.ident metadata", N);
-    Assert1(isa<MDString>(N->getOperand(0)),
+    Assert1(dyn_cast_or_null<MDString>(N->getOperand(0)),
             ("invalid value for llvm.ident metadata entry operand"
              "(the operand should be a string)"),
             N->getOperand(0));
@@ -649,7 +833,7 @@ void Verifier::visitModuleFlags(const Module &M) {
   for (unsigned I = 0, E = Requirements.size(); I != E; ++I) {
     const MDNode *Requirement = Requirements[I];
     const MDString *Flag = cast<MDString>(Requirement->getOperand(0));
-    const Value *ReqValue = Requirement->getOperand(1);
+    const Metadata *ReqValue = Requirement->getOperand(1);
 
     const MDNode *Op = SeenIDs.lookup(Flag);
     if (!Op) {
@@ -678,14 +862,14 @@ Verifier::visitModuleFlag(const MDNode *Op,
   Module::ModFlagBehavior MFB;
   if (!Module::isValidModFlagBehavior(Op->getOperand(0), MFB)) {
     Assert1(
-        dyn_cast<ConstantInt>(Op->getOperand(0)),
+        mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(0)),
         "invalid behavior operand in module flag (expected constant integer)",
         Op->getOperand(0));
     Assert1(false,
             "invalid behavior operand in module flag (unexpected constant)",
             Op->getOperand(0));
   }
-  MDString *ID = dyn_cast<MDString>(Op->getOperand(1));
+  MDString *ID = dyn_cast_or_null<MDString>(Op->getOperand(1));
   Assert1(ID,
           "invalid ID operand in module flag (expected metadata string)",
           Op->getOperand(1));
@@ -960,48 +1144,13 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
   }
 }
 
-void Verifier::VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy) {
-  // Get the size of the types in bits, we'll need this later
-  unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
-  unsigned DestBitSize = DestTy->getPrimitiveSizeInBits();
-
-  // BitCast implies a no-op cast of type only. No bits change.
-  // However, you can't cast pointers to anything but pointers.
-  Assert1(SrcTy->isPointerTy() == DestTy->isPointerTy(),
-          "Bitcast requires both operands to be pointer or neither", V);
-  Assert1(SrcBitSize == DestBitSize,
-          "Bitcast requires types of same width", V);
-
-  // Disallow aggregates.
-  Assert1(!SrcTy->isAggregateType(),
-          "Bitcast operand must not be aggregate", V);
-  Assert1(!DestTy->isAggregateType(),
-          "Bitcast type must not be aggregate", V);
-
-  // Without datalayout, assume all address spaces are the same size.
-  // Don't check if both types are not pointers.
-  // Skip casts between scalars and vectors.
-  if (!DL ||
-      !SrcTy->isPtrOrPtrVectorTy() ||
-      !DestTy->isPtrOrPtrVectorTy() ||
-      SrcTy->isVectorTy() != DestTy->isVectorTy()) {
+void Verifier::VerifyConstantExprBitcastType(const ConstantExpr *CE) {
+  if (CE->getOpcode() != Instruction::BitCast)
     return;
-  }
-
-  unsigned SrcAS = SrcTy->getPointerAddressSpace();
-  unsigned DstAS = DestTy->getPointerAddressSpace();
-
-  Assert1(SrcAS == DstAS,
-          "Bitcasts between pointers of different address spaces is not legal."
-          "Use AddrSpaceCast instead.", V);
-}
 
-void Verifier::VerifyConstantExprBitcastType(const ConstantExpr *CE) {
-  if (CE->getOpcode() == Instruction::BitCast) {
-    Type *SrcTy = CE->getOperand(0)->getType();
-    Type *DstTy = CE->getType();
-    VerifyBitcastType(CE, DstTy, SrcTy);
-  }
+  Assert1(CastInst::castIsValid(Instruction::BitCast, CE->getOperand(0),
+                                CE->getType()),
+          "Invalid bitcast", CE);
 }
 
 bool Verifier::VerifyAttributeCount(AttributeSet Attrs, unsigned Params) {
@@ -1018,6 +1167,105 @@ bool Verifier::VerifyAttributeCount(AttributeSet Attrs, unsigned Params) {
   return false;
 }
 
+/// \brief Verify that statepoint intrinsic is well formed.
+void Verifier::VerifyStatepoint(ImmutableCallSite CS) {
+  assert(CS.getCalledFunction() &&
+         CS.getCalledFunction()->getIntrinsicID() ==
+           Intrinsic::experimental_gc_statepoint);
+
+  const Instruction &CI = *CS.getInstruction();
+
+  Assert1(!CS.doesNotAccessMemory() &&
+          !CS.onlyReadsMemory(),
+          "gc.statepoint must read and write memory to preserve "
+          "reordering restrictions required by safepoint semantics", &CI);
+    
+  const Value *Target = CS.getArgument(0);
+  const PointerType *PT = dyn_cast<PointerType>(Target->getType());
+  Assert2(PT && PT->getElementType()->isFunctionTy(),
+          "gc.statepoint callee must be of function pointer type",
+          &CI, Target);
+  FunctionType *TargetFuncType = cast<FunctionType>(PT->getElementType());
+
+  const Value *NumCallArgsV = CS.getArgument(1);
+  Assert1(isa<ConstantInt>(NumCallArgsV),
+          "gc.statepoint number of arguments to underlying call "
+          "must be constant integer", &CI);
+  const int NumCallArgs = cast<ConstantInt>(NumCallArgsV)->getZExtValue();
+  Assert1(NumCallArgs >= 0,
+          "gc.statepoint number of arguments to underlying call "
+          "must be positive", &CI);
+  const int NumParams = (int)TargetFuncType->getNumParams();
+  if (TargetFuncType->isVarArg()) {
+    Assert1(NumCallArgs >= NumParams,
+            "gc.statepoint mismatch in number of vararg call args", &CI);
+
+    // TODO: Remove this limitation
+    Assert1(TargetFuncType->getReturnType()->isVoidTy(),
+            "gc.statepoint doesn't support wrapping non-void "
+            "vararg functions yet", &CI);
+  } else
+    Assert1(NumCallArgs == NumParams,
+            "gc.statepoint mismatch in number of call args", &CI);
+
+  const Value *Unused = CS.getArgument(2);
+  Assert1(isa<ConstantInt>(Unused) &&
+          cast<ConstantInt>(Unused)->isNullValue(),
+          "gc.statepoint parameter #3 must be zero", &CI);
+
+  // Verify that the types of the call parameter arguments match
+  // the type of the wrapped callee.
+  for (int i = 0; i < NumParams; i++) {
+    Type *ParamType = TargetFuncType->getParamType(i);
+    Type *ArgType = CS.getArgument(3+i)->getType();
+    Assert1(ArgType == ParamType,
+            "gc.statepoint call argument does not match wrapped "
+            "function type", &CI);
+  }
+  const int EndCallArgsInx = 2+NumCallArgs;
+  const Value *NumDeoptArgsV = CS.getArgument(EndCallArgsInx+1);
+  Assert1(isa<ConstantInt>(NumDeoptArgsV),
+          "gc.statepoint number of deoptimization arguments "
+          "must be constant integer", &CI);
+  const int NumDeoptArgs = cast<ConstantInt>(NumDeoptArgsV)->getZExtValue();
+  Assert1(NumDeoptArgs >= 0,
+          "gc.statepoint number of deoptimization arguments "
+          "must be positive", &CI);
+
+  Assert1(4 + NumCallArgs + NumDeoptArgs <= (int)CS.arg_size(),
+          "gc.statepoint too few arguments according to length fields", &CI);
+    
+  // Check that the only uses of this gc.statepoint are gc.result or 
+  // gc.relocate calls which are tied to this statepoint and thus part
+  // of the same statepoint sequence
+  for (const User *U : CI.users()) {
+    const CallInst *Call = dyn_cast<const CallInst>(U);
+    Assert2(Call, "illegal use of statepoint token", &CI, U);
+    if (!Call) continue;
+    Assert2(isGCRelocate(Call) || isGCResult(Call),
+            "gc.result or gc.relocate are the only value uses"
+            "of a gc.statepoint", &CI, U);
+    if (isGCResult(Call)) {
+      Assert2(Call->getArgOperand(0) == &CI,
+              "gc.result connected to wrong gc.statepoint",
+              &CI, Call);
+    } else if (isGCRelocate(Call)) {
+      Assert2(Call->getArgOperand(0) == &CI,
+              "gc.relocate connected to wrong gc.statepoint",
+              &CI, Call);
+    }
+  }
+
+  // Note: It is legal for a single derived pointer to be listed multiple
+  // times.  It's non-optimal, but it is legal.  It can also happen after
+  // insertion if we strip a bitcast away.
+  // Note: It is really tempting to check that each base is relocated and
+  // that a derived pointer is never reused as a base pointer.  This turns
+  // out to be problematic since optimizations run after safepoint insertion
+  // can recognize equality properties that the insertion logic doesn't know
+  // about.  See example statepoint.ll in the verifier subdirectory
+}
+
 // visitFunction - Verify that a function is ok.
 //
 void Verifier::visitFunction(const Function &F) {
@@ -1101,7 +1349,7 @@ void Verifier::visitFunction(const Function &F) {
 
     // Check the entry node
     const BasicBlock *Entry = &F.getEntryBlock();
-    Assert1(pred_begin(Entry) == pred_end(Entry),
+    Assert1(pred_empty(Entry),
             "Entry block to function must not have predecessors!", Entry);
 
     // The address of the entry block cannot be taken, unless it is dead.
@@ -1482,9 +1730,9 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) {
 }
 
 void Verifier::visitBitCastInst(BitCastInst &I) {
-  Type *SrcTy = I.getOperand(0)->getType();
-  Type *DestTy = I.getType();
-  VerifyBitcastType(&I, DestTy, SrcTy);
+  Assert1(
+      CastInst::castIsValid(Instruction::BitCast, I.getOperand(0), I.getType()),
+      "Invalid bitcast", &I);
   visitInstruction(I);
 }
 
@@ -1732,6 +1980,13 @@ void Verifier::visitInvokeInst(InvokeInst &II) {
   Assert1(II.getUnwindDest()->isLandingPad(),
           "The unwind destination does not have a landingpad instruction!",&II);
 
+  if (Function *F = II.getCalledFunction())
+    // TODO: Ideally we should use visitIntrinsicFunction here. But it uses
+    //       CallInst as an input parameter. It not woth updating this whole
+    //       function only to support statepoint verification.
+    if (F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint)
+      VerifyStatepoint(ImmutableCallSite(&II));
+
   visitTerminatorInst(II);
 }
 
@@ -1906,9 +2161,11 @@ void Verifier::visitRangeMetadata(Instruction& I,
   
   ConstantRange LastRange(1); // Dummy initial value
   for (unsigned i = 0; i < NumRanges; ++i) {
-    ConstantInt *Low = dyn_cast<ConstantInt>(Range->getOperand(2*i));
+    ConstantInt *Low =
+        mdconst::dyn_extract<ConstantInt>(Range->getOperand(2 * i));
     Assert1(Low, "The lower limit must be an integer!", Low);
-    ConstantInt *High = dyn_cast<ConstantInt>(Range->getOperand(2*i + 1));
+    ConstantInt *High =
+        mdconst::dyn_extract<ConstantInt>(Range->getOperand(2 * i + 1));
     Assert1(High, "The upper limit must be an integer!", High);
     Assert1(High->getType() == Low->getType() &&
             High->getType() == Ty, "Range types must match instruction type!",
@@ -1931,9 +2188,9 @@ void Verifier::visitRangeMetadata(Instruction& I,
   }
   if (NumRanges > 2) {
     APInt FirstLow =
-      dyn_cast<ConstantInt>(Range->getOperand(0))->getValue();
+        mdconst::dyn_extract<ConstantInt>(Range->getOperand(0))->getValue();
     APInt FirstHigh =
-      dyn_cast<ConstantInt>(Range->getOperand(1))->getValue();
+        mdconst::dyn_extract<ConstantInt>(Range->getOperand(1))->getValue();
     ConstantRange FirstRange(FirstLow, FirstHigh);
     Assert1(FirstRange.intersectWith(LastRange).isEmptySet(),
             "Intervals are overlapping", Range);
@@ -2229,7 +2486,8 @@ void Verifier::visitInstruction(Instruction &I) {
       Assert1(!F->isIntrinsic() || isa<CallInst>(I) ||
               F->getIntrinsicID() == Intrinsic::donothing ||
               F->getIntrinsicID() == Intrinsic::experimental_patchpoint_void ||
-              F->getIntrinsicID() == Intrinsic::experimental_patchpoint_i64,
+              F->getIntrinsicID() == Intrinsic::experimental_patchpoint_i64 ||
+              F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint,
               "Cannot invoke an intrinsinc other than"
               " donothing or patchpoint", &I);
       Assert1(F->getParent() == M, "Referencing function in another module!",
@@ -2277,8 +2535,8 @@ void Verifier::visitInstruction(Instruction &I) {
     Assert1(I.getType()->isFPOrFPVectorTy(),
             "fpmath requires a floating point result!", &I);
     Assert1(MD->getNumOperands() == 1, "fpmath takes one operand!", &I);
-    Value *Op0 = MD->getOperand(0);
-    if (ConstantFP *CFP0 = dyn_cast_or_null<ConstantFP>(Op0)) {
+    if (ConstantFP *CFP0 =
+            mdconst::dyn_extract_or_null<ConstantFP>(MD->getOperand(0))) {
       APFloat Accuracy = CFP0->getValueAPF();
       Assert1(Accuracy.isFiniteNonZero() && !Accuracy.isNegative(),
               "fpmath accuracy not a positive number!", &I);
@@ -2362,6 +2620,7 @@ bool Verifier::VerifyIntrinsicType(Type *Ty,
     ArgTys.push_back(Ty);
 
     switch (D.getArgumentKind()) {
+    case IITDescriptor::AK_Any:        return false; // Success
     case IITDescriptor::AK_AnyInteger: return !Ty->isIntOrIntVectorTy();
     case IITDescriptor::AK_AnyFloat:   return !Ty->isFPOrFPVectorTy();
     case IITDescriptor::AK_AnyVector:  return !isa<VectorType>(Ty);
@@ -2405,6 +2664,43 @@ bool Verifier::VerifyIntrinsicType(Type *Ty,
            !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
            VectorType::getHalfElementsVectorType(
                          cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+  case IITDescriptor::SameVecWidthArgument: {
+    if (D.getArgumentNumber() >= ArgTys.size())
+      return true;
+    VectorType * ReferenceType =
+      dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
+    VectorType *ThisArgType = dyn_cast<VectorType>(Ty);
+    if (!ThisArgType || !ReferenceType || 
+        (ReferenceType->getVectorNumElements() !=
+         ThisArgType->getVectorNumElements()))
+      return true;
+    return VerifyIntrinsicType(ThisArgType->getVectorElementType(),
+                               Infos, ArgTys);
+  }
+  case IITDescriptor::PtrToArgument: {
+    if (D.getArgumentNumber() >= ArgTys.size())
+      return true;
+    Type * ReferenceType = ArgTys[D.getArgumentNumber()];
+    PointerType *ThisArgType = dyn_cast<PointerType>(Ty);
+    return (!ThisArgType || ThisArgType->getElementType() != ReferenceType);
+  }
+  case IITDescriptor::VecOfPtrsToElt: {
+    if (D.getArgumentNumber() >= ArgTys.size())
+      return true;
+    VectorType * ReferenceType =
+      dyn_cast<VectorType> (ArgTys[D.getArgumentNumber()]);
+    VectorType *ThisArgVecTy = dyn_cast<VectorType>(Ty);
+    if (!ThisArgVecTy || !ReferenceType || 
+        (ReferenceType->getVectorNumElements() !=
+         ThisArgVecTy->getVectorNumElements()))
+      return true;
+    PointerType *ThisArgEltTy =
+      dyn_cast<PointerType>(ThisArgVecTy->getVectorElementType());
+    if (!ThisArgEltTy)
+      return true;
+    return (!(ThisArgEltTy->getElementType() ==
+            ReferenceType->getVectorElementType()));
+  }
   }
   llvm_unreachable("unhandled");
 }
@@ -2482,8 +2778,8 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
   // If the intrinsic takes MDNode arguments, verify that they are either global
   // or are local to *this* function.
   for (unsigned i = 0, e = CI.getNumArgOperands(); i != e; ++i)
-    if (MDNode *MD = dyn_cast<MDNode>(CI.getArgOperand(i)))
-      visitMDNode(*MD, CI.getParent()->getParent());
+    if (auto *MD = dyn_cast<MetadataAsValue>(CI.getArgOperand(i)))
+      visitMetadataAsValue(*MD, CI.getParent()->getParent());
 
   switch (ID) {
   default:
@@ -2495,11 +2791,8 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
             "constant int", &CI);
     break;
   case Intrinsic::dbg_declare: {  // llvm.dbg.declare
-    Assert1(CI.getArgOperand(0) && isa<MDNode>(CI.getArgOperand(0)),
-                "invalid llvm.dbg.declare intrinsic call 1", &CI);
-    MDNode *MD = cast<MDNode>(CI.getArgOperand(0));
-    Assert1(MD->getNumOperands() == 1,
-                "invalid llvm.dbg.declare intrinsic call 2", &CI);
+    Assert1(CI.getArgOperand(0) && isa<MetadataAsValue>(CI.getArgOperand(0)),
+            "invalid llvm.dbg.declare intrinsic call 1", &CI);
   } break;
   case Intrinsic::memcpy:
   case Intrinsic::memmove:
@@ -2559,7 +2852,142 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
     Assert1(isa<ConstantInt>(CI.getArgOperand(1)),
             "llvm.invariant.end parameter #2 must be a constant integer", &CI);
     break;
+
+  case Intrinsic::frameallocate: {
+    BasicBlock *BB = CI.getParent();
+    Assert1(BB == &BB->getParent()->front(),
+            "llvm.frameallocate used outside of entry block", &CI);
+    Assert1(!SawFrameAllocate,
+            "multiple calls to llvm.frameallocate in one function", &CI);
+    SawFrameAllocate = true;
+    Assert1(isa<ConstantInt>(CI.getArgOperand(0)),
+            "llvm.frameallocate argument must be constant integer size", &CI);
+    break;
+  }
+  case Intrinsic::framerecover: {
+    Value *FnArg = CI.getArgOperand(0)->stripPointerCasts();
+    Function *Fn = dyn_cast<Function>(FnArg);
+    Assert1(Fn && !Fn->isDeclaration(), "llvm.framerecover first "
+            "argument must be function defined in this module", &CI);
+    break;
+  }
+
+  case Intrinsic::experimental_gc_statepoint:
+    Assert1(!CI.isInlineAsm(),
+            "gc.statepoint support for inline assembly unimplemented", &CI);
+
+    VerifyStatepoint(ImmutableCallSite(&CI));
+    break;
+  case Intrinsic::experimental_gc_result_int:
+  case Intrinsic::experimental_gc_result_float:
+  case Intrinsic::experimental_gc_result_ptr:
+  case Intrinsic::experimental_gc_result: {
+    // Are we tied to a statepoint properly?
+    CallSite StatepointCS(CI.getArgOperand(0));
+    const Function *StatepointFn =
+      StatepointCS.getInstruction() ? StatepointCS.getCalledFunction() : nullptr;
+    Assert2(StatepointFn && StatepointFn->isDeclaration() &&
+            StatepointFn->getIntrinsicID() == Intrinsic::experimental_gc_statepoint,
+	    "gc.result operand #1 must be from a statepoint",
+	    &CI, CI.getArgOperand(0));
+
+    // Assert that result type matches wrapped callee.
+    const Value *Target = StatepointCS.getArgument(0);
+    const PointerType *PT = cast<PointerType>(Target->getType());
+    const FunctionType *TargetFuncType =
+      cast<FunctionType>(PT->getElementType());
+    Assert1(CI.getType() == TargetFuncType->getReturnType(),
+            "gc.result result type does not match wrapped callee",
+            &CI);
+    break;
   }
+  case Intrinsic::experimental_gc_relocate: {
+    Assert1(CI.getNumArgOperands() == 3, "wrong number of arguments", &CI);
+
+    // Check that this relocate is correctly tied to the statepoint
+
+    // This is case for relocate on the unwinding path of an invoke statepoint
+    if (ExtractValueInst *ExtractValue =
+          dyn_cast<ExtractValueInst>(CI.getArgOperand(0))) {
+      Assert1(isa<LandingPadInst>(ExtractValue->getAggregateOperand()),
+              "gc relocate on unwind path incorrectly linked to the statepoint",
+              &CI);
+
+      const BasicBlock *invokeBB =
+        ExtractValue->getParent()->getUniquePredecessor();
+
+      // Landingpad relocates should have only one predecessor with invoke
+      // statepoint terminator
+      Assert1(invokeBB,
+              "safepoints should have unique landingpads",
+              ExtractValue->getParent());
+      Assert1(invokeBB->getTerminator(),
+              "safepoint block should be well formed",
+              invokeBB);
+      Assert1(isStatepoint(invokeBB->getTerminator()),
+              "gc relocate should be linked to a statepoint",
+              invokeBB);
+    }
+    else {
+      // In all other cases relocate should be tied to the statepoint directly.
+      // This covers relocates on a normal return path of invoke statepoint and
+      // relocates of a call statepoint
+      auto Token = CI.getArgOperand(0);
+      Assert2(isa<Instruction>(Token) && isStatepoint(cast<Instruction>(Token)),
+              "gc relocate is incorrectly tied to the statepoint",
+              &CI, Token);
+    }
+
+    // Verify rest of the relocate arguments
+
+    GCRelocateOperands ops(&CI);
+    ImmutableCallSite StatepointCS(ops.statepoint());
+
+    // Both the base and derived must be piped through the safepoint
+    Value* Base = CI.getArgOperand(1);
+    Assert1(isa<ConstantInt>(Base),
+            "gc.relocate operand #2 must be integer offset", &CI);
+    
+    Value* Derived = CI.getArgOperand(2);
+    Assert1(isa<ConstantInt>(Derived),
+            "gc.relocate operand #3 must be integer offset", &CI);
+
+    const int BaseIndex = cast<ConstantInt>(Base)->getZExtValue();
+    const int DerivedIndex = cast<ConstantInt>(Derived)->getZExtValue();
+    // Check the bounds
+    Assert1(0 <= BaseIndex &&
+            BaseIndex < (int)StatepointCS.arg_size(),
+            "gc.relocate: statepoint base index out of bounds", &CI);
+    Assert1(0 <= DerivedIndex &&
+            DerivedIndex < (int)StatepointCS.arg_size(),
+            "gc.relocate: statepoint derived index out of bounds", &CI);
+
+    // Check that BaseIndex and DerivedIndex fall within the 'gc parameters'
+    // section of the statepoint's argument
+    const int NumCallArgs =
+      cast<ConstantInt>(StatepointCS.getArgument(1))->getZExtValue();
+    const int NumDeoptArgs =
+      cast<ConstantInt>(StatepointCS.getArgument(NumCallArgs + 3))->getZExtValue();
+    const int GCParamArgsStart = NumCallArgs + NumDeoptArgs + 4;
+    const int GCParamArgsEnd = StatepointCS.arg_size();
+    Assert1(GCParamArgsStart <= BaseIndex &&
+            BaseIndex < GCParamArgsEnd,
+            "gc.relocate: statepoint base index doesn't fall within the "
+	    "'gc parameters' section of the statepoint call", &CI);
+    Assert1(GCParamArgsStart <= DerivedIndex &&
+            DerivedIndex < GCParamArgsEnd,
+            "gc.relocate: statepoint derived index doesn't fall within the "
+	    "'gc parameters' section of the statepoint call", &CI);
+
+
+    // Assert that the result type matches the type of the relocated pointer
+    GCRelocateOperands Operands(&CI);
+    Assert1(Operands.derivedPtr()->getType() == CI.getType(),
+            "gc.relocate: relocating a pointer shouldn't change its type",
+            &CI);
+    break;
+  }
+  };
 }
 
 void DebugInfoVerifier::verifyDebugInfo() {
@@ -2605,12 +3033,20 @@ void DebugInfoVerifier::processCallInst(DebugInfoFinder &Finder,
   if (Function *F = CI.getCalledFunction())
     if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID())
       switch (ID) {
-      case Intrinsic::dbg_declare:
-        Finder.processDeclare(*M, cast<DbgDeclareInst>(&CI));
+      case Intrinsic::dbg_declare: {
+        auto *DDI = cast<DbgDeclareInst>(&CI);
+        Finder.processDeclare(*M, DDI);
+        if (auto E = DDI->getExpression())
+          Assert1(DIExpression(E).Verify(), "DIExpression does not Verify!", E);
         break;
-      case Intrinsic::dbg_value:
-        Finder.processValue(*M, cast<DbgValueInst>(&CI));
+      }
+      case Intrinsic::dbg_value: {
+        auto *DVI = cast<DbgValueInst>(&CI);
+        Finder.processValue(*M, DVI);
+        if (auto E = DVI->getExpression())
+          Assert1(DIExpression(E).Verify(), "DIExpression does not Verify!", E);
         break;
+      }
       default:
         break;
       }
@@ -2722,15 +3158,15 @@ ModulePass *llvm::createDebugInfoVerifierPass(bool FatalErrors) {
   return new DebugInfoVerifierLegacyPass(FatalErrors);
 }
 
-PreservedAnalyses VerifierPass::run(Module *M) {
-  if (verifyModule(*M, &dbgs()) && FatalErrors)
+PreservedAnalyses VerifierPass::run(Module &M) {
+  if (verifyModule(M, &dbgs()) && FatalErrors)
     report_fatal_error("Broken module found, compilation aborted!");
 
   return PreservedAnalyses::all();
 }
 
-PreservedAnalyses VerifierPass::run(Function *F) {
-  if (verifyFunction(*F, &dbgs()) && FatalErrors)
+PreservedAnalyses VerifierPass::run(Function &F) {
+  if (verifyFunction(F, &dbgs()) && FatalErrors)
     report_fatal_error("Broken function found, compilation aborted!");
 
   return PreservedAnalyses::all();
diff --git a/lib/IRReader/CMakeLists.txt b/lib/IRReader/CMakeLists.txt
index cf10d8b..2c0e61b 100644
--- a/lib/IRReader/CMakeLists.txt
+++ b/lib/IRReader/CMakeLists.txt
@@ -1,3 +1,6 @@
 add_llvm_library(LLVMIRReader
   IRReader.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/IRReader
   )
diff --git a/lib/LTO/CMakeLists.txt b/lib/LTO/CMakeLists.txt
index 8e00bcb..1c099bb 100644
--- a/lib/LTO/CMakeLists.txt
+++ b/lib/LTO/CMakeLists.txt
@@ -1,4 +1,9 @@
 add_llvm_library(LLVMLTO
   LTOModule.cpp
   LTOCodeGenerator.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/LTO
   )
+
+add_dependencies(LLVMLTO intrinsics_gen)
diff --git a/lib/LTO/LLVMBuild.txt b/lib/LTO/LLVMBuild.txt
index b9178e9..dfd424f 100644
--- a/lib/LTO/LLVMBuild.txt
+++ b/lib/LTO/LLVMBuild.txt
@@ -19,4 +19,4 @@
 type = Library
 name = LTO
 parent = Libraries
-required_libraries = BitReader BitWriter Core IPA IPO InstCombine Linker MC ObjCARC Object Scalar Support Target TransformUtils CodeGen
+required_libraries = Analysis BitReader BitWriter CodeGen Core IPA IPO InstCombine Linker MC ObjCARC Object Scalar Support Target
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index c663d43..61c2749 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -15,6 +15,8 @@
 #include "llvm/LTO/LTOCodeGenerator.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/Config/config.h"
@@ -24,6 +26,7 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
@@ -33,7 +36,6 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/SubtargetFeature.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FormattedStream.h"
@@ -44,7 +46,6 @@
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -81,16 +82,27 @@ void LTOCodeGenerator::initialize() {
   CodeModel = LTO_CODEGEN_PIC_MODEL_DEFAULT;
   DiagHandler = nullptr;
   DiagContext = nullptr;
+  OwnedModule = nullptr;
 
   initializeLTOPasses();
 }
 
+void LTOCodeGenerator::destroyMergedModule() {
+  if (OwnedModule) {
+    assert(IRLinker.getModule() == &OwnedModule->getModule() &&
+           "The linker's module should be the same as the owned module");
+    delete OwnedModule;
+    OwnedModule = nullptr;
+  } else if (IRLinker.getModule())
+    IRLinker.deleteModule();
+}
+
 LTOCodeGenerator::~LTOCodeGenerator() {
+  destroyMergedModule();
+
   delete TargetMach;
   TargetMach = nullptr;
 
-  IRLinker.deleteModule();
-
   for (std::vector<char *>::iterator I = CodegenOptions.begin(),
                                      E = CodegenOptions.end();
        I != E; ++I)
@@ -108,7 +120,7 @@ void LTOCodeGenerator::initializeLTOPasses() {
   initializeGlobalOptPass(R);
   initializeConstantMergePass(R);
   initializeDAHPass(R);
-  initializeInstCombinerPass(R);
+  initializeInstructionCombiningPassPass(R);
   initializeSimpleInlinerPass(R);
   initializePruneEHPass(R);
   initializeGlobalDCEPass(R);
@@ -140,6 +152,22 @@ bool LTOCodeGenerator::addModule(LTOModule *mod) {
   return !ret;
 }
 
+void LTOCodeGenerator::setModule(LTOModule *Mod) {
+  assert(&Mod->getModule().getContext() == &Context &&
+         "Expected module in same context");
+
+  // Delete the old merged module.
+  destroyMergedModule();
+  AsmUndefinedRefs.clear();
+
+  OwnedModule = Mod;
+  IRLinker.setModule(&Mod->getModule());
+
+  const std::vector<const char*> &Undefs = Mod->getAsmUndefinedRefs();
+  for (int I = 0, E = Undefs.size(); I != E; ++I)
+    AsmUndefinedRefs[Undefs[I]] = 1;
+}
+
 void LTOCodeGenerator::setTargetOptions(TargetOptions options) {
   Options = options;
 }
@@ -201,12 +229,8 @@ bool LTOCodeGenerator::writeMergedModules(const char *path,
   return true;
 }
 
-bool LTOCodeGenerator::compile_to_file(const char** name,
-                                       bool disableOpt,
-                                       bool disableInline,
-                                       bool disableGVNLoadPRE,
-                                       bool disableVectorization,
-                                       std::string& errMsg) {
+bool LTOCodeGenerator::compileOptimizedToFile(const char **name,
+                                              std::string &errMsg) {
   // make unique temp .o file to put generated object file
   SmallString<128> Filename;
   int FD;
@@ -220,9 +244,7 @@ bool LTOCodeGenerator::compile_to_file(const char** name,
   // generate object file
   tool_output_file objFile(Filename.c_str(), FD);
 
-  bool genResult =
-      generateObjectFile(objFile.os(), disableOpt, disableInline,
-                         disableGVNLoadPRE, disableVectorization, errMsg);
+  bool genResult = compileOptimized(objFile.os(), errMsg);
   objFile.os().close();
   if (objFile.os().has_error()) {
     objFile.os().clear_error();
@@ -241,15 +263,10 @@ bool LTOCodeGenerator::compile_to_file(const char** name,
   return true;
 }
 
-const void* LTOCodeGenerator::compile(size_t* length,
-                                      bool disableOpt,
-                                      bool disableInline,
-                                      bool disableGVNLoadPRE,
-                                      bool disableVectorization,
-                                      std::string& errMsg) {
+const void *LTOCodeGenerator::compileOptimized(size_t *length,
+                                               std::string &errMsg) {
   const char *name;
-  if (!compile_to_file(&name, disableOpt, disableInline, disableGVNLoadPRE,
-                       disableVectorization, errMsg))
+  if (!compileOptimizedToFile(&name, errMsg))
     return nullptr;
 
   // read .o file into memory buffer
@@ -272,6 +289,33 @@ const void* LTOCodeGenerator::compile(size_t* length,
   return NativeObjectFile->getBufferStart();
 }
 
+
+bool LTOCodeGenerator::compile_to_file(const char **name,
+                                       bool disableOpt,
+                                       bool disableInline,
+                                       bool disableGVNLoadPRE,
+                                       bool disableVectorization,
+                                       std::string &errMsg) {
+  if (!optimize(disableOpt, disableInline, disableGVNLoadPRE,
+                disableVectorization, errMsg))
+    return false;
+
+  return compileOptimizedToFile(name, errMsg);
+}
+
+const void* LTOCodeGenerator::compile(size_t *length,
+                                      bool disableOpt,
+                                      bool disableInline,
+                                      bool disableGVNLoadPRE,
+                                      bool disableVectorization,
+                                      std::string &errMsg) {
+  if (!optimize(disableOpt, disableInline, disableGVNLoadPRE,
+                disableVectorization, errMsg))
+    return nullptr;
+
+  return compileOptimized(length, errMsg);
+}
+
 bool LTOCodeGenerator::determineTarget(std::string &errMsg) {
   if (TargetMach)
     return true;
@@ -368,10 +412,13 @@ static void findUsedValues(GlobalVariable *LLVMUsed,
       UsedValues.insert(GV);
 }
 
+// Collect names of runtime library functions. User-defined functions with the
+// same names are added to llvm.compiler.used to prevent them from being
+// deleted by optimizations.
 static void accumulateAndSortLibcalls(std::vector<StringRef> &Libcalls,
                                       const TargetLibraryInfo& TLI,
-                                      const TargetLowering *Lowering)
-{
+                                      const Module &Mod,
+                                      const TargetMachine &TM) {
   // TargetLibraryInfo has info on C runtime library calls on the current
   // target.
   for (unsigned I = 0, E = static_cast<unsigned>(LibFunc::NumLibFuncs);
@@ -381,14 +428,21 @@ static void accumulateAndSortLibcalls(std::vector<StringRef> &Libcalls,
       Libcalls.push_back(TLI.getName(F));
   }
 
-  // TargetLowering has info on library calls that CodeGen expects to be
-  // available, both from the C runtime and compiler-rt.
-  if (Lowering)
-    for (unsigned I = 0, E = static_cast<unsigned>(RTLIB::UNKNOWN_LIBCALL);
-         I != E; ++I)
-      if (const char *Name
-          = Lowering->getLibcallName(static_cast<RTLIB::Libcall>(I)))
-        Libcalls.push_back(Name);
+  SmallPtrSet<const TargetLowering *, 1> TLSet;
+
+  for (const Function &F : Mod) {
+    const TargetLowering *Lowering =
+        TM.getSubtargetImpl(F)->getTargetLowering();
+
+    if (Lowering && TLSet.insert(Lowering).second)
+      // TargetLowering has info on library calls that CodeGen expects to be
+      // available, both from the C runtime and compiler-rt.
+      for (unsigned I = 0, E = static_cast<unsigned>(RTLIB::UNKNOWN_LIBCALL);
+           I != E; ++I)
+        if (const char *Name =
+                Lowering->getLibcallName(static_cast<RTLIB::Libcall>(I)))
+          Libcalls.push_back(Name);
+  }
 
   array_pod_sort(Libcalls.begin(), Libcalls.end());
   Libcalls.erase(std::unique(Libcalls.begin(), Libcalls.end()),
@@ -401,18 +455,19 @@ void LTOCodeGenerator::applyScopeRestrictions() {
   Module *mergedModule = IRLinker.getModule();
 
   // Start off with a verification pass.
-  PassManager passes;
+  legacy::PassManager passes;
   passes.add(createVerifierPass());
   passes.add(createDebugInfoVerifierPass());
 
   // mark which symbols can not be internalized
-  Mangler Mangler(TargetMach->getSubtargetImpl()->getDataLayout());
+  Mangler Mangler(TargetMach->getDataLayout());
   std::vector<const char*> MustPreserveList;
   SmallPtrSet<GlobalValue*, 8> AsmUsed;
   std::vector<StringRef> Libcalls;
-  TargetLibraryInfo TLI(Triple(TargetMach->getTargetTriple()));
-  accumulateAndSortLibcalls(
-      Libcalls, TLI, TargetMach->getSubtargetImpl()->getTargetLowering());
+  TargetLibraryInfoImpl TLII(Triple(TargetMach->getTargetTriple()));
+  TargetLibraryInfo TLI(TLII);
+
+  accumulateAndSortLibcalls(Libcalls, TLI, *mergedModule, *TargetMach);
 
   for (Module::iterator f = mergedModule->begin(),
          e = mergedModule->end(); f != e; ++f)
@@ -457,12 +512,11 @@ void LTOCodeGenerator::applyScopeRestrictions() {
 }
 
 /// Optimize merged modules using various IPO passes
-bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
-                                          bool DisableOpt,
-                                          bool DisableInline,
-                                          bool DisableGVNLoadPRE,
-                                          bool DisableVectorization,
-                                          std::string &errMsg) {
+bool LTOCodeGenerator::optimize(bool DisableOpt,
+                                bool DisableInline,
+                                bool DisableGVNLoadPRE,
+                                bool DisableVectorization,
+                                std::string &errMsg) {
   if (!this->determineTarget(errMsg))
     return false;
 
@@ -472,10 +526,14 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
   this->applyScopeRestrictions();
 
   // Instantiate the pass manager to organize the passes.
-  PassManager passes;
+  legacy::PassManager passes;
 
   // Add an appropriate DataLayout instance for this module...
-  mergedModule->setDataLayout(TargetMach->getSubtargetImpl()->getDataLayout());
+  mergedModule->setDataLayout(TargetMach->getDataLayout());
+
+  passes.add(new DataLayoutPass());
+  passes.add(
+      createTargetTransformInfoWrapperPass(TargetMach->getTargetIRAnalysis()));
 
   Triple TargetTriple(TargetMach->getTargetTriple());
   PassManagerBuilder PMB;
@@ -484,15 +542,30 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
   PMB.SLPVectorize = !DisableVectorization;
   if (!DisableInline)
     PMB.Inliner = createFunctionInliningPass();
-  PMB.LibraryInfo = new TargetLibraryInfo(TargetTriple);
+  PMB.LibraryInfo = new TargetLibraryInfoImpl(TargetTriple);
   if (DisableOpt)
     PMB.OptLevel = 0;
   PMB.VerifyInput = true;
   PMB.VerifyOutput = true;
 
-  PMB.populateLTOPassManager(passes, TargetMach);
+  PMB.populateLTOPassManager(passes);
 
-  PassManager codeGenPasses;
+  // Run our queue of passes all at once now, efficiently.
+  passes.run(*mergedModule);
+
+  return true;
+}
+
+bool LTOCodeGenerator::compileOptimized(raw_ostream &out, std::string &errMsg) {
+  if (!this->determineTarget(errMsg))
+    return false;
+
+  Module *mergedModule = IRLinker.getModule();
+
+  // Mark which symbols can not be internalized
+  this->applyScopeRestrictions();
+
+  legacy::PassManager codeGenPasses;
 
   codeGenPasses.add(new DataLayoutPass());
 
@@ -508,9 +581,6 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
     return false;
   }
 
-  // Run our queue of passes all at once now, efficiently.
-  passes.run(*mergedModule);
-
   // Run the code generator, and write assembly file
   codeGenPasses.run(*mergedModule);
 
diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
index 4108ef2..0d07791 100644
--- a/lib/LTO/LTOModule.cpp
+++ b/lib/LTO/LTOModule.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
@@ -146,6 +147,44 @@ LTOModule *LTOModule::createInContext(const void *mem, size_t length,
   return makeLTOModule(Buffer, options, errMsg, Context);
 }
 
+static Module *parseBitcodeFileImpl(MemoryBufferRef Buffer,
+                                    LLVMContext &Context, bool ShouldBeLazy,
+                                    std::string &ErrMsg) {
+
+  // Find the buffer.
+  ErrorOr<MemoryBufferRef> MBOrErr =
+      IRObjectFile::findBitcodeInMemBuffer(Buffer);
+  if (std::error_code EC = MBOrErr.getError()) {
+    ErrMsg = EC.message();
+    return nullptr;
+  }
+
+  std::function<void(const DiagnosticInfo &)> DiagnosticHandler =
+      [&ErrMsg](const DiagnosticInfo &DI) {
+        raw_string_ostream Stream(ErrMsg);
+        DiagnosticPrinterRawOStream DP(Stream);
+        DI.print(DP);
+      };
+
+  if (!ShouldBeLazy) {
+    // Parse the full file.
+    ErrorOr<Module *> M =
+        parseBitcodeFile(*MBOrErr, Context, DiagnosticHandler);
+    if (!M)
+      return nullptr;
+    return *M;
+  }
+
+  // Parse lazily.
+  std::unique_ptr<MemoryBuffer> LightweightBuf =
+      MemoryBuffer::getMemBuffer(*MBOrErr, false);
+  ErrorOr<Module *> M = getLazyBitcodeModule(std::move(LightweightBuf), Context,
+                                             DiagnosticHandler);
+  if (!M)
+    return nullptr;
+  return *M;
+}
+
 LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer,
                                     TargetOptions options, std::string &errMsg,
                                     LLVMContext *Context) {
@@ -155,18 +194,13 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer,
     Context = OwnedContext.get();
   }
 
-  ErrorOr<MemoryBufferRef> MBOrErr =
-      IRObjectFile::findBitcodeInMemBuffer(Buffer);
-  if (std::error_code EC = MBOrErr.getError()) {
-    errMsg = EC.message();
+  // If we own a context, we know this is being used only for symbol
+  // extraction, not linking.  Be lazy in that case.
+  std::unique_ptr<Module> M(parseBitcodeFileImpl(
+      Buffer, *Context,
+      /* ShouldBeLazy */ static_cast<bool>(OwnedContext), errMsg));
+  if (!M)
     return nullptr;
-  }
-  ErrorOr<Module *> MOrErr = parseBitcodeFile(*MBOrErr, *Context);
-  if (std::error_code EC = MOrErr.getError()) {
-    errMsg = EC.message();
-    return nullptr;
-  }
-  std::unique_ptr<Module> M(MOrErr.get());
 
   std::string TripleStr = M->getTargetTriple();
   if (TripleStr.empty())
@@ -195,7 +229,7 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer,
 
   TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr,
                                                      options);
-  M->setDataLayout(target->getSubtargetImpl()->getDataLayout());
+  M->setDataLayout(target->getDataLayout());
 
   std::unique_ptr<object::IRObjectFile> IRObj(
       new object::IRObjectFile(Buffer, std::move(M)));
@@ -604,7 +638,7 @@ bool LTOModule::parseSymbols(std::string &errMsg) {
 /// parseMetadata - Parse metadata from the module
 void LTOModule::parseMetadata() {
   // Linker Options
-  if (Value *Val = getModule().getModuleFlag("Linker Options")) {
+  if (Metadata *Val = getModule().getModuleFlag("Linker Options")) {
     MDNode *LinkerOptions = cast<MDNode>(Val);
     for (unsigned i = 0, e = LinkerOptions->getNumOperands(); i != e; ++i) {
       MDNode *MDOptions = cast<MDNode>(LinkerOptions->getOperand(i));
@@ -615,10 +649,8 @@ void LTOModule::parseMetadata() {
         // here.
         StringRef Op =
             _linkeropt_strings.insert(MDOption->getString()).first->first();
-        StringRef DepLibName = _target->getSubtargetImpl()
-                                   ->getTargetLowering()
-                                   ->getObjFileLowering()
-                                   .getDepLibFromLinkerOpt(Op);
+        StringRef DepLibName =
+            _target->getObjFileLowering()->getDepLibFromLinkerOpt(Op);
         if (!DepLibName.empty())
           _deplibs.push_back(DepLibName.data());
         else if (!Op.empty())
diff --git a/lib/LineEditor/CMakeLists.txt b/lib/LineEditor/CMakeLists.txt
index 0dec256..0d2bada 100644
--- a/lib/LineEditor/CMakeLists.txt
+++ b/lib/LineEditor/CMakeLists.txt
@@ -5,7 +5,10 @@ endif()
 add_llvm_library(LLVMLineEditor
   LineEditor.cpp
 
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/LineEditor
+
   LINK_LIBS
   LLVMSupport
   ${link_libs}
-)
+  )
diff --git a/lib/Linker/CMakeLists.txt b/lib/Linker/CMakeLists.txt
index 221b55a..5a1f31a 100644
--- a/lib/Linker/CMakeLists.txt
+++ b/lib/Linker/CMakeLists.txt
@@ -1,3 +1,6 @@
 add_llvm_library(LLVMLinker
   LinkModules.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Linker
   )
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 8321bcf..e6d9acc 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -13,10 +13,14 @@
 
 #include "llvm/Linker/Linker.h"
 #include "llvm-c/Linker.h"
+#include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
@@ -36,8 +40,6 @@ using namespace llvm;
 //===----------------------------------------------------------------------===//
 
 namespace {
-typedef SmallPtrSet<StructType *, 32> TypeSet;
-
 class TypeMapTy : public ValueMapTypeRemapper {
   /// This is a mapping from a source type to a destination type to use.
   DenseMap<Type*, Type*> MappedTypes;
@@ -47,6 +49,8 @@ class TypeMapTy : public ValueMapTypeRemapper {
   /// roll back.
   SmallVector<Type*, 16> SpeculativeTypes;
 
+  SmallVector<StructType*, 16> SpeculativeDstOpaqueTypes;
+
   /// This is a list of non-opaque structs in the source module that are mapped
   /// to an opaque struct in the destination module.
   SmallVector<StructType*, 16> SrcDefinitionsToResolve;
@@ -56,71 +60,79 @@ class TypeMapTy : public ValueMapTypeRemapper {
   SmallPtrSet<StructType*, 16> DstResolvedOpaqueTypes;
 
 public:
-  TypeMapTy(TypeSet &Set) : DstStructTypesSet(Set) {}
+  TypeMapTy(Linker::IdentifiedStructTypeSet &DstStructTypesSet)
+      : DstStructTypesSet(DstStructTypesSet) {}
 
-  TypeSet &DstStructTypesSet;
+  Linker::IdentifiedStructTypeSet &DstStructTypesSet;
   /// Indicate that the specified type in the destination module is conceptually
   /// equivalent to the specified type in the source module.
   void addTypeMapping(Type *DstTy, Type *SrcTy);
 
-  /// linkDefinedTypeBodies - Produce a body for an opaque type in the dest
-  /// module from a type definition in the source module.
+  /// Produce a body for an opaque type in the dest module from a type
+  /// definition in the source module.
   void linkDefinedTypeBodies();
 
   /// Return the mapped type to use for the specified input type from the
   /// source module.
   Type *get(Type *SrcTy);
+  Type *get(Type *SrcTy, SmallPtrSet<StructType *, 8> &Visited);
 
-  FunctionType *get(FunctionType *T) {return cast<FunctionType>(get((Type*)T));}
+  void finishType(StructType *DTy, StructType *STy, ArrayRef<Type *> ETypes);
+
+  FunctionType *get(FunctionType *T) {
+    return cast<FunctionType>(get((Type *)T));
+  }
 
   /// Dump out the type map for debugging purposes.
   void dump() const {
-    for (DenseMap<Type*, Type*>::const_iterator
-           I = MappedTypes.begin(), E = MappedTypes.end(); I != E; ++I) {
+    for (auto &Pair : MappedTypes) {
       dbgs() << "TypeMap: ";
-      I->first->print(dbgs());
+      Pair.first->print(dbgs());
       dbgs() << " => ";
-      I->second->print(dbgs());
+      Pair.second->print(dbgs());
       dbgs() << '\n';
     }
   }
 
 private:
-  Type *getImpl(Type *T);
-  /// Implement the ValueMapTypeRemapper interface.
-  Type *remapType(Type *SrcTy) override {
-    return get(SrcTy);
-  }
+  Type *remapType(Type *SrcTy) override { return get(SrcTy); }
 
   bool areTypesIsomorphic(Type *DstTy, Type *SrcTy);
 };
 }
 
 void TypeMapTy::addTypeMapping(Type *DstTy, Type *SrcTy) {
-  Type *&Entry = MappedTypes[SrcTy];
-  if (Entry) return;
-
-  if (DstTy == SrcTy) {
-    Entry = DstTy;
-    return;
-  }
+  assert(SpeculativeTypes.empty());
+  assert(SpeculativeDstOpaqueTypes.empty());
 
   // Check to see if these types are recursively isomorphic and establish a
   // mapping between them if so.
   if (!areTypesIsomorphic(DstTy, SrcTy)) {
     // Oops, they aren't isomorphic.  Just discard this request by rolling out
     // any speculative mappings we've established.
-    for (unsigned i = 0, e = SpeculativeTypes.size(); i != e; ++i)
-      MappedTypes.erase(SpeculativeTypes[i]);
+    for (Type *Ty : SpeculativeTypes)
+      MappedTypes.erase(Ty);
+
+    SrcDefinitionsToResolve.resize(SrcDefinitionsToResolve.size() -
+                                   SpeculativeDstOpaqueTypes.size());
+    for (StructType *Ty : SpeculativeDstOpaqueTypes)
+      DstResolvedOpaqueTypes.erase(Ty);
+  } else {
+    for (Type *Ty : SpeculativeTypes)
+      if (auto *STy = dyn_cast<StructType>(Ty))
+        if (STy->hasName())
+          STy->setName("");
   }
   SpeculativeTypes.clear();
+  SpeculativeDstOpaqueTypes.clear();
 }
 
 /// Recursively walk this pair of types, returning true if they are isomorphic,
 /// false if they are not.
 bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) {
   // Two types with differing kinds are clearly not isomorphic.
-  if (DstTy->getTypeID() != SrcTy->getTypeID()) return false;
+  if (DstTy->getTypeID() != SrcTy->getTypeID())
+    return false;
 
   // If we have an entry in the MappedTypes table, then we have our answer.
   Type *&Entry = MappedTypes[SrcTy];
@@ -147,14 +159,15 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) {
 
     // Mapping a non-opaque source type to an opaque dest.  If this is the first
     // type that we're mapping onto this destination type then we succeed.  Keep
-    // the dest, but fill it in later.  This doesn't need to be speculative.  If
-    // this is the second (different) type that we're trying to map onto the
-    // same opaque type then we fail.
+    // the dest, but fill it in later. If this is the second (different) type
+    // that we're trying to map onto the same opaque type then we fail.
     if (cast<StructType>(DstTy)->isOpaque()) {
       // We can only map one source type onto the opaque destination type.
       if (!DstResolvedOpaqueTypes.insert(cast<StructType>(DstTy)).second)
         return false;
       SrcDefinitionsToResolve.push_back(SSTy);
+      SpeculativeTypes.push_back(SrcTy);
+      SpeculativeDstOpaqueTypes.push_back(cast<StructType>(DstTy));
       Entry = DstTy;
       return true;
     }
@@ -192,162 +205,153 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) {
   Entry = DstTy;
   SpeculativeTypes.push_back(SrcTy);
 
-  for (unsigned i = 0, e = SrcTy->getNumContainedTypes(); i != e; ++i)
-    if (!areTypesIsomorphic(DstTy->getContainedType(i),
-                            SrcTy->getContainedType(i)))
+  for (unsigned I = 0, E = SrcTy->getNumContainedTypes(); I != E; ++I)
+    if (!areTypesIsomorphic(DstTy->getContainedType(I),
+                            SrcTy->getContainedType(I)))
       return false;
 
   // If everything seems to have lined up, then everything is great.
   return true;
 }
 
-/// Produce a body for an opaque type in the dest module from a type definition
-/// in the source module.
 void TypeMapTy::linkDefinedTypeBodies() {
   SmallVector<Type*, 16> Elements;
-  SmallString<16> TmpName;
-
-  // Note that processing entries in this loop (calling 'get') can add new
-  // entries to the SrcDefinitionsToResolve vector.
-  while (!SrcDefinitionsToResolve.empty()) {
-    StructType *SrcSTy = SrcDefinitionsToResolve.pop_back_val();
+  for (StructType *SrcSTy : SrcDefinitionsToResolve) {
     StructType *DstSTy = cast<StructType>(MappedTypes[SrcSTy]);
-
-    // TypeMap is a many-to-one mapping, if there were multiple types that
-    // provide a body for DstSTy then previous iterations of this loop may have
-    // already handled it.  Just ignore this case.
-    if (!DstSTy->isOpaque()) continue;
-    assert(!SrcSTy->isOpaque() && "Not resolving a definition?");
+    assert(DstSTy->isOpaque());
 
     // Map the body of the source type over to a new body for the dest type.
     Elements.resize(SrcSTy->getNumElements());
-    for (unsigned i = 0, e = Elements.size(); i != e; ++i)
-      Elements[i] = getImpl(SrcSTy->getElementType(i));
+    for (unsigned I = 0, E = Elements.size(); I != E; ++I)
+      Elements[I] = get(SrcSTy->getElementType(I));
 
     DstSTy->setBody(Elements, SrcSTy->isPacked());
+  }
+  SrcDefinitionsToResolve.clear();
+  DstResolvedOpaqueTypes.clear();
+}
 
-    // If DstSTy has no name or has a longer name than STy, then viciously steal
-    // STy's name.
-    if (!SrcSTy->hasName()) continue;
-    StringRef SrcName = SrcSTy->getName();
+void TypeMapTy::finishType(StructType *DTy, StructType *STy,
+                           ArrayRef<Type *> ETypes) {
+  DTy->setBody(ETypes, STy->isPacked());
 
-    if (!DstSTy->hasName() || DstSTy->getName().size() > SrcName.size()) {
-      TmpName.insert(TmpName.end(), SrcName.begin(), SrcName.end());
-      SrcSTy->setName("");
-      DstSTy->setName(TmpName.str());
-      TmpName.clear();
-    }
+  // Steal STy's name.
+  if (STy->hasName()) {
+    SmallString<16> TmpName = STy->getName();
+    STy->setName("");
+    DTy->setName(TmpName);
   }
 
-  DstResolvedOpaqueTypes.clear();
+  DstStructTypesSet.addNonOpaque(DTy);
 }
 
 Type *TypeMapTy::get(Type *Ty) {
-  Type *Result = getImpl(Ty);
-
-  // If this caused a reference to any struct type, resolve it before returning.
-  if (!SrcDefinitionsToResolve.empty())
-    linkDefinedTypeBodies();
-  return Result;
+  SmallPtrSet<StructType *, 8> Visited;
+  return get(Ty, Visited);
 }
 
-/// This is the recursive version of get().
-Type *TypeMapTy::getImpl(Type *Ty) {
+Type *TypeMapTy::get(Type *Ty, SmallPtrSet<StructType *, 8> &Visited) {
   // If we already have an entry for this type, return it.
   Type **Entry = &MappedTypes[Ty];
-  if (*Entry) return *Entry;
+  if (*Entry)
+    return *Entry;
 
-  // If this is not a named struct type, then just map all of the elements and
-  // then rebuild the type from inside out.
-  if (!isa<StructType>(Ty) || cast<StructType>(Ty)->isLiteral()) {
-    // If there are no element types to map, then the type is itself.  This is
-    // true for the anonymous {} struct, things like 'float', integers, etc.
-    if (Ty->getNumContainedTypes() == 0)
-      return *Entry = Ty;
+  // These are types that LLVM itself will unique.
+  bool IsUniqued = !isa<StructType>(Ty) || cast<StructType>(Ty)->isLiteral();
 
-    // Remap all of the elements, keeping track of whether any of them change.
-    bool AnyChange = false;
-    SmallVector<Type*, 4> ElementTypes;
-    ElementTypes.resize(Ty->getNumContainedTypes());
-    for (unsigned i = 0, e = Ty->getNumContainedTypes(); i != e; ++i) {
-      ElementTypes[i] = getImpl(Ty->getContainedType(i));
-      AnyChange |= ElementTypes[i] != Ty->getContainedType(i);
+#ifndef NDEBUG
+  if (!IsUniqued) {
+    for (auto &Pair : MappedTypes) {
+      assert(!(Pair.first != Ty && Pair.second == Ty) &&
+             "mapping to a source type");
     }
+  }
+#endif
 
-    // If we found our type while recursively processing stuff, just use it.
-    Entry = &MappedTypes[Ty];
-    if (*Entry) return *Entry;
+  if (!IsUniqued && !Visited.insert(cast<StructType>(Ty)).second) {
+    StructType *DTy = StructType::create(Ty->getContext());
+    return *Entry = DTy;
+  }
 
-    // If all of the element types mapped directly over, then the type is usable
-    // as-is.
-    if (!AnyChange)
-      return *Entry = Ty;
+  // If this is not a recursive type, then just map all of the elements and
+  // then rebuild the type from inside out.
+  SmallVector<Type *, 4> ElementTypes;
+
+  // If there are no element types to map, then the type is itself.  This is
+  // true for the anonymous {} struct, things like 'float', integers, etc.
+  if (Ty->getNumContainedTypes() == 0 && IsUniqued)
+    return *Entry = Ty;
+
+  // Remap all of the elements, keeping track of whether any of them change.
+  bool AnyChange = false;
+  ElementTypes.resize(Ty->getNumContainedTypes());
+  for (unsigned I = 0, E = Ty->getNumContainedTypes(); I != E; ++I) {
+    ElementTypes[I] = get(Ty->getContainedType(I), Visited);
+    AnyChange |= ElementTypes[I] != Ty->getContainedType(I);
+  }
 
-    // Otherwise, rebuild a modified type.
-    switch (Ty->getTypeID()) {
-    default: llvm_unreachable("unknown derived type to remap");
-    case Type::ArrayTyID:
-      return *Entry = ArrayType::get(ElementTypes[0],
-                                     cast<ArrayType>(Ty)->getNumElements());
-    case Type::VectorTyID:
-      return *Entry = VectorType::get(ElementTypes[0],
-                                      cast<VectorType>(Ty)->getNumElements());
-    case Type::PointerTyID:
-      return *Entry = PointerType::get(ElementTypes[0],
-                                      cast<PointerType>(Ty)->getAddressSpace());
-    case Type::FunctionTyID:
-      return *Entry = FunctionType::get(ElementTypes[0],
-                                        makeArrayRef(ElementTypes).slice(1),
-                                        cast<FunctionType>(Ty)->isVarArg());
-    case Type::StructTyID:
-      // Note that this is only reached for anonymous structs.
-      return *Entry = StructType::get(Ty->getContext(), ElementTypes,
-                                      cast<StructType>(Ty)->isPacked());
+  // If we found our type while recursively processing stuff, just use it.
+  Entry = &MappedTypes[Ty];
+  if (*Entry) {
+    if (auto *DTy = dyn_cast<StructType>(*Entry)) {
+      if (DTy->isOpaque()) {
+        auto *STy = cast<StructType>(Ty);
+        finishType(DTy, STy, ElementTypes);
+      }
     }
+    return *Entry;
   }
 
-  // Otherwise, this is an unmapped named struct.  If the struct can be directly
-  // mapped over, just use it as-is.  This happens in a case when the linked-in
-  // module has something like:
-  //   %T = type {%T*, i32}
-  //   @GV = global %T* null
-  // where T does not exist at all in the destination module.
-  //
-  // The other case we watch for is when the type is not in the destination
-  // module, but that it has to be rebuilt because it refers to something that
-  // is already mapped.  For example, if the destination module has:
-  //  %A = type { i32 }
-  // and the source module has something like
-  //  %A' = type { i32 }
-  //  %B = type { %A'* }
-  //  @GV = global %B* null
-  // then we want to create a new type: "%B = type { %A*}" and have it take the
-  // pristine "%B" name from the source module.
-  //
-  // To determine which case this is, we have to recursively walk the type graph
-  // speculating that we'll be able to reuse it unmodified.  Only if this is
-  // safe would we map the entire thing over.  Because this is an optimization,
-  // and is not required for the prettiness of the linked module, we just skip
-  // it and always rebuild a type here.
-  StructType *STy = cast<StructType>(Ty);
-
-  // If the type is opaque, we can just use it directly.
-  if (STy->isOpaque()) {
-    // A named structure type from src module is used. Add it to the Set of
-    // identified structs in the destination module.
-    DstStructTypesSet.insert(STy);
-    return *Entry = STy;
-  }
+  // If all of the element types mapped directly over and the type is not
+  // a nomed struct, then the type is usable as-is.
+  if (!AnyChange && IsUniqued)
+    return *Entry = Ty;
+
+  // Otherwise, rebuild a modified type.
+  switch (Ty->getTypeID()) {
+  default:
+    llvm_unreachable("unknown derived type to remap");
+  case Type::ArrayTyID:
+    return *Entry = ArrayType::get(ElementTypes[0],
+                                   cast<ArrayType>(Ty)->getNumElements());
+  case Type::VectorTyID:
+    return *Entry = VectorType::get(ElementTypes[0],
+                                    cast<VectorType>(Ty)->getNumElements());
+  case Type::PointerTyID:
+    return *Entry = PointerType::get(ElementTypes[0],
+                                     cast<PointerType>(Ty)->getAddressSpace());
+  case Type::FunctionTyID:
+    return *Entry = FunctionType::get(ElementTypes[0],
+                                      makeArrayRef(ElementTypes).slice(1),
+                                      cast<FunctionType>(Ty)->isVarArg());
+  case Type::StructTyID: {
+    auto *STy = cast<StructType>(Ty);
+    bool IsPacked = STy->isPacked();
+    if (IsUniqued)
+      return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked);
+
+    // If the type is opaque, we can just use it directly.
+    if (STy->isOpaque()) {
+      DstStructTypesSet.addOpaque(STy);
+      return *Entry = Ty;
+    }
+
+    if (StructType *OldT =
+            DstStructTypesSet.findNonOpaque(ElementTypes, IsPacked)) {
+      STy->setName("");
+      return *Entry = OldT;
+    }
 
-  // Otherwise we create a new type and resolve its body later.  This will be
-  // resolved by the top level of get().
-  SrcDefinitionsToResolve.push_back(STy);
-  StructType *DTy = StructType::create(STy->getContext());
-  // A new identified structure type was created. Add it to the set of
-  // identified structs in the destination module.
-  DstStructTypesSet.insert(DTy);
-  DstResolvedOpaqueTypes.insert(DTy);
-  return *Entry = DTy;
+    if (!AnyChange) {
+      DstStructTypesSet.addNonOpaque(STy);
+      return *Entry = Ty;
+    }
+
+    StructType *DTy = StructType::create(Ty->getContext());
+    finishType(DTy, STy, ElementTypes);
+    return *Entry = DTy;
+  }
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -355,149 +359,148 @@ Type *TypeMapTy::getImpl(Type *Ty) {
 //===----------------------------------------------------------------------===//
 
 namespace {
-  class ModuleLinker;
-
-  /// Creates prototypes for functions that are lazily linked on the fly. This
-  /// speeds up linking for modules with many/ lazily linked functions of which
-  /// few get used.
-  class ValueMaterializerTy : public ValueMaterializer {
-    TypeMapTy &TypeMap;
-    Module *DstM;
-    std::vector<Function*> &LazilyLinkFunctions;
-  public:
-    ValueMaterializerTy(TypeMapTy &TypeMap, Module *DstM,
-                        std::vector<Function*> &LazilyLinkFunctions) :
-      ValueMaterializer(), TypeMap(TypeMap), DstM(DstM),
-      LazilyLinkFunctions(LazilyLinkFunctions) {
-    }
+class ModuleLinker;
 
-    Value *materializeValueFor(Value *V) override;
-  };
+/// Creates prototypes for functions that are lazily linked on the fly. This
+/// speeds up linking for modules with many/ lazily linked functions of which
+/// few get used.
+class ValueMaterializerTy : public ValueMaterializer {
+  TypeMapTy &TypeMap;
+  Module *DstM;
+  std::vector<GlobalValue *> &LazilyLinkGlobalValues;
 
-  namespace {
-  class LinkDiagnosticInfo : public DiagnosticInfo {
-    const Twine &Msg;
+public:
+  ValueMaterializerTy(TypeMapTy &TypeMap, Module *DstM,
+                      std::vector<GlobalValue *> &LazilyLinkGlobalValues)
+      : ValueMaterializer(), TypeMap(TypeMap), DstM(DstM),
+        LazilyLinkGlobalValues(LazilyLinkGlobalValues) {}
 
-  public:
-    LinkDiagnosticInfo(DiagnosticSeverity Severity, const Twine &Msg);
-    void print(DiagnosticPrinter &DP) const override;
-  };
-  LinkDiagnosticInfo::LinkDiagnosticInfo(DiagnosticSeverity Severity,
-                                         const Twine &Msg)
-      : DiagnosticInfo(DK_Linker, Severity), Msg(Msg) {}
-  void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; }
-  }
+  Value *materializeValueFor(Value *V) override;
+};
 
-  /// This is an implementation class for the LinkModules function, which is the
-  /// entrypoint for this file.
-  class ModuleLinker {
-    Module *DstM, *SrcM;
+class LinkDiagnosticInfo : public DiagnosticInfo {
+  const Twine &Msg;
 
-    TypeMapTy TypeMap;
-    ValueMaterializerTy ValMaterializer;
+public:
+  LinkDiagnosticInfo(DiagnosticSeverity Severity, const Twine &Msg);
+  void print(DiagnosticPrinter &DP) const override;
+};
+LinkDiagnosticInfo::LinkDiagnosticInfo(DiagnosticSeverity Severity,
+                                       const Twine &Msg)
+    : DiagnosticInfo(DK_Linker, Severity), Msg(Msg) {}
+void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; }
+
+/// This is an implementation class for the LinkModules function, which is the
+/// entrypoint for this file.
+class ModuleLinker {
+  Module *DstM, *SrcM;
+
+  TypeMapTy TypeMap;
+  ValueMaterializerTy ValMaterializer;
+
+  /// Mapping of values from what they used to be in Src, to what they are now
+  /// in DstM.  ValueToValueMapTy is a ValueMap, which involves some overhead
+  /// due to the use of Value handles which the Linker doesn't actually need,
+  /// but this allows us to reuse the ValueMapper code.
+  ValueToValueMapTy ValueMap;
+
+  struct AppendingVarInfo {
+    GlobalVariable *NewGV;   // New aggregate global in dest module.
+    const Constant *DstInit; // Old initializer from dest module.
+    const Constant *SrcInit; // Old initializer from src module.
+  };
 
-    /// Mapping of values from what they used to be in Src, to what they are now
-    /// in DstM.  ValueToValueMapTy is a ValueMap, which involves some overhead
-    /// due to the use of Value handles which the Linker doesn't actually need,
-    /// but this allows us to reuse the ValueMapper code.
-    ValueToValueMapTy ValueMap;
+  std::vector<AppendingVarInfo> AppendingVars;
 
-    struct AppendingVarInfo {
-      GlobalVariable *NewGV;   // New aggregate global in dest module.
-      const Constant *DstInit; // Old initializer from dest module.
-      const Constant *SrcInit; // Old initializer from src module.
-    };
+  // Set of items not to link in from source.
+  SmallPtrSet<const Value *, 16> DoNotLinkFromSource;
 
-    std::vector<AppendingVarInfo> AppendingVars;
+  // Vector of GlobalValues to lazily link in.
+  std::vector<GlobalValue *> LazilyLinkGlobalValues;
 
-    // Set of items not to link in from source.
-    SmallPtrSet<const Value*, 16> DoNotLinkFromSource;
+  /// Functions that have replaced other functions.
+  SmallPtrSet<const Function *, 16> OverridingFunctions;
 
-    // Vector of functions to lazily link in.
-    std::vector<Function*> LazilyLinkFunctions;
+  DiagnosticHandlerFunction DiagnosticHandler;
 
-    Linker::DiagnosticHandlerFunction DiagnosticHandler;
+public:
+  ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM,
+               DiagnosticHandlerFunction DiagnosticHandler)
+      : DstM(dstM), SrcM(srcM), TypeMap(Set),
+        ValMaterializer(TypeMap, DstM, LazilyLinkGlobalValues),
+        DiagnosticHandler(DiagnosticHandler) {}
 
-  public:
-    ModuleLinker(Module *dstM, TypeSet &Set, Module *srcM,
-                 Linker::DiagnosticHandlerFunction DiagnosticHandler)
-        : DstM(dstM), SrcM(srcM), TypeMap(Set),
-          ValMaterializer(TypeMap, DstM, LazilyLinkFunctions),
-          DiagnosticHandler(DiagnosticHandler) {}
+  bool run();
 
-    bool run();
+private:
+  bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest,
+                            const GlobalValue &Src);
 
-  private:
-    bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest,
-                              const GlobalValue &Src);
+  /// Helper method for setting a message and returning an error code.
+  bool emitError(const Twine &Message) {
+    DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message));
+    return true;
+  }
 
-    /// Helper method for setting a message and returning an error code.
-    bool emitError(const Twine &Message) {
-      DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message));
-      return true;
-    }
+  void emitWarning(const Twine &Message) {
+    DiagnosticHandler(LinkDiagnosticInfo(DS_Warning, Message));
+  }
 
-    void emitWarning(const Twine &Message) {
-      DiagnosticHandler(LinkDiagnosticInfo(DS_Warning, Message));
-    }
+  bool getComdatLeader(Module *M, StringRef ComdatName,
+                       const GlobalVariable *&GVar);
+  bool computeResultingSelectionKind(StringRef ComdatName,
+                                     Comdat::SelectionKind Src,
+                                     Comdat::SelectionKind Dst,
+                                     Comdat::SelectionKind &Result,
+                                     bool &LinkFromSrc);
+  std::map<const Comdat *, std::pair<Comdat::SelectionKind, bool>>
+      ComdatsChosen;
+  bool getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &SK,
+                       bool &LinkFromSrc);
+
+  /// Given a global in the source module, return the global in the
+  /// destination module that is being linked to, if any.
+  GlobalValue *getLinkedToGlobal(const GlobalValue *SrcGV) {
+    // If the source has no name it can't link.  If it has local linkage,
+    // there is no name match-up going on.
+    if (!SrcGV->hasName() || SrcGV->hasLocalLinkage())
+      return nullptr;
+
+    // Otherwise see if we have a match in the destination module's symtab.
+    GlobalValue *DGV = DstM->getNamedValue(SrcGV->getName());
+    if (!DGV)
+      return nullptr;
+
+    // If we found a global with the same name in the dest module, but it has
+    // internal linkage, we are really not doing any linkage here.
+    if (DGV->hasLocalLinkage())
+      return nullptr;
+
+    // Otherwise, we do in fact link to the destination global.
+    return DGV;
+  }
 
-    bool getComdatLeader(Module *M, StringRef ComdatName,
-                         const GlobalVariable *&GVar);
-    bool computeResultingSelectionKind(StringRef ComdatName,
-                                       Comdat::SelectionKind Src,
-                                       Comdat::SelectionKind Dst,
-                                       Comdat::SelectionKind &Result,
-                                       bool &LinkFromSrc);
-    std::map<const Comdat *, std::pair<Comdat::SelectionKind, bool>>
-        ComdatsChosen;
-    bool getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &SK,
-                         bool &LinkFromSrc);
-
-    /// Given a global in the source module, return the global in the
-    /// destination module that is being linked to, if any.
-    GlobalValue *getLinkedToGlobal(const GlobalValue *SrcGV) {
-      // If the source has no name it can't link.  If it has local linkage,
-      // there is no name match-up going on.
-      if (!SrcGV->hasName() || SrcGV->hasLocalLinkage())
-        return nullptr;
-
-      // Otherwise see if we have a match in the destination module's symtab.
-      GlobalValue *DGV = DstM->getNamedValue(SrcGV->getName());
-      if (!DGV) return nullptr;
-
-      // If we found a global with the same name in the dest module, but it has
-      // internal linkage, we are really not doing any linkage here.
-      if (DGV->hasLocalLinkage())
-        return nullptr;
-
-      // Otherwise, we do in fact link to the destination global.
-      return DGV;
-    }
+  void computeTypeMapping();
 
-    void computeTypeMapping();
+  void upgradeMismatchedGlobalArray(StringRef Name);
+  void upgradeMismatchedGlobals();
 
-    void upgradeMismatchedGlobalArray(StringRef Name);
-    void upgradeMismatchedGlobals();
+  bool linkAppendingVarProto(GlobalVariable *DstGV,
+                             const GlobalVariable *SrcGV);
 
-    bool linkAppendingVarProto(GlobalVariable *DstGV,
-                               const GlobalVariable *SrcGV);
+  bool linkGlobalValueProto(GlobalValue *GV);
+  bool linkModuleFlagsMetadata();
 
-    bool linkGlobalValueProto(GlobalValue *GV);
-    GlobalValue *linkGlobalVariableProto(const GlobalVariable *SGVar,
-                                         GlobalValue *DGV, bool LinkFromSrc);
-    GlobalValue *linkFunctionProto(const Function *SF, GlobalValue *DGV,
-                                   bool LinkFromSrc);
-    GlobalValue *linkGlobalAliasProto(const GlobalAlias *SGA, GlobalValue *DGV,
-                                      bool LinkFromSrc);
+  void linkAppendingVarInit(const AppendingVarInfo &AVI);
 
-    bool linkModuleFlagsMetadata();
+  void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src);
+  bool linkFunctionBody(Function &Dst, Function &Src);
+  void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src);
+  bool linkGlobalValueBody(GlobalValue &Src);
 
-    void linkAppendingVarInit(const AppendingVarInfo &AVI);
-    void linkGlobalInits();
-    void linkFunctionBody(Function *Dst, Function *Src);
-    void linkAliasBodies();
-    void linkNamedMDNodes();
-  };
+  void linkNamedMDNodes();
+  void stripReplacedSubprograms();
+};
 }
 
 /// The LLVM SymbolTable class autorenames globals that conflict in the symbol
@@ -524,17 +527,7 @@ static void forceRenaming(GlobalValue *GV, StringRef Name) {
 /// copy additional attributes (those not needed to construct a GlobalValue)
 /// from the SrcGV to the DestGV.
 static void copyGVAttributes(GlobalValue *DestGV, const GlobalValue *SrcGV) {
-  // Use the maximum alignment, rather than just copying the alignment of SrcGV.
-  auto *DestGO = dyn_cast<GlobalObject>(DestGV);
-  unsigned Alignment;
-  if (DestGO)
-    Alignment = std::max(DestGO->getAlignment(), SrcGV->getAlignment());
-
   DestGV->copyAttributesFrom(SrcGV);
-
-  if (DestGO)
-    DestGO->setAlignment(Alignment);
-
   forceRenaming(DestGV, SrcGV->getName());
 }
 
@@ -551,22 +544,71 @@ static bool isLessConstraining(GlobalValue::VisibilityTypes a,
   return false;
 }
 
+/// Loop through the global variables in the src module and merge them into the
+/// dest module.
+static GlobalVariable *copyGlobalVariableProto(TypeMapTy &TypeMap, Module &DstM,
+                                               const GlobalVariable *SGVar) {
+  // No linking to be performed or linking from the source: simply create an
+  // identical version of the symbol over in the dest module... the
+  // initializer will be filled in later by LinkGlobalInits.
+  GlobalVariable *NewDGV = new GlobalVariable(
+      DstM, TypeMap.get(SGVar->getType()->getElementType()),
+      SGVar->isConstant(), SGVar->getLinkage(), /*init*/ nullptr,
+      SGVar->getName(), /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
+      SGVar->getType()->getAddressSpace());
+
+  return NewDGV;
+}
+
+/// Link the function in the source module into the destination module if
+/// needed, setting up mapping information.
+static Function *copyFunctionProto(TypeMapTy &TypeMap, Module &DstM,
+                                   const Function *SF) {
+  // If there is no linkage to be performed or we are linking from the source,
+  // bring SF over.
+  return Function::Create(TypeMap.get(SF->getFunctionType()), SF->getLinkage(),
+                          SF->getName(), &DstM);
+}
+
+/// Set up prototypes for any aliases that come over from the source module.
+static GlobalAlias *copyGlobalAliasProto(TypeMapTy &TypeMap, Module &DstM,
+                                         const GlobalAlias *SGA) {
+  // If there is no linkage to be performed or we're linking from the source,
+  // bring over SGA.
+  auto *PTy = cast<PointerType>(TypeMap.get(SGA->getType()));
+  return GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
+                             SGA->getLinkage(), SGA->getName(), &DstM);
+}
+
+static GlobalValue *copyGlobalValueProto(TypeMapTy &TypeMap, Module &DstM,
+                                         const GlobalValue *SGV) {
+  GlobalValue *NewGV;
+  if (auto *SGVar = dyn_cast<GlobalVariable>(SGV))
+    NewGV = copyGlobalVariableProto(TypeMap, DstM, SGVar);
+  else if (auto *SF = dyn_cast<Function>(SGV))
+    NewGV = copyFunctionProto(TypeMap, DstM, SF);
+  else
+    NewGV = copyGlobalAliasProto(TypeMap, DstM, cast<GlobalAlias>(SGV));
+  copyGVAttributes(NewGV, SGV);
+  return NewGV;
+}
+
 Value *ValueMaterializerTy::materializeValueFor(Value *V) {
-  Function *SF = dyn_cast<Function>(V);
-  if (!SF)
+  auto *SGV = dyn_cast<GlobalValue>(V);
+  if (!SGV)
     return nullptr;
 
-  Function *DF = Function::Create(TypeMap.get(SF->getFunctionType()),
-                                  SF->getLinkage(), SF->getName(), DstM);
-  copyGVAttributes(DF, SF);
+  GlobalValue *DGV = copyGlobalValueProto(TypeMap, *DstM, SGV);
 
-  if (Comdat *SC = SF->getComdat()) {
-    Comdat *DC = DstM->getOrInsertComdat(SC->getName());
-    DF->setComdat(DC);
+  if (Comdat *SC = SGV->getComdat()) {
+    if (auto *DGO = dyn_cast<GlobalObject>(DGV)) {
+      Comdat *DC = DstM->getOrInsertComdat(SC->getName());
+      DGO->setComdat(DC);
+    }
   }
 
-  LazilyLinkFunctions.push_back(SF);
-  return DF;
+  LazilyLinkGlobalValues.push_back(SGV);
+  return DGV;
 }
 
 bool ModuleLinker::getComdatLeader(Module *M, StringRef ComdatName,
@@ -767,73 +809,73 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc,
 /// types 'Foo' but one got renamed when the module was loaded into the same
 /// LLVMContext.
 void ModuleLinker::computeTypeMapping() {
-  // Incorporate globals.
-  for (Module::global_iterator I = SrcM->global_begin(),
-       E = SrcM->global_end(); I != E; ++I) {
-    GlobalValue *DGV = getLinkedToGlobal(I);
-    if (!DGV) continue;
+  for (GlobalValue &SGV : SrcM->globals()) {
+    GlobalValue *DGV = getLinkedToGlobal(&SGV);
+    if (!DGV)
+      continue;
 
-    if (!DGV->hasAppendingLinkage() || !I->hasAppendingLinkage()) {
-      TypeMap.addTypeMapping(DGV->getType(), I->getType());
+    if (!DGV->hasAppendingLinkage() || !SGV.hasAppendingLinkage()) {
+      TypeMap.addTypeMapping(DGV->getType(), SGV.getType());
       continue;
     }
 
     // Unify the element type of appending arrays.
     ArrayType *DAT = cast<ArrayType>(DGV->getType()->getElementType());
-    ArrayType *SAT = cast<ArrayType>(I->getType()->getElementType());
+    ArrayType *SAT = cast<ArrayType>(SGV.getType()->getElementType());
     TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType());
   }
 
-  // Incorporate functions.
-  for (Module::iterator I = SrcM->begin(), E = SrcM->end(); I != E; ++I) {
-    if (GlobalValue *DGV = getLinkedToGlobal(I))
-      TypeMap.addTypeMapping(DGV->getType(), I->getType());
+  for (GlobalValue &SGV : *SrcM) {
+    if (GlobalValue *DGV = getLinkedToGlobal(&SGV))
+      TypeMap.addTypeMapping(DGV->getType(), SGV.getType());
+  }
+
+  for (GlobalValue &SGV : SrcM->aliases()) {
+    if (GlobalValue *DGV = getLinkedToGlobal(&SGV))
+      TypeMap.addTypeMapping(DGV->getType(), SGV.getType());
   }
 
   // Incorporate types by name, scanning all the types in the source module.
   // At this point, the destination module may have a type "%foo = { i32 }" for
   // example.  When the source module got loaded into the same LLVMContext, if
   // it had the same type, it would have been renamed to "%foo.42 = { i32 }".
-  TypeFinder SrcStructTypes;
-  SrcStructTypes.run(*SrcM, true);
-  SmallPtrSet<StructType*, 32> SrcStructTypesSet(SrcStructTypes.begin(),
-                                                 SrcStructTypes.end());
-
-  for (unsigned i = 0, e = SrcStructTypes.size(); i != e; ++i) {
-    StructType *ST = SrcStructTypes[i];
-    if (!ST->hasName()) continue;
+  std::vector<StructType *> Types = SrcM->getIdentifiedStructTypes();
+  for (StructType *ST : Types) {
+    if (!ST->hasName())
+      continue;
 
     // Check to see if there is a dot in the name followed by a digit.
     size_t DotPos = ST->getName().rfind('.');
     if (DotPos == 0 || DotPos == StringRef::npos ||
         ST->getName().back() == '.' ||
-        !isdigit(static_cast<unsigned char>(ST->getName()[DotPos+1])))
+        !isdigit(static_cast<unsigned char>(ST->getName()[DotPos + 1])))
       continue;
 
     // Check to see if the destination module has a struct with the prefix name.
-    if (StructType *DST = DstM->getTypeByName(ST->getName().substr(0, DotPos)))
-      // Don't use it if this actually came from the source module. They're in
-      // the same LLVMContext after all. Also don't use it unless the type is
-      // actually used in the destination module. This can happen in situations
-      // like this:
-      //
-      //      Module A                         Module B
-      //      --------                         --------
-      //   %Z = type { %A }                %B = type { %C.1 }
-      //   %A = type { %B.1, [7 x i8] }    %C.1 = type { i8* }
-      //   %B.1 = type { %C }              %A.2 = type { %B.3, [5 x i8] }
-      //   %C = type { i8* }               %B.3 = type { %C.1 }
-      //
-      // When we link Module B with Module A, the '%B' in Module B is
-      // used. However, that would then use '%C.1'. But when we process '%C.1',
-      // we prefer to take the '%C' version. So we are then left with both
-      // '%C.1' and '%C' being used for the same types. This leads to some
-      // variables using one type and some using the other.
-      if (!SrcStructTypesSet.count(DST) && TypeMap.DstStructTypesSet.count(DST))
-        TypeMap.addTypeMapping(DST, ST);
-  }
+    StructType *DST = DstM->getTypeByName(ST->getName().substr(0, DotPos));
+    if (!DST)
+      continue;
 
-  // Don't bother incorporating aliases, they aren't generally typed well.
+    // Don't use it if this actually came from the source module. They're in
+    // the same LLVMContext after all. Also don't use it unless the type is
+    // actually used in the destination module. This can happen in situations
+    // like this:
+    //
+    //      Module A                         Module B
+    //      --------                         --------
+    //   %Z = type { %A }                %B = type { %C.1 }
+    //   %A = type { %B.1, [7 x i8] }    %C.1 = type { i8* }
+    //   %B.1 = type { %C }              %A.2 = type { %B.3, [5 x i8] }
+    //   %C = type { i8* }               %B.3 = type { %C.1 }
+    //
+    // When we link Module B with Module A, the '%B' in Module B is
+    // used. However, that would then use '%C.1'. But when we process '%C.1',
+    // we prefer to take the '%C' version. So we are then left with both
+    // '%C.1' and '%C' being used for the same types. This leads to some
+    // variables using one type and some using the other.
+    if (TypeMap.DstStructTypesSet.hasType(DST))
+      TypeMap.addTypeMapping(DST, ST);
+  }
 
   // Now that we have discovered all of the type equivalences, get a body for
   // any 'opaque' types in the dest module that are now resolved.
@@ -1030,118 +1072,53 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
     return false;
 
   GlobalValue *NewGV;
-  if (auto *SGVar = dyn_cast<GlobalVariable>(SGV)) {
-    NewGV = linkGlobalVariableProto(SGVar, DGV, LinkFromSrc);
-    if (!NewGV)
-      return true;
-  } else if (auto *SF = dyn_cast<Function>(SGV)) {
-    NewGV = linkFunctionProto(SF, DGV, LinkFromSrc);
+  if (!LinkFromSrc) {
+    NewGV = DGV;
   } else {
-    NewGV = linkGlobalAliasProto(cast<GlobalAlias>(SGV), DGV, LinkFromSrc);
-  }
-
-  if (NewGV) {
-    if (NewGV != DGV)
-      copyGVAttributes(NewGV, SGV);
-
-    NewGV->setUnnamedAddr(HasUnnamedAddr);
-    NewGV->setVisibility(Visibility);
-
-    if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
-      if (C)
-        NewGO->setComdat(C);
+    // If the GV is to be lazily linked, don't create it just yet.
+    // The ValueMaterializerTy will deal with creating it if it's used.
+    if (!DGV && (SGV->hasLocalLinkage() || SGV->hasLinkOnceLinkage() ||
+                 SGV->hasAvailableExternallyLinkage())) {
+      DoNotLinkFromSource.insert(SGV);
+      return false;
     }
 
-    // Make sure to remember this mapping.
-    if (NewGV != DGV) {
-      if (DGV) {
-        DGV->replaceAllUsesWith(
-            ConstantExpr::getBitCast(NewGV, DGV->getType()));
-        DGV->eraseFromParent();
-      }
-      ValueMap[SGV] = NewGV;
-    }
-  }
+    NewGV = copyGlobalValueProto(TypeMap, *DstM, SGV);
 
-  return false;
-}
+    if (DGV && isa<Function>(DGV))
+      if (auto *NewF = dyn_cast<Function>(NewGV))
+        OverridingFunctions.insert(NewF);
+  }
 
-/// Loop through the global variables in the src module and merge them into the
-/// dest module.
-GlobalValue *ModuleLinker::linkGlobalVariableProto(const GlobalVariable *SGVar,
-                                                   GlobalValue *DGV,
-                                                   bool LinkFromSrc) {
-  unsigned Alignment = 0;
-  bool ClearConstant = false;
+  NewGV->setUnnamedAddr(HasUnnamedAddr);
+  NewGV->setVisibility(Visibility);
 
-  if (DGV) {
-    if (DGV->hasCommonLinkage() && SGVar->hasCommonLinkage())
-      Alignment = std::max(SGVar->getAlignment(), DGV->getAlignment());
+  if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
+    if (C)
+      NewGO->setComdat(C);
 
-    auto *DGVar = dyn_cast<GlobalVariable>(DGV);
-    if (!SGVar->isConstant() || (DGVar && !DGVar->isConstant()))
-      ClearConstant = true;
+    if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage())
+      NewGO->setAlignment(std::max(DGV->getAlignment(), SGV->getAlignment()));
   }
 
-  if (!LinkFromSrc) {
-    if (auto *NewGVar = dyn_cast<GlobalVariable>(DGV)) {
-      if (Alignment)
-        NewGVar->setAlignment(Alignment);
-      if (NewGVar->isDeclaration() && ClearConstant)
-        NewGVar->setConstant(false);
-    }
-    return DGV;
+  if (auto *NewGVar = dyn_cast<GlobalVariable>(NewGV)) {
+    auto *DGVar = dyn_cast_or_null<GlobalVariable>(DGV);
+    auto *SGVar = dyn_cast<GlobalVariable>(SGV);
+    if (DGVar && SGVar && DGVar->isDeclaration() && SGVar->isDeclaration() &&
+        (!DGVar->isConstant() || !SGVar->isConstant()))
+      NewGVar->setConstant(false);
   }
 
-  // No linking to be performed or linking from the source: simply create an
-  // identical version of the symbol over in the dest module... the
-  // initializer will be filled in later by LinkGlobalInits.
-  GlobalVariable *NewDGV = new GlobalVariable(
-      *DstM, TypeMap.get(SGVar->getType()->getElementType()),
-      SGVar->isConstant(), SGVar->getLinkage(), /*init*/ nullptr,
-      SGVar->getName(), /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
-      SGVar->getType()->getAddressSpace());
-
-  if (Alignment)
-    NewDGV->setAlignment(Alignment);
-
-  return NewDGV;
-}
-
-/// Link the function in the source module into the destination module if
-/// needed, setting up mapping information.
-GlobalValue *ModuleLinker::linkFunctionProto(const Function *SF,
-                                             GlobalValue *DGV,
-                                             bool LinkFromSrc) {
-  if (!LinkFromSrc)
-    return DGV;
-
-  // If the function is to be lazily linked, don't create it just yet.
-  // The ValueMaterializerTy will deal with creating it if it's used.
-  if (!DGV && (SF->hasLocalLinkage() || SF->hasLinkOnceLinkage() ||
-               SF->hasAvailableExternallyLinkage())) {
-    DoNotLinkFromSource.insert(SF);
-    return nullptr;
+  // Make sure to remember this mapping.
+  if (NewGV != DGV) {
+    if (DGV) {
+      DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewGV, DGV->getType()));
+      DGV->eraseFromParent();
+    }
+    ValueMap[SGV] = NewGV;
   }
 
-  // If there is no linkage to be performed or we are linking from the source,
-  // bring SF over.
-  return Function::Create(TypeMap.get(SF->getFunctionType()), SF->getLinkage(),
-                          SF->getName(), DstM);
-}
-
-/// Set up prototypes for any aliases that come over from the source module.
-GlobalValue *ModuleLinker::linkGlobalAliasProto(const GlobalAlias *SGA,
-                                                GlobalValue *DGV,
-                                                bool LinkFromSrc) {
-  if (!LinkFromSrc)
-    return DGV;
-
-  // If there is no linkage to be performed or we're linking from the source,
-  // bring over SGA.
-  auto *PTy = cast<PointerType>(TypeMap.get(SGA->getType()));
-  return GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
-                             SGA->getLinkage(), SGA->getName(), DstM);
+  return false;
 }
 
 static void getArrayElements(const Constant *C,
@@ -1186,70 +1163,80 @@ void ModuleLinker::linkAppendingVarInit(const AppendingVarInfo &AVI) {
 
 /// Update the initializers in the Dest module now that all globals that may be
 /// referenced are in Dest.
-void ModuleLinker::linkGlobalInits() {
-  // Loop over all of the globals in the src module, mapping them over as we go
-  for (Module::const_global_iterator I = SrcM->global_begin(),
-       E = SrcM->global_end(); I != E; ++I) {
-
-    // Only process initialized GV's or ones not already in dest.
-    if (!I->hasInitializer() || DoNotLinkFromSource.count(I)) continue;
-
-    // Grab destination global variable.
-    GlobalVariable *DGV = cast<GlobalVariable>(ValueMap[I]);
-    // Figure out what the initializer looks like in the dest module.
-    DGV->setInitializer(MapValue(I->getInitializer(), ValueMap,
-                                 RF_None, &TypeMap, &ValMaterializer));
-  }
+void ModuleLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) {
+  // Figure out what the initializer looks like in the dest module.
+  Dst.setInitializer(MapValue(Src.getInitializer(), ValueMap, RF_None, &TypeMap,
+                              &ValMaterializer));
 }
 
 /// Copy the source function over into the dest function and fix up references
 /// to values. At this point we know that Dest is an external function, and
 /// that Src is not.
-void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) {
-  assert(Src && Dst && Dst->isDeclaration() && !Src->isDeclaration());
+bool ModuleLinker::linkFunctionBody(Function &Dst, Function &Src) {
+  assert(Dst.isDeclaration() && !Src.isDeclaration());
+
+  // Materialize if needed.
+  if (std::error_code EC = Src.materialize())
+    return emitError(EC.message());
+
+  // Link in the prefix data.
+  if (Src.hasPrefixData())
+    Dst.setPrefixData(MapValue(Src.getPrefixData(), ValueMap, RF_None, &TypeMap,
+                               &ValMaterializer));
+
+  // Link in the prologue data.
+  if (Src.hasPrologueData())
+    Dst.setPrologueData(MapValue(Src.getPrologueData(), ValueMap, RF_None,
+                                 &TypeMap, &ValMaterializer));
 
   // Go through and convert function arguments over, remembering the mapping.
-  Function::arg_iterator DI = Dst->arg_begin();
-  for (Function::arg_iterator I = Src->arg_begin(), E = Src->arg_end();
-       I != E; ++I, ++DI) {
-    DI->setName(I->getName());  // Copy the name over.
+  Function::arg_iterator DI = Dst.arg_begin();
+  for (Argument &Arg : Src.args()) {
+    DI->setName(Arg.getName());  // Copy the name over.
 
     // Add a mapping to our mapping.
-    ValueMap[I] = DI;
+    ValueMap[&Arg] = DI;
+    ++DI;
   }
 
   // Splice the body of the source function into the dest function.
-  Dst->getBasicBlockList().splice(Dst->end(), Src->getBasicBlockList());
+  Dst.getBasicBlockList().splice(Dst.end(), Src.getBasicBlockList());
 
   // At this point, all of the instructions and values of the function are now
   // copied over.  The only problem is that they are still referencing values in
   // the Source function as operands.  Loop through all of the operands of the
   // functions and patch them up to point to the local versions.
-  for (Function::iterator BB = Dst->begin(), BE = Dst->end(); BB != BE; ++BB)
-    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
-      RemapInstruction(I, ValueMap, RF_IgnoreMissingEntries, &TypeMap,
+  for (BasicBlock &BB : Dst)
+    for (Instruction &I : BB)
+      RemapInstruction(&I, ValueMap, RF_IgnoreMissingEntries, &TypeMap,
                        &ValMaterializer);
 
   // There is no need to map the arguments anymore.
-  for (Function::arg_iterator I = Src->arg_begin(), E = Src->arg_end();
-       I != E; ++I)
-    ValueMap.erase(I);
+  for (Argument &Arg : Src.args())
+    ValueMap.erase(&Arg);
+
+  Src.Dematerialize();
+  return false;
+}
 
+void ModuleLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) {
+  Constant *Aliasee = Src.getAliasee();
+  Constant *Val =
+      MapValue(Aliasee, ValueMap, RF_None, &TypeMap, &ValMaterializer);
+  Dst.setAliasee(Val);
 }
 
-/// Insert all of the aliases in Src into the Dest module.
-void ModuleLinker::linkAliasBodies() {
-  for (Module::alias_iterator I = SrcM->alias_begin(), E = SrcM->alias_end();
-       I != E; ++I) {
-    if (DoNotLinkFromSource.count(I))
-      continue;
-    if (Constant *Aliasee = I->getAliasee()) {
-      GlobalAlias *DA = cast<GlobalAlias>(ValueMap[I]);
-      Constant *Val =
-          MapValue(Aliasee, ValueMap, RF_None, &TypeMap, &ValMaterializer);
-      DA->setAliasee(Val);
-    }
+bool ModuleLinker::linkGlobalValueBody(GlobalValue &Src) {
+  Value *Dst = ValueMap[&Src];
+  assert(Dst);
+  if (auto *F = dyn_cast<Function>(&Src))
+    return linkFunctionBody(cast<Function>(*Dst), *F);
+  if (auto *GVar = dyn_cast<GlobalVariable>(&Src)) {
+    linkGlobalInit(cast<GlobalVariable>(*Dst), *GVar);
+    return false;
   }
+  linkAliasBody(cast<GlobalAlias>(*Dst), cast<GlobalAlias>(Src));
+  return false;
 }
 
 /// Insert all of the named MDNodes in Src into the Dest module.
@@ -1262,8 +1249,50 @@ void ModuleLinker::linkNamedMDNodes() {
     NamedMDNode *DestNMD = DstM->getOrInsertNamedMetadata(I->getName());
     // Add Src elements into Dest node.
     for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
-      DestNMD->addOperand(MapValue(I->getOperand(i), ValueMap,
-                                   RF_None, &TypeMap, &ValMaterializer));
+      DestNMD->addOperand(MapMetadata(I->getOperand(i), ValueMap, RF_None,
+                                      &TypeMap, &ValMaterializer));
+  }
+}
+
+/// Drop DISubprograms that have been superseded.
+///
+/// FIXME: this creates an asymmetric result: we strip losing subprograms from
+/// DstM, but leave losing subprograms in SrcM.  Instead we should also strip
+/// losers from SrcM, but this requires extra plumbing in MapMetadata.
+void ModuleLinker::stripReplacedSubprograms() {
+  // Avoid quadratic runtime by returning early when there's nothing to do.
+  if (OverridingFunctions.empty())
+    return;
+
+  // Move the functions now, so the set gets cleared even on early returns.
+  auto Functions = std::move(OverridingFunctions);
+  OverridingFunctions.clear();
+
+  // Drop subprograms whose functions have been overridden by the new compile
+  // unit.
+  NamedMDNode *CompileUnits = DstM->getNamedMetadata("llvm.dbg.cu");
+  if (!CompileUnits)
+    return;
+  for (unsigned I = 0, E = CompileUnits->getNumOperands(); I != E; ++I) {
+    DICompileUnit CU(CompileUnits->getOperand(I));
+    assert(CU && "Expected valid compile unit");
+
+    DITypedArray<DISubprogram> SPs(CU.getSubprograms());
+    assert(SPs && "Expected valid subprogram array");
+
+    SmallVector<Metadata *, 16> NewSPs;
+    NewSPs.reserve(SPs.getNumElements());
+    for (unsigned S = 0, SE = SPs.getNumElements(); S != SE; ++S) {
+      DISubprogram SP = SPs.getElement(S);
+      if (SP && SP.getFunction() && Functions.count(SP.getFunction()))
+        continue;
+
+      NewSPs.push_back(SP);
+    }
+
+    // Redirect operand to the overriding subprogram.
+    if (NewSPs.size() != SPs.getNumElements())
+      CU.replaceSubprograms(DIArray(MDNode::get(DstM->getContext(), NewSPs)));
   }
 }
 
@@ -1284,17 +1313,17 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
   }
 
   // First build a map of the existing module flags and requirements.
-  DenseMap<MDString*, MDNode*> Flags;
+  DenseMap<MDString *, std::pair<MDNode *, unsigned>> Flags;
   SmallSetVector<MDNode*, 16> Requirements;
   for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) {
     MDNode *Op = DstModFlags->getOperand(I);
-    ConstantInt *Behavior = cast<ConstantInt>(Op->getOperand(0));
+    ConstantInt *Behavior = mdconst::extract<ConstantInt>(Op->getOperand(0));
     MDString *ID = cast<MDString>(Op->getOperand(1));
 
     if (Behavior->getZExtValue() == Module::Require) {
       Requirements.insert(cast<MDNode>(Op->getOperand(2)));
     } else {
-      Flags[ID] = Op;
+      Flags[ID] = std::make_pair(Op, I);
     }
   }
 
@@ -1303,9 +1332,12 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
   bool HasErr = false;
   for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) {
     MDNode *SrcOp = SrcModFlags->getOperand(I);
-    ConstantInt *SrcBehavior = cast<ConstantInt>(SrcOp->getOperand(0));
+    ConstantInt *SrcBehavior =
+        mdconst::extract<ConstantInt>(SrcOp->getOperand(0));
     MDString *ID = cast<MDString>(SrcOp->getOperand(1));
-    MDNode *DstOp = Flags.lookup(ID);
+    MDNode *DstOp;
+    unsigned DstIndex;
+    std::tie(DstOp, DstIndex) = Flags.lookup(ID);
     unsigned SrcBehaviorValue = SrcBehavior->getZExtValue();
 
     // If this is a requirement, add it and continue.
@@ -1320,13 +1352,14 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
 
     // If there is no existing flag with this ID, just add it.
     if (!DstOp) {
-      Flags[ID] = SrcOp;
+      Flags[ID] = std::make_pair(SrcOp, DstModFlags->getNumOperands());
       DstModFlags->addOperand(SrcOp);
       continue;
     }
 
     // Otherwise, perform a merge.
-    ConstantInt *DstBehavior = cast<ConstantInt>(DstOp->getOperand(0));
+    ConstantInt *DstBehavior =
+        mdconst::extract<ConstantInt>(DstOp->getOperand(0));
     unsigned DstBehaviorValue = DstBehavior->getZExtValue();
 
     // If either flag has override behavior, handle it first.
@@ -1340,8 +1373,8 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
       continue;
     } else if (SrcBehaviorValue == Module::Override) {
       // Update the destination flag to that of the source.
-      DstOp->replaceOperandWith(0, SrcBehavior);
-      DstOp->replaceOperandWith(2, SrcOp->getOperand(2));
+      DstModFlags->setOperand(DstIndex, SrcOp);
+      Flags[ID].first = SrcOp;
       continue;
     }
 
@@ -1352,6 +1385,13 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
       continue;
     }
 
+    auto replaceDstValue = [&](MDNode *New) {
+      Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New};
+      MDNode *Flag = MDNode::get(DstM->getContext(), FlagOps);
+      DstModFlags->setOperand(DstIndex, Flag);
+      Flags[ID].first = Flag;
+    };
+
     // Perform the merge for standard behavior types.
     switch (SrcBehaviorValue) {
     case Module::Require:
@@ -1375,29 +1415,23 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
     case Module::Append: {
       MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2));
       MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2));
-      unsigned NumOps = DstValue->getNumOperands() + SrcValue->getNumOperands();
-      Value **VP, **Values = VP = new Value*[NumOps];
-      for (unsigned i = 0, e = DstValue->getNumOperands(); i != e; ++i, ++VP)
-        *VP = DstValue->getOperand(i);
-      for (unsigned i = 0, e = SrcValue->getNumOperands(); i != e; ++i, ++VP)
-        *VP = SrcValue->getOperand(i);
-      DstOp->replaceOperandWith(2, MDNode::get(DstM->getContext(),
-                                               ArrayRef<Value*>(Values,
-                                                                NumOps)));
-      delete[] Values;
+      SmallVector<Metadata *, 8> MDs;
+      MDs.reserve(DstValue->getNumOperands() + SrcValue->getNumOperands());
+      MDs.append(DstValue->op_begin(), DstValue->op_end());
+      MDs.append(SrcValue->op_begin(), SrcValue->op_end());
+
+      replaceDstValue(MDNode::get(DstM->getContext(), MDs));
       break;
     }
     case Module::AppendUnique: {
-      SmallSetVector<Value*, 16> Elts;
+      SmallSetVector<Metadata *, 16> Elts;
       MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2));
       MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2));
-      for (unsigned i = 0, e = DstValue->getNumOperands(); i != e; ++i)
-        Elts.insert(DstValue->getOperand(i));
-      for (unsigned i = 0, e = SrcValue->getNumOperands(); i != e; ++i)
-        Elts.insert(SrcValue->getOperand(i));
-      DstOp->replaceOperandWith(2, MDNode::get(DstM->getContext(),
-                                               ArrayRef<Value*>(Elts.begin(),
-                                                                Elts.end())));
+      Elts.insert(DstValue->op_begin(), DstValue->op_end());
+      Elts.insert(SrcValue->op_begin(), SrcValue->op_end());
+
+      replaceDstValue(MDNode::get(DstM->getContext(),
+                                  makeArrayRef(Elts.begin(), Elts.end())));
       break;
     }
     }
@@ -1407,9 +1441,9 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
   for (unsigned I = 0, E = Requirements.size(); I != E; ++I) {
     MDNode *Requirement = Requirements[I];
     MDString *Flag = cast<MDString>(Requirement->getOperand(0));
-    Value *ReqValue = Requirement->getOperand(1);
+    Metadata *ReqValue = Requirement->getOperand(1);
 
-    MDNode *Op = Flags[Flag];
+    MDNode *Op = Flags[Flag].first;
     if (!Op || Op->getOperand(2) != ReqValue) {
       HasErr |= emitError("linking module flags '" + Flag->getString() +
                           "': does not have the required value");
@@ -1420,6 +1454,28 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
   return HasErr;
 }
 
+// This function returns true if the triples match.
+static bool triplesMatch(const Triple &T0, const Triple &T1) {
+  // If vendor is apple, ignore the version number.
+  if (T0.getVendor() == Triple::Apple)
+    return T0.getArch() == T1.getArch() &&
+           T0.getSubArch() == T1.getSubArch() &&
+           T0.getVendor() == T1.getVendor() &&
+           T0.getOS() == T1.getOS();
+
+  return T0 == T1;
+}
+
+// This function returns the merged triple.
+static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple) {
+  // If vendor is apple, pick the triple with the larger version number.
+  if (SrcTriple.getVendor() == Triple::Apple)
+    if (DstTriple.isOSVersionLT(SrcTriple))
+      return SrcTriple.str();
+
+  return DstTriple.str();
+}
+
 bool ModuleLinker::run() {
   assert(DstM && "Null destination module");
   assert(SrcM && "Null source module");
@@ -1429,10 +1485,6 @@ bool ModuleLinker::run() {
   if (!DstM->getDataLayout() && SrcM->getDataLayout())
     DstM->setDataLayout(SrcM->getDataLayout());
 
-  // Copy the target triple from the source to dest if the dest's is empty.
-  if (DstM->getTargetTriple().empty() && !SrcM->getTargetTriple().empty())
-    DstM->setTargetTriple(SrcM->getTargetTriple());
-
   if (SrcM->getDataLayout() && DstM->getDataLayout() &&
       *SrcM->getDataLayout() != *DstM->getDataLayout()) {
     emitWarning("Linking two modules of different data layouts: '" +
@@ -1441,14 +1493,21 @@ bool ModuleLinker::run() {
                 DstM->getModuleIdentifier() + "' is '" +
                 DstM->getDataLayoutStr() + "'\n");
   }
-  if (!SrcM->getTargetTriple().empty() &&
-      DstM->getTargetTriple() != SrcM->getTargetTriple()) {
+
+  // Copy the target triple from the source to dest if the dest's is empty.
+  if (DstM->getTargetTriple().empty() && !SrcM->getTargetTriple().empty())
+    DstM->setTargetTriple(SrcM->getTargetTriple());
+
+  Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM->getTargetTriple());
+
+  if (!SrcM->getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple))
     emitWarning("Linking two modules of different target triples: " +
                 SrcM->getModuleIdentifier() + "' is '" +
                 SrcM->getTargetTriple() + "' whereas '" +
                 DstM->getModuleIdentifier() + "' is '" +
                 DstM->getTargetTriple() + "'\n");
-  }
+
+  DstM->setTargetTriple(mergeTriples(SrcTriple, DstTriple));
 
   // Append the module inline asm string.
   if (!SrcM->getModuleInlineAsm().empty()) {
@@ -1502,33 +1561,39 @@ bool ModuleLinker::run() {
   for (unsigned i = 0, e = AppendingVars.size(); i != e; ++i)
     linkAppendingVarInit(AppendingVars[i]);
 
+  for (const auto &Entry : DstM->getComdatSymbolTable()) {
+    const Comdat &C = Entry.getValue();
+    if (C.getSelectionKind() == Comdat::Any)
+      continue;
+    const GlobalValue *GV = SrcM->getNamedValue(C.getName());
+    assert(GV);
+    MapValue(GV, ValueMap, RF_None, &TypeMap, &ValMaterializer);
+  }
+
   // Link in the function bodies that are defined in the source module into
   // DstM.
-  for (Module::iterator SF = SrcM->begin(), E = SrcM->end(); SF != E; ++SF) {
-    // Skip if not linking from source.
-    if (DoNotLinkFromSource.count(SF)) continue;
-
-    Function *DF = cast<Function>(ValueMap[SF]);
-    if (SF->hasPrefixData()) {
-      // Link in the prefix data.
-      DF->setPrefixData(MapValue(
-          SF->getPrefixData(), ValueMap, RF_None, &TypeMap, &ValMaterializer));
-    }
-
-    // Materialize if needed.
-    if (std::error_code EC = SF->materialize())
-      return emitError(EC.message());
-
+  for (Function &SF : *SrcM) {
     // Skip if no body (function is external).
-    if (SF->isDeclaration())
+    if (SF.isDeclaration())
+      continue;
+
+    // Skip if not linking from source.
+    if (DoNotLinkFromSource.count(&SF))
       continue;
 
-    linkFunctionBody(DF, SF);
-    SF->Dematerialize();
+    if (linkGlobalValueBody(SF))
+      return true;
   }
 
   // Resolve all uses of aliases with aliasees.
-  linkAliasBodies();
+  for (GlobalAlias &Src : SrcM->aliases()) {
+    if (DoNotLinkFromSource.count(&Src))
+      continue;
+    linkGlobalValueBody(Src);
+  }
+
+  // Strip replaced subprograms before linking together compile units.
+  stripReplacedSubprograms();
 
   // Remap all of the named MDNodes in Src into the DstM module. We do this
   // after linking GlobalValues so that MDNodes that reference GlobalValues
@@ -1541,57 +1606,106 @@ bool ModuleLinker::run() {
 
   // Update the initializers in the DstM module now that all globals that may
   // be referenced are in DstM.
-  linkGlobalInits();
+  for (GlobalVariable &Src : SrcM->globals()) {
+    // Only process initialized GV's or ones not already in dest.
+    if (!Src.hasInitializer() || DoNotLinkFromSource.count(&Src))
+      continue;
+    linkGlobalValueBody(Src);
+  }
 
   // Process vector of lazily linked in functions.
-  bool LinkedInAnyFunctions;
-  do {
-    LinkedInAnyFunctions = false;
-
-    for(std::vector<Function*>::iterator I = LazilyLinkFunctions.begin(),
-        E = LazilyLinkFunctions.end(); I != E; ++I) {
-      Function *SF = *I;
-      if (!SF)
-        continue;
+  while (!LazilyLinkGlobalValues.empty()) {
+    GlobalValue *SGV = LazilyLinkGlobalValues.back();
+    LazilyLinkGlobalValues.pop_back();
 
-      Function *DF = cast<Function>(ValueMap[SF]);
-      if (SF->hasPrefixData()) {
-        // Link in the prefix data.
-        DF->setPrefixData(MapValue(SF->getPrefixData(),
-                                   ValueMap,
-                                   RF_None,
-                                   &TypeMap,
-                                   &ValMaterializer));
-      }
+    assert(!SGV->isDeclaration() && "users should not pass down decls");
+    if (linkGlobalValueBody(*SGV))
+      return true;
+  }
+
+  return false;
+}
 
-      // Materialize if needed.
-      if (std::error_code EC = SF->materialize())
-        return emitError(EC.message());
+Linker::StructTypeKeyInfo::KeyTy::KeyTy(ArrayRef<Type *> E, bool P)
+    : ETypes(E), IsPacked(P) {}
 
-      // Skip if no body (function is external).
-      if (SF->isDeclaration())
-        continue;
+Linker::StructTypeKeyInfo::KeyTy::KeyTy(const StructType *ST)
+    : ETypes(ST->elements()), IsPacked(ST->isPacked()) {}
+
+bool Linker::StructTypeKeyInfo::KeyTy::operator==(const KeyTy &That) const {
+  if (IsPacked != That.IsPacked)
+    return false;
+  if (ETypes != That.ETypes)
+    return false;
+  return true;
+}
 
-      // Erase from vector *before* the function body is linked - linkFunctionBody could
-      // invalidate I.
-      LazilyLinkFunctions.erase(I);
+bool Linker::StructTypeKeyInfo::KeyTy::operator!=(const KeyTy &That) const {
+  return !this->operator==(That);
+}
 
-      // Link in function body.
-      linkFunctionBody(DF, SF);
-      SF->Dematerialize();
+StructType *Linker::StructTypeKeyInfo::getEmptyKey() {
+  return DenseMapInfo<StructType *>::getEmptyKey();
+}
 
-      // Set flag to indicate we may have more functions to lazily link in
-      // since we linked in a function.
-      LinkedInAnyFunctions = true;
-      break;
-    }
-  } while (LinkedInAnyFunctions);
+StructType *Linker::StructTypeKeyInfo::getTombstoneKey() {
+  return DenseMapInfo<StructType *>::getTombstoneKey();
+}
 
-  // Now that all of the types from the source are used, resolve any structs
-  // copied over to the dest that didn't exist there.
-  TypeMap.linkDefinedTypeBodies();
+unsigned Linker::StructTypeKeyInfo::getHashValue(const KeyTy &Key) {
+  return hash_combine(hash_combine_range(Key.ETypes.begin(), Key.ETypes.end()),
+                      Key.IsPacked);
+}
 
-  return false;
+unsigned Linker::StructTypeKeyInfo::getHashValue(const StructType *ST) {
+  return getHashValue(KeyTy(ST));
+}
+
+bool Linker::StructTypeKeyInfo::isEqual(const KeyTy &LHS,
+                                        const StructType *RHS) {
+  if (RHS == getEmptyKey() || RHS == getTombstoneKey())
+    return false;
+  return LHS == KeyTy(RHS);
+}
+
+bool Linker::StructTypeKeyInfo::isEqual(const StructType *LHS,
+                                        const StructType *RHS) {
+  if (RHS == getEmptyKey())
+    return LHS == getEmptyKey();
+
+  if (RHS == getTombstoneKey())
+    return LHS == getTombstoneKey();
+
+  return KeyTy(LHS) == KeyTy(RHS);
+}
+
+void Linker::IdentifiedStructTypeSet::addNonOpaque(StructType *Ty) {
+  assert(!Ty->isOpaque());
+  NonOpaqueStructTypes.insert(Ty);
+}
+
+void Linker::IdentifiedStructTypeSet::addOpaque(StructType *Ty) {
+  assert(Ty->isOpaque());
+  OpaqueStructTypes.insert(Ty);
+}
+
+StructType *
+Linker::IdentifiedStructTypeSet::findNonOpaque(ArrayRef<Type *> ETypes,
+                                               bool IsPacked) {
+  Linker::StructTypeKeyInfo::KeyTy Key(ETypes, IsPacked);
+  auto I = NonOpaqueStructTypes.find_as(Key);
+  if (I == NonOpaqueStructTypes.end())
+    return nullptr;
+  return *I;
+}
+
+bool Linker::IdentifiedStructTypeSet::hasType(StructType *Ty) {
+  if (Ty->isOpaque())
+    return OpaqueStructTypes.count(Ty);
+  auto I = NonOpaqueStructTypes.find(Ty);
+  if (I == NonOpaqueStructTypes.end())
+    return false;
+  return *I == Ty;
 }
 
 void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) {
@@ -1600,7 +1714,12 @@ void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) {
 
   TypeFinder StructTypes;
   StructTypes.run(*M, true);
-  IdentifiedStructTypes.insert(StructTypes.begin(), StructTypes.end());
+  for (StructType *Ty : StructTypes) {
+    if (Ty->isOpaque())
+      IdentifiedStructTypes.addOpaque(Ty);
+    else
+      IdentifiedStructTypes.addNonOpaque(Ty);
+  }
 }
 
 Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) {
@@ -1624,7 +1743,13 @@ void Linker::deleteModule() {
 bool Linker::linkInModule(Module *Src) {
   ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src,
                          DiagnosticHandler);
-  return TheLinker.run();
+  bool RetCode = TheLinker.run();
+  Composite->dropTriviallyDeadConstantArrays();
+  return RetCode;
+}
+
+void Linker::setModule(Module *Dst) {
+  init(Dst, DiagnosticHandler);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1652,7 +1777,7 @@ bool Linker::LinkModules(Module *Dest, Module *Src) {
 //===----------------------------------------------------------------------===//
 
 LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src,
-                         LLVMLinkerMode Mode, char **OutMessages) {
+                         unsigned Unused, char **OutMessages) {
   Module *D = unwrap(Dest);
   std::string Message;
   raw_string_ostream Stream(Message);
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 7181bdc..ddddd49 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -46,6 +46,9 @@ add_llvm_library(LLVMMC
   WinCOFFObjectWriter.cpp
   WinCOFFStreamer.cpp
   YAML.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/MC
   )
 
 add_subdirectory(MCParser)
diff --git a/lib/MC/ConstantPools.cpp b/lib/MC/ConstantPools.cpp
index c4cea60..a239a8f 100644
--- a/lib/MC/ConstantPools.cpp
+++ b/lib/MC/ConstantPools.cpp
@@ -11,10 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 #include "llvm/ADT/MapVector.h"
+#include "llvm/MC/ConstantPools.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/ConstantPools.h"
 
 using namespace llvm;
 //
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index e4442e1..4819905 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -31,8 +31,8 @@
 #include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Support/Compression.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/Endian.h"
 #include "llvm/Support/ELF.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <vector>
 using namespace llvm;
@@ -219,7 +219,7 @@ class ELFObjectWriter : public MCObjectWriter {
                                   const MCSymbolData *SD, uint64_t C,
                                   unsigned Type) const;
 
-    void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
+    void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
                           const MCFragment *Fragment, const MCFixup &Fixup,
                           MCValue Target, bool &IsPCRel,
                           uint64_t &FixedValue) override;
@@ -247,12 +247,12 @@ class ELFObjectWriter : public MCObjectWriter {
                             const RevGroupMapTy &RevGroupMap,
                             unsigned NumRegularSections);
 
-    void ComputeIndexMap(MCAssembler &Asm,
+    void computeIndexMap(MCAssembler &Asm,
                          SectionIndexMapTy &SectionIndexMap,
-                         const RelMapTy &RelMap);
+                         RelMapTy &RelMap);
 
-    void CreateRelocationSections(MCAssembler &Asm, MCAsmLayout &Layout,
-                                  RelMapTy &RelMap);
+    MCSectionData *createRelocationSection(MCAssembler &Asm,
+                                           const MCSectionData &SD);
 
     void CompressDebugSections(MCAssembler &Asm, MCAsmLayout &Layout);
 
@@ -260,23 +260,23 @@ class ELFObjectWriter : public MCObjectWriter {
                           const RelMapTy &RelMap);
 
     void CreateMetadataSections(MCAssembler &Asm, MCAsmLayout &Layout,
-                                SectionIndexMapTy &SectionIndexMap,
-                                const RelMapTy &RelMap);
+                                SectionIndexMapTy &SectionIndexMap);
 
     // Create the sections that show up in the symbol table. Currently
     // those are the .note.GNU-stack section and the group sections.
-    void CreateIndexedSections(MCAssembler &Asm, MCAsmLayout &Layout,
+    void createIndexedSections(MCAssembler &Asm, MCAsmLayout &Layout,
                                GroupMapTy &GroupMap,
                                RevGroupMapTy &RevGroupMap,
                                SectionIndexMapTy &SectionIndexMap,
-                               const RelMapTy &RelMap);
+                               RelMapTy &RelMap);
 
     void ExecutePostLayoutBinding(MCAssembler &Asm,
                                   const MCAsmLayout &Layout) override;
 
-    void WriteSectionHeader(MCAssembler &Asm, const GroupMapTy &GroupMap,
+    void writeSectionHeader(MCAssembler &Asm, const GroupMapTy &GroupMap,
                             const MCAsmLayout &Layout,
                             const SectionIndexMapTy &SectionIndexMap,
+                            const RelMapTy &RelMap,
                             const SectionOffsetMapTy &SectionOffsetMap);
 
     void ComputeSectionOrder(MCAssembler &Asm,
@@ -299,8 +299,9 @@ class ELFObjectWriter : public MCObjectWriter {
                                            bool IsPCRel) const override;
 
     void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
-    void WriteSection(MCAssembler &Asm,
+    void writeSection(MCAssembler &Asm,
                       const SectionIndexMapTy &SectionIndexMap,
+                      const RelMapTy &RelMap,
                       uint32_t GroupSymbolIndex,
                       uint64_t Offset, uint64_t Size, uint64_t Alignment,
                       const MCSectionELF &Section);
@@ -325,8 +326,7 @@ void SymbolTableWriter::createSymtabShndx() {
 
   MCContext &Ctx = Asm.getContext();
   const MCSectionELF *SymtabShndxSection =
-      Ctx.getELFSection(".symtab_shndxr", ELF::SHT_SYMTAB_SHNDX, 0,
-                        SectionKind::getReadOnly(), 4, "");
+      Ctx.getELFSection(".symtab_shndxr", ELF::SHT_SYMTAB_SHNDX, 0, 4, "");
   MCSectionData *SymtabShndxSD =
       &Asm.getOrCreateSectionData(*SymtabShndxSection);
   SymtabShndxSD->setAlignment(4);
@@ -789,13 +789,11 @@ static const MCSymbol *getWeakRef(const MCSymbolRefExpr &Ref) {
   return nullptr;
 }
 
-void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm,
+void ELFObjectWriter::RecordRelocation(MCAssembler &Asm,
                                        const MCAsmLayout &Layout,
                                        const MCFragment *Fragment,
-                                       const MCFixup &Fixup,
-                                       MCValue Target,
-                                       bool &IsPCRel,
-                                       uint64_t &FixedValue) {
+                                       const MCFixup &Fixup, MCValue Target,
+                                       bool &IsPCRel, uint64_t &FixedValue) {
   const MCSectionData *FixupSection = Fragment->getParent();
   uint64_t C = Target.getConstant();
   uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
@@ -946,9 +944,9 @@ bool ELFObjectWriter::isLocal(const MCSymbolData &Data, bool isUsedInReloc) {
   return true;
 }
 
-void ELFObjectWriter::ComputeIndexMap(MCAssembler &Asm,
+void ELFObjectWriter::computeIndexMap(MCAssembler &Asm,
                                       SectionIndexMapTy &SectionIndexMap,
-                                      const RelMapTy &RelMap) {
+                                      RelMapTy &RelMap) {
   unsigned Index = 1;
   for (MCAssembler::iterator it = Asm.begin(),
          ie = Asm.end(); it != ie; ++it) {
@@ -961,16 +959,20 @@ void ELFObjectWriter::ComputeIndexMap(MCAssembler &Asm,
 
   for (MCAssembler::iterator it = Asm.begin(),
          ie = Asm.end(); it != ie; ++it) {
+    const MCSectionData &SD = *it;
     const MCSectionELF &Section =
-      static_cast<const MCSectionELF &>(it->getSection());
+      static_cast<const MCSectionELF &>(SD.getSection());
     if (Section.getType() == ELF::SHT_GROUP ||
         Section.getType() == ELF::SHT_REL ||
         Section.getType() == ELF::SHT_RELA)
       continue;
     SectionIndexMap[&Section] = Index++;
-    const MCSectionELF *RelSection = RelMap.lookup(&Section);
-    if (RelSection)
+    if (MCSectionData *RelSD = createRelocationSection(Asm, SD)) {
+      const MCSectionELF *RelSection =
+          static_cast<const MCSectionELF *>(&RelSD->getSection());
+      RelMap[RelSection] = &Section;
       SectionIndexMap[RelSection] = Index++;
+    }
   }
 }
 
@@ -1035,16 +1037,43 @@ ELFObjectWriter::computeSymbolTable(MCAssembler &Asm, const MCAsmLayout &Layout,
       assert(MSD.SectionIndex && "Invalid section index!");
     }
 
-    // The @@@ in symbol version is replaced with @ in undefined symbols and
-    // @@ in defined ones.
+    // The @@@ in symbol version is replaced with @ in undefined symbols and @@
+    // in defined ones.
+    //
+    // FIXME: All name handling should be done before we get to the writer,
+    // including dealing with GNU-style version suffixes.  Fixing this isn't
+    // trivial.
+    //
+    // We thus have to be careful to not perform the symbol version replacement
+    // blindly:
+    //
+    // The ELF format is used on Windows by the MCJIT engine.  Thus, on
+    // Windows, the ELFObjectWriter can encounter symbols mangled using the MS
+    // Visual Studio C++ name mangling scheme. Symbols mangled using the MSVC
+    // C++ name mangling can legally have "@@@" as a sub-string. In that case,
+    // the EFLObjectWriter should not interpret the "@@@" sub-string as
+    // specifying GNU-style symbol versioning. The ELFObjectWriter therefore
+    // checks for the MSVC C++ name mangling prefix which is either "?", "@?",
+    // "__imp_?" or "__imp_@?".
+    //
+    // It would have been interesting to perform the MS mangling prefix check
+    // only when the target triple is of the form *-pc-windows-elf. But, it
+    // seems that this information is not easily accessible from the
+    // ELFObjectWriter.
     StringRef Name = Symbol.getName();
-    SmallString<32> Buf;
-    size_t Pos = Name.find("@@@");
-    if (Pos != StringRef::npos) {
-      Buf += Name.substr(0, Pos);
-      unsigned Skip = MSD.SectionIndex == ELF::SHN_UNDEF ? 2 : 1;
-      Buf += Name.substr(Pos + Skip);
-      Name = Buf;
+    if (!Name.startswith("?") && !Name.startswith("@?") &&
+        !Name.startswith("__imp_?") && !Name.startswith("__imp_@?")) {
+      // This symbol isn't following the MSVC C++ name mangling convention. We
+      // can thus safely interpret the @@@ in symbol names as specifying symbol
+      // versioning.
+      SmallString<32> Buf;
+      size_t Pos = Name.find("@@@");
+      if (Pos != StringRef::npos) {
+        Buf += Name.substr(0, Pos);
+        unsigned Skip = MSD.SectionIndex == ELF::SHN_UNDEF ? 2 : 1;
+        Buf += Name.substr(Pos + Skip);
+        Name = Buf;
+      }
     }
 
     // Sections have their own string table
@@ -1093,44 +1122,37 @@ ELFObjectWriter::computeSymbolTable(MCAssembler &Asm, const MCAsmLayout &Layout,
     UndefinedSymbolData[i].SymbolData->setIndex(Index++);
 }
 
-void ELFObjectWriter::CreateRelocationSections(MCAssembler &Asm,
-                                               MCAsmLayout &Layout,
-                                               RelMapTy &RelMap) {
-  for (MCAssembler::const_iterator it = Asm.begin(),
-         ie = Asm.end(); it != ie; ++it) {
-    const MCSectionData &SD = *it;
-    if (Relocations[&SD].empty())
-      continue;
-
-    MCContext &Ctx = Asm.getContext();
-    const MCSectionELF &Section =
-      static_cast<const MCSectionELF&>(SD.getSection());
+MCSectionData *
+ELFObjectWriter::createRelocationSection(MCAssembler &Asm,
+                                         const MCSectionData &SD) {
+  if (Relocations[&SD].empty())
+    return nullptr;
 
-    const StringRef SectionName = Section.getSectionName();
-    std::string RelaSectionName = hasRelocationAddend() ? ".rela" : ".rel";
-    RelaSectionName += SectionName;
+  MCContext &Ctx = Asm.getContext();
+  const MCSectionELF &Section =
+      static_cast<const MCSectionELF &>(SD.getSection());
 
-    unsigned EntrySize;
-    if (hasRelocationAddend())
-      EntrySize = is64Bit() ? sizeof(ELF::Elf64_Rela) : sizeof(ELF::Elf32_Rela);
-    else
-      EntrySize = is64Bit() ? sizeof(ELF::Elf64_Rel) : sizeof(ELF::Elf32_Rel);
+  const StringRef SectionName = Section.getSectionName();
+  std::string RelaSectionName = hasRelocationAddend() ? ".rela" : ".rel";
+  RelaSectionName += SectionName;
 
-    unsigned Flags = 0;
-    StringRef Group = "";
-    if (Section.getFlags() & ELF::SHF_GROUP) {
-      Flags = ELF::SHF_GROUP;
-      Group = Section.getGroup()->getName();
-    }
+  unsigned EntrySize;
+  if (hasRelocationAddend())
+    EntrySize = is64Bit() ? sizeof(ELF::Elf64_Rela) : sizeof(ELF::Elf32_Rela);
+  else
+    EntrySize = is64Bit() ? sizeof(ELF::Elf64_Rel) : sizeof(ELF::Elf32_Rel);
 
-    const MCSectionELF *RelaSection =
-      Ctx.getELFSection(RelaSectionName, hasRelocationAddend() ?
-                        ELF::SHT_RELA : ELF::SHT_REL, Flags,
-                        SectionKind::getReadOnly(),
-                        EntrySize, Group);
-    RelMap[&Section] = RelaSection;
-    Asm.getOrCreateSectionData(*RelaSection);
+  unsigned Flags = 0;
+  StringRef Group = "";
+  if (Section.getFlags() & ELF::SHF_GROUP) {
+    Flags = ELF::SHF_GROUP;
+    Group = Section.getGroup()->getName();
   }
+
+  const MCSectionELF *RelaSection = Ctx.getELFSection(
+      RelaSectionName, hasRelocationAddend() ? ELF::SHT_RELA : ELF::SHT_REL,
+      Flags, EntrySize, Group, true);
+  return &Asm.getOrCreateSectionData(*RelaSection);
 }
 
 static SmallVector<char, 128>
@@ -1280,20 +1302,21 @@ void ELFObjectWriter::CompressDebugSections(MCAssembler &Asm,
 
 void ELFObjectWriter::WriteRelocations(MCAssembler &Asm, MCAsmLayout &Layout,
                                        const RelMapTy &RelMap) {
-  for (MCAssembler::const_iterator it = Asm.begin(),
-         ie = Asm.end(); it != ie; ++it) {
-    const MCSectionData &SD = *it;
-    const MCSectionELF &Section =
-      static_cast<const MCSectionELF&>(SD.getSection());
+  for (MCAssembler::iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it) {
+    MCSectionData &RelSD = *it;
+    const MCSectionELF &RelSection =
+        static_cast<const MCSectionELF &>(RelSD.getSection());
 
-    const MCSectionELF *RelaSection = RelMap.lookup(&Section);
-    if (!RelaSection)
+    unsigned Type = RelSection.getType();
+    if (Type != ELF::SHT_REL && Type != ELF::SHT_RELA)
       continue;
-    MCSectionData &RelaSD = Asm.getOrCreateSectionData(*RelaSection);
-    RelaSD.setAlignment(is64Bit() ? 8 : 4);
 
-    MCDataFragment *F = new MCDataFragment(&RelaSD);
-    WriteRelocationsFragment(Asm, F, &*it);
+    const MCSectionELF *Section = RelMap.lookup(&RelSection);
+    MCSectionData &SD = Asm.getOrCreateSectionData(*Section);
+    RelSD.setAlignment(is64Bit() ? 8 : 4);
+
+    MCDataFragment *F = new MCDataFragment(&RelSD);
+    WriteRelocationsFragment(Asm, F, &SD);
   }
 }
 
@@ -1374,10 +1397,8 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm,
   }
 }
 
-void ELFObjectWriter::CreateMetadataSections(MCAssembler &Asm,
-                                             MCAsmLayout &Layout,
-                                             SectionIndexMapTy &SectionIndexMap,
-                                             const RelMapTy &RelMap) {
+void ELFObjectWriter::CreateMetadataSections(
+    MCAssembler &Asm, MCAsmLayout &Layout, SectionIndexMapTy &SectionIndexMap) {
   MCContext &Ctx = Asm.getContext();
   MCDataFragment *F;
 
@@ -1385,29 +1406,26 @@ void ELFObjectWriter::CreateMetadataSections(MCAssembler &Asm,
 
   // We construct .shstrtab, .symtab and .strtab in this order to match gnu as.
   const MCSectionELF *ShstrtabSection =
-    Ctx.getELFSection(".shstrtab", ELF::SHT_STRTAB, 0,
-                      SectionKind::getReadOnly());
+      Ctx.getELFSection(".shstrtab", ELF::SHT_STRTAB, 0);
   MCSectionData &ShstrtabSD = Asm.getOrCreateSectionData(*ShstrtabSection);
   ShstrtabSD.setAlignment(1);
+  ShstrtabIndex = SectionIndexMap.size() + 1;
+  SectionIndexMap[ShstrtabSection] = ShstrtabIndex;
 
   const MCSectionELF *SymtabSection =
     Ctx.getELFSection(".symtab", ELF::SHT_SYMTAB, 0,
-                      SectionKind::getReadOnly(),
                       EntrySize, "");
   MCSectionData &SymtabSD = Asm.getOrCreateSectionData(*SymtabSection);
   SymtabSD.setAlignment(is64Bit() ? 8 : 4);
+  SymbolTableIndex = SectionIndexMap.size() + 1;
+  SectionIndexMap[SymtabSection] = SymbolTableIndex;
 
   const MCSectionELF *StrtabSection;
-  StrtabSection = Ctx.getELFSection(".strtab", ELF::SHT_STRTAB, 0,
-                                    SectionKind::getReadOnly());
+  StrtabSection = Ctx.getELFSection(".strtab", ELF::SHT_STRTAB, 0);
   MCSectionData &StrtabSD = Asm.getOrCreateSectionData(*StrtabSection);
   StrtabSD.setAlignment(1);
-
-  ComputeIndexMap(Asm, SectionIndexMap, RelMap);
-
-  ShstrtabIndex = SectionIndexMap.lookup(ShstrtabSection);
-  SymbolTableIndex = SectionIndexMap.lookup(SymtabSection);
-  StringTableIndex = SectionIndexMap.lookup(StrtabSection);
+  StringTableIndex = SectionIndexMap.size() + 1;
+  SectionIndexMap[StrtabSection] = StringTableIndex;
 
   // Symbol table
   F = new MCDataFragment(&SymtabSD);
@@ -1430,12 +1448,12 @@ void ELFObjectWriter::CreateMetadataSections(MCAssembler &Asm,
                           ShStrTabBuilder.data().end());
 }
 
-void ELFObjectWriter::CreateIndexedSections(MCAssembler &Asm,
+void ELFObjectWriter::createIndexedSections(MCAssembler &Asm,
                                             MCAsmLayout &Layout,
                                             GroupMapTy &GroupMap,
                                             RevGroupMapTy &RevGroupMap,
                                             SectionIndexMapTy &SectionIndexMap,
-                                            const RelMapTy &RelMap) {
+                                            RelMapTy &RelMap) {
   MCContext &Ctx = Asm.getContext();
 
   // Build the groups
@@ -1459,7 +1477,7 @@ void ELFObjectWriter::CreateIndexedSections(MCAssembler &Asm,
     GroupMap[Group] = SignatureSymbol;
   }
 
-  ComputeIndexMap(Asm, SectionIndexMap, RelMap);
+  computeIndexMap(Asm, SectionIndexMap, RelMap);
 
   // Add sections to the groups
   for (MCAssembler::const_iterator it = Asm.begin(), ie = Asm.end();
@@ -1477,8 +1495,9 @@ void ELFObjectWriter::CreateIndexedSections(MCAssembler &Asm,
   }
 }
 
-void ELFObjectWriter::WriteSection(MCAssembler &Asm,
+void ELFObjectWriter::writeSection(MCAssembler &Asm,
                                    const SectionIndexMapTy &SectionIndexMap,
+                                   const RelMapTy &RelMap,
                                    uint32_t GroupSymbolIndex,
                                    uint64_t Offset, uint64_t Size,
                                    uint64_t Alignment,
@@ -1494,23 +1513,9 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm,
 
   case ELF::SHT_REL:
   case ELF::SHT_RELA: {
-    const MCSectionELF *SymtabSection;
-    const MCSectionELF *InfoSection;
-    SymtabSection = Asm.getContext().getELFSection(".symtab", ELF::SHT_SYMTAB,
-                                                   0,
-                                                   SectionKind::getReadOnly());
-    sh_link = SectionIndexMap.lookup(SymtabSection);
+    sh_link = SymbolTableIndex;
     assert(sh_link && ".symtab not found");
-
-    // Remove ".rel" and ".rela" prefixes.
-    unsigned SecNameLen = (Section.getType() == ELF::SHT_REL) ? 4 : 5;
-    StringRef SectionName = Section.getSectionName().substr(SecNameLen);
-    StringRef GroupName =
-        Section.getGroup() ? Section.getGroup()->getName() : "";
-
-    InfoSection = Asm.getContext().getELFSection(SectionName, ELF::SHT_PROGBITS,
-                                                 0, SectionKind::getReadOnly(),
-                                                 0, GroupName);
+    const MCSectionELF *InfoSection = RelMap.find(&Section)->second;
     sh_info = SectionIndexMap.lookup(InfoSection);
     break;
   }
@@ -1554,18 +1559,14 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm,
       Section.getType() == ELF::SHT_ARM_EXIDX) {
     StringRef SecName(Section.getSectionName());
     if (SecName == ".ARM.exidx") {
-      sh_link = SectionIndexMap.lookup(
-        Asm.getContext().getELFSection(".text",
-                                       ELF::SHT_PROGBITS,
-                                       ELF::SHF_EXECINSTR | ELF::SHF_ALLOC,
-                                       SectionKind::getText()));
+      sh_link = SectionIndexMap.lookup(Asm.getContext().getELFSection(
+          ".text", ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC));
     } else if (SecName.startswith(".ARM.exidx")) {
       StringRef GroupName =
           Section.getGroup() ? Section.getGroup()->getName() : "";
       sh_link = SectionIndexMap.lookup(Asm.getContext().getELFSection(
           SecName.substr(sizeof(".ARM.exidx") - 1), ELF::SHT_PROGBITS,
-          ELF::SHF_EXECINSTR | ELF::SHF_ALLOC, SectionKind::getText(), 0,
-          GroupName));
+          ELF::SHF_EXECINSTR | ELF::SHF_ALLOC, 0, GroupName));
     }
   }
 
@@ -1625,11 +1626,10 @@ void ELFObjectWriter::WriteDataSectionData(MCAssembler &Asm,
   }
 }
 
-void ELFObjectWriter::WriteSectionHeader(MCAssembler &Asm,
-                                         const GroupMapTy &GroupMap,
-                                         const MCAsmLayout &Layout,
-                                      const SectionIndexMapTy &SectionIndexMap,
-                                   const SectionOffsetMapTy &SectionOffsetMap) {
+void ELFObjectWriter::writeSectionHeader(
+    MCAssembler &Asm, const GroupMapTy &GroupMap, const MCAsmLayout &Layout,
+    const SectionIndexMapTy &SectionIndexMap, const RelMapTy &RelMap,
+    const SectionOffsetMapTy &SectionOffsetMap) {
   const unsigned NumSections = Asm.size() + 1;
 
   std::vector<const MCSectionELF*> Sections;
@@ -1660,7 +1660,7 @@ void ELFObjectWriter::WriteSectionHeader(MCAssembler &Asm,
 
     uint64_t Size = GetSectionAddressSize(Layout, SD);
 
-    WriteSection(Asm, SectionIndexMap, GroupSymbolIndex,
+    writeSection(Asm, SectionIndexMap, RelMap, GroupSymbolIndex,
                  SectionOffsetMap.lookup(&Section), Size,
                  SD.getAlignment(), Section);
   }
@@ -1707,10 +1707,8 @@ void ELFObjectWriter::WriteObject(MCAssembler &Asm,
   CompressDebugSections(Asm, const_cast<MCAsmLayout &>(Layout));
 
   DenseMap<const MCSectionELF*, const MCSectionELF*> RelMap;
-  CreateRelocationSections(Asm, const_cast<MCAsmLayout&>(Layout), RelMap);
-
   const unsigned NumUserAndRelocSections = Asm.size();
-  CreateIndexedSections(Asm, const_cast<MCAsmLayout&>(Layout), GroupMap,
+  createIndexedSections(Asm, const_cast<MCAsmLayout&>(Layout), GroupMap,
                         RevGroupMap, SectionIndexMap, RelMap);
   const unsigned AllSections = Asm.size();
   const unsigned NumIndexedSections = AllSections - NumUserAndRelocSections;
@@ -1725,8 +1723,7 @@ void ELFObjectWriter::WriteObject(MCAssembler &Asm,
 
   CreateMetadataSections(const_cast<MCAssembler&>(Asm),
                          const_cast<MCAsmLayout&>(Layout),
-                         SectionIndexMap,
-                         RelMap);
+                         SectionIndexMap);
 
   uint64_t NaturalAlignment = is64Bit() ? 8 : 4;
   uint64_t HeaderSize = is64Bit() ? sizeof(ELF::Elf64_Ehdr) :
@@ -1783,7 +1780,7 @@ void ELFObjectWriter::WriteObject(MCAssembler &Asm,
   WriteZeros(Padding);
 
   // ... then the section header table ...
-  WriteSectionHeader(Asm, GroupMap, Layout, SectionIndexMap,
+  writeSectionHeader(Asm, GroupMap, Layout, SectionIndexMap, RelMap,
                      SectionOffsetMap);
 
   // ... and then the remaining sections ...
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 2fb558f..04b8042 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -40,6 +40,7 @@ MCAsmInfo::MCAsmInfo() {
   LabelSuffix = ":";
   UseAssignmentForEHBegin = false;
   PrivateGlobalPrefix = "L";
+  PrivateLabelPrefix = PrivateGlobalPrefix;
   LinkerPrivateGlobalPrefix = "";
   InlineAsmStart = "APP";
   InlineAsmEnd = "NO_APP";
@@ -71,6 +72,7 @@ MCAsmInfo::MCAsmInfo() {
   HasSingleParameterDotFile = true;
   HasIdentDirective = false;
   HasNoDeadStrip = false;
+  WeakDirective = "\t.weak\t";
   WeakRefDirective = nullptr;
   HasWeakDefDirective = false;
   HasWeakDefCanBeHiddenDirective = false;
@@ -107,6 +109,10 @@ MCAsmInfo::MCAsmInfo() {
 MCAsmInfo::~MCAsmInfo() {
 }
 
+bool MCAsmInfo::isSectionAtomizableBySymbols(const MCSection &Section) const {
+  return false;
+}
+
 const MCExpr *
 MCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym,
                                        unsigned Encoding,
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index 66a138b..a2a2504 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -15,10 +15,46 @@
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCStreamer.h"
 using namespace llvm;
 
-void MCAsmInfoDarwin::anchor() { }
+bool MCAsmInfoDarwin::isSectionAtomizableBySymbols(
+    const MCSection &Section) const {
+  const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
+
+  // Sections holding 1 byte strings are atomized based on the data they
+  // contain.
+  // Sections holding 2 byte strings require symbols in order to be atomized.
+  // There is no dedicated section for 4 byte strings.
+  if (SMO.getType() == MachO::S_CSTRING_LITERALS)
+    return false;
+
+  if (SMO.getSegmentName() == "__DATA" && SMO.getSectionName() == "__cfstring")
+    return false;
+
+  if (SMO.getSegmentName() == "__DATA" &&
+      SMO.getSectionName() == "__objc_classrefs")
+    return false;
+
+  switch (SMO.getType()) {
+  default:
+    return true;
+
+  // These sections are atomized at the element boundaries without using
+  // symbols.
+  case MachO::S_4BYTE_LITERALS:
+  case MachO::S_8BYTE_LITERALS:
+  case MachO::S_16BYTE_LITERALS:
+  case MachO::S_LITERAL_POINTERS:
+  case MachO::S_NON_LAZY_SYMBOL_POINTERS:
+  case MachO::S_LAZY_SYMBOL_POINTERS:
+  case MachO::S_MOD_INIT_FUNC_POINTERS:
+  case MachO::S_MOD_TERM_FUNC_POINTERS:
+  case MachO::S_INTERPOSING:
+    return false;
+  }
+}
 
 MCAsmInfoDarwin::MCAsmInfoDarwin() {
   // Common settings for all Darwin targets.
diff --git a/lib/MC/MCAsmInfoELF.cpp b/lib/MC/MCAsmInfoELF.cpp
index 9f70d8d..cd61a43 100644
--- a/lib/MC/MCAsmInfoELF.cpp
+++ b/lib/MC/MCAsmInfoELF.cpp
@@ -22,12 +22,12 @@ void MCAsmInfoELF::anchor() { }
 
 const MCSection *
 MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const {
-  return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS,
-                           0, SectionKind::getMetadata());
+  return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0);
 }
 
 MCAsmInfoELF::MCAsmInfoELF() {
   HasIdentDirective = true;
   WeakRefDirective = "\t.weak\t";
   PrivateGlobalPrefix = ".L";
+  PrivateLabelPrefix = ".L";
 }
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index f60c7fc..2312cd5 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmBackend.h"
@@ -32,7 +32,6 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Path.h"
 #include <cctype>
-#include <unordered_map>
 using namespace llvm;
 
 namespace {
@@ -436,14 +435,18 @@ bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
   case MCSA_Internal:       OS << "\t.internal\t";        break;
   case MCSA_LazyReference:  OS << "\t.lazy_reference\t";  break;
   case MCSA_Local:          OS << "\t.local\t";           break;
-  case MCSA_NoDeadStrip:    OS << "\t.no_dead_strip\t";   break;
+  case MCSA_NoDeadStrip:
+    if (!MAI->hasNoDeadStrip())
+      return false;
+    OS << "\t.no_dead_strip\t";
+    break;
   case MCSA_SymbolResolver: OS << "\t.symbol_resolver\t"; break;
   case MCSA_PrivateExtern:
     OS << "\t.private_extern\t";
     break;
   case MCSA_Protected:      OS << "\t.protected\t";       break;
   case MCSA_Reference:      OS << "\t.reference\t";       break;
-  case MCSA_Weak:           OS << "\t.weak\t";            break;
+  case MCSA_Weak:           OS << MAI->getWeakDirective(); break;
   case MCSA_WeakDefinition:
     OS << "\t.weak_definition\t";
     break;
@@ -682,7 +685,11 @@ void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
       // We truncate our partial emission to fit within the bounds of the
       // emission domain.  This produces nicer output and silences potential
       // truncation warnings when round tripping through another assembler.
-      ValueToEmit &= ~0ULL >> (64 - EmissionSize * 8);
+      uint64_t Shift = 64 - EmissionSize * 8;
+      assert(Shift < static_cast<uint64_t>(
+                         std::numeric_limits<unsigned long long>::digits) &&
+             "undefined behavior");
+      ValueToEmit &= ~0ULL >> Shift;
       EmitIntValue(ValueToEmit, EmissionSize);
       Emitted += EmissionSize;
     }
@@ -865,8 +872,6 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
                                           unsigned Isa,
                                           unsigned Discriminator,
                                           StringRef FileName) {
-  this->MCStreamer::EmitDwarfLocDirective(FileNo, Line, Column, Flags,
-                                          Isa, Discriminator, FileName);
   OS << "\t.loc\t" << FileNo << " " << Line << " " << Column;
   if (Flags & DWARF2_FLAG_BASIC_BLOCK)
     OS << " basic_block";
@@ -896,6 +901,8 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
        << Line << ':' << Column;
   }
   EmitEOL();
+  this->MCStreamer::EmitDwarfLocDirective(FileNo, Line, Column, Flags,
+                                          Isa, Discriminator, FileName);
 }
 
 MCSymbol *MCAsmStreamer::getDwarfLineTableSymbol(unsigned CUID) {
@@ -1249,7 +1256,7 @@ void MCAsmStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &S
 
   // Show the MCInst if enabled.
   if (ShowInst) {
-    Inst.dump_pretty(GetCommentOS(), MAI, InstPrinter.get(), "\n ");
+    Inst.dump_pretty(GetCommentOS(), InstPrinter.get(), "\n ");
     GetCommentOS() << "\n";
   }
 
@@ -1257,7 +1264,7 @@ void MCAsmStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &S
   if (InstPrinter)
     InstPrinter->printInst(&Inst, OS, "");
   else
-    Inst.print(OS, MAI);
+    Inst.print(OS);
   EmitEOL();
 }
 
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 85d0c13..50ce845 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
@@ -20,6 +21,7 @@
 #include "llvm/MC/MCFixupKindInfo.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/Debug.h"
@@ -27,7 +29,6 @@
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/MC/MCSectionELF.h"
 #include <tuple>
 using namespace llvm;
 
@@ -200,7 +201,17 @@ const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const {
   if (!A)
     return nullptr;
 
-  return &A->getSymbol();
+  const MCSymbol &ASym = A->getSymbol();
+  const MCAssembler &Asm = getAssembler();
+  const MCSymbolData &ASD = Asm.getSymbolData(ASym);
+  if (ASD.isCommon()) {
+    // FIXME: we should probably add a SMLoc to MCExpr.
+    Asm.getContext().FatalError(SMLoc(),
+                                "Common symbol " + ASym.getName() +
+                                    " cannot be used in assignment expr");
+  }
+
+  return &ASym;
 }
 
 uint64_t MCAsmLayout::getSectionAddressSize(const MCSectionData *SD) const {
@@ -424,6 +435,16 @@ bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const {
   return true;
 }
 
+void MCAssembler::addLocalUsedInReloc(const MCSymbol &Sym) {
+  assert(Sym.isTemporary());
+  LocalsUsedInReloc.insert(&Sym);
+}
+
+bool MCAssembler::isLocalUsedInReloc(const MCSymbol &Sym) const {
+  assert(Sym.isTemporary());
+  return LocalsUsedInReloc.count(&Sym);
+}
+
 bool MCAssembler::isSymbolLinkerVisible(const MCSymbol &Symbol) const {
   // Non-temporary labels should always be visible to the linker.
   if (!Symbol.isTemporary())
@@ -433,8 +454,10 @@ bool MCAssembler::isSymbolLinkerVisible(const MCSymbol &Symbol) const {
   if (!Symbol.isInSection())
     return false;
 
-  // Otherwise, check if the section requires symbols even for temporary labels.
-  return getBackend().doesSectionRequireSymbols(Symbol.getSection());
+  if (isLocalUsedInReloc(Symbol))
+    return true;
+
+  return false;
 }
 
 const MCSymbolData *MCAssembler::getAtom(const MCSymbolData *SD) const {
@@ -448,8 +471,8 @@ const MCSymbolData *MCAssembler::getAtom(const MCSymbolData *SD) const {
 
   // Non-linker visible symbols in sections which can't be atomized have no
   // defining atom.
-  if (!getBackend().isSectionAtomizable(
-        SD->getFragment()->getParent()->getSection()))
+  if (!getContext().getAsmInfo()->isSectionAtomizableBySymbols(
+          SD->getFragment()->getParent()->getSection()))
     return nullptr;
 
   // Otherwise, return the atom for the containing fragment.
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index 8630b25..721edd4 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -130,6 +130,11 @@ MCSymbol *MCContext::getOrCreateSectionSymbol(const MCSectionELF &Section) {
   return Sym;
 }
 
+MCSymbol *MCContext::getOrCreateFrameAllocSymbol(StringRef FuncName) {
+  return GetOrCreateSymbol(Twine(MAI->getPrivateGlobalPrefix()) +
+                           "frameallocation_" + FuncName);
+}
+
 MCSymbol *MCContext::CreateSymbol(StringRef Name) {
   // Determine whether this is an assembler temporary or normal label, if used.
   bool isTemporary = false;
@@ -247,10 +252,9 @@ getMachOSection(StringRef Segment, StringRef Section,
                                             Reserved2, Kind);
 }
 
-const MCSectionELF *MCContext::
-getELFSection(StringRef Section, unsigned Type, unsigned Flags,
-              SectionKind Kind) {
-  return getELFSection(Section, Type, Flags, Kind, 0, "");
+const MCSectionELF *MCContext::getELFSection(StringRef Section, unsigned Type,
+                                             unsigned Flags) {
+  return getELFSection(Section, Type, Flags, 0, "");
 }
 
 void MCContext::renameELFSection(const MCSectionELF *Section, StringRef Name) {
@@ -266,35 +270,45 @@ void MCContext::renameELFSection(const MCSectionELF *Section, StringRef Name) {
   const_cast<MCSectionELF*>(Section)->setSectionName(CachedName);
 }
 
-const MCSectionELF *MCContext::
-getELFSection(StringRef Section, unsigned Type, unsigned Flags,
-              SectionKind Kind, unsigned EntrySize, StringRef Group) {
+const MCSectionELF *MCContext::getELFSection(StringRef Section, unsigned Type,
+                                             unsigned Flags, unsigned EntrySize,
+                                             StringRef Group, bool Unique) {
   // Do the lookup, if we have a hit, return it.
   auto IterBool = ELFUniquingMap.insert(
       std::make_pair(SectionGroupPair(Section, Group), nullptr));
   auto &Entry = *IterBool.first;
-  if (!IterBool.second) return Entry.second;
-
-  // Possibly refine the entry size first.
-  if (!EntrySize) {
-    EntrySize = MCSectionELF::DetermineEntrySize(Kind);
-  }
+  if (!IterBool.second && !Unique)
+    return Entry.second;
 
   MCSymbol *GroupSym = nullptr;
   if (!Group.empty())
     GroupSym = GetOrCreateSymbol(Group);
 
   StringRef CachedName = Entry.first.first;
+
+  SectionKind Kind;
+  if (Flags & ELF::SHF_EXECINSTR)
+    Kind = SectionKind::getText();
+  else
+    Kind = SectionKind::getReadOnly();
+
   MCSectionELF *Result = new (*this)
-      MCSectionELF(CachedName, Type, Flags, Kind, EntrySize, GroupSym);
-  Entry.second = Result;
+      MCSectionELF(CachedName, Type, Flags, Kind, EntrySize, GroupSym, Unique);
+  if (!Unique)
+    Entry.second = Result;
   return Result;
 }
 
+const MCSectionELF *MCContext::getELFSection(StringRef Section, unsigned Type,
+                                             unsigned Flags, unsigned EntrySize,
+                                             StringRef Group) {
+  return getELFSection(Section, Type, Flags, EntrySize, Group, false);
+}
+
 const MCSectionELF *MCContext::CreateELFGroupSection() {
   MCSectionELF *Result =
-    new (*this) MCSectionELF(".group", ELF::SHT_GROUP, 0,
-                             SectionKind::getReadOnly(), 4, nullptr);
+      new (*this) MCSectionELF(".group", ELF::SHT_GROUP, 0,
+                               SectionKind::getReadOnly(), 4, nullptr, false);
   return Result;
 }
 
diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp
index d0d7f30..d9f01d0 100644
--- a/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/lib/MC/MCDisassembler/Disassembler.cpp
@@ -77,7 +77,7 @@ LLVMCreateDisasmCPUFeatures(const char *Triple, const char *CPU,
     return nullptr;
 
   std::unique_ptr<MCSymbolizer> Symbolizer(TheTarget->createMCSymbolizer(
-      Triple, GetOpInfo, SymbolLookUp, DisInfo, Ctx, RelInfo.release()));
+      Triple, GetOpInfo, SymbolLookUp, DisInfo, Ctx, std::move(RelInfo)));
   DisAsm->setSymbolizer(std::move(Symbolizer));
 
   // Set up the instruction printer.
@@ -151,10 +151,10 @@ static void emitComments(LLVMDisasmContext *DC,
   DC->CommentStream.resync();
 }
 
-/// \brief Gets latency information for \p Inst form the itinerary
+/// \brief Gets latency information for \p Inst from the itinerary
 /// scheduling model, based on \p DC information.
 /// \return The maximum expected latency over all the operands or -1
-/// if no information are available.
+/// if no information is available.
 static int getItineraryLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
   const int NoInformationAvailable = -1;
 
@@ -179,7 +179,7 @@ static int getItineraryLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
 
 /// \brief Gets latency information for \p Inst, based on \p DC information.
 /// \return The maximum expected latency over all the definitions or -1
-/// if no information are available.
+/// if no information is available.
 static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
   // Try to compute scheduling information.
   const MCSubtargetInfo *STI = DC->getSubtargetInfo();
@@ -220,7 +220,7 @@ static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
 static void emitLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
   int Latency = getLatency(DC, Inst);
 
-  // Report only interesting latency.
+  // Report only interesting latencies.
   if (Latency < 2)
     return;
 
diff --git a/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp b/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp
index 0145623..f306e4e 100644
--- a/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp
+++ b/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp
@@ -186,13 +186,11 @@ void MCExternalSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
 namespace llvm {
 MCSymbolizer *createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
                                  LLVMSymbolLookupCallback SymbolLookUp,
-                                 void *DisInfo,
-                                 MCContext *Ctx,
-                                 MCRelocationInfo *RelInfo) {
+                                 void *DisInfo, MCContext *Ctx,
+                                 std::unique_ptr<MCRelocationInfo> &&RelInfo) {
   assert(Ctx && "No MCContext given for symbolic disassembly");
 
-  return new MCExternalSymbolizer(*Ctx,
-                                  std::unique_ptr<MCRelocationInfo>(RelInfo),
-                                  GetOpInfo, SymbolLookUp, DisInfo);
+  return new MCExternalSymbolizer(*Ctx, std::move(RelInfo), GetOpInfo,
+                                  SymbolLookUp, DisInfo);
 }
 }
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index 5effb01..5d96914 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -1045,11 +1045,16 @@ static void emitEncodingByte(MCObjectStreamer &Streamer, unsigned Encoding) {
 void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer,
                                           const MCCFIInstruction &Instr) {
   int dataAlignmentFactor = getDataAlignmentFactor(Streamer);
+  auto *MRI = Streamer.getContext().getRegisterInfo();
 
   switch (Instr.getOperation()) {
   case MCCFIInstruction::OpRegister: {
     unsigned Reg1 = Instr.getRegister();
     unsigned Reg2 = Instr.getRegister2();
+    if (!IsEH) {
+      Reg1 = MRI->getDwarfRegNum(MRI->getLLVMRegNum(Reg1, true), false);
+      Reg2 = MRI->getDwarfRegNum(MRI->getLLVMRegNum(Reg2, true), false);
+    }
     Streamer.EmitIntValue(dwarf::DW_CFA_register, 1);
     Streamer.EmitULEB128IntValue(Reg1);
     Streamer.EmitULEB128IntValue(Reg2);
@@ -1082,8 +1087,11 @@ void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer,
     return;
   }
   case MCCFIInstruction::OpDefCfa: {
+    unsigned Reg = Instr.getRegister();
+    if (!IsEH)
+      Reg = MRI->getDwarfRegNum(MRI->getLLVMRegNum(Reg, true), false);
     Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa, 1);
-    Streamer.EmitULEB128IntValue(Instr.getRegister());
+    Streamer.EmitULEB128IntValue(Reg);
     CFAOffset = -Instr.getOffset();
     Streamer.EmitULEB128IntValue(CFAOffset);
 
@@ -1091,8 +1099,11 @@ void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer,
   }
 
   case MCCFIInstruction::OpDefCfaRegister: {
+    unsigned Reg = Instr.getRegister();
+    if (!IsEH)
+      Reg = MRI->getDwarfRegNum(MRI->getLLVMRegNum(Reg, true), false);
     Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa_register, 1);
-    Streamer.EmitULEB128IntValue(Instr.getRegister());
+    Streamer.EmitULEB128IntValue(Reg);
 
     return;
   }
@@ -1103,6 +1114,9 @@ void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer,
       Instr.getOperation() == MCCFIInstruction::OpRelOffset;
 
     unsigned Reg = Instr.getRegister();
+    if (!IsEH)
+      Reg = MRI->getDwarfRegNum(MRI->getLLVMRegNum(Reg, true), false);
+
     int Offset = Instr.getOffset();
     if (IsRelative)
       Offset -= CFAOffset;
@@ -1136,6 +1150,8 @@ void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer,
   }
   case MCCFIInstruction::OpRestore: {
     unsigned Reg = Instr.getRegister();
+    if (!IsEH)
+      Reg = MRI->getDwarfRegNum(MRI->getLLVMRegNum(Reg, true), false);
     Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1);
     return;
   }
@@ -1290,10 +1306,10 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer,
   if (CIEVersion == 1) {
     assert(MRI->getRARegister() <= 255 &&
            "DWARF 2 encodes return_address_register in one byte");
-    streamer.EmitIntValue(MRI->getDwarfRegNum(MRI->getRARegister(), true), 1);
+    streamer.EmitIntValue(MRI->getDwarfRegNum(MRI->getRARegister(), IsEH), 1);
   } else {
     streamer.EmitULEB128IntValue(
-        MRI->getDwarfRegNum(MRI->getRARegister(), true));
+        MRI->getDwarfRegNum(MRI->getRARegister(), IsEH));
   }
 
   // Augmentation Data Length (optional)
diff --git a/lib/MC/MCELF.cpp b/lib/MC/MCELF.cpp
index 386c209..3690634 100644
--- a/lib/MC/MCELF.cpp
+++ b/lib/MC/MCELF.cpp
@@ -21,7 +21,7 @@ namespace llvm {
 
 void MCELF::SetBinding(MCSymbolData &SD, unsigned Binding) {
   assert(Binding == ELF::STB_LOCAL || Binding == ELF::STB_GLOBAL ||
-         Binding == ELF::STB_WEAK);
+         Binding == ELF::STB_WEAK || Binding == ELF::STB_GNU_UNIQUE);
   uint32_t OtherFlags = SD.getFlags() & ~(0xf << ELF_STB_Shift);
   SD.setFlags(OtherFlags | (Binding << ELF_STB_Shift));
 }
@@ -29,7 +29,7 @@ void MCELF::SetBinding(MCSymbolData &SD, unsigned Binding) {
 unsigned MCELF::GetBinding(const MCSymbolData &SD) {
   uint32_t Binding = (SD.getFlags() & (0xf << ELF_STB_Shift)) >> ELF_STB_Shift;
   assert(Binding == ELF::STB_LOCAL || Binding == ELF::STB_GLOBAL ||
-         Binding == ELF::STB_WEAK);
+         Binding == ELF::STB_WEAK || Binding == ELF::STB_GNU_UNIQUE);
   return Binding;
 }
 
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index bdc4a84..199825e 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -171,10 +171,16 @@ bool MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     return false;
 
   case MCSA_NoDeadStrip:
-  case MCSA_ELF_TypeGnuUniqueObject:
     // Ignore for now.
     break;
 
+  case MCSA_ELF_TypeGnuUniqueObject:
+    MCELF::SetType(SD, CombineSymbolTypes(MCELF::GetType(SD), ELF::STT_OBJECT));
+    MCELF::SetBinding(SD, ELF::STB_GNU_UNIQUE);
+    SD.setExternal(true);
+    BindingExplicitlySet.insert(Symbol);
+    break;
+
   case MCSA_Global:
     MCELF::SetBinding(SD, ELF::STB_GLOBAL);
     SD.setExternal(true);
@@ -253,11 +259,8 @@ void MCELFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
   MCELF::SetType(SD, ELF::STT_OBJECT);
 
   if (MCELF::GetBinding(SD) == ELF_STB_Local) {
-    const MCSection *Section = getAssembler().getContext().getELFSection(".bss",
-                                                         ELF::SHT_NOBITS,
-                                                         ELF::SHF_WRITE |
-                                                         ELF::SHF_ALLOC,
-                                                         SectionKind::getBSS());
+    const MCSection *Section = getAssembler().getContext().getELFSection(
+        ".bss", ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
 
     AssignSection(Symbol, Section);
 
@@ -312,8 +315,7 @@ void MCELFStreamer::EmitFileDirective(StringRef Filename) {
 
 void MCELFStreamer::EmitIdent(StringRef IdentString) {
   const MCSection *Comment = getAssembler().getContext().getELFSection(
-      ".comment", ELF::SHT_PROGBITS, ELF::SHF_MERGE | ELF::SHF_STRINGS,
-      SectionKind::getReadOnly(), 1, "");
+      ".comment", ELF::SHT_PROGBITS, ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
   PushSection();
   SwitchSection(Comment);
   if (!SeenIdent) {
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index 6e648b2..709dc6b 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -197,6 +197,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
   case VK_ARM_TARGET1: return "target1";
   case VK_ARM_TARGET2: return "target2";
   case VK_ARM_PREL31: return "prel31";
+  case VK_ARM_SBREL: return "sbrel";
   case VK_ARM_TLSLDO: return "tlsldo";
   case VK_ARM_TLSCALL: return "tlscall";
   case VK_ARM_TLSDESC: return "tlsdesc";
@@ -286,164 +287,87 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
 
 MCSymbolRefExpr::VariantKind
 MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
-  return StringSwitch<VariantKind>(Name)
-    .Case("GOT", VK_GOT)
+  return StringSwitch<VariantKind>(Name.lower())
     .Case("got", VK_GOT)
-    .Case("GOTOFF", VK_GOTOFF)
     .Case("gotoff", VK_GOTOFF)
-    .Case("GOTPCREL", VK_GOTPCREL)
     .Case("gotpcrel", VK_GOTPCREL)
-    .Case("GOT_PREL", VK_GOTPCREL)
     .Case("got_prel", VK_GOTPCREL)
-    .Case("GOTTPOFF", VK_GOTTPOFF)
     .Case("gottpoff", VK_GOTTPOFF)
-    .Case("INDNTPOFF", VK_INDNTPOFF)
     .Case("indntpoff", VK_INDNTPOFF)
-    .Case("NTPOFF", VK_NTPOFF)
     .Case("ntpoff", VK_NTPOFF)
-    .Case("GOTNTPOFF", VK_GOTNTPOFF)
     .Case("gotntpoff", VK_GOTNTPOFF)
-    .Case("PLT", VK_PLT)
     .Case("plt", VK_PLT)
-    .Case("TLSGD", VK_TLSGD)
     .Case("tlsgd", VK_TLSGD)
-    .Case("TLSLD", VK_TLSLD)
     .Case("tlsld", VK_TLSLD)
-    .Case("TLSLDM", VK_TLSLDM)
     .Case("tlsldm", VK_TLSLDM)
-    .Case("TPOFF", VK_TPOFF)
     .Case("tpoff", VK_TPOFF)
-    .Case("DTPOFF", VK_DTPOFF)
     .Case("dtpoff", VK_DTPOFF)
-    .Case("TLVP", VK_TLVP)
     .Case("tlvp", VK_TLVP)
-    .Case("TLVPPAGE", VK_TLVPPAGE)
     .Case("tlvppage", VK_TLVPPAGE)
-    .Case("TLVPPAGEOFF", VK_TLVPPAGEOFF)
     .Case("tlvppageoff", VK_TLVPPAGEOFF)
-    .Case("PAGE", VK_PAGE)
     .Case("page", VK_PAGE)
-    .Case("PAGEOFF", VK_PAGEOFF)
     .Case("pageoff", VK_PAGEOFF)
-    .Case("GOTPAGE", VK_GOTPAGE)
     .Case("gotpage", VK_GOTPAGE)
-    .Case("GOTPAGEOFF", VK_GOTPAGEOFF)
     .Case("gotpageoff", VK_GOTPAGEOFF)
-    .Case("IMGREL", VK_COFF_IMGREL32)
     .Case("imgrel", VK_COFF_IMGREL32)
-    .Case("SECREL32", VK_SECREL)
     .Case("secrel32", VK_SECREL)
-    .Case("L", VK_PPC_LO)
     .Case("l", VK_PPC_LO)
-    .Case("H", VK_PPC_HI)
     .Case("h", VK_PPC_HI)
-    .Case("HA", VK_PPC_HA)
     .Case("ha", VK_PPC_HA)
-    .Case("HIGHER", VK_PPC_HIGHER)
     .Case("higher", VK_PPC_HIGHER)
-    .Case("HIGHERA", VK_PPC_HIGHERA)
     .Case("highera", VK_PPC_HIGHERA)
-    .Case("HIGHEST", VK_PPC_HIGHEST)
     .Case("highest", VK_PPC_HIGHEST)
-    .Case("HIGHESTA", VK_PPC_HIGHESTA)
     .Case("highesta", VK_PPC_HIGHESTA)
-    .Case("GOT@L", VK_PPC_GOT_LO)
     .Case("got@l", VK_PPC_GOT_LO)
-    .Case("GOT@H", VK_PPC_GOT_HI)
     .Case("got@h", VK_PPC_GOT_HI)
-    .Case("GOT@HA", VK_PPC_GOT_HA)
     .Case("got@ha", VK_PPC_GOT_HA)
-    .Case("TOCBASE", VK_PPC_TOCBASE)
+    .Case("local", VK_PPC_LOCAL)
     .Case("tocbase", VK_PPC_TOCBASE)
-    .Case("TOC", VK_PPC_TOC)
     .Case("toc", VK_PPC_TOC)
-    .Case("TOC@L", VK_PPC_TOC_LO)
     .Case("toc@l", VK_PPC_TOC_LO)
-    .Case("TOC@H", VK_PPC_TOC_HI)
     .Case("toc@h", VK_PPC_TOC_HI)
-    .Case("TOC@HA", VK_PPC_TOC_HA)
     .Case("toc@ha", VK_PPC_TOC_HA)
-    .Case("TLS", VK_PPC_TLS)
     .Case("tls", VK_PPC_TLS)
-    .Case("DTPMOD", VK_PPC_DTPMOD)
     .Case("dtpmod", VK_PPC_DTPMOD)
-    .Case("TPREL", VK_PPC_TPREL)
     .Case("tprel", VK_PPC_TPREL)
-    .Case("TPREL@L", VK_PPC_TPREL_LO)
     .Case("tprel@l", VK_PPC_TPREL_LO)
-    .Case("TPREL@H", VK_PPC_TPREL_HI)
     .Case("tprel@h", VK_PPC_TPREL_HI)
-    .Case("TPREL@HA", VK_PPC_TPREL_HA)
     .Case("tprel@ha", VK_PPC_TPREL_HA)
-    .Case("TPREL@HIGHER", VK_PPC_TPREL_HIGHER)
     .Case("tprel@higher", VK_PPC_TPREL_HIGHER)
-    .Case("TPREL@HIGHERA", VK_PPC_TPREL_HIGHERA)
     .Case("tprel@highera", VK_PPC_TPREL_HIGHERA)
-    .Case("TPREL@HIGHEST", VK_PPC_TPREL_HIGHEST)
     .Case("tprel@highest", VK_PPC_TPREL_HIGHEST)
-    .Case("TPREL@HIGHESTA", VK_PPC_TPREL_HIGHESTA)
     .Case("tprel@highesta", VK_PPC_TPREL_HIGHESTA)
-    .Case("DTPREL", VK_PPC_DTPREL)
     .Case("dtprel", VK_PPC_DTPREL)
-    .Case("DTPREL@L", VK_PPC_DTPREL_LO)
     .Case("dtprel@l", VK_PPC_DTPREL_LO)
-    .Case("DTPREL@H", VK_PPC_DTPREL_HI)
     .Case("dtprel@h", VK_PPC_DTPREL_HI)
-    .Case("DTPREL@HA", VK_PPC_DTPREL_HA)
     .Case("dtprel@ha", VK_PPC_DTPREL_HA)
-    .Case("DTPREL@HIGHER", VK_PPC_DTPREL_HIGHER)
     .Case("dtprel@higher", VK_PPC_DTPREL_HIGHER)
-    .Case("DTPREL@HIGHERA", VK_PPC_DTPREL_HIGHERA)
     .Case("dtprel@highera", VK_PPC_DTPREL_HIGHERA)
-    .Case("DTPREL@HIGHEST", VK_PPC_DTPREL_HIGHEST)
     .Case("dtprel@highest", VK_PPC_DTPREL_HIGHEST)
-    .Case("DTPREL@HIGHESTA", VK_PPC_DTPREL_HIGHESTA)
     .Case("dtprel@highesta", VK_PPC_DTPREL_HIGHESTA)
-    .Case("GOT@TPREL", VK_PPC_GOT_TPREL)
     .Case("got@tprel", VK_PPC_GOT_TPREL)
-    .Case("GOT@TPREL@L", VK_PPC_GOT_TPREL_LO)
     .Case("got@tprel@l", VK_PPC_GOT_TPREL_LO)
-    .Case("GOT@TPREL@H", VK_PPC_GOT_TPREL_HI)
     .Case("got@tprel@h", VK_PPC_GOT_TPREL_HI)
-    .Case("GOT@TPREL@HA", VK_PPC_GOT_TPREL_HA)
     .Case("got@tprel@ha", VK_PPC_GOT_TPREL_HA)
-    .Case("GOT@DTPREL", VK_PPC_GOT_DTPREL)
     .Case("got@dtprel", VK_PPC_GOT_DTPREL)
-    .Case("GOT@DTPREL@L", VK_PPC_GOT_DTPREL_LO)
     .Case("got@dtprel@l", VK_PPC_GOT_DTPREL_LO)
-    .Case("GOT@DTPREL@H", VK_PPC_GOT_DTPREL_HI)
     .Case("got@dtprel@h", VK_PPC_GOT_DTPREL_HI)
-    .Case("GOT@DTPREL@HA", VK_PPC_GOT_DTPREL_HA)
     .Case("got@dtprel@ha", VK_PPC_GOT_DTPREL_HA)
-    .Case("GOT@TLSGD", VK_PPC_GOT_TLSGD)
     .Case("got@tlsgd", VK_PPC_GOT_TLSGD)
-    .Case("GOT@TLSGD@L", VK_PPC_GOT_TLSGD_LO)
     .Case("got@tlsgd@l", VK_PPC_GOT_TLSGD_LO)
-    .Case("GOT@TLSGD@H", VK_PPC_GOT_TLSGD_HI)
     .Case("got@tlsgd@h", VK_PPC_GOT_TLSGD_HI)
-    .Case("GOT@TLSGD@HA", VK_PPC_GOT_TLSGD_HA)
     .Case("got@tlsgd@ha", VK_PPC_GOT_TLSGD_HA)
-    .Case("GOT@TLSLD", VK_PPC_GOT_TLSLD)
     .Case("got@tlsld", VK_PPC_GOT_TLSLD)
-    .Case("GOT@TLSLD@L", VK_PPC_GOT_TLSLD_LO)
     .Case("got@tlsld@l", VK_PPC_GOT_TLSLD_LO)
-    .Case("GOT@TLSLD@H", VK_PPC_GOT_TLSLD_HI)
     .Case("got@tlsld@h", VK_PPC_GOT_TLSLD_HI)
-    .Case("GOT@TLSLD@HA", VK_PPC_GOT_TLSLD_HA)
     .Case("got@tlsld@ha", VK_PPC_GOT_TLSLD_HA)
-    .Case("NONE", VK_ARM_NONE)
     .Case("none", VK_ARM_NONE)
-    .Case("TARGET1", VK_ARM_TARGET1)
     .Case("target1", VK_ARM_TARGET1)
-    .Case("TARGET2", VK_ARM_TARGET2)
     .Case("target2", VK_ARM_TARGET2)
-    .Case("PREL31", VK_ARM_PREL31)
     .Case("prel31", VK_ARM_PREL31)
-    .Case("TLSLDO", VK_ARM_TLSLDO)
+    .Case("sbrel", VK_ARM_SBREL)
     .Case("tlsldo", VK_ARM_TLSLDO)
-    .Case("TLSCALL", VK_ARM_TLSCALL)
     .Case("tlscall", VK_ARM_TLSCALL)
-    .Case("TLSDESC", VK_ARM_TLSDESC)
     .Case("tlsdesc", VK_ARM_TLSDESC)
     .Default(VK_Invalid);
 }
diff --git a/lib/MC/MCInst.cpp b/lib/MC/MCInst.cpp
index d7b80f5..7ef69be 100644
--- a/lib/MC/MCInst.cpp
+++ b/lib/MC/MCInst.cpp
@@ -15,7 +15,7 @@
 
 using namespace llvm;
 
-void MCOperand::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
+void MCOperand::print(raw_ostream &OS) const {
   OS << "<MCOperand ";
   if (!isValid())
     OS << "INVALID";
@@ -34,22 +34,21 @@ void MCOperand::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void MCOperand::dump() const {
-  print(dbgs(), nullptr);
+  print(dbgs());
   dbgs() << "\n";
 }
 #endif
 
-void MCInst::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
+void MCInst::print(raw_ostream &OS) const {
   OS << "<MCInst " << getOpcode();
   for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
     OS << " ";
-    getOperand(i).print(OS, MAI);
+    getOperand(i).print(OS);
   }
   OS << ">";
 }
 
-void MCInst::dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI,
-                         const MCInstPrinter *Printer,
+void MCInst::dump_pretty(raw_ostream &OS, const MCInstPrinter *Printer,
                          StringRef Separator) const {
   OS << "<MCInst #" << getOpcode();
 
@@ -59,14 +58,14 @@ void MCInst::dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI,
 
   for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
     OS << Separator;
-    getOperand(i).print(OS, MAI);
+    getOperand(i).print(OS);
   }
   OS << ">";
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void MCInst::dump() const {
-  print(dbgs(), nullptr);
+  print(dbgs());
   dbgs() << "\n";
 }
 #endif
diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp
index ba71245..0dc3121 100644
--- a/lib/MC/MCInstPrinter.cpp
+++ b/lib/MC/MCInstPrinter.cpp
@@ -69,11 +69,11 @@ static bool needsLeadingZero(uint64_t Value)
   return false;
 }
 
-format_object1<int64_t> MCInstPrinter::formatDec(const int64_t Value) const {
+format_object<int64_t> MCInstPrinter::formatDec(int64_t Value) const {
   return format("%" PRId64, Value);
 }
 
-format_object1<int64_t> MCInstPrinter::formatHex(const int64_t Value) const {
+format_object<int64_t> MCInstPrinter::formatHex(int64_t Value) const {
   switch(PrintHexStyle) {
   case HexStyle::C:
     if (Value < 0)
@@ -96,7 +96,7 @@ format_object1<int64_t> MCInstPrinter::formatHex(const int64_t Value) const {
   llvm_unreachable("unsupported print style");
 }
 
-format_object1<uint64_t> MCInstPrinter::formatHex(const uint64_t Value) const {
+format_object<uint64_t> MCInstPrinter::formatHex(uint64_t Value) const {
   switch(PrintHexStyle) {
   case HexStyle::C:
      return format("0x%" PRIx64, Value);
diff --git a/lib/MC/MCLinkerOptimizationHint.cpp b/lib/MC/MCLinkerOptimizationHint.cpp
index 3f8d620..7739878 100644
--- a/lib/MC/MCLinkerOptimizationHint.cpp
+++ b/lib/MC/MCLinkerOptimizationHint.cpp
@@ -9,8 +9,8 @@
 
 #include "llvm/MC/MCLinkerOptimizationHint.h"
 #include "llvm/MC/MCAsmLayout.h"
-#include "llvm/Support/LEB128.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/LEB128.h"
 
 using namespace llvm;
 
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index a147c3d..79eab49 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -183,7 +183,7 @@ void MCMachOStreamer::EmitDataRegionEnd() {
   if (!getAssembler().getBackend().hasDataInCodeSupport())
     return;
   std::vector<DataRegionData> &Regions = getAssembler().getDataRegions();
-  assert(Regions.size() && "Mismatched .end_data_region!");
+  assert(!Regions.empty() && "Mismatched .end_data_region!");
   DataRegionData &Data = Regions.back();
   assert(!Data.End && "Mismatched .end_data_region!");
   // Create a temporary label to mark the end of the data region.
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index 1b88462..11c9cc2 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -273,6 +273,12 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
   case Triple::mips64el:
     FDECFIEncoding = dwarf::DW_EH_PE_sdata8;
     break;
+  case Triple::x86_64:
+    FDECFIEncoding = dwarf::DW_EH_PE_pcrel |
+                     ((CMModel == CodeModel::Large) ? dwarf::DW_EH_PE_sdata8
+                                                    : dwarf::DW_EH_PE_sdata4);
+
+    break;
   default:
     FDECFIEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
     break;
@@ -401,7 +407,7 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
   // platform.
   EHSectionType = ELF::SHT_PROGBITS;
   EHSectionFlags = ELF::SHF_ALLOC;
-  if (T.getOS() == Triple::Solaris) {
+  if (T.isOSSolaris()) {
     if (T.getArch() == Triple::x86_64)
       EHSectionType = ELF::SHT_X86_64_UNWIND;
     else
@@ -410,83 +416,54 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
 
 
   // ELF
-  BSSSection =
-    Ctx->getELFSection(".bss", ELF::SHT_NOBITS,
-                       ELF::SHF_WRITE | ELF::SHF_ALLOC,
-                       SectionKind::getBSS());
+  BSSSection = Ctx->getELFSection(".bss", ELF::SHT_NOBITS,
+                                  ELF::SHF_WRITE | ELF::SHF_ALLOC);
 
-  TextSection =
-    Ctx->getELFSection(".text", ELF::SHT_PROGBITS,
-                       ELF::SHF_EXECINSTR |
-                       ELF::SHF_ALLOC,
-                       SectionKind::getText());
+  TextSection = Ctx->getELFSection(".text", ELF::SHT_PROGBITS,
+                                   ELF::SHF_EXECINSTR | ELF::SHF_ALLOC);
 
-  DataSection =
-    Ctx->getELFSection(".data", ELF::SHT_PROGBITS,
-                       ELF::SHF_WRITE |ELF::SHF_ALLOC,
-                       SectionKind::getDataRel());
+  DataSection = Ctx->getELFSection(".data", ELF::SHT_PROGBITS,
+                                   ELF::SHF_WRITE | ELF::SHF_ALLOC);
 
   ReadOnlySection =
-    Ctx->getELFSection(".rodata", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC,
-                       SectionKind::getReadOnly());
+      Ctx->getELFSection(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
 
   TLSDataSection =
-    Ctx->getELFSection(".tdata", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC | ELF::SHF_TLS |
-                       ELF::SHF_WRITE,
-                       SectionKind::getThreadData());
-
-  TLSBSSSection =
-    Ctx->getELFSection(".tbss", ELF::SHT_NOBITS,
-                       ELF::SHF_ALLOC | ELF::SHF_TLS |
-                       ELF::SHF_WRITE,
-                       SectionKind::getThreadBSS());
-
-  DataRelSection =
-    Ctx->getELFSection(".data.rel", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
-                       SectionKind::getDataRel());
-
-  DataRelLocalSection =
-    Ctx->getELFSection(".data.rel.local", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
-                       SectionKind::getDataRelLocal());
-
-  DataRelROSection =
-    Ctx->getELFSection(".data.rel.ro", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
-                       SectionKind::getReadOnlyWithRel());
-
-  DataRelROLocalSection =
-    Ctx->getELFSection(".data.rel.ro.local", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
-                       SectionKind::getReadOnlyWithRelLocal());
+      Ctx->getELFSection(".tdata", ELF::SHT_PROGBITS,
+                         ELF::SHF_ALLOC | ELF::SHF_TLS | ELF::SHF_WRITE);
+
+  TLSBSSSection = Ctx->getELFSection(
+      ".tbss", ELF::SHT_NOBITS, ELF::SHF_ALLOC | ELF::SHF_TLS | ELF::SHF_WRITE);
+
+  DataRelSection = Ctx->getELFSection(".data.rel", ELF::SHT_PROGBITS,
+                                      ELF::SHF_ALLOC | ELF::SHF_WRITE);
+
+  DataRelLocalSection = Ctx->getELFSection(".data.rel.local", ELF::SHT_PROGBITS,
+                                           ELF::SHF_ALLOC | ELF::SHF_WRITE);
+
+  DataRelROSection = Ctx->getELFSection(".data.rel.ro", ELF::SHT_PROGBITS,
+                                        ELF::SHF_ALLOC | ELF::SHF_WRITE);
+
+  DataRelROLocalSection = Ctx->getELFSection(
+      ".data.rel.ro.local", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE);
 
   MergeableConst4Section =
-    Ctx->getELFSection(".rodata.cst4", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC |ELF::SHF_MERGE,
-                       SectionKind::getMergeableConst4());
+      Ctx->getELFSection(".rodata.cst4", ELF::SHT_PROGBITS,
+                         ELF::SHF_ALLOC | ELF::SHF_MERGE, 4, "");
 
   MergeableConst8Section =
-    Ctx->getELFSection(".rodata.cst8", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC |ELF::SHF_MERGE,
-                       SectionKind::getMergeableConst8());
+      Ctx->getELFSection(".rodata.cst8", ELF::SHT_PROGBITS,
+                         ELF::SHF_ALLOC | ELF::SHF_MERGE, 8, "");
 
   MergeableConst16Section =
-    Ctx->getELFSection(".rodata.cst16", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC |ELF::SHF_MERGE,
-                       SectionKind::getMergeableConst16());
+      Ctx->getELFSection(".rodata.cst16", ELF::SHT_PROGBITS,
+                         ELF::SHF_ALLOC | ELF::SHF_MERGE, 16, "");
 
-  StaticCtorSection =
-    Ctx->getELFSection(".ctors", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
-                       SectionKind::getDataRel());
+  StaticCtorSection = Ctx->getELFSection(".ctors", ELF::SHT_PROGBITS,
+                                         ELF::SHF_ALLOC | ELF::SHF_WRITE);
 
-  StaticDtorSection =
-    Ctx->getELFSection(".dtors", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
-                       SectionKind::getDataRel());
+  StaticDtorSection = Ctx->getELFSection(".dtors", ELF::SHT_PROGBITS,
+                                         ELF::SHF_ALLOC | ELF::SHF_WRITE);
 
   // Exception Handling Sections.
 
@@ -494,103 +471,68 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
   // it contains relocatable pointers.  In PIC mode, this is probably a big
   // runtime hit for C++ apps.  Either the contents of the LSDA need to be
   // adjusted or this should be a data section.
-  LSDASection =
-    Ctx->getELFSection(".gcc_except_table", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC,
-                       SectionKind::getReadOnly());
+  LSDASection = Ctx->getELFSection(".gcc_except_table", ELF::SHT_PROGBITS,
+                                   ELF::SHF_ALLOC);
 
   COFFDebugSymbolsSection = nullptr;
 
   // Debug Info Sections.
   DwarfAbbrevSection =
-    Ctx->getELFSection(".debug_abbrev", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
-  DwarfInfoSection =
-    Ctx->getELFSection(".debug_info", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
-  DwarfLineSection =
-    Ctx->getELFSection(".debug_line", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
-  DwarfFrameSection =
-    Ctx->getELFSection(".debug_frame", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_abbrev", ELF::SHT_PROGBITS, 0);
+  DwarfInfoSection = Ctx->getELFSection(".debug_info", ELF::SHT_PROGBITS, 0);
+  DwarfLineSection = Ctx->getELFSection(".debug_line", ELF::SHT_PROGBITS, 0);
+  DwarfFrameSection = Ctx->getELFSection(".debug_frame", ELF::SHT_PROGBITS, 0);
   DwarfPubNamesSection =
-    Ctx->getELFSection(".debug_pubnames", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_pubnames", ELF::SHT_PROGBITS, 0);
   DwarfPubTypesSection =
-    Ctx->getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0);
   DwarfGnuPubNamesSection =
-    Ctx->getELFSection(".debug_gnu_pubnames", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_gnu_pubnames", ELF::SHT_PROGBITS, 0);
   DwarfGnuPubTypesSection =
-    Ctx->getELFSection(".debug_gnu_pubtypes", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_gnu_pubtypes", ELF::SHT_PROGBITS, 0);
   DwarfStrSection =
-    Ctx->getELFSection(".debug_str", ELF::SHT_PROGBITS,
-                       ELF::SHF_MERGE | ELF::SHF_STRINGS,
-                       SectionKind::getMergeable1ByteCString());
-  DwarfLocSection =
-    Ctx->getELFSection(".debug_loc", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_str", ELF::SHT_PROGBITS,
+                         ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
+  DwarfLocSection = Ctx->getELFSection(".debug_loc", ELF::SHT_PROGBITS, 0);
   DwarfARangesSection =
-    Ctx->getELFSection(".debug_aranges", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_aranges", ELF::SHT_PROGBITS, 0);
   DwarfRangesSection =
-    Ctx->getELFSection(".debug_ranges", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_ranges", ELF::SHT_PROGBITS, 0);
   DwarfMacroInfoSection =
-    Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS, 0);
 
   // DWARF5 Experimental Debug Info
 
   // Accelerator Tables
   DwarfAccelNamesSection =
-    Ctx->getELFSection(".apple_names", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".apple_names", ELF::SHT_PROGBITS, 0);
   DwarfAccelObjCSection =
-    Ctx->getELFSection(".apple_objc", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".apple_objc", ELF::SHT_PROGBITS, 0);
   DwarfAccelNamespaceSection =
-    Ctx->getELFSection(".apple_namespaces", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".apple_namespaces", ELF::SHT_PROGBITS, 0);
   DwarfAccelTypesSection =
-    Ctx->getELFSection(".apple_types", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".apple_types", ELF::SHT_PROGBITS, 0);
 
   // Fission Sections
   DwarfInfoDWOSection =
-    Ctx->getELFSection(".debug_info.dwo", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_info.dwo", ELF::SHT_PROGBITS, 0);
   DwarfTypesDWOSection =
-    Ctx->getELFSection(".debug_types.dwo", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_types.dwo", ELF::SHT_PROGBITS, 0);
   DwarfAbbrevDWOSection =
-    Ctx->getELFSection(".debug_abbrev.dwo", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_abbrev.dwo", ELF::SHT_PROGBITS, 0);
   DwarfStrDWOSection =
-    Ctx->getELFSection(".debug_str.dwo", ELF::SHT_PROGBITS,
-                       ELF::SHF_MERGE | ELF::SHF_STRINGS,
-                       SectionKind::getMergeable1ByteCString());
+      Ctx->getELFSection(".debug_str.dwo", ELF::SHT_PROGBITS,
+                         ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
   DwarfLineDWOSection =
-    Ctx->getELFSection(".debug_line.dwo", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_line.dwo", ELF::SHT_PROGBITS, 0);
   DwarfLocDWOSection =
-    Ctx->getELFSection(".debug_loc.dwo", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_loc.dwo", ELF::SHT_PROGBITS, 0);
   DwarfStrOffDWOSection =
-    Ctx->getELFSection(".debug_str_offsets.dwo", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
-  DwarfAddrSection =
-    Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0,
-                       SectionKind::getMetadata());
+      Ctx->getELFSection(".debug_str_offsets.dwo", ELF::SHT_PROGBITS, 0);
+  DwarfAddrSection = Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0);
 
   StackMapSection =
-    Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS,
-                       ELF::SHF_ALLOC,
-                       SectionKind::getMetadata());
-
+      Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
 }
 
 
@@ -678,129 +620,153 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
   DwarfAbbrevSection =
     Ctx->getCOFFSection(".debug_abbrev",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfInfoSection =
     Ctx->getCOFFSection(".debug_info",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfLineSection =
     Ctx->getCOFFSection(".debug_line",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfFrameSection =
     Ctx->getCOFFSection(".debug_frame",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfPubNamesSection =
     Ctx->getCOFFSection(".debug_pubnames",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfPubTypesSection =
     Ctx->getCOFFSection(".debug_pubtypes",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfGnuPubNamesSection =
     Ctx->getCOFFSection(".debug_gnu_pubnames",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfGnuPubTypesSection =
     Ctx->getCOFFSection(".debug_gnu_pubtypes",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfStrSection =
     Ctx->getCOFFSection(".debug_str",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfLocSection =
     Ctx->getCOFFSection(".debug_loc",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfARangesSection =
     Ctx->getCOFFSection(".debug_aranges",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfRangesSection =
     Ctx->getCOFFSection(".debug_ranges",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfMacroInfoSection =
     Ctx->getCOFFSection(".debug_macinfo",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
   DwarfInfoDWOSection =
-      Ctx->getCOFFSection(".debug_info.dwo",
-                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
-                          COFF::IMAGE_SCN_MEM_READ,
-                          SectionKind::getMetadata());
+    Ctx->getCOFFSection(".debug_info.dwo",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfTypesDWOSection =
-      Ctx->getCOFFSection(".debug_types.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE |
-                                                  COFF::IMAGE_SCN_MEM_READ,
-                          SectionKind::getMetadata());
+    Ctx->getCOFFSection(".debug_types.dwo",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfAbbrevDWOSection =
-      Ctx->getCOFFSection(".debug_abbrev.dwo",
-                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
-                          COFF::IMAGE_SCN_MEM_READ,
-                          SectionKind::getMetadata());
+    Ctx->getCOFFSection(".debug_abbrev.dwo",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfStrDWOSection =
-      Ctx->getCOFFSection(".debug_str.dwo",
-                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
-                          COFF::IMAGE_SCN_MEM_READ,
-                          SectionKind::getMetadata());
+    Ctx->getCOFFSection(".debug_str.dwo",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfLineDWOSection =
-      Ctx->getCOFFSection(".debug_line.dwo",
-                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
-                          COFF::IMAGE_SCN_MEM_READ,
-                          SectionKind::getMetadata());
+    Ctx->getCOFFSection(".debug_line.dwo",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfLocDWOSection =
-      Ctx->getCOFFSection(".debug_loc.dwo",
-                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
-                          COFF::IMAGE_SCN_MEM_READ,
-                          SectionKind::getMetadata());
+    Ctx->getCOFFSection(".debug_loc.dwo",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfStrOffDWOSection =
-      Ctx->getCOFFSection(".debug_str_offsets.dwo",
-                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
-                          COFF::IMAGE_SCN_MEM_READ,
-                          SectionKind::getMetadata());
-
+    Ctx->getCOFFSection(".debug_str_offsets.dwo",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfAddrSection =
     Ctx->getCOFFSection(".debug_addr",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
-
   DwarfAccelNamesSection =
-      Ctx->getCOFFSection(".apple_names",
-                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
-                          COFF::IMAGE_SCN_MEM_READ,
-                          SectionKind::getMetadata());
+    Ctx->getCOFFSection(".apple_names",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfAccelNamespaceSection =
-      Ctx->getCOFFSection(".apple_namespaces",
-                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
-                          COFF::IMAGE_SCN_MEM_READ,
-                          SectionKind::getMetadata());
+    Ctx->getCOFFSection(".apple_namespaces",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfAccelTypesSection =
-      Ctx->getCOFFSection(".apple_types",
-                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
-                          COFF::IMAGE_SCN_MEM_READ,
-                          SectionKind::getMetadata());
+    Ctx->getCOFFSection(".apple_types",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfAccelObjCSection =
-      Ctx->getCOFFSection(".apple_objc",
-                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
-                          COFF::IMAGE_SCN_MEM_READ,
-                          SectionKind::getMetadata());
+    Ctx->getCOFFSection(".apple_objc",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
 
   DrectveSection =
     Ctx->getCOFFSection(".drectve",
@@ -878,7 +844,7 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef T, Reloc::Model relocm,
 
 const MCSection *MCObjectFileInfo::getDwarfTypesSection(uint64_t Hash) const {
   return Ctx->getELFSection(".debug_types", ELF::SHT_PROGBITS, ELF::SHF_GROUP,
-                            SectionKind::getMetadata(), 0, utostr(Hash));
+                            0, utostr(Hash));
 }
 
 void MCObjectFileInfo::InitEHFrameSection() {
@@ -892,9 +858,7 @@ void MCObjectFileInfo::InitEHFrameSection() {
                            SectionKind::getReadOnly());
   else if (Env == IsELF)
     EHFrameSection =
-      Ctx->getELFSection(".eh_frame", EHSectionType,
-                         EHSectionFlags,
-                         SectionKind::getDataRel());
+        Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags);
   else
     EHFrameSection =
       Ctx->getCOFFSection(".eh_frame",
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index 21e6867..08fe501 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -405,7 +405,9 @@ void MCObjectStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) {
 }
 
 void MCObjectStreamer::EmitZeros(uint64_t NumBytes) {
-  unsigned ItemSize = getCurrentSection().first->isVirtualSection() ? 0 : 1;
+  const MCSection *Sec = getCurrentSection().first;
+  assert(Sec && "need a section");
+  unsigned ItemSize = Sec->isVirtualSection() ? 0 : 1;
   insert(new MCFillFragment(0, ItemSize, NumBytes));
 }
 
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index de7d961..ef6a540 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -111,8 +111,8 @@ struct ParseStatementInfo {
 
 /// \brief The concrete assembly parser instance.
 class AsmParser : public MCAsmParser {
-  AsmParser(const AsmParser &) LLVM_DELETED_FUNCTION;
-  void operator=(const AsmParser &) LLVM_DELETED_FUNCTION;
+  AsmParser(const AsmParser &) = delete;
+  void operator=(const AsmParser &) = delete;
 private:
   AsmLexer Lexer;
   MCContext &Ctx;
@@ -1298,6 +1298,9 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
       Sym = getContext().GetOrCreateSymbol(IDVal);
     } else
       Sym = Ctx.CreateDirectionalLocalSymbol(LocalLabelVal);
+
+    Sym->redefineIfPossible();
+
     if (!Sym->isUndefined() || Sym->isVariable())
       return Error(IDLoc, "invalid symbol redefinition");
 
@@ -1595,14 +1598,18 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
   // directive for the instruction.
   if (!HadError && getContext().getGenDwarfForAssembly() &&
       getContext().getGenDwarfSectionSyms().count(
-        getStreamer().getCurrentSection().first)) {
-
-    unsigned Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
+          getStreamer().getCurrentSection().first)) {
+    unsigned Line;
+    if (ActiveMacros.empty())
+      Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
+    else
+      Line = SrcMgr.FindLineNumber(ActiveMacros.back()->InstantiationLoc,
+                                   ActiveMacros.back()->ExitBuffer);
 
     // If we previously parsed a cpp hash file line comment then make sure the
     // current Dwarf File is for the CppHashFilename if not then emit the
     // Dwarf File table for it and adjust the line number for the .loc.
-    if (CppHashFilename.size() != 0) {
+    if (CppHashFilename.size()) {
       unsigned FileNumber = getStreamer().EmitDwarfFileDirective(
           0, StringRef(), CppHashFilename);
       getContext().setGenDwarfFileNumber(FileNumber);
@@ -2213,6 +2220,8 @@ bool AsmParser::parseAssignment(StringRef Name, bool allow_redef,
   } else
     Sym = getContext().GetOrCreateSymbol(Name);
 
+  Sym->setRedefinable(allow_redef);
+
   // Do the assignment.
   Out.EmitAssignment(Sym, Value);
   if (NoDeadStrip)
@@ -3266,7 +3275,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
   MCAsmMacroParameters Parameters;
   while (getLexer().isNot(AsmToken::EndOfStatement)) {
 
-    if (Parameters.size() && Parameters.back().Vararg)
+    if (!Parameters.empty() && Parameters.back().Vararg)
       return Error(Lexer.getLoc(),
                    "Vararg parameter '" + Parameters.back().Name +
                    "' should be last one in the list of parameters.");
@@ -3627,21 +3636,27 @@ bool AsmParser::parseDirectiveSpace(StringRef IDVal) {
 }
 
 /// parseDirectiveLEB128
-/// ::= (.sleb128 | .uleb128) expression
+/// ::= (.sleb128 | .uleb128) [ expression (, expression)* ]
 bool AsmParser::parseDirectiveLEB128(bool Signed) {
   checkForValidSection();
   const MCExpr *Value;
 
-  if (parseExpression(Value))
-    return true;
+  for (;;) {
+    if (parseExpression(Value))
+      return true;
 
-  if (getLexer().isNot(AsmToken::EndOfStatement))
-    return TokError("unexpected token in directive");
+    if (Signed)
+      getStreamer().EmitSLEB128Value(Value);
+    else
+      getStreamer().EmitULEB128Value(Value);
 
-  if (Signed)
-    getStreamer().EmitSLEB128Value(Value);
-  else
-    getStreamer().EmitULEB128Value(Value);
+    if (getLexer().is(AsmToken::EndOfStatement))
+      break;
+
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in directive");
+    Lex();
+  }
 
   return false;
 }
@@ -4662,7 +4677,7 @@ bool AsmParser::parseMSInlineAsm(
       OS << "$$";
       break;
     case AOK_Label:
-      OS << Ctx.getAsmInfo()->getPrivateGlobalPrefix() << AR.Label;
+      OS << Ctx.getAsmInfo()->getPrivateLabelPrefix() << AR.Label;
       break;
     case AOK_Input:
       OS << '$' << InputIdx++;
diff --git a/lib/MC/MCParser/CMakeLists.txt b/lib/MC/MCParser/CMakeLists.txt
index 222f237..957c94e 100644
--- a/lib/MC/MCParser/CMakeLists.txt
+++ b/lib/MC/MCParser/CMakeLists.txt
@@ -8,4 +8,7 @@ add_llvm_library(LLVMMCParser
   MCAsmParser.cpp
   MCAsmParserExtension.cpp
   MCTargetAsmParser.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/MCParser
   )
diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index 6f82e6e..18bdb03 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp
@@ -582,7 +582,7 @@ bool COFFAsmParser::ParseSEHDirectiveHandlerData(StringRef, SMLoc) {
 }
 
 bool COFFAsmParser::ParseSEHDirectivePushReg(StringRef, SMLoc L) {
-  unsigned Reg;
+  unsigned Reg = 0;
   if (ParseSEHRegisterNumber(Reg))
     return true;
 
@@ -595,7 +595,7 @@ bool COFFAsmParser::ParseSEHDirectivePushReg(StringRef, SMLoc L) {
 }
 
 bool COFFAsmParser::ParseSEHDirectiveSetFrame(StringRef, SMLoc L) {
-  unsigned Reg;
+  unsigned Reg = 0;
   int64_t Off;
   if (ParseSEHRegisterNumber(Reg))
     return true;
@@ -636,7 +636,7 @@ bool COFFAsmParser::ParseSEHDirectiveAllocStack(StringRef, SMLoc) {
 }
 
 bool COFFAsmParser::ParseSEHDirectiveSaveReg(StringRef, SMLoc L) {
-  unsigned Reg;
+  unsigned Reg = 0;
   int64_t Off;
   if (ParseSEHRegisterNumber(Reg))
     return true;
@@ -663,7 +663,7 @@ bool COFFAsmParser::ParseSEHDirectiveSaveReg(StringRef, SMLoc L) {
 // FIXME: This method is inherently x86-specific. It should really be in the
 // x86 backend.
 bool COFFAsmParser::ParseSEHDirectiveSaveXMM(StringRef, SMLoc L) {
-  unsigned Reg;
+  unsigned Reg = 0;
   int64_t Off;
   if (ParseSEHRegisterNumber(Reg))
     return true;
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index e302004..7a120a1 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -199,8 +199,7 @@ bool ELFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Type,
       return true;
   }
 
-  getStreamer().SwitchSection(getContext().getELFSection(
-                                Section, Type, Flags, Kind),
+  getStreamer().SwitchSection(getContext().getELFSection(Section, Type, Flags),
                               Subsection);
 
   return false;
@@ -269,40 +268,6 @@ bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
   return false;
 }
 
-static SectionKind computeSectionKind(unsigned Flags, unsigned ElemSize) {
-  if (Flags & ELF::SHF_EXECINSTR)
-    return SectionKind::getText();
-  if (Flags & ELF::SHF_TLS)
-    return SectionKind::getThreadData();
-  if (Flags & ELF::SHF_MERGE) {
-    if (Flags & ELF::SHF_STRINGS) {
-      switch (ElemSize) {
-      default:
-        break;
-      case 1:
-        return SectionKind::getMergeable1ByteCString();
-      case 2:
-        return SectionKind::getMergeable2ByteCString();
-      case 4:
-        return SectionKind::getMergeable4ByteCString();
-      }
-    } else {
-      switch (ElemSize) {
-      default:
-        break;
-      case 4:
-        return SectionKind::getMergeableConst4();
-      case 8:
-        return SectionKind::getMergeableConst8();
-      case 16:
-        return SectionKind::getMergeableConst16();
-      }
-    }
-  }
-
-  return SectionKind::getDataRel();
-}
-
 static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
   unsigned flags = 0;
 
@@ -413,6 +378,8 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
   unsigned Flags = 0;
   const MCExpr *Subsection = nullptr;
   bool UseLastGroup = false;
+  StringRef UniqueStr;
+  bool Unique = false;
 
   // Set the defaults first.
   if (SectionName == ".fini" || SectionName == ".init" ||
@@ -497,6 +464,14 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
             return TokError("Linkage must be 'comdat'");
         }
       }
+      if (getLexer().is(AsmToken::Comma)) {
+        Lex();
+        if (getParser().parseIdentifier(UniqueStr))
+          return TokError("expected identifier in directive");
+        if (UniqueStr != "unique")
+          return TokError("expected 'unique'");
+        Unique = true;
+      }
     }
   }
 
@@ -544,9 +519,8 @@ EndStmt:
       }
   }
 
-  SectionKind Kind = computeSectionKind(Flags, Size);
   const MCSection *ELFSection = getContext().getELFSection(
-      SectionName, Type, Flags, Kind, Size, GroupName);
+      SectionName, Type, Flags, Size, GroupName, Unique);
   getStreamer().SwitchSection(ELFSection, Subsection);
 
   if (getContext().getGenDwarfForAssembly()) {
@@ -697,9 +671,7 @@ bool ELFAsmParser::ParseDirectiveVersion(StringRef, SMLoc) {
 
   Lex();
 
-  const MCSection *Note =
-    getContext().getELFSection(".note", ELF::SHT_NOTE, 0,
-                               SectionKind::getReadOnly());
+  const MCSection *Note = getContext().getELFSection(".note", ELF::SHT_NOTE, 0);
 
   getStreamer().PushSection();
   getStreamer().SwitchSection(Note);
diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp
index e95845f0..4d6298c 100644
--- a/lib/MC/MCSectionCOFF.cpp
+++ b/lib/MC/MCSectionCOFF.cpp
@@ -47,6 +47,10 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
   }
 
   OS << "\t.section\t" << getSectionName() << ",\"";
+  if (getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
+    OS << 'd';
+  if (getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
+    OS << 'b';
   if (getCharacteristics() & COFF::IMAGE_SCN_MEM_EXECUTE)
     OS << 'x';
   if (getCharacteristics() & COFF::IMAGE_SCN_MEM_WRITE)
@@ -55,10 +59,6 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
     OS << 'r';
   else
     OS << 'y';
-  if (getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
-    OS << 'd';
-  if (getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
-    OS << 'b';
   if (getCharacteristics() & COFF::IMAGE_SCN_LNK_REMOVE)
     OS << 'n';
   if (getCharacteristics() & COFF::IMAGE_SCN_MEM_SHARED)
diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp
index a29bb97..da38682 100644
--- a/lib/MC/MCSectionELF.cpp
+++ b/lib/MC/MCSectionELF.cpp
@@ -24,6 +24,9 @@ MCSectionELF::~MCSectionELF() {} // anchor.
 bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name,
                                               const MCAsmInfo &MAI) const {
 
+  if (Unique)
+    return false;
+
   // FIXME: Does .section .bss/.data/.text work everywhere??
   if (Name == ".text" || Name == ".data" ||
       (Name == ".bss" && !MAI.usesELFSectionDirectiveForBSS()))
@@ -144,6 +147,10 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
     printName(OS, Group->getName());
     OS << ",comdat";
   }
+
+  if (Unique)
+    OS << ",unique";
+
   OS << '\n';
 
   if (Subsection)
@@ -157,13 +164,3 @@ bool MCSectionELF::UseCodeAlign() const {
 bool MCSectionELF::isVirtualSection() const {
   return getType() == ELF::SHT_NOBITS;
 }
-
-unsigned MCSectionELF::DetermineEntrySize(SectionKind Kind) {
-  if (Kind.isMergeable1ByteCString()) return 1;
-  if (Kind.isMergeable2ByteCString()) return 2;
-  if (Kind.isMergeable4ByteCString()) return 4;
-  if (Kind.isMergeableConst4())       return 4;
-  if (Kind.isMergeableConst8())       return 8;
-  if (Kind.isMergeableConst16())      return 16;
-  return 0;
-}
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index b8e42bd..ca3894b 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -35,7 +35,7 @@ MCSubtargetInfo::InitCPUSchedModel(StringRef CPU) {
 }
 
 void
-MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
+MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef C, StringRef FS,
                                      ArrayRef<SubtargetFeatureKV> PF,
                                      ArrayRef<SubtargetFeatureKV> PD,
                                      const SubtargetInfoKV *ProcSched,
@@ -46,6 +46,7 @@ MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
                                      const unsigned *OC,
                                      const unsigned *FP) {
   TargetTriple = TT;
+  CPU = C;
   ProcFeatures = PF;
   ProcDesc = PD;
   ProcSchedModels = ProcSched;
diff --git a/lib/MC/MCTargetOptions.cpp b/lib/MC/MCTargetOptions.cpp
index 3093ba2..1258d9e 100644
--- a/lib/MC/MCTargetOptions.cpp
+++ b/lib/MC/MCTargetOptions.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCTargetOptions.h"
 
 namespace llvm {
@@ -15,6 +16,10 @@ MCTargetOptions::MCTargetOptions()
     : SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false),
       MCFatalWarnings(false), MCSaveTempLabels(false),
       MCUseDwarfDirectory(false), ShowMCEncoding(false), ShowMCInst(false),
-      AsmVerbose(false), DwarfVersion(0) {}
+      AsmVerbose(false), DwarfVersion(0), ABIName() {}
+
+StringRef MCTargetOptions::getABIName() const {
+  return ABIName;
+}
 
 } // end namespace llvm
diff --git a/lib/MC/MCValue.cpp b/lib/MC/MCValue.cpp
index 9dfc56e..5512e03 100644
--- a/lib/MC/MCValue.cpp
+++ b/lib/MC/MCValue.cpp
@@ -15,7 +15,7 @@
 
 using namespace llvm;
 
-void MCValue::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
+void MCValue::print(raw_ostream &OS) const {
   if (isAbsolute()) {
     OS << getConstant();
     return;
@@ -39,7 +39,7 @@ void MCValue::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void MCValue::dump() const {
-  print(dbgs(), nullptr);
+  print(dbgs());
 }
 #endif
 
diff --git a/lib/MC/MCWinEH.cpp b/lib/MC/MCWinEH.cpp
index f0c354f..47eaf0f 100644
--- a/lib/MC/MCWinEH.cpp
+++ b/lib/MC/MCWinEH.cpp
@@ -17,52 +17,45 @@
 
 namespace llvm {
 namespace WinEH {
-static StringRef getSectionSuffix(const MCSymbol *Function) {
-  if (!Function || !Function->isInSection())
-    return "";
-
-  const MCSection *FunctionSection = &Function->getSection();
-  if (const auto Section = dyn_cast<MCSectionCOFF>(FunctionSection)) {
-    StringRef Name = Section->getSectionName();
-    size_t Dollar = Name.find('$');
-    size_t Dot = Name.find('.', 1);
-
-    if (Dollar == StringRef::npos && Dot == StringRef::npos)
-      return "";
-    if (Dot == StringRef::npos)
-      return Name.substr(Dollar);
-    if (Dollar == StringRef::npos || Dot < Dollar)
-      return Name.substr(Dot);
-
-    return Name.substr(Dollar);
-  }
-
-  return "";
-}
 
+/// We can't have one section for all .pdata or .xdata because the Microsoft
+/// linker seems to want all code relocations to refer to the same object file
+/// section. If the code described is comdat, create a new comdat section
+/// associated with that comdat. If the code described is not in the main .text
+/// section, make a new section for it. Otherwise use the main unwind info
+/// section.
 static const MCSection *getUnwindInfoSection(
     StringRef SecName, const MCSectionCOFF *UnwindSec, const MCSymbol *Function,
     MCContext &Context) {
-  // If Function is in a COMDAT, get or create an unwind info section in that
-  // COMDAT group.
   if (Function && Function->isInSection()) {
+    // If Function is in a COMDAT, get or create an unwind info section in that
+    // COMDAT group.
     const MCSectionCOFF *FunctionSection =
         cast<MCSectionCOFF>(&Function->getSection());
     if (FunctionSection->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {
       return Context.getAssociativeCOFFSection(
           UnwindSec, FunctionSection->getCOMDATSymbol());
     }
+
+    // If Function is in a section other than .text, create a new .pdata section.
+    // Otherwise use the plain .pdata section.
+    if (const auto *Section = dyn_cast<MCSectionCOFF>(FunctionSection)) {
+      StringRef CodeSecName = Section->getSectionName();
+      if (CodeSecName == ".text")
+        return UnwindSec;
+
+      if (CodeSecName.startswith(".text$"))
+        CodeSecName = CodeSecName.substr(6);
+
+      return Context.getCOFFSection(
+          (SecName + Twine('$') + CodeSecName).str(),
+          COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ,
+          SectionKind::getDataRel());
+    }
   }
 
-  // If Function is in a section other than .text, create a new .pdata section.
-  // Otherwise use the plain .pdata section.
-  StringRef Suffix = getSectionSuffix(Function);
-  if (Suffix.empty())
-    return UnwindSec;
-  return Context.getCOFFSection((SecName + Suffix).str(),
-                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
-                                COFF::IMAGE_SCN_MEM_READ,
-                                SectionKind::getDataRel());
+  return UnwindSec;
+
 }
 
 const MCSection *UnwindEmitter::getPDataSection(const MCSymbol *Function,
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index 577c4b7..588d424 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -418,7 +418,7 @@ void MachObjectWriter::WriteLinkeditLoadCommand(uint32_t Type,
 static unsigned ComputeLinkerOptionsLoadCommandSize(
   const std::vector<std::string> &Options, bool is64Bit)
 {
-  unsigned Size = sizeof(MachO::linker_options_command);
+  unsigned Size = sizeof(MachO::linker_option_command);
   for (unsigned i = 0, e = Options.size(); i != e; ++i)
     Size += Options[i].size() + 1;
   return RoundUpToAlignment(Size, is64Bit ? 8 : 4);
@@ -431,10 +431,10 @@ void MachObjectWriter::WriteLinkerOptionsLoadCommand(
   uint64_t Start = OS.tell();
   (void) Start;
 
-  Write32(MachO::LC_LINKER_OPTIONS);
+  Write32(MachO::LC_LINKER_OPTION);
   Write32(Size);
   Write32(Options.size());
-  uint64_t BytesWritten = sizeof(MachO::linker_options_command);
+  uint64_t BytesWritten = sizeof(MachO::linker_option_command);
   for (unsigned i = 0, e = Options.size(); i != e; ++i) {
     // Write each string, including the null byte.
     const std::string &Option = Options[i];
@@ -448,14 +448,11 @@ void MachObjectWriter::WriteLinkerOptionsLoadCommand(
   assert(OS.tell() - Start == Size);
 }
 
-
-void MachObjectWriter::RecordRelocation(const MCAssembler &Asm,
+void MachObjectWriter::RecordRelocation(MCAssembler &Asm,
                                         const MCAsmLayout &Layout,
                                         const MCFragment *Fragment,
-                                        const MCFixup &Fixup,
-                                        MCValue Target,
-                                        bool &IsPCRel,
-                                        uint64_t &FixedValue) {
+                                        const MCFixup &Fixup, MCValue Target,
+                                        bool &IsPCRel, uint64_t &FixedValue) {
   TargetObjectWriter->RecordRelocation(this, Asm, Layout, Fragment, Fixup,
                                        Target, FixedValue);
 }
@@ -616,6 +613,22 @@ void MachObjectWriter::ComputeSymbolTable(
     ExternalSymbolData[i].SymbolData->setIndex(Index++);
   for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i)
     UndefinedSymbolData[i].SymbolData->setIndex(Index++);
+
+  for (const MCSectionData &SD : Asm) {
+    std::vector<RelAndSymbol> &Relocs = Relocations[&SD];
+    for (RelAndSymbol &Rel : Relocs) {
+      if (!Rel.Sym)
+        continue;
+
+      // Set the Index and the IsExtern bit.
+      unsigned Index = Rel.Sym->getIndex();
+      assert(isInt<24>(Index));
+      if (IsLittleEndian)
+        Rel.MRE.r_word1 = (Rel.MRE.r_word1 & (-1 << 24)) | Index | (1 << 27);
+      else
+        Rel.MRE.r_word1 = (Rel.MRE.r_word1 & 0xff) | Index << 8 | (1 << 4);
+    }
+  }
 }
 
 void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm,
@@ -662,10 +675,6 @@ void MachObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
   // Mark symbol difference expressions in variables (from .set or = directives)
   // as absolute.
   markAbsoluteVariableSymbols(Asm, Layout);
-
-  // Compute symbol table information and bind symbol indices.
-  ComputeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData,
-                     UndefinedSymbolData);
 }
 
 bool MachObjectWriter::
@@ -749,6 +758,10 @@ IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
 
 void MachObjectWriter::WriteObject(MCAssembler &Asm,
                                    const MCAsmLayout &Layout) {
+  // Compute symbol table information and bind symbol indices.
+  ComputeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData,
+                     UndefinedSymbolData);
+
   unsigned NumSections = Asm.size();
   const MCAssembler::VersionMinInfoType &VersionInfo =
     Layout.getAssembler().getVersionMinInfo();
@@ -839,7 +852,7 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
   uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize;
   for (MCAssembler::const_iterator it = Asm.begin(),
          ie = Asm.end(); it != ie; ++it) {
-    std::vector<MachO::any_relocation_info> &Relocs = Relocations[it];
+    std::vector<RelAndSymbol> &Relocs = Relocations[it];
     unsigned NumRelocs = Relocs.size();
     uint64_t SectionStart = SectionDataStart + getSectionAddress(it);
     WriteSection(Asm, Layout, *it, SectionStart, RelocTableEnd, NumRelocs);
@@ -933,10 +946,10 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
          ie = Asm.end(); it != ie; ++it) {
     // Write the section relocation entries, in reverse order to match 'as'
     // (approximately, the exact algorithm is more complicated than this).
-    std::vector<MachO::any_relocation_info> &Relocs = Relocations[it];
+    std::vector<RelAndSymbol> &Relocs = Relocations[it];
     for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
-      Write32(Relocs[e - i - 1].r_word0);
-      Write32(Relocs[e - i - 1].r_word1);
+      Write32(Relocs[e - i - 1].MRE.r_word0);
+      Write32(Relocs[e - i - 1].MRE.r_word1);
     }
   }
 
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index 1046e04..c519a9d 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -13,9 +13,9 @@
 
 #include "llvm/MC/MCWinCOFFObjectWriter.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
@@ -175,7 +175,7 @@ public:
                                               const MCFragment &FB, bool InSet,
                                               bool IsPCRel) const override;
 
-  void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
+  void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
                         const MCFragment *Fragment, const MCFixup &Fixup,
                         MCValue Target, bool &IsPCRel,
                         uint64_t &FixedValue) override;
@@ -266,12 +266,12 @@ COFFSymbol *WinCOFFObjectWriter::createSymbol(StringRef Name) {
   return createCOFFEntity<COFFSymbol>(Name, Symbols);
 }
 
-COFFSymbol *WinCOFFObjectWriter::GetOrCreateCOFFSymbol(const MCSymbol * Symbol){
+COFFSymbol *WinCOFFObjectWriter::GetOrCreateCOFFSymbol(const MCSymbol *Symbol) {
   symbol_map::iterator i = SymbolMap.find(Symbol);
   if (i != SymbolMap.end())
     return i->second;
-  COFFSymbol *RetSymbol
-    = createCOFFEntity<COFFSymbol>(Symbol->getName(), Symbols);
+  COFFSymbol *RetSymbol =
+      createCOFFEntity<COFFSymbol>(Symbol->getName(), Symbols);
   SymbolMap[Symbol] = RetSymbol;
   return RetSymbol;
 }
@@ -640,7 +640,7 @@ void WinCOFFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
                                                    const MCAsmLayout &Layout) {
   // "Define" each section & symbol. This creates section & symbol
   // entries in the staging area.
-  for (const auto & Section : Asm)
+  for (const auto &Section : Asm)
     DefineSection(Section);
 
   for (MCSymbolData &SD : Asm.symbols())
@@ -661,13 +661,9 @@ bool WinCOFFObjectWriter::IsSymbolRefDifferenceFullyResolvedImpl(
                                                                 InSet, IsPCRel);
 }
 
-void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm,
-                                           const MCAsmLayout &Layout,
-                                           const MCFragment *Fragment,
-                                           const MCFixup &Fixup,
-                                           MCValue Target,
-                                           bool &IsPCRel,
-                                           uint64_t &FixedValue) {
+void WinCOFFObjectWriter::RecordRelocation(
+    MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment,
+    const MCFixup &Fixup, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) {
   assert(Target.getSymA() && "Relocation must reference a symbol!");
 
   const MCSymbol &Symbol = Target.getSymA()->getSymbol();
@@ -710,17 +706,22 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm,
     CrossSection = &Symbol.getSection() != &B->getSection();
 
     // Offset of the symbol in the section
-    int64_t a = Layout.getSymbolOffset(&B_SD);
-
-    // Offset of the relocation in the section
-    int64_t b = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+    int64_t OffsetOfB = Layout.getSymbolOffset(&B_SD);
 
-    FixedValue = b - a;
     // In the case where we have SymbA and SymB, we just need to store the delta
     // between the two symbols.  Update FixedValue to account for the delta, and
     // skip recording the relocation.
-    if (!CrossSection)
+    if (!CrossSection) {
+      int64_t OffsetOfA = Layout.getSymbolOffset(&A_SD);
+      FixedValue = (OffsetOfA - OffsetOfB) + Target.getConstant();
       return;
+    }
+
+    // Offset of the relocation in the section
+    int64_t OffsetOfRelocation =
+        Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+
+    FixedValue = OffsetOfRelocation - OffsetOfB;
   } else {
     FixedValue = Target.getConstant();
   }
@@ -741,8 +742,9 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm,
   ++Reloc.Symb->Relocations;
 
   Reloc.Data.VirtualAddress += Fixup.getOffset();
-  Reloc.Data.Type = TargetObjectWriter->getRelocType(Target, Fixup,
-                                                     CrossSection);
+  Reloc.Data.Type =
+      TargetObjectWriter->getRelocType(Target, Fixup, CrossSection,
+                                       Asm.getBackend());
 
   // FIXME: Can anyone explain what this does other than adjust for the size
   // of the offset?
@@ -835,7 +837,7 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
 
     unsigned Offset = 0;
     unsigned Length = FI->size();
-    for (auto & Aux : file->Aux) {
+    for (auto &Aux : file->Aux) {
       Aux.AuxType = ATFile;
 
       if (Length > SymbolSize) {
@@ -881,7 +883,7 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
       SetSymbolName(*S);
 
   // Fixup weak external references.
-  for (auto & Symbol : Symbols) {
+  for (auto &Symbol : Symbols) {
     if (Symbol->Other) {
       assert(Symbol->Index != -1);
       assert(Symbol->Aux.size() == 1 && "Symbol must contain one aux symbol!");
@@ -892,7 +894,7 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
   }
 
   // Fixup associative COMDAT sections.
-  for (auto & Section : Sections) {
+  for (auto &Section : Sections) {
     if (Section->Symbol->Aux[0].Aux.SectionDefinition.Selection !=
         COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
       continue;
@@ -928,7 +930,7 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
     offset += COFF::Header16Size;
   offset += COFF::SectionSize * Header.NumberOfSections;
 
-  for (const auto & Section : Asm) {
+  for (const auto &Section : Asm) {
     COFFSection *Sec = SectionMap[&Section.getSection()];
 
     if (Sec->Number == -1)
@@ -937,6 +939,8 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
     Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(&Section);
 
     if (IsPhysicalSection(Sec)) {
+      // Align the section data to a four byte boundary.
+      offset = RoundUpToAlignment(offset, 4);
       Sec->Header.PointerToRawData = offset;
 
       offset += Sec->Header.SizeOfRawData;
@@ -961,7 +965,7 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
 
       offset += COFF::RelocationSize * Sec->Relocations.size();
 
-      for (auto & Relocation : Sec->Relocations) {
+      for (auto &Relocation : Sec->Relocations) {
         assert(Relocation.Symb->Index != -1);
         Relocation.Data.SymbolTableIndex = Relocation.Symb->Index;
       }
@@ -991,7 +995,7 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
     sections::iterator i, ie;
     MCAssembler::const_iterator j, je;
 
-    for (auto & Section : Sections) {
+    for (auto &Section : Sections) {
       if (Section->Number != -1) {
         if (Section->Relocations.size() >= 0xffff)
           Section->Header.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
@@ -1007,9 +1011,15 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
         continue;
 
       if ((*i)->Header.PointerToRawData != 0) {
-        assert(OS.tell() == (*i)->Header.PointerToRawData &&
+        assert(OS.tell() <= (*i)->Header.PointerToRawData &&
                "Section::PointerToRawData is insane!");
 
+        unsigned SectionDataPadding = (*i)->Header.PointerToRawData - OS.tell();
+        assert(SectionDataPadding < 4 &&
+               "Should only need at most three bytes of padding!");
+
+        WriteZeros(SectionDataPadding);
+
         Asm.writeSectionData(j, Layout);
       }
 
@@ -1027,7 +1037,7 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
           WriteRelocation(r);
         }
 
-        for (const auto & Relocation : (*i)->Relocations)
+        for (const auto &Relocation : (*i)->Relocations)
           WriteRelocation(Relocation.Data);
       } else
         assert((*i)->Header.PointerToRelocations == 0 &&
@@ -1038,7 +1048,7 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
   assert(OS.tell() == Header.PointerToSymbolTable &&
          "Header::PointerToSymbolTable is insane!");
 
-  for (auto & Symbol : Symbols)
+  for (auto &Symbol : Symbols)
     if (Symbol->Index != -1)
       WriteSymbol(*Symbol);
 
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index 6a8054d..41a3da7 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
@@ -23,6 +22,7 @@
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/MCWinCOFFStreamer.h"
diff --git a/lib/Makefile b/lib/Makefile
index 0ddf917..52fdaaf 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -10,8 +10,8 @@ LEVEL = ..
 
 include $(LEVEL)/Makefile.config
 
-PARALLEL_DIRS := IR AsmParser Bitcode Analysis Transforms CodeGen Target \
-                 ExecutionEngine Linker LTO MC Object Option DebugInfo   \
+PARALLEL_DIRS := IR AsmParser Bitcode Analysis Transforms CodeGen Target      \
+                 ExecutionEngine Linker LTO MC Object Option DebugInfo        \
                  IRReader LineEditor ProfileData
 
 include $(LEVEL)/Makefile.common
diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp
index d169dbe..43b0771 100644
--- a/lib/Object/Archive.cpp
+++ b/lib/Object/Archive.cpp
@@ -22,6 +22,7 @@ using namespace llvm;
 using namespace object;
 
 static const char *const Magic = "!<arch>\n";
+static const char *const ThinMagic = "!<thin>\n";
 
 void Archive::anchor() { }
 
@@ -86,7 +87,10 @@ Archive::Child::Child(const Archive *Parent, const char *Start)
 
   const ArchiveMemberHeader *Header =
       reinterpret_cast<const ArchiveMemberHeader *>(Start);
-  Data = StringRef(Start, sizeof(ArchiveMemberHeader) + Header->getSize());
+  uint64_t Size = sizeof(ArchiveMemberHeader);
+  if (!Parent->IsThin || Header->getName() == "/" || Header->getName() == "//")
+    Size += Header->getSize();
+  Data = StringRef(Start, Size);
 
   // Setup StartOfFile and PaddingBytes.
   StartOfFile = sizeof(ArchiveMemberHeader);
@@ -100,6 +104,16 @@ Archive::Child::Child(const Archive *Parent, const char *Start)
   }
 }
 
+uint64_t Archive::Child::getSize() const {
+  if (Parent->IsThin)
+    return getHeader()->getSize();
+  return Data.size() - StartOfFile;
+}
+
+uint64_t Archive::Child::getRawSize() const {
+  return getHeader()->getSize();
+}
+
 Archive::Child Archive::Child::getNext() const {
   size_t SpaceToSkip = Data.size();
   // If it's odd, add 1 to make it even.
@@ -115,6 +129,13 @@ Archive::Child Archive::Child::getNext() const {
   return Child(Parent, NextLoc);
 }
 
+uint64_t Archive::Child::getChildOffset() const {
+  const char *a = Parent->Data.getBuffer().data();
+  const char *c = Data.data();
+  uint64_t offset = c - a;
+  return offset;
+}
+
 ErrorOr<StringRef> Archive::Child::getName() const {
   StringRef name = getRawName();
   // Check if it's a special name.
@@ -141,7 +162,7 @@ ErrorOr<StringRef> Archive::Child::getName() const {
       return object_error::parse_failed;
 
     // GNU long file names end with a /.
-    if (Parent->kind() == K_GNU) {
+    if (Parent->kind() == K_GNU || Parent->kind() == K_MIPS64) {
       StringRef::size_type End = StringRef(addr).find('/');
       return StringRef(addr, End);
     }
@@ -186,9 +207,13 @@ ErrorOr<std::unique_ptr<Archive>> Archive::create(MemoryBufferRef Source) {
 
 Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
     : Binary(Binary::ID_Archive, Source), SymbolTable(child_end()) {
+  StringRef Buffer = Data.getBuffer();
   // Check for sufficient magic.
-  if (Data.getBufferSize() < 8 ||
-      StringRef(Data.getBufferStart(), 8) != Magic) {
+  if (Buffer.startswith(ThinMagic)) {
+    IsThin = true;
+  } else if (Buffer.startswith(Magic)) {
+    IsThin = false;
+  } else {
     ec = object_error::invalid_file_type;
     return;
   }
@@ -248,8 +273,16 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
     return;
   }
 
-  if (Name == "/") {
+  // MIPS 64-bit ELF archives use a special format of a symbol table.
+  // This format is marked by `ar_name` field equals to "/SYM64/".
+  // For detailed description see page 96 in the following document:
+  // http://techpubs.sgi.com/library/manuals/4000/007-4658-001/pdf/007-4658-001.pdf
+
+  bool has64SymTable = false;
+  if (Name == "/" || Name == "/SYM64/") {
     SymbolTable = i;
+    if (Name == "/SYM64/")
+      has64SymTable = true;
 
     ++i;
     if (i == e) {
@@ -260,7 +293,7 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
   }
 
   if (Name == "//") {
-    Format = K_GNU;
+    Format = has64SymTable ? K_MIPS64 : K_GNU;
     StringTable = i;
     ++i;
     FirstRegular = i;
@@ -269,7 +302,7 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
   }
 
   if (Name[0] != '/') {
-    Format = K_GNU;
+    Format = has64SymTable ? K_MIPS64 : K_GNU;
     FirstRegular = i;
     ec = object_error::success;
     return;
@@ -323,11 +356,18 @@ StringRef Archive::Symbol::getName() const {
 
 ErrorOr<Archive::child_iterator> Archive::Symbol::getMember() const {
   const char *Buf = Parent->SymbolTable->getBuffer().begin();
-  const char *Offsets = Buf + 4;
+  const char *Offsets = Buf;
+  if (Parent->kind() == K_MIPS64)
+    Offsets += sizeof(uint64_t);
+  else
+    Offsets += sizeof(uint32_t);
   uint32_t Offset = 0;
   if (Parent->kind() == K_GNU) {
-    Offset = *(reinterpret_cast<const support::ubig32_t*>(Offsets)
-               + SymbolIndex);
+    Offset =
+        *(reinterpret_cast<const support::ubig32_t *>(Offsets) + SymbolIndex);
+  } else if (Parent->kind() == K_MIPS64) {
+    Offset =
+        *(reinterpret_cast<const support::ubig64_t *>(Offsets) + SymbolIndex);
   } else if (Parent->kind() == K_BSD) {
     // The SymbolIndex is an index into the ranlib structs that start at
     // Offsets (the first uint32_t is the number of bytes of the ranlib
@@ -341,8 +381,8 @@ ErrorOr<Archive::child_iterator> Archive::Symbol::getMember() const {
     uint32_t MemberCount = *reinterpret_cast<const support::ulittle32_t*>(Buf);
     
     // Skip offsets.
-    Buf += sizeof(support::ulittle32_t)
-           + (MemberCount * sizeof(support::ulittle32_t));
+    Buf += sizeof(support::ulittle32_t) +
+           (MemberCount * sizeof(support::ulittle32_t));
 
     uint32_t SymbolCount = *reinterpret_cast<const support::ulittle32_t*>(Buf);
 
@@ -424,6 +464,9 @@ Archive::symbol_iterator Archive::symbol_begin() const {
     uint32_t symbol_count = 0;
     symbol_count = *reinterpret_cast<const support::ubig32_t*>(buf);
     buf += sizeof(uint32_t) + (symbol_count * (sizeof(uint32_t)));
+  } else if (kind() == K_MIPS64) {
+    uint64_t symbol_count = *reinterpret_cast<const support::ubig64_t *>(buf);
+    buf += sizeof(uint64_t) + (symbol_count * (sizeof(uint64_t)));
   } else if (kind() == K_BSD) {
     // The __.SYMDEF or "__.SYMDEF SORTED" member starts with a uint32_t
     // which is the number of bytes of ranlib structs that follow.  The ranlib
@@ -461,6 +504,8 @@ Archive::symbol_iterator Archive::symbol_end() const {
   uint32_t symbol_count = 0;
   if (kind() == K_GNU) {
     symbol_count = *reinterpret_cast<const support::ubig32_t*>(buf);
+  } else if (kind() == K_MIPS64) {
+    symbol_count = *reinterpret_cast<const support::ubig64_t*>(buf);
   } else if (kind() == K_BSD) {
     symbol_count = (*reinterpret_cast<const support::ulittle32_t *>(buf)) /
                    (sizeof(uint32_t) * 2);
@@ -470,8 +515,7 @@ Archive::symbol_iterator Archive::symbol_end() const {
     buf += 4 + (member_count * 4); // Skip offsets.
     symbol_count = *reinterpret_cast<const support::ulittle32_t*>(buf);
   }
-  return symbol_iterator(
-    Symbol(this, symbol_count, 0));
+  return symbol_iterator(Symbol(this, symbol_count, 0));
 }
 
 Archive::child_iterator Archive::findSym(StringRef name) const {
diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp
index c56eeb1..a2b167a 100644
--- a/lib/Object/Binary.cpp
+++ b/lib/Object/Binary.cpp
@@ -58,6 +58,7 @@ ErrorOr<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer,
     case sys::fs::file_magic::macho_bundle:
     case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
     case sys::fs::file_magic::macho_dsym_companion:
+    case sys::fs::file_magic::macho_kext_bundle:
     case sys::fs::file_magic::coff_object:
     case sys::fs::file_magic::coff_import_library:
     case sys::fs::file_magic::pecoff_executable:
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index 5b08e42..37add22 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -14,4 +14,7 @@ add_llvm_library(LLVMObject
   ObjectFile.cpp
   RecordStreamer.cpp
   SymbolicFile.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Object
   )
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index d5ff7d6..cde6fdc 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -362,39 +362,11 @@ bool COFFObjectFile::isSectionBSS(DataRefImpl Ref) const {
   return Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
 }
 
-bool COFFObjectFile::isSectionRequiredForExecution(DataRefImpl Ref) const {
-  // Sections marked 'Info', 'Remove', or 'Discardable' aren't required for
-  // execution.
-  const coff_section *Sec = toSec(Ref);
-  return !(Sec->Characteristics &
-           (COFF::IMAGE_SCN_LNK_INFO | COFF::IMAGE_SCN_LNK_REMOVE |
-            COFF::IMAGE_SCN_MEM_DISCARDABLE));
-}
-
 bool COFFObjectFile::isSectionVirtual(DataRefImpl Ref) const {
   const coff_section *Sec = toSec(Ref);
   return Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
 }
 
-bool COFFObjectFile::isSectionZeroInit(DataRefImpl Ref) const {
-  const coff_section *Sec = toSec(Ref);
-  return Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
-}
-
-bool COFFObjectFile::isSectionReadOnlyData(DataRefImpl Ref) const {
-  const coff_section *Sec = toSec(Ref);
-  // Check if it's any sort of data section.
-  if (!(Sec->Characteristics & (COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA |
-                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)))
-    return false;
-  // If it's writable or executable or contains code, it isn't read-only data.
-  if (Sec->Characteristics &
-      (COFF::IMAGE_SCN_CNT_CODE | COFF::IMAGE_SCN_MEM_EXECUTE |
-       COFF::IMAGE_SCN_MEM_WRITE))
-    return false;
-  return true;
-}
-
 bool COFFObjectFile::sectionContainsSymbol(DataRefImpl SecRef,
                                            DataRefImpl SymbRef) const {
   const coff_section *Sec = toSec(SecRef);
@@ -414,7 +386,8 @@ static uint32_t getNumberOfRelocations(const coff_section *Sec,
     if (getObject(FirstReloc, M, reinterpret_cast<const coff_relocation*>(
         base + Sec->PointerToRelocations)))
       return 0;
-    return FirstReloc->VirtualAddress;
+    // -1 to exclude this first relocation entry.
+    return FirstReloc->VirtualAddress - 1;
   }
   return Sec->NumberOfRelocations;
 }
@@ -1060,7 +1033,7 @@ symbol_iterator COFFObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
   else if (SymbolTable32)
     Ref.p = reinterpret_cast<uintptr_t>(SymbolTable32 + R->SymbolTableIndex);
   else
-    return symbol_end();
+    llvm_unreachable("no symbol table pointer!");
   return symbol_iterator(SymbolRef(Ref, this));
 }
 
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
index 11099bd..398e9e4 100644
--- a/lib/Object/ELF.cpp
+++ b/lib/Object/ELF.cpp
@@ -12,716 +12,71 @@
 namespace llvm {
 namespace object {
 
-#define LLVM_ELF_SWITCH_RELOC_TYPE_NAME(enum)                                  \
-  case ELF::enum:                                                              \
-    return #enum;                                                              \
+#define ELF_RELOC(name, value)                                          \
+  case ELF::name:                                                       \
+    return #name;                                                       \
 
 StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
   switch (Machine) {
   case ELF::EM_X86_64:
     switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_JUMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32S);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPMOD64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSGD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSLD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTTPOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOT64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPCREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPLT64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PLTOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32_TLSDESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_IRELATIVE);
+#include "llvm/Support/ELFRelocs/x86_64.def"
     default:
       break;
     }
     break;
   case ELF::EM_386:
     switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_JUMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTPC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32PLT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTIE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_PUSH);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_POP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_PUSH);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_POP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDO_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPMOD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTDESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_IRELATIVE);
+#include "llvm/Support/ELFRelocs/i386.def"
     default:
       break;
     }
     break;
   case ELF::EM_MIPS:
     switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_REL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_26);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_LITERAL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GPREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SHIFT5);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SHIFT6);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_DISP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_PAGE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_OFST);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SUB);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_INSERT_A);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_INSERT_B);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_DELETE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HIGHER);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HIGHEST);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SCN_DISP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_REL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_ADD_IMMEDIATE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PJUMP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_RELGOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_JALR);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPMOD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPMOD64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_GD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_LDM);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_GOTTPREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC21_S2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC26_S2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC18_S3);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC19_S2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PCHI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PCLO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS16_GOT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS16_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS16_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_JUMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_26_S1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_PC16_S1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_CALL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_DISP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_PAGE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_OFST);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_GD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_LDM);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_DTPREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_DTPREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_TPREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_TPREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_NUM);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC32);
+#include "llvm/Support/ELFRelocs/Mips.def"
     default:
       break;
     }
     break;
   case ELF::EM_AARCH64:
     switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G3);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD_PREL_LO19);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_LO21);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_PG_HI21);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADD_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST8_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TSTBR14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CONDBR19);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_JUMP26);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CALL26);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST16_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST32_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST64_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST128_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_GOTREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_GOTREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_GOT_PAGE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD64_GOT_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_HI12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_HI12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADR_PAGE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_LD64_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADD_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_JUMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLS_DTPREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLS_DTPMOD64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLS_TPREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_IRELATIVE);
+#include "llvm/Support/ELFRelocs/AArch64.def"
     default:
       break;
     }
     break;
   case ELF::EM_ARM:
     switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PC24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ABS5);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BREL_ADJ);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_SWI8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_XPC25);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_XPC22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPMOD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_TPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_PREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_ABS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_7_0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_15_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_23_15);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SBREL_11_0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_19_12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_27_20_CK);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL31);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_V4BX);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PREL31);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_ABS_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_ABS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_PREL_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_PREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_ABS_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_ABS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_PREL_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_PREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP19);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP6);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ALU_PREL_11_0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32_NOI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32_NOI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_BREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_BREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GOTDESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESCSEQ);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32_ABS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_ABS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_PREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTRELAX);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTENTRY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTINHERIT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP11);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDM32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE12GP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_3);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_4);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_5);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_6);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_7);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_9);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_11);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_13);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_15);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ME_TOO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ32);
+#include "llvm/Support/ELFRelocs/ARM.def"
     default:
       break;
     }
     break;
   case ELF::EM_HEXAGON:
     switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_3);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B32_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_12_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_10_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_9_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_7_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_JMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_PLT_B22_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPMOD_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_PLT_B22_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_11_X);
+#include "llvm/Support/ELFRelocs/Hexagon.def"
     default:
       break;
     }
     break;
   case ELF::EM_PPC:
     switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14_BRTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14_BRNTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14_BRTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14_BRNTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_PLTREL24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPMOD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLSGD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLSLD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_HA);
+#include "llvm/Support/ELFRelocs/PowerPC.def"
     default:
       break;
     }
     break;
   case ELF::EM_PPC64:
     switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14_BRTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14_BRNTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14_BRTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14_BRNTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHER);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHERA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHEST);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHESTA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPMOD64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHER);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHERA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHEST);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHESTA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHER);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHERA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHEST);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHESTA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLSGD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLSLD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_HA);
+#include "llvm/Support/ELFRelocs/PowerPC64.def"
     default:
       break;
     }
     break;
   case ELF::EM_S390:
     switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_JMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC16DBL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT16DBL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC32DBL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT32DBL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPCDBL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTENT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLTENT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LOAD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GDCALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDCALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GD64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDM32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDM64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IE64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IEENT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LE64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDO32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDO64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_DTPMOD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_DTPOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_TPOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_20);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT20);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT20);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE20);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_IRELATIVE);
+#include "llvm/Support/ELFRelocs/SystemZ.def"
     default:
       break;
     }
@@ -730,90 +85,7 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
   case ELF::EM_SPARC32PLUS:
   case ELF::EM_SPARCV9:
     switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_DISP8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_DISP16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_DISP32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_WDISP30);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_WDISP22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_HI22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_13);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_LO10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_GOT10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_GOT13);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_GOT22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_PC10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_PC22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_WPLT30);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_JMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_UA32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_PLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_HIPLT22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_LOPLT10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_PCPLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_PCPLT22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_PCPLT10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_11);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_OLO10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_HH22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_HM10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_LM22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_PC_HH22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_PC_HM10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_PC_LM22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_WDISP16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_WDISP19);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_7);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_5);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_6);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_DISP64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_PLT64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_HIX22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_LOX10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_H44);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_M44);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_L44);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_REGISTER);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_UA64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_UA16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_GD_HI22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_GD_LO10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_GD_ADD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_GD_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_LDM_HI22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_LDM_LO10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_LDM_ADD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_LDM_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_LDO_HIX22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_LDO_LOX10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_LDO_ADD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_IE_HI22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_IE_LO10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_IE_LD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_IE_LDX);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_IE_ADD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_LE_HIX22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_LE_LOX10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_DTPMOD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_DTPMOD64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_DTPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_DTPOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_TPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_TLS_TPOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_GOTDATA_HIX22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_GOTDATA_LOX22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_GOTDATA_OP_HIX22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_GOTDATA_OP_LOX22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_SPARC_GOTDATA_OP);
+#include "llvm/Support/ELFRelocs/Sparc.def"
     default:
       break;
     }
@@ -824,7 +96,7 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
   return "Unknown";
 }
 
-#undef LLVM_ELF_SWITCH_RELOC_TYPE_NAME
+#undef ELF_RELOC
 
 } // end namespace object
 } // end namespace llvm
diff --git a/lib/Object/ELFYAML.cpp b/lib/Object/ELFYAML.cpp
index f513c11..cce05cf 100644
--- a/lib/Object/ELFYAML.cpp
+++ b/lib/Object/ELFYAML.cpp
@@ -30,6 +30,7 @@ ScalarEnumerationTraits<ELFYAML::ELF_ET>::enumeration(IO &IO,
   ECase(ET_DYN)
   ECase(ET_CORE)
 #undef ECase
+  IO.enumFallback<Hex16>(Value);
 }
 
 void
@@ -414,354 +415,44 @@ void ScalarBitSetTraits<ELFYAML::ELF_STO>::bitset(IO &IO,
 #undef BCaseMask
 }
 
+void ScalarEnumerationTraits<ELFYAML::ELF_RSS>::enumeration(
+    IO &IO, ELFYAML::ELF_RSS &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+  ECase(RSS_UNDEF)
+  ECase(RSS_GP)
+  ECase(RSS_GP0)
+  ECase(RSS_LOC)
+#undef ECase
+}
+
 void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration(
     IO &IO, ELFYAML::ELF_REL &Value) {
   const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
   assert(Object && "The IO context is not initialized");
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+#define ELF_RELOC(X, Y) IO.enumCase(Value, #X, ELF::X);
   switch (Object->Header.Machine) {
   case ELF::EM_X86_64:
-    ECase(R_X86_64_NONE)
-    ECase(R_X86_64_64)
-    ECase(R_X86_64_PC32)
-    ECase(R_X86_64_GOT32)
-    ECase(R_X86_64_PLT32)
-    ECase(R_X86_64_COPY)
-    ECase(R_X86_64_GLOB_DAT)
-    ECase(R_X86_64_JUMP_SLOT)
-    ECase(R_X86_64_RELATIVE)
-    ECase(R_X86_64_GOTPCREL)
-    ECase(R_X86_64_32)
-    ECase(R_X86_64_32S)
-    ECase(R_X86_64_16)
-    ECase(R_X86_64_PC16)
-    ECase(R_X86_64_8)
-    ECase(R_X86_64_PC8)
-    ECase(R_X86_64_DTPMOD64)
-    ECase(R_X86_64_DTPOFF64)
-    ECase(R_X86_64_TPOFF64)
-    ECase(R_X86_64_TLSGD)
-    ECase(R_X86_64_TLSLD)
-    ECase(R_X86_64_DTPOFF32)
-    ECase(R_X86_64_GOTTPOFF)
-    ECase(R_X86_64_TPOFF32)
-    ECase(R_X86_64_PC64)
-    ECase(R_X86_64_GOTOFF64)
-    ECase(R_X86_64_GOTPC32)
-    ECase(R_X86_64_GOT64)
-    ECase(R_X86_64_GOTPCREL64)
-    ECase(R_X86_64_GOTPC64)
-    ECase(R_X86_64_GOTPLT64)
-    ECase(R_X86_64_PLTOFF64)
-    ECase(R_X86_64_SIZE32)
-    ECase(R_X86_64_SIZE64)
-    ECase(R_X86_64_GOTPC32_TLSDESC)
-    ECase(R_X86_64_TLSDESC_CALL)
-    ECase(R_X86_64_TLSDESC)
-    ECase(R_X86_64_IRELATIVE)
+#include "llvm/Support/ELFRelocs/x86_64.def"
     break;
   case ELF::EM_MIPS:
-    ECase(R_MIPS_NONE)
-    ECase(R_MIPS_16)
-    ECase(R_MIPS_32)
-    ECase(R_MIPS_REL32)
-    ECase(R_MIPS_26)
-    ECase(R_MIPS_HI16)
-    ECase(R_MIPS_LO16)
-    ECase(R_MIPS_GPREL16)
-    ECase(R_MIPS_LITERAL)
-    ECase(R_MIPS_GOT16)
-    ECase(R_MIPS_PC16)
-    ECase(R_MIPS_CALL16)
-    ECase(R_MIPS_GPREL32)
-    ECase(R_MIPS_UNUSED1)
-    ECase(R_MIPS_UNUSED2)
-    ECase(R_MIPS_SHIFT5)
-    ECase(R_MIPS_SHIFT6)
-    ECase(R_MIPS_64)
-    ECase(R_MIPS_GOT_DISP)
-    ECase(R_MIPS_GOT_PAGE)
-    ECase(R_MIPS_GOT_OFST)
-    ECase(R_MIPS_GOT_HI16)
-    ECase(R_MIPS_GOT_LO16)
-    ECase(R_MIPS_SUB)
-    ECase(R_MIPS_INSERT_A)
-    ECase(R_MIPS_INSERT_B)
-    ECase(R_MIPS_DELETE)
-    ECase(R_MIPS_HIGHER)
-    ECase(R_MIPS_HIGHEST)
-    ECase(R_MIPS_CALL_HI16)
-    ECase(R_MIPS_CALL_LO16)
-    ECase(R_MIPS_SCN_DISP)
-    ECase(R_MIPS_REL16)
-    ECase(R_MIPS_ADD_IMMEDIATE)
-    ECase(R_MIPS_PJUMP)
-    ECase(R_MIPS_RELGOT)
-    ECase(R_MIPS_JALR)
-    ECase(R_MIPS_TLS_DTPMOD32)
-    ECase(R_MIPS_TLS_DTPREL32)
-    ECase(R_MIPS_TLS_DTPMOD64)
-    ECase(R_MIPS_TLS_DTPREL64)
-    ECase(R_MIPS_TLS_GD)
-    ECase(R_MIPS_TLS_LDM)
-    ECase(R_MIPS_TLS_DTPREL_HI16)
-    ECase(R_MIPS_TLS_DTPREL_LO16)
-    ECase(R_MIPS_TLS_GOTTPREL)
-    ECase(R_MIPS_TLS_TPREL32)
-    ECase(R_MIPS_TLS_TPREL64)
-    ECase(R_MIPS_TLS_TPREL_HI16)
-    ECase(R_MIPS_TLS_TPREL_LO16)
-    ECase(R_MIPS_GLOB_DAT)
-    ECase(R_MIPS_PC21_S2)
-    ECase(R_MIPS_PC26_S2)
-    ECase(R_MIPS_PC18_S3)
-    ECase(R_MIPS_PC19_S2)
-    ECase(R_MIPS_PCHI16)
-    ECase(R_MIPS_PCLO16)
-    ECase(R_MIPS16_GOT16)
-    ECase(R_MIPS16_HI16)
-    ECase(R_MIPS16_LO16)
-    ECase(R_MIPS_COPY)
-    ECase(R_MIPS_JUMP_SLOT)
-    ECase(R_MICROMIPS_26_S1)
-    ECase(R_MICROMIPS_HI16)
-    ECase(R_MICROMIPS_LO16)
-    ECase(R_MICROMIPS_GOT16)
-    ECase(R_MICROMIPS_PC16_S1)
-    ECase(R_MICROMIPS_CALL16)
-    ECase(R_MICROMIPS_GOT_DISP)
-    ECase(R_MICROMIPS_GOT_PAGE)
-    ECase(R_MICROMIPS_GOT_OFST)
-    ECase(R_MICROMIPS_TLS_GD)
-    ECase(R_MICROMIPS_TLS_LDM)
-    ECase(R_MICROMIPS_TLS_DTPREL_HI16)
-    ECase(R_MICROMIPS_TLS_DTPREL_LO16)
-    ECase(R_MICROMIPS_TLS_TPREL_HI16)
-    ECase(R_MICROMIPS_TLS_TPREL_LO16)
-    ECase(R_MIPS_NUM)
-    ECase(R_MIPS_PC32)
+#include "llvm/Support/ELFRelocs/Mips.def"
     break;
   case ELF::EM_HEXAGON:
-    ECase(R_HEX_NONE)
-    ECase(R_HEX_B22_PCREL)
-    ECase(R_HEX_B15_PCREL)
-    ECase(R_HEX_B7_PCREL)
-    ECase(R_HEX_LO16)
-    ECase(R_HEX_HI16)
-    ECase(R_HEX_32)
-    ECase(R_HEX_16)
-    ECase(R_HEX_8)
-    ECase(R_HEX_GPREL16_0)
-    ECase(R_HEX_GPREL16_1)
-    ECase(R_HEX_GPREL16_2)
-    ECase(R_HEX_GPREL16_3)
-    ECase(R_HEX_HL16)
-    ECase(R_HEX_B13_PCREL)
-    ECase(R_HEX_B9_PCREL)
-    ECase(R_HEX_B32_PCREL_X)
-    ECase(R_HEX_32_6_X)
-    ECase(R_HEX_B22_PCREL_X)
-    ECase(R_HEX_B15_PCREL_X)
-    ECase(R_HEX_B13_PCREL_X)
-    ECase(R_HEX_B9_PCREL_X)
-    ECase(R_HEX_B7_PCREL_X)
-    ECase(R_HEX_16_X)
-    ECase(R_HEX_12_X)
-    ECase(R_HEX_11_X)
-    ECase(R_HEX_10_X)
-    ECase(R_HEX_9_X)
-    ECase(R_HEX_8_X)
-    ECase(R_HEX_7_X)
-    ECase(R_HEX_6_X)
-    ECase(R_HEX_32_PCREL)
-    ECase(R_HEX_COPY)
-    ECase(R_HEX_GLOB_DAT)
-    ECase(R_HEX_JMP_SLOT)
-    ECase(R_HEX_RELATIVE)
-    ECase(R_HEX_PLT_B22_PCREL)
-    ECase(R_HEX_GOTREL_LO16)
-    ECase(R_HEX_GOTREL_HI16)
-    ECase(R_HEX_GOTREL_32)
-    ECase(R_HEX_GOT_LO16)
-    ECase(R_HEX_GOT_HI16)
-    ECase(R_HEX_GOT_32)
-    ECase(R_HEX_GOT_16)
-    ECase(R_HEX_DTPMOD_32)
-    ECase(R_HEX_DTPREL_LO16)
-    ECase(R_HEX_DTPREL_HI16)
-    ECase(R_HEX_DTPREL_32)
-    ECase(R_HEX_DTPREL_16)
-    ECase(R_HEX_GD_PLT_B22_PCREL)
-    ECase(R_HEX_GD_GOT_LO16)
-    ECase(R_HEX_GD_GOT_HI16)
-    ECase(R_HEX_GD_GOT_32)
-    ECase(R_HEX_GD_GOT_16)
-    ECase(R_HEX_IE_LO16)
-    ECase(R_HEX_IE_HI16)
-    ECase(R_HEX_IE_32)
-    ECase(R_HEX_IE_GOT_LO16)
-    ECase(R_HEX_IE_GOT_HI16)
-    ECase(R_HEX_IE_GOT_32)
-    ECase(R_HEX_IE_GOT_16)
-    ECase(R_HEX_TPREL_LO16)
-    ECase(R_HEX_TPREL_HI16)
-    ECase(R_HEX_TPREL_32)
-    ECase(R_HEX_TPREL_16)
-    ECase(R_HEX_6_PCREL_X)
-    ECase(R_HEX_GOTREL_32_6_X)
-    ECase(R_HEX_GOTREL_16_X)
-    ECase(R_HEX_GOTREL_11_X)
-    ECase(R_HEX_GOT_32_6_X)
-    ECase(R_HEX_GOT_16_X)
-    ECase(R_HEX_GOT_11_X)
-    ECase(R_HEX_DTPREL_32_6_X)
-    ECase(R_HEX_DTPREL_16_X)
-    ECase(R_HEX_DTPREL_11_X)
-    ECase(R_HEX_GD_GOT_32_6_X)
-    ECase(R_HEX_GD_GOT_16_X)
-    ECase(R_HEX_GD_GOT_11_X)
-    ECase(R_HEX_IE_32_6_X)
-    ECase(R_HEX_IE_16_X)
-    ECase(R_HEX_IE_GOT_32_6_X)
-    ECase(R_HEX_IE_GOT_16_X)
-    ECase(R_HEX_IE_GOT_11_X)
-    ECase(R_HEX_TPREL_32_6_X)
-    ECase(R_HEX_TPREL_16_X)
-    ECase(R_HEX_TPREL_11_X)
+#include "llvm/Support/ELFRelocs/Hexagon.def"
     break;
   case ELF::EM_386:
-    ECase(R_386_NONE)
-    ECase(R_386_32)
-    ECase(R_386_PC32)
-    ECase(R_386_GOT32)
-    ECase(R_386_PLT32)
-    ECase(R_386_COPY)
-    ECase(R_386_GLOB_DAT)
-    ECase(R_386_JUMP_SLOT)
-    ECase(R_386_RELATIVE)
-    ECase(R_386_GOTOFF)
-    ECase(R_386_GOTPC)
-    ECase(R_386_32PLT)
-    ECase(R_386_TLS_TPOFF)
-    ECase(R_386_TLS_IE)
-    ECase(R_386_TLS_GOTIE)
-    ECase(R_386_TLS_LE)
-    ECase(R_386_TLS_GD)
-    ECase(R_386_TLS_LDM)
-    ECase(R_386_16)
-    ECase(R_386_PC16)
-    ECase(R_386_8)
-    ECase(R_386_PC8)
-    ECase(R_386_TLS_GD_32)
-    ECase(R_386_TLS_GD_PUSH)
-    ECase(R_386_TLS_GD_CALL)
-    ECase(R_386_TLS_GD_POP)
-    ECase(R_386_TLS_LDM_32)
-    ECase(R_386_TLS_LDM_PUSH)
-    ECase(R_386_TLS_LDM_CALL)
-    ECase(R_386_TLS_LDM_POP)
-    ECase(R_386_TLS_LDO_32)
-    ECase(R_386_TLS_IE_32)
-    ECase(R_386_TLS_LE_32)
-    ECase(R_386_TLS_DTPMOD32)
-    ECase(R_386_TLS_DTPOFF32)
-    ECase(R_386_TLS_TPOFF32)
-    ECase(R_386_TLS_GOTDESC)
-    ECase(R_386_TLS_DESC_CALL)
-    ECase(R_386_TLS_DESC)
-    ECase(R_386_IRELATIVE)
-    ECase(R_386_NUM)
+#include "llvm/Support/ELFRelocs/i386.def"
     break;
   case ELF::EM_AARCH64:
-    ECase(R_AARCH64_NONE)
-    ECase(R_AARCH64_ABS64)
-    ECase(R_AARCH64_ABS32)
-    ECase(R_AARCH64_ABS16)
-    ECase(R_AARCH64_PREL64)
-    ECase(R_AARCH64_PREL32)
-    ECase(R_AARCH64_PREL16)
-    ECase(R_AARCH64_MOVW_UABS_G0)
-    ECase(R_AARCH64_MOVW_UABS_G0_NC)
-    ECase(R_AARCH64_MOVW_UABS_G1)
-    ECase(R_AARCH64_MOVW_UABS_G1_NC)
-    ECase(R_AARCH64_MOVW_UABS_G2)
-    ECase(R_AARCH64_MOVW_UABS_G2_NC)
-    ECase(R_AARCH64_MOVW_UABS_G3)
-    ECase(R_AARCH64_MOVW_SABS_G0)
-    ECase(R_AARCH64_MOVW_SABS_G1)
-    ECase(R_AARCH64_MOVW_SABS_G2)
-    ECase(R_AARCH64_LD_PREL_LO19)
-    ECase(R_AARCH64_ADR_PREL_LO21)
-    ECase(R_AARCH64_ADR_PREL_PG_HI21)
-    ECase(R_AARCH64_ADD_ABS_LO12_NC)
-    ECase(R_AARCH64_LDST8_ABS_LO12_NC)
-    ECase(R_AARCH64_TSTBR14)
-    ECase(R_AARCH64_CONDBR19)
-    ECase(R_AARCH64_JUMP26)
-    ECase(R_AARCH64_CALL26)
-    ECase(R_AARCH64_LDST16_ABS_LO12_NC)
-    ECase(R_AARCH64_LDST32_ABS_LO12_NC)
-    ECase(R_AARCH64_LDST64_ABS_LO12_NC)
-    ECase(R_AARCH64_LDST128_ABS_LO12_NC)
-    ECase(R_AARCH64_GOTREL64)
-    ECase(R_AARCH64_GOTREL32)
-    ECase(R_AARCH64_ADR_GOT_PAGE)
-    ECase(R_AARCH64_LD64_GOT_LO12_NC)
-    ECase(R_AARCH64_TLSLD_MOVW_DTPREL_G2)
-    ECase(R_AARCH64_TLSLD_MOVW_DTPREL_G1)
-    ECase(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC)
-    ECase(R_AARCH64_TLSLD_MOVW_DTPREL_G0)
-    ECase(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC)
-    ECase(R_AARCH64_TLSLD_ADD_DTPREL_HI12)
-    ECase(R_AARCH64_TLSLD_ADD_DTPREL_LO12)
-    ECase(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC)
-    ECase(R_AARCH64_TLSLD_LDST8_DTPREL_LO12)
-    ECase(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC)
-    ECase(R_AARCH64_TLSLD_LDST16_DTPREL_LO12)
-    ECase(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC)
-    ECase(R_AARCH64_TLSLD_LDST32_DTPREL_LO12)
-    ECase(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC)
-    ECase(R_AARCH64_TLSLD_LDST64_DTPREL_LO12)
-    ECase(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC)
-    ECase(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1)
-    ECase(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC)
-    ECase(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21)
-    ECase(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC)
-    ECase(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19)
-    ECase(R_AARCH64_TLSLE_MOVW_TPREL_G2)
-    ECase(R_AARCH64_TLSLE_MOVW_TPREL_G1)
-    ECase(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC)
-    ECase(R_AARCH64_TLSLE_MOVW_TPREL_G0)
-    ECase(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC)
-    ECase(R_AARCH64_TLSLE_ADD_TPREL_HI12)
-    ECase(R_AARCH64_TLSLE_ADD_TPREL_LO12)
-    ECase(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC)
-    ECase(R_AARCH64_TLSLE_LDST8_TPREL_LO12)
-    ECase(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC)
-    ECase(R_AARCH64_TLSLE_LDST16_TPREL_LO12)
-    ECase(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC)
-    ECase(R_AARCH64_TLSLE_LDST32_TPREL_LO12)
-    ECase(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC)
-    ECase(R_AARCH64_TLSLE_LDST64_TPREL_LO12)
-    ECase(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC)
-    ECase(R_AARCH64_TLSDESC_ADR_PAGE)
-    ECase(R_AARCH64_TLSDESC_LD64_LO12_NC)
-    ECase(R_AARCH64_TLSDESC_ADD_LO12_NC)
-    ECase(R_AARCH64_TLSDESC_CALL)
-    ECase(R_AARCH64_COPY)
-    ECase(R_AARCH64_GLOB_DAT)
-    ECase(R_AARCH64_JUMP_SLOT)
-    ECase(R_AARCH64_RELATIVE)
-    ECase(R_AARCH64_TLS_DTPREL64)
-    ECase(R_AARCH64_TLS_DTPMOD64)
-    ECase(R_AARCH64_TLS_TPREL64)
-    ECase(R_AARCH64_TLSDESC)
-    ECase(R_AARCH64_IRELATIVE)
+#include "llvm/Support/ELFRelocs/AArch64.def"
+    break;
+  case ELF::EM_ARM:
+#include "llvm/Support/ELFRelocs/ARM.def"
     break;
   default:
     llvm_unreachable("Unsupported architecture");
   }
-#undef ECase
+#undef ELF_RELOC
 }
 
 void MappingTraits<ELFYAML::FileHeader>::mapping(IO &IO,
@@ -815,6 +506,7 @@ static void commonSectionMapping(IO &IO, ELFYAML::Section &Section) {
   IO.mapOptional("Address", Section.Address, Hex64(0));
   IO.mapOptional("Link", Section.Link, StringRef());
   IO.mapOptional("AddressAlign", Section.AddressAlign, Hex64(0));
+  IO.mapOptional("Info", Section.Info, StringRef());
 }
 
 static void sectionMapping(IO &IO, ELFYAML::RawContentSection &Section) {
@@ -825,10 +517,19 @@ static void sectionMapping(IO &IO, ELFYAML::RawContentSection &Section) {
 
 static void sectionMapping(IO &IO, ELFYAML::RelocationSection &Section) {
   commonSectionMapping(IO, Section);
-  IO.mapOptional("Info", Section.Info, StringRef());
   IO.mapOptional("Relocations", Section.Relocations);
 }
 
+static void groupSectionMapping(IO &IO, ELFYAML::Group &group) {
+  commonSectionMapping(IO, group);
+  IO.mapRequired("Members", group.Members);
+}
+
+void MappingTraits<ELFYAML::SectionOrType>::mapping(
+    IO &IO, ELFYAML::SectionOrType &sectionOrType) {
+  IO.mapRequired("SectionOrType", sectionOrType.sectionNameOrType);
+}
+
 void MappingTraits<std::unique_ptr<ELFYAML::Section>>::mapping(
     IO &IO, std::unique_ptr<ELFYAML::Section> &Section) {
   ELFYAML::ELF_SHT sectionType;
@@ -844,6 +545,11 @@ void MappingTraits<std::unique_ptr<ELFYAML::Section>>::mapping(
       Section.reset(new ELFYAML::RelocationSection());
     sectionMapping(IO, *cast<ELFYAML::RelocationSection>(Section.get()));
     break;
+  case ELF::SHT_GROUP:
+    if (!IO.outputting())
+      Section.reset(new ELFYAML::Group());
+    groupSectionMapping(IO, *cast<ELFYAML::Group>(Section.get()));
+    break;
   default:
     if (!IO.outputting())
       Section.reset(new ELFYAML::RawContentSection());
@@ -859,12 +565,49 @@ StringRef MappingTraits<std::unique_ptr<ELFYAML::Section>>::validate(
   return "Section size must be greater or equal to the content size";
 }
 
+namespace {
+struct NormalizedMips64RelType {
+  NormalizedMips64RelType(IO &)
+      : Type(ELFYAML::ELF_REL(ELF::R_MIPS_NONE)),
+        Type2(ELFYAML::ELF_REL(ELF::R_MIPS_NONE)),
+        Type3(ELFYAML::ELF_REL(ELF::R_MIPS_NONE)),
+        SpecSym(ELFYAML::ELF_REL(ELF::RSS_UNDEF)) {}
+  NormalizedMips64RelType(IO &, ELFYAML::ELF_REL Original)
+      : Type(Original & 0xFF), Type2(Original >> 8 & 0xFF),
+        Type3(Original >> 16 & 0xFF), SpecSym(Original >> 24 & 0xFF) {}
+
+  ELFYAML::ELF_REL denormalize(IO &) {
+    ELFYAML::ELF_REL Res = Type | Type2 << 8 | Type3 << 16 | SpecSym << 24;
+    return Res;
+  }
+
+  ELFYAML::ELF_REL Type;
+  ELFYAML::ELF_REL Type2;
+  ELFYAML::ELF_REL Type3;
+  ELFYAML::ELF_RSS SpecSym;
+};
+}
+
 void MappingTraits<ELFYAML::Relocation>::mapping(IO &IO,
                                                  ELFYAML::Relocation &Rel) {
+  const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
+  assert(Object && "The IO context is not initialized");
+
   IO.mapRequired("Offset", Rel.Offset);
   IO.mapRequired("Symbol", Rel.Symbol);
-  IO.mapRequired("Type", Rel.Type);
-  IO.mapOptional("Addend", Rel.Addend);
+
+  if (Object->Header.Machine == ELFYAML::ELF_EM(ELF::EM_MIPS) &&
+      Object->Header.Class == ELFYAML::ELF_ELFCLASS(ELF::ELFCLASS64)) {
+    MappingNormalization<NormalizedMips64RelType, ELFYAML::ELF_REL> Key(
+        IO, Rel.Type);
+    IO.mapRequired("Type", Key->Type);
+    IO.mapOptional("Type2", Key->Type2, ELFYAML::ELF_REL(ELF::R_MIPS_NONE));
+    IO.mapOptional("Type3", Key->Type3, ELFYAML::ELF_REL(ELF::R_MIPS_NONE));
+    IO.mapOptional("SpecSym", Key->SpecSym, ELFYAML::ELF_RSS(ELF::RSS_UNDEF));
+  } else
+    IO.mapRequired("Type", Rel.Type);
+
+  IO.mapOptional("Addend", Rel.Addend, (int64_t)0);
 }
 
 void MappingTraits<ELFYAML::Object>::mapping(IO &IO, ELFYAML::Object &Object) {
diff --git a/lib/Object/IRObjectFile.cpp b/lib/Object/IRObjectFile.cpp
index 7256a2f..a2cbdcd 100644
--- a/lib/Object/IRObjectFile.cpp
+++ b/lib/Object/IRObjectFile.cpp
@@ -14,17 +14,17 @@
 #include "llvm/Object/IRObjectFile.h"
 #include "RecordStreamer.h"
 #include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/GVMaterializer.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
-#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
@@ -73,6 +73,7 @@ IRObjectFile::IRObjectFile(MemoryBufferRef Object, std::unique_ptr<Module> Mod)
   MCContext MCCtx(MAI.get(), MRI.get(), &MOFI);
   MOFI.InitMCObjectFileInfo(Triple, Reloc::Default, CodeModel::Default, MCCtx);
   std::unique_ptr<RecordStreamer> Streamer(new RecordStreamer(MCCtx));
+  T->createNullTargetStreamer(*Streamer);
 
   std::unique_ptr<MemoryBuffer> Buffer(MemoryBuffer::getMemBuffer(InlineAsm));
   SourceMgr SrcMgr;
@@ -116,7 +117,7 @@ IRObjectFile::IRObjectFile(MemoryBufferRef Object, std::unique_ptr<Module> Mod)
 IRObjectFile::~IRObjectFile() {
  }
 
-static const GlobalValue *getGV(DataRefImpl &Symb) {
+static GlobalValue *getGV(DataRefImpl &Symb) {
   if ((Symb.p & 3) == 3)
     return nullptr;
 
@@ -181,6 +182,8 @@ void IRObjectFile::moveSymbolNext(DataRefImpl &Symb) const {
     Res = (Index << 2) | 3;
     break;
   }
+  default:
+    llvm_unreachable("unreachable case");
   }
 
   Symb.p = Res;
@@ -235,10 +238,9 @@ uint32_t IRObjectFile::getSymbolFlags(DataRefImpl Symb) const {
   return Res;
 }
 
-const GlobalValue *IRObjectFile::getSymbolGV(DataRefImpl Symb) const {
-  const GlobalValue *GV = getGV(Symb);
-  return GV;
-}
+GlobalValue *IRObjectFile::getSymbolGV(DataRefImpl Symb) { return getGV(Symb); }
+
+std::unique_ptr<Module> IRObjectFile::takeModule() { return std::move(M); }
 
 basic_symbol_iterator IRObjectFile::symbol_begin_impl() const {
   Module::const_iterator I = M->begin();
@@ -291,8 +293,8 @@ ErrorOr<MemoryBufferRef> IRObjectFile::findBitcodeInMemBuffer(MemoryBufferRef Ob
 }
 
 ErrorOr<std::unique_ptr<IRObjectFile>>
-llvm::object::IRObjectFile::createIRObjectFile(MemoryBufferRef Object,
-                                               LLVMContext &Context) {
+llvm::object::IRObjectFile::create(MemoryBufferRef Object,
+                                   LLVMContext &Context) {
   ErrorOr<MemoryBufferRef> BCOrErr = findBitcodeInMemBuffer(Object);
   if (!BCOrErr)
     return BCOrErr.getError();
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index bbef639..4a1c311 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -38,8 +38,12 @@ namespace {
   };
 }
 
-template<typename T>
+template <typename T>
 static T getStruct(const MachOObjectFile *O, const char *P) {
+  // Don't read before the beginning or past the end of the file
+  if (P < O->getData().begin() || P + sizeof(T) > O->getData().end())
+    report_fatal_error("Malformed MachO file.");
+
   T Cmd;
   memcpy(&Cmd, P, sizeof(T));
   if (O->isLittleEndian() != sys::IsLittleEndianHost)
@@ -47,15 +51,26 @@ static T getStruct(const MachOObjectFile *O, const char *P) {
   return Cmd;
 }
 
+template <typename SegmentCmd>
+static uint32_t getSegmentLoadCommandNumSections(const SegmentCmd &S,
+                                                 uint32_t Cmdsize) {
+  const unsigned SectionSize = sizeof(SegmentCmd);
+  if (S.nsects > std::numeric_limits<uint32_t>::max() / SectionSize ||
+      S.nsects * SectionSize > Cmdsize - sizeof(S))
+    report_fatal_error(
+        "Number of sections too large for size of load command.");
+  return S.nsects;
+}
+
 static uint32_t
 getSegmentLoadCommandNumSections(const MachOObjectFile *O,
                                  const MachOObjectFile::LoadCommandInfo &L) {
-  if (O->is64Bit()) {
-    MachO::segment_command_64 S = O->getSegment64LoadCommand(L);
-    return S.nsects;
-  }
-  MachO::segment_command S = O->getSegmentLoadCommand(L);
-  return S.nsects;
+  if (O->is64Bit())
+    return getSegmentLoadCommandNumSections(O->getSegment64LoadCommand(L),
+                                            L.C.cmdsize);
+
+  return getSegmentLoadCommandNumSections(O->getSegmentLoadCommand(L),
+                                          L.C.cmdsize);
 }
 
 static bool isPageZeroSegment(const MachOObjectFile *O,
@@ -233,9 +248,13 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian,
                                  bool Is64bits, std::error_code &EC)
     : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object),
       SymtabLoadCmd(nullptr), DysymtabLoadCmd(nullptr),
-      DataInCodeLoadCmd(nullptr), DyldInfoLoadCmd(nullptr),
-      UuidLoadCmd(nullptr), HasPageZeroSegment(false) {
+      DataInCodeLoadCmd(nullptr), LinkOptHintsLoadCmd(nullptr),
+      DyldInfoLoadCmd(nullptr), UuidLoadCmd(nullptr),
+      HasPageZeroSegment(false) {
   uint32_t LoadCommandCount = this->getHeader().ncmds;
+  if (LoadCommandCount == 0)
+    return;
+
   MachO::LoadCommandType SegmentLoadType = is64Bit() ?
     MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT;
 
@@ -262,6 +281,13 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian,
         return;
       }
       DataInCodeLoadCmd = Load.Ptr;
+    } else if (Load.C.cmd == MachO::LC_LINKER_OPTIMIZATION_HINT) {
+      // Multiple linker optimization hint tables
+      if (LinkOptHintsLoadCmd) {
+        EC = object_error::parse_failed;
+        return;
+      }
+      LinkOptHintsLoadCmd = Load.Ptr;
     } else if (Load.C.cmd == MachO::LC_DYLD_INFO || 
                Load.C.cmd == MachO::LC_DYLD_INFO_ONLY) {
       // Multiple dyldinfo load commands
@@ -278,6 +304,12 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian,
       }
       UuidLoadCmd = Load.Ptr;
     } else if (Load.C.cmd == SegmentLoadType) {
+      const unsigned SegmentLoadSize = this->is64Bit()
+                                           ? sizeof(MachO::segment_command_64)
+                                           : sizeof(MachO::segment_command);
+      if (Load.C.cmdsize < SegmentLoadSize)
+        report_fatal_error("Segment load command size is too small.");
+
       uint32_t NumSections = getSegmentLoadCommandNumSections(this, Load);
       for (unsigned J = 0; J < NumSections; ++J) {
         const char *Sec = getSectionPtr(this, Load, J);
@@ -312,10 +344,19 @@ std::error_code MachOObjectFile::getSymbolName(DataRefImpl Symb,
   StringRef StringTable = getStringTableData();
   MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb);
   const char *Start = &StringTable.data()[Entry.n_strx];
+  if (Start < getData().begin() || Start >= getData().end())
+    report_fatal_error(
+        "Symbol name entry points before beginning or past end of file.");
   Res = StringRef(Start);
   return object_error::success;
 }
 
+unsigned MachOObjectFile::getSectionType(SectionRef Sec) const {
+  DataRefImpl DRI = Sec.getRawDataRefImpl();
+  uint32_t Flags = getSectionFlags(this, DRI);
+  return Flags & MachO::SECTION_TYPE;
+}
+
 // getIndirectName() returns the name of the alias'ed symbol who's string table
 // index is in the n_value field.
 std::error_code MachOObjectFile::getIndirectName(DataRefImpl Symb,
@@ -469,6 +510,9 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const {
       if (Value && Value != UnknownAddressOrSize)
         Result |= SymbolRef::SF_Common;
     }
+
+    if (!(MachOType & MachO::N_PEXT))
+      Result |= SymbolRef::SF_Exported;
   }
 
   if (MachOFlags & (MachO::N_WEAK_REF | MachO::N_WEAK_DEF))
@@ -575,32 +619,11 @@ bool MachOObjectFile::isSectionBSS(DataRefImpl Sec) const {
           SectionType == MachO::S_GB_ZEROFILL);
 }
 
-bool MachOObjectFile::isSectionRequiredForExecution(DataRefImpl Sect) const {
-  // FIXME: Unimplemented.
-  return true;
-}
-
 bool MachOObjectFile::isSectionVirtual(DataRefImpl Sec) const {
   // FIXME: Unimplemented.
   return false;
 }
 
-bool MachOObjectFile::isSectionZeroInit(DataRefImpl Sec) const {
-  uint32_t Flags = getSectionFlags(this, Sec);
-  unsigned SectionType = Flags & MachO::SECTION_TYPE;
-  return SectionType == MachO::S_ZEROFILL ||
-         SectionType == MachO::S_GB_ZEROFILL;
-}
-
-bool MachOObjectFile::isSectionReadOnlyData(DataRefImpl Sec) const {
-  // Consider using the code from isSectionText to look for __const sections.
-  // Alternately, emit S_ATTR_PURE_INSTRUCTIONS and/or S_ATTR_SOME_INSTRUCTIONS
-  // to use section attributes to distinguish code from data.
-
-  // FIXME: Unimplemented.
-  return false;
-}
-
 bool MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec,
                                             DataRefImpl Symb) const {
   SymbolRef::Type ST;
@@ -1213,7 +1236,8 @@ basic_symbol_iterator MachOObjectFile::getSymbolByIndex(unsigned Index) const {
     return basic_symbol_iterator(SymbolRef(DRI, this));
 
   MachO::symtab_command Symtab = getSymtabLoadCommand();
-  assert(Index < Symtab.nsyms && "Requested symbol index is out of range.");
+  if (Index >= Symtab.nsyms)
+    report_fatal_error("Requested symbol index is out of range.");
   unsigned SymbolTableEntrySize =
     is64Bit() ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist);
   DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Symtab.symoff));
@@ -1655,7 +1679,10 @@ void ExportEntry::moveNext() {
 iterator_range<export_iterator> 
 MachOObjectFile::exports(ArrayRef<uint8_t> Trie) {
   ExportEntry Start(Trie);
-  Start.moveToFirst();
+  if (Trie.size() == 0)
+    Start.moveToEnd();
+  else
+    Start.moveToFirst();
 
   ExportEntry Finish(Trie);
   Finish.moveToEnd();
@@ -2114,6 +2141,8 @@ MachOObjectFile::getSectionFinalSegmentName(DataRefImpl Sec) const {
 
 ArrayRef<char>
 MachOObjectFile::getSectionRawName(DataRefImpl Sec) const {
+  if (Sec.d.a >= Sections.size())
+    report_fatal_error("getSectionRawName: Invalid section index");
   const section_base *Base =
     reinterpret_cast<const section_base *>(Sections[Sec.d.a]);
   return makeArrayRef(Base->sectname);
@@ -2121,6 +2150,8 @@ MachOObjectFile::getSectionRawName(DataRefImpl Sec) const {
 
 ArrayRef<char>
 MachOObjectFile::getSectionRawFinalSegmentName(DataRefImpl Sec) const {
+  if (Sec.d.a >= Sections.size())
+    report_fatal_error("getSectionRawFinalSegmentName: Invalid section index");
   const section_base *Base =
     reinterpret_cast<const section_base *>(Sections[Sec.d.a]);
   return makeArrayRef(Base->segname);
@@ -2211,6 +2242,8 @@ MachOObjectFile::getFirstLoadCommandInfo() const {
                                     sizeof(MachO::mach_header);
   Load.Ptr = getPtr(this, HeaderSize);
   Load.C = getStruct<MachO::load_command>(this, Load.Ptr);
+  if (Load.C.cmdsize < 8)
+    report_fatal_error("Load command with size < 8 bytes.");
   return Load;
 }
 
@@ -2219,14 +2252,22 @@ MachOObjectFile::getNextLoadCommandInfo(const LoadCommandInfo &L) const {
   MachOObjectFile::LoadCommandInfo Next;
   Next.Ptr = L.Ptr + L.C.cmdsize;
   Next.C = getStruct<MachO::load_command>(this, Next.Ptr);
+  if (Next.C.cmdsize < 8)
+    report_fatal_error("Load command with size < 8 bytes.");
   return Next;
 }
 
 MachO::section MachOObjectFile::getSection(DataRefImpl DRI) const {
+  // TODO: What if Sections.size() == 0?
+  if (DRI.d.a >= Sections.size())
+    report_fatal_error("getSection: Invalid section index.");
   return getStruct<MachO::section>(this, Sections[DRI.d.a]);
 }
 
 MachO::section_64 MachOObjectFile::getSection64(DataRefImpl DRI) const {
+  // TODO: What if Sections.size() == 0?
+  if (DRI.d.a >= Sections.size())
+    report_fatal_error("getSection64: Invalid section index.");
   return getStruct<MachO::section_64>(this, Sections[DRI.d.a]);
 }
 
@@ -2269,9 +2310,9 @@ MachOObjectFile::getSegment64LoadCommand(const LoadCommandInfo &L) const {
   return getStruct<MachO::segment_command_64>(this, L.Ptr);
 }
 
-MachO::linker_options_command
-MachOObjectFile::getLinkerOptionsLoadCommand(const LoadCommandInfo &L) const {
-  return getStruct<MachO::linker_options_command>(this, L.Ptr);
+MachO::linker_option_command
+MachOObjectFile::getLinkerOptionLoadCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::linker_option_command>(this, L.Ptr);
 }
 
 MachO::version_min_command
@@ -2299,6 +2340,11 @@ MachOObjectFile::getUuidCommand(const LoadCommandInfo &L) const {
   return getStruct<MachO::uuid_command>(this, L.Ptr);
 }
 
+MachO::rpath_command
+MachOObjectFile::getRpathCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::rpath_command>(this, L.Ptr);
+}
+
 MachO::source_version_command
 MachOObjectFile::getSourceVersionCommand(const LoadCommandInfo &L) const {
   return getStruct<MachO::source_version_command>(this, L.Ptr);
@@ -2309,6 +2355,50 @@ MachOObjectFile::getEntryPointCommand(const LoadCommandInfo &L) const {
   return getStruct<MachO::entry_point_command>(this, L.Ptr);
 }
 
+MachO::encryption_info_command
+MachOObjectFile::getEncryptionInfoCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::encryption_info_command>(this, L.Ptr);
+}
+
+MachO::encryption_info_command_64
+MachOObjectFile::getEncryptionInfoCommand64(const LoadCommandInfo &L) const {
+  return getStruct<MachO::encryption_info_command_64>(this, L.Ptr);
+}
+
+MachO::sub_framework_command
+MachOObjectFile::getSubFrameworkCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::sub_framework_command>(this, L.Ptr);
+}
+
+MachO::sub_umbrella_command
+MachOObjectFile::getSubUmbrellaCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::sub_umbrella_command>(this, L.Ptr);
+}
+
+MachO::sub_library_command
+MachOObjectFile::getSubLibraryCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::sub_library_command>(this, L.Ptr);
+}
+
+MachO::sub_client_command
+MachOObjectFile::getSubClientCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::sub_client_command>(this, L.Ptr);
+}
+
+MachO::routines_command
+MachOObjectFile::getRoutinesCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::routines_command>(this, L.Ptr);
+}
+
+MachO::routines_command_64
+MachOObjectFile::getRoutinesCommand64(const LoadCommandInfo &L) const {
+  return getStruct<MachO::routines_command_64>(this, L.Ptr);
+}
+
+MachO::thread_command
+MachOObjectFile::getThreadCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::thread_command>(this, L.Ptr);
+}
 
 MachO::any_relocation_info
 MachOObjectFile::getRelocation(DataRefImpl Rel) const {
@@ -2415,6 +2505,21 @@ MachOObjectFile::getDataInCodeLoadCommand() const {
   return Cmd;
 }
 
+MachO::linkedit_data_command
+MachOObjectFile::getLinkOptHintsLoadCommand() const {
+  if (LinkOptHintsLoadCmd)
+    return getStruct<MachO::linkedit_data_command>(this, LinkOptHintsLoadCmd);
+
+  // If there is no LinkOptHintsLoadCmd return a load command with zero'ed
+  // fields.
+  MachO::linkedit_data_command Cmd;
+  Cmd.cmd = MachO::LC_LINKER_OPTIMIZATION_HINT;
+  Cmd.cmdsize = sizeof(MachO::linkedit_data_command);
+  Cmd.dataoff = 0;
+  Cmd.datasize = 0;
+  return Cmd;
+}
+
 ArrayRef<uint8_t> MachOObjectFile::getDyldInfoRebaseOpcodes() const {
   if (!DyldInfoLoadCmd) 
     return ArrayRef<uint8_t>();
diff --git a/lib/Object/MachOUniversal.cpp b/lib/Object/MachOUniversal.cpp
index 77aeb63..a01c838 100644
--- a/lib/Object/MachOUniversal.cpp
+++ b/lib/Object/MachOUniversal.cpp
@@ -12,9 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Object/MachOUniversal.h"
+#include "llvm/Object/Archive.h"
 #include "llvm/Object/MachO.h"
 #include "llvm/Object/ObjectFile.h"
-#include "llvm/Object/Archive.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -79,20 +79,16 @@ MachOUniversalBinary::ObjectForArch::getAsObjectFile() const {
   return object_error::parse_failed;
 }
 
-std::error_code MachOUniversalBinary::ObjectForArch::getAsArchive(
-    std::unique_ptr<Archive> &Result) const {
-  if (Parent) {
-    StringRef ParentData = Parent->getData();
-    StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
-    StringRef ObjectName = Parent->getFileName();
-    MemoryBufferRef ObjBuffer(ObjectData, ObjectName);
-    ErrorOr<std::unique_ptr<Archive>> Obj = Archive::create(ObjBuffer);
-    if (std::error_code EC = Obj.getError())
-      return EC;
-    Result = std::move(Obj.get());
-    return object_error::success;
-  }
-  return object_error::parse_failed;
+ErrorOr<std::unique_ptr<Archive>>
+MachOUniversalBinary::ObjectForArch::getAsArchive() const {
+  if (!Parent)
+    return object_error::parse_failed;
+
+  StringRef ParentData = Parent->getData();
+  StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
+  StringRef ObjectName = Parent->getFileName();
+  MemoryBufferRef ObjBuffer(ObjectData, ObjectName);
+  return Archive::create(ObjBuffer);
 }
 
 void MachOUniversalBinary::anchor() { }
diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index fd78271..01b7654 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp
@@ -76,6 +76,7 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, sys::fs::file_magic Type) {
   case sys::fs::file_magic::macho_bundle:
   case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
   case sys::fs::file_magic::macho_dsym_companion:
+  case sys::fs::file_magic::macho_kext_bundle:
     return createMachOObjectFile(Object);
   case sys::fs::file_magic::coff_object:
   case sys::fs::file_magic::coff_import_library:
diff --git a/lib/Object/SymbolicFile.cpp b/lib/Object/SymbolicFile.cpp
index ffd3dbc..854e68e 100644
--- a/lib/Object/SymbolicFile.cpp
+++ b/lib/Object/SymbolicFile.cpp
@@ -33,7 +33,7 @@ ErrorOr<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile(
   switch (Type) {
   case sys::fs::file_magic::bitcode:
     if (Context)
-      return IRObjectFile::createIRObjectFile(Object, *Context);
+      return IRObjectFile::create(Object, *Context);
   // Fallthrough
   case sys::fs::file_magic::unknown:
   case sys::fs::file_magic::archive:
@@ -53,6 +53,7 @@ ErrorOr<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile(
   case sys::fs::file_magic::macho_bundle:
   case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
   case sys::fs::file_magic::macho_dsym_companion:
+  case sys::fs::file_magic::macho_kext_bundle:
   case sys::fs::file_magic::coff_import_library:
   case sys::fs::file_magic::pecoff_executable:
     return ObjectFile::createObjectFile(Object, Type);
@@ -69,7 +70,7 @@ ErrorOr<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile(
     if (!BCData)
       return std::move(Obj);
 
-    return IRObjectFile::createIRObjectFile(
+    return IRObjectFile::create(
         MemoryBufferRef(BCData->getBuffer(), Object.getBufferIdentifier()),
         *Context);
   }
diff --git a/lib/Option/Arg.cpp b/lib/Option/Arg.cpp
index 4c8da58..af632d6 100644
--- a/lib/Option/Arg.cpp
+++ b/lib/Option/Arg.cpp
@@ -83,15 +83,13 @@ void Arg::renderAsInput(const ArgList &Args, ArgStringList &Output) const {
     return;
   }
 
-  for (unsigned i = 0, e = getNumValues(); i != e; ++i)
-    Output.push_back(getValue(i));
+  Output.append(Values.begin(), Values.end());
 }
 
 void Arg::render(const ArgList &Args, ArgStringList &Output) const {
   switch (getOption().getRenderStyle()) {
   case Option::RenderValuesStyle:
-    for (unsigned i = 0, e = getNumValues(); i != e; ++i)
-      Output.push_back(getValue(i));
+    Output.append(Values.begin(), Values.end());
     break;
 
   case Option::RenderCommaJoinedStyle: {
@@ -109,14 +107,12 @@ void Arg::render(const ArgList &Args, ArgStringList &Output) const {
  case Option::RenderJoinedStyle:
     Output.push_back(Args.GetOrMakeJoinedArgString(
                        getIndex(), getSpelling(), getValue(0)));
-    for (unsigned i = 1, e = getNumValues(); i != e; ++i)
-      Output.push_back(getValue(i));
+    Output.append(Values.begin() + 1, Values.end());
     break;
 
   case Option::RenderSeparateStyle:
     Output.push_back(Args.MakeArgString(getSpelling()));
-    for (unsigned i = 0, e = getNumValues(); i != e; ++i)
-      Output.push_back(getValue(i));
+    Output.append(Values.begin(), Values.end());
     break;
   }
 }
diff --git a/lib/Option/ArgList.cpp b/lib/Option/ArgList.cpp
index 041e552..85e956f 100644
--- a/lib/Option/ArgList.cpp
+++ b/lib/Option/ArgList.cpp
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Option/ArgList.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/Option.h"
@@ -253,8 +253,8 @@ void ArgList::AddAllArgValues(ArgStringList &Output, OptSpecifier Id0,
                               OptSpecifier Id1, OptSpecifier Id2) const {
   for (auto Arg : filtered(Id0, Id1, Id2)) {
     Arg->claim();
-    for (unsigned i = 0, e = Arg->getNumValues(); i != e; ++i)
-      Output.push_back(Arg->getValue(i));
+    const auto &Values = Arg->getValues();
+    Output.append(Values.begin(), Values.end());
   }
 }
 
diff --git a/lib/Option/CMakeLists.txt b/lib/Option/CMakeLists.txt
index 1cd7d3a..8eb0860 100644
--- a/lib/Option/CMakeLists.txt
+++ b/lib/Option/CMakeLists.txt
@@ -3,4 +3,7 @@ add_llvm_library(LLVMOption
   ArgList.cpp
   Option.cpp
   OptTable.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Option
   )
diff --git a/lib/ProfileData/CMakeLists.txt b/lib/ProfileData/CMakeLists.txt
index b9d472d..282760f 100644
--- a/lib/ProfileData/CMakeLists.txt
+++ b/lib/ProfileData/CMakeLists.txt
@@ -8,4 +8,7 @@ add_llvm_library(LLVMProfileData
   SampleProf.cpp
   SampleProfReader.cpp
   SampleProfWriter.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/ProfileData
   )
diff --git a/lib/ProfileData/CoverageMapping.cpp b/lib/ProfileData/CoverageMapping.cpp
index 0ccebc2..31213d7 100644
--- a/lib/ProfileData/CoverageMapping.cpp
+++ b/lib/ProfileData/CoverageMapping.cpp
@@ -13,10 +13,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ProfileData/CoverageMapping.h"
-
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ProfileData/CoverageMappingReader.h"
 #include "llvm/ProfileData/InstrProfReader.h"
 #include "llvm/Support/Debug.h"
@@ -179,31 +178,32 @@ void FunctionRecordIterator::skipOtherFiles() {
 }
 
 ErrorOr<std::unique_ptr<CoverageMapping>>
-CoverageMapping::load(ObjectFileCoverageMappingReader &CoverageReader,
+CoverageMapping::load(CoverageMappingReader &CoverageReader,
                       IndexedInstrProfReader &ProfileReader) {
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
 
   std::vector<uint64_t> Counts;
   for (const auto &Record : CoverageReader) {
+    CounterMappingContext Ctx(Record.Expressions);
+
     Counts.clear();
     if (std::error_code EC = ProfileReader.getFunctionCounts(
             Record.FunctionName, Record.FunctionHash, Counts)) {
-      if (EC != instrprof_error::hash_mismatch &&
-          EC != instrprof_error::unknown_function)
+      if (EC == instrprof_error::hash_mismatch) {
+        Coverage->MismatchedFunctionCount++;
+        continue;
+      } else if (EC != instrprof_error::unknown_function)
         return EC;
-      Coverage->MismatchedFunctionCount++;
-      continue;
-    }
+    } else
+      Ctx.setCounts(Counts);
 
-    assert(Counts.size() != 0 && "Function's counts are empty");
-    FunctionRecord Function(Record.FunctionName, Record.Filenames,
-                            Counts.front());
-    CounterMappingContext Ctx(Record.Expressions, Counts);
+    assert(!Record.MappingRegions.empty() && "Function has no regions");
+    FunctionRecord Function(Record.FunctionName, Record.Filenames);
     for (const auto &Region : Record.MappingRegions) {
       ErrorOr<int64_t> ExecutionCount = Ctx.evaluate(Region.Count);
       if (!ExecutionCount)
         break;
-      Function.CountedRegions.push_back(CountedRegion(Region, *ExecutionCount));
+      Function.pushRegion(Region, *ExecutionCount);
     }
     if (Function.CountedRegions.size() != Record.MappingRegions.size()) {
       Coverage->MismatchedFunctionCount++;
@@ -219,15 +219,18 @@ CoverageMapping::load(ObjectFileCoverageMappingReader &CoverageReader,
 ErrorOr<std::unique_ptr<CoverageMapping>>
 CoverageMapping::load(StringRef ObjectFilename, StringRef ProfileFilename) {
   auto CounterMappingBuff = MemoryBuffer::getFileOrSTDIN(ObjectFilename);
-  if (auto EC = CounterMappingBuff.getError())
+  if (std::error_code EC = CounterMappingBuff.getError())
     return EC;
-  ObjectFileCoverageMappingReader CoverageReader(CounterMappingBuff.get());
-  if (auto EC = CoverageReader.readHeader())
+  auto CoverageReaderOrErr =
+      BinaryCoverageReader::create(CounterMappingBuff.get());
+  if (std::error_code EC = CoverageReaderOrErr.getError())
     return EC;
-  std::unique_ptr<IndexedInstrProfReader> ProfileReader;
-  if (auto EC = IndexedInstrProfReader::create(ProfileFilename, ProfileReader))
+  auto CoverageReader = std::move(CoverageReaderOrErr.get());
+  auto ProfileReaderOrErr = IndexedInstrProfReader::create(ProfileFilename);
+  if (auto EC = ProfileReaderOrErr.getError())
     return EC;
-  return load(CoverageReader, *ProfileReader);
+  auto ProfileReader = std::move(ProfileReaderOrErr.get());
+  return load(*CoverageReader, *ProfileReader);
 }
 
 namespace {
@@ -305,20 +308,22 @@ class SegmentBuilder {
 public:
   /// Build a list of CoverageSegments from a sorted list of Regions.
   std::vector<CoverageSegment> buildSegments(ArrayRef<CountedRegion> Regions) {
+    const CountedRegion *PrevRegion = nullptr;
     for (const auto &Region : Regions) {
       // Pop any regions that end before this one starts.
       while (!ActiveRegions.empty() &&
              ActiveRegions.back()->endLoc() <= Region.startLoc())
         popRegion();
-      if (Segments.size() && Segments.back().Line == Region.LineStart &&
-          Segments.back().Col == Region.ColumnStart) {
-        if (Region.Kind != coverage::CounterMappingRegion::SkippedRegion)
+      if (PrevRegion && PrevRegion->startLoc() == Region.startLoc() &&
+          PrevRegion->endLoc() == Region.endLoc()) {
+        if (Region.Kind == coverage::CounterMappingRegion::CodeRegion)
           Segments.back().addCount(Region.ExecutionCount);
       } else {
         // Add this region to the stack.
         ActiveRegions.push_back(&Region);
         startSegment(Region);
       }
+      PrevRegion = &Region;
     }
     // Pop any regions that are left in the stack.
     while (!ActiveRegions.empty())
@@ -331,50 +336,47 @@ public:
 std::vector<StringRef> CoverageMapping::getUniqueSourceFiles() const {
   std::vector<StringRef> Filenames;
   for (const auto &Function : getCoveredFunctions())
-    for (const auto &Filename : Function.Filenames)
-      Filenames.push_back(Filename);
+    Filenames.insert(Filenames.end(), Function.Filenames.begin(),
+                     Function.Filenames.end());
   std::sort(Filenames.begin(), Filenames.end());
   auto Last = std::unique(Filenames.begin(), Filenames.end());
   Filenames.erase(Last, Filenames.end());
   return Filenames;
 }
 
-static Optional<unsigned> findMainViewFileID(StringRef SourceFile,
-                                             const FunctionRecord &Function) {
-  llvm::SmallVector<bool, 8> IsExpandedFile(Function.Filenames.size(), false);
-  llvm::SmallVector<bool, 8> FilenameEquivalence(Function.Filenames.size(),
-                                                 false);
+static SmallBitVector gatherFileIDs(StringRef SourceFile,
+                                    const FunctionRecord &Function) {
+  SmallBitVector FilenameEquivalence(Function.Filenames.size(), false);
   for (unsigned I = 0, E = Function.Filenames.size(); I < E; ++I)
     if (SourceFile == Function.Filenames[I])
       FilenameEquivalence[I] = true;
+  return FilenameEquivalence;
+}
+
+static Optional<unsigned> findMainViewFileID(StringRef SourceFile,
+                                             const FunctionRecord &Function) {
+  SmallBitVector IsNotExpandedFile(Function.Filenames.size(), true);
+  SmallBitVector FilenameEquivalence = gatherFileIDs(SourceFile, Function);
   for (const auto &CR : Function.CountedRegions)
     if (CR.Kind == CounterMappingRegion::ExpansionRegion &&
         FilenameEquivalence[CR.FileID])
-      IsExpandedFile[CR.ExpandedFileID] = true;
-  for (unsigned I = 0, E = Function.Filenames.size(); I < E; ++I)
-    if (FilenameEquivalence[I] && !IsExpandedFile[I])
-      return I;
-  return None;
+      IsNotExpandedFile[CR.ExpandedFileID] = false;
+  IsNotExpandedFile &= FilenameEquivalence;
+  int I = IsNotExpandedFile.find_first();
+  if (I == -1)
+    return None;
+  return I;
 }
 
 static Optional<unsigned> findMainViewFileID(const FunctionRecord &Function) {
-  llvm::SmallVector<bool, 8> IsExpandedFile(Function.Filenames.size(), false);
+  SmallBitVector IsNotExpandedFile(Function.Filenames.size(), true);
   for (const auto &CR : Function.CountedRegions)
     if (CR.Kind == CounterMappingRegion::ExpansionRegion)
-      IsExpandedFile[CR.ExpandedFileID] = true;
-  for (unsigned I = 0, E = Function.Filenames.size(); I < E; ++I)
-    if (!IsExpandedFile[I])
-      return I;
-  return None;
-}
-
-static SmallSet<unsigned, 8> gatherFileIDs(StringRef SourceFile,
-                                           const FunctionRecord &Function) {
-  SmallSet<unsigned, 8> IDs;
-  for (unsigned I = 0, E = Function.Filenames.size(); I < E; ++I)
-    if (SourceFile == Function.Filenames[I])
-      IDs.insert(I);
-  return IDs;
+      IsNotExpandedFile[CR.ExpandedFileID] = false;
+  int I = IsNotExpandedFile.find_first();
+  if (I == -1)
+    return None;
+  return I;
 }
 
 /// Sort a nested sequence of regions from a single file.
@@ -402,7 +404,7 @@ CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) {
       continue;
     auto FileIDs = gatherFileIDs(Filename, Function);
     for (const auto &CR : Function.CountedRegions)
-      if (FileIDs.count(CR.FileID)) {
+      if (FileIDs.test(CR.FileID)) {
         Regions.push_back(CR);
         if (isExpansion(CR, *MainFileID))
           FileCoverage.Expansions.emplace_back(CR, Function);
@@ -410,6 +412,7 @@ CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) {
   }
 
   sortNestedRegions(Regions.begin(), Regions.end());
+  DEBUG(dbgs() << "Emitting segments for file: " << Filename << "\n");
   FileCoverage.Segments = SegmentBuilder().buildSegments(Regions);
 
   return FileCoverage;
@@ -429,8 +432,8 @@ CoverageMapping::getInstantiations(StringRef Filename) {
   for (const auto &InstantiationSet : InstantiationSetCollector) {
     if (InstantiationSet.second.size() < 2)
       continue;
-    for (auto Function : InstantiationSet.second)
-      Result.push_back(Function);
+    Result.insert(Result.end(), InstantiationSet.second.begin(),
+                  InstantiationSet.second.end());
   }
   return Result;
 }
@@ -451,6 +454,7 @@ CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) {
     }
 
   sortNestedRegions(Regions.begin(), Regions.end());
+  DEBUG(dbgs() << "Emitting segments for function: " << Function.Name << "\n");
   FunctionCoverage.Segments = SegmentBuilder().buildSegments(Regions);
 
   return FunctionCoverage;
@@ -469,6 +473,8 @@ CoverageMapping::getCoverageForExpansion(const ExpansionRecord &Expansion) {
     }
 
   sortNestedRegions(Regions.begin(), Regions.end());
+  DEBUG(dbgs() << "Emitting segments for expansion of file " << Expansion.FileID
+               << "\n");
   ExpansionCoverage.Segments = SegmentBuilder().buildSegments(Regions);
 
   return ExpansionCoverage;
diff --git a/lib/ProfileData/CoverageMappingReader.cpp b/lib/ProfileData/CoverageMappingReader.cpp
index 6476d28..d32f1da 100644
--- a/lib/ProfileData/CoverageMappingReader.cpp
+++ b/lib/ProfileData/CoverageMappingReader.cpp
@@ -173,15 +173,12 @@ std::error_code RawCoverageMappingReader::readMappingRegionsSubArray(
     }
 
     // Read the source range.
-    uint64_t LineStartDelta, CodeBeforeColumnStart, NumLines, ColumnEnd;
+    uint64_t LineStartDelta, ColumnStart, NumLines, ColumnEnd;
     if (auto Err =
             readIntMax(LineStartDelta, std::numeric_limits<unsigned>::max()))
       return Err;
-    if (auto Err = readULEB128(CodeBeforeColumnStart))
+    if (auto Err = readULEB128(ColumnStart))
       return Err;
-    bool HasCodeBefore = CodeBeforeColumnStart & 1;
-    uint64_t ColumnStart = CodeBeforeColumnStart >>
-                           CounterMappingRegion::EncodingHasCodeBeforeBits;
     if (ColumnStart > std::numeric_limits<unsigned>::max())
       return error(instrprof_error::malformed);
     if (auto Err = readIntMax(NumLines, std::numeric_limits<unsigned>::max()))
@@ -214,14 +211,13 @@ std::error_code RawCoverageMappingReader::readMappingRegionsSubArray(
     });
 
     MappingRegions.push_back(CounterMappingRegion(
-        C, InferredFileID, LineStart, ColumnStart, LineStart + NumLines,
-        ColumnEnd, HasCodeBefore, Kind));
-    MappingRegions.back().ExpandedFileID = ExpandedFileID;
+        C, InferredFileID, ExpandedFileID, LineStart, ColumnStart,
+        LineStart + NumLines, ColumnEnd, Kind));
   }
   return success();
 }
 
-std::error_code RawCoverageMappingReader::read(CoverageMappingRecord &Record) {
+std::error_code RawCoverageMappingReader::read() {
 
   // Read the virtual file mapping.
   llvm::SmallVector<unsigned, 8> VirtualFileMapping;
@@ -287,23 +283,9 @@ std::error_code RawCoverageMappingReader::read(CoverageMappingRecord &Record) {
     }
   }
 
-  Record.FunctionName = FunctionName;
-  Record.Filenames = Filenames;
-  Record.Expressions = Expressions;
-  Record.MappingRegions = MappingRegions;
   return success();
 }
 
-ObjectFileCoverageMappingReader::ObjectFileCoverageMappingReader(
-    StringRef FileName)
-    : CurrentRecord(0) {
-  auto File = llvm::object::ObjectFile::createObjectFile(FileName);
-  if (!File)
-    error(File.getError());
-  else
-    Object = std::move(File.get());
-}
-
 namespace {
 /// \brief The coverage mapping data for a single function.
 /// It points to the function's name.
@@ -352,7 +334,7 @@ struct SectionData {
 template <typename T>
 std::error_code readCoverageMappingData(
     SectionData &ProfileNames, StringRef Data,
-    std::vector<ObjectFileCoverageMappingReader::ProfileMappingRecord> &Records,
+    std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records,
     std::vector<StringRef> &Filenames) {
   llvm::DenseSet<T> UniqueFunctionMappingData;
 
@@ -418,7 +400,7 @@ std::error_code readCoverageMappingData(
               ProfileNames.get(MappingRecord.FunctionNamePtr,
                                MappingRecord.FunctionNameSize, FunctionName))
         return Err;
-      Records.push_back(ObjectFileCoverageMappingReader::ProfileMappingRecord(
+      Records.push_back(BinaryCoverageReader::ProfileMappingRecord(
           Version, FunctionName, MappingRecord.FunctionHash, Mapping,
           FilenamesBegin, Filenames.size() - FilenamesBegin));
     }
@@ -429,9 +411,12 @@ std::error_code readCoverageMappingData(
 
 static const char *TestingFormatMagic = "llvmcovmtestdata";
 
-static std::error_code decodeTestingFormat(StringRef Data,
-                                           SectionData &ProfileNames,
-                                           StringRef &CoverageMapping) {
+static std::error_code loadTestingFormat(StringRef Data,
+                                         SectionData &ProfileNames,
+                                         StringRef &CoverageMapping,
+                                         uint8_t &BytesInAddress) {
+  BytesInAddress = 8;
+
   Data = Data.substr(StringRef(TestingFormatMagic).size());
   if (Data.size() < 1)
     return instrprof_error::truncated;
@@ -456,98 +441,96 @@ static std::error_code decodeTestingFormat(StringRef Data,
   return instrprof_error::success;
 }
 
-ObjectFileCoverageMappingReader::ObjectFileCoverageMappingReader(
-    std::unique_ptr<MemoryBuffer> &ObjectBuffer, sys::fs::file_magic Type)
-    : CurrentRecord(0) {
-  if (ObjectBuffer->getBuffer().startswith(TestingFormatMagic)) {
-    // This is a special format used for testing.
-    SectionData ProfileNames;
-    StringRef CoverageMapping;
-    if (auto Err = decodeTestingFormat(ObjectBuffer->getBuffer(), ProfileNames,
-                                       CoverageMapping)) {
-      error(Err);
-      return;
-    }
-    error(readCoverageMappingData<uint64_t>(ProfileNames, CoverageMapping,
-                                            MappingRecords, Filenames));
-    Object = OwningBinary<ObjectFile>(std::unique_ptr<ObjectFile>(),
-                                      std::move(ObjectBuffer));
-    return;
-  }
-
-  auto File = object::ObjectFile::createObjectFile(
-      ObjectBuffer->getMemBufferRef(), Type);
-  if (!File)
-    error(File.getError());
-  else
-    Object = OwningBinary<ObjectFile>(std::move(File.get()),
-                                      std::move(ObjectBuffer));
-}
-
-std::error_code ObjectFileCoverageMappingReader::readHeader() {
-  const ObjectFile *OF = Object.getBinary();
-  if (!OF)
-    return getError();
-  auto BytesInAddress = OF->getBytesInAddress();
-  if (BytesInAddress != 4 && BytesInAddress != 8)
-    return error(instrprof_error::malformed);
+static std::error_code loadBinaryFormat(MemoryBufferRef ObjectBuffer,
+                                        SectionData &ProfileNames,
+                                        StringRef &CoverageMapping,
+                                        uint8_t &BytesInAddress) {
+  auto ObjectFileOrErr = object::ObjectFile::createObjectFile(ObjectBuffer);
+  if (std::error_code EC = ObjectFileOrErr.getError())
+    return EC;
+  auto OF = std::move(ObjectFileOrErr.get());
+  BytesInAddress = OF->getBytesInAddress();
 
   // Look for the sections that we are interested in.
   int FoundSectionCount = 0;
-  SectionRef ProfileNames, CoverageMapping;
+  SectionRef NamesSection, CoverageSection;
   for (const auto &Section : OF->sections()) {
     StringRef Name;
     if (auto Err = Section.getName(Name))
       return Err;
     if (Name == "__llvm_prf_names") {
-      ProfileNames = Section;
+      NamesSection = Section;
     } else if (Name == "__llvm_covmap") {
-      CoverageMapping = Section;
+      CoverageSection = Section;
     } else
       continue;
     ++FoundSectionCount;
   }
   if (FoundSectionCount != 2)
-    return error(instrprof_error::bad_header);
+    return instrprof_error::bad_header;
 
   // Get the contents of the given sections.
-  StringRef Data;
-  if (auto Err = CoverageMapping.getContents(Data))
-    return Err;
-  SectionData ProfileNamesData;
-  if (auto Err = ProfileNamesData.load(ProfileNames))
-    return Err;
+  if (std::error_code EC = CoverageSection.getContents(CoverageMapping))
+    return EC;
+  if (std::error_code EC = ProfileNames.load(NamesSection))
+    return EC;
 
-  // Load the data from the found sections.
-  std::error_code Err;
-  if (BytesInAddress == 4)
-    Err = readCoverageMappingData<uint32_t>(ProfileNamesData, Data,
-                                            MappingRecords, Filenames);
+  return std::error_code();
+}
+
+ErrorOr<std::unique_ptr<BinaryCoverageReader>>
+BinaryCoverageReader::create(std::unique_ptr<MemoryBuffer> &ObjectBuffer) {
+  std::unique_ptr<BinaryCoverageReader> Reader(new BinaryCoverageReader());
+
+  SectionData Profile;
+  StringRef Coverage;
+  uint8_t BytesInAddress;
+  std::error_code EC;
+  if (ObjectBuffer->getBuffer().startswith(TestingFormatMagic))
+    // This is a special format used for testing.
+    EC = loadTestingFormat(ObjectBuffer->getBuffer(), Profile, Coverage,
+                           BytesInAddress);
   else
-    Err = readCoverageMappingData<uint64_t>(ProfileNamesData, Data,
-                                            MappingRecords, Filenames);
-  if (Err)
-    return error(Err);
+    EC = loadBinaryFormat(ObjectBuffer->getMemBufferRef(), Profile, Coverage,
+                          BytesInAddress);
+  if (EC)
+    return EC;
 
-  return success();
+  if (BytesInAddress == 4)
+    EC = readCoverageMappingData<uint32_t>(
+        Profile, Coverage, Reader->MappingRecords, Reader->Filenames);
+  else if (BytesInAddress == 8)
+    EC = readCoverageMappingData<uint64_t>(
+        Profile, Coverage, Reader->MappingRecords, Reader->Filenames);
+  else
+    return instrprof_error::malformed;
+  if (EC)
+    return EC;
+  return std::move(Reader);
 }
 
 std::error_code
-ObjectFileCoverageMappingReader::readNextRecord(CoverageMappingRecord &Record) {
+BinaryCoverageReader::readNextRecord(CoverageMappingRecord &Record) {
   if (CurrentRecord >= MappingRecords.size())
-    return error(instrprof_error::eof);
+    return instrprof_error::eof;
 
   FunctionsFilenames.clear();
   Expressions.clear();
   MappingRegions.clear();
   auto &R = MappingRecords[CurrentRecord];
   RawCoverageMappingReader Reader(
-      R.FunctionName, R.CoverageMapping,
-      makeArrayRef(Filenames.data() + R.FilenamesBegin, R.FilenamesSize),
+      R.CoverageMapping,
+      makeArrayRef(Filenames).slice(R.FilenamesBegin, R.FilenamesSize),
       FunctionsFilenames, Expressions, MappingRegions);
-  if (auto Err = Reader.read(Record))
+  if (auto Err = Reader.read())
     return Err;
+
+  Record.FunctionName = R.FunctionName;
   Record.FunctionHash = R.FunctionHash;
+  Record.Filenames = FunctionsFilenames;
+  Record.Expressions = Expressions;
+  Record.MappingRegions = MappingRegions;
+
   ++CurrentRecord;
-  return success();
+  return std::error_code();
 }
diff --git a/lib/ProfileData/CoverageMappingWriter.cpp b/lib/ProfileData/CoverageMappingWriter.cpp
index 6969c2a..d90d2f5 100644
--- a/lib/ProfileData/CoverageMappingWriter.cpp
+++ b/lib/ProfileData/CoverageMappingWriter.cpp
@@ -109,7 +109,7 @@ static void writeCounter(ArrayRef<CounterExpression> Expressions, Counter C,
 void CoverageMappingWriter::write(raw_ostream &OS) {
   // Sort the regions in an ascending order by the file id and the starting
   // location.
-  std::sort(MappingRegions.begin(), MappingRegions.end());
+  std::stable_sort(MappingRegions.begin(), MappingRegions.end());
 
   // Write out the fileid -> filename mapping.
   encodeULEB128(VirtualFileMapping.size(), OS);
@@ -172,11 +172,7 @@ void CoverageMappingWriter::write(raw_ostream &OS) {
     }
     assert(I->LineStart >= PrevLineStart);
     encodeULEB128(I->LineStart - PrevLineStart, OS);
-    uint64_t CodeBeforeColumnStart =
-        uint64_t(I->HasCodeBefore) |
-        (uint64_t(I->ColumnStart)
-         << CounterMappingRegion::EncodingHasCodeBeforeBits);
-    encodeULEB128(CodeBeforeColumnStart, OS);
+    encodeULEB128(I->ColumnStart, OS);
     assert(I->LineEnd >= I->LineStart);
     encodeULEB128(I->LineEnd - I->LineStart, OS);
     encodeULEB128(I->ColumnEnd, OS);
diff --git a/lib/ProfileData/InstrProfIndexed.h b/lib/ProfileData/InstrProfIndexed.h
index c2bc46c..ebca7b2 100644
--- a/lib/ProfileData/InstrProfIndexed.h
+++ b/lib/ProfileData/InstrProfIndexed.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_PROFILEDATA_INSTRPROFINDEXED_H
 #define LLVM_LIB_PROFILEDATA_INSTRPROFINDEXED_H
 
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MD5.h"
 
diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp
index 0160a64..01e199d 100644
--- a/lib/ProfileData/InstrProfReader.cpp
+++ b/lib/ProfileData/InstrProfReader.cpp
@@ -13,10 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ProfileData/InstrProfReader.h"
-#include "llvm/ProfileData/InstrProf.h"
-
 #include "InstrProfIndexed.h"
-
+#include "llvm/ProfileData/InstrProf.h"
 #include <cassert>
 
 using namespace llvm;
@@ -27,12 +25,7 @@ setupMemoryBuffer(std::string Path) {
       MemoryBuffer::getFileOrSTDIN(Path);
   if (std::error_code EC = BufferOrErr.getError())
     return EC;
-  auto Buffer = std::move(BufferOrErr.get());
-
-  // Sanity check the file.
-  if (Buffer->getBufferSize() > std::numeric_limits<unsigned>::max())
-    return instrprof_error::too_large;
-  return std::move(Buffer);
+  return std::move(BufferOrErr.get());
 }
 
 static std::error_code initializeReader(InstrProfReader &Reader) {
@@ -45,10 +38,16 @@ InstrProfReader::create(std::string Path) {
   auto BufferOrError = setupMemoryBuffer(Path);
   if (std::error_code EC = BufferOrError.getError())
     return EC;
+  return InstrProfReader::create(std::move(BufferOrError.get()));
+}
 
-  auto Buffer = std::move(BufferOrError.get());
-  std::unique_ptr<InstrProfReader> Result;
+ErrorOr<std::unique_ptr<InstrProfReader>>
+InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
+  // Sanity check the buffer.
+  if (Buffer->getBufferSize() > std::numeric_limits<unsigned>::max())
+    return instrprof_error::too_large;
 
+  std::unique_ptr<InstrProfReader> Result;
   // Create the reader.
   if (IndexedInstrProfReader::hasFormat(*Buffer))
     Result.reset(new IndexedInstrProfReader(std::move(Buffer)));
@@ -66,21 +65,32 @@ InstrProfReader::create(std::string Path) {
   return std::move(Result);
 }
 
-std::error_code IndexedInstrProfReader::create(
-    std::string Path, std::unique_ptr<IndexedInstrProfReader> &Result) {
+ErrorOr<std::unique_ptr<IndexedInstrProfReader>>
+IndexedInstrProfReader::create(std::string Path) {
   // Set up the buffer to read.
   auto BufferOrError = setupMemoryBuffer(Path);
   if (std::error_code EC = BufferOrError.getError())
     return EC;
+  return IndexedInstrProfReader::create(std::move(BufferOrError.get()));
+}
+
+
+ErrorOr<std::unique_ptr<IndexedInstrProfReader>>
+IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
+  // Sanity check the buffer.
+  if (Buffer->getBufferSize() > std::numeric_limits<unsigned>::max())
+    return instrprof_error::too_large;
 
-  auto Buffer = std::move(BufferOrError.get());
   // Create the reader.
   if (!IndexedInstrProfReader::hasFormat(*Buffer))
     return instrprof_error::bad_magic;
-  Result.reset(new IndexedInstrProfReader(std::move(Buffer)));
+  auto Result = llvm::make_unique<IndexedInstrProfReader>(std::move(Buffer));
 
   // Initialize the reader and return the result.
-  return initializeReader(*Result);
+  if (std::error_code EC = initializeReader(*Result))
+    return EC;
+
+  return std::move(Result);
 }
 
 void InstrProfIterator::Increment() {
diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp
index ad1b876..2188543 100644
--- a/lib/ProfileData/InstrProfWriter.cpp
+++ b/lib/ProfileData/InstrProfWriter.cpp
@@ -13,12 +13,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ProfileData/InstrProfWriter.h"
+#include "InstrProfIndexed.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/OnDiskHashTable.h"
 
-#include "InstrProfIndexed.h"
-
 using namespace llvm;
 
 namespace {
@@ -107,7 +106,7 @@ InstrProfWriter::addFunctionCounts(StringRef FunctionName,
   return instrprof_error::success;
 }
 
-void InstrProfWriter::write(raw_fd_ostream &OS) {
+std::pair<uint64_t, uint64_t> InstrProfWriter::writeImpl(raw_ostream &OS) {
   OnDiskChainedHashTableGenerator<InstrProfRecordTrait> Generator;
 
   // Populate the hash table generator.
@@ -129,7 +128,32 @@ void InstrProfWriter::write(raw_fd_ostream &OS) {
   // Write the hash table.
   uint64_t HashTableStart = Generator.Emit(OS);
 
+  return std::make_pair(HashTableStartLoc, HashTableStart);
+}
+
+void InstrProfWriter::write(raw_fd_ostream &OS) {
+  // Write the hash table.
+  auto TableStart = writeImpl(OS);
+
   // Go back and fill in the hash table start.
-  OS.seek(HashTableStartLoc);
-  LE.write<uint64_t>(HashTableStart);
+  using namespace support;
+  OS.seek(TableStart.first);
+  endian::Writer<little>(OS).write<uint64_t>(TableStart.second);
+}
+
+std::unique_ptr<MemoryBuffer> InstrProfWriter::writeBuffer() {
+  std::string Data;
+  llvm::raw_string_ostream OS(Data);
+  // Write the hash table.
+  auto TableStart = writeImpl(OS);
+  OS.flush();
+
+  // Go back and fill in the hash table start.
+  using namespace support;
+  uint64_t Bytes = endian::byte_swap<uint64_t, little>(TableStart.second);
+  Data.replace(TableStart.first, sizeof(uint64_t), (const char *)&Bytes,
+               sizeof(uint64_t));
+
+  // Return this in an aligned memory buffer.
+  return MemoryBuffer::getMemBufferCopy(Data);
 }
diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp
index 8525045..c95267a 100644
--- a/lib/ProfileData/SampleProfWriter.cpp
+++ b/lib/ProfileData/SampleProfWriter.cpp
@@ -21,9 +21,9 @@
 #include "llvm/ProfileData/SampleProfWriter.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Regex.h"
 
 using namespace llvm::sampleprof;
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 295b16c..393ecf4 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -1823,7 +1823,7 @@ APFloat::fusedMultiplyAdd(const APFloat &multiplicand,
     /* If two numbers add (exactly) to zero, IEEE 754 decrees it is a
        positive zero unless rounding to minus infinity, except that
        adding two like-signed zeroes gives that zero.  */
-    if (category == fcZero && sign != addend.sign)
+    if (category == fcZero && !(fs & opUnderflow) && sign != addend.sign)
       sign = (rounding_mode == rmTowardNegative);
   } else {
     fs = multiplySpecials(multiplicand);
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index c20eeb2..50a639c 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -713,7 +713,7 @@ unsigned APInt::countLeadingZerosSlowCase() const {
 
 unsigned APInt::countLeadingOnes() const {
   if (isSingleWord())
-    return CountLeadingOnes_64(VAL << (APINT_BITS_PER_WORD - BitWidth));
+    return llvm::countLeadingOnes(VAL << (APINT_BITS_PER_WORD - BitWidth));
 
   unsigned highWordBits = BitWidth % APINT_BITS_PER_WORD;
   unsigned shift;
@@ -724,13 +724,13 @@ unsigned APInt::countLeadingOnes() const {
     shift = APINT_BITS_PER_WORD - highWordBits;
   }
   int i = getNumWords() - 1;
-  unsigned Count = CountLeadingOnes_64(pVal[i] << shift);
+  unsigned Count = llvm::countLeadingOnes(pVal[i] << shift);
   if (Count == highWordBits) {
     for (i--; i >= 0; --i) {
       if (pVal[i] == -1ULL)
         Count += APINT_BITS_PER_WORD;
       else {
-        Count += CountLeadingOnes_64(pVal[i]);
+        Count += llvm::countLeadingOnes(pVal[i]);
         break;
       }
     }
@@ -756,14 +756,14 @@ unsigned APInt::countTrailingOnesSlowCase() const {
   for (; i < getNumWords() && pVal[i] == -1ULL; ++i)
     Count += APINT_BITS_PER_WORD;
   if (i < getNumWords())
-    Count += CountTrailingOnes_64(pVal[i]);
+    Count += llvm::countTrailingOnes(pVal[i]);
   return std::min(Count, BitWidth);
 }
 
 unsigned APInt::countPopulationSlowCase() const {
   unsigned Count = 0;
   for (unsigned i = 0; i < getNumWords(); ++i)
-    Count += CountPopulation_64(pVal[i]);
+    Count += llvm::countPopulation(pVal[i]);
   return Count;
 }
 
@@ -1956,6 +1956,18 @@ APInt APInt::srem(const APInt &RHS) const {
 
 void APInt::udivrem(const APInt &LHS, const APInt &RHS,
                     APInt &Quotient, APInt &Remainder) {
+  assert(LHS.BitWidth == RHS.BitWidth && "Bit widths must be the same");
+
+  // First, deal with the easy case
+  if (LHS.isSingleWord()) {
+    assert(RHS.VAL != 0 && "Divide by zero?");
+    uint64_t QuotVal = LHS.VAL / RHS.VAL;
+    uint64_t RemVal = LHS.VAL % RHS.VAL;
+    Quotient = APInt(LHS.BitWidth, QuotVal);
+    Remainder = APInt(LHS.BitWidth, RemVal);
+    return;
+  }
+
   // Get some size facts about the dividend and divisor
   unsigned lhsBits  = LHS.getActiveBits();
   unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index fa62591..a44c1a3 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -121,30 +121,10 @@ add_llvm_library(LLVMSupport
   Valgrind.cpp
   Watchdog.cpp
 
-  ADDITIONAL_HEADERS
-  Unix/Host.inc
-  Unix/Memory.inc
-  Unix/Mutex.inc
-  Unix/Path.inc
-  Unix/Process.inc
-  Unix/Program.inc
-  Unix/RWMutex.inc
-  Unix/Signals.inc
-  Unix/ThreadLocal.inc
-  Unix/TimeValue.inc
-  Unix/Watchdog.inc
-  Windows/DynamicLibrary.inc
-  Windows/Host.inc
-  Windows/Memory.inc
-  Windows/Mutex.inc
-  Windows/Path.inc
-  Windows/Process.inc
-  Windows/Program.inc
-  Windows/RWMutex.inc
-  Windows/Signals.inc
-  Windows/ThreadLocal.inc
-  Windows/TimeValue.inc
-  Windows/Watchdog.inc
+  ADDITIONAL_HEADER_DIRS
+  Unix
+  Windows
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Support
 
   LINK_LIBS ${system_libs}
   )
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index 985c877..b49ec36 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -44,7 +44,8 @@ using namespace cl;
 //===----------------------------------------------------------------------===//
 // Template instantiations and anchors.
 //
-namespace llvm { namespace cl {
+namespace llvm {
+namespace cl {
 TEMPLATE_INSTANTIATION(class basic_parser<bool>);
 TEMPLATE_INSTANTIATION(class basic_parser<boolOrDefault>);
 TEMPLATE_INSTANTIATION(class basic_parser<int>);
@@ -60,7 +61,8 @@ TEMPLATE_INSTANTIATION(class opt<int>);
 TEMPLATE_INSTANTIATION(class opt<std::string>);
 TEMPLATE_INSTANTIATION(class opt<char>);
 TEMPLATE_INSTANTIATION(class opt<bool>);
-} } // end namespace llvm::cl
+}
+} // end namespace llvm::cl
 
 // Pin the vtables to this file.
 void GenericOptionValue::anchor() {}
@@ -81,151 +83,193 @@ void StringSaver::anchor() {}
 
 //===----------------------------------------------------------------------===//
 
-// Globals for name and overview of program.  Program name is not a string to
-// avoid static ctor/dtor issues.
-static char ProgramName[80] = "<premain>";
-static const char *ProgramOverview = nullptr;
+namespace {
 
-// This collects additional help to be printed.
-static ManagedStatic<std::vector<const char*> > MoreHelp;
+class CommandLineParser {
+public:
+  // Globals for name and overview of program.  Program name is not a string to
+  // avoid static ctor/dtor issues.
+  std::string ProgramName;
+  const char *ProgramOverview;
 
-extrahelp::extrahelp(const char *Help)
-  : morehelp(Help) {
-  MoreHelp->push_back(Help);
-}
+  // This collects additional help to be printed.
+  std::vector<const char *> MoreHelp;
 
-static bool OptionListChanged = false;
+  SmallVector<Option *, 4> PositionalOpts;
+  SmallVector<Option *, 4> SinkOpts;
+  StringMap<Option *> OptionsMap;
 
-// MarkOptionsChanged - Internal helper function.
-void cl::MarkOptionsChanged() {
-  OptionListChanged = true;
-}
+  Option *ConsumeAfterOpt; // The ConsumeAfter option if it exists.
 
-/// RegisteredOptionList - This is the list of the command line options that
-/// have statically constructed themselves.
-static Option *RegisteredOptionList = nullptr;
+  // This collects the different option categories that have been registered.
+  SmallPtrSet<OptionCategory *, 16> RegisteredOptionCategories;
 
-void Option::addArgument() {
-  assert(!NextRegistered && "argument multiply registered!");
+  CommandLineParser() : ProgramOverview(nullptr), ConsumeAfterOpt(nullptr) {}
 
-  NextRegistered = RegisteredOptionList;
-  RegisteredOptionList = this;
-  MarkOptionsChanged();
-}
+  void ParseCommandLineOptions(int argc, const char *const *argv,
+                               const char *Overview);
 
-void Option::removeArgument() {
-  if (RegisteredOptionList == this) {
-    RegisteredOptionList = NextRegistered;
-    MarkOptionsChanged();
-    return;
+  void addLiteralOption(Option &Opt, const char *Name) {
+    if (!Opt.hasArgStr()) {
+      if (!OptionsMap.insert(std::make_pair(Name, &Opt)).second) {
+        errs() << ProgramName << ": CommandLine Error: Option '" << Name
+               << "' registered more than once!\n";
+        report_fatal_error("inconsistency in registered CommandLine options");
+      }
+    }
   }
-  Option *O = RegisteredOptionList;
-  for (; O->NextRegistered != this; O = O->NextRegistered)
-    ;
-  O->NextRegistered = NextRegistered;
-  MarkOptionsChanged();
-}
-
-// This collects the different option categories that have been registered.
-typedef SmallPtrSet<OptionCategory*,16> OptionCatSet;
-static ManagedStatic<OptionCatSet> RegisteredOptionCategories;
-
-// Initialise the general option category.
-OptionCategory llvm::cl::GeneralCategory("General options");
-
-void OptionCategory::registerCategory() {
-  assert(std::count_if(RegisteredOptionCategories->begin(),
-                       RegisteredOptionCategories->end(),
-                       [this](const OptionCategory *Category) {
-                         return getName() == Category->getName();
-                       }) == 0 && "Duplicate option categories");
 
-  RegisteredOptionCategories->insert(this);
-}
-
-//===----------------------------------------------------------------------===//
-// Basic, shared command line option processing machinery.
-//
-
-/// GetOptionInfo - Scan the list of registered options, turning them into data
-/// structures that are easier to handle.
-static void GetOptionInfo(SmallVectorImpl<Option*> &PositionalOpts,
-                          SmallVectorImpl<Option*> &SinkOpts,
-                          StringMap<Option*> &OptionsMap) {
-  bool HadErrors = false;
-  SmallVector<const char*, 16> OptionNames;
-  Option *CAOpt = nullptr;  // The ConsumeAfter option if it exists.
-  for (Option *O = RegisteredOptionList; O; O = O->getNextRegisteredOption()) {
-    // If this option wants to handle multiple option names, get the full set.
-    // This handles enum options like "-O1 -O2" etc.
-    O->getExtraOptionNames(OptionNames);
-    if (O->ArgStr[0])
-      OptionNames.push_back(O->ArgStr);
-
-    // Handle named options.
-    for (size_t i = 0, e = OptionNames.size(); i != e; ++i) {
+  void addOption(Option *O) {
+    bool HadErrors = false;
+    if (O->ArgStr[0]) {
       // Add argument to the argument map!
-      if (!OptionsMap.insert(std::make_pair(OptionNames[i], O)).second) {
-        errs() << ProgramName << ": CommandLine Error: Option '"
-               << OptionNames[i] << "' registered more than once!\n";
+      if (!OptionsMap.insert(std::make_pair(O->ArgStr, O)).second) {
+        errs() << ProgramName << ": CommandLine Error: Option '" << O->ArgStr
+               << "' registered more than once!\n";
         HadErrors = true;
       }
     }
 
-    OptionNames.clear();
-
     // Remember information about positional options.
     if (O->getFormattingFlag() == cl::Positional)
       PositionalOpts.push_back(O);
     else if (O->getMiscFlags() & cl::Sink) // Remember sink options
       SinkOpts.push_back(O);
     else if (O->getNumOccurrencesFlag() == cl::ConsumeAfter) {
-      if (CAOpt) {
+      if (ConsumeAfterOpt) {
         O->error("Cannot specify more than one option with cl::ConsumeAfter!");
         HadErrors = true;
       }
-      CAOpt = O;
+      ConsumeAfterOpt = O;
+    }
+
+    // Fail hard if there were errors. These are strictly unrecoverable and
+    // indicate serious issues such as conflicting option names or an
+    // incorrectly
+    // linked LLVM distribution.
+    if (HadErrors)
+      report_fatal_error("inconsistency in registered CommandLine options");
+  }
+
+  void removeOption(Option *O) {
+    SmallVector<const char *, 16> OptionNames;
+    O->getExtraOptionNames(OptionNames);
+    if (O->ArgStr[0])
+      OptionNames.push_back(O->ArgStr);
+    for (auto Name : OptionNames)
+      OptionsMap.erase(StringRef(Name));
+
+    if (O->getFormattingFlag() == cl::Positional)
+      for (auto Opt = PositionalOpts.begin(); Opt != PositionalOpts.end();
+           ++Opt) {
+        if (*Opt == O) {
+          PositionalOpts.erase(Opt);
+          break;
+        }
+      }
+    else if (O->getMiscFlags() & cl::Sink)
+      for (auto Opt = SinkOpts.begin(); Opt != SinkOpts.end(); ++Opt) {
+        if (*Opt == O) {
+          SinkOpts.erase(Opt);
+          break;
+        }
+      }
+    else if (O == ConsumeAfterOpt)
+      ConsumeAfterOpt = nullptr;
+  }
+
+  bool hasOptions() {
+    return (!OptionsMap.empty() || !PositionalOpts.empty() ||
+            nullptr != ConsumeAfterOpt);
+  }
+
+  void updateArgStr(Option *O, const char *NewName) {
+    if (!OptionsMap.insert(std::make_pair(NewName, O)).second) {
+      errs() << ProgramName << ": CommandLine Error: Option '" << O->ArgStr
+             << "' registered more than once!\n";
+      report_fatal_error("inconsistency in registered CommandLine options");
     }
+    OptionsMap.erase(StringRef(O->ArgStr));
   }
 
-  if (CAOpt)
-    PositionalOpts.push_back(CAOpt);
+  void printOptionValues();
 
-  // Make sure that they are in order of registration not backwards.
-  std::reverse(PositionalOpts.begin(), PositionalOpts.end());
+  void registerCategory(OptionCategory *cat) {
+    assert(std::count_if(RegisteredOptionCategories.begin(),
+                         RegisteredOptionCategories.end(),
+                         [cat](const OptionCategory *Category) {
+                           return cat->getName() == Category->getName();
+                         }) == 0 &&
+           "Duplicate option categories");
+
+    RegisteredOptionCategories.insert(cat);
+  }
 
-  // Fail hard if there were errors. These are strictly unrecoverable and
-  // indicate serious issues such as conflicting option names or an incorrectly
-  // linked LLVM distribution.
-  if (HadErrors)
-    report_fatal_error("inconsistency in registered CommandLine options");
+private:
+  Option *LookupOption(StringRef &Arg, StringRef &Value);
+};
+
+} // namespace
+
+static ManagedStatic<CommandLineParser> GlobalParser;
+
+void cl::AddLiteralOption(Option &O, const char *Name) {
+  GlobalParser->addLiteralOption(O, Name);
+}
+
+extrahelp::extrahelp(const char *Help) : morehelp(Help) {
+  GlobalParser->MoreHelp.push_back(Help);
+}
+
+void Option::addArgument() {
+  GlobalParser->addOption(this);
+  FullyInitialized = true;
+}
+
+void Option::removeArgument() { GlobalParser->removeOption(this); }
+
+void Option::setArgStr(const char *S) {
+  if (FullyInitialized)
+    GlobalParser->updateArgStr(this, S);
+  ArgStr = S;
 }
 
+// Initialise the general option category.
+OptionCategory llvm::cl::GeneralCategory("General options");
+
+void OptionCategory::registerCategory() {
+  GlobalParser->registerCategory(this);
+}
+
+//===----------------------------------------------------------------------===//
+// Basic, shared command line option processing machinery.
+//
 
 /// LookupOption - Lookup the option specified by the specified option on the
 /// command line.  If there is a value specified (after an equal sign) return
 /// that as well.  This assumes that leading dashes have already been stripped.
-static Option *LookupOption(StringRef &Arg, StringRef &Value,
-                            const StringMap<Option*> &OptionsMap) {
+Option *CommandLineParser::LookupOption(StringRef &Arg, StringRef &Value) {
   // Reject all dashes.
-  if (Arg.empty()) return nullptr;
+  if (Arg.empty())
+    return nullptr;
 
   size_t EqualPos = Arg.find('=');
 
   // If we have an equals sign, remember the value.
   if (EqualPos == StringRef::npos) {
     // Look up the option.
-    StringMap<Option*>::const_iterator I = OptionsMap.find(Arg);
+    StringMap<Option *>::const_iterator I = OptionsMap.find(Arg);
     return I != OptionsMap.end() ? I->second : nullptr;
   }
 
   // If the argument before the = is a valid option name, we match.  If not,
   // return Arg unmolested.
-  StringMap<Option*>::const_iterator I =
-    OptionsMap.find(Arg.substr(0, EqualPos));
-  if (I == OptionsMap.end()) return nullptr;
+  StringMap<Option *>::const_iterator I =
+      OptionsMap.find(Arg.substr(0, EqualPos));
+  if (I == OptionsMap.end())
+    return nullptr;
 
-  Value = Arg.substr(EqualPos+1);
+  Value = Arg.substr(EqualPos + 1);
   Arg = Arg.substr(0, EqualPos);
   return I->second;
 }
@@ -235,23 +279,25 @@ static Option *LookupOption(StringRef &Arg, StringRef &Value,
 /// (after an equal sign) return that as well.  This assumes that leading dashes
 /// have already been stripped.
 static Option *LookupNearestOption(StringRef Arg,
-                                   const StringMap<Option*> &OptionsMap,
+                                   const StringMap<Option *> &OptionsMap,
                                    std::string &NearestString) {
   // Reject all dashes.
-  if (Arg.empty()) return nullptr;
+  if (Arg.empty())
+    return nullptr;
 
   // Split on any equal sign.
   std::pair<StringRef, StringRef> SplitArg = Arg.split('=');
-  StringRef &LHS = SplitArg.first;  // LHS == Arg when no '=' is present.
+  StringRef &LHS = SplitArg.first; // LHS == Arg when no '=' is present.
   StringRef &RHS = SplitArg.second;
 
   // Find the closest match.
   Option *Best = nullptr;
   unsigned BestDistance = 0;
-  for (StringMap<Option*>::const_iterator it = OptionsMap.begin(),
-         ie = OptionsMap.end(); it != ie; ++it) {
+  for (StringMap<Option *>::const_iterator it = OptionsMap.begin(),
+                                           ie = OptionsMap.end();
+       it != ie; ++it) {
     Option *O = it->second;
-    SmallVector<const char*, 16> OptionNames;
+    SmallVector<const char *, 16> OptionNames;
     O->getExtraOptionNames(OptionNames);
     if (O->ArgStr[0])
       OptionNames.push_back(O->ArgStr);
@@ -261,7 +307,7 @@ static Option *LookupNearestOption(StringRef Arg,
     for (size_t i = 0, e = OptionNames.size(); i != e; ++i) {
       StringRef Name = OptionNames[i];
       unsigned Distance = StringRef(Name).edit_distance(
-        Flag, /*AllowReplacements=*/true, /*MaxEditDistance=*/BestDistance);
+          Flag, /*AllowReplacements=*/true, /*MaxEditDistance=*/BestDistance);
       if (!Best || Distance < BestDistance) {
         Best = O;
         BestDistance = Distance;
@@ -292,8 +338,8 @@ static bool CommaSeparateAndAddOccurrence(Option *Handler, unsigned pos,
       if (Handler->addOccurrence(pos, ArgName, Val.substr(0, Pos), MultiArg))
         return true;
       // Erase the portion before the comma, AND the comma.
-      Val = Val.substr(Pos+1);
-      Value.substr(Pos+1);  // Increment the original value pointer as well.
+      Val = Val.substr(Pos + 1);
+      Value.substr(Pos + 1); // Increment the original value pointer as well.
       // Check for another comma.
       Pos = Val.find(',');
     }
@@ -320,9 +366,10 @@ static inline bool ProvideOption(Option *Handler, StringRef ArgName,
   switch (Handler->getValueExpectedFlag()) {
   case ValueRequired:
     if (!Value.data()) { // No value specified?
-      if (i+1 >= argc)
+      if (i + 1 >= argc)
         return Handler->error("requires a value!");
       // Steal the next argument, like for '-o filename'
+      assert(argv && "null check");
       Value = argv[++i];
     }
     break;
@@ -332,8 +379,8 @@ static inline bool ProvideOption(Option *Handler, StringRef ArgName,
                             " with ValueDisallowed modifier!");
 
     if (Value.data())
-      return Handler->error("does not allow a value! '" +
-                            Twine(Value) + "' specified.");
+      return Handler->error("does not allow a value! '" + Twine(Value) +
+                            "' specified.");
     break;
   case ValueOptional:
     break;
@@ -354,8 +401,9 @@ static inline bool ProvideOption(Option *Handler, StringRef ArgName,
   }
 
   while (NumAdditionalVals > 0) {
-    if (i+1 >= argc)
+    if (i + 1 >= argc)
       return Handler->error("not enough values!");
+    assert(argv && "null check");
     Value = argv[++i];
 
     if (CommaSeparateAndAddOccurrence(Handler, i, ArgName, Value, MultiArg))
@@ -371,7 +419,6 @@ static bool ProvidePositionalOption(Option *Handler, StringRef Arg, int i) {
   return ProvideOption(Handler, Handler->ArgStr, Arg, 0, nullptr, Dummy);
 }
 
-
 // Option predicates...
 static inline bool isGrouping(const Option *O) {
   return O->getFormattingFlag() == cl::Grouping;
@@ -387,39 +434,42 @@ static inline bool isPrefixedOrGrouping(const Option *O) {
 // otherwise return null.
 //
 static Option *getOptionPred(StringRef Name, size_t &Length,
-                             bool (*Pred)(const Option*),
-                             const StringMap<Option*> &OptionsMap) {
+                             bool (*Pred)(const Option *),
+                             const StringMap<Option *> &OptionsMap) {
 
-  StringMap<Option*>::const_iterator OMI = OptionsMap.find(Name);
+  StringMap<Option *>::const_iterator OMI = OptionsMap.find(Name);
 
   // Loop while we haven't found an option and Name still has at least two
   // characters in it (so that the next iteration will not be the empty
   // string.
   while (OMI == OptionsMap.end() && Name.size() > 1) {
-    Name = Name.substr(0, Name.size()-1);   // Chop off the last character.
+    Name = Name.substr(0, Name.size() - 1); // Chop off the last character.
     OMI = OptionsMap.find(Name);
   }
 
   if (OMI != OptionsMap.end() && Pred(OMI->second)) {
     Length = Name.size();
-    return OMI->second;    // Found one!
+    return OMI->second; // Found one!
   }
-  return nullptr;          // No option found!
+  return nullptr; // No option found!
 }
 
 /// HandlePrefixedOrGroupedOption - The specified argument string (which started
 /// with at least one '-') does not fully match an available option.  Check to
 /// see if this is a prefix or grouped option.  If so, split arg into output an
 /// Arg/Value pair and return the Option to parse it with.
-static Option *HandlePrefixedOrGroupedOption(StringRef &Arg, StringRef &Value,
-                                             bool &ErrorParsing,
-                                         const StringMap<Option*> &OptionsMap) {
-  if (Arg.size() == 1) return nullptr;
+static Option *
+HandlePrefixedOrGroupedOption(StringRef &Arg, StringRef &Value,
+                              bool &ErrorParsing,
+                              const StringMap<Option *> &OptionsMap) {
+  if (Arg.size() == 1)
+    return nullptr;
 
   // Do the lookup!
   size_t Length = 0;
   Option *PGOpt = getOptionPred(Arg, Length, isPrefixedOrGrouping, OptionsMap);
-  if (!PGOpt) return nullptr;
+  if (!PGOpt)
+    return nullptr;
 
   // If the option is a prefixed option, then the value is simply the
   // rest of the name...  so fall through to later processing, by
@@ -445,8 +495,8 @@ static Option *HandlePrefixedOrGroupedOption(StringRef &Arg, StringRef &Value,
     assert(PGOpt->getValueExpectedFlag() != cl::ValueRequired &&
            "Option can not be cl::Grouping AND cl::ValueRequired!");
     int Dummy = 0;
-    ErrorParsing |= ProvideOption(PGOpt, OneArgName,
-                                  StringRef(), 0, nullptr, Dummy);
+    ErrorParsing |=
+        ProvideOption(PGOpt, OneArgName, StringRef(), 0, nullptr, Dummy);
 
     // Get the next grouping option.
     PGOpt = getOptionPred(Arg, Length, isGrouping, OptionsMap);
@@ -456,8 +506,6 @@ static Option *HandlePrefixedOrGroupedOption(StringRef &Arg, StringRef &Value,
   return PGOpt;
 }
 
-
-
 static bool RequiresValue(const Option *O) {
   return O->getNumOccurrencesFlag() == cl::Required ||
          O->getNumOccurrencesFlag() == cl::OneOrMore;
@@ -468,17 +516,11 @@ static bool EatsUnboundedNumberOfValues(const Option *O) {
          O->getNumOccurrencesFlag() == cl::OneOrMore;
 }
 
-static bool isWhitespace(char C) {
-  return strchr(" \t\n\r\f\v", C);
-}
+static bool isWhitespace(char C) { return strchr(" \t\n\r\f\v", C); }
 
-static bool isQuote(char C) {
-  return C == '\"' || C == '\'';
-}
+static bool isQuote(char C) { return C == '\"' || C == '\''; }
 
-static bool isGNUSpecial(char C) {
-  return strchr("\\\"\' ", C);
-}
+static bool isGNUSpecial(char C) { return strchr("\\\"\' ", C); }
 
 void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver,
                                 SmallVectorImpl<const char *> &NewArgv,
@@ -493,13 +535,14 @@ void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver,
           NewArgv.push_back(nullptr);
         ++I;
       }
-      if (I == E) break;
+      if (I == E)
+        break;
     }
 
     // Backslashes can escape backslashes, spaces, and other quotes.  Otherwise
     // they are literal.  This makes it much easier to read Windows file paths.
     if (I + 1 < E && Src[I] == '\\' && isGNUSpecial(Src[I + 1])) {
-      ++I;  // Skip the escape.
+      ++I; // Skip the escape.
       Token.push_back(Src[I]);
       continue;
     }
@@ -514,7 +557,8 @@ void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver,
         Token.push_back(Src[I]);
         ++I;
       }
-      if (I == E) break;
+      if (I == E)
+        break;
       continue;
     }
 
@@ -654,6 +698,12 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
     NewArgv.push_back(nullptr);
 }
 
+// It is called byte order marker but the UTF-8 BOM is actually not affected
+// by the host system's endianness.
+static bool hasUTF8ByteOrderMark(ArrayRef<char> S) {
+  return (S.size() >= 3 && S[0] == '\xef' && S[1] == '\xbb' && S[2] == '\xbf');
+}
+
 static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
                                TokenizerCallback Tokenizer,
                                SmallVectorImpl<const char *> &NewArgv,
@@ -673,6 +723,11 @@ static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
       return false;
     Str = StringRef(UTF8Buf);
   }
+  // If we see UTF-8 BOM sequence at the beginning of a file, we shall remove
+  // these bytes before parsing.
+  // Reference: http://en.wikipedia.org/wiki/UTF-8#Byte_order_mark
+  else if (hasUTF8ByteOrderMark(BufRef))
+    Str = StringRef(BufRef.data() + 3, BufRef.size() - 3);
 
   // Tokenize the contents into NewArgv.
   Tokenizer(Str, Saver, NewArgv, MarkEOLs);
@@ -689,7 +744,7 @@ bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
   bool AllExpanded = true;
 
   // Don't cache Argv.size() because it can change.
-  for (unsigned I = 0; I != Argv.size(); ) {
+  for (unsigned I = 0; I != Argv.size();) {
     const char *Arg = Argv[I];
     // Check if it is an EOL marker
     if (Arg == nullptr) {
@@ -726,22 +781,23 @@ bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
 }
 
 namespace {
-  class StrDupSaver : public StringSaver {
-    std::vector<char*> Dups;
-  public:
-    ~StrDupSaver() {
-      for (std::vector<char *>::iterator I = Dups.begin(), E = Dups.end();
-           I != E; ++I) {
-        char *Dup = *I;
-        free(Dup);
-      }
-    }
-    const char *SaveString(const char *Str) override {
-      char *Dup = strdup(Str);
-      Dups.push_back(Dup);
-      return Dup;
+class StrDupSaver : public StringSaver {
+  std::vector<char *> Dups;
+
+public:
+  ~StrDupSaver() {
+    for (std::vector<char *>::iterator I = Dups.begin(), E = Dups.end(); I != E;
+         ++I) {
+      char *Dup = *I;
+      free(Dup);
     }
-  };
+  }
+  const char *SaveString(const char *Str) override {
+    char *Dup = strdup(Str);
+    Dups.push_back(Dup);
+    return Dup;
+  }
+};
 }
 
 /// ParseEnvironmentOptions - An alternative entry point to the
@@ -773,31 +829,25 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
   ParseCommandLineOptions(newArgc, &newArgv[0], Overview);
 }
 
-void cl::ParseCommandLineOptions(int argc, const char * const *argv,
+void cl::ParseCommandLineOptions(int argc, const char *const *argv,
                                  const char *Overview) {
-  // Process all registered options.
-  SmallVector<Option*, 4> PositionalOpts;
-  SmallVector<Option*, 4> SinkOpts;
-  StringMap<Option*> Opts;
-  GetOptionInfo(PositionalOpts, SinkOpts, Opts);
+  GlobalParser->ParseCommandLineOptions(argc, argv, Overview);
+}
 
-  assert((!Opts.empty() || !PositionalOpts.empty()) &&
-         "No options specified!");
+void CommandLineParser::ParseCommandLineOptions(int argc,
+                                                const char *const *argv,
+                                                const char *Overview) {
+  assert(hasOptions() && "No options specified!");
 
   // Expand response files.
-  SmallVector<const char *, 20> newArgv;
-  for (int i = 0; i != argc; ++i)
-    newArgv.push_back(argv[i]);
+  SmallVector<const char *, 20> newArgv(argv, argv + argc);
   StrDupSaver Saver;
   ExpandResponseFiles(Saver, TokenizeGNUCommandLine, newArgv);
   argv = &newArgv[0];
   argc = static_cast<int>(newArgv.size());
 
   // Copy the program name into ProgName, making sure not to overflow it.
-  StringRef ProgName = sys::path::filename(argv[0]);
-  size_t Len = std::min(ProgName.size(), size_t(79));
-  memcpy(ProgramName, ProgName.data(), Len);
-  ProgramName[Len] = '\0';
+  ProgramName = sys::path::filename(argv[0]);
 
   ProgramOverview = Overview;
   bool ErrorParsing = false;
@@ -808,29 +858,26 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
   // Determine whether or not there are an unlimited number of positionals
   bool HasUnlimitedPositionals = false;
 
-  Option *ConsumeAfterOpt = nullptr;
+  if (ConsumeAfterOpt) {
+    assert(PositionalOpts.size() > 0 &&
+           "Cannot specify cl::ConsumeAfter without a positional argument!");
+  }
   if (!PositionalOpts.empty()) {
-    if (PositionalOpts[0]->getNumOccurrencesFlag() == cl::ConsumeAfter) {
-      assert(PositionalOpts.size() > 1 &&
-             "Cannot specify cl::ConsumeAfter without a positional argument!");
-      ConsumeAfterOpt = PositionalOpts[0];
-    }
 
     // Calculate how many positional values are _required_.
     bool UnboundedFound = false;
-    for (size_t i = ConsumeAfterOpt ? 1 : 0, e = PositionalOpts.size();
-         i != e; ++i) {
+    for (size_t i = 0, e = PositionalOpts.size(); i != e; ++i) {
       Option *Opt = PositionalOpts[i];
       if (RequiresValue(Opt))
         ++NumPositionalRequired;
       else if (ConsumeAfterOpt) {
         // ConsumeAfter cannot be combined with "optional" positional options
         // unless there is only one positional argument...
-        if (PositionalOpts.size() > 2)
-          ErrorParsing |=
-            Opt->error("error - this positional option will never be matched, "
-                       "because it does not Require a value, and a "
-                       "cl::ConsumeAfter option is active!");
+        if (PositionalOpts.size() > 1)
+          ErrorParsing |= Opt->error(
+              "error - this positional option will never be matched, "
+              "because it does not Require a value, and a "
+              "cl::ConsumeAfter option is active!");
       } else if (UnboundedFound && !Opt->ArgStr[0]) {
         // This option does not "require" a value...  Make sure this option is
         // not specified after an option that eats all extra arguments, or this
@@ -840,6 +887,9 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
                                    "another positional argument will match an "
                                    "unbounded number of values, and this option"
                                    " does not require a value!");
+        errs() << ProgramName << ": CommandLine Error: Option '" << Opt->ArgStr
+               << "' is all messed up!\n";
+        errs() << PositionalOpts.size();
       }
       UnboundedFound |= EatsUnboundedNumberOfValues(Opt);
     }
@@ -849,7 +899,7 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
   // PositionalVals - A vector of "positional" arguments we accumulate into
   // the process at the end.
   //
-  SmallVector<std::pair<StringRef,unsigned>, 4> PositionalVals;
+  SmallVector<std::pair<StringRef, unsigned>, 4> PositionalVals;
 
   // If the program has named positional arguments, and the name has been run
   // across, keep track of which positional argument was named.  Otherwise put
@@ -857,7 +907,7 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
   Option *ActivePositionalArg = nullptr;
 
   // Loop over all of the arguments... processing them.
-  bool DashDashFound = false;  // Have we read '--'?
+  bool DashDashFound = false; // Have we read '--'?
   for (int i = 1; i < argc; ++i) {
     Option *Handler = nullptr;
     Option *NearestHandler = nullptr;
@@ -865,17 +915,6 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
     StringRef Value;
     StringRef ArgName = "";
 
-    // If the option list changed, this means that some command line
-    // option has just been registered or deregistered.  This can occur in
-    // response to things like -load, etc.  If this happens, rescan the options.
-    if (OptionListChanged) {
-      PositionalOpts.clear();
-      SinkOpts.clear();
-      Opts.clear();
-      GetOptionInfo(PositionalOpts, SinkOpts, Opts);
-      OptionListChanged = false;
-    }
-
     // Check to see if this is a positional argument.  This argument is
     // considered to be positional if it doesn't start with '-', if it is "-"
     // itself, or if we have seen "--" already.
@@ -884,19 +923,19 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
       // Positional argument!
       if (ActivePositionalArg) {
         ProvidePositionalOption(ActivePositionalArg, argv[i], i);
-        continue;  // We are done!
+        continue; // We are done!
       }
 
       if (!PositionalOpts.empty()) {
-        PositionalVals.push_back(std::make_pair(argv[i],i));
+        PositionalVals.push_back(std::make_pair(argv[i], i));
 
         // All of the positional arguments have been fulfulled, give the rest to
         // the consume after option... if it's specified...
         //
         if (PositionalVals.size() >= NumPositionalRequired && ConsumeAfterOpt) {
           for (++i; i < argc; ++i)
-            PositionalVals.push_back(std::make_pair(argv[i],i));
-          break;   // Handle outside of the argument processing loop...
+            PositionalVals.push_back(std::make_pair(argv[i], i));
+          break; // Handle outside of the argument processing loop...
         }
 
         // Delay processing positional arguments until the end...
@@ -904,59 +943,60 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
       }
     } else if (argv[i][0] == '-' && argv[i][1] == '-' && argv[i][2] == 0 &&
                !DashDashFound) {
-      DashDashFound = true;  // This is the mythical "--"?
-      continue;              // Don't try to process it as an argument itself.
+      DashDashFound = true; // This is the mythical "--"?
+      continue;             // Don't try to process it as an argument itself.
     } else if (ActivePositionalArg &&
                (ActivePositionalArg->getMiscFlags() & PositionalEatsArgs)) {
       // If there is a positional argument eating options, check to see if this
       // option is another positional argument.  If so, treat it as an argument,
       // otherwise feed it to the eating positional.
-      ArgName = argv[i]+1;
+      ArgName = argv[i] + 1;
       // Eat leading dashes.
       while (!ArgName.empty() && ArgName[0] == '-')
         ArgName = ArgName.substr(1);
 
-      Handler = LookupOption(ArgName, Value, Opts);
+      Handler = LookupOption(ArgName, Value);
       if (!Handler || Handler->getFormattingFlag() != cl::Positional) {
         ProvidePositionalOption(ActivePositionalArg, argv[i], i);
-        continue;  // We are done!
+        continue; // We are done!
       }
 
-    } else {     // We start with a '-', must be an argument.
-      ArgName = argv[i]+1;
+    } else { // We start with a '-', must be an argument.
+      ArgName = argv[i] + 1;
       // Eat leading dashes.
       while (!ArgName.empty() && ArgName[0] == '-')
         ArgName = ArgName.substr(1);
 
-      Handler = LookupOption(ArgName, Value, Opts);
+      Handler = LookupOption(ArgName, Value);
 
       // Check to see if this "option" is really a prefixed or grouped argument.
       if (!Handler)
-        Handler = HandlePrefixedOrGroupedOption(ArgName, Value,
-                                                ErrorParsing, Opts);
+        Handler = HandlePrefixedOrGroupedOption(ArgName, Value, ErrorParsing,
+                                                OptionsMap);
 
       // Otherwise, look for the closest available option to report to the user
       // in the upcoming error.
       if (!Handler && SinkOpts.empty())
-        NearestHandler = LookupNearestOption(ArgName, Opts,
-                                             NearestHandlerString);
+        NearestHandler =
+            LookupNearestOption(ArgName, OptionsMap, NearestHandlerString);
     }
 
     if (!Handler) {
       if (SinkOpts.empty()) {
-        errs() << ProgramName << ": Unknown command line argument '"
-             << argv[i] << "'.  Try: '" << argv[0] << " -help'\n";
+        errs() << ProgramName << ": Unknown command line argument '" << argv[i]
+               << "'.  Try: '" << argv[0] << " -help'\n";
 
         if (NearestHandler) {
           // If we know a near match, report it as well.
-          errs() << ProgramName << ": Did you mean '-"
-                 << NearestHandlerString << "'?\n";
+          errs() << ProgramName << ": Did you mean '-" << NearestHandlerString
+                 << "'?\n";
         }
 
         ErrorParsing = true;
       } else {
-        for (SmallVectorImpl<Option*>::iterator I = SinkOpts.begin(),
-               E = SinkOpts.end(); I != E ; ++I)
+        for (SmallVectorImpl<Option *>::iterator I = SinkOpts.begin(),
+                                                 E = SinkOpts.end();
+             I != E; ++I)
           (*I)->addOccurrence(i, "", argv[i]);
       }
       continue;
@@ -973,17 +1013,16 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
   // Check and handle positional arguments now...
   if (NumPositionalRequired > PositionalVals.size()) {
     errs() << ProgramName
-         << ": Not enough positional command line arguments specified!\n"
-         << "Must specify at least " << NumPositionalRequired
-         << " positional arguments: See: " << argv[0] << " -help\n";
+           << ": Not enough positional command line arguments specified!\n"
+           << "Must specify at least " << NumPositionalRequired
+           << " positional arguments: See: " << argv[0] << " -help\n";
 
     ErrorParsing = true;
   } else if (!HasUnlimitedPositionals &&
              PositionalVals.size() > PositionalOpts.size()) {
-    errs() << ProgramName
-         << ": Too many positional arguments specified!\n"
-         << "Can specify at most " << PositionalOpts.size()
-         << " positional arguments: See: " << argv[0] << " -help\n";
+    errs() << ProgramName << ": Too many positional arguments specified!\n"
+           << "Can specify at most " << PositionalOpts.size()
+           << " positional arguments: See: " << argv[0] << " -help\n";
     ErrorParsing = true;
 
   } else if (!ConsumeAfterOpt) {
@@ -994,7 +1033,7 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
         ProvidePositionalOption(PositionalOpts[i], PositionalVals[ValNo].first,
                                 PositionalVals[ValNo].second);
         ValNo++;
-        --NumPositionalRequired;  // We fulfilled our duty...
+        --NumPositionalRequired; // We fulfilled our duty...
       }
 
       // If we _can_ give this option more arguments, do so now, as long as we
@@ -1002,13 +1041,13 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
       // option even _WANTS_ any more.
       //
       bool Done = PositionalOpts[i]->getNumOccurrencesFlag() == cl::Required;
-      while (NumVals-ValNo > NumPositionalRequired && !Done) {
+      while (NumVals - ValNo > NumPositionalRequired && !Done) {
         switch (PositionalOpts[i]->getNumOccurrencesFlag()) {
         case cl::Optional:
-          Done = true;          // Optional arguments want _at most_ one value
-          // FALL THROUGH
-        case cl::ZeroOrMore:    // Zero or more will take all they can get...
-        case cl::OneOrMore:     // One or more will take all they can get...
+          Done = true; // Optional arguments want _at most_ one value
+        // FALL THROUGH
+        case cl::ZeroOrMore: // Zero or more will take all they can get...
+        case cl::OneOrMore:  // One or more will take all they can get...
           ProvidePositionalOption(PositionalOpts[i],
                                   PositionalVals[ValNo].first,
                                   PositionalVals[ValNo].second);
@@ -1016,7 +1055,7 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
           break;
         default:
           llvm_unreachable("Internal error, unexpected NumOccurrences flag in "
-                 "positional argument processing!");
+                           "positional argument processing!");
         }
       }
     }
@@ -1036,8 +1075,8 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
     // positional option and keep the rest for the consume after.  The above
     // loop would have assigned no values to positional options in this case.
     //
-    if (PositionalOpts.size() == 2 && ValNo == 0 && !PositionalVals.empty()) {
-      ErrorParsing |= ProvidePositionalOption(PositionalOpts[1],
+    if (PositionalOpts.size() == 1 && ValNo == 0 && !PositionalVals.empty()) {
+      ErrorParsing |= ProvidePositionalOption(PositionalOpts[0],
                                               PositionalVals[ValNo].first,
                                               PositionalVals[ValNo].second);
       ValNo++;
@@ -1046,13 +1085,13 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
     // Handle over all of the rest of the arguments to the
     // cl::ConsumeAfter command line option...
     for (; ValNo != PositionalVals.size(); ++ValNo)
-      ErrorParsing |= ProvidePositionalOption(ConsumeAfterOpt,
-                                              PositionalVals[ValNo].first,
-                                              PositionalVals[ValNo].second);
+      ErrorParsing |=
+          ProvidePositionalOption(ConsumeAfterOpt, PositionalVals[ValNo].first,
+                                  PositionalVals[ValNo].second);
   }
 
   // Loop over args and make sure all required args are specified!
-  for (const auto &Opt : Opts) {
+  for (const auto &Opt : OptionsMap) {
     switch (Opt.second->getNumOccurrencesFlag()) {
     case Required:
     case OneOrMore:
@@ -1060,7 +1099,7 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
         Opt.second->error("must be specified at least once!");
         ErrorParsing = true;
       }
-      // Fall through
+    // Fall through
     default:
       break;
     }
@@ -1070,19 +1109,16 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
   // Note that if ReadResponseFiles == true, this must be done before the
   // memory allocated for the expanded command line is free()d below.
   DEBUG(dbgs() << "Args: ";
-        for (int i = 0; i < argc; ++i)
-          dbgs() << argv[i] << ' ';
-        dbgs() << '\n';
-       );
+        for (int i = 0; i < argc; ++i) dbgs() << argv[i] << ' ';
+        dbgs() << '\n';);
 
   // Free all of the memory allocated to the map.  Command line options may only
   // be processed once!
-  Opts.clear();
-  PositionalOpts.clear();
-  MoreHelp->clear();
+  MoreHelp.clear();
 
   // If we had an error processing our arguments, don't let the program execute
-  if (ErrorParsing) exit(1);
+  if (ErrorParsing)
+    exit(1);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1090,20 +1126,21 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
 //
 
 bool Option::error(const Twine &Message, StringRef ArgName) {
-  if (!ArgName.data()) ArgName = ArgStr;
+  if (!ArgName.data())
+    ArgName = ArgStr;
   if (ArgName.empty())
-    errs() << HelpStr;  // Be nice for positional arguments
+    errs() << HelpStr; // Be nice for positional arguments
   else
-    errs() << ProgramName << ": for the -" << ArgName;
+    errs() << GlobalParser->ProgramName << ": for the -" << ArgName;
 
   errs() << " option: " << Message << "\n";
   return true;
 }
 
-bool Option::addOccurrence(unsigned pos, StringRef ArgName,
-                           StringRef Value, bool MultiArg) {
+bool Option::addOccurrence(unsigned pos, StringRef ArgName, StringRef Value,
+                           bool MultiArg) {
   if (!MultiArg)
-    NumOccurrences++;   // Increment the number of times we have been seen
+    NumOccurrences++; // Increment the number of times we have been seen
 
   switch (getNumOccurrencesFlag()) {
   case Optional:
@@ -1113,21 +1150,22 @@ bool Option::addOccurrence(unsigned pos, StringRef ArgName,
   case Required:
     if (NumOccurrences > 1)
       return error("must occur exactly one time!", ArgName);
-    // Fall through
+  // Fall through
   case OneOrMore:
   case ZeroOrMore:
-  case ConsumeAfter: break;
+  case ConsumeAfter:
+    break;
   }
 
   return handleOccurrence(pos, ArgName, Value);
 }
 
-
 // getValueStr - Get the value description string, using "DefaultMsg" if nothing
 // has been specified yet.
 //
 static const char *getValueStr(const Option &O, const char *DefaultMsg) {
-  if (O.ValueStr[0] == 0) return DefaultMsg;
+  if (O.ValueStr[0] == 0)
+    return DefaultMsg;
   return O.ValueStr;
 }
 
@@ -1136,9 +1174,7 @@ static const char *getValueStr(const Option &O, const char *DefaultMsg) {
 //
 
 // Return the width of the option tag for printing...
-size_t alias::getOptionWidth() const {
-  return std::strlen(ArgStr)+6;
-}
+size_t alias::getOptionWidth() const { return std::strlen(ArgStr) + 6; }
 
 static void printHelpStr(StringRef HelpStr, size_t Indent,
                          size_t FirstLineIndentedBy) {
@@ -1167,7 +1203,7 @@ void alias::printOptionInfo(size_t GlobalWidth) const {
 size_t basic_parser_impl::getOptionWidth(const Option &O) const {
   size_t Len = std::strlen(O.ArgStr);
   if (const char *ValName = getValueName())
-    Len += std::strlen(getValueStr(O, ValName))+3;
+    Len += std::strlen(getValueStr(O, ValName)) + 3;
 
   return Len + 6;
 }
@@ -1188,14 +1224,13 @@ void basic_parser_impl::printOptionInfo(const Option &O,
 void basic_parser_impl::printOptionName(const Option &O,
                                         size_t GlobalWidth) const {
   outs() << "  -" << O.ArgStr;
-  outs().indent(GlobalWidth-std::strlen(O.ArgStr));
+  outs().indent(GlobalWidth - std::strlen(O.ArgStr));
 }
 
-
 // parser<bool> implementation
 //
-bool parser<bool>::parse(Option &O, StringRef ArgName,
-                         StringRef Arg, bool &Value) {
+bool parser<bool>::parse(Option &O, StringRef ArgName, StringRef Arg,
+                         bool &Value) {
   if (Arg == "" || Arg == "true" || Arg == "TRUE" || Arg == "True" ||
       Arg == "1") {
     Value = true;
@@ -1212,8 +1247,8 @@ bool parser<bool>::parse(Option &O, StringRef ArgName,
 
 // parser<boolOrDefault> implementation
 //
-bool parser<boolOrDefault>::parse(Option &O, StringRef ArgName,
-                                  StringRef Arg, boolOrDefault &Value) {
+bool parser<boolOrDefault>::parse(Option &O, StringRef ArgName, StringRef Arg,
+                                  boolOrDefault &Value) {
   if (Arg == "" || Arg == "true" || Arg == "TRUE" || Arg == "True" ||
       Arg == "1") {
     Value = BOU_TRUE;
@@ -1230,8 +1265,8 @@ bool parser<boolOrDefault>::parse(Option &O, StringRef ArgName,
 
 // parser<int> implementation
 //
-bool parser<int>::parse(Option &O, StringRef ArgName,
-                        StringRef Arg, int &Value) {
+bool parser<int>::parse(Option &O, StringRef ArgName, StringRef Arg,
+                        int &Value) {
   if (Arg.getAsInteger(0, Value))
     return O.error("'" + Arg + "' value invalid for integer argument!");
   return false;
@@ -1239,8 +1274,8 @@ bool parser<int>::parse(Option &O, StringRef ArgName,
 
 // parser<unsigned> implementation
 //
-bool parser<unsigned>::parse(Option &O, StringRef ArgName,
-                             StringRef Arg, unsigned &Value) {
+bool parser<unsigned>::parse(Option &O, StringRef ArgName, StringRef Arg,
+                             unsigned &Value) {
 
   if (Arg.getAsInteger(0, Value))
     return O.error("'" + Arg + "' value invalid for uint argument!");
@@ -1250,7 +1285,8 @@ bool parser<unsigned>::parse(Option &O, StringRef ArgName,
 // parser<unsigned long long> implementation
 //
 bool parser<unsigned long long>::parse(Option &O, StringRef ArgName,
-                                      StringRef Arg, unsigned long long &Value){
+                                       StringRef Arg,
+                                       unsigned long long &Value) {
 
   if (Arg.getAsInteger(0, Value))
     return O.error("'" + Arg + "' value invalid for uint argument!");
@@ -1269,13 +1305,13 @@ static bool parseDouble(Option &O, StringRef Arg, double &Value) {
   return false;
 }
 
-bool parser<double>::parse(Option &O, StringRef ArgName,
-                           StringRef Arg, double &Val) {
+bool parser<double>::parse(Option &O, StringRef ArgName, StringRef Arg,
+                           double &Val) {
   return parseDouble(O, Arg, Val);
 }
 
-bool parser<float>::parse(Option &O, StringRef ArgName,
-                          StringRef Arg, float &Val) {
+bool parser<float>::parse(Option &O, StringRef ArgName, StringRef Arg,
+                          float &Val) {
   double dVal;
   if (parseDouble(O, Arg, dVal))
     return true;
@@ -1283,8 +1319,6 @@ bool parser<float>::parse(Option &O, StringRef ArgName,
   return false;
 }
 
-
-
 // generic_parser_base implementation
 //
 
@@ -1301,18 +1335,17 @@ unsigned generic_parser_base::findOption(const char *Name) {
   return e;
 }
 
-
 // Return the width of the option tag for printing...
 size_t generic_parser_base::getOptionWidth(const Option &O) const {
   if (O.hasArgStr()) {
-    size_t Size = std::strlen(O.ArgStr)+6;
+    size_t Size = std::strlen(O.ArgStr) + 6;
     for (unsigned i = 0, e = getNumOptions(); i != e; ++i)
-      Size = std::max(Size, std::strlen(getOption(i))+8);
+      Size = std::max(Size, std::strlen(getOption(i)) + 8);
     return Size;
   } else {
     size_t BaseSize = 0;
     for (unsigned i = 0, e = getNumOptions(); i != e; ++i)
-      BaseSize = std::max(BaseSize, std::strlen(getOption(i))+8);
+      BaseSize = std::max(BaseSize, std::strlen(getOption(i)) + 8);
     return BaseSize;
   }
 }
@@ -1327,7 +1360,7 @@ void generic_parser_base::printOptionInfo(const Option &O,
     printHelpStr(O.HelpStr, GlobalWidth, std::strlen(O.ArgStr) + 6);
 
     for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
-      size_t NumSpaces = GlobalWidth-strlen(getOption(i))-8;
+      size_t NumSpaces = GlobalWidth - strlen(getOption(i)) - 8;
       outs() << "    =" << getOption(i);
       outs().indent(NumSpaces) << " -   " << getDescription(i) << '\n';
     }
@@ -1347,12 +1380,11 @@ static const size_t MaxOptWidth = 8; // arbitrary spacing for printOptionDiff
 // printGenericOptionDiff - Print the value of this option and it's default.
 //
 // "Generic" options have each value mapped to a name.
-void generic_parser_base::
-printGenericOptionDiff(const Option &O, const GenericOptionValue &Value,
-                       const GenericOptionValue &Default,
-                       size_t GlobalWidth) const {
+void generic_parser_base::printGenericOptionDiff(
+    const Option &O, const GenericOptionValue &Value,
+    const GenericOptionValue &Default, size_t GlobalWidth) const {
   outs() << "  -" << O.ArgStr;
-  outs().indent(GlobalWidth-std::strlen(O.ArgStr));
+  outs().indent(GlobalWidth - std::strlen(O.ArgStr));
 
   unsigned NumOpts = getNumOptions();
   for (unsigned i = 0; i != NumOpts; ++i) {
@@ -1377,25 +1409,25 @@ printGenericOptionDiff(const Option &O, const GenericOptionValue &Value,
 
 // printOptionDiff - Specializations for printing basic value types.
 //
-#define PRINT_OPT_DIFF(T)                                               \
-  void parser<T>::                                                      \
-  printOptionDiff(const Option &O, T V, OptionValue<T> D,               \
-                  size_t GlobalWidth) const {                           \
-    printOptionName(O, GlobalWidth);                                    \
-    std::string Str;                                                    \
-    {                                                                   \
-      raw_string_ostream SS(Str);                                       \
-      SS << V;                                                          \
-    }                                                                   \
-    outs() << "= " << Str;                                              \
-    size_t NumSpaces = MaxOptWidth > Str.size() ? MaxOptWidth - Str.size() : 0;\
-    outs().indent(NumSpaces) << " (default: ";                          \
-    if (D.hasValue())                                                   \
-      outs() << D.getValue();                                           \
-    else                                                                \
-      outs() << "*no default*";                                         \
-    outs() << ")\n";                                                    \
-  }                                                                     \
+#define PRINT_OPT_DIFF(T)                                                      \
+  void parser<T>::printOptionDiff(const Option &O, T V, OptionValue<T> D,      \
+                                  size_t GlobalWidth) const {                  \
+    printOptionName(O, GlobalWidth);                                           \
+    std::string Str;                                                           \
+    {                                                                          \
+      raw_string_ostream SS(Str);                                              \
+      SS << V;                                                                 \
+    }                                                                          \
+    outs() << "= " << Str;                                                     \
+    size_t NumSpaces =                                                         \
+        MaxOptWidth > Str.size() ? MaxOptWidth - Str.size() : 0;               \
+    outs().indent(NumSpaces) << " (default: ";                                 \
+    if (D.hasValue())                                                          \
+      outs() << D.getValue();                                                  \
+    else                                                                       \
+      outs() << "*no default*";                                                \
+    outs() << ")\n";                                                           \
+  }
 
 PRINT_OPT_DIFF(bool)
 PRINT_OPT_DIFF(boolOrDefault)
@@ -1406,9 +1438,9 @@ PRINT_OPT_DIFF(double)
 PRINT_OPT_DIFF(float)
 PRINT_OPT_DIFF(char)
 
-void parser<std::string>::
-printOptionDiff(const Option &O, StringRef V, OptionValue<std::string> D,
-                size_t GlobalWidth) const {
+void parser<std::string>::printOptionDiff(const Option &O, StringRef V,
+                                          OptionValue<std::string> D,
+                                          size_t GlobalWidth) const {
   printOptionName(O, GlobalWidth);
   outs() << "= " << V;
   size_t NumSpaces = MaxOptWidth > V.size() ? MaxOptWidth - V.size() : 0;
@@ -1421,8 +1453,8 @@ printOptionDiff(const Option &O, StringRef V, OptionValue<std::string> D,
 }
 
 // Print a placeholder for options that don't yet support printOptionDiff().
-void basic_parser_impl::
-printOptionNoValue(const Option &O, size_t GlobalWidth) const {
+void basic_parser_impl::printOptionNoValue(const Option &O,
+                                           size_t GlobalWidth) const {
   printOptionName(O, GlobalWidth);
   outs() << "= *cannot print option value*\n";
 }
@@ -1432,19 +1464,18 @@ printOptionNoValue(const Option &O, size_t GlobalWidth) const {
 //
 
 static int OptNameCompare(const void *LHS, const void *RHS) {
-  typedef std::pair<const char *, Option*> pair_ty;
+  typedef std::pair<const char *, Option *> pair_ty;
 
-  return strcmp(((const pair_ty*)LHS)->first, ((const pair_ty*)RHS)->first);
+  return strcmp(((const pair_ty *)LHS)->first, ((const pair_ty *)RHS)->first);
 }
 
 // Copy Options into a vector so we can sort them as we like.
-static void
-sortOpts(StringMap<Option*> &OptMap,
-         SmallVectorImpl< std::pair<const char *, Option*> > &Opts,
-         bool ShowHidden) {
-  SmallPtrSet<Option*, 128> OptionSet;  // Duplicate option detection.
+static void sortOpts(StringMap<Option *> &OptMap,
+                     SmallVectorImpl<std::pair<const char *, Option *>> &Opts,
+                     bool ShowHidden) {
+  SmallPtrSet<Option *, 128> OptionSet; // Duplicate option detection.
 
-  for (StringMap<Option*>::iterator I = OptMap.begin(), E = OptMap.end();
+  for (StringMap<Option *>::iterator I = OptMap.begin(), E = OptMap.end();
        I != E; ++I) {
     // Ignore really-hidden options.
     if (I->second->getOptionHiddenFlag() == ReallyHidden)
@@ -1458,8 +1489,8 @@ sortOpts(StringMap<Option*> &OptMap,
     if (!OptionSet.insert(I->second).second)
       continue;
 
-    Opts.push_back(std::pair<const char *, Option*>(I->getKey().data(),
-                                                    I->second));
+    Opts.push_back(
+        std::pair<const char *, Option *>(I->getKey().data(), I->second));
   }
 
   // Sort the options list alphabetically.
@@ -1471,7 +1502,8 @@ namespace {
 class HelpPrinter {
 protected:
   const bool ShowHidden;
-  typedef SmallVector<std::pair<const char *, Option*>,128> StrOptionPairVector;
+  typedef SmallVector<std::pair<const char *, Option *>, 128>
+      StrOptionPairVector;
   // Print the options. Opts is assumed to be alphabetically sorted.
   virtual void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) {
     for (size_t i = 0, e = Opts.size(); i != e; ++i)
@@ -1484,36 +1516,26 @@ public:
 
   // Invoke the printer.
   void operator=(bool Value) {
-    if (Value == false) return;
-
-    // Get all the options.
-    SmallVector<Option*, 4> PositionalOpts;
-    SmallVector<Option*, 4> SinkOpts;
-    StringMap<Option*> OptMap;
-    GetOptionInfo(PositionalOpts, SinkOpts, OptMap);
+    if (Value == false)
+      return;
 
     StrOptionPairVector Opts;
-    sortOpts(OptMap, Opts, ShowHidden);
-
-    if (ProgramOverview)
-      outs() << "OVERVIEW: " << ProgramOverview << "\n";
+    sortOpts(GlobalParser->OptionsMap, Opts, ShowHidden);
 
-    outs() << "USAGE: " << ProgramName << " [options]";
+    if (GlobalParser->ProgramOverview)
+      outs() << "OVERVIEW: " << GlobalParser->ProgramOverview << "\n";
 
-    // Print out the positional options.
-    Option *CAOpt = nullptr;   // The cl::ConsumeAfter option, if it exists...
-    if (!PositionalOpts.empty() &&
-        PositionalOpts[0]->getNumOccurrencesFlag() == ConsumeAfter)
-      CAOpt = PositionalOpts[0];
+    outs() << "USAGE: " << GlobalParser->ProgramName << " [options]";
 
-    for (size_t i = CAOpt != nullptr, e = PositionalOpts.size(); i != e; ++i) {
-      if (PositionalOpts[i]->ArgStr[0])
-        outs() << " --" << PositionalOpts[i]->ArgStr;
-      outs() << " " << PositionalOpts[i]->HelpStr;
+    for (auto Opt : GlobalParser->PositionalOpts) {
+      if (Opt->ArgStr[0])
+        outs() << " --" << Opt->ArgStr;
+      outs() << " " << Opt->HelpStr;
     }
 
     // Print the consume after option info if it exists...
-    if (CAOpt) outs() << " " << CAOpt->HelpStr;
+    if (GlobalParser->ConsumeAfterOpt)
+      outs() << " " << GlobalParser->ConsumeAfterOpt->HelpStr;
 
     outs() << "\n\n";
 
@@ -1526,11 +1548,9 @@ public:
     printOptions(Opts, MaxArgLen);
 
     // Print any extra help the user has declared.
-    for (std::vector<const char *>::iterator I = MoreHelp->begin(),
-                                             E = MoreHelp->end();
-         I != E; ++I)
-      outs() << *I;
-    MoreHelp->clear();
+    for (auto I : GlobalParser->MoreHelp)
+      outs() << I;
+    GlobalParser->MoreHelp.clear();
 
     // Halt the program since help information was printed
     exit(0);
@@ -1549,17 +1569,17 @@ public:
   }
 
   // Make sure we inherit our base class's operator=()
-  using HelpPrinter::operator= ;
+  using HelpPrinter::operator=;
 
 protected:
   void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) override {
     std::vector<OptionCategory *> SortedCategories;
-    std::map<OptionCategory *, std::vector<Option *> > CategorizedOptions;
+    std::map<OptionCategory *, std::vector<Option *>> CategorizedOptions;
 
     // Collect registered option categories into vector in preparation for
     // sorting.
-    for (OptionCatSet::const_iterator I = RegisteredOptionCategories->begin(),
-                                      E = RegisteredOptionCategories->end();
+    for (auto I = GlobalParser->RegisteredOptionCategories.begin(),
+              E = GlobalParser->RegisteredOptionCategories.end();
          I != E; ++I) {
       SortedCategories.push_back(*I);
     }
@@ -1631,9 +1651,9 @@ private:
 
 public:
   explicit HelpPrinterWrapper(HelpPrinter &UncategorizedPrinter,
-                              CategorizedHelpPrinter &CategorizedPrinter) :
-    UncategorizedPrinter(UncategorizedPrinter),
-    CategorizedPrinter(CategorizedPrinter) { }
+                              CategorizedHelpPrinter &CategorizedPrinter)
+      : UncategorizedPrinter(UncategorizedPrinter),
+        CategorizedPrinter(CategorizedPrinter) {}
 
   // Invoke the printer.
   void operator=(bool Value);
@@ -1648,7 +1668,6 @@ static HelpPrinter UncategorizedHiddenPrinter(true);
 static CategorizedHelpPrinter CategorizedNormalPrinter(false);
 static CategorizedHelpPrinter CategorizedHiddenPrinter(true);
 
-
 // Declare HelpPrinter wrappers that will decide whether or not to invoke
 // a categorizing help printer
 static HelpPrinterWrapper WrappedNormalPrinter(UncategorizedNormalPrinter,
@@ -1656,41 +1675,45 @@ static HelpPrinterWrapper WrappedNormalPrinter(UncategorizedNormalPrinter,
 static HelpPrinterWrapper WrappedHiddenPrinter(UncategorizedHiddenPrinter,
                                                CategorizedHiddenPrinter);
 
+// Define a category for generic options that all tools should have.
+static cl::OptionCategory GenericCategory("Generic Options");
+
 // Define uncategorized help printers.
 // -help-list is hidden by default because if Option categories are being used
 // then -help behaves the same as -help-list.
-static cl::opt<HelpPrinter, true, parser<bool> >
-HLOp("help-list",
-     cl::desc("Display list of available options (-help-list-hidden for more)"),
-     cl::location(UncategorizedNormalPrinter), cl::Hidden, cl::ValueDisallowed);
+static cl::opt<HelpPrinter, true, parser<bool>> HLOp(
+    "help-list",
+    cl::desc("Display list of available options (-help-list-hidden for more)"),
+    cl::location(UncategorizedNormalPrinter), cl::Hidden, cl::ValueDisallowed,
+    cl::cat(GenericCategory));
 
-static cl::opt<HelpPrinter, true, parser<bool> >
-HLHOp("help-list-hidden",
-     cl::desc("Display list of all available options"),
-     cl::location(UncategorizedHiddenPrinter), cl::Hidden, cl::ValueDisallowed);
+static cl::opt<HelpPrinter, true, parser<bool>>
+    HLHOp("help-list-hidden", cl::desc("Display list of all available options"),
+          cl::location(UncategorizedHiddenPrinter), cl::Hidden,
+          cl::ValueDisallowed, cl::cat(GenericCategory));
 
 // Define uncategorized/categorized help printers. These printers change their
 // behaviour at runtime depending on whether one or more Option categories have
 // been declared.
-static cl::opt<HelpPrinterWrapper, true, parser<bool> >
-HOp("help", cl::desc("Display available options (-help-hidden for more)"),
-    cl::location(WrappedNormalPrinter), cl::ValueDisallowed);
-
-static cl::opt<HelpPrinterWrapper, true, parser<bool> >
-HHOp("help-hidden", cl::desc("Display all available options"),
-     cl::location(WrappedHiddenPrinter), cl::Hidden, cl::ValueDisallowed);
-
-
-
-static cl::opt<bool>
-PrintOptions("print-options",
-             cl::desc("Print non-default options after command line parsing"),
-             cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
-PrintAllOptions("print-all-options",
-                cl::desc("Print all option values after command line parsing"),
-                cl::Hidden, cl::init(false));
+static cl::opt<HelpPrinterWrapper, true, parser<bool>>
+    HOp("help", cl::desc("Display available options (-help-hidden for more)"),
+        cl::location(WrappedNormalPrinter), cl::ValueDisallowed,
+        cl::cat(GenericCategory));
+
+static cl::opt<HelpPrinterWrapper, true, parser<bool>>
+    HHOp("help-hidden", cl::desc("Display all available options"),
+         cl::location(WrappedHiddenPrinter), cl::Hidden, cl::ValueDisallowed,
+         cl::cat(GenericCategory));
+
+static cl::opt<bool> PrintOptions(
+    "print-options",
+    cl::desc("Print non-default options after command line parsing"),
+    cl::Hidden, cl::init(false), cl::cat(GenericCategory));
+
+static cl::opt<bool> PrintAllOptions(
+    "print-all-options",
+    cl::desc("Print all option values after command line parsing"), cl::Hidden,
+    cl::init(false), cl::cat(GenericCategory));
 
 void HelpPrinterWrapper::operator=(bool Value) {
   if (Value == false)
@@ -1699,29 +1722,25 @@ void HelpPrinterWrapper::operator=(bool Value) {
   // Decide which printer to invoke. If more than one option category is
   // registered then it is useful to show the categorized help instead of
   // uncategorized help.
-  if (RegisteredOptionCategories->size() > 1) {
+  if (GlobalParser->RegisteredOptionCategories.size() > 1) {
     // unhide -help-list option so user can have uncategorized output if they
     // want it.
     HLOp.setHiddenFlag(NotHidden);
 
     CategorizedPrinter = true; // Invoke categorized printer
-  }
-  else
+  } else
     UncategorizedPrinter = true; // Invoke uncategorized printer
 }
 
 // Print the value of each option.
-void cl::PrintOptionValues() {
-  if (!PrintOptions && !PrintAllOptions) return;
+void cl::PrintOptionValues() { GlobalParser->printOptionValues(); }
 
-  // Get all the options.
-  SmallVector<Option*, 4> PositionalOpts;
-  SmallVector<Option*, 4> SinkOpts;
-  StringMap<Option*> OptMap;
-  GetOptionInfo(PositionalOpts, SinkOpts, OptMap);
+void CommandLineParser::printOptionValues() {
+  if (!PrintOptions && !PrintAllOptions)
+    return;
 
-  SmallVector<std::pair<const char *, Option*>, 128> Opts;
-  sortOpts(OptMap, Opts, /*ShowHidden*/true);
+  SmallVector<std::pair<const char *, Option *>, 128> Opts;
+  sortOpts(OptionsMap, Opts, /*ShowHidden*/ true);
 
   // Compute the maximum argument length...
   size_t MaxArgLen = 0;
@@ -1734,7 +1753,7 @@ void cl::PrintOptionValues() {
 
 static void (*OverrideVersionPrinter)() = nullptr;
 
-static std::vector<void (*)()>* ExtraVersionPrinters = nullptr;
+static std::vector<void (*)()> *ExtraVersionPrinters = nullptr;
 
 namespace {
 class VersionPrinter {
@@ -1756,7 +1775,8 @@ public:
     OS << " with assertions";
 #endif
     std::string CPU = sys::getHostCPUName();
-    if (CPU == "generic") CPU = "(unknown)";
+    if (CPU == "generic")
+      CPU = "(unknown)";
     OS << ".\n"
 #if (ENABLE_TIMESTAMPS == 1)
        << "  Built " << __DATE__ << " (" << __TIME__ << ").\n"
@@ -1765,7 +1785,8 @@ public:
        << "  Host CPU: " << CPU << '\n';
   }
   void operator=(bool OptionWasSpecified) {
-    if (!OptionWasSpecified) return;
+    if (!OptionWasSpecified)
+      return;
 
     if (OverrideVersionPrinter != nullptr) {
       (*OverrideVersionPrinter)();
@@ -1788,13 +1809,13 @@ public:
 };
 } // End anonymous namespace
 
-
 // Define the --version option that prints out the LLVM version for the tool
 static VersionPrinter VersionPrinterInstance;
 
-static cl::opt<VersionPrinter, true, parser<bool> >
-VersOp("version", cl::desc("Display the version of this program"),
-    cl::location(VersionPrinterInstance), cl::ValueDisallowed);
+static cl::opt<VersionPrinter, true, parser<bool>>
+    VersOp("version", cl::desc("Display the version of this program"),
+           cl::location(VersionPrinterInstance), cl::ValueDisallowed,
+           cl::cat(GenericCategory));
 
 // Utility function for printing the help message.
 void cl::PrintHelpMessage(bool Hidden, bool Categorized) {
@@ -1816,13 +1837,9 @@ void cl::PrintHelpMessage(bool Hidden, bool Categorized) {
 }
 
 /// Utility function for printing version number.
-void cl::PrintVersionMessage() {
-  VersionPrinterInstance.print();
-}
+void cl::PrintVersionMessage() { VersionPrinterInstance.print(); }
 
-void cl::SetVersionPrinter(void (*func)()) {
-  OverrideVersionPrinter = func;
-}
+void cl::SetVersionPrinter(void (*func)()) { OverrideVersionPrinter = func; }
 
 void cl::AddExtraVersionPrinter(void (*func)()) {
   if (!ExtraVersionPrinters)
@@ -1831,14 +1848,27 @@ void cl::AddExtraVersionPrinter(void (*func)()) {
   ExtraVersionPrinters->push_back(func);
 }
 
-void cl::getRegisteredOptions(StringMap<Option*> &Map)
-{
-  // Get all the options.
-  SmallVector<Option*, 4> PositionalOpts; //NOT USED
-  SmallVector<Option*, 4> SinkOpts;  //NOT USED
-  assert(Map.size() == 0 && "StringMap must be empty");
-  GetOptionInfo(PositionalOpts, SinkOpts, Map);
-  return;
+StringMap<Option *> &cl::getRegisteredOptions() {
+  return GlobalParser->OptionsMap;
+}
+
+void cl::HideUnrelatedOptions(cl::OptionCategory &Category) {
+  for (auto &I : GlobalParser->OptionsMap) {
+    if (I.second->Category != &Category &&
+        I.second->Category != &GenericCategory)
+      I.second->setHiddenFlag(cl::ReallyHidden);
+  }
+}
+
+void cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *> Categories) {
+  auto CategoriesBegin = Categories.begin();
+  auto CategoriesEnd = Categories.end();
+  for (auto &I : GlobalParser->OptionsMap) {
+    if (std::find(CategoriesBegin, CategoriesEnd, I.second->Category) ==
+            CategoriesEnd &&
+        I.second->Category != &GenericCategory)
+      I.second->setHiddenFlag(cl::ReallyHidden);
+  }
 }
 
 void LLVMParseCommandLineOptions(int argc, const char *const *argv,
diff --git a/lib/Support/Compression.cpp b/lib/Support/Compression.cpp
index c32eb213..17ae295 100644
--- a/lib/Support/Compression.cpp
+++ b/lib/Support/Compression.cpp
@@ -54,6 +54,9 @@ zlib::Status zlib::compress(StringRef InputBuffer,
   Status Res = encodeZlibReturnValue(::compress2(
       (Bytef *)CompressedBuffer.data(), &CompressedSize,
       (const Bytef *)InputBuffer.data(), InputBuffer.size(), CLevel));
+  // Tell MemorySanitizer that zlib output buffer is fully initialized.
+  // This avoids a false report when running LLVM with uninstrumented ZLib.
+  __msan_unpoison(CompressedBuffer.data(), CompressedSize);
   CompressedBuffer.resize(CompressedSize);
   return Res;
 }
@@ -65,6 +68,9 @@ zlib::Status zlib::uncompress(StringRef InputBuffer,
   Status Res = encodeZlibReturnValue(::uncompress(
       (Bytef *)UncompressedBuffer.data(), (uLongf *)&UncompressedSize,
       (const Bytef *)InputBuffer.data(), InputBuffer.size()));
+  // Tell MemorySanitizer that zlib output buffer is fully initialized.
+  // This avoids a false report when running LLVM with uninstrumented ZLib.
+  __msan_unpoison(UncompressedBuffer.data(), UncompressedSize);
   UncompressedBuffer.resize(UncompressedSize);
   return Res;
 }
diff --git a/lib/Support/ConvertUTFWrapper.cpp b/lib/Support/ConvertUTFWrapper.cpp
index e45335d..1bbef23 100644
--- a/lib/Support/ConvertUTFWrapper.cpp
+++ b/lib/Support/ConvertUTFWrapper.cpp
@@ -109,8 +109,9 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
   if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE)
     Src++;
 
-  // Just allocate enough space up front.  We'll shrink it later.
-  Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
+  // Just allocate enough space up front.  We'll shrink it later.  Allocate
+  // enough that we can fit a null terminator without reallocating.
+  Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1);
   UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]);
   UTF8 *DstEnd = Dst + Out.size();
 
@@ -124,6 +125,46 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
   }
 
   Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
+  Out.push_back(0);
+  Out.pop_back();
+  return true;
+}
+
+bool convertUTF8ToUTF16String(StringRef SrcUTF8,
+                              SmallVectorImpl<UTF16> &DstUTF16) {
+  assert(DstUTF16.empty());
+
+  // Avoid OOB by returning early on empty input.
+  if (SrcUTF8.empty()) {
+    DstUTF16.push_back(0);
+    DstUTF16.pop_back();
+    return true;
+  }
+
+  const UTF8 *Src = reinterpret_cast<const UTF8 *>(SrcUTF8.begin());
+  const UTF8 *SrcEnd = reinterpret_cast<const UTF8 *>(SrcUTF8.end());
+
+  // Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding
+  // as UTF-16 should always require the same amount or less code units than the
+  // UTF-8 encoding.  Allocate one extra byte for the null terminator though,
+  // so that someone calling DstUTF16.data() gets a null terminated string.
+  // We resize down later so we don't have to worry that this over allocates.
+  DstUTF16.resize(SrcUTF8.size()+1);
+  UTF16 *Dst = &DstUTF16[0];
+  UTF16 *DstEnd = Dst + DstUTF16.size();
+
+  ConversionResult CR =
+      ConvertUTF8toUTF16(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
+  assert(CR != targetExhausted);
+
+  if (CR != conversionOK) {
+    DstUTF16.clear();
+    return false;
+  }
+
+  DstUTF16.resize(Dst - &DstUTF16[0]);
+  DstUTF16.push_back(0);
+  DstUTF16.pop_back();
   return true;
 }
 
diff --git a/lib/Support/Debug.cpp b/lib/Support/Debug.cpp
index 8246542..9c58ae8 100644
--- a/lib/Support/Debug.cpp
+++ b/lib/Support/Debug.cpp
@@ -25,15 +25,51 @@
 
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/circular_raw_ostream.h"
-#include "llvm/Support/ManagedStatic.h"
+
+#undef isCurrentDebugType
+#undef setCurrentDebugType
 
 using namespace llvm;
 
+// Even though LLVM might be built with NDEBUG, define symbols that the code
+// built without NDEBUG can depend on via the llvm/Support/Debug.h header.
+namespace llvm {
+/// Exported boolean set by the -debug option.
+bool DebugFlag = false;
+
+static ManagedStatic<std::vector<std::string>> CurrentDebugType;
+
+/// Return true if the specified string is the debug type
+/// specified on the command line, or if none was specified on the command line
+/// with the -debug-only=X option.
+bool isCurrentDebugType(const char *DebugType) {
+  if (CurrentDebugType->empty())
+    return true;
+  // see if DebugType is in list. Note: do not use find() as that forces us to
+  // unnecessarily create an std::string instance.
+  for (auto d : *CurrentDebugType) {
+    if (d == DebugType)
+      return true;
+  }
+  return false;
+}
+
+/// Set the current debug type, as if the -debug-only=X
+/// option were specified.  Note that DebugFlag also needs to be set to true for
+/// debug output to be produced.
+///
+void setCurrentDebugType(const char *Type) {
+  CurrentDebugType->clear();
+  CurrentDebugType->push_back(Type);
+}
+
+} // namespace llvm
+
 // All Debug.h functionality is a no-op in NDEBUG mode.
 #ifndef NDEBUG
-bool llvm::DebugFlag;  // DebugFlag - Exported boolean set by the -debug option
 
 // -debug - Command line option to enable the DEBUG statements in the passes.
 // This flag may only be enabled in debug builds.
@@ -51,14 +87,14 @@ DebugBufferSize("debug-buffer-size",
                 cl::Hidden,
                 cl::init(0));
 
-static ManagedStatic<std::string> CurrentDebugType;
-
 namespace {
 
 struct DebugOnlyOpt {
   void operator=(const std::string &Val) const {
-    DebugFlag |= !Val.empty();
-    *CurrentDebugType = Val;
+    if (Val.empty())
+      return;
+    DebugFlag = true;
+    CurrentDebugType->push_back(Val);
   }
 };
 
@@ -68,7 +104,7 @@ static DebugOnlyOpt DebugOnlyOptLoc;
 
 static cl::opt<DebugOnlyOpt, true, cl::parser<std::string> >
 DebugOnly("debug-only", cl::desc("Enable a specific type of debug output"),
-          cl::Hidden, cl::value_desc("debug string"),
+          cl::Hidden, cl::ZeroOrMore, cl::value_desc("debug string"),
           cl::location(DebugOnlyOptLoc), cl::ValueRequired);
 
 // Signal handlers - dump debug output on termination.
@@ -82,22 +118,6 @@ static void debug_user_sig_handler(void *Cookie) {
   dbgout->flushBufferWithBanner();
 }
 
-// isCurrentDebugType - Return true if the specified string is the debug type
-// specified on the command line, or if none was specified on the command line
-// with the -debug-only=X option.
-//
-bool llvm::isCurrentDebugType(const char *DebugType) {
-  return CurrentDebugType->empty() || DebugType == *CurrentDebugType;
-}
-
-/// setCurrentDebugType - Set the current debug type, as if the -debug-only=X
-/// option were specified.  Note that DebugFlag also needs to be set to true for
-/// debug output to be produced.
-///
-void llvm::setCurrentDebugType(const char *Type) {
-  *CurrentDebugType = Type;
-}
-
 /// dbgs - Return a circular-buffered debug stream.
 raw_ostream &llvm::dbgs() {
   // Do one-time initialization in a thread-safe way.
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index 4b6337e..95c4bc3 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Dwarf.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
@@ -19,87 +20,19 @@ using namespace dwarf;
 
 const char *llvm::dwarf::TagString(unsigned Tag) {
   switch (Tag) {
-  case DW_TAG_array_type:                return "DW_TAG_array_type";
-  case DW_TAG_class_type:                return "DW_TAG_class_type";
-  case DW_TAG_entry_point:               return "DW_TAG_entry_point";
-  case DW_TAG_enumeration_type:          return "DW_TAG_enumeration_type";
-  case DW_TAG_formal_parameter:          return "DW_TAG_formal_parameter";
-  case DW_TAG_imported_declaration:      return "DW_TAG_imported_declaration";
-  case DW_TAG_label:                     return "DW_TAG_label";
-  case DW_TAG_lexical_block:             return "DW_TAG_lexical_block";
-  case DW_TAG_member:                    return "DW_TAG_member";
-  case DW_TAG_pointer_type:              return "DW_TAG_pointer_type";
-  case DW_TAG_reference_type:            return "DW_TAG_reference_type";
-  case DW_TAG_compile_unit:              return "DW_TAG_compile_unit";
-  case DW_TAG_string_type:               return "DW_TAG_string_type";
-  case DW_TAG_structure_type:            return "DW_TAG_structure_type";
-  case DW_TAG_subroutine_type:           return "DW_TAG_subroutine_type";
-  case DW_TAG_typedef:                   return "DW_TAG_typedef";
-  case DW_TAG_union_type:                return "DW_TAG_union_type";
-  case DW_TAG_unspecified_parameters:    return "DW_TAG_unspecified_parameters";
-  case DW_TAG_variant:                   return "DW_TAG_variant";
-  case DW_TAG_common_block:              return "DW_TAG_common_block";
-  case DW_TAG_common_inclusion:          return "DW_TAG_common_inclusion";
-  case DW_TAG_inheritance:               return "DW_TAG_inheritance";
-  case DW_TAG_inlined_subroutine:        return "DW_TAG_inlined_subroutine";
-  case DW_TAG_module:                    return "DW_TAG_module";
-  case DW_TAG_ptr_to_member_type:        return "DW_TAG_ptr_to_member_type";
-  case DW_TAG_set_type:                  return "DW_TAG_set_type";
-  case DW_TAG_subrange_type:             return "DW_TAG_subrange_type";
-  case DW_TAG_with_stmt:                 return "DW_TAG_with_stmt";
-  case DW_TAG_access_declaration:        return "DW_TAG_access_declaration";
-  case DW_TAG_base_type:                 return "DW_TAG_base_type";
-  case DW_TAG_catch_block:               return "DW_TAG_catch_block";
-  case DW_TAG_const_type:                return "DW_TAG_const_type";
-  case DW_TAG_constant:                  return "DW_TAG_constant";
-  case DW_TAG_enumerator:                return "DW_TAG_enumerator";
-  case DW_TAG_file_type:                 return "DW_TAG_file_type";
-  case DW_TAG_friend:                    return "DW_TAG_friend";
-  case DW_TAG_namelist:                  return "DW_TAG_namelist";
-  case DW_TAG_namelist_item:             return "DW_TAG_namelist_item";
-  case DW_TAG_packed_type:               return "DW_TAG_packed_type";
-  case DW_TAG_subprogram:                return "DW_TAG_subprogram";
-  case DW_TAG_template_type_parameter:   return "DW_TAG_template_type_parameter";
-  case DW_TAG_template_value_parameter:  return "DW_TAG_template_value_parameter";
-  case DW_TAG_thrown_type:               return "DW_TAG_thrown_type";
-  case DW_TAG_try_block:                 return "DW_TAG_try_block";
-  case DW_TAG_variant_part:              return "DW_TAG_variant_part";
-  case DW_TAG_variable:                  return "DW_TAG_variable";
-  case DW_TAG_volatile_type:             return "DW_TAG_volatile_type";
-  case DW_TAG_dwarf_procedure:           return "DW_TAG_dwarf_procedure";
-  case DW_TAG_restrict_type:             return "DW_TAG_restrict_type";
-  case DW_TAG_interface_type:            return "DW_TAG_interface_type";
-  case DW_TAG_namespace:                 return "DW_TAG_namespace";
-  case DW_TAG_imported_module:           return "DW_TAG_imported_module";
-  case DW_TAG_unspecified_type:          return "DW_TAG_unspecified_type";
-  case DW_TAG_partial_unit:              return "DW_TAG_partial_unit";
-  case DW_TAG_imported_unit:             return "DW_TAG_imported_unit";
-  case DW_TAG_condition:                 return "DW_TAG_condition";
-  case DW_TAG_shared_type:               return "DW_TAG_shared_type";
-  case DW_TAG_lo_user:                   return "DW_TAG_lo_user";
-  case DW_TAG_hi_user:                   return "DW_TAG_hi_user";
-  case DW_TAG_auto_variable:             return "DW_TAG_auto_variable";
-  case DW_TAG_arg_variable:              return "DW_TAG_arg_variable";
-  case DW_TAG_expression:                return "DW_TAG_expression";
-  case DW_TAG_rvalue_reference_type:     return "DW_TAG_rvalue_reference_type";
-  case DW_TAG_template_alias:            return "DW_TAG_template_alias";
-  case DW_TAG_coarray_type:              return "DW_TAG_coarray_type";
-  case DW_TAG_generic_subrange:          return "DW_TAG_generic_subrange";
-  case DW_TAG_dynamic_type:              return "DW_TAG_dynamic_type";
-  case DW_TAG_MIPS_loop:                 return "DW_TAG_MIPS_loop";
-  case DW_TAG_type_unit:                 return "DW_TAG_type_unit";
-  case DW_TAG_format_label:              return "DW_TAG_format_label";
-  case DW_TAG_function_template:         return "DW_TAG_function_template";
-  case DW_TAG_class_template:            return "DW_TAG_class_template";
-  case DW_TAG_GNU_template_template_param:
-    return "DW_TAG_GNU_template_template_param";
-  case DW_TAG_GNU_template_parameter_pack:
-    return "DW_TAG_GNU_template_parameter_pack";
-  case DW_TAG_GNU_formal_parameter_pack:
-    return "DW_TAG_GNU_formal_parameter_pack";
-  case DW_TAG_APPLE_property:            return "DW_TAG_APPLE_property";
+  default: return nullptr;
+#define HANDLE_DW_TAG(ID, NAME)                                                \
+  case DW_TAG_##NAME:                                                          \
+    return "DW_TAG_" #NAME;
+#include "llvm/Support/Dwarf.def"
   }
-  return nullptr;
+}
+
+unsigned llvm::dwarf::getTag(StringRef TagString) {
+  return StringSwitch<unsigned>(TagString)
+#define HANDLE_DW_TAG(ID, NAME) .Case("DW_TAG_" #NAME, DW_TAG_##NAME)
+#include "llvm/Support/Dwarf.def"
+      .Default(DW_TAG_invalid);
 }
 
 const char *llvm::dwarf::ChildrenString(unsigned Children) {
@@ -306,193 +239,36 @@ const char *llvm::dwarf::FormEncodingString(unsigned Encoding) {
 
 const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) {
   switch (Encoding) {
-  case DW_OP_addr:                       return "DW_OP_addr";
-  case DW_OP_deref:                      return "DW_OP_deref";
-  case DW_OP_const1u:                    return "DW_OP_const1u";
-  case DW_OP_const1s:                    return "DW_OP_const1s";
-  case DW_OP_const2u:                    return "DW_OP_const2u";
-  case DW_OP_const2s:                    return "DW_OP_const2s";
-  case DW_OP_const4u:                    return "DW_OP_const4u";
-  case DW_OP_const4s:                    return "DW_OP_const4s";
-  case DW_OP_const8u:                    return "DW_OP_const8u";
-  case DW_OP_const8s:                    return "DW_OP_const8s";
-  case DW_OP_constu:                     return "DW_OP_constu";
-  case DW_OP_consts:                     return "DW_OP_consts";
-  case DW_OP_dup:                        return "DW_OP_dup";
-  case DW_OP_drop:                       return "DW_OP_drop";
-  case DW_OP_over:                       return "DW_OP_over";
-  case DW_OP_pick:                       return "DW_OP_pick";
-  case DW_OP_swap:                       return "DW_OP_swap";
-  case DW_OP_rot:                        return "DW_OP_rot";
-  case DW_OP_xderef:                     return "DW_OP_xderef";
-  case DW_OP_abs:                        return "DW_OP_abs";
-  case DW_OP_and:                        return "DW_OP_and";
-  case DW_OP_div:                        return "DW_OP_div";
-  case DW_OP_minus:                      return "DW_OP_minus";
-  case DW_OP_mod:                        return "DW_OP_mod";
-  case DW_OP_mul:                        return "DW_OP_mul";
-  case DW_OP_neg:                        return "DW_OP_neg";
-  case DW_OP_not:                        return "DW_OP_not";
-  case DW_OP_or:                         return "DW_OP_or";
-  case DW_OP_plus:                       return "DW_OP_plus";
-  case DW_OP_plus_uconst:                return "DW_OP_plus_uconst";
-  case DW_OP_shl:                        return "DW_OP_shl";
-  case DW_OP_shr:                        return "DW_OP_shr";
-  case DW_OP_shra:                       return "DW_OP_shra";
-  case DW_OP_xor:                        return "DW_OP_xor";
-  case DW_OP_skip:                       return "DW_OP_skip";
-  case DW_OP_bra:                        return "DW_OP_bra";
-  case DW_OP_eq:                         return "DW_OP_eq";
-  case DW_OP_ge:                         return "DW_OP_ge";
-  case DW_OP_gt:                         return "DW_OP_gt";
-  case DW_OP_le:                         return "DW_OP_le";
-  case DW_OP_lt:                         return "DW_OP_lt";
-  case DW_OP_ne:                         return "DW_OP_ne";
-  case DW_OP_lit0:                       return "DW_OP_lit0";
-  case DW_OP_lit1:                       return "DW_OP_lit1";
-  case DW_OP_lit2:                       return "DW_OP_lit2";
-  case DW_OP_lit3:                       return "DW_OP_lit3";
-  case DW_OP_lit4:                       return "DW_OP_lit4";
-  case DW_OP_lit5:                       return "DW_OP_lit5";
-  case DW_OP_lit6:                       return "DW_OP_lit6";
-  case DW_OP_lit7:                       return "DW_OP_lit7";
-  case DW_OP_lit8:                       return "DW_OP_lit8";
-  case DW_OP_lit9:                       return "DW_OP_lit9";
-  case DW_OP_lit10:                      return "DW_OP_lit10";
-  case DW_OP_lit11:                      return "DW_OP_lit11";
-  case DW_OP_lit12:                      return "DW_OP_lit12";
-  case DW_OP_lit13:                      return "DW_OP_lit13";
-  case DW_OP_lit14:                      return "DW_OP_lit14";
-  case DW_OP_lit15:                      return "DW_OP_lit15";
-  case DW_OP_lit16:                      return "DW_OP_lit16";
-  case DW_OP_lit17:                      return "DW_OP_lit17";
-  case DW_OP_lit18:                      return "DW_OP_lit18";
-  case DW_OP_lit19:                      return "DW_OP_lit19";
-  case DW_OP_lit20:                      return "DW_OP_lit20";
-  case DW_OP_lit21:                      return "DW_OP_lit21";
-  case DW_OP_lit22:                      return "DW_OP_lit22";
-  case DW_OP_lit23:                      return "DW_OP_lit23";
-  case DW_OP_lit24:                      return "DW_OP_lit24";
-  case DW_OP_lit25:                      return "DW_OP_lit25";
-  case DW_OP_lit26:                      return "DW_OP_lit26";
-  case DW_OP_lit27:                      return "DW_OP_lit27";
-  case DW_OP_lit28:                      return "DW_OP_lit28";
-  case DW_OP_lit29:                      return "DW_OP_lit29";
-  case DW_OP_lit30:                      return "DW_OP_lit30";
-  case DW_OP_lit31:                      return "DW_OP_lit31";
-  case DW_OP_reg0:                       return "DW_OP_reg0";
-  case DW_OP_reg1:                       return "DW_OP_reg1";
-  case DW_OP_reg2:                       return "DW_OP_reg2";
-  case DW_OP_reg3:                       return "DW_OP_reg3";
-  case DW_OP_reg4:                       return "DW_OP_reg4";
-  case DW_OP_reg5:                       return "DW_OP_reg5";
-  case DW_OP_reg6:                       return "DW_OP_reg6";
-  case DW_OP_reg7:                       return "DW_OP_reg7";
-  case DW_OP_reg8:                       return "DW_OP_reg8";
-  case DW_OP_reg9:                       return "DW_OP_reg9";
-  case DW_OP_reg10:                      return "DW_OP_reg10";
-  case DW_OP_reg11:                      return "DW_OP_reg11";
-  case DW_OP_reg12:                      return "DW_OP_reg12";
-  case DW_OP_reg13:                      return "DW_OP_reg13";
-  case DW_OP_reg14:                      return "DW_OP_reg14";
-  case DW_OP_reg15:                      return "DW_OP_reg15";
-  case DW_OP_reg16:                      return "DW_OP_reg16";
-  case DW_OP_reg17:                      return "DW_OP_reg17";
-  case DW_OP_reg18:                      return "DW_OP_reg18";
-  case DW_OP_reg19:                      return "DW_OP_reg19";
-  case DW_OP_reg20:                      return "DW_OP_reg20";
-  case DW_OP_reg21:                      return "DW_OP_reg21";
-  case DW_OP_reg22:                      return "DW_OP_reg22";
-  case DW_OP_reg23:                      return "DW_OP_reg23";
-  case DW_OP_reg24:                      return "DW_OP_reg24";
-  case DW_OP_reg25:                      return "DW_OP_reg25";
-  case DW_OP_reg26:                      return "DW_OP_reg26";
-  case DW_OP_reg27:                      return "DW_OP_reg27";
-  case DW_OP_reg28:                      return "DW_OP_reg28";
-  case DW_OP_reg29:                      return "DW_OP_reg29";
-  case DW_OP_reg30:                      return "DW_OP_reg30";
-  case DW_OP_reg31:                      return "DW_OP_reg31";
-  case DW_OP_breg0:                      return "DW_OP_breg0";
-  case DW_OP_breg1:                      return "DW_OP_breg1";
-  case DW_OP_breg2:                      return "DW_OP_breg2";
-  case DW_OP_breg3:                      return "DW_OP_breg3";
-  case DW_OP_breg4:                      return "DW_OP_breg4";
-  case DW_OP_breg5:                      return "DW_OP_breg5";
-  case DW_OP_breg6:                      return "DW_OP_breg6";
-  case DW_OP_breg7:                      return "DW_OP_breg7";
-  case DW_OP_breg8:                      return "DW_OP_breg8";
-  case DW_OP_breg9:                      return "DW_OP_breg9";
-  case DW_OP_breg10:                     return "DW_OP_breg10";
-  case DW_OP_breg11:                     return "DW_OP_breg11";
-  case DW_OP_breg12:                     return "DW_OP_breg12";
-  case DW_OP_breg13:                     return "DW_OP_breg13";
-  case DW_OP_breg14:                     return "DW_OP_breg14";
-  case DW_OP_breg15:                     return "DW_OP_breg15";
-  case DW_OP_breg16:                     return "DW_OP_breg16";
-  case DW_OP_breg17:                     return "DW_OP_breg17";
-  case DW_OP_breg18:                     return "DW_OP_breg18";
-  case DW_OP_breg19:                     return "DW_OP_breg19";
-  case DW_OP_breg20:                     return "DW_OP_breg20";
-  case DW_OP_breg21:                     return "DW_OP_breg21";
-  case DW_OP_breg22:                     return "DW_OP_breg22";
-  case DW_OP_breg23:                     return "DW_OP_breg23";
-  case DW_OP_breg24:                     return "DW_OP_breg24";
-  case DW_OP_breg25:                     return "DW_OP_breg25";
-  case DW_OP_breg26:                     return "DW_OP_breg26";
-  case DW_OP_breg27:                     return "DW_OP_breg27";
-  case DW_OP_breg28:                     return "DW_OP_breg28";
-  case DW_OP_breg29:                     return "DW_OP_breg29";
-  case DW_OP_breg30:                     return "DW_OP_breg30";
-  case DW_OP_breg31:                     return "DW_OP_breg31";
-  case DW_OP_regx:                       return "DW_OP_regx";
-  case DW_OP_fbreg:                      return "DW_OP_fbreg";
-  case DW_OP_bregx:                      return "DW_OP_bregx";
-  case DW_OP_piece:                      return "DW_OP_piece";
-  case DW_OP_deref_size:                 return "DW_OP_deref_size";
-  case DW_OP_xderef_size:                return "DW_OP_xderef_size";
-  case DW_OP_nop:                        return "DW_OP_nop";
-  case DW_OP_push_object_address:        return "DW_OP_push_object_address";
-  case DW_OP_call2:                      return "DW_OP_call2";
-  case DW_OP_call4:                      return "DW_OP_call4";
-  case DW_OP_call_ref:                   return "DW_OP_call_ref";
-  case DW_OP_form_tls_address:           return "DW_OP_form_tls_address";
-  case DW_OP_call_frame_cfa:             return "DW_OP_call_frame_cfa";
-  case DW_OP_bit_piece:                  return "DW_OP_bit_piece";
-  case DW_OP_implicit_value:             return "DW_OP_implicit_value";
-  case DW_OP_stack_value:                return "DW_OP_stack_value";
-
-  // GNU thread-local storage
-  case DW_OP_GNU_push_tls_address:       return "DW_OP_GNU_push_tls_address";
-
-  // DWARF5 Fission Proposal Op Extensions
-  case DW_OP_GNU_addr_index:             return "DW_OP_GNU_addr_index";
-  case DW_OP_GNU_const_index:            return "DW_OP_GNU_const_index";
+  default: return nullptr;
+#define HANDLE_DW_OP(ID, NAME)                                                 \
+  case DW_OP_##NAME:                                                           \
+    return "DW_OP_" #NAME;
+#include "llvm/Support/Dwarf.def"
   }
-  return nullptr;
+}
+
+unsigned llvm::dwarf::getOperationEncoding(StringRef OperationEncodingString) {
+  return StringSwitch<unsigned>(OperationEncodingString)
+#define HANDLE_DW_OP(ID, NAME) .Case("DW_OP_" #NAME, DW_OP_##NAME)
+#include "llvm/Support/Dwarf.def"
+      .Default(0);
 }
 
 const char *llvm::dwarf::AttributeEncodingString(unsigned Encoding) {
   switch (Encoding) {
-  case DW_ATE_address:                   return "DW_ATE_address";
-  case DW_ATE_boolean:                   return "DW_ATE_boolean";
-  case DW_ATE_complex_float:             return "DW_ATE_complex_float";
-  case DW_ATE_float:                     return "DW_ATE_float";
-  case DW_ATE_signed:                    return "DW_ATE_signed";
-  case DW_ATE_signed_char:               return "DW_ATE_signed_char";
-  case DW_ATE_unsigned:                  return "DW_ATE_unsigned";
-  case DW_ATE_unsigned_char:             return "DW_ATE_unsigned_char";
-  case DW_ATE_imaginary_float:           return "DW_ATE_imaginary_float";
-  case DW_ATE_UTF:                       return "DW_ATE_UTF";
-  case DW_ATE_packed_decimal:            return "DW_ATE_packed_decimal";
-  case DW_ATE_numeric_string:            return "DW_ATE_numeric_string";
-  case DW_ATE_edited:                    return "DW_ATE_edited";
-  case DW_ATE_signed_fixed:              return "DW_ATE_signed_fixed";
-  case DW_ATE_unsigned_fixed:            return "DW_ATE_unsigned_fixed";
-  case DW_ATE_decimal_float:             return "DW_ATE_decimal_float";
-  case DW_ATE_lo_user:                   return "DW_ATE_lo_user";
-  case DW_ATE_hi_user:                   return "DW_ATE_hi_user";
+  default: return nullptr;
+#define HANDLE_DW_ATE(ID, NAME)                                                \
+  case DW_ATE_##NAME:                                                          \
+    return "DW_ATE_" #NAME;
+#include "llvm/Support/Dwarf.def"
   }
-  return nullptr;
+}
+
+unsigned llvm::dwarf::getAttributeEncoding(StringRef EncodingString) {
+  return StringSwitch<unsigned>(EncodingString)
+#define HANDLE_DW_ATE(ID, NAME) .Case("DW_ATE_" #NAME, DW_ATE_##NAME)
+#include "llvm/Support/Dwarf.def"
+      .Default(0);
 }
 
 const char *llvm::dwarf::DecimalSignString(unsigned Sign) {
@@ -538,47 +314,39 @@ const char *llvm::dwarf::VisibilityString(unsigned Visibility) {
 
 const char *llvm::dwarf::VirtualityString(unsigned Virtuality) {
   switch (Virtuality) {
-  case DW_VIRTUALITY_none:               return "DW_VIRTUALITY_none";
-  case DW_VIRTUALITY_virtual:            return "DW_VIRTUALITY_virtual";
-  case DW_VIRTUALITY_pure_virtual:       return "DW_VIRTUALITY_pure_virtual";
+  default:
+    return nullptr;
+#define HANDLE_DW_VIRTUALITY(ID, NAME)                                         \
+  case DW_VIRTUALITY_##NAME:                                                   \
+    return "DW_VIRTUALITY_" #NAME;
+#include "llvm/Support/Dwarf.def"
   }
-  return nullptr;
+}
+
+unsigned llvm::dwarf::getVirtuality(StringRef VirtualityString) {
+  return StringSwitch<unsigned>(VirtualityString)
+#define HANDLE_DW_VIRTUALITY(ID, NAME)                                         \
+  .Case("DW_VIRTUALITY_" #NAME, DW_VIRTUALITY_##NAME)
+#include "llvm/Support/Dwarf.def"
+      .Default(DW_VIRTUALITY_invalid);
 }
 
 const char *llvm::dwarf::LanguageString(unsigned Language) {
   switch (Language) {
-  case DW_LANG_C89:                      return "DW_LANG_C89";
-  case DW_LANG_C:                        return "DW_LANG_C";
-  case DW_LANG_Ada83:                    return "DW_LANG_Ada83";
-  case DW_LANG_C_plus_plus:              return "DW_LANG_C_plus_plus";
-  case DW_LANG_Cobol74:                  return "DW_LANG_Cobol74";
-  case DW_LANG_Cobol85:                  return "DW_LANG_Cobol85";
-  case DW_LANG_Fortran77:                return "DW_LANG_Fortran77";
-  case DW_LANG_Fortran90:                return "DW_LANG_Fortran90";
-  case DW_LANG_Pascal83:                 return "DW_LANG_Pascal83";
-  case DW_LANG_Modula2:                  return "DW_LANG_Modula2";
-  case DW_LANG_Java:                     return "DW_LANG_Java";
-  case DW_LANG_C99:                      return "DW_LANG_C99";
-  case DW_LANG_Ada95:                    return "DW_LANG_Ada95";
-  case DW_LANG_Fortran95:                return "DW_LANG_Fortran95";
-  case DW_LANG_PLI:                      return "DW_LANG_PLI";
-  case DW_LANG_ObjC:                     return "DW_LANG_ObjC";
-  case DW_LANG_ObjC_plus_plus:           return "DW_LANG_ObjC_plus_plus";
-  case DW_LANG_UPC:                      return "DW_LANG_UPC";
-  case DW_LANG_D:                        return "DW_LANG_D";
-  case DW_LANG_Python:                   return "DW_LANG_Python";
-  case DW_LANG_OpenCL:                   return "DW_LANG_OpenCL";
-  case DW_LANG_Go:                       return "DW_LANG_Go";
-  case DW_LANG_Modula3:                  return "DW_LANG_Modula3";
-  case DW_LANG_Haskell:                  return "DW_LANG_Haskell";
-  case DW_LANG_C_plus_plus_03:           return "DW_LANG_C_plus_plus_03";
-  case DW_LANG_C_plus_plus_11:           return "DW_LANG_C_plus_plus_11";
-  case DW_LANG_OCaml:                    return "DW_LANG_OCaml";
-  case DW_LANG_lo_user:                  return "DW_LANG_lo_user";
-  case DW_LANG_Mips_Assembler:           return "DW_LANG_Mips_Assembler";
-  case DW_LANG_hi_user:                  return "DW_LANG_hi_user";
+  default:
+    return nullptr;
+#define HANDLE_DW_LANG(ID, NAME)                                               \
+  case DW_LANG_##NAME:                                                         \
+    return "DW_LANG_" #NAME;
+#include "llvm/Support/Dwarf.def"
   }
-  return nullptr;
+}
+
+unsigned llvm::dwarf::getLanguage(StringRef LanguageString) {
+  return StringSwitch<unsigned>(LanguageString)
+#define HANDLE_DW_LANG(ID, NAME) .Case("DW_LANG_" #NAME, DW_LANG_##NAME)
+#include "llvm/Support/Dwarf.def"
+      .Default(0);
 }
 
 const char *llvm::dwarf::CaseString(unsigned Case) {
diff --git a/lib/Support/Errno.cpp b/lib/Support/Errno.cpp
index 1eefa3e..3ba2a12 100644
--- a/lib/Support/Errno.cpp
+++ b/lib/Support/Errno.cpp
@@ -35,12 +35,14 @@ std::string StrError() {
 #endif  // HAVE_ERRNO_H
 
 std::string StrError(int errnum) {
-  const int MaxErrStrLen = 2000;
-  char buffer[MaxErrStrLen];
-  buffer[0] = '\0';
   std::string str;
   if (errnum == 0)
     return str;
+#if defined(HAVE_STRERROR_R) || HAVE_DECL_STRERROR_S
+  const int MaxErrStrLen = 2000;
+  char buffer[MaxErrStrLen];
+  buffer[0] = '\0';
+#endif  
 
 #ifdef HAVE_STRERROR_R
   // strerror_r is thread-safe.
diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp
index 8e65066..a25e21a 100644
--- a/lib/Support/ErrorHandling.cpp
+++ b/lib/Support/ErrorHandling.cpp
@@ -19,10 +19,10 @@
 #include "llvm/Config/config.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
-#include "llvm/Support/Signals.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/Signals.h"
 #include "llvm/Support/Threading.h"
 #include "llvm/Support/WindowsError.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp
index c62655d..b176a8b 100644
--- a/lib/Support/FileOutputBuffer.cpp
+++ b/lib/Support/FileOutputBuffer.cpp
@@ -12,12 +12,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Errc.h"
-#include "llvm/Support/FileOutputBuffer.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <system_error>
 
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <io.h>
+#endif
+
 using llvm::sys::fs::mapped_file_region;
 
 namespace llvm {
@@ -71,10 +77,17 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size,
   if (EC)
     return EC;
 
+  EC = sys::fs::resize_file(FD, Size);
+  if (EC)
+    return EC;
+
   auto MappedFile = llvm::make_unique<mapped_file_region>(
-      FD, true, mapped_file_region::readwrite, Size, 0, EC);
+      FD, mapped_file_region::readwrite, Size, 0, EC);
+  int Ret = close(FD);
   if (EC)
     return EC;
+  if (Ret)
+    return std::error_code(errno, std::generic_category());
 
   Result.reset(
       new FileOutputBuffer(std::move(MappedFile), FilePath, TempFilePath));
@@ -82,16 +95,10 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size,
   return std::error_code();
 }
 
-std::error_code FileOutputBuffer::commit(int64_t NewSmallerSize) {
+std::error_code FileOutputBuffer::commit() {
   // Unmap buffer, letting OS flush dirty pages to file on disk.
   Region.reset();
 
-  // If requested, resize file as part of commit.
-  if ( NewSmallerSize != -1 ) {
-    std::error_code EC = sys::fs::resize_file(Twine(TempPath), NewSmallerSize);
-    if (EC)
-      return EC;
-  }
 
   // Rename file to final name.
   return sys::fs::rename(Twine(TempPath), Twine(FinalPath));
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 8782e2e..42bc342 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -17,8 +17,8 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Config/config.h"
-#include "llvm/Support/DataStream.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include <string.h>
 
@@ -49,6 +49,26 @@
 
 using namespace llvm;
 
+#if defined(__linux__)
+static ssize_t LLVM_ATTRIBUTE_UNUSED readCpuInfo(void *Buf, size_t Size) {
+  // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
+  // memory buffer because the 'file' has 0 size (it can be read from only
+  // as a stream).
+
+  int FD;
+  std::error_code EC = sys::fs::openFileForRead("/proc/cpuinfo", FD);
+  if (EC) {
+    DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << EC.message() << "\n");
+    return -1;
+  }
+  int Ret = read(FD, Buf, Size);
+  int CloseStatus = close(FD);
+  if (CloseStatus)
+    return -1;
+  return Ret;
+}
+#endif
+
 #if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
  || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
 
@@ -117,18 +137,13 @@ static bool GetX86CpuIDAndInfoEx(unsigned value, unsigned subleaf,
             "c" (subleaf));
     return false;
   #elif defined(_MSC_VER)
-    // __cpuidex was added in MSVC++ 9.0 SP1
-    #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
-      int registers[4];
-      __cpuidex(registers, value, subleaf);
-      *rEAX = registers[0];
-      *rEBX = registers[1];
-      *rECX = registers[2];
-      *rEDX = registers[3];
-      return false;
-    #else
-      return true;
-    #endif
+    int registers[4];
+    __cpuidex(registers, value, subleaf);
+    *rEAX = registers[0];
+    *rEBX = registers[1];
+    *rECX = registers[2];
+    *rEDX = registers[3];
+    return false;
   #else
     return true;
   #endif
@@ -489,22 +504,12 @@ StringRef sys::getHostCPUName() {
   // processor type. On Linux, this is exposed through the /proc/cpuinfo file.
   const char *generic = "generic";
 
-  // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
-  // memory buffer because the 'file' has 0 size (it can be read from only
-  // as a stream).
-
-  std::string Err;
-  DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
-  if (!DS) {
-    DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
-    return generic;
-  }
-
   // The cpu line is second (after the 'processor: 0' line), so if this
   // buffer is too small then something has changed (or is wrong).
   char buffer[1024];
-  size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
-  delete DS;
+  ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
+  if (CPUInfoSize == -1)
+    return generic;
 
   const char *CPUInfoStart = buffer;
   const char *CPUInfoEnd = buffer + CPUInfoSize;
@@ -578,22 +583,13 @@ StringRef sys::getHostCPUName() {
 StringRef sys::getHostCPUName() {
   // The cpuid register on arm is not accessible from user space. On Linux,
   // it is exposed through the /proc/cpuinfo file.
-  // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
-  // memory buffer because the 'file' has 0 size (it can be read from only
-  // as a stream).
-
-  std::string Err;
-  DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
-  if (!DS) {
-    DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
-    return "generic";
-  }
 
   // Read 1024 bytes from /proc/cpuinfo, which should contain the CPU part line
   // in all cases.
   char buffer[1024];
-  size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
-  delete DS;
+  ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
+  if (CPUInfoSize == -1)
+    return "generic";
 
   StringRef Str(buffer, CPUInfoSize);
 
@@ -643,22 +639,13 @@ StringRef sys::getHostCPUName() {
 #elif defined(__linux__) && defined(__s390x__)
 StringRef sys::getHostCPUName() {
   // STIDP is a privileged operation, so use /proc/cpuinfo instead.
-  // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
-  // memory buffer because the 'file' has 0 size (it can be read from only
-  // as a stream).
-
-  std::string Err;
-  DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
-  if (!DS) {
-    DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
-    return "generic";
-  }
 
   // The "processor 0:" line comes after a fair amount of other information,
   // including a cache breakdown, but this should be plenty.
   char buffer[2048];
-  size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
-  delete DS;
+  ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
+  if (CPUInfoSize == -1)
+    return "generic";
 
   StringRef Str(buffer, CPUInfoSize);
   SmallVector<StringRef, 32> Lines;
@@ -690,18 +677,12 @@ StringRef sys::getHostCPUName() {
 
 #if defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
 bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
-  std::string Err;
-  DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
-  if (!DS) {
-    DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
-    return false;
-  }
-
   // Read 1024 bytes from /proc/cpuinfo, which should contain the Features line
   // in all cases.
   char buffer[1024];
-  size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
-  delete DS;
+  ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
+  if (CPUInfoSize == -1)
+    return false;
 
   StringRef Str(buffer, CPUInfoSize);
 
diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp
index 5b82c36..ec3158c 100644
--- a/lib/Support/LockFileManager.cpp
+++ b/lib/Support/LockFileManager.cpp
@@ -186,9 +186,8 @@ LockFileManager::WaitForUnlockResult LockFileManager::waitForUnlock() {
   Interval.tv_sec = 0;
   Interval.tv_nsec = 1000000;
 #endif
-  // Don't wait more than five minutes for the file to appear.
-  unsigned MaxSeconds = 300;
-  bool LockFileGone = false;
+  // Don't wait more than one minute for the file to appear.
+  const unsigned MaxSeconds = 60;
   do {
     // Sleep for the designated interval, to allow the owning process time to
     // finish up and remove the lock file.
@@ -199,47 +198,18 @@ LockFileManager::WaitForUnlockResult LockFileManager::waitForUnlock() {
 #else
     nanosleep(&Interval, nullptr);
 #endif
-    bool LockFileJustDisappeared = false;
 
-    // If the lock file is still expected to be there, check whether it still
-    // is.
-    if (!LockFileGone) {
-      if (sys::fs::access(LockFileName.c_str(), sys::fs::AccessMode::Exist) ==
-          errc::no_such_file_or_directory) {
-        LockFileGone = true;
-        LockFileJustDisappeared = true;
-      }
+    if (sys::fs::access(LockFileName.c_str(), sys::fs::AccessMode::Exist) ==
+        errc::no_such_file_or_directory) {
+      // If the original file wasn't created, somone thought the lock was dead.
+      if (!sys::fs::exists(FileName.str()))
+        return Res_OwnerDied;
+      return Res_Success;
     }
 
-    // If the lock file is no longer there, check if the original file is
-    // available now.
-    if (LockFileGone) {
-      if (sys::fs::exists(FileName.str())) {
-        return Res_Success;
-      }
-
-      // The lock file is gone, so now we're waiting for the original file to
-      // show up. If this just happened, reset our waiting intervals and keep
-      // waiting.
-      if (LockFileJustDisappeared) {
-        MaxSeconds = 5;
-
-#if LLVM_ON_WIN32
-        Interval = 1;
-#else
-        Interval.tv_sec = 0;
-        Interval.tv_nsec = 1000000;
-#endif
-        continue;
-      }
-    }
-
-    // If we're looking for the lock file to disappear, but the process
-    // owning the lock died without cleaning up, just bail out.
-    if (!LockFileGone &&
-        !processStillExecuting((*Owner).first, (*Owner).second)) {
+    // If the process owning the lock died without cleaning up, just bail out.
+    if (!processStillExecuting((*Owner).first, (*Owner).second))
       return Res_OwnerDied;
-    }
 
     // Exponentially increase the time we wait for the lock to be removed.
 #if LLVM_ON_WIN32
@@ -263,3 +233,7 @@ LockFileManager::WaitForUnlockResult LockFileManager::waitForUnlock() {
   // Give up.
   return Res_Timeout;
 }
+
+std::error_code LockFileManager::unsafeRemoveLockFile() {
+  return sys::fs::remove(LockFileName.str());
+}
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index 7eb0752..379db88 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -203,8 +203,8 @@ class MemoryBufferMMapFile : public MemoryBuffer {
 
 public:
   MemoryBufferMMapFile(bool RequiresNullTerminator, int FD, uint64_t Len,
-                       uint64_t Offset, std::error_code EC)
-      : MFR(FD, false, sys::fs::mapped_file_region::readonly,
+                       uint64_t Offset, std::error_code &EC)
+      : MFR(FD, sys::fs::mapped_file_region::readonly,
             getLegalMapSize(Len, Offset), getLegalMapOffset(Offset), EC) {
     if (!EC) {
       const char *Start = getStart(Len, Offset);
@@ -330,7 +330,7 @@ static ErrorOr<std::unique_ptr<MemoryBuffer>>
 getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
                 uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
                 bool IsVolatileSize) {
-  static int PageSize = sys::process::get_self()->page_size();
+  static int PageSize = sys::Process::getPageSize();
 
   // Default is to map the full file.
   if (MapSize == uint64_t(-1)) {
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index a7a9919..a11bb7f 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -14,9 +14,9 @@
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Errc.h"
-#include "llvm/Support/Path.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/Process.h"
 #include <cctype>
 #include <cstdio>
@@ -888,6 +888,14 @@ bool is_other(file_status status) {
          !is_directory(status);
 }
 
+std::error_code is_other(const Twine &Path, bool &Result) {
+  file_status FileStatus;
+  if (std::error_code EC = status(Path, FileStatus))
+    return EC;
+  Result = is_other(FileStatus);
+  return std::error_code();
+}
+
 void directory_entry::replace_filename(const Twine &filename, file_status st) {
   SmallString<128> path(Path.begin(), Path.end());
   path::remove_filename(path);
@@ -952,7 +960,7 @@ file_magic identify_magic(StringRef Magic) {
         unsigned low  = Data2MSB ? 17 : 16;
         if (Magic[high] == 0)
           switch (Magic[low]) {
-            default: break;
+            default: return file_magic::elf;
             case 1: return file_magic::elf_relocatable;
             case 2: return file_magic::elf_executable;
             case 3: return file_magic::elf_shared_object;
@@ -1004,6 +1012,7 @@ file_magic identify_magic(StringRef Magic) {
         case 8: return file_magic::macho_bundle;
         case 9: return file_magic::macho_dynamically_linked_shared_lib_stub;
         case 10: return file_magic::macho_dsym_companion;
+        case 11: return file_magic::macho_kext_bundle;
       }
       break;
     }
diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp
index 987778a..f9f8cab 100644
--- a/lib/Support/PrettyStackTrace.cpp
+++ b/lib/Support/PrettyStackTrace.cpp
@@ -16,9 +16,8 @@
 #include "llvm-c/Core.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Config/config.h"     // Get autoconf configuration settings
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Signals.h"
-#include "llvm/Support/ThreadLocal.h"
 #include "llvm/Support/Watchdog.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -28,7 +27,17 @@
 
 using namespace llvm;
 
-static ManagedStatic<sys::ThreadLocal<const PrettyStackTraceEntry> > PrettyStackTraceHead;
+// If backtrace support is not enabled, compile out support for pretty stack
+// traces.  This has the secondary effect of not requiring thread local storage
+// when backtrace support is disabled.
+#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
+
+// We need a thread local pointer to manage the stack of our stack trace
+// objects, but we *really* cannot tolerate destructors running and do not want
+// to pay any overhead of synchronizing. As a consequence, we use a raw
+// thread-local variable.
+static LLVM_THREAD_LOCAL const PrettyStackTraceEntry *PrettyStackTraceHead =
+    nullptr;
 
 static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){
   unsigned NextID = 0;
@@ -46,12 +55,12 @@ static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){
 /// PrintCurStackTrace - Print the current stack trace to the specified stream.
 static void PrintCurStackTrace(raw_ostream &OS) {
   // Don't print an empty trace.
-  if (!PrettyStackTraceHead->get()) return;
+  if (!PrettyStackTraceHead) return;
   
   // If there are pretty stack frames registered, walk and emit them.
   OS << "Stack dump:\n";
   
-  PrintStack(PrettyStackTraceHead->get(), OS);
+  PrintStack(PrettyStackTraceHead, OS);
   OS.flush();
 }
 
@@ -99,28 +108,23 @@ static void CrashHandler(void *) {
 #endif
 }
 
+// defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
+#endif
+
 PrettyStackTraceEntry::PrettyStackTraceEntry() {
+#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
   // Link ourselves.
-  NextEntry = PrettyStackTraceHead->get();
-  PrettyStackTraceHead->set(this);
+  NextEntry = PrettyStackTraceHead;
+  PrettyStackTraceHead = this;
+#endif
 }
 
 PrettyStackTraceEntry::~PrettyStackTraceEntry() {
-  // Do nothing if PrettyStackTraceHead is uninitialized. This can only happen
-  // if a shutdown occurred after we created the PrettyStackTraceEntry. That
-  // does occur in the following idiom:
-  //
-  // PrettyStackTraceProgram X(...);
-  // llvm_shutdown_obj Y;
-  //
-  // Without this check, we may end up removing ourselves from the stack trace
-  // after PrettyStackTraceHead has already been destroyed.
-  if (!PrettyStackTraceHead.isConstructed())
-    return;
-  
-  assert(PrettyStackTraceHead->get() == this &&
+#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
+  assert(PrettyStackTraceHead == this &&
          "Pretty stack trace entry destruction is out of order");
-  PrettyStackTraceHead->set(getNextEntry());
+  PrettyStackTraceHead = getNextEntry();
+#endif
 }
 
 void PrettyStackTraceString::print(raw_ostream &OS) const {
@@ -135,15 +139,19 @@ void PrettyStackTraceProgram::print(raw_ostream &OS) const {
   OS << '\n';
 }
 
+#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
 static bool RegisterCrashPrinter() {
   sys::AddSignalHandler(CrashHandler, nullptr);
   return false;
 }
+#endif
 
 void llvm::EnablePrettyStackTrace() {
+#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
   // The first time this is called, we register the crash printer.
   static bool HandlerRegistered = RegisterCrashPrinter();
   (void)HandlerRegistered;
+#endif
 }
 
 void LLVMEnablePrettyStackTrace() {
diff --git a/lib/Support/Process.cpp b/lib/Support/Process.cpp
index 0d42e0e..ad67e1b 100644
--- a/lib/Support/Process.cpp
+++ b/lib/Support/Process.cpp
@@ -26,25 +26,6 @@ using namespace sys;
 //===          independent code.
 //===----------------------------------------------------------------------===//
 
-// Empty virtual destructor to anchor the vtable for the process class.
-process::~process() {}
-
-self_process *process::get_self() {
-  // Use a function local static for thread safe initialization and allocate it
-  // as a raw pointer to ensure it is never destroyed.
-  static self_process *SP = new self_process();
-
-  return SP;
-}
-
-// The destructor for the self_process subclass must never actually be
-// executed. There should be at most one instance of this class, and that
-// instance should live until the process terminates to avoid the potential for
-// racy accesses during shutdown.
-self_process::~self_process() {
-  llvm_unreachable("This destructor must never be executed!");
-}
-
 /// \brief A helper function to compute the elapsed wall-time since the program
 /// started.
 ///
@@ -63,12 +44,6 @@ static TimeValue getElapsedWallTime() {
 /// create race conditions during program startup or shutdown.
 static volatile TimeValue DummyTimeValue = getElapsedWallTime();
 
-// Implement this routine by using the static helpers above. They're already
-// portable.
-TimeValue self_process::get_wall_time() const {
-  return getElapsedWallTime();
-}
-
 Optional<std::string> Process::FindInEnvPath(const std::string& EnvName,
                                              const std::string& FileName)
 {
diff --git a/lib/Support/RandomNumberGenerator.cpp b/lib/Support/RandomNumberGenerator.cpp
index c50e7cb..2943137 100644
--- a/lib/Support/RandomNumberGenerator.cpp
+++ b/lib/Support/RandomNumberGenerator.cpp
@@ -7,16 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements random number generation (RNG).
+// This file implements deterministic random number generation (RNG).
 // The current implementation is NOT cryptographically secure as it uses
 // the C++11 <random> facilities.
 //
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "rng"
-#include "llvm/Support/RandomNumberGenerator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/RandomNumberGenerator.h"
 
 using namespace llvm;
 
@@ -31,31 +31,25 @@ Seed("rng-seed", cl::value_desc("seed"),
 RandomNumberGenerator::RandomNumberGenerator(StringRef Salt) {
   DEBUG(
     if (Seed == 0)
-      errs() << "Warning! Using unseeded random number generator.\n"
+      dbgs() << "Warning! Using unseeded random number generator.\n"
   );
 
-  // Combine seed and salt using std::seed_seq.
-  // Entropy: Seed-low, Seed-high, Salt...
+  // Combine seed and salts using std::seed_seq.
+  // Data: Seed-low, Seed-high, Salt
+  // Note: std::seed_seq can only store 32-bit values, even though we
+  // are using a 64-bit RNG. This isn't a problem since the Mersenne
+  // twister constructor copies these correctly into its initial state.
   std::vector<uint32_t> Data;
-  Data.reserve(2 + Salt.size()/4 + 1);
+  Data.reserve(2 + Salt.size());
   Data.push_back(Seed);
   Data.push_back(Seed >> 32);
 
-  uint32_t Pack = 0;
-  for (size_t I = 0; I < Salt.size(); ++I) {
-    Pack <<= 8;
-    Pack += Salt[I];
-
-    if (I%4 == 3)
-      Data.push_back(Pack);
-  }
-  Data.push_back(Pack);
+  std::copy(Salt.begin(), Salt.end(), Data.end());
 
   std::seed_seq SeedSeq(Data.begin(), Data.end());
   Generator.seed(SeedSeq);
 }
 
-uint64_t RandomNumberGenerator::next(uint64_t Max) {
-  std::uniform_int_distribution<uint64_t> distribution(0, Max - 1);
-  return distribution(Generator);
+uint_fast64_t RandomNumberGenerator::operator()() {
+  return Generator();
 }
diff --git a/lib/Support/ScaledNumber.cpp b/lib/Support/ScaledNumber.cpp
index fc6d4e7..6f6699c 100644
--- a/lib/Support/ScaledNumber.cpp
+++ b/lib/Support/ScaledNumber.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/ScaledNumber.h"
-
 #include "llvm/ADT/APFloat.h"
 #include "llvm/Support/Debug.h"
 
@@ -169,6 +168,7 @@ static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) {
   int Shift = 63 - (NewE - E);
   assert(Shift <= LeadingZeros);
   assert(Shift == LeadingZeros || NewE == ScaledNumbers::MaxScale);
+  assert(Shift >= 0 && Shift < 64 && "undefined behavior");
   D <<= Shift;
   E = NewE;
 
diff --git a/lib/Support/SmallPtrSet.cpp b/lib/Support/SmallPtrSet.cpp
index c87ee7d..358c8e8 100644
--- a/lib/Support/SmallPtrSet.cpp
+++ b/lib/Support/SmallPtrSet.cpp
@@ -50,11 +50,12 @@ SmallPtrSetImplBase::insert_imp(const void *Ptr) {
     }
     // Otherwise, hit the big set case, which will call grow.
   }
-  
-  if (NumElements*4 >= CurArraySize*3) {
+
+  if (LLVM_UNLIKELY(NumElements * 4 >= CurArraySize * 3)) {
     // If more than 3/4 of the array is full, grow.
     Grow(CurArraySize < 64 ? 128 : CurArraySize*2);
-  } else if (CurArraySize-(NumElements+NumTombstones) < CurArraySize/8) {
+  } else if (LLVM_UNLIKELY(CurArraySize - (NumElements + NumTombstones) <
+                           CurArraySize / 8)) {
     // If fewer of 1/8 of the array is empty (meaning that many are filled with
     // tombstones), rehash.
     Grow(CurArraySize);
@@ -107,16 +108,16 @@ const void * const *SmallPtrSetImplBase::FindBucketFor(const void *Ptr) const {
   const void *const *Array = CurArray;
   const void *const *Tombstone = nullptr;
   while (1) {
-    // Found Ptr's bucket?
-    if (Array[Bucket] == Ptr)
-      return Array+Bucket;
-    
     // If we found an empty bucket, the pointer doesn't exist in the set.
     // Return a tombstone if we've seen one so far, or the empty bucket if
     // not.
-    if (Array[Bucket] == getEmptyMarker())
+    if (LLVM_LIKELY(Array[Bucket] == getEmptyMarker()))
       return Tombstone ? Tombstone : Array+Bucket;
-    
+
+    // Found Ptr's bucket?
+    if (LLVM_LIKELY(Array[Bucket] == Ptr))
+      return Array+Bucket;
+
     // If this is a tombstone, remember it.  If Ptr ends up not in the set, we
     // prefer to return it than something that would require more probing.
     if (Array[Bucket] == getTombstoneMarker() && !Tombstone)
diff --git a/lib/Support/SpecialCaseList.cpp b/lib/Support/SpecialCaseList.cpp
index 785cc60..c312cc1 100644
--- a/lib/Support/SpecialCaseList.cpp
+++ b/lib/Support/SpecialCaseList.cpp
@@ -46,19 +46,27 @@ struct SpecialCaseList::Entry {
   }
 };
 
-SpecialCaseList::SpecialCaseList() : Entries() {}
+SpecialCaseList::SpecialCaseList() : Entries(), Regexps(), IsCompiled(false) {}
 
-std::unique_ptr<SpecialCaseList> SpecialCaseList::create(StringRef Path,
-                                                         std::string &Error) {
-  if (Path.empty())
-    return std::unique_ptr<SpecialCaseList>(new SpecialCaseList());
-  ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
-      MemoryBuffer::getFile(Path);
-  if (std::error_code EC = FileOrErr.getError()) {
-    Error = (Twine("Can't open file '") + Path + "': " + EC.message()).str();
-    return nullptr;
+std::unique_ptr<SpecialCaseList>
+SpecialCaseList::create(const std::vector<std::string> &Paths,
+                        std::string &Error) {
+  std::unique_ptr<SpecialCaseList> SCL(new SpecialCaseList());
+  for (auto Path : Paths) {
+    ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+        MemoryBuffer::getFile(Path);
+    if (std::error_code EC = FileOrErr.getError()) {
+      Error = (Twine("can't open file '") + Path + "': " + EC.message()).str();
+      return nullptr;
+    }
+    std::string ParseError;
+    if (!SCL->parse(FileOrErr.get().get(), ParseError)) {
+      Error = (Twine("error parsing file '") + Path + "': " + ParseError).str();
+      return nullptr;
+    }
   }
-  return create(FileOrErr.get().get(), Error);
+  SCL->compile();
+  return SCL;
 }
 
 std::unique_ptr<SpecialCaseList> SpecialCaseList::create(const MemoryBuffer *MB,
@@ -66,12 +74,14 @@ std::unique_ptr<SpecialCaseList> SpecialCaseList::create(const MemoryBuffer *MB,
   std::unique_ptr<SpecialCaseList> SCL(new SpecialCaseList());
   if (!SCL->parse(MB, Error))
     return nullptr;
+  SCL->compile();
   return SCL;
 }
 
-std::unique_ptr<SpecialCaseList> SpecialCaseList::createOrDie(StringRef Path) {
+std::unique_ptr<SpecialCaseList>
+SpecialCaseList::createOrDie(const std::vector<std::string> &Paths) {
   std::string Error;
-  if (auto SCL = create(Path, Error))
+  if (auto SCL = create(Paths, Error))
     return SCL;
   report_fatal_error(Error);
 }
@@ -80,12 +90,8 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) {
   // Iterate through each line in the blacklist file.
   SmallVector<StringRef, 16> Lines;
   SplitString(MB->getBuffer(), Lines, "\n\r");
-  StringMap<StringMap<std::string> > Regexps;
-  assert(Entries.empty() &&
-         "parse() should be called on an empty SpecialCaseList");
   int LineNo = 1;
-  for (SmallVectorImpl<StringRef>::iterator I = Lines.begin(), E = Lines.end();
-       I != E; ++I, ++LineNo) {
+  for (auto I = Lines.begin(), E = Lines.end(); I != E; ++I, ++LineNo) {
     // Ignore empty lines and lines starting with "#"
     if (I->empty() || I->startswith("#"))
       continue;
@@ -94,7 +100,7 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) {
     StringRef Prefix = SplitLine.first;
     if (SplitLine.second.empty()) {
       // Missing ':' in the line.
-      Error = (Twine("Malformed line ") + Twine(LineNo) + ": '" +
+      Error = (Twine("malformed line ") + Twine(LineNo) + ": '" +
                SplitLine.first + "'").str();
       return false;
     }
@@ -119,7 +125,7 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) {
     Regex CheckRE(Regexp);
     std::string REError;
     if (!CheckRE.isValid(REError)) {
-      Error = (Twine("Malformed regex in line ") + Twine(LineNo) + ": '" +
+      Error = (Twine("malformed regex in line ") + Twine(LineNo) + ": '" +
                SplitLine.second + "': " + REError).str();
       return false;
     }
@@ -129,10 +135,14 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) {
       Regexps[Prefix][Category] += "|";
     Regexps[Prefix][Category] += "^" + Regexp + "$";
   }
+  return true;
+}
 
+void SpecialCaseList::compile() {
+  assert(!IsCompiled && "compile() should only be called once");
   // Iterate through each of the prefixes, and create Regexs for them.
-  for (StringMap<StringMap<std::string> >::const_iterator I = Regexps.begin(),
-                                                          E = Regexps.end();
+  for (StringMap<StringMap<std::string>>::const_iterator I = Regexps.begin(),
+                                                         E = Regexps.end();
        I != E; ++I) {
     for (StringMap<std::string>::const_iterator II = I->second.begin(),
                                                 IE = I->second.end();
@@ -140,13 +150,15 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) {
       Entries[I->getKey()][II->getKey()].RegEx.reset(new Regex(II->getValue()));
     }
   }
-  return true;
+  Regexps.clear();
+  IsCompiled = true;
 }
 
 SpecialCaseList::~SpecialCaseList() {}
 
 bool SpecialCaseList::inSection(StringRef Section, StringRef Query,
                                 StringRef Category) const {
+  assert(IsCompiled && "SpecialCaseList::compile() was not called!");
   StringMap<StringMap<Entry> >::const_iterator I = Entries.find(Section);
   if (I == Entries.end()) return false;
   StringMap<Entry>::const_iterator II = I->second.find(Category);
diff --git a/lib/Support/StreamingMemoryObject.cpp b/lib/Support/StreamingMemoryObject.cpp
index 68beeef..f39bc56 100644
--- a/lib/Support/StreamingMemoryObject.cpp
+++ b/lib/Support/StreamingMemoryObject.cpp
@@ -45,8 +45,8 @@ private:
     return static_cast<std::ptrdiff_t>(address) < LastChar - FirstChar;
   }
 
-  RawMemoryObject(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
-  void operator=(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
+  RawMemoryObject(const RawMemoryObject&) = delete;
+  void operator=(const RawMemoryObject&) = delete;
 };
 
 uint64_t RawMemoryObject::readBytes(uint8_t *Buf, uint64_t Size,
diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp
index ddb7349..7be9466 100644
--- a/lib/Support/StringMap.cpp
+++ b/lib/Support/StringMap.cpp
@@ -188,9 +188,10 @@ unsigned StringMapImpl::RehashTable(unsigned BucketNo) {
   // If the hash table is now more than 3/4 full, or if fewer than 1/8 of
   // the buckets are empty (meaning that many are filled with tombstones),
   // grow/rehash the table.
-  if (NumItems*4 > NumBuckets*3) {
+  if (LLVM_UNLIKELY(NumItems * 4 > NumBuckets * 3)) {
     NewSize = NumBuckets*2;
-  } else if (NumBuckets-(NumItems+NumTombstones) <= NumBuckets/8) {
+  } else if (LLVM_UNLIKELY(NumBuckets - (NumItems + NumTombstones) <=
+                           NumBuckets / 8)) {
     NewSize = NumBuckets;
   } else {
     return BucketNo;
diff --git a/lib/Support/ThreadLocal.cpp b/lib/Support/ThreadLocal.cpp
index 2dec9eb..9da1603 100644
--- a/lib/Support/ThreadLocal.cpp
+++ b/lib/Support/ThreadLocal.cpp
@@ -31,58 +31,14 @@ void ThreadLocalImpl::setInstance(const void* d) {
   void **pd = reinterpret_cast<void**>(&data);
   *pd = const_cast<void*>(d);
 }
-const void* ThreadLocalImpl::getInstance() {
+void *ThreadLocalImpl::getInstance() {
   void **pd = reinterpret_cast<void**>(&data);
   return *pd;
 }
 void ThreadLocalImpl::removeInstance() {
-  setInstance(0);
-}
-}
-#else
-
-#if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_GETSPECIFIC)
-
-#include <cassert>
-#include <pthread.h>
-#include <stdlib.h>
-
-namespace llvm {
-using namespace sys;
-
-ThreadLocalImpl::ThreadLocalImpl() : data() {
-  static_assert(sizeof(pthread_key_t) <= sizeof(data), "size too big");
-  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
-  int errorcode = pthread_key_create(key, nullptr);
-  assert(errorcode == 0);
-  (void) errorcode;
-}
-
-ThreadLocalImpl::~ThreadLocalImpl() {
-  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
-  int errorcode = pthread_key_delete(*key);
-  assert(errorcode == 0);
-  (void) errorcode;
-}
-
-void ThreadLocalImpl::setInstance(const void* d) {
-  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
-  int errorcode = pthread_setspecific(*key, d);
-  assert(errorcode == 0);
-  (void) errorcode;
-}
-
-const void* ThreadLocalImpl::getInstance() {
-  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
-  return pthread_getspecific(*key);
-}
-
-void ThreadLocalImpl::removeInstance() {
   setInstance(nullptr);
 }
-
 }
-
 #elif defined(LLVM_ON_UNIX)
 #include "Unix/ThreadLocal.inc"
 #elif defined( LLVM_ON_WIN32)
@@ -90,4 +46,3 @@ void ThreadLocalImpl::removeInstance() {
 #else
 #warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 set in Support/ThreadLocal.cpp
 #endif
-#endif
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index 4a4773e..e74b23c 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -23,6 +23,7 @@ const char *Triple::getArchTypeName(ArchType Kind) {
   case aarch64_be:  return "aarch64_be";
   case arm:         return "arm";
   case armeb:       return "armeb";
+  case bpf:         return "bpf";
   case hexagon:     return "hexagon";
   case mips:        return "mips";
   case mipsel:      return "mipsel";
@@ -33,6 +34,7 @@ const char *Triple::getArchTypeName(ArchType Kind) {
   case ppc64le:     return "powerpc64le";
   case ppc:         return "powerpc";
   case r600:        return "r600";
+  case amdgcn:      return "amdgcn";
   case sparc:       return "sparc";
   case sparcv9:     return "sparcv9";
   case systemz:     return "s390x";
@@ -82,7 +84,10 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
 
   case hexagon:     return "hexagon";
 
-  case r600:        return "r600";
+  case amdgcn:
+  case r600:        return "amdgpu";
+
+  case bpf:         return "bpf";
 
   case sparcv9:
   case sparc:       return "sparc";
@@ -157,6 +162,8 @@ const char *Triple::getOSTypeName(OSType Kind) {
   case AIX: return "aix";
   case CUDA: return "cuda";
   case NVCL: return "nvcl";
+  case AMDHSA: return "amdhsa";
+  case PS4: return "ps4";
   }
 
   llvm_unreachable("Invalid OSType");
@@ -188,6 +195,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
     .Case("arm64", aarch64) // "arm64" is an alias for "aarch64"
     .Case("arm", arm)
     .Case("armeb", armeb)
+    .Case("bpf", bpf)
     .Case("mips", mips)
     .Case("mipsel", mipsel)
     .Case("mips64", mips64)
@@ -198,6 +206,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
     .Case("ppc", ppc)
     .Case("ppc64le", ppc64le)
     .Case("r600", r600)
+    .Case("amdgcn", amdgcn)
     .Case("hexagon", hexagon)
     .Case("sparc", sparc)
     .Case("sparcv9", sparcv9)
@@ -242,13 +251,21 @@ static Triple::ArchType parseARMArch(StringRef ArchName) {
 
   if (ArchName.startswith("armv")) {
     offset = 3;
-    arch = Triple::arm;
+    if (ArchName.endswith("eb")) {
+      arch = Triple::armeb;
+      ArchName = ArchName.substr(0, ArchName.size() - 2);
+    } else
+      arch = Triple::arm;
   } else if (ArchName.startswith("armebv")) {
     offset = 5;
     arch = Triple::armeb;
   } else if (ArchName.startswith("thumbv")) {
     offset = 5;
-    arch = Triple::thumb;
+    if (ArchName.endswith("eb")) {
+      arch = Triple::thumbeb;
+      ArchName = ArchName.substr(0, ArchName.size() - 2);
+    } else
+      arch = Triple::thumb;
   } else if (ArchName.startswith("thumbebv")) {
     offset = 7;
     arch = Triple::thumbeb;
@@ -258,7 +275,7 @@ static Triple::ArchType parseARMArch(StringRef ArchName) {
     .Cases("v3", "v3m", isThumb ? Triple::UnknownArch : arch)
     .Cases("v4", "v4t", arch)
     .Cases("v5", "v5e", "v5t", "v5te", "v5tej", arch)
-    .Cases("v6", "v6j", "v6k", "v6m", arch)
+    .Cases("v6", "v6j", "v6k", "v6m", "v6sm", arch)
     .Cases("v6t2", "v6z", "v6zk", arch)
     .Cases("v7", "v7a", "v7em", "v7l", arch)
     .Cases("v7m", "v7r", "v7s", arch)
@@ -267,6 +284,8 @@ static Triple::ArchType parseARMArch(StringRef ArchName) {
 }
 
 static Triple::ArchType parseArch(StringRef ArchName) {
+  Triple::ArchType ARMArch(parseARMArch(ArchName));
+
   return StringSwitch<Triple::ArchType>(ArchName)
     .Cases("i386", "i486", "i586", "i686", Triple::x86)
     // FIXME: Do we need to support these?
@@ -276,15 +295,18 @@ static Triple::ArchType parseArch(StringRef ArchName) {
     .Cases("powerpc64", "ppu", Triple::ppc64)
     .Case("powerpc64le", Triple::ppc64le)
     .Case("xscale", Triple::arm)
-    .StartsWith("arm", parseARMArch(ArchName))
-    .StartsWith("thumb", parseARMArch(ArchName))
-    .StartsWith("aarch64", parseARMArch(ArchName))
+    .Case("xscaleeb", Triple::armeb)
+    .StartsWith("arm", ARMArch)
+    .StartsWith("thumb", ARMArch)
+    .StartsWith("aarch64", ARMArch)
     .Case("msp430", Triple::msp430)
     .Cases("mips", "mipseb", "mipsallegrex", Triple::mips)
     .Cases("mipsel", "mipsallegrexel", Triple::mipsel)
     .Cases("mips64", "mips64eb", Triple::mips64)
     .Case("mips64el", Triple::mips64el)
     .Case("r600", Triple::r600)
+    .Case("amdgcn", Triple::amdgcn)
+    .Case("bpf", Triple::bpf)
     .Case("hexagon", Triple::hexagon)
     .Case("s390x", Triple::systemz)
     .Case("sparc", Triple::sparc)
@@ -345,6 +367,8 @@ static Triple::OSType parseOS(StringRef OSName) {
     .StartsWith("aix", Triple::AIX)
     .StartsWith("cuda", Triple::CUDA)
     .StartsWith("nvcl", Triple::NVCL)
+    .StartsWith("amdhsa", Triple::AMDHSA)
+    .StartsWith("ps4", Triple::PS4)
     .Default(Triple::UnknownOS);
 }
 
@@ -373,6 +397,9 @@ static Triple::ObjectFormatType parseFormat(StringRef EnvironmentName) {
 }
 
 static Triple::SubArchType parseSubArch(StringRef SubArchName) {
+  if (SubArchName.endswith("eb"))
+    SubArchName = SubArchName.substr(0, SubArchName.size() - 2);
+
   return StringSwitch<Triple::SubArchType>(SubArchName)
     .EndsWith("v8", Triple::ARMSubArch_v8)
     .EndsWith("v8a", Triple::ARMSubArch_v8)
@@ -385,6 +412,7 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
     .EndsWith("v7s", Triple::ARMSubArch_v7s)
     .EndsWith("v6", Triple::ARMSubArch_v6)
     .EndsWith("v6m", Triple::ARMSubArch_v6m)
+    .EndsWith("v6sm", Triple::ARMSubArch_v6m)
     .EndsWith("v6t2", Triple::ARMSubArch_v6t2)
     .EndsWith("v5", Triple::ARMSubArch_v5)
     .EndsWith("v5e", Triple::ARMSubArch_v5)
@@ -788,7 +816,11 @@ void Triple::setOS(OSType Kind) {
 }
 
 void Triple::setEnvironment(EnvironmentType Kind) {
-  setEnvironmentName(getEnvironmentTypeName(Kind));
+  if (ObjectFormat == getDefaultFormat(*this))
+    return setEnvironmentName(getEnvironmentTypeName(Kind));
+
+  setEnvironmentName((getEnvironmentTypeName(Kind) + Twine("-") +
+                      getObjectFormatTypeName(ObjectFormat)).str());
 }
 
 void Triple::setObjectFormat(ObjectFormatType Kind) {
@@ -862,6 +894,8 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
 
   case llvm::Triple::aarch64:
   case llvm::Triple::aarch64_be:
+  case llvm::Triple::amdgcn:
+  case llvm::Triple::bpf:
   case llvm::Triple::le64:
   case llvm::Triple::mips64:
   case llvm::Triple::mips64el:
@@ -897,6 +931,8 @@ Triple Triple::get32BitArchVariant() const {
   case Triple::UnknownArch:
   case Triple::aarch64:
   case Triple::aarch64_be:
+  case Triple::amdgcn:
+  case Triple::bpf:
   case Triple::msp430:
   case Triple::systemz:
   case Triple::ppc64le:
@@ -958,8 +994,10 @@ Triple Triple::get64BitArchVariant() const {
 
   case Triple::aarch64:
   case Triple::aarch64_be:
+  case Triple::bpf:
   case Triple::le64:
   case Triple::amdil64:
+  case Triple::amdgcn:
   case Triple::hsail64:
   case Triple::spir64:
   case Triple::mips64:
@@ -1013,6 +1051,8 @@ const char *Triple::getARMCPUForArch(StringRef MArch) const {
     offset = 5;
   if (offset != StringRef::npos && MArch.substr(offset, 2) == "eb")
     offset += 2;
+  if (MArch.endswith("eb"))
+    MArch = MArch.substr(0, MArch.size() - 2);
   if (offset != StringRef::npos)
     result = llvm::StringSwitch<const char *>(MArch.substr(offset))
       .Cases("v2", "v2a", "arm2")
@@ -1027,7 +1067,7 @@ const char *Triple::getARMCPUForArch(StringRef MArch) const {
       .Case("v6j", "arm1136j-s")
       .Cases("v6z", "v6zk", "arm1176jzf-s")
       .Case("v6t2", "arm1156t2-s")
-      .Cases("v6m", "v6-m", "cortex-m0")
+      .Cases("v6m", "v6-m", "v6sm", "v6s-m", "cortex-m0")
       .Cases("v7", "v7a", "v7-a", "v7l", "v7-l", "cortex-a8")
       .Cases("v7s", "v7-s", "swift")
       .Cases("v7r", "v7-r", "cortex-r4")
diff --git a/lib/Support/Unix/Host.inc b/lib/Support/Unix/Host.inc
index fcb3638..4572171 100644
--- a/lib/Support/Unix/Host.inc
+++ b/lib/Support/Unix/Host.inc
@@ -16,12 +16,12 @@
 //===          is guaranteed to work on *all* UNIX variants.
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Config/config.h"
-#include "llvm/ADT/StringRef.h"
 #include "Unix.h"
-#include <sys/utsname.h>
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
 #include <cctype>
 #include <string>
+#include <sys/utsname.h>
 
 using namespace llvm;
 
@@ -35,29 +35,15 @@ static std::string getOSVersion() {
 }
 
 std::string sys::getDefaultTargetTriple() {
-  StringRef TargetTripleString(LLVM_DEFAULT_TARGET_TRIPLE);
-  std::pair<StringRef, StringRef> ArchSplit = TargetTripleString.split('-');
-
-  // Normalize the arch, since the target triple may not actually match the
-  // target.
-  std::string Arch = ArchSplit.first;
-
-  std::string Triple(Arch);
-  Triple += '-';
-  Triple += ArchSplit.second;
-
-  // Force i<N>86 to i386.
-  if (Triple[0] == 'i' && isdigit(Triple[1]) &&
-      Triple[2] == '8' && Triple[3] == '6')
-    Triple[1] = '3';
+  std::string TargetTripleString(LLVM_DEFAULT_TARGET_TRIPLE);
 
   // On darwin, we want to update the version to match that of the
   // target.
-  std::string::size_type DarwinDashIdx = Triple.find("-darwin");
+  std::string::size_type DarwinDashIdx = TargetTripleString.find("-darwin");
   if (DarwinDashIdx != std::string::npos) {
-    Triple.resize(DarwinDashIdx + strlen("-darwin"));
-    Triple += getOSVersion();
+    TargetTripleString.resize(DarwinDashIdx + strlen("-darwin"));
+    TargetTripleString += getOSVersion();
   }
 
-  return Triple::normalize(Triple);
+  return Triple::normalize(TargetTripleString);
 }
diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc
index c9d89a8..c421ee8 100644
--- a/lib/Support/Unix/Memory.inc
+++ b/lib/Support/Unix/Memory.inc
@@ -88,7 +88,7 @@ Memory::allocateMappedMemory(size_t NumBytes,
   if (NumBytes == 0)
     return MemoryBlock();
 
-  static const size_t PageSize = process::get_self()->page_size();
+  static const size_t PageSize = Process::getPageSize();
   const size_t NumPages = (NumBytes+PageSize-1)/PageSize;
 
   int fd = -1;
@@ -181,7 +181,7 @@ Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
                     std::string *ErrMsg) {
   if (NumBytes == 0) return MemoryBlock();
 
-  size_t PageSize = process::get_self()->page_size();
+  size_t PageSize = Process::getPageSize();
   size_t NumPages = (NumBytes+PageSize-1)/PageSize;
 
   int fd = -1;
@@ -333,23 +333,12 @@ void Memory::InvalidateInstructionCache(const void *Addr,
   for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
     asm volatile("icbi 0, %0" : : "r"(Line));
   asm volatile("isync");
-#  elif (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
+#  elif (defined(__arm__) || defined(__aarch64__) || defined(__mips__)) && \
+        defined(__GNUC__)
   // FIXME: Can we safely always call this for __GNUC__ everywhere?
   const char *Start = static_cast<const char *>(Addr);
   const char *End = Start + Len;
   __clear_cache(const_cast<char *>(Start), const_cast<char *>(End));
-#  elif defined(__mips__)
-  const char *Start = static_cast<const char *>(Addr);
-#    if defined(ANDROID)
-  // The declaration of "cacheflush" in Android bionic:
-  // extern int cacheflush(long start, long end, long flags);
-  const char *End = Start + Len;
-  long LStart = reinterpret_cast<long>(const_cast<char *>(Start));
-  long LEnd = reinterpret_cast<long>(const_cast<char *>(End));
-  cacheflush(LStart, LEnd, BCACHE);
-#    else
-  cacheflush(const_cast<char *>(Start), Len, BCACHE);
-#    endif
 #  endif
 
 #endif  // end apple
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 634d404..973d010 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -62,31 +62,6 @@
 
 using namespace llvm;
 
-namespace {
-  /// This class automatically closes the given file descriptor when it goes out
-  /// of scope. You can take back explicit ownership of the file descriptor by
-  /// calling take(). The destructor does not verify that close was successful.
-  /// Therefore, never allow this class to call close on a file descriptor that
-  /// has been read from or written to.
-  struct AutoFD {
-    int FileDescriptor;
-
-    AutoFD(int fd) : FileDescriptor(fd) {}
-    ~AutoFD() {
-      if (FileDescriptor >= 0)
-        ::close(FileDescriptor);
-    }
-
-    int take() {
-      int ret = FileDescriptor;
-      FileDescriptor = -1;
-      return ret;
-    }
-
-    operator int() const {return FileDescriptor;}
-  };
-}
-
 namespace llvm {
 namespace sys  {
 namespace fs {
@@ -175,7 +150,7 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
       // /proc is not always mounted under Linux (chroot for example).
       ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path));
       if (len >= 0)
-          return StringRef(exe_path, len);
+          return std::string(exe_path, len);
   } else {
       // Fall back to the classical detection.
       if (getprogpath(exe_path, argv0) != NULL)
@@ -311,11 +286,8 @@ std::error_code rename(const Twine &from, const Twine &to) {
   return std::error_code();
 }
 
-std::error_code resize_file(const Twine &path, uint64_t size) {
-  SmallString<128> path_storage;
-  StringRef p = path.toNullTerminatedStringRef(path_storage);
-
-  if (::truncate(p.begin(), size) == -1)
+std::error_code resize_file(int FD, uint64_t Size) {
+  if (::ftruncate(FD, Size) == -1)
     return std::error_code(errno, std::generic_category());
 
   return std::error_code();
@@ -440,80 +412,28 @@ std::error_code setLastModificationAndAccessTime(int FD, TimeValue Time) {
 #endif
 }
 
-std::error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
-  AutoFD ScopedFD(FD);
-  if (!CloseFD)
-    ScopedFD.take();
-
-  // Figure out how large the file is.
-  struct stat FileInfo;
-  if (fstat(FD, &FileInfo) == -1)
-    return std::error_code(errno, std::generic_category());
-  uint64_t FileSize = FileInfo.st_size;
-
-  if (Size == 0)
-    Size = FileSize;
-  else if (FileSize < Size) {
-    // We need to grow the file.
-    if (ftruncate(FD, Size) == -1)
-      return std::error_code(errno, std::generic_category());
-  }
+std::error_code mapped_file_region::init(int FD, uint64_t Offset,
+                                         mapmode Mode) {
+  assert(Size != 0);
 
   int flags = (Mode == readwrite) ? MAP_SHARED : MAP_PRIVATE;
   int prot = (Mode == readonly) ? PROT_READ : (PROT_READ | PROT_WRITE);
-#ifdef MAP_FILE
-  flags |= MAP_FILE;
-#endif
   Mapping = ::mmap(nullptr, Size, prot, flags, FD, Offset);
   if (Mapping == MAP_FAILED)
     return std::error_code(errno, std::generic_category());
   return std::error_code();
 }
 
-mapped_file_region::mapped_file_region(const Twine &path,
-                                       mapmode mode,
-                                       uint64_t length,
-                                       uint64_t offset,
-                                       std::error_code &ec)
-  : Mode(mode)
-  , Size(length)
-  , Mapping() {
-  // Make sure that the requested size fits within SIZE_T.
-  if (length > std::numeric_limits<size_t>::max()) {
-    ec = make_error_code(errc::invalid_argument);
-    return;
-  }
-
-  SmallString<128> path_storage;
-  StringRef name = path.toNullTerminatedStringRef(path_storage);
-  int oflags = (mode == readonly) ? O_RDONLY : O_RDWR;
-  int ofd = ::open(name.begin(), oflags);
-  if (ofd == -1) {
-    ec = std::error_code(errno, std::generic_category());
-    return;
-  }
-
-  ec = init(ofd, true, offset);
-  if (ec)
-    Mapping = nullptr;
-}
-
-mapped_file_region::mapped_file_region(int fd,
-                                       bool closefd,
-                                       mapmode mode,
-                                       uint64_t length,
-                                       uint64_t offset,
-                                       std::error_code &ec)
-  : Mode(mode)
-  , Size(length)
-  , Mapping() {
+mapped_file_region::mapped_file_region(int fd, mapmode mode, uint64_t length,
+                                       uint64_t offset, std::error_code &ec)
+    : Size(length), Mapping() {
   // Make sure that the requested size fits within SIZE_T.
   if (length > std::numeric_limits<size_t>::max()) {
     ec = make_error_code(errc::invalid_argument);
     return;
   }
 
-  ec = init(fd, closefd, offset);
+  ec = init(fd, offset, mode);
   if (ec)
     Mapping = nullptr;
 }
@@ -523,16 +443,6 @@ mapped_file_region::~mapped_file_region() {
     ::munmap(Mapping, Size);
 }
 
-mapped_file_region::mapped_file_region(mapped_file_region &&other)
-  : Mode(other.Mode), Size(other.Size), Mapping(other.Mapping) {
-  other.Mapping = nullptr;
-}
-
-mapped_file_region::mapmode mapped_file_region::flags() const {
-  assert(Mapping && "Mapping failed but used anyway!");
-  return Mode;
-}
-
 uint64_t mapped_file_region::size() const {
   assert(Mapping && "Mapping failed but used anyway!");
   return Size;
@@ -540,7 +450,6 @@ uint64_t mapped_file_region::size() const {
 
 char *mapped_file_region::data() const {
   assert(Mapping && "Mapping failed but used anyway!");
-  assert(Mode != readonly && "Cannot get non-const data for readonly mapping!");
   return reinterpret_cast<char*>(Mapping);
 }
 
@@ -550,7 +459,7 @@ const char *mapped_file_region::const_data() const {
 }
 
 int mapped_file_region::alignment() {
-  return process::get_self()->page_size();
+  return Process::getPageSize();
 }
 
 std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index a429bb3..df13bd2 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc
@@ -39,6 +39,9 @@
     !defined(__OpenBSD__) && !defined(__Bitrig__)
 #include <malloc.h>
 #endif
+#if defined(HAVE_MALLCTL)
+#include <malloc_np.h>
+#endif
 #ifdef HAVE_MALLOC_MALLOC_H
 #include <malloc/malloc.h>
 #endif
@@ -57,10 +60,6 @@
 using namespace llvm;
 using namespace sys;
 
-process::id_type self_process::get_id() {
-  return getpid();
-}
-
 static std::pair<TimeValue, TimeValue> getRUsageTimes() {
 #if defined(HAVE_GETRUSAGE)
   struct rusage RU;
@@ -80,43 +79,19 @@ static std::pair<TimeValue, TimeValue> getRUsageTimes() {
 #endif
 }
 
-TimeValue self_process::get_user_time() const {
-#if _POSIX_TIMERS > 0 && _POSIX_CPUTIME > 0
-  // Try to get a high resolution CPU timer.
-  struct timespec TS;
-  if (::clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &TS) == 0)
-    return TimeValue(static_cast<TimeValue::SecondsType>(TS.tv_sec),
-                     static_cast<TimeValue::NanoSecondsType>(TS.tv_nsec));
-#endif
-
-  // Otherwise fall back to rusage based timing.
-  return getRUsageTimes().first;
-}
-
-TimeValue self_process::get_system_time() const {
-  // We can only collect system time by inspecting the results of getrusage.
-  return getRUsageTimes().second;
-}
-
 // On Cygwin, getpagesize() returns 64k(AllocationGranularity) and
 // offset in mmap(3) should be aligned to the AllocationGranularity.
-static unsigned getPageSize() {
+unsigned Process::getPageSize() {
 #if defined(HAVE_GETPAGESIZE)
-  const int page_size = ::getpagesize();
+  static const int page_size = ::getpagesize();
 #elif defined(HAVE_SYSCONF)
-  long page_size = ::sysconf(_SC_PAGE_SIZE);
+  static long page_size = ::sysconf(_SC_PAGE_SIZE);
 #else
 #warning Cannot get the page size on this machine
 #endif
   return static_cast<unsigned>(page_size);
 }
 
-// This constructor guaranteed to be run exactly once on a single thread, and
-// sets up various process invariants that can be queried cheaply from then on.
-self_process::self_process() : PageSize(getPageSize()) {
-}
-
-
 size_t Process::GetMallocUsage() {
 #if defined(HAVE_MALLINFO)
   struct mallinfo mi;
@@ -126,6 +101,12 @@ size_t Process::GetMallocUsage() {
   malloc_statistics_t Stats;
   malloc_zone_statistics(malloc_default_zone(), &Stats);
   return Stats.size_in_use;   // darwin
+#elif defined(HAVE_MALLCTL)
+  size_t alloc, sz;
+  sz = sizeof(size_t);
+  if (mallctl("stats.allocated", &alloc, &sz, NULL, 0) == 0)
+    return alloc;
+  return 0;
 #elif defined(HAVE_SBRK)
   // Note this is only an approximation and more closely resembles
   // the value returned by mallinfo in the arena field.
@@ -133,8 +114,7 @@ size_t Process::GetMallocUsage() {
   char *EndOfMemory = (char*)sbrk(0);
   if (EndOfMemory != ((char*)-1) && StartOfMemory != ((char*)-1))
     return EndOfMemory - StartOfMemory;
-  else
-    return 0;
+  return 0;
 #else
 #warning Cannot get malloc info on this platform
   return 0;
@@ -219,8 +199,8 @@ public:
   }
 
 private:
-  FDCloser(const FDCloser &) LLVM_DELETED_FUNCTION;
-  void operator=(const FDCloser &) LLVM_DELETED_FUNCTION;
+  FDCloser(const FDCloser &) = delete;
+  void operator=(const FDCloser &) = delete;
 
   int &FD;
   bool KeepOpen;
diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc
index 0670ad3..baf2767 100644
--- a/lib/Support/Unix/Program.inc
+++ b/lib/Support/Unix/Program.inc
@@ -42,7 +42,18 @@
 #define  _RESTRICT_KYWD
 #endif
 #include <spawn.h>
-#if !defined(__APPLE__)
+
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+#if defined(__APPLE__) && !(defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE)
+#define USE_NSGETENVIRON 1
+#else
+#define USE_NSGETENVIRON 0
+#endif
+
+#if !USE_NSGETENVIRON
   extern char **environ;
 #else
 #include <crt_externs.h> // _NSGetEnviron
@@ -63,11 +74,12 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name,
   if (Name.find('/') != StringRef::npos)
     return std::string(Name);
 
-  if (Paths.empty()) {
-    SmallVector<StringRef, 16> SearchPaths;
-    SplitString(std::getenv("PATH"), SearchPaths, ":");
-    return findProgramByName(Name, SearchPaths);
-  }
+  SmallVector<StringRef, 16> EnvironmentPaths;
+  if (Paths.empty())
+    if (const char *PathEnv = std::getenv("PATH")) {
+      SplitString(PathEnv, EnvironmentPaths, ":");
+      Paths = EnvironmentPaths;
+    }
 
   for (auto Path : Paths) {
     if (Path.empty())
@@ -216,7 +228,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
     }
 
     if (!envp)
-#if !defined(__APPLE__)
+#if !USE_NSGETENVIRON
       envp = const_cast<const char **>(environ);
 #else
       // environ is missing in dylibs.
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index e8f4643..665c7de 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -480,6 +480,8 @@ static void PrintStackTraceSignalHandler(void *) {
   PrintStackTrace(stderr);
 }
 
+void llvm::sys::DisableSystemDialogsOnCrash() {}
+
 /// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or
 /// SIGSEGV) is delivered to the process, print a stack trace and then exit.
 void llvm::sys::PrintStackTraceOnErrorSignal() {
diff --git a/lib/Support/Unix/ThreadLocal.inc b/lib/Support/Unix/ThreadLocal.inc
index f14d0fa..31c3f38 100644
--- a/lib/Support/Unix/ThreadLocal.inc
+++ b/lib/Support/Unix/ThreadLocal.inc
@@ -16,11 +16,54 @@
 //===          is guaranteed to work on *all* UNIX variants.
 //===----------------------------------------------------------------------===//
 
+#if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_GETSPECIFIC)
+
+#include <cassert>
+#include <pthread.h>
+#include <stdlib.h>
+
+namespace llvm {
+using namespace sys;
+
+ThreadLocalImpl::ThreadLocalImpl() : data() {
+  static_assert(sizeof(pthread_key_t) <= sizeof(data), "size too big");
+  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
+  int errorcode = pthread_key_create(key, nullptr);
+  assert(errorcode == 0);
+  (void) errorcode;
+}
+
+ThreadLocalImpl::~ThreadLocalImpl() {
+  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
+  int errorcode = pthread_key_delete(*key);
+  assert(errorcode == 0);
+  (void) errorcode;
+}
+
+void ThreadLocalImpl::setInstance(const void* d) {
+  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
+  int errorcode = pthread_setspecific(*key, d);
+  assert(errorcode == 0);
+  (void) errorcode;
+}
+
+void *ThreadLocalImpl::getInstance() {
+  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
+  return pthread_getspecific(*key);
+}
+
+void ThreadLocalImpl::removeInstance() {
+  setInstance(nullptr);
+}
+
+}
+#else
 namespace llvm {
 using namespace sys;
 ThreadLocalImpl::ThreadLocalImpl() : data() { }
 ThreadLocalImpl::~ThreadLocalImpl() { }
 void ThreadLocalImpl::setInstance(const void* d) { data = const_cast<void*>(d);}
-const void* ThreadLocalImpl::getInstance() { return data; }
+void *ThreadLocalImpl::getInstance() { return data; }
 void ThreadLocalImpl::removeInstance() { setInstance(0); }
 }
+#endif
diff --git a/lib/Support/Valgrind.cpp b/lib/Support/Valgrind.cpp
index 2c6d6aa..facf8d9 100644
--- a/lib/Support/Valgrind.cpp
+++ b/lib/Support/Valgrind.cpp
@@ -53,7 +53,6 @@ void llvm::sys::ValgrindDiscardTranslations(const void *Addr, size_t Len) {
 
 #endif  // !HAVE_VALGRIND_VALGRIND_H
 
-#if LLVM_ENABLE_THREADS != 0 && !defined(NDEBUG)
 // These functions require no implementation, tsan just looks at the arguments
 // they're called with. However, they are required to be weak as some other
 // application or library may already be providing these definitions for the
@@ -72,4 +71,4 @@ void AnnotateIgnoreWritesBegin(const char *file, int line) {}
 LLVM_ATTRIBUTE_WEAK void AnnotateIgnoreWritesEnd(const char *file, int line);
 void AnnotateIgnoreWritesEnd(const char *file, int line) {}
 }
-#endif
+
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index 365031c..d8b5702 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -44,6 +44,7 @@ using namespace llvm;
 
 using llvm::sys::windows::UTF8ToUTF16;
 using llvm::sys::windows::UTF16ToUTF8;
+using llvm::sys::path::widenPath;
 
 static std::error_code windows_error(DWORD E) {
   return mapWindowsError(E);
@@ -59,11 +60,15 @@ static bool is_separator(const wchar_t value) {
   }
 }
 
+namespace llvm {
+namespace sys  {
+namespace path {
+
 // Convert a UTF-8 path to UTF-16.  Also, if the absolute equivalent of the
 // path is longer than CreateDirectory can tolerate, make it absolute and
 // prefixed by '\\?\'.
-static std::error_code widenPath(const Twine &Path8,
-                                 SmallVectorImpl<wchar_t> &Path16) {
+std::error_code widenPath(const Twine &Path8,
+                          SmallVectorImpl<wchar_t> &Path16) {
   const size_t MaxDirLen = MAX_PATH - 12; // Must leave room for 8.3 filename.
 
   // Several operations would convert Path8 to SmallString; more efficient to
@@ -111,9 +116,8 @@ static std::error_code widenPath(const Twine &Path8,
   // Just use the caller's original path.
   return UTF8ToUTF16(Path8Str, Path16);
 }
+} // end namespace path
 
-namespace llvm {
-namespace sys  {
 namespace fs {
 
 std::string getMainExecutable(const char *argv0, void *MainExecAddr) {
@@ -268,21 +272,12 @@ std::error_code rename(const Twine &from, const Twine &to) {
   return ec;
 }
 
-std::error_code resize_file(const Twine &path, uint64_t size) {
-  SmallVector<wchar_t, 128> path_utf16;
-
-  if (std::error_code ec = widenPath(path, path_utf16))
-    return ec;
-
-  int fd = ::_wopen(path_utf16.begin(), O_BINARY | _O_RDWR, S_IWRITE);
-  if (fd == -1)
-    return std::error_code(errno, std::generic_category());
+std::error_code resize_file(int FD, uint64_t Size) {
 #ifdef HAVE__CHSIZE_S
-  errno_t error = ::_chsize_s(fd, size);
+  errno_t error = ::_chsize_s(FD, Size);
 #else
-  errno_t error = ::_chsize(fd, size);
+  errno_t error = ::_chsize(FD, Size);
 #endif
-  ::close(fd);
   return std::error_code(error, std::generic_category());
 }
 
@@ -463,17 +458,15 @@ std::error_code setLastModificationAndAccessTime(int FD, TimeValue Time) {
   return std::error_code();
 }
 
-std::error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
-  FileDescriptor = FD;
+std::error_code mapped_file_region::init(int FD, uint64_t Offset,
+                                         mapmode Mode) {
   // Make sure that the requested size fits within SIZE_T.
-  if (Size > std::numeric_limits<SIZE_T>::max()) {
-    if (FileDescriptor) {
-      if (CloseFD)
-        _close(FileDescriptor);
-    } else
-      ::CloseHandle(FileHandle);
+  if (Size > std::numeric_limits<SIZE_T>::max())
     return make_error_code(errc::invalid_argument);
-  }
+
+  HANDLE FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
+  if (FileHandle == INVALID_HANDLE_VALUE)
+    return make_error_code(errc::bad_file_descriptor);
 
   DWORD flprotect;
   switch (Mode) {
@@ -482,18 +475,13 @@ std::error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset)
   case priv:      flprotect = PAGE_WRITECOPY; break;
   }
 
-  FileMappingHandle =
+  HANDLE FileMappingHandle =
       ::CreateFileMappingW(FileHandle, 0, flprotect,
                            (Offset + Size) >> 32,
                            (Offset + Size) & 0xffffffff,
                            0);
   if (FileMappingHandle == NULL) {
     std::error_code ec = windows_error(GetLastError());
-    if (FileDescriptor) {
-      if (CloseFD)
-        _close(FileDescriptor);
-    } else
-      ::CloseHandle(FileHandle);
     return ec;
   }
 
@@ -511,11 +499,6 @@ std::error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset)
   if (Mapping == NULL) {
     std::error_code ec = windows_error(GetLastError());
     ::CloseHandle(FileMappingHandle);
-    if (FileDescriptor) {
-      if (CloseFD)
-        _close(FileDescriptor);
-    } else
-      ::CloseHandle(FileHandle);
     return ec;
   }
 
@@ -526,11 +509,6 @@ std::error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset)
       std::error_code ec = windows_error(GetLastError());
       ::UnmapViewOfFile(Mapping);
       ::CloseHandle(FileMappingHandle);
-      if (FileDescriptor) {
-        if (CloseFD)
-          _close(FileDescriptor);
-      } else
-        ::CloseHandle(FileHandle);
       return ec;
     }
     Size = mbi.RegionSize;
@@ -539,84 +517,15 @@ std::error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset)
   // Close all the handles except for the view. It will keep the other handles
   // alive.
   ::CloseHandle(FileMappingHandle);
-  if (FileDescriptor) {
-    if (CloseFD)
-      _close(FileDescriptor); // Also closes FileHandle.
-  } else
-    ::CloseHandle(FileHandle);
   return std::error_code();
 }
 
-mapped_file_region::mapped_file_region(const Twine &path,
-                                       mapmode mode,
-                                       uint64_t length,
-                                       uint64_t offset,
-                                       std::error_code &ec)
-  : Mode(mode)
-  , Size(length)
-  , Mapping()
-  , FileDescriptor()
-  , FileHandle(INVALID_HANDLE_VALUE)
-  , FileMappingHandle() {
-  SmallVector<wchar_t, 128> path_utf16;
-
-  // Convert path to UTF-16.
-  if ((ec = widenPath(path, path_utf16)))
-    return;
-
-  // Get file handle for creating a file mapping.
-  FileHandle = ::CreateFileW(c_str(path_utf16),
-                             Mode == readonly ? GENERIC_READ
-                                              : GENERIC_READ | GENERIC_WRITE,
-                             Mode == readonly ? FILE_SHARE_READ
-                                              : 0,
-                             0,
-                             Mode == readonly ? OPEN_EXISTING
-                                              : OPEN_ALWAYS,
-                             Mode == readonly ? FILE_ATTRIBUTE_READONLY
-                                              : FILE_ATTRIBUTE_NORMAL,
-                             0);
-  if (FileHandle == INVALID_HANDLE_VALUE) {
-    ec = windows_error(::GetLastError());
-    return;
-  }
-
-  FileDescriptor = 0;
-  ec = init(FileDescriptor, true, offset);
-  if (ec) {
-    Mapping = FileMappingHandle = 0;
-    FileHandle = INVALID_HANDLE_VALUE;
-    FileDescriptor = 0;
-  }
-}
-
-mapped_file_region::mapped_file_region(int fd,
-                                       bool closefd,
-                                       mapmode mode,
-                                       uint64_t length,
-                                       uint64_t offset,
-                                       std::error_code &ec)
-  : Mode(mode)
-  , Size(length)
-  , Mapping()
-  , FileDescriptor(fd)
-  , FileHandle(INVALID_HANDLE_VALUE)
-  , FileMappingHandle() {
-  FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
-  if (FileHandle == INVALID_HANDLE_VALUE) {
-    if (closefd)
-      _close(FileDescriptor);
-    FileDescriptor = 0;
-    ec = make_error_code(errc::bad_file_descriptor);
-    return;
-  }
-
-  ec = init(FileDescriptor, closefd, offset);
-  if (ec) {
-    Mapping = FileMappingHandle = 0;
-    FileHandle = INVALID_HANDLE_VALUE;
-    FileDescriptor = 0;
-  }
+mapped_file_region::mapped_file_region(int fd, mapmode mode, uint64_t length,
+                                       uint64_t offset, std::error_code &ec)
+    : Size(length), Mapping() {
+  ec = init(fd, offset, mode);
+  if (ec)
+    Mapping = 0;
 }
 
 mapped_file_region::~mapped_file_region() {
@@ -624,30 +533,12 @@ mapped_file_region::~mapped_file_region() {
     ::UnmapViewOfFile(Mapping);
 }
 
-mapped_file_region::mapped_file_region(mapped_file_region &&other)
-  : Mode(other.Mode)
-  , Size(other.Size)
-  , Mapping(other.Mapping)
-  , FileDescriptor(other.FileDescriptor)
-  , FileHandle(other.FileHandle)
-  , FileMappingHandle(other.FileMappingHandle) {
-  other.Mapping = other.FileMappingHandle = 0;
-  other.FileHandle = INVALID_HANDLE_VALUE;
-  other.FileDescriptor = 0;
-}
-
-mapped_file_region::mapmode mapped_file_region::flags() const {
-  assert(Mapping && "Mapping failed but used anyway!");
-  return Mode;
-}
-
 uint64_t mapped_file_region::size() const {
   assert(Mapping && "Mapping failed but used anyway!");
   return Size;
 }
 
 char *mapped_file_region::data() const {
-  assert(Mode != readonly && "Cannot get non-const data for readonly mapping!");
   assert(Mapping && "Mapping failed but used anyway!");
   return reinterpret_cast<char*>(Mapping);
 }
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index 3819e63..854eac7 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -49,10 +49,6 @@
 using namespace llvm;
 using namespace sys;
 
-process::id_type self_process::get_id() {
-  return GetCurrentProcessId();
-}
-
 static TimeValue getTimeValueFromFILETIME(FILETIME Time) {
   ULARGE_INTEGER TimeInteger;
   TimeInteger.LowPart = Time.dwLowDateTime;
@@ -65,28 +61,10 @@ static TimeValue getTimeValueFromFILETIME(FILETIME Time) {
           (TimeInteger.QuadPart % 10000000) * 100));
 }
 
-TimeValue self_process::get_user_time() const {
-  FILETIME ProcCreate, ProcExit, KernelTime, UserTime;
-  if (GetProcessTimes(GetCurrentProcess(), &ProcCreate, &ProcExit, &KernelTime,
-                      &UserTime) == 0)
-    return TimeValue();
-
-  return getTimeValueFromFILETIME(UserTime);
-}
-
-TimeValue self_process::get_system_time() const {
-  FILETIME ProcCreate, ProcExit, KernelTime, UserTime;
-  if (GetProcessTimes(GetCurrentProcess(), &ProcCreate, &ProcExit, &KernelTime,
-                      &UserTime) == 0)
-    return TimeValue();
-
-  return getTimeValueFromFILETIME(KernelTime);
-}
-
 // This function retrieves the page size using GetNativeSystemInfo() and is
 // present solely so it can be called once to initialize the self_process member
 // below.
-static unsigned getPageSize() {
+static unsigned computePageSize() {
   // GetNativeSystemInfo() provides the physical page size which may differ
   // from GetSystemInfo() in 32-bit applications running under WOW64.
   SYSTEM_INFO info;
@@ -96,12 +74,11 @@ static unsigned getPageSize() {
   return static_cast<unsigned>(info.dwPageSize);
 }
 
-// This constructor guaranteed to be run exactly once on a single thread, and
-// sets up various process invariants that can be queried cheaply from then on.
-self_process::self_process() : PageSize(getPageSize()) {
+unsigned Process::getPageSize() {
+  static unsigned Ret = computePageSize();
+  return Ret;
 }
 
-
 size_t
 Process::GetMallocUsage()
 {
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index 72c2a58..c370077 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -15,8 +15,8 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/WindowsError.h"
+#include "llvm/Support/raw_ostream.h"
 #include <cstdio>
 #include <fcntl.h>
 #include <io.h>
@@ -62,7 +62,8 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name,
   SmallVector<StringRef, 12> PathExts;
   PathExts.push_back("");
   PathExts.push_back(".exe"); // FIXME: This must be in %PATHEXT%.
-  SplitString(std::getenv("PATHEXT"), PathExts, ";");
+  if (const char *PathExtEnv = std::getenv("PATHEXT"))
+    SplitString(PathExtEnv, PathExts, ";");
 
   SmallVector<wchar_t, MAX_PATH> U16Result;
   DWORD Len = MAX_PATH;
@@ -117,14 +118,19 @@ static HANDLE RedirectIO(const StringRef *path, int fd, std::string* ErrMsg) {
   sa.bInheritHandle = TRUE;
 
   SmallVector<wchar_t, 128> fnameUnicode;
-  if (windows::UTF8ToUTF16(fname, fnameUnicode))
-    return INVALID_HANDLE_VALUE;
-
+  if (path->empty()) {
+    // Don't play long-path tricks on "NUL".
+    if (windows::UTF8ToUTF16(fname, fnameUnicode))
+      return INVALID_HANDLE_VALUE;
+  } else {
+    if (path::widenPath(fname, fnameUnicode))
+      return INVALID_HANDLE_VALUE;
+  }
   h = CreateFileW(fnameUnicode.data(), fd ? GENERIC_WRITE : GENERIC_READ,
                   FILE_SHARE_READ, &sa, fd == 0 ? OPEN_EXISTING : CREATE_ALWAYS,
                   FILE_ATTRIBUTE_NORMAL, NULL);
   if (h == INVALID_HANDLE_VALUE) {
-    MakeErrMsg(ErrMsg, std::string(fname) + ": Can't open file for " +
+    MakeErrMsg(ErrMsg, fname + ": Can't open file for " +
         (fd ? "input: " : "output: "));
   }
 
@@ -322,7 +328,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
   fflush(stderr);
 
   SmallVector<wchar_t, MAX_PATH> ProgramUtf16;
-  if (std::error_code ec = windows::UTF8ToUTF16(Program, ProgramUtf16)) {
+  if (std::error_code ec = path::widenPath(Program, ProgramUtf16)) {
     SetLastError(ec.value());
     MakeErrMsg(ErrMsg,
                std::string("Unable to convert application name to UTF-16"));
diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc
index 35ba6f8..aa1aa72 100644
--- a/lib/Support/Windows/Signals.inc
+++ b/lib/Support/Windows/Signals.inc
@@ -13,6 +13,7 @@
 
 #include "llvm/Support/FileSystem.h"
 #include <algorithm>
+#include <signal.h>
 #include <stdio.h>
 #include <vector>
 
@@ -165,7 +166,6 @@ static std::vector<std::string> *FilesToRemove = NULL;
 static std::vector<std::pair<void(*)(void*), void*> > *CallBacksToRun = 0;
 static bool RegisteredUnhandledExceptionFilter = false;
 static bool CleanupExecuted = false;
-static bool ExitOnUnhandledExceptions = false;
 static PTOP_LEVEL_EXCEPTION_FILTER OldFilter = NULL;
 
 // Windows creates a new thread to execute the console handler when an event
@@ -184,7 +184,8 @@ namespace llvm {
 /// AvoidMessageBoxHook - Emulates hitting "retry" from an "abort, retry,
 /// ignore" CRT debug report dialog.  "retry" raises an exception which
 /// ultimately triggers our stack dumper.
-static int AvoidMessageBoxHook(int ReportType, char *Message, int *Return) {
+static LLVM_ATTRIBUTE_UNUSED int
+AvoidMessageBoxHook(int ReportType, char *Message, int *Return) {
   // Set *Return to the retry code for the return value of _CrtDbgReport:
   // http://msdn.microsoft.com/en-us/library/8hyw4sy7(v=vs.71).aspx
   // This may also trigger just-in-time debugging via DebugBreak().
@@ -196,6 +197,12 @@ static int AvoidMessageBoxHook(int ReportType, char *Message, int *Return) {
 
 #endif
 
+extern "C" void HandleAbort(int Sig) {
+  if (Sig == SIGABRT) {
+    LLVM_BUILTIN_TRAP;
+  }
+}
+
 static void RegisterHandler() {
 #if __MINGW32__ && !defined(__MINGW64_VERSION_MAJOR)
   // On MinGW.org, we need to load up the symbols explicitly, because the
@@ -226,17 +233,6 @@ static void RegisterHandler() {
   OldFilter = SetUnhandledExceptionFilter(LLVMUnhandledExceptionFilter);
   SetConsoleCtrlHandler(LLVMConsoleCtrlHandler, TRUE);
 
-  // Environment variable to disable any kind of crash dialog.
-  if (getenv("LLVM_DISABLE_CRASH_REPORT")) {
-#ifdef _MSC_VER
-    _CrtSetReportHook(AvoidMessageBoxHook);
-#endif
-    SetErrorMode(SEM_FAILCRITICALERRORS |
-                 SEM_NOGPFAULTERRORBOX |
-                 SEM_NOOPENFILEERRORBOX);
-    ExitOnUnhandledExceptions = true;
-  }
-
   // IMPORTANT NOTE: Caller must call LeaveCriticalSection(&CriticalSection) or
   // else multi-threading problems will ensue.
 }
@@ -276,9 +272,29 @@ void sys::DontRemoveFileOnSignal(StringRef Filename) {
   LeaveCriticalSection(&CriticalSection);
 }
 
+void sys::DisableSystemDialogsOnCrash() {
+  // Crash to stack trace handler on abort.
+  signal(SIGABRT, HandleAbort);
+
+  // The following functions are not reliably accessible on MinGW.
+#ifdef _MSC_VER
+  // We're already handling writing a "something went wrong" message.
+  _set_abort_behavior(0, _WRITE_ABORT_MSG);
+  // Disable Dr. Watson.
+  _set_abort_behavior(0, _CALL_REPORTFAULT);
+  _CrtSetReportHook(AvoidMessageBoxHook);
+#endif
+
+  // Disable standard error dialog box.
+  SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOGPFAULTERRORBOX |
+               SEM_NOOPENFILEERRORBOX);
+  _set_error_mode(_OUT_TO_STDERR);
+}
+
 /// PrintStackTraceOnErrorSignal - When an error signal (such as SIBABRT or
 /// SIGSEGV) is delivered to the process, print a stack trace and then exit.
 void sys::PrintStackTraceOnErrorSignal() {
+  DisableSystemDialogsOnCrash();
   RegisterHandler();
   LeaveCriticalSection(&CriticalSection);
 }
@@ -437,14 +453,7 @@ static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
     fputc('\n', stderr);
   }
 
-  if (ExitOnUnhandledExceptions)
-    _exit(ep->ExceptionRecord->ExceptionCode);
-
-  // Allow dialog box to pop up allowing choice to start debugger.
-  if (OldFilter)
-    return (*OldFilter)(ep);
-  else
-    return EXCEPTION_CONTINUE_SEARCH;
+  _exit(ep->ExceptionRecord->ExceptionCode);
 }
 
 static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType) {
diff --git a/lib/Support/Windows/ThreadLocal.inc b/lib/Support/Windows/ThreadLocal.inc
index 14ce619..b9cb8ff 100644
--- a/lib/Support/Windows/ThreadLocal.inc
+++ b/lib/Support/Windows/ThreadLocal.inc
@@ -34,7 +34,7 @@ ThreadLocalImpl::~ThreadLocalImpl() {
   TlsFree(*tls);
 }
 
-const void* ThreadLocalImpl::getInstance() {
+void *ThreadLocalImpl::getInstance() {
   DWORD* tls = reinterpret_cast<DWORD*>(&data);
   return TlsGetValue(*tls);
 }
diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h
index 6d9c5fb..5bb0b8d 100644
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h
@@ -19,6 +19,9 @@
 //===          is guaranteed to work on *all* Win32 variants.
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_SUPPORT_WINDOWSSUPPORT_H
+#define LLVM_SUPPORT_WINDOWSSUPPORT_H
+
 // mingw-w64 tends to define it as 0x0502 in its headers.
 #undef _WIN32_WINNT
 #undef _WIN32_IE
@@ -30,6 +33,7 @@
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Config/config.h" // Get build system configuration settings
 #include "llvm/Support/Compiler.h"
 #include <system_error>
@@ -88,7 +92,7 @@ public:
   }
 
   // True if Handle is valid.
-  LLVM_EXPLICIT operator bool() const {
+  explicit operator bool() const {
     return HandleTraits::IsValid(Handle) ? true : false;
   }
 
@@ -162,6 +166,11 @@ c_str(SmallVectorImpl<T> &str) {
 }
 
 namespace sys {
+namespace path {
+std::error_code widenPath(const Twine &Path8,
+                          SmallVectorImpl<wchar_t> &Path16);
+} // end namespace path
+
 namespace windows {
 std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
 std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
@@ -172,3 +181,5 @@ std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
 } // end namespace windows
 } // end namespace sys
 } // end namespace llvm.
+
+#endif
diff --git a/lib/Support/Windows/explicit_symbols.inc b/lib/Support/Windows/explicit_symbols.inc
index cd56b13..bbbf7ea 100644
--- a/lib/Support/Windows/explicit_symbols.inc
+++ b/lib/Support/Windows/explicit_symbols.inc
@@ -10,9 +10,15 @@
 #ifdef HAVE___CHKSTK
   EXPLICIT_SYMBOL(__chkstk)
 #endif
+#ifdef HAVE___CHKSTK_MS
+  EXPLICIT_SYMBOL(__chkstk_ms)
+#endif
 #ifdef HAVE____CHKSTK
   EXPLICIT_SYMBOL(___chkstk)
 #endif
+#ifdef HAVE____CHKSTK_MS
+  EXPLICIT_SYMBOL(___chkstk_ms)
+#endif
 #ifdef HAVE___MAIN
   EXPLICIT_SYMBOL(__main) // FIXME: Don't call it.
 #endif
diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp
index 4688ff1..6ae7945 100644
--- a/lib/Support/YAMLParser.cpp
+++ b/lib/Support/YAMLParser.cpp
@@ -1528,12 +1528,10 @@ Stream::~Stream() {}
 bool Stream::failed() { return scanner->failed(); }
 
 void Stream::printError(Node *N, const Twine &Msg) {
-  SmallVector<SMRange, 1> Ranges;
-  Ranges.push_back(N->getSourceRange());
   scanner->printError( N->getSourceRange().Start
                      , SourceMgr::DK_Error
                      , Msg
-                     , Ranges);
+                     , N->getSourceRange());
 }
 
 document_iterator Stream::begin() {
@@ -1570,11 +1568,11 @@ std::string Node::getVerbatimTag() const {
     if (Raw.find_last_of('!') == 0) {
       Ret = Doc->getTagMap().find("!")->second;
       Ret += Raw.substr(1);
-      return std::move(Ret);
+      return Ret;
     } else if (Raw.startswith("!!")) {
       Ret = Doc->getTagMap().find("!!")->second;
       Ret += Raw.substr(2);
-      return std::move(Ret);
+      return Ret;
     } else {
       StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
       std::map<StringRef, StringRef>::const_iterator It =
@@ -1588,7 +1586,7 @@ std::string Node::getVerbatimTag() const {
         setError(Twine("Unknown tag handle ") + TagHandle, T);
       }
       Ret += Raw.substr(Raw.find_last_of('!') + 1);
-      return std::move(Ret);
+      return Ret;
     }
   }
 
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index 81edca2..43a0e10 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -8,12 +8,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Errc.h"
-#include "llvm/Support/YAMLTraits.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cctype>
 #include <cstring>
@@ -233,6 +233,13 @@ bool Input::matchEnumScalar(const char *Str, bool) {
   return false;
 }
 
+bool Input::matchEnumFallback() {
+  if (ScalarMatchFound)
+    return false;
+  ScalarMatchFound = true;
+  return true;
+}
+
 void Input::endEnumScalar() {
   if (!ScalarMatchFound) {
     setError(CurrentNode, "unknown enumerated scalar");
@@ -508,6 +515,13 @@ bool Output::matchEnumScalar(const char *Str, bool Match) {
   return false;
 }
 
+bool Output::matchEnumFallback() {
+  if (EnumerationMatchFound)
+    return false;
+  EnumerationMatchFound = true;
+  return true;
+}
+
 void Output::endEnumScalar() {
   if (!EnumerationMatchFound)
     llvm_unreachable("bad runtime enum value");
@@ -669,7 +683,7 @@ StringRef ScalarTraits<StringRef>::input(StringRef Scalar, void *,
   Val = Scalar;
   return StringRef();
 }
- 
+
 void ScalarTraits<std::string>::output(const std::string &Val, void *,
                                      raw_ostream &Out) {
   Out << Val;
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index bbbbe4a..051e2dd 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -242,7 +242,7 @@ raw_ostream &raw_ostream::operator<<(double N) {
 
   char buf[16];
   unsigned len;
-  len = snprintf(buf, sizeof(buf), "%e", N);
+  len = format("%e", N).snprint(buf, sizeof(buf));
   if (len <= sizeof(buf) - 2) {
     if (len >= 5 && buf[len - 5] == 'e' && buf[len - 3] == '0') {
       int cs = buf[len - 4];
@@ -312,6 +312,7 @@ raw_ostream &raw_ostream::write(const char *Ptr, size_t Size) {
     // than the buffer. Directly write the chunk that is a multiple of the
     // preferred buffer size and put the remainder in the buffer.
     if (LLVM_UNLIKELY(OutBufCur == OutBufStart)) {
+      assert(NumBytes != 0 && "undefined behavior");
       size_t BytesToWrite = Size - (Size % NumBytes);
       write_impl(Ptr, BytesToWrite);
       size_t BytesRemaining = Size - BytesToWrite;
@@ -409,9 +410,12 @@ raw_ostream &raw_ostream::operator<<(const FormattedString &FS) {
 raw_ostream &raw_ostream::operator<<(const FormattedNumber &FN) {
   if (FN.Hex) {
     unsigned Nibbles = (64 - countLeadingZeros(FN.HexValue)+3)/4;
-    unsigned Width = (FN.Width > Nibbles+2) ? FN.Width : Nibbles+2;
-        
+    unsigned PrefixChars = FN.HexPrefix ? 2 : 0;
+    unsigned Width = std::max(FN.Width, Nibbles + PrefixChars);
+
     char NumberBuffer[20] = "0x0000000000000000";
+    if (!FN.HexPrefix)
+      NumberBuffer[1] = '0';
     char *EndPtr = NumberBuffer+Width;
     char *CurPtr = EndPtr;
     const char A = FN.Upper ? 'A' : 'a';
diff --git a/lib/Support/regcomp.c b/lib/Support/regcomp.c
index 0b5b765..ebde64f 100644
--- a/lib/Support/regcomp.c
+++ b/lib/Support/regcomp.c
@@ -49,6 +49,14 @@
 #include "regcclass.h"
 #include "regcname.h"
 
+#include "llvm/Config/config.h"
+#if HAVE_STDINT_H
+#include <stdint.h>
+#else
+/* Pessimistically bound memory use */
+#define SIZE_MAX UINT_MAX
+#endif
+
 /*
  * parse structure, passed up and down to avoid global variables and
  * other clumsinesses
@@ -1069,6 +1077,8 @@ allocset(struct parse *p)
 
 		p->ncsalloc += CHAR_BIT;
 		nc = p->ncsalloc;
+		if (nc > SIZE_MAX / sizeof(cset))
+			goto nomem;
 		assert(nc % CHAR_BIT == 0);
 		nbytes = nc / CHAR_BIT * css;
 
@@ -1412,6 +1422,11 @@ enlarge(struct parse *p, sopno size)
 	if (p->ssize >= size)
 		return;
 
+	if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
+		SETERROR(REG_ESPACE);
+		return;
+	}
+
 	sp = (sop *)realloc(p->strip, size*sizeof(sop));
 	if (sp == NULL) {
 		SETERROR(REG_ESPACE);
@@ -1428,6 +1443,12 @@ static void
 stripsnug(struct parse *p, struct re_guts *g)
 {
 	g->nstates = p->slen;
+	if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
+		g->strip = p->strip;
+		SETERROR(REG_ESPACE);
+		return;
+	}
+
 	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
 	if (g->strip == NULL) {
 		SETERROR(REG_ESPACE);
diff --git a/lib/TableGen/CMakeLists.txt b/lib/TableGen/CMakeLists.txt
index fb70218..9333b65 100644
--- a/lib/TableGen/CMakeLists.txt
+++ b/lib/TableGen/CMakeLists.txt
@@ -7,4 +7,7 @@ add_llvm_library(LLVMTableGen
   TableGenBackend.cpp
   TGLexer.cpp
   TGParser.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/TableGen
   )
diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp
index 2578cc2..c440451 100644
--- a/lib/TableGen/Main.cpp
+++ b/lib/TableGen/Main.cpp
@@ -64,11 +64,8 @@ static int createDependencyFile(const TGParser &Parser, const char *argv0) {
     return 1;
   }
   DepOut.os() << OutputFilename << ":";
-  const TGLexer::DependenciesMapTy &Dependencies = Parser.getDependencies();
-  for (TGLexer::DependenciesMapTy::const_iterator I = Dependencies.begin(),
-                                                  E = Dependencies.end();
-       I != E; ++I) {
-    DepOut.os() << " " << I->first;
+  for (const auto &Dep : Parser.getDependencies()) {
+    DepOut.os() << ' ' << Dep.first;
   }
   DepOut.os() << "\n";
   DepOut.keep();
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index 34e3ab4..4ae9903 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -812,7 +812,7 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
             return VarInit::get(MCName, RV->getType());
           }
         }
-
+        assert(CurRec && "NULL pointer");
         if (Record *D = (CurRec->getRecords()).getDef(Name))
           return DefInit::get(D);
 
@@ -1629,7 +1629,7 @@ std::string DagInit::getAsString() const {
   std::string Result = "(" + Val->getAsString();
   if (!ValName.empty())
     Result += ":" + ValName;
-  if (Args.size()) {
+  if (!Args.empty()) {
     Result += " " + Args[0]->getAsString();
     if (!ArgNames[0].empty()) Result += ":$" + ArgNames[0];
     for (unsigned i = 1, e = Args.size(); i != e; ++i) {
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 4d4bbe9..44f6a6e 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -224,7 +224,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC,
     if (AddValue(CurRec, SubMultiClass.RefRange.Start, SMCVals[i]))
       return true;
 
-  int newDefStart = CurMC->DefPrototypes.size();
+  unsigned newDefStart = CurMC->DefPrototypes.size();
 
   // Add all of the defs in the subclass into the current multiclass.
   for (MultiClass::RecordVector::const_iterator i = SMC->DefPrototypes.begin(),
@@ -239,7 +239,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC,
       if (AddValue(NewDef.get(), SubMultiClass.RefRange.Start, MCVals[i]))
         return true;
 
-    CurMC->DefPrototypes.push_back(NewDef.release());
+    CurMC->DefPrototypes.push_back(std::move(NewDef));
   }
 
   const std::vector<Init *> &SMCTArgs = SMC->Rec.getTemplateArgs();
@@ -269,14 +269,9 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC,
 
       // If a value is specified for this template arg, set it in the
       // new defs now.
-      for (MultiClass::RecordVector::iterator j =
-             CurMC->DefPrototypes.begin() + newDefStart,
-             jend = CurMC->DefPrototypes.end();
-           j != jend;
-           ++j) {
-        Record *Def = *j;
-
-        if (SetValue(Def, SubMultiClass.RefRange.Start, SMCTArgs[i],
+      for (const auto &Def :
+             makeArrayRef(CurMC->DefPrototypes).slice(newDefStart)) {
+        if (SetValue(Def.get(), SubMultiClass.RefRange.Start, SMCTArgs[i],
                      std::vector<unsigned>(),
                      SubMultiClass.TemplateArgs[i]))
           return true;
@@ -340,26 +335,20 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
   // This is the bottom of the recursion. We have all of the iterator values
   // for this point in the iteration space.  Instantiate a new record to
   // reflect this combination of values.
-  Record *IterRec = new Record(*CurRec);
+  auto IterRec = make_unique<Record>(*CurRec);
 
   // Set the iterator values now.
   for (unsigned i = 0, e = IterVals.size(); i != e; ++i) {
     VarInit *IterVar = IterVals[i].IterVar;
     TypedInit *IVal = dyn_cast<TypedInit>(IterVals[i].IterValue);
-    if (!IVal) {
-      Error(Loc, "foreach iterator value is untyped");
-      delete IterRec;
-      return true;
-    }
+    if (!IVal)
+      return Error(Loc, "foreach iterator value is untyped");
 
     IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false));
 
-    if (SetValue(IterRec, Loc, IterVar->getName(),
-                 std::vector<unsigned>(), IVal)) {
-      Error(Loc, "when instantiating this def");
-      delete IterRec;
-      return true;
-    }
+    if (SetValue(IterRec.get(), Loc, IterVar->getName(),
+                 std::vector<unsigned>(), IVal))
+      return Error(Loc, "when instantiating this def");
 
     // Resolve it next.
     IterRec->resolveReferencesTo(IterRec->getValue(IterVar->getName()));
@@ -370,17 +359,15 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
 
   if (Records.getDef(IterRec->getNameInitAsString())) {
     // If this record is anonymous, it's no problem, just generate a new name
-    if (IterRec->isAnonymous())
-      IterRec->setName(GetNewAnonymousName());
-    else {
-      Error(Loc, "def already exists: " + IterRec->getNameInitAsString());
-      delete IterRec;
-      return true;
-    }
+    if (!IterRec->isAnonymous())
+      return Error(Loc, "def already exists: " +IterRec->getNameInitAsString());
+
+    IterRec->setName(GetNewAnonymousName());
   }
 
-  Records.addDef(IterRec);
-  IterRec->resolveReferences();
+  Record *IterRecSave = IterRec.get(); // Keep a copy before release.
+  Records.addDef(std::move(IterRec));
+  IterRecSave->resolveReferences();
   return false;
 }
 
@@ -398,8 +385,7 @@ static bool isObjectStart(tgtok::TokKind K) {
 /// GetNewAnonymousName - Generate a unique anonymous name that can be used as
 /// an identifier.
 std::string TGParser::GetNewAnonymousName() {
-  unsigned Tmp = AnonCounter++; // MSVC2012 ICEs without this.
-  return "anonymous_" + utostr(Tmp);
+  return "anonymous_" + utostr(AnonCounter++);
 }
 
 /// ParseObjectName - If an object name is specified, return it.  Otherwise,
@@ -467,7 +453,7 @@ MultiClass *TGParser::ParseMultiClassID() {
     return nullptr;
   }
 
-  MultiClass *Result = MultiClasses[Lex.getCurStrVal()];
+  MultiClass *Result = MultiClasses[Lex.getCurStrVal()].get();
   if (!Result)
     TokError("Couldn't find multiclass '" + Lex.getCurStrVal() + "'");
 
@@ -1247,26 +1233,26 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
     SMLoc EndLoc = Lex.getLoc();
 
     // Create the new record, set it as CurRec temporarily.
-    Record *NewRec = new Record(GetNewAnonymousName(), NameLoc, Records,
-                                /*IsAnonymous=*/true);
+    auto NewRecOwner = llvm::make_unique<Record>(GetNewAnonymousName(), NameLoc,
+                                                 Records, /*IsAnonymous=*/true);
+    Record *NewRec = NewRecOwner.get(); // Keep a copy since we may release.
     SubClassReference SCRef;
     SCRef.RefRange = SMRange(NameLoc, EndLoc);
     SCRef.Rec = Class;
     SCRef.TemplateArgs = ValueList;
     // Add info about the subclass to NewRec.
-    if (AddSubClass(NewRec, SCRef)) {
-      delete NewRec;
+    if (AddSubClass(NewRec, SCRef))
       return nullptr;
-    }
+
     if (!CurMultiClass) {
       NewRec->resolveReferences();
-      Records.addDef(NewRec);
+      Records.addDef(std::move(NewRecOwner));
     } else {
       // This needs to get resolved once the multiclass template arguments are
       // known before any use.
       NewRec->setResolveFirst(true);
       // Otherwise, we're inside a multiclass, add it to the multiclass.
-      CurMultiClass->DefPrototypes.push_back(NewRec);
+      CurMultiClass->DefPrototypes.push_back(std::move(NewRecOwner));
 
       // Copy the template arguments for the multiclass into the def.
       const std::vector<Init *> &TArgs =
@@ -1689,7 +1675,7 @@ std::vector<Init*> TGParser::ParseValueList(Record *CurRec, Record *ArgsRec,
   unsigned int ArgN = 0;
   if (ArgsRec && !EltTy) {
     const std::vector<Init *> &TArgs = ArgsRec->getTemplateArgs();
-    if (!TArgs.size()) {
+    if (TArgs.empty()) {
       TokError("template argument provided to non-template class");
       return std::vector<Init*>();
     }
@@ -2036,27 +2022,23 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) {
   Lex.Lex();  // Eat the 'def' token.
 
   // Parse ObjectName and make a record for it.
-  Record *CurRec;
-  bool CurRecOwnershipTransferred = false;
+  std::unique_ptr<Record> CurRecOwner;
   Init *Name = ParseObjectName(CurMultiClass);
   if (Name)
-    CurRec = new Record(Name, DefLoc, Records);
+    CurRecOwner = make_unique<Record>(Name, DefLoc, Records);
   else
-    CurRec = new Record(GetNewAnonymousName(), DefLoc, Records,
-                        /*IsAnonymous=*/true);
+    CurRecOwner = llvm::make_unique<Record>(GetNewAnonymousName(), DefLoc,
+                                            Records, /*IsAnonymous=*/true);
+  Record *CurRec = CurRecOwner.get(); // Keep a copy since we may release.
 
   if (!CurMultiClass && Loops.empty()) {
     // Top-level def definition.
 
     // Ensure redefinition doesn't happen.
-    if (Records.getDef(CurRec->getNameInitAsString())) {
-      Error(DefLoc, "def '" + CurRec->getNameInitAsString()
-            + "' already defined");
-      delete CurRec;
-      return true;
-    }
-    Records.addDef(CurRec);
-    CurRecOwnershipTransferred = true;
+    if (Records.getDef(CurRec->getNameInitAsString()))
+      return Error(DefLoc, "def '" + CurRec->getNameInitAsString()+
+                   "' already defined");
+    Records.addDef(std::move(CurRecOwner));
 
     if (ParseObjectBody(CurRec))
       return true;
@@ -2066,24 +2048,17 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) {
     // before this object, instantiated prior to defs derived from this object,
     // and this available for indirect name resolution when defs derived from
     // this object are instantiated.
-    if (ParseObjectBody(CurRec)) {
-      delete CurRec;
+    if (ParseObjectBody(CurRec))
       return true;
-    }
 
     // Otherwise, a def inside a multiclass, add it to the multiclass.
     for (unsigned i = 0, e = CurMultiClass->DefPrototypes.size(); i != e; ++i)
       if (CurMultiClass->DefPrototypes[i]->getNameInit()
-          == CurRec->getNameInit()) {
-        Error(DefLoc, "def '" + CurRec->getNameInitAsString() +
-              "' already defined in this multiclass!");
-        delete CurRec;
-        return true;
-      }
-    CurMultiClass->DefPrototypes.push_back(CurRec);
-    CurRecOwnershipTransferred = true;
+          == CurRec->getNameInit())
+        return Error(DefLoc, "def '" + CurRec->getNameInitAsString() +
+                     "' already defined in this multiclass!");
+    CurMultiClass->DefPrototypes.push_back(std::move(CurRecOwner));
   } else if (ParseObjectBody(CurRec)) {
-    delete CurRec;
     return true;
   }
 
@@ -2109,15 +2084,10 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) {
   }
 
   if (ProcessForeachDefs(CurRec, DefLoc)) {
-    Error(DefLoc,
-          "Could not process loops for def" + CurRec->getNameInitAsString());
-    if (!CurRecOwnershipTransferred)
-      delete CurRec;
-    return true;
+    return Error(DefLoc, "Could not process loops for def" +
+                 CurRec->getNameInitAsString());
   }
 
-  if (!CurRecOwnershipTransferred)
-    delete CurRec;
   return false;
 }
 
@@ -2193,8 +2163,10 @@ bool TGParser::ParseClass() {
                       + "' already defined");
   } else {
     // If this is the first reference to this class, create and add it.
-    CurRec = new Record(Lex.getCurStrVal(), Lex.getLoc(), Records);
-    Records.addClass(CurRec);
+    auto NewRec =
+        llvm::make_unique<Record>(Lex.getCurStrVal(), Lex.getLoc(), Records);
+    CurRec = NewRec.get();
+    Records.addClass(std::move(NewRec));
   }
   Lex.Lex(); // eat the name.
 
@@ -2312,11 +2284,14 @@ bool TGParser::ParseMultiClass() {
     return TokError("expected identifier after multiclass for name");
   std::string Name = Lex.getCurStrVal();
 
-  if (MultiClasses.count(Name))
+  auto Result =
+    MultiClasses.insert(std::make_pair(Name,
+                    llvm::make_unique<MultiClass>(Name, Lex.getLoc(),Records)));
+
+  if (!Result.second)
     return TokError("multiclass '" + Name + "' already defined");
 
-  CurMultiClass = MultiClasses[Name] = new MultiClass(Name, 
-                                                      Lex.getLoc(), Records);
+  CurMultiClass = Result.first->second.get();
   Lex.Lex();  // Eat the identifier.
 
   // If there are template args, parse them.
@@ -2352,25 +2327,24 @@ bool TGParser::ParseMultiClass() {
   if (Lex.getCode() != tgtok::l_brace) {
     if (!inherits)
       return TokError("expected '{' in multiclass definition");
-    else if (Lex.getCode() != tgtok::semi)
+    if (Lex.getCode() != tgtok::semi)
       return TokError("expected ';' in multiclass definition");
-    else
-      Lex.Lex();  // eat the ';'.
+    Lex.Lex();  // eat the ';'.
   } else {
     if (Lex.Lex() == tgtok::r_brace)  // eat the '{'.
       return TokError("multiclass must contain at least one def");
 
     while (Lex.getCode() != tgtok::r_brace) {
       switch (Lex.getCode()) {
-        default:
-          return TokError("expected 'let', 'def' or 'defm' in multiclass body");
-        case tgtok::Let:
-        case tgtok::Def:
-        case tgtok::Defm:
-        case tgtok::Foreach:
-          if (ParseObject(CurMultiClass))
-            return true;
-         break;
+      default:
+        return TokError("expected 'let', 'def' or 'defm' in multiclass body");
+      case tgtok::Let:
+      case tgtok::Def:
+      case tgtok::Defm:
+      case tgtok::Foreach:
+        if (ParseObject(CurMultiClass))
+          return true;
+        break;
       }
     }
     Lex.Lex();  // eat the '}'.
@@ -2416,22 +2390,21 @@ InstantiateMulticlassDef(MultiClass &MC,
   // Make a trail of SMLocs from the multiclass instantiations.
   SmallVector<SMLoc, 4> Locs(1, DefmPrefixRange.Start);
   Locs.append(DefProto->getLoc().begin(), DefProto->getLoc().end());
-  Record *CurRec = new Record(DefName, Locs, Records, IsAnonymous);
+  auto CurRec = make_unique<Record>(DefName, Locs, Records, IsAnonymous);
 
   SubClassReference Ref;
   Ref.RefRange = DefmPrefixRange;
   Ref.Rec = DefProto;
-  AddSubClass(CurRec, Ref);
+  AddSubClass(CurRec.get(), Ref);
 
   // Set the value for NAME. We don't resolve references to it 'til later,
   // though, so that uses in nested multiclass names don't get
   // confused.
-  if (SetValue(CurRec, Ref.RefRange.Start, "NAME", std::vector<unsigned>(),
-               DefmPrefix)) {
+  if (SetValue(CurRec.get(), Ref.RefRange.Start, "NAME",
+               std::vector<unsigned>(), DefmPrefix)) {
     Error(DefmPrefixRange.Start, "Could not resolve "
           + CurRec->getNameInitAsString() + ":NAME to '"
           + DefmPrefix->getAsUnquotedString() + "'");
-    delete CurRec;
     return nullptr;
   }
 
@@ -2463,14 +2436,17 @@ InstantiateMulticlassDef(MultiClass &MC,
       Error(DefmPrefixRange.Start, "def '" + CurRec->getNameInitAsString() +
             "' already defined, instantiating defm with subdef '" + 
             DefProto->getNameInitAsString() + "'");
-      delete CurRec;
       return nullptr;
     }
 
-    Records.addDef(CurRec);
+    Record *CurRecSave = CurRec.get(); // Keep a copy before we release.
+    Records.addDef(std::move(CurRec));
+    return CurRecSave;
   }
 
-  return CurRec;
+  // FIXME This is bad but the ownership transfer to caller is pretty messy.
+  // The unique_ptr in this function at least protects the exits above.
+  return CurRec.release();
 }
 
 bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC,
@@ -2526,7 +2502,7 @@ bool TGParser::ResolveMulticlassDef(MultiClass &MC,
         == CurRec->getNameInit())
       return Error(DefmPrefixLoc, "defm '" + CurRec->getNameInitAsString() +
                    "' already defined in this multiclass!");
-  CurMultiClass->DefPrototypes.push_back(CurRec);
+  CurMultiClass->DefPrototypes.push_back(std::unique_ptr<Record>(CurRec));
 
   // Copy the template arguments for the multiclass into the new def.
   const std::vector<Init *> &TA =
@@ -2576,7 +2552,7 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
     // To instantiate a multiclass, we need to first get the multiclass, then
     // instantiate each def contained in the multiclass with the SubClassRef
     // template parameters.
-    MultiClass *MC = MultiClasses[Ref.Rec->getName()];
+    MultiClass *MC = MultiClasses[Ref.Rec->getName()].get();
     assert(MC && "Didn't lookup multiclass correctly?");
     std::vector<Init*> &TemplateVals = Ref.TemplateArgs;
 
@@ -2588,7 +2564,7 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
 
     // Loop over all the def's in the multiclass, instantiating each one.
     for (unsigned i = 0, e = MC->DefPrototypes.size(); i != e; ++i) {
-      Record *DefProto = MC->DefPrototypes[i];
+      Record *DefProto = MC->DefPrototypes[i].get();
 
       Record *CurRec = InstantiateMulticlassDef(*MC, DefProto, DefmPrefix,
                                                 SMRange(DefmLoc,
diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h
index 79994cb..22a00e5 100644
--- a/lib/TableGen/TGParser.h
+++ b/lib/TableGen/TGParser.h
@@ -55,7 +55,7 @@ namespace llvm {
 class TGParser {
   TGLexer Lex;
   std::vector<std::vector<LetRecord> > LetStack;
-  std::map<std::string, MultiClass*> MultiClasses;
+  std::map<std::string, std::unique_ptr<MultiClass>> MultiClasses;
 
   /// Loops - Keep track of any foreach loops we are within.
   ///
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index e96d18b..21106c9 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -40,9 +40,6 @@ FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
 FunctionPass *createAArch64A57FPLoadBalancing();
 FunctionPass *createAArch64A53Fix835769();
-/// \brief Creates an ARM-specific Target Transformation Info pass.
-ImmutablePass *
-createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM);
 
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e6a27c3..dff48f9 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -91,6 +91,8 @@ def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
 
 def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
 def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
+// FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
+def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
 def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp
index 852a635..dd401c6 100644
--- a/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -16,8 +16,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
-#include "AArch64InstrInfo.h"
-#include "AArch64Subtarget.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -26,6 +24,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
@@ -79,7 +78,7 @@ static bool isSecondInstructionInSequence(MachineInstr *MI) {
 
 namespace {
 class AArch64A53Fix835769 : public MachineFunctionPass {
-  const AArch64InstrInfo *TII;
+  const TargetInstrInfo *TII;
 
 public:
   static char ID;
@@ -107,17 +106,13 @@ char AArch64A53Fix835769::ID = 0;
 
 bool
 AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) {
-  const TargetMachine &TM = F.getTarget();
-
-  bool Changed = false;
   DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n");
-
-  TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  bool Changed = false;
+  TII = F.getSubtarget().getInstrInfo();
 
   for (auto &MBB : F) {
     Changed |= runOnBasicBlock(MBB);
   }
-
   return Changed;
 }
 
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 2503764..2cf3c22 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -38,8 +38,8 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -96,6 +96,10 @@ static bool isMla(MachineInstr *MI) {
   }
 }
 
+namespace llvm {
+static void initializeAArch64A57FPLoadBalancingPass(PassRegistry &);
+}
+
 //===----------------------------------------------------------------------===//
 
 namespace {
@@ -109,14 +113,15 @@ static const char *ColorNames[2] = { "Even", "Odd" };
 class Chain;
 
 class AArch64A57FPLoadBalancing : public MachineFunctionPass {
-  const AArch64InstrInfo *TII;
   MachineRegisterInfo *MRI;
   const TargetRegisterInfo *TRI;
   RegisterClassInfo RCI;
 
 public:
   static char ID;
-  explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {}
+  explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {
+    initializeAArch64A57FPLoadBalancingPass(*PassRegistry::getPassRegistry());
+  }
 
   bool runOnMachineFunction(MachineFunction &F) override;
 
@@ -143,8 +148,16 @@ private:
   Color getColor(unsigned Register);
   Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L);
 };
+}
+
 char AArch64A57FPLoadBalancing::ID = 0;
 
+INITIALIZE_PASS_BEGIN(AArch64A57FPLoadBalancing, DEBUG_TYPE,
+                      "AArch64 A57 FP Load-Balancing", false, false)
+INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE,
+                    "AArch64 A57 FP Load-Balancing", false, false)
+
+namespace {
 /// A Chain is a sequence of instructions that are linked together by 
 /// an accumulation operand. For example:
 ///
@@ -259,7 +272,7 @@ public:
   }
 
   /// Return true if this chain starts before Other.
-  bool startsBefore(Chain *Other) {
+  bool startsBefore(const Chain *Other) const {
     return StartInstIdx < Other->StartInstIdx;
   }
 
@@ -297,10 +310,8 @@ bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
   bool Changed = false;
   DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");
 
-  const TargetMachine &TM = F.getTarget();
   MRI = &F.getRegInfo();
   TRI = F.getRegInfo().getTargetRegisterInfo();
-  TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo();
   RCI.runOnMachineFunction(F);
 
   for (auto &MBB : F) {
@@ -431,10 +442,17 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
   // chains that we cannot change before we look at those we can,
   // so the parity counter is updated and we know what color we should
   // change them to!
+  // Final tie-break with instruction order so pass output is stable (i.e. not
+  // dependent on malloc'd pointer values).
   std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) {
       if (G1->size() != G2->size())
         return G1->size() > G2->size();
-      return G1->requiresFixup() > G2->requiresFixup();
+      if (G1->requiresFixup() != G2->requiresFixup())
+        return G1->requiresFixup() > G2->requiresFixup();
+      // Make sure startsBefore() produces a stable final order.
+      assert((G1 == G2 || (G1->startsBefore(G2) ^ G2->startsBefore(G1))) &&
+             "Starts before not total order!");
+      return G1->startsBefore(G2);
     });
 
   Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
@@ -481,10 +499,16 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
     RS.forward(I);
     AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));
 
-    // Remove any registers clobbered by a regmask.
+    // Remove any registers clobbered by a regmask or any def register that is
+    // immediately dead.
     for (auto J : I->operands()) {
       if (J.isRegMask())
         AvailableRegs.clearBitsNotInMask(J.getRegMask());
+
+      if (J.isReg() && J.isDef() && AvailableRegs[J.getReg()]) {
+        assert(J.isDead() && "Non-dead def should have been removed by now!");
+        AvailableRegs.reset(J.getReg());
+      }
     }
   }
 
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 5afe0f4..f27dfc9 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -64,7 +64,7 @@ STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
 namespace {
 class AArch64AdvSIMDScalar : public MachineFunctionPass {
   MachineRegisterInfo *MRI;
-  const AArch64InstrInfo *TII;
+  const TargetInstrInfo *TII;
 
 private:
   // isProfitableToTransform - Predicate function to determine whether an
@@ -268,7 +268,7 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
   return TransformAll;
 }
 
-static MachineInstr *insertCopy(const AArch64InstrInfo *TII, MachineInstr *MI,
+static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI,
                                 unsigned Dst, unsigned Src, bool IsKill) {
   MachineInstrBuilder MIB =
       BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY),
@@ -376,10 +376,8 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
   bool Changed = false;
   DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
 
-  const TargetMachine &TM = mf.getTarget();
   MRI = &mf.getRegInfo();
-  TII = static_cast<const AArch64InstrInfo *>(
-      TM.getSubtargetImpl()->getInstrInfo());
+  TII = mf.getSubtarget().getInstrInfo();
 
   // Just check things on a one-block-at-a-time basis.
   for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 8bee4f5..d64d851 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -43,19 +43,13 @@ using namespace llvm;
 namespace {
 
 class AArch64AsmPrinter : public AsmPrinter {
-  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
-  /// make the right decision when printing asm code for different targets.
-  const AArch64Subtarget *Subtarget;
-
   AArch64MCInstLower MCInstLowering;
   StackMaps SM;
 
 public:
-  AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : AsmPrinter(TM, Streamer),
-        Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
-        MCInstLowering(OutContext, *this), SM(*this), AArch64FI(nullptr),
-        LOHLabelCounter(0) {}
+  AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this),
+        SM(*this), AArch64FI(nullptr), LOHLabelCounter(0) {}
 
   const char *getPassName() const override {
     return "AArch64 Assembly Printer";
@@ -124,7 +118,8 @@ private:
 //===----------------------------------------------------------------------===//
 
 void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
-  if (Subtarget->isTargetMachO()) {
+  Triple TT(TM.getTargetTriple());
+  if (TT.isOSBinFormatMachO()) {
     // Funny Darwin hack: This flag tells the linker that no global symbols
     // contain code that falls through to other global symbols (e.g. the obvious
     // implementation of multiple entry points).  If this doesn't occur, the
@@ -135,7 +130,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
   }
 
   // Emit a .data.rel section containing any stubs that were created.
-  if (Subtarget->isTargetELF()) {
+  if (TT.isOSBinFormatELF()) {
     const TargetLoweringObjectFileELF &TLOFELF =
       static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
 
@@ -145,7 +140,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
     MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
     if (!Stubs.empty()) {
       OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
-      const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+      const DataLayout *TD = TM.getDataLayout();
 
       for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
         OutStreamer.EmitLabel(Stubs[i].first);
@@ -252,8 +247,8 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
                                            const TargetRegisterClass *RC,
                                            bool isVector, raw_ostream &O) {
   assert(MO.isReg() && "Should only get here with a register!");
-  const AArch64RegisterInfo *RI = static_cast<const AArch64RegisterInfo *>(
-      TM.getSubtargetImpl()->getRegisterInfo());
+  const AArch64RegisterInfo *RI =
+      MF->getSubtarget<AArch64Subtarget>().getRegisterInfo();
   unsigned Reg = MO.getReg();
   unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
   assert(RI->regsOverlap(RegToPrint, Reg));
@@ -381,8 +376,23 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
   unsigned NumNOPBytes = MI.getOperand(1).getImm();
 
   SM.recordStackMap(MI);
-  // Emit padding.
   assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+
+  // Scan ahead to trim the shadow.
+  const MachineBasicBlock &MBB = *MI.getParent();
+  MachineBasicBlock::const_iterator MII(MI);
+  ++MII;
+  while (NumNOPBytes > 0) {
+    if (MII == MBB.end() || MII->isCall() ||
+        MII->getOpcode() == AArch64::DBG_VALUE ||
+        MII->getOpcode() == TargetOpcode::PATCHPOINT ||
+        MII->getOpcode() == TargetOpcode::STACKMAP)
+      break;
+    ++MII;
+    NumNOPBytes -= 4;
+  }
+
+  // Emit nops.
   for (unsigned i = 0; i < NumNOPBytes; i += 4)
     EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
 }
diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
index e2b6367..d973234 100644
--- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp
+++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
@@ -476,9 +476,7 @@ bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
 
   DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n");
 
-  TII = (const AArch64InstrInfo *)MF->getTarget()
-            .getSubtargetImpl()
-            ->getInstrInfo();
+  TII = (const AArch64InstrInfo *)MF->getSubtarget().getInstrInfo();
 
   // Renumber all of the machine basic blocks in the function, guaranteeing that
   // the numbers agree with the position of the block in the function.
diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h
new file mode 100644
index 0000000..1e2d1c3
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CallingConvention.h
@@ -0,0 +1,141 @@
+//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the AArch64 Calling Convention
+// that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace {
+using namespace llvm;
+
+static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
+                                    AArch64::X3, AArch64::X4, AArch64::X5,
+                                    AArch64::X6, AArch64::X7};
+static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2,
+                                    AArch64::H3, AArch64::H4, AArch64::H5,
+                                    AArch64::H6, AArch64::H7};
+static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
+                                    AArch64::S3, AArch64::S4, AArch64::S5,
+                                    AArch64::S6, AArch64::S7};
+static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
+                                    AArch64::D3, AArch64::D4, AArch64::D5,
+                                    AArch64::D6, AArch64::D7};
+static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
+                                    AArch64::Q3, AArch64::Q4, AArch64::Q5,
+                                    AArch64::Q6, AArch64::Q7};
+
+static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
+                             MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
+                             CCState &State, unsigned SlotAlign) {
+  unsigned Size = LocVT.getSizeInBits() / 8;
+  unsigned StackAlign = State.getMachineFunction()
+                            .getTarget()
+                            .getDataLayout()
+                            ->getStackAlignment();
+  unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign);
+
+  for (auto &It : PendingMembers) {
+    It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign)));
+    State.addLoc(It);
+    SlotAlign = 1;
+  }
+
+  // All pending members have now been allocated
+  PendingMembers.clear();
+  return true;
+}
+
+/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots. An
+/// [N x Ty] type must still be contiguous in memory though.
+static bool CC_AArch64_Custom_Stack_Block(
+      unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+      ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+  // Add the argument to the list to be allocated once we know the size of the
+  // block.
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8);
+}
+
+/// Given an [N x Ty] block, it should be passed in a consecutive sequence of
+/// registers. If no such sequence is available, mark the rest of the registers
+/// of that type as used and place the argument on the stack.
+static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // Try to allocate a contiguous block of registers, each of the correct
+  // size to hold one member.
+  ArrayRef<uint16_t> RegList;
+  if (LocVT.SimpleTy == MVT::i64)
+    RegList = XRegList;
+  else if (LocVT.SimpleTy == MVT::f16)
+    RegList = HRegList;
+  else if (LocVT.SimpleTy == MVT::f32 || LocVT.is32BitVector())
+    RegList = SRegList;
+  else if (LocVT.SimpleTy == MVT::f64 || LocVT.is64BitVector())
+    RegList = DRegList;
+  else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
+    RegList = QRegList;
+  else {
+    // Not an array we want to split up after all.
+    return false;
+  }
+
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+  // Add the argument to the list to be allocated once we know the size of the
+  // block.
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+  if (RegResult) {
+    for (auto &It : PendingMembers) {
+      It.convertToReg(RegResult);
+      State.addLoc(It);
+      ++RegResult;
+    }
+    PendingMembers.clear();
+    return true;
+  }
+
+  // Mark all regs in the class as unavailable
+  for (auto Reg : RegList)
+    State.AllocateReg(Reg);
+
+  const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+      State.getMachineFunction().getSubtarget());
+  unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
+
+  return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
+}
+
+}
+
+#endif
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 9e707e4..4691e94 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -16,7 +16,7 @@ class CCIfAlign<string Align, CCAction A> :
   CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
 /// CCIfBigEndian - Match only if we're in big endian mode.
 class CCIfBigEndian<CCAction A> :
-  CCIf<"State.getMachineFunction().getSubtarget().getDataLayout()->isBigEndian()", A>;
+  CCIf<"State.getMachineFunction().getTarget().getDataLayout()->isBigEndian()", A>;
 
 //===----------------------------------------------------------------------===//
 // ARM AAPCS64 Calling Convention
@@ -40,6 +40,8 @@ def CC_AArch64_AAPCS : CallingConv<[
   // slot is 64-bit.
   CCIfByVal<CCPassByVal<8, 8>>,
 
+  CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
   // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
   // up to eight each of GPR and FPR.
   CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -119,6 +121,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
   // slot is 64-bit.
   CCIfByVal<CCPassByVal<8, 8>>,
 
+  CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
   // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
   // up to eight each of GPR and FPR.
   CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -159,6 +163,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
 
+  CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Stack_Block">>,
+
   // Handle all scalar types as either i64 or f64.
   CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
   CCIfType<[f16, f32],     CCPromoteToType<f64>>,
@@ -198,6 +204,44 @@ def RetCC_AArch64_WebKit_JS : CallingConv<[
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
 ]>;
 
+//===----------------------------------------------------------------------===//
+// ARM64 Calling Convention for GHC
+//===----------------------------------------------------------------------===//
+
+// This calling convention is specific to the Glasgow Haskell Compiler.
+// The only documentation is the GHC source code, specifically the C header
+// file:
+//
+//     https://github.com/ghc/ghc/blob/master/includes/stg/MachRegs.h
+//
+// which defines the registers for the Spineless Tagless G-Machine (STG) that
+// GHC uses to implement lazy evaluation. The generic STG machine has a set of
+// registers which are mapped to appropriate set of architecture specific
+// registers for each CPU architecture.
+//
+// The STG Machine is documented here:
+//
+//    https://ghc.haskell.org/trac/ghc/wiki/Commentary/Compiler/GeneratedCode
+//
+// The AArch64 register mapping is under the heading "The ARMv8/AArch64 ABI
+// register mapping".
+
+def CC_AArch64_GHC : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, f128], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+  CCIfType<[f32], CCAssignToReg<[S8, S9, S10, S11]>>,
+  CCIfType<[f64], CCAssignToReg<[D12, D13, D14, D15]>>,
+
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+  CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
+]>;
+
 // FIXME: LR is only callee-saved in the sense that *we* preserve it and are
 // presumably a callee to someone. External functions may not do so, but this
 // is currently safe since BL has LR as an implicit-def and what happens after a
@@ -243,3 +287,4 @@ def CSR_AArch64_AllRegs
                            (sequence "S%u", 0, 31), (sequence "D%u", 0, 31),
                            (sequence "Q%u", 0, 31))>;
 
+def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>;
diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index aab8e38..3b74481 100644
--- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -92,9 +92,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
   MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I,
                                        unsigned TLSBaseAddrReg) {
     MachineFunction *MF = I->getParent()->getParent();
-    const AArch64TargetMachine *TM =
-        static_cast<const AArch64TargetMachine *>(&MF->getTarget());
-    const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
+    const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
     // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
     // code sequence assumes the address will be.
@@ -112,9 +110,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
   // inserting a copy instruction after I. Returns the new instruction.
   MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
     MachineFunction *MF = I->getParent()->getParent();
-    const AArch64TargetMachine *TM =
-        static_cast<const AArch64TargetMachine *>(&MF->getTarget());
-    const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
+    const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
     // Create a virtual register for the TLS base address.
     MachineRegisterInfo &RegInfo = MF->getRegInfo();
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 87b545b..938dcb3 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -285,9 +285,7 @@ static void initReachingDef(MachineFunction &MF,
                             BlockToSetOfInstrsPerColor &ReachableUses,
                             const MapRegToId &RegToId,
                             const MachineInstr *DummyOp, bool ADRPMode) {
-  const TargetMachine &TM = MF.getTarget();
-  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
-
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   unsigned NbReg = RegToId.size();
 
   for (MachineBasicBlock &MBB : MF) {
@@ -1026,8 +1024,7 @@ static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId,
 }
 
 bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
-  const TargetMachine &TM = MF.getTarget();
-  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
 
   MapRegToId RegToId;
@@ -1043,8 +1040,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
 
   MachineInstr *DummyOp = nullptr;
   if (BasicBlockScopeOnly) {
-    const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>(
-        TM.getSubtargetImpl()->getInstrInfo());
+    const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
     // For local analysis, create a dummy operation to record uses that are not
     // local.
     DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc());
diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 0fbd3c6..e68571f 100644
--- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -304,7 +304,7 @@ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI,
 bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
   DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
                << "********** Function: " << MF.getName() << '\n');
-  TII = MF.getTarget().getSubtargetImpl()->getInstrInfo();
+  TII = MF.getSubtarget().getInstrInfo();
   DomTree = &getAnalysis<MachineDominatorTree>();
 
   bool Changed = false;
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 54f53dc..fccd8df 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -893,15 +893,13 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
                << "********** Function: " << MF.getName() << '\n');
   TII = MF.getSubtarget().getInstrInfo();
   TRI = MF.getSubtarget().getRegisterInfo();
-  SchedModel =
-      MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
+  SchedModel = MF.getSubtarget().getSchedModel();
   MRI = &MF.getRegInfo();
   DomTree = &getAnalysis<MachineDominatorTree>();
   Loops = getAnalysisIfAvailable<MachineLoopInfo>();
   Traces = &getAnalysis<MachineTraceMetrics>();
   MinInstr = nullptr;
-  MinSize = MF.getFunction()->getAttributes().hasAttribute(
-      AttributeSet::FunctionIndex, Attribute::MinSize);
+  MinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
 
   bool Changed = false;
   CmpConv.runOnMachineFunction(MF);
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index c850680..41b1132 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -229,7 +229,7 @@ static bool isStartChunk(uint64_t Chunk) {
   if (Chunk == 0 || Chunk == UINT64_MAX)
     return false;
 
-  return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64;
+  return isMask_64(~Chunk);
 }
 
 /// \brief Check whether this chunk matches the pattern '0...1...' This pattern
@@ -239,7 +239,7 @@ static bool isEndChunk(uint64_t Chunk) {
   if (Chunk == 0 || Chunk == UINT64_MAX)
     return false;
 
-  return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64;
+  return isMask_64(Chunk);
 }
 
 /// \brief Clear or set all bits in the chunk at the given index.
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 612cb00..61017c1 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
+#include "AArch64CallingConvention.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
@@ -244,9 +245,10 @@ public:
   unsigned fastMaterializeFloatZero(const ConstantFP* CF) override;
 
   explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo,
-                         const TargetLibraryInfo *LibInfo)
+                           const TargetLibraryInfo *LibInfo)
       : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) {
-    Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+    Subtarget =
+        &static_cast<const AArch64Subtarget &>(FuncInfo.MF->getSubtarget());
     Context = &FuncInfo.Fn->getContext();
   }
 
@@ -301,6 +303,8 @@ static unsigned getImplicitScaleFactor(MVT VT) {
 CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
   if (CC == CallingConv::WebKit_JS)
     return CC_AArch64_WebKit_JS;
+  if (CC == CallingConv::GHC)
+    return CC_AArch64_GHC;
   return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS;
 }
 
@@ -366,6 +370,24 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
     return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
   }
 
+  // For the MachO large code model materialize the FP constant in code.
+  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+    unsigned Opc1 = Is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm;
+    const TargetRegisterClass *RC = Is64Bit ?
+        &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+    unsigned TmpReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc1), TmpReg)
+        .addImm(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(TmpReg, getKillRegState(true));
+
+    return ResultReg;
+  }
+
   // Materialize via constant pool.  MachineConstantPool wants an explicit
   // alignment.
   unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
@@ -752,7 +774,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
     if (Addr.getOffsetReg())
       break;
 
-    if (DL.getTypeSizeInBits(Ty) != 8)
+    if (!Ty || DL.getTypeSizeInBits(Ty) != 8)
       break;
 
     const Value *LHS = U->getOperand(0);
@@ -2112,15 +2134,15 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
 
   int TestBit = -1;
   bool IsCmpNE;
-  if ((Predicate == CmpInst::ICMP_EQ) || (Predicate == CmpInst::ICMP_NE)) {
-    if (const auto *C = dyn_cast<Constant>(LHS))
-      if (C->isNullValue())
-        std::swap(LHS, RHS);
-
-    if (!isa<Constant>(RHS))
-      return false;
+  switch (Predicate) {
+  default:
+    return false;
+  case CmpInst::ICMP_EQ:
+  case CmpInst::ICMP_NE:
+    if (isa<Constant>(LHS) && cast<Constant>(LHS)->isNullValue())
+      std::swap(LHS, RHS);
 
-    if (!cast<Constant>(RHS)->isNullValue())
+    if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue())
       return false;
 
     if (const auto *AI = dyn_cast<BinaryOperator>(LHS))
@@ -2143,26 +2165,27 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
       TestBit = 0;
 
     IsCmpNE = Predicate == CmpInst::ICMP_NE;
-  } else if (Predicate == CmpInst::ICMP_SLT) {
-    if (!isa<Constant>(RHS))
-      return false;
-
-    if (!cast<Constant>(RHS)->isNullValue())
+    break;
+  case CmpInst::ICMP_SLT:
+  case CmpInst::ICMP_SGE:
+    if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue())
       return false;
 
     TestBit = BW - 1;
-    IsCmpNE = true;
-  } else if (Predicate == CmpInst::ICMP_SGT) {
+    IsCmpNE = Predicate == CmpInst::ICMP_SLT;
+    break;
+  case CmpInst::ICMP_SGT:
+  case CmpInst::ICMP_SLE:
     if (!isa<ConstantInt>(RHS))
       return false;
 
-    if (cast<ConstantInt>(RHS)->getValue() != -1)
+    if (cast<ConstantInt>(RHS)->getValue() != APInt(BW, -1, true))
       return false;
 
     TestBit = BW - 1;
-    IsCmpNE = false;
-  } else
-    return false;
+    IsCmpNE = Predicate == CmpInst::ICMP_SLE;
+    break;
+  } // end switch
 
   static const unsigned OpcTable[2][2][2] = {
     { {AArch64::CBZW,  AArch64::CBZX },
@@ -3302,8 +3325,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     MFI->setFrameAddressIsTaken(true);
 
     const AArch64RegisterInfo *RegInfo =
-        static_cast<const AArch64RegisterInfo *>(
-            TM.getSubtargetImpl()->getRegisterInfo());
+        static_cast<const AArch64RegisterInfo *>(Subtarget->getRegisterInfo());
     unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
     unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index a7779d6..84bf317 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -64,8 +64,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
     return false;
   // Don't use the red zone if the function explicitly asks us not to.
   // This is typically used for kernel code.
-  if (MF.getFunction()->getAttributes().hasAttribute(
-          AttributeSet::FunctionIndex, Attribute::NoRedZone))
+  if (MF.getFunction()->hasFnAttribute(Attribute::NoRedZone))
     return false;
 
   const MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -167,7 +166,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
   if (CSI.empty())
     return;
 
-  const DataLayout *TD = MF.getSubtarget().getDataLayout();
+  const DataLayout *TD = MF.getTarget().getDataLayout();
   bool HasFP = hasFP(MF);
 
   // Calculate amount of bytes used for return address storing.
@@ -196,7 +195,8 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
     unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
         nullptr, DwarfReg, Offset - TotalSkipped));
     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex);
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
   }
 }
 
@@ -214,6 +214,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
   bool HasFP = hasFP(MF);
   DebugLoc DL = MBB.findDebugLoc(MBBI);
 
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+    return;
+
   int NumBytes = (int)MFI->getStackSize();
   if (!AFI->hasStackFrame()) {
     assert(!HasFP && "unexpected function without stack frame but with FP");
@@ -234,7 +239,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
     } else if (NumBytes) {
       ++NumRedZoneFunctions;
     }
@@ -301,7 +307,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
     TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false);
 
   if (needsFrameMoves) {
-    const DataLayout *TD = MF.getSubtarget().getDataLayout();
+    const DataLayout *TD = MF.getTarget().getDataLayout();
     const int StackGrowth = -TD->getPointerSize(0);
     unsigned FramePtr = RegInfo->getFrameRegister(MF);
 
@@ -377,26 +383,30 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
 
       // Record the location of the stored LR
       unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true);
       CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
 
       // Record the location of the stored FP
       CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
     } else {
       // Encode the stack size of the leaf function.
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
     }
 
     // Now emit the moves for whatever callee saved regs we have.
@@ -445,6 +455,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   int NumBytes = MFI->getStackSize();
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
 
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+    return;
+
   // Initial and residual are named for consitency with the prologue. Note that
   // in the epilogue, the residual adjustment is executed first.
   uint64_t ArgumentPopSize = 0;
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 87a6d80..ac11c4d 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -53,12 +53,10 @@ public:
   }
 
   bool runOnMachineFunction(MachineFunction &MF) override {
-    AttributeSet FnAttrs = MF.getFunction()->getAttributes();
     ForCodeSize =
-        FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                             Attribute::OptimizeForSize) ||
-        FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
-    Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+        MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) ||
+        MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+    Subtarget = &MF.getSubtarget<AArch64Subtarget>();
     return SelectionDAGISel::runOnMachineFunction(MF);
   }
 
@@ -134,8 +132,8 @@ public:
 
   /// Generic helper for the createDTuple/createQTuple
   /// functions. Those should almost always be called instead.
-  SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
-                      unsigned SubRegs[]);
+  SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
+                      const unsigned SubRegs[]);
 
   SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
 
@@ -569,6 +567,27 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
   return isWorthFolding(N);
 }
 
+/// If there's a use of this ADDlow that's not itself a load/store then we'll
+/// need to create a real ADD instruction from it anyway and there's no point in
+/// folding it into the mem op. Theoretically, it shouldn't matter, but there's
+/// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
+/// leads to duplaicated ADRP instructions.
+static bool isWorthFoldingADDlow(SDValue N) {
+  for (auto Use : N->uses()) {
+    if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
+        Use->getOpcode() != ISD::ATOMIC_LOAD &&
+        Use->getOpcode() != ISD::ATOMIC_STORE)
+      return false;
+
+    // ldar and stlr have much more restrictive addressing modes (just a
+    // register).
+    if (cast<MemSDNode>(Use)->getOrdering() > Monotonic)
+      return false;
+  }
+
+  return true;
+}
+
 /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
 /// immediate" address.  The "Size" argument is the size in bytes of the memory
 /// reference, which determines the scale.
@@ -582,7 +601,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
     return true;
   }
 
-  if (N.getOpcode() == AArch64ISD::ADDlow) {
+  if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
     GlobalAddressSDNode *GAN =
         dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
     Base = N.getOperand(0);
@@ -594,7 +613,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
     unsigned Alignment = GV->getAlignment();
     const DataLayout *DL = TLI->getDataLayout();
     Type *Ty = GV->getType()->getElementType();
-    if (Alignment == 0 && Ty->isSized() && !Subtarget->isTargetDarwin())
+    if (Alignment == 0 && Ty->isSized())
       Alignment = DL->getABITypeAlignment(Ty);
 
     if (Alignment >= Size)
@@ -869,26 +888,26 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
 }
 
 SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
-  static unsigned RegClassIDs[] = {
+  static const unsigned RegClassIDs[] = {
       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
-  static unsigned SubRegs[] = { AArch64::dsub0, AArch64::dsub1,
-                                AArch64::dsub2, AArch64::dsub3 };
+  static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
+                                     AArch64::dsub2, AArch64::dsub3};
 
   return createTuple(Regs, RegClassIDs, SubRegs);
 }
 
 SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
-  static unsigned RegClassIDs[] = {
+  static const unsigned RegClassIDs[] = {
       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
-  static unsigned SubRegs[] = { AArch64::qsub0, AArch64::qsub1,
-                                AArch64::qsub2, AArch64::qsub3 };
+  static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
+                                     AArch64::qsub2, AArch64::qsub3};
 
   return createTuple(Regs, RegClassIDs, SubRegs);
 }
 
 SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
-                                         unsigned RegClassIDs[],
-                                         unsigned SubRegs[]) {
+                                         const unsigned RegClassIDs[],
+                                         const unsigned SubRegs[]) {
   // There's no special register-class for a vector-list of 1 element: it's just
   // a vector.
   if (Regs.size() == 1)
@@ -1033,13 +1052,10 @@ SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs,
   EVT VT = N->getValueType(0);
   SDValue Chain = N->getOperand(0);
 
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(N->getOperand(2)); // Mem operand;
-  Ops.push_back(Chain);
+  SDValue Ops[] = {N->getOperand(2), // Mem operand;
+                   Chain};
 
-  std::vector<EVT> ResTys;
-  ResTys.push_back(MVT::Untyped);
-  ResTys.push_back(MVT::Other);
+  EVT ResTys[] = {MVT::Untyped, MVT::Other};
 
   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
   SDValue SuperReg = SDValue(Ld, 0);
@@ -1057,15 +1073,12 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
   EVT VT = N->getValueType(0);
   SDValue Chain = N->getOperand(0);
 
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(N->getOperand(1)); // Mem operand
-  Ops.push_back(N->getOperand(2)); // Incremental
-  Ops.push_back(Chain);
+  SDValue Ops[] = {N->getOperand(1), // Mem operand
+                   N->getOperand(2), // Incremental
+                   Chain};
 
-  std::vector<EVT> ResTys;
-  ResTys.push_back(MVT::i64); // Type of the write back register
-  ResTys.push_back(MVT::Untyped);
-  ResTys.push_back(MVT::Other);
+  EVT ResTys[] = {MVT::i64, // Type of the write back register
+                  MVT::Untyped, MVT::Other};
 
   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
 
@@ -1096,10 +1109,7 @@ SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
   SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
 
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(N->getOperand(NumVecs + 2));
-  Ops.push_back(N->getOperand(0));
+  SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
   SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
 
   return St;
@@ -1109,20 +1119,18 @@ SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
                                              unsigned Opc) {
   SDLoc dl(N);
   EVT VT = N->getOperand(2)->getValueType(0);
-  SmallVector<EVT, 2> ResTys;
-  ResTys.push_back(MVT::i64);   // Type of the write back register
-  ResTys.push_back(MVT::Other); // Type for the Chain
+  EVT ResTys[] = {MVT::i64,    // Type of the write back register
+                  MVT::Other}; // Type for the Chain
 
   // Form a REG_SEQUENCE to force register allocation.
   bool Is128Bit = VT.getSizeInBits() == 128;
   SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
   SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
 
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(N->getOperand(NumVecs + 1)); // base register
-  Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental
-  Ops.push_back(N->getOperand(0)); // Chain
+  SDValue Ops[] = {RegSeq,
+                   N->getOperand(NumVecs + 1), // base register
+                   N->getOperand(NumVecs + 2), // Incremental
+                   N->getOperand(0)};          // Chain
   SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
 
   return St;
@@ -1176,18 +1184,13 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
 
   SDValue RegSeq = createQTuple(Regs);
 
-  std::vector<EVT> ResTys;
-  ResTys.push_back(MVT::Untyped);
-  ResTys.push_back(MVT::Other);
+  EVT ResTys[] = {MVT::Untyped, MVT::Other};
 
   unsigned LaneNo =
       cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
 
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
-  Ops.push_back(N->getOperand(NumVecs + 3));
-  Ops.push_back(N->getOperand(0));
+  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, MVT::i64),
+                   N->getOperand(NumVecs + 3), N->getOperand(0)};
   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
   SDValue SuperReg = SDValue(Ld, 0);
 
@@ -1221,20 +1224,17 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
 
   SDValue RegSeq = createQTuple(Regs);
 
-  std::vector<EVT> ResTys;
-  ResTys.push_back(MVT::i64); // Type of the write back register
-  ResTys.push_back(MVT::Untyped);
-  ResTys.push_back(MVT::Other);
+  EVT ResTys[] = {MVT::i64, // Type of the write back register
+                  MVT::Untyped, MVT::Other};
 
   unsigned LaneNo =
       cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
 
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number
-  Ops.push_back(N->getOperand(NumVecs + 2)); // Base register
-  Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental
-  Ops.push_back(N->getOperand(0));
+  SDValue Ops[] = {RegSeq,
+                   CurDAG->getTargetConstant(LaneNo, MVT::i64), // Lane Number
+                   N->getOperand(NumVecs + 2),                  // Base register
+                   N->getOperand(NumVecs + 3),                  // Incremental
+                   N->getOperand(0)};
   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
 
   // Update uses of the write back register
@@ -1282,11 +1282,8 @@ SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
   unsigned LaneNo =
       cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
 
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
-  Ops.push_back(N->getOperand(NumVecs + 3));
-  Ops.push_back(N->getOperand(0));
+  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, MVT::i64),
+                   N->getOperand(NumVecs + 3), N->getOperand(0)};
   SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
 
   // Transfer memoperands.
@@ -1312,19 +1309,16 @@ SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
 
   SDValue RegSeq = createQTuple(Regs);
 
-  SmallVector<EVT, 2> ResTys;
-  ResTys.push_back(MVT::i64);   // Type of the write back register
-  ResTys.push_back(MVT::Other);
+  EVT ResTys[] = {MVT::i64, // Type of the write back register
+                  MVT::Other};
 
   unsigned LaneNo =
       cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
 
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
-  Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register
-  Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental
-  Ops.push_back(N->getOperand(0));
+  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, MVT::i64),
+                   N->getOperand(NumVecs + 2), // Base Register
+                   N->getOperand(NumVecs + 3), // Incremental
+                   N->getOperand(0)};
   SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
 
   // Transfer memoperands.
@@ -1403,12 +1397,17 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
   } else
     return false;
 
-  assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) &&
-         "bad amount in shift node!");
+  // Bail out on large immediates. This happens when no proper
+  // combining/constant folding was performed.
+  if (!BiggerPattern && (Srl_imm <= 0 || Srl_imm >= VT.getSizeInBits())) {
+    DEBUG((dbgs() << N
+           << ": Found large shift immediate, this should not happen\n"));
+    return false;
+  }
 
   LSB = Srl_imm;
-  MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm)
-                                  : CountTrailingOnes_64(And_imm)) -
+  MSB = Srl_imm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(And_imm)
+                                  : countTrailingOnes<uint64_t>(And_imm)) -
         1;
   if (ClampMSB)
     // Since we're moving the extend before the right shift operation, we need
@@ -1452,7 +1451,7 @@ static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
     return false;
 
   // Check whether we really have several bits extract here.
-  unsigned BitWide = 64 - CountLeadingOnes_64(~(And_mask >> Srl_imm));
+  unsigned BitWide = 64 - countLeadingOnes(~(And_mask >> Srl_imm));
   if (BitWide && isMask_64(And_mask >> Srl_imm)) {
     if (N->getValueType(0) == MVT::i32)
       Opc = AArch64::UBFMWri;
@@ -1508,7 +1507,14 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
   } else
     return false;
 
-  assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!");
+  // Missing combines/constant folding may have left us with strange
+  // constants.
+  if (Shl_imm >= VT.getSizeInBits()) {
+    DEBUG((dbgs() << N
+           << ": Found large shift immediate, this should not happen\n"));
+    return false;
+  }
+
   uint64_t Srl_imm = 0;
   if (!isIntImmediate(N->getOperand(1), Srl_imm))
     return false;
@@ -1851,7 +1857,7 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
     return false;
 
   ShiftAmount = countTrailingZeros(NonZeroBits);
-  MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount);
+  MaskWidth = countTrailingOnes(NonZeroBits >> ShiftAmount);
 
   // BFI encompasses sufficiently many nodes that it's worth inserting an extra
   // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
@@ -2229,11 +2235,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
       SDValue MemAddr = Node->getOperand(4);
 
       // Place arguments in the right order.
-      SmallVector<SDValue, 7> Ops;
-      Ops.push_back(ValLo);
-      Ops.push_back(ValHi);
-      Ops.push_back(MemAddr);
-      Ops.push_back(Chain);
+      SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain};
 
       SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
       // Transfer memoperands.
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7c94d83..a1b324e 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64ISelLowering.h"
+#include "AArch64CallingConvention.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64PerfectShuffle.h"
 #include "AArch64Subtarget.h"
@@ -66,10 +67,9 @@ EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
                          cl::desc("Allow AArch64 SLI/SRI formation"),
                          cl::init(false));
 
-
-AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
-    : TargetLowering(TM) {
-  Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
+                                             const AArch64Subtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
 
   // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
   // we have to make something up. Arbitrarily, choose ZeroOrOne.
@@ -111,7 +111,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
   }
 
   // Compute derived properties from the register classes
-  computeRegisterProperties();
+  computeRegisterProperties(Subtarget->getRegisterInfo());
 
   // Provide all sorts of operation actions
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
@@ -386,13 +386,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
   }
 
+  // Make floating-point constants legal for the large code model, so they don't
+  // become loads from the constant pool.
+  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+  }
+
   // AArch64 does not have floating-point extending loads, i1 sign-extending
   // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
-  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+  for (MVT VT : MVT::fp_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
+  }
+  for (MVT VT : MVT::integer_valuetypes())
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
+
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
@@ -531,26 +542,22 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
     // Likewise, narrowing and extending vector loads/stores aren't handled
     // directly.
-    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-
-      setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
-                         Expand);
-
-      setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
-
-      setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
-
-      for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-           InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
-        setTruncStoreAction((MVT::SimpleValueType)VT,
-                            (MVT::SimpleValueType)InnerVT, Expand);
-      setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-      setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    for (MVT VT : MVT::vector_valuetypes()) {
+      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+      setOperationAction(ISD::MULHS, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+      setOperationAction(ISD::BSWAP, VT, Expand);
+
+      for (MVT InnerVT : MVT::vector_valuetypes()) {
+        setTruncStoreAction(VT, InnerVT, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+      }
     }
 
     // AArch64 has implementations of a lot of rounding-like FP operations.
@@ -615,7 +622,8 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
   setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
   setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
   setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
-  setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
+  for (MVT InnerVT : MVT::all_valuetypes())
+    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);
 
   // CNT supports only B element sizes.
   if (VT != MVT::v8i8 && VT != MVT::v16i8)
@@ -722,13 +730,6 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
   return MVT::i64;
 }
 
-unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
-  // FIXME: On AArch64, this depends on the type.
-  // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes().
-  // and the offset has to be a multiple of the related size in bytes.
-  return 4095;
-}
-
 FastISel *
 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                       const TargetLibraryInfo *libInfo) const {
@@ -869,9 +870,8 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
   // EndBB:
   //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
 
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   DebugLoc DL = MI->getDebugLoc();
   MachineFunction::iterator It = MBB;
@@ -1330,10 +1330,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
 
 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
                                              RTLIB::Libcall Call) const {
-  SmallVector<SDValue, 2> Ops;
-  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
-    Ops.push_back(Op.getOperand(i));
-
+  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
   return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
                      SDLoc(Op)).first;
 }
@@ -1561,10 +1558,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
   else
     LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
 
-  SmallVector<SDValue, 2> Ops;
-  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
-    Ops.push_back(Op.getOperand(i));
-
+  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
   return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
                      SDLoc(Op)).first;
 }
@@ -1981,6 +1975,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
     llvm_unreachable("Unsupported calling convention.");
   case CallingConv::WebKit_JS:
     return CC_AArch64_WebKit_JS;
+  case CallingConv::GHC:
+    return CC_AArch64_GHC;
   case CallingConv::C:
   case CallingConv::Fast:
     if (!Subtarget->isTargetDarwin())
@@ -2012,18 +2008,19 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   unsigned CurArgIdx = 0;
   for (unsigned i = 0; i != NumArgs; ++i) {
     MVT ValVT = Ins[i].VT;
-    std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[i].OrigArgIndex;
-
-    // Get type of the original argument.
-    EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
-    MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
-    // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
-    if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
-      ValVT = MVT::i8;
-    else if (ActualMVT == MVT::i16)
-      ValVT = MVT::i16;
+    if (Ins[i].isOrigArg()) {
+      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[i].getOrigArgIndex();
 
+      // Get type of the original argument.
+      EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
+      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
+      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+        ValVT = MVT::i8;
+      else if (ActualMVT == MVT::i16)
+        ValVT = MVT::i16;
+    }
     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
     bool Res =
         AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
@@ -2106,7 +2103,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
 
       uint32_t BEAlign = 0;
-      if (ArgSize < 8 && !Subtarget->isLittleEndian())
+      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
+          !Ins[i].Flags.isInConsecutiveRegs())
         BEAlign = 8 - ArgSize;
 
       int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
@@ -2198,8 +2196,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
                                           AArch64::X3, AArch64::X4, AArch64::X5,
                                           AArch64::X6, AArch64::X7 };
   static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
-  unsigned FirstVariadicGPR =
-      CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
+  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
 
   unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
   int GPRIdx = 0;
@@ -2227,8 +2224,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
         AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
         AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
     static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
-    unsigned FirstVariadicFPR =
-        CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
+    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
 
     unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
     int FPRIdx = 0;
@@ -2349,7 +2345,9 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   // cannot rely on the linker replacing the tail call with a return.
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     const GlobalValue *GV = G->getGlobal();
-    if (GV->hasExternalWeakLinkage())
+    const Triple TT(getTargetMachine().getTargetTriple());
+    if (GV->hasExternalWeakLinkage() &&
+        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
       return false;
   }
 
@@ -2660,7 +2658,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
       unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
                                         : VA.getValVT().getSizeInBits();
       OpSize = (OpSize + 7) / 8;
-      if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
+      if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
+          !Flags.isInConsecutiveRegs()) {
         if (OpSize < 8)
           BEAlign = 8 - OpSize;
       }
@@ -2782,19 +2781,16 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Add a register mask operand representing the call-preserved registers.
   const uint32_t *Mask;
-  const TargetRegisterInfo *TRI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
-  const AArch64RegisterInfo *ARI =
-      static_cast<const AArch64RegisterInfo *>(TRI);
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
   if (IsThisReturn) {
     // For 'this' returns, use the X0-preserving mask if applicable
-    Mask = ARI->getThisReturnPreservedMask(CallConv);
+    Mask = TRI->getThisReturnPreservedMask(CallConv);
     if (!Mask) {
       IsThisReturn = false;
-      Mask = ARI->getCallPreservedMask(CallConv);
+      Mask = TRI->getCallPreservedMask(CallConv);
     }
   } else
-    Mask = ARI->getCallPreservedMask(CallConv);
+    Mask = TRI->getCallPreservedMask(CallConv);
 
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3014,11 +3010,8 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
   // TLS calls preserve all registers except those that absolutely must be
   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
   // silly).
-  const TargetRegisterInfo *TRI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
-  const AArch64RegisterInfo *ARI =
-      static_cast<const AArch64RegisterInfo *>(TRI);
-  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
+  const uint32_t *Mask =
+      Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
 
   // Finally, we can make the call. This is just a degenerate version of a
   // normal AArch64 call node: x0 takes the address of the descriptor, and
@@ -3065,11 +3058,8 @@ SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
   // TLS calls preserve all registers except those that absolutely must be
   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
   // silly).
-  const TargetRegisterInfo *TRI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
-  const AArch64RegisterInfo *ARI =
-      static_cast<const AArch64RegisterInfo *>(TRI);
-  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
+  const uint32_t *Mask =
+      Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
 
   // The function takes only one argument: the address of the descriptor itself
   // in X0.
@@ -3259,8 +3249,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
       OFCC = getInvertedCondCode(OFCC);
     SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
 
-    return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
-                       CCVal, Overflow);
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+                       Overflow);
   }
 
   if (LHS.getValueType().isInteger()) {
@@ -3429,8 +3419,8 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
 }
 
 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
-  if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
-          AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
+  if (DAG.getMachineFunction().getFunction()->hasFnAttribute(
+          Attribute::NoImplicitFloat))
     return SDValue();
 
   if (!Subtarget->hasNEON())
@@ -3447,18 +3437,12 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
   SDValue Val = Op.getOperand(0);
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
-  SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);
 
-  SDValue VecVal;
-  if (VT == MVT::i32) {
-    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
-    VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec,
-                                       VecVal);
-  } else {
-    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
-  }
+  if (VT == MVT::i32)
+    Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
+  Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
 
-  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
+  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
   SDValue UaddLV = DAG.getNode(
       ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
       DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop);
@@ -4279,7 +4263,8 @@ AArch64TargetLowering::getSingleConstraintMatchWeight(
 
 std::pair<unsigned, const TargetRegisterClass *>
 AArch64TargetLowering::getRegForInlineAsmConstraint(
-    const std::string &Constraint, MVT VT) const {
+    const TargetRegisterInfo *TRI, const std::string &Constraint,
+    MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     case 'r':
@@ -4308,7 +4293,7 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
   // Use the default implementation in TargetLowering to convert the register
   // constraint into a member of a register class.
   std::pair<unsigned, const TargetRegisterClass *> Res;
-  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 
   // Not found as a standard register?
   if (!Res.second) {
@@ -4615,19 +4600,21 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
       // The extraction can just take the second half
       Src.ShuffleVec =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
-                      DAG.getIntPtrConstant(NumSrcElts));
+                      DAG.getConstant(NumSrcElts, MVT::i64));
       Src.WindowBase = -NumSrcElts;
     } else if (Src.MaxElt < NumSrcElts) {
       // The extraction can just take the first half
-      Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
-                                   Src.ShuffleVec, DAG.getIntPtrConstant(0));
+      Src.ShuffleVec =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(0, MVT::i64));
     } else {
       // An actual VEXT is needed
-      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
-                                     Src.ShuffleVec, DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc1 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(0, MVT::i64));
       SDValue VEXTSrc2 =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
-                      DAG.getIntPtrConstant(NumSrcElts));
+                      DAG.getConstant(NumSrcElts, MVT::i64));
       unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
 
       Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
@@ -6270,6 +6257,8 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                     SDLoc dl, SelectionDAG &DAG) {
   EVT SrcVT = LHS.getValueType();
+  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
+         "function only supposed to emit natural comparisons");
 
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
   APInt CnstBits(VT.getSizeInBits(), 0);
@@ -6364,13 +6353,15 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
+  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
   SDLoc dl(Op);
 
   if (LHS.getValueType().getVectorElementType().isInteger()) {
     assert(LHS.getValueType() == RHS.getValueType());
     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
-    return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(),
-                                dl, DAG);
+    SDValue Cmp =
+        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
+    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
   }
 
   assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
@@ -6384,19 +6375,21 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
 
   bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
   SDValue Cmp =
-      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
+      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
   if (!Cmp.getNode())
     return SDValue();
 
   if (CC2 != AArch64CC::AL) {
     SDValue Cmp2 =
-        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
+        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
     if (!Cmp2.getNode())
       return SDValue();
 
-    Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2);
+    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
   }
 
+  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
+
   if (ShouldInvert)
     return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
 
@@ -6534,6 +6527,34 @@ bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   return NumBits1 > NumBits2;
 }
 
+/// Check if it is profitable to hoist instruction in then/else to if.
+/// Not profitable if I and it's user can form a FMA instruction
+/// because we prefer FMSUB/FMADD.
+bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
+  if (I->getOpcode() != Instruction::FMul)
+    return true;
+
+  if (I->getNumUses() != 1)
+    return true;
+
+  Instruction *User = I->user_back();
+
+  if (User &&
+      !(User->getOpcode() == Instruction::FSub ||
+        User->getOpcode() == Instruction::FAdd))
+    return true;
+
+  const TargetOptions &Options = getTargetMachine().Options;
+  EVT VT = getValueType(User->getOperand(0)->getType());
+
+  if (isFMAFasterThanFMulAndFAdd(VT) &&
+      isOperationLegalOrCustom(ISD::FMA, VT) &&
+      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath))
+    return false;
+
+  return true;
+}
+
 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
 // 64-bit GPR.
 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
@@ -6604,8 +6625,7 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
   bool Fast;
   const Function *F = MF.getFunction();
   if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
-      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat) &&
+      !F->hasFnAttribute(Attribute::NoImplicitFloat) &&
       (memOpAlign(SrcAlign, DstAlign, 16) ||
        (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
     return MVT::f128;
@@ -6948,7 +6968,8 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
   return SDValue();
 }
 
-static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
   // First try to optimize away the conversion when it's conditionally from
   // a constant. Vectors only.
   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
@@ -6967,7 +6988,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
   // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
   // This eliminates an "integer-to-vector-move UOP and improve throughput.
   SDValue N0 = N->getOperand(0);
-  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+  if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
       // Do not change the width of a volatile load.
       !cast<LoadSDNode>(N0)->isVolatile()) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
@@ -7756,9 +7777,9 @@ static SDValue performExtendCombine(SDNode *N,
   EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
                                LoVT.getVectorNumElements());
   Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
-                   DAG.getIntPtrConstant(0));
+                   DAG.getConstant(0, MVT::i64));
   Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
-                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+                   DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64));
   Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
   Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
 
@@ -7839,14 +7860,13 @@ static SDValue performSTORECombine(SDNode *N,
     return SDValue();
 
   // Cyclone has bad performance on unaligned 16B stores when crossing line and
-  // page boundries. We want to split such stores.
+  // page boundaries. We want to split such stores.
   if (!Subtarget->isCyclone())
     return SDValue();
 
   // Don't split at Oz.
   MachineFunction &MF = DAG.getMachineFunction();
-  bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
-      AttributeSet::FunctionIndex, Attribute::MinSize);
+  bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
   if (IsMinSize)
     return SDValue();
 
@@ -7880,9 +7900,9 @@ static SDValue performSTORECombine(SDNode *N,
   EVT HalfVT =
       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
   SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
-                                   DAG.getIntPtrConstant(0));
+                                   DAG.getConstant(0, MVT::i64));
   SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
-                                   DAG.getIntPtrConstant(NumElts));
+                                   DAG.getConstant(NumElts, MVT::i64));
   SDValue BasePtr = S->getBasePtr();
   SDValue NewST1 =
       DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
@@ -7973,7 +7993,7 @@ static SDValue performPostLD1Combine(SDNode *N,
                                            LoadSDN->getMemOperand());
 
     // Update the uses.
-    std::vector<SDValue> NewResults;
+    SmallVector<SDValue, 2> NewResults;
     NewResults.push_back(SDValue(LD, 0));             // The result of load
     NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
     DCI.CombineTo(LD, NewResults);
@@ -8478,6 +8498,12 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
   // largest real NEON comparison is 64-bits per lane, which means the result is
   // at most 32-bits and an illegal vector. Just bail out for now.
   EVT SrcVT = N0.getOperand(0).getValueType();
+
+  // Don't try to do this optimization when the setcc itself has i1 operands.
+  // There are no legal vectors of i1, so this would be pointless.
+  if (SrcVT == MVT::i1)
+    return SDValue();
+
   int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
   if (!ResVT.isVector() || NumMaskElts == 0)
     return SDValue();
@@ -8518,7 +8544,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performMulCombine(N, DAG, DCI, Subtarget);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
-    return performIntToFpCombine(N, DAG);
+    return performIntToFpCombine(N, DAG, Subtarget);
   case ISD::OR:
     return performORCombine(N, DCI, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
@@ -8696,13 +8722,12 @@ bool AArch64TargetLowering::getPostIndexedAddressParts(
 
 static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                   SelectionDAG &DAG) {
-  if (N->getValueType(0) != MVT::i16)
-    return;
-
   SDLoc DL(N);
   SDValue Op = N->getOperand(0);
-  assert(Op.getValueType() == MVT::f16 &&
-         "Inconsistent bitcast? Only 16-bit types should be i16 or f16");
+
+  if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
+    return;
+
   Op = SDValue(
       DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
                          DAG.getUNDEF(MVT::i32), Op,
@@ -8732,6 +8757,12 @@ bool AArch64TargetLowering::useLoadStackGuardNode() const {
   return true;
 }
 
+bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+  // reciprocal if there are three or more FDIVs.
+  return NumUsers > 2;
+}
+
 TargetLoweringBase::LegalizeTypeAction
 AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
   MVT SVT = VT.getSimpleVT();
@@ -8836,3 +8867,8 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                 Val, Stxr->getFunctionType()->getParamType(0)),
       Addr);
 }
+
+bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
+    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+  return Ty->isArrayTy();
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 2f5708d..e973364 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/Target/TargetLowering.h"
 
 namespace llvm {
@@ -207,7 +208,8 @@ class AArch64TargetLowering : public TargetLowering {
   bool RequireStrictAlign;
 
 public:
-  explicit AArch64TargetLowering(const TargetMachine &TM);
+  explicit AArch64TargetLowering(const TargetMachine &TM,
+                                 const AArch64Subtarget &STI);
 
   /// Selects the correct CCAssignFn for a given CallingConvention value.
   CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
@@ -222,7 +224,7 @@ public:
   MVT getScalarShiftAmountTy(EVT LHSTy) const override;
 
   /// allowsMisalignedMemoryAccesses - Returns true if the target allows
-  /// unaligned memory accesses. of the specified type.
+  /// unaligned memory accesses of the specified type.
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
                                       unsigned Align = 1,
                                       bool *Fast = nullptr) const override {
@@ -244,10 +246,6 @@ public:
   /// getFunctionAlignment - Return the Log2 alignment of this function.
   unsigned getFunctionAlignment(const Function *F) const;
 
-  /// getMaximalGlobalOffset - Returns the maximal possible offset which can
-  /// be used for loads / stores from the global.
-  unsigned getMaximalGlobalOffset() const override;
-
   /// Returns true if a cast between SrcAS and DestAS is a noop.
   bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
     // Addrspacecasts are always noops.
@@ -285,6 +283,8 @@ public:
   bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
   bool isTruncateFree(EVT VT1, EVT VT2) const override;
 
+  bool isProfitableToHoist(Instruction *I) const override;
+
   bool isZExtFree(Type *Ty1, Type *Ty2) const override;
   bool isZExtFree(EVT VT1, EVT VT2) const override;
   bool isZExtFree(SDValue Val, EVT VT2) const override;
@@ -440,6 +440,7 @@ private:
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         std::vector<SDNode *> *Created) const override;
+  bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
 
   ConstraintType
   getConstraintType(const std::string &Constraint) const override;
@@ -452,7 +453,8 @@ private:
                                  const char *constraint) const override;
 
   std::pair<unsigned, const TargetRegisterClass *>
-  getRegForInlineAsmConstraint(const std::string &Constraint,
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               const std::string &Constraint,
                                MVT VT) const override;
   void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                                     std::vector<SDValue> &Ops,
@@ -472,6 +474,10 @@ private:
 
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
+
+  bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
+                                                 CallingConv::ID CallConv,
+                                                 bool isVarArg) const override;
 };
 
 namespace AArch64 {
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 2b0f5d2..d295c02 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -4383,7 +4383,7 @@ class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
 }
 
 multiclass SIMDVectorLShiftLongBySizeBHS {
-  let neverHasSideEffects = 1 in {
+  let hasSideEffects = 0 in {
   def v8i8  : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
                                              "shll", ".8h",  ".8b", "8">;
   def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 2dbb31c..64cec55 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -12,9 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64InstrInfo.h"
+#include "AArch64MachineCombinerPattern.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
-#include "AArch64MachineCombinerPattern.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
@@ -707,9 +707,8 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) {
   assert(MBB && "Can't get MachineBasicBlock here");
   MachineFunction *MF = MBB->getParent();
   assert(MF && "Can't get MachineFunction here");
-  const TargetMachine *TM = &MF->getTarget();
-  const TargetInstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
-  const TargetRegisterInfo *TRI = TM->getSubtargetImpl()->getRegisterInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
   MachineRegisterInfo *MRI = &MF->getRegInfo();
 
   for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 30bf650..d8f1274 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -16,8 +16,8 @@
 
 #include "AArch64.h"
 #include "AArch64RegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/CodeGen/MachineCombinerPattern.h"
+#include "llvm/Target/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "AArch64GenInstrInfo.inc"
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 252ed40..6e4c0b0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -481,6 +481,24 @@ def trunc_imm : SDNodeXForm<imm, [{
 def : Pat<(i64 i64imm_32bit:$src),
           (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
 
+// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).
+def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+  N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32);
+}]>;
+
+def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+  N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64);
+}]>;
+
+
+def : Pat<(f32 fpimm:$in),
+  (COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>;
+def : Pat<(f64 fpimm:$in),
+  (COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>;
+
+
 // Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
 // sequences.
 def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
@@ -639,6 +657,10 @@ def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
           (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
 def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
           (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)),
+          (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
+          (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
 } // AddedComplexity = 7
 
 let AddedComplexity = 5 in {
@@ -789,7 +811,7 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
 //===----------------------------------------------------------------------===//
 // Bitfield immediate extraction instruction.
 //===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 defm EXTR : ExtractImm<"extr">;
 def : InstAlias<"ror $dst, $src, $shift",
             (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
@@ -804,7 +826,7 @@ def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
 //===----------------------------------------------------------------------===//
 // Other bitfield immediate instructions.
 //===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 defm BFM  : BitfieldImmWith2RegArgs<0b01, "bfm">;
 defm SBFM : BitfieldImm<0b00, "sbfm">;
 defm UBFM : BitfieldImm<0b10, "ubfm">;
@@ -977,9 +999,9 @@ def : InstAlias<"cneg $dst, $src, $cc",
 // PC-relative instructions.
 //===----------------------------------------------------------------------===//
 let isReMaterializable = 1 in {
-let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in {
+let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
 def ADR  : ADRI<0, "adr", adrlabel, []>;
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
 
 def ADRP : ADRI<1, "adrp", adrplabel,
                 [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
@@ -1867,6 +1889,33 @@ let Predicates = [IsLE] in {
 }
 } // AddedComplexity = 10
 
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
+                              ValueType VecTy, ValueType STy,
+                              SubRegIndex SubRegIdx,
+                              Instruction STRW, Instruction STRX> {
+
+  def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
+                     (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+            (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+                  GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
+                     (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+            (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+                  GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 19 in {
+  defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
+  defm : VecROStoreLane0Pat<ro16,      store   , v8i16, i16, hsub, STRHroW, STRHroX>;
+  defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>;
+  defm : VecROStoreLane0Pat<ro32,      store   , v4i32, i32, ssub, STRSroW, STRSroX>;
+  defm : VecROStoreLane0Pat<ro32,      store   , v4f32, f32, ssub, STRSroW, STRSroX>;
+  defm : VecROStoreLane0Pat<ro64,      store   , v2i64, i64, dsub, STRDroW, STRDroX>;
+  defm : VecROStoreLane0Pat<ro64,      store   , v2f64, f64, dsub, STRDroW, STRDroX>;
+}
+
 //---
 // (unsigned immediate)
 defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",
@@ -3667,29 +3716,21 @@ defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi32lane>;
 
 
 // Floating point vector extractions are codegen'd as either a sequence of
-// subregister extractions, possibly fed by an INS if the lane number is
-// anything other than zero.
+// subregister extractions, or a MOV (aka CPY here, alias for DUP) if
+// the lane number is anything other than zero.
 def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
           (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
 def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
           (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
 def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
           (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
+
 def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
-          (f64 (EXTRACT_SUBREG
-            (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
-                         V128:$Rn, VectorIndexD:$idx),
-            dsub))>;
+          (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>;
 def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
-          (f32 (EXTRACT_SUBREG
-            (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
-                         V128:$Rn, VectorIndexS:$idx),
-            ssub))>;
+          (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>;
 def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
-          (f16 (EXTRACT_SUBREG
-            (INSvi16lane (v8f16 (IMPLICIT_DEF)), 0,
-                         V128:$Rn, VectorIndexH:$idx),
-            hsub))>;
+          (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
 
 // All concat_vectors operations are canonicalised to act on i64 vectors for
 // AArch64. In the general case we need an instruction, which had just as well be
@@ -4124,7 +4165,7 @@ def MVNIv4s_msl   : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
 // AdvSIMD indexed element
 //----------------------------------------------------------------------------
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   defm FMLA  : SIMDFPIndexedSDTied<0, 0b0001, "fmla">;
   defm FMLS  : SIMDFPIndexedSDTied<0, 0b0101, "fmls">;
 }
@@ -4678,7 +4719,7 @@ defm LD1R          : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
 defm LD2R          : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
 defm LD3R          : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
 defm LD4R          : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0 in {
 defm LD1 : SIMDLdSingleBTied<0, 0b000,       "ld1", VecListOneb,   GPR64pi1>;
 defm LD1 : SIMDLdSingleHTied<0, 0b010, 0,    "ld1", VecListOneh,   GPR64pi2>;
 defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes,   GPR64pi4>;
@@ -4768,7 +4809,7 @@ defm ST1 : SIMDStSingleH<0, 0b010, 0,    "st1", VecListOneh, GPR64pi2>;
 defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
 defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
 
-let AddedComplexity = 15 in
+let AddedComplexity = 19 in
 class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
                     ValueType VTy, ValueType STy, Instruction ST1>
   : Pat<(scalar_store
@@ -4784,7 +4825,7 @@ def : St1Lane128Pat<store,         VectorIndexD, v2i64, i64, ST1i64>;
 def : St1Lane128Pat<store,         VectorIndexD, v2f64, f64, ST1i64>;
 def : St1Lane128Pat<store,         VectorIndexH, v8f16, f16, ST1i16>;
 
-let AddedComplexity = 15 in
+let AddedComplexity = 19 in
 class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
                    ValueType VTy, ValueType STy, Instruction ST1>
   : Pat<(scalar_store
@@ -4848,7 +4889,7 @@ defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
 defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
 defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;
 
-let mayStore = 1, neverHasSideEffects = 1 in {
+let mayStore = 1, hasSideEffects = 0 in {
 defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob,   GPR64pi2>;
 defm ST2 : SIMDStSingleH<1, 0b010, 0,    "st2", VecListTwoh,   GPR64pi4>;
 defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos,   GPR64pi8>;
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 8157981..8463ce6 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -135,6 +135,8 @@ static bool isUnscaledLdst(unsigned Opc) {
     return true;
   case AArch64::LDURXi:
     return true;
+  case AArch64::LDURSWi:
+    return true;
   }
 }
 
@@ -173,6 +175,9 @@ int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
   case AArch64::LDRXui:
   case AArch64::LDURXi:
     return 8;
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+    return 4;
   }
 }
 
@@ -210,6 +215,9 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
   case AArch64::LDRXui:
   case AArch64::LDURXi:
     return AArch64::LDPXi;
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+    return AArch64::LDPSWi;
   }
 }
 
@@ -237,6 +245,8 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
     return AArch64::LDRWpre;
   case AArch64::LDRXui:
     return AArch64::LDRXpre;
+  case AArch64::LDRSWui:
+    return AArch64::LDRSWpre;
   }
 }
 
@@ -264,6 +274,8 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
     return AArch64::LDRWpost;
   case AArch64::LDRXui:
     return AArch64::LDRXpost;
+  case AArch64::LDRSWui:
+    return AArch64::LDRSWpost;
   }
 }
 
@@ -780,6 +792,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
     case AArch64::LDRQui:
     case AArch64::LDRXui:
     case AArch64::LDRWui:
+    case AArch64::LDRSWui:
     // do the unscaled versions as well
     case AArch64::STURSi:
     case AArch64::STURDi:
@@ -790,7 +803,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
     case AArch64::LDURDi:
     case AArch64::LDURQi:
     case AArch64::LDURWi:
-    case AArch64::LDURXi: {
+    case AArch64::LDURXi:
+    case AArch64::LDURSWi: {
       // If this is a volatile load/store, don't mess with it.
       if (MI->hasOrderedMemoryRef()) {
         ++MBBI;
@@ -931,10 +945,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
 }
 
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
-  const TargetMachine &TM = Fn.getTarget();
-  TII = static_cast<const AArch64InstrInfo *>(
-      TM.getSubtargetImpl()->getInstrInfo());
-  TRI = TM.getSubtargetImpl()->getRegisterInfo();
+  TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
+  TRI = Fn.getSubtarget().getRegisterInfo();
 
   bool Modified = false;
   for (auto &MBB : Fn)
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index f942c4e..4690177 100644
--- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -235,7 +235,7 @@ bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd,
           costs[i + 1][j + 1] = sameParityMax + 1.0;
     }
   }
-  G.setEdgeCosts(edge, std::move(costs));
+  G.updateEdgeCosts(edge, std::move(costs));
 
   return true;
 }
@@ -312,7 +312,7 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
               costs[i + 1][j + 1] = sameParityMax + 1.0;
         }
       }
-      G.setEdgeCosts(edge, std::move(costs));
+      G.updateEdgeCosts(edge, std::move(costs));
     }
   }
 }
@@ -328,7 +328,7 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) {
   const MachineFunction &MF = G.getMetadata().MF;
   LiveIntervals &LIs = G.getMetadata().LIS;
 
-  TRI = MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   DEBUG(MF.dump());
 
   for (const auto &MBB: MF) {
diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
index 16c33b7..c037c86 100644
--- a/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -22,7 +22,7 @@
 
 #include "AArch64.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/Constants.h"
@@ -31,6 +31,7 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
@@ -112,44 +113,42 @@ private:
     AU.addPreserved<DominatorTreeWrapperPass>();
   }
 
-  /// Type to store a list of User.
-  typedef SmallVector<Value::user_iterator, 4> Users;
+  /// Type to store a list of Uses.
+  typedef SmallVector<Use *, 4> Uses;
   /// Map an insertion point to all the uses it dominates.
-  typedef DenseMap<Instruction *, Users> InsertionPoints;
+  typedef DenseMap<Instruction *, Uses> InsertionPoints;
   /// Map a function to the required insertion point of load for a
   /// global variable.
   typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
 
   /// Find the closest point that dominates the given Use.
-  Instruction *findInsertionPoint(Value::user_iterator &Use);
+  Instruction *findInsertionPoint(Use &Use);
 
   /// Check if the given insertion point is dominated by an existing
   /// insertion point.
   /// If true, the given use is added to the list of dominated uses for
   /// the related existing point.
   /// \param NewPt the insertion point to be checked
-  /// \param UseIt the use to be added into the list of dominated uses
+  /// \param Use the use to be added into the list of dominated uses
   /// \param InsertPts existing insertion points
   /// \pre NewPt and all instruction in InsertPts belong to the same function
   /// \return true if one of the insertion point in InsertPts dominates NewPt,
   ///         false otherwise
-  bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt,
-                   InsertionPoints &InsertPts);
+  bool isDominated(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts);
 
   /// Check if the given insertion point can be merged with an existing
   /// insertion point in a common dominator.
   /// If true, the given use is added to the list of the created insertion
   /// point.
   /// \param NewPt the insertion point to be checked
-  /// \param UseIt the use to be added into the list of dominated uses
+  /// \param Use the use to be added into the list of dominated uses
   /// \param InsertPts existing insertion points
   /// \pre NewPt and all instruction in InsertPts belong to the same function
   /// \pre isDominated returns false for the exact same parameters.
   /// \return true if it exists an insertion point in InsertPts that could
   ///         have been merged with NewPt in a common dominator,
   ///         false otherwise
-  bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt,
-                   InsertionPoints &InsertPts);
+  bool tryAndMerge(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts);
 
   /// Compute the minimal insertion points to dominates all the interesting
   /// uses of value.
@@ -182,21 +181,19 @@ private:
   bool promoteConstant(Constant *Cst);
 
   /// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
-  /// Append UseIt to this list and delete the entry of IPI in InsertPts.
-  static void appendAndTransferDominatedUses(Instruction *NewPt,
-                                             Value::user_iterator &UseIt,
+  /// Append Use to this list and delete the entry of IPI in InsertPts.
+  static void appendAndTransferDominatedUses(Instruction *NewPt, Use &Use,
                                              InsertionPoints::iterator &IPI,
                                              InsertionPoints &InsertPts) {
     // Record the dominated use.
-    IPI->second.push_back(UseIt);
+    IPI->second.push_back(&Use);
     // Transfer the dominated uses of IPI to NewPt
     // Inserting into the DenseMap may invalidate existing iterator.
     // Keep a copy of the key to find the iterator to erase.
     Instruction *OldInstr = IPI->first;
-    InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
+    InsertPts[NewPt] = std::move(IPI->second);
     // Erase IPI.
-    IPI = InsertPts.find(OldInstr);
-    InsertPts.erase(IPI);
+    InsertPts.erase(OldInstr);
   }
 };
 } // end anonymous namespace
@@ -328,23 +325,18 @@ static bool shouldConvert(const Constant *Cst) {
   return isConstantUsingVectorTy(Cst->getType());
 }
 
-Instruction *
-AArch64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
+Instruction *AArch64PromoteConstant::findInsertionPoint(Use &Use) {
+  Instruction *User = cast<Instruction>(Use.getUser());
+
   // If this user is a phi, the insertion point is in the related
   // incoming basic block.
-  PHINode *PhiInst = dyn_cast<PHINode>(*Use);
-  Instruction *InsertionPoint;
-  if (PhiInst)
-    InsertionPoint =
-        PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
-  else
-    InsertionPoint = dyn_cast<Instruction>(*Use);
-  assert(InsertionPoint && "User is not an instruction!");
-  return InsertionPoint;
+  if (PHINode *PhiInst = dyn_cast<PHINode>(User))
+    return PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
+
+  return User;
 }
 
-bool AArch64PromoteConstant::isDominated(Instruction *NewPt,
-                                         Value::user_iterator &UseIt,
+bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use,
                                          InsertionPoints &InsertPts) {
 
   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
@@ -363,15 +355,14 @@ bool AArch64PromoteConstant::isDominated(Instruction *NewPt,
       DEBUG(dbgs() << "Insertion point dominated by:\n");
       DEBUG(IPI.first->print(dbgs()));
       DEBUG(dbgs() << '\n');
-      IPI.second.push_back(UseIt);
+      IPI.second.push_back(&Use);
       return true;
     }
   }
   return false;
 }
 
-bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt,
-                                         Value::user_iterator &UseIt,
+bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use,
                                          InsertionPoints &InsertPts) {
   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
       *NewPt->getParent()->getParent()).getDomTree();
@@ -391,7 +382,7 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt,
       DEBUG(dbgs() << "Merge insertion point with:\n");
       DEBUG(IPI->first->print(dbgs()));
       DEBUG(dbgs() << "\nat considered insertion point.\n");
-      appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+      appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts);
       return true;
     }
 
@@ -415,7 +406,7 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt,
     DEBUG(dbgs() << '\n');
     DEBUG(NewPt->print(dbgs()));
     DEBUG(dbgs() << '\n');
-    appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+    appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts);
     return true;
   }
   return false;
@@ -424,22 +415,22 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt,
 void AArch64PromoteConstant::computeInsertionPoints(
     Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) {
   DEBUG(dbgs() << "** Compute insertion points **\n");
-  for (Value::user_iterator UseIt = Val->user_begin(),
-                            EndUseIt = Val->user_end();
-       UseIt != EndUseIt; ++UseIt) {
+  for (Use &Use : Val->uses()) {
+    Instruction *User = dyn_cast<Instruction>(Use.getUser());
+
     // If the user is not an Instruction, we cannot modify it.
-    if (!isa<Instruction>(*UseIt))
+    if (!User)
       continue;
 
     // Filter out uses that should not be converted.
-    if (!shouldConvertUse(Val, cast<Instruction>(*UseIt), UseIt.getOperandNo()))
+    if (!shouldConvertUse(Val, User, Use.getOperandNo()))
       continue;
 
-    DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n");
-    DEBUG((*UseIt)->print(dbgs()));
+    DEBUG(dbgs() << "Considered use, opidx " << Use.getOperandNo() << ":\n");
+    DEBUG(User->print(dbgs()));
     DEBUG(dbgs() << '\n');
 
-    Instruction *InsertionPoint = findInsertionPoint(UseIt);
+    Instruction *InsertionPoint = findInsertionPoint(Use);
 
     DEBUG(dbgs() << "Considered insertion point:\n");
     DEBUG(InsertionPoint->print(dbgs()));
@@ -449,17 +440,17 @@ void AArch64PromoteConstant::computeInsertionPoints(
     // by another one.
     InsertionPoints &InsertPts =
         InsPtsPerFunc[InsertionPoint->getParent()->getParent()];
-    if (isDominated(InsertionPoint, UseIt, InsertPts))
+    if (isDominated(InsertionPoint, Use, InsertPts))
       continue;
     // This insertion point is useful, check if we can merge some insertion
     // point in a common dominator or if NewPt dominates an existing one.
-    if (tryAndMerge(InsertionPoint, UseIt, InsertPts))
+    if (tryAndMerge(InsertionPoint, Use, InsertPts))
       continue;
 
     DEBUG(dbgs() << "Keep considered insertion point\n");
 
     // It is definitely useful by its own
-    InsertPts[InsertionPoint].push_back(UseIt);
+    InsertPts[InsertionPoint].push_back(&Use);
   }
 }
 
@@ -470,41 +461,32 @@ bool AArch64PromoteConstant::insertDefinitions(
   bool HasChanged = false;
 
   // Traverse all insertion points in all the function.
-  for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(),
-                                        EndIt = InsPtsPerFunc.end();
-       FctToInstPtsIt != EndIt; ++FctToInstPtsIt) {
-    InsertionPoints &InsertPts = FctToInstPtsIt->second;
+  for (const auto &FctToInstPtsIt : InsPtsPerFunc) {
+    const InsertionPoints &InsertPts = FctToInstPtsIt.second;
 // Do more checking for debug purposes.
 #ifndef NDEBUG
     DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
-        *FctToInstPtsIt->first).getDomTree();
+                            *FctToInstPtsIt.first).getDomTree();
 #endif
-    GlobalVariable *PromotedGV;
     assert(!InsertPts.empty() && "Empty uses does not need a definition");
 
-    Module *M = FctToInstPtsIt->first->getParent();
-    DenseMap<Module *, GlobalVariable *>::iterator MapIt =
-        ModuleToMergedGV.find(M);
-    if (MapIt == ModuleToMergedGV.end()) {
+    Module *M = FctToInstPtsIt.first->getParent();
+    GlobalVariable *&PromotedGV = ModuleToMergedGV[M];
+    if (!PromotedGV) {
       PromotedGV = new GlobalVariable(
           *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr,
           "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
       PromotedGV->setInitializer(Cst);
-      ModuleToMergedGV[M] = PromotedGV;
       DEBUG(dbgs() << "Global replacement: ");
       DEBUG(PromotedGV->print(dbgs()));
       DEBUG(dbgs() << '\n');
       ++NumPromoted;
       HasChanged = true;
-    } else {
-      PromotedGV = MapIt->second;
     }
 
-    for (InsertionPoints::iterator IPI = InsertPts.begin(),
-                                   EndIPI = InsertPts.end();
-         IPI != EndIPI; ++IPI) {
+    for (const auto &IPI : InsertPts) {
       // Create the load of the global variable.
-      IRBuilder<> Builder(IPI->first->getParent(), IPI->first);
+      IRBuilder<> Builder(IPI.first->getParent(), IPI.first);
       LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
       DEBUG(dbgs() << "**********\n");
       DEBUG(dbgs() << "New def: ");
@@ -512,18 +494,15 @@ bool AArch64PromoteConstant::insertDefinitions(
       DEBUG(dbgs() << '\n');
 
       // Update the dominated uses.
-      Users &DominatedUsers = IPI->second;
-      for (Value::user_iterator Use : DominatedUsers) {
+      for (Use *Use : IPI.second) {
 #ifndef NDEBUG
-        assert((DT.dominates(LoadedCst, cast<Instruction>(*Use)) ||
-                (isa<PHINode>(*Use) &&
-                 DT.dominates(LoadedCst, findInsertionPoint(Use)))) &&
+        assert(DT.dominates(LoadedCst, findInsertionPoint(*Use)) &&
                "Inserted definition does not dominate all its uses!");
 #endif
-        DEBUG(dbgs() << "Use to update " << Use.getOperandNo() << ":");
-        DEBUG(Use->print(dbgs()));
+        DEBUG(dbgs() << "Use to update " << Use->getOperandNo() << ":");
+        DEBUG(Use->getUser()->print(dbgs()));
         DEBUG(dbgs() << '\n');
-        Use->setOperand(Use.getOperandNo(), LoadedCst);
+        Use->set(LoadedCst);
         ++NumPromotedUses;
       }
     }
@@ -556,22 +535,19 @@ bool AArch64PromoteConstant::runOnFunction(Function &F) {
   // global variable. Create as few loads of this variable as possible and
   // update the uses accordingly.
   bool LocalChange = false;
-  SmallSet<Constant *, 8> AlreadyChecked;
-
-  for (auto &MBB : F) {
-    for (auto &MI : MBB) {
-      // Traverse the operand, looking for constant vectors. Replace them by a
-      // load of a global variable of constant vector type.
-      for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands();
-           OpIdx != EndOpIdx; ++OpIdx) {
-        Constant *Cst = dyn_cast<Constant>(MI.getOperand(OpIdx));
-        // There is no point in promoting global values as they are already
-        // global. Do not promote constant expressions either, as they may
-        // require some code expansion.
-        if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
-            AlreadyChecked.insert(Cst).second)
-          LocalChange |= promoteConstant(Cst);
-      }
+  SmallPtrSet<Constant *, 8> AlreadyChecked;
+
+  for (Instruction &I : inst_range(&F)) {
+    // Traverse the operand, looking for constant vectors. Replace them by a
+    // load of a global variable of constant vector type.
+    for (Value *Op : I.operand_values()) {
+      Constant *Cst = dyn_cast<Constant>(Op);
+      // There is no point in promoting global values as they are already
+      // global. Do not promote constant expressions either, as they may
+      // require some code expansion.
+      if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
+          AlreadyChecked.insert(Cst).second)
+        LocalChange |= promoteConstant(Cst);
     }
   }
   return LocalChange;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index d734d43..206cdbb 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -33,6 +33,10 @@ using namespace llvm;
 #define GET_REGINFO_TARGET_DESC
 #include "AArch64GenRegisterInfo.inc"
 
+static cl::opt<bool>
+ReserveX18("aarch64-reserve-x18", cl::Hidden,
+          cl::desc("Reserve X18, making it unavailable as GPR"));
+
 AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii,
                                          const AArch64Subtarget *sti)
     : AArch64GenRegisterInfo(AArch64::LR), TII(tii), STI(sti) {}
@@ -40,6 +44,10 @@ AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii,
 const MCPhysReg *
 AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getFunction()->getCallingConv() == CallingConv::GHC)
+    // GHC set of callee saved regs is empty as all those regs are
+    // used for passing STG regs around
+    return CSR_AArch64_NoRegs_SaveList;
   if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
     return CSR_AArch64_AllRegs_SaveList;
   else
@@ -48,6 +56,9 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 
 const uint32_t *
 AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  if (CC == CallingConv::GHC)
+    // This is academic becase all GHC calls are (supposed to be) tail calls
+    return CSR_AArch64_NoRegs_RegMask;
   if (CC == CallingConv::AnyReg)
     return CSR_AArch64_AllRegs_RegMask;
   else
@@ -63,7 +74,7 @@ const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
 }
 
 const uint32_t *
-AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
+AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const {
   // This should return a register mask that is the same as that returned by
   // getCallPreservedMask but that additionally preserves the register used for
   // the first i64 argument (which must also be the register used to return a
@@ -71,6 +82,7 @@ AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
   //
   // In case that the calling convention does not use the same register for
   // both, the function should return NULL (does not currently apply)
+  assert(CC != CallingConv::GHC && "should not be GHC calling convention.");
   return CSR_AArch64_AAPCS_ThisReturn_RegMask;
 }
 
@@ -90,7 +102,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     Reserved.set(AArch64::W29);
   }
 
-  if (STI->isTargetDarwin()) {
+  if (STI->isTargetDarwin() || ReserveX18) {
     Reserved.set(AArch64::X18); // Platform register
     Reserved.set(AArch64::W18);
   }
@@ -117,7 +129,7 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
     return true;
   case AArch64::X18:
   case AArch64::W18:
-    return STI->isTargetDarwin();
+    return STI->isTargetDarwin() || ReserveX18;
   case AArch64::FP:
   case AArch64::W29:
     return TFI->hasFP(MF) || STI->isTargetDarwin();
@@ -379,7 +391,7 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   case AArch64::GPR64commonRegClassID:
     return 32 - 1                                      // XZR/SP
            - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP
-           - STI->isTargetDarwin() // X18 reserved as platform register
+           - (STI->isTargetDarwin() || ReserveX18) // X18 reserved as platform register
            - hasBasePointer(MF);   // X19
   case AArch64::FPR8RegClassID:
   case AArch64::FPR16RegClassID:
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 0cfd582..b9c5399 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -28,15 +28,14 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
   // Check to see if there is a specialized entry-point for memory zeroing.
   ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
   ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
+  const AArch64Subtarget &STI =
+      DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
   const char *bzeroEntry =
-      (V && V->isNullValue())
-          ? DAG.getTarget().getSubtarget<AArch64Subtarget>().getBZeroEntry()
-          : nullptr;
+      (V && V->isNullValue()) ? STI.getBZeroEntry() : nullptr;
   // For small size (< 256), it is not beneficial to use bzero
   // instead of memset.
   if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
-    const AArch64TargetLowering &TLI =
-        *DAG.getTarget().getSubtarget<AArch64Subtarget>().getTargetLowering();
+    const AArch64TargetLowering &TLI = *STI.getTargetLowering();
 
     EVT IntPtr = TLI.getPointerTy();
     Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 0c36e8f..85b44a2 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -30,7 +30,6 @@ class AArch64StorePairSuppress : public MachineFunctionPass {
   const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
   const MachineRegisterInfo *MRI;
-  MachineFunction *MF;
   TargetSchedModel SchedModel;
   MachineTraceMetrics *Traces;
   MachineTraceMetrics::Ensemble *MinInstr;
@@ -115,20 +114,16 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
   }
 }
 
-bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
-  MF = &mf;
-  TII =
-      static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo());
-  TRI = MF->getSubtarget().getRegisterInfo();
-  MRI = &MF->getRegInfo();
-  const TargetSubtargetInfo &ST =
-      MF->getTarget().getSubtarget<TargetSubtargetInfo>();
+bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
+  TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+  TRI = ST.getRegisterInfo();
+  MRI = &MF.getRegInfo();
   SchedModel.init(ST.getSchedModel(), &ST, TII);
-
   Traces = &getAnalysis<MachineTraceMetrics>();
   MinInstr = nullptr;
 
-  DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n');
+  DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n');
 
   if (!SchedModel.hasInstrSchedModel()) {
     DEBUG(dbgs() << "  Skipping pass: no machine model present.\n");
@@ -139,7 +134,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
   // precisely determine whether a store pair can be formed. But we do want to
   // filter out most situations where we can't form store pairs to avoid
   // computing trace metrics in those cases.
-  for (auto &MBB : *MF) {
+  for (auto &MBB : MF) {
     bool SuppressSTP = false;
     unsigned PrevBaseReg = 0;
     for (auto &MI : MBB) {
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 47b5d54..c613025 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -48,17 +48,10 @@ AArch64Subtarget::AArch64Subtarget(const std::string &TT,
                                    const TargetMachine &TM, bool LittleEndian)
     : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
       HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false),
-      HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU),
-      TargetTriple(TT),
-      // This nested ternary is horrible, but DL needs to be properly
-      // initialized
-      // before TLInfo is constructed.
-      DL(isTargetMachO()
-             ? "e-m:o-i64:64-i128:128-n32:64-S128"
-             : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128"
-                             : "E-m:e-i64:64-i128:128-n32:64-S128")),
-      FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)),
-      TSInfo(&DL), TLInfo(TM) {}
+      HasZeroCycleRegMove(false), HasZeroCycleZeroing(false),
+      IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(),
+      InstrInfo(initializeSubtargetDependencies(FS)),
+      TSInfo(TM.getDataLayout()), TLInfo(TM, *this) {}
 
 /// ClassifyGlobalReference - Find the target operand flags that describe
 /// how a global value should be referenced for the current subtarget.
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index e2740f1..d418cc5 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -48,13 +48,14 @@ protected:
   // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
   bool HasZeroCycleZeroing;
 
+  bool IsLittle;
+
   /// CPUString - String name of used CPU.
   std::string CPUString;
 
   /// TargetTriple - What processor and OS we're targeting.
   Triple TargetTriple;
 
-  const DataLayout DL;
   AArch64FrameLowering FrameLowering;
   AArch64InstrInfo InstrInfo;
   AArch64SelectionDAGInfo TSInfo;
@@ -82,7 +83,6 @@ public:
     return &TLInfo;
   }
   const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
-  const DataLayout *getDataLayout() const override { return &DL; }
   const AArch64RegisterInfo *getRegisterInfo() const override {
     return &getInstrInfo()->getRegisterInfo();
   }
@@ -100,7 +100,7 @@ public:
   bool hasCrypto() const { return HasCrypto; }
   bool hasCRC() const { return HasCRC; }
 
-  bool isLittleEndian() const { return DL.isLittleEndian(); }
+  bool isLittleEndian() const { return IsLittle; }
 
   bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
   bool isTargetIOS() const { return TargetTriple.isiOS(); }
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index beed8e0..d73d0b3 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -13,10 +13,11 @@
 #include "AArch64.h"
 #include "AArch64TargetMachine.h"
 #include "AArch64TargetObjectFile.h"
+#include "AArch64TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/IR/Function.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetOptions.h"
@@ -112,6 +113,13 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
                                            CodeGenOpt::Level OL,
                                            bool LittleEndian)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      // This nested ternary is horrible, but DL needs to be properly
+      // initialized
+      // before TLInfo is constructed.
+      DL(Triple(TT).isOSBinFormatMachO()
+             ? "e-m:o-i64:64-i128:128-n32:64-S128"
+             : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128"
+                             : "E-m:e-i64:64-i128:128-n32:64-S128")),
       TLOF(createTLOF(Triple(getTargetTriple()))),
       Subtarget(TT, CPU, FS, *this, LittleEndian), isLittle(LittleEndian) {
   initAsmInfo();
@@ -121,11 +129,8 @@ AArch64TargetMachine::~AArch64TargetMachine() {}
 
 const AArch64Subtarget *
 AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
-  AttributeSet FnAttrs = F.getAttributes();
-  Attribute CPUAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
-  Attribute FSAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
 
   std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
                         ? CPUAttr.getValueAsString().str()
@@ -181,19 +186,17 @@ public:
   bool addPreISel() override;
   bool addInstSelector() override;
   bool addILPOpts() override;
-  bool addPreRegAlloc() override;
-  bool addPostRegAlloc() override;
-  bool addPreSched2() override;
-  bool addPreEmitPass() override;
+  void addPreRegAlloc() override;
+  void addPostRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
 };
 } // namespace
 
-void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  // Add first the target-independent BasicTTI pass, then our AArch64 pass. This
-  // allows the AArch64 pass to delegate to the target independent layer when
-  // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(this));
-  PM.add(createAArch64TargetTransformInfoPass(this));
+TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](Function &F) {
+    return TargetTransformInfo(AArch64TTIImpl(this, F));
+  });
 }
 
 TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
@@ -233,8 +236,11 @@ bool AArch64PassConfig::addPreISel() {
   // get a chance to be merged
   if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant)
     addPass(createAArch64PromoteConstantPass());
+  // FIXME: On AArch64, this depends on the type.
+  // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes().
+  // and the offset has to be a multiple of the related size in bytes.
   if (TM->getOptLevel() != CodeGenOpt::None)
-    addPass(createGlobalMergePass(TM));
+    addPass(createGlobalMergePass(TM, 4095));
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createAArch64AddressTypePromotionPass());
 
@@ -246,7 +252,7 @@ bool AArch64PassConfig::addInstSelector() {
 
   // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many
   // references to _TLS_MODULE_BASE_ as possible.
-  if (TM->getSubtarget<AArch64Subtarget>().isTargetELF() &&
+  if (Triple(TM->getTargetTriple()).isOSBinFormatELF() &&
       getOptLevel() != CodeGenOpt::None)
     addPass(createAArch64CleanupLocalDynamicTLSPass());
 
@@ -267,7 +273,7 @@ bool AArch64PassConfig::addILPOpts() {
   return true;
 }
 
-bool AArch64PassConfig::addPreRegAlloc() {
+void AArch64PassConfig::addPreRegAlloc() {
   // Use AdvSIMD scalar instructions whenever profitable.
   if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) {
     addPass(createAArch64AdvSIMDScalar());
@@ -275,10 +281,9 @@ bool AArch64PassConfig::addPreRegAlloc() {
     // be register coaleascer friendly.
     addPass(&PeepholeOptimizerID);
   }
-  return true;
 }
 
-bool AArch64PassConfig::addPostRegAlloc() {
+void AArch64PassConfig::addPostRegAlloc() {
   // Change dead register definitions to refer to the zero register.
   if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
     addPass(createAArch64DeadRegisterDefinitions());
@@ -288,26 +293,23 @@ bool AArch64PassConfig::addPostRegAlloc() {
       usingDefaultRegAlloc())
     // Improve performance for some FP/SIMD code for A57.
     addPass(createAArch64A57FPLoadBalancing());
-  return true;
 }
 
-bool AArch64PassConfig::addPreSched2() {
+void AArch64PassConfig::addPreSched2() {
   // Expand some pseudo instructions to allow proper scheduling.
   addPass(createAArch64ExpandPseudoPass());
   // Use load/store pair instructions when possible.
   if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt)
     addPass(createAArch64LoadStoreOptimizationPass());
-  return true;
 }
 
-bool AArch64PassConfig::addPreEmitPass() {
+void AArch64PassConfig::addPreEmitPass() {
   if (EnableA53Fix835769)
     addPass(createAArch64A53Fix835769());
   // Relax conditional branch instructions if they're otherwise out of
   // range of their destination.
   addPass(createAArch64BranchRelaxation());
   if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
-      TM->getSubtarget<AArch64Subtarget>().isTargetMachO())
+      Triple(TM->getTargetTriple()).isOSBinFormatMachO())
     addPass(createAArch64CollectLOHPass());
-  return true;
 }
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 75c65c5..7143adf 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -23,6 +23,7 @@ namespace llvm {
 
 class AArch64TargetMachine : public LLVMTargetMachine {
 protected:
+  const DataLayout DL;
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
   AArch64Subtarget Subtarget;
   mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap;
@@ -35,6 +36,7 @@ public:
 
   ~AArch64TargetMachine() override;
 
+  const DataLayout *getDataLayout() const override { return &DL; }
   const AArch64Subtarget *getSubtargetImpl() const override {
     return &Subtarget;
   }
@@ -43,8 +45,8 @@ public:
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
-  /// \brief Register AArch64 analysis passes with a pass manager.
-  void addAnalysisPasses(PassManagerBase &PM) override;
+  /// \brief Get the TargetIRAnalysis for this target.
+  TargetIRAnalysis getTargetIRAnalysis() override;
 
   TargetLoweringObjectFile* getObjFileLowering() const override {
     return TLOF.get();
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b1a2914..0646d85 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
+//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,18 +6,11 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-/// \file
-/// This file implements a TargetTransformInfo analysis pass specific to the
-/// AArch64 target machine. It uses the target's detailed information to provide
-/// more precise answers to certain TTI queries, while letting the target
-/// independent and default TTI implementations handle the rest.
-///
-//===----------------------------------------------------------------------===//
 
-#include "AArch64.h"
-#include "AArch64TargetMachine.h"
+#include "AArch64TargetTransformInfo.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
@@ -26,130 +19,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "aarch64tti"
 
-// Declare the pass initialization routine locally as target-specific passes
-// don't have a target-wide initialization entry point, and so we rely on the
-// pass constructor initialization.
-namespace llvm {
-void initializeAArch64TTIPass(PassRegistry &);
-}
-
-namespace {
-
-class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
-  const AArch64TargetMachine *TM;
-  const AArch64Subtarget *ST;
-  const AArch64TargetLowering *TLI;
-
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
-
-public:
-  AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
-    llvm_unreachable("This pass cannot be directly constructed");
-  }
-
-  AArch64TTI(const AArch64TargetMachine *TM)
-      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
-        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
-    initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override { pushTTIStack(this); }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    TargetTransformInfo::getAnalysisUsage(AU);
-  }
-
-  /// Pass identification.
-  static char ID;
-
-  /// Provide necessary pointer adjustments for the two base classes.
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo *)this;
-    return this;
-  }
-
-  /// \name Scalar TTI Implementations
-  /// @{
-  unsigned getIntImmCost(int64_t Val) const;
-  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
-  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
-                         Type *Ty) const override;
-  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
-                         Type *Ty) const override;
-  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
-
-  /// @}
-
-  /// \name Vector TTI Implementations
-  /// @{
-
-  unsigned getNumberOfRegisters(bool Vector) const override {
-    if (Vector) {
-      if (ST->hasNEON())
-        return 32;
-      return 0;
-    }
-    return 31;
-  }
-
-  unsigned getRegisterBitWidth(bool Vector) const override {
-    if (Vector) {
-      if (ST->hasNEON())
-        return 128;
-      return 0;
-    }
-    return 64;
-  }
-
-  unsigned getMaxInterleaveFactor() const override;
-
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
-      override;
-
-  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
-      override;
-
-  unsigned getArithmeticInstrCost(
-      unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
-      OperandValueKind Opd2Info = OK_AnyValue,
-      OperandValueProperties Opd1PropInfo = OP_None,
-      OperandValueProperties Opd2PropInfo = OP_None) const override;
-
-  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
-
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
-      override;
-
-  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) const override;
-
-  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const override;
-
-  void getUnrollingPreferences(const Function *F, Loop *L,
-                               UnrollingPreferences &UP) const override;
-
-
-  /// @}
-};
-
-} // end anonymous namespace
-
-INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
-                   "AArch64 Target Transform Info", true, true, false)
-char AArch64TTI::ID = 0;
-
-ImmutablePass *
-llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
-  return new AArch64TTI(TM);
-}
-
 /// \brief Calculate the cost of materializing a 64-bit value. This helper
 /// method might only calculate a fraction of a larger immediate. Therefore it
 /// is valid to return a cost of ZERO.
-unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
+unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) {
   // Check if the immediate can be encoded within an instruction.
   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
     return 0;
@@ -163,7 +36,7 @@ unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
 }
 
 /// \brief Calculate the cost of materializing the given constant.
-unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -187,25 +60,25 @@ unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
   return std::max(1U, Cost);
 }
 
-unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
-                                 const APInt &Imm, Type *Ty) const {
+unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                       const APInt &Imm, Type *Ty) {
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   // There is no cost model for constants with a bit size of 0. Return TCC_Free
   // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;
 
   unsigned ImmIdx = ~0U;
   switch (Opcode) {
   default:
-    return TCC_Free;
+    return TTI::TCC_Free;
   case Instruction::GetElementPtr:
     // Always hoist the base address of a GetElementPtr.
     if (Idx == 0)
-      return 2 * TCC_Basic;
-    return TCC_Free;
+      return 2 * TTI::TCC_Basic;
+    return TTI::TCC_Free;
   case Instruction::Store:
     ImmIdx = 0;
     break;
@@ -227,7 +100,7 @@ unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
   case Instruction::LShr:
   case Instruction::AShr:
     if (Idx == 1)
-      return TCC_Free;
+      return TTI::TCC_Free;
     break;
   case Instruction::Trunc:
   case Instruction::ZExt:
@@ -245,26 +118,27 @@ unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
 
   if (Idx == ImmIdx) {
     unsigned NumConstants = (BitSize + 63) / 64;
-    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
-    return (Cost <= NumConstants * TCC_Basic)
-      ? static_cast<unsigned>(TCC_Free) : Cost;
+    unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TTI::TCC_Basic)
+               ? static_cast<unsigned>(TTI::TCC_Free)
+               : Cost;
   }
-  return AArch64TTI::getIntImmCost(Imm, Ty);
+  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
 }
 
-unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
-                                 const APInt &Imm, Type *Ty) const {
+unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                       const APInt &Imm, Type *Ty) {
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   // There is no cost model for constants with a bit size of 0. Return TCC_Free
   // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;
 
   switch (IID) {
   default:
-    return TCC_Free;
+    return TTI::TCC_Free;
   case Intrinsic::sadd_with_overflow:
   case Intrinsic::uadd_with_overflow:
   case Intrinsic::ssub_with_overflow:
@@ -273,35 +147,36 @@ unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
   case Intrinsic::umul_with_overflow:
     if (Idx == 1) {
       unsigned NumConstants = (BitSize + 63) / 64;
-      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
-      return (Cost <= NumConstants * TCC_Basic)
-        ? static_cast<unsigned>(TCC_Free) : Cost;
+      unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+      return (Cost <= NumConstants * TTI::TCC_Basic)
+                 ? static_cast<unsigned>(TTI::TCC_Free)
+                 : Cost;
     }
     break;
   case Intrinsic::experimental_stackmap:
     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
-      return TCC_Free;
+      return TTI::TCC_Free;
     break;
   case Intrinsic::experimental_patchpoint_void:
   case Intrinsic::experimental_patchpoint_i64:
     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
-      return TCC_Free;
+      return TTI::TCC_Free;
     break;
   }
-  return AArch64TTI::getIntImmCost(Imm, Ty);
+  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
 }
 
-AArch64TTI::PopcntSupportKind
-AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
+TargetTransformInfo::PopcntSupportKind
+AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
   if (TyWidth == 32 || TyWidth == 64)
-    return PSK_FastHardware;
+    return TTI::PSK_FastHardware;
   // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
-  return PSK_Software;
+  return TTI::PSK_Software;
 }
 
-unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
-                                    Type *Src) const {
+unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+                                          Type *Src) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -309,7 +184,7 @@ unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   EVT DstTy = TLI->getValueType(Dst);
 
   if (!SrcTy.isSimple() || !DstTy.isSimple())
-    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+    return BaseT::getCastInstrCost(Opcode, Dst, Src);
 
   static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
     // LowerVectorINT_TO_FP:
@@ -380,11 +255,11 @@ unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   if (Idx != -1)
     return ConversionTbl[Idx].Cost;
 
-  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+  return BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
-unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                      unsigned Index) const {
+unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                            unsigned Index) {
   assert(Val->isVectorTy() && "This must be a vector type");
 
   if (Index != -1U) {
@@ -408,10 +283,10 @@ unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
   return 2;
 }
 
-unsigned AArch64TTI::getArithmeticInstrCost(
-    unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
-    OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
-    OperandValueProperties Opd2PropInfo) const {
+unsigned AArch64TTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
   // Legalize the type.
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
 
@@ -442,8 +317,8 @@ unsigned AArch64TTI::getArithmeticInstrCost(
 
   switch (ISD) {
   default:
-    return TargetTransformInfo::getArithmeticInstrCost(
-        Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
+    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                         Opd1PropInfo, Opd2PropInfo);
   case ISD::ADD:
   case ISD::MUL:
   case ISD::XOR:
@@ -455,7 +330,7 @@ unsigned AArch64TTI::getArithmeticInstrCost(
   }
 }
 
-unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
@@ -470,8 +345,8 @@ unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
   return 1;
 }
 
-unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                      Type *CondTy) const {
+unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                            Type *CondTy) {
 
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   // We don't lower vector selects well that are wider than the register width.
@@ -498,12 +373,12 @@ unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
         return VectorSelectTbl[Idx].Cost;
     }
   }
-  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
-unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
-                                   unsigned Alignment,
-                                   unsigned AddressSpace) const {
+unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                         unsigned Alignment,
+                                         unsigned AddressSpace) {
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
 
   if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
@@ -531,7 +406,7 @@ unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
   return LT.first;
 }
 
-unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const {
+unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
   unsigned Cost = 0;
   for (auto *I : Tys) {
     if (!I->isVectorTy())
@@ -543,14 +418,94 @@ unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const {
   return Cost;
 }
 
-unsigned AArch64TTI::getMaxInterleaveFactor() const {
+unsigned AArch64TTIImpl::getMaxInterleaveFactor() {
   if (ST->isCortexA57())
     return 4;
   return 2;
 }
 
-void AArch64TTI::getUnrollingPreferences(const Function *F, Loop *L,
-                                         UnrollingPreferences &UP) const {
+void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
+                                             TTI::UnrollingPreferences &UP) {
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 }
+
+Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                                         Type *ExpectedType) {
+  switch (Inst->getIntrinsicID()) {
+  default:
+    return nullptr;
+  case Intrinsic::aarch64_neon_st2:
+  case Intrinsic::aarch64_neon_st3:
+  case Intrinsic::aarch64_neon_st4: {
+    // Create a struct type
+    StructType *ST = dyn_cast<StructType>(ExpectedType);
+    if (!ST)
+      return nullptr;
+    unsigned NumElts = Inst->getNumArgOperands() - 1;
+    if (ST->getNumElements() != NumElts)
+      return nullptr;
+    for (unsigned i = 0, e = NumElts; i != e; ++i) {
+      if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
+        return nullptr;
+    }
+    Value *Res = UndefValue::get(ExpectedType);
+    IRBuilder<> Builder(Inst);
+    for (unsigned i = 0, e = NumElts; i != e; ++i) {
+      Value *L = Inst->getArgOperand(i);
+      Res = Builder.CreateInsertValue(Res, L, i);
+    }
+    return Res;
+  }
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_ld4:
+    if (Inst->getType() == ExpectedType)
+      return Inst;
+    return nullptr;
+  }
+}
+
+bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                        MemIntrinsicInfo &Info) {
+  switch (Inst->getIntrinsicID()) {
+  default:
+    break;
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_ld4:
+    Info.ReadMem = true;
+    Info.WriteMem = false;
+    Info.Vol = false;
+    Info.NumMemRefs = 1;
+    Info.PtrVal = Inst->getArgOperand(0);
+    break;
+  case Intrinsic::aarch64_neon_st2:
+  case Intrinsic::aarch64_neon_st3:
+  case Intrinsic::aarch64_neon_st4:
+    Info.ReadMem = false;
+    Info.WriteMem = true;
+    Info.Vol = false;
+    Info.NumMemRefs = 1;
+    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
+    break;
+  }
+
+  switch (Inst->getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_st2:
+    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
+    break;
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_st3:
+    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
+    break;
+  case Intrinsic::aarch64_neon_ld4:
+  case Intrinsic::aarch64_neon_st4:
+    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
+    break;
+  }
+  return true;
+}
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
new file mode 100644
index 0000000..dd3fd1f
--- /dev/null
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -0,0 +1,147 @@
+//===-- AArch64TargetTransformInfo.h - AArch64 specific TTI -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// AArch64 target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
+
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+
+namespace llvm {
+
+class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
+  typedef BasicTTIImplBase<AArch64TTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const AArch64TargetMachine *TM;
+  const AArch64Subtarget *ST;
+  const AArch64TargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+
+  const AArch64Subtarget *getST() const { return ST; }
+  const AArch64TargetLowering *getTLI() const { return TLI; }
+
+  enum MemIntrinsicType {
+    VECTOR_LDST_TWO_ELEMENTS,
+    VECTOR_LDST_THREE_ELEMENTS,
+    VECTOR_LDST_FOUR_ELEMENTS
+  };
+
+public:
+  explicit AArch64TTIImpl(const AArch64TargetMachine *TM, Function &F)
+      : BaseT(TM), TM(TM), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  AArch64TTIImpl(const AArch64TTIImpl &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)), TM(Arg.TM), ST(Arg.ST),
+        TLI(Arg.TLI) {}
+  AArch64TTIImpl(AArch64TTIImpl &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))), TM(std::move(Arg.TM)),
+        ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {}
+  AArch64TTIImpl &operator=(const AArch64TTIImpl &RHS) {
+    BaseT::operator=(static_cast<const BaseT &>(RHS));
+    TM = RHS.TM;
+    ST = RHS.ST;
+    TLI = RHS.TLI;
+    return *this;
+  }
+  AArch64TTIImpl &operator=(AArch64TTIImpl &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    TM = std::move(RHS.TM);
+    ST = std::move(RHS.ST);
+    TLI = std::move(RHS.TLI);
+    return *this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  using BaseT::getIntImmCost;
+  unsigned getIntImmCost(int64_t Val);
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty);
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty);
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty);
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector) {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 32;
+      return 0;
+    }
+    return 31;
+  }
+
+  unsigned getRegisterBitWidth(bool Vector) {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 128;
+      return 0;
+    }
+    return 64;
+  }
+
+  unsigned getMaxInterleaveFactor();
+
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
+  unsigned getAddressComputationCost(Type *Ty, bool IsComplex);
+
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace);
+
+  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
+
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                           Type *ExpectedType);
+
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
+
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 98e0ea8..1960c99 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -210,9 +210,9 @@ private:
   struct SysRegOp {
     const char *Data;
     unsigned Length;
-    uint64_t FeatureBits; // We need to pass through information about which
-                          // core we are compiling for so that the SysReg
-                          // Mappers can appropriately conditionalize.
+    uint32_t MRSReg;
+    uint32_t MSRReg;
+    uint32_t PStateField;
   };
 
   struct SysCRImmOp {
@@ -374,11 +374,6 @@ public:
     return StringRef(SysReg.Data, SysReg.Length);
   }
 
-  uint64_t getSysRegFeatureBits() const {
-    assert(Kind == k_SysReg && "Invalid access!");
-    return SysReg.FeatureBits;
-  }
-
   unsigned getSysCR() const {
     assert(Kind == k_SysCR && "Invalid access!");
     return SysCRImm.Val;
@@ -855,28 +850,17 @@ public:
   bool isMRSSystemRegister() const {
     if (!isSysReg()) return false;
 
-    bool IsKnownRegister;
-    auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits());
-    Mapper.fromString(getSysReg(), IsKnownRegister);
-
-    return IsKnownRegister;
+    return SysReg.MRSReg != -1U;
   }
   bool isMSRSystemRegister() const {
     if (!isSysReg()) return false;
 
-    bool IsKnownRegister;
-    auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits());
-    Mapper.fromString(getSysReg(), IsKnownRegister);
-
-    return IsKnownRegister;
+    return SysReg.MSRReg != -1U;
   }
   bool isSystemPStateField() const {
     if (!isSysReg()) return false;
 
-    bool IsKnownRegister;
-    AArch64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister);
-
-    return IsKnownRegister;
+    return SysReg.PStateField != -1U;
   }
   bool isReg() const override { return Kind == k_Register && !Reg.isVector; }
   bool isVectorReg() const { return Kind == k_Register && Reg.isVector; }
@@ -1454,31 +1438,19 @@ public:
   void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
 
-    bool Valid;
-    auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits());
-    uint32_t Bits = Mapper.fromString(getSysReg(), Valid);
-
-    Inst.addOperand(MCOperand::CreateImm(Bits));
+    Inst.addOperand(MCOperand::CreateImm(SysReg.MRSReg));
   }
 
   void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
 
-    bool Valid;
-    auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits());
-    uint32_t Bits = Mapper.fromString(getSysReg(), Valid);
-
-    Inst.addOperand(MCOperand::CreateImm(Bits));
+    Inst.addOperand(MCOperand::CreateImm(SysReg.MSRReg));
   }
 
   void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
 
-    bool Valid;
-    uint32_t Bits =
-        AArch64PState::PStateMapper().fromString(getSysReg(), Valid);
-
-    Inst.addOperand(MCOperand::CreateImm(Bits));
+    Inst.addOperand(MCOperand::CreateImm(SysReg.PStateField));
   }
 
   void addSysCROperands(MCInst &Inst, unsigned N) const {
@@ -1645,12 +1617,17 @@ public:
     return Op;
   }
 
-  static std::unique_ptr<AArch64Operand>
-  CreateSysReg(StringRef Str, SMLoc S, uint64_t FeatureBits, MCContext &Ctx) {
+  static std::unique_ptr<AArch64Operand> CreateSysReg(StringRef Str, SMLoc S,
+                                                      uint32_t MRSReg,
+                                                      uint32_t MSRReg,
+                                                      uint32_t PStateField,
+                                                      MCContext &Ctx) {
     auto Op = make_unique<AArch64Operand>(k_SysReg, Ctx);
     Op->SysReg.Data = Str.data();
     Op->SysReg.Length = Str.size();
-    Op->SysReg.FeatureBits = FeatureBits;
+    Op->SysReg.MRSReg = MRSReg;
+    Op->SysReg.MSRReg = MSRReg;
+    Op->SysReg.PStateField = PStateField;
     Op->StartLoc = S;
     Op->EndLoc = S;
     return Op;
@@ -2643,8 +2620,24 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
   if (Tok.isNot(AsmToken::Identifier))
     return MatchOperand_NoMatch;
 
-  Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), getLoc(),
-                     STI.getFeatureBits(), getContext()));
+  bool IsKnown;
+  auto MRSMapper = AArch64SysReg::MRSMapper(STI.getFeatureBits());
+  uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), IsKnown);
+  assert(IsKnown == (MRSReg != -1U) &&
+         "register should be -1 if and only if it's unknown");
+
+  auto MSRMapper = AArch64SysReg::MSRMapper(STI.getFeatureBits());
+  uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), IsKnown);
+  assert(IsKnown == (MSRReg != -1U) &&
+         "register should be -1 if and only if it's unknown");
+
+  uint32_t PStateField =
+      AArch64PState::PStateMapper().fromString(Tok.getString(), IsKnown);
+  assert(IsKnown == (PStateField != -1U) &&
+         "register should be -1 if and only if it's unknown");
+
+  Operands.push_back(AArch64Operand::CreateSysReg(
+      Tok.getString(), getLoc(), MRSReg, MSRReg, PStateField, getContext()));
   Parser.Lex(); // Eat identifier
 
   return MatchOperand_Success;
@@ -3927,7 +3920,6 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   }
 
   llvm_unreachable("Implement any new match types added!");
-  return true;
 }
 
 /// ParseDirective parses the arm specific directives
@@ -4140,7 +4132,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
   Parser.Lex(); // Consume the EndOfStatement
 
   auto pair = std::make_pair(IsVector, RegNum);
-  if (!RegisterReqs.insert(std::make_pair(Name, pair)).second)
+  if (RegisterReqs.insert(std::make_pair(Name, pair)).first->second != pair)
     Warning(L, "ignoring redefinition of register alias '" + Name + "'");
 
   return true;
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 878e29c..fb25089 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -221,13 +221,11 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
 
 static MCSymbolizer *
 createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
-                              LLVMSymbolLookupCallback SymbolLookUp,
-                              void *DisInfo, MCContext *Ctx,
-                              MCRelocationInfo *RelInfo) {
-  return new llvm::AArch64ExternalSymbolizer(
-                                     *Ctx,
-                                     std::unique_ptr<MCRelocationInfo>(RelInfo),
-                                     GetOpInfo, SymbolLookUp, DisInfo);
+                                LLVMSymbolLookupCallback SymbolLookUp,
+                                void *DisInfo, MCContext *Ctx,
+                                std::unique_ptr<MCRelocationInfo> &&RelInfo) {
+  return new llvm::AArch64ExternalSymbolizer(*Ctx, move(RelInfo), GetOpInfo,
+                                             SymbolLookUp, DisInfo);
 }
 
 extern "C" void LLVMInitializeAArch64Disassembler() {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 1dc506a..ed24343 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -51,7 +51,7 @@ enum ShiftExtendType {
 /// getShiftName - Get the string encoding for the shift type.
 static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) {
   switch (ST) {
-  default: assert(false && "unhandled shift type!");
+  default: llvm_unreachable("unhandled shift type!");
   case AArch64_AM::LSL: return "lsl";
   case AArch64_AM::LSR: return "lsr";
   case AArch64_AM::ASR: return "asr";
@@ -236,21 +236,22 @@ static inline bool processLogicalImmediate(uint64_t Imm, unsigned RegSize,
 
   if (isShiftedMask_64(Imm)) {
     I = countTrailingZeros(Imm);
-    CTO = CountTrailingOnes_64(Imm >> I);
+    assert(I < 64 && "undefined behavior");
+    CTO = countTrailingOnes(Imm >> I);
   } else {
     Imm |= ~Mask;
     if (!isShiftedMask_64(~Imm))
       return false;
 
-    unsigned CLO = CountLeadingOnes_64(Imm);
+    unsigned CLO = countLeadingOnes(Imm);
     I = 64 - CLO;
-    CTO = CLO + CountTrailingOnes_64(Imm) - (64 - Size);
+    CTO = CLO + countTrailingOnes(Imm) - (64 - Size);
   }
 
   // Encode in Immr the number of RORs it would take to get *from* 0^m 1^n
-  // to our target value, where i is the number of RORs to go the opposite
+  // to our target value, where I is the number of RORs to go the opposite
   // direction.
-  assert(Size > I && "I should be smaller than element Size");
+  assert(Size > I && "I should be smaller than element size");
   unsigned Immr = (Size - I) & (Size - 1);
 
   // If size has a 1 in the n'th bit, create a value that has zeroes in
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 0bc2f77..423da65 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -13,8 +13,8 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCFixupKindInfo.h"
 #include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionMachO.h"
@@ -132,7 +132,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
   int64_t SignedValue = static_cast<int64_t>(Value);
   switch (Kind) {
   default:
-    assert(false && "Unknown fixup kind!");
+    llvm_unreachable("Unknown fixup kind!");
   case AArch64::fixup_aarch64_pcrel_adr_imm21:
     if (SignedValue > 2097151 || SignedValue < -2097152)
       report_fatal_error("fixup value out of range");
@@ -239,7 +239,7 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
 
 void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
                                          MCInst &Res) const {
-  assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented");
+  llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
 }
 
 bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
@@ -317,42 +317,6 @@ public:
                                          MachO::CPU_SUBTYPE_ARM64_ALL);
   }
 
-  bool doesSectionRequireSymbols(const MCSection &Section) const override {
-    // Any section for which the linker breaks things into atoms needs to
-    // preserve symbols, including assembler local symbols, to identify
-    // those atoms. These sections are:
-    // Sections of type:
-    //
-    //    S_CSTRING_LITERALS  (e.g. __cstring)
-    //    S_LITERAL_POINTERS  (e.g.  objc selector pointers)
-    //    S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS
-    //
-    // Sections named:
-    //
-    //    __TEXT,__eh_frame
-    //    __TEXT,__ustring
-    //    __DATA,__cfstring
-    //    __DATA,__objc_classrefs
-    //    __DATA,__objc_catlist
-    //
-    // FIXME: It would be better if the compiler used actual linker local
-    // symbols for each of these sections rather than preserving what
-    // are ostensibly assembler local symbols.
-    const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
-    return (SMO.getType() == MachO::S_CSTRING_LITERALS ||
-            SMO.getType() == MachO::S_4BYTE_LITERALS ||
-            SMO.getType() == MachO::S_8BYTE_LITERALS ||
-            SMO.getType() == MachO::S_16BYTE_LITERALS ||
-            SMO.getType() == MachO::S_LITERAL_POINTERS ||
-            (SMO.getSegmentName() == "__TEXT" &&
-             (SMO.getSectionName() == "__eh_frame" ||
-              SMO.getSectionName() == "__ustring")) ||
-            (SMO.getSegmentName() == "__DATA" &&
-             (SMO.getSectionName() == "__cfstring" ||
-              SMO.getSectionName() == "__objc_classrefs" ||
-              SMO.getSectionName() == "__objc_catlist")));
-  }
-
   /// \brief Generate the compact unwind encoding from the CFI directives.
   uint32_t generateCompactUnwindEncoding(
                              ArrayRef<MCCFIInstruction> Instrs) const override {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index e05191e..5ea49c3 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -78,7 +78,7 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
       if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC)
         return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
       if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC)
-        return ELF::R_AARCH64_TLSDESC_ADR_PAGE;
+        return ELF::R_AARCH64_TLSDESC_ADR_PAGE21;
       llvm_unreachable("invalid symbol kind for ADRP relocation");
     case AArch64::fixup_aarch64_pcrel_branch26:
       return ELF::R_AARCH64_JUMP26;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 60e9c19..8dc6c30 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -177,7 +177,9 @@ private:
     MCELF::SetType(SD, ELF::STT_NOTYPE);
     MCELF::SetBinding(SD, ELF::STB_LOCAL);
     SD.setExternal(false);
-    Symbol->setSection(*getCurrentSection().first);
+    auto Sec = getCurrentSection().first;
+    assert(Sec && "need a section");
+    Symbol->setSection(*Sec);
 
     const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
     Symbol->setVariableValue(Value);
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 70b9329..f048474 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -37,6 +37,7 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
   AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
 
   PrivateGlobalPrefix = "L";
+  PrivateLabelPrefix = "L";
   SeparatorString = "%%";
   CommentString = ";";
   PointerSize = CalleeSaveStackSlotSize = 8;
@@ -79,6 +80,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
 
   CommentString = "//";
   PrivateGlobalPrefix = ".L";
+  PrivateLabelPrefix = ".L";
   Code32Directive = ".code\t32";
 
   Data16bitsDirective = "\t.hword\t";
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 5d03c21..9b88de7 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 
 #include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
 class Target;
@@ -27,7 +28,7 @@ struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
                               MCStreamer &Streamer) const override;
 };
 
-struct AArch64MCAsmInfoELF : public MCAsmInfo {
+struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
   explicit AArch64MCAsmInfoELF(StringRef TT);
 };
 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index c306b11..4756a19 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -437,8 +437,7 @@ AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
     return 3;
   }
 
-  assert(false && "Invalid value for vector shift amount!");
-  return 0;
+  llvm_unreachable("Invalid value for vector shift amount!");
 }
 
 uint32_t
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index e12a24b..0d9385d 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -10,6 +10,7 @@
 #include "MCTargetDesc/AArch64FixupKinds.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
@@ -33,7 +34,7 @@ public:
       : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype,
                                  /*UseAggressiveSymbolFolding=*/true) {}
 
-  void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+  void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
                         const MCAsmLayout &Layout, const MCFragment *Fragment,
                         const MCFixup &Fixup, MCValue Target,
                         uint64_t &FixedValue) override;
@@ -112,8 +113,36 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
   }
 }
 
+static bool canUseLocalRelocation(const MCSectionMachO &Section,
+                                  const MCSymbol &Symbol, unsigned Log2Size) {
+  // Debug info sections can use local relocations.
+  if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+    return true;
+
+  // Otherwise, only pointer sized relocations are supported.
+  if (Log2Size != 3)
+    return false;
+
+  // But only if they don't point to a few forbidden sections.
+  if (!Symbol.isInSection())
+    return true;
+  const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection());
+  if (RefSec.getType() == MachO::S_CSTRING_LITERALS)
+    return false;
+
+  if (RefSec.getSegmentName() == "__DATA" &&
+      RefSec.getSectionName() == "__cfstring")
+    return false;
+
+  if (RefSec.getSegmentName() == "__DATA" &&
+      RefSec.getSectionName() == "__objc_classrefs")
+    return false;
+
+  return true;
+}
+
 void AArch64MachObjectWriter::RecordRelocation(
-    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+    MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
     const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
     uint64_t &FixedValue) {
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
@@ -123,9 +152,9 @@ void AArch64MachObjectWriter::RecordRelocation(
   unsigned Log2Size = 0;
   int64_t Value = 0;
   unsigned Index = 0;
-  unsigned IsExtern = 0;
   unsigned Type = 0;
   unsigned Kind = Fixup.getKind();
+  const MCSymbolData *RelSymbol = nullptr;
 
   FixupOffset += Fixup.getOffset();
 
@@ -171,10 +200,8 @@ void AArch64MachObjectWriter::RecordRelocation(
     // FIXME: Should this always be extern?
     // SymbolNum of 0 indicates the absolute section.
     Type = MachO::ARM64_RELOC_UNSIGNED;
-    Index = 0;
 
     if (IsPCRel) {
-      IsExtern = 1;
       Asm.getContext().FatalError(Fixup.getLoc(),
                                   "PC relative absolute relocation!");
 
@@ -198,15 +225,12 @@ void AArch64MachObjectWriter::RecordRelocation(
         Layout.getSymbolOffset(&B_SD) ==
             Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) {
       // SymB is the PC, so use a PC-rel pointer-to-GOT relocation.
-      Index = A_Base->getIndex();
-      IsExtern = 1;
       Type = MachO::ARM64_RELOC_POINTER_TO_GOT;
       IsPCRel = 1;
       MachO::any_relocation_info MRE;
       MRE.r_word0 = FixupOffset;
-      MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
-                     (IsExtern << 27) | (Type << 28));
-      Writer->addRelocation(Fragment->getParent(), MRE);
+      MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+      Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
       return;
     } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
                Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
@@ -252,26 +276,31 @@ void AArch64MachObjectWriter::RecordRelocation(
                   ? 0
                   : Writer->getSymbolAddress(B_Base, Layout));
 
-    Index = A_Base->getIndex();
-    IsExtern = 1;
     Type = MachO::ARM64_RELOC_UNSIGNED;
 
     MachO::any_relocation_info MRE;
     MRE.r_word0 = FixupOffset;
-    MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
-                   (IsExtern << 27) | (Type << 28));
-    Writer->addRelocation(Fragment->getParent(), MRE);
+    MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+    Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
 
-    Index = B_Base->getIndex();
-    IsExtern = 1;
+    RelSymbol = B_Base;
     Type = MachO::ARM64_RELOC_SUBTRACTOR;
   } else { // A + constant
     const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
-    const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
-    const MCSymbolData *Base = Asm.getAtom(&SD);
     const MCSectionMachO &Section = static_cast<const MCSectionMachO &>(
         Fragment->getParent()->getSection());
 
+    bool CanUseLocalRelocation =
+        canUseLocalRelocation(Section, *Symbol, Log2Size);
+    if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) {
+      const MCSection &Sec = Symbol->getSection();
+      if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
+        Asm.addLocalUsedInReloc(*Symbol);
+    }
+
+    const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+    const MCSymbolData *Base = Asm.getAtom(&SD);
+
     // If the symbol is a variable and we weren't able to get a Base for it
     // (i.e., it's not in the symbol table associated with a section) resolve
     // the relocation based its expansion instead.
@@ -310,16 +339,13 @@ void AArch64MachObjectWriter::RecordRelocation(
     // sections, and for pointer-sized relocations (.quad), we allow section
     // relocations.  It's code sections that run into trouble.
     if (Base) {
-      Index = Base->getIndex();
-      IsExtern = 1;
+      RelSymbol = Base;
 
       // Add the local offset, if needed.
       if (Base != &SD)
         Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
     } else if (Symbol->isInSection()) {
-      // Pointer-sized relocations can use a local relocation. Otherwise,
-      // we have to be in a debug info section.
-      if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3)
+      if (!CanUseLocalRelocation)
         Asm.getContext().FatalError(
             Fixup.getLoc(),
             "unsupported relocation of local symbol '" + Symbol->getName() +
@@ -329,7 +355,6 @@ void AArch64MachObjectWriter::RecordRelocation(
       const MCSectionData &SymSD =
           Asm.getSectionData(SD.getSymbol().getSection());
       Index = SymSD.getOrdinal() + 1;
-      IsExtern = 0;
       Value += Writer->getSymbolAddress(&SD, Layout);
 
       if (IsPCRel)
@@ -362,16 +387,16 @@ void AArch64MachObjectWriter::RecordRelocation(
 
     MachO::any_relocation_info MRE;
     MRE.r_word0 = FixupOffset;
-    MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
-                   (IsExtern << 27) | (Type << 28));
-    Writer->addRelocation(Fragment->getParent(), MRE);
+    MRE.r_word1 =
+        (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+    Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
 
     // Now set up the Addend relocation.
     Type = MachO::ARM64_RELOC_ADDEND;
     Index = Value;
+    RelSymbol = nullptr;
     IsPCRel = 0;
     Log2Size = 2;
-    IsExtern = 0;
 
     // Put zero into the instruction itself. The addend is in the relocation.
     Value = 0;
@@ -383,9 +408,9 @@ void AArch64MachObjectWriter::RecordRelocation(
   // struct relocation_info (8 bytes)
   MachO::any_relocation_info MRE;
   MRE.r_word0 = FixupOffset;
-  MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
-                 (IsExtern << 27) | (Type << 28));
-  Writer->addRelocation(Fragment->getParent(), MRE);
+  MRE.r_word1 =
+      (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+  Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
 }
 
 MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 02db53a..d3cc068 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -34,16 +34,12 @@ FunctionPass *createA15SDOptimizerPass();
 FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
 FunctionPass *createARMExpandPseudoPass();
 FunctionPass *createARMGlobalBaseRegPass();
-FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
 FunctionPass *createARMConstantIslandPass();
 FunctionPass *createMLxExpansionPass();
 FunctionPass *createThumb2ITBlockPass();
 FunctionPass *createARMOptimizeBarriersPass();
 FunctionPass *createThumb2SizeReductionPass();
 
-/// \brief Creates an ARM-specific Target Transformation Info pass.
-ImmutablePass *createARMTargetTransformInfoPass(const ARMBaseTargetMachine *TM);
-
 void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                   ARMAsmPrinter &AP);
 
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 80b976b..f080c60 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -147,6 +147,11 @@ def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass",
 def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
                                        "NaCl trap">;
 
+// RenderScript-specific support for 64-bit long types on all targets
+def FeatureLong64 : SubtargetFeature<"long64", "UseLong64",
+                                     "true",
+                                     "long type is forced to be 64-bit">;
+
 // ARM ISAs.
 def HasV4TOps   : SubtargetFeature<"v4t", "HasV4TOps", "true",
                                    "Support ARM v4T instructions">;
@@ -270,17 +275,6 @@ def ProcKrait   : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
                                     FeatureHWDivARM]>;
 
 
-def FeatureAPCS  : SubtargetFeature<"apcs", "TargetABI", "ARM_ABI_APCS",
-                                   "Use the APCS ABI">;
-
-def FeatureAAPCS : SubtargetFeature<"aapcs", "TargetABI", "ARM_ABI_AAPCS",
-                                   "Use the AAPCS ABI">;
-
-// RenderScript-specific support for 64-bit long types on all targets
-def FeatureLong64 : SubtargetFeature<"long64", "UseLong64",
-                                     "true",
-                                     "long type is forced to be 64-bit">;
-
 class ProcNoItin<string Name, list<SubtargetFeature> Features>
  : Processor<Name, NoItineraries, Features>;
 
@@ -336,6 +330,12 @@ def : Processor<"mpcore",           ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
 // V6M Processors.
 def : Processor<"cortex-m0",        ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
                                                        FeatureDB, FeatureMClass]>;
+def : Processor<"cortex-m0plus",    ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
+                                                       FeatureDB, FeatureMClass]>;
+def : Processor<"cortex-m1",        ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
+                                                       FeatureDB, FeatureMClass]>;
+def : Processor<"sc000",            ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
+                                                       FeatureDB, FeatureMClass]>;
 
 // V6T2 Processors.
 def : Processor<"arm1156t2-s",      ARMV6Itineraries, [HasV6T2Ops,
@@ -395,10 +395,20 @@ def : ProcessorModel<"cortex-r5",   CortexA8Model,
                                      FeatureHasRAS, FeatureVFPOnlySP,
                                      FeatureD16, FeatureRClass]>;
 
+// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5.
+def : ProcessorModel<"cortex-r7",   CortexA8Model,
+                                    [ProcR5, HasV7Ops, FeatureDB,
+                                     FeatureVFP3, FeatureDSPThumb2,
+                                     FeatureHasRAS, FeatureVFPOnlySP,
+                                     FeatureD16, FeatureMP, FeatureRClass]>;
+
 // V7M Processors.
 def : ProcNoItin<"cortex-m3",       [HasV7Ops,
                                      FeatureThumb2, FeatureNoARM, FeatureDB,
                                      FeatureHWDiv, FeatureMClass]>;
+def : ProcNoItin<"sc300",           [HasV7Ops,
+                                     FeatureThumb2, FeatureNoARM, FeatureDB,
+                                     FeatureHWDiv, FeatureMClass]>;
 
 // V7EM Processors.
 def : ProcNoItin<"cortex-m4",       [HasV7Ops,
@@ -427,6 +437,10 @@ def : ProcNoItin<"cortex-a53",      [ProcA53, HasV8Ops, FeatureAClass,
 def : ProcNoItin<"cortex-a57",      [ProcA57, HasV8Ops, FeatureAClass,
                                     FeatureDB, FeatureFPARMv8,
                                     FeatureNEON, FeatureDSPThumb2]>;
+// FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
+def : ProcNoItin<"cortex-a72",      [ProcA57, HasV8Ops, FeatureAClass,
+                                    FeatureDB, FeatureFPARMv8,
+                                    FeatureNEON, FeatureDSPThumb2]>;
 
 // Cyclone is very similar to swift
 def : ProcessorModel<"cyclone",     SwiftModel,
diff --git a/lib/Target/ARM/ARMArchExtName.def b/lib/Target/ARM/ARMArchExtName.def
new file mode 100644
index 0000000..d6da50c
--- /dev/null
+++ b/lib/Target/ARM/ARMArchExtName.def
@@ -0,0 +1,30 @@
+//===-- ARMArchExtName.def - List of the ARM Extension names ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the list of the supported ARM Architecture Extension
+// names. These can be used to enable the extension through .arch_extension
+// attribute
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+#ifndef ARM_ARCHEXT_NAME
+#error "You must define ARM_ARCHEXT_NAME(NAME, ID) before including ARMArchExtName.h"
+#endif
+
+ARM_ARCHEXT_NAME("crc", CRC)
+ARM_ARCHEXT_NAME("crypto", CRYPTO)
+ARM_ARCHEXT_NAME("fp", FP)
+ARM_ARCHEXT_NAME("idiv", HWDIV)
+ARM_ARCHEXT_NAME("mp", MP)
+ARM_ARCHEXT_NAME("sec", SEC)
+ARM_ARCHEXT_NAME("virt", VIRT)
+
+#undef ARM_ARCHEXT_NAME
diff --git a/lib/Target/ARM/ARMArchExtName.h b/lib/Target/ARM/ARMArchExtName.h
new file mode 100644
index 0000000..bc1157a
--- /dev/null
+++ b/lib/Target/ARM/ARMArchExtName.h
@@ -0,0 +1,26 @@
+//===-- ARMArchExtName.h - List of the ARM Extension names ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMARCHEXTNAME_H
+#define LLVM_LIB_TARGET_ARM_ARMARCHEXTNAME_H
+
+namespace llvm {
+namespace ARM {
+
+enum ArchExtKind {
+  INVALID_ARCHEXT = 0
+
+#define ARM_ARCHEXT_NAME(NAME, ID) , ID
+#include "ARMArchExtName.def"
+};
+
+} // namespace ARM
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 695fd4d..2544a01 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -16,6 +16,7 @@
 #include "ARM.h"
 #include "ARMConstantPoolValue.h"
 #include "ARMFPUName.h"
+#include "ARMArchExtName.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMTargetMachine.h"
 #include "ARMTargetObjectFile.h"
@@ -57,6 +58,11 @@ using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
 
+ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM,
+                             std::unique_ptr<MCStreamer> Streamer)
+    : AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr),
+      InConstantPool(false) {}
+
 void ARMAsmPrinter::EmitFunctionBodyEnd() {
   // Make sure to terminate any constant pools that were at the end
   // of the function.
@@ -76,8 +82,7 @@ void ARMAsmPrinter::EmitFunctionEntryLabel() {
 }
 
 void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
-  uint64_t Size =
-      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(CV->getType());
+  uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType());
   assert(Size && "C++ constructor pointer had zero size!");
 
   const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
@@ -99,6 +104,7 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
 bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   AFI = MF.getInfo<ARMFunctionInfo>();
   MCP = MF.getConstantPool();
+  Subtarget = &MF.getSubtarget<ARMSubtarget>();
 
   SetupMachineFunction(MF);
 
@@ -120,6 +126,23 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   // Emit the rest of the function body.
   EmitFunctionBody();
 
+  // If we need V4T thumb mode Register Indirect Jump pads, emit them.
+  // These are created per function, rather than per TU, since it's
+  // relatively easy to exceed the thumb branch range within a TU.
+  if (! ThumbIndirectPads.empty()) {
+    OutStreamer.EmitAssemblerFlag(MCAF_Code16);
+    EmitAlignment(1);
+    for (unsigned i = 0, e = ThumbIndirectPads.size(); i < e; i++) {
+      OutStreamer.EmitLabel(ThumbIndirectPads[i].second);
+      EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tBX)
+        .addReg(ThumbIndirectPads[i].first)
+        // Add predicate operands.
+        .addImm(ARMCC::AL)
+        .addReg(0));
+    }
+    ThumbIndirectPads.clear();
+  }
+
   // We didn't modify anything.
   return false;
 }
@@ -183,7 +206,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
 
 MCSymbol *ARMAsmPrinter::
 GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   SmallString<60> Name;
   raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI"
     << getFunctionNumber() << '_' << uid << '_' << uid2;
@@ -192,7 +215,7 @@ GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const {
 
 
 MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel() const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   SmallString<60> Name;
   raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "SJLJEH"
     << getFunctionNumber();
@@ -414,7 +437,8 @@ void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
 }
 
 void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
-  if (Subtarget->isTargetMachO()) {
+  Triple TT(TM.getTargetTriple());
+  if (TT.isOSBinFormatMachO()) {
     Reloc::Model RelocM = TM.getRelocationModel();
     if (RelocM == Reloc::PIC_ || RelocM == Reloc::DynamicNoPIC) {
       // Declare all the text sections up front (before the DWARF sections
@@ -477,10 +501,17 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
   OutStreamer.EmitAssemblerFlag(MCAF_SyntaxUnified);
 
   // Emit ARM Build Attributes
-  if (Subtarget->isTargetELF())
+  if (TT.isOSBinFormatELF())
     emitAttributes();
 
-  if (!M.getModuleInlineAsm().empty() && Subtarget->isThumb())
+  // Use the triple's architecture and subarchitecture to determine
+  // if we're thumb for the purposes of the top level code16 assembler
+  // flag.
+  bool isThumb = TT.getArch() == Triple::thumb ||
+                 TT.getArch() == Triple::thumbeb ||
+                 TT.getSubArch() == Triple::ARMSubArch_v7m ||
+                 TT.getSubArch() == Triple::ARMSubArch_v6m;
+  if (!M.getModuleInlineAsm().empty() && isThumb)
     OutStreamer.EmitAssemblerFlag(MCAF_Code16);
 }
 
@@ -509,7 +540,8 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
 
 
 void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
-  if (Subtarget->isTargetMachO()) {
+  Triple TT(TM.getTargetTriple());
+  if (TT.isOSBinFormatMachO()) {
     // All darwin targets use mach-o.
     const TargetLoweringObjectFileMachO &TLOFMacho =
       static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
@@ -552,7 +584,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
   }
 
   // Emit a .data.rel section containing any stubs that were created.
-  if (Subtarget->isTargetELF()) {
+  if (TT.isOSBinFormatELF()) {
     const TargetLoweringObjectFileELF &TLOFELF =
       static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
 
@@ -562,7 +594,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
     MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
     if (!Stubs.empty()) {
       OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
-      const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+      const DataLayout *TD = TM.getDataLayout();
 
       for (auto &stub: Stubs) {
         OutStreamer.EmitLabel(stub.first);
@@ -612,69 +644,96 @@ void ARMAsmPrinter::emitAttributes() {
   MCTargetStreamer &TS = *OutStreamer.getTargetStreamer();
   ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
 
-  ATS.switchVendor("aeabi");
+  ATS.emitTextAttribute(ARMBuildAttrs::conformance, "2.09");
 
-  std::string CPUString = Subtarget->getCPUString();
+  ATS.switchVendor("aeabi");
 
-  // FIXME: remove krait check when GNU tools support krait cpu
-  if (CPUString != "generic" && CPUString != "krait")
-    ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
+  // Compute ARM ELF Attributes based on the default subtarget that
+  // we'd have constructed. The existing ARM behavior isn't LTO clean
+  // anyhow.
+  // FIXME: For ifunc related functions we could iterate over and look
+  // for a feature string that doesn't match the default one.
+  StringRef TT = TM.getTargetTriple();
+  StringRef CPU = TM.getTargetCPU();
+  StringRef FS = TM.getTargetFeatureString();
+  std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = ArchFS + "," + FS.str();
+    else
+      ArchFS = FS;
+  }
+  const ARMBaseTargetMachine &ATM =
+      static_cast<const ARMBaseTargetMachine &>(TM);
+  const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian());
+
+  std::string CPUString = STI.getCPUString();
+
+  if (CPUString != "generic") {
+    // FIXME: remove krait check when GNU tools support krait cpu
+    if (STI.isKrait()) {
+      ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9");
+      // We consider krait as a "cortex-a9" + hwdiv CPU
+      // Enable hwdiv through ".arch_extension idiv"
+      if (STI.hasDivide() || STI.hasDivideInARMMode())
+        ATS.emitArchExtension(ARM::HWDIV);
+    } else
+      ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
+  }
 
-  ATS.emitAttribute(ARMBuildAttrs::CPU_arch,
-                    getArchForCPU(CPUString, Subtarget));
+  ATS.emitAttribute(ARMBuildAttrs::CPU_arch, getArchForCPU(CPUString, &STI));
 
   // Tag_CPU_arch_profile must have the default value of 0 when "Architecture
   // profile is not applicable (e.g. pre v7, or cross-profile code)".
-  if (Subtarget->hasV7Ops()) {
-    if (Subtarget->isAClass()) {
+  if (STI.hasV7Ops()) {
+    if (STI.isAClass()) {
       ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
                         ARMBuildAttrs::ApplicationProfile);
-    } else if (Subtarget->isRClass()) {
+    } else if (STI.isRClass()) {
       ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
                         ARMBuildAttrs::RealTimeProfile);
-    } else if (Subtarget->isMClass()) {
+    } else if (STI.isMClass()) {
       ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
                         ARMBuildAttrs::MicroControllerProfile);
     }
   }
 
-  ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use, Subtarget->hasARMOps() ?
-                      ARMBuildAttrs::Allowed : ARMBuildAttrs::Not_Allowed);
-  if (Subtarget->isThumb1Only()) {
-    ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
-                      ARMBuildAttrs::Allowed);
-  } else if (Subtarget->hasThumb2()) {
+  ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use,
+                    STI.hasARMOps() ? ARMBuildAttrs::Allowed
+                                    : ARMBuildAttrs::Not_Allowed);
+  if (STI.isThumb1Only()) {
+    ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, ARMBuildAttrs::Allowed);
+  } else if (STI.hasThumb2()) {
     ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
                       ARMBuildAttrs::AllowThumb32);
   }
 
-  if (Subtarget->hasNEON()) {
+  if (STI.hasNEON()) {
     /* NEON is not exactly a VFP architecture, but GAS emit one of
      * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
-    if (Subtarget->hasFPARMv8()) {
-      if (Subtarget->hasCrypto())
+    if (STI.hasFPARMv8()) {
+      if (STI.hasCrypto())
         ATS.emitFPU(ARM::CRYPTO_NEON_FP_ARMV8);
       else
         ATS.emitFPU(ARM::NEON_FP_ARMV8);
-    }
-    else if (Subtarget->hasVFP4())
+    } else if (STI.hasVFP4())
       ATS.emitFPU(ARM::NEON_VFPV4);
     else
       ATS.emitFPU(ARM::NEON);
     // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture
-    if (Subtarget->hasV8Ops())
+    if (STI.hasV8Ops())
       ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
                         ARMBuildAttrs::AllowNeonARMv8);
   } else {
-    if (Subtarget->hasFPARMv8())
+    if (STI.hasFPARMv8())
       // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one
       // FPU, but there are two different names for it depending on the CPU.
-      ATS.emitFPU(Subtarget->hasD16() ? ARM::FPV5_D16 : ARM::FP_ARMV8);
-    else if (Subtarget->hasVFP4())
-      ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV4_D16 : ARM::VFPV4);
-    else if (Subtarget->hasVFP3())
-      ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV3_D16 : ARM::VFPV3);
-    else if (Subtarget->hasVFP2())
+      ATS.emitFPU(STI.hasD16() ? ARM::FPV5_D16 : ARM::FP_ARMV8);
+    else if (STI.hasVFP4())
+      ATS.emitFPU(STI.hasD16() ? ARM::VFPV4_D16 : ARM::VFPV4);
+    else if (STI.hasVFP3())
+      ATS.emitFPU(STI.hasD16() ? ARM::VFPV3_D16 : ARM::VFPV3);
+    else if (STI.hasVFP2())
       ATS.emitFPU(ARM::VFPV2);
   }
 
@@ -694,11 +753,42 @@ void ARMAsmPrinter::emitAttributes() {
 
   // Signal various FP modes.
   if (!TM.Options.UnsafeFPMath) {
-    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::Allowed);
-    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
-                      ARMBuildAttrs::Allowed);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+                      ARMBuildAttrs::IEEEDenormals);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions, ARMBuildAttrs::Allowed);
+
+    // If the user has permitted this code to choose the IEEE 754
+    // rounding at run-time, emit the rounding attribute.
+    if (TM.Options.HonorSignDependentRoundingFPMathOption)
+      ATS.emitAttribute(ARMBuildAttrs::ABI_FP_rounding, ARMBuildAttrs::Allowed);
+  } else {
+    if (!STI.hasVFP2()) {
+      // When the target doesn't have an FPU (by design or
+      // intention), the assumptions made on the software support
+      // mirror that of the equivalent hardware support *if it
+      // existed*. For v7 and better we indicate that denormals are
+      // flushed preserving sign, and for V6 we indicate that
+      // denormals are flushed to positive zero.
+      if (STI.hasV7Ops())
+        ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+                          ARMBuildAttrs::PreserveFPSign);
+    } else if (STI.hasVFP3()) {
+      // In VFPv4, VFPv4U, VFPv3, or VFPv3U, it is preserved. That is,
+      // the sign bit of the zero matches the sign bit of the input or
+      // result that is being flushed to zero.
+      ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+                        ARMBuildAttrs::PreserveFPSign);
+    }
+    // For VFPv2 implementations it is implementation defined as
+    // to whether denormals are flushed to positive zero or to
+    // whatever the sign of zero is (ARM v7AR ARM 2.7.5). Historically
+    // LLVM has chosen to flush this to positive zero (most likely for
+    // GCC compatibility), so that's the chosen value here (the
+    // absence of its emission implies zero).
   }
 
+  // TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath is the
+  // equivalent of GCC's -ffinite-math-only flag.
   if (TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath)
     ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
                       ARMBuildAttrs::Allowed);
@@ -706,7 +796,7 @@ void ARMAsmPrinter::emitAttributes() {
     ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
                       ARMBuildAttrs::AllowIEE754);
 
-  if (Subtarget->allowsUnalignedMem())
+  if (STI.allowsUnalignedMem())
     ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
                       ARMBuildAttrs::Allowed);
   else
@@ -719,21 +809,28 @@ void ARMAsmPrinter::emitAttributes() {
   ATS.emitAttribute(ARMBuildAttrs::ABI_align_preserved, 1);
 
   // ABI_HardFP_use attribute to indicate single precision FP.
-  if (Subtarget->isFPOnlySP())
+  if (STI.isFPOnlySP())
     ATS.emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
                       ARMBuildAttrs::HardFPSinglePrecision);
 
   // Hard float.  Use both S and D registers and conform to AAPCS-VFP.
-  if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard)
+  if (STI.isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard)
     ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS);
 
   // FIXME: Should we signal R9 usage?
 
-  if (Subtarget->hasFP16())
-      ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP);
+  if (STI.hasFP16())
+    ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP);
+
+  // FIXME: To support emitting this build attribute as GCC does, the
+  // -mfp16-format option and associated plumbing must be
+  // supported. For now the __fp16 type is exposed by default, so this
+  // attribute should be emitted with value 1.
+  ATS.emitAttribute(ARMBuildAttrs::ABI_FP_16bit_format,
+                    ARMBuildAttrs::FP16FormatIEEE);
 
-  if (Subtarget->hasMPExtension())
-      ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
+  if (STI.hasMPExtension())
+    ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
 
   // Hardware divide in ARM mode is part of base arch, starting from ARMv8.
   // If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M).
@@ -741,14 +838,14 @@ void ARMAsmPrinter::emitAttributes() {
   // arch, supplying -hwdiv downgrades the effective arch, via ClearImpliedBits.
   // AllowDIVExt is only emitted if hwdiv isn't available in the base arch;
   // otherwise, the default value (AllowDIVIfExists) applies.
-  if (Subtarget->hasDivideInARMMode() && !Subtarget->hasV8Ops())
-      ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);
+  if (STI.hasDivideInARMMode() && !STI.hasV8Ops())
+    ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);
 
   if (MMI) {
     if (const Module *SourceModule = MMI->getModule()) {
       // ABI_PCS_wchar_t to indicate wchar_t width
       // FIXME: There is no way to emit value 0 (wchar_t prohibited).
-      if (auto WCharWidthValue = cast_or_null<ConstantInt>(
+      if (auto WCharWidthValue = mdconst::extract_or_null<ConstantInt>(
               SourceModule->getModuleFlag("wchar_size"))) {
         int WCharWidth = WCharWidthValue->getZExtValue();
         assert((WCharWidth == 2 || WCharWidth == 4) &&
@@ -759,7 +856,7 @@ void ARMAsmPrinter::emitAttributes() {
       // ABI_enum_size to indicate enum width
       // FIXME: There is no way to emit value 0 (enums prohibited) or value 3
       //        (all enums contain a value needing 32 bits to encode).
-      if (auto EnumWidthValue = cast_or_null<ConstantInt>(
+      if (auto EnumWidthValue = mdconst::extract_or_null<ConstantInt>(
               SourceModule->getModuleFlag("min_enum_size"))) {
         int EnumWidth = EnumWidthValue->getZExtValue();
         assert((EnumWidth == 1 || EnumWidth == 4) &&
@@ -774,22 +871,20 @@ void ARMAsmPrinter::emitAttributes() {
   // it as another callee-saved register, but not as SB or a TLS pointer; It
   // would instead be nicer to push this from the frontend as metadata, as we do
   // for the wchar and enum size tags
-  if (Subtarget->isR9Reserved())
-      ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use,
-                        ARMBuildAttrs::R9Reserved);
+  if (STI.isR9Reserved())
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use, ARMBuildAttrs::R9Reserved);
   else
-      ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use,
-                        ARMBuildAttrs::R9IsGPR);
-
-  if (Subtarget->hasTrustZone() && Subtarget->hasVirtualization())
-      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
-                        ARMBuildAttrs::AllowTZVirtualization);
-  else if (Subtarget->hasTrustZone())
-      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
-                        ARMBuildAttrs::AllowTZ);
-  else if (Subtarget->hasVirtualization())
-      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
-                        ARMBuildAttrs::AllowVirtualization);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use, ARMBuildAttrs::R9IsGPR);
+
+  if (STI.hasTrustZone() && STI.hasVirtualization())
+    ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                      ARMBuildAttrs::AllowTZVirtualization);
+  else if (STI.hasTrustZone())
+    ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                      ARMBuildAttrs::AllowTZ);
+  else if (STI.hasVirtualization())
+    ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                      ARMBuildAttrs::AllowVirtualization);
 
   ATS.finishAttributeSection();
 }
@@ -858,9 +953,8 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
 
 void ARMAsmPrinter::
 EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
-  int Size =
-      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(MCPV->getType());
+  const DataLayout *DL = TM.getDataLayout();
+  int Size = TM.getDataLayout()->getTypeAllocSize(MCPV->getType());
 
   ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
 
@@ -1176,7 +1270,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
 #include "ARMGenMCPseudoLowering.inc"
 
 void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
 
   // If we just ended a constant pool, mark it as such.
   if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
@@ -1251,18 +1345,34 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
   case ARM::tBX_CALL: {
-    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVr)
-      .addReg(ARM::LR)
-      .addReg(ARM::PC)
-      // Add predicate operands.
-      .addImm(ARMCC::AL)
-      .addReg(0));
+    if (Subtarget->hasV5TOps())
+      llvm_unreachable("Expected BLX to be selected for v5t+");
+
+    // On ARM v4t, when doing a call from thumb mode, we need to ensure
+    // that the saved lr has its LSB set correctly (the arch doesn't
+    // have blx).
+    // So here we generate a bl to a small jump pad that does bx rN.
+    // The jump pads are emitted after the function body.
+
+    unsigned TReg = MI->getOperand(0).getReg();
+    MCSymbol *TRegSym = nullptr;
+    for (unsigned i = 0, e = ThumbIndirectPads.size(); i < e; i++) {
+      if (ThumbIndirectPads[i].first == TReg) {
+        TRegSym = ThumbIndirectPads[i].second;
+        break;
+      }
+    }
 
-    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tBX)
-      .addReg(MI->getOperand(0).getReg())
-      // Add predicate operands.
-      .addImm(ARMCC::AL)
-      .addReg(0));
+    if (!TRegSym) {
+      TRegSym = OutContext.CreateTempSymbol();
+      ThumbIndirectPads.push_back(std::make_pair(TReg, TRegSym));
+    }
+
+    // Create a link-saving branch to the Reg Indirect Jump Pad.
+    EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tBL)
+        // Predicate comes first here.
+        .addImm(ARMCC::AL).addReg(0)
+        .addExpr(MCSymbolRefExpr::Create(TRegSym, OutContext)));
     return;
   }
   case ARM::BMOVPCRX_CALL: {
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index 5ff20ce..50cb954 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -20,6 +20,7 @@ class ARMFunctionInfo;
 class MCOperand;
 class MachineConstantPool;
 class MachineOperand;
+class MCSymbol;
 
 namespace ARM {
   enum DW_ISA {
@@ -45,12 +46,14 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter {
   /// InConstantPool - Maintain state when emitting a sequence of constant
   /// pool entries so we can properly mark them as data regions.
   bool InConstantPool;
+
+  /// ThumbIndirectPads - These maintain a per-function list of jump pad
+  /// labels used for ARMv4t thumb code to make register indirect calls.
+  SmallVector<std::pair<unsigned, MCSymbol*>, 4> ThumbIndirectPads;
+
 public:
-  explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer), AFI(nullptr), MCP(nullptr),
-      InConstantPool(false) {
-    Subtarget = &TM.getSubtarget<ARMSubtarget>();
-  }
+  explicit ARMAsmPrinter(TargetMachine &TM,
+                         std::unique_ptr<MCStreamer> Streamer);
 
   const char *getPassName() const override {
     return "ARM Assembly / Object Emitter";
@@ -100,12 +103,13 @@ private:
                                    const MachineInstr *MI);
 
 public:
-  unsigned getISAEncoding() override {
+  unsigned getISAEncoding(const Function *F) override {
     // ARM/Darwin adds ISA to the DWARF info for each function.
-    if (!Subtarget->isTargetMachO())
+    Triple TT(TM.getTargetTriple());
+    if (!TT.isOSBinFormatMachO())
       return 0;
-    return Subtarget->isThumb() ?
-      ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm;
+    const ARMSubtarget &STI = TM.getSubtarget<ARMSubtarget>(*F);
+    return STI.isThumb() ? ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm;
   }
 
 private:
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 7a315c4..29ee22e 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1836,8 +1836,10 @@ bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI,
   return false;
 }
 
-MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
-                                               bool PreferFalse) const {
+MachineInstr *
+ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
+                                 SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+                                 bool PreferFalse) const {
   assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
          "Unknown select instruction");
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
@@ -1885,6 +1887,10 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
   NewMI.addOperand(FalseReg);
   NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
 
+  // Update SeenMIs set: register newly created MI and erase removed DefMI.
+  SeenMIs.insert(NewMI);
+  SeenMIs.erase(DefMI);
+
   // The caller will erase MI, but not DefMI.
   DefMI->eraseFromParent();
   return NewMI;
@@ -1985,8 +1991,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
                                       unsigned NumBytes) {
   // This optimisation potentially adds lots of load and store
   // micro-operations, it's only really a great benefit to code-size.
-  if (!MF.getFunction()->getAttributes().hasAttribute(
-          AttributeSet::FunctionIndex, Attribute::MinSize))
+  if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize))
     return false;
 
   // If only one register is pushed/popped, LLVM can use an LDR/STR
@@ -2394,7 +2399,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
   else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) {
     // Conservatively refuse to convert an instruction which isn't in the same
     // BB as the comparison.
-    // For CMPri, we need to check Sub, thus we can't return here.
+    // For CMPri w/ CmpValue != 0, a Sub may still be a candidate.
+    // Thus we cannot return here.
     if (CmpInstr->getOpcode() == ARM::CMPri ||
        CmpInstr->getOpcode() == ARM::t2CMPri)
       MI = nullptr;
@@ -2473,8 +2479,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
   case ARM::t2EORrr:
   case ARM::t2EORri: {
     // Scan forward for the use of CPSR
-    // When checking against MI: if it's a conditional code requires
-    // checking of V bit, then this is not safe to do.
+    // When checking against MI: if it's a conditional code that requires
+    // checking of the V bit or C bit, then this is not safe to do.
     // It is safe to remove CmpInstr if CPSR is redefined or killed.
     // If we are done with the basic block, we need to check whether CPSR is
     // live-out.
@@ -2541,19 +2547,30 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
             OperandsToUpdate.push_back(
                 std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
           }
-        } else
+        } else {
+          // No Sub, so this is x = <op> y, z; cmp x, 0.
           switch (CC) {
-          default:
+          case ARMCC::EQ: // Z
+          case ARMCC::NE: // Z
+          case ARMCC::MI: // N
+          case ARMCC::PL: // N
+          case ARMCC::AL: // none
             // CPSR can be used multiple times, we should continue.
             break;
-          case ARMCC::VS:
-          case ARMCC::VC:
-          case ARMCC::GE:
-          case ARMCC::LT:
-          case ARMCC::GT:
-          case ARMCC::LE:
+          case ARMCC::HS: // C
+          case ARMCC::LO: // C
+          case ARMCC::VS: // V
+          case ARMCC::VC: // V
+          case ARMCC::HI: // C Z
+          case ARMCC::LS: // C Z
+          case ARMCC::GE: // N V
+          case ARMCC::LT: // N V
+          case ARMCC::GT: // Z N V
+          case ARMCC::LE: // Z N V
+            // The instruction uses the V bit or C bit which is not safe.
             return false;
           }
+        }
       }
     }
 
@@ -3647,9 +3664,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     // instructions).
     if (Latency > 0 && Subtarget.isThumb2()) {
       const MachineFunction *MF = DefMI->getParent()->getParent();
-      if (MF->getFunction()->getAttributes().
-            hasAttribute(AttributeSet::FunctionIndex,
-                         Attribute::OptimizeForSize))
+      if (MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
         --Latency;
     }
     return Latency;
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 0ae291b..ecbcf5c 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -261,7 +261,9 @@ public:
                      unsigned &TrueOp, unsigned &FalseOp,
                      bool &Optimizable) const override;
 
-  MachineInstr *optimizeSelect(MachineInstr *MI, bool) const override;
+  MachineInstr *optimizeSelect(MachineInstr *MI,
+                               SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+                               bool) const override;
 
   /// FoldImmediate - 'Reg' is known to be defined by a move immediate
   /// instruction, try to fold the immediate into the use instruction.
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 6dc0493..7574727 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -60,9 +60,8 @@ ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti)
 
 const MCPhysReg*
 ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  const MCPhysReg *RegList = (STI.isTargetIOS() && !STI.isAAPCS_ABI())
-                                ? CSR_iOS_SaveList
-                                : CSR_AAPCS_SaveList;
+  const MCPhysReg *RegList =
+      STI.isTargetDarwin() ? CSR_iOS_SaveList : CSR_AAPCS_SaveList;
 
   if (!MF) return RegList;
 
@@ -95,8 +94,7 @@ ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
   if (CC == CallingConv::GHC)
     // This is academic becase all GHC calls are (supposed to be) tail calls
     return CSR_NoRegs_RegMask;
-  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
-    ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
+  return STI.isTargetDarwin() ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
 }
 
 const uint32_t*
@@ -117,8 +115,8 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const {
   if (CC == CallingConv::GHC)
     // This is academic becase all GHC calls are (supposed to be) tail calls
     return nullptr;
-  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
-    ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask;
+  return STI.isTargetDarwin() ? CSR_iOS_ThisReturn_RegMask
+                              : CSR_AAPCS_ThisReturn_RegMask;
 }
 
 BitVector ARMBaseRegisterInfo::
@@ -266,7 +264,7 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
 }
 
 void
-ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg,
                                         MachineFunction &MF) const {
   MachineRegisterInfo *MRI = &MF.getRegInfo();
   std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
@@ -356,10 +354,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
     return false;
   // We may also need a base pointer if there are dynamic allocas or stack
   // pointer adjustments around calls.
-  if (MF.getTarget()
-          .getSubtargetImpl()
-          ->getFrameLowering()
-          ->hasReservedCallFrame(MF))
+  if (MF.getSubtarget().getFrameLowering()->hasReservedCallFrame(MF))
     return true;
   // A base pointer is required and allowed.  Check that it isn't too late to
   // reserve it.
@@ -370,14 +365,10 @@ bool ARMBaseRegisterInfo::
 needsStackRealignment(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *F = MF.getFunction();
-  unsigned StackAlign = MF.getTarget()
-                            .getSubtargetImpl()
-                            ->getFrameLowering()
-                            ->getStackAlignment();
-  bool requiresRealignment =
-    ((MFI->getMaxAlignment() > StackAlign) ||
-     F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                     Attribute::StackAlignment));
+  unsigned StackAlign =
+      MF.getSubtarget().getFrameLowering()->getStackAlignment();
+  bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
+                              F->hasFnAttribute(Attribute::StackAlignment));
 
   return requiresRealignment && canRealignStack(MF);
 }
@@ -555,12 +546,13 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   //        and pick a real one.
   Offset += 128; // 128 bytes of spill slots
 
-  // If there is a frame pointer, try using it.
+  // If there's a frame pointer and the addressing mode allows it, try using it.
   // The FP is only available if there is no dynamic realignment. We
   // don't know for sure yet whether we'll need that, so we guess based
   // on whether there are any local variables that would trigger it.
   unsigned StackAlign = TFI->getStackAlignment();
-  if (TFI->hasFP(MF) &&
+  if (TFI->hasFP(MF) && 
+      (MI->getDesc().TSFlags & ARMII::AddrModeMask) != ARMII::AddrModeT1_s &&
       !((MFI->getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) {
     if (isFrameOffsetLegal(MI, FPOffset))
       return false;
@@ -677,7 +669,7 @@ bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
     NumBits = 8;
     break;
   case ARMII::AddrModeT1_s:
-    NumBits = 5;
+    NumBits = 8;
     Scale = 4;
     isSigned = false;
     break;
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index e9bc412..17027c2 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -135,7 +135,7 @@ public:
                              const MachineFunction &MF,
                              const VirtRegMap *VRM) const override;
 
-  void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+  void updateRegAllocHint(unsigned Reg, unsigned NewReg,
                           MachineFunction &MF) const override;
 
   bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const override;
diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h
index bd07236..d687568 100644
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h
@@ -31,7 +31,7 @@ static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
   static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
 
   // Try to get the first register.
-  if (unsigned Reg = State.AllocateReg(RegList, 4))
+  if (unsigned Reg = State.AllocateReg(RegList))
     State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
   else {
     // For the 2nd half of a v2f64, do not fail.
@@ -46,7 +46,7 @@ static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
   }
 
   // Try to get the second register.
-  if (unsigned Reg = State.AllocateReg(RegList, 4))
+  if (unsigned Reg = State.AllocateReg(RegList))
     State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
   else
     State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
@@ -76,11 +76,11 @@ static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
   static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 };
   static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
 
-  unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2);
+  unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList);
   if (Reg == 0) {
 
     // If we had R3 unallocated only, now we still must to waste it.
-    Reg = State.AllocateReg(GPRArgRegs, 4);
+    Reg = State.AllocateReg(GPRArgRegs);
     assert((!Reg || Reg == ARM::R3) && "Wrong GPRs usage for f64");
 
     // For the 2nd half of a v2f64, do not just fail.
@@ -126,7 +126,7 @@ static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
   static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
   static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
 
-  unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
+  unsigned Reg = State.AllocateReg(HiRegList, LoRegList);
   if (Reg == 0)
     return false; // we didn't handle it
 
@@ -160,6 +160,8 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                                    State);
 }
 
+static const uint16_t RRegList[] = { ARM::R0,  ARM::R1,  ARM::R2,  ARM::R3 };
+
 static const uint16_t SRegList[] = { ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,
                                      ARM::S4,  ARM::S5,  ARM::S6,  ARM::S7,
                                      ARM::S8,  ARM::S9,  ARM::S10, ARM::S11,
@@ -168,85 +170,114 @@ static const uint16_t DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3,
                                      ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
 static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
 
+
 // Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
 // has InConsecutiveRegs set, and that the last member also has
 // InConsecutiveRegsLast set. We must process all members of the HA before
 // we can allocate it, as we need to know the total number of registers that
 // will be needed in order to (attempt to) allocate a contiguous block.
-static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
-                                   CCValAssign::LocInfo &LocInfo,
-                                   ISD::ArgFlagsTy &ArgFlags, CCState &State) {
-  SmallVectorImpl<CCValAssign> &PendingHAMembers = State.getPendingLocs();
+static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
+                                          MVT &LocVT,
+                                          CCValAssign::LocInfo &LocInfo,
+                                          ISD::ArgFlagsTy &ArgFlags,
+                                          CCState &State) {
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
 
   // AAPCS HFAs must have 1-4 elements, all of the same type
-  assert(PendingHAMembers.size() < 4);
-  if (PendingHAMembers.size() > 0)
-    assert(PendingHAMembers[0].getLocVT() == LocVT);
+  if (PendingMembers.size() > 0)
+    assert(PendingMembers[0].getLocVT() == LocVT);
 
   // Add the argument to the list to be allocated once we know the size of the
-  // HA
-  PendingHAMembers.push_back(
-      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
-
-  if (ArgFlags.isInConsecutiveRegsLast()) {
-    assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 4 &&
-           "Homogeneous aggregates must have between 1 and 4 members");
-
-    // Try to allocate a contiguous block of registers, each of the correct
-    // size to hold one member.
-    const uint16_t *RegList;
-    unsigned NumRegs;
-    switch (LocVT.SimpleTy) {
-    case MVT::f32:
-      RegList = SRegList;
-      NumRegs = 16;
-      break;
-    case MVT::f64:
-      RegList = DRegList;
-      NumRegs = 8;
-      break;
-    case MVT::v2f64:
-      RegList = QRegList;
-      NumRegs = 4;
-      break;
-    default:
-      llvm_unreachable("Unexpected member type for HA");
-      break;
-    }
+  // aggregate. Store the type's required alignmnent as extra info for later: in
+  // the [N x i64] case all trace has been removed by the time we actually get
+  // to do allocation.
+  PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
+                                                   ArgFlags.getOrigAlign()));
 
-    unsigned RegResult =
-        State.AllocateRegBlock(RegList, NumRegs, PendingHAMembers.size());
-
-    if (RegResult) {
-      for (SmallVectorImpl<CCValAssign>::iterator It = PendingHAMembers.begin();
-           It != PendingHAMembers.end(); ++It) {
-        It->convertToReg(RegResult);
-        State.addLoc(*It);
-        ++RegResult;
-      }
-      PendingHAMembers.clear();
-      return true;
-    }
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  // Try to allocate a contiguous block of registers, each of the correct
+  // size to hold one member.
+  unsigned Align = std::min(PendingMembers[0].getExtraInfo(), 8U);
 
-    // Register allocation failed, fall back to the stack
+  ArrayRef<uint16_t> RegList;
+  switch (LocVT.SimpleTy) {
+  case MVT::i32: {
+    RegList = RRegList;
+    unsigned RegIdx = State.getFirstUnallocated(RegList);
 
-    // Mark all VFP regs as unavailable (AAPCS rule C.2.vfp)
-    for (unsigned regNo = 0; regNo < 16; ++regNo)
-      State.AllocateReg(SRegList[regNo]);
+    // First consume all registers that would give an unaligned object. Whether
+    // we go on stack or in regs, no-one will be using them in future.
+    unsigned RegAlign = RoundUpToAlignment(Align, 4) / 4;
+    while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
+      State.AllocateReg(RegList[RegIdx++]);
 
-    unsigned Size = LocVT.getSizeInBits() / 8;
-    unsigned Align = std::min(Size, 8U);
+    break;
+  }
+  case MVT::f32:
+    RegList = SRegList;
+    break;
+  case MVT::f64:
+    RegList = DRegList;
+    break;
+  case MVT::v2f64:
+    RegList = QRegList;
+    break;
+  default:
+    llvm_unreachable("Unexpected member type for block aggregate");
+    break;
+  }
+
+  unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+  if (RegResult) {
+    for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin();
+         It != PendingMembers.end(); ++It) {
+      It->convertToReg(RegResult);
+      State.addLoc(*It);
+      ++RegResult;
+    }
+    PendingMembers.clear();
+    return true;
+  }
+
+  // Register allocation failed, we'll be needing the stack
+  unsigned Size = LocVT.getSizeInBits() / 8;
+  if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) {
+    // If nothing else has used the stack until this point, a non-HFA aggregate
+    // can be split between regs and stack.
+    unsigned RegIdx = State.getFirstUnallocated(RegList);
+    for (auto &It : PendingMembers) {
+      if (RegIdx >= RegList.size())
+        It.convertToMem(State.AllocateStack(Size, Size));
+      else
+        It.convertToReg(State.AllocateReg(RegList[RegIdx++]));
 
-    for (auto It : PendingHAMembers) {
-      It.convertToMem(State.AllocateStack(Size, Align));
       State.addLoc(It);
     }
+    PendingMembers.clear();
+    return true;
+  } else if (LocVT != MVT::i32)
+    RegList = SRegList;
+
+  // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core)
+  for (auto Reg : RegList)
+    State.AllocateReg(Reg);
 
-    // All pending members have now been allocated
-    PendingHAMembers.clear();
+  for (auto &It : PendingMembers) {
+    It.convertToMem(State.AllocateStack(Size, Align));
+    State.addLoc(It);
+
+    // After the first item has been allocated, the rest are packed as tightly
+    // as possible. (E.g. an incoming i64 would have starting Align of 8, but
+    // we'll be allocating a bunch of i32 slots).
+    Align = Size;
   }
 
-  // This will be allocated by the last member of the HA
+  // All pending members have now been allocated
+  PendingMembers.clear();
+
+  // This will be allocated by the last member of the aggregate
   return true;
 }
 
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 526089b..7dd21ecbe 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -175,7 +175,7 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
   // HFAs are passed in a contiguous block of registers, or on the stack
-  CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_HA">>,
+  CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>,
 
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 29405eb..9966cd7 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -383,11 +383,9 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
                << MCP->getConstants().size() << " CP entries, aligned to "
                << MCP->getConstantPoolAlignment() << " bytes *****\n");
 
-  TII = (const ARMBaseInstrInfo *)MF->getTarget()
-            .getSubtargetImpl()
-            ->getInstrInfo();
+  STI = &static_cast<const ARMSubtarget &>(MF->getSubtarget());
+  TII = STI->getInstrInfo();
   AFI = MF->getInfo<ARMFunctionInfo>();
-  STI = &MF->getTarget().getSubtarget<ARMSubtarget>();
 
   isThumb = AFI->isThumbFunction();
   isThumb1 = AFI->isThumb1OnlyFunction();
@@ -532,7 +530,7 @@ ARMConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
   // identity mapping of CPI's to CPE's.
   const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
 
-  const DataLayout &TD = *MF->getSubtarget().getDataLayout();
+  const DataLayout &TD = *MF->getTarget().getDataLayout();
   for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
     unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
     assert(Size >= 4 && "Too small constant pool entry");
@@ -1270,7 +1268,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
       unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
       ImmBranches.push_back(ImmBranch(&UserMBB->back(),
                                       MaxDisp, false, UncondBr));
-      BBInfo[UserMBB->getNumber()].Size += Delta;
+      computeBlockSize(UserMBB);
       adjustBBOffsetsAfter(UserMBB);
       return;
     }
@@ -1952,7 +1950,9 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
       DEBUG(dbgs() << "Shrink JT: " << *MI << "     addr: " << *AddrMI
                    << "      lea: " << *LeaMI);
       unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
-      MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc))
+      MachineBasicBlock::iterator MI_JT = MI;
+      MachineInstr *NewJTMI =
+        BuildMI(*MBB, MI_JT, MI->getDebugLoc(), TII->get(Opc))
         .addReg(IdxReg, getKillRegState(IdxRegKill))
         .addJumpTableIndex(JTI, JTOP.getTargetFlags())
         .addImm(MI->getOperand(JTOpIdx+1).getImm());
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 2d80518..4438f50 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -22,8 +22,8 @@
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
@@ -887,6 +887,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
           unsigned MaxAlign = MFI->getMaxAlignment();
           assert (!AFI->isThumb1OnlyFunction());
           // Emit bic r6, r6, MaxAlign
+          assert(MaxAlign <= 256 && "The BIC instruction cannot encode "
+                                    "immediates larger than 256 with all lower "
+                                    "bits set.");
           unsigned bicOpc = AFI->isThumbFunction() ?
             ARM::t2BICri : ARM::BICri;
           AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
@@ -980,7 +983,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       unsigned LDRLITOpc = IsARM ? ARM::LDRi12 : ARM::tLDRpci;
       unsigned PICAddOpc =
           IsARM
-              ? (Opcode == ARM::LDRLIT_ga_pcrel_ldr ? ARM::PICADD : ARM::PICLDR)
+              ? (Opcode == ARM::LDRLIT_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
               : ARM::tPICADD;
 
       // We need a new const-pool entry to load from.
@@ -1129,7 +1132,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       // Add the source operands (D subregs).
       unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
       unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
-      MIB.addReg(D0).addReg(D1);
+      MIB.addReg(D0, SrcIsKill ? RegState::Kill : 0)
+         .addReg(D1, SrcIsKill ? RegState::Kill : 0);
 
       if (SrcIsKill)      // Add an implicit kill for the Q register.
         MIB->addRegisterKilled(SrcReg, TRI, true);
@@ -1342,11 +1346,9 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
 }
 
 bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
-  const TargetMachine &TM = MF.getTarget();
-  TII = static_cast<const ARMBaseInstrInfo *>(
-      TM.getSubtargetImpl()->getInstrInfo());
-  TRI = TM.getSubtargetImpl()->getRegisterInfo();
-  STI = &TM.getSubtarget<ARMSubtarget>();
+  STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget());
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
   AFI = MF.getInfo<ARMFunctionInfo>();
 
   bool Modified = false;
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index a5f635e..375d394 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -93,11 +93,11 @@ class ARMFastISel final : public FastISel {
     explicit ARMFastISel(FunctionLoweringInfo &funcInfo,
                          const TargetLibraryInfo *libInfo)
         : FastISel(funcInfo, libInfo),
+          Subtarget(
+              &static_cast<const ARMSubtarget &>(funcInfo.MF->getSubtarget())),
           M(const_cast<Module &>(*funcInfo.Fn->getParent())),
-          TM(funcInfo.MF->getTarget()),
-          TII(*TM.getSubtargetImpl()->getInstrInfo()),
-          TLI(*TM.getSubtargetImpl()->getTargetLowering()) {
-      Subtarget = &TM.getSubtarget<ARMSubtarget>();
+          TM(funcInfo.MF->getTarget()), TII(*Subtarget->getInstrInfo()),
+          TLI(*Subtarget->getTargetLowering()) {
       AFI = funcInfo.MF->getInfo<ARMFunctionInfo>();
       isThumb2 = AFI->isThumbFunction();
       Context = &funcInfo.Fn->getContext();
@@ -189,9 +189,7 @@ class ARMFastISel final : public FastISel {
     unsigned ARMSelectCallOp(bool UseReg);
     unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT);
 
-    const TargetLowering *getTargetLowering() {
-      return TM.getSubtargetImpl()->getTargetLowering();
-    }
+    const TargetLowering *getTargetLowering() { return &TLI; }
 
     // Call handling routines.
   private:
@@ -586,9 +584,8 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
 
   Reloc::Model RelocM = TM.getRelocationModel();
   bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM);
-  const TargetRegisterClass *RC = isThumb2 ?
-    (const TargetRegisterClass*)&ARM::rGPRRegClass :
-    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass
+                                           : &ARM::GPRRegClass;
   unsigned DestReg = createResultReg(RC);
 
   // FastISel TLS support on non-MachO is broken, punt to SelectionDAG.
@@ -893,9 +890,8 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) {
   // put the alloca address into a register, set the base type back to
   // register and continue. This should almost never happen.
   if (needsLowering && Addr.BaseType == Address::FrameIndexBase) {
-    const TargetRegisterClass *RC = isThumb2 ?
-      (const TargetRegisterClass*)&ARM::tGPRRegClass :
-      (const TargetRegisterClass*)&ARM::GPRRegClass;
+    const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass
+                                             : &ARM::GPRRegClass;
     unsigned ResultReg = createResultReg(RC);
     unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -1094,9 +1090,8 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
     // This is mostly going to be Neon/vector support.
     default: return false;
     case MVT::i1: {
-      unsigned Res = createResultReg(isThumb2 ?
-        (const TargetRegisterClass*)&ARM::tGPRRegClass :
-        (const TargetRegisterClass*)&ARM::GPRRegClass);
+      unsigned Res = createResultReg(isThumb2 ? &ARM::tGPRRegClass
+                                              : &ARM::GPRRegClass);
       unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
       SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1);
       AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -1500,9 +1495,8 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
   // Now set a register based on the comparison. Explicitly set the predicates
   // here.
   unsigned MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi;
-  const TargetRegisterClass *RC = isThumb2 ?
-    (const TargetRegisterClass*)&ARM::rGPRRegClass :
-    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass
+                                           : &ARM::GPRRegClass;
   unsigned DestReg = createResultReg(RC);
   Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0);
   unsigned ZeroReg = fastMaterializeConstant(Zero);
@@ -2490,19 +2484,12 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
     MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
     MFI->setFrameAddressIsTaken(true);
 
-    unsigned LdrOpc;
-    const TargetRegisterClass *RC;
-    if (isThumb2) {
-      LdrOpc =  ARM::t2LDRi12;
-      RC = (const TargetRegisterClass*)&ARM::tGPRRegClass;
-    } else {
-      LdrOpc =  ARM::LDRi12;
-      RC = (const TargetRegisterClass*)&ARM::GPRRegClass;
-    }
+    unsigned LdrOpc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
+    const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass
+                                             : &ARM::GPRRegClass;
 
     const ARMBaseRegisterInfo *RegInfo =
-        static_cast<const ARMBaseRegisterInfo *>(
-            TM.getSubtargetImpl()->getRegisterInfo());
+        static_cast<const ARMBaseRegisterInfo *>(Subtarget->getRegisterInfo());
     unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
     unsigned SrcReg = FramePtr;
 
@@ -3075,13 +3062,13 @@ namespace llvm {
   FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo,
                                 const TargetLibraryInfo *libInfo) {
     const TargetMachine &TM = funcInfo.MF->getTarget();
-
-    const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>();
+    const ARMSubtarget &STI =
+        static_cast<const ARMSubtarget &>(funcInfo.MF->getSubtarget());
     // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl.
     bool UseFastISel = false;
-    UseFastISel |= Subtarget->isTargetMachO() && !Subtarget->isThumb1Only();
-    UseFastISel |= Subtarget->isTargetLinux() && !Subtarget->isThumb();
-    UseFastISel |= Subtarget->isTargetNaCl() && !Subtarget->isThumb();
+    UseFastISel |= STI.isTargetMachO() && !STI.isThumb1Only();
+    UseFastISel |= STI.isTargetLinux() && !STI.isThumb();
+    UseFastISel |= STI.isTargetNaCl() && !STI.isThumb();
 
     if (UseFastISel) {
       // iOS always has a FP for backtracking, force other targets
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 80add7a..5a5bd57 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -164,9 +164,13 @@ static int sizeOfSPAdjustment(const MachineInstr *MI) {
 static bool WindowsRequiresStackProbe(const MachineFunction &MF,
                                       size_t StackSizeInBytes) {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  if (MFI->getStackProtectorIndex() > 0)
-    return StackSizeInBytes >= 4080;
-  return StackSizeInBytes >= 4096;
+  const Function *F = MF.getFunction();
+  unsigned StackProbeSize = (MFI->getStackProtectorIndex() > 0) ? 4080 : 4096;
+  if (F->hasFnAttribute("stack-probe-size"))
+    F->getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+  return StackSizeInBytes >= StackProbeSize;
 }
 
 namespace {
@@ -203,12 +207,77 @@ struct StackAdjustingInsts {
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
       BuildMI(MBB, std::next(Info.I), dl,
-              TII.get(TargetOpcode::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+              TII.get(TargetOpcode::CFI_INSTRUCTION))
+              .addCFIIndex(CFIIndex)
+              .setMIFlags(MachineInstr::FrameSetup);
     }
   }
 };
 }
 
+/// Emit an instruction sequence that will align the address in
+/// register Reg by zero-ing out the lower bits.  For versions of the
+/// architecture that support Neon, this must be done in a single
+/// instruction, since skipAlignedDPRCS2Spills assumes it is done in a
+/// single instruction. That function only gets called when optimizing
+/// spilling of D registers on a core with the Neon instruction set
+/// present.
+static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
+                                     const TargetInstrInfo &TII,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     DebugLoc DL, const unsigned Reg,
+                                     const unsigned Alignment,
+                                     const bool MustBeSingleInstruction) {
+  const ARMSubtarget &AST =
+      static_cast<const ARMSubtarget &>(MF.getSubtarget());
+  const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops();
+  const unsigned AlignMask = Alignment - 1;
+  const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+  assert(!AFI->isThumb1OnlyFunction() && "Thumb1 not supported");
+  if (!AFI->isThumbFunction()) {
+    // if the BFC instruction is available, use that to zero the lower
+    // bits:
+    //   bfc Reg, #0, log2(Alignment)
+    // otherwise use BIC, if the mask to zero the required number of bits
+    // can be encoded in the bic immediate field
+    //   bic Reg, Reg, Alignment-1
+    // otherwise, emit
+    //   lsr Reg, Reg, log2(Alignment)
+    //   lsl Reg, Reg, log2(Alignment)
+    if (CanUseBFC) {
+      AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg)
+                         .addReg(Reg, RegState::Kill)
+                         .addImm(~AlignMask));
+    } else if (AlignMask <= 255) {
+      AddDefaultCC(
+          AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg)
+                             .addReg(Reg, RegState::Kill)
+                             .addImm(AlignMask)));
+    } else {
+      assert(!MustBeSingleInstruction &&
+             "Shouldn't call emitAligningInstructions demanding a single "
+             "instruction to be emitted for large stack alignment for a target "
+             "without BFC.");
+      AddDefaultCC(AddDefaultPred(
+          BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+              .addReg(Reg, RegState::Kill)
+              .addImm(ARM_AM::getSORegOpc(ARM_AM::lsr, NrBitsToZero))));
+      AddDefaultCC(AddDefaultPred(
+          BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+              .addReg(Reg, RegState::Kill)
+              .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, NrBitsToZero))));
+    }
+  } else {
+    // Since this is only reached for Thumb-2 targets, the BFC instruction
+    // should always be available.
+    assert(CanUseBFC);
+    AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg)
+                       .addReg(Reg, RegState::Kill)
+                       .addImm(~AlignMask));
+  }
+}
+
 void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -218,15 +287,12 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   MCContext &Context = MMI.getContext();
   const TargetMachine &TM = MF.getTarget();
   const MCRegisterInfo *MRI = Context.getRegisterInfo();
-  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
-      TM.getSubtargetImpl()->getRegisterInfo());
-  const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(
-                                    TM.getSubtargetImpl()->getInstrInfo());
+  const ARMBaseRegisterInfo *RegInfo = STI.getRegisterInfo();
+  const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
   assert(!AFI->isThumb1OnlyFunction() &&
          "This emitPrologue does not support Thumb1!");
   bool isARM = !AFI->isThumbFunction();
-  unsigned Align =
-      TM.getSubtargetImpl()->getFrameLowering()->getStackAlignment();
+  unsigned Align = STI.getFrameLowering()->getStackAlignment();
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   unsigned NumBytes = MFI->getStackSize();
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
@@ -451,13 +517,15 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
           nullptr, MRI->getDwarfRegNum(FramePtr, true),
           -(ArgRegsSaveSize - FramePtrOffsetInPush)));
       BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
     } else {
       unsigned CFIIndex =
           MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
               nullptr, MRI->getDwarfRegNum(FramePtr, true)));
       BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
     }
   }
 
@@ -491,7 +559,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
         CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
             nullptr, MRI->getDwarfRegNum(Reg, true), MFI->getObjectOffset(FI)));
         BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-            .addCFIIndex(CFIIndex);
+            .addCFIIndex(CFIIndex)
+            .setMIFlags(MachineInstr::FrameSetup);
         break;
       }
     }
@@ -514,7 +583,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
           unsigned CFIIndex = MMI.addFrameInst(
               MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
           BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-              .addCFIIndex(CFIIndex);
+              .addCFIIndex(CFIIndex)
+              .setMIFlags(MachineInstr::FrameSetup);
         }
         break;
       }
@@ -535,7 +605,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
         unsigned CFIIndex = MMI.addFrameInst(
             MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
         BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-            .addCFIIndex(CFIIndex);
+            .addCFIIndex(CFIIndex)
+            .setMIFlags(MachineInstr::FrameSetup);
       }
     }
   }
@@ -561,28 +632,24 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   // realigned.
   if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) {
     unsigned MaxAlign = MFI->getMaxAlignment();
-    assert (!AFI->isThumb1OnlyFunction());
+    assert(!AFI->isThumb1OnlyFunction());
     if (!AFI->isThumbFunction()) {
-      // Emit bic sp, sp, MaxAlign
-      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
-                                          TII.get(ARM::BICri), ARM::SP)
-                                  .addReg(ARM::SP, RegState::Kill)
-                                  .addImm(MaxAlign-1)));
+      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
+                               false);
     } else {
-      // We cannot use sp as source/dest register here, thus we're emitting the
-      // following sequence:
+      // We cannot use sp as source/dest register here, thus we're using r4 to
+      // perform the calculations. We're emitting the following sequence:
       // mov r4, sp
-      // bic r4, r4, MaxAlign
+      // -- use emitAligningInstructions to produce best sequence to zero
+      // -- out lower bits in r4
       // mov sp, r4
       // FIXME: It will be better just to find spare register here.
       AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
-        .addReg(ARM::SP, RegState::Kill));
-      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
-                                          TII.get(ARM::t2BICri), ARM::R4)
-                                  .addReg(ARM::R4, RegState::Kill)
-                                  .addImm(MaxAlign-1)));
+                         .addReg(ARM::SP, RegState::Kill));
+      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
+                               false);
       AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
-        .addReg(ARM::R4, RegState::Kill));
+                         .addReg(ARM::R4, RegState::Kill));
     }
 
     AFI->setShouldRestoreSPFromFP(true);
@@ -612,11 +679,59 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
     AFI->setShouldRestoreSPFromFP(true);
 }
 
+// Resolve TCReturn pseudo-instruction
+void ARMFrameLowering::fixTCReturn(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc dl = MBBI->getDebugLoc();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  if (!(RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri))
+    return;
+
+  // Tail call return: adjust the stack pointer and jump to callee.
+  MBBI = MBB.getLastNonDebugInstr();
+  MachineOperand &JumpTarget = MBBI->getOperand(0);
+
+  // Jump to label or value in register.
+  if (RetOpcode == ARM::TCRETURNdi) {
+    unsigned TCOpcode = STI.isThumb() ?
+             (STI.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) :
+             ARM::TAILJMPd;
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode));
+    if (JumpTarget.isGlobal())
+      MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+                           JumpTarget.getTargetFlags());
+    else {
+      assert(JumpTarget.isSymbol());
+      MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+                            JumpTarget.getTargetFlags());
+    }
+
+    // Add the default predicate in Thumb mode.
+    if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0);
+  } else if (RetOpcode == ARM::TCRETURNri) {
+    BuildMI(MBB, MBBI, dl,
+            TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)).
+      addReg(JumpTarget.getReg(), RegState::Kill);
+  }
+
+  MachineInstr *NewMI = std::prev(MBBI);
+  for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
+    NewMI->addOperand(MBBI->getOperand(i));
+
+  // Delete the pseudo instruction TCRETURN.
+  MBB.erase(MBBI);
+  MBBI = NewMI;
+}
+
 void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
                                     MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
-  unsigned RetOpcode = MBBI->getOpcode();
   DebugLoc dl = MBBI->getDebugLoc();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -627,18 +742,17 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
          "This emitEpilogue does not support Thumb1!");
   bool isARM = !AFI->isThumbFunction();
 
-  unsigned Align = MF.getTarget()
-                       .getSubtargetImpl()
-                       ->getFrameLowering()
-                       ->getStackAlignment();
+  unsigned Align = STI.getFrameLowering()->getStackAlignment();
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   int NumBytes = (int)MFI->getStackSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
 
   // All calls are tail calls in GHC calling conv, and functions have no
   // prologue/epilogue.
-  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC) {
+    fixTCReturn(MF, MBB);
     return;
+  }
 
   if (!AFI->hasStackFrame()) {
     if (NumBytes - ArgRegsSaveSize != 0)
@@ -717,42 +831,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
     if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
   }
 
-  if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri) {
-    // Tail call return: adjust the stack pointer and jump to callee.
-    MBBI = MBB.getLastNonDebugInstr();
-    MachineOperand &JumpTarget = MBBI->getOperand(0);
-
-    // Jump to label or value in register.
-    if (RetOpcode == ARM::TCRETURNdi) {
-      unsigned TCOpcode = STI.isThumb() ?
-               (STI.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) :
-               ARM::TAILJMPd;
-      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode));
-      if (JumpTarget.isGlobal())
-        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
-                             JumpTarget.getTargetFlags());
-      else {
-        assert(JumpTarget.isSymbol());
-        MIB.addExternalSymbol(JumpTarget.getSymbolName(),
-                              JumpTarget.getTargetFlags());
-      }
-
-      // Add the default predicate in Thumb mode.
-      if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0);
-    } else if (RetOpcode == ARM::TCRETURNri) {
-      BuildMI(MBB, MBBI, dl,
-              TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)).
-        addReg(JumpTarget.getReg(), RegState::Kill);
-    }
-
-    MachineInstr *NewMI = std::prev(MBBI);
-    for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
-      NewMI->addOperand(MBBI->getOperand(i));
-
-    // Delete the pseudo instruction TCRETURN.
-    MBB.erase(MBBI);
-    MBBI = NewMI;
-  }
+  fixTCReturn(MF, MBB);
 
   if (ArgRegsSaveSize)
     emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize);
@@ -1062,15 +1141,16 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
   // The immediate is <= 64, so it doesn't need any special encoding.
   unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri;
   AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
-                              .addReg(ARM::SP)
-                              .addImm(8 * NumAlignedDPRCS2Regs)));
+                                  .addReg(ARM::SP)
+                                  .addImm(8 * NumAlignedDPRCS2Regs)));
 
-  // bic r4, r4, #align-1
-  Opc = isThumb ? ARM::t2BICri : ARM::BICri;
   unsigned MaxAlign = MF.getFrameInfo()->getMaxAlignment();
-  AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
-                              .addReg(ARM::R4, RegState::Kill)
-                              .addImm(MaxAlign - 1)));
+  // We must set parameter MustBeSingleInstruction to true, since
+  // skipAlignedDPRCS2Spills expects exactly 3 instructions to perform
+  // stack alignment.  Luckily, this can always be done since all ARM
+  // architecture versions that support Neon also support the BFC
+  // instruction.
+  emitAligningInstructions(MF, AFI, TII, MBB, MI, DL, ARM::R4, MaxAlign, true);
 
   // mov sp, r4
   // The stack pointer must be adjusted before spilling anything, otherwise
@@ -1387,25 +1467,20 @@ static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
     return;
 
   // Naked functions don't spill callee-saved registers.
-  if (MF.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                                     Attribute::Naked))
+  if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
     return;
 
   // We are planning to use NEON instructions vst1 / vld1.
-  if (!MF.getTarget().getSubtarget<ARMSubtarget>().hasNEON())
+  if (!static_cast<const ARMSubtarget &>(MF.getSubtarget()).hasNEON())
     return;
 
   // Don't bother if the default stack alignment is sufficiently high.
-  if (MF.getTarget()
-          .getSubtargetImpl()
-          ->getFrameLowering()
-          ->getStackAlignment() >= 8)
+  if (MF.getSubtarget().getFrameLowering()->getStackAlignment() >= 8)
     return;
 
   // Aligned spills require stack realignment.
-  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
-  if (!RegInfo->canRealignStack(MF))
+  if (!static_cast<const ARMBaseRegisterInfo *>(
+           MF.getSubtarget().getRegisterInfo())->canRealignStack(MF))
     return;
 
   // We always spill contiguous d-registers starting from d8. Count how many
@@ -1789,7 +1864,7 @@ static const uint64_t kSplitStackAvailable = 256;
 void ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   unsigned Opcode;
   unsigned CFIIndex;
-  const ARMSubtarget *ST = &MF.getTarget().getSubtarget<ARMSubtarget>();
+  const ARMSubtarget *ST = &MF.getSubtarget<ARMSubtarget>();
   bool Thumb = ST->isThumb();
 
   // Sadly, this currently doesn't support varargs, platforms other than
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index a83b773..b7be436 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -31,6 +31,8 @@ public:
   void emitPrologue(MachineFunction &MF) const override;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
+  void fixTCReturn(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
   bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI,
                                  const std::vector<CalleeSavedInfo> &CSI,
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index 0e4f81c..a84603b 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -44,10 +44,9 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
     if (LastMI && (MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) {
       MachineInstr *DefMI = LastMI;
       const MCInstrDesc &LastMCID = LastMI->getDesc();
-      const TargetMachine &TM =
-        MI->getParent()->getParent()->getTarget();
+      const MachineFunction *MF = MI->getParent()->getParent();
       const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(
-                                        TM.getSubtargetImpl()->getInstrInfo());
+                                        MF->getSubtarget().getInstrInfo());
 
       // Skip over one non-VFP / NEON instruction.
       if (!LastMI->isBarrier() &&
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6941579..6ebf640 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -70,7 +70,7 @@ public:
 
   bool runOnMachineFunction(MachineFunction &MF) override {
     // Reset the subtarget each time through.
-    Subtarget = &MF.getTarget().getSubtarget<ARMSubtarget>();
+    Subtarget = &MF.getSubtarget<ARMSubtarget>();
     SelectionDAGISel::runOnMachineFunction(MF);
     return true;
   }
@@ -992,18 +992,24 @@ bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
   Addr = N;
 
   unsigned Alignment = 0;
-  if (LSBaseSDNode *LSN = dyn_cast<LSBaseSDNode>(Parent)) {
+
+  MemSDNode *MemN = cast<MemSDNode>(Parent);
+
+  if (isa<LSBaseSDNode>(MemN) ||
+      ((MemN->getOpcode() == ARMISD::VST1_UPD ||
+        MemN->getOpcode() == ARMISD::VLD1_UPD) &&
+       MemN->getConstantOperandVal(MemN->getNumOperands() - 1) == 1)) {
     // This case occurs only for VLD1-lane/dup and VST1-lane instructions.
     // The maximum alignment is equal to the memory size being referenced.
-    unsigned LSNAlign = LSN->getAlignment();
-    unsigned MemSize = LSN->getMemoryVT().getSizeInBits() / 8;
-    if (LSNAlign >= MemSize && MemSize > 1)
+    unsigned MMOAlign = MemN->getAlignment();
+    unsigned MemSize = MemN->getMemoryVT().getSizeInBits() / 8;
+    if (MMOAlign >= MemSize && MemSize > 1)
       Alignment = MemSize;
   } else {
     // All other uses of addrmode6 are for intrinsics.  For now just record
     // the raw alignment value; it will be refined later based on the legal
     // alignment operands for the intrinsic.
-    Alignment = cast<MemIntrinsicSDNode>(Parent)->getAlignment();
+    Alignment = MemN->getAlignment();
   }
 
   Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
@@ -1191,6 +1197,11 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
                                             SDValue &Base, SDValue &OffImm) {
   if (N.getOpcode() == ISD::FrameIndex) {
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    // Only multiples of 4 are allowed for the offset, so the frame object
+    // alignment must be at least 4.
+    MachineFrameInfo *MFI = MF->getFrameInfo();
+    if (MFI->getObjectAlignment(FI) < 4)
+      MFI->setObjectAlignment(FI, 4);
     Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     OffImm = CurDAG->getTargetConstant(0, MVT::i32);
     return true;
@@ -1208,6 +1219,11 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
       Base = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        // For LHS+RHS to result in an offset that's a multiple of 4 the object
+        // indexed by the LHS must be 4-byte aligned.
+        MachineFrameInfo *MFI = MF->getFrameInfo();
+        if (MFI->getObjectAlignment(FI) < 4)
+          MFI->setObjectAlignment(FI, 4);
         Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
@@ -1784,6 +1800,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
   case MVT::v8i16: OpcodeIndex = 1; break;
   case MVT::v4f32:
   case MVT::v4i32: OpcodeIndex = 2; break;
+  case MVT::v2f64:
   case MVT::v2i64: OpcodeIndex = 3;
     assert(NumVecs == 1 && "v2i64 type only supported for VLD1");
     break;
@@ -1920,6 +1937,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
   case MVT::v8i16: OpcodeIndex = 1; break;
   case MVT::v4f32:
   case MVT::v4i32: OpcodeIndex = 2; break;
+  case MVT::v2f64:
   case MVT::v2i64: OpcodeIndex = 3;
     assert(NumVecs == 1 && "v2i64 type only supported for VST1");
     break;
@@ -2290,7 +2308,7 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
         assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
 
         // Note: The width operand is encoded as width-1.
-        unsigned Width = CountTrailingOnes_32(And_imm) - 1;
+        unsigned Width = countTrailingOnes(And_imm) - 1;
         unsigned LSB = Srl_imm;
 
         SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
@@ -2494,6 +2512,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
     SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     if (Subtarget->isThumb1Only()) {
+      // Set the alignment of the frame object to 4, to avoid having to generate
+      // more than one ADD
+      MachineFrameInfo *MFI = MF->getFrameInfo();
+      if (MFI->getObjectAlignment(FI) < 4)
+        MFI->setObjectAlignment(FI, 4);
       return CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
                                   CurDAG->getTargetConstant(0, MVT::i32));
     } else {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 0d0d81f..56290aa 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -156,11 +156,11 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
 }
 
-ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM)
-    : TargetLowering(TM) {
-  Subtarget = &TM.getSubtarget<ARMSubtarget>();
-  RegInfo = TM.getSubtargetImpl()->getRegisterInfo();
-  Itins = TM.getSubtargetImpl()->getInstrItineraryData();
+ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
+                                     const ARMSubtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
+  RegInfo = Subtarget->getRegisterInfo();
+  Itins = Subtarget->getInstrItineraryData();
 
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
@@ -404,22 +404,20 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM)
     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
   }
 
-  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
-      setTruncStoreAction((MVT::SimpleValueType)VT,
-                          (MVT::SimpleValueType)InnerVT, Expand);
-    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+  for (MVT VT : MVT::vector_valuetypes()) {
+    for (MVT InnerVT : MVT::vector_valuetypes()) {
+      setTruncStoreAction(VT, InnerVT, Expand);
+      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+    }
 
-    setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 
-    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::BSWAP, VT, Expand);
   }
 
   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
@@ -567,15 +565,18 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM)
     setTargetDAGCombine(ISD::FP_TO_SINT);
     setTargetDAGCombine(ISD::FP_TO_UINT);
     setTargetDAGCombine(ISD::FDIV);
+    setTargetDAGCombine(ISD::LOAD);
 
     // It is legal to extload from v4i8 to v4i16 or v4i32.
     MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8,
                   MVT::v4i16, MVT::v2i16,
                   MVT::v2i32};
     for (unsigned i = 0; i < 6; ++i) {
-      setLoadExtAction(ISD::EXTLOAD, Tys[i], Legal);
-      setLoadExtAction(ISD::ZEXTLOAD, Tys[i], Legal);
-      setLoadExtAction(ISD::SEXTLOAD, Tys[i], Legal);
+      for (MVT VT : MVT::integer_vector_valuetypes()) {
+        setLoadExtAction(ISD::EXTLOAD, VT, Tys[i], Legal);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, Tys[i], Legal);
+        setLoadExtAction(ISD::SEXTLOAD, VT, Tys[i], Legal);
+      }
     }
   }
 
@@ -617,11 +618,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM)
     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
   }
 
-  computeRegisterProperties();
+  computeRegisterProperties(Subtarget->getRegisterInfo());
 
   // ARM does not have floating-point extending loads.
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  for (MVT VT : MVT::fp_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+  }
 
   // ... or truncating stores
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -629,7 +632,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM)
   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 
   // ARM does not have i1 sign extending load.
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  for (MVT VT : MVT::integer_valuetypes())
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 
   // ARM supports all 4 flavors of integer indexed load / store.
   if (!Subtarget->isThumb1Only()) {
@@ -963,13 +967,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM)
 // of the difficulty prior to coalescing of modeling operand register classes
 // due to the common occurrence of cross class copies and subregister insertions
 // and extractions.
-std::pair<const TargetRegisterClass*, uint8_t>
-ARMTargetLowering::findRepresentativeClass(MVT VT) const{
+std::pair<const TargetRegisterClass *, uint8_t>
+ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                           MVT VT) const {
   const TargetRegisterClass *RRC = nullptr;
   uint8_t Cost = 1;
   switch (VT.SimpleTy) {
   default:
-    return TargetLowering::findRepresentativeClass(VT);
+    return TargetLowering::findRepresentativeClass(TRI, VT);
   // Use DPR as representative register class for all floating point
   // and vector types. Since there are 32 SPR registers and 32 DPR registers so
   // the cost is 1 for both f32 and f64.
@@ -1166,12 +1171,6 @@ ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
   return ARM::createFastISel(funcInfo, libInfo);
 }
 
-/// getMaximalGlobalOffset - Returns the maximal possible offset which can
-/// be used for loads / stores from the global.
-unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
-  return (Subtarget->isThumb1Only() ? 127 : 4095);
-}
-
 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
   unsigned NumVals = N->getNumValues();
   if (!NumVals)
@@ -1190,8 +1189,7 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
 
   // Load are scheduled for latency even if there instruction itinerary
   // is not available.
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
 
   if (MCID.getNumDefs() == 0)
@@ -1783,8 +1781,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // FIXME: handle tail calls differently.
   unsigned CallOpc;
-  bool HasMinSizeAttr = MF.getFunction()->getAttributes().hasAttribute(
-      AttributeSet::FunctionIndex, Attribute::MinSize);
+  bool HasMinSizeAttr = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
   if (Subtarget->isThumb()) {
     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
       CallOpc = ARMISD::CALL_NOLINK;
@@ -1815,9 +1812,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // Add a register mask operand representing the call-preserved registers.
   if (!isTailCall) {
     const uint32_t *Mask;
-    const TargetRegisterInfo *TRI =
-        getTargetMachine().getSubtargetImpl()->getRegisterInfo();
-    const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
+    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
     if (isThisReturn) {
       // For 'this' returns, use the R0-preserving mask if applicable
       Mask = ARI->getThisReturnPreservedMask(CallConv);
@@ -1865,7 +1860,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 void
 ARMTargetLowering::HandleByVal(
     CCState *State, unsigned &size, unsigned Align) const {
-  unsigned reg = State->AllocateReg(GPRArgRegs, 4);
+  unsigned reg = State->AllocateReg(GPRArgRegs);
   assert((State->getCallOrPrologue() == Prologue ||
           State->getCallOrPrologue() == Call) &&
          "unhandled ParmContext");
@@ -1875,7 +1870,7 @@ ARMTargetLowering::HandleByVal(
       unsigned AlignInRegs = Align / 4;
       unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
       for (unsigned i = 0; i < Waste; ++i)
-        reg = State->AllocateReg(GPRArgRegs, 4);
+        reg = State->AllocateReg(GPRArgRegs);
     }
     if (reg != 0) {
       unsigned excess = 4 * (ARM::R4 - reg);
@@ -1886,7 +1881,7 @@ ARMTargetLowering::HandleByVal(
       // remained registers.
       const unsigned NSAAOffset = State->getNextStackOffset();
       if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) {
-        while (State->AllocateReg(GPRArgRegs, 4))
+        while (State->AllocateReg(GPRArgRegs))
           ;
         return;
       }
@@ -1903,7 +1898,7 @@ ARMTargetLowering::HandleByVal(
       // Note, first register is allocated in the beginning of function already,
       // allocate remained amount of registers we need.
       for (unsigned i = reg+1; i != ByValRegEnd; ++i)
-        State->AllocateReg(GPRArgRegs, 4);
+        State->AllocateReg(GPRArgRegs);
       // A byval parameter that is split between registers and memory needs its
       // size truncated here.
       // In the case where the entire structure fits in registers, we set the
@@ -2025,7 +2020,9 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   // cannot rely on the linker replacing the tail call with a return.
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     const GlobalValue *GV = G->getGlobal();
-    if (GV->hasExternalWeakLinkage())
+    const Triple TT(getTargetMachine().getTargetTriple());
+    if (GV->hasExternalWeakLinkage() &&
+        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
       return false;
   }
 
@@ -2084,8 +2081,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
       // the caller's fixed stack objects.
       MachineFrameInfo *MFI = MF.getFrameInfo();
       const MachineRegisterInfo *MRI = &MF.getRegInfo();
-      const TargetInstrInfo *TII =
-          getTargetMachine().getSubtargetImpl()->getInstrInfo();
+      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
       for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
            i != e;
            ++i, ++realArgIdx) {
@@ -2837,16 +2833,11 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
     NumGPRs = REnd - RBegin;
   } else {
     unsigned int firstUnalloced;
-    firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs,
-                                                sizeof(GPRArgRegs) /
-                                                sizeof(GPRArgRegs[0]));
+    firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs);
     NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0;
   }
 
-  unsigned Align = MF.getTarget()
-                       .getSubtargetImpl()
-                       ->getFrameLowering()
-                       ->getStackAlignment();
+  unsigned Align = Subtarget->getFrameLowering()->getStackAlignment();
   ArgRegsSize = NumGPRs * 4;
 
   // If parameter is split between stack and GPRs...
@@ -2913,8 +2904,7 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
     firstRegToSaveIndex = RBegin - ARM::R0;
     lastRegToSaveIndex = REnd - ARM::R0;
   } else {
-    firstRegToSaveIndex = CCInfo.getFirstUnallocated
-      (GPRArgRegs, array_lengthof(GPRArgRegs));
+    firstRegToSaveIndex = CCInfo.getFirstUnallocated(GPRArgRegs);
     lastRegToSaveIndex = 4;
   }
 
@@ -3087,8 +3077,11 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
 
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
-    std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[VA.getValNo()].OrigArgIndex;
+    if (Ins[VA.getValNo()].isOrigArg()) {
+      std::advance(CurOrigArg,
+                   Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
+    }
     // Arguments stored in registers.
     if (VA.isRegLoc()) {
       EVT RegVT = VA.getLocVT();
@@ -3129,9 +3122,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
         else if (RegVT == MVT::v2f64)
           RC = &ARM::QPRRegClass;
         else if (RegVT == MVT::i32)
-          RC = AFI->isThumb1OnlyFunction() ?
-            (const TargetRegisterClass*)&ARM::tGPRRegClass :
-            (const TargetRegisterClass*)&ARM::GPRRegClass;
+          RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
+                                           : &ARM::GPRRegClass;
         else
           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
 
@@ -3169,7 +3161,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
       assert(VA.isMemLoc());
       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
 
-      int index = ArgLocs[i].getValNo();
+      int index = VA.getValNo();
 
       // Some Ins[] entries become multiple ArgLoc[] entries.
       // Process them only once.
@@ -3182,6 +3174,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
           // Since they could be overwritten by lowering of arguments in case of
           // a tail call.
           if (Flags.isByVal()) {
+            assert(Ins[index].isOrigArg() &&
+                   "Byval arguments cannot be implicit");
             unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
 
             ByValStoreOffset = RoundUpToAlignment(ByValStoreOffset, Flags.getByValAlign());
@@ -3596,8 +3590,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
     // inverting the compare condition, swapping 'less' and 'greater') and
     // sometimes need to swap the operands to the VSEL (which inverts the
     // condition in the sense of firing whenever the previous condition didn't)
-    if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
-                                      TrueVal.getValueType() == MVT::f64)) {
+    if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                    TrueVal.getValueType() == MVT::f64)) {
       ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
       if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
           CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
@@ -3616,8 +3610,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   FPCCToARMCC(CC, CondCode, CondCode2);
 
   // Try to generate VSEL on ARMv8.
-  if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
-                                    TrueVal.getValueType() == MVT::f64)) {
+  if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                  TrueVal.getValueType() == MVT::f64)) {
     // We can select VMAXNM/VMINNM from a compare followed by a select with the
     // same operands, as follows:
     //   c = fcmp [ogt, olt, ugt, ult] a, b
@@ -4483,6 +4477,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDValue CC = Op.getOperand(2);
+  EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
   EVT VT = Op.getValueType();
   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   SDLoc dl(Op);
@@ -4512,8 +4507,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
       TmpOp0 = Op0;
       TmpOp1 = Op1;
       Opc = ISD::OR;
-      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
-      Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
       break;
     case ISD::SETUO: Invert = true; // Fallthrough
     case ISD::SETO:
@@ -4521,8 +4516,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
       TmpOp0 = Op0;
       TmpOp1 = Op1;
       Opc = ISD::OR;
-      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
-      Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
       break;
     }
   } else {
@@ -4556,8 +4551,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
 
       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
         Opc = ARMISD::VTST;
-        Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0));
-        Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1));
+        Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
+        Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
         Invert = !Invert;
       }
     }
@@ -4583,22 +4578,24 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
   if (SingleOp.getNode()) {
     switch (Opc) {
     case ARMISD::VCEQ:
-      Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break;
+      Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
     case ARMISD::VCGE:
-      Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break;
+      Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
     case ARMISD::VCLEZ:
-      Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break;
+      Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
     case ARMISD::VCGT:
-      Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break;
+      Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
     case ARMISD::VCLTZ:
-      Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break;
+      Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
     default:
-      Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+      Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
     }
   } else {
-     Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+     Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
   }
 
+  Result = DAG.getSExtOrTrunc(Result, dl, VT);
+
   if (Invert)
     Result = DAG.getNOT(dl, Result, VT);
 
@@ -6497,8 +6494,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
 void ARMTargetLowering::
 SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
                        MachineBasicBlock *DispatchBB, int FI) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo *MRI = &MF->getRegInfo();
@@ -6515,9 +6511,8 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
     ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
   unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
 
-  const TargetRegisterClass *TRC = isThumb ?
-    (const TargetRegisterClass*)&ARM::tGPRRegClass :
-    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
+                                           : &ARM::GPRRegClass;
 
   // Grab constant pool and fixed stack memory operands.
   MachineMemOperand *CPMMO =
@@ -6613,8 +6608,7 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
 
 MachineBasicBlock *ARMTargetLowering::
 EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo *MRI = &MF->getRegInfo();
@@ -6622,9 +6616,8 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
   MachineFrameInfo *MFI = MF->getFrameInfo();
   int FI = MFI->getFunctionContextIndex();
 
-  const TargetRegisterClass *TRC = Subtarget->isThumb() ?
-    (const TargetRegisterClass*)&ARM::tGPRRegClass :
-    (const TargetRegisterClass*)&ARM::GPRnopcRegClass;
+  const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
+                                                        : &ARM::GPRnopcRegClass;
 
   // Get a mapping of the call site numbers to all of the landing pads they're
   // associated with.
@@ -7129,8 +7122,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
   // This pseudo instruction has 3 operands: dst, src, size
   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
   // Otherwise, we will generate unrolled scalar copies.
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineFunction::iterator It = BB;
   ++It;
@@ -7156,9 +7148,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
     UnitSize = 2;
   } else {
     // Check whether we can use NEON instructions.
-    if (!MF->getFunction()->getAttributes().
-          hasAttribute(AttributeSet::FunctionIndex,
-                       Attribute::NoImplicitFloat) &&
+    if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
         Subtarget->hasNEON()) {
       if ((Align % 16 == 0) && SizeVal >= 16)
         UnitSize = 16;
@@ -7172,14 +7162,11 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
 
   // Select the correct opcode and register class for unit size load/store
   bool IsNeon = UnitSize >= 8;
-  TRC = (IsThumb1 || IsThumb2) ? (const TargetRegisterClass *)&ARM::tGPRRegClass
-                               : (const TargetRegisterClass *)&ARM::GPRRegClass;
+  TRC = (IsThumb1 || IsThumb2) ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
   if (IsNeon)
-    VecTRC = UnitSize == 16
-                 ? (const TargetRegisterClass *)&ARM::DPairRegClass
-                 : UnitSize == 8
-                       ? (const TargetRegisterClass *)&ARM::DPRRegClass
-                       : nullptr;
+    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
+                            : UnitSize == 8 ? &ARM::DPRRegClass
+                                            : nullptr;
 
   unsigned BytesLeft = SizeVal % UnitSize;
   unsigned LoopSize = SizeVal - BytesLeft;
@@ -7364,7 +7351,7 @@ MachineBasicBlock *
 ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
                                        MachineBasicBlock *MBB) const {
   const TargetMachine &TM = getTargetMachine();
-  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   assert(Subtarget->isTargetWindows() &&
@@ -7429,8 +7416,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
 MachineBasicBlock *
 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
   switch (MI->getOpcode()) {
@@ -7627,9 +7613,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     MachineRegisterInfo &MRI = Fn->getRegInfo();
     // In Thumb mode S must not be specified if source register is the SP or
     // PC and if destination register is the SP, so restrict register class
-    unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ?
-      (const TargetRegisterClass*)&ARM::rGPRRegClass :
-      (const TargetRegisterClass*)&ARM::GPRRegClass);
+    unsigned NewRsbDstReg =
+      MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
 
     // Transfer the remainder of BB and its successor edges to sinkMBB.
     SinkBB->splice(SinkBB->begin(), BB,
@@ -7694,8 +7679,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
   // Rename pseudo opcodes.
   unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
   if (NewOpc) {
-    const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>(
-        getTargetMachine().getSubtargetImpl()->getInstrInfo());
+    const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
     MCID = &TII->get(NewOpc);
 
     assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
@@ -8059,29 +8043,35 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
   else
     IsLeftOperandMUL = true;
   if (MULOp == SDValue())
-     return SDValue();
+    return SDValue();
 
   // Figure out the right opcode.
   unsigned Opc = MULOp->getOpcode();
   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
 
   // Figure out the high and low input values to the MLAL node.
-  SDValue* HiMul = &MULOp;
   SDValue* HiAdd = nullptr;
   SDValue* LoMul = nullptr;
   SDValue* LowAdd = nullptr;
 
+  // Ensure that ADDE is from high result of ISD::SMUL_LOHI.
+  if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
+    return SDValue();
+
   if (IsLeftOperandMUL)
     HiAdd = &AddeOp1;
   else
     HiAdd = &AddeOp0;
 
 
-  if (AddcOp0->getOpcode() == Opc) {
+  // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
+  // whose low result is fed to the ADDC we are checking.
+
+  if (AddcOp0 == MULOp.getValue(0)) {
     LoMul = &AddcOp0;
     LowAdd = &AddcOp1;
   }
-  if (AddcOp1->getOpcode() == Opc) {
+  if (AddcOp1 == MULOp.getValue(0)) {
     LoMul = &AddcOp1;
     LowAdd = &AddcOp0;
   }
@@ -8089,9 +8079,6 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
   if (!LoMul)
     return SDValue();
 
-  if (LoMul->getNode() != HiMul->getNode())
-    return SDValue();
-
   // Create the merged node.
   SelectionDAG &DAG = DCI.DAG;
 
@@ -8583,7 +8570,10 @@ static SDValue PerformBFICombine(SDNode *N,
     unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
     unsigned LSB = countTrailingZeros(~InvMask);
     unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
-    unsigned Mask = (1 << Width)-1;
+    assert(Width <
+               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+           "undefined behavior");
+    unsigned Mask = (1u << Width) - 1;
     unsigned Mask2 = N11C->getZExtValue();
     if ((Mask & (~Mask2)) == 0)
       return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
@@ -8655,147 +8645,6 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
-/// PerformSTORECombine - Target-specific dag combine xforms for
-/// ISD::STORE.
-static SDValue PerformSTORECombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI) {
-  StoreSDNode *St = cast<StoreSDNode>(N);
-  if (St->isVolatile())
-    return SDValue();
-
-  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
-  // pack all of the elements in one place.  Next, store to memory in fewer
-  // chunks.
-  SDValue StVal = St->getValue();
-  EVT VT = StVal.getValueType();
-  if (St->isTruncatingStore() && VT.isVector()) {
-    SelectionDAG &DAG = DCI.DAG;
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    EVT StVT = St->getMemoryVT();
-    unsigned NumElems = VT.getVectorNumElements();
-    assert(StVT != VT && "Cannot truncate to the same type");
-    unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
-    unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
-
-    // From, To sizes and ElemCount must be pow of two
-    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
-
-    // We are going to use the original vector elt for storing.
-    // Accumulated smaller vector elements must be a multiple of the store size.
-    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
-
-    unsigned SizeRatio  = FromEltSz / ToEltSz;
-    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
-
-    // Create a type on which we perform the shuffle.
-    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
-                                     NumElems*SizeRatio);
-    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
-
-    SDLoc DL(St);
-    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
-    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
-    for (unsigned i = 0; i < NumElems; ++i)
-      ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio;
-
-    // Can't shuffle using an illegal type.
-    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
-
-    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
-                                DAG.getUNDEF(WideVec.getValueType()),
-                                ShuffleVec.data());
-    // At this point all of the data is stored at the bottom of the
-    // register. We now need to save it to mem.
-
-    // Find the largest store unit
-    MVT StoreType = MVT::i8;
-    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
-         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
-      MVT Tp = (MVT::SimpleValueType)tp;
-      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
-        StoreType = Tp;
-    }
-    // Didn't find a legal store type.
-    if (!TLI.isTypeLegal(StoreType))
-      return SDValue();
-
-    // Bitcast the original vector into a vector of store-size units
-    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
-            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
-    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
-    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
-    SmallVector<SDValue, 8> Chains;
-    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
-                                        TLI.getPointerTy());
-    SDValue BasePtr = St->getBasePtr();
-
-    // Perform one or more big stores into memory.
-    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
-    for (unsigned I = 0; I < E; I++) {
-      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
-                                   StoreType, ShuffWide,
-                                   DAG.getIntPtrConstant(I));
-      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
-                                St->getPointerInfo(), St->isVolatile(),
-                                St->isNonTemporal(), St->getAlignment());
-      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
-                            Increment);
-      Chains.push_back(Ch);
-    }
-    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
-  }
-
-  if (!ISD::isNormalStore(St))
-    return SDValue();
-
-  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
-  // ARM stores of arguments in the same cache line.
-  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
-      StVal.getNode()->hasOneUse()) {
-    SelectionDAG  &DAG = DCI.DAG;
-    bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian();
-    SDLoc DL(St);
-    SDValue BasePtr = St->getBasePtr();
-    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
-                                  StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ),
-                                  BasePtr, St->getPointerInfo(), St->isVolatile(),
-                                  St->isNonTemporal(), St->getAlignment());
-
-    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
-                                    DAG.getConstant(4, MVT::i32));
-    return DAG.getStore(NewST1.getValue(0), DL,
-                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
-                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
-                        St->isNonTemporal(),
-                        std::min(4U, St->getAlignment() / 2));
-  }
-
-  if (StVal.getValueType() != MVT::i64 ||
-      StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
-    return SDValue();
-
-  // Bitcast an i64 store extracted from a vector to f64.
-  // Otherwise, the i64 value will be legalized to a pair of i32 values.
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc dl(StVal);
-  SDValue IntVec = StVal.getOperand(0);
-  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
-                                 IntVec.getValueType().getVectorNumElements());
-  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
-  SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
-                               Vec, StVal.getOperand(1));
-  dl = SDLoc(N);
-  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
-  // Make the DAGCombiner fold the bitcasts.
-  DCI.AddToWorklist(Vec.getNode());
-  DCI.AddToWorklist(ExtElt.getNode());
-  DCI.AddToWorklist(V.getNode());
-  return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
-                      St->getPointerInfo(), St->isVolatile(),
-                      St->isNonTemporal(), St->getAlignment(),
-                      St->getAAInfo());
-}
-
 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
 /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
 /// i64 vector to have f64 elements, since the value can then be loaded
@@ -9016,18 +8865,20 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
                               DAG.getUNDEF(VT), NewMask.data());
 }
 
-/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
-/// NEON load/store intrinsics to merge base address updates.
+/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
+/// NEON load/store intrinsics, and generic vector load/stores, to merge
+/// base address updates.
+/// For generic load/stores, the memory type is assumed to be a vector.
+/// The caller is assumed to have checked legality.
 static SDValue CombineBaseUpdate(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
-  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
-    return SDValue();
-
   SelectionDAG &DAG = DCI.DAG;
-  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
-                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
-  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+  const bool isStore = N->getOpcode() == ISD::STORE;
+  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
   SDValue Addr = N->getOperand(AddrOpIdx);
+  MemSDNode *MemN = cast<MemSDNode>(N);
 
   // Search for a use of the address operand that is an increment.
   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
@@ -9043,7 +8894,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
       continue;
 
     // Find the new opcode for the updating load/store.
-    bool isLoad = true;
+    bool isLoadOp = true;
     bool isLaneOp = false;
     unsigned NewOpc = 0;
     unsigned NumVecs = 0;
@@ -9066,19 +8917,19 @@ static SDValue CombineBaseUpdate(SDNode *N,
       case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
         NumVecs = 4; isLaneOp = true; break;
       case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
-        NumVecs = 1; isLoad = false; break;
+        NumVecs = 1; isLoadOp = false; break;
       case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
-        NumVecs = 2; isLoad = false; break;
+        NumVecs = 2; isLoadOp = false; break;
       case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
-        NumVecs = 3; isLoad = false; break;
+        NumVecs = 3; isLoadOp = false; break;
       case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
-        NumVecs = 4; isLoad = false; break;
+        NumVecs = 4; isLoadOp = false; break;
       case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
-        NumVecs = 2; isLoad = false; isLaneOp = true; break;
+        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
       case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
-        NumVecs = 3; isLoad = false; isLaneOp = true; break;
+        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
       case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
-        NumVecs = 4; isLoad = false; isLaneOp = true; break;
+        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
       }
     } else {
       isLaneOp = true;
@@ -9087,15 +8938,24 @@ static SDValue CombineBaseUpdate(SDNode *N,
       case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
       case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
       case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+      case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
+        NumVecs = 1; isLaneOp = false; break;
+      case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
+        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
       }
     }
 
     // Find the size of memory referenced by the load/store.
     EVT VecTy;
-    if (isLoad)
+    if (isLoadOp) {
       VecTy = N->getValueType(0);
-    else
+    } else if (isIntrinsic) {
       VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+    } else {
+      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
+      VecTy = N->getOperand(1).getValueType();
+    }
+
     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
     if (isLaneOp)
       NumBytes /= VecTy.getVectorNumElements();
@@ -9112,32 +8972,99 @@ static SDValue CombineBaseUpdate(SDNode *N,
       continue;
     }
 
+    // OK, we found an ADD we can fold into the base update.
+    // Now, create a _UPD node, taking care of not breaking alignment.
+
+    EVT AlignedVecTy = VecTy;
+    unsigned Alignment = MemN->getAlignment();
+
+    // If this is a less-than-standard-aligned load/store, change the type to
+    // match the standard alignment.
+    // The alignment is overlooked when selecting _UPD variants; and it's
+    // easier to introduce bitcasts here than fix that.
+    // There are 3 ways to get to this base-update combine:
+    // - intrinsics: they are assumed to be properly aligned (to the standard
+    //   alignment of the memory type), so we don't need to do anything.
+    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
+    //   intrinsics, so, likewise, there's nothing to do.
+    // - generic load/store instructions: the alignment is specified as an
+    //   explicit operand, rather than implicitly as the standard alignment
+    //   of the memory type (like the intrisics).  We need to change the
+    //   memory type to match the explicit alignment.  That way, we don't
+    //   generate non-standard-aligned ARMISD::VLDx nodes.
+    if (isa<LSBaseSDNode>(N)) {
+      if (Alignment == 0)
+        Alignment = 1;
+      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
+        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
+        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
+        assert(!isLaneOp && "Unexpected generic load/store lane.");
+        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
+        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
+      }
+      // Don't set an explicit alignment on regular load/stores that we want
+      // to transform to VLD/VST 1_UPD nodes.
+      // This matches the behavior of regular load/stores, which only get an
+      // explicit alignment if the MMO alignment is larger than the standard
+      // alignment of the memory type.
+      // Intrinsics, however, always get an explicit alignment, set to the
+      // alignment of the MMO.
+      Alignment = 1;
+    }
+
     // Create the new updating load/store node.
+    // First, create an SDVTList for the new updating node's results.
     EVT Tys[6];
-    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
     unsigned n;
     for (n = 0; n < NumResultVecs; ++n)
-      Tys[n] = VecTy;
+      Tys[n] = AlignedVecTy;
     Tys[n++] = MVT::i32;
     Tys[n] = MVT::Other;
     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
+
+    // Then, gather the new node's operands.
     SmallVector<SDValue, 8> Ops;
     Ops.push_back(N->getOperand(0)); // incoming chain
     Ops.push_back(N->getOperand(AddrOpIdx));
     Ops.push_back(Inc);
-    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
-      Ops.push_back(N->getOperand(i));
+
+    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
+      // Try to match the intrinsic's signature
+      Ops.push_back(StN->getValue());
+    } else {
+      // Loads (and of course intrinsics) match the intrinsics' signature,
+      // so just add all but the alignment operand.
+      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
+        Ops.push_back(N->getOperand(i));
+    }
+
+    // For all node types, the alignment operand is always the last one.
+    Ops.push_back(DAG.getConstant(Alignment, MVT::i32));
+
+    // If this is a non-standard-aligned STORE, the penultimate operand is the
+    // stored value.  Bitcast it to the aligned type.
+    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
+      SDValue &StVal = Ops[Ops.size()-2];
+      StVal = DAG.getNode(ISD::BITCAST, SDLoc(N), AlignedVecTy, StVal);
     }
-    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+
     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
-                                           Ops, MemInt->getMemoryVT(),
-                                           MemInt->getMemOperand());
+                                           Ops, AlignedVecTy,
+                                           MemN->getMemOperand());
 
     // Update the uses.
-    std::vector<SDValue> NewResults;
-    for (unsigned i = 0; i < NumResultVecs; ++i) {
+    SmallVector<SDValue, 5> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i)
       NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+    // If this is an non-standard-aligned LOAD, the first result is the loaded
+    // value.  Bitcast it to the expected result type.
+    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
+      SDValue &LdVal = NewResults[0];
+      LdVal = DAG.getNode(ISD::BITCAST, SDLoc(N), VecTy, LdVal);
     }
+
     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
     DCI.CombineTo(N, NewResults);
     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
@@ -9147,6 +9074,14 @@ static SDValue CombineBaseUpdate(SDNode *N,
   return SDValue();
 }
 
+static SDValue PerformVLDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  return CombineBaseUpdate(N, DCI);
+}
+
 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
 /// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
@@ -9260,6 +9195,164 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
 }
 
+static SDValue PerformLOADCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+
+  // If this is a legal vector load, try to combine it into a VLD1_UPD.
+  if (ISD::isNormalLoad(N) && VT.isVector() &&
+      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return CombineBaseUpdate(N, DCI);
+
+  return SDValue();
+}
+
+/// PerformSTORECombine - Target-specific dag combine xforms for
+/// ISD::STORE.
+static SDValue PerformSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  if (St->isVolatile())
+    return SDValue();
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
+  // pack all of the elements in one place.  Next, store to memory in fewer
+  // chunks.
+  SDValue StVal = St->getValue();
+  EVT VT = StVal.getValueType();
+  if (St->isTruncatingStore() && VT.isVector()) {
+    SelectionDAG &DAG = DCI.DAG;
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    EVT StVT = St->getMemoryVT();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
+    unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
+
+    // We are going to use the original vector elt for storing.
+    // Accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
+
+    unsigned SizeRatio  = FromEltSz / ToEltSz;
+    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle.
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
+                                     NumElems*SizeRatio);
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDLoc DL(St);
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; ++i)
+      ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
+                                DAG.getUNDEF(WideVec.getValueType()),
+                                ShuffleVec.data());
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (MVT Tp : MVT::integer_valuetypes()) {
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
+        StoreType = Tp;
+    }
+    // Didn't find a legal store type.
+    if (!TLI.isTypeLegal(StoreType))
+      return SDValue();
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue BasePtr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
+    for (unsigned I = 0; I < E; I++) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(I));
+      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+                            Increment);
+      Chains.push_back(Ch);
+    }
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  }
+
+  if (!ISD::isNormalStore(St))
+    return SDValue();
+
+  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
+  // ARM stores of arguments in the same cache line.
+  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
+      StVal.getNode()->hasOneUse()) {
+    SelectionDAG  &DAG = DCI.DAG;
+    bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian();
+    SDLoc DL(St);
+    SDValue BasePtr = St->getBasePtr();
+    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
+                                  StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ),
+                                  BasePtr, St->getPointerInfo(), St->isVolatile(),
+                                  St->isNonTemporal(), St->getAlignment());
+
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, MVT::i32));
+    return DAG.getStore(NewST1.getValue(0), DL,
+                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
+                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
+                        St->isNonTemporal(),
+                        std::min(4U, St->getAlignment() / 2));
+  }
+
+  if (StVal.getValueType() == MVT::i64 &&
+      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+
+    // Bitcast an i64 store extracted from a vector to f64.
+    // Otherwise, the i64 value will be legalized to a pair of i32 values.
+    SelectionDAG &DAG = DCI.DAG;
+    SDLoc dl(StVal);
+    SDValue IntVec = StVal.getOperand(0);
+    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                   IntVec.getValueType().getVectorNumElements());
+    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
+    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                                 Vec, StVal.getOperand(1));
+    dl = SDLoc(N);
+    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
+    // Make the DAGCombiner fold the bitcasts.
+    DCI.AddToWorklist(Vec.getNode());
+    DCI.AddToWorklist(ExtElt.getNode());
+    DCI.AddToWorklist(V.getNode());
+    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
+                        St->getPointerInfo(), St->isVolatile(),
+                        St->isNonTemporal(), St->getAlignment(),
+                        St->getAAInfo());
+  }
+
+  // If this is a legal vector store, try to combine it into a VST1_UPD.
+  if (ISD::isNormalStore(N) && VT.isVector() &&
+      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return CombineBaseUpdate(N, DCI);
+
+  return SDValue();
+}
+
 // isConstVecPow2 - Return true if each vector element is a power of 2, all
 // elements are the same constant, C, and Log2(C) ranges from 1 to 32.
 static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
@@ -9316,16 +9409,18 @@ static SDValue PerformVCVTCombine(SDNode *N,
 
   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
-  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32 ||
+      NumLanes > 4) {
     // These instructions only exist converting from f32 to i32. We can handle
     // smaller integers by generating an extra truncate, but larger ones would
-    // be lossy.
+    // be lossy. We also can't handle more then 4 lanes, since these intructions
+    // only support v2i32/v4i32 types.
     return SDValue();
   }
 
   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
     Intrinsic::arm_neon_vcvtfp2fxu;
-  unsigned NumLanes = Op.getValueType().getVectorNumElements();
   SDValue FixConv =  DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
                                  NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                                  DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
@@ -9848,10 +9943,11 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
   case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
+  case ISD::LOAD:       return PerformLOADCombine(N, DCI);
   case ARMISD::VLD2DUP:
   case ARMISD::VLD3DUP:
   case ARMISD::VLD4DUP:
-    return CombineBaseUpdate(N, DCI);
+    return PerformVLDCombine(N, DCI);
   case ARMISD::BUILD_VECTOR:
     return PerformARMBUILD_VECTORCombine(N, DCI);
   case ISD::INTRINSIC_VOID:
@@ -9871,7 +9967,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
     case Intrinsic::arm_neon_vst2lane:
     case Intrinsic::arm_neon_vst3lane:
     case Intrinsic::arm_neon_vst4lane:
-      return CombineBaseUpdate(N, DCI);
+      return PerformVLDCombine(N, DCI);
     default: break;
     }
     break;
@@ -9934,10 +10030,8 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
   const Function *F = MF.getFunction();
 
   // See if we can use NEON instructions for this...
-  if ((!IsMemset || ZeroMemset) &&
-      Subtarget->hasNEON() &&
-      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat)) {
+  if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
+      !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
     bool Fast;
     if (Size >= 16 &&
         (memOpAlign(SrcAlign, DstAlign, 16) ||
@@ -10535,7 +10629,8 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
 
 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
 RCPair
-ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ARMTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                const std::string &Constraint,
                                                 MVT VT) const {
   if (Constraint.size() == 1) {
     // GCC ARM Constraint Letters
@@ -10581,7 +10676,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   if (StringRef("{cc}").equals_lower(Constraint))
     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
 
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
@@ -10861,11 +10956,7 @@ bool ARM::isBitFieldInvertedMask(unsigned v) {
 
   // there can be 1's on either or both "outsides", all the "inside"
   // bits must be 0's
-  unsigned TO = CountTrailingOnes_32(v);
-  unsigned LO = CountLeadingOnes_32(v);
-  v = (v >> TO) << TO;
-  v = (v << LO) >> LO;
-  return v == 0;
+  return isShiftedMask_32(~v);
 }
 
 /// isFPImmLegal - Returns true if the target can instruction select the
@@ -11114,7 +11205,7 @@ bool ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
 
 // This has so far only been implemented for MachO.
 bool ARMTargetLowering::useLoadStackGuardNode() const {
-  return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO;
+  return Subtarget->isTargetMachO();
 }
 
 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
@@ -11274,7 +11365,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
   return (Members > 0 && Members <= 4);
 }
 
-/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate.
+/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
+/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
+/// passing according to AAPCS rules.
 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
   if (getEffectiveCallingConv(CallConv, isVarArg) !=
@@ -11283,7 +11376,9 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
 
   HABaseType Base = HA_UNKNOWN;
   uint64_t Members = 0;
-  bool result = isHomogeneousAggregate(Ty, Base, Members);
-  DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump());
-  return result;
+  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
+  DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
+
+  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
+  return IsHA || IsIntArray;
 }
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 89b0c31..ec1407d 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -232,7 +232,8 @@ namespace llvm {
 
   class ARMTargetLowering : public TargetLowering {
   public:
-    explicit ARMTargetLowering(const TargetMachine &TM);
+    explicit ARMTargetLowering(const TargetMachine &TM,
+                               const ARMSubtarget &STI);
 
     unsigned getJumpTableEncoding() const override;
 
@@ -332,9 +333,10 @@ namespace llvm {
     ConstraintWeight getSingleConstraintMatchWeight(
       AsmOperandInfo &info, const char *constraint) const override;
 
-    std::pair<unsigned, const TargetRegisterClass*>
-      getRegForInlineAsmConstraint(const std::string &Constraint,
-                                   MVT VT) const override;
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 const std::string &Constraint,
+                                 MVT VT) const override;
 
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
@@ -352,10 +354,6 @@ namespace llvm {
     /// specified value type.
     const TargetRegisterClass *getRegClassFor(MVT VT) const override;
 
-    /// getMaximalGlobalOffset - Returns the maximal possible offset which can
-    /// be used for loads / stores from the global.
-    unsigned getMaximalGlobalOffset() const override;
-
     /// Returns true if a cast between SrcAS and DestAS is a noop.
     bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
       // Addrspacecasts are always noops.
@@ -414,8 +412,9 @@ namespace llvm {
                                    unsigned &Cost) const override;
 
   protected:
-    std::pair<const TargetRegisterClass*, uint8_t>
-    findRepresentativeClass(MVT VT) const override;
+    std::pair<const TargetRegisterClass *, uint8_t>
+    findRepresentativeClass(const TargetRegisterInfo *TRI,
+                            MVT VT) const override;
 
   private:
     /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 17d1ffa..bc617f0 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -93,7 +93,7 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
 void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
                                         Reloc::Model RM) const {
   MachineFunction &MF = *MI->getParent()->getParent();
-  const ARMSubtarget &Subtarget = MF.getTarget().getSubtarget<ARMSubtarget>();
+  const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
 
   if (!Subtarget.useMovt(MF)) {
     if (RM == Reloc::PIC_)
@@ -144,21 +144,20 @@ namespace {
       ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
       if (AFI->getGlobalBaseReg() == 0)
         return false;
-
-      const ARMTargetMachine *TM =
-        static_cast<const ARMTargetMachine *>(&MF.getTarget());
-      if (TM->getRelocationModel() != Reloc::PIC_)
+      const ARMSubtarget &STI =
+          static_cast<const ARMSubtarget &>(MF.getSubtarget());
+      const TargetMachine &TM = MF.getTarget();
+      if (TM.getRelocationModel() != Reloc::PIC_)
         return false;
 
       LLVMContext *Context = &MF.getFunction()->getContext();
       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
-      unsigned PCAdj = TM->getSubtarget<ARMSubtarget>().isThumb() ? 4 : 8;
+      unsigned PCAdj = STI.isThumb() ? 4 : 8;
       ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
           *Context, "_GLOBAL_OFFSET_TABLE_", ARMPCLabelIndex, PCAdj);
 
-      unsigned Align =
-          TM->getSubtargetImpl()->getDataLayout()->getPrefTypeAlignment(
-              Type::getInt32PtrTy(*Context));
+      unsigned Align = TM.getDataLayout()->getPrefTypeAlignment(
+          Type::getInt32PtrTy(*Context));
       unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align);
 
       MachineBasicBlock &FirstMBB = MF.front();
@@ -166,9 +165,8 @@ namespace {
       DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
       unsigned TempReg =
           MF.getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
-      unsigned Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ?
-                     ARM::t2LDRpci : ARM::LDRcp;
-      const TargetInstrInfo &TII = *TM->getSubtargetImpl()->getInstrInfo();
+      unsigned Opc = STI.isThumb2() ? ARM::t2LDRpci : ARM::LDRcp;
+      const TargetInstrInfo &TII = *STI.getInstrInfo();
       MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL,
                                         TII.get(Opc), TempReg)
                                 .addConstantPoolIndex(Idx);
@@ -178,15 +176,13 @@ namespace {
 
       // Fix the GOT address by adding pc.
       unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
-      Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ? ARM::tPICADD
-                                                        : ARM::PICADD;
+      Opc = STI.isThumb2() ? ARM::tPICADD : ARM::PICADD;
       MIB = BuildMI(FirstMBB, MBBI, DL, TII.get(Opc), GlobalBaseReg)
                 .addReg(TempReg)
                 .addImm(ARMPCLabelIndex);
       if (Opc == ARM::PICADD)
         AddDefaultPred(MIB);
 
-
       return true;
     }
 
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 3177114..126c552 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -263,8 +263,6 @@ def IsNotMClass      : Predicate<"!Subtarget->isMClass()">,
                                                     "!armv*m">;
 def IsARM            : Predicate<"!Subtarget->isThumb()">,
                                  AssemblerPredicate<"!ModeThumb", "arm-mode">;
-def IsIOS            : Predicate<"Subtarget->isTargetIOS()">;
-def IsNotIOS         : Predicate<"!Subtarget->isTargetIOS()">;
 def IsMachO          : Predicate<"Subtarget->isTargetMachO()">;
 def IsNotMachO       : Predicate<"!Subtarget->isTargetMachO()">;
 def IsNaCl           : Predicate<"Subtarget->isTargetNaCl()">;
@@ -333,24 +331,6 @@ def imm16_31 : ImmLeaf<i32, [{
   return (int32_t)Imm >= 16 && (int32_t)Imm < 32;
 }]>;
 
-def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; }
-def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
-    unsigned Value = -(unsigned)N->getZExtValue();
-    return Value && ARM_AM::getSOImmVal(Value) != -1;
-  }], imm_neg_XFORM> {
-  let ParserMatchClass = so_imm_neg_asmoperand;
-}
-
-// Note: this pattern doesn't require an encoder method and such, as it's
-// only used on aliases (Pat<> and InstAlias<>). The actual encoding
-// is handled by the destination instructions, which use so_imm.
-def so_imm_not_asmoperand : AsmOperandClass { let Name = "ARMSOImmNot"; }
-def so_imm_not : Operand<i32>, PatLeaf<(imm), [{
-    return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
-  }], imm_not_XFORM> {
-  let ParserMatchClass = so_imm_not_asmoperand;
-}
-
 // sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits.
 def sext_16_node : PatLeaf<(i32 GPR:$a), [{
   return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17;
@@ -530,7 +510,7 @@ def shift_imm : Operand<i32> {
   let ParserMatchClass = ShifterImmAsmOperand;
 }
 
-// shifter_operand operands: so_reg_reg, so_reg_imm, and so_imm.
+// shifter_operand operands: so_reg_reg, so_reg_imm, and mod_imm.
 def ShiftedRegAsmOperand : AsmOperandClass { let Name = "RegShiftedReg"; }
 def so_reg_reg : Operand<i32>,  // reg reg imm
                  ComplexPattern<i32, 3, "SelectRegShifterOperand",
@@ -575,27 +555,43 @@ def shift_so_reg_imm : Operand<i32>,    // reg reg imm
   let MIOperandInfo = (ops GPR, i32imm);
 }
 
-
-// so_imm - Match a 32-bit shifter_operand immediate operand, which is an
-// 8-bit immediate rotated by an arbitrary number of bits.
-def SOImmAsmOperand: ImmAsmOperand { let Name = "ARMSOImm"; }
-def so_imm : Operand<i32>, ImmLeaf<i32, [{
+// mod_imm: match a 32-bit immediate operand, which can be encoded into
+// a 12-bit immediate; an 8-bit integer and a 4-bit rotator (See ARMARM
+// - "Modified Immediate Constants"). Within the MC layer we keep this
+// immediate in its encoded form.
+def ModImmAsmOperand: AsmOperandClass {
+  let Name = "ModImm";
+  let ParserMethod = "parseModImm";
+}
+def mod_imm : Operand<i32>, ImmLeaf<i32, [{
     return ARM_AM::getSOImmVal(Imm) != -1;
   }]> {
-  let EncoderMethod = "getSOImmOpValue";
-  let ParserMatchClass = SOImmAsmOperand;
-  let DecoderMethod = "DecodeSOImmOperand";
+  let EncoderMethod = "getModImmOpValue";
+  let PrintMethod = "printModImmOperand";
+  let ParserMatchClass = ModImmAsmOperand;
 }
 
-// Break so_imm's up into two pieces.  This handles immediates with up to 16
-// bits set in them.  This uses so_imm2part to match and so_imm2part_[12] to
-// get the first/second pieces.
-def so_imm2part : PatLeaf<(imm), [{
-      return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
-}]>;
+// Note: the patterns mod_imm_not and mod_imm_neg do not require an encoder
+// method and such, as they are only used on aliases (Pat<> and InstAlias<>).
+// The actual parsing, encoding, decoding are handled by the destination
+// instructions, which use mod_imm.
 
-/// arm_i32imm - True for +V6T2, or true only if so_imm2part is true.
-///
+def ModImmNotAsmOperand : AsmOperandClass { let Name = "ModImmNot"; }
+def mod_imm_not : Operand<i32>, PatLeaf<(imm), [{
+    return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
+  }], imm_not_XFORM> {
+  let ParserMatchClass = ModImmNotAsmOperand;
+}
+
+def ModImmNegAsmOperand : AsmOperandClass { let Name = "ModImmNeg"; }
+def mod_imm_neg : Operand<i32>, PatLeaf<(imm), [{
+    unsigned Value = -(unsigned)N->getZExtValue();
+    return Value && ARM_AM::getSOImmVal(Value) != -1;
+  }], imm_neg_XFORM> {
+  let ParserMatchClass = ModImmNegAsmOperand;
+}
+
+/// arm_i32imm - True for +V6T2, or when isSOImmTwoParVal()
 def arm_i32imm : PatLeaf<(imm), [{
   if (Subtarget->useMovt(*MF))
     return true;
@@ -1204,7 +1200,7 @@ include "ARMInstrFormats.td"
 // Multiclass helpers...
 //
 
-/// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a
+/// AsI1_bin_irs - Defines a set of (op r, {mod_imm|r|so_reg}) patterns for a
 /// binop that produces a value.
 let TwoOperandAliasConstraint = "$Rn = $Rd" in
 multiclass AsI1_bin_irs<bits<4> opcod, string opc,
@@ -1213,9 +1209,9 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc,
   // The register-immediate version is re-materializable. This is useful
   // in particular for taking the address of a local.
   let isReMaterializable = 1 in {
-  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), DPFrm,
                iii, opc, "\t$Rd, $Rn, $imm",
-               [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>,
+               [(set GPR:$Rd, (opnode GPR:$Rn, mod_imm:$imm))]>,
            Sched<[WriteALU, ReadALU]> {
     bits<4> Rd;
     bits<4> Rn;
@@ -1286,9 +1282,9 @@ multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
   // The register-immediate version is re-materializable. This is useful
   // in particular for taking the address of a local.
   let isReMaterializable = 1 in {
-  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), DPFrm,
                iii, opc, "\t$Rd, $Rn, $imm",
-               [(set GPR:$Rd, (opnode so_imm:$imm, GPR:$Rn))]>,
+               [(set GPR:$Rd, (opnode mod_imm:$imm, GPR:$Rn))]>,
            Sched<[WriteALU, ReadALU]> {
     bits<4> Rd;
     bits<4> Rn;
@@ -1356,9 +1352,9 @@ let hasPostISelHook = 1, Defs = [CPSR] in {
 multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
                           InstrItinClass iis, PatFrag opnode,
                           bit Commutable = 0> {
-  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm, pred:$p),
+  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
                          4, iii,
-                         [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm))]>,
+                         [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, mod_imm:$imm))]>,
                          Sched<[WriteALU, ReadALU]>;
 
   def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, pred:$p),
@@ -1389,9 +1385,9 @@ let hasPostISelHook = 1, Defs = [CPSR] in {
 multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
                           InstrItinClass iis, PatFrag opnode,
                           bit Commutable = 0> {
-  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm, pred:$p),
+  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
                          4, iii,
-                         [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn))]>,
+                         [(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn))]>,
            Sched<[WriteALU, ReadALU]>;
 
   def rsi : ARMPseudoInst<(outs GPR:$Rd),
@@ -1410,16 +1406,16 @@ multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
 }
 }
 
-/// AI1_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// AI1_cmp_irs - Defines a set of (op r, {mod_imm|r|so_reg}) cmp / test
 /// patterns. Similar to AsI1_bin_irs except the instruction does not produce
 /// a explicit result, only implicitly set CPSR.
 let isCompare = 1, Defs = [CPSR] in {
 multiclass AI1_cmp_irs<bits<4> opcod, string opc,
                      InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
                        PatFrag opnode, bit Commutable = 0> {
-  def ri : AI1<opcod, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, iii,
+  def ri : AI1<opcod, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, iii,
                opc, "\t$Rn, $imm",
-               [(opnode GPR:$Rn, so_imm:$imm)]>,
+               [(opnode GPR:$Rn, mod_imm:$imm)]>,
            Sched<[WriteCMP, ReadALU]> {
     bits<4> Rn;
     bits<12> imm;
@@ -1547,9 +1543,9 @@ let TwoOperandAliasConstraint = "$Rn = $Rd" in
 multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
                              bit Commutable = 0> {
   let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
-  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
                 DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
-               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm, CPSR))]>,
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, mod_imm:$imm, CPSR))]>,
                Requires<[IsARM]>,
            Sched<[WriteALU, ReadALU]> {
     bits<4> Rd;
@@ -1617,9 +1613,9 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
 let TwoOperandAliasConstraint = "$Rn = $Rd" in
 multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode> {
   let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
-  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
                 DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
-               [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn, CPSR))]>,
+               [(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn, CPSR))]>,
                Requires<[IsARM]>,
            Sched<[WriteALU, ReadALU]> {
     bits<4> Rd;
@@ -1813,7 +1809,7 @@ multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii,
 /// the function.  The first operand is the ID# for this instruction, the second
 /// is the index into the MachineConstantPool that this is, the third is the
 /// size in bytes of this constant pool entry.
-let neverHasSideEffects = 1, isNotDuplicable = 1 in
+let hasSideEffects = 0, isNotDuplicable = 1 in
 def CONSTPOOL_ENTRY :
 PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
                     i32imm:$size), NoItinerary, []>;
@@ -2057,7 +2053,7 @@ def PICSTRB : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
 
 // LEApcrel - Load a pc-relative address into a register without offending the
 // assembler.
-let neverHasSideEffects = 1, isReMaterializable = 1 in
+let hasSideEffects = 0, isReMaterializable = 1 in
 // The 'adr' mnemonic encodes differently if the label is before or after
 // the instruction. The {24-21} opcode bits are set by the fixup, as we don't
 // know until then which form of the instruction will be used.
@@ -2387,6 +2383,33 @@ def RFEIB_UPD : RFEI<1, "rfeib\t$Rn!"> {
   let Inst{24-23} = 0b11;
 }
 
+// Hypervisor Call is a system instruction
+let isCall = 1 in {
+def HVC : AInoP< (outs), (ins imm0_65535:$imm), BrFrm, NoItinerary,
+                "hvc", "\t$imm", []>,
+          Requires<[IsARM, HasVirtualization]> {
+  bits<16> imm;
+
+  // Even though HVC isn't predicable, it's encoding includes a condition field.
+  // The instruction is undefined if the condition field is 0xf otherwise it is
+  // unpredictable if it isn't condition AL (0xe).
+  let Inst{31-28} = 0b1110;
+  let Unpredictable{31-28} = 0b1111;
+  let Inst{27-24} = 0b0001;
+  let Inst{23-20} = 0b0100;
+  let Inst{19-8} = imm{15-4};
+  let Inst{7-4} = 0b0111;
+  let Inst{3-0} = imm{3-0};
+}
+}
+
+// Return from exception in Hypervisor mode.
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
+def ERET : ABI<0b0001, (outs), (ins), NoItinerary, "eret", "", []>,
+    Requires<[IsARM, HasVirtualization]> {
+    let Inst{23-0} = 0b011000000000000001101110;
+}
+
 //===----------------------------------------------------------------------===//
 //  Load / Store Instructions.
 //
@@ -2404,7 +2427,7 @@ defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
                    BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
 
 // Special LDR for loads from non-pc-relative constpools.
-let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1,
+let canFoldAsLoad = 1, mayLoad = 1, hasSideEffects = 0,
     isReMaterializable = 1, isCodeGenOnly = 1 in
 def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
                  AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr",
@@ -2431,7 +2454,7 @@ def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
                    IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr",
                    [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>;
 
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
   // Load doubleword
   def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode3:$addr),
                    LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $Rt2, $addr", []>,
@@ -2508,7 +2531,7 @@ multiclass AI2_ldridx<bit isByte, string opc,
 
 }
 
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0 in {
 // FIXME: for LDR_PRE_REG etc. the itineray should be either IIC_iLoad_ru or
 // IIC_iLoad_siu depending on whether it the offset register is shifted.
 defm LDR  : AI2_ldridx<0, "ldr", IIC_iLoad_iu, IIC_iLoad_ru>;
@@ -2544,7 +2567,7 @@ multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> {
   }
 }
 
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0 in {
 defm LDRH  : AI3_ldridx<0b1011, "ldrh", IIC_iLoad_bh_ru>;
 defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>;
 defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>;
@@ -2577,10 +2600,10 @@ def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
   let DecoderMethod = "DecodeAddrMode3Instruction";
 }
 } // hasExtraDefRegAllocReq = 1
-} // mayLoad = 1, neverHasSideEffects = 1
+} // mayLoad = 1, hasSideEffects = 0
 
 // LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT.
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0 in {
 def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                     (ins addr_offset_none:$addr, am2offset_reg:$offset),
                     IndexModePost, LdFrm, IIC_iLoad_ru,
@@ -2699,7 +2722,7 @@ def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm,
                [(truncstorei16 GPR:$Rt, addrmode3:$addr)]>;
 
 // Store doubleword
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
   def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
                     StMiscFrm, IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", []>,
              Requires<[IsARM, HasV5TE]> {
@@ -2772,7 +2795,7 @@ multiclass AI2_stridx<bit isByte, string opc,
   }
 }
 
-let mayStore = 1, neverHasSideEffects = 1 in {
+let mayStore = 1, hasSideEffects = 0 in {
 // FIXME: for STR_PRE_REG etc. the itineray should be either IIC_iStore_ru or
 // IIC_iStore_siu depending on whether it the offset register is shifted.
 defm STR  : AI2_stridx<0, "str", IIC_iStore_iu, IIC_iStore_ru>;
@@ -2864,7 +2887,7 @@ def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb),
   let DecoderMethod = "DecodeAddrMode3Instruction";
 }
 
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
 def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
                           (ins GPR:$Rt, GPR:$Rt2, addrmode3_pre:$addr),
                           IndexModePre, StMiscFrm, IIC_iStore_d_ru,
@@ -2894,7 +2917,7 @@ def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb),
   let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
   let DecoderMethod = "DecodeAddrMode3Instruction";
 }
-} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
+} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
 
 // STRT, STRBT, and STRHT
 
@@ -2938,7 +2961,7 @@ def STRBT_POST
   : ARMAsmPseudo<"strbt${q} $Rt, $addr",
                  (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;
 
-let mayStore = 1, neverHasSideEffects = 1 in {
+let mayStore = 1, hasSideEffects = 0 in {
 def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
                    (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
                    IndexModePost, StFrm, IIC_iStore_ru,
@@ -3103,17 +3126,18 @@ multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f,
   }
 }
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
 defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm, IIC_iLoad_m,
-                         IIC_iLoad_mu>;
+                         IIC_iLoad_mu>, ComplexDeprecationPredicate<"ARMLoad">;
 
 let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
 defm STM : arm_ldst_mult<"stm", "", 0, 0, LdStMulFrm, IIC_iStore_m,
-                         IIC_iStore_mu>;
+                         IIC_iStore_mu>,
+           ComplexDeprecationPredicate<"ARMStore">;
 
-} // neverHasSideEffects
+} // hasSideEffects
 
 // FIXME: remove when we have a way to marking a MI with these properties.
 // FIXME: Should pc be an implicit operand like PICADD, etc?
@@ -3139,7 +3163,7 @@ defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m,
 //  Move Instructions.
 //
 
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
                 "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
   bits<4> Rd;
@@ -3153,7 +3177,7 @@ def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
 }
 
 // A version for the smaller set of tail call registers.
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
                 IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
   bits<4> Rd;
@@ -3197,8 +3221,8 @@ def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
 }
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
-def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMOVi,
-                "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP,
+def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm, IIC_iMOVi,
+                "mov", "\t$Rd, $imm", [(set GPR:$Rd, mod_imm:$imm)]>, UnaryDP,
                 Sched<[WriteALU]> {
   bits<4> Rd;
   bits<12> imm;
@@ -3408,10 +3432,10 @@ defm RSC : AI1_rsc_irs<0b0111, "rsc",
 // assume opposite meanings of the carry flag (i.e., carry == !borrow).
 // See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
 // details.
-def : ARMPat<(add     GPR:$src, so_imm_neg:$imm),
-             (SUBri   GPR:$src, so_imm_neg:$imm)>;
-def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm),
-             (SUBSri  GPR:$src, so_imm_neg:$imm)>;
+def : ARMPat<(add     GPR:$src, mod_imm_neg:$imm),
+             (SUBri   GPR:$src, mod_imm_neg:$imm)>;
+def : ARMPat<(ARMaddc GPR:$src, mod_imm_neg:$imm),
+             (SUBSri  GPR:$src, mod_imm_neg:$imm)>;
 
 def : ARMPat<(add     GPR:$src, imm0_65535_neg:$imm),
              (SUBrr   GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
@@ -3423,8 +3447,8 @@ def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm),
 // The with-carry-in form matches bitwise not instead of the negation.
 // Effectively, the inverse interpretation of the carry flag already accounts
 // for part of the negation.
-def : ARMPat<(ARMadde GPR:$src, so_imm_not:$imm, CPSR),
-             (SBCri   GPR:$src, so_imm_not:$imm)>;
+def : ARMPat<(ARMadde GPR:$src, mod_imm_not:$imm, CPSR),
+             (SBCri   GPR:$src, mod_imm_not:$imm)>;
 def : ARMPat<(ARMadde GPR:$src, imm0_65535_neg:$imm, CPSR),
              (SBCrr   GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>,
              Requires<[IsARM, HasV6T2]>;
@@ -3705,9 +3729,9 @@ def  MVNsr  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
   let Inst{3-0} = shift{3-0};
 }
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
-def  MVNi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm,
+def  MVNi  : AsI1<0b1111, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm,
                   IIC_iMVNi, "mvn", "\t$Rd, $imm",
-                  [(set GPR:$Rd, so_imm_not:$imm)]>,UnaryDP, Sched<[WriteALU]> {
+                  [(set GPR:$Rd, mod_imm_not:$imm)]>,UnaryDP, Sched<[WriteALU]> {
   bits<4> Rd;
   bits<12> imm;
   let Inst{25} = 1;
@@ -3716,8 +3740,8 @@ def  MVNi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm,
   let Inst{11-0} = imm;
 }
 
-def : ARMPat<(and   GPR:$src, so_imm_not:$imm),
-             (BICri GPR:$src, so_imm_not:$imm)>;
+def : ARMPat<(and   GPR:$src, mod_imm_not:$imm),
+             (BICri GPR:$src, mod_imm_not:$imm)>;
 
 //===----------------------------------------------------------------------===//
 //  Multiply Instructions.
@@ -3811,7 +3835,7 @@ def MLS  : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
 }
 
 // Extra precision multiplies with low / high results
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 let isCommutable = 1 in {
 def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi),
                                  (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
@@ -3878,7 +3902,7 @@ def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
                            Requires<[IsARM, NoV6]>;
 }
 
-} // neverHasSideEffects
+} // hasSideEffects
 
 // Most significant word multiply
 def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
@@ -4242,8 +4266,8 @@ defm CMP  : AI1_cmp_irs<0b1010, "cmp",
                         BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
 
 // ARMcmpZ can re-use the above instruction definitions.
-def : ARMPat<(ARMcmpZ GPR:$src, so_imm:$imm),
-             (CMPri   GPR:$src, so_imm:$imm)>;
+def : ARMPat<(ARMcmpZ GPR:$src, mod_imm:$imm),
+             (CMPri   GPR:$src, mod_imm:$imm)>;
 def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs),
              (CMPrr   GPR:$src, GPR:$rhs)>;
 def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs),
@@ -4253,9 +4277,9 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs),
 
 // CMN register-integer
 let isCompare = 1, Defs = [CPSR] in {
-def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi,
+def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, IIC_iCMPi,
                 "cmn", "\t$Rn, $imm",
-                [(ARMcmn GPR:$Rn, so_imm:$imm)]>,
+                [(ARMcmn GPR:$Rn, mod_imm:$imm)]>,
                 Sched<[WriteCMP, ReadALU]> {
   bits<4> Rn;
   bits<12> imm;
@@ -4328,11 +4352,11 @@ def CMNzrsr : AI1<0b1011, (outs),
 
 }
 
-def : ARMPat<(ARMcmp  GPR:$src, so_imm_neg:$imm),
-             (CMNri   GPR:$src, so_imm_neg:$imm)>;
+def : ARMPat<(ARMcmp  GPR:$src, mod_imm_neg:$imm),
+             (CMNri   GPR:$src, mod_imm_neg:$imm)>;
 
-def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm),
-             (CMNri   GPR:$src, so_imm_neg:$imm)>;
+def : ARMPat<(ARMcmpZ GPR:$src, mod_imm_neg:$imm),
+             (CMNri   GPR:$src, mod_imm_neg:$imm)>;
 
 // Note that TST/TEQ don't set all the same flags that CMP does!
 defm TST  : AI1_cmp_irs<0b1000, "tst",
@@ -4359,7 +4383,7 @@ def BCCZi64 : PseudoInst<(outs),
 
 
 // Conditional moves
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 
 let isCommutable = 1, isSelect = 1 in
 def MOVCCr : ARMPseudoInst<(outs GPR:$Rd),
@@ -4396,9 +4420,9 @@ def MOVCCi16
 
 let isMoveImm = 1 in
 def MOVCCi : ARMPseudoInst<(outs GPR:$Rd),
-                           (ins GPR:$false, so_imm:$imm, cmovpred:$p),
+                           (ins GPR:$false, mod_imm:$imm, cmovpred:$p),
                            4, IIC_iCMOVi,
-                           [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm,
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, mod_imm:$imm,
                                                    cmovpred:$p))]>,
       RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
@@ -4414,13 +4438,13 @@ def MOVCCi32imm
 
 let isMoveImm = 1 in
 def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
-                           (ins GPR:$false, so_imm:$imm, cmovpred:$p),
+                           (ins GPR:$false, mod_imm:$imm, cmovpred:$p),
                            4, IIC_iCMOVi,
-                           [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm,
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, mod_imm_not:$imm,
                                                    cmovpred:$p))]>,
                 RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
-} // neverHasSideEffects
+} // hasSideEffects
 
 
 //===----------------------------------------------------------------------===//
@@ -5074,7 +5098,7 @@ def MRSbanked : ABI<0b0001, (outs GPRnopc:$Rd), (ins banked_reg:$banked),
 
   let Inst{23} = 0;
   let Inst{22} = banked{5}; // R bit
-  let Inst{21-20} = 0b10;
+  let Inst{21-20} = 0b00;
   let Inst{19-16} = banked{3-0};
   let Inst{15-12} = Rd;
   let Inst{11-9} = 0b001;
@@ -5103,17 +5127,17 @@ def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
   let Inst{3-0} = Rn;
 }
 
-def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask,  so_imm:$a), NoItinerary,
-               "msr", "\t$mask, $a", []> {
+def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask,  mod_imm:$imm), NoItinerary,
+               "msr", "\t$mask, $imm", []> {
   bits<5> mask;
-  bits<12> a;
+  bits<12> imm;
 
   let Inst{23} = 0;
   let Inst{22} = mask{4}; // R bit
   let Inst{21-20} = 0b10;
   let Inst{19-16} = mask{3-0};
   let Inst{15-12} = 0b1111;
-  let Inst{11-0} = a;
+  let Inst{11-0} = imm;
 }
 
 // However, the MSR (banked register) system instruction (ARMv7VE) *does* have a
@@ -5204,7 +5228,7 @@ let isBarrier = 1, hasSideEffects = 1, isTerminator = 1,
 def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
                              NoItinerary,
                          [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
-                                Requires<[IsARM, IsIOS]>;
+                                Requires<[IsARM]>;
 }
 
 // eh.sjlj.dispatchsetup pseudo-instruction.
@@ -5228,7 +5252,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
 
 // Large immediate handling.
 
-// 32-bit immediate using two piece so_imms or movw + movt.
+// 32-bit immediate using two piece mod_imms or movw + movt.
 // This is a single pseudo instruction, the benefit is that it can be remat'd
 // as a single unit instead of having to handle reg inputs.
 // FIXME: Remove this when we can do generalized remat.
@@ -5257,6 +5281,7 @@ def LDRLIT_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
                                        (ARMWrapperPIC tglobaladdr:$addr))]>,
                       Requires<[IsARM, DontUseMovt]>;
 
+let AddedComplexity = 10 in
 def LDRLIT_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
                               NoItinerary,
                               [(set GPR:$dst,
@@ -5519,36 +5544,36 @@ def : MnemonicAlias<"uqsubaddx", "uqsax">;
 // USAX == USUBADDX
 def : MnemonicAlias<"usubaddx", "usax">;
 
-// "mov Rd, so_imm_not" can be handled via "mvn" in assembly, just like
+// "mov Rd, mod_imm_not" can be handled via "mvn" in assembly, just like
 // for isel.
 def : ARMInstAlias<"mov${s}${p} $Rd, $imm",
-                   (MVNi rGPR:$Rd, so_imm_not:$imm, pred:$p, cc_out:$s)>;
+                   (MVNi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
 def : ARMInstAlias<"mvn${s}${p} $Rd, $imm",
-                   (MOVi rGPR:$Rd, so_imm_not:$imm, pred:$p, cc_out:$s)>;
+                   (MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
 // Same for AND <--> BIC
 def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm",
-                   (ANDri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                   (ANDri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm,
                           pred:$p, cc_out:$s)>;
 def : ARMInstAlias<"bic${s}${p} $Rdn, $imm",
-                   (ANDri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                   (ANDri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm,
                           pred:$p, cc_out:$s)>;
 def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm",
-                   (BICri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                   (BICri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm,
                           pred:$p, cc_out:$s)>;
 def : ARMInstAlias<"and${s}${p} $Rdn, $imm",
-                   (BICri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                   (BICri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm,
                           pred:$p, cc_out:$s)>;
 
-// Likewise, "add Rd, so_imm_neg" -> sub
+// Likewise, "add Rd, mod_imm_neg" -> sub
 def : ARMInstAlias<"add${s}${p} $Rd, $Rn, $imm",
-                 (SUBri GPR:$Rd, GPR:$Rn, so_imm_neg:$imm, pred:$p, cc_out:$s)>;
+                 (SUBri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
 def : ARMInstAlias<"add${s}${p} $Rd, $imm",
-                 (SUBri GPR:$Rd, GPR:$Rd, so_imm_neg:$imm, pred:$p, cc_out:$s)>;
-// Same for CMP <--> CMN via so_imm_neg
+                 (SUBri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+// Same for CMP <--> CMN via mod_imm_neg
 def : ARMInstAlias<"cmp${p} $Rd, $imm",
-                   (CMNri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>;
+                   (CMNri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
 def : ARMInstAlias<"cmn${p} $Rd, $imm",
-                   (CMPri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>;
+                   (CMPri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
 
 // The shifter forms of the MOV instruction are aliased to the ASR, LSL,
 // LSR, ROR, and RRX instructions.
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index a0c627c..2a7b4b5 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -665,7 +665,7 @@ class VLDQQQQWBPseudo<InstrItinClass itin>
                 (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
                 "$addr.addr = $wb, $src = $dst">;
 
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
 
 //   VLD1     : Vector Load (multiple single elements)
 class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode>
@@ -1023,7 +1023,7 @@ def VLD4q8oddPseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD4u>;
 def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
 def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
 
-} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
 
 // Classes for VLD*LN pseudo-instructions with multi-register operands.
 // These are expanded to real instructions after register allocation.
@@ -1106,7 +1106,7 @@ def : Pat<(vector_insert (v4f32 QPR:$src),
                          (f32 (load addrmode6:$addr)), imm:$lane),
           (VLD1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
 
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
 
 // ...with address register writeback:
 class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1359,7 +1359,7 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> {
 def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
 def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
 
-} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
 
 //   VLD1DUP  : Vector Load (single element to all lanes)
 class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
@@ -1405,7 +1405,7 @@ def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load,
 def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
           (VLD1DUPq32 addrmode6:$addr)>;
 
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
 // ...with address register writeback:
 multiclass VLD1DUPWB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<1, 0b10, 0b1100, op7_4,
@@ -1609,9 +1609,9 @@ def VLD4DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD4dupu>;
 def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
 def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
 
-} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
 
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
 
 // Classes for VST* pseudo-instructions with multi-register operands.
 // These are expanded to real instructions after register allocation.
@@ -2025,7 +2025,7 @@ def VST4q8oddPseudo_UPD  : VSTQQQQWBPseudo<IIC_VST4u>;
 def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
 def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
 
-} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
+} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
 
 // Classes for VST*LN pseudo-instructions with multi-register operands.
 // These are expanded to real instructions after register allocation.
@@ -2129,7 +2129,7 @@ def VST1LNq8Pseudo_UPD  : VST1QLNWBPseudo<v16i8, post_truncsti8, NEONvgetlaneu>;
 def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,NEONvgetlaneu>;
 def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo<v4i32, post_store, extractelt>;
 
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
 
 //   VST2LN   : Vector Store (single 2-element structure from one lane)
 class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -2351,7 +2351,7 @@ def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> {
 def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
 def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
 
-} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
+} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
 
 // Use vld1/vst1 for unaligned f64 load / store
 def : Pat<(f64 (hword_alignedload addrmode6:$addr)),
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index a867844..3c62e0e 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -714,7 +714,7 @@ def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
 //
 
 // These require base address to be written back or one of the loaded regs.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
 def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
@@ -754,7 +754,7 @@ def tSTMIA_UPD : Thumb1I<(outs GPR:$wb),
   let Inst{7-0}  = regs;
 }
 
-} // neverHasSideEffects
+} // hasSideEffects
 
 def : InstAlias<"ldm${p} $Rn!, $regs",
                 (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)>,
@@ -888,7 +888,7 @@ def tADDrr :                    // A8.6.6 T1
                 "add", "\t$Rd, $Rn, $Rm",
                 [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
 
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
                      "add", "\t$Rdn, $Rm", []>,
                T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
@@ -1048,7 +1048,7 @@ def : tInstAlias <"movs $Rdn, $imm",
 
 // A7-73: MOV(2) - mov setting flag.
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
                       2, IIC_iMOVr,
                       "mov", "\t$Rd, $Rm", "", []>,
@@ -1070,7 +1070,7 @@ def tMOVSr      : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
   let Inst{5-3}  = Rm;
   let Inst{2-0}  = Rd;
 }
-} // neverHasSideEffects
+} // hasSideEffects
 
 // Multiply register
 let isCommutable = 1 in
@@ -1248,7 +1248,7 @@ def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
   let DecoderMethod = "DecodeThumbAddSpecialReg";
 }
 
-let neverHasSideEffects = 1, isReMaterializable = 1 in
+let hasSideEffects = 0, isReMaterializable = 1 in
 def tLEApcrel   : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
                               2, IIC_iALUi, []>, Sched<[WriteALU]>;
 
@@ -1297,7 +1297,7 @@ def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
                               AddrModeNone, 0, IndexModeNone,
                               Pseudo, NoItinerary, "", "",
                               [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
-                             Requires<[IsThumb, IsIOS]>;
+                             Requires<[IsThumb]>;
 
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
@@ -1375,6 +1375,17 @@ def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr),
 def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
             (tLDRBi t_addrmode_is1:$addr)>;
 
+// extload from the stack -> word load from the stack, as it avoids having to
+// materialize the base in a separate register. This only works when a word
+// load puts the byte/halfword value in the same place in the register that the
+// byte/halfword load would, i.e. when little-endian.
+def : T1Pat<(extloadi1  t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+      Requires<[IsThumb, IsThumb1Only, IsLE]>;
+def : T1Pat<(extloadi8  t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+      Requires<[IsThumb, IsThumb1Only, IsLE]>;
+def : T1Pat<(extloadi16 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+      Requires<[IsThumb, IsThumb1Only, IsLE]>;
+
 // extload -> zextload
 def : T1Pat<(extloadi1  t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>;
 def : T1Pat<(extloadi1  t_addrmode_is1:$addr),  (tLDRBi t_addrmode_is1:$addr)>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 807c252..10b0a0e 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -1185,7 +1185,8 @@ class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode>
 
 class T2I_exta_rrot_np<bits<3> opcod, string opc>
   : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm,rot_imm:$rot),
-               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []> {
+               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
+               Requires<[HasT2ExtractPack, IsThumb2]> {
   bits<2> rot;
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0100;
@@ -1241,7 +1242,7 @@ def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
   let DecoderMethod = "DecodeT2Adr";
 }
 
-let neverHasSideEffects = 1, isReMaterializable = 1 in
+let hasSideEffects = 0, isReMaterializable = 1 in
 def t2LEApcrel   : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p),
                                 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
 let hasSideEffects = 1 in
@@ -1272,12 +1273,12 @@ defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
 defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
                       GPRnopc, UnOpFrag<(sextloadi8  node:$Src)>>;
 
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
 // Load doubleword
 def t2LDRDi8  : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
                         (ins t2addrmode_imm8s4:$addr),
                         IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>;
-} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
 
 // zextload i1 -> zextload i8
 def : T2Pat<(zextloadi1 t2addrmode_imm12:$addr),
@@ -1326,7 +1327,7 @@ def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
 
 // Indexed loads
 
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0 in {
 def t2LDR_PRE  : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
@@ -1378,7 +1379,7 @@ def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
                           "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
-} // mayLoad = 1, neverHasSideEffects = 1
+} // mayLoad = 1, hasSideEffects = 0
 
 // LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110).
 // Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
@@ -1443,14 +1444,14 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
                    rGPR, BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
 
 // Store doubleword
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in
 def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
                        (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
                IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>;
 
 // Indexed stores
 
-let mayStore = 1, neverHasSideEffects = 1 in {
+let mayStore = 1, hasSideEffects = 0 in {
 def t2STR_PRE  : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
                             (ins GPRnopc:$Rt, t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
@@ -1468,7 +1469,7 @@ def t2STRB_PRE  : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
                         "strb", "\t$Rt, $addr!",
                         "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
-} // mayStore = 1, neverHasSideEffects = 1
+} // mayStore = 1, hasSideEffects = 0
 
 def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
                             (ins GPRnopc:$Rt, addr_offset_none:$Rn,
@@ -1763,7 +1764,7 @@ multiclass thumb2_ld_mult<string asm, InstrItinClass itin,
   }
 }
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
 defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
@@ -1848,14 +1849,14 @@ multiclass thumb2_st_mult<string asm, InstrItinClass itin,
 let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
 defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
 
-} // neverHasSideEffects
+} // hasSideEffects
 
 
 //===----------------------------------------------------------------------===//
 //  Move Instructions.
 //
 
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
                    "mov", ".w\t$Rd, $Rm", []>, Sched<[WriteALU]> {
   let Inst{31-27} = 0b11101;
@@ -2572,7 +2573,7 @@ def t2MLS: T2FourReg<
 }
 
 // Extra precision multiplies with low / high results
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 let isCommutable = 1 in {
 def t2SMULL : T2MulLong<0b000, 0b0000,
                   (outs rGPR:$RdLo, rGPR:$RdHi),
@@ -2603,7 +2604,7 @@ def t2UMAAL : T2MulLong<0b110, 0b0110,
                   (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
                   "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
           Requires<[IsThumb2, HasThumb2DSP]>;
-} // neverHasSideEffects
+} // hasSideEffects
 
 // Rounding variants of the below included for disassembly only
 
@@ -3150,7 +3151,7 @@ defm t2TEQ  : T2I_cmp_irs<0b0100, "teq",
                          BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
 
 // Conditional moves
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 
 let isCommutable = 1, isSelect = 1 in
 def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd),
@@ -3213,7 +3214,7 @@ def t2MOVCCi32imm
       RegConstraint<"$false = $dst">;
 } // isCodeGenOnly = 1
 
-} // neverHasSideEffects
+} // hasSideEffects
 
 //===----------------------------------------------------------------------===//
 // Atomic operations intrinsics
@@ -3824,6 +3825,27 @@ def t2SUBS_PC_LR : T2I <(outs), (ins imm0_255:$imm), NoItinerary,
   let Inst{7-0} = imm;
 }
 
+// Hypervisor Call is a system instruction.
+let isCall = 1 in {
+def t2HVC : T2XI <(outs), (ins imm0_65535:$imm16), IIC_Br, "hvc.w\t$imm16", []>,
+      Requires<[IsThumb2, HasVirtualization]>, Sched<[WriteBr]> {
+    bits<16> imm16;
+    let Inst{31-20} = 0b111101111110;
+    let Inst{19-16} = imm16{15-12};
+    let Inst{15-12} = 0b1000;
+    let Inst{11-0} = imm16{11-0};
+}
+}
+
+// Alias for HVC without the ".w" optional width specifier
+def : t2InstAlias<"hvc\t$imm16", (t2HVC imm0_65535:$imm16)>;
+
+// ERET - Return from exception in Hypervisor mode.
+// B9.3.3, B9.3.20: ERET is an alias for "SUBS PC, LR, #0" in an implementation that
+// includes virtualization extensions.
+def t2ERET : InstAlias<"eret${p}", (t2SUBS_PC_LR 0, pred:$p)>,
+             Requires<[IsThumb2, HasVirtualization]>;
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //
@@ -4564,17 +4586,21 @@ def : t2InstAlias<"strh${p} $Rt, $addr",
                   (t2STRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
 
 // Extend instruction optional rotate operand.
-def : t2InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
-                (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
-def : t2InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
-                (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
-def : t2InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
-                (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+              (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+              (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+              (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"sxtb16${p} $Rd, $Rm",
+              (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
 
 def : t2InstAlias<"sxtb${p} $Rd, $Rm",
                 (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
-def : t2InstAlias<"sxtb16${p} $Rd, $Rm",
-                (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
 def : t2InstAlias<"sxth${p} $Rd, $Rm",
                 (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
 def : t2InstAlias<"sxtb${p}.w $Rd, $Rm",
@@ -4582,19 +4608,23 @@ def : t2InstAlias<"sxtb${p}.w $Rd, $Rm",
 def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
                 (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
 
-def : t2InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
-                (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
-def : t2InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
-                (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
-def : t2InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
-                (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+              (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+              (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+              (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"uxtb16${p} $Rd, $Rm",
+              (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+
 def : t2InstAlias<"uxtb${p} $Rd, $Rm",
                 (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
-def : t2InstAlias<"uxtb16${p} $Rd, $Rm",
-                (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
 def : t2InstAlias<"uxth${p} $Rd, $Rm",
                 (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
-
 def : t2InstAlias<"uxtb${p}.w $Rd, $Rm",
                 (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
 def : t2InstAlias<"uxth${p}.w $Rd, $Rm",
@@ -4603,15 +4633,17 @@ def : t2InstAlias<"uxth${p}.w $Rd, $Rm",
 // Extend instruction w/o the ".w" optional width specifier.
 def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
                   (t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
-def : t2InstAlias<"uxtb16${p} $Rd, $Rm$rot",
-                  (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : InstAlias<"uxtb16${p} $Rd, $Rm$rot",
+                (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>,
+                Requires<[HasT2ExtractPack, IsThumb2]>;
 def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
                   (t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
 
 def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
                   (t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
-def : t2InstAlias<"sxtb16${p} $Rd, $Rm$rot",
-                  (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : InstAlias<"sxtb16${p} $Rd, $Rm$rot",
+                (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>,
+                Requires<[HasT2ExtractPack, IsThumb2]>;
 def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
                   (t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
 
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index d78f2ac..e0a9314 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -194,7 +194,7 @@ multiclass vfp_ldst_mult<string asm, bit L_bit,
   }
 }
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
 defm VLDM : vfp_ldst_mult<"vldm", 1, IIC_fpLoad_m, IIC_fpLoad_mu>;
@@ -202,7 +202,7 @@ defm VLDM : vfp_ldst_mult<"vldm", 1, IIC_fpLoad_m, IIC_fpLoad_mu>;
 let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
 defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpStore_m, IIC_fpStore_mu>;
 
-} // neverHasSideEffects
+} // hasSideEffects
 
 def : MnemonicAlias<"vldm", "vldmia">;
 def : MnemonicAlias<"vstm", "vstmia">;
@@ -769,7 +769,7 @@ def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
                   IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm",
                   [(set SPR:$Sd, (fsqrt SPR:$Sm))]>;
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def VMOVD  : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
                   (outs DPR:$Dd), (ins DPR:$Dm),
                   IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>;
@@ -777,7 +777,7 @@ def VMOVD  : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
 def VMOVS  : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
                   (outs SPR:$Sd), (ins SPR:$Sm),
                   IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>;
-} // neverHasSideEffects
+} // hasSideEffects
 
 //===----------------------------------------------------------------------===//
 // FP <-> GPR Copies.  Int <-> FP Conversions.
@@ -827,7 +827,7 @@ def VMOVSR : AVConv4I<0b11100000, 0b1010,
   let D = VFPNeonDomain;
 }
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def VMOVRRD  : AVConv3I<0b11000101, 0b1011,
                         (outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm),
                         IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm",
@@ -876,7 +876,7 @@ def VMOVRRS  : AVConv3I<0b11000101, 0b1010,
   let D = VFPNeonDomain;
   let DecoderMethod = "DecodeVMOVRRS";
 }
-} // neverHasSideEffects
+} // hasSideEffects
 
 // FMDHR: GPR -> SPR
 // FMDLR: GPR -> SPR
@@ -907,7 +907,7 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011,
   let isRegSequence = 1;
 }
 
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def VMOVSRR : AVConv5I<0b11000100, 0b1010,
                      (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
                 IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
@@ -1543,7 +1543,7 @@ def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
 // FP Conditional moves.
 //
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def VMOVDcc  : PseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, cmovpred:$p),
                     IIC_fpUNA64,
                     [(set (f64 DPR:$Dd),
@@ -1555,7 +1555,7 @@ def VMOVScc  : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
                     [(set (f32 SPR:$Sd),
                           (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>,
                RegConstraint<"$Sn = $Sd">, Requires<[HasVFP2]>;
-} // neverHasSideEffects
+} // hasSideEffects
 
 //===----------------------------------------------------------------------===//
 // Move from VFP System Register to ARM core register.
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index c429ac1..a8d0981 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -170,7 +170,8 @@ static int getMemoryOpOffset(const MachineInstr *MI) {
     return OffField;
 
   // Thumb1 immediate offsets are scaled by 4
-  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi)
+  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi ||
+      Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi)
     return OffField * 4;
 
   int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
@@ -206,6 +207,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
     case ARM_AM::ib: return ARM::STMIB;
     }
   case ARM::tLDRi:
+  case ARM::tLDRspi:
     // tLDMIA is writeback-only - unless the base register is in the input
     // reglist.
     ++NumLDMGened;
@@ -214,6 +216,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
     case ARM_AM::ia: return ARM::tLDMIA;
     }
   case ARM::tSTRi:
+  case ARM::tSTRspi:
     // There is no non-writeback tSTMIA either.
     ++NumSTMGened;
     switch (Mode) {
@@ -328,7 +331,7 @@ AMSubMode getLoadStoreMultipleSubMode(int Opcode) {
 } // end namespace llvm
 
 static bool isT1i32Load(unsigned Opc) {
-  return Opc == ARM::tLDRi;
+  return Opc == ARM::tLDRi || Opc == ARM::tLDRspi;
 }
 
 static bool isT2i32Load(unsigned Opc) {
@@ -340,7 +343,7 @@ static bool isi32Load(unsigned Opc) {
 }
 
 static bool isT1i32Store(unsigned Opc) {
-  return Opc == ARM::tSTRi;
+  return Opc == ARM::tSTRi || Opc == ARM::tSTRspi;
 }
 
 static bool isT2i32Store(unsigned Opc) {
@@ -356,6 +359,8 @@ static unsigned getImmScale(unsigned Opc) {
   default: llvm_unreachable("Unhandled opcode!");
   case ARM::tLDRi:
   case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
     return 1;
   case ARM::tLDRHi:
   case ARM::tSTRHi:
@@ -495,6 +500,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   if (isThumb1)
     for (unsigned I = 0; I < NumRegs; ++I)
       if (Base == Regs[I].first) {
+        assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
         if (Opcode == ARM::tLDRi) {
           Writeback = false;
           break;
@@ -515,7 +521,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
     // VLDM/VSTM do not support DB mode without also updating the base reg.
     Mode = ARM_AM::db;
-  } else if (Offset != 0) {
+  } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
     // Check if this is a supported opcode before inserting instructions to
     // calculate a new base register.
     if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;
@@ -545,6 +551,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
 
     int BaseOpc =
       isThumb2 ? ARM::t2ADDri :
+      (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi :
       (isThumb1 && Offset < 8) ? ARM::tADDi3 :
       isThumb1 ? ARM::tADDi8  : ARM::ADDri;
 
@@ -552,7 +559,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
       Offset = - Offset;
       BaseOpc =
         isThumb2 ? ARM::t2SUBri :
-        (isThumb1 && Offset < 8) ? ARM::tSUBi3 :
+        (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 :
         isThumb1 ? ARM::tSUBi8  : ARM::SUBri;
     }
 
@@ -566,18 +573,34 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
       // or
       //   MOV  NewBase, Base
       //   ADDS NewBase, #imm8.
-      if (Base != NewBase && Offset >= 8) {
+      if (Base != NewBase &&
+          (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) {
         // Need to insert a MOV to the new base first.
-        BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
-          .addReg(Base, getKillRegState(BaseKill))
-          .addImm(Pred).addReg(PredReg);
+        if (isARMLowRegister(NewBase) && isARMLowRegister(Base) &&
+            !STI->hasV6Ops()) {
+          // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
+          if (Pred != ARMCC::AL)
+            return false;
+          BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVSr), NewBase)
+            .addReg(Base, getKillRegState(BaseKill));
+        } else
+          BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
+            .addReg(Base, getKillRegState(BaseKill))
+            .addImm(Pred).addReg(PredReg);
+
         // Set up BaseKill and Base correctly to insert the ADDS/SUBS below.
         Base = NewBase;
         BaseKill = false;
       }
-      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase), true)
-        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
-        .addImm(Pred).addReg(PredReg);
+      if (BaseOpc == ARM::tADDrSPi) {
+        assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
+        BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
+          .addReg(Base, getKillRegState(BaseKill)).addImm(Offset/4)
+          .addImm(Pred).addReg(PredReg);
+      } else
+        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase), true)
+          .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+          .addImm(Pred).addReg(PredReg);
     } else {
       BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
         .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
@@ -958,6 +981,8 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
   case ARM::STRi12:
   case ARM::tLDRi:
   case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
   case ARM::t2STRi8:
@@ -1393,6 +1418,8 @@ static bool isMemoryOp(const MachineInstr *MI) {
   case ARM::STRi12:
   case ARM::tLDRi:
   case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
   case ARM::t2STRi8:
@@ -1787,12 +1814,11 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
 }
 
 bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
-  const TargetMachine &TM = Fn.getTarget();
-  TL = TM.getSubtargetImpl()->getTargetLowering();
+  STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+  TL = STI->getTargetLowering();
   AFI = Fn.getInfo<ARMFunctionInfo>();
-  TII = TM.getSubtargetImpl()->getInstrInfo();
-  TRI = TM.getSubtargetImpl()->getRegisterInfo();
-  STI = &TM.getSubtarget<ARMSubtarget>();
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
   RS = new RegScavenger();
   isThumb2 = AFI->isThumb2Function();
   isThumb1 = AFI->isThumbFunction() && !isThumb2;
@@ -1802,7 +1828,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
        ++MFI) {
     MachineBasicBlock &MBB = *MFI;
     Modified |= LoadStoreMultipleOpti(MBB);
-    if (TM.getSubtarget<ARMSubtarget>().hasV5TOps())
+    if (STI->hasV5TOps())
       Modified |= MergeReturnIntoLDM(MBB);
   }
 
@@ -1850,10 +1876,10 @@ namespace {
 }
 
 bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
-  TD = Fn.getSubtarget().getDataLayout();
-  TII = Fn.getSubtarget().getInstrInfo();
-  TRI = Fn.getSubtarget().getRegisterInfo();
+  TD = Fn.getTarget().getDataLayout();
   STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
   MRI = &Fn.getRegInfo();
   MF  = &Fn;
 
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index 023f5f8..fd4f5ff 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -119,11 +119,45 @@ void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                         ARMAsmPrinter &AP) {
   OutMI.setOpcode(MI->getOpcode());
 
+  // In the MC layer, we keep modified immediates in their encoded form
+  bool EncodeImms = false;
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::MOVi:
+  case ARM::MVNi:
+  case ARM::CMPri:
+  case ARM::CMNri:
+  case ARM::TSTri:
+  case ARM::TEQri:
+  case ARM::MSRi:
+  case ARM::ADCri:
+  case ARM::ADDri:
+  case ARM::ADDSri:
+  case ARM::SBCri:
+  case ARM::SUBri:
+  case ARM::SUBSri:
+  case ARM::ANDri:
+  case ARM::ORRri:
+  case ARM::EORri:
+  case ARM::BICri:
+  case ARM::RSBri:
+  case ARM::RSBSri:
+  case ARM::RSCri:
+    EncodeImms = true;
+    break;
+  }
+
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
 
     MCOperand MCOp;
-    if (AP.lowerOperand(MO, MCOp))
+    if (AP.lowerOperand(MO, MCOp)) {
+      if (MCOp.isImm() && EncodeImms) {
+        int32_t Enc = ARM_AM::getSOImmVal(MCOp.getImm());
+        if (Enc != -1)
+          MCOp.setImm(Enc);
+      }
       OutMI.addOperand(MCOp);
+    }
   }
 }
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
index 892b269..229d041 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.cpp
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -14,8 +14,8 @@ using namespace llvm;
 void ARMFunctionInfo::anchor() { }
 
 ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
-    : isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
-      hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()),
+    : isThumb(MF.getSubtarget<ARMSubtarget>().isThumb()),
+      hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()),
       StByValParamsPadding(0), ArgRegsSaveSize(0), HasStackFrame(false),
       RestoreSPFromFP(false), LRSpilledForFarJump(false),
       FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 4e67fa1..ddfdb52 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -16,10 +16,10 @@
 
 #include "ARMSubtarget.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/ADT/DenseMap.h"
 
 namespace llvm {
 
diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
index 2a49255..1c50f9e 100644
--- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
+++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -9,8 +9,8 @@
 //===------------------------------------------------------------------------------------------===//
 
 #include "ARM.h"
-#include "ARMMachineFunctionInfo.h"
 #include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 using namespace llvm;
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index b290e7f..45cc9ea 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -199,7 +199,7 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
   // Thumb1 instructions that know how to use hi regs.
   let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
   let AltOrderSelect = [{
-      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
   }];
 }
 
@@ -209,7 +209,7 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
 def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
   let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
   let AltOrderSelect = [{
-      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
   }];
 }
 
@@ -219,7 +219,7 @@ def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
 def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV)> {
   let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
   let AltOrderSelect = [{
-      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
   }];
 }
 
@@ -237,7 +237,7 @@ def GPRsp : RegisterClass<"ARM", [i32], 32, (add SP)>;
 def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
   let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
   let AltOrderSelect = [{
-      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
   }];
 }
 
@@ -255,7 +255,7 @@ def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>;
 def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R12)> {
   let AltOrders = [(and tcGPR, tGPR)];
   let AltOrderSelect = [{
-      return MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+      return MF.getSubtarget<ARMSubtarget>().isThumb1Only();
   }];
 }
 
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index fa30ac3..636205f 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -32,7 +32,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                              bool isVolatile, bool AlwaysInline,
                                              MachinePointerInfo DstPtrInfo,
                                           MachinePointerInfo SrcPtrInfo) const {
-  const ARMSubtarget &Subtarget = DAG.getTarget().getSubtarget<ARMSubtarget>();
+  const ARMSubtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
   // Do repeated 4-byte loads and stores. To be improved.
   // This requires 4-byte alignment.
   if ((Align & 3) != 0)
@@ -150,14 +151,14 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
                         SDValue Src, SDValue Size,
                         unsigned Align, bool isVolatile,
                         MachinePointerInfo DstPtrInfo) const {
-  const ARMSubtarget &Subtarget = DAG.getTarget().getSubtarget<ARMSubtarget>();
+  const ARMSubtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
   // Use default for non-AAPCS (or MachO) subtargets
   if (!Subtarget.isAAPCS_ABI() || Subtarget.isTargetMachO() ||
       Subtarget.isTargetWindows())
     return SDValue();
 
-  const ARMTargetLowering &TLI =
-      *DAG.getTarget().getSubtarget<ARMSubtarget>().getTargetLowering();
+  const ARMTargetLowering &TLI = *Subtarget.getTargetLowering();
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
 
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 600f39d..89624dd 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -15,12 +15,14 @@
 #include "ARMFrameLowering.h"
 #include "ARMISelLowering.h"
 #include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
 #include "ARMSelectionDAGInfo.h"
 #include "ARMSubtarget.h"
-#include "ARMMachineFunctionInfo.h"
+#include "ARMTargetMachine.h"
 #include "Thumb1FrameLowering.h"
 #include "Thumb1InstrInfo.h"
 #include "Thumb2InstrInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
@@ -28,7 +30,6 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
 
 using namespace llvm;
 
@@ -87,56 +88,6 @@ IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT),
                          "Allow IT blocks based on ARMv7"),
               clEnumValEnd));
 
-static std::string computeDataLayout(ARMSubtarget &ST) {
-  std::string Ret = "";
-
-  if (ST.isLittle())
-    // Little endian.
-    Ret += "e";
-  else
-    // Big endian.
-    Ret += "E";
-
-  Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
-
-  // Pointers are 32 bits and aligned to 32 bits.
-  Ret += "-p:32:32";
-
-  // ABIs other than APCS have 64 bit integers with natural alignment.
-  if (!ST.isAPCS_ABI())
-    Ret += "-i64:64";
-
-  // We have 64 bits floats. The APCS ABI requires them to be aligned to 32
-  // bits, others to 64 bits. We always try to align to 64 bits.
-  if (ST.isAPCS_ABI())
-    Ret += "-f64:32:64";
-
-  // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others
-  // to 64. We always ty to give them natural alignment.
-  if (ST.isAPCS_ABI())
-    Ret += "-v64:32:64-v128:32:128";
-  else
-    Ret += "-v128:64:128";
-
-  // Try to align aggregates to 32 bits (the default is 64 bits, which has no
-  // particular hardware support on 32-bit ARM).
-  Ret += "-a:0:32";
-
-  // Integer registers are 32 bits.
-  Ret += "-n32";
-
-  // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit
-  // aligned everywhere else.
-  if (ST.isTargetNaCl())
-    Ret += "-S128";
-  else if (ST.isAAPCS_ABI())
-    Ret += "-S64";
-  else
-    Ret += "-S32";
-
-  return Ret;
-}
-
 /// initializeSubtargetDependencies - Initializes using a CPU and feature string
 /// so that we can use initializer lists for subtarget initialization.
 ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
@@ -146,23 +97,31 @@ ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
   return *this;
 }
 
+ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU,
+                                                        StringRef FS) {
+  ARMSubtarget &STI = initializeSubtargetDependencies(CPU, FS);
+  if (STI.isThumb1Only())
+    return (ARMFrameLowering *)new Thumb1FrameLowering(STI);
+
+  return new ARMFrameLowering(STI);
+}
+
 ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
-                           const std::string &FS, const TargetMachine &TM,
-                           bool IsLittle)
+                           const std::string &FS,
+                           const ARMBaseTargetMachine &TM, bool IsLittle)
     : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
       ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle),
-      TargetTriple(TT), Options(TM.Options), TargetABI(ARM_ABI_UNKNOWN),
-      DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))),
-      TSInfo(DL),
+      TargetTriple(TT), Options(TM.Options), TM(TM),
+      TSInfo(*TM.getDataLayout()),
+      FrameLowering(initializeFrameLowering(CPU, FS)),
+      // At this point initializeSubtargetDependencies has been called so
+      // we can query directly.
       InstrInfo(isThumb1Only()
                     ? (ARMBaseInstrInfo *)new Thumb1InstrInfo(*this)
                     : !isThumb()
                           ? (ARMBaseInstrInfo *)new ARMInstrInfo(*this)
                           : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)),
-      TLInfo(TM),
-      FrameLowering(!isThumb1Only()
-                        ? new ARMFrameLowering(*this)
-                        : (ARMFrameLowering *)new Thumb1FrameLowering(*this)) {}
+      TLInfo(TM, *this) {}
 
 void ARMSubtarget::initializeEnvironment() {
   HasV4TOps = false;
@@ -216,7 +175,7 @@ void ARMSubtarget::initializeEnvironment() {
 
 void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   if (CPUString.empty()) {
-    if (isTargetIOS() && TargetTriple.getArchName().endswith("v7s"))
+    if (isTargetDarwin() && TargetTriple.getArchName().endswith("v7s"))
       // Default to the Swift CPU when targeting armv7s/thumbv7s.
       CPUString = "swift";
     else
@@ -226,8 +185,8 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   // Insert the architecture feature derived from the target triple into the
   // feature string. This is important for setting features that are implied
   // based on the architecture version.
-  std::string ArchFS = ARM_MC::ParseARMTriple(TargetTriple.getTriple(),
-                                              CPUString);
+  std::string ArchFS =
+      ARM_MC::ParseARMTriple(TargetTriple.getTriple(), CPUString);
   if (!FS.empty()) {
     if (!ArchFS.empty())
       ArchFS = ArchFS + "," + FS.str();
@@ -246,30 +205,9 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   // Initialize scheduling itinerary for the specified CPU.
   InstrItins = getInstrItineraryForCPU(CPUString);
 
-  if (TargetABI == ARM_ABI_UNKNOWN) {
-    switch (TargetTriple.getEnvironment()) {
-    case Triple::Android:
-    case Triple::EABI:
-    case Triple::EABIHF:
-    case Triple::GNUEABI:
-    case Triple::GNUEABIHF:
-      TargetABI = ARM_ABI_AAPCS;
-      break;
-    default:
-      if (TargetTriple.isOSBinFormatMachO() &&
-          TargetTriple.getOS() == Triple::UnknownOS)
-        TargetABI = ARM_ABI_AAPCS;
-      else
-        TargetABI = ARM_ABI_APCS;
-      break;
-    }
-  }
-
   // FIXME: this is invalid for WindowsCE
-  if (isTargetWindows()) {
-    TargetABI = ARM_ABI_AAPCS;
+  if (isTargetWindows())
     NoARM = true;
-  }
 
   if (isAAPCS_ABI())
     stackAlignment = 8;
@@ -331,6 +269,15 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
     UseNEONForSinglePrecisionFP = true;
 }
 
+bool ARMSubtarget::isAPCS_ABI() const {
+  assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+  return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_APCS;
+}
+bool ARMSubtarget::isAAPCS_ABI() const {
+  assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+  return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS;
+}
+
 /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
 bool
 ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
@@ -402,6 +349,5 @@ bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
   // immediates as it is inherently position independent, and may be out of
   // range otherwise.
   return UseMovt && (isTargetWindows() ||
-                     !MF.getFunction()->getAttributes().hasAttribute(
-                         AttributeSet::FunctionIndex, Attribute::MinSize));
+                     !MF.getFunction()->hasFnAttribute(Attribute::MinSize));
 }
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index d5ee009..f4deddf 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -20,10 +20,10 @@
 #include "ARMInstrInfo.h"
 #include "ARMSelectionDAGInfo.h"
 #include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "Thumb1FrameLowering.h"
 #include "Thumb1InstrInfo.h"
 #include "Thumb2InstrInfo.h"
-#include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCInstrItineraries.h"
@@ -37,6 +37,7 @@ namespace llvm {
 class GlobalValue;
 class StringRef;
 class TargetOptions;
+class ARMBaseTargetMachine;
 
 class ARMSubtarget : public ARMGenSubtargetInfo {
 protected:
@@ -228,18 +229,14 @@ protected:
   /// Options passed via command line that could influence the target
   const TargetOptions &Options;
 
- public:
-  enum {
-    ARM_ABI_UNKNOWN,
-    ARM_ABI_APCS,
-    ARM_ABI_AAPCS // ARM EABI
-  } TargetABI;
+  const ARMBaseTargetMachine &TM;
 
+public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   ///
   ARMSubtarget(const std::string &TT, const std::string &CPU,
-               const std::string &FS, const TargetMachine &TM, bool IsLittle);
+               const std::string &FS, const ARMBaseTargetMachine &TM, bool IsLittle);
 
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
@@ -254,7 +251,6 @@ protected:
   /// so that we can use initializer lists for subtarget initialization.
   ARMSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
 
-  const DataLayout *getDataLayout() const override { return &DL; }
   const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
@@ -272,16 +268,17 @@ protected:
   }
 
 private:
-  const DataLayout DL;
   ARMSelectionDAGInfo TSInfo;
+  // Either Thumb1FrameLowering or ARMFrameLowering.
+  std::unique_ptr<ARMFrameLowering> FrameLowering;
   // Either Thumb1InstrInfo or Thumb2InstrInfo.
   std::unique_ptr<ARMBaseInstrInfo> InstrInfo;
   ARMTargetLowering   TLInfo;
-  // Either Thumb1FrameLowering or ARMFrameLowering.
-  std::unique_ptr<ARMFrameLowering> FrameLowering;
 
   void initializeEnvironment();
   void initSubtargetFeatures(StringRef CPU, StringRef FS);
+  ARMFrameLowering *initializeFrameLowering(StringRef CPU, StringRef FS);
+
 public:
   void computeIssueWidth();
 
@@ -316,7 +313,8 @@ public:
   bool hasCRC() const { return HasCRC; }
   bool hasVirtualization() const { return HasVirtualization; }
   bool useNEONForSinglePrecisionFP() const {
-    return hasNEON() && UseNEONForSinglePrecisionFP; }
+    return hasNEON() && UseNEONForSinglePrecisionFP;
+  }
 
   bool hasDivide() const { return HasHardwareDivide; }
   bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
@@ -350,7 +348,7 @@ public:
   bool isTargetIOS() const { return TargetTriple.isiOS(); }
   bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
   bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
-  bool isTargetNetBSD() const { return TargetTriple.getOS() == Triple::NetBSD; }
+  bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); }
   bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
 
   bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
@@ -391,14 +389,8 @@ public:
     return TargetTriple.getEnvironment() == Triple::Android;
   }
 
-  bool isAPCS_ABI() const {
-    assert(TargetABI != ARM_ABI_UNKNOWN);
-    return TargetABI == ARM_ABI_APCS;
-  }
-  bool isAAPCS_ABI() const {
-    assert(TargetABI != ARM_ABI_UNKNOWN);
-    return TargetABI == ARM_ABI_AAPCS;
-  }
+  bool isAPCS_ABI() const;
+  bool isAAPCS_ABI() const;
 
   bool isThumb() const { return InThumbMode; }
   bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 88d6c5e..a97a058 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -11,13 +11,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARM.h"
-#include "ARMTargetMachine.h"
 #include "ARMFrameLowering.h"
+#include "ARMTargetMachine.h"
 #include "ARMTargetObjectFile.h"
+#include "ARMTargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -52,6 +53,110 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
   return make_unique<ARMElfTargetObjectFile>();
 }
 
+static ARMBaseTargetMachine::ARMABI
+computeTargetABI(const Triple &TT, StringRef CPU,
+                 const TargetOptions &Options) {
+  if (Options.MCOptions.getABIName().startswith("aapcs"))
+    return ARMBaseTargetMachine::ARM_ABI_AAPCS;
+  else if (Options.MCOptions.getABIName().startswith("apcs"))
+    return ARMBaseTargetMachine::ARM_ABI_APCS;
+
+  assert(Options.MCOptions.getABIName().empty() &&
+         "Unknown target-abi option!");
+
+  ARMBaseTargetMachine::ARMABI TargetABI =
+      ARMBaseTargetMachine::ARM_ABI_UNKNOWN;
+
+  // FIXME: This is duplicated code from the front end and should be unified.
+  if (TT.isOSBinFormatMachO()) {
+    if (TT.getEnvironment() == llvm::Triple::EABI ||
+        (TT.getOS() == llvm::Triple::UnknownOS &&
+         TT.getObjectFormat() == llvm::Triple::MachO) ||
+        CPU.startswith("cortex-m")) {
+      TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+    } else {
+      TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
+    }
+  } else if (TT.isOSWindows()) {
+    // FIXME: this is invalid for WindowsCE
+    TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+  } else {
+    // Select the default based on the platform.
+    switch (TT.getEnvironment()) {
+    case llvm::Triple::Android:
+    case llvm::Triple::GNUEABI:
+    case llvm::Triple::GNUEABIHF:
+    case llvm::Triple::EABIHF:
+    case llvm::Triple::EABI:
+      TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+      break;
+    case llvm::Triple::GNU:
+      TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
+      break;
+    default:
+      if (TT.getOS() == llvm::Triple::NetBSD)
+	TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
+      else
+	TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+      break;
+    }
+  }
+
+  return TargetABI;
+}
+
+static std::string computeDataLayout(const Triple &TT,
+                                     ARMBaseTargetMachine::ARMABI ABI,
+                                     bool isLittle) {
+  std::string Ret = "";
+
+  if (isLittle)
+    // Little endian.
+    Ret += "e";
+  else
+    // Big endian.
+    Ret += "E";
+
+  Ret += DataLayout::getManglingComponent(TT);
+
+  // Pointers are 32 bits and aligned to 32 bits.
+  Ret += "-p:32:32";
+
+  // ABIs other than APCS have 64 bit integers with natural alignment.
+  if (ABI != ARMBaseTargetMachine::ARM_ABI_APCS)
+    Ret += "-i64:64";
+
+  // We have 64 bits floats. The APCS ABI requires them to be aligned to 32
+  // bits, others to 64 bits. We always try to align to 64 bits.
+  if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS)
+    Ret += "-f64:32:64";
+
+  // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others
+  // to 64. We always ty to give them natural alignment.
+  if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS)
+    Ret += "-v64:32:64-v128:32:128";
+  else
+    Ret += "-v128:64:128";
+
+  // Try to align aggregates to 32 bits (the default is 64 bits, which has no
+  // particular hardware support on 32-bit ARM).
+  Ret += "-a:0:32";
+
+  // Integer registers are 32 bits.
+  Ret += "-n32";
+
+  // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit
+  // aligned everywhere else.
+  if (TT.isOSNaCl())
+    Ret += "-S128";
+  else if (ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS)
+    Ret += "-S64";
+  else
+    Ret += "-S32";
+
+  return Ret;
+}
+
 /// TargetMachine ctor - Create an ARM architecture model.
 ///
 ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT,
@@ -60,6 +165,8 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT,
                                            Reloc::Model RM, CodeModel::Model CM,
                                            CodeGenOpt::Level OL, bool isLittle)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      TargetABI(computeTargetABI(Triple(TT), CPU, Options)),
+      DL(computeDataLayout(Triple(TT), TargetABI, isLittle)),
       TLOF(createTLOF(Triple(getTargetTriple()))),
       Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) {
 
@@ -73,11 +180,8 @@ ARMBaseTargetMachine::~ARMBaseTargetMachine() {}
 
 const ARMSubtarget *
 ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
-  AttributeSet FnAttrs = F.getAttributes();
-  Attribute CPUAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
-  Attribute FSAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
 
   std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
                         ? CPUAttr.getValueAsString().str()
@@ -91,8 +195,7 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
   // function before we can generate a subtarget. We also need to use
   // it as a key for the subtarget since that can be the only difference
   // between two functions.
-  Attribute SFAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float");
+  Attribute SFAttr = F.getFnAttribute("use-soft-float");
   bool SoftFloat = !SFAttr.hasAttribute(Attribute::None)
                        ? SFAttr.getValueAsString() == "true"
                        : Options.UseSoftFloat;
@@ -109,12 +212,9 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
   return I.get();
 }
 
-void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  // Add first the target-independent BasicTTI pass, then our ARM pass. This
-  // allows the ARM pass to delegate to the target independent layer when
-  // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(this));
-  PM.add(createARMTargetTransformInfoPass(this));
+TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis(
+      [this](Function &F) { return TargetTransformInfo(ARMTTIImpl(this, F)); });
 }
 
 
@@ -197,9 +297,9 @@ public:
   void addIRPasses() override;
   bool addPreISel() override;
   bool addInstSelector() override;
-  bool addPreRegAlloc() override;
-  bool addPreSched2() override;
-  bool addPreEmitPass() override;
+  void addPreRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
 };
 } // namespace
 
@@ -226,7 +326,12 @@ void ARMPassConfig::addIRPasses() {
 
 bool ARMPassConfig::addPreISel() {
   if (TM->getOptLevel() != CodeGenOpt::None)
-    addPass(createGlobalMergePass(TM));
+    // FIXME: This is using the thumb1 only constant value for
+    // maximal global offset for merging globals. We may want
+    // to look into using the old value for non-thumb1 code of
+    // 4095 based on the TargetMachine, but this starts to become
+    // tricky when doing code gen per function.
+    addPass(createGlobalMergePass(TM, 127));
 
   return false;
 }
@@ -241,7 +346,7 @@ bool ARMPassConfig::addInstSelector() {
   return false;
 }
 
-bool ARMPassConfig::addPreRegAlloc() {
+void ARMPassConfig::addPreRegAlloc() {
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createARMLoadStoreOptimizationPass(true));
   if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9())
@@ -252,13 +357,11 @@ bool ARMPassConfig::addPreRegAlloc() {
     getARMSubtarget().hasNEON() && !DisableA15SDOptimization) {
     addPass(createA15SDOptimizerPass());
   }
-  return true;
 }
 
-bool ARMPassConfig::addPreSched2() {
+void ARMPassConfig::addPreSched2() {
   if (getOptLevel() != CodeGenOpt::None) {
     addPass(createARMLoadStoreOptimizationPass());
-    printAndVerify("After ARM load / store optimizer");
 
     if (getARMSubtarget().hasNEON())
       addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
@@ -279,11 +382,9 @@ bool ARMPassConfig::addPreSched2() {
   }
   if (getARMSubtarget().isThumb2())
     addPass(createThumb2ITBlockPass());
-
-  return true;
 }
 
-bool ARMPassConfig::addPreEmitPass() {
+void ARMPassConfig::addPreEmitPass() {
   if (getARMSubtarget().isThumb2()) {
     if (!getARMSubtarget().prefers32BitThumb())
       addPass(createThumb2SizeReductionPass());
@@ -294,6 +395,4 @@ bool ARMPassConfig::addPreEmitPass() {
 
   addPass(createARMOptimizeBarriersPass());
   addPass(createARMConstantIslandPass());
-
-  return true;
 }
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index fba0ec2..7f6a1ee 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -22,7 +22,15 @@
 namespace llvm {
 
 class ARMBaseTargetMachine : public LLVMTargetMachine {
+public:
+  enum ARMABI {
+    ARM_ABI_UNKNOWN,
+    ARM_ABI_APCS,
+    ARM_ABI_AAPCS // ARM EABI
+  } TargetABI;
+
 protected:
+  const DataLayout DL;
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
   ARMSubtarget        Subtarget;
   bool isLittle;
@@ -39,9 +47,11 @@ public:
 
   const ARMSubtarget *getSubtargetImpl() const override { return &Subtarget; }
   const ARMSubtarget *getSubtargetImpl(const Function &F) const override;
+  const DataLayout *getDataLayout() const override { return &DL; }
+  bool isLittleEndian() const { return isLittle; }
 
-  /// \brief Register ARM analysis passes with a pass manager.
-  void addAnalysisPasses(PassManagerBase &PM) override;
+  /// \brief Get the TargetIRAnalysis for this target.
+  TargetIRAnalysis getTargetIRAnalysis() override;
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index 48238bf..80f03c6 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMTargetObjectFile.h"
-#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -27,7 +27,8 @@ using namespace dwarf;
 
 void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
                                         const TargetMachine &TM) {
-  bool isAAPCS_ABI = TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI();
+  bool isAAPCS_ABI = static_cast<const ARMTargetMachine &>(TM).TargetABI ==
+                     ARMTargetMachine::ARMABI::ARM_ABI_AAPCS;
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
   InitializeELF(isAAPCS_ABI);
 
@@ -36,10 +37,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
   }
 
   AttributesSection =
-    getContext().getELFSection(".ARM.attributes",
-                               ELF::SHT_ARM_ATTRIBUTES,
-                               0,
-                               SectionKind::getMetadata());
+      getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0);
 }
 
 const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index ec834e8..4e1b371 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1,4 +1,4 @@
-//===-- ARMTargetTransformInfo.cpp - ARM specific TTI pass ----------------===//
+//===-- ARMTargetTransformInfo.cpp - ARM specific TTI ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,17 +6,8 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-/// \file
-/// This file implements a TargetTransformInfo analysis pass specific to the
-/// ARM target machine. It uses the target's detailed information to provide
-/// more precise answers to certain TTI queries, while letting the target
-/// independent and default TTI implementations handle the rest.
-///
-//===----------------------------------------------------------------------===//
 
-#include "ARM.h"
-#include "ARMTargetMachine.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
+#include "ARMTargetTransformInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
@@ -24,132 +15,7 @@ using namespace llvm;
 
 #define DEBUG_TYPE "armtti"
 
-// Declare the pass initialization routine locally as target-specific passes
-// don't have a target-wide initialization entry point, and so we rely on the
-// pass constructor initialization.
-namespace llvm {
-void initializeARMTTIPass(PassRegistry &);
-}
-
-namespace {
-
-class ARMTTI final : public ImmutablePass, public TargetTransformInfo {
-  const ARMBaseTargetMachine *TM;
-  const ARMSubtarget *ST;
-  const ARMTargetLowering *TLI;
-
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
-
-public:
-  ARMTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
-    llvm_unreachable("This pass cannot be directly constructed");
-  }
-
-  ARMTTI(const ARMBaseTargetMachine *TM)
-      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
-        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
-    initializeARMTTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override {
-    pushTTIStack(this);
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    TargetTransformInfo::getAnalysisUsage(AU);
-  }
-
-  /// Pass identification.
-  static char ID;
-
-  /// Provide necessary pointer adjustments for the two base classes.
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo*)this;
-    return this;
-  }
-
-  /// \name Scalar TTI Implementations
-  /// @{
-  using TargetTransformInfo::getIntImmCost;
-  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
-
-  /// @}
-
-
-  /// \name Vector TTI Implementations
-  /// @{
-
-  unsigned getNumberOfRegisters(bool Vector) const override {
-    if (Vector) {
-      if (ST->hasNEON())
-        return 16;
-      return 0;
-    }
-
-    if (ST->isThumb1Only())
-      return 8;
-    return 13;
-  }
-
-  unsigned getRegisterBitWidth(bool Vector) const override {
-    if (Vector) {
-      if (ST->hasNEON())
-        return 128;
-      return 0;
-    }
-
-    return 32;
-  }
-
-  unsigned getMaxInterleaveFactor() const override {
-    // These are out of order CPUs:
-    if (ST->isCortexA15() || ST->isSwift())
-      return 2;
-    return 1;
-  }
-
-  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
-                          int Index, Type *SubTp) const override;
-
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
-                            Type *Src) const override;
-
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                              Type *CondTy) const override;
-
-  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
-                              unsigned Index) const override;
-
-  unsigned getAddressComputationCost(Type *Val,
-                                     bool IsComplex) const override;
-
-  unsigned getArithmeticInstrCost(
-      unsigned Opcode, Type *Ty, OperandValueKind Op1Info = OK_AnyValue,
-      OperandValueKind Op2Info = OK_AnyValue,
-      OperandValueProperties Opd1PropInfo = OP_None,
-      OperandValueProperties Opd2PropInfo = OP_None) const override;
-
-  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) const override;
-  /// @}
-};
-
-} // end anonymous namespace
-
-INITIALIZE_AG_PASS(ARMTTI, TargetTransformInfo, "armtti",
-                   "ARM Target Transform Info", true, true, false)
-char ARMTTI::ID = 0;
-
-ImmutablePass *
-llvm::createARMTargetTransformInfoPass(const ARMBaseTargetMachine *TM) {
-  return new ARMTTI(TM);
-}
-
-
-unsigned ARMTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+unsigned ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
   assert(Ty->isIntegerTy());
 
   unsigned Bits = Ty->getPrimitiveSizeInBits();
@@ -181,8 +47,7 @@ unsigned ARMTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
   return 3;
 }
 
-unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
-                                  Type *Src) const {
+unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -206,7 +71,7 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   EVT DstTy = TLI->getValueType(Dst);
 
   if (!SrcTy.isSimple() || !DstTy.isSimple())
-    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+    return BaseT::getCastInstrCost(Opcode, Dst, Src);
 
   // Some arithmetic, load and store operations have specific instructions
   // to cast up/down their types automatically at no extra cost.
@@ -377,11 +242,11 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
       return ARMIntegerConversionTbl[Idx].Cost;
   }
 
-  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+  return BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
-unsigned ARMTTI::getVectorInstrCost(unsigned Opcode, Type *ValTy,
-                                    unsigned Index) const {
+unsigned ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                        unsigned Index) {
   // Penalize inserting into an D-subregister. We end up with a three times
   // lower estimated throughput on swift.
   if (ST->isSwift() &&
@@ -397,11 +262,11 @@ unsigned ARMTTI::getVectorInstrCost(unsigned Opcode, Type *ValTy,
       ValTy->getVectorElementType()->isIntegerTy())
     return 3;
 
-  return TargetTransformInfo::getVectorInstrCost(Opcode, ValTy, Index);
+  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
 }
 
-unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                    Type *CondTy) const {
+unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                        Type *CondTy) {
 
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   // On NEON a a vector select gets lowered to vbsl.
@@ -431,10 +296,10 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     return LT.first;
   }
 
-  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
-unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+unsigned ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
@@ -449,13 +314,32 @@ unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
   return 1;
 }
 
-unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
-                                Type *SubTp) const {
+unsigned ARMTTIImpl::getFPOpCost(Type *Ty) {
+  // Use similar logic that's in ARMISelLowering:
+  // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access
+  // to VFP.
+
+  if (ST->hasVFP2() && !ST->isThumb1Only()) {
+    if (Ty->isFloatTy()) {
+      return TargetTransformInfo::TCC_Basic;
+    }
+
+    if (Ty->isDoubleTy()) {
+      return ST->isFPOnlySP() ? TargetTransformInfo::TCC_Expensive :
+        TargetTransformInfo::TCC_Basic;
+    }
+  }
+
+  return TargetTransformInfo::TCC_Expensive;
+}
+
+unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                                    Type *SubTp) {
   // We only handle costs of reverse and alternate shuffles for now.
-  if (Kind != SK_Reverse && Kind != SK_Alternate)
-    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
+    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  if (Kind == SK_Reverse) {
+  if (Kind == TTI::SK_Reverse) {
     static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
         // Reverse shuffle cost one instruction if we are shuffling within a
         // double word (vrev) or two if we shuffle a quad word (vrev, vext).
@@ -473,11 +357,11 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
 
     int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
     if (Idx == -1)
-      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+      return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 
     return LT.first * NEONShuffleTbl[Idx].Cost;
   }
-  if (Kind == SK_Alternate) {
+  if (Kind == TTI::SK_Alternate) {
     static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = {
         // Alt shuffle cost table for ARM. Cost is the number of instructions
         // required to create the shuffled vector.
@@ -499,16 +383,16 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
     int Idx =
         CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
     if (Idx == -1)
-      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+      return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
     return LT.first * NEONAltShuffleTbl[Idx].Cost;
   }
-  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
-unsigned ARMTTI::getArithmeticInstrCost(
-    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
-    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
-    OperandValueProperties Opd2PropInfo) const {
+unsigned ARMTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
 
   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
@@ -564,8 +448,8 @@ unsigned ARMTTI::getArithmeticInstrCost(
   if (Idx != -1)
     return LT.first * CostTbl[Idx].Cost;
 
-  unsigned Cost = TargetTransformInfo::getArithmeticInstrCost(
-      Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
+  unsigned Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                                Opd1PropInfo, Opd2PropInfo);
 
   // This is somewhat of a hack. The problem that we are facing is that SROA
   // creates a sequence of shift, and, or instructions to construct values.
@@ -581,8 +465,9 @@ unsigned ARMTTI::getArithmeticInstrCost(
   return Cost;
 }
 
-unsigned ARMTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                 unsigned AddressSpace) const {
+unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                     unsigned Alignment,
+                                     unsigned AddressSpace) {
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
 
   if (Src->isVectorTy() && Alignment != 16 &&
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
new file mode 100644
index 0000000..97590f6
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -0,0 +1,134 @@
+//===-- ARMTargetTransformInfo.h - ARM specific TTI -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// ARM target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H
+
+#include "ARM.h"
+#include "ARMTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
+  typedef BasicTTIImplBase<ARMTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const ARMSubtarget *ST;
+  const ARMTargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+
+  const ARMSubtarget *getST() const { return ST; }
+  const ARMTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, Function &F)
+      : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
+
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  ARMTTIImpl(const ARMTTIImpl &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+  ARMTTIImpl(ARMTTIImpl &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+        TLI(std::move(Arg.TLI)) {}
+  ARMTTIImpl &operator=(const ARMTTIImpl &RHS) {
+    BaseT::operator=(static_cast<const BaseT &>(RHS));
+    ST = RHS.ST;
+    TLI = RHS.TLI;
+    return *this;
+  }
+  ARMTTIImpl &operator=(ARMTTIImpl &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    ST = std::move(RHS.ST);
+    TLI = std::move(RHS.TLI);
+    return *this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  using BaseT::getIntImmCost;
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector) {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 16;
+      return 0;
+    }
+
+    if (ST->isThumb1Only())
+      return 8;
+    return 13;
+  }
+
+  unsigned getRegisterBitWidth(bool Vector) {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 128;
+      return 0;
+    }
+
+    return 32;
+  }
+
+  unsigned getMaxInterleaveFactor() {
+    // These are out of order CPUs:
+    if (ST->isCortexA15() || ST->isSwift())
+      return 2;
+    return 1;
+  }
+
+  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                          Type *SubTp);
+
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+
+  unsigned getAddressComputationCost(Type *Val, bool IsComplex);
+
+  unsigned getFPOpCost(Type *Ty);
+
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace);
+
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 9cc89bd..59461e8 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -164,7 +164,10 @@ class ARMAsmParser : public MCTargetAsmParser {
                               // according to count of instructions in block.
                               // ~0U if no active IT block.
   } ITState;
-  bool inITBlock() { return ITState.CurPosition != ~0U;}
+  bool inITBlock() { return ITState.CurPosition != ~0U; }
+  bool lastInITBlock() {
+    return ITState.CurPosition == 4 - countTrailingZeros(ITState.Mask);
+  }
   void forwardITPosition() {
     if (!inITBlock()) return;
     // Move to the next instruction in the IT block, if there is one. If not,
@@ -186,6 +189,11 @@ class ARMAsmParser : public MCTargetAsmParser {
     return getParser().Error(L, Msg, Ranges);
   }
 
+  bool validatetLDMRegList(MCInst Inst, const OperandVector &Operands,
+                           unsigned ListNo, bool IsARPop = false);
+  bool validatetSTMRegList(MCInst Inst, const OperandVector &Operands,
+                           unsigned ListNo);
+
   int tryParseRegister();
   bool tryParseRegisterWithWriteBack(OperandVector &);
   int tryParseShiftRegister(OperandVector &);
@@ -305,6 +313,7 @@ class ARMAsmParser : public MCTargetAsmParser {
   OperandMatchResultTy parseSetEndImm(OperandVector &);
   OperandMatchResultTy parseShifterImm(OperandVector &);
   OperandMatchResultTy parseRotImm(OperandVector &);
+  OperandMatchResultTy parseModImm(OperandVector &);
   OperandMatchResultTy parseBitfield(OperandVector &);
   OperandMatchResultTy parsePostIdxReg(OperandVector &);
   OperandMatchResultTy parseAM3Offset(OperandVector &);
@@ -318,7 +327,7 @@ class ARMAsmParser : public MCTargetAsmParser {
   void cvtThumbBranches(MCInst &Inst, const OperandVector &);
 
   bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
-  bool processInstruction(MCInst &Inst, const OperandVector &Ops);
+  bool processInstruction(MCInst &Inst, const OperandVector &Ops, MCStreamer &Out);
   bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands);
   bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
 
@@ -400,6 +409,7 @@ class ARMOperand : public MCParsedAsmOperand {
     k_ShiftedImmediate,
     k_ShifterImmediate,
     k_RotateImmediate,
+    k_ModifiedImmediate,
     k_BitfieldDescriptor,
     k_Token
   } Kind;
@@ -511,6 +521,11 @@ class ARMOperand : public MCParsedAsmOperand {
     unsigned Imm;
   };
 
+  struct ModImmOp {
+    unsigned Bits;
+    unsigned Rot;
+  };
+
   struct BitfieldOp {
     unsigned LSB;
     unsigned Width;
@@ -537,6 +552,7 @@ class ARMOperand : public MCParsedAsmOperand {
     struct RegShiftedRegOp RegShiftedReg;
     struct RegShiftedImmOp RegShiftedImm;
     struct RotImmOp RotImm;
+    struct ModImmOp ModImm;
     struct BitfieldOp Bitfield;
   };
 
@@ -612,6 +628,9 @@ public:
     case k_RotateImmediate:
       RotImm = o.RotImm;
       break;
+    case k_ModifiedImmediate:
+      ModImm = o.ModImm;
+      break;
     case k_BitfieldDescriptor:
       Bitfield = o.Bitfield;
       break;
@@ -1020,33 +1039,17 @@ public:
   }
   bool isAdrLabel() const {
     // If we have an immediate that's not a constant, treat it as a label
-    // reference needing a fixup. If it is a constant, but it can't fit 
-    // into shift immediate encoding, we reject it.
-    if (isImm() && !isa<MCConstantExpr>(getImm())) return true;
-    else return (isARMSOImm() || isARMSOImmNeg());
-  }
-  bool isARMSOImm() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return ARM_AM::getSOImmVal(Value) != -1;
-  }
-  bool isARMSOImmNot() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return ARM_AM::getSOImmVal(~Value) != -1;
-  }
-  bool isARMSOImmNeg() const {
+    // reference needing a fixup.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+
+    // If it is a constant, it must fit into a modified immediate encoding.
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
     int64_t Value = CE->getValue();
-    // Only use this when not representable as a plain so_imm.
-    return ARM_AM::getSOImmVal(Value) == -1 &&
-      ARM_AM::getSOImmVal(-Value) != -1;
+    return (ARM_AM::getSOImmVal(Value) != -1 ||
+            ARM_AM::getSOImmVal(-Value) != -1);;
   }
   bool isT2SOImm() const {
     if (!isImm()) return false;
@@ -1091,6 +1094,22 @@ public:
   bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
   bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
   bool isRotImm() const { return Kind == k_RotateImmediate; }
+  bool isModImm() const { return Kind == k_ModifiedImmediate; }
+  bool isModImmNot() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getSOImmVal(~Value) != -1;
+  }
+  bool isModImmNeg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getSOImmVal(Value) == -1 &&
+      ARM_AM::getSOImmVal(-Value) != -1;
+  }
   bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
   bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
   bool isPostIdxReg() const {
@@ -1826,6 +1845,30 @@ public:
     Inst.addOperand(MCOperand::CreateImm(RotImm.Imm >> 3));
   }
 
+  void addModImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    // Support for fixups (MCFixup)
+    if (isImm())
+      return addImmOperands(Inst, N);
+
+    Inst.addOperand(MCOperand::CreateImm(ModImm.Bits | (ModImm.Rot << 7)));
+  }
+
+  void addModImmNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint32_t Enc = ARM_AM::getSOImmVal(~CE->getValue());
+    Inst.addOperand(MCOperand::CreateImm(Enc));
+  }
+
+  void addModImmNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint32_t Enc = ARM_AM::getSOImmVal(-CE->getValue());
+    Inst.addOperand(MCOperand::CreateImm(Enc));
+  }
+
   void addBitfieldOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     // Munge the lsb/width into a bitfield mask.
@@ -1982,22 +2025,6 @@ public:
     Inst.addOperand(MCOperand::CreateImm(Memory.OffsetImm->getValue()));
   }
 
-  void addARMSOImmNotOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    // The operand is actually a so_imm, but we have its bitwise
-    // negation in the assembly source, so twiddle it here.
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    Inst.addOperand(MCOperand::CreateImm(~CE->getValue()));
-  }
-
-  void addARMSOImmNegOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    // The operand is actually a so_imm, but we have its
-    // negation in the assembly source, so twiddle it here.
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    Inst.addOperand(MCOperand::CreateImm(-CE->getValue()));
-  }
-
   void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt())));
@@ -2630,6 +2657,16 @@ public:
     return Op;
   }
 
+  static std::unique_ptr<ARMOperand> CreateModImm(unsigned Bits, unsigned Rot,
+                                                  SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_ModifiedImmediate);
+    Op->ModImm.Bits = Bits;
+    Op->ModImm.Rot = Rot;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   static std::unique_ptr<ARMOperand>
   CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) {
     auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor);
@@ -2883,6 +2920,10 @@ void ARMOperand::print(raw_ostream &OS) const {
   case k_RotateImmediate:
     OS << "<ror " << " #" << (RotImm.Imm * 8) << ">";
     break;
+  case k_ModifiedImmediate:
+    OS << "<mod_imm #" << ModImm.Bits << ", #"
+       <<  ModImm.Rot << ")>";
+    break;
   case k_BitfieldDescriptor:
     OS << "<bitfield " << "lsb: " << Bitfield.LSB
        << ", width: " << Bitfield.Width << ">";
@@ -4339,6 +4380,123 @@ ARMAsmParser::parseRotImm(OperandVector &Operands) {
 }
 
 ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseModImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  MCAsmLexer &Lexer = getLexer();
+  int64_t Imm1, Imm2;
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  // 1) A mod_imm operand can appear in the place of a register name:
+  //   add r0, #mod_imm
+  //   add r0, r0, #mod_imm
+  // to correctly handle the latter, we bail out as soon as we see an
+  // identifier.
+  //
+  // 2) Similarly, we do not want to parse into complex operands:
+  //   mov r0, #mod_imm
+  //   mov r0, :lower16:(_foo)
+  if (Parser.getTok().is(AsmToken::Identifier) ||
+      Parser.getTok().is(AsmToken::Colon))
+    return MatchOperand_NoMatch;
+
+  // Hash (dollar) is optional as per the ARMARM
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar)) {
+    // Avoid parsing into complex operands (#:)
+    if (Lexer.peekTok().is(AsmToken::Colon))
+      return MatchOperand_NoMatch;
+
+    // Eat the hash (dollar)
+    Parser.Lex();
+  }
+
+  SMLoc Sx1, Ex1;
+  Sx1 = Parser.getTok().getLoc();
+  const MCExpr *Imm1Exp;
+  if (getParser().parseExpression(Imm1Exp, Ex1)) {
+    Error(Sx1, "malformed expression");
+    return MatchOperand_ParseFail;
+  }
+
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm1Exp);
+
+  if (CE) {
+    // Immediate must fit within 32-bits
+    Imm1 = CE->getValue();
+    int Enc = ARM_AM::getSOImmVal(Imm1);
+    if (Enc != -1 && Parser.getTok().is(AsmToken::EndOfStatement)) {
+      // We have a match!
+      Operands.push_back(ARMOperand::CreateModImm((Enc & 0xFF),
+                                                  (Enc & 0xF00) >> 7,
+                                                  Sx1, Ex1));
+      return MatchOperand_Success;
+    }
+
+    // We have parsed an immediate which is not for us, fallback to a plain
+    // immediate. This can happen for instruction aliases. For an example,
+    // ARMInstrInfo.td defines the alias [mov <-> mvn] which can transform
+    // a mov (mvn) with a mod_imm_neg/mod_imm_not operand into the opposite
+    // instruction with a mod_imm operand. The alias is defined such that the
+    // parser method is shared, that's why we have to do this here.
+    if (Parser.getTok().is(AsmToken::EndOfStatement)) {
+      Operands.push_back(ARMOperand::CreateImm(Imm1Exp, Sx1, Ex1));
+      return MatchOperand_Success;
+    }
+  } else {
+    // Operands like #(l1 - l2) can only be evaluated at a later stage (via an
+    // MCFixup). Fallback to a plain immediate.
+    Operands.push_back(ARMOperand::CreateImm(Imm1Exp, Sx1, Ex1));
+    return MatchOperand_Success;
+  }
+
+  // From this point onward, we expect the input to be a (#bits, #rot) pair
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(Sx1, "expected modified immediate operand: #[0, 255], #even[0-30]");
+    return MatchOperand_ParseFail;
+  }
+
+  if (Imm1 & ~0xFF) {
+    Error(Sx1, "immediate operand must a number in the range [0, 255]");
+    return MatchOperand_ParseFail;
+  }
+
+  // Eat the comma
+  Parser.Lex();
+
+  // Repeat for #rot
+  SMLoc Sx2, Ex2;
+  Sx2 = Parser.getTok().getLoc();
+
+  // Eat the optional hash (dollar)
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar))
+    Parser.Lex();
+
+  const MCExpr *Imm2Exp;
+  if (getParser().parseExpression(Imm2Exp, Ex2)) {
+    Error(Sx2, "malformed expression");
+    return MatchOperand_ParseFail;
+  }
+
+  CE = dyn_cast<MCConstantExpr>(Imm2Exp);
+
+  if (CE) {
+    Imm2 = CE->getValue();
+    if (!(Imm2 & ~0x1E)) {
+      // We have a match!
+      Operands.push_back(ARMOperand::CreateModImm(Imm1, Imm2, S, Ex2));
+      return MatchOperand_Success;
+    }
+    Error(Sx2, "immediate operand must an even number in the range [0, 30]");
+    return MatchOperand_ParseFail;
+  } else {
+    Error(Sx2, "constant expression expected");
+    return MatchOperand_ParseFail;
+  }
+}
+
+ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseBitfield(OperandVector &Operands) {
   MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
@@ -5091,15 +5249,52 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
     return true;
   }
 
+  enum {
+    COFF = (1 << MCObjectFileInfo::IsCOFF),
+    ELF = (1 << MCObjectFileInfo::IsELF),
+    MACHO = (1 << MCObjectFileInfo::IsMachO)
+  };
+  static const struct PrefixEntry {
+    const char *Spelling;
+    ARMMCExpr::VariantKind VariantKind;
+    uint8_t SupportedFormats;
+  } PrefixEntries[] = {
+    { "lower16", ARMMCExpr::VK_ARM_LO16, COFF | ELF | MACHO },
+    { "upper16", ARMMCExpr::VK_ARM_HI16, COFF | ELF | MACHO },
+  };
+
   StringRef IDVal = Parser.getTok().getIdentifier();
-  if (IDVal == "lower16") {
-    RefKind = ARMMCExpr::VK_ARM_LO16;
-  } else if (IDVal == "upper16") {
-    RefKind = ARMMCExpr::VK_ARM_HI16;
-  } else {
+
+  const auto &Prefix =
+      std::find_if(std::begin(PrefixEntries), std::end(PrefixEntries),
+                   [&IDVal](const PrefixEntry &PE) {
+                      return PE.Spelling == IDVal;
+                   });
+  if (Prefix == std::end(PrefixEntries)) {
     Error(Parser.getTok().getLoc(), "unexpected prefix in operand");
     return true;
   }
+
+  uint8_t CurrentFormat;
+  switch (getContext().getObjectFileInfo()->getObjectFileType()) {
+  case MCObjectFileInfo::IsMachO:
+    CurrentFormat = MACHO;
+    break;
+  case MCObjectFileInfo::IsELF:
+    CurrentFormat = ELF;
+    break;
+  case MCObjectFileInfo::IsCOFF:
+    CurrentFormat = COFF;
+    break;
+  }
+
+  if (~Prefix->SupportedFormats & CurrentFormat) {
+    Error(Parser.getTok().getLoc(),
+          "cannot represent relocation in the current file format");
+    return true;
+  }
+
+  RefKind = Prefix->VariantKind;
   Parser.Lex();
 
   if (getLexer().isNot(AsmToken::Colon)) {
@@ -5107,6 +5302,7 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
     return true;
   }
   Parser.Lex(); // Eat the last ':'
+
   return false;
 }
 
@@ -5139,7 +5335,8 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
       Mnemonic == "fmuls" || Mnemonic == "vmaxnm" || Mnemonic == "vminnm" ||
       Mnemonic == "vcvta" || Mnemonic == "vcvtn"  || Mnemonic == "vcvtp" ||
       Mnemonic == "vcvtm" || Mnemonic == "vrinta" || Mnemonic == "vrintn" ||
-      Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic.startswith("vsel"))
+      Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" ||
+      Mnemonic.startswith("vsel"))
     return Mnemonic;
 
   // First, split out any predication code. Ignore mnemonics we know aren't
@@ -5244,7 +5441,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
       Mnemonic == "vmaxnm" || Mnemonic == "vminnm" || Mnemonic == "vcvta" ||
       Mnemonic == "vcvtn" || Mnemonic == "vcvtp" || Mnemonic == "vcvtm" ||
       Mnemonic == "vrinta" || Mnemonic == "vrintn" || Mnemonic == "vrintp" ||
-      Mnemonic == "vrintm" || Mnemonic.startswith("aes") ||
+      Mnemonic == "vrintm" || Mnemonic.startswith("aes") || Mnemonic == "hvc" ||
       Mnemonic.startswith("sha1") || Mnemonic.startswith("sha256") ||
       (FullInst.startswith("vmull") && FullInst.endswith(".p64"))) {
     // These mnemonics are never predicable
@@ -5282,7 +5479,7 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
   // conditionally adding the cc_out in the first place because we need
   // to check the type of the parsed immediate operand.
   if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() &&
-      !static_cast<ARMOperand &>(*Operands[4]).isARMSOImm() &&
+      !static_cast<ARMOperand &>(*Operands[4]).isModImm() &&
       static_cast<ARMOperand &>(*Operands[4]).isImm0_65535Expr() &&
       static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
     return true;
@@ -5823,6 +6020,50 @@ static bool instIsBreakpoint(const MCInst &Inst) {
 
 }
 
+bool ARMAsmParser::validatetLDMRegList(MCInst Inst,
+                                       const OperandVector &Operands,
+                                       unsigned ListNo, bool IsARPop) {
+  const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]);
+  bool HasWritebackToken = Op.isToken() && Op.getToken() == "!";
+
+  bool ListContainsSP = listContainsReg(Inst, ListNo, ARM::SP);
+  bool ListContainsLR = listContainsReg(Inst, ListNo, ARM::LR);
+  bool ListContainsPC = listContainsReg(Inst, ListNo, ARM::PC);
+
+  if (!IsARPop && ListContainsSP)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "SP may not be in the register list");
+  else if (ListContainsPC && ListContainsLR)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "PC and LR may not be in the register list simultaneously");
+  else if (inITBlock() && !lastInITBlock() && ListContainsPC)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "instruction must be outside of IT block or the last "
+                 "instruction in an IT block");
+  return false;
+}
+
+bool ARMAsmParser::validatetSTMRegList(MCInst Inst,
+                                       const OperandVector &Operands,
+                                       unsigned ListNo) {
+  const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]);
+  bool HasWritebackToken = Op.isToken() && Op.getToken() == "!";
+
+  bool ListContainsSP = listContainsReg(Inst, ListNo, ARM::SP);
+  bool ListContainsPC = listContainsReg(Inst, ListNo, ARM::PC);
+
+  if (ListContainsSP && ListContainsPC)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "SP and PC may not be in the register list");
+  else if (ListContainsSP)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "SP may not be in the register list");
+  else if (ListContainsPC)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "PC may not be in the register list");
+  return false;
+}
+
 // FIXME: We would really like to be able to tablegen'erate this.
 bool ARMAsmParser::validateInstruction(MCInst &Inst,
                                        const OperandVector &Operands) {
@@ -6006,9 +6247,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
       return Error(Operands[3]->getStartLoc(),
                    "writeback operator '!' not allowed when base register "
                    "in register list");
-    if (listContainsReg(Inst, 3 + HasWritebackToken, ARM::SP))
-      return Error(Operands[3 + HasWritebackToken]->getStartLoc(),
-                   "SP not allowed in register list");
+
+    if (validatetLDMRegList(Inst, Operands, 3))
+      return true;
     break;
   }
   case ARM::LDMIA_UPD:
@@ -6025,13 +6266,14 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   case ARM::t2LDMIA:
   case ARM::t2LDMDB:
+    if (validatetLDMRegList(Inst, Operands, 3))
+      return true;
+    break;
   case ARM::t2STMIA:
-  case ARM::t2STMDB: {
-    if (listContainsReg(Inst, 3, ARM::SP))
-      return Error(Operands.back()->getStartLoc(),
-                   "SP not allowed in register list");
+  case ARM::t2STMDB:
+    if (validatetSTMRegList(Inst, Operands, 3))
+      return true;
     break;
-  }
   case ARM::t2LDMIA_UPD:
   case ARM::t2LDMDB_UPD:
   case ARM::t2STMIA_UPD:
@@ -6040,9 +6282,13 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
       return Error(Operands.back()->getStartLoc(),
                    "writeback register not allowed in register list");
 
-    if (listContainsReg(Inst, 4, ARM::SP))
-      return Error(Operands.back()->getStartLoc(),
-                   "SP not allowed in register list");
+    if (Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
+      if (validatetLDMRegList(Inst, Operands, 3))
+        return true;
+    } else {
+      if (validatetSTMRegList(Inst, Operands, 3))
+        return true;
+    }
     break;
   }
   case ARM::sysLDMIA_UPD:
@@ -6087,6 +6333,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         !isThumbTwo())
       return Error(Operands[2]->getStartLoc(),
                    "registers must be in range r0-r7 or pc");
+    if (validatetLDMRegList(Inst, Operands, 2, !isMClass()))
+      return true;
     break;
   }
   case ARM::tPUSH: {
@@ -6095,6 +6343,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         !isThumbTwo())
       return Error(Operands[2]->getStartLoc(),
                    "registers must be in range r0-r7 or lr");
+    if (validatetSTMRegList(Inst, Operands, 2))
+      return true;
     break;
   }
   case ARM::tSTMIA_UPD: {
@@ -6111,9 +6361,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
       return Error(Operands[4]->getStartLoc(),
                    "writeback operator '!' not allowed when base register "
                    "in register list");
-    if (listContainsReg(Inst, 4, ARM::SP) && !inITBlock())
-      return Error(Operands.back()->getStartLoc(),
-                   "SP not allowed in register list");
+
+    if (validatetSTMRegList(Inst, Operands, 4))
+      return true;
     break;
   }
   case ARM::tADDrSP: {
@@ -6434,7 +6684,8 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
 }
 
 bool ARMAsmParser::processInstruction(MCInst &Inst,
-                                      const OperandVector &Operands) {
+                                      const OperandVector &Operands,
+                                      MCStreamer &Out) {
   switch (Inst.getOpcode()) {
   // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
   case ARM::LDRT_POST:
@@ -6475,12 +6726,35 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
   // Alias for alternate form of 'ADR Rd, #imm' instruction.
   case ARM::ADDri: {
     if (Inst.getOperand(1).getReg() != ARM::PC ||
-        Inst.getOperand(5).getReg() != 0)
+        Inst.getOperand(5).getReg() != 0 ||
+        !(Inst.getOperand(2).isExpr() || Inst.getOperand(2).isImm()))
       return false;
     MCInst TmpInst;
     TmpInst.setOpcode(ARM::ADR);
     TmpInst.addOperand(Inst.getOperand(0));
-    TmpInst.addOperand(Inst.getOperand(2));
+    if (Inst.getOperand(2).isImm()) {
+      // Immediate (mod_imm) will be in its encoded form, we must unencode it
+      // before passing it to the ADR instruction.
+      unsigned Enc = Inst.getOperand(2).getImm();
+      TmpInst.addOperand(MCOperand::CreateImm(
+        ARM_AM::rotr32(Enc & 0xFF, (Enc & 0xF00) >> 7)));
+    } else {
+      // Turn PC-relative expression into absolute expression.
+      // Reading PC provides the start of the current instruction + 8 and
+      // the transform to adr is biased by that.
+      MCSymbol *Dot = getContext().CreateTempSymbol();
+      Out.EmitLabel(Dot);
+      const MCExpr *OpExpr = Inst.getOperand(2).getExpr();
+      const MCExpr *InstPC = MCSymbolRefExpr::Create(Dot,
+                                                     MCSymbolRefExpr::VK_None,
+                                                     getContext());
+      const MCExpr *Const8 = MCConstantExpr::Create(8, getContext());
+      const MCExpr *ReadPC = MCBinaryExpr::CreateAdd(InstPC, Const8,
+                                                     getContext());
+      const MCExpr *FixupAddr = MCBinaryExpr::CreateAdd(ReadPC, OpExpr,
+                                                        getContext());
+      TmpInst.addOperand(MCOperand::CreateExpr(FixupAddr));
+    }
     TmpInst.addOperand(Inst.getOperand(3));
     TmpInst.addOperand(Inst.getOperand(4));
     Inst = TmpInst;
@@ -8302,7 +8576,6 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
                                      MatchingInlineAsm);
   switch (MatchResult) {
-  default: break;
   case Match_Success:
     // Context sensitive operand constraints aren't handled by the matcher,
     // so check them here.
@@ -8320,7 +8593,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       // encoding is selected. Loop on it while changes happen so the
       // individual transformations can chain off each other. E.g.,
       // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
-      while (processInstruction(Inst, Operands))
+      while (processInstruction(Inst, Operands, Out))
         ;
 
       // Only after the instruction is fully processed, we can validate it
@@ -8732,7 +9005,7 @@ bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
 
   Parser.Lex(); // Consume the EndOfStatement
 
-  if (!RegisterReqs.insert(std::make_pair(Name, Reg)).second) {
+  if (RegisterReqs.insert(std::make_pair(Name, Reg)).first->second != Reg) {
     Error(SRegLoc, "redefinition of '" + Name + "' does not match original.");
     return false;
   }
@@ -8858,8 +9131,13 @@ bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
   if (Tag == ARMBuildAttrs::compatibility) {
     if (Parser.getTok().isNot(AsmToken::Comma))
       IsStringValue = false;
-    else
-      Parser.Lex();
+    if (Parser.getTok().isNot(AsmToken::Comma)) {
+      Error(Parser.getTok().getLoc(), "comma expected");
+      Parser.eatToEndOfStatement();
+      return false;
+    } else {
+       Parser.Lex();
+    }
   }
 
   if (IsStringValue) {
@@ -8888,38 +9166,78 @@ bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
 bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
   StringRef CPU = getParser().parseStringToEndOfStatement().trim();
   getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPU);
+
+  if (!STI.isCPUStringValid(CPU)) {
+    Error(L, "Unknown CPU name");
+    return false;
+  }
+
+  // FIXME: This switches the CPU features globally, therefore it might
+  // happen that code you would not expect to assemble will. For details
+  // see: http://llvm.org/bugs/show_bug.cgi?id=20757
+  STI.InitMCProcessorInfo(CPU, "");
+  STI.InitCPUSchedModel(CPU);
+  setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
   return false;
 }
 
 // FIXME: This is duplicated in getARMFPUFeatures() in
 // tools/clang/lib/Driver/Tools.cpp
 static const struct {
-  const unsigned Fpu;
+  const unsigned ID;
   const uint64_t Enabled;
   const uint64_t Disabled;
-} Fpus[] = {
-      {ARM::VFP, ARM::FeatureVFP2, ARM::FeatureNEON},
-      {ARM::VFPV2, ARM::FeatureVFP2, ARM::FeatureNEON},
-      {ARM::VFPV3, ARM::FeatureVFP3, ARM::FeatureNEON},
-      {ARM::VFPV3_D16, ARM::FeatureVFP3 | ARM::FeatureD16, ARM::FeatureNEON},
-      {ARM::VFPV4, ARM::FeatureVFP4, ARM::FeatureNEON},
-      {ARM::VFPV4_D16, ARM::FeatureVFP4 | ARM::FeatureD16, ARM::FeatureNEON},
-      {ARM::FPV5_D16, ARM::FeatureFPARMv8 | ARM::FeatureD16,
-       ARM::FeatureNEON | ARM::FeatureCrypto},
-      {ARM::FP_ARMV8, ARM::FeatureFPARMv8,
-       ARM::FeatureNEON | ARM::FeatureCrypto},
-      {ARM::NEON, ARM::FeatureNEON, 0},
-      {ARM::NEON_VFPV4, ARM::FeatureVFP4 | ARM::FeatureNEON, 0},
-      {ARM::NEON_FP_ARMV8, ARM::FeatureFPARMv8 | ARM::FeatureNEON,
-       ARM::FeatureCrypto},
-      {ARM::CRYPTO_NEON_FP_ARMV8,
-       ARM::FeatureFPARMv8 | ARM::FeatureNEON | ARM::FeatureCrypto, 0},
-      {ARM::SOFTVFP, 0, 0},
+} FPUs[] = {
+    {/* ID */ ARM::VFP,
+     /* Enabled */ ARM::FeatureVFP2,
+     /* Disabled */ ARM::FeatureNEON},
+    {/* ID */ ARM::VFPV2,
+     /* Enabled */ ARM::FeatureVFP2,
+     /* Disabled */ ARM::FeatureNEON},
+    {/* ID */ ARM::VFPV3,
+     /* Enabled */ ARM::FeatureVFP2 | ARM::FeatureVFP3,
+     /* Disabled */ ARM::FeatureNEON | ARM::FeatureD16},
+    {/* ID */ ARM::VFPV3_D16,
+     /* Enable */ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureD16,
+     /* Disabled */ ARM::FeatureNEON},
+    {/* ID */ ARM::VFPV4,
+     /* Enabled */ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4,
+     /* Disabled */ ARM::FeatureNEON | ARM::FeatureD16},
+    {/* ID */ ARM::VFPV4_D16,
+     /* Enabled */ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 |
+         ARM::FeatureD16,
+     /* Disabled */ ARM::FeatureNEON},
+    {/* ID */ ARM::FPV5_D16,
+     /* Enabled */ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 |
+         ARM::FeatureFPARMv8 | ARM::FeatureD16,
+     /* Disabled */ ARM::FeatureNEON | ARM::FeatureCrypto},
+    {/* ID */ ARM::FP_ARMV8,
+     /* Enabled */ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 |
+         ARM::FeatureFPARMv8,
+     /* Disabled */ ARM::FeatureNEON | ARM::FeatureCrypto | ARM::FeatureD16},
+    {/* ID */ ARM::NEON,
+     /* Enabled */ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureNEON,
+     /* Disabled */ ARM::FeatureD16},
+    {/* ID */ ARM::NEON_VFPV4,
+     /* Enabled */ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 |
+         ARM::FeatureNEON,
+     /* Disabled */ ARM::FeatureD16},
+    {/* ID */ ARM::NEON_FP_ARMV8,
+     /* Enabled */ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 |
+         ARM::FeatureFPARMv8 | ARM::FeatureNEON,
+     /* Disabled */ ARM::FeatureCrypto | ARM::FeatureD16},
+    {/* ID */ ARM::CRYPTO_NEON_FP_ARMV8,
+     /* Enabled */ ARM::FeatureVFP2 | ARM::FeatureVFP3 | ARM::FeatureVFP4 |
+         ARM::FeatureFPARMv8 | ARM::FeatureNEON | ARM::FeatureCrypto,
+     /* Disabled */ ARM::FeatureD16},
+    {ARM::SOFTVFP, 0, 0},
 };
 
 /// parseDirectiveFPU
 ///  ::= .fpu str
 bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
+  SMLoc FPUNameLoc = getTok().getLoc();
   StringRef FPU = getParser().parseStringToEndOfStatement().trim();
 
   unsigned ID = StringSwitch<unsigned>(FPU)
@@ -8928,18 +9246,18 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
     .Default(ARM::INVALID_FPU);
 
   if (ID == ARM::INVALID_FPU) {
-    Error(L, "Unknown FPU name");
+    Error(FPUNameLoc, "Unknown FPU name");
     return false;
   }
 
-  for (const auto &Fpu : Fpus) {
-    if (Fpu.Fpu != ID)
+  for (const auto &Entry : FPUs) {
+    if (Entry.ID != ID)
       continue;
 
     // Need to toggle features that should be on but are off and that
     // should off but are on.
-    uint64_t Toggle = (Fpu.Enabled & ~STI.getFeatureBits()) |
-                      (Fpu.Disabled & STI.getFeatureBits());
+    uint64_t Toggle = (Entry.Enabled & ~STI.getFeatureBits()) |
+                      (Entry.Disabled & STI.getFeatureBits());
     setAvailableFeatures(ComputeAvailableFeatures(STI.ToggleFeature(Toggle)));
     break;
   }
@@ -9766,7 +10084,7 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
         if (CE->getValue() == 0)
           return Match_Success;
     break;
-  case MCK_ARMSOImm:
+  case MCK_ModImm:
     if (Op.isImm()) {
       const MCExpr *SOExpr = Op.getImm();
       int64_t Value;
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index ef65418..4d5122a 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -176,8 +176,6 @@ static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeSOImmOperand(MCInst &Inst, unsigned Val,
-                               uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
@@ -405,6 +403,28 @@ static MCDisassembler *createThumbDisassembler(const Target &T,
   return new ThumbDisassembler(STI, Ctx);
 }
 
+// Post-decoding checks
+static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
+                                            uint64_t Address, raw_ostream &OS,
+                                            raw_ostream &CS,
+                                            uint32_t Insn,
+                                            DecodeStatus Result)
+{
+  switch (MI.getOpcode()) {
+    case ARM::HVC: {
+      // HVC is undefined if condition = 0xf otherwise upredictable
+      // if condition != 0xe
+      uint32_t Cond = (Insn >> 28) & 0xF;
+      if (Cond == 0xF)
+        return MCDisassembler::Fail;
+      if (Cond != 0xE)
+        return MCDisassembler::SoftFail;
+      return Result;
+    }
+    default: return Result;
+  }
+}
+
 DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                                              ArrayRef<uint8_t> Bytes,
                                              uint64_t Address, raw_ostream &OS,
@@ -430,7 +450,7 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
       decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI);
   if (Result != MCDisassembler::Fail) {
     Size = 4;
-    return Result;
+    return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn, Result);
   }
 
   // VFP and NEON instructions, similarly, are shared between ARM
@@ -1113,15 +1133,6 @@ static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeSOImmOperand(MCInst &Inst, unsigned Val,
-                               uint64_t Address, const void *Decoder) {
-  uint32_t imm = Val & 0xFF;
-  uint32_t rot = (Val & 0xF00) >> 7;
-  uint32_t rot_imm = (imm >> rot) | (imm << ((32-rot) & 0x1F));
-  Inst.addOperand(MCOperand::CreateImm(rot_imm));
-  return MCDisassembler::Success;
-}
-
 static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
@@ -4960,7 +4971,7 @@ static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, uint32_t Val,
   DecodeStatus S = MCDisassembler::Success;
 
   // Shift of "asr #32" is not allowed in Thumb2 mode.
-  if (Val == 0x20) S = MCDisassembler::SoftFail;
+  if (Val == 0x20) S = MCDisassembler::Fail;
   Inst.addOperand(MCOperand::CreateImm(Val));
   return S;
 }
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 0570084..16eea33 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -269,7 +269,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
   // expressed as a GPRPair, so we have to manually merge them.
   // FIXME: We would really like to be able to tablegen'erate this.
   case ARM::LDREXD: case ARM::STREXD:
-  case ARM::LDAEXD: case ARM::STLEXD:
+  case ARM::LDAEXD: case ARM::STLEXD: {
     const MCRegisterClass& MRC = MRI.getRegClass(ARM::GPRRegClassID);
     bool isStore = Opcode == ARM::STREXD || Opcode == ARM::STLEXD;
     unsigned Reg = MI->getOperand(isStore ? 1 : 0).getReg();
@@ -290,6 +290,23 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
       printInstruction(&NewMI, O);
       return;
     }
+    break;
+  }
+    // B9.3.3 ERET (Thumb)
+    // For a target that has Virtualization Extensions, ERET is the preferred
+    // disassembly of SUBS PC, LR, #0
+  case ARM::t2SUBS_PC_LR: {
+    if (MI->getNumOperands() == 3 &&
+        MI->getOperand(0).isImm() &&
+        MI->getOperand(0).getImm() == 0 &&
+        (getAvailableFeatures() & ARM::FeatureVirtualization)) {
+      O << "\teret";
+      printPredicateOperand(MI, 1, O);
+      printAnnotation(O, Annot);
+      return;
+    }
+    break;
+  }
   }
 
   printInstruction(MI, O);
@@ -1301,6 +1318,52 @@ void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum,
   O << markup(">");
 }
 
+void ARMInstPrinter::printModImmOperand(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  MCOperand Op = MI->getOperand(OpNum);
+
+  // Support for fixups (MCFixup)
+  if (Op.isExpr())
+    return printOperand(MI, OpNum, O);
+
+  unsigned Bits = Op.getImm() & 0xFF;
+  unsigned Rot = (Op.getImm() & 0xF00) >> 7;
+
+  bool  PrintUnsigned = false;
+  switch (MI->getOpcode()){
+  case ARM::MOVi:
+    // Movs to PC should be treated unsigned
+    PrintUnsigned = (MI->getOperand(OpNum - 1).getReg() == ARM::PC);
+    break;
+  case ARM::MSRi:
+    // Movs to special registers should be treated unsigned
+    PrintUnsigned = true;
+    break;
+  }
+
+  int32_t Rotated = ARM_AM::rotr32(Bits, Rot);
+  if (ARM_AM::getSOImmVal(Rotated) == Op.getImm()) {
+    // #rot has the least possible value
+    O << "#" << markup("<imm:");
+    if (PrintUnsigned)
+      O << static_cast<uint32_t>(Rotated);
+    else
+      O << Rotated;
+    O << markup(">");
+    return;
+  }
+
+  // Explicit #bits, #rot implied
+  O << "#"
+    << markup("<imm:")
+    << Bits
+    << markup(">")
+    << ", #"
+    << markup("<imm:")
+    << Rot
+    << markup(">");
+}
+
 void ARMInstPrinter::printFBits16(const MCInst *MI, unsigned OpNum,
                                   raw_ostream &O) {
   O << markup("<imm:")
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 09fd536..f179e01 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -131,6 +131,7 @@ public:
   void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printRotImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printGPRPairOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
   void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index f24b419..a821a6b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -51,7 +51,7 @@ ARMELFObjectWriter::~ARMELFObjectWriter() {}
 
 bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
                                                  unsigned Type) const {
-  // FIXME: This is extremelly conservative. This really needs to use a
+  // FIXME: This is extremely conservative. This really needs to use a
   // whitelist with a clear explanation for why each realocation needs to
   // point to the symbol, not to the section.
   switch (Type) {
@@ -148,6 +148,22 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
   } else {
     switch ((unsigned)Fixup.getKind()) {
     default: llvm_unreachable("invalid fixup kind!");
+    case FK_Data_1:
+      switch (Modifier) {
+      default: llvm_unreachable("unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_ABS8;
+        break;
+      }
+      break;
+    case FK_Data_2:
+      switch (Modifier) {
+      default: llvm_unreachable("unsupported modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_ABS16;
+        break;
+      }
+      break;
     case FK_Data_4:
       switch (Modifier) {
       default: llvm_unreachable("Unsupported Modifier");
@@ -184,6 +200,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
       case MCSymbolRefExpr::VK_ARM_PREL31:
         Type = ELF::R_ARM_PREL31;
         break;
+      case MCSymbolRefExpr::VK_ARM_SBREL:
+        Type = ELF::R_ARM_SBREL32;
+        break;
       case MCSymbolRefExpr::VK_ARM_TLSLDO:
         Type = ELF::R_ARM_TLS_LDO32;
         break;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 24ee537..2b65520 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -15,6 +15,7 @@
 
 #include "ARMArchName.h"
 #include "ARMFPUName.h"
+#include "ARMArchExtName.h"
 #include "ARMRegisterInfo.h"
 #include "ARMUnwindOpAsm.h"
 #include "llvm/ADT/StringExtras.h"
@@ -105,6 +106,19 @@ static unsigned GetArchDefaultCPUArch(unsigned ID) {
   return 0;
 }
 
+static const char *GetArchExtName(unsigned ID) {
+  switch (ID) {
+  default:
+    llvm_unreachable("Unknown ARCH Extension kind");
+    break;
+#define ARM_ARCHEXT_NAME(NAME, ID)                                             \
+  case ARM::ID:                                                                \
+    return NAME;
+#include "ARMArchExtName.def"
+  }
+  return nullptr;
+}
+
 namespace {
 
 class ARMELFStreamer;
@@ -134,6 +148,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer {
   void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
                             StringRef StrinValue) override;
   void emitArch(unsigned Arch) override;
+  void emitArchExtension(unsigned ArchExt) override;
   void emitObjectArch(unsigned Arch) override;
   void emitFPU(unsigned FPU) override;
   void emitInst(uint32_t Inst, char Suffix = '\0') override;
@@ -249,6 +264,9 @@ void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
 void ARMTargetAsmStreamer::emitArch(unsigned Arch) {
   OS << "\t.arch\t" << GetArchName(Arch) << "\n";
 }
+void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) {
+  OS << "\t.arch_extension\t" << GetArchExtName(ArchExt) << "\n";
+}
 void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) {
   OS << "\t.object_arch\t" << GetArchName(Arch) << '\n';
 }
@@ -300,7 +318,19 @@ private:
     StringRef StringValue;
 
     static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) {
-      return (LHS.Tag < RHS.Tag);
+      // The conformance tag must be emitted first when serialised
+      // into an object file. Specifically, the addenda to the ARM ABI
+      // states that (2.3.7.4):
+      //
+      // "To simplify recognition by consumers in the common case of
+      // claiming conformity for the whole file, this tag should be
+      // emitted first in a file-scope sub-subsection of the first
+      // public subsection of the attributes section."
+      //
+      // So it is special-cased in this comparison predicate when the
+      // attributes are sorted in finishAttributeSection().
+      return (RHS.Tag != ARMBuildAttrs::conformance) &&
+             ((LHS.Tag == ARMBuildAttrs::conformance) || (LHS.Tag < RHS.Tag));
     }
   };
 
@@ -541,6 +571,10 @@ public:
   /// necessary.
   void EmitValueImpl(const MCExpr *Value, unsigned Size,
                      const SMLoc &Loc) override {
+    if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4))
+        getContext().FatalError(Loc, "relocated expression must be 32-bit");
+
     EmitDataMappingSymbol();
     MCELFStreamer::EmitValueImpl(Value, Size);
   }
@@ -942,11 +976,8 @@ void ARMTargetELFStreamer::finishAttributeSection() {
   if (AttributeSection) {
     Streamer.SwitchSection(AttributeSection);
   } else {
-    AttributeSection =
-      Streamer.getContext().getELFSection(".ARM.attributes",
-                                          ELF::SHT_ARM_ATTRIBUTES,
-                                          0,
-                                          SectionKind::getMetadata());
+    AttributeSection = Streamer.getContext().getELFSection(
+        ".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0);
     Streamer.SwitchSection(AttributeSection);
 
     // Format version
@@ -979,12 +1010,12 @@ void ARMTargetELFStreamer::finishAttributeSection() {
       Streamer.EmitULEB128IntValue(item.IntValue);
       break;
     case AttributeItem::TextAttribute:
-      Streamer.EmitBytes(item.StringValue.upper());
+      Streamer.EmitBytes(item.StringValue);
       Streamer.EmitIntValue(0, 1); // '\0'
       break;
     case AttributeItem::NumericAndTextAttributes:
       Streamer.EmitULEB128IntValue(item.IntValue);
-      Streamer.EmitBytes(item.StringValue.upper());
+      Streamer.EmitBytes(item.StringValue);
       Streamer.EmitIntValue(0, 1); // '\0'
       break;
     }
@@ -1053,11 +1084,11 @@ inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
   // Get .ARM.extab or .ARM.exidx section
   const MCSectionELF *EHSection = nullptr;
   if (const MCSymbol *Group = FnSection.getGroup()) {
-    EHSection = getContext().getELFSection(
-      EHSecName, Type, Flags | ELF::SHF_GROUP, Kind,
-      FnSection.getEntrySize(), Group->getName());
+    EHSection =
+        getContext().getELFSection(EHSecName, Type, Flags | ELF::SHF_GROUP,
+                                   FnSection.getEntrySize(), Group->getName());
   } else {
-    EHSection = getContext().getELFSection(EHSecName, Type, Flags, Kind);
+    EHSection = getContext().getELFSection(EHSecName, Type, Flags);
   }
   assert(EHSection && "Failed to get the required EH section");
 
@@ -1341,10 +1372,8 @@ MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
   return S;
 }
 
-MCStreamer *createARMNullStreamer(MCContext &Ctx) {
-  MCStreamer *S = llvm::createNullStreamer(Ctx);
-  new ARMTargetStreamer(*S);
-  return S;
+MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S) {
+  return new ARMTargetStreamer(S);
 }
 
 MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 1d82099..66a1618 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMMCAsmInfo.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Support/CommandLine.h"
 
 using namespace llvm;
 
@@ -89,6 +89,7 @@ ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() {
   AlignmentIsInBytes = false;
 
   PrivateGlobalPrefix = "$M";
+  PrivateLabelPrefix = "$M";
 }
 
 void ARMCOFFMCAsmInfoGNU::anchor() { }
@@ -101,6 +102,7 @@ ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() {
   Code16Directive = ".code\t16";
   Code32Directive = ".code\t32";
   PrivateGlobalPrefix = ".L";
+  PrivateLabelPrefix = ".L";
 
   SupportsDebugInformation = true;
   ExceptionsType = ExceptionHandling::None;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index f1fef41..6cb4715 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -21,7 +21,8 @@
 namespace llvm {
 
   class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
-    void anchor() override;
+    virtual void anchor();
+
   public:
     explicit ARMMCAsmInfoDarwin(StringRef TT);
   };
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index b8ee555..efbebd3 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -37,8 +37,8 @@ STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
 
 namespace {
 class ARMMCCodeEmitter : public MCCodeEmitter {
-  ARMMCCodeEmitter(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
-  void operator=(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  ARMMCCodeEmitter(const ARMMCCodeEmitter &) = delete;
+  void operator=(const ARMMCCodeEmitter &) = delete;
   const MCInstrInfo &MCII;
   const MCContext &CTX;
   bool IsLittleEndian;
@@ -304,6 +304,28 @@ public:
     return Binary;
   }
 
+  unsigned getModImmOpValue(const MCInst &MI, unsigned Op,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &ST) const {
+    const MCOperand &MO = MI.getOperand(Op);
+
+    // Support for fixups (MCFixup)
+    if (MO.isExpr()) {
+      const MCExpr *Expr = MO.getExpr();
+      // In instruction code this value always encoded as lowest 12 bits,
+      // so we don't have to perform any specific adjustments.
+      // Due to requirements of relocatable records we have to use FK_Data_4.
+      // See ARMELFObjectWriter::ExplicitRelSym and
+      //     ARMELFObjectWriter::GetRelocTypeInner for more details.
+      MCFixupKind Kind = MCFixupKind(FK_Data_4);
+      Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+      return 0;
+    }
+
+    // Immediate is already in its encoded format
+    return MO.getImm();
+  }
+
   /// getT2SOImmOpValue - Return an encoded 12-bit shifted-immediate value.
   unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op,
                            SmallVectorImpl<MCFixup> &Fixups,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 98190ba..8c19785 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -64,10 +64,60 @@ static bool getMCRDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
 }
 
 static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
-                                  std::string &Info) {
-  if (STI.getFeatureBits() & llvm::ARM::HasV8Ops &&
-      MI.getOperand(1).isImm() && MI.getOperand(1).getImm() != 8) {
-    Info = "applying IT instruction to more than one subsequent instruction is deprecated";
+                                 std::string &Info) {
+  if (STI.getFeatureBits() & llvm::ARM::HasV8Ops && MI.getOperand(1).isImm() &&
+      MI.getOperand(1).getImm() != 8) {
+    Info = "applying IT instruction to more than one subsequent instruction is "
+           "deprecated";
+    return true;
+  }
+
+  return false;
+}
+
+static bool getARMStoreDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+                                       std::string &Info) {
+  assert((~STI.getFeatureBits() & llvm::ARM::ModeThumb) &&
+         "cannot predicate thumb instructions");
+
+  assert(MI.getNumOperands() >= 4 && "expected >= 4 arguments");
+  for (unsigned OI = 4, OE = MI.getNumOperands(); OI < OE; ++OI) {
+    assert(MI.getOperand(OI).isReg() && "expected register");
+    if (MI.getOperand(OI).getReg() == ARM::SP ||
+        MI.getOperand(OI).getReg() == ARM::PC) {
+      Info = "use of SP or PC in the list is deprecated";
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool getARMLoadDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+                                      std::string &Info) {
+  assert((~STI.getFeatureBits() & llvm::ARM::ModeThumb) &&
+         "cannot predicate thumb instructions");
+
+  assert(MI.getNumOperands() >= 4 && "expected >= 4 arguments");
+  bool ListContainsPC = false, ListContainsLR = false;
+  for (unsigned OI = 4, OE = MI.getNumOperands(); OI < OE; ++OI) {
+    assert(MI.getOperand(OI).isReg() && "expected register");
+    switch (MI.getOperand(OI).getReg()) {
+    default:
+      break;
+    case ARM::LR:
+      ListContainsLR = true;
+      break;
+    case ARM::PC:
+      ListContainsPC = true;
+      break;
+    case ARM::SP:
+      Info = "use of SP in the list is deprecated";
+      return true;
+    }
+  }
+
+  if (ListContainsPC && ListContainsLR) {
+    Info = "use of LR and PC simultaneously in the list is deprecated";
     return true;
   }
 
@@ -405,11 +455,15 @@ extern "C" void LLVMInitializeARMTargetMC() {
   TargetRegistry::RegisterAsmStreamer(TheThumbLETarget, createMCAsmStreamer);
   TargetRegistry::RegisterAsmStreamer(TheThumbBETarget, createMCAsmStreamer);
 
-  // Register the null streamer.
-  TargetRegistry::RegisterNullStreamer(TheARMLETarget, createARMNullStreamer);
-  TargetRegistry::RegisterNullStreamer(TheARMBETarget, createARMNullStreamer);
-  TargetRegistry::RegisterNullStreamer(TheThumbLETarget, createARMNullStreamer);
-  TargetRegistry::RegisterNullStreamer(TheThumbBETarget, createARMNullStreamer);
+  // Register the null TargetStreamer.
+  TargetRegistry::RegisterNullTargetStreamer(TheARMLETarget,
+                                             createARMNullTargetStreamer);
+  TargetRegistry::RegisterNullTargetStreamer(TheARMBETarget,
+                                             createARMNullTargetStreamer);
+  TargetRegistry::RegisterNullTargetStreamer(TheThumbLETarget,
+                                             createARMNullTargetStreamer);
+  TargetRegistry::RegisterNullTargetStreamer(TheThumbBETarget,
+                                             createARMNullTargetStreamer);
 
   // Register the MCInstPrinter.
   TargetRegistry::RegisterMCInstPrinter(TheARMLETarget, createARMMCInstPrinter);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index a6c20d5..c17e959 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -29,6 +29,7 @@ class MCRegisterInfo;
 class MCSubtargetInfo;
 class MCStreamer;
 class MCRelocationInfo;
+class MCTargetStreamer;
 class StringRef;
 class Target;
 class raw_ostream;
@@ -51,7 +52,7 @@ MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
                                 MCInstPrinter *InstPrint, MCCodeEmitter *CE,
                                 MCAsmBackend *TAB, bool ShowInst);
 
-MCStreamer *createARMNullStreamer(MCContext &Ctx);
+MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S);
 
 MCCodeEmitter *createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
                                         const MCRegisterInfo &MRI,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 7da5003..3187d36 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -54,10 +54,10 @@ public:
     : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
                                /*UseAggressiveSymbolFolding=*/true) {}
 
-  void RecordRelocation(MachObjectWriter *Writer,
-                        const MCAssembler &Asm, const MCAsmLayout &Layout,
-                        const MCFragment *Fragment, const MCFixup &Fixup,
-                        MCValue Target, uint64_t &FixedValue) override;
+  void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override;
 };
 }
 
@@ -232,7 +232,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
                    (IsPCRel               << 30) |
                    MachO::R_SCATTERED);
     MRE.r_word1 = Value2;
-    Writer->addRelocation(Fragment->getParent(), MRE);
+    Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
   }
 
   MachO::any_relocation_info MRE;
@@ -243,7 +243,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
                  (IsPCRel     << 30) |
                  MachO::R_SCATTERED);
   MRE.r_word1 = Value;
-  Writer->addRelocation(Fragment->getParent(), MRE);
+  Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
 }
 
 void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
@@ -297,7 +297,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
                    (IsPCRel               << 30) |
                    MachO::R_SCATTERED);
     MRE.r_word1 = Value2;
-    Writer->addRelocation(Fragment->getParent(), MRE);
+    Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
   }
 
   MachO::any_relocation_info MRE;
@@ -307,7 +307,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
                  (IsPCRel     << 30) |
                  MachO::R_SCATTERED);
   MRE.r_word1 = Value;
-  Writer->addRelocation(Fragment->getParent(), MRE);
+  Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
 }
 
 bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
@@ -351,11 +351,10 @@ bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
 }
 
 void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
-                                           const MCAssembler &Asm,
+                                           MCAssembler &Asm,
                                            const MCAsmLayout &Layout,
                                            const MCFragment *Fragment,
-                                           const MCFixup &Fixup,
-                                           MCValue Target,
+                                           const MCFixup &Fixup, MCValue Target,
                                            uint64_t &FixedValue) {
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
   unsigned Log2Size;
@@ -401,8 +400,8 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
   // See <reloc.h>.
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
   unsigned Index = 0;
-  unsigned IsExtern = 0;
   unsigned Type = 0;
+  const MCSymbolData *RelSymbol = nullptr;
 
   if (Target.isAbsolute()) { // constant
     // FIXME!
@@ -422,8 +421,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
     // Check whether we need an external or internal relocation.
     if (requiresExternRelocation(Writer, Asm, *Fragment, RelocType, SD,
                                  FixedValue)) {
-      IsExtern = 1;
-      Index = SD->getIndex();
+      RelSymbol = SD;
 
       // For external relocations, make sure to offset the fixup value to
       // compensate for the addend of the symbol address, if it was
@@ -447,11 +445,8 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
   // struct relocation_info (8 bytes)
   MachO::any_relocation_info MRE;
   MRE.r_word0 = FixupOffset;
-  MRE.r_word1 = ((Index     <<  0) |
-                 (IsPCRel   << 24) |
-                 (Log2Size  << 25) |
-                 (IsExtern  << 27) |
-                 (Type      << 28));
+  MRE.r_word1 =
+      (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
 
   // Even when it's not a scattered relocation, movw/movt always uses
   // a PAIR relocation.
@@ -476,10 +471,10 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
                        (Log2Size              << 25) |
                        (MachO::ARM_RELOC_PAIR << 28));
 
-    Writer->addRelocation(Fragment->getParent(), MREPair);
+    Writer->addRelocation(nullptr, Fragment->getParent(), MREPair);
   }
 
-  Writer->addRelocation(Fragment->getParent(), MRE);
+  Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
 }
 
 MCObjectWriter *llvm::createARMMachObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index 8acd7af..b680db5 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -63,6 +63,7 @@ void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute,
                                              unsigned IntValue,
                                              StringRef StringValue) {}
 void ARMTargetStreamer::emitArch(unsigned Arch) {}
+void ARMTargetStreamer::emitArchExtension(unsigned ArchExt) {}
 void ARMTargetStreamer::emitObjectArch(unsigned Arch) {}
 void ARMTargetStreamer::emitFPU(unsigned FPU) {}
 void ARMTargetStreamer::finishAttributeSection() {}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index d31f1f4..2fd6445 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -8,7 +8,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/MCWinCOFFObjectWriter.h"
 #include "llvm/Support/COFF.h"
@@ -26,14 +29,16 @@ public:
   virtual ~ARMWinCOFFObjectWriter() { }
 
   unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
-                        bool IsCrossSection) const override;
+                        bool IsCrossSection,
+                        const MCAsmBackend &MAB) const override;
 
   bool recordRelocation(const MCFixup &) const override;
 };
 
 unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target,
                                               const MCFixup &Fixup,
-                                              bool IsCrossSection) const {
+                                              bool IsCrossSection,
+                                              const MCAsmBackend &MAB) const {
   assert(getMachine() == COFF::IMAGE_FILE_MACHINE_ARMNT &&
          "AArch64 support not yet implemented");
 
@@ -41,7 +46,10 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target,
     Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
 
   switch (static_cast<unsigned>(Fixup.getKind())) {
-  default: llvm_unreachable("unsupported relocation type");
+  default: {
+    const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind());
+    report_fatal_error(Twine("unsupported relocation type: ") + Info.Name);
+  }
   case FK_Data_4:
     switch (Modifier) {
     case MCSymbolRefExpr::VK_COFF_IMGREL32:
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index 35fe9b3..51e519d 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -381,7 +381,7 @@ bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
   TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo());
   TRI = Fn.getSubtarget().getRegisterInfo();
   MRI = &Fn.getRegInfo();
-  const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
+  const ARMSubtarget *STI = &Fn.getSubtarget<ARMSubtarget>();
   isLikeA9 = STI->isLikeA9() || STI->isSwift();
   isSwift = STI->isSwift();
 
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 6deab4f..7dcc64e 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -52,9 +52,9 @@ void Thumb1FrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
   const Thumb1InstrInfo &TII =
-      *static_cast<const Thumb1InstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const Thumb1RegisterInfo *RegInfo = static_cast<const Thumb1RegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
+  const Thumb1RegisterInfo *RegInfo =
+      static_cast<const Thumb1RegisterInfo *>(STI.getRegisterInfo());
   if (!hasReservedCallFrame(MF)) {
     // If we have alloca, convert as follows:
     // ADJCALLSTACKDOWN -> sub, sp, sp, amount
@@ -89,15 +89,12 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   MachineModuleInfo &MMI = MF.getMMI();
   const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
-  const Thumb1RegisterInfo *RegInfo = static_cast<const Thumb1RegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
+  const Thumb1RegisterInfo *RegInfo =
+      static_cast<const Thumb1RegisterInfo *>(STI.getRegisterInfo());
   const Thumb1InstrInfo &TII =
-      *static_cast<const Thumb1InstrInfo *>(MF.getSubtarget().getInstrInfo());
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
 
-  unsigned Align = MF.getTarget()
-                       .getSubtargetImpl()
-                       ->getFrameLowering()
-                       ->getStackAlignment();
+  unsigned Align = STI.getFrameLowering()->getStackAlignment();
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   unsigned NumBytes = MFI->getStackSize();
   assert(NumBytes >= ArgRegsSaveSize &&
@@ -124,7 +121,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
     unsigned CFIIndex = MMI.addFrameInst(
         MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
     BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex);
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
   }
 
   if (!AFI->hasStackFrame()) {
@@ -135,7 +133,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
       BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
     }
     return;
   }
@@ -199,7 +198,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
     unsigned CFIIndex = MMI.addFrameInst(
         MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
     BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex);
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
   }
   for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
          E = CSI.end(); I != E; ++I) {
@@ -226,7 +226,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
       unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
           nullptr, MRI->getDwarfRegNum(Reg, true), MFI->getObjectOffset(FI)));
       BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
       break;
     }
   }
@@ -244,13 +245,15 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
       unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa(
           nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset));
       BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
     } else {
       unsigned CFIIndex =
           MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
               nullptr, MRI->getDwarfRegNum(FramePtr, true)));
       BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
     }
     if (NumBytes > 508)
       // If offset is > 508 then sp cannot be adjusted in a single instruction,
@@ -267,7 +270,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
       BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
     }
   }
 
@@ -324,15 +328,12 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
   DebugLoc dl = MBBI->getDebugLoc();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  const Thumb1RegisterInfo *RegInfo = static_cast<const Thumb1RegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
+  const Thumb1RegisterInfo *RegInfo =
+      static_cast<const Thumb1RegisterInfo *>(STI.getRegisterInfo());
   const Thumb1InstrInfo &TII =
-      *static_cast<const Thumb1InstrInfo *>(MF.getSubtarget().getInstrInfo());
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
 
-  unsigned Align = MF.getTarget()
-                       .getSubtargetImpl()
-                       ->getFrameLowering()
-                       ->getStackAlignment();
+  unsigned Align = STI.getFrameLowering()->getStackAlignment();
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   int NumBytes = (int)MFI->getStackSize();
   assert((unsigned)NumBytes >= ArgRegsSaveSize &&
@@ -459,8 +460,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
     return false;
 
   DebugLoc DL;
-  MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
 
   if (MI != MBB.end()) DL = MI->getDebugLoc();
 
@@ -499,7 +499,7 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
 
   MachineFunction &MF = *MBB.getParent();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
 
   bool isVarArg = AFI->getArgRegsSaveSize() > 0;
   DebugLoc DL = MI->getDebugLoc();
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 8ea912e..c24f740 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -44,7 +44,7 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   bool KillSrc) const {
   // Need to check the arch.
   MachineFunction &MF = *MBB.getParent();
-  const ARMSubtarget &st = MF.getTarget().getSubtarget<ARMSubtarget>();
+  const ARMSubtarget &st = MF.getSubtarget<ARMSubtarget>();
 
   assert(ARM::GPRRegClass.contains(DestReg, SrcReg) &&
          "Thumb1 can only copy GPR registers");
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index c10c809..5e2cbdc 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -71,7 +71,7 @@ Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
              "Thumb1 does not have ldr to high register");
 
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
   MachineConstantPool *ConstantPool = MF.getConstantPool();
   const Constant *C = ConstantInt::get(
           Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
@@ -234,7 +234,6 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
   // If we would emit the copy with an immediate of 0, just use tMOVr.
   if (CopyOpc && Bytes < CopyScale) {
     CopyOpc = ARM::tMOVr;
-    CopyBits = 0;
     CopyScale = 1;
     CopyNeedsCC = false;
     CopyRange = 0;
@@ -389,12 +388,7 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
 
 void Thumb1RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
                                            int64_t Offset) const {
-  const ARMBaseInstrInfo &TII =
-      *static_cast<const ARMBaseInstrInfo *>(MI.getParent()
-                                                 ->getParent()
-                                                 ->getTarget()
-                                                 .getSubtargetImpl()
-                                                 ->getInstrInfo());
+  const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
   int Off = Offset; // ARM doesn't need the general 64-bit offsets
   unsigned i = 0;
 
@@ -420,7 +414,7 @@ Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
   // off the frame pointer (if, for example, there are alloca() calls in
   // the function, the offset will be negative. Use R12 instead since that's
   // a call clobbered register that we know won't be used in Thumb1 mode.
-  const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
   DebugLoc DL;
   AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
     .addReg(ARM::R12, RegState::Define)
@@ -466,8 +460,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   MachineInstr &MI = *II;
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const ARMBaseInstrInfo &TII =
-      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   DebugLoc dl = MI.getDebugLoc();
   MachineInstrBuilder MIB(*MBB.getParent(), &MI);
@@ -478,8 +471,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                MF.getFrameInfo()->getStackSize() + SPAdj;
 
   if (MF.getFrameInfo()->hasVarSizedObjects()) {
-    assert(SPAdj == 0 && MF.getSubtarget().getFrameLowering()->hasFP(MF) &&
-           "Unexpected");
+    assert(SPAdj == 0 && STI.getFrameLowering()->hasFP(MF) && "Unexpected");
     // There are alloca()'s in this function, must reference off the frame
     // pointer or base pointer instead.
     if (!hasBasePointer(MF)) {
@@ -495,10 +487,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // when !hasReservedCallFrame().
 #ifndef NDEBUG
   if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){
-    assert(MF.getTarget()
-               .getSubtargetImpl()
-               ->getFrameLowering()
-               ->hasReservedCallFrame(MF) &&
+    assert(STI.getFrameLowering()->hasReservedCallFrame(MF) &&
            "Cannot use SP to access the emergency spill slot in "
            "functions without a reserved call frame");
     assert(!MF.getFrameInfo()->hasVarSizedObjects() &&
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index fdcb522..b657f2d 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -253,12 +253,12 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
 }
 
 bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
-  const TargetMachine &TM = Fn.getTarget();
+  const ARMSubtarget &STI =
+      static_cast<const ARMSubtarget &>(Fn.getSubtarget());
   AFI = Fn.getInfo<ARMFunctionInfo>();
-  TII = static_cast<const Thumb2InstrInfo *>(
-      TM.getSubtargetImpl()->getInstrInfo());
-  TRI = TM.getSubtargetImpl()->getRegisterInfo();
-  restrictIT = TM.getSubtarget<ARMSubtarget>().restrictIT();
+  TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
+  TRI = STI.getRegisterInfo();
+  restrictIT = STI.restrictIT();
 
   if (!AFI->isThumbFunction())
     return false;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 91973e1..62c3752 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -574,13 +574,10 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
       }
     } else if (AddrMode == ARMII::AddrModeT2_i8s4) {
       Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
-      NumBits = 8;
-      // MCInst operand has already scaled value.
+      NumBits = 10; // 8 bits scaled by 4
+      // MCInst operand expects already scaled value.
       Scale = 1;
-      if (Offset < 0) {
-        isSub = true;
-        Offset = -Offset;
-      }
+      assert((Offset & 3) == 0 && "Can't encode this offset!");
     } else {
       llvm_unreachable("Unsupported addressing mode!");
     }
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index c51eb8b..2ee908b 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -1001,17 +1001,12 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
 }
 
 bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
-  const TargetMachine &TM = MF.getTarget();
-  TII = static_cast<const Thumb2InstrInfo *>(
-      TM.getSubtargetImpl()->getInstrInfo());
-  STI = &TM.getSubtarget<ARMSubtarget>();
+  STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget());
+  TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo());
 
   // Optimizing / minimizing size?
-  AttributeSet FnAttrs = MF.getFunction()->getAttributes();
-  OptimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                                      Attribute::OptimizeForSize);
-  MinimizeSize =
-      FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+  OptimizeSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
+  MinimizeSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
 
   BlockInfo.clear();
   BlockInfo.resize(MF.getNumBlockIDs());
diff --git a/lib/Target/Android.mk b/lib/Target/Android.mk
index 4494eb0..1e34a85 100644
--- a/lib/Target/Android.mk
+++ b/lib/Target/Android.mk
@@ -3,7 +3,6 @@ LOCAL_PATH:= $(call my-dir)
 target_SRC_FILES := \
   Target.cpp \
   TargetIntrinsicInfo.cpp \
-  TargetLibraryInfo.cpp \
   TargetLoweringObjectFile.cpp \
   TargetMachineC.cpp \
   TargetMachine.cpp \
@@ -20,6 +19,7 @@ LOCAL_MODULE:= libLLVMTarget
 LOCAL_MODULE_TAGS := optional
 
 include $(LLVM_HOST_BUILD_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
 include $(BUILD_HOST_STATIC_LIBRARY)
 
 # For the device
@@ -34,5 +34,6 @@ LOCAL_MODULE:= libLLVMTarget
 LOCAL_MODULE_TAGS := optional
 
 include $(LLVM_DEVICE_BUILD_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
 include $(BUILD_STATIC_LIBRARY)
 endif
diff --git a/lib/Target/BPF/BPF.h b/lib/Target/BPF/BPF.h
new file mode 100644
index 0000000..4a0cb20
--- /dev/null
+++ b/lib/Target/BPF/BPF.h
@@ -0,0 +1,22 @@
+//===-- BPF.h - Top-level interface for BPF representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPF_H
+#define LLVM_LIB_TARGET_BPF_BPF_H
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class BPFTargetMachine;
+
+FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
+}
+
+#endif
diff --git a/lib/Target/BPF/BPF.td b/lib/Target/BPF/BPF.td
new file mode 100644
index 0000000..a4ce90a
--- /dev/null
+++ b/lib/Target/BPF/BPF.td
@@ -0,0 +1,31 @@
+//===-- BPF.td - Describe the BPF Target Machine -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "BPFRegisterInfo.td"
+include "BPFCallingConv.td"
+include "BPFInstrInfo.td"
+
+def BPFInstrInfo : InstrInfo;
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+
+def BPFInstPrinter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+def BPF : Target {
+  let InstructionSet = BPFInstrInfo;
+  let AssemblyWriters = [BPFInstPrinter];
+}
diff --git a/lib/Target/BPF/BPFAsmPrinter.cpp b/lib/Target/BPF/BPFAsmPrinter.cpp
new file mode 100644
index 0000000..dbc7bfe
--- /dev/null
+++ b/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -0,0 +1,87 @@
+//===-- BPFAsmPrinter.cpp - BPF LLVM assembly writer ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the BPF assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstrInfo.h"
+#include "BPFMCInstLower.h"
+#include "BPFTargetMachine.h"
+#include "InstPrinter/BPFInstPrinter.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+class BPFAsmPrinter : public AsmPrinter {
+public:
+  explicit BPFAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)) {}
+
+  const char *getPassName() const override { return "BPF Assembly Printer"; }
+
+  void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  void EmitInstruction(const MachineInstr *MI) override;
+};
+}
+
+void BPFAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                 raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << BPFInstPrinter::getRegisterName(MO.getReg());
+    break;
+
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    break;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    O << *getSymbol(MO.getGlobal());
+    break;
+
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+}
+
+void BPFAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+
+  BPFMCInstLower MCInstLowering(OutContext, *this);
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  EmitToStreamer(OutStreamer, TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeBPFAsmPrinter() {
+  RegisterAsmPrinter<BPFAsmPrinter> X(TheBPFTarget);
+}
diff --git a/lib/Target/BPF/BPFCallingConv.td b/lib/Target/BPF/BPFCallingConv.td
new file mode 100644
index 0000000..8cec6fa
--- /dev/null
+++ b/lib/Target/BPF/BPFCallingConv.td
@@ -0,0 +1,29 @@
+//===-- BPFCallingConv.td - Calling Conventions BPF --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the BPF architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// BPF 64-bit C return-value convention.
+def RetCC_BPF64 : CallingConv<[CCIfType<[i64], CCAssignToReg<[R0]>>]>;
+
+// BPF 64-bit C Calling convention.
+def CC_BPF64 : CallingConv<[
+  // Promote i8/i16/i32 args to i64
+  CCIfType<[ i8, i16, i32 ], CCPromoteToType<i64>>,
+
+  // All arguments get passed in integer registers if there is space.
+  CCIfType<[i64], CCAssignToReg<[ R1, R2, R3, R4, R5 ]>>,
+
+  // Could be assigned to the stack in 8-byte aligned units, but unsupported
+  CCAssignToStack<8, 8>
+]>;
+
+def CSR : CalleeSavedRegs<(add R6, R7, R8, R9, R10)>;
diff --git a/lib/Target/BPF/BPFFrameLowering.cpp b/lib/Target/BPF/BPFFrameLowering.cpp
new file mode 100644
index 0000000..ae9f355
--- /dev/null
+++ b/lib/Target/BPF/BPFFrameLowering.cpp
@@ -0,0 +1,39 @@
+//===-- BPFFrameLowering.cpp - BPF Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFFrameLowering.h"
+#include "BPFInstrInfo.h"
+#include "BPFSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+bool BPFFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
+
+void BPFFrameLowering::emitPrologue(MachineFunction &MF) const {}
+
+void BPFFrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {}
+
+void BPFFrameLowering::processFunctionBeforeCalleeSavedScan(
+    MachineFunction &MF, RegScavenger *RS) const {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  MRI.setPhysRegUnused(BPF::R6);
+  MRI.setPhysRegUnused(BPF::R7);
+  MRI.setPhysRegUnused(BPF::R8);
+  MRI.setPhysRegUnused(BPF::R9);
+}
diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h
new file mode 100644
index 0000000..833046d
--- /dev/null
+++ b/lib/Target/BPF/BPFFrameLowering.h
@@ -0,0 +1,41 @@
+//===-- BPFFrameLowering.h - Define frame lowering for BPF -----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements BPF-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H
+#define LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class BPFSubtarget;
+
+class BPFFrameLowering : public TargetFrameLowering {
+public:
+  explicit BPFFrameLowering(const BPFSubtarget &sti)
+      : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0) {}
+
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const override;
+
+  void
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override {
+    MBB.erase(MI);
+  }
+};
+}
+#endif
diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp
new file mode 100644
index 0000000..07f62a9
--- /dev/null
+++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -0,0 +1,159 @@
+//===-- BPFISelDAGToDAG.cpp - A dag to dag inst selector for BPF ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for BPF,
+// converting from a legalized dag to a BPF dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFRegisterInfo.h"
+#include "BPFSubtarget.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/IntrinsicInst.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-isel"
+
+// Instruction Selector Implementation
+namespace {
+
+class BPFDAGToDAGISel : public SelectionDAGISel {
+public:
+  explicit BPFDAGToDAGISel(BPFTargetMachine &TM) : SelectionDAGISel(TM) {}
+
+  const char *getPassName() const override {
+    return "BPF DAG->DAG Pattern Instruction Selection";
+  }
+
+private:
+// Include the pieces autogenerated from the target description.
+#include "BPFGenDAGISel.inc"
+
+  SDNode *Select(SDNode *N) override;
+
+  // Complex Pattern for address selection.
+  bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
+};
+}
+
+// ComplexPattern used on BPF Load/Store instructions
+bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
+  // if Address is FI, get the TargetFrameIndex.
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+    Offset = CurDAG->getTargetConstant(0, MVT::i64);
+    return true;
+  }
+
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;
+
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<32>(CN->getSExtValue())) {
+
+      // If the first operand is a FI, get the TargetFI Node
+      if (FrameIndexSDNode *FIN =
+              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i64);
+      return true;
+    }
+  }
+
+  Base   = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i64);
+  return true;
+}
+
+SDNode *BPFDAGToDAGISel::Select(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+
+  // Dump information about the Node being selected
+  DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+    return NULL;
+  }
+
+  // tablegen selection should be handled here.
+  switch (Opcode) {
+  default: break;
+
+  case ISD::UNDEF: {
+    errs() << "BUG: "; Node->dump(CurDAG); errs() << '\n';
+    report_fatal_error("shouldn't see UNDEF during Select");
+    break;
+  }
+
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    case Intrinsic::bpf_load_byte:
+    case Intrinsic::bpf_load_half:
+    case Intrinsic::bpf_load_word: {
+      SDLoc DL(Node);
+      SDValue Chain = Node->getOperand(0);
+      SDValue N1 = Node->getOperand(1);
+      SDValue Skb = Node->getOperand(2);
+      SDValue N3 = Node->getOperand(3);
+
+      SDValue R6Reg = CurDAG->getRegister(BPF::R6, MVT::i64);
+      Chain = CurDAG->getCopyToReg(Chain, DL, R6Reg, Skb, SDValue());
+      Node = CurDAG->UpdateNodeOperands(Node, Chain, N1, R6Reg, N3);
+      break;
+    }
+    }
+    break;
+  }
+
+  case ISD::FrameIndex: {
+    int FI = dyn_cast<FrameIndexSDNode>(Node)->getIndex();
+    EVT VT = Node->getValueType(0);
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
+    unsigned Opc = BPF::MOV_rr;
+    if (Node->hasOneUse())
+      return CurDAG->SelectNodeTo(Node, Opc, VT, TFI);
+    return CurDAG->getMachineNode(Opc, SDLoc(Node), VT, TFI);
+  }
+  }
+
+  // Select the default instruction
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(dbgs() << "=> ";
+        if (ResNode == nullptr || ResNode == Node)
+          Node->dump(CurDAG);
+        else
+          ResNode->dump(CurDAG);
+        dbgs() << '\n');
+  return ResNode;
+}
+
+FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
+  return new BPFDAGToDAGISel(TM);
+}
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
new file mode 100644
index 0000000..d94416b
--- /dev/null
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -0,0 +1,642 @@
+//===-- BPFISelLowering.cpp - BPF DAG Lowering Implementation  ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that BPF uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFISelLowering.h"
+#include "BPF.h"
+#include "BPFTargetMachine.h"
+#include "BPFSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-lower"
+
+namespace {
+
+// Diagnostic information for unimplemented or unsupported feature reporting.
+class DiagnosticInfoUnsupported : public DiagnosticInfo {
+private:
+  // Debug location where this diagnostic is triggered.
+  DebugLoc DLoc;
+  const Twine &Description;
+  const Function &Fn;
+  SDValue Value;
+
+  static int KindID;
+
+  static int getKindID() {
+    if (KindID == 0)
+      KindID = llvm::getNextAvailablePluginDiagnosticKind();
+    return KindID;
+  }
+
+public:
+  DiagnosticInfoUnsupported(SDLoc DLoc, const Function &Fn, const Twine &Desc,
+                            SDValue Value)
+      : DiagnosticInfo(getKindID(), DS_Error), DLoc(DLoc.getDebugLoc()),
+        Description(Desc), Fn(Fn), Value(Value) {}
+
+  void print(DiagnosticPrinter &DP) const override {
+    std::string Str;
+    raw_string_ostream OS(Str);
+
+    if (DLoc.isUnknown() == false) {
+      DILocation DIL(DLoc.getAsMDNode(Fn.getContext()));
+      StringRef Filename = DIL.getFilename();
+      unsigned Line = DIL.getLineNumber();
+      unsigned Column = DIL.getColumnNumber();
+      OS << Filename << ':' << Line << ':' << Column << ' ';
+    }
+
+    OS << "in function " << Fn.getName() << ' ' << *Fn.getFunctionType() << '\n'
+       << Description;
+    if (Value)
+      Value->print(OS);
+    OS << '\n';
+    OS.flush();
+    DP << Str;
+  }
+
+  static bool classof(const DiagnosticInfo *DI) {
+    return DI->getKind() == getKindID();
+  }
+};
+
+int DiagnosticInfoUnsupported::KindID = 0;
+}
+
+BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
+                                     const BPFSubtarget &STI)
+    : TargetLowering(TM) {
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i64, &BPF::GPRRegClass);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties(STI.getRegisterInfo());
+
+  setStackPointerRegisterToSaveRestore(BPF::R11);
+
+  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::SETCC, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  setOperationAction(ISD::MULHU, MVT::i64, Expand);
+  setOperationAction(ISD::MULHS, MVT::i64, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+
+  setOperationAction(ISD::ADDC, MVT::i64, Expand);
+  setOperationAction(ISD::ADDE, MVT::i64, Expand);
+  setOperationAction(ISD::SUBC, MVT::i64, Expand);
+  setOperationAction(ISD::SUBE, MVT::i64, Expand);
+
+  // no UNDEF allowed
+  setOperationAction(ISD::UNDEF, MVT::i64, Expand);
+
+  setOperationAction(ISD::ROTR, MVT::i64, Expand);
+  setOperationAction(ISD::ROTL, MVT::i64, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i64, Custom);
+  setOperationAction(ISD::CTLZ, MVT::i64, Custom);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand);
+
+  // Extended load operations for i1 types must be promoted
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+  }
+
+  setBooleanContents(ZeroOrOneBooleanContent);
+
+  // Function alignments (log2)
+  setMinFunctionAlignment(3);
+  setPrefFunctionAlignment(3);
+
+  // inline memcpy() for kernel to see explicit copy
+  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 128;
+  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 128;
+  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
+}
+
+SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::BR_CC:
+    return LowerBR_CC(Op, DAG);
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::SELECT_CC:
+    return LowerSELECT_CC(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+  }
+}
+
+// Calling Convention Implementation
+#include "BPFGenCallingConv.inc"
+
+SDValue BPFTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
+  switch (CallConv) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::C:
+  case CallingConv::Fast:
+    break;
+  }
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_BPF64);
+
+  for (auto &VA : ArgLocs) {
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      default: {
+        errs() << "LowerFormalArguments Unhandled argument type: "
+               << RegVT.getSimpleVT().SimpleTy << '\n';
+        llvm_unreachable(0);
+      }
+      case MVT::i64:
+        unsigned VReg = RegInfo.createVirtualRegister(&BPF::GPRRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
+
+        // If this is an 8/16/32-bit value, it is really passed promoted to 64
+        // bits. Insert an assert[sz]ext to capture this, then truncate to the
+        // right size.
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+
+        if (VA.getLocInfo() != CCValAssign::Full)
+          ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
+
+        InVals.push_back(ArgValue);
+      }
+    } else {
+      DiagnosticInfoUnsupported Err(DL, *MF.getFunction(),
+                                    "defined with too many args", SDValue());
+      DAG.getContext()->diagnose(Err);
+    }
+  }
+
+  if (IsVarArg || MF.getFunction()->hasStructRetAttr()) {
+    DiagnosticInfoUnsupported Err(
+        DL, *MF.getFunction(),
+        "functions with VarArgs or StructRet are not supported", SDValue());
+    DAG.getContext()->diagnose(Err);
+  }
+
+  return Chain;
+}
+
+SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                     SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  auto &Outs = CLI.Outs;
+  auto &OutVals = CLI.OutVals;
+  auto &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &IsTailCall = CLI.IsTailCall;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool IsVarArg = CLI.IsVarArg;
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // BPF target does not support tail call optimization.
+  IsTailCall = false;
+
+  switch (CallConv) {
+  default:
+    report_fatal_error("Unsupported calling convention");
+  case CallingConv::Fast:
+  case CallingConv::C:
+    break;
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_BPF64);
+
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  if (Outs.size() >= 6) {
+    DiagnosticInfoUnsupported Err(CLI.DL, *MF.getFunction(),
+                                  "too many args to ", Callee);
+    DAG.getContext()->diagnose(Err);
+  }
+
+  for (auto &Arg : Outs) {
+    ISD::ArgFlagsTy Flags = Arg.Flags;
+    if (!Flags.isByVal())
+      continue;
+
+    DiagnosticInfoUnsupported Err(CLI.DL, *MF.getFunction(),
+                                  "pass by value not supported ", Callee);
+    DAG.getContext()->diagnose(Err);
+  }
+
+  Chain = DAG.getCALLSEQ_START(
+      Chain, DAG.getConstant(NumBytes, getPointerTy(), true), CLI.DL);
+
+  SmallVector<std::pair<unsigned, SDValue>, 5> RegsToPass;
+
+  // Walk arg assignments
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, CLI.DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, CLI.DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, CLI.DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    // Push arguments into RegsToPass vector
+    if (VA.isRegLoc())
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    else
+      llvm_unreachable("call arg pass bug");
+  }
+
+  SDValue InFlag;
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers.  The InFlag in
+  // necessary since all emitted instructions must be stuck together.
+  for (auto &Reg : RegsToPass) {
+    Chain = DAG.getCopyToReg(Chain, CLI.DL, Reg.first, Reg.second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, getPointerTy(),
+                                        G->getOffset(), 0);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy(), 0);
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (auto &Reg : RegsToPass)
+    Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(BPFISD::CALL, CLI.DL, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(
+      Chain, DAG.getConstant(NumBytes, getPointerTy(), true),
+      DAG.getConstant(0, getPointerTy(), true), InFlag, CLI.DL);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, CLI.DL, DAG,
+                         InVals);
+}
+
+SDValue
+BPFTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                               bool IsVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               SDLoc DL, SelectionDAG &DAG) const {
+
+  // CCValAssign - represent the assignment of the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+
+  if (MF.getFunction()->getReturnType()->isAggregateType()) {
+    DiagnosticInfoUnsupported Err(DL, *MF.getFunction(),
+                                  "only integer returns supported", SDValue());
+    DAG.getContext()->diagnose(Err);
+  }
+
+  // Analize return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_BPF64);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together,
+    // avoiding something bad.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  unsigned Opc = BPFISD::RET_FLAG;
+  RetOps[0] = Chain; // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
+}
+
+SDValue BPFTargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+
+  if (Ins.size() >= 2) {
+    DiagnosticInfoUnsupported Err(DL, *MF.getFunction(),
+                                  "only small returns supported", SDValue());
+    DAG.getContext()->diagnose(Err);
+  }
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_BPF64);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (auto &Val : RVLocs) {
+    Chain = DAG.getCopyFromReg(Chain, DL, Val.getLocReg(),
+                               Val.getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+static void NegateCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC) {
+  switch (CC) {
+  default:
+    break;
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETLT:
+  case ISD::SETLE:
+    CC = ISD::getSetCCSwappedOperands(CC);
+    std::swap(LHS, RHS);
+    break;
+  }
+}
+
+SDValue BPFTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc DL(Op);
+
+  NegateCC(LHS, RHS, CC);
+
+  return DAG.getNode(BPFISD::BR_CC, DL, Op.getValueType(), Chain, LHS, RHS,
+                     DAG.getConstant(CC, MVT::i64), Dest);
+}
+
+SDValue BPFTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue TrueV = Op.getOperand(2);
+  SDValue FalseV = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDLoc DL(Op);
+
+  NegateCC(LHS, RHS, CC);
+
+  SDValue TargetCC = DAG.getConstant(CC, MVT::i64);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
+
+  return DAG.getNode(BPFISD::SELECT_CC, DL, VTs, Ops);
+}
+
+const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default:
+    return NULL;
+  case BPFISD::RET_FLAG:
+    return "BPFISD::RET_FLAG";
+  case BPFISD::CALL:
+    return "BPFISD::CALL";
+  case BPFISD::SELECT_CC:
+    return "BPFISD::SELECT_CC";
+  case BPFISD::BR_CC:
+    return "BPFISD::BR_CC";
+  case BPFISD::Wrapper:
+    return "BPFISD::Wrapper";
+  }
+}
+
+SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i64);
+
+  return DAG.getNode(BPFISD::Wrapper, DL, MVT::i64, GA);
+}
+
+MachineBasicBlock *
+BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                               MachineBasicBlock *BB) const {
+  unsigned Opc = MI->getOpcode();
+
+  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  assert(Opc == BPF::Select && "Unexpected instr type to insert");
+
+  // To "insert" a SELECT instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = BB;
+  ++I;
+
+  // ThisMBB:
+  // ...
+  //  TrueVal = ...
+  //  jmp_XX r1, r2 goto Copy1MBB
+  //  fallthrough --> Copy0MBB
+  MachineBasicBlock *ThisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *Copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *Copy1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+  F->insert(I, Copy0MBB);
+  F->insert(I, Copy1MBB);
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  Copy1MBB->splice(Copy1MBB->begin(), BB,
+                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  Copy1MBB->transferSuccessorsAndUpdatePHIs(BB);
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(Copy0MBB);
+  BB->addSuccessor(Copy1MBB);
+
+  // Insert Branch if Flag
+  unsigned LHS = MI->getOperand(1).getReg();
+  unsigned RHS = MI->getOperand(2).getReg();
+  int CC = MI->getOperand(3).getImm();
+  switch (CC) {
+  case ISD::SETGT:
+    BuildMI(BB, DL, TII.get(BPF::JSGT_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  case ISD::SETUGT:
+    BuildMI(BB, DL, TII.get(BPF::JUGT_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  case ISD::SETGE:
+    BuildMI(BB, DL, TII.get(BPF::JSGE_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  case ISD::SETUGE:
+    BuildMI(BB, DL, TII.get(BPF::JUGE_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  case ISD::SETEQ:
+    BuildMI(BB, DL, TII.get(BPF::JEQ_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  case ISD::SETNE:
+    BuildMI(BB, DL, TII.get(BPF::JNE_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  default:
+    report_fatal_error("unimplemented select CondCode " + Twine(CC));
+  }
+
+  // Copy0MBB:
+  //  %FalseValue = ...
+  //  # fallthrough to Copy1MBB
+  BB = Copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(Copy1MBB);
+
+  // Copy1MBB:
+  //  %Result = phi [ %FalseValue, Copy0MBB ], [ %TrueValue, ThisMBB ]
+  // ...
+  BB = Copy1MBB;
+  BuildMI(*BB, BB->begin(), DL, TII.get(BPF::PHI), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(5).getReg())
+      .addMBB(Copy0MBB)
+      .addReg(MI->getOperand(4).getReg())
+      .addMBB(ThisMBB);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h
new file mode 100644
index 0000000..04d7908
--- /dev/null
+++ b/lib/Target/BPF/BPFISelLowering.h
@@ -0,0 +1,89 @@
+//===-- BPFISelLowering.h - BPF DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that BPF uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFISELLOWERING_H
+#define LLVM_LIB_TARGET_BPF_BPFISELLOWERING_H
+
+#include "BPF.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace BPFISD {
+enum {
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  RET_FLAG,
+  CALL,
+  SELECT_CC,
+  BR_CC,
+  Wrapper
+};
+}
+
+class BPFTargetLowering : public TargetLowering {
+public:
+  explicit BPFTargetLowering(const TargetMachine &TM, const BPFSubtarget &STI);
+
+  // Provide custom lowering hooks for some operations.
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+  // This method returns the name of a target specific DAG node.
+  const char *getTargetNodeName(unsigned Opcode) const override;
+
+  MachineBasicBlock *
+  EmitInstrWithCustomInserter(MachineInstr *MI,
+                              MachineBasicBlock *BB) const override;
+
+private:
+  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+
+  // Lower the result values of a call, copying them out of physregs into vregs
+  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                          CallingConv::ID CallConv, bool IsVarArg,
+                          const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
+                          SelectionDAG &DAG,
+                          SmallVectorImpl<SDValue> &InVals) const;
+
+  // Lower a call into CALLSEQ_START - BPFISD:CALL - CALLSEQ_END chain
+  SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  // Lower incoming arguments, copy physregs into vregs
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool IsVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               SDLoc DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
+                      SelectionDAG &DAG) const override;
+
+  EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                          bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                          MachineFunction &MF) const override {
+    return Size >= 8 ? MVT::i64 : MVT::i32;
+  }
+
+  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                         Type *Ty) const override {
+    return true;
+  }
+};
+}
+
+#endif
diff --git a/lib/Target/BPF/BPFInstrFormats.td b/lib/Target/BPF/BPFInstrFormats.td
new file mode 100644
index 0000000..53f3ad6
--- /dev/null
+++ b/lib/Target/BPF/BPFInstrFormats.td
@@ -0,0 +1,33 @@
+//===-- BPFInstrFormats.td - BPF Instruction Formats -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class InstBPF<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : Instruction {
+  field bits<64> Inst;
+  field bits<64> SoftFail = 0;
+  let Size = 8;
+
+  let Namespace = "BPF";
+  let DecoderNamespace = "BPF";
+
+  bits<3> BPFClass;
+  let Inst{58-56} = BPFClass;
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+}
+
+// Pseudo instructions
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstBPF<outs, ins, asmstr, pattern> {
+  let Inst{63-0} = 0;
+  let isPseudo = 1;
+}
diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp
new file mode 100644
index 0000000..28bd0ec
--- /dev/null
+++ b/lib/Target/BPF/BPFInstrInfo.cpp
@@ -0,0 +1,168 @@
+//===-- BPFInstrInfo.cpp - BPF Instruction Information ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstrInfo.h"
+#include "BPFSubtarget.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "BPFGenInstrInfo.inc"
+
+using namespace llvm;
+
+BPFInstrInfo::BPFInstrInfo()
+    : BPFGenInstrInfo(BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {}
+
+void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, DebugLoc DL,
+                               unsigned DestReg, unsigned SrcReg,
+                               bool KillSrc) const {
+  if (BPF::GPRRegClass.contains(DestReg, SrcReg))
+    BuildMI(MBB, I, DL, get(BPF::MOV_rr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+}
+
+void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned SrcReg, bool IsKill, int FI,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  if (RC == &BPF::GPRRegClass)
+    BuildMI(MBB, I, DL, get(BPF::STD))
+        .addReg(SrcReg, getKillRegState(IsKill))
+        .addFrameIndex(FI)
+        .addImm(0);
+  else
+    llvm_unreachable("Can't store this register to stack slot");
+}
+
+void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned DestReg, int FI,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  if (RC == &BPF::GPRRegClass)
+    BuildMI(MBB, I, DL, get(BPF::LDD), DestReg).addFrameIndex(FI).addImm(0);
+  else
+    llvm_unreachable("Can't load this register from stack slot");
+}
+
+bool BPFInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isUnpredicatedTerminator(I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!I->isBranch())
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == BPF::JMP) {
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a J, delete them.
+      while (std::next(I) != MBB.end())
+        std::next(I)->eraseFromParent();
+      Cond.clear();
+      FBB = 0;
+
+      // Delete the J if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = 0;
+        I->eraseFromParent();
+        I = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditinal destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+    // Cannot handle conditional branches
+    return true;
+  }
+
+  return false;
+}
+
+unsigned BPFInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *TBB,
+                                    MachineBasicBlock *FBB,
+                                    const SmallVectorImpl<MachineOperand> &Cond,
+                                    DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+  if (Cond.empty()) {
+    // Unconditional branch
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(BPF::JMP)).addMBB(TBB);
+    return 1;
+  }
+
+  llvm_unreachable("Unexpected conditional branch");
+}
+
+unsigned BPFInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (I->getOpcode() != BPF::JMP)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
diff --git a/lib/Target/BPF/BPFInstrInfo.h b/lib/Target/BPF/BPFInstrInfo.h
new file mode 100644
index 0000000..4056c2e
--- /dev/null
+++ b/lib/Target/BPF/BPFInstrInfo.h
@@ -0,0 +1,60 @@
+//===-- BPFInstrInfo.h - BPF Instruction Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFINSTRINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFINSTRINFO_H
+
+#include "BPFRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "BPFGenInstrInfo.inc"
+
+namespace llvm {
+
+class BPFInstrInfo : public BPFGenInstrInfo {
+  const BPFRegisterInfo RI;
+
+public:
+  BPFInstrInfo();
+
+  const BPFRegisterInfo &getRegisterInfo() const { return RI; }
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+                           bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI, unsigned DestReg,
+                            int FrameIndex, const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+};
+}
+
+#endif
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
new file mode 100644
index 0000000..47001f0
--- /dev/null
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -0,0 +1,507 @@
+//===-- BPFInstrInfo.td - Target Description for BPF Target ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the BPF instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "BPFInstrFormats.td"
+
+// Instruction Operands and Patterns
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_BPFCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+def SDT_BPFCall         : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_BPFSetFlag      : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>]>;
+def SDT_BPFSelectCC     : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>,
+                                               SDTCisSameAs<0, 4>,
+                                               SDTCisSameAs<4, 5>]>;
+def SDT_BPFBrCC         : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
+                                               SDTCisVT<3, OtherVT>]>;
+def SDT_BPFWrapper      : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+                                               SDTCisPtrTy<0>]>;
+
+def BPFcall         : SDNode<"BPFISD::CALL", SDT_BPFCall,
+                             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                              SDNPVariadic]>;
+def BPFretflag      : SDNode<"BPFISD::RET_FLAG", SDTNone,
+                             [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def BPFcallseq_start: SDNode<"ISD::CALLSEQ_START", SDT_BPFCallSeqStart,
+                             [SDNPHasChain, SDNPOutGlue]>;
+def BPFcallseq_end  : SDNode<"ISD::CALLSEQ_END",   SDT_BPFCallSeqEnd,
+                             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def BPFbrcc         : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC,
+                             [SDNPHasChain, SDNPOutGlue, SDNPInGlue]>;
+
+def BPFselectcc     : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC, [SDNPInGlue]>;
+def BPFWrapper      : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
+
+def brtarget : Operand<OtherVT>;
+def calltarget : Operand<i64>;
+
+def u64imm   : Operand<i64> {
+  let PrintMethod = "printImm64Operand";
+}
+
+def i64immSExt32 : PatLeaf<(imm),
+                [{return isInt<32>(N->getSExtValue()); }]>;
+
+// Addressing modes.
+def ADDRri : ComplexPattern<i64, 2, "SelectAddr", [frameindex], []>;
+
+// Address operands
+def MEMri : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let EncoderMethod = "getMemoryOpValue";
+  let MIOperandInfo = (ops GPR, i16imm);
+}
+
+// Conditional code predicates - used for pattern matching for jump instructions
+def BPF_CC_EQ  : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETEQ);}]>;
+def BPF_CC_NE  : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETNE);}]>;
+def BPF_CC_GE  : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETGE);}]>;
+def BPF_CC_GT  : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETGT);}]>;
+def BPF_CC_GTU : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETUGT);}]>;
+def BPF_CC_GEU : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETUGE);}]>;
+
+// jump instructions
+class JMP_RR<bits<4> Opc, string OpcodeStr, PatLeaf Cond>
+    : InstBPF<(outs), (ins GPR:$dst, GPR:$src, brtarget:$BrDst),
+              !strconcat(OpcodeStr, "\t$dst, $src goto $BrDst"),
+              [(BPFbrcc i64:$dst, i64:$src, Cond, bb:$BrDst)]> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<4> src;
+  bits<16> BrDst;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{55-52} = src;
+  let Inst{51-48} = dst;
+  let Inst{47-32} = BrDst;
+
+  let op = Opc;
+  let BPFSrc = 1;
+  let BPFClass = 5; // BPF_JMP
+}
+
+class JMP_RI<bits<4> Opc, string OpcodeStr, PatLeaf Cond>
+    : InstBPF<(outs), (ins GPR:$dst, i64imm:$imm, brtarget:$BrDst),
+              !strconcat(OpcodeStr, "i\t$dst, $imm goto $BrDst"),
+              [(BPFbrcc i64:$dst, i64immSExt32:$imm, Cond, bb:$BrDst)]> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<16> BrDst;
+  bits<32> imm;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{51-48} = dst;
+  let Inst{47-32} = BrDst;
+  let Inst{31-0} = imm;
+
+  let op = Opc;
+  let BPFSrc = 0;
+  let BPFClass = 5; // BPF_JMP
+}
+
+multiclass J<bits<4> Opc, string OpcodeStr, PatLeaf Cond> {
+  def _rr : JMP_RR<Opc, OpcodeStr, Cond>;
+  def _ri : JMP_RI<Opc, OpcodeStr, Cond>;
+}
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot=0 in {
+// cmp+goto instructions
+defm JEQ  : J<0x1, "jeq",  BPF_CC_EQ>;
+defm JUGT : J<0x2, "jgt", BPF_CC_GTU>;
+defm JUGE : J<0x3, "jge", BPF_CC_GEU>;
+defm JNE  : J<0x5, "jne",  BPF_CC_NE>;
+defm JSGT : J<0x6, "jsgt", BPF_CC_GT>;
+defm JSGE : J<0x7, "jsge", BPF_CC_GE>;
+}
+
+// ALU instructions
+class ALU_RI<bits<4> Opc, string OpcodeStr, SDNode OpNode>
+    : InstBPF<(outs GPR:$dst), (ins GPR:$src2, i64imm:$imm),
+              !strconcat(OpcodeStr, "i\t$dst, $imm"),
+              [(set GPR:$dst, (OpNode GPR:$src2, i64immSExt32:$imm))]> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<32> imm;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{51-48} = dst;
+  let Inst{31-0} = imm;
+
+  let op = Opc;
+  let BPFSrc = 0;
+  let BPFClass = 7; // BPF_ALU64
+}
+
+class ALU_RR<bits<4> Opc, string OpcodeStr, SDNode OpNode>
+    : InstBPF<(outs GPR:$dst), (ins GPR:$src2, GPR:$src),
+              !strconcat(OpcodeStr, "\t$dst, $src"),
+              [(set GPR:$dst, (OpNode i64:$src2, i64:$src))]> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<4> src;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{55-52} = src;
+  let Inst{51-48} = dst;
+
+  let op = Opc;
+  let BPFSrc = 1;
+  let BPFClass = 7; // BPF_ALU64
+}
+
+multiclass ALU<bits<4> Opc, string OpcodeStr, SDNode OpNode> {
+  def _rr : ALU_RR<Opc, OpcodeStr, OpNode>;
+  def _ri : ALU_RI<Opc, OpcodeStr, OpNode>;
+}
+
+let Constraints = "$dst = $src2" in {
+let isAsCheapAsAMove = 1 in {
+  defm ADD : ALU<0x0, "add", add>;
+  defm SUB : ALU<0x1, "sub", sub>;
+  defm OR  : ALU<0x4, "or", or>;
+  defm AND : ALU<0x5, "and", and>;
+  defm SLL : ALU<0x6, "sll", shl>;
+  defm SRL : ALU<0x7, "srl", srl>;
+  defm XOR : ALU<0xa, "xor", xor>;
+  defm SRA : ALU<0xc, "sra", sra>;
+}
+  defm MUL : ALU<0x2, "mul", mul>;
+  defm DIV : ALU<0x3, "div", udiv>;
+}
+
+class MOV_RR<string OpcodeStr>
+    : InstBPF<(outs GPR:$dst), (ins GPR:$src),
+              !strconcat(OpcodeStr, "\t$dst, $src"),
+              []> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<4> src;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{55-52} = src;
+  let Inst{51-48} = dst;
+
+  let op = 0xb;     // BPF_MOV
+  let BPFSrc = 1;   // BPF_X
+  let BPFClass = 7; // BPF_ALU64
+}
+
+class MOV_RI<string OpcodeStr>
+    : InstBPF<(outs GPR:$dst), (ins i64imm:$imm),
+              !strconcat(OpcodeStr, "\t$dst, $imm"),
+              [(set GPR:$dst, (i64 i64immSExt32:$imm))]> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<32> imm;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{51-48} = dst;
+  let Inst{31-0} = imm;
+
+  let op = 0xb;     // BPF_MOV
+  let BPFSrc = 0;   // BPF_K
+  let BPFClass = 7; // BPF_ALU64
+}
+def MOV_rr : MOV_RR<"mov">;
+def MOV_ri : MOV_RI<"mov">;
+
+class LD_IMM64<bits<4> Pseudo, string OpcodeStr>
+    : InstBPF<(outs GPR:$dst), (ins u64imm:$imm),
+              !strconcat(OpcodeStr, "\t$dst, $imm"),
+              [(set GPR:$dst, (i64 imm:$imm))]> {
+
+  bits<3> mode;
+  bits<2> size;
+  bits<4> dst;
+  bits<64> imm;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{51-48} = dst;
+  let Inst{55-52} = Pseudo;
+  let Inst{47-32} = 0;
+  let Inst{31-0} = imm{31-0};
+
+  let mode = 0;     // BPF_IMM
+  let size = 3;     // BPF_DW
+  let BPFClass = 0; // BPF_LD
+}
+def LD_imm64 : LD_IMM64<0, "ld_64">;
+
+// STORE instructions
+class STORE<bits<2> SizeOp, string OpcodeStr, list<dag> Pattern>
+    : InstBPF<(outs), (ins GPR:$src, MEMri:$addr),
+              !strconcat(OpcodeStr, "\t$addr, $src"), Pattern> {
+  bits<3> mode;
+  bits<2> size;
+  bits<4> src;
+  bits<20> addr;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{51-48} = addr{19-16}; // base reg
+  let Inst{55-52} = src;
+  let Inst{47-32} = addr{15-0}; // offset
+
+  let mode = 3;     // BPF_MEM
+  let size = SizeOp;
+  let BPFClass = 3; // BPF_STX
+}
+
+class STOREi64<bits<2> Opc, string OpcodeStr, PatFrag OpNode>
+    : STORE<Opc, OpcodeStr, [(OpNode i64:$src, ADDRri:$addr)]>;
+
+def STW : STOREi64<0x0, "stw", truncstorei32>;
+def STH : STOREi64<0x1, "sth", truncstorei16>;
+def STB : STOREi64<0x2, "stb", truncstorei8>;
+def STD : STOREi64<0x3, "std", store>;
+
+// LOAD instructions
+class LOAD<bits<2> SizeOp, string OpcodeStr, list<dag> Pattern>
+    : InstBPF<(outs GPR:$dst), (ins MEMri:$addr),
+              !strconcat(OpcodeStr, "\t$dst, $addr"), Pattern> {
+  bits<3> mode;
+  bits<2> size;
+  bits<4> dst;
+  bits<20> addr;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{51-48} = dst;
+  let Inst{55-52} = addr{19-16};
+  let Inst{47-32} = addr{15-0};
+
+  let mode = 3;     // BPF_MEM
+  let size = SizeOp;
+  let BPFClass = 1; // BPF_LDX
+}
+
+class LOADi64<bits<2> SizeOp, string OpcodeStr, PatFrag OpNode>
+    : LOAD<SizeOp, OpcodeStr, [(set i64:$dst, (OpNode ADDRri:$addr))]>;
+
+def LDW : LOADi64<0x0, "ldw", zextloadi32>;
+def LDH : LOADi64<0x1, "ldh", zextloadi16>;
+def LDB : LOADi64<0x2, "ldb", zextloadi8>;
+def LDD : LOADi64<0x3, "ldd", load>;
+
+class BRANCH<bits<4> Opc, string OpcodeStr, list<dag> Pattern>
+    : InstBPF<(outs), (ins brtarget:$BrDst),
+              !strconcat(OpcodeStr, "\t$BrDst"), Pattern> {
+  bits<4> op;
+  bits<16> BrDst;
+  bits<1> BPFSrc;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{47-32} = BrDst;
+
+  let op = Opc;
+  let BPFSrc = 0;
+  let BPFClass = 5; // BPF_JMP
+}
+
+class CALL<string OpcodeStr>
+    : InstBPF<(outs), (ins calltarget:$BrDst),
+              !strconcat(OpcodeStr, "\t$BrDst"), []> {
+  bits<4> op;
+  bits<32> BrDst;
+  bits<1> BPFSrc;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{31-0} = BrDst;
+
+  let op = 8;       // BPF_CALL
+  let BPFSrc = 0;
+  let BPFClass = 5; // BPF_JMP
+}
+
+// Jump always
+let isBranch = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1 in {
+  def JMP : BRANCH<0x0, "jmp", [(br bb:$BrDst)]>;
+}
+
+// Jump and link
+let isCall=1, hasDelaySlot=0, Uses = [R11],
+    // Potentially clobbered registers
+    Defs = [R0, R1, R2, R3, R4, R5] in {
+  def JAL  : CALL<"call">;
+}
+
+class NOP_I<string OpcodeStr>
+    : InstBPF<(outs), (ins i32imm:$imm),
+              !strconcat(OpcodeStr, "\t$imm"), []> {
+  // mov r0, r0 == nop
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<4> src;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{55-52} = src;
+  let Inst{51-48} = dst;
+
+  let op = 0xb;     // BPF_MOV
+  let BPFSrc = 1;   // BPF_X
+  let BPFClass = 7; // BPF_ALU64
+  let src = 0;      // R0
+  let dst = 0;      // R0
+}
+
+let hasSideEffects = 0 in
+  def NOP : NOP_I<"nop">;
+
+class RET<string OpcodeStr>
+    : InstBPF<(outs), (ins),
+              !strconcat(OpcodeStr, ""), [(BPFretflag)]> {
+  bits<4> op;
+
+  let Inst{63-60} = op;
+  let Inst{59} = 0;
+  let Inst{31-0} = 0;
+
+  let op = 9;       // BPF_EXIT
+  let BPFClass = 5; // BPF_JMP
+}
+
+let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1,
+    isNotDuplicable = 1 in {
+  def RET : RET<"ret">;
+}
+
+// ADJCALLSTACKDOWN/UP pseudo insns
+let Defs = [R11], Uses = [R11] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
+                              "#ADJCALLSTACKDOWN $amt",
+                              [(BPFcallseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+                              "#ADJCALLSTACKUP $amt1 $amt2",
+                              [(BPFcallseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let usesCustomInserter = 1 in {
+  def Select : Pseudo<(outs GPR:$dst),
+                      (ins GPR:$lhs, GPR:$rhs, i64imm:$imm, GPR:$src, GPR:$src2),
+                      "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+                      [(set i64:$dst,
+                       (BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i64:$src, i64:$src2))]>;
+}
+
+// load 64-bit global addr into register
+def : Pat<(BPFWrapper tglobaladdr:$in), (LD_imm64 tglobaladdr:$in)>;
+
+// 0xffffFFFF doesn't fit into simm32, optimize common case
+def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
+          (SRL_ri (SLL_ri (i64 GPR:$src), 32), 32)>;
+
+// Calls
+def : Pat<(BPFcall tglobaladdr:$dst), (JAL tglobaladdr:$dst)>;
+def : Pat<(BPFcall imm:$dst), (JAL imm:$dst)>;
+
+// Loads
+def : Pat<(extloadi8  ADDRri:$src), (i64 (LDB ADDRri:$src))>;
+def : Pat<(extloadi16 ADDRri:$src), (i64 (LDH ADDRri:$src))>;
+def : Pat<(extloadi32 ADDRri:$src), (i64 (LDW ADDRri:$src))>;
+
+// Atomics
+class XADD<bits<2> SizeOp, string OpcodeStr, PatFrag OpNode>
+    : InstBPF<(outs GPR:$dst), (ins MEMri:$addr, GPR:$val),
+              !strconcat(OpcodeStr, "\t$dst, $addr, $val"),
+              [(set GPR:$dst, (OpNode ADDRri:$addr, GPR:$val))]> {
+  bits<3> mode;
+  bits<2> size;
+  bits<4> src;
+  bits<20> addr;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{51-48} = addr{19-16}; // base reg
+  let Inst{55-52} = src;
+  let Inst{47-32} = addr{15-0}; // offset
+
+  let mode = 6;     // BPF_XADD
+  let size = SizeOp;
+  let BPFClass = 3; // BPF_STX
+}
+
+let Constraints = "$dst = $val" in {
+def XADD32 : XADD<0, "xadd32", atomic_load_add_32>;
+def XADD64 : XADD<3, "xadd64", atomic_load_add_64>;
+// undefined def XADD16 : XADD<1, "xadd16", atomic_load_add_16>;
+// undefined def XADD8  : XADD<2, "xadd8", atomic_load_add_8>;
+}
+
+let Defs = [R0, R1, R2, R3, R4, R5], Uses = [R6], hasSideEffects = 1,
+    hasExtraDefRegAllocReq = 1, hasExtraSrcRegAllocReq = 1, mayLoad = 1 in {
+class LOAD_ABS<bits<2> SizeOp, string OpcodeStr, Intrinsic OpNode>
+    : InstBPF<(outs), (ins GPR:$skb, i64imm:$imm),
+              !strconcat(OpcodeStr, "\tr0, $skb.data + $imm"),
+              [(set R0, (OpNode GPR:$skb, i64immSExt32:$imm))]> {
+  bits<3> mode;
+  bits<2> size;
+  bits<32> imm;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{31-0} = imm;
+
+  let mode = 1;     // BPF_ABS
+  let size = SizeOp;
+  let BPFClass = 0; // BPF_LD
+}
+
+class LOAD_IND<bits<2> SizeOp, string OpcodeStr, Intrinsic OpNode>
+    : InstBPF<(outs), (ins GPR:$skb, GPR:$val),
+              !strconcat(OpcodeStr, "\tr0, $skb.data + $val"),
+              [(set R0, (OpNode GPR:$skb, GPR:$val))]> {
+  bits<3> mode;
+  bits<2> size;
+  bits<4> val;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{55-52} = val;
+
+  let mode = 2;     // BPF_IND
+  let size = SizeOp;
+  let BPFClass = 0; // BPF_LD
+}
+}
+
+def LD_ABS_B : LOAD_ABS<2, "ldabs_b", int_bpf_load_byte>;
+def LD_ABS_H : LOAD_ABS<1, "ldabs_h", int_bpf_load_half>;
+def LD_ABS_W : LOAD_ABS<0, "ldabs_w", int_bpf_load_word>;
+
+def LD_IND_B : LOAD_IND<2, "ldind_b", int_bpf_load_byte>;
+def LD_IND_H : LOAD_IND<1, "ldind_h", int_bpf_load_half>;
+def LD_IND_W : LOAD_IND<0, "ldind_w", int_bpf_load_word>;
diff --git a/lib/Target/BPF/BPFMCInstLower.cpp b/lib/Target/BPF/BPFMCInstLower.cpp
new file mode 100644
index 0000000..5a695f0
--- /dev/null
+++ b/lib/Target/BPF/BPFMCInstLower.cpp
@@ -0,0 +1,77 @@
+//=-- BPFMCInstLower.cpp - Convert BPF MachineInstr to an MCInst ------------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower BPF MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFMCInstLower.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+MCSymbol *
+BPFMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  return Printer.getSymbol(MO.getGlobal());
+}
+
+MCOperand BPFMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                             MCSymbol *Sym) const {
+
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+
+  if (!MO.isJTI() && MO.getOffset())
+    llvm_unreachable("unknown symbol op");
+
+  return MCOperand::CreateExpr(Expr);
+}
+
+void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit())
+        continue;
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(
+          MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/lib/Target/BPF/BPFMCInstLower.h b/lib/Target/BPF/BPFMCInstLower.h
new file mode 100644
index 0000000..054e894
--- /dev/null
+++ b/lib/Target/BPF/BPFMCInstLower.h
@@ -0,0 +1,43 @@
+//===-- BPFMCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFMCINSTLOWER_H
+#define LLVM_LIB_TARGET_BPF_BPFMCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineModuleInfoMachO;
+class MachineOperand;
+class Mangler;
+
+// BPFMCInstLower - This class is used to lower an MachineInstr into an MCInst.
+class LLVM_LIBRARY_VISIBILITY BPFMCInstLower {
+  MCContext &Ctx;
+
+  AsmPrinter &Printer;
+
+public:
+  BPFMCInstLower(MCContext &ctx, AsmPrinter &printer)
+      : Ctx(ctx), Printer(printer) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+};
+}
+
+#endif
diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp
new file mode 100644
index 0000000..8f885c3
--- /dev/null
+++ b/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -0,0 +1,88 @@
+//===-- BPFRegisterInfo.cpp - BPF Register Information ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFRegisterInfo.h"
+#include "BPFSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "BPFGenRegisterInfo.inc"
+using namespace llvm;
+
+BPFRegisterInfo::BPFRegisterInfo()
+    : BPFGenRegisterInfo(BPF::R0) {}
+
+const MCPhysReg *
+BPFRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  return CSR_SaveList;
+}
+
+BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(BPF::R10); // R10 is read only frame pointer
+  Reserved.set(BPF::R11); // R11 is pseudo stack pointer
+  return Reserved;
+}
+
+void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                          int SPAdj, unsigned FIOperandNum,
+                                          RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  unsigned FrameReg = getFrameRegister(MF);
+  int FrameIndex = MI.getOperand(i).getIndex();
+
+  if (MI.getOpcode() == BPF::MOV_rr) {
+    const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+    int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+    MI.getOperand(i).ChangeToRegister(FrameReg, false);
+
+    MachineBasicBlock &MBB = *MI.getParent();
+    unsigned reg = MI.getOperand(i - 1).getReg();
+    BuildMI(MBB, ++II, DL, TII.get(BPF::ADD_ri), reg)
+        .addReg(reg)
+        .addImm(Offset);
+    return;
+  }
+
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               MI.getOperand(i + 1).getImm();
+
+  if (!isInt<32>(Offset))
+    llvm_unreachable("bug in frame offset");
+
+  MI.getOperand(i).ChangeToRegister(FrameReg, false);
+  MI.getOperand(i + 1).ChangeToImmediate(Offset);
+}
+
+unsigned BPFRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return BPF::R10;
+}
diff --git a/lib/Target/BPF/BPFRegisterInfo.h b/lib/Target/BPF/BPFRegisterInfo.h
new file mode 100644
index 0000000..364d6f6
--- /dev/null
+++ b/lib/Target/BPF/BPFRegisterInfo.h
@@ -0,0 +1,41 @@
+//===-- BPFRegisterInfo.h - BPF Register Information Impl -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFREGISTERINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "BPFGenRegisterInfo.inc"
+
+namespace llvm {
+
+struct BPFRegisterInfo : public BPFGenRegisterInfo {
+
+  BPFRegisterInfo();
+
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+};
+}
+
+#endif
diff --git a/lib/Target/BPF/BPFRegisterInfo.td b/lib/Target/BPF/BPFRegisterInfo.td
new file mode 100644
index 0000000..c8e24f8
--- /dev/null
+++ b/lib/Target/BPF/BPFRegisterInfo.td
@@ -0,0 +1,41 @@
+//===-- BPFRegisterInfo.td - BPF Register defs -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the BPF register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 4-bit ID numbers.
+// Ri - 64-bit integer registers
+class Ri<bits<16> Enc, string n> : Register<n> {
+  let Namespace = "BPF";
+  let HWEncoding = Enc;
+}
+
+// Integer registers
+def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>;
+def R1 : Ri< 1, "r1">, DwarfRegNum<[1]>;
+def R2 : Ri< 2, "r2">, DwarfRegNum<[2]>;
+def R3 : Ri< 3, "r3">, DwarfRegNum<[3]>;
+def R4 : Ri< 4, "r4">, DwarfRegNum<[4]>;
+def R5 : Ri< 5, "r5">, DwarfRegNum<[5]>;
+def R6 : Ri< 6, "r6">, DwarfRegNum<[6]>;
+def R7 : Ri< 7, "r7">, DwarfRegNum<[7]>;
+def R8 : Ri< 8, "r8">, DwarfRegNum<[8]>;
+def R9 : Ri< 9, "r9">, DwarfRegNum<[9]>;
+def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
+def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
+
+// Register classes.
+def GPR : RegisterClass<"BPF", [i64], 64, (add R1, R2, R3, R4, R5,
+                                           R6, R7, R8, R9, // callee saved
+                                           R0,  // return value
+                                           R11, // stack ptr
+                                           R10  // frame ptr
+                                          )>;
diff --git a/lib/Target/BPF/BPFSubtarget.cpp b/lib/Target/BPF/BPFSubtarget.cpp
new file mode 100644
index 0000000..7f7a262
--- /dev/null
+++ b/lib/Target/BPF/BPFSubtarget.cpp
@@ -0,0 +1,31 @@
+//===-- BPFSubtarget.cpp - BPF Subtarget Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPF specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFSubtarget.h"
+#include "BPF.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "BPFGenSubtargetInfo.inc"
+
+void BPFSubtarget::anchor() {}
+
+BPFSubtarget::BPFSubtarget(const std::string &TT, const std::string &CPU,
+                           const std::string &FS, const TargetMachine &TM)
+    : BPFGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
+      TLInfo(TM, *this), TSInfo(TM.getDataLayout()) {}
diff --git a/lib/Target/BPF/BPFSubtarget.h b/lib/Target/BPF/BPFSubtarget.h
new file mode 100644
index 0000000..347cffd8
--- /dev/null
+++ b/lib/Target/BPF/BPFSubtarget.h
@@ -0,0 +1,64 @@
+//===-- BPFSubtarget.h - Define Subtarget for the BPF -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the BPF specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFSUBTARGET_H
+#define LLVM_LIB_TARGET_BPF_BPFSUBTARGET_H
+
+#include "BPFFrameLowering.h"
+#include "BPFISelLowering.h"
+#include "BPFInstrInfo.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "BPFGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class BPFSubtarget : public BPFGenSubtargetInfo {
+  virtual void anchor();
+  BPFInstrInfo InstrInfo;
+  BPFFrameLowering FrameLowering;
+  BPFTargetLowering TLInfo;
+  TargetSelectionDAGInfo TSInfo;
+
+public:
+  // This constructor initializes the data members to match that
+  // of the specified triple.
+  BPFSubtarget(const std::string &TT, const std::string &CPU,
+               const std::string &FS, const TargetMachine &TM);
+
+  // ParseSubtargetFeatures - Parses features string setting specified
+  // subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  const BPFInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const BPFFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const BPFTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const TargetRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp
new file mode 100644
index 0000000..5245395
--- /dev/null
+++ b/lib/Target/BPF/BPFTargetMachine.cpp
@@ -0,0 +1,69 @@
+//===-- BPFTargetMachine.cpp - Define TargetMachine for BPF ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about BPF target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeBPFTarget() {
+  // Register the target.
+  RegisterTargetMachine<BPFTargetMachine> X(TheBPFTarget);
+}
+
+// DataLayout --> Little-endian, 64-bit pointer/ABI/alignment
+// The stack is always 8 byte aligned
+// On function prologue, the stack is created by decrementing
+// its pointer. Once decremented, all references are done with positive
+// offset from the stack/frame pointer.
+BPFTargetMachine::BPFTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                                   StringRef FS, const TargetOptions &Options,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      TLOF(make_unique<TargetLoweringObjectFileELF>()),
+      DL("e-m:e-p:64:64-i64:64-n32:64-S128"),
+      Subtarget(TT, CPU, FS, *this) {
+  initAsmInfo();
+}
+namespace {
+// BPF Code Generator Pass Configuration Options.
+class BPFPassConfig : public TargetPassConfig {
+public:
+  BPFPassConfig(BPFTargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  BPFTargetMachine &getBPFTargetMachine() const {
+    return getTM<BPFTargetMachine>();
+  }
+
+  bool addInstSelector() override;
+};
+}
+
+TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new BPFPassConfig(this, PM);
+}
+
+// Install an instruction selector pass using
+// the ISelDag to gen BPF code.
+bool BPFPassConfig::addInstSelector() {
+  addPass(createBPFISelDag(getBPFTargetMachine()));
+
+  return false;
+}
diff --git a/lib/Target/BPF/BPFTargetMachine.h b/lib/Target/BPF/BPFTargetMachine.h
new file mode 100644
index 0000000..821cffc
--- /dev/null
+++ b/lib/Target/BPF/BPFTargetMachine.h
@@ -0,0 +1,42 @@
+//===-- BPFTargetMachine.h - Define TargetMachine for BPF --- C++ ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the BPF specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFTARGETMACHINE_H
+#define LLVM_LIB_TARGET_BPF_BPFTARGETMACHINE_H
+
+#include "BPFSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class BPFTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  const DataLayout DL;
+  BPFSubtarget Subtarget;
+
+public:
+  BPFTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+                   const TargetOptions &Options, Reloc::Model RM,
+                   CodeModel::Model CM, CodeGenOpt::Level OL);
+
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const BPFSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+};
+}
+
+#endif
diff --git a/lib/Target/BPF/CMakeLists.txt b/lib/Target/BPF/CMakeLists.txt
new file mode 100644
index 0000000..3eac6e9
--- /dev/null
+++ b/lib/Target/BPF/CMakeLists.txt
@@ -0,0 +1,27 @@
+set(LLVM_TARGET_DEFINITIONS BPF.td)
+
+tablegen(LLVM BPFGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM BPFGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM BPFGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM BPFGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM BPFGenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM BPFGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM BPFGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(BPFCommonTableGen)
+
+add_llvm_target(BPFCodeGen
+  BPFAsmPrinter.cpp
+  BPFFrameLowering.cpp
+  BPFInstrInfo.cpp
+  BPFISelDAGToDAG.cpp
+  BPFISelLowering.cpp
+  BPFMCInstLower.cpp
+  BPFRegisterInfo.cpp
+  BPFSubtarget.cpp
+  BPFTargetMachine.cpp
+  )
+
+add_subdirectory(InstPrinter)
+add_subdirectory(TargetInfo)
+add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
new file mode 100644
index 0000000..3f09379
--- /dev/null
+++ b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
@@ -0,0 +1,86 @@
+//===-- BPFInstPrinter.cpp - Convert BPF MCInst to asm syntax -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an BPF MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#include "BPFGenAsmWriter.inc"
+
+void BPFInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot) {
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+static void printExpr(const MCExpr *Expr, raw_ostream &O) {
+  const MCSymbolRefExpr *SRE;
+
+  if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr))
+    SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+  else
+    SRE = dyn_cast<MCSymbolRefExpr>(Expr);
+  assert(SRE && "Unexpected MCExpr type.");
+
+  MCSymbolRefExpr::VariantKind Kind = SRE->getKind();
+
+  assert(Kind == MCSymbolRefExpr::VK_None);
+  O << *Expr;
+}
+
+void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O, const char *Modifier) {
+  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    O << getRegisterName(Op.getReg());
+  } else if (Op.isImm()) {
+    O << (int32_t)Op.getImm();
+  } else {
+    assert(Op.isExpr() && "Expected an expression");
+    printExpr(Op.getExpr(), O);
+  }
+}
+
+void BPFInstPrinter::printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+                                     const char *Modifier) {
+  const MCOperand &RegOp = MI->getOperand(OpNo);
+  const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+  // offset
+  if (OffsetOp.isImm())
+    O << formatDec(OffsetOp.getImm());
+  else
+    assert(0 && "Expected an immediate");
+
+  // register
+  assert(RegOp.isReg() && "Register operand not a register");
+  O << '(' << getRegisterName(RegOp.getReg()) << ')';
+}
+
+void BPFInstPrinter::printImm64Operand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << (uint64_t)Op.getImm();
+  else
+    O << Op;
+}
diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.h b/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
new file mode 100644
index 0000000..d7c2899
--- /dev/null
+++ b/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
@@ -0,0 +1,41 @@
+//===-- BPFInstPrinter.h - Convert BPF MCInst to asm syntax -------*- C++ -*--//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a BPF MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_INSTPRINTER_BPFINSTPRINTER_H
+#define LLVM_LIB_TARGET_BPF_INSTPRINTER_BPFINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+class MCOperand;
+
+class BPFInstPrinter : public MCInstPrinter {
+public:
+  BPFInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                 const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  void printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+                       const char *Modifier = nullptr);
+  void printImm64Operand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+};
+}
+
+#endif
diff --git a/lib/Target/BPF/InstPrinter/CMakeLists.txt b/lib/Target/BPF/InstPrinter/CMakeLists.txt
new file mode 100644
index 0000000..f9e9161
--- /dev/null
+++ b/lib/Target/BPF/InstPrinter/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMBPFAsmPrinter
+  BPFInstPrinter.cpp
+  )
diff --git a/lib/Target/BPF/InstPrinter/LLVMBuild.txt b/lib/Target/BPF/InstPrinter/LLVMBuild.txt
new file mode 100644
index 0000000..88a937a
--- /dev/null
+++ b/lib/Target/BPF/InstPrinter/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/BPF/InstPrinter/LLVMBuild.txt ---------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = BPFAsmPrinter
+parent = BPF
+required_libraries = MC Support
+add_to_library_groups = BPF
diff --git a/lib/Target/BPF/InstPrinter/Makefile b/lib/Target/BPF/InstPrinter/Makefile
new file mode 100644
index 0000000..f46af83
--- /dev/null
+++ b/lib/Target/BPF/InstPrinter/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/BPF/InstPrinter/Makefile -----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMBPFAsmPrinter
+
+# Hack: we need to include 'main' BPF target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/BPF/LLVMBuild.txt b/lib/Target/BPF/LLVMBuild.txt
new file mode 100644
index 0000000..11578c8
--- /dev/null
+++ b/lib/Target/BPF/LLVMBuild.txt
@@ -0,0 +1,32 @@
+;===- ./lib/Target/BPF/LLVMBuild.txt ---------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[common]
+subdirectories = InstPrinter MCTargetDesc TargetInfo
+
+[component_0]
+type = TargetGroup
+name = BPF
+parent = Target
+has_asmprinter = 1
+
+[component_1]
+type = Library
+name = BPFCodeGen
+parent = BPF
+required_libraries = AsmPrinter CodeGen Core MC BPFAsmPrinter BPFDesc BPFInfo SelectionDAG Support Target
+add_to_library_groups = BPF
diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
new file mode 100644
index 0000000..87c8077
--- /dev/null
+++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -0,0 +1,83 @@
+//===-- BPFAsmBackend.cpp - BPF Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class BPFAsmBackend : public MCAsmBackend {
+public:
+  BPFAsmBackend() : MCAsmBackend() {}
+  ~BPFAsmBackend() override {}
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override;
+
+  // No instruction requires relaxation
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+
+  unsigned getNumFixupKinds() const override { return 1; }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {}
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  if ((Count % 8) != 0)
+    return false;
+
+  for (uint64_t i = 0; i < Count; i += 8)
+    OW->Write64(0x15000000);
+
+  return true;
+}
+
+void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                               unsigned DataSize, uint64_t Value,
+                               bool IsPCRel) const {
+
+  if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
+    assert(Value == 0);
+    return;
+  }
+  assert(Fixup.getKind() == FK_PCRel_2);
+  *(uint16_t *)&Data[Fixup.getOffset() + 2] = (uint16_t)((Value - 8) / 8);
+}
+
+MCObjectWriter *BPFAsmBackend::createObjectWriter(raw_ostream &OS) const {
+  return createBPFELFObjectWriter(OS, 0);
+}
+}
+
+MCAsmBackend *llvm::createBPFAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI, StringRef TT,
+                                        StringRef CPU) {
+  return new BPFAsmBackend();
+}
diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
new file mode 100644
index 0000000..169a8a7
--- /dev/null
+++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -0,0 +1,53 @@
+//===-- BPFELFObjectWriter.cpp - BPF ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class BPFELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  BPFELFObjectWriter(uint8_t OSABI);
+
+  ~BPFELFObjectWriter() override;
+
+protected:
+  unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                        bool IsPCRel) const override;
+};
+}
+
+BPFELFObjectWriter::BPFELFObjectWriter(uint8_t OSABI)
+    : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_NONE,
+                              /*HasRelocationAddend*/ false) {}
+
+BPFELFObjectWriter::~BPFELFObjectWriter() {}
+
+unsigned BPFELFObjectWriter::GetRelocType(const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const {
+  // determine the type of the relocation
+  switch ((unsigned)Fixup.getKind()) {
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  case FK_SecRel_8:
+    return ELF::R_X86_64_64;
+  case FK_SecRel_4:
+    return ELF::R_X86_64_PC32;
+  }
+}
+
+MCObjectWriter *llvm::createBPFELFObjectWriter(raw_ostream &OS, uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new BPFELFObjectWriter(OSABI);
+  return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
+}
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
new file mode 100644
index 0000000..ab61ae7
--- /dev/null
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -0,0 +1,36 @@
+//===-- BPFMCAsmInfo.h - BPF asm properties -------------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the BPFMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
+#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+class Target;
+
+class BPFMCAsmInfo : public MCAsmInfo {
+public:
+  explicit BPFMCAsmInfo(StringRef TT) {
+    PrivateGlobalPrefix = ".L";
+    WeakRefDirective = "\t.weak\t";
+
+    UsesELFSectionDirectiveForBSS = true;
+    HasSingleParameterDotFile = false;
+    HasDotTypeDotSizeDirective = false;
+  }
+};
+}
+
+#endif
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
new file mode 100644
index 0000000..b94693a
--- /dev/null
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -0,0 +1,167 @@
+//===-- BPFMCCodeEmitter.cpp - Convert BPF code to machine code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPFMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+class BPFMCCodeEmitter : public MCCodeEmitter {
+  BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete;
+  void operator=(const BPFMCCodeEmitter &) = delete;
+  const MCRegisterInfo &MRI;
+
+public:
+  BPFMCCodeEmitter(const MCRegisterInfo &mri) : MRI(mri) {}
+
+  ~BPFMCCodeEmitter() {}
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  // getMachineOpValue - Return binary encoding of operand. If the machin
+  // operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  uint64_t getMemoryOpValue(const MCInst &MI, unsigned Op,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+};
+}
+
+MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            const MCSubtargetInfo &STI,
+                                            MCContext &Ctx) {
+  return new BPFMCCodeEmitter(MRI);
+}
+
+unsigned BPFMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                             const MCOperand &MO,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return MRI.getEncodingValue(MO.getReg());
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+
+  assert(MO.isExpr());
+
+  const MCExpr *Expr = MO.getExpr();
+  MCExpr::ExprKind Kind = Expr->getKind();
+
+  assert(Kind == MCExpr::SymbolRef);
+
+  if (MI.getOpcode() == BPF::JAL)
+    // func call name
+    Fixups.push_back(MCFixup::Create(0, Expr, FK_SecRel_4));
+  else if (MI.getOpcode() == BPF::LD_imm64)
+    Fixups.push_back(MCFixup::Create(0, Expr, FK_SecRel_8));
+  else
+    // bb label
+    Fixups.push_back(MCFixup::Create(0, Expr, FK_PCRel_2));
+
+  return 0;
+}
+
+// Emit one byte through output stream
+void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) {
+  OS << (char)C;
+  ++CurByte;
+}
+
+// Emit a series of bytes (little endian)
+void EmitLEConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+                    raw_ostream &OS) {
+  assert(Size <= 8 && "size too big in emit constant");
+
+  for (unsigned i = 0; i != Size; ++i) {
+    EmitByte(Val & 255, CurByte, OS);
+    Val >>= 8;
+  }
+}
+
+// Emit a series of bytes (big endian)
+void EmitBEConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+                    raw_ostream &OS) {
+  assert(Size <= 8 && "size too big in emit constant");
+
+  for (int i = (Size - 1) * 8; i >= 0; i -= 8)
+    EmitByte((Val >> i) & 255, CurByte, OS);
+}
+
+void BPFMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  unsigned Opcode = MI.getOpcode();
+  // Keep track of the current byte being emitted
+  unsigned CurByte = 0;
+
+  if (Opcode == BPF::LD_imm64) {
+    uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
+    EmitByte(Value >> 56, CurByte, OS);
+    EmitByte(((Value >> 48) & 0xff), CurByte, OS);
+    EmitLEConstant(0, 2, CurByte, OS);
+    EmitLEConstant(Value & 0xffffFFFF, 4, CurByte, OS);
+
+    const MCOperand &MO = MI.getOperand(1);
+    uint64_t Imm = MO.isImm() ? MO.getImm() : 0;
+    EmitByte(0, CurByte, OS);
+    EmitByte(0, CurByte, OS);
+    EmitLEConstant(0, 2, CurByte, OS);
+    EmitLEConstant(Imm >> 32, 4, CurByte, OS);
+  } else {
+    // Get instruction encoding and emit it
+    uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
+    EmitByte(Value >> 56, CurByte, OS);
+    EmitByte((Value >> 48) & 0xff, CurByte, OS);
+    EmitLEConstant((Value >> 32) & 0xffff, 2, CurByte, OS);
+    EmitLEConstant(Value & 0xffffFFFF, 4, CurByte, OS);
+  }
+}
+
+// Encode BPF Memory Operand
+uint64_t BPFMCCodeEmitter::getMemoryOpValue(const MCInst &MI, unsigned Op,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  uint64_t Encoding;
+  const MCOperand Op1 = MI.getOperand(1);
+  assert(Op1.isReg() && "First operand is not register.");
+  Encoding = MRI.getEncodingValue(Op1.getReg());
+  Encoding <<= 16;
+  MCOperand Op2 = MI.getOperand(2);
+  assert(Op2.isImm() && "Second operand is not immediate.");
+  Encoding |= Op2.getImm() & 0xffff;
+  return Encoding;
+}
+
+#include "BPFGenMCCodeEmitter.inc"
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
new file mode 100644
index 0000000..f82f009
--- /dev/null
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -0,0 +1,111 @@
+//===-- BPFMCTargetDesc.cpp - BPF Target Descriptions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides BPF specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFMCTargetDesc.h"
+#include "BPFMCAsmInfo.h"
+#include "InstPrinter/BPFInstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "BPFGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "BPFGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "BPFGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createBPFMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitBPFMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createBPFMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitBPFMCRegisterInfo(X, BPF::R11 /* RAReg doesn't exist */);
+  return X;
+}
+
+static MCSubtargetInfo *createBPFMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                 StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitBPFMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCCodeGenInfo *createBPFMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM,
+                                             CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCStreamer *createBPFMCStreamer(const Target &T, StringRef TT,
+                                       MCContext &Ctx, MCAsmBackend &MAB,
+                                       raw_ostream &_OS,
+                                       MCCodeEmitter *_Emitter,
+                                       const MCSubtargetInfo &STI,
+                                       bool RelaxAll) {
+  return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
+}
+
+static MCInstPrinter *
+createBPFMCInstPrinter(const Target &T, unsigned SyntaxVariant,
+                       const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                       const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new BPFInstPrinter(MAI, MII, MRI);
+  return 0;
+}
+
+extern "C" void LLVMInitializeBPFTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfo<BPFMCAsmInfo> X(TheBPFTarget);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheBPFTarget, createBPFMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheBPFTarget, createBPFMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheBPFTarget, createBPFMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheBPFTarget,
+                                          createBPFMCSubtargetInfo);
+
+  // Register the MC code emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheBPFTarget,
+                                        llvm::createBPFMCCodeEmitter);
+
+  // Register the ASM Backend
+  TargetRegistry::RegisterMCAsmBackend(TheBPFTarget, createBPFAsmBackend);
+
+  // Register the object streamer
+  TargetRegistry::RegisterMCObjectStreamer(TheBPFTarget, createBPFMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheBPFTarget, createBPFMCInstPrinter);
+}
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
new file mode 100644
index 0000000..55901cc
--- /dev/null
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
@@ -0,0 +1,59 @@
+//===-- BPFMCTargetDesc.h - BPF Target Descriptions -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides BPF specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCTARGETDESC_H
+#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+class StringRef;
+class raw_ostream;
+
+extern Target TheBPFTarget;
+
+MCCodeEmitter *createBPFMCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCRegisterInfo &MRI,
+                                      const MCSubtargetInfo &STI,
+                                      MCContext &Ctx);
+
+MCAsmBackend *createBPFAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  StringRef TT, StringRef CPU);
+
+MCObjectWriter *createBPFELFObjectWriter(raw_ostream &OS, uint8_t OSABI);
+}
+
+// Defines symbolic names for BPF registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "BPFGenRegisterInfo.inc"
+
+// Defines symbolic names for the BPF instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "BPFGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "BPFGenSubtargetInfo.inc"
+
+#endif
diff --git a/lib/Target/BPF/MCTargetDesc/CMakeLists.txt b/lib/Target/BPF/MCTargetDesc/CMakeLists.txt
new file mode 100644
index 0000000..5fcd874
--- /dev/null
+++ b/lib/Target/BPF/MCTargetDesc/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_llvm_library(LLVMBPFDesc
+  BPFMCTargetDesc.cpp
+  BPFAsmBackend.cpp
+  BPFMCCodeEmitter.cpp
+  BPFELFObjectWriter.cpp
+  )
diff --git a/lib/Target/BPF/MCTargetDesc/LLVMBuild.txt b/lib/Target/BPF/MCTargetDesc/LLVMBuild.txt
new file mode 100644
index 0000000..209d17c
--- /dev/null
+++ b/lib/Target/BPF/MCTargetDesc/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/BPF/MCTargetDesc/LLVMBuild.txt --------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = BPFDesc
+parent = BPF
+required_libraries = MC BPFAsmPrinter BPFInfo
+add_to_library_groups = BPF
diff --git a/lib/Target/BPF/MCTargetDesc/Makefile b/lib/Target/BPF/MCTargetDesc/Makefile
new file mode 100644
index 0000000..af70cd0
--- /dev/null
+++ b/lib/Target/BPF/MCTargetDesc/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/BPF/MCTargetDesc/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMBPFDesc
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/BPF/Makefile b/lib/Target/BPF/Makefile
new file mode 100644
index 0000000..7492f5e
--- /dev/null
+++ b/lib/Target/BPF/Makefile
@@ -0,0 +1,21 @@
+##===- lib/Target/BPF/Makefile -----------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMBPFCodeGen
+TARGET = BPF
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = BPFGenRegisterInfo.inc BPFGenInstrInfo.inc \
+		BPFGenAsmWriter.inc BPFGenAsmMatcher.inc BPFGenDAGISel.inc \
+		BPFGenMCCodeEmitter.inc BPFGenSubtargetInfo.inc BPFGenCallingConv.inc
+
+DIRS = InstPrinter TargetInfo MCTargetDesc
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp b/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
new file mode 100644
index 0000000..818a992
--- /dev/null
+++ b/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
@@ -0,0 +1,18 @@
+//===-- BPFTargetInfo.cpp - BPF Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheBPFTarget;
+
+extern "C" void LLVMInitializeBPFTargetInfo() {
+  RegisterTarget<Triple::bpf> X(TheBPFTarget, "bpf", "BPF");
+}
diff --git a/lib/Target/BPF/TargetInfo/CMakeLists.txt b/lib/Target/BPF/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..ca08846
--- /dev/null
+++ b/lib/Target/BPF/TargetInfo/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMBPFInfo
+  BPFTargetInfo.cpp
+  )
diff --git a/lib/Target/BPF/TargetInfo/LLVMBuild.txt b/lib/Target/BPF/TargetInfo/LLVMBuild.txt
new file mode 100644
index 0000000..b56a858
--- /dev/null
+++ b/lib/Target/BPF/TargetInfo/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/BPF/TargetInfo/LLVMBuild.txt ----------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = BPFInfo
+parent = BPF
+required_libraries = Support
+add_to_library_groups = BPF
diff --git a/lib/Target/BPF/TargetInfo/Makefile b/lib/Target/BPF/TargetInfo/Makefile
new file mode 100644
index 0000000..02af58e
--- /dev/null
+++ b/lib/Target/BPF/TargetInfo/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/BPF/TargetInfo/Makefile ------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMBPFInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
index c61805b..1805437 100644
--- a/lib/Target/CMakeLists.txt
+++ b/lib/Target/CMakeLists.txt
@@ -1,14 +1,16 @@
+list(APPEND LLVM_COMMON_DEPENDS intrinsics_gen)
+
 add_llvm_library(LLVMTarget
   Target.cpp
   TargetIntrinsicInfo.cpp
-  TargetLibraryInfo.cpp
   TargetLoweringObjectFile.cpp
   TargetMachine.cpp
   TargetMachineC.cpp
   TargetSubtargetInfo.cpp
-  )
 
-list(APPEND LLVM_COMMON_DEPENDS intrinsics_gen)
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Target
+  )
 
 foreach(t ${LLVM_TARGETS_TO_BUILD})
   message(STATUS "Targeting ${t}")
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index f610fbb..c7fec52 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -22,12 +22,12 @@
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Pass.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
@@ -1942,7 +1942,6 @@ void CppWriter::printModuleBody() {
 void CppWriter::printProgram(const std::string& fname,
                              const std::string& mName) {
   Out << "#include <llvm/Pass.h>\n";
-  Out << "#include <llvm/PassManager.h>\n";
 
   Out << "#include <llvm/ADT/SmallVector.h>\n";
   Out << "#include <llvm/Analysis/Verifier.h>\n";
@@ -1956,6 +1955,7 @@ void CppWriter::printProgram(const std::string& fname,
   Out << "#include <llvm/IR/InlineAsm.h>\n";
   Out << "#include <llvm/IR/Instructions.h>\n";
   Out << "#include <llvm/IR/LLVMContext.h>\n";
+  Out << "#include <llvm/IR/LegacyPassManager.h>\n";
   Out << "#include <llvm/IR/Module.h>\n";
   Out << "#include <llvm/Support/FormattedStream.h>\n";
   Out << "#include <llvm/Support/MathExtras.h>\n";
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index af7914f..eaa8bef 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -13,7 +13,6 @@ add_public_tablegen_target(HexagonCommonTableGen)
 
 add_llvm_target(HexagonCodeGen
   HexagonAsmPrinter.cpp
-  HexagonCallingConvLower.cpp
   HexagonCFGOptimizer.cpp
   HexagonCopyToCombine.cpp
   HexagonExpandPredSpillCode.cpp
diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index bc64be1..669af8c 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
 #include "MCTargetDesc/HexagonMCTargetDesc.h"
-
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCExpr.h"
@@ -18,14 +18,13 @@
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LEB128.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/Endian.h"
-
-#include <vector>
+#include "llvm/Support/raw_ostream.h"
 #include <array>
+#include <vector>
 
 using namespace llvm;
 
@@ -48,6 +47,13 @@ public:
 };
 }
 
+static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+  uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+  uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, void const *Decoder);
+
 static const uint16_t IntRegDecoderTable[] = {
   Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
   Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9,
@@ -60,6 +66,16 @@ static const uint16_t IntRegDecoderTable[] = {
 static const uint16_t PredRegDecoderTable[] = { Hexagon::P0, Hexagon::P1,
 Hexagon::P2, Hexagon::P3 };
 
+static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
+  const uint16_t Table[], size_t Size) {
+  if (RegNo < Size) {
+    Inst.addOperand(MCOperand::CreateReg(Table[RegNo]));
+    return MCDisassembler::Success;
+  }
+  else
+    return MCDisassembler::Fail;
+}
+
 static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
   uint64_t /*Address*/,
   void const *Decoder) {
@@ -71,6 +87,81 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+  uint64_t /*Address*/, const void *Decoder) {
+  static const uint16_t CtrlRegDecoderTable[] = {
+    Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1,
+    Hexagon::P3_0, Hexagon::NoRegister, Hexagon::C6, Hexagon::C7,
+    Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP,
+    Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPCH
+  };
+
+  if (RegNo >= sizeof(CtrlRegDecoderTable) / sizeof(CtrlRegDecoderTable[0]))
+    return MCDisassembler::Fail;
+
+  if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister)
+    return MCDisassembler::Fail;
+
+  unsigned Register = CtrlRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t /*Address*/, void const *Decoder) {
+  static const uint16_t CtrlReg64DecoderTable[] = {
+    Hexagon::C1_0, Hexagon::NoRegister,
+    Hexagon::C3_2, Hexagon::NoRegister,
+    Hexagon::NoRegister, Hexagon::NoRegister,
+    Hexagon::C7_6, Hexagon::NoRegister,
+    Hexagon::C9_8, Hexagon::NoRegister,
+    Hexagon::C11_10, Hexagon::NoRegister,
+    Hexagon::CS, Hexagon::NoRegister,
+    Hexagon::UPC, Hexagon::NoRegister
+  };
+
+  if (RegNo >= sizeof(CtrlReg64DecoderTable) / sizeof(CtrlReg64DecoderTable[0]))
+    return MCDisassembler::Fail;
+
+  if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister)
+    return MCDisassembler::Fail;
+
+  unsigned Register = CtrlReg64DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+  uint64_t /*Address*/, const void *Decoder) {
+  unsigned Register = 0;
+  switch (RegNo) {
+  case 0:
+    Register = Hexagon::M0;
+    break;
+  case 1:
+    Register = Hexagon::M1;
+    break;
+  default:
+    return MCDisassembler::Fail;
+  }
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+  uint64_t /*Address*/, const void *Decoder) {
+  static const uint16_t DoubleRegDecoderTable[] = {
+    Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3,
+    Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7,
+    Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11,
+    Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15
+  };
+
+  return (DecodeRegisterClass(Inst, RegNo >> 1,
+    DoubleRegDecoderTable,
+    sizeof (DoubleRegDecoderTable)));
+}
+
 static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
   uint64_t /*Address*/,
   void const *Decoder) {
@@ -110,5 +201,7 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
 
   // Remove parse bits.
   insn &= ~static_cast<uint32_t>(HexagonII::InstParseBits::INST_PARSE_MASK);
-  return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
+  DecodeStatus Result = decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
+  HexagonMCInstrInfo::AppendImplicitOperands(MI);
+  return Result;
 }
diff --git a/lib/Target/Hexagon/Disassembler/LLVMBuild.txt b/lib/Target/Hexagon/Disassembler/LLVMBuild.txt
index 17ad11b..43bace7 100644
--- a/lib/Target/Hexagon/Disassembler/LLVMBuild.txt
+++ b/lib/Target/Hexagon/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = HexagonDisassembler
 parent = Hexagon
-required_libraries = HexagonInfo MCDisassembler Support
+required_libraries = HexagonDesc HexagonInfo MCDisassembler Support
 add_to_library_groups = Hexagon
diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h
index 64ae69c..e0a3b2f 100644
--- a/lib/Target/Hexagon/Hexagon.h
+++ b/lib/Target/Hexagon/Hexagon.h
@@ -21,26 +21,24 @@
 
 namespace llvm {
   class FunctionPass;
-  class ModulePass;
-  class TargetMachine;
-  class MachineInstr;
-  class HexagonMCInst;
   class HexagonAsmPrinter;
   class HexagonTargetMachine;
+  class MachineInstr;
+  class MCInst;
+  class ModulePass;
   class raw_ostream;
+  class TargetMachine;
 
   FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
                                      CodeGenOpt::Level OptLevel);
   FunctionPass *createHexagonDelaySlotFillerPass(const TargetMachine &TM);
   FunctionPass *createHexagonFPMoverPass(const TargetMachine &TM);
   FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM);
-  FunctionPass *createHexagonCFGOptimizer(const HexagonTargetMachine &TM);
+  FunctionPass *createHexagonCFGOptimizer();
 
-  FunctionPass *createHexagonSplitTFRCondSets(const HexagonTargetMachine &TM);
-  FunctionPass *createHexagonSplitConst32AndConst64(
-                      const HexagonTargetMachine &TM);
-  FunctionPass *createHexagonExpandPredSpillCode(
-                      const HexagonTargetMachine &TM);
+  FunctionPass *createHexagonSplitTFRCondSets();
+  FunctionPass *createHexagonSplitConst32AndConst64();
+  FunctionPass *createHexagonExpandPredSpillCode();
   FunctionPass *createHexagonHardwareLoops();
   FunctionPass *createHexagonPeephole();
   FunctionPass *createHexagonFixupHwLoops();
@@ -58,7 +56,7 @@ namespace llvm {
   TargetAsmBackend *createHexagonAsmBackend(const Target &,
                                                   const std::string &);
 */
-  void HexagonLowerToMC(const MachineInstr *MI, HexagonMCInst &MCI,
+  void HexagonLowerToMC(MachineInstr const *MI, MCInst &MCI,
                         HexagonAsmPrinter &AP);
 } // end namespace llvm;
 
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 5f4a6c6..f892c9f 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -21,35 +21,23 @@ include "llvm/Target/Target.td"
 // Hexagon Subtarget features.
 //===----------------------------------------------------------------------===//
 
-// Hexagon Archtectures
-def ArchV2       : SubtargetFeature<"v2", "HexagonArchVersion", "V2",
-                                    "Hexagon v2">;
-def ArchV3       : SubtargetFeature<"v3", "HexagonArchVersion", "V3",
-                                    "Hexagon v3">;
-def ArchV4       : SubtargetFeature<"v4", "HexagonArchVersion", "V4",
-                                    "Hexagon v4">;
-def ArchV5       : SubtargetFeature<"v5", "HexagonArchVersion", "V5",
-                                    "Hexagon v5">;
+// Hexagon Architectures
+def ArchV4:  SubtargetFeature<"v4",  "HexagonArchVersion", "V4",  "Hexagon V4">;
+def ArchV5:  SubtargetFeature<"v5",  "HexagonArchVersion", "V5",  "Hexagon V5">;
 
 //===----------------------------------------------------------------------===//
 // Hexagon Instruction Predicate Definitions.
 //===----------------------------------------------------------------------===//
-def HasV2T                      : Predicate<"Subtarget.hasV2TOps()">;
-def HasV2TOnly                  : Predicate<"Subtarget.hasV2TOpsOnly()">;
-def NoV2T                       : Predicate<"!Subtarget.hasV2TOps()">;
-def HasV3T                      : Predicate<"Subtarget.hasV3TOps()">;
-def HasV3TOnly                  : Predicate<"Subtarget.hasV3TOpsOnly()">;
-def NoV3T                       : Predicate<"!Subtarget.hasV3TOps()">;
-def HasV4T                      : Predicate<"Subtarget.hasV4TOps()">;
-def NoV4T                       : Predicate<"!Subtarget.hasV4TOps()">;
-def HasV5T                      : Predicate<"Subtarget.hasV5TOps()">;
-def NoV5T                       : Predicate<"!Subtarget.hasV5TOps()">;
-def UseMEMOP                    : Predicate<"Subtarget.useMemOps()">;
-def IEEERndNearV5T              : Predicate<"Subtarget.modeIEEERndNear()">;
+def HasV5T                      : Predicate<"Subtarget->hasV5TOps()">;
+def NoV5T                       : Predicate<"!Subtarget->hasV5TOps()">;
+def UseMEMOP                    : Predicate<"Subtarget->useMemOps()">;
+def IEEERndNearV5T              : Predicate<"Subtarget->modeIEEERndNear()">;
 
 //===----------------------------------------------------------------------===//
 // Classes used for relation maps.
 //===----------------------------------------------------------------------===//
+
+class ImmRegShl;
 // PredRel - Filter class used to relate non-predicated instructions with their
 // predicated forms.
 class PredRel;
@@ -137,7 +125,7 @@ def getPredOldOpcode : InstrMapping {
 //
 def getNewValueOpcode : InstrMapping {
   let FilterClass = "NewValueRel";
-  let RowFields = ["BaseOpcode", "PredSense", "PNewValue"];
+  let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode"];
   let ColFields = ["NValueST"];
   let KeyCol = ["false"];
   let ValueCols = [["true"]];
@@ -149,7 +137,7 @@ def getNewValueOpcode : InstrMapping {
 //
 def getNonNVStore : InstrMapping {
   let FilterClass = "NewValueRel";
-  let RowFields = ["BaseOpcode", "PredSense", "PNewValue"];
+  let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode"];
   let ColFields = ["NValueST"];
   let KeyCol = ["true"];
   let ValueCols = [["false"]];
@@ -180,6 +168,14 @@ def getRegForm : InstrMapping {
   let ValueCols = [["reg"]];
 }
 
+def getRegShlForm : InstrMapping {
+  let FilterClass = "ImmRegShl";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+  let ColFields = ["InputType"];
+  let KeyCol = ["imm"];
+  let ValueCols = [["reg"]];
+}
+
 //===----------------------------------------------------------------------===//
 // Register File, Calling Conv, Instruction Descriptions
 //===----------------------------------------------------------------------===//
@@ -200,8 +196,10 @@ class Proc<string Name, SchedMachineModel Model,
            list<SubtargetFeature> Features>
  : ProcessorModel<Name, Model, Features>;
 
-def : Proc<"hexagonv4", HexagonModelV4, [ArchV2, ArchV3, ArchV4]>;
-def : Proc<"hexagonv5", HexagonModelV4, [ArchV2, ArchV3, ArchV4, ArchV5]>;
+def : Proc<"hexagonv4",  HexagonModelV4,
+           [ArchV4]>;
+def : Proc<"hexagonv5",  HexagonModelV4,
+           [ArchV4, ArchV5]>;
 
 //===----------------------------------------------------------------------===//
 // Declare the target which we are implementing
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 9240282..180762f 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -19,7 +19,7 @@
 #include "HexagonSubtarget.h"
 #include "HexagonTargetMachine.h"
 #include "MCTargetDesc/HexagonInstPrinter.h"
-#include "MCTargetDesc/HexagonMCInst.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
@@ -61,6 +61,10 @@ static cl::opt<bool> AlignCalls(
          "hexagon-align-calls", cl::Hidden, cl::init(true),
           cl::desc("Insert falign after call instruction for Hexagon target"));
 
+HexagonAsmPrinter::HexagonAsmPrinter(TargetMachine &TM,
+                                     std::unique_ptr<MCStreamer> Streamer)
+    : AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr) {}
+
 void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
                                     raw_ostream &O) {
   const MachineOperand &MO = MI->getOperand(OpNo);
@@ -174,7 +178,7 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 ///
 void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   if (MI->isBundle()) {
-    std::vector<const MachineInstr*> BundleMIs;
+    std::vector<MachineInstr const *> BundleMIs;
 
     const MachineBasicBlock *MBB = MI->getParent();
     MachineBasicBlock::const_instr_iterator MII = MI;
@@ -183,33 +187,35 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     while (MII != MBB->end() && MII->isInsideBundle()) {
       const MachineInstr *MInst = MII;
       if (MInst->getOpcode() == TargetOpcode::DBG_VALUE ||
-          MInst->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
-          IgnoreCount++;
-          ++MII;
-          continue;
+        MInst->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
+        IgnoreCount++;
+        ++MII;
+        continue;
       }
-      //BundleMIs.push_back(&*MII);
+      // BundleMIs.push_back(&*MII);
       BundleMIs.push_back(MInst);
       ++MII;
     }
     unsigned Size = BundleMIs.size();
-    assert((Size+IgnoreCount) == MI->getBundleSize() && "Corrupt Bundle!");
+    assert((Size + IgnoreCount) == MI->getBundleSize() && "Corrupt Bundle!");
     for (unsigned Index = 0; Index < Size; Index++) {
-      HexagonMCInst MCI;
-      MCI.setPacketStart(Index == 0);
-      MCI.setPacketEnd(Index == (Size-1));
+      MCInst MCI;
 
       HexagonLowerToMC(BundleMIs[Index], MCI, *this);
+      HexagonMCInstrInfo::AppendImplicitOperands(MCI);
+      HexagonMCInstrInfo::setPacketBegin(MCI, Index == 0);
+      HexagonMCInstrInfo::setPacketEnd(MCI, Index == (Size - 1));
       EmitToStreamer(OutStreamer, MCI);
     }
   }
   else {
-    HexagonMCInst MCI;
+    MCInst MCI;
+    HexagonLowerToMC(MI, MCI, *this);
+    HexagonMCInstrInfo::AppendImplicitOperands(MCI);
     if (MI->getOpcode() == Hexagon::ENDLOOP0) {
-      MCI.setPacketStart(true);
-      MCI.setPacketEnd(true);
+      HexagonMCInstrInfo::setPacketBegin(MCI, true);
+      HexagonMCInstrInfo::setPacketEnd(MCI, true);
     }
-    HexagonLowerToMC(MI, MCI, *this);
     EmitToStreamer(OutStreamer, MCI);
   }
 
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.h b/lib/Target/Hexagon/HexagonAsmPrinter.h
index 5f4c162..792fc8b 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -25,9 +25,12 @@ namespace llvm {
     const HexagonSubtarget *Subtarget;
 
   public:
-    explicit HexagonAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : AsmPrinter(TM, Streamer) {
-      Subtarget = &TM.getSubtarget<HexagonSubtarget>();
+    explicit HexagonAsmPrinter(TargetMachine &TM,
+                               std::unique_ptr<MCStreamer> Streamer);
+
+    bool runOnMachineFunction(MachineFunction &Fn) override {
+      Subtarget = &Fn.getSubtarget<HexagonSubtarget>();
+      return AsmPrinter::runOnMachineFunction(Fn);
     }
 
     const char *getPassName() const override {
diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index 8a4e02c..703e691 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -37,15 +37,11 @@ namespace {
 class HexagonCFGOptimizer : public MachineFunctionPass {
 
 private:
-  const HexagonTargetMachine& QTM;
-  const HexagonSubtarget &QST;
-
   void InvertAndChangeJumpTarget(MachineInstr*, MachineBasicBlock*);
 
  public:
   static char ID;
-  HexagonCFGOptimizer(const HexagonTargetMachine& TM)
-    : MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {
+  HexagonCFGOptimizer() : MachineFunctionPass(ID) {
     initializeHexagonCFGOptimizerPass(*PassRegistry::getPassRegistry());
   }
 
@@ -59,49 +55,49 @@ private:
 char HexagonCFGOptimizer::ID = 0;
 
 static bool IsConditionalBranch(int Opc) {
-  return (Opc == Hexagon::JMP_t) || (Opc == Hexagon::JMP_f)
-    || (Opc == Hexagon::JMP_tnew_t) || (Opc == Hexagon::JMP_fnew_t);
+  return (Opc == Hexagon::J2_jumpt) || (Opc == Hexagon::J2_jumpf)
+    || (Opc == Hexagon::J2_jumptnewpt) || (Opc == Hexagon::J2_jumpfnewpt);
 }
 
 
 static bool IsUnconditionalJump(int Opc) {
-  return (Opc == Hexagon::JMP);
+  return (Opc == Hexagon::J2_jump);
 }
 
 
 void
 HexagonCFGOptimizer::InvertAndChangeJumpTarget(MachineInstr* MI,
                                                MachineBasicBlock* NewTarget) {
-  const HexagonInstrInfo *QII = QTM.getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII =
+      MI->getParent()->getParent()->getSubtarget().getInstrInfo();
   int NewOpcode = 0;
   switch(MI->getOpcode()) {
-  case Hexagon::JMP_t:
-    NewOpcode = Hexagon::JMP_f;
+  case Hexagon::J2_jumpt:
+    NewOpcode = Hexagon::J2_jumpf;
     break;
 
-  case Hexagon::JMP_f:
-    NewOpcode = Hexagon::JMP_t;
+  case Hexagon::J2_jumpf:
+    NewOpcode = Hexagon::J2_jumpt;
     break;
 
-  case Hexagon::JMP_tnew_t:
-    NewOpcode = Hexagon::JMP_fnew_t;
+  case Hexagon::J2_jumptnewpt:
+    NewOpcode = Hexagon::J2_jumpfnewpt;
     break;
 
-  case Hexagon::JMP_fnew_t:
-    NewOpcode = Hexagon::JMP_tnew_t;
+  case Hexagon::J2_jumpfnewpt:
+    NewOpcode = Hexagon::J2_jumptnewpt;
     break;
 
   default:
     llvm_unreachable("Cannot handle this case");
   }
 
-  MI->setDesc(QII->get(NewOpcode));
+  MI->setDesc(TII->get(NewOpcode));
   MI->getOperand(1).setMBB(NewTarget);
 }
 
 
 bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
-
   // Loop over all of the basic blocks.
   for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
        MBBb != MBBe; ++MBBb) {
@@ -163,8 +159,8 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
         // The target of the unconditional branch must be JumpAroundTarget.
         // TODO: If not, we should not invert the unconditional branch.
         MachineBasicBlock* CondBranchTarget = nullptr;
-        if ((MI->getOpcode() == Hexagon::JMP_t) ||
-            (MI->getOpcode() == Hexagon::JMP_f)) {
+        if ((MI->getOpcode() == Hexagon::J2_jumpt) ||
+            (MI->getOpcode() == Hexagon::J2_jumpf)) {
           CondBranchTarget = MI->getOperand(1).getMBB();
         }
 
@@ -248,6 +244,6 @@ void llvm::initializeHexagonCFGOptimizerPass(PassRegistry &Registry) {
   CALL_ONCE_INITIALIZATION(initializePassOnce)
 }
 
-FunctionPass *llvm::createHexagonCFGOptimizer(const HexagonTargetMachine &TM) {
-  return new HexagonCFGOptimizer(TM);
+FunctionPass *llvm::createHexagonCFGOptimizer() {
+  return new HexagonCFGOptimizer();
 }
diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/lib/Target/Hexagon/HexagonCallingConvLower.cpp
deleted file mode 100644
index 8d78409..0000000
--- a/lib/Target/Hexagon/HexagonCallingConvLower.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-//===-- llvm/CallingConvLower.cpp - Calling Convention lowering -----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Hexagon_CCState class, used for lowering and
-// implementing calling conventions. Adapted from the machine independent
-// version of the class (CCState) but this handles calls to varargs functions
-//
-//===----------------------------------------------------------------------===//
-
-#include "HexagonCallingConvLower.h"
-#include "Hexagon.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
-using namespace llvm;
-
-Hexagon_CCState::Hexagon_CCState(CallingConv::ID CC, bool isVarArg,
-                                 const TargetMachine &tm,
-                                 SmallVectorImpl<CCValAssign> &locs,
-                                 LLVMContext &c)
-  : CallingConv(CC), IsVarArg(isVarArg), TM(tm), Locs(locs), Context(c) {
-  // No stack is used.
-  StackOffset = 0;
-
-  UsedRegs.resize(
-      (TM.getSubtargetImpl()->getRegisterInfo()->getNumRegs() + 31) / 32);
-}
-
-// HandleByVal - Allocate a stack slot large enough to pass an argument by
-// value. The size and alignment information of the argument is encoded in its
-// parameter attribute.
-void Hexagon_CCState::HandleByVal(unsigned ValNo, EVT ValVT,
-                                EVT LocVT, CCValAssign::LocInfo LocInfo,
-                                int MinSize, int MinAlign,
-                                ISD::ArgFlagsTy ArgFlags) {
-  unsigned Align = ArgFlags.getByValAlign();
-  unsigned Size  = ArgFlags.getByValSize();
-  if (MinSize > (int)Size)
-    Size = MinSize;
-  if (MinAlign > (int)Align)
-    Align = MinAlign;
-  unsigned Offset = AllocateStack(Size, Align);
-
-  addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset,
-                             LocVT.getSimpleVT(), LocInfo));
-}
-
-/// MarkAllocated - Mark a register and all of its aliases as allocated.
-void Hexagon_CCState::MarkAllocated(unsigned Reg) {
-  const TargetRegisterInfo &TRI = *TM.getSubtargetImpl()->getRegisterInfo();
-  for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
-    UsedRegs[*AI/32] |= 1 << (*AI&31);
-}
-
-/// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node,
-/// incorporating info about the formals into this state.
-void
-Hexagon_CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg>
-                                        &Ins,
-                                        Hexagon_CCAssignFn Fn,
-                                        unsigned SretValueInRegs) {
-  unsigned NumArgs = Ins.size();
-  unsigned i = 0;
-
-  // If the function returns a small struct in registers, skip
-  // over the first (dummy) argument.
-  if (SretValueInRegs != 0) {
-    ++i;
-  }
-
-
-  for (; i != NumArgs; ++i) {
-    EVT ArgVT = Ins[i].VT;
-    ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
-    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this, 0, 0, false)) {
-      dbgs() << "Formal argument #" << i << " has unhandled type "
-             << ArgVT.getEVTString() << "\n";
-      abort();
-    }
-  }
-}
-
-/// AnalyzeReturn - Analyze the returned values of an ISD::RET node,
-/// incorporating info about the result values into this state.
-void
-Hexagon_CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
-                               Hexagon_CCAssignFn Fn,
-                               unsigned SretValueInRegs) {
-
-  // For Hexagon, Return small structures in registers.
-  if (SretValueInRegs != 0) {
-    if (SretValueInRegs <= 32) {
-      unsigned Reg = Hexagon::R0;
-      addLoc(CCValAssign::getReg(0, MVT::i32, Reg, MVT::i32,
-                                 CCValAssign::Full));
-      return;
-    }
-    if (SretValueInRegs <= 64) {
-      unsigned Reg = Hexagon::D0;
-      addLoc(CCValAssign::getReg(0, MVT::i64, Reg, MVT::i64,
-                                 CCValAssign::Full));
-      return;
-    }
-  }
-
-
-  // Determine which register each value should be copied into.
-  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
-    EVT VT = Outs[i].VT;
-    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
-    if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this, -1, -1, false)){
-      dbgs() << "Return operand #" << i << " has unhandled type "
-           << VT.getEVTString() << "\n";
-      abort();
-    }
-  }
-}
-
-
-/// AnalyzeCallOperands - Analyze an ISD::CALL node, incorporating info
-/// about the passed values into this state.
-void
-Hexagon_CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg>
-                                     &Outs,
-                                     Hexagon_CCAssignFn Fn,
-                                     int NonVarArgsParams,
-                                     unsigned SretValueSize) {
-  unsigned NumOps = Outs.size();
-
-  unsigned i = 0;
-  // If the called function returns a small struct in registers, skip
-  // the first actual parameter. We do not want to pass a pointer to
-  // the stack location.
-  if (SretValueSize != 0) {
-    ++i;
-  }
-
-  for (; i != NumOps; ++i) {
-    EVT ArgVT = Outs[i].VT;
-    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
-    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this,
-           NonVarArgsParams, i+1, false)) {
-      dbgs() << "Call operand #" << i << " has unhandled type "
-           << ArgVT.getEVTString() << "\n";
-      abort();
-    }
-  }
-}
-
-/// AnalyzeCallOperands - Same as above except it takes vectors of types
-/// and argument flags.
-void
-Hexagon_CCState::AnalyzeCallOperands(SmallVectorImpl<EVT> &ArgVTs,
-                                     SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
-                                     Hexagon_CCAssignFn Fn) {
-  unsigned NumOps = ArgVTs.size();
-  for (unsigned i = 0; i != NumOps; ++i) {
-    EVT ArgVT = ArgVTs[i];
-    ISD::ArgFlagsTy ArgFlags = Flags[i];
-    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this, -1, -1,
-           false)) {
-      dbgs() << "Call operand #" << i << " has unhandled type "
-           << ArgVT.getEVTString() << "\n";
-      abort();
-    }
-  }
-}
-
-/// AnalyzeCallResult - Analyze the return values of an ISD::CALL node,
-/// incorporating info about the passed values into this state.
-void
-Hexagon_CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
-                                   Hexagon_CCAssignFn Fn,
-                                   unsigned SretValueInRegs) {
-
-  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
-    EVT VT = Ins[i].VT;
-    ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
-      if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this, -1, -1, false)) {
-        dbgs() << "Call result #" << i << " has unhandled type "
-               << VT.getEVTString() << "\n";
-      abort();
-    }
-  }
-}
-
-/// AnalyzeCallResult - Same as above except it's specialized for calls which
-/// produce a single value.
-void Hexagon_CCState::AnalyzeCallResult(EVT VT, Hexagon_CCAssignFn Fn) {
-  if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this, -1, -1,
-         false)) {
-    dbgs() << "Call result has unhandled type "
-         << VT.getEVTString() << "\n";
-    abort();
-  }
-}
diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.h b/lib/Target/Hexagon/HexagonCallingConvLower.h
deleted file mode 100644
index 738ed1a..0000000
--- a/lib/Target/Hexagon/HexagonCallingConvLower.h
+++ /dev/null
@@ -1,187 +0,0 @@
-//===-- HexagonCallingConvLower.h - Calling Conventions ---------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the Hexagon_CCState class, used for lowering
-// and implementing calling conventions. Adapted from the target independent
-// version but this handles calls to varargs functions
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONCALLINGCONVLOWER_H
-#define LLVM_LIB_TARGET_HEXAGON_HEXAGONCALLINGCONVLOWER_H
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/SelectionDAGNodes.h"
-
-//
-// Need to handle varargs.
-//
-namespace llvm {
-  class TargetRegisterInfo;
-  class TargetMachine;
-  class Hexagon_CCState;
-  class SDNode;
-  struct EVT;
-
-/// Hexagon_CCAssignFn - This function assigns a location for Val, updating
-/// State to reflect the change.
-typedef bool Hexagon_CCAssignFn(unsigned ValNo, EVT ValVT,
-                              EVT LocVT, CCValAssign::LocInfo LocInfo,
-                              ISD::ArgFlagsTy ArgFlags, Hexagon_CCState &State,
-                              int NonVarArgsParams,
-                              int CurrentParam,
-                              bool ForceMem);
-
-
-/// CCState - This class holds information needed while lowering arguments and
-/// return values.  It captures which registers are already assigned and which
-/// stack slots are used.  It provides accessors to allocate these values.
-class Hexagon_CCState {
-  CallingConv::ID CallingConv;
-  bool IsVarArg;
-  const TargetMachine &TM;
-  SmallVectorImpl<CCValAssign> &Locs;
-  LLVMContext &Context;
-
-  unsigned StackOffset;
-  SmallVector<uint32_t, 16> UsedRegs;
-public:
-  Hexagon_CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &TM,
-                  SmallVectorImpl<CCValAssign> &locs, LLVMContext &c);
-
-  void addLoc(const CCValAssign &V) {
-    Locs.push_back(V);
-  }
-
-  LLVMContext &getContext() const { return Context; }
-  const TargetMachine &getTarget() const { return TM; }
-  unsigned getCallingConv() const { return CallingConv; }
-  bool isVarArg() const { return IsVarArg; }
-
-  unsigned getNextStackOffset() const { return StackOffset; }
-
-  /// isAllocated - Return true if the specified register (or an alias) is
-  /// allocated.
-  bool isAllocated(unsigned Reg) const {
-    return UsedRegs[Reg/32] & (1 << (Reg&31));
-  }
-
-  /// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node,
-  /// incorporating info about the formals into this state.
-  void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
-                              Hexagon_CCAssignFn Fn, unsigned SretValueInRegs);
-
-  /// AnalyzeReturn - Analyze the returned values of an ISD::RET node,
-  /// incorporating info about the result values into this state.
-  void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
-                     Hexagon_CCAssignFn Fn, unsigned SretValueInRegs);
-
-  /// AnalyzeCallOperands - Analyze an ISD::CALL node, incorporating info
-  /// about the passed values into this state.
-  void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
-                           Hexagon_CCAssignFn Fn, int NonVarArgsParams,
-                           unsigned SretValueSize);
-
-  /// AnalyzeCallOperands - Same as above except it takes vectors of types
-  /// and argument flags.
-  void AnalyzeCallOperands(SmallVectorImpl<EVT> &ArgVTs,
-                           SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
-                           Hexagon_CCAssignFn Fn);
-
-  /// AnalyzeCallResult - Analyze the return values of an ISD::CALL node,
-  /// incorporating info about the passed values into this state.
-  void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
-                         Hexagon_CCAssignFn Fn, unsigned SretValueInRegs);
-
-  /// AnalyzeCallResult - Same as above except it's specialized for calls which
-  /// produce a single value.
-  void AnalyzeCallResult(EVT VT, Hexagon_CCAssignFn Fn);
-
-  /// getFirstUnallocated - Return the first unallocated register in the set, or
-  /// NumRegs if they are all allocated.
-  unsigned getFirstUnallocated(const unsigned *Regs, unsigned NumRegs) const {
-    for (unsigned i = 0; i != NumRegs; ++i)
-      if (!isAllocated(Regs[i]))
-        return i;
-    return NumRegs;
-  }
-
-  /// AllocateReg - Attempt to allocate one register.  If it is not available,
-  /// return zero.  Otherwise, return the register, marking it and any aliases
-  /// as allocated.
-  unsigned AllocateReg(unsigned Reg) {
-    if (isAllocated(Reg)) return 0;
-    MarkAllocated(Reg);
-    return Reg;
-  }
-
-  /// Version of AllocateReg with extra register to be shadowed.
-  unsigned AllocateReg(unsigned Reg, unsigned ShadowReg) {
-    if (isAllocated(Reg)) return 0;
-    MarkAllocated(Reg);
-    MarkAllocated(ShadowReg);
-    return Reg;
-  }
-
-  /// AllocateReg - Attempt to allocate one of the specified registers.  If none
-  /// are available, return zero.  Otherwise, return the first one available,
-  /// marking it and any aliases as allocated.
-  unsigned AllocateReg(const unsigned *Regs, unsigned NumRegs) {
-    unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs);
-    if (FirstUnalloc == NumRegs)
-      return 0;    // Didn't find the reg.
-
-    // Mark the register and any aliases as allocated.
-    unsigned Reg = Regs[FirstUnalloc];
-    MarkAllocated(Reg);
-    return Reg;
-  }
-
-  /// Version of AllocateReg with list of registers to be shadowed.
-  unsigned AllocateReg(const unsigned *Regs, const unsigned *ShadowRegs,
-                       unsigned NumRegs) {
-    unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs);
-    if (FirstUnalloc == NumRegs)
-      return 0;    // Didn't find the reg.
-
-    // Mark the register and any aliases as allocated.
-    unsigned Reg = Regs[FirstUnalloc], ShadowReg = ShadowRegs[FirstUnalloc];
-    MarkAllocated(Reg);
-    MarkAllocated(ShadowReg);
-    return Reg;
-  }
-
-  /// AllocateStack - Allocate a chunk of stack space with the specified size
-  /// and alignment.
-  unsigned AllocateStack(unsigned Size, unsigned Align) {
-    assert(Align && ((Align-1) & Align) == 0); // Align is power of 2.
-    StackOffset = ((StackOffset + Align-1) & ~(Align-1));
-    unsigned Result = StackOffset;
-    StackOffset += Size;
-    return Result;
-  }
-
-  // HandleByVal - Allocate a stack slot large enough to pass an argument by
-  // value. The size and alignment information of the argument is encoded in its
-  // parameter attribute.
-  void HandleByVal(unsigned ValNo, EVT ValVT,
-                   EVT LocVT, CCValAssign::LocInfo LocInfo,
-                   int MinSize, int MinAlign, ISD::ArgFlagsTy ArgFlags);
-
-private:
-  /// MarkAllocated - Mark a register and all of its aliases as allocated.
-  void MarkAllocated(unsigned Reg);
-};
-
-
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 4e76698..dd193f9 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -114,7 +114,7 @@ static bool isCombinableInstType(MachineInstr *MI,
                                  const HexagonInstrInfo *TII,
                                  bool ShouldCombineAggressively) {
   switch(MI->getOpcode()) {
-  case Hexagon::TFR: {
+  case Hexagon::A2_tfr: {
     // A COPY instruction can be combined if its arguments are IntRegs (32bit).
     assert(MI->getOperand(0).isReg() && MI->getOperand(1).isReg());
 
@@ -124,7 +124,7 @@ static bool isCombinableInstType(MachineInstr *MI,
       Hexagon::IntRegsRegClass.contains(SrcReg);
   }
 
-  case Hexagon::TFRI: {
+  case Hexagon::A2_tfrsi: {
     // A transfer-immediate can be combined if its argument is a signed 8bit
     // value.
     assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
@@ -158,11 +158,11 @@ static bool isCombinableInstType(MachineInstr *MI,
 }
 
 static bool isGreaterThan8BitTFRI(MachineInstr *I) {
-  return I->getOpcode() == Hexagon::TFRI &&
+  return I->getOpcode() == Hexagon::A2_tfrsi &&
     !isInt<8>(I->getOperand(1).getImm());
 }
 static bool isGreaterThan6BitTFRI(MachineInstr *I) {
-  return I->getOpcode() == Hexagon::TFRI &&
+  return I->getOpcode() == Hexagon::A2_tfrsi &&
     !isUInt<6>(I->getOperand(1).getImm());
 }
 
@@ -171,26 +171,14 @@ static bool isGreaterThan6BitTFRI(MachineInstr *I) {
 static bool areCombinableOperations(const TargetRegisterInfo *TRI,
                                     MachineInstr *HighRegInst,
                                     MachineInstr *LowRegInst) {
-  assert((HighRegInst->getOpcode() == Hexagon::TFR ||
-          HighRegInst->getOpcode() == Hexagon::TFRI ||
+  assert((HighRegInst->getOpcode() == Hexagon::A2_tfr ||
+          HighRegInst->getOpcode() == Hexagon::A2_tfrsi ||
           HighRegInst->getOpcode() == Hexagon::TFRI_V4) &&
-         (LowRegInst->getOpcode() == Hexagon::TFR ||
-          LowRegInst->getOpcode() == Hexagon::TFRI ||
+         (LowRegInst->getOpcode() == Hexagon::A2_tfr ||
+          LowRegInst->getOpcode() == Hexagon::A2_tfrsi ||
           LowRegInst->getOpcode() == Hexagon::TFRI_V4) &&
          "Assume individual instructions are of a combinable type");
 
-  const HexagonRegisterInfo *QRI =
-    static_cast<const HexagonRegisterInfo *>(TRI);
-
-  // V4 added some combine variations (mixed immediate and register source
-  // operands), if we are on < V4 we can only combine 2 register-to-register
-  // moves and 2 immediate-to-register moves. We also don't have
-  // constant-extenders.
-  if (!QRI->Subtarget.hasV4TOps())
-    return HighRegInst->getOpcode() == LowRegInst->getOpcode() &&
-      !isGreaterThan8BitTFRI(HighRegInst) &&
-      !isGreaterThan6BitTFRI(LowRegInst);
-
   // There is no combine of two constant extended values.
   if ((HighRegInst->getOpcode() == Hexagon::TFRI_V4 ||
        isGreaterThan8BitTFRI(HighRegInst)) &&
@@ -418,7 +406,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
 
   // Get target info.
   TRI = MF.getSubtarget().getRegisterInfo();
-  TII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
 
   // Combine aggressively (for code size)
   ShouldCombineAggressively =
@@ -563,14 +551,14 @@ void HexagonCopyToCombine::emitCombineII(MachineBasicBlock::iterator &InsertPt,
 
   // Handle  globals.
   if (HiOperand.isGlobal()) {
-    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
       .addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
                         HiOperand.getTargetFlags())
       .addImm(LoOperand.getImm());
     return;
   }
   if (LoOperand.isGlobal()) {
-    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_iI_V4), DoubleDestReg)
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
       .addImm(HiOperand.getImm())
       .addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
                         LoOperand.getTargetFlags());
@@ -580,7 +568,7 @@ void HexagonCopyToCombine::emitCombineII(MachineBasicBlock::iterator &InsertPt,
   // Handle constant extended immediates.
   if (!isInt<8>(HiOperand.getImm())) {
     assert(isInt<8>(LoOperand.getImm()));
-    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
       .addImm(HiOperand.getImm())
       .addImm(LoOperand.getImm());
     return;
@@ -588,7 +576,7 @@ void HexagonCopyToCombine::emitCombineII(MachineBasicBlock::iterator &InsertPt,
 
   if (!isUInt<6>(LoOperand.getImm())) {
     assert(isInt<8>(HiOperand.getImm()));
-    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_iI_V4), DoubleDestReg)
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
       .addImm(HiOperand.getImm())
       .addImm(LoOperand.getImm());
     return;
@@ -596,7 +584,7 @@ void HexagonCopyToCombine::emitCombineII(MachineBasicBlock::iterator &InsertPt,
 
   // Insert new combine instruction.
   //  DoubleRegDest = combine #HiImm, #LoImm
-  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
     .addImm(HiOperand.getImm())
     .addImm(LoOperand.getImm());
 }
@@ -613,7 +601,7 @@ void HexagonCopyToCombine::emitCombineIR(MachineBasicBlock::iterator &InsertPt,
 
   // Handle global.
   if (HiOperand.isGlobal()) {
-    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ir_V4), DoubleDestReg)
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineir), DoubleDestReg)
       .addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
                         HiOperand.getTargetFlags())
       .addReg(LoReg, LoRegKillFlag);
@@ -621,7 +609,7 @@ void HexagonCopyToCombine::emitCombineIR(MachineBasicBlock::iterator &InsertPt,
   }
   // Insert new combine instruction.
   //  DoubleRegDest = combine #HiImm, LoReg
-  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ir_V4), DoubleDestReg)
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineir), DoubleDestReg)
     .addImm(HiOperand.getImm())
     .addReg(LoReg, LoRegKillFlag);
 }
@@ -638,7 +626,7 @@ void HexagonCopyToCombine::emitCombineRI(MachineBasicBlock::iterator &InsertPt,
 
   // Handle global.
   if (LoOperand.isGlobal()) {
-    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rI_V4), DoubleDestReg)
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineri), DoubleDestReg)
       .addReg(HiReg, HiRegKillFlag)
       .addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
                         LoOperand.getTargetFlags());
@@ -647,7 +635,7 @@ void HexagonCopyToCombine::emitCombineRI(MachineBasicBlock::iterator &InsertPt,
 
   // Insert new combine instruction.
   //  DoubleRegDest = combine HiReg, #LoImm
-  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rI_V4), DoubleDestReg)
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineri), DoubleDestReg)
     .addReg(HiReg, HiRegKillFlag)
     .addImm(LoOperand.getImm());
 }
@@ -666,7 +654,7 @@ void HexagonCopyToCombine::emitCombineRR(MachineBasicBlock::iterator &InsertPt,
 
   // Insert new combine instruction.
   //  DoubleRegDest = combine HiReg, LoReg
-  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rr), DoubleDestReg)
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combinew), DoubleDestReg)
     .addReg(HiReg, HiRegKillFlag)
     .addReg(LoReg, LoRegKillFlag);
 }
diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
index 8ef4c3a..8176598 100644
--- a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
+++ b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
@@ -20,7 +20,6 @@
 #include "Hexagon.h"
 #include "HexagonMachineFunctionInfo.h"
 #include "HexagonSubtarget.h"
-#include "HexagonTargetMachine.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LatencyPriorityQueue.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -49,13 +48,9 @@ namespace llvm {
 namespace {
 
 class HexagonExpandPredSpillCode : public MachineFunctionPass {
-    const HexagonTargetMachine& QTM;
-    const HexagonSubtarget &QST;
-
  public:
     static char ID;
-    HexagonExpandPredSpillCode(const HexagonTargetMachine& TM) :
-      MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {
+    HexagonExpandPredSpillCode() : MachineFunctionPass(ID) {
       PassRegistry &Registry = *PassRegistry::getPassRegistry();
       initializeHexagonExpandPredSpillCodePass(Registry);
     }
@@ -72,7 +67,8 @@ char HexagonExpandPredSpillCode::ID = 0;
 
 bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
 
-  const HexagonInstrInfo *TII = QTM.getSubtargetImpl()->getInstrInfo();
+  const HexagonSubtarget &QST = Fn.getSubtarget<HexagonSubtarget>();
+  const HexagonInstrInfo *TII = QST.getInstrInfo();
 
   // Loop over all of the basic blocks.
   for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
@@ -86,45 +82,43 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
       if (Opc == Hexagon::STriw_pred) {
         // STriw_pred [R30], ofst, SrcReg;
         unsigned FP = MI->getOperand(0).getReg();
-        assert(
-            FP ==
-                QTM.getSubtargetImpl()->getRegisterInfo()->getFrameRegister() &&
-            "Not a Frame Pointer, Nor a Spill Slot");
+        assert(FP == QST.getRegisterInfo()->getFrameRegister() &&
+               "Not a Frame Pointer, Nor a Spill Slot");
         assert(MI->getOperand(1).isImm() && "Not an offset");
         int Offset = MI->getOperand(1).getImm();
         int SrcReg = MI->getOperand(2).getReg();
         assert(Hexagon::PredRegsRegClass.contains(SrcReg) &&
                "Not a predicate register");
-        if (!TII->isValidOffset(Hexagon::STriw_indexed, Offset)) {
-          if (!TII->isValidOffset(Hexagon::ADD_ri, Offset)) {
+        if (!TII->isValidOffset(Hexagon::S2_storeri_io, Offset)) {
+          if (!TII->isValidOffset(Hexagon::A2_addi, Offset)) {
             BuildMI(*MBB, MII, MI->getDebugLoc(),
                     TII->get(Hexagon::CONST32_Int_Real),
                       HEXAGON_RESERVED_REG_1).addImm(Offset);
             BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_add),
                     HEXAGON_RESERVED_REG_1)
               .addReg(FP).addReg(HEXAGON_RESERVED_REG_1);
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrpr),
                       HEXAGON_RESERVED_REG_2).addReg(SrcReg);
             BuildMI(*MBB, MII, MI->getDebugLoc(),
-                    TII->get(Hexagon::STriw_indexed))
+                    TII->get(Hexagon::S2_storeri_io))
               .addReg(HEXAGON_RESERVED_REG_1)
               .addImm(0).addReg(HEXAGON_RESERVED_REG_2);
           } else {
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_ri),
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_addi),
                       HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset);
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrpr),
                       HEXAGON_RESERVED_REG_2).addReg(SrcReg);
             BuildMI(*MBB, MII, MI->getDebugLoc(),
-                          TII->get(Hexagon::STriw_indexed))
+                          TII->get(Hexagon::S2_storeri_io))
               .addReg(HEXAGON_RESERVED_REG_1)
               .addImm(0)
               .addReg(HEXAGON_RESERVED_REG_2);
           }
         } else {
-          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
+          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrpr),
                     HEXAGON_RESERVED_REG_2).addReg(SrcReg);
           BuildMI(*MBB, MII, MI->getDebugLoc(),
-                        TII->get(Hexagon::STriw_indexed)).
+                        TII->get(Hexagon::S2_storeri_io)).
                     addReg(FP).addImm(Offset).addReg(HEXAGON_RESERVED_REG_2);
         }
         MII = MBB->erase(MI);
@@ -135,14 +129,12 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
         assert(Hexagon::PredRegsRegClass.contains(DstReg) &&
                "Not a predicate register");
         unsigned FP = MI->getOperand(1).getReg();
-        assert(
-            FP ==
-                QTM.getSubtargetImpl()->getRegisterInfo()->getFrameRegister() &&
-            "Not a Frame Pointer, Nor a Spill Slot");
+        assert(FP == QST.getRegisterInfo()->getFrameRegister() &&
+               "Not a Frame Pointer, Nor a Spill Slot");
         assert(MI->getOperand(2).isImm() && "Not an offset");
         int Offset = MI->getOperand(2).getImm();
-        if (!TII->isValidOffset(Hexagon::LDriw, Offset)) {
-          if (!TII->isValidOffset(Hexagon::ADD_ri, Offset)) {
+        if (!TII->isValidOffset(Hexagon::L2_loadri_io, Offset)) {
+          if (!TII->isValidOffset(Hexagon::A2_addi, Offset)) {
             BuildMI(*MBB, MII, MI->getDebugLoc(),
                     TII->get(Hexagon::CONST32_Int_Real),
                       HEXAGON_RESERVED_REG_1).addImm(Offset);
@@ -150,26 +142,26 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
                     HEXAGON_RESERVED_REG_1)
               .addReg(FP)
               .addReg(HEXAGON_RESERVED_REG_1);
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw),
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::L2_loadri_io),
                       HEXAGON_RESERVED_REG_2)
               .addReg(HEXAGON_RESERVED_REG_1)
               .addImm(0);
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs),
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrrp),
                       DstReg).addReg(HEXAGON_RESERVED_REG_2);
           } else {
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_ri),
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_addi),
                       HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset);
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw),
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::L2_loadri_io),
                       HEXAGON_RESERVED_REG_2)
               .addReg(HEXAGON_RESERVED_REG_1)
               .addImm(0);
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs),
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrrp),
                       DstReg).addReg(HEXAGON_RESERVED_REG_2);
           }
         } else {
-          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw),
+          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::L2_loadri_io),
                     HEXAGON_RESERVED_REG_2).addReg(FP).addImm(Offset);
-          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs),
+          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrrp),
                     DstReg).addReg(HEXAGON_RESERVED_REG_2);
         }
         MII = MBB->erase(MI);
@@ -200,6 +192,6 @@ void llvm::initializeHexagonExpandPredSpillCodePass(PassRegistry &Registry) {
 }
 
 FunctionPass*
-llvm::createHexagonExpandPredSpillCode(const HexagonTargetMachine &TM) {
-  return new HexagonExpandPredSpillCode(TM);
+llvm::createHexagonExpandPredSpillCode() {
+  return new HexagonExpandPredSpillCode();
 }
diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index 5f9b927..e8d8f14 100644
--- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -81,8 +81,8 @@ FunctionPass *llvm::createHexagonFixupHwLoops() {
 
 /// \brief Returns true if the instruction is a hardware loop instruction.
 static bool isHardwareLoop(const MachineInstr *MI) {
-  return MI->getOpcode() == Hexagon::LOOP0_r ||
-         MI->getOpcode() == Hexagon::LOOP0_i;
+  return MI->getOpcode() == Hexagon::J2_loop0r ||
+         MI->getOpcode() == Hexagon::J2_loop0i;
 }
 
 
@@ -168,18 +168,18 @@ void HexagonFixupHwLoops::convertLoopInstr(MachineFunction &MF,
   // First, set the LC0 with the trip count.
   if (MII->getOperand(1).isReg()) {
     // Trip count is a register
-    BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0)
+    BuildMI(*MBB, MII, DL, TII->get(Hexagon::A2_tfrrcr), Hexagon::LC0)
       .addReg(MII->getOperand(1).getReg());
   } else {
     // Trip count is an immediate.
-    BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFRI), Scratch)
+    BuildMI(*MBB, MII, DL, TII->get(Hexagon::A2_tfrsi), Scratch)
       .addImm(MII->getOperand(1).getImm());
-    BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0)
+    BuildMI(*MBB, MII, DL, TII->get(Hexagon::A2_tfrrcr), Hexagon::LC0)
       .addReg(Scratch);
   }
   // Then, set the SA0 with the loop start address.
   BuildMI(*MBB, MII, DL, TII->get(Hexagon::CONST32_Label), Scratch)
     .addMBB(MII->getOperand(0).getMBB());
-  BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::SA0)
+  BuildMI(*MBB, MII, DL, TII->get(Hexagon::A2_tfrrcr), Hexagon::SA0)
     .addReg(Scratch);
 }
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 356f279..2b1992f 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -50,10 +50,8 @@ void HexagonFrameLowering::determineFrameLayout(MachineFunction &MF) const {
   unsigned FrameSize = MFI->getStackSize();
 
   // Get the alignments provided by the target.
-  unsigned TargetAlign = MF.getTarget()
-                             .getSubtargetImpl()
-                             ->getFrameLowering()
-                             ->getStackAlignment();
+  unsigned TargetAlign =
+      MF.getSubtarget().getFrameLowering()->getStackAlignment();
   // Get the maximum call frame size of all the calls.
   unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
 
@@ -80,8 +78,8 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineBasicBlock::iterator MBBI = MBB.begin();
-  const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
+  const HexagonRegisterInfo *QRI =
+      MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   determineFrameLayout(MF);
 
@@ -122,17 +120,17 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF) const {
 
     if (NumBytes >= ALLOCFRAME_MAX) {
       // Emit allocframe(#0).
-      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::ALLOCFRAME)).addImm(0);
+      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::S2_allocframe)).addImm(0);
 
       // Subtract offset from frame pointer.
       BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::CONST32_Int_Real),
                                       HEXAGON_RESERVED_REG_1).addImm(NumBytes);
-      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::SUB_rr),
+      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::A2_sub),
                                       QRI->getStackRegister()).
                                       addReg(QRI->getStackRegister()).
                                       addReg(HEXAGON_RESERVED_REG_1);
     } else {
-      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::ALLOCFRAME)).addImm(NumBytes);
+      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::S2_allocframe)).addImm(NumBytes);
     }
   }
 }
@@ -161,15 +159,14 @@ void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
     // Handle EH_RETURN.
     if (MBBI->getOpcode() == Hexagon::EH_RETURN_JMPR) {
       assert(MBBI->getOperand(0).isReg() && "Offset should be in register!");
-      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME));
+      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::L2_deallocframe));
       BuildMI(MBB, MBBI, dl, TII.get(Hexagon::A2_add),
               Hexagon::R29).addReg(Hexagon::R29).addReg(Hexagon::R28);
       return;
     }
     // Replace 'jumpr r31' instruction with dealloc_return for V4 and higher
     // versions.
-    if (MF.getTarget().getSubtarget<HexagonSubtarget>().hasV4TOps() &&
-        MBBI->getOpcode() == Hexagon::JMPret && !DisableDeallocRet) {
+    if (MBBI->getOpcode() == Hexagon::JMPret && !DisableDeallocRet) {
       // Check for RESTORE_DEALLOC_RET_JMP_V4 call. Don't emit an extra DEALLOC
       // instruction if we encounter it.
       MachineBasicBlock::iterator BeforeJMPR =
@@ -183,7 +180,7 @@ void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
 
       // Add dealloc_return.
       MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI_end, dl, TII.get(Hexagon::DEALLOC_RET_V4));
+        BuildMI(MBB, MBBI_end, dl, TII.get(Hexagon::L4_return));
       // Transfer the function live-out registers.
       MIB->copyImplicitOps(*MBB.getParent(), &*MBBI);
       // Remove the JUMPR node.
@@ -198,7 +195,7 @@ void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
           I->getOpcode() == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4)
         return;
 
-      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME));
+      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::L2_deallocframe));
     }
   }
 }
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index e2062a3..1577c33 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -28,7 +28,7 @@
 
 #include "llvm/ADT/SmallSet.h"
 #include "Hexagon.h"
-#include "HexagonTargetMachine.h"
+#include "HexagonSubtarget.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -64,9 +64,7 @@ namespace {
     MachineLoopInfo            *MLI;
     MachineRegisterInfo        *MRI;
     MachineDominatorTree       *MDT;
-    const HexagonTargetMachine *TM;
     const HexagonInstrInfo     *TII;
-    const HexagonRegisterInfo  *TRI;
 #ifndef NDEBUG
     static int Counter;
 #endif
@@ -265,9 +263,7 @@ namespace {
       return Contents.ImmVal;
     }
 
-    void print(raw_ostream &OS, const TargetMachine *TM = nullptr) const {
-      const TargetRegisterInfo *TRI =
-          TM ? TM->getSubtargetImpl()->getRegisterInfo() : nullptr;
+    void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const {
       if (isReg()) { OS << PrintReg(Contents.R.Reg, TRI, Contents.R.Sub); }
       if (isImm()) { OS << Contents.ImmVal; }
     }
@@ -285,8 +281,8 @@ INITIALIZE_PASS_END(HexagonHardwareLoops, "hwloops",
 
 /// \brief Returns true if the instruction is a hardware loop instruction.
 static bool isHardwareLoop(const MachineInstr *MI) {
-  return MI->getOpcode() == Hexagon::LOOP0_r ||
-    MI->getOpcode() == Hexagon::LOOP0_i;
+  return MI->getOpcode() == Hexagon::J2_loop0r ||
+    MI->getOpcode() == Hexagon::J2_loop0i;
 }
 
 FunctionPass *llvm::createHexagonHardwareLoops() {
@@ -302,11 +298,7 @@ bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
   MLI = &getAnalysis<MachineLoopInfo>();
   MRI = &MF.getRegInfo();
   MDT = &getAnalysis<MachineDominatorTree>();
-  TM  = static_cast<const HexagonTargetMachine*>(&MF.getTarget());
-  TII = static_cast<const HexagonInstrInfo *>(
-      TM->getSubtargetImpl()->getInstrInfo());
-  TRI = static_cast<const HexagonRegisterInfo *>(
-      TM->getSubtargetImpl()->getRegisterInfo());
+  TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
 
   for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
        I != E; ++I) {
@@ -357,7 +349,7 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
       unsigned PhiOpReg = Phi->getOperand(i).getReg();
       MachineInstr *DI = MRI->getVRegDef(PhiOpReg);
       unsigned UpdOpc = DI->getOpcode();
-      bool isAdd = (UpdOpc == Hexagon::ADD_ri);
+      bool isAdd = (UpdOpc == Hexagon::A2_addi);
 
       if (isAdd) {
         // If the register operand to the add is the PHI we're
@@ -540,21 +532,21 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
     return nullptr;
 
   switch (CondOpc) {
-    case Hexagon::CMPEQri:
-    case Hexagon::CMPEQrr:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpeq:
       Cmp = !Negated ? Comparison::EQ : Comparison::NE;
       break;
-    case Hexagon::CMPGTUri:
-    case Hexagon::CMPGTUrr:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::C2_cmpgtu:
       Cmp = !Negated ? Comparison::GTu : Comparison::LEu;
       break;
-    case Hexagon::CMPGTri:
-    case Hexagon::CMPGTrr:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C2_cmpgt:
       Cmp = !Negated ? Comparison::GTs : Comparison::LEs;
       break;
     // Very limited support for byte/halfword compares.
-    case Hexagon::CMPbEQri_V4:
-    case Hexagon::CMPhEQri_V4: {
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpheqi: {
       if (IVBump != 1)
         return nullptr;
 
@@ -574,7 +566,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
       }
       if (InitV >= EndV)
         return nullptr;
-      if (CondOpc == Hexagon::CMPbEQri_V4) {
+      if (CondOpc == Hexagon::A4_cmpbeqi) {
         if (!isInt<8>(InitV) || !isInt<8>(EndV))
           return nullptr;
       } else {  // Hexagon::CMPhEQri_V4
@@ -626,12 +618,12 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
   // If so, use the immediate value rather than the register.
   if (Start->isReg()) {
     const MachineInstr *StartValInstr = MRI->getVRegDef(Start->getReg());
-    if (StartValInstr && StartValInstr->getOpcode() == Hexagon::TFRI)
+    if (StartValInstr && StartValInstr->getOpcode() == Hexagon::A2_tfrsi)
       Start = &StartValInstr->getOperand(1);
   }
   if (End->isReg()) {
     const MachineInstr *EndValInstr = MRI->getVRegDef(End->getReg());
-    if (EndValInstr && EndValInstr->getOpcode() == Hexagon::TFRI)
+    if (EndValInstr && EndValInstr->getOpcode() == Hexagon::A2_tfrsi)
       End = &EndValInstr->getOperand(1);
   }
 
@@ -781,9 +773,9 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
     DistR = End->getReg();
     DistSR = End->getSubReg();
   } else {
-    const MCInstrDesc &SubD = RegToReg ? TII->get(Hexagon::SUB_rr) :
-                              (RegToImm ? TII->get(Hexagon::SUB_ri) :
-                                          TII->get(Hexagon::ADD_ri));
+    const MCInstrDesc &SubD = RegToReg ? TII->get(Hexagon::A2_sub) :
+                              (RegToImm ? TII->get(Hexagon::A2_subri) :
+                                          TII->get(Hexagon::A2_addi));
     unsigned SubR = MRI->createVirtualRegister(IntRC);
     MachineInstrBuilder SubIB =
       BuildMI(*PH, InsertPos, DL, SubD, SubR);
@@ -811,7 +803,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
   } else {
     // Generate CountR = ADD DistR, AdjVal
     unsigned AddR = MRI->createVirtualRegister(IntRC);
-    const MCInstrDesc &AddD = TII->get(Hexagon::ADD_ri);
+    MCInstrDesc const &AddD = TII->get(Hexagon::A2_addi);
     BuildMI(*PH, InsertPos, DL, AddD, AddR)
       .addReg(DistR, 0, DistSR)
       .addImm(AdjV);
@@ -832,7 +824,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
 
     // Generate NormR = LSR DistR, Shift.
     unsigned LsrR = MRI->createVirtualRegister(IntRC);
-    const MCInstrDesc &LsrD = TII->get(Hexagon::LSR_ri);
+    const MCInstrDesc &LsrD = TII->get(Hexagon::S2_lsr_i_r);
     BuildMI(*PH, InsertPos, DL, LsrD, LsrR)
       .addReg(AdjR, 0, AdjSR)
       .addImm(Shift);
@@ -1086,7 +1078,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) {
     BuildMI(*Preheader, InsertPos, DL, TII->get(TargetOpcode::COPY), CountReg)
       .addReg(TripCount->getReg(), 0, TripCount->getSubReg());
     // Add the Loop instruction to the beginning of the loop.
-    BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_r))
+    BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::J2_loop0r))
       .addMBB(LoopStart)
       .addReg(CountReg);
   } else {
@@ -1095,14 +1087,14 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) {
     // if the immediate fits in the instructions.  Otherwise, we need to
     // create a new virtual register.
     int64_t CountImm = TripCount->getImm();
-    if (!TII->isValidOffset(Hexagon::LOOP0_i, CountImm)) {
+    if (!TII->isValidOffset(Hexagon::J2_loop0i, CountImm)) {
       unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
-      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::TFRI), CountReg)
+      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::A2_tfrsi), CountReg)
         .addImm(CountImm);
-      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_r))
+      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::J2_loop0r))
         .addMBB(LoopStart).addReg(CountReg);
     } else
-      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_i))
+      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::J2_loop0i))
         .addMBB(LoopStart).addImm(CountImm);
   }
 
@@ -1122,8 +1114,8 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) {
   // The loop ends with either:
   //  - a conditional branch followed by an unconditional branch, or
   //  - a conditional branch to the loop start.
-  if (LastI->getOpcode() == Hexagon::JMP_t ||
-      LastI->getOpcode() == Hexagon::JMP_f) {
+  if (LastI->getOpcode() == Hexagon::J2_jumpt ||
+      LastI->getOpcode() == Hexagon::J2_jumpf) {
     // Delete one and change/add an uncond. branch to out of the loop.
     MachineBasicBlock *BranchTarget = LastI->getOperand(1).getMBB();
     LastI = LastMBB->erase(LastI);
@@ -1194,8 +1186,8 @@ MachineInstr *HexagonHardwareLoops::defWithImmediate(unsigned R) {
   MachineInstr *DI = MRI->getVRegDef(R);
   unsigned DOpc = DI->getOpcode();
   switch (DOpc) {
-    case Hexagon::TFRI:
-    case Hexagon::TFRI64:
+    case Hexagon::A2_tfrsi:
+    case Hexagon::A2_tfrpi:
     case Hexagon::CONST32_Int_Real:
     case Hexagon::CONST64_Int_Real:
       return DI;
@@ -1277,7 +1269,7 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
       unsigned PhiReg = Phi->getOperand(i).getReg();
       MachineInstr *DI = MRI->getVRegDef(PhiReg);
       unsigned UpdOpc = DI->getOpcode();
-      bool isAdd = (UpdOpc == Hexagon::ADD_ri);
+      bool isAdd = (UpdOpc == Hexagon::A2_addi);
 
       if (isAdd) {
         // If the register operand to the add/sub is the PHI we are looking
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index dc58c42..fb056b5 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -47,7 +47,7 @@ namespace {
 class HexagonDAGToDAGISel : public SelectionDAGISel {
   /// Subtarget - Keep a pointer to the Hexagon Subtarget around so that we can
   /// make the right decision when generating code for different targets.
-  const HexagonSubtarget &Subtarget;
+  const HexagonSubtarget *Subtarget;
 
   // Keep a reference to HexagonTargetMachine.
   const HexagonTargetMachine& TM;
@@ -55,9 +55,7 @@ class HexagonDAGToDAGISel : public SelectionDAGISel {
 public:
   explicit HexagonDAGToDAGISel(HexagonTargetMachine &targetmachine,
                                CodeGenOpt::Level OptLevel)
-    : SelectionDAGISel(targetmachine, OptLevel),
-      Subtarget(targetmachine.getSubtarget<HexagonSubtarget>()),
-      TM(targetmachine) {
+      : SelectionDAGISel(targetmachine, OptLevel), TM(targetmachine) {
     initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry());
   }
   bool hasNumUsesBelowThresGA(SDNode *N) const;
@@ -79,10 +77,21 @@ public:
   bool SelectADDRriU6_1(SDValue& N, SDValue &R1, SDValue &R2);
   bool SelectADDRriU6_2(SDValue& N, SDValue &R1, SDValue &R2);
 
+  // Complex Pattern Selectors.
+  inline bool SelectAddrGA(SDValue &N, SDValue &R);
+  inline bool SelectAddrGP(SDValue &N, SDValue &R);
+  bool SelectGlobalAddress(SDValue &N, SDValue &R, bool UseGP);
+  bool SelectAddrFI(SDValue &N, SDValue &R);
+
   const char *getPassName() const override {
     return "Hexagon DAG->DAG Pattern Instruction Selection";
   }
 
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    Subtarget = &MF.getSubtarget<HexagonSubtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
+  }
+
   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
   /// inline asm expressions.
   bool SelectInlineAsmMemoryOperand(const SDValue &Op,
@@ -138,9 +147,7 @@ SDValue XformMskToBitPosU3Imm(uint8_t Imm) {
 // Return true if there is exactly one bit set in V, i.e., if V is one of the
 // following integers: 2^0, 2^1, ..., 2^31.
 bool ImmIsSingleBit(uint32_t v) const {
-  uint32_t c = CountPopulation_64(v);
-  // Only return true if we counted 1 bit.
-  return c == 1;
+  return isPowerOf2_32(v);
 }
 
 // XformM5ToU5Imm - Return a target constant with the specified value, of type
@@ -170,8 +177,21 @@ inline SDValue XformUToUM1Imm(unsigned Imm) {
   return CurDAG->getTargetConstant(Imm - 1, MVT::i32);
 }
 
+// XformSToSM2Imm - Return a target constant decremented by 2.
+inline SDValue XformSToSM2Imm(unsigned Imm) {
+  return CurDAG->getTargetConstant(Imm - 2, MVT::i32);
+}
+
+// XformSToSM3Imm - Return a target constant decremented by 3.
+inline SDValue XformSToSM3Imm(unsigned Imm) {
+  return CurDAG->getTargetConstant(Imm - 3, MVT::i32);
+}
+
 // Include the pieces autogenerated from the target description.
 #include "HexagonGenDAGISel.inc"
+
+private:
+  bool isValueExtension(SDValue const &Val, unsigned FromBits, SDValue &Src);
 };
 }  // end anonymous namespace
 
@@ -312,56 +332,6 @@ static unsigned doesIntrinsicReturnPredicate(unsigned ID)
   }
 }
 
-
-// Intrinsics that have predicate operands.
-static unsigned doesIntrinsicContainPredicate(unsigned ID)
-{
-  switch (ID) {
-    default:
-      return 0;
-    case Intrinsic::hexagon_C2_tfrpr:
-      return Hexagon::TFR_RsPd;
-    case Intrinsic::hexagon_C2_and:
-      return Hexagon::AND_pp;
-    case Intrinsic::hexagon_C2_xor:
-      return Hexagon::XOR_pp;
-    case Intrinsic::hexagon_C2_or:
-      return Hexagon::OR_pp;
-    case Intrinsic::hexagon_C2_not:
-      return Hexagon::NOT_p;
-    case Intrinsic::hexagon_C2_any8:
-      return Hexagon::ANY_pp;
-    case Intrinsic::hexagon_C2_all8:
-      return Hexagon::ALL_pp;
-    case Intrinsic::hexagon_C2_vitpack:
-      return Hexagon::VITPACK_pp;
-    case Intrinsic::hexagon_C2_mask:
-      return Hexagon::MASK_p;
-    case Intrinsic::hexagon_C2_mux:
-      return Hexagon::MUX_rr;
-
-      // Mapping hexagon_C2_muxir to MUX_pri.  This is pretty weird - but
-      // that's how it's mapped in q6protos.h.
-    case Intrinsic::hexagon_C2_muxir:
-      return Hexagon::MUX_ri;
-
-      // Mapping hexagon_C2_muxri to MUX_pir.  This is pretty weird - but
-      // that's how it's mapped in q6protos.h.
-    case Intrinsic::hexagon_C2_muxri:
-      return Hexagon::MUX_ir;
-
-    case Intrinsic::hexagon_C2_muxii:
-      return Hexagon::MUX_ii;
-    case Intrinsic::hexagon_C2_vmux:
-      return Hexagon::VMUX_prr64;
-    case Intrinsic::hexagon_S2_valignrb:
-      return Hexagon::VALIGN_rrp;
-    case Intrinsic::hexagon_S2_vsplicerb:
-      return Hexagon::VSPLICE_rrp;
-  }
-}
-
-
 static bool OffsetFitsS11(EVT MemType, int64_t Offset) {
   if (MemType == MVT::i64 && isShiftedInt<11,3>(Offset)) {
     return true;
@@ -404,10 +374,10 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, SDLoc dl) {
                                                dl, PointerTy,
                                                TargAddr);
       // Figure out base + offset opcode
-      if (LoadedVT == MVT::i64) Opcode = Hexagon::LDrid_indexed;
-      else if (LoadedVT == MVT::i32) Opcode = Hexagon::LDriw_indexed;
-      else if (LoadedVT == MVT::i16) Opcode = Hexagon::LDrih_indexed;
-      else if (LoadedVT == MVT::i8) Opcode = Hexagon::LDrib_indexed;
+      if (LoadedVT == MVT::i64) Opcode = Hexagon::L2_loadrd_io;
+      else if (LoadedVT == MVT::i32) Opcode = Hexagon::L2_loadri_io;
+      else if (LoadedVT == MVT::i16) Opcode = Hexagon::L2_loadrh_io;
+      else if (LoadedVT == MVT::i8) Opcode = Hexagon::L2_loadrb_io;
       else llvm_unreachable("unknown memory type");
 
       // Build indexed load.
@@ -446,14 +416,13 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
 
   if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
       N1.getNode()->getValueType(0) == MVT::i32) {
-    const HexagonInstrInfo *TII = static_cast<const HexagonInstrInfo *>(
-        TM.getSubtargetImpl()->getInstrInfo());
+    const HexagonInstrInfo *TII = Subtarget->getInstrInfo();
     if (TII->isValidAutoIncImm(LoadedVT, Val)) {
       SDValue TargetConst = CurDAG->getTargetConstant(Val, MVT::i32);
       SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32,
                                                 MVT::Other, Base, TargetConst,
                                                 Chain);
-      SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::SXTW, dl, MVT::i64,
+      SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_sxtw, dl, MVT::i64,
                                                 SDValue(Result_1, 0));
       MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
       MemOp[0] = LD->getMemOperand();
@@ -474,9 +443,9 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
     SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
                                               MVT::Other, Base, TargetConst0,
                                               Chain);
-    SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::SXTW, dl,
+    SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_sxtw, dl,
                                                 MVT::i64, SDValue(Result_1, 0));
-    SDNode* Result_3 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl,
+    SDNode* Result_3 = CurDAG->getMachineNode(Hexagon::A2_addi, dl,
                                               MVT::i32, Base, TargetConstVal,
                                                 SDValue(Result_1, 1));
     MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
@@ -513,17 +482,16 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
 
   if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
       N1.getNode()->getValueType(0) == MVT::i32) {
-    const HexagonInstrInfo *TII = static_cast<const HexagonInstrInfo *>(
-        TM.getSubtargetImpl()->getInstrInfo());
+    const HexagonInstrInfo *TII = Subtarget->getInstrInfo();
     if (TII->isValidAutoIncImm(LoadedVT, Val)) {
       SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
       SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
       SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
                                                 MVT::i32, MVT::Other, Base,
                                                 TargetConstVal, Chain);
-      SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32,
+      SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32,
                                                 TargetConst0);
-      SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl,
+      SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::A2_combinew, dl,
                                                 MVT::i64, MVT::Other,
                                                 SDValue(Result_2,0),
                                                 SDValue(Result_1,0));
@@ -548,14 +516,14 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
     SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
                                               MVT::Other,
                                               Base, TargetConst0, Chain);
-    SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32,
+    SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32,
                                               TargetConst0);
-    SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl,
+    SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::A2_combinew, dl,
                                               MVT::i64, MVT::Other,
                                               SDValue(Result_2,0),
                                               SDValue(Result_1,0));
     // Add offset to base.
-    SDNode* Result_4 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32,
+    SDNode* Result_4 = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
                                               Base, TargetConstVal,
                                               SDValue(Result_1, 1));
     MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
@@ -591,28 +559,27 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) {
   bool zextval = (LD->getExtensionType() == ISD::ZEXTLOAD);
 
   // Figure out the opcode.
-  const HexagonInstrInfo *TII = static_cast<const HexagonInstrInfo *>(
-      TM.getSubtargetImpl()->getInstrInfo());
+  const HexagonInstrInfo *TII = Subtarget->getInstrInfo();
   if (LoadedVT == MVT::i64) {
     if (TII->isValidAutoIncImm(LoadedVT, Val))
-      Opcode = Hexagon::POST_LDrid;
+      Opcode = Hexagon::L2_loadrd_pi;
     else
-      Opcode = Hexagon::LDrid;
+      Opcode = Hexagon::L2_loadrd_io;
   } else if (LoadedVT == MVT::i32) {
     if (TII->isValidAutoIncImm(LoadedVT, Val))
-      Opcode = Hexagon::POST_LDriw;
+      Opcode = Hexagon::L2_loadri_pi;
     else
-      Opcode = Hexagon::LDriw;
+      Opcode = Hexagon::L2_loadri_io;
   } else if (LoadedVT == MVT::i16) {
     if (TII->isValidAutoIncImm(LoadedVT, Val))
-      Opcode = zextval ? Hexagon::POST_LDriuh : Hexagon::POST_LDrih;
+      Opcode = zextval ? Hexagon::L2_loadruh_pi : Hexagon::L2_loadrh_pi;
     else
-      Opcode = zextval ? Hexagon::LDriuh : Hexagon::LDrih;
+      Opcode = zextval ? Hexagon::L2_loadruh_io : Hexagon::L2_loadrh_io;
   } else if (LoadedVT == MVT::i8) {
     if (TII->isValidAutoIncImm(LoadedVT, Val))
-      Opcode = zextval ? Hexagon::POST_LDriub : Hexagon::POST_LDrib;
+      Opcode = zextval ? Hexagon::L2_loadrub_pi : Hexagon::L2_loadrb_pi;
     else
-      Opcode = zextval ? Hexagon::LDriub : Hexagon::LDrib;
+      Opcode = zextval ? Hexagon::L2_loadrub_io : Hexagon::L2_loadrb_io;
   } else
     llvm_unreachable("unknown memory type");
 
@@ -652,7 +619,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) {
                                               LD->getValueType(0),
                                               MVT::Other, Base, TargetConst0,
                                               Chain);
-    SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32,
+    SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
                                               Base, TargetConstVal,
                                               SDValue(Result_1, 1));
     MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
@@ -701,18 +668,17 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) {
 
   // Offset value must be within representable range
   // and must have correct alignment properties.
-  const HexagonInstrInfo *TII = static_cast<const HexagonInstrInfo *>(
-      TM.getSubtargetImpl()->getInstrInfo());
+  const HexagonInstrInfo *TII = Subtarget->getInstrInfo();
   if (TII->isValidAutoIncImm(StoredVT, Val)) {
     SDValue Ops[] = {Base, CurDAG->getTargetConstant(Val, MVT::i32), Value,
                      Chain};
     unsigned Opcode = 0;
 
     // Figure out the post inc version of opcode.
-    if (StoredVT == MVT::i64) Opcode = Hexagon::POST_STdri;
-    else if (StoredVT == MVT::i32) Opcode = Hexagon::POST_STwri;
-    else if (StoredVT == MVT::i16) Opcode = Hexagon::POST_SThri;
-    else if (StoredVT == MVT::i8) Opcode = Hexagon::POST_STbri;
+    if (StoredVT == MVT::i64) Opcode = Hexagon::S2_storerd_pi;
+    else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_pi;
+    else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_pi;
+    else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_pi;
     else llvm_unreachable("unknown memory type");
 
     // Build post increment store.
@@ -735,17 +701,17 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) {
   unsigned Opcode = 0;
 
   // Figure out the opcode.
-  if (StoredVT == MVT::i64) Opcode = Hexagon::STrid;
-  else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed;
-  else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih;
-  else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib;
+  if (StoredVT == MVT::i64) Opcode = Hexagon::S2_storerd_io;
+  else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_io;
+  else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_io;
+  else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_io;
   else llvm_unreachable("unknown memory type");
 
   // Build regular store.
   SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
   SDNode* Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
   // Build splitted incriment instruction.
-  SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32,
+  SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
                                             Base,
                                             TargetConstVal,
                                             SDValue(Result_1, 0));
@@ -788,10 +754,10 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST,
                                                  TargAddr);
 
         // Figure out base + offset opcode
-        if (StoredVT == MVT::i64) Opcode = Hexagon::STrid_indexed;
-        else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed;
-        else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih_indexed;
-        else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib_indexed;
+        if (StoredVT == MVT::i64) Opcode = Hexagon::S2_storerd_io;
+        else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_io;
+        else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_io;
+        else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_io;
         else llvm_unreachable("unknown memory type");
 
         SDValue Ops[] = {SDValue(NewBase,0),
@@ -865,7 +831,7 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
 
       SDValue Chain = LD->getChain();
       SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
-      OP0 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+      OP0 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
                                             MVT::Other,
                                             LD->getBasePtr(), TargetConst0,
                                             Chain), 0);
@@ -891,7 +857,7 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
 
       SDValue Chain = LD->getChain();
       SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
-      OP1 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+      OP1 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
                                             MVT::Other,
                                             LD->getBasePtr(), TargetConst0,
                                             Chain), 0);
@@ -900,7 +866,7 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
     }
 
     // Generate a mpy instruction.
-    SDNode *Result = CurDAG->getMachineNode(Hexagon::MPY64, dl, MVT::i64,
+    SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_dpmpyss_s0, dl, MVT::i64,
                                             OP0, OP1);
     ReplaceUses(N, Result);
     return Result;
@@ -934,9 +900,9 @@ SDNode *HexagonDAGToDAGISel::SelectSelect(SDNode *N) {
             if (N000 == N2 &&
                 N0.getNode()->getValueType(N0.getResNo()) == MVT::i1 &&
                 N00.getNode()->getValueType(N00.getResNo()) == MVT::i32) {
-              SDNode *SextNode = CurDAG->getMachineNode(Hexagon::SXTH, dl,
+              SDNode *SextNode = CurDAG->getMachineNode(Hexagon::A2_sxth, dl,
                                                         MVT::i32, N000);
-              SDNode *Result = CurDAG->getMachineNode(Hexagon::MAXw_rr, dl,
+              SDNode *Result = CurDAG->getMachineNode(Hexagon::A2_max, dl,
                                                       MVT::i32,
                                                       SDValue(SextNode, 0),
                                                       N1);
@@ -958,9 +924,9 @@ SDNode *HexagonDAGToDAGISel::SelectSelect(SDNode *N) {
             if (N000 == N2 &&
                 N0.getNode()->getValueType(N0.getResNo()) == MVT::i1 &&
                 N00.getNode()->getValueType(N00.getResNo()) == MVT::i32) {
-              SDNode *SextNode = CurDAG->getMachineNode(Hexagon::SXTH, dl,
+              SDNode *SextNode = CurDAG->getMachineNode(Hexagon::A2_sxth, dl,
                                                         MVT::i32, N000);
-              SDNode *Result = CurDAG->getMachineNode(Hexagon::MINw_rr, dl,
+              SDNode *Result = CurDAG->getMachineNode(Hexagon::A2_min, dl,
                                                       MVT::i32,
                                                       SDValue(SextNode, 0),
                                                       N1);
@@ -1045,7 +1011,7 @@ SDNode *HexagonDAGToDAGISel::SelectTruncate(SDNode *N) {
 
         SDValue Chain = LD->getChain();
         SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
-        OP0 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+        OP0 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
                                               MVT::Other,
                                               LD->getBasePtr(),
                                               TargetConst0, Chain), 0);
@@ -1070,7 +1036,7 @@ SDNode *HexagonDAGToDAGISel::SelectTruncate(SDNode *N) {
 
         SDValue Chain = LD->getChain();
         SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
-        OP1 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+        OP1 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
                                               MVT::Other,
                                               LD->getBasePtr(),
                                               TargetConst0, Chain), 0);
@@ -1079,7 +1045,7 @@ SDNode *HexagonDAGToDAGISel::SelectTruncate(SDNode *N) {
       }
 
       // Generate a mpy instruction.
-      SDNode *Result = CurDAG->getMachineNode(Hexagon::MPY, dl, MVT::i32,
+      SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_mpy_up, dl, MVT::i32,
                                               OP0, OP1);
       ReplaceUses(N, Result);
       return Result;
@@ -1112,7 +1078,7 @@ SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
           if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val.getNode()))
             if (isInt<9>(CN->getSExtValue())) {
               SDNode* Result =
-                CurDAG->getMachineNode(Hexagon::MPYI_ri, dl,
+                CurDAG->getMachineNode(Hexagon::M2_mpysmi, dl,
                                        MVT::i32, Mul_0, Val);
               ReplaceUses(N, Result);
               return Result;
@@ -1140,7 +1106,7 @@ SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
                     dyn_cast<ConstantSDNode>(Val.getNode()))
                   if (isInt<9>(CN->getSExtValue())) {
                     SDNode* Result =
-                      CurDAG->getMachineNode(Hexagon::MPYI_ri, dl, MVT::i32,
+                      CurDAG->getMachineNode(Hexagon::M2_mpysmi, dl, MVT::i32,
                                              Shl2_0, Val);
                     ReplaceUses(N, Result);
                     return Result;
@@ -1177,13 +1143,13 @@ SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
       if (N->getValueType(0) == MVT::i64) {
         // Convert the zero_extend to Rs = Pd followed by COMBINE_rr(0,Rs).
         SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
-        SDNode *Result_1 = CurDAG->getMachineNode(Hexagon::TFR_RsPd, dl,
+        SDNode *Result_1 = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
                                                   MVT::i32,
                                                   SDValue(IsIntrinsic, 0));
-        SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl,
+        SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl,
                                                   MVT::i32,
                                                   TargetConst0);
-        SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl,
+        SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::A2_combinew, dl,
                                                   MVT::i64, MVT::Other,
                                                   SDValue(Result_2, 0),
                                                   SDValue(Result_1, 0));
@@ -1192,7 +1158,7 @@ SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
       }
       if (N->getValueType(0) == MVT::i32) {
         // Convert the zero_extend to Rs = Pd
-        SDNode* RsPd = CurDAG->getMachineNode(Hexagon::TFR_RsPd, dl,
+        SDNode* RsPd = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
                                               MVT::i32,
                                               SDValue(IsIntrinsic, 0));
         ReplaceUses(N, RsPd);
@@ -1204,56 +1170,30 @@ SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
   return SelectCode(N);
 }
 
-
 //
 // Checking for intrinsics which have predicate registers as operand(s)
 // and lowering to the actual intrinsic.
 //
 SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
-  SDLoc dl(N);
-  unsigned ID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
-  unsigned IntrinsicWithPred = doesIntrinsicContainPredicate(ID);
-
-  // We are concerned with only those intrinsics that have predicate registers
-  // as at least one of the operands.
-  if (IntrinsicWithPred) {
-    SmallVector<SDValue, 8> Ops;
-    const HexagonInstrInfo *TII = static_cast<const HexagonInstrInfo *>(
-        TM.getSubtargetImpl()->getInstrInfo());
-    const MCInstrDesc &MCID = TII->get(IntrinsicWithPred);
-    const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
-
-    // Iterate over all the operands of the intrinsics.
-    // For PredRegs, do the transfer.
-    // For Double/Int Regs, just preserve the value
-    // For immediates, lower it.
-    for (unsigned i = 1; i < N->getNumOperands(); ++i) {
-      SDNode *Arg = N->getOperand(i).getNode();
-      const TargetRegisterClass *RC = TII->getRegClass(MCID, i, TRI, *MF);
-
-      if (RC == &Hexagon::IntRegsRegClass ||
-          RC == &Hexagon::DoubleRegsRegClass) {
-        Ops.push_back(SDValue(Arg, 0));
-      } else if (RC == &Hexagon::PredRegsRegClass) {
-        // Do the transfer.
-        SDNode *PdRs = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1,
-                                              SDValue(Arg, 0));
-        Ops.push_back(SDValue(PdRs,0));
-      } else if (!RC && (dyn_cast<ConstantSDNode>(Arg) != nullptr)) {
-        // This is immediate operand. Lower it here making sure that we DO have
-        // const SDNode for immediate value.
-        int32_t Val = cast<ConstantSDNode>(Arg)->getSExtValue();
-        SDValue SDVal = CurDAG->getTargetConstant(Val, MVT::i32);
-        Ops.push_back(SDVal);
-      } else {
-        llvm_unreachable("Unimplemented");
-      }
-    }
-    EVT ReturnValueVT = N->getValueType(0);
-    SDNode *Result = CurDAG->getMachineNode(IntrinsicWithPred, dl,
-                                            ReturnValueVT, Ops);
-    ReplaceUses(N, Result);
-    return Result;
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  unsigned Bits;
+  switch (IID) {
+  case Intrinsic::hexagon_S2_vsplatrb:
+    Bits = 8;
+    break;
+  case Intrinsic::hexagon_S2_vsplatrh:
+    Bits = 16;
+    break;
+  default:
+    return SelectCode(N);
+  }
+
+  SDValue const &V = N->getOperand(1);
+  SDValue U;
+  if (isValueExtension(V, Bits, U)) {
+    SDValue R = CurDAG->getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+      N->getOperand(0), U);
+    return SelectCode(R.getNode());
   }
   return SelectCode(N);
 }
@@ -1289,19 +1229,19 @@ SDNode *HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
     if (Val == -1) {
       // Create the IntReg = 1 node.
       SDNode* IntRegTFR =
-        CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32,
+        CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32,
                                CurDAG->getTargetConstant(0, MVT::i32));
 
       // Pd = IntReg
-      SDNode* Pd = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1,
+      SDNode* Pd = CurDAG->getMachineNode(Hexagon::C2_tfrrp, dl, MVT::i1,
                                           SDValue(IntRegTFR, 0));
 
       // not(Pd)
-      SDNode* NotPd = CurDAG->getMachineNode(Hexagon::NOT_p, dl, MVT::i1,
+      SDNode* NotPd = CurDAG->getMachineNode(Hexagon::C2_not, dl, MVT::i1,
                                              SDValue(Pd, 0));
 
       // xor(not(Pd))
-      Result = CurDAG->getMachineNode(Hexagon::XOR_pp, dl, MVT::i1,
+      Result = CurDAG->getMachineNode(Hexagon::C2_xor, dl, MVT::i1,
                                       SDValue(Pd, 0), SDValue(NotPd, 0));
 
       // We have just built:
@@ -1334,7 +1274,7 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
 
   // Build Rd = Rd' + asr(Rs, Rt). The machine constraints will ensure that
   // Rd and Rd' are assigned to the same register
-  SDNode* Result = CurDAG->getMachineNode(Hexagon::ASR_ADD_rr, dl, MVT::i32,
+  SDNode* Result = CurDAG->getMachineNode(Hexagon::S2_asr_r_r_acc, dl, MVT::i32,
                                           N->getOperand(1),
                                           Src1->getOperand(0),
                                           Src1->getOperand(1));
@@ -1683,3 +1623,126 @@ bool HexagonDAGToDAGISel::foldGlobalAddressImpl(SDValue &N, SDValue &R,
   }
   return false;
 }
+
+bool HexagonDAGToDAGISel::SelectAddrFI(SDValue& N, SDValue &R) {
+  if (N.getOpcode() != ISD::FrameIndex)
+    return false;
+  FrameIndexSDNode *FX = cast<FrameIndexSDNode>(N);
+  R = CurDAG->getTargetFrameIndex(FX->getIndex(), MVT::i32);
+  return true;
+}
+
+inline bool HexagonDAGToDAGISel::SelectAddrGA(SDValue &N, SDValue &R) {
+  return SelectGlobalAddress(N, R, false);
+}
+
+inline bool HexagonDAGToDAGISel::SelectAddrGP(SDValue &N, SDValue &R) {
+  return SelectGlobalAddress(N, R, true);
+}
+
+bool HexagonDAGToDAGISel::SelectGlobalAddress(SDValue &N, SDValue &R,
+                                              bool UseGP) {
+  switch (N.getOpcode()) {
+  case ISD::ADD: {
+    SDValue N0 = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    unsigned GAOpc = N0.getOpcode();
+    if (UseGP && GAOpc != HexagonISD::CONST32_GP)
+      return false;
+    if (!UseGP && GAOpc != HexagonISD::CONST32)
+      return false;
+    if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N1)) {
+      SDValue Addr = N0.getOperand(0);
+      if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Addr)) {
+        if (GA->getOpcode() == ISD::TargetGlobalAddress) {
+          uint64_t NewOff = GA->getOffset() + (uint64_t)Const->getSExtValue();
+          R = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(Const),
+                                             N.getValueType(), NewOff);
+          return true;
+        }
+      }
+    }
+    break;
+  }
+  case HexagonISD::CONST32:
+    // The operand(0) of CONST32 is TargetGlobalAddress, which is what we
+    // want in the instruction.
+    if (!UseGP)
+      R = N.getOperand(0);
+    return !UseGP;
+  case HexagonISD::CONST32_GP:
+    if (UseGP)
+      R = N.getOperand(0);
+    return UseGP;
+  default:
+    return false;
+  }
+
+  return false;
+}
+
+bool HexagonDAGToDAGISel::isValueExtension(SDValue const &Val,
+                                           unsigned FromBits, SDValue &Src) {
+  unsigned Opc = Val.getOpcode();
+  switch (Opc) {
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: {
+    SDValue const &Op0 = Val.getOperand(0);
+    EVT T = Op0.getValueType();
+    if (T.isInteger() && T.getSizeInBits() == FromBits) {
+      Src = Op0;
+      return true;
+    }
+    break;
+  }
+  case ISD::SIGN_EXTEND_INREG:
+  case ISD::AssertSext:
+  case ISD::AssertZext:
+    if (Val.getOperand(0).getValueType().isInteger()) {
+      VTSDNode *T = cast<VTSDNode>(Val.getOperand(1));
+      if (T->getVT().getSizeInBits() == FromBits) {
+        Src = Val.getOperand(0);
+        return true;
+      }
+    }
+    break;
+  case ISD::AND: {
+    // Check if this is an AND with "FromBits" of lower bits set to 1.
+    uint64_t FromMask = (1 << FromBits) - 1;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) {
+      if (C->getZExtValue() == FromMask) {
+        Src = Val.getOperand(1);
+        return true;
+      }
+    }
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) {
+      if (C->getZExtValue() == FromMask) {
+        Src = Val.getOperand(0);
+        return true;
+      }
+    }
+    break;
+  }
+  case ISD::OR:
+  case ISD::XOR: {
+    // OR/XOR with the lower "FromBits" bits set to 0.
+    uint64_t FromMask = (1 << FromBits) - 1;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) {
+      if ((C->getZExtValue() & FromMask) == 0) {
+        Src = Val.getOperand(1);
+        return true;
+      }
+    }
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) {
+      if ((C->getZExtValue() & FromMask) == 0) {
+        Src = Val.getOperand(0);
+        return true;
+      }
+    }
+  }
+  default:
+    break;
+  }
+  return false;
+}
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 7646088..0072994 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -188,7 +188,7 @@ static bool CC_Hexagon32(unsigned ValNo, MVT ValVT,
     Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
     Hexagon::R5
   };
-  if (unsigned Reg = State.AllocateReg(RegList, 6)) {
+  if (unsigned Reg = State.AllocateReg(RegList)) {
     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
     return false;
   }
@@ -213,7 +213,7 @@ static bool CC_Hexagon64(unsigned ValNo, MVT ValVT,
   static const MCPhysReg RegList2[] = {
     Hexagon::R1, Hexagon::R3
   };
-  if (unsigned Reg = State.AllocateReg(RegList1, RegList2, 2)) {
+  if (unsigned Reg = State.AllocateReg(RegList1, RegList2)) {
     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
     return false;
   }
@@ -404,6 +404,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   bool &isTailCall                      = CLI.IsTailCall;
   CallingConv::ID CallConv              = CLI.CallConv;
   bool isVarArg                         = CLI.IsVarArg;
+  bool doesNotReturn                    = CLI.DoesNotReturn;
 
   bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
 
@@ -462,8 +463,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
 
-  const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
-      DAG.getSubtarget().getRegisterInfo());
+  const HexagonRegisterInfo *QRI = Subtarget->getRegisterInfo();
   SDValue StackPtr =
       DAG.getCopyFromReg(Chain, dl, QRI->getStackRegister(), getPointerTy());
 
@@ -597,7 +597,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (isTailCall)
     return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, Ops);
 
-  Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
+  int OpCode = doesNotReturn ? HexagonISD::CALLv3nr : HexagonISD::CALLv3;
+  Chain = DAG.getNode(OpCode, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
@@ -720,9 +721,7 @@ SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op,
                 cast<RegisterSDNode>(Node->getOperand(i))->getReg();
 
               // Check it to be lr
-              const HexagonRegisterInfo *QRI =
-                  static_cast<const HexagonRegisterInfo *>(
-                      DAG.getSubtarget().getRegisterInfo());
+              const HexagonRegisterInfo *QRI = Subtarget->getRegisterInfo();
               if (Reg == QRI->getRARegister()) {
                 FuncInfo->setHasClobberLR(true);
                 break;
@@ -815,8 +814,7 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
 
   // The Sub result contains the new stack start address, so it
   // must be placed in the stack pointer register.
-  const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
-      DAG.getSubtarget().getRegisterInfo());
+  const HexagonRegisterInfo *QRI = Subtarget->getRegisterInfo();
   SDValue CopyChain = DAG.getCopyToReg(Chain, dl, QRI->getStackRegister(), Sub);
 
   SDValue Ops[2] = { ArgAdjust, CopyChain };
@@ -875,7 +873,7 @@ const {
           RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
         RegInfo.addLiveIn(VA.getLocReg(), VReg);
         InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
-      } else if (RegVT == MVT::i64) {
+      } else if (RegVT == MVT::i64 || RegVT == MVT::f64) {
         unsigned VReg =
           RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
         RegInfo.addLiveIn(VA.getLocReg(), VReg);
@@ -963,7 +961,7 @@ HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue
 HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
-  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MFI->setReturnAddressIsTaken(true);
@@ -989,8 +987,7 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue
 HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
-  const HexagonRegisterInfo *TRI = static_cast<const HexagonRegisterInfo *>(
-      DAG.getSubtarget().getRegisterInfo());
+  const HexagonRegisterInfo *TRI = Subtarget->getRegisterInfo();
   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   MFI->setFrameAddressIsTaken(true);
 
@@ -1021,9 +1018,10 @@ SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op,
   SDLoc dl(Op);
   Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
 
-  const HexagonTargetObjectFile &TLOF =
-      static_cast<const HexagonTargetObjectFile &>(getObjFileLowering());
-  if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
+  const HexagonTargetObjectFile *TLOF =
+      static_cast<const HexagonTargetObjectFile *>(
+          getTargetMachine().getObjFileLowering());
+  if (TLOF->IsGlobalInSmallSection(GV, getTargetMachine())) {
     return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), Result);
   }
 
@@ -1042,24 +1040,22 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
 // TargetLowering Implementation
 //===----------------------------------------------------------------------===//
 
-HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
-    : TargetLowering(targetmachine),
-      TM(targetmachine) {
-
-  const HexagonSubtarget &Subtarget = TM.getSubtarget<HexagonSubtarget>();
+HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
+                                             const HexagonSubtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
 
   // Set up the register classes.
   addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
   addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
 
-  if (Subtarget.hasV5TOps()) {
+  if (Subtarget->hasV5TOps()) {
     addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
     addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
   }
 
   addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass);
 
-  computeRegisterProperties();
+  computeRegisterProperties(Subtarget->getRegisterInfo());
 
   // Align loop entry
   setPrefLoopAlignment(4);
@@ -1109,15 +1105,22 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
   setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3");
   setOperationAction(ISD::FDIV, MVT::f64, Expand);
 
+  setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
+  setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3");
+  setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3");
+
   setOperationAction(ISD::FSQRT, MVT::f32, Expand);
   setOperationAction(ISD::FSQRT, MVT::f64, Expand);
   setOperationAction(ISD::FSIN, MVT::f32, Expand);
   setOperationAction(ISD::FSIN, MVT::f64, Expand);
 
-  if (Subtarget.hasV5TOps()) {
+  if (Subtarget->hasV5TOps()) {
     // Hexagon V5 Support.
     setOperationAction(ISD::FADD, MVT::f32, Legal);
-    setOperationAction(ISD::FADD, MVT::f64, Legal);
+    setOperationAction(ISD::FADD, MVT::f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::f32, Legal);
+    setOperationAction(ISD::FSUB, MVT::f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::f64, Expand);
     setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
     setCondCodeAction(ISD::SETOEQ, MVT::f32, Legal);
     setCondCodeAction(ISD::SETOEQ, MVT::f64, Legal);
@@ -1202,11 +1205,14 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
     setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi");
     setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi");
 
-    setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
-    setOperationAction(ISD::FADD, MVT::f64, Expand);
 
     setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
     setOperationAction(ISD::FADD, MVT::f32, Expand);
+    setOperationAction(ISD::FADD, MVT::f64, Expand);
+
+    setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3");
+    setOperationAction(ISD::FSUB, MVT::f32, Expand);
+    setOperationAction(ISD::FSUB, MVT::f64, Expand);
 
     setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2");
     setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand);
@@ -1247,7 +1253,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
     setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2");
     setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
 
-    setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3");
     setOperationAction(ISD::FMUL, MVT::f64, Expand);
 
     setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3");
@@ -1301,9 +1306,11 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
   setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
 
   // Turn FP extload into load/fextend.
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  for (MVT VT : MVT::fp_valuetypes())
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
   // Hexagon has a i1 sign extending load.
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+  for (MVT VT : MVT::integer_valuetypes())
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
   // Turn FP truncstore into trunc + store.
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 
@@ -1333,7 +1340,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
   setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 
-  if (Subtarget.hasV5TOps()) {
+  if (Subtarget->hasV5TOps()) {
 
     // We need to make the operation type of SELECT node to be Custom,
     // such that we don't go into the infinite loop of
@@ -1422,19 +1429,15 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
 
   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
-
+  
+  setOperationAction(ISD::MULHS, MVT::i64, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
 
   setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
 
-  if (Subtarget.isSubtargetV2()) {
-    setExceptionPointerRegister(Hexagon::R20);
-    setExceptionSelectorRegister(Hexagon::R21);
-  } else {
-    setExceptionPointerRegister(Hexagon::R0);
-    setExceptionSelectorRegister(Hexagon::R1);
-  }
+  setExceptionPointerRegister(Hexagon::R0);
+  setExceptionSelectorRegister(Hexagon::R1);
 
   // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
   setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -1452,8 +1455,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
   setMinFunctionAlignment(2);
 
   // Needed for DYNAMIC_STACKALLOC expansion.
-  const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
-      TM.getSubtargetImpl()->getRegisterInfo());
+  const HexagonRegisterInfo *QRI = Subtarget->getRegisterInfo();
   setStackPointerRegisterToSaveRestore(QRI->getStackRegister());
   setSchedulingPreference(Sched::VLIW);
 }
@@ -1476,7 +1478,9 @@ HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
     case HexagonISD::Lo:          return "HexagonISD::Lo";
     case HexagonISD::FTOI:        return "HexagonISD::FTOI";
     case HexagonISD::ITOF:        return "HexagonISD::ITOF";
-    case HexagonISD::CALL:        return "HexagonISD::CALL";
+    case HexagonISD::CALLv3:      return "HexagonISD::CALLv3";
+    case HexagonISD::CALLv3nr:    return "HexagonISD::CALLv3nr";
+    case HexagonISD::CALLR:       return "HexagonISD::CALLR";
     case HexagonISD::RET_FLAG:    return "HexagonISD::RET_FLAG";
     case HexagonISD::BR_JT:       return "HexagonISD::BR_JT";
     case HexagonISD::TC_RETURN:   return "HexagonISD::TC_RETURN";
@@ -1591,10 +1595,10 @@ const {
 // Inline Assembly Support
 //===----------------------------------------------------------------------===//
 
-std::pair<unsigned, const TargetRegisterClass*>
-HexagonTargetLowering::getRegForInlineAsmConstraint(const
-                                                    std::string &Constraint,
-                                                    MVT VT) const {
+std::pair<unsigned, const TargetRegisterClass *>
+HexagonTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, const std::string &Constraint,
+    MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     case 'r':   // R0-R31
@@ -1615,14 +1619,14 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(const
     }
   }
 
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 
 /// isFPImmLegal - Returns true if the target can instruction select the
 /// specified FP immediate natively. If false, the legalizer will
 /// materialize the FP immediate as a load from a constant pool.
 bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
-  return TM.getSubtarget<HexagonSubtarget>().hasV5TOps();
+  return Subtarget->hasV5TOps();
 }
 
 /// isLegalAddressingMode - Return true if the addressing mode represented by
@@ -1705,3 +1709,17 @@ bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
   // information is not available.
   return true;
 }
+
+// Return true when the given node fits in a positive half word.
+bool llvm::isPositiveHalfWord(SDNode *N) {
+  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
+  if (CN && CN->getSExtValue() > 0 && isInt<16>(CN->getSExtValue()))
+    return true;
+
+  switch (N->getOpcode()) {
+  default:
+    return false;
+  case ISD::SIGN_EXTEND_INREG:
+    return true;
+  }
+}
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 63e4392..151c28f 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -21,6 +21,10 @@
 #include "llvm/Target/TargetLowering.h"
 
 namespace llvm {
+
+// Return true when the given node fits in a positive half word.
+bool isPositiveHalfWord(SDNode *N);
+
   namespace HexagonISD {
     enum {
       FIRST_NUMBER = ISD::BUILTIN_OP_END,
@@ -45,10 +49,15 @@ namespace llvm {
       FTOI,        // FP to Int within a FP register.
       ITOF,        // Int to FP within a FP register.
 
-      CALL,        // A call instruction.
+      CALLv3,      // A V3+ call instruction.
+      CALLv3nr,    // A V3+ call instruction that doesn't return.
+      CALLR,
+
       RET_FLAG,    // Return with a flag operand.
       BR_JT,       // Jump table.
-      BARRIER,     // Memory barrier.
+      BARRIER,     // Memory barrier
+      POPCOUNT,
+      COMBINE,
       WrapperJT,
       WrapperCP,
       WrapperCombineII,
@@ -63,10 +72,13 @@ namespace llvm {
       WrapperShuffOB,
       WrapperShuffOH,
       TC_RETURN,
-      EH_RETURN
+      EH_RETURN,
+      DCFETCH
     };
   }
 
+  class HexagonSubtarget;
+
   class HexagonTargetLowering : public TargetLowering {
     int VarArgsFrameOffset;   // Frame offset to start of varargs area.
 
@@ -74,8 +86,9 @@ namespace llvm {
                               unsigned& RetSize) const;
 
   public:
-    const TargetMachine &TM;
-    explicit HexagonTargetLowering(const TargetMachine &targetmachine);
+    const HexagonSubtarget *Subtarget;
+    explicit HexagonTargetLowering(const TargetMachine &TM,
+                                   const HexagonSubtarget &Subtarget);
 
     /// IsEligibleForTailCallOptimization - Check whether the call is eligible
     /// for tail call optimization. Targets which want to do tail call
@@ -152,8 +165,9 @@ namespace llvm {
                                     ISD::MemIndexedMode &AM,
                                     SelectionDAG &DAG) const override;
 
-    std::pair<unsigned, const TargetRegisterClass*>
-    getRegForInlineAsmConstraint(const std::string &Constraint,
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 const std::string &Constraint,
                                  MVT VT) const override;
 
     // Intrinsics
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index cc27c4c..3d04678 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -28,20 +28,12 @@ def TypeXTYPE  : IType<8>;
 def TypeENDLOOP: IType<31>;
 
 // Maintain list of valid subtargets for each instruction.
-class SubTarget<bits<4> value> {
-  bits<4> Value = value;
+class SubTarget<bits<6> value> {
+  bits<6> Value = value;
 }
 
-def HasV2SubT     : SubTarget<0xf>;
-def HasV2SubTOnly : SubTarget<0x1>;
-def NoV2SubT      : SubTarget<0x0>;
-def HasV3SubT     : SubTarget<0xe>;
-def HasV3SubTOnly : SubTarget<0x2>;
-def NoV3SubT      : SubTarget<0x1>;
-def HasV4SubT     : SubTarget<0xc>;
-def NoV4SubT      : SubTarget<0x3>;
-def HasV5SubT     : SubTarget<0x8>;
-def NoV5SubT      : SubTarget<0x7>;
+def HasAnySubT    : SubTarget<0x3f>;  // 111111
+def HasV5SubT     : SubTarget<0x3e>;  // 111110
 
 // Addressing modes for load/store instructions
 class AddrModeType<bits<3> value> {
@@ -56,8 +48,8 @@ def BaseLongOffset : AddrModeType<4>;  // Indirect with long offset
 def BaseRegOffset  : AddrModeType<5>;  // Indirect with register offset
 def PostInc        : AddrModeType<6>;  // Post increment addressing mode
 
-class MemAccessSize<bits<3> value> {
-  bits<3> Value = value;
+class MemAccessSize<bits<4> value> {
+  bits<4> Value = value;
 }
 
 def NoMemAccess      : MemAccessSize<0>;// Not a memory acces instruction.
@@ -157,11 +149,11 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
   bits<2> opExtentAlign = 0;
   let TSFlags{33-32} = opExtentAlign; // Alignment exponent before extending.
 
-  // If an instruction is valid on a subtarget (v2-v5), set the corresponding
-  // bit from validSubTargets. v2 is the least significant bit.
+  // If an instruction is valid on a subtarget, set the corresponding
+  // bit from validSubTargets.
   // By default, instruction is valid on all subtargets.
-  SubTarget validSubTargets = HasV2SubT;
-  let TSFlags{37-34} = validSubTargets.Value;
+  SubTarget validSubTargets = HasAnySubT;
+  let TSFlags{39-34} = validSubTargets.Value;
 
   // Addressing mode for load/store instructions.
   AddrModeType addrMode = NoAddrMode;
@@ -169,7 +161,7 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
 
   // Memory access size for mem access instructions (load/store)
   MemAccessSize accessSize = NoMemAccess;
-  let TSFlags{45-43} = accessSize.Value;
+  let TSFlags{46-43} = accessSize.Value;
 
   bits<1> isTaken = 0;
   let TSFlags {47} = isTaken; // Branch prediction.
@@ -186,13 +178,12 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
   string InputType = "";    // Input is "imm" or "reg" type.
   string isMEMri = "false"; // Set to "true" for load/store with MEMri operand.
   string isFloat = "false"; // Set to "true" for the floating-point load/store.
-  string isBrTaken = ""; // Set to "true"/"false" for jump instructions
+  string isBrTaken = !if(isTaken, "true", "false"); // Set to "true"/"false" for jump instructions
 
   let PredSense = !if(isPredicated, !if(isPredicatedFalse, "false", "true"),
                                     "");
   let PNewValue = !if(isPredicatedNew, "new", "");
   let NValueST = !if(isNVStore, "true", "false");
-  let isCodeGenOnly = 1;
 
   // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
 }
@@ -203,6 +194,7 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
 
 // LD Instruction Class in V2/V3/V4.
 // Definition of the instruction class NOT CHANGED.
+let mayLoad = 1 in
 class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
              string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>;
@@ -365,7 +357,6 @@ class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern = [],
                string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
    : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
 
-
 //
 // ALU64 patterns.
 //
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
index d92f97b..5fec80b 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -19,6 +19,7 @@
 
 def TypeMEMOP  : IType<9>;
 def TypeNV     : IType<10>;
+def TypeCOMPOUND : IType<12>;
 def TypePREFIX : IType<30>;
 
 //----------------------------------------------------------------------------//
@@ -65,3 +66,7 @@ let isCodeGenOnly = 1 in
 class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []>
   : InstHexagon<outs, ins, asmstr, pattern, "", EXTENDER_tc_1_SLOT0123,
                 TypePREFIX>;
+
+class CJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCOMPOUND>;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 1688c4a..9bae12c 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -78,11 +78,11 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
 
   switch (MI->getOpcode()) {
   default: break;
-  case Hexagon::LDriw:
-  case Hexagon::LDrid:
-  case Hexagon::LDrih:
-  case Hexagon::LDrib:
-  case Hexagon::LDriub:
+  case Hexagon::L2_loadri_io:
+  case Hexagon::L2_loadrd_io:
+  case Hexagon::L2_loadrh_io:
+  case Hexagon::L2_loadrb_io:
+  case Hexagon::L2_loadrub_io:
     if (MI->getOperand(2).isFI() &&
         MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
       FrameIndex = MI->getOperand(2).getIndex();
@@ -103,10 +103,10 @@ unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
                                             int &FrameIndex) const {
   switch (MI->getOpcode()) {
   default: break;
-  case Hexagon::STriw:
-  case Hexagon::STrid:
-  case Hexagon::STrih:
-  case Hexagon::STrib:
+  case Hexagon::S2_storeri_io:
+  case Hexagon::S2_storerd_io:
+  case Hexagon::S2_storerh_io:
+  case Hexagon::S2_storerb_io:
     if (MI->getOperand(2).isFI() &&
         MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
       FrameIndex = MI->getOperand(0).getIndex();
@@ -124,8 +124,8 @@ HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
                              const SmallVectorImpl<MachineOperand> &Cond,
                              DebugLoc DL) const{
 
-    int BOpc   = Hexagon::JMP;
-    int BccOpc = Hexagon::JMP_t;
+    int BOpc   = Hexagon::J2_jump;
+    int BccOpc = Hexagon::J2_jumpt;
 
     assert(TBB && "InsertBranch must not be told to insert a fallthrough");
 
@@ -134,7 +134,7 @@ HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
     // If we want to reverse the branch an odd number of times, we want
     // JMP_f.
     if (!Cond.empty() && Cond[0].isImm() && Cond[0].getImm() == 0) {
-      BccOpc = Hexagon::JMP_f;
+      BccOpc = Hexagon::J2_jumpf;
       regPos = 1;
     }
 
@@ -213,7 +213,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
   }
 
   // Delete the JMP if it's equivalent to a fall-through.
-  if (AllowModify && I->getOpcode() == Hexagon::JMP &&
+  if (AllowModify && I->getOpcode() == Hexagon::J2_jump &&
       MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
     DEBUG(dbgs()<< "\nErasing the jump to successor block\n";);
     I->eraseFromParent();
@@ -249,7 +249,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
 
   // If there is only one terminator instruction, process it.
   if (LastInst && !SecondLastInst) {
-    if (LastOpcode == Hexagon::JMP) {
+    if (LastOpcode == Hexagon::J2_jump) {
       TBB = LastInst->getOperand(0).getMBB();
       return false;
     }
@@ -274,7 +274,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
 
   bool SecLastOpcodeHasJMP_c = PredOpcodeHasJMP_c(SecLastOpcode);
   bool SecLastOpcodeHasNot = PredOpcodeHasNot(SecLastOpcode);
-  if (SecLastOpcodeHasJMP_c && (LastOpcode == Hexagon::JMP)) {
+  if (SecLastOpcodeHasJMP_c && (LastOpcode == Hexagon::J2_jump)) {
     TBB =  SecondLastInst->getOperand(1).getMBB();
     if (SecLastOpcodeHasNot)
       Cond.push_back(MachineOperand::CreateImm(0));
@@ -285,7 +285,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
 
   // If the block ends with two Hexagon:JMPs, handle it.  The second one is not
   // executed, so remove it.
-  if (SecLastOpcode == Hexagon::JMP && LastOpcode == Hexagon::JMP) {
+  if (SecLastOpcode == Hexagon::J2_jump && LastOpcode == Hexagon::J2_jump) {
     TBB = SecondLastInst->getOperand(0).getMBB();
     I = LastInst;
     if (AllowModify)
@@ -295,7 +295,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
 
   // If the block ends with an ENDLOOP, and JMP, handle it.
   if (SecLastOpcode == Hexagon::ENDLOOP0 &&
-      LastOpcode == Hexagon::JMP) {
+      LastOpcode == Hexagon::J2_jump) {
     TBB = SecondLastInst->getOperand(0).getMBB();
     Cond.push_back(SecondLastInst->getOperand(0));
     FBB = LastInst->getOperand(0).getMBB();
@@ -308,9 +308,9 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
 
 
 unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
-  int BOpc   = Hexagon::JMP;
-  int BccOpc = Hexagon::JMP_t;
-  int BccOpcNot = Hexagon::JMP_f;
+  int BOpc   = Hexagon::J2_jump;
+  int BccOpc = Hexagon::J2_jumpt;
+  int BccOpcNot = Hexagon::J2_jumpf;
 
   MachineBasicBlock::iterator I = MBB.end();
   if (I == MBB.begin()) return 0;
@@ -346,33 +346,31 @@ bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
 
   // Set mask and the first source register.
   switch (Opc) {
-    case Hexagon::CMPEHexagon4rr:
-    case Hexagon::CMPEQri:
-    case Hexagon::CMPEQrr:
-    case Hexagon::CMPGT64rr:
-    case Hexagon::CMPGTU64rr:
-    case Hexagon::CMPGTUri:
-    case Hexagon::CMPGTUrr:
-    case Hexagon::CMPGTri:
-    case Hexagon::CMPGTrr:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C2_cmpgt:
       SrcReg = MI->getOperand(1).getReg();
       Mask = ~0;
       break;
-    case Hexagon::CMPbEQri_V4:
-    case Hexagon::CMPbEQrr_sbsb_V4:
-    case Hexagon::CMPbEQrr_ubub_V4:
-    case Hexagon::CMPbGTUri_V4:
-    case Hexagon::CMPbGTUrr_V4:
-    case Hexagon::CMPbGTrr_V4:
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpbeq:
+    case Hexagon::A4_cmpbgtui:
+    case Hexagon::A4_cmpbgtu:
+    case Hexagon::A4_cmpbgt:
       SrcReg = MI->getOperand(1).getReg();
       Mask = 0xFF;
       break;
-    case Hexagon::CMPhEQri_V4:
-    case Hexagon::CMPhEQrr_shl_V4:
-    case Hexagon::CMPhEQrr_xor_V4:
-    case Hexagon::CMPhGTUri_V4:
-    case Hexagon::CMPhGTUrr_V4:
-    case Hexagon::CMPhGTrr_shl_V4:
+    case Hexagon::A4_cmpheqi:
+    case Hexagon::A4_cmpheq:
+    case Hexagon::A4_cmphgtui:
+    case Hexagon::A4_cmphgtu:
+    case Hexagon::A4_cmphgt:
       SrcReg = MI->getOperand(1).getReg();
       Mask = 0xFFFF;
       break;
@@ -380,30 +378,28 @@ bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
 
   // Set the value/second source register.
   switch (Opc) {
-    case Hexagon::CMPEHexagon4rr:
-    case Hexagon::CMPEQrr:
-    case Hexagon::CMPGT64rr:
-    case Hexagon::CMPGTU64rr:
-    case Hexagon::CMPGTUrr:
-    case Hexagon::CMPGTrr:
-    case Hexagon::CMPbEQrr_sbsb_V4:
-    case Hexagon::CMPbEQrr_ubub_V4:
-    case Hexagon::CMPbGTUrr_V4:
-    case Hexagon::CMPbGTrr_V4:
-    case Hexagon::CMPhEQrr_shl_V4:
-    case Hexagon::CMPhEQrr_xor_V4:
-    case Hexagon::CMPhGTUrr_V4:
-    case Hexagon::CMPhGTrr_shl_V4:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::A4_cmpbeq:
+    case Hexagon::A4_cmpbgtu:
+    case Hexagon::A4_cmpbgt:
+    case Hexagon::A4_cmpheq:
+    case Hexagon::A4_cmphgtu:
+    case Hexagon::A4_cmphgt:
       SrcReg2 = MI->getOperand(2).getReg();
       return true;
 
-    case Hexagon::CMPEQri:
-    case Hexagon::CMPGTUri:
-    case Hexagon::CMPGTri:
-    case Hexagon::CMPbEQri_V4:
-    case Hexagon::CMPbGTUri_V4:
-    case Hexagon::CMPhEQri_V4:
-    case Hexagon::CMPhGTUri_V4:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpbgtui:
+    case Hexagon::A4_cmpheqi:
+    case Hexagon::A4_cmphgtui:
       SrcReg2 = 0;
       Value = MI->getOperand(2).getImm();
       return true;
@@ -418,16 +414,16 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                  unsigned DestReg, unsigned SrcReg,
                                  bool KillSrc) const {
   if (Hexagon::IntRegsRegClass.contains(SrcReg, DestReg)) {
-    BuildMI(MBB, I, DL, get(Hexagon::TFR), DestReg).addReg(SrcReg);
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfr), DestReg).addReg(SrcReg);
     return;
   }
   if (Hexagon::DoubleRegsRegClass.contains(SrcReg, DestReg)) {
-    BuildMI(MBB, I, DL, get(Hexagon::TFR64), DestReg).addReg(SrcReg);
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfrp), DestReg).addReg(SrcReg);
     return;
   }
   if (Hexagon::PredRegsRegClass.contains(SrcReg, DestReg)) {
     // Map Pd = Ps to Pd = or(Ps, Ps).
-    BuildMI(MBB, I, DL, get(Hexagon::OR_pp),
+    BuildMI(MBB, I, DL, get(Hexagon::C2_or),
             DestReg).addReg(SrcReg).addReg(SrcReg);
     return;
   }
@@ -436,31 +432,31 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     // We can have an overlap between single and double reg: r1:0 = r0.
     if(SrcReg == RI.getSubReg(DestReg, Hexagon::subreg_loreg)) {
         // r1:0 = r0
-        BuildMI(MBB, I, DL, get(Hexagon::TFRI), (RI.getSubReg(DestReg,
+        BuildMI(MBB, I, DL, get(Hexagon::A2_tfrsi), (RI.getSubReg(DestReg,
                 Hexagon::subreg_hireg))).addImm(0);
     } else {
         // r1:0 = r1 or no overlap.
-        BuildMI(MBB, I, DL, get(Hexagon::TFR), (RI.getSubReg(DestReg,
+        BuildMI(MBB, I, DL, get(Hexagon::A2_tfr), (RI.getSubReg(DestReg,
                 Hexagon::subreg_loreg))).addReg(SrcReg);
-        BuildMI(MBB, I, DL, get(Hexagon::TFRI), (RI.getSubReg(DestReg,
+        BuildMI(MBB, I, DL, get(Hexagon::A2_tfrsi), (RI.getSubReg(DestReg,
                 Hexagon::subreg_hireg))).addImm(0);
     }
     return;
   }
-  if (Hexagon::CRRegsRegClass.contains(DestReg) &&
+  if (Hexagon::CtrRegsRegClass.contains(DestReg) &&
       Hexagon::IntRegsRegClass.contains(SrcReg)) {
-    BuildMI(MBB, I, DL, get(Hexagon::TFCR), DestReg).addReg(SrcReg);
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfrrcr), DestReg).addReg(SrcReg);
     return;
   }
   if (Hexagon::PredRegsRegClass.contains(SrcReg) &&
       Hexagon::IntRegsRegClass.contains(DestReg)) {
-    BuildMI(MBB, I, DL, get(Hexagon::TFR_RsPd), DestReg).
+    BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg).
       addReg(SrcReg, getKillRegState(KillSrc));
     return;
   }
   if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
       Hexagon::PredRegsRegClass.contains(DestReg)) {
-    BuildMI(MBB, I, DL, get(Hexagon::TFR_PdRs), DestReg).
+    BuildMI(MBB, I, DL, get(Hexagon::C2_tfrrp), DestReg).
       addReg(SrcReg, getKillRegState(KillSrc));
     return;
   }
@@ -488,11 +484,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                       Align);
 
   if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
-    BuildMI(MBB, I, DL, get(Hexagon::STriw))
+    BuildMI(MBB, I, DL, get(Hexagon::S2_storeri_io))
           .addFrameIndex(FI).addImm(0)
           .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
   } else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC)) {
-    BuildMI(MBB, I, DL, get(Hexagon::STrid))
+    BuildMI(MBB, I, DL, get(Hexagon::S2_storerd_io))
           .addFrameIndex(FI).addImm(0)
           .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
   } else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC)) {
@@ -533,10 +529,10 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                       MFI.getObjectSize(FI),
                       Align);
   if (RC == &Hexagon::IntRegsRegClass) {
-    BuildMI(MBB, I, DL, get(Hexagon::LDriw), DestReg)
+    BuildMI(MBB, I, DL, get(Hexagon::L2_loadri_io), DestReg)
           .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
   } else if (RC == &Hexagon::DoubleRegsRegClass) {
-    BuildMI(MBB, I, DL, get(Hexagon::LDrid), DestReg)
+    BuildMI(MBB, I, DL, get(Hexagon::L2_loadrd_io), DestReg)
           .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
   } else if (RC == &Hexagon::PredRegsRegClass) {
     BuildMI(MBB, I, DL, get(Hexagon::LDriw_pred), DestReg)
@@ -582,10 +578,6 @@ unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
 }
 
 bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const {
-  // Constant extenders are allowed only for V4 and above.
-  if (!Subtarget.hasV4TOps())
-    return false;
-
   const MCInstrDesc &MID = MI->getDesc();
   const uint64_t F = MID.TSFlags;
   if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask)
@@ -648,78 +640,68 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
   const int Opc = MI->getOpcode();
 
   switch(Opc) {
-  case Hexagon::TFRI:
+  case Hexagon::A2_tfrsi:
     return isInt<12>(MI->getOperand(1).getImm());
 
-  case Hexagon::STrid:
-  case Hexagon::STrid_indexed:
+  case Hexagon::S2_storerd_io:
     return isShiftedUInt<6,3>(MI->getOperand(1).getImm());
 
-  case Hexagon::STriw:
-  case Hexagon::STriw_indexed:
-  case Hexagon::STriw_nv_V4:
+  case Hexagon::S2_storeri_io:
+  case Hexagon::S2_storerinew_io:
     return isShiftedUInt<6,2>(MI->getOperand(1).getImm());
 
-  case Hexagon::STrih:
-  case Hexagon::STrih_indexed:
-  case Hexagon::STrih_nv_V4:
+  case Hexagon::S2_storerh_io:
+  case Hexagon::S2_storerhnew_io:
     return isShiftedUInt<6,1>(MI->getOperand(1).getImm());
 
-  case Hexagon::STrib:
-  case Hexagon::STrib_indexed:
-  case Hexagon::STrib_nv_V4:
+  case Hexagon::S2_storerb_io:
+  case Hexagon::S2_storerbnew_io:
     return isUInt<6>(MI->getOperand(1).getImm());
 
-  case Hexagon::LDrid:
-  case Hexagon::LDrid_indexed:
+  case Hexagon::L2_loadrd_io:
     return isShiftedUInt<6,3>(MI->getOperand(2).getImm());
 
-  case Hexagon::LDriw:
-  case Hexagon::LDriw_indexed:
+  case Hexagon::L2_loadri_io:
     return isShiftedUInt<6,2>(MI->getOperand(2).getImm());
 
-  case Hexagon::LDrih:
-  case Hexagon::LDriuh:
-  case Hexagon::LDrih_indexed:
-  case Hexagon::LDriuh_indexed:
+  case Hexagon::L2_loadrh_io:
+  case Hexagon::L2_loadruh_io:
     return isShiftedUInt<6,1>(MI->getOperand(2).getImm());
 
-  case Hexagon::LDrib:
-  case Hexagon::LDriub:
-  case Hexagon::LDrib_indexed:
-  case Hexagon::LDriub_indexed:
+  case Hexagon::L2_loadrb_io:
+  case Hexagon::L2_loadrub_io:
     return isUInt<6>(MI->getOperand(2).getImm());
 
-  case Hexagon::POST_LDrid:
+  case Hexagon::L2_loadrd_pi:
     return isShiftedInt<4,3>(MI->getOperand(3).getImm());
 
-  case Hexagon::POST_LDriw:
+  case Hexagon::L2_loadri_pi:
     return isShiftedInt<4,2>(MI->getOperand(3).getImm());
 
-  case Hexagon::POST_LDrih:
-  case Hexagon::POST_LDriuh:
+  case Hexagon::L2_loadrh_pi:
+  case Hexagon::L2_loadruh_pi:
     return isShiftedInt<4,1>(MI->getOperand(3).getImm());
 
-  case Hexagon::POST_LDrib:
-  case Hexagon::POST_LDriub:
+  case Hexagon::L2_loadrb_pi:
+  case Hexagon::L2_loadrub_pi:
     return isInt<4>(MI->getOperand(3).getImm());
 
-  case Hexagon::STrib_imm_V4:
-  case Hexagon::STrih_imm_V4:
-  case Hexagon::STriw_imm_V4:
+  case Hexagon::S4_storeirb_io:
+  case Hexagon::S4_storeirh_io:
+  case Hexagon::S4_storeiri_io:
     return (isUInt<6>(MI->getOperand(1).getImm()) &&
             isInt<6>(MI->getOperand(2).getImm()));
 
-  case Hexagon::ADD_ri:
+  case Hexagon::A2_addi:
     return isInt<8>(MI->getOperand(2).getImm());
 
-  case Hexagon::ASLH:
-  case Hexagon::ASRH:
-  case Hexagon::SXTB:
-  case Hexagon::SXTH:
-  case Hexagon::ZXTB:
-  case Hexagon::ZXTH:
-    return Subtarget.hasV4TOps();
+  case Hexagon::A2_aslh:
+  case Hexagon::A2_asrh:
+  case Hexagon::A2_sxtb:
+  case Hexagon::A2_sxth:
+  case Hexagon::A2_zxtb:
+  case Hexagon::A2_zxth:
+    return true;
   }
 
   return true;
@@ -739,16 +721,16 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
 
   switch(Opc) {
     default: llvm_unreachable("Unexpected predicated instruction");
-    case Hexagon::COMBINE_rr_cPt:
-      return Hexagon::COMBINE_rr_cNotPt;
-    case Hexagon::COMBINE_rr_cNotPt:
-      return Hexagon::COMBINE_rr_cPt;
+    case Hexagon::C2_ccombinewt:
+      return Hexagon::C2_ccombinewf;
+    case Hexagon::C2_ccombinewf:
+      return Hexagon::C2_ccombinewt;
 
       // Dealloc_return.
-    case Hexagon::DEALLOC_RET_cPt_V4:
-      return Hexagon::DEALLOC_RET_cNotPt_V4;
-    case Hexagon::DEALLOC_RET_cNotPt_V4:
-      return Hexagon::DEALLOC_RET_cPt_V4;
+    case Hexagon::L4_return_t:
+      return Hexagon::L4_return_f;
+    case Hexagon::L4_return_f:
+      return Hexagon::L4_return_t;
   }
 }
 
@@ -780,22 +762,14 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
   case Hexagon::TFRI_f:
     return !invertPredicate ? Hexagon::TFRI_cPt_f :
                               Hexagon::TFRI_cNotPt_f;
-  case Hexagon::COMBINE_rr:
-    return !invertPredicate ? Hexagon::COMBINE_rr_cPt :
-                              Hexagon::COMBINE_rr_cNotPt;
-
-  // Word.
-  case Hexagon::STriw_f:
-    return !invertPredicate ? Hexagon::STriw_cPt :
-                              Hexagon::STriw_cNotPt;
-  case Hexagon::STriw_indexed_f:
-    return !invertPredicate ? Hexagon::STriw_indexed_cPt :
-                              Hexagon::STriw_indexed_cNotPt;
+  case Hexagon::A2_combinew:
+    return !invertPredicate ? Hexagon::C2_ccombinewt :
+                              Hexagon::C2_ccombinewf;
 
   // DEALLOC_RETURN.
-  case Hexagon::DEALLOC_RET_V4:
-    return !invertPredicate ? Hexagon::DEALLOC_RET_cPt_V4 :
-                              Hexagon::DEALLOC_RET_cNotPt_V4;
+  case Hexagon::L4_return:
+    return !invertPredicate ? Hexagon::L4_return_t:
+                              Hexagon::L4_return_f;
   }
   llvm_unreachable("Unexpected predicable instruction");
 }
@@ -901,7 +875,7 @@ PredicateInstruction(MachineInstr *MI,
         continue;
       }
       else {
-        assert(false && "Unexpected operand type");
+        llvm_unreachable("Unexpected operand type");
       }
     }
   }
@@ -1024,12 +998,10 @@ bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const {
 
 // Returns true, if a ST insn can be promoted to a new-value store.
 bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const {
-  const HexagonRegisterInfo& QRI = getRegisterInfo();
   const uint64_t F = MI->getDesc().TSFlags;
 
   return ((F >> HexagonII::mayNVStorePos) &
-           HexagonII::mayNVStoreMask &
-           QRI.Subtarget.hasV4TOps());
+           HexagonII::mayNVStoreMask);
 }
 
 bool
@@ -1082,13 +1054,13 @@ isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumInstrs,
 bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const {
   switch (MI->getOpcode()) {
   default: return false;
-  case Hexagon::DEALLOC_RET_V4 :
-  case Hexagon::DEALLOC_RET_cPt_V4 :
-  case Hexagon::DEALLOC_RET_cNotPt_V4 :
-  case Hexagon::DEALLOC_RET_cdnPnt_V4 :
-  case Hexagon::DEALLOC_RET_cNotdnPnt_V4 :
-  case Hexagon::DEALLOC_RET_cdnPt_V4 :
-  case Hexagon::DEALLOC_RET_cNotdnPt_V4 :
+  case Hexagon::L4_return:
+  case Hexagon::L4_return_t:
+  case Hexagon::L4_return_f:
+  case Hexagon::L4_return_tnew_pnt:
+  case Hexagon::L4_return_fnew_pnt:
+  case Hexagon::L4_return_tnew_pt:
+  case Hexagon::L4_return_fnew_pt:
    return true;
   }
 }
@@ -1107,63 +1079,55 @@ isValidOffset(const int Opcode, const int Offset) const {
 
   switch(Opcode) {
 
-  case Hexagon::LDriw:
-  case Hexagon::LDriw_indexed:
-  case Hexagon::LDriw_f:
-  case Hexagon::STriw_indexed:
-  case Hexagon::STriw:
-  case Hexagon::STriw_f:
+  case Hexagon::L2_loadri_io:
+  case Hexagon::S2_storeri_io:
     return (Offset >= Hexagon_MEMW_OFFSET_MIN) &&
       (Offset <= Hexagon_MEMW_OFFSET_MAX);
 
-  case Hexagon::LDrid:
-  case Hexagon::LDrid_indexed:
-  case Hexagon::LDrid_f:
-  case Hexagon::STrid:
-  case Hexagon::STrid_indexed:
-  case Hexagon::STrid_f:
+  case Hexagon::L2_loadrd_io:
+  case Hexagon::S2_storerd_io:
     return (Offset >= Hexagon_MEMD_OFFSET_MIN) &&
       (Offset <= Hexagon_MEMD_OFFSET_MAX);
 
-  case Hexagon::LDrih:
-  case Hexagon::LDriuh:
-  case Hexagon::STrih:
+  case Hexagon::L2_loadrh_io:
+  case Hexagon::L2_loadruh_io:
+  case Hexagon::S2_storerh_io:
     return (Offset >= Hexagon_MEMH_OFFSET_MIN) &&
       (Offset <= Hexagon_MEMH_OFFSET_MAX);
 
-  case Hexagon::LDrib:
-  case Hexagon::STrib:
-  case Hexagon::LDriub:
+  case Hexagon::L2_loadrb_io:
+  case Hexagon::S2_storerb_io:
+  case Hexagon::L2_loadrub_io:
     return (Offset >= Hexagon_MEMB_OFFSET_MIN) &&
       (Offset <= Hexagon_MEMB_OFFSET_MAX);
 
-  case Hexagon::ADD_ri:
+  case Hexagon::A2_addi:
   case Hexagon::TFR_FI:
     return (Offset >= Hexagon_ADDI_OFFSET_MIN) &&
       (Offset <= Hexagon_ADDI_OFFSET_MAX);
 
-  case Hexagon::MemOPw_ADDi_V4 :
-  case Hexagon::MemOPw_SUBi_V4 :
-  case Hexagon::MemOPw_ADDr_V4 :
-  case Hexagon::MemOPw_SUBr_V4 :
-  case Hexagon::MemOPw_ANDr_V4 :
-  case Hexagon::MemOPw_ORr_V4 :
+  case Hexagon::L4_iadd_memopw_io:
+  case Hexagon::L4_isub_memopw_io:
+  case Hexagon::L4_add_memopw_io:
+  case Hexagon::L4_sub_memopw_io:
+  case Hexagon::L4_and_memopw_io:
+  case Hexagon::L4_or_memopw_io:
     return (0 <= Offset && Offset <= 255);
 
-  case Hexagon::MemOPh_ADDi_V4 :
-  case Hexagon::MemOPh_SUBi_V4 :
-  case Hexagon::MemOPh_ADDr_V4 :
-  case Hexagon::MemOPh_SUBr_V4 :
-  case Hexagon::MemOPh_ANDr_V4 :
-  case Hexagon::MemOPh_ORr_V4 :
+  case Hexagon::L4_iadd_memoph_io:
+  case Hexagon::L4_isub_memoph_io:
+  case Hexagon::L4_add_memoph_io:
+  case Hexagon::L4_sub_memoph_io:
+  case Hexagon::L4_and_memoph_io:
+  case Hexagon::L4_or_memoph_io:
     return (0 <= Offset && Offset <= 127);
 
-  case Hexagon::MemOPb_ADDi_V4 :
-  case Hexagon::MemOPb_SUBi_V4 :
-  case Hexagon::MemOPb_ADDr_V4 :
-  case Hexagon::MemOPb_SUBr_V4 :
-  case Hexagon::MemOPb_ANDr_V4 :
-  case Hexagon::MemOPb_ORr_V4 :
+  case Hexagon::L4_iadd_memopb_io:
+  case Hexagon::L4_isub_memopb_io:
+  case Hexagon::L4_add_memopb_io:
+  case Hexagon::L4_sub_memopb_io:
+  case Hexagon::L4_and_memopb_io:
+  case Hexagon::L4_or_memopb_io:
     return (0 <= Offset && Offset <= 63);
 
   // LDri_pred and STriw_pred are pseudo operations, so it has to take offset of
@@ -1172,7 +1136,7 @@ isValidOffset(const int Opcode, const int Offset) const {
   case Hexagon::LDriw_pred:
     return true;
 
-  case Hexagon::LOOP0_i:
+  case Hexagon::J2_loop0i:
     return isUInt<10>(Offset);
 
   // INLINEASM is very special.
@@ -1220,31 +1184,31 @@ isMemOp(const MachineInstr *MI) const {
 
   switch (MI->getOpcode())
   {
-    default: return false;
-    case Hexagon::MemOPw_ADDi_V4 :
-    case Hexagon::MemOPw_SUBi_V4 :
-    case Hexagon::MemOPw_ADDr_V4 :
-    case Hexagon::MemOPw_SUBr_V4 :
-    case Hexagon::MemOPw_ANDr_V4 :
-    case Hexagon::MemOPw_ORr_V4 :
-    case Hexagon::MemOPh_ADDi_V4 :
-    case Hexagon::MemOPh_SUBi_V4 :
-    case Hexagon::MemOPh_ADDr_V4 :
-    case Hexagon::MemOPh_SUBr_V4 :
-    case Hexagon::MemOPh_ANDr_V4 :
-    case Hexagon::MemOPh_ORr_V4 :
-    case Hexagon::MemOPb_ADDi_V4 :
-    case Hexagon::MemOPb_SUBi_V4 :
-    case Hexagon::MemOPb_ADDr_V4 :
-    case Hexagon::MemOPb_SUBr_V4 :
-    case Hexagon::MemOPb_ANDr_V4 :
-    case Hexagon::MemOPb_ORr_V4 :
-    case Hexagon::MemOPb_SETBITi_V4:
-    case Hexagon::MemOPh_SETBITi_V4:
-    case Hexagon::MemOPw_SETBITi_V4:
-    case Hexagon::MemOPb_CLRBITi_V4:
-    case Hexagon::MemOPh_CLRBITi_V4:
-    case Hexagon::MemOPw_CLRBITi_V4:
+  default: return false;
+  case Hexagon::L4_iadd_memopw_io:
+  case Hexagon::L4_isub_memopw_io:
+  case Hexagon::L4_add_memopw_io:
+  case Hexagon::L4_sub_memopw_io:
+  case Hexagon::L4_and_memopw_io:
+  case Hexagon::L4_or_memopw_io:
+  case Hexagon::L4_iadd_memoph_io:
+  case Hexagon::L4_isub_memoph_io:
+  case Hexagon::L4_add_memoph_io:
+  case Hexagon::L4_sub_memoph_io:
+  case Hexagon::L4_and_memoph_io:
+  case Hexagon::L4_or_memoph_io:
+  case Hexagon::L4_iadd_memopb_io:
+  case Hexagon::L4_isub_memopb_io:
+  case Hexagon::L4_add_memopb_io:
+  case Hexagon::L4_sub_memopb_io:
+  case Hexagon::L4_and_memopb_io:
+  case Hexagon::L4_or_memopb_io:
+  case Hexagon::L4_ior_memopb_io:
+  case Hexagon::L4_ior_memoph_io:
+  case Hexagon::L4_ior_memopw_io:
+  case Hexagon::L4_iand_memopb_io:
+  case Hexagon::L4_iand_memoph_io:
+  case Hexagon::L4_iand_memopw_io:
     return true;
   }
   return false;
@@ -1264,12 +1228,12 @@ isSpillPredRegOp(const MachineInstr *MI) const {
 bool HexagonInstrInfo::isNewValueJumpCandidate(const MachineInstr *MI) const {
   switch (MI->getOpcode()) {
     default: return false;
-    case Hexagon::CMPEQrr:
-    case Hexagon::CMPEQri:
-    case Hexagon::CMPGTrr:
-    case Hexagon::CMPGTri:
-    case Hexagon::CMPGTUrr:
-    case Hexagon::CMPGTUri:
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpgtui:
       return true;
   }
 }
@@ -1278,20 +1242,19 @@ bool HexagonInstrInfo::
 isConditionalTransfer (const MachineInstr *MI) const {
   switch (MI->getOpcode()) {
     default: return false;
-    case Hexagon::TFR_cPt:
-    case Hexagon::TFR_cNotPt:
-    case Hexagon::TFRI_cPt:
-    case Hexagon::TFRI_cNotPt:
-    case Hexagon::TFR_cdnPt:
-    case Hexagon::TFR_cdnNotPt:
-    case Hexagon::TFRI_cdnPt:
-    case Hexagon::TFRI_cdnNotPt:
+    case Hexagon::A2_tfrt:
+    case Hexagon::A2_tfrf:
+    case Hexagon::C2_cmoveit:
+    case Hexagon::C2_cmoveif:
+    case Hexagon::A2_tfrtnew:
+    case Hexagon::A2_tfrfnew:
+    case Hexagon::C2_cmovenewit:
+    case Hexagon::C2_cmovenewif:
       return true;
   }
 }
 
 bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const {
-  const HexagonRegisterInfo& QRI = getRegisterInfo();
   switch (MI->getOpcode())
   {
     default: return false;
@@ -1303,94 +1266,92 @@ bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const {
     case Hexagon::A2_pandfnew:
     case Hexagon::A2_pandt:
     case Hexagon::A2_pandtnew:
+    case Hexagon::A4_paslhf:
+    case Hexagon::A4_paslhfnew:
+    case Hexagon::A4_paslht:
+    case Hexagon::A4_paslhtnew:
+    case Hexagon::A4_pasrhf:
+    case Hexagon::A4_pasrhfnew:
+    case Hexagon::A4_pasrht:
+    case Hexagon::A4_pasrhtnew:
     case Hexagon::A2_porf:
     case Hexagon::A2_porfnew:
     case Hexagon::A2_port:
     case Hexagon::A2_portnew:
+    case Hexagon::A2_psubf:
+    case Hexagon::A2_psubfnew:
+    case Hexagon::A2_psubt:
+    case Hexagon::A2_psubtnew:
     case Hexagon::A2_pxorf:
     case Hexagon::A2_pxorfnew:
     case Hexagon::A2_pxort:
     case Hexagon::A2_pxortnew:
-    case Hexagon::ADD_ri_cPt:
-    case Hexagon::ADD_ri_cNotPt:
-    case Hexagon::SUB_rr_cPt:
-    case Hexagon::SUB_rr_cNotPt:
-    case Hexagon::COMBINE_rr_cPt:
-    case Hexagon::COMBINE_rr_cNotPt:
+    case Hexagon::A4_psxthf:
+    case Hexagon::A4_psxthfnew:
+    case Hexagon::A4_psxtht:
+    case Hexagon::A4_psxthtnew:
+    case Hexagon::A4_psxtbf:
+    case Hexagon::A4_psxtbfnew:
+    case Hexagon::A4_psxtbt:
+    case Hexagon::A4_psxtbtnew:
+    case Hexagon::A4_pzxtbf:
+    case Hexagon::A4_pzxtbfnew:
+    case Hexagon::A4_pzxtbt:
+    case Hexagon::A4_pzxtbtnew:
+    case Hexagon::A4_pzxthf:
+    case Hexagon::A4_pzxthfnew:
+    case Hexagon::A4_pzxtht:
+    case Hexagon::A4_pzxthtnew:
+    case Hexagon::A2_paddit:
+    case Hexagon::A2_paddif:
+    case Hexagon::C2_ccombinewt:
+    case Hexagon::C2_ccombinewf:
       return true;
-    case Hexagon::ASLH_cPt_V4:
-    case Hexagon::ASLH_cNotPt_V4:
-    case Hexagon::ASRH_cPt_V4:
-    case Hexagon::ASRH_cNotPt_V4:
-    case Hexagon::SXTB_cPt_V4:
-    case Hexagon::SXTB_cNotPt_V4:
-    case Hexagon::SXTH_cPt_V4:
-    case Hexagon::SXTH_cNotPt_V4:
-    case Hexagon::ZXTB_cPt_V4:
-    case Hexagon::ZXTB_cNotPt_V4:
-    case Hexagon::ZXTH_cPt_V4:
-    case Hexagon::ZXTH_cNotPt_V4:
-      return QRI.Subtarget.hasV4TOps();
   }
 }
 
 bool HexagonInstrInfo::
 isConditionalLoad (const MachineInstr* MI) const {
-  const HexagonRegisterInfo& QRI = getRegisterInfo();
   switch (MI->getOpcode())
   {
     default: return false;
-    case Hexagon::LDrid_cPt :
-    case Hexagon::LDrid_cNotPt :
-    case Hexagon::LDrid_indexed_cPt :
-    case Hexagon::LDrid_indexed_cNotPt :
-    case Hexagon::LDriw_cPt :
-    case Hexagon::LDriw_cNotPt :
-    case Hexagon::LDriw_indexed_cPt :
-    case Hexagon::LDriw_indexed_cNotPt :
-    case Hexagon::LDrih_cPt :
-    case Hexagon::LDrih_cNotPt :
-    case Hexagon::LDrih_indexed_cPt :
-    case Hexagon::LDrih_indexed_cNotPt :
-    case Hexagon::LDrib_cPt :
-    case Hexagon::LDrib_cNotPt :
-    case Hexagon::LDrib_indexed_cPt :
-    case Hexagon::LDrib_indexed_cNotPt :
-    case Hexagon::LDriuh_cPt :
-    case Hexagon::LDriuh_cNotPt :
-    case Hexagon::LDriuh_indexed_cPt :
-    case Hexagon::LDriuh_indexed_cNotPt :
-    case Hexagon::LDriub_cPt :
-    case Hexagon::LDriub_cNotPt :
-    case Hexagon::LDriub_indexed_cPt :
-    case Hexagon::LDriub_indexed_cNotPt :
+    case Hexagon::L2_ploadrdt_io :
+    case Hexagon::L2_ploadrdf_io:
+    case Hexagon::L2_ploadrit_io:
+    case Hexagon::L2_ploadrif_io:
+    case Hexagon::L2_ploadrht_io:
+    case Hexagon::L2_ploadrhf_io:
+    case Hexagon::L2_ploadrbt_io:
+    case Hexagon::L2_ploadrbf_io:
+    case Hexagon::L2_ploadruht_io:
+    case Hexagon::L2_ploadruhf_io:
+    case Hexagon::L2_ploadrubt_io:
+    case Hexagon::L2_ploadrubf_io:
+    case Hexagon::L2_ploadrdt_pi:
+    case Hexagon::L2_ploadrdf_pi:
+    case Hexagon::L2_ploadrit_pi:
+    case Hexagon::L2_ploadrif_pi:
+    case Hexagon::L2_ploadrht_pi:
+    case Hexagon::L2_ploadrhf_pi:
+    case Hexagon::L2_ploadrbt_pi:
+    case Hexagon::L2_ploadrbf_pi:
+    case Hexagon::L2_ploadruht_pi:
+    case Hexagon::L2_ploadruhf_pi:
+    case Hexagon::L2_ploadrubt_pi:
+    case Hexagon::L2_ploadrubf_pi:
+    case Hexagon::L4_ploadrdt_rr:
+    case Hexagon::L4_ploadrdf_rr:
+    case Hexagon::L4_ploadrbt_rr:
+    case Hexagon::L4_ploadrbf_rr:
+    case Hexagon::L4_ploadrubt_rr:
+    case Hexagon::L4_ploadrubf_rr:
+    case Hexagon::L4_ploadrht_rr:
+    case Hexagon::L4_ploadrhf_rr:
+    case Hexagon::L4_ploadruht_rr:
+    case Hexagon::L4_ploadruhf_rr:
+    case Hexagon::L4_ploadrit_rr:
+    case Hexagon::L4_ploadrif_rr:
       return true;
-    case Hexagon::POST_LDrid_cPt :
-    case Hexagon::POST_LDrid_cNotPt :
-    case Hexagon::POST_LDriw_cPt :
-    case Hexagon::POST_LDriw_cNotPt :
-    case Hexagon::POST_LDrih_cPt :
-    case Hexagon::POST_LDrih_cNotPt :
-    case Hexagon::POST_LDrib_cPt :
-    case Hexagon::POST_LDrib_cNotPt :
-    case Hexagon::POST_LDriuh_cPt :
-    case Hexagon::POST_LDriuh_cNotPt :
-    case Hexagon::POST_LDriub_cPt :
-    case Hexagon::POST_LDriub_cNotPt :
-      return QRI.Subtarget.hasV4TOps();
-    case Hexagon::LDrid_indexed_shl_cPt_V4 :
-    case Hexagon::LDrid_indexed_shl_cNotPt_V4 :
-    case Hexagon::LDrib_indexed_shl_cPt_V4 :
-    case Hexagon::LDrib_indexed_shl_cNotPt_V4 :
-    case Hexagon::LDriub_indexed_shl_cPt_V4 :
-    case Hexagon::LDriub_indexed_shl_cNotPt_V4 :
-    case Hexagon::LDrih_indexed_shl_cPt_V4 :
-    case Hexagon::LDrih_indexed_shl_cNotPt_V4 :
-    case Hexagon::LDriuh_indexed_shl_cPt_V4 :
-    case Hexagon::LDriuh_indexed_shl_cNotPt_V4 :
-    case Hexagon::LDriw_indexed_shl_cPt_V4 :
-    case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
-      return QRI.Subtarget.hasV4TOps();
   }
 }
 
@@ -1430,55 +1391,50 @@ isConditionalLoad (const MachineInstr* MI) const {
 // is not valid for new-value stores.
 bool HexagonInstrInfo::
 isConditionalStore (const MachineInstr* MI) const {
-  const HexagonRegisterInfo& QRI = getRegisterInfo();
   switch (MI->getOpcode())
   {
     default: return false;
-    case Hexagon::STrib_imm_cPt_V4 :
-    case Hexagon::STrib_imm_cNotPt_V4 :
-    case Hexagon::STrib_indexed_shl_cPt_V4 :
-    case Hexagon::STrib_indexed_shl_cNotPt_V4 :
-    case Hexagon::STrib_cPt :
-    case Hexagon::STrib_cNotPt :
-    case Hexagon::POST_STbri_cPt :
-    case Hexagon::POST_STbri_cNotPt :
-    case Hexagon::STrid_indexed_cPt :
-    case Hexagon::STrid_indexed_cNotPt :
-    case Hexagon::STrid_indexed_shl_cPt_V4 :
-    case Hexagon::POST_STdri_cPt :
-    case Hexagon::POST_STdri_cNotPt :
-    case Hexagon::STrih_cPt :
-    case Hexagon::STrih_cNotPt :
-    case Hexagon::STrih_indexed_cPt :
-    case Hexagon::STrih_indexed_cNotPt :
-    case Hexagon::STrih_imm_cPt_V4 :
-    case Hexagon::STrih_imm_cNotPt_V4 :
-    case Hexagon::STrih_indexed_shl_cPt_V4 :
-    case Hexagon::STrih_indexed_shl_cNotPt_V4 :
-    case Hexagon::POST_SThri_cPt :
-    case Hexagon::POST_SThri_cNotPt :
-    case Hexagon::STriw_cPt :
-    case Hexagon::STriw_cNotPt :
-    case Hexagon::STriw_indexed_cPt :
-    case Hexagon::STriw_indexed_cNotPt :
-    case Hexagon::STriw_imm_cPt_V4 :
-    case Hexagon::STriw_imm_cNotPt_V4 :
-    case Hexagon::STriw_indexed_shl_cPt_V4 :
-    case Hexagon::STriw_indexed_shl_cNotPt_V4 :
-    case Hexagon::POST_STwri_cPt :
-    case Hexagon::POST_STwri_cNotPt :
-      return QRI.Subtarget.hasV4TOps();
+    case Hexagon::S4_storeirbt_io:
+    case Hexagon::S4_storeirbf_io:
+    case Hexagon::S4_pstorerbt_rr:
+    case Hexagon::S4_pstorerbf_rr:
+    case Hexagon::S2_pstorerbt_io:
+    case Hexagon::S2_pstorerbf_io:
+    case Hexagon::S2_pstorerbt_pi:
+    case Hexagon::S2_pstorerbf_pi:
+    case Hexagon::S2_pstorerdt_io:
+    case Hexagon::S2_pstorerdf_io:
+    case Hexagon::S4_pstorerdt_rr:
+    case Hexagon::S4_pstorerdf_rr:
+    case Hexagon::S2_pstorerdt_pi:
+    case Hexagon::S2_pstorerdf_pi:
+    case Hexagon::S2_pstorerht_io:
+    case Hexagon::S2_pstorerhf_io:
+    case Hexagon::S4_storeirht_io:
+    case Hexagon::S4_storeirhf_io:
+    case Hexagon::S4_pstorerht_rr:
+    case Hexagon::S4_pstorerhf_rr:
+    case Hexagon::S2_pstorerht_pi:
+    case Hexagon::S2_pstorerhf_pi:
+    case Hexagon::S2_pstorerit_io:
+    case Hexagon::S2_pstorerif_io:
+    case Hexagon::S4_storeirit_io:
+    case Hexagon::S4_storeirif_io:
+    case Hexagon::S4_pstorerit_rr:
+    case Hexagon::S4_pstorerif_rr:
+    case Hexagon::S2_pstorerit_pi:
+    case Hexagon::S2_pstorerif_pi:
 
     // V4 global address store before promoting to dot new.
-    case Hexagon::STd_GP_cPt_V4 :
-    case Hexagon::STd_GP_cNotPt_V4 :
-    case Hexagon::STb_GP_cPt_V4 :
-    case Hexagon::STb_GP_cNotPt_V4 :
-    case Hexagon::STh_GP_cPt_V4 :
-    case Hexagon::STh_GP_cNotPt_V4 :
-    case Hexagon::STw_GP_cPt_V4 :
-    case Hexagon::STw_GP_cNotPt_V4 :
-      return QRI.Subtarget.hasV4TOps();
+    case Hexagon::S4_pstorerdt_abs:
+    case Hexagon::S4_pstorerdf_abs:
+    case Hexagon::S4_pstorerbt_abs:
+    case Hexagon::S4_pstorerbf_abs:
+    case Hexagon::S4_pstorerht_abs:
+    case Hexagon::S4_pstorerhf_abs:
+    case Hexagon::S4_pstorerit_abs:
+    case Hexagon::S4_pstorerif_abs:
+      return true;
 
     // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded
     // from the "Conditional Store" list. Because a predicated new value store
@@ -1566,20 +1522,14 @@ int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const {
   switch (MI->getOpcode()) {
   default: llvm_unreachable("Unknown .new type");
   // store new value byte
-  case Hexagon::STrib_shl_V4:
-    return Hexagon::STrib_shl_nv_V4;
-
-  case Hexagon::STrih_shl_V4:
-    return Hexagon::STrih_shl_nv_V4;
+  case Hexagon::S4_storerb_ur:
+    return Hexagon::S4_storerbnew_ur;
 
-  case Hexagon::STriw_f:
-    return Hexagon::STriw_nv_V4;
+  case Hexagon::S4_storerh_ur:
+    return Hexagon::S4_storerhnew_ur;
 
-  case Hexagon::STriw_indexed_f:
-    return Hexagon::STriw_indexed_nv_V4;
-
-  case Hexagon::STriw_shl_V4:
-    return Hexagon::STriw_shl_nv_V4;
+  case Hexagon::S4_storeri_ur:
+    return Hexagon::S4_storerinew_ur;
 
   }
   return 0;
@@ -1597,28 +1547,28 @@ int HexagonInstrInfo::GetDotNewPredOp(MachineInstr *MI,
   switch (MI->getOpcode()) {
   default: llvm_unreachable("Unknown .new type");
   // Condtional Jumps
-  case Hexagon::JMP_t:
-  case Hexagon::JMP_f:
+  case Hexagon::J2_jumpt:
+  case Hexagon::J2_jumpf:
     return getDotNewPredJumpOp(MI, MBPI);
 
-  case Hexagon::JMPR_t:
-    return Hexagon::JMPR_tnew_tV3;
+  case Hexagon::J2_jumprt:
+    return Hexagon::J2_jumptnewpt;
 
-  case Hexagon::JMPR_f:
-    return Hexagon::JMPR_fnew_tV3;
+  case Hexagon::J2_jumprf:
+    return Hexagon::J2_jumprfnewpt;
 
-  case Hexagon::JMPret_t:
-    return Hexagon::JMPret_tnew_tV3;
+  case Hexagon::JMPrett:
+    return Hexagon::J2_jumprtnewpt;
 
-  case Hexagon::JMPret_f:
-    return Hexagon::JMPret_fnew_tV3;
+  case Hexagon::JMPretf:
+    return Hexagon::J2_jumprfnewpt;
 
 
   // Conditional combine
-  case Hexagon::COMBINE_rr_cPt :
-    return Hexagon::COMBINE_rr_cdnPt;
-  case Hexagon::COMBINE_rr_cNotPt :
-    return Hexagon::COMBINE_rr_cdnNotPt;
+  case Hexagon::C2_ccombinewt:
+    return Hexagon::C2_ccombinewnewt;
+  case Hexagon::C2_ccombinewf:
+    return Hexagon::C2_ccombinewnewf;
   }
 }
 
@@ -1670,11 +1620,6 @@ bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
 }
 
 bool HexagonInstrInfo::isConstExtended(MachineInstr *MI) const {
-
-  // Constant extenders are allowed only for V4 and above.
-  if (!Subtarget.hasV4TOps())
-    return false;
-
   const uint64_t F = MI->getDesc().TSFlags;
   unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
   if (isExtended) // Instruction must be extended.
@@ -1735,10 +1680,10 @@ HexagonInstrInfo::getDotNewPredJumpOp(MachineInstr *MI,
     taken = true;
 
   switch (MI->getOpcode()) {
-  case Hexagon::JMP_t:
-    return taken ? Hexagon::JMP_tnew_t : Hexagon::JMP_tnew_nt;
-  case Hexagon::JMP_f:
-    return taken ? Hexagon::JMP_fnew_t : Hexagon::JMP_fnew_nt;
+  case Hexagon::J2_jumpt:
+    return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew;
+  case Hexagon::J2_jumpf:
+    return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew;
 
   default:
     llvm_unreachable("Unexpected jump instruction.");
@@ -1747,10 +1692,6 @@ HexagonInstrInfo::getDotNewPredJumpOp(MachineInstr *MI,
 // Returns true if a particular operand is extendable for an instruction.
 bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI,
                                          unsigned short OperandNum) const {
-  // Constant extenders are allowed only for V4 and above.
-  if (!Subtarget.hasV4TOps())
-    return false;
-
   const uint64_t F = MI->getDesc().TSFlags;
 
   return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask)
@@ -1850,16 +1791,16 @@ short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const {
 }
 
 bool HexagonInstrInfo::PredOpcodeHasJMP_c(Opcode_t Opcode) const {
-  return (Opcode == Hexagon::JMP_t) ||
-         (Opcode == Hexagon::JMP_f) ||
-         (Opcode == Hexagon::JMP_tnew_t) ||
-         (Opcode == Hexagon::JMP_fnew_t) ||
-         (Opcode == Hexagon::JMP_tnew_nt) ||
-         (Opcode == Hexagon::JMP_fnew_nt);
+  return (Opcode == Hexagon::J2_jumpt) ||
+         (Opcode == Hexagon::J2_jumpf) ||
+         (Opcode == Hexagon::J2_jumptnewpt) ||
+         (Opcode == Hexagon::J2_jumpfnewpt) ||
+         (Opcode == Hexagon::J2_jumpt) ||
+         (Opcode == Hexagon::J2_jumpf);
 }
 
 bool HexagonInstrInfo::PredOpcodeHasNot(Opcode_t Opcode) const {
-  return (Opcode == Hexagon::JMP_f) ||
-         (Opcode == Hexagon::JMP_fnew_t) ||
-         (Opcode == Hexagon::JMP_fnew_nt);
+  return (Opcode == Hexagon::J2_jumpf) ||
+         (Opcode == Hexagon::J2_jumpfnewpt) ||
+         (Opcode == Hexagon::J2_jumpfnew);
 }
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index 4090681..60635cf 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -14,83 +14,100 @@
 include "HexagonInstrFormats.td"
 include "HexagonOperands.td"
 
-//===----------------------------------------------------------------------===//
+// Pattern fragment that combines the value type and the register class
+// into a single parameter.
+// The pat frags in the definitions below need to have a named register,
+// otherwise i32 will be assumed regardless of the register class. The
+// name of the register does not matter.
+def I1  : PatLeaf<(i1 PredRegs:$R)>;
+def I32 : PatLeaf<(i32 IntRegs:$R)>;
+def I64 : PatLeaf<(i64 DoubleRegs:$R)>;
+def F32 : PatLeaf<(f32 IntRegs:$R)>;
+def F64 : PatLeaf<(f64 DoubleRegs:$R)>;
+
+// Pattern fragments to extract the low and high subregisters from a
+// 64-bit value.
+def LoReg: OutPatFrag<(ops node:$Rs),
+                      (EXTRACT_SUBREG (i64 $Rs), subreg_loreg)>;
+def HiReg: OutPatFrag<(ops node:$Rs),
+                      (EXTRACT_SUBREG (i64 $Rs), subreg_hireg)>;
 
-// Multi-class for logical operators.
-multiclass ALU32_rr_ri<string OpcStr, SDNode OpNode> {
-  def rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
-                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
-                 [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$b),
-                                                   (i32 IntRegs:$c)))]>;
-  def ri : ALU32_ri<(outs IntRegs:$dst), (ins s10Imm:$b, IntRegs:$c),
-                 !strconcat("$dst = ", !strconcat(OpcStr, "(#$b, $c)")),
-                 [(set (i32 IntRegs:$dst), (OpNode s10Imm:$b,
-                                                   (i32 IntRegs:$c)))]>;
-}
+// SDNode for converting immediate C to C-1.
+def DEC_CONST_SIGNED : SDNodeXForm<imm, [{
+   // Return the byte immediate const-1 as an SDNode.
+   int32_t imm = N->getSExtValue();
+   return XformSToSM1Imm(imm);
+}]>;
 
-// Multi-class for compare ops.
-let isCompare = 1 in {
-multiclass CMP64_rr<string OpcStr, PatFrag OpNode> {
-  def rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c),
-                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
-                 [(set (i1 PredRegs:$dst),
-                       (OpNode (i64 DoubleRegs:$b), (i64 DoubleRegs:$c)))]>;
-}
+// SDNode for converting immediate C to C-2.
+def DEC2_CONST_SIGNED : SDNodeXForm<imm, [{
+   // Return the byte immediate const-2 as an SDNode.
+   int32_t imm = N->getSExtValue();
+   return XformSToSM2Imm(imm);
+}]>;
+
+// SDNode for converting immediate C to C-3.
+def DEC3_CONST_SIGNED : SDNodeXForm<imm, [{
+   // Return the byte immediate const-3 as an SDNode.
+   int32_t imm = N->getSExtValue();
+   return XformSToSM3Imm(imm);
+}]>;
 
-multiclass CMP32_rr_ri_s10<string OpcStr, string CextOp, PatFrag OpNode> {
-  let CextOpcode = CextOp in {
-    let InputType = "reg" in
-    def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
-                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
-                   [(set (i1 PredRegs:$dst),
-                         (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>;
+// SDNode for converting immediate C to C-1.
+def DEC_CONST_UNSIGNED : SDNodeXForm<imm, [{
+   // Return the byte immediate const-1 as an SDNode.
+   uint32_t imm = N->getZExtValue();
+   return XformUToUM1Imm(imm);
+}]>;
 
-    let isExtendable = 1, opExtendable = 2, isExtentSigned = 1,
-    opExtentBits = 10, InputType = "imm" in
-    def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s10Ext:$c),
-                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
-                   [(set (i1 PredRegs:$dst),
-                         (OpNode (i32 IntRegs:$b), s10ExtPred:$c))]>;
+//===----------------------------------------------------------------------===//
+// Compare
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isCompare = 1, InputType = "imm", isExtendable = 1,
+    opExtendable = 2 in
+class T_CMP <string mnemonic, bits<2> MajOp, bit isNot, Operand ImmOp>
+  : ALU32Inst <(outs PredRegs:$dst),
+               (ins IntRegs:$src1, ImmOp:$src2),
+  "$dst = "#!if(isNot, "!","")#mnemonic#"($src1, #$src2)",
+  [], "",ALU32_2op_tc_2early_SLOT0123 >, ImmRegRel {
+    bits<2> dst;
+    bits<5> src1;
+    bits<10> src2;
+    let CextOpcode = mnemonic;
+    let opExtentBits  = !if(!eq(mnemonic, "cmp.gtu"), 9, 10);
+    let isExtentSigned = !if(!eq(mnemonic, "cmp.gtu"), 0, 1);
+
+    let IClass = 0b0111;
+
+    let Inst{27-24} = 0b0101;
+    let Inst{23-22} = MajOp;
+    let Inst{21}    = !if(!eq(mnemonic, "cmp.gtu"), 0, src2{9});
+    let Inst{20-16} = src1;
+    let Inst{13-5}  = src2{8-0};
+    let Inst{4}     = isNot;
+    let Inst{3-2}   = 0b00;
+    let Inst{1-0}   = dst;
   }
-}
 
-multiclass CMP32_rr_ri_u9<string OpcStr, string CextOp, PatFrag OpNode> {
-  let CextOpcode = CextOp in {
-    let InputType = "reg" in
-    def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
-                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
-                   [(set (i1 PredRegs:$dst),
-                         (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>;
+def C2_cmpeqi   : T_CMP <"cmp.eq",  0b00, 0, s10Ext>;
+def C2_cmpgti   : T_CMP <"cmp.gt",  0b01, 0, s10Ext>;
+def C2_cmpgtui  : T_CMP <"cmp.gtu", 0b10, 0, u9Ext>;
 
-    let isExtendable = 1, opExtendable = 2, isExtentSigned = 0,
-    opExtentBits = 9, InputType = "imm" in
-    def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u9Ext:$c),
-                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
-                   [(set (i1 PredRegs:$dst),
-                         (OpNode (i32 IntRegs:$b), u9ExtPred:$c))]>;
-  }
-}
+class T_CMP_pat <InstHexagon MI, PatFrag OpNode, PatLeaf ImmPred>
+  : Pat<(i1 (OpNode (i32 IntRegs:$src1), ImmPred:$src2)),
+        (MI IntRegs:$src1, ImmPred:$src2)>;
 
-multiclass CMP32_ri_s8<string OpcStr, PatFrag OpNode> {
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8 in
-  def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s8Ext:$c),
-                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
-                 [(set (i1 PredRegs:$dst), (OpNode (i32 IntRegs:$b),
-                                                   s8ExtPred:$c))]>;
-}
-}
+def : T_CMP_pat <C2_cmpeqi,  seteq,  s10ImmPred>;
+def : T_CMP_pat <C2_cmpgti,  setgt,  s10ImmPred>;
+def : T_CMP_pat <C2_cmpgtui, setugt, u9ImmPred>;
 
 //===----------------------------------------------------------------------===//
-// ALU32/ALU (Instructions with register-register form)
+// ALU32/ALU +
 //===----------------------------------------------------------------------===//
 def SDTHexagonI64I32I32 : SDTypeProfile<1, 2,
   [SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
 
-def HexagonWrapperCombineII :
-  SDNode<"HexagonISD::WrapperCombineII", SDTHexagonI64I32I32>;
-
-def HexagonWrapperCombineRR :
-  SDNode<"HexagonISD::WrapperCombineRR", SDTHexagonI64I32I32>;
+def HexagonCOMBINE : SDNode<"HexagonISD::COMBINE", SDTHexagonI64I32I32>;
 
 let hasSideEffects = 0, hasNewValue = 1, InputType = "reg" in
 class T_ALU32_3op<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit OpsRev,
@@ -145,6 +162,41 @@ class T_ALU32_3op_pred<string mnemonic, bits<3> MajOp, bits<3> MinOp,
   let Inst{4-0} = Rd;
 }
 
+class T_ALU32_combineh<string Op1, string Op2, bits<3> MajOp, bits<3> MinOp,
+                      bit OpsRev>
+  : T_ALU32_3op<"", MajOp, MinOp, OpsRev, 0> {
+  let AsmString = "$Rd = combine($Rs"#Op1#", $Rt"#Op2#")";
+}
+
+def A2_combine_hh : T_ALU32_combineh<".h", ".h", 0b011, 0b100, 1>;
+def A2_combine_hl : T_ALU32_combineh<".h", ".l", 0b011, 0b101, 1>;
+def A2_combine_lh : T_ALU32_combineh<".l", ".h", 0b011, 0b110, 1>;
+def A2_combine_ll : T_ALU32_combineh<".l", ".l", 0b011, 0b111, 1>;
+
+class T_ALU32_3op_sfx<string mnemonic, string suffix, bits<3> MajOp,
+                      bits<3> MinOp, bit OpsRev, bit IsComm>
+  : T_ALU32_3op<"", MajOp, MinOp, OpsRev, IsComm> {
+  let AsmString = "$Rd = "#mnemonic#"($Rs, $Rt)"#suffix;
+}
+
+def A2_svaddh   : T_ALU32_3op<"vaddh",   0b110, 0b000, 0, 1>;
+def A2_svsubh   : T_ALU32_3op<"vsubh",   0b110, 0b100, 1, 0>;
+
+let Defs = [USR_OVF], Itinerary = ALU32_3op_tc_2_SLOT0123 in {
+  def A2_svaddhs  : T_ALU32_3op_sfx<"vaddh",  ":sat", 0b110, 0b001, 0, 1>;
+  def A2_addsat   : T_ALU32_3op_sfx<"add",    ":sat", 0b110, 0b010, 0, 1>;
+  def A2_svadduhs : T_ALU32_3op_sfx<"vadduh", ":sat", 0b110, 0b011, 0, 1>;
+  def A2_svsubhs  : T_ALU32_3op_sfx<"vsubh",  ":sat", 0b110, 0b101, 1, 0>;
+  def A2_subsat   : T_ALU32_3op_sfx<"sub",    ":sat", 0b110, 0b110, 1, 0>;
+  def A2_svsubuhs : T_ALU32_3op_sfx<"vsubuh", ":sat", 0b110, 0b111, 1, 0>;
+}
+
+let Itinerary = ALU32_3op_tc_2_SLOT0123 in
+def A2_svavghs  : T_ALU32_3op_sfx<"vavgh",  ":rnd", 0b111, 0b001, 0, 1>;
+
+def A2_svavgh   : T_ALU32_3op<"vavgh",   0b111, 0b000, 0, 1>;
+def A2_svnavgh  : T_ALU32_3op<"vnavgh",  0b111, 0b011, 1, 0>;
+
 multiclass T_ALU32_3op_p<string mnemonic, bits<3> MajOp, bits<3> MinOp,
                          bit OpsRev> {
   def t    : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 0>;
@@ -160,7 +212,6 @@ multiclass T_ALU32_3op_A2<string mnemonic, bits<3> MajOp, bits<3> MinOp,
   defm A2_p#NAME : T_ALU32_3op_p<mnemonic, MajOp, MinOp, OpsRev>;
 }
 
-let isCodeGenOnly = 0 in
 defm add : T_ALU32_3op_A2<"add", 0b011, 0b000, 0, 1>;
 defm and : T_ALU32_3op_A2<"and", 0b001, 0b000, 0, 1>;
 defm or  : T_ALU32_3op_A2<"or",  0b001, 0b001, 0, 1>;
@@ -178,282 +229,418 @@ def: BinOp32_pat<or,  A2_or,  i32>;
 def: BinOp32_pat<sub, A2_sub, i32>;
 def: BinOp32_pat<xor, A2_xor, i32>;
 
-multiclass ALU32_Pbase<string mnemonic, RegisterClass RC, bit isNot,
-                       bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : ALU32_rr<(outs RC:$dst),
-            (ins PredRegs:$src1, IntRegs:$src2, IntRegs: $src3),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew,".new) $dst = ",
-            ") $dst = ")#mnemonic#"($src2, $src3)",
-            []>;
+// A few special cases producing register pairs:
+let OutOperandList = (outs DoubleRegs:$Rd), hasNewValue = 0 in {
+  def S2_packhl    : T_ALU32_3op  <"packhl",  0b101, 0b100, 0, 0>;
+
+  let isPredicable = 1 in
+    def A2_combinew  : T_ALU32_3op  <"combine", 0b101, 0b000, 0, 0>;
+
+  // Conditional combinew uses "newt/f" instead of "t/fnew".
+  def C2_ccombinewt    : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 0>;
+  def C2_ccombinewf    : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 0>;
+  def C2_ccombinewnewt : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 1>;
+  def C2_ccombinewnewf : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 1>;
 }
 
-multiclass ALU32_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ALU32_Pbase<mnemonic, RC, PredNot, 0>;
-    // Predicate new
-    defm _cdn#NAME : ALU32_Pbase<mnemonic, RC, PredNot, 1>;
-  }
+let hasSideEffects = 0, hasNewValue = 1, isCompare = 1, InputType = "reg"  in
+class T_ALU32_3op_cmp<string mnemonic, bits<2> MinOp, bit IsNeg, bit IsComm>
+  : ALU32_rr<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+             "$Pd = "#mnemonic#"($Rs, $Rt)",
+             [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
+  let CextOpcode = mnemonic;
+  let isCommutable = IsComm;
+  bits<5> Rs;
+  bits<5> Rt;
+  bits<2> Pd;
+
+  let IClass = 0b1111;
+  let Inst{27-24} = 0b0010;
+  let Inst{22-21} = MinOp;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{4} = IsNeg;
+  let Inst{3-2} = 0b00;
+  let Inst{1-0} = Pd;
 }
 
-let InputType = "reg" in
-multiclass ALU32_base<string mnemonic, string CextOp, SDNode OpNode> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_rr in {
-    let isPredicable = 1 in
-    def NAME : ALU32_rr<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = "#mnemonic#"($src1, $src2)",
-            [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$src1),
-                                              (i32 IntRegs:$src2)))]>;
-
-    let neverHasSideEffects = 1, isPredicated = 1 in {
-      defm Pt : ALU32_Pred<mnemonic, IntRegs, 0>;
-      defm NotPt : ALU32_Pred<mnemonic, IntRegs, 1>;
-    }
-  }
+let Itinerary = ALU32_3op_tc_2early_SLOT0123 in {
+  def C2_cmpeq   : T_ALU32_3op_cmp< "cmp.eq",  0b00, 0, 1>;
+  def C2_cmpgt   : T_ALU32_3op_cmp< "cmp.gt",  0b10, 0, 0>;
+  def C2_cmpgtu  : T_ALU32_3op_cmp< "cmp.gtu", 0b11, 0, 0>;
 }
 
-defm SUB_rr : ALU32_base<"sub", "SUB", sub>, ImmRegRel, PredNewRel;
+// Patfrag to convert the usual comparison patfrags (e.g. setlt) to ones
+// that reverse the order of the operands.
+class RevCmp<PatFrag F> : PatFrag<(ops node:$rhs, node:$lhs), F.Fragment>;
 
-// Combines the two integer registers SRC1 and SRC2 into a double register.
-let isPredicable = 1 in
-class T_Combine : ALU32_rr<(outs DoubleRegs:$dst),
-                           (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = combine($src1, $src2)",
-            [(set (i64 DoubleRegs:$dst),
-              (i64 (HexagonWrapperCombineRR (i32 IntRegs:$src1),
-                                            (i32 IntRegs:$src2))))]>;
-
-multiclass Combine_base {
-  let BaseOpcode = "combine" in {
-    def NAME : T_Combine;
-    let neverHasSideEffects = 1, isPredicated = 1 in {
-      defm Pt : ALU32_Pred<"combine", DoubleRegs, 0>;
-      defm NotPt : ALU32_Pred<"combine", DoubleRegs, 1>;
-    }
-  }
-}
+// Pats for compares. They use PatFrags as operands, not SDNodes,
+// since seteq/setgt/etc. are defined as ParFrags.
+class T_cmp32_rr_pat<InstHexagon MI, PatFrag Op, ValueType VT>
+  : Pat<(VT (Op (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))),
+        (VT (MI IntRegs:$Rs, IntRegs:$Rt))>;
 
-defm COMBINE_rr : Combine_base, PredNewRel;
+def: T_cmp32_rr_pat<C2_cmpeq,  seteq, i1>;
+def: T_cmp32_rr_pat<C2_cmpgt,  setgt, i1>;
+def: T_cmp32_rr_pat<C2_cmpgtu, setugt, i1>;
 
-// Combines the two immediates SRC1 and SRC2 into a double register.
-class COMBINE_imm<Operand imm1, Operand imm2, PatLeaf pat1, PatLeaf pat2> :
-  ALU32_ii<(outs DoubleRegs:$dst), (ins imm1:$src1, imm2:$src2),
-  "$dst = combine(#$src1, #$src2)",
-  [(set (i64 DoubleRegs:$dst),
-        (i64 (HexagonWrapperCombineII (i32 pat1:$src1), (i32 pat2:$src2))))]>;
+def: T_cmp32_rr_pat<C2_cmpgt,  RevCmp<setlt>,  i1>;
+def: T_cmp32_rr_pat<C2_cmpgtu, RevCmp<setult>, i1>;
 
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 8 in
-def COMBINE_Ii : COMBINE_imm<s8Ext, s8Imm, s8ExtPred, s8ImmPred>;
+let CextOpcode = "MUX", InputType = "reg", hasNewValue = 1 in
+def C2_mux: ALU32_rr<(outs IntRegs:$Rd),
+                     (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = mux($Pu, $Rs, $Rt)", [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
+  bits<5> Rd;
+  bits<2> Pu;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let CextOpcode = "mux";
+  let InputType = "reg";
+  let hasSideEffects = 0;
+  let IClass = 0b1111;
+
+  let Inst{27-24} = 0b0100;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{6-5} = Pu;
+  let Inst{4-0} = Rd;
+}
+
+def: Pat<(i32 (select (i1 PredRegs:$Pu), (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))),
+         (C2_mux PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt)>;
+
+// Combines the two immediates into a double register.
+// Increase complexity to make it greater than any complexity of a combine
+// that involves a register.
+
+let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
+    isExtentSigned = 1, isExtendable = 1, opExtentBits = 8, opExtendable = 1,
+    AddedComplexity = 75 in
+def A2_combineii: ALU32Inst <(outs DoubleRegs:$Rdd), (ins s8Ext:$s8, s8Imm:$S8),
+  "$Rdd = combine(#$s8, #$S8)",
+  [(set (i64 DoubleRegs:$Rdd),
+        (i64 (HexagonCOMBINE(i32 s8ExtPred:$s8), (i32 s8ImmPred:$S8))))]> {
+    bits<5> Rdd;
+    bits<8> s8;
+    bits<8> S8;
+
+    let IClass = 0b0111;
+    let Inst{27-23} = 0b11000;
+    let Inst{22-16} = S8{7-1};
+    let Inst{13}    = S8{0};
+    let Inst{12-5}  = s8;
+    let Inst{4-0}   = Rdd;
+  }
 
 //===----------------------------------------------------------------------===//
-// ALU32/ALU (ADD with register-immediate form)
+// Template class for predicated ADD of a reg and an Immediate value.
 //===----------------------------------------------------------------------===//
-multiclass ALU32ri_Pbase<string mnemonic, bit isNot, bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : ALU32_ri<(outs IntRegs:$dst),
-            (ins PredRegs:$src1, IntRegs:$src2, s8Ext: $src3),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew,".new) $dst = ",
-            ") $dst = ")#mnemonic#"($src2, #$src3)",
-            []>;
-}
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_Addri_Pred <bit PredNot, bit PredNew>
+  : ALU32_ri <(outs IntRegs:$Rd),
+              (ins PredRegs:$Pu, IntRegs:$Rs, s8Ext:$s8),
+  !if(PredNot, "if (!$Pu", "if ($Pu")#!if(PredNew,".new) $Rd = ",
+  ") $Rd = ")#"add($Rs, #$s8)"> {
+    bits<5> Rd;
+    bits<2> Pu;
+    bits<5> Rs;
+    bits<8> s8;
+
+    let isPredicatedNew = PredNew;
+    let IClass = 0b0111;
+
+    let Inst{27-24} = 0b0100;
+    let Inst{23}    = PredNot;
+    let Inst{22-21} = Pu;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = PredNew;
+    let Inst{12-5}  = s8;
+    let Inst{4-0}   = Rd;
+  }
 
-multiclass ALU32ri_Pred<string mnemonic, bit PredNot> {
+//===----------------------------------------------------------------------===//
+// A2_addi: Add a signed immediate to a register.
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_Addri <Operand immOp>
+  : ALU32_ri <(outs IntRegs:$Rd),
+              (ins IntRegs:$Rs, immOp:$s16),
+  "$Rd = add($Rs, #$s16)", [], "", ALU32_ADDI_tc_1_SLOT0123> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<16> s16;
+
+    let IClass = 0b1011;
+
+    let Inst{27-21} = s16{15-9};
+    let Inst{20-16} = Rs;
+    let Inst{13-5}  = s16{8-0};
+    let Inst{4-0}   = Rd;
+  }
+
+//===----------------------------------------------------------------------===//
+// Multiclass for ADD of a register and an immediate value.
+//===----------------------------------------------------------------------===//
+multiclass Addri_Pred<string mnemonic, bit PredNot> {
   let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ALU32ri_Pbase<mnemonic, PredNot, 0>;
+    def NAME     : T_Addri_Pred<PredNot, 0>;
     // Predicate new
-    defm _cdn#NAME : ALU32ri_Pbase<mnemonic, PredNot, 1>;
+    def NAME#new : T_Addri_Pred<PredNot, 1>;
   }
 }
 
-let isExtendable = 1, InputType = "imm" in
-multiclass ALU32ri_base<string mnemonic, string CextOp, SDNode OpNode> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_ri in {
-    let opExtendable = 2, isExtentSigned = 1, opExtentBits = 16,
-    isPredicable = 1 in
-    def NAME : ALU32_ri<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, s16Ext:$src2),
-            "$dst = "#mnemonic#"($src1, #$src2)",
-            [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$src1),
-                                              (s16ExtPred:$src2)))]>;
+let isExtendable = 1, isExtentSigned = 1, InputType = "imm" in
+multiclass Addri_base<string mnemonic, SDNode OpNode> {
+  let CextOpcode = mnemonic, BaseOpcode = mnemonic#_ri in {
+    let opExtendable = 2, opExtentBits = 16, isPredicable = 1 in
+    def A2_#NAME : T_Addri<s16Ext>;
 
-    let opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
-    neverHasSideEffects = 1, isPredicated = 1 in {
-      defm Pt : ALU32ri_Pred<mnemonic, 0>;
-      defm NotPt : ALU32ri_Pred<mnemonic, 1>;
+    let opExtendable = 3, opExtentBits = 8, isPredicated = 1 in {
+      defm A2_p#NAME#t : Addri_Pred<mnemonic, 0>;
+      defm A2_p#NAME#f : Addri_Pred<mnemonic, 1>;
     }
   }
 }
 
-defm ADD_ri : ALU32ri_base<"add", "ADD", add>, ImmRegRel, PredNewRel;
+defm addi : Addri_base<"add", add>, ImmRegRel, PredNewRel;
 
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 10,
-CextOpcode = "OR", InputType = "imm" in
-def OR_ri : ALU32_ri<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, s10Ext:$src2),
-            "$dst = or($src1, #$src2)",
-            [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1),
-                                          s10ExtPred:$src2))]>, ImmRegRel;
+def: Pat<(i32 (add I32:$Rs, s16ExtPred:$s16)),
+         (i32 (A2_addi I32:$Rs, imm:$s16))>;
 
+//===----------------------------------------------------------------------===//
+// Template class used for the following ALU32 instructions.
+// Rd=and(Rs,#s10)
+// Rd=or(Rs,#s10)
+//===----------------------------------------------------------------------===//
 let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 10,
-InputType = "imm", CextOpcode = "AND" in
-def AND_ri : ALU32_ri<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, s10Ext:$src2),
-            "$dst = and($src1, #$src2)",
-            [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1),
-                                           s10ExtPred:$src2))]>, ImmRegRel;
+InputType = "imm", hasNewValue = 1 in
+class T_ALU32ri_logical <string mnemonic, SDNode OpNode, bits<2> MinOp>
+  : ALU32_ri <(outs IntRegs:$Rd),
+              (ins IntRegs:$Rs, s10Ext:$s10),
+  "$Rd = "#mnemonic#"($Rs, #$s10)" ,
+  [(set (i32 IntRegs:$Rd), (OpNode (i32 IntRegs:$Rs), s10ExtPred:$s10))]> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<10> s10;
+    let CextOpcode = mnemonic;
+
+    let IClass = 0b0111;
+
+    let Inst{27-24} = 0b0110;
+    let Inst{23-22} = MinOp;
+    let Inst{21}    = s10{9};
+    let Inst{20-16} = Rs;
+    let Inst{13-5}  = s10{8-0};
+    let Inst{4-0}   = Rd;
+  }
 
-// Nop.
-let neverHasSideEffects = 1, isCodeGenOnly = 0 in
-def NOP : ALU32_rr<(outs), (ins),
-          "nop",
-          []>;
+def A2_orir  : T_ALU32ri_logical<"or", or, 0b10>, ImmRegRel;
+def A2_andir : T_ALU32ri_logical<"and", and, 0b00>, ImmRegRel;
 
+// Subtract register from immediate
 // Rd32=sub(#s10,Rs32)
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 10,
-CextOpcode = "SUB", InputType = "imm" in
-def SUB_ri : ALU32_ri<(outs IntRegs:$dst),
-            (ins s10Ext:$src1, IntRegs:$src2),
-            "$dst = sub(#$src1, $src2)",
-            [(set IntRegs:$dst, (sub s10ExtPred:$src1, IntRegs:$src2))]>,
-            ImmRegRel;
-
-// Rd = not(Rs) gets mapped to Rd=sub(#-1, Rs).
-def : Pat<(not (i32 IntRegs:$src1)),
-          (SUB_ri -1, (i32 IntRegs:$src1))>;
-
-// Rd = neg(Rs) gets mapped to Rd=sub(#0, Rs).
-// Pattern definition for 'neg' was not necessary.
-
-multiclass TFR_Pred<bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    def _c#NAME : ALU32_rr<(outs IntRegs:$dst),
-                           (ins PredRegs:$src1, IntRegs:$src2),
-            !if(PredNot, "if (!$src1", "if ($src1")#") $dst = $src2",
-            []>;
-    // Predicate new
-    let isPredicatedNew = 1 in
-    def _cdn#NAME : ALU32_rr<(outs IntRegs:$dst),
-                             (ins PredRegs:$src1, IntRegs:$src2),
-            !if(PredNot, "if (!$src1", "if ($src1")#".new) $dst = $src2",
-            []>;
+let isExtendable = 1, CextOpcode = "sub", opExtendable = 1, isExtentSigned = 1,
+    opExtentBits = 10, InputType = "imm", hasNewValue = 1, hasSideEffects = 0 in
+def A2_subri: ALU32_ri <(outs IntRegs:$Rd), (ins s10Ext:$s10, IntRegs:$Rs),
+  "$Rd = sub(#$s10, $Rs)", []>, ImmRegRel {
+    bits<5> Rd;
+    bits<10> s10;
+    bits<5> Rs;
+
+    let IClass = 0b0111;
+
+    let Inst{27-22} = 0b011001;
+    let Inst{21}    = s10{9};
+    let Inst{20-16} = Rs;
+    let Inst{13-5}  = s10{8-0};
+    let Inst{4-0}   = Rd;
   }
+
+// Nop.
+let hasSideEffects = 0 in
+def A2_nop: ALU32Inst <(outs), (ins), "nop" > {
+  let IClass = 0b0111;
+  let Inst{27-24} = 0b1111;
 }
 
-let InputType = "reg", neverHasSideEffects = 1 in
-multiclass TFR_base<string CextOp> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp in {
-    let isPredicable = 1 in
-    def NAME : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
-            "$dst = $src1",
-            []>;
+def: Pat<(sub s10ExtPred:$s10, IntRegs:$Rs),
+         (A2_subri imm:$s10, IntRegs:$Rs)>;
 
-    let  isPredicated = 1 in {
-      defm Pt : TFR_Pred<0>;
-      defm NotPt : TFR_Pred<1>;
-    }
+// Rd = not(Rs) gets mapped to Rd=sub(#-1, Rs).
+def: Pat<(not (i32 IntRegs:$src1)),
+         (A2_subri -1, IntRegs:$src1)>;
+
+let hasSideEffects = 0, hasNewValue = 1 in
+class T_tfr16<bit isHi>
+  : ALU32Inst <(outs IntRegs:$Rx), (ins IntRegs:$src1, u16Imm:$u16),
+  "$Rx"#!if(isHi, ".h", ".l")#" = #$u16",
+  [], "$src1 = $Rx" > {
+    bits<5> Rx;
+    bits<16> u16;
+
+    let IClass = 0b0111;
+    let Inst{27-26} = 0b00;
+    let Inst{25-24} = !if(isHi, 0b10, 0b01);
+    let Inst{23-22} = u16{15-14};
+    let Inst{21}    = 0b1;
+    let Inst{20-16} = Rx;
+    let Inst{13-0}  = u16{13-0};
   }
-}
 
-class T_TFR64_Pred<bit PredNot, bit isPredNew>
-            : ALU32_rr<(outs DoubleRegs:$dst),
-                       (ins PredRegs:$src1, DoubleRegs:$src2),
-            !if(PredNot, "if (!$src1", "if ($src1")#
-            !if(isPredNew, ".new) ", ") ")#"$dst = $src2", []>
-{
+def A2_tfril: T_tfr16<0>;
+def A2_tfrih: T_tfr16<1>;
+
+// Conditional transfer is an alias to conditional "Rd = add(Rs, #0)".
+let isPredicated = 1, hasNewValue = 1, opNewValue = 0 in
+class T_tfr_pred<bit isPredNot, bit isPredNew>
+  : ALU32Inst<(outs IntRegs:$dst),
+              (ins PredRegs:$src1, IntRegs:$src2),
+              "if ("#!if(isPredNot, "!", "")#
+              "$src1"#!if(isPredNew, ".new", "")#
+              ") $dst = $src2"> {
     bits<5> dst;
     bits<2> src1;
     bits<5> src2;
 
-    let IClass = 0b1111;
-    let Inst{27-24} = 0b1101;
+    let isPredicatedFalse = isPredNot;
+    let isPredicatedNew = isPredNew;
+    let IClass = 0b0111;
+
+    let Inst{27-24} = 0b0100;
+    let Inst{23} = isPredNot;
     let Inst{13} = isPredNew;
-    let Inst{7} = PredNot;
+    let Inst{12-5} = 0;
     let Inst{4-0} = dst;
-    let Inst{6-5} = src1;
-    let Inst{20-17} = src2{4-1};
-    let Inst{16} = 0b1;
-    let Inst{12-9} = src2{4-1};
-    let Inst{8} = 0b0;
-}
+    let Inst{22-21} = src1;
+    let Inst{20-16} = src2;
+  }
 
-multiclass TFR64_Pred<bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    def _c#NAME : T_TFR64_Pred<PredNot, 0>;
+let isPredicable = 1 in
+class T_tfr : ALU32Inst<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = $src"> {
+    bits<5> dst;
+    bits<5> src;
 
-    let isPredicatedNew = 1 in
-    def _cdn#NAME : T_TFR64_Pred<PredNot, 1>; // Predicate new
+    let IClass = 0b0111;
+
+    let Inst{27-21} = 0b0000011;
+    let Inst{20-16} = src;
+    let Inst{13}    = 0b0;
+    let Inst{4-0}   = dst;
+  }
+
+let InputType = "reg", hasNewValue = 1, hasSideEffects = 0 in
+multiclass tfr_base<string CextOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp in {
+    def NAME : T_tfr;
+
+    // Predicate
+    def t : T_tfr_pred<0, 0>;
+    def f : T_tfr_pred<1, 0>;
+    // Predicate new
+    def tnew : T_tfr_pred<0, 1>;
+    def fnew : T_tfr_pred<1, 1>;
   }
 }
 
-let neverHasSideEffects = 1 in
+// Assembler mapped to C2_ccombinew[t|f|newt|newf].
+// Please don't add bits to this instruction as it'll be converted into
+// 'combine' before object code emission.
+let isPredicated = 1 in
+class T_tfrp_pred<bit PredNot, bit PredNew>
+  : ALU32_rr <(outs DoubleRegs:$dst),
+              (ins PredRegs:$src1, DoubleRegs:$src2),
+  "if ("#!if(PredNot, "!", "")#"$src1"
+        #!if(PredNew, ".new", "")#") $dst = $src2" > {
+    let isPredicatedFalse = PredNot;
+    let isPredicatedNew = PredNew;
+  }
+
+// Assembler mapped to A2_combinew.
+// Please don't add bits to this instruction as it'll be converted into
+// 'combine' before object code emission.
+class T_tfrp : ALU32Inst <(outs DoubleRegs:$dst),
+               (ins DoubleRegs:$src),
+    "$dst = $src">;
+
+let hasSideEffects = 0 in
 multiclass TFR64_base<string BaseName> {
   let BaseOpcode = BaseName in {
     let isPredicable = 1 in
-    def NAME : ALU32Inst <(outs DoubleRegs:$dst),
-                          (ins DoubleRegs:$src1),
-                          "$dst = $src1" > {
-        bits<5> dst;
-        bits<5> src1;
-
-        let IClass = 0b1111;
-        let Inst{27-23} = 0b01010;
-        let Inst{4-0} = dst;
-        let Inst{20-17} = src1{4-1};
-        let Inst{16} = 0b1;
-        let Inst{12-9} = src1{4-1};
-        let Inst{8} = 0b0;
-    }
-
-    let  isPredicated = 1 in {
-      defm Pt : TFR64_Pred<0>;
-      defm NotPt : TFR64_Pred<1>;
-    }
+    def NAME : T_tfrp;
+    // Predicate
+    def t : T_tfrp_pred <0, 0>;
+    def f : T_tfrp_pred <1, 0>;
+    // Predicate new
+    def tnew : T_tfrp_pred <0, 1>;
+    def fnew : T_tfrp_pred <1, 1>;
   }
 }
 
-multiclass TFRI_Pred<bit PredNot> {
-  let isMoveImm = 1, isPredicatedFalse = PredNot in {
-    def _c#NAME : ALU32_ri<(outs IntRegs:$dst),
-                           (ins PredRegs:$src1, s12Ext:$src2),
-            !if(PredNot, "if (!$src1", "if ($src1")#") $dst = #$src2",
-            []>;
+let InputType = "imm", isExtendable = 1, isExtentSigned = 1, opExtentBits = 12,
+    isMoveImm = 1, opExtendable = 2, BaseOpcode = "TFRI", CextOpcode = "TFR",
+    hasSideEffects = 0, isPredicated = 1, hasNewValue = 1 in
+class T_TFRI_Pred<bit PredNot, bit PredNew>
+  : ALU32_ri<(outs IntRegs:$Rd), (ins PredRegs:$Pu, s12Ext:$s12),
+    "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") $Rd = #$s12",
+    [], "", ALU32_2op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
+  let isPredicatedFalse = PredNot;
+  let isPredicatedNew = PredNew;
 
-    // Predicate new
-    let isPredicatedNew = 1 in
-    def _cdn#NAME : ALU32_rr<(outs IntRegs:$dst),
-                             (ins PredRegs:$src1, s12Ext:$src2),
-            !if(PredNot, "if (!$src1", "if ($src1")#".new) $dst = #$src2",
-            []>;
-  }
-}
-
-let InputType = "imm", isExtendable = 1, isExtentSigned = 1 in
-multiclass TFRI_base<string CextOp> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#I in {
-    let isAsCheapAsAMove = 1 , opExtendable = 1, opExtentBits = 16,
-    isMoveImm = 1, isPredicable = 1, isReMaterializable = 1 in
-    def NAME : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1),
-            "$dst = #$src1",
-            [(set (i32 IntRegs:$dst), s16ExtPred:$src1)]>;
-
-    let opExtendable = 2,  opExtentBits = 12, neverHasSideEffects = 1,
-    isPredicated = 1 in {
-      defm Pt    : TFRI_Pred<0>;
-      defm NotPt : TFRI_Pred<1>;
-    }
-  }
+  bits<5> Rd;
+  bits<2> Pu;
+  bits<12> s12;
+
+  let IClass = 0b0111;
+  let Inst{27-24} = 0b1110;
+  let Inst{23} = PredNot;
+  let Inst{22-21} = Pu;
+  let Inst{20} = 0b0;
+  let Inst{19-16,12-5} = s12;
+  let Inst{13} = PredNew;
+  let Inst{4-0} = Rd;
 }
 
-defm TFRI : TFRI_base<"TFR">, ImmRegRel, PredNewRel;
-defm TFR : TFR_base<"TFR">, ImmRegRel, PredNewRel;
-defm TFR64 : TFR64_base<"TFR64">, PredNewRel;
+def C2_cmoveit    : T_TFRI_Pred<0, 0>;
+def C2_cmoveif    : T_TFRI_Pred<1, 0>;
+def C2_cmovenewit : T_TFRI_Pred<0, 1>;
+def C2_cmovenewif : T_TFRI_Pred<1, 1>;
+
+let InputType = "imm", isExtendable = 1, isExtentSigned = 1,
+    CextOpcode = "TFR", BaseOpcode = "TFRI", hasNewValue = 1, opNewValue = 0,
+    isAsCheapAsAMove = 1 , opExtendable = 1, opExtentBits = 16, isMoveImm = 1,
+    isPredicated = 0, isPredicable = 1, isReMaterializable = 1 in
+def A2_tfrsi : ALU32Inst<(outs IntRegs:$Rd), (ins s16Ext:$s16), "$Rd = #$s16",
+    [(set (i32 IntRegs:$Rd), s16ExtPred:$s16)], "", ALU32_2op_tc_1_SLOT0123>,
+    ImmRegRel, PredRel {
+  bits<5> Rd;
+  bits<16> s16;
+
+  let IClass = 0b0111;
+  let Inst{27-24} = 0b1000;
+  let Inst{23-22,20-16,13-5} = s16;
+  let Inst{4-0} = Rd;
+}
+
+defm A2_tfr  : tfr_base<"TFR">, ImmRegRel, PredNewRel;
+let isAsmParserOnly = 1 in
+defm A2_tfrp : TFR64_base<"TFR64">, PredNewRel;
+
+// Assembler mapped
+let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
+    isAsmParserOnly = 1 in
+def A2_tfrpi : ALU64_rr<(outs DoubleRegs:$dst), (ins s8Imm64:$src1),
+                      "$dst = #$src1",
+                      [(set (i64 DoubleRegs:$dst), s8Imm64Pred:$src1)]>;
+
+// TODO: see if this instruction can be deleted..
+let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
+    isAsmParserOnly = 1 in
+def TFRI64_V4 : ALU64_rr<(outs DoubleRegs:$dst), (ins u6Ext:$src1),
+                         "$dst = #$src1">;
 
-// Transfer control register.
-let neverHasSideEffects = 1 in
-def TFCR : CRInst<(outs CRRegs:$dst), (ins IntRegs:$src1),
-           "$dst = $src1",
-           []>;
 //===----------------------------------------------------------------------===//
 // ALU32/ALU -
 //===----------------------------------------------------------------------===//
@@ -462,159 +649,344 @@ def TFCR : CRInst<(outs CRRegs:$dst), (ins IntRegs:$src1),
 //===----------------------------------------------------------------------===//
 // ALU32/PERM +
 //===----------------------------------------------------------------------===//
+// Scalar mux register immediate.
+let hasSideEffects = 0, isExtentSigned = 1, CextOpcode = "MUX",
+    InputType = "imm", hasNewValue = 1, isExtendable = 1, opExtentBits = 8 in
+class T_MUX1 <bit MajOp, dag ins, string AsmStr>
+      : ALU32Inst <(outs IntRegs:$Rd), ins, AsmStr>, ImmRegRel {
+  bits<5> Rd;
+  bits<2> Pu;
+  bits<8> s8;
+  bits<5> Rs;
+
+  let IClass = 0b0111;
+  let Inst{27-24} = 0b0011;
+  let Inst{23} = MajOp;
+  let Inst{22-21} = Pu;
+  let Inst{20-16} = Rs;
+  let Inst{13}    = 0b0;
+  let Inst{12-5}  = s8;
+  let Inst{4-0}   = Rd;
+}
+
+let opExtendable = 2 in
+def C2_muxri : T_MUX1<0b1, (ins PredRegs:$Pu, s8Ext:$s8, IntRegs:$Rs),
+                           "$Rd = mux($Pu, #$s8, $Rs)">;
+
+let opExtendable = 3 in
+def C2_muxir : T_MUX1<0b0, (ins PredRegs:$Pu, IntRegs:$Rs, s8Ext:$s8),
+                           "$Rd = mux($Pu, $Rs, #$s8)">;
+
+def : Pat<(i32 (select I1:$Pu, s8ExtPred:$s8, I32:$Rs)),
+          (C2_muxri I1:$Pu, s8ExtPred:$s8, I32:$Rs)>;
+
+def : Pat<(i32 (select I1:$Pu, I32:$Rs, s8ExtPred:$s8)),
+          (C2_muxir I1:$Pu, I32:$Rs, s8ExtPred:$s8)>;
+
+// C2_muxii: Scalar mux immediates.
+let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1,
+    opExtentBits = 8, opExtendable = 2 in
+def C2_muxii: ALU32Inst <(outs IntRegs:$Rd),
+                         (ins PredRegs:$Pu, s8Ext:$s8, s8Imm:$S8),
+  "$Rd = mux($Pu, #$s8, #$S8)" ,
+  [(set (i32 IntRegs:$Rd),
+        (i32 (select I1:$Pu, s8ExtPred:$s8, s8ImmPred:$S8)))] > {
+    bits<5> Rd;
+    bits<2> Pu;
+    bits<8> s8;
+    bits<8> S8;
+
+    let IClass = 0b0111;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-23} = Pu;
+    let Inst{22-16} = S8{7-1};
+    let Inst{13}    = S8{0};
+    let Inst{12-5}  = s8;
+    let Inst{4-0}   = Rd;
+  }
+
+//===----------------------------------------------------------------------===//
+// template class for non-predicated alu32_2op instructions
+// - aslh, asrh, sxtb, sxth, zxth
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1, opNewValue = 0 in
+class T_ALU32_2op <string mnemonic, bits<3> minOp> :
+  ALU32Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
+             "$Rd = "#mnemonic#"($Rs)", [] > {
+  bits<5> Rd;
+  bits<5> Rs;
+
+  let IClass = 0b0111;
+
+  let Inst{27-24} = 0b0000;
+  let Inst{23-21} = minOp;
+  let Inst{13} = 0b0;
+  let Inst{4-0} = Rd;
+  let Inst{20-16} = Rs;
+}
+
+//===----------------------------------------------------------------------===//
+// template class for predicated alu32_2op instructions
+// - aslh, asrh, sxtb, sxth, zxtb, zxth
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_ALU32_2op_Pred <string mnemonic, bits<3> minOp, bit isPredNot,
+                        bit isPredNew > :
+  ALU32Inst <(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs),
+             !if(isPredNot, "if (!$Pu", "if ($Pu")
+             #!if(isPredNew, ".new) ",") ")#"$Rd = "#mnemonic#"($Rs)"> {
+  bits<5> Rd;
+  bits<2> Pu;
+  bits<5> Rs;
+
+  let IClass = 0b0111;
 
-let neverHasSideEffects = 1 in
-def COMBINE_ii : ALU32_ii<(outs DoubleRegs:$dst),
-            (ins s8Imm:$src1, s8Imm:$src2),
-            "$dst = combine(#$src1, #$src2)",
-            []>;
-
-// Mux.
-def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
-                                                   DoubleRegs:$src2,
-                                                   DoubleRegs:$src3),
-            "$dst = vmux($src1, $src2, $src3)",
-            []>;
-
-let CextOpcode = "MUX", InputType = "reg" in
-def MUX_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
-                                            IntRegs:$src2, IntRegs:$src3),
-             "$dst = mux($src1, $src2, $src3)",
-             [(set (i32 IntRegs:$dst),
-                   (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2),
-                                (i32 IntRegs:$src3))))]>, ImmRegRel;
-
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8,
-CextOpcode = "MUX", InputType = "imm" in
-def MUX_ir : ALU32_ir<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Ext:$src2,
-                                                IntRegs:$src3),
-             "$dst = mux($src1, #$src2, $src3)",
-             [(set (i32 IntRegs:$dst),
-                   (i32 (select (i1 PredRegs:$src1), s8ExtPred:$src2,
-                                (i32 IntRegs:$src3))))]>, ImmRegRel;
-
-let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
-CextOpcode = "MUX", InputType = "imm" in
-def MUX_ri : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2,
-                                                s8Ext:$src3),
-             "$dst = mux($src1, $src2, #$src3)",
-             [(set (i32 IntRegs:$dst),
-                   (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2),
-                                 s8ExtPred:$src3)))]>, ImmRegRel;
-
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8 in
-def MUX_ii : ALU32_ii<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Ext:$src2,
-                                                s8Imm:$src3),
-             "$dst = mux($src1, #$src2, #$src3)",
-             [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1),
-                                                    s8ExtPred:$src2,
-                                                    s8ImmPred:$src3)))]>;
-
-// ALU32 - aslh, asrh, sxtb, sxth, zxtb, zxth
-multiclass ALU32_2op_Pbase<string mnemonic, bit isNot, bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : ALU32Inst<(outs IntRegs:$dst),
-                       (ins PredRegs:$src1, IntRegs:$src2),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew,".new) $dst = ",
-            ") $dst = ")#mnemonic#"($src2)">,
-            Requires<[HasV4T]>;
-}
-
-multiclass ALU32_2op_Pred<string mnemonic, bit PredNot> {
+  let Inst{27-24} = 0b0000;
+  let Inst{23-21} = minOp;
+  let Inst{13} = 0b1;
+  let Inst{11} = isPredNot;
+  let Inst{10} = isPredNew;
+  let Inst{4-0} = Rd;
+  let Inst{9-8} = Pu;
+  let Inst{20-16} = Rs;
+}
+
+multiclass ALU32_2op_Pred<string mnemonic, bits<3> minOp, bit PredNot> {
   let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ALU32_2op_Pbase<mnemonic, PredNot, 0>;
+    def NAME : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 0>;
+
     // Predicate new
-    defm _cdn#NAME : ALU32_2op_Pbase<mnemonic, PredNot, 1>;
+    let isPredicatedNew = 1 in
+    def NAME#new : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 1>;
   }
 }
 
-multiclass ALU32_2op_base<string mnemonic> {
+multiclass ALU32_2op_base<string mnemonic, bits<3> minOp> {
   let BaseOpcode = mnemonic in {
-    let isPredicable = 1, neverHasSideEffects = 1 in
-    def NAME : ALU32Inst<(outs IntRegs:$dst),
-                         (ins IntRegs:$src1),
-            "$dst = "#mnemonic#"($src1)">;
-
-    let Predicates = [HasV4T], validSubTargets = HasV4SubT, isPredicated = 1,
-    neverHasSideEffects = 1 in {
-      defm Pt_V4    : ALU32_2op_Pred<mnemonic, 0>;
-      defm NotPt_V4 : ALU32_2op_Pred<mnemonic, 1>;
+    let isPredicable = 1, hasSideEffects = 0 in
+    def A2_#NAME : T_ALU32_2op<mnemonic, minOp>;
+
+    let isPredicated = 1, hasSideEffects = 0 in {
+      defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>;
+      defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>;
     }
   }
 }
 
-defm ASLH : ALU32_2op_base<"aslh">, PredNewRel;
-defm ASRH : ALU32_2op_base<"asrh">, PredNewRel;
-defm SXTB : ALU32_2op_base<"sxtb">, PredNewRel;
-defm SXTH : ALU32_2op_base<"sxth">,  PredNewRel;
-defm ZXTB : ALU32_2op_base<"zxtb">, PredNewRel;
-defm ZXTH : ALU32_2op_base<"zxth">,  PredNewRel;
+defm aslh : ALU32_2op_base<"aslh", 0b000>, PredNewRel;
+defm asrh : ALU32_2op_base<"asrh", 0b001>, PredNewRel;
+defm sxtb : ALU32_2op_base<"sxtb", 0b101>, PredNewRel;
+defm sxth : ALU32_2op_base<"sxth", 0b111>, PredNewRel;
+defm zxth : ALU32_2op_base<"zxth", 0b110>, PredNewRel;
+
+// Rd=zxtb(Rs): assembler mapped to Rd=and(Rs,#255).
+// Compiler would want to generate 'zxtb' instead of 'and' becuase 'zxtb' has
+// predicated forms while 'and' doesn't. Since integrated assembler can't
+// handle 'mapped' instructions, we need to encode 'zxtb' same as 'and' where
+// immediate operand is set to '255'.
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_ZXTB: ALU32Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rs),
+  "$Rd = zxtb($Rs)", [] > { // Rd = and(Rs,255)
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<10> s10 = 255;
+
+    let IClass = 0b0111;
+
+    let Inst{27-22} = 0b011000;
+    let Inst{4-0} = Rd;
+    let Inst{20-16} = Rs;
+    let Inst{21} = s10{9};
+    let Inst{13-5} = s10{8-0};
+}
 
-def : Pat <(shl (i32 IntRegs:$src1), (i32 16)),
-           (ASLH IntRegs:$src1)>;
+//Rd=zxtb(Rs): assembler mapped to "Rd=and(Rs,#255)
+multiclass ZXTB_base <string mnemonic, bits<3> minOp> {
+  let BaseOpcode = mnemonic in {
+    let isPredicable = 1, hasSideEffects = 0 in
+    def A2_#NAME : T_ZXTB;
 
-def : Pat <(sra (i32 IntRegs:$src1), (i32 16)),
-           (ASRH IntRegs:$src1)>;
+    let isPredicated = 1, hasSideEffects = 0 in {
+      defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>;
+      defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>;
+    }
+  }
+}
 
-def : Pat <(sext_inreg (i32 IntRegs:$src1), i8),
-           (SXTB IntRegs:$src1)>;
+defm zxtb : ZXTB_base<"zxtb",0b100>, PredNewRel;
 
-def : Pat <(sext_inreg (i32 IntRegs:$src1), i16),
-           (SXTH IntRegs:$src1)>;
+def: Pat<(shl I32:$src1, (i32 16)),   (A2_aslh I32:$src1)>;
+def: Pat<(sra I32:$src1, (i32 16)),   (A2_asrh I32:$src1)>;
+def: Pat<(sext_inreg I32:$src1, i8),  (A2_sxtb I32:$src1)>;
+def: Pat<(sext_inreg I32:$src1, i16), (A2_sxth I32:$src1)>;
 
 //===----------------------------------------------------------------------===//
-// ALU32/PERM -
+// Template class for vector add and avg
 //===----------------------------------------------------------------------===//
 
+class T_VectALU_64 <string opc, bits<3> majOp, bits<3> minOp,
+                   bit isSat, bit isRnd, bit isCrnd, bit SwapOps >
+  : ALU64_rr < (outs DoubleRegs:$Rdd),
+                (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rdd = "#opc#"($Rss, $Rtt)"#!if(isRnd, ":rnd", "")
+                             #!if(isCrnd,":crnd","")
+                             #!if(isSat, ":sat", ""),
+  [], "", ALU64_tc_2_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1101;
+
+    let Inst{27-24} = 0b0011;
+    let Inst{23-21} = majOp;
+    let Inst{20-16} = !if (SwapOps, Rtt, Rss);
+    let Inst{12-8} = !if (SwapOps, Rss, Rtt);
+    let Inst{7-5} = minOp;
+    let Inst{4-0} = Rdd;
+  }
 
-//===----------------------------------------------------------------------===//
-// ALU32/PRED +
-//===----------------------------------------------------------------------===//
+// ALU64 - Vector add
+// Rdd=vadd[u][bhw](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in {
+  def A2_vaddub  : T_VectALU_64 < "vaddub", 0b000, 0b000, 0, 0, 0, 0>;
+  def A2_vaddh   : T_VectALU_64 < "vaddh",  0b000, 0b010, 0, 0, 0, 0>;
+  def A2_vaddw   : T_VectALU_64 < "vaddw",  0b000, 0b101, 0, 0, 0, 0>;
+}
 
-// Compare.
-defm CMPGTU : CMP32_rr_ri_u9<"cmp.gtu", "CMPGTU", setugt>, ImmRegRel;
-defm CMPGT : CMP32_rr_ri_s10<"cmp.gt", "CMPGT", setgt>, ImmRegRel;
-defm CMPEQ : CMP32_rr_ri_s10<"cmp.eq", "CMPEQ", seteq>, ImmRegRel;
+// Rdd=vadd[u][bhw](Rss,Rtt):sat
+let Defs = [USR_OVF] in {
+  def A2_vaddubs : T_VectALU_64 < "vaddub", 0b000, 0b001, 1, 0, 0, 0>;
+  def A2_vaddhs  : T_VectALU_64 < "vaddh",  0b000, 0b011, 1, 0, 0, 0>;
+  def A2_vadduhs : T_VectALU_64 < "vadduh", 0b000, 0b100, 1, 0, 0, 0>;
+  def A2_vaddws  : T_VectALU_64 < "vaddw",  0b000, 0b110, 1, 0, 0, 0>;
+}
 
-// SDNode for converting immediate C to C-1.
-def DEC_CONST_SIGNED : SDNodeXForm<imm, [{
-   // Return the byte immediate const-1 as an SDNode.
-   int32_t imm = N->getSExtValue();
-   return XformSToSM1Imm(imm);
-}]>;
+// ALU64 - Vector average
+// Rdd=vavg[u][bhw](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in {
+  def A2_vavgub : T_VectALU_64 < "vavgub", 0b010, 0b000, 0, 0, 0, 0>;
+  def A2_vavgh  : T_VectALU_64 < "vavgh",  0b010, 0b010, 0, 0, 0, 0>;
+  def A2_vavguh : T_VectALU_64 < "vavguh", 0b010, 0b101, 0, 0, 0, 0>;
+  def A2_vavgw  : T_VectALU_64 < "vavgw",  0b011, 0b000, 0, 0, 0, 0>;
+  def A2_vavguw : T_VectALU_64 < "vavguw", 0b011, 0b011, 0, 0, 0, 0>;
+}
 
-// SDNode for converting immediate C to C-1.
-def DEC_CONST_UNSIGNED : SDNodeXForm<imm, [{
-   // Return the byte immediate const-1 as an SDNode.
-   uint32_t imm = N->getZExtValue();
-   return XformUToUM1Imm(imm);
-}]>;
+// Rdd=vavg[u][bhw](Rss,Rtt)[:rnd|:crnd]
+def A2_vavgubr : T_VectALU_64 < "vavgub", 0b010, 0b001, 0, 1, 0, 0>;
+def A2_vavghr  : T_VectALU_64 < "vavgh",  0b010, 0b011, 0, 1, 0, 0>;
+def A2_vavghcr : T_VectALU_64 < "vavgh",  0b010, 0b100, 0, 0, 1, 0>;
+def A2_vavguhr : T_VectALU_64 < "vavguh", 0b010, 0b110, 0, 1, 0, 0>;
+
+def A2_vavgwr  : T_VectALU_64 < "vavgw",  0b011, 0b001, 0, 1, 0, 0>;
+def A2_vavgwcr : T_VectALU_64 < "vavgw",  0b011, 0b010, 0, 0, 1, 0>;
+def A2_vavguwr : T_VectALU_64 < "vavguw", 0b011, 0b100, 0, 1, 0, 0>;
+
+// Rdd=vnavg[bh](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in {
+  def A2_vnavgh   : T_VectALU_64 < "vnavgh", 0b100, 0b000, 0, 0, 0, 1>;
+  def A2_vnavgw   : T_VectALU_64 < "vnavgw", 0b100, 0b011, 0, 0, 0, 1>;
+}
+
+// Rdd=vnavg[bh](Rss,Rtt)[:rnd|:crnd]:sat
+let Defs = [USR_OVF] in {
+  def A2_vnavghr  : T_VectALU_64 < "vnavgh", 0b100, 0b001, 1, 1, 0, 1>;
+  def A2_vnavghcr : T_VectALU_64 < "vnavgh", 0b100, 0b010, 1, 0, 1, 1>;
+  def A2_vnavgwr  : T_VectALU_64 < "vnavgw", 0b100, 0b100, 1, 1, 0, 1>;
+  def A2_vnavgwcr : T_VectALU_64 < "vnavgw", 0b100, 0b110, 1, 0, 1, 1>;
+}
+
+// Rdd=vsub[u][bh](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in {
+  def A2_vsubub  : T_VectALU_64 < "vsubub", 0b001, 0b000, 0, 0, 0, 1>;
+  def A2_vsubh   : T_VectALU_64 < "vsubh",  0b001, 0b010, 0, 0, 0, 1>;
+  def A2_vsubw   : T_VectALU_64 < "vsubw",  0b001, 0b101, 0, 0, 0, 1>;
+}
+
+// Rdd=vsub[u][bh](Rss,Rtt):sat
+let Defs = [USR_OVF] in {
+  def A2_vsububs : T_VectALU_64 < "vsubub", 0b001, 0b001, 1, 0, 0, 1>;
+  def A2_vsubhs  : T_VectALU_64 < "vsubh",  0b001, 0b011, 1, 0, 0, 1>;
+  def A2_vsubuhs : T_VectALU_64 < "vsubuh", 0b001, 0b100, 1, 0, 0, 1>;
+  def A2_vsubws  : T_VectALU_64 < "vsubw",  0b001, 0b110, 1, 0, 0, 1>;
+}
 
-def CTLZ_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
-    "$dst = cl0($src1)",
-    [(set (i32 IntRegs:$dst), (ctlz (i32 IntRegs:$src1)))]>;
+// Rdd=vmax[u][bhw](Rss,Rtt)
+def A2_vmaxb  : T_VectALU_64 < "vmaxb",  0b110, 0b110, 0, 0, 0, 1>;
+def A2_vmaxub : T_VectALU_64 < "vmaxub", 0b110, 0b000, 0, 0, 0, 1>;
+def A2_vmaxh  : T_VectALU_64 < "vmaxh",  0b110, 0b001, 0, 0, 0, 1>;
+def A2_vmaxuh : T_VectALU_64 < "vmaxuh", 0b110, 0b010, 0, 0, 0, 1>;
+def A2_vmaxw  : T_VectALU_64 < "vmaxw",  0b110, 0b011, 0, 0, 0, 1>;
+def A2_vmaxuw : T_VectALU_64 < "vmaxuw", 0b101, 0b101, 0, 0, 0, 1>;
+
+// Rdd=vmin[u][bhw](Rss,Rtt)
+def A2_vminb  : T_VectALU_64 < "vminb",  0b110, 0b111, 0, 0, 0, 1>;
+def A2_vminub : T_VectALU_64 < "vminub", 0b101, 0b000, 0, 0, 0, 1>;
+def A2_vminh  : T_VectALU_64 < "vminh",  0b101, 0b001, 0, 0, 0, 1>;
+def A2_vminuh : T_VectALU_64 < "vminuh", 0b101, 0b010, 0, 0, 0, 1>;
+def A2_vminw  : T_VectALU_64 < "vminw",  0b101, 0b011, 0, 0, 0, 1>;
+def A2_vminuw : T_VectALU_64 < "vminuw", 0b101, 0b100, 0, 0, 0, 1>;
 
-def CTTZ_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
-    "$dst = ct0($src1)",
-    [(set (i32 IntRegs:$dst), (cttz (i32 IntRegs:$src1)))]>;
+//===----------------------------------------------------------------------===//
+// Template class for vector compare
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_vcmp <string Str, bits<4> minOp>
+  : ALU64_rr <(outs PredRegs:$Pd),
+              (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Pd = "#Str#"($Rss, $Rtt)", [],
+  "", ALU64_tc_2early_SLOT23> {
+    bits<2> Pd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b00100;
+    let Inst{13} = minOp{3};
+    let Inst{7-5} = minOp{2-0};
+    let Inst{1-0} = Pd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
 
-def CTLZ64_rr : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
-    "$dst = cl0($src1)",
-    [(set (i32 IntRegs:$dst), (i32 (trunc (ctlz (i64 DoubleRegs:$src1)))))]>;
+class T_vcmp_pat<InstHexagon MI, PatFrag Op, ValueType T>
+  : Pat<(i1 (Op (T DoubleRegs:$Rss), (T DoubleRegs:$Rtt))),
+        (i1 (MI DoubleRegs:$Rss, DoubleRegs:$Rtt))>;
+
+// Vector compare bytes
+def A2_vcmpbeq  : T_vcmp <"vcmpb.eq",  0b0110>;
+def A2_vcmpbgtu : T_vcmp <"vcmpb.gtu", 0b0111>;
+
+// Vector compare halfwords
+def A2_vcmpheq  : T_vcmp <"vcmph.eq",  0b0011>;
+def A2_vcmphgt  : T_vcmp <"vcmph.gt",  0b0100>;
+def A2_vcmphgtu : T_vcmp <"vcmph.gtu", 0b0101>;
+
+// Vector compare words
+def A2_vcmpweq  : T_vcmp <"vcmpw.eq",  0b0000>;
+def A2_vcmpwgt  : T_vcmp <"vcmpw.gt",  0b0001>;
+def A2_vcmpwgtu : T_vcmp <"vcmpw.gtu", 0b0010>;
+
+def: T_vcmp_pat<A2_vcmpbeq,  seteq,  v8i8>;
+def: T_vcmp_pat<A2_vcmpbgtu, setugt, v8i8>;
+def: T_vcmp_pat<A2_vcmpheq,  seteq,  v4i16>;
+def: T_vcmp_pat<A2_vcmphgt,  setgt,  v4i16>;
+def: T_vcmp_pat<A2_vcmphgtu, setugt, v4i16>;
+def: T_vcmp_pat<A2_vcmpweq,  seteq,  v2i32>;
+def: T_vcmp_pat<A2_vcmpwgt,  setgt,  v2i32>;
+def: T_vcmp_pat<A2_vcmpwgtu, setugt, v2i32>;
 
-def CTTZ64_rr : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
-    "$dst = ct0($src1)",
-    [(set (i32 IntRegs:$dst), (i32 (trunc (cttz (i64 DoubleRegs:$src1)))))]>;
+//===----------------------------------------------------------------------===//
+// ALU32/PERM -
+//===----------------------------------------------------------------------===//
 
-def TSTBIT_rr : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-    "$dst = tstbit($src1, $src2)",
-    [(set (i1 PredRegs:$dst),
-          (setne (and (shl 1, (i32 IntRegs:$src2)), (i32 IntRegs:$src1)), 0))]>;
 
-def TSTBIT_ri : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-    "$dst = tstbit($src1, $src2)",
-    [(set (i1 PredRegs:$dst),
-          (setne (and (shl 1, (u5ImmPred:$src2)), (i32 IntRegs:$src1)), 0))]>;
+//===----------------------------------------------------------------------===//
+// ALU32/PRED +
+//===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
 // ALU32/PRED -
@@ -625,112 +997,280 @@ def TSTBIT_ri : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
 // ALU64/ALU +
 //===----------------------------------------------------------------------===//
 // Add.
-def ADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                     DoubleRegs:$src2),
-               "$dst = add($src1, $src2)",
-               [(set (i64 DoubleRegs:$dst), (add (i64 DoubleRegs:$src1),
-                                                 (i64 DoubleRegs:$src2)))]>;
+//===----------------------------------------------------------------------===//
+// Template Class
+// Add/Subtract halfword
+// Rd=add(Rt.L,Rs.[HL])[:sat]
+// Rd=sub(Rt.L,Rs.[HL])[:sat]
+// Rd=add(Rt.[LH],Rs.[HL])[:sat][:<16]
+// Rd=sub(Rt.[LH],Rs.[HL])[:sat][:<16]
+//===----------------------------------------------------------------------===//
 
-// Add halfword.
+let  hasNewValue = 1, opNewValue = 0 in
+class T_XTYPE_ADD_SUB <bits<2> LHbits, bit isSat, bit hasShift, bit isSub>
+  : ALU64Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
+  "$Rd = "#!if(isSub,"sub","add")#"($Rt."
+          #!if(hasShift, !if(LHbits{1},"h","l"),"l") #", $Rs."
+          #!if(hasShift, !if(LHbits{0},"h)","l)"), !if(LHbits{1},"h)","l)"))
+          #!if(isSat,":sat","")
+          #!if(hasShift,":<<16",""), [], "", ALU64_tc_1_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rt;
+    bits<5> Rs;
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b01010;
+    let Inst{22} = hasShift;
+    let Inst{21} = isSub;
+    let Inst{7} = isSat;
+    let Inst{6-5} = LHbits;
+    let Inst{4-0} = Rd;
+    let Inst{12-8} = Rt;
+    let Inst{20-16} = Rs;
+  }
 
-// Compare.
-defm CMPEHexagon4 : CMP64_rr<"cmp.eq", seteq>;
-defm CMPGT64 : CMP64_rr<"cmp.gt", setgt>;
-defm CMPGTU64 : CMP64_rr<"cmp.gtu", setugt>;
-
-// Logical operations.
-def AND_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                     DoubleRegs:$src2),
-               "$dst = and($src1, $src2)",
-               [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1),
-                                                 (i64 DoubleRegs:$src2)))]>;
-
-def OR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                    DoubleRegs:$src2),
-              "$dst = or($src1, $src2)",
-              [(set (i64 DoubleRegs:$dst), (or (i64 DoubleRegs:$src1),
-                                               (i64 DoubleRegs:$src2)))]>;
-
-def XOR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                     DoubleRegs:$src2),
-               "$dst = xor($src1, $src2)",
-               [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1),
-                                                 (i64 DoubleRegs:$src2)))]>;
-
-// Maximum.
-def MAXw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-              "$dst = max($src2, $src1)",
-              [(set (i32 IntRegs:$dst),
-                    (i32 (select (i1 (setlt (i32 IntRegs:$src2),
-                                            (i32 IntRegs:$src1))),
-                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+//Rd=sub(Rt.L,Rs.[LH])
+def A2_subh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 1>;
+def A2_subh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 1>;
 
-def MAXUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-              "$dst = maxu($src2, $src1)",
-              [(set (i32 IntRegs:$dst),
-                    (i32 (select (i1 (setult (i32 IntRegs:$src2),
-                                             (i32 IntRegs:$src1))),
-                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
-
-def MAXd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                    DoubleRegs:$src2),
-              "$dst = max($src2, $src1)",
-              [(set (i64 DoubleRegs:$dst),
-                    (i64 (select (i1 (setlt (i64 DoubleRegs:$src2),
-                                            (i64 DoubleRegs:$src1))),
-                                 (i64 DoubleRegs:$src1),
-                                 (i64 DoubleRegs:$src2))))]>;
-
-def MAXUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                     DoubleRegs:$src2),
-              "$dst = maxu($src2, $src1)",
-              [(set (i64 DoubleRegs:$dst),
-                    (i64 (select (i1 (setult (i64 DoubleRegs:$src2),
-                                             (i64 DoubleRegs:$src1))),
-                                 (i64 DoubleRegs:$src1),
-                                 (i64 DoubleRegs:$src2))))]>;
-
-// Minimum.
-def MINw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-              "$dst = min($src2, $src1)",
-              [(set (i32 IntRegs:$dst),
-                    (i32 (select (i1 (setgt (i32 IntRegs:$src2),
-                                            (i32 IntRegs:$src1))),
-                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+//Rd=add(Rt.L,Rs.[LH])
+def A2_addh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 0>;
+def A2_addh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 0>;
 
-def MINUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-              "$dst = minu($src2, $src1)",
-              [(set (i32 IntRegs:$dst),
-                    (i32 (select (i1 (setugt (i32 IntRegs:$src2),
-                                             (i32 IntRegs:$src1))),
-                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
-
-def MINd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                    DoubleRegs:$src2),
-              "$dst = min($src2, $src1)",
-              [(set (i64 DoubleRegs:$dst),
-                    (i64 (select (i1 (setgt (i64 DoubleRegs:$src2),
-                                            (i64 DoubleRegs:$src1))),
-                                 (i64 DoubleRegs:$src1),
-                                 (i64 DoubleRegs:$src2))))]>;
-
-def MINUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                     DoubleRegs:$src2),
-              "$dst = minu($src2, $src1)",
-              [(set (i64 DoubleRegs:$dst),
-                    (i64 (select (i1 (setugt (i64 DoubleRegs:$src2),
-                                             (i64 DoubleRegs:$src1))),
-                                 (i64 DoubleRegs:$src1),
-                                 (i64 DoubleRegs:$src2))))]>;
-
-// Subtract.
-def SUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                     DoubleRegs:$src2),
-               "$dst = sub($src1, $src2)",
-               [(set (i64 DoubleRegs:$dst), (sub (i64 DoubleRegs:$src1),
-                                                 (i64 DoubleRegs:$src2)))]>;
+let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
+  //Rd=sub(Rt.L,Rs.[LH]):sat
+  def A2_subh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 1>;
+  def A2_subh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 1>;
+
+  //Rd=add(Rt.L,Rs.[LH]):sat
+  def A2_addh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 0>;
+  def A2_addh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 0>;
+}
+
+//Rd=sub(Rt.[LH],Rs.[LH]):<<16
+def A2_subh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 1>;
+def A2_subh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 1>;
+def A2_subh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 1>;
+def A2_subh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 1>;
+
+//Rd=add(Rt.[LH],Rs.[LH]):<<16
+def A2_addh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 0>;
+def A2_addh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 0>;
+def A2_addh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 0>;
+def A2_addh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 0>;
+
+let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
+  //Rd=sub(Rt.[LH],Rs.[LH]):sat:<<16
+  def A2_subh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 1>;
+  def A2_subh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 1>;
+  def A2_subh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 1>;
+  def A2_subh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 1>;
+
+  //Rd=add(Rt.[LH],Rs.[LH]):sat:<<16
+  def A2_addh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 0>;
+  def A2_addh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 0>;
+  def A2_addh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 0>;
+  def A2_addh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 0>;
+}
+
+// Add halfword.
+def: Pat<(sext_inreg (add I32:$src1, I32:$src2), i16),
+         (A2_addh_l16_ll I32:$src1, I32:$src2)>;
+
+def: Pat<(sra (add (shl I32:$src1, (i32 16)), I32:$src2), (i32 16)),
+         (A2_addh_l16_hl I32:$src1, I32:$src2)>;
+
+def: Pat<(shl (add I32:$src1, I32:$src2), (i32 16)),
+         (A2_addh_h16_ll I32:$src1, I32:$src2)>;
 
 // Subtract halfword.
+def: Pat<(sext_inreg (sub I32:$src1, I32:$src2), i16),
+         (A2_subh_l16_ll I32:$src1, I32:$src2)>;
+
+def: Pat<(shl (sub I32:$src1, I32:$src2), (i32 16)),
+         (A2_subh_h16_ll I32:$src1, I32:$src2)>;
+
+let hasSideEffects = 0, hasNewValue = 1 in
+def S2_parityp: ALU64Inst<(outs IntRegs:$Rd),
+      (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
+      "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b0000;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{4-0} = Rd;
+}
+
+let hasNewValue = 1, opNewValue = 0, hasSideEffects = 0 in
+class T_XTYPE_MIN_MAX < bit isMax, bit isUnsigned >
+  : ALU64Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
+  "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
+          #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rt;
+    bits<5> Rs;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b01011;
+    let Inst{22-21} = !if(isMax, 0b10, 0b01);
+    let Inst{7} = isUnsigned;
+    let Inst{4-0} = Rd;
+    let Inst{12-8} = !if(isMax, Rs, Rt);
+    let Inst{20-16} = !if(isMax, Rt, Rs);
+  }
+
+def A2_min  : T_XTYPE_MIN_MAX < 0, 0 >;
+def A2_minu : T_XTYPE_MIN_MAX < 0, 1 >;
+def A2_max  : T_XTYPE_MIN_MAX < 1, 0 >;
+def A2_maxu : T_XTYPE_MIN_MAX < 1, 1 >;
+
+// Here, depending on  the operand being selected, we'll either generate a
+// min or max instruction.
+// Ex:
+// (a>b)?a:b --> max(a,b) => Here check performed is '>' and the value selected
+// is the larger of two. So, the corresponding HexagonInst is passed in 'Inst'.
+// (a>b)?b:a --> min(a,b) => Here check performed is '>' but the smaller value
+// is selected and the corresponding HexagonInst is passed in 'SwapInst'.
+
+multiclass T_MinMax_pats <PatFrag Op, RegisterClass RC, ValueType VT,
+                          InstHexagon Inst, InstHexagon SwapInst> {
+  def: Pat<(select (i1 (Op (VT RC:$src1), (VT RC:$src2))),
+                   (VT RC:$src1), (VT RC:$src2)),
+           (Inst RC:$src1, RC:$src2)>;
+  def: Pat<(select (i1 (Op (VT RC:$src1), (VT RC:$src2))),
+                   (VT RC:$src2), (VT RC:$src1)),
+           (SwapInst RC:$src1, RC:$src2)>;
+}
+
+
+multiclass MinMax_pats <PatFrag Op, InstHexagon Inst, InstHexagon SwapInst> {
+  defm: T_MinMax_pats<Op, IntRegs, i32, Inst, SwapInst>;
+
+  def: Pat<(sext_inreg (i32 (select (i1 (Op (i32 PositiveHalfWord:$src1),
+                                            (i32 PositiveHalfWord:$src2))),
+                                    (i32 PositiveHalfWord:$src1),
+                                    (i32 PositiveHalfWord:$src2))), i16),
+           (Inst IntRegs:$src1, IntRegs:$src2)>;
+
+  def: Pat<(sext_inreg (i32 (select (i1 (Op (i32 PositiveHalfWord:$src1),
+                                            (i32 PositiveHalfWord:$src2))),
+                                    (i32 PositiveHalfWord:$src2),
+                                    (i32 PositiveHalfWord:$src1))), i16),
+           (SwapInst IntRegs:$src1, IntRegs:$src2)>;
+}
+
+let AddedComplexity = 200 in {
+  defm: MinMax_pats<setge,  A2_max,  A2_min>;
+  defm: MinMax_pats<setgt,  A2_max,  A2_min>;
+  defm: MinMax_pats<setle,  A2_min,  A2_max>;
+  defm: MinMax_pats<setlt,  A2_min,  A2_max>;
+  defm: MinMax_pats<setuge, A2_maxu, A2_minu>;
+  defm: MinMax_pats<setugt, A2_maxu, A2_minu>;
+  defm: MinMax_pats<setule, A2_minu, A2_maxu>;
+  defm: MinMax_pats<setult, A2_minu, A2_maxu>;
+}
+
+class T_cmp64_rr<string mnemonic, bits<3> MinOp, bit IsComm>
+  : ALU64_rr<(outs PredRegs:$Pd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
+             "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", ALU64_tc_2early_SLOT23> {
+  let isCompare = 1;
+  let isCommutable = IsComm;
+  let hasSideEffects = 0;
+
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0010100;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{7-5} = MinOp;
+  let Inst{1-0} = Pd;
+}
+
+def C2_cmpeqp  : T_cmp64_rr<"cmp.eq",  0b000, 1>;
+def C2_cmpgtp  : T_cmp64_rr<"cmp.gt",  0b010, 0>;
+def C2_cmpgtup : T_cmp64_rr<"cmp.gtu", 0b100, 0>;
+
+class T_cmp64_rr_pat<InstHexagon MI, PatFrag CmpOp>
+  : Pat<(i1 (CmpOp (i64 DoubleRegs:$Rs), (i64 DoubleRegs:$Rt))),
+        (i1 (MI DoubleRegs:$Rs, DoubleRegs:$Rt))>;
+
+def: T_cmp64_rr_pat<C2_cmpeqp,  seteq>;
+def: T_cmp64_rr_pat<C2_cmpgtp,  setgt>;
+def: T_cmp64_rr_pat<C2_cmpgtup, setugt>;
+def: T_cmp64_rr_pat<C2_cmpgtp,  RevCmp<setlt>>;
+def: T_cmp64_rr_pat<C2_cmpgtup, RevCmp<setult>>;
+
+def C2_vmux : ALU64_rr<(outs DoubleRegs:$Rd),
+      (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
+      "$Rd = vmux($Pu, $Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
+  let hasSideEffects = 0;
+
+  bits<5> Rd;
+  bits<2> Pu;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b0001;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{6-5} = Pu;
+  let Inst{4-0} = Rd;
+}
+
+class T_ALU64_rr<string mnemonic, string suffix, bits<4> RegType,
+                 bits<3> MajOp, bits<3> MinOp, bit OpsRev, bit IsComm,
+                 string Op2Pfx>
+  : ALU64_rr<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
+             "$Rd = " #mnemonic# "($Rs, " #Op2Pfx# "$Rt)" #suffix, [],
+             "", ALU64_tc_1_SLOT23> {
+  let hasSideEffects = 0;
+  let isCommutable = IsComm;
+
+  bits<5> Rs;
+  bits<5> Rt;
+  bits<5> Rd;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = RegType;
+  let Inst{23-21} = MajOp;
+  let Inst{20-16} = !if (OpsRev,Rt,Rs);
+  let Inst{12-8} = !if (OpsRev,Rs,Rt);
+  let Inst{7-5} = MinOp;
+  let Inst{4-0} = Rd;
+}
+
+class T_ALU64_arith<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit IsSat,
+                    bit OpsRev, bit IsComm>
+  : T_ALU64_rr<mnemonic, !if(IsSat,":sat",""), 0b0011, MajOp, MinOp, OpsRev,
+               IsComm, "">;
+
+def A2_addp : T_ALU64_arith<"add", 0b000, 0b111, 0, 0, 1>;
+def A2_subp : T_ALU64_arith<"sub", 0b001, 0b111, 0, 1, 0>;
+
+def: Pat<(i64 (add I64:$Rs, I64:$Rt)), (A2_addp I64:$Rs, I64:$Rt)>;
+def: Pat<(i64 (sub I64:$Rs, I64:$Rt)), (A2_subp I64:$Rs, I64:$Rt)>;
+
+class T_ALU64_logical<string mnemonic, bits<3> MinOp, bit OpsRev, bit IsComm,
+                      bit IsNeg>
+  : T_ALU64_rr<mnemonic, "", 0b0011, 0b111, MinOp, OpsRev, IsComm,
+               !if(IsNeg,"~","")>;
+
+def A2_andp : T_ALU64_logical<"and", 0b000, 0, 1, 0>;
+def A2_orp  : T_ALU64_logical<"or",  0b010, 0, 1, 0>;
+def A2_xorp : T_ALU64_logical<"xor", 0b100, 0, 1, 0>;
+
+def: Pat<(i64 (and I64:$Rs, I64:$Rt)), (A2_andp I64:$Rs, I64:$Rt)>;
+def: Pat<(i64 (or  I64:$Rs, I64:$Rt)), (A2_orp  I64:$Rs, I64:$Rt)>;
+def: Pat<(i64 (xor I64:$Rs, I64:$Rt)), (A2_xorp I64:$Rs, I64:$Rt)>;
 
 //===----------------------------------------------------------------------===//
 // ALU64/ALU -
@@ -762,82 +1302,119 @@ def SUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
 // Pipelined looping instructions.
 
 // Logical operations on predicates.
-def AND_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
-             "$dst = and($src1, $src2)",
-             [(set (i1 PredRegs:$dst), (and (i1 PredRegs:$src1),
-                                            (i1 PredRegs:$src2)))]>;
-
-let neverHasSideEffects = 1 in
-def AND_pnotp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1,
-                                                 PredRegs:$src2),
-                "$dst = and($src1, !$src2)",
-                []>;
-
-def ANY_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1),
-             "$dst = any8($src1)",
-             []>;
-
-def ALL_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1),
-             "$dst = all8($src1)",
-             []>;
-
-def VITPACK_pp : SInst<(outs IntRegs:$dst), (ins PredRegs:$src1,
-                                                 PredRegs:$src2),
-             "$dst = vitpack($src1, $src2)",
-             []>;
+let hasSideEffects = 0 in
+class T_LOGICAL_1OP<string MnOp, bits<2> OpBits>
+    : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps),
+             "$Pd = " # MnOp # "($Ps)", [], "", CR_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<2> Ps;
+
+  let IClass = 0b0110;
+  let Inst{27-23} = 0b10111;
+  let Inst{22-21} = OpBits;
+  let Inst{20} = 0b0;
+  let Inst{17-16} = Ps;
+  let Inst{13} = 0b0;
+  let Inst{1-0} = Pd;
+}
 
-def VALIGN_rrp : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                    DoubleRegs:$src2,
-                                                    PredRegs:$src3),
-             "$dst = valignb($src1, $src2, $src3)",
-             []>;
+def C2_any8 : T_LOGICAL_1OP<"any8", 0b00>;
+def C2_all8 : T_LOGICAL_1OP<"all8", 0b01>;
+def C2_not  : T_LOGICAL_1OP<"not",  0b10>;
+
+def: Pat<(i1 (not (i1 PredRegs:$Ps))),
+         (C2_not PredRegs:$Ps)>;
+
+let hasSideEffects = 0 in
+class T_LOGICAL_2OP<string MnOp, bits<3> OpBits, bit IsNeg, bit Rev>
+    : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps, PredRegs:$Pt),
+             "$Pd = " # MnOp # "($Ps, " # !if (IsNeg,"!","") # "$Pt)",
+             [], "", CR_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<2> Ps;
+  bits<2> Pt;
+
+  let IClass = 0b0110;
+  let Inst{27-24} = 0b1011;
+  let Inst{23-21} = OpBits;
+  let Inst{20} = 0b0;
+  let Inst{17-16} = !if(Rev,Pt,Ps);  // Rs and Rt are reversed for some
+  let Inst{13} = 0b0;                // instructions.
+  let Inst{9-8} = !if(Rev,Ps,Pt);
+  let Inst{1-0} = Pd;
+}
 
-def VSPLICE_rrp : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                     DoubleRegs:$src2,
-                                                     PredRegs:$src3),
-             "$dst = vspliceb($src1, $src2, $src3)",
-             []>;
+def C2_and  : T_LOGICAL_2OP<"and", 0b000, 0, 1>;
+def C2_or   : T_LOGICAL_2OP<"or",  0b001, 0, 1>;
+def C2_xor  : T_LOGICAL_2OP<"xor", 0b010, 0, 0>;
+def C2_andn : T_LOGICAL_2OP<"and", 0b011, 1, 1>;
+def C2_orn  : T_LOGICAL_2OP<"or",  0b111, 1, 1>;
 
-def MASK_p : SInst<(outs DoubleRegs:$dst), (ins PredRegs:$src1),
-             "$dst = mask($src1)",
-             []>;
+def: Pat<(i1 (and I1:$Ps, I1:$Pt)),       (C2_and  I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (or  I1:$Ps, I1:$Pt)),       (C2_or   I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (xor I1:$Ps, I1:$Pt)),       (C2_xor  I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (and I1:$Ps, (not I1:$Pt))), (C2_andn I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (or  I1:$Ps, (not I1:$Pt))), (C2_orn  I1:$Ps, I1:$Pt)>;
 
-def NOT_p : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1),
-             "$dst = not($src1)",
-             [(set (i1 PredRegs:$dst), (not (i1 PredRegs:$src1)))]>;
-
-def OR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
-            "$dst = or($src1, $src2)",
-            [(set (i1 PredRegs:$dst), (or (i1 PredRegs:$src1),
-                                          (i1 PredRegs:$src2)))]>;
+let hasSideEffects = 0, hasNewValue = 1 in
+def C2_vitpack : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps, PredRegs:$Pt),
+      "$Rd = vitpack($Ps, $Pt)", [], "", S_2op_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<2> Ps;
+  bits<2> Pt;
+
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b1001;
+  let Inst{22-21} = 0b00;
+  let Inst{17-16} = Ps;
+  let Inst{9-8} = Pt;
+  let Inst{4-0} = Rd;
+}
 
-def XOR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
-             "$dst = xor($src1, $src2)",
-             [(set (i1 PredRegs:$dst), (xor (i1 PredRegs:$src1),
-                                            (i1 PredRegs:$src2)))]>;
+let hasSideEffects = 0 in
+def C2_mask : SInst<(outs DoubleRegs:$Rd), (ins PredRegs:$Pt),
+      "$Rd = mask($Pt)", [], "", S_2op_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<2> Pt;
 
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b0110;
+  let Inst{9-8} = Pt;
+  let Inst{4-0} = Rd;
+}
 
 // User control register transfer.
 //===----------------------------------------------------------------------===//
 // CR -
 //===----------------------------------------------------------------------===//
 
+//===----------------------------------------------------------------------===//
+// JR +
+//===----------------------------------------------------------------------===//
+
 def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone,
-                               [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone,
-                      [SDNPHasChain]>;
+                     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone, [SDNPHasChain]>;
 
 def SDHexagonBR_JT: SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
 def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>;
 
-let InputType = "imm", isBarrier = 1, isPredicable = 1,
-Defs = [PC], isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
-opExtentBits = 24, isCodeGenOnly = 0 in
-class T_JMP <dag InsDag, list<dag> JumpList = []>
-            : JInst<(outs), InsDag,
-            "jump $dst" , JumpList> {
-    bits<24> dst;
+class CondStr<string CReg, bit True, bit New> {
+  string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
+}
+class JumpOpcStr<string Mnemonic, bit New, bit Taken> {
+  string S = Mnemonic # !if(Taken, ":t", !if(New, ":nt", ""));
+}
 
+let isBranch = 1, isBarrier = 1, Defs = [PC], hasSideEffects = 0,
+    isPredicable = 1,
+    isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
+    opExtentBits = 24, opExtentAlign = 2, InputType = "imm" in
+class T_JMP<string ExtStr>
+  : JInst<(outs), (ins brtarget:$dst),
+      "jump " # ExtStr # "$dst",
+      [], "", J_tc_2early_SLOT23> {
+    bits<24> dst;
     let IClass = 0b0101;
 
     let Inst{27-25} = 0b100;
@@ -845,16 +1422,16 @@ class T_JMP <dag InsDag, list<dag> JumpList = []>
     let Inst{13-1} = dst{14-2};
 }
 
-let InputType = "imm", isExtendable = 1, opExtendable = 1, isExtentSigned = 1,
-Defs = [PC], isPredicated = 1, opExtentBits = 17 in
-class T_JMP_c <bit PredNot, bit isPredNew, bit isTak>:
-            JInst<(outs ), (ins PredRegs:$src, brtarget:$dst),
-            !if(PredNot, "if (!$src", "if ($src")#
-            !if(isPredNew, ".new) ", ") ")#"jump"#
-            !if(isPredNew, !if(isTak, ":t ", ":nt "), " ")#"$dst"> {
-
+let isBranch = 1, Defs = [PC], hasSideEffects = 0, isPredicated = 1,
+    isExtendable = 1, opExtendable = 1, isExtentSigned = 1,
+    opExtentBits = 17, opExtentAlign = 2, InputType = "imm" in
+class T_JMP_c<bit PredNot, bit isPredNew, bit isTak, string ExtStr>
+  : JInst<(outs), (ins PredRegs:$src, brtarget:$dst),
+      CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
+        JumpOpcStr<"jump", isPredNew, isTak>.S # " " #
+        ExtStr # "$dst",
+      [], "", J_tc_2early_SLOT23>, ImmRegRel {
     let isTaken = isTak;
-    let isBrTaken = !if(isPredNew, !if(isTaken, "true", "false"), "");
     let isPredicatedFalse = PredNot;
     let isPredicatedNew = isPredNew;
     bits<2> src;
@@ -864,7 +1441,7 @@ class T_JMP_c <bit PredNot, bit isPredNew, bit isTak>:
 
     let Inst{27-24} = 0b1100;
     let Inst{21} = PredNot;
-    let Inst{12} = !if(isPredNew, isTak, zero);
+    let Inst{12} = isTak;
     let Inst{11} = isPredNew;
     let Inst{9-8} = src;
     let Inst{23-22} = dst{16-15};
@@ -873,11 +1450,28 @@ class T_JMP_c <bit PredNot, bit isPredNew, bit isTak>:
     let Inst{7-1} = dst{8-2};
   }
 
-let isBarrier = 1, Defs = [PC], isPredicable = 1, InputType = "reg" in
-class T_JMPr<dag InsDag = (ins IntRegs:$dst)>
-            : JRInst<(outs ), InsDag,
-            "jumpr $dst" ,
-            []> {
+multiclass JMP_Pred<bit PredNot, string ExtStr> {
+  def NAME       : T_JMP_c<PredNot, 0, 0, ExtStr>; // not taken
+  // Predicate new
+  def NAME#newpt : T_JMP_c<PredNot, 1, 1, ExtStr>; // taken
+  def NAME#new   : T_JMP_c<PredNot, 1, 0, ExtStr>; // not taken
+}
+
+multiclass JMP_base<string BaseOp, string ExtStr> {
+  let BaseOpcode = BaseOp in {
+    def NAME : T_JMP<ExtStr>;
+    defm t : JMP_Pred<0, ExtStr>;
+    defm f : JMP_Pred<1, ExtStr>;
+  }
+}
+
+// Jumps to address stored in a register, JUMPR_MISC
+// if ([[!]P[.new]]) jumpr[:t/nt] Rs
+let isBranch = 1, isIndirectBranch = 1, isBarrier = 1, Defs = [PC],
+    isPredicable = 1, hasSideEffects = 0, InputType = "reg" in
+class T_JMPr
+  : JRInst<(outs), (ins IntRegs:$dst),
+      "jumpr $dst", [], "", J_tc_2early_SLOT2> {
     bits<5> dst;
 
     let IClass = 0b0101;
@@ -885,15 +1479,15 @@ class T_JMPr<dag InsDag = (ins IntRegs:$dst)>
     let Inst{20-16} = dst;
 }
 
-let Defs = [PC], isPredicated = 1, InputType = "reg" in
-class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>:
-            JRInst <(outs ), (ins PredRegs:$src, IntRegs:$dst),
-            !if(PredNot, "if (!$src", "if ($src")#
-            !if(isPredNew, ".new) ", ") ")#"jumpr"#
-            !if(isPredNew, !if(isTak, ":t ", ":nt "), " ")#"$dst"> {
+let isBranch = 1, isIndirectBranch = 1, Defs = [PC], isPredicated = 1,
+    hasSideEffects = 0, InputType = "reg" in
+class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>
+  : JRInst <(outs), (ins PredRegs:$src, IntRegs:$dst),
+      CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
+        JumpOpcStr<"jumpr", isPredNew, isTak>.S # " $dst", [],
+      "", J_tc_2early_SLOT2> {
 
     let isTaken = isTak;
-    let isBrTaken = !if(isPredNew, !if(isTaken, "true", "false"), "");
     let isPredicatedFalse = PredNot;
     let isPredicatedNew = isPredNew;
     bits<2> src;
@@ -904,73 +1498,88 @@ class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>:
     let Inst{27-22} = 0b001101;
     let Inst{21} = PredNot;
     let Inst{20-16} = dst;
-    let Inst{12} = !if(isPredNew, isTak, zero);
+    let Inst{12} = isTak;
     let Inst{11} = isPredNew;
     let Inst{9-8} = src;
-    let Predicates = !if(isPredNew, [HasV3T], [HasV2T]);
-    let validSubTargets = !if(isPredNew, HasV3SubT, HasV2SubT);
-}
-
-multiclass JMP_Pred<bit PredNot> {
-  def _#NAME : T_JMP_c<PredNot, 0, 0>;
-  // Predicate new
-  def _#NAME#new_t  : T_JMP_c<PredNot, 1, 1>; // taken
-  def _#NAME#new_nt : T_JMP_c<PredNot, 1, 0>; // not taken
-}
-
-multiclass JMP_base<string BaseOp> {
-  let BaseOpcode = BaseOp in {
-    def NAME : T_JMP<(ins brtarget:$dst), [(br bb:$dst)]>;
-    defm t : JMP_Pred<0>;
-    defm f : JMP_Pred<1>;
-  }
 }
 
 multiclass JMPR_Pred<bit PredNot> {
-  def NAME: T_JMPr_c<PredNot, 0, 0>;
+  def NAME        : T_JMPr_c<PredNot, 0, 0>; // not taken
   // Predicate new
-  def NAME#new_tV3  : T_JMPr_c<PredNot, 1, 1>; // taken
-  def NAME#new_ntV3 : T_JMPr_c<PredNot, 1, 0>; // not taken
+  def NAME#newpt  : T_JMPr_c<PredNot, 1, 1>; // taken
+  def NAME#new    : T_JMPr_c<PredNot, 1, 0>; // not taken
 }
 
 multiclass JMPR_base<string BaseOp> {
   let BaseOpcode = BaseOp in {
     def NAME : T_JMPr;
-    defm _t : JMPR_Pred<0>;
-    defm _f : JMPR_Pred<1>;
+    defm t : JMPR_Pred<0>;
+    defm f : JMPR_Pred<1>;
   }
 }
 
-let isTerminator = 1, neverHasSideEffects = 1 in {
-let isBranch = 1 in
-defm JMP : JMP_base<"JMP">, PredNewRel;
+let isCall = 1, hasSideEffects = 1 in
+class JUMPR_MISC_CALLR<bit isPred, bit isPredNot,
+               dag InputDag = (ins IntRegs:$Rs)>
+  : JRInst<(outs), InputDag,
+      !if(isPred, !if(isPredNot, "if (!$Pu) callr $Rs",
+                                 "if ($Pu) callr $Rs"),
+                                 "callr $Rs"),
+      [], "", J_tc_2early_SLOT2> {
+    bits<5> Rs;
+    bits<2> Pu;
+    let isPredicated = isPred;
+    let isPredicatedFalse = isPredNot;
 
-let isBranch = 1, isIndirectBranch = 1 in
-defm JMPR : JMPR_base<"JMPr">, PredNewRel;
+    let IClass = 0b0101;
+    let Inst{27-25} = 0b000;
+    let Inst{24-23} = !if (isPred, 0b10, 0b01);
+    let Inst{22} = 0;
+    let Inst{21} = isPredNot;
+    let Inst{9-8} = !if (isPred, Pu, 0b00);
+    let Inst{20-16} = Rs;
 
-let isReturn = 1, isCodeGenOnly = 1 in
-defm JMPret : JMPR_base<"JMPret">, PredNewRel;
+  }
+
+let Defs = VolatileV3.Regs in {
+  def J2_callrt : JUMPR_MISC_CALLR<1, 0, (ins PredRegs:$Pu, IntRegs:$Rs)>;
+  def J2_callrf : JUMPR_MISC_CALLR<1, 1, (ins PredRegs:$Pu, IntRegs:$Rs)>;
 }
 
-def : Pat<(retflag),
-          (JMPret (i32 R31))>;
+let isTerminator = 1, hasSideEffects = 0 in {
+  defm J2_jump : JMP_base<"JMP", "">, PredNewRel;
 
-def : Pat <(brcond (i1 PredRegs:$src1), bb:$offset),
-      (JMP_t (i1 PredRegs:$src1), bb:$offset)>;
+  // Deal with explicit assembly
+  //  - never extened a jump #,  always extend a jump ##
+  let isAsmParserOnly = 1 in {
+    defm J2_jump_ext   : JMP_base<"JMP", "##">;
+    defm J2_jump_noext : JMP_base<"JMP", "#">;
+  }
 
-// A return through builtin_eh_return.
-let isReturn = 1, isTerminator = 1, isBarrier = 1, neverHasSideEffects = 1,
-isCodeGenOnly = 1, Defs = [PC], Uses = [R28], isPredicable = 0 in
-def EH_RETURN_JMPR : T_JMPr;
+  defm J2_jumpr : JMPR_base<"JMPr">, PredNewRel;
 
-def : Pat<(eh_return),
-          (EH_RETURN_JMPR (i32 R31))>;
+  let isReturn = 1, isCodeGenOnly = 1 in
+  defm JMPret : JMPR_base<"JMPret">, PredNewRel;
+}
 
-def : Pat<(HexagonBR_JT (i32 IntRegs:$dst)),
-          (JMPR (i32 IntRegs:$dst))>;
+def: Pat<(br bb:$dst),
+         (J2_jump brtarget:$dst)>;
+def: Pat<(retflag),
+         (JMPret (i32 R31))>;
+def: Pat<(brcond (i1 PredRegs:$src1), bb:$offset),
+         (J2_jumpt PredRegs:$src1, bb:$offset)>;
 
-def : Pat<(brind (i32 IntRegs:$dst)),
-          (JMPR (i32 IntRegs:$dst))>;
+// A return through builtin_eh_return.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasSideEffects = 0,
+    isCodeGenOnly = 1, Defs = [PC], Uses = [R28], isPredicable = 0 in
+def EH_RETURN_JMPR : T_JMPr;
+
+def: Pat<(eh_return),
+         (EH_RETURN_JMPR (i32 R31))>;
+def: Pat<(HexagonBR_JT (i32 IntRegs:$dst)),
+         (J2_jumpr IntRegs:$dst)>;
+def: Pat<(brind (i32 IntRegs:$dst)),
+         (J2_jumpr IntRegs:$dst)>;
 
 //===----------------------------------------------------------------------===//
 // JR -
@@ -979,265 +1588,688 @@ def : Pat<(brind (i32 IntRegs:$dst)),
 //===----------------------------------------------------------------------===//
 // LD +
 //===----------------------------------------------------------------------===//
-///
-// Load -- MEMri operand
-multiclass LD_MEMri_Pbase<string mnemonic, RegisterClass RC,
-                          bit isNot, bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : LDInst2<(outs RC:$dst),
-                       (ins PredRegs:$src1, MEMri:$addr),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#"$dst = "#mnemonic#"($addr)",
-            []>;
-}
-
-multiclass LD_MEMri_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : LD_MEMri_Pbase<mnemonic, RC, PredNot, 0>;
-    // Predicate new
-    defm _cdn#NAME : LD_MEMri_Pbase<mnemonic, RC, PredNot, 1>;
+
+// Load - Base with Immediate offset addressing mode
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, AddedComplexity = 20 in
+class T_load_io <string mnemonic, RegisterClass RC, bits<4> MajOp,
+                 Operand ImmOp>
+  : LDInst<(outs RC:$dst), (ins IntRegs:$src1, ImmOp:$offset),
+  "$dst = "#mnemonic#"($src1 + #$offset)", []>, AddrModeRel {
+    bits<4> name;
+    bits<5> dst;
+    bits<5> src1;
+    bits<14> offset;
+    bits<11> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), offset{13-3},
+                     !if (!eq(ImmOpStr, "s11_2Ext"), offset{12-2},
+                     !if (!eq(ImmOpStr, "s11_1Ext"), offset{11-1},
+                                      /* s11_0Ext */ offset{10-0})));
+    let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
+                       !if (!eq(ImmOpStr, "s11_2Ext"), 13,
+                       !if (!eq(ImmOpStr, "s11_1Ext"), 12,
+                                        /* s11_0Ext */ 11)));
+    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+
+    let IClass = 0b1001;
+
+    let Inst{27}    = 0b0;
+    let Inst{26-25} = offsetBits{10-9};
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13-5}  = offsetBits{8-0};
+    let Inst{4-0}   = dst;
   }
-}
 
-let isExtendable = 1, neverHasSideEffects = 1 in
-multiclass LD_MEMri<string mnemonic, string CextOp, RegisterClass RC,
-                    bits<5> ImmBits, bits<5> PredImmBits> {
+let opExtendable = 3, isExtentSigned = 0, isPredicated = 1 in
+class T_pload_io <string mnemonic, RegisterClass RC, bits<4>MajOp,
+                  Operand ImmOp, bit isNot, bit isPredNew>
+  : LDInst<(outs RC:$dst),
+           (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+  "if ("#!if(isNot, "!$src1", "$src1")
+       #!if(isPredNew, ".new", "")
+       #") $dst = "#mnemonic#"($src2 + #$offset)",
+  [],"", V2LDST_tc_ld_SLOT01> , AddrModeRel {
+    bits<5> dst;
+    bits<2> src1;
+    bits<5> src2;
+    bits<9> offset;
+    bits<6> offsetBits;
+    string ImmOpStr = !cast<string>(ImmOp);
+
+    let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), offset{8-3},
+                     !if (!eq(ImmOpStr, "u6_2Ext"), offset{7-2},
+                     !if (!eq(ImmOpStr, "u6_1Ext"), offset{6-1},
+                                      /* u6_0Ext */ offset{5-0})));
+    let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
+                       !if (!eq(ImmOpStr, "u6_2Ext"), 8,
+                       !if (!eq(ImmOpStr, "u6_1Ext"), 7,
+                                        /* u6_0Ext */ 6)));
+    let hasNewValue = !if (!eq(ImmOpStr, "u6_3Ext"), 0, 1);
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isNot;
+
+    let IClass = 0b0100;
+
+    let Inst{27}    = 0b0;
+    let Inst{27}    = 0b0;
+    let Inst{26}    = isNot;
+    let Inst{25}    = isPredNew;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13}    = 0b0;
+    let Inst{12-11} = src1;
+    let Inst{10-5}  = offsetBits;
+    let Inst{4-0}   = dst;
+  }
 
-  let CextOpcode = CextOp, BaseOpcode = CextOp in {
-    let opExtendable = 2, isExtentSigned = 1, opExtentBits = ImmBits,
-        isPredicable = 1 in
-      def NAME : LDInst2<(outs RC:$dst), (ins MEMri:$addr),
-                   "$dst = "#mnemonic#"($addr)",
-                   []>;
-
-    let opExtendable = 3, isExtentSigned = 0, opExtentBits = PredImmBits,
-        isPredicated = 1 in {
-      defm Pt : LD_MEMri_Pred<mnemonic, RC, 0 >;
-      defm NotPt : LD_MEMri_Pred<mnemonic, RC, 1 >;
-    }
+let isExtendable = 1, hasSideEffects = 0, addrMode = BaseImmOffset in
+multiclass LD_Idxd<string mnemonic, string CextOp, RegisterClass RC,
+                   Operand ImmOp, Operand predImmOp, bits<4>MajOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+    let isPredicable = 1 in
+    def L2_#NAME#_io : T_load_io <mnemonic, RC, MajOp, ImmOp>;
+
+    // Predicated
+    def L2_p#NAME#t_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 0>;
+    def L2_p#NAME#f_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 0>;
+
+    // Predicated new
+    def L2_p#NAME#tnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 1>;
+    def L2_p#NAME#fnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 1>;
   }
 }
 
-let addrMode = BaseImmOffset, isMEMri = "true" in {
-  let accessSize = ByteAccess in {
-    defm LDrib: LD_MEMri < "memb", "LDrib", IntRegs, 11, 6>, AddrModeRel;
-    defm LDriub: LD_MEMri < "memub" , "LDriub", IntRegs, 11, 6>, AddrModeRel;
- }
+let accessSize = ByteAccess in {
+  defm loadrb:  LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext, 0b1000>;
+  defm loadrub: LD_Idxd <"memub", "LDriub", IntRegs, s11_0Ext, u6_0Ext, 0b1001>;
+}
 
-  let accessSize = HalfWordAccess in {
-    defm LDrih: LD_MEMri < "memh", "LDrih", IntRegs, 12, 7>, AddrModeRel;
-    defm LDriuh: LD_MEMri < "memuh", "LDriuh", IntRegs, 12, 7>, AddrModeRel;
- }
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+  defm loadrh:  LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext, 0b1010>;
+  defm loadruh: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext, 0b1011>;
+}
 
-  let accessSize = WordAccess in
-    defm LDriw: LD_MEMri < "memw", "LDriw", IntRegs, 13, 8>, AddrModeRel;
+let accessSize = WordAccess, opExtentAlign = 2 in
+defm loadri: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext, 0b1100>;
 
-  let accessSize = DoubleWordAccess in
-    defm LDrid: LD_MEMri < "memd", "LDrid", DoubleRegs, 14, 9>, AddrModeRel;
+let accessSize = DoubleWordAccess, opExtentAlign = 3 in
+defm loadrd: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext, 0b1110>;
+
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+  def L2_loadbsw2_io:   T_load_io<"membh",  IntRegs, 0b0001, s11_1Ext>;
+  def L2_loadbzw2_io:   T_load_io<"memubh", IntRegs, 0b0011, s11_1Ext>;
 }
 
-def : Pat < (i32 (sextloadi8 ADDRriS11_0:$addr)),
-            (LDrib ADDRriS11_0:$addr) >;
+let accessSize = WordAccess, opExtentAlign = 2 in {
+  def L2_loadbzw4_io: T_load_io<"memubh", DoubleRegs, 0b0101, s11_2Ext>;
+  def L2_loadbsw4_io: T_load_io<"membh",  DoubleRegs, 0b0111, s11_2Ext>;
+}
 
-def : Pat < (i32 (zextloadi8 ADDRriS11_0:$addr)),
-            (LDriub ADDRriS11_0:$addr) >;
+let addrMode = BaseImmOffset, isExtendable = 1, hasSideEffects = 0,
+    opExtendable = 3, isExtentSigned = 1  in
+class T_loadalign_io <string str, bits<4> MajOp, Operand ImmOp>
+  : LDInst<(outs DoubleRegs:$dst),
+           (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+  "$dst = "#str#"($src2 + #$offset)", [],
+  "$src1 = $dst">, AddrModeRel {
+    bits<4> name;
+    bits<5> dst;
+    bits<5> src2;
+    bits<12> offset;
+    bits<11> offsetBits;
+
+    let offsetBits = !if (!eq(!cast<string>(ImmOp), "s11_1Ext"), offset{11-1},
+                                                  /* s11_0Ext */ offset{10-0});
+    let IClass = 0b1001;
+
+    let Inst{27}    = 0b0;
+    let Inst{26-25} = offsetBits{10-9};
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13-5}  = offsetBits{8-0};
+    let Inst{4-0}   = dst;
+  }
 
-def : Pat < (i32 (sextloadi16 ADDRriS11_1:$addr)),
-            (LDrih ADDRriS11_1:$addr) >;
+let accessSize = HalfWordAccess, opExtentBits = 12, opExtentAlign = 1 in
+def L2_loadalignh_io: T_loadalign_io <"memh_fifo", 0b0010, s11_1Ext>;
 
-def : Pat < (i32 (zextloadi16 ADDRriS11_1:$addr)),
-            (LDriuh ADDRriS11_1:$addr) >;
+let accessSize = ByteAccess, opExtentBits = 11 in
+def L2_loadalignb_io: T_loadalign_io <"memb_fifo", 0b0100, s11_0Ext>;
 
-def : Pat < (i32 (load ADDRriS11_2:$addr)),
-            (LDriw ADDRriS11_2:$addr) >;
+// Patterns to select load-indexed (i.e. load from base+offset).
+multiclass Loadx_pat<PatFrag Load, ValueType VT, PatLeaf ImmPred,
+                     InstHexagon MI> {
+  def: Pat<(VT (Load AddrFI:$fi)), (VT (MI AddrFI:$fi, 0))>;
+  def: Pat<(VT (Load (add (i32 IntRegs:$Rs), ImmPred:$Off))),
+           (VT (MI IntRegs:$Rs, imm:$Off))>;
+  def: Pat<(VT (Load (i32 IntRegs:$Rs))), (VT (MI IntRegs:$Rs, 0))>;
+}
 
-def : Pat < (i64 (load ADDRriS11_3:$addr)),
-            (LDrid ADDRriS11_3:$addr) >;
+let AddedComplexity = 20 in {
+  defm: Loadx_pat<load,           i32, s11_2ExtPred, L2_loadri_io>;
+  defm: Loadx_pat<load,           i64, s11_3ExtPred, L2_loadrd_io>;
+  defm: Loadx_pat<atomic_load_8 , i32, s11_0ExtPred, L2_loadrub_io>;
+  defm: Loadx_pat<atomic_load_16, i32, s11_1ExtPred, L2_loadruh_io>;
+  defm: Loadx_pat<atomic_load_32, i32, s11_2ExtPred, L2_loadri_io>;
+  defm: Loadx_pat<atomic_load_64, i64, s11_3ExtPred, L2_loadrd_io>;
+
+  defm: Loadx_pat<extloadi1,      i32, s11_0ExtPred, L2_loadrub_io>;
+  defm: Loadx_pat<extloadi8,      i32, s11_0ExtPred, L2_loadrub_io>;
+  defm: Loadx_pat<extloadi16,     i32, s11_1ExtPred, L2_loadruh_io>;
+  defm: Loadx_pat<sextloadi8,     i32, s11_0ExtPred, L2_loadrb_io>;
+  defm: Loadx_pat<sextloadi16,    i32, s11_1ExtPred, L2_loadrh_io>;
+  defm: Loadx_pat<zextloadi1,     i32, s11_0ExtPred, L2_loadrub_io>;
+  defm: Loadx_pat<zextloadi8,     i32, s11_0ExtPred, L2_loadrub_io>;
+  defm: Loadx_pat<zextloadi16,    i32, s11_1ExtPred, L2_loadruh_io>;
+  // No sextloadi1.
+}
 
+// Sign-extending loads of i1 need to replicate the lowest bit throughout
+// the 32-bit value. Since the loaded value can only be 0 or 1, 0-v should
+// do the trick.
+let AddedComplexity = 20 in
+def: Pat<(i32 (sextloadi1 (i32 IntRegs:$Rs))),
+         (A2_subri 0, (L2_loadrub_io IntRegs:$Rs, 0))>;
 
-// Load - Base with Immediate offset addressing mode
-multiclass LD_Idxd_Pbase<string mnemonic, RegisterClass RC, Operand predImmOp,
-                        bit isNot, bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : LDInst2<(outs RC:$dst),
-                     (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#"$dst = "#mnemonic#"($src2+#$src3)",
-            []>;
-}
-
-multiclass LD_Idxd_Pred<string mnemonic, RegisterClass RC, Operand predImmOp,
-                        bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : LD_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 0>;
-    // Predicate new
-    defm _cdn#NAME : LD_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 1>;
+//===----------------------------------------------------------------------===//
+// Post increment load
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Template class for non-predicated post increment loads with immediate offset.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, addrMode = PostInc in
+class T_load_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+                     bits<4> MajOp >
+  : LDInstPI <(outs RC:$dst, IntRegs:$dst2),
+  (ins IntRegs:$src1, ImmOp:$offset),
+  "$dst = "#mnemonic#"($src1++#$offset)" ,
+  [],
+  "$src1 = $dst2" > ,
+  PredNewRel {
+    bits<5> dst;
+    bits<5> src1;
+    bits<7> offset;
+    bits<4> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0})));
+    let hasNewValue = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
+
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13-12} = 0b00;
+    let Inst{8-5} = offsetBits;
+    let Inst{4-0}   = dst;
   }
-}
 
-let isExtendable = 1, neverHasSideEffects = 1 in
-multiclass LD_Idxd<string mnemonic, string CextOp, RegisterClass RC,
-                   Operand ImmOp, Operand predImmOp, bits<5> ImmBits,
-                   bits<5> PredImmBits> {
+//===----------------------------------------------------------------------===//
+// Template class for predicated post increment loads with immediate offset.
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
+class T_pload_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+                          bits<4> MajOp, bit isPredNot, bit isPredNew >
+  : LDInst <(outs RC:$dst, IntRegs:$dst2),
+            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#"$dst = "#mnemonic#"($src2++#$offset)",
+  [] ,
+  "$src2 = $dst2" > ,
+  PredNewRel {
+    bits<5> dst;
+    bits<2> src1;
+    bits<5> src2;
+    bits<7> offset;
+    bits<4> offsetBits;
 
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
-    let opExtendable = 2, isExtentSigned = 1, opExtentBits = ImmBits,
-        isPredicable = 1, AddedComplexity = 20 in
-      def NAME : LDInst2<(outs RC:$dst), (ins IntRegs:$src1, ImmOp:$offset),
-                   "$dst = "#mnemonic#"($src1+#$offset)",
-                   []>;
-
-    let opExtendable = 3, isExtentSigned = 0, opExtentBits = PredImmBits,
-        isPredicated = 1 in {
-      defm Pt : LD_Idxd_Pred<mnemonic, RC, predImmOp, 0 >;
-      defm NotPt : LD_Idxd_Pred<mnemonic, RC, predImmOp, 1 >;
-    }
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isPredNot;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0})));
+    let hasNewValue = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
+
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13} = 0b1;
+    let Inst{12} = isPredNew;
+    let Inst{11} = isPredNot;
+    let Inst{10-9} = src1;
+    let Inst{8-5}  = offsetBits;
+    let Inst{4-0}  = dst;
   }
-}
 
-let addrMode = BaseImmOffset in {
-  let accessSize = ByteAccess in {
-    defm LDrib_indexed: LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext,
-                                  11, 6>, AddrModeRel;
-    defm LDriub_indexed: LD_Idxd <"memub" , "LDriub", IntRegs, s11_0Ext, u6_0Ext,
-                                   11, 6>, AddrModeRel;
-  }
-  let accessSize = HalfWordAccess in {
-    defm LDrih_indexed: LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext,
-                                 12, 7>, AddrModeRel;
-    defm LDriuh_indexed: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext,
-                                  12, 7>, AddrModeRel;
+//===----------------------------------------------------------------------===//
+// Multiclass for post increment loads with immediate offset.
+//===----------------------------------------------------------------------===//
+
+multiclass LD_PostInc <string mnemonic, string BaseOp, RegisterClass RC,
+                       Operand ImmOp, bits<4> MajOp> {
+  let BaseOpcode = "POST_"#BaseOp in {
+    let isPredicable = 1 in
+    def L2_#NAME#_pi : T_load_pi < mnemonic, RC, ImmOp, MajOp>;
+
+    // Predicated
+    def L2_p#NAME#t_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 0>;
+    def L2_p#NAME#f_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 0>;
+
+    // Predicated new
+    def L2_p#NAME#tnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 1>;
+    def L2_p#NAME#fnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 1>;
   }
-  let accessSize = WordAccess in
-    defm LDriw_indexed: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext,
-                                 13, 8>, AddrModeRel;
+}
 
-  let accessSize = DoubleWordAccess in
-    defm LDrid_indexed: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext,
-                                 14, 9>, AddrModeRel;
+// post increment byte loads with immediate offset
+let accessSize = ByteAccess in {
+  defm loadrb  : LD_PostInc <"memb",  "LDrib", IntRegs, s4_0Imm, 0b1000>;
+  defm loadrub : LD_PostInc <"memub", "LDriub", IntRegs, s4_0Imm, 0b1001>;
 }
 
-let AddedComplexity = 20 in {
-def : Pat < (i32 (sextloadi8 (add IntRegs:$src1, s11_0ExtPred:$offset))),
-            (LDrib_indexed IntRegs:$src1, s11_0ExtPred:$offset) >;
+// post increment halfword loads with immediate offset
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+  defm loadrh  : LD_PostInc <"memh",  "LDrih", IntRegs, s4_1Imm, 0b1010>;
+  defm loadruh : LD_PostInc <"memuh", "LDriuh", IntRegs, s4_1Imm, 0b1011>;
+}
 
-def : Pat < (i32 (zextloadi8 (add IntRegs:$src1, s11_0ExtPred:$offset))),
-            (LDriub_indexed IntRegs:$src1, s11_0ExtPred:$offset) >;
+// post increment word loads with immediate offset
+let accessSize = WordAccess, opExtentAlign = 2 in
+defm loadri : LD_PostInc <"memw", "LDriw", IntRegs, s4_2Imm, 0b1100>;
 
-def : Pat < (i32 (sextloadi16 (add IntRegs:$src1, s11_1ExtPred:$offset))),
-            (LDrih_indexed IntRegs:$src1, s11_1ExtPred:$offset) >;
+// post increment doubleword loads with immediate offset
+let accessSize = DoubleWordAccess, opExtentAlign = 3 in
+defm loadrd : LD_PostInc <"memd", "LDrid", DoubleRegs, s4_3Imm, 0b1110>;
+
+// Rd=memb[u]h(Rx++#s4:1)
+// Rdd=memb[u]h(Rx++#s4:2)
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+  def L2_loadbsw2_pi   : T_load_pi <"membh", IntRegs, s4_1Imm, 0b0001>;
+  def L2_loadbzw2_pi   : T_load_pi <"memubh", IntRegs, s4_1Imm, 0b0011>;
+}
+let accessSize = WordAccess, opExtentAlign = 2, hasNewValue = 0 in {
+  def L2_loadbsw4_pi   : T_load_pi <"membh", DoubleRegs, s4_2Imm, 0b0111>;
+  def L2_loadbzw4_pi   : T_load_pi <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
+}
 
-def : Pat < (i32 (zextloadi16 (add IntRegs:$src1, s11_1ExtPred:$offset))),
-            (LDriuh_indexed IntRegs:$src1, s11_1ExtPred:$offset) >;
+//===----------------------------------------------------------------------===//
+// Template class for post increment fifo loads with immediate offset.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, addrMode = PostInc in
+class T_loadalign_pi <string mnemonic, Operand ImmOp, bits<4> MajOp >
+  : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$dst2),
+  (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+  "$dst = "#mnemonic#"($src2++#$offset)" ,
+  [], "$src2 = $dst2, $src1 = $dst" > ,
+  PredNewRel {
+    bits<5> dst;
+    bits<5> src2;
+    bits<5> offset;
+    bits<4> offsetBits;
+
+    let offsetBits = !if (!eq(!cast<string>(ImmOp), "s4_1Imm"), offset{4-1},
+                                                  /* s4_0Imm */ offset{3-0});
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13-12} = 0b00;
+    let Inst{8-5} = offsetBits;
+    let Inst{4-0}   = dst;
+  }
 
-def : Pat < (i32 (load (add IntRegs:$src1, s11_2ExtPred:$offset))),
-            (LDriw_indexed IntRegs:$src1, s11_2ExtPred:$offset) >;
+// Ryy=memh_fifo(Rx++#s4:1)
+// Ryy=memb_fifo(Rx++#s4:0)
+let accessSize = ByteAccess in
+def L2_loadalignb_pi : T_loadalign_pi <"memb_fifo", s4_0Imm, 0b0100>;
 
-def : Pat < (i64 (load (add IntRegs:$src1, s11_3ExtPred:$offset))),
-            (LDrid_indexed IntRegs:$src1, s11_3ExtPred:$offset) >;
-}
+let accessSize = HalfWordAccess, opExtentAlign = 1 in
+def L2_loadalignh_pi : T_loadalign_pi <"memh_fifo", s4_1Imm, 0b0010>;
 
 //===----------------------------------------------------------------------===//
-// Post increment load
+// Template class for post increment loads with register offset.
 //===----------------------------------------------------------------------===//
+let hasSideEffects = 0, addrMode = PostInc in
+class T_load_pr <string mnemonic, RegisterClass RC, bits<4> MajOp,
+                       MemAccessSize AccessSz>
+  : LDInstPI <(outs RC:$dst, IntRegs:$_dst_),
+              (ins IntRegs:$src1, ModRegs:$src2),
+  "$dst = "#mnemonic#"($src1++$src2)" ,
+  [], "$src1 = $_dst_" > {
+    bits<5> dst;
+    bits<5> src1;
+    bits<1> src2;
+
+    let accessSize = AccessSz;
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b110;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13}    = src2;
+    let Inst{12}    = 0b0;
+    let Inst{7}     = 0b0;
+    let Inst{4-0}   = dst;
+  }
+
+let hasNewValue = 1 in {
+  def L2_loadrb_pr  : T_load_pr <"memb",  IntRegs, 0b1000, ByteAccess>;
+  def L2_loadrub_pr : T_load_pr <"memub", IntRegs, 0b1001, ByteAccess>;
+  def L2_loadrh_pr  : T_load_pr <"memh",  IntRegs, 0b1010, HalfWordAccess>;
+  def L2_loadruh_pr : T_load_pr <"memuh", IntRegs, 0b1011, HalfWordAccess>;
+  def L2_loadri_pr  : T_load_pr <"memw",  IntRegs, 0b1100, WordAccess>;
 
-multiclass LD_PostInc_Pbase<string mnemonic, RegisterClass RC, Operand ImmOp,
-                            bit isNot, bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : LDInst2PI<(outs RC:$dst, IntRegs:$dst2),
-                       (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#"$dst = "#mnemonic#"($src2++#$offset)",
-            [],
-            "$src2 = $dst2">;
+  def L2_loadbzw2_pr : T_load_pr <"memubh", IntRegs, 0b0011, HalfWordAccess>;
 }
 
-multiclass LD_PostInc_Pred<string mnemonic, RegisterClass RC,
-                           Operand ImmOp, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : LD_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 0>;
-    // Predicate new
-    let Predicates = [HasV4T], validSubTargets = HasV4SubT in
-    defm _cdn#NAME#_V4 : LD_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 1>;
-  }
+def L2_loadrd_pr   : T_load_pr <"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
+def L2_loadbzw4_pr : T_load_pr <"memubh", DoubleRegs, 0b0101, WordAccess>;
+
+// Load predicate.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_pred : LDInst<(outs PredRegs:$dst),
+                        (ins IntRegs:$addr, s11_2Ext:$off),
+                        ".error \"should not emit\"", []>;
+
+let Defs = [R29, R30, R31], Uses = [R30], hasSideEffects = 0 in
+  def L2_deallocframe : LDInst<(outs), (ins),
+                     "deallocframe",
+                     []> {
+    let IClass = 0b1001;
+
+    let Inst{27-16} = 0b000000011110;
+    let Inst{13} = 0b0;
+    let Inst{4-0} = 0b11110;
 }
 
-multiclass LD_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
-                      Operand ImmOp> {
+// Load / Post increment circular addressing mode.
+let Uses = [CS], hasSideEffects = 0 in
+class T_load_pcr<string mnemonic, RegisterClass RC, bits<4> MajOp>
+  : LDInst <(outs RC:$dst, IntRegs:$_dst_),
+            (ins IntRegs:$Rz, ModRegs:$Mu),
+  "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
+  "$Rz = $_dst_" > {
+    bits<5> dst;
+    bits<5> Rz;
+    bit Mu;
+
+    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+    let IClass = 0b1001;
 
-  let BaseOpcode = "POST_"#BaseOp in {
-    let isPredicable = 1 in
-    def NAME : LDInst2PI<(outs RC:$dst, IntRegs:$dst2),
-                         (ins IntRegs:$src1, ImmOp:$offset),
-                 "$dst = "#mnemonic#"($src1++#$offset)",
-                 [],
-                 "$src1 = $dst2">;
-
-    let isPredicated = 1 in {
-      defm Pt : LD_PostInc_Pred<mnemonic, RC, ImmOp, 0 >;
-      defm NotPt : LD_PostInc_Pred<mnemonic, RC, ImmOp, 1 >;
-    }
+    let Inst{27-25} = 0b100;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{12} = 0b0;
+    let Inst{9} = 0b1;
+    let Inst{7} = 0b0;
+    let Inst{4-0} = dst;
+ }
+
+let accessSize = ByteAccess in {
+  def L2_loadrb_pcr  : T_load_pcr <"memb",  IntRegs, 0b1000>;
+  def L2_loadrub_pcr : T_load_pcr <"memub", IntRegs, 0b1001>;
+}
+
+let accessSize = HalfWordAccess in {
+  def L2_loadrh_pcr   : T_load_pcr <"memh",   IntRegs, 0b1010>;
+  def L2_loadruh_pcr  : T_load_pcr <"memuh",  IntRegs, 0b1011>;
+  def L2_loadbsw2_pcr : T_load_pcr <"membh",  IntRegs, 0b0001>;
+  def L2_loadbzw2_pcr : T_load_pcr <"memubh", IntRegs, 0b0011>;
+}
+
+let accessSize = WordAccess in {
+  def  L2_loadri_pcr  : T_load_pcr <"memw", IntRegs, 0b1100>;
+  let hasNewValue = 0 in {
+    def L2_loadbzw4_pcr : T_load_pcr <"memubh", DoubleRegs, 0b0101>;
+    def L2_loadbsw4_pcr : T_load_pcr <"membh",  DoubleRegs, 0b0111>;
   }
 }
 
-let hasCtrlDep = 1, neverHasSideEffects = 1, addrMode = PostInc in {
-  defm POST_LDrib : LD_PostInc<"memb", "LDrib", IntRegs, s4_0Imm>,
-                    PredNewRel;
-  defm POST_LDriub : LD_PostInc<"memub", "LDriub", IntRegs, s4_0Imm>,
-                    PredNewRel;
-  defm POST_LDrih : LD_PostInc<"memh", "LDrih", IntRegs, s4_1Imm>,
-                    PredNewRel;
-  defm POST_LDriuh : LD_PostInc<"memuh", "LDriuh", IntRegs, s4_1Imm>,
-                    PredNewRel;
-  defm POST_LDriw : LD_PostInc<"memw", "LDriw", IntRegs, s4_2Imm>,
-                    PredNewRel;
-  defm POST_LDrid : LD_PostInc<"memd", "LDrid", DoubleRegs, s4_3Imm>,
-                    PredNewRel;
+let accessSize = DoubleWordAccess in
+def L2_loadrd_pcr  : T_load_pcr <"memd", DoubleRegs, 0b1110>;
+
+// Load / Post increment circular addressing mode.
+let Uses = [CS], hasSideEffects = 0 in
+class T_loadalign_pcr<string mnemonic, bits<4> MajOp, MemAccessSize AccessSz >
+  : LDInst <(outs DoubleRegs:$dst, IntRegs:$_dst_),
+            (ins DoubleRegs:$_src_, IntRegs:$Rz, ModRegs:$Mu),
+  "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
+  "$Rz = $_dst_, $dst = $_src_" > {
+    bits<5> dst;
+    bits<5> Rz;
+    bit Mu;
+
+    let accessSize = AccessSz;
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b100;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = Rz;
+    let Inst{13}    = Mu;
+    let Inst{12}    = 0b0;
+    let Inst{9}     = 0b1;
+    let Inst{7}     = 0b0;
+    let Inst{4-0}   = dst;
+ }
+
+def L2_loadalignb_pcr : T_loadalign_pcr <"memb_fifo", 0b0100, ByteAccess>;
+def L2_loadalignh_pcr : T_loadalign_pcr <"memh_fifo", 0b0010, HalfWordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Circular loads with immediate offset.
+//===----------------------------------------------------------------------===//
+let Uses = [CS], mayLoad = 1, hasSideEffects = 0 in
+class T_load_pci <string mnemonic, RegisterClass RC,
+                  Operand ImmOp, bits<4> MajOp>
+  : LDInstPI<(outs RC:$dst, IntRegs:$_dst_),
+             (ins IntRegs:$Rz, ImmOp:$offset, ModRegs:$Mu),
+  "$dst = "#mnemonic#"($Rz ++ #$offset:circ($Mu))", [],
+  "$Rz = $_dst_"> {
+    bits<5> dst;
+    bits<5> Rz;
+    bits<1> Mu;
+    bits<7> offset;
+    bits<4> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0})));
+    let IClass      = 0b1001;
+    let Inst{27-25} = 0b100;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = Rz;
+    let Inst{13}    = Mu;
+    let Inst{12}    = 0b0;
+    let Inst{9}     = 0b0;
+    let Inst{8-5}   = offsetBits;
+    let Inst{4-0}   = dst;
+  }
+
+// Byte variants of circ load
+let accessSize = ByteAccess in {
+  def L2_loadrb_pci  : T_load_pci <"memb",  IntRegs, s4_0Imm, 0b1000>;
+  def L2_loadrub_pci : T_load_pci <"memub", IntRegs, s4_0Imm, 0b1001>;
 }
 
-def : Pat< (i32 (extloadi1 ADDRriS11_0:$addr)),
-           (i32 (LDrib ADDRriS11_0:$addr)) >;
+// Half word variants of circ load
+let accessSize = HalfWordAccess in {
+  def L2_loadrh_pci   : T_load_pci <"memh",   IntRegs, s4_1Imm, 0b1010>;
+  def L2_loadruh_pci  : T_load_pci <"memuh",  IntRegs, s4_1Imm, 0b1011>;
+  def L2_loadbzw2_pci : T_load_pci <"memubh", IntRegs, s4_1Imm, 0b0011>;
+  def L2_loadbsw2_pci : T_load_pci <"membh",  IntRegs, s4_1Imm, 0b0001>;
+}
 
-// Load byte any-extend.
-def : Pat < (i32 (extloadi8 ADDRriS11_0:$addr)),
-            (i32 (LDrib ADDRriS11_0:$addr)) >;
+// Word variants of circ load
+let accessSize = WordAccess in
+def L2_loadri_pci   : T_load_pci <"memw",   IntRegs,    s4_2Imm, 0b1100>;
 
-// Indexed load byte any-extend.
-let AddedComplexity = 20 in
-def : Pat < (i32 (extloadi8 (add IntRegs:$src1, s11_0ImmPred:$offset))),
-            (i32 (LDrib_indexed IntRegs:$src1, s11_0ImmPred:$offset)) >;
+let accessSize = WordAccess, hasNewValue = 0 in {
+  def L2_loadbzw4_pci : T_load_pci <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
+  def L2_loadbsw4_pci : T_load_pci <"membh",  DoubleRegs, s4_2Imm, 0b0111>;
+}
 
-def : Pat < (i32 (extloadi16 ADDRriS11_1:$addr)),
-            (i32 (LDrih ADDRriS11_1:$addr))>;
+let accessSize = DoubleWordAccess, hasNewValue = 0 in
+def L2_loadrd_pci : T_load_pci <"memd", DoubleRegs, s4_3Imm, 0b1110>;
 
-let AddedComplexity = 20 in
-def : Pat < (i32 (extloadi16 (add IntRegs:$src1, s11_1ImmPred:$offset))),
-            (i32 (LDrih_indexed IntRegs:$src1, s11_1ImmPred:$offset)) >;
+//===----------------------------------------------------------------------===//
+// Circular loads - Pseudo
+//
+// Please note that the input operand order in the pseudo instructions
+// doesn't match with the real instructions. Pseudo instructions operand
+// order should mimics the ordering in the intrinsics. Also, 'src2' doesn't
+// appear in the AsmString because it's same as 'dst'.
+//===----------------------------------------------------------------------===//
+let isCodeGenOnly = 1,  mayLoad = 1, hasSideEffects = 0, isPseudo = 1 in
+class T_load_pci_pseudo <string opc, RegisterClass RC>
+  : LDInstPI<(outs IntRegs:$_dst_, RC:$dst),
+             (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4Imm:$src4),
+  ".error \"$dst = "#opc#"($src1++#$src4:circ($src3))\"",
+  [], "$src1 = $_dst_">;
+
+def L2_loadrb_pci_pseudo  : T_load_pci_pseudo <"memb",  IntRegs>;
+def L2_loadrub_pci_pseudo : T_load_pci_pseudo <"memub", IntRegs>;
+def L2_loadrh_pci_pseudo  : T_load_pci_pseudo <"memh",  IntRegs>;
+def L2_loadruh_pci_pseudo : T_load_pci_pseudo <"memuh", IntRegs>;
+def L2_loadri_pci_pseudo  : T_load_pci_pseudo <"memw",  IntRegs>;
+def L2_loadrd_pci_pseudo  : T_load_pci_pseudo <"memd",  DoubleRegs>;
+
+
+// TODO: memb_fifo and memh_fifo must take destination register as input.
+// One-off circ loads - not enough in common to break into a class.
+let accessSize = ByteAccess in
+def L2_loadalignb_pci : T_load_pci <"memb_fifo", DoubleRegs, s4_0Imm, 0b0100>;
+
+let accessSize = HalfWordAccess, opExtentAlign = 1 in
+def L2_loadalignh_pci : T_load_pci <"memh_fifo", DoubleRegs, s4_1Imm, 0b0010>;
+
+// L[24]_load[wd]_locked: Load word/double with lock.
+let isSoloAX = 1 in
+class T_load_locked <string mnemonic, RegisterClass RC>
+  : LD0Inst <(outs RC:$dst),
+             (ins IntRegs:$src),
+    "$dst = "#mnemonic#"($src)"> {
+    bits<5> dst;
+    bits<5> src;
+    let IClass = 0b1001;
+    let Inst{27-21} = 0b0010000;
+    let Inst{20-16} = src;
+    let Inst{13-12} = !if (!eq(mnemonic, "memd_locked"), 0b01, 0b00);
+    let Inst{5}   = 0;
+    let Inst{4-0} = dst;
+}
+let hasNewValue = 1, accessSize = WordAccess, opNewValue = 0 in
+  def L2_loadw_locked : T_load_locked <"memw_locked", IntRegs>;
+let accessSize = DoubleWordAccess in
+  def L4_loadd_locked : T_load_locked <"memd_locked", DoubleRegs>;
+
+// S[24]_store[wd]_locked: Store word/double conditionally.
+let isSoloAX = 1, isPredicateLate = 1 in
+class T_store_locked <string mnemonic, RegisterClass RC>
+  : ST0Inst <(outs PredRegs:$Pd), (ins IntRegs:$Rs, RC:$Rt),
+    mnemonic#"($Rs, $Pd) = $Rt"> {
+    bits<2> Pd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1010;
+    let Inst{27-23} = 0b00001;
+    let Inst{22} = !if (!eq(mnemonic, "memw_locked"), 0b0, 0b1);
+    let Inst{21} = 0b1;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+    let Inst{1-0} = Pd;
+}
 
-let AddedComplexity = 10 in
-def : Pat < (i32 (zextloadi1 ADDRriS11_0:$addr)),
-            (i32 (LDriub ADDRriS11_0:$addr))>;
+let accessSize = WordAccess in
+def S2_storew_locked : T_store_locked <"memw_locked", IntRegs>;
 
-let AddedComplexity = 20 in
-def : Pat < (i32 (zextloadi1 (add IntRegs:$src1, s11_0ImmPred:$offset))),
-            (i32 (LDriub_indexed IntRegs:$src1, s11_0ImmPred:$offset))>;
+let accessSize = DoubleWordAccess in
+def S4_stored_locked : T_store_locked <"memd_locked", DoubleRegs>;
 
-// Load predicate.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
-isPseudo = 1, Defs = [R10,R11,D5], neverHasSideEffects = 1 in
-def LDriw_pred : LDInst2<(outs PredRegs:$dst),
-            (ins MEMri:$addr),
-            "Error; should not emit",
-            []>;
+//===----------------------------------------------------------------------===//
+// Bit-reversed loads with auto-increment register
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_load_pbr<string mnemonic, RegisterClass RC,
+                            MemAccessSize addrSize, bits<4> majOp>
+  : LDInst
+    <(outs RC:$dst, IntRegs:$_dst_),
+     (ins IntRegs:$Rz, ModRegs:$Mu),
+     "$dst = "#mnemonic#"($Rz ++ $Mu:brev)" ,
+      [] , "$Rz = $_dst_" > {
+
+      let accessSize = addrSize;
+
+      bits<5> dst;
+      bits<5> Rz;
+      bits<1> Mu;
+
+      let IClass = 0b1001;
+
+      let Inst{27-25} = 0b111;
+      let Inst{24-21} = majOp;
+      let Inst{20-16} = Rz;
+      let Inst{13} = Mu;
+      let Inst{12} = 0b0;
+      let Inst{7} = 0b0;
+      let Inst{4-0} = dst;
+  }
 
-// Deallocate stack frame.
-let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in {
-  def DEALLOCFRAME : LDInst2<(outs), (ins),
-                     "deallocframe",
-                     []>;
+let hasNewValue =1, opNewValue = 0 in {
+  def L2_loadrb_pbr   : T_load_pbr <"memb",  IntRegs, ByteAccess, 0b1000>;
+  def L2_loadrub_pbr  : T_load_pbr <"memub", IntRegs, ByteAccess, 0b1001>;
+  def L2_loadrh_pbr   : T_load_pbr <"memh",  IntRegs, HalfWordAccess, 0b1010>;
+  def L2_loadruh_pbr  : T_load_pbr <"memuh", IntRegs, HalfWordAccess, 0b1011>;
+  def L2_loadbsw2_pbr : T_load_pbr <"membh", IntRegs, HalfWordAccess, 0b0001>;
+  def L2_loadbzw2_pbr : T_load_pbr <"memubh", IntRegs, HalfWordAccess, 0b0011>;
+  def L2_loadri_pbr : T_load_pbr <"memw", IntRegs, WordAccess, 0b1100>;
 }
 
-// Load and unpack bytes to halfwords.
+def L2_loadbzw4_pbr : T_load_pbr <"memubh", DoubleRegs, WordAccess, 0b0101>;
+def L2_loadbsw4_pbr : T_load_pbr <"membh",  DoubleRegs, WordAccess, 0b0111>;
+def L2_loadrd_pbr : T_load_pbr <"memd", DoubleRegs, DoubleWordAccess, 0b1110>;
+
+def L2_loadalignb_pbr :T_load_pbr <"memb_fifo", DoubleRegs, ByteAccess, 0b0100>;
+def L2_loadalignh_pbr :T_load_pbr <"memh_fifo", DoubleRegs,
+                                   HalfWordAccess, 0b0010>;
+
+//===----------------------------------------------------------------------===//
+// Bit-reversed loads - Pseudo
+//
+// Please note that 'src2' doesn't appear in the AsmString because
+// it's same as 'dst'.
+//===----------------------------------------------------------------------===//
+let isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0, isPseudo = 1 in
+class T_load_pbr_pseudo <string opc, RegisterClass RC>
+  : LDInstPI<(outs IntRegs:$_dst_, RC:$dst),
+             (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+  ".error \"$dst = "#opc#"($src1++$src3:brev)\"",
+  [], "$src1 = $_dst_">;
+
+def L2_loadrb_pbr_pseudo  : T_load_pbr_pseudo <"memb",  IntRegs>;
+def L2_loadrub_pbr_pseudo : T_load_pbr_pseudo <"memub", IntRegs>;
+def L2_loadrh_pbr_pseudo  : T_load_pbr_pseudo <"memh",  IntRegs>;
+def L2_loadruh_pbr_pseudo : T_load_pbr_pseudo <"memuh", IntRegs>;
+def L2_loadri_pbr_pseudo  : T_load_pbr_pseudo <"memw",  IntRegs>;
+def L2_loadrd_pbr_pseudo  : T_load_pbr_pseudo <"memd",  DoubleRegs>;
+
 //===----------------------------------------------------------------------===//
 // LD -
 //===----------------------------------------------------------------------===//
@@ -1259,180 +2291,934 @@ let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in {
 //===----------------------------------------------------------------------===//
 // MTYPE/MPYH +
 //===----------------------------------------------------------------------===//
-// Multiply and use lower result.
-// Rd=+mpyi(Rs,#u8)
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 8 in
-def MPYI_riu : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Ext:$src2),
-              "$dst =+ mpyi($src1, #$src2)",
-              [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
-                                             u8ExtPred:$src2))]>;
 
-// Rd=-mpyi(Rs,#u8)
-def MPYI_rin : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2),
-              "$dst =- mpyi($src1, #$src2)",
-              [(set (i32 IntRegs:$dst), (ineg (mul (i32 IntRegs:$src1),
-                                                   u8ImmPred:$src2)))]>;
+//===----------------------------------------------------------------------===//
+// Template Class
+// MPYS / Multipy signed/unsigned halfwords
+//Rd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
+//===----------------------------------------------------------------------===//
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_M2_mpy < bits<2> LHbits, bit isSat, bit isRnd,
+                 bit hasShift, bit isUnsigned>
+  : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+  "$Rd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
+                                       #", $Rt."#!if(LHbits{0},"h)","l)")
+                                       #!if(hasShift,":<<1","")
+                                       #!if(isRnd,":rnd","")
+                                       #!if(isSat,":sat",""),
+  [], "", M_tc_3x_SLOT23 > {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1100;
+    let Inst{23} = hasShift;
+    let Inst{22} = isUnsigned;
+    let Inst{21} = isRnd;
+    let Inst{7} = isSat;
+    let Inst{6-5} = LHbits;
+    let Inst{4-0} = Rd;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+  }
+
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpy_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 0>;
+def M2_mpy_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 0>;
+def M2_mpy_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 0>;
+def M2_mpy_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 0>;
+def M2_mpy_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 0>;
+def M2_mpy_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 0>;
+def M2_mpy_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 0>;
+def M2_mpy_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 0>;
+
+//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpyu_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 1>;
+def M2_mpyu_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 1>;
+def M2_mpyu_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 1>;
+def M2_mpyu_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 1>;
+def M2_mpyu_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 1>;
+def M2_mpyu_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 1>;
+def M2_mpyu_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 1>;
+def M2_mpyu_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 1>;
+
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]:rnd
+def M2_mpy_rnd_ll_s1: T_M2_mpy <0b00, 0, 1, 1, 0>;
+def M2_mpy_rnd_ll_s0: T_M2_mpy <0b00, 0, 1, 0, 0>;
+def M2_mpy_rnd_lh_s1: T_M2_mpy <0b01, 0, 1, 1, 0>;
+def M2_mpy_rnd_lh_s0: T_M2_mpy <0b01, 0, 1, 0, 0>;
+def M2_mpy_rnd_hl_s1: T_M2_mpy <0b10, 0, 1, 1, 0>;
+def M2_mpy_rnd_hl_s0: T_M2_mpy <0b10, 0, 1, 0, 0>;
+def M2_mpy_rnd_hh_s1: T_M2_mpy <0b11, 0, 1, 1, 0>;
+def M2_mpy_rnd_hh_s0: T_M2_mpy <0b11, 0, 1, 0, 0>;
+
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
+let Defs = [USR_OVF] in {
+  def M2_mpy_sat_ll_s1: T_M2_mpy <0b00, 1, 0, 1, 0>;
+  def M2_mpy_sat_ll_s0: T_M2_mpy <0b00, 1, 0, 0, 0>;
+  def M2_mpy_sat_lh_s1: T_M2_mpy <0b01, 1, 0, 1, 0>;
+  def M2_mpy_sat_lh_s0: T_M2_mpy <0b01, 1, 0, 0, 0>;
+  def M2_mpy_sat_hl_s1: T_M2_mpy <0b10, 1, 0, 1, 0>;
+  def M2_mpy_sat_hl_s0: T_M2_mpy <0b10, 1, 0, 0, 0>;
+  def M2_mpy_sat_hh_s1: T_M2_mpy <0b11, 1, 0, 1, 0>;
+  def M2_mpy_sat_hh_s0: T_M2_mpy <0b11, 1, 0, 0, 0>;
+
+  def M2_mpy_sat_rnd_ll_s1: T_M2_mpy <0b00, 1, 1, 1, 0>;
+  def M2_mpy_sat_rnd_ll_s0: T_M2_mpy <0b00, 1, 1, 0, 0>;
+  def M2_mpy_sat_rnd_lh_s1: T_M2_mpy <0b01, 1, 1, 1, 0>;
+  def M2_mpy_sat_rnd_lh_s0: T_M2_mpy <0b01, 1, 1, 0, 0>;
+  def M2_mpy_sat_rnd_hl_s1: T_M2_mpy <0b10, 1, 1, 1, 0>;
+  def M2_mpy_sat_rnd_hl_s0: T_M2_mpy <0b10, 1, 1, 0, 0>;
+  def M2_mpy_sat_rnd_hh_s1: T_M2_mpy <0b11, 1, 1, 1, 0>;
+  def M2_mpy_sat_rnd_hh_s0: T_M2_mpy <0b11, 1, 1, 0, 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Template Class
+// MPYS / Multipy signed/unsigned halfwords and add/subtract the
+// result from the accumulator.
+//Rx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//===----------------------------------------------------------------------===//
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_M2_mpy_acc < bits<2> LHbits, bit isSat, bit isNac,
+                 bit hasShift, bit isUnsigned >
+  : MInst_acc<(outs IntRegs:$Rx), (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
+                              #"($Rs."#!if(LHbits{1},"h","l")
+                              #", $Rt."#!if(LHbits{0},"h)","l)")
+                              #!if(hasShift,":<<1","")
+                              #!if(isSat,":sat",""),
+  [], "$dst2 = $Rx", M_tc_3x_SLOT23 > {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+    let Inst{27-24} = 0b1110;
+    let Inst{23} = hasShift;
+    let Inst{22} = isUnsigned;
+    let Inst{21} = isNac;
+    let Inst{7} = isSat;
+    let Inst{6-5} = LHbits;
+    let Inst{4-0} = Rx;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+  }
+
+//Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpy_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 0>;
+def M2_mpy_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 0>;
+def M2_mpy_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 0>;
+def M2_mpy_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 0>;
+def M2_mpy_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 0>;
+def M2_mpy_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 0>;
+def M2_mpy_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 0>;
+def M2_mpy_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 0>;
+
+//Rx += mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpyu_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 1>;
+def M2_mpyu_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 1>;
+def M2_mpyu_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 1>;
+def M2_mpyu_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 1>;
+def M2_mpyu_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 1>;
+def M2_mpyu_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 1>;
+def M2_mpyu_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 1>;
+def M2_mpyu_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 1>;
+
+//Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpy_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 0>;
+def M2_mpy_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 0>;
+def M2_mpy_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 0>;
+def M2_mpy_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 0>;
+def M2_mpy_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 0>;
+def M2_mpy_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 0>;
+def M2_mpy_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 0>;
+def M2_mpy_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 0>;
+
+//Rx -= mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpyu_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 1>;
+def M2_mpyu_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 1>;
+def M2_mpyu_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 1>;
+def M2_mpyu_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 1>;
+def M2_mpyu_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 1>;
+def M2_mpyu_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 1>;
+def M2_mpyu_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 1>;
+def M2_mpyu_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 1>;
+
+//Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
+def M2_mpy_acc_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 0, 1, 0>;
+def M2_mpy_acc_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 0, 0, 0>;
+def M2_mpy_acc_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 0, 1, 0>;
+def M2_mpy_acc_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 0, 0, 0>;
+def M2_mpy_acc_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 0, 1, 0>;
+def M2_mpy_acc_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 0, 0, 0>;
+def M2_mpy_acc_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 0, 1, 0>;
+def M2_mpy_acc_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 0, 0, 0>;
+
+//Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
+def M2_mpy_nac_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 1, 1, 0>;
+def M2_mpy_nac_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 1, 0, 0>;
+def M2_mpy_nac_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 1, 1, 0>;
+def M2_mpy_nac_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 1, 0, 0>;
+def M2_mpy_nac_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 1, 1, 0>;
+def M2_mpy_nac_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 1, 0, 0>;
+def M2_mpy_nac_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 1, 1, 0>;
+def M2_mpy_nac_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 1, 0, 0>;
+
+//===----------------------------------------------------------------------===//
+// Template Class
+// MPYS / Multipy signed/unsigned halfwords and add/subtract the
+// result from the 64-bit destination register.
+//Rxx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//===----------------------------------------------------------------------===//
+
+class T_M2_mpyd_acc < bits<2> LHbits, bit isNac, bit hasShift, bit isUnsigned>
+  : MInst_acc<(outs DoubleRegs:$Rxx),
+              (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rxx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
+                                #"($Rs."#!if(LHbits{1},"h","l")
+                                #", $Rt."#!if(LHbits{0},"h)","l)")
+                                #!if(hasShift,":<<1",""),
+  [], "$dst2 = $Rxx", M_tc_3x_SLOT23 > {
+    bits<5> Rxx;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b0110;
+    let Inst{23} = hasShift;
+    let Inst{22} = isUnsigned;
+    let Inst{21} = isNac;
+    let Inst{7} = 0;
+    let Inst{6-5} = LHbits;
+    let Inst{4-0} = Rxx;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+  }
+
+def M2_mpyd_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 0>;
+def M2_mpyd_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 0>;
+def M2_mpyd_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 0>;
+def M2_mpyd_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 0>;
+
+def M2_mpyd_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 0>;
+def M2_mpyd_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 0>;
+def M2_mpyd_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 0>;
+def M2_mpyd_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 0>;
+
+def M2_mpyd_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 0>;
+def M2_mpyd_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 0>;
+def M2_mpyd_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 0>;
+def M2_mpyd_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 0>;
+
+def M2_mpyd_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 0>;
+def M2_mpyd_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 0>;
+def M2_mpyd_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 0>;
+def M2_mpyd_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 0>;
+
+def M2_mpyud_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 1>;
+def M2_mpyud_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 1>;
+def M2_mpyud_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 1>;
+def M2_mpyud_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 1>;
+
+def M2_mpyud_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 1>;
+def M2_mpyud_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 1>;
+def M2_mpyud_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 1>;
+def M2_mpyud_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 1>;
+
+def M2_mpyud_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 1>;
+def M2_mpyud_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 1>;
+def M2_mpyud_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 1>;
+def M2_mpyud_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 1>;
+
+def M2_mpyud_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 1>;
+def M2_mpyud_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 1>;
+def M2_mpyud_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 1>;
+def M2_mpyud_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- Vector Multipy
+// Used for complex multiply real or imaginary, dual multiply and even halfwords
+//===----------------------------------------------------------------------===//
+class T_M2_vmpy < string opc, bits<3> MajOp, bits<3> MinOp, bit hasShift,
+                  bit isRnd, bit isSat >
+  : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rdd = "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
+                              #!if(isRnd,":rnd","")
+                              #!if(isSat,":sat",""),
+  [] > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1000;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rdd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+// Vector complex multiply imaginary: Rdd=vcmpyi(Rss,Rtt)[:<<1]:sat
+let Defs = [USR_OVF] in {
+def M2_vcmpy_s1_sat_i: T_M2_vmpy <"vcmpyi", 0b110, 0b110, 1, 0, 1>;
+def M2_vcmpy_s0_sat_i: T_M2_vmpy <"vcmpyi", 0b010, 0b110, 0, 0, 1>;
+
+// Vector complex multiply real: Rdd=vcmpyr(Rss,Rtt)[:<<1]:sat
+def M2_vcmpy_s1_sat_r: T_M2_vmpy <"vcmpyr", 0b101, 0b110, 1, 0, 1>;
+def M2_vcmpy_s0_sat_r: T_M2_vmpy <"vcmpyr", 0b001, 0b110, 0, 0, 1>;
+
+// Vector dual multiply: Rdd=vdmpy(Rss,Rtt)[:<<1]:sat
+def M2_vdmpys_s1: T_M2_vmpy <"vdmpy", 0b100, 0b100, 1, 0, 1>;
+def M2_vdmpys_s0: T_M2_vmpy <"vdmpy", 0b000, 0b100, 0, 0, 1>;
+
+// Vector multiply even halfwords: Rdd=vmpyeh(Rss,Rtt)[:<<1]:sat
+def M2_vmpy2es_s1: T_M2_vmpy <"vmpyeh", 0b100, 0b110, 1, 0, 1>;
+def M2_vmpy2es_s0: T_M2_vmpy <"vmpyeh", 0b000, 0b110, 0, 0, 1>;
+
+//Rdd=vmpywoh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyh_s0:  T_M2_vmpy <"vmpywoh", 0b000, 0b111, 0, 0, 1>;
+def M2_mmpyh_s1:  T_M2_vmpy <"vmpywoh", 0b100, 0b111, 1, 0, 1>;
+def M2_mmpyh_rs0: T_M2_vmpy <"vmpywoh", 0b001, 0b111, 0, 1, 1>;
+def M2_mmpyh_rs1: T_M2_vmpy <"vmpywoh", 0b101, 0b111, 1, 1, 1>;
+
+//Rdd=vmpyweh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyl_s0:  T_M2_vmpy <"vmpyweh", 0b000, 0b101, 0, 0, 1>;
+def M2_mmpyl_s1:  T_M2_vmpy <"vmpyweh", 0b100, 0b101, 1, 0, 1>;
+def M2_mmpyl_rs0: T_M2_vmpy <"vmpyweh", 0b001, 0b101, 0, 1, 1>;
+def M2_mmpyl_rs1: T_M2_vmpy <"vmpyweh", 0b101, 0b101, 1, 1, 1>;
+
+//Rdd=vmpywouh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyuh_s0:  T_M2_vmpy <"vmpywouh", 0b010, 0b111, 0, 0, 1>;
+def M2_mmpyuh_s1:  T_M2_vmpy <"vmpywouh", 0b110, 0b111, 1, 0, 1>;
+def M2_mmpyuh_rs0: T_M2_vmpy <"vmpywouh", 0b011, 0b111, 0, 1, 1>;
+def M2_mmpyuh_rs1: T_M2_vmpy <"vmpywouh", 0b111, 0b111, 1, 1, 1>;
+
+//Rdd=vmpyweuh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyul_s0:  T_M2_vmpy <"vmpyweuh", 0b010, 0b101, 0, 0, 1>;
+def M2_mmpyul_s1:  T_M2_vmpy <"vmpyweuh", 0b110, 0b101, 1, 0, 1>;
+def M2_mmpyul_rs0: T_M2_vmpy <"vmpyweuh", 0b011, 0b101, 0, 1, 1>;
+def M2_mmpyul_rs1: T_M2_vmpy <"vmpyweuh", 0b111, 0b101, 1, 1, 1>;
+}
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_MType_mpy <string mnemonic, bits<4> RegTyBits, RegisterClass RC,
+                   bits<3> MajOp, bits<3> MinOp, bit isSat = 0, bit isRnd = 0,
+                   string op2Suffix = "", bit isRaw = 0, bit isHi = 0 >
+  : MInst <(outs IntRegs:$dst), (ins RC:$src1, RC:$src2),
+  "$dst = "#mnemonic
+           #"($src1, $src2"#op2Suffix#")"
+           #!if(MajOp{2}, ":<<1", "")
+           #!if(isRnd, ":rnd", "")
+           #!if(isSat, ":sat", "")
+           #!if(isRaw, !if(isHi, ":raw:hi", ":raw:lo"), ""), [] > {
+    bits<5> dst;
+    bits<5> src1;
+    bits<5> src2;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = RegTyBits;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = src2;
+    let Inst{7-5}   = MinOp;
+    let Inst{4-0}   = dst;
+  }
+
+class T_MType_vrcmpy <string mnemonic, bits<3> MajOp, bits<3> MinOp, bit isHi>
+  : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, 1, 1, "", 1, isHi>;
+
+class T_MType_dd  <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                   bit isSat = 0, bit isRnd = 0 >
+  : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, isSat, isRnd>;
+
+class T_MType_rr1  <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                    bit isSat = 0, bit isRnd = 0 >
+  : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd>;
+
+class T_MType_rr2 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                   bit isSat = 0, bit isRnd = 0, string op2str = "" >
+  : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd, op2str>;
+
+def M2_vradduh    : T_MType_dd <"vradduh", 0b000, 0b001, 0, 0>;
+def M2_vdmpyrs_s0 : T_MType_dd <"vdmpy",   0b000, 0b000, 1, 1>;
+def M2_vdmpyrs_s1 : T_MType_dd <"vdmpy",   0b100, 0b000, 1, 1>;
+
+let CextOpcode = "mpyi", InputType = "reg" in
+def M2_mpyi    : T_MType_rr1 <"mpyi", 0b000, 0b000>, ImmRegRel;
+
+def M2_mpy_up  : T_MType_rr1 <"mpy",  0b000, 0b001>;
+def M2_mpyu_up : T_MType_rr1 <"mpyu", 0b010, 0b001>;
+
+def M2_dpmpyss_rnd_s0 : T_MType_rr1 <"mpy", 0b001, 0b001, 0, 1>;
+
+def M2_vmpy2s_s0pack : T_MType_rr1 <"vmpyh", 0b001, 0b111, 1, 1>;
+def M2_vmpy2s_s1pack : T_MType_rr1 <"vmpyh", 0b101, 0b111, 1, 1>;
+
+def M2_hmmpyh_rs1 : T_MType_rr2 <"mpy", 0b101, 0b100, 1, 1, ".h">;
+def M2_hmmpyl_rs1 : T_MType_rr2 <"mpy", 0b111, 0b100, 1, 1, ".l">;
+
+def M2_cmpyrs_s0  : T_MType_rr2 <"cmpy", 0b001, 0b110, 1, 1>;
+def M2_cmpyrs_s1  : T_MType_rr2 <"cmpy", 0b101, 0b110, 1, 1>;
+def M2_cmpyrsc_s0 : T_MType_rr2 <"cmpy", 0b011, 0b110, 1, 1, "*">;
+def M2_cmpyrsc_s1 : T_MType_rr2 <"cmpy", 0b111, 0b110, 1, 1, "*">;
+
+// V4 Instructions
+def M2_vraddh : T_MType_dd <"vraddh", 0b001, 0b111, 0>;
+def M2_mpysu_up : T_MType_rr1 <"mpysu", 0b011, 0b001, 0>;
+def M2_mpy_up_s1 : T_MType_rr1 <"mpy", 0b101, 0b010, 0>;
+def M2_mpy_up_s1_sat : T_MType_rr1 <"mpy", 0b111, 0b000, 1>;
+
+def M2_hmmpyh_s1 : T_MType_rr2 <"mpy", 0b101, 0b000, 1, 0, ".h">;
+def M2_hmmpyl_s1 : T_MType_rr2 <"mpy", 0b101, 0b001, 1, 0, ".l">;
+
+def: Pat<(i32 (mul   I32:$src1, I32:$src2)), (M2_mpyi    I32:$src1, I32:$src2)>;
+def: Pat<(i32 (mulhs I32:$src1, I32:$src2)), (M2_mpy_up  I32:$src1, I32:$src2)>;
+def: Pat<(i32 (mulhu I32:$src1, I32:$src2)), (M2_mpyu_up I32:$src1, I32:$src2)>;
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_MType_mpy_ri <bit isNeg, Operand ImmOp, list<dag> pattern>
+  : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, ImmOp:$u8),
+  "$Rd ="#!if(isNeg, "- ", "+ ")#"mpyi($Rs, #$u8)" ,
+   pattern, "", M_tc_3x_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<8> u8;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b0000;
+    let Inst{23} = isNeg;
+    let Inst{13} = 0b0;
+    let Inst{4-0} = Rd;
+    let Inst{20-16} = Rs;
+    let Inst{12-5} = u8;
+  }
+
+let isExtendable = 1, opExtentBits = 8, opExtendable = 2 in
+def M2_mpysip : T_MType_mpy_ri <0, u8Ext,
+                [(set (i32 IntRegs:$Rd), (mul IntRegs:$Rs, u8ExtPred:$u8))]>;
+
+def M2_mpysin :  T_MType_mpy_ri <1, u8Imm,
+                [(set (i32 IntRegs:$Rd), (ineg (mul IntRegs:$Rs,
+                                                    u8ImmPred:$u8)))]>;
+
+// Assember mapped to M2_mpyi
+let isAsmParserOnly = 1 in
+def M2_mpyui : MInst<(outs IntRegs:$dst),
+                     (ins IntRegs:$src1, IntRegs:$src2),
+  "$dst = mpyui($src1, $src2)">;
 
 // Rd=mpyi(Rs,#m9)
 // s9 is NOT the same as m9 - but it works.. so far.
-// Assembler maps to either Rd=+mpyi(Rs,#u8 or Rd=-mpyi(Rs,#u8)
+// Assembler maps to either Rd=+mpyi(Rs,#u8) or Rd=-mpyi(Rs,#u8)
 // depending on the value of m9. See Arch Spec.
 let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 9,
-CextOpcode = "MPYI", InputType = "imm" in
-def MPYI_ri : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Ext:$src2),
-              "$dst = mpyi($src1, #$src2)",
-              [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
-                                             s9ExtPred:$src2))]>, ImmRegRel;
-
-// Rd=mpyi(Rs,Rt)
-let CextOpcode = "MPYI", InputType = "reg" in
-def MPYI : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-           "$dst = mpyi($src1, $src2)",
-           [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
-                                          (i32 IntRegs:$src2)))]>, ImmRegRel;
-
-// Rx+=mpyi(Rs,#u8)
-let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 8,
-CextOpcode = "MPYI_acc", InputType = "imm" in
-def MPYI_acc_ri : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2, u8Ext:$src3),
-            "$dst += mpyi($src2, #$src3)",
-            [(set (i32 IntRegs:$dst),
-                  (add (mul (i32 IntRegs:$src2), u8ExtPred:$src3),
-                       (i32 IntRegs:$src1)))],
-            "$src1 = $dst">, ImmRegRel;
+    CextOpcode = "mpyi", InputType = "imm", hasNewValue = 1,
+    isAsmParserOnly = 1 in
+def M2_mpysmi : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Ext:$src2),
+    "$dst = mpyi($src1, #$src2)",
+    [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
+                                   s9ExtPred:$src2))]>, ImmRegRel;
+
+let hasNewValue = 1, isExtendable = 1,  opExtentBits = 8, opExtendable = 3,
+    InputType = "imm" in
+class T_MType_acc_ri <string mnemonic, bits<3> MajOp, Operand ImmOp,
+                      list<dag> pattern = []>
+ : MInst < (outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, ImmOp:$src3),
+  "$dst "#mnemonic#"($src2, #$src3)",
+  pattern, "$src1 = $dst", M_tc_2_SLOT23> {
+    bits<5> dst;
+    bits<5> src2;
+    bits<8> src3;
+
+    let IClass = 0b1110;
+
+    let Inst{27-26} = 0b00;
+    let Inst{25-23} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13} = 0b0;
+    let Inst{12-5} = src3;
+    let Inst{4-0} = dst;
+  }
 
-// Rx+=mpyi(Rs,Rt)
-let CextOpcode = "MPYI_acc", InputType = "reg" in
-def MPYI_acc_rr : MInst_acc<(outs IntRegs:$dst),
+let InputType = "reg", hasNewValue = 1 in
+class T_MType_acc_rr <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                      bit isSwap = 0, list<dag> pattern = [], bit hasNot = 0,
+                      bit isSat = 0, bit isShift = 0>
+  : MInst < (outs IntRegs:$dst),
             (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
-            "$dst += mpyi($src2, $src3)",
-            [(set (i32 IntRegs:$dst),
-                  (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
-                       (i32 IntRegs:$src1)))],
-            "$src1 = $dst">, ImmRegRel;
-
-// Rx-=mpyi(Rs,#u8)
-let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 8 in
-def MPYI_sub_ri : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2, u8Ext:$src3),
-            "$dst -= mpyi($src2, #$src3)",
-            [(set (i32 IntRegs:$dst),
-                  (sub (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
-                                                 u8ExtPred:$src3)))],
-            "$src1 = $dst">;
-
-// Multiply and use upper result.
-// Rd=mpy(Rs,Rt.H):<<1:rnd:sat
-// Rd=mpy(Rs,Rt.L):<<1:rnd:sat
-// Rd=mpy(Rs,Rt)
-def MPY : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-          "$dst = mpy($src1, $src2)",
-          [(set (i32 IntRegs:$dst), (mulhs (i32 IntRegs:$src1),
-                                           (i32 IntRegs:$src2)))]>;
-
-// Rd=mpy(Rs,Rt):rnd
-// Rd=mpyu(Rs,Rt)
-def MPYU : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-           "$dst = mpyu($src1, $src2)",
-           [(set (i32 IntRegs:$dst), (mulhu (i32 IntRegs:$src1),
-                                            (i32 IntRegs:$src2)))]>;
-
-// Multiply and use full result.
-// Rdd=mpyu(Rs,Rt)
-def MPYU64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             "$dst = mpyu($src1, $src2)",
-             [(set (i64 DoubleRegs:$dst),
-                   (mul (i64 (anyext (i32 IntRegs:$src1))),
-                        (i64 (anyext (i32 IntRegs:$src2)))))]>;
-
-// Rdd=mpy(Rs,Rt)
-def MPY64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             "$dst = mpy($src1, $src2)",
-             [(set (i64 DoubleRegs:$dst),
-                   (mul (i64 (sext (i32 IntRegs:$src1))),
-                        (i64 (sext (i32 IntRegs:$src2)))))]>;
+  "$dst "#mnemonic#"($src2, "#!if(hasNot, "~$src3)","$src3)")
+                          #!if(isShift, ":<<1", "")
+                          #!if(isSat, ":sat", ""),
+  pattern, "$src1 = $dst", M_tc_2_SLOT23 > {
+    bits<5> dst;
+    bits<5> src2;
+    bits<5> src3;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1111;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = !if(isSwap, src3, src2);
+    let Inst{13} = 0b0;
+    let Inst{12-8} = !if(isSwap, src2, src3);
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = dst;
+  }
+
+let CextOpcode = "MPYI_acc", Itinerary = M_tc_3x_SLOT23 in {
+  def M2_macsip : T_MType_acc_ri <"+= mpyi", 0b010, u8Ext,
+                  [(set (i32 IntRegs:$dst),
+                        (add (mul IntRegs:$src2, u8ExtPred:$src3),
+                             IntRegs:$src1))]>, ImmRegRel;
+
+  def M2_maci   : T_MType_acc_rr <"+= mpyi", 0b000, 0b000, 0,
+                 [(set (i32 IntRegs:$dst),
+                       (add (mul IntRegs:$src2, IntRegs:$src3),
+                            IntRegs:$src1))]>, ImmRegRel;
+}
+
+let CextOpcode = "ADD_acc" in {
+  let isExtentSigned = 1 in
+  def M2_accii : T_MType_acc_ri <"+= add", 0b100, s8Ext,
+                 [(set (i32 IntRegs:$dst),
+                       (add (add (i32 IntRegs:$src2), s8_16ExtPred:$src3),
+                            (i32 IntRegs:$src1)))]>, ImmRegRel;
+
+  def M2_acci  : T_MType_acc_rr <"+= add",  0b000, 0b001, 0,
+                 [(set (i32 IntRegs:$dst),
+                       (add (add (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
+                            (i32 IntRegs:$src1)))]>, ImmRegRel;
+}
+
+let CextOpcode = "SUB_acc" in {
+  let isExtentSigned = 1 in
+  def M2_naccii : T_MType_acc_ri <"-= add", 0b101, s8Ext>, ImmRegRel;
+
+  def M2_nacci  : T_MType_acc_rr <"-= add",  0b100, 0b001, 0>, ImmRegRel;
+}
+
+let Itinerary = M_tc_3x_SLOT23 in
+def M2_macsin : T_MType_acc_ri <"-= mpyi", 0b011, u8Ext>;
+
+def M2_xor_xacc : T_MType_acc_rr < "^= xor", 0b100, 0b011, 0>;
+def M2_subacc : T_MType_acc_rr <"+= sub",  0b000, 0b011, 1>;
+
+class T_MType_acc_pat1 <InstHexagon MI, SDNode firstOp, SDNode secOp,
+                        PatLeaf ImmPred>
+  : Pat <(secOp IntRegs:$src1, (firstOp IntRegs:$src2, ImmPred:$src3)),
+         (MI IntRegs:$src1, IntRegs:$src2, ImmPred:$src3)>;
+
+class T_MType_acc_pat2 <InstHexagon MI, SDNode firstOp, SDNode secOp>
+  : Pat <(i32 (secOp IntRegs:$src1, (firstOp IntRegs:$src2, IntRegs:$src3))),
+         (MI IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def : T_MType_acc_pat2 <M2_xor_xacc, xor, xor>;
+def : T_MType_acc_pat1 <M2_macsin, mul, sub, u8ExtPred>;
+
+def : T_MType_acc_pat1 <M2_naccii, add, sub, s8_16ExtPred>;
+def : T_MType_acc_pat2 <M2_nacci, add, sub>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- XType Vector Instructions
+//===----------------------------------------------------------------------===//
+class T_XTYPE_Vect < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
+  : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rdd = "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
+  [] > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1000;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rdd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+class T_XTYPE_Vect_acc < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
+  : MInst <(outs DoubleRegs:$Rdd),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rdd += "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
+  [], "$dst2 = $Rdd",M_tc_3x_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1010;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rdd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+class T_XTYPE_Vect_diff < bits<3> MajOp, string opc >
+  : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rtt, DoubleRegs:$Rss),
+  "$Rdd = "#opc#"($Rtt, $Rss)",
+  [], "",M_tc_2_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1000;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = 0b000;
+    let Inst{4-0} = Rdd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+// Vector reduce add unsigned bytes: Rdd32=vrmpybu(Rss32,Rtt32)
+def A2_vraddub: T_XTYPE_Vect <"vraddub", 0b010, 0b001, 0>;
+def A2_vraddub_acc: T_XTYPE_Vect_acc <"vraddub", 0b010, 0b001, 0>;
+
+// Vector sum of absolute differences unsigned bytes: Rdd=vrsadub(Rss,Rtt)
+def A2_vrsadub: T_XTYPE_Vect <"vrsadub", 0b010, 0b010, 0>;
+def A2_vrsadub_acc: T_XTYPE_Vect_acc <"vrsadub", 0b010, 0b010, 0>;
+
+// Vector absolute difference: Rdd=vabsdiffh(Rtt,Rss)
+def M2_vabsdiffh: T_XTYPE_Vect_diff<0b011, "vabsdiffh">;
+
+// Vector absolute difference words: Rdd=vabsdiffw(Rtt,Rss)
+def M2_vabsdiffw: T_XTYPE_Vect_diff<0b001, "vabsdiffw">;
+
+// Vector reduce complex multiply real or imaginary:
+// Rdd[+]=vrcmpy[ir](Rss,Rtt[*])
+def M2_vrcmpyi_s0:  T_XTYPE_Vect <"vrcmpyi", 0b000, 0b000, 0>;
+def M2_vrcmpyi_s0c: T_XTYPE_Vect <"vrcmpyi", 0b010, 0b000, 1>;
+def M2_vrcmaci_s0:  T_XTYPE_Vect_acc <"vrcmpyi", 0b000, 0b000, 0>;
+def M2_vrcmaci_s0c: T_XTYPE_Vect_acc <"vrcmpyi", 0b010, 0b000, 1>;
+
+def M2_vrcmpyr_s0:  T_XTYPE_Vect <"vrcmpyr", 0b000, 0b001, 0>;
+def M2_vrcmpyr_s0c: T_XTYPE_Vect <"vrcmpyr", 0b011, 0b001, 1>;
+def M2_vrcmacr_s0:  T_XTYPE_Vect_acc <"vrcmpyr", 0b000, 0b001, 0>;
+def M2_vrcmacr_s0c: T_XTYPE_Vect_acc <"vrcmpyr", 0b011, 0b001, 1>;
+
+// Vector reduce halfwords:
+// Rdd[+]=vrmpyh(Rss,Rtt)
+def M2_vrmpy_s0: T_XTYPE_Vect <"vrmpyh", 0b000, 0b010, 0>;
+def M2_vrmac_s0: T_XTYPE_Vect_acc <"vrmpyh", 0b000, 0b010, 0>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- Vector Multipy with accumulation.
+// Used for complex multiply real or imaginary, dual multiply and even halfwords
+//===----------------------------------------------------------------------===//
+let Defs = [USR_OVF] in
+class T_M2_vmpy_acc_sat < string opc, bits<3> MajOp, bits<3> MinOp,
+                          bit hasShift, bit isRnd >
+  : MInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
+                               #!if(isRnd,":rnd","")#":sat",
+  [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1010;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rxx;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+class T_M2_vmpy_acc < string opc, bits<3> MajOp, bits<3> MinOp,
+                      bit hasShift, bit isRnd >
+  : MInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
+                               #!if(isRnd,":rnd",""),
+  [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1010;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rxx;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+// Vector multiply word by signed half with accumulation
+// Rxx+=vmpyw[eo]h(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmacls_s1:  T_M2_vmpy_acc_sat <"vmpyweh", 0b100, 0b101, 1, 0>;
+def M2_mmacls_s0:  T_M2_vmpy_acc_sat <"vmpyweh", 0b000, 0b101, 0, 0>;
+def M2_mmacls_rs1: T_M2_vmpy_acc_sat <"vmpyweh", 0b101, 0b101, 1, 1>;
+def M2_mmacls_rs0: T_M2_vmpy_acc_sat <"vmpyweh", 0b001, 0b101, 0, 1>;
+
+def M2_mmachs_s1:  T_M2_vmpy_acc_sat <"vmpywoh", 0b100, 0b111, 1, 0>;
+def M2_mmachs_s0:  T_M2_vmpy_acc_sat <"vmpywoh", 0b000, 0b111, 0, 0>;
+def M2_mmachs_rs1: T_M2_vmpy_acc_sat <"vmpywoh", 0b101, 0b111, 1, 1>;
+def M2_mmachs_rs0: T_M2_vmpy_acc_sat <"vmpywoh", 0b001, 0b111, 0, 1>;
+
+// Vector multiply word by unsigned half with accumulation
+// Rxx+=vmpyw[eo]uh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmaculs_s1:  T_M2_vmpy_acc_sat <"vmpyweuh", 0b110, 0b101, 1, 0>;
+def M2_mmaculs_s0:  T_M2_vmpy_acc_sat <"vmpyweuh", 0b010, 0b101, 0, 0>;
+def M2_mmaculs_rs1: T_M2_vmpy_acc_sat <"vmpyweuh", 0b111, 0b101, 1, 1>;
+def M2_mmaculs_rs0: T_M2_vmpy_acc_sat <"vmpyweuh", 0b011, 0b101, 0, 1>;
+
+def M2_mmacuhs_s1:  T_M2_vmpy_acc_sat <"vmpywouh", 0b110, 0b111, 1, 0>;
+def M2_mmacuhs_s0:  T_M2_vmpy_acc_sat <"vmpywouh", 0b010, 0b111, 0, 0>;
+def M2_mmacuhs_rs1: T_M2_vmpy_acc_sat <"vmpywouh", 0b111, 0b111, 1, 1>;
+def M2_mmacuhs_rs0: T_M2_vmpy_acc_sat <"vmpywouh", 0b011, 0b111, 0, 1>;
+
+// Vector multiply even halfwords with accumulation
+// Rxx+=vmpyeh(Rss,Rtt)[:<<1][:sat]
+def M2_vmac2es:    T_M2_vmpy_acc     <"vmpyeh", 0b001, 0b010, 0, 0>;
+def M2_vmac2es_s1: T_M2_vmpy_acc_sat <"vmpyeh", 0b100, 0b110, 1, 0>;
+def M2_vmac2es_s0: T_M2_vmpy_acc_sat <"vmpyeh", 0b000, 0b110, 0, 0>;
+
+// Vector dual multiply with accumulation
+// Rxx+=vdmpy(Rss,Rtt)[:sat]
+def M2_vdmacs_s1: T_M2_vmpy_acc_sat <"vdmpy", 0b100, 0b100, 1, 0>;
+def M2_vdmacs_s0: T_M2_vmpy_acc_sat <"vdmpy", 0b000, 0b100, 0, 0>;
+
+// Vector complex multiply real or imaginary with accumulation
+// Rxx+=vcmpy[ir](Rss,Rtt):sat
+def M2_vcmac_s0_sat_r: T_M2_vmpy_acc_sat <"vcmpyr", 0b001, 0b100, 0, 0>;
+def M2_vcmac_s0_sat_i: T_M2_vmpy_acc_sat <"vcmpyi", 0b010, 0b100, 0, 0>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- Multiply signed/unsigned halfwords with and without
+// saturation and rounding
+//===----------------------------------------------------------------------===//
+class T_M2_mpyd < bits<2> LHbits, bit isRnd, bit hasShift, bit isUnsigned >
+  : MInst < (outs DoubleRegs:$Rdd), (ins IntRegs:$Rs, IntRegs:$Rt),
+  "$Rdd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
+                                       #", $Rt."#!if(LHbits{0},"h)","l)")
+                                       #!if(hasShift,":<<1","")
+                                       #!if(isRnd,":rnd",""),
+  [] > {
+    bits<5> Rdd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b0100;
+    let Inst{23} = hasShift;
+    let Inst{22} = isUnsigned;
+    let Inst{21} = isRnd;
+    let Inst{6-5} = LHbits;
+    let Inst{4-0} = Rdd;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+}
+
+def M2_mpyd_hh_s0: T_M2_mpyd<0b11, 0, 0, 0>;
+def M2_mpyd_hl_s0: T_M2_mpyd<0b10, 0, 0, 0>;
+def M2_mpyd_lh_s0: T_M2_mpyd<0b01, 0, 0, 0>;
+def M2_mpyd_ll_s0: T_M2_mpyd<0b00, 0, 0, 0>;
+
+def M2_mpyd_hh_s1: T_M2_mpyd<0b11, 0, 1, 0>;
+def M2_mpyd_hl_s1: T_M2_mpyd<0b10, 0, 1, 0>;
+def M2_mpyd_lh_s1: T_M2_mpyd<0b01, 0, 1, 0>;
+def M2_mpyd_ll_s1: T_M2_mpyd<0b00, 0, 1, 0>;
+
+def M2_mpyd_rnd_hh_s0: T_M2_mpyd<0b11, 1, 0, 0>;
+def M2_mpyd_rnd_hl_s0: T_M2_mpyd<0b10, 1, 0, 0>;
+def M2_mpyd_rnd_lh_s0: T_M2_mpyd<0b01, 1, 0, 0>;
+def M2_mpyd_rnd_ll_s0: T_M2_mpyd<0b00, 1, 0, 0>;
+
+def M2_mpyd_rnd_hh_s1: T_M2_mpyd<0b11, 1, 1, 0>;
+def M2_mpyd_rnd_hl_s1: T_M2_mpyd<0b10, 1, 1, 0>;
+def M2_mpyd_rnd_lh_s1: T_M2_mpyd<0b01, 1, 1, 0>;
+def M2_mpyd_rnd_ll_s1: T_M2_mpyd<0b00, 1, 1, 0>;
+
+//Rdd=mpyu(Rs.[HL],Rt.[HL])[:<<1]
+def M2_mpyud_hh_s0: T_M2_mpyd<0b11, 0, 0, 1>;
+def M2_mpyud_hl_s0: T_M2_mpyd<0b10, 0, 0, 1>;
+def M2_mpyud_lh_s0: T_M2_mpyd<0b01, 0, 0, 1>;
+def M2_mpyud_ll_s0: T_M2_mpyd<0b00, 0, 0, 1>;
+
+def M2_mpyud_hh_s1: T_M2_mpyd<0b11, 0, 1, 1>;
+def M2_mpyud_hl_s1: T_M2_mpyd<0b10, 0, 1, 1>;
+def M2_mpyud_lh_s1: T_M2_mpyd<0b01, 0, 1, 1>;
+def M2_mpyud_ll_s1: T_M2_mpyd<0b00, 0, 1, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template Class for xtype mpy:
+// Vector multiply
+// Complex multiply
+// multiply 32X32 and use full result
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_XTYPE_mpy64 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                     bit isSat, bit hasShift, bit isConj>
+   : MInst <(outs DoubleRegs:$Rdd),
+            (ins IntRegs:$Rs, IntRegs:$Rt),
+  "$Rdd = "#mnemonic#"($Rs, $Rt"#!if(isConj,"*)",")")
+                                #!if(hasShift,":<<1","")
+                                #!if(isSat,":sat",""),
+  [] > {
+    bits<5> Rdd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b0101;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rdd;
+  }
+
+//===----------------------------------------------------------------------===//
+// Template Class for xtype mpy with accumulation into 64-bit:
+// Vector multiply
+// Complex multiply
+// multiply 32X32 and use full result
+//===----------------------------------------------------------------------===//
+class T_XTYPE_mpy64_acc <string op1, string op2, bits<3> MajOp, bits<3> MinOp,
+                         bit isSat, bit hasShift, bit isConj>
+  : MInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rxx "#op2#"= "#op1#"($Rs, $Rt"#!if(isConj,"*)",")")
+                                   #!if(hasShift,":<<1","")
+                                   #!if(isSat,":sat",""),
+
+  [] , "$dst2 = $Rxx" > {
+    bits<5> Rxx;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b0111;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rxx;
+  }
+
+// MPY - Multiply and use full result
+// Rdd = mpy[u](Rs,Rt)
+def M2_dpmpyss_s0 : T_XTYPE_mpy64 < "mpy", 0b000, 0b000, 0, 0, 0>;
+def M2_dpmpyuu_s0 : T_XTYPE_mpy64 < "mpyu", 0b010, 0b000, 0, 0, 0>;
+
+// Rxx[+-]= mpy[u](Rs,Rt)
+def M2_dpmpyss_acc_s0 : T_XTYPE_mpy64_acc < "mpy",  "+", 0b000, 0b000, 0, 0, 0>;
+def M2_dpmpyss_nac_s0 : T_XTYPE_mpy64_acc < "mpy",  "-", 0b001, 0b000, 0, 0, 0>;
+def M2_dpmpyuu_acc_s0 : T_XTYPE_mpy64_acc < "mpyu", "+", 0b010, 0b000, 0, 0, 0>;
+def M2_dpmpyuu_nac_s0 : T_XTYPE_mpy64_acc < "mpyu", "-", 0b011, 0b000, 0, 0, 0>;
+
+// Complex multiply real or imaginary
+// Rxx=cmpy[ir](Rs,Rt)
+def M2_cmpyi_s0 : T_XTYPE_mpy64 < "cmpyi", 0b000, 0b001, 0, 0, 0>;
+def M2_cmpyr_s0 : T_XTYPE_mpy64 < "cmpyr", 0b000, 0b010, 0, 0, 0>;
+
+// Rxx+=cmpy[ir](Rs,Rt)
+def M2_cmaci_s0 : T_XTYPE_mpy64_acc < "cmpyi", "+", 0b000, 0b001, 0, 0, 0>;
+def M2_cmacr_s0 : T_XTYPE_mpy64_acc < "cmpyr", "+", 0b000, 0b010, 0, 0, 0>;
+
+// Complex multiply
+// Rdd=cmpy(Rs,Rt)[:<<]:sat
+def M2_cmpys_s0 : T_XTYPE_mpy64 < "cmpy", 0b000, 0b110, 1, 0, 0>;
+def M2_cmpys_s1 : T_XTYPE_mpy64 < "cmpy", 0b100, 0b110, 1, 1, 0>;
+
+// Rdd=cmpy(Rs,Rt*)[:<<]:sat
+def M2_cmpysc_s0 : T_XTYPE_mpy64 < "cmpy", 0b010, 0b110, 1, 0, 1>;
+def M2_cmpysc_s1 : T_XTYPE_mpy64 < "cmpy", 0b110, 0b110, 1, 1, 1>;
+
+// Rxx[-+]=cmpy(Rs,Rt)[:<<1]:sat
+def M2_cmacs_s0  : T_XTYPE_mpy64_acc < "cmpy", "+", 0b000, 0b110, 1, 0, 0>;
+def M2_cnacs_s0  : T_XTYPE_mpy64_acc < "cmpy", "-", 0b000, 0b111, 1, 0, 0>;
+def M2_cmacs_s1  : T_XTYPE_mpy64_acc < "cmpy", "+", 0b100, 0b110, 1, 1, 0>;
+def M2_cnacs_s1  : T_XTYPE_mpy64_acc < "cmpy", "-", 0b100, 0b111, 1, 1, 0>;
+
+// Rxx[-+]=cmpy(Rs,Rt*)[:<<1]:sat
+def M2_cmacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b010, 0b110, 1, 0, 1>;
+def M2_cnacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b010, 0b111, 1, 0, 1>;
+def M2_cmacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b110, 0b110, 1, 1, 1>;
+def M2_cnacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b110, 0b111, 1, 1, 1>;
+
+// Vector multiply halfwords
+// Rdd=vmpyh(Rs,Rt)[:<<]:sat
+//let Defs = [USR_OVF] in {
+  def M2_vmpy2s_s1 : T_XTYPE_mpy64 < "vmpyh", 0b100, 0b101, 1, 1, 0>;
+  def M2_vmpy2s_s0 : T_XTYPE_mpy64 < "vmpyh", 0b000, 0b101, 1, 0, 0>;
+//}
+
+// Rxx+=vmpyh(Rs,Rt)[:<<1][:sat]
+def M2_vmac2     : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b001, 0b001, 0, 0, 0>;
+def M2_vmac2s_s1 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b100, 0b101, 1, 1, 0>;
+def M2_vmac2s_s0 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b000, 0b101, 1, 0, 0>;
+
+def: Pat<(i64 (mul (i64 (anyext (i32 IntRegs:$src1))),
+                   (i64 (anyext (i32 IntRegs:$src2))))),
+         (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>;
+
+def: Pat<(i64 (mul (i64 (sext (i32 IntRegs:$src1))),
+                   (i64 (sext (i32 IntRegs:$src2))))),
+         (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>;
+
+def: Pat<(i64 (mul (is_sext_i32:$src1),
+                   (is_sext_i32:$src2))),
+         (M2_dpmpyss_s0 (LoReg DoubleRegs:$src1), (LoReg DoubleRegs:$src2))>;
 
 // Multiply and accumulate, use full result.
 // Rxx[+-]=mpy(Rs,Rt)
-// Rxx+=mpy(Rs,Rt)
-def MPY64_acc : MInst_acc<(outs DoubleRegs:$dst),
-            (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
-            "$dst += mpy($src2, $src3)",
-            [(set (i64 DoubleRegs:$dst),
-            (add (mul (i64 (sext (i32 IntRegs:$src2))),
-                      (i64 (sext (i32 IntRegs:$src3)))),
-                 (i64 DoubleRegs:$src1)))],
-            "$src1 = $dst">;
-
-// Rxx-=mpy(Rs,Rt)
-def MPY64_sub : MInst_acc<(outs DoubleRegs:$dst),
-            (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
-            "$dst -= mpy($src2, $src3)",
-            [(set (i64 DoubleRegs:$dst),
-                  (sub (i64 DoubleRegs:$src1),
-                       (mul (i64 (sext (i32 IntRegs:$src2))),
-                            (i64 (sext (i32 IntRegs:$src3))))))],
-            "$src1 = $dst">;
-
-// Rxx[+-]=mpyu(Rs,Rt)
-// Rxx+=mpyu(Rs,Rt)
-def MPYU64_acc : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                            IntRegs:$src2, IntRegs:$src3),
-             "$dst += mpyu($src2, $src3)",
-             [(set (i64 DoubleRegs:$dst),
-                   (add (mul (i64 (anyext (i32 IntRegs:$src2))),
-                             (i64 (anyext (i32 IntRegs:$src3)))),
-                        (i64 DoubleRegs:$src1)))], "$src1 = $dst">;
-
-// Rxx-=mpyu(Rs,Rt)
-def MPYU64_sub : MInst_acc<(outs DoubleRegs:$dst),
-            (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
-            "$dst -= mpyu($src2, $src3)",
-            [(set (i64 DoubleRegs:$dst),
-                  (sub (i64 DoubleRegs:$src1),
-                       (mul (i64 (anyext (i32 IntRegs:$src2))),
-                            (i64 (anyext (i32 IntRegs:$src3))))))],
-            "$src1 = $dst">;
-
-
-let InputType = "reg", CextOpcode = "ADD_acc" in
-def ADDrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
-                            IntRegs:$src2, IntRegs:$src3),
-             "$dst += add($src2, $src3)",
-             [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2),
-                                                 (i32 IntRegs:$src3)),
-                                            (i32 IntRegs:$src1)))],
-             "$src1 = $dst">, ImmRegRel;
-
-let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
-InputType = "imm", CextOpcode = "ADD_acc" in
-def ADDri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
-                            IntRegs:$src2, s8Ext:$src3),
-             "$dst += add($src2, #$src3)",
-             [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2),
-                                                 s8_16ExtPred:$src3),
-                                            (i32 IntRegs:$src1)))],
-             "$src1 = $dst">, ImmRegRel;
-
-let CextOpcode = "SUB_acc", InputType = "reg" in
-def SUBrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
-                            IntRegs:$src2, IntRegs:$src3),
-             "$dst -= add($src2, $src3)",
-             [(set (i32 IntRegs:$dst),
-                   (sub (i32 IntRegs:$src1), (add (i32 IntRegs:$src2),
-                                                  (i32 IntRegs:$src3))))],
-             "$src1 = $dst">, ImmRegRel;
-
-let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
-CextOpcode = "SUB_acc", InputType = "imm" in
-def SUBri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
-                            IntRegs:$src2, s8Ext:$src3),
-             "$dst -= add($src2, #$src3)",
-             [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1),
-                                            (add (i32 IntRegs:$src2),
-                                                 s8_16ExtPred:$src3)))],
-             "$src1 = $dst">, ImmRegRel;
+
+def: Pat<(i64 (add (i64 DoubleRegs:$src1),
+                   (mul (i64 (sext (i32 IntRegs:$src2))),
+                        (i64 (sext (i32 IntRegs:$src3)))))),
+         (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(i64 (sub (i64 DoubleRegs:$src1),
+                   (mul (i64 (sext (i32 IntRegs:$src2))),
+                        (i64 (sext (i32 IntRegs:$src3)))))),
+         (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(i64 (add (i64 DoubleRegs:$src1),
+                   (mul (i64 (anyext (i32 IntRegs:$src2))),
+                        (i64 (anyext (i32 IntRegs:$src3)))))),
+         (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(i64 (add (i64 DoubleRegs:$src1),
+                   (mul (i64 (zext (i32 IntRegs:$src2))),
+                        (i64 (zext (i32 IntRegs:$src3)))))),
+         (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(i64 (sub (i64 DoubleRegs:$src1),
+                   (mul (i64 (anyext (i32 IntRegs:$src2))),
+                        (i64 (anyext (i32 IntRegs:$src3)))))),
+         (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(i64 (sub (i64 DoubleRegs:$src1),
+                   (mul (i64 (zext (i32 IntRegs:$src2))),
+                        (i64 (zext (i32 IntRegs:$src3)))))),
+         (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
 
 //===----------------------------------------------------------------------===//
 // MTYPE/MPYH -
@@ -1464,321 +3250,1134 @@ def SUBri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
 //===----------------------------------------------------------------------===//
 ///
 // Store doubleword.
-
 //===----------------------------------------------------------------------===//
-// Post increment store
+// Template class for non-predicated post increment stores with immediate offset
 //===----------------------------------------------------------------------===//
+let isPredicable = 1, hasSideEffects = 0, addrMode = PostInc in
+class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+                 bits<4> MajOp, bit isHalf >
+  : STInst <(outs IntRegs:$_dst_),
+            (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
+  mnemonic#"($src1++#$offset) = $src2"#!if(isHalf, ".h", ""),
+  [], "$src1 = $_dst_" >,
+  AddrModeRel {
+    bits<5> src1;
+    bits<5> src2;
+    bits<7> offset;
+    bits<4> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0})));
+    let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
+
+    let IClass = 0b1010;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = src2;
+    let Inst{7}     = 0b0;
+    let Inst{6-3}   = offsetBits;
+    let Inst{1}     = 0b0;
+  }
 
-multiclass ST_PostInc_Pbase<string mnemonic, RegisterClass RC, Operand ImmOp,
-                            bit isNot, bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : STInst2PI<(outs IntRegs:$dst),
+//===----------------------------------------------------------------------===//
+// Template class for predicated post increment stores with immediate offset
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
+class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+                      bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew >
+  : STInst <(outs IntRegs:$_dst_),
             (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#mnemonic#"($src2++#$offset) = $src3",
-            [],
-            "$src2 = $dst">;
-}
+  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($src2++#$offset) = $src3"#!if(isHalf, ".h", ""),
+  [], "$src2 = $_dst_" >,
+  AddrModeRel {
+    bits<2> src1;
+    bits<5> src2;
+    bits<7> offset;
+    bits<5> src3;
+    bits<4> offsetBits;
 
-multiclass ST_PostInc_Pred<string mnemonic, RegisterClass RC,
-                           Operand ImmOp, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 0>;
-    // Predicate new
-    let Predicates = [HasV4T], validSubTargets = HasV4SubT in
-    defm _cdn#NAME#_V4 : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 1>;
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0})));
+
+    let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isPredNot;
+
+    let IClass = 0b1010;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13} = 0b1;
+    let Inst{12-8} = src3;
+    let Inst{7} = isPredNew;
+    let Inst{6-3} = offsetBits;
+    let Inst{2} = isPredNot;
+    let Inst{1-0} = src1;
   }
-}
 
-let hasCtrlDep = 1, isNVStorable = 1, neverHasSideEffects = 1 in
 multiclass ST_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
-                      Operand ImmOp> {
+                      Operand ImmOp, bits<4> MajOp, bit isHalf = 0 > {
 
-  let hasCtrlDep = 1, BaseOpcode = "POST_"#BaseOp in {
-    let isPredicable = 1 in
-    def NAME : STInst2PI<(outs IntRegs:$dst),
-                (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
-                mnemonic#"($src1++#$offset) = $src2",
-                [],
-                "$src1 = $dst">;
-
-    let isPredicated = 1 in {
-      defm Pt : ST_PostInc_Pred<mnemonic, RC, ImmOp, 0 >;
-      defm NotPt : ST_PostInc_Pred<mnemonic, RC, ImmOp, 1 >;
-    }
+  let BaseOpcode = "POST_"#BaseOp in {
+    def S2_#NAME#_pi : T_store_pi <mnemonic, RC, ImmOp, MajOp, isHalf>;
+
+    // Predicated
+    def S2_p#NAME#t_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 0, 0>;
+    def S2_p#NAME#f_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 1, 0>;
+
+    // Predicated new
+    def S2_p#NAME#tnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
+                                          isHalf, 0, 1>;
+    def S2_p#NAME#fnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
+                                          isHalf, 1, 1>;
   }
 }
 
-defm POST_STbri: ST_PostInc <"memb", "STrib", IntRegs, s4_0Imm>, AddrModeRel;
-defm POST_SThri: ST_PostInc <"memh", "STrih", IntRegs, s4_1Imm>, AddrModeRel;
-defm POST_STwri: ST_PostInc <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
+let accessSize = ByteAccess in
+defm storerb: ST_PostInc <"memb", "STrib", IntRegs, s4_0Imm, 0b1000>;
 
-let isNVStorable = 0 in
-defm POST_STdri: ST_PostInc <"memd", "STrid", DoubleRegs, s4_3Imm>, AddrModeRel;
+let accessSize = HalfWordAccess in
+defm storerh: ST_PostInc <"memh", "STrih", IntRegs, s4_1Imm, 0b1010>;
 
-def : Pat<(post_truncsti8 (i32 IntRegs:$src1), IntRegs:$src2,
-                           s4_3ImmPred:$offset),
-          (POST_STbri IntRegs:$src2, s4_0ImmPred:$offset, IntRegs:$src1)>;
+let accessSize = WordAccess in
+defm storeri: ST_PostInc <"memw", "STriw", IntRegs, s4_2Imm, 0b1100>;
 
-def : Pat<(post_truncsti16 (i32 IntRegs:$src1), IntRegs:$src2,
-                            s4_3ImmPred:$offset),
-          (POST_SThri IntRegs:$src2, s4_1ImmPred:$offset, IntRegs:$src1)>;
+let accessSize = DoubleWordAccess in
+defm storerd: ST_PostInc <"memd", "STrid", DoubleRegs, s4_3Imm, 0b1110>;
 
-def : Pat<(post_store (i32 IntRegs:$src1), IntRegs:$src2, s4_2ImmPred:$offset),
-          (POST_STwri IntRegs:$src2, s4_1ImmPred:$offset, IntRegs:$src1)>;
+let accessSize = HalfWordAccess, isNVStorable = 0 in
+defm storerf: ST_PostInc <"memh", "STrih_H", IntRegs, s4_1Imm, 0b1011, 1>;
 
-def : Pat<(post_store (i64 DoubleRegs:$src1), IntRegs:$src2,
-                       s4_3ImmPred:$offset),
-          (POST_STdri IntRegs:$src2, s4_3ImmPred:$offset, DoubleRegs:$src1)>;
+class Storepi_pat<PatFrag Store, PatFrag Value, PatFrag Offset,
+                  InstHexagon MI>
+  : Pat<(Store Value:$src1, I32:$src2, Offset:$offset),
+        (MI I32:$src2, imm:$offset, Value:$src1)>;
+
+def: Storepi_pat<post_truncsti8,  I32, s4_0ImmPred, S2_storerb_pi>;
+def: Storepi_pat<post_truncsti16, I32, s4_1ImmPred, S2_storerh_pi>;
+def: Storepi_pat<post_store,      I32, s4_2ImmPred, S2_storeri_pi>;
+def: Storepi_pat<post_store,      I64, s4_3ImmPred, S2_storerd_pi>;
 
 //===----------------------------------------------------------------------===//
-// multiclass for the store instructions with MEMri operand.
+// Template class for post increment stores with register offset.
 //===----------------------------------------------------------------------===//
-multiclass ST_MEMri_Pbase<string mnemonic, RegisterClass RC, bit isNot,
-                          bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : STInst2<(outs),
-            (ins PredRegs:$src1, MEMri:$addr, RC: $src2),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#mnemonic#"($addr) = $src2",
-            []>;
-}
+let isNVStorable = 1 in
+class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp,
+                     MemAccessSize AccessSz, bit isHalf = 0>
+  : STInst <(outs IntRegs:$_dst_),
+            (ins IntRegs:$src1, ModRegs:$src2, RC:$src3),
+  mnemonic#"($src1++$src2) = $src3"#!if(isHalf, ".h", ""),
+  [], "$src1 = $_dst_" > {
+    bits<5> src1;
+    bits<1> src2;
+    bits<5> src3;
+    let accessSize = AccessSz;
+
+    let IClass = 0b1010;
 
-multiclass ST_MEMri_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ST_MEMri_Pbase<mnemonic, RC, PredNot, 0>;
+    let Inst{27-24} = 0b1101;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13} = src2;
+    let Inst{12-8} = src3;
+    let Inst{7} = 0b0;
+  }
 
-    // Predicate new
-    let validSubTargets = HasV4SubT, Predicates = [HasV4T] in
-    defm _cdn#NAME#_V4 : ST_MEMri_Pbase<mnemonic, RC, PredNot, 1>;
+def S2_storerb_pr : T_store_pr<"memb", IntRegs, 0b000, ByteAccess>;
+def S2_storerh_pr : T_store_pr<"memh", IntRegs, 0b010, HalfWordAccess>;
+def S2_storeri_pr : T_store_pr<"memw", IntRegs, 0b100, WordAccess>;
+def S2_storerd_pr : T_store_pr<"memd", DoubleRegs, 0b110, DoubleWordAccess>;
+
+def S2_storerf_pr : T_store_pr<"memh", IntRegs, 0b011, HalfWordAccess, 1>;
+
+let opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
+class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp,
+                 bits<3>MajOp, bit isH = 0>
+  : STInst <(outs),
+            (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+  mnemonic#"($src1+#$src2) = $src3"#!if(isH,".h","")>,
+  AddrModeRel, ImmRegRel {
+    bits<5> src1;
+    bits<14> src2; // Actual address offset
+    bits<5> src3;
+    bits<11> offsetBits; // Represents offset encoding
+
+    string ImmOpStr = !cast<string>(ImmOp);
+
+    let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
+                       !if (!eq(ImmOpStr, "s11_2Ext"), 13,
+                       !if (!eq(ImmOpStr, "s11_1Ext"), 12,
+                                        /* s11_0Ext */ 11)));
+    let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), src2{13-3},
+                     !if (!eq(ImmOpStr, "s11_2Ext"), src2{12-2},
+                     !if (!eq(ImmOpStr, "s11_1Ext"), src2{11-1},
+                                      /* s11_0Ext */ src2{10-0})));
+    let IClass = 0b1010;
+
+    let Inst{27} = 0b0;
+    let Inst{26-25} = offsetBits{10-9};
+    let Inst{24} = 0b1;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13} = offsetBits{8};
+    let Inst{12-8} = src3;
+    let Inst{7-0} = offsetBits{7-0};
   }
-}
 
-let isExtendable = 1, isNVStorable = 1, neverHasSideEffects = 1 in
-multiclass ST_MEMri<string mnemonic, string CextOp, RegisterClass RC,
-                    bits<5> ImmBits, bits<5> PredImmBits> {
+let opExtendable = 2, isPredicated = 1 in
+class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp,
+                   bits<3>MajOp, bit PredNot, bit isPredNew, bit isH = 0>
+  : STInst <(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+  !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($src2+#$src3) = $src4"#!if(isH,".h",""),
+  [],"",V2LDST_tc_st_SLOT01 >,
+   AddrModeRel, ImmRegRel {
+    bits<2> src1;
+    bits<5> src2;
+    bits<9> src3; // Actual address offset
+    bits<5> src4;
+    bits<6> offsetBits; // Represents offset encoding
+
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = PredNot;
 
-  let CextOpcode = CextOp, BaseOpcode = CextOp in {
-    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
-         isPredicable = 1 in
-    def NAME : STInst2<(outs),
-            (ins MEMri:$addr, RC:$src),
-            mnemonic#"($addr) = $src",
-            []>;
-
-    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits,
-        isPredicated = 1 in {
-      defm Pt : ST_MEMri_Pred<mnemonic, RC, 0>;
-      defm NotPt : ST_MEMri_Pred<mnemonic, RC, 1>;
-    }
+    string ImmOpStr = !cast<string>(ImmOp);
+    let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
+                       !if (!eq(ImmOpStr, "u6_2Ext"), 8,
+                       !if (!eq(ImmOpStr, "u6_1Ext"), 7,
+                                        /* u6_0Ext */ 6)));
+    let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), src3{8-3},
+                     !if (!eq(ImmOpStr, "u6_2Ext"), src3{7-2},
+                     !if (!eq(ImmOpStr, "u6_1Ext"), src3{6-1},
+                                      /* u6_0Ext */ src3{5-0})));
+     let IClass = 0b0100;
+
+    let Inst{27} = 0b0;
+    let Inst{26} = PredNot;
+    let Inst{25} = isPredNew;
+    let Inst{24} = 0b0;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13} = offsetBits{5};
+    let Inst{12-8} = src4;
+    let Inst{7-3} = offsetBits{4-0};
+    let Inst{1-0} = src1;
+  }
+
+let isExtendable = 1, isNVStorable = 1, hasSideEffects = 0 in
+multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
+                 Operand ImmOp, Operand predImmOp, bits<3> MajOp, bit isH = 0> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+    def S2_#NAME#_io : T_store_io <mnemonic, RC, ImmOp, MajOp, isH>;
+
+    // Predicated
+    def S2_p#NAME#t_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 0, 0, isH>;
+    def S2_p#NAME#f_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 1, 0, isH>;
+
+    // Predicated new
+    def S4_p#NAME#tnew_io : T_pstore_io <mnemonic, RC, predImmOp,
+                                         MajOp, 0, 1, isH>;
+    def S4_p#NAME#fnew_io : T_pstore_io <mnemonic, RC, predImmOp,
+                                         MajOp, 1, 1, isH>;
   }
 }
 
-let addrMode = BaseImmOffset, isMEMri = "true" in {
+let addrMode = BaseImmOffset, InputType = "imm" in {
   let accessSize = ByteAccess in
-    defm STrib: ST_MEMri < "memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
+    defm storerb: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext, u6_0Ext, 0b000>;
+
+  let accessSize = HalfWordAccess, opExtentAlign = 1 in
+    defm storerh: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext, u6_1Ext, 0b010>;
+
+  let accessSize = WordAccess, opExtentAlign = 2 in
+    defm storeri: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext, u6_2Ext, 0b100>;
 
-  let accessSize = HalfWordAccess in
-    defm STrih: ST_MEMri < "memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
+  let accessSize = DoubleWordAccess, isNVStorable = 0, opExtentAlign = 3 in
+    defm storerd: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
+                            u6_3Ext, 0b110>;
 
-  let accessSize = WordAccess in
-    defm STriw: ST_MEMri < "memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
+  let accessSize = HalfWordAccess, opExtentAlign = 1 in
+    defm storerf: ST_Idxd < "memh", "STrif", IntRegs, s11_1Ext,
+                            u6_1Ext, 0b011, 1>;
+}
 
-  let accessSize = DoubleWordAccess, isNVStorable = 0 in
-    defm STrid: ST_MEMri < "memd", "STrid", DoubleRegs, 14, 9>, AddrModeRel;
+// Patterns for generating stores, where the address takes different forms:
+// - frameindex,,
+// - base + offset,
+// - simple (base address without offset).
+// These would usually be used together (via Storex_pat defined below), but
+// in some cases one may want to apply different properties (such as
+// AddedComplexity) to the individual patterns.
+class Storex_fi_pat<PatFrag Store, PatFrag Value, InstHexagon MI>
+  : Pat<(Store Value:$Rs, AddrFI:$fi), (MI AddrFI:$fi, 0, Value:$Rs)>;
+class Storex_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
+                     InstHexagon MI>
+  : Pat<(Store Value:$Rt, (add (i32 IntRegs:$Rs), ImmPred:$Off)),
+        (MI IntRegs:$Rs, imm:$Off, Value:$Rt)>;
+class Storex_simple_pat<PatFrag Store, PatFrag Value, InstHexagon MI>
+  : Pat<(Store Value:$Rt, (i32 IntRegs:$Rs)),
+        (MI IntRegs:$Rs, 0, Value:$Rt)>;
+
+// Patterns for generating stores, where the address takes different forms,
+// and where the value being stored is transformed through the value modifier
+// ValueMod.  The address forms are same as above.
+class Storexm_fi_pat<PatFrag Store, PatFrag Value, PatFrag ValueMod,
+                     InstHexagon MI>
+  : Pat<(Store Value:$Rs, AddrFI:$fi),
+        (MI AddrFI:$fi, 0, (ValueMod Value:$Rs))>;
+class Storexm_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
+                      PatFrag ValueMod, InstHexagon MI>
+  : Pat<(Store Value:$Rt, (add (i32 IntRegs:$Rs), ImmPred:$Off)),
+        (MI IntRegs:$Rs, imm:$Off, (ValueMod Value:$Rt))>;
+class Storexm_simple_pat<PatFrag Store, PatFrag Value, PatFrag ValueMod,
+                         InstHexagon MI>
+  : Pat<(Store Value:$Rt, (i32 IntRegs:$Rs)),
+        (MI IntRegs:$Rs, 0, (ValueMod Value:$Rt))>;
+
+multiclass Storex_pat<PatFrag Store, PatFrag Value, PatLeaf ImmPred,
+                      InstHexagon MI> {
+  def: Storex_fi_pat  <Store, Value, MI>;
+  def: Storex_add_pat <Store, Value, ImmPred, MI>;
 }
 
-def : Pat<(truncstorei8 (i32 IntRegs:$src1), ADDRriS11_0:$addr),
-          (STrib ADDRriS11_0:$addr, (i32 IntRegs:$src1))>;
+multiclass Storexm_pat<PatFrag Store, PatFrag Value, PatLeaf ImmPred,
+                       PatFrag ValueMod, InstHexagon MI> {
+  def: Storexm_fi_pat  <Store, Value,          ValueMod, MI>;
+  def: Storexm_add_pat <Store, Value, ImmPred, ValueMod, MI>;
+}
 
-def : Pat<(truncstorei16 (i32 IntRegs:$src1), ADDRriS11_1:$addr),
-          (STrih ADDRriS11_1:$addr, (i32 IntRegs:$src1))>;
+// Regular stores in the DAG have two operands: value and address.
+// Atomic stores also have two, but they are reversed: address, value.
+// To use atomic stores with the patterns, they need to have their operands
+// swapped. This relies on the knowledge that the F.Fragment uses names
+// "ptr" and "val".
+class SwapSt<PatFrag F>
+  : PatFrag<(ops node:$val, node:$ptr), F.Fragment>;
 
-def : Pat<(store (i32 IntRegs:$src1), ADDRriS11_2:$addr),
-          (STriw ADDRriS11_2:$addr, (i32 IntRegs:$src1))>;
+let AddedComplexity = 20 in {
+  defm: Storex_pat<truncstorei8,    I32, s11_0ExtPred, S2_storerb_io>;
+  defm: Storex_pat<truncstorei16,   I32, s11_1ExtPred, S2_storerh_io>;
+  defm: Storex_pat<store,           I32, s11_2ExtPred, S2_storeri_io>;
+  defm: Storex_pat<store,           I64, s11_3ExtPred, S2_storerd_io>;
+
+  defm: Storex_pat<SwapSt<atomic_store_8>,  I32, s11_0ExtPred, S2_storerb_io>;
+  defm: Storex_pat<SwapSt<atomic_store_16>, I32, s11_1ExtPred, S2_storerh_io>;
+  defm: Storex_pat<SwapSt<atomic_store_32>, I32, s11_2ExtPred, S2_storeri_io>;
+  defm: Storex_pat<SwapSt<atomic_store_64>, I64, s11_3ExtPred, S2_storerd_io>;
+}
 
-def : Pat<(store (i64 DoubleRegs:$src1), ADDRriS11_3:$addr),
-          (STrid ADDRriS11_3:$addr, (i64 DoubleRegs:$src1))>;
+// Simple patterns should be tried with the least priority.
+def: Storex_simple_pat<truncstorei8,    I32, S2_storerb_io>;
+def: Storex_simple_pat<truncstorei16,   I32, S2_storerh_io>;
+def: Storex_simple_pat<store,           I32, S2_storeri_io>;
+def: Storex_simple_pat<store,           I64, S2_storerd_io>;
 
+def: Storex_simple_pat<SwapSt<atomic_store_8>,  I32, S2_storerb_io>;
+def: Storex_simple_pat<SwapSt<atomic_store_16>, I32, S2_storerh_io>;
+def: Storex_simple_pat<SwapSt<atomic_store_32>, I32, S2_storeri_io>;
+def: Storex_simple_pat<SwapSt<atomic_store_64>, I64, S2_storerd_io>;
+
+let AddedComplexity = 20 in {
+  defm: Storexm_pat<truncstorei8,  I64, s11_0ExtPred, LoReg, S2_storerb_io>;
+  defm: Storexm_pat<truncstorei16, I64, s11_1ExtPred, LoReg, S2_storerh_io>;
+  defm: Storexm_pat<truncstorei32, I64, s11_2ExtPred, LoReg, S2_storeri_io>;
+}
+
+def: Storexm_simple_pat<truncstorei8,  I64, LoReg, S2_storerb_io>;
+def: Storexm_simple_pat<truncstorei16, I64, LoReg, S2_storerh_io>;
+def: Storexm_simple_pat<truncstorei32, I64, LoReg, S2_storeri_io>;
+
+// Store predicate.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_pred : STInst<(outs),
+      (ins IntRegs:$addr, s11_2Ext:$off, PredRegs:$src1),
+      ".error \"should not emit\"", []>;
+
+// S2_allocframe: Allocate stack frame.
+let Defs = [R29, R30], Uses = [R29, R31, R30],
+    hasSideEffects = 0, accessSize = DoubleWordAccess in
+def S2_allocframe: ST0Inst <
+  (outs), (ins u11_3Imm:$u11_3),
+  "allocframe(#$u11_3)" > {
+    bits<14> u11_3;
+
+    let IClass = 0b1010;
+    let Inst{27-16} = 0b000010011101;
+    let Inst{13-11} = 0b000;
+    let Inst{10-0} = u11_3{13-3};
+  }
+
+// S2_storer[bhwdf]_pci: Store byte/half/word/double.
+// S2_storer[bhwdf]_pci -> S2_storerbnew_pci
+let Uses = [CS], isNVStorable = 1 in
+class T_store_pci <string mnemonic, RegisterClass RC,
+                         Operand Imm, bits<4>MajOp,
+                         MemAccessSize AlignSize, string RegSrc = "Rt">
+  : STInst <(outs IntRegs:$_dst_),
+  (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, RC:$Rt),
+  #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $"#RegSrc#"",
+  [] ,
+  "$Rz = $_dst_" > {
+    bits<5> Rz;
+    bits<7> offset;
+    bits<1> Mu;
+    bits<5> Rt;
+    let accessSize = AlignSize;
+
+    let IClass = 0b1010;
+    let Inst{27-25} = 0b100;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{12-8} = Rt;
+    let Inst{7} = 0b0;
+    let Inst{6-3} =
+      !if (!eq(!cast<string>(AlignSize), "DoubleWordAccess"), offset{6-3},
+      !if (!eq(!cast<string>(AlignSize), "WordAccess"),       offset{5-2},
+      !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"),   offset{4-1},
+                                       /* ByteAccess */       offset{3-0})));
+    let Inst{1} = 0b0;
+  }
+
+def S2_storerb_pci : T_store_pci<"memb", IntRegs, s4_0Imm, 0b1000,
+                                        ByteAccess>;
+def S2_storerh_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1010,
+                                        HalfWordAccess>;
+def S2_storerf_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1011,
+                                        HalfWordAccess, "Rt.h">;
+def S2_storeri_pci : T_store_pci<"memw", IntRegs, s4_2Imm, 0b1100,
+                                        WordAccess>;
+def S2_storerd_pci : T_store_pci<"memd", DoubleRegs, s4_3Imm, 0b1110,
+                                        DoubleWordAccess>;
+
+let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4 in
+class T_storenew_pci <string mnemonic, Operand Imm,
+                             bits<2>MajOp, MemAccessSize AlignSize>
+  : NVInst < (outs IntRegs:$_dst_),
+  (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, IntRegs:$Nt),
+  #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $Nt.new",
+  [],
+  "$Rz = $_dst_"> {
+    bits<5> Rz;
+    bits<6> offset;
+    bits<1> Mu;
+    bits<3> Nt;
+
+    let accessSize = AlignSize;
+
+    let IClass = 0b1010;
+    let Inst{27-21} = 0b1001101;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8} = Nt;
+    let Inst{7} = 0b0;
+    let Inst{6-3} =
+      !if (!eq(!cast<string>(AlignSize), "WordAccess"),     offset{5-2},
+      !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"), offset{4-1},
+                                       /* ByteAccess */     offset{3-0}));
+    let Inst{1} = 0b0;
+  }
+
+def S2_storerbnew_pci : T_storenew_pci <"memb", s4_0Imm, 0b00, ByteAccess>;
+def S2_storerhnew_pci : T_storenew_pci <"memh", s4_1Imm, 0b01, HalfWordAccess>;
+def S2_storerinew_pci : T_storenew_pci <"memw", s4_2Imm, 0b10, WordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Circular stores - Pseudo
+//
+// Please note that the input operand order in the pseudo instructions
+// doesn't match with the real instructions. Pseudo instructions operand
+// order should mimics the ordering in the intrinsics.
+//===----------------------------------------------------------------------===//
+let isCodeGenOnly = 1, mayStore = 1, hasSideEffects = 0, isPseudo = 1 in
+class T_store_pci_pseudo <string opc, RegisterClass RC>
+  : STInstPI<(outs IntRegs:$_dst_),
+             (ins IntRegs:$src1, RC:$src2, IntRegs:$src3, s4Imm:$src4),
+  ".error \""#opc#"($src1++#$src4:circ($src3)) = $src2\"",
+  [], "$_dst_ = $src1">;
+
+def S2_storerb_pci_pseudo : T_store_pci_pseudo <"memb", IntRegs>;
+def S2_storerh_pci_pseudo : T_store_pci_pseudo <"memh", IntRegs>;
+def S2_storerf_pci_pseudo : T_store_pci_pseudo <"memh", IntRegs>;
+def S2_storeri_pci_pseudo : T_store_pci_pseudo <"memw", IntRegs>;
+def S2_storerd_pci_pseudo : T_store_pci_pseudo <"memd", DoubleRegs>;
 
 //===----------------------------------------------------------------------===//
-// multiclass for the store instructions with base+immediate offset
-// addressing mode
+// Circular stores with auto-increment register
 //===----------------------------------------------------------------------===//
-multiclass ST_Idxd_Pbase<string mnemonic, RegisterClass RC, Operand predImmOp,
-                        bit isNot, bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : STInst2<(outs),
-            (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC: $src4),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#mnemonic#"($src2+#$src3) = $src4",
-            []>;
+let Uses = [CS], isNVStorable = 1 in
+class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp,
+                               MemAccessSize AlignSize, string RegSrc = "Rt">
+  : STInst <(outs IntRegs:$_dst_),
+  (ins IntRegs:$Rz, ModRegs:$Mu, RC:$Rt),
+  #mnemonic#"($Rz ++ I:circ($Mu)) = $"#RegSrc#"",
+  [],
+  "$Rz = $_dst_" > {
+    bits<5> Rz;
+    bits<1> Mu;
+    bits<5> Rt;
+
+    let accessSize = AlignSize;
+
+    let IClass = 0b1010;
+    let Inst{27-25} = 0b100;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{12-8} = Rt;
+    let Inst{7} = 0b0;
+    let Inst{1} = 0b1;
+  }
+
+def S2_storerb_pcr : T_store_pcr<"memb", IntRegs, 0b1000, ByteAccess>;
+def S2_storerh_pcr : T_store_pcr<"memh", IntRegs, 0b1010, HalfWordAccess>;
+def S2_storeri_pcr : T_store_pcr<"memw", IntRegs, 0b1100, WordAccess>;
+def S2_storerd_pcr : T_store_pcr<"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
+def S2_storerf_pcr : T_store_pcr<"memh", IntRegs, 0b1011,
+                                 HalfWordAccess, "Rt.h">;
+
+//===----------------------------------------------------------------------===//
+// Circular .new stores with auto-increment register
+//===----------------------------------------------------------------------===//
+let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3 in
+class T_storenew_pcr <string mnemonic, bits<2>MajOp,
+                                   MemAccessSize AlignSize>
+  : NVInst <(outs IntRegs:$_dst_),
+  (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
+  #mnemonic#"($Rz ++ I:circ($Mu)) = $Nt.new" ,
+  [] ,
+  "$Rz = $_dst_"> {
+    bits<5> Rz;
+    bits<1> Mu;
+    bits<3> Nt;
+
+    let accessSize = AlignSize;
+
+    let IClass = 0b1010;
+    let Inst{27-21} = 0b1001101;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8} = Nt;
+    let Inst{7} = 0b0;
+    let Inst{1} = 0b1;
+  }
+
+def S2_storerbnew_pcr : T_storenew_pcr <"memb", 0b00, ByteAccess>;
+def S2_storerhnew_pcr : T_storenew_pcr <"memh", 0b01, HalfWordAccess>;
+def S2_storerinew_pcr : T_storenew_pcr <"memw", 0b10, WordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Bit-reversed stores with auto-increment register
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_store_pbr<string mnemonic, RegisterClass RC,
+                            MemAccessSize addrSize, bits<3> majOp,
+                            bit isHalf = 0>
+  : STInst
+    <(outs IntRegs:$_dst_),
+     (ins IntRegs:$Rz, ModRegs:$Mu, RC:$src),
+     #mnemonic#"($Rz ++ $Mu:brev) = $src"#!if (!eq(isHalf, 1), ".h", ""),
+     [], "$Rz = $_dst_" > {
+
+      let accessSize = addrSize;
+
+      bits<5> Rz;
+      bits<1> Mu;
+      bits<5> src;
+
+      let IClass = 0b1010;
+
+      let Inst{27-24} = 0b1111;
+      let Inst{23-21} = majOp;
+      let Inst{7} = 0b0;
+      let Inst{20-16} = Rz;
+      let Inst{13} = Mu;
+      let Inst{12-8} = src;
+    }
+
+let isNVStorable = 1 in {
+  let BaseOpcode = "S2_storerb_pbr" in
+  def S2_storerb_pbr : T_store_pbr<"memb", IntRegs, ByteAccess,
+                                             0b000>, NewValueRel;
+  let BaseOpcode = "S2_storerh_pbr" in
+  def S2_storerh_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess,
+                                             0b010>, NewValueRel;
+  let BaseOpcode = "S2_storeri_pbr" in
+  def S2_storeri_pbr : T_store_pbr<"memw", IntRegs, WordAccess,
+                                             0b100>, NewValueRel;
 }
 
-multiclass ST_Idxd_Pred<string mnemonic, RegisterClass RC, Operand predImmOp,
-                        bit PredNot> {
-  let isPredicatedFalse = PredNot, isPredicated = 1 in {
-    defm _c#NAME : ST_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 0>;
+def S2_storerf_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess, 0b011, 1>;
+def S2_storerd_pbr : T_store_pbr<"memd", DoubleRegs, DoubleWordAccess, 0b110>;
 
-    // Predicate new
-    let validSubTargets = HasV4SubT, Predicates = [HasV4T] in
-    defm _cdn#NAME#_V4 : ST_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 1>;
+//===----------------------------------------------------------------------===//
+// Bit-reversed .new stores with auto-increment register
+//===----------------------------------------------------------------------===//
+let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
+    hasSideEffects = 0 in
+class T_storenew_pbr<string mnemonic, MemAccessSize addrSize, bits<2> majOp>
+  : NVInst <(outs IntRegs:$_dst_),
+            (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
+     #mnemonic#"($Rz ++ $Mu:brev) = $Nt.new", [],
+     "$Rz = $_dst_">, NewValueRel {
+    let accessSize = addrSize;
+    bits<5> Rz;
+    bits<1> Mu;
+    bits<3> Nt;
+
+    let IClass = 0b1010;
+
+    let Inst{27-21} = 0b1111101;
+    let Inst{12-11} = majOp;
+    let Inst{7} = 0b0;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{10-8} = Nt;
   }
+
+let BaseOpcode = "S2_storerb_pbr" in
+def S2_storerbnew_pbr : T_storenew_pbr<"memb", ByteAccess, 0b00>;
+
+let BaseOpcode = "S2_storerh_pbr" in
+def S2_storerhnew_pbr : T_storenew_pbr<"memh", HalfWordAccess, 0b01>;
+
+let BaseOpcode = "S2_storeri_pbr" in
+def S2_storerinew_pbr : T_storenew_pbr<"memw", WordAccess, 0b10>;
+
+//===----------------------------------------------------------------------===//
+// Bit-reversed stores - Pseudo
+//
+// Please note that the input operand order in the pseudo instructions
+// doesn't match with the real instructions. Pseudo instructions operand
+// order should mimics the ordering in the intrinsics.
+//===----------------------------------------------------------------------===//
+let isCodeGenOnly = 1,  mayStore = 1, hasSideEffects = 0, isPseudo = 1 in
+class T_store_pbr_pseudo <string opc, RegisterClass RC>
+  : STInstPI<(outs IntRegs:$_dst_),
+             (ins IntRegs:$src1, RC:$src2, IntRegs:$src3),
+  ".error \""#opc#"($src1++$src3:brev) = $src2\"",
+  [], "$_dst_ = $src1">;
+
+def S2_storerb_pbr_pseudo : T_store_pbr_pseudo <"memb", IntRegs>;
+def S2_storerh_pbr_pseudo : T_store_pbr_pseudo <"memh", IntRegs>;
+def S2_storeri_pbr_pseudo : T_store_pbr_pseudo <"memw", IntRegs>;
+def S2_storerf_pbr_pseudo : T_store_pbr_pseudo <"memh", IntRegs>;
+def S2_storerd_pbr_pseudo : T_store_pbr_pseudo <"memd", DoubleRegs>;
+
+//===----------------------------------------------------------------------===//
+// ST -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Template class for S_2op instructions.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_S2op_1 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
+                RegisterClass RCIn, bits<2> MajOp, bits<3> MinOp, bit isSat>
+  : SInst <(outs RCOut:$dst), (ins RCIn:$src),
+  "$dst = "#mnemonic#"($src)"#!if(isSat, ":sat", ""),
+  [], "", S_2op_tc_1_SLOT23 > {
+    bits<5> dst;
+    bits<5> src;
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = RegTyBits;
+    let Inst{23-22} = MajOp;
+    let Inst{21} = 0b0;
+    let Inst{20-16} = src;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = dst;
+  }
+
+class T_S2op_1_di <string mnemonic, bits<2> MajOp, bits<3> MinOp>
+  : T_S2op_1 <mnemonic, 0b0100, DoubleRegs, IntRegs, MajOp, MinOp, 0>;
+
+let hasNewValue = 1 in
+class T_S2op_1_id <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
+  : T_S2op_1 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, isSat>;
+
+let hasNewValue = 1 in
+class T_S2op_1_ii <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
+  : T_S2op_1 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp, isSat>;
+
+// Vector sign/zero extend
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+  def S2_vsxtbh : T_S2op_1_di <"vsxtbh", 0b00, 0b000>;
+  def S2_vsxthw : T_S2op_1_di <"vsxthw", 0b00, 0b100>;
+  def S2_vzxtbh : T_S2op_1_di <"vzxtbh", 0b00, 0b010>;
+  def S2_vzxthw : T_S2op_1_di <"vzxthw", 0b00, 0b110>;
 }
 
-let isExtendable = 1, isNVStorable = 1, neverHasSideEffects = 1 in
-multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
-                   Operand ImmOp, Operand predImmOp, bits<5> ImmBits,
-                   bits<5> PredImmBits> {
+// Vector splat bytes/halfwords
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+  def S2_vsplatrb : T_S2op_1_ii <"vsplatb", 0b01, 0b111>;
+  def S2_vsplatrh : T_S2op_1_di <"vsplath", 0b01, 0b010>;
+}
 
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
-    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
-         isPredicable = 1 in
-    def NAME : STInst2<(outs),
-            (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
-            mnemonic#"($src1+#$src2) = $src3",
-            []>;
+// Sign extend word to doubleword
+def A2_sxtw   : T_S2op_1_di <"sxtw", 0b01, 0b000>;
 
-    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits in {
-      defm Pt : ST_Idxd_Pred<mnemonic, RC, predImmOp, 0>;
-      defm NotPt : ST_Idxd_Pred<mnemonic, RC, predImmOp, 1>;
-    }
+def: Pat <(i64 (sext I32:$src)), (A2_sxtw I32:$src)>;
+
+// Vector saturate and pack
+let Defs = [USR_OVF] in {
+  def S2_svsathb  : T_S2op_1_ii <"vsathb", 0b10, 0b000>;
+  def S2_svsathub : T_S2op_1_ii <"vsathub", 0b10, 0b010>;
+  def S2_vsathb   : T_S2op_1_id <"vsathb", 0b00, 0b110>;
+  def S2_vsathub  : T_S2op_1_id <"vsathub", 0b00, 0b000>;
+  def S2_vsatwh   : T_S2op_1_id <"vsatwh", 0b00, 0b010>;
+  def S2_vsatwuh  : T_S2op_1_id <"vsatwuh", 0b00, 0b100>;
+}
+
+// Vector truncate
+def S2_vtrunohb : T_S2op_1_id <"vtrunohb", 0b10, 0b000>;
+def S2_vtrunehb : T_S2op_1_id <"vtrunehb", 0b10, 0b010>;
+
+// Swizzle the bytes of a word
+def A2_swiz : T_S2op_1_ii <"swiz", 0b10, 0b111>;
+
+// Saturate
+let Defs = [USR_OVF] in {
+  def A2_sat   : T_S2op_1_id <"sat", 0b11, 0b000>;
+  def A2_satb  : T_S2op_1_ii <"satb", 0b11, 0b111>;
+  def A2_satub : T_S2op_1_ii <"satub", 0b11, 0b110>;
+  def A2_sath  : T_S2op_1_ii <"sath", 0b11, 0b100>;
+  def A2_satuh : T_S2op_1_ii <"satuh", 0b11, 0b101>;
+  def A2_roundsat : T_S2op_1_id <"round", 0b11, 0b001, 0b1>;
+}
+
+let Itinerary = S_2op_tc_2_SLOT23 in {
+  // Vector round and pack
+  def S2_vrndpackwh   : T_S2op_1_id <"vrndwh", 0b10, 0b100>;
+
+  let Defs = [USR_OVF] in
+  def S2_vrndpackwhs  : T_S2op_1_id <"vrndwh", 0b10, 0b110, 1>;
+
+  // Bit reverse
+  def S2_brev : T_S2op_1_ii <"brev", 0b01, 0b110>;
+
+  // Absolute value word
+  def A2_abs    : T_S2op_1_ii <"abs", 0b10, 0b100>;
+
+  let Defs = [USR_OVF] in
+  def A2_abssat : T_S2op_1_ii <"abs", 0b10, 0b101, 1>;
+
+  // Negate with saturation
+  let Defs = [USR_OVF] in
+  def A2_negsat : T_S2op_1_ii <"neg", 0b10, 0b110, 1>;
+}
+
+def: Pat<(i32 (select (i1 (setlt (i32 IntRegs:$src), 0)),
+                      (i32 (sub 0, (i32 IntRegs:$src))),
+                      (i32 IntRegs:$src))),
+         (A2_abs IntRegs:$src)>;
+
+let AddedComplexity = 50 in
+def: Pat<(i32 (xor (add (sra (i32 IntRegs:$src), (i32 31)),
+                        (i32 IntRegs:$src)),
+                   (sra (i32 IntRegs:$src), (i32 31)))),
+         (A2_abs IntRegs:$src)>;
+
+class T_S2op_2 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
+                RegisterClass RCIn, bits<3> MajOp, bits<3> MinOp,
+                bit isSat, bit isRnd, list<dag> pattern = []>
+  : SInst <(outs RCOut:$dst),
+  (ins RCIn:$src, u5Imm:$u5),
+  "$dst = "#mnemonic#"($src, #$u5)"#!if(isSat, ":sat", "")
+                                   #!if(isRnd, ":rnd", ""),
+  pattern, "", S_2op_tc_2_SLOT23> {
+    bits<5> dst;
+    bits<5> src;
+    bits<5> u5;
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = RegTyBits;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src;
+    let Inst{13} = 0b0;
+    let Inst{12-8} = u5;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = dst;
   }
+
+class T_S2op_2_di <string mnemonic, bits<3> MajOp, bits<3> MinOp>
+  : T_S2op_2 <mnemonic, 0b1000, DoubleRegs, IntRegs, MajOp, MinOp, 0, 0>;
+
+let hasNewValue = 1 in
+class T_S2op_2_id <string mnemonic, bits<3> MajOp, bits<3> MinOp>
+  : T_S2op_2 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, 0, 0>;
+
+let hasNewValue = 1 in
+class T_S2op_2_ii <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                   bit isSat = 0, bit isRnd = 0, list<dag> pattern = []>
+  : T_S2op_2 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp,
+              isSat, isRnd, pattern>;
+
+class T_S2op_shift <string mnemonic, bits<3> MajOp, bits<3> MinOp, SDNode OpNd>
+  : T_S2op_2_ii <mnemonic, MajOp, MinOp, 0, 0,
+    [(set (i32 IntRegs:$dst), (OpNd (i32 IntRegs:$src),
+                                    (u5ImmPred:$u5)))]>;
+
+// Vector arithmetic shift right by immediate with truncate and pack
+def S2_asr_i_svw_trun : T_S2op_2_id <"vasrw", 0b110, 0b010>;
+
+// Arithmetic/logical shift right/left by immediate
+let Itinerary = S_2op_tc_1_SLOT23 in {
+  def S2_asr_i_r : T_S2op_shift <"asr", 0b000, 0b000, sra>;
+  def S2_lsr_i_r : T_S2op_shift <"lsr", 0b000, 0b001, srl>;
+  def S2_asl_i_r : T_S2op_shift <"asl", 0b000, 0b010, shl>;
 }
 
-let addrMode = BaseImmOffset, InputType = "reg" in {
-  let accessSize = ByteAccess in
-    defm STrib_indexed: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext,
-                                  u6_0Ext, 11, 6>, AddrModeRel, ImmRegRel;
+// Shift left by immediate with saturation
+let Defs = [USR_OVF] in
+def S2_asl_i_r_sat : T_S2op_2_ii <"asl", 0b010, 0b010, 1>;
+
+// Shift right with round
+def S2_asr_i_r_rnd : T_S2op_2_ii <"asr", 0b010, 0b000, 0, 1>;
+
+let isAsmParserOnly = 1 in
+def S2_asr_i_r_rnd_goodsyntax
+  : SInst <(outs IntRegs:$dst), (ins  IntRegs:$src, u5Imm:$u5),
+  "$dst = asrrnd($src, #$u5)",
+  [], "", S_2op_tc_1_SLOT23>;
+
+let isAsmParserOnly = 1 in
+def A2_not: ALU32_rr<(outs IntRegs:$dst),(ins IntRegs:$src),
+  "$dst = not($src)">;
+
+def: Pat<(i32 (sra (i32 (add (i32 (sra I32:$src1, u5ImmPred:$src2)),
+                             (i32 1))),
+                   (i32 1))),
+         (S2_asr_i_r_rnd IntRegs:$src1, u5ImmPred:$src2)>;
+
+class T_S2op_3<string opc, bits<2>MajOp, bits<3>minOp, bits<1> sat = 0>
+  : SInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
+           "$Rdd = "#opc#"($Rss)"#!if(!eq(sat, 1),":sat","")> {
+  bits<5> Rss;
+  bits<5> Rdd;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0;
+  let Inst{23-22} = MajOp;
+  let Inst{20-16} = Rss;
+  let Inst{7-5} = minOp;
+  let Inst{4-0} = Rdd;
+}
+
+def A2_absp : T_S2op_3 <"abs", 0b10, 0b110>;
+def A2_negp : T_S2op_3 <"neg", 0b10, 0b101>;
+def A2_notp : T_S2op_3 <"not", 0b10, 0b100>;
+
+// Innterleave/deinterleave
+def S2_interleave   : T_S2op_3 <"interleave",   0b11, 0b101>;
+def S2_deinterleave : T_S2op_3 <"deinterleave", 0b11, 0b100>;
+
+// Vector Complex conjugate
+def A2_vconj : T_S2op_3 <"vconj", 0b10, 0b111, 1>;
+
+// Vector saturate without pack
+def S2_vsathb_nopack  : T_S2op_3 <"vsathb",  0b00, 0b111>;
+def S2_vsathub_nopack : T_S2op_3 <"vsathub", 0b00, 0b100>;
+def S2_vsatwh_nopack  : T_S2op_3 <"vsatwh",  0b00, 0b110>;
+def S2_vsatwuh_nopack : T_S2op_3 <"vsatwuh", 0b00, 0b101>;
+
+// Vector absolute value halfwords with and without saturation
+// Rdd64=vabsh(Rss64)[:sat]
+def A2_vabsh    : T_S2op_3 <"vabsh", 0b01, 0b100>;
+def A2_vabshsat : T_S2op_3 <"vabsh", 0b01, 0b101, 1>;
+
+// Vector absolute value words with and without saturation
+def A2_vabsw    : T_S2op_3 <"vabsw", 0b01, 0b110>;
+def A2_vabswsat : T_S2op_3 <"vabsw", 0b01, 0b111, 1>;
+
+def : Pat<(not (i64 DoubleRegs:$src1)),
+          (A2_notp DoubleRegs:$src1)>;
+
+//===----------------------------------------------------------------------===//
+// STYPE/BIT +
+//===----------------------------------------------------------------------===//
+// Bit count
+
+let hasSideEffects = 0, hasNewValue = 1 in
+class T_COUNT_LEADING<string MnOp, bits<3> MajOp, bits<3> MinOp, bit Is32,
+                dag Out, dag Inp>
+    : SInst<Out, Inp, "$Rd = "#MnOp#"($Rs)", [], "", S_2op_tc_1_SLOT23> {
+  bits<5> Rs;
+  bits<5> Rd;
+  let IClass = 0b1000;
+  let Inst{27} = 0b1;
+  let Inst{26} = Is32;
+  let Inst{25-24} = 0b00;
+  let Inst{23-21} = MajOp;
+  let Inst{20-16} = Rs;
+  let Inst{7-5} = MinOp;
+  let Inst{4-0} = Rd;
+}
 
-  let accessSize = HalfWordAccess in
-    defm STrih_indexed: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext,
-                                  u6_1Ext, 12, 7>, AddrModeRel, ImmRegRel;
+class T_COUNT_LEADING_32<string MnOp, bits<3> MajOp, bits<3> MinOp>
+    : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b1,
+                      (outs IntRegs:$Rd), (ins IntRegs:$Rs)>;
+
+class T_COUNT_LEADING_64<string MnOp, bits<3> MajOp, bits<3> MinOp>
+    : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b0,
+                      (outs IntRegs:$Rd), (ins DoubleRegs:$Rs)>;
+
+def S2_cl0     : T_COUNT_LEADING_32<"cl0",     0b000, 0b101>;
+def S2_cl1     : T_COUNT_LEADING_32<"cl1",     0b000, 0b110>;
+def S2_ct0     : T_COUNT_LEADING_32<"ct0",     0b010, 0b100>;
+def S2_ct1     : T_COUNT_LEADING_32<"ct1",     0b010, 0b101>;
+def S2_cl0p    : T_COUNT_LEADING_64<"cl0",     0b010, 0b010>;
+def S2_cl1p    : T_COUNT_LEADING_64<"cl1",     0b010, 0b100>;
+def S2_clb     : T_COUNT_LEADING_32<"clb",     0b000, 0b100>;
+def S2_clbp    : T_COUNT_LEADING_64<"clb",     0b010, 0b000>;
+def S2_clbnorm : T_COUNT_LEADING_32<"normamt", 0b000, 0b111>;
+
+def: Pat<(i32 (ctlz I32:$Rs)),                (S2_cl0 I32:$Rs)>;
+def: Pat<(i32 (ctlz (not I32:$Rs))),          (S2_cl1 I32:$Rs)>;
+def: Pat<(i32 (cttz I32:$Rs)),                (S2_ct0 I32:$Rs)>;
+def: Pat<(i32 (cttz (not I32:$Rs))),          (S2_ct1 I32:$Rs)>;
+def: Pat<(i32 (trunc (ctlz I64:$Rss))),       (S2_cl0p I64:$Rss)>;
+def: Pat<(i32 (trunc (ctlz (not I64:$Rss)))), (S2_cl1p I64:$Rss)>;
+
+// Bit set/clear/toggle
 
-  let accessSize = WordAccess in
-    defm STriw_indexed: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext,
-                                  u6_2Ext, 13, 8>, AddrModeRel, ImmRegRel;
+let hasSideEffects = 0, hasNewValue = 1 in
+class T_SCT_BIT_IMM<string MnOp, bits<3> MinOp>
+    : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, u5Imm:$u5),
+            "$Rd = "#MnOp#"($Rs, #$u5)", [], "", S_2op_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> u5;
+  let IClass = 0b1000;
+  let Inst{27-21} = 0b1100110;
+  let Inst{20-16} = Rs;
+  let Inst{13} = 0b0;
+  let Inst{12-8} = u5;
+  let Inst{7-5} = MinOp;
+  let Inst{4-0} = Rd;
+}
 
-  let accessSize = DoubleWordAccess, isNVStorable = 0 in
-    defm STrid_indexed: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
-                                  u6_3Ext, 14, 9>, AddrModeRel;
+let hasSideEffects = 0, hasNewValue = 1 in
+class T_SCT_BIT_REG<string MnOp, bits<2> MinOp>
+    : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+            "$Rd = "#MnOp#"($Rs, $Rt)", [], "", S_3op_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+  let IClass = 0b1100;
+  let Inst{27-22} = 0b011010;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{7-6} = MinOp;
+  let Inst{4-0} = Rd;
 }
 
-let AddedComplexity = 10 in {
-def : Pat<(truncstorei8 (i32 IntRegs:$src1), (add IntRegs:$src2,
-                                                  s11_0ExtPred:$offset)),
-          (STrib_indexed IntRegs:$src2, s11_0ImmPred:$offset,
-                         (i32 IntRegs:$src1))>;
+def S2_clrbit_i    : T_SCT_BIT_IMM<"clrbit",    0b001>;
+def S2_setbit_i    : T_SCT_BIT_IMM<"setbit",    0b000>;
+def S2_togglebit_i : T_SCT_BIT_IMM<"togglebit", 0b010>;
+def S2_clrbit_r    : T_SCT_BIT_REG<"clrbit",    0b01>;
+def S2_setbit_r    : T_SCT_BIT_REG<"setbit",    0b00>;
+def S2_togglebit_r : T_SCT_BIT_REG<"togglebit", 0b10>;
+
+def: Pat<(i32 (and (i32 IntRegs:$Rs), (not (shl 1, u5ImmPred:$u5)))),
+         (S2_clrbit_i IntRegs:$Rs, u5ImmPred:$u5)>;
+def: Pat<(i32 (or (i32 IntRegs:$Rs), (shl 1, u5ImmPred:$u5))),
+         (S2_setbit_i IntRegs:$Rs, u5ImmPred:$u5)>;
+def: Pat<(i32 (xor (i32 IntRegs:$Rs), (shl 1, u5ImmPred:$u5))),
+         (S2_togglebit_i IntRegs:$Rs, u5ImmPred:$u5)>;
+def: Pat<(i32 (and (i32 IntRegs:$Rs), (not (shl 1, (i32 IntRegs:$Rt))))),
+         (S2_clrbit_r IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(i32 (or (i32 IntRegs:$Rs), (shl 1, (i32 IntRegs:$Rt)))),
+         (S2_setbit_r IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(i32 (xor (i32 IntRegs:$Rs), (shl 1, (i32 IntRegs:$Rt)))),
+         (S2_togglebit_r IntRegs:$Rs, IntRegs:$Rt)>;
+
+// Bit test
+
+let hasSideEffects = 0 in
+class T_TEST_BIT_IMM<string MnOp, bits<3> MajOp>
+    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u5Imm:$u5),
+            "$Pd = "#MnOp#"($Rs, #$u5)",
+            [], "", S_2op_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<5> u5;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b0101;
+  let Inst{23-21} = MajOp;
+  let Inst{20-16} = Rs;
+  let Inst{13} = 0;
+  let Inst{12-8} = u5;
+  let Inst{1-0} = Pd;
+}
 
-def : Pat<(truncstorei16 (i32 IntRegs:$src1), (add IntRegs:$src2,
-                                                   s11_1ExtPred:$offset)),
-          (STrih_indexed IntRegs:$src2, s11_1ImmPred:$offset,
-                         (i32 IntRegs:$src1))>;
+let hasSideEffects = 0 in
+class T_TEST_BIT_REG<string MnOp, bit IsNeg>
+    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+            "$Pd = "#MnOp#"($Rs, $Rt)",
+            [], "", S_3op_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<5> Rt;
+  let IClass = 0b1100;
+  let Inst{27-22} = 0b011100;
+  let Inst{21} = IsNeg;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{1-0} = Pd;
+}
 
-def : Pat<(store (i32 IntRegs:$src1), (add IntRegs:$src2,
-                                           s11_2ExtPred:$offset)),
-          (STriw_indexed IntRegs:$src2, s11_2ImmPred:$offset,
-                         (i32 IntRegs:$src1))>;
+def S2_tstbit_i : T_TEST_BIT_IMM<"tstbit", 0b000>;
+def S2_tstbit_r : T_TEST_BIT_REG<"tstbit", 0>;
+
+let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm.
+  def: Pat<(i1 (setne (and (shl 1, u5ImmPred:$u5), (i32 IntRegs:$Rs)), 0)),
+           (S2_tstbit_i IntRegs:$Rs, u5ImmPred:$u5)>;
+  def: Pat<(i1 (setne (and (shl 1, (i32 IntRegs:$Rt)), (i32 IntRegs:$Rs)), 0)),
+           (S2_tstbit_r IntRegs:$Rs, IntRegs:$Rt)>;
+  def: Pat<(i1 (trunc (i32 IntRegs:$Rs))),
+           (S2_tstbit_i IntRegs:$Rs, 0)>;
+  def: Pat<(i1 (trunc (i64 DoubleRegs:$Rs))),
+           (S2_tstbit_i (LoReg DoubleRegs:$Rs), 0)>;
+}
 
-def : Pat<(store (i64 DoubleRegs:$src1), (add IntRegs:$src2,
-                                              s11_3ExtPred:$offset)),
-          (STrid_indexed IntRegs:$src2, s11_3ImmPred:$offset,
-                         (i64 DoubleRegs:$src1))>;
+let hasSideEffects = 0 in
+class T_TEST_BITS_IMM<string MnOp, bits<2> MajOp, bit IsNeg>
+    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u6Imm:$u6),
+            "$Pd = "#MnOp#"($Rs, #$u6)",
+            [], "", S_2op_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<6> u6;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b0101;
+  let Inst{23-22} = MajOp;
+  let Inst{21} = IsNeg;
+  let Inst{20-16} = Rs;
+  let Inst{13-8} = u6;
+  let Inst{1-0} = Pd;
 }
 
-// memh(Rx++#s4:1)=Rt.H
+let hasSideEffects = 0 in
+class T_TEST_BITS_REG<string MnOp, bits<2> MajOp, bit IsNeg>
+    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+            "$Pd = "#MnOp#"($Rs, $Rt)",
+            [], "", S_3op_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<5> Rt;
+  let IClass = 0b1100;
+  let Inst{27-24} = 0b0111;
+  let Inst{23-22} = MajOp;
+  let Inst{21} = IsNeg;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{1-0} = Pd;
+}
 
-// Store word.
-// Store predicate.
-let Defs = [R10,R11,D5], neverHasSideEffects = 1 in
-def STriw_pred : STInst2<(outs),
-            (ins MEMri:$addr, PredRegs:$src1),
-            "Error; should not emit",
-            []>;
+def C2_bitsclri : T_TEST_BITS_IMM<"bitsclr", 0b10, 0>;
+def C2_bitsclr  : T_TEST_BITS_REG<"bitsclr", 0b10, 0>;
+def C2_bitsset  : T_TEST_BITS_REG<"bitsset", 0b01, 0>;
 
-// Allocate stack frame.
-let Defs = [R29, R30], Uses = [R31, R30], neverHasSideEffects = 1 in {
-  def ALLOCFRAME : STInst2<(outs),
-             (ins i32imm:$amt),
-             "allocframe(#$amt)",
-             []>;
+let AddedComplexity = 20 in { // Complexity greater than compare reg-imm.
+  def: Pat<(i1 (seteq (and (i32 IntRegs:$Rs), u6ImmPred:$u6), 0)),
+           (C2_bitsclri IntRegs:$Rs, u6ImmPred:$u6)>;
+  def: Pat<(i1 (seteq (and (i32 IntRegs:$Rs), (i32 IntRegs:$Rt)), 0)),
+           (C2_bitsclr IntRegs:$Rs, IntRegs:$Rt)>;
 }
+
+let AddedComplexity = 10 in   // Complexity greater than compare reg-reg.
+def: Pat<(i1 (seteq (and (i32 IntRegs:$Rs), (i32 IntRegs:$Rt)), IntRegs:$Rt)),
+         (C2_bitsset IntRegs:$Rs, IntRegs:$Rt)>;
+
 //===----------------------------------------------------------------------===//
-// ST -
+// STYPE/BIT -
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
-// STYPE/ALU +
+// STYPE/COMPLEX +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// STYPE/COMPLEX -
 //===----------------------------------------------------------------------===//
-// Logical NOT.
-def NOT_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
-               "$dst = not($src1)",
-               [(set (i64 DoubleRegs:$dst), (not (i64 DoubleRegs:$src1)))]>;
 
+//===----------------------------------------------------------------------===//
+// XTYPE/PERM +
+//===----------------------------------------------------------------------===//
 
-// Sign extend word to doubleword.
-def SXTW : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
-           "$dst = sxtw($src1)",
-           [(set (i64 DoubleRegs:$dst), (sext (i32 IntRegs:$src1)))]>;
 //===----------------------------------------------------------------------===//
-// STYPE/ALU -
+// XTYPE/PERM -
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
-// STYPE/BIT +
+// STYPE/PRED +
 //===----------------------------------------------------------------------===//
-// clrbit.
-def CLRBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-            "$dst = clrbit($src1, #$src2)",
-            [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1),
-                                           (not
-                                              (shl 1, u5ImmPred:$src2))))]>;
-
-def CLRBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-            "$dst = clrbit($src1, #$src2)",
-            []>;
-
-// Map from r0 = and(r1, 2147483647) to r0 = clrbit(r1, #31).
-def : Pat <(and (i32 IntRegs:$src1), 2147483647),
-      (CLRBIT_31 (i32 IntRegs:$src1), 31)>;
-
-// setbit.
-def SETBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-            "$dst = setbit($src1, #$src2)",
-            [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1),
-                                          (shl 1, u5ImmPred:$src2)))]>;
-
-// Map from r0 = or(r1, -2147483648) to r0 = setbit(r1, #31).
-def SETBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-            "$dst = setbit($src1, #$src2)",
-            []>;
-
-def : Pat <(or (i32 IntRegs:$src1), -2147483648),
-      (SETBIT_31 (i32 IntRegs:$src1), 31)>;
-
-// togglebit.
-def TOGBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-            "$dst = setbit($src1, #$src2)",
-            [(set (i32 IntRegs:$dst), (xor (i32 IntRegs:$src1),
-                                          (shl 1, u5ImmPred:$src2)))]>;
-
-// Map from r0 = xor(r1, -2147483648) to r0 = togglebit(r1, #31).
-def TOGBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-            "$dst = togglebit($src1, #$src2)",
-            []>;
-
-def : Pat <(xor (i32 IntRegs:$src1), -2147483648),
-      (TOGBIT_31 (i32 IntRegs:$src1), 31)>;
 
 // Predicate transfer.
-let neverHasSideEffects = 1 in
-def TFR_RsPd : SInst<(outs IntRegs:$dst), (ins PredRegs:$src1),
-               "$dst = $src1  /* Should almost never emit this. */",
-               []>;
+let hasSideEffects = 0, hasNewValue = 1 in
+def C2_tfrpr : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps),
+      "$Rd = $Ps", [], "", S_2op_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<2> Ps;
+
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b1001;
+  let Inst{22} = 0b1;
+  let Inst{17-16} = Ps;
+  let Inst{4-0} = Rd;
+}
+
+// Transfer general register to predicate.
+let hasSideEffects = 0 in
+def C2_tfrrp: SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs),
+      "$Pd = $Rs", [], "", S_2op_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<5> Rs;
+
+  let IClass = 0b1000;
+  let Inst{27-21} = 0b0101010;
+  let Inst{20-16} = Rs;
+  let Inst{1-0} = Pd;
+}
+
+let hasSideEffects = 0, isCodeGenOnly = 1 in
+def C2_pxfer_map: SInst<(outs PredRegs:$dst), (ins PredRegs:$src),
+     "$dst = $src">;
+
+
+// Patterns for loads of i1:
+def: Pat<(i1 (load AddrFI:$fi)),
+         (C2_tfrrp (L2_loadrub_io AddrFI:$fi, 0))>;
+def: Pat<(i1 (load (add (i32 IntRegs:$Rs), s11_0ExtPred:$Off))),
+         (C2_tfrrp (L2_loadrub_io IntRegs:$Rs, imm:$Off))>;
+def: Pat<(i1 (load (i32 IntRegs:$Rs))),
+         (C2_tfrrp (L2_loadrub_io IntRegs:$Rs, 0))>;
+
+def I1toI32: OutPatFrag<(ops node:$Rs),
+                        (C2_muxii (i1 $Rs), 1, 0)>;
+
+def I32toI1: OutPatFrag<(ops node:$Rs),
+                        (i1 (C2_tfrrp (i32 $Rs)))>;
+
+defm: Storexm_pat<store, I1, s11_0ExtPred, I1toI32, S2_storerb_io>;
+def: Storexm_simple_pat<store, I1, I1toI32, S2_storerb_io>;
 
-def TFR_PdRs : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1),
-               "$dst = $src1  /* Should almost never emit this. */",
-               [(set (i1 PredRegs:$dst), (trunc (i32 IntRegs:$src1)))]>;
 //===----------------------------------------------------------------------===//
 // STYPE/PRED -
 //===----------------------------------------------------------------------===//
@@ -1786,88 +4385,56 @@ def TFR_PdRs : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1),
 //===----------------------------------------------------------------------===//
 // STYPE/SHIFT +
 //===----------------------------------------------------------------------===//
+class S_2OpInstImm<string Mnemonic, bits<3>MajOp, bits<3>MinOp,
+                   Operand Imm, list<dag> pattern = [], bit isRnd = 0>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, Imm:$src2),
+           "$dst = "#Mnemonic#"($src1, #$src2)"#!if(isRnd, ":rnd", ""),
+           pattern> {
+  bits<5> src1;
+  bits<5> dst;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0;
+  let Inst{23-21} = MajOp;
+  let Inst{20-16} = src1;
+  let Inst{7-5} = MinOp;
+  let Inst{4-0} = dst;
+}
+
+class S_2OpInstImmI6<string Mnemonic, SDNode OpNode, bits<3>MinOp>
+  : S_2OpInstImm<Mnemonic, 0b000, MinOp, u6Imm,
+  [(set (i64 DoubleRegs:$dst), (OpNode (i64 DoubleRegs:$src1),
+                                        u6ImmPred:$src2))]> {
+  bits<6> src2;
+  let Inst{13-8} = src2;
+}
+
 // Shift by immediate.
-def ASR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-             "$dst = asr($src1, #$src2)",
-             [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1),
-                                            u5ImmPred:$src2))]>;
-
-def ASRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
-              "$dst = asr($src1, #$src2)",
-              [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1),
-                                                u6ImmPred:$src2))]>;
-
-def ASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-          "$dst = asl($src1, #$src2)",
-          [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
-                                         u5ImmPred:$src2))]>;
-
-def ASLd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
-              "$dst = asl($src1, #$src2)",
-              [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
-                                                u6ImmPred:$src2))]>;
-
-def LSR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-             "$dst = lsr($src1, #$src2)",
-             [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1),
-                                            u5ImmPred:$src2))]>;
-
-def LSRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
-              "$dst = lsr($src1, #$src2)",
-              [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1),
-                                                u6ImmPred:$src2))]>;
-
-// Shift by immediate and add.
-let AddedComplexity = 100 in
-def ADDASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
-                                             u3Imm:$src3),
-             "$dst = addasl($src1, $src2, #$src3)",
-             [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1),
-                                       (shl (i32 IntRegs:$src2),
-                                            u3ImmPred:$src3)))]>;
-
-// Shift by register.
-def ASL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             "$dst = asl($src1, $src2)",
-             [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
-                                            (i32 IntRegs:$src2)))]>;
-
-def ASR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             "$dst = asr($src1, $src2)",
-             [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1),
-                                            (i32 IntRegs:$src2)))]>;
-
-def LSL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             "$dst = lsl($src1, $src2)",
-             [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
-                                            (i32 IntRegs:$src2)))]>;
-
-def LSR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             "$dst = lsr($src1, $src2)",
-             [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1),
-                                            (i32 IntRegs:$src2)))]>;
-
-def ASLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
-           "$dst = asl($src1, $src2)",
-           [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
-                                             (i32 IntRegs:$src2)))]>;
-
-def LSLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
-           "$dst = lsl($src1, $src2)",
-           [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
-                                             (i32 IntRegs:$src2)))]>;
-
-def ASRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                 IntRegs:$src2),
-              "$dst = asr($src1, $src2)",
-              [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1),
-                                                (i32 IntRegs:$src2)))]>;
-
-def LSRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                 IntRegs:$src2),
-              "$dst = lsr($src1, $src2)",
-              [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1),
-                                                (i32 IntRegs:$src2)))]>;
+def S2_asr_i_p : S_2OpInstImmI6<"asr", sra, 0b000>;
+def S2_asl_i_p : S_2OpInstImmI6<"asl", shl, 0b010>;
+def S2_lsr_i_p : S_2OpInstImmI6<"lsr", srl, 0b001>;
+
+// Shift left by small amount and add.
+let AddedComplexity = 100, hasNewValue = 1, hasSideEffects = 0 in
+def S2_addasl_rrri: SInst <(outs IntRegs:$Rd),
+                           (ins IntRegs:$Rt, IntRegs:$Rs, u3Imm:$u3),
+  "$Rd = addasl($Rt, $Rs, #$u3)" ,
+  [(set (i32 IntRegs:$Rd), (add (i32 IntRegs:$Rt),
+                                (shl (i32 IntRegs:$Rs), u3ImmPred:$u3)))],
+  "", S_3op_tc_2_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rt;
+    bits<5> Rs;
+    bits<3> u3;
+
+    let IClass = 0b1100;
+
+    let Inst{27-21} = 0b0100000;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = Rt;
+    let Inst{7-5}   = u3;
+    let Inst{4-0}   = Rd;
+  }
 
 //===----------------------------------------------------------------------===//
 // STYPE/SHIFT -
@@ -1894,39 +4461,222 @@ def LSRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
 //===----------------------------------------------------------------------===//
 // SYSTEM/USER +
 //===----------------------------------------------------------------------===//
-def SDHexagonBARRIER: SDTypeProfile<0, 0, []>;
-def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDHexagonBARRIER,
-                           [SDNPHasChain]>;
+def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDTNone, [SDNPHasChain]>;
 
-let hasSideEffects = 1, isSolo = 1 in
-def BARRIER : SYSInst<(outs), (ins),
+let hasSideEffects = 1, isSoloAX = 1 in
+def Y2_barrier : SYSInst<(outs), (ins),
                      "barrier",
-                     [(HexagonBARRIER)]>;
+                     [(HexagonBARRIER)],"",ST_tc_st_SLOT0> {
+  let Inst{31-28} = 0b1010;
+  let Inst{27-21} = 0b1000000;
+}
 
 //===----------------------------------------------------------------------===//
 // SYSTEM/SUPER -
 //===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// CRUSER - Type.
+//===----------------------------------------------------------------------===//
+// HW loop
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+    opExtendable = 0, hasSideEffects = 0 in
+class LOOP_iBase<string mnemonic, Operand brOp, bit mustExtend = 0>
+         : CRInst<(outs), (ins brOp:$offset, u10Imm:$src2),
+           #mnemonic#"($offset, #$src2)",
+           [], "" , CR_tc_3x_SLOT3> {
+    bits<9> offset;
+    bits<10> src2;
+
+    let IClass = 0b0110;
+
+    let Inst{27-22} = 0b100100;
+    let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
+    let Inst{20-16} = src2{9-5};
+    let Inst{12-8} = offset{8-4};
+    let Inst{7-5} = src2{4-2};
+    let Inst{4-3} = offset{3-2};
+    let Inst{1-0} = src2{1-0};
+}
 
-// TFRI64 - assembly mapped.
-let isReMaterializable = 1 in
-def TFRI64 : ALU64_rr<(outs DoubleRegs:$dst), (ins s8Imm64:$src1),
-             "$dst = #$src1",
-             [(set (i64 DoubleRegs:$dst), s8Imm64Pred:$src1)]>;
-
-// Pseudo instruction to encode a set of conditional transfers.
-// This instruction is used instead of a mux and trades-off codesize
-// for performance. We conduct this transformation optimistically in
-// the hope that these instructions get promoted to dot-new transfers.
-let AddedComplexity = 100, isPredicated = 1 in
-def TFR_condset_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
-                                                        IntRegs:$src2,
-                                                        IntRegs:$src3),
-                     "Error; should not emit",
-                     [(set (i32 IntRegs:$dst),
-                           (i32 (select (i1 PredRegs:$src1),
-                                        (i32 IntRegs:$src2),
-                                        (i32 IntRegs:$src3))))]>;
-let AddedComplexity = 100, isPredicated = 1 in
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+    opExtendable = 0, hasSideEffects = 0 in
+class LOOP_rBase<string mnemonic, Operand brOp, bit mustExtend = 0>
+         : CRInst<(outs), (ins brOp:$offset, IntRegs:$src2),
+           #mnemonic#"($offset, $src2)",
+           [], "" ,CR_tc_3x_SLOT3> {
+    bits<9> offset;
+    bits<5> src2;
+
+    let IClass = 0b0110;
+
+    let Inst{27-22} = 0b000000;
+    let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
+    let Inst{20-16} = src2;
+    let Inst{12-8} = offset{8-4};
+    let Inst{4-3} = offset{3-2};
+  }
+
+multiclass LOOP_ri<string mnemonic> {
+  def i : LOOP_iBase<mnemonic, brtarget>;
+  def r : LOOP_rBase<mnemonic, brtarget>;
+}
+
+
+let Defs = [SA0, LC0, USR] in
+defm J2_loop0 : LOOP_ri<"loop0">;
+
+// Interestingly only loop0's appear to set usr.lpcfg
+let Defs = [SA1, LC1] in
+defm J2_loop1 : LOOP_ri<"loop1">;
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+    Defs = [PC, LC0], Uses = [SA0, LC0] in {
+def ENDLOOP0 : Endloop<(outs), (ins brtarget:$offset),
+                       ":endloop0",
+                       []>;
+}
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+    Defs = [PC, LC1], Uses = [SA1, LC1] in {
+def ENDLOOP1 : Endloop<(outs), (ins brtarget:$offset),
+                       ":endloop1",
+                       []>;
+}
+
+// Pipelined loop instructions, sp[123]loop0
+let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
+    isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
+    opExtendable = 0, isPredicateLate = 1 in
+class SPLOOP_iBase<string SP, bits<2> op>
+  : CRInst <(outs), (ins brtarget:$r7_2, u10Imm:$U10),
+  "p3 = sp"#SP#"loop0($r7_2, #$U10)" > {
+    bits<9> r7_2;
+    bits<10> U10;
+
+    let IClass = 0b0110;
+
+    let Inst{22-21} = op;
+    let Inst{27-23} = 0b10011;
+    let Inst{20-16} = U10{9-5};
+    let Inst{12-8} = r7_2{8-4};
+    let Inst{7-5} = U10{4-2};
+    let Inst{4-3} = r7_2{3-2};
+    let Inst{1-0} = U10{1-0};
+  }
+
+let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
+    isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
+    opExtendable = 0, isPredicateLate = 1 in
+class SPLOOP_rBase<string SP, bits<2> op>
+  : CRInst <(outs), (ins brtarget:$r7_2, IntRegs:$Rs),
+  "p3 = sp"#SP#"loop0($r7_2, $Rs)" > {
+    bits<9> r7_2;
+    bits<5> Rs;
+
+    let IClass = 0b0110;
+
+    let Inst{22-21} = op;
+    let Inst{27-23} = 0b00001;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = r7_2{8-4};
+    let Inst{4-3} = r7_2{3-2};
+  }
+
+multiclass SPLOOP_ri<string mnemonic, bits<2> op> {
+  def i : SPLOOP_iBase<mnemonic, op>;
+  def r : SPLOOP_rBase<mnemonic, op>;
+}
+
+defm J2_ploop1s : SPLOOP_ri<"1", 0b01>;
+defm J2_ploop2s : SPLOOP_ri<"2", 0b10>;
+defm J2_ploop3s : SPLOOP_ri<"3", 0b11>;
+
+// if (Rs[!>=<]=#0) jump:[t/nt]
+let Defs = [PC], isPredicated = 1, isBranch = 1, hasSideEffects = 0,
+    hasSideEffects = 0 in
+class J2_jump_0_Base<string compare, bit isTak, bits<2> op>
+  : CRInst <(outs), (ins IntRegs:$Rs, brtarget:$r13_2),
+  "if ($Rs"#compare#"#0) jump"#!if(isTak, ":t", ":nt")#" $r13_2" > {
+    bits<5> Rs;
+    bits<15> r13_2;
+
+    let IClass = 0b0110;
+
+    let Inst{27-24} = 0b0001;
+    let Inst{23-22} = op;
+    let Inst{12} = isTak;
+    let Inst{21} = r13_2{14};
+    let Inst{20-16} = Rs;
+    let Inst{11-1} = r13_2{12-2};
+    let Inst{13} = r13_2{13};
+  }
+
+multiclass J2_jump_compare_0<string compare, bits<2> op> {
+  def NAME    : J2_jump_0_Base<compare, 0, op>;
+  def NAME#pt : J2_jump_0_Base<compare, 1, op>;
+}
+
+defm J2_jumprz    : J2_jump_compare_0<"!=", 0b00>;
+defm J2_jumprgtez : J2_jump_compare_0<">=", 0b01>;
+defm J2_jumprnz   : J2_jump_compare_0<"==", 0b10>;
+defm J2_jumprltez : J2_jump_compare_0<"<=", 0b11>;
+
+// Transfer to/from Control/GPR Guest/GPR
+let hasSideEffects = 0 in
+class TFR_CR_RS_base<RegisterClass CTRC, RegisterClass RC, bit isDouble>
+  : CRInst <(outs CTRC:$dst), (ins RC:$src),
+  "$dst = $src", [], "", CR_tc_3x_SLOT3> {
+    bits<5> dst;
+    bits<5> src;
+
+    let IClass = 0b0110;
+
+    let Inst{27-25} = 0b001;
+    let Inst{24} = isDouble;
+    let Inst{23-21} = 0b001;
+    let Inst{20-16} = src;
+    let Inst{4-0} = dst;
+  }
+
+def A2_tfrrcr : TFR_CR_RS_base<CtrRegs, IntRegs, 0b0>;
+def A4_tfrpcp : TFR_CR_RS_base<CtrRegs64, DoubleRegs, 0b1>;
+def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>;
+def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>;
+
+let hasSideEffects = 0 in
+class TFR_RD_CR_base<RegisterClass RC, RegisterClass CTRC, bit isSingle>
+  : CRInst <(outs RC:$dst), (ins CTRC:$src),
+  "$dst = $src", [], "", CR_tc_3x_SLOT3> {
+    bits<5> dst;
+    bits<5> src;
+
+    let IClass = 0b0110;
+
+    let Inst{27-26} = 0b10;
+    let Inst{25} = isSingle;
+    let Inst{24-21} = 0b0000;
+    let Inst{20-16} = src;
+    let Inst{4-0} = dst;
+  }
+
+let hasNewValue = 1, opNewValue = 0 in
+def A2_tfrcrr : TFR_RD_CR_base<IntRegs, CtrRegs, 1>;
+def A4_tfrcpp : TFR_RD_CR_base<DoubleRegs, CtrRegs64, 0>;
+def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>;
+def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>;
+
+// Y4_trace: Send value to etm trace.
+let isSoloAX = 1, hasSideEffects = 0 in
+def Y4_trace: CRInst <(outs), (ins IntRegs:$Rs),
+  "trace($Rs)"> {
+    bits<5> Rs;
+
+    let IClass = 0b0110;
+    let Inst{27-21} = 0b0010010;
+    let Inst{20-16} = Rs;
+  }
+
+let AddedComplexity = 100, isPredicated = 1, isCodeGenOnly = 1 in
 def TFR_condset_ri : ALU32_rr<(outs IntRegs:$dst),
             (ins PredRegs:$src1, IntRegs:$src2, s12Imm:$src3),
             "Error; should not emit",
@@ -1934,7 +4684,7 @@ def TFR_condset_ri : ALU32_rr<(outs IntRegs:$dst),
              (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2),
                           s12ImmPred:$src3)))]>;
 
-let AddedComplexity = 100, isPredicated = 1 in
+let AddedComplexity = 100, isPredicated = 1, isCodeGenOnly = 1 in
 def TFR_condset_ir : ALU32_rr<(outs IntRegs:$dst),
             (ins PredRegs:$src1, s12Imm:$src2, IntRegs:$src3),
             "Error; should not emit",
@@ -1942,7 +4692,7 @@ def TFR_condset_ir : ALU32_rr<(outs IntRegs:$dst),
              (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2,
                           (i32 IntRegs:$src3))))]>;
 
-let AddedComplexity = 100, isPredicated = 1 in
+let AddedComplexity = 100, isPredicated = 1, isCodeGenOnly = 1 in
 def TFR_condset_ii : ALU32_rr<(outs IntRegs:$dst),
                               (ins PredRegs:$src1, s12Imm:$src2, s12Imm:$src3),
                      "Error; should not emit",
@@ -1951,115 +4701,109 @@ def TFR_condset_ii : ALU32_rr<(outs IntRegs:$dst),
                                         s12ImmPred:$src3)))]>;
 
 // Generate frameindex addresses.
-let isReMaterializable = 1 in
+let isReMaterializable = 1, isCodeGenOnly = 1 in
 def TFR_FI : ALU32_ri<(outs IntRegs:$dst), (ins FrameIndex:$src1),
              "$dst = add($src1)",
              [(set (i32 IntRegs:$dst), ADDRri:$src1)]>;
 
-//
-// CR - Type.
-//
-let neverHasSideEffects = 1, Defs = [SA0, LC0] in {
-def LOOP0_i : CRInst<(outs), (ins brtarget:$offset, u10Imm:$src2),
-                      "loop0($offset, #$src2)",
-                      []>;
-}
+// Support for generating global address.
+// Taken from X86InstrInfo.td.
+def SDTHexagonCONST32 : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisPtrTy<0>]>;
+def HexagonCONST32    : SDNode<"HexagonISD::CONST32",    SDTHexagonCONST32>;
+def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP", SDTHexagonCONST32>;
 
-let neverHasSideEffects = 1, Defs = [SA0, LC0] in {
-def LOOP0_r : CRInst<(outs), (ins brtarget:$offset, IntRegs:$src2),
-                      "loop0($offset, $src2)",
-                      []>;
-}
+// HI/LO Instructions
+let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
+    hasNewValue = 1, opNewValue = 0 in
+class REG_IMMED<string RegHalf, string Op, bit Rs, bits<3> MajOp, bit MinOp>
+  : ALU32_ri<(outs IntRegs:$dst),
+              (ins i32imm:$imm_value),
+              "$dst"#RegHalf#" = #"#Op#"($imm_value)", []> {
+    bits<5> dst;
+    bits<32> imm_value;
+    let IClass = 0b0111;
 
-let isBranch = 1, isTerminator = 1, neverHasSideEffects = 1,
-    Defs = [PC, LC0], Uses = [SA0, LC0] in {
-def ENDLOOP0 : Endloop<(outs), (ins brtarget:$offset),
-                       ":endloop0",
-                       []>;
+    let Inst{27} = Rs;
+    let Inst{26-24} = MajOp;
+    let Inst{21} = MinOp;
+    let Inst{20-16} = dst;
+    let Inst{23-22} = !if (!eq(Op, "LO"), imm_value{15-14}, imm_value{31-30});
+    let Inst{13-0} = !if (!eq(Op, "LO"), imm_value{13-0}, imm_value{29-16});
 }
 
-// Support for generating global address.
-// Taken from X86InstrInfo.td.
-def SDTHexagonCONST32 : SDTypeProfile<1, 1, [
-                                            SDTCisVT<0, i32>,
-                                            SDTCisVT<1, i32>,
-                                            SDTCisPtrTy<0>]>;
-def HexagonCONST32 : SDNode<"HexagonISD::CONST32",     SDTHexagonCONST32>;
-def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP",     SDTHexagonCONST32>;
+let isAsmParserOnly = 1 in {
+  def LO : REG_IMMED<".l", "LO", 0b0, 0b001, 0b1>;
+  def LO_H : REG_IMMED<".l", "HI", 0b0, 0b001, 0b1>;
+  def HI : REG_IMMED<".h", "HI", 0b0, 0b010, 0b1>;
+  def HI_L : REG_IMMED<".h", "LO", 0b0, 0b010, 0b1>;
+}
 
-// HI/LO Instructions
-let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
-def LO : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global),
-                  "$dst.l = #LO($global)",
-                  []>;
+let  isMoveImm = 1, isCodeGenOnly = 1 in
+def LO_PIC : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label),
+             "$dst.l = #LO($label@GOTREL)",
+             []>;
 
-let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
-def HI : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global),
-                  "$dst.h = #HI($global)",
-                  []>;
+let  isMoveImm = 1, isCodeGenOnly = 1 in
+def HI_PIC : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label),
+             "$dst.h = #HI($label@GOTREL)",
+             []>;
 
-let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
+    isAsmParserOnly = 1 in
 def LOi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value),
                   "$dst.l = #LO($imm_value)",
                   []>;
 
 
-let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
+    isAsmParserOnly = 1 in
 def HIi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value),
                   "$dst.h = #HI($imm_value)",
                   []>;
 
-let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
+    isAsmParserOnly = 1 in
 def LO_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt),
                   "$dst.l = #LO($jt)",
                   []>;
 
-let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
+    isAsmParserOnly = 1 in
 def HI_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt),
                   "$dst.h = #HI($jt)",
                   []>;
 
-
-let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
-def LO_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label),
-                  "$dst.l = #LO($label)",
-                  []>;
-
-let isReMaterializable = 1, isMoveImm = 1 , neverHasSideEffects = 1 in
-def HI_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label),
-                  "$dst.h = #HI($label)",
-                  []>;
-
 // This pattern is incorrect. When we add small data, we should change
 // this pattern to use memw(#foo).
 // This is for sdata.
-let isMoveImm = 1 in
-def CONST32 : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global),
+let isMoveImm = 1, isAsmParserOnly = 1 in
+def CONST32 : CONSTLDInst<(outs IntRegs:$dst), (ins globaladdress:$global),
               "$dst = CONST32(#$global)",
               [(set (i32 IntRegs:$dst),
                     (load (HexagonCONST32 tglobaltlsaddr:$global)))]>;
 
-// This is for non-sdata.
 let isReMaterializable = 1, isMoveImm = 1 in
 def CONST32_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global),
                   "$dst = CONST32(#$global)",
                   [(set (i32 IntRegs:$dst),
                         (HexagonCONST32 tglobaladdr:$global))]>;
 
-let isReMaterializable = 1, isMoveImm = 1 in
-def CONST32_set_jt : LDInst2<(outs IntRegs:$dst), (ins jumptablebase:$jt),
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in
+def CONST32_set_jt : CONSTLDInst<(outs IntRegs:$dst), (ins jumptablebase:$jt),
                      "$dst = CONST32(#$jt)",
                      [(set (i32 IntRegs:$dst),
                            (HexagonCONST32 tjumptable:$jt))]>;
 
-let isReMaterializable = 1, isMoveImm = 1 in
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in
 def CONST32GP_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global),
                     "$dst = CONST32(#$global)",
                     [(set (i32 IntRegs:$dst),
                           (HexagonCONST32_GP tglobaladdr:$global))]>;
 
-let isReMaterializable = 1, isMoveImm = 1 in
-def CONST32_Int_Real : LDInst2<(outs IntRegs:$dst), (ins i32imm:$global),
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in
+def CONST32_Int_Real : CONSTLDInst<(outs IntRegs:$dst), (ins i32imm:$global),
                        "$dst = CONST32(#$global)",
                        [(set (i32 IntRegs:$dst), imm:$global) ]>;
 
@@ -2067,839 +4811,921 @@ def CONST32_Int_Real : LDInst2<(outs IntRegs:$dst), (ins i32imm:$global),
 def : Pat<(HexagonCONST32_GP tblockaddress:$addr),
           (CONST32_Int_Real tblockaddress:$addr)>;
 
-let isReMaterializable = 1, isMoveImm = 1 in
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in
 def CONST32_Label : LDInst2<(outs IntRegs:$dst), (ins bblabel:$label),
                     "$dst = CONST32($label)",
                     [(set (i32 IntRegs:$dst), (HexagonCONST32 bbl:$label))]>;
 
-let isReMaterializable = 1, isMoveImm = 1 in
-def CONST64_Int_Real : LDInst2<(outs DoubleRegs:$dst), (ins i64imm:$global),
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in
+def CONST64_Int_Real : CONSTLDInst<(outs DoubleRegs:$dst), (ins i64imm:$global),
                        "$dst = CONST64(#$global)",
-                       [(set (i64 DoubleRegs:$dst), imm:$global) ]>;
+                       [(set (i64 DoubleRegs:$dst), imm:$global)]>;
 
-def TFR_PdFalse : SInst<(outs PredRegs:$dst), (ins),
-                  "$dst = xor($dst, $dst)",
-                  [(set (i1 PredRegs:$dst), 0)]>;
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+    isCodeGenOnly = 1 in
+def TFR_PdTrue : SInst<(outs PredRegs:$dst), (ins), "",
+                 [(set (i1 PredRegs:$dst), 1)]>;
 
-def MPY_trsext : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-       "$dst = mpy($src1, $src2)",
-       [(set (i32 IntRegs:$dst),
-             (trunc (i64 (srl (i64 (mul (i64 (sext (i32 IntRegs:$src1))),
-                                        (i64 (sext (i32 IntRegs:$src2))))),
-                              (i32 32)))))]>;
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+    isCodeGenOnly = 1 in
+def TFR_PdFalse : SInst<(outs PredRegs:$dst), (ins), "$dst = xor($dst, $dst)",
+                  [(set (i1 PredRegs:$dst), 0)]>;
 
 // Pseudo instructions.
 def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
-
-def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+def SDT_SPCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
                                         SDTCisVT<1, i32> ]>;
 
-def callseq_end : SDNode<"ISD::CALLSEQ_END",   SDT_SPCallSeqEnd,
-                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-
 def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
                     [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_SPCallSeqEnd,
+                    [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 
-def SDT_SPCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
-
-def call : SDNode<"HexagonISD::CALL", SDT_SPCall,
-           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+def SDT_SPCall  : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
 
 // For tailcalls a HexagonTCRet SDNode has 3 SDNode Properties - a chain,
 // Optional Flag and Variable Arguments.
 // Its 1 Operand has pointer type.
-def HexagonTCRet    : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall,
-                     [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
-
-let Defs = [R29, R30], Uses = [R31, R30, R29] in {
- def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
-                        "Should never be emitted",
-                        [(callseq_start timm:$amt)]>;
-}
-
-let Defs = [R29, R30, R31], Uses = [R29] in {
- def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
-                      "Should never be emitted",
-                      [(callseq_end timm:$amt1, timm:$amt2)]>;
-}
-// Call subroutine.
-let isCall = 1, neverHasSideEffects = 1,
-  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
-          R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
-  def CALL : JInst<(outs), (ins calltarget:$dst),
-             "call $dst", []>;
-}
-
-// Call subroutine from register.
-let isCall = 1, neverHasSideEffects = 1,
-  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
-          R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
-  def CALLR : JRInst<(outs), (ins IntRegs:$dst),
-              "callr $dst",
-              []>;
- }
+def HexagonTCRet : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall,
+                          [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                              ".error \"should not emit\" ",
+                              [(callseq_start timm:$amt)]>;
 
+let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                             ".error \"should not emit\" ",
+                             [(callseq_end timm:$amt1, timm:$amt2)]>;
+
+// Call subroutine indirectly.
+let Defs = VolatileV3.Regs in
+def J2_callr : JUMPR_MISC_CALLR<0, 1>;
 
 // Indirect tail-call.
-let isCodeGenOnly = 1, isCall = 1, isReturn = 1  in
-def TCRETURNR : T_JMPr;
+let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+    isTerminator = 1, isCodeGenOnly = 1 in
+def TCRETURNr : T_JMPr;
 
 // Direct tail-calls.
 let isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
 isTerminator = 1, isCodeGenOnly = 1 in {
-  def TCRETURNtg   : T_JMP<(ins calltarget:$dst)>;
-  def TCRETURNtext : T_JMP<(ins calltarget:$dst)>;
+  def TCRETURNtg   : JInst<(outs), (ins calltarget:$dst), "jump $dst",
+      [], "", J_tc_2early_SLOT23>;
+  def TCRETURNtext : JInst<(outs), (ins calltarget:$dst), "jump $dst",
+      [], "", J_tc_2early_SLOT23>;
 }
 
-// Map call instruction.
-def : Pat<(call (i32 IntRegs:$dst)),
-      (CALLR (i32 IntRegs:$dst))>, Requires<[HasV2TOnly]>;
-def : Pat<(call tglobaladdr:$dst),
-      (CALL tglobaladdr:$dst)>, Requires<[HasV2TOnly]>;
-def : Pat<(call texternalsym:$dst),
-      (CALL texternalsym:$dst)>, Requires<[HasV2TOnly]>;
 //Tail calls.
-def : Pat<(HexagonTCRet tglobaladdr:$dst),
-      (TCRETURNtg tglobaladdr:$dst)>;
-def : Pat<(HexagonTCRet texternalsym:$dst),
-      (TCRETURNtext texternalsym:$dst)>;
-def : Pat<(HexagonTCRet (i32 IntRegs:$dst)),
-      (TCRETURNR (i32 IntRegs:$dst))>;
-
-// Atomic load and store support
-// 8 bit atomic load
-def : Pat<(atomic_load_8 ADDRriS11_0:$src1),
-          (i32 (LDriub ADDRriS11_0:$src1))>;
-
-def : Pat<(atomic_load_8 (add (i32 IntRegs:$src1), s11_0ImmPred:$offset)),
-          (i32 (LDriub_indexed (i32 IntRegs:$src1), s11_0ImmPred:$offset))>;
-
-// 16 bit atomic load
-def : Pat<(atomic_load_16 ADDRriS11_1:$src1),
-          (i32 (LDriuh ADDRriS11_1:$src1))>;
-
-def : Pat<(atomic_load_16 (add (i32 IntRegs:$src1), s11_1ImmPred:$offset)),
-          (i32 (LDriuh_indexed (i32 IntRegs:$src1), s11_1ImmPred:$offset))>;
-
-def : Pat<(atomic_load_32 ADDRriS11_2:$src1),
-          (i32 (LDriw ADDRriS11_2:$src1))>;
-
-def : Pat<(atomic_load_32 (add (i32 IntRegs:$src1), s11_2ImmPred:$offset)),
-          (i32 (LDriw_indexed (i32 IntRegs:$src1), s11_2ImmPred:$offset))>;
-
-// 64 bit atomic load
-def : Pat<(atomic_load_64 ADDRriS11_3:$src1),
-          (i64 (LDrid ADDRriS11_3:$src1))>;
-
-def : Pat<(atomic_load_64 (add (i32 IntRegs:$src1), s11_3ImmPred:$offset)),
-          (i64 (LDrid_indexed (i32 IntRegs:$src1), s11_3ImmPred:$offset))>;
-
-
-def : Pat<(atomic_store_8 ADDRriS11_0:$src2, (i32 IntRegs:$src1)),
-          (STrib ADDRriS11_0:$src2, (i32 IntRegs:$src1))>;
-
-def : Pat<(atomic_store_8 (add (i32 IntRegs:$src2), s11_0ImmPred:$offset),
-                          (i32 IntRegs:$src1)),
-          (STrib_indexed (i32 IntRegs:$src2), s11_0ImmPred:$offset,
-                         (i32 IntRegs:$src1))>;
-
-
-def : Pat<(atomic_store_16 ADDRriS11_1:$src2, (i32 IntRegs:$src1)),
-          (STrih ADDRriS11_1:$src2, (i32 IntRegs:$src1))>;
-
-def : Pat<(atomic_store_16 (i32 IntRegs:$src1),
-                          (add (i32 IntRegs:$src2), s11_1ImmPred:$offset)),
-          (STrih_indexed (i32 IntRegs:$src2), s11_1ImmPred:$offset,
-                         (i32 IntRegs:$src1))>;
-
-def : Pat<(atomic_store_32 ADDRriS11_2:$src2, (i32 IntRegs:$src1)),
-          (STriw ADDRriS11_2:$src2, (i32 IntRegs:$src1))>;
-
-def : Pat<(atomic_store_32 (add (i32 IntRegs:$src2), s11_2ImmPred:$offset),
-                           (i32 IntRegs:$src1)),
-          (STriw_indexed (i32 IntRegs:$src2), s11_2ImmPred:$offset,
-                         (i32 IntRegs:$src1))>;
-
-
-
-
-def : Pat<(atomic_store_64 ADDRriS11_3:$src2, (i64 DoubleRegs:$src1)),
-          (STrid ADDRriS11_3:$src2, (i64 DoubleRegs:$src1))>;
-
-def : Pat<(atomic_store_64 (add (i32 IntRegs:$src2), s11_3ImmPred:$offset),
-                           (i64 DoubleRegs:$src1)),
-          (STrid_indexed (i32 IntRegs:$src2), s11_3ImmPred:$offset,
-                         (i64 DoubleRegs:$src1))>;
+def: Pat<(HexagonTCRet tglobaladdr:$dst),
+         (TCRETURNtg tglobaladdr:$dst)>;
+def: Pat<(HexagonTCRet texternalsym:$dst),
+         (TCRETURNtext texternalsym:$dst)>;
+def: Pat<(HexagonTCRet (i32 IntRegs:$dst)),
+         (TCRETURNr (i32 IntRegs:$dst))>;
 
 // Map from r0 = and(r1, 65535) to r0 = zxth(r1)
-def : Pat <(and (i32 IntRegs:$src1), 65535),
-      (ZXTH (i32 IntRegs:$src1))>;
+def: Pat<(and (i32 IntRegs:$src1), 65535),
+         (A2_zxth IntRegs:$src1)>;
 
 // Map from r0 = and(r1, 255) to r0 = zxtb(r1).
-def : Pat <(and (i32 IntRegs:$src1), 255),
-      (ZXTB (i32 IntRegs:$src1))>;
+def: Pat<(and (i32 IntRegs:$src1), 255),
+         (A2_zxtb IntRegs:$src1)>;
 
 // Map Add(p1, true) to p1 = not(p1).
 //     Add(p1, false) should never be produced,
 //     if it does, it got to be mapped to NOOP.
-def : Pat <(add (i1 PredRegs:$src1), -1),
-      (NOT_p (i1 PredRegs:$src1))>;
-
-// Map from p0 = setlt(r0, r1) r2 = mux(p0, r3, r4) =>
-//   p0 = cmp.lt(r0, r1), r0 = mux(p0, r2, r1).
-// cmp.lt(r0, r1) -> cmp.gt(r1, r0)
-def : Pat <(select (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
-                   (i32 IntRegs:$src3),
-                   (i32 IntRegs:$src4)),
-      (i32 (TFR_condset_rr (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)),
-                           (i32 IntRegs:$src4), (i32 IntRegs:$src3)))>,
-      Requires<[HasV2TOnly]>;
+def: Pat<(add (i1 PredRegs:$src1), -1),
+         (C2_not PredRegs:$src1)>;
 
 // Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i).
-def : Pat <(select (not (i1 PredRegs:$src1)), s8ImmPred:$src2, s8ImmPred:$src3),
-      (i32 (TFR_condset_ii (i1 PredRegs:$src1), s8ImmPred:$src3,
-                           s8ImmPred:$src2))>;
+def: Pat<(select (not (i1 PredRegs:$src1)), s8ImmPred:$src2, s8ExtPred:$src3),
+         (C2_muxii PredRegs:$src1, s8ExtPred:$src3, s8ImmPred:$src2)>;
 
 // Map from p0 = pnot(p0); r0 = select(p0, #i, r1)
-// => r0 = TFR_condset_ri(p0, r1, #i)
-def : Pat <(select (not (i1 PredRegs:$src1)), s12ImmPred:$src2,
-                   (i32 IntRegs:$src3)),
-      (i32 (TFR_condset_ri (i1 PredRegs:$src1), (i32 IntRegs:$src3),
-                           s12ImmPred:$src2))>;
+// => r0 = C2_muxir(p0, r1, #i)
+def: Pat<(select (not (i1 PredRegs:$src1)), s8ExtPred:$src2,
+                 (i32 IntRegs:$src3)),
+         (C2_muxir PredRegs:$src1, IntRegs:$src3, s8ExtPred:$src2)>;
 
 // Map from p0 = pnot(p0); r0 = mux(p0, r1, #i)
-// => r0 = TFR_condset_ir(p0, #i, r1)
-def : Pat <(select (not (i1 PredRegs:$src1)), IntRegs:$src2, s12ImmPred:$src3),
-      (i32 (TFR_condset_ir (i1 PredRegs:$src1), s12ImmPred:$src3,
-                           (i32 IntRegs:$src2)))>;
+// => r0 = C2_muxri (p0, #i, r1)
+def: Pat<(select (not (i1 PredRegs:$src1)), IntRegs:$src2, s8ExtPred:$src3),
+         (C2_muxri PredRegs:$src1, s8ExtPred:$src3, IntRegs:$src2)>;
 
 // Map from p0 = pnot(p0); if (p0) jump => if (!p0) jump.
-def : Pat <(brcond (not (i1 PredRegs:$src1)), bb:$offset),
-      (JMP_f (i1 PredRegs:$src1), bb:$offset)>;
+def: Pat<(brcond (not (i1 PredRegs:$src1)), bb:$offset),
+         (J2_jumpf PredRegs:$src1, bb:$offset)>;
 
-// Map from p2 = pnot(p2); p1 = and(p0, p2) => p1 = and(p0, !p2).
-def : Pat <(and (i1 PredRegs:$src1), (not (i1 PredRegs:$src2))),
-      (i1 (AND_pnotp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>;
+// Map from Rdd = sign_extend_inreg(Rss, i32) -> Rdd = A2_sxtw(Rss.lo).
+def: Pat<(i64 (sext_inreg (i64 DoubleRegs:$src1), i32)),
+         (A2_sxtw (LoReg DoubleRegs:$src1))>;
 
+// Map from Rdd = sign_extend_inreg(Rss, i16) -> Rdd = A2_sxtw(A2_sxth(Rss.lo)).
+def: Pat<(i64 (sext_inreg (i64 DoubleRegs:$src1), i16)),
+         (A2_sxtw (A2_sxth (LoReg DoubleRegs:$src1)))>;
 
-let AddedComplexity = 100 in
-def : Pat <(i64 (zextloadi1 (HexagonCONST32 tglobaladdr:$global))),
-      (i64 (COMBINE_rr (TFRI 0),
-                       (LDriub_indexed (CONST32_set tglobaladdr:$global), 0)))>,
-      Requires<[NoV4T]>;
-
-// Map from i1 loads to 32 bits. This assumes that the i1* is byte aligned.
-let AddedComplexity = 10 in
-def : Pat <(i32 (zextloadi1 ADDRriS11_0:$addr)),
-      (i32 (A2_and (i32 (LDrib ADDRriS11_0:$addr)), (TFRI 0x1)))>;
-
-// Map from Rdd = sign_extend_inreg(Rss, i32) -> Rdd = SXTW(Rss.lo).
-def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i32)),
-      (i64 (SXTW (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg))))>;
-
-// Map from Rdd = sign_extend_inreg(Rss, i16) -> Rdd = SXTW(SXTH(Rss.lo)).
-def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i16)),
-      (i64 (SXTW (i32 (SXTH (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
-                                                 subreg_loreg))))))>;
-
-// Map from Rdd = sign_extend_inreg(Rss, i8) -> Rdd = SXTW(SXTB(Rss.lo)).
-def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i8)),
-      (i64 (SXTW (i32 (SXTB (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
-                                                 subreg_loreg))))))>;
+// Map from Rdd = sign_extend_inreg(Rss, i8) -> Rdd = A2_sxtw(A2_sxtb(Rss.lo)).
+def: Pat<(i64 (sext_inreg (i64 DoubleRegs:$src1), i8)),
+         (A2_sxtw (A2_sxtb (LoReg DoubleRegs:$src1)))>;
 
 // We want to prevent emitting pnot's as much as possible.
-// Map brcond with an unsupported setcc to a JMP_f.
+// Map brcond with an unsupported setcc to a J2_jumpf.
 def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
                         bb:$offset),
-      (JMP_f (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
+      (J2_jumpf (C2_cmpeq (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
                 bb:$offset)>;
 
 def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), s10ImmPred:$src2)),
                         bb:$offset),
-      (JMP_f (CMPEQri (i32 IntRegs:$src1), s10ImmPred:$src2), bb:$offset)>;
+      (J2_jumpf (C2_cmpeqi (i32 IntRegs:$src1), s10ImmPred:$src2), bb:$offset)>;
 
-def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 -1))), bb:$offset),
-      (JMP_f (i1 PredRegs:$src1), bb:$offset)>;
+def: Pat<(brcond (i1 (setne (i1 PredRegs:$src1), (i1 -1))), bb:$offset),
+         (J2_jumpf PredRegs:$src1, bb:$offset)>;
 
-def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 0))), bb:$offset),
-      (JMP_t (i1 PredRegs:$src1), bb:$offset)>;
+def: Pat<(brcond (i1 (setne (i1 PredRegs:$src1), (i1 0))), bb:$offset),
+         (J2_jumpt PredRegs:$src1, bb:$offset)>;
 
 // cmp.lt(Rs, Imm) -> !cmp.ge(Rs, Imm) -> !cmp.gt(Rs, Imm-1)
-def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)),
-                        bb:$offset),
-      (JMP_f (CMPGTri (i32 IntRegs:$src1),
-                (DEC_CONST_SIGNED s8ImmPred:$src2)), bb:$offset)>;
+def: Pat<(brcond (i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)), bb:$offset),
+        (J2_jumpf (C2_cmpgti IntRegs:$src1, (DEC_CONST_SIGNED s8ImmPred:$src2)),
+                  bb:$offset)>;
 
 // cmp.lt(r0, r1) -> cmp.gt(r1, r0)
 def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
                         bb:$offset),
-      (JMP_t (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)), bb:$offset)>;
+      (J2_jumpt (C2_cmpgt (i32 IntRegs:$src2), (i32 IntRegs:$src1)), bb:$offset)>;
 
 def : Pat <(brcond (i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
                    bb:$offset),
-      (JMP_f (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)),
+      (J2_jumpf (C2_cmpgtup (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)),
                    bb:$offset)>;
 
 def : Pat <(brcond (i1 (setule (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
                         bb:$offset),
-      (JMP_f (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
+      (J2_jumpf (C2_cmpgtu (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
                 bb:$offset)>;
 
 def : Pat <(brcond (i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
                    bb:$offset),
-      (JMP_f (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
+      (J2_jumpf (C2_cmpgtup (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
                 bb:$offset)>;
 
 // Map from a 64-bit select to an emulated 64-bit mux.
 // Hexagon does not support 64-bit MUXes; so emulate with combines.
-def : Pat <(select (i1 PredRegs:$src1), (i64 DoubleRegs:$src2),
-                   (i64 DoubleRegs:$src3)),
-      (i64 (COMBINE_rr (i32 (MUX_rr (i1 PredRegs:$src1),
-                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
-                                                         subreg_hireg)),
-                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3),
-                                                         subreg_hireg)))),
-                       (i32 (MUX_rr (i1 PredRegs:$src1),
-                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
-                                                         subreg_loreg)),
-                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3),
-                                                         subreg_loreg))))))>;
+def: Pat<(select (i1 PredRegs:$src1), (i64 DoubleRegs:$src2),
+                 (i64 DoubleRegs:$src3)),
+         (A2_combinew (C2_mux PredRegs:$src1, (HiReg DoubleRegs:$src2),
+                                              (HiReg DoubleRegs:$src3)),
+                      (C2_mux PredRegs:$src1, (LoReg DoubleRegs:$src2),
+                                              (LoReg DoubleRegs:$src3)))>;
 
 // Map from a 1-bit select to logical ops.
 // From LegalizeDAG.cpp: (B1 ? B2 : B3) <=> (B1 & B2)|(!B1&B3).
-def : Pat <(select (i1 PredRegs:$src1), (i1 PredRegs:$src2),
-                   (i1 PredRegs:$src3)),
-      (OR_pp (AND_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)),
-             (AND_pp (NOT_p (i1 PredRegs:$src1)), (i1 PredRegs:$src3)))>;
+def: Pat<(select (i1 PredRegs:$src1), (i1 PredRegs:$src2), (i1 PredRegs:$src3)),
+         (C2_or (C2_and PredRegs:$src1, PredRegs:$src2),
+                (C2_and (C2_not PredRegs:$src1), PredRegs:$src3))>;
 
 // Map Pd = load(addr) -> Rs = load(addr); Pd = Rs.
 def : Pat<(i1 (load ADDRriS11_2:$addr)),
-      (i1 (TFR_PdRs (i32 (LDrib ADDRriS11_2:$addr))))>;
+      (i1 (C2_tfrrp (i32 (L2_loadrb_io AddrFI:$addr, 0))))>;
 
 // Map for truncating from 64 immediates to 32 bit immediates.
-def : Pat<(i32 (trunc (i64 DoubleRegs:$src))),
-      (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg))>;
+def: Pat<(i32 (trunc (i64 DoubleRegs:$src))),
+         (LoReg DoubleRegs:$src)>;
 
 // Map for truncating from i64 immediates to i1 bit immediates.
-def :  Pat<(i1 (trunc (i64 DoubleRegs:$src))),
-       (i1 (TFR_PdRs (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
-                                          subreg_loreg))))>;
+def: Pat<(i1 (trunc (i64 DoubleRegs:$src))),
+         (C2_tfrrp (LoReg DoubleRegs:$src))>;
 
 // Map memb(Rs) = Rdd -> memb(Rs) = Rt.
 def : Pat<(truncstorei8 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
-      (STrib ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+      (S2_storerb_io AddrFI:$addr, 0, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
                                                      subreg_loreg)))>;
 
 // Map memh(Rs) = Rdd -> memh(Rs) = Rt.
 def : Pat<(truncstorei16 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
-      (STrih ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+      (S2_storerh_io AddrFI:$addr, 0, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
                                                      subreg_loreg)))>;
 // Map memw(Rs) = Rdd -> memw(Rs) = Rt
 def : Pat<(truncstorei32 (i64  DoubleRegs:$src), ADDRriS11_0:$addr),
-      (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+      (S2_storeri_io AddrFI:$addr, 0, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
                                                      subreg_loreg)))>;
 
 // Map memw(Rs) = Rdd -> memw(Rs) = Rt.
 def : Pat<(truncstorei32 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
-      (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+      (S2_storeri_io AddrFI:$addr, 0, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
                                                      subreg_loreg)))>;
 
 // Map from i1 = constant<-1>; memw(addr) = i1 -> r0 = 1; memw(addr) = r0.
 def : Pat<(store (i1 -1), ADDRriS11_2:$addr),
-      (STrib ADDRriS11_2:$addr, (TFRI 1))>;
+      (S2_storerb_io AddrFI:$addr, 0, (A2_tfrsi 1))>;
 
 
 // Map from i1 = constant<-1>; store i1 -> r0 = 1; store r0.
 def : Pat<(store (i1 -1), ADDRriS11_2:$addr),
-      (STrib ADDRriS11_2:$addr, (TFRI 1))>;
+      (S2_storerb_io AddrFI:$addr, 0, (A2_tfrsi 1))>;
 
 // Map from memb(Rs) = Pd -> Rt = mux(Pd, #0, #1); store Rt.
 def : Pat<(store (i1 PredRegs:$src1), ADDRriS11_2:$addr),
-      (STrib ADDRriS11_2:$addr, (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0)) )>;
-
-// Map Rdd = anyext(Rs) -> Rdd = sxtw(Rs).
-// Hexagon_TODO: We can probably use combine but that will cost 2 instructions.
-// Better way to do this?
-def : Pat<(i64 (anyext (i32 IntRegs:$src1))),
-      (i64 (SXTW (i32 IntRegs:$src1)))>;
+      (S2_storerb_io AddrFI:$addr, 0, (i32 (C2_muxii (i1 PredRegs:$src1), 1, 0)) )>;
 
-// Map cmple -> cmpgt.
 // rs <= rt -> !(rs > rt).
-def : Pat<(i1 (setle (i32 IntRegs:$src1), s10ExtPred:$src2)),
-      (i1 (NOT_p (CMPGTri (i32 IntRegs:$src1), s10ExtPred:$src2)))>;
+let AddedComplexity = 30 in
+def: Pat<(i1 (setle (i32 IntRegs:$src1), s10ExtPred:$src2)),
+         (C2_not (C2_cmpgti IntRegs:$src1, s10ExtPred:$src2))>;
 
 // rs <= rt -> !(rs > rt).
 def : Pat<(i1 (setle (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
-      (i1 (NOT_p (CMPGTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>;
+      (i1 (C2_not (C2_cmpgt (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>;
 
 // Rss <= Rtt -> !(Rss > Rtt).
-def : Pat<(i1 (setle (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
-      (i1 (NOT_p (CMPGT64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>;
+def: Pat<(i1 (setle (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+         (C2_not (C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2))>;
 
 // Map cmpne -> cmpeq.
 // Hexagon_TODO: We should improve on this.
 // rs != rt -> !(rs == rt).
-def : Pat <(i1 (setne (i32 IntRegs:$src1), s10ExtPred:$src2)),
-      (i1 (NOT_p(i1 (CMPEQri (i32 IntRegs:$src1), s10ExtPred:$src2))))>;
+let AddedComplexity = 30 in
+def: Pat<(i1 (setne (i32 IntRegs:$src1), s10ExtPred:$src2)),
+         (C2_not (C2_cmpeqi IntRegs:$src1, s10ExtPred:$src2))>;
 
 // Map cmpne(Rs) -> !cmpeqe(Rs).
 // rs != rt -> !(rs == rt).
 def : Pat <(i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
-      (i1 (NOT_p (i1 (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)))))>;
+      (i1 (C2_not (i1 (C2_cmpeq (i32 IntRegs:$src1), (i32 IntRegs:$src2)))))>;
 
 // Convert setne back to xor for hexagon since we compute w/ pred registers.
-def : Pat <(i1 (setne (i1 PredRegs:$src1), (i1 PredRegs:$src2))),
-      (i1 (XOR_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>;
+def: Pat<(i1 (setne (i1 PredRegs:$src1), (i1 PredRegs:$src2))),
+         (C2_xor PredRegs:$src1, PredRegs:$src2)>;
 
 // Map cmpne(Rss) -> !cmpew(Rss).
 // rs != rt -> !(rs == rt).
-def : Pat <(i1 (setne (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
-      (i1 (NOT_p (i1 (CMPEHexagon4rr (i64 DoubleRegs:$src1),
-                                     (i64 DoubleRegs:$src2)))))>;
+def: Pat<(i1 (setne (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+         (C2_not (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2))>;
 
 // Map cmpge(Rs, Rt) -> !(cmpgt(Rs, Rt).
 // rs >= rt -> !(rt > rs).
 def : Pat <(i1 (setge (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
-      (i1 (NOT_p (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))))>;
+      (i1 (C2_not (i1 (C2_cmpgt (i32 IntRegs:$src2), (i32 IntRegs:$src1)))))>;
 
 // cmpge(Rs, Imm) -> cmpgt(Rs, Imm-1)
-def : Pat <(i1 (setge (i32 IntRegs:$src1), s8ExtPred:$src2)),
-      (i1 (CMPGTri (i32 IntRegs:$src1), (DEC_CONST_SIGNED s8ExtPred:$src2)))>;
+let AddedComplexity = 30 in
+def: Pat<(i1 (setge (i32 IntRegs:$src1), s8ExtPred:$src2)),
+         (C2_cmpgti IntRegs:$src1, (DEC_CONST_SIGNED s8ExtPred:$src2))>;
 
 // Map cmpge(Rss, Rtt) -> !cmpgt(Rtt, Rss).
 // rss >= rtt -> !(rtt > rss).
-def : Pat <(i1 (setge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
-      (i1 (NOT_p (i1 (CMPGT64rr (i64 DoubleRegs:$src2),
-                                (i64 DoubleRegs:$src1)))))>;
+def: Pat<(i1 (setge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+         (C2_not (C2_cmpgtp DoubleRegs:$src2, DoubleRegs:$src1))>;
 
 // Map cmplt(Rs, Imm) -> !cmpge(Rs, Imm).
 // !cmpge(Rs, Imm) -> !cmpgt(Rs, Imm-1).
 // rs < rt -> !(rs >= rt).
-def : Pat <(i1 (setlt (i32 IntRegs:$src1), s8ExtPred:$src2)),
-      (i1 (NOT_p (CMPGTri (i32 IntRegs:$src1), (DEC_CONST_SIGNED s8ExtPred:$src2))))>;
-
-// Map cmplt(Rs, Rt) -> cmpgt(Rt, Rs).
-// rs < rt -> rt > rs.
-// We can let assembler map it, or we can do in the compiler itself.
-def : Pat <(i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
-      (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>;
-
-// Map cmplt(Rss, Rtt) -> cmpgt(Rtt, Rss).
-// rss < rtt -> (rtt > rss).
-def : Pat <(i1 (setlt (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
-      (i1 (CMPGT64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>;
-
-// Map from cmpltu(Rs, Rd) -> cmpgtu(Rd, Rs)
-// rs < rt -> rt > rs.
-// We can let assembler map it, or we can do in the compiler itself.
-def : Pat <(i1 (setult (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
-      (i1 (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>;
-
-// Map from cmpltu(Rss, Rdd) -> cmpgtu(Rdd, Rss).
-// rs < rt -> rt > rs.
-def : Pat <(i1 (setult (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
-      (i1 (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>;
+let AddedComplexity = 30 in
+def: Pat<(i1 (setlt (i32 IntRegs:$src1), s8ExtPred:$src2)),
+         (C2_not (C2_cmpgti IntRegs:$src1, (DEC_CONST_SIGNED s8ExtPred:$src2)))>;
 
 // Generate cmpgeu(Rs, #0) -> cmpeq(Rs, Rs)
-def : Pat <(i1 (setuge (i32 IntRegs:$src1), 0)),
-      (i1 (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src1)))>;
+def: Pat<(i1 (setuge (i32 IntRegs:$src1), 0)),
+         (C2_cmpeq IntRegs:$src1, IntRegs:$src1)>;
 
 // Generate cmpgeu(Rs, #u8) -> cmpgtu(Rs, #u8 -1)
-def : Pat <(i1 (setuge (i32 IntRegs:$src1), u8ExtPred:$src2)),
-      (i1 (CMPGTUri (i32 IntRegs:$src1), (DEC_CONST_UNSIGNED u8ExtPred:$src2)))>;
+def: Pat<(i1 (setuge (i32 IntRegs:$src1), u8ExtPred:$src2)),
+         (C2_cmpgtui IntRegs:$src1, (DEC_CONST_UNSIGNED u8ExtPred:$src2))>;
 
 // Generate cmpgtu(Rs, #u9)
-def : Pat <(i1 (setugt (i32 IntRegs:$src1), u9ExtPred:$src2)),
-      (i1 (CMPGTUri (i32 IntRegs:$src1), u9ExtPred:$src2))>;
-
-// Map from Rs >= Rt -> !(Rt > Rs).
-// rs >= rt -> !(rt > rs).
-def : Pat <(i1 (setuge (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
-      (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1))))>;
+def: Pat<(i1 (setugt (i32 IntRegs:$src1), u9ExtPred:$src2)),
+         (C2_cmpgtui IntRegs:$src1, u9ExtPred:$src2)>;
 
 // Map from Rs >= Rt -> !(Rt > Rs).
 // rs >= rt -> !(rt > rs).
-def : Pat <(i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
-      (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1))))>;
-
-// Map from cmpleu(Rs, Rt) -> !cmpgtu(Rs, Rt).
-// Map from (Rs <= Rt) -> !(Rs > Rt).
-def : Pat <(i1 (setule (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
-      (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>;
+def: Pat<(i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+         (C2_not (C2_cmpgtup DoubleRegs:$src2, DoubleRegs:$src1))>;
 
 // Map from cmpleu(Rss, Rtt) -> !cmpgtu(Rss, Rtt-1).
 // Map from (Rs <= Rt) -> !(Rs > Rt).
-def : Pat <(i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
-      (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>;
+def: Pat<(i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+         (C2_not (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2))>;
 
 // Sign extends.
 // i1 -> i32
-def : Pat <(i32 (sext (i1 PredRegs:$src1))),
-      (i32 (MUX_ii (i1 PredRegs:$src1), -1, 0))>;
+def: Pat<(i32 (sext (i1 PredRegs:$src1))),
+         (C2_muxii PredRegs:$src1, -1, 0)>;
 
 // i1 -> i64
-def : Pat <(i64 (sext (i1 PredRegs:$src1))),
-      (i64 (COMBINE_rr (TFRI -1), (MUX_ii (i1 PredRegs:$src1), -1, 0)))>;
-
-// Convert sign-extended load back to load and sign extend.
-// i8 -> i64
-def:  Pat <(i64 (sextloadi8 ADDRriS11_0:$src1)),
-      (i64 (SXTW (LDrib ADDRriS11_0:$src1)))>;
-
-// Convert any-extended load back to load and sign extend.
-// i8 -> i64
-def:  Pat <(i64 (extloadi8 ADDRriS11_0:$src1)),
-      (i64 (SXTW (LDrib ADDRriS11_0:$src1)))>;
-
-// Convert sign-extended load back to load and sign extend.
-// i16 -> i64
-def:  Pat <(i64 (sextloadi16 ADDRriS11_1:$src1)),
-      (i64 (SXTW (LDrih ADDRriS11_1:$src1)))>;
+def: Pat<(i64 (sext (i1 PredRegs:$src1))),
+         (A2_combinew (A2_tfrsi -1), (C2_muxii PredRegs:$src1, -1, 0))>;
 
 // Convert sign-extended load back to load and sign extend.
 // i32 -> i64
 def:  Pat <(i64 (sextloadi32 ADDRriS11_2:$src1)),
-      (i64 (SXTW (LDriw ADDRriS11_2:$src1)))>;
-
+      (i64 (A2_sxtw (L2_loadri_io AddrFI:$src1, 0)))>;
 
 // Zero extends.
 // i1 -> i32
-def : Pat <(i32 (zext (i1 PredRegs:$src1))),
-      (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
+def: Pat<(i32 (zext (i1 PredRegs:$src1))),
+         (C2_muxii PredRegs:$src1, 1, 0)>;
 
-// i1 -> i64
-def : Pat <(i64 (zext (i1 PredRegs:$src1))),
-      (i64 (COMBINE_rr (TFRI 0), (MUX_ii (i1 PredRegs:$src1), 1, 0)))>,
-      Requires<[NoV4T]>;
+// Map from Rs = Pd to Pd = mux(Pd, #1, #0)
+def: Pat<(i32 (anyext (i1 PredRegs:$src1))),
+         (C2_muxii PredRegs:$src1, 1, 0)>;
 
-// i32 -> i64
-def : Pat <(i64 (zext (i32 IntRegs:$src1))),
-      (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>,
-      Requires<[NoV4T]>;
+// Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0))
+def: Pat<(i64 (anyext (i1 PredRegs:$src1))),
+         (A2_sxtw (C2_muxii PredRegs:$src1, 1, 0))>;
 
-// i8 -> i64
-def:  Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)),
-      (i64 (COMBINE_rr (TFRI 0), (LDriub ADDRriS11_0:$src1)))>,
-      Requires<[NoV4T]>;
+def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
+                           (i32 32))),
+               (i64 (zextloadi32 ADDRriS11_2:$srcLow)))),
+        (i64 (A2_combinew (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
+                        (L2_loadri_io AddrFI:$srcLow, 0)))>;
 
-let AddedComplexity = 20 in
-def:  Pat <(i64 (zextloadi8 (add (i32 IntRegs:$src1),
-                                s11_0ExtPred:$offset))),
-      (i64 (COMBINE_rr (TFRI 0), (LDriub_indexed IntRegs:$src1,
-                                  s11_0ExtPred:$offset)))>,
-      Requires<[NoV4T]>;
+// Multiply 64-bit unsigned and use upper result.
+def : Pat <(mulhu (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
+  (A2_addp
+    (M2_dpmpyuu_acc_s0
+      (S2_lsr_i_p
+        (A2_addp
+          (M2_dpmpyuu_acc_s0
+            (S2_lsr_i_p (M2_dpmpyuu_s0 (LoReg $src1), (LoReg $src2)), 32),
+            (HiReg $src1),
+            (LoReg $src2)),
+          (A2_combinew (A2_tfrsi 0),
+                       (LoReg (M2_dpmpyuu_s0 (LoReg $src1), (HiReg $src2))))),
+        32),
+      (HiReg $src1),
+      (HiReg $src2)),
+    (S2_lsr_i_p (M2_dpmpyuu_s0 (LoReg $src1), (HiReg $src2)), 32)
+)>;
 
-// i1 -> i64
-def:  Pat <(i64 (zextloadi1 ADDRriS11_0:$src1)),
-      (i64 (COMBINE_rr (TFRI 0), (LDriub ADDRriS11_0:$src1)))>,
-      Requires<[NoV4T]>;
+// Hexagon specific ISD nodes.
+def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+                                                 SDTCisVT<1, i32>]>;
+def SDTHexagonARGEXTEND   : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>;
 
-let AddedComplexity = 20 in
-def:  Pat <(i64 (zextloadi1 (add (i32 IntRegs:$src1),
-                                s11_0ExtPred:$offset))),
-      (i64 (COMBINE_rr (TFRI 0), (LDriub_indexed IntRegs:$src1,
-                                  s11_0ExtPred:$offset)))>,
-      Requires<[NoV4T]>;
+def Hexagon_ADJDYNALLOC : SDNode<"HexagonISD::ADJDYNALLOC",
+                                  SDTHexagonADJDYNALLOC>;
+def Hexagon_ARGEXTEND   : SDNode<"HexagonISD::ARGEXTEND", SDTHexagonARGEXTEND>;
 
-// i16 -> i64
-def:  Pat <(i64 (zextloadi16 ADDRriS11_1:$src1)),
-      (i64 (COMBINE_rr (TFRI 0), (LDriuh ADDRriS11_1:$src1)))>,
-      Requires<[NoV4T]>;
+// Needed to tag these instructions for stack layout.
+let isCodeGenOnly = 1, usesCustomInserter = 1 in
+def ADJDYNALLOC : T_Addri<s6Imm>;
 
-let AddedComplexity = 20 in
-def:  Pat <(i64 (zextloadi16 (add (i32 IntRegs:$src1),
-                                  s11_1ExtPred:$offset))),
-      (i64 (COMBINE_rr (TFRI 0), (LDriuh_indexed IntRegs:$src1,
-                                  s11_1ExtPred:$offset)))>,
-      Requires<[NoV4T]>;
+def: Pat<(Hexagon_ADJDYNALLOC I32:$Rs, s16ImmPred:$s16),
+         (ADJDYNALLOC I32:$Rs, imm:$s16)>;
 
-// i32 -> i64
-def:  Pat <(i64 (zextloadi32 ADDRriS11_2:$src1)),
-      (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>,
-      Requires<[NoV4T]>;
+let isCodeGenOnly = 1 in
+def ARGEXTEND : ALU32_rr <(outs IntRegs:$dst), (ins IntRegs:$src1),
+                "$dst = $src1",
+                [(set (i32 IntRegs:$dst),
+                      (Hexagon_ARGEXTEND (i32 IntRegs:$src1)))]>;
 
 let AddedComplexity = 100 in
-def:  Pat <(i64 (zextloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
-      (i64 (COMBINE_rr (TFRI 0), (LDriw_indexed IntRegs:$src1,
-                                  s11_2ExtPred:$offset)))>,
-      Requires<[NoV4T]>;
+def: Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND (i32 IntRegs:$src1)), i16)),
+         (i32 IntRegs:$src1)>;
 
-let AddedComplexity = 10 in
-def:  Pat <(i32 (zextloadi1 ADDRriS11_0:$src1)),
-      (i32 (LDriw ADDRriS11_0:$src1))>;
+def HexagonWrapperJT: SDNode<"HexagonISD::WrapperJT", SDTIntUnaryOp>;
 
-// Map from Rs = Pd to Pd = mux(Pd, #1, #0)
-def : Pat <(i32 (zext (i1 PredRegs:$src1))),
-      (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
+def : Pat<(HexagonWrapperJT tjumptable:$dst),
+          (i32 (CONST32_set_jt tjumptable:$dst))>;
 
-// Map from Rs = Pd to Pd = mux(Pd, #1, #0)
-def : Pat <(i32 (anyext (i1 PredRegs:$src1))),
-      (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
+// XTYPE/SHIFT
+//
+//===----------------------------------------------------------------------===//
+// Template Class
+// Shift by immediate/register and accumulate/logical
+//===----------------------------------------------------------------------===//
 
-// Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0))
-def : Pat <(i64 (anyext (i1 PredRegs:$src1))),
-      (i64 (SXTW (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))))>;
+// Rx[+-&|]=asr(Rs,#u5)
+// Rx[+-&|^]=lsr(Rs,#u5)
+// Rx[+-&|^]=asl(Rs,#u5)
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_shift_imm_acc_r <string opc1, string opc2, SDNode OpNode1,
+                         SDNode OpNode2, bits<3> majOp, bits<2> minOp>
+  : SInst_acc<(outs IntRegs:$Rx),
+              (ins IntRegs:$src1, IntRegs:$Rs, u5Imm:$u5),
+  "$Rx "#opc2#opc1#"($Rs, #$u5)",
+  [(set (i32 IntRegs:$Rx),
+         (OpNode2 (i32 IntRegs:$src1),
+                  (OpNode1 (i32 IntRegs:$Rs), u5ImmPred:$u5)))],
+  "$src1 = $Rx", S_2op_tc_2_SLOT23> {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<5> u5;
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = 0b1110;
+    let Inst{23-22} = majOp{2-1};
+    let Inst{13} = 0b0;
+    let Inst{7} = majOp{0};
+    let Inst{6-5} = minOp;
+    let Inst{4-0} = Rx;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = u5;
+  }
 
+// Rx[+-&|]=asr(Rs,Rt)
+// Rx[+-&|^]=lsr(Rs,Rt)
+// Rx[+-&|^]=asl(Rs,Rt)
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_shift_reg_acc_r <string opc1, string opc2, SDNode OpNode1,
+                         SDNode OpNode2, bits<2> majOp, bits<2> minOp>
+  : SInst_acc<(outs IntRegs:$Rx),
+              (ins IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rx "#opc2#opc1#"($Rs, $Rt)",
+  [(set (i32 IntRegs:$Rx),
+         (OpNode2 (i32 IntRegs:$src1),
+                  (OpNode1 (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))],
+  "$src1 = $Rx", S_3op_tc_2_SLOT23 > {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1100;
 
-let AddedComplexity = 100 in
-def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
-                           (i32 32))),
-               (i64 (zextloadi32 (i32 (add IntRegs:$src2,
-                                         s11_2ExtPred:$offset2)))))),
-        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
-                        (LDriw_indexed IntRegs:$src2,
-                                       s11_2ExtPred:$offset2)))>;
+    let Inst{27-24} = 0b1100;
+    let Inst{23-22} = majOp;
+    let Inst{7-6} = minOp;
+    let Inst{4-0} = Rx;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+  }
 
-def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
-                           (i32 32))),
-               (i64 (zextloadi32 ADDRriS11_2:$srcLow)))),
-        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
-                        (LDriw ADDRriS11_2:$srcLow)))>;
+// Rxx[+-&|]=asr(Rss,#u6)
+// Rxx[+-&|^]=lsr(Rss,#u6)
+// Rxx[+-&|^]=asl(Rss,#u6)
+
+class T_shift_imm_acc_p <string opc1, string opc2, SDNode OpNode1,
+                         SDNode OpNode2, bits<3> majOp, bits<2> minOp>
+  : SInst_acc<(outs DoubleRegs:$Rxx),
+              (ins DoubleRegs:$src1, DoubleRegs:$Rss, u6Imm:$u6),
+  "$Rxx "#opc2#opc1#"($Rss, #$u6)",
+  [(set (i64 DoubleRegs:$Rxx),
+        (OpNode2 (i64 DoubleRegs:$src1),
+                 (OpNode1 (i64 DoubleRegs:$Rss), u6ImmPred:$u6)))],
+  "$src1 = $Rxx", S_2op_tc_2_SLOT23> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<6> u6;
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = 0b0010;
+    let Inst{23-22} = majOp{2-1};
+    let Inst{7} = majOp{0};
+    let Inst{6-5} = minOp;
+    let Inst{4-0} = Rxx;
+    let Inst{20-16} = Rss;
+    let Inst{13-8} = u6;
+  }
 
-def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
-                           (i32 32))),
-               (i64 (zext (i32 IntRegs:$srcLow))))),
-        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
-                        IntRegs:$srcLow))>;
 
-let AddedComplexity = 100 in
-def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
-                           (i32 32))),
-               (i64 (zextloadi32 (i32 (add IntRegs:$src2,
-                                         s11_2ExtPred:$offset2)))))),
-        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
-                        (LDriw_indexed IntRegs:$src2,
-                                       s11_2ExtPred:$offset2)))>;
+// Rxx[+-&|]=asr(Rss,Rt)
+// Rxx[+-&|^]=lsr(Rss,Rt)
+// Rxx[+-&|^]=asl(Rss,Rt)
+// Rxx[+-&|^]=lsl(Rss,Rt)
+
+class T_shift_reg_acc_p <string opc1, string opc2, SDNode OpNode1,
+                         SDNode OpNode2, bits<3> majOp, bits<2> minOp>
+  : SInst_acc<(outs DoubleRegs:$Rxx),
+              (ins DoubleRegs:$src1, DoubleRegs:$Rss, IntRegs:$Rt),
+  "$Rxx "#opc2#opc1#"($Rss, $Rt)",
+  [(set (i64 DoubleRegs:$Rxx),
+        (OpNode2 (i64 DoubleRegs:$src1),
+                 (OpNode1 (i64 DoubleRegs:$Rss), (i32 IntRegs:$Rt))))],
+  "$src1 = $Rxx", S_3op_tc_2_SLOT23> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b1011;
+    let Inst{23-21} = majOp;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rt;
+    let Inst{7-6} = minOp;
+    let Inst{4-0} = Rxx;
+  }
 
-def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
-                           (i32 32))),
-               (i64 (zextloadi32 ADDRriS11_2:$srcLow)))),
-        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
-                        (LDriw ADDRriS11_2:$srcLow)))>;
+//===----------------------------------------------------------------------===//
+// Multi-class for the shift instructions with logical/arithmetic operators.
+//===----------------------------------------------------------------------===//
 
-def: Pat<(i64 (or (i64 (shl (i64 DoubleRegs:$srcHigh),
-                           (i32 32))),
-               (i64 (zext (i32 IntRegs:$srcLow))))),
-        (i64 (COMBINE_rr (EXTRACT_SUBREG (i64 DoubleRegs:$srcHigh), subreg_loreg),
-                        IntRegs:$srcLow))>;
-
-// Any extended 64-bit load.
-// anyext i32 -> i64
-def:  Pat <(i64 (extloadi32 ADDRriS11_2:$src1)),
-      (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>,
-      Requires<[NoV4T]>;
-
-// When there is an offset we should prefer the pattern below over the pattern above.
-// The complexity of the above is 13 (gleaned from HexagonGenDAGIsel.inc)
-// So this complexity below is comfortably higher to allow for choosing the below.
-// If this is not done then we generate addresses such as
-// ********************************************
-//        r1 = add (r0, #4)
-//        r1 = memw(r1 + #0)
-//  instead of
-//        r1 = memw(r0 + #4)
-// ********************************************
+multiclass xtype_imm_base<string OpcStr1, string OpcStr2, SDNode OpNode1,
+                         SDNode OpNode2, bits<3> majOp, bits<2> minOp > {
+  def _i_r#NAME : T_shift_imm_acc_r< OpcStr1, OpcStr2, OpNode1,
+                                     OpNode2, majOp, minOp >;
+  def _i_p#NAME : T_shift_imm_acc_p< OpcStr1, OpcStr2, OpNode1,
+                                     OpNode2, majOp, minOp >;
+}
+
+multiclass xtype_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> {
+  let AddedComplexity = 100 in
+  defm _acc  : xtype_imm_base< opc1, "+= ", OpNode, add, 0b001, minOp>;
+
+  defm _nac  : xtype_imm_base< opc1, "-= ", OpNode, sub, 0b000, minOp>;
+  defm _and  : xtype_imm_base< opc1, "&= ", OpNode, and, 0b010, minOp>;
+  defm _or   : xtype_imm_base< opc1, "|= ", OpNode,  or, 0b011, minOp>;
+}
+
+multiclass xtype_xor_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> {
 let AddedComplexity = 100 in
-def:  Pat <(i64 (extloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
-      (i64 (COMBINE_rr (TFRI 0), (LDriw_indexed IntRegs:$src1,
-                                  s11_2ExtPred:$offset)))>,
-      Requires<[NoV4T]>;
+  defm _xacc  : xtype_imm_base< opc1, "^= ", OpNode, xor, 0b100, minOp>;
+}
 
-// anyext i16 -> i64.
-def:  Pat <(i64 (extloadi16 ADDRriS11_2:$src1)),
-      (i64 (COMBINE_rr (TFRI 0), (LDrih ADDRriS11_2:$src1)))>,
-      Requires<[NoV4T]>;
+defm S2_asr : xtype_imm_acc<"asr", sra, 0b00>;
 
-let AddedComplexity = 20 in
-def:  Pat <(i64 (extloadi16 (add (i32 IntRegs:$src1),
-                                  s11_1ExtPred:$offset))),
-      (i64 (COMBINE_rr (TFRI 0), (LDrih_indexed IntRegs:$src1,
-                                  s11_1ExtPred:$offset)))>,
-      Requires<[NoV4T]>;
+defm S2_lsr : xtype_imm_acc<"lsr", srl, 0b01>,
+              xtype_xor_imm_acc<"lsr", srl, 0b01>;
 
-// Map from Rdd = zxtw(Rs) -> Rdd = combine(0, Rs).
-def : Pat<(i64 (zext (i32 IntRegs:$src1))),
-      (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>,
-      Requires<[NoV4T]>;
+defm S2_asl : xtype_imm_acc<"asl", shl, 0b10>,
+              xtype_xor_imm_acc<"asl", shl, 0b10>;
 
-// Multiply 64-bit unsigned and use upper result.
-def : Pat <(mulhu (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
-      (i64
-       (MPYU64_acc
-        (i64
-         (COMBINE_rr
-          (TFRI 0),
-           (i32
-            (EXTRACT_SUBREG
-             (i64
-              (LSRd_ri
-               (i64
-                (MPYU64_acc
-                 (i64
-                  (MPYU64_acc
-                   (i64
-                    (COMBINE_rr (TFRI 0),
-                     (i32
-                      (EXTRACT_SUBREG
-                       (i64
-                        (LSRd_ri
-                         (i64
-                          (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
-                                                       subreg_loreg)),
-                                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
-                                                       subreg_loreg)))), 32)),
-                       subreg_loreg)))),
-                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
-                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))),
-                 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
-                 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))),
-               32)), subreg_loreg)))),
-        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
-        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>;
-
-// Multiply 64-bit signed and use upper result.
-def : Pat <(mulhs (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
-      (i64
-       (MPY64_acc
-        (i64
-         (COMBINE_rr (TFRI 0),
-          (i32
-           (EXTRACT_SUBREG
-            (i64
-             (LSRd_ri
-              (i64
-               (MPY64_acc
-                (i64
-                 (MPY64_acc
-                  (i64
-                   (COMBINE_rr (TFRI 0),
-                    (i32
-                     (EXTRACT_SUBREG
-                      (i64
-                       (LSRd_ri
-                        (i64
-                         (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
-                                                      subreg_loreg)),
-                                 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
-                                                      subreg_loreg)))), 32)),
-                      subreg_loreg)))),
-                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
-                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))),
-                (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
-                (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))),
-              32)), subreg_loreg)))),
-        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
-        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>;
+multiclass xtype_reg_acc_r<string opc1, SDNode OpNode, bits<2>minOp> {
+  let AddedComplexity = 100 in
+  def _acc : T_shift_reg_acc_r <opc1, "+= ", OpNode, add, 0b11, minOp>;
 
-// Hexagon specific ISD nodes.
-//def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>]>;
-def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2,
-                                  [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-def Hexagon_ADJDYNALLOC : SDNode<"HexagonISD::ADJDYNALLOC",
-                                  SDTHexagonADJDYNALLOC>;
-// Needed to tag these instructions for stack layout.
-let usesCustomInserter = 1 in
-def ADJDYNALLOC : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1,
-                                                     s16Imm:$src2),
-                  "$dst = add($src1, #$src2)",
-                  [(set (i32 IntRegs:$dst),
-                        (Hexagon_ADJDYNALLOC (i32 IntRegs:$src1),
-                                             s16ImmPred:$src2))]>;
+  def _nac : T_shift_reg_acc_r <opc1, "-= ", OpNode, sub, 0b10, minOp>;
+  def _and : T_shift_reg_acc_r <opc1, "&= ", OpNode, and, 0b01, minOp>;
+  def _or  : T_shift_reg_acc_r <opc1, "|= ", OpNode,  or, 0b00, minOp>;
+}
 
-def SDTHexagonARGEXTEND : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>;
-def Hexagon_ARGEXTEND : SDNode<"HexagonISD::ARGEXTEND", SDTHexagonARGEXTEND>;
-def ARGEXTEND : ALU32_rr <(outs IntRegs:$dst), (ins IntRegs:$src1),
-                "$dst = $src1",
-                [(set (i32 IntRegs:$dst),
-                      (Hexagon_ARGEXTEND (i32 IntRegs:$src1)))]>;
+multiclass xtype_reg_acc_p<string opc1, SDNode OpNode, bits<2>minOp> {
+  let AddedComplexity = 100 in
+  def _acc : T_shift_reg_acc_p <opc1, "+= ", OpNode, add, 0b110, minOp>;
 
-let AddedComplexity = 100 in
-def : Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND (i32 IntRegs:$src1)), i16)),
-      (COPY (i32 IntRegs:$src1))>;
+  def _nac : T_shift_reg_acc_p <opc1, "-= ", OpNode, sub, 0b100, minOp>;
+  def _and : T_shift_reg_acc_p <opc1, "&= ", OpNode, and, 0b010, minOp>;
+  def _or  : T_shift_reg_acc_p <opc1, "|= ", OpNode,  or, 0b000, minOp>;
+  def _xor : T_shift_reg_acc_p <opc1, "^= ", OpNode, xor, 0b011, minOp>;
+}
 
-def HexagonWrapperJT: SDNode<"HexagonISD::WrapperJT", SDTIntUnaryOp>;
+multiclass xtype_reg_acc<string OpcStr, SDNode OpNode, bits<2> minOp > {
+  defm _r_r : xtype_reg_acc_r <OpcStr, OpNode, minOp>;
+  defm _r_p : xtype_reg_acc_p <OpcStr, OpNode, minOp>;
+}
 
-def : Pat<(HexagonWrapperJT tjumptable:$dst),
-          (i32 (CONST32_set_jt tjumptable:$dst))>;
+defm S2_asl : xtype_reg_acc<"asl", shl, 0b10>;
+defm S2_asr : xtype_reg_acc<"asr", sra, 0b00>;
+defm S2_lsr : xtype_reg_acc<"lsr", srl, 0b01>;
+defm S2_lsl : xtype_reg_acc<"lsl", shl, 0b11>;
 
-// XTYPE/SHIFT
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_S3op_1 <string mnemonic, RegisterClass RC, bits<2> MajOp, bits<3> MinOp,
+                bit SwapOps, bit isSat = 0, bit isRnd = 0, bit hasShift = 0>
+  : SInst <(outs RC:$dst),
+           (ins DoubleRegs:$src1, DoubleRegs:$src2),
+  "$dst = "#mnemonic#"($src1, $src2)"#!if(isRnd, ":rnd", "")
+                                     #!if(hasShift,":>>1","")
+                                     #!if(isSat, ":sat", ""),
+  [], "", S_3op_tc_2_SLOT23 > {
+    bits<5> dst;
+    bits<5> src1;
+    bits<5> src2;
 
-// Multi-class for logical operators :
-// Shift by immediate/register and accumulate/logical
-multiclass xtype_imm<string OpcStr, SDNode OpNode1, SDNode OpNode2> {
-  def _ri : SInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2, u5Imm:$src3),
-            !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")),
-            [(set (i32 IntRegs:$dst),
-                  (OpNode2 (i32 IntRegs:$src1),
-                           (OpNode1 (i32 IntRegs:$src2),
-                                    u5ImmPred:$src3)))],
-            "$src1 = $dst">;
-
-  def d_ri : SInst_acc<(outs DoubleRegs:$dst),
-            (ins DoubleRegs:$src1, DoubleRegs:$src2, u6Imm:$src3),
-            !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")),
-            [(set (i64 DoubleRegs:$dst), (OpNode2 (i64 DoubleRegs:$src1),
-                          (OpNode1 (i64 DoubleRegs:$src2), u6ImmPred:$src3)))],
-            "$src1 = $dst">;
-}
-
-// Multi-class for logical operators :
-// Shift by register and accumulate/logical (32/64 bits)
-multiclass xtype_reg<string OpcStr, SDNode OpNode1, SDNode OpNode2> {
-  def _rr : SInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
-            !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")),
-            [(set (i32 IntRegs:$dst),
-                  (OpNode2 (i32 IntRegs:$src1),
-                           (OpNode1 (i32 IntRegs:$src2),
-                                    (i32 IntRegs:$src3))))],
-            "$src1 = $dst">;
+    let IClass = 0b1100;
 
-  def d_rr : SInst_acc<(outs DoubleRegs:$dst),
-            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
-            !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")),
-            [(set (i64 DoubleRegs:$dst),
-                  (OpNode2 (i64 DoubleRegs:$src1),
-                           (OpNode1 (i64 DoubleRegs:$src2),
-                                    (i32 IntRegs:$src3))))],
-            "$src1 = $dst">;
+    let Inst{27-24} = 0b0001;
+    let Inst{23-22} = MajOp;
+    let Inst{20-16} = !if (SwapOps, src2, src1);
+    let Inst{12-8}  = !if (SwapOps, src1, src2);
+    let Inst{7-5}   = MinOp;
+    let Inst{4-0}   = dst;
+  }
 
-}
+class T_S3op_64 <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit SwapOps,
+                 bit isSat = 0, bit isRnd = 0, bit hasShift = 0 >
+  : T_S3op_1 <mnemonic, DoubleRegs, MajOp, MinOp, SwapOps,
+              isSat, isRnd, hasShift>;
 
-multiclass basic_xtype_imm<string OpcStr, SDNode OpNode> {
-let AddedComplexity = 100 in
-  defm _ADD : xtype_imm< !strconcat("+= ", OpcStr), OpNode, add>;
-  defm _SUB : xtype_imm< !strconcat("-= ", OpcStr), OpNode, sub>;
-  defm _AND : xtype_imm< !strconcat("&= ", OpcStr), OpNode, and>;
-  defm _OR  : xtype_imm< !strconcat("|= ", OpcStr), OpNode, or>;
+let Itinerary = S_3op_tc_1_SLOT23 in {
+  def S2_shuffeb : T_S3op_64 < "shuffeb", 0b00, 0b010, 0>;
+  def S2_shuffeh : T_S3op_64 < "shuffeh", 0b00, 0b110, 0>;
+  def S2_shuffob : T_S3op_64 < "shuffob", 0b00, 0b100, 1>;
+  def S2_shuffoh : T_S3op_64 < "shuffoh", 0b10, 0b000, 1>;
+
+  def S2_vtrunewh : T_S3op_64 < "vtrunewh", 0b10, 0b010, 0>;
+  def S2_vtrunowh : T_S3op_64 < "vtrunowh", 0b10, 0b100, 0>;
 }
 
-multiclass basic_xtype_reg<string OpcStr, SDNode OpNode> {
-let AddedComplexity = 100 in
-  defm _ADD : xtype_reg< !strconcat("+= ", OpcStr), OpNode, add>;
-  defm _SUB : xtype_reg< !strconcat("-= ", OpcStr), OpNode, sub>;
-  defm _AND : xtype_reg< !strconcat("&= ", OpcStr), OpNode, and>;
-  defm _OR  : xtype_reg< !strconcat("|= ", OpcStr), OpNode, or>;
+def S2_lfsp : T_S3op_64 < "lfs", 0b10, 0b110, 0>;
+
+let hasSideEffects = 0 in
+class T_S3op_2 <string mnemonic, bits<3> MajOp, bit SwapOps>
+  : SInst < (outs DoubleRegs:$Rdd),
+            (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
+  "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu)",
+  [], "", S_3op_tc_1_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+    bits<2> Pu;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b0010;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = !if (SwapOps, Rtt, Rss);
+    let Inst{12-8} = !if (SwapOps, Rss, Rtt);
+    let Inst{6-5} = Pu;
+    let Inst{4-0} = Rdd;
+  }
+
+def S2_valignrb  : T_S3op_2 < "valignb",  0b000, 1>;
+def S2_vsplicerb : T_S3op_2 < "vspliceb", 0b100, 0>;
+
+//===----------------------------------------------------------------------===//
+// Template class used by vector shift, vector rotate, vector neg,
+// 32-bit shift, 64-bit shifts, etc.
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in
+class T_S3op_3 <string mnemonic, RegisterClass RC, bits<2> MajOp,
+                 bits<2> MinOp, bit isSat = 0, list<dag> pattern = [] >
+  : SInst <(outs RC:$dst),
+           (ins RC:$src1, IntRegs:$src2),
+  "$dst = "#mnemonic#"($src1, $src2)"#!if(isSat, ":sat", ""),
+  pattern, "", S_3op_tc_1_SLOT23> {
+    bits<5> dst;
+    bits<5> src1;
+    bits<5> src2;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b0110, 0b0011);
+    let Inst{23-22} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{12-8} = src2;
+    let Inst{7-6} = MinOp;
+    let Inst{4-0} = dst;
+  }
+
+let hasNewValue = 1 in
+class T_S3op_shift32 <string mnemonic, SDNode OpNode, bits<2> MinOp>
+  : T_S3op_3 <mnemonic, IntRegs, 0b01, MinOp, 0,
+    [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$src1),
+                                      (i32 IntRegs:$src2)))]>;
+
+let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in
+class T_S3op_shift32_Sat <string mnemonic, bits<2> MinOp>
+  : T_S3op_3 <mnemonic, IntRegs, 0b00, MinOp, 1, []>;
+
+
+class T_S3op_shift64 <string mnemonic, SDNode OpNode, bits<2> MinOp>
+  : T_S3op_3 <mnemonic, DoubleRegs, 0b10, MinOp, 0,
+    [(set (i64 DoubleRegs:$dst), (OpNode (i64 DoubleRegs:$src1),
+                                         (i32 IntRegs:$src2)))]>;
+
+
+class T_S3op_shiftVect <string mnemonic, bits<2> MajOp, bits<2> MinOp>
+  : T_S3op_3 <mnemonic, DoubleRegs, MajOp, MinOp, 0, []>;
+
+
+// Shift by register
+// Rdd=[asr|lsr|asl|lsl](Rss,Rt)
+
+def S2_asr_r_p : T_S3op_shift64 < "asr", sra, 0b00>;
+def S2_lsr_r_p : T_S3op_shift64 < "lsr", srl, 0b01>;
+def S2_asl_r_p : T_S3op_shift64 < "asl", shl, 0b10>;
+def S2_lsl_r_p : T_S3op_shift64 < "lsl", shl, 0b11>;
+
+// Rd=[asr|lsr|asl|lsl](Rs,Rt)
+
+def S2_asr_r_r : T_S3op_shift32<"asr", sra, 0b00>;
+def S2_lsr_r_r : T_S3op_shift32<"lsr", srl, 0b01>;
+def S2_asl_r_r : T_S3op_shift32<"asl", shl, 0b10>;
+def S2_lsl_r_r : T_S3op_shift32<"lsl", shl, 0b11>;
+
+// Shift by register with saturation
+// Rd=asr(Rs,Rt):sat
+// Rd=asl(Rs,Rt):sat
+
+let Defs = [USR_OVF] in {
+  def S2_asr_r_r_sat : T_S3op_shift32_Sat<"asr", 0b00>;
+  def S2_asl_r_r_sat : T_S3op_shift32_Sat<"asl", 0b10>;
 }
 
-multiclass xtype_xor_imm<string OpcStr, SDNode OpNode> {
-let AddedComplexity = 100 in
-  defm _XOR : xtype_imm< !strconcat("^= ", OpcStr), OpNode, xor>;
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_S3op_8 <string opc, bits<3> MinOp, bit isSat, bit isRnd, bit hasShift, bit hasSplat = 0>
+  : SInst < (outs IntRegs:$Rd),
+            (ins DoubleRegs:$Rss, IntRegs:$Rt),
+  "$Rd = "#opc#"($Rss, $Rt"#!if(hasSplat, "*", "")#")"
+                           #!if(hasShift, ":<<1", "")
+                           #!if(isRnd, ":rnd", "")
+                           #!if(isSat, ":sat", ""),
+  [], "", S_3op_tc_1_SLOT23 > {
+    bits<5> Rd;
+    bits<5> Rss;
+    bits<5> Rt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b0101;
+    let Inst{20-16} = Rss;
+    let Inst{12-8}  = Rt;
+    let Inst{7-5}   = MinOp;
+    let Inst{4-0}   = Rd;
+  }
+
+def S2_asr_r_svw_trun : T_S3op_8<"vasrw", 0b010, 0, 0, 0>;
+
+let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
+def S2_vcrotate : T_S3op_shiftVect < "vcrotate", 0b11, 0b00>;
+
+let hasSideEffects = 0 in
+class T_S3op_7 <string mnemonic, bit MajOp >
+  : SInst <(outs DoubleRegs:$Rdd),
+           (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, u3Imm:$u3),
+  "$Rdd = "#mnemonic#"($Rss, $Rtt, #$u3)" ,
+  [], "", S_3op_tc_1_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+    bits<3> u3;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b0000;
+    let Inst{23}    = MajOp;
+    let Inst{20-16} = !if(MajOp, Rss, Rtt);
+    let Inst{12-8}  =  !if(MajOp, Rtt, Rss);
+    let Inst{7-5}   = u3;
+    let Inst{4-0}   = Rdd;
+  }
+
+def S2_valignib  : T_S3op_7 < "valignb", 0>;
+def S2_vspliceib : T_S3op_7 < "vspliceb", 1>;
+
+//===----------------------------------------------------------------------===//
+// Template class for 'insert bitfield' instructions
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_S3op_insert <string mnemonic, RegisterClass RC>
+  : SInst <(outs RC:$dst),
+           (ins RC:$src1, RC:$src2, DoubleRegs:$src3),
+  "$dst = "#mnemonic#"($src2, $src3)" ,
+  [], "$src1 = $dst", S_3op_tc_1_SLOT23 > {
+    bits<5> dst;
+    bits<5> src2;
+    bits<5> src3;
+
+    let IClass = 0b1100;
+
+    let Inst{27-26} = 0b10;
+    let Inst{25-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b00, 0b10);
+    let Inst{23}    = 0b0;
+    let Inst{20-16} = src2;
+    let Inst{12-8}  = src3;
+    let Inst{4-0}   = dst;
+  }
+
+let hasSideEffects = 0 in
+class T_S2op_insert <bits<4> RegTyBits, RegisterClass RC, Operand ImmOp>
+  : SInst <(outs RC:$dst), (ins RC:$dst2, RC:$src1, ImmOp:$src2, ImmOp:$src3),
+  "$dst = insert($src1, #$src2, #$src3)",
+  [], "$dst2 = $dst", S_2op_tc_2_SLOT23> {
+    bits<5> dst;
+    bits<5> src1;
+    bits<6> src2;
+    bits<6> src3;
+    bit bit23;
+    bit bit13;
+    string ImmOpStr = !cast<string>(ImmOp);
+
+    let bit23 = !if (!eq(ImmOpStr, "u6Imm"), src3{5}, 0);
+    let bit13 = !if (!eq(ImmOpStr, "u6Imm"), src2{5}, 0);
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = RegTyBits;
+    let Inst{23}    = bit23;
+    let Inst{22-21} = src3{4-3};
+    let Inst{20-16} = src1;
+    let Inst{13}    = bit13;
+    let Inst{12-8}  = src2{4-0};
+    let Inst{7-5}   = src3{2-0};
+    let Inst{4-0}   = dst;
+  }
+
+// Rx=insert(Rs,Rtt)
+// Rx=insert(Rs,#u5,#U5)
+let hasNewValue = 1 in {
+  def S2_insert_rp : T_S3op_insert <"insert", IntRegs>;
+  def S2_insert    : T_S2op_insert <0b1111, IntRegs, u5Imm>;
 }
 
-defm ASL : basic_xtype_imm<"asl", shl>, basic_xtype_reg<"asl", shl>,
-           xtype_xor_imm<"asl", shl>;
+// Rxx=insert(Rss,Rtt)
+// Rxx=insert(Rss,#u6,#U6)
+def S2_insertp_rp : T_S3op_insert<"insert", DoubleRegs>;
+def S2_insertp    : T_S2op_insert <0b0011, DoubleRegs, u6Imm>;
 
-defm LSR : basic_xtype_imm<"lsr", srl>, basic_xtype_reg<"lsr", srl>,
-           xtype_xor_imm<"lsr", srl>;
+//===----------------------------------------------------------------------===//
+// Template class for 'extract bitfield' instructions
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_S3op_extract <string mnemonic, bits<2> MinOp>
+  : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
+  "$Rd = "#mnemonic#"($Rs, $Rtt)",
+  [], "", S_3op_tc_2_SLOT23 > {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<5> Rtt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-22} = 0b100100;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Rtt;
+    let Inst{7-6}   = MinOp;
+    let Inst{4-0}   = Rd;
+  }
+
+let hasSideEffects = 0 in
+class T_S2op_extract <string mnemonic, bits<4> RegTyBits,
+                      RegisterClass RC, Operand ImmOp>
+  : SInst <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2, ImmOp:$src3),
+  "$dst = "#mnemonic#"($src1, #$src2, #$src3)",
+  [], "", S_2op_tc_2_SLOT23> {
+    bits<5> dst;
+    bits<5> src1;
+    bits<6> src2;
+    bits<6> src3;
+    bit bit23;
+    bit bit13;
+    string ImmOpStr = !cast<string>(ImmOp);
+
+    let bit23 = !if (!eq(ImmOpStr, "u6Imm"), src3{5},
+                !if (!eq(mnemonic, "extractu"), 0, 1));
+
+    let bit13 = !if (!eq(ImmOpStr, "u6Imm"), src2{5}, 0);
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = RegTyBits;
+    let Inst{23}    = bit23;
+    let Inst{22-21} = src3{4-3};
+    let Inst{20-16} = src1;
+    let Inst{13}    = bit13;
+    let Inst{12-8}  = src2{4-0};
+    let Inst{7-5}   = src3{2-0};
+    let Inst{4-0}   = dst;
+  }
 
-defm ASR : basic_xtype_imm<"asr", sra>, basic_xtype_reg<"asr", sra>;
-defm LSL : basic_xtype_reg<"lsl", shl>;
+// Extract bitfield
+
+// Rdd=extractu(Rss,Rtt)
+// Rdd=extractu(Rss,#u6,#U6)
+def S2_extractup_rp : T_S3op_64 < "extractu", 0b00, 0b000, 0>;
+def S2_extractup    : T_S2op_extract <"extractu", 0b0001, DoubleRegs, u6Imm>;
+
+// Rd=extractu(Rs,Rtt)
+// Rd=extractu(Rs,#u5,#U5)
+let hasNewValue = 1 in {
+  def S2_extractu_rp : T_S3op_extract<"extractu", 0b00>;
+  def S2_extractu    : T_S2op_extract <"extractu", 0b1101, IntRegs, u5Imm>;
+}
 
 // Change the sign of the immediate for Rd=-mpyi(Rs,#u8)
-def : Pat <(mul (i32 IntRegs:$src1), (ineg n8ImmPred:$src2)),
-      (i32 (MPYI_rin (i32 IntRegs:$src1), u8ImmPred:$src2))>;
+def: Pat<(mul (i32 IntRegs:$src1), (ineg n8ImmPred:$src2)),
+         (M2_mpysin IntRegs:$src1, u8ImmPred:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// :raw for of tableindx[bdhw] insns
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class tableidxRaw<string OpStr, bits<2>MinOp>
+  : SInst <(outs IntRegs:$Rx),
+           (ins IntRegs:$_dst_, IntRegs:$Rs, u4Imm:$u4, s6Imm:$S6),
+           "$Rx = "#OpStr#"($Rs, #$u4, #$S6):raw",
+    [], "$Rx = $_dst_" > {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<4> u4;
+    bits<6> S6;
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = 0b0111;
+    let Inst{23-22} = MinOp;
+    let Inst{21}    = u4{3};
+    let Inst{20-16} = Rs;
+    let Inst{13-8}  = S6;
+    let Inst{7-5}   = u4{2-0};
+    let Inst{4-0}   = Rx;
+  }
+
+def S2_tableidxb : tableidxRaw<"tableidxb", 0b00>;
+def S2_tableidxh : tableidxRaw<"tableidxh", 0b01>;
+def S2_tableidxw : tableidxRaw<"tableidxw", 0b10>;
+def S2_tableidxd : tableidxRaw<"tableidxd", 0b11>;
 
 //===----------------------------------------------------------------------===//
 // V3 Instructions +
@@ -2930,3 +5756,9 @@ include "HexagonInstrInfoV5.td"
 //===----------------------------------------------------------------------===//
 // V5 Instructions -
 //===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU32/64/Vector +
+//===----------------------------------------------------------------------===///
+
+include "HexagonInstrInfoVector.td"
+\ No newline at end of file
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV3.td b/lib/Target/Hexagon/HexagonInstrInfoV3.td
index 7e75554..84d035d 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV3.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV3.td
@@ -21,13 +21,52 @@ def callv3nr : SDNode<"HexagonISD::CALLv3nr", SDT_SPCall,
 // J +
 //===----------------------------------------------------------------------===//
 // Call subroutine.
-let isCall = 1, neverHasSideEffects = 1,
-  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31,
-                P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
-  def CALLv3 : JInst<(outs), (ins calltarget:$dst),
-             "call $dst", []>, Requires<[HasV3T]>;
+let isCall = 1, hasSideEffects = 1, Defs = VolatileV3.Regs, isPredicable = 1,
+    isExtended = 0, isExtendable = 1, opExtendable = 0,
+    isExtentSigned = 1, opExtentBits = 24, opExtentAlign = 2 in
+class T_Call<string ExtStr>
+  : JInst<(outs), (ins calltarget:$dst),
+      "call " # ExtStr # "$dst", [], "", J_tc_2early_SLOT23> {
+  let BaseOpcode = "call";
+  bits<24> dst;
+
+  let IClass = 0b0101;
+  let Inst{27-25} = 0b101;
+  let Inst{24-16,13-1} = dst{23-2};
+  let Inst{0} = 0b0;
+}
+
+let isCall = 1, hasSideEffects = 1, Defs = VolatileV3.Regs, isPredicated = 1,
+    isExtended = 0, isExtendable = 1, opExtendable = 1,
+    isExtentSigned = 1, opExtentBits = 17, opExtentAlign = 2 in
+class T_CallPred<bit IfTrue, string ExtStr>
+  : JInst<(outs), (ins PredRegs:$Pu, calltarget:$dst),
+      CondStr<"$Pu", IfTrue, 0>.S # "call " # ExtStr # "$dst",
+      [], "", J_tc_2early_SLOT23> {
+  let BaseOpcode = "call";
+  let isPredicatedFalse = !if(IfTrue,0,1);
+  bits<2> Pu;
+  bits<17> dst;
+
+  let IClass = 0b0101;
+  let Inst{27-24} = 0b1101;
+  let Inst{23-22,20-16,13,7-1} = dst{16-2};
+  let Inst{21} = !if(IfTrue,0,1);
+  let Inst{11} = 0b0;
+  let Inst{9-8} = Pu;
+}
+
+multiclass T_Calls<string ExtStr> {
+  def NAME : T_Call<ExtStr>;
+  def t    : T_CallPred<1, ExtStr>;
+  def f    : T_CallPred<0, ExtStr>;
 }
 
+defm J2_call: T_Calls<"">, PredRel;
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1, Defs = VolatileV3.Regs in
+def CALLv3nr :  T_Call<"">, PredRel;
+
 //===----------------------------------------------------------------------===//
 // J -
 //===----------------------------------------------------------------------===//
@@ -37,13 +76,10 @@ let isCall = 1, neverHasSideEffects = 1,
 // JR +
 //===----------------------------------------------------------------------===//
 // Call subroutine from register.
-let isCall = 1, neverHasSideEffects = 1,
-  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31,
-                P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
-  def CALLRv3 : JRInst<(outs), (ins IntRegs:$dst),
-              "callr $dst",
-              []>, Requires<[HasV3TOnly]>;
- }
+
+let isCodeGenOnly = 1, Defs = VolatileV3.Regs in {
+  def CALLRv3nr : JUMPR_MISC_CALLR<0, 1>; // Call, no return.
+}
 
 //===----------------------------------------------------------------------===//
 // JR -
@@ -53,27 +89,63 @@ let isCall = 1, neverHasSideEffects = 1,
 // ALU64/ALU +
 //===----------------------------------------------------------------------===//
 
-let AddedComplexity = 200 in
-def MAXw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                    DoubleRegs:$src2),
-              "$dst = max($src2, $src1)",
-              [(set (i64 DoubleRegs:$dst),
-                    (i64 (select (i1 (setlt (i64 DoubleRegs:$src2),
-                                            (i64 DoubleRegs:$src1))),
-                                 (i64 DoubleRegs:$src1),
-                                 (i64 DoubleRegs:$src2))))]>,
-Requires<[HasV3T]>;
-
-let AddedComplexity = 200 in
-def MINw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                    DoubleRegs:$src2),
-              "$dst = min($src2, $src1)",
-              [(set (i64 DoubleRegs:$dst),
-                    (i64 (select (i1 (setgt (i64 DoubleRegs:$src2),
-                                            (i64 DoubleRegs:$src1))),
-                                 (i64 DoubleRegs:$src1),
-                                 (i64 DoubleRegs:$src2))))]>,
-Requires<[HasV3T]>;
+let Defs = [USR_OVF], Itinerary = ALU64_tc_2_SLOT23 in
+def A2_addpsat : T_ALU64_arith<"add", 0b011, 0b101, 1, 0, 1>;
+
+class T_ALU64_addsp_hl<string suffix, bits<3> MinOp>
+  : T_ALU64_rr<"add", suffix, 0b0011, 0b011, MinOp, 0, 0, "">;
+
+def A2_addspl : T_ALU64_addsp_hl<":raw:lo", 0b110>;
+def A2_addsph : T_ALU64_addsp_hl<":raw:hi", 0b111>;
+
+let hasSideEffects = 0, isAsmParserOnly = 1 in
+def A2_addsp : ALU64_rr<(outs DoubleRegs:$Rd),
+  (ins IntRegs:$Rs, DoubleRegs:$Rt), "$Rd = add($Rs, $Rt)",
+  [(set (i64 DoubleRegs:$Rd), (i64 (add (i64 (sext (i32 IntRegs:$Rs))),
+                                        (i64 DoubleRegs:$Rt))))],
+  "", ALU64_tc_1_SLOT23>;
+
+
+let hasSideEffects = 0 in
+class T_XTYPE_MIN_MAX_P<bit isMax, bit isUnsigned>
+  : ALU64Inst<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rt, DoubleRegs:$Rs),
+  "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
+          #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+
+  let Inst{27-23} = 0b00111;
+  let Inst{22-21} = !if(isMax, 0b10, 0b01);
+  let Inst{20-16} = !if(isMax, Rt, Rs);
+  let Inst{12-8} = !if(isMax, Rs, Rt);
+  let Inst{7} = 0b1;
+  let Inst{6} = !if(isMax, 0b0, 0b1);
+  let Inst{5} = isUnsigned;
+  let Inst{4-0} = Rd;
+}
+
+def A2_minp  : T_XTYPE_MIN_MAX_P<0, 0>;
+def A2_minup : T_XTYPE_MIN_MAX_P<0, 1>;
+def A2_maxp  : T_XTYPE_MIN_MAX_P<1, 0>;
+def A2_maxup : T_XTYPE_MIN_MAX_P<1, 1>;
+
+multiclass MinMax_pats_p<PatFrag Op, InstHexagon Inst, InstHexagon SwapInst> {
+  defm: T_MinMax_pats<Op, DoubleRegs, i64, Inst, SwapInst>;
+}
+
+let AddedComplexity = 200 in {
+  defm: MinMax_pats_p<setge,  A2_maxp,  A2_minp>;
+  defm: MinMax_pats_p<setgt,  A2_maxp,  A2_minp>;
+  defm: MinMax_pats_p<setle,  A2_minp,  A2_maxp>;
+  defm: MinMax_pats_p<setlt,  A2_minp,  A2_maxp>;
+  defm: MinMax_pats_p<setuge, A2_maxup, A2_minup>;
+  defm: MinMax_pats_p<setugt, A2_maxup, A2_minup>;
+  defm: MinMax_pats_p<setule, A2_minup, A2_maxup>;
+  defm: MinMax_pats_p<setult, A2_minup, A2_maxup>;
+}
 
 //===----------------------------------------------------------------------===//
 // ALU64/ALU -
@@ -83,25 +155,112 @@ Requires<[HasV3T]>;
 
 
 //def : Pat <(brcond (i1 (seteq (i32 IntRegs:$src1), 0)), bb:$offset),
-//      (JMP_RegEzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+//      (JMP_RegEzt (i32 IntRegs:$src1), bb:$offset)>;
 
 //def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), 0)), bb:$offset),
-//      (JMP_RegNzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+//      (JMP_RegNzt (i32 IntRegs:$src1), bb:$offset)>;
 
 //def : Pat <(brcond (i1 (setle (i32 IntRegs:$src1), 0)), bb:$offset),
-//      (JMP_RegLezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+//      (JMP_RegLezt (i32 IntRegs:$src1), bb:$offset)>;
 
 //def : Pat <(brcond (i1 (setge (i32 IntRegs:$src1), 0)), bb:$offset),
-//      (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+//      (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>;
 
 //def : Pat <(brcond (i1 (setgt (i32 IntRegs:$src1), -1)), bb:$offset),
-//      (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
-
+//      (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>;
 
 // Map call instruction
-def : Pat<(call (i32 IntRegs:$dst)),
-      (CALLRv3 (i32 IntRegs:$dst))>, Requires<[HasV3T]>;
-def : Pat<(call tglobaladdr:$dst),
-      (CALLv3 tglobaladdr:$dst)>, Requires<[HasV3T]>;
-def : Pat<(call texternalsym:$dst),
-      (CALLv3 texternalsym:$dst)>, Requires<[HasV3T]>;
+def : Pat<(callv3 (i32 IntRegs:$dst)),
+      (J2_callr (i32 IntRegs:$dst))>;
+def : Pat<(callv3 tglobaladdr:$dst),
+      (J2_call tglobaladdr:$dst)>;
+def : Pat<(callv3 texternalsym:$dst),
+      (J2_call texternalsym:$dst)>;
+def : Pat<(callv3 tglobaltlsaddr:$dst),
+      (J2_call tglobaltlsaddr:$dst)>;
+
+def : Pat<(callv3nr (i32 IntRegs:$dst)),
+      (CALLRv3nr (i32 IntRegs:$dst))>;
+def : Pat<(callv3nr tglobaladdr:$dst),
+      (CALLv3nr tglobaladdr:$dst)>;
+def : Pat<(callv3nr texternalsym:$dst),
+      (CALLv3nr texternalsym:$dst)>;
+
+//===----------------------------------------------------------------------===//
+// :raw form of vrcmpys:hi/lo insns
+//===----------------------------------------------------------------------===//
+// Vector reduce complex multiply by scalar.
+let Defs = [USR_OVF], hasSideEffects = 0 in
+class T_vrcmpRaw<string HiLo, bits<3>MajOp>:
+  MInst<(outs DoubleRegs:$Rdd),
+         (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+         "$Rdd = vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, []> {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1000;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rss;
+    let Inst{12-8}  = Rtt;
+    let Inst{7-5}   = 0b100;
+    let Inst{4-0}   = Rdd;
+}
+
+def M2_vrcmpys_s1_h: T_vrcmpRaw<"hi", 0b101>;
+def M2_vrcmpys_s1_l: T_vrcmpRaw<"lo", 0b111>;
+
+// Assembler mapped to M2_vrcmpys_s1_h or M2_vrcmpys_s1_l
+let hasSideEffects = 0, isAsmParserOnly = 1 in
+def M2_vrcmpys_s1
+ : MInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
+ "$Rdd=vrcmpys($Rss,$Rt):<<1:sat">;
+
+// Vector reduce complex multiply by scalar with accumulation.
+let Defs = [USR_OVF], hasSideEffects = 0 in
+class T_vrcmpys_acc<string HiLo, bits<3>MajOp>:
+  MInst <(outs DoubleRegs:$Rxx),
+         (ins DoubleRegs:$_src_, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rxx += vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, [],
+  "$Rxx = $_src_"> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1010;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rss;
+    let Inst{12-8}  = Rtt;
+    let Inst{7-5}   = 0b100;
+    let Inst{4-0}   = Rxx;
+  }
+
+def M2_vrcmpys_acc_s1_h: T_vrcmpys_acc<"hi", 0b101>;
+def M2_vrcmpys_acc_s1_l: T_vrcmpys_acc<"lo", 0b111>;
+
+// Assembler mapped to M2_vrcmpys_acc_s1_h or M2_vrcmpys_acc_s1_l
+
+let isAsmParserOnly = 1 in
+def M2_vrcmpys_acc_s1
+  : MInst <(outs DoubleRegs:$dst),
+           (ins DoubleRegs:$dst2, DoubleRegs:$src1, IntRegs:$src2),
+           "$dst += vrcmpys($src1, $src2):<<1:sat", [],
+           "$dst2 = $dst">;
+
+def M2_vrcmpys_s1rp_h : T_MType_vrcmpy <"vrcmpys", 0b101, 0b110, 1>;
+def M2_vrcmpys_s1rp_l : T_MType_vrcmpy <"vrcmpys", 0b101, 0b111, 0>;
+
+// Assembler mapped to M2_vrcmpys_s1rp_h or M2_vrcmpys_s1rp_l
+let isAsmParserOnly = 1 in
+def M2_vrcmpys_s1rp
+  : MInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
+  "$Rd=vrcmpys($Rss,$Rt):<<1:rnd:sat">;
+
+
+// S2_cabacdecbin: Cabac decode bin.
+let Defs = [P0], isPredicateLate = 1, Itinerary = S_3op_tc_1_SLOT23 in
+def S2_cabacdecbin : T_S3op_64 < "decbin", 0b11, 0b110, 0>;
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index d39f7d7..0e4dde3 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -11,25 +11,34 @@
 //
 //===----------------------------------------------------------------------===//
 
-let neverHasSideEffects = 1 in
-class T_Immext<dag ins> :
-  EXTENDERInst<(outs), ins, "immext(#$imm)", []>,
-  Requires<[HasV4T]>;
-
-def IMMEXT_b : T_Immext<(ins brtarget:$imm)>;
-def IMMEXT_c : T_Immext<(ins calltarget:$imm)>;
-def IMMEXT_g : T_Immext<(ins globaladdress:$imm)>;
-def IMMEXT_i : T_Immext<(ins u26_6Imm:$imm)>;
-
-// Fold (add (CONST32 tglobaladdr:$addr) <offset>) into a global address.
-def FoldGlobalAddr : ComplexPattern<i32, 1, "foldGlobalAddress", [], []>;
+def addrga: PatLeaf<(i32 AddrGA:$Addr)>;
+def addrgp: PatLeaf<(i32 AddrGP:$Addr)>;
+
+let hasSideEffects = 0 in
+class T_Immext<Operand ImmType>
+  : EXTENDERInst<(outs), (ins ImmType:$imm),
+                 "immext(#$imm)", []> {
+    bits<32> imm;
+    let IClass = 0b0000;
+
+    let Inst{27-16} = imm{31-20};
+    let Inst{13-0} = imm{19-6};
+  }
 
-// Fold (add (CONST32_GP tglobaladdr:$addr) <offset>) into a global address.
-def FoldGlobalAddrGP : ComplexPattern<i32, 1, "foldGlobalAddressGP", [], []>;
+def A4_ext : T_Immext<u26_6Imm>;
+let isCodeGenOnly = 1 in {
+  let isBranch = 1 in
+    def A4_ext_b : T_Immext<brtarget>;
+  let isCall = 1 in
+    def A4_ext_c : T_Immext<calltarget>;
+  def A4_ext_g : T_Immext<globaladdress>;
+}
 
-def NumUsesBelowThresCONST32 : PatFrag<(ops node:$addr),
-                                       (HexagonCONST32 node:$addr), [{
-  return hasNumUsesBelowThresGA(N->getOperand(0).getNode());
+def BITPOS32 : SDNodeXForm<imm, [{
+   // Return the bit position we will set [0-31].
+   // As an SDNode.
+   int32_t imm = N->getSExtValue();
+   return XformMskToBitPosU5Imm(imm);
 }]>;
 
 // Hexagon V4 Architecture spec defines 8 instruction classes:
@@ -95,63 +104,158 @@ def NumUsesBelowThresCONST32 : PatFrag<(ops node:$addr),
 //===----------------------------------------------------------------------===//
 // ALU32 +
 //===----------------------------------------------------------------------===//
-// Generate frame index addresses.
-let neverHasSideEffects = 1, isReMaterializable = 1,
-isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT in
-def TFR_FI_immext_V4 : ALU32_ri<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, s32Imm:$offset),
-            "$dst = add($src1, ##$offset)",
-            []>,
-            Requires<[HasV4T]>;
-
-// Rd=cmp.eq(Rs,#s8)
-let validSubTargets = HasV4SubT, isExtendable = 1, opExtendable = 2,
-isExtentSigned = 1, opExtentBits = 8 in
-def V4_A4_rcmpeqi : ALU32_ri<(outs IntRegs:$Rd),
-                    (ins IntRegs:$Rs, s8Ext:$s8),
-                    "$Rd = cmp.eq($Rs, #$s8)",
-                    [(set (i32 IntRegs:$Rd),
-                          (i32 (zext (i1 (seteq (i32 IntRegs:$Rs),
-                                                s8ExtPred:$s8)))))]>,
-                    Requires<[HasV4T]>;
-
-// Preserve the TSTBIT generation
-def : Pat <(i32 (zext (i1 (setne (i32 (and (i32 (shl 1, (i32 IntRegs:$src2))),
-                                           (i32 IntRegs:$src1))), 0)))),
-      (i32 (MUX_ii (i1 (TSTBIT_rr (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
-                   1, 0))>;
-
-// Interfered with tstbit generation, above pattern preserves, see : tstbit.ll
-// Rd=cmp.ne(Rs,#s8)
-let validSubTargets = HasV4SubT, isExtendable = 1, opExtendable = 2,
-isExtentSigned = 1, opExtentBits = 8 in
-def V4_A4_rcmpneqi : ALU32_ri<(outs IntRegs:$Rd),
-                     (ins IntRegs:$Rs, s8Ext:$s8),
-                     "$Rd = !cmp.eq($Rs, #$s8)",
-                     [(set (i32 IntRegs:$Rd),
-                           (i32 (zext (i1 (setne (i32 IntRegs:$Rs),
-                                                 s8ExtPred:$s8)))))]>,
-                     Requires<[HasV4T]>;
-
-// Rd=cmp.eq(Rs,Rt)
-let validSubTargets = HasV4SubT in
-def V4_A4_rcmpeq : ALU32_ri<(outs IntRegs:$Rd),
-                   (ins IntRegs:$Rs, IntRegs:$Rt),
-                   "$Rd = cmp.eq($Rs, $Rt)",
-                   [(set (i32 IntRegs:$Rd),
-                         (i32 (zext (i1 (seteq (i32 IntRegs:$Rs),
-                                               IntRegs:$Rt)))))]>,
-                   Requires<[HasV4T]>;
-
-// Rd=cmp.ne(Rs,Rt)
-let validSubTargets = HasV4SubT in
-def V4_A4_rcmpneq : ALU32_ri<(outs IntRegs:$Rd),
-                    (ins IntRegs:$Rs, IntRegs:$Rt),
-                    "$Rd = !cmp.eq($Rs, $Rt)",
-                    [(set (i32 IntRegs:$Rd),
-                          (i32 (zext (i1 (setne (i32 IntRegs:$Rs),
-                                               IntRegs:$Rt)))))]>,
-                    Requires<[HasV4T]>;
+
+class T_ALU32_3op_not<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                      bit OpsRev>
+  : T_ALU32_3op<mnemonic, MajOp, MinOp, OpsRev, 0> {
+  let AsmString = "$Rd = "#mnemonic#"($Rs, ~$Rt)";
+}
+
+let BaseOpcode = "andn_rr", CextOpcode = "andn" in
+def A4_andn    : T_ALU32_3op_not<"and", 0b001, 0b100, 1>;
+let BaseOpcode = "orn_rr", CextOpcode = "orn" in
+def A4_orn     : T_ALU32_3op_not<"or",  0b001, 0b101, 1>;
+
+let CextOpcode = "rcmp.eq" in
+def A4_rcmpeq  : T_ALU32_3op<"cmp.eq",  0b011, 0b010, 0, 1>;
+let CextOpcode = "!rcmp.eq" in
+def A4_rcmpneq : T_ALU32_3op<"!cmp.eq", 0b011, 0b011, 0, 1>;
+
+def C4_cmpneq  : T_ALU32_3op_cmp<"!cmp.eq",  0b00, 1, 1>;
+def C4_cmplte  : T_ALU32_3op_cmp<"!cmp.gt",  0b10, 1, 0>;
+def C4_cmplteu : T_ALU32_3op_cmp<"!cmp.gtu", 0b11, 1, 0>;
+
+// Pats for instruction selection.
+
+// A class to embed the usual comparison patfrags within a zext to i32.
+// The seteq/setne frags use "lhs" and "rhs" as operands, so use the same
+// names, or else the frag's "body" won't match the operands.
+class CmpInReg<PatFrag Op>
+  : PatFrag<(ops node:$lhs, node:$rhs),(i32 (zext (i1 Op.Fragment)))>;
+
+def: T_cmp32_rr_pat<A4_rcmpeq,  CmpInReg<seteq>, i32>;
+def: T_cmp32_rr_pat<A4_rcmpneq, CmpInReg<setne>, i32>;
+
+def: T_cmp32_rr_pat<C4_cmpneq,  setne,  i1>;
+
+class T_CMP_rrbh<string mnemonic, bits<3> MinOp, bit IsComm>
+  : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+    "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", S_3op_tc_2early_SLOT23>,
+    ImmRegRel {
+  let InputType = "reg";
+  let CextOpcode = mnemonic;
+  let isCompare = 1;
+  let isCommutable = IsComm;
+  let hasSideEffects = 0;
+
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1100;
+  let Inst{27-21} = 0b0111110;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{7-5} = MinOp;
+  let Inst{1-0} = Pd;
+}
+
+def A4_cmpbeq  : T_CMP_rrbh<"cmpb.eq",  0b110, 1>;
+def A4_cmpbgt  : T_CMP_rrbh<"cmpb.gt",  0b010, 0>;
+def A4_cmpbgtu : T_CMP_rrbh<"cmpb.gtu", 0b111, 0>;
+def A4_cmpheq  : T_CMP_rrbh<"cmph.eq",  0b011, 1>;
+def A4_cmphgt  : T_CMP_rrbh<"cmph.gt",  0b100, 0>;
+def A4_cmphgtu : T_CMP_rrbh<"cmph.gtu", 0b101, 0>;
+
+let AddedComplexity = 100 in {
+  def: Pat<(i1 (seteq (and (xor (i32 IntRegs:$Rs), (i32 IntRegs:$Rt)),
+                       255), 0)),
+           (A4_cmpbeq IntRegs:$Rs, IntRegs:$Rt)>;
+  def: Pat<(i1 (setne (and (xor (i32 IntRegs:$Rs), (i32 IntRegs:$Rt)),
+                       255), 0)),
+           (C2_not (A4_cmpbeq IntRegs:$Rs, IntRegs:$Rt))>;
+  def: Pat<(i1 (seteq (and (xor (i32 IntRegs:$Rs), (i32 IntRegs:$Rt)),
+                           65535), 0)),
+           (A4_cmpheq IntRegs:$Rs, IntRegs:$Rt)>;
+  def: Pat<(i1 (setne (and (xor (i32 IntRegs:$Rs), (i32 IntRegs:$Rt)),
+                           65535), 0)),
+           (C2_not (A4_cmpheq IntRegs:$Rs, IntRegs:$Rt))>;
+}
+
+class T_CMP_ribh<string mnemonic, bits<2> MajOp, bit IsHalf, bit IsComm,
+                 Operand ImmType, bit IsImmExt, bit IsImmSigned, int ImmBits>
+  : ALU64Inst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, ImmType:$Imm),
+    "$Pd = "#mnemonic#"($Rs, #$Imm)", [], "", ALU64_tc_2early_SLOT23>,
+    ImmRegRel {
+  let InputType = "imm";
+  let CextOpcode = mnemonic;
+  let isCompare = 1;
+  let isCommutable = IsComm;
+  let hasSideEffects = 0;
+  let isExtendable = IsImmExt;
+  let opExtendable = !if (IsImmExt, 2, 0);
+  let isExtentSigned = IsImmSigned;
+  let opExtentBits = ImmBits;
+
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<8> Imm;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b1101;
+  let Inst{22-21} = MajOp;
+  let Inst{20-16} = Rs;
+  let Inst{12-5} = Imm;
+  let Inst{4} = 0b0;
+  let Inst{3} = IsHalf;
+  let Inst{1-0} = Pd;
+}
+
+def A4_cmpbeqi  : T_CMP_ribh<"cmpb.eq",  0b00, 0, 1, u8Imm, 0, 0, 8>;
+def A4_cmpbgti  : T_CMP_ribh<"cmpb.gt",  0b01, 0, 0, s8Imm, 0, 1, 8>;
+def A4_cmpbgtui : T_CMP_ribh<"cmpb.gtu", 0b10, 0, 0, u7Ext, 1, 0, 7>;
+def A4_cmpheqi  : T_CMP_ribh<"cmph.eq",  0b00, 1, 1, s8Ext, 1, 1, 8>;
+def A4_cmphgti  : T_CMP_ribh<"cmph.gt",  0b01, 1, 0, s8Ext, 1, 1, 8>;
+def A4_cmphgtui : T_CMP_ribh<"cmph.gtu", 0b10, 1, 0, u7Ext, 1, 0, 7>;
+
+class T_RCMP_EQ_ri<string mnemonic, bit IsNeg>
+  : ALU32_ri<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s8Ext:$s8),
+    "$Rd = "#mnemonic#"($Rs, #$s8)", [], "", ALU32_2op_tc_1_SLOT0123>,
+    ImmRegRel {
+  let InputType = "imm";
+  let CextOpcode = !if (IsNeg, "!rcmp.eq", "rcmp.eq");
+  let isExtendable = 1;
+  let opExtendable = 2;
+  let isExtentSigned = 1;
+  let opExtentBits = 8;
+  let hasNewValue = 1;
+
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<8> s8;
+
+  let IClass = 0b0111;
+  let Inst{27-24} = 0b0011;
+  let Inst{22} = 0b1;
+  let Inst{21} = IsNeg;
+  let Inst{20-16} = Rs;
+  let Inst{13} = 0b1;
+  let Inst{12-5} = s8;
+  let Inst{4-0} = Rd;
+}
+
+def A4_rcmpeqi  : T_RCMP_EQ_ri<"cmp.eq",  0>;
+def A4_rcmpneqi : T_RCMP_EQ_ri<"!cmp.eq", 1>;
+
+def: Pat<(i32 (zext (i1 (seteq (i32 IntRegs:$Rs), s8ExtPred:$s8)))),
+         (A4_rcmpeqi IntRegs:$Rs, s8ExtPred:$s8)>;
+def: Pat<(i32 (zext (i1 (setne (i32 IntRegs:$Rs), s8ExtPred:$s8)))),
+         (A4_rcmpneqi IntRegs:$Rs, s8ExtPred:$s8)>;
+
+// Preserve the S2_tstbit_r generation
+def: Pat<(i32 (zext (i1 (setne (i32 (and (i32 (shl 1, (i32 IntRegs:$src2))),
+                                         (i32 IntRegs:$src1))), 0)))),
+         (C2_muxii (S2_tstbit_r IntRegs:$src1, IntRegs:$src2), 1, 0)>;
 
 //===----------------------------------------------------------------------===//
 // ALU32 -
@@ -162,24 +266,31 @@ def V4_A4_rcmpneq : ALU32_ri<(outs IntRegs:$Rd),
 // ALU32/PERM +
 //===----------------------------------------------------------------------===//
 
-// Combine
-// Rdd=combine(Rs, #s8)
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8,
-    neverHasSideEffects = 1, validSubTargets = HasV4SubT in
-def COMBINE_rI_V4 : ALU32_ri<(outs DoubleRegs:$dst),
-            (ins IntRegs:$src1, s8Ext:$src2),
-            "$dst = combine($src1, #$src2)",
-            []>,
-            Requires<[HasV4T]>;
-
-// Rdd=combine(#s8, Rs)
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 8,
-    neverHasSideEffects = 1, validSubTargets = HasV4SubT in
-def COMBINE_Ir_V4 : ALU32_ir<(outs DoubleRegs:$dst),
-            (ins s8Ext:$src1, IntRegs:$src2),
-            "$dst = combine(#$src1, $src2)",
-            []>,
-            Requires<[HasV4T]>;
+// Combine a word and an immediate into a register pair.
+let hasSideEffects = 0, isExtentSigned = 1, isExtendable = 1,
+    opExtentBits = 8 in
+class T_Combine1 <bits<2> MajOp, dag ins, string AsmStr>
+  : ALU32Inst <(outs DoubleRegs:$Rdd), ins, AsmStr> {
+    bits<5> Rdd;
+    bits<5> Rs;
+    bits<8> s8;
+
+    let IClass      = 0b0111;
+    let Inst{27-24} = 0b0011;
+    let Inst{22-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = 0b1;
+    let Inst{12-5}  = s8;
+    let Inst{4-0}   = Rdd;
+  }
+
+let opExtendable = 2 in
+def A4_combineri : T_Combine1<0b00, (ins IntRegs:$Rs, s8Ext:$s8),
+                                    "$Rdd = combine($Rs, #$s8)">;
+
+let opExtendable = 1 in
+def A4_combineir : T_Combine1<0b01, (ins s8Ext:$s8, IntRegs:$Rs),
+                                    "$Rdd = combine(#$s8, $Rs)">;
 
 def HexagonWrapperCombineRI_V4 :
   SDNode<"HexagonISD::WrapperCombineRI_V4", SDTHexagonI64I32I32>;
@@ -187,274 +298,355 @@ def HexagonWrapperCombineIR_V4 :
   SDNode<"HexagonISD::WrapperCombineIR_V4", SDTHexagonI64I32I32>;
 
 def : Pat <(HexagonWrapperCombineRI_V4 IntRegs:$r, s8ExtPred:$i),
-           (COMBINE_rI_V4 IntRegs:$r, s8ExtPred:$i)>,
-          Requires<[HasV4T]>;
+           (A4_combineri IntRegs:$r, s8ExtPred:$i)>;
 
 def : Pat <(HexagonWrapperCombineIR_V4 s8ExtPred:$i, IntRegs:$r),
-           (COMBINE_Ir_V4 s8ExtPred:$i, IntRegs:$r)>,
-          Requires<[HasV4T]>;
+           (A4_combineir s8ExtPred:$i, IntRegs:$r)>;
 
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 6,
-    neverHasSideEffects = 1, validSubTargets = HasV4SubT in
-def COMBINE_iI_V4 : ALU32_ii<(outs DoubleRegs:$dst),
-            (ins s8Imm:$src1, u6Ext:$src2),
-            "$dst = combine(#$src1, #$src2)",
-            []>,
-            Requires<[HasV4T]>;
+// A4_combineii: Set two small immediates.
+let hasSideEffects = 0, isExtendable = 1, opExtentBits = 6, opExtendable = 2 in
+def A4_combineii: ALU32Inst<(outs DoubleRegs:$Rdd), (ins s8Imm:$s8, u6Ext:$U6),
+  "$Rdd = combine(#$s8, #$U6)"> {
+    bits<5> Rdd;
+    bits<8> s8;
+    bits<6> U6;
+
+    let IClass = 0b0111;
+    let Inst{27-23} = 0b11001;
+    let Inst{20-16} = U6{5-1};
+    let Inst{13}    = U6{0};
+    let Inst{12-5}  = s8;
+    let Inst{4-0}   = Rdd;
+  }
+
+// The complexity of the combine with two immediates should be greater than
+// the complexity of a combine involving a register.
+let AddedComplexity = 75 in
+def: Pat<(HexagonCOMBINE s8ImmPred:$s8, u6ExtPred:$u6),
+         (A4_combineii imm:$s8, imm:$u6)>;
 
 //===----------------------------------------------------------------------===//
-// ALU32/PERM +
+// ALU32/PERM -
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
 // LD +
 //===----------------------------------------------------------------------===//
+
+def Zext64: OutPatFrag<(ops node:$Rs),
+  (i64 (A4_combineir 0, (i32 $Rs)))>;
+def Sext64: OutPatFrag<(ops node:$Rs),
+  (i64 (A2_sxtw (i32 $Rs)))>;
+
+// Patterns to generate indexed loads with different forms of the address:
+// - frameindex,
+// - base + offset,
+// - base (without offset).
+multiclass Loadxm_pat<PatFrag Load, ValueType VT, PatFrag ValueMod,
+                      PatLeaf ImmPred, InstHexagon MI> {
+  def: Pat<(VT (Load AddrFI:$fi)),
+           (VT (ValueMod (MI AddrFI:$fi, 0)))>;
+  def: Pat<(VT (Load (add IntRegs:$Rs, ImmPred:$Off))),
+           (VT (ValueMod (MI IntRegs:$Rs, imm:$Off)))>;
+  def: Pat<(VT (Load (i32 IntRegs:$Rs))),
+           (VT (ValueMod (MI IntRegs:$Rs, 0)))>;
+}
+
+defm: Loadxm_pat<extloadi1,   i64, Zext64, s11_0ExtPred, L2_loadrub_io>;
+defm: Loadxm_pat<extloadi8,   i64, Zext64, s11_0ExtPred, L2_loadrub_io>;
+defm: Loadxm_pat<extloadi16,  i64, Zext64, s11_1ExtPred, L2_loadruh_io>;
+defm: Loadxm_pat<zextloadi1,  i64, Zext64, s11_0ExtPred, L2_loadrub_io>;
+defm: Loadxm_pat<zextloadi8,  i64, Zext64, s11_0ExtPred, L2_loadrub_io>;
+defm: Loadxm_pat<zextloadi16, i64, Zext64, s11_1ExtPred, L2_loadruh_io>;
+defm: Loadxm_pat<sextloadi8,  i64, Sext64, s11_0ExtPred, L2_loadrb_io>;
+defm: Loadxm_pat<sextloadi16, i64, Sext64, s11_1ExtPred, L2_loadrh_io>;
+
+// Map Rdd = anyext(Rs) -> Rdd = combine(#0, Rs).
+def: Pat<(i64 (anyext (i32 IntRegs:$src1))), (Zext64 IntRegs:$src1)>;
+
 //===----------------------------------------------------------------------===//
 // Template class for load instructions with Absolute set addressing mode.
 //===----------------------------------------------------------------------===//
-let isExtended = 1, opExtendable = 2, neverHasSideEffects = 1,
-validSubTargets = HasV4SubT, addrMode = AbsoluteSet in
-class T_LD_abs_set<string mnemonic, RegisterClass RC>:
-            LDInst2<(outs RC:$dst1, IntRegs:$dst2),
-            (ins u0AlwaysExt:$addr),
-            "$dst1 = "#mnemonic#"($dst2=##$addr)",
-            []>,
-            Requires<[HasV4T]>;
+let isExtended = 1, opExtendable = 2, opExtentBits = 6, addrMode = AbsoluteSet,
+    hasSideEffects = 0 in
+class T_LD_abs_set<string mnemonic, RegisterClass RC, bits<4>MajOp>:
+            LDInst<(outs RC:$dst1, IntRegs:$dst2),
+            (ins u6Ext:$addr),
+            "$dst1 = "#mnemonic#"($dst2 = #$addr)",
+            []> {
+  bits<7> name;
+  bits<5> dst1;
+  bits<5> dst2;
+  bits<6> addr;
+
+  let IClass = 0b1001;
+  let Inst{27-25} = 0b101;
+  let Inst{24-21} = MajOp;
+  let Inst{13-12} = 0b01;
+  let Inst{4-0}   = dst1;
+  let Inst{20-16} = dst2;
+  let Inst{11-8}  = addr{5-2};
+  let Inst{6-5}   = addr{1-0};
+}
+
+let accessSize = ByteAccess, hasNewValue = 1 in {
+  def L4_loadrb_ap   : T_LD_abs_set <"memb",   IntRegs, 0b1000>;
+  def L4_loadrub_ap  : T_LD_abs_set <"memub",  IntRegs, 0b1001>;
+}
 
-def LDrid_abs_set_V4  : T_LD_abs_set <"memd", DoubleRegs>;
-def LDrib_abs_set_V4  : T_LD_abs_set <"memb", IntRegs>;
-def LDriub_abs_set_V4 : T_LD_abs_set <"memub", IntRegs>;
-def LDrih_abs_set_V4  : T_LD_abs_set <"memh", IntRegs>;
-def LDriw_abs_set_V4  : T_LD_abs_set <"memw", IntRegs>;
-def LDriuh_abs_set_V4 : T_LD_abs_set <"memuh", IntRegs>;
+let accessSize = HalfWordAccess, hasNewValue = 1 in {
+  def L4_loadrh_ap  : T_LD_abs_set <"memh",  IntRegs, 0b1010>;
+  def L4_loadruh_ap : T_LD_abs_set <"memuh", IntRegs, 0b1011>;
+  def L4_loadbsw2_ap : T_LD_abs_set <"membh",  IntRegs, 0b0001>;
+  def L4_loadbzw2_ap : T_LD_abs_set <"memubh", IntRegs, 0b0011>;
+}
 
+let accessSize = WordAccess, hasNewValue = 1 in
+  def L4_loadri_ap : T_LD_abs_set <"memw", IntRegs, 0b1100>;
 
-// multiclass for load instructions with base + register offset
-// addressing mode
-multiclass ld_idxd_shl_pbase<string mnemonic, RegisterClass RC, bit isNot,
-                             bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : LDInst2<(outs RC:$dst),
-            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#"$dst = "#mnemonic#"($src2+$src3<<#$offset)",
-            []>, Requires<[HasV4T]>;
+let accessSize = WordAccess in {
+  def L4_loadbzw4_ap : T_LD_abs_set <"memubh", DoubleRegs, 0b0101>;
+  def L4_loadbsw4_ap : T_LD_abs_set <"membh",  DoubleRegs, 0b0111>;
 }
 
-multiclass ld_idxd_shl_pred<string mnemonic, RegisterClass RC, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ld_idxd_shl_pbase<mnemonic, RC, PredNot, 0>;
-    // Predicate new
-    defm _cdn#NAME : ld_idxd_shl_pbase<mnemonic, RC, PredNot, 1>;
+let accessSize = DoubleWordAccess in
+def L4_loadrd_ap : T_LD_abs_set <"memd", DoubleRegs, 0b1110>;
+
+let accessSize = ByteAccess in
+  def L4_loadalignb_ap : T_LD_abs_set <"memb_fifo", DoubleRegs, 0b0100>;
+
+let accessSize = HalfWordAccess in
+def L4_loadalignh_ap : T_LD_abs_set <"memh_fifo", DoubleRegs, 0b0010>;
+
+// Load - Indirect with long offset
+let InputType = "imm", addrMode = BaseLongOffset, isExtended = 1,
+opExtentBits = 6, opExtendable = 3 in
+class T_LoadAbsReg <string mnemonic, string CextOp, RegisterClass RC,
+                    bits<4> MajOp>
+  : LDInst <(outs RC:$dst), (ins IntRegs:$src1, u2Imm:$src2, u6Ext:$src3),
+  "$dst = "#mnemonic#"($src1<<#$src2 + #$src3)",
+  [] >, ImmRegShl {
+    bits<5> dst;
+    bits<5> src1;
+    bits<2> src2;
+    bits<6> src3;
+    let CextOpcode = CextOp;
+    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+
+    let IClass = 0b1001;
+    let Inst{27-25} = 0b110;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13}    = src2{1};
+    let Inst{12}    = 0b1;
+    let Inst{11-8}  = src3{5-2};
+    let Inst{7}     = src2{0};
+    let Inst{6-5}   = src3{1-0};
+    let Inst{4-0}   = dst;
   }
+
+let accessSize = ByteAccess in {
+  def L4_loadrb_ur  : T_LoadAbsReg<"memb",  "LDrib", IntRegs, 0b1000>;
+  def L4_loadrub_ur : T_LoadAbsReg<"memub", "LDriub", IntRegs, 0b1001>;
+  def L4_loadalignb_ur : T_LoadAbsReg<"memb_fifo", "LDrib_fifo",
+                                      DoubleRegs, 0b0100>;
 }
 
-let neverHasSideEffects  = 1 in
-multiclass ld_idxd_shl<string mnemonic, string CextOp, RegisterClass RC> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
-    let isPredicable = 1 in
-    def NAME#_V4 : LDInst2<(outs RC:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset),
-            "$dst = "#mnemonic#"($src1+$src2<<#$offset)",
-            []>, Requires<[HasV4T]>;
-
-    let isPredicated = 1 in {
-      defm Pt_V4 : ld_idxd_shl_pred<mnemonic, RC, 0 >;
-      defm NotPt_V4 : ld_idxd_shl_pred<mnemonic, RC, 1>;
-    }
-  }
+let accessSize = HalfWordAccess in {
+  def L4_loadrh_ur   : T_LoadAbsReg<"memh",   "LDrih",    IntRegs, 0b1010>;
+  def L4_loadruh_ur  : T_LoadAbsReg<"memuh",  "LDriuh",   IntRegs, 0b1011>;
+  def L4_loadbsw2_ur : T_LoadAbsReg<"membh",  "LDribh2",  IntRegs, 0b0001>;
+  def L4_loadbzw2_ur : T_LoadAbsReg<"memubh", "LDriubh2", IntRegs, 0b0011>;
+  def L4_loadalignh_ur : T_LoadAbsReg<"memh_fifo", "LDrih_fifo",
+                                      DoubleRegs, 0b0010>;
 }
 
-let addrMode = BaseRegOffset in {
-  let accessSize = ByteAccess in {
-    defm LDrib_indexed_shl: ld_idxd_shl<"memb", "LDrib", IntRegs>,
-                                        AddrModeRel;
-    defm LDriub_indexed_shl: ld_idxd_shl<"memub", "LDriub", IntRegs>,
-                                        AddrModeRel;
-  }
-  let accessSize = HalfWordAccess in {
-    defm LDrih_indexed_shl: ld_idxd_shl<"memh", "LDrih", IntRegs>, AddrModeRel;
-    defm LDriuh_indexed_shl: ld_idxd_shl<"memuh", "LDriuh", IntRegs>,
-                             AddrModeRel;
-  }
-  let accessSize = WordAccess in
-     defm LDriw_indexed_shl: ld_idxd_shl<"memw", "LDriw", IntRegs>, AddrModeRel;
+let accessSize = WordAccess in {
+  def L4_loadri_ur   : T_LoadAbsReg<"memw", "LDriw", IntRegs, 0b1100>;
+  def L4_loadbsw4_ur : T_LoadAbsReg<"membh", "LDribh4", DoubleRegs, 0b0111>;
+  def L4_loadbzw4_ur : T_LoadAbsReg<"memubh", "LDriubh4", DoubleRegs, 0b0101>;
+}
+
+let accessSize = DoubleWordAccess in
+def L4_loadrd_ur  : T_LoadAbsReg<"memd", "LDrid", DoubleRegs, 0b1110>;
+
 
-  let accessSize = DoubleWordAccess in
-    defm LDrid_indexed_shl: ld_idxd_shl<"memd", "LDrid", DoubleRegs>,
-                             AddrModeRel;
+multiclass T_LoadAbsReg_Pat <PatFrag ldOp, InstHexagon MI, ValueType VT = i32> {
+  def  : Pat <(VT (ldOp (add (shl IntRegs:$src1, u2ImmPred:$src2),
+                             (HexagonCONST32 tglobaladdr:$src3)))),
+              (MI IntRegs:$src1, u2ImmPred:$src2, tglobaladdr:$src3)>;
+
+  def  : Pat <(VT (ldOp (add IntRegs:$src1,
+                             (HexagonCONST32 tglobaladdr:$src2)))),
+              (MI IntRegs:$src1, 0, tglobaladdr:$src2)>;
 }
 
-// 'def pats' for load instructions with base + register offset and non-zero
-// immediate value. Immediate value is used to left-shift the second
-// register operand.
-let AddedComplexity = 40 in {
-def : Pat <(i32 (sextloadi8 (add IntRegs:$src1,
-                                 (shl IntRegs:$src2, u2ImmPred:$offset)))),
-           (LDrib_indexed_shl_V4 IntRegs:$src1,
-            IntRegs:$src2, u2ImmPred:$offset)>,
-            Requires<[HasV4T]>;
-
-def : Pat <(i32 (zextloadi8 (add IntRegs:$src1,
-                                 (shl IntRegs:$src2, u2ImmPred:$offset)))),
-           (LDriub_indexed_shl_V4 IntRegs:$src1,
-            IntRegs:$src2, u2ImmPred:$offset)>,
-            Requires<[HasV4T]>;
-
-def : Pat <(i32 (extloadi8 (add IntRegs:$src1,
-                                (shl IntRegs:$src2, u2ImmPred:$offset)))),
-           (LDriub_indexed_shl_V4 IntRegs:$src1,
-            IntRegs:$src2, u2ImmPred:$offset)>,
-            Requires<[HasV4T]>;
-
-def : Pat <(i32 (sextloadi16 (add IntRegs:$src1,
-                                  (shl IntRegs:$src2, u2ImmPred:$offset)))),
-           (LDrih_indexed_shl_V4 IntRegs:$src1,
-            IntRegs:$src2, u2ImmPred:$offset)>,
-            Requires<[HasV4T]>;
-
-def : Pat <(i32 (zextloadi16 (add IntRegs:$src1,
-                                  (shl IntRegs:$src2, u2ImmPred:$offset)))),
-           (LDriuh_indexed_shl_V4 IntRegs:$src1,
-            IntRegs:$src2, u2ImmPred:$offset)>,
-            Requires<[HasV4T]>;
-
-def : Pat <(i32 (extloadi16 (add IntRegs:$src1,
-                                 (shl IntRegs:$src2, u2ImmPred:$offset)))),
-           (LDriuh_indexed_shl_V4 IntRegs:$src1,
-            IntRegs:$src2, u2ImmPred:$offset)>,
-            Requires<[HasV4T]>;
-
-def : Pat <(i32 (load (add IntRegs:$src1,
-                           (shl IntRegs:$src2, u2ImmPred:$offset)))),
-           (LDriw_indexed_shl_V4 IntRegs:$src1,
-            IntRegs:$src2, u2ImmPred:$offset)>,
-            Requires<[HasV4T]>;
-
-def : Pat <(i64 (load (add IntRegs:$src1,
-                           (shl IntRegs:$src2, u2ImmPred:$offset)))),
-           (LDrid_indexed_shl_V4 IntRegs:$src1,
-            IntRegs:$src2, u2ImmPred:$offset)>,
-            Requires<[HasV4T]>;
+let AddedComplexity  = 60 in {
+defm : T_LoadAbsReg_Pat <sextloadi8, L4_loadrb_ur>;
+defm : T_LoadAbsReg_Pat <zextloadi8, L4_loadrub_ur>;
+defm : T_LoadAbsReg_Pat <extloadi8,  L4_loadrub_ur>;
+
+defm : T_LoadAbsReg_Pat <sextloadi16, L4_loadrh_ur>;
+defm : T_LoadAbsReg_Pat <zextloadi16, L4_loadruh_ur>;
+defm : T_LoadAbsReg_Pat <extloadi16,  L4_loadruh_ur>;
+
+defm : T_LoadAbsReg_Pat <load, L4_loadri_ur>;
+defm : T_LoadAbsReg_Pat <load, L4_loadrd_ur, i64>;
 }
 
+//===----------------------------------------------------------------------===//
+// Template classes for the non-predicated load instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+class T_load_rr <string mnemonic, RegisterClass RC, bits<3> MajOp>:
+   LDInst<(outs RC:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$u2),
+  "$dst = "#mnemonic#"($src1 + $src2<<#$u2)",
+  [], "", V4LDST_tc_ld_SLOT01>, ImmRegShl, AddrModeRel {
+    bits<5> dst;
+    bits<5> src1;
+    bits<5> src2;
+    bits<2> u2;
 
-// 'def pats' for load instruction base + register offset and
-// zero immediate value.
-let AddedComplexity = 10 in {
-def : Pat <(i64 (load (add IntRegs:$src1, IntRegs:$src2))),
-           (LDrid_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
-            Requires<[HasV4T]>;
+    let IClass = 0b0011;
 
-def : Pat <(i32 (sextloadi8 (add IntRegs:$src1, IntRegs:$src2))),
-           (LDrib_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
-            Requires<[HasV4T]>;
+    let Inst{27-24} = 0b1010;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{12-8}  = src2;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{4-0}   = dst;
+  }
 
-def : Pat <(i32 (zextloadi8 (add IntRegs:$src1, IntRegs:$src2))),
-           (LDriub_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
-            Requires<[HasV4T]>;
+//===----------------------------------------------------------------------===//
+// Template classes for the predicated load instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+let isPredicated =  1 in
+class T_pload_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
+                  bit isNot, bit isPredNew>:
+   LDInst <(outs RC:$dst),
+           (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$u2),
+  !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#"$dst = "#mnemonic#"($src2+$src3<<#$u2)",
+  [], "", V4LDST_tc_ld_SLOT01>, AddrModeRel {
+    bits<5> dst;
+    bits<2> src1;
+    bits<5> src2;
+    bits<5> src3;
+    bits<2> u2;
+
+    let isPredicatedFalse = isNot;
+    let isPredicatedNew = isPredNew;
 
-def : Pat <(i32 (extloadi8 (add IntRegs:$src1, IntRegs:$src2))),
-           (LDriub_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
-            Requires<[HasV4T]>;
+    let IClass = 0b0011;
 
-def : Pat <(i32 (sextloadi16 (add IntRegs:$src1, IntRegs:$src2))),
-           (LDrih_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
-            Requires<[HasV4T]>;
+    let Inst{27-26} = 0b00;
+    let Inst{25}    = isPredNew;
+    let Inst{24}    = isNot;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{12-8}  = src3;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{6-5}   = src1;
+    let Inst{4-0}   = dst;
+  }
 
-def : Pat <(i32 (zextloadi16 (add IntRegs:$src1, IntRegs:$src2))),
-           (LDriuh_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
-            Requires<[HasV4T]>;
+//===----------------------------------------------------------------------===//
+// multiclass for load instructions with base + register offset
+// addressing mode
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, addrMode = BaseRegOffset in
+multiclass ld_idxd_shl <string mnemonic, string CextOp, RegisterClass RC,
+                        bits<3> MajOp > {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl,
+      InputType = "reg" in {
+    let isPredicable = 1 in
+    def L4_#NAME#_rr : T_load_rr <mnemonic, RC, MajOp>;
 
-def : Pat <(i32 (extloadi16 (add IntRegs:$src1, IntRegs:$src2))),
-           (LDriuh_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
-            Requires<[HasV4T]>;
+    // Predicated
+    def L4_p#NAME#t_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 0>;
+    def L4_p#NAME#f_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 0>;
 
-def : Pat <(i32 (load (add IntRegs:$src1, IntRegs:$src2))),
-           (LDriw_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
-            Requires<[HasV4T]>;
+    // Predicated new
+    def L4_p#NAME#tnew_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 1>;
+    def L4_p#NAME#fnew_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 1>;
+  }
 }
 
-// zext i1->i64
-def : Pat <(i64 (zext (i1 PredRegs:$src1))),
-      (i64 (COMBINE_Ir_V4 0, (MUX_ii (i1 PredRegs:$src1), 1, 0)))>,
-      Requires<[HasV4T]>;
+let hasNewValue = 1, accessSize = ByteAccess in {
+  defm loadrb  : ld_idxd_shl<"memb", "LDrib", IntRegs, 0b000>;
+  defm loadrub : ld_idxd_shl<"memub", "LDriub", IntRegs, 0b001>;
+}
 
-// zext i32->i64
-def : Pat <(i64 (zext (i32 IntRegs:$src1))),
-      (i64 (COMBINE_Ir_V4 0, (i32 IntRegs:$src1)))>,
-      Requires<[HasV4T]>;
-// zext i8->i64
-def:  Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)),
-      (i64 (COMBINE_Ir_V4 0, (LDriub ADDRriS11_0:$src1)))>,
-      Requires<[HasV4T]>;
-
-let AddedComplexity = 20 in
-def:  Pat <(i64 (zextloadi8 (add (i32 IntRegs:$src1),
-                                s11_0ExtPred:$offset))),
-      (i64 (COMBINE_Ir_V4 0, (LDriub_indexed IntRegs:$src1,
-                                  s11_0ExtPred:$offset)))>,
-      Requires<[HasV4T]>;
+let hasNewValue = 1, accessSize = HalfWordAccess in {
+  defm loadrh  : ld_idxd_shl<"memh", "LDrih", IntRegs, 0b010>;
+  defm loadruh : ld_idxd_shl<"memuh", "LDriuh", IntRegs, 0b011>;
+}
+
+let hasNewValue = 1, accessSize = WordAccess in
+defm loadri : ld_idxd_shl<"memw", "LDriw", IntRegs, 0b100>;
+
+let accessSize = DoubleWordAccess in
+defm loadrd  : ld_idxd_shl<"memd", "LDrid", DoubleRegs, 0b110>;
+
+// 'def pats' for load instructions with base + register offset and non-zero
+// immediate value. Immediate value is used to left-shift the second
+// register operand.
+class Loadxs_pat<PatFrag Load, ValueType VT, InstHexagon MI>
+  : Pat<(VT (Load (add (i32 IntRegs:$Rs),
+                       (i32 (shl (i32 IntRegs:$Rt), u2ImmPred:$u2))))),
+        (VT (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2))>;
+
+let AddedComplexity = 40 in {
+  def: Loadxs_pat<extloadi8,   i32, L4_loadrub_rr>;
+  def: Loadxs_pat<zextloadi8,  i32, L4_loadrub_rr>;
+  def: Loadxs_pat<sextloadi8,  i32, L4_loadrb_rr>;
+  def: Loadxs_pat<extloadi16,  i32, L4_loadruh_rr>;
+  def: Loadxs_pat<zextloadi16, i32, L4_loadruh_rr>;
+  def: Loadxs_pat<sextloadi16, i32, L4_loadrh_rr>;
+  def: Loadxs_pat<load,        i32, L4_loadri_rr>;
+  def: Loadxs_pat<load,        i64, L4_loadrd_rr>;
+}
+
+// 'def pats' for load instruction base + register offset and
+// zero immediate value.
+class Loadxs_simple_pat<PatFrag Load, ValueType VT, InstHexagon MI>
+  : Pat<(VT (Load (add (i32 IntRegs:$Rs), (i32 IntRegs:$Rt)))),
+        (VT (MI IntRegs:$Rs, IntRegs:$Rt, 0))>;
+
+let AddedComplexity = 20 in {
+  def: Loadxs_simple_pat<extloadi8,   i32, L4_loadrub_rr>;
+  def: Loadxs_simple_pat<zextloadi8,  i32, L4_loadrub_rr>;
+  def: Loadxs_simple_pat<sextloadi8,  i32, L4_loadrb_rr>;
+  def: Loadxs_simple_pat<extloadi16,  i32, L4_loadruh_rr>;
+  def: Loadxs_simple_pat<zextloadi16, i32, L4_loadruh_rr>;
+  def: Loadxs_simple_pat<sextloadi16, i32, L4_loadrh_rr>;
+  def: Loadxs_simple_pat<load,        i32, L4_loadri_rr>;
+  def: Loadxs_simple_pat<load,        i64, L4_loadrd_rr>;
+}
 
 // zext i1->i64
-def:  Pat <(i64 (zextloadi1 ADDRriS11_0:$src1)),
-      (i64 (COMBINE_Ir_V4 0, (LDriub ADDRriS11_0:$src1)))>,
-      Requires<[HasV4T]>;
-
-let AddedComplexity = 20 in
-def:  Pat <(i64 (zextloadi1 (add (i32 IntRegs:$src1),
-                                s11_0ExtPred:$offset))),
-      (i64 (COMBINE_Ir_V4 0, (LDriub_indexed IntRegs:$src1,
-                                  s11_0ExtPred:$offset)))>,
-      Requires<[HasV4T]>;
-
-// zext i16->i64
-def:  Pat <(i64 (zextloadi16 ADDRriS11_1:$src1)),
-      (i64 (COMBINE_Ir_V4 0, (LDriuh ADDRriS11_1:$src1)))>,
-      Requires<[HasV4T]>;
-
-let AddedComplexity = 20 in
-def:  Pat <(i64 (zextloadi16 (add (i32 IntRegs:$src1),
-                                  s11_1ExtPred:$offset))),
-      (i64 (COMBINE_Ir_V4 0, (LDriuh_indexed IntRegs:$src1,
-                                  s11_1ExtPred:$offset)))>,
-      Requires<[HasV4T]>;
-
-// anyext i16->i64
-def:  Pat <(i64 (extloadi16 ADDRriS11_2:$src1)),
-      (i64 (COMBINE_Ir_V4 0, (LDrih ADDRriS11_2:$src1)))>,
-      Requires<[HasV4T]>;
-
-let AddedComplexity = 20 in
-def:  Pat <(i64 (extloadi16 (add (i32 IntRegs:$src1),
-                                  s11_1ExtPred:$offset))),
-      (i64 (COMBINE_Ir_V4 0, (LDrih_indexed IntRegs:$src1,
-                                  s11_1ExtPred:$offset)))>,
-      Requires<[HasV4T]>;
+def: Pat<(i64 (zext (i1 PredRegs:$src1))),
+         (Zext64 (C2_muxii PredRegs:$src1, 1, 0))>;
+
+// zext i32->i64
+def: Pat<(i64 (zext (i32 IntRegs:$src1))),
+         (Zext64 IntRegs:$src1)>;
 
 // zext i32->i64
 def:  Pat <(i64 (zextloadi32 ADDRriS11_2:$src1)),
-      (i64 (COMBINE_Ir_V4 0, (LDriw ADDRriS11_2:$src1)))>,
-      Requires<[HasV4T]>;
+      (i64 (A4_combineir 0, (L2_loadri_io AddrFI:$src1, 0)))>;
 
 let AddedComplexity = 100 in
 def:  Pat <(i64 (zextloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
-      (i64 (COMBINE_Ir_V4 0, (LDriw_indexed IntRegs:$src1,
-                                  s11_2ExtPred:$offset)))>,
-      Requires<[HasV4T]>;
+      (i64 (A4_combineir 0, (L2_loadri_io IntRegs:$src1,
+                                  s11_2ExtPred:$offset)))>;
 
 // anyext i32->i64
 def:  Pat <(i64 (extloadi32 ADDRriS11_2:$src1)),
-      (i64 (COMBINE_Ir_V4 0, (LDriw ADDRriS11_2:$src1)))>,
-      Requires<[HasV4T]>;
-
-let AddedComplexity = 100 in
-def:  Pat <(i64 (extloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
-      (i64 (COMBINE_Ir_V4 0, (LDriw_indexed IntRegs:$src1,
-                                  s11_2ExtPred:$offset)))>,
-      Requires<[HasV4T]>;
-
-
+      (i64 (A4_combineir 0, (L2_loadri_io AddrFI:$src1, 0)))>;
 
 //===----------------------------------------------------------------------===//
 // LD -
@@ -467,194 +659,357 @@ def:  Pat <(i64 (extloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
 //===----------------------------------------------------------------------===//
 // Template class for store instructions with Absolute set addressing mode.
 //===----------------------------------------------------------------------===//
-let isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT,
-addrMode = AbsoluteSet in
-class T_ST_abs_set<string mnemonic, RegisterClass RC>:
-            STInst2<(outs IntRegs:$dst1),
-            (ins RC:$src1, u0AlwaysExt:$src2),
-            mnemonic#"($dst1=##$src2) = $src1",
-            []>,
-            Requires<[HasV4T]>;
+let isExtended = 1, opExtendable = 1, opExtentBits = 6,
+    addrMode = AbsoluteSet, isNVStorable = 1 in
+class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC,
+                   bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
+  : STInst<(outs IntRegs:$dst),
+           (ins u6Ext:$addr, RC:$src),
+    mnemonic#"($dst = #$addr) = $src"#!if(isHalf, ".h","")>, NewValueRel {
+    bits<5> dst;
+    bits<6> addr;
+    bits<5> src;
+    let accessSize = AccessSz;
+    let BaseOpcode = BaseOp#"_AbsSet";
+
+    let IClass = 0b1010;
+
+    let Inst{27-24} = 0b1011;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = dst;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = src;
+    let Inst{7}     = 0b1;
+    let Inst{5-0}   = addr;
+  }
 
-def STrid_abs_set_V4 : T_ST_abs_set <"memd", DoubleRegs>;
-def STrib_abs_set_V4 : T_ST_abs_set <"memb", IntRegs>;
-def STrih_abs_set_V4 : T_ST_abs_set <"memh", IntRegs>;
-def STriw_abs_set_V4 : T_ST_abs_set <"memw", IntRegs>;
+def S4_storerb_ap : T_ST_absset <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
+def S4_storerh_ap : T_ST_absset <"memh", "STrih", IntRegs, 0b010,
+                                 HalfWordAccess>;
+def S4_storeri_ap : T_ST_absset <"memw", "STriw", IntRegs, 0b100, WordAccess>;
 
-//===----------------------------------------------------------------------===//
-// multiclass for store instructions with base + register offset addressing
-// mode
-//===----------------------------------------------------------------------===//
-multiclass ST_Idxd_shl_Pbase<string mnemonic, RegisterClass RC, bit isNot,
-                             bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : STInst2<(outs),
-            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
-                 RC:$src5),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#mnemonic#"($src2+$src3<<#$src4) = $src5",
-            []>,
-            Requires<[HasV4T]>;
+let isNVStorable = 0 in {
+  def S4_storerf_ap : T_ST_absset <"memh", "STrif", IntRegs,
+                                   0b011, HalfWordAccess, 1>;
+  def S4_storerd_ap : T_ST_absset <"memd", "STrid", DoubleRegs,
+                                   0b110, DoubleWordAccess>;
 }
 
-multiclass ST_Idxd_shl_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ST_Idxd_shl_Pbase<mnemonic, RC, PredNot, 0>;
-    // Predicate new
-    defm _cdn#NAME : ST_Idxd_shl_Pbase<mnemonic, RC, PredNot, 1>;
+let opExtendable = 1, isNewValue = 1, isNVStore = 1, opNewValue = 2,
+isExtended = 1, opExtentBits= 6 in
+class T_ST_absset_nv <string mnemonic, string BaseOp, bits<2> MajOp,
+                      MemAccessSize AccessSz >
+  : NVInst <(outs IntRegs:$dst),
+            (ins u6Ext:$addr, IntRegs:$src),
+    mnemonic#"($dst = #$addr) = $src.new">, NewValueRel {
+    bits<5> dst;
+    bits<6> addr;
+    bits<3> src;
+    let accessSize = AccessSz;
+    let BaseOpcode = BaseOp#"_AbsSet";
+
+    let IClass = 0b1010;
+
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = dst;
+    let Inst{13-11} = 0b000;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src;
+    let Inst{7}     = 0b1;
+    let Inst{5-0}   = addr;
   }
+
+let mayStore = 1, addrMode = AbsoluteSet in {
+  def S4_storerbnew_ap : T_ST_absset_nv <"memb", "STrib", 0b00, ByteAccess>;
+  def S4_storerhnew_ap : T_ST_absset_nv <"memh", "STrih", 0b01, HalfWordAccess>;
+  def S4_storerinew_ap : T_ST_absset_nv <"memw", "STriw", 0b10, WordAccess>;
 }
 
-let isNVStorable = 1 in
-multiclass ST_Idxd_shl<string mnemonic, string CextOp, RegisterClass RC> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
-    let isPredicable = 1 in
-    def NAME#_V4 : STInst2<(outs),
-            (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, RC:$src4),
-            mnemonic#"($src1+$src2<<#$src3) = $src4",
-            []>,
-            Requires<[HasV4T]>;
-
-    let isPredicated = 1 in {
-      defm Pt_V4 : ST_Idxd_shl_Pred<mnemonic, RC, 0 >;
-      defm NotPt_V4 : ST_Idxd_shl_Pred<mnemonic, RC, 1>;
-    }
-  }
+let isExtended = 1, opExtendable = 2, opExtentBits = 6, InputType = "imm",
+addrMode = BaseLongOffset, AddedComplexity = 40 in
+class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC,
+                     bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
+  : STInst<(outs),
+           (ins IntRegs:$src1, u2Imm:$src2, u6Ext:$src3, RC:$src4),
+   mnemonic#"($src1<<#$src2 + #$src3) = $src4"#!if(isHalf, ".h",""),
+   []>, ImmRegShl, NewValueRel {
+
+    bits<5> src1;
+    bits<2> src2;
+    bits<6> src3;
+    bits<5> src4;
+
+    let accessSize = AccessSz;
+    let CextOpcode = CextOp;
+    let BaseOpcode = CextOp#"_shl";
+    let IClass = 0b1010;
+
+    let Inst{27-24} =0b1101;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13}    = src2{1};
+    let Inst{12-8}  = src4;
+    let Inst{7}     = 0b1;
+    let Inst{6}     = src2{0};
+    let Inst{5-0}   = src3;
 }
 
-// multiclass for new-value store instructions with base + register offset
-// addressing mode.
-multiclass ST_Idxd_shl_Pbase_nv<string mnemonic, RegisterClass RC, bit isNot,
-                             bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME#_nv_V4 : NVInst_V4<(outs),
-            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
-                 RC:$src5),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#mnemonic#"($src2+$src3<<#$src4) = $src5.new",
-            []>,
-            Requires<[HasV4T]>;
+def S4_storerb_ur : T_StoreAbsReg <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
+def S4_storerh_ur : T_StoreAbsReg <"memh", "STrih", IntRegs, 0b010,
+                                   HalfWordAccess>;
+def S4_storerf_ur : T_StoreAbsReg <"memh", "STrif", IntRegs, 0b011,
+                                   HalfWordAccess, 1>;
+def S4_storeri_ur : T_StoreAbsReg <"memw", "STriw", IntRegs, 0b100, WordAccess>;
+def S4_storerd_ur : T_StoreAbsReg <"memd", "STrid", DoubleRegs, 0b110,
+                                   DoubleWordAccess>;
+
+let AddedComplexity = 40 in
+multiclass T_StoreAbsReg_Pats <InstHexagon MI, RegisterClass RC, ValueType VT,
+                           PatFrag stOp> {
+ def : Pat<(stOp (VT RC:$src4),
+                 (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2),
+                      u0AlwaysExtPred:$src3)),
+          (MI IntRegs:$src1, u2ImmPred:$src2, u0AlwaysExtPred:$src3, RC:$src4)>;
+
+ def : Pat<(stOp (VT RC:$src4),
+                 (add (shl IntRegs:$src1, u2ImmPred:$src2),
+                      (HexagonCONST32 tglobaladdr:$src3))),
+           (MI IntRegs:$src1, u2ImmPred:$src2, tglobaladdr:$src3, RC:$src4)>;
+
+ def : Pat<(stOp (VT RC:$src4),
+                 (add IntRegs:$src1, (HexagonCONST32 tglobaladdr:$src3))),
+           (MI IntRegs:$src1, 0, tglobaladdr:$src3, RC:$src4)>;
 }
 
-multiclass ST_Idxd_shl_Pred_nv<string mnemonic, RegisterClass RC, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ST_Idxd_shl_Pbase_nv<mnemonic, RC, PredNot, 0>;
-    // Predicate new
-    defm _cdn#NAME : ST_Idxd_shl_Pbase_nv<mnemonic, RC, PredNot, 1>;
+defm : T_StoreAbsReg_Pats <S4_storerd_ur, DoubleRegs, i64, store>;
+defm : T_StoreAbsReg_Pats <S4_storeri_ur, IntRegs, i32, store>;
+defm : T_StoreAbsReg_Pats <S4_storerb_ur, IntRegs, i32, truncstorei8>;
+defm : T_StoreAbsReg_Pats <S4_storerh_ur, IntRegs, i32, truncstorei16>;
+
+let mayStore = 1, isNVStore = 1, isExtended = 1, addrMode = BaseLongOffset,
+    opExtentBits = 6, isNewValue = 1, opNewValue = 3, opExtendable = 2 in
+class T_StoreAbsRegNV <string mnemonic, string CextOp, bits<2> MajOp,
+                       MemAccessSize AccessSz>
+  : NVInst <(outs ),
+            (ins IntRegs:$src1, u2Imm:$src2, u6Ext:$src3, IntRegs:$src4),
+  mnemonic#"($src1<<#$src2 + #$src3) = $src4.new">, NewValueRel {
+    bits<5> src1;
+    bits<2> src2;
+    bits<6> src3;
+    bits<3> src4;
+
+    let CextOpcode  = CextOp;
+    let BaseOpcode  = CextOp#"_shl";
+    let IClass      = 0b1010;
+
+    let Inst{27-21} = 0b1101101;
+    let Inst{12-11} = 0b00;
+    let Inst{7}     = 0b1;
+    let Inst{20-16} = src1;
+    let Inst{13}    = src2{1};
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src4;
+    let Inst{6}     = src2{0};
+    let Inst{5-0}   = src3;
   }
-}
 
-let mayStore = 1, isNVStore = 1 in
-multiclass ST_Idxd_shl_nv<string mnemonic, string CextOp, RegisterClass RC> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
-    let isPredicable = 1 in
-    def NAME#_nv_V4 : NVInst_V4<(outs),
-            (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, RC:$src4),
-            mnemonic#"($src1+$src2<<#$src3) = $src4.new",
-            []>,
-            Requires<[HasV4T]>;
-
-    let isPredicated = 1 in {
-      defm Pt : ST_Idxd_shl_Pred_nv<mnemonic, RC, 0 >;
-      defm NotPt : ST_Idxd_shl_Pred_nv<mnemonic, RC, 1>;
-    }
+def S4_storerbnew_ur : T_StoreAbsRegNV <"memb", "STrib", 0b00, ByteAccess>;
+def S4_storerhnew_ur : T_StoreAbsRegNV <"memh", "STrih", 0b01, HalfWordAccess>;
+def S4_storerinew_ur : T_StoreAbsRegNV <"memw", "STriw", 0b10, WordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Template classes for the non-predicated store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+let isPredicable = 1 in
+class T_store_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isH>
+  : STInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2Imm:$u2, RC:$Rt),
+  mnemonic#"($Rs + $Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
+  [],"",V4LDST_tc_st_SLOT01>, ImmRegShl, AddrModeRel {
+
+    bits<5> Rs;
+    bits<5> Ru;
+    bits<2> u2;
+    bits<5> Rt;
+
+    let IClass = 0b0011;
+
+    let Inst{27-24} = 0b1011;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Ru;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{4-0}   = Rt;
   }
-}
 
-let addrMode = BaseRegOffset, neverHasSideEffects = 1,
-validSubTargets = HasV4SubT in {
-  let accessSize = ByteAccess in
-    defm STrib_indexed_shl: ST_Idxd_shl<"memb", "STrib", IntRegs>,
-                            ST_Idxd_shl_nv<"memb", "STrib", IntRegs>, AddrModeRel;
+//===----------------------------------------------------------------------===//
+// Template classes for the predicated store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+let isPredicated = 1 in
+class T_pstore_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
+                   bit isNot, bit isPredNew, bit isH>
+  : STInst <(outs),
+            (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2Imm:$u2, RC:$Rt),
+
+  !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
+  [], "", V4LDST_tc_st_SLOT01> , AddrModeRel{
+    bits<2> Pv;
+    bits<5> Rs;
+    bits<5> Ru;
+    bits<2> u2;
+    bits<5> Rt;
+
+    let isPredicatedFalse = isNot;
+    let isPredicatedNew = isPredNew;
 
-  let accessSize = HalfWordAccess in
-    defm STrih_indexed_shl: ST_Idxd_shl<"memh", "STrih", IntRegs>,
-                            ST_Idxd_shl_nv<"memh", "STrih", IntRegs>, AddrModeRel;
+    let IClass = 0b0011;
 
-  let accessSize = WordAccess in
-    defm STriw_indexed_shl: ST_Idxd_shl<"memw", "STriw", IntRegs>,
-                            ST_Idxd_shl_nv<"memw", "STriw", IntRegs>, AddrModeRel;
+    let Inst{27-26} = 0b01;
+    let Inst{25}    = isPredNew;
+    let Inst{24}    = isNot;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Ru;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{6-5}   = Pv;
+    let Inst{4-0}   = Rt;
+  }
 
-  let isNVStorable = 0, accessSize = DoubleWordAccess in
-    defm STrid_indexed_shl: ST_Idxd_shl<"memd", "STrid", DoubleRegs>, AddrModeRel;
-}
+//===----------------------------------------------------------------------===//
+// Template classes for the new-value store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+let isPredicable = 1, isNewValue = 1, opNewValue = 3 in
+class T_store_new_rr <string mnemonic, bits<2> MajOp> :
+  NVInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2Imm:$u2, IntRegs:$Nt),
+  mnemonic#"($Rs + $Ru<<#$u2) = $Nt.new",
+  [],"",V4LDST_tc_st_SLOT0>, ImmRegShl, AddrModeRel {
 
-let Predicates = [HasV4T], AddedComplexity = 10 in {
-def : Pat<(truncstorei8 (i32 IntRegs:$src4),
-                       (add IntRegs:$src1, (shl IntRegs:$src2,
-                                                u2ImmPred:$src3))),
-          (STrib_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
-                                u2ImmPred:$src3, IntRegs:$src4)>;
+    bits<5> Rs;
+    bits<5> Ru;
+    bits<2> u2;
+    bits<3> Nt;
 
-def : Pat<(truncstorei16 (i32 IntRegs:$src4),
-                        (add IntRegs:$src1, (shl IntRegs:$src2,
-                                                 u2ImmPred:$src3))),
-          (STrih_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
-                                u2ImmPred:$src3, IntRegs:$src4)>;
+    let IClass = 0b0011;
 
-def : Pat<(store (i32 IntRegs:$src4),
-                 (add IntRegs:$src1, (shl IntRegs:$src2, u2ImmPred:$src3))),
-          (STriw_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
-                                u2ImmPred:$src3, IntRegs:$src4)>;
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Ru;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{4-3}   = MajOp;
+    let Inst{2-0}   = Nt;
+  }
 
-def : Pat<(store (i64 DoubleRegs:$src4),
-                (add IntRegs:$src1, (shl IntRegs:$src2, u2ImmPred:$src3))),
-          (STrid_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
-                                u2ImmPred:$src3, DoubleRegs:$src4)>;
-}
+//===----------------------------------------------------------------------===//
+// Template classes for the predicated new-value store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, isNewValue = 1, opNewValue = 4 in
+class T_pstore_new_rr <string mnemonic, bits<2> MajOp, bit isNot, bit isPredNew>
+  : NVInst<(outs),
+           (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2Imm:$u2, IntRegs:$Nt),
+   !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
+   ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Nt.new",
+   [], "", V4LDST_tc_st_SLOT0>, AddrModeRel {
+    bits<2> Pv;
+    bits<5> Rs;
+    bits<5> Ru;
+    bits<2> u2;
+    bits<3> Nt;
+
+    let isPredicatedFalse = isNot;
+    let isPredicatedNew = isPredNew;
 
-let isExtended = 1, opExtendable = 2 in
-class T_ST_LongOff <string mnemonic, PatFrag stOp, RegisterClass RC, ValueType VT> :
-            STInst<(outs),
-            (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, RC:$src4),
-            mnemonic#"($src1<<#$src2+##$src3) = $src4",
-            [(stOp (VT RC:$src4),
-                    (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2),
-                         u0AlwaysExtPred:$src3))]>,
-            Requires<[HasV4T]>;
+    let IClass = 0b0011;
+    let Inst{27-26} = 0b01;
+    let Inst{25}    = isPredNew;
+    let Inst{24}    = isNot;
+    let Inst{23-21} = 0b101;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Ru;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{6-5}   = Pv;
+    let Inst{4-3}   = MajOp;
+    let Inst{2-0}   = Nt;
+  }
 
-let isExtended = 1, opExtendable = 2, mayStore = 1, isNVStore = 1 in
-class T_ST_LongOff_nv <string mnemonic> :
-            NVInst_V4<(outs),
-            (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4),
-            mnemonic#"($src1<<#$src2+##$src3) = $src4.new",
-            []>,
-            Requires<[HasV4T]>;
+//===----------------------------------------------------------------------===//
+// multiclass for store instructions with base + register offset addressing
+// mode
+//===----------------------------------------------------------------------===//
+let isNVStorable = 1 in
+multiclass ST_Idxd_shl<string mnemonic, string CextOp, RegisterClass RC,
+                       bits<3> MajOp, bit isH = 0> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+    def S4_#NAME#_rr : T_store_rr <mnemonic, RC, MajOp, isH>;
 
-multiclass ST_LongOff <string mnemonic, string BaseOp, PatFrag stOp> {
-  let  BaseOpcode = BaseOp#"_shl" in {
-    let isNVStorable = 1 in
-    def NAME#_V4 : T_ST_LongOff<mnemonic, stOp, IntRegs, i32>;
+    // Predicated
+    def S4_p#NAME#t_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 0, isH>;
+    def S4_p#NAME#f_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 0, isH>;
 
-    def NAME#_nv_V4 : T_ST_LongOff_nv<mnemonic>;
+    // Predicated new
+    def S4_p#NAME#tnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 1, isH>;
+    def S4_p#NAME#fnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 1, isH>;
   }
 }
 
-let AddedComplexity = 10, validSubTargets = HasV4SubT in {
-  def STrid_shl_V4 : T_ST_LongOff<"memd", store, DoubleRegs, i64>;
-  defm STrib_shl   : ST_LongOff <"memb", "STrib", truncstorei8>, NewValueRel;
-  defm STrih_shl   : ST_LongOff <"memh", "Strih", truncstorei16>, NewValueRel;
-  defm STriw_shl   : ST_LongOff <"memw", "STriw", store>, NewValueRel;
+//===----------------------------------------------------------------------===//
+// multiclass for new-value store instructions with base + register offset
+// addressing mode.
+//===----------------------------------------------------------------------===//
+let mayStore = 1, isNVStore = 1 in
+multiclass ST_Idxd_shl_nv <string mnemonic, string CextOp, RegisterClass RC,
+                           bits<2> MajOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+    def S4_#NAME#new_rr : T_store_new_rr<mnemonic, MajOp>;
+
+    // Predicated
+    def S4_p#NAME#newt_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 0>;
+    def S4_p#NAME#newf_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 0>;
+
+    // Predicated new
+    def S4_p#NAME#newtnew_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 1>;
+    def S4_p#NAME#newfnew_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 1>;
+  }
 }
 
-let AddedComplexity = 40 in
-multiclass T_ST_LOff_Pats <InstHexagon I, RegisterClass RC, ValueType VT,
-                           PatFrag stOp> {
- def : Pat<(stOp (VT RC:$src4),
-           (add (shl IntRegs:$src1, u2ImmPred:$src2),
-               (NumUsesBelowThresCONST32 tglobaladdr:$src3))),
-           (I IntRegs:$src1, u2ImmPred:$src2, tglobaladdr:$src3, RC:$src4)>;
+let addrMode = BaseRegOffset, InputType = "reg", hasSideEffects = 0 in {
+  let accessSize = ByteAccess in
+  defm storerb: ST_Idxd_shl<"memb", "STrib", IntRegs, 0b000>,
+                ST_Idxd_shl_nv<"memb", "STrib", IntRegs, 0b00>;
 
- def : Pat<(stOp (VT RC:$src4),
-           (add IntRegs:$src1,
-               (NumUsesBelowThresCONST32 tglobaladdr:$src3))),
-           (I IntRegs:$src1, 0, tglobaladdr:$src3, RC:$src4)>;
+  let accessSize = HalfWordAccess in
+  defm storerh: ST_Idxd_shl<"memh", "STrih", IntRegs, 0b010>,
+                ST_Idxd_shl_nv<"memh", "STrih", IntRegs, 0b01>;
+
+  let accessSize = WordAccess in
+  defm storeri: ST_Idxd_shl<"memw", "STriw", IntRegs, 0b100>,
+                ST_Idxd_shl_nv<"memw", "STriw", IntRegs, 0b10>;
+
+  let isNVStorable = 0, accessSize = DoubleWordAccess in
+  defm storerd: ST_Idxd_shl<"memd", "STrid", DoubleRegs, 0b110>;
+
+  let isNVStorable = 0, accessSize = HalfWordAccess in
+  defm storerf: ST_Idxd_shl<"memh", "STrif", IntRegs, 0b011, 1>;
 }
 
-defm : T_ST_LOff_Pats<STrid_shl_V4, DoubleRegs, i64, store>;
-defm : T_ST_LOff_Pats<STriw_shl_V4, IntRegs, i32, store>;
-defm : T_ST_LOff_Pats<STrib_shl_V4, IntRegs, i32, truncstorei8>;
-defm : T_ST_LOff_Pats<STrih_shl_V4, IntRegs, i32, truncstorei16>;
+class Storexs_pat<PatFrag Store, PatFrag Value, InstHexagon MI>
+  : Pat<(Store Value:$Ru, (add (i32 IntRegs:$Rs),
+                               (i32 (shl (i32 IntRegs:$Rt), u2ImmPred:$u2)))),
+        (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2, Value:$Ru)>;
+
+let AddedComplexity = 40 in {
+  def: Storexs_pat<truncstorei8,  I32, S4_storerb_rr>;
+  def: Storexs_pat<truncstorei16, I32, S4_storerh_rr>;
+  def: Storexs_pat<store,         I32, S4_storeri_rr>;
+  def: Storexs_pat<store,         I64, S4_storerd_rr>;
+}
 
 // memd(Rx++#s4:3)=Rtt
 // memd(Rx++#s4:3:circ(Mu))=Rtt
@@ -668,75 +1023,151 @@ defm : T_ST_LOff_Pats<STrih_shl_V4, IntRegs, i32, truncstorei16>;
 // TODO: needs to be implemented.
 
 //===----------------------------------------------------------------------===//
+// Template class
+//===----------------------------------------------------------------------===//
+let isPredicable = 1, isExtendable = 1, isExtentSigned = 1, opExtentBits = 8,
+    opExtendable = 2 in
+class T_StoreImm <string mnemonic, Operand OffsetOp, bits<2> MajOp >
+  : STInst <(outs ), (ins IntRegs:$Rs, OffsetOp:$offset, s8Ext:$S8),
+  mnemonic#"($Rs+#$offset)=#$S8",
+  [], "", V4LDST_tc_st_SLOT01>,
+  ImmRegRel, PredNewRel {
+    bits<5> Rs;
+    bits<8> S8;
+    bits<8> offset;
+    bits<6> offsetBits;
+
+    string OffsetOpStr = !cast<string>(OffsetOp);
+    let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
+                     !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
+                                         /* u6_0Imm */ offset{5-0}));
+
+    let IClass = 0b0011;
+
+    let Inst{27-25} = 0b110;
+    let Inst{22-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{12-7}  = offsetBits;
+    let Inst{13}    = S8{7};
+    let Inst{6-0}   = S8{6-0};
+  }
+
+let isPredicated = 1, isExtendable = 1, isExtentSigned = 1, opExtentBits = 6,
+    opExtendable = 3 in
+class T_StoreImm_pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
+                       bit isPredNot, bit isPredNew >
+  : STInst <(outs ),
+            (ins PredRegs:$Pv, IntRegs:$Rs, OffsetOp:$offset, s6Ext:$S6),
+  !if(isPredNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($Rs+#$offset)=#$S6",
+  [], "", V4LDST_tc_st_SLOT01>,
+  ImmRegRel, PredNewRel {
+    bits<2> Pv;
+    bits<5> Rs;
+    bits<6> S6;
+    bits<8> offset;
+    bits<6> offsetBits;
+
+    string OffsetOpStr = !cast<string>(OffsetOp);
+    let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
+                     !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
+                                         /* u6_0Imm */ offset{5-0}));
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isPredNot;
+
+    let IClass = 0b0011;
+
+    let Inst{27-25} = 0b100;
+    let Inst{24}    = isPredNew;
+    let Inst{23}    = isPredNot;
+    let Inst{22-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = S6{5};
+    let Inst{12-7}  = offsetBits;
+    let Inst{6-5}   = Pv;
+    let Inst{4-0}   = S6{4-0};
+  }
+
+
+//===----------------------------------------------------------------------===//
 // multiclass for store instructions with base + immediate offset
 // addressing mode and immediate stored value.
 // mem[bhw](Rx++#s4:3)=#s8
 // if ([!]Pv[.new]) mem[bhw](Rx++#s4:3)=#s6
 //===----------------------------------------------------------------------===//
-multiclass ST_Imm_Pbase<string mnemonic, Operand OffsetOp, bit isNot,
-                        bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : STInst2<(outs),
-            (ins PredRegs:$src1, IntRegs:$src2, OffsetOp:$src3, s6Ext:$src4),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#mnemonic#"($src2+#$src3) = #$src4",
-            []>,
-            Requires<[HasV4T]>;
-}
 
-multiclass ST_Imm_Pred<string mnemonic, Operand OffsetOp, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ST_Imm_Pbase<mnemonic, OffsetOp, PredNot, 0>;
-    // Predicate new
-    defm _cdn#NAME : ST_Imm_Pbase<mnemonic, OffsetOp, PredNot, 1>;
-  }
+multiclass ST_Imm_Pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
+                        bit PredNot> {
+  def _io    : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 0>;
+  // Predicate new
+  def new_io : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 1>;
 }
 
-let isExtendable = 1, isExtentSigned = 1, neverHasSideEffects = 1 in
-multiclass ST_Imm<string mnemonic, string CextOp, Operand OffsetOp> {
+multiclass ST_Imm <string mnemonic, string CextOp, Operand OffsetOp,
+                   bits<2> MajOp> {
   let CextOpcode = CextOp, BaseOpcode = CextOp#_imm in {
-    let opExtendable = 2, opExtentBits = 8, isPredicable = 1 in
-    def NAME#_V4 : STInst2<(outs),
-            (ins IntRegs:$src1, OffsetOp:$src2, s8Ext:$src3),
-            mnemonic#"($src1+#$src2) = #$src3",
-            []>,
-            Requires<[HasV4T]>;
-
-    let opExtendable = 3, opExtentBits = 6, isPredicated = 1 in {
-      defm Pt_V4 : ST_Imm_Pred<mnemonic, OffsetOp, 0>;
-      defm NotPt_V4 : ST_Imm_Pred<mnemonic, OffsetOp, 1 >;
-    }
+    def _io : T_StoreImm <mnemonic, OffsetOp, MajOp>;
+
+    defm t : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 0>;
+    defm f : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 1>;
   }
 }
 
-let addrMode = BaseImmOffset, InputType = "imm",
-validSubTargets = HasV4SubT in {
+let hasSideEffects = 0, addrMode = BaseImmOffset,
+    InputType = "imm" in {
   let accessSize = ByteAccess in
-    defm STrib_imm : ST_Imm<"memb", "STrib", u6_0Imm>, ImmRegRel, PredNewRel;
+  defm S4_storeirb : ST_Imm<"memb", "STrib", u6_0Imm, 0b00>;
 
   let accessSize = HalfWordAccess in
-    defm STrih_imm : ST_Imm<"memh", "STrih", u6_1Imm>, ImmRegRel, PredNewRel;
+  defm S4_storeirh : ST_Imm<"memh", "STrih", u6_1Imm, 0b01>;
 
   let accessSize = WordAccess in
-    defm STriw_imm : ST_Imm<"memw", "STriw", u6_2Imm>, ImmRegRel, PredNewRel;
+  defm S4_storeiri : ST_Imm<"memw", "STriw", u6_2Imm, 0b10>;
 }
 
-let Predicates = [HasV4T], AddedComplexity = 10 in {
-def: Pat<(truncstorei8 s8ExtPred:$src3, (add IntRegs:$src1, u6_0ImmPred:$src2)),
-            (STrib_imm_V4 IntRegs:$src1, u6_0ImmPred:$src2, s8ExtPred:$src3)>;
+def IMM_BYTE : SDNodeXForm<imm, [{
+  // -1 etc is  represented as 255 etc
+  // assigning to a byte restores our desired signed value.
+  int8_t imm = N->getSExtValue();
+  return CurDAG->getTargetConstant(imm, MVT::i32);
+}]>;
 
-def: Pat<(truncstorei16 s8ExtPred:$src3, (add IntRegs:$src1,
-                                              u6_1ImmPred:$src2)),
-            (STrih_imm_V4 IntRegs:$src1, u6_1ImmPred:$src2, s8ExtPred:$src3)>;
+def IMM_HALF : SDNodeXForm<imm, [{
+  // -1 etc is  represented as 65535 etc
+  // assigning to a short restores our desired signed value.
+  int16_t imm = N->getSExtValue();
+  return CurDAG->getTargetConstant(imm, MVT::i32);
+}]>;
 
-def: Pat<(store s8ExtPred:$src3, (add IntRegs:$src1, u6_2ImmPred:$src2)),
-            (STriw_imm_V4 IntRegs:$src1, u6_2ImmPred:$src2, s8ExtPred:$src3)>;
+def IMM_WORD : SDNodeXForm<imm, [{
+  // -1 etc can be represented as 4294967295 etc
+  // Currently, it's not doing this. But some optimization
+  // might convert -1 to a large +ve number.
+  // assigning to a word restores our desired signed value.
+  int32_t imm = N->getSExtValue();
+  return CurDAG->getTargetConstant(imm, MVT::i32);
+}]>;
+
+def ToImmByte : OutPatFrag<(ops node:$R), (IMM_BYTE $R)>;
+def ToImmHalf : OutPatFrag<(ops node:$R), (IMM_HALF $R)>;
+def ToImmWord : OutPatFrag<(ops node:$R), (IMM_WORD $R)>;
+
+let AddedComplexity = 40 in {
+  // Not using frameindex patterns for these stores, because the offset
+  // is not extendable. This could cause problems during removing the frame
+  // indices, since the offset with respect to R29/R30 may not fit in the
+  // u6 field.
+  def: Storexm_add_pat<truncstorei8, s8ExtPred, u6_0ImmPred, ToImmByte,
+                       S4_storeirb_io>;
+  def: Storexm_add_pat<truncstorei16, s8ExtPred, u6_1ImmPred, ToImmHalf,
+                       S4_storeirh_io>;
+  def: Storexm_add_pat<store, s8ExtPred, u6_2ImmPred, ToImmWord,
+                       S4_storeiri_io>;
 }
 
-let AddedComplexity = 6 in
-def : Pat <(truncstorei8 s8ExtPred:$src2, (i32 IntRegs:$src1)),
-           (STrib_imm_V4 IntRegs:$src1, 0, s8ExtPred:$src2)>,
-           Requires<[HasV4T]>;
+def: Storexm_simple_pat<truncstorei8,  s8ExtPred, ToImmByte, S4_storeirb_io>;
+def: Storexm_simple_pat<truncstorei16, s8ExtPred, ToImmHalf, S4_storeirh_io>;
+def: Storexm_simple_pat<store,         s8ExtPred, ToImmWord, S4_storeiri_io>;
 
 // memb(Rx++#s4:0:circ(Mu))=Rt
 // memb(Rx++I:circ(Mu))=Rt
@@ -744,16 +1175,10 @@ def : Pat <(truncstorei8 s8ExtPred:$src2, (i32 IntRegs:$src1)),
 // memb(Rx++Mu:brev)=Rt
 // memb(gp+#u16:0)=Rt
 
-
 // Store halfword.
 // TODO: needs to be implemented
 // memh(Re=#U6)=Rt.H
 // memh(Rs+#s11:1)=Rt.H
-let AddedComplexity = 6 in
-def : Pat <(truncstorei16 s8ExtPred:$src2, (i32 IntRegs:$src1)),
-           (STrih_imm_V4 IntRegs:$src1, 0, s8ExtPred:$src2)>,
-           Requires<[HasV4T]>;
-
 // memh(Rs+Ru<<#u2)=Rt.H
 // TODO: needs to be implemented.
 
@@ -770,7 +1195,6 @@ def : Pat <(truncstorei16 s8ExtPred:$src2, (i32 IntRegs:$src1)),
 // if ([!]Pv[.new]) memh(#u6)=Rt.H
 // if ([!]Pv[.new]) memh(#u6)=Rt
 
-
 // if ([!]Pv[.new]) memh(Rs+#u6:1)=Rt.H
 // TODO: needs to be implemented.
 
@@ -780,20 +1204,6 @@ def : Pat <(truncstorei16 s8ExtPred:$src2, (i32 IntRegs:$src1)),
 // Store word.
 // memw(Re=#U6)=Rt
 // TODO: Needs to be implemented.
-
-// Store predicate:
-let neverHasSideEffects = 1 in
-def STriw_pred_V4 : STInst2<(outs),
-            (ins MEMri:$addr, PredRegs:$src1),
-            "Error; should not emit",
-            []>,
-            Requires<[HasV4T]>;
-
-let AddedComplexity = 6 in
-def : Pat <(store s8ExtPred:$src2, (i32 IntRegs:$src1)),
-           (STriw_imm_V4 IntRegs:$src1, 0, s8ExtPred:$src2)>,
-           Requires<[HasV4T]>;
-
 // memw(Rx++#s4:2)=Rt
 // memw(Rx++#s4:2:circ(Mu))=Rt
 // memw(Rx++I:circ(Mu))=Rt
@@ -809,175 +1219,285 @@ def : Pat <(store s8ExtPred:$src2, (i32 IntRegs:$src1)),
 // NV/ST +
 //===----------------------------------------------------------------------===//
 
-// multiclass for new-value store instructions with base + immediate offset.
-//
-multiclass ST_Idxd_Pbase_nv<string mnemonic, RegisterClass RC,
-                            Operand predImmOp, bit isNot, bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME#_nv_V4 : NVInst_V4<(outs),
-            (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC: $src4),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#mnemonic#"($src2+#$src3) = $src4.new",
-            []>,
-            Requires<[HasV4T]>;
-}
+let opNewValue = 2, opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
+class T_store_io_nv <string mnemonic, RegisterClass RC,
+                    Operand ImmOp, bits<2>MajOp>
+  : NVInst_V4 <(outs),
+               (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+  mnemonic#"($src1+#$src2) = $src3.new",
+  [],"",ST_tc_st_SLOT0> {
+    bits<5> src1;
+    bits<13> src2; // Actual address offset
+    bits<3> src3;
+    bits<11> offsetBits; // Represents offset encoding
+
+    let opExtentBits = !if (!eq(mnemonic, "memb"), 11,
+                       !if (!eq(mnemonic, "memh"), 12,
+                       !if (!eq(mnemonic, "memw"), 13, 0)));
 
-multiclass ST_Idxd_Pred_nv<string mnemonic, RegisterClass RC, Operand predImmOp,
-                           bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ST_Idxd_Pbase_nv<mnemonic, RC, predImmOp, PredNot, 0>;
-    // Predicate new
-    defm _cdn#NAME : ST_Idxd_Pbase_nv<mnemonic, RC, predImmOp, PredNot, 1>;
+    let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
+                        !if (!eq(mnemonic, "memh"), 1,
+                        !if (!eq(mnemonic, "memw"), 2, 0)));
+
+    let offsetBits = !if (!eq(mnemonic, "memb"),  src2{10-0},
+                     !if (!eq(mnemonic, "memh"),  src2{11-1},
+                     !if (!eq(mnemonic, "memw"),  src2{12-2}, 0)));
+
+    let IClass = 0b1010;
+
+    let Inst{27} = 0b0;
+    let Inst{26-25} = offsetBits{10-9};
+    let Inst{24-21} = 0b1101;
+    let Inst{20-16} = src1;
+    let Inst{13} = offsetBits{8};
+    let Inst{12-11} = MajOp;
+    let Inst{10-8} = src3;
+    let Inst{7-0} = offsetBits{7-0};
   }
-}
 
-let mayStore = 1, isNVStore = 1, neverHasSideEffects = 1, isExtendable = 1 in
+let opExtendable = 2, opNewValue = 3, isPredicated = 1 in
+class T_pstore_io_nv <string mnemonic, RegisterClass RC, Operand predImmOp,
+                         bits<2>MajOp, bit PredNot, bit isPredNew>
+  : NVInst_V4 <(outs),
+               (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC:$src4),
+  !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($src2+#$src3) = $src4.new",
+  [],"",V2LDST_tc_st_SLOT0> {
+    bits<2> src1;
+    bits<5> src2;
+    bits<9> src3;
+    bits<3> src4;
+    bits<6> offsetBits; // Represents offset encoding
+
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = PredNot;
+    let opExtentBits = !if (!eq(mnemonic, "memb"), 6,
+                       !if (!eq(mnemonic, "memh"), 7,
+                       !if (!eq(mnemonic, "memw"), 8, 0)));
+
+    let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
+                        !if (!eq(mnemonic, "memh"), 1,
+                        !if (!eq(mnemonic, "memw"), 2, 0)));
+
+    let offsetBits = !if (!eq(mnemonic, "memb"), src3{5-0},
+                     !if (!eq(mnemonic, "memh"), src3{6-1},
+                     !if (!eq(mnemonic, "memw"), src3{7-2}, 0)));
+
+    let IClass = 0b0100;
+
+    let Inst{27}    = 0b0;
+    let Inst{26}    = PredNot;
+    let Inst{25}    = isPredNew;
+    let Inst{24-21} = 0b0101;
+    let Inst{20-16} = src2;
+    let Inst{13}    = offsetBits{5};
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src4;
+    let Inst{7-3}   = offsetBits{4-0};
+    let Inst{2}     = 0b0;
+    let Inst{1-0}   = src1;
+  }
+
+// multiclass for new-value store instructions with base + immediate offset.
+//
+let mayStore = 1, isNVStore = 1, isNewValue = 1, hasSideEffects = 0,
+    isExtendable = 1 in
 multiclass ST_Idxd_nv<string mnemonic, string CextOp, RegisterClass RC,
-                   Operand ImmOp, Operand predImmOp, bits<5> ImmBits,
-                   bits<5> PredImmBits> {
+                   Operand ImmOp, Operand predImmOp, bits<2> MajOp> {
 
   let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
-    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
-    isPredicable = 1 in
-    def NAME#_nv_V4 : NVInst_V4<(outs),
-            (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
-            mnemonic#"($src1+#$src2) = $src3.new",
-            []>,
-            Requires<[HasV4T]>;
-
-    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits,
-    isPredicated = 1 in {
-      defm Pt : ST_Idxd_Pred_nv<mnemonic, RC, predImmOp, 0>;
-      defm NotPt : ST_Idxd_Pred_nv<mnemonic, RC, predImmOp, 1>;
-    }
+    def S2_#NAME#new_io : T_store_io_nv <mnemonic, RC, ImmOp, MajOp>;
+    // Predicated
+    def S2_p#NAME#newt_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 0, 0>;
+    def S2_p#NAME#newf_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 1, 0>;
+    // Predicated new
+    def S4_p#NAME#newtnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
+                                              MajOp, 0, 1>;
+    def S4_p#NAME#newfnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
+                                              MajOp, 1, 1>;
   }
 }
 
-let addrMode = BaseImmOffset, validSubTargets = HasV4SubT in {
+let addrMode = BaseImmOffset, InputType = "imm" in {
   let accessSize = ByteAccess in
-    defm STrib_indexed: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
-                                   u6_0Ext, 11, 6>, AddrModeRel;
+  defm storerb: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
+                           u6_0Ext, 0b00>, AddrModeRel;
 
-  let accessSize = HalfWordAccess in
-    defm STrih_indexed: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
-                                   u6_1Ext, 12, 7>, AddrModeRel;
+  let accessSize = HalfWordAccess, opExtentAlign = 1 in
+  defm storerh: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
+                           u6_1Ext, 0b01>, AddrModeRel;
 
-  let accessSize = WordAccess in
-    defm STriw_indexed: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
-                                   u6_2Ext, 13, 8>, AddrModeRel;
+  let accessSize = WordAccess, opExtentAlign = 2 in
+  defm storeri: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
+                           u6_2Ext, 0b10>, AddrModeRel;
 }
 
-// multiclass for new-value store instructions with base + immediate offset.
-// and MEMri operand.
-multiclass ST_MEMri_Pbase_nv<string mnemonic, RegisterClass RC, bit isNot,
-                          bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME#_nv_V4 : NVInst_V4<(outs),
-            (ins PredRegs:$src1, MEMri:$addr, RC: $src2),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#mnemonic#"($addr) = $src2.new",
-            []>,
-            Requires<[HasV4T]>;
-}
-
-multiclass ST_MEMri_Pred_nv<string mnemonic, RegisterClass RC, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ST_MEMri_Pbase_nv<mnemonic, RC, PredNot, 0>;
-
-    // Predicate new
-    defm _cdn#NAME : ST_MEMri_Pbase_nv<mnemonic, RC, PredNot, 1>;
-  }
-}
-
-let mayStore = 1, isNVStore = 1, isExtendable = 1, neverHasSideEffects = 1 in
-multiclass ST_MEMri_nv<string mnemonic, string CextOp, RegisterClass RC,
-                    bits<5> ImmBits, bits<5> PredImmBits> {
-
-  let CextOpcode = CextOp, BaseOpcode = CextOp in {
-    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
-         isPredicable = 1 in
-    def NAME#_nv_V4 : NVInst_V4<(outs),
-            (ins MEMri:$addr, RC:$src),
-            mnemonic#"($addr) = $src.new",
-            []>,
-            Requires<[HasV4T]>;
-
-    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits,
-        neverHasSideEffects = 1, isPredicated = 1 in {
-      defm Pt : ST_MEMri_Pred_nv<mnemonic, RC, 0>;
-      defm NotPt : ST_MEMri_Pred_nv<mnemonic, RC, 1>;
-    }
+//===----------------------------------------------------------------------===//
+// Post increment loads with register offset.
+//===----------------------------------------------------------------------===//
+
+let hasNewValue = 1 in
+def L2_loadbsw2_pr : T_load_pr <"membh", IntRegs, 0b0001, HalfWordAccess>;
+
+def L2_loadbsw4_pr : T_load_pr <"membh", DoubleRegs, 0b0111, WordAccess>;
+
+let hasSideEffects = 0, addrMode = PostInc in
+class T_loadalign_pr <string mnemonic, bits<4> MajOp, MemAccessSize AccessSz>
+  : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$_dst_),
+              (ins DoubleRegs:$src1, IntRegs:$src2, ModRegs:$src3),
+  "$dst = "#mnemonic#"($src2++$src3)", [],
+  "$src1 = $dst, $src2 = $_dst_"> {
+    bits<5> dst;
+    bits<5> src2;
+    bits<1> src3;
+
+    let accessSize = AccessSz;
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b110;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13}    = src3;
+    let Inst{12}    = 0b0;
+    let Inst{7}     = 0b0;
+    let Inst{4-0}   = dst;
   }
-}
 
-let addrMode = BaseImmOffset, isMEMri = "true", validSubTargets = HasV4SubT,
-mayStore = 1 in {
-  let accessSize = ByteAccess in
-    defm STrib: ST_MEMri_nv<"memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
+def L2_loadalignb_pr : T_loadalign_pr <"memb_fifo", 0b0100, ByteAccess>;
+def L2_loadalignh_pr : T_loadalign_pr <"memh_fifo", 0b0010, HalfWordAccess>;
 
-  let accessSize = HalfWordAccess in
-    defm STrih: ST_MEMri_nv<"memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
+//===----------------------------------------------------------------------===//
+// Template class for non-predicated post increment .new stores
+// mem[bhwd](Rx++#s4:[0123])=Nt.new
+//===----------------------------------------------------------------------===//
+let isPredicable = 1, hasSideEffects = 0, addrMode = PostInc, isNVStore = 1,
+    isNewValue = 1, opNewValue = 3 in
+class T_StorePI_nv <string mnemonic, Operand ImmOp, bits<2> MajOp >
+  : NVInstPI_V4 <(outs IntRegs:$_dst_),
+                 (ins IntRegs:$src1, ImmOp:$offset, IntRegs:$src2),
+  mnemonic#"($src1++#$offset) = $src2.new",
+  [], "$src1 = $_dst_">,
+  AddrModeRel {
+    bits<5> src1;
+    bits<3> src2;
+    bits<7> offset;
+    bits<4> offsetBits;
 
-  let accessSize = WordAccess in
-    defm STriw: ST_MEMri_nv<"memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
-}
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0}));
+    let IClass = 0b1010;
+
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = src1;
+    let Inst{13} = 0b0;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8} = src2;
+    let Inst{7} = 0b0;
+    let Inst{6-3} = offsetBits;
+    let Inst{1} = 0b0;
+  }
 
 //===----------------------------------------------------------------------===//
-// Post increment store
-// mem[bhwd](Rx++#s4:[0123])=Nt.new
+// Template class for predicated post increment .new stores
+// if([!]Pv[.new]) mem[bhwd](Rx++#s4:[0123])=Nt.new
 //===----------------------------------------------------------------------===//
+let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc, isNVStore = 1,
+    isNewValue = 1, opNewValue = 4 in
+class T_StorePI_nv_pred <string mnemonic, Operand ImmOp,
+                         bits<2> MajOp, bit isPredNot, bit isPredNew >
+  : NVInstPI_V4 <(outs IntRegs:$_dst_),
+                 (ins PredRegs:$src1, IntRegs:$src2,
+                      ImmOp:$offset, IntRegs:$src3),
+  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($src2++#$offset) = $src3.new",
+  [], "$src2 = $_dst_">,
+  AddrModeRel {
+    bits<2> src1;
+    bits<5> src2;
+    bits<3> src3;
+    bits<7> offset;
+    bits<4> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0}));
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isPredNot;
+
+    let IClass = 0b1010;
+
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = src2;
+    let Inst{13} = 0b1;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8} = src3;
+    let Inst{7} = isPredNew;
+    let Inst{6-3} = offsetBits;
+    let Inst{2} = isPredNot;
+    let Inst{1-0} = src1;
+  }
+
+multiclass ST_PostInc_Pred_nv<string mnemonic, Operand ImmOp,
+                              bits<2> MajOp, bit PredNot> {
+  def _pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 0>;
 
-multiclass ST_PostInc_Pbase_nv<string mnemonic, RegisterClass RC, Operand ImmOp,
-                            bit isNot, bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME#_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
-            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#mnemonic#"($src2++#$offset) = $src3.new",
-            [],
-            "$src2 = $dst">,
-            Requires<[HasV4T]>;
+  // Predicate new
+  def new_pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 1>;
 }
 
-multiclass ST_PostInc_Pred_nv<string mnemonic, RegisterClass RC,
-                           Operand ImmOp, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ST_PostInc_Pbase_nv<mnemonic, RC, ImmOp, PredNot, 0>;
-    // Predicate new
-    let Predicates = [HasV4T], validSubTargets = HasV4SubT in
-    defm _cdn#NAME : ST_PostInc_Pbase_nv<mnemonic, RC, ImmOp, PredNot, 1>;
+multiclass ST_PostInc_nv<string mnemonic, string BaseOp, Operand ImmOp,
+                         bits<2> MajOp> {
+  let BaseOpcode = "POST_"#BaseOp in {
+    def S2_#NAME#_pi : T_StorePI_nv <mnemonic, ImmOp, MajOp>;
+
+    // Predicated
+    defm S2_p#NAME#t : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 0>;
+    defm S2_p#NAME#f : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 1>;
   }
 }
 
-let hasCtrlDep = 1, isNVStore = 1, neverHasSideEffects = 1 in
-multiclass ST_PostInc_nv<string mnemonic, string BaseOp, RegisterClass RC,
-                      Operand ImmOp> {
+let accessSize = ByteAccess in
+defm storerbnew: ST_PostInc_nv <"memb", "STrib", s4_0Imm, 0b00>;
 
-  let BaseOpcode = "POST_"#BaseOp in {
-    let isPredicable = 1 in
-    def NAME#_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
-                (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
-                mnemonic#"($src1++#$offset) = $src2.new",
-                [],
-                "$src1 = $dst">,
-                Requires<[HasV4T]>;
-
-    let isPredicated = 1 in {
-      defm Pt : ST_PostInc_Pred_nv<mnemonic, RC, ImmOp, 0 >;
-      defm NotPt : ST_PostInc_Pred_nv<mnemonic, RC, ImmOp, 1 >;
-    }
+let accessSize = HalfWordAccess in
+defm storerhnew: ST_PostInc_nv <"memh", "STrih", s4_1Imm, 0b01>;
+
+let accessSize = WordAccess in
+defm storerinew: ST_PostInc_nv <"memw", "STriw", s4_2Imm, 0b10>;
+
+//===----------------------------------------------------------------------===//
+// Template class for post increment .new stores with register offset
+//===----------------------------------------------------------------------===//
+let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3 in
+class T_StorePI_RegNV <string mnemonic, bits<2> MajOp, MemAccessSize AccessSz>
+  : NVInstPI_V4 <(outs IntRegs:$_dst_),
+                 (ins IntRegs:$src1, ModRegs:$src2, IntRegs:$src3),
+  #mnemonic#"($src1++$src2) = $src3.new",
+  [], "$src1 = $_dst_"> {
+    bits<5> src1;
+    bits<1> src2;
+    bits<3> src3;
+    let accessSize = AccessSz;
+
+    let IClass = 0b1010;
+
+    let Inst{27-21} = 0b1101101;
+    let Inst{20-16} = src1;
+    let Inst{13}    = src2;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src3;
+    let Inst{7}     = 0b0;
   }
-}
 
-let addrMode = PostInc, validSubTargets = HasV4SubT in {
-defm POST_STbri: ST_PostInc_nv <"memb", "STrib", IntRegs, s4_0Imm>, AddrModeRel;
-defm POST_SThri: ST_PostInc_nv <"memh", "STrih", IntRegs, s4_1Imm>, AddrModeRel;
-defm POST_STwri: ST_PostInc_nv <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
-}
+def S2_storerbnew_pr : T_StorePI_RegNV<"memb", 0b00, ByteAccess>;
+def S2_storerhnew_pr : T_StorePI_RegNV<"memh", 0b01, HalfWordAccess>;
+def S2_storerinew_pr : T_StorePI_RegNV<"memw", 0b10, WordAccess>;
 
 // memb(Rx++#s4:0:circ(Mu))=Nt.new
 // memb(Rx++I:circ(Mu))=Nt.new
-// memb(Rx++Mu)=Nt.new
 // memb(Rx++Mu:brev)=Nt.new
 // memh(Rx++#s4:1:circ(Mu))=Nt.new
 // memh(Rx++I:circ(Mu))=Nt.new
@@ -1002,7 +1522,8 @@ defm POST_STwri: ST_PostInc_nv <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
 // operands.
 //===----------------------------------------------------------------------===//
 
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11 in
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
+    opExtentAlign = 2 in
 class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
                       bit isNegCond, bit isTak>
   : NVInst_V4<(outs),
@@ -1010,8 +1531,7 @@ class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
     "if ("#!if(isNegCond, "!","")#mnemonic#
     "($src1"#!if(!eq(NvOpNum, 0),".new, ",", ")#
     "$src2"#!if(!eq(NvOpNum, 1),".new))","))")#" jump:"
-    #!if(isTak, "t","nt")#" $offset",
-    []>, Requires<[HasV4T]> {
+    #!if(isTak, "t","nt")#" $offset", []> {
 
       bits<5> src1;
       bits<5> src2;
@@ -1020,14 +1540,14 @@ class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
       bits<11> offset;
 
       let isTaken = isTak;
-      let isBrTaken = !if(isTaken, "true", "false");
       let isPredicatedFalse = isNegCond;
+      let opNewValue{0} = NvOpNum;
 
       let Ns = !if(!eq(NvOpNum, 0), src1{2-0}, src2{2-0});
       let RegOp = !if(!eq(NvOpNum, 0), src2, src1);
 
       let IClass = 0b0010;
-      let Inst{26} = 0b0;
+      let Inst{27-26} = 0b00;
       let Inst{25-23} = majOp;
       let Inst{22} = isNegCond;
       let Inst{18-16} = Ns;
@@ -1041,9 +1561,9 @@ class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
 multiclass NVJrr_cond<string mnemonic, bits<3> majOp, bit NvOpNum,
                        bit isNegCond> {
   // Branch not taken:
-  def _nt_V4: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 0>;
+  def _nt: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 0>;
   // Branch taken:
-  def _t_V4: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 1>;
+  def _t : NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 1>;
 }
 
 // NvOpNum = 0 -> First Operand is a new-value Register
@@ -1052,8 +1572,8 @@ multiclass NVJrr_cond<string mnemonic, bits<3> majOp, bit NvOpNum,
 multiclass NVJrr_base<string mnemonic, string BaseOp, bits<3> majOp,
                        bit NvOpNum> {
   let BaseOpcode = BaseOp#_NVJ in {
-    defm _t_Jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 0>; // True cond
-    defm _f_Jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 1>; // False cond
+    defm _t_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 0>; // True cond
+    defm _f_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 1>; // False cond
   }
 }
 
@@ -1064,12 +1584,12 @@ multiclass NVJrr_base<string mnemonic, string BaseOp, bits<3> majOp,
 // if ([!]cmp.gtu(Rt,Ns.new)) jump:[n]t #r9:2
 
 let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
-  Defs = [PC], neverHasSideEffects = 1, validSubTargets = HasV4SubT in {
-  defm CMPEQrr  : NVJrr_base<"cmp.eq",  "CMPEQ",  0b000, 0>, PredRel;
-  defm CMPGTrr  : NVJrr_base<"cmp.gt",  "CMPGT",  0b001, 0>, PredRel;
-  defm CMPGTUrr : NVJrr_base<"cmp.gtu", "CMPGTU", 0b010, 0>, PredRel;
-  defm CMPLTrr  : NVJrr_base<"cmp.gt",  "CMPLT",  0b011, 1>, PredRel;
-  defm CMPLTUrr : NVJrr_base<"cmp.gtu", "CMPLTU", 0b100, 1>, PredRel;
+    Defs = [PC], hasSideEffects = 0 in {
+  defm J4_cmpeq  : NVJrr_base<"cmp.eq",  "CMPEQ",  0b000, 0>, PredRel;
+  defm J4_cmpgt  : NVJrr_base<"cmp.gt",  "CMPGT",  0b001, 0>, PredRel;
+  defm J4_cmpgtu : NVJrr_base<"cmp.gtu", "CMPGTU", 0b010, 0>, PredRel;
+  defm J4_cmplt  : NVJrr_base<"cmp.gt",  "CMPLT",  0b011, 1>, PredRel;
+  defm J4_cmpltu : NVJrr_base<"cmp.gtu", "CMPLTU", 0b100, 1>, PredRel;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1077,18 +1597,18 @@ let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
 // with a register and an unsigned immediate (U5) operand.
 //===----------------------------------------------------------------------===//
 
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11 in
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
+    opExtentAlign = 2 in
 class NVJri_template<string mnemonic, bits<3> majOp, bit isNegCond,
                          bit isTak>
   : NVInst_V4<(outs),
     (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset),
     "if ("#!if(isNegCond, "!","")#mnemonic#"($src1.new, #$src2)) jump:"
-    #!if(isTak, "t","nt")#" $offset",
-    []>, Requires<[HasV4T]> {
+    #!if(isTak, "t","nt")#" $offset", []> {
 
       let isTaken = isTak;
       let isPredicatedFalse = isNegCond;
-      let isBrTaken = !if(isTaken, "true", "false");
+      let isTaken = isTak;
 
       bits<3> src1;
       bits<5> src2;
@@ -1107,15 +1627,15 @@ class NVJri_template<string mnemonic, bits<3> majOp, bit isNegCond,
 
 multiclass NVJri_cond<string mnemonic, bits<3> majOp, bit isNegCond> {
   // Branch not taken:
-  def _nt_V4: NVJri_template<mnemonic, majOp, isNegCond, 0>;
+  def _nt: NVJri_template<mnemonic, majOp, isNegCond, 0>;
   // Branch taken:
-  def _t_V4: NVJri_template<mnemonic, majOp, isNegCond, 1>;
+  def _t : NVJri_template<mnemonic, majOp, isNegCond, 1>;
 }
 
 multiclass NVJri_base<string mnemonic, string BaseOp, bits<3> majOp> {
   let BaseOpcode = BaseOp#_NVJri in {
-    defm _t_Jumpnv : NVJri_cond<mnemonic, majOp, 0>; // True Cond
-    defm _f_Jumpnv : NVJri_cond<mnemonic, majOp, 1>; // False cond
+    defm _t_jumpnv : NVJri_cond<mnemonic, majOp, 0>; // True Cond
+    defm _f_jumpnv : NVJri_cond<mnemonic, majOp, 1>; // False cond
   }
 }
 
@@ -1124,10 +1644,10 @@ multiclass NVJri_base<string mnemonic, string BaseOp, bits<3> majOp> {
 // if ([!]cmp.gtu(Ns.new,#U5)) jump:[n]t #r9:2
 
 let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
-  Defs = [PC], neverHasSideEffects = 1, validSubTargets = HasV4SubT in {
-  defm CMPEQri  : NVJri_base<"cmp.eq", "CMPEQ", 0b000>, PredRel;
-  defm CMPGTri  : NVJri_base<"cmp.gt", "CMPGT", 0b001>, PredRel;
-  defm CMPGTUri : NVJri_base<"cmp.gtu", "CMPGTU", 0b010>, PredRel;
+    Defs = [PC], hasSideEffects = 0 in {
+  defm J4_cmpeqi  : NVJri_base<"cmp.eq", "CMPEQ", 0b000>, PredRel;
+  defm J4_cmpgti  : NVJri_base<"cmp.gt", "CMPGT", 0b001>, PredRel;
+  defm J4_cmpgtui : NVJri_base<"cmp.gtu", "CMPGTU", 0b010>, PredRel;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1135,19 +1655,19 @@ let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
 // with a register and an hardcoded 0/-1 immediate value.
 //===----------------------------------------------------------------------===//
 
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 11 in
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 11,
+    opExtentAlign = 2 in
 class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
                             bit isNegCond, bit isTak>
   : NVInst_V4<(outs),
     (ins IntRegs:$src1, brtarget:$offset),
     "if ("#!if(isNegCond, "!","")#mnemonic
     #"($src1.new, #"#ImmVal#")) jump:"
-    #!if(isTak, "t","nt")#" $offset",
-    []>, Requires<[HasV4T]> {
+    #!if(isTak, "t","nt")#" $offset", []> {
 
       let isTaken = isTak;
       let isPredicatedFalse = isNegCond;
-      let isBrTaken = !if(isTaken, "true", "false");
+      let isTaken = isTak;
 
       bits<3> src1;
       bits<11> offset;
@@ -1164,16 +1684,16 @@ class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
 multiclass NVJ_ConstImm_cond<string mnemonic, bits<3> majOp, string ImmVal,
                              bit isNegCond> {
   // Branch not taken:
-  def _nt_V4: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 0>;
+  def _nt: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 0>;
   // Branch taken:
-  def _t_V4: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 1>;
+  def _t : NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 1>;
 }
 
 multiclass NVJ_ConstImm_base<string mnemonic, string BaseOp, bits<3> majOp,
                              string ImmVal> {
   let BaseOpcode = BaseOp#_NVJ_ConstImm in {
-  defm _t_Jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 0>; // True cond
-  defm _f_Jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 1>; // False Cond
+    defm _t_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 0>; // True
+    defm _f_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 1>; // False
   }
 }
 
@@ -1182,51 +1702,194 @@ multiclass NVJ_ConstImm_base<string mnemonic, string BaseOp, bits<3> majOp,
 // if ([!]cmp.gt(Ns.new,#-1)) jump:[n]t #r9:2
 
 let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator=1,
-  Defs = [PC], neverHasSideEffects = 1 in {
-  defm TSTBIT0  : NVJ_ConstImm_base<"tstbit", "TSTBIT", 0b011, "0">, PredRel;
-  defm CMPEQn1  : NVJ_ConstImm_base<"cmp.eq", "CMPEQ",  0b100, "-1">, PredRel;
-  defm CMPGTn1  : NVJ_ConstImm_base<"cmp.gt", "CMPGT",  0b101, "-1">, PredRel;
+    Defs = [PC], hasSideEffects = 0 in {
+  defm J4_tstbit0 : NVJ_ConstImm_base<"tstbit", "TSTBIT", 0b011, "0">, PredRel;
+  defm J4_cmpeqn1 : NVJ_ConstImm_base<"cmp.eq", "CMPEQ",  0b100, "-1">, PredRel;
+  defm J4_cmpgtn1 : NVJ_ConstImm_base<"cmp.gt", "CMPGT",  0b101, "-1">, PredRel;
+}
+
+// J4_hintjumpr: Hint indirect conditional jump.
+let isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def J4_hintjumpr: JRInst <
+  (outs),
+  (ins IntRegs:$Rs),
+  "hintjr($Rs)"> {
+    bits<5> Rs;
+    let IClass = 0b0101;
+    let Inst{27-21} = 0b0010101;
+    let Inst{20-16} = Rs;
+  }
+
+//===----------------------------------------------------------------------===//
+// NV/J -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CR +
+//===----------------------------------------------------------------------===//
+
+// PC-relative add
+let hasNewValue = 1, isExtendable = 1, opExtendable = 1,
+    isExtentSigned = 0, opExtentBits = 6, hasSideEffects = 0, Uses = [PC] in
+def C4_addipc : CRInst <(outs IntRegs:$Rd), (ins u6Ext:$u6),
+  "$Rd = add(pc, #$u6)", [], "", CR_tc_2_SLOT3 > {
+    bits<5> Rd;
+    bits<6> u6;
+
+    let IClass = 0b0110;
+    let Inst{27-16} = 0b101001001001;
+    let Inst{12-7} = u6;
+    let Inst{4-0} = Rd;
+  }
+
+
+
+let hasSideEffects = 0 in
+class T_LOGICAL_3OP<string MnOp1, string MnOp2, bits<2> OpBits, bit IsNeg>
+    : CRInst<(outs PredRegs:$Pd),
+             (ins PredRegs:$Ps, PredRegs:$Pt, PredRegs:$Pu),
+             "$Pd = " # MnOp1 # "($Ps, " # MnOp2 # "($Pt, " #
+                   !if (IsNeg,"!","") # "$Pu))",
+             [], "", CR_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<2> Ps;
+  bits<2> Pt;
+  bits<2> Pu;
+
+  let IClass = 0b0110;
+  let Inst{27-24} = 0b1011;
+  let Inst{23} = IsNeg;
+  let Inst{22-21} = OpBits;
+  let Inst{20} = 0b1;
+  let Inst{17-16} = Ps;
+  let Inst{13} = 0b0;
+  let Inst{9-8} = Pt;
+  let Inst{7-6} = Pu;
+  let Inst{1-0} = Pd;
 }
 
+def C4_and_and  : T_LOGICAL_3OP<"and", "and", 0b00, 0>;
+def C4_and_or   : T_LOGICAL_3OP<"and", "or",  0b01, 0>;
+def C4_or_and   : T_LOGICAL_3OP<"or",  "and", 0b10, 0>;
+def C4_or_or    : T_LOGICAL_3OP<"or",  "or",  0b11, 0>;
+def C4_and_andn : T_LOGICAL_3OP<"and", "and", 0b00, 1>;
+def C4_and_orn  : T_LOGICAL_3OP<"and", "or",  0b01, 1>;
+def C4_or_andn  : T_LOGICAL_3OP<"or",  "and", 0b10, 1>;
+def C4_or_orn   : T_LOGICAL_3OP<"or",  "or",  0b11, 1>;
+
+// op(Ps, op(Pt, Pu))
+class LogLog_pat<SDNode Op1, SDNode Op2, InstHexagon MI>
+  : Pat<(i1 (Op1 I1:$Ps, (Op2 I1:$Pt, I1:$Pu))),
+        (MI I1:$Ps, I1:$Pt, I1:$Pu)>;
+
+// op(Ps, op(Pt, ~Pu))
+class LogLogNot_pat<SDNode Op1, SDNode Op2, InstHexagon MI>
+  : Pat<(i1 (Op1 I1:$Ps, (Op2 I1:$Pt, (not I1:$Pu)))),
+        (MI I1:$Ps, I1:$Pt, I1:$Pu)>;
+
+def: LogLog_pat<and, and, C4_and_and>;
+def: LogLog_pat<and, or,  C4_and_or>;
+def: LogLog_pat<or,  and, C4_or_and>;
+def: LogLog_pat<or,  or,  C4_or_or>;
+
+def: LogLogNot_pat<and, and, C4_and_andn>;
+def: LogLogNot_pat<and, or,  C4_and_orn>;
+def: LogLogNot_pat<or,  and, C4_or_andn>;
+def: LogLogNot_pat<or,  or,  C4_or_orn>;
+
+//===----------------------------------------------------------------------===//
+// CR -
+//===----------------------------------------------------------------------===//
+
 //===----------------------------------------------------------------------===//
 // XTYPE/ALU +
 //===----------------------------------------------------------------------===//
 
+// Logical with-not instructions.
+def A4_andnp : T_ALU64_logical<"and", 0b001, 1, 0, 1>;
+def A4_ornp  : T_ALU64_logical<"or",  0b011, 1, 0, 1>;
+
+def: Pat<(i64 (and (i64 DoubleRegs:$Rs), (i64 (not (i64 DoubleRegs:$Rt))))),
+         (A4_andnp DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+def: Pat<(i64 (or  (i64 DoubleRegs:$Rs), (i64 (not (i64 DoubleRegs:$Rt))))),
+         (A4_ornp DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+
+let hasNewValue = 1, hasSideEffects = 0 in
+def S4_parity: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0101111;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{4-0} = Rd;
+}
+
 //  Add and accumulate.
 //  Rd=add(Rs,add(Ru,#s6))
-let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 6,
-validSubTargets = HasV4SubT in
-def ADDr_ADDri_V4 : MInst<(outs IntRegs:$dst),
-          (ins IntRegs:$src1, IntRegs:$src2, s6Ext:$src3),
-          "$dst = add($src1, add($src2, #$src3))",
-          [(set (i32 IntRegs:$dst),
-           (add (i32 IntRegs:$src1), (add (i32 IntRegs:$src2),
-                                          s6_16ExtPred:$src3)))]>,
-          Requires<[HasV4T]>;
-
-//  Rd=add(Rs,sub(#s6,Ru))
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 6,
-validSubTargets = HasV4SubT in
-def ADDr_SUBri_V4 : MInst<(outs IntRegs:$dst),
-          (ins IntRegs:$src1, s6Ext:$src2, IntRegs:$src3),
-          "$dst = add($src1, sub(#$src2, $src3))",
-          [(set (i32 IntRegs:$dst),
-           (add (i32 IntRegs:$src1), (sub s6_10ExtPred:$src2,
-                                          (i32 IntRegs:$src3))))]>,
-          Requires<[HasV4T]>;
-
-// Generates the same instruction as ADDr_SUBri_V4 but matches different
-// pattern.
-//  Rd=add(Rs,sub(#s6,Ru))
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 6,
-validSubTargets = HasV4SubT in
-def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst),
-          (ins IntRegs:$src1, s6Ext:$src2, IntRegs:$src3),
-          "$dst = add($src1, sub(#$src2, $src3))",
-          [(set (i32 IntRegs:$dst),
-                (sub (add (i32 IntRegs:$src1), s6_10ExtPred:$src2),
-                     (i32 IntRegs:$src3)))]>,
-          Requires<[HasV4T]>;
+let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1, opExtentBits = 6,
+    opExtendable = 3 in
+def S4_addaddi : ALU64Inst <(outs IntRegs:$Rd),
+                            (ins IntRegs:$Rs, IntRegs:$Ru, s6Ext:$s6),
+  "$Rd = add($Rs, add($Ru, #$s6))" ,
+  [(set (i32 IntRegs:$Rd), (add (i32 IntRegs:$Rs),
+                           (add (i32 IntRegs:$Ru), s6_16ExtPred:$s6)))],
+  "", ALU64_tc_2_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<5> Ru;
+    bits<6> s6;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b10110;
+    let Inst{22-21} = s6{5-4};
+    let Inst{20-16} = Rs;
+    let Inst{13}    = s6{3};
+    let Inst{12-8}  = Rd;
+    let Inst{7-5}   = s6{2-0};
+    let Inst{4-0}   = Ru;
+  }
+
+let isExtentSigned = 1, hasSideEffects = 0, hasNewValue = 1, isExtendable = 1,
+    opExtentBits = 6, opExtendable = 2 in
+def S4_subaddi: ALU64Inst <(outs IntRegs:$Rd),
+                           (ins IntRegs:$Rs, s6Ext:$s6, IntRegs:$Ru),
+  "$Rd = add($Rs, sub(#$s6, $Ru))",
+  [], "", ALU64_tc_2_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<6> s6;
+    bits<5> Ru;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b10111;
+    let Inst{22-21} = s6{5-4};
+    let Inst{20-16} = Rs;
+    let Inst{13}    = s6{3};
+    let Inst{12-8}  = Rd;
+    let Inst{7-5}   = s6{2-0};
+    let Inst{4-0}   = Ru;
+  }
+
+// Rd=add(Rs,sub(#s6,Ru))
+def: Pat<(add (i32 IntRegs:$src1), (sub s6_10ExtPred:$src2,
+                                        (i32 IntRegs:$src3))),
+         (S4_subaddi IntRegs:$src1, s6_10ExtPred:$src2, IntRegs:$src3)>;
+
+// Rd=sub(add(Rs,#s6),Ru)
+def: Pat<(sub (add (i32 IntRegs:$src1), s6_10ExtPred:$src2),
+                   (i32 IntRegs:$src3)),
+         (S4_subaddi IntRegs:$src1, s6_10ExtPred:$src2, IntRegs:$src3)>;
+
+// Rd=add(sub(Rs,Ru),#s6)
+def: Pat<(add (sub (i32 IntRegs:$src1), (i32 IntRegs:$src3)),
+                   (s6_10ExtPred:$src2)),
+         (S4_subaddi IntRegs:$src1, s6_10ExtPred:$src2, IntRegs:$src3)>;
 
 
 //  Add or subtract doublewords with carry.
@@ -1235,213 +1898,316 @@ def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst),
 //TODO:
 //  Rdd=sub(Rss,Rtt,Px):carry
 
+// Extract bitfield
+// Rdd=extract(Rss,#u6,#U6)
+// Rdd=extract(Rss,Rtt)
+// Rd=extract(Rs,Rtt)
+// Rd=extract(Rs,#u5,#U5)
 
-//  Logical doublewords.
-//  Rdd=and(Rtt,~Rss)
-let validSubTargets = HasV4SubT in
-def ANDd_NOTd_V4 : MInst<(outs DoubleRegs:$dst),
-          (ins DoubleRegs:$src1, DoubleRegs:$src2),
-          "$dst = and($src1, ~$src2)",
-          [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1),
-                                      (not (i64 DoubleRegs:$src2))))]>,
-          Requires<[HasV4T]>;
-
-//  Rdd=or(Rtt,~Rss)
-let validSubTargets = HasV4SubT in
-def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst),
-          (ins DoubleRegs:$src1, DoubleRegs:$src2),
-          "$dst = or($src1, ~$src2)",
-          [(set (i64 DoubleRegs:$dst),
-           (or (i64 DoubleRegs:$src1), (not (i64 DoubleRegs:$src2))))]>,
-          Requires<[HasV4T]>;
-
-
-//  Logical-logical doublewords.
-//  Rxx^=xor(Rss,Rtt)
-let validSubTargets = HasV4SubT in
-def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst),
-          (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
-          "$dst ^= xor($src2, $src3)",
-          [(set (i64 DoubleRegs:$dst),
-           (xor (i64 DoubleRegs:$src1), (xor (i64 DoubleRegs:$src2),
-                                             (i64 DoubleRegs:$src3))))],
-          "$src1 = $dst">,
-          Requires<[HasV4T]>;
+def S4_extractp_rp : T_S3op_64 < "extract",  0b11, 0b100, 0>;
+def S4_extractp    : T_S2op_extract <"extract",  0b1010, DoubleRegs, u6Imm>;
 
+let hasNewValue = 1 in {
+  def S4_extract_rp : T_S3op_extract<"extract",  0b01>;
+  def S4_extract    : T_S2op_extract <"extract",  0b1101, IntRegs, u5Imm>;
+}
+
+// Complex add/sub halfwords/words
+let Defs = [USR_OVF] in {
+  def S4_vxaddsubh : T_S3op_64 < "vxaddsubh", 0b01, 0b100, 0, 1>;
+  def S4_vxaddsubw : T_S3op_64 < "vxaddsubw", 0b01, 0b000, 0, 1>;
+  def S4_vxsubaddh : T_S3op_64 < "vxsubaddh", 0b01, 0b110, 0, 1>;
+  def S4_vxsubaddw : T_S3op_64 < "vxsubaddw", 0b01, 0b010, 0, 1>;
+}
+
+let Defs = [USR_OVF] in {
+  def S4_vxaddsubhr : T_S3op_64 < "vxaddsubh", 0b11, 0b000, 0, 1, 1, 1>;
+  def S4_vxsubaddhr : T_S3op_64 < "vxsubaddh", 0b11, 0b010, 0, 1, 1, 1>;
+}
+
+let Itinerary = M_tc_3x_SLOT23, Defs = [USR_OVF] in {
+  def M4_mac_up_s1_sat: T_MType_acc_rr<"+= mpy", 0b011, 0b000, 0, [], 0, 1, 1>;
+  def M4_nac_up_s1_sat: T_MType_acc_rr<"-= mpy", 0b011, 0b001, 0, [], 0, 1, 1>;
+}
+
+// Logical xor with xor accumulation.
+// Rxx^=xor(Rss,Rtt)
+let hasSideEffects = 0 in
+def M4_xor_xacc
+  : SInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rxx ^= xor($Rss, $Rtt)",
+  [(set (i64 DoubleRegs:$Rxx),
+   (xor (i64 DoubleRegs:$dst2), (xor (i64 DoubleRegs:$Rss),
+                                     (i64 DoubleRegs:$Rtt))))],
+  "$dst2 = $Rxx", S_3op_tc_1_SLOT23> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-22} = 0b101010;
+    let Inst{20-16} = Rss;
+    let Inst{12-8}  = Rtt;
+    let Inst{7-5}   = 0b000;
+    let Inst{4-0}   = Rxx;
+  }
+
+// Rotate and reduce bytes
+// Rdd=vrcrotate(Rss,Rt,#u2)
+let hasSideEffects = 0 in
+def S4_vrcrotate
+  : SInst <(outs DoubleRegs:$Rdd),
+           (ins DoubleRegs:$Rss, IntRegs:$Rt, u2Imm:$u2),
+  "$Rdd = vrcrotate($Rss, $Rt, #$u2)",
+  [], "", S_3op_tc_3x_SLOT23> {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rt;
+    bits<2> u2;
+
+    let IClass = 0b1100;
+
+    let Inst{27-22} = 0b001111;
+    let Inst{20-16} = Rss;
+    let Inst{13}    = u2{1};
+    let Inst{12-8}  = Rt;
+    let Inst{7-6}   = 0b11;
+    let Inst{5}     = u2{0};
+    let Inst{4-0}   = Rdd;
+  }
+
+// Rotate and reduce bytes with accumulation
+// Rxx+=vrcrotate(Rss,Rt,#u2)
+let hasSideEffects = 0 in
+def S4_vrcrotate_acc
+  : SInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt, u2Imm:$u2),
+  "$Rxx += vrcrotate($Rss, $Rt, #$u2)", [],
+  "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rt;
+    bits<2> u2;
+
+    let IClass = 0b1100;
+
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = Rss;
+    let Inst{13}    = u2{1};
+    let Inst{12-8}  = Rt;
+    let Inst{5}     = u2{0};
+    let Inst{4-0}   = Rxx;
+  }
+
+// Vector reduce conditional negate halfwords
+let hasSideEffects = 0 in
+def S2_vrcnegh
+  : SInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt),
+  "$Rxx += vrcnegh($Rss, $Rt)", [],
+  "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-21} = 0b1011001;
+    let Inst{20-16} = Rss;
+    let Inst{13}    = 0b1;
+    let Inst{12-8}  = Rt;
+    let Inst{7-5}   = 0b111;
+    let Inst{4-0}   = Rxx;
+  }
+
+// Split bitfield
+def A4_bitspliti : T_S2op_2_di <"bitsplit", 0b110, 0b100>;
+
+// Arithmetic/Convergent round
+def A4_cround_ri : T_S2op_2_ii <"cround", 0b111, 0b000>;
+
+def A4_round_ri  : T_S2op_2_ii <"round", 0b111, 0b100>;
+
+let Defs = [USR_OVF] in
+def A4_round_ri_sat : T_S2op_2_ii <"round", 0b111, 0b110, 1>;
 
 // Logical-logical words.
-// Rx=or(Ru,and(Rx,#s10))
-let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10,
-validSubTargets = HasV4SubT in
-def ORr_ANDri_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3),
-            "$dst = or($src1, and($src2, #$src3))",
-            [(set (i32 IntRegs:$dst),
-                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
-                                                s10ExtPred:$src3)))],
-            "$src2 = $dst">,
-            Requires<[HasV4T]>;
+// Compound or-and -- Rx=or(Ru,and(Rx,#s10))
+let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1, opExtentBits = 10,
+    opExtendable = 3 in
+def S4_or_andix:
+  ALU64Inst<(outs IntRegs:$Rx),
+            (ins IntRegs:$Ru, IntRegs:$_src_, s10Ext:$s10),
+  "$Rx = or($Ru, and($_src_, #$s10))" ,
+  [(set (i32 IntRegs:$Rx),
+        (or (i32 IntRegs:$Ru), (and (i32 IntRegs:$_src_), s10ExtPred:$s10)))] ,
+  "$_src_ = $Rx", ALU64_tc_2_SLOT23> {
+    bits<5> Rx;
+    bits<5> Ru;
+    bits<10> s10;
+
+    let IClass = 0b1101;
+
+    let Inst{27-22} = 0b101001;
+    let Inst{20-16} = Rx;
+    let Inst{21}    = s10{9};
+    let Inst{13-5}  = s10{8-0};
+    let Inst{4-0}   = Ru;
+  }
+
+// Miscellaneous ALU64 instructions.
+//
+let hasNewValue = 1, hasSideEffects = 0 in
+def A4_modwrapu: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = modwrap($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0011111;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{7-5} = 0b111;
+  let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0 in
+def A4_bitsplit: ALU64Inst<(outs DoubleRegs:$Rd),
+      (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = bitsplit($Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b0100;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0 in
+def dep_S2_packhl: ALU64Inst<(outs DoubleRegs:$Rd),
+      (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = packhl($Rs, $Rt):deprecated", [], "", ALU64_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b0100;
+  let Inst{21} = 0b0;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{4-0} = Rd;
+}
+
+let hasNewValue = 1, hasSideEffects = 0 in
+def dep_A2_addsat: ALU64Inst<(outs IntRegs:$Rd),
+      (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = add($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0101100;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{7} = 0b0;
+  let Inst{4-0} = Rd;
+}
+
+let hasNewValue = 1, hasSideEffects = 0 in
+def dep_A2_subsat: ALU64Inst<(outs IntRegs:$Rd),
+      (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = sub($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0101100;
+  let Inst{20-16} = Rt;
+  let Inst{12-8} = Rs;
+  let Inst{7} = 0b1;
+  let Inst{4-0} = Rd;
+}
+
+// Rx[&|]=xor(Rs,Rt)
+def M4_or_xor   : T_MType_acc_rr < "|= xor", 0b110, 0b001, 0>;
+def M4_and_xor  : T_MType_acc_rr < "&= xor", 0b010, 0b010, 0>;
+
+// Rx[&|^]=or(Rs,Rt)
+def M4_xor_or   : T_MType_acc_rr < "^= or",  0b110, 0b011, 0>;
+
+let CextOpcode = "ORr_ORr" in
+def M4_or_or    : T_MType_acc_rr < "|= or",  0b110, 0b000, 0>;
+def M4_and_or   : T_MType_acc_rr < "&= or",  0b010, 0b001, 0>;
 
 // Rx[&|^]=and(Rs,Rt)
-// Rx&=and(Rs,Rt)
-let validSubTargets = HasV4SubT in
-def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst &= and($src2, $src3)",
-            [(set (i32 IntRegs:$dst),
-                  (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
-                                                 (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
-
-// Rx|=and(Rs,Rt)
-let validSubTargets = HasV4SubT, CextOpcode = "ORr_ANDr", InputType = "reg" in
-def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst |= and($src2, $src3)",
-            [(set (i32 IntRegs:$dst),
-                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
-                                                (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>, ImmRegRel;
-
-// Rx^=and(Rs,Rt)
-let validSubTargets = HasV4SubT in
-def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst ^= and($src2, $src3)",
-            [(set (i32 IntRegs:$dst),
-             (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
-                                            (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
+def M4_xor_and  : T_MType_acc_rr < "^= and", 0b110, 0b010, 0>;
+
+let CextOpcode = "ORr_ANDr" in
+def M4_or_and   : T_MType_acc_rr < "|= and", 0b010, 0b011, 0>;
+def M4_and_and  : T_MType_acc_rr < "&= and", 0b010, 0b000, 0>;
 
 // Rx[&|^]=and(Rs,~Rt)
-// Rx&=and(Rs,~Rt)
-let validSubTargets = HasV4SubT in
-def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst &= and($src2, ~$src3)",
-            [(set (i32 IntRegs:$dst),
-                  (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
-                                                 (not (i32 IntRegs:$src3)))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
-
-// Rx|=and(Rs,~Rt)
-let validSubTargets = HasV4SubT in
-def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst |= and($src2, ~$src3)",
-            [(set (i32 IntRegs:$dst),
-             (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
-                                           (not (i32 IntRegs:$src3)))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
-
-// Rx^=and(Rs,~Rt)
-let validSubTargets = HasV4SubT in
-def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst ^= and($src2, ~$src3)",
-            [(set (i32 IntRegs:$dst),
-             (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
-                                            (not (i32 IntRegs:$src3)))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
+def M4_xor_andn : T_MType_acc_rr < "^= and", 0b001, 0b010, 0, [], 1>;
+def M4_or_andn  : T_MType_acc_rr < "|= and", 0b001, 0b000, 0, [], 1>;
+def M4_and_andn : T_MType_acc_rr < "&= and", 0b001, 0b001, 0, [], 1>;
+
+def: T_MType_acc_pat2 <M4_or_xor, xor, or>;
+def: T_MType_acc_pat2 <M4_and_xor, xor, and>;
+def: T_MType_acc_pat2 <M4_or_and, and, or>;
+def: T_MType_acc_pat2 <M4_and_and, and, and>;
+def: T_MType_acc_pat2 <M4_xor_and, and, xor>;
+def: T_MType_acc_pat2 <M4_or_or, or, or>;
+def: T_MType_acc_pat2 <M4_and_or, or, and>;
+def: T_MType_acc_pat2 <M4_xor_or, or, xor>;
+
+class T_MType_acc_pat3 <InstHexagon MI, SDNode firstOp, SDNode secOp>
+  : Pat <(i32 (secOp IntRegs:$src1, (firstOp IntRegs:$src2,
+                                              (not IntRegs:$src3)))),
+         (i32 (MI IntRegs:$src1, IntRegs:$src2, IntRegs:$src3))>;
+
+def: T_MType_acc_pat3 <M4_or_andn, and, or>;
+def: T_MType_acc_pat3 <M4_and_andn, and, and>;
+def: T_MType_acc_pat3 <M4_xor_andn, and, xor>;
+
+// Compound or-or and or-and
+let isExtentSigned = 1, InputType = "imm", hasNewValue = 1, isExtendable = 1,
+    opExtentBits = 10, opExtendable = 3 in
+class T_CompOR <string mnemonic, bits<2> MajOp, SDNode OpNode>
+  : MInst_acc <(outs IntRegs:$Rx),
+               (ins IntRegs:$src1, IntRegs:$Rs, s10Ext:$s10),
+  "$Rx |= "#mnemonic#"($Rs, #$s10)",
+  [(set (i32 IntRegs:$Rx), (or (i32 IntRegs:$src1),
+                           (OpNode (i32 IntRegs:$Rs), s10ExtPred:$s10)))],
+  "$src1 = $Rx", ALU64_tc_2_SLOT23>, ImmRegRel {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<10> s10;
+
+    let IClass = 0b1101;
+
+    let Inst{27-24} = 0b1010;
+    let Inst{23-22} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{21}    = s10{9};
+    let Inst{13-5}  = s10{8-0};
+    let Inst{4-0}   = Rx;
+  }
 
-// Rx[&|^]=or(Rs,Rt)
-// Rx&=or(Rs,Rt)
-let validSubTargets = HasV4SubT in
-def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst &= or($src2, $src3)",
-            [(set (i32 IntRegs:$dst),
-                  (and (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
-                                                (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
-
-// Rx|=or(Rs,Rt)
-let validSubTargets = HasV4SubT, CextOpcode = "ORr_ORr", InputType = "reg" in
-def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst |= or($src2, $src3)",
-            [(set (i32 IntRegs:$dst),
-                  (or (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
-                                               (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>, ImmRegRel;
-
-// Rx^=or(Rs,Rt)
-let validSubTargets = HasV4SubT in
-def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst ^= or($src2, $src3)",
-            [(set (i32 IntRegs:$dst),
-             (xor (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
-                                           (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
-
-// Rx[&|^]=xor(Rs,Rt)
-// Rx&=xor(Rs,Rt)
-let validSubTargets = HasV4SubT in
-def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst &= xor($src2, $src3)",
-            [(set (i32 IntRegs:$dst),
-                  (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
-                                                 (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
-
-// Rx|=xor(Rs,Rt)
-let validSubTargets = HasV4SubT in
-def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst |= xor($src2, $src3)",
-            [(set (i32 IntRegs:$dst),
-                  (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
-                                                 (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
-
-// Rx^=xor(Rs,Rt)
-let validSubTargets = HasV4SubT in
-def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
-            "$dst ^= xor($src2, $src3)",
-            [(set (i32 IntRegs:$dst),
-             (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
-                                            (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
-
-// Rx|=and(Rs,#s10)
-let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10,
-validSubTargets = HasV4SubT, CextOpcode = "ORr_ANDr", InputType = "imm" in
-def ORr_ANDri2_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3),
-            "$dst |= and($src2, #$src3)",
-            [(set (i32 IntRegs:$dst),
-                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
-                                                s10ExtPred:$src3)))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>, ImmRegRel;
-
-// Rx|=or(Rs,#s10)
-let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10,
-validSubTargets = HasV4SubT, CextOpcode = "ORr_ORr", InputType = "imm" in
-def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3),
-            "$dst |= or($src2, #$src3)",
-            [(set (i32 IntRegs:$dst),
-                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
-                                                s10ExtPred:$src3)))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>, ImmRegRel;
+let CextOpcode = "ORr_ANDr" in
+def S4_or_andi : T_CompOR <"and", 0b00, and>;
 
+let CextOpcode = "ORr_ORr" in
+def S4_or_ori : T_CompOR <"or", 0b10, or>;
 
 //    Modulo wrap
 //        Rd=modwrap(Rs,Rt)
@@ -1480,269 +2246,483 @@ def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst),
 // XTYPE/ALU -
 //===----------------------------------------------------------------------===//
 
+//===----------------------------------------------------------------------===//
+// XTYPE/BIT +
+//===----------------------------------------------------------------------===//
+
+// Bit reverse
+def S2_brevp : T_S2op_3 <"brev", 0b11, 0b110>;
+
+// Bit count
+def S2_ct0p : T_COUNT_LEADING_64<"ct0", 0b111, 0b010>;
+def S2_ct1p : T_COUNT_LEADING_64<"ct1", 0b111, 0b100>;
+def S4_clbpnorm : T_COUNT_LEADING_64<"normamt", 0b011, 0b000>;
+
+def: Pat<(i32 (trunc (cttz (i64 DoubleRegs:$Rss)))),
+         (S2_ct0p (i64 DoubleRegs:$Rss))>;
+def: Pat<(i32 (trunc (cttz (not (i64 DoubleRegs:$Rss))))),
+         (S2_ct1p (i64 DoubleRegs:$Rss))>;
+
+let hasSideEffects = 0, hasNewValue = 1 in
+def S4_clbaddi : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s6Imm:$s6),
+    "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
+  bits<5> Rs;
+  bits<5> Rd;
+  bits<6> s6;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b1100;
+  let Inst{23-21} = 0b001;
+  let Inst{20-16} = Rs;
+  let Inst{13-8} = s6;
+  let Inst{7-5} = 0b000;
+  let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0, hasNewValue = 1 in
+def S4_clbpaddi : SInst<(outs IntRegs:$Rd), (ins DoubleRegs:$Rs, s6Imm:$s6),
+    "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
+  bits<5> Rs;
+  bits<5> Rd;
+  bits<6> s6;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b1000;
+  let Inst{23-21} = 0b011;
+  let Inst{20-16} = Rs;
+  let Inst{13-8} = s6;
+  let Inst{7-5} = 0b010;
+  let Inst{4-0} = Rd;
+}
+
+
+// Bit test/set/clear
+def S4_ntstbit_i : T_TEST_BIT_IMM<"!tstbit", 0b001>;
+def S4_ntstbit_r : T_TEST_BIT_REG<"!tstbit", 1>;
+
+let AddedComplexity = 20 in {   // Complexity greater than cmp reg-imm.
+  def: Pat<(i1 (seteq (and (shl 1, u5ImmPred:$u5), (i32 IntRegs:$Rs)), 0)),
+           (S4_ntstbit_i (i32 IntRegs:$Rs), u5ImmPred:$u5)>;
+  def: Pat<(i1 (seteq (and (shl 1, (i32 IntRegs:$Rt)), (i32 IntRegs:$Rs)), 0)),
+           (S4_ntstbit_r (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))>;
+}
+
+// Add extra complexity to prefer these instructions over bitsset/bitsclr.
+// The reason is that tstbit/ntstbit can be folded into a compound instruction:
+//   if ([!]tstbit(...)) jump ...
+let AddedComplexity = 100 in
+def: Pat<(i1 (setne (and (i32 IntRegs:$Rs), (i32 Set5ImmPred:$u5)), (i32 0))),
+         (S2_tstbit_i (i32 IntRegs:$Rs), (BITPOS32 Set5ImmPred:$u5))>;
+
+let AddedComplexity = 100 in
+def: Pat<(i1 (seteq (and (i32 IntRegs:$Rs), (i32 Set5ImmPred:$u5)), (i32 0))),
+         (S4_ntstbit_i (i32 IntRegs:$Rs), (BITPOS32 Set5ImmPred:$u5))>;
+
+def C4_nbitsset  : T_TEST_BITS_REG<"!bitsset", 0b01, 1>;
+def C4_nbitsclr  : T_TEST_BITS_REG<"!bitsclr", 0b10, 1>;
+def C4_nbitsclri : T_TEST_BITS_IMM<"!bitsclr", 0b10, 1>;
+
+// Do not increase complexity of these patterns. In the DAG, "cmp i8" may be
+// represented as a compare against "value & 0xFF", which is an exact match
+// for cmpb (same for cmph). The patterns below do not contain any additional
+// complexity that would make them preferable, and if they were actually used
+// instead of cmpb/cmph, they would result in a compare against register that
+// is loaded with the byte/half mask (i.e. 0xFF or 0xFFFF).
+def: Pat<(i1 (setne (and I32:$Rs, u6ImmPred:$u6), 0)),
+         (C4_nbitsclri I32:$Rs, u6ImmPred:$u6)>;
+def: Pat<(i1 (setne (and I32:$Rs, I32:$Rt), 0)),
+         (C4_nbitsclr I32:$Rs, I32:$Rt)>;
+def: Pat<(i1 (setne (and I32:$Rs, I32:$Rt), I32:$Rt)),
+         (C4_nbitsset I32:$Rs, I32:$Rt)>;
+
+//===----------------------------------------------------------------------===//
+// XTYPE/BIT -
+//===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
 // XTYPE/MPY +
 //===----------------------------------------------------------------------===//
 
-// Multiply and user lower result.
-// Rd=add(#u6,mpyi(Rs,#U6))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 6,
-validSubTargets = HasV4SubT in
-def ADDi_MPYri_V4 : MInst<(outs IntRegs:$dst),
-            (ins u6Ext:$src1, IntRegs:$src2, u6Imm:$src3),
-            "$dst = add(#$src1, mpyi($src2, #$src3))",
-            [(set (i32 IntRegs:$dst),
-                  (add (mul (i32 IntRegs:$src2), u6ImmPred:$src3),
-                       u6ExtPred:$src1))]>,
-            Requires<[HasV4T]>;
+// Rd=add(#u6,mpyi(Rs,#U6)) -- Multiply by immed and add immed.
+
+let hasNewValue = 1, isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
+def M4_mpyri_addi : MInst<(outs IntRegs:$Rd),
+  (ins u6Ext:$u6, IntRegs:$Rs, u6Imm:$U6),
+  "$Rd = add(#$u6, mpyi($Rs, #$U6))" ,
+  [(set (i32 IntRegs:$Rd),
+        (add (mul (i32 IntRegs:$Rs), u6ImmPred:$U6),
+             u6ExtPred:$u6))] ,"",ALU64_tc_3x_SLOT23> {
+    bits<5> Rd;
+    bits<6> u6;
+    bits<5> Rs;
+    bits<6> U6;
+
+    let IClass = 0b1101;
+
+    let Inst{27-24} = 0b1000;
+    let Inst{23}    = U6{5};
+    let Inst{22-21} = u6{5-4};
+    let Inst{20-16} = Rs;
+    let Inst{13}    = u6{3};
+    let Inst{12-8}  = Rd;
+    let Inst{7-5}   = u6{2-0};
+    let Inst{4-0}   = U6{4-0};
+  }
+
+// Rd=add(#u6,mpyi(Rs,Rt))
+let CextOpcode = "ADD_MPY", InputType = "imm", hasNewValue = 1,
+    isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
+def M4_mpyrr_addi : MInst <(outs IntRegs:$Rd),
+  (ins u6Ext:$u6, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rd = add(#$u6, mpyi($Rs, $Rt))" ,
+  [(set (i32 IntRegs:$Rd),
+        (add (mul (i32 IntRegs:$Rs), (i32 IntRegs:$Rt)), u6ExtPred:$u6))],
+  "", ALU64_tc_3x_SLOT23>, ImmRegRel {
+    bits<5> Rd;
+    bits<6> u6;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b01110;
+    let Inst{22-21} = u6{5-4};
+    let Inst{20-16} = Rs;
+    let Inst{13}    = u6{3};
+    let Inst{12-8}  = Rt;
+    let Inst{7-5}   = u6{2-0};
+    let Inst{4-0}   = Rd;
+  }
+
+let hasNewValue = 1 in
+class T_AddMpy <bit MajOp, PatLeaf ImmPred, dag ins>
+  : ALU64Inst <(outs IntRegs:$dst), ins,
+  "$dst = add($src1, mpyi("#!if(MajOp,"$src3, #$src2))",
+                                      "#$src2, $src3))"),
+  [(set (i32 IntRegs:$dst),
+        (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src3), ImmPred:$src2)))],
+  "", ALU64_tc_3x_SLOT23> {
+    bits<5> dst;
+    bits<5> src1;
+    bits<8> src2;
+    bits<5> src3;
+
+    let IClass = 0b1101;
+
+    bits<6> ImmValue = !if(MajOp, src2{5-0}, src2{7-2});
+
+    let Inst{27-24} = 0b1111;
+    let Inst{23}    = MajOp;
+    let Inst{22-21} = ImmValue{5-4};
+    let Inst{20-16} = src3;
+    let Inst{13}    = ImmValue{3};
+    let Inst{12-8}  = dst;
+    let Inst{7-5}   = ImmValue{2-0};
+    let Inst{4-0}   = src1;
+  }
+
+def M4_mpyri_addr_u2 : T_AddMpy<0b0, u6_2ImmPred,
+                       (ins IntRegs:$src1, u6_2Imm:$src2, IntRegs:$src3)>;
+
+let isExtendable = 1, opExtentBits = 6, opExtendable = 3,
+    CextOpcode = "ADD_MPY", InputType = "imm" in
+def M4_mpyri_addr : T_AddMpy<0b1, u6ExtPred,
+                    (ins IntRegs:$src1, IntRegs:$src3, u6Ext:$src2)>, ImmRegRel;
+
+// Rx=add(Ru,mpyi(Rx,Rs))
+let CextOpcode = "ADD_MPY", InputType = "reg", hasNewValue = 1 in
+def M4_mpyrr_addr: MInst_acc <(outs IntRegs:$Rx),
+                              (ins IntRegs:$Ru, IntRegs:$_src_, IntRegs:$Rs),
+  "$Rx = add($Ru, mpyi($_src_, $Rs))",
+  [(set (i32 IntRegs:$Rx), (add (i32 IntRegs:$Ru),
+                           (mul (i32 IntRegs:$_src_), (i32 IntRegs:$Rs))))],
+  "$_src_ = $Rx", M_tc_3x_SLOT23>, ImmRegRel {
+    bits<5> Rx;
+    bits<5> Ru;
+    bits<5> Rs;
+
+    let IClass = 0b1110;
+
+    let Inst{27-21} = 0b0011000;
+    let Inst{12-8} = Rx;
+    let Inst{4-0} = Ru;
+    let Inst{20-16} = Rs;
+  }
 
 // Rd=add(##,mpyi(Rs,#U6))
 def : Pat <(add (mul (i32 IntRegs:$src2), u6ImmPred:$src3),
                      (HexagonCONST32 tglobaladdr:$src1)),
-           (i32 (ADDi_MPYri_V4 tglobaladdr:$src1, IntRegs:$src2,
+           (i32 (M4_mpyri_addi tglobaladdr:$src1, IntRegs:$src2,
                                u6ImmPred:$src3))>;
 
-// Rd=add(#u6,mpyi(Rs,Rt))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 6,
-validSubTargets = HasV4SubT, InputType = "imm", CextOpcode = "ADD_MPY" in
-def ADDi_MPYrr_V4 : MInst<(outs IntRegs:$dst),
-            (ins u6Ext:$src1, IntRegs:$src2, IntRegs:$src3),
-            "$dst = add(#$src1, mpyi($src2, $src3))",
-            [(set (i32 IntRegs:$dst),
-                  (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
-                       u6ExtPred:$src1))]>,
-            Requires<[HasV4T]>, ImmRegRel;
-
 // Rd=add(##,mpyi(Rs,Rt))
 def : Pat <(add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
                      (HexagonCONST32 tglobaladdr:$src1)),
-           (i32 (ADDi_MPYrr_V4 tglobaladdr:$src1, IntRegs:$src2,
+           (i32 (M4_mpyrr_addi tglobaladdr:$src1, IntRegs:$src2,
                                IntRegs:$src3))>;
 
-// Rd=add(Ru,mpyi(#u6:2,Rs))
-let validSubTargets = HasV4SubT in
-def ADDr_MPYir_V4 : MInst<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, u6Imm:$src2, IntRegs:$src3),
-            "$dst = add($src1, mpyi(#$src2, $src3))",
-            [(set (i32 IntRegs:$dst),
-             (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src3),
-                                            u6_2ImmPred:$src2)))]>,
-            Requires<[HasV4T]>;
-
-// Rd=add(Ru,mpyi(Rs,#u6))
-let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 6,
-validSubTargets = HasV4SubT, InputType = "imm", CextOpcode = "ADD_MPY" in
-def ADDr_MPYri_V4 : MInst<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2, u6Ext:$src3),
-            "$dst = add($src1, mpyi($src2, #$src3))",
-            [(set (i32 IntRegs:$dst),
-                  (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
-                                                 u6ExtPred:$src3)))]>,
-            Requires<[HasV4T]>, ImmRegRel;
+// Vector reduce multiply word by signed half (32x16)
+//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
+def M4_vrmpyeh_s0 : T_M2_vmpy<"vrmpyweh", 0b010, 0b100, 0, 0, 0>;
+def M4_vrmpyeh_s1 : T_M2_vmpy<"vrmpyweh", 0b110, 0b100, 1, 0, 0>;
 
-// Rx=add(Ru,mpyi(Rx,Rs))
-let validSubTargets = HasV4SubT, InputType = "reg", CextOpcode = "ADD_MPY" in
-def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
-            "$dst = add($src1, mpyi($src2, $src3))",
-            [(set (i32 IntRegs:$dst),
-             (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
-                                            (i32 IntRegs:$src3))))],
-            "$src2 = $dst">,
-            Requires<[HasV4T]>, ImmRegRel;
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def M4_vrmpyoh_s0 : T_M2_vmpy<"vrmpywoh", 0b001, 0b010, 0, 0, 0>;
+def M4_vrmpyoh_s1 : T_M2_vmpy<"vrmpywoh", 0b101, 0b010, 1, 0, 0>;
 
+//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
+def M4_vrmpyeh_acc_s0: T_M2_vmpy_acc<"vrmpyweh", 0b001, 0b110, 0, 0>;
+def M4_vrmpyeh_acc_s1: T_M2_vmpy_acc<"vrmpyweh", 0b101, 0b110, 1, 0>;
 
-// Polynomial multiply words
-// Rdd=pmpyw(Rs,Rt)
-// Rxx^=pmpyw(Rs,Rt)
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def M4_vrmpyoh_acc_s0: T_M2_vmpy_acc<"vrmpywoh", 0b011, 0b110, 0, 0>;
+def M4_vrmpyoh_acc_s1: T_M2_vmpy_acc<"vrmpywoh", 0b111, 0b110, 1, 0>;
 
-// Vector reduce multiply word by signed half (32x16)
-// Rdd=vrmpyweh(Rss,Rtt)[:<<1]
-// Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-// Rxx+=vrmpyweh(Rss,Rtt)[:<<1]
-// Rxx+=vrmpywoh(Rss,Rtt)[:<<1]
-
-// Multiply and use upper result
-// Rd=mpy(Rs,Rt.H):<<1:sat
-// Rd=mpy(Rs,Rt.L):<<1:sat
-// Rd=mpy(Rs,Rt):<<1
-// Rd=mpy(Rs,Rt):<<1:sat
-// Rd=mpysu(Rs,Rt)
-// Rx+=mpy(Rs,Rt):<<1:sat
-// Rx-=mpy(Rs,Rt):<<1:sat
-
-// Vector multiply bytes
-// Rdd=vmpybsu(Rs,Rt)
-// Rdd=vmpybu(Rs,Rt)
-// Rxx+=vmpybsu(Rs,Rt)
-// Rxx+=vmpybu(Rs,Rt)
+// Vector multiply halfwords, signed by unsigned
+// Rdd=vmpyhsu(Rs,Rt)[:<<]:sat
+def M2_vmpy2su_s0 : T_XTYPE_mpy64 < "vmpyhsu", 0b000, 0b111, 1, 0, 0>;
+def M2_vmpy2su_s1 : T_XTYPE_mpy64 < "vmpyhsu", 0b100, 0b111, 1, 1, 0>;
+
+// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
+def M2_vmac2su_s0 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b011, 0b101, 1, 0, 0>;
+def M2_vmac2su_s1 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b111, 0b101, 1, 1, 0>;
 
 // Vector polynomial multiply halfwords
 // Rdd=vpmpyh(Rs,Rt)
+def M4_vpmpyh : T_XTYPE_mpy64 < "vpmpyh", 0b110, 0b111, 0, 0, 0>;
+
 // Rxx^=vpmpyh(Rs,Rt)
+def M4_vpmpyh_acc : T_XTYPE_mpy64_acc < "vpmpyh", "^", 0b101, 0b111, 0, 0, 0>;
+
+// Polynomial multiply words
+// Rdd=pmpyw(Rs,Rt)
+def M4_pmpyw : T_XTYPE_mpy64 < "pmpyw", 0b010, 0b111, 0, 0, 0>;
+
+// Rxx^=pmpyw(Rs,Rt)
+def M4_pmpyw_acc  : T_XTYPE_mpy64_acc < "pmpyw", "^", 0b001, 0b111, 0, 0, 0>;
 
 //===----------------------------------------------------------------------===//
 // XTYPE/MPY -
 //===----------------------------------------------------------------------===//
 
+//===----------------------------------------------------------------------===//
+// ALU64/Vector compare
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Template class for vector compare
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in
+class T_vcmpImm <string Str, bits<2> cmpOp, bits<2> minOp, Operand ImmOprnd>
+  : ALU64_rr <(outs PredRegs:$Pd),
+              (ins DoubleRegs:$Rss, ImmOprnd:$Imm),
+  "$Pd = "#Str#"($Rss, #$Imm)",
+  [], "", ALU64_tc_2early_SLOT23> {
+    bits<2> Pd;
+    bits<5> Rss;
+    bits<32> Imm;
+    bits<8> ImmBits;
+    let ImmBits{6-0} = Imm{6-0};
+    let ImmBits{7} = !if (!eq(cmpOp,0b10), 0b0, Imm{7}); // 0 for vcmp[bhw].gtu
+
+    let IClass = 0b1101;
+
+    let Inst{27-24} = 0b1100;
+    let Inst{22-21} = cmpOp;
+    let Inst{20-16} = Rss;
+    let Inst{12-5} = ImmBits;
+    let Inst{4-3} = minOp;
+    let Inst{1-0} = Pd;
+  }
+
+// Vector compare bytes
+def A4_vcmpbgt   : T_vcmp <"vcmpb.gt", 0b1010>;
+def: T_vcmp_pat<A4_vcmpbgt, setgt, v8i8>;
+
+let AsmString = "$Pd = any8(vcmpb.eq($Rss, $Rtt))" in
+def A4_vcmpbeq_any : T_vcmp <"any8(vcmpb.gt", 0b1000>;
+
+def A4_vcmpbeqi  : T_vcmpImm <"vcmpb.eq",  0b00, 0b00, u8Imm>;
+def A4_vcmpbgti  : T_vcmpImm <"vcmpb.gt",  0b01, 0b00, s8Imm>;
+def A4_vcmpbgtui : T_vcmpImm <"vcmpb.gtu", 0b10, 0b00, u7Imm>;
+
+// Vector compare halfwords
+def A4_vcmpheqi  : T_vcmpImm <"vcmph.eq",  0b00, 0b01, s8Imm>;
+def A4_vcmphgti  : T_vcmpImm <"vcmph.gt",  0b01, 0b01, s8Imm>;
+def A4_vcmphgtui : T_vcmpImm <"vcmph.gtu", 0b10, 0b01, u7Imm>;
+
+// Vector compare words
+def A4_vcmpweqi  : T_vcmpImm <"vcmpw.eq",  0b00, 0b10, s8Imm>;
+def A4_vcmpwgti  : T_vcmpImm <"vcmpw.gt",  0b01, 0b10, s8Imm>;
+def A4_vcmpwgtui : T_vcmpImm <"vcmpw.gtu", 0b10, 0b10, u7Imm>;
 
 //===----------------------------------------------------------------------===//
 // XTYPE/SHIFT +
 //===----------------------------------------------------------------------===//
-
-// Shift by immediate and accumulate.
-// Rx=add(#u8,asl(Rx,#U5))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
-validSubTargets = HasV4SubT in
-def ADDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
-            "$dst = add(#$src1, asl($src2, #$src3))",
-            [(set (i32 IntRegs:$dst),
-                  (add (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
-                       u8ExtPred:$src1))],
-            "$src2 = $dst">,
-            Requires<[HasV4T]>;
-
-// Rx=add(#u8,lsr(Rx,#U5))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
-validSubTargets = HasV4SubT in
-def ADDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
-            "$dst = add(#$src1, lsr($src2, #$src3))",
-            [(set (i32 IntRegs:$dst),
-                  (add (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
-                       u8ExtPred:$src1))],
-            "$src2 = $dst">,
-            Requires<[HasV4T]>;
-
-// Rx=sub(#u8,asl(Rx,#U5))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
-validSubTargets = HasV4SubT in
-def SUBi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
-            "$dst = sub(#$src1, asl($src2, #$src3))",
-            [(set (i32 IntRegs:$dst),
-                  (sub (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
-                       u8ExtPred:$src1))],
-            "$src2 = $dst">,
-            Requires<[HasV4T]>;
-
-// Rx=sub(#u8,lsr(Rx,#U5))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
-validSubTargets = HasV4SubT in
-def SUBi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
-            "$dst = sub(#$src1, lsr($src2, #$src3))",
-            [(set (i32 IntRegs:$dst),
-                  (sub (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
-                       u8ExtPred:$src1))],
-            "$src2 = $dst">,
-            Requires<[HasV4T]>;
-
-
-//Shift by immediate and logical.
-//Rx=and(#u8,asl(Rx,#U5))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
-validSubTargets = HasV4SubT in
-def ANDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
-            "$dst = and(#$src1, asl($src2, #$src3))",
-            [(set (i32 IntRegs:$dst),
-                  (and (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
-                       u8ExtPred:$src1))],
-            "$src2 = $dst">,
-            Requires<[HasV4T]>;
-
-//Rx=and(#u8,lsr(Rx,#U5))
+// Shift by immediate and accumulate/logical.
+// Rx=add(#u8,asl(Rx,#U5))  Rx=add(#u8,lsr(Rx,#U5))
+// Rx=sub(#u8,asl(Rx,#U5))  Rx=sub(#u8,lsr(Rx,#U5))
+// Rx=and(#u8,asl(Rx,#U5))  Rx=and(#u8,lsr(Rx,#U5))
+// Rx=or(#u8,asl(Rx,#U5))   Rx=or(#u8,lsr(Rx,#U5))
 let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
-validSubTargets = HasV4SubT in
-def ANDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
-            "$dst = and(#$src1, lsr($src2, #$src3))",
-            [(set (i32 IntRegs:$dst),
-                  (and (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
-                       u8ExtPred:$src1))],
-            "$src2 = $dst">,
-            Requires<[HasV4T]>;
-
-//Rx=or(#u8,asl(Rx,#U5))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
-AddedComplexity = 30, validSubTargets = HasV4SubT in
-def ORi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
-            "$dst = or(#$src1, asl($src2, #$src3))",
-            [(set (i32 IntRegs:$dst),
-                  (or (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
-                      u8ExtPred:$src1))],
-            "$src2 = $dst">,
-            Requires<[HasV4T]>;
-
-//Rx=or(#u8,lsr(Rx,#U5))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
-AddedComplexity = 30, validSubTargets = HasV4SubT in
-def ORi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
-            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
-            "$dst = or(#$src1, lsr($src2, #$src3))",
-            [(set (i32 IntRegs:$dst),
-                  (or (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
-                      u8ExtPred:$src1))],
-            "$src2 = $dst">,
-            Requires<[HasV4T]>;
-
-
-//Shift by register.
-//Rd=lsl(#s6,Rt)
-let validSubTargets = HasV4SubT in {
-def LSLi_V4 : MInst<(outs IntRegs:$dst), (ins s6Imm:$src1, IntRegs:$src2),
-            "$dst = lsl(#$src1, $src2)",
-            [(set (i32 IntRegs:$dst), (shl s6ImmPred:$src1,
-                                           (i32 IntRegs:$src2)))]>,
-            Requires<[HasV4T]>;
-
-
-//Shift by register and logical.
-//Rxx^=asl(Rss,Rt)
-def ASLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
-            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
-            "$dst ^= asl($src2, $src3)",
-            [(set (i64 DoubleRegs:$dst),
-                  (xor (i64 DoubleRegs:$src1), (shl (i64 DoubleRegs:$src2),
-                                                    (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
-
-//Rxx^=asr(Rss,Rt)
-def ASRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
-            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
-            "$dst ^= asr($src2, $src3)",
-            [(set (i64 DoubleRegs:$dst),
-                  (xor (i64 DoubleRegs:$src1), (sra (i64 DoubleRegs:$src2),
-                                                    (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
-
-//Rxx^=lsl(Rss,Rt)
-def LSLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
-            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
-            "$dst ^= lsl($src2, $src3)",
-            [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1),
-                                              (shl (i64 DoubleRegs:$src2),
-                                                   (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
-
-//Rxx^=lsr(Rss,Rt)
-def LSRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
-            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
-            "$dst ^= lsr($src2, $src3)",
-            [(set (i64 DoubleRegs:$dst),
-                  (xor (i64 DoubleRegs:$src1), (srl (i64 DoubleRegs:$src2),
-                                                    (i32 IntRegs:$src3))))],
-            "$src1 = $dst">,
-            Requires<[HasV4T]>;
+    hasNewValue = 1, opNewValue = 0 in
+class T_S4_ShiftOperate<string MnOp, string MnSh, SDNode Op, SDNode Sh,
+                        bit asl_lsr, bits<2> MajOp, InstrItinClass Itin>
+  : MInst_acc<(outs IntRegs:$Rd), (ins u8Ext:$u8, IntRegs:$Rx, u5Imm:$U5),
+      "$Rd = "#MnOp#"(#$u8, "#MnSh#"($Rx, #$U5))",
+      [(set (i32 IntRegs:$Rd),
+            (Op (Sh I32:$Rx, u5ImmPred:$U5), u8ExtPred:$u8))],
+      "$Rd = $Rx", Itin> {
+
+  bits<5> Rd;
+  bits<8> u8;
+  bits<5> Rx;
+  bits<5> U5;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b1110;
+  let Inst{23-21} = u8{7-5};
+  let Inst{20-16} = Rd;
+  let Inst{13} = u8{4};
+  let Inst{12-8} = U5;
+  let Inst{7-5} = u8{3-1};
+  let Inst{4} = asl_lsr;
+  let Inst{3} = u8{0};
+  let Inst{2-1} = MajOp;
+}
+
+multiclass T_ShiftOperate<string mnemonic, SDNode Op, bits<2> MajOp,
+                          InstrItinClass Itin> {
+  def _asl_ri : T_S4_ShiftOperate<mnemonic, "asl", Op, shl, 0, MajOp, Itin>;
+  def _lsr_ri : T_S4_ShiftOperate<mnemonic, "lsr", Op, srl, 1, MajOp, Itin>;
+}
+
+let AddedComplexity = 200 in {
+  defm S4_addi : T_ShiftOperate<"add", add, 0b10, ALU64_tc_2_SLOT23>;
+  defm S4_andi : T_ShiftOperate<"and", and, 0b00, ALU64_tc_2_SLOT23>;
 }
 
+let AddedComplexity = 30 in
+defm S4_ori  : T_ShiftOperate<"or",  or,  0b01, ALU64_tc_1_SLOT23>;
+
+defm S4_subi : T_ShiftOperate<"sub", sub, 0b11, ALU64_tc_1_SLOT23>;
+
+let AddedComplexity = 200 in {
+  def: Pat<(add addrga:$addr, (shl I32:$src2, u5ImmPred:$src3)),
+           (S4_addi_asl_ri addrga:$addr, IntRegs:$src2, u5ImmPred:$src3)>;
+  def: Pat<(add addrga:$addr, (srl I32:$src2, u5ImmPred:$src3)),
+           (S4_addi_lsr_ri addrga:$addr, IntRegs:$src2, u5ImmPred:$src3)>;
+  def: Pat<(sub addrga:$addr, (shl I32:$src2, u5ImmPred:$src3)),
+           (S4_subi_asl_ri addrga:$addr, IntRegs:$src2, u5ImmPred:$src3)>;
+  def: Pat<(sub addrga:$addr, (srl I32:$src2, u5ImmPred:$src3)),
+           (S4_subi_lsr_ri addrga:$addr, IntRegs:$src2, u5ImmPred:$src3)>;
+}
+
+// Vector conditional negate
+// Rdd=vcnegh(Rss,Rt)
+let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
+def S2_vcnegh   : T_S3op_shiftVect < "vcnegh",   0b11, 0b01>;
+
+// Rd=[cround|round](Rs,Rt)
+let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in {
+  def A4_cround_rr    : T_S3op_3 < "cround", IntRegs, 0b11, 0b00>;
+  def A4_round_rr     : T_S3op_3 < "round", IntRegs, 0b11, 0b10>;
+}
+
+// Rd=round(Rs,Rt):sat
+let hasNewValue = 1, Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
+def A4_round_rr_sat : T_S3op_3 < "round", IntRegs, 0b11, 0b11, 1>;
+
+// Rd=[cmpyiwh|cmpyrwh](Rss,Rt):<<1:rnd:sat
+let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
+  def M4_cmpyi_wh     : T_S3op_8<"cmpyiwh", 0b100, 1, 1, 1>;
+  def M4_cmpyr_wh     : T_S3op_8<"cmpyrwh", 0b110, 1, 1, 1>;
+}
+
+// Rdd=[add|sub](Rss,Rtt,Px):carry
+let isPredicateLate = 1, hasSideEffects = 0 in
+class T_S3op_carry <string mnemonic, bits<3> MajOp>
+  : SInst < (outs DoubleRegs:$Rdd, PredRegs:$Px),
+            (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
+  "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu):carry",
+  [], "$Px = $Pu", S_3op_tc_1_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+    bits<2> Pu;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b0010;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rss;
+    let Inst{12-8}  = Rtt;
+    let Inst{6-5}   = Pu;
+    let Inst{4-0}   = Rdd;
+  }
+
+def A4_addp_c : T_S3op_carry < "add", 0b110 >;
+def A4_subp_c : T_S3op_carry < "sub", 0b111 >;
+
+let Itinerary = S_3op_tc_3_SLOT23, hasSideEffects = 0 in
+class T_S3op_6 <string mnemonic, bits<3> MinOp, bit isUnsigned>
+  : SInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Ru),
+  "$Rxx = "#mnemonic#"($Rss, $Ru)" ,
+  [] , "$dst2 = $Rxx"> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Ru;
+
+    let IClass = 0b1100;
+
+    let Inst{27-21} = 0b1011001;
+    let Inst{20-16} = Rss;
+    let Inst{13}    = isUnsigned;
+    let Inst{12-8}  = Rxx;
+    let Inst{7-5}   = MinOp;
+    let Inst{4-0}   = Ru;
+  }
+
+// Vector reduce maximum halfwords
+// Rxx=vrmax[u]h(Rss,Ru)
+def A4_vrmaxh  : T_S3op_6 < "vrmaxh",  0b001, 0>;
+def A4_vrmaxuh : T_S3op_6 < "vrmaxuh", 0b001, 1>;
+
+// Vector reduce maximum words
+// Rxx=vrmax[u]w(Rss,Ru)
+def A4_vrmaxw  : T_S3op_6 < "vrmaxw",  0b010, 0>;
+def A4_vrmaxuw : T_S3op_6 < "vrmaxuw", 0b010, 1>;
+
+// Vector reduce minimum halfwords
+// Rxx=vrmin[u]h(Rss,Ru)
+def A4_vrminh  : T_S3op_6 < "vrminh",  0b101, 0>;
+def A4_vrminuh : T_S3op_6 < "vrminuh", 0b101, 1>;
+
+// Vector reduce minimum words
+// Rxx=vrmin[u]w(Rss,Ru)
+def A4_vrminw  : T_S3op_6 < "vrminw",  0b110, 0>;
+def A4_vrminuw : T_S3op_6 < "vrminuw", 0b110, 1>;
+
+// Shift an immediate left by register amount.
+let hasNewValue = 1, hasSideEffects = 0 in
+def S4_lsli: SInst <(outs IntRegs:$Rd), (ins s6Imm:$s6, IntRegs:$Rt),
+  "$Rd = lsl(#$s6, $Rt)" ,
+  [(set (i32 IntRegs:$Rd), (shl s6ImmPred:$s6,
+                                 (i32 IntRegs:$Rt)))],
+  "", S_3op_tc_1_SLOT23> {
+    bits<5> Rd;
+    bits<6> s6;
+    bits<5> Rt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-22} = 0b011010;
+    let Inst{20-16} = s6{5-1};
+    let Inst{12-8}  = Rt;
+    let Inst{7-6}   = 0b11;
+    let Inst{4-0}   = Rd;
+    let Inst{5}     = s6{0};
+  }
+
 //===----------------------------------------------------------------------===//
 // XTYPE/SHIFT -
 //===----------------------------------------------------------------------===//
@@ -1830,7 +2810,7 @@ class MemOp_rr_base <string opc, bits<2> opcBits, Operand ImmOp,
                  (ins IntRegs:$base, ImmOp:$offset, IntRegs:$delta),
                  opc#"($base+#$offset)"#memOp#"$delta",
                  []>,
-                 Requires<[HasV4T, UseMEMOP]> {
+                 Requires<[UseMEMOP]> {
 
     bits<5> base;
     bits<5> delta;
@@ -1841,6 +2821,7 @@ class MemOp_rr_base <string opc, bits<2> opcBits, Operand ImmOp,
                      !if (!eq(opcBits, 0b01), offset{6-1},
                      !if (!eq(opcBits, 0b10), offset{7-2},0)));
 
+    let opExtentAlign = opcBits;
     let IClass = 0b0011;
     let Inst{27-24} = 0b1110;
     let Inst{22-21} = opcBits;
@@ -1861,7 +2842,7 @@ class MemOp_ri_base <string opc, bits<2> opcBits, Operand ImmOp,
                   opc#"($base+#$offset)"#memOp#"#$delta"
                   #!if(memOpBits{1},")", ""), // clrbit, setbit - include ')'
                   []>,
-                  Requires<[HasV4T, UseMEMOP]> {
+                  Requires<[UseMEMOP]> {
 
     bits<5> base;
     bits<5> delta;
@@ -1872,6 +2853,7 @@ class MemOp_ri_base <string opc, bits<2> opcBits, Operand ImmOp,
                      !if (!eq(opcBits, 0b01), offset{6-1},
                      !if (!eq(opcBits, 0b10), offset{7-2},0)));
 
+    let opExtentAlign = opcBits;
     let IClass = 0b0011;
     let Inst{27-24} = 0b1111;
     let Inst{22-21} = opcBits;
@@ -1884,36 +2866,35 @@ class MemOp_ri_base <string opc, bits<2> opcBits, Operand ImmOp,
 
 // multiclass to define MemOp instructions with register operand.
 multiclass MemOp_rr<string opc, bits<2> opcBits, Operand ImmOp> {
-  def _ADD#NAME#_V4 : MemOp_rr_base <opc, opcBits, ImmOp, " += ", 0b00>; // add
-  def _SUB#NAME#_V4 : MemOp_rr_base <opc, opcBits, ImmOp, " -= ", 0b01>; // sub
-  def _AND#NAME#_V4 : MemOp_rr_base <opc, opcBits, ImmOp, " &= ", 0b10>; // and
-  def _OR#NAME#_V4  : MemOp_rr_base <opc, opcBits, ImmOp, " |= ", 0b11>; // or
+  def L4_add#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " += ", 0b00>; // add
+  def L4_sub#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " -= ", 0b01>; // sub
+  def L4_and#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " &= ", 0b10>; // and
+  def L4_or#NAME  : MemOp_rr_base <opc, opcBits, ImmOp, " |= ", 0b11>; // or
 }
 
 // multiclass to define MemOp instructions with immediate Operand.
 multiclass MemOp_ri<string opc, bits<2> opcBits, Operand ImmOp> {
-  def _ADD#NAME#_V4 : MemOp_ri_base <opc, opcBits, ImmOp, " += ", 0b00 >;
-  def _SUB#NAME#_V4 : MemOp_ri_base <opc, opcBits, ImmOp, " -= ", 0b01 >;
-  def _CLRBIT#NAME#_V4 : MemOp_ri_base<opc, opcBits, ImmOp, " =clrbit(", 0b10>;
-  def _SETBIT#NAME#_V4 : MemOp_ri_base<opc, opcBits, ImmOp, " =setbit(", 0b11>;
+  def L4_iadd#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " += ", 0b00 >;
+  def L4_isub#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " -= ", 0b01 >;
+  def L4_iand#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = clrbit(", 0b10>;
+  def L4_ior#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = setbit(", 0b11>;
 }
 
 multiclass MemOp_base <string opc, bits<2> opcBits, Operand ImmOp> {
-  defm r : MemOp_rr <opc, opcBits, ImmOp>;
-  defm i : MemOp_ri <opc, opcBits, ImmOp>;
+  defm _#NAME : MemOp_rr <opc, opcBits, ImmOp>;
+  defm _#NAME : MemOp_ri <opc, opcBits, ImmOp>;
 }
 
 // Define MemOp instructions.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0,
-validSubTargets =HasV4SubT in {
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0 in {
   let opExtentBits = 6, accessSize = ByteAccess in
-  defm MemOPb : MemOp_base <"memb", 0b00, u6_0Ext>;
+  defm memopb_io : MemOp_base <"memb", 0b00, u6_0Ext>;
 
   let opExtentBits = 7, accessSize = HalfWordAccess in
-  defm MemOPh : MemOp_base <"memh", 0b01, u6_1Ext>;
+  defm memoph_io : MemOp_base <"memh", 0b01, u6_1Ext>;
 
   let opExtentBits = 8, accessSize = WordAccess in
-  defm MemOPw : MemOp_base <"memw", 0b10, u6_2Ext>;
+  defm memopw_io : MemOp_base <"memw", 0b10, u6_2Ext>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1926,40 +2907,40 @@ validSubTargets =HasV4SubT in {
 multiclass MemOpi_u5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf ExtPred,
                           InstHexagon MI, SDNode OpNode> {
   let AddedComplexity = 180 in
-  def : Pat < (stOp (OpNode (ldOp IntRegs:$addr), u5ImmPred:$addend),
-                    IntRegs:$addr),
-              (MI IntRegs:$addr, #0, u5ImmPred:$addend )>;
+  def: Pat<(stOp (OpNode (ldOp IntRegs:$addr), u5ImmPred:$addend),
+                  IntRegs:$addr),
+            (MI IntRegs:$addr, 0, u5ImmPred:$addend)>;
 
   let AddedComplexity = 190 in
-  def : Pat <(stOp (OpNode (ldOp (add IntRegs:$base, ExtPred:$offset)),
-                     u5ImmPred:$addend),
-             (add IntRegs:$base, ExtPred:$offset)),
-       (MI IntRegs:$base, ExtPred:$offset, u5ImmPred:$addend)>;
+  def: Pat<(stOp (OpNode (ldOp (add IntRegs:$base, ExtPred:$offset)),
+                  u5ImmPred:$addend),
+            (add IntRegs:$base, ExtPred:$offset)),
+            (MI IntRegs:$base, ExtPred:$offset, u5ImmPred:$addend)>;
 }
 
 multiclass MemOpi_u5ALUOp<PatFrag ldOp, PatFrag stOp, PatLeaf ExtPred,
                           InstHexagon addMI, InstHexagon subMI> {
-  defm : MemOpi_u5Pats<ldOp, stOp, ExtPred, addMI, add>;
-  defm : MemOpi_u5Pats<ldOp, stOp, ExtPred, subMI, sub>;
+  defm: MemOpi_u5Pats<ldOp, stOp, ExtPred, addMI, add>;
+  defm: MemOpi_u5Pats<ldOp, stOp, ExtPred, subMI, sub>;
 }
 
 multiclass MemOpi_u5ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
   // Half Word
-  defm : MemOpi_u5ALUOp <ldOpHalf, truncstorei16, u6_1ExtPred,
-                         MemOPh_ADDi_V4, MemOPh_SUBi_V4>;
+  defm: MemOpi_u5ALUOp <ldOpHalf, truncstorei16, u6_1ExtPred,
+                        L4_iadd_memoph_io, L4_isub_memoph_io>;
   // Byte
-  defm : MemOpi_u5ALUOp <ldOpByte, truncstorei8, u6ExtPred,
-                         MemOPb_ADDi_V4, MemOPb_SUBi_V4>;
+  defm: MemOpi_u5ALUOp <ldOpByte, truncstorei8, u6ExtPred,
+                        L4_iadd_memopb_io, L4_isub_memopb_io>;
 }
 
-let Predicates = [HasV4T, UseMEMOP] in {
-  defm : MemOpi_u5ExtType<zextloadi8, zextloadi16>; // zero extend
-  defm : MemOpi_u5ExtType<sextloadi8, sextloadi16>; // sign extend
-  defm : MemOpi_u5ExtType<extloadi8,  extloadi16>;  // any extend
+let Predicates = [UseMEMOP] in {
+  defm: MemOpi_u5ExtType<zextloadi8, zextloadi16>; // zero extend
+  defm: MemOpi_u5ExtType<sextloadi8, sextloadi16>; // sign extend
+  defm: MemOpi_u5ExtType<extloadi8,  extloadi16>;  // any extend
 
   // Word
-  defm : MemOpi_u5ALUOp <load, store, u6_2ExtPred, MemOPw_ADDi_V4,
-                         MemOPw_SUBi_V4>;
+  defm: MemOpi_u5ALUOp <load, store, u6_2ExtPred, L4_iadd_memopw_io,
+                        L4_isub_memopw_io>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1970,37 +2951,36 @@ let Predicates = [HasV4T, UseMEMOP] in {
 //===----------------------------------------------------------------------===//
 
 multiclass MemOpi_m5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf extPred,
-                          PatLeaf immPred, ComplexPattern addrPred,
-                          SDNodeXForm xformFunc, InstHexagon MI> {
+                          PatLeaf immPred, SDNodeXForm xformFunc,
+                          InstHexagon MI> {
   let AddedComplexity = 190 in
-  def : Pat <(stOp (add (ldOp IntRegs:$addr), immPred:$subend),
-                   IntRegs:$addr),
-             (MI IntRegs:$addr, #0, (xformFunc immPred:$subend) )>;
+  def: Pat<(stOp (add (ldOp IntRegs:$addr), immPred:$subend), IntRegs:$addr),
+           (MI IntRegs:$addr, 0, (xformFunc immPred:$subend))>;
 
   let AddedComplexity = 195 in
-  def : Pat<(stOp (add (ldOp (add IntRegs:$base, extPred:$offset)),
-                       immPred:$subend),
-                  (add IntRegs:$base, extPred:$offset)),
-            (MI IntRegs:$base, extPred:$offset, (xformFunc immPred:$subend))>;
+  def: Pat<(stOp (add (ldOp (add IntRegs:$base, extPred:$offset)),
+                  immPred:$subend),
+           (add IntRegs:$base, extPred:$offset)),
+           (MI IntRegs:$base, extPred:$offset, (xformFunc immPred:$subend))>;
 }
 
 multiclass MemOpi_m5ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
   // Half Word
-  defm : MemOpi_m5Pats <ldOpHalf, truncstorei16, u6_1ExtPred, m5HImmPred,
-                        ADDRriU6_1, MEMOPIMM_HALF, MemOPh_SUBi_V4>;
+  defm: MemOpi_m5Pats <ldOpHalf, truncstorei16, u6_1ExtPred, m5HImmPred,
+                       MEMOPIMM_HALF, L4_isub_memoph_io>;
   // Byte
-  defm : MemOpi_m5Pats <ldOpByte, truncstorei8, u6ExtPred, m5BImmPred,
-                        ADDRriU6_0, MEMOPIMM_BYTE, MemOPb_SUBi_V4>;
+  defm: MemOpi_m5Pats <ldOpByte, truncstorei8, u6ExtPred, m5BImmPred,
+                       MEMOPIMM_BYTE, L4_isub_memopb_io>;
 }
 
-let Predicates = [HasV4T, UseMEMOP] in {
-  defm : MemOpi_m5ExtType<zextloadi8, zextloadi16>; // zero extend
-  defm : MemOpi_m5ExtType<sextloadi8, sextloadi16>; // sign extend
-  defm : MemOpi_m5ExtType<extloadi8,  extloadi16>;  // any extend
+let Predicates = [UseMEMOP] in {
+  defm: MemOpi_m5ExtType<zextloadi8, zextloadi16>; // zero extend
+  defm: MemOpi_m5ExtType<sextloadi8, sextloadi16>; // sign extend
+  defm: MemOpi_m5ExtType<extloadi8,  extloadi16>;  // any extend
 
   // Word
-  defm : MemOpi_m5Pats <load, store, u6_2ExtPred, m5ImmPred,
-                          ADDRriU6_2, MEMOPIMM, MemOPw_SUBi_V4>;
+  defm: MemOpi_m5Pats <load, store, u6_2ExtPred, m5ImmPred,
+                       MEMOPIMM, L4_isub_memopw_io>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2010,52 +2990,50 @@ let Predicates = [HasV4T, UseMEMOP] in {
 //===----------------------------------------------------------------------===//
 
 multiclass MemOpi_bitPats <PatFrag ldOp, PatFrag stOp, PatLeaf immPred,
-                     PatLeaf extPred, ComplexPattern addrPred,
-                     SDNodeXForm xformFunc, InstHexagon MI, SDNode OpNode> {
+                     PatLeaf extPred, SDNodeXForm xformFunc, InstHexagon MI,
+                     SDNode OpNode> {
 
   // mem[bhw](Rs+#u6:[012]) = [clrbit|setbit](#U5)
   let AddedComplexity = 250 in
-  def : Pat<(stOp (OpNode (ldOp (add IntRegs:$base, extPred:$offset)),
-                          immPred:$bitend),
-                  (add IntRegs:$base, extPred:$offset)),
-            (MI IntRegs:$base, extPred:$offset, (xformFunc immPred:$bitend))>;
+  def: Pat<(stOp (OpNode (ldOp (add IntRegs:$base, extPred:$offset)),
+                  immPred:$bitend),
+           (add IntRegs:$base, extPred:$offset)),
+           (MI IntRegs:$base, extPred:$offset, (xformFunc immPred:$bitend))>;
 
   // mem[bhw](Rs+#0) = [clrbit|setbit](#U5)
   let AddedComplexity = 225 in
-  def : Pat <(stOp (OpNode (ldOp (addrPred IntRegs:$addr, extPred:$offset)),
-                           immPred:$bitend),
-                   (addrPred (i32 IntRegs:$addr), extPred:$offset)),
-             (MI IntRegs:$addr, extPred:$offset, (xformFunc immPred:$bitend))>;
+  def: Pat<(stOp (OpNode (ldOp IntRegs:$addr), immPred:$bitend), IntRegs:$addr),
+           (MI IntRegs:$addr, 0, (xformFunc immPred:$bitend))>;
 }
 
-multiclass MemOpi_bitExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
+multiclass MemOpi_bitExtType<PatFrag ldOpByte, PatFrag ldOpHalf> {
   // Byte - clrbit
-  defm : MemOpi_bitPats<ldOpByte, truncstorei8, Clr3ImmPred, u6ExtPred,
-                       ADDRriU6_0, CLRMEMIMM_BYTE, MemOPb_CLRBITi_V4, and>;
+  defm: MemOpi_bitPats<ldOpByte, truncstorei8, Clr3ImmPred, u6ExtPred,
+                       CLRMEMIMM_BYTE, L4_iand_memopb_io, and>;
   // Byte - setbit
-  defm : MemOpi_bitPats<ldOpByte, truncstorei8, Set3ImmPred,  u6ExtPred,
-                       ADDRriU6_0, SETMEMIMM_BYTE, MemOPb_SETBITi_V4, or>;
+  defm: MemOpi_bitPats<ldOpByte, truncstorei8, Set3ImmPred, u6ExtPred,
+                       SETMEMIMM_BYTE, L4_ior_memopb_io, or>;
   // Half Word - clrbit
-  defm : MemOpi_bitPats<ldOpHalf, truncstorei16, Clr4ImmPred, u6_1ExtPred,
-                       ADDRriU6_1, CLRMEMIMM_SHORT, MemOPh_CLRBITi_V4, and>;
+  defm: MemOpi_bitPats<ldOpHalf, truncstorei16, Clr4ImmPred, u6_1ExtPred,
+                       CLRMEMIMM_SHORT, L4_iand_memoph_io, and>;
   // Half Word - setbit
-  defm : MemOpi_bitPats<ldOpHalf, truncstorei16, Set4ImmPred, u6_1ExtPred,
-                       ADDRriU6_1, SETMEMIMM_SHORT, MemOPh_SETBITi_V4, or>;
+  defm: MemOpi_bitPats<ldOpHalf, truncstorei16, Set4ImmPred, u6_1ExtPred,
+                       SETMEMIMM_SHORT, L4_ior_memoph_io, or>;
 }
 
-let Predicates = [HasV4T, UseMEMOP] in {
+let Predicates = [UseMEMOP] in {
   // mem[bh](Rs+#0) = [clrbit|setbit](#U5)
   // mem[bh](Rs+#u6:[01]) = [clrbit|setbit](#U5)
-  defm : MemOpi_bitExtType<zextloadi8, zextloadi16>; // zero extend
-  defm : MemOpi_bitExtType<sextloadi8, sextloadi16>; // sign extend
-  defm : MemOpi_bitExtType<extloadi8,  extloadi16>;  // any extend
+  defm: MemOpi_bitExtType<zextloadi8, zextloadi16>; // zero extend
+  defm: MemOpi_bitExtType<sextloadi8, sextloadi16>; // sign extend
+  defm: MemOpi_bitExtType<extloadi8,  extloadi16>;  // any extend
 
   // memw(Rs+#0) = [clrbit|setbit](#U5)
   // memw(Rs+#u6:2) = [clrbit|setbit](#U5)
-  defm : MemOpi_bitPats<load, store, Clr5ImmPred, u6_2ExtPred, ADDRriU6_2,
-                       CLRMEMIMM, MemOPw_CLRBITi_V4, and>;
-  defm : MemOpi_bitPats<load, store, Set5ImmPred, u6_2ExtPred, ADDRriU6_2,
-                       SETMEMIMM, MemOPw_SETBITi_V4, or>;
+  defm: MemOpi_bitPats<load, store, Clr5ImmPred, u6_2ExtPred, CLRMEMIMM,
+                       L4_iand_memopw_io, and>;
+  defm: MemOpi_bitPats<load, store, Set5ImmPred, u6_2ExtPred, SETMEMIMM,
+                       L4_ior_memopw_io, or>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2065,54 +3043,51 @@ let Predicates = [HasV4T, UseMEMOP] in {
 // mem[bhw](Rs+#U6:[012]) [+-&|]= Rt
 //===----------------------------------------------------------------------===//
 
-multiclass MemOpr_Pats <PatFrag ldOp, PatFrag stOp, ComplexPattern addrPred,
-                     PatLeaf extPred, InstHexagon MI, SDNode OpNode> {
+multiclass MemOpr_Pats <PatFrag ldOp, PatFrag stOp, PatLeaf extPred,
+                        InstHexagon MI, SDNode OpNode> {
   let AddedComplexity = 141 in
   // mem[bhw](Rs+#0) [+-&|]= Rt
-  def : Pat <(stOp (OpNode (ldOp (addrPred IntRegs:$addr, extPred:$offset)),
-                           (i32 IntRegs:$addend)),
-                   (addrPred (i32 IntRegs:$addr), extPred:$offset)),
-             (MI IntRegs:$addr, extPred:$offset, (i32 IntRegs:$addend) )>;
+  def: Pat<(stOp (OpNode (ldOp IntRegs:$addr), (i32 IntRegs:$addend)),
+                 IntRegs:$addr),
+           (MI IntRegs:$addr, 0, (i32 IntRegs:$addend))>;
 
   // mem[bhw](Rs+#U6:[012]) [+-&|]= Rt
   let AddedComplexity = 150 in
-  def : Pat <(stOp (OpNode (ldOp (add IntRegs:$base, extPred:$offset)),
-                           (i32 IntRegs:$orend)),
-                   (add IntRegs:$base, extPred:$offset)),
-             (MI IntRegs:$base, extPred:$offset, (i32 IntRegs:$orend) )>;
+  def: Pat<(stOp (OpNode (ldOp (add IntRegs:$base, extPred:$offset)),
+                  (i32 IntRegs:$orend)),
+           (add IntRegs:$base, extPred:$offset)),
+           (MI IntRegs:$base, extPred:$offset, (i32 IntRegs:$orend))>;
 }
 
-multiclass MemOPr_ALUOp<PatFrag ldOp, PatFrag stOp,
-                        ComplexPattern addrPred, PatLeaf extPred,
+multiclass MemOPr_ALUOp<PatFrag ldOp, PatFrag stOp, PatLeaf extPred,
                         InstHexagon addMI, InstHexagon subMI,
-                        InstHexagon andMI, InstHexagon orMI > {
-
-  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, addMI, add>;
-  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, subMI, sub>;
-  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, andMI, and>;
-  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, orMI,  or>;
+                        InstHexagon andMI, InstHexagon orMI> {
+  defm: MemOpr_Pats <ldOp, stOp, extPred, addMI, add>;
+  defm: MemOpr_Pats <ldOp, stOp, extPred, subMI, sub>;
+  defm: MemOpr_Pats <ldOp, stOp, extPred, andMI, and>;
+  defm: MemOpr_Pats <ldOp, stOp, extPred, orMI,  or>;
 }
 
 multiclass MemOPr_ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
   // Half Word
-  defm : MemOPr_ALUOp <ldOpHalf, truncstorei16, ADDRriU6_1, u6_1ExtPred,
-                       MemOPh_ADDr_V4, MemOPh_SUBr_V4,
-                       MemOPh_ANDr_V4, MemOPh_ORr_V4>;
+  defm: MemOPr_ALUOp <ldOpHalf, truncstorei16, u6_1ExtPred,
+                      L4_add_memoph_io, L4_sub_memoph_io,
+                      L4_and_memoph_io, L4_or_memoph_io>;
   // Byte
-  defm : MemOPr_ALUOp <ldOpByte, truncstorei8, ADDRriU6_0, u6ExtPred,
-                       MemOPb_ADDr_V4, MemOPb_SUBr_V4,
-                       MemOPb_ANDr_V4, MemOPb_ORr_V4>;
+  defm: MemOPr_ALUOp <ldOpByte, truncstorei8, u6ExtPred,
+                      L4_add_memopb_io, L4_sub_memopb_io,
+                      L4_and_memopb_io, L4_or_memopb_io>;
 }
 
 // Define 'def Pats' for MemOps with register addend.
-let Predicates = [HasV4T, UseMEMOP] in {
+let Predicates = [UseMEMOP] in {
   // Byte, Half Word
-  defm : MemOPr_ExtType<zextloadi8, zextloadi16>; // zero extend
-  defm : MemOPr_ExtType<sextloadi8, sextloadi16>; // sign extend
-  defm : MemOPr_ExtType<extloadi8,  extloadi16>;  // any extend
+  defm: MemOPr_ExtType<zextloadi8, zextloadi16>; // zero extend
+  defm: MemOPr_ExtType<sextloadi8, sextloadi16>; // sign extend
+  defm: MemOPr_ExtType<extloadi8,  extloadi16>;  // any extend
   // Word
-  defm : MemOPr_ALUOp <load, store, ADDRriU6_2, u6_2ExtPred, MemOPw_ADDr_V4,
-                       MemOPw_SUBr_V4, MemOPw_ANDr_V4, MemOPw_ORr_V4 >;
+  defm: MemOPr_ALUOp <load, store, u6_2ExtPred, L4_add_memopw_io,
+                      L4_sub_memopw_io, L4_and_memopw_io, L4_or_memopw_io>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2130,123 +3105,28 @@ let Predicates = [HasV4T, UseMEMOP] in {
 // incorrect code for negative numbers.
 // Pd=cmpb.eq(Rs,#u8)
 
-let isCompare = 1, isExtendable = 1, opExtendable = 2, hasSideEffects = 0,
-    validSubTargets = HasV4SubT in
-class CMP_NOT_REG_IMM<string OpName, bits<2> op, Operand ImmOp,
-                      list<dag> Pattern>
-  : ALU32Inst <(outs PredRegs:$dst), (ins IntRegs:$src1, ImmOp:$src2),
-    "$dst = !cmp."#OpName#"($src1, #$src2)",
-    Pattern,
-    "", ALU32_2op_tc_2early_SLOT0123> {
-    bits<2> dst;
-    bits<5> src1;
-    bits<10> src2;
+// p=!cmp.eq(r1,#s10)
+def C4_cmpneqi  : T_CMP <"cmp.eq",  0b00, 1, s10Ext>;
+def C4_cmpltei  : T_CMP <"cmp.gt",  0b01, 1, s10Ext>;
+def C4_cmplteui : T_CMP <"cmp.gtu", 0b10, 1, u9Ext>;
 
-    let IClass = 0b0111;
-    let Inst{27-24} = 0b0101;
-    let Inst{23-22} = op;
-    let Inst{20-16} = src1;
-    let Inst{21} = !if (!eq(OpName, "gtu"), 0b0, src2{9});
-    let Inst{13-5} = src2{8-0};
-    let Inst{4-2} = 0b100;
-    let Inst{1-0} = dst;
-}
-
-let opExtentBits = 10, isExtentSigned = 1 in {
-def C4_cmpneqi : CMP_NOT_REG_IMM <"eq", 0b00, s10Ext, [(set (i1 PredRegs:$dst),
-                 (setne (i32 IntRegs:$src1), s10ExtPred:$src2))]>;
-
-def C4_cmpltei : CMP_NOT_REG_IMM <"gt", 0b01, s10Ext, [(set (i1 PredRegs:$dst),
-                 (not (setgt (i32 IntRegs:$src1), s10ExtPred:$src2)))]>;
-
-}
-let opExtentBits = 9 in
-def C4_cmplteui : CMP_NOT_REG_IMM <"gtu", 0b10, u9Ext, [(set (i1 PredRegs:$dst),
-                  (not (setugt (i32 IntRegs:$src1), u9ExtPred:$src2)))]>;
-
-
-
-// p=!cmp.eq(r1,r2)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPnotEQ_rr : ALU32_rr<(outs PredRegs:$dst),
-                           (ins IntRegs:$src1, IntRegs:$src2),
-      "$dst = !cmp.eq($src1, $src2)",
-      [(set (i1 PredRegs:$dst),
-            (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2)))]>,
-      Requires<[HasV4T]>;
-
-// p=!cmp.gt(r1,r2)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPnotGT_rr : ALU32_rr<(outs PredRegs:$dst),
-                           (ins IntRegs:$src1, IntRegs:$src2),
-      "$dst = !cmp.gt($src1, $src2)",
-      [(set (i1 PredRegs:$dst),
-            (not (setgt (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>,
-      Requires<[HasV4T]>;
-
-
-// p=!cmp.gtu(r1,r2)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPnotGTU_rr : ALU32_rr<(outs PredRegs:$dst),
-                            (ins IntRegs:$src1, IntRegs:$src2),
-      "$dst = !cmp.gtu($src1, $src2)",
-      [(set (i1 PredRegs:$dst),
-            (not (setugt (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>,
-      Requires<[HasV4T]>;
-
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPbEQri_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, u8Imm:$src2),
-            "$dst = cmpb.eq($src1, #$src2)",
-            [(set (i1 PredRegs:$dst),
-                  (seteq (and (i32 IntRegs:$src1), 255), u8ImmPred:$src2))]>,
-            Requires<[HasV4T]>;
-
-def : Pat <(brcond (i1 (setne (and (i32 IntRegs:$src1), 255), u8ImmPred:$src2)),
-                       bb:$offset),
-      (JMP_f (CMPbEQri_V4 (i32 IntRegs:$src1), u8ImmPred:$src2),
-                bb:$offset)>,
-      Requires<[HasV4T]>;
-
-// Pd=cmpb.eq(Rs,Rt)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPbEQrr_ubub_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = cmpb.eq($src1, $src2)",
-            [(set (i1 PredRegs:$dst),
-                  (seteq (and (xor (i32 IntRegs:$src1),
-                                   (i32 IntRegs:$src2)), 255), 0))]>,
-            Requires<[HasV4T]>;
-
-// Pd=cmpb.eq(Rs,Rt)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPbEQrr_sbsb_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = cmpb.eq($src1, $src2)",
-            [(set (i1 PredRegs:$dst),
-                  (seteq (shl (i32 IntRegs:$src1), (i32 24)),
-                         (shl (i32 IntRegs:$src2), (i32 24))))]>,
-            Requires<[HasV4T]>;
-
-// Pd=cmpb.gt(Rs,Rt)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPbGTrr_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = cmpb.gt($src1, $src2)",
-            [(set (i1 PredRegs:$dst),
-                  (setgt (shl (i32 IntRegs:$src1), (i32 24)),
-                         (shl (i32 IntRegs:$src2), (i32 24))))]>,
-            Requires<[HasV4T]>;
-
-// Pd=cmpb.gtu(Rs,#u7)
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 7,
-isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPbGTU", InputType = "imm" in
-def CMPbGTUri_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, u7Ext:$src2),
-            "$dst = cmpb.gtu($src1, #$src2)",
-            [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255),
-                                              u7ExtPred:$src2))]>,
-            Requires<[HasV4T]>, ImmRegRel;
+def : T_CMP_pat <C4_cmpneqi,  setne,  s10ExtPred>;
+def : T_CMP_pat <C4_cmpltei,  setle,  s10ExtPred>;
+def : T_CMP_pat <C4_cmplteui, setule, u9ImmPred>;
+
+// rs <= rt -> !(rs > rt).
+/*
+def: Pat<(i1 (setle (i32 IntRegs:$src1), s10ExtPred:$src2)),
+         (C2_not (C2_cmpgti IntRegs:$src1, s10ExtPred:$src2))>;
+//         (C4_cmpltei IntRegs:$src1, s10ExtPred:$src2)>;
+*/
+// Map cmplt(Rs, Imm) -> !cmpgt(Rs, Imm-1).
+def: Pat<(i1 (setlt (i32 IntRegs:$src1), s8ExtPred:$src2)),
+         (C4_cmpltei IntRegs:$src1, (DEC_CONST_SIGNED s8ExtPred:$src2))>;
+
+// rs != rt -> !(rs == rt).
+def: Pat<(i1 (setne (i32 IntRegs:$src1), s10ExtPred:$src2)),
+         (C4_cmpneqi IntRegs:$src1, s10ExtPred:$src2)>;
 
 // SDNode for converting immediate C to C-1.
 def DEC_CONST_BYTE : SDNodeXForm<imm, [{
@@ -2263,10 +3143,9 @@ def DEC_CONST_BYTE : SDNodeXForm<imm, [{
 //   if (!Pd.new) Rd=#0
 def : Pat <(i32 (zext (i1 (seteq (i32 (and (i32 IntRegs:$Rs), 255)),
                                            u8ExtPred:$u8)))),
-           (i32 (TFR_condset_ii (i1 (CMPbEQri_V4 (i32 IntRegs:$Rs),
+           (i32 (TFR_condset_ii (i1 (A4_cmpbeqi (i32 IntRegs:$Rs),
                                                  (u8ExtPred:$u8))),
-                                1, 0))>,
-           Requires<[HasV4T]>;
+                                1, 0))>;
 
 // For the sequence
 //   zext( setne ( and(Rs, 255), u8))
@@ -2276,10 +3155,9 @@ def : Pat <(i32 (zext (i1 (seteq (i32 (and (i32 IntRegs:$Rs), 255)),
 //   if (!Pd.new) Rd=#1
 def : Pat <(i32 (zext (i1 (setne (i32 (and (i32 IntRegs:$Rs), 255)),
                                            u8ExtPred:$u8)))),
-           (i32 (TFR_condset_ii (i1 (CMPbEQri_V4 (i32 IntRegs:$Rs),
+           (i32 (TFR_condset_ii (i1 (A4_cmpbeqi (i32 IntRegs:$Rs),
                                                  (u8ExtPred:$u8))),
-                                0, 1))>,
-           Requires<[HasV4T]>;
+                                0, 1))>;
 
 // For the sequence
 //   zext( seteq (Rs, and(Rt, 255)))
@@ -2289,10 +3167,9 @@ def : Pat <(i32 (zext (i1 (setne (i32 (and (i32 IntRegs:$Rs), 255)),
 //   if (!Pd.new) Rd=#0
 def : Pat <(i32 (zext (i1 (seteq (i32 IntRegs:$Rt),
                                  (i32 (and (i32 IntRegs:$Rs), 255)))))),
-           (i32 (TFR_condset_ii (i1 (CMPbEQrr_ubub_V4 (i32 IntRegs:$Rs),
+           (i32 (TFR_condset_ii (i1 (A4_cmpbeq (i32 IntRegs:$Rs),
                                                       (i32 IntRegs:$Rt))),
-                                1, 0))>,
-           Requires<[HasV4T]>;
+                                1, 0))>;
 
 // For the sequence
 //   zext( setne (Rs, and(Rt, 255)))
@@ -2302,10 +3179,9 @@ def : Pat <(i32 (zext (i1 (seteq (i32 IntRegs:$Rt),
 //   if (!Pd.new) Rd=#1
 def : Pat <(i32 (zext (i1 (setne (i32 IntRegs:$Rt),
                                  (i32 (and (i32 IntRegs:$Rs), 255)))))),
-           (i32 (TFR_condset_ii (i1 (CMPbEQrr_ubub_V4 (i32 IntRegs:$Rs),
+           (i32 (TFR_condset_ii (i1 (A4_cmpbeq (i32 IntRegs:$Rs),
                                                       (i32 IntRegs:$Rt))),
-                                0, 1))>,
-           Requires<[HasV4T]>;
+                                0, 1))>;
 
 // For the sequence
 //   zext( setugt ( and(Rs, 255), u8))
@@ -2315,10 +3191,9 @@ def : Pat <(i32 (zext (i1 (setne (i32 IntRegs:$Rt),
 //   if (!Pd.new) Rd=#0
 def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 255)),
                                             u8ExtPred:$u8)))),
-           (i32 (TFR_condset_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$Rs),
+           (i32 (TFR_condset_ii (i1 (A4_cmpbgtui (i32 IntRegs:$Rs),
                                                   (u8ExtPred:$u8))),
-                                1, 0))>,
-           Requires<[HasV4T]>;
+                                1, 0))>;
 
 // For the sequence
 //   zext( setugt ( and(Rs, 254), u8))
@@ -2328,10 +3203,9 @@ def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 255)),
 //   if (!Pd.new) Rd=#0
 def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 254)),
                                             u8ExtPred:$u8)))),
-           (i32 (TFR_condset_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$Rs),
+           (i32 (TFR_condset_ii (i1 (A4_cmpbgtui (i32 IntRegs:$Rs),
                                                   (u8ExtPred:$u8))),
-                                1, 0))>,
-           Requires<[HasV4T]>;
+                                1, 0))>;
 
 // For the sequence
 //   zext( setult ( Rs, Rt))
@@ -2341,10 +3215,9 @@ def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 254)),
 //   if (!Pd.new) Rd=#0
 // cmp.ltu(Rs, Rt) -> cmp.gtu(Rt, Rs)
 def : Pat <(i32 (zext (i1 (setult (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
-           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rt),
+           (i32 (TFR_condset_ii (i1 (C2_cmpgtu (i32 IntRegs:$Rt),
                                               (i32 IntRegs:$Rs))),
-                                1, 0))>,
-           Requires<[HasV4T]>;
+                                1, 0))>;
 
 // For the sequence
 //   zext( setlt ( Rs, Rt))
@@ -2354,10 +3227,9 @@ def : Pat <(i32 (zext (i1 (setult (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
 //   if (!Pd.new) Rd=#0
 // cmp.lt(Rs, Rt) -> cmp.gt(Rt, Rs)
 def : Pat <(i32 (zext (i1 (setlt (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
-           (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rt),
+           (i32 (TFR_condset_ii (i1 (C2_cmpgt (i32 IntRegs:$Rt),
                                              (i32 IntRegs:$Rs))),
-                                1, 0))>,
-           Requires<[HasV4T]>;
+                                1, 0))>;
 
 // For the sequence
 //   zext( setugt ( Rs, Rt))
@@ -2366,10 +3238,9 @@ def : Pat <(i32 (zext (i1 (setlt (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
 //   if (Pd.new) Rd=#1
 //   if (!Pd.new) Rd=#0
 def : Pat <(i32 (zext (i1 (setugt (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
-           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rs),
+           (i32 (TFR_condset_ii (i1 (C2_cmpgtu (i32 IntRegs:$Rs),
                                               (i32 IntRegs:$Rt))),
-                                1, 0))>,
-           Requires<[HasV4T]>;
+                                1, 0))>;
 
 // This pattern interefers with coremark performance, not implementing at this
 // time.
@@ -2388,10 +3259,9 @@ def : Pat <(i32 (zext (i1 (setugt (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
 //   if (!Pd.new) Rd=#1
 // cmp.ltu(Rs, Rt) -> cmp.gtu(Rt, Rs)
 def : Pat <(i32 (zext (i1 (setuge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
-           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rt),
+           (i32 (TFR_condset_ii (i1 (C2_cmpgtu (i32 IntRegs:$Rt),
                                               (i32 IntRegs:$Rs))),
-                                0, 1))>,
-           Requires<[HasV4T]>;
+                                0, 1))>;
 
 // For the sequence
 //   zext( setge ( Rs, Rt))
@@ -2401,10 +3271,9 @@ def : Pat <(i32 (zext (i1 (setuge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
 //   if (!Pd.new) Rd=#1
 // cmp.lt(Rs, Rt) -> cmp.gt(Rt, Rs)
 def : Pat <(i32 (zext (i1 (setge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
-           (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rt),
+           (i32 (TFR_condset_ii (i1 (C2_cmpgt (i32 IntRegs:$Rt),
                                              (i32 IntRegs:$Rs))),
-                                0, 1))>,
-           Requires<[HasV4T]>;
+                                0, 1))>;
 
 // For the sequence
 //   zext( setule ( Rs, Rt))
@@ -2413,10 +3282,9 @@ def : Pat <(i32 (zext (i1 (setge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
 //   if (Pd.new) Rd=#0
 //   if (!Pd.new) Rd=#1
 def : Pat <(i32 (zext (i1 (setule (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
-           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rs),
+           (i32 (TFR_condset_ii (i1 (C2_cmpgtu (i32 IntRegs:$Rs),
                                               (i32 IntRegs:$Rt))),
-                                0, 1))>,
-           Requires<[HasV4T]>;
+                                0, 1))>;
 
 // For the sequence
 //   zext( setle ( Rs, Rt))
@@ -2425,16 +3293,15 @@ def : Pat <(i32 (zext (i1 (setule (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
 //   if (Pd.new) Rd=#0
 //   if (!Pd.new) Rd=#1
 def : Pat <(i32 (zext (i1 (setle (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
-           (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rs),
+           (i32 (TFR_condset_ii (i1 (C2_cmpgt (i32 IntRegs:$Rs),
                                              (i32 IntRegs:$Rt))),
-                                0, 1))>,
-           Requires<[HasV4T]>;
+                                0, 1))>;
 
 // For the sequence
 //   zext( setult ( and(Rs, 255), u8))
 // Use the isdigit transformation below
 
-// Generate code of the form 'mux_ii(cmpbgtu(Rdd, C-1),0,1)'
+// Generate code of the form 'C2_muxii(cmpbgtui(Rdd, C-1),0,1)'
 // for C code of the form r = ((c>='0') & (c<='9')) ? 1 : 0;.
 // The isdigit transformation relies on two 'clever' aspects:
 // 1) The data type is unsigned which allows us to eliminate a zero test after
@@ -2447,961 +3314,1044 @@ def : Pat <(i32 (zext (i1 (setle (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
 // The code is transformed upstream of llvm into
 //   retval = (c-48) < 10 ? 1 : 0;
 let AddedComplexity = 139 in
-def : Pat <(i32 (zext (i1 (setult (i32 (and (i32 IntRegs:$src1), 255)),
-                                  u7StrictPosImmPred:$src2)))),
-  (i32 (MUX_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$src1),
-                                 (DEC_CONST_BYTE u7StrictPosImmPred:$src2))),
-                   0, 1))>,
-                   Requires<[HasV4T]>;
-
-// Pd=cmpb.gtu(Rs,Rt)
-let isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPbGTU",
-InputType = "reg" in
-def CMPbGTUrr_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = cmpb.gtu($src1, $src2)",
-            [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255),
-                                             (and (i32 IntRegs:$src2), 255)))]>,
-            Requires<[HasV4T]>, ImmRegRel;
-
-// Following instruction is not being extended as it results into the incorrect
-// code for negative numbers.
-
-// Signed half compare(.eq) ri.
-// Pd=cmph.eq(Rs,#s8)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPhEQri_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, s8Imm:$src2),
-            "$dst = cmph.eq($src1, #$src2)",
-            [(set (i1 PredRegs:$dst), (seteq (and (i32 IntRegs:$src1), 65535),
-                                             s8ImmPred:$src2))]>,
-            Requires<[HasV4T]>;
-
-// Signed half compare(.eq) rr.
-// Case 1: xor + and, then compare:
-//   r0=xor(r0,r1)
-//   r0=and(r0,#0xffff)
-//   p0=cmp.eq(r0,#0)
-// Pd=cmph.eq(Rs,Rt)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPhEQrr_xor_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = cmph.eq($src1, $src2)",
-            [(set (i1 PredRegs:$dst), (seteq (and (xor (i32 IntRegs:$src1),
-                                                       (i32 IntRegs:$src2)),
-                                                  65535), 0))]>,
-            Requires<[HasV4T]>;
-
-// Signed half compare(.eq) rr.
-// Case 2: shift left 16 bits then compare:
-//   r0=asl(r0,16)
-//   r1=asl(r1,16)
-//   p0=cmp.eq(r0,r1)
-// Pd=cmph.eq(Rs,Rt)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPhEQrr_shl_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = cmph.eq($src1, $src2)",
-            [(set (i1 PredRegs:$dst),
-                  (seteq (shl (i32 IntRegs:$src1), (i32 16)),
-                         (shl (i32 IntRegs:$src2), (i32 16))))]>,
-            Requires<[HasV4T]>;
-
-/* Incorrect Pattern -- immediate should be right shifted before being
-used in the cmph.gt instruction.
-// Signed half compare(.gt) ri.
-// Pd=cmph.gt(Rs,#s8)
-
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8,
-isCompare = 1, validSubTargets = HasV4SubT in
-def CMPhGTri_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, s8Ext:$src2),
-            "$dst = cmph.gt($src1, #$src2)",
-            [(set (i1 PredRegs:$dst),
-                  (setgt (shl (i32 IntRegs:$src1), (i32 16)),
-                         s8ExtPred:$src2))]>,
-            Requires<[HasV4T]>;
-*/
-
-// Signed half compare(.gt) rr.
-// Pd=cmph.gt(Rs,Rt)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPhGTrr_shl_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = cmph.gt($src1, $src2)",
-            [(set (i1 PredRegs:$dst),
-                  (setgt (shl (i32 IntRegs:$src1), (i32 16)),
-                         (shl (i32 IntRegs:$src2), (i32 16))))]>,
-            Requires<[HasV4T]>;
-
-// Unsigned half compare rr (.gtu).
-// Pd=cmph.gtu(Rs,Rt)
-let isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPhGTU",
-InputType = "reg" in
-def CMPhGTUrr_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = cmph.gtu($src1, $src2)",
-            [(set (i1 PredRegs:$dst),
-                  (setugt (and (i32 IntRegs:$src1), 65535),
-                          (and (i32 IntRegs:$src2), 65535)))]>,
-            Requires<[HasV4T]>, ImmRegRel;
-
-// Unsigned half compare ri (.gtu).
-// Pd=cmph.gtu(Rs,#u7)
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 7,
-isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPhGTU",
-InputType = "imm" in
-def CMPhGTUri_V4 : MInst<(outs PredRegs:$dst),
-            (ins IntRegs:$src1, u7Ext:$src2),
-            "$dst = cmph.gtu($src1, #$src2)",
-            [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 65535),
-                                              u7ExtPred:$src2))]>,
-            Requires<[HasV4T]>, ImmRegRel;
-
-let validSubTargets = HasV4SubT in
-def NTSTBIT_rr : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-    "$dst = !tstbit($src1, $src2)",
-    [(set (i1 PredRegs:$dst),
-          (seteq (and (shl 1, (i32 IntRegs:$src2)), (i32 IntRegs:$src1)), 0))]>,
-    Requires<[HasV4T]>;
-
-let validSubTargets = HasV4SubT in
-def NTSTBIT_ri : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-    "$dst = !tstbit($src1, $src2)",
-    [(set (i1 PredRegs:$dst),
-          (seteq (and (shl 1, u5ImmPred:$src2), (i32 IntRegs:$src1)), 0))]>,
-    Requires<[HasV4T]>;
+def: Pat<(i32 (zext (i1 (setult (i32 (and (i32 IntRegs:$src1), 255)),
+                         u7StrictPosImmPred:$src2)))),
+         (C2_muxii (A4_cmpbgtui IntRegs:$src1,
+                    (DEC_CONST_BYTE u7StrictPosImmPred:$src2)),
+          0, 1)>;
 
 //===----------------------------------------------------------------------===//
 // XTYPE/PRED -
 //===----------------------------------------------------------------------===//
 
-//Deallocate frame and return.
-//    dealloc_return
-let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicable = 1,
-  Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1 in {
-let validSubTargets = HasV4SubT in
-  def DEALLOC_RET_V4 : LD0Inst<(outs), (ins),
-            "dealloc_return",
-            []>,
-            Requires<[HasV4T]>;
+//===----------------------------------------------------------------------===//
+// Multiclass for DeallocReturn
+//===----------------------------------------------------------------------===//
+class L4_RETURN<string mnemonic, bit isNot, bit isPredNew, bit isTak>
+  : LD0Inst<(outs), (ins PredRegs:$src),
+  !if(isNot, "if (!$src", "if ($src")#
+  !if(isPredNew, ".new) ", ") ")#mnemonic#
+  !if(isPredNew, #!if(isTak,":t", ":nt"),""),
+  [], "", LD_tc_3or4stall_SLOT0> {
+
+    bits<2> src;
+    let BaseOpcode = "L4_RETURN";
+    let isPredicatedFalse = isNot;
+    let isPredicatedNew = isPredNew;
+    let isTaken = isTak;
+    let IClass = 0b1001;
+
+    let Inst{27-16} = 0b011000011110;
+
+    let Inst{13} = isNot;
+    let Inst{12} = isTak;
+    let Inst{11} = isPredNew;
+    let Inst{10} = 0b0;
+    let Inst{9-8} = src;
+    let Inst{4-0} = 0b11110;
+  }
+
+// Produce all predicated forms, p, !p, p.new, !p.new, :t, :nt
+multiclass L4_RETURN_PRED<string mnemonic, bit PredNot> {
+  let isPredicated = 1 in {
+    def _#NAME# : L4_RETURN <mnemonic, PredNot, 0, 1>;
+    def _#NAME#new_pnt : L4_RETURN <mnemonic, PredNot, 1, 0>;
+    def _#NAME#new_pt : L4_RETURN <mnemonic, PredNot, 1, 1>;
+  }
 }
 
+multiclass LD_MISC_L4_RETURN<string mnemonic> {
+  let isBarrier = 1, isPredicable = 1 in
+    def NAME : LD0Inst <(outs), (ins), mnemonic, [], "",
+                        LD_tc_3or4stall_SLOT0> {
+      let BaseOpcode = "L4_RETURN";
+      let IClass = 0b1001;
+      let Inst{27-16} = 0b011000011110;
+      let Inst{13-10} = 0b0000;
+      let Inst{4-0} = 0b11110;
+    }
+  defm t : L4_RETURN_PRED<mnemonic, 0 >;
+  defm f : L4_RETURN_PRED<mnemonic, 1 >;
+}
+
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R30], hasSideEffects = 0 in
+defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel;
+
 // Restore registers and dealloc return function call.
 let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
-  Defs = [R29, R30, R31, PC] in {
-let validSubTargets = HasV4SubT in
+    Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in {
   def RESTORE_DEALLOC_RET_JMP_V4 : JInst<(outs),
                                    (ins calltarget:$dst),
              "jump $dst",
-             []>,
-             Requires<[HasV4T]>;
+             []>;
 }
 
 // Restore registers and dealloc frame before a tail call.
-let isCall = 1, isBarrier = 1,
-  Defs = [R29, R30, R31, PC] in {
-let validSubTargets = HasV4SubT in
+let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in {
   def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : JInst<(outs),
                                            (ins calltarget:$dst),
              "call $dst",
-             []>,
-             Requires<[HasV4T]>;
+             []>;
 }
 
 // Save registers function call.
-let isCall = 1, isBarrier = 1,
-  Uses = [R29, R31] in {
+let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
   def SAVE_REGISTERS_CALL_V4 : JInst<(outs),
                                (ins calltarget:$dst),
              "call $dst // Save_calle_saved_registers",
-             []>,
-             Requires<[HasV4T]>;
+             []>;
 }
 
-//    if (Ps) dealloc_return
-let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
-    isPredicated = 1 in {
-let validSubTargets = HasV4SubT in
-  def DEALLOC_RET_cPt_V4 : LD0Inst<(outs),
-                           (ins PredRegs:$src1),
-            "if ($src1) dealloc_return",
-            []>,
-            Requires<[HasV4T]>;
-}
-
-//    if (!Ps) dealloc_return
-let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
-    isPredicated = 1, isPredicatedFalse = 1 in {
-let validSubTargets = HasV4SubT in
-  def DEALLOC_RET_cNotPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
-            "if (!$src1) dealloc_return",
-            []>,
-            Requires<[HasV4T]>;
-}
-
-//    if (Ps.new) dealloc_return:nt
-let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
-    isPredicated = 1 in {
-let validSubTargets = HasV4SubT in
-  def DEALLOC_RET_cdnPnt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
-            "if ($src1.new) dealloc_return:nt",
-            []>,
-            Requires<[HasV4T]>;
-}
+//===----------------------------------------------------------------------===//
+// Template class for non predicated store instructions with
+// GP-Relative or absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicable = 1, isNVStorable = 1 in
+class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
+                    bits<2>MajOp, Operand AddrOp, bit isAbs, bit isHalf>
+  : STInst<(outs), (ins AddrOp:$addr, RC:$src),
+  mnemonic # !if(isAbs, "(##", "(#")#"$addr) = $src"#!if(isHalf, ".h",""),
+  [], "", V2LDST_tc_st_SLOT01> {
+    bits<19> addr;
+    bits<5> src;
+    bits<16> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
+                     !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
+                     !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
+                                      /* u16_0Imm */ addr{15-0})));
+    let IClass = 0b0100;
+    let Inst{27} = 1;
+    let Inst{26-25} = offsetBits{15-14};
+    let Inst{24}    = 0b0;
+    let Inst{23-22} = MajOp;
+    let Inst{21}    = isHalf;
+    let Inst{20-16} = offsetBits{13-9};
+    let Inst{13}    = offsetBits{8};
+    let Inst{12-8}  = src;
+    let Inst{7-0}   = offsetBits{7-0};
+  }
 
-//    if (!Ps.new) dealloc_return:nt
-let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
-    isPredicated = 1, isPredicatedFalse = 1 in {
-let validSubTargets = HasV4SubT in
-  def DEALLOC_RET_cNotdnPnt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
-            "if (!$src1.new) dealloc_return:nt",
-            []>,
-            Requires<[HasV4T]>;
-}
+//===----------------------------------------------------------------------===//
+// Template class for predicated store instructions with
+// GP-Relative or absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicated = 1, isNVStorable = 1, opExtentBits = 6,
+    opExtendable = 1 in
+class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp,
+                       bit isHalf, bit isNot, bit isNew>
+  : STInst<(outs), (ins PredRegs:$src1, u6Ext:$absaddr, RC: $src2),
+  !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
+  ") ")#mnemonic#"(#$absaddr) = $src2"#!if(isHalf, ".h",""),
+  [], "", ST_tc_st_SLOT01>, AddrModeRel {
+    bits<2> src1;
+    bits<6> absaddr;
+    bits<5> src2;
+
+    let isPredicatedNew = isNew;
+    let isPredicatedFalse = isNot;
+
+    let IClass = 0b1010;
 
-//    if (Ps.new) dealloc_return:t
-let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
-    isPredicated = 1 in {
-let validSubTargets = HasV4SubT in
-  def DEALLOC_RET_cdnPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
-            "if ($src1.new) dealloc_return:t",
-            []>,
-            Requires<[HasV4T]>;
-}
+    let Inst{27-24} = 0b1111;
+    let Inst{23-22} = MajOp;
+    let Inst{21}    = isHalf;
+    let Inst{17-16} = absaddr{5-4};
+    let Inst{13}    = isNew;
+    let Inst{12-8}  = src2;
+    let Inst{7}     = 0b1;
+    let Inst{6-3}   = absaddr{3-0};
+    let Inst{2}     = isNot;
+    let Inst{1-0}   = src1;
+  }
 
-// if (!Ps.new) dealloc_return:nt
-let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
-    isPredicated = 1, isPredicatedFalse = 1 in {
-let validSubTargets = HasV4SubT in
-  def DEALLOC_RET_cNotdnPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
-            "if (!$src1.new) dealloc_return:t",
-            []>,
-            Requires<[HasV4T]>;
+//===----------------------------------------------------------------------===//
+// Template class for predicated store instructions with absolute addressing.
+//===----------------------------------------------------------------------===//
+class T_StoreAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
+                 bits<2> MajOp, bit isHalf>
+  : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, u0AlwaysExt, 1, isHalf>,
+                  AddrModeRel {
+  string ImmOpStr = !cast<string>(ImmOp);
+  let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
+                     !if (!eq(ImmOpStr, "u16_2Imm"), 18,
+                     !if (!eq(ImmOpStr, "u16_1Imm"), 17,
+                                      /* u16_0Imm */ 16)));
+
+  let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
+                      !if (!eq(ImmOpStr, "u16_2Imm"), 2,
+                      !if (!eq(ImmOpStr, "u16_1Imm"), 1,
+                                       /* u16_0Imm */ 0)));
 }
 
-// Load/Store with absolute addressing mode
-// memw(#u6)=Rt
+//===----------------------------------------------------------------------===//
+// Multiclass for store instructions with absolute addressing.
+//===----------------------------------------------------------------------===//
+let addrMode = Absolute, isExtended = 1 in
+multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC,
+                  Operand ImmOp, bits<2> MajOp, bit isHalf = 0> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+    let opExtendable = 0, isPredicable = 1 in
+    def S2_#NAME#abs : T_StoreAbs <mnemonic, RC, ImmOp, MajOp, isHalf>;
 
-multiclass ST_Abs_Predbase<string mnemonic, RegisterClass RC, bit isNot,
-                           bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME#_V4 : STInst2<(outs),
-            (ins PredRegs:$src1, u0AlwaysExt:$absaddr, RC: $src2),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#mnemonic#"(##$absaddr) = $src2",
-            []>,
-            Requires<[HasV4T]>;
-}
+    // Predicated
+    def S4_p#NAME#t_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 0>;
+    def S4_p#NAME#f_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 0>;
 
-multiclass ST_Abs_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ST_Abs_Predbase<mnemonic, RC, PredNot, 0>;
-    // Predicate new
-    defm _cdn#NAME : ST_Abs_Predbase<mnemonic, RC, PredNot, 1>;
+    // .new Predicated
+    def S4_p#NAME#tnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 1>;
+    def S4_p#NAME#fnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 1>;
   }
 }
 
-let isNVStorable = 1, isExtended = 1, neverHasSideEffects = 1 in
-multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
-    let opExtendable = 0, isPredicable = 1 in
-    def NAME#_V4 : STInst2<(outs),
-            (ins u0AlwaysExt:$absaddr, RC:$src),
-            mnemonic#"(##$absaddr) = $src",
-            []>,
-            Requires<[HasV4T]>;
-
-    let opExtendable = 1, isPredicated = 1 in {
-      defm Pt : ST_Abs_Pred<mnemonic, RC, 0>;
-      defm NotPt : ST_Abs_Pred<mnemonic, RC, 1>;
-    }
+//===----------------------------------------------------------------------===//
+// Template class for non predicated new-value store instructions with
+// GP-Relative or absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicable = 1, mayStore = 1, isNVStore = 1,
+    isNewValue = 1, opNewValue = 1 in
+class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp, bit isAbs>
+  : NVInst_V4<(outs), (ins u0AlwaysExt:$addr, IntRegs:$src),
+  mnemonic # !if(isAbs, "(##", "(#")#"$addr) = $src.new",
+  [], "", V2LDST_tc_st_SLOT0> {
+    bits<19> addr;
+    bits<3> src;
+    bits<16> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
+                     !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
+                     !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
+                                      /* u16_0Imm */ addr{15-0})));
+    let IClass = 0b0100;
+
+    let Inst{27} = 1;
+    let Inst{26-25} = offsetBits{15-14};
+    let Inst{24-21} = 0b0101;
+    let Inst{20-16} = offsetBits{13-9};
+    let Inst{13}    = offsetBits{8};
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src;
+    let Inst{7-0}   = offsetBits{7-0};
   }
-}
 
-multiclass ST_Abs_Predbase_nv<string mnemonic, RegisterClass RC, bit isNot,
-                           bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME#_nv_V4 : NVInst_V4<(outs),
-            (ins PredRegs:$src1, u0AlwaysExt:$absaddr, RC: $src2),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#mnemonic#"(##$absaddr) = $src2.new",
-            []>,
-            Requires<[HasV4T]>;
+//===----------------------------------------------------------------------===//
+// Template class for predicated new-value store instructions with
+// absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicated = 1, mayStore = 1, isNVStore = 1,
+    isNewValue = 1, opNewValue = 2, opExtentBits = 6, opExtendable = 1 in
+class T_StoreAbs_NV_Pred <string mnemonic, bits<2> MajOp, bit isNot, bit isNew>
+  : NVInst_V4<(outs), (ins PredRegs:$src1, u6Ext:$absaddr, IntRegs:$src2),
+  !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
+  ") ")#mnemonic#"(#$absaddr) = $src2.new",
+  [], "", ST_tc_st_SLOT0>, AddrModeRel {
+    bits<2> src1;
+    bits<6> absaddr;
+    bits<3> src2;
+
+    let isPredicatedNew = isNew;
+    let isPredicatedFalse = isNot;
+
+    let IClass = 0b1010;
+
+    let Inst{27-24} = 0b1111;
+    let Inst{23-21} = 0b101;
+    let Inst{17-16} = absaddr{5-4};
+    let Inst{13}    = isNew;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src2;
+    let Inst{7}     = 0b1;
+    let Inst{6-3}   = absaddr{3-0};
+    let Inst{2}     = isNot;
+    let Inst{1-0}   = src1;
 }
 
-multiclass ST_Abs_Pred_nv<string mnemonic, RegisterClass RC, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : ST_Abs_Predbase_nv<mnemonic, RC, PredNot, 0>;
-    // Predicate new
-    defm _cdn#NAME : ST_Abs_Predbase_nv<mnemonic, RC, PredNot, 1>;
-  }
+//===----------------------------------------------------------------------===//
+// Template class for non-predicated new-value store instructions with
+// absolute addressing.
+//===----------------------------------------------------------------------===//
+class T_StoreAbs_NV <string mnemonic, Operand ImmOp, bits<2> MajOp>
+  : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp, 1>, AddrModeRel {
+
+  string ImmOpStr = !cast<string>(ImmOp);
+  let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
+                     !if (!eq(ImmOpStr, "u16_2Imm"), 18,
+                     !if (!eq(ImmOpStr, "u16_1Imm"), 17,
+                                      /* u16_0Imm */ 16)));
+
+  let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
+                      !if (!eq(ImmOpStr, "u16_2Imm"), 2,
+                      !if (!eq(ImmOpStr, "u16_1Imm"), 1,
+                                       /* u16_0Imm */ 0)));
 }
 
-let mayStore = 1, isNVStore = 1, isExtended = 1, neverHasSideEffects = 1 in
-multiclass ST_Abs_nv<string mnemonic, string CextOp, RegisterClass RC> {
+//===----------------------------------------------------------------------===//
+// Multiclass for new-value store instructions with absolute addressing.
+//===----------------------------------------------------------------------===//
+let addrMode = Absolute, isExtended = 1  in
+multiclass ST_Abs_NV <string mnemonic, string CextOp, Operand ImmOp,
+                   bits<2> MajOp> {
   let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
     let opExtendable = 0, isPredicable = 1 in
-    def NAME#_nv_V4 : NVInst_V4<(outs),
-            (ins u0AlwaysExt:$absaddr, RC:$src),
-            mnemonic#"(##$absaddr) = $src.new",
-            []>,
-            Requires<[HasV4T]>;
-
-    let opExtendable = 1, isPredicated = 1 in {
-      defm Pt : ST_Abs_Pred_nv<mnemonic, RC, 0>;
-      defm NotPt : ST_Abs_Pred_nv<mnemonic, RC, 1>;
-    }
-  }
-}
+    def S2_#NAME#newabs : T_StoreAbs_NV <mnemonic, ImmOp, MajOp>;
 
-let addrMode = Absolute in {
-  let accessSize = ByteAccess in
-    defm STrib_abs : ST_Abs<"memb", "STrib", IntRegs>,
-                     ST_Abs_nv<"memb", "STrib", IntRegs>, AddrModeRel;
+    // Predicated
+    def S4_p#NAME#newt_abs  : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 0>;
+    def S4_p#NAME#newf_abs  : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 0>;
 
-  let accessSize = HalfWordAccess in
-    defm STrih_abs : ST_Abs<"memh", "STrih", IntRegs>,
-                     ST_Abs_nv<"memh", "STrih", IntRegs>, AddrModeRel;
-
-  let accessSize = WordAccess in
-    defm STriw_abs : ST_Abs<"memw", "STriw", IntRegs>,
-                     ST_Abs_nv<"memw", "STriw", IntRegs>, AddrModeRel;
-
-  let accessSize = DoubleWordAccess, isNVStorable = 0 in
-    defm STrid_abs : ST_Abs<"memd", "STrid", DoubleRegs>, AddrModeRel;
+    // .new Predicated
+    def S4_p#NAME#newtnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 1>;
+    def S4_p#NAME#newfnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 1>;
+  }
 }
 
-let Predicates = [HasV4T], AddedComplexity = 30 in {
-def : Pat<(truncstorei8 (i32 IntRegs:$src1),
-                        (HexagonCONST32 tglobaladdr:$absaddr)),
-          (STrib_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>;
+//===----------------------------------------------------------------------===//
+// Stores with absolute addressing
+//===----------------------------------------------------------------------===//
+let accessSize = ByteAccess in
+defm storerb : ST_Abs    <"memb", "STrib", IntRegs, u16_0Imm, 0b00>,
+               ST_Abs_NV <"memb", "STrib", u16_0Imm, 0b00>;
 
-def : Pat<(truncstorei16 (i32 IntRegs:$src1),
-                          (HexagonCONST32 tglobaladdr:$absaddr)),
-          (STrih_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>;
+let accessSize = HalfWordAccess in
+defm storerh : ST_Abs    <"memh", "STrih", IntRegs, u16_1Imm, 0b01>,
+               ST_Abs_NV <"memh", "STrih", u16_1Imm, 0b01>;
 
-def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32 tglobaladdr:$absaddr)),
-          (STriw_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>;
+let accessSize = WordAccess in
+defm storeri : ST_Abs    <"memw", "STriw", IntRegs, u16_2Imm, 0b10>,
+               ST_Abs_NV <"memw", "STriw", u16_2Imm, 0b10>;
 
-def : Pat<(store (i64 DoubleRegs:$src1),
-                 (HexagonCONST32 tglobaladdr:$absaddr)),
-          (STrid_abs_V4 tglobaladdr: $absaddr, DoubleRegs: $src1)>;
-}
+let isNVStorable = 0, accessSize = DoubleWordAccess in
+defm storerd : ST_Abs <"memd", "STrid", DoubleRegs, u16_3Imm, 0b11>;
+
+let isNVStorable = 0, accessSize = HalfWordAccess in
+defm storerf : ST_Abs <"memh", "STrif", IntRegs, u16_1Imm, 0b01, 1>;
 
 //===----------------------------------------------------------------------===//
-// multiclass for store instructions with GP-relative addressing mode.
+// GP-relative stores.
 // mem[bhwd](#global)=Rt
-// if ([!]Pv[.new]) mem[bhwd](##global) = Rt
+// Once predicated, these instructions map to absolute addressing mode.
+// if ([!]Pv[.new]) mem[bhwd](##global)=Rt
 //===----------------------------------------------------------------------===//
-let mayStore = 1, isNVStorable = 1 in
-multiclass ST_GP<string mnemonic, string BaseOp, RegisterClass RC> {
-  let BaseOpcode = BaseOp, isPredicable = 1 in
-  def NAME#_V4 : STInst2<(outs),
-          (ins globaladdress:$global, RC:$src),
-          mnemonic#"(#$global) = $src",
-          []>;
 
-  // When GP-relative instructions are predicated, their addressing mode is
-  // changed to absolute and they are always constant extended.
-  let BaseOpcode = BaseOp, isExtended = 1, opExtendable = 1,
-  isPredicated = 1 in {
-    defm Pt : ST_Abs_Pred <mnemonic, RC, 0>;
-    defm NotPt : ST_Abs_Pred <mnemonic, RC, 1>;
+let isAsmParserOnly = 1 in
+class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC,
+                 Operand ImmOp, bits<2> MajOp, bit isHalf = 0>
+  : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, globaladdress, 0, isHalf> {
+    // Set BaseOpcode same as absolute addressing instructions so that
+    // non-predicated GP-Rel instructions can have relate with predicated
+    // Absolute instruction.
+    let BaseOpcode = BaseOp#_abs;
+  }
+
+let isAsmParserOnly = 1 in
+multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp,
+                  bits<2> MajOp, bit isHalf = 0> {
+  // Set BaseOpcode same as absolute addressing instructions so that
+  // non-predicated GP-Rel instructions can have relate with predicated
+  // Absolute instruction.
+  let BaseOpcode = BaseOp#_abs in {
+    def NAME#gp : T_StoreAbsGP <mnemonic, IntRegs, ImmOp, MajOp,
+                                globaladdress, 0, isHalf>;
+    // New-value store
+    def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp, 0> ;
   }
 }
 
-let mayStore = 1, isNVStore = 1 in
-multiclass ST_GP_nv<string mnemonic, string BaseOp, RegisterClass RC> {
-  let BaseOpcode = BaseOp, isPredicable = 1 in
-  def NAME#_nv_V4 : NVInst_V4<(outs),
-          (ins u0AlwaysExt:$global, RC:$src),
-          mnemonic#"(#$global) = $src.new",
-          []>,
-          Requires<[HasV4T]>;
-
-  // When GP-relative instructions are predicated, their addressing mode is
-  // changed to absolute and they are always constant extended.
-  let BaseOpcode = BaseOp, isExtended = 1, opExtendable = 1,
-  isPredicated = 1 in {
-    defm Pt : ST_Abs_Pred_nv<mnemonic, RC, 0>;
-    defm NotPt : ST_Abs_Pred_nv<mnemonic, RC, 1>;
-  }
-}
-
-let validSubTargets = HasV4SubT, neverHasSideEffects = 1 in {
-  let isNVStorable = 0 in
-  defm STd_GP : ST_GP <"memd", "STd_GP", DoubleRegs>, PredNewRel;
-
-  defm STb_GP : ST_GP<"memb",  "STb_GP", IntRegs>,
-                ST_GP_nv<"memb", "STb_GP", IntRegs>, NewValueRel;
-  defm STh_GP : ST_GP<"memh",  "STh_GP", IntRegs>,
-                ST_GP_nv<"memh", "STh_GP", IntRegs>, NewValueRel;
-  defm STw_GP : ST_GP<"memw",  "STw_GP", IntRegs>,
-                ST_GP_nv<"memw", "STw_GP", IntRegs>, NewValueRel;
-}
-
-// 64 bit atomic store
-def : Pat <(atomic_store_64 (HexagonCONST32_GP tglobaladdr:$global),
-                            (i64 DoubleRegs:$src1)),
-           (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>,
-           Requires<[HasV4T]>;
-
-// Map from store(globaladdress) -> memd(#foo)
-let AddedComplexity = 100 in
-def : Pat <(store (i64 DoubleRegs:$src1),
-                  (HexagonCONST32_GP tglobaladdr:$global)),
-           (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>;
+let accessSize = ByteAccess in
+defm S2_storerb : ST_GP<"memb", "STrib", u16_0Imm, 0b00>, NewValueRel;
 
-// 8 bit atomic store
-def : Pat < (atomic_store_8 (HexagonCONST32_GP tglobaladdr:$global),
-                            (i32 IntRegs:$src1)),
-            (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+let accessSize = HalfWordAccess in
+defm S2_storerh : ST_GP<"memh", "STrih", u16_1Imm, 0b01>, NewValueRel;
 
-// Map from store(globaladdress) -> memb(#foo)
-let AddedComplexity = 100 in
-def : Pat<(truncstorei8 (i32 IntRegs:$src1),
-          (HexagonCONST32_GP tglobaladdr:$global)),
-          (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+let accessSize = WordAccess in
+defm S2_storeri : ST_GP<"memw", "STriw", u16_2Imm, 0b10>, NewValueRel;
 
-// Map from "i1 = constant<-1>; memw(CONST32(#foo)) = i1"
-//       to "r0 = 1; memw(#foo) = r0"
-let AddedComplexity = 100 in
-def : Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)),
-          (STb_GP_V4 tglobaladdr:$global, (TFRI 1))>;
+let isNVStorable = 0, accessSize = DoubleWordAccess in
+def S2_storerdgp : T_StoreGP <"memd", "STrid", DoubleRegs,
+                              u16_3Imm, 0b11>, PredNewRel;
 
-def : Pat<(atomic_store_16 (HexagonCONST32_GP tglobaladdr:$global),
-                           (i32 IntRegs:$src1)),
-          (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+let isNVStorable = 0, accessSize = HalfWordAccess in
+def S2_storerfgp : T_StoreGP <"memh", "STrif", IntRegs,
+                              u16_1Imm, 0b01, 1>, PredNewRel;
 
-// Map from store(globaladdress) -> memh(#foo)
-let AddedComplexity = 100 in
-def : Pat<(truncstorei16 (i32 IntRegs:$src1),
-                         (HexagonCONST32_GP tglobaladdr:$global)),
-          (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+class Loada_pat<PatFrag Load, ValueType VT, PatFrag Addr, InstHexagon MI>
+  : Pat<(VT (Load Addr:$addr)), (MI Addr:$addr)>;
 
-// 32 bit atomic store
-def : Pat<(atomic_store_32 (HexagonCONST32_GP tglobaladdr:$global),
-                           (i32 IntRegs:$src1)),
-          (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+class Loadam_pat<PatFrag Load, ValueType VT, PatFrag Addr, PatFrag ValueMod,
+                 InstHexagon MI>
+  : Pat<(VT (Load Addr:$addr)), (ValueMod (MI Addr:$addr))>;
 
-// Map from store(globaladdress) -> memw(#foo)
-let AddedComplexity = 100 in
-def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)),
-          (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+class Storea_pat<PatFrag Store, PatFrag Value, PatFrag Addr, InstHexagon MI>
+  : Pat<(Store Value:$val, Addr:$addr), (MI Addr:$addr, Value:$val)>;
 
-//===----------------------------------------------------------------------===//
-// Multiclass for the load instructions with absolute addressing mode.
-//===----------------------------------------------------------------------===//
-multiclass LD_Abs_Predbase<string mnemonic, RegisterClass RC, bit isNot,
-                           bit isPredNew> {
-  let isPredicatedNew = isPredNew in
-  def NAME : LDInst2<(outs RC:$dst),
-            (ins PredRegs:$src1, u0AlwaysExt:$absaddr),
-            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-            ") ")#"$dst = "#mnemonic#"(##$absaddr)",
-            []>,
-            Requires<[HasV4T]>;
-}
+class Stoream_pat<PatFrag Store, PatFrag Value, PatFrag Addr, PatFrag ValueMod,
+                  InstHexagon MI>
+  : Pat<(Store Value:$val, Addr:$addr),
+        (MI Addr:$addr, (ValueMod Value:$val))>;
 
-multiclass LD_Abs_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    defm _c#NAME : LD_Abs_Predbase<mnemonic, RC, PredNot, 0>;
-    // Predicate new
-    defm _cdn#NAME : LD_Abs_Predbase<mnemonic, RC, PredNot, 1>;
-  }
+def: Storea_pat<SwapSt<atomic_store_8>,  I32, addrgp, S2_storerbgp>;
+def: Storea_pat<SwapSt<atomic_store_16>, I32, addrgp, S2_storerhgp>;
+def: Storea_pat<SwapSt<atomic_store_32>, I32, addrgp, S2_storerigp>;
+def: Storea_pat<SwapSt<atomic_store_64>, I64, addrgp, S2_storerdgp>;
+
+let AddedComplexity = 100 in {
+  def: Storea_pat<truncstorei8,  I32, addrgp, S2_storerbgp>;
+  def: Storea_pat<truncstorei16, I32, addrgp, S2_storerhgp>;
+  def: Storea_pat<store,         I32, addrgp, S2_storerigp>;
+  def: Storea_pat<store,         I64, addrgp, S2_storerdgp>;
+
+  // Map from "i1 = constant<-1>; memw(CONST32(#foo)) = i1"
+  //       to "r0 = 1; memw(#foo) = r0"
+  let AddedComplexity = 100 in
+  def: Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)),
+           (S2_storerbgp tglobaladdr:$global, (A2_tfrsi 1))>;
 }
 
-let isExtended = 1, neverHasSideEffects = 1 in
-multiclass LD_Abs<string mnemonic, string CextOp, RegisterClass RC> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
-    let  opExtendable = 1, isPredicable = 1 in
-    def NAME#_V4 : LDInst2<(outs RC:$dst),
-            (ins u0AlwaysExt:$absaddr),
-            "$dst = "#mnemonic#"(##$absaddr)",
-            []>,
-            Requires<[HasV4T]>;
-
-    let opExtendable = 2, isPredicated = 1 in {
-      defm Pt_V4 : LD_Abs_Pred<mnemonic, RC, 0>;
-      defm NotPt_V4 : LD_Abs_Pred<mnemonic, RC, 1>;
-    }
+//===----------------------------------------------------------------------===//
+// Template class for non predicated load instructions with
+// absolute addressing mode.
+//===----------------------------------------------------------------------===//
+let isPredicable = 1, hasSideEffects = 0 in
+class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
+                   bits<3> MajOp, Operand AddrOp, bit isAbs>
+  : LDInst <(outs RC:$dst), (ins AddrOp:$addr),
+  "$dst = "#mnemonic# !if(isAbs, "(##", "(#")#"$addr)",
+  [], "", V2LDST_tc_ld_SLOT01> {
+    bits<5> dst;
+    bits<19> addr;
+    bits<16> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
+                     !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
+                     !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
+                                      /* u16_0Imm */ addr{15-0})));
+
+    let IClass = 0b0100;
+
+    let Inst{27}    = 0b1;
+    let Inst{26-25} = offsetBits{15-14};
+    let Inst{24}    = 0b1;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = offsetBits{13-9};
+    let Inst{13-5}  = offsetBits{8-0};
+    let Inst{4-0}   = dst;
   }
-}
 
-let addrMode = Absolute in {
-  let accessSize = ByteAccess in {
-    defm LDrib_abs  : LD_Abs<"memb", "LDrib", IntRegs>, AddrModeRel;
-    defm LDriub_abs : LD_Abs<"memub", "LDriub", IntRegs>, AddrModeRel;
+class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
+                 bits<3> MajOp>
+  : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, u0AlwaysExt, 1>, AddrModeRel {
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
+                       !if (!eq(ImmOpStr, "u16_2Imm"), 18,
+                       !if (!eq(ImmOpStr, "u16_1Imm"), 17,
+                                        /* u16_0Imm */ 16)));
+
+    let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
+                        !if (!eq(ImmOpStr, "u16_2Imm"), 2,
+                        !if (!eq(ImmOpStr, "u16_1Imm"), 1,
+                                        /* u16_0Imm */ 0)));
   }
-  let accessSize = HalfWordAccess in {
-    defm LDrih_abs  : LD_Abs<"memh", "LDrih", IntRegs>, AddrModeRel;
-    defm LDriuh_abs : LD_Abs<"memuh", "LDriuh", IntRegs>, AddrModeRel;
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated load instructions with
+// absolute addressing mode.
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, opExtentBits = 6, opExtendable = 2 in
+class T_LoadAbs_Pred <string mnemonic, RegisterClass RC, bits<3> MajOp,
+                      bit isPredNot, bit isPredNew>
+  : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u6Ext:$absaddr),
+  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#"$dst = "#mnemonic#"(#$absaddr)">, AddrModeRel {
+    bits<5> dst;
+    bits<2> src1;
+    bits<6> absaddr;
+
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isPredNot;
+    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+
+    let IClass = 0b1001;
+
+    let Inst{27-24} = 0b1111;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = absaddr{5-1};
+    let Inst{13} = 0b1;
+    let Inst{12} = isPredNew;
+    let Inst{11} = isPredNot;
+    let Inst{10-9} = src1;
+    let Inst{8} = absaddr{0};
+    let Inst{7} = 0b1;
+    let Inst{4-0} = dst;
   }
-  let accessSize = WordAccess in
-    defm LDriw_abs  : LD_Abs<"memw", "LDriw", IntRegs>, AddrModeRel;
 
-  let accessSize = DoubleWordAccess in
-    defm LDrid_abs : LD_Abs<"memd",  "LDrid", DoubleRegs>, AddrModeRel;
+//===----------------------------------------------------------------------===//
+// Multiclass for the load instructions with absolute addressing mode.
+//===----------------------------------------------------------------------===//
+multiclass LD_Abs_Pred<string mnemonic, RegisterClass RC, bits<3> MajOp,
+                       bit PredNot> {
+  def _abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 0>;
+  // Predicate new
+  def new_abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 1>;
 }
 
-let Predicates = [HasV4T], AddedComplexity  = 30 in {
-def : Pat<(i32 (load (HexagonCONST32 tglobaladdr:$absaddr))),
-          (LDriw_abs_V4 tglobaladdr: $absaddr)>;
-
-def : Pat<(i32 (sextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))),
-          (LDrib_abs_V4 tglobaladdr:$absaddr)>;
+let addrMode = Absolute, isExtended = 1 in
+multiclass LD_Abs<string mnemonic, string CextOp, RegisterClass RC,
+                  Operand ImmOp, bits<3> MajOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+    let opExtendable = 1, isPredicable = 1 in
+    def L4_#NAME#_abs: T_LoadAbs <mnemonic, RC, ImmOp, MajOp>;
 
-def : Pat<(i32 (zextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))),
-          (LDriub_abs_V4 tglobaladdr:$absaddr)>;
+    // Predicated
+    defm L4_p#NAME#t : LD_Abs_Pred<mnemonic, RC, MajOp, 0>;
+    defm L4_p#NAME#f : LD_Abs_Pred<mnemonic, RC, MajOp, 1>;
+  }
+}
 
-def : Pat<(i32 (sextloadi16 (HexagonCONST32 tglobaladdr:$absaddr))),
-          (LDrih_abs_V4 tglobaladdr:$absaddr)>;
+let accessSize = ByteAccess, hasNewValue = 1 in {
+  defm loadrb  : LD_Abs<"memb",  "LDrib",  IntRegs, u16_0Imm, 0b000>;
+  defm loadrub : LD_Abs<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
+}
 
-def : Pat<(i32 (zextloadi16 (HexagonCONST32 tglobaladdr:$absaddr))),
-          (LDriuh_abs_V4 tglobaladdr:$absaddr)>;
+let accessSize = HalfWordAccess, hasNewValue = 1 in {
+  defm loadrh  : LD_Abs<"memh",  "LDrih",  IntRegs, u16_1Imm, 0b010>;
+  defm loadruh : LD_Abs<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
 }
 
+let accessSize = WordAccess, hasNewValue = 1 in
+defm loadri  : LD_Abs<"memw",  "LDriw",  IntRegs, u16_2Imm, 0b100>;
+
+let accessSize = DoubleWordAccess in
+defm loadrd  : LD_Abs<"memd",  "LDrid", DoubleRegs, u16_3Imm, 0b110>;
+
 //===----------------------------------------------------------------------===//
 // multiclass for load instructions with GP-relative addressing mode.
 // Rx=mem[bhwd](##global)
+// Once predicated, these instructions map to absolute addressing mode.
 // if ([!]Pv[.new]) Rx=mem[bhwd](##global)
 //===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1, validSubTargets = HasV4SubT in
-multiclass LD_GP<string mnemonic, string BaseOp, RegisterClass RC> {
-  let BaseOpcode = BaseOp in {
-    let isPredicable = 1 in
-    def NAME#_V4 : LDInst2<(outs RC:$dst),
-            (ins globaladdress:$global),
-            "$dst = "#mnemonic#"(#$global)",
-            []>;
-
-    let isExtended = 1, opExtendable = 2, isPredicated = 1 in {
-      defm Pt_V4 : LD_Abs_Pred<mnemonic, RC, 0>;
-      defm NotPt_V4 : LD_Abs_Pred<mnemonic, RC, 1>;
-    }
-  }
-}
 
-defm LDd_GP  : LD_GP<"memd",  "LDd_GP",  DoubleRegs>, PredNewRel;
-defm LDb_GP  : LD_GP<"memb",  "LDb_GP",  IntRegs>, PredNewRel;
-defm LDub_GP : LD_GP<"memub", "LDub_GP", IntRegs>, PredNewRel;
-defm LDh_GP  : LD_GP<"memh",  "LDh_GP",  IntRegs>, PredNewRel;
-defm LDuh_GP : LD_GP<"memuh", "LDuh_GP", IntRegs>, PredNewRel;
-defm LDw_GP  : LD_GP<"memw",  "LDw_GP",  IntRegs>, PredNewRel;
+let isAsmParserOnly = 1 in
+class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp,
+                bits<3> MajOp>
+  : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, globaladdress, 0>, PredNewRel {
+    let BaseOpcode = BaseOp#_abs;
+  }
 
-def : Pat <(atomic_load_64 (HexagonCONST32_GP tglobaladdr:$global)),
-           (i64 (LDd_GP_V4 tglobaladdr:$global))>;
+let accessSize = ByteAccess, hasNewValue = 1 in {
+  def L2_loadrbgp  : T_LoadGP<"memb",  "LDrib",  IntRegs, u16_0Imm, 0b000>;
+  def L2_loadrubgp : T_LoadGP<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
+}
 
-def : Pat <(atomic_load_32 (HexagonCONST32_GP tglobaladdr:$global)),
-           (i32 (LDw_GP_V4 tglobaladdr:$global))>;
+let accessSize = HalfWordAccess, hasNewValue = 1 in {
+  def L2_loadrhgp  : T_LoadGP<"memh",  "LDrih",  IntRegs, u16_1Imm, 0b010>;
+  def L2_loadruhgp : T_LoadGP<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
+}
 
-def : Pat <(atomic_load_16 (HexagonCONST32_GP tglobaladdr:$global)),
-           (i32 (LDuh_GP_V4 tglobaladdr:$global))>;
+let accessSize = WordAccess, hasNewValue = 1 in
+def L2_loadrigp  : T_LoadGP<"memw",  "LDriw",  IntRegs, u16_2Imm, 0b100>;
 
-def : Pat <(atomic_load_8 (HexagonCONST32_GP tglobaladdr:$global)),
-           (i32 (LDub_GP_V4 tglobaladdr:$global))>;
+let accessSize = DoubleWordAccess in
+def L2_loadrdgp  : T_LoadGP<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>;
 
-// Map from load(globaladdress) -> memw(#foo + 0)
-let AddedComplexity = 100 in
-def : Pat <(i64 (load (HexagonCONST32_GP tglobaladdr:$global))),
-           (i64 (LDd_GP_V4 tglobaladdr:$global))>;
+def: Loada_pat<atomic_load_8,  i32, addrgp, L2_loadrubgp>;
+def: Loada_pat<atomic_load_16, i32, addrgp, L2_loadruhgp>;
+def: Loada_pat<atomic_load_32, i32, addrgp, L2_loadrigp>;
+def: Loada_pat<atomic_load_64, i64, addrgp, L2_loadrdgp>;
 
 // Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd
-let AddedComplexity = 100 in
-def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))),
-           (i1 (TFR_PdRs (i32 (LDb_GP_V4 tglobaladdr:$global))))>;
+def: Loadam_pat<load, i1, addrga, I32toI1, L4_loadrub_abs>;
+def: Loadam_pat<load, i1, addrgp, I32toI1, L2_loadrubgp>;
+
+def: Stoream_pat<store, I1, addrga, I1toI32, S2_storerbabs>;
+def: Stoream_pat<store, I1, addrgp, I1toI32, S2_storerbgp>;
+
+// Map from load(globaladdress) -> mem[u][bhwd](#foo)
+class LoadGP_pats <PatFrag ldOp, InstHexagon MI, ValueType VT = i32>
+  : Pat <(VT (ldOp (HexagonCONST32_GP tglobaladdr:$global))),
+         (VT (MI tglobaladdr:$global))>;
+
+let AddedComplexity = 100 in {
+  def: LoadGP_pats <extloadi8, L2_loadrbgp>;
+  def: LoadGP_pats <sextloadi8, L2_loadrbgp>;
+  def: LoadGP_pats <zextloadi8, L2_loadrubgp>;
+  def: LoadGP_pats <extloadi16, L2_loadrhgp>;
+  def: LoadGP_pats <sextloadi16, L2_loadrhgp>;
+  def: LoadGP_pats <zextloadi16, L2_loadruhgp>;
+  def: LoadGP_pats <load, L2_loadrigp>;
+  def: LoadGP_pats <load, L2_loadrdgp, i64>;
+}
 
 // When the Interprocedural Global Variable optimizer realizes that a certain
 // global variable takes only two constant values, it shrinks the global to
 // a boolean. Catch those loads here in the following 3 patterns.
-let AddedComplexity = 100 in
-def : Pat <(i32 (extloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
-           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+let AddedComplexity = 100 in {
+  def: LoadGP_pats <extloadi1, L2_loadrubgp>;
+  def: LoadGP_pats <zextloadi1, L2_loadrubgp>;
+}
 
-let AddedComplexity = 100 in
-def : Pat <(i32 (sextloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
-           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+// Transfer global address into a register
+def: Pat<(HexagonCONST32 tglobaladdr:$Rs),      (A2_tfrsi s16Ext:$Rs)>;
+def: Pat<(HexagonCONST32_GP tblockaddress:$Rs), (A2_tfrsi s16Ext:$Rs)>;
+def: Pat<(HexagonCONST32_GP tglobaladdr:$Rs),   (A2_tfrsi s16Ext:$Rs)>;
 
-// Map from load(globaladdress) -> memb(#foo)
-let AddedComplexity = 100 in
-def : Pat <(i32 (extloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
-           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+def: Pat<(i64 (ctlz I64:$src1)), (Zext64 (S2_cl0p I64:$src1))>;
+def: Pat<(i64 (cttz I64:$src1)), (Zext64 (S2_ct0p I64:$src1))>;
 
-// Map from load(globaladdress) -> memb(#foo)
-let AddedComplexity = 100 in
-def : Pat <(i32 (sextloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
-           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+let AddedComplexity  = 30 in {
+  def: Storea_pat<truncstorei8,  I32, u0AlwaysExtPred, S2_storerbabs>;
+  def: Storea_pat<truncstorei16, I32, u0AlwaysExtPred, S2_storerhabs>;
+  def: Storea_pat<store,         I32, u0AlwaysExtPred, S2_storeriabs>;
+}
 
-let AddedComplexity = 100 in
-def : Pat <(i32 (zextloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
-           (i32 (LDub_GP_V4 tglobaladdr:$global))>;
+let AddedComplexity  = 30 in {
+  def: Loada_pat<load,        i32, u0AlwaysExtPred, L4_loadri_abs>;
+  def: Loada_pat<sextloadi8,  i32, u0AlwaysExtPred, L4_loadrb_abs>;
+  def: Loada_pat<zextloadi8,  i32, u0AlwaysExtPred, L4_loadrub_abs>;
+  def: Loada_pat<sextloadi16, i32, u0AlwaysExtPred, L4_loadrh_abs>;
+  def: Loada_pat<zextloadi16, i32, u0AlwaysExtPred, L4_loadruh_abs>;
+}
 
-// Map from load(globaladdress) -> memub(#foo)
+// Indexed store word - global address.
+// memw(Rs+#u6:2)=#S8
 let AddedComplexity = 100 in
-def : Pat <(i32 (zextloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
-           (i32 (LDub_GP_V4 tglobaladdr:$global))>;
+def: Storex_add_pat<store, addrga, u6_2ImmPred, S4_storeiri_io>;
 
-// Map from load(globaladdress) -> memh(#foo)
-let AddedComplexity = 100 in
-def : Pat <(i32 (extloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
-           (i32 (LDh_GP_V4 tglobaladdr:$global))>;
+// Load from a global address that has only one use in the current basic block.
+let AddedComplexity = 100 in {
+  def: Loada_pat<extloadi8,   i32, addrga, L4_loadrub_abs>;
+  def: Loada_pat<sextloadi8,  i32, addrga, L4_loadrb_abs>;
+  def: Loada_pat<zextloadi8,  i32, addrga, L4_loadrub_abs>;
 
-// Map from load(globaladdress) -> memh(#foo)
-let AddedComplexity = 100 in
-def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
-           (i32 (LDh_GP_V4 tglobaladdr:$global))>;
+  def: Loada_pat<extloadi16,  i32, addrga, L4_loadruh_abs>;
+  def: Loada_pat<sextloadi16, i32, addrga, L4_loadrh_abs>;
+  def: Loada_pat<zextloadi16, i32, addrga, L4_loadruh_abs>;
 
-// Map from load(globaladdress) -> memuh(#foo)
-let AddedComplexity = 100 in
-def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
-           (i32 (LDuh_GP_V4 tglobaladdr:$global))>;
+  def: Loada_pat<load,        i32, addrga, L4_loadri_abs>;
+  def: Loada_pat<load,        i64, addrga, L4_loadrd_abs>;
+}
 
-// Map from load(globaladdress) -> memw(#foo)
-let AddedComplexity = 100 in
-def : Pat <(i32 (load (HexagonCONST32_GP tglobaladdr:$global))),
-           (i32 (LDw_GP_V4 tglobaladdr:$global))>;
+// Store to a global address that has only one use in the current basic block.
+let AddedComplexity = 100 in {
+  def: Storea_pat<truncstorei8,  I32, addrga, S2_storerbabs>;
+  def: Storea_pat<truncstorei16, I32, addrga, S2_storerhabs>;
+  def: Storea_pat<store,         I32, addrga, S2_storeriabs>;
+  def: Storea_pat<store,         I64, addrga, S2_storerdabs>;
 
+  def: Stoream_pat<truncstorei32, I64, addrga, LoReg, S2_storeriabs>;
+}
+
+// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd
+let AddedComplexity = 100 in
+def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))),
+           (i1 (C2_tfrrp (i32 (L2_loadrbgp tglobaladdr:$global))))>;
 
 // Transfer global address into a register
 let isExtended = 1, opExtendable = 1, AddedComplexity=50, isMoveImm = 1,
-isAsCheapAsAMove = 1, isReMaterializable = 1, validSubTargets = HasV4SubT in
+isAsCheapAsAMove = 1, isReMaterializable = 1, isCodeGenOnly = 1 in
 def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1),
            "$dst = #$src1",
-           [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>,
-           Requires<[HasV4T]>;
+           [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>;
 
 // Transfer a block address into a register
 def : Pat<(HexagonCONST32_GP tblockaddress:$src1),
-          (TFRI_V4 tblockaddress:$src1)>,
-          Requires<[HasV4T]>;
-
-let isExtended = 1, opExtendable = 2, AddedComplexity=50,
-neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
-def TFRI_cPt_V4 : ALU32_ri<(outs IntRegs:$dst),
-                           (ins PredRegs:$src1, s16Ext:$src2),
-           "if($src1) $dst = #$src2",
-           []>,
-           Requires<[HasV4T]>;
-
-let isExtended = 1, opExtendable = 2, AddedComplexity=50, isPredicatedFalse = 1,
-neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
-def TFRI_cNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
-                              (ins PredRegs:$src1, s16Ext:$src2),
-           "if(!$src1) $dst = #$src2",
-           []>,
-           Requires<[HasV4T]>;
-
-let isExtended = 1, opExtendable = 2, AddedComplexity=50,
-neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
-def TFRI_cdnPt_V4 : ALU32_ri<(outs IntRegs:$dst),
-                             (ins PredRegs:$src1, s16Ext:$src2),
-           "if($src1.new) $dst = #$src2",
-           []>,
-           Requires<[HasV4T]>;
-
-let isExtended = 1, opExtendable = 2, AddedComplexity=50, isPredicatedFalse = 1,
-neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
-def TFRI_cdnNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
-                                (ins PredRegs:$src1, s16Ext:$src2),
-           "if(!$src1.new) $dst = #$src2",
-           []>,
-           Requires<[HasV4T]>;
-
-let AddedComplexity = 50, Predicates = [HasV4T] in
-def : Pat<(HexagonCONST32_GP tglobaladdr:$src1),
-           (TFRI_V4 tglobaladdr:$src1)>,
-           Requires<[HasV4T]>;
-
-
-// Load - Indirect with long offset: These instructions take global address
-// as an operand
-let isExtended = 1, opExtendable = 3, AddedComplexity = 40,
-validSubTargets = HasV4SubT in
-def LDrid_ind_lo_V4 : LDInst<(outs DoubleRegs:$dst),
-            (ins IntRegs:$src1, u2Imm:$src2, globaladdressExt:$offset),
-            "$dst=memd($src1<<#$src2+##$offset)",
-            [(set (i64 DoubleRegs:$dst),
-                  (load (add (shl IntRegs:$src1, u2ImmPred:$src2),
-                        (HexagonCONST32 tglobaladdr:$offset))))]>,
-            Requires<[HasV4T]>;
+          (TFRI_V4 tblockaddress:$src1)>;
 
-let AddedComplexity = 40 in
-multiclass LD_indirect_lo<string OpcStr, PatFrag OpNode> {
-let isExtended = 1, opExtendable = 3, validSubTargets = HasV4SubT in
-  def _lo_V4 : LDInst<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, u2Imm:$src2, globaladdressExt:$offset),
-            !strconcat("$dst = ",
-            !strconcat(OpcStr, "($src1<<#$src2+##$offset)")),
-            [(set IntRegs:$dst,
-                  (i32 (OpNode (add (shl IntRegs:$src1, u2ImmPred:$src2),
-                          (HexagonCONST32 tglobaladdr:$offset)))))]>,
-            Requires<[HasV4T]>;
-}
-
-defm LDrib_ind : LD_indirect_lo<"memb", sextloadi8>;
-defm LDriub_ind : LD_indirect_lo<"memub", zextloadi8>;
-defm LDriub_ind_anyext : LD_indirect_lo<"memub", extloadi8>;
-defm LDrih_ind : LD_indirect_lo<"memh", sextloadi16>;
-defm LDriuh_ind : LD_indirect_lo<"memuh", zextloadi16>;
-defm LDriuh_ind_anyext : LD_indirect_lo<"memuh", extloadi16>;
-defm LDriw_ind : LD_indirect_lo<"memw", load>;
-
-let AddedComplexity = 40 in
-def : Pat <(i32 (sextloadi8 (add IntRegs:$src1,
-                                 (NumUsesBelowThresCONST32 tglobaladdr:$offset)))),
-           (i32 (LDrib_ind_lo_V4 IntRegs:$src1, 0, tglobaladdr:$offset))>,
-           Requires<[HasV4T]>;
-
-let AddedComplexity = 40 in
-def : Pat <(i32 (zextloadi8 (add IntRegs:$src1,
-                                 (NumUsesBelowThresCONST32 tglobaladdr:$offset)))),
-           (i32 (LDriub_ind_lo_V4 IntRegs:$src1, 0, tglobaladdr:$offset))>,
-           Requires<[HasV4T]>;
+let AddedComplexity = 50 in
+def : Pat<(HexagonCONST32_GP tglobaladdr:$src1),
+           (TFRI_V4 tglobaladdr:$src1)>;
 
-let Predicates = [HasV4T], AddedComplexity  = 30 in {
-def : Pat<(truncstorei8 (i32 IntRegs:$src1), u0AlwaysExtPred:$src2),
-          (STrib_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>;
+// i8/i16/i32 -> i64 loads
+// We need a complexity of 120 here to override preceding handling of
+// zextload.
+let AddedComplexity = 120 in {
+  def: Loadam_pat<extloadi8,   i64, addrga, Zext64, L4_loadrub_abs>;
+  def: Loadam_pat<sextloadi8,  i64, addrga, Sext64, L4_loadrb_abs>;
+  def: Loadam_pat<zextloadi8,  i64, addrga, Zext64, L4_loadrub_abs>;
 
-def : Pat<(truncstorei16 (i32 IntRegs:$src1), u0AlwaysExtPred:$src2),
-          (STrih_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>;
+  def: Loadam_pat<extloadi16,  i64, addrga, Zext64, L4_loadruh_abs>;
+  def: Loadam_pat<sextloadi16, i64, addrga, Sext64, L4_loadrh_abs>;
+  def: Loadam_pat<zextloadi16, i64, addrga, Zext64, L4_loadruh_abs>;
 
-def : Pat<(store (i32 IntRegs:$src1), u0AlwaysExtPred:$src2),
-          (STriw_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>;
+  def: Loadam_pat<extloadi32,  i64, addrga, Zext64, L4_loadri_abs>;
+  def: Loadam_pat<sextloadi32, i64, addrga, Sext64, L4_loadri_abs>;
+  def: Loadam_pat<zextloadi32, i64, addrga, Zext64, L4_loadri_abs>;
 }
 
-let Predicates = [HasV4T], AddedComplexity  = 30 in {
-def : Pat<(i32 (load u0AlwaysExtPred:$src)),
-          (LDriw_abs_V4 u0AlwaysExtPred:$src)>;
+let AddedComplexity = 100 in {
+  def: Loada_pat<extloadi8,   i32, addrgp, L4_loadrub_abs>;
+  def: Loada_pat<sextloadi8,  i32, addrgp, L4_loadrb_abs>;
+  def: Loada_pat<zextloadi8,  i32, addrgp, L4_loadrub_abs>;
 
-def : Pat<(i32 (sextloadi8 u0AlwaysExtPred:$src)),
-          (LDrib_abs_V4 u0AlwaysExtPred:$src)>;
+  def: Loada_pat<extloadi16,  i32, addrgp, L4_loadruh_abs>;
+  def: Loada_pat<sextloadi16, i32, addrgp, L4_loadrh_abs>;
+  def: Loada_pat<zextloadi16, i32, addrgp, L4_loadruh_abs>;
 
-def : Pat<(i32 (zextloadi8 u0AlwaysExtPred:$src)),
-          (LDriub_abs_V4 u0AlwaysExtPred:$src)>;
-
-def : Pat<(i32 (sextloadi16 u0AlwaysExtPred:$src)),
-          (LDrih_abs_V4 u0AlwaysExtPred:$src)>;
-
-def : Pat<(i32 (zextloadi16 u0AlwaysExtPred:$src)),
-          (LDriuh_abs_V4 u0AlwaysExtPred:$src)>;
+  def: Loada_pat<load,        i32, addrgp, L4_loadri_abs>;
+  def: Loada_pat<load,        i64, addrgp, L4_loadrd_abs>;
 }
 
-// Indexed store word - global address.
-// memw(Rs+#u6:2)=#S8
-let AddedComplexity = 10 in
-def STriw_offset_ext_V4 : STInst<(outs),
-            (ins IntRegs:$src1, u6_2Imm:$src2, globaladdress:$src3),
-            "memw($src1+#$src2) = ##$src3",
-            [(store (HexagonCONST32 tglobaladdr:$src3),
-                    (add IntRegs:$src1, u6_2ImmPred:$src2))]>,
-            Requires<[HasV4T]>;
-
-def : Pat<(i64 (ctlz (i64 DoubleRegs:$src1))),
-          (i64 (COMBINE_Ir_V4 (i32 0), (i32 (CTLZ64_rr DoubleRegs:$src1))))>,
-          Requires<[HasV4T]>;
-
-def : Pat<(i64 (cttz (i64 DoubleRegs:$src1))),
-          (i64 (COMBINE_Ir_V4 (i32 0), (i32 (CTTZ64_rr DoubleRegs:$src1))))>,
-          Requires<[HasV4T]>;
+let AddedComplexity = 100 in {
+  def: Storea_pat<truncstorei8,  I32, addrgp, S2_storerbabs>;
+  def: Storea_pat<truncstorei16, I32, addrgp, S2_storerhabs>;
+  def: Storea_pat<store,         I32, addrgp, S2_storeriabs>;
+  def: Storea_pat<store,         I64, addrgp, S2_storerdabs>;
+}
 
+def: Loada_pat<atomic_load_8,  i32, addrgp, L4_loadrub_abs>;
+def: Loada_pat<atomic_load_16, i32, addrgp, L4_loadruh_abs>;
+def: Loada_pat<atomic_load_32, i32, addrgp, L4_loadri_abs>;
+def: Loada_pat<atomic_load_64, i64, addrgp, L4_loadrd_abs>;
 
-// i8 -> i64 loads
-// We need a complexity of 120 here to override preceding handling of
-// zextloadi8.
-let Predicates = [HasV4T], AddedComplexity = 120 in {
-def:  Pat <(i64 (extloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
-      (i64 (COMBINE_Ir_V4 0, (LDrib_abs_V4 tglobaladdr:$addr)))>;
+def: Storea_pat<SwapSt<atomic_store_8>,  I32, addrgp, S2_storerbabs>;
+def: Storea_pat<SwapSt<atomic_store_16>, I32, addrgp, S2_storerhabs>;
+def: Storea_pat<SwapSt<atomic_store_32>, I32, addrgp, S2_storeriabs>;
+def: Storea_pat<SwapSt<atomic_store_64>, I64, addrgp, S2_storerdabs>;
 
-def:  Pat <(i64 (zextloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
-      (i64 (COMBINE_Ir_V4 0, (LDriub_abs_V4 tglobaladdr:$addr)))>;
+//===----------------------------------------------------------------------===//
+// :raw for of boundscheck:hi:lo insns
+//===----------------------------------------------------------------------===//
 
-def:  Pat <(i64 (sextloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
-      (i64 (SXTW (LDrib_abs_V4 tglobaladdr:$addr)))>;
+// A4_boundscheck_lo: Detect if a register is within bounds.
+let hasSideEffects = 0 in
+def A4_boundscheck_lo: ALU64Inst <
+  (outs PredRegs:$Pd),
+  (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Pd = boundscheck($Rss, $Rtt):raw:lo"> {
+    bits<2> Pd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b00100;
+    let Inst{13} = 0b1;
+    let Inst{7-5} = 0b100;
+    let Inst{1-0} = Pd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
 
-def:  Pat <(i64 (extloadi8 FoldGlobalAddr:$addr)),
-      (i64 (COMBINE_Ir_V4 0, (LDrib_abs_V4 FoldGlobalAddr:$addr)))>;
+// A4_boundscheck_hi: Detect if a register is within bounds.
+let hasSideEffects = 0 in
+def A4_boundscheck_hi: ALU64Inst <
+  (outs PredRegs:$Pd),
+  (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Pd = boundscheck($Rss, $Rtt):raw:hi"> {
+    bits<2> Pd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b00100;
+    let Inst{13} = 0b1;
+    let Inst{7-5} = 0b101;
+    let Inst{1-0} = Pd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
 
-def:  Pat <(i64 (zextloadi8 FoldGlobalAddr:$addr)),
-      (i64 (COMBINE_Ir_V4 0, (LDriub_abs_V4 FoldGlobalAddr:$addr)))>;
+let hasSideEffects = 0, isAsmParserOnly = 1 in
+def A4_boundscheck : MInst <
+  (outs PredRegs:$Pd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
+  "$Pd=boundscheck($Rs,$Rtt)">;
+
+// A4_tlbmatch: Detect if a VA/ASID matches a TLB entry.
+let isPredicateLate = 1, hasSideEffects = 0 in
+def A4_tlbmatch : ALU64Inst<(outs PredRegs:$Pd),
+  (ins DoubleRegs:$Rs, IntRegs:$Rt),
+  "$Pd = tlbmatch($Rs, $Rt)",
+  [], "", ALU64_tc_2early_SLOT23> {
+    bits<2> Pd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1101;
+    let Inst{27-23} = 0b00100;
+    let Inst{20-16} = Rs;
+    let Inst{13} = 0b1;
+    let Inst{12-8} = Rt;
+    let Inst{7-5} = 0b011;
+    let Inst{1-0} = Pd;
+  }
 
-def:  Pat <(i64 (sextloadi8 FoldGlobalAddr:$addr)),
-      (i64 (SXTW (LDrib_abs_V4 FoldGlobalAddr:$addr)))>;
+// We need custom lowering of ISD::PREFETCH into HexagonISD::DCFETCH
+// because the SDNode ISD::PREFETCH has properties MayLoad and MayStore.
+// We don't really want either one here.
+def SDTHexagonDCFETCH : SDTypeProfile<0, 2, [SDTCisPtrTy<0>,SDTCisInt<1>]>;
+def HexagonDCFETCH : SDNode<"HexagonISD::DCFETCH", SDTHexagonDCFETCH,
+                            [SDNPHasChain]>;
+
+// Use LD0Inst for dcfetch, but set "mayLoad" to 0 because this doesn't
+// really do a load.
+let hasSideEffects = 1, mayLoad = 0 in
+def Y2_dcfetchbo : LD0Inst<(outs), (ins IntRegs:$Rs, u11_3Imm:$u11_3),
+      "dcfetch($Rs + #$u11_3)",
+      [(HexagonDCFETCH IntRegs:$Rs, u11_3ImmPred:$u11_3)],
+      "", LD_tc_ld_SLOT0> {
+  bits<5> Rs;
+  bits<14> u11_3;
+
+  let IClass = 0b1001;
+  let Inst{27-21} = 0b0100000;
+  let Inst{20-16} = Rs;
+  let Inst{13} = 0b0;
+  let Inst{10-0} = u11_3{13-3};
 }
-// i16 -> i64 loads
-// We need a complexity of 120 here to override preceding handling of
-// zextloadi16.
-let AddedComplexity = 120 in {
-def:  Pat <(i64 (extloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
-      (i64 (COMBINE_Ir_V4 0, (LDrih_abs_V4 tglobaladdr:$addr)))>,
-      Requires<[HasV4T]>;
 
-def:  Pat <(i64 (zextloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
-      (i64 (COMBINE_Ir_V4 0, (LDriuh_abs_V4 tglobaladdr:$addr)))>,
-      Requires<[HasV4T]>;
-
-def:  Pat <(i64 (sextloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
-      (i64 (SXTW (LDrih_abs_V4 tglobaladdr:$addr)))>,
-      Requires<[HasV4T]>;
-
-def:  Pat <(i64 (extloadi16 FoldGlobalAddr:$addr)),
-      (i64 (COMBINE_Ir_V4 0, (LDrih_abs_V4 FoldGlobalAddr:$addr)))>,
-      Requires<[HasV4T]>;
-
-def:  Pat <(i64 (zextloadi16 FoldGlobalAddr:$addr)),
-      (i64 (COMBINE_Ir_V4 0, (LDriuh_abs_V4 FoldGlobalAddr:$addr)))>,
-      Requires<[HasV4T]>;
+//===----------------------------------------------------------------------===//
+// Compound instructions
+//===----------------------------------------------------------------------===//
 
-def:  Pat <(i64 (sextloadi16 FoldGlobalAddr:$addr)),
-      (i64 (SXTW (LDrih_abs_V4 FoldGlobalAddr:$addr)))>,
-      Requires<[HasV4T]>;
+let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
+    isPredicated = 1, isPredicatedNew = 1, isExtendable = 1,
+    opExtentBits = 11, opExtentAlign = 2, opExtendable = 1,
+    isTerminator = 1 in
+class CJInst_tstbit_R0<string px, bit np, string tnt>
+  : InstHexagon<(outs), (ins IntRegs:$Rs, brtarget:$r9_2),
+  ""#px#" = tstbit($Rs, #0); if ("
+    #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+  [], "", COMPOUND, TypeCOMPOUND> {
+  bits<4> Rs;
+  bits<11> r9_2;
+
+  // np: !p[01]
+  let isPredicatedFalse = np;
+  // tnt: Taken/Not Taken
+  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
+
+  let IClass = 0b0001;
+  let Inst{27-26} = 0b00;
+  let Inst{25} = !if (!eq(px, "!p1"), 1,
+                 !if (!eq(px,  "p1"), 1, 0));
+  let Inst{24-23} = 0b11;
+  let Inst{22} = np;
+  let Inst{21-20} = r9_2{10-9};
+  let Inst{19-16} = Rs;
+  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
+  let Inst{9-8} = 0b11;
+  let Inst{7-1} = r9_2{8-2};
 }
-// i32->i64 loads
-// We need a complexity of 120 here to override preceding handling of
-// zextloadi32.
-let AddedComplexity = 120 in {
-def:  Pat <(i64 (extloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
-      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 tglobaladdr:$addr)))>,
-      Requires<[HasV4T]>;
-
-def:  Pat <(i64 (zextloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
-      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 tglobaladdr:$addr)))>,
-      Requires<[HasV4T]>;
-
-def:  Pat <(i64 (sextloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
-      (i64 (SXTW (LDriw_abs_V4 tglobaladdr:$addr)))>,
-      Requires<[HasV4T]>;
-
-def:  Pat <(i64 (extloadi32 FoldGlobalAddr:$addr)),
-      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
-      Requires<[HasV4T]>;
 
-def:  Pat <(i64 (zextloadi32 FoldGlobalAddr:$addr)),
-      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
-      Requires<[HasV4T]>;
-
-def:  Pat <(i64 (sextloadi32 FoldGlobalAddr:$addr)),
-      (i64 (SXTW (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
-      Requires<[HasV4T]>;
+let Defs = [PC, P0], Uses = [P0] in {
+  def J4_tstbit0_tp0_jump_nt : CJInst_tstbit_R0<"p0", 0, "nt">;
+  def J4_tstbit0_tp0_jump_t : CJInst_tstbit_R0<"p0", 0, "t">;
+  def J4_tstbit0_fp0_jump_nt : CJInst_tstbit_R0<"p0", 1, "nt">;
+  def J4_tstbit0_fp0_jump_t : CJInst_tstbit_R0<"p0", 1, "t">;
 }
 
-// Indexed store double word - global address.
-// memw(Rs+#u6:2)=#S8
-let AddedComplexity = 10 in
-def STrih_offset_ext_V4 : STInst<(outs),
-            (ins IntRegs:$src1, u6_1Imm:$src2, globaladdress:$src3),
-            "memh($src1+#$src2) = ##$src3",
-            [(truncstorei16 (HexagonCONST32 tglobaladdr:$src3),
-                    (add IntRegs:$src1, u6_1ImmPred:$src2))]>,
-            Requires<[HasV4T]>;
-// Map from store(globaladdress + x) -> memd(#foo + x)
-let AddedComplexity = 100 in
-def : Pat<(store (i64 DoubleRegs:$src1),
-                 FoldGlobalAddrGP:$addr),
-          (STrid_abs_V4 FoldGlobalAddrGP:$addr, (i64 DoubleRegs:$src1))>,
-          Requires<[HasV4T]>;
-
-def : Pat<(atomic_store_64 FoldGlobalAddrGP:$addr,
-                           (i64 DoubleRegs:$src1)),
-          (STrid_abs_V4 FoldGlobalAddrGP:$addr, (i64 DoubleRegs:$src1))>,
-          Requires<[HasV4T]>;
-
-// Map from store(globaladdress + x) -> memb(#foo + x)
-let AddedComplexity = 100 in
-def : Pat<(truncstorei8 (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr),
-          (STrib_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
-            Requires<[HasV4T]>;
-
-def : Pat<(atomic_store_8 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)),
-          (STrib_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
-            Requires<[HasV4T]>;
-
-// Map from store(globaladdress + x) -> memh(#foo + x)
-let AddedComplexity = 100 in
-def : Pat<(truncstorei16 (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr),
-          (STrih_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
-            Requires<[HasV4T]>;
-
-def : Pat<(atomic_store_16 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)),
-          (STrih_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
-            Requires<[HasV4T]>;
-
-// Map from store(globaladdress + x) -> memw(#foo + x)
-let AddedComplexity = 100 in
-def : Pat<(store (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr),
-          (STriw_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
-           Requires<[HasV4T]>;
-
-def : Pat<(atomic_store_32 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)),
-          (STriw_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
-            Requires<[HasV4T]>;
-
-// Map from load(globaladdress + x) -> memd(#foo + x)
-let AddedComplexity = 100 in
-def : Pat<(i64 (load FoldGlobalAddrGP:$addr)),
-          (i64 (LDrid_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
-
-def : Pat<(atomic_load_64 FoldGlobalAddrGP:$addr),
-          (i64 (LDrid_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
-
-// Map from load(globaladdress + x) -> memb(#foo + x)
-let AddedComplexity = 100 in
-def : Pat<(i32 (extloadi8 FoldGlobalAddrGP:$addr)),
-          (i32 (LDrib_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
-
-// Map from load(globaladdress + x) -> memb(#foo + x)
-let AddedComplexity = 100 in
-def : Pat<(i32 (sextloadi8 FoldGlobalAddrGP:$addr)),
-          (i32 (LDrib_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
-
-//let AddedComplexity = 100 in
-let AddedComplexity = 100 in
-def : Pat<(i32 (extloadi16 FoldGlobalAddrGP:$addr)),
-          (i32 (LDrih_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
-
-// Map from load(globaladdress + x) -> memh(#foo + x)
-let AddedComplexity = 100 in
-def : Pat<(i32 (sextloadi16 FoldGlobalAddrGP:$addr)),
-          (i32 (LDrih_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
-
-// Map from load(globaladdress + x) -> memuh(#foo + x)
-let AddedComplexity = 100 in
-def : Pat<(i32 (zextloadi16 FoldGlobalAddrGP:$addr)),
-          (i32 (LDriuh_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
+let Defs = [PC, P1], Uses = [P1] in {
+  def J4_tstbit0_tp1_jump_nt : CJInst_tstbit_R0<"p1", 0, "nt">;
+  def J4_tstbit0_tp1_jump_t : CJInst_tstbit_R0<"p1", 0, "t">;
+  def J4_tstbit0_fp1_jump_nt : CJInst_tstbit_R0<"p1", 1, "nt">;
+  def J4_tstbit0_fp1_jump_t : CJInst_tstbit_R0<"p1", 1, "t">;
+}
 
-def : Pat<(atomic_load_16 FoldGlobalAddrGP:$addr),
-          (i32 (LDriuh_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
 
-// Map from load(globaladdress + x) -> memub(#foo + x)
-let AddedComplexity = 100 in
-def : Pat<(i32 (zextloadi8 FoldGlobalAddrGP:$addr)),
-          (i32 (LDriub_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
+let isBranch = 1, hasSideEffects = 0,
+    isExtentSigned = 1, isPredicated = 1, isPredicatedNew = 1,
+    isExtendable = 1, opExtentBits = 11, opExtentAlign = 2,
+    opExtendable = 2, isTerminator = 1 in
+class CJInst_RR<string px, string op, bit np, string tnt>
+  : InstHexagon<(outs), (ins IntRegs:$Rs, IntRegs:$Rt, brtarget:$r9_2),
+  ""#px#" = cmp."#op#"($Rs, $Rt); if ("
+   #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+  [], "", COMPOUND, TypeCOMPOUND> {
+  bits<4> Rs;
+  bits<4> Rt;
+  bits<11> r9_2;
+
+  // np: !p[01]
+  let isPredicatedFalse = np;
+  // tnt: Taken/Not Taken
+  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
+
+  let IClass = 0b0001;
+  let Inst{27-23} = !if (!eq(op, "eq"),  0b01000,
+                    !if (!eq(op, "gt"),  0b01001,
+                    !if (!eq(op, "gtu"), 0b01010, 0)));
+  let Inst{22} = np;
+  let Inst{21-20} = r9_2{10-9};
+  let Inst{19-16} = Rs;
+  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
+  // px: Predicate reg 0/1
+  let Inst{12} = !if (!eq(px, "!p1"), 1,
+                 !if (!eq(px,  "p1"), 1, 0));
+  let Inst{11-8} = Rt;
+  let Inst{7-1} = r9_2{8-2};
+}
 
-def : Pat<(atomic_load_8 FoldGlobalAddrGP:$addr),
-          (i32 (LDriub_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
+// P[10] taken/not taken.
+multiclass T_tnt_CJInst_RR<string op, bit np> {
+  let Defs = [PC, P0], Uses = [P0] in {
+    def NAME#p0_jump_nt : CJInst_RR<"p0", op, np, "nt">;
+    def NAME#p0_jump_t : CJInst_RR<"p0", op, np, "t">;
+  }
+  let Defs = [PC, P1], Uses = [P1] in {
+    def NAME#p1_jump_nt : CJInst_RR<"p1", op, np, "nt">;
+    def NAME#p1_jump_t : CJInst_RR<"p1", op, np, "t">;
+  }
+}
+// Predicate / !Predicate
+multiclass T_pnp_CJInst_RR<string op>{
+  defm J4_cmp#NAME#_t : T_tnt_CJInst_RR<op, 0>;
+  defm J4_cmp#NAME#_f : T_tnt_CJInst_RR<op, 1>;
+}
+// TypeCJ Instructions compare RR and jump
+defm eq : T_pnp_CJInst_RR<"eq">;
+defm gt : T_pnp_CJInst_RR<"gt">;
+defm gtu : T_pnp_CJInst_RR<"gtu">;
+
+let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
+    isPredicated = 1, isPredicatedNew = 1, isExtendable = 1, opExtentBits = 11,
+    opExtentAlign = 2, opExtendable = 2, isTerminator = 1 in
+class CJInst_RU5<string px, string op, bit np, string tnt>
+  : InstHexagon<(outs), (ins IntRegs:$Rs, u5Imm:$U5, brtarget:$r9_2),
+  ""#px#" = cmp."#op#"($Rs, #$U5); if ("
+    #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+  [], "", COMPOUND, TypeCOMPOUND> {
+  bits<4> Rs;
+  bits<5> U5;
+  bits<11> r9_2;
+
+  // np: !p[01]
+  let isPredicatedFalse = np;
+  // tnt: Taken/Not Taken
+  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
+
+  let IClass = 0b0001;
+  let Inst{27-26} = 0b00;
+  // px: Predicate reg 0/1
+  let Inst{25} = !if (!eq(px, "!p1"), 1,
+                 !if (!eq(px,  "p1"), 1, 0));
+  let Inst{24-23} = !if (!eq(op, "eq"),  0b00,
+                    !if (!eq(op, "gt"),  0b01,
+                    !if (!eq(op, "gtu"), 0b10, 0)));
+  let Inst{22} = np;
+  let Inst{21-20} = r9_2{10-9};
+  let Inst{19-16} = Rs;
+  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
+  let Inst{12-8} = U5;
+  let Inst{7-1} = r9_2{8-2};
+}
+// P[10] taken/not taken.
+multiclass T_tnt_CJInst_RU5<string op, bit np> {
+  let Defs = [PC, P0], Uses = [P0] in {
+    def NAME#p0_jump_nt : CJInst_RU5<"p0", op, np, "nt">;
+    def NAME#p0_jump_t : CJInst_RU5<"p0", op, np, "t">;
+  }
+  let Defs = [PC, P1], Uses = [P1] in {
+    def NAME#p1_jump_nt : CJInst_RU5<"p1", op, np, "nt">;
+    def NAME#p1_jump_t : CJInst_RU5<"p1", op, np, "t">;
+  }
+}
+// Predicate / !Predicate
+multiclass T_pnp_CJInst_RU5<string op>{
+  defm J4_cmp#NAME#i_t : T_tnt_CJInst_RU5<op, 0>;
+  defm J4_cmp#NAME#i_f : T_tnt_CJInst_RU5<op, 1>;
+}
+// TypeCJ Instructions compare RI and jump
+defm eq : T_pnp_CJInst_RU5<"eq">;
+defm gt : T_pnp_CJInst_RU5<"gt">;
+defm gtu : T_pnp_CJInst_RU5<"gtu">;
+
+let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
+    isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1,
+    isExtendable = 1, opExtentBits = 11, opExtentAlign = 2, opExtendable = 1,
+    isTerminator = 1 in
+class CJInst_Rn1<string px, string op, bit np, string tnt>
+  : InstHexagon<(outs), (ins IntRegs:$Rs, brtarget:$r9_2),
+  ""#px#" = cmp."#op#"($Rs,#-1); if ("
+  #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+  [], "", COMPOUND, TypeCOMPOUND> {
+  bits<4> Rs;
+  bits<11> r9_2;
+
+  // np: !p[01]
+  let isPredicatedFalse = np;
+  // tnt: Taken/Not Taken
+  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
+
+  let IClass = 0b0001;
+  let Inst{27-26} = 0b00;
+  let Inst{25} = !if (!eq(px, "!p1"), 1,
+                 !if (!eq(px,  "p1"), 1, 0));
+
+  let Inst{24-23} = 0b11;
+  let Inst{22} = np;
+  let Inst{21-20} = r9_2{10-9};
+  let Inst{19-16} = Rs;
+  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
+  let Inst{9-8} = !if (!eq(op, "eq"),  0b00,
+                  !if (!eq(op, "gt"),  0b01, 0));
+  let Inst{7-1} = r9_2{8-2};
+}
 
-// Map from load(globaladdress + x) -> memw(#foo + x)
-let AddedComplexity = 100 in
-def : Pat<(i32 (load FoldGlobalAddrGP:$addr)),
-          (i32 (LDriw_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
+// P[10] taken/not taken.
+multiclass T_tnt_CJInst_Rn1<string op, bit np> {
+  let Defs = [PC, P0], Uses = [P0] in {
+    def NAME#p0_jump_nt : CJInst_Rn1<"p0", op, np, "nt">;
+    def NAME#p0_jump_t : CJInst_Rn1<"p0", op, np, "t">;
+  }
+  let Defs = [PC, P1], Uses = [P1] in {
+    def NAME#p1_jump_nt : CJInst_Rn1<"p1", op, np, "nt">;
+    def NAME#p1_jump_t : CJInst_Rn1<"p1", op, np, "t">;
+  }
+}
+// Predicate / !Predicate
+multiclass T_pnp_CJInst_Rn1<string op>{
+  defm J4_cmp#NAME#n1_t : T_tnt_CJInst_Rn1<op, 0>;
+  defm J4_cmp#NAME#n1_f : T_tnt_CJInst_Rn1<op, 1>;
+}
+// TypeCJ Instructions compare -1 and jump
+defm eq : T_pnp_CJInst_Rn1<"eq">;
+defm gt : T_pnp_CJInst_Rn1<"gt">;
+
+// J4_jumpseti: Direct unconditional jump and set register to immediate.
+let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
+    isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
+    opExtentAlign = 2, opExtendable = 2 in
+def J4_jumpseti: CJInst <
+  (outs IntRegs:$Rd),
+  (ins u6Imm:$U6, brtarget:$r9_2),
+  "$Rd = #$U6 ; jump $r9_2"> {
+    bits<4> Rd;
+    bits<6> U6;
+    bits<11> r9_2;
+
+    let IClass = 0b0001;
+    let Inst{27-24} = 0b0110;
+    let Inst{21-20} = r9_2{10-9};
+    let Inst{19-16} = Rd;
+    let Inst{13-8} = U6;
+    let Inst{7-1} = r9_2{8-2};
+  }
 
-def : Pat<(atomic_load_32 FoldGlobalAddrGP:$addr),
-          (i32 (LDriw_abs_V4 FoldGlobalAddrGP:$addr))>,
-           Requires<[HasV4T]>;
+// J4_jumpsetr: Direct unconditional jump and transfer register.
+let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
+    isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
+    opExtentAlign = 2, opExtendable = 2 in
+def J4_jumpsetr: CJInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, brtarget:$r9_2),
+  "$Rd = $Rs ; jump $r9_2"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<11> r9_2;
+
+    let IClass = 0b0001;
+    let Inst{27-24} = 0b0111;
+    let Inst{21-20} = r9_2{10-9};
+    let Inst{11-8} = Rd;
+    let Inst{19-16} = Rs;
+    let Inst{7-1} = r9_2{8-2};
+  }
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV5.td b/lib/Target/Hexagon/HexagonInstrInfoV5.td
index 9da6074..19b0935 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV5.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV5.td
@@ -1,26 +1,94 @@
-def SDTHexagonFCONST32 : SDTypeProfile<1, 1, [
-                                            SDTCisVT<0, f32>,
-                                            SDTCisPtrTy<1>]>;
-def HexagonFCONST32 : SDNode<"HexagonISD::FCONST32",     SDTHexagonFCONST32>;
+//=- HexagonInstrInfoV5.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V5 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY
+//===----------------------------------------------------------------------===//
+
+  //Rdd[+]=vrmpybsu(Rss,Rtt)
+let Predicates = [HasV5T] in {
+  def M5_vrmpybsu: T_XTYPE_Vect<"vrmpybsu", 0b110, 0b001, 0>;
+  def M5_vrmacbsu: T_XTYPE_Vect_acc<"vrmpybsu", 0b110, 0b001, 0>;
+
+  //Rdd[+]=vrmpybu(Rss,Rtt)
+  def M5_vrmpybuu: T_XTYPE_Vect<"vrmpybu", 0b100, 0b001, 0>;
+  def M5_vrmacbuu: T_XTYPE_Vect_acc<"vrmpybu", 0b100, 0b001, 0>;
+
+  def M5_vdmpybsu: T_M2_vmpy<"vdmpybsu", 0b101, 0b001, 0, 0, 1>;
+  def M5_vdmacbsu: T_M2_vmpy_acc_sat <"vdmpybsu", 0b001, 0b001, 0, 0>;
+}
+
+// Vector multiply bytes
+// Rdd=vmpyb[s]u(Rs,Rt)
+let Predicates = [HasV5T] in {
+  def M5_vmpybsu: T_XTYPE_mpy64 <"vmpybsu", 0b010, 0b001, 0, 0, 0>;
+  def M5_vmpybuu: T_XTYPE_mpy64 <"vmpybu",  0b100, 0b001, 0, 0, 0>;
+
+  // Rxx+=vmpyb[s]u(Rs,Rt)
+  def M5_vmacbsu: T_XTYPE_mpy64_acc <"vmpybsu", "+", 0b110, 0b001, 0, 0, 0>;
+  def M5_vmacbuu: T_XTYPE_mpy64_acc <"vmpybu", "+", 0b100, 0b001, 0, 0, 0>;
+
+  // Rd=vaddhub(Rss,Rtt):sat
+  let hasNewValue = 1, opNewValue = 0 in
+    def A5_vaddhubs: T_S3op_1 <"vaddhub", IntRegs, 0b01, 0b001, 0, 1>;
+}
+
+def S2_asr_i_p_rnd : S_2OpInstImm<"asr", 0b110, 0b111, u6Imm,
+      [(set I64:$dst,
+            (sra (i64 (add (i64 (sra I64:$src1, u6ImmPred:$src2)), 1)),
+                 (i32 1)))], 1>,
+      Requires<[HasV5T]> {
+  bits<6> src2;
+  let Inst{13-8} = src2;
+}
+
+let isAsmParserOnly = 1 in
+def S2_asr_i_p_rnd_goodsyntax
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+    "$dst = asrrnd($src1, #$src2)">;
+
+def C4_fastcorner9 : T_LOGICAL_2OP<"fastcorner9", 0b000, 0, 0>,
+  Requires<[HasV5T]> {
+  let Inst{13,7,4} = 0b111;
+}
+
+def C4_fastcorner9_not : T_LOGICAL_2OP<"!fastcorner9", 0b000, 0, 0>,
+  Requires<[HasV5T]> {
+  let Inst{20,13,7,4} = 0b1111;
+}
 
-let isReMaterializable = 1, isMoveImm = 1 in
+def SDTHexagonFCONST32 : SDTypeProfile<1, 1, [SDTCisVT<0, f32>,
+                                              SDTCisPtrTy<1>]>;
+def HexagonFCONST32 : SDNode<"HexagonISD::FCONST32", SDTHexagonFCONST32>;
+
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in
 def FCONST32_nsdata : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global),
-              "$dst = CONST32(#$global)",
-              [(set (f32 IntRegs:$dst),
-              (HexagonFCONST32 tglobaladdr:$global))]>,
-               Requires<[HasV5T]>;
+                             "$dst = CONST32(#$global)",
+                             [(set F32:$dst,
+                              (HexagonFCONST32 tglobaladdr:$global))]>,
+                             Requires<[HasV5T]>;
 
-let isReMaterializable = 1, isMoveImm = 1 in
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in
 def CONST64_Float_Real : LDInst<(outs DoubleRegs:$dst), (ins f64imm:$src1),
-                       "$dst = CONST64(#$src1)",
-                       [(set DoubleRegs:$dst, fpimm:$src1)]>,
-          Requires<[HasV5T]>;
+                                "$dst = CONST64(#$src1)",
+                                [(set F64:$dst, fpimm:$src1)]>,
+                                Requires<[HasV5T]>;
 
-let isReMaterializable = 1, isMoveImm = 1 in
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in
 def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1),
-                       "$dst = CONST32(#$src1)",
-                       [(set IntRegs:$dst, fpimm:$src1)]>,
-          Requires<[HasV5T]>;
+                                "$dst = CONST32(#$src1)",
+                                [(set F32:$dst, fpimm:$src1)]>,
+                                Requires<[HasV5T]>;
 
 // Transfer immediate float.
 // Only works with single precision fp value.
@@ -29,605 +97,841 @@ def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1),
 // Make sure that complexity is more than the CONST32 pattern in
 // HexagonInstrInfo.td patterns.
 let isExtended = 1, opExtendable = 1, isMoveImm = 1, isReMaterializable = 1,
-isPredicable = 1, AddedComplexity = 30, validSubTargets = HasV5SubT,
-isCodeGenOnly = 1 in
+    isPredicable = 1, AddedComplexity = 30, validSubTargets = HasV5SubT,
+    isCodeGenOnly = 1 in
 def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32Ext:$src1),
-           "$dst = #$src1",
-           [(set IntRegs:$dst, fpimm:$src1)]>,
-          Requires<[HasV5T]>;
+                      "$dst = #$src1",
+                      [(set F32:$dst, fpimm:$src1)]>,
+                      Requires<[HasV5T]>;
 
 let isExtended = 1, opExtendable = 2, isPredicated = 1,
-neverHasSideEffects = 1, validSubTargets = HasV5SubT in
+    hasSideEffects = 0, validSubTargets = HasV5SubT, isCodeGenOnly = 1 in
 def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst),
                           (ins PredRegs:$src1, f32Ext:$src2),
-           "if ($src1) $dst = #$src2",
-           []>,
-          Requires<[HasV5T]>;
+                          "if ($src1) $dst = #$src2", []>,
+                          Requires<[HasV5T]>;
 
-let isExtended = 1, opExtendable = 2, isPredicated = 1, isPredicatedFalse = 1,
-neverHasSideEffects = 1, validSubTargets = HasV5SubT in
+let isPseudo = 1, isExtended = 1, opExtendable = 2, isPredicated = 1,
+    isPredicatedFalse = 1, hasSideEffects = 0, validSubTargets = HasV5SubT in
 def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst),
                              (ins PredRegs:$src1, f32Ext:$src2),
-           "if (!$src1) $dst =#$src2",
-           []>,
-          Requires<[HasV5T]>;
+                             "if (!$src1) $dst = #$src2", []>,
+                             Requires<[HasV5T]>;
+
+def SDTHexagonI32I64: SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+                                           SDTCisVT<1, i64>]>;
+
+def HexagonPOPCOUNT: SDNode<"HexagonISD::POPCOUNT", SDTHexagonI32I64>;
+
+let hasNewValue = 1, validSubTargets = HasV5SubT in
+def S5_popcountp : ALU64_rr<(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
+  "$Rd = popcount($Rss)",
+  [(set I32:$Rd, (HexagonPOPCOUNT I64:$Rss))], "", S_2op_tc_2_SLOT23>,
+  Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<5> Rss;
+
+    let IClass = 0b1000;
+
+    let Inst{27-21} = 0b1000011;
+    let Inst{7-5} = 0b011;
+    let Inst{4-0} = Rd;
+    let Inst{20-16} = Rss;
+  }
+
+defm: Loadx_pat<load, f32, s11_2ExtPred, L2_loadri_io>;
+defm: Loadx_pat<load, f64, s11_3ExtPred, L2_loadrd_io>;
+
+defm: Storex_pat<store, F32, s11_2ExtPred, S2_storeri_io>;
+defm: Storex_pat<store, F64, s11_3ExtPred, S2_storerd_io>;
+def: Storex_simple_pat<store, F32, S2_storeri_io>;
+def: Storex_simple_pat<store, F64, S2_storerd_io>;
+
+let isFP = 1, hasNewValue = 1, opNewValue = 0 in
+class T_MInstFloat <string mnemonic, bits<3> MajOp, bits<3> MinOp>
+  : MInst<(outs IntRegs:$Rd),
+          (ins IntRegs:$Rs, IntRegs:$Rt),
+  "$Rd = "#mnemonic#"($Rs, $Rt)", [],
+  "" , M_tc_3or4x_SLOT23 > ,
+  Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1011;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{13} = 0b0;
+    let Inst{12-8} = Rt;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rd;
+  }
+
+let isCommutable = 1 in {
+  def F2_sfadd : T_MInstFloat < "sfadd", 0b000, 0b000>;
+  def F2_sfmpy : T_MInstFloat < "sfmpy", 0b010, 0b000>;
+}
 
-// Convert single precision to double precision and vice-versa.
-def CONVERT_sf2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
-                "$dst = convert_sf2df($src)",
-                [(set DoubleRegs:$dst, (fextend IntRegs:$src))]>,
-          Requires<[HasV5T]>;
+def F2_sfsub : T_MInstFloat < "sfsub", 0b000, 0b001>;
 
-def CONVERT_df2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
-                "$dst = convert_df2sf($src)",
-                [(set IntRegs:$dst, (fround DoubleRegs:$src))]>,
-          Requires<[HasV5T]>;
+def: Pat<(f32 (fadd F32:$src1, F32:$src2)),
+         (F2_sfadd F32:$src1, F32:$src2)>;
 
+def: Pat<(f32 (fsub F32:$src1, F32:$src2)),
+         (F2_sfsub F32:$src1, F32:$src2)>;
 
-// Load.
-def LDrid_f : LDInst<(outs DoubleRegs:$dst),
-            (ins MEMri:$addr),
-            "$dst = memd($addr)",
-            [(set DoubleRegs:$dst, (f64 (load ADDRriS11_3:$addr)))]>,
-          Requires<[HasV5T]>;
+def: Pat<(f32 (fmul F32:$src1, F32:$src2)),
+         (F2_sfmpy F32:$src1, F32:$src2)>;
 
+let Itinerary = M_tc_3x_SLOT23 in {
+  def F2_sfmax : T_MInstFloat < "sfmax", 0b100, 0b000>;
+  def F2_sfmin : T_MInstFloat < "sfmin", 0b100, 0b001>;
+}
 
-let AddedComplexity = 20 in
-def LDrid_indexed_f : LDInst<(outs DoubleRegs:$dst),
-            (ins IntRegs:$src1, s11_3Imm:$offset),
-            "$dst = memd($src1+#$offset)",
-            [(set DoubleRegs:$dst, (f64 (load (add IntRegs:$src1,
-                                              s11_3ImmPred:$offset))))]>,
-          Requires<[HasV5T]>;
+let AddedComplexity = 100, Predicates = [HasV5T] in {
+  def: Pat<(f32 (select (i1 (setolt F32:$src1, F32:$src2)),
+                        F32:$src1, F32:$src2)),
+           (F2_sfmin F32:$src1, F32:$src2)>;
 
-def LDriw_f : LDInst<(outs IntRegs:$dst),
-            (ins MEMri:$addr), "$dst = memw($addr)",
-            [(set IntRegs:$dst, (f32 (load ADDRriS11_2:$addr)))]>,
-          Requires<[HasV5T]>;
+  def: Pat<(f32 (select (i1 (setogt F32:$src1, F32:$src2)),
+                        F32:$src2, F32:$src1)),
+           (F2_sfmin F32:$src1, F32:$src2)>;
 
+  def: Pat<(f32 (select (i1 (setogt F32:$src1, F32:$src2)),
+                        F32:$src1, F32:$src2)),
+           (F2_sfmax F32:$src1, F32:$src2)>;
 
-let AddedComplexity = 20 in
-def LDriw_indexed_f : LDInst<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, s11_2Imm:$offset),
-            "$dst = memw($src1+#$offset)",
-            [(set IntRegs:$dst, (f32 (load (add IntRegs:$src1,
-                                           s11_2ImmPred:$offset))))]>,
-          Requires<[HasV5T]>;
+  def: Pat<(f32 (select (i1 (setolt F32:$src1, F32:$src2)),
+                        F32:$src2, F32:$src1)),
+           (F2_sfmax F32:$src1, F32:$src2)>;
+}
 
-// Store.
-def STriw_f : STInst<(outs),
-            (ins MEMri:$addr, IntRegs:$src1),
-            "memw($addr) = $src1",
-            [(store (f32 IntRegs:$src1), ADDRriS11_2:$addr)]>,
-          Requires<[HasV5T]>;
+def F2_sffixupn : T_MInstFloat < "sffixupn", 0b110, 0b000>;
+def F2_sffixupd : T_MInstFloat < "sffixupd", 0b110, 0b001>;
+
+// F2_sfrecipa: Reciprocal approximation for division.
+let isPredicateLate = 1, isFP = 1,
+hasSideEffects = 0, hasNewValue = 1 in
+def F2_sfrecipa: MInst <
+  (outs IntRegs:$Rd, PredRegs:$Pe),
+  (ins IntRegs:$Rs, IntRegs:$Rt),
+  "$Rd, $Pe = sfrecipa($Rs, $Rt)">,
+  Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<2> Pe;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+    let Inst{27-21} = 0b1011111;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = Rt;
+    let Inst{7}     = 0b1;
+    let Inst{6-5}   = Pe;
+    let Inst{4-0}   = Rd;
+  }
+
+// F2_dfcmpeq: Floating point compare for equal.
+let isCompare = 1, isFP = 1 in
+class T_fcmp <string mnemonic, RegisterClass RC, bits<3> MinOp,
+              list<dag> pattern = [] >
+  : ALU64Inst <(outs PredRegs:$dst), (ins RC:$src1, RC:$src2),
+  "$dst = "#mnemonic#"($src1, $src2)", pattern,
+  "" , ALU64_tc_2early_SLOT23 > ,
+  Requires<[HasV5T]> {
+    bits<2> dst;
+    bits<5> src1;
+    bits<5> src2;
+
+    let IClass = 0b1101;
+
+    let Inst{27-21} = 0b0010111;
+    let Inst{20-16} = src1;
+    let Inst{12-8}  = src2;
+    let Inst{7-5}   = MinOp;
+    let Inst{1-0}   = dst;
+  }
+
+class T_fcmp64 <string mnemonic, PatFrag OpNode, bits<3> MinOp>
+  : T_fcmp <mnemonic, DoubleRegs, MinOp,
+  [(set  I1:$dst, (OpNode F64:$src1, F64:$src2))]> {
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0010111;
+}
 
-let AddedComplexity = 10 in
-def STriw_indexed_f : STInst<(outs),
-            (ins IntRegs:$src1, s11_2Imm:$src2, IntRegs:$src3),
-            "memw($src1+#$src2) = $src3",
-            [(store (f32 IntRegs:$src3),
-                (add IntRegs:$src1, s11_2ImmPred:$src2))]>,
-          Requires<[HasV5T]>;
+class T_fcmp32 <string mnemonic, PatFrag OpNode, bits<3> MinOp>
+  : T_fcmp <mnemonic, IntRegs, MinOp,
+  [(set  I1:$dst, (OpNode F32:$src1, F32:$src2))]> {
+  let IClass = 0b1100;
+  let Inst{27-21} = 0b0111111;
+}
 
-def STrid_f : STInst<(outs),
-            (ins MEMri:$addr, DoubleRegs:$src1),
-            "memd($addr) = $src1",
-            [(store (f64 DoubleRegs:$src1), ADDRriS11_2:$addr)]>,
-          Requires<[HasV5T]>;
+def F2_dfcmpeq : T_fcmp64<"dfcmp.eq", setoeq, 0b000>;
+def F2_dfcmpgt : T_fcmp64<"dfcmp.gt", setogt, 0b001>;
+def F2_dfcmpge : T_fcmp64<"dfcmp.ge", setoge, 0b010>;
+def F2_dfcmpuo : T_fcmp64<"dfcmp.uo", setuo,  0b011>;
+
+def F2_sfcmpge : T_fcmp32<"sfcmp.ge", setoge, 0b000>;
+def F2_sfcmpuo : T_fcmp32<"sfcmp.uo", setuo,  0b001>;
+def F2_sfcmpeq : T_fcmp32<"sfcmp.eq", setoeq, 0b011>;
+def F2_sfcmpgt : T_fcmp32<"sfcmp.gt", setogt, 0b100>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for ordered gt, ge, eq operations.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasV5T] in
+multiclass T_fcmp_pats<PatFrag cmpOp, InstHexagon IntMI, InstHexagon DoubleMI> {
+  // IntRegs
+  def: Pat<(i1 (cmpOp F32:$src1, F32:$src2)),
+           (IntMI F32:$src1, F32:$src2)>;
+  // DoubleRegs
+  def: Pat<(i1 (cmpOp F64:$src1, F64:$src2)),
+           (DoubleMI F64:$src1, F64:$src2)>;
+}
 
-// Indexed store double word.
-let AddedComplexity = 10 in
-def STrid_indexed_f : STInst<(outs),
-            (ins IntRegs:$src1, s11_3Imm:$src2,  DoubleRegs:$src3),
-            "memd($src1+#$src2) = $src3",
-            [(store (f64 DoubleRegs:$src3),
-                                (add IntRegs:$src1, s11_3ImmPred:$src2))]>,
-          Requires<[HasV5T]>;
+defm : T_fcmp_pats <seteq, F2_sfcmpeq, F2_dfcmpeq>;
+defm : T_fcmp_pats <setgt, F2_sfcmpgt, F2_dfcmpgt>;
+defm : T_fcmp_pats <setge, F2_sfcmpge, F2_dfcmpge>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for unordered gt, ge, eq operations.
+//===----------------------------------------------------------------------===//
+let Predicates = [HasV5T] in
+multiclass unord_Pats <PatFrag cmpOp, InstHexagon IntMI, InstHexagon DoubleMI> {
+  // IntRegs
+  def: Pat<(i1 (cmpOp F32:$src1, F32:$src2)),
+           (C2_or (F2_sfcmpuo F32:$src1, F32:$src2),
+                  (IntMI F32:$src1, F32:$src2))>;
+
+  // DoubleRegs
+  def: Pat<(i1 (cmpOp F64:$src1, F64:$src2)),
+           (C2_or (F2_dfcmpuo F64:$src1, F64:$src2),
+                  (DoubleMI F64:$src1, F64:$src2))>;
+}
 
+defm : unord_Pats <setuge, F2_sfcmpge, F2_dfcmpge>;
+defm : unord_Pats <setugt, F2_sfcmpgt, F2_dfcmpgt>;
+defm : unord_Pats <setueq, F2_sfcmpeq, F2_dfcmpeq>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for the following dags:
+// seteq(setoeq(op1, op2), 0) -> not(setoeq(op1, op2))
+// seteq(setoeq(op1, op2), 1) -> setoeq(op1, op2)
+// setne(setoeq(op1, op2), 0) -> setoeq(op1, op2)
+// setne(setoeq(op1, op2), 1) -> not(setoeq(op1, op2))
+//===----------------------------------------------------------------------===//
+let Predicates = [HasV5T] in
+multiclass eq_ordgePats <PatFrag cmpOp, InstHexagon IntMI,
+                         InstHexagon DoubleMI> {
+  // IntRegs
+  def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+           (C2_not (IntMI F32:$src1, F32:$src2))>;
+  def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+           (IntMI F32:$src1, F32:$src2)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+           (IntMI F32:$src1, F32:$src2)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+           (C2_not (IntMI F32:$src1, F32:$src2))>;
+
+  // DoubleRegs
+  def : Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+            (C2_not (DoubleMI F64:$src1, F64:$src2))>;
+  def : Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 1)),
+            (DoubleMI F64:$src1, F64:$src2)>;
+  def : Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+            (DoubleMI F64:$src1, F64:$src2)>;
+  def : Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 1)),
+            (C2_not (DoubleMI F64:$src1, F64:$src2))>;
+}
 
-// Add
-let isCommutable = 1 in
-def fADD_rr : ALU64_rr<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = sfadd($src1, $src2)",
-            [(set IntRegs:$dst, (fadd IntRegs:$src1, IntRegs:$src2))]>,
-          Requires<[HasV5T]>;
+defm : eq_ordgePats<setoeq, F2_sfcmpeq, F2_dfcmpeq>;
+defm : eq_ordgePats<setoge, F2_sfcmpge, F2_dfcmpge>;
+defm : eq_ordgePats<setogt, F2_sfcmpgt, F2_dfcmpgt>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for the following dags:
+// seteq(setolt(op1, op2), 0) -> not(setogt(op2, op1))
+// seteq(setolt(op1, op2), 1) -> setogt(op2, op1)
+// setne(setolt(op1, op2), 0) -> setogt(op2, op1)
+// setne(setolt(op1, op2), 1) -> not(setogt(op2, op1))
+//===----------------------------------------------------------------------===//
+let Predicates = [HasV5T] in
+multiclass eq_ordltPats <PatFrag cmpOp, InstHexagon IntMI,
+                         InstHexagon DoubleMI> {
+  // IntRegs
+  def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+           (C2_not (IntMI F32:$src2, F32:$src1))>;
+  def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+           (IntMI F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+           (IntMI F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+           (C2_not (IntMI F32:$src2, F32:$src1))>;
+
+  // DoubleRegs
+  def: Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+           (C2_not (DoubleMI F64:$src2, F64:$src1))>;
+  def: Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 1)),
+           (DoubleMI F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+           (DoubleMI F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+           (C2_not (DoubleMI F64:$src2, F64:$src1))>;
+}
 
-let isCommutable = 1 in
-def fADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                     DoubleRegs:$src2),
-               "$dst = dfadd($src1, $src2)",
-               [(set DoubleRegs:$dst, (fadd DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>,
-          Requires<[HasV5T]>;
+defm : eq_ordltPats<setole, F2_sfcmpge, F2_dfcmpge>;
+defm : eq_ordltPats<setolt, F2_sfcmpgt, F2_dfcmpgt>;
 
-def fSUB_rr : ALU64_rr<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = sfsub($src1, $src2)",
-            [(set IntRegs:$dst, (fsub IntRegs:$src1, IntRegs:$src2))]>,
-          Requires<[HasV5T]>;
 
-def fSUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                     DoubleRegs:$src2),
-               "$dst = dfsub($src1, $src2)",
-               [(set DoubleRegs:$dst, (fsub DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>,
-               Requires<[HasV5T]>;
-
-let isCommutable = 1 in
-def fMUL_rr : ALU64_rr<(outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2),
-            "$dst = sfmpy($src1, $src2)",
-            [(set IntRegs:$dst, (fmul IntRegs:$src1, IntRegs:$src2))]>,
-            Requires<[HasV5T]>;
-
-let isCommutable = 1 in
-def fMUL64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
-                                                     DoubleRegs:$src2),
-               "$dst = dfmpy($src1, $src2)",
-               [(set DoubleRegs:$dst, (fmul DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>,
-               Requires<[HasV5T]>;
-
-// Compare.
-let isCompare = 1 in {
-multiclass FCMP64_rr<string OpcStr, PatFrag OpNode> {
-  def _rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c),
-                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
-                 [(set PredRegs:$dst,
-                        (OpNode (f64 DoubleRegs:$b), (f64 DoubleRegs:$c)))]>,
-                 Requires<[HasV5T]>;
+// o. seto inverse of setuo. http://llvm.org/docs/LangRef.html#i_fcmp
+let Predicates = [HasV5T] in {
+  def: Pat<(i1 (seto F32:$src1, F32:$src2)),
+           (C2_not (F2_sfcmpuo F32:$src2, F32:$src1))>;
+  def: Pat<(i1 (seto F32:$src1, fpimm:$src2)),
+           (C2_not (F2_sfcmpuo (TFRI_f fpimm:$src2), F32:$src1))>;
+  def: Pat<(i1 (seto F64:$src1, F64:$src2)),
+           (C2_not (F2_dfcmpuo F64:$src2, F64:$src1))>;
+  def: Pat<(i1 (seto F64:$src1, fpimm:$src2)),
+           (C2_not (F2_dfcmpuo (CONST64_Float_Real fpimm:$src2), F64:$src1))>;
+}
+
+// Ordered lt.
+let Predicates = [HasV5T] in {
+  def: Pat<(i1 (setolt F32:$src1, F32:$src2)),
+           (F2_sfcmpgt F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setolt F32:$src1, fpimm:$src2)),
+           (F2_sfcmpgt (f32 (TFRI_f fpimm:$src2)), F32:$src1)>;
+  def: Pat<(i1 (setolt F64:$src1, F64:$src2)),
+           (F2_dfcmpgt F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setolt F64:$src1, fpimm:$src2)),
+           (F2_dfcmpgt (CONST64_Float_Real fpimm:$src2), F64:$src1)>;
 }
 
-multiclass FCMP32_rr<string OpcStr, PatFrag OpNode> {
-  def _rr : ALU64_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
-                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
-                 [(set PredRegs:$dst,
-                        (OpNode (f32 IntRegs:$b), (f32 IntRegs:$c)))]>,
-                 Requires<[HasV5T]>;
+// Unordered lt.
+let Predicates = [HasV5T] in {
+  def: Pat<(i1 (setult F32:$src1, F32:$src2)),
+           (C2_or (F2_sfcmpuo  F32:$src1, F32:$src2),
+                  (F2_sfcmpgt F32:$src2, F32:$src1))>;
+  def: Pat<(i1 (setult F32:$src1, fpimm:$src2)),
+           (C2_or (F2_sfcmpuo  F32:$src1, (TFRI_f fpimm:$src2)),
+                  (F2_sfcmpgt (TFRI_f fpimm:$src2), F32:$src1))>;
+  def: Pat<(i1 (setult F64:$src1, F64:$src2)),
+           (C2_or (F2_dfcmpuo  F64:$src1, F64:$src2),
+                  (F2_dfcmpgt F64:$src2, F64:$src1))>;
+  def: Pat<(i1 (setult F64:$src1, fpimm:$src2)),
+           (C2_or (F2_dfcmpuo  F64:$src1, (CONST64_Float_Real fpimm:$src2)),
+                  (F2_dfcmpgt (CONST64_Float_Real fpimm:$src2), F64:$src1))>;
 }
+
+// Ordered le.
+let Predicates = [HasV5T] in {
+  // rs <= rt -> rt >= rs.
+  def: Pat<(i1 (setole F32:$src1, F32:$src2)),
+           (F2_sfcmpge F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setole F32:$src1, fpimm:$src2)),
+           (F2_sfcmpge (TFRI_f fpimm:$src2), F32:$src1)>;
+
+  // Rss <= Rtt -> Rtt >= Rss.
+  def: Pat<(i1 (setole F64:$src1, F64:$src2)),
+           (F2_dfcmpge F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setole F64:$src1, fpimm:$src2)),
+           (F2_dfcmpge (CONST64_Float_Real fpimm:$src2), F64:$src1)>;
 }
 
-defm FCMPOEQ64 : FCMP64_rr<"dfcmp.eq", setoeq>;
-defm FCMPUEQ64 : FCMP64_rr<"dfcmp.eq", setueq>;
-defm FCMPOGT64 : FCMP64_rr<"dfcmp.gt", setogt>;
-defm FCMPUGT64 : FCMP64_rr<"dfcmp.gt", setugt>;
-defm FCMPOGE64 : FCMP64_rr<"dfcmp.ge", setoge>;
-defm FCMPUGE64 : FCMP64_rr<"dfcmp.ge", setuge>;
-
-defm FCMPOEQ32 : FCMP32_rr<"sfcmp.eq", setoeq>;
-defm FCMPUEQ32 : FCMP32_rr<"sfcmp.eq", setueq>;
-defm FCMPOGT32 : FCMP32_rr<"sfcmp.gt", setogt>;
-defm FCMPUGT32 : FCMP32_rr<"sfcmp.gt", setugt>;
-defm FCMPOGE32 : FCMP32_rr<"sfcmp.ge", setoge>;
-defm FCMPUGE32 : FCMP32_rr<"sfcmp.ge", setuge>;
-
-// olt.
-def : Pat <(i1 (setolt (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
-      (i1 (FCMPOGT32_rr IntRegs:$src2, IntRegs:$src1))>,
-      Requires<[HasV5T]>;
-
-def : Pat <(i1 (setolt (f32 IntRegs:$src1), (fpimm:$src2))),
-      (i1 (FCMPOGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>,
-      Requires<[HasV5T]>;
-
-def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
-      (i1 (FCMPOGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
-      Requires<[HasV5T]>;
-
-def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (fpimm:$src2))),
-      (i1 (FCMPOGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
-                        (f64 DoubleRegs:$src1)))>,
-      Requires<[HasV5T]>;
-
-// gt.
-def : Pat <(i1 (setugt (f64 DoubleRegs:$src1), (fpimm:$src2))),
-      (i1 (FCMPUGT64_rr (f64 DoubleRegs:$src1),
-                        (f64 (CONST64_Float_Real fpimm:$src2))))>,
-      Requires<[HasV5T]>;
-
-def : Pat <(i1 (setugt (f32 IntRegs:$src1), (fpimm:$src2))),
-      (i1 (FCMPUGT32_rr (f32 IntRegs:$src1), (f32 (TFRI_f fpimm:$src2))))>,
-      Requires<[HasV5T]>;
-
-// ult.
-def : Pat <(i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
-      (i1 (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1))>,
-      Requires<[HasV5T]>;
-
-def : Pat <(i1 (setult (f32 IntRegs:$src1), (fpimm:$src2))),
-      (i1 (FCMPUGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>,
-      Requires<[HasV5T]>;
-
-def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
-      (i1 (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
-      Requires<[HasV5T]>;
-
-def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (fpimm:$src2))),
-      (i1 (FCMPUGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
-                        (f64 DoubleRegs:$src1)))>,
-      Requires<[HasV5T]>;
-
-// le.
+// Unordered le.
+let Predicates = [HasV5T] in {
 // rs <= rt -> rt >= rs.
-def : Pat<(i1 (setole (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
-      (i1 (FCMPOGE32_rr IntRegs:$src2, IntRegs:$src1))>,
-      Requires<[HasV5T]>;
+  def: Pat<(i1 (setule F32:$src1, F32:$src2)),
+           (C2_or (F2_sfcmpuo  F32:$src1, F32:$src2),
+                  (F2_sfcmpge F32:$src2, F32:$src1))>;
+  def: Pat<(i1 (setule F32:$src1, fpimm:$src2)),
+           (C2_or (F2_sfcmpuo  F32:$src1, (TFRI_f fpimm:$src2)),
+                  (F2_sfcmpge (TFRI_f fpimm:$src2), F32:$src1))>;
+  def: Pat<(i1 (setule F64:$src1, F64:$src2)),
+           (C2_or (F2_dfcmpuo  F64:$src1, F64:$src2),
+                  (F2_dfcmpge F64:$src2, F64:$src1))>;
+  def: Pat<(i1 (setule F64:$src1, fpimm:$src2)),
+           (C2_or (F2_dfcmpuo  F64:$src1, (CONST64_Float_Real fpimm:$src2)),
+                  (F2_dfcmpge (CONST64_Float_Real fpimm:$src2), F64:$src1))>;
+}
+
+// Ordered ne.
+let Predicates = [HasV5T] in {
+  def: Pat<(i1 (setone F32:$src1, F32:$src2)),
+           (C2_not (F2_sfcmpeq F32:$src1, F32:$src2))>;
+  def: Pat<(i1 (setone F64:$src1, F64:$src2)),
+           (C2_not (F2_dfcmpeq F64:$src1, F64:$src2))>;
+  def: Pat<(i1 (setone F32:$src1, fpimm:$src2)),
+           (C2_not (F2_sfcmpeq F32:$src1, (TFRI_f fpimm:$src2)))>;
+  def: Pat<(i1 (setone F64:$src1, fpimm:$src2)),
+           (C2_not (F2_dfcmpeq F64:$src1, (CONST64_Float_Real fpimm:$src2)))>;
+}
 
-def : Pat<(i1 (setole (f32 IntRegs:$src1), (fpimm:$src2))),
-      (i1 (FCMPOGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>,
-      Requires<[HasV5T]>;
+// Unordered ne.
+let Predicates = [HasV5T] in {
+  def: Pat<(i1 (setune F32:$src1, F32:$src2)),
+           (C2_or (F2_sfcmpuo F32:$src1, F32:$src2),
+                  (C2_not (F2_sfcmpeq F32:$src1, F32:$src2)))>;
+  def: Pat<(i1 (setune F64:$src1, F64:$src2)),
+           (C2_or (F2_dfcmpuo F64:$src1, F64:$src2),
+                  (C2_not (F2_dfcmpeq F64:$src1, F64:$src2)))>;
+  def: Pat<(i1 (setune F32:$src1, fpimm:$src2)),
+           (C2_or (F2_sfcmpuo F32:$src1, (TFRI_f fpimm:$src2)),
+                  (C2_not (F2_sfcmpeq F32:$src1, (TFRI_f fpimm:$src2))))>;
+  def: Pat<(i1 (setune F64:$src1, fpimm:$src2)),
+           (C2_or (F2_dfcmpuo F64:$src1, (CONST64_Float_Real fpimm:$src2)),
+                  (C2_not (F2_dfcmpeq F64:$src1,
+                                        (CONST64_Float_Real fpimm:$src2))))>;
+}
 
+// Besides set[o|u][comparions], we also need set[comparisons].
+let Predicates = [HasV5T] in {
+  // lt.
+  def: Pat<(i1 (setlt F32:$src1, F32:$src2)),
+           (F2_sfcmpgt F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setlt F32:$src1, fpimm:$src2)),
+           (F2_sfcmpgt (TFRI_f fpimm:$src2), F32:$src1)>;
+  def: Pat<(i1 (setlt F64:$src1, F64:$src2)),
+           (F2_dfcmpgt F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setlt F64:$src1, fpimm:$src2)),
+           (F2_dfcmpgt (CONST64_Float_Real fpimm:$src2), F64:$src1)>;
+
+  // le.
+  // rs <= rt -> rt >= rs.
+  def: Pat<(i1 (setle F32:$src1, F32:$src2)),
+           (F2_sfcmpge F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setle F32:$src1, fpimm:$src2)),
+           (F2_sfcmpge (TFRI_f fpimm:$src2), F32:$src1)>;
+
+  // Rss <= Rtt -> Rtt >= Rss.
+  def: Pat<(i1 (setle F64:$src1, F64:$src2)),
+           (F2_dfcmpge F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setle F64:$src1, fpimm:$src2)),
+           (F2_dfcmpge (CONST64_Float_Real fpimm:$src2), F64:$src1)>;
+
+  // ne.
+  def: Pat<(i1 (setne F32:$src1, F32:$src2)),
+           (C2_not (F2_sfcmpeq F32:$src1, F32:$src2))>;
+  def: Pat<(i1 (setne F64:$src1, F64:$src2)),
+           (C2_not (F2_dfcmpeq F64:$src1, F64:$src2))>;
+  def: Pat<(i1 (setne F32:$src1, fpimm:$src2)),
+           (C2_not (F2_sfcmpeq F32:$src1, (TFRI_f fpimm:$src2)))>;
+  def: Pat<(i1 (setne F64:$src1, fpimm:$src2)),
+           (C2_not (F2_dfcmpeq F64:$src1, (CONST64_Float_Real fpimm:$src2)))>;
+}
 
-// Rss <= Rtt -> Rtt >= Rss.
-def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
-      (i1 (FCMPOGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
-      Requires<[HasV5T]>;
+// F2 convert template classes:
+let isFP = 1 in
+class F2_RDD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
+                         SDNode Op, PatLeaf RCOut, PatLeaf RCIn,
+                         string chop ="">
+  : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
+   "$Rdd = "#mnemonic#"($Rss)"#chop,
+   [(set RCOut:$Rdd, (Op RCIn:$Rss))], "",
+   S_2op_tc_3or4x_SLOT23> {
+     bits<5> Rdd;
+     bits<5> Rss;
+
+     let IClass = 0b1000;
+
+     let Inst{27-21} = 0b0000111;
+     let Inst{20-16} = Rss;
+     let Inst{7-5} = MinOp;
+     let Inst{4-0} = Rdd;
+  }
+
+let isFP = 1 in
+class F2_RDD_RS_CONVERT<string mnemonic, bits<3> MinOp,
+                        SDNode Op, PatLeaf RCOut, PatLeaf RCIn,
+                        string chop ="">
+  : SInst <(outs DoubleRegs:$Rdd), (ins IntRegs:$Rs),
+   "$Rdd = "#mnemonic#"($Rs)"#chop,
+   [(set RCOut:$Rdd, (Op RCIn:$Rs))], "",
+   S_2op_tc_3or4x_SLOT23> {
+     bits<5> Rdd;
+     bits<5> Rs;
+
+     let IClass = 0b1000;
+
+     let Inst{27-21} = 0b0100100;
+     let Inst{20-16} = Rs;
+     let Inst{7-5} = MinOp;
+     let Inst{4-0} = Rdd;
+  }
+
+let isFP = 1, hasNewValue = 1 in
+class F2_RD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
+                        SDNode Op, PatLeaf RCOut, PatLeaf RCIn,
+                        string chop ="">
+  : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
+   "$Rd = "#mnemonic#"($Rss)"#chop,
+   [(set RCOut:$Rd, (Op RCIn:$Rss))], "",
+   S_2op_tc_3or4x_SLOT23> {
+     bits<5> Rd;
+     bits<5> Rss;
+
+     let IClass = 0b1000;
+
+     let Inst{27-24} = 0b1000;
+     let Inst{23-21} = MinOp;
+     let Inst{20-16} = Rss;
+     let Inst{7-5} = 0b001;
+     let Inst{4-0} = Rd;
+  }
+
+let isFP = 1, hasNewValue = 1 in
+class F2_RD_RS_CONVERT<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                        SDNode Op, PatLeaf RCOut, PatLeaf RCIn,
+                        string chop ="">
+  : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
+   "$Rd = "#mnemonic#"($Rs)"#chop,
+   [(set RCOut:$Rd, (Op RCIn:$Rs))], "",
+   S_2op_tc_3or4x_SLOT23> {
+     bits<5> Rd;
+     bits<5> Rs;
+
+     let IClass = 0b1000;
+
+     let Inst{27-24} = 0b1011;
+     let Inst{23-21} = MajOp;
+     let Inst{20-16} = Rs;
+     let Inst{7-5} = MinOp;
+     let Inst{4-0} = Rd;
+  }
 
-def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (fpimm:$src2))),
-      (i1 (FCMPOGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
-                                DoubleRegs:$src1))>,
-      Requires<[HasV5T]>;
+// Convert single precision to double precision and vice-versa.
+def F2_conv_sf2df : F2_RDD_RS_CONVERT <"convert_sf2df", 0b000,
+                                       fextend, F64, F32>;
 
-// rs <= rt -> rt >= rs.
-def : Pat<(i1 (setule (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
-      (i1 (FCMPUGE32_rr IntRegs:$src2, IntRegs:$src1))>,
-      Requires<[HasV5T]>;
-
-def : Pat<(i1 (setule (f32 IntRegs:$src1), (fpimm:$src2))),
-      (i1 (FCMPUGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>,
-      Requires<[HasV5T]>;
-
-// Rss <= Rtt -> Rtt >= Rss.
-def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
-      (i1 (FCMPUGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
-      Requires<[HasV5T]>;
-
-def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (fpimm:$src2))),
-      (i1 (FCMPUGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
-                                DoubleRegs:$src1))>,
-      Requires<[HasV5T]>;
-
-// ne.
-def : Pat<(i1 (setone (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
-      (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, IntRegs:$src2)))>,
-      Requires<[HasV5T]>;
-
-def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
-      (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>,
-      Requires<[HasV5T]>;
-
-def : Pat<(i1 (setune (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
-      (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1, IntRegs:$src2)))>,
-      Requires<[HasV5T]>;
-
-def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
-      (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>,
-      Requires<[HasV5T]>;
-
-def : Pat<(i1 (setone (f32 IntRegs:$src1), (fpimm:$src2))),
-      (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, (f32 (TFRI_f fpimm:$src2)))))>,
-      Requires<[HasV5T]>;
-
-def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (fpimm:$src2))),
-      (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1,
-                              (f64 (CONST64_Float_Real fpimm:$src2)))))>,
-      Requires<[HasV5T]>;
-
-def : Pat<(i1 (setune (f32 IntRegs:$src1), (fpimm:$src2))),
-      (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1,  (f32 (TFRI_f fpimm:$src2)))))>,
-      Requires<[HasV5T]>;
-
-def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (fpimm:$src2))),
-      (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1,
-                              (f64 (CONST64_Float_Real fpimm:$src2)))))>,
-      Requires<[HasV5T]>;
+def F2_conv_df2sf : F2_RD_RSS_CONVERT <"convert_df2sf", 0b000,
+                                       fround, F32, F64>;
 
 // Convert Integer to Floating Point.
-def CONVERT_d2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_d2sf($src)",
-              [(set (f32 IntRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_ud2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_ud2sf($src)",
-              [(set (f32 IntRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_uw2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_uw2sf($src)",
-              [(set (f32 IntRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_w2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_w2sf($src)",
-              [(set (f32 IntRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_d2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_d2df($src)",
-              [(set (f64 DoubleRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_ud2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_ud2df($src)",
-              [(set (f64 DoubleRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_uw2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_uw2df($src)",
-              [(set (f64 DoubleRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_w2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_w2df($src)",
-              [(set (f64 DoubleRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>,
-              Requires<[HasV5T]>;
+def F2_conv_d2sf : F2_RD_RSS_CONVERT <"convert_d2sf", 0b010,
+                                       sint_to_fp, F32, I64>;
+def F2_conv_ud2sf : F2_RD_RSS_CONVERT <"convert_ud2sf", 0b001,
+                                       uint_to_fp, F32, I64>;
+def F2_conv_uw2sf : F2_RD_RS_CONVERT <"convert_uw2sf", 0b001, 0b000,
+                                       uint_to_fp, F32, I32>;
+def F2_conv_w2sf : F2_RD_RS_CONVERT <"convert_w2sf", 0b010, 0b000,
+                                       sint_to_fp, F32, I32>;
+def F2_conv_d2df : F2_RDD_RSS_CONVERT <"convert_d2df", 0b011,
+                                       sint_to_fp, F64, I64>;
+def F2_conv_ud2df : F2_RDD_RSS_CONVERT <"convert_ud2df", 0b010,
+                                        uint_to_fp, F64, I64>;
+def F2_conv_uw2df : F2_RDD_RS_CONVERT <"convert_uw2df", 0b001,
+                                       uint_to_fp, F64, I32>;
+def F2_conv_w2df : F2_RDD_RS_CONVERT <"convert_w2df", 0b010,
+                                       sint_to_fp, F64, I32>;
 
 // Convert Floating Point to Integer - default.
-def CONVERT_df2uw : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_df2uw($src):chop",
-              [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_df2w : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_df2w($src):chop",
-              [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_sf2uw : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_sf2uw($src):chop",
-              [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_sf2w : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_sf2w($src):chop",
-              [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_df2d : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_df2d($src):chop",
-              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_df2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_df2ud($src):chop",
-              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_sf2d : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_sf2d($src):chop",
-              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
-              Requires<[HasV5T]>;
-
-def CONVERT_sf2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_sf2ud($src):chop",
-              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
-              Requires<[HasV5T]>;
+def F2_conv_df2uw_chop : F2_RD_RSS_CONVERT <"convert_df2uw", 0b101,
+                                            fp_to_uint, I32, F64, ":chop">;
+def F2_conv_df2w_chop : F2_RD_RSS_CONVERT <"convert_df2w", 0b111,
+                                            fp_to_sint, I32, F64, ":chop">;
+def F2_conv_sf2uw_chop : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b001,
+                                       fp_to_uint, I32, F32, ":chop">;
+def F2_conv_sf2w_chop : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b001,
+                                       fp_to_sint, I32, F32, ":chop">;
+def F2_conv_df2d_chop : F2_RDD_RSS_CONVERT <"convert_df2d", 0b110,
+                                            fp_to_sint, I64, F64, ":chop">;
+def F2_conv_df2ud_chop : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b111,
+                                             fp_to_uint, I64, F64, ":chop">;
+def F2_conv_sf2d_chop : F2_RDD_RS_CONVERT <"convert_sf2d", 0b110,
+                                       fp_to_sint, I64, F32, ":chop">;
+def F2_conv_sf2ud_chop : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b101,
+                                            fp_to_uint, I64, F32, ":chop">;
 
 // Convert Floating Point to Integer: non-chopped.
-let AddedComplexity = 20 in
-def CONVERT_df2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_df2uw($src)",
-              [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T, IEEERndNearV5T]>;
-
-let AddedComplexity = 20 in
-def CONVERT_df2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_df2w($src)",
-              [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T, IEEERndNearV5T]>;
-
-let AddedComplexity = 20 in
-def CONVERT_sf2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_sf2uw($src)",
-              [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
-              Requires<[HasV5T, IEEERndNearV5T]>;
-
-let AddedComplexity = 20 in
-def CONVERT_sf2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_sf2w($src)",
-              [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
-              Requires<[HasV5T, IEEERndNearV5T]>;
-
-let AddedComplexity = 20 in
-def CONVERT_df2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_df2d($src)",
-              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T, IEEERndNearV5T]>;
-
-let AddedComplexity = 20 in
-def CONVERT_df2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
-              "$dst = convert_df2ud($src)",
-              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
-              Requires<[HasV5T, IEEERndNearV5T]>;
-
-let AddedComplexity = 20 in
-def CONVERT_sf2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_sf2d($src)",
-              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
-              Requires<[HasV5T, IEEERndNearV5T]>;
-
-let AddedComplexity = 20 in
-def CONVERT_sf2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
-              "$dst = convert_sf2ud($src)",
-              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
-              Requires<[HasV5T, IEEERndNearV5T]>;
-
+let AddedComplexity = 20, Predicates = [HasV5T, IEEERndNearV5T] in {
+  def F2_conv_df2d : F2_RDD_RSS_CONVERT <"convert_df2d", 0b000,
+                                         fp_to_sint, I64, F64>;
+  def F2_conv_df2ud : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b001,
+                                          fp_to_uint, I64, F64>;
+  def F2_conv_sf2ud : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b011,
+                                         fp_to_uint, I64, F32>;
+  def F2_conv_sf2d : F2_RDD_RS_CONVERT <"convert_sf2d", 0b100,
+                                         fp_to_sint, I64, F32>;
+  def F2_conv_df2uw : F2_RD_RSS_CONVERT <"convert_df2uw", 0b011,
+                                         fp_to_uint, I32, F64>;
+  def F2_conv_df2w : F2_RD_RSS_CONVERT <"convert_df2w", 0b100,
+                                         fp_to_sint, I32, F64>;
+  def F2_conv_sf2uw : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b000,
+                                         fp_to_uint, I32, F32>;
+  def F2_conv_sf2w : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b000,
+                                         fp_to_sint, I32, F32>;
+}
 
+// Fix up radicand.
+let isFP = 1, hasNewValue = 1 in
+def F2_sffixupr: SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs),
+  "$Rd = sffixupr($Rs)",
+  [], "" , S_2op_tc_3or4x_SLOT23>, Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<5> Rs;
 
-// Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp].
-def : Pat <(i32 (bitconvert (f32 IntRegs:$src))),
-           (i32 (TFR IntRegs:$src))>,
-          Requires<[HasV5T]>;
+    let IClass = 0b1000;
 
-def : Pat <(f32 (bitconvert (i32 IntRegs:$src))),
-           (f32 (TFR IntRegs:$src))>,
-          Requires<[HasV5T]>;
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = Rs;
+    let Inst{7-5}   = 0b000;
+    let Inst{4-0}   = Rd;
+  }
 
-def : Pat <(i64 (bitconvert (f64 DoubleRegs:$src))),
-           (i64 (TFR64 DoubleRegs:$src))>,
-          Requires<[HasV5T]>;
-
-def : Pat <(f64 (bitconvert (i64 DoubleRegs:$src))),
-           (f64 (TFR64 DoubleRegs:$src))>,
-          Requires<[HasV5T]>;
+// Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp].
+let Predicates = [HasV5T] in {
+  def: Pat <(i32 (bitconvert F32:$src)), (I32:$src)>;
+  def: Pat <(f32 (bitconvert I32:$src)), (F32:$src)>;
+  def: Pat <(i64 (bitconvert F64:$src)), (I64:$src)>;
+  def: Pat <(f64 (bitconvert I64:$src)), (F64:$src)>;
+}
 
-// Floating point fused multiply-add.
-def FMADD_dp : ALU64_acc<(outs DoubleRegs:$dst),
-                  (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
-              "$dst += dfmpy($src2, $src3)",
-              [(set (f64 DoubleRegs:$dst),
-                  (fma DoubleRegs:$src2, DoubleRegs:$src3, DoubleRegs:$src1))],
-                  "$src1 = $dst">,
-              Requires<[HasV5T]>;
-
-def FMADD_sp : ALU64_acc<(outs IntRegs:$dst),
-                  (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
-              "$dst += sfmpy($src2, $src3)",
-              [(set (f32 IntRegs:$dst),
-                  (fma IntRegs:$src2, IntRegs:$src3, IntRegs:$src1))],
-                  "$src1 = $dst">,
-              Requires<[HasV5T]>;
-
-
-// Floating point max/min.
-let AddedComplexity = 100 in
-def FMAX_dp : ALU64_rr<(outs DoubleRegs:$dst),
-                  (ins DoubleRegs:$src1, DoubleRegs:$src2),
-              "$dst = dfmax($src1, $src2)",
-              [(set DoubleRegs:$dst, (f64 (select (i1 (setolt DoubleRegs:$src2,
-                                                        DoubleRegs:$src1)),
-                                             DoubleRegs:$src1,
-                                             DoubleRegs:$src2)))]>,
-               Requires<[HasV5T]>;
-
-let AddedComplexity = 100 in
-def FMAX_sp : ALU64_rr<(outs IntRegs:$dst),
-                  (ins IntRegs:$src1, IntRegs:$src2),
-              "$dst = sfmax($src1, $src2)",
-              [(set IntRegs:$dst, (f32 (select (i1 (setolt IntRegs:$src2,
-                                                        IntRegs:$src1)),
-                                             IntRegs:$src1,
-                                             IntRegs:$src2)))]>,
-               Requires<[HasV5T]>;
-
-let AddedComplexity = 100 in
-def FMIN_dp : ALU64_rr<(outs DoubleRegs:$dst),
-                  (ins DoubleRegs:$src1, DoubleRegs:$src2),
-              "$dst = dfmin($src1, $src2)",
-              [(set DoubleRegs:$dst, (f64 (select (i1 (setogt DoubleRegs:$src2,
-                                                        DoubleRegs:$src1)),
-                                             DoubleRegs:$src1,
-                                             DoubleRegs:$src2)))]>,
-               Requires<[HasV5T]>;
-
-let AddedComplexity = 100 in
-def FMIN_sp : ALU64_rr<(outs IntRegs:$dst),
-                  (ins IntRegs:$src1, IntRegs:$src2),
-              "$dst = sfmin($src1, $src2)",
-              [(set IntRegs:$dst, (f32 (select (i1 (setogt IntRegs:$src2,
-                                                        IntRegs:$src1)),
-                                             IntRegs:$src1,
-                                             IntRegs:$src2)))]>,
-               Requires<[HasV5T]>;
-
-// Pseudo instruction to encode a set of conditional transfers.
-// This instruction is used instead of a mux and trades-off codesize
-// for performance. We conduct this transformation optimistically in
-// the hope that these instructions get promoted to dot-new transfers.
-let AddedComplexity = 100, isPredicated = 1 in
-def TFR_condset_rr_f : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
-                                                        IntRegs:$src2,
-                                                        IntRegs:$src3),
-                     "Error; should not emit",
-                     [(set IntRegs:$dst, (f32 (select PredRegs:$src1,
-                                                 IntRegs:$src2,
-                                                 IntRegs:$src3)))]>,
-               Requires<[HasV5T]>;
-
-let AddedComplexity = 100, isPredicated = 1 in
-def TFR_condset_rr64_f : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
-                                                        DoubleRegs:$src2,
-                                                        DoubleRegs:$src3),
-                     "Error; should not emit",
-                     [(set DoubleRegs:$dst, (f64 (select PredRegs:$src1,
-                                                 DoubleRegs:$src2,
-                                                 DoubleRegs:$src3)))]>,
-               Requires<[HasV5T]>;
-
-
-
-let AddedComplexity = 100, isPredicated = 1 in
-def TFR_condset_ri_f : ALU32_rr<(outs IntRegs:$dst),
-            (ins PredRegs:$src1, IntRegs:$src2, f32imm:$src3),
-            "Error; should not emit",
-            [(set IntRegs:$dst,
-             (f32 (select PredRegs:$src1, IntRegs:$src2, fpimm:$src3)))]>,
-               Requires<[HasV5T]>;
-
-let AddedComplexity = 100, isPredicated = 1 in
-def TFR_condset_ir_f : ALU32_rr<(outs IntRegs:$dst),
-            (ins PredRegs:$src1, f32imm:$src2, IntRegs:$src3),
-            "Error; should not emit",
-            [(set IntRegs:$dst,
-             (f32 (select PredRegs:$src1, fpimm:$src2, IntRegs:$src3)))]>,
-               Requires<[HasV5T]>;
-
-let AddedComplexity = 100, isPredicated = 1 in
-def TFR_condset_ii_f : ALU32_rr<(outs IntRegs:$dst),
-                              (ins PredRegs:$src1, f32imm:$src2, f32imm:$src3),
-                     "Error; should not emit",
-                     [(set IntRegs:$dst, (f32 (select PredRegs:$src1,
-                                                 fpimm:$src2,
-                                                 fpimm:$src3)))]>,
-               Requires<[HasV5T]>;
-
-
-def : Pat <(select (i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
-                   (f32 IntRegs:$src3),
-                   (f32 IntRegs:$src4)),
-    (TFR_condset_rr_f (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1), IntRegs:$src4,
-                      IntRegs:$src3)>, Requires<[HasV5T]>;
-
-def : Pat <(select (i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
-                   (f64 DoubleRegs:$src3),
-                   (f64 DoubleRegs:$src4)),
-      (TFR_condset_rr64_f (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1),
-                DoubleRegs:$src4, DoubleRegs:$src3)>, Requires<[HasV5T]>;
-
-// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i).
-def : Pat <(select (not PredRegs:$src1), fpimm:$src2, fpimm:$src3),
-      (TFR_condset_ii_f PredRegs:$src1, fpimm:$src3, fpimm:$src2)>;
+// F2_sffma: Floating-point fused multiply add.
+let isFP = 1, hasNewValue = 1 in
+class T_sfmpy_acc <bit isSub, bit isLib>
+  : MInst<(outs IntRegs:$Rx),
+          (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rx "#!if(isSub, "-=","+=")#" sfmpy($Rs, $Rt)"#!if(isLib, ":lib",""),
+  [], "$dst2 = $Rx" , M_tc_3_SLOT23 > ,
+  Requires<[HasV5T]> {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-21} = 0b1111000;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = Rt;
+    let Inst{7}     = 0b1;
+    let Inst{6}     = isLib;
+    let Inst{5}     = isSub;
+    let Inst{4-0}   = Rx;
+  }
+
+def F2_sffma: T_sfmpy_acc <0, 0>;
+def F2_sffms: T_sfmpy_acc <1, 0>;
+def F2_sffma_lib: T_sfmpy_acc <0, 1>;
+def F2_sffms_lib: T_sfmpy_acc <1, 1>;
+
+def : Pat <(f32 (fma F32:$src2, F32:$src3, F32:$src1)),
+           (F2_sffma F32:$src1, F32:$src2, F32:$src3)>;
+
+// Floating-point fused multiply add w/ additional scaling (2**pu).
+let isFP = 1, hasNewValue = 1 in
+def F2_sffma_sc: MInst <
+  (outs IntRegs:$Rx),
+  (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt, PredRegs:$Pu),
+  "$Rx += sfmpy($Rs, $Rt, $Pu):scale" ,
+  [], "$dst2 = $Rx" , M_tc_3_SLOT23 > ,
+  Requires<[HasV5T]> {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<5> Rt;
+    bits<2> Pu;
+
+    let IClass = 0b1110;
+
+    let Inst{27-21} = 0b1111011;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = Rt;
+    let Inst{7}     = 0b1;
+    let Inst{6-5}   = Pu;
+    let Inst{4-0}   = Rx;
+  }
+
+let isExtended = 1, isExtentSigned = 1, opExtentBits = 8, opExtendable = 3,
+    isPseudo = 1, InputType = "imm" in
+def MUX_ir_f : ALU32_rr<(outs IntRegs:$dst),
+      (ins PredRegs:$src1, IntRegs:$src2, f32Ext:$src3),
+      "$dst = mux($src1, $src2, #$src3)",
+      [(set F32:$dst, (f32 (select I1:$src1, F32:$src2, fpimm:$src3)))]>,
+    Requires<[HasV5T]>;
+
+let isExtended = 1, isExtentSigned = 1, opExtentBits = 8, opExtendable = 2,
+    isPseudo = 1, InputType = "imm" in
+def MUX_ri_f : ALU32_rr<(outs IntRegs:$dst),
+      (ins PredRegs:$src1, f32Ext:$src2, IntRegs:$src3),
+      "$dst = mux($src1, #$src2, $src3)",
+      [(set F32:$dst, (f32 (select I1:$src1, fpimm:$src2, F32:$src3)))]>,
+    Requires<[HasV5T]>;
+
+def: Pat<(select I1:$src1, F32:$src2, F32:$src3),
+         (C2_mux I1:$src1, F32:$src2, F32:$src3)>,
+     Requires<[HasV5T]>;
+
+def: Pat<(select (i1 (setult F32:$src1, F32:$src2)), F32:$src3, F32:$src4),
+         (C2_mux (F2_sfcmpgt F32:$src2, F32:$src1), F32:$src4, F32:$src3)>,
+     Requires<[HasV5T]>;
+
+def: Pat<(select I1:$src1, F64:$src2, F64:$src3),
+         (C2_vmux I1:$src1, F64:$src2, F64:$src3)>,
+    Requires<[HasV5T]>;
+
+def: Pat<(select (i1 (setult F64:$src1, F64:$src2)), F64:$src3, F64:$src4),
+         (C2_vmux (F2_dfcmpgt F64:$src2, F64:$src1), F64:$src3, F64:$src4)>,
+     Requires<[HasV5T]>;
 
 // Map from p0 = pnot(p0); r0 = select(p0, #i, r1)
-// => r0 = TFR_condset_ri(p0, r1, #i)
-def : Pat <(select (not PredRegs:$src1), fpimm:$src2, IntRegs:$src3),
-      (TFR_condset_ri_f PredRegs:$src1, IntRegs:$src3, fpimm:$src2)>;
+// => r0 = MUX_ir_f(p0, #i, r1)
+def: Pat<(select (not I1:$src1), fpimm:$src2, F32:$src3),
+         (MUX_ir_f I1:$src1, F32:$src3, fpimm:$src2)>,
+     Requires<[HasV5T]>;
 
 // Map from p0 = pnot(p0); r0 = mux(p0, r1, #i)
-// => r0 = TFR_condset_ir(p0, #i, r1)
-def : Pat <(select (not PredRegs:$src1), IntRegs:$src2, fpimm:$src3),
-      (TFR_condset_ir_f PredRegs:$src1, fpimm:$src3, IntRegs:$src2)>;
+// => r0 = MUX_ri_f(p0, r1, #i)
+def: Pat<(select (not I1:$src1), F32:$src2, fpimm:$src3),
+         (MUX_ri_f I1:$src1, fpimm:$src3, F32:$src2)>,
+     Requires<[HasV5T]>;
+
+def: Pat<(i32 (fp_to_sint F64:$src1)),
+         (LoReg (F2_conv_df2d_chop F64:$src1))>,
+     Requires<[HasV5T]>;
+
+//===----------------------------------------------------------------------===//
+// :natural forms of vasrh and vasrhub insns
+//===----------------------------------------------------------------------===//
+// S5_asrhub_rnd_sat: Vector arithmetic shift right by immediate with round,
+// saturate, and pack.
+let Defs = [USR_OVF], hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_ASRHUB<bit isSat>
+  : SInst <(outs IntRegs:$Rd),
+  (ins DoubleRegs:$Rss, u4Imm:$u4),
+  "$Rd = vasrhub($Rss, #$u4):"#!if(isSat, "sat", "raw"),
+  [], "", S_2op_tc_2_SLOT23>,
+  Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<5> Rss;
+    bits<4> u4;
+
+    let IClass = 0b1000;
+
+    let Inst{27-21} = 0b1000011;
+    let Inst{20-16} = Rss;
+    let Inst{13-12} = 0b00;
+    let Inst{11-8} = u4;
+    let Inst{7-6} = 0b10;
+    let Inst{5} = isSat;
+    let Inst{4-0} = Rd;
+  }
+
+def S5_asrhub_rnd_sat : T_ASRHUB <0>;
+def S5_asrhub_sat : T_ASRHUB <1>;
+
+let isAsmParserOnly = 1 in
+def S5_asrhub_rnd_sat_goodsyntax
+  : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, u4Imm:$u4),
+  "$Rd = vasrhub($Rss, #$u4):rnd:sat">, Requires<[HasV5T]>;
+
+// S5_vasrhrnd: Vector arithmetic shift right by immediate with round.
+let hasSideEffects = 0 in
+def S5_vasrhrnd : SInst <(outs DoubleRegs:$Rdd),
+                         (ins DoubleRegs:$Rss, u4Imm:$u4),
+  "$Rdd = vasrh($Rss, #$u4):raw">,
+  Requires<[HasV5T]> {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<4> u4;
+
+    let IClass = 0b1000;
+
+    let Inst{27-21} = 0b0000001;
+    let Inst{20-16} = Rss;
+    let Inst{13-12} = 0b00;
+    let Inst{11-8}  = u4;
+    let Inst{7-5}   = 0b000;
+    let Inst{4-0}   = Rdd;
+  }
+
+let isAsmParserOnly = 1 in
+def S5_vasrhrnd_goodsyntax
+  : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, u4Imm:$u4),
+  "$Rdd = vasrh($Rss,#$u4):rnd">, Requires<[HasV5T]>;
+
+// Floating point reciprocal square root approximation
+let Uses = [USR], isPredicateLate = 1, isFP = 1,
+    hasSideEffects = 0, hasNewValue = 1, opNewValue = 0,
+    validSubTargets = HasV5SubT in
+def F2_sfinvsqrta: SInst <
+  (outs IntRegs:$Rd, PredRegs:$Pe),
+  (ins IntRegs:$Rs),
+  "$Rd, $Pe = sfinvsqrta($Rs)" > ,
+  Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<2> Pe;
+    bits<5> Rs;
+
+    let IClass = 0b1000;
+
+    let Inst{27-21} = 0b1011111;
+    let Inst{20-16} = Rs;
+    let Inst{7} = 0b0;
+    let Inst{6-5} = Pe;
+    let Inst{4-0} = Rd;
+  }
+
+// Complex multiply 32x16
+let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
+  def M4_cmpyi_whc : T_S3op_8<"cmpyiwh", 0b101, 1, 1, 1, 1>;
+  def M4_cmpyr_whc : T_S3op_8<"cmpyrwh", 0b111, 1, 1, 1, 1>;
+}
 
-def : Pat <(i32 (fp_to_sint (f64 DoubleRegs:$src1))),
-          (i32 (EXTRACT_SUBREG (i64 (CONVERT_df2d (f64 DoubleRegs:$src1))), subreg_loreg))>,
-          Requires<[HasV5T]>;
+// Classify floating-point value
+let isFP = 1 in
+ def F2_sfclass : T_TEST_BIT_IMM<"sfclass", 0b111>;
+
+let isFP = 1 in
+def F2_dfclass: ALU64Inst<(outs PredRegs:$Pd), (ins DoubleRegs:$Rss, u5Imm:$u5),
+  "$Pd = dfclass($Rss, #$u5)",
+  [], "" , ALU64_tc_2early_SLOT23 > , Requires<[HasV5T]> {
+    bits<2> Pd;
+    bits<5> Rss;
+    bits<5> u5;
+
+    let IClass = 0b1101;
+    let Inst{27-21} = 0b1100100;
+    let Inst{20-16} = Rss;
+    let Inst{12-10} = 0b000;
+    let Inst{9-5}   = u5;
+    let Inst{4-3}   = 0b10;
+    let Inst{1-0}   = Pd;
+  }
+
+// Instructions to create floating point constant
+class T_fimm <string mnemonic, RegisterClass RC, bits<4> RegType, bit isNeg>
+  : ALU64Inst<(outs RC:$dst), (ins u10Imm:$src),
+  "$dst = "#mnemonic#"(#$src)"#!if(isNeg, ":neg", ":pos"),
+  [], "", ALU64_tc_3x_SLOT23>, Requires<[HasV5T]> {
+    bits<5> dst;
+    bits<10> src;
+
+    let IClass = 0b1101;
+    let Inst{27-24} = RegType;
+    let Inst{23}    = 0b0;
+    let Inst{22}    = isNeg;
+    let Inst{21}    = src{9};
+    let Inst{13-5}  = src{8-0};
+    let Inst{4-0}   = dst;
+  }
+
+let hasNewValue = 1, opNewValue = 0 in {
+def F2_sfimm_p : T_fimm <"sfmake", IntRegs, 0b0110, 0>;
+def F2_sfimm_n : T_fimm <"sfmake", IntRegs, 0b0110, 1>;
+}
+
+def F2_dfimm_p : T_fimm <"dfmake", DoubleRegs, 0b1001, 0>;
+def F2_dfimm_n : T_fimm <"dfmake", DoubleRegs, 0b1001, 1>;
 
 def : Pat <(fabs (f32 IntRegs:$src1)),
-           (CLRBIT_31 (f32 IntRegs:$src1), 31)>,
+           (S2_clrbit_i (f32 IntRegs:$src1), 31)>,
           Requires<[HasV5T]>;
 
 def : Pat <(fneg (f32 IntRegs:$src1)),
-           (TOGBIT_31 (f32 IntRegs:$src1), 31)>,
-          Requires<[HasV5T]>;
-
-/*
-def : Pat <(fabs (f64 DoubleRegs:$src1)),
-          (CLRBIT_31 (f32 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), 31)>,
-          Requires<[HasV5T]>;
-
-def : Pat <(fabs (f64 DoubleRegs:$src1)),
-          (CLRBIT_31 (f32 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), 31)>,
+           (S2_togglebit_i (f32 IntRegs:$src1), 31)>,
           Requires<[HasV5T]>;
-          */
diff --git a/lib/Target/Hexagon/HexagonInstrInfoVector.td b/lib/Target/Hexagon/HexagonInstrInfoVector.td
new file mode 100644
index 0000000..6e67b6e
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonInstrInfoVector.td
@@ -0,0 +1,65 @@
+//===- HexagonInstrInfoVector.td - Hexagon Vector Patterns -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon Vector instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+def V2I1:  PatLeaf<(v2i1  PredRegs:$R)>;
+def V4I1:  PatLeaf<(v4i1  PredRegs:$R)>;
+def V8I1:  PatLeaf<(v8i1  PredRegs:$R)>;
+def V4I8:  PatLeaf<(v4i8  IntRegs:$R)>;
+def V2I16: PatLeaf<(v2i16 IntRegs:$R)>;
+def V8I8:  PatLeaf<(v8i8  DoubleRegs:$R)>;
+def V4I16: PatLeaf<(v4i16 DoubleRegs:$R)>;
+def V2I32: PatLeaf<(v2i32 DoubleRegs:$R)>;
+
+// Vector shift support. Vector shifting in Hexagon is rather different
+// from internal representation of LLVM.
+// LLVM assumes all shifts (in vector case) will have the form
+// <VT> = SHL/SRA/SRL <VT> by <VT>
+// while Hexagon has the following format:
+// <VT> = SHL/SRA/SRL <VT> by <IT/i32>
+// As a result, special care is needed to guarantee correctness and
+// performance.
+class vshift_v4i16<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp>
+  : S_2OpInstImm<Str, MajOp, MinOp, u4Imm,
+      [(set (v4i16 DoubleRegs:$dst),
+            (Op (v4i16 DoubleRegs:$src1), u4ImmPred:$src2))]> {
+  bits<4> src2;
+  let Inst{11-8} = src2;
+}
+
+class vshift_v2i32<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp>
+  : S_2OpInstImm<Str, MajOp, MinOp, u5Imm,
+      [(set (v2i32 DoubleRegs:$dst),
+            (Op (v2i32 DoubleRegs:$src1), u5ImmPred:$src2))]> {
+  bits<5> src2;
+  let Inst{12-8} = src2;
+}
+
+def S2_asr_i_vw : vshift_v2i32<sra, "vasrw", 0b010, 0b000>;
+def S2_lsr_i_vw : vshift_v2i32<srl, "vlsrw", 0b010, 0b001>;
+def S2_asl_i_vw : vshift_v2i32<shl, "vaslw", 0b010, 0b010>;
+
+def S2_asr_i_vh : vshift_v4i16<sra, "vasrh", 0b100, 0b000>;
+def S2_lsr_i_vh : vshift_v4i16<srl, "vlsrh", 0b100, 0b001>;
+def S2_asl_i_vh : vshift_v4i16<shl, "vaslh", 0b100, 0b010>;
+
+// Vector shift words by register
+def S2_asr_r_vw : T_S3op_shiftVect < "vasrw", 0b00, 0b00>;
+def S2_lsr_r_vw : T_S3op_shiftVect < "vlsrw", 0b00, 0b01>;
+def S2_asl_r_vw : T_S3op_shiftVect < "vaslw", 0b00, 0b10>;
+def S2_lsl_r_vw : T_S3op_shiftVect < "vlslw", 0b00, 0b11>;
+
+// Vector shift halfwords by register
+def S2_asr_r_vh : T_S3op_shiftVect < "vasrh", 0b01, 0b00>;
+def S2_lsr_r_vh : T_S3op_shiftVect < "vlsrh", 0b01, 0b01>;
+def S2_asl_r_vh : T_S3op_shiftVect < "vaslh", 0b01, 0b10>;
+def S2_lsl_r_vh : T_S3op_shiftVect < "vlslh", 0b01, 0b11>;
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index b3385d8..c0551e8 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -13,3495 +13,1250 @@
 // March 4, 2008
 //===----------------------------------------------------------------------===//
 
-//
-// ALU 32 types.
-//
+class T_I_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID imm:$Is),
+         (MI imm:$Is)>;
 
-class qi_ALU32_sisi<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class qi_ALU32_sis10<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class qi_ALU32_sis8<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class qi_ALU32_siu8<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class qi_ALU32_siu9<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u9Imm:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class si_ALU32_qisisi<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
-                                      IntRegs:$src3),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
-                                        IntRegs:$src3))]>;
-
-class si_ALU32_qis8si<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2,
-                                       IntRegs:$src3),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, $src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2,
-                                        IntRegs:$src3))]>;
-
-class si_ALU32_qisis8<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
-                                       s8Imm:$src3),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
-                                        imm:$src3))]>;
-
-class si_ALU32_qis8s8<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2, s8Imm:$src3),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2, imm:$src3))]>;
-
-class si_ALU32_sisi<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU32_sisi_sat<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU32_sisi_rnd<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU32_sis16<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s16Imm:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class si_ALU32_sis10<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class si_ALU32_s10si<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins s10Imm:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "(#$src1, $src2)")),
-             [(set IntRegs:$dst, (IntID imm:$src1, IntRegs:$src2))]>;
-
-class si_lo_ALU32_siu16<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u16Imm:$src2),
-             !strconcat("$dst.l = ", !strconcat(opc , "#$src2")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class si_hi_ALU32_siu16<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u16Imm:$src2),
-             !strconcat("$dst.h = ", !strconcat(opc , "#$src2")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class si_ALU32_s16<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins s16Imm:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "#$src1")),
-             [(set IntRegs:$dst, (IntID imm:$src1))]>;
-
-class di_ALU32_s8<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs DoubleRegs:$dst), (ins s8Imm:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "#$src1")),
-             [(set DoubleRegs:$dst, (IntID imm:$src1))]>;
-
-class di_ALU64_di<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
-             !strconcat("$dst = ", !strconcat(opc , "$src")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>;
-
-class si_ALU32_si<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
-             !strconcat("$dst = ", !strconcat(opc , "($src)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
-
-class si_ALU32_si_tfr<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
-             !strconcat("$dst = ", !strconcat(opc , "$src")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+class T_R_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs),
+         (MI I32:$Rs)>;
 
-//
-// ALU 64 types.
-//
+class T_P_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs),
+         (MI DoubleRegs:$Rs)>;
+
+class T_II_pat <InstHexagon MI, Intrinsic IntID, PatFrag Imm1, PatFrag Imm2>
+  : Pat<(IntID Imm1:$Is, Imm2:$It),
+        (MI Imm1:$Is, Imm2:$It)>;
+
+class T_RI_pat <InstHexagon MI, Intrinsic IntID, PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+  : Pat<(IntID I32:$Rs, ImmPred:$It),
+        (MI I32:$Rs, ImmPred:$It)>;
+
+class T_IR_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred = PatLeaf<(i32 imm)>>
+  : Pat<(IntID ImmPred:$Is, I32:$Rt),
+        (MI ImmPred:$Is, I32:$Rt)>;
+
+class T_PI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID I64:$Rs, imm:$It),
+        (MI DoubleRegs:$Rs, imm:$It)>;
+
+class T_RP_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID I32:$Rs, I64:$Rt),
+        (MI I32:$Rs, DoubleRegs:$Rt)>;
+
+class T_RR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, I32:$Rt),
+         (MI I32:$Rs, I32:$Rt)>;
+
+class T_PP_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I64:$Rt),
+         (MI DoubleRegs:$Rs, DoubleRegs:$Rt)>;
 
-class si_ALU64_si_sat<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
-             !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
-
-class si_ALU64_didi<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
-
-class di_ALU64_sidi<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2))]>;
-
-class di_ALU64_didi<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class di_ALU64_qididi<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2,
-                                          DoubleRegs:$src3),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2,
-                                           DoubleRegs:$src3))]>;
-
-class di_ALU64_sisi<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_ALU64_didi_sat<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class di_ALU64_didi_rnd<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class di_ALU64_didi_crnd<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):crnd")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class di_ALU64_didi_rnd_sat<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd:sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class di_ALU64_didi_crnd_sat<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):crnd:sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class qi_ALU64_didi<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set PredRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
-
-class si_ALU64_sisi<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_sat_lh<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_l16_sat_hh<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_l16_sat_lh<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_l16_sat_hl<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_l16_sat_ll<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_l16_hh<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_l16_hl<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_l16_lh<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_l16_ll<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_h16_sat_hh<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1.H, $src2.H):sat:<<16")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_h16_sat_lh<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1.L, $src2.H):sat:<<16")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_h16_sat_hl<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1.H, $src2.L):sat:<<16")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_h16_sat_ll<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1.L, $src2.L):sat:<<16")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_h16_hh<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<16")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_h16_hl<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<16")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_h16_lh<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<16")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_h16_ll<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<16")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_lh<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_ll<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_ALU64_sisi_sat<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+class T_QII_pat <InstHexagon MI, Intrinsic IntID, PatFrag Imm1, PatFrag Imm2>
+  : Pat <(IntID (i32 PredRegs:$Ps), Imm1:$Is, Imm2:$It),
+         (MI PredRegs:$Ps, Imm1:$Is, Imm2:$It)>;
 
-//
-// SInst classes.
-//
+class T_QRI_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred>
+  : Pat <(IntID (i32 PredRegs:$Ps), I32:$Rs, ImmPred:$Is),
+         (MI PredRegs:$Ps, I32:$Rs, ImmPred:$Is)>;
 
-class qi_SInst_qi<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src),
-             !strconcat("$dst = ", !strconcat(opc , "($src)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src))]>;
-
-class qi_SInst_qi_pxfer<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src),
-             !strconcat("$dst = ", !strconcat(opc , "$src")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src))]>;
-
-class qi_SInst_qiqi<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class qi_SInst_qiqi_neg<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, !$src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_SInst_di<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
-             !strconcat("$dst = ", !strconcat(opc , "($src)")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>;
-
-class di_SInst_di_sat<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
-             !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>;
-
-class si_SInst_di<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src),
-          !strconcat("$dst = ", !strconcat(opc , "($src)")),
-          [(set IntRegs:$dst, (IntID DoubleRegs:$src))]>;
-
-class si_SInst_di_sat<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src),
-          !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
-          [(set IntRegs:$dst, (IntID DoubleRegs:$src))]>;
-
-class di_SInst_disi<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
-
-class di_SInst_didi<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
-
-class di_SInst_si<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
-          !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-          [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
-
-class si_SInst_sisiu3<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u3Imm:$src3),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
-                                     imm:$src3))]>;
-
-class si_SInst_diu5<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u5Imm:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
-
-class si_SInst_disi<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
-
-class si_SInst_sidi<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2))]>;
-
-class di_SInst_disisi<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2,
-                                       IntRegs:$src3),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2,
-                                        IntRegs:$src3))]>;
-
-class di_SInst_sisi<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-          [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class qi_SInst_siu5<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-          [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class qi_SInst_siu6<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u6Imm:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-          [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class qi_SInst_sisi<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-          [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_SInst_si<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src),
-          !strconcat("$dst = ", !strconcat(opc , "($src)")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
-
-class si_SInst_si_sat<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src),
-          !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
-
-class di_SInst_qi<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src),
-          !strconcat("$dst = ", !strconcat(opc , "($src)")),
-          [(set DoubleRegs:$dst, (IntID IntRegs:$src))]>;
-
-class si_SInst_qi<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src),
-          !strconcat("$dst = ", !strconcat(opc , "$src")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
-
-class si_SInst_qiqi<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class qi_SInst_si<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src),
-          !strconcat("$dst = ", !strconcat(opc , "$src")),
-          [(set PredRegs:$dst, (IntID IntRegs:$src))]>;
-
-class si_SInst_sisi<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_SInst_diu6<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
-
-class si_SInst_siu5<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class si_SInst_siu5_rnd<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class si_SInst_siu5u5<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2, u5Imm:$src3),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2, imm:$src3))]>;
-
-class si_SInst_sisisi_acc<string opc, Intrinsic IntID>
-  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-              !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
-              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                         IntRegs:$src2))],
-              "$dst2 = $dst">;
-
-class si_SInst_sisisi_nac<string opc, Intrinsic IntID>
-  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-              !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")),
-              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                         IntRegs:$src2))],
-              "$dst2 = $dst">;
-
-class di_SInst_didisi_acc<string opc, Intrinsic IntID>
-  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                             DoubleRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_SInst_didisi_nac<string opc, Intrinsic IntID>
-  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           IntRegs:$src2),
-          !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                        DoubleRegs:$src1, IntRegs:$src2))],
-          "$dst2 = $dst">;
-
-class si_SInst_sisiu5u5<string opc, Intrinsic IntID>
-  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        u5Imm:$src2, u5Imm:$src3),
-              !strconcat("$dst = ", !strconcat(opc ,
-                                               "($src1, #$src2, #$src3)")),
-              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                         imm:$src2, imm:$src3))],
-              "$dst2 = $dst">;
-
-class si_SInst_sisidi<string opc, Intrinsic IntID>
-  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        DoubleRegs:$src2),
-              !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                         DoubleRegs:$src2))],
-              "$dst2 = $dst">;
-
-class di_SInst_didiu6u6<string opc, Intrinsic IntID>
-  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           u6Imm:$src2, u6Imm:$src3),
-              !strconcat("$dst = ", !strconcat(opc ,
-                                               "($src1, #$src2, #$src3)")),
-              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
-                                            imm:$src2, imm:$src3))],
-              "$dst2 = $dst">;
-
-class di_SInst_dididi<string opc, Intrinsic IntID>
-  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           DoubleRegs:$src2),
-              !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                            DoubleRegs:$src1,
-                                            DoubleRegs:$src2))],
-              "$dst2 = $dst">;
-
-class di_SInst_diu6u6<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2,
-                                       u6Imm:$src3),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2,
-                                        imm:$src3))]>;
-
-class di_SInst_didiqi<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
-                                       IntRegs:$src3),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2,
-                                        IntRegs:$src3))]>;
-
-class di_SInst_didiu3<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
-                                       u3Imm:$src3),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2,
-                                        imm:$src3))]>;
-
-class di_SInst_didisi_or<string opc, Intrinsic IntID>
-  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           IntRegs:$src2),
-          !strconcat("$dst |= ", !strconcat(opc , "($src1, $src2)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
-                                        IntRegs:$src2))],
-          "$dst2 = $dst">;
-
-class di_SInst_didisi_and<string opc, Intrinsic IntID>
-  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           IntRegs:$src2),
-          !strconcat("$dst &= ", !strconcat(opc , "($src1, $src2)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
-                                        IntRegs:$src2))],
-          "$dst2 = $dst">;
-
-class di_SInst_didiu6_and<string opc, Intrinsic IntID>
-  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           u6Imm:$src2),
-          !strconcat("$dst &= ", !strconcat(opc , "($src1, #$src2)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
-                                        imm:$src2))],
-          "$dst2 = $dst">;
-
-class di_SInst_didiu6_or<string opc, Intrinsic IntID>
-  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           u6Imm:$src2),
-          !strconcat("$dst |= ", !strconcat(opc , "($src1, #$src2)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
-                                        imm:$src2))],
-          "$dst2 = $dst">;
-
-class di_SInst_didiu6_xor<string opc, Intrinsic IntID>
-  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           u6Imm:$src2),
-          !strconcat("$dst ^= ", !strconcat(opc , "($src1, #$src2)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
-                                        imm:$src2))],
-          "$dst2 = $dst">;
-
-class si_SInst_sisisi_and<string opc, Intrinsic IntID>
-  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-              !strconcat("$dst &= ", !strconcat(opc , "($src1, $src2)")),
-              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                         IntRegs:$src2))],
-              "$dst2 = $dst">;
-
-class si_SInst_sisisi_or<string opc, Intrinsic IntID>
-  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-              !strconcat("$dst |= ", !strconcat(opc , "($src1, $src2)")),
-              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                         IntRegs:$src2))],
-              "$dst2 = $dst">;
-
-
-class si_SInst_sisiu5_and<string opc, Intrinsic IntID>
-  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        u5Imm:$src2),
-              !strconcat("$dst &= ", !strconcat(opc , "($src1, #$src2)")),
-              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                         imm:$src2))],
-              "$dst2 = $dst">;
-
-class si_SInst_sisiu5_or<string opc, Intrinsic IntID>
-  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        u5Imm:$src2),
-              !strconcat("$dst |= ", !strconcat(opc , "($src1, #$src2)")),
-              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                         imm:$src2))],
-              "$dst2 = $dst">;
-
-class si_SInst_sisiu5_xor<string opc, Intrinsic IntID>
-  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        u5Imm:$src2),
-              !strconcat("$dst ^= ", !strconcat(opc , "($src1, #$src2)")),
-              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                         imm:$src2))],
-              "$dst2 = $dst">;
-
-class si_SInst_sisiu5_acc<string opc, Intrinsic IntID>
-  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        u5Imm:$src2),
-              !strconcat("$dst += ", !strconcat(opc , "($src1, #$src2)")),
-              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                         imm:$src2))],
-              "$dst2 = $dst">;
-
-class si_SInst_sisiu5_nac<string opc, Intrinsic IntID>
-  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        u5Imm:$src2),
-              !strconcat("$dst -= ", !strconcat(opc , "($src1, #$src2)")),
-              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                         imm:$src2))],
-              "$dst2 = $dst">;
-
-class di_SInst_didiu6_acc<string opc, Intrinsic IntID>
-  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           u5Imm:$src2),
-              !strconcat("$dst += ", !strconcat(opc , "($src1, #$src2)")),
-              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                            DoubleRegs:$src1, imm:$src2))],
-              "$dst2 = $dst">;
-
-class di_SInst_didiu6_nac<string opc, Intrinsic IntID>
-  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           u5Imm:$src2),
-              !strconcat("$dst -= ", !strconcat(opc , "($src1, #$src2)")),
-              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
-                                            imm:$src2))],
-              "$dst2 = $dst">;
+class T_QIR_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred>
+  : Pat <(IntID (i32 PredRegs:$Ps), ImmPred:$Is, I32:$Rs),
+         (MI PredRegs:$Ps, ImmPred:$Is, I32:$Rs)>;
 
+class T_RRI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, I32:$Rt, imm:$Iu),
+         (MI I32:$Rs, I32:$Rt, imm:$Iu)>;
 
-//
-// MInst classes.
-//
+class T_RII_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, imm:$It, imm:$Iu),
+         (MI I32:$Rs, imm:$It, imm:$Iu)>;
 
-class di_MInst_sisi_rnd_hh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.H, $src2.H):<<1:rnd")),
-               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_rnd_hh<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.H, $src2.H):rnd")),
-               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_rnd_hl_s1<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.H, $src2.L):<<1:rnd")),
-               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_rnd_hl<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.H, $src2.L):rnd")),
-               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_rnd_lh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.L, $src2.H):<<1:rnd")),
-               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_rnd_lh<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.L, $src2.H):rnd")),
-               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_rnd_ll_s1<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.L, $src2.L):<<1:rnd")),
-               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_rnd_ll<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.L, $src2.L):rnd")),
-               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_disisi_acc<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-             !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2))],
-             "$dst2 = $dst">;
-
-class di_MInst_disisi_nac<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-             !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2))],
-             "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-             !strconcat("$dst += ", !strconcat(opc , "($src1, $src2):sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2))],
-             "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-             !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2):sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2))],
-             "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_sat_conj<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-             !strconcat("$dst += ", !strconcat(opc , "($src1, $src2*):sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2))],
-             "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_sat_conj<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-             !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2*):sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2))],
-             "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_s1_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-             !strconcat("$dst -= ", !strconcat(opc ,
-                                               "($src1, $src2):<<1:sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2))],
-             "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_s1_sat_conj<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-             !strconcat("$dst += ", !strconcat(opc ,
-                                               "($src1, $src2*):<<1:sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2))],
-             "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_s1_sat_conj<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-             !strconcat("$dst -= ", !strconcat(opc ,
-                                               "($src1, $src2*):<<1:sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2))],
-             "$dst2 = $dst">;
-
-class di_MInst_s8s8<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins s8Imm:$src1, s8Imm:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "(#$src1, #$src2)")),
-             [(set DoubleRegs:$dst, (IntID imm:$src1, imm:$src2))]>;
-
-class si_MInst_sis9<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Imm:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class si_MInst_sisi<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_hh<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_hh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<1")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_lh<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_lh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<1")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_hl<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_hl_s1<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<1")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_ll<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_ll_s1<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<1")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-
-class si_MInst_sisi_hh<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_hh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<1")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_lh<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_lh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<1")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_hl<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_hl_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<1")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_ll<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_ll_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<1")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_up<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_didi<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class di_MInst_didi_conj<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2*)")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class di_MInst_sisi_s1_sat_conj<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, $src2*):<<1:sat")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_didi_s1_rnd_sat<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, $src2):<<1:rnd:sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class di_MInst_didi_sat<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class di_MInst_didi_rnd_sat<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, $src2):rnd:sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class si_SInst_sisi_sat<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_SInst_didi_sat<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
-          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
-
-class si_SInst_disi_s1_rnd_sat<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, $src2):<<1:rnd:sat")),
-             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_s1_rnd_sat<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, $src2):<<1:rnd:sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_l_s1_rnd_sat<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, $src2.L):<<1:rnd:sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_h_s1_rnd_sat<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, $src2.H):<<1:rnd:sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_rnd_sat_conj<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, $src2*):rnd:sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_s1_rnd_sat_conj<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, $src2*):<<1:rnd:sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_rnd_sat<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, $src2):rnd:sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_rnd<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisisi_xacc<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
-                                        IntRegs:$src3),
-             !strconcat("$dst ^= ", !strconcat(opc , "($src2, $src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
-                                        IntRegs:$src3))],
-             "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
-                                        IntRegs:$src3),
-             !strconcat("$dst += ", !strconcat(opc , "($src2, $src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
-                                        IntRegs:$src3))],
-             "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
-                                        IntRegs:$src3),
-             !strconcat("$dst -= ", !strconcat(opc , "($src2, $src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
-                                        IntRegs:$src3))],
-             "$dst2 = $dst">;
-
-class si_MInst_sisis8_acc<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
-                                        s8Imm:$src3),
-             !strconcat("$dst += ", !strconcat(opc , "($src2, #$src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
-                                        imm:$src3))],
-             "$dst2 = $dst">;
-
-class si_MInst_sisis8_nac<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
-                                        s8Imm:$src3),
-             !strconcat("$dst -= ", !strconcat(opc , "($src2, #$src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
-                                        imm:$src3))],
-             "$dst2 = $dst">;
-
-class si_MInst_sisiu4u5<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        u4Imm:$src2, u5Imm:$src3),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1, #$src2, #$src3)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          imm:$src2, imm:$src3))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisiu8_acc<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
-                                        u8Imm:$src3),
-               !strconcat("$dst += ", !strconcat(opc , "($src2, #$src3)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
-                                          imm:$src3))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisiu8_nac<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
-                                        u8Imm:$src3),
-               !strconcat("$dst -= ", !strconcat(opc , "($src2, #$src3)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
-                                          imm:$src3))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_hh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.H)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_sat_lh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.L, $src2.H):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_sat_lh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.L, $src2.H):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_sat_hh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.H, $src2.H):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_sat_hh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.H, $src2.H):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_hh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.H, $src2.H):<<1")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_hh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.H)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_sat_hh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.H, $src2.H):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_sat_hh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.H, $src2.H):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_sat_hl_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.H, $src2.L):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_sat_hl<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.H, $src2.L):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_sat_lh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.L, $src2.H):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_sat_lh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.L, $src2.H):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_sat_ll_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.L, $src2.L):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_sat_ll<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.L, $src2.L):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_hh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.H, $src2.H):<<1")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_hl<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.L)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_hl_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.H, $src2.L):<<1")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_hl<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.L)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_hl_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.H, $src2.L):<<1")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_lh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.H)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_lh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.L, $src2.H):<<1")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_lh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.H)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_lh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.L, $src2.H):<<1")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_ll<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.L)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_ll_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.L, $src2.L):<<1")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_sat_ll_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.L, $src2.L):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_sat_hl_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.H, $src2.L):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_sat_ll<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.L, $src2.L):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_acc_sat_hl<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1.H, $src2.L):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_ll<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.L)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_ll_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.L, $src2.L):<<1")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_hh_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.H, $src2.H):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_hh_s1_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.H, $src2.H):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_hl_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.H, $src2.L):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_hl_s1_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.H, $src2.L):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_lh_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.L, $src2.H):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_lh_s1_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.L, $src2.H):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_ll_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.L, $src2.L):sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_nac_ll_s1_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1.L, $src2.L):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                          IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_ALU32_sisi<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_sat<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_sat_conj<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2*):sat")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_sisi_s1_sat<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_didi_s1_sat<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                           DoubleRegs:$src2))]>;
-
-class si_MInst_didi_s1_rnd_sat<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, $src2):<<1:rnd:sat")),
-             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
-
-class si_MInst_didi_rnd_sat<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd:sat")),
-             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
-
-class si_MInst_sisi_sat_hh<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_hh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.H, $src2.H):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_hl<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_hl_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.H, $src2.L):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_lh<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_lh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.L, $src2.H):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_ll<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):sat")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_ll_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.L, $src2.L):<<1:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_rnd_hh<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.H, $src2.H):rnd:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_rnd_hh<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.H, $src2.H):rnd")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_rnd_hh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ", !strconcat(opc ,
-                                                "($src1.H, $src2.H):<<1:rnd")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_rnd_hh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc ,
-                                     "($src1.H, $src2.H):<<1:rnd:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_rnd_hl<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.H, $src2.L):rnd")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_rnd_hl_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.H, $src2.L):<<1:rnd")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_rnd_hl<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.H, $src2.L):rnd:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_rnd_hl_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.H, $src2.L):<<1:rnd:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_rnd_lh<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.L, $src2.H):rnd")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_rnd_lh<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.L, $src2.H):rnd:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_rnd_lh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.L, $src2.H):<<1:rnd:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_rnd_lh_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.L, $src2.H):<<1:rnd")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_rnd_ll<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.L, $src2.L):rnd:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_sat_rnd_ll_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.L, $src2.L):<<1:rnd:sat")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_rnd_ll<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.L, $src2.L):rnd")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_sisi_rnd_ll_s1<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-               !strconcat("$dst = ",
-                          !strconcat(opc , "($src1.L, $src2.L):<<1:rnd")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_dididi_acc_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2,
-                                           DoubleRegs:$src1, DoubleRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2):sat")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                             DoubleRegs:$src1,
-                                             DoubleRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_dididi_acc_rnd_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           DoubleRegs:$src2),
-               !strconcat("$dst += ",
-                          !strconcat(opc , "($src1, $src2):rnd:sat")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                             DoubleRegs:$src1,
-                                             DoubleRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_dididi_acc_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2,
-                                           DoubleRegs:$src1,
-                                           DoubleRegs:$src2),
-               !strconcat("$dst += ",
-                          !strconcat(opc , "($src1, $src2):<<1")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                             DoubleRegs:$src1,
-                                             DoubleRegs:$src2))],
-               "$dst2 = $dst">;
-
-
-class di_MInst_dididi_acc_s1_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2,
-                                           DoubleRegs:$src1,
-                                           DoubleRegs:$src2),
-               !strconcat("$dst += ",
-                          !strconcat(opc , "($src1, $src2):<<1:sat")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                             DoubleRegs:$src1,
-                                             DoubleRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_dididi_acc_s1_rnd_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           DoubleRegs:$src2),
-               !strconcat("$dst += ",
-                          !strconcat(opc , "($src1, $src2):<<1:rnd:sat")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                             DoubleRegs:$src1,
-                                             DoubleRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_dididi_acc<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           DoubleRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                             DoubleRegs:$src1,
-                                             DoubleRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_dididi_acc_conj<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           DoubleRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2*)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                             DoubleRegs:$src1,
-                                             DoubleRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_hh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.H)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_hl<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.L)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_lh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.H)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_ll<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.L)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_hh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst += ",
-                          !strconcat(opc , "($src1.H, $src2.H):<<1")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_hl_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst += ",
-                          !strconcat(opc , "($src1.H, $src2.L):<<1")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_lh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst += ",
-                          !strconcat(opc , "($src1.L, $src2.H):<<1")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_ll_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst += ",
-                          !strconcat(opc , "($src1.L, $src2.L):<<1")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_hh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.H)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_hl<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.L)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_lh<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.H)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_ll<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.L)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_hh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst -= ",
-                          !strconcat(opc , "($src1.H, $src2.H):<<1")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_hl_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst -= ",
-                          !strconcat(opc , "($src1.H, $src2.L):<<1")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_lh_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst -= ",
-                          !strconcat(opc , "($src1.L, $src2.H):<<1")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_nac_ll_s1<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst -= ",
-                          !strconcat(opc , "($src1.L, $src2.L):<<1")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disisi_acc_s1_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst += ",
-                          !strconcat(opc , "($src1, $src2):<<1:sat")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class di_MInst_disi_s1_sat<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
-
-class di_MInst_didisi_acc_s1_sat<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           IntRegs:$src2),
-               !strconcat("$dst += ",
-                          !strconcat(opc , "($src1, $src2):<<1:sat")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
-                                             DoubleRegs:$src1,
-                                             IntRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_disi_s1_rnd_sat<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ",
-                        !strconcat(opc , "($src1, $src2):<<1:rnd:sat")),
-             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
-
-class si_MInst_didi<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
-
-
-class T_RI_pat <InstHexagon MI, Intrinsic IntID>
-  : Pat<(IntID (i32 IntRegs:$Rs), imm:$It),
-        (MI IntRegs:$Rs, imm:$It)>;
+class T_IRI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID imm:$It, I32:$Rs, imm:$Iu),
+         (MI imm:$It, I32:$Rs, imm:$Iu)>;
 
-//
-// LDInst classes.
-//
-let mayLoad = 1, neverHasSideEffects = 1 in
-class di_LDInstPI_diu4<string opc, Intrinsic IntID>
-  : LDInstPI<(outs IntRegs:$dst, DoubleRegs:$dst2),
-           (ins IntRegs:$src1, IntRegs:$src2, CRRegs:$src3, s4Imm:$offset),
-           "$dst2 = memd($src1++#$offset:circ($src3))",
-           [],
-           "$src1 = $dst">;
+class T_IRR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID imm:$Is, I32:$Rs, I32:$Rt),
+         (MI imm:$Is, I32:$Rs, I32:$Rt)>;
 
-/********************************************************************
-*            ALU32/ALU                                              *
-*********************************************************************/
+class T_RIR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, imm:$Is, I32:$Rt),
+         (MI I32:$Rs, imm:$Is, I32:$Rt)>;
 
-// ALU32 / ALU / Add.
-def HEXAGON_A2_add:
-  si_ALU32_sisi                   <"add",      int_hexagon_A2_add>;
-def HEXAGON_A2_addi:
-  si_ALU32_sis16                  <"add",      int_hexagon_A2_addi>;
-
-// ALU32 / ALU / Logical operations.
-def HEXAGON_A2_and:
-  si_ALU32_sisi                   <"and",      int_hexagon_A2_and>;
-def HEXAGON_A2_andir:
-  si_ALU32_sis10                  <"and",      int_hexagon_A2_andir>;
-def HEXAGON_A2_not:
-  si_ALU32_si                     <"not",      int_hexagon_A2_not>;
-def HEXAGON_A2_or:
-  si_ALU32_sisi                   <"or",       int_hexagon_A2_or>;
-def HEXAGON_A2_orir:
-  si_ALU32_sis10                  <"or",       int_hexagon_A2_orir>;
-def HEXAGON_A2_xor:
-  si_ALU32_sisi                   <"xor",      int_hexagon_A2_xor>;
-
-// ALU32 / ALU / Negate.
-def HEXAGON_A2_neg:
-  si_ALU32_si                     <"neg",      int_hexagon_A2_neg>;
-
-// ALU32 / ALU / Subtract.
-def HEXAGON_A2_sub:
-  si_ALU32_sisi                   <"sub",      int_hexagon_A2_sub>;
-def HEXAGON_A2_subri:
-  si_ALU32_s10si                  <"sub",      int_hexagon_A2_subri>;
-
-// ALU32 / ALU / Transfer Immediate.
-def HEXAGON_A2_tfril:
-  si_lo_ALU32_siu16               <"",         int_hexagon_A2_tfril>;
-def HEXAGON_A2_tfrih:
-  si_hi_ALU32_siu16               <"",         int_hexagon_A2_tfrih>;
-def HEXAGON_A2_tfrsi:
-  si_ALU32_s16                    <"",         int_hexagon_A2_tfrsi>;
-def HEXAGON_A2_tfrpi:
-  di_ALU32_s8                     <"",         int_hexagon_A2_tfrpi>;
-
-// ALU32 / ALU / Transfer Register.
-def HEXAGON_A2_tfr:
-  si_ALU32_si_tfr                  <"",        int_hexagon_A2_tfr>;
+class T_RRR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, I32:$Rt, I32:$Ru),
+         (MI I32:$Rs, I32:$Rt, I32:$Ru)>;
 
-/********************************************************************
-*            ALU32/PERM                                             *
-*********************************************************************/
+class T_PPI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I64:$Rt, imm:$Iu),
+         (MI DoubleRegs:$Rs, DoubleRegs:$Rt, imm:$Iu)>;
 
-// ALU32 / PERM / Combine.
-def HEXAGON_A2_combinew:
-  di_ALU32_sisi                   <"combine",  int_hexagon_A2_combinew>;
-def HEXAGON_A2_combine_hh:
-  si_MInst_sisi_hh                <"combine",  int_hexagon_A2_combine_hh>;
-def HEXAGON_A2_combine_lh:
-  si_MInst_sisi_lh                <"combine",  int_hexagon_A2_combine_lh>;
-def HEXAGON_A2_combine_hl:
-  si_MInst_sisi_hl                <"combine",  int_hexagon_A2_combine_hl>;
-def HEXAGON_A2_combine_ll:
-  si_MInst_sisi_ll                <"combine",  int_hexagon_A2_combine_ll>;
-def HEXAGON_A2_combineii:
-  di_MInst_s8s8                   <"combine",  int_hexagon_A2_combineii>;
-
-// ALU32 / PERM / Mux.
-def HEXAGON_C2_mux:
-  si_ALU32_qisisi                 <"mux",      int_hexagon_C2_mux>;
-def HEXAGON_C2_muxri:
-  si_ALU32_qis8si                 <"mux",      int_hexagon_C2_muxri>;
-def HEXAGON_C2_muxir:
-  si_ALU32_qisis8                 <"mux",      int_hexagon_C2_muxir>;
-def HEXAGON_C2_muxii:
-  si_ALU32_qis8s8                 <"mux",      int_hexagon_C2_muxii>;
-
-// ALU32 / PERM / Shift halfword.
-def HEXAGON_A2_aslh:
-  si_ALU32_si                     <"aslh",     int_hexagon_A2_aslh>;
-def HEXAGON_A2_asrh:
-  si_ALU32_si                     <"asrh",     int_hexagon_A2_asrh>;
-def SI_to_SXTHI_asrh:
-  si_ALU32_si                     <"asrh",     int_hexagon_SI_to_SXTHI_asrh>;
-
-// ALU32 / PERM / Sign/zero extend.
-def HEXAGON_A2_sxth:
-  si_ALU32_si                     <"sxth",     int_hexagon_A2_sxth>;
-def HEXAGON_A2_sxtb:
-  si_ALU32_si                     <"sxtb",     int_hexagon_A2_sxtb>;
-def HEXAGON_A2_zxth:
-  si_ALU32_si                     <"zxth",     int_hexagon_A2_zxth>;
-def HEXAGON_A2_zxtb:
-  si_ALU32_si                     <"zxtb",     int_hexagon_A2_zxtb>;
+class T_PII_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, imm:$It, imm:$Iu),
+         (MI DoubleRegs:$Rs, imm:$It, imm:$Iu)>;
 
-/********************************************************************
-*            ALU32/PRED                                             *
-*********************************************************************/
+class T_PPP_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I64:$Rt, I64:$Ru),
+         (MI DoubleRegs:$Rs, DoubleRegs:$Rt, DoubleRegs:$Ru)>;
 
-// ALU32 / PRED / Compare.
-def HEXAGON_C2_cmpeq:
-  qi_ALU32_sisi                   <"cmp.eq",   int_hexagon_C2_cmpeq>;
-def HEXAGON_C2_cmpeqi:
-  qi_ALU32_sis10                  <"cmp.eq",   int_hexagon_C2_cmpeqi>;
-def HEXAGON_C2_cmpgei:
-  qi_ALU32_sis8                   <"cmp.ge",   int_hexagon_C2_cmpgei>;
-def HEXAGON_C2_cmpgeui:
-  qi_ALU32_siu8                   <"cmp.geu",  int_hexagon_C2_cmpgeui>;
-def HEXAGON_C2_cmpgt:
-  qi_ALU32_sisi                   <"cmp.gt",   int_hexagon_C2_cmpgt>;
-def HEXAGON_C2_cmpgti:
-  qi_ALU32_sis10                  <"cmp.gt",   int_hexagon_C2_cmpgti>;
-def HEXAGON_C2_cmpgtu:
-  qi_ALU32_sisi                   <"cmp.gtu",  int_hexagon_C2_cmpgtu>;
-def HEXAGON_C2_cmpgtui:
-  qi_ALU32_siu9                   <"cmp.gtu",  int_hexagon_C2_cmpgtui>;
-def HEXAGON_C2_cmplt:
-  qi_ALU32_sisi                   <"cmp.lt",   int_hexagon_C2_cmplt>;
-def HEXAGON_C2_cmpltu:
-  qi_ALU32_sisi                   <"cmp.ltu",  int_hexagon_C2_cmpltu>;
+class T_PPR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I64:$Rt, I32:$Ru),
+         (MI DoubleRegs:$Rs, DoubleRegs:$Rt, I32:$Ru)>;
 
-/********************************************************************
-*            ALU32/VH                                               *
-*********************************************************************/
+class T_PRR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I32:$Rt, I32:$Ru),
+         (MI DoubleRegs:$Rs, I32:$Rt, I32:$Ru)>;
 
-// ALU32 / VH / Vector add halfwords.
-// Rd32=vadd[u]h(Rs32,Rt32:sat]
-def HEXAGON_A2_svaddh:
-  si_ALU32_sisi                   <"vaddh",    int_hexagon_A2_svaddh>;
-def HEXAGON_A2_svaddhs:
-  si_ALU32_sisi_sat               <"vaddh",    int_hexagon_A2_svaddhs>;
-def HEXAGON_A2_svadduhs:
-  si_ALU32_sisi_sat               <"vadduh",   int_hexagon_A2_svadduhs>;
-
-// ALU32 / VH / Vector average halfwords.
-def HEXAGON_A2_svavgh:
-  si_ALU32_sisi                   <"vavgh",    int_hexagon_A2_svavgh>;
-def HEXAGON_A2_svavghs:
-  si_ALU32_sisi_rnd               <"vavgh",    int_hexagon_A2_svavghs>;
-def HEXAGON_A2_svnavgh:
-  si_ALU32_sisi                   <"vnavgh",   int_hexagon_A2_svnavgh>;
-
-// ALU32 / VH / Vector subtract halfwords.
-def HEXAGON_A2_svsubh:
-  si_ALU32_sisi                   <"vsubh",    int_hexagon_A2_svsubh>;
-def HEXAGON_A2_svsubhs:
-  si_ALU32_sisi_sat               <"vsubh",    int_hexagon_A2_svsubhs>;
-def HEXAGON_A2_svsubuhs:
-  si_ALU32_sisi_sat               <"vsubuh",   int_hexagon_A2_svsubuhs>;
+class T_PPQ_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I64:$Rt, (i32 PredRegs:$Ru)),
+         (MI DoubleRegs:$Rs, DoubleRegs:$Rt, PredRegs:$Ru)>;
 
-/********************************************************************
-*            ALU64/ALU                                              *
-*********************************************************************/
+class T_PR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I32:$Rt),
+         (MI DoubleRegs:$Rs, I32:$Rt)>;
 
-// ALU64 / ALU / Add.
-def HEXAGON_A2_addp:
-  di_ALU64_didi                   <"add",      int_hexagon_A2_addp>;
-def HEXAGON_A2_addsat:
-  si_ALU64_sisi_sat               <"add",      int_hexagon_A2_addsat>;
-
-// ALU64 / ALU / Add halfword.
-// Even though the definition says hl, it should be lh -
-//so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits.
-def HEXAGON_A2_addh_l16_hl:
-  si_ALU64_sisi_l16_lh            <"add",      int_hexagon_A2_addh_l16_hl>;
-def HEXAGON_A2_addh_l16_ll:
-  si_ALU64_sisi_l16_ll            <"add",      int_hexagon_A2_addh_l16_ll>;
-
-def HEXAGON_A2_addh_l16_sat_hl:
-  si_ALU64_sisi_l16_sat_lh        <"add",      int_hexagon_A2_addh_l16_sat_hl>;
-def HEXAGON_A2_addh_l16_sat_ll:
-  si_ALU64_sisi_l16_sat_ll        <"add",      int_hexagon_A2_addh_l16_sat_ll>;
-
-def HEXAGON_A2_addh_h16_hh:
-  si_ALU64_sisi_h16_hh            <"add",      int_hexagon_A2_addh_h16_hh>;
-def HEXAGON_A2_addh_h16_hl:
-  si_ALU64_sisi_h16_hl            <"add",      int_hexagon_A2_addh_h16_hl>;
-def HEXAGON_A2_addh_h16_lh:
-  si_ALU64_sisi_h16_lh            <"add",      int_hexagon_A2_addh_h16_lh>;
-def HEXAGON_A2_addh_h16_ll:
-  si_ALU64_sisi_h16_ll            <"add",      int_hexagon_A2_addh_h16_ll>;
-
-def HEXAGON_A2_addh_h16_sat_hh:
-  si_ALU64_sisi_h16_sat_hh        <"add",      int_hexagon_A2_addh_h16_sat_hh>;
-def HEXAGON_A2_addh_h16_sat_hl:
-  si_ALU64_sisi_h16_sat_hl        <"add",      int_hexagon_A2_addh_h16_sat_hl>;
-def HEXAGON_A2_addh_h16_sat_lh:
-  si_ALU64_sisi_h16_sat_lh        <"add",      int_hexagon_A2_addh_h16_sat_lh>;
-def HEXAGON_A2_addh_h16_sat_ll:
-  si_ALU64_sisi_h16_sat_ll        <"add",      int_hexagon_A2_addh_h16_sat_ll>;
-
-// ALU64 / ALU / Compare.
-def HEXAGON_C2_cmpeqp:
-  qi_ALU64_didi                   <"cmp.eq",   int_hexagon_C2_cmpeqp>;
-def HEXAGON_C2_cmpgtp:
-  qi_ALU64_didi                   <"cmp.gt",   int_hexagon_C2_cmpgtp>;
-def HEXAGON_C2_cmpgtup:
-  qi_ALU64_didi                   <"cmp.gtu",  int_hexagon_C2_cmpgtup>;
-
-// ALU64 / ALU / Logical operations.
-def HEXAGON_A2_andp:
-  di_ALU64_didi                   <"and",      int_hexagon_A2_andp>;
-def HEXAGON_A2_orp:
-  di_ALU64_didi                   <"or",       int_hexagon_A2_orp>;
-def HEXAGON_A2_xorp:
-  di_ALU64_didi                   <"xor",      int_hexagon_A2_xorp>;
-
-// ALU64 / ALU / Maximum.
-def HEXAGON_A2_max:
-  si_ALU64_sisi                   <"max",      int_hexagon_A2_max>;
-def HEXAGON_A2_maxu:
-  si_ALU64_sisi                   <"maxu",     int_hexagon_A2_maxu>;
-
-// ALU64 / ALU / Minimum.
-def HEXAGON_A2_min:
-  si_ALU64_sisi                   <"min",      int_hexagon_A2_min>;
-def HEXAGON_A2_minu:
-  si_ALU64_sisi                   <"minu",     int_hexagon_A2_minu>;
-
-// ALU64 / ALU / Subtract.
-def HEXAGON_A2_subp:
-  di_ALU64_didi                   <"sub",      int_hexagon_A2_subp>;
-def HEXAGON_A2_subsat:
-  si_ALU64_sisi_sat               <"sub",      int_hexagon_A2_subsat>;
-
-// ALU64 / ALU / Subtract halfword.
-// Even though the definition says hl, it should be lh -
-//so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits.
-def HEXAGON_A2_subh_l16_hl:
-  si_ALU64_sisi_l16_lh            <"sub",      int_hexagon_A2_subh_l16_hl>;
-def HEXAGON_A2_subh_l16_ll:
-  si_ALU64_sisi_l16_ll            <"sub",      int_hexagon_A2_subh_l16_ll>;
-
-def HEXAGON_A2_subh_l16_sat_hl:
-  si_ALU64_sisi_l16_sat_lh        <"sub",      int_hexagon_A2_subh_l16_sat_hl>;
-def HEXAGON_A2_subh_l16_sat_ll:
-  si_ALU64_sisi_l16_sat_ll        <"sub",      int_hexagon_A2_subh_l16_sat_ll>;
-
-def HEXAGON_A2_subh_h16_hh:
-  si_ALU64_sisi_h16_hh            <"sub",      int_hexagon_A2_subh_h16_hh>;
-def HEXAGON_A2_subh_h16_hl:
-  si_ALU64_sisi_h16_hl            <"sub",      int_hexagon_A2_subh_h16_hl>;
-def HEXAGON_A2_subh_h16_lh:
-  si_ALU64_sisi_h16_lh            <"sub",      int_hexagon_A2_subh_h16_lh>;
-def HEXAGON_A2_subh_h16_ll:
-  si_ALU64_sisi_h16_ll            <"sub",      int_hexagon_A2_subh_h16_ll>;
-
-def HEXAGON_A2_subh_h16_sat_hh:
-  si_ALU64_sisi_h16_sat_hh        <"sub",      int_hexagon_A2_subh_h16_sat_hh>;
-def HEXAGON_A2_subh_h16_sat_hl:
-  si_ALU64_sisi_h16_sat_hl        <"sub",      int_hexagon_A2_subh_h16_sat_hl>;
-def HEXAGON_A2_subh_h16_sat_lh:
-  si_ALU64_sisi_h16_sat_lh        <"sub",      int_hexagon_A2_subh_h16_sat_lh>;
-def HEXAGON_A2_subh_h16_sat_ll:
-  si_ALU64_sisi_h16_sat_ll        <"sub",      int_hexagon_A2_subh_h16_sat_ll>;
-
-// ALU64 / ALU / Transfer register.
-def HEXAGON_A2_tfrp:
-  di_ALU64_di                     <"",         int_hexagon_A2_tfrp>;
+class T_D_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID (F64:$Rs)),
+        (MI (F64:$Rs))>;
 
-/********************************************************************
-*            ALU64/BIT                                              *
-*********************************************************************/
+class T_DI_pat <InstHexagon MI, Intrinsic IntID,
+                PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+  : Pat<(IntID F64:$Rs, ImmPred:$It),
+        (MI F64:$Rs, ImmPred:$It)>;
 
-// ALU64 / BIT / Masked parity.
-def HEXAGON_S2_parityp:
-  si_ALU64_didi                   <"parity",   int_hexagon_S2_parityp>;
+class T_F_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F32:$Rs),
+        (MI F32:$Rs)>;
 
-/********************************************************************
-*            ALU64/PERM                                             *
-*********************************************************************/
+class T_FI_pat <InstHexagon MI, Intrinsic IntID,
+                 PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+  : Pat<(IntID F32:$Rs, ImmPred:$It),
+        (MI F32:$Rs, ImmPred:$It)>;
 
-// ALU64 / PERM / Vector pack high and low halfwords.
-def HEXAGON_S2_packhl:
-  di_ALU64_sisi                   <"packhl",   int_hexagon_S2_packhl>;
+class T_FF_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F32:$Rs, F32:$Rt),
+        (MI F32:$Rs, F32:$Rt)>;
 
-/********************************************************************
-*            ALU64/VB                                               *
-*********************************************************************/
+class T_DD_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F64:$Rs, F64:$Rt),
+        (MI F64:$Rs, F64:$Rt)>;
 
-// ALU64 / VB / Vector add unsigned bytes.
-def HEXAGON_A2_vaddub:
-  di_ALU64_didi                   <"vaddub",   int_hexagon_A2_vaddub>;
-def HEXAGON_A2_vaddubs:
-  di_ALU64_didi_sat               <"vaddub",   int_hexagon_A2_vaddubs>;
-
-// ALU64 / VB / Vector average unsigned bytes.
-def HEXAGON_A2_vavgub:
-  di_ALU64_didi                   <"vavgub",   int_hexagon_A2_vavgub>;
-def HEXAGON_A2_vavgubr:
-  di_ALU64_didi_rnd               <"vavgub",   int_hexagon_A2_vavgubr>;
-
-// ALU64 / VB / Vector compare unsigned bytes.
-def HEXAGON_A2_vcmpbeq:
-  qi_ALU64_didi                   <"vcmpb.eq", int_hexagon_A2_vcmpbeq>;
-def HEXAGON_A2_vcmpbgtu:
-  qi_ALU64_didi                   <"vcmpb.gtu",int_hexagon_A2_vcmpbgtu>;
-
-// ALU64 / VB / Vector maximum/minimum unsigned bytes.
-def HEXAGON_A2_vmaxub:
-  di_ALU64_didi                   <"vmaxub",   int_hexagon_A2_vmaxub>;
-def HEXAGON_A2_vminub:
-  di_ALU64_didi                   <"vminub",   int_hexagon_A2_vminub>;
-
-// ALU64 / VB / Vector subtract unsigned bytes.
-def HEXAGON_A2_vsubub:
-  di_ALU64_didi                   <"vsubub",   int_hexagon_A2_vsubub>;
-def HEXAGON_A2_vsububs:
-  di_ALU64_didi_sat               <"vsubub",   int_hexagon_A2_vsububs>;
+class T_FFF_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F32:$Rs, F32:$Rt, F32:$Ru),
+        (MI F32:$Rs, F32:$Rt, F32:$Ru)>;
 
-// ALU64 / VB / Vector mux.
-def HEXAGON_C2_vmux:
-  di_ALU64_qididi                 <"vmux",     int_hexagon_C2_vmux>;
+class T_FFFQ_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID F32:$Rs, F32:$Rt, F32:$Ru, (i32 PredRegs:$Rx)),
+         (MI F32:$Rs, F32:$Rt, F32:$Ru, PredRegs:$Rx)>;
 
+//===----------------------------------------------------------------------===//
+// MPYS / Multipy signed/unsigned halfwords
+//Rd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
+//===----------------------------------------------------------------------===//
 
-/********************************************************************
-*            ALU64/VH                                               *
-*********************************************************************/
+def : T_RR_pat <M2_mpy_ll_s1, int_hexagon_M2_mpy_ll_s1>;
+def : T_RR_pat <M2_mpy_ll_s0, int_hexagon_M2_mpy_ll_s0>;
+def : T_RR_pat <M2_mpy_lh_s1, int_hexagon_M2_mpy_lh_s1>;
+def : T_RR_pat <M2_mpy_lh_s0, int_hexagon_M2_mpy_lh_s0>;
+def : T_RR_pat <M2_mpy_hl_s1, int_hexagon_M2_mpy_hl_s1>;
+def : T_RR_pat <M2_mpy_hl_s0, int_hexagon_M2_mpy_hl_s0>;
+def : T_RR_pat <M2_mpy_hh_s1, int_hexagon_M2_mpy_hh_s1>;
+def : T_RR_pat <M2_mpy_hh_s0, int_hexagon_M2_mpy_hh_s0>;
+
+def : T_RR_pat <M2_mpyu_ll_s1, int_hexagon_M2_mpyu_ll_s1>;
+def : T_RR_pat <M2_mpyu_ll_s0, int_hexagon_M2_mpyu_ll_s0>;
+def : T_RR_pat <M2_mpyu_lh_s1, int_hexagon_M2_mpyu_lh_s1>;
+def : T_RR_pat <M2_mpyu_lh_s0, int_hexagon_M2_mpyu_lh_s0>;
+def : T_RR_pat <M2_mpyu_hl_s1, int_hexagon_M2_mpyu_hl_s1>;
+def : T_RR_pat <M2_mpyu_hl_s0, int_hexagon_M2_mpyu_hl_s0>;
+def : T_RR_pat <M2_mpyu_hh_s1, int_hexagon_M2_mpyu_hh_s1>;
+def : T_RR_pat <M2_mpyu_hh_s0, int_hexagon_M2_mpyu_hh_s0>;
+
+def : T_RR_pat <M2_mpy_sat_ll_s1, int_hexagon_M2_mpy_sat_ll_s1>;
+def : T_RR_pat <M2_mpy_sat_ll_s0, int_hexagon_M2_mpy_sat_ll_s0>;
+def : T_RR_pat <M2_mpy_sat_lh_s1, int_hexagon_M2_mpy_sat_lh_s1>;
+def : T_RR_pat <M2_mpy_sat_lh_s0, int_hexagon_M2_mpy_sat_lh_s0>;
+def : T_RR_pat <M2_mpy_sat_hl_s1, int_hexagon_M2_mpy_sat_hl_s1>;
+def : T_RR_pat <M2_mpy_sat_hl_s0, int_hexagon_M2_mpy_sat_hl_s0>;
+def : T_RR_pat <M2_mpy_sat_hh_s1, int_hexagon_M2_mpy_sat_hh_s1>;
+def : T_RR_pat <M2_mpy_sat_hh_s0, int_hexagon_M2_mpy_sat_hh_s0>;
+
+def : T_RR_pat <M2_mpy_rnd_ll_s1, int_hexagon_M2_mpy_rnd_ll_s1>;
+def : T_RR_pat <M2_mpy_rnd_ll_s0, int_hexagon_M2_mpy_rnd_ll_s0>;
+def : T_RR_pat <M2_mpy_rnd_lh_s1, int_hexagon_M2_mpy_rnd_lh_s1>;
+def : T_RR_pat <M2_mpy_rnd_lh_s0, int_hexagon_M2_mpy_rnd_lh_s0>;
+def : T_RR_pat <M2_mpy_rnd_hl_s1, int_hexagon_M2_mpy_rnd_hl_s1>;
+def : T_RR_pat <M2_mpy_rnd_hl_s0, int_hexagon_M2_mpy_rnd_hl_s0>;
+def : T_RR_pat <M2_mpy_rnd_hh_s1, int_hexagon_M2_mpy_rnd_hh_s1>;
+def : T_RR_pat <M2_mpy_rnd_hh_s0, int_hexagon_M2_mpy_rnd_hh_s0>;
+
+def : T_RR_pat <M2_mpy_sat_rnd_ll_s1, int_hexagon_M2_mpy_sat_rnd_ll_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_ll_s0, int_hexagon_M2_mpy_sat_rnd_ll_s0>;
+def : T_RR_pat <M2_mpy_sat_rnd_lh_s1, int_hexagon_M2_mpy_sat_rnd_lh_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_lh_s0, int_hexagon_M2_mpy_sat_rnd_lh_s0>;
+def : T_RR_pat <M2_mpy_sat_rnd_hl_s1, int_hexagon_M2_mpy_sat_rnd_hl_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_hl_s0, int_hexagon_M2_mpy_sat_rnd_hl_s0>;
+def : T_RR_pat <M2_mpy_sat_rnd_hh_s1, int_hexagon_M2_mpy_sat_rnd_hh_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_hh_s0, int_hexagon_M2_mpy_sat_rnd_hh_s0>;
 
-// ALU64 / VH / Vector add halfwords.
-// Rdd64=vadd[u]h(Rss64,Rtt64:sat]
-def HEXAGON_A2_vaddh:
-  di_ALU64_didi                   <"vaddh",    int_hexagon_A2_vaddh>;
-def HEXAGON_A2_vaddhs:
-  di_ALU64_didi_sat               <"vaddh",    int_hexagon_A2_vaddhs>;
-def HEXAGON_A2_vadduhs:
-  di_ALU64_didi_sat               <"vadduh",   int_hexagon_A2_vadduhs>;
-
-// ALU64 / VH / Vector average halfwords.
-// Rdd64=v[n]avg[u]h(Rss64,Rtt64:rnd/:crnd][:sat]
-def HEXAGON_A2_vavgh:
-  di_ALU64_didi                   <"vavgh",    int_hexagon_A2_vavgh>;
-def HEXAGON_A2_vavghcr:
-  di_ALU64_didi_crnd              <"vavgh",    int_hexagon_A2_vavghcr>;
-def HEXAGON_A2_vavghr:
-  di_ALU64_didi_rnd               <"vavgh",    int_hexagon_A2_vavghr>;
-def HEXAGON_A2_vavguh:
-  di_ALU64_didi                   <"vavguh",   int_hexagon_A2_vavguh>;
-def HEXAGON_A2_vavguhr:
-  di_ALU64_didi_rnd               <"vavguh",   int_hexagon_A2_vavguhr>;
-def HEXAGON_A2_vnavgh:
-  di_ALU64_didi                   <"vnavgh",   int_hexagon_A2_vnavgh>;
-def HEXAGON_A2_vnavghcr:
-  di_ALU64_didi_crnd_sat          <"vnavgh",   int_hexagon_A2_vnavghcr>;
-def HEXAGON_A2_vnavghr:
-  di_ALU64_didi_rnd_sat           <"vnavgh",   int_hexagon_A2_vnavghr>;
-
-// ALU64 / VH / Vector compare halfwords.
-def HEXAGON_A2_vcmpheq:
-  qi_ALU64_didi                   <"vcmph.eq", int_hexagon_A2_vcmpheq>;
-def HEXAGON_A2_vcmphgt:
-  qi_ALU64_didi                   <"vcmph.gt", int_hexagon_A2_vcmphgt>;
-def HEXAGON_A2_vcmphgtu:
-  qi_ALU64_didi                   <"vcmph.gtu",int_hexagon_A2_vcmphgtu>;
-
-// ALU64 / VH / Vector maximum halfwords.
-def HEXAGON_A2_vmaxh:
-  di_ALU64_didi                   <"vmaxh",    int_hexagon_A2_vmaxh>;
-def HEXAGON_A2_vmaxuh:
-  di_ALU64_didi                   <"vmaxuh",   int_hexagon_A2_vmaxuh>;
-
-// ALU64 / VH / Vector minimum halfwords.
-def HEXAGON_A2_vminh:
-  di_ALU64_didi                   <"vminh",    int_hexagon_A2_vminh>;
-def HEXAGON_A2_vminuh:
-  di_ALU64_didi                   <"vminuh",   int_hexagon_A2_vminuh>;
-
-// ALU64 / VH / Vector subtract halfwords.
-def HEXAGON_A2_vsubh:
-  di_ALU64_didi                   <"vsubh",    int_hexagon_A2_vsubh>;
-def HEXAGON_A2_vsubhs:
-  di_ALU64_didi_sat               <"vsubh",    int_hexagon_A2_vsubhs>;
-def HEXAGON_A2_vsubuhs:
-  di_ALU64_didi_sat               <"vsubuh",   int_hexagon_A2_vsubuhs>;
 
+//===----------------------------------------------------------------------===//
+// MPYS / Multipy signed/unsigned halfwords and add/subtract the
+// result from the accumulator.
+//Rx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//===----------------------------------------------------------------------===//
 
-/********************************************************************
-*            ALU64/VW                                               *
-*********************************************************************/
+def : T_RRR_pat <M2_mpy_acc_ll_s1, int_hexagon_M2_mpy_acc_ll_s1>;
+def : T_RRR_pat <M2_mpy_acc_ll_s0, int_hexagon_M2_mpy_acc_ll_s0>;
+def : T_RRR_pat <M2_mpy_acc_lh_s1, int_hexagon_M2_mpy_acc_lh_s1>;
+def : T_RRR_pat <M2_mpy_acc_lh_s0, int_hexagon_M2_mpy_acc_lh_s0>;
+def : T_RRR_pat <M2_mpy_acc_hl_s1, int_hexagon_M2_mpy_acc_hl_s1>;
+def : T_RRR_pat <M2_mpy_acc_hl_s0, int_hexagon_M2_mpy_acc_hl_s0>;
+def : T_RRR_pat <M2_mpy_acc_hh_s1, int_hexagon_M2_mpy_acc_hh_s1>;
+def : T_RRR_pat <M2_mpy_acc_hh_s0, int_hexagon_M2_mpy_acc_hh_s0>;
+
+def : T_RRR_pat <M2_mpyu_acc_ll_s1, int_hexagon_M2_mpyu_acc_ll_s1>;
+def : T_RRR_pat <M2_mpyu_acc_ll_s0, int_hexagon_M2_mpyu_acc_ll_s0>;
+def : T_RRR_pat <M2_mpyu_acc_lh_s1, int_hexagon_M2_mpyu_acc_lh_s1>;
+def : T_RRR_pat <M2_mpyu_acc_lh_s0, int_hexagon_M2_mpyu_acc_lh_s0>;
+def : T_RRR_pat <M2_mpyu_acc_hl_s1, int_hexagon_M2_mpyu_acc_hl_s1>;
+def : T_RRR_pat <M2_mpyu_acc_hl_s0, int_hexagon_M2_mpyu_acc_hl_s0>;
+def : T_RRR_pat <M2_mpyu_acc_hh_s1, int_hexagon_M2_mpyu_acc_hh_s1>;
+def : T_RRR_pat <M2_mpyu_acc_hh_s0, int_hexagon_M2_mpyu_acc_hh_s0>;
+
+def : T_RRR_pat <M2_mpy_nac_ll_s1, int_hexagon_M2_mpy_nac_ll_s1>;
+def : T_RRR_pat <M2_mpy_nac_ll_s0, int_hexagon_M2_mpy_nac_ll_s0>;
+def : T_RRR_pat <M2_mpy_nac_lh_s1, int_hexagon_M2_mpy_nac_lh_s1>;
+def : T_RRR_pat <M2_mpy_nac_lh_s0, int_hexagon_M2_mpy_nac_lh_s0>;
+def : T_RRR_pat <M2_mpy_nac_hl_s1, int_hexagon_M2_mpy_nac_hl_s1>;
+def : T_RRR_pat <M2_mpy_nac_hl_s0, int_hexagon_M2_mpy_nac_hl_s0>;
+def : T_RRR_pat <M2_mpy_nac_hh_s1, int_hexagon_M2_mpy_nac_hh_s1>;
+def : T_RRR_pat <M2_mpy_nac_hh_s0, int_hexagon_M2_mpy_nac_hh_s0>;
+
+def : T_RRR_pat <M2_mpyu_nac_ll_s1, int_hexagon_M2_mpyu_nac_ll_s1>;
+def : T_RRR_pat <M2_mpyu_nac_ll_s0, int_hexagon_M2_mpyu_nac_ll_s0>;
+def : T_RRR_pat <M2_mpyu_nac_lh_s1, int_hexagon_M2_mpyu_nac_lh_s1>;
+def : T_RRR_pat <M2_mpyu_nac_lh_s0, int_hexagon_M2_mpyu_nac_lh_s0>;
+def : T_RRR_pat <M2_mpyu_nac_hl_s1, int_hexagon_M2_mpyu_nac_hl_s1>;
+def : T_RRR_pat <M2_mpyu_nac_hl_s0, int_hexagon_M2_mpyu_nac_hl_s0>;
+def : T_RRR_pat <M2_mpyu_nac_hh_s1, int_hexagon_M2_mpyu_nac_hh_s1>;
+def : T_RRR_pat <M2_mpyu_nac_hh_s0, int_hexagon_M2_mpyu_nac_hh_s0>;
+
+def : T_RRR_pat <M2_mpy_acc_sat_ll_s1, int_hexagon_M2_mpy_acc_sat_ll_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_ll_s0, int_hexagon_M2_mpy_acc_sat_ll_s0>;
+def : T_RRR_pat <M2_mpy_acc_sat_lh_s1, int_hexagon_M2_mpy_acc_sat_lh_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_lh_s0, int_hexagon_M2_mpy_acc_sat_lh_s0>;
+def : T_RRR_pat <M2_mpy_acc_sat_hl_s1, int_hexagon_M2_mpy_acc_sat_hl_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_hl_s0, int_hexagon_M2_mpy_acc_sat_hl_s0>;
+def : T_RRR_pat <M2_mpy_acc_sat_hh_s1, int_hexagon_M2_mpy_acc_sat_hh_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_hh_s0, int_hexagon_M2_mpy_acc_sat_hh_s0>;
+
+def : T_RRR_pat <M2_mpy_nac_sat_ll_s1, int_hexagon_M2_mpy_nac_sat_ll_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_ll_s0, int_hexagon_M2_mpy_nac_sat_ll_s0>;
+def : T_RRR_pat <M2_mpy_nac_sat_lh_s1, int_hexagon_M2_mpy_nac_sat_lh_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_lh_s0, int_hexagon_M2_mpy_nac_sat_lh_s0>;
+def : T_RRR_pat <M2_mpy_nac_sat_hl_s1, int_hexagon_M2_mpy_nac_sat_hl_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_hl_s0, int_hexagon_M2_mpy_nac_sat_hl_s0>;
+def : T_RRR_pat <M2_mpy_nac_sat_hh_s1, int_hexagon_M2_mpy_nac_sat_hh_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_hh_s0, int_hexagon_M2_mpy_nac_sat_hh_s0>;
+
+
+//===----------------------------------------------------------------------===//
+// Multiply signed/unsigned halfwords with and without saturation and rounding
+// into a 64-bits destination register.
+//===----------------------------------------------------------------------===//
+
+def : T_RR_pat <M2_mpyd_hh_s0, int_hexagon_M2_mpyd_hh_s0>;
+def : T_RR_pat <M2_mpyd_hl_s0, int_hexagon_M2_mpyd_hl_s0>;
+def : T_RR_pat <M2_mpyd_lh_s0, int_hexagon_M2_mpyd_lh_s0>;
+def : T_RR_pat <M2_mpyd_ll_s0, int_hexagon_M2_mpyd_ll_s0>;
+def : T_RR_pat <M2_mpyd_hh_s1, int_hexagon_M2_mpyd_hh_s1>;
+def : T_RR_pat <M2_mpyd_hl_s1, int_hexagon_M2_mpyd_hl_s1>;
+def : T_RR_pat <M2_mpyd_lh_s1, int_hexagon_M2_mpyd_lh_s1>;
+def : T_RR_pat <M2_mpyd_ll_s1, int_hexagon_M2_mpyd_ll_s1>;
+
+def : T_RR_pat <M2_mpyd_rnd_hh_s0, int_hexagon_M2_mpyd_rnd_hh_s0>;
+def : T_RR_pat <M2_mpyd_rnd_hl_s0, int_hexagon_M2_mpyd_rnd_hl_s0>;
+def : T_RR_pat <M2_mpyd_rnd_lh_s0, int_hexagon_M2_mpyd_rnd_lh_s0>;
+def : T_RR_pat <M2_mpyd_rnd_ll_s0, int_hexagon_M2_mpyd_rnd_ll_s0>;
+def : T_RR_pat <M2_mpyd_rnd_hh_s1, int_hexagon_M2_mpyd_rnd_hh_s1>;
+def : T_RR_pat <M2_mpyd_rnd_hl_s1, int_hexagon_M2_mpyd_rnd_hl_s1>;
+def : T_RR_pat <M2_mpyd_rnd_lh_s1, int_hexagon_M2_mpyd_rnd_lh_s1>;
+def : T_RR_pat <M2_mpyd_rnd_ll_s1, int_hexagon_M2_mpyd_rnd_ll_s1>;
+
+def : T_RR_pat <M2_mpyud_hh_s0, int_hexagon_M2_mpyud_hh_s0>;
+def : T_RR_pat <M2_mpyud_hl_s0, int_hexagon_M2_mpyud_hl_s0>;
+def : T_RR_pat <M2_mpyud_lh_s0, int_hexagon_M2_mpyud_lh_s0>;
+def : T_RR_pat <M2_mpyud_ll_s0, int_hexagon_M2_mpyud_ll_s0>;
+def : T_RR_pat <M2_mpyud_hh_s1, int_hexagon_M2_mpyud_hh_s1>;
+def : T_RR_pat <M2_mpyud_hl_s1, int_hexagon_M2_mpyud_hl_s1>;
+def : T_RR_pat <M2_mpyud_lh_s1, int_hexagon_M2_mpyud_lh_s1>;
+def : T_RR_pat <M2_mpyud_ll_s1, int_hexagon_M2_mpyud_ll_s1>;
+
+//===----------------------------------------------------------------------===//
+// MPYS / Multipy signed/unsigned halfwords and add/subtract the
+// result from the 64-bit destination register.
+//Rxx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//===----------------------------------------------------------------------===//
+
+def : T_PRR_pat <M2_mpyd_acc_hh_s0, int_hexagon_M2_mpyd_acc_hh_s0>;
+def : T_PRR_pat <M2_mpyd_acc_hl_s0, int_hexagon_M2_mpyd_acc_hl_s0>;
+def : T_PRR_pat <M2_mpyd_acc_lh_s0, int_hexagon_M2_mpyd_acc_lh_s0>;
+def : T_PRR_pat <M2_mpyd_acc_ll_s0, int_hexagon_M2_mpyd_acc_ll_s0>;
+
+def : T_PRR_pat <M2_mpyd_acc_hh_s1, int_hexagon_M2_mpyd_acc_hh_s1>;
+def : T_PRR_pat <M2_mpyd_acc_hl_s1, int_hexagon_M2_mpyd_acc_hl_s1>;
+def : T_PRR_pat <M2_mpyd_acc_lh_s1, int_hexagon_M2_mpyd_acc_lh_s1>;
+def : T_PRR_pat <M2_mpyd_acc_ll_s1, int_hexagon_M2_mpyd_acc_ll_s1>;
+
+def : T_PRR_pat <M2_mpyd_nac_hh_s0, int_hexagon_M2_mpyd_nac_hh_s0>;
+def : T_PRR_pat <M2_mpyd_nac_hl_s0, int_hexagon_M2_mpyd_nac_hl_s0>;
+def : T_PRR_pat <M2_mpyd_nac_lh_s0, int_hexagon_M2_mpyd_nac_lh_s0>;
+def : T_PRR_pat <M2_mpyd_nac_ll_s0, int_hexagon_M2_mpyd_nac_ll_s0>;
+
+def : T_PRR_pat <M2_mpyd_nac_hh_s1, int_hexagon_M2_mpyd_nac_hh_s1>;
+def : T_PRR_pat <M2_mpyd_nac_hl_s1, int_hexagon_M2_mpyd_nac_hl_s1>;
+def : T_PRR_pat <M2_mpyd_nac_lh_s1, int_hexagon_M2_mpyd_nac_lh_s1>;
+def : T_PRR_pat <M2_mpyd_nac_ll_s1, int_hexagon_M2_mpyd_nac_ll_s1>;
+
+def : T_PRR_pat <M2_mpyud_acc_hh_s0, int_hexagon_M2_mpyud_acc_hh_s0>;
+def : T_PRR_pat <M2_mpyud_acc_hl_s0, int_hexagon_M2_mpyud_acc_hl_s0>;
+def : T_PRR_pat <M2_mpyud_acc_lh_s0, int_hexagon_M2_mpyud_acc_lh_s0>;
+def : T_PRR_pat <M2_mpyud_acc_ll_s0, int_hexagon_M2_mpyud_acc_ll_s0>;
+
+def : T_PRR_pat <M2_mpyud_acc_hh_s1, int_hexagon_M2_mpyud_acc_hh_s1>;
+def : T_PRR_pat <M2_mpyud_acc_hl_s1, int_hexagon_M2_mpyud_acc_hl_s1>;
+def : T_PRR_pat <M2_mpyud_acc_lh_s1, int_hexagon_M2_mpyud_acc_lh_s1>;
+def : T_PRR_pat <M2_mpyud_acc_ll_s1, int_hexagon_M2_mpyud_acc_ll_s1>;
+
+def : T_PRR_pat <M2_mpyud_nac_hh_s0, int_hexagon_M2_mpyud_nac_hh_s0>;
+def : T_PRR_pat <M2_mpyud_nac_hl_s0, int_hexagon_M2_mpyud_nac_hl_s0>;
+def : T_PRR_pat <M2_mpyud_nac_lh_s0, int_hexagon_M2_mpyud_nac_lh_s0>;
+def : T_PRR_pat <M2_mpyud_nac_ll_s0, int_hexagon_M2_mpyud_nac_ll_s0>;
+
+def : T_PRR_pat <M2_mpyud_nac_hh_s1, int_hexagon_M2_mpyud_nac_hh_s1>;
+def : T_PRR_pat <M2_mpyud_nac_hl_s1, int_hexagon_M2_mpyud_nac_hl_s1>;
+def : T_PRR_pat <M2_mpyud_nac_lh_s1, int_hexagon_M2_mpyud_nac_lh_s1>;
+def : T_PRR_pat <M2_mpyud_nac_ll_s1, int_hexagon_M2_mpyud_nac_ll_s1>;
+
+// Vector complex multiply imaginary: Rdd=vcmpyi(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vcmpy_s1_sat_i, int_hexagon_M2_vcmpy_s1_sat_i>;
+def : T_PP_pat <M2_vcmpy_s0_sat_i, int_hexagon_M2_vcmpy_s0_sat_i>;
+
+// Vector complex multiply real: Rdd=vcmpyr(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vcmpy_s1_sat_r, int_hexagon_M2_vcmpy_s1_sat_r>;
+def : T_PP_pat <M2_vcmpy_s0_sat_r, int_hexagon_M2_vcmpy_s0_sat_r>;
+
+// Vector dual multiply: Rdd=vdmpy(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vdmpys_s1, int_hexagon_M2_vdmpys_s1>;
+def : T_PP_pat <M2_vdmpys_s0, int_hexagon_M2_vdmpys_s0>;
+
+// Vector multiply even halfwords: Rdd=vmpyeh(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vmpy2es_s1, int_hexagon_M2_vmpy2es_s1>;
+def : T_PP_pat <M2_vmpy2es_s0, int_hexagon_M2_vmpy2es_s0>;
+
+//Rdd=vmpywoh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyh_s0,  int_hexagon_M2_mmpyh_s0>;
+def : T_PP_pat <M2_mmpyh_s1,  int_hexagon_M2_mmpyh_s1>;
+def : T_PP_pat <M2_mmpyh_rs0, int_hexagon_M2_mmpyh_rs0>;
+def : T_PP_pat <M2_mmpyh_rs1, int_hexagon_M2_mmpyh_rs1>;
+
+//Rdd=vmpyweh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyl_s0,  int_hexagon_M2_mmpyl_s0>;
+def : T_PP_pat <M2_mmpyl_s1,  int_hexagon_M2_mmpyl_s1>;
+def : T_PP_pat <M2_mmpyl_rs0, int_hexagon_M2_mmpyl_rs0>;
+def : T_PP_pat <M2_mmpyl_rs1, int_hexagon_M2_mmpyl_rs1>;
+
+//Rdd=vmpywouh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyuh_s0,  int_hexagon_M2_mmpyuh_s0>;
+def : T_PP_pat <M2_mmpyuh_s1,  int_hexagon_M2_mmpyuh_s1>;
+def : T_PP_pat <M2_mmpyuh_rs0, int_hexagon_M2_mmpyuh_rs0>;
+def : T_PP_pat <M2_mmpyuh_rs1, int_hexagon_M2_mmpyuh_rs1>;
+
+//Rdd=vmpyweuh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyul_s0,  int_hexagon_M2_mmpyul_s0>;
+def : T_PP_pat <M2_mmpyul_s1,  int_hexagon_M2_mmpyul_s1>;
+def : T_PP_pat <M2_mmpyul_rs0, int_hexagon_M2_mmpyul_rs0>;
+def : T_PP_pat <M2_mmpyul_rs1, int_hexagon_M2_mmpyul_rs1>;
+
+// Vector reduce add unsigned bytes: Rdd32[+]=vrmpybu(Rss32,Rtt32)
+def : T_PP_pat  <A2_vraddub,     int_hexagon_A2_vraddub>;
+def : T_PPP_pat <A2_vraddub_acc, int_hexagon_A2_vraddub_acc>;
+
+// Vector sum of absolute differences unsigned bytes: Rdd=vrsadub(Rss,Rtt)
+def : T_PP_pat  <A2_vrsadub,     int_hexagon_A2_vrsadub>;
+def : T_PPP_pat <A2_vrsadub_acc, int_hexagon_A2_vrsadub_acc>;
+
+// Vector absolute difference: Rdd=vabsdiffh(Rtt,Rss)
+def : T_PP_pat <M2_vabsdiffh, int_hexagon_M2_vabsdiffh>;
+
+// Vector absolute difference words: Rdd=vabsdiffw(Rtt,Rss)
+def : T_PP_pat <M2_vabsdiffw, int_hexagon_M2_vabsdiffw>;
+
+// Vector reduce complex multiply real or imaginary:
+// Rdd[+]=vrcmpy[ir](Rss,Rtt[*])
+def : T_PP_pat  <M2_vrcmpyi_s0,  int_hexagon_M2_vrcmpyi_s0>;
+def : T_PP_pat  <M2_vrcmpyi_s0c, int_hexagon_M2_vrcmpyi_s0c>;
+def : T_PPP_pat <M2_vrcmaci_s0,  int_hexagon_M2_vrcmaci_s0>;
+def : T_PPP_pat <M2_vrcmaci_s0c, int_hexagon_M2_vrcmaci_s0c>;
+
+def : T_PP_pat  <M2_vrcmpyr_s0,  int_hexagon_M2_vrcmpyr_s0>;
+def : T_PP_pat  <M2_vrcmpyr_s0c, int_hexagon_M2_vrcmpyr_s0c>;
+def : T_PPP_pat <M2_vrcmacr_s0,  int_hexagon_M2_vrcmacr_s0>;
+def : T_PPP_pat <M2_vrcmacr_s0c, int_hexagon_M2_vrcmacr_s0c>;
+
+// Vector reduce halfwords
+// Rdd[+]=vrmpyh(Rss,Rtt)
+def : T_PP_pat  <M2_vrmpy_s0, int_hexagon_M2_vrmpy_s0>;
+def : T_PPP_pat <M2_vrmac_s0, int_hexagon_M2_vrmac_s0>;
+
+//===----------------------------------------------------------------------===//
+// Vector Multipy with accumulation
+//===----------------------------------------------------------------------===//
 
-// ALU64 / VW / Vector add words.
-// Rdd32=vaddw(Rss32,Rtt32)[:sat]
-def HEXAGON_A2_vaddw:
-  di_ALU64_didi                   <"vaddw",    int_hexagon_A2_vaddw>;
-def HEXAGON_A2_vaddws:
-  di_ALU64_didi_sat               <"vaddw",   int_hexagon_A2_vaddws>;
-
-// ALU64 / VW / Vector average words.
-def HEXAGON_A2_vavguw:
-  di_ALU64_didi                   <"vavguw",   int_hexagon_A2_vavguw>;
-def HEXAGON_A2_vavguwr:
-  di_ALU64_didi_rnd               <"vavguw",   int_hexagon_A2_vavguwr>;
-def HEXAGON_A2_vavgw:
-  di_ALU64_didi                   <"vavgw",    int_hexagon_A2_vavgw>;
-def HEXAGON_A2_vavgwcr:
-  di_ALU64_didi_crnd              <"vavgw",    int_hexagon_A2_vavgwcr>;
-def HEXAGON_A2_vavgwr:
-  di_ALU64_didi_rnd               <"vavgw",    int_hexagon_A2_vavgwr>;
-def HEXAGON_A2_vnavgw:
-  di_ALU64_didi                   <"vnavgw",   int_hexagon_A2_vnavgw>;
-def HEXAGON_A2_vnavgwcr:
-  di_ALU64_didi_crnd_sat          <"vnavgw",   int_hexagon_A2_vnavgwcr>;
-def HEXAGON_A2_vnavgwr:
-  di_ALU64_didi_rnd_sat           <"vnavgw",   int_hexagon_A2_vnavgwr>;
-
-// ALU64 / VW / Vector compare words.
-def HEXAGON_A2_vcmpweq:
-  qi_ALU64_didi                   <"vcmpw.eq", int_hexagon_A2_vcmpweq>;
-def HEXAGON_A2_vcmpwgt:
-  qi_ALU64_didi                   <"vcmpw.gt", int_hexagon_A2_vcmpwgt>;
-def HEXAGON_A2_vcmpwgtu:
-  qi_ALU64_didi                   <"vcmpw.gtu",int_hexagon_A2_vcmpwgtu>;
-
-// ALU64 / VW / Vector maximum words.
-def HEXAGON_A2_vmaxw:
-  di_ALU64_didi                   <"vmaxw",    int_hexagon_A2_vmaxw>;
-def HEXAGON_A2_vmaxuw:
-  di_ALU64_didi                   <"vmaxuw",   int_hexagon_A2_vmaxuw>;
-
-// ALU64 / VW / Vector minimum words.
-def HEXAGON_A2_vminw:
-  di_ALU64_didi                   <"vminw",    int_hexagon_A2_vminw>;
-def HEXAGON_A2_vminuw:
-  di_ALU64_didi                   <"vminuw",   int_hexagon_A2_vminuw>;
-
-// ALU64 / VW / Vector subtract words.
-def HEXAGON_A2_vsubw:
-  di_ALU64_didi                   <"vsubw",    int_hexagon_A2_vsubw>;
-def HEXAGON_A2_vsubws:
-  di_ALU64_didi_sat               <"vsubw",    int_hexagon_A2_vsubws>;
+// Vector multiply word by signed half with accumulation
+// Rxx+=vmpyw[eo]h(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PPP_pat <M2_mmacls_s1, int_hexagon_M2_mmacls_s1>;
+def : T_PPP_pat <M2_mmacls_s0, int_hexagon_M2_mmacls_s0>;
+def : T_PPP_pat <M2_mmacls_rs1, int_hexagon_M2_mmacls_rs1>;
+def : T_PPP_pat <M2_mmacls_rs0, int_hexagon_M2_mmacls_rs0>;
+def : T_PPP_pat <M2_mmachs_s1, int_hexagon_M2_mmachs_s1>;
+def : T_PPP_pat <M2_mmachs_s0, int_hexagon_M2_mmachs_s0>;
+def : T_PPP_pat <M2_mmachs_rs1, int_hexagon_M2_mmachs_rs1>;
+def : T_PPP_pat <M2_mmachs_rs0, int_hexagon_M2_mmachs_rs0>;
+
+// Vector multiply word by unsigned half with accumulation
+// Rxx+=vmpyw[eo]uh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PPP_pat <M2_mmaculs_s1, int_hexagon_M2_mmaculs_s1>;
+def : T_PPP_pat <M2_mmaculs_s0, int_hexagon_M2_mmaculs_s0>;
+def : T_PPP_pat <M2_mmaculs_rs1, int_hexagon_M2_mmaculs_rs1>;
+def : T_PPP_pat <M2_mmaculs_rs0, int_hexagon_M2_mmaculs_rs0>;
+def : T_PPP_pat <M2_mmacuhs_s1, int_hexagon_M2_mmacuhs_s1>;
+def : T_PPP_pat <M2_mmacuhs_s0, int_hexagon_M2_mmacuhs_s0>;
+def : T_PPP_pat <M2_mmacuhs_rs1, int_hexagon_M2_mmacuhs_rs1>;
+def : T_PPP_pat <M2_mmacuhs_rs0, int_hexagon_M2_mmacuhs_rs0>;
+
+// Vector multiply even halfwords with accumulation
+// Rxx+=vmpyeh(Rss,Rtt)[:<<1][:sat]
+def : T_PPP_pat <M2_vmac2es, int_hexagon_M2_vmac2es>;
+def : T_PPP_pat <M2_vmac2es_s1, int_hexagon_M2_vmac2es_s1>;
+def : T_PPP_pat <M2_vmac2es_s0, int_hexagon_M2_vmac2es_s0>;
+
+// Vector dual multiply with accumulation
+// Rxx+=vdmpy(Rss,Rtt)[:sat]
+def : T_PPP_pat <M2_vdmacs_s1, int_hexagon_M2_vdmacs_s1>;
+def : T_PPP_pat <M2_vdmacs_s0, int_hexagon_M2_vdmacs_s0>;
+
+// Vector complex multiply real or imaginary with accumulation
+// Rxx+=vcmpy[ir](Rss,Rtt):sat
+def : T_PPP_pat <M2_vcmac_s0_sat_r, int_hexagon_M2_vcmac_s0_sat_r>;
+def : T_PPP_pat <M2_vcmac_s0_sat_i, int_hexagon_M2_vcmac_s0_sat_i>;
 
+//===----------------------------------------------------------------------===//
+// Add/Subtract halfword
+// Rd=add(Rt.L,Rs.[HL])[:sat]
+// Rd=sub(Rt.L,Rs.[HL])[:sat]
+// Rd=add(Rt.[LH],Rs.[HL])[:sat][:<16]
+// Rd=sub(Rt.[LH],Rs.[HL])[:sat][:<16]
+//===----------------------------------------------------------------------===//
+
+//Rd=add(Rt.L,Rs.[LH])
+def : T_RR_pat <A2_addh_l16_ll,     int_hexagon_A2_addh_l16_ll>;
+def : T_RR_pat <A2_addh_l16_hl,     int_hexagon_A2_addh_l16_hl>;
+
+//Rd=add(Rt.L,Rs.[LH]):sat
+def : T_RR_pat <A2_addh_l16_sat_ll, int_hexagon_A2_addh_l16_sat_ll>;
+def : T_RR_pat <A2_addh_l16_sat_hl, int_hexagon_A2_addh_l16_sat_hl>;
+
+//Rd=sub(Rt.L,Rs.[LH])
+def : T_RR_pat <A2_subh_l16_ll,     int_hexagon_A2_subh_l16_ll>;
+def : T_RR_pat <A2_subh_l16_hl,     int_hexagon_A2_subh_l16_hl>;
+
+//Rd=sub(Rt.L,Rs.[LH]):sat
+def : T_RR_pat <A2_subh_l16_sat_ll, int_hexagon_A2_subh_l16_sat_ll>;
+def : T_RR_pat <A2_subh_l16_sat_hl, int_hexagon_A2_subh_l16_sat_hl>;
+
+//Rd=add(Rt.[LH],Rs.[LH]):<<16
+def : T_RR_pat <A2_addh_h16_ll,     int_hexagon_A2_addh_h16_ll>;
+def : T_RR_pat <A2_addh_h16_lh,     int_hexagon_A2_addh_h16_lh>;
+def : T_RR_pat <A2_addh_h16_hl,     int_hexagon_A2_addh_h16_hl>;
+def : T_RR_pat <A2_addh_h16_hh,     int_hexagon_A2_addh_h16_hh>;
+
+//Rd=sub(Rt.[LH],Rs.[LH]):<<16
+def : T_RR_pat <A2_subh_h16_ll,     int_hexagon_A2_subh_h16_ll>;
+def : T_RR_pat <A2_subh_h16_lh,     int_hexagon_A2_subh_h16_lh>;
+def : T_RR_pat <A2_subh_h16_hl,     int_hexagon_A2_subh_h16_hl>;
+def : T_RR_pat <A2_subh_h16_hh,     int_hexagon_A2_subh_h16_hh>;
+
+//Rd=add(Rt.[LH],Rs.[LH]):sat:<<16
+def : T_RR_pat <A2_addh_h16_sat_ll, int_hexagon_A2_addh_h16_sat_ll>;
+def : T_RR_pat <A2_addh_h16_sat_lh, int_hexagon_A2_addh_h16_sat_lh>;
+def : T_RR_pat <A2_addh_h16_sat_hl, int_hexagon_A2_addh_h16_sat_hl>;
+def : T_RR_pat <A2_addh_h16_sat_hh, int_hexagon_A2_addh_h16_sat_hh>;
+
+//Rd=sub(Rt.[LH],Rs.[LH]):sat:<<16
+def : T_RR_pat <A2_subh_h16_sat_ll, int_hexagon_A2_subh_h16_sat_ll>;
+def : T_RR_pat <A2_subh_h16_sat_lh, int_hexagon_A2_subh_h16_sat_lh>;
+def : T_RR_pat <A2_subh_h16_sat_hl, int_hexagon_A2_subh_h16_sat_hl>;
+def : T_RR_pat <A2_subh_h16_sat_hh, int_hexagon_A2_subh_h16_sat_hh>;
+
+// ALU64 / ALU / min max
+def : T_RR_pat<A2_max,  int_hexagon_A2_max>;
+def : T_RR_pat<A2_min,  int_hexagon_A2_min>;
+def : T_RR_pat<A2_maxu, int_hexagon_A2_maxu>;
+def : T_RR_pat<A2_minu, int_hexagon_A2_minu>;
+
+// Shift and accumulate
+def : T_RRI_pat <S2_asr_i_r_nac,  int_hexagon_S2_asr_i_r_nac>;
+def : T_RRI_pat <S2_lsr_i_r_nac,  int_hexagon_S2_lsr_i_r_nac>;
+def : T_RRI_pat <S2_asl_i_r_nac,  int_hexagon_S2_asl_i_r_nac>;
+def : T_RRI_pat <S2_asr_i_r_acc,  int_hexagon_S2_asr_i_r_acc>;
+def : T_RRI_pat <S2_lsr_i_r_acc,  int_hexagon_S2_lsr_i_r_acc>;
+def : T_RRI_pat <S2_asl_i_r_acc,  int_hexagon_S2_asl_i_r_acc>;
+
+def : T_RRI_pat <S2_asr_i_r_and,  int_hexagon_S2_asr_i_r_and>;
+def : T_RRI_pat <S2_lsr_i_r_and,  int_hexagon_S2_lsr_i_r_and>;
+def : T_RRI_pat <S2_asl_i_r_and,  int_hexagon_S2_asl_i_r_and>;
+def : T_RRI_pat <S2_asr_i_r_or,   int_hexagon_S2_asr_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_or,   int_hexagon_S2_lsr_i_r_or>;
+def : T_RRI_pat <S2_asl_i_r_or,   int_hexagon_S2_asl_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_xacc, int_hexagon_S2_lsr_i_r_xacc>;
+def : T_RRI_pat <S2_asl_i_r_xacc, int_hexagon_S2_asl_i_r_xacc>;
+
+def : T_PPI_pat <S2_asr_i_p_nac,  int_hexagon_S2_asr_i_p_nac>;
+def : T_PPI_pat <S2_lsr_i_p_nac,  int_hexagon_S2_lsr_i_p_nac>;
+def : T_PPI_pat <S2_asl_i_p_nac,  int_hexagon_S2_asl_i_p_nac>;
+def : T_PPI_pat <S2_asr_i_p_acc,  int_hexagon_S2_asr_i_p_acc>;
+def : T_PPI_pat <S2_lsr_i_p_acc,  int_hexagon_S2_lsr_i_p_acc>;
+def : T_PPI_pat <S2_asl_i_p_acc,  int_hexagon_S2_asl_i_p_acc>;
+
+def : T_PPI_pat <S2_asr_i_p_and,  int_hexagon_S2_asr_i_p_and>;
+def : T_PPI_pat <S2_lsr_i_p_and,  int_hexagon_S2_lsr_i_p_and>;
+def : T_PPI_pat <S2_asl_i_p_and,  int_hexagon_S2_asl_i_p_and>;
+def : T_PPI_pat <S2_asr_i_p_or,   int_hexagon_S2_asr_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_or,   int_hexagon_S2_lsr_i_p_or>;
+def : T_PPI_pat <S2_asl_i_p_or,   int_hexagon_S2_asl_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_xacc, int_hexagon_S2_lsr_i_p_xacc>;
+def : T_PPI_pat <S2_asl_i_p_xacc, int_hexagon_S2_asl_i_p_xacc>;
+
+def : T_RRR_pat <S2_asr_r_r_nac,  int_hexagon_S2_asr_r_r_nac>;
+def : T_RRR_pat <S2_lsr_r_r_nac,  int_hexagon_S2_lsr_r_r_nac>;
+def : T_RRR_pat <S2_asl_r_r_nac,  int_hexagon_S2_asl_r_r_nac>;
+def : T_RRR_pat <S2_lsl_r_r_nac,  int_hexagon_S2_lsl_r_r_nac>;
+def : T_RRR_pat <S2_asr_r_r_acc,  int_hexagon_S2_asr_r_r_acc>;
+def : T_RRR_pat <S2_lsr_r_r_acc,  int_hexagon_S2_lsr_r_r_acc>;
+def : T_RRR_pat <S2_asl_r_r_acc,  int_hexagon_S2_asl_r_r_acc>;
+def : T_RRR_pat <S2_lsl_r_r_acc,  int_hexagon_S2_lsl_r_r_acc>;
+
+def : T_RRR_pat <S2_asr_r_r_and,  int_hexagon_S2_asr_r_r_and>;
+def : T_RRR_pat <S2_lsr_r_r_and,  int_hexagon_S2_lsr_r_r_and>;
+def : T_RRR_pat <S2_asl_r_r_and,  int_hexagon_S2_asl_r_r_and>;
+def : T_RRR_pat <S2_lsl_r_r_and,  int_hexagon_S2_lsl_r_r_and>;
+def : T_RRR_pat <S2_asr_r_r_or,   int_hexagon_S2_asr_r_r_or>;
+def : T_RRR_pat <S2_lsr_r_r_or,   int_hexagon_S2_lsr_r_r_or>;
+def : T_RRR_pat <S2_asl_r_r_or,   int_hexagon_S2_asl_r_r_or>;
+def : T_RRR_pat <S2_lsl_r_r_or,   int_hexagon_S2_lsl_r_r_or>;
+
+def : T_PPR_pat <S2_asr_r_p_nac,  int_hexagon_S2_asr_r_p_nac>;
+def : T_PPR_pat <S2_lsr_r_p_nac,  int_hexagon_S2_lsr_r_p_nac>;
+def : T_PPR_pat <S2_asl_r_p_nac,  int_hexagon_S2_asl_r_p_nac>;
+def : T_PPR_pat <S2_lsl_r_p_nac,  int_hexagon_S2_lsl_r_p_nac>;
+def : T_PPR_pat <S2_asr_r_p_acc,  int_hexagon_S2_asr_r_p_acc>;
+def : T_PPR_pat <S2_lsr_r_p_acc,  int_hexagon_S2_lsr_r_p_acc>;
+def : T_PPR_pat <S2_asl_r_p_acc,  int_hexagon_S2_asl_r_p_acc>;
+def : T_PPR_pat <S2_lsl_r_p_acc,  int_hexagon_S2_lsl_r_p_acc>;
+
+def : T_PPR_pat <S2_asr_r_p_and,  int_hexagon_S2_asr_r_p_and>;
+def : T_PPR_pat <S2_lsr_r_p_and,  int_hexagon_S2_lsr_r_p_and>;
+def : T_PPR_pat <S2_asl_r_p_and,  int_hexagon_S2_asl_r_p_and>;
+def : T_PPR_pat <S2_lsl_r_p_and,  int_hexagon_S2_lsl_r_p_and>;
+def : T_PPR_pat <S2_asr_r_p_or,   int_hexagon_S2_asr_r_p_or>;
+def : T_PPR_pat <S2_lsr_r_p_or,   int_hexagon_S2_lsr_r_p_or>;
+def : T_PPR_pat <S2_asl_r_p_or,   int_hexagon_S2_asl_r_p_or>;
+def : T_PPR_pat <S2_lsl_r_p_or,   int_hexagon_S2_lsl_r_p_or>;
+
+def : T_RRI_pat <S2_asr_i_r_nac,  int_hexagon_S2_asr_i_r_nac>;
+def : T_RRI_pat <S2_lsr_i_r_nac,  int_hexagon_S2_lsr_i_r_nac>;
+def : T_RRI_pat <S2_asl_i_r_nac,  int_hexagon_S2_asl_i_r_nac>;
+def : T_RRI_pat <S2_asr_i_r_acc,  int_hexagon_S2_asr_i_r_acc>;
+def : T_RRI_pat <S2_lsr_i_r_acc,  int_hexagon_S2_lsr_i_r_acc>;
+def : T_RRI_pat <S2_asl_i_r_acc,  int_hexagon_S2_asl_i_r_acc>;
+
+def : T_RRI_pat <S2_asr_i_r_and,  int_hexagon_S2_asr_i_r_and>;
+def : T_RRI_pat <S2_lsr_i_r_and,  int_hexagon_S2_lsr_i_r_and>;
+def : T_RRI_pat <S2_asl_i_r_and,  int_hexagon_S2_asl_i_r_and>;
+def : T_RRI_pat <S2_asr_i_r_or,   int_hexagon_S2_asr_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_or,   int_hexagon_S2_lsr_i_r_or>;
+def : T_RRI_pat <S2_asl_i_r_or,   int_hexagon_S2_asl_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_xacc, int_hexagon_S2_lsr_i_r_xacc>;
+def : T_RRI_pat <S2_asl_i_r_xacc, int_hexagon_S2_asl_i_r_xacc>;
+
+def : T_PPI_pat <S2_asr_i_p_nac,  int_hexagon_S2_asr_i_p_nac>;
+def : T_PPI_pat <S2_lsr_i_p_nac,  int_hexagon_S2_lsr_i_p_nac>;
+def : T_PPI_pat <S2_asl_i_p_nac,  int_hexagon_S2_asl_i_p_nac>;
+def : T_PPI_pat <S2_asr_i_p_acc,  int_hexagon_S2_asr_i_p_acc>;
+def : T_PPI_pat <S2_lsr_i_p_acc,  int_hexagon_S2_lsr_i_p_acc>;
+def : T_PPI_pat <S2_asl_i_p_acc,  int_hexagon_S2_asl_i_p_acc>;
+
+def : T_PPI_pat <S2_asr_i_p_and,  int_hexagon_S2_asr_i_p_and>;
+def : T_PPI_pat <S2_lsr_i_p_and,  int_hexagon_S2_lsr_i_p_and>;
+def : T_PPI_pat <S2_asl_i_p_and,  int_hexagon_S2_asl_i_p_and>;
+def : T_PPI_pat <S2_asr_i_p_or,   int_hexagon_S2_asr_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_or,   int_hexagon_S2_lsr_i_p_or>;
+def : T_PPI_pat <S2_asl_i_p_or,   int_hexagon_S2_asl_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_xacc, int_hexagon_S2_lsr_i_p_xacc>;
+def : T_PPI_pat <S2_asl_i_p_xacc, int_hexagon_S2_asl_i_p_xacc>;
+
+def : T_RRR_pat <S2_asr_r_r_nac,  int_hexagon_S2_asr_r_r_nac>;
+def : T_RRR_pat <S2_lsr_r_r_nac,  int_hexagon_S2_lsr_r_r_nac>;
+def : T_RRR_pat <S2_asl_r_r_nac,  int_hexagon_S2_asl_r_r_nac>;
+def : T_RRR_pat <S2_lsl_r_r_nac,  int_hexagon_S2_lsl_r_r_nac>;
+def : T_RRR_pat <S2_asr_r_r_acc,  int_hexagon_S2_asr_r_r_acc>;
+def : T_RRR_pat <S2_lsr_r_r_acc,  int_hexagon_S2_lsr_r_r_acc>;
+def : T_RRR_pat <S2_asl_r_r_acc,  int_hexagon_S2_asl_r_r_acc>;
+def : T_RRR_pat <S2_lsl_r_r_acc,  int_hexagon_S2_lsl_r_r_acc>;
+
+def : T_RRR_pat <S2_asr_r_r_and,  int_hexagon_S2_asr_r_r_and>;
+def : T_RRR_pat <S2_lsr_r_r_and,  int_hexagon_S2_lsr_r_r_and>;
+def : T_RRR_pat <S2_asl_r_r_and,  int_hexagon_S2_asl_r_r_and>;
+def : T_RRR_pat <S2_lsl_r_r_and,  int_hexagon_S2_lsl_r_r_and>;
+def : T_RRR_pat <S2_asr_r_r_or,   int_hexagon_S2_asr_r_r_or>;
+def : T_RRR_pat <S2_lsr_r_r_or,   int_hexagon_S2_lsr_r_r_or>;
+def : T_RRR_pat <S2_asl_r_r_or,   int_hexagon_S2_asl_r_r_or>;
+def : T_RRR_pat <S2_lsl_r_r_or,   int_hexagon_S2_lsl_r_r_or>;
+
+def : T_PPR_pat <S2_asr_r_p_nac,  int_hexagon_S2_asr_r_p_nac>;
+def : T_PPR_pat <S2_lsr_r_p_nac,  int_hexagon_S2_lsr_r_p_nac>;
+def : T_PPR_pat <S2_asl_r_p_nac,  int_hexagon_S2_asl_r_p_nac>;
+def : T_PPR_pat <S2_lsl_r_p_nac,  int_hexagon_S2_lsl_r_p_nac>;
+def : T_PPR_pat <S2_asr_r_p_acc,  int_hexagon_S2_asr_r_p_acc>;
+def : T_PPR_pat <S2_lsr_r_p_acc,  int_hexagon_S2_lsr_r_p_acc>;
+def : T_PPR_pat <S2_asl_r_p_acc,  int_hexagon_S2_asl_r_p_acc>;
+def : T_PPR_pat <S2_lsl_r_p_acc,  int_hexagon_S2_lsl_r_p_acc>;
+
+def : T_PPR_pat <S2_asr_r_p_and,  int_hexagon_S2_asr_r_p_and>;
+def : T_PPR_pat <S2_lsr_r_p_and,  int_hexagon_S2_lsr_r_p_and>;
+def : T_PPR_pat <S2_asl_r_p_and,  int_hexagon_S2_asl_r_p_and>;
+def : T_PPR_pat <S2_lsl_r_p_and,  int_hexagon_S2_lsl_r_p_and>;
+def : T_PPR_pat <S2_asr_r_p_or,   int_hexagon_S2_asr_r_p_or>;
+def : T_PPR_pat <S2_lsr_r_p_or,   int_hexagon_S2_lsr_r_p_or>;
+def : T_PPR_pat <S2_asl_r_p_or,   int_hexagon_S2_asl_r_p_or>;
+def : T_PPR_pat <S2_lsl_r_p_or,   int_hexagon_S2_lsl_r_p_or>;
 
 /********************************************************************
-*            CR                                                     *
+*            ALU32/ALU                                              *
 *********************************************************************/
+def : T_RR_pat<A2_add,      int_hexagon_A2_add>;
+def : T_RI_pat<A2_addi,     int_hexagon_A2_addi>;
+def : T_RR_pat<A2_sub,      int_hexagon_A2_sub>;
+def : T_IR_pat<A2_subri,    int_hexagon_A2_subri>;
+def : T_RR_pat<A2_and,      int_hexagon_A2_and>;
+def : T_RI_pat<A2_andir,    int_hexagon_A2_andir>;
+def : T_RR_pat<A2_or,       int_hexagon_A2_or>;
+def : T_RI_pat<A2_orir,     int_hexagon_A2_orir>;
+def : T_RR_pat<A2_xor,      int_hexagon_A2_xor>;
+def : T_RR_pat<A2_combinew, int_hexagon_A2_combinew>;
+
+// Assembler mapped from Rd32=not(Rs32) to Rd32=sub(#-1,Rs32)
+def : Pat <(int_hexagon_A2_not (I32:$Rs)),
+           (A2_subri -1, IntRegs:$Rs)>;
+
+// Assembler mapped from Rd32=neg(Rs32) to Rd32=sub(#0,Rs32)
+def : Pat <(int_hexagon_A2_neg IntRegs:$Rs),
+           (A2_subri 0, IntRegs:$Rs)>;
+
+// Transfer immediate
+def  : Pat <(int_hexagon_A2_tfril (I32:$Rs), u16_0ImmPred:$Is),
+            (A2_tfril IntRegs:$Rs, u16_0ImmPred:$Is)>;
+def  : Pat <(int_hexagon_A2_tfrih (I32:$Rs), u16_0ImmPred:$Is),
+            (A2_tfrih IntRegs:$Rs, u16_0ImmPred:$Is)>;
+
+//  Transfer Register/immediate.
+def : T_R_pat <A2_tfr, int_hexagon_A2_tfr>;
+def : T_I_pat <A2_tfrsi, int_hexagon_A2_tfrsi>;
+
+// Assembler mapped from Rdd32=Rss32 to Rdd32=combine(Rss.H32,Rss.L32)
+def : Pat<(int_hexagon_A2_tfrp DoubleRegs:$src),
+          (A2_combinew (HiReg DoubleRegs:$src), (LoReg DoubleRegs:$src))>;
 
-// CR / Logical reductions on predicates.
-def HEXAGON_C2_all8:
-  qi_SInst_qi                     <"all8",     int_hexagon_C2_all8>;
-def HEXAGON_C2_any8:
-  qi_SInst_qi                     <"any8",     int_hexagon_C2_any8>;
-
-// CR / Logical operations on predicates.
-def HEXAGON_C2_pxfer_map:
-  qi_SInst_qi_pxfer               <"",         int_hexagon_C2_pxfer_map>;
-def HEXAGON_C2_and:
-  qi_SInst_qiqi                   <"and",      int_hexagon_C2_and>;
-def HEXAGON_C2_andn:
-  qi_SInst_qiqi_neg               <"and",      int_hexagon_C2_andn>;
-def HEXAGON_C2_not:
-  qi_SInst_qi                     <"not",      int_hexagon_C2_not>;
-def HEXAGON_C2_or:
-  qi_SInst_qiqi                   <"or",       int_hexagon_C2_or>;
-def HEXAGON_C2_orn:
-  qi_SInst_qiqi_neg               <"or",       int_hexagon_C2_orn>;
-def HEXAGON_C2_xor:
-  qi_SInst_qiqi                   <"xor",      int_hexagon_C2_xor>;
-
+/********************************************************************
+*            ALU32/PERM                                             *
+*********************************************************************/
+// Combine
+def: T_RR_pat<A2_combine_hh, int_hexagon_A2_combine_hh>;
+def: T_RR_pat<A2_combine_hl, int_hexagon_A2_combine_hl>;
+def: T_RR_pat<A2_combine_lh, int_hexagon_A2_combine_lh>;
+def: T_RR_pat<A2_combine_ll, int_hexagon_A2_combine_ll>;
+
+def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s8ExtPred, s8ImmPred>;
+
+def: Pat<(i32 (int_hexagon_C2_mux (I32:$Rp), (I32:$Rs),
+                                                     (I32:$Rt))),
+         (i32 (C2_mux (C2_tfrrp IntRegs:$Rp), IntRegs:$Rs, IntRegs:$Rt))>;
+
+// Mux
+def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s8ExtPred>;
+def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s8ExtPred>;
+def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s8ExtPred, s8ImmPred>;
+
+// Shift halfword
+def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>;
+def : T_R_pat<A2_asrh, int_hexagon_A2_asrh>;
+def : T_R_pat<A2_asrh, int_hexagon_SI_to_SXTHI_asrh>;
+
+// Sign/zero extend
+def : T_R_pat<A2_sxth, int_hexagon_A2_sxth>;
+def : T_R_pat<A2_sxtb, int_hexagon_A2_sxtb>;
+def : T_R_pat<A2_zxth, int_hexagon_A2_zxth>;
+def : T_R_pat<A2_zxtb, int_hexagon_A2_zxtb>;
 
 /********************************************************************
-*            MTYPE/ALU                                              *
+*            ALU32/PRED                                             *
 *********************************************************************/
+// Compare
+def : T_RR_pat<C2_cmpeq,  int_hexagon_C2_cmpeq>;
+def : T_RR_pat<C2_cmpgt,  int_hexagon_C2_cmpgt>;
+def : T_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>;
+
+def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s10ExtPred>;
+def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s10ExtPred>;
+def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u9ExtPred>;
 
-// MTYPE / ALU / Add and accumulate.
-def HEXAGON_M2_acci:
-  si_MInst_sisisi_acc             <"add",      int_hexagon_M2_acci>;
-def HEXAGON_M2_accii:
-  si_MInst_sisis8_acc             <"add",      int_hexagon_M2_accii>;
-def HEXAGON_M2_nacci:
-  si_MInst_sisisi_nac             <"add",      int_hexagon_M2_nacci>;
-def HEXAGON_M2_naccii:
-  si_MInst_sisis8_nac             <"add",      int_hexagon_M2_naccii>;
+def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s8ExtPred:$src2)),
+      (i32 (C2_cmpgti (I32:$src1),
+                      (DEC_CONST_SIGNED s8ExtPred:$src2)))>;
 
-// MTYPE / ALU / Subtract and accumulate.
-def HEXAGON_M2_subacc:
-  si_MInst_sisisi_acc             <"sub",      int_hexagon_M2_subacc>;
+def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u8ExtPred:$src2)),
+      (i32 (C2_cmpgtui (I32:$src1),
+                       (DEC_CONST_UNSIGNED u8ExtPred:$src2)))>;
 
-// MTYPE / ALU / Vector absolute difference.
-def HEXAGON_M2_vabsdiffh:
-  di_MInst_didi                   <"vabsdiffh",int_hexagon_M2_vabsdiffh>;
-def HEXAGON_M2_vabsdiffw:
-  di_MInst_didi                   <"vabsdiffw",int_hexagon_M2_vabsdiffw>;
+// The instruction, Pd=cmp.geu(Rs, #u8) -> Pd=cmp.eq(Rs,Rs) when #u8 == 0.
+def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), 0)),
+      (i32 (C2_cmpeq (I32:$src1), (I32:$src1)))>;
 
-// MTYPE / ALU / XOR and xor with destination.
-def HEXAGON_M2_xor_xacc:
-  si_MInst_sisisi_xacc            <"xor",      int_hexagon_M2_xor_xacc>;
+def : Pat <(i32 (int_hexagon_C2_cmplt (I32:$src1),
+                                      (I32:$src2))),
+      (i32 (C2_cmpgt (I32:$src2), (I32:$src1)))>;
 
+def : Pat <(i32 (int_hexagon_C2_cmpltu (I32:$src1),
+                                       (I32:$src2))),
+      (i32 (C2_cmpgtu (I32:$src2), (I32:$src1)))>;
 
 /********************************************************************
-*            MTYPE/COMPLEX                                          *
+*            ALU32/VH                                               *
 *********************************************************************/
+// Vector add, subtract, average halfwords
+def: T_RR_pat<A2_svaddh,   int_hexagon_A2_svaddh>;
+def: T_RR_pat<A2_svaddhs,  int_hexagon_A2_svaddhs>;
+def: T_RR_pat<A2_svadduhs, int_hexagon_A2_svadduhs>;
 
-// MTYPE / COMPLEX / Complex multiply.
-// Rdd[-+]=cmpy(Rs, Rt:<<1]:sat
-def HEXAGON_M2_cmpys_s1:
-  di_MInst_sisi_s1_sat            <"cmpy",     int_hexagon_M2_cmpys_s1>;
-def HEXAGON_M2_cmpys_s0:
-  di_MInst_sisi_sat               <"cmpy",     int_hexagon_M2_cmpys_s0>;
-def HEXAGON_M2_cmpysc_s1:
-  di_MInst_sisi_s1_sat_conj       <"cmpy",     int_hexagon_M2_cmpysc_s1>;
-def HEXAGON_M2_cmpysc_s0:
-  di_MInst_sisi_sat_conj          <"cmpy",     int_hexagon_M2_cmpysc_s0>;
-
-def HEXAGON_M2_cmacs_s1:
-  di_MInst_disisi_acc_s1_sat      <"cmpy",     int_hexagon_M2_cmacs_s1>;
-def HEXAGON_M2_cmacs_s0:
-  di_MInst_disisi_acc_sat         <"cmpy",     int_hexagon_M2_cmacs_s0>;
-def HEXAGON_M2_cmacsc_s1:
-  di_MInst_disisi_acc_s1_sat_conj <"cmpy",     int_hexagon_M2_cmacsc_s1>;
-def HEXAGON_M2_cmacsc_s0:
-  di_MInst_disisi_acc_sat_conj    <"cmpy",     int_hexagon_M2_cmacsc_s0>;
-
-def HEXAGON_M2_cnacs_s1:
-  di_MInst_disisi_nac_s1_sat      <"cmpy",     int_hexagon_M2_cnacs_s1>;
-def HEXAGON_M2_cnacs_s0:
-  di_MInst_disisi_nac_sat         <"cmpy",     int_hexagon_M2_cnacs_s0>;
-def HEXAGON_M2_cnacsc_s1:
-  di_MInst_disisi_nac_s1_sat_conj <"cmpy",     int_hexagon_M2_cnacsc_s1>;
-def HEXAGON_M2_cnacsc_s0:
-  di_MInst_disisi_nac_sat_conj    <"cmpy",     int_hexagon_M2_cnacsc_s0>;
-
-// MTYPE / COMPLEX / Complex multiply real or imaginary.
-def HEXAGON_M2_cmpyr_s0:
-  di_MInst_sisi                   <"cmpyr",    int_hexagon_M2_cmpyr_s0>;
-def HEXAGON_M2_cmacr_s0:
-  di_MInst_disisi_acc             <"cmpyr",    int_hexagon_M2_cmacr_s0>;
-
-def HEXAGON_M2_cmpyi_s0:
-  di_MInst_sisi                   <"cmpyi",    int_hexagon_M2_cmpyi_s0>;
-def HEXAGON_M2_cmaci_s0:
-  di_MInst_disisi_acc             <"cmpyi",    int_hexagon_M2_cmaci_s0>;
-
-// MTYPE / COMPLEX / Complex multiply with round and pack.
-// Rxx32+=cmpy(Rs32,[*]Rt32:<<1]:rnd:sat
-def HEXAGON_M2_cmpyrs_s0:
-  si_MInst_sisi_rnd_sat           <"cmpy",     int_hexagon_M2_cmpyrs_s0>;
-def HEXAGON_M2_cmpyrs_s1:
-  si_MInst_sisi_s1_rnd_sat        <"cmpy",     int_hexagon_M2_cmpyrs_s1>;
-
-def HEXAGON_M2_cmpyrsc_s0:
-  si_MInst_sisi_rnd_sat_conj      <"cmpy",     int_hexagon_M2_cmpyrsc_s0>;
-def HEXAGON_M2_cmpyrsc_s1:
-  si_MInst_sisi_s1_rnd_sat_conj   <"cmpy",     int_hexagon_M2_cmpyrsc_s1>;
-
-//MTYPE / COMPLEX / Vector complex multiply real or imaginary.
-def HEXAGON_M2_vcmpy_s0_sat_i:
-  di_MInst_didi_sat               <"vcmpyi",   int_hexagon_M2_vcmpy_s0_sat_i>;
-def HEXAGON_M2_vcmpy_s1_sat_i:
-  di_MInst_didi_s1_sat            <"vcmpyi",   int_hexagon_M2_vcmpy_s1_sat_i>;
-
-def HEXAGON_M2_vcmpy_s0_sat_r:
-  di_MInst_didi_sat               <"vcmpyr",   int_hexagon_M2_vcmpy_s0_sat_r>;
-def HEXAGON_M2_vcmpy_s1_sat_r:
-  di_MInst_didi_s1_sat            <"vcmpyr",   int_hexagon_M2_vcmpy_s1_sat_r>;
-
-def HEXAGON_M2_vcmac_s0_sat_i:
-  di_MInst_dididi_acc_sat         <"vcmpyi",   int_hexagon_M2_vcmac_s0_sat_i>;
-def HEXAGON_M2_vcmac_s0_sat_r:
-  di_MInst_dididi_acc_sat         <"vcmpyr",   int_hexagon_M2_vcmac_s0_sat_r>;
-
-//MTYPE / COMPLEX / Vector reduce complex multiply real or imaginary.
-def HEXAGON_M2_vrcmpyi_s0:
-  di_MInst_didi                   <"vrcmpyi",  int_hexagon_M2_vrcmpyi_s0>;
-def HEXAGON_M2_vrcmpyr_s0:
-  di_MInst_didi                   <"vrcmpyr",  int_hexagon_M2_vrcmpyr_s0>;
-
-def HEXAGON_M2_vrcmpyi_s0c:
-  di_MInst_didi_conj              <"vrcmpyi",  int_hexagon_M2_vrcmpyi_s0c>;
-def HEXAGON_M2_vrcmpyr_s0c:
-  di_MInst_didi_conj              <"vrcmpyr",  int_hexagon_M2_vrcmpyr_s0c>;
-
-def HEXAGON_M2_vrcmaci_s0:
-  di_MInst_dididi_acc             <"vrcmpyi",  int_hexagon_M2_vrcmaci_s0>;
-def HEXAGON_M2_vrcmacr_s0:
-  di_MInst_dididi_acc             <"vrcmpyr",  int_hexagon_M2_vrcmacr_s0>;
-
-def HEXAGON_M2_vrcmaci_s0c:
-  di_MInst_dididi_acc_conj        <"vrcmpyi",  int_hexagon_M2_vrcmaci_s0c>;
-def HEXAGON_M2_vrcmacr_s0c:
-  di_MInst_dididi_acc_conj        <"vrcmpyr",  int_hexagon_M2_vrcmacr_s0c>;
+def: T_RR_pat<A2_svsubh,   int_hexagon_A2_svsubh>;
+def: T_RR_pat<A2_svsubhs,  int_hexagon_A2_svsubhs>;
+def: T_RR_pat<A2_svsubuhs, int_hexagon_A2_svsubuhs>;
 
+def: T_RR_pat<A2_svavgh,   int_hexagon_A2_svavgh>;
+def: T_RR_pat<A2_svavghs,  int_hexagon_A2_svavghs>;
+def: T_RR_pat<A2_svnavgh,  int_hexagon_A2_svnavgh>;
 
 /********************************************************************
-*            MTYPE/MPYH                                             *
+*            ALU64/ALU                                              *
 *********************************************************************/
+def: T_RR_pat<A2_addsat,   int_hexagon_A2_addsat>;
+def: T_RR_pat<A2_subsat,   int_hexagon_A2_subsat>;
+def: T_PP_pat<A2_addp,     int_hexagon_A2_addp>;
+def: T_PP_pat<A2_subp,     int_hexagon_A2_subp>;
+
+def: T_PP_pat<A2_andp,     int_hexagon_A2_andp>;
+def: T_PP_pat<A2_orp,      int_hexagon_A2_orp>;
+def: T_PP_pat<A2_xorp,     int_hexagon_A2_xorp>;
 
-// MTYPE / MPYH / Multiply and use lower result.
-//def HEXAGON_M2_mpysmi:
-//FIXME: Hexagon_M2_mpysmi should really by of the type si_MInst_sim9,
-// not si_MInst_sis9 - but for now, we will use s9.
-// def Hexagon_M2_mpysmi:
-//  si_MInst_sim9                   <"mpyi",     int_hexagon_M2_mpysmi>;
-def Hexagon_M2_mpysmi:
-  si_MInst_sis9                   <"mpyi",     int_hexagon_M2_mpysmi>;
-def HEXAGON_M2_mpyi:
-  si_MInst_sisi                   <"mpyi",     int_hexagon_M2_mpyi>;
-def HEXAGON_M2_mpyui:
-  si_MInst_sisi                   <"mpyui",    int_hexagon_M2_mpyui>;
-def HEXAGON_M2_macsip:
-  si_MInst_sisiu8_acc             <"mpyi",     int_hexagon_M2_macsip>;
-def HEXAGON_M2_maci:
-  si_MInst_sisisi_acc             <"mpyi",     int_hexagon_M2_maci>;
-def HEXAGON_M2_macsin:
-  si_MInst_sisiu8_nac             <"mpyi",     int_hexagon_M2_macsin>;
-
-// MTYPE / MPYH / Multiply word by half (32x16).
-//Rdd[+]=vmpywoh(Rss,Rtt)[:<<1][:rnd][:sat]
-//Rdd[+]=vmpyweh(Rss,Rtt)[:<<1][:rnd][:sat]
-def HEXAGON_M2_mmpyl_rs1:
-  di_MInst_didi_s1_rnd_sat        <"vmpyweh",  int_hexagon_M2_mmpyl_rs1>;
-def HEXAGON_M2_mmpyl_s1:
-  di_MInst_didi_s1_sat            <"vmpyweh",  int_hexagon_M2_mmpyl_s1>;
-def HEXAGON_M2_mmpyl_rs0:
-  di_MInst_didi_rnd_sat           <"vmpyweh",  int_hexagon_M2_mmpyl_rs0>;
-def HEXAGON_M2_mmpyl_s0:
-  di_MInst_didi_sat               <"vmpyweh",  int_hexagon_M2_mmpyl_s0>;
-def HEXAGON_M2_mmpyh_rs1:
-  di_MInst_didi_s1_rnd_sat        <"vmpywoh",  int_hexagon_M2_mmpyh_rs1>;
-def HEXAGON_M2_mmpyh_s1:
-  di_MInst_didi_s1_sat            <"vmpywoh",  int_hexagon_M2_mmpyh_s1>;
-def HEXAGON_M2_mmpyh_rs0:
-  di_MInst_didi_rnd_sat           <"vmpywoh",  int_hexagon_M2_mmpyh_rs0>;
-def HEXAGON_M2_mmpyh_s0:
-  di_MInst_didi_sat               <"vmpywoh",  int_hexagon_M2_mmpyh_s0>;
-def HEXAGON_M2_mmacls_rs1:
-  di_MInst_dididi_acc_s1_rnd_sat  <"vmpyweh",  int_hexagon_M2_mmacls_rs1>;
-def HEXAGON_M2_mmacls_s1:
-  di_MInst_dididi_acc_s1_sat      <"vmpyweh",  int_hexagon_M2_mmacls_s1>;
-def HEXAGON_M2_mmacls_rs0:
-  di_MInst_dididi_acc_rnd_sat     <"vmpyweh",  int_hexagon_M2_mmacls_rs0>;
-def HEXAGON_M2_mmacls_s0:
-  di_MInst_dididi_acc_sat         <"vmpyweh",  int_hexagon_M2_mmacls_s0>;
-def HEXAGON_M2_mmachs_rs1:
-  di_MInst_dididi_acc_s1_rnd_sat  <"vmpywoh",  int_hexagon_M2_mmachs_rs1>;
-def HEXAGON_M2_mmachs_s1:
-  di_MInst_dididi_acc_s1_sat      <"vmpywoh",  int_hexagon_M2_mmachs_s1>;
-def HEXAGON_M2_mmachs_rs0:
-  di_MInst_dididi_acc_rnd_sat     <"vmpywoh",  int_hexagon_M2_mmachs_rs0>;
-def HEXAGON_M2_mmachs_s0:
-  di_MInst_dididi_acc_sat         <"vmpywoh",  int_hexagon_M2_mmachs_s0>;
-
-// MTYPE / MPYH / Multiply word by unsigned half (32x16).
-//Rdd[+]=vmpywouh(Rss,Rtt)[:<<1][:rnd][:sat]
-//Rdd[+]=vmpyweuh(Rss,Rtt)[:<<1][:rnd][:sat]
-def HEXAGON_M2_mmpyul_rs1:
-  di_MInst_didi_s1_rnd_sat        <"vmpyweuh", int_hexagon_M2_mmpyul_rs1>;
-def HEXAGON_M2_mmpyul_s1:
-  di_MInst_didi_s1_sat            <"vmpyweuh", int_hexagon_M2_mmpyul_s1>;
-def HEXAGON_M2_mmpyul_rs0:
-  di_MInst_didi_rnd_sat           <"vmpyweuh", int_hexagon_M2_mmpyul_rs0>;
-def HEXAGON_M2_mmpyul_s0:
-  di_MInst_didi_sat               <"vmpyweuh", int_hexagon_M2_mmpyul_s0>;
-def HEXAGON_M2_mmpyuh_rs1:
-  di_MInst_didi_s1_rnd_sat        <"vmpywouh", int_hexagon_M2_mmpyuh_rs1>;
-def HEXAGON_M2_mmpyuh_s1:
-  di_MInst_didi_s1_sat            <"vmpywouh", int_hexagon_M2_mmpyuh_s1>;
-def HEXAGON_M2_mmpyuh_rs0:
-  di_MInst_didi_rnd_sat           <"vmpywouh", int_hexagon_M2_mmpyuh_rs0>;
-def HEXAGON_M2_mmpyuh_s0:
-  di_MInst_didi_sat               <"vmpywouh", int_hexagon_M2_mmpyuh_s0>;
-def HEXAGON_M2_mmaculs_rs1:
-  di_MInst_dididi_acc_s1_rnd_sat  <"vmpyweuh", int_hexagon_M2_mmaculs_rs1>;
-def HEXAGON_M2_mmaculs_s1:
-  di_MInst_dididi_acc_s1_sat      <"vmpyweuh", int_hexagon_M2_mmaculs_s1>;
-def HEXAGON_M2_mmaculs_rs0:
-  di_MInst_dididi_acc_rnd_sat     <"vmpyweuh", int_hexagon_M2_mmaculs_rs0>;
-def HEXAGON_M2_mmaculs_s0:
-  di_MInst_dididi_acc_sat         <"vmpyweuh", int_hexagon_M2_mmaculs_s0>;
-def HEXAGON_M2_mmacuhs_rs1:
-  di_MInst_dididi_acc_s1_rnd_sat  <"vmpywouh", int_hexagon_M2_mmacuhs_rs1>;
-def HEXAGON_M2_mmacuhs_s1:
-  di_MInst_dididi_acc_s1_sat      <"vmpywouh", int_hexagon_M2_mmacuhs_s1>;
-def HEXAGON_M2_mmacuhs_rs0:
-  di_MInst_dididi_acc_rnd_sat     <"vmpywouh", int_hexagon_M2_mmacuhs_rs0>;
-def HEXAGON_M2_mmacuhs_s0:
-  di_MInst_dididi_acc_sat         <"vmpywouh", int_hexagon_M2_mmacuhs_s0>;
-
-// MTYPE / MPYH / Multiply and use upper result.
-def HEXAGON_M2_hmmpyh_rs1:
-  si_MInst_sisi_h_s1_rnd_sat      <"mpy",      int_hexagon_M2_hmmpyh_rs1>;
-def HEXAGON_M2_hmmpyl_rs1:
-  si_MInst_sisi_l_s1_rnd_sat      <"mpy",      int_hexagon_M2_hmmpyl_rs1>;
-def HEXAGON_M2_mpy_up:
-  si_MInst_sisi                   <"mpy",      int_hexagon_M2_mpy_up>;
-def HEXAGON_M2_dpmpyss_rnd_s0:
-  si_MInst_sisi_rnd               <"mpy",      int_hexagon_M2_dpmpyss_rnd_s0>;
-def HEXAGON_M2_mpyu_up:
-  si_MInst_sisi                   <"mpyu",     int_hexagon_M2_mpyu_up>;
-
-// MTYPE / MPYH / Multiply and use full result.
-def HEXAGON_M2_dpmpyuu_s0:
-  di_MInst_sisi                   <"mpyu",     int_hexagon_M2_dpmpyuu_s0>;
-def HEXAGON_M2_dpmpyuu_acc_s0:
-  di_MInst_disisi_acc             <"mpyu",     int_hexagon_M2_dpmpyuu_acc_s0>;
-def HEXAGON_M2_dpmpyuu_nac_s0:
-  di_MInst_disisi_nac             <"mpyu",     int_hexagon_M2_dpmpyuu_nac_s0>;
-def HEXAGON_M2_dpmpyss_s0:
-  di_MInst_sisi                   <"mpy",      int_hexagon_M2_dpmpyss_s0>;
-def HEXAGON_M2_dpmpyss_acc_s0:
-  di_MInst_disisi_acc             <"mpy",      int_hexagon_M2_dpmpyss_acc_s0>;
-def HEXAGON_M2_dpmpyss_nac_s0:
-  di_MInst_disisi_nac             <"mpy",      int_hexagon_M2_dpmpyss_nac_s0>;
+def: T_PP_pat<C2_cmpeqp,   int_hexagon_C2_cmpeqp>;
+def: T_PP_pat<C2_cmpgtp,   int_hexagon_C2_cmpgtp>;
+def: T_PP_pat<C2_cmpgtup,  int_hexagon_C2_cmpgtup>;
 
+def: T_PP_pat<S2_parityp,  int_hexagon_S2_parityp>;
+def: T_RR_pat<S2_packhl,   int_hexagon_S2_packhl>;
 
 /********************************************************************
-*            MTYPE/MPYS                                             *
+*            ALU64/VB                                               *
 *********************************************************************/
+// ALU64 - Vector add
+def : T_PP_pat <A2_vaddub,   int_hexagon_A2_vaddub>;
+def : T_PP_pat <A2_vaddubs,  int_hexagon_A2_vaddubs>;
+def : T_PP_pat <A2_vaddh,    int_hexagon_A2_vaddh>;
+def : T_PP_pat <A2_vaddhs,   int_hexagon_A2_vaddhs>;
+def : T_PP_pat <A2_vadduhs,  int_hexagon_A2_vadduhs>;
+def : T_PP_pat <A2_vaddw,    int_hexagon_A2_vaddw>;
+def : T_PP_pat <A2_vaddws,   int_hexagon_A2_vaddws>;
+
+// ALU64 - Vector average
+def : T_PP_pat <A2_vavgub,   int_hexagon_A2_vavgub>;
+def : T_PP_pat <A2_vavgubr,  int_hexagon_A2_vavgubr>;
+def : T_PP_pat <A2_vavgh,    int_hexagon_A2_vavgh>;
+def : T_PP_pat <A2_vavghr,   int_hexagon_A2_vavghr>;
+def : T_PP_pat <A2_vavghcr,  int_hexagon_A2_vavghcr>;
+def : T_PP_pat <A2_vavguh,   int_hexagon_A2_vavguh>;
+def : T_PP_pat <A2_vavguhr,  int_hexagon_A2_vavguhr>;
+
+def : T_PP_pat <A2_vavgw,    int_hexagon_A2_vavgw>;
+def : T_PP_pat <A2_vavgwr,   int_hexagon_A2_vavgwr>;
+def : T_PP_pat <A2_vavgwcr,  int_hexagon_A2_vavgwcr>;
+def : T_PP_pat <A2_vavguw,   int_hexagon_A2_vavguw>;
+def : T_PP_pat <A2_vavguwr,  int_hexagon_A2_vavguwr>;
+
+// ALU64 - Vector negative average
+def : T_PP_pat <A2_vnavgh,   int_hexagon_A2_vnavgh>;
+def : T_PP_pat <A2_vnavghr,  int_hexagon_A2_vnavghr>;
+def : T_PP_pat <A2_vnavghcr, int_hexagon_A2_vnavghcr>;
+def : T_PP_pat <A2_vnavgw,   int_hexagon_A2_vnavgw>;
+def : T_PP_pat <A2_vnavgwr,  int_hexagon_A2_vnavgwr>;
+def : T_PP_pat <A2_vnavgwcr, int_hexagon_A2_vnavgwcr>;
+
+// ALU64 - Vector max
+def : T_PP_pat <A2_vmaxh,    int_hexagon_A2_vmaxh>;
+def : T_PP_pat <A2_vmaxw,    int_hexagon_A2_vmaxw>;
+def : T_PP_pat <A2_vmaxub,   int_hexagon_A2_vmaxub>;
+def : T_PP_pat <A2_vmaxuh,   int_hexagon_A2_vmaxuh>;
+def : T_PP_pat <A2_vmaxuw,   int_hexagon_A2_vmaxuw>;
+
+// ALU64 - Vector min
+def : T_PP_pat <A2_vminh,    int_hexagon_A2_vminh>;
+def : T_PP_pat <A2_vminw,    int_hexagon_A2_vminw>;
+def : T_PP_pat <A2_vminub,   int_hexagon_A2_vminub>;
+def : T_PP_pat <A2_vminuh,   int_hexagon_A2_vminuh>;
+def : T_PP_pat <A2_vminuw,   int_hexagon_A2_vminuw>;
+
+// ALU64 - Vector sub
+def : T_PP_pat <A2_vsubub,   int_hexagon_A2_vsubub>;
+def : T_PP_pat <A2_vsububs,  int_hexagon_A2_vsububs>;
+def : T_PP_pat <A2_vsubh,    int_hexagon_A2_vsubh>;
+def : T_PP_pat <A2_vsubhs,   int_hexagon_A2_vsubhs>;
+def : T_PP_pat <A2_vsubuhs,  int_hexagon_A2_vsubuhs>;
+def : T_PP_pat <A2_vsubw,    int_hexagon_A2_vsubw>;
+def : T_PP_pat <A2_vsubws,   int_hexagon_A2_vsubws>;
+
+// ALU64 - Vector compare bytes
+def : T_PP_pat <A2_vcmpbeq,  int_hexagon_A2_vcmpbeq>;
+def : T_PP_pat <A4_vcmpbgt,  int_hexagon_A4_vcmpbgt>;
+def : T_PP_pat <A2_vcmpbgtu, int_hexagon_A2_vcmpbgtu>;
+
+// ALU64 - Vector compare halfwords
+def : T_PP_pat <A2_vcmpheq,  int_hexagon_A2_vcmpheq>;
+def : T_PP_pat <A2_vcmphgt,  int_hexagon_A2_vcmphgt>;
+def : T_PP_pat <A2_vcmphgtu, int_hexagon_A2_vcmphgtu>;
+
+// ALU64 - Vector compare words
+def : T_PP_pat <A2_vcmpweq,  int_hexagon_A2_vcmpweq>;
+def : T_PP_pat <A2_vcmpwgt,  int_hexagon_A2_vcmpwgt>;
+def : T_PP_pat <A2_vcmpwgtu, int_hexagon_A2_vcmpwgtu>;
 
-// MTYPE / MPYS / Scalar 16x16 multiply signed.
-//Rd=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]|
-//          [:<<0[:rnd|:sat|:rnd:sat]|:<<1[:rnd|:sat|:rnd:sat]]]
-def HEXAGON_M2_mpy_hh_s0:
-  si_MInst_sisi_hh                <"mpy",     int_hexagon_M2_mpy_hh_s0>;
-def HEXAGON_M2_mpy_hh_s1:
-  si_MInst_sisi_hh_s1             <"mpy",     int_hexagon_M2_mpy_hh_s1>;
-def HEXAGON_M2_mpy_rnd_hh_s1:
-  si_MInst_sisi_rnd_hh_s1         <"mpy",     int_hexagon_M2_mpy_rnd_hh_s1>;
-def HEXAGON_M2_mpy_sat_rnd_hh_s1:
-  si_MInst_sisi_sat_rnd_hh_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_hh_s1>;
-def HEXAGON_M2_mpy_sat_hh_s1:
-  si_MInst_sisi_sat_hh_s1         <"mpy",     int_hexagon_M2_mpy_sat_hh_s1>;
-def HEXAGON_M2_mpy_rnd_hh_s0:
-  si_MInst_sisi_rnd_hh            <"mpy",     int_hexagon_M2_mpy_rnd_hh_s0>;
-def HEXAGON_M2_mpy_sat_rnd_hh_s0:
-  si_MInst_sisi_sat_rnd_hh        <"mpy",     int_hexagon_M2_mpy_sat_rnd_hh_s0>;
-def HEXAGON_M2_mpy_sat_hh_s0:
-  si_MInst_sisi_sat_hh            <"mpy",     int_hexagon_M2_mpy_sat_hh_s0>;
-
-def HEXAGON_M2_mpy_hl_s0:
-  si_MInst_sisi_hl                <"mpy",     int_hexagon_M2_mpy_hl_s0>;
-def HEXAGON_M2_mpy_hl_s1:
-  si_MInst_sisi_hl_s1             <"mpy",     int_hexagon_M2_mpy_hl_s1>;
-def HEXAGON_M2_mpy_rnd_hl_s1:
-  si_MInst_sisi_rnd_hl_s1         <"mpy",     int_hexagon_M2_mpy_rnd_hl_s1>;
-def HEXAGON_M2_mpy_sat_rnd_hl_s1:
-  si_MInst_sisi_sat_rnd_hl_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_hl_s1>;
-def HEXAGON_M2_mpy_sat_hl_s1:
-  si_MInst_sisi_sat_hl_s1         <"mpy",     int_hexagon_M2_mpy_sat_hl_s1>;
-def HEXAGON_M2_mpy_rnd_hl_s0:
-  si_MInst_sisi_rnd_hl            <"mpy",     int_hexagon_M2_mpy_rnd_hl_s0>;
-def HEXAGON_M2_mpy_sat_rnd_hl_s0:
-  si_MInst_sisi_sat_rnd_hl        <"mpy",     int_hexagon_M2_mpy_sat_rnd_hl_s0>;
-def HEXAGON_M2_mpy_sat_hl_s0:
-  si_MInst_sisi_sat_hl            <"mpy",     int_hexagon_M2_mpy_sat_hl_s0>;
-
-def HEXAGON_M2_mpy_lh_s0:
-  si_MInst_sisi_lh                <"mpy",     int_hexagon_M2_mpy_lh_s0>;
-def HEXAGON_M2_mpy_lh_s1:
-  si_MInst_sisi_lh_s1             <"mpy",     int_hexagon_M2_mpy_lh_s1>;
-def HEXAGON_M2_mpy_rnd_lh_s1:
-  si_MInst_sisi_rnd_lh_s1         <"mpy",     int_hexagon_M2_mpy_rnd_lh_s1>;
-def HEXAGON_M2_mpy_sat_rnd_lh_s1:
-  si_MInst_sisi_sat_rnd_lh_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_lh_s1>;
-def HEXAGON_M2_mpy_sat_lh_s1:
-  si_MInst_sisi_sat_lh_s1         <"mpy",     int_hexagon_M2_mpy_sat_lh_s1>;
-def HEXAGON_M2_mpy_rnd_lh_s0:
-  si_MInst_sisi_rnd_lh            <"mpy",     int_hexagon_M2_mpy_rnd_lh_s0>;
-def HEXAGON_M2_mpy_sat_rnd_lh_s0:
-  si_MInst_sisi_sat_rnd_lh        <"mpy",     int_hexagon_M2_mpy_sat_rnd_lh_s0>;
-def HEXAGON_M2_mpy_sat_lh_s0:
-  si_MInst_sisi_sat_lh            <"mpy",     int_hexagon_M2_mpy_sat_lh_s0>;
-
-def HEXAGON_M2_mpy_ll_s0:
-  si_MInst_sisi_ll                <"mpy",     int_hexagon_M2_mpy_ll_s0>;
-def HEXAGON_M2_mpy_ll_s1:
-  si_MInst_sisi_ll_s1             <"mpy",     int_hexagon_M2_mpy_ll_s1>;
-def HEXAGON_M2_mpy_rnd_ll_s1:
-  si_MInst_sisi_rnd_ll_s1         <"mpy",     int_hexagon_M2_mpy_rnd_ll_s1>;
-def HEXAGON_M2_mpy_sat_rnd_ll_s1:
-  si_MInst_sisi_sat_rnd_ll_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_ll_s1>;
-def HEXAGON_M2_mpy_sat_ll_s1:
-  si_MInst_sisi_sat_ll_s1         <"mpy",     int_hexagon_M2_mpy_sat_ll_s1>;
-def HEXAGON_M2_mpy_rnd_ll_s0:
-  si_MInst_sisi_rnd_ll            <"mpy",     int_hexagon_M2_mpy_rnd_ll_s0>;
-def HEXAGON_M2_mpy_sat_rnd_ll_s0:
-  si_MInst_sisi_sat_rnd_ll        <"mpy",     int_hexagon_M2_mpy_sat_rnd_ll_s0>;
-def HEXAGON_M2_mpy_sat_ll_s0:
-  si_MInst_sisi_sat_ll            <"mpy",     int_hexagon_M2_mpy_sat_ll_s0>;
-
-//Rdd=mpy(Rs.[H|L],Rt.[H|L])[[:<<0|:<<1]|[:<<0:rnd|:<<1:rnd]]
-def HEXAGON_M2_mpyd_hh_s0:
-  di_MInst_sisi_hh                <"mpy",     int_hexagon_M2_mpyd_hh_s0>;
-def HEXAGON_M2_mpyd_hh_s1:
-  di_MInst_sisi_hh_s1             <"mpy",     int_hexagon_M2_mpyd_hh_s1>;
-def HEXAGON_M2_mpyd_rnd_hh_s1:
-  di_MInst_sisi_rnd_hh_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_hh_s1>;
-def HEXAGON_M2_mpyd_rnd_hh_s0:
-  di_MInst_sisi_rnd_hh            <"mpy",     int_hexagon_M2_mpyd_rnd_hh_s0>;
-
-def HEXAGON_M2_mpyd_hl_s0:
-  di_MInst_sisi_hl                <"mpy",     int_hexagon_M2_mpyd_hl_s0>;
-def HEXAGON_M2_mpyd_hl_s1:
-  di_MInst_sisi_hl_s1             <"mpy",     int_hexagon_M2_mpyd_hl_s1>;
-def HEXAGON_M2_mpyd_rnd_hl_s1:
-  di_MInst_sisi_rnd_hl_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_hl_s1>;
-def HEXAGON_M2_mpyd_rnd_hl_s0:
-  di_MInst_sisi_rnd_hl            <"mpy",     int_hexagon_M2_mpyd_rnd_hl_s0>;
-
-def HEXAGON_M2_mpyd_lh_s0:
-  di_MInst_sisi_lh                <"mpy",     int_hexagon_M2_mpyd_lh_s0>;
-def HEXAGON_M2_mpyd_lh_s1:
-  di_MInst_sisi_lh_s1             <"mpy",     int_hexagon_M2_mpyd_lh_s1>;
-def HEXAGON_M2_mpyd_rnd_lh_s1:
-  di_MInst_sisi_rnd_lh_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_lh_s1>;
-def HEXAGON_M2_mpyd_rnd_lh_s0:
-  di_MInst_sisi_rnd_lh            <"mpy",     int_hexagon_M2_mpyd_rnd_lh_s0>;
-
-def HEXAGON_M2_mpyd_ll_s0:
-  di_MInst_sisi_ll                <"mpy",     int_hexagon_M2_mpyd_ll_s0>;
-def HEXAGON_M2_mpyd_ll_s1:
-  di_MInst_sisi_ll_s1             <"mpy",     int_hexagon_M2_mpyd_ll_s1>;
-def HEXAGON_M2_mpyd_rnd_ll_s1:
-  di_MInst_sisi_rnd_ll_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_ll_s1>;
-def HEXAGON_M2_mpyd_rnd_ll_s0:
-  di_MInst_sisi_rnd_ll            <"mpy",     int_hexagon_M2_mpyd_rnd_ll_s0>;
-
-//Rx+=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]]
-def HEXAGON_M2_mpy_acc_hh_s0:
-  si_MInst_sisisi_acc_hh            <"mpy",  int_hexagon_M2_mpy_acc_hh_s0>;
-def HEXAGON_M2_mpy_acc_hh_s1:
-  si_MInst_sisisi_acc_hh_s1         <"mpy",  int_hexagon_M2_mpy_acc_hh_s1>;
-def HEXAGON_M2_mpy_acc_sat_hh_s1:
-  si_MInst_sisisi_acc_sat_hh_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_hh_s1>;
-def HEXAGON_M2_mpy_acc_sat_hh_s0:
-  si_MInst_sisisi_acc_sat_hh        <"mpy",  int_hexagon_M2_mpy_acc_sat_hh_s0>;
-
-def HEXAGON_M2_mpy_acc_hl_s0:
-  si_MInst_sisisi_acc_hl            <"mpy",  int_hexagon_M2_mpy_acc_hl_s0>;
-def HEXAGON_M2_mpy_acc_hl_s1:
-  si_MInst_sisisi_acc_hl_s1         <"mpy",  int_hexagon_M2_mpy_acc_hl_s1>;
-def HEXAGON_M2_mpy_acc_sat_hl_s1:
-  si_MInst_sisisi_acc_sat_hl_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_hl_s1>;
-def HEXAGON_M2_mpy_acc_sat_hl_s0:
-  si_MInst_sisisi_acc_sat_hl        <"mpy",  int_hexagon_M2_mpy_acc_sat_hl_s0>;
-
-def HEXAGON_M2_mpy_acc_lh_s0:
-  si_MInst_sisisi_acc_lh            <"mpy",  int_hexagon_M2_mpy_acc_lh_s0>;
-def HEXAGON_M2_mpy_acc_lh_s1:
-  si_MInst_sisisi_acc_lh_s1         <"mpy",  int_hexagon_M2_mpy_acc_lh_s1>;
-def HEXAGON_M2_mpy_acc_sat_lh_s1:
-  si_MInst_sisisi_acc_sat_lh_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_lh_s1>;
-def HEXAGON_M2_mpy_acc_sat_lh_s0:
-  si_MInst_sisisi_acc_sat_lh        <"mpy",  int_hexagon_M2_mpy_acc_sat_lh_s0>;
-
-def HEXAGON_M2_mpy_acc_ll_s0:
-  si_MInst_sisisi_acc_ll            <"mpy",  int_hexagon_M2_mpy_acc_ll_s0>;
-def HEXAGON_M2_mpy_acc_ll_s1:
-  si_MInst_sisisi_acc_ll_s1         <"mpy",  int_hexagon_M2_mpy_acc_ll_s1>;
-def HEXAGON_M2_mpy_acc_sat_ll_s1:
-  si_MInst_sisisi_acc_sat_ll_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_ll_s1>;
-def HEXAGON_M2_mpy_acc_sat_ll_s0:
-  si_MInst_sisisi_acc_sat_ll        <"mpy",  int_hexagon_M2_mpy_acc_sat_ll_s0>;
-
-//Rx-=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]]
-def HEXAGON_M2_mpy_nac_hh_s0:
-  si_MInst_sisisi_nac_hh            <"mpy",  int_hexagon_M2_mpy_nac_hh_s0>;
-def HEXAGON_M2_mpy_nac_hh_s1:
-  si_MInst_sisisi_nac_hh_s1         <"mpy",  int_hexagon_M2_mpy_nac_hh_s1>;
-def HEXAGON_M2_mpy_nac_sat_hh_s1:
-  si_MInst_sisisi_nac_sat_hh_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_hh_s1>;
-def HEXAGON_M2_mpy_nac_sat_hh_s0:
-  si_MInst_sisisi_nac_sat_hh        <"mpy",  int_hexagon_M2_mpy_nac_sat_hh_s0>;
-
-def HEXAGON_M2_mpy_nac_hl_s0:
-  si_MInst_sisisi_nac_hl            <"mpy",  int_hexagon_M2_mpy_nac_hl_s0>;
-def HEXAGON_M2_mpy_nac_hl_s1:
-  si_MInst_sisisi_nac_hl_s1         <"mpy",  int_hexagon_M2_mpy_nac_hl_s1>;
-def HEXAGON_M2_mpy_nac_sat_hl_s1:
-  si_MInst_sisisi_nac_sat_hl_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_hl_s1>;
-def HEXAGON_M2_mpy_nac_sat_hl_s0:
-  si_MInst_sisisi_nac_sat_hl        <"mpy",  int_hexagon_M2_mpy_nac_sat_hl_s0>;
-
-def HEXAGON_M2_mpy_nac_lh_s0:
-  si_MInst_sisisi_nac_lh            <"mpy",  int_hexagon_M2_mpy_nac_lh_s0>;
-def HEXAGON_M2_mpy_nac_lh_s1:
-  si_MInst_sisisi_nac_lh_s1         <"mpy",  int_hexagon_M2_mpy_nac_lh_s1>;
-def HEXAGON_M2_mpy_nac_sat_lh_s1:
-  si_MInst_sisisi_nac_sat_lh_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_lh_s1>;
-def HEXAGON_M2_mpy_nac_sat_lh_s0:
-  si_MInst_sisisi_nac_sat_lh        <"mpy",  int_hexagon_M2_mpy_nac_sat_lh_s0>;
-
-def HEXAGON_M2_mpy_nac_ll_s0:
-  si_MInst_sisisi_nac_ll            <"mpy",  int_hexagon_M2_mpy_nac_ll_s0>;
-def HEXAGON_M2_mpy_nac_ll_s1:
-  si_MInst_sisisi_nac_ll_s1         <"mpy",  int_hexagon_M2_mpy_nac_ll_s1>;
-def HEXAGON_M2_mpy_nac_sat_ll_s1:
-  si_MInst_sisisi_nac_sat_ll_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_ll_s1>;
-def HEXAGON_M2_mpy_nac_sat_ll_s0:
-  si_MInst_sisisi_nac_sat_ll        <"mpy",  int_hexagon_M2_mpy_nac_sat_ll_s0>;
-
-//Rx+=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]
-def HEXAGON_M2_mpyd_acc_hh_s0:
-  di_MInst_disisi_acc_hh          <"mpy",    int_hexagon_M2_mpyd_acc_hh_s0>;
-def HEXAGON_M2_mpyd_acc_hh_s1:
-  di_MInst_disisi_acc_hh_s1       <"mpy",    int_hexagon_M2_mpyd_acc_hh_s1>;
-
-def HEXAGON_M2_mpyd_acc_hl_s0:
-  di_MInst_disisi_acc_hl          <"mpy",    int_hexagon_M2_mpyd_acc_hl_s0>;
-def HEXAGON_M2_mpyd_acc_hl_s1:
-  di_MInst_disisi_acc_hl_s1       <"mpy",    int_hexagon_M2_mpyd_acc_hl_s1>;
-
-def HEXAGON_M2_mpyd_acc_lh_s0:
-  di_MInst_disisi_acc_lh          <"mpy",    int_hexagon_M2_mpyd_acc_lh_s0>;
-def HEXAGON_M2_mpyd_acc_lh_s1:
-  di_MInst_disisi_acc_lh_s1       <"mpy",    int_hexagon_M2_mpyd_acc_lh_s1>;
-
-def HEXAGON_M2_mpyd_acc_ll_s0:
-  di_MInst_disisi_acc_ll          <"mpy",    int_hexagon_M2_mpyd_acc_ll_s0>;
-def HEXAGON_M2_mpyd_acc_ll_s1:
-  di_MInst_disisi_acc_ll_s1       <"mpy",    int_hexagon_M2_mpyd_acc_ll_s1>;
-
-//Rx-=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]
-def HEXAGON_M2_mpyd_nac_hh_s0:
-  di_MInst_disisi_nac_hh          <"mpy",    int_hexagon_M2_mpyd_nac_hh_s0>;
-def HEXAGON_M2_mpyd_nac_hh_s1:
-  di_MInst_disisi_nac_hh_s1       <"mpy",    int_hexagon_M2_mpyd_nac_hh_s1>;
-
-def HEXAGON_M2_mpyd_nac_hl_s0:
-  di_MInst_disisi_nac_hl          <"mpy",    int_hexagon_M2_mpyd_nac_hl_s0>;
-def HEXAGON_M2_mpyd_nac_hl_s1:
-  di_MInst_disisi_nac_hl_s1       <"mpy",    int_hexagon_M2_mpyd_nac_hl_s1>;
-
-def HEXAGON_M2_mpyd_nac_lh_s0:
-  di_MInst_disisi_nac_lh          <"mpy",    int_hexagon_M2_mpyd_nac_lh_s0>;
-def HEXAGON_M2_mpyd_nac_lh_s1:
-  di_MInst_disisi_nac_lh_s1       <"mpy",    int_hexagon_M2_mpyd_nac_lh_s1>;
-
-def HEXAGON_M2_mpyd_nac_ll_s0:
-  di_MInst_disisi_nac_ll          <"mpy",    int_hexagon_M2_mpyd_nac_ll_s0>;
-def HEXAGON_M2_mpyd_nac_ll_s1:
-  di_MInst_disisi_nac_ll_s1       <"mpy",    int_hexagon_M2_mpyd_nac_ll_s1>;
-
-// MTYPE / MPYS / Scalar 16x16 multiply unsigned.
-//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def HEXAGON_M2_mpyu_hh_s0:
-  si_MInst_sisi_hh                <"mpyu",    int_hexagon_M2_mpyu_hh_s0>;
-def HEXAGON_M2_mpyu_hh_s1:
-  si_MInst_sisi_hh_s1             <"mpyu",    int_hexagon_M2_mpyu_hh_s1>;
-def HEXAGON_M2_mpyu_hl_s0:
-  si_MInst_sisi_hl                <"mpyu",    int_hexagon_M2_mpyu_hl_s0>;
-def HEXAGON_M2_mpyu_hl_s1:
-  si_MInst_sisi_hl_s1             <"mpyu",    int_hexagon_M2_mpyu_hl_s1>;
-def HEXAGON_M2_mpyu_lh_s0:
-  si_MInst_sisi_lh                <"mpyu",    int_hexagon_M2_mpyu_lh_s0>;
-def HEXAGON_M2_mpyu_lh_s1:
-  si_MInst_sisi_lh_s1             <"mpyu",    int_hexagon_M2_mpyu_lh_s1>;
-def HEXAGON_M2_mpyu_ll_s0:
-  si_MInst_sisi_ll                <"mpyu",    int_hexagon_M2_mpyu_ll_s0>;
-def HEXAGON_M2_mpyu_ll_s1:
-  si_MInst_sisi_ll_s1             <"mpyu",    int_hexagon_M2_mpyu_ll_s1>;
-
-//Rdd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def HEXAGON_M2_mpyud_hh_s0:
-  di_MInst_sisi_hh                <"mpyu",    int_hexagon_M2_mpyud_hh_s0>;
-def HEXAGON_M2_mpyud_hh_s1:
-  di_MInst_sisi_hh_s1             <"mpyu",    int_hexagon_M2_mpyud_hh_s1>;
-def HEXAGON_M2_mpyud_hl_s0:
-  di_MInst_sisi_hl                <"mpyu",    int_hexagon_M2_mpyud_hl_s0>;
-def HEXAGON_M2_mpyud_hl_s1:
-  di_MInst_sisi_hl_s1             <"mpyu",    int_hexagon_M2_mpyud_hl_s1>;
-def HEXAGON_M2_mpyud_lh_s0:
-  di_MInst_sisi_lh                <"mpyu",    int_hexagon_M2_mpyud_lh_s0>;
-def HEXAGON_M2_mpyud_lh_s1:
-  di_MInst_sisi_lh_s1             <"mpyu",    int_hexagon_M2_mpyud_lh_s1>;
-def HEXAGON_M2_mpyud_ll_s0:
-  di_MInst_sisi_ll                <"mpyu",    int_hexagon_M2_mpyud_ll_s0>;
-def HEXAGON_M2_mpyud_ll_s1:
-  di_MInst_sisi_ll_s1             <"mpyu",    int_hexagon_M2_mpyud_ll_s1>;
-
-//Rd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def HEXAGON_M2_mpyu_acc_hh_s0:
-  si_MInst_sisisi_acc_hh            <"mpyu",    int_hexagon_M2_mpyu_acc_hh_s0>;
-def HEXAGON_M2_mpyu_acc_hh_s1:
-  si_MInst_sisisi_acc_hh_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_hh_s1>;
-def HEXAGON_M2_mpyu_acc_hl_s0:
-  si_MInst_sisisi_acc_hl            <"mpyu",    int_hexagon_M2_mpyu_acc_hl_s0>;
-def HEXAGON_M2_mpyu_acc_hl_s1:
-  si_MInst_sisisi_acc_hl_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_hl_s1>;
-def HEXAGON_M2_mpyu_acc_lh_s0:
-  si_MInst_sisisi_acc_lh            <"mpyu",    int_hexagon_M2_mpyu_acc_lh_s0>;
-def HEXAGON_M2_mpyu_acc_lh_s1:
-  si_MInst_sisisi_acc_lh_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_lh_s1>;
-def HEXAGON_M2_mpyu_acc_ll_s0:
-  si_MInst_sisisi_acc_ll            <"mpyu",    int_hexagon_M2_mpyu_acc_ll_s0>;
-def HEXAGON_M2_mpyu_acc_ll_s1:
-  si_MInst_sisisi_acc_ll_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_ll_s1>;
-
-//Rd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def HEXAGON_M2_mpyu_nac_hh_s0:
-  si_MInst_sisisi_nac_hh            <"mpyu",    int_hexagon_M2_mpyu_nac_hh_s0>;
-def HEXAGON_M2_mpyu_nac_hh_s1:
-  si_MInst_sisisi_nac_hh_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_hh_s1>;
-def HEXAGON_M2_mpyu_nac_hl_s0:
-  si_MInst_sisisi_nac_hl            <"mpyu",    int_hexagon_M2_mpyu_nac_hl_s0>;
-def HEXAGON_M2_mpyu_nac_hl_s1:
-  si_MInst_sisisi_nac_hl_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_hl_s1>;
-def HEXAGON_M2_mpyu_nac_lh_s0:
-  si_MInst_sisisi_nac_lh            <"mpyu",    int_hexagon_M2_mpyu_nac_lh_s0>;
-def HEXAGON_M2_mpyu_nac_lh_s1:
-  si_MInst_sisisi_nac_lh_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_lh_s1>;
-def HEXAGON_M2_mpyu_nac_ll_s0:
-  si_MInst_sisisi_nac_ll            <"mpyu",    int_hexagon_M2_mpyu_nac_ll_s0>;
-def HEXAGON_M2_mpyu_nac_ll_s1:
-  si_MInst_sisisi_nac_ll_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_ll_s1>;
-
-//Rdd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def HEXAGON_M2_mpyud_acc_hh_s0:
-  di_MInst_disisi_acc_hh            <"mpyu", int_hexagon_M2_mpyud_acc_hh_s0>;
-def HEXAGON_M2_mpyud_acc_hh_s1:
-  di_MInst_disisi_acc_hh_s1         <"mpyu", int_hexagon_M2_mpyud_acc_hh_s1>;
-def HEXAGON_M2_mpyud_acc_hl_s0:
-  di_MInst_disisi_acc_hl            <"mpyu", int_hexagon_M2_mpyud_acc_hl_s0>;
-def HEXAGON_M2_mpyud_acc_hl_s1:
-  di_MInst_disisi_acc_hl_s1         <"mpyu", int_hexagon_M2_mpyud_acc_hl_s1>;
-def HEXAGON_M2_mpyud_acc_lh_s0:
-  di_MInst_disisi_acc_lh            <"mpyu", int_hexagon_M2_mpyud_acc_lh_s0>;
-def HEXAGON_M2_mpyud_acc_lh_s1:
-  di_MInst_disisi_acc_lh_s1         <"mpyu", int_hexagon_M2_mpyud_acc_lh_s1>;
-def HEXAGON_M2_mpyud_acc_ll_s0:
-  di_MInst_disisi_acc_ll            <"mpyu", int_hexagon_M2_mpyud_acc_ll_s0>;
-def HEXAGON_M2_mpyud_acc_ll_s1:
-  di_MInst_disisi_acc_ll_s1         <"mpyu", int_hexagon_M2_mpyud_acc_ll_s1>;
-
-//Rdd-=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
-def HEXAGON_M2_mpyud_nac_hh_s0:
-  di_MInst_disisi_nac_hh            <"mpyu", int_hexagon_M2_mpyud_nac_hh_s0>;
-def HEXAGON_M2_mpyud_nac_hh_s1:
-  di_MInst_disisi_nac_hh_s1         <"mpyu", int_hexagon_M2_mpyud_nac_hh_s1>;
-def HEXAGON_M2_mpyud_nac_hl_s0:
-  di_MInst_disisi_nac_hl            <"mpyu", int_hexagon_M2_mpyud_nac_hl_s0>;
-def HEXAGON_M2_mpyud_nac_hl_s1:
-  di_MInst_disisi_nac_hl_s1         <"mpyu", int_hexagon_M2_mpyud_nac_hl_s1>;
-def HEXAGON_M2_mpyud_nac_lh_s0:
-  di_MInst_disisi_nac_lh            <"mpyu", int_hexagon_M2_mpyud_nac_lh_s0>;
-def HEXAGON_M2_mpyud_nac_lh_s1:
-  di_MInst_disisi_nac_lh_s1         <"mpyu", int_hexagon_M2_mpyud_nac_lh_s1>;
-def HEXAGON_M2_mpyud_nac_ll_s0:
-  di_MInst_disisi_nac_ll            <"mpyu", int_hexagon_M2_mpyud_nac_ll_s0>;
-def HEXAGON_M2_mpyud_nac_ll_s1:
-  di_MInst_disisi_nac_ll_s1         <"mpyu", int_hexagon_M2_mpyud_nac_ll_s1>;
-
+// ALU64 / VB / Vector mux.
+def : Pat<(int_hexagon_C2_vmux PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
+          (C2_vmux PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+
+// MPY - Multiply and use full result
+// Rdd = mpy[u](Rs, Rt)
+def : T_RR_pat <M2_dpmpyss_s0, int_hexagon_M2_dpmpyss_s0>;
+def : T_RR_pat <M2_dpmpyuu_s0, int_hexagon_M2_dpmpyuu_s0>;
+
+// Complex multiply real or imaginary
+def : T_RR_pat <M2_cmpyi_s0,   int_hexagon_M2_cmpyi_s0>;
+def : T_RR_pat <M2_cmpyr_s0,   int_hexagon_M2_cmpyr_s0>;
+
+// Complex multiply
+def : T_RR_pat <M2_cmpys_s0,   int_hexagon_M2_cmpys_s0>;
+def : T_RR_pat <M2_cmpysc_s0,  int_hexagon_M2_cmpysc_s0>;
+def : T_RR_pat <M2_cmpys_s1,   int_hexagon_M2_cmpys_s1>;
+def : T_RR_pat <M2_cmpysc_s1,  int_hexagon_M2_cmpysc_s1>;
+
+// Vector multiply halfwords
+// Rdd=vmpyh(Rs,Rt)[:<<1]:sat
+def : T_RR_pat <M2_vmpy2s_s0,  int_hexagon_M2_vmpy2s_s0>;
+def : T_RR_pat <M2_vmpy2s_s1,  int_hexagon_M2_vmpy2s_s1>;
+
+// Rxx[+-]= mpy[u](Rs,Rt)
+def : T_PRR_pat <M2_dpmpyss_acc_s0, int_hexagon_M2_dpmpyss_acc_s0>;
+def : T_PRR_pat <M2_dpmpyss_nac_s0, int_hexagon_M2_dpmpyss_nac_s0>;
+def : T_PRR_pat <M2_dpmpyuu_acc_s0, int_hexagon_M2_dpmpyuu_acc_s0>;
+def : T_PRR_pat <M2_dpmpyuu_nac_s0, int_hexagon_M2_dpmpyuu_nac_s0>;
+
+// Rxx[-+]=cmpy(Rs,Rt)[:<<1]:sat
+def : T_PRR_pat <M2_cmacs_s0, int_hexagon_M2_cmacs_s0>;
+def : T_PRR_pat <M2_cnacs_s0, int_hexagon_M2_cnacs_s0>;
+def : T_PRR_pat <M2_cmacs_s1, int_hexagon_M2_cmacs_s1>;
+def : T_PRR_pat <M2_cnacs_s1, int_hexagon_M2_cnacs_s1>;
+
+// Rxx[-+]=cmpy(Rs,Rt*)[:<<1]:sat
+def : T_PRR_pat <M2_cmacsc_s0, int_hexagon_M2_cmacsc_s0>;
+def : T_PRR_pat <M2_cnacsc_s0, int_hexagon_M2_cnacsc_s0>;
+def : T_PRR_pat <M2_cmacsc_s1, int_hexagon_M2_cmacsc_s1>;
+def : T_PRR_pat <M2_cnacsc_s1, int_hexagon_M2_cnacsc_s1>;
+
+// Rxx+=cmpy[ir](Rs,Rt)
+def : T_PRR_pat <M2_cmaci_s0, int_hexagon_M2_cmaci_s0>;
+def : T_PRR_pat <M2_cmacr_s0, int_hexagon_M2_cmacr_s0>;
+
+// Rxx+=vmpyh(Rs,Rt)[:<<1][:sat]
+def : T_PRR_pat <M2_vmac2, int_hexagon_M2_vmac2>;
+def : T_PRR_pat <M2_vmac2s_s0, int_hexagon_M2_vmac2s_s0>;
+def : T_PRR_pat <M2_vmac2s_s1, int_hexagon_M2_vmac2s_s1>;
 
 /********************************************************************
-*            MTYPE/VB                                               *
+*            CR                                                     *
 *********************************************************************/
+class qi_CRInst_qi_pat<InstHexagon Inst, Intrinsic IntID> :
+  Pat<(i32 (IntID IntRegs:$Rs)),
+      (i32 (C2_tfrpr (Inst (C2_tfrrp IntRegs:$Rs))))>;
 
-// MTYPE / VB / Vector reduce add unsigned bytes.
-def HEXAGON_A2_vraddub:
-  di_MInst_didi                   <"vraddub", int_hexagon_A2_vraddub>;
-def HEXAGON_A2_vraddub_acc:
-  di_MInst_dididi_acc             <"vraddub", int_hexagon_A2_vraddub_acc>;
+class qi_CRInst_qiqi_pat<InstHexagon Inst, Intrinsic IntID> :
+  Pat<(i32 (IntID IntRegs:$Rs, IntRegs:$Rt)),
+      (i32 (C2_tfrpr (Inst (C2_tfrrp IntRegs:$Rs), (C2_tfrrp IntRegs:$Rt))))>;
 
-// MTYPE / VB / Vector sum of absolute differences unsigned bytes.
-def HEXAGON_A2_vrsadub:
-  di_MInst_didi                   <"vrsadub", int_hexagon_A2_vrsadub>;
-def HEXAGON_A2_vrsadub_acc:
-  di_MInst_dididi_acc             <"vrsadub", int_hexagon_A2_vrsadub_acc>;
+def: qi_CRInst_qi_pat<C2_not,     int_hexagon_C2_not>;
+def: qi_CRInst_qi_pat<C2_all8,    int_hexagon_C2_all8>;
+def: qi_CRInst_qi_pat<C2_any8,    int_hexagon_C2_any8>;
 
-/********************************************************************
-*            MTYPE/VH                                               *
-*********************************************************************/
+def: qi_CRInst_qiqi_pat<C2_and,   int_hexagon_C2_and>;
+def: qi_CRInst_qiqi_pat<C2_andn,  int_hexagon_C2_andn>;
+def: qi_CRInst_qiqi_pat<C2_or,    int_hexagon_C2_or>;
+def: qi_CRInst_qiqi_pat<C2_orn,   int_hexagon_C2_orn>;
+def: qi_CRInst_qiqi_pat<C2_xor,   int_hexagon_C2_xor>;
 
-// MTYPE / VH / Vector dual multiply.
-def HEXAGON_M2_vdmpys_s1:
-  di_MInst_didi_s1_sat            <"vdmpy",   int_hexagon_M2_vdmpys_s1>;
-def HEXAGON_M2_vdmpys_s0:
-  di_MInst_didi_sat               <"vdmpy",   int_hexagon_M2_vdmpys_s0>;
-def HEXAGON_M2_vdmacs_s1:
-  di_MInst_dididi_acc_s1_sat      <"vdmpy",   int_hexagon_M2_vdmacs_s1>;
-def HEXAGON_M2_vdmacs_s0:
-  di_MInst_dididi_acc_sat         <"vdmpy",   int_hexagon_M2_vdmacs_s0>;
-
-// MTYPE / VH / Vector dual multiply with round and pack.
-def HEXAGON_M2_vdmpyrs_s0:
-  si_MInst_didi_rnd_sat           <"vdmpy",   int_hexagon_M2_vdmpyrs_s0>;
-def HEXAGON_M2_vdmpyrs_s1:
-  si_MInst_didi_s1_rnd_sat        <"vdmpy",   int_hexagon_M2_vdmpyrs_s1>;
-
-// MTYPE / VH / Vector multiply even halfwords.
-def HEXAGON_M2_vmpy2es_s1:
-  di_MInst_didi_s1_sat            <"vmpyeh",  int_hexagon_M2_vmpy2es_s1>;
-def HEXAGON_M2_vmpy2es_s0:
-  di_MInst_didi_sat               <"vmpyeh",  int_hexagon_M2_vmpy2es_s0>;
-def HEXAGON_M2_vmac2es:
-  di_MInst_dididi_acc             <"vmpyeh",  int_hexagon_M2_vmac2es>;
-def HEXAGON_M2_vmac2es_s1:
-  di_MInst_dididi_acc_s1_sat      <"vmpyeh",  int_hexagon_M2_vmac2es_s1>;
-def HEXAGON_M2_vmac2es_s0:
-  di_MInst_dididi_acc_sat         <"vmpyeh",  int_hexagon_M2_vmac2es_s0>;
-
-// MTYPE / VH / Vector multiply halfwords.
-def HEXAGON_M2_vmpy2s_s0:
-  di_MInst_sisi_sat               <"vmpyh",   int_hexagon_M2_vmpy2s_s0>;
-def HEXAGON_M2_vmpy2s_s1:
-  di_MInst_sisi_s1_sat            <"vmpyh",   int_hexagon_M2_vmpy2s_s1>;
-def HEXAGON_M2_vmac2:
-  di_MInst_disisi_acc             <"vmpyh",   int_hexagon_M2_vmac2>;
-def HEXAGON_M2_vmac2s_s0:
-  di_MInst_disisi_acc_sat         <"vmpyh",   int_hexagon_M2_vmac2s_s0>;
-def HEXAGON_M2_vmac2s_s1:
-  di_MInst_disisi_acc_s1_sat      <"vmpyh",   int_hexagon_M2_vmac2s_s1>;
-
-// MTYPE / VH / Vector multiply halfwords with round and pack.
-def HEXAGON_M2_vmpy2s_s0pack:
-  si_MInst_sisi_rnd_sat           <"vmpyh",   int_hexagon_M2_vmpy2s_s0pack>;
-def HEXAGON_M2_vmpy2s_s1pack:
-  si_MInst_sisi_s1_rnd_sat        <"vmpyh",   int_hexagon_M2_vmpy2s_s1pack>;
-
-// MTYPE / VH / Vector reduce multiply halfwords.
-// Rxx32+=vrmpyh(Rss32,Rtt32)
-def HEXAGON_M2_vrmpy_s0:
-  di_MInst_didi                   <"vrmpyh",  int_hexagon_M2_vrmpy_s0>;
-def HEXAGON_M2_vrmac_s0:
-  di_MInst_dididi_acc             <"vrmpyh",  int_hexagon_M2_vrmac_s0>;
+// Multiply 32x32 and use lower result
+def : T_RRI_pat <M2_macsip, int_hexagon_M2_macsip>;
+def : T_RRI_pat <M2_macsin, int_hexagon_M2_macsin>;
+def : T_RRR_pat <M2_maci, int_hexagon_M2_maci>;
 
+// Subtract and accumulate
+def : T_RRR_pat <M2_subacc, int_hexagon_M2_subacc>;
 
-/********************************************************************
-*            STYPE/ALU                                              *
-*********************************************************************/
+// Add and accumulate
+def : T_RRR_pat <M2_acci,   int_hexagon_M2_acci>;
+def : T_RRR_pat <M2_nacci,  int_hexagon_M2_nacci>;
+def : T_RRI_pat <M2_accii,  int_hexagon_M2_accii>;
+def : T_RRI_pat <M2_naccii, int_hexagon_M2_naccii>;
 
-// STYPE / ALU / Absolute value.
-def HEXAGON_A2_abs:
-  si_SInst_si                     <"abs",     int_hexagon_A2_abs>;
-def HEXAGON_A2_absp:
-  di_SInst_di                     <"abs",     int_hexagon_A2_absp>;
-def HEXAGON_A2_abssat:
-  si_SInst_si_sat                 <"abs",     int_hexagon_A2_abssat>;
+// XOR and XOR with destination
+def : T_RRR_pat <M2_xor_xacc, int_hexagon_M2_xor_xacc>;
 
-// STYPE / ALU / Negate.
-def HEXAGON_A2_negp:
-  di_SInst_di                     <"neg",     int_hexagon_A2_negp>;
-def HEXAGON_A2_negsat:
-  si_SInst_si_sat                 <"neg",     int_hexagon_A2_negsat>;
+class MType_R32_pat <Intrinsic IntID, InstHexagon OutputInst> :
+      Pat <(IntID IntRegs:$src1, IntRegs:$src2),
+           (OutputInst IntRegs:$src1, IntRegs:$src2)>;
 
-// STYPE / ALU / Logical Not.
-def HEXAGON_A2_notp:
-  di_SInst_di                     <"not",     int_hexagon_A2_notp>;
+// Vector dual multiply with round and pack
 
-// STYPE / ALU / Sign extend word to doubleword.
-def HEXAGON_A2_sxtw:
-  di_SInst_si                     <"sxtw",     int_hexagon_A2_sxtw>;
+def : Pat <(int_hexagon_M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+           (M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2)>;
 
+def : Pat <(int_hexagon_M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+           (M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2)>;
+
+// Vector multiply halfwords with round and pack
+
+def : MType_R32_pat <int_hexagon_M2_vmpy2s_s0pack, M2_vmpy2s_s0pack>;
+def : MType_R32_pat <int_hexagon_M2_vmpy2s_s1pack, M2_vmpy2s_s1pack>;
+
+// Multiply and use lower result
+def : MType_R32_pat <int_hexagon_M2_mpyi, M2_mpyi>;
+def : T_RI_pat<M2_mpysmi, int_hexagon_M2_mpysmi>;
+
+// Assembler mapped from Rd32=mpyui(Rs32,Rt32) to Rd32=mpyi(Rs32,Rt32)
+def : MType_R32_pat <int_hexagon_M2_mpyui, M2_mpyi>;
+
+// Multiply and use upper result
+def : MType_R32_pat <int_hexagon_M2_mpy_up, M2_mpy_up>;
+def : MType_R32_pat <int_hexagon_M2_mpyu_up, M2_mpyu_up>;
+def : MType_R32_pat <int_hexagon_M2_hmmpyh_rs1, M2_hmmpyh_rs1>;
+def : MType_R32_pat <int_hexagon_M2_hmmpyl_rs1, M2_hmmpyl_rs1>;
+def : MType_R32_pat <int_hexagon_M2_dpmpyss_rnd_s0, M2_dpmpyss_rnd_s0>;
+
+// Complex multiply with round and pack
+// Rxx32+=cmpy(Rs32,[*]Rt32:<<1]:rnd:sat
+def : MType_R32_pat <int_hexagon_M2_cmpyrs_s0, M2_cmpyrs_s0>;
+def : MType_R32_pat <int_hexagon_M2_cmpyrs_s1, M2_cmpyrs_s1>;
+def : MType_R32_pat <int_hexagon_M2_cmpyrsc_s0, M2_cmpyrsc_s0>;
+def : MType_R32_pat <int_hexagon_M2_cmpyrsc_s1, M2_cmpyrsc_s1>;
 
 /********************************************************************
-*            STYPE/BIT                                              *
+*            STYPE/ALU                                              *
 *********************************************************************/
+def : T_P_pat <A2_absp, int_hexagon_A2_absp>;
+def : T_P_pat <A2_negp, int_hexagon_A2_negp>;
+def : T_P_pat <A2_notp, int_hexagon_A2_notp>;
 
-// STYPE / BIT / Count leading.
-def HEXAGON_S2_cl0:
-  si_SInst_si                     <"cl0",     int_hexagon_S2_cl0>;
-def HEXAGON_S2_cl0p:
-  si_SInst_di                     <"cl0",     int_hexagon_S2_cl0p>;
-def HEXAGON_S2_cl1:
-  si_SInst_si                     <"cl1",     int_hexagon_S2_cl1>;
-def HEXAGON_S2_cl1p:
-  si_SInst_di                     <"cl1",     int_hexagon_S2_cl1p>;
-def HEXAGON_S2_clb:
-  si_SInst_si                     <"clb",     int_hexagon_S2_clb>;
-def HEXAGON_S2_clbp:
-  si_SInst_di                     <"clb",     int_hexagon_S2_clbp>;
-def HEXAGON_S2_clbnorm:
-  si_SInst_si                     <"normamt", int_hexagon_S2_clbnorm>;
-
-// STYPE / BIT / Count trailing.
-def HEXAGON_S2_ct0:
-  si_SInst_si                     <"ct0",     int_hexagon_S2_ct0>;
-def HEXAGON_S2_ct1:
-  si_SInst_si                     <"ct1",     int_hexagon_S2_ct1>;
-
-// STYPE / BIT / Compare bit mask.
-def Hexagon_C2_bitsclr:
-  qi_SInst_sisi                   <"bitsclr", int_hexagon_C2_bitsclr>;
-def Hexagon_C2_bitsclri:
-  qi_SInst_siu6                   <"bitsclr", int_hexagon_C2_bitsclri>;
-def Hexagon_C2_bitsset:
-  qi_SInst_sisi                   <"bitsset", int_hexagon_C2_bitsset>;
-
-// STYPE / BIT / Extract unsigned.
-// Rd[d][32/64]=extractu(Rs[s],Rt[t],[imm])
-def HEXAGON_S2_extractu:
-  si_SInst_siu5u5                 <"extractu",int_hexagon_S2_extractu>;
-def HEXAGON_S2_extractu_rp:
-  si_SInst_sidi                   <"extractu",int_hexagon_S2_extractu_rp>;
-def HEXAGON_S2_extractup:
-  di_SInst_diu6u6                 <"extractu",int_hexagon_S2_extractup>;
-def HEXAGON_S2_extractup_rp:
-  di_SInst_didi                   <"extractu",int_hexagon_S2_extractup_rp>;
-
-// STYPE / BIT / Insert bitfield.
-def Hexagon_S2_insert:
-  si_SInst_sisiu5u5               <"insert",  int_hexagon_S2_insert>;
-def Hexagon_S2_insert_rp:
-  si_SInst_sisidi                 <"insert",  int_hexagon_S2_insert_rp>;
-def Hexagon_S2_insertp:
-  di_SInst_didiu6u6               <"insert",  int_hexagon_S2_insertp>;
-def Hexagon_S2_insertp_rp:
-  di_SInst_dididi                 <"insert",  int_hexagon_S2_insertp_rp>;
-
-// STYPE / BIT / Innterleave/deinterleave.
-def Hexagon_S2_interleave:
-  di_SInst_di                     <"interleave", int_hexagon_S2_interleave>;
-def Hexagon_S2_deinterleave:
-  di_SInst_di                     <"deinterleave", int_hexagon_S2_deinterleave>;
-
-// STYPE / BIT / Linear feedback-shift Iteration.
-def Hexagon_S2_lfsp:
-  di_SInst_didi                   <"lfs",     int_hexagon_S2_lfsp>;
-
-// STYPE / BIT / Bit reverse.
-def Hexagon_S2_brev:
-  si_SInst_si                     <"brev",    int_hexagon_S2_brev>;
-
-// STYPE / BIT / Set/Clear/Toggle Bit.
-def HEXAGON_S2_setbit_i:
-  si_SInst_siu5                   <"setbit",  int_hexagon_S2_setbit_i>;
-def HEXAGON_S2_togglebit_i:
-  si_SInst_siu5                   <"togglebit", int_hexagon_S2_togglebit_i>;
-def HEXAGON_S2_clrbit_i:
-  si_SInst_siu5                   <"clrbit",  int_hexagon_S2_clrbit_i>;
-def HEXAGON_S2_setbit_r:
-  si_SInst_sisi                   <"setbit",  int_hexagon_S2_setbit_r>;
-def HEXAGON_S2_togglebit_r:
-  si_SInst_sisi                   <"togglebit", int_hexagon_S2_togglebit_r>;
-def HEXAGON_S2_clrbit_r:
-  si_SInst_sisi                   <"clrbit",  int_hexagon_S2_clrbit_r>;
-
-// STYPE / BIT / Test Bit.
-def HEXAGON_S2_tstbit_i:
-  qi_SInst_siu5                   <"tstbit",  int_hexagon_S2_tstbit_i>;
-def HEXAGON_S2_tstbit_r:
-  qi_SInst_sisi                   <"tstbit",  int_hexagon_S2_tstbit_r>;
+/********************************************************************
+*            STYPE/BIT                                              *
+*********************************************************************/
 
+// Count leading/trailing
+def: T_R_pat<S2_cl0,     int_hexagon_S2_cl0>;
+def: T_P_pat<S2_cl0p,    int_hexagon_S2_cl0p>;
+def: T_R_pat<S2_cl1,     int_hexagon_S2_cl1>;
+def: T_P_pat<S2_cl1p,    int_hexagon_S2_cl1p>;
+def: T_R_pat<S2_clb,     int_hexagon_S2_clb>;
+def: T_P_pat<S2_clbp,    int_hexagon_S2_clbp>;
+def: T_R_pat<S2_clbnorm, int_hexagon_S2_clbnorm>;
+def: T_R_pat<S2_ct0,     int_hexagon_S2_ct0>;
+def: T_R_pat<S2_ct1,     int_hexagon_S2_ct1>;
+
+// Compare bit mask
+def: T_RR_pat<C2_bitsclr,  int_hexagon_C2_bitsclr>;
+def: T_RI_pat<C2_bitsclri, int_hexagon_C2_bitsclri>;
+def: T_RR_pat<C2_bitsset,  int_hexagon_C2_bitsset>;
+
+// Vector shuffle
+def : T_PP_pat <S2_shuffeb, int_hexagon_S2_shuffeb>;
+def : T_PP_pat <S2_shuffob, int_hexagon_S2_shuffob>;
+def : T_PP_pat <S2_shuffeh, int_hexagon_S2_shuffeh>;
+def : T_PP_pat <S2_shuffoh, int_hexagon_S2_shuffoh>;
+
+// Vector truncate
+def : T_PP_pat <S2_vtrunewh, int_hexagon_S2_vtrunewh>;
+def : T_PP_pat <S2_vtrunowh, int_hexagon_S2_vtrunowh>;
+
+// Linear feedback-shift Iteration.
+def : T_PP_pat <S2_lfsp, int_hexagon_S2_lfsp>;
+
+// Vector splice
+def : T_PPQ_pat <S2_vsplicerb, int_hexagon_S2_vsplicerb>;
+def : T_PPI_pat <S2_vspliceib, int_hexagon_S2_vspliceib>;
+
+// Shift by immediate and add
+def : T_RRI_pat<S2_addasl_rrri, int_hexagon_S2_addasl_rrri>;
+
+// Extract bitfield
+def : T_PII_pat<S2_extractup,    int_hexagon_S2_extractup>;
+def : T_RII_pat<S2_extractu,     int_hexagon_S2_extractu>;
+def : T_RP_pat <S2_extractu_rp,  int_hexagon_S2_extractu_rp>;
+def : T_PP_pat <S2_extractup_rp, int_hexagon_S2_extractup_rp>;
+
+// Insert bitfield
+def : Pat <(int_hexagon_S2_insert_rp IntRegs:$src1, IntRegs:$src2,
+                                     DoubleRegs:$src3),
+           (S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3)>;
+
+def : Pat<(i64 (int_hexagon_S2_insertp_rp (I64:$src1),
+                 (I64:$src2), (I64:$src3))),
+          (i64 (S2_insertp_rp (I64:$src1), (I64:$src2),
+                              (I64:$src3)))>;
+
+def : Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2,
+                                 u5ImmPred:$src3, u5ImmPred:$src4),
+          (S2_insert IntRegs:$src1, IntRegs:$src2,
+                     u5ImmPred:$src3, u5ImmPred:$src4)>;
+
+def : Pat<(i64 (int_hexagon_S2_insertp (I64:$src1),
+                 (I64:$src2), u6ImmPred:$src3, u6ImmPred:$src4)),
+          (i64 (S2_insertp (I64:$src1), (I64:$src2),
+                           u6ImmPred:$src3, u6ImmPred:$src4))>;
+
+
+// Innterleave/deinterleave
+def : T_P_pat <S2_interleave, int_hexagon_S2_interleave>;
+def : T_P_pat <S2_deinterleave, int_hexagon_S2_deinterleave>;
+
+// Set/Clear/Toggle Bit
+def: T_RI_pat<S2_setbit_i,    int_hexagon_S2_setbit_i>;
+def: T_RI_pat<S2_clrbit_i,    int_hexagon_S2_clrbit_i>;
+def: T_RI_pat<S2_togglebit_i, int_hexagon_S2_togglebit_i>;
+
+def: T_RR_pat<S2_setbit_r,    int_hexagon_S2_setbit_r>;
+def: T_RR_pat<S2_clrbit_r,    int_hexagon_S2_clrbit_r>;
+def: T_RR_pat<S2_togglebit_r, int_hexagon_S2_togglebit_r>;
+
+// Test Bit
+def: T_RI_pat<S2_tstbit_i,    int_hexagon_S2_tstbit_i>;
+def: T_RR_pat<S2_tstbit_r,    int_hexagon_S2_tstbit_r>;
 
 /********************************************************************
 *            STYPE/COMPLEX                                          *
 *********************************************************************/
+// Vector Complex conjugate
+def : T_P_pat <A2_vconj, int_hexagon_A2_vconj>;
 
-// STYPE / COMPLEX / Vector Complex conjugate.
-def HEXAGON_A2_vconj:
-  di_SInst_di_sat                 <"vconj",   int_hexagon_A2_vconj>;
-
-// STYPE / COMPLEX / Vector Complex rotate.
-def HEXAGON_S2_vcrotate:
-  di_SInst_disi                   <"vcrotate",int_hexagon_S2_vcrotate>;
-
+// Vector Complex rotate
+def : T_PR_pat <S2_vcrotate, int_hexagon_S2_vcrotate>;
 
 /********************************************************************
 *            STYPE/PERM                                             *
 *********************************************************************/
 
-// STYPE / PERM / Saturate.
-def HEXAGON_A2_sat:
-  si_SInst_di                     <"sat",     int_hexagon_A2_sat>;
-def HEXAGON_A2_satb:
-  si_SInst_si                     <"satb",    int_hexagon_A2_satb>;
-def HEXAGON_A2_sath:
-  si_SInst_si                     <"sath",    int_hexagon_A2_sath>;
-def HEXAGON_A2_satub:
-  si_SInst_si                     <"satub",   int_hexagon_A2_satub>;
-def HEXAGON_A2_satuh:
-  si_SInst_si                     <"satuh",   int_hexagon_A2_satuh>;
-
-// STYPE / PERM / Swizzle bytes.
-def HEXAGON_A2_swiz:
-  si_SInst_si                     <"swiz",    int_hexagon_A2_swiz>;
-
-// STYPE / PERM / Vector align.
-// Need custom lowering
-def HEXAGON_S2_valignib:
-  di_SInst_didiu3                 <"valignb", int_hexagon_S2_valignib>;
-def HEXAGON_S2_valignrb:
-  di_SInst_didiqi                 <"valignb", int_hexagon_S2_valignrb>;
-
-// STYPE / PERM / Vector round and pack.
-def HEXAGON_S2_vrndpackwh:
-  si_SInst_di                     <"vrndwh",  int_hexagon_S2_vrndpackwh>;
-def HEXAGON_S2_vrndpackwhs:
-  si_SInst_di_sat                 <"vrndwh",  int_hexagon_S2_vrndpackwhs>;
-
-// STYPE / PERM / Vector saturate and pack.
-def HEXAGON_S2_svsathb:
-  si_SInst_si                     <"vsathb",  int_hexagon_S2_svsathb>;
-def HEXAGON_S2_vsathb:
-  si_SInst_di                     <"vsathb",  int_hexagon_S2_vsathb>;
-def HEXAGON_S2_svsathub:
-  si_SInst_si                     <"vsathub", int_hexagon_S2_svsathub>;
-def HEXAGON_S2_vsathub:
-  si_SInst_di                     <"vsathub", int_hexagon_S2_vsathub>;
-def HEXAGON_S2_vsatwh:
-  si_SInst_di                     <"vsatwh",  int_hexagon_S2_vsatwh>;
-def HEXAGON_S2_vsatwuh:
-  si_SInst_di                     <"vsatwuh", int_hexagon_S2_vsatwuh>;
-
-// STYPE / PERM / Vector saturate without pack.
-def HEXAGON_S2_vsathb_nopack:
-  di_SInst_di                     <"vsathb",  int_hexagon_S2_vsathb_nopack>;
-def HEXAGON_S2_vsathub_nopack:
-  di_SInst_di                     <"vsathub", int_hexagon_S2_vsathub_nopack>;
-def HEXAGON_S2_vsatwh_nopack:
-  di_SInst_di                     <"vsatwh",  int_hexagon_S2_vsatwh_nopack>;
-def HEXAGON_S2_vsatwuh_nopack:
-  di_SInst_di                     <"vsatwuh", int_hexagon_S2_vsatwuh_nopack>;
-
-// STYPE / PERM / Vector shuffle.
-def HEXAGON_S2_shuffeb:
-  di_SInst_didi                   <"shuffeb", int_hexagon_S2_shuffeb>;
-def HEXAGON_S2_shuffeh:
-  di_SInst_didi                   <"shuffeh", int_hexagon_S2_shuffeh>;
-def HEXAGON_S2_shuffob:
-  di_SInst_didi                   <"shuffob", int_hexagon_S2_shuffob>;
-def HEXAGON_S2_shuffoh:
-  di_SInst_didi                   <"shuffoh", int_hexagon_S2_shuffoh>;
-
-// STYPE / PERM / Vector splat bytes.
-def HEXAGON_S2_vsplatrb:
-  si_SInst_si                     <"vsplatb", int_hexagon_S2_vsplatrb>;
-
-// STYPE / PERM / Vector splat halfwords.
-def HEXAGON_S2_vsplatrh:
-  di_SInst_si                     <"vsplath", int_hexagon_S2_vsplatrh>;
-
-// STYPE / PERM / Vector splice.
-def Hexagon_S2_vsplicerb:
-  di_SInst_didiqi                 <"vspliceb",int_hexagon_S2_vsplicerb>;
-def Hexagon_S2_vspliceib:
-  di_SInst_didiu3                 <"vspliceb",int_hexagon_S2_vspliceib>;
-
-// STYPE / PERM / Sign extend.
-def HEXAGON_S2_vsxtbh:
-  di_SInst_si                     <"vsxtbh",  int_hexagon_S2_vsxtbh>;
-def HEXAGON_S2_vsxthw:
-  di_SInst_si                     <"vsxthw",  int_hexagon_S2_vsxthw>;
-
-// STYPE / PERM / Truncate.
-def HEXAGON_S2_vtrunehb:
-  si_SInst_di                     <"vtrunehb",int_hexagon_S2_vtrunehb>;
-def HEXAGON_S2_vtrunohb:
-  si_SInst_di                     <"vtrunohb",int_hexagon_S2_vtrunohb>;
-def HEXAGON_S2_vtrunewh:
-  di_SInst_didi                   <"vtrunewh",int_hexagon_S2_vtrunewh>;
-def HEXAGON_S2_vtrunowh:
-  di_SInst_didi                   <"vtrunowh",int_hexagon_S2_vtrunowh>;
-
-// STYPE / PERM / Zero extend.
-def HEXAGON_S2_vzxtbh:
-  di_SInst_si                     <"vzxtbh",  int_hexagon_S2_vzxtbh>;
-def HEXAGON_S2_vzxthw:
-  di_SInst_si                     <"vzxthw",  int_hexagon_S2_vzxthw>;
-
+// Vector saturate without pack
+def : T_P_pat <S2_vsathb_nopack, int_hexagon_S2_vsathb_nopack>;
+def : T_P_pat <S2_vsathub_nopack, int_hexagon_S2_vsathub_nopack>;
+def : T_P_pat <S2_vsatwh_nopack, int_hexagon_S2_vsatwh_nopack>;
+def : T_P_pat <S2_vsatwuh_nopack, int_hexagon_S2_vsatwuh_nopack>;
 
 /********************************************************************
 *            STYPE/PRED                                             *
 *********************************************************************/
 
-// STYPE / PRED / Mask generate from predicate.
-def HEXAGON_C2_mask:
-  di_SInst_qi                     <"mask",   int_hexagon_C2_mask>;
-
-// STYPE / PRED / Predicate transfer.
-def HEXAGON_C2_tfrpr:
-  si_SInst_qi                     <"",       int_hexagon_C2_tfrpr>;
-def HEXAGON_C2_tfrrp:
-  qi_SInst_si                     <"",       int_hexagon_C2_tfrrp>;
+// Predicate transfer
+def: Pat<(i32 (int_hexagon_C2_tfrpr (I32:$Rs))),
+         (i32 (C2_tfrpr (C2_tfrrp (I32:$Rs))))>;
+def: Pat<(i32 (int_hexagon_C2_tfrrp (I32:$Rs))),
+         (i32 (C2_tfrpr (C2_tfrrp (I32:$Rs))))>;
 
-// STYPE / PRED / Viterbi pack even and odd predicate bits.
-def HEXAGON_C2_vitpack:
-  si_SInst_qiqi                   <"vitpack",int_hexagon_C2_vitpack>;
+// Mask generate from predicate
+def: Pat<(i64 (int_hexagon_C2_mask (I32:$Rs))),
+         (i64 (C2_mask (C2_tfrrp (I32:$Rs))))>;
 
+// Viterbi pack even and odd predicate bits
+def: Pat<(i32 (int_hexagon_C2_vitpack (I32:$Rs), (I32:$Rt))),
+         (i32 (C2_vitpack (C2_tfrrp (I32:$Rs)),
+                          (C2_tfrrp (I32:$Rt))))>;
 
 /********************************************************************
 *            STYPE/SHIFT                                            *
 *********************************************************************/
 
-// STYPE / SHIFT / Shift by immediate.
-def HEXAGON_S2_asl_i_r:
-  si_SInst_siu5                   <"asl",     int_hexagon_S2_asl_i_r>;
-def HEXAGON_S2_asr_i_r:
-  si_SInst_siu5                   <"asr",     int_hexagon_S2_asr_i_r>;
-def HEXAGON_S2_lsr_i_r:
-  si_SInst_siu5                   <"lsr",     int_hexagon_S2_lsr_i_r>;
-def HEXAGON_S2_asl_i_p:
-  di_SInst_diu6                   <"asl",     int_hexagon_S2_asl_i_p>;
-def HEXAGON_S2_asr_i_p:
-  di_SInst_diu6                   <"asr",     int_hexagon_S2_asr_i_p>;
-def HEXAGON_S2_lsr_i_p:
-  di_SInst_diu6                   <"lsr",     int_hexagon_S2_lsr_i_p>;
-
-// STYPE / SHIFT / Shift by immediate and accumulate.
-def HEXAGON_S2_asl_i_r_acc:
-  si_SInst_sisiu5_acc             <"asl",     int_hexagon_S2_asl_i_r_acc>;
-def HEXAGON_S2_asr_i_r_acc:
-  si_SInst_sisiu5_acc             <"asr",     int_hexagon_S2_asr_i_r_acc>;
-def HEXAGON_S2_lsr_i_r_acc:
-  si_SInst_sisiu5_acc             <"lsr",     int_hexagon_S2_lsr_i_r_acc>;
-def HEXAGON_S2_asl_i_r_nac:
-  si_SInst_sisiu5_nac             <"asl",     int_hexagon_S2_asl_i_r_nac>;
-def HEXAGON_S2_asr_i_r_nac:
-  si_SInst_sisiu5_nac             <"asr",     int_hexagon_S2_asr_i_r_nac>;
-def HEXAGON_S2_lsr_i_r_nac:
-  si_SInst_sisiu5_nac             <"lsr",     int_hexagon_S2_lsr_i_r_nac>;
-def HEXAGON_S2_asl_i_p_acc:
-  di_SInst_didiu6_acc             <"asl",     int_hexagon_S2_asl_i_p_acc>;
-def HEXAGON_S2_asr_i_p_acc:
-  di_SInst_didiu6_acc             <"asr",     int_hexagon_S2_asr_i_p_acc>;
-def HEXAGON_S2_lsr_i_p_acc:
-  di_SInst_didiu6_acc             <"lsr",     int_hexagon_S2_lsr_i_p_acc>;
-def HEXAGON_S2_asl_i_p_nac:
-  di_SInst_didiu6_nac             <"asl",     int_hexagon_S2_asl_i_p_nac>;
-def HEXAGON_S2_asr_i_p_nac:
-  di_SInst_didiu6_nac             <"asr",     int_hexagon_S2_asr_i_p_nac>;
-def HEXAGON_S2_lsr_i_p_nac:
-  di_SInst_didiu6_nac             <"lsr",     int_hexagon_S2_lsr_i_p_nac>;
-
-// STYPE / SHIFT / Shift by immediate and add.
-def HEXAGON_S2_addasl_rrri:
-  si_SInst_sisiu3                 <"addasl",  int_hexagon_S2_addasl_rrri>;
-
-// STYPE / SHIFT / Shift by immediate and logical.
-def HEXAGON_S2_asl_i_r_and:
-  si_SInst_sisiu5_and             <"asl",     int_hexagon_S2_asl_i_r_and>;
-def HEXAGON_S2_asr_i_r_and:
-  si_SInst_sisiu5_and             <"asr",     int_hexagon_S2_asr_i_r_and>;
-def HEXAGON_S2_lsr_i_r_and:
-  si_SInst_sisiu5_and             <"lsr",     int_hexagon_S2_lsr_i_r_and>;
-
-def HEXAGON_S2_asl_i_r_xacc:
-  si_SInst_sisiu5_xor             <"asl",     int_hexagon_S2_asl_i_r_xacc>;
-def HEXAGON_S2_lsr_i_r_xacc:
-  si_SInst_sisiu5_xor             <"lsr",     int_hexagon_S2_lsr_i_r_xacc>;
-
-def HEXAGON_S2_asl_i_r_or:
-  si_SInst_sisiu5_or              <"asl",     int_hexagon_S2_asl_i_r_or>;
-def HEXAGON_S2_asr_i_r_or:
-  si_SInst_sisiu5_or              <"asr",     int_hexagon_S2_asr_i_r_or>;
-def HEXAGON_S2_lsr_i_r_or:
-  si_SInst_sisiu5_or              <"lsr",     int_hexagon_S2_lsr_i_r_or>;
-
-def HEXAGON_S2_asl_i_p_and:
-  di_SInst_didiu6_and             <"asl",     int_hexagon_S2_asl_i_p_and>;
-def HEXAGON_S2_asr_i_p_and:
-  di_SInst_didiu6_and             <"asr",     int_hexagon_S2_asr_i_p_and>;
-def HEXAGON_S2_lsr_i_p_and:
-  di_SInst_didiu6_and             <"lsr",     int_hexagon_S2_lsr_i_p_and>;
-
-def HEXAGON_S2_asl_i_p_xacc:
-  di_SInst_didiu6_xor             <"asl",     int_hexagon_S2_asl_i_p_xacc>;
-def HEXAGON_S2_lsr_i_p_xacc:
-  di_SInst_didiu6_xor             <"lsr",     int_hexagon_S2_lsr_i_p_xacc>;
-
-def HEXAGON_S2_asl_i_p_or:
-  di_SInst_didiu6_or              <"asl",     int_hexagon_S2_asl_i_p_or>;
-def HEXAGON_S2_asr_i_p_or:
-  di_SInst_didiu6_or              <"asr",     int_hexagon_S2_asr_i_p_or>;
-def HEXAGON_S2_lsr_i_p_or:
-  di_SInst_didiu6_or              <"lsr",     int_hexagon_S2_lsr_i_p_or>;
-
-// STYPE / SHIFT / Shift right by immediate with rounding.
-def HEXAGON_S2_asr_i_r_rnd:
-  si_SInst_siu5_rnd               <"asr",     int_hexagon_S2_asr_i_r_rnd>;
-def HEXAGON_S2_asr_i_r_rnd_goodsyntax:
-  si_SInst_siu5              <"asrrnd",  int_hexagon_S2_asr_i_r_rnd_goodsyntax>;
-
-// STYPE / SHIFT / Shift left by immediate with saturation.
-def HEXAGON_S2_asl_i_r_sat:
-  si_SInst_sisi_sat               <"asl",     int_hexagon_S2_asl_i_r_sat>;
-
-// STYPE / SHIFT / Shift by register.
-def HEXAGON_S2_asl_r_r:
-  si_SInst_sisi                   <"asl",     int_hexagon_S2_asl_r_r>;
-def HEXAGON_S2_asr_r_r:
-  si_SInst_sisi                   <"asr",     int_hexagon_S2_asr_r_r>;
-def HEXAGON_S2_lsl_r_r:
-  si_SInst_sisi                   <"lsl",     int_hexagon_S2_lsl_r_r>;
-def HEXAGON_S2_lsr_r_r:
-  si_SInst_sisi                   <"lsr",     int_hexagon_S2_lsr_r_r>;
-def HEXAGON_S2_asl_r_p:
-  di_SInst_disi                   <"asl",     int_hexagon_S2_asl_r_p>;
-def HEXAGON_S2_asr_r_p:
-  di_SInst_disi                   <"asr",     int_hexagon_S2_asr_r_p>;
-def HEXAGON_S2_lsl_r_p:
-  di_SInst_disi                   <"lsl",     int_hexagon_S2_lsl_r_p>;
-def HEXAGON_S2_lsr_r_p:
-  di_SInst_disi                   <"lsr",     int_hexagon_S2_lsr_r_p>;
-
-// STYPE / SHIFT / Shift by register and accumulate.
-def HEXAGON_S2_asl_r_r_acc:
-  si_SInst_sisisi_acc             <"asl",     int_hexagon_S2_asl_r_r_acc>;
-def HEXAGON_S2_asr_r_r_acc:
-  si_SInst_sisisi_acc             <"asr",     int_hexagon_S2_asr_r_r_acc>;
-def HEXAGON_S2_lsl_r_r_acc:
-  si_SInst_sisisi_acc             <"lsl",     int_hexagon_S2_lsl_r_r_acc>;
-def HEXAGON_S2_lsr_r_r_acc:
-  si_SInst_sisisi_acc             <"lsr",     int_hexagon_S2_lsr_r_r_acc>;
-def HEXAGON_S2_asl_r_p_acc:
-  di_SInst_didisi_acc             <"asl",     int_hexagon_S2_asl_r_p_acc>;
-def HEXAGON_S2_asr_r_p_acc:
-  di_SInst_didisi_acc             <"asr",     int_hexagon_S2_asr_r_p_acc>;
-def HEXAGON_S2_lsl_r_p_acc:
-  di_SInst_didisi_acc             <"lsl",     int_hexagon_S2_lsl_r_p_acc>;
-def HEXAGON_S2_lsr_r_p_acc:
-  di_SInst_didisi_acc             <"lsr",     int_hexagon_S2_lsr_r_p_acc>;
-
-def HEXAGON_S2_asl_r_r_nac:
-  si_SInst_sisisi_nac             <"asl",     int_hexagon_S2_asl_r_r_nac>;
-def HEXAGON_S2_asr_r_r_nac:
-  si_SInst_sisisi_nac             <"asr",     int_hexagon_S2_asr_r_r_nac>;
-def HEXAGON_S2_lsl_r_r_nac:
-  si_SInst_sisisi_nac             <"lsl",     int_hexagon_S2_lsl_r_r_nac>;
-def HEXAGON_S2_lsr_r_r_nac:
-  si_SInst_sisisi_nac             <"lsr",     int_hexagon_S2_lsr_r_r_nac>;
-def HEXAGON_S2_asl_r_p_nac:
-  di_SInst_didisi_nac             <"asl",     int_hexagon_S2_asl_r_p_nac>;
-def HEXAGON_S2_asr_r_p_nac:
-  di_SInst_didisi_nac             <"asr",     int_hexagon_S2_asr_r_p_nac>;
-def HEXAGON_S2_lsl_r_p_nac:
-  di_SInst_didisi_nac             <"lsl",     int_hexagon_S2_lsl_r_p_nac>;
-def HEXAGON_S2_lsr_r_p_nac:
-  di_SInst_didisi_nac             <"lsr",     int_hexagon_S2_lsr_r_p_nac>;
-
-// STYPE / SHIFT / Shift by register and logical.
-def HEXAGON_S2_asl_r_r_and:
-  si_SInst_sisisi_and             <"asl",     int_hexagon_S2_asl_r_r_and>;
-def HEXAGON_S2_asr_r_r_and:
-  si_SInst_sisisi_and             <"asr",     int_hexagon_S2_asr_r_r_and>;
-def HEXAGON_S2_lsl_r_r_and:
-  si_SInst_sisisi_and             <"lsl",     int_hexagon_S2_lsl_r_r_and>;
-def HEXAGON_S2_lsr_r_r_and:
-  si_SInst_sisisi_and             <"lsr",     int_hexagon_S2_lsr_r_r_and>;
-
-def HEXAGON_S2_asl_r_r_or:
-  si_SInst_sisisi_or              <"asl",     int_hexagon_S2_asl_r_r_or>;
-def HEXAGON_S2_asr_r_r_or:
-  si_SInst_sisisi_or              <"asr",     int_hexagon_S2_asr_r_r_or>;
-def HEXAGON_S2_lsl_r_r_or:
-  si_SInst_sisisi_or              <"lsl",     int_hexagon_S2_lsl_r_r_or>;
-def HEXAGON_S2_lsr_r_r_or:
-  si_SInst_sisisi_or              <"lsr",     int_hexagon_S2_lsr_r_r_or>;
-
-def HEXAGON_S2_asl_r_p_and:
-  di_SInst_didisi_and             <"asl",     int_hexagon_S2_asl_r_p_and>;
-def HEXAGON_S2_asr_r_p_and:
-  di_SInst_didisi_and             <"asr",     int_hexagon_S2_asr_r_p_and>;
-def HEXAGON_S2_lsl_r_p_and:
-  di_SInst_didisi_and             <"lsl",     int_hexagon_S2_lsl_r_p_and>;
-def HEXAGON_S2_lsr_r_p_and:
-  di_SInst_didisi_and             <"lsr",     int_hexagon_S2_lsr_r_p_and>;
-
-def HEXAGON_S2_asl_r_p_or:
-  di_SInst_didisi_or              <"asl",     int_hexagon_S2_asl_r_p_or>;
-def HEXAGON_S2_asr_r_p_or:
-  di_SInst_didisi_or              <"asr",     int_hexagon_S2_asr_r_p_or>;
-def HEXAGON_S2_lsl_r_p_or:
-  di_SInst_didisi_or              <"lsl",     int_hexagon_S2_lsl_r_p_or>;
-def HEXAGON_S2_lsr_r_p_or:
-  di_SInst_didisi_or              <"lsr",     int_hexagon_S2_lsr_r_p_or>;
-
-// STYPE / SHIFT / Shift by register with saturation.
-def HEXAGON_S2_asl_r_r_sat:
-  si_SInst_sisi_sat               <"asl",     int_hexagon_S2_asl_r_r_sat>;
-def HEXAGON_S2_asr_r_r_sat:
-  si_SInst_sisi_sat               <"asr",     int_hexagon_S2_asr_r_r_sat>;
-
-// STYPE / SHIFT / Table Index.
-def Hexagon_S2_tableidxb_goodsyntax:
-  si_MInst_sisiu4u5          <"tableidxb",int_hexagon_S2_tableidxb_goodsyntax>;
-def Hexagon_S2_tableidxd_goodsyntax:
-  si_MInst_sisiu4u5          <"tableidxd",int_hexagon_S2_tableidxd_goodsyntax>;
-def Hexagon_S2_tableidxh_goodsyntax:
-  si_MInst_sisiu4u5          <"tableidxh",int_hexagon_S2_tableidxh_goodsyntax>;
-def Hexagon_S2_tableidxw_goodsyntax:
-  si_MInst_sisiu4u5          <"tableidxw",int_hexagon_S2_tableidxw_goodsyntax>;
+def : T_PI_pat <S2_asr_i_p, int_hexagon_S2_asr_i_p>;
+def : T_PI_pat <S2_lsr_i_p, int_hexagon_S2_lsr_i_p>;
+def : T_PI_pat <S2_asl_i_p, int_hexagon_S2_asl_i_p>;
+
+def : T_PR_pat <S2_asr_r_p, int_hexagon_S2_asr_r_p>;
+def : T_PR_pat <S2_lsr_r_p, int_hexagon_S2_lsr_r_p>;
+def : T_PR_pat <S2_asl_r_p, int_hexagon_S2_asl_r_p>;
+def : T_PR_pat <S2_lsl_r_p, int_hexagon_S2_lsl_r_p>;
+
+def : T_RR_pat <S2_asr_r_r, int_hexagon_S2_asr_r_r>;
+def : T_RR_pat <S2_lsr_r_r, int_hexagon_S2_lsr_r_r>;
+def : T_RR_pat <S2_asl_r_r, int_hexagon_S2_asl_r_r>;
+def : T_RR_pat <S2_lsl_r_r, int_hexagon_S2_lsl_r_r>;
+
+def : T_RR_pat <S2_asr_r_r_sat, int_hexagon_S2_asr_r_r_sat>;
+def : T_RR_pat <S2_asl_r_r_sat, int_hexagon_S2_asl_r_r_sat>;
+
+def : T_R_pat <S2_vsxtbh,   int_hexagon_S2_vsxtbh>;
+def : T_R_pat <S2_vzxtbh,   int_hexagon_S2_vzxtbh>;
+def : T_R_pat <S2_vsxthw,   int_hexagon_S2_vsxthw>;
+def : T_R_pat <S2_vzxthw,   int_hexagon_S2_vzxthw>;
+def : T_R_pat <S2_vsplatrh, int_hexagon_S2_vsplatrh>;
+def : T_R_pat <A2_sxtw,     int_hexagon_A2_sxtw>;
+
+// Vector saturate and pack
+def : T_R_pat <S2_svsathb,  int_hexagon_S2_svsathb>;
+def : T_R_pat <S2_svsathub, int_hexagon_S2_svsathub>;
+def : T_P_pat <S2_vsathub,  int_hexagon_S2_vsathub>;
+def : T_P_pat <S2_vsatwh,   int_hexagon_S2_vsatwh>;
+def : T_P_pat <S2_vsatwuh,  int_hexagon_S2_vsatwuh>;
+def : T_P_pat <S2_vsathb,   int_hexagon_S2_vsathb>;
+
+def : T_P_pat <S2_vtrunohb,    int_hexagon_S2_vtrunohb>;
+def : T_P_pat <S2_vtrunehb,    int_hexagon_S2_vtrunehb>;
+def : T_P_pat <S2_vrndpackwh,  int_hexagon_S2_vrndpackwh>;
+def : T_P_pat <S2_vrndpackwhs, int_hexagon_S2_vrndpackwhs>;
+def : T_R_pat <S2_brev,        int_hexagon_S2_brev>;
+def : T_R_pat <S2_vsplatrb,    int_hexagon_S2_vsplatrb>;
+
+def : T_R_pat <A2_abs,    int_hexagon_A2_abs>;
+def : T_R_pat <A2_abssat, int_hexagon_A2_abssat>;
+def : T_R_pat <A2_negsat, int_hexagon_A2_negsat>;
+
+def : T_R_pat <A2_swiz,   int_hexagon_A2_swiz>;
+
+def : T_P_pat <A2_sat,    int_hexagon_A2_sat>;
+def : T_R_pat <A2_sath,   int_hexagon_A2_sath>;
+def : T_R_pat <A2_satuh,  int_hexagon_A2_satuh>;
+def : T_R_pat <A2_satub,  int_hexagon_A2_satub>;
+def : T_R_pat <A2_satb,   int_hexagon_A2_satb>;
+
+// Vector arithmetic shift right by immediate with truncate and pack.
+def : T_PI_pat<S2_asr_i_svw_trun, int_hexagon_S2_asr_i_svw_trun>;
+
+def : T_RI_pat <S2_asr_i_r,     int_hexagon_S2_asr_i_r>;
+def : T_RI_pat <S2_lsr_i_r,     int_hexagon_S2_lsr_i_r>;
+def : T_RI_pat <S2_asl_i_r,     int_hexagon_S2_asl_i_r>;
+def : T_RI_pat <S2_asr_i_r_rnd, int_hexagon_S2_asr_i_r_rnd>;
+def : T_RI_pat <S2_asr_i_r_rnd_goodsyntax,
+                int_hexagon_S2_asr_i_r_rnd_goodsyntax>;
+
+// Shift left by immediate with saturation.
+def : T_RI_pat <S2_asl_i_r_sat, int_hexagon_S2_asl_i_r_sat>;
 
+//===----------------------------------------------------------------------===//
+// Template 'def pat' to map tableidx[bhwd] intrinsics to :raw instructions.
+//===----------------------------------------------------------------------===//
+class S2op_tableidx_pat <Intrinsic IntID, InstHexagon OutputInst,
+                         SDNodeXForm XformImm>
+  : Pat <(IntID IntRegs:$src1, IntRegs:$src2, u4ImmPred:$src3, u5ImmPred:$src4),
+         (OutputInst IntRegs:$src1, IntRegs:$src2, u4ImmPred:$src3,
+                     (XformImm u5ImmPred:$src4))>;
+
+
+// Table Index : Extract and insert bits.
+// Map to the real hardware instructions after subtracting appropriate
+// values from the 4th input operand. Please note that subtraction is not
+// needed for int_hexagon_S2_tableidxb_goodsyntax.
+
+def : Pat <(int_hexagon_S2_tableidxb_goodsyntax IntRegs:$src1, IntRegs:$src2,
+                                              u4ImmPred:$src3, u5ImmPred:$src4),
+           (S2_tableidxb IntRegs:$src1, IntRegs:$src2,
+                         u4ImmPred:$src3, u5ImmPred:$src4)>;
+
+def : S2op_tableidx_pat <int_hexagon_S2_tableidxh_goodsyntax, S2_tableidxh,
+                         DEC_CONST_SIGNED>;
+def : S2op_tableidx_pat <int_hexagon_S2_tableidxw_goodsyntax, S2_tableidxw,
+                         DEC2_CONST_SIGNED>;
+def : S2op_tableidx_pat <int_hexagon_S2_tableidxd_goodsyntax, S2_tableidxd,
+                         DEC3_CONST_SIGNED>;
 
 /********************************************************************
 *            STYPE/VH                                               *
 *********************************************************************/
 
-// STYPE / VH / Vector absolute value halfwords.
-// Rdd64=vabsh(Rss64)
-def HEXAGON_A2_vabsh:
-  di_SInst_di                     <"vabsh",   int_hexagon_A2_vabsh>;
-def HEXAGON_A2_vabshsat:
-  di_SInst_di_sat                 <"vabsh",   int_hexagon_A2_vabshsat>;
-
-// STYPE / VH / Vector shift halfwords by immediate.
-// Rdd64=v[asl/asr/lsr]h(Rss64,Rt32)
-def HEXAGON_S2_asl_i_vh:
-  di_SInst_disi                   <"vaslh",   int_hexagon_S2_asl_i_vh>;
-def HEXAGON_S2_asr_i_vh:
-  di_SInst_disi                   <"vasrh",   int_hexagon_S2_asr_i_vh>;
-def HEXAGON_S2_lsr_i_vh:
-  di_SInst_disi                   <"vlsrh",   int_hexagon_S2_lsr_i_vh>;
-
-// STYPE / VH / Vector shift halfwords by register.
-// Rdd64=v[asl/asr/lsl/lsr]w(Rss64,Rt32)
-def HEXAGON_S2_asl_r_vh:
-  di_SInst_disi                   <"vaslh",   int_hexagon_S2_asl_r_vh>;
-def HEXAGON_S2_asr_r_vh:
-  di_SInst_disi                   <"vasrh",   int_hexagon_S2_asr_r_vh>;
-def HEXAGON_S2_lsl_r_vh:
-  di_SInst_disi                   <"vlslh",   int_hexagon_S2_lsl_r_vh>;
-def HEXAGON_S2_lsr_r_vh:
-  di_SInst_disi                   <"vlsrh",   int_hexagon_S2_lsr_r_vh>;
+// Vector absolute value halfwords with and without saturation
+// Rdd64=vabsh(Rss64)[:sat]
+def : T_P_pat <A2_vabsh, int_hexagon_A2_vabsh>;
+def : T_P_pat <A2_vabshsat, int_hexagon_A2_vabshsat>;
+
+// Vector shift halfwords by immediate
+// Rdd64=[vaslh/vasrh/vlsrh](Rss64,u4)
+def : T_PI_pat <S2_asr_i_vh, int_hexagon_S2_asr_i_vh>;
+def : T_PI_pat <S2_lsr_i_vh, int_hexagon_S2_lsr_i_vh>;
+def : T_PI_pat <S2_asl_i_vh, int_hexagon_S2_asl_i_vh>;
 
+// Vector shift halfwords by register
+// Rdd64=[vaslw/vasrw/vlslw/vlsrw](Rss64,Rt32)
+def : T_PR_pat <S2_asr_r_vh, int_hexagon_S2_asr_r_vh>;
+def : T_PR_pat <S2_lsr_r_vh, int_hexagon_S2_lsr_r_vh>;
+def : T_PR_pat <S2_asl_r_vh, int_hexagon_S2_asl_r_vh>;
+def : T_PR_pat <S2_lsl_r_vh, int_hexagon_S2_lsl_r_vh>;
 
 /********************************************************************
 *            STYPE/VW                                               *
 *********************************************************************/
 
-// STYPE / VW / Vector absolute value words.
-def HEXAGON_A2_vabsw:
-  di_SInst_di                     <"vabsw",   int_hexagon_A2_vabsw>;
-def HEXAGON_A2_vabswsat:
-  di_SInst_di_sat                 <"vabsw",   int_hexagon_A2_vabswsat>;
-
-// STYPE / VW / Vector shift words by immediate.
-// Rdd64=v[asl/vsl]w(Rss64,Rt32)
-def HEXAGON_S2_asl_i_vw:
-  di_SInst_disi                   <"vaslw",   int_hexagon_S2_asl_i_vw>;
-def HEXAGON_S2_asr_i_vw:
-  di_SInst_disi                   <"vasrw",   int_hexagon_S2_asr_i_vw>;
-def HEXAGON_S2_lsr_i_vw:
-  di_SInst_disi                   <"vlsrw",   int_hexagon_S2_lsr_i_vw>;
-
-// STYPE / VW / Vector shift words by register.
-// Rdd64=v[asl/vsl]w(Rss64,Rt32)
-def HEXAGON_S2_asl_r_vw:
-  di_SInst_disi                   <"vaslw",   int_hexagon_S2_asl_r_vw>;
-def HEXAGON_S2_asr_r_vw:
-  di_SInst_disi                   <"vasrw",   int_hexagon_S2_asr_r_vw>;
-def HEXAGON_S2_lsl_r_vw:
-  di_SInst_disi                   <"vlslw",   int_hexagon_S2_lsl_r_vw>;
-def HEXAGON_S2_lsr_r_vw:
-  di_SInst_disi                   <"vlsrw",   int_hexagon_S2_lsr_r_vw>;
-
-// STYPE / VW / Vector shift words with truncate and pack.
-def HEXAGON_S2_asr_r_svw_trun:
-  si_SInst_disi                   <"vasrw",   int_hexagon_S2_asr_r_svw_trun>;
-def HEXAGON_S2_asr_i_svw_trun:
-  si_SInst_diu5                   <"vasrw",   int_hexagon_S2_asr_i_svw_trun>;
-
-// LD / Circular loads.
-def HEXAGON_circ_ldd:
-  di_LDInstPI_diu4                <"circ_ldd", int_hexagon_circ_ldd>;
+// Vector absolute value words with and without saturation
+def : T_P_pat <A2_vabsw, int_hexagon_A2_vabsw>;
+def : T_P_pat <A2_vabswsat, int_hexagon_A2_vabswsat>;
+
+// Vector shift words by immediate.
+// Rdd64=[vasrw/vlsrw|vaslw](Rss64,u5)
+def : T_PI_pat <S2_asr_i_vw, int_hexagon_S2_asr_i_vw>;
+def : T_PI_pat <S2_lsr_i_vw, int_hexagon_S2_lsr_i_vw>;
+def : T_PI_pat <S2_asl_i_vw, int_hexagon_S2_asl_i_vw>;
+
+// Vector shift words by register.
+// Rdd64=[vasrw/vlsrw|vaslw|vlslw](Rss64,Rt32)
+def : T_PR_pat <S2_asr_r_vw, int_hexagon_S2_asr_r_vw>;
+def : T_PR_pat <S2_lsr_r_vw, int_hexagon_S2_lsr_r_vw>;
+def : T_PR_pat <S2_asl_r_vw, int_hexagon_S2_asl_r_vw>;
+def : T_PR_pat <S2_lsl_r_vw, int_hexagon_S2_lsl_r_vw>;
+
+// Vector shift words with truncate and pack
+
+def : T_PR_pat <S2_asr_r_svw_trun, int_hexagon_S2_asr_r_svw_trun>;
+
+def : T_R_pat<L2_loadw_locked, int_hexagon_L2_loadw_locked>;
+def : T_R_pat<L4_loadd_locked, int_hexagon_L4_loadd_locked>;
+
+def: Pat<(i32 (int_hexagon_S2_storew_locked (I32:$Rs), (I32:$Rt))),
+         (i32 (C2_tfrpr (S2_storew_locked (I32:$Rs), (I32:$Rt))))>;
+def: Pat<(i32 (int_hexagon_S4_stored_locked (I32:$Rs), (I64:$Rt))),
+         (i32 (C2_tfrpr (S4_stored_locked (I32:$Rs), (I64:$Rt))))>;
 
 include "HexagonIntrinsicsV3.td"
 include "HexagonIntrinsicsV4.td"
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsDerived.td b/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
index 2788101..4c28b28 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
@@ -13,13 +13,13 @@
 //
 def : Pat <(mul DoubleRegs:$src1, DoubleRegs:$src2),
       (i64
-       (COMBINE_rr
-        (HEXAGON_M2_maci
-         (HEXAGON_M2_maci
+       (A2_combinew
+        (M2_maci
+         (M2_maci
           (i32
            (EXTRACT_SUBREG
             (i64
-             (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+             (M2_dpmpyuu_s0 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
                                           subreg_loreg)),
                      (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
                                           subreg_loreg)))),
@@ -31,7 +31,8 @@ def : Pat <(mul DoubleRegs:$src1, DoubleRegs:$src2),
         (i32
          (EXTRACT_SUBREG
           (i64
-           (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+           (M2_dpmpyuu_s0 
+             (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
                                         subreg_loreg)))), subreg_loreg))))>;
 
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV3.td b/lib/Target/Hexagon/HexagonIntrinsicsV3.td
index 2a54e62..6152cb0 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV3.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV3.td
@@ -11,40 +11,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-
-
-
-// MTYPE / COMPLEX / Vector reduce complex multiply real or imaginary.
-def Hexagon_M2_vrcmpys_s1:
-  di_MInst_disi_s1_sat            <"vrcmpys",  int_hexagon_M2_vrcmpys_s1>;
-def Hexagon_M2_vrcmpys_acc_s1:
-  di_MInst_didisi_acc_s1_sat      <"vrcmpys",  int_hexagon_M2_vrcmpys_acc_s1>;
-def Hexagon_M2_vrcmpys_s1rp:
-  si_MInst_disi_s1_rnd_sat        <"vrcmpys",  int_hexagon_M2_vrcmpys_s1rp>;
-
-
-
-
-/********************************************************************
-*            MTYPE/VB                                               *
-*********************************************************************/
-
-// MTYPE / VB / Vector reduce add unsigned bytes.
-def Hexagon_M2_vradduh:
-  si_MInst_didi                   <"vradduh",  int_hexagon_M2_vradduh>;
-
-
-/********************************************************************
-*            ALU64/ALU                                              *
-*********************************************************************/
-
-// ALU64 / ALU / Add.
-def Hexagon_A2_addsp:
-  di_ALU64_sidi                   <"add",      int_hexagon_A2_addsp>;
-def Hexagon_A2_addpsat:
-  di_ALU64_didi                   <"add",      int_hexagon_A2_addpsat>;
-
-def Hexagon_A2_maxp:
-  di_ALU64_didi                   <"max",      int_hexagon_A2_maxp>;
-def Hexagon_A2_maxup:
-  di_ALU64_didi                   <"maxu",     int_hexagon_A2_maxup>;
+// Vector reduce complex multiply real or imaginary
+def : T_PR_pat <M2_vrcmpys_s1,     int_hexagon_M2_vrcmpys_s1>;
+def : T_PPR_pat<M2_vrcmpys_acc_s1, int_hexagon_M2_vrcmpys_acc_s1>;
+def : T_PR_pat <M2_vrcmpys_s1rp,   int_hexagon_M2_vrcmpys_s1rp>;
+
+// Vector reduce add unsigned halfwords
+def : T_PP_pat<M2_vradduh, int_hexagon_M2_vradduh>;
+
+def: T_RP_pat<A2_addsp,   int_hexagon_A2_addsp>;
+def: T_PP_pat<A2_addpsat, int_hexagon_A2_addpsat>;
+def: T_PP_pat<A2_minp,    int_hexagon_A2_minp>;
+def: T_PP_pat<A2_minup,   int_hexagon_A2_minup>;
+def: T_PP_pat<A2_maxp,    int_hexagon_A2_maxp>;
+def: T_PP_pat<A2_maxup,   int_hexagon_A2_maxup>;
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV4.td b/lib/Target/Hexagon/HexagonIntrinsicsV4.td
index 77b148b..8d068eb 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV4.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV4.td
@@ -12,359 +12,307 @@
 // 80-V9418-12 Rev. A
 // June 15, 2010
 
+// Vector reduce multiply word by signed half (32x16)
+//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
+def : T_PP_pat <M4_vrmpyeh_s0, int_hexagon_M4_vrmpyeh_s0>;
+def : T_PP_pat <M4_vrmpyeh_s1, int_hexagon_M4_vrmpyeh_s1>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def : T_PP_pat <M4_vrmpyoh_s0, int_hexagon_M4_vrmpyoh_s0>;
+def : T_PP_pat <M4_vrmpyoh_s1, int_hexagon_M4_vrmpyoh_s1>;
+
+//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
+def : T_PPP_pat <M4_vrmpyeh_acc_s0, int_hexagon_M4_vrmpyeh_acc_s0>;
+def : T_PPP_pat <M4_vrmpyeh_acc_s1, int_hexagon_M4_vrmpyeh_acc_s1>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def : T_PPP_pat <M4_vrmpyoh_acc_s0, int_hexagon_M4_vrmpyoh_acc_s0>;
+def : T_PPP_pat <M4_vrmpyoh_acc_s1, int_hexagon_M4_vrmpyoh_acc_s1>;
+
+// Vector multiply halfwords, signed by unsigned
+// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
+def : T_RR_pat <M2_vmpy2su_s0, int_hexagon_M2_vmpy2su_s0>;
+def : T_RR_pat <M2_vmpy2su_s1, int_hexagon_M2_vmpy2su_s1>;
+
+// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
+def : T_PRR_pat <M2_vmac2su_s0, int_hexagon_M2_vmac2su_s0>;
+def : T_PRR_pat <M2_vmac2su_s1, int_hexagon_M2_vmac2su_s1>;
+
+// Vector polynomial multiply halfwords
+// Rdd=vpmpyh(Rs,Rt)
+def : T_RR_pat <M4_vpmpyh, int_hexagon_M4_vpmpyh>;
+// Rxx[^]=vpmpyh(Rs,Rt)
+def : T_PRR_pat <M4_vpmpyh_acc, int_hexagon_M4_vpmpyh_acc>;
+
+// Polynomial multiply words
+// Rdd=pmpyw(Rs,Rt)
+def : T_RR_pat <M4_pmpyw, int_hexagon_M4_pmpyw>;
+// Rxx^=pmpyw(Rs,Rt)
+def : T_PRR_pat <M4_pmpyw_acc, int_hexagon_M4_pmpyw_acc>;
+
+//Rxx^=asr(Rss,Rt)
+def : T_PPR_pat <S2_asr_r_p_xor, int_hexagon_S2_asr_r_p_xor>;
+//Rxx^=asl(Rss,Rt)
+def : T_PPR_pat <S2_asl_r_p_xor, int_hexagon_S2_asl_r_p_xor>;
+//Rxx^=lsr(Rss,Rt)
+def : T_PPR_pat <S2_lsr_r_p_xor, int_hexagon_S2_lsr_r_p_xor>;
+//Rxx^=lsl(Rss,Rt)
+def : T_PPR_pat <S2_lsl_r_p_xor, int_hexagon_S2_lsl_r_p_xor>;
+
+// Multiply and use upper result
+def : MType_R32_pat <int_hexagon_M2_mpysu_up, M2_mpysu_up>;
+def : MType_R32_pat <int_hexagon_M2_mpy_up_s1, M2_mpy_up_s1>;
+def : MType_R32_pat <int_hexagon_M2_hmmpyh_s1, M2_hmmpyh_s1>;
+def : MType_R32_pat <int_hexagon_M2_hmmpyl_s1, M2_hmmpyl_s1>;
+def : MType_R32_pat <int_hexagon_M2_mpy_up_s1_sat, M2_mpy_up_s1_sat>;
+
+// Vector reduce add unsigned halfwords
+def : Pat <(int_hexagon_M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2),
+           (M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2)>;
+
+def : T_P_pat <S2_brevp, int_hexagon_S2_brevp>;
+
+def: T_P_pat  <S2_ct0p,      int_hexagon_S2_ct0p>;
+def: T_P_pat  <S2_ct1p,      int_hexagon_S2_ct1p>;
+def: T_RR_pat<C4_nbitsset,  int_hexagon_C4_nbitsset>;
+def: T_RR_pat<C4_nbitsclr,  int_hexagon_C4_nbitsclr>;
+def: T_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>;
+
+
+class vcmpImm_pat <InstHexagon MI, Intrinsic IntID, PatLeaf immPred> :
+      Pat <(IntID  (i64 DoubleRegs:$src1), immPred:$src2),
+           (MI (i64 DoubleRegs:$src1), immPred:$src2)>;
+
+def : vcmpImm_pat <A4_vcmpbeqi, int_hexagon_A4_vcmpbeqi, u8ImmPred>;
+def : vcmpImm_pat <A4_vcmpbgti, int_hexagon_A4_vcmpbgti, s8ImmPred>;
+def : vcmpImm_pat <A4_vcmpbgtui, int_hexagon_A4_vcmpbgtui, u7ImmPred>;
+
+def : vcmpImm_pat <A4_vcmpheqi, int_hexagon_A4_vcmpheqi, s8ImmPred>;
+def : vcmpImm_pat <A4_vcmphgti, int_hexagon_A4_vcmphgti, s8ImmPred>;
+def : vcmpImm_pat <A4_vcmphgtui, int_hexagon_A4_vcmphgtui, u7ImmPred>;
+
+def : vcmpImm_pat <A4_vcmpweqi, int_hexagon_A4_vcmpweqi, s8ImmPred>;
+def : vcmpImm_pat <A4_vcmpwgti, int_hexagon_A4_vcmpwgti, s8ImmPred>;
+def : vcmpImm_pat <A4_vcmpwgtui, int_hexagon_A4_vcmpwgtui, u7ImmPred>;
+
+def : T_PP_pat<A4_vcmpbeq_any, int_hexagon_A4_vcmpbeq_any>;
+
+def : T_RR_pat<A4_cmpbeq,   int_hexagon_A4_cmpbeq>;
+def : T_RR_pat<A4_cmpbgt,   int_hexagon_A4_cmpbgt>;
+def : T_RR_pat<A4_cmpbgtu,  int_hexagon_A4_cmpbgtu>;
+def : T_RR_pat<A4_cmpheq,   int_hexagon_A4_cmpheq>;
+def : T_RR_pat<A4_cmphgt,   int_hexagon_A4_cmphgt>;
+def : T_RR_pat<A4_cmphgtu,  int_hexagon_A4_cmphgtu>;
+
+def : T_RI_pat<A4_cmpbeqi,  int_hexagon_A4_cmpbeqi>;
+def : T_RI_pat<A4_cmpbgti,  int_hexagon_A4_cmpbgti>;
+def : T_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>;
+
+def : T_RI_pat<A4_cmpheqi,  int_hexagon_A4_cmpheqi>;
+def : T_RI_pat<A4_cmphgti,  int_hexagon_A4_cmphgti>;
+def : T_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>;
+
+def : T_RP_pat <A4_boundscheck, int_hexagon_A4_boundscheck>;
+
+def : T_PR_pat<A4_tlbmatch, int_hexagon_A4_tlbmatch>;
+
+def : Pat <(int_hexagon_M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2,
+                                      IntRegs:$src3),
+           (M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def : T_IRR_pat <M4_mpyrr_addi, int_hexagon_M4_mpyrr_addi>;
+def : T_IRI_pat <M4_mpyri_addi, int_hexagon_M4_mpyri_addi>;
+def : T_RIR_pat <M4_mpyri_addr_u2, int_hexagon_M4_mpyri_addr_u2>;
+def : T_RRI_pat <M4_mpyri_addr, int_hexagon_M4_mpyri_addr>;
+// Multiply 32x32 and use upper result
+def : T_RRR_pat <M4_mac_up_s1_sat, int_hexagon_M4_mac_up_s1_sat>;
+def : T_RRR_pat <M4_nac_up_s1_sat, int_hexagon_M4_nac_up_s1_sat>;
+
+// Complex multiply 32x16
+def : T_PR_pat <M4_cmpyi_wh, int_hexagon_M4_cmpyi_wh>;
+def : T_PR_pat <M4_cmpyr_wh, int_hexagon_M4_cmpyr_wh>;
+
+def : T_PR_pat <M4_cmpyi_whc, int_hexagon_M4_cmpyi_whc>;
+def : T_PR_pat <M4_cmpyr_whc, int_hexagon_M4_cmpyr_whc>;
+
+def : T_PP_pat<A4_andnp, int_hexagon_A4_andnp>;
+def : T_PP_pat<A4_ornp,  int_hexagon_A4_ornp>;
+
+// Complex add/sub halfwords/words
+def : T_PP_pat <S4_vxaddsubw, int_hexagon_S4_vxaddsubw>;
+def : T_PP_pat <S4_vxsubaddw, int_hexagon_S4_vxsubaddw>;
+def : T_PP_pat <S4_vxaddsubh, int_hexagon_S4_vxaddsubh>;
+def : T_PP_pat <S4_vxsubaddh, int_hexagon_S4_vxsubaddh>;
+
+def : T_PP_pat <S4_vxaddsubhr, int_hexagon_S4_vxaddsubhr>;
+def : T_PP_pat <S4_vxsubaddhr, int_hexagon_S4_vxsubaddhr>;
+
+// Extract bitfield
+def : T_PP_pat  <S4_extractp_rp, int_hexagon_S4_extractp_rp>;
+def : T_RP_pat  <S4_extract_rp, int_hexagon_S4_extract_rp>;
+def : T_PII_pat <S4_extractp, int_hexagon_S4_extractp>;
+def : T_RII_pat <S4_extract, int_hexagon_S4_extract>;
+
+// Vector conditional negate
+// Rdd=vcnegh(Rss,Rt)
+def : T_PR_pat <S2_vcnegh, int_hexagon_S2_vcnegh>;
+
+// Shift an immediate left by register amount
+def : T_IR_pat<S4_lsli, int_hexagon_S4_lsli>;
+
+// Vector reduce maximum halfwords
+def : T_PPR_pat <A4_vrmaxh, int_hexagon_A4_vrmaxh>;
+def : T_PPR_pat <A4_vrmaxuh, int_hexagon_A4_vrmaxuh>;
 
-//
-// ALU 32 types.
-//
-
-class si_ALU32_sisi_not<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, ~$src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class di_ALU32_s8si<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs DoubleRegs:$dst), (ins s8Imm:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "(#$src1, $src2)")),
-             [(set DoubleRegs:$dst, (IntID imm:$src1, IntRegs:$src2))]>;
+// Vector reduce maximum words
+def : T_PPR_pat <A4_vrmaxw, int_hexagon_A4_vrmaxw>;
+def : T_PPR_pat <A4_vrmaxuw, int_hexagon_A4_vrmaxuw>;
 
-class di_ALU32_sis8<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+// Vector reduce minimum halfwords
+def : T_PPR_pat <A4_vrminh, int_hexagon_A4_vrminh>;
+def : T_PPR_pat <A4_vrminuh, int_hexagon_A4_vrminuh>;
 
-class qi_neg_ALU32_sisi<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+// Vector reduce minimum words
+def : T_PPR_pat <A4_vrminw, int_hexagon_A4_vrminw>;
+def : T_PPR_pat <A4_vrminuw, int_hexagon_A4_vrminuw>;
 
-class qi_neg_ALU32_sis10<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2),
-             !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+// Rotate and reduce bytes
+def : Pat <(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2,
+                                     u2ImmPred:$src3),
+           (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2ImmPred:$src3)>;
+
+// Rotate and reduce bytes with accumulation
+// Rxx+=vrcrotate(Rss,Rt,#u2)
+def : Pat <(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
+                                         IntRegs:$src3, u2ImmPred:$src4),
+           (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
+                             IntRegs:$src3, u2ImmPred:$src4)>;
+
+// Vector conditional negate
+def : T_PPR_pat<S2_vrcnegh, int_hexagon_S2_vrcnegh>;
 
-class qi_neg_ALU32_siu9<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u9Imm:$src2),
-             !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+// Logical xor with xor accumulation
+def : T_PPP_pat<M4_xor_xacc, int_hexagon_M4_xor_xacc>;
+
+// ALU64 - Vector min/max byte
+def : T_PP_pat <A2_vminb, int_hexagon_A2_vminb>;
+def : T_PP_pat <A2_vmaxb, int_hexagon_A2_vmaxb>;
+
+// Shift and add/sub/and/or
+def : T_IRI_pat <S4_andi_asl_ri, int_hexagon_S4_andi_asl_ri>;
+def : T_IRI_pat <S4_ori_asl_ri,  int_hexagon_S4_ori_asl_ri>;
+def : T_IRI_pat <S4_addi_asl_ri, int_hexagon_S4_addi_asl_ri>;
+def : T_IRI_pat <S4_subi_asl_ri, int_hexagon_S4_subi_asl_ri>;
+def : T_IRI_pat <S4_andi_lsr_ri, int_hexagon_S4_andi_lsr_ri>;
+def : T_IRI_pat <S4_ori_lsr_ri,  int_hexagon_S4_ori_lsr_ri>;
+def : T_IRI_pat <S4_addi_lsr_ri, int_hexagon_S4_addi_lsr_ri>;
+def : T_IRI_pat <S4_subi_lsr_ri, int_hexagon_S4_subi_lsr_ri>;
 
-class si_neg_ALU32_sisi<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class si_neg_ALU32_sis8<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
-             !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class si_ALU32_sis8<string opc, Intrinsic IntID>
-  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-
-//
-// SInst Classes.
-//
-class qi_neg_SInst_qiqi<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class qi_SInst_qi_andqiqi_neg<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
-                                     IntRegs:$src3),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, and($src2, !$src3)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
-                                         IntRegs:$src3))]>;
-
-class qi_SInst_qi_andqiqi<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
-                                     IntRegs:$src3),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, and($src2, $src3)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
-                                         IntRegs:$src3))]>;
-
-class qi_SInst_qi_orqiqi_neg<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
-                                     IntRegs:$src3),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, or($src2, !$src3)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
-                                         IntRegs:$src3))]>;
-
-class qi_SInst_qi_orqiqi<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
-                                     IntRegs:$src3),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, or($src2, $src3)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
-                                         IntRegs:$src3))]>;
-
-class si_SInst_si_addsis6<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s6Imm:$src3),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, add($src2, #$src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
-                                        imm:$src3))]>;
-
-class si_SInst_si_subs6si<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, sub(#$src2, $src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2,
-                                        IntRegs:$src3))]>;
-
-class di_ALU64_didi_neg<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, ~$src2)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
-
-class di_MInst_dididi_xacc<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                           DoubleRegs:$src2),
-               !strconcat("$dst ^= ", !strconcat(opc , "($src1, $src2)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
-                                             DoubleRegs:$src2))],
-               "$dst2 = $dst">;
-
-class si_MInst_sisisi_and<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
-                                    IntRegs:$src3),
-             !strconcat("$dst &= ", !strconcat(opc , "($src2, $src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
-                                        IntRegs:$src3))]>;
-
-class si_MInst_sisisi_andn<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
-                                    IntRegs:$src3),
-             !strconcat("$dst &= ", !strconcat(opc , "($src2, ~$src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
-                                        IntRegs:$src3))]>;
-
-class si_SInst_sisis10_andi<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s10Imm:$src3),
-             !strconcat("$dst = ", !strconcat(opc ,
-                                              "($src1, and($src2, #$src3))")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
-                                        imm:$src3))]>;
-
-class si_MInst_sisisi_xor<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
-                                    IntRegs:$src3),
-             !strconcat("$dst ^= ", !strconcat(opc , "($src2, $src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
-                                        IntRegs:$src3))]>;
-
-class si_MInst_sisisi_xorn<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
-                                    IntRegs:$src3),
-             !strconcat("$dst ^= ", !strconcat(opc , "($src2, ~$src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
-                                        IntRegs:$src3))]>;
-
-class si_SInst_sisis10_or<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2, s10Imm:$src3),
-             !strconcat("$dst |= ", !strconcat(opc , "($src2, #$src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
-                                        imm:$src3))]>;
-
-class si_MInst_sisisi_or<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
-                                    IntRegs:$src3),
-             !strconcat("$dst |= ", !strconcat(opc , "($src2, $src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
-                                        IntRegs:$src3))]>;
-
-class si_MInst_sisisi_orn<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
-                                    IntRegs:$src3),
-             !strconcat("$dst |= ", !strconcat(opc , "($src2, ~$src3)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
-                                        IntRegs:$src3))]>;
-
-class si_SInst_siu5_sat<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):sat")),
-          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+// Split bitfield
+def : T_RI_pat <A4_bitspliti, int_hexagon_A4_bitspliti>;
+def : T_RR_pat <A4_bitsplit, int_hexagon_A4_bitsplit>;
 
+def: T_RR_pat<S4_parity,   int_hexagon_S4_parity>;
+
+def: T_RI_pat<S4_ntstbit_i,  int_hexagon_S4_ntstbit_i>;
+def: T_RR_pat<S4_ntstbit_r,  int_hexagon_S4_ntstbit_r>;
+
+def: T_RI_pat<S4_clbaddi,  int_hexagon_S4_clbaddi>;
+def: T_PI_pat<S4_clbpaddi, int_hexagon_S4_clbpaddi>;
+def: T_P_pat <S4_clbpnorm, int_hexagon_S4_clbpnorm>;
 
 /********************************************************************
 *            ALU32/ALU                                              *
 *********************************************************************/
 
 // ALU32 / ALU / Logical Operations.
-def Hexagon_A4_orn  : si_ALU32_sisi_not <"or",  int_hexagon_A4_orn>;
-def Hexagon_A4_andn : si_ALU32_sisi_not <"and", int_hexagon_A4_andn>;
-
+def: T_RR_pat<A4_andn, int_hexagon_A4_andn>;
+def: T_RR_pat<A4_orn,  int_hexagon_A4_orn>;
 
 /********************************************************************
 *            ALU32/PERM                                             *
 *********************************************************************/
 
-// ALU32 / PERM / Combine Words Into Doublewords.
-def Hexagon_A4_combineir : di_ALU32_s8si  <"combine", int_hexagon_A4_combineir>;
-def Hexagon_A4_combineri : di_ALU32_sis8  <"combine", int_hexagon_A4_combineri>;
-
+// Combine Words Into Doublewords.
+def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s8ExtPred>;
+def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s8ExtPred>;
 
 /********************************************************************
 *            ALU32/PRED                                             *
 *********************************************************************/
 
-// ALU32 / PRED / Conditional Shift Halfword.
-// ALU32 / PRED / Conditional Sign Extend.
-// ALU32 / PRED / Conditional Zero Extend.
-// ALU32 / PRED / Compare.
-def Hexagon_C4_cmpltei : qi_neg_ALU32_sis10 <"cmp.gt", int_hexagon_C4_cmpltei>;
-def Hexagon_C4_cmplte  : qi_neg_ALU32_sisi  <"cmp.gt", int_hexagon_C4_cmplte>;
-def Hexagon_C4_cmplteu : qi_neg_ALU32_sisi  <"cmp.gtu",int_hexagon_C4_cmplteu>;
+// Compare
+def : T_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s10ExtPred>;
+def : T_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s10ExtPred>;
+def : T_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u9ExtPred>;
 
-def: T_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi>;
-def: T_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei>;
-def: T_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui>;
-
-// ALU32 / PRED / cmpare To General Register.
-def Hexagon_A4_rcmpneq : si_neg_ALU32_sisi <"cmp.eq", int_hexagon_A4_rcmpneq>;
-def Hexagon_A4_rcmpneqi: si_neg_ALU32_sis8 <"cmp.eq", int_hexagon_A4_rcmpneqi>;
-def Hexagon_A4_rcmpeq  : si_ALU32_sisi     <"cmp.eq", int_hexagon_A4_rcmpeq>;
-def Hexagon_A4_rcmpeqi : si_ALU32_sis8     <"cmp.eq", int_hexagon_A4_rcmpeqi>;
+def: T_RR_pat<A4_rcmpeq,  int_hexagon_A4_rcmpeq>;
+def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>;
 
+def: T_RI_pat<A4_rcmpeqi,  int_hexagon_A4_rcmpeqi>;
+def: T_RI_pat<A4_rcmpneqi, int_hexagon_A4_rcmpneqi>;
 
 /********************************************************************
 *            CR                                                     *
 *********************************************************************/
 
-// CR / Corner Detection Acceleration.
-def Hexagon_C4_fastcorner9:
-  qi_SInst_qiqi<"fastcorner9", int_hexagon_C4_fastcorner9>;
-def Hexagon_C4_fastcorner9_not:
-  qi_neg_SInst_qiqi<"fastcorner9",int_hexagon_C4_fastcorner9_not>;
-
 // CR / Logical Operations On Predicates.
-def Hexagon_C4_and_andn:
-  qi_SInst_qi_andqiqi_neg         <"and",      int_hexagon_C4_and_andn>;
-def Hexagon_C4_and_and:
-  qi_SInst_qi_andqiqi             <"and",      int_hexagon_C4_and_and>;
-def Hexagon_C4_and_orn:
-  qi_SInst_qi_orqiqi_neg          <"and",      int_hexagon_C4_and_orn>;
-def Hexagon_C4_and_or:
-  qi_SInst_qi_orqiqi              <"and",      int_hexagon_C4_and_or>;
-def Hexagon_C4_or_andn:
-  qi_SInst_qi_andqiqi_neg         <"or",       int_hexagon_C4_or_andn>;
-def Hexagon_C4_or_and:
-  qi_SInst_qi_andqiqi             <"or",       int_hexagon_C4_or_and>;
-def Hexagon_C4_or_orn:
-  qi_SInst_qi_orqiqi_neg          <"or",       int_hexagon_C4_or_orn>;
-def Hexagon_C4_or_or:
-  qi_SInst_qi_orqiqi              <"or",       int_hexagon_C4_or_or>;
 
+class qi_CRInst_qiqiqi_pat<Intrinsic IntID, InstHexagon Inst> :
+  Pat<(i32 (IntID IntRegs:$Rs, IntRegs:$Rt, IntRegs:$Ru)),
+      (i32 (C2_tfrpr (Inst (C2_tfrrp IntRegs:$Rs),
+                           (C2_tfrrp IntRegs:$Rt),
+                           (C2_tfrrp IntRegs:$Ru))))>;
+
+def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_and_and,   C4_and_and>;
+def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_and_andn,  C4_and_andn>;
+def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_and_or,    C4_and_or>;
+def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_and_orn,   C4_and_orn>;
+def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_or_and,    C4_or_and>;
+def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_or_andn,   C4_or_andn>;
+def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_or_or,     C4_or_or>;
+def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_or_orn,    C4_or_orn>;
 
 /********************************************************************
 *            XTYPE/ALU                                              *
 *********************************************************************/
 
-// XTYPE / ALU / Add And Accumulate.
-def Hexagon_S4_addaddi:
-  si_SInst_si_addsis6             <"add",      int_hexagon_S4_addaddi>;
-def Hexagon_S4_subaddi:
-  si_SInst_si_subs6si             <"add",      int_hexagon_S4_subaddi>;
+// Add And Accumulate.
 
-// XTYPE / ALU / Logical Doublewords.
-def Hexagon_S4_andnp:
-  di_ALU64_didi_neg               <"and",      int_hexagon_A4_andnp>;
-def Hexagon_S4_ornp:
-  di_ALU64_didi_neg               <"or",       int_hexagon_A4_ornp>;
+def : T_RRI_pat <S4_addaddi, int_hexagon_S4_addaddi>;
+def : T_RIR_pat <S4_subaddi, int_hexagon_S4_subaddi>;
 
-// XTYPE / ALU / Logical-logical Doublewords.
-def Hexagon_M4_xor_xacc:
-  di_MInst_dididi_xacc            <"xor",      int_hexagon_M4_xor_xacc>;
 
 // XTYPE / ALU / Logical-logical Words.
-def HEXAGON_M4_and_and:
-  si_MInst_sisisi_and             <"and",      int_hexagon_M4_and_and>;
-def HEXAGON_M4_and_or:
-  si_MInst_sisisi_and             <"or",       int_hexagon_M4_and_or>;
-def HEXAGON_M4_and_xor:
-  si_MInst_sisisi_and             <"xor",      int_hexagon_M4_and_xor>;
-def HEXAGON_M4_and_andn:
-  si_MInst_sisisi_andn            <"and",      int_hexagon_M4_and_andn>;
-def HEXAGON_M4_xor_and:
-  si_MInst_sisisi_xor             <"and",      int_hexagon_M4_xor_and>;
-def HEXAGON_M4_xor_or:
-  si_MInst_sisisi_xor             <"or",       int_hexagon_M4_xor_or>;
-def HEXAGON_M4_xor_andn:
-  si_MInst_sisisi_xorn            <"and",      int_hexagon_M4_xor_andn>;
-def HEXAGON_M4_or_and:
-  si_MInst_sisisi_or              <"and",      int_hexagon_M4_or_and>;
-def HEXAGON_M4_or_or:
-  si_MInst_sisisi_or              <"or",       int_hexagon_M4_or_or>;
-def HEXAGON_M4_or_xor:
-  si_MInst_sisisi_or              <"xor",      int_hexagon_M4_or_xor>;
-def HEXAGON_M4_or_andn:
-  si_MInst_sisisi_orn             <"and",      int_hexagon_M4_or_andn>;
-def HEXAGON_S4_or_andix:
-  si_SInst_sisis10_andi           <"or",       int_hexagon_S4_or_andix>;
-def HEXAGON_S4_or_andi:
-  si_SInst_sisis10_or             <"and",      int_hexagon_S4_or_andi>;
-def HEXAGON_S4_or_ori:
-  si_SInst_sisis10_or             <"or",       int_hexagon_S4_or_ori>;
-
-// XTYPE / ALU / Modulo wrap.
-def HEXAGON_A4_modwrapu:
-  si_ALU64_sisi                   <"modwrap",  int_hexagon_A4_modwrapu>;
-
-// XTYPE / ALU / Round.
-def HEXAGON_A4_cround_ri:
-  si_SInst_siu5                   <"cround",   int_hexagon_A4_cround_ri>;
-def HEXAGON_A4_cround_rr:
-  si_SInst_sisi                   <"cround",   int_hexagon_A4_cround_rr>;
-def HEXAGON_A4_round_ri:
-  si_SInst_siu5                   <"round",    int_hexagon_A4_round_ri>;
-def HEXAGON_A4_round_rr:
-  si_SInst_sisi                   <"round",    int_hexagon_A4_round_rr>;
-def HEXAGON_A4_round_ri_sat:
-  si_SInst_siu5_sat               <"round",    int_hexagon_A4_round_ri_sat>;
-def HEXAGON_A4_round_rr_sat:
-  si_SInst_sisi_sat               <"round",    int_hexagon_A4_round_rr_sat>;
-
-// XTYPE / ALU / Vector reduce add unsigned halfwords.
-// XTYPE / ALU / Vector add bytes.
-// XTYPE / ALU / Vector conditional negate.
-// XTYPE / ALU / Vector maximum bytes.
-// XTYPE / ALU / Vector reduce maximum halfwords.
-// XTYPE / ALU / Vector reduce maximum words.
-// XTYPE / ALU / Vector minimum bytes.
-// XTYPE / ALU / Vector reduce minimum halfwords.
-// XTYPE / ALU / Vector reduce minimum words.
-// XTYPE / ALU / Vector subtract bytes.
-
-
-/********************************************************************
-*            XTYPE/BIT                                              *
-*********************************************************************/
-
-// XTYPE / BIT / Count leading.
-// XTYPE / BIT / Count trailing.
-// XTYPE / BIT / Extract bitfield.
-// XTYPE / BIT / Masked parity.
-// XTYPE / BIT / Bit reverse.
-// XTYPE / BIT / Split bitfield.
-
-
-/********************************************************************
-*            XTYPE/COMPLEX                                          *
-*********************************************************************/
-
-// XTYPE / COMPLEX / Complex add/sub halfwords.
-// XTYPE / COMPLEX / Complex add/sub words.
-// XTYPE / COMPLEX / Complex multiply 32x16.
-// XTYPE / COMPLEX / Vector reduce complex rotate.
-
-
-/********************************************************************
-*            XTYPE/MPY                                              *
-*********************************************************************/
-
-// XTYPE / COMPLEX / Complex add/sub halfwords.
+def : T_RRR_pat <M4_or_xor,   int_hexagon_M4_or_xor>;
+def : T_RRR_pat <M4_and_xor,  int_hexagon_M4_and_xor>;
+def : T_RRR_pat <M4_or_and,   int_hexagon_M4_or_and>;
+def : T_RRR_pat <M4_and_and,  int_hexagon_M4_and_and>;
+def : T_RRR_pat <M4_xor_and,  int_hexagon_M4_xor_and>;
+def : T_RRR_pat <M4_or_or,    int_hexagon_M4_or_or>;
+def : T_RRR_pat <M4_and_or,   int_hexagon_M4_and_or>;
+def : T_RRR_pat <M4_xor_or,   int_hexagon_M4_xor_or>;
+def : T_RRR_pat <M4_or_andn,  int_hexagon_M4_or_andn>;
+def : T_RRR_pat <M4_and_andn, int_hexagon_M4_and_andn>;
+def : T_RRR_pat <M4_xor_andn, int_hexagon_M4_xor_andn>;
+
+def : T_RRI_pat <S4_or_andi, int_hexagon_S4_or_andi>;
+def : T_RRI_pat <S4_or_andix,  int_hexagon_S4_or_andix>;
+def : T_RRI_pat <S4_or_ori, int_hexagon_S4_or_ori>;
+
+// Modulo wrap.
+def : T_RR_pat <A4_modwrapu, int_hexagon_A4_modwrapu>;
+
+// Arithmetic/Convergent round
+// Rd=[cround|round](Rs,Rt)[:sat]
+// Rd=[cround|round](Rs,#u5)[:sat]
+def : T_RI_pat <A4_cround_ri, int_hexagon_A4_cround_ri>;
+def : T_RR_pat <A4_cround_rr, int_hexagon_A4_cround_rr>;
+
+def : T_RI_pat <A4_round_ri, int_hexagon_A4_round_ri>;
+def : T_RR_pat <A4_round_rr, int_hexagon_A4_round_rr>;
+
+def : T_RI_pat <A4_round_ri_sat, int_hexagon_A4_round_ri_sat>;
+def : T_RR_pat <A4_round_rr_sat, int_hexagon_A4_round_rr_sat>;
+
+def : T_P_pat <A2_roundsat, int_hexagon_A2_roundsat>;
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
index 1d44b52..60e6b1e 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV5.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -1,395 +1,111 @@
-class sf_SInst_sf<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
-
-class si_SInst_sf<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
-
-class sf_SInst_si<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
-
-class sf_SInst_di<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
-
-class sf_SInst_df<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
-
-class si_SInst_df<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
-
-class df_SInst_sf<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
-
-class di_SInst_sf<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
-
-class df_SInst_si<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
-
-class df_SInst_df<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
-
-class di_SInst_df<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
-
-
-class df_SInst_di<string opc, Intrinsic IntID>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
-             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
-
-class sf_MInst_sfsf<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class df_MInst_dfdf<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-           !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-           [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
-
-class qi_ALU64_dfdf<string opc, Intrinsic IntID>
-  : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
-           !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-           [(set PredRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
-
-class qi_ALU64_dfu5<string opc, Intrinsic IntID>
-  : ALU64_ri<(outs PredRegs:$dst), (ins DoubleRegs:$src1, u5Imm:$src2),
-           !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-           [(set PredRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
-
-
-class sf_MInst_sfsfsf_acc<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
-                                        IntRegs:$dst2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1, $src2)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1,
-                                          IntRegs:$src2, IntRegs:$dst2))],
-               "$dst2 = $dst">;
-
-class sf_MInst_sfsfsf_nac<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
-                                        IntRegs:$dst2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1, $src2)")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1,
-                                          IntRegs:$src2, IntRegs:$dst2))],
-               "$dst2 = $dst">;
-
-
-class sf_MInst_sfsfsfsi_sc<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2, IntRegs:$src3),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1, $src2, $src3):scale")),
-               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
-                                        IntRegs:$src2, IntRegs:$src3))],
-               "$dst2 = $dst">;
-
-class sf_MInst_sfsfsf_acc_lib<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
-                                        IntRegs:$dst2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1, $src2):lib")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1,
-                                          IntRegs:$src2, IntRegs:$dst2))],
-               "$dst2 = $dst">;
-
-class sf_MInst_sfsfsf_nac_lib<string opc, Intrinsic IntID>
-  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
-                                        IntRegs:$dst2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1, $src2):lib")),
-               [(set IntRegs:$dst, (IntID IntRegs:$src1,
-                                          IntRegs:$src2, IntRegs:$dst2))],
-               "$dst2 = $dst">;
-
-class df_MInst_dfdfdf_acc<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
-                                        DoubleRegs:$dst2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1, $src2)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
-               "$dst2 = $dst">;
-
-class df_MInst_dfdfdf_nac<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
-                                        DoubleRegs:$dst2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1, $src2)")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
-               "$dst2 = $dst">;
-
-
-class df_MInst_dfdfdfsi_sc<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
-                                        DoubleRegs:$src2, IntRegs:$src3),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1, $src2, $src3):scale")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
-                                        DoubleRegs:$src2, IntRegs:$src3))],
-               "$dst2 = $dst">;
-
-class df_MInst_dfdfdf_acc_lib<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
-                                        DoubleRegs:$dst2),
-               !strconcat("$dst += ", !strconcat(opc ,
-                                                 "($src1, $src2):lib")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
-               "$dst2 = $dst">;
-
-class df_MInst_dfdfdf_nac_lib<string opc, Intrinsic IntID>
-  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
-                                        DoubleRegs:$dst2),
-               !strconcat("$dst -= ", !strconcat(opc ,
-                                                 "($src1, $src2):lib")),
-               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
-                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
-               "$dst2 = $dst">;
-
-class qi_SInst_sfsf<string opc, Intrinsic IntID>
-  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
-
-class qi_SInst_sfu5<string opc, Intrinsic IntID>
-  : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
-             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
-
-class sf_ALU64_u10_pos<string opc, Intrinsic IntID>
-  : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")),
-             [(set IntRegs:$dst, (IntID imm:$src1))]>;
-
-class sf_ALU64_u10_neg<string opc, Intrinsic IntID>
-  : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")),
-             [(set IntRegs:$dst, (IntID imm:$src1))]>;
-
-class df_ALU64_u10_pos<string opc, Intrinsic IntID>
-  : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")),
-             [(set DoubleRegs:$dst, (IntID imm:$src1))]>;
-
-class df_ALU64_u10_neg<string opc, Intrinsic IntID>
-  : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1),
-             !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")),
-             [(set DoubleRegs:$dst, (IntID imm:$src1))]>;
-
-class di_MInst_diu6<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
-
-class di_MInst_diu4_rnd<string opc, Intrinsic IntID>
-  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd")),
-          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
-
-class si_MInst_diu4_rnd_sat<string opc, Intrinsic IntID>
-  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd:sat")),
-          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
-
-class si_SInst_diu4_sat<string opc, Intrinsic IntID>
-  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
-          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):sat")),
-          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
-
-
-def HEXAGON_C4_fastcorner9:
-    qi_SInst_qiqi       <"fastcorner9", int_hexagon_C4_fastcorner9>;
-def HEXAGON_C4_fastcorner9_not:
-    qi_SInst_qiqi <"!fastcorner9", int_hexagon_C4_fastcorner9_not>;
-def HEXAGON_M5_vrmpybuu:
-    di_MInst_didi <"vrmpybu", int_hexagon_M5_vrmpybuu>;
-def HEXAGON_M5_vrmacbuu:
-    di_MInst_dididi_acc <"vrmpybu", int_hexagon_M5_vrmacbuu>;
-def HEXAGON_M5_vrmpybsu:
-    di_MInst_didi <"vrmpybsu", int_hexagon_M5_vrmpybsu>;
-def HEXAGON_M5_vrmacbsu:
-    di_MInst_dididi_acc <"vrmpybsu", int_hexagon_M5_vrmacbsu>;
-def HEXAGON_M5_vmpybuu:
-    di_MInst_sisi <"vmpybu", int_hexagon_M5_vmpybuu>;
-def HEXAGON_M5_vmpybsu:
-    di_MInst_sisi <"vmpybsu", int_hexagon_M5_vmpybsu>;
-def HEXAGON_M5_vmacbuu:
-    di_MInst_disisi_acc <"vmpybu", int_hexagon_M5_vmacbuu>;
-def HEXAGON_M5_vmacbsu:
-    di_MInst_disisi_acc <"vmpybsu", int_hexagon_M5_vmacbsu>;
-def HEXAGON_M5_vdmpybsu:
-    di_MInst_didi_sat <"vdmpybsu", int_hexagon_M5_vdmpybsu>;
-def HEXAGON_M5_vdmacbsu:
-    di_MInst_dididi_acc_sat <"vdmpybsu", int_hexagon_M5_vdmacbsu>;
-def HEXAGON_A5_vaddhubs:
-    si_SInst_didi_sat <"vaddhub", int_hexagon_A5_vaddhubs>;
-def HEXAGON_S5_popcountp:
-    si_SInst_di <"popcount", int_hexagon_S5_popcountp>;
-def HEXAGON_S5_asrhub_rnd_sat_goodsyntax:
-    si_MInst_diu4_rnd_sat <"vasrhub", int_hexagon_S5_asrhub_rnd_sat_goodsyntax>;
-def HEXAGON_S5_asrhub_sat:
-    si_SInst_diu4_sat <"vasrhub", int_hexagon_S5_asrhub_sat>;
-def HEXAGON_S5_vasrhrnd_goodsyntax:
-    di_MInst_diu4_rnd <"vasrh", int_hexagon_S5_vasrhrnd_goodsyntax>;
-def HEXAGON_S2_asr_i_p_rnd:
-    di_SInst_diu6 <"asr", int_hexagon_S2_asr_i_p_rnd>;
-def HEXAGON_S2_asr_i_p_rnd_goodsyntax:
-    di_MInst_diu6 <"asrrnd", int_hexagon_S2_asr_i_p_rnd_goodsyntax>;
-def HEXAGON_F2_sfadd:
-    sf_MInst_sfsf <"sfadd", int_hexagon_F2_sfadd>;
-def HEXAGON_F2_sfsub:
-    sf_MInst_sfsf <"sfsub", int_hexagon_F2_sfsub>;
-def HEXAGON_F2_sfmpy:
-    sf_MInst_sfsf <"sfmpy", int_hexagon_F2_sfmpy>;
-def HEXAGON_F2_sffma:
-    sf_MInst_sfsfsf_acc <"sfmpy", int_hexagon_F2_sffma>;
-def HEXAGON_F2_sffma_sc:
-    sf_MInst_sfsfsfsi_sc <"sfmpy", int_hexagon_F2_sffma_sc>;
-def HEXAGON_F2_sffms:
-    sf_MInst_sfsfsf_nac <"sfmpy", int_hexagon_F2_sffms>;
-def HEXAGON_F2_sffma_lib:
-    sf_MInst_sfsfsf_acc_lib <"sfmpy", int_hexagon_F2_sffma_lib>;
-def HEXAGON_F2_sffms_lib:
-    sf_MInst_sfsfsf_nac_lib <"sfmpy", int_hexagon_F2_sffms_lib>;
-def HEXAGON_F2_sfcmpeq:
-    qi_SInst_sfsf <"sfcmp.eq", int_hexagon_F2_sfcmpeq>;
-def HEXAGON_F2_sfcmpgt:
-    qi_SInst_sfsf <"sfcmp.gt", int_hexagon_F2_sfcmpgt>;
-def HEXAGON_F2_sfcmpge:
-    qi_SInst_sfsf <"sfcmp.ge", int_hexagon_F2_sfcmpge>;
-def HEXAGON_F2_sfcmpuo:
-    qi_SInst_sfsf <"sfcmp.uo", int_hexagon_F2_sfcmpuo>;
-def HEXAGON_F2_sfmax:
-    sf_MInst_sfsf <"sfmax", int_hexagon_F2_sfmax>;
-def HEXAGON_F2_sfmin:
-    sf_MInst_sfsf <"sfmin", int_hexagon_F2_sfmin>;
-def HEXAGON_F2_sfclass:
-    qi_SInst_sfu5 <"sfclass", int_hexagon_F2_sfclass>;
-def HEXAGON_F2_sfimm_p:
-    sf_ALU64_u10_pos <"sfmake", int_hexagon_F2_sfimm_p>;
-def HEXAGON_F2_sfimm_n:
-    sf_ALU64_u10_neg <"sfmake", int_hexagon_F2_sfimm_n>;
-def HEXAGON_F2_sffixupn:
-    sf_MInst_sfsf <"sffixupn", int_hexagon_F2_sffixupn>;
-def HEXAGON_F2_sffixupd:
-    sf_MInst_sfsf <"sffixupd", int_hexagon_F2_sffixupd>;
-def HEXAGON_F2_sffixupr:
-    sf_SInst_sf <"sffixupr", int_hexagon_F2_sffixupr>;
-def HEXAGON_F2_dfadd:
-    df_MInst_dfdf <"dfadd", int_hexagon_F2_dfadd>;
-def HEXAGON_F2_dfsub:
-    df_MInst_dfdf <"dfsub", int_hexagon_F2_dfsub>;
-def HEXAGON_F2_dfmpy:
-    df_MInst_dfdf <"dfmpy", int_hexagon_F2_dfmpy>;
-def HEXAGON_F2_dffma:
-    df_MInst_dfdfdf_acc <"dfmpy", int_hexagon_F2_dffma>;
-def HEXAGON_F2_dffms:
-    df_MInst_dfdfdf_nac <"dfmpy", int_hexagon_F2_dffms>;
-def HEXAGON_F2_dffma_lib:
-    df_MInst_dfdfdf_acc_lib <"dfmpy", int_hexagon_F2_dffma_lib>;
-def HEXAGON_F2_dffms_lib:
-    df_MInst_dfdfdf_nac_lib <"dfmpy", int_hexagon_F2_dffms_lib>;
-def HEXAGON_F2_dffma_sc:
-    df_MInst_dfdfdfsi_sc <"dfmpy", int_hexagon_F2_dffma_sc>;
-def HEXAGON_F2_dfmax:
-    df_MInst_dfdf <"dfmax", int_hexagon_F2_dfmax>;
-def HEXAGON_F2_dfmin:
-    df_MInst_dfdf <"dfmin", int_hexagon_F2_dfmin>;
-def HEXAGON_F2_dfcmpeq:
-    qi_ALU64_dfdf <"dfcmp.eq", int_hexagon_F2_dfcmpeq>;
-def HEXAGON_F2_dfcmpgt:
-    qi_ALU64_dfdf <"dfcmp.gt", int_hexagon_F2_dfcmpgt>;
-def HEXAGON_F2_dfcmpge:
-    qi_ALU64_dfdf <"dfcmp.ge", int_hexagon_F2_dfcmpge>;
-def HEXAGON_F2_dfcmpuo:
-    qi_ALU64_dfdf <"dfcmp.uo", int_hexagon_F2_dfcmpuo>;
-def HEXAGON_F2_dfclass:
-    qi_ALU64_dfu5 <"dfclass", int_hexagon_F2_dfclass>;
-def HEXAGON_F2_dfimm_p:
-    df_ALU64_u10_pos <"dfmake", int_hexagon_F2_dfimm_p>;
-def HEXAGON_F2_dfimm_n:
-    df_ALU64_u10_neg <"dfmake", int_hexagon_F2_dfimm_n>;
-def HEXAGON_F2_dffixupn:
-    df_MInst_dfdf <"dffixupn", int_hexagon_F2_dffixupn>;
-def HEXAGON_F2_dffixupd:
-    df_MInst_dfdf <"dffixupd", int_hexagon_F2_dffixupd>;
-def HEXAGON_F2_dffixupr:
-    df_SInst_df <"dffixupr", int_hexagon_F2_dffixupr>;
-def HEXAGON_F2_conv_sf2df:
-    df_SInst_sf <"convert_sf2df", int_hexagon_F2_conv_sf2df>;
-def HEXAGON_F2_conv_df2sf:
-    sf_SInst_df <"convert_df2sf", int_hexagon_F2_conv_df2sf>;
-def HEXAGON_F2_conv_uw2sf:
-    sf_SInst_si <"convert_uw2sf", int_hexagon_F2_conv_uw2sf>;
-def HEXAGON_F2_conv_uw2df:
-    df_SInst_si <"convert_uw2df", int_hexagon_F2_conv_uw2df>;
-def HEXAGON_F2_conv_w2sf:
-    sf_SInst_si <"convert_w2sf", int_hexagon_F2_conv_w2sf>;
-def HEXAGON_F2_conv_w2df:
-    df_SInst_si <"convert_w2df", int_hexagon_F2_conv_w2df>;
-def HEXAGON_F2_conv_ud2sf:
-    sf_SInst_di <"convert_ud2sf", int_hexagon_F2_conv_ud2sf>;
-def HEXAGON_F2_conv_ud2df:
-    df_SInst_di <"convert_ud2df", int_hexagon_F2_conv_ud2df>;
-def HEXAGON_F2_conv_d2sf:
-    sf_SInst_di <"convert_d2sf", int_hexagon_F2_conv_d2sf>;
-def HEXAGON_F2_conv_d2df:
-    df_SInst_di <"convert_d2df", int_hexagon_F2_conv_d2df>;
-def HEXAGON_F2_conv_sf2uw:
-    si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw>;
-def HEXAGON_F2_conv_sf2w:
-    si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w>;
-def HEXAGON_F2_conv_sf2ud:
-    di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud>;
-def HEXAGON_F2_conv_sf2d:
-    di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d>;
-def HEXAGON_F2_conv_df2uw:
-    si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw>;
-def HEXAGON_F2_conv_df2w:
-    si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w>;
-def HEXAGON_F2_conv_df2ud:
-    di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud>;
-def HEXAGON_F2_conv_df2d:
-    di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d>;
-def HEXAGON_F2_conv_sf2uw_chop:
-    si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw_chop>;
-def HEXAGON_F2_conv_sf2w_chop:
-    si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w_chop>;
-def HEXAGON_F2_conv_sf2ud_chop:
-    di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud_chop>;
-def HEXAGON_F2_conv_sf2d_chop:
-    di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d_chop>;
-def HEXAGON_F2_conv_df2uw_chop:
-    si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw_chop>;
-def HEXAGON_F2_conv_df2w_chop:
-    si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w_chop>;
-def HEXAGON_F2_conv_df2ud_chop:
-    di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud_chop>;
-def HEXAGON_F2_conv_df2d_chop:
-    di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d_chop>;
+//===- HexagonIntrinsicsV5.td - V5 Instruction intrinsics --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//Rdd[+]=vrmpybsu(Rss,Rtt)
+//Rdd[+]=vrmpybuu(Rss,Rtt)
+let Predicates = [HasV5T]  in {
+def : T_PP_pat  <M5_vrmpybsu, int_hexagon_M5_vrmpybsu>;
+def : T_PP_pat  <M5_vrmpybuu, int_hexagon_M5_vrmpybuu>;
+
+def : T_PP_pat <M5_vdmpybsu, int_hexagon_M5_vdmpybsu>;
+
+def : T_PPP_pat <M5_vrmacbsu, int_hexagon_M5_vrmacbsu>;
+def : T_PPP_pat <M5_vrmacbuu, int_hexagon_M5_vrmacbuu>;
+//Rxx+=vdmpybsu(Rss,Rtt):sat
+def : T_PPP_pat <M5_vdmacbsu, int_hexagon_M5_vdmacbsu>;
+
+// Vector multiply bytes
+// Rdd=vmpyb[s]u(Rs,Rt)
+def : T_RR_pat <M5_vmpybsu, int_hexagon_M5_vmpybsu>;
+def : T_RR_pat <M5_vmpybuu, int_hexagon_M5_vmpybuu>;
+
+// Rxx+=vmpyb[s]u(Rs,Rt)
+def : T_PRR_pat <M5_vmacbsu, int_hexagon_M5_vmacbsu>;
+def : T_PRR_pat <M5_vmacbuu, int_hexagon_M5_vmacbuu>;
+
+// Rd=vaddhub(Rss,Rtt):sat
+def : T_PP_pat <A5_vaddhubs, int_hexagon_A5_vaddhubs>;
+}
+
+def : T_FF_pat<F2_sfadd, int_hexagon_F2_sfadd>;
+def : T_FF_pat<F2_sfsub, int_hexagon_F2_sfsub>;
+def : T_FF_pat<F2_sfmpy, int_hexagon_F2_sfmpy>;
+def : T_FF_pat<F2_sfmax, int_hexagon_F2_sfmax>;
+def : T_FF_pat<F2_sfmin, int_hexagon_F2_sfmin>;
+
+def : T_FF_pat<F2_sffixupn, int_hexagon_F2_sffixupn>;
+def : T_FF_pat<F2_sffixupd, int_hexagon_F2_sffixupd>;
+def : T_F_pat <F2_sffixupr, int_hexagon_F2_sffixupr>;
+
+def: qi_CRInst_qiqi_pat<C4_fastcorner9,     int_hexagon_C4_fastcorner9>;
+def: qi_CRInst_qiqi_pat<C4_fastcorner9_not, int_hexagon_C4_fastcorner9_not>;
+
+def : T_P_pat <S5_popcountp, int_hexagon_S5_popcountp>;
+def : T_PI_pat <S5_asrhub_sat, int_hexagon_S5_asrhub_sat>;
+
+def : T_PI_pat <S2_asr_i_p_rnd, int_hexagon_S2_asr_i_p_rnd>;
+def : T_PI_pat <S2_asr_i_p_rnd_goodsyntax,
+                int_hexagon_S2_asr_i_p_rnd_goodsyntax>;
+
+def : T_PI_pat <S5_asrhub_rnd_sat_goodsyntax,
+                int_hexagon_S5_asrhub_rnd_sat_goodsyntax>;
+
+def : T_PI_pat <S5_vasrhrnd_goodsyntax, int_hexagon_S5_vasrhrnd_goodsyntax>;
+
+def : T_FFF_pat <F2_sffma, int_hexagon_F2_sffma>;
+def : T_FFF_pat <F2_sffms, int_hexagon_F2_sffms>;
+def : T_FFF_pat <F2_sffma_lib, int_hexagon_F2_sffma_lib>;
+def : T_FFF_pat <F2_sffms_lib, int_hexagon_F2_sffms_lib>;
+def : T_FFFQ_pat <F2_sffma_sc, int_hexagon_F2_sffma_sc>;
+
+// Compare floating-point value
+def : T_FF_pat <F2_sfcmpge, int_hexagon_F2_sfcmpge>;
+def : T_FF_pat <F2_sfcmpuo, int_hexagon_F2_sfcmpuo>;
+def : T_FF_pat <F2_sfcmpeq, int_hexagon_F2_sfcmpeq>;
+def : T_FF_pat <F2_sfcmpgt, int_hexagon_F2_sfcmpgt>;
+
+def : T_DD_pat <F2_dfcmpeq, int_hexagon_F2_dfcmpeq>;
+def : T_DD_pat <F2_dfcmpgt, int_hexagon_F2_dfcmpgt>;
+def : T_DD_pat <F2_dfcmpge, int_hexagon_F2_dfcmpge>;
+def : T_DD_pat <F2_dfcmpuo, int_hexagon_F2_dfcmpuo>;
+
+// Create floating-point value
+def : T_I_pat <F2_sfimm_p, int_hexagon_F2_sfimm_p>;
+def : T_I_pat <F2_sfimm_n, int_hexagon_F2_sfimm_n>;
+def : T_I_pat <F2_dfimm_p, int_hexagon_F2_dfimm_p>;
+def : T_I_pat <F2_dfimm_n, int_hexagon_F2_dfimm_n>;
+
+def : T_DI_pat <F2_dfclass, int_hexagon_F2_dfclass>;
+def : T_FI_pat <F2_sfclass, int_hexagon_F2_sfclass>;
+def : T_F_pat <F2_conv_sf2df, int_hexagon_F2_conv_sf2df>;
+def : T_D_pat <F2_conv_df2sf, int_hexagon_F2_conv_df2sf>;
+def : T_R_pat <F2_conv_uw2sf, int_hexagon_F2_conv_uw2sf>;
+def : T_R_pat <F2_conv_uw2df, int_hexagon_F2_conv_uw2df>;
+def : T_R_pat <F2_conv_w2sf,  int_hexagon_F2_conv_w2sf>;
+def : T_R_pat <F2_conv_w2df,  int_hexagon_F2_conv_w2df>;
+def : T_P_pat <F2_conv_ud2sf, int_hexagon_F2_conv_ud2sf>;
+def : T_P_pat <F2_conv_ud2df, int_hexagon_F2_conv_ud2df>;
+def : T_P_pat <F2_conv_d2sf,  int_hexagon_F2_conv_d2sf>;
+def : T_P_pat <F2_conv_d2df,  int_hexagon_F2_conv_d2df>;
+def : T_F_pat <F2_conv_sf2uw, int_hexagon_F2_conv_sf2uw>;
+def : T_F_pat <F2_conv_sf2w,  int_hexagon_F2_conv_sf2w>;
+def : T_F_pat <F2_conv_sf2ud, int_hexagon_F2_conv_sf2ud>;
+def : T_F_pat <F2_conv_sf2d,  int_hexagon_F2_conv_sf2d>;
+def : T_D_pat <F2_conv_df2uw, int_hexagon_F2_conv_df2uw>;
+def : T_D_pat <F2_conv_df2w,  int_hexagon_F2_conv_df2w>;
+def : T_D_pat <F2_conv_df2ud, int_hexagon_F2_conv_df2ud>;
+def : T_D_pat <F2_conv_df2d,  int_hexagon_F2_conv_df2d>;
+def : T_F_pat <F2_conv_sf2uw_chop, int_hexagon_F2_conv_sf2uw_chop>;
+def : T_F_pat <F2_conv_sf2w_chop,  int_hexagon_F2_conv_sf2w_chop>;
+def : T_F_pat <F2_conv_sf2ud_chop, int_hexagon_F2_conv_sf2ud_chop>;
+def : T_F_pat <F2_conv_sf2d_chop,  int_hexagon_F2_conv_sf2d_chop>;
+def : T_D_pat <F2_conv_df2uw_chop, int_hexagon_F2_conv_df2uw_chop>;
+def : T_D_pat <F2_conv_df2w_chop,  int_hexagon_F2_conv_df2w_chop>;
+def : T_D_pat <F2_conv_df2ud_chop, int_hexagon_F2_conv_df2ud_chop>;
+def : T_D_pat <F2_conv_df2d_chop,  int_hexagon_F2_conv_df2d_chop>;
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index 5e4346d..9c9f3af 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -15,7 +15,6 @@
 #include "Hexagon.h"
 #include "HexagonAsmPrinter.h"
 #include "HexagonMachineFunctionInfo.h"
-#include "MCTargetDesc/HexagonMCInst.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Mangler.h"
@@ -39,10 +38,9 @@ static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol,
 }
 
 // Create an MCInst from a MachineInstr
-void llvm::HexagonLowerToMC(const MachineInstr* MI, HexagonMCInst& MCI,
+void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCI,
                             HexagonAsmPrinter& AP) {
   MCI.setOpcode(MI->getOpcode());
-  MCI.setDesc(MI->getDesc());
 
   for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) {
     const MachineOperand &MO = MI->getOperand(i);
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 97c626f..35f732c 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -205,20 +205,17 @@ void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
   // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
   // are disabled, then these HazardRecs will be disabled.
   const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries();
-  const TargetMachine &TM = DAG->MF.getTarget();
+  const TargetSubtargetInfo &STI = DAG->MF.getSubtarget();
+  const TargetInstrInfo *TII = STI.getInstrInfo();
   delete Top.HazardRec;
   delete Bot.HazardRec;
-  Top.HazardRec =
-      TM.getSubtargetImpl()->getInstrInfo()->CreateTargetMIHazardRecognizer(
-          Itin, DAG);
-  Bot.HazardRec =
-      TM.getSubtargetImpl()->getInstrInfo()->CreateTargetMIHazardRecognizer(
-          Itin, DAG);
+  Top.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG);
+  Bot.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG);
 
   delete Top.ResourceModel;
   delete Bot.ResourceModel;
-  Top.ResourceModel = new VLIWResourceModel(TM, DAG->getSchedModel());
-  Bot.ResourceModel = new VLIWResourceModel(TM, DAG->getSchedModel());
+  Top.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel());
+  Bot.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel());
 
   assert((!llvm::ForceTopDown || !llvm::ForceBottomUp) &&
          "-misched-topdown incompatible with -misched-bottomup");
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 1e023c3..6034344 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -54,11 +54,9 @@ class VLIWResourceModel {
   unsigned TotalPackets;
 
 public:
-VLIWResourceModel(const TargetMachine &TM, const TargetSchedModel *SM) :
-    SchedModel(SM), TotalPackets(0) {
-  ResourcesModel =
-      TM.getSubtargetImpl()->getInstrInfo()->CreateTargetScheduleState(
-          *TM.getSubtargetImpl());
+  VLIWResourceModel(const TargetSubtargetInfo &STI, const TargetSchedModel *SM)
+      : SchedModel(SM), TotalPackets(0) {
+  ResourcesModel = STI.getInstrInfo()->CreateTargetScheduleState(STI);
 
     // This hard requirement could be relaxed,
     // but for now do not let it proceed.
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 782c979..806d448 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -176,7 +176,7 @@ static bool commonChecksToProhibitNewValueJump(bool afterRA,
     return false;
 
   // if call in path, bail out.
-  if (MII->getOpcode() == Hexagon::CALLv3)
+  if (MII->getOpcode() == Hexagon::J2_call)
     return false;
 
   // if NVJ is running prior to RA, do the following checks.
@@ -199,8 +199,7 @@ static bool commonChecksToProhibitNewValueJump(bool afterRA,
     // of registers by individual passes in the backend. At this time,
     // we don't know the scope of usage and definitions of these
     // instructions.
-    if (MII->getOpcode() == Hexagon::TFR_condset_rr ||
-        MII->getOpcode() == Hexagon::TFR_condset_ii ||
+    if (MII->getOpcode() == Hexagon::TFR_condset_ii ||
         MII->getOpcode() == Hexagon::TFR_condset_ri ||
         MII->getOpcode() == Hexagon::TFR_condset_ir ||
         MII->getOpcode() == Hexagon::LDriw_pred     ||
@@ -228,8 +227,8 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
     int64_t v = MI->getOperand(2).getImm();
 
     if (!(isUInt<5>(v) ||
-         ((MI->getOpcode() == Hexagon::CMPEQri ||
-           MI->getOpcode() == Hexagon::CMPGTri) &&
+         ((MI->getOpcode() == Hexagon::C2_cmpeqi ||
+           MI->getOpcode() == Hexagon::C2_cmpgti) &&
           (v == -1))))
       return false;
   }
@@ -299,49 +298,49 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg,
     taken = true;
 
   switch (MI->getOpcode()) {
-    case Hexagon::CMPEQrr:
-      return taken ? Hexagon::CMPEQrr_t_Jumpnv_t_V4
-                   : Hexagon::CMPEQrr_t_Jumpnv_nt_V4;
+    case Hexagon::C2_cmpeq:
+      return taken ? Hexagon::J4_cmpeq_t_jumpnv_t
+                   : Hexagon::J4_cmpeq_t_jumpnv_nt;
 
-    case Hexagon::CMPEQri: {
+    case Hexagon::C2_cmpeqi: {
       if (reg >= 0)
-        return taken ? Hexagon::CMPEQri_t_Jumpnv_t_V4
-                     : Hexagon::CMPEQri_t_Jumpnv_nt_V4;
+        return taken ? Hexagon::J4_cmpeqi_t_jumpnv_t
+                     : Hexagon::J4_cmpeqi_t_jumpnv_nt;
       else
-        return taken ? Hexagon::CMPEQn1_t_Jumpnv_t_V4
-                     : Hexagon::CMPEQn1_t_Jumpnv_nt_V4;
+        return taken ? Hexagon::J4_cmpeqn1_t_jumpnv_t
+                     : Hexagon::J4_cmpeqn1_t_jumpnv_nt;
     }
 
-    case Hexagon::CMPGTrr: {
+    case Hexagon::C2_cmpgt: {
       if (secondRegNewified)
-        return taken ? Hexagon::CMPLTrr_t_Jumpnv_t_V4
-                     : Hexagon::CMPLTrr_t_Jumpnv_nt_V4;
+        return taken ? Hexagon::J4_cmplt_t_jumpnv_t
+                     : Hexagon::J4_cmplt_t_jumpnv_nt;
       else
-        return taken ? Hexagon::CMPGTrr_t_Jumpnv_t_V4
-                     : Hexagon::CMPGTrr_t_Jumpnv_nt_V4;
+        return taken ? Hexagon::J4_cmpgt_t_jumpnv_t
+                     : Hexagon::J4_cmpgt_t_jumpnv_nt;
     }
 
-    case Hexagon::CMPGTri: {
+    case Hexagon::C2_cmpgti: {
       if (reg >= 0)
-        return taken ? Hexagon::CMPGTri_t_Jumpnv_t_V4
-                     : Hexagon::CMPGTri_t_Jumpnv_nt_V4;
+        return taken ? Hexagon::J4_cmpgti_t_jumpnv_t
+                     : Hexagon::J4_cmpgti_t_jumpnv_nt;
       else
-        return taken ? Hexagon::CMPGTn1_t_Jumpnv_t_V4
-                     : Hexagon::CMPGTn1_t_Jumpnv_nt_V4;
+        return taken ? Hexagon::J4_cmpgtn1_t_jumpnv_t
+                     : Hexagon::J4_cmpgtn1_t_jumpnv_nt;
     }
 
-    case Hexagon::CMPGTUrr: {
+    case Hexagon::C2_cmpgtu: {
       if (secondRegNewified)
-        return taken ? Hexagon::CMPLTUrr_t_Jumpnv_t_V4
-                     : Hexagon::CMPLTUrr_t_Jumpnv_nt_V4;
+        return taken ? Hexagon::J4_cmpltu_t_jumpnv_t
+                     : Hexagon::J4_cmpltu_t_jumpnv_nt;
       else
-        return taken ? Hexagon::CMPGTUrr_t_Jumpnv_t_V4
-                     : Hexagon::CMPGTUrr_t_Jumpnv_nt_V4;
+        return taken ? Hexagon::J4_cmpgtu_t_jumpnv_t
+                     : Hexagon::J4_cmpgtu_t_jumpnv_nt;
     }
 
-    case Hexagon::CMPGTUri:
-      return taken ? Hexagon::CMPGTUri_t_Jumpnv_t_V4
-                   : Hexagon::CMPGTUri_t_Jumpnv_nt_V4;
+    case Hexagon::C2_cmpgtui:
+      return taken ? Hexagon::J4_cmpgtui_t_jumpnv_t
+                   : Hexagon::J4_cmpgtui_t_jumpnv_nt;
 
     default:
        llvm_unreachable("Could not find matching New Value Jump instruction.");
@@ -356,19 +355,15 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
                << "********** Function: "
                << MF.getName() << "\n");
 
-#if 0
-  // for now disable this, if we move NewValueJump before register
-  // allocation we need this information.
-  LiveVariables &LVs = getAnalysis<LiveVariables>();
-#endif
+  // If we move NewValueJump before register allocation we'll need live variable
+  // analysis here too.
 
   QII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
   QRI = static_cast<const HexagonRegisterInfo *>(
       MF.getSubtarget().getRegisterInfo());
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
 
-  if (!QRI->Subtarget.hasV4TOps() ||
-      DisableNewValueJumps) {
+  if (DisableNewValueJumps) {
     return false;
   }
 
@@ -413,12 +408,12 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
       DEBUG(dbgs() << "Instr: "; MI->dump(); dbgs() << "\n");
 
       if (!foundJump &&
-         (MI->getOpcode() == Hexagon::JMP_t ||
-          MI->getOpcode() == Hexagon::JMP_f ||
-          MI->getOpcode() == Hexagon::JMP_tnew_t ||
-          MI->getOpcode() == Hexagon::JMP_tnew_nt ||
-          MI->getOpcode() == Hexagon::JMP_fnew_t ||
-          MI->getOpcode() == Hexagon::JMP_fnew_nt)) {
+         (MI->getOpcode() == Hexagon::J2_jumpt ||
+          MI->getOpcode() == Hexagon::J2_jumpf ||
+          MI->getOpcode() == Hexagon::J2_jumptnewpt ||
+          MI->getOpcode() == Hexagon::J2_jumptnew ||
+          MI->getOpcode() == Hexagon::J2_jumpfnewpt ||
+          MI->getOpcode() == Hexagon::J2_jumpfnew)) {
         // This is where you would insert your compare and
         // instr that feeds compare
         jmpPos = MII;
@@ -454,9 +449,9 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
 
         jmpTarget = MI->getOperand(1).getMBB();
         foundJump = true;
-        if (MI->getOpcode() == Hexagon::JMP_f ||
-            MI->getOpcode() == Hexagon::JMP_fnew_t ||
-            MI->getOpcode() == Hexagon::JMP_fnew_nt) {
+        if (MI->getOpcode() == Hexagon::J2_jumpf ||
+            MI->getOpcode() == Hexagon::J2_jumpfnewpt ||
+            MI->getOpcode() == Hexagon::J2_jumpfnew) {
           invertPredicate = true;
         }
         continue;
@@ -545,7 +540,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
           if (isSecondOpReg) {
             // In case of CMPLT, or CMPLTU, or EQ with the second register
             // to newify, swap the operands.
-            if (cmpInstr->getOpcode() == Hexagon::CMPEQrr &&
+            if (cmpInstr->getOpcode() == Hexagon::C2_cmpeq &&
                                      feederReg == (unsigned) cmpOp2) {
               unsigned tmp = cmpReg1;
               bool tmpIsKill = MO1IsKill;
@@ -612,8 +607,8 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
                                     .addReg(cmpOp2, getKillRegState(MO2IsKill))
                                     .addMBB(jmpTarget);
 
-          else if ((cmpInstr->getOpcode() == Hexagon::CMPEQri ||
-                    cmpInstr->getOpcode() == Hexagon::CMPGTri) &&
+          else if ((cmpInstr->getOpcode() == Hexagon::C2_cmpeqi ||
+                    cmpInstr->getOpcode() == Hexagon::C2_cmpgti) &&
                     cmpOp2 == -1 )
             // Corresponding new-value compare jump instructions don't have the
             // operand for -1 immediate value.
diff --git a/lib/Target/Hexagon/HexagonOperands.td b/lib/Target/Hexagon/HexagonOperands.td
index c79d78f..318ca72 100644
--- a/lib/Target/Hexagon/HexagonOperands.td
+++ b/lib/Target/Hexagon/HexagonOperands.td
@@ -39,6 +39,7 @@ let PrintMethod = "printImmOperand" in {
   def u16_0Imm : Operand<i32>;
   def u16_1Imm : Operand<i32>;
   def u16_2Imm : Operand<i32>;
+  def u16_3Imm : Operand<i32>;
   def u11_3Imm : Operand<i32>;
   def u10Imm : Operand<i32>;
   def u9Imm : Operand<i32>;
@@ -258,6 +259,19 @@ def u16_s8ImmPred  : PatLeaf<(i32 imm), [{
   return isShiftedUInt<16,8>(v);
 }]>;
 
+def u16_0ImmPred  : PatLeaf<(i32 imm), [{
+  // True if the immediate fits in a 16-bit unsigned field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<16>(v);
+}]>;
+
+def u11_3ImmPred : PatLeaf<(i32 imm), [{
+  // True if the immediate fits in a 14-bit unsigned field, and the lowest
+  // three bits are 0.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<11,3>(v);
+}]>;
+
 def u9ImmPred  : PatLeaf<(i32 imm), [{
   // u9ImmPred predicate - True if the immediate fits in a 9-bit unsigned
   // field.
@@ -329,6 +343,12 @@ def u5ImmPred  : PatLeaf<(i32 imm), [{
   return isUInt<5>(v);
 }]>;
 
+def u4ImmPred  : PatLeaf<(i32 imm), [{
+  // u4ImmPred predicate - True if the immediate fits in a 4-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<4>(v);
+}]>;
 
 def u3ImmPred  : PatLeaf<(i32 imm), [{
   // u3ImmPred predicate - True if the immediate fits in a 3-bit unsigned
@@ -497,309 +517,218 @@ def u0AlwaysExt : Operand<i32>;
 // Predicates for constant extendable operands
 def s16ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 16-bit sign extended field.
-    return isInt<16>(v);
-  else {
-    if (isInt<16>(v))
-      return true;
+  if (isInt<16>(v))
+    return true;
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit signed field.
-    return isConstExtProfitable(Node) && isInt<32>(v);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit signed field.
+  return isConstExtProfitable(Node) && isInt<32>(v);
 }]>;
 
 def s10ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 10-bit sign extended field.
-    return isInt<10>(v);
-  else {
-    if (isInt<10>(v))
-      return true;
+  if (isInt<10>(v))
+    return true;
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit signed field.
-    return isConstExtProfitable(Node) && isInt<32>(v);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit signed field.
+  return isConstExtProfitable(Node) && isInt<32>(v);
 }]>;
 
 def s9ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 9-bit sign extended field.
-    return isInt<9>(v);
-  else {
-    if (isInt<9>(v))
-      return true;
+  if (isInt<9>(v))
+    return true;
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit unsigned field.
-    return isConstExtProfitable(Node) && isInt<32>(v);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit unsigned field.
+  return isConstExtProfitable(Node) && isInt<32>(v);
 }]>;
 
 def s8ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 8-bit sign extended field.
-    return isInt<8>(v);
-  else {
-    if (isInt<8>(v))
-      return true;
+  if (isInt<8>(v))
+    return true;
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit signed field.
-    return isConstExtProfitable(Node) && isInt<32>(v);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit signed field.
+  return isConstExtProfitable(Node) && isInt<32>(v);
 }]>;
 
 def s8_16ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate fits in a 8-bit sign extended field.
-    return isInt<8>(v);
-  else {
-    if (isInt<8>(v))
-      return true;
-
-    // Return true if extending this immediate is profitable and the value
-    // can't fit in a 16-bit signed field. This is required to avoid
-    // unnecessary constant extenders.
-    return isConstExtProfitable(Node) && !isInt<16>(v);
-  }
+  if (isInt<8>(v))
+    return true;
+
+  // Return true if extending this immediate is profitable and the value
+  // can't fit in a 16-bit signed field. This is required to avoid
+  // unnecessary constant extenders.
+  return isConstExtProfitable(Node) && !isInt<16>(v);
 }]>;
 
 def s6ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 6-bit sign extended field.
-    return isInt<6>(v);
-  else {
-    if (isInt<6>(v))
-      return true;
+  if (isInt<6>(v))
+    return true;
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit unsigned field.
-    return isConstExtProfitable(Node) && isInt<32>(v);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit unsigned field.
+  return isConstExtProfitable(Node) && isInt<32>(v);
 }]>;
 
 def s6_16ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate fits in a 6-bit sign extended field.
-    return isInt<6>(v);
-  else {
-    if (isInt<6>(v))
-      return true;
-
-    // Return true if extending this immediate is profitable and the value
-    // can't fit in a 16-bit signed field. This is required to avoid
-    // unnecessary constant extenders.
-    return isConstExtProfitable(Node) && !isInt<16>(v);
-  }
+  if (isInt<6>(v))
+    return true;
+
+  // Return true if extending this immediate is profitable and the value
+  // can't fit in a 16-bit signed field. This is required to avoid
+  // unnecessary constant extenders.
+  return isConstExtProfitable(Node) && !isInt<16>(v);
 }]>;
 
 def s6_10ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 6-bit sign extended field.
-    return isInt<6>(v);
-  else {
-    if (isInt<6>(v))
-      return true;
-
-    // Return true if extending this immediate is profitable and the value
-    // can't fit in a 10-bit signed field. This is required to avoid
-    // unnecessary constant extenders.
-    return isConstExtProfitable(Node) && !isInt<10>(v);
-  }
+  if (isInt<6>(v))
+    return true;
+
+  // Return true if extending this immediate is profitable and the value
+  // can't fit in a 10-bit signed field. This is required to avoid
+  // unnecessary constant extenders.
+  return isConstExtProfitable(Node) && !isInt<10>(v);
 }]>;
 
 def s11_0ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 11-bit sign extended field.
-    return isShiftedInt<11,0>(v);
-  else {
-    if (isInt<11>(v))
-      return true;
+  if (isInt<11>(v))
+    return true;
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit signed field.
-    return isConstExtProfitable(Node) && isInt<32>(v);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit signed field.
+  return isConstExtProfitable(Node) && isInt<32>(v);
 }]>;
 
 def s11_1ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 12-bit sign extended field and
-    // is 2 byte aligned.
+  if (isInt<12>(v))
     return isShiftedInt<11,1>(v);
-  else {
-    if (isInt<12>(v))
-      return isShiftedInt<11,1>(v);
 
-    // Return true if extending this immediate is profitable and the low 1 bit
-    // is zero (2-byte aligned).
-    return isConstExtProfitable(Node) && isInt<32>(v) && ((v % 2) == 0);
-  }
+  // Return true if extending this immediate is profitable and the low 1 bit
+  // is zero (2-byte aligned).
+  return isConstExtProfitable(Node) && isInt<32>(v) && ((v % 2) == 0);
 }]>;
 
 def s11_2ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 13-bit sign extended field and
-    // is 4-byte aligned.
+  if (isInt<13>(v))
     return isShiftedInt<11,2>(v);
-  else {
-    if (isInt<13>(v))
-      return isShiftedInt<11,2>(v);
 
-    // Return true if extending this immediate is profitable and the low 2-bits
-    // are zero (4-byte aligned).
-    return isConstExtProfitable(Node)  && isInt<32>(v) && ((v % 4) == 0);
-  }
+  // Return true if extending this immediate is profitable and the low 2-bits
+  // are zero (4-byte aligned).
+  return isConstExtProfitable(Node)  && isInt<32>(v) && ((v % 4) == 0);
 }]>;
 
 def s11_3ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 14-bit sign extended field and
-    // is 8-byte aligned.
-    return isShiftedInt<11,3>(v);
-  else {
-    if (isInt<14>(v))
-     return isShiftedInt<11,3>(v);
-
-    // Return true if extending this immediate is profitable and the low 3-bits
-    // are zero (8-byte aligned).
-    return isConstExtProfitable(Node)  && isInt<32>(v) && ((v % 8) == 0);
-  }
+  if (isInt<14>(v))
+   return isShiftedInt<11,3>(v);
+
+  // Return true if extending this immediate is profitable and the low 3-bits
+  // are zero (8-byte aligned).
+  return isConstExtProfitable(Node)  && isInt<32>(v) && ((v % 8) == 0);
 }]>;
 
 def u0AlwaysExtPred : PatLeaf<(i32 imm), [{
   // Predicate for an unsigned 32-bit value that always needs to be extended.
-  if (Subtarget.hasV4TOps()) {
-    if (isConstExtProfitable(Node)) {
-      int64_t v = (int64_t)N->getSExtValue();
-      return isUInt<32>(v);
-    }
+  if (isConstExtProfitable(Node)) {
+    int64_t v = (int64_t)N->getSExtValue();
+    return isUInt<32>(v);
   }
   return false;
 }]>;
 
 def u6ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 6-bit unsigned field.
-    return isUInt<6>(v);
-  else {
-    if (isUInt<6>(v))
-      return true;
+  if (isUInt<6>(v))
+    return true;
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit unsigned field.
-    return isConstExtProfitable(Node) && isUInt<32>(v);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit unsigned field.
+  return isConstExtProfitable(Node) && isUInt<32>(v);
 }]>;
 
 def u7ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 7-bit unsigned field.
-    return isUInt<7>(v);
-  else {
-    if (isUInt<7>(v))
-      return true;
+  if (isUInt<7>(v))
+    return true;
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit unsigned field.
-    return isConstExtProfitable(Node) && isUInt<32>(v);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit unsigned field.
+  return isConstExtProfitable(Node) && isUInt<32>(v);
 }]>;
 
 def u8ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 8-bit unsigned field.
-    return isUInt<8>(v);
-  else {
-    if (isUInt<8>(v))
-      return true;
+  if (isUInt<8>(v))
+    return true;
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit unsigned field.
-    return isConstExtProfitable(Node) && isUInt<32>(v);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit unsigned field.
+  return isConstExtProfitable(Node) && isUInt<32>(v);
 }]>;
 
 def u9ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 9-bit unsigned field.
-    return isUInt<9>(v);
-  else {
-    if (isUInt<9>(v))
-      return true;
+  if (isUInt<9>(v))
+    return true;
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit unsigned field.
-    return isConstExtProfitable(Node) && isUInt<32>(v);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit unsigned field.
+  return isConstExtProfitable(Node) && isUInt<32>(v);
 }]>;
 
 def u6_1ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 7-bit unsigned field and
-    // is 2-byte aligned.
+  if (isUInt<7>(v))
     return isShiftedUInt<6,1>(v);
-  else {
-    if (isUInt<7>(v))
-      return isShiftedUInt<6,1>(v);
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit unsigned field.
-    return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 2) == 0);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit unsigned field.
+  return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 2) == 0);
 }]>;
 
 def u6_2ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 8-bit unsigned field and
-    // is 4-byte aligned.
+  if (isUInt<8>(v))
     return isShiftedUInt<6,2>(v);
-  else {
-    if (isUInt<8>(v))
-      return isShiftedUInt<6,2>(v);
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit unsigned field.
-    return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 4) == 0);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit unsigned field.
+  return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 4) == 0);
 }]>;
 
 def u6_3ExtPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
-  if (!Subtarget.hasV4TOps())
-    // Return true if the immediate can fit in a 9-bit unsigned field and
-    // is 8-byte aligned.
+  if (isUInt<9>(v))
     return isShiftedUInt<6,3>(v);
-  else {
-    if (isUInt<9>(v))
-      return isShiftedUInt<6,3>(v);
 
-    // Return true if extending this immediate is profitable and the value
-    // can fit in a 32-bit unsigned field.
-    return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 8) == 0);
-  }
+  // Return true if extending this immediate is profitable and the value
+  // can fit in a 32-bit unsigned field.
+  return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 8) == 0);
 }]>;
 
+
+// This complex pattern exists only to create a machine instruction operand
+// of type "frame index". There doesn't seem to be a way to do that directly
+// in the patterns.
+def AddrFI : ComplexPattern<i32, 1, "SelectAddrFI", [frameindex], []>;
+
+// These complex patterns are not strictly necessary, since global address
+// folding will happen during DAG combining. For distinguishing between GA
+// and GP, pat frags with HexagonCONST32 and HexagonCONST32_GP can be used.
+def AddrGA : ComplexPattern<i32, 1, "SelectAddrGA", [], []>;
+def AddrGP : ComplexPattern<i32, 1, "SelectAddrGP", [], []>;
+
 // Addressing modes.
 
 def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
@@ -856,3 +785,12 @@ def symbolHi32 : Operand<i32> {
 def symbolLo32 : Operand<i32> {
   let PrintMethod = "printSymbolLo";
 }
+
+// Return true if for a 32 to 64-bit sign-extended load.
+def is_sext_i32 : PatLeaf<(i64 DoubleRegs:$src1), [{
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
+  if (!LD)
+    return false;
+  return LD->getExtensionType() == ISD::SEXTLOAD &&
+         LD->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 8912152..afd3a17 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -112,7 +112,7 @@ INITIALIZE_PASS(HexagonPeephole, "hexagon-peephole", "Hexagon Peephole",
 
 bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
   QII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  QRI = MF.getTarget().getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  QRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
   MRI = &MF.getRegInfo();
 
   DenseMap<unsigned, unsigned> PeepholeMap;
@@ -133,7 +133,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
       MachineInstr *MI = MII;
       // Look for sign extends:
       // %vreg170<def> = SXTW %vreg166
-      if (!DisableOptSZExt && MI->getOpcode() == Hexagon::SXTW) {
+      if (!DisableOptSZExt && MI->getOpcode() == Hexagon::A2_sxtw) {
         assert (MI->getNumOperands() == 2);
         MachineOperand &Dst = MI->getOperand(0);
         MachineOperand &Src  = MI->getOperand(1);
@@ -152,7 +152,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
       // Look for  %vreg170<def> = COMBINE_ir_V4 (0, %vreg169)
       // %vreg170:DoublRegs, %vreg169:IntRegs
       if (!DisableOptExtTo64 &&
-          MI->getOpcode () == Hexagon::COMBINE_Ir_V4) {
+          MI->getOpcode () == Hexagon::A4_combineir) {
         assert (MI->getNumOperands() == 3);
         MachineOperand &Dst = MI->getOperand(0);
         MachineOperand &Src1 = MI->getOperand(1);
@@ -169,7 +169,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
       // %vregIntReg = COPY %vregDoubleReg1:subreg_loreg.
       // and convert into
       // %vregIntReg = COPY %vregDoubleReg0:subreg_hireg.
-      if (MI->getOpcode() == Hexagon::LSRd_ri) {
+      if (MI->getOpcode() == Hexagon::S2_lsr_i_p) {
         assert(MI->getNumOperands() == 3);
         MachineOperand &Dst = MI->getOperand(0);
         MachineOperand &Src1 = MI->getOperand(1);
@@ -184,7 +184,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
 
       // Look for P=NOT(P).
       if (!DisablePNotP &&
-          (MI->getOpcode() == Hexagon::NOT_p)) {
+          (MI->getOpcode() == Hexagon::C2_not)) {
         assert (MI->getNumOperands() == 2);
         MachineOperand &Dst = MI->getOperand(0);
         MachineOperand &Src  = MI->getOperand(1);
@@ -269,10 +269,9 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
           unsigned PR = 1, S1 = 2, S2 = 3;   // Operand indices.
 
           switch (Op) {
-            case Hexagon::TFR_condset_rr:
+            case Hexagon::C2_mux:
+            case Hexagon::C2_muxii:
             case Hexagon::TFR_condset_ii:
-            case Hexagon::MUX_ii:
-            case Hexagon::MUX_rr:
               NewOp = Op;
               break;
             case Hexagon::TFR_condset_ri:
@@ -281,11 +280,11 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
             case Hexagon::TFR_condset_ir:
               NewOp = Hexagon::TFR_condset_ri;
               break;
-            case Hexagon::MUX_ri:
-              NewOp = Hexagon::MUX_ir;
+            case Hexagon::C2_muxri:
+              NewOp = Hexagon::C2_muxir;
               break;
-            case Hexagon::MUX_ir:
-              NewOp = Hexagon::MUX_ri;
+            case Hexagon::C2_muxir:
+              NewOp = Hexagon::C2_muxri;
               break;
           }
           if (NewOp) {
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 2b6741c..3df98d6 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -45,9 +45,6 @@ HexagonRegisterInfo::HexagonRegisterInfo(HexagonSubtarget &st)
 
 const MCPhysReg *
 HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  static const MCPhysReg CalleeSavedRegsV2[] = {
-    Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
-  };
   static const MCPhysReg CalleeSavedRegsV3[] = {
     Hexagon::R16,   Hexagon::R17,   Hexagon::R18,   Hexagon::R19,
     Hexagon::R20,   Hexagon::R21,   Hexagon::R22,   Hexagon::R23,
@@ -55,11 +52,6 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   };
 
   switch(Subtarget.getHexagonArchVersion()) {
-  case HexagonSubtarget::V1:
-    break;
-  case HexagonSubtarget::V2:
-    return CalleeSavedRegsV2;
-  case HexagonSubtarget::V3:
   case HexagonSubtarget::V4:
   case HexagonSubtarget::V5:
     return CalleeSavedRegsV3;
@@ -88,10 +80,6 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
 
 const TargetRegisterClass* const*
 HexagonRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
-  static const TargetRegisterClass * const CalleeSavedRegClassesV2[] = {
-    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
-    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
-    };
   static const TargetRegisterClass * const CalleeSavedRegClassesV3[] = {
     &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
     &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
@@ -102,11 +90,6 @@ HexagonRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
   };
 
   switch(Subtarget.getHexagonArchVersion()) {
-  case HexagonSubtarget::V1:
-    break;
-  case HexagonSubtarget::V2:
-    return CalleeSavedRegClassesV2;
-  case HexagonSubtarget::V3:
   case HexagonSubtarget::V4:
   case HexagonSubtarget::V5:
     return CalleeSavedRegClassesV3;
@@ -159,20 +142,18 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
       //
       // r0 = add(r30, #10000)
       // r0 = memw(r0)
-      if ( (MI.getOpcode() == Hexagon::LDriw)  ||
-           (MI.getOpcode() == Hexagon::LDrid)   ||
-           (MI.getOpcode() == Hexagon::LDrih)   ||
-           (MI.getOpcode() == Hexagon::LDriuh)  ||
-           (MI.getOpcode() == Hexagon::LDrib)   ||
-           (MI.getOpcode() == Hexagon::LDriub)  ||
-           (MI.getOpcode() == Hexagon::LDriw_f) ||
-           (MI.getOpcode() == Hexagon::LDrid_f)) {
-        unsigned dstReg = (MI.getOpcode() == Hexagon::LDrid) ?
+      if ( (MI.getOpcode() == Hexagon::L2_loadri_io)  ||
+           (MI.getOpcode() == Hexagon::L2_loadrd_io)   ||
+           (MI.getOpcode() == Hexagon::L2_loadrh_io) ||
+           (MI.getOpcode() == Hexagon::L2_loadruh_io) ||
+           (MI.getOpcode() == Hexagon::L2_loadrb_io) ||
+           (MI.getOpcode() == Hexagon::L2_loadrub_io)) {
+        unsigned dstReg = (MI.getOpcode() == Hexagon::L2_loadrd_io) ?
           getSubReg(MI.getOperand(0).getReg(), Hexagon::subreg_loreg) :
           MI.getOperand(0).getReg();
 
         // Check if offset can fit in addi.
-        if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) {
+        if (!TII.isValidOffset(Hexagon::A2_addi, Offset)) {
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
                   TII.get(Hexagon::CONST32_Int_Real), dstReg).addImm(Offset);
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
@@ -180,19 +161,16 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                   dstReg).addReg(FrameReg).addReg(dstReg);
         } else {
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
-                  TII.get(Hexagon::ADD_ri),
+                  TII.get(Hexagon::A2_addi),
                   dstReg).addReg(FrameReg).addImm(Offset);
         }
 
         MI.getOperand(FIOperandNum).ChangeToRegister(dstReg, false, false,true);
         MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
-      } else if ((MI.getOpcode() == Hexagon::STriw_indexed) ||
-                 (MI.getOpcode() == Hexagon::STriw) ||
-                 (MI.getOpcode() == Hexagon::STrid) ||
-                 (MI.getOpcode() == Hexagon::STrih) ||
-                 (MI.getOpcode() == Hexagon::STrib) ||
-                 (MI.getOpcode() == Hexagon::STrid_f) ||
-                 (MI.getOpcode() == Hexagon::STriw_f)) {
+      } else if ((MI.getOpcode() == Hexagon::S2_storeri_io) ||
+                 (MI.getOpcode() == Hexagon::S2_storerd_io) ||
+                 (MI.getOpcode() == Hexagon::S2_storerh_io) ||
+                 (MI.getOpcode() == Hexagon::S2_storerb_io)) {
         // For stores, we need a reserved register. Change
         // memw(r30 + #10000) = r0 to:
         //
@@ -201,7 +179,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
         unsigned resReg = HEXAGON_RESERVED_REG_1;
 
         // Check if offset can fit in addi.
-        if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) {
+        if (!TII.isValidOffset(Hexagon::A2_addi, Offset)) {
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
                   TII.get(Hexagon::CONST32_Int_Real), resReg).addImm(Offset);
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
@@ -209,47 +187,19 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                   resReg).addReg(FrameReg).addReg(resReg);
         } else {
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
-                  TII.get(Hexagon::ADD_ri),
+                  TII.get(Hexagon::A2_addi),
                   resReg).addReg(FrameReg).addImm(Offset);
         }
         MI.getOperand(FIOperandNum).ChangeToRegister(resReg, false, false,true);
         MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
       } else if (TII.isMemOp(&MI)) {
         // use the constant extender if the instruction provides it
-        // and we are V4TOps.
-        if (Subtarget.hasV4TOps()) {
-          if (TII.isConstExtended(&MI)) {
-            MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
-            MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset);
-            TII.immediateExtend(&MI);
-          } else {
-            llvm_unreachable("Need to implement for memops");
-          }
+        if (TII.isConstExtended(&MI)) {
+          MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+          MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset);
+          TII.immediateExtend(&MI);
         } else {
-          // Only V3 and older instructions here.
-          unsigned ResReg = HEXAGON_RESERVED_REG_1;
-          if (!MFI.hasVarSizedObjects() &&
-              TII.isValidOffset(MI.getOpcode(), (FrameSize+Offset))) {
-            MI.getOperand(FIOperandNum).ChangeToRegister(getStackRegister(),
-                                                         false, false, false);
-            MI.getOperand(FIOperandNum+1).ChangeToImmediate(FrameSize+Offset);
-          } else if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) {
-            BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
-                    TII.get(Hexagon::CONST32_Int_Real), ResReg).addImm(Offset);
-            BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
-                    TII.get(Hexagon::A2_add), ResReg).addReg(FrameReg).
-              addReg(ResReg);
-            MI.getOperand(FIOperandNum).ChangeToRegister(ResReg, false, false,
-                                                         true);
-            MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
-          } else {
-            BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
-                    TII.get(Hexagon::ADD_ri), ResReg).addReg(FrameReg).
-              addImm(Offset);
-            MI.getOperand(FIOperandNum).ChangeToRegister(ResReg, false, false,
-                                                         true);
-            MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
-          }
+          llvm_unreachable("Need to implement for memops");
         }
       } else {
         unsigned dstReg = MI.getOperand(0).getReg();
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index 9750984..edf1c25 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -13,20 +13,25 @@
 
 let Namespace = "Hexagon" in {
 
-  class HexagonReg<bits<5> num, string n> : Register<n> {
+  class HexagonReg<bits<5> num, string n, list<string> alt = [], 
+                   list<Register> alias = []> : Register<n> {
     field bits<5> Num;
+    let Aliases = alias;
     let HWEncoding{4-0} = num;
   }
 
-  class HexagonDoubleReg<bits<5> num, string n, list<Register> subregs> :
+  class HexagonDoubleReg<bits<5> num, string n, list<Register> subregs,
+                         list<string> alt = []> :
         RegisterWithSubRegs<n, subregs> {
     field bits<5> Num;
+
+    let AltNames = alt;
     let HWEncoding{4-0} = num;
   }
 
   // Registers are identified with 5-bit ID numbers.
   // Ri - 32-bit integer registers.
-  class Ri<bits<5> num, string n> : HexagonReg<num, n> {
+  class Ri<bits<5> num, string n, list<string> alt = []> : HexagonReg<num, n, alt> {
     let Num = num;
   }
 
@@ -49,27 +54,37 @@ let Namespace = "Hexagon" in {
   }
 
   // Rc - control registers
-  class Rc<bits<5> num, string n> : HexagonReg<num, n> {
+  class Rc<bits<5> num, string n,
+           list<string> alt = [], list<Register> alias = []> : 
+        HexagonReg<num, n, alt, alias> {
     let Num = num;
   }
 
-  // Rj - aliased integer registers
-  class Rj<string n, Ri R>: HexagonReg<R.Num, n> {
-    let Num = R.Num;
-    let Aliases = [R];
+  // Rcc - 64-bit control registers.
+  class Rcc<bits<5> num, string n, list<Register> subregs,
+            list<string> alt = []> :
+        HexagonDoubleReg<num, n, subregs, alt> {
+    let Num = num;
+    let SubRegs = subregs;
+  }
+
+  // Mx - address modifier registers
+  class Mx<bits<1> num, string n> : HexagonReg<{0b0000, num}, n> {
+    let Num = !cast<bits<5>>(num);
   }
 
   def subreg_loreg  : SubRegIndex<32>;
   def subreg_hireg  : SubRegIndex<32, 32>;
+  def subreg_overflow : SubRegIndex<1, 0>;
 
   // Integer registers.
-  foreach I = 0-31 in {
-    def R#I  : Ri<I, "r"#I>,  DwarfRegNum<[I]>;
+  foreach i = 0-28 in {
+    def R#i  : Ri<i, "r"#i>,  DwarfRegNum<[i]>;
   }
 
-  def SP : Rj<"sp", R29>, DwarfRegNum<[29]>;
-  def FP : Rj<"fp", R30>, DwarfRegNum<[30]>;
-  def LR : Rj<"lr", R31>, DwarfRegNum<[31]>;
+  def R29 : Ri<29, "r29", ["sp"]>, DwarfRegNum<[29]>;
+  def R30 : Ri<30, "r30", ["fp"]>, DwarfRegNum<[30]>;
+  def R31 : Ri<31, "r31", ["lr"]>, DwarfRegNum<[31]>;
 
   // Aliases of the R* registers used to hold 64-bit int values (doubles).
   let SubRegIndices = [subreg_loreg, subreg_hireg], CoveredBySubRegs = 1 in {
@@ -97,44 +112,98 @@ let Namespace = "Hexagon" in {
   def P2 : Rp<2, "p2">, DwarfRegNum<[65]>;
   def P3 : Rp<3, "p3">, DwarfRegNum<[66]>;
 
-  // Control registers.
-  def SA0 : Rc<0, "sa0">, DwarfRegNum<[67]>;
-  def LC0 : Rc<1, "lc0">, DwarfRegNum<[68]>;
-
-  def SA1 : Rc<2, "sa1">, DwarfRegNum<[69]>;
-  def LC1 : Rc<3, "lc1">, DwarfRegNum<[70]>;
+  // Modifier registers.
+  // C6 and C7 can also be M0 and M1, but register names must be unique, even
+  // if belonging to different register classes.
+  def M0 : Mx<0, "m0">, DwarfRegNum<[72]>;
+  def M1 : Mx<1, "m1">, DwarfRegNum<[73]>;
 
-  def M0 : Rc<6, "m0">, DwarfRegNum<[71]>;
-  def M1 : Rc<7, "m1">, DwarfRegNum<[72]>;
+  // Fake register to represent USR.OVF bit. Artihmetic/saturating instruc-
+  // tions modify this bit, and multiple such instructions are allowed in the
+  // same packet. We need to ignore output dependencies on this bit, but not
+  // on the entire USR.
+  def USR_OVF : Rc<?, "usr.ovf">;
 
-  def PC : Rc<9,  "pc">, DwarfRegNum<[32]>; // is the Dwarf number correct?
-  def GP : Rc<11, "gp">, DwarfRegNum<[33]>; // is the Dwarf number correct?
+  // Control registers.
+  def SA0  : Rc<0,  "sa0",       ["c0"]>,   DwarfRegNum<[67]>;
+  def LC0  : Rc<1,  "lc0",       ["c1"]>,   DwarfRegNum<[68]>;
+  def SA1  : Rc<2,  "sa1",       ["c2"]>,   DwarfRegNum<[69]>;
+  def LC1  : Rc<3,  "lc1",       ["c3"]>,   DwarfRegNum<[70]>;
+  def P3_0 : Rc<4,  "p3:0",      ["c4"], [P0, P1, P2, P3]>,
+                                            DwarfRegNum<[71]>;
+  def C6   : Rc<6,  "c6",        [], [M0]>, DwarfRegNum<[72]>;
+  def C7   : Rc<7,  "c7",        [], [M1]>, DwarfRegNum<[73]>;
+
+  def USR  : Rc<8,  "usr",       ["c8"]>,   DwarfRegNum<[74]> {
+    let SubRegIndices = [subreg_overflow];
+    let SubRegs = [USR_OVF];
+  }
+  def PC   : Rc<9,  "pc">,                  DwarfRegNum<[75]>;
+  def UGP  : Rc<10, "ugp",       ["c10"]>,  DwarfRegNum<[76]>;
+  def GP   : Rc<11, "gp">,                  DwarfRegNum<[77]>;
+  def CS0  : Rc<12, "cs0",       ["c12"]>,  DwarfRegNum<[78]>;
+  def CS1  : Rc<13, "cs1",       ["c13"]>,  DwarfRegNum<[79]>;
+  def UPCL : Rc<14, "upcyclelo", ["c14"]>,  DwarfRegNum<[80]>;
+  def UPCH : Rc<15, "upcyclehi", ["c15"]>,  DwarfRegNum<[81]>;
 }
 
+  // Control registers pairs.
+  let SubRegIndices = [subreg_loreg, subreg_hireg], CoveredBySubRegs = 1 in {
+    def C1_0   : Rcc<0,   "c1:0",  [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>;
+    def C3_2   : Rcc<2,   "c3:2",  [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>;
+    def C7_6   : Rcc<6,   "c7:6",  [C6, C7],   ["m1:0"]>,    DwarfRegNum<[72]>;
+    def C9_8   : Rcc<8,   "c9:8",  [USR, PC]>,               DwarfRegNum<[74]>;
+    def C11_10 : Rcc<10, "c11:10", [UGP, GP]>,               DwarfRegNum<[76]>;
+    def CS     : Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>,   DwarfRegNum<[78]>;
+    def UPC    : Rcc<14, "c15:14", [UPCL, UPCH]>,            DwarfRegNum<[80]>;
+  }
+
 // Register classes.
 //
 // FIXME: the register order should be defined in terms of the preferred
 // allocation order...
 //
-def IntRegs : RegisterClass<"Hexagon", [i32,f32], 32,
+def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
                             (add (sequence "R%u", 0, 9),
                                  (sequence "R%u", 12, 28),
                                  R10, R11, R29, R30, R31)> {
 }
 
-def DoubleRegs : RegisterClass<"Hexagon", [i64,f64], 64,
+def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
                                (add (sequence "D%u", 0, 4),
                                     (sequence "D%u", 6, 13), D5, D14, D15)>;
 
 
-def PredRegs : RegisterClass<"Hexagon", [i1], 32, (add (sequence "P%u", 0, 3))>
+def PredRegs : RegisterClass<"Hexagon", 
+                             [i1, v2i1, v4i1, v8i1, v4i8, v2i16, i32], 32,
+                             (add (sequence "P%u", 0, 3))>
 {
   let Size = 32;
 }
 
-def CRRegs : RegisterClass<"Hexagon", [i32], 32,
-                           (add (sequence "LC%u", 0, 1),
-                                (sequence "SA%u", 0, 1),
-                                (sequence "M%u", 0, 1), PC, GP)> {
-  let Size = 32;
+let Size = 32 in
+def ModRegs : RegisterClass<"Hexagon", [i32], 32, (add M0, M1)>;
+
+let Size = 32, isAllocatable = 0 in
+def CtrRegs : RegisterClass<"Hexagon", [i32], 32,
+                           (add LC0, SA0, LC1, SA1,
+                                P3_0,
+                                 M0, M1, C6, C7, CS0, CS1, UPCL, UPCH,
+                                 USR, USR_OVF, UGP, GP, PC)>;
+
+let Size = 64, isAllocatable = 0 in
+def CtrRegs64 : RegisterClass<"Hexagon", [i64], 64,
+                              (add C1_0, C3_2, C7_6, C9_8, C11_10, CS, UPC)>;
+
+def VolatileV3 {
+  list<Register> Regs = [D0, D1, D2, D3, D4, D5, D6, D7,
+                         R28, R31,
+                         P0, P1, P2, P3,
+                         M0, M1,
+                         LC0, LC1, SA0, SA1, USR, USR_OVF];
 }
+
+def PositiveHalfWord : PatLeaf<(i32 IntRegs:$a),
+[{
+  return isPositiveHalfWord(N);
+}]>;
diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
index 2b459a4..0c24075 100644
--- a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
+++ b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
@@ -15,6 +15,7 @@
 #include "Hexagon.h"
 #include "HexagonTargetMachine.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/StackProtector.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Pass.h"
@@ -42,7 +43,7 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineFunctionAnalysis>();
       AU.addPreserved<MachineFunctionAnalysis>();
-      AU.addPreserved("stack-protector");
+      AU.addPreserved<StackProtector>();
       FunctionPass::getAnalysisUsage(AU);
     }
   };
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 8fdd493..ce6a39a 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -48,12 +48,9 @@ using namespace llvm;
 namespace {
 
 class HexagonSplitConst32AndConst64 : public MachineFunctionPass {
-  const HexagonTargetMachine &QTM;
-
  public:
     static char ID;
-    HexagonSplitConst32AndConst64(const HexagonTargetMachine &TM)
-        : MachineFunctionPass(ID), QTM(TM) {}
+    HexagonSplitConst32AndConst64() : MachineFunctionPass(ID) {}
 
     const char *getPassName() const override {
       return "Hexagon Split Const32s and Const64s";
@@ -68,13 +65,12 @@ char HexagonSplitConst32AndConst64::ID = 0;
 bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
 
   const HexagonTargetObjectFile &TLOF =
-      (const HexagonTargetObjectFile &)QTM.getSubtargetImpl()
-          ->getTargetLowering()
-          ->getObjFileLowering();
+      *static_cast<const HexagonTargetObjectFile *>(
+          Fn.getTarget().getObjFileLowering());
   if (TLOF.IsSmallDataEnabled())
     return true;
 
-  const TargetInstrInfo *TII = QTM.getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
 
   // Loop over all of the basic blocks
   for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
@@ -117,9 +113,9 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
         MachineOperand &Symbol = MI->getOperand (1);
 
         BuildMI (*MBB, MII, MI->getDebugLoc(),
-                 TII->get(Hexagon::LO_label), DestReg).addOperand(Symbol);
+                 TII->get(Hexagon::LO_PIC), DestReg).addOperand(Symbol);
         BuildMI (*MBB, MII, MI->getDebugLoc(),
-                 TII->get(Hexagon::HI_label), DestReg).addOperand(Symbol);
+                 TII->get(Hexagon::HI_PIC), DestReg).addOperand(Symbol);
         // MBB->erase returns the iterator to the next instruction, which is the
         // one we want to process next
         MII = MBB->erase (MI);
@@ -139,9 +135,9 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
       else if (Opc == Hexagon::CONST64_Int_Real) {
         int DestReg = MI->getOperand(0).getReg();
         int64_t ImmValue = MI->getOperand(1).getImm ();
-        unsigned DestLo = QTM.getSubtargetImpl()->getRegisterInfo()->getSubReg(
+        unsigned DestLo = Fn.getSubtarget().getRegisterInfo()->getSubReg(
             DestReg, Hexagon::subreg_loreg);
-        unsigned DestHi = QTM.getSubtargetImpl()->getRegisterInfo()->getSubReg(
+        unsigned DestHi = Fn.getSubtarget().getRegisterInfo()->getSubReg(
             DestReg, Hexagon::subreg_hireg);
 
         int32_t LowWord = (ImmValue & 0xFFFFFFFF);
@@ -176,6 +172,6 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
 //===----------------------------------------------------------------------===//
 
 FunctionPass *
-llvm::createHexagonSplitConst32AndConst64(const HexagonTargetMachine &TM) {
-  return new HexagonSplitConst32AndConst64(TM);
+llvm::createHexagonSplitConst32AndConst64() {
+  return new HexagonSplitConst32AndConst64();
 }
diff --git a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
index 1052b80..8873bb9 100644
--- a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
+++ b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
@@ -58,13 +58,9 @@ namespace llvm {
 namespace {
 
 class HexagonSplitTFRCondSets : public MachineFunctionPass {
-    const HexagonTargetMachine &QTM;
-    const HexagonSubtarget &QST;
-
  public:
     static char ID;
-    HexagonSplitTFRCondSets(const HexagonTargetMachine& TM) :
-      MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {
+    HexagonSplitTFRCondSets() : MachineFunctionPass(ID) {
       initializeHexagonSplitTFRCondSetsPass(*PassRegistry::getPassRegistry());
     }
 
@@ -80,7 +76,7 @@ char HexagonSplitTFRCondSets::ID = 0;
 
 bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
 
-  const TargetInstrInfo *TII = QTM.getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
 
   // Loop over all of the basic blocks.
   for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
@@ -90,41 +86,8 @@ bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
     for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
          ++MII) {
       MachineInstr *MI = MII;
-      int Opc1, Opc2;
       switch(MI->getOpcode()) {
-        case Hexagon::TFR_condset_rr:
-        case Hexagon::TFR_condset_rr_f:
-        case Hexagon::TFR_condset_rr64_f: {
-          int DestReg = MI->getOperand(0).getReg();
-          int SrcReg1 = MI->getOperand(2).getReg();
-          int SrcReg2 = MI->getOperand(3).getReg();
-
-          if (MI->getOpcode() == Hexagon::TFR_condset_rr ||
-              MI->getOpcode() == Hexagon::TFR_condset_rr_f) {
-            Opc1 = Hexagon::TFR_cPt;
-            Opc2 = Hexagon::TFR_cNotPt;
-          }
-          else if (MI->getOpcode() == Hexagon::TFR_condset_rr64_f) {
-            Opc1 = Hexagon::TFR64_cPt;
-            Opc2 = Hexagon::TFR64_cNotPt;
-          }
-
-          // Minor optimization: do not emit the predicated copy if the source
-          // and the destination is the same register.
-          if (DestReg != SrcReg1) {
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc1),
-                    DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
-          }
-          if (DestReg != SrcReg2) {
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc2),
-                    DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
-          }
-          MII = MBB->erase(MI);
-          --MII;
-          break;
-        }
-        case Hexagon::TFR_condset_ri:
-        case Hexagon::TFR_condset_ri_f: {
+        case Hexagon::TFR_condset_ri: {
           int DestReg = MI->getOperand(0).getReg();
           int SrcReg1 = MI->getOperand(2).getReg();
 
@@ -132,77 +95,50 @@ bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
           // is the same register.
           if (DestReg != SrcReg1) {
             BuildMI(*MBB, MII, MI->getDebugLoc(),
-              TII->get(Hexagon::TFR_cPt), DestReg).
+              TII->get(Hexagon::A2_tfrt), DestReg).
               addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
           }
-          if (MI->getOpcode() ==  Hexagon::TFR_condset_ri ) {
-            BuildMI(*MBB, MII, MI->getDebugLoc(),
-              TII->get(Hexagon::TFRI_cNotPt), DestReg).
-              addReg(MI->getOperand(1).getReg()).
-              addImm(MI->getOperand(3).getImm());
-          } else if (MI->getOpcode() ==  Hexagon::TFR_condset_ri_f ) {
-            BuildMI(*MBB, MII, MI->getDebugLoc(),
-              TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
-              addReg(MI->getOperand(1).getReg()).
-              addFPImm(MI->getOperand(3).getFPImm());
-          }
+          BuildMI(*MBB, MII, MI->getDebugLoc(),
+            TII->get(Hexagon::C2_cmoveif), DestReg).
+            addReg(MI->getOperand(1).getReg()).
+            addImm(MI->getOperand(3).getImm());
 
           MII = MBB->erase(MI);
           --MII;
           break;
         }
-        case Hexagon::TFR_condset_ir:
-        case Hexagon::TFR_condset_ir_f: {
+        case Hexagon::TFR_condset_ir: {
           int DestReg = MI->getOperand(0).getReg();
           int SrcReg2 = MI->getOperand(3).getReg();
 
-          if (MI->getOpcode() ==  Hexagon::TFR_condset_ir ) {
-            BuildMI(*MBB, MII, MI->getDebugLoc(),
-              TII->get(Hexagon::TFRI_cPt), DestReg).
-              addReg(MI->getOperand(1).getReg()).
-              addImm(MI->getOperand(2).getImm());
-          } else if (MI->getOpcode() ==  Hexagon::TFR_condset_ir_f ) {
-            BuildMI(*MBB, MII, MI->getDebugLoc(),
-              TII->get(Hexagon::TFRI_cPt_f), DestReg).
-              addReg(MI->getOperand(1).getReg()).
-              addFPImm(MI->getOperand(2).getFPImm());
-          }
+          BuildMI(*MBB, MII, MI->getDebugLoc(),
+            TII->get(Hexagon::C2_cmoveit), DestReg).
+            addReg(MI->getOperand(1).getReg()).
+            addImm(MI->getOperand(2).getImm());
 
           // Do not emit the predicated copy if the source and
           // the destination is the same register.
           if (DestReg != SrcReg2) {
             BuildMI(*MBB, MII, MI->getDebugLoc(),
-              TII->get(Hexagon::TFR_cNotPt), DestReg).
+              TII->get(Hexagon::A2_tfrf), DestReg).
               addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
           }
           MII = MBB->erase(MI);
           --MII;
           break;
         }
-        case Hexagon::TFR_condset_ii:
-        case Hexagon::TFR_condset_ii_f: {
+        case Hexagon::TFR_condset_ii: {
           int DestReg = MI->getOperand(0).getReg();
           int SrcReg1 = MI->getOperand(1).getReg();
 
-          if (MI->getOpcode() ==  Hexagon::TFR_condset_ii ) {
-            int Immed1 = MI->getOperand(2).getImm();
-            int Immed2 = MI->getOperand(3).getImm();
-            BuildMI(*MBB, MII, MI->getDebugLoc(),
-                    TII->get(Hexagon::TFRI_cPt),
-                    DestReg).addReg(SrcReg1).addImm(Immed1);
-            BuildMI(*MBB, MII, MI->getDebugLoc(),
-                    TII->get(Hexagon::TFRI_cNotPt),
-                    DestReg).addReg(SrcReg1).addImm(Immed2);
-          } else if (MI->getOpcode() ==  Hexagon::TFR_condset_ii_f ) {
-            BuildMI(*MBB, MII, MI->getDebugLoc(),
-                    TII->get(Hexagon::TFRI_cPt_f), DestReg).
-                    addReg(SrcReg1).
-                    addFPImm(MI->getOperand(2).getFPImm());
-            BuildMI(*MBB, MII, MI->getDebugLoc(),
-                    TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
-                    addReg(SrcReg1).
-                    addFPImm(MI->getOperand(3).getFPImm());
-          }
+          int Immed1 = MI->getOperand(2).getImm();
+          int Immed2 = MI->getOperand(3).getImm();
+          BuildMI(*MBB, MII, MI->getDebugLoc(),
+                  TII->get(Hexagon::C2_cmoveit),
+                  DestReg).addReg(SrcReg1).addImm(Immed1);
+          BuildMI(*MBB, MII, MI->getDebugLoc(),
+                  TII->get(Hexagon::C2_cmoveif),
+                  DestReg).addReg(SrcReg1).addImm(Immed2);
           MII = MBB->erase(MI);
           --MII;
           break;
@@ -231,7 +167,6 @@ void llvm::initializeHexagonSplitTFRCondSetsPass(PassRegistry &Registry) {
   CALL_ONCE_INITIALIZATION(initializePassOnce)
 }
 
-FunctionPass*
-llvm::createHexagonSplitTFRCondSets(const HexagonTargetMachine &TM) {
-  return new HexagonSplitTFRCondSets(TM);
+FunctionPass *llvm::createHexagonSplitTFRCondSets() {
+  return new HexagonSplitTFRCondSets();
 }
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 657893f..380f023 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -54,12 +54,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
   if (CPUString.empty())
     CPUString = "hexagonv4";
 
-  if (CPUString == "hexagonv2") {
-    HexagonArchVersion = V2;
-  } else if (CPUString == "hexagonv3") {
-    EnableV3 = true;
-    HexagonArchVersion = V3;
-  } else if (CPUString == "hexagonv4") {
+  if (CPUString == "hexagonv4") {
     HexagonArchVersion = V4;
   } else if (CPUString == "hexagonv5") {
     HexagonArchVersion = V5;
@@ -74,9 +69,8 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
 HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS,
                                    const TargetMachine &TM)
     : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU.str()),
-      DL("e-m:e-p:32:32-i1:32-i64:64-a:0-n32"),
-      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
-      TSInfo(DL), FrameLowering() {
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
+      TSInfo(*TM.getDataLayout()), FrameLowering() {
 
   // Initialize scheduling itinerary for the specified CPU.
   InstrItins = getInstrItineraryForCPU(CPUString);
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 10776ae..57de546 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -15,8 +15,8 @@
 #define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
 
 #include "HexagonFrameLowering.h"
-#include "HexagonInstrInfo.h"
 #include "HexagonISelLowering.h"
+#include "HexagonInstrInfo.h"
 #include "HexagonSelectionDAGInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Target/TargetMachine.h"
@@ -39,13 +39,12 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
 
 public:
   enum HexagonArchEnum {
-    V1, V2, V3, V4, V5
+    V4, V5
   };
 
   HexagonArchEnum HexagonArchVersion;
 private:
   std::string CPUString;
-  const DataLayout DL;       // Calculates type size & alignment.
   HexagonInstrInfo InstrInfo;
   HexagonTargetLowering TLInfo;
   HexagonSelectionDAGInfo TSInfo;
@@ -74,7 +73,6 @@ public:
   const HexagonSelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
-  const DataLayout *getDataLayout() const override { return &DL; }
 
   HexagonSubtarget &initializeSubtargetDependencies(StringRef CPU,
                                                     StringRef FS);
@@ -83,18 +81,11 @@ public:
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
-  bool hasV2TOps () const { return HexagonArchVersion >= V2; }
-  bool hasV2TOpsOnly () const { return HexagonArchVersion == V2; }
-  bool hasV3TOps () const { return HexagonArchVersion >= V3; }
-  bool hasV3TOpsOnly () const { return HexagonArchVersion == V3; }
-  bool hasV4TOps () const { return HexagonArchVersion >= V4; }
-  bool hasV4TOpsOnly () const { return HexagonArchVersion == V4; }
-  bool useMemOps () const { return HexagonArchVersion >= V4 && UseMemOps; }
-  bool hasV5TOps () const { return HexagonArchVersion >= V5; }
-  bool hasV5TOpsOnly () const { return HexagonArchVersion == V5; }
-  bool modeIEEERndNear () const { return ModeIEEERndNear; }
-
-  bool isSubtargetV2() const { return HexagonArchVersion == V2;}
+  bool useMemOps() const { return UseMemOps; }
+  bool hasV5TOps() const { return getHexagonArchVersion() >= V5; }
+  bool hasV5TOpsOnly() const { return getHexagonArchVersion() == V5; }
+  bool modeIEEERndNear() const { return ModeIEEERndNear; }
+
   const std::string &getCPUString () const { return CPUString; }
 
   // Threshold for small data section
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index cd18dfb..64f75a3 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -17,8 +17,8 @@
 #include "HexagonMachineScheduler.h"
 #include "HexagonTargetObjectFile.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -71,7 +71,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
                                            CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
       TLOF(make_unique<HexagonTargetObjectFile>()),
-      Subtarget(TT, CPU, FS, *this) {
+      DL("e-m:e-p:32:32-i1:32-i64:64-a:0-n32"), Subtarget(TT, CPU, FS, *this) {
     initAsmInfo();
 }
 
@@ -103,10 +103,10 @@ public:
   }
 
   bool addInstSelector() override;
-  bool addPreRegAlloc() override;
-  bool addPostRegAlloc() override;
-  bool addPreSched2() override;
-  bool addPreEmitPass() override;
+  void addPreRegAlloc() override;
+  void addPostRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
 };
 } // namespace
 
@@ -131,51 +131,41 @@ bool HexagonPassConfig::addInstSelector() {
   return false;
 }
 
-bool HexagonPassConfig::addPreRegAlloc() {
+void HexagonPassConfig::addPreRegAlloc() {
   if (getOptLevel() != CodeGenOpt::None)
     if (!DisableHardwareLoops)
-      addPass(createHexagonHardwareLoops());
-  return false;
+      addPass(createHexagonHardwareLoops(), false);
 }
 
-bool HexagonPassConfig::addPostRegAlloc() {
-  const HexagonTargetMachine &TM = getHexagonTargetMachine();
+void HexagonPassConfig::addPostRegAlloc() {
   if (getOptLevel() != CodeGenOpt::None)
     if (!DisableHexagonCFGOpt)
-      addPass(createHexagonCFGOptimizer(TM));
-  return false;
+      addPass(createHexagonCFGOptimizer(), false);
 }
 
-bool HexagonPassConfig::addPreSched2() {
-  const HexagonTargetMachine &TM = getHexagonTargetMachine();
-
-  addPass(createHexagonCopyToCombine());
+void HexagonPassConfig::addPreSched2() {
+  addPass(createHexagonCopyToCombine(), false);
   if (getOptLevel() != CodeGenOpt::None)
-    addPass(&IfConverterID);
-  addPass(createHexagonSplitConst32AndConst64(TM));
-  printAndVerify("After hexagon split const32/64 pass");
-  return true;
+    addPass(&IfConverterID, false);
+  addPass(createHexagonSplitConst32AndConst64());
 }
 
-bool HexagonPassConfig::addPreEmitPass() {
-  const HexagonTargetMachine &TM = getHexagonTargetMachine();
+void HexagonPassConfig::addPreEmitPass() {
   bool NoOpt = (getOptLevel() == CodeGenOpt::None);
 
   if (!NoOpt)
-    addPass(createHexagonNewValueJump());
+    addPass(createHexagonNewValueJump(), false);
 
   // Expand Spill code for predicate registers.
-  addPass(createHexagonExpandPredSpillCode(TM));
+  addPass(createHexagonExpandPredSpillCode(), false);
 
   // Split up TFRcondsets into conditional transfers.
-  addPass(createHexagonSplitTFRCondSets(TM));
+  addPass(createHexagonSplitTFRCondSets(), false);
 
   // Create Packets.
   if (!NoOpt) {
     if (!DisableHardwareLoops)
-      addPass(createHexagonFixupHwLoops());
-    addPass(createHexagonPacketizer());
+      addPass(createHexagonFixupHwLoops(), false);
+    addPass(createHexagonPacketizer(), false);
   }
-
-  return false;
 }
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index 4a9f447..e0b3a9b 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -24,6 +24,7 @@ class Module;
 
 class HexagonTargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  const DataLayout DL; // Calculates type size & alignment.
   HexagonSubtarget Subtarget;
 
 public:
@@ -32,7 +33,7 @@ public:
                        Reloc::Model RM, CodeModel::Model CM,
                        CodeGenOpt::Level OL);
   ~HexagonTargetMachine() override;
-
+  const DataLayout *getDataLayout() const override { return &DL; }
   const HexagonSubtarget *getSubtargetImpl() const override {
     return &Subtarget;
   }
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index f4ab5e2..d8660d3 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -33,14 +33,10 @@ void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
   InitializeELF(TM.Options.UseInitArray);
 
-  SmallDataSection =
-    getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
-                               ELF::SHF_WRITE | ELF::SHF_ALLOC,
-                               SectionKind::getDataRel());
-  SmallBSSSection =
-    getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
-                               ELF::SHF_WRITE | ELF::SHF_ALLOC,
-                               SectionKind::getBSS());
+  SmallDataSection = getContext().getELFSection(
+      ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+  SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                                               ELF::SHF_WRITE | ELF::SHF_ALLOC);
 }
 
 // sdata/sbss support taken largely from the MIPS Backend.
@@ -79,8 +75,7 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
 
   if (Kind.isBSS() || Kind.isDataNoRel() || Kind.isCommon()) {
     Type *Ty = GV->getType()->getElementType();
-    return IsInSmallSection(
-        TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(Ty));
+    return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
   }
 
   return false;
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index e7296d6..c123640 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -264,8 +264,7 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) {
 
 
 static bool IsIndirectCall(MachineInstr* MI) {
-  return ((MI->getOpcode() == Hexagon::CALLR) ||
-          (MI->getOpcode() == Hexagon::CALLRv3));
+  return MI->getOpcode() == Hexagon::J2_callr;
 }
 
 // Reserve resources for constant extender. Trigure an assertion if
@@ -273,7 +272,7 @@ static bool IsIndirectCall(MachineInstr* MI) {
 void HexagonPacketizerList::reserveResourcesForConstExt(MachineInstr* MI) {
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
   MachineFunction *MF = MI->getParent()->getParent();
-  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT_i),
+  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::A4_ext),
                                                   MI->getDebugLoc());
 
   if (ResourceTracker->canReserveResources(PseudoMI)) {
@@ -291,7 +290,7 @@ bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) {
   assert((QII->isExtended(MI) || QII->isConstExtended(MI)) &&
          "Should only be called for constant extended instructions");
   MachineFunction *MF = MI->getParent()->getParent();
-  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT_i),
+  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::A4_ext),
                                                   MI->getDebugLoc());
   bool CanReserve = ResourceTracker->canReserveResources(PseudoMI);
   MF->DeleteMachineInstr(PseudoMI);
@@ -303,7 +302,7 @@ bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) {
 bool HexagonPacketizerList::tryAllocateResourcesForConstExt(MachineInstr* MI) {
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
   MachineFunction *MF = MI->getParent()->getParent();
-  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT_i),
+  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::A4_ext),
                                                   MI->getDebugLoc());
 
   if (ResourceTracker->canReserveResources(PseudoMI)) {
@@ -366,12 +365,12 @@ static bool IsRegDependence(const SDep::Kind DepType) {
 }
 
 static bool IsDirectJump(MachineInstr* MI) {
-  return (MI->getOpcode() == Hexagon::JMP);
+  return (MI->getOpcode() == Hexagon::J2_jump);
 }
 
 static bool IsSchedBarrier(MachineInstr* MI) {
   switch (MI->getOpcode()) {
-  case Hexagon::BARRIER:
+  case Hexagon::Y2_barrier:
     return true;
   }
   return false;
@@ -382,8 +381,8 @@ static bool IsControlFlow(MachineInstr* MI) {
 }
 
 static bool IsLoopN(MachineInstr *MI) {
-  return (MI->getOpcode() == Hexagon::LOOP0_i ||
-          MI->getOpcode() == Hexagon::LOOP0_r);
+  return (MI->getOpcode() == Hexagon::J2_loop0i ||
+          MI->getOpcode() == Hexagon::J2_loop0r);
 }
 
 /// DoesModifyCalleeSavedReg - Returns true if the instruction modifies a
@@ -563,8 +562,8 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore(
     if (PacketSU->getInstr()->getDesc().mayStore() ||
         // if we have mayStore = 1 set on ALLOCFRAME and DEALLOCFRAME,
         // then we don't need this
-        PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME ||
-        PacketSU->getInstr()->getOpcode() == Hexagon::DEALLOCFRAME)
+        PacketSU->getInstr()->getOpcode() == Hexagon::S2_allocframe ||
+        PacketSU->getInstr()->getOpcode() == Hexagon::L2_deallocframe)
       return false;
   }
 
@@ -721,10 +720,7 @@ bool HexagonPacketizerList::CanPromoteToNewValue(
     MachineBasicBlock::iterator &MII) {
 
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
-  const HexagonRegisterInfo *QRI =
-      (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo();
-  if (!QRI->Subtarget.hasV4TOps() ||
-      !QII->mayBeNewStore(MI))
+  if (!QII->mayBeNewStore(MI))
     return false;
 
   MachineInstr *PacketMI = PacketSU->getInstr();
@@ -1055,84 +1051,82 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
   // first store is not in SLOT0. New value store, new value jump,
   // dealloc_return and memop always take SLOT0.
   // Arch spec 3.4.4.2
-  if (QRI->Subtarget.hasV4TOps()) {
-    if (MCIDI.mayStore() && MCIDJ.mayStore() &&
-       (QII->isNewValueInst(J) || QII->isMemOp(J) || QII->isMemOp(I))) {
-      Dependence = true;
-      return false;
-    }
+  if (MCIDI.mayStore() && MCIDJ.mayStore() &&
+      (QII->isNewValueInst(J) || QII->isMemOp(J) || QII->isMemOp(I))) {
+    Dependence = true;
+    return false;
+  }
 
-    if ((QII->isMemOp(J) && MCIDI.mayStore())
-        || (MCIDJ.mayStore() && QII->isMemOp(I))
-        || (QII->isMemOp(J) && QII->isMemOp(I))) {
-      Dependence = true;
-      return false;
-    }
+  if ((QII->isMemOp(J) && MCIDI.mayStore())
+      || (MCIDJ.mayStore() && QII->isMemOp(I))
+      || (QII->isMemOp(J) && QII->isMemOp(I))) {
+    Dependence = true;
+    return false;
+  }
 
-    //if dealloc_return
-    if (MCIDJ.mayStore() && QII->isDeallocRet(I)) {
-      Dependence = true;
-      return false;
-    }
+  //if dealloc_return
+  if (MCIDJ.mayStore() && QII->isDeallocRet(I)) {
+    Dependence = true;
+    return false;
+  }
 
-    // If an instruction feeds new value jump, glue it.
-    MachineBasicBlock::iterator NextMII = I;
-    ++NextMII;
-    if (NextMII != I->getParent()->end() && QII->isNewValueJump(NextMII)) {
-      MachineInstr *NextMI = NextMII;
+  // If an instruction feeds new value jump, glue it.
+  MachineBasicBlock::iterator NextMII = I;
+  ++NextMII;
+  if (NextMII != I->getParent()->end() && QII->isNewValueJump(NextMII)) {
+    MachineInstr *NextMI = NextMII;
 
-      bool secondRegMatch = false;
-      bool maintainNewValueJump = false;
+    bool secondRegMatch = false;
+    bool maintainNewValueJump = false;
 
-      if (NextMI->getOperand(1).isReg() &&
-          I->getOperand(0).getReg() == NextMI->getOperand(1).getReg()) {
-        secondRegMatch = true;
-        maintainNewValueJump = true;
-      }
+    if (NextMI->getOperand(1).isReg() &&
+        I->getOperand(0).getReg() == NextMI->getOperand(1).getReg()) {
+      secondRegMatch = true;
+      maintainNewValueJump = true;
+    }
 
-      if (!secondRegMatch &&
-           I->getOperand(0).getReg() == NextMI->getOperand(0).getReg()) {
-        maintainNewValueJump = true;
-      }
+    if (!secondRegMatch &&
+          I->getOperand(0).getReg() == NextMI->getOperand(0).getReg()) {
+      maintainNewValueJump = true;
+    }
 
-      for (std::vector<MachineInstr*>::iterator
-            VI = CurrentPacketMIs.begin(),
-             VE = CurrentPacketMIs.end();
-           (VI != VE && maintainNewValueJump); ++VI) {
-        SUnit *PacketSU = MIToSUnit.find(*VI)->second;
+    for (std::vector<MachineInstr*>::iterator
+          VI = CurrentPacketMIs.begin(),
+            VE = CurrentPacketMIs.end();
+          (VI != VE && maintainNewValueJump); ++VI) {
+      SUnit *PacketSU = MIToSUnit.find(*VI)->second;
 
-        // NVJ can not be part of the dual jump - Arch Spec: section 7.8
-        if (PacketSU->getInstr()->getDesc().isCall()) {
-          Dependence = true;
-          break;
-        }
-        // Validate
-        // 1. Packet does not have a store in it.
-        // 2. If the first operand of the nvj is newified, and the second
-        //    operand is also a reg, it (second reg) is not defined in
-        //    the same packet.
-        // 3. If the second operand of the nvj is newified, (which means
-        //    first operand is also a reg), first reg is not defined in
-        //    the same packet.
-        if (PacketSU->getInstr()->getDesc().mayStore()               ||
-            PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME ||
-            // Check #2.
-            (!secondRegMatch && NextMI->getOperand(1).isReg() &&
-             PacketSU->getInstr()->modifiesRegister(
-                               NextMI->getOperand(1).getReg(), QRI)) ||
-            // Check #3.
-            (secondRegMatch &&
-             PacketSU->getInstr()->modifiesRegister(
-                               NextMI->getOperand(0).getReg(), QRI))) {
-          Dependence = true;
-          break;
-        }
+      // NVJ can not be part of the dual jump - Arch Spec: section 7.8
+      if (PacketSU->getInstr()->getDesc().isCall()) {
+        Dependence = true;
+        break;
+      }
+      // Validate
+      // 1. Packet does not have a store in it.
+      // 2. If the first operand of the nvj is newified, and the second
+      //    operand is also a reg, it (second reg) is not defined in
+      //    the same packet.
+      // 3. If the second operand of the nvj is newified, (which means
+      //    first operand is also a reg), first reg is not defined in
+      //    the same packet.
+      if (PacketSU->getInstr()->getDesc().mayStore()               ||
+          PacketSU->getInstr()->getOpcode() == Hexagon::S2_allocframe ||
+          // Check #2.
+          (!secondRegMatch && NextMI->getOperand(1).isReg() &&
+            PacketSU->getInstr()->modifiesRegister(
+                              NextMI->getOperand(1).getReg(), QRI)) ||
+          // Check #3.
+          (secondRegMatch &&
+            PacketSU->getInstr()->modifiesRegister(
+                              NextMI->getOperand(0).getReg(), QRI))) {
+        Dependence = true;
+        break;
       }
-      if (!Dependence)
-        GlueToNewValueJump = true;
-      else
-        return false;
     }
+    if (!Dependence)
+      GlueToNewValueJump = true;
+    else
+      return false;
   }
 
   if (SUJ->isSucc(SUI)) {
@@ -1254,9 +1248,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
       else if ((DepType == SDep::Order) &&
                !I->hasOrderedMemoryRef() &&
                !J->hasOrderedMemoryRef()) {
-        if (QRI->Subtarget.hasV4TOps() &&
-            // hexagonv4 allows dual store.
-            MCIDI.mayStore() && MCIDJ.mayStore()) {
+        if (MCIDI.mayStore() && MCIDJ.mayStore()) {
           /* do nothing */
         }
         // store followed by store-- not OK on V2
@@ -1278,11 +1270,10 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
       // packetized in a same packet. This implies that the store is using
       // caller's SP. Hence, offset needs to be updated accordingly.
       else if (DepType == SDep::Data
-               && QRI->Subtarget.hasV4TOps()
-               && J->getOpcode() == Hexagon::ALLOCFRAME
-               && (I->getOpcode() == Hexagon::STrid
-                   || I->getOpcode() == Hexagon::STriw
-                   || I->getOpcode() == Hexagon::STrib)
+               && J->getOpcode() == Hexagon::S2_allocframe
+               && (I->getOpcode() == Hexagon::S2_storerd_io
+                   || I->getOpcode() == Hexagon::S2_storeri_io
+                   || I->getOpcode() == Hexagon::S2_storerb_io)
                && I->getOperand(0).getReg() == QRI->getStackRegister()
                && QII->isValidOffset(I->getOpcode(),
                                      I->getOperand(1).getImm() -
diff --git a/lib/Target/Hexagon/HexagonVarargsCallingConvention.h b/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
deleted file mode 100644
index edbe29a..0000000
--- a/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
+++ /dev/null
@@ -1,149 +0,0 @@
-//===-- HexagonVarargsCallingConvention.h - Calling Conventions -*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the functions that assign locations to outgoing function
-// arguments. Adapted from the target independent version but this handles
-// calls to varargs functions
-//
-//===----------------------------------------------------------------------===//
-//
-
-
-
-
-static bool RetCC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
-                                    EVT LocVT, CCValAssign::LocInfo LocInfo,
-                                    ISD::ArgFlagsTy ArgFlags,
-                                    Hexagon_CCState &State,
-                                    int NonVarArgsParams,
-                                    int CurrentParam,
-                                    bool ForceMem);
-
-
-static bool CC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
-                                 EVT LocVT, CCValAssign::LocInfo LocInfo,
-                                 ISD::ArgFlagsTy ArgFlags,
-                                 Hexagon_CCState &State,
-                                 int NonVarArgsParams,
-                                 int CurrentParam,
-                                 bool ForceMem) {
-  unsigned ByValSize = 0;
-  if (ArgFlags.isByVal() &&
-      ((ByValSize = ArgFlags.getByValSize()) >
-       (MVT(MVT::i64).getSizeInBits() / 8))) {
-    ForceMem = true;
-  }
-
-
-  // Only assign registers for named (non-varargs) arguments
-  if ( !ForceMem && ((NonVarArgsParams == -1) || (CurrentParam <=
-                                                  NonVarArgsParams))) {
-
-    if (LocVT == MVT::i32 ||
-        LocVT == MVT::i16 ||
-        LocVT == MVT::i8 ||
-        LocVT == MVT::f32) {
-      static const unsigned RegList1[] = {
-        Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
-        Hexagon::R5
-      };
-      if (unsigned Reg = State.AllocateReg(RegList1, 6)) {
-        State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
-                                         LocVT.getSimpleVT(), LocInfo));
-        return false;
-      }
-    }
-
-    if (LocVT == MVT::i64 ||
-        LocVT == MVT::f64) {
-      static const unsigned RegList2[] = {
-        Hexagon::D0, Hexagon::D1, Hexagon::D2
-      };
-      if (unsigned Reg = State.AllocateReg(RegList2, 3)) {
-        State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
-                                         LocVT.getSimpleVT(), LocInfo));
-        return false;
-      }
-    }
-  }
-
-  const Type* ArgTy = LocVT.getTypeForEVT(State.getContext());
-  unsigned Alignment = State.getTarget()
-                           .getSubtargetImpl()
-                           ->getDataLayout()
-                           ->getABITypeAlignment(ArgTy);
-  unsigned Size =
-      State.getTarget().getSubtargetImpl()->getDataLayout()->getTypeSizeInBits(
-          ArgTy) /
-      8;
-
-  // If it's passed by value, then we need the size of the aggregate not of
-  // the pointer.
-  if (ArgFlags.isByVal()) {
-    Size = ByValSize;
-
-    // Hexagon_TODO: Get the alignment of the contained type here.
-    Alignment = 8;
-  }
-
-  unsigned Offset3 = State.AllocateStack(Size, Alignment);
-  State.addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset3,
-                                   LocVT.getSimpleVT(), LocInfo));
-  return false;
-}
-
-
-static bool RetCC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
-                                    EVT LocVT, CCValAssign::LocInfo LocInfo,
-                                    ISD::ArgFlagsTy ArgFlags,
-                                    Hexagon_CCState &State,
-                                    int NonVarArgsParams,
-                                    int CurrentParam,
-                                    bool ForceMem) {
-
-  if (LocVT == MVT::i32 ||
-      LocVT == MVT::f32) {
-    static const unsigned RegList1[] = {
-      Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
-      Hexagon::R5
-    };
-    if (unsigned Reg = State.AllocateReg(RegList1, 6)) {
-      State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
-                                       LocVT.getSimpleVT(), LocInfo));
-      return false;
-    }
-  }
-
-  if (LocVT == MVT::i64 ||
-      LocVT == MVT::f64) {
-    static const unsigned RegList2[] = {
-      Hexagon::D0, Hexagon::D1, Hexagon::D2
-    };
-    if (unsigned Reg = State.AllocateReg(RegList2, 3)) {
-      State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
-                                       LocVT.getSimpleVT(), LocInfo));
-      return false;
-    }
-  }
-
-  const Type* ArgTy = LocVT.getTypeForEVT(State.getContext());
-  unsigned Alignment = State.getTarget()
-                           .getSubtargetImpl()
-                           ->getDataLayout()
-                           ->getABITypeAlignment(ArgTy);
-  unsigned Size =
-      State.getTarget().getSubtargetImpl()->getDataLayout()->getTypeSizeInBits(
-          ArgTy) /
-      8;
-
-  unsigned Offset3 = State.AllocateStack(Size, Alignment);
-  State.addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset3,
-                                   LocVT.getSimpleVT(), LocInfo));
-  return false;
-}
diff --git a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
index 2a6124e..4c987ed 100644
--- a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
@@ -4,7 +4,7 @@ add_llvm_library(LLVMHexagonDesc
   HexagonInstPrinter.cpp
   HexagonMCAsmInfo.cpp
   HexagonMCCodeEmitter.cpp
-  HexagonMCInst.cpp
+  HexagonMCInstrInfo.cpp
   HexagonMCTargetDesc.cpp
   )
 
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index c0a3fae..8e02f79 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -19,7 +19,6 @@
 
 #include "HexagonMCTargetDesc.h"
 #include "llvm/Support/ErrorHandling.h"
-
 #include <stdint.h>
 
 namespace llvm {
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 1fd8d70..6c87c9f 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -14,7 +14,7 @@
 #include "HexagonAsmPrinter.h"
 #include "Hexagon.h"
 #include "HexagonInstPrinter.h"
-#include "MCTargetDesc/HexagonMCInst.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
@@ -77,46 +77,41 @@ StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const {
   return getRegisterName(RegNo);
 }
 
-void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
-                                   StringRef Annot) {
-  printInst((const HexagonMCInst*)(MI), O, Annot);
-}
-
-void HexagonInstPrinter::printInst(const HexagonMCInst *MI, raw_ostream &O,
+void HexagonInstPrinter::printInst(MCInst const *MI, raw_ostream &O,
                                    StringRef Annot) {
   const char startPacket = '{',
              endPacket = '}';
   // TODO: add outer HW loop when it's supported too.
   if (MI->getOpcode() == Hexagon::ENDLOOP0) {
     // Ending a harware loop is different from ending an regular packet.
-    assert(MI->isPacketEnd() && "Loop-end must also end the packet");
+    assert(HexagonMCInstrInfo::isPacketEnd(*MI) && "Loop-end must also end the packet");
 
-    if (MI->isPacketStart()) {
+    if (HexagonMCInstrInfo::isPacketBegin(*MI)) {
       // There must be a packet to end a loop.
       // FIXME: when shuffling is always run, this shouldn't be needed.
-      HexagonMCInst Nop;
+      MCInst Nop;
       StringRef NoAnnot;
 
-      Nop.setOpcode (Hexagon::NOP);
-      Nop.setPacketStart (MI->isPacketStart());
+      Nop.setOpcode (Hexagon::A2_nop);
+      HexagonMCInstrInfo::setPacketBegin (Nop, HexagonMCInstrInfo::isPacketBegin(*MI));
       printInst (&Nop, O, NoAnnot);
     }
 
     // Close the packet.
-    if (MI->isPacketEnd())
+    if (HexagonMCInstrInfo::isPacketEnd(*MI))
       O << PacketPadding << endPacket;
 
     printInstruction(MI, O);
   }
   else {
     // Prefix the insn opening the packet.
-    if (MI->isPacketStart())
+    if (HexagonMCInstrInfo::isPacketBegin(*MI))
       O << PacketPadding << startPacket << '\n';
 
     printInstruction(MI, O);
 
     // Suffix the insn closing the packet.
-    if (MI->isPacketEnd())
+    if (HexagonMCInstrInfo::isPacketEnd(*MI))
       // Suffix the packet in a new line always, since the GNU assembler has
       // issues with a closing brace on the same line as CONST{32,64}.
       O << '\n' << PacketPadding << endPacket;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
index 55ae95c..d02243b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
@@ -18,17 +18,14 @@
 #include "llvm/MC/MCInstrInfo.h"
 
 namespace llvm {
-  class HexagonMCInst;
-
   class HexagonInstPrinter : public MCInstPrinter {
   public:
-    explicit HexagonInstPrinter(const MCAsmInfo &MAI,
-                                const MCInstrInfo &MII,
-                                const MCRegisterInfo &MRI)
+    explicit HexagonInstPrinter(MCAsmInfo const &MAI,
+                                MCInstrInfo const &MII,
+                                MCRegisterInfo const &MRI)
       : MCInstPrinter(MAI, MII, MRI), MII(MII) {}
 
-    void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
-    void printInst(const HexagonMCInst *MI, raw_ostream &O, StringRef Annot);
+    void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot) override;
     virtual StringRef getOpcodeName(unsigned Opcode) const;
     void printInstruction(const MCInst *MI, raw_ostream &O);
     StringRef getRegName(unsigned RegNo) const;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 4471977..a5a09ba 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -10,8 +10,8 @@
 #include "Hexagon.h"
 #include "MCTargetDesc/HexagonBaseInfo.h"
 #include "MCTargetDesc/HexagonMCCodeEmitter.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
 #include "MCTargetDesc/HexagonMCTargetDesc.h"
-#include "MCTargetDesc/HexagonMCInst.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
@@ -35,9 +35,9 @@ namespace {
 /// Possible values for instruction packet parse field.
 enum class ParseField { duplex = 0x0, last0 = 0x1, last1 = 0x2, end = 0x3 };
 /// \brief Returns the packet bits based on instruction position.
-uint32_t getPacketBits(HexagonMCInst const &HMI) {
+uint32_t getPacketBits(MCInst const &HMI) {
   unsigned const ParseFieldOffset = 14;
-  ParseField Field = HMI.isPacketEnd() ? ParseField::end : ParseField::last0;
+  ParseField Field = HexagonMCInstrInfo::isPacketEnd(HMI) ? ParseField::end : ParseField::last0;
   return static_cast <uint32_t> (Field) << ParseFieldOffset;
 }
 void emitLittleEndian(uint64_t Binary, raw_ostream &OS) {
@@ -51,14 +51,15 @@ void emitLittleEndian(uint64_t Binary, raw_ostream &OS) {
 HexagonMCCodeEmitter::HexagonMCCodeEmitter(MCInstrInfo const &aMII,
                                            MCSubtargetInfo const &aMST,
                                            MCContext &aMCT)
-    : MST(aMST), MCT(aMCT) {}
+    : MST(aMST), MCT(aMCT), MCII (aMII) {}
 
 void HexagonMCCodeEmitter::EncodeInstruction(MCInst const &MI, raw_ostream &OS,
                                              SmallVectorImpl<MCFixup> &Fixups,
                                              MCSubtargetInfo const &STI) const {
-  HexagonMCInst const &HMB = static_cast<HexagonMCInst const &>(MI);
-  uint64_t Binary = getBinaryCodeForInstr(HMB, Fixups, STI) | getPacketBits(HMB);
-  assert(HMB.getDesc().getSize() == 4 && "All instructions should be 32bit");
+  uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI) | getPacketBits(MI);
+  assert(HexagonMCInstrInfo::getDesc(MCII, MI).getSize() == 4 &&
+         "All instructions should be 32bit");
+  (void)&MCII;
   emitLittleEndian(Binary, OS);
   ++MCNumEmitted;
 }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
index 96048ad..db1d707 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
@@ -28,6 +28,7 @@ namespace llvm {
 class HexagonMCCodeEmitter : public MCCodeEmitter {
   MCSubtargetInfo const &MST;
   MCContext &MCT;
+  MCInstrInfo const &MCII;
 
 public:
   HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCSubtargetInfo const &aMST,
@@ -51,8 +52,8 @@ public:
                              MCSubtargetInfo const &STI) const;
 
 private:
-  HexagonMCCodeEmitter(HexagonMCCodeEmitter const &) LLVM_DELETED_FUNCTION;
-  void operator=(HexagonMCCodeEmitter const &) LLVM_DELETED_FUNCTION;
+  HexagonMCCodeEmitter(HexagonMCCodeEmitter const &) = delete;
+  void operator=(HexagonMCCodeEmitter const &) = delete;
 }; // class HexagonMCCodeEmitter
 
 } // namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp
deleted file mode 100644
index c842b9b..0000000
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-//===- HexagonMCInst.cpp - Hexagon sub-class of MCInst --------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class extends MCInst to allow some Hexagon VLIW annotations.
-//
-//===----------------------------------------------------------------------===//
-
-#include "HexagonInstrInfo.h"
-#include "MCTargetDesc/HexagonBaseInfo.h"
-#include "MCTargetDesc/HexagonMCInst.h"
-#include "MCTargetDesc/HexagonMCTargetDesc.h"
-
-using namespace llvm;
-
-// Return the slots used by the insn.
-unsigned HexagonMCInst::getUnits(const HexagonTargetMachine* TM) const {
-  const HexagonInstrInfo *QII = TM->getSubtargetImpl()->getInstrInfo();
-  const InstrItineraryData *II =
-      TM->getSubtargetImpl()->getInstrItineraryData();
-  const InstrStage*
-    IS = II->beginStage(QII->get(this->getOpcode()).getSchedClass());
-
-  return (IS->getUnits());
-}
-
-// Return the Hexagon ISA class for the insn.
-unsigned HexagonMCInst::getType() const {
-  const uint64_t F = MCID->TSFlags;
-
-  return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
-}
-
-// Return whether the insn is an actual insn.
-bool HexagonMCInst::isCanon() const {
-  return (!MCID->isPseudo() &&
-          !isPrefix() &&
-          getType() != HexagonII::TypeENDLOOP);
-}
-
-// Return whether the insn is a prefix.
-bool HexagonMCInst::isPrefix() const {
-  return (getType() == HexagonII::TypePREFIX);
-}
-
-// Return whether the insn is solo, i.e., cannot be in a packet.
-bool HexagonMCInst::isSolo() const {
-  const uint64_t F = MCID->TSFlags;
-  return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask);
-}
-
-// Return whether the insn is a new-value consumer.
-bool HexagonMCInst::isNewValue() const {
-  const uint64_t F = MCID->TSFlags;
-  return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
-}
-
-// Return whether the instruction is a legal new-value producer.
-bool HexagonMCInst::hasNewValue() const {
-  const uint64_t F = MCID->TSFlags;
-  return ((F >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask);
-}
-
-// Return the operand that consumes or produces a new value.
-const MCOperand& HexagonMCInst::getNewValue() const {
-  const uint64_t F = MCID->TSFlags;
-  const unsigned O = (F >> HexagonII::NewValueOpPos) &
-                     HexagonII::NewValueOpMask;
-  const MCOperand& MCO = getOperand(O);
-
-  assert ((isNewValue() || hasNewValue()) && MCO.isReg());
-  return (MCO);
-}
-
-// Return whether the instruction needs to be constant extended.
-// 1) Always return true if the instruction has 'isExtended' flag set.
-//
-// isExtendable:
-// 2) For immediate extended operands, return true only if the value is
-//    out-of-range.
-// 3) For global address, always return true.
-
-bool HexagonMCInst::isConstExtended(void) const {
-  if (isExtended())
-    return true;
-
-  if (!isExtendable())
-    return false;
-
-  short ExtOpNum = getCExtOpNum();
-  int MinValue   = getMinValue();
-  int MaxValue   = getMaxValue();
-  const MCOperand& MO = getOperand(ExtOpNum);
-
-  // We could be using an instruction with an extendable immediate and shoehorn
-  // a global address into it. If it is a global address it will be constant
-  // extended. We do this for COMBINE.
-  // We currently only handle isGlobal() because it is the only kind of
-  // object we are going to end up with here for now.
-  // In the future we probably should add isSymbol(), etc.
-  if (MO.isExpr())
-    return true;
-
-  // If the extendable operand is not 'Immediate' type, the instruction should
-  // have 'isExtended' flag set.
-  assert(MO.isImm() && "Extendable operand must be Immediate type");
-
-  int ImmValue = MO.getImm();
-  return (ImmValue < MinValue || ImmValue > MaxValue);
-}
-
-// Return whether the instruction must be always extended.
-bool HexagonMCInst::isExtended(void) const {
-  const uint64_t F = MCID->TSFlags;
-  return (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
-}
-
-// Return true if the instruction may be extended based on the operand value.
-bool HexagonMCInst::isExtendable(void) const {
-  const uint64_t F = MCID->TSFlags;
-  return (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
-}
-
-// Return number of bits in the constant extended operand.
-unsigned HexagonMCInst::getBitCount(void) const {
-  const uint64_t F = MCID->TSFlags;
-  return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
-}
-
-// Return constant extended operand number.
-unsigned short HexagonMCInst::getCExtOpNum(void) const {
-  const uint64_t F = MCID->TSFlags;
-  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
-}
-
-// Return whether the operand can be constant extended.
-bool HexagonMCInst::isOperandExtended(const unsigned short OperandNum) const {
-  const uint64_t F = MCID->TSFlags;
-  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask)
-          == OperandNum;
-}
-
-// Return the min value that a constant extendable operand can have
-// without being extended.
-int HexagonMCInst::getMinValue(void) const {
-  const uint64_t F = MCID->TSFlags;
-  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
-                    & HexagonII::ExtentSignedMask;
-  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
-                    & HexagonII::ExtentBitsMask;
-
-  if (isSigned) // if value is signed
-    return -1U << (bits - 1);
-  else
-    return 0;
-}
-
-// Return the max value that a constant extendable operand can have
-// without being extended.
-int HexagonMCInst::getMaxValue(void) const {
-  const uint64_t F = MCID->TSFlags;
-  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
-                    & HexagonII::ExtentSignedMask;
-  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
-                    & HexagonII::ExtentBitsMask;
-
-  if (isSigned) // if value is signed
-    return ~(-1U << (bits - 1));
-  else
-    return ~(-1U << bits);
-}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
deleted file mode 100644
index 90fbbf3..0000000
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
+++ /dev/null
@@ -1,100 +0,0 @@
-//===- HexagonMCInst.h - Hexagon sub-class of MCInst ----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class extends MCInst to allow some VLIW annotations.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINST_H
-#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINST_H
-
-#include "HexagonTargetMachine.h"
-#include "llvm/MC/MCInst.h"
-
-namespace llvm {
-  class MCOperand;
-
-  class HexagonMCInst: public MCInst {
-    // MCID is set during instruction lowering.
-    // It is needed in order to access TSFlags for
-    // use in checking MC instruction properties.
-    const MCInstrDesc *MCID;
-
-    // Packet start and end markers
-    unsigned packetStart: 1, packetEnd: 1;
-
-  public:
-    explicit HexagonMCInst():
-      MCInst(), MCID(nullptr), packetStart(0), packetEnd(0) {};
-    HexagonMCInst(const MCInstrDesc& mcid):
-      MCInst(), MCID(&mcid), packetStart(0), packetEnd(0) {};
-
-    bool isPacketStart() const { return (packetStart); };
-    bool isPacketEnd() const { return (packetEnd); };
-    void setPacketStart(bool Y) { packetStart = Y; };
-    void setPacketEnd(bool Y) { packetEnd = Y; };
-    void resetPacket() { setPacketStart(false); setPacketEnd(false); };
-
-    // Return the slots used by the insn.
-    unsigned getUnits(const HexagonTargetMachine* TM) const;
-
-    // Return the Hexagon ISA class for the insn.
-    unsigned getType() const;
-
-    void setDesc(const MCInstrDesc& mcid) { MCID = &mcid; };
-    const MCInstrDesc& getDesc(void) const { return *MCID; };
-
-    // Return whether the insn is an actual insn.
-    bool isCanon() const;
-
-    // Return whether the insn is a prefix.
-    bool isPrefix() const;
-
-    // Return whether the insn is solo, i.e., cannot be in a packet.
-    bool isSolo() const;
-
-    // Return whether the instruction needs to be constant extended.
-    bool isConstExtended() const;
-
-    // Return constant extended operand number.
-    unsigned short getCExtOpNum(void) const;
-
-    // Return whether the insn is a new-value consumer.
-    bool isNewValue() const;
-
-    // Return whether the instruction is a legal new-value producer.
-    bool hasNewValue() const;
-
-    // Return the operand that consumes or produces a new value.
-    const MCOperand& getNewValue() const;
-
-    // Return number of bits in the constant extended operand.
-    unsigned getBitCount(void) const;
-
-  private:
-    // Return whether the instruction must be always extended.
-    bool isExtended() const;
-
-    // Return true if the insn may be extended based on the operand value.
-    bool isExtendable() const;
-
-    // Return true if the operand can be constant extended.
-    bool isOperandExtended(const unsigned short OperandNum) const;
-
-    // Return the min value that a constant extendable operand can have
-    // without being extended.
-    int getMinValue() const;
-
-    // Return the max value that a constant extendable operand can have
-    // without being extended.
-    int getMaxValue() const;
-  };
-}
-
-#endif
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
new file mode 100644
index 0000000..33e7c81
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -0,0 +1,223 @@
+//===- HexagonMCInstrInfo.cpp - Hexagon sub-class of MCInst ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class extends MCInstrInfo to allow Hexagon specific MCInstr queries
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCInstrInfo.h"
+#include "HexagonBaseInfo.h"
+
+namespace llvm {
+void HexagonMCInstrInfo::AppendImplicitOperands(MCInst &MCI) {
+  MCI.addOperand(MCOperand::CreateImm(0));
+  MCI.addOperand(MCOperand::CreateInst(nullptr));
+}
+
+unsigned HexagonMCInstrInfo::getBitCount(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
+}
+
+// Return constant extended operand number.
+unsigned short HexagonMCInstrInfo::getCExtOpNum(MCInstrInfo const &MCII,
+                                                MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
+}
+
+MCInstrDesc const &HexagonMCInstrInfo::getDesc(MCInstrInfo const &MCII,
+                                               MCInst const &MCI) {
+  return (MCII.get(MCI.getOpcode()));
+}
+
+std::bitset<16> HexagonMCInstrInfo::GetImplicitBits(MCInst const &MCI) {
+  SanityCheckImplicitOperands(MCI);
+  std::bitset<16> Bits(MCI.getOperand(MCI.getNumOperands() - 2).getImm());
+  return Bits;
+}
+
+// Return the max value that a constant extendable operand can have
+// without being extended.
+int HexagonMCInstrInfo::getMaxValue(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  unsigned isSigned =
+      (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+  unsigned bits = (F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return ~(-1U << (bits - 1));
+  else
+    return ~(-1U << bits);
+}
+
+// Return the min value that a constant extendable operand can have
+// without being extended.
+int HexagonMCInstrInfo::getMinValue(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  unsigned isSigned =
+      (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+  unsigned bits = (F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return -1U << (bits - 1);
+  else
+    return 0;
+}
+
+// Return the operand that consumes or produces a new value.
+MCOperand const &HexagonMCInstrInfo::getNewValue(MCInstrInfo const &MCII,
+                                                 MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  unsigned const O =
+      (F >> HexagonII::NewValueOpPos) & HexagonII::NewValueOpMask;
+  MCOperand const &MCO = MCI.getOperand(O);
+
+  assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) ||
+          HexagonMCInstrInfo::hasNewValue(MCII, MCI)) &&
+         MCO.isReg());
+  return (MCO);
+}
+
+// Return the Hexagon ISA class for the insn.
+unsigned HexagonMCInstrInfo::getType(MCInstrInfo const &MCII,
+                                     MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+
+  return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
+}
+
+// Return whether the instruction is a legal new-value producer.
+bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII,
+                                     MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask);
+}
+
+// Return whether the insn is an actual insn.
+bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
+  return (!HexagonMCInstrInfo::getDesc(MCII, MCI).isPseudo() &&
+          !HexagonMCInstrInfo::isPrefix(MCII, MCI) &&
+          HexagonMCInstrInfo::getType(MCII, MCI) != HexagonII::TypeENDLOOP);
+}
+
+// Return whether the instruction needs to be constant extended.
+// 1) Always return true if the instruction has 'isExtended' flag set.
+//
+// isExtendable:
+// 2) For immediate extended operands, return true only if the value is
+//    out-of-range.
+// 3) For global address, always return true.
+
+bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  if (HexagonMCInstrInfo::isExtended(MCII, MCI))
+    return true;
+
+  if (!HexagonMCInstrInfo::isExtendable(MCII, MCI))
+    return false;
+
+  short ExtOpNum = HexagonMCInstrInfo::getCExtOpNum(MCII, MCI);
+  int MinValue = HexagonMCInstrInfo::getMinValue(MCII, MCI);
+  int MaxValue = HexagonMCInstrInfo::getMaxValue(MCII, MCI);
+  MCOperand const &MO = MCI.getOperand(ExtOpNum);
+
+  // We could be using an instruction with an extendable immediate and shoehorn
+  // a global address into it. If it is a global address it will be constant
+  // extended. We do this for COMBINE.
+  // We currently only handle isGlobal() because it is the only kind of
+  // object we are going to end up with here for now.
+  // In the future we probably should add isSymbol(), etc.
+  if (MO.isExpr())
+    return true;
+
+  // If the extendable operand is not 'Immediate' type, the instruction should
+  // have 'isExtended' flag set.
+  assert(MO.isImm() && "Extendable operand must be Immediate type");
+
+  int ImmValue = MO.getImm();
+  return (ImmValue < MinValue || ImmValue > MaxValue);
+}
+
+// Return true if the instruction may be extended based on the operand value.
+bool HexagonMCInstrInfo::isExtendable(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+}
+
+// Return whether the instruction must be always extended.
+bool HexagonMCInstrInfo::isExtended(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+}
+
+// Return whether the insn is a new-value consumer.
+bool HexagonMCInstrInfo::isNewValue(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
+}
+
+// Return whether the operand can be constant extended.
+bool HexagonMCInstrInfo::isOperandExtended(MCInstrInfo const &MCII,
+                                           MCInst const &MCI,
+                                           unsigned short OperandNum) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) ==
+         OperandNum;
+}
+
+bool HexagonMCInstrInfo::isPacketBegin(MCInst const &MCI) {
+  std::bitset<16> Bits(GetImplicitBits(MCI));
+  return Bits.test(packetBeginIndex);
+}
+
+bool HexagonMCInstrInfo::isPacketEnd(MCInst const &MCI) {
+  std::bitset<16> Bits(GetImplicitBits(MCI));
+  return Bits.test(packetEndIndex);
+}
+
+// Return whether the insn is a prefix.
+bool HexagonMCInstrInfo::isPrefix(MCInstrInfo const &MCII, MCInst const &MCI) {
+  return (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypePREFIX);
+}
+
+// Return whether the insn is solo, i.e., cannot be in a packet.
+bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask);
+}
+
+void HexagonMCInstrInfo::resetPacket(MCInst &MCI) {
+  setPacketBegin(MCI, false);
+  setPacketEnd(MCI, false);
+}
+
+void HexagonMCInstrInfo::SetImplicitBits(MCInst &MCI, std::bitset<16> Bits) {
+  SanityCheckImplicitOperands(MCI);
+  MCI.getOperand(MCI.getNumOperands() - 2).setImm(Bits.to_ulong());
+}
+
+void HexagonMCInstrInfo::setPacketBegin(MCInst &MCI, bool f) {
+  std::bitset<16> Bits(GetImplicitBits(MCI));
+  Bits.set(packetBeginIndex, f);
+  SetImplicitBits(MCI, Bits);
+}
+
+void HexagonMCInstrInfo::setPacketEnd(MCInst &MCI, bool f) {
+  std::bitset<16> Bits(GetImplicitBits(MCI));
+  Bits.set(packetEndIndex, f);
+  SetImplicitBits(MCI, Bits);
+}
+}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
new file mode 100644
index 0000000..10fc0f3
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -0,0 +1,106 @@
+//===- HexagonMCInstrInfo.cpp - Hexagon sub-class of MCInst ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility functions for Hexagon specific MCInst queries
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
+
+#include "llvm/MC/MCInstrInfo.h"
+
+#include <bitset>
+
+namespace llvm {
+class MCInstrDesc;
+class MCInstrInfo;
+class MCInst;
+class MCOperand;
+namespace HexagonMCInstrInfo {
+void AppendImplicitOperands(MCInst &MCI);
+
+// Return number of bits in the constant extended operand.
+unsigned getBitCount(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return constant extended operand number.
+unsigned short getCExtOpNum(MCInstrInfo const &MCII, MCInst const &MCI);
+
+MCInstrDesc const &getDesc(MCInstrInfo const &MCII, MCInst const &MCI);
+
+std::bitset<16> GetImplicitBits(MCInst const &MCI);
+
+// Return the max value that a constant extendable operand can have
+// without being extended.
+int getMaxValue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the min value that a constant extendable operand can have
+// without being extended.
+int getMinValue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the operand that consumes or produces a new value.
+MCOperand const &getNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the Hexagon ISA class for the insn.
+unsigned getType(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the instruction is a legal new-value producer.
+bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
+  
+// Return whether the insn is an actual insn.
+bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the instruction needs to be constant extended.
+bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return true if the insn may be extended based on the operand value.
+bool isExtendable(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the instruction must be always extended.
+bool isExtended(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the insn is a new-value consumer.
+bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return true if the operand can be constant extended.
+bool isOperandExtended(MCInstrInfo const &MCII, MCInst const &MCI,
+                       unsigned short OperandNum);
+
+bool isPacketBegin(MCInst const &MCI);
+
+bool isPacketEnd(MCInst const &MCI);
+
+// Return whether the insn is a prefix.
+bool isPrefix(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the insn is solo, i.e., cannot be in a packet.
+bool isSolo(MCInstrInfo const &MCII, MCInst const &MCI);
+
+static const size_t packetBeginIndex = 0;
+static const size_t packetEndIndex = 1;
+
+void resetPacket(MCInst &MCI);
+
+inline void SanityCheckImplicitOperands(MCInst const &MCI) {
+  assert(MCI.getNumOperands() >= 2 && "At least the two implicit operands");
+  assert(MCI.getOperand(MCI.getNumOperands() - 1).isInst() &&
+          "Implicit bits and flags");
+  assert(MCI.getOperand(MCI.getNumOperands() - 2).isImm() &&
+          "Parent pointer");
+}
+
+void SetImplicitBits(MCInst &MCI, std::bitset<16> Bits);
+
+void setPacketBegin(MCInst &MCI, bool Y);
+
+void setPacketEnd(MCInst &MCI, bool Y);
+}
+}
+
+#endif // LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 14ddd9d..09a305b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -35,7 +35,7 @@ using namespace llvm;
 #define GET_REGINFO_MC_DESC
 #include "HexagonGenRegisterInfo.inc"
 
-static MCInstrInfo *createHexagonMCInstrInfo() {
+MCInstrInfo *llvm::createHexagonMCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitHexagonMCInstrInfo(X);
   return X;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index 02fd516..f074b65 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -30,6 +30,8 @@ class raw_ostream;
 
 extern Target TheHexagonTarget;
 
+MCInstrInfo *createHexagonMCInstrInfo();
+
 MCCodeEmitter *createHexagonMCCodeEmitter(MCInstrInfo const &MCII,
                                           MCRegisterInfo const &MRI,
                                           MCSubtargetInfo const &MST,
diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
index 1b0837c..4112046 100644
--- a/lib/Target/LLVMBuild.txt
+++ b/lib/Target/LLVMBuild.txt
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = ARM AArch64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
+subdirectories = ARM AArch64 BPF CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
 
 ; This is a special group whose required libraries are extended (by llvm-build)
 ; with the best execution engine (the native JIT, if available, or the
@@ -45,7 +45,7 @@ parent = Libraries
 type = Library
 name = Target
 parent = Libraries
-required_libraries = Core MC Support
+required_libraries = Analysis Core MC Support
 
 ; This is a special group whose required libraries are extended (by llvm-build)
 ; with every built target, which makes it easy for tools to include every
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 22a973e..fb7823e 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -39,8 +39,8 @@ using namespace llvm;
 namespace {
   class MSP430AsmPrinter : public AsmPrinter {
   public:
-    MSP430AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : AsmPrinter(TM, Streamer) {}
+    MSP430AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+        : AsmPrinter(TM, std::move(Streamer)) {}
 
     const char *getPassName() const override {
       return "MSP430 Assembly Printer";
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 81c176b..2f70cde 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -92,14 +92,9 @@ namespace {
 ///
 namespace {
   class MSP430DAGToDAGISel : public SelectionDAGISel {
-    const MSP430TargetLowering &Lowering;
-    const MSP430Subtarget &Subtarget;
-
   public:
     MSP430DAGToDAGISel(MSP430TargetMachine &TM, CodeGenOpt::Level OptLevel)
-        : SelectionDAGISel(TM, OptLevel),
-          Lowering(*TM.getSubtargetImpl()->getTargetLowering()),
-          Subtarget(*TM.getSubtargetImpl()) {}
+        : SelectionDAGISel(TM, OptLevel) {}
 
     const char *getPassName() const override {
       return "MSP430 DAG->DAG Pattern Instruction Selection";
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 22936dd..18141a6 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -57,7 +57,8 @@ HWMultMode("msp430-hwmult-mode", cl::Hidden,
                 "Assume hardware multiplier cannot be used inside interrupts"),
              clEnumValEnd));
 
-MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM)
+MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
+                                           const MSP430Subtarget &STI)
     : TargetLowering(TM) {
 
   // Set up the register classes.
@@ -65,7 +66,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM)
   addRegisterClass(MVT::i16, &MSP430::GR16RegClass);
 
   // Compute derived properties from the register classes
-  computeRegisterProperties();
+  computeRegisterProperties(STI.getRegisterInfo());
 
   // Provide all sorts of operation actions
 
@@ -80,11 +81,13 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM)
   setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
   setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
 
-  setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1,  Promote);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i8,  Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD,  VT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8,  Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Expand);
+  }
 
   // We don't have any truncstores
   setTruncStoreAction(MVT::i16, MVT::i8, Expand);
@@ -222,10 +225,10 @@ MSP430TargetLowering::getConstraintType(const std::string &Constraint) const {
   return TargetLowering::getConstraintType(Constraint);
 }
 
-std::pair<unsigned, const TargetRegisterClass*>
-MSP430TargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint,
-                             MVT VT) const {
+std::pair<unsigned, const TargetRegisterClass *>
+MSP430TargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, const std::string &Constraint,
+    MVT VT) const {
   if (Constraint.size() == 1) {
     // GCC Constraint Letters
     switch (Constraint[0]) {
@@ -238,7 +241,7 @@ getRegForInlineAsmConstraint(const std::string &Constraint,
     }
   }
 
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 
 //===----------------------------------------------------------------------===//
@@ -326,7 +329,7 @@ static void AnalyzeArguments(CCState &State,
     if (!UseStack && Parts <= RegsLeft) {
       unsigned FirstVal = ValNo;
       for (unsigned j = 0; j < Parts; j++) {
-        unsigned Reg = State.AllocateReg(RegList, NbRegs);
+        unsigned Reg = State.AllocateReg(RegList);
         State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
         RegsLeft--;
       }
@@ -977,11 +980,7 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   } else {
     SDValue Zero = DAG.getConstant(0, VT);
     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
-    SmallVector<SDValue, 4> Ops;
-    Ops.push_back(One);
-    Ops.push_back(Zero);
-    Ops.push_back(TargetCC);
-    Ops.push_back(Flag);
+    SDValue Ops[] = {One, Zero, TargetCC, Flag};
     return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops);
   }
 }
@@ -999,11 +998,7 @@ SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op,
   SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
 
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
-  SmallVector<SDValue, 4> Ops;
-  Ops.push_back(TrueV);
-  Ops.push_back(FalseV);
-  Ops.push_back(TargetCC);
-  Ops.push_back(Flag);
+  SDValue Ops[] = {TrueV, FalseV, TargetCC, Flag};
 
   return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops);
 }
@@ -1199,8 +1194,7 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
   MachineFunction *F = BB->getParent();
   MachineRegisterInfo &RI = F->getRegInfo();
   DebugLoc dl = MI->getDebugLoc();
-  const TargetInstrInfo &TII =
-      *getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo &TII = *F->getSubtarget().getInstrInfo();
 
   unsigned Opc;
   const TargetRegisterClass * RC;
@@ -1311,8 +1305,7 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
       Opc == MSP430::Srl8 || Opc == MSP430::Srl16)
     return EmitShiftInstr(MI, BB);
 
-  const TargetInstrInfo &TII =
-      *getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
 
   assert((Opc == MSP430::Select16 || Opc == MSP430::Select8) &&
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 073ddc9..9266c3b 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -66,9 +66,11 @@ namespace llvm {
     };
   }
 
+  class MSP430Subtarget;
   class MSP430TargetLowering : public TargetLowering {
   public:
-    explicit MSP430TargetLowering(const TargetMachine &TM);
+    explicit MSP430TargetLowering(const TargetMachine &TM,
+                                  const MSP430Subtarget &STI);
 
     MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; }
 
@@ -95,8 +97,9 @@ namespace llvm {
 
     TargetLowering::ConstraintType
     getConstraintType(const std::string &Constraint) const override;
-    std::pair<unsigned, const TargetRegisterClass*>
-    getRegForInlineAsmConstraint(const std::string &Constraint,
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 const std::string &Constraint,
                                  MVT VT) const override;
 
     /// isTruncateFree - Return true if it's free to truncate a value of type
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 7c5aa11..c0c29b9 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -153,7 +153,7 @@ let usesCustomInserter = 1 in {
   }
 }
 
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def NOP : Pseudo<(outs), (ins), "nop", []>;
 
 //===----------------------------------------------------------------------===//
@@ -224,7 +224,7 @@ let isCall = 1 in
 //===----------------------------------------------------------------------===//
 //  Miscellaneous Instructions...
 //
-let Defs = [SP], Uses = [SP], neverHasSideEffects=1 in {
+let Defs = [SP], Uses = [SP], hasSideEffects=0 in {
 let mayLoad = 1 in
 def POP16r   : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
                        (outs GR16:$reg), (ins), "pop.w\t$reg", []>;
@@ -238,7 +238,7 @@ def PUSH16r  : II16r<0x0,
 // Move Instructions
 
 // FIXME: Provide proper encoding!
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def MOV8rr  : I8rr<0x0,
                    (outs GR8:$dst), (ins GR8:$src),
                    "mov.b\t{$src, $dst}",
diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index 77b91b7..05352a2 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -26,7 +26,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 MCSymbol *MSP430MCInstLower::
@@ -51,7 +50,7 @@ GetExternalSymbolSymbol(const MachineOperand &MO) const {
 
 MCSymbol *MSP430MCInstLower::
 GetJumpTableSymbol(const MachineOperand &MO) const {
-  const DataLayout *DL = Printer.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = Printer.TM.getDataLayout();
   SmallString<256> Name;
   raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI"
                             << Printer.getFunctionNumber() << '_'
@@ -68,7 +67,7 @@ GetJumpTableSymbol(const MachineOperand &MO) const {
 
 MCSymbol *MSP430MCInstLower::
 GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
-  const DataLayout *DL = Printer.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = Printer.TM.getDataLayout();
   SmallString<256> Name;
   raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "CPI"
                             << Printer.getFunctionNumber() << '_'
diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
index cb83b92..7468519 100644
--- a/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -32,8 +32,6 @@ MSP430Subtarget &MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU,
 
 MSP430Subtarget::MSP430Subtarget(const std::string &TT, const std::string &CPU,
                                  const std::string &FS, const TargetMachine &TM)
-    : MSP430GenSubtargetInfo(TT, CPU, FS),
-      // FIXME: Check DataLayout string.
-      DL("e-m:e-p:16:16-i32:16:32-a:16-n8:16"), FrameLowering(),
-      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
-      TSInfo(DL) {}
+    : MSP430GenSubtargetInfo(TT, CPU, FS), FrameLowering(),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
+      TSInfo(*TM.getDataLayout()) {}
diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h
index d1845db..30d46d3 100644
--- a/lib/Target/MSP430/MSP430Subtarget.h
+++ b/lib/Target/MSP430/MSP430Subtarget.h
@@ -15,8 +15,8 @@
 #define LLVM_LIB_TARGET_MSP430_MSP430SUBTARGET_H
 
 #include "MSP430FrameLowering.h"
-#include "MSP430InstrInfo.h"
 #include "MSP430ISelLowering.h"
+#include "MSP430InstrInfo.h"
 #include "MSP430RegisterInfo.h"
 #include "MSP430SelectionDAGInfo.h"
 #include "llvm/IR/DataLayout.h"
@@ -32,7 +32,6 @@ class StringRef;
 class MSP430Subtarget : public MSP430GenSubtargetInfo {
   virtual void anchor();
   bool ExtendedInsts;
-  const DataLayout DL; // Calculates type size & alignment
   MSP430FrameLowering FrameLowering;
   MSP430InstrInfo InstrInfo;
   MSP430TargetLowering TLInfo;
@@ -55,7 +54,6 @@ public:
     return &FrameLowering;
   }
   const MSP430InstrInfo *getInstrInfo() const override { return &InstrInfo; }
-  const DataLayout *getDataLayout() const override { return &DL; }
   const TargetRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 8cee016..348e672 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -12,11 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "MSP430TargetMachine.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "MSP430.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
@@ -32,7 +32,8 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, StringRef TT,
                                          CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
       TLOF(make_unique<TargetLoweringObjectFileELF>()),
-      Subtarget(TT, CPU, FS, *this) {
+      // FIXME: Check DataLayout string.
+      DL("e-m:e-p:16:16-i32:16:32-a:16-n8:16"), Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 }
 
@@ -50,7 +51,7 @@ public:
   }
 
   bool addInstSelector() override;
-  bool addPreEmitPass() override;
+  void addPreEmitPass() override;
 };
 } // namespace
 
@@ -64,8 +65,7 @@ bool MSP430PassConfig::addInstSelector() {
   return false;
 }
 
-bool MSP430PassConfig::addPreEmitPass() {
+void MSP430PassConfig::addPreEmitPass() {
   // Must run branch selection immediately preceding the asm printer.
-  addPass(createMSP430BranchSelectionPass());
-  return false;
+  addPass(createMSP430BranchSelectionPass(), false);
 }
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index 0e54ed6..c6a6a70 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -25,6 +25,7 @@ namespace llvm {
 ///
 class MSP430TargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  const DataLayout DL; // Calculates type size & alignment
   MSP430Subtarget        Subtarget;
 
 public:
@@ -34,6 +35,7 @@ public:
                       CodeGenOpt::Level OL);
   ~MSP430TargetMachine() override;
 
+  const DataLayout *getDataLayout() const override { return &DL; }
   const MSP430Subtarget *getSubtargetImpl() const override {
     return &Subtarget;
   }
diff --git a/lib/Target/MSP430/README.txt b/lib/Target/MSP430/README.txt
index 5b9634b..e989924 100644
--- a/lib/Target/MSP430/README.txt
+++ b/lib/Target/MSP430/README.txt
@@ -38,3 +38,4 @@ way (currently they emit explicit comparison).
 10. Handle imm in comparisons in better way (see comment in MSP430InstrInfo.td)
 
 11. Implement hooks for better memory op folding, etc.
+
diff --git a/lib/Target/Mips/Android.mk b/lib/Target/Mips/Android.mk
index 18d1177..235e788 100644
--- a/lib/Target/Mips/Android.mk
+++ b/lib/Target/Mips/Android.mk
@@ -20,7 +20,6 @@ mips_codegen_SRC_FILES := \
   Mips16ISelLowering.cpp \
   Mips16InstrInfo.cpp \
   Mips16RegisterInfo.cpp \
-  MipsABIInfo.cpp \
   MipsAnalyzeImmediate.cpp \
   MipsAsmPrinter.cpp \
   MipsCCState.cpp \
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 0c5b41f..1040bf7 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -7,13 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/MipsABIInfo.h"
 #include "MCTargetDesc/MipsMCExpr.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "MipsRegisterInfo.h"
 #include "MipsTargetStreamer.h"
 #include "llvm/ADT/APInt.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
@@ -26,8 +27,8 @@
 #include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
 #include <memory>
 
 using namespace llvm;
@@ -75,9 +76,10 @@ public:
       Mips::FeatureMips3_32 | Mips::FeatureMips3_32r2 | Mips::FeatureMips4 |
       Mips::FeatureMips4_32 | Mips::FeatureMips4_32r2 | Mips::FeatureMips5 |
       Mips::FeatureMips5_32r2 | Mips::FeatureMips32 | Mips::FeatureMips32r2 |
-      Mips::FeatureMips32r6 | Mips::FeatureMips64 | Mips::FeatureMips64r2 |
-      Mips::FeatureMips64r6 | Mips::FeatureCnMips | Mips::FeatureFP64Bit |
-      Mips::FeatureGP64Bit | Mips::FeatureNaN2008;
+      Mips::FeatureMips32r3 | Mips::FeatureMips32r5 | Mips::FeatureMips32r6 |
+      Mips::FeatureMips64 | Mips::FeatureMips64r2 | Mips::FeatureMips64r3 |
+      Mips::FeatureMips64r5 | Mips::FeatureMips64r6 | Mips::FeatureCnMips |
+      Mips::FeatureFP64Bit | Mips::FeatureGP64Bit | Mips::FeatureNaN2008;
 
 private:
   unsigned ATReg;
@@ -95,6 +97,7 @@ class MipsAsmParser : public MCTargetAsmParser {
   }
 
   MCSubtargetInfo &STI;
+  MipsABIInfo ABI;
   SmallVector<std::unique_ptr<MipsAssemblerOptions>, 2> AssemblerOptions;
   MCSymbol *CurrentFn; // Pointer to the function being parsed. It may be a
                        // nullptr, which indicates that no function is currently
@@ -147,6 +150,12 @@ class MipsAsmParser : public MCTargetAsmParser {
   MipsAsmParser::OperandMatchResultTy parseLSAImm(OperandVector &Operands);
 
   MipsAsmParser::OperandMatchResultTy
+  parseRegisterPair (OperandVector &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMovePRegPair(OperandVector &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
   parseRegisterList (OperandVector  &Operands);
 
   bool searchSymbolAlias(OperandVector &Operands);
@@ -160,6 +169,9 @@ class MipsAsmParser : public MCTargetAsmParser {
   bool expandInstruction(MCInst &Inst, SMLoc IDLoc,
                          SmallVectorImpl<MCInst> &Instructions);
 
+  bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
+                         SmallVectorImpl<MCInst> &Instructions);
+
   bool expandLoadImm(MCInst &Inst, SMLoc IDLoc,
                      SmallVectorImpl<MCInst> &Instructions);
 
@@ -168,6 +180,8 @@ class MipsAsmParser : public MCTargetAsmParser {
 
   bool expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
                             SmallVectorImpl<MCInst> &Instructions);
+  bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc,
+                                  SmallVectorImpl<MCInst> &Instructions);
 
   void expandLoadAddressSym(MCInst &Inst, SMLoc IDLoc,
                             SmallVectorImpl<MCInst> &Instructions);
@@ -175,6 +189,10 @@ class MipsAsmParser : public MCTargetAsmParser {
   void expandMemInst(MCInst &Inst, SMLoc IDLoc,
                      SmallVectorImpl<MCInst> &Instructions, bool isLoad,
                      bool isImmOpnd);
+
+  bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
+                               SmallVectorImpl<MCInst> &Instructions);
+
   bool reportParseError(Twine ErrorMsg);
   bool reportParseError(SMLoc Loc, Twine ErrorMsg);
 
@@ -310,7 +328,9 @@ public:
 
   MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
                 const MCInstrInfo &MII, const MCTargetOptions &Options)
-      : MCTargetAsmParser(), STI(sti) {
+      : MCTargetAsmParser(), STI(sti),
+        ABI(MipsABIInfo::computeTargetABI(Triple(sti.getTargetTriple()),
+                                          sti.getCPU(), Options)) {
     MCAsmParserExtension::Initialize(parser);
 
     // Initialize the set of available features.
@@ -326,12 +346,6 @@ public:
 
     getTargetStreamer().updateABIInfo(*this);
 
-    // Assert exactly one ABI was chosen.
-    assert((((STI.getFeatureBits() & Mips::FeatureO32) != 0) +
-            ((STI.getFeatureBits() & Mips::FeatureEABI) != 0) +
-            ((STI.getFeatureBits() & Mips::FeatureN32) != 0) +
-            ((STI.getFeatureBits() & Mips::FeatureN64) != 0)) == 1);
-
     if (!isABI_O32() && !useOddSPReg() != 0)
       report_fatal_error("-mno-odd-spreg requires the O32 ABI");
 
@@ -343,9 +357,10 @@ public:
 
   bool isGP64bit() const { return STI.getFeatureBits() & Mips::FeatureGP64Bit; }
   bool isFP64bit() const { return STI.getFeatureBits() & Mips::FeatureFP64Bit; }
-  bool isABI_N32() const { return STI.getFeatureBits() & Mips::FeatureN32; }
-  bool isABI_N64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
-  bool isABI_O32() const { return STI.getFeatureBits() & Mips::FeatureO32; }
+  const MipsABIInfo &getABI() const { return ABI; }
+  bool isABI_N32() const { return ABI.IsN32(); }
+  bool isABI_N64() const { return ABI.IsN64(); }
+  bool isABI_O32() const { return ABI.IsO32(); }
   bool isABI_FPXX() const { return STI.getFeatureBits() & Mips::FeatureFPXX; }
 
   bool useOddSPReg() const {
@@ -372,12 +387,27 @@ public:
   bool hasMips64r2() const {
     return (STI.getFeatureBits() & Mips::FeatureMips64r2);
   }
+  bool hasMips32r3() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips32r3);
+  }
+  bool hasMips64r3() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips64r3);
+  }
+  bool hasMips32r5() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips32r5);
+  }
+  bool hasMips64r5() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips64r5);
+  }
   bool hasMips32r6() const {
     return (STI.getFeatureBits() & Mips::FeatureMips32r6);
   }
   bool hasMips64r6() const {
     return (STI.getFeatureBits() & Mips::FeatureMips64r6);
   }
+  bool hasCnMips() const {
+    return (STI.getFeatureBits() & Mips::FeatureCnMips);
+  }
   bool hasDSP() const { return (STI.getFeatureBits() & Mips::FeatureDSP); }
   bool hasDSPR2() const { return (STI.getFeatureBits() & Mips::FeatureDSPR2); }
   bool hasMSA() const { return (STI.getFeatureBits() & Mips::FeatureMSA); }
@@ -428,7 +458,8 @@ private:
     k_PhysRegister,  /// A physical register from the Mips namespace
     k_RegisterIndex, /// A register index in one or more RegKind.
     k_Token,         /// A simple token
-    k_RegList        /// A physical register list
+    k_RegList,       /// A physical register list
+    k_RegPair        /// A pair of physical register
   } Kind;
 
 public:
@@ -663,6 +694,16 @@ public:
     Inst.addOperand(MCOperand::CreateReg(getGPRMM16Reg()));
   }
 
+  void addGPRMM16AsmRegZeroOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getGPRMM16Reg()));
+  }
+
+  void addGPRMM16AsmRegMovePOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getGPRMM16Reg()));
+  }
+
   /// Render the operand to an MCInst as a GPR64
   /// Asserts if the wrong number of operands are requested, or the operand
   /// is not a k_RegisterIndex compatible with RegKind_GPR
@@ -760,6 +801,15 @@ public:
     addExpr(Inst, Expr);
   }
 
+  void addMicroMipsMemOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::CreateReg(getMemBase()->getGPRMM16Reg()));
+
+    const MCExpr *Expr = getMemOff();
+    addExpr(Inst, Expr);
+  }
+
   void addRegListOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
 
@@ -767,6 +817,19 @@ public:
       Inst.addOperand(MCOperand::CreateReg(RegNo));
   }
 
+  void addRegPairOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    unsigned RegNo = getRegPair();
+    Inst.addOperand(MCOperand::CreateReg(RegNo++));
+    Inst.addOperand(MCOperand::CreateReg(RegNo));
+  }
+
+  void addMovePRegPairOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    for (auto RegNo : getRegList())
+      Inst.addOperand(MCOperand::CreateReg(RegNo));
+  }
+
   bool isReg() const override {
     // As a special case until we sort out the definition of div/divu, pretend
     // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly.
@@ -792,6 +855,37 @@ public:
   template <unsigned Bits> bool isMemWithSimmOffset() const {
     return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff());
   }
+  bool isMemWithGRPMM16Base() const {
+    return isMem() && getMemBase()->isMM16AsmReg();
+  }
+  template <unsigned Bits> bool isMemWithUimmOffsetSP() const {
+    return isMem() && isConstantMemOff() && isUInt<Bits>(getConstantMemOff())
+      && getMemBase()->isRegIdx() && (getMemBase()->getGPR32Reg() == Mips::SP);
+  }
+  template <unsigned Bits> bool isMemWithUimmWordAlignedOffsetSP() const {
+    return isMem() && isConstantMemOff() && isUInt<Bits>(getConstantMemOff())
+      && (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
+      && (getMemBase()->getGPR32Reg() == Mips::SP);
+  }
+  bool isRegList16() const {
+    if (!isRegList())
+      return false;
+
+    int Size = RegList.List->size();
+    if (Size < 2 || Size > 5 || *RegList.List->begin() != Mips::S0 ||
+        RegList.List->back() != Mips::RA)
+      return false;
+
+    int PrevReg = *RegList.List->begin();
+    for (int i = 1; i < Size - 1; i++) {
+      int Reg = (*(RegList.List))[i];
+      if ( Reg != PrevReg + 1)
+        return false;
+      PrevReg = Reg;
+    }
+
+    return true;
+  }
   bool isInvNum() const { return Kind == k_Immediate; }
   bool isLSAImm() const {
     if (!isConstantImm())
@@ -800,11 +894,31 @@ public:
     return 1 <= Val && Val <= 4;
   }
   bool isRegList() const { return Kind == k_RegList; }
+  bool isMovePRegPair() const {
+    if (Kind != k_RegList || RegList.List->size() != 2)
+      return false;
+
+    unsigned R0 = RegList.List->front();
+    unsigned R1 = RegList.List->back();
+
+    if ((R0 == Mips::A1 && R1 == Mips::A2) ||
+        (R0 == Mips::A1 && R1 == Mips::A3) ||
+        (R0 == Mips::A2 && R1 == Mips::A3) ||
+        (R0 == Mips::A0 && R1 == Mips::S5) ||
+        (R0 == Mips::A0 && R1 == Mips::S6) ||
+        (R0 == Mips::A0 && R1 == Mips::A1) ||
+        (R0 == Mips::A0 && R1 == Mips::A2) ||
+        (R0 == Mips::A0 && R1 == Mips::A3))
+      return true;
+
+    return false;
+  }
 
   StringRef getToken() const {
     assert(Kind == k_Token && "Invalid access!");
     return StringRef(Tok.Data, Tok.Length);
   }
+  bool isRegPair() const { return Kind == k_RegPair; }
 
   unsigned getReg() const override {
     // As a special case until we sort out the definition of div/divu, pretend
@@ -846,6 +960,11 @@ public:
     return *(RegList.List);
   }
 
+  unsigned getRegPair() const {
+    assert((Kind == k_RegPair) && "Invalid access!");
+    return RegIdx.Index;
+  }
+
   static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S,
                                                   MipsAsmParser &Parser) {
     auto Op = make_unique<MipsOperand>(k_Token, Parser);
@@ -947,14 +1066,21 @@ public:
     assert (Regs.size() > 0 && "Empty list not allowed");
 
     auto Op = make_unique<MipsOperand>(k_RegList, Parser);
-    Op->RegList.List = new SmallVector<unsigned, 10>();
-    for (auto Reg : Regs)
-      Op->RegList.List->push_back(Reg);
+    Op->RegList.List = new SmallVector<unsigned, 10>(Regs.begin(), Regs.end());
     Op->StartLoc = StartLoc;
     Op->EndLoc = EndLoc;
     return Op;
   }
 
+  static std::unique_ptr<MipsOperand>
+  CreateRegPair(unsigned RegNo, SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+    auto Op = make_unique<MipsOperand>(k_RegPair, Parser);
+    Op->RegIdx.Index = RegNo;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   bool isGPRAsmReg() const {
     return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index <= 31;
   }
@@ -964,6 +1090,19 @@ public:
     return ((RegIdx.Index >= 2 && RegIdx.Index <= 7)
             || RegIdx.Index == 16 || RegIdx.Index == 17);
   }
+  bool isMM16AsmRegZero() const {
+    if (!(isRegIdx() && RegIdx.Kind))
+      return false;
+    return (RegIdx.Index == 0 ||
+            (RegIdx.Index >= 2 && RegIdx.Index <= 7) ||
+            RegIdx.Index == 17);
+  }
+  bool isMM16AsmRegMoveP() const {
+    if (!(isRegIdx() && RegIdx.Kind))
+      return false;
+    return (RegIdx.Index == 0 || (RegIdx.Index >= 2 && RegIdx.Index <= 3) ||
+      (RegIdx.Index >= 16 && RegIdx.Index <= 20));
+  }
   bool isFGRAsmReg() const {
     // AFGR64 is $0-$15 but we handle this in getAFGR64()
     return isRegIdx() && RegIdx.Kind & RegKind_FGR && RegIdx.Index <= 31;
@@ -1014,6 +1153,7 @@ public:
     case k_PhysRegister:
     case k_RegisterIndex:
     case k_Token:
+    case k_RegPair:
       break;
     }
   }
@@ -1047,6 +1187,9 @@ public:
         OS << Reg << " ";
       OS <<  ">";
       break;
+    case k_RegPair:
+      OS << "RegPair<" << RegIdx.Index << "," << RegIdx.Index + 1 << ">";
+      break;
     }
   }
 }; // class MipsOperand
@@ -1085,6 +1228,13 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
     switch (Opcode) {
     default:
       break;
+    case Mips::BBIT0:
+    case Mips::BBIT032:
+    case Mips::BBIT1:
+    case Mips::BBIT132:
+      assert(hasCnMips() && "instruction only valid for octeon cpus");
+      // Fall through
+
     case Mips::BEQ:
     case Mips::BNE:
     case Mips::BEQ_MM:
@@ -1125,6 +1275,17 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
                             1LL << (inMicroMipsMode() ? 1 : 2)))
         return Error(IDLoc, "branch to misaligned address");
       break;
+    case Mips::BEQZ16_MM:
+    case Mips::BNEZ16_MM:
+      assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
+      Offset = Inst.getOperand(1);
+      if (!Offset.isImm())
+        break; // We'll deal with this situation later on when applying fixups.
+      if (!isIntN(8, Offset.getImm()))
+        return Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Offset.getImm(), 2LL))
+        return Error(IDLoc, "branch to misaligned address");
+      break;
     }
   }
 
@@ -1136,6 +1297,74 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
                                                       "nop instruction");
   }
 
+  if (hasCnMips()) {
+    const unsigned Opcode = Inst.getOpcode();
+    MCOperand Opnd;
+    int Imm;
+
+    switch (Opcode) {
+      default:
+        break;
+
+      case Mips::BBIT0:
+      case Mips::BBIT032:
+      case Mips::BBIT1:
+      case Mips::BBIT132:
+        assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+        // The offset is handled above
+        Opnd = Inst.getOperand(1);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 0 || Imm > (Opcode == Mips::BBIT0 ||
+                              Opcode == Mips::BBIT1 ? 63 : 31))
+          return Error(IDLoc, "immediate operand value out of range");
+        if (Imm > 31) {
+          Inst.setOpcode(Opcode == Mips::BBIT0 ? Mips::BBIT032
+                                               : Mips::BBIT132);
+          Inst.getOperand(1).setImm(Imm - 32);
+        }
+        break;
+
+      case Mips::CINS:
+      case Mips::CINS32:
+      case Mips::EXTS:
+      case Mips::EXTS32:
+        assert(MCID.getNumOperands() == 4 && "unexpected number of operands");
+        // Check length
+        Opnd = Inst.getOperand(3);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 0 || Imm > 31)
+          return Error(IDLoc, "immediate operand value out of range");
+        // Check position
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 0 || Imm > (Opcode == Mips::CINS ||
+                              Opcode == Mips::EXTS ? 63 : 31))
+          return Error(IDLoc, "immediate operand value out of range");
+        if (Imm > 31) {
+          Inst.setOpcode(Opcode == Mips::CINS ? Mips::CINS32 : Mips::EXTS32);
+          Inst.getOperand(2).setImm(Imm - 32);
+        }
+        break;
+
+      case Mips::SEQi:
+      case Mips::SNEi:
+        assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (!isInt<10>(Imm))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+    }
+  }
+
   if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) {
     // If this instruction has a delay slot and .set reorder is active,
     // emit a NOP after it.
@@ -1189,8 +1418,38 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
     } // for
   }   // if load/store
 
-  // TODO: Handle this with the AsmOperandClass.PredicateMethod.
   if (inMicroMipsMode()) {
+    if (MCID.mayLoad()) {
+      // Try to create 16-bit GP relative load instruction.
+      for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
+        const MCOperandInfo &OpInfo = MCID.OpInfo[i];
+        if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
+            (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
+          MCOperand &Op = Inst.getOperand(i);
+          if (Op.isImm()) {
+            int MemOffset = Op.getImm();
+            MCOperand &DstReg = Inst.getOperand(0);
+            MCOperand &BaseReg = Inst.getOperand(1);
+            if (isIntN(9, MemOffset) && (MemOffset % 4 == 0) &&
+                getContext().getRegisterInfo()->getRegClass(
+                  Mips::GPRMM16RegClassID).contains(DstReg.getReg()) &&
+                BaseReg.getReg() == Mips::GP) {
+              MCInst TmpInst;
+              TmpInst.setLoc(IDLoc);
+              TmpInst.setOpcode(Mips::LWGP_MM);
+              TmpInst.addOperand(MCOperand::CreateReg(DstReg.getReg()));
+              TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
+              TmpInst.addOperand(MCOperand::CreateImm(MemOffset));
+              Instructions.push_back(TmpInst);
+              return false;
+            }
+          }
+        }
+      } // for
+    }   // if load
+
+    // TODO: Handle this with the AsmOperandClass.PredicateMethod.
+
     MCOperand Opnd;
     int Imm;
 
@@ -1260,6 +1519,57 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
               Imm == 64 || Imm == 255 || Imm == 32768 || Imm == 65535))
           return Error(IDLoc, "immediate operand value out of range");
         break;
+      case Mips::LBU16_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < -1 || Imm > 14)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::SB16_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 0 || Imm > 15)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::LHU16_MM:
+      case Mips::SH16_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 0 || Imm > 30 || (Imm % 2 != 0))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::LW16_MM:
+      case Mips::SW16_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 0 || Imm > 60 || (Imm % 4 != 0))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::CACHE:
+      case Mips::PREF:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (!isUInt<5>(Imm))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::ADDIUPC_MM:
+        MCOperand Opnd = Inst.getOperand(1);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        int Imm = Opnd.getImm();
+        if ((Imm % 4 != 0) || !isIntN(25, Imm))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
     }
   }
 
@@ -1278,6 +1588,11 @@ bool MipsAsmParser::needsExpansion(MCInst &Inst) {
   case Mips::LoadAddr32Imm:
   case Mips::LoadAddr32Reg:
   case Mips::LoadImm64Reg:
+  case Mips::B_MM_Pseudo:
+  case Mips::LWM_MM:
+  case Mips::SWM_MM:
+  case Mips::JalOneReg:
+  case Mips::JalTwoReg:
     return true;
   default:
     return false;
@@ -1287,9 +1602,7 @@ bool MipsAsmParser::needsExpansion(MCInst &Inst) {
 bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
                                       SmallVectorImpl<MCInst> &Instructions) {
   switch (Inst.getOpcode()) {
-  default:
-    assert(0 && "unimplemented expansion");
-    return true;
+  default: llvm_unreachable("unimplemented expansion");
   case Mips::LoadImm32Reg:
     return expandLoadImm(Inst, IDLoc, Instructions);
   case Mips::LoadImm64Reg:
@@ -1302,6 +1615,14 @@ bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
     return expandLoadAddressImm(Inst, IDLoc, Instructions);
   case Mips::LoadAddr32Reg:
     return expandLoadAddressReg(Inst, IDLoc, Instructions);
+  case Mips::B_MM_Pseudo:
+    return expandUncondBranchMMPseudo(Inst, IDLoc, Instructions);
+  case Mips::SWM_MM:
+  case Mips::LWM_MM:
+    return expandLoadStoreMultiple(Inst, IDLoc, Instructions);
+  case Mips::JalOneReg:
+  case Mips::JalTwoReg:
+    return expandJalWithRegs(Inst, IDLoc, Instructions);
   }
 }
 
@@ -1336,6 +1657,48 @@ void createShiftOr(int64_t Value, unsigned RegNo, SMLoc IDLoc,
 }
 }
 
+bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
+                                      SmallVectorImpl<MCInst> &Instructions) {
+  // Create a JALR instruction which is going to replace the pseudo-JAL.
+  MCInst JalrInst;
+  JalrInst.setLoc(IDLoc);
+  const MCOperand FirstRegOp = Inst.getOperand(0);
+  const unsigned Opcode = Inst.getOpcode();
+
+  if (Opcode == Mips::JalOneReg) {
+    // jal $rs => jalr $rs
+    if (inMicroMipsMode()) {
+      JalrInst.setOpcode(Mips::JALR16_MM);
+      JalrInst.addOperand(FirstRegOp);
+    } else {
+      JalrInst.setOpcode(Mips::JALR);
+      JalrInst.addOperand(MCOperand::CreateReg(Mips::RA));
+      JalrInst.addOperand(FirstRegOp);
+    }
+  } else if (Opcode == Mips::JalTwoReg) {
+    // jal $rd, $rs => jalr $rd, $rs
+    JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR);
+    JalrInst.addOperand(FirstRegOp);
+    const MCOperand SecondRegOp = Inst.getOperand(1);
+    JalrInst.addOperand(SecondRegOp);
+  }
+  Instructions.push_back(JalrInst);
+
+  // If .set reorder is active, emit a NOP after it.
+  if (AssemblerOptions.back()->isReorder()) {
+    // This is a 32-bit NOP because these 2 pseudo-instructions
+    // do not have a short delay slot.
+    MCInst NopInst;
+    NopInst.setOpcode(Mips::SLL);
+    NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    NopInst.addOperand(MCOperand::CreateImm(0));
+    Instructions.push_back(NopInst);
+  }
+
+  return false;
+}
+
 bool MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
                                   SmallVectorImpl<MCInst> &Instructions) {
   MCInst tmpInst;
@@ -1587,6 +1950,49 @@ MipsAsmParser::expandLoadAddressSym(MCInst &Inst, SMLoc IDLoc,
   }
 }
 
+bool MipsAsmParser::expandUncondBranchMMPseudo(
+    MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
+  assert(getInstDesc(Inst.getOpcode()).getNumOperands() == 1 &&
+         "unexpected number of operands");
+
+  MCOperand Offset = Inst.getOperand(0);
+  if (Offset.isExpr()) {
+    Inst.clear();
+    Inst.setOpcode(Mips::BEQ_MM);
+    Inst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    Inst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    Inst.addOperand(MCOperand::CreateExpr(Offset.getExpr()));
+  } else {
+    assert(Offset.isImm() && "expected immediate operand kind");
+    if (isIntN(11, Offset.getImm())) {
+      // If offset fits into 11 bits then this instruction becomes microMIPS
+      // 16-bit unconditional branch instruction.
+      Inst.setOpcode(Mips::B16_MM);
+    } else {
+      if (!isIntN(17, Offset.getImm()))
+        Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Offset.getImm(), 1LL << 1))
+        Error(IDLoc, "branch to misaligned address");
+      Inst.clear();
+      Inst.setOpcode(Mips::BEQ_MM);
+      Inst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+      Inst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+      Inst.addOperand(MCOperand::CreateImm(Offset.getImm()));
+    }
+  }
+  Instructions.push_back(Inst);
+
+  if (AssemblerOptions.back()->isReorder()) {
+    // If .set reorder is active, emit a NOP after the branch instruction.
+    MCInst NopInst;
+    NopInst.setOpcode(Mips::MOVE16_MM);
+    NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    Instructions.push_back(NopInst);
+  }
+  return false;
+}
+
 void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
                                   SmallVectorImpl<MCInst> &Instructions,
                                   bool isLoad, bool isImmOpnd) {
@@ -1703,6 +2109,29 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
   TempInst.clear();
 }
 
+bool
+MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
+                                       SmallVectorImpl<MCInst> &Instructions) {
+  unsigned OpNum = Inst.getNumOperands();
+  unsigned Opcode = Inst.getOpcode();
+  unsigned NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM32_MM : Mips::LWM32_MM;
+
+  assert (Inst.getOperand(OpNum - 1).isImm() &&
+          Inst.getOperand(OpNum - 2).isReg() &&
+          Inst.getOperand(OpNum - 3).isReg() && "Invalid instruction operand.");
+
+  if (OpNum < 8 && Inst.getOperand(OpNum - 1).getImm() <= 60 &&
+      Inst.getOperand(OpNum - 1).getImm() >= 0 &&
+      Inst.getOperand(OpNum - 2).getReg() == Mips::SP &&
+      Inst.getOperand(OpNum - 3).getReg() == Mips::RA)
+    // It can be implemented as SWM16 or LWM16 instruction.
+    NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM;
+
+  Inst.setOpcode(NewOpcode);
+  Instructions.push_back(Inst);
+  return false;
+}
+
 unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
   // As described by the Mips32r2 spec, the registers Rd and Rs for
   // jalr.hb must be different.
@@ -1727,8 +2156,6 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
 
   switch (MatchResult) {
-  default:
-    break;
   case Match_Success: {
     if (processInstruction(Inst, IDLoc, Instructions))
       return true;
@@ -1757,7 +2184,8 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   case Match_RequiresDifferentSrcAndDst:
     return Error(IDLoc, "source and destination must be different");
   }
-  return true;
+
+  llvm_unreachable("Implement any new match types added!");
 }
 
 void MipsAsmParser::warnIfAssemblerTemporary(int RegIndex, SMLoc Loc) {
@@ -2642,6 +3070,61 @@ MipsAsmParser::parseRegisterList(OperandVector &Operands) {
   return MatchOperand_Success;
 }
 
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseRegisterPair(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+
+  SMLoc S = Parser.getTok().getLoc();
+  if (parseAnyRegister(Operands) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  SMLoc E = Parser.getTok().getLoc();
+  MipsOperand &Op = static_cast<MipsOperand &>(*Operands.back());
+  unsigned Reg = Op.getGPR32Reg();
+  Operands.pop_back();
+  Operands.push_back(MipsOperand::CreateRegPair(Reg, S, E, *this));
+  return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseMovePRegPair(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> TmpOperands;
+  SmallVector<unsigned, 10> Regs;
+
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_ParseFail;
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  if (parseAnyRegister(TmpOperands) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  MipsOperand *Reg = &static_cast<MipsOperand &>(*TmpOperands.back());
+  unsigned RegNo = isGP64bit() ? Reg->getGPR64Reg() : Reg->getGPR32Reg();
+  Regs.push_back(RegNo);
+
+  SMLoc E = Parser.getTok().getLoc();
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(E, "',' expected");
+    return MatchOperand_ParseFail;
+  }
+
+  // Remove comma.
+  Parser.Lex();
+
+  if (parseAnyRegister(TmpOperands) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  Reg = &static_cast<MipsOperand &>(*TmpOperands.back());
+  RegNo = isGP64bit() ? Reg->getGPR64Reg() : Reg->getGPR32Reg();
+  Regs.push_back(RegNo);
+
+  Operands.push_back(MipsOperand::CreateRegList(Regs, S, E, *this));
+
+  return MatchOperand_Success;
+}
+
 MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
 
   MCSymbolRefExpr::VariantKind VK =
@@ -2804,67 +3287,84 @@ bool MipsAsmParser::reportParseError(SMLoc Loc, Twine ErrorMsg) {
 bool MipsAsmParser::parseSetNoAtDirective() {
   MCAsmParser &Parser = getParser();
   // Line should look like: ".set noat".
-  // set at reg to 0.
+
+  // Set the $at register to $0.
   AssemblerOptions.back()->setATReg(0);
-  // eat noat
-  Parser.Lex();
+
+  Parser.Lex(); // Eat "noat".
+
   // If this is not the end of the statement, report an error.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     reportParseError("unexpected token, expected end of statement");
     return false;
   }
+
+  getTargetStreamer().emitDirectiveSetNoAt();
   Parser.Lex(); // Consume the EndOfStatement.
   return false;
 }
 
 bool MipsAsmParser::parseSetAtDirective() {
+  // Line can be: ".set at", which sets $at to $1
+  //          or  ".set at=$reg", which sets $at to $reg.
   MCAsmParser &Parser = getParser();
-  // Line can be .set at - defaults to $1
-  // or .set at=$reg
-  int AtRegNo;
-  getParser().Lex();
+  Parser.Lex(); // Eat "at".
+
   if (getLexer().is(AsmToken::EndOfStatement)) {
+    // No register was specified, so we set $at to $1.
     AssemblerOptions.back()->setATReg(1);
+
+    getTargetStreamer().emitDirectiveSetAt();
     Parser.Lex(); // Consume the EndOfStatement.
     return false;
-  } else if (getLexer().is(AsmToken::Equal)) {
-    getParser().Lex(); // Eat the '='.
-    if (getLexer().isNot(AsmToken::Dollar)) {
-      reportParseError("unexpected token, expected dollar sign '$'");
+  }
+
+  if (getLexer().isNot(AsmToken::Equal)) {
+    reportParseError("unexpected token, expected equals sign");
+    return false;
+  }
+  Parser.Lex(); // Eat "=".
+
+  if (getLexer().isNot(AsmToken::Dollar)) {
+    if (getLexer().is(AsmToken::EndOfStatement)) {
+      reportParseError("no register specified");
       return false;
-    }
-    Parser.Lex(); // Eat the '$'.
-    const AsmToken &Reg = Parser.getTok();
-    if (Reg.is(AsmToken::Identifier)) {
-      AtRegNo = matchCPURegisterName(Reg.getIdentifier());
-    } else if (Reg.is(AsmToken::Integer)) {
-      AtRegNo = Reg.getIntVal();
     } else {
-      reportParseError("unexpected token, expected identifier or integer");
-      return false;
-    }
-
-    if (AtRegNo < 0 || AtRegNo > 31) {
-      reportParseError("unexpected token in statement");
+      reportParseError("unexpected token, expected dollar sign '$'");
       return false;
     }
+  }
+  Parser.Lex(); // Eat "$".
 
-    if (!AssemblerOptions.back()->setATReg(AtRegNo)) {
-      reportParseError("invalid register");
-      return false;
-    }
-    getParser().Lex(); // Eat the register.
+  // Find out what "reg" is.
+  unsigned AtRegNo;
+  const AsmToken &Reg = Parser.getTok();
+  if (Reg.is(AsmToken::Identifier)) {
+    AtRegNo = matchCPURegisterName(Reg.getIdentifier());
+  } else if (Reg.is(AsmToken::Integer)) {
+    AtRegNo = Reg.getIntVal();
+  } else {
+    reportParseError("unexpected token, expected identifier or integer");
+    return false;
+  }
 
-    if (getLexer().isNot(AsmToken::EndOfStatement)) {
-      reportParseError("unexpected token, expected end of statement");
-      return false;
-    }
-    Parser.Lex(); // Consume the EndOfStatement.
+  // Check if $reg is a valid register. If it is, set $at to $reg.
+  if (!AssemblerOptions.back()->setATReg(AtRegNo)) {
+    reportParseError("invalid register");
     return false;
-  } else {
-    reportParseError("unexpected token in statement");
+  }
+  Parser.Lex(); // Eat "reg".
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
     return false;
   }
+
+  getTargetStreamer().emitDirectiveSetAtWithArg(AtRegNo);
+
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
 }
 
 bool MipsAsmParser::parseSetReorderDirective() {
@@ -3118,9 +3618,13 @@ bool MipsAsmParser::parseSetArchDirective() {
           .Case("mips5", "mips5")
           .Case("mips32", "mips32")
           .Case("mips32r2", "mips32r2")
+          .Case("mips32r3", "mips32r3")
+          .Case("mips32r5", "mips32r5")
           .Case("mips32r6", "mips32r6")
           .Case("mips64", "mips64")
           .Case("mips64r2", "mips64r2")
+          .Case("mips64r3", "mips64r3")
+          .Case("mips64r5", "mips64r5")
           .Case("mips64r6", "mips64r6")
           .Case("cnmips", "cnmips")
           .Case("r4000", "mips3") // This is an implementation of Mips3.
@@ -3178,6 +3682,14 @@ bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
     selectArch("mips32r2");
     getTargetStreamer().emitDirectiveSetMips32R2();
     break;
+  case Mips::FeatureMips32r3:
+    selectArch("mips32r3");
+    getTargetStreamer().emitDirectiveSetMips32R3();
+    break;
+  case Mips::FeatureMips32r5:
+    selectArch("mips32r5");
+    getTargetStreamer().emitDirectiveSetMips32R5();
+    break;
   case Mips::FeatureMips32r6:
     selectArch("mips32r6");
     getTargetStreamer().emitDirectiveSetMips32R6();
@@ -3190,6 +3702,14 @@ bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
     selectArch("mips64r2");
     getTargetStreamer().emitDirectiveSetMips64R2();
     break;
+  case Mips::FeatureMips64r3:
+    selectArch("mips64r3");
+    getTargetStreamer().emitDirectiveSetMips64R3();
+    break;
+  case Mips::FeatureMips64r5:
+    selectArch("mips64r5");
+    getTargetStreamer().emitDirectiveSetMips64R5();
+    break;
   case Mips::FeatureMips64r6:
     selectArch("mips64r6");
     getTargetStreamer().emitDirectiveSetMips64R6();
@@ -3294,12 +3814,20 @@ bool MipsAsmParser::parseDirectiveCPSetup() {
   if (!eatComma("unexpected token, expected comma"))
     return true;
 
-  StringRef Name;
-  if (Parser.parseIdentifier(Name))
-    reportParseError("expected identifier");
-  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+  const MCExpr *Expr;
+  if (Parser.parseExpression(Expr)) {
+    reportParseError("expected expression");
+    return false;
+  }
+
+  if (Expr->getKind() != MCExpr::SymbolRef) {
+    reportParseError("expected symbol");
+    return false;
+  }
+  const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
 
-  getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, *Sym, SaveIsReg);
+  getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, Ref->getSymbol(),
+                                           SaveIsReg);
   return false;
 }
 
@@ -3375,12 +3903,20 @@ bool MipsAsmParser::parseDirectiveSet() {
     return parseSetFeature(Mips::FeatureMips32);
   } else if (Tok.getString() == "mips32r2") {
     return parseSetFeature(Mips::FeatureMips32r2);
+  } else if (Tok.getString() == "mips32r3") {
+    return parseSetFeature(Mips::FeatureMips32r3);
+  } else if (Tok.getString() == "mips32r5") {
+    return parseSetFeature(Mips::FeatureMips32r5);
   } else if (Tok.getString() == "mips32r6") {
     return parseSetFeature(Mips::FeatureMips32r6);
   } else if (Tok.getString() == "mips64") {
     return parseSetFeature(Mips::FeatureMips64);
   } else if (Tok.getString() == "mips64r2") {
     return parseSetFeature(Mips::FeatureMips64r2);
+  } else if (Tok.getString() == "mips64r3") {
+    return parseSetFeature(Mips::FeatureMips64r3);
+  } else if (Tok.getString() == "mips64r5") {
+    return parseSetFeature(Mips::FeatureMips64r5);
   } else if (Tok.getString() == "mips64r6") {
     return parseSetFeature(Mips::FeatureMips64r6);
   } else if (Tok.getString() == "dsp") {
@@ -3518,43 +4054,44 @@ bool MipsAsmParser::parseDirectiveModule() {
     return false;
   }
 
-  if (Lexer.is(AsmToken::Identifier)) {
-    StringRef Option = Parser.getTok().getString();
-    Parser.Lex();
-
-    if (Option == "oddspreg") {
-      getTargetStreamer().emitDirectiveModuleOddSPReg(true, isABI_O32());
-      clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+  StringRef Option;
+  if (Parser.parseIdentifier(Option)) {
+    reportParseError("expected .module option identifier");
+    return false;
+  }
 
-      if (getLexer().isNot(AsmToken::EndOfStatement)) {
-        reportParseError("unexpected token, expected end of statement");
-        return false;
-      }
+  if (Option == "oddspreg") {
+    getTargetStreamer().emitDirectiveModuleOddSPReg(true, isABI_O32());
+    clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
 
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
       return false;
-    } else if (Option == "nooddspreg") {
-      if (!isABI_O32()) {
-        Error(L, "'.module nooddspreg' requires the O32 ABI");
-        return false;
-      }
+    }
 
-      getTargetStreamer().emitDirectiveModuleOddSPReg(false, isABI_O32());
-      setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+    return false; // parseDirectiveModule has finished successfully.
+  } else if (Option == "nooddspreg") {
+    if (!isABI_O32()) {
+      Error(L, "'.module nooddspreg' requires the O32 ABI");
+      return false;
+    }
 
-      if (getLexer().isNot(AsmToken::EndOfStatement)) {
-        reportParseError("unexpected token, expected end of statement");
-        return false;
-      }
+    getTargetStreamer().emitDirectiveModuleOddSPReg(false, isABI_O32());
+    setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
 
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
       return false;
-    } else if (Option == "fp") {
-      return parseDirectiveModuleFP();
     }
 
+    return false; // parseDirectiveModule has finished successfully.
+  } else if (Option == "fp") {
+    return parseDirectiveModuleFP();
+  } else {
     return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
   }
-
-  return false;
 }
 
 /// parseDirectiveModuleFP
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index 1f201b0..36ba8e5 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -21,7 +21,6 @@ add_llvm_target(MipsCodeGen
   Mips16ISelDAGToDAG.cpp
   Mips16ISelLowering.cpp
   Mips16RegisterInfo.cpp
-  MipsABIInfo.cpp
   MipsAnalyzeImmediate.cpp
   MipsAsmPrinter.cpp
   MipsCCState.cpp
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 48904ce..8849366 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -30,34 +30,15 @@ typedef MCDisassembler::DecodeStatus DecodeStatus;
 
 namespace {
 
-/// A disasembler class for Mips.
-class MipsDisassemblerBase : public MCDisassembler {
+class MipsDisassembler : public MCDisassembler {
+  bool IsMicroMips;
+  bool IsBigEndian;
 public:
-  MipsDisassemblerBase(const MCSubtargetInfo &STI, MCContext &Ctx,
-                       bool IsBigEndian)
+  MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool IsBigEndian)
       : MCDisassembler(STI, Ctx),
-        IsN64(STI.getFeatureBits() & Mips::FeatureN64),
+        IsMicroMips(STI.getFeatureBits() & Mips::FeatureMicroMips),
         IsBigEndian(IsBigEndian) {}
 
-  virtual ~MipsDisassemblerBase() {}
-
-  bool isN64() const { return IsN64; }
-
-private:
-  bool IsN64;
-protected:
-  bool IsBigEndian;
-};
-
-/// A disasembler class for Mips32.
-class MipsDisassembler : public MipsDisassemblerBase {
-  bool IsMicroMips;
-public:
-  MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool bigEndian)
-      : MipsDisassemblerBase(STI, Ctx, bigEndian) {
-    IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
-  }
-
   bool hasMips3() const { return STI.getFeatureBits() & Mips::FeatureMips3; }
   bool hasMips32() const { return STI.getFeatureBits() & Mips::FeatureMips32; }
   bool hasMips32r6() const {
@@ -77,19 +58,6 @@ public:
                               raw_ostream &CStream) const override;
 };
 
-/// A disasembler class for Mips64.
-class Mips64Disassembler : public MipsDisassemblerBase {
-public:
-  Mips64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
-                     bool bigEndian) :
-    MipsDisassemblerBase(STI, Ctx, bigEndian) {}
-
-  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
-                              ArrayRef<uint8_t> Bytes, uint64_t Address,
-                              raw_ostream &VStream,
-                              raw_ostream &CStream) const override;
-};
-
 } // end anonymous namespace
 
 // Forward declare these because the autogenerated code will reference them.
@@ -109,6 +77,16 @@ static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst,
                                                uint64_t Address,
                                                const void *Decoder);
 
+static DecodeStatus DecodeGPRMM16ZeroRegisterClass(MCInst &Inst,
+                                                   unsigned RegNo,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+
+static DecodeStatus DecodeGPRMM16MovePRegisterClass(MCInst &Inst,
+                                                    unsigned RegNo,
+                                                    uint64_t Address,
+                                                    const void *Decoder);
+
 static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
                                              unsigned RegNo,
                                              uint64_t Address,
@@ -223,6 +201,20 @@ static DecodeStatus DecodeBranchTarget26(MCInst &Inst,
                                          uint64_t Address,
                                          const void *Decoder);
 
+// DecodeBranchTarget7MM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst,
+                                          unsigned Offset,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+// DecodeBranchTarget10MM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst,
+                                           unsigned Offset,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
 // DecodeBranchTargetMM - Decode microMIPS branch offset, which is
 // shifted left by 1 bit.
 static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
@@ -247,9 +239,44 @@ static DecodeStatus DecodeCacheOp(MCInst &Inst,
                               uint64_t Address,
                               const void *Decoder);
 
+static DecodeStatus DecodeCacheOpR6(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder);
+
+static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder);
+
+static DecodeStatus DecodeSyncI(MCInst &Inst,
+                                unsigned Insn,
+                                uint64_t Address,
+                                const void *Decoder);
+
 static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
                                     uint64_t Address, const void *Decoder);
 
+static DecodeStatus DecodeMemMMImm4(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder);
+
+static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
 static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
                                      unsigned Insn,
                                      uint64_t Address,
@@ -272,11 +299,35 @@ static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn,
                                uint64_t Address,
                                const void *Decoder);
 
+static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder);
+
 static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
                                        unsigned Insn,
                                        uint64_t Address,
                                        const void *Decoder);
 
+static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst,
+                                       unsigned Value,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+static DecodeStatus DecodeUImm6Lsl2(MCInst &Inst,
+                                    unsigned Value,
+                                    uint64_t Address,
+                                    const void *Decoder);
+
+static DecodeStatus DecodeLiSimm7(MCInst &Inst,
+                                  unsigned Value,
+                                  uint64_t Address,
+                                  const void *Decoder);
+
+static DecodeStatus DecodeSimm4(MCInst &Inst,
+                                unsigned Value,
+                                uint64_t Address,
+                                const void *Decoder);
+
 static DecodeStatus DecodeSimm16(MCInst &Inst,
                                  unsigned Insn,
                                  uint64_t Address,
@@ -305,6 +356,18 @@ static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
 static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
                                      uint64_t Address, const void *Decoder);
 
+static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeUImm5lsl2(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder);
+
 /// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't
 /// handle.
 template <typename InsnType>
@@ -345,6 +408,14 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
                                          uint64_t Address,
                                          const void *Decoder);
 
+static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
 namespace llvm {
 extern Target TheMipselTarget, TheMipsTarget, TheMips64Target,
               TheMips64elTarget;
@@ -364,20 +435,6 @@ static MCDisassembler *createMipselDisassembler(
   return new MipsDisassembler(STI, Ctx, false);
 }
 
-static MCDisassembler *createMips64Disassembler(
-                       const Target &T,
-                       const MCSubtargetInfo &STI,
-                       MCContext &Ctx) {
-  return new Mips64Disassembler(STI, Ctx, true);
-}
-
-static MCDisassembler *createMips64elDisassembler(
-                       const Target &T,
-                       const MCSubtargetInfo &STI,
-                       MCContext &Ctx) {
-  return new Mips64Disassembler(STI, Ctx, false);
-}
-
 extern "C" void LLVMInitializeMipsDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(TheMipsTarget,
@@ -385,15 +442,15 @@ extern "C" void LLVMInitializeMipsDisassembler() {
   TargetRegistry::RegisterMCDisassembler(TheMipselTarget,
                                          createMipselDisassembler);
   TargetRegistry::RegisterMCDisassembler(TheMips64Target,
-                                         createMips64Disassembler);
+                                         createMipsDisassembler);
   TargetRegistry::RegisterMCDisassembler(TheMips64elTarget,
-                                         createMips64elDisassembler);
+                                         createMipselDisassembler);
 }
 
 #include "MipsGenDisassemblerTables.inc"
 
 static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
-  const MipsDisassemblerBase *Dis = static_cast<const MipsDisassemblerBase*>(D);
+  const MipsDisassembler *Dis = static_cast<const MipsDisassembler*>(D);
   const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo();
   return *(RegInfo->getRegClass(RC).begin() + RegNo);
 }
@@ -700,6 +757,26 @@ static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn,
   return MCDisassembler::Success;
 }
 
+/// Read two bytes from the ArrayRef and return 16 bit halfword sorted
+/// according to the given endianess.
+static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsBigEndian) {
+  // We want to read exactly 2 Bytes of data.
+  if (Bytes.size() < 2) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  if (IsBigEndian) {
+    Insn = (Bytes[0] << 8) | Bytes[1];
+  } else {
+    Insn = (Bytes[1] << 8) | Bytes[0];
+  }
+
+  return MCDisassembler::Success;
+}
+
 /// Read four bytes from the ArrayRef and return 32 bit word sorted
 /// according to the given endianess
 static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -711,15 +788,19 @@ static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
     return MCDisassembler::Fail;
   }
 
+  // High 16 bits of a 32-bit microMIPS instruction (where the opcode is)
+  // always precede the low 16 bits in the instruction stream (that is, they
+  // are placed at lower addresses in the instruction stream).
+  //
+  // microMIPS byte ordering:
+  //   Big-endian:    0 | 1 | 2 | 3
+  //   Little-endian: 1 | 0 | 3 | 2
+
   if (IsBigEndian) {
     // Encoded as a big-endian 32-bit word in the stream.
     Insn =
         (Bytes[3] << 0) | (Bytes[2] << 8) | (Bytes[1] << 16) | (Bytes[0] << 24);
   } else {
-    // Encoded as a small-endian 32-bit word in the stream.
-    // Little-endian byte ordering:
-    //   mips32r2:   4 | 3 | 2 | 1
-    //   microMIPS:  2 | 1 | 4 | 3
     if (IsMicroMips) {
       Insn = (Bytes[2] << 0) | (Bytes[3] << 8) | (Bytes[0] << 16) |
              (Bytes[1] << 24);
@@ -738,14 +819,25 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
                                               raw_ostream &VStream,
                                               raw_ostream &CStream) const {
   uint32_t Insn;
-
-  DecodeStatus Result =
-      readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, IsMicroMips);
-  if (Result == MCDisassembler::Fail)
-    return MCDisassembler::Fail;
+  DecodeStatus Result;
 
   if (IsMicroMips) {
-    DEBUG(dbgs() << "Trying MicroMips32 table (32-bit opcodes):\n");
+    Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian);
+
+    DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n");
+    // Calling the auto-generated decoder function.
+    Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 2;
+      return Result;
+    }
+
+    Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, true);
+    if (Result == MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+
+    DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
     // Calling the auto-generated decoder function.
     Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address,
                                this, STI);
@@ -756,6 +848,10 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
     return MCDisassembler::Fail;
   }
 
+  Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
   if (hasCOP3()) {
     DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
     Result =
@@ -786,39 +882,19 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
     }
   }
 
-  DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
-  // Calling the auto-generated decoder function.
-  Result =
-      decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    return Result;
+  if (isGP64()) {
+    DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips6432, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
   }
 
-  return MCDisassembler::Fail;
-}
-
-DecodeStatus Mips64Disassembler::getInstruction(MCInst &Instr, uint64_t &Size,
-                                                ArrayRef<uint8_t> Bytes,
-                                                uint64_t Address,
-                                                raw_ostream &VStream,
-                                                raw_ostream &CStream) const {
-  uint32_t Insn;
-
-  DecodeStatus Result =
-      readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
-  if (Result == MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-
+  DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
   // Calling the auto-generated decoder function.
   Result =
-      decodeInstruction(DecoderTableMips6432, Instr, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    return Result;
-  }
-  // If we fail to decode in Mips64 decoder space we can try in Mips32
-  Result =
       decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
   if (Result != MCDisassembler::Fail) {
     Size = 4;
@@ -854,7 +930,33 @@ static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst,
                                                unsigned RegNo,
                                                uint64_t Address,
                                                const void *Decoder) {
-  return MCDisassembler::Fail;
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::GPRMM16RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGPRMM16ZeroRegisterClass(MCInst &Inst,
+                                                   unsigned RegNo,
+                                                   uint64_t Address,
+                                                   const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::GPRMM16ZeroRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGPRMM16MovePRegisterClass(MCInst &Inst,
+                                                    unsigned RegNo,
+                                                    uint64_t Address,
+                                                    const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::GPRMM16MovePRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
 }
 
 static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
@@ -872,7 +974,7 @@ static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
                                            unsigned RegNo,
                                            uint64_t Address,
                                            const void *Decoder) {
-  if (static_cast<const MipsDisassembler *>(Decoder)->isN64())
+  if (static_cast<const MipsDisassembler *>(Decoder)->isGP64())
     return DecodeGPR64RegisterClass(Inst, RegNo, Address, Decoder);
 
   return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
@@ -953,7 +1055,8 @@ static DecodeStatus DecodeMem(MCInst &Inst,
   Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
   Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
 
-  if(Inst.getOpcode() == Mips::SC){
+  if(Inst.getOpcode() == Mips::SC ||
+     Inst.getOpcode() == Mips::SCD){
     Inst.addOperand(MCOperand::CreateReg(Reg));
   }
 
@@ -981,6 +1084,55 @@ static DecodeStatus DecodeCacheOp(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  int Offset = SignExtend32<12>(Insn & 0xfff);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+  unsigned Hint = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+  Inst.addOperand(MCOperand::CreateImm(Hint));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCacheOpR6(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  int Offset = fieldFromInstruction(Insn, 7, 9);
+  unsigned Hint = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+  Inst.addOperand(MCOperand::CreateImm(Hint));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSyncI(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
                                     uint64_t Address, const void *Decoder) {
   int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10));
@@ -1027,6 +1179,106 @@ static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeMemMMImm4(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  unsigned Offset = Insn & 0xf;
+  unsigned Reg = fieldFromInstruction(Insn, 7, 3);
+  unsigned Base = fieldFromInstruction(Insn, 4, 3);
+
+  switch (Inst.getOpcode()) {
+    case Mips::LBU16_MM:
+    case Mips::LHU16_MM:
+    case Mips::LW16_MM:
+      if (DecodeGPRMM16RegisterClass(Inst, Reg, Address, Decoder)
+            == MCDisassembler::Fail)
+        return MCDisassembler::Fail;
+      break;
+    case Mips::SB16_MM:
+    case Mips::SH16_MM:
+    case Mips::SW16_MM:
+      if (DecodeGPRMM16ZeroRegisterClass(Inst, Reg, Address, Decoder)
+            == MCDisassembler::Fail)
+        return MCDisassembler::Fail;
+      break;
+  }
+
+  if (DecodeGPRMM16RegisterClass(Inst, Base, Address, Decoder)
+        == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  switch (Inst.getOpcode()) {
+    case Mips::LBU16_MM:
+      if (Offset == 0xf)
+        Inst.addOperand(MCOperand::CreateImm(-1));
+      else
+        Inst.addOperand(MCOperand::CreateImm(Offset));
+      break;
+    case Mips::SB16_MM:
+      Inst.addOperand(MCOperand::CreateImm(Offset));
+      break;
+    case Mips::LHU16_MM:
+    case Mips::SH16_MM:
+      Inst.addOperand(MCOperand::CreateImm(Offset << 1));
+      break;
+    case Mips::LW16_MM:
+    case Mips::SW16_MM:
+      Inst.addOperand(MCOperand::CreateImm(Offset << 2));
+      break;
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  unsigned Offset = Insn & 0x1F;
+  unsigned Reg = fieldFromInstruction(Insn, 5, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Mips::SP));
+  Inst.addOperand(MCOperand::CreateImm(Offset << 2));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  unsigned Offset = Insn & 0x7F;
+  unsigned Reg = fieldFromInstruction(Insn, 7, 3);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+  Inst.addOperand(MCOperand::CreateImm(Offset << 2));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  int Offset = SignExtend32<4>(Insn & 0xf);
+
+  if (DecodeRegListOperand16(Inst, Insn, Address, Decoder)
+      == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateReg(Mips::SP));
+  Inst.addOperand(MCOperand::CreateImm(Offset << 2));
+
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
                                      unsigned Insn,
                                      uint64_t Address,
@@ -1052,6 +1304,9 @@ static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
     // fallthrough
   default:
     Inst.addOperand(MCOperand::CreateReg(Reg));
+    if (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM)
+      Inst.addOperand(MCOperand::CreateReg(Reg+1));
+
     Inst.addOperand(MCOperand::CreateReg(Base));
     Inst.addOperand(MCOperand::CreateImm(Offset));
   }
@@ -1131,6 +1386,23 @@ static DecodeStatus DecodeFMem3(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeFMemCop2R6(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  int Offset = SignExtend32<11>(Insn & 0x07ff);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 11, 5);
+
+  Reg = getReg(Decoder, Mips::COP2RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
 static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
                                        unsigned Insn,
                                        uint64_t Address,
@@ -1324,6 +1596,24 @@ static DecodeStatus DecodeBranchTarget26(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst,
+                                          unsigned Offset,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<7>(Offset) << 1;
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst,
+                                           unsigned Offset,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<10>(Offset) << 1;
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
                                          unsigned Offset,
                                          uint64_t Address,
@@ -1342,6 +1632,46 @@ static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst,
+                                       unsigned Value,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  if (Value == 0)
+    Inst.addOperand(MCOperand::CreateImm(1));
+  else if (Value == 0x7)
+    Inst.addOperand(MCOperand::CreateImm(-1));
+  else
+    Inst.addOperand(MCOperand::CreateImm(Value << 2));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeUImm6Lsl2(MCInst &Inst,
+                                    unsigned Value,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(Value << 2));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLiSimm7(MCInst &Inst,
+                                  unsigned Value,
+                                  uint64_t Address,
+                                  const void *Decoder) {
+  if (Value == 0x7F)
+    Inst.addOperand(MCOperand::CreateImm(-1));
+  else
+    Inst.addOperand(MCOperand::CreateImm(Value));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSimm4(MCInst &Inst,
+                                unsigned Value,
+                                uint64_t Address,
+                                const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<4>(Value)));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeSimm16(MCInst &Inst,
                                  unsigned Insn,
                                  uint64_t Address,
@@ -1391,6 +1721,36 @@ static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  int32_t DecodedValue;
+  switch (Insn) {
+  case 0: DecodedValue = 256; break;
+  case 1: DecodedValue = 257; break;
+  case 510: DecodedValue = -258; break;
+  case 511: DecodedValue = -257; break;
+  default: DecodedValue = SignExtend32<9>(Insn); break;
+  }
+  Inst.addOperand(MCOperand::CreateImm(DecodedValue * 4));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  // Insn must be >= 0, since it is unsigned that condition is always true.
+  assert(Insn < 16);
+  int32_t DecodedValues[] = {128, 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64,
+                             255, 32768, 65535};
+  Inst.addOperand(MCOperand::CreateImm(DecodedValues[Insn]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeUImm5lsl2(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(Insn << 2));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeRegListOperand(MCInst &Inst,
                                          unsigned Insn,
                                          uint64_t Address,
@@ -1413,3 +1773,69 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst,
 
   return MCDisassembler::Success;
 }
+
+static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3};
+  unsigned RegLst = fieldFromInstruction(Insn, 4, 2);
+  unsigned RegNum = RegLst & 0x3;
+
+  for (unsigned i = 0; i <= RegNum; i++)
+    Inst.addOperand(MCOperand::CreateReg(Regs[i]));
+
+  Inst.addOperand(MCOperand::CreateReg(Mips::RA));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
+                                       uint64_t Address, const void *Decoder) {
+
+  unsigned RegPair = fieldFromInstruction(Insn, 7, 3);
+
+  switch (RegPair) {
+  default:
+    return MCDisassembler::Fail;
+  case 0:
+    Inst.addOperand(MCOperand::CreateReg(Mips::A1));
+    Inst.addOperand(MCOperand::CreateReg(Mips::A2));
+    break;
+  case 1:
+    Inst.addOperand(MCOperand::CreateReg(Mips::A1));
+    Inst.addOperand(MCOperand::CreateReg(Mips::A3));
+    break;
+  case 2:
+    Inst.addOperand(MCOperand::CreateReg(Mips::A2));
+    Inst.addOperand(MCOperand::CreateReg(Mips::A3));
+    break;
+  case 3:
+    Inst.addOperand(MCOperand::CreateReg(Mips::A0));
+    Inst.addOperand(MCOperand::CreateReg(Mips::S5));
+    break;
+  case 4:
+    Inst.addOperand(MCOperand::CreateReg(Mips::A0));
+    Inst.addOperand(MCOperand::CreateReg(Mips::S6));
+    break;
+  case 5:
+    Inst.addOperand(MCOperand::CreateReg(Mips::A0));
+    Inst.addOperand(MCOperand::CreateReg(Mips::A1));
+    break;
+  case 6:
+    Inst.addOperand(MCOperand::CreateReg(Mips::A0));
+    Inst.addOperand(MCOperand::CreateReg(Mips::A2));
+    break;
+  case 7:
+    Inst.addOperand(MCOperand::CreateReg(Mips::A0));
+    Inst.addOperand(MCOperand::CreateReg(Mips::A3));
+    break;
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<23>(Insn) << 2));
+  return MCDisassembler::Success;
+}
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index ab6b225..aad549d 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -134,8 +134,8 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
   } else if (const MipsMCExpr *ME = dyn_cast<MipsMCExpr>(Expr)) {
     ME->print(OS);
     return;
-  } else if (!(SRE = dyn_cast<MCSymbolRefExpr>(Expr)))
-    assert(false && "Unexpected MCExpr type.");
+  } else
+    SRE = cast<MCSymbolRefExpr>(Expr);
 
   MCSymbolRefExpr::VariantKind Kind = SRE->getKind();
 
@@ -233,6 +233,8 @@ printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
     break;
   case Mips::SWM32_MM:
   case Mips::LWM32_MM:
+  case Mips::SWM16_MM:
+  case Mips::LWM16_MM:
     opNum = MI->getNumOperands() - 2;
     break;
   }
@@ -260,6 +262,11 @@ printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) {
 }
 
 void MipsInstPrinter::
+printRegisterPair(const MCInst *MI, int opNum, raw_ostream &O) {
+  printRegName(O, MI->getOperand(opNum).getReg());
+}
+
+void MipsInstPrinter::
 printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) {
   llvm_unreachable("TODO");
 }
@@ -283,6 +290,7 @@ bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
 bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) {
   switch (MI.getOpcode()) {
   case Mips::BEQ:
+  case Mips::BEQ_MM:
     // beq $zero, $zero, $L2 => b $L2
     // beq $r0, $zero, $L2 => beqz $r0, $L2
     return (isReg<Mips::ZERO>(MI, 0) && isReg<Mips::ZERO>(MI, 1) &&
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 42df013..468dc07 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -99,6 +99,7 @@ private:
   void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
   void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
   void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
+  void printRegisterPair(const MCInst *MI, int opNum, raw_ostream &O);
   void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O);
 
   bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo,
diff --git a/lib/Target/Mips/MCTargetDesc/Android.mk b/lib/Target/Mips/MCTargetDesc/Android.mk
index 89e132d..7f462d3 100644
--- a/lib/Target/Mips/MCTargetDesc/Android.mk
+++ b/lib/Target/Mips/MCTargetDesc/Android.mk
@@ -8,6 +8,7 @@ mips_mc_desc_TBLGEN_TABLES := \
 
 mips_mc_desc_SRC_FILES := \
   MipsABIFlagsSection.cpp \
+  MipsABIInfo.cpp \
   MipsAsmBackend.cpp \
   MipsELFObjectWriter.cpp \
   MipsELFStreamer.cpp \
diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index 6b3788c..c63af7c 100644
--- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_llvm_library(LLVMMipsDesc
+  MipsABIInfo.cpp
   MipsABIFlagsSection.cpp
   MipsAsmBackend.cpp
   MipsELFObjectWriter.cpp
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
index 8bcfb0f..473f4f2 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -145,6 +145,10 @@ public:
       ISALevel = 64;
       if (P.hasMips64r6())
         ISARevision = 6;
+      else if (P.hasMips64r5())
+        ISARevision = 5;
+      else if (P.hasMips64r3())
+        ISARevision = 3;
       else if (P.hasMips64r2())
         ISARevision = 2;
       else
@@ -153,6 +157,10 @@ public:
       ISALevel = 32;
       if (P.hasMips32r6())
         ISARevision = 6;
+      else if (P.hasMips32r5())
+        ISARevision = 5;
+      else if (P.hasMips32r3())
+        ISARevision = 3;
       else if (P.hasMips32r2())
         ISARevision = 2;
       else
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
new file mode 100644
index 0000000..faf9741
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
@@ -0,0 +1,92 @@
+//===---- MipsABIInfo.cpp - Information about MIPS ABI's ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsABIInfo.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCTargetOptions.h"
+
+using namespace llvm;
+
+namespace {
+static const MCPhysReg O32IntRegs[4] = {Mips::A0, Mips::A1, Mips::A2, Mips::A3};
+
+static const MCPhysReg Mips64IntRegs[8] = {
+    Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
+    Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64};
+}
+
+const ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const {
+  if (IsO32())
+    return makeArrayRef(O32IntRegs);
+  if (IsN32() || IsN64())
+    return makeArrayRef(Mips64IntRegs);
+  llvm_unreachable("Unhandled ABI");
+}
+
+const ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const {
+  if (IsO32())
+    return makeArrayRef(O32IntRegs);
+  if (IsN32() || IsN64())
+    return makeArrayRef(Mips64IntRegs);
+  llvm_unreachable("Unhandled ABI");
+}
+
+unsigned MipsABIInfo::GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const {
+  if (IsO32())
+    return CC != CallingConv::Fast ? 16 : 0;
+  if (IsN32() || IsN64() || IsEABI())
+    return 0;
+  llvm_unreachable("Unhandled ABI");
+}
+
+MipsABIInfo MipsABIInfo::computeTargetABI(Triple TT, StringRef CPU,
+                                          const MCTargetOptions &Options) {
+  if (Options.getABIName().startswith("o32"))
+    return MipsABIInfo::O32();
+  else if (Options.getABIName().startswith("n32"))
+    return MipsABIInfo::N32();
+  else if (Options.getABIName().startswith("n64"))
+    return MipsABIInfo::N64();
+  else if (Options.getABIName().startswith("eabi"))
+    return MipsABIInfo::EABI();
+  else if (!Options.getABIName().empty())
+    llvm_unreachable("Unknown ABI option for MIPS");
+
+  // FIXME: This shares code with the selectMipsCPU routine that's
+  // used and not shared in a couple of other places. This needs unifying
+  // at some level.
+  if (CPU.empty() || CPU == "generic") {
+    if (TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel)
+      CPU = "mips32";
+    else
+      CPU = "mips64";
+  }
+
+  return StringSwitch<MipsABIInfo>(CPU)
+      .Case("mips1", MipsABIInfo::O32())
+      .Case("mips2", MipsABIInfo::O32())
+      .Case("mips32", MipsABIInfo::O32())
+      .Case("mips32r2", MipsABIInfo::O32())
+      .Case("mips32r3", MipsABIInfo::O32())
+      .Case("mips32r5", MipsABIInfo::O32())
+      .Case("mips32r6", MipsABIInfo::O32())
+      .Case("mips16", MipsABIInfo::O32())
+      .Case("mips3", MipsABIInfo::N64())
+      .Case("mips4", MipsABIInfo::N64())
+      .Case("mips5", MipsABIInfo::N64())
+      .Case("mips64", MipsABIInfo::N64())
+      .Case("mips64r2", MipsABIInfo::N64())
+      .Case("mips64r3", MipsABIInfo::N64())
+      .Case("mips64r5", MipsABIInfo::N64())
+      .Case("mips64r6", MipsABIInfo::N64())
+      .Case("octeon", MipsABIInfo::N64())
+      .Default(MipsABIInfo::Unknown());
+}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
new file mode 100644
index 0000000..008e08e
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
@@ -0,0 +1,67 @@
+//===---- MipsABIInfo.h - Information about MIPS ABI's --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIINFO_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIINFO_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+
+class MCTargetOptions;
+class StringRef;
+
+class MipsABIInfo {
+public:
+  enum class ABI { Unknown, O32, N32, N64, EABI };
+
+protected:
+  ABI ThisABI;
+
+public:
+  MipsABIInfo(ABI ThisABI) : ThisABI(ThisABI) {}
+
+  static MipsABIInfo Unknown() { return MipsABIInfo(ABI::Unknown); }
+  static MipsABIInfo O32() { return MipsABIInfo(ABI::O32); }
+  static MipsABIInfo N32() { return MipsABIInfo(ABI::N32); }
+  static MipsABIInfo N64() { return MipsABIInfo(ABI::N64); }
+  static MipsABIInfo EABI() { return MipsABIInfo(ABI::EABI); }
+  static MipsABIInfo computeTargetABI(Triple TT, StringRef CPU,
+                                      const MCTargetOptions &Options);
+
+  bool IsKnown() const { return ThisABI != ABI::Unknown; }
+  bool IsO32() const { return ThisABI == ABI::O32; }
+  bool IsN32() const { return ThisABI == ABI::N32; }
+  bool IsN64() const { return ThisABI == ABI::N64; }
+  bool IsEABI() const { return ThisABI == ABI::EABI; }
+  ABI GetEnumValue() const { return ThisABI; }
+
+  /// The registers to use for byval arguments.
+  const ArrayRef<MCPhysReg> GetByValArgRegs() const;
+
+  /// The registers to use for the variable argument list.
+  const ArrayRef<MCPhysReg> GetVarArgRegs() const;
+
+  /// Obtain the size of the area allocated by the callee for arguments.
+  /// CallingConv::FastCall affects the value for O32.
+  unsigned GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const;
+
+  /// Ordering of ABI's
+  /// MipsGenSubtargetInfo.inc will use this to resolve conflicts when given
+  /// multiple ABI options.
+  bool operator<(const MipsABIInfo Other) const {
+    return ThisABI < Other.GetEnumValue();
+  }
+};
+}
+
+#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index efeb54d..acf6f21 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -103,6 +103,22 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   case Mips::fixup_MICROMIPS_26_S1:
     Value >>= 1;
     break;
+  case Mips::fixup_MICROMIPS_PC7_S1:
+    Value -= 4;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 2;
+    // We now check if Value can be encoded as a 7-bit signed immediate.
+    if (!isIntN(7, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC7 fixup");
+    break;
+  case Mips::fixup_MICROMIPS_PC10_S1:
+    Value -= 2;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 2;
+    // We now check if Value can be encoded as a 10-bit signed immediate.
+    if (!isIntN(10, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC10 fixup");
+    break;
   case Mips::fixup_MICROMIPS_PC16_S1:
     Value -= 4;
     // Forcing a signed division because Value can be negative.
@@ -149,7 +165,8 @@ MCObjectWriter *MipsAsmBackend::createObjectWriter(raw_ostream &OS) const {
 //   microMIPS:  x | x | a | b
 
 static bool needsMMLEByteOrder(unsigned Kind) {
-  return Kind >= Mips::fixup_MICROMIPS_26_S1 &&
+  return Kind != Mips::fixup_MICROMIPS_PC10_S1 &&
+         Kind >= Mips::fixup_MICROMIPS_26_S1 &&
          Kind < Mips::LastTargetFixupKind;
 }
 
@@ -182,6 +199,7 @@ void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
   switch ((unsigned)Kind) {
   case FK_Data_2:
   case Mips::fixup_Mips_16:
+  case Mips::fixup_MICROMIPS_PC10_S1:
     FullSize = 2;
     break;
   case FK_Data_8:
@@ -271,6 +289,8 @@ getFixupKindInfo(MCFixupKind Kind) const {
     { "fixup_MICROMIPS_HI16",    0,     16,   0 },
     { "fixup_MICROMIPS_LO16",    0,     16,   0 },
     { "fixup_MICROMIPS_GOT16",   0,     16,   0 },
+    { "fixup_MICROMIPS_PC7_S1",  0,      7,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC10_S1", 0,     10,   MCFixupKindInfo::FKF_IsPCRel },
     { "fixup_MICROMIPS_PC16_S1", 0,     16,   MCFixupKindInfo::FKF_IsPCRel },
     { "fixup_MICROMIPS_CALL16",  0,     16,   0 },
     { "fixup_MICROMIPS_GOT_DISP",        0,     16,   0 },
@@ -334,6 +354,8 @@ getFixupKindInfo(MCFixupKind Kind) const {
     { "fixup_MICROMIPS_HI16",   16,     16,   0 },
     { "fixup_MICROMIPS_LO16",   16,     16,   0 },
     { "fixup_MICROMIPS_GOT16",  16,     16,   0 },
+    { "fixup_MICROMIPS_PC7_S1",  9,      7,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC10_S1", 6,     10,   MCFixupKindInfo::FKF_IsPCRel },
     { "fixup_MICROMIPS_PC16_S1",16,     16,   MCFixupKindInfo::FKF_IsPCRel },
     { "fixup_MICROMIPS_CALL16", 16,     16,   0 },
     { "fixup_MICROMIPS_GOT_DISP",        16,     16,   0 },
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index d4f4983..dd0e54c 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -16,8 +16,8 @@
 #define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSASMBACKEND_H
 
 #include "MCTargetDesc/MipsFixupKinds.h"
-#include "llvm/MC/MCAsmBackend.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmBackend.h"
 
 namespace llvm {
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 4ea7846..e14dc8d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -11,6 +11,7 @@
 #include "MCTargetDesc/MipsFixupKinds.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSection.h"
@@ -161,6 +162,12 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
   case Mips::fixup_MICROMIPS_GOT16:
     Type = ELF::R_MICROMIPS_GOT16;
     break;
+  case Mips::fixup_MICROMIPS_PC7_S1:
+    Type = ELF::R_MICROMIPS_PC7_S1;
+    break;
+  case Mips::fixup_MICROMIPS_PC10_S1:
+    Type = ELF::R_MICROMIPS_PC10_S1;
+    break;
   case Mips::fixup_MICROMIPS_PC16_S1:
     Type = ELF::R_MICROMIPS_PC16_S1;
     break;
@@ -219,7 +226,7 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
 bool
 MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
                                              unsigned Type) const {
-  // FIXME: This is extremelly conservative. This really needs to use a
+  // FIXME: This is extremely conservative. This really needs to use a
   // whitelist with a clear explanation for why each realocation needs to
   // point to the symbol, not to the section.
   switch (Type) {
@@ -244,8 +251,11 @@ MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
   case ELF::R_MICROMIPS_LO16:
     return true;
 
-  case ELF::R_MIPS_26:
   case ELF::R_MIPS_32:
+    if (MCELF::getOther(SD) & (ELF::STO_MIPS_MICROMIPS >> 2))
+      return true;
+    // falltrough
+  case ELF::R_MIPS_26:
   case ELF::R_MIPS_64:
   case ELF::R_MIPS_GPREL16:
     return false;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index 136146b..bc76d8a 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -37,7 +37,7 @@ public:
                   MCCodeEmitter *Emitter, const MCSubtargetInfo &STI)
       : MCELFStreamer(Context, MAB, OS, Emitter) {
 
-    RegInfoRecord = new MipsRegInfoRecord(this, Context, STI);
+    RegInfoRecord = new MipsRegInfoRecord(this, Context);
     MipsOptionRecords.push_back(
         std::unique_ptr<MipsRegInfoRecord>(RegInfoRecord));
   }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index 317db16..fa8d6a6 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -158,6 +158,12 @@ namespace Mips {
     // resulting in - R_MICROMIPS_GOT16
     fixup_MICROMIPS_GOT16,
 
+    // resulting in - R_MICROMIPS_PC7_S1
+    fixup_MICROMIPS_PC7_S1,
+
+    // resulting in - R_MICROMIPS_PC10_S1
+    fixup_MICROMIPS_PC10_S1,
+
     // resulting in - R_MICROMIPS_PC16_S1
     fixup_MICROMIPS_PC16_S1,
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 2f5d196..e2bd5a8 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -34,6 +34,7 @@ MipsMCAsmInfo::MipsMCAsmInfo(StringRef TT) {
   Data32bitsDirective         = "\t.4byte\t";
   Data64bitsDirective         = "\t.8byte\t";
   PrivateGlobalPrefix         = "$";
+  PrivateLabelPrefix          = "$";
   CommentString               = "#";
   ZeroDirective               = "\t.space\t";
   GPRel32Directive            = "\t.gpword\t";
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index d632c27..8208725 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -20,9 +20,9 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -173,7 +173,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   // Unfortunately in MIPS both NOP and SLL will come in with Binary == 0
   // so we have to special check for them.
   unsigned Opcode = TmpInst.getOpcode();
-  if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) && !Binary)
+  if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) &&
+      (Opcode != Mips::SLL_MM) && !Binary)
     llvm_unreachable("unimplemented opcode in EncodeInstruction()");
 
   if (STI.getFeatureBits() & Mips::FeatureMicroMips) {
@@ -219,6 +220,50 @@ getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
   return 0;
 }
 
+/// getBranchTarget7OpValueMM - Return binary encoding of the microMIPS branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget7OpValueMM(const MCInst &MI, unsigned OpNo,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValueMM expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MICROMIPS_PC7_S1)));
+  return 0;
+}
+
+/// getBranchTargetOpValueMMPC10 - Return binary encoding of the microMIPS
+/// 10-bit branch target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMMPC10(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValuePC10 expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                   MCFixupKind(Mips::fixup_MICROMIPS_PC10_S1)));
+  return 0;
+}
+
 /// getBranchTargetOpValue - Return binary encoding of the microMIPS branch
 /// target operand. If the machine operand requires relocation,
 /// record the relocation and return zero.
@@ -635,6 +680,77 @@ MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo,
 }
 
 unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 6-4, offset is encoded in bits 3-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),
+                                       Fixups, STI) << 4;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1),
+                                       Fixups, STI);
+
+  return (OffBits & 0xF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4Lsl1(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 6-4, offset is encoded in bits 3-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),
+                                       Fixups, STI) << 4;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1),
+                                       Fixups, STI) >> 1;
+
+  return (OffBits & 0xF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4Lsl2(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 6-4, offset is encoded in bits 3-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),
+                                       Fixups, STI) << 4;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1),
+                                       Fixups, STI) >> 2;
+
+  return (OffBits & 0xF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMSPImm5Lsl2(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  // Register is encoded in bits 9-5, offset is encoded in bits 4-0.
+  assert(MI.getOperand(OpNo).isReg() &&
+         MI.getOperand(OpNo).getReg() == Mips::SP &&
+         "Unexpected base register!");
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1),
+                                       Fixups, STI) >> 2;
+
+  return OffBits & 0x1F;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  // Register is encoded in bits 9-7, offset is encoded in bits 6-0.
+  assert(MI.getOperand(OpNo).isReg() &&
+         MI.getOperand(OpNo).getReg() == Mips::GP &&
+         "Unexpected base register!");
+
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1),
+                                       Fixups, STI) >> 2;
+
+  return OffBits & 0x7F;
+}
+
+unsigned MipsMCCodeEmitter::
 getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
                       SmallVectorImpl<MCFixup> &Fixups,
                       const MCSubtargetInfo &STI) const {
@@ -657,6 +773,30 @@ getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
   return (OffBits & 0x0FFF) | RegBits;
 }
 
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  // opNum can be invalid if instruction had reglist as operand
+  // MemOperand is always last operand of instruction (base + offset)
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case Mips::SWM16_MM:
+  case Mips::LWM16_MM:
+    OpNo = MI.getNumOperands() - 2;
+    break;
+  }
+
+  // Offset is encoded in bits 4-0.
+  assert(MI.getOperand(OpNo).isReg());
+  // Base register is always SP - thus it is not encoded.
+  assert(MI.getOperand(OpNo+1).isImm());
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+  return ((OffBits >> 2) & 0x0F);
+}
+
 unsigned
 MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
                                       SmallVectorImpl<MCFixup> &Fixups,
@@ -788,4 +928,64 @@ MipsMCCodeEmitter::getRegisterListOpValue(const MCInst &MI, unsigned OpNo,
   return res;
 }
 
+unsigned
+MipsMCCodeEmitter::getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  return (MI.getNumOperands() - 4);
+}
+
+unsigned
+MipsMCCodeEmitter::getRegisterPairOpValue(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
+}
+
+unsigned
+MipsMCCodeEmitter::getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  unsigned res = 0;
+
+  if (MI.getOperand(0).getReg() == Mips::A1 &&
+      MI.getOperand(1).getReg() == Mips::A2)
+    res = 0;
+  else if (MI.getOperand(0).getReg() == Mips::A1 &&
+           MI.getOperand(1).getReg() == Mips::A3)
+    res = 1;
+  else if (MI.getOperand(0).getReg() == Mips::A2 &&
+           MI.getOperand(1).getReg() == Mips::A3)
+    res = 2;
+  else if (MI.getOperand(0).getReg() == Mips::A0 &&
+           MI.getOperand(1).getReg() == Mips::S5)
+    res = 3;
+  else if (MI.getOperand(0).getReg() == Mips::A0 &&
+           MI.getOperand(1).getReg() == Mips::S6)
+    res = 4;
+  else if (MI.getOperand(0).getReg() == Mips::A0 &&
+           MI.getOperand(1).getReg() == Mips::A1)
+    res = 5;
+  else if (MI.getOperand(0).getReg() == Mips::A0 &&
+           MI.getOperand(1).getReg() == Mips::A2)
+    res = 6;
+  else if (MI.getOperand(0).getReg() == Mips::A0 &&
+           MI.getOperand(1).getReg() == Mips::A3)
+    res = 7;
+
+  return res;
+}
+
+unsigned
+MipsMCCodeEmitter::getSimm23Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  assert(MO.isImm() && "getSimm23Lsl2Encoding expects only an immediate");
+  // The immediate is encoded as 'immediate >> 2'.
+  unsigned Res = static_cast<unsigned>(MO.getImm());
+  assert((Res & 3) == 0);
+  return Res >> 2;
+}
+
 #include "MipsGenMCCodeEmitter.inc"
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 9016fcf..b01726d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -31,8 +31,8 @@ class MCSubtargetInfo;
 class raw_ostream;
 
 class MipsMCCodeEmitter : public MCCodeEmitter {
-  MipsMCCodeEmitter(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
-  void operator=(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  MipsMCCodeEmitter(const MipsMCCodeEmitter &) = delete;
+  void operator=(const MipsMCCodeEmitter &) = delete;
   const MCInstrInfo &MCII;
   MCContext &Ctx;
   bool IsLittleEndian;
@@ -101,6 +101,20 @@ public:
                                   SmallVectorImpl<MCFixup> &Fixups,
                                   const MCSubtargetInfo &STI) const;
 
+  // getBranchTarget7OpValue - Return binary encoding of the microMIPS branch
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTarget7OpValueMM(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  // getBranchTargetOpValueMMPC10 - Return binary encoding of the microMIPS
+  // 10-bit branch target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTargetOpValueMMPC10(const MCInst &MI, unsigned OpNo,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const;
+
   // getBranchTargetOpValue - Return binary encoding of the microMIPS branch
   // target operand. If the machine operand requires relocation,
   // record the relocation and return zero.
@@ -142,9 +156,27 @@ public:
   unsigned getMemEncoding(const MCInst &MI, unsigned OpNo,
                           SmallVectorImpl<MCFixup> &Fixups,
                           const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm4(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm4Lsl1(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm4Lsl2(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMSPImm5Lsl2(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
   unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
                                  SmallVectorImpl<MCFixup> &Fixups,
                                  const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
   unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
                               SmallVectorImpl<MCFixup> &Fixups,
                               const MCSubtargetInfo &STI) const;
@@ -172,12 +204,28 @@ public:
                             SmallVectorImpl<MCFixup> &Fixups,
                             const MCSubtargetInfo &STI) const;
 
+  unsigned getRegisterPairOpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  unsigned getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  unsigned getSimm23Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
   unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
                           const MCSubtargetInfo &STI) const;
 
   unsigned getRegisterListOpValue(const MCInst &MI, unsigned OpNo,
                                   SmallVectorImpl<MCFixup> &Fixups,
                                   const MCSubtargetInfo &STI) const;
+
+  unsigned getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
 }; // class MipsMCCodeEmitter
 } // namespace llvm.
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index bab4254..9b56067 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -43,7 +43,7 @@ using namespace llvm;
 
 /// Select the Mips CPU for the given triple and cpu name.
 /// FIXME: Merge with the copy in MipsSubtarget.cpp
-static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) {
+StringRef MIPS_MC::selectMipsCPU(StringRef TT, StringRef CPU) {
   if (CPU.empty() || CPU == "generic") {
     Triple TheTriple(TT);
     if (TheTriple.getArch() == Triple::mips ||
@@ -69,7 +69,7 @@ static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) {
 
 static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU,
                                                   StringRef FS) {
-  CPU = selectMipsCPU(TT, CPU);
+  CPU = MIPS_MC::selectMipsCPU(TT, CPU);
   MCSubtargetInfo *X = new MCSubtargetInfo();
   InitMipsMCSubtargetInfo(X, TT, CPU, FS);
   return X;
@@ -130,10 +130,8 @@ createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
   return S;
 }
 
-static MCStreamer *createMipsNullStreamer(MCContext &Ctx) {
-  MCStreamer *S = llvm::createNullStreamer(Ctx);
-  new MipsTargetStreamer(*S);
-  return S;
+static MCTargetStreamer *createMipsNullTargetStreamer(MCStreamer &S) {
+  return new MipsTargetStreamer(S);
 }
 
 extern "C" void LLVMInitializeMipsTargetMC() {
@@ -190,11 +188,14 @@ extern "C" void LLVMInitializeMipsTargetMC() {
   TargetRegistry::RegisterAsmStreamer(TheMips64Target, createMCAsmStreamer);
   TargetRegistry::RegisterAsmStreamer(TheMips64elTarget, createMCAsmStreamer);
 
-  TargetRegistry::RegisterNullStreamer(TheMipsTarget, createMipsNullStreamer);
-  TargetRegistry::RegisterNullStreamer(TheMipselTarget, createMipsNullStreamer);
-  TargetRegistry::RegisterNullStreamer(TheMips64Target, createMipsNullStreamer);
-  TargetRegistry::RegisterNullStreamer(TheMips64elTarget,
-                                       createMipsNullStreamer);
+  TargetRegistry::RegisterNullTargetStreamer(TheMipsTarget,
+                                             createMipsNullTargetStreamer);
+  TargetRegistry::RegisterNullTargetStreamer(TheMipselTarget,
+                                             createMipsNullTargetStreamer);
+  TargetRegistry::RegisterNullTargetStreamer(TheMips64Target,
+                                             createMipsNullTargetStreamer);
+  TargetRegistry::RegisterNullTargetStreamer(TheMips64elTarget,
+                                             createMipsNullTargetStreamer);
 
   // Register the asm backend.
   TargetRegistry::RegisterMCAsmBackend(TheMipsTarget,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index f08a8f4..9528b4e 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -55,10 +55,13 @@ MCAsmBackend *createMipsAsmBackendEL64(const Target &T,
                                        const MCRegisterInfo &MRI, StringRef TT,
                                        StringRef CPU);
 
-MCObjectWriter *createMipsELFObjectWriter(raw_ostream &OS,
-                                          uint8_t OSABI,
-                                          bool IsLittleEndian,
-                                          bool Is64Bit);
+MCObjectWriter *createMipsELFObjectWriter(raw_ostream &OS, uint8_t OSABI,
+                                          bool IsLittleEndian, bool Is64Bit);
+
+namespace MIPS_MC {
+StringRef selectMipsCPU(StringRef TT, StringRef CPU);
+}
+
 } // End llvm namespace
 
 // Defines symbolic names for Mips registers.  This defines a mapping from
diff --git a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
index 0ef2208..188e3e8 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -9,14 +9,15 @@
 
 #include "MipsOptionRecord.h"
 #include "MipsELFStreamer.h"
+#include "MipsTargetStreamer.h"
 #include "llvm/MC/MCSectionELF.h"
 
 using namespace llvm;
 
 void MipsRegInfoRecord::EmitMipsOptionRecord() {
   MCAssembler &MCA = Streamer->getAssembler();
-  Triple T(STI.getTargetTriple());
-  uint64_t Features = STI.getFeatureBits();
+  MipsTargetStreamer *MTS =
+      static_cast<MipsTargetStreamer *>(Streamer->getTargetStreamer());
 
   Streamer->PushSection();
 
@@ -24,17 +25,16 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() {
   // we don't emit .Mips.options for other ELFs other than N64.
   // Since .reginfo has the same information as .Mips.options (ODK_REGINFO),
   // we can use the same abstraction (MipsRegInfoRecord class) to handle both.
-  if (Features & Mips::FeatureN64) {
+  if (MTS->getABI().IsN64()) {
     // The EntrySize value of 1 seems strange since the records are neither
     // 1-byte long nor fixed length but it matches the value GAS emits.
     const MCSectionELF *Sec =
         Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS,
-                              ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP,
-                              SectionKind::getMetadata(), 1, "");
+                              ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, 1, "");
     MCA.getOrCreateSectionData(*Sec).setAlignment(8);
     Streamer->SwitchSection(Sec);
 
-    Streamer->EmitIntValue(1, 1);  // kind
+    Streamer->EmitIntValue(ELF::ODK_REGINFO, 1);  // kind
     Streamer->EmitIntValue(40, 1); // size
     Streamer->EmitIntValue(0, 2);  // section
     Streamer->EmitIntValue(0, 4);  // info
@@ -46,11 +46,10 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() {
     Streamer->EmitIntValue(ri_cprmask[3], 4);
     Streamer->EmitIntValue(ri_gp_value, 8);
   } else {
-    const MCSectionELF *Sec =
-        Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC,
-                              SectionKind::getMetadata(), 24, "");
+    const MCSectionELF *Sec = Context.getELFSection(
+        ".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC, 24, "");
     MCA.getOrCreateSectionData(*Sec)
-        .setAlignment(Features & Mips::FeatureN32 ? 8 : 4);
+        .setAlignment(MTS->getABI().IsN32() ? 8 : 4);
     Streamer->SwitchSection(Sec);
 
     Streamer->EmitIntValue(ri_gprmask, 4);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 1e092f2..64d7cab 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -43,6 +43,9 @@ void MipsTargetStreamer::emitDirectiveSetNoMacro() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetMsa() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetNoMsa() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetAt() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetAtWithArg(unsigned RegNo) {
+  forbidModuleDirective();
+}
 void MipsTargetStreamer::emitDirectiveSetNoAt() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveEnd(StringRef Name) {}
 void MipsTargetStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {}
@@ -67,9 +70,13 @@ void MipsTargetStreamer::emitDirectiveSetMips4() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetMips5() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetMips32() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetMips32R2() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips32R3() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips32R5() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetMips32R6() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetMips64() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetMips64R2() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips64R3() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips64R5() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetMips64R6() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetPop() {}
 void MipsTargetStreamer::emitDirectiveSetPush() {}
@@ -144,6 +151,11 @@ void MipsTargetAsmStreamer::emitDirectiveSetAt() {
   MipsTargetStreamer::emitDirectiveSetAt();
 }
 
+void MipsTargetAsmStreamer::emitDirectiveSetAtWithArg(unsigned RegNo) {
+  OS << "\t.set\tat=$" << Twine(RegNo) << "\n";
+  MipsTargetStreamer::emitDirectiveSetAtWithArg(RegNo);
+}
+
 void MipsTargetAsmStreamer::emitDirectiveSetNoAt() {
   OS << "\t.set\tnoat\n";
   MipsTargetStreamer::emitDirectiveSetNoAt();
@@ -223,6 +235,16 @@ void MipsTargetAsmStreamer::emitDirectiveSetMips32R2() {
   MipsTargetStreamer::emitDirectiveSetMips32R2();
 }
 
+void MipsTargetAsmStreamer::emitDirectiveSetMips32R3() {
+  OS << "\t.set\tmips32r3\n";
+  MipsTargetStreamer::emitDirectiveSetMips32R3();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips32R5() {
+  OS << "\t.set\tmips32r5\n";
+  MipsTargetStreamer::emitDirectiveSetMips32R5();
+}
+
 void MipsTargetAsmStreamer::emitDirectiveSetMips32R6() {
   OS << "\t.set\tmips32r6\n";
   MipsTargetStreamer::emitDirectiveSetMips32R6();
@@ -238,6 +260,16 @@ void MipsTargetAsmStreamer::emitDirectiveSetMips64R2() {
   MipsTargetStreamer::emitDirectiveSetMips64R2();
 }
 
+void MipsTargetAsmStreamer::emitDirectiveSetMips64R3() {
+  OS << "\t.set\tmips64r3\n";
+  MipsTargetStreamer::emitDirectiveSetMips64R3();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips64R5() {
+  OS << "\t.set\tmips64r5\n";
+  MipsTargetStreamer::emitDirectiveSetMips64R5();
+}
+
 void MipsTargetAsmStreamer::emitDirectiveSetMips64R6() {
   OS << "\t.set\tmips64r6\n";
   MipsTargetStreamer::emitDirectiveSetMips64R6();
@@ -335,19 +367,32 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
                                              const MCSubtargetInfo &STI)
     : MipsTargetStreamer(S), MicroMipsEnabled(false), STI(STI) {
   MCAssembler &MCA = getStreamer().getAssembler();
-  uint64_t Features = STI.getFeatureBits();
   Triple T(STI.getTargetTriple());
   Pic = (MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_)
             ? true
             : false;
 
-  // Update e_header flags
-  unsigned EFlags = 0;
+  uint64_t Features = STI.getFeatureBits();
+
+  // Set the header flags that we can in the constructor.
+  // FIXME: This is a fairly terrible hack. We set the rest
+  // of these in the destructor. The problem here is two-fold:
+  //
+  // a: Some of the eflags can be set/reset by directives.
+  // b: There aren't any usage paths that initialize the ABI
+  //    pointer until after we initialize either an assembler
+  //    or the target machine.
+  // We can fix this by making the target streamer construct
+  // the ABI, but this is fraught with wide ranging dependency
+  // issues as well.
+  unsigned EFlags = MCA.getELFHeaderEFlags();
 
   // Architecture
   if (Features & Mips::FeatureMips64r6)
     EFlags |= ELF::EF_MIPS_ARCH_64R6;
-  else if (Features & Mips::FeatureMips64r2)
+  else if (Features & Mips::FeatureMips64r2 ||
+           Features & Mips::FeatureMips64r3 ||
+           Features & Mips::FeatureMips64r5)
     EFlags |= ELF::EF_MIPS_ARCH_64R2;
   else if (Features & Mips::FeatureMips64)
     EFlags |= ELF::EF_MIPS_ARCH_64;
@@ -359,7 +404,9 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
     EFlags |= ELF::EF_MIPS_ARCH_3;
   else if (Features & Mips::FeatureMips32r6)
     EFlags |= ELF::EF_MIPS_ARCH_32R6;
-  else if (Features & Mips::FeatureMips32r2)
+  else if (Features & Mips::FeatureMips32r2 ||
+           Features & Mips::FeatureMips32r3 ||
+           Features & Mips::FeatureMips32r5)
     EFlags |= ELF::EF_MIPS_ARCH_32R2;
   else if (Features & Mips::FeatureMips32)
     EFlags |= ELF::EF_MIPS_ARCH_32;
@@ -368,19 +415,6 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
   else
     EFlags |= ELF::EF_MIPS_ARCH_1;
 
-  // ABI
-  // N64 does not require any ABI bits.
-  if (Features & Mips::FeatureO32)
-    EFlags |= ELF::EF_MIPS_ABI_O32;
-  else if (Features & Mips::FeatureN32)
-    EFlags |= ELF::EF_MIPS_ABI2;
-
-  if (Features & Mips::FeatureGP64Bit) {
-    if (Features & Mips::FeatureO32)
-      EFlags |= ELF::EF_MIPS_32BITMODE; /* Compatibility Mode */
-  } else if (Features & Mips::FeatureMips64r2 || Features & Mips::FeatureMips64)
-    EFlags |= ELF::EF_MIPS_32BITMODE;
-
   // Other options.
   if (Features & Mips::FeatureNaN2008)
     EFlags |= ELF::EF_MIPS_NAN2008;
@@ -388,8 +422,6 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
   // -mabicalls and -mplt are not implemented but we should act as if they were
   // given.
   EFlags |= ELF::EF_MIPS_CPIC;
-  if (Features & Mips::FeatureN64)
-    EFlags |= ELF::EF_MIPS_PIC;
 
   MCA.setELFHeaderEFlags(EFlags);
 }
@@ -424,6 +456,32 @@ void MipsTargetELFStreamer::finish() {
   DataSectionData.setAlignment(std::max(16u, DataSectionData.getAlignment()));
   BSSSectionData.setAlignment(std::max(16u, BSSSectionData.getAlignment()));
 
+  uint64_t Features = STI.getFeatureBits();
+
+  // Update e_header flags. See the FIXME and comment above in
+  // the constructor for a full rundown on this.
+  unsigned EFlags = MCA.getELFHeaderEFlags();
+
+  // ABI
+  // N64 does not require any ABI bits.
+  if (getABI().IsO32())
+    EFlags |= ELF::EF_MIPS_ABI_O32;
+  else if (getABI().IsN32())
+    EFlags |= ELF::EF_MIPS_ABI2;
+
+  if (Features & Mips::FeatureGP64Bit) {
+    if (getABI().IsO32())
+      EFlags |= ELF::EF_MIPS_32BITMODE; /* Compatibility Mode */
+  } else if (Features & Mips::FeatureMips64r2 || Features & Mips::FeatureMips64)
+    EFlags |= ELF::EF_MIPS_32BITMODE;
+
+  // If we've set the cpic eflag and we're n64, go ahead and set the pic
+  // one as well.
+  if (EFlags & ELF::EF_MIPS_CPIC && getABI().IsN64())
+    EFlags |= ELF::EF_MIPS_PIC;
+
+  MCA.setELFHeaderEFlags(EFlags);
+
   // Emit all the option records.
   // At the moment we are only emitting .Mips.options (ODK_REGINFO) and
   // .reginfo.
@@ -493,9 +551,8 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
   MCContext &Context = MCA.getContext();
   MCStreamer &OS = getStreamer();
 
-  const MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS,
-                                                  ELF::SHF_ALLOC | ELF::SHT_REL,
-                                                  SectionKind::getMetadata());
+  const MCSectionELF *Sec = Context.getELFSection(
+      ".pdr", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHT_REL);
 
   const MCSymbolRefExpr *ExprRef =
       MCSymbolRefExpr::Create(Name, MCSymbolRefExpr::VK_None, Context);
@@ -604,7 +661,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   // addui $gp, $gp, %lo(_gp_disp)
   // addu  $gp, $gp, $reg
   // when support for position independent code is enabled.
-  if (!Pic || (isN32() || isN64()))
+  if (!Pic || (getABI().IsN32() || getABI().IsN64()))
     return;
 
   // There's a GNU extension controlled by -mno-shared that allows
@@ -653,7 +710,7 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
                                                  const MCSymbol &Sym,
                                                  bool IsReg) {
   // Only N32 and N64 emit anything for .cpsetup iff PIC is set.
-  if (!Pic || !(isN32() || isN64()))
+  if (!Pic || !(getABI().IsN32() || getABI().IsN64()))
     return;
 
   MCAssembler &MCA = getStreamer().getAssembler();
@@ -677,9 +734,10 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
   Inst.clear();
 
   const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
-      Sym.getName(), MCSymbolRefExpr::VK_Mips_GPOFF_HI, MCA.getContext());
+      &Sym, MCSymbolRefExpr::VK_Mips_GPOFF_HI, MCA.getContext());
   const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::Create(
-      Sym.getName(), MCSymbolRefExpr::VK_Mips_GPOFF_LO, MCA.getContext());
+      &Sym, MCSymbolRefExpr::VK_Mips_GPOFF_LO, MCA.getContext());
+
   // lui $gp, %hi(%neg(%gp_rel(funcSym)))
   Inst.setOpcode(Mips::LUi);
   Inst.addOperand(MCOperand::CreateReg(Mips::GP));
@@ -709,9 +767,8 @@ void MipsTargetELFStreamer::emitMipsAbiFlags() {
   MCAssembler &MCA = getStreamer().getAssembler();
   MCContext &Context = MCA.getContext();
   MCStreamer &OS = getStreamer();
-  const MCSectionELF *Sec =
-      Context.getELFSection(".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS,
-                            ELF::SHF_ALLOC, SectionKind::getMetadata(), 24, "");
+  const MCSectionELF *Sec = Context.getELFSection(
+      ".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, ELF::SHF_ALLOC, 24, "");
   MCSectionData &ABIShndxSD = MCA.getOrCreateSectionData(*Sec);
   ABIShndxSD.setAlignment(8);
   OS.SwitchSection(Sec);
diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index 59bf949..560afa4 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -108,6 +108,40 @@ class ADDIUR2_FM_MM16 {
   let Inst{0}     = 0;
 }
 
+class LOAD_STORE_FM_MM16<bits<6> op> {
+  bits<3> rt;
+  bits<7> addr;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = op;
+  let Inst{9-7}   = rt;
+  let Inst{6-4}   = addr{6-4};
+  let Inst{3-0}   = addr{3-0};
+}
+
+class LOAD_STORE_SP_FM_MM16<bits<6> op> {
+  bits<5> rt;
+  bits<5> offset;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = op;
+  let Inst{9-5}   = rt;
+  let Inst{4-0}   = offset;
+}
+
+class LOAD_GP_FM_MM16<bits<6> op> {
+  bits<3> rt;
+  bits<7> offset;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = op;
+  let Inst{9-7} = rt;
+  let Inst{6-0} = offset;
+}
+
 class ADDIUS5_FM_MM16 {
   bits<5> rd;
   bits<4> imm;
@@ -195,6 +229,49 @@ class ADDIUR1SP_FM_MM16 {
   let Inst{0}     = 1;
 }
 
+class BRKSDBBP16_FM_MM<bits<6> op> {
+  bits<4> code_;
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-4}   = op;
+  let Inst{3-0}   = code_;
+}
+
+class BEQNEZ_FM_MM16<bits<6> op> {
+  bits<3> rs;
+  bits<7> offset;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = op;
+  let Inst{9-7}   = rs;
+  let Inst{6-0}   = offset;
+}
+
+class B16_FM {
+  bits<10> offset;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x33;
+  let Inst{9-0}   = offset;
+}
+
+class MOVEP_FM_MM16 {
+  bits<3> dst_regs;
+  bits<3> rt;
+  bits<3> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x21;
+  let Inst{9-7}   = dst_regs;
+  let Inst{6-4}   = rt;
+  let Inst{3-1}   = rs;
+  let Inst{0}     = 0;
+}
+
 //===----------------------------------------------------------------------===//
 // MicroMIPS 32-bit Instruction Formats
 //===----------------------------------------------------------------------===//
@@ -817,3 +894,52 @@ class LWM_FM_MM<bits<4> funct> : MMArch {
   let Inst{15-12} = funct;
   let Inst{11-0}  = addr{11-0};
 }
+
+class LWM_FM_MM16<bits<4> funct> : MMArch {
+  bits<2> rt;
+  bits<4> addr;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-6}   = funct;
+  let Inst{5-4}   = rt;
+  let Inst{3-0}   = addr;
+}
+
+class CACHE_PREF_FM_MM<bits<6> op, bits<4> funct> : MMArch {
+  bits<21> addr;
+  bits<5> hint;
+  bits<5> base = addr{20-16};
+  bits<12> offset = addr{11-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = hint;
+  let Inst{20-16} = base;
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = offset;
+}
+
+class BARRIER_FM_MM<bits<5> op> : MMArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-21} = 0x0;
+  let Inst{20-16} = 0x0;
+  let Inst{15-11} = op;
+  let Inst{10-6}  = 0x0;
+  let Inst{5-0}   = 0x0;
+}
+
+class ADDIUPC_FM_MM {
+  bits<3> rs;
+  bits<23> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1e;
+  let Inst{25-23} = rs;
+  let Inst{22-0} = imm;
+}
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index e854620..e20df2f 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1,7 +1,13 @@
 def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddrMM", [frameindex]>;
+def addrimm4lsl2 : ComplexPattern<iPTR, 2, "selectIntAddrLSL2MM", [frameindex]>;
 
-def simm4 : Operand<i32>;
+def simm4 : Operand<i32> {
+  let DecoderMethod = "DecodeSimm4";
+}
 def simm7 : Operand<i32>;
+def li_simm7 : Operand<i32> {
+  let DecoderMethod = "DecodeLiSimm7";
+}
 
 def simm12 : Operand<i32> {
   let DecoderMethod = "DecodeSimm12";
@@ -9,14 +15,17 @@ def simm12 : Operand<i32> {
 
 def uimm5_lsl2 : Operand<OtherVT> {
   let EncoderMethod = "getUImm5Lsl2Encoding";
+  let DecoderMethod = "DecodeUImm5lsl2";
 }
 
 def uimm6_lsl2 : Operand<i32> {
   let EncoderMethod = "getUImm6Lsl2Encoding";
+  let DecoderMethod = "DecodeUImm6Lsl2";
 }
 
 def simm9_addiusp : Operand<i32> {
   let EncoderMethod = "getSImm9AddiuspValue";
+  let DecoderMethod = "DecodeSimm9SP";
 }
 
 def uimm3_shift : Operand<i32> {
@@ -25,10 +34,12 @@ def uimm3_shift : Operand<i32> {
 
 def simm3_lsa2 : Operand<i32> {
   let EncoderMethod = "getSImm3Lsa2Value";
+  let DecoderMethod = "DecodeAddiur2Simm7";
 }
 
 def uimm4_andi : Operand<i32> {
   let EncoderMethod = "getUImm4AndValue";
+  let DecoderMethod = "DecodeANDI16Imm";
 }
 
 def immSExtAddiur2 : ImmLeaf<i32, [{return Imm == 1 || Imm == -1 ||
@@ -46,6 +57,54 @@ def immZExt2Shift : ImmLeaf<i32, [{return Imm >= 1 && Imm <= 8;}]>;
 
 def immLi16 : ImmLeaf<i32, [{return Imm >= -1 && Imm <= 126;}]>;
 
+def MicroMipsMemGPRMM16AsmOperand : AsmOperandClass {
+  let Name = "MicroMipsMem";
+  let RenderMethod = "addMicroMipsMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithGRPMM16Base";
+}
+
+class mem_mm_4_generic : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPRMM16, simm4);
+  let OperandType = "OPERAND_MEMORY";
+  let ParserMatchClass = MicroMipsMemGPRMM16AsmOperand;
+}
+
+def mem_mm_4 : mem_mm_4_generic {
+  let EncoderMethod = "getMemEncodingMMImm4";
+}
+
+def mem_mm_4_lsl1 : mem_mm_4_generic {
+  let EncoderMethod = "getMemEncodingMMImm4Lsl1";
+}
+
+def mem_mm_4_lsl2 : mem_mm_4_generic {
+  let EncoderMethod = "getMemEncodingMMImm4Lsl2";
+}
+
+def MicroMipsMemSPAsmOperand : AsmOperandClass {
+  let Name = "MicroMipsMemSP";
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithUimmWordAlignedOffsetSP<7>";
+}
+
+def mem_mm_sp_imm5_lsl2 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPR32:$base, simm5:$offset);
+  let OperandType = "OPERAND_MEMORY";
+  let ParserMatchClass = MicroMipsMemSPAsmOperand;
+  let EncoderMethod = "getMemEncodingMMSPImm5Lsl2";
+}
+
+def mem_mm_gp_imm7_lsl2 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPRMM16:$base, simm7:$offset);
+  let OperandType = "OPERAND_MEMORY";
+  let EncoderMethod = "getMemEncodingMMGPImm7Lsl2";
+}
+
 def mem_mm_12 : Operand<i32> {
   let PrintMethod = "printMemOperand";
   let MIOperandInfo = (ops GPR32, simm12);
@@ -54,6 +113,22 @@ def mem_mm_12 : Operand<i32> {
   let OperandType = "OPERAND_MEMORY";
 }
 
+def MipsMemUimm4AsmOperand : AsmOperandClass {
+  let Name = "MemOffsetUimm4";
+  let SuperClasses = [MipsMemAsmOperand];
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithUimmOffsetSP<6>";
+}
+
+def mem_mm_4sp : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPR32, uimm8);
+  let EncoderMethod = "getMemEncodingMMImm4sp";
+  let ParserMatchClass = MipsMemUimm4AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
 def jmptarget_mm : Operand<OtherVT> {
   let EncoderMethod = "getJumpTargetOpValueMM";
 }
@@ -62,10 +137,30 @@ def calltarget_mm : Operand<iPTR> {
   let EncoderMethod = "getJumpTargetOpValueMM";
 }
 
+def brtarget7_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTarget7OpValueMM";
+  let OperandType   = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget7MM";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget10_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValueMMPC10";
+  let OperandType   = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget10MM";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
 def brtarget_mm : Operand<OtherVT> {
   let EncoderMethod = "getBranchTargetOpValueMM";
   let OperandType   = "OPERAND_PCREL";
   let DecoderMethod = "DecodeBranchTargetMM";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def simm23_lsl2 : Operand<i32> {
+  let EncoderMethod = "getSimm23Lsl2Encoding";
+  let DecoderMethod = "DecodeSimm23Lsl2";
 }
 
 class CompactBranchMM<string opstr, DAGOperand opnd, PatFrag cond_op,
@@ -97,6 +192,58 @@ class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
   let DecoderMethod = "DecodeMemMMImm12";
 }
 
+/// A register pair used by movep instruction.
+def MovePRegPairAsmOperand : AsmOperandClass {
+  let Name = "MovePRegPair";
+  let ParserMethod = "parseMovePRegPair";
+  let PredicateMethod = "isMovePRegPair";
+}
+
+def movep_regpair : Operand<i32> {
+  let EncoderMethod = "getMovePRegPairOpValue";
+  let ParserMatchClass = MovePRegPairAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeMovePRegPair";
+  let MIOperandInfo = (ops GPR32Opnd, GPR32Opnd);
+}
+
+class MovePMM16<string opstr, RegisterOperand RO> :
+MicroMipsInst16<(outs movep_regpair:$dst_regs), (ins RO:$rs, RO:$rt),
+                 !strconcat(opstr, "\t$dst_regs, $rs, $rt"), [],
+                 NoItinerary, FrmR> {
+  let isReMaterializable = 1;
+}
+
+/// A register pair used by load/store pair instructions.
+def RegPairAsmOperand : AsmOperandClass {
+  let Name = "RegPair";
+  let ParserMethod = "parseRegisterPair";
+}
+
+def regpair : Operand<i32> {
+  let EncoderMethod = "getRegisterPairOpValue";
+  let ParserMatchClass = RegPairAsmOperand;
+  let PrintMethod = "printRegisterPair";
+  let DecoderMethod = "DecodeRegPairOperand";
+  let MIOperandInfo = (ops GPR32Opnd, GPR32Opnd);
+}
+
+class StorePairMM<string opstr, InstrItinClass Itin = NoItinerary,
+                  ComplexPattern Addr = addr> :
+  InstSE<(outs), (ins regpair:$rt, mem_mm_12:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let mayStore = 1;
+}
+
+class LoadPairMM<string opstr, InstrItinClass Itin = NoItinerary,
+                 ComplexPattern Addr = addr> :
+  InstSE<(outs regpair:$rt), (ins mem_mm_12:$addr),
+          !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let mayLoad = 1;
+}
+
 class LLBaseMM<string opstr, RegisterOperand RO> :
   InstSE<(outs RO:$rt), (ins mem_mm_12:$addr),
          !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
@@ -156,6 +303,50 @@ class ShiftIMM16<string opstr, Operand ImmOpnd, RegisterOperand RO,
   MicroMipsInst16<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt),
                   !strconcat(opstr, "\t$rd, $rt, $shamt"), [], Itin, FrmR>;
 
+class LoadMM16<string opstr, DAGOperand RO, SDPatternOperator OpNode,
+               InstrItinClass Itin, Operand MemOpnd> :
+  MicroMipsInst16<(outs RO:$rt), (ins MemOpnd:$addr),
+                  !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm4";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+}
+
+class StoreMM16<string opstr, DAGOperand RTOpnd, DAGOperand RO,
+                SDPatternOperator OpNode, InstrItinClass Itin,
+                Operand MemOpnd> :
+  MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr),
+                  !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm4";
+  let mayStore = 1;
+}
+
+class LoadSPMM16<string opstr, DAGOperand RO, InstrItinClass Itin,
+                 Operand MemOpnd> :
+  MicroMipsInst16<(outs RO:$rt), (ins MemOpnd:$offset),
+                  !strconcat(opstr, "\t$rt, $offset"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+}
+
+class StoreSPMM16<string opstr, DAGOperand RO, InstrItinClass Itin,
+                  Operand MemOpnd> :
+  MicroMipsInst16<(outs), (ins RO:$rt, MemOpnd:$offset),
+                  !strconcat(opstr, "\t$rt, $offset"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
+  let mayStore = 1;
+}
+
+class LoadGPMM16<string opstr, DAGOperand RO, InstrItinClass Itin,
+                 Operand MemOpnd> :
+  MicroMipsInst16<(outs RO:$rt), (ins MemOpnd:$offset),
+                  !strconcat(opstr, "\t$rt, $offset"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMGPImm7Lsl2";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+}
+
 class AddImmUR2<string opstr, RegisterOperand RO> :
   MicroMipsInst16<(outs RO:$rd), (ins RO:$rs, simm3_lsa2:$imm),
                   !strconcat(opstr, "\t$rd, $rs, $imm"),
@@ -192,8 +383,7 @@ class MoveMM16<string opstr, RegisterOperand RO, bit isComm = 0,
   let isReMaterializable = 1;
 }
 
-class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO,
-                  SDPatternOperator imm_type = null_frag> :
+class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO> :
   MicroMipsInst16<(outs RO:$rd), (ins Od:$imm),
                   !strconcat(opstr, "\t$rd, $imm"), [], NoItinerary, FrmI> {
   let isReMaterializable = 1;
@@ -223,7 +413,6 @@ class JumpRAddiuStackMM16 :
                   [], IIBranch, FrmR> {
   let isTerminator = 1;
   let isBarrier = 1;
-  let hasDelaySlot = 1;
   let isBranch = 1;
   let isIndirectBranch = 1;
 }
@@ -247,6 +436,21 @@ class JumpRegCMM16<string opstr, RegisterOperand RO> :
   let isIndirectBranch = 1;
 }
 
+// Break16 and Sdbbp16
+class BrkSdbbp16MM<string opstr> :
+  MicroMipsInst16<(outs), (ins uimm4:$code_),
+                  !strconcat(opstr, "\t$code_"),
+                  [], NoItinerary, FrmOther>;
+
+class CBranchZeroMM<string opstr, DAGOperand opnd, RegisterOperand RO> :
+  MicroMipsInst16<(outs), (ins RO:$rs, opnd:$offset),
+                  !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let Defs = [AT];
+}
+
 // MicroMIPS Jump and Link (Call) - Short Delay Slot
 let isCall = 1, hasDelaySlot = 1, Defs = [RA] in {
   class JumpLinkMM<string opstr, DAGOperand opnd> :
@@ -271,6 +475,10 @@ class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO,
   InstSE<(outs RO:$rd), (ins PtrRC:$base, PtrRC:$index),
          !strconcat(opstr, "\t$rd, ${index}(${base})"), [], Itin, FrmFI>;
 
+class AddImmUPC<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rs), (ins simm23_lsl2:$imm),
+         !strconcat(opstr, "\t$rs, $imm"), [], NoItinerary, FrmR>;
+
 /// A list of registers used by load/store multiple instructions.
 def RegListAsmOperand : AsmOperandClass {
   let Name = "RegList";
@@ -284,6 +492,20 @@ def reglist : Operand<i32> {
   let DecoderMethod = "DecodeRegListOperand";
 }
 
+def RegList16AsmOperand : AsmOperandClass {
+  let Name = "RegList16";
+  let ParserMethod = "parseRegisterList";
+  let PredicateMethod = "isRegList16";
+  let RenderMethod = "addRegListOperands";
+}
+
+def reglist16 : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue16";
+  let DecoderMethod = "DecodeRegListOperand16";
+  let PrintMethod = "printRegisterList";
+  let ParserMatchClass = RegList16AsmOperand;
+}
+
 class StoreMultMM<string opstr,
             InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
   InstSE<(outs), (ins reglist:$rt, mem_mm_12:$addr),
@@ -300,6 +522,36 @@ class LoadMultMM<string opstr,
   let mayLoad = 1;
 }
 
+class StoreMultMM16<string opstr,
+                    InstrItinClass Itin = NoItinerary,
+                    ComplexPattern Addr = addr> :
+  MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr),
+                  !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+  let mayStore = 1;
+}
+
+class LoadMultMM16<string opstr,
+                   InstrItinClass Itin = NoItinerary,
+                   ComplexPattern Addr = addr> :
+  MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr),
+                  !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+  let mayLoad = 1;
+}
+
+class UncondBranchMM16<string opstr> :
+  MicroMipsInst16<(outs), (ins brtarget10_mm:$offset),
+                  !strconcat(opstr, "\t$offset"),
+                  [], IIBranch, FrmI> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let Predicates = [RelocPIC, InMicroMips];
+  let Defs = [AT];
+}
+
 def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
                 ARITH_FM_MM16<0>;
 def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
@@ -316,6 +568,25 @@ def SLL16_MM : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
                SHIFT_FM_MM16<0>;
 def SRL16_MM : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
                SHIFT_FM_MM16<1>;
+def LBU16_MM : LoadMM16<"lbu16", GPRMM16Opnd, zextloadi8, II_LBU,
+                        mem_mm_4>, LOAD_STORE_FM_MM16<0x02>;
+def LHU16_MM : LoadMM16<"lhu16", GPRMM16Opnd, zextloadi16, II_LHU,
+                        mem_mm_4_lsl1>, LOAD_STORE_FM_MM16<0x0a>;
+def LW16_MM : LoadMM16<"lw16", GPRMM16Opnd, load, II_LW, mem_mm_4_lsl2>,
+                      LOAD_STORE_FM_MM16<0x1a>;
+def SB16_MM : StoreMM16<"sb16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei8,
+                        II_SB, mem_mm_4>, LOAD_STORE_FM_MM16<0x22>;
+def SH16_MM : StoreMM16<"sh16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei16,
+                        II_SH, mem_mm_4_lsl1>,
+                        LOAD_STORE_FM_MM16<0x2a>;
+def SW16_MM : StoreMM16<"sw16", GPRMM16OpndZero, GPRMM16Opnd, store, II_SW,
+                        mem_mm_4_lsl2>, LOAD_STORE_FM_MM16<0x3a>;
+def LWGP_MM : LoadGPMM16<"lw", GPRMM16Opnd, II_LW, mem_mm_gp_imm7_lsl2>,
+                         LOAD_GP_FM_MM16<0x19>;
+def LWSP_MM : LoadSPMM16<"lw", GPR32Opnd, II_LW, mem_mm_sp_imm5_lsl2>,
+              LOAD_STORE_SP_FM_MM16<0x12>;
+def SWSP_MM : StoreSPMM16<"sw", GPR32Opnd, II_SW, mem_mm_sp_imm5_lsl2>,
+              LOAD_STORE_SP_FM_MM16<0x32>;
 def ADDIUR1SP_MM : AddImmUR1SP<"addiur1sp", GPRMM16Opnd>, ADDIUR1SP_FM_MM16;
 def ADDIUR2_MM : AddImmUR2<"addiur2", GPRMM16Opnd>, ADDIUR2_FM_MM16;
 def ADDIUS5_MM : AddImmUS5<"addius5", GPR32Opnd>, ADDIUS5_FM_MM16;
@@ -323,13 +594,21 @@ def ADDIUSP_MM : AddImmUSP<"addiusp">, ADDIUSP_FM_MM16;
 def MFHI16_MM : MoveFromHILOMM<"mfhi", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x10>;
 def MFLO16_MM : MoveFromHILOMM<"mflo", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x12>;
 def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
-def LI16_MM : LoadImmMM16<"li16", simm7, GPRMM16Opnd, immLi16>,
-              LI_FM_MM16, IsAsCheapAsAMove;
+def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16;
+def LI16_MM : LoadImmMM16<"li16", li_simm7, GPRMM16Opnd>, LI_FM_MM16,
+              IsAsCheapAsAMove;
 def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>;
 def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>;
 def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>;
 def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>;
 def JR16_MM : JumpRegMM16<"jr16", GPR32Opnd>, JALR_FM_MM16<0x0c>;
+def BEQZ16_MM : CBranchZeroMM<"beqz16", brtarget7_mm, GPRMM16Opnd>,
+                BEQNEZ_FM_MM16<0x23>;
+def BNEZ16_MM : CBranchZeroMM<"bnez16", brtarget7_mm, GPRMM16Opnd>,
+                BEQNEZ_FM_MM16<0x2b>;
+def B16_MM : UncondBranchMM16<"b16">, B16_FM;
+def BREAK16_MM : BrkSdbbp16MM<"break16">, BRKSDBBP16_FM_MM<0x28>;
+def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16">, BRKSDBBP16_FM_MM<0x2C>;
 
 class WaitMM<string opstr> :
   InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [],
@@ -387,6 +666,9 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
   def UDIV_MM  : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
                  MULT_FM_MM<0x2ec>;
 
+  /// Arithmetic Instructions with PC and Immediate
+  def ADDIUPC_MM : AddImmUPC<"addiupc", GPRMM16Opnd>, ADDIUPC_FM_MM;
+
   /// Shift Instructions
   def SLL_MM   : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>,
                  SRA_FM_MM<0, 0>;
@@ -434,6 +716,25 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
   /// Load and Store Instructions - multiple
   def SWM32_MM  : StoreMultMM<"swm32">, LWM_FM_MM<0xd>;
   def LWM32_MM  : LoadMultMM<"lwm32">, LWM_FM_MM<0x5>;
+  def SWM16_MM : StoreMultMM16<"swm16">, LWM_FM_MM16<0x5>;
+  def LWM16_MM : LoadMultMM16<"lwm16">, LWM_FM_MM16<0x4>;
+
+  /// Load and Store Pair Instructions
+  def SWP_MM  : StorePairMM<"swp">, LWM_FM_MM<0x9>;
+  def LWP_MM  : LoadPairMM<"lwp">, LWM_FM_MM<0x1>;
+
+  /// Load and Store multiple pseudo Instructions
+  class LoadWordMultMM<string instr_asm > :
+    MipsAsmPseudoInst<(outs reglist:$rt), (ins mem_mm_12:$addr),
+                      !strconcat(instr_asm, "\t$rt, $addr")> ;
+
+  class StoreWordMultMM<string instr_asm > :
+    MipsAsmPseudoInst<(outs), (ins reglist:$rt, mem_mm_12:$addr),
+                      !strconcat(instr_asm, "\t$rt, $addr")> ;
+
+
+  def SWM_MM  : StoreWordMultMM<"swm">;
+  def LWM_MM  : LoadWordMultMM<"lwm">;
 
   /// Move Conditional
   def MOVZ_I_MM : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd,
@@ -487,6 +788,7 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
     def J_MM        : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
                       J_FM_MM<0x35>;
     def JAL_MM      : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>;
+    def JALX_MM     : MMRel, JumpLink<"jalx", calltarget>, J_FM_MM<0x3c>;
   }
   def JR_MM   : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>;
   def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
@@ -550,6 +852,16 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
   def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>;
   def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>;
 
+  let DecoderMethod = "DecodeCacheOpMM" in {
+  def CACHE_MM : MMRel, CacheOp<"cache", mem_mm_12>,
+                 CACHE_PREF_FM_MM<0x08, 0x6>;
+  def PREF_MM  : MMRel, CacheOp<"pref", mem_mm_12>,
+                 CACHE_PREF_FM_MM<0x18, 0x2>;
+  }
+  def SSNOP_MM : MMRel, Barrier<"ssnop">, BARRIER_FM_MM<0x1>;
+  def EHB_MM   : MMRel, Barrier<"ehb">, BARRIER_FM_MM<0x3>;
+  def PAUSE_MM : MMRel, Barrier<"pause">, BARRIER_FM_MM<0x5>;
+
   def TLBP_MM : MMRel, TLB<"tlbp">, COP0_TLB_FM_MM<0x0d>;
   def TLBR_MM : MMRel, TLB<"tlbr">, COP0_TLB_FM_MM<0x4d>;
   def TLBWI_MM : MMRel, TLB<"tlbwi">, COP0_TLB_FM_MM<0x8d>;
@@ -565,6 +877,13 @@ let Predicates = [InMicroMips] in {
 // MicroMips arbitrary patterns that map to one or more instructions
 //===----------------------------------------------------------------------===//
 
+def : MipsPat<(i32 immLi16:$imm),
+              (LI16_MM immLi16:$imm)>;
+def : MipsPat<(i32 immSExt16:$imm),
+              (ADDiu_MM ZERO, immSExt16:$imm)>;
+def : MipsPat<(i32 immZExt16:$imm),
+              (ORi_MM ZERO, immZExt16:$imm)>;
+
 def : MipsPat<(add GPRMM16:$src, immSExtAddiur2:$imm),
               (ADDIUR2_MM GPRMM16:$src, immSExtAddiur2:$imm)>;
 def : MipsPat<(add GPR32:$src, immSExtAddius5:$imm),
@@ -587,9 +906,27 @@ def : MipsPat<(srl GPRMM16:$src, immZExt2Shift:$imm),
 def : MipsPat<(srl GPR32:$src, immZExt5:$imm),
               (SRL_MM GPR32:$src, immZExt5:$imm)>;
 
+def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
+              (SW16_MM GPRMM16:$src, addrimm4lsl2:$addr)>;
+def : MipsPat<(store GPR32:$src, addr:$addr),
+              (SW_MM GPR32:$src, addr:$addr)>;
+
+def : MipsPat<(load addrimm4lsl2:$addr),
+              (LW16_MM addrimm4lsl2:$addr)>;
+def : MipsPat<(load addr:$addr),
+              (LW_MM addr:$addr)>;
+
 //===----------------------------------------------------------------------===//
 // MicroMips instruction aliases
 //===----------------------------------------------------------------------===//
 
+class UncondBranchMMPseudo<string opstr> :
+  MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset),
+                    !strconcat(opstr, "\t$offset")>;
+
+  def B_MM_Pseudo : UncondBranchMMPseudo<"b">;
+
   def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>;
+  def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>;
+  def : MipsInstAlias<"nop", (MOVE16_MM ZERO, ZERO), 1>;
 }
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 87f1b04..cb09c1a 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -22,7 +22,6 @@ namespace llvm {
   class MipsTargetMachine;
   class FunctionPass;
 
-  FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
   FunctionPass *createMipsOptimizePICCallPass(MipsTargetMachine &TM);
   FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
   FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 3e1d047..01c548e 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -69,14 +69,6 @@ def FeatureNaN2008     : SubtargetFeature<"nan2008", "IsNaN2008bit", "true",
                                 "IEEE 754-2008 NaN encoding.">;
 def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
                                 "true", "Only supports single precision float">;
-def FeatureO32         : SubtargetFeature<"o32", "ABI", "MipsABIInfo::O32()",
-                                "Enable o32 ABI">;
-def FeatureN32         : SubtargetFeature<"n32", "ABI", "MipsABIInfo::N32()",
-                                "Enable n32 ABI">;
-def FeatureN64         : SubtargetFeature<"n64", "ABI", "MipsABIInfo::N64()",
-                                "Enable n64 ABI">;
-def FeatureEABI        : SubtargetFeature<"eabi", "ABI", "MipsABIInfo::EABI()",
-                                "Enable eabi ABI">;
 def FeatureNoOddSPReg  : SubtargetFeature<"nooddspreg", "UseOddSPReg", "false",
                               "Disable odd numbered single-precision "
                               "registers">;
@@ -122,10 +114,16 @@ def FeatureMips32r2    : SubtargetFeature<"mips32r2", "MipsArchVersion",
                                 "Mips32r2", "Mips32r2 ISA Support",
                                 [FeatureMips3_32r2, FeatureMips4_32r2,
                                  FeatureMips5_32r2, FeatureMips32]>;
+def FeatureMips32r3    : SubtargetFeature<"mips32r3", "MipsArchVersion",
+                                "Mips32r3", "Mips32r3 ISA Support",
+                                [FeatureMips32r2]>;
+def FeatureMips32r5    : SubtargetFeature<"mips32r5", "MipsArchVersion",
+                                "Mips32r5", "Mips32r5 ISA Support",
+                                [FeatureMips32r3]>;
 def FeatureMips32r6    : SubtargetFeature<"mips32r6", "MipsArchVersion",
                                 "Mips32r6",
                                 "Mips32r6 ISA Support [experimental]",
-                                [FeatureMips32r2, FeatureFP64Bit,
+                                [FeatureMips32r5, FeatureFP64Bit,
                                  FeatureNaN2008]>;
 def FeatureMips64      : SubtargetFeature<"mips64", "MipsArchVersion",
                                 "Mips64", "Mips64 ISA Support",
@@ -133,10 +131,16 @@ def FeatureMips64      : SubtargetFeature<"mips64", "MipsArchVersion",
 def FeatureMips64r2    : SubtargetFeature<"mips64r2", "MipsArchVersion",
                                 "Mips64r2", "Mips64r2 ISA Support",
                                 [FeatureMips64, FeatureMips32r2]>;
+def FeatureMips64r3    : SubtargetFeature<"mips64r3", "MipsArchVersion",
+                                "Mips64r3", "Mips64r3 ISA Support",
+                                [FeatureMips64r2, FeatureMips32r3]>;
+def FeatureMips64r5    : SubtargetFeature<"mips64r5", "MipsArchVersion",
+                                "Mips64r5", "Mips64r5 ISA Support",
+                                [FeatureMips64r3, FeatureMips32r5]>;
 def FeatureMips64r6    : SubtargetFeature<"mips64r6", "MipsArchVersion",
                                 "Mips64r6",
                                 "Mips64r6 ISA Support [experimental]",
-                                [FeatureMips32r6, FeatureMips64r2,
+                                [FeatureMips32r6, FeatureMips64r5,
                                  FeatureNaN2008]>;
 
 def FeatureMips16  : SubtargetFeature<"mips16", "InMips16Mode", "true",
@@ -162,20 +166,24 @@ def FeatureCnMips     : SubtargetFeature<"cnmips", "HasCnMips",
 class Proc<string Name, list<SubtargetFeature> Features>
  : Processor<Name, MipsGenericItineraries, Features>;
 
-def : Proc<"mips1", [FeatureMips1, FeatureO32]>;
-def : Proc<"mips2", [FeatureMips2, FeatureO32]>;
-def : Proc<"mips32", [FeatureMips32, FeatureO32]>;
-def : Proc<"mips32r2", [FeatureMips32r2, FeatureO32]>;
-def : Proc<"mips32r6", [FeatureMips32r6, FeatureO32]>;
-
-def : Proc<"mips3", [FeatureMips3, FeatureN64]>;
-def : Proc<"mips4", [FeatureMips4, FeatureN64]>;
-def : Proc<"mips5", [FeatureMips5, FeatureN64]>;
-def : Proc<"mips64", [FeatureMips64, FeatureN64]>;
-def : Proc<"mips64r2", [FeatureMips64r2, FeatureN64]>;
-def : Proc<"mips64r6", [FeatureMips64r6, FeatureN64]>;
-def : Proc<"mips16", [FeatureMips16, FeatureO32]>;
-def : Proc<"octeon", [FeatureMips64r2, FeatureN64, FeatureCnMips]>;
+def : Proc<"mips1", [FeatureMips1]>;
+def : Proc<"mips2", [FeatureMips2]>;
+def : Proc<"mips32", [FeatureMips32]>;
+def : Proc<"mips32r2", [FeatureMips32r2]>;
+def : Proc<"mips32r3", [FeatureMips32r3]>;
+def : Proc<"mips32r5", [FeatureMips32r5]>;
+def : Proc<"mips32r6", [FeatureMips32r6]>;
+
+def : Proc<"mips3", [FeatureMips3]>;
+def : Proc<"mips4", [FeatureMips4]>;
+def : Proc<"mips5", [FeatureMips5]>;
+def : Proc<"mips64", [FeatureMips64]>;
+def : Proc<"mips64r2", [FeatureMips64r2]>;
+def : Proc<"mips64r3", [FeatureMips64r3]>;
+def : Proc<"mips64r5", [FeatureMips64r5]>;
+def : Proc<"mips64r6", [FeatureMips64r6]>;
+def : Proc<"mips16", [FeatureMips16]>;
+def : Proc<"octeon", [FeatureMips64r2, FeatureCnMips]>;
 
 def MipsAsmParser : AsmParser {
   let ShouldEmitMatchRegisterName = 0;
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 6070276..abecfa0 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -36,7 +36,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const Mips16InstrInfo &TII =
-      *static_cast<const Mips16InstrInfo *>(MF.getSubtarget().getInstrInfo());
+      *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   uint64_t StackSize = MFI->getStackSize();
@@ -84,7 +84,7 @@ void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const Mips16InstrInfo &TII =
-      *static_cast<const Mips16InstrInfo *>(MF.getSubtarget().getInstrInfo());
+      *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
   DebugLoc dl = MBBI->getDebugLoc();
   uint64_t StackSize = MFI->getStackSize();
 
@@ -154,7 +154,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
       Amount = -Amount;
 
     const Mips16InstrInfo &TII =
-        *static_cast<const Mips16InstrInfo *>(MF.getSubtarget().getInstrInfo());
+        *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
 
     TII.adjustStackPtr(Mips::SP, Amount, MBB, I);
   }
@@ -174,7 +174,7 @@ void Mips16FrameLowering::
 processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                      RegScavenger *RS) const {
   const Mips16InstrInfo &TII =
-      *static_cast<const Mips16InstrInfo *>(MF.getSubtarget().getInstrInfo());
+      *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
   const MipsRegisterInfo &RI = TII.getRegisterInfo();
   const BitVector Reserved = RI.getReservedRegs(MF);
   bool SaveS2 = Reserved[Mips::S2];
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index 9488e63..32dc90a 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -247,12 +247,12 @@ static void swapFPIntParams
 // Having called needsFPHelperFromSig
 //
 static void assureFPCallStub(Function &F, Module *M,
-                             const MipsSubtarget &Subtarget) {
+                             const MipsTargetMachine &TM) {
   // for now we only need them for static relocation
-  if (Subtarget.getRelocationModel() == Reloc::PIC_)
+  if (TM.getRelocationModel() == Reloc::PIC_)
     return;
   LLVMContext &Context = M->getContext();
-  bool LE = Subtarget.isLittle();
+  bool LE = TM.isLittleEndian();
   std::string Name = F.getName();
   std::string SectionName = ".mips16.call.fp." + Name;
   std::string StubName = "__call_stub_fp_" + Name;
@@ -362,8 +362,8 @@ static bool isIntrinsicInline(Function *F) {
 // Returns of float, double and complex need to be handled with a helper
 // function.
 //
-static bool fixupFPReturnAndCall
-  (Function &F, Module *M,  const MipsSubtarget &Subtarget) {
+static bool fixupFPReturnAndCall(Function &F, Module *M,
+                                 const MipsTargetMachine &TM) {
   bool Modified = false;
   LLVMContext &C = M->getContext();
   Type *MyVoid = Type::getVoidTy(C);
@@ -426,9 +426,9 @@ static bool fixupFPReturnAndCall
               Modified=true;
               F.addFnAttr("saveS2");
             }
-            if (Subtarget.getRelocationModel() != Reloc::PIC_ ) {
+            if (TM.getRelocationModel() != Reloc::PIC_ ) {
               if (needsFPHelperFromSig(*F_)) {
-                assureFPCallStub(*F_, M, Subtarget);
+                assureFPCallStub(*F_, M, TM);
                 Modified=true;
               }
             }
@@ -439,9 +439,9 @@ static bool fixupFPReturnAndCall
 }
 
 static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
-                  const MipsSubtarget &Subtarget ) {
-  bool PicMode = Subtarget.getRelocationModel() == Reloc::PIC_;
-  bool LE = Subtarget.isLittle();
+                           const MipsTargetMachine &TM) {
+  bool PicMode = TM.getRelocationModel() == Reloc::PIC_;
+  bool LE = TM.isLittleEndian();
   LLVMContext &Context = M->getContext();
   std::string Name = F->getName();
   std::string SectionName = ".mips16.fn." + Name;
@@ -458,7 +458,6 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
   FStub->setSection(SectionName);
   BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
   InlineAsmHelper IAH(Context, BB);
-  IAH.Out(" .set  macro");
   if (PicMode) {
     IAH.Out(".set noreorder");
     IAH.Out(".cpload  $$25");
@@ -467,7 +466,6 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
     IAH.Out("la $$25," + LocalName);
   }
   else {
-    IAH.Out(".set reorder");
     IAH.Out("la $$25," + Name);
   }
   swapFPIntParams(PV, M, IAH, LE, false);
@@ -522,11 +520,11 @@ bool Mips16HardFloat::runOnModule(Module &M) {
     }
     if (F->isDeclaration() || F->hasFnAttribute("mips16_fp_stub") ||
         F->hasFnAttribute("nomips16")) continue;
-    Modified |= fixupFPReturnAndCall(*F, &M, Subtarget);
+    Modified |= fixupFPReturnAndCall(*F, &M, TM);
     FPParamVariant V = whichFPParamVariantNeeded(*F);
     if (V != NoSig) {
       Modified = true;
-      createFPFnStub(F, &M, V, Subtarget);
+      createFPFnStub(F, &M, V, TM);
     }
   }
   return Modified;
diff --git a/lib/Target/Mips/Mips16HardFloat.h b/lib/Target/Mips/Mips16HardFloat.h
index 19b7bf2..586cc25 100644
--- a/lib/Target/Mips/Mips16HardFloat.h
+++ b/lib/Target/Mips/Mips16HardFloat.h
@@ -25,26 +25,16 @@ using namespace llvm;
 namespace llvm {
 
 class Mips16HardFloat : public ModulePass {
-
 public:
   static char ID;
 
-  Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID),
-    TM(TM_), Subtarget(TM.getSubtarget<MipsSubtarget>()) {
-  }
-
-  const char *getPassName() const override {
-    return "MIPS16 Hard Float Pass";
-  }
+  Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID), TM(TM_) {}
 
+  const char *getPassName() const override { return "MIPS16 Hard Float Pass"; }
   bool runOnModule(Module &M) override;
 
 protected:
-  /// Keep a pointer to the MipsSubtarget around so that we can make the right
-  /// decision when generating code for different targets.
-  const TargetMachine &TM;
-  const MipsSubtarget &Subtarget;
-
+  const MipsTargetMachine &TM;
 };
 
 ModulePass *createMips16HardFloat(MipsTargetMachine &TM);
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index 7732be4..3221ccb 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -37,7 +37,7 @@ using namespace llvm;
 #define DEBUG_TYPE "mips-isel"
 
 bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
-  Subtarget = &TM.getSubtarget<MipsSubtarget>();
+  Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
   if (!Subtarget->inMips16Mode())
     return false;
   return MipsDAGToDAGISel::runOnMachineFunction(MF);
@@ -72,11 +72,10 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
   MachineBasicBlock &MBB = MF.front();
   MachineBasicBlock::iterator I = MBB.begin();
   MachineRegisterInfo &RegInfo = MF.getRegInfo();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
   DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
   unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
-  const TargetRegisterClass *RC =
-    (const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+  const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass;
 
   V0 = RegInfo.createVirtualRegister(RC);
   V1 = RegInfo.createVirtualRegister(RC);
@@ -103,7 +102,7 @@ void Mips16DAGToDAGISel::initMips16SPAliasReg(MachineFunction &MF) {
 
   MachineBasicBlock &MBB = MF.front();
   MachineBasicBlock::iterator I = MBB.begin();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
   DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
   unsigned Mips16SPAliasReg = MipsFI->getMips16SPAliasReg();
 
@@ -135,7 +134,7 @@ void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
         switch (SD->getMemoryVT().getSizeInBits()) {
         case 8:
         case 16:
-          AliasReg = TM.getSubtargetImpl()->getFrameLowering()->hasFP(*MF)
+          AliasReg = Subtarget->getFrameLowering()->hasFP(*MF)
                          ? AliasFPReg
                          : getMips16SPAliasReg();
           return;
@@ -147,7 +146,7 @@ void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
         switch (SD->getMemoryVT().getSizeInBits()) {
         case 8:
         case 16:
-          AliasReg = TM.getSubtargetImpl()->getFrameLowering()->hasFP(*MF)
+          AliasReg = Subtarget->getFrameLowering()->hasFP(*MF)
                          ? AliasFPReg
                          : getMips16SPAliasReg();
           return;
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index d4852c4..ede4f37 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -149,7 +149,7 @@ Mips16TargetLowering::Mips16TargetLowering(const MipsTargetMachine &TM,
   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
   setOperationAction(ISD::BSWAP, MVT::i64, Expand);
 
-  computeRegisterProperties();
+  computeRegisterProperties(STI.getRegisterInfo());
 }
 
 const MipsTargetLowering *
@@ -497,14 +497,14 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
   SDValue JumpTarget = Callee;
 
   // T9 should contain the address of the callee function if
-  // -reloction-model=pic or it is an indirect call.
+  // -relocation-model=pic or it is an indirect call.
   if (IsPICCall || !GlobalOrExternal) {
     unsigned V0Reg = Mips::V0;
     if (NeedMips16Helper) {
       RegsToPass.push_front(std::make_pair(V0Reg, Callee));
       JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, getPointerTy());
       ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget);
-      JumpTarget = getAddrGlobal(S, JumpTarget.getValueType(), DAG,
+      JumpTarget = getAddrGlobal(S, CLI.DL, JumpTarget.getValueType(), DAG,
                                  MipsII::MO_GOT, Chain,
                                  FuncInfo->callPtrInfo(S->getSymbol()));
     } else
@@ -522,8 +522,7 @@ MachineBasicBlock *Mips16TargetLowering::
 emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   // To "insert" a SELECT_CC instruction, we actually have to insert the
   // diamond control-flow pattern.  The incoming instruction knows the
@@ -580,13 +579,12 @@ emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const {
   return BB;
 }
 
-MachineBasicBlock *Mips16TargetLowering::emitSelT16
-  (unsigned Opc1, unsigned Opc2,
-   MachineInstr *MI, MachineBasicBlock *BB) const {
+MachineBasicBlock *
+Mips16TargetLowering::emitSelT16(unsigned Opc1, unsigned Opc2, MachineInstr *MI,
+                                 MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   // To "insert" a SELECT_CC instruction, we actually have to insert the
   // diamond control-flow pattern.  The incoming instruction knows the
@@ -645,13 +643,13 @@ MachineBasicBlock *Mips16TargetLowering::emitSelT16
 
 }
 
-MachineBasicBlock *Mips16TargetLowering::emitSeliT16
-  (unsigned Opc1, unsigned Opc2,
-   MachineInstr *MI, MachineBasicBlock *BB) const {
+MachineBasicBlock *
+Mips16TargetLowering::emitSeliT16(unsigned Opc1, unsigned Opc2,
+                                  MachineInstr *MI,
+                                  MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   // To "insert" a SELECT_CC instruction, we actually have to insert the
   // diamond control-flow pattern.  The incoming instruction knows the
@@ -710,14 +708,13 @@ MachineBasicBlock *Mips16TargetLowering::emitSeliT16
 
 }
 
-MachineBasicBlock
-  *Mips16TargetLowering::emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
-                                             MachineInstr *MI,
-                                             MachineBasicBlock *BB) const {
+MachineBasicBlock *
+Mips16TargetLowering::emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
+                                          MachineInstr *MI,
+                                          MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   unsigned regX = MI->getOperand(0).getReg();
   unsigned regY = MI->getOperand(1).getReg();
   MachineBasicBlock *target = MI->getOperand(2).getMBB();
@@ -729,12 +726,11 @@ MachineBasicBlock
 }
 
 MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
-  unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
-  MachineInstr *MI,  MachineBasicBlock *BB) const {
+    unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
+    MachineInstr *MI, MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   unsigned regX = MI->getOperand(0).getReg();
   int64_t imm = MI->getOperand(1).getImm();
   MachineBasicBlock *target = MI->getOperand(2).getMBB();
@@ -763,13 +759,12 @@ static unsigned Mips16WhichOp8uOr16simm
     llvm_unreachable("immediate field not usable");
 }
 
-MachineBasicBlock *Mips16TargetLowering::emitFEXT_CCRX16_ins(
-  unsigned SltOpc,
-  MachineInstr *MI,  MachineBasicBlock *BB) const {
+MachineBasicBlock *
+Mips16TargetLowering::emitFEXT_CCRX16_ins(unsigned SltOpc, MachineInstr *MI,
+                                          MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   unsigned CC = MI->getOperand(0).getReg();
   unsigned regX = MI->getOperand(1).getReg();
   unsigned regY = MI->getOperand(2).getReg();
@@ -781,13 +776,13 @@ MachineBasicBlock *Mips16TargetLowering::emitFEXT_CCRX16_ins(
   return BB;
 }
 
-MachineBasicBlock *Mips16TargetLowering::emitFEXT_CCRXI16_ins(
-  unsigned SltiOpc, unsigned SltiXOpc,
-  MachineInstr *MI,  MachineBasicBlock *BB )const {
+MachineBasicBlock *
+Mips16TargetLowering::emitFEXT_CCRXI16_ins(unsigned SltiOpc, unsigned SltiXOpc,
+                                           MachineInstr *MI,
+                                           MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   unsigned CC = MI->getOperand(0).getReg();
   unsigned regX = MI->getOperand(1).getReg();
   int64_t Imm = MI->getOperand(2).getImm();
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index 4dd9af2..976becc 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -144,7 +144,6 @@ bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
 /// opcode, e.g. turning BEQ to BNE.
 unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const {
   switch (Opc) {
-  default:  llvm_unreachable("Illegal opcode!");
   case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16;
   case Mips::BnezRxImmX16: return Mips::BeqzRxImmX16;
   case Mips::BeqzRxImm16: return Mips::BnezRxImm16;
@@ -166,8 +165,7 @@ unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const {
   case Mips::BtnezT8SltX16: return Mips::BteqzT8SltX16;
   case Mips::BtnezT8SltiX16: return Mips::BteqzT8SltiX16;
   }
-  assert(false && "Implement this function.");
-  return 0;
+  llvm_unreachable("Illegal opcode!");
 }
 
 static void addSaveRestoreRegs(MachineInstrBuilder &MIB,
@@ -288,7 +286,7 @@ void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount,
 void Mips16InstrInfo::adjustStackPtrBigUnrestricted(
     unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator I) const {
-   assert(false && "adjust stack pointer amount exceeded");
+   llvm_unreachable("adjust stack pointer amount exceeded");
 }
 
 /// Adjust SP by Amount bytes.
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index 2364f4d..10fff03 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -502,7 +502,7 @@ class ArithLogic16Defs<bit isCom=0> {
   bits<5> shamt = 0;
   bit isCommutable = isCom;
   bit isReMaterializable = 1;
-  bit neverHasSideEffects = 1;
+  bit hasSideEffects = 0;
 }
 
 class branch16 {
@@ -879,7 +879,7 @@ def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>;
 //
 def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
   let Uses = [HI0];
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 
 //
@@ -889,7 +889,7 @@ def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
 //
 def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
   let Uses = [LO0];
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 
 //
@@ -897,13 +897,13 @@ def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
 //
 def MultRxRy16:  FMULT16_ins<"mult",  IIAlu> {
   let isCommutable = 1;
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let Defs = [HI0, LO0];
 }
 
 def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
   let isCommutable = 1;
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let Defs = [HI0, LO0];
 }
 
@@ -914,7 +914,7 @@ def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
 //
 def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
   let isCommutable = 1;
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let Defs = [HI0, LO0];
 }
 
@@ -925,7 +925,7 @@ def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
 //
 def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> {
   let isCommutable = 1;
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let Defs = [HI0, LO0];
 }
 
@@ -1910,7 +1910,7 @@ def cpinst_operand : Operand<i32> {
 // is the index into the MachineConstantPool that this is, the third is the
 // size in bytes of this constant pool entry.
 //
-let neverHasSideEffects = 1, isNotDuplicable = 1 in
+let hasSideEffects = 0, isNotDuplicable = 1 in
 def CONSTPOOL_ENTRY :
 MipsPseudo16<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
                       i32imm:$size), "foo", []>;
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index 0bb452a..c45acc4 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -65,7 +65,7 @@ bool Mips16RegisterInfo::saveScavengerRegister
    const TargetRegisterClass *RC,
    unsigned Reg) const {
   DebugLoc DL;
-  const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   TII.copyPhysReg(MBB, I, DL, Mips::T0, Reg, true);
   TII.copyPhysReg(MBB, UseMI, DL, Reg, Mips::T0, true);
   return true;
@@ -106,7 +106,7 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
   if (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)
     FrameReg = Mips::SP;
   else {
-    const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+    const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
     if (TFI->hasFP(MF)) {
       FrameReg = Mips::S0;
     }
@@ -140,8 +140,7 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
     DebugLoc DL = II->getDebugLoc();
     unsigned NewImm;
     const Mips16InstrInfo &TII =
-        *static_cast<const Mips16InstrInfo *>(
-            MBB.getParent()->getSubtarget().getInstrInfo());
+        *static_cast<const Mips16InstrInfo *>(Subtarget.getInstrInfo());
     FrameReg = TII.loadImmediate(FrameReg, Offset, MBB, II, DL, NewImm);
     Offset = SignExtend64<16>(NewImm);
     IsKill = true;
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index 6d6735b..49c6322 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -379,7 +379,6 @@ class JMP_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd,
   list<dag> Pattern = [];
   bit isTerminator = 1;
   bit hasDelaySlot = 0;
-  string DecoderMethod = "DecodeSimm16";
 }
 
 class JIALC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16,
@@ -550,6 +549,7 @@ class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd,
   dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
   string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
   list<dag> Pattern = [];
+  string DecoderMethod = "DecodeCacheOpR6";
 }
 
 class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd>;
@@ -561,6 +561,7 @@ class COP2LD_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> {
   string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
   list<dag> Pattern = [];
   bit mayLoad = 1;
+  string DecoderMethod = "DecodeFMemCop2R6";
 }
 
 class LDC2_R6_DESC : COP2LD_DESC_BASE<"ldc2", COP2Opnd>;
@@ -572,6 +573,7 @@ class COP2ST_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> {
   string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
   list<dag> Pattern = [];
   bit mayStore = 1;
+  string DecoderMethod = "DecodeFMemCop2R6";
 }
 
 class SDC2_R6_DESC : COP2ST_DESC_BASE<"sdc2", COP2Opnd>;
@@ -756,7 +758,7 @@ def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>,
       ISA_MIPS32R6;
 def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$lhs, f32:$rhs)>,
       ISA_MIPS32R6;
-def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>,
+def : MipsPat<(setle f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>,
       ISA_MIPS32R6;
 def : MipsPat<(setne f32:$lhs, f32:$rhs),
               (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
@@ -776,7 +778,7 @@ def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>,
       ISA_MIPS32R6;
 def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$lhs, f64:$rhs)>,
       ISA_MIPS32R6;
-def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>,
+def : MipsPat<(setle f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>,
       ISA_MIPS32R6;
 def : MipsPat<(setne f64:$lhs, f64:$rhs),
               (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 4e2dcd8..776e473 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -16,6 +16,10 @@
 //===----------------------------------------------------------------------===//
 
 // Unsigned Operand
+def uimm5_64      : Operand<i64> {
+  let PrintMethod = "printUnsignedImm";
+}
+
 def uimm16_64      : Operand<i64> {
   let PrintMethod = "printUnsignedImm";
 }
@@ -41,6 +45,38 @@ def immSExt10_64 : PatLeaf<(i64 imm),
 def immZExt16_64 : PatLeaf<(i64 imm),
                            [{ return isInt<16>(N->getZExtValue()); }]>;
 
+def immZExt5_64 : ImmLeaf<i64, [{ return Imm == (Imm & 0x1f); }]>;
+
+// Transformation function: get log2 of low 32 bits of immediate
+def Log2LO : SDNodeXForm<imm, [{
+  return getImm(N, Log2_64((unsigned) N->getZExtValue()));
+}]>;
+
+// Transformation function: get log2 of high 32 bits of immediate
+def Log2HI : SDNodeXForm<imm, [{
+  return getImm(N, Log2_64((unsigned) (N->getZExtValue() >> 32)));
+}]>;
+
+// Predicate: True if immediate is a power of 2 and fits 32 bits
+def PowerOf2LO : PatLeaf<(imm), [{
+  if (N->getValueType(0) == MVT::i64) {
+    uint64_t Imm = N->getZExtValue();
+    return isPowerOf2_64(Imm) && (Imm & 0xffffffff) == Imm;
+  }
+  else
+    return false;
+}]>;
+
+// Predicate: True if immediate is a power of 2 and exceeds 32 bits
+def PowerOf2HI : PatLeaf<(imm), [{
+  if (N->getValueType(0) == MVT::i64) {
+    uint64_t Imm = N->getZExtValue();
+    return isPowerOf2_64(Imm) && (Imm & 0xffffffff00000000) == Imm;
+  }
+  else
+    return false;
+}]>;
+
 //===----------------------------------------------------------------------===//
 // Instructions specific format
 //===----------------------------------------------------------------------===//
@@ -290,7 +326,8 @@ class ExtsCins<string opstr, SDPatternOperator Op = null_frag>:
 class SetCC64_R<string opstr, PatFrag cond_op> :
   InstSE<(outs GPR64Opnd:$rd), (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
          !strconcat(opstr, "\t$rd, $rs, $rt"),
-         [(set GPR64Opnd:$rd, (cond_op GPR64Opnd:$rs, GPR64Opnd:$rt))],
+         [(set GPR64Opnd:$rd, (zext (cond_op GPR64Opnd:$rs,
+                                             GPR64Opnd:$rt)))],
          II_SEQ_SNE, FrmR, opstr> {
   let TwoOperandAliasConstraint = "$rd = $rs";
 }
@@ -298,17 +335,40 @@ class SetCC64_R<string opstr, PatFrag cond_op> :
 class SetCC64_I<string opstr, PatFrag cond_op>:
   InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, simm10_64:$imm10),
          !strconcat(opstr, "\t$rt, $rs, $imm10"),
-         [(set GPR64Opnd:$rt, (cond_op GPR64Opnd:$rs, immSExt10_64:$imm10))],
+         [(set GPR64Opnd:$rt, (zext (cond_op GPR64Opnd:$rs,
+                                             immSExt10_64:$imm10)))],
          II_SEQI_SNEI, FrmI, opstr> {
   let TwoOperandAliasConstraint = "$rt = $rs";
 }
 
+class CBranchBitNum<string opstr, DAGOperand opnd, PatFrag cond_op,
+                    RegisterOperand RO, bits<64> shift = 1> :
+  InstSE<(outs), (ins RO:$rs, uimm5_64:$p, opnd:$offset),
+         !strconcat(opstr, "\t$rs, $p, $offset"),
+         [(brcond (i32 (cond_op (and RO:$rs, (shl shift, immZExt5_64:$p)), 0)),
+                  bb:$offset)], IIBranch, FrmI, opstr> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let Defs = [AT];
+}
+
 // Unsigned Byte Add
 let Pattern = [(set GPR64Opnd:$rd,
                     (and (add GPR64Opnd:$rs, GPR64Opnd:$rt), 255))] in
 def BADDu  : ArithLogicR<"baddu", GPR64Opnd, 1, II_BADDU>,
                               ADD_FM<0x1c, 0x28>;
 
+// Branch on Bit Clear /+32
+def BBIT0  : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd>, BBIT_FM<0x32>;
+def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, 0x100000000>,
+                           BBIT_FM<0x36>;
+
+// Branch on Bit Set /+32
+def BBIT1  : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd>, BBIT_FM<0x3a>;
+def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, 0x100000000>,
+                           BBIT_FM<0x3e>;
+
 // Multiply Doubleword to GPR
 let Defs = [HI0, LO0, P0, P1, P2] in
 def DMUL  : ArithLogicR<"dmul", GPR64Opnd, 1, II_DMUL, mul>,
@@ -359,6 +419,14 @@ def VMULU : ArithLogicR<"vmulu", GPR64Opnd, 0, II_DMUL>,
 
 }
 
+/// Move between CPU and coprocessor registers
+let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
+def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd>, MFC3OP_FM<0x10, 1>;
+def DMTC0 : MFC3OP<"dmtc0", GPR64Opnd>, MFC3OP_FM<0x10, 5>, ISA_MIPS3;
+def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd>, MFC3OP_FM<0x12, 1>, ISA_MIPS3;
+def DMTC2 : MFC3OP<"dmtc2", GPR64Opnd>, MFC3OP_FM<0x12, 5>, ISA_MIPS3;
+}
+
 //===----------------------------------------------------------------------===//
 //  Arbitrary patterns that map to one or more instructions
 //===----------------------------------------------------------------------===//
@@ -426,6 +494,14 @@ def : MipsPat<(trunc (assertzext GPR64:$src)),
 def : MipsPat<(i32 (trunc GPR64:$src)),
               (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>;
 
+// Bypass trunc nodes for bitwise ops.
+def : MipsPat<(i32 (trunc (and GPR64:$lhs, GPR64:$rhs))),
+              (EXTRACT_SUBREG (AND64 GPR64:$lhs, GPR64:$rhs), sub_32)>;
+def : MipsPat<(i32 (trunc (or GPR64:$lhs, GPR64:$rhs))),
+              (EXTRACT_SUBREG (OR64 GPR64:$lhs, GPR64:$rhs), sub_32)>;
+def : MipsPat<(i32 (trunc (xor GPR64:$lhs, GPR64:$rhs))),
+              (EXTRACT_SUBREG (XOR64 GPR64:$lhs, GPR64:$rhs), sub_32)>;
+
 // 32-to-64-bit extension
 def : MipsPat<(i64 (anyext GPR32:$src)), (SLL64_32 GPR32:$src)>;
 def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>;
@@ -438,6 +514,28 @@ def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
 // bswap MipsPattern
 def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>;
 
+// Carry pattern
+def : MipsPat<(subc GPR64:$lhs, GPR64:$rhs),
+              (DSUBu GPR64:$lhs, GPR64:$rhs)>;
+let AdditionalPredicates = [NotDSP] in {
+  def : MipsPat<(addc GPR64:$lhs, GPR64:$rhs),
+                (DADDu GPR64:$lhs, GPR64:$rhs)>;
+  def : MipsPat<(addc GPR64:$lhs, immSExt16:$imm),
+                (DADDiu GPR64:$lhs, imm:$imm)>;
+}
+
+// Octeon bbit0/bbit1 MipsPattern
+let Predicates = [HasMips64, HasCnMips] in {
+def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
+              (BBIT0 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>;
+def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
+              (BBIT032 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>;
+def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
+              (BBIT1 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>;
+def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
+              (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>;
+}
+
 //===----------------------------------------------------------------------===//
 // Instruction aliases
 //===----------------------------------------------------------------------===//
@@ -489,19 +587,6 @@ def : MipsInstAlias<"dsrl $rd, $rt, $rs",
                     (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
                     ISA_MIPS3;
 
-class LoadImm64< string instr_asm, Operand Od, RegisterOperand RO> :
-  MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64),
-                     !strconcat(instr_asm, "\t$rt, $imm64")> ;
-def LoadImm64Reg : LoadImm64<"dli", imm64, GPR64Opnd>;
-
-/// Move between CPU and coprocessor registers
-let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
-def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd>, MFC3OP_FM<0x10, 1>;
-def DMTC0 : MFC3OP<"dmtc0", GPR64Opnd>, MFC3OP_FM<0x10, 5>, ISA_MIPS3;
-def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd>, MFC3OP_FM<0x12, 1>, ISA_MIPS3;
-def DMTC2 : MFC3OP<"dmtc2", GPR64Opnd>, MFC3OP_FM<0x12, 5>, ISA_MIPS3;
-}
-
 // Two operand (implicit 0 selector) versions:
 def : MipsInstAlias<"dmfc0 $rt, $rd", (DMFC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
 def : MipsInstAlias<"dmtc0 $rt, $rd", (DMTC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
@@ -514,3 +599,12 @@ def : MipsInstAlias<"syncs",      (SYNC 0x6), 0>;
 def : MipsInstAlias<"syncw",      (SYNC 0x4), 0>;
 def : MipsInstAlias<"syncws",     (SYNC 0x5), 0>;
 }
+
+//===----------------------------------------------------------------------===//
+// Assembler Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+class LoadImm64<string instr_asm, Operand Od, RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64),
+                     !strconcat(instr_asm, "\t$rt, $imm64")> ;
+def LoadImm64Reg : LoadImm64<"dli", imm64, GPR64Opnd>;
diff --git a/lib/Target/Mips/MipsABIInfo.cpp b/lib/Target/Mips/MipsABIInfo.cpp
deleted file mode 100644
index f885369..0000000
--- a/lib/Target/Mips/MipsABIInfo.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-//===---- MipsABIInfo.cpp - Information about MIPS ABI's ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MipsABIInfo.h"
-#include "MipsRegisterInfo.h"
-
-using namespace llvm;
-
-namespace {
-static const MCPhysReg O32IntRegs[4] = {Mips::A0, Mips::A1, Mips::A2, Mips::A3};
-
-static const MCPhysReg Mips64IntRegs[8] = {
-    Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
-    Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64};
-}
-
-const ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const {
-  if (IsO32())
-    return makeArrayRef(O32IntRegs);
-  if (IsN32() || IsN64())
-    return makeArrayRef(Mips64IntRegs);
-  llvm_unreachable("Unhandled ABI");
-}
-
-const ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const {
-  if (IsO32())
-    return makeArrayRef(O32IntRegs);
-  if (IsN32() || IsN64())
-    return makeArrayRef(Mips64IntRegs);
-  llvm_unreachable("Unhandled ABI");
-}
-
-unsigned MipsABIInfo::GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const {
-  if (IsO32())
-    return CC != CallingConv::Fast ? 16 : 0;
-  if (IsN32() || IsN64() || IsEABI())
-    return 0;
-  llvm_unreachable("Unhandled ABI");
-}
diff --git a/lib/Target/Mips/MipsABIInfo.h b/lib/Target/Mips/MipsABIInfo.h
deleted file mode 100644
index bea585e..0000000
--- a/lib/Target/Mips/MipsABIInfo.h
+++ /dev/null
@@ -1,61 +0,0 @@
-//===---- MipsABIInfo.h - Information about MIPS ABI's --------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MIPSABIINFO_H
-#define MIPSABIINFO_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/IR/CallingConv.h"
-
-namespace llvm {
-
-class MipsABIInfo {
-public:
-  enum class ABI { Unknown, O32, N32, N64, EABI };
-
-protected:
-  ABI ThisABI;
-
-public:
-  MipsABIInfo(ABI ThisABI) : ThisABI(ThisABI) {}
-
-  static MipsABIInfo Unknown() { return MipsABIInfo(ABI::Unknown); }
-  static MipsABIInfo O32() { return MipsABIInfo(ABI::O32); }
-  static MipsABIInfo N32() { return MipsABIInfo(ABI::N32); }
-  static MipsABIInfo N64() { return MipsABIInfo(ABI::N64); }
-  static MipsABIInfo EABI() { return MipsABIInfo(ABI::EABI); }
-
-  bool IsKnown() const { return ThisABI != ABI::Unknown; }
-  bool IsO32() const { return ThisABI == ABI::O32; }
-  bool IsN32() const { return ThisABI == ABI::N32; }
-  bool IsN64() const { return ThisABI == ABI::N64; }
-  bool IsEABI() const { return ThisABI == ABI::EABI; }
-  ABI GetEnumValue() const { return ThisABI; }
-
-  /// The registers to use for byval arguments.
-  const ArrayRef<MCPhysReg> GetByValArgRegs() const;
-
-  /// The registers to use for the variable argument list.
-  const ArrayRef<MCPhysReg> GetVarArgRegs() const;
-
-  /// Obtain the size of the area allocated by the callee for arguments.
-  /// CallingConv::FastCall affects the value for O32.
-  unsigned GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const;
-
-  /// Ordering of ABI's
-  /// MipsGenSubtargetInfo.inc will use this to resolve conflicts when given
-  /// multiple ABI options.
-  bool operator<(const MipsABIInfo Other) const {
-    return ThisABI < Other.GetEnumValue();
-  }
-};
-}
-
-#endif
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 832fa05..c662e13 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -19,6 +19,7 @@
 #include "MipsAsmPrinter.h"
 #include "MipsInstrInfo.h"
 #include "MipsMCInstLower.h"
+#include "MipsTargetMachine.h"
 #include "MipsTargetStreamer.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
@@ -53,12 +54,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "mips-asm-printer"
 
-MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() {
+MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() const {
   return static_cast<MipsTargetStreamer &>(*OutStreamer.getTargetStreamer());
 }
 
 bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
-  Subtarget = &TM.getSubtarget<MipsSubtarget>();
+  Subtarget = &MF.getSubtarget<MipsSubtarget>();
 
   // Initialize TargetLoweringObjectFile.
   const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
@@ -319,7 +320,7 @@ void MipsAsmPrinter::emitFrameDirective() {
 
 /// Emit Set directives.
 const char *MipsAsmPrinter::getCurrentABIString() const {
-  switch (Subtarget->getABI().GetEnumValue()) {
+  switch (static_cast<MipsTargetMachine &>(TM).getABI().GetEnumValue()) {
   case MipsABIInfo::ABI::O32:  return "abi32";
   case MipsABIInfo::ABI::N32:  return "abiN32";
   case MipsABIInfo::ABI::N64:  return "abi64";
@@ -357,10 +358,7 @@ void MipsAsmPrinter::EmitFunctionBodyStart() {
 
   MCInstLowering.Initialize(&MF->getContext());
 
-  bool IsNakedFunction =
-    MF->getFunction()->
-      getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                   Attribute::Naked);
+  bool IsNakedFunction = MF->getFunction()->hasFnAttribute(Attribute::Naked);
   if (!IsNakedFunction)
     emitFrameDirective();
 
@@ -560,7 +558,7 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 
 void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
                                   raw_ostream &O) {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   const MachineOperand &MO = MI->getOperand(opNum);
   bool closeP = false;
 
@@ -689,7 +687,21 @@ printRegisterList(const MachineInstr *MI, int opNum, raw_ostream &O) {
 }
 
 void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
-  bool IsABICalls = Subtarget->isABICalls();
+
+  // Compute MIPS architecture attributes based on the default subtarget
+  // that we'd have constructed. Module level directives aren't LTO
+  // clean anyhow.
+  // FIXME: For ifunc related functions we could iterate over and look
+  // for a feature string that doesn't match the default one.
+  StringRef TT = TM.getTargetTriple();
+  StringRef CPU =
+      MIPS_MC::selectMipsCPU(TM.getTargetTriple(), TM.getTargetCPU());
+  StringRef FS = TM.getTargetFeatureString();
+  const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM);
+  const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM);
+
+  bool IsABICalls = STI.isABICalls();
+  const MipsABIInfo &ABI = MTM.getABI();
   if (IsABICalls) {
     getTargetStreamer().emitDirectiveAbiCalls();
     Reloc::Model RM = TM.getRelocationModel();
@@ -697,68 +709,88 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
     //        Ideally it should test for properties of the ABI and not the ABI
     //        itself.
     //        For the moment, I'm only correcting enough to make MIPS-IV work.
-    if (RM == Reloc::Static && !Subtarget->isABI_N64())
+    if (RM == Reloc::Static && !ABI.IsN64())
       getTargetStreamer().emitDirectiveOptionPic0();
   }
 
   // Tell the assembler which ABI we are using
   std::string SectionName = std::string(".mdebug.") + getCurrentABIString();
-  OutStreamer.SwitchSection(OutContext.getELFSection(
-      SectionName, ELF::SHT_PROGBITS, 0, SectionKind::getDataRel()));
+  OutStreamer.SwitchSection(
+      OutContext.getELFSection(SectionName, ELF::SHT_PROGBITS, 0));
 
   // NaN: At the moment we only support:
   // 1. .nan legacy (default)
   // 2. .nan 2008
-  Subtarget->isNaN2008() ? getTargetStreamer().emitDirectiveNaN2008()
-    : getTargetStreamer().emitDirectiveNaNLegacy();
+  STI.isNaN2008() ? getTargetStreamer().emitDirectiveNaN2008()
+                  : getTargetStreamer().emitDirectiveNaNLegacy();
 
   // TODO: handle O64 ABI
 
-  if (Subtarget->isABI_EABI()) {
-    if (Subtarget->isGP32bit())
-      OutStreamer.SwitchSection(
-          OutContext.getELFSection(".gcc_compiled_long32", ELF::SHT_PROGBITS, 0,
-                                   SectionKind::getDataRel()));
+  if (ABI.IsEABI()) {
+    if (STI.isGP32bit())
+      OutStreamer.SwitchSection(OutContext.getELFSection(".gcc_compiled_long32",
+                                                         ELF::SHT_PROGBITS, 0));
     else
-      OutStreamer.SwitchSection(
-          OutContext.getELFSection(".gcc_compiled_long64", ELF::SHT_PROGBITS, 0,
-                                   SectionKind::getDataRel()));
+      OutStreamer.SwitchSection(OutContext.getELFSection(".gcc_compiled_long64",
+                                                         ELF::SHT_PROGBITS, 0));
   }
 
-  getTargetStreamer().updateABIInfo(*Subtarget);
+  getTargetStreamer().updateABIInfo(STI);
 
   // We should always emit a '.module fp=...' but binutils 2.24 does not accept
   // it. We therefore emit it when it contradicts the ABI defaults (-mfpxx or
   // -mfp64) and omit it otherwise.
-  if (Subtarget->isABI_O32() && (Subtarget->isABI_FPXX() ||
-                                 Subtarget->isFP64bit()))
+  if (ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit()))
     getTargetStreamer().emitDirectiveModuleFP();
 
   // We should always emit a '.module [no]oddspreg' but binutils 2.24 does not
   // accept it. We therefore emit it when it contradicts the default or an
   // option has changed the default (i.e. FPXX) and omit it otherwise.
-  if (Subtarget->isABI_O32() && (!Subtarget->useOddSPReg() ||
-                                 Subtarget->isABI_FPXX()))
-    getTargetStreamer().emitDirectiveModuleOddSPReg(Subtarget->useOddSPReg(),
-                                                    Subtarget->isABI_O32());
+  if (ABI.IsO32() && (!STI.useOddSPReg() || STI.isABI_FPXX()))
+    getTargetStreamer().emitDirectiveModuleOddSPReg(STI.useOddSPReg(),
+                                                    ABI.IsO32());
+}
+
+void MipsAsmPrinter::emitInlineAsmStart() const {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  // GCC's choice of assembler options for inline assembly code ('at', 'macro'
+  // and 'reorder') is different from LLVM's choice for generated code ('noat',
+  // 'nomacro' and 'noreorder').
+  // In order to maintain compatibility with inline assembly code which depends
+  // on GCC's assembler options being used, we have to switch to those options
+  // for the duration of the inline assembly block and then switch back.
+  TS.emitDirectiveSetPush();
+  TS.emitDirectiveSetAt();
+  TS.emitDirectiveSetMacro();
+  TS.emitDirectiveSetReorder();
+  OutStreamer.AddBlankLine();
 }
 
-void MipsAsmPrinter::EmitJal(MCSymbol *Symbol) {
+void MipsAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+                                      const MCSubtargetInfo *EndInfo) const {
+  OutStreamer.AddBlankLine();
+  getTargetStreamer().emitDirectiveSetPop();
+}
+
+void MipsAsmPrinter::EmitJal(const MCSubtargetInfo &STI, MCSymbol *Symbol) {
   MCInst I;
   I.setOpcode(Mips::JAL);
   I.addOperand(
       MCOperand::CreateExpr(MCSymbolRefExpr::Create(Symbol, OutContext)));
-  OutStreamer.EmitInstruction(I, getSubtargetInfo());
+  OutStreamer.EmitInstruction(I, STI);
 }
 
-void MipsAsmPrinter::EmitInstrReg(unsigned Opcode, unsigned Reg) {
+void MipsAsmPrinter::EmitInstrReg(const MCSubtargetInfo &STI, unsigned Opcode,
+                                  unsigned Reg) {
   MCInst I;
   I.setOpcode(Opcode);
   I.addOperand(MCOperand::CreateReg(Reg));
-  OutStreamer.EmitInstruction(I, getSubtargetInfo());
+  OutStreamer.EmitInstruction(I, STI);
 }
 
-void MipsAsmPrinter::EmitInstrRegReg(unsigned Opcode, unsigned Reg1,
+void MipsAsmPrinter::EmitInstrRegReg(const MCSubtargetInfo &STI,
+                                     unsigned Opcode, unsigned Reg1,
                                      unsigned Reg2) {
   MCInst I;
   //
@@ -774,20 +806,22 @@ void MipsAsmPrinter::EmitInstrRegReg(unsigned Opcode, unsigned Reg1,
   I.setOpcode(Opcode);
   I.addOperand(MCOperand::CreateReg(Reg1));
   I.addOperand(MCOperand::CreateReg(Reg2));
-  OutStreamer.EmitInstruction(I, getSubtargetInfo());
+  OutStreamer.EmitInstruction(I, STI);
 }
 
-void MipsAsmPrinter::EmitInstrRegRegReg(unsigned Opcode, unsigned Reg1,
+void MipsAsmPrinter::EmitInstrRegRegReg(const MCSubtargetInfo &STI,
+                                        unsigned Opcode, unsigned Reg1,
                                         unsigned Reg2, unsigned Reg3) {
   MCInst I;
   I.setOpcode(Opcode);
   I.addOperand(MCOperand::CreateReg(Reg1));
   I.addOperand(MCOperand::CreateReg(Reg2));
   I.addOperand(MCOperand::CreateReg(Reg3));
-  OutStreamer.EmitInstruction(I, getSubtargetInfo());
+  OutStreamer.EmitInstruction(I, STI);
 }
 
-void MipsAsmPrinter::EmitMovFPIntPair(unsigned MovOpc, unsigned Reg1,
+void MipsAsmPrinter::EmitMovFPIntPair(const MCSubtargetInfo &STI,
+                                      unsigned MovOpc, unsigned Reg1,
                                       unsigned Reg2, unsigned FPReg1,
                                       unsigned FPReg2, bool LE) {
   if (!LE) {
@@ -795,59 +829,60 @@ void MipsAsmPrinter::EmitMovFPIntPair(unsigned MovOpc, unsigned Reg1,
     Reg1 = Reg2;
     Reg2 = temp;
   }
-  EmitInstrRegReg(MovOpc, Reg1, FPReg1);
-  EmitInstrRegReg(MovOpc, Reg2, FPReg2);
+  EmitInstrRegReg(STI, MovOpc, Reg1, FPReg1);
+  EmitInstrRegReg(STI, MovOpc, Reg2, FPReg2);
 }
 
-void MipsAsmPrinter::EmitSwapFPIntParams(Mips16HardFloatInfo::FPParamVariant PV,
+void MipsAsmPrinter::EmitSwapFPIntParams(const MCSubtargetInfo &STI,
+                                         Mips16HardFloatInfo::FPParamVariant PV,
                                          bool LE, bool ToFP) {
   using namespace Mips16HardFloatInfo;
   unsigned MovOpc = ToFP ? Mips::MTC1 : Mips::MFC1;
   switch (PV) {
   case FSig:
-    EmitInstrRegReg(MovOpc, Mips::A0, Mips::F12);
+    EmitInstrRegReg(STI, MovOpc, Mips::A0, Mips::F12);
     break;
   case FFSig:
-    EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F14, LE);
+    EmitMovFPIntPair(STI, MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F14, LE);
     break;
   case FDSig:
-    EmitInstrRegReg(MovOpc, Mips::A0, Mips::F12);
-    EmitMovFPIntPair(MovOpc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
+    EmitInstrRegReg(STI, MovOpc, Mips::A0, Mips::F12);
+    EmitMovFPIntPair(STI, MovOpc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
     break;
   case DSig:
-    EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    EmitMovFPIntPair(STI, MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
     break;
   case DDSig:
-    EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
-    EmitMovFPIntPair(MovOpc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
+    EmitMovFPIntPair(STI, MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    EmitMovFPIntPair(STI, MovOpc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
     break;
   case DFSig:
-    EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
-    EmitInstrRegReg(MovOpc, Mips::A2, Mips::F14);
+    EmitMovFPIntPair(STI, MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    EmitInstrRegReg(STI, MovOpc, Mips::A2, Mips::F14);
     break;
   case NoSig:
     return;
   }
 }
 
-void
-MipsAsmPrinter::EmitSwapFPIntRetval(Mips16HardFloatInfo::FPReturnVariant RV,
-                                    bool LE) {
+void MipsAsmPrinter::EmitSwapFPIntRetval(
+    const MCSubtargetInfo &STI, Mips16HardFloatInfo::FPReturnVariant RV,
+    bool LE) {
   using namespace Mips16HardFloatInfo;
   unsigned MovOpc = Mips::MFC1;
   switch (RV) {
   case FRet:
-    EmitInstrRegReg(MovOpc, Mips::V0, Mips::F0);
+    EmitInstrRegReg(STI, MovOpc, Mips::V0, Mips::F0);
     break;
   case DRet:
-    EmitMovFPIntPair(MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+    EmitMovFPIntPair(STI, MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
     break;
   case CFRet:
-    EmitMovFPIntPair(MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+    EmitMovFPIntPair(STI, MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
     break;
   case CDRet:
-    EmitMovFPIntPair(MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
-    EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F2, Mips::F3, LE);
+    EmitMovFPIntPair(STI, MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+    EmitMovFPIntPair(STI, MovOpc, Mips::A0, Mips::A1, Mips::F2, Mips::F3, LE);
     break;
   case NoFPRet:
     break;
@@ -858,7 +893,14 @@ void MipsAsmPrinter::EmitFPCallStub(
     const char *Symbol, const Mips16HardFloatInfo::FuncSignature *Signature) {
   MCSymbol *MSymbol = OutContext.GetOrCreateSymbol(StringRef(Symbol));
   using namespace Mips16HardFloatInfo;
-  bool LE = Subtarget->isLittle();
+  bool LE = getDataLayout().isLittleEndian();
+  // Construct a local MCSubtargetInfo here.
+  // This is because the MachineFunction won't exist (but have not yet been
+  // freed) and since we're at the global level we can use the default
+  // constructed subtarget.
+  std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+      TM.getTargetTriple(), TM.getTargetCPU(), TM.getTargetFeatureString()));
+
   //
   // .global xxxx
   //
@@ -921,7 +963,7 @@ void MipsAsmPrinter::EmitFPCallStub(
   //
   const MCSectionELF *M = OutContext.getELFSection(
       ".mips16.call.fp." + std::string(Symbol), ELF::SHT_PROGBITS,
-      ELF::SHF_ALLOC | ELF::SHF_EXECINSTR, SectionKind::getText());
+      ELF::SHF_ALLOC | ELF::SHF_EXECINSTR);
   OutStreamer.SwitchSection(M, nullptr);
   //
   // .align 2
@@ -946,13 +988,10 @@ void MipsAsmPrinter::EmitFPCallStub(
       OutContext.GetOrCreateSymbol("__call_stub_fp_" + Twine(Symbol));
   OutStreamer.EmitSymbolAttribute(MType, MCSA_ELF_TypeFunction);
   OutStreamer.EmitLabel(Stub);
-  //
-  // we just handle non pic for now. these function will not be
-  // called otherwise. when the full stub generation is moved here
-  // we need to deal with pic.
-  //
-  if (Subtarget->getRelocationModel() == Reloc::PIC_)
-    llvm_unreachable("should not be here if we are compiling pic");
+
+  // Only handle non-pic for now.
+  assert(TM.getRelocationModel() != Reloc::PIC_ &&
+         "should not be here if we are compiling pic");
   TS.emitDirectiveSetReorder();
   //
   // We need to add a MipsMCExpr class to MCTargetDesc to fully implement
@@ -969,22 +1008,22 @@ void MipsAsmPrinter::EmitFPCallStub(
   //
   // Mov $18, $31
 
-  EmitInstrRegRegReg(Mips::ADDu, Mips::S2, Mips::RA, Mips::ZERO);
+  EmitInstrRegRegReg(*STI, Mips::ADDu, Mips::S2, Mips::RA, Mips::ZERO);
 
-  EmitSwapFPIntParams(Signature->ParamSig, LE, true);
+  EmitSwapFPIntParams(*STI, Signature->ParamSig, LE, true);
 
   // Jal xxxx
   //
-  EmitJal(MSymbol);
+  EmitJal(*STI, MSymbol);
 
   // fix return values
-  EmitSwapFPIntRetval(Signature->RetSig, LE);
+  EmitSwapFPIntRetval(*STI, Signature->RetSig, LE);
   //
   // do the return
   // if (Signature->RetSig == NoFPRet)
   //  llvm_unreachable("should not be any stubs here with no return value");
   // else
-  EmitInstrReg(Mips::JR, Mips::S2);
+  EmitInstrReg(*STI, Mips::JR, Mips::S2);
 
   MCSymbol *Tmp = OutContext.CreateTempSymbol();
   OutStreamer.EmitLabel(Tmp);
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index 0582e21..d4c5b80 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -31,7 +31,7 @@ class Module;
 class raw_ostream;
 
 class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
-  MipsTargetStreamer &getTargetStreamer();
+  MipsTargetStreamer &getTargetStreamer() const;
 
   void EmitInstrWithMacroNoAT(const MachineInstr *MI);
 
@@ -60,22 +60,31 @@ private:
   std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
   StubsNeeded;
 
-  void EmitJal(MCSymbol *Symbol);
+  void emitInlineAsmStart() const override;
 
-  void EmitInstrReg(unsigned Opcode, unsigned Reg);
+  void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+                        const MCSubtargetInfo *EndInfo) const override;
 
-  void EmitInstrRegReg(unsigned Opcode, unsigned Reg1, unsigned Reg2);
+  void EmitJal(const MCSubtargetInfo &STI, MCSymbol *Symbol);
 
-  void EmitInstrRegRegReg(unsigned Opcode, unsigned Reg1, unsigned Reg2,
-                          unsigned Reg3);
+  void EmitInstrReg(const MCSubtargetInfo &STI, unsigned Opcode, unsigned Reg);
 
-  void EmitMovFPIntPair(unsigned MovOpc, unsigned Reg1, unsigned Reg2,
-                        unsigned FPReg1, unsigned FPReg2, bool LE);
+  void EmitInstrRegReg(const MCSubtargetInfo &STI, unsigned Opcode,
+                       unsigned Reg1, unsigned Reg2);
 
-  void EmitSwapFPIntParams(Mips16HardFloatInfo::FPParamVariant, bool LE,
+  void EmitInstrRegRegReg(const MCSubtargetInfo &STI, unsigned Opcode,
+                          unsigned Reg1, unsigned Reg2, unsigned Reg3);
+
+  void EmitMovFPIntPair(const MCSubtargetInfo &STI, unsigned MovOpc,
+                        unsigned Reg1, unsigned Reg2, unsigned FPReg1,
+                        unsigned FPReg2, bool LE);
+
+  void EmitSwapFPIntParams(const MCSubtargetInfo &STI,
+                           Mips16HardFloatInfo::FPParamVariant, bool LE,
                            bool ToFP);
 
-  void EmitSwapFPIntRetval(Mips16HardFloatInfo::FPReturnVariant, bool LE);
+  void EmitSwapFPIntRetval(const MCSubtargetInfo &STI,
+                           Mips16HardFloatInfo::FPReturnVariant, bool LE);
 
   void EmitFPCallStub(const char *, const Mips16HardFloatInfo::FuncSignature *);
 
@@ -89,14 +98,10 @@ public:
   const MipsFunctionInfo *MipsFI;
   MipsMCInstLower MCInstLowering;
 
-  // We initialize the subtarget here and in runOnMachineFunction
-  // since there are certain target specific flags (ABI) that could
-  // reside on the TargetMachine, but are on the subtarget currently
-  // and we need them for the beginning of file output before we've
-  // seen a single function.
-  explicit MipsAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : AsmPrinter(TM, Streamer), MCP(nullptr), InConstantPool(false),
-        Subtarget(&TM.getSubtarget<MipsSubtarget>()), MCInstLowering(*this) {}
+  explicit MipsAsmPrinter(TargetMachine &TM,
+                          std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), MCP(nullptr),
+        InConstantPool(false), MCInstLowering(*this) {}
 
   const char *getPassName() const override {
     return "Mips Assembly Printer";
diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp
index e18cc8b..b808129 100644
--- a/lib/Target/Mips/MipsCCState.cpp
+++ b/lib/Target/Mips/MipsCCState.cpp
@@ -132,8 +132,8 @@ void MipsCCState::PreAnalyzeFormalArgumentsForF128(
       continue;
     }
 
-    assert(Ins[i].OrigArgIndex < MF.getFunction()->arg_size());
-    std::advance(FuncArg, Ins[i].OrigArgIndex);
+    assert(Ins[i].getOrigArgIndex() < MF.getFunction()->arg_size());
+    std::advance(FuncArg, Ins[i].getOrigArgIndex());
 
     OriginalArgWasF128.push_back(
         originalTypeIsF128(FuncArg->getType(), nullptr));
diff --git a/lib/Target/Mips/MipsCCState.h b/lib/Target/Mips/MipsCCState.h
index cc4531d..081c393 100644
--- a/lib/Target/Mips/MipsCCState.h
+++ b/lib/Target/Mips/MipsCCState.h
@@ -10,9 +10,9 @@
 #ifndef MIPSCCSTATE_H
 #define MIPSCCSTATE_H
 
+#include "MipsISelLowering.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/CallingConvLower.h"
-#include "MipsISelLowering.h"
 
 namespace llvm {
 class SDNode;
@@ -85,10 +85,10 @@ public:
   // provide a means of accessing ArgListEntry::IsFixed. Delete them from this
   // class. This doesn't stop them being used via the base class though.
   void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
-                           CCAssignFn Fn) LLVM_DELETED_FUNCTION;
+                           CCAssignFn Fn) = delete;
   void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
                            SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
-                           CCAssignFn Fn) LLVM_DELETED_FUNCTION;
+                           CCAssignFn Fn) = delete;
 
   void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
                               CCAssignFn Fn) {
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 7318de2..abee185 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -20,6 +20,29 @@ class CCIfSubtarget<string F, CCAction A, string Invert = "">
 // The inverse of CCIfSubtarget
 class CCIfSubtargetNot<string F, CCAction A> : CCIfSubtarget<F, A, "!">;
 
+/// Match if the original argument (before lowering) was a float.
+/// For example, this is true for i32's that were lowered from soft-float.
+class CCIfOrigArgWasNotFloat<CCAction A>
+    : CCIf<"!static_cast<MipsCCState *>(&State)->WasOriginalArgFloat(ValNo)",
+           A>;
+
+/// Match if the original argument (before lowering) was a 128-bit float (i.e.
+/// long double).
+class CCIfOrigArgWasF128<CCAction A>
+    : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>;
+
+/// Match if this specific argument is a vararg.
+/// This is slightly different fro CCIfIsVarArg which matches if any argument is
+/// a vararg.
+class CCIfArgIsVarArg<CCAction A>
+    : CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)", A>;
+
+
+/// Match if the special calling conv is the specified value.
+class CCIfSpecialCallingConv<string CC, CCAction A>
+    : CCIf<"static_cast<MipsCCState *>(&State)->getSpecialCallingConv() == "
+               "MipsCCState::" # CC, A>;
+
 // For soft-float, f128 values are returned in A0_64 rather than V1_64.
 def RetCC_F128SoftFloat : CallingConv<[
   CCAssignToReg<[V0_64, A0_64]>
@@ -105,9 +128,7 @@ def CC_MipsN : CallingConv<[
           CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
 
   // All integers (except soft-float integers) are promoted to 64-bit.
-  CCIfType<[i8, i16, i32],
-     CCIf<"!static_cast<MipsCCState *>(&State)->WasOriginalArgFloat(ValNo)",
-          CCPromoteToType<i64>>>,
+  CCIfType<[i8, i16, i32], CCIfOrigArgWasNotFloat<CCPromoteToType<i64>>>,
 
   // The only i32's we have left are soft-float arguments.
   CCIfSubtarget<"abiUsesSoftFloat()", CCIfType<[i32], CCDelegateTo<CC_MipsN_SoftFloat>>>,
@@ -138,6 +159,10 @@ def CC_MipsN : CallingConv<[
 // N32/64 variable arguments.
 // All arguments are passed in integer registers.
 def CC_MipsN_VarArg : CallingConv<[
+  CCIfType<[i8, i16, i32, i64],
+      CCIfSubtargetNot<"isLittle()",
+          CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
+
   // All integers are promoted to 64-bit.
   CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
 
@@ -162,9 +187,7 @@ def RetCC_MipsN : CallingConv<[
   //
   // f128 should only occur for the N64 ABI where long double is 128-bit. On
   // N32, long double is equivalent to double.
-  CCIfType<[i64],
-      CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)",
-           CCDelegateTo<RetCC_F128>>>,
+  CCIfType<[i64], CCIfOrigArgWasF128<CCDelegateTo<RetCC_F128>>>,
 
   // Aggregate returns are positioned at the lowest address in the slot for
   // both little and big-endian targets. When passing in registers, this
@@ -330,8 +353,7 @@ def CC_Mips16RetHelper : CallingConv<[
 def CC_Mips_FixedArg : CallingConv<[
   // Mips16 needs special handling on some functions.
   CCIf<"State.getCallingConv() != CallingConv::Fast",
-      CCIf<"static_cast<MipsCCState *>(&State)->getSpecialCallingConv() == "
-               "MipsCCState::Mips16RetHelperConv",
+      CCIfSpecialCallingConv<"Mips16RetHelperConv",
            CCDelegateTo<CC_Mips16RetHelper>>>,
 
   CCIfByVal<CCDelegateTo<CC_Mips_ByVal>>,
@@ -348,8 +370,7 @@ def CC_Mips_FixedArg : CallingConv<[
   // N32, long double is equivalent to double.
   CCIfType<[i64],
       CCIfSubtargetNot<"abiUsesSoftFloat()",
-          CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)",
-              CCBitConvertToType<f64>>>>,
+          CCIfOrigArgWasF128<CCBitConvertToType<f64>>>>,
 
   CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_Mips_FastCC>>,
 
@@ -369,9 +390,7 @@ def CC_Mips_VarArg : CallingConv<[
 ]>;
 
 def CC_Mips : CallingConv<[
-  CCIfVarArg<
-      CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)",
-          CCDelegateTo<CC_Mips_VarArg>>>,
+  CCIfVarArg<CCIfArgIsVarArg<CCDelegateTo<CC_Mips_VarArg>>>,
   CCDelegateTo<CC_Mips_FixedArg>
 ]>;
 
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index 690f626..af10cd4 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -263,3 +263,40 @@ defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
        FGR_64;
 defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
        FGR_64;
+
+// For targets that don't have conditional-move instructions
+// we have to match SELECT nodes with pseudo instructions.
+let usesCustomInserter = 1 in {
+  class Select_Pseudo<RegisterOperand RC> :
+    PseudoSE<(outs RC:$dst), (ins GPR32Opnd:$cond, RC:$T, RC:$F),
+            [(set RC:$dst, (select GPR32Opnd:$cond, RC:$T, RC:$F))]>,
+    ISA_MIPS1_NOT_4_32;
+
+  class SelectFP_Pseudo_T<RegisterOperand RC> :
+    PseudoSE<(outs RC:$dst), (ins GPR32Opnd:$cond, RC:$T, RC:$F),
+             [(set RC:$dst, (MipsCMovFP_T RC:$T, GPR32Opnd:$cond, RC:$F))]>,
+    ISA_MIPS1_NOT_4_32;
+
+  class SelectFP_Pseudo_F<RegisterOperand RC> :
+    PseudoSE<(outs RC:$dst), (ins GPR32Opnd:$cond, RC:$T, RC:$F),
+             [(set RC:$dst, (MipsCMovFP_F RC:$T, GPR32Opnd:$cond, RC:$F))]>,
+    ISA_MIPS1_NOT_4_32;
+}
+
+def PseudoSELECT_I : Select_Pseudo<GPR32Opnd>;
+def PseudoSELECT_I64 : Select_Pseudo<GPR64Opnd>;
+def PseudoSELECT_S : Select_Pseudo<FGR32Opnd>;
+def PseudoSELECT_D32 : Select_Pseudo<AFGR64Opnd>, FGR_32;
+def PseudoSELECT_D64 : Select_Pseudo<FGR64Opnd>, FGR_64;
+
+def PseudoSELECTFP_T_I : SelectFP_Pseudo_T<GPR32Opnd>;
+def PseudoSELECTFP_T_I64 : SelectFP_Pseudo_T<GPR64Opnd>;
+def PseudoSELECTFP_T_S : SelectFP_Pseudo_T<FGR32Opnd>;
+def PseudoSELECTFP_T_D32 : SelectFP_Pseudo_T<AFGR64Opnd>, FGR_32;
+def PseudoSELECTFP_T_D64 : SelectFP_Pseudo_T<FGR64Opnd>, FGR_64;
+
+def PseudoSELECTFP_F_I : SelectFP_Pseudo_F<GPR32Opnd>;
+def PseudoSELECTFP_F_I64 : SelectFP_Pseudo_F<GPR64Opnd>;
+def PseudoSELECTFP_F_S : SelectFP_Pseudo_F<FGR32Opnd>;
+def PseudoSELECTFP_F_D32 : SelectFP_Pseudo_F<AFGR64Opnd>, FGR_32;
+def PseudoSELECTFP_F_D64 : SelectFP_Pseudo_F<FGR64Opnd>, FGR_64;
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index c4e5ac0..96553d2 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -448,14 +448,12 @@ bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
   // FIXME:
   MF = &mf;
   MCP = mf.getConstantPool();
-  STI = &mf.getTarget().getSubtarget<MipsSubtarget>();
+  STI = &static_cast<const MipsSubtarget &>(mf.getSubtarget());
   DEBUG(dbgs() << "constant island machine function " << "\n");
   if (!STI->inMips16Mode() || !MipsSubtarget::useConstantIslands()) {
     return false;
   }
-  TII = (const Mips16InstrInfo *)MF->getTarget()
-            .getSubtargetImpl()
-            ->getInstrInfo();
+  TII = (const Mips16InstrInfo *)STI->getInstrInfo();
   MFI = MF->getInfo<MipsFunctionInfo>();
   DEBUG(dbgs() << "constant island processing " << "\n");
   //
@@ -562,7 +560,7 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
   // identity mapping of CPI's to CPE's.
   const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
 
-  const DataLayout &TD = *MF->getSubtarget().getDataLayout();
+  const DataLayout &TD = *MF->getTarget().getDataLayout();
   for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
     unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
     assert(Size >= 4 && "Too small constant pool entry");
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index d7ba6d4..ac03c0b 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -69,7 +69,7 @@ namespace {
 
   class RegDefsUses {
   public:
-    RegDefsUses(TargetMachine &TM);
+    RegDefsUses(const TargetRegisterInfo &TRI);
     void init(const MachineInstr &MI);
 
     /// This function sets all caller-saved registers in Defs.
@@ -196,6 +196,12 @@ namespace {
   private:
     bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
 
+    Iter replaceWithCompactBranch(MachineBasicBlock &MBB,
+                                  Iter Branch, DebugLoc DL);
+
+    Iter replaceWithCompactJump(MachineBasicBlock &MBB,
+                                Iter Jump, DebugLoc DL);
+
     /// This function checks if it is valid to move Candidate to the delay slot
     /// and returns true if it isn't. It also updates memory and register
     /// dependence information.
@@ -207,7 +213,7 @@ namespace {
     template<typename IterTy>
     bool searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
                      RegDefsUses &RegDU, InspectMemInstr &IM,
-                     IterTy &Filler) const;
+                     IterTy &Filler, Iter Slot) const;
 
     /// This function searches in the backward direction for an instruction that
     /// can be moved to the delay slot. Returns true on success.
@@ -275,11 +281,7 @@ static void addLiveInRegs(Iter Filler, MachineBasicBlock &MBB) {
 
 #ifndef NDEBUG
     const MachineFunction &MF = *MBB.getParent();
-    assert(MF.getTarget()
-               .getSubtargetImpl()
-               ->getRegisterInfo()
-               ->getAllocatableSet(MF)
-               .test(R) &&
+    assert(MF.getSubtarget().getRegisterInfo()->getAllocatableSet(MF).test(R) &&
            "Shouldn't move an instruction with unallocatable registers across "
            "basic block boundaries.");
 #endif
@@ -289,9 +291,8 @@ static void addLiveInRegs(Iter Filler, MachineBasicBlock &MBB) {
   }
 }
 
-RegDefsUses::RegDefsUses(TargetMachine &TM)
-    : TRI(*TM.getSubtargetImpl()->getRegisterInfo()),
-      Defs(TRI.getNumRegs(), false), Uses(TRI.getNumRegs(), false) {}
+RegDefsUses::RegDefsUses(const TargetRegisterInfo &TRI)
+    : TRI(TRI), Defs(TRI.getNumRegs(), false), Uses(TRI.getNumRegs(), false) {}
 
 void RegDefsUses::init(const MachineInstr &MI) {
   // Add all register operands which are explicit and non-variadic.
@@ -494,42 +495,135 @@ getUnderlyingObjects(const MachineInstr &MI,
   return true;
 }
 
+// Replace Branch with the compact branch instruction.
+Iter Filler::replaceWithCompactBranch(MachineBasicBlock &MBB,
+                                      Iter Branch, DebugLoc DL) {
+  const MipsInstrInfo *TII =
+      MBB.getParent()->getSubtarget<MipsSubtarget>().getInstrInfo();
+
+  unsigned NewOpcode =
+    (((unsigned) Branch->getOpcode()) == Mips::BEQ) ? Mips::BEQZC_MM
+                                                    : Mips::BNEZC_MM;
+
+  const MCInstrDesc &NewDesc = TII->get(NewOpcode);
+  MachineInstrBuilder MIB = BuildMI(MBB, Branch, DL, NewDesc);
+
+  MIB.addReg(Branch->getOperand(0).getReg());
+  MIB.addMBB(Branch->getOperand(2).getMBB());
+
+  Iter tmpIter = Branch;
+  Branch = std::prev(Branch);
+  MBB.erase(tmpIter);
+
+  return Branch;
+}
+
+// Replace Jumps with the compact jump instruction.
+Iter Filler::replaceWithCompactJump(MachineBasicBlock &MBB,
+                                    Iter Jump, DebugLoc DL) {
+  const MipsInstrInfo *TII =
+      MBB.getParent()->getSubtarget<MipsSubtarget>().getInstrInfo();
+
+  const MCInstrDesc &NewDesc = TII->get(Mips::JRC16_MM);
+  MachineInstrBuilder MIB = BuildMI(MBB, Jump, DL, NewDesc);
+
+  MIB.addReg(Jump->getOperand(0).getReg());
+
+  Iter tmpIter = Jump;
+  Jump = std::prev(Jump);
+  MBB.erase(tmpIter);
+
+  return Jump;
+}
+
+// For given opcode returns opcode of corresponding instruction with short
+// delay slot.
+static int getEquivalentCallShort(int Opcode) {
+  switch (Opcode) {
+  case Mips::BGEZAL:
+    return Mips::BGEZALS_MM;
+  case Mips::BLTZAL:
+    return Mips::BLTZALS_MM;
+  case Mips::JAL:
+    return Mips::JALS_MM;
+  case Mips::JALR:
+    return Mips::JALRS_MM;
+  case Mips::JALR16_MM:
+    return Mips::JALRS16_MM;
+  default:
+    llvm_unreachable("Unexpected call instruction for microMIPS.");
+  }
+}
+
 /// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
 /// We assume there is only one delay slot per delayed instruction.
 bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
   bool Changed = false;
-  bool InMicroMipsMode = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode();
+  const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
+  bool InMicroMipsMode = STI.inMicroMipsMode();
+  const MipsInstrInfo *TII = STI.getInstrInfo();
 
   for (Iter I = MBB.begin(); I != MBB.end(); ++I) {
     if (!hasUnoccupiedSlot(&*I))
       continue;
 
-    // For microMIPS, at the moment, do not fill delay slots of call
-    // instructions.
-    //
-    // TODO: Support for replacing regular call instructions with corresponding
-    // short delay slot instructions should be implemented.
-    if (!InMicroMipsMode || !I->isCall()) {
-      ++FilledSlots;
-      Changed = true;
-
-      // Delay slot filling is disabled at -O0.
-      if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
-        if (searchBackward(MBB, I))
-          continue;
+    ++FilledSlots;
+    Changed = true;
 
-        if (I->isTerminator()) {
-          if (searchSuccBBs(MBB, I))
-            continue;
-        } else if (searchForward(MBB, I)) {
-          continue;
+    // Delay slot filling is disabled at -O0.
+    if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
+      bool Filled = false;
+
+      if (searchBackward(MBB, I)) {
+        Filled = true;
+      } else if (I->isTerminator()) {
+        if (searchSuccBBs(MBB, I)) {
+          Filled = true;
+        }
+      } else if (searchForward(MBB, I)) {
+        Filled = true;
+      }
+
+      if (Filled) {
+        // Get instruction with delay slot.
+        MachineBasicBlock::instr_iterator DSI(I);
+
+        if (InMicroMipsMode && TII->GetInstSizeInBytes(std::next(DSI)) == 2 &&
+            DSI->isCall()) {
+          // If instruction in delay slot is 16b change opcode to
+          // corresponding instruction with short delay slot.
+          DSI->setDesc(TII->get(getEquivalentCallShort(DSI->getOpcode())));
         }
+
+        continue;
       }
     }
 
+    // If instruction is BEQ or BNE with one ZERO register, then instead of
+    // adding NOP replace this instruction with the corresponding compact
+    // branch instruction, i.e. BEQZC or BNEZC.
+    unsigned Opcode = I->getOpcode();
+    if (InMicroMipsMode) {
+      switch (Opcode) {
+        case Mips::BEQ:
+        case Mips::BNE:
+          if (((unsigned) I->getOperand(1).getReg()) == Mips::ZERO) {
+            I = replaceWithCompactBranch(MBB, I, I->getDebugLoc());
+            continue;
+          }
+          break;
+        case Mips::JR:
+        case Mips::PseudoReturn:
+        case Mips::PseudoIndirectBranch:
+          // For microMIPS the PseudoReturn and PseudoIndirectBranch are allways
+          // expanded to JR_MM, so they can be replaced with JRC16_MM.
+          I = replaceWithCompactJump(MBB, I, I->getDebugLoc());
+          continue;
+        default:
+          break;
+      }
+    }
     // Bundle the NOP to the instruction with the delay slot.
-    const MipsInstrInfo *TII = static_cast<const MipsInstrInfo *>(
-        TM.getSubtargetImpl()->getInstrInfo());
     BuildMI(MBB, std::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
     MIBundleBuilder(MBB, I, std::next(I, 2));
   }
@@ -546,7 +640,7 @@ FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
 template<typename IterTy>
 bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
                          RegDefsUses &RegDU, InspectMemInstr& IM,
-                         IterTy &Filler) const {
+                         IterTy &Filler, Iter Slot) const {
   for (IterTy I = Begin; I != End; ++I) {
     // skip debug value
     if (I->isDebugValue())
@@ -561,7 +655,8 @@ bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
     if (delayHasHazard(*I, RegDU, IM))
       continue;
 
-    if (TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) {
+    const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
+    if (STI.isTargetNaCl()) {
       // In NaCl, instructions that must be masked are forbidden in delay slots.
       // We only check for loads, stores and SP changes.  Calls, returns and
       // branches are not checked because non-NaCl targets never put them in
@@ -569,11 +664,18 @@ bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
       unsigned AddrIdx;
       if ((isBasePlusOffsetMemoryAccess(I->getOpcode(), &AddrIdx) &&
            baseRegNeedsLoadStoreMask(I->getOperand(AddrIdx).getReg())) ||
-          I->modifiesRegister(Mips::SP,
-                              TM.getSubtargetImpl()->getRegisterInfo()))
+          I->modifiesRegister(Mips::SP, STI.getRegisterInfo()))
         continue;
     }
 
+    bool InMicroMipsMode = STI.inMicroMipsMode();
+    const MipsInstrInfo *TII = STI.getInstrInfo();
+    unsigned Opcode = (*Slot).getOpcode();
+    if (InMicroMipsMode && TII->GetInstSizeInBytes(&(*I)) == 2 &&
+        (Opcode == Mips::JR || Opcode == Mips::PseudoIndirectBranch ||
+         Opcode == Mips::PseudoReturn))
+      continue;
+
     Filler = I;
     return true;
   }
@@ -585,13 +687,14 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const {
   if (DisableBackwardSearch)
     return false;
 
-  RegDefsUses RegDU(TM);
+  RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo());
   MemDefsUses MemDU(MBB.getParent()->getFrameInfo());
   ReverseIter Filler;
 
   RegDU.init(*Slot);
 
-  if (!searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler))
+  if (!searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler,
+      Slot))
     return false;
 
   MBB.splice(std::next(Slot), &MBB, std::next(Filler).base());
@@ -605,13 +708,13 @@ bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
   if (DisableForwardSearch || !Slot->isCall())
     return false;
 
-  RegDefsUses RegDU(TM);
+  RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo());
   NoMemInstr NM;
   Iter Filler;
 
   RegDU.setCallerSaved(*Slot);
 
-  if (!searchRange(MBB, std::next(Slot), MBB.end(), RegDU, NM, Filler))
+  if (!searchRange(MBB, std::next(Slot), MBB.end(), RegDU, NM, Filler, Slot))
     return false;
 
   MBB.splice(std::next(Slot), &MBB, Filler);
@@ -629,7 +732,7 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
   if (!SuccBB)
     return false;
 
-  RegDefsUses RegDU(TM);
+  RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo());
   bool HasMultipleSuccs = false;
   BB2BrMap BrMap;
   std::unique_ptr<InspectMemInstr> IM;
@@ -654,7 +757,8 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
     IM.reset(new MemDefsUses(MFI));
   }
 
-  if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Filler))
+  if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Filler,
+      Slot))
     return false;
 
   insertDelayFiller(Filler, BrMap);
@@ -681,7 +785,7 @@ MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
 std::pair<MipsInstrInfo::BranchType, MachineInstr *>
 Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
   const MipsInstrInfo *TII =
-      static_cast<const MipsInstrInfo *>(TM.getSubtargetImpl()->getInstrInfo());
+      MBB.getParent()->getSubtarget<MipsSubtarget>().getInstrInfo();
   MachineBasicBlock *TrueBB = nullptr, *FalseBB = nullptr;
   SmallVector<MachineInstr*, 2> BranchInstrs;
   SmallVector<MachineOperand, 2> Cond;
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index 2bb16e3..7d69659 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -1,19 +1,21 @@
 //===-- MipsastISel.cpp - Mips FastISel implementation
 //---------------------===//
 
-#include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/FastISel.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "MipsCCState.h"
-#include "MipsRegisterInfo.h"
+#include "MipsInstrInfo.h"
 #include "MipsISelLowering.h"
 #include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
 #include "MipsSubtarget.h"
 #include "MipsTargetMachine.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
@@ -43,6 +45,7 @@ class MipsFastISel final : public FastISel {
     void setKind(BaseKind K) { Kind = K; }
     BaseKind getKind() const { return Kind; }
     bool isRegBase() const { return Kind == RegBase; }
+    bool isFIBase() const { return Kind == FrameIndexBase; }
     void setReg(unsigned Reg) {
       assert(isRegBase() && "Invalid base register access!");
       Base.Reg = Reg;
@@ -51,6 +54,15 @@ class MipsFastISel final : public FastISel {
       assert(isRegBase() && "Invalid base register access!");
       return Base.Reg;
     }
+    void setFI(unsigned FI) {
+      assert(isFIBase() && "Invalid base frame index access!");
+      Base.FI = FI;
+    }
+    unsigned getFI() const {
+      assert(isFIBase() && "Invalid base frame index access!");
+      return Base.FI;
+    }
+
     void setOffset(int64_t Offset_) { Offset = Offset_; }
     int64_t getOffset() const { return Offset; }
     void setGlobalValue(const GlobalValue *G) { GV = G; }
@@ -59,11 +71,10 @@ class MipsFastISel final : public FastISel {
 
   /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
   /// make the right decision when generating code for different targets.
-  Module &M;
   const TargetMachine &TM;
+  const MipsSubtarget *Subtarget;
   const TargetInstrInfo &TII;
   const TargetLowering &TLI;
-  const MipsSubtarget *Subtarget;
   MipsFunctionInfo *MFI;
 
   // Convenience variables to avoid some queries.
@@ -94,6 +105,7 @@ private:
   bool isLoadTypeLegal(Type *Ty, MVT &VT);
   bool computeAddress(const Value *Obj, Address &Addr);
   bool computeCallAddress(const Value *V, Address &Addr);
+  void simplifyAddress(Address &Addr);
 
   // Emit helper routines.
   bool emitCmp(unsigned DestReg, const CmpInst *CI);
@@ -157,17 +169,15 @@ public:
   // Backend specific FastISel code.
   explicit MipsFastISel(FunctionLoweringInfo &funcInfo,
                         const TargetLibraryInfo *libInfo)
-      : FastISel(funcInfo, libInfo),
-        M(const_cast<Module &>(*funcInfo.Fn->getParent())),
-        TM(funcInfo.MF->getTarget()),
-        TII(*TM.getSubtargetImpl()->getInstrInfo()),
-        TLI(*TM.getSubtargetImpl()->getTargetLowering()),
-        Subtarget(&TM.getSubtarget<MipsSubtarget>()) {
+      : FastISel(funcInfo, libInfo), TM(funcInfo.MF->getTarget()),
+        Subtarget(&funcInfo.MF->getSubtarget<MipsSubtarget>()),
+        TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()) {
     MFI = funcInfo.MF->getInfo<MipsFunctionInfo>();
     Context = &funcInfo.Fn->getContext();
-    TargetSupported = ((Subtarget->getRelocationModel() == Reloc::PIC_) &&
-                       ((Subtarget->hasMips32r2() || Subtarget->hasMips32()) &&
-                        (Subtarget->isABI_O32())));
+    TargetSupported =
+        ((TM.getRelocationModel() == Reloc::PIC_) &&
+         ((Subtarget->hasMips32r2() || Subtarget->hasMips32()) &&
+          (static_cast<const MipsTargetMachine &>(TM).getABI().IsO32())));
     UnsupportedFPMode = Subtarget->isFP64bit();
   }
 
@@ -188,9 +198,9 @@ static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT,
   llvm_unreachable("should not be called");
 }
 
-bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT,
-                     CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
-                     CCState &State) {
+static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT,
+                            CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
   llvm_unreachable("should not be called");
 }
 
@@ -306,14 +316,82 @@ unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) {
 }
 
 bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
-  // This construct looks a big awkward but it is how other ports handle this
-  // and as this function is more fully completed, these cases which
-  // return false will have additional code in them.
-  //
-  if (isa<Instruction>(Obj))
-    return false;
-  else if (isa<ConstantExpr>(Obj))
+
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (isa<ConstantExpr>(Obj))
     return false;
+  switch (Opcode) {
+  default:
+    break;
+  case Instruction::BitCast: {
+    // Look through bitcasts.
+    return computeAddress(U->getOperand(0), Addr);
+  }
+  case Instruction::GetElementPtr: {
+    Address SavedAddr = Addr;
+    uint64_t TmpOffset = Addr.getOffset();
+    // Iterate through the GEP folding the constants into offsets where
+    // we can.
+    gep_type_iterator GTI = gep_type_begin(U);
+    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e;
+         ++i, ++GTI) {
+      const Value *Op = *i;
+      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+        const StructLayout *SL = DL.getStructLayout(STy);
+        unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+        TmpOffset += SL->getElementOffset(Idx);
+      } else {
+        uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+        for (;;) {
+          if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+            // Constant-offset addressing.
+            TmpOffset += CI->getSExtValue() * S;
+            break;
+          }
+          if (canFoldAddIntoGEP(U, Op)) {
+            // A compatible add with a constant operand. Fold the constant.
+            ConstantInt *CI =
+                cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+            TmpOffset += CI->getSExtValue() * S;
+            // Iterate on the other operand.
+            Op = cast<AddOperator>(Op)->getOperand(0);
+            continue;
+          }
+          // Unsupported
+          goto unsupported_gep;
+        }
+      }
+    }
+    // Try to grab the base operand now.
+    Addr.setOffset(TmpOffset);
+    if (computeAddress(U->getOperand(0), Addr))
+      return true;
+    // We failed, restore everything and try the other options.
+    Addr = SavedAddr;
+  unsupported_gep:
+    break;
+  }
+  case Instruction::Alloca: {
+    const AllocaInst *AI = cast<AllocaInst>(Obj);
+    DenseMap<const AllocaInst *, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+    if (SI != FuncInfo.StaticAllocaMap.end()) {
+      Addr.setKind(Address::FrameIndexBase);
+      Addr.setFI(SI->second);
+      return true;
+    }
+    break;
+  }
+  }
   Addr.setReg(getRegForValue(Obj));
   return Addr.getReg() != 0;
 }
@@ -519,8 +597,26 @@ bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
   default:
     return false;
   }
-  emitInstLoad(Opc, ResultReg, Addr.getReg(), Addr.getOffset());
-  return true;
+  if (Addr.isRegBase()) {
+    simplifyAddress(Addr);
+    emitInstLoad(Opc, ResultReg, Addr.getReg(), Addr.getOffset());
+    return true;
+  }
+  if (Addr.isFIBase()) {
+    unsigned FI = Addr.getFI();
+    unsigned Align = 4;
+    unsigned Offset = Addr.getOffset();
+    MachineFrameInfo &MFI = *MF->getFrameInfo();
+    MachineMemOperand *MMO = MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad,
+        MFI.getObjectSize(FI), Align);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+        .addFrameIndex(FI)
+        .addImm(Offset)
+        .addMemOperand(MMO);
+    return true;
+  }
+  return false;
 }
 
 bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr,
@@ -552,8 +648,27 @@ bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr,
   default:
     return false;
   }
-  emitInstStore(Opc, SrcReg, Addr.getReg(), Addr.getOffset());
-  return true;
+  if (Addr.isRegBase()) {
+    simplifyAddress(Addr);
+    emitInstStore(Opc, SrcReg, Addr.getReg(), Addr.getOffset());
+    return true;
+  }
+  if (Addr.isFIBase()) {
+    unsigned FI = Addr.getFI();
+    unsigned Align = 4;
+    unsigned Offset = Addr.getOffset();
+    MachineFrameInfo &MFI = *MF->getFrameInfo();
+    MachineMemOperand *MMO = MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad,
+        MFI.getObjectSize(FI), Align);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+        .addReg(SrcReg)
+        .addFrameIndex(FI)
+        .addImm(Offset)
+        .addMemOperand(MMO);
+    return true;
+  }
+  return false;
 }
 
 bool MipsFastISel::selectLoad(const Instruction *I) {
@@ -972,28 +1087,93 @@ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
 
   CLI.Call = MIB;
 
-  // Add implicit physical register uses to the call.
-  for (auto Reg : CLI.OutRegs)
-    MIB.addReg(Reg, RegState::Implicit);
-
-  // Add a register mask with the call-preserved registers.  Proper
-  // defs for return values will be added by setPhysRegsDeadExcept().
-  MIB.addRegMask(TRI.getCallPreservedMask(CC));
-
-  CLI.Call = MIB;
   // Finish off the call including any return values.
   return finishCall(CLI, RetVT, NumBytes);
 }
 
 bool MipsFastISel::selectRet(const Instruction *I) {
+  const Function &F = *I->getParent()->getParent();
   const ReturnInst *Ret = cast<ReturnInst>(I);
 
   if (!FuncInfo.CanLowerReturn)
     return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
   if (Ret->getNumOperands() > 0) {
-    return false;
+    CallingConv::ID CC = F.getCallingConv();
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    MipsCCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs,
+                       I->getContext());
+    CCAssignFn *RetCC = RetCC_Mips;
+    CCInfo.AnalyzeReturn(Outs, RetCC);
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+    const Value *RV = Ret->getOperand(0);
+
+    // Don't bother handling odd stuff for now.
+    if ((VA.getLocInfo() != CCValAssign::Full) &&
+        (VA.getLocInfo() != CCValAssign::BCvt))
+      return false;
+
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    unsigned DestReg = VA.getLocReg();
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!MRI.getRegClass(SrcReg)->contains(DestReg))
+      return false;
+
+    EVT RVEVT = TLI.getValueType(RV->getType());
+    if (!RVEVT.isSimple())
+      return false;
+
+    if (RVEVT.isVector())
+      return false;
+
+    MVT RVVT = RVEVT.getSimpleVT();
+    if (RVVT == MVT::f128)
+      return false;
+
+    MVT DestVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (RVVT != DestVT) {
+      if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
+        return false;
+
+      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+        return false;
+
+      bool IsZExt = Outs[0].Flags.isZExt();
+      SrcReg = emitIntExt(RVVT, SrcReg, DestVT, IsZExt);
+      if (SrcReg == 0)
+        return false;
+    }
+
+    // Make the copy.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg);
+
+    // Add register to return instruction.
+    RetRegs.push_back(VA.getLocReg());
   }
-  emitInst(Mips::RetRA);
+  MachineInstrBuilder MIB = emitInst(Mips::RetRA);
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
   return true;
 }
 
@@ -1118,7 +1298,8 @@ bool MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
 unsigned MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
                                   bool isZExt) {
   unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
-  return emitIntExt(SrcVT, SrcReg, DestVT, DestReg, isZExt);
+  bool Success = emitIntExt(SrcVT, SrcReg, DestVT, DestReg, isZExt);
+  return Success ? DestReg : 0;
 }
 
 bool MipsFastISel::fastSelectInstruction(const Instruction *I) {
@@ -1170,6 +1351,17 @@ unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V,
   return VReg;
 }
 
+void MipsFastISel::simplifyAddress(Address &Addr) {
+  if (!isInt<16>(Addr.getOffset())) {
+    unsigned TempReg =
+      materialize32BitInt(Addr.getOffset(), &Mips::GPR32RegClass);
+    unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::ADDu, DestReg).addReg(TempReg).addReg(Addr.getReg());
+    Addr.setReg(DestReg);
+    Addr.setOffset(0);
+  }
+}
+
 namespace llvm {
 FastISel *Mips::createFastISel(FunctionLoweringInfo &funcInfo,
                                const TargetLibraryInfo *libInfo) {
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index 3014a0d..8b8b019 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -100,7 +100,7 @@ bool MipsFrameLowering::hasFP(const MachineFunction &MF) const {
 
 uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
 
   int64_t Offset = 0;
 
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 0bdabf3..21fc8ce 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -47,7 +47,7 @@ using namespace llvm;
 //===----------------------------------------------------------------------===//
 
 bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
-  Subtarget = &TM.getSubtarget<MipsSubtarget>();
+  Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
   bool Ret = SelectionDAGISel::runOnMachineFunction(MF);
 
   processFunctionAfterISel(MF);
@@ -95,6 +95,12 @@ bool MipsDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
   return false;
 }
 
+bool MipsDAGToDAGISel::selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
 bool MipsDAGToDAGISel::selectIntAddrMSA(SDValue Addr, SDValue &Base,
                                         SDValue &Offset) const {
   llvm_unreachable("Unimplemented function.");
@@ -230,12 +236,3 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
   OutOps.push_back(Op);
   return false;
 }
-
-/// createMipsISelDag - This pass converts a legalized DAG into a
-/// MIPS-specific DAG, ready for instruction scheduling.
-FunctionPass *llvm::createMipsISelDag(MipsTargetMachine &TM) {
-  if (TM.getSubtargetImpl()->inMips16Mode())
-    return llvm::createMips16ISelDag(TM);
-
-  return llvm::createMipsSEISelDag(TM);
-}
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index ff8760d..6b72877 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -73,6 +73,9 @@ private:
   virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
                                SDValue &Offset) const;
 
+  virtual bool selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+                                   SDValue &Offset) const;
+
   /// Match addr+simm10 and addr
   virtual bool selectIntAddrMSA(SDValue Addr, SDValue &Base,
                                 SDValue &Offset) const;
@@ -125,11 +128,6 @@ private:
                                     char ConstraintCode,
                                     std::vector<SDValue> &OutOps) override;
 };
-
-/// createMipsISelDag - This pass converts a legalized DAG into a
-/// MIPS-specific DAG, ready for instruction scheduling.
-FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
-
 }
 
 #endif
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index ff2bfb3..9253b2e 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -70,7 +70,7 @@ static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
   if (!isShiftedMask_64(I))
     return false;
 
-  Size = CountPopulation_64(I);
+  Size = countPopulation(I);
   Pos = countTrailingZeros(I);
   return true;
 }
@@ -203,7 +203,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
 
 MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
                                        const MipsSubtarget &STI)
-    : TargetLowering(TM), Subtarget(STI) {
+    : TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) {
   // Mips does not have i1 type, so use i32 for
   // setcc operations results (slt, sgt, ...).
   setBooleanContents(ZeroOrOneBooleanContent);
@@ -215,12 +215,15 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
                        ZeroOrNegativeOneBooleanContent);
 
   // Load extented operations for i1 types must be promoted
-  setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1,  Promote);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD,  VT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1,  Promote);
+  }
 
   // MIPS doesn't have extending float->double load/store
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  for (MVT VT : MVT::fp_valuetypes())
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 
   // Used by legalize types to correctly generate the setcc result.
@@ -258,6 +261,9 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::LOAD,               MVT::i64,   Custom);
     setOperationAction(ISD::STORE,              MVT::i64,   Custom);
     setOperationAction(ISD::FP_TO_SINT,         MVT::i64,   Custom);
+    setOperationAction(ISD::SHL_PARTS,          MVT::i64,   Custom);
+    setOperationAction(ISD::SRA_PARTS,          MVT::i64,   Custom);
+    setOperationAction(ISD::SRL_PARTS,          MVT::i64,   Custom);
   }
 
   if (!Subtarget.isGP64bit()) {
@@ -368,9 +374,9 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::BSWAP, MVT::i64, Expand);
 
   if (Subtarget.isGP64bit()) {
-    setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom);
-    setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom);
-    setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Custom);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i32, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i32, Custom);
     setTruncStoreAction(MVT::i64, MVT::i32, Custom);
   }
 
@@ -387,14 +393,12 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
 
   // The arguments on the stack are defined in terms of 4-byte slots on O32
   // and 8-byte slots on N32/N64.
-  setMinStackArgumentAlignment(
-      (Subtarget.isABI_N32() || Subtarget.isABI_N64()) ? 8 : 4);
+  setMinStackArgumentAlignment((ABI.IsN32() || ABI.IsN64()) ? 8 : 4);
 
-  setStackPointerRegisterToSaveRestore(Subtarget.isABI_N64() ? Mips::SP_64
-                                                             : Mips::SP);
+  setStackPointerRegisterToSaveRestore(ABI.IsN64() ? Mips::SP_64 : Mips::SP);
 
-  setExceptionPointerRegister(Subtarget.isABI_N64() ? Mips::A0_64 : Mips::A0);
-  setExceptionSelectorRegister(Subtarget.isABI_N64() ? Mips::A1_64 : Mips::A1);
+  setExceptionPointerRegister(ABI.IsN64() ? Mips::A0_64 : Mips::A0);
+  setExceptionSelectorRegister(ABI.IsN64() ? Mips::A1_64 : Mips::A1);
 
   MaxStoresPerMemcpy = 16;
 
@@ -933,18 +937,35 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case Mips::DIVU:
   case Mips::MOD:
   case Mips::MODU:
-    return insertDivByZeroTrap(
-        MI, *BB, *getTargetMachine().getSubtargetImpl()->getInstrInfo(), false);
+    return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), false);
   case Mips::PseudoDSDIV:
   case Mips::PseudoDUDIV:
   case Mips::DDIV:
   case Mips::DDIVU:
   case Mips::DMOD:
   case Mips::DMODU:
-    return insertDivByZeroTrap(
-        MI, *BB, *getTargetMachine().getSubtargetImpl()->getInstrInfo(), true);
+    return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), true);
   case Mips::SEL_D:
     return emitSEL_D(MI, BB);
+
+  case Mips::PseudoSELECT_I:
+  case Mips::PseudoSELECT_I64:
+  case Mips::PseudoSELECT_S:
+  case Mips::PseudoSELECT_D32:
+  case Mips::PseudoSELECT_D64:
+    return emitPseudoSELECT(MI, BB, false, Mips::BNE);
+  case Mips::PseudoSELECTFP_F_I:
+  case Mips::PseudoSELECTFP_F_I64:
+  case Mips::PseudoSELECTFP_F_S:
+  case Mips::PseudoSELECTFP_F_D32:
+  case Mips::PseudoSELECTFP_F_D64:
+    return emitPseudoSELECT(MI, BB, true, Mips::BC1F);
+  case Mips::PseudoSELECTFP_T_I:
+  case Mips::PseudoSELECTFP_T_I64:
+  case Mips::PseudoSELECTFP_T_S:
+  case Mips::PseudoSELECTFP_T_D32:
+  case Mips::PseudoSELECTFP_T_D64:
+    return emitPseudoSELECT(MI, BB, true, Mips::BC1T);
   }
 }
 
@@ -959,8 +980,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned LL, SC, AND, NOR, ZERO, BEQ;
 
@@ -1043,8 +1063,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
 MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
     MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg,
     unsigned SrcReg) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   if (Subtarget.hasMips32r2() && Size == 1) {
@@ -1080,8 +1099,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   unsigned Dest = MI->getOperand(0).getReg();
@@ -1178,7 +1196,8 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
   //   beq     success,$0,loopMBB
 
   BB = loopMBB;
-  BuildMI(BB, DL, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
+  unsigned LL = isMicroMips ? Mips::LL_MM : Mips::LL;
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
   if (Nand) {
     //  and andres, oldval, incr2
     //  nor binopres, $0, andres
@@ -1201,7 +1220,8 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
     .addReg(OldVal).addReg(Mask2);
   BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
     .addReg(MaskedOldVal0).addReg(NewVal);
-  BuildMI(BB, DL, TII->get(Mips::SC), Success)
+  unsigned SC = isMicroMips ? Mips::SC_MM : Mips::SC;
+  BuildMI(BB, DL, TII->get(SC), Success)
     .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
   BuildMI(BB, DL, TII->get(Mips::BEQ))
     .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
@@ -1231,8 +1251,7 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned LL, SC, ZERO, BNE, BEQ;
 
@@ -1314,8 +1333,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   unsigned Dest    = MI->getOperand(0).getReg();
@@ -1412,7 +1430,8 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
   //    and     maskedoldval0,oldval,mask
   //    bne     maskedoldval0,shiftedcmpval,sinkMBB
   BB = loop1MBB;
-  BuildMI(BB, DL, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
+  unsigned LL = isMicroMips ? Mips::LL_MM : Mips::LL;
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
   BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
     .addReg(OldVal).addReg(Mask);
   BuildMI(BB, DL, TII->get(Mips::BNE))
@@ -1428,7 +1447,8 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
     .addReg(OldVal).addReg(Mask2);
   BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
     .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
-  BuildMI(BB, DL, TII->get(Mips::SC), Success)
+  unsigned SC = isMicroMips ? Mips::SC_MM : Mips::SC;
+  BuildMI(BB, DL, TII->get(SC), Success)
       .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
   BuildMI(BB, DL, TII->get(Mips::BEQ))
       .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
@@ -1450,10 +1470,8 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
 MachineBasicBlock *MipsTargetLowering::emitSEL_D(MachineInstr *MI,
                                                  MachineBasicBlock *BB) const {
   MachineFunction *MF = BB->getParent();
-  const TargetRegisterInfo *TRI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   MachineBasicBlock::iterator II(MI);
@@ -1497,8 +1515,7 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
                         false, 0);
   Chain = Addr.getValue(1);
 
-  if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) ||
-      Subtarget.isABI_N64()) {
+  if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || ABI.IsN64()) {
     // For PIC, the sequence is:
     // BRIND(load(Jumptable + index) + RelocBase)
     // RelocBase can be JumpTable, GOT or some sort of global base.
@@ -1580,32 +1597,29 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
   GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = N->getGlobal();
 
-  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
-      !Subtarget.isABI_N64()) {
-    const MipsTargetObjectFile &TLOF =
-      (const MipsTargetObjectFile&)getObjFileLowering();
-
-    if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine()))
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !ABI.IsN64()) {
+    const MipsTargetObjectFile *TLOF =
+        static_cast<const MipsTargetObjectFile *>(
+            getTargetMachine().getObjFileLowering());
+    if (TLOF->IsGlobalInSmallSection(GV, getTargetMachine()))
       // %gp_rel relocation
-      return getAddrGPRel(N, Ty, DAG);
+      return getAddrGPRel(N, SDLoc(N), Ty, DAG);
 
     // %hi/%lo relocation
-    return getAddrNonPIC(N, Ty, DAG);
+    return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
   }
 
   if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))
-    return getAddrLocal(N, Ty, DAG,
-                        Subtarget.isABI_N32() || Subtarget.isABI_N64());
+    return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
 
   if (LargeGOT)
-    return getAddrGlobalLargeGOT(N, Ty, DAG, MipsII::MO_GOT_HI16,
+    return getAddrGlobalLargeGOT(N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16,
                                  MipsII::MO_GOT_LO16, DAG.getEntryNode(),
                                  MachinePointerInfo::getGOT());
 
-  return getAddrGlobal(N, Ty, DAG,
-                       (Subtarget.isABI_N32() || Subtarget.isABI_N64())
-                           ? MipsII::MO_GOT_DISP
-                           : MipsII::MO_GOT16,
+  return getAddrGlobal(N, SDLoc(N), Ty, DAG,
+                       (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP
+                                                    : MipsII::MO_GOT16,
                        DAG.getEntryNode(), MachinePointerInfo::getGOT());
 }
 
@@ -1614,12 +1628,10 @@ SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
   BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
   EVT Ty = Op.getValueType();
 
-  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
-      !Subtarget.isABI_N64())
-    return getAddrNonPIC(N, Ty, DAG);
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !ABI.IsN64())
+    return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
 
-  return getAddrLocal(N, Ty, DAG,
-                      Subtarget.isABI_N32() || Subtarget.isABI_N64());
+  return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
 }
 
 SDValue MipsTargetLowering::
@@ -1707,12 +1719,10 @@ lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
   JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
   EVT Ty = Op.getValueType();
 
-  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
-      !Subtarget.isABI_N64())
-    return getAddrNonPIC(N, Ty, DAG);
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !ABI.IsN64())
+    return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
 
-  return getAddrLocal(N, Ty, DAG,
-                      Subtarget.isABI_N32() || Subtarget.isABI_N64());
+  return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
 }
 
 SDValue MipsTargetLowering::
@@ -1721,20 +1731,19 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
   ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
   EVT Ty = Op.getValueType();
 
-  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
-      !Subtarget.isABI_N64()) {
-    const MipsTargetObjectFile &TLOF =
-      (const MipsTargetObjectFile&)getObjFileLowering();
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !ABI.IsN64()) {
+    const MipsTargetObjectFile *TLOF =
+        static_cast<const MipsTargetObjectFile *>(
+            getTargetMachine().getObjFileLowering());
 
-    if (TLOF.IsConstantInSmallSection(N->getConstVal(), getTargetMachine()))
+    if (TLOF->IsConstantInSmallSection(N->getConstVal(), getTargetMachine()))
       // %gp_rel relocation
-      return getAddrGPRel(N, Ty, DAG);
+      return getAddrGPRel(N, SDLoc(N), Ty, DAG);
 
-    return getAddrNonPIC(N, Ty, DAG);
+    return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
   }
 
-  return getAddrLocal(N, Ty, DAG,
-                      Subtarget.isABI_N32() || Subtarget.isABI_N64());
+  return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
 }
 
 SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -1760,8 +1769,7 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   unsigned Align = Node->getConstantOperandVal(3);
   const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
   SDLoc DL(Node);
-  unsigned ArgSlotSizeInBytes =
-      (Subtarget.isABI_N32() || Subtarget.isABI_N64()) ? 8 : 4;
+  unsigned ArgSlotSizeInBytes = (ABI.IsN32() || ABI.IsN64()) ? 8 : 4;
 
   SDValue VAListLoad = DAG.getLoad(getPointerTy(), DL, Chain, VAListPtr,
                                    MachinePointerInfo(SV), false, false, false,
@@ -1924,9 +1932,8 @@ lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   MFI->setFrameAddressIsTaken(true);
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
-  SDValue FrameAddr =
-      DAG.getCopyFromReg(DAG.getEntryNode(), DL,
-                         Subtarget.isABI_N64() ? Mips::FP_64 : Mips::FP, VT);
+  SDValue FrameAddr = DAG.getCopyFromReg(
+      DAG.getEntryNode(), DL, ABI.IsN64() ? Mips::FP_64 : Mips::FP, VT);
   return FrameAddr;
 }
 
@@ -1942,7 +1949,7 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MVT VT = Op.getSimpleValueType();
-  unsigned RA = Subtarget.isABI_N64() ? Mips::RA_64 : Mips::RA;
+  unsigned RA = ABI.IsN64() ? Mips::RA_64 : Mips::RA;
   MFI->setReturnAddressIsTaken(true);
 
   // Return RA, which contains the return address. Mark it an implicit live-in.
@@ -1964,12 +1971,12 @@ SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
   SDValue Offset    = Op.getOperand(1);
   SDValue Handler   = Op.getOperand(2);
   SDLoc DL(Op);
-  EVT Ty = Subtarget.isABI_N64() ? MVT::i64 : MVT::i32;
+  EVT Ty = ABI.IsN64() ? MVT::i64 : MVT::i32;
 
   // Store stack offset in V1, store jump target in V0. Glue CopyToReg and
   // EH_RETURN nodes, so that instructions are emitted back-to-back.
-  unsigned OffsetReg = Subtarget.isABI_N64() ? Mips::V1_64 : Mips::V1;
-  unsigned AddrReg = Subtarget.isABI_N64() ? Mips::V0_64 : Mips::V0;
+  unsigned OffsetReg = ABI.IsN64() ? Mips::V1_64 : Mips::V1;
+  unsigned AddrReg = ABI.IsN64() ? Mips::V0_64 : Mips::V0;
   Chain = DAG.getCopyToReg(Chain, DL, OffsetReg, Offset, SDValue());
   Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1));
   return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain,
@@ -1991,10 +1998,11 @@ SDValue MipsTargetLowering::lowerATOMIC_FENCE(SDValue Op,
 SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
   SDLoc DL(Op);
+  MVT VT = Subtarget.isGP64bit() ? MVT::i64 : MVT::i32;
+
   SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
   SDValue Shamt = Op.getOperand(2);
-
-  // if shamt < 32:
+  // if shamt < (VT.bits):
   //  lo = (shl lo, shamt)
   //  hi = (or (shl hi, shamt) (srl (srl lo, 1), ~shamt))
   // else:
@@ -2002,18 +2010,17 @@ SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
   //  hi = (shl lo, shamt[4:0])
   SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
                             DAG.getConstant(-1, MVT::i32));
-  SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo,
-                                      DAG.getConstant(1, MVT::i32));
-  SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, ShiftRight1Lo,
-                                     Not);
-  SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, Shamt);
-  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo);
-  SDValue ShiftLeftLo = DAG.getNode(ISD::SHL, DL, MVT::i32, Lo, Shamt);
+  SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo,
+                                      DAG.getConstant(1, VT));
+  SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, Not);
+  SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt);
+  SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
+  SDValue ShiftLeftLo = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt);
   SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
                              DAG.getConstant(0x20, MVT::i32));
-  Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond,
-                   DAG.getConstant(0, MVT::i32), ShiftLeftLo);
-  Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftLeftLo, Or);
+  Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+                   DAG.getConstant(0, VT), ShiftLeftLo);
+  Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond, ShiftLeftLo, Or);
 
   SDValue Ops[2] = {Lo, Hi};
   return DAG.getMergeValues(Ops, DL);
@@ -2024,8 +2031,9 @@ SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
   SDLoc DL(Op);
   SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
   SDValue Shamt = Op.getOperand(2);
+  MVT VT = Subtarget.isGP64bit() ? MVT::i64 : MVT::i32;
 
-  // if shamt < 32:
+  // if shamt < (VT.bits):
   //  lo = (or (shl (shl hi, 1), ~shamt) (srl lo, shamt))
   //  if isSRA:
   //    hi = (sra hi, shamt)
@@ -2040,21 +2048,19 @@ SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
   //   hi = 0
   SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
                             DAG.getConstant(-1, MVT::i32));
-  SDValue ShiftLeft1Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
-                                     DAG.getConstant(1, MVT::i32));
-  SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, ShiftLeft1Hi, Not);
-  SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo, Shamt);
-  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo);
-  SDValue ShiftRightHi = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, DL, MVT::i32,
-                                     Hi, Shamt);
+  SDValue ShiftLeft1Hi = DAG.getNode(ISD::SHL, DL, VT, Hi,
+                                     DAG.getConstant(1, VT));
+  SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, ShiftLeft1Hi, Not);
+  SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt);
+  SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
+  SDValue ShiftRightHi = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL,
+                                     DL, VT, Hi, Shamt);
   SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
                              DAG.getConstant(0x20, MVT::i32));
-  SDValue Shift31 = DAG.getNode(ISD::SRA, DL, MVT::i32, Hi,
-                                DAG.getConstant(31, MVT::i32));
-  Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftRightHi, Or);
-  Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond,
-                   IsSRA ? Shift31 : DAG.getConstant(0, MVT::i32),
-                   ShiftRightHi);
+  SDValue Shift31 = DAG.getNode(ISD::SRA, DL, VT, Hi, DAG.getConstant(31, VT));
+  Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond, ShiftRightHi, Or);
+  Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+                   IsSRA ? Shift31 : DAG.getConstant(0, VT), ShiftRightHi);
 
   SDValue Ops[2] = {Lo, Hi};
   return DAG.getMergeValues(Ops, DL);
@@ -2266,9 +2272,9 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
 
 static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
                        CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
-                       CCState &State, const MCPhysReg *F64Regs) {
-
-  static const unsigned IntRegsSize = 4, FloatRegsSize = 2;
+                       CCState &State, ArrayRef<MCPhysReg> F64Regs) {
+  const MipsSubtarget &Subtarget = static_cast<const MipsSubtarget &>(
+      State.getMachineFunction().getSubtarget());
 
   static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
   static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 };
@@ -2278,6 +2284,19 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
     return true;
 
   // Promote i8 and i16
+  if (ArgFlags.isInReg() && !Subtarget.isLittle()) {
+    if (LocVT == MVT::i8 || LocVT == MVT::i16 || LocVT == MVT::i32) {
+      LocVT = MVT::i32;
+      if (ArgFlags.isSExt())
+        LocInfo = CCValAssign::SExtUpper;
+      else if (ArgFlags.isZExt())
+        LocInfo = CCValAssign::ZExtUpper;
+      else
+        LocInfo = CCValAssign::AExtUpper;
+    }
+  }
+
+  // Promote i8 and i16
   if (LocVT == MVT::i8 || LocVT == MVT::i16) {
     LocVT = MVT::i32;
     if (ArgFlags.isSExt())
@@ -2293,39 +2312,39 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
   // f32 and f64 are allocated in A0, A1, A2, A3 when either of the following
   // is true: function is vararg, argument is 3rd or higher, there is previous
   // argument which is not f32 or f64.
-  bool AllocateFloatsInIntReg = State.isVarArg() || ValNo > 1
-      || State.getFirstUnallocated(F32Regs, FloatRegsSize) != ValNo;
+  bool AllocateFloatsInIntReg = State.isVarArg() || ValNo > 1 ||
+                                State.getFirstUnallocated(F32Regs) != ValNo;
   unsigned OrigAlign = ArgFlags.getOrigAlign();
   bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8);
 
   if (ValVT == MVT::i32 || (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
-    Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    Reg = State.AllocateReg(IntRegs);
     // If this is the first part of an i64 arg,
     // the allocated register must be either A0 or A2.
     if (isI64 && (Reg == Mips::A1 || Reg == Mips::A3))
-      Reg = State.AllocateReg(IntRegs, IntRegsSize);
+      Reg = State.AllocateReg(IntRegs);
     LocVT = MVT::i32;
   } else if (ValVT == MVT::f64 && AllocateFloatsInIntReg) {
     // Allocate int register and shadow next int register. If first
     // available register is Mips::A1 or Mips::A3, shadow it too.
-    Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    Reg = State.AllocateReg(IntRegs);
     if (Reg == Mips::A1 || Reg == Mips::A3)
-      Reg = State.AllocateReg(IntRegs, IntRegsSize);
-    State.AllocateReg(IntRegs, IntRegsSize);
+      Reg = State.AllocateReg(IntRegs);
+    State.AllocateReg(IntRegs);
     LocVT = MVT::i32;
   } else if (ValVT.isFloatingPoint() && !AllocateFloatsInIntReg) {
     // we are guaranteed to find an available float register
     if (ValVT == MVT::f32) {
-      Reg = State.AllocateReg(F32Regs, FloatRegsSize);
+      Reg = State.AllocateReg(F32Regs);
       // Shadow int register
-      State.AllocateReg(IntRegs, IntRegsSize);
+      State.AllocateReg(IntRegs);
     } else {
-      Reg = State.AllocateReg(F64Regs, FloatRegsSize);
+      Reg = State.AllocateReg(F64Regs);
       // Shadow int registers
-      unsigned Reg2 = State.AllocateReg(IntRegs, IntRegsSize);
+      unsigned Reg2 = State.AllocateReg(IntRegs);
       if (Reg2 == Mips::A1 || Reg2 == Mips::A3)
-        State.AllocateReg(IntRegs, IntRegsSize);
-      State.AllocateReg(IntRegs, IntRegsSize);
+        State.AllocateReg(IntRegs);
+      State.AllocateReg(IntRegs);
     }
   } else
     llvm_unreachable("Cannot handle this ValVT.");
@@ -2407,8 +2426,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
   // used for the function (that is, Mips linker doesn't generate lazy binding
   // stub for a function whose address is taken in the program).
   if (IsPICCall && !InternalLinkage && IsCallReloc) {
-    unsigned GPReg = Subtarget.isABI_N64() ? Mips::GP_64 : Mips::GP;
-    EVT Ty = Subtarget.isABI_N64() ? MVT::i64 : MVT::i32;
+    unsigned GPReg = ABI.IsN64() ? Mips::GP_64 : Mips::GP;
+    EVT Ty = ABI.IsN64() ? MVT::i64 : MVT::i32;
     RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CLI.DAG, Ty)));
   }
 
@@ -2431,8 +2450,7 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
                                       RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CLI.CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   if (Subtarget.inMips16HardFloat()) {
@@ -2468,7 +2486,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
+  const TargetFrameLowering *TFL = Subtarget.getFrameLowering();
   MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
   bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
 
@@ -2480,7 +2498,6 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Allocate the reserved argument area. It seems strange to do this from the
   // caller side but removing it breaks the frame size calculation.
-  const MipsABIInfo &ABI = Subtarget.getABI();
   CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
 
   CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), Callee.getNode());
@@ -2511,8 +2528,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
 
   SDValue StackPtr = DAG.getCopyFromReg(
-      Chain, DL, Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP,
-      getPointerTy());
+      Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP, getPointerTy());
 
   // With EABI is it possible to have 16 args on registers.
   std::deque< std::pair<unsigned, SDValue> > RegsToPass;
@@ -2626,9 +2642,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   // node so that legalize doesn't hack it.
-  bool IsPICCall =
-      (Subtarget.isABI_N64() || IsPIC); // true if calls are translated to
-                                         // jalr $25
+  bool IsPICCall = (ABI.IsN64() || IsPIC); // true if calls are translated to
+                                           // jalr $25
   bool GlobalOrExternal = false, InternalLinkage = false, IsCallReloc = false;
   SDValue CalleeLo;
   EVT Ty = Callee.getValueType();
@@ -2639,15 +2654,14 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       InternalLinkage = Val->hasInternalLinkage();
 
       if (InternalLinkage)
-        Callee = getAddrLocal(G, Ty, DAG,
-                              Subtarget.isABI_N32() || Subtarget.isABI_N64());
+        Callee = getAddrLocal(G, DL, Ty, DAG, ABI.IsN32() || ABI.IsN64());
       else if (LargeGOT) {
-        Callee = getAddrGlobalLargeGOT(G, Ty, DAG, MipsII::MO_CALL_HI16,
+        Callee = getAddrGlobalLargeGOT(G, DL, Ty, DAG, MipsII::MO_CALL_HI16,
                                        MipsII::MO_CALL_LO16, Chain,
                                        FuncInfo->callPtrInfo(Val));
         IsCallReloc = true;
       } else {
-        Callee = getAddrGlobal(G, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+        Callee = getAddrGlobal(G, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
                                FuncInfo->callPtrInfo(Val));
         IsCallReloc = true;
       }
@@ -2659,16 +2673,16 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     const char *Sym = S->getSymbol();
 
-    if (!Subtarget.isABI_N64() && !IsPIC) // !N64 && static
-      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(),
-                                            MipsII::MO_NO_FLAG);
+    if (!ABI.IsN64() && !IsPIC) // !N64 && static
+      Callee =
+          DAG.getTargetExternalSymbol(Sym, getPointerTy(), MipsII::MO_NO_FLAG);
     else if (LargeGOT) {
-      Callee = getAddrGlobalLargeGOT(S, Ty, DAG, MipsII::MO_CALL_HI16,
+      Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16,
                                      MipsII::MO_CALL_LO16, Chain,
                                      FuncInfo->callPtrInfo(Sym));
       IsCallReloc = true;
     } else { // N64 || PIC
-      Callee = getAddrGlobal(S, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+      Callee = getAddrGlobal(S, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
                              FuncInfo->callPtrInfo(Sym));
       IsCallReloc = true;
     }
@@ -2844,7 +2858,6 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
   SmallVector<CCValAssign, 16> ArgLocs;
   MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
                      *DAG.getContext());
-  const MipsABIInfo &ABI = Subtarget.getABI();
   CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
   Function::const_arg_iterator FuncArg =
     DAG.getMachineFunction().getFunction()->arg_begin();
@@ -2858,13 +2871,16 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
-    std::advance(FuncArg, Ins[i].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[i].OrigArgIndex;
+    if (Ins[i].isOrigArg()) {
+      std::advance(FuncArg, Ins[i].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[i].getOrigArgIndex();
+    }
     EVT ValVT = VA.getValVT();
     ISD::ArgFlagsTy Flags = Ins[i].Flags;
     bool IsRegLoc = VA.isRegLoc();
 
     if (Flags.isByVal()) {
+      assert(Ins[i].isOrigArg() && "Byval arguments cannot be implicit");
       unsigned FirstByValReg, LastByValReg;
       unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
       CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
@@ -2897,7 +2913,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
           (RegVT == MVT::i64 && ValVT == MVT::f64) ||
           (RegVT == MVT::f64 && ValVT == MVT::i64))
         ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue);
-      else if (Subtarget.isABI_O32() && RegVT == MVT::i32 &&
+      else if (ABI.IsO32() && RegVT == MVT::i32 &&
                ValVT == MVT::f64) {
         unsigned Reg2 = addLiveIn(DAG.getMachineFunction(),
                                   getNextIntArgReg(ArgReg), RC);
@@ -2912,7 +2928,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
     } else { // VA.isRegLoc()
       MVT LocVT = VA.getLocVT();
 
-      if (Subtarget.isABI_O32()) {
+      if (ABI.IsO32()) {
         // We ought to be able to use LocVT directly but O32 sets it to i32
         // when allocating floating point values to integer registers.
         // This shouldn't influence how we load the value into registers unless
@@ -2949,7 +2965,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
       unsigned Reg = MipsFI->getSRetReturnReg();
       if (!Reg) {
         Reg = MF.getRegInfo().createVirtualRegister(
-            getRegClassFor(Subtarget.isABI_N64() ? MVT::i64 : MVT::i32));
+            getRegClassFor(ABI.IsN64() ? MVT::i64 : MVT::i32));
         MipsFI->setSRetReturnReg(Reg);
       }
       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]);
@@ -3066,7 +3082,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
     if (!Reg)
       llvm_unreachable("sret virtual register not created in the entry block");
     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
-    unsigned V0 = Subtarget.isABI_N64() ? Mips::V0_64 : Mips::V0;
+    unsigned V0 = ABI.IsN64() ? Mips::V0_64 : Mips::V0;
 
     Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag);
     Flag = Chain.getValue(1);
@@ -3201,7 +3217,7 @@ parsePhysicalReg(StringRef C, std::string &Prefix,
 std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::
 parseRegForInlineAsmConstraint(StringRef C, MVT VT) const {
   const TargetRegisterInfo *TRI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
+      Subtarget.getRegisterInfo();
   const TargetRegisterClass *RC;
   std::string Prefix;
   unsigned long long Reg;
@@ -3275,9 +3291,10 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const {
 /// Given a register class constraint, like 'r', if this corresponds directly
 /// to an LLVM register class, return a register of 0 and the register class
 /// pointer.
-std::pair<unsigned, const TargetRegisterClass*> MipsTargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
-{
+std::pair<unsigned, const TargetRegisterClass *>
+MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                 const std::string &Constraint,
+                                                 MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
@@ -3333,7 +3350,7 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
   if (R.second)
     return R;
 
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
@@ -3477,7 +3494,7 @@ bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
 }
 
 unsigned MipsTargetLowering::getJumpTableEncoding() const {
-  if (Subtarget.isABI_N64())
+  if (ABI.IsN64())
     return MachineJumpTableInfo::EK_GPRel64BlockAddress;
 
   return TargetLowering::getJumpTableEncoding();
@@ -3495,7 +3512,6 @@ void MipsTargetLowering::copyByValRegs(
   unsigned RegAreaSize = NumRegs * GPRSizeInBytes;
   unsigned FrameObjSize = std::max(Flags.getByValSize(), RegAreaSize);
   int FrameObjOffset;
-  const MipsABIInfo &ABI = Subtarget.getABI();
   ArrayRef<MCPhysReg> ByValArgRegs = ABI.GetByValArgRegs();
 
   if (RegAreaSize)
@@ -3547,7 +3563,7 @@ void MipsTargetLowering::passByValArg(
   unsigned NumRegs = LastReg - FirstReg;
 
   if (NumRegs) {
-    const ArrayRef<MCPhysReg> ArgRegs = Subtarget.getABI().GetByValArgRegs();
+    const ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs();
     bool LeftoverBytes = (NumRegs * RegSizeInBytes > ByValSizeInBytes);
     unsigned I = 0;
 
@@ -3630,8 +3646,8 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
                                          SDValue Chain, SDLoc DL,
                                          SelectionDAG &DAG,
                                          CCState &State) const {
-  const ArrayRef<MCPhysReg> ArgRegs = Subtarget.getABI().GetVarArgRegs();
-  unsigned Idx = State.getFirstUnallocated(ArgRegs.data(), ArgRegs.size());
+  const ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs();
+  unsigned Idx = State.getFirstUnallocated(ArgRegs);
   unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
   MVT RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
   const TargetRegisterClass *RC = getRegClassFor(RegTy);
@@ -3646,7 +3662,6 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
     VaArgOffset =
         RoundUpToAlignment(State.getNextStackOffset(), RegSizeInBytes);
   else {
-    const MipsABIInfo &ABI = Subtarget.getABI();
     VaArgOffset =
         (int)ABI.GetCalleeAllocdArgSizeInBytes(State.getCallingConv()) -
         (int)(RegSizeInBytes * (ArgRegs.size() - Idx));
@@ -3677,8 +3692,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
 
 void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                      unsigned Align) const {
-  MachineFunction &MF = State->getMachineFunction();
-  const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
+  const TargetFrameLowering *TFL = Subtarget.getFrameLowering();
 
   assert(Size && "Byval argument's size shouldn't be 0.");
 
@@ -3689,10 +3703,10 @@ void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
 
   if (State->getCallingConv() != CallingConv::Fast) {
     unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
-    const ArrayRef<MCPhysReg> IntArgRegs = Subtarget.getABI().GetByValArgRegs();
+    const ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs();
     // FIXME: The O32 case actually describes no shadow registers.
     const MCPhysReg *ShadowRegs =
-        Subtarget.isABI_O32() ? IntArgRegs.data() : Mips64DPRegs;
+        ABI.IsO32() ? IntArgRegs.data() : Mips64DPRegs;
 
     // We used to check the size as well but we can't do that anymore since
     // CCState::HandleByVal() rounds up the size after calling this function.
@@ -3700,7 +3714,7 @@ void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
            "Byval argument's alignment should be a multiple of"
            "RegSizeInBytes.");
 
-    FirstReg = State->getFirstUnallocated(IntArgRegs.data(), IntArgRegs.size());
+    FirstReg = State->getFirstUnallocated(IntArgRegs);
 
     // If Align > RegSizeInBytes, the first arg register must be even.
     // FIXME: This condition happens to do the right thing but it's not the
@@ -3720,3 +3734,102 @@ void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
 
   State->addInRegsParamInfo(FirstReg, FirstReg + NumRegs);
 }
+
+MachineBasicBlock *
+MipsTargetLowering::emitPseudoSELECT(MachineInstr *MI, MachineBasicBlock *BB,
+                                     bool isFPCmp, unsigned Opc) const {
+  assert(!(Subtarget.hasMips4() || Subtarget.hasMips32()) &&
+         "Subtarget already supports SELECT nodes with the use of"
+         "conditional-move instructions.");
+
+  const TargetInstrInfo *TII =
+      Subtarget.getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // To "insert" a SELECT instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  if (isFPCmp) {
+    // bc1[tf] cc, sinkMBB
+    BuildMI(BB, DL, TII->get(Opc))
+      .addReg(MI->getOperand(1).getReg())
+      .addMBB(sinkMBB);
+  } else {
+    // bne rs, $0, sinkMBB
+    BuildMI(BB, DL, TII->get(Opc))
+      .addReg(MI->getOperand(1).getReg())
+      .addReg(Mips::ZERO)
+      .addMBB(sinkMBB);
+  }
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL,
+          TII->get(Mips::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB)
+    .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+
+  return BB;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned MipsTargetLowering::getRegisterByName(const char* RegName,
+                                               EVT VT) const {
+  // Named registers is expected to be fairly rare. For now, just support $28
+  // since the linux kernel uses it.
+  if (Subtarget.isGP64bit()) {
+    unsigned Reg = StringSwitch<unsigned>(RegName)
+                         .Case("$28", Mips::GP_64)
+                         .Default(0);
+    if (Reg)
+      return Reg;
+  } else {
+    unsigned Reg = StringSwitch<unsigned>(RegName)
+                         .Case("$28", Mips::GP)
+                         .Default(0);
+    if (Reg)
+      return Reg;
+  }
+  report_fatal_error("Invalid register name global variable");
+}
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 60e53da..9f86a43 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MIPSISELLOWERING_H
 #define LLVM_LIB_TARGET_MIPS_MIPSISELLOWERING_H
 
+#include "MCTargetDesc/MipsABIInfo.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "Mips.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -262,6 +263,8 @@ namespace llvm {
 
     void HandleByVal(CCState *, unsigned &, unsigned) const override;
 
+    unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+
   protected:
     SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
 
@@ -270,9 +273,8 @@ namespace llvm {
     //
     // (add (load (wrapper $gp, %got(sym)), %lo(sym))
     template <class NodeTy>
-    SDValue getAddrLocal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+    SDValue getAddrLocal(NodeTy *N, SDLoc DL, EVT Ty, SelectionDAG &DAG,
                          bool IsN32OrN64) const {
-      SDLoc DL(N);
       unsigned GOTFlag = IsN32OrN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
       SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
                                 getTargetNode(N, Ty, DAG, GOTFlag));
@@ -289,11 +291,10 @@ namespace llvm {
     // computing a global symbol's address:
     //
     // (load (wrapper $gp, %got(sym)))
-    template<class NodeTy>
-    SDValue getAddrGlobal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+    template <class NodeTy>
+    SDValue getAddrGlobal(NodeTy *N, SDLoc DL, EVT Ty, SelectionDAG &DAG,
                           unsigned Flag, SDValue Chain,
                           const MachinePointerInfo &PtrInfo) const {
-      SDLoc DL(N);
       SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
                                 getTargetNode(N, Ty, DAG, Flag));
       return DAG.getLoad(Ty, DL, Chain, Tgt, PtrInfo, false, false, false, 0);
@@ -303,14 +304,13 @@ namespace llvm {
     // computing a global symbol's address in large-GOT mode:
     //
     // (load (wrapper (add %hi(sym), $gp), %lo(sym)))
-    template<class NodeTy>
-    SDValue getAddrGlobalLargeGOT(NodeTy *N, EVT Ty, SelectionDAG &DAG,
-                                  unsigned HiFlag, unsigned LoFlag,
-                                  SDValue Chain,
+    template <class NodeTy>
+    SDValue getAddrGlobalLargeGOT(NodeTy *N, SDLoc DL, EVT Ty,
+                                  SelectionDAG &DAG, unsigned HiFlag,
+                                  unsigned LoFlag, SDValue Chain,
                                   const MachinePointerInfo &PtrInfo) const {
-      SDLoc DL(N);
-      SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty,
-                               getTargetNode(N, Ty, DAG, HiFlag));
+      SDValue Hi =
+          DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(N, Ty, DAG, HiFlag));
       Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
       SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
                                     getTargetNode(N, Ty, DAG, LoFlag));
@@ -322,9 +322,9 @@ namespace llvm {
     // computing a symbol's address in non-PIC mode:
     //
     // (add %hi(sym), %lo(sym))
-    template<class NodeTy>
-    SDValue getAddrNonPIC(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
-      SDLoc DL(N);
+    template <class NodeTy>
+    SDValue getAddrNonPIC(NodeTy *N, SDLoc DL, EVT Ty,
+                          SelectionDAG &DAG) const {
       SDValue Hi = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI);
       SDValue Lo = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO);
       return DAG.getNode(ISD::ADD, DL, Ty,
@@ -336,9 +336,8 @@ namespace llvm {
     // computing a symbol's address using gp-relative addressing:
     //
     // (add $gp, %gp_rel(sym))
-    template<class NodeTy>
-    SDValue getAddrGPRel(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
-      SDLoc DL(N);
+    template <class NodeTy>
+    SDValue getAddrGPRel(NodeTy *N, SDLoc DL, EVT Ty, SelectionDAG &DAG) const {
       assert(Ty == MVT::i32);
       SDValue GPRel = getTargetNode(N, Ty, DAG, MipsII::MO_GPREL);
       return DAG.getNode(ISD::ADD, DL, Ty,
@@ -363,6 +362,8 @@ namespace llvm {
 
     // Subtarget Info
     const MipsSubtarget &Subtarget;
+    // Cache the ABI from the TargetMachine, we use it everywhere.
+    const MipsABIInfo &ABI;
 
   private:
     // Create a TargetGlobalAddress node.
@@ -488,9 +489,10 @@ namespace llvm {
     std::pair<unsigned, const TargetRegisterClass *>
     parseRegForInlineAsmConstraint(StringRef C, MVT VT) const;
 
-    std::pair<unsigned, const TargetRegisterClass*>
-              getRegForInlineAsmConstraint(const std::string &Constraint,
-                                           MVT VT) const override;
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 const std::string &Constraint,
+                                 MVT VT) const override;
 
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
@@ -534,6 +536,9 @@ namespace llvm {
     MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr *MI,
                                   MachineBasicBlock *BB, unsigned Size) const;
     MachineBasicBlock *emitSEL_D(MachineInstr *MI, MachineBasicBlock *BB) const;
+    MachineBasicBlock *emitPseudoSELECT(MachineInstr *MI,
+                                        MachineBasicBlock *BB, bool isFPCmp,
+                                        unsigned Opc) const;
   };
 
   /// Create MipsTargetLowering objects.
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 2aa8328..ed97cb4 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -458,42 +458,42 @@ def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
 defm FSUB :  ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
 
 def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
-             MADDS_FM<4, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+             MADDS_FM<4, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
 def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
-             MADDS_FM<5, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+             MADDS_FM<5, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
 
 let AdditionalPredicates = [NoNaNsFPMath] in {
   def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
-                MADDS_FM<6, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+                MADDS_FM<6, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
   def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
-                MADDS_FM<7, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+                MADDS_FM<7, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
 }
 
 def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
-               MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+               MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
 def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
-               MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+               MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
 
 let AdditionalPredicates = [NoNaNsFPMath] in {
   def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
-                  MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+                  MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
   def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
-                  MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+                  MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
 }
 
-let isCodeGenOnly=1 in {
+let DecoderNamespace = "Mips64" in {
   def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
-                 MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+                 MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
   def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
-                 MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+                 MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
 }
 
 let AdditionalPredicates = [NoNaNsFPMath],
-    isCodeGenOnly=1 in {
+    DecoderNamespace = "Mips64" in {
   def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
-                  MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+                  MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
   def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
-                  MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+                  MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 5c91fbc..8cc1603 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -297,6 +297,19 @@ class BGEZ_FM<bits<6> op, bits<5> funct> : StdArch {
   let Inst{15-0}  = offset;
 }
 
+class BBIT_FM<bits<6> op> : StdArch {
+  bits<5>  rs;
+  bits<5>  p;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = p;
+  let Inst{15-0}  = offset;
+}
+
 class SLTI_FM<bits<6> op> : StdArch {
   bits<5> rt;
   bits<5> rs;
@@ -411,6 +424,20 @@ class SYNC_FM : StdArch {
   let Inst{5-0}   = 0xf;
 }
 
+class SYNCI_FM : StdArch {
+  // Produced by the mem_simm16 address as reg << 16 | imm (see getMemEncoding).
+  bits<21> addr;
+  bits<5> rs = addr{20-16};
+  bits<16> offset = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000001;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0b11111;
+  let Inst{15-0}  = offset;
+}
+
 class MULT_FM<bits<6> op, bits<6> funct> : StdArch {
   bits<5>  rs;
   bits<5>  rt;
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index dcc0e24..0839147 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -15,7 +15,7 @@
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MipsAnalyzeImmediate.h"
 #include "MipsMachineFunction.h"
-#include "MipsTargetMachine.h"
+#include "MipsSubtarget.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index aebac34..04a16b3 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -156,6 +156,8 @@ def HasMips3     :    Predicate<"Subtarget->hasMips3()">,
                       AssemblerPredicate<"FeatureMips3">;
 def HasMips4_32  :    Predicate<"Subtarget->hasMips4_32()">,
                       AssemblerPredicate<"FeatureMips4_32">;
+def NotMips4_32  :    Predicate<"!Subtarget->hasMips4_32()">,
+                      AssemblerPredicate<"FeatureMips4_32">;
 def HasMips4_32r2 :   Predicate<"Subtarget->hasMips4_32r2()">,
                       AssemblerPredicate<"FeatureMips4_32r2">;
 def HasMips5_32r2 :   Predicate<"Subtarget->hasMips5_32r2()">,
@@ -180,8 +182,6 @@ def HasMips64r6  :    Predicate<"Subtarget->hasMips64r6()">,
                       AssemblerPredicate<"FeatureMips64r6">;
 def NotMips64r6  :    Predicate<"!Subtarget->hasMips64r6()">,
                       AssemblerPredicate<"!FeatureMips64r6">;
-def IsN64       :     Predicate<"Subtarget->isABI_N64()">,
-                      AssemblerPredicate<"FeatureN64">;
 def InMips16Mode :    Predicate<"Subtarget->inMips16Mode()">,
                       AssemblerPredicate<"FeatureMips16">;
 def HasCnMips    :    Predicate<"Subtarget->hasCnMips()">,
@@ -220,6 +220,9 @@ class GPR_64 { list<Predicate> GPRPredicates = [IsGP64bit]; }
 //        subtractive predicate will hopefully keep us under the 32 predicate
 //        limit long enough to develop an alternative way to handle P1||P2
 //        predicates.
+class ISA_MIPS1_NOT_4_32 {
+  list<Predicate> InsnPredicates = [NotMips4_32];
+}
 class ISA_MIPS1_NOT_32R6_64R6 {
   list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6];
 }
@@ -316,7 +319,7 @@ class IsAsCheapAsAMove {
 }
 
 class NeverHasSideEffects {
-  bit neverHasSideEffects = 1;
+  bit hasSideEffects = 0;
 }
 
 //===----------------------------------------------------------------------===//
@@ -425,7 +428,14 @@ def MipsMemSimm11AsmOperand : AsmOperandClass {
   let RenderMethod = "addMemOperands";
   let ParserMethod = "parseMemOperand";
   let PredicateMethod = "isMemWithSimmOffset<11>";
-  //let DiagnosticType = "Simm11";
+}
+
+def MipsMemSimm16AsmOperand : AsmOperandClass {
+  let Name = "MemOffsetSimm16";
+  let SuperClasses = [MipsMemAsmOperand];
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithSimmOffset<16>";
 }
 
 def MipsInvertedImmoperand : AsmOperandClass {
@@ -470,6 +480,12 @@ def mem_simm11 : mem_generic {
   let ParserMatchClass = MipsMemSimm11AsmOperand;
 }
 
+def mem_simm16 : mem_generic {
+  let MIOperandInfo = (ops ptr_rc, simm16);
+  let EncoderMethod = "getMemEncoding";
+  let ParserMatchClass = MipsMemSimm16AsmOperand;
+}
+
 def mem_ea : Operand<iPTR> {
   let PrintMethod = "printMemOperandEA";
   let MIOperandInfo = (ops ptr_rc, simm16);
@@ -632,7 +648,7 @@ class shift_rotate_reg<string opstr, RegisterOperand RO, InstrItinClass itin,
 class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>:
   InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"),
          [], II_LUI, FrmI, opstr>, IsAsCheapAsAMove {
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let isReMaterializable = 1;
 }
 
@@ -860,6 +876,13 @@ class SYNC_FT<string opstr> :
   InstSE<(outs), (ins i32imm:$stype), "sync $stype", [(MipsSync imm:$stype)],
          NoItinerary, FrmOther, opstr>;
 
+class SYNCI_FT<string opstr> :
+  InstSE<(outs), (ins mem_simm16:$addr), !strconcat(opstr, "\t$addr"), [],
+         NoItinerary, FrmOther, opstr> {
+  let hasSideEffects = 1;
+  let DecoderMethod = "DecodeSyncI";
+}
+
 let hasSideEffects = 1 in
 class TEQ_FT<string opstr, RegisterOperand RO> :
   InstSE<(outs), (ins RO:$rs, RO:$rt, uimm16:$code_),
@@ -876,7 +899,7 @@ class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
          itin, FrmR, opstr> {
   let isCommutable = 1;
   let Defs = DefRegs;
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 
 // Pseudo multiply/divide instruction with explicit accumulator register
@@ -922,7 +945,7 @@ class MoveFromLOHI<string opstr, RegisterOperand RO, Register UseReg>:
   InstSE<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"), [], II_MFHI_MFLO,
          FrmR, opstr> {
   let Uses = [UseReg];
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 
 class PseudoMTLOHI<RegisterClass DstRC, RegisterClass SrcRC>
@@ -934,7 +957,7 @@ class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
   InstSE<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), [], II_MTHI_MTLO,
   FrmR, opstr> {
   let Defs = DefRegs;
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 
 class EffectiveAddress<string opstr, RegisterOperand RO> :
@@ -964,7 +987,7 @@ class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO,
 class SubwordSwap<string opstr, RegisterOperand RO>:
   InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [],
          NoItinerary, FrmR, opstr> {
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
 }
 
 // Read Hardware
@@ -1130,12 +1153,14 @@ def ADD   : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>;
 def SUB   : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM<0, 0x22>;
 def SLT   : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>;
 def SLTu  : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>, ADD_FM<0, 0x2b>;
+let AdditionalPredicates = [NotInMicroMips] in {
 def AND   : MMRel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
             ADD_FM<0, 0x24>;
 def OR    : MMRel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
             ADD_FM<0, 0x25>;
 def XOR   : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
             ADD_FM<0, 0x26>;
+}
 def NOR   : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
 
 /// Shift Instructions
@@ -1169,11 +1194,15 @@ def LBu : Load<"lbu", GPR32Opnd, zextloadi8, II_LBU, addrDefault>, MMRel,
 def LH  : Load<"lh", GPR32Opnd, sextloadi16, II_LH, addrDefault>, MMRel,
           LW_FM<0x21>;
 def LHu : Load<"lhu", GPR32Opnd, zextloadi16, II_LHU>, MMRel, LW_FM<0x25>;
+let AdditionalPredicates = [NotInMicroMips] in {
 def LW  : Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel,
           LW_FM<0x23>;
+}
 def SB  : Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel, LW_FM<0x28>;
 def SH  : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>;
+let AdditionalPredicates = [NotInMicroMips] in {
 def SW  : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>;
+}
 
 /// load/store left/right
 let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
@@ -1188,6 +1217,7 @@ def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>,
           ISA_MIPS1_NOT_32R6_64R6;
 }
 
+let AdditionalPredicates = [NotInMicroMips] in {
 // COP2 Memory Instructions
 def LWC2 : LW_FT2<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>,
            ISA_MIPS1_NOT_32R6_64R6;
@@ -1207,8 +1237,10 @@ let DecoderNamespace = "COP3_" in {
   def SDC3 : SW_FT3<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>,
              ISA_MIPS2;
 }
+}
 
 def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32;
+def SYNCI : MMRel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2;
 
 def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>, ISA_MIPS2;
 def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>, ISA_MIPS2;
@@ -1284,8 +1316,8 @@ let AdditionalPredicates = [NotInMicroMips] in {
   def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
 }
 
-// FIXME: JALX really requires either MIPS16 or microMIPS in addition to MIPS32.
-def JALX  : JumpLink<"jalx", calltarget>, FJ<0x1D>, ISA_MIPS32_NOT_32R6_64R6;
+def JALX : MMRel, JumpLink<"jalx", calltarget>, FJ<0x1D>,
+           ISA_MIPS32_NOT_32R6_64R6;
 def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>,
              ISA_MIPS1_NOT_32R6_64R6;
 def BGEZALL : MMRel, BGEZAL_FT<"bgezall", brtarget, GPR32Opnd, 0>,
@@ -1440,10 +1472,10 @@ def MFC2 : MFC3OP<"mfc2", GPR32Opnd>, MFC3OP_FM<0x12, 0>;
 def MTC2 : MFC3OP<"mtc2", GPR32Opnd>, MFC3OP_FM<0x12, 4>;
 
 class Barrier<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary,
-                                      FrmOther>;
-def SSNOP : Barrier<"ssnop">, BARRIER_FM<1>;
-def EHB : Barrier<"ehb">, BARRIER_FM<3>;
-def PAUSE : Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2;
+                                      FrmOther, asmstr>;
+def SSNOP : MMRel, Barrier<"ssnop">, BARRIER_FM<1>;
+def EHB : MMRel, Barrier<"ehb">, BARRIER_FM<3>;
+def PAUSE : MMRel, Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2;
 
 // JR_HB and JALR_HB are defined here using the new style naming
 // scheme because some of this code is shared with Mips32r6InstrInfo.td
@@ -1494,13 +1526,14 @@ def TLBWR : MMRel, TLB<"tlbwr">, COP0_TLB_FM<0x06>;
 
 class CacheOp<string instr_asm, Operand MemOpnd> :
     InstSE<(outs), (ins  MemOpnd:$addr, uimm5:$hint),
-           !strconcat(instr_asm, "\t$hint, $addr"), [], NoItinerary, FrmOther> {
+           !strconcat(instr_asm, "\t$hint, $addr"), [], NoItinerary, FrmOther,
+           instr_asm> {
   let DecoderMethod = "DecodeCacheOp";
 }
 
-def CACHE : CacheOp<"cache", mem>, CACHEOP_FM<0b101111>,
+def CACHE : MMRel, CacheOp<"cache", mem>, CACHEOP_FM<0b101111>,
             INSN_MIPS3_32_NOT_32R6_64R6;
-def PREF :  CacheOp<"pref", mem>, CACHEOP_FM<0b110011>,
+def PREF :  MMRel, CacheOp<"pref", mem>, CACHEOP_FM<0b110011>,
             INSN_MIPS3_32_NOT_32R6_64R6;
 
 //===----------------------------------------------------------------------===//
@@ -1531,8 +1564,6 @@ def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
 let Predicates = [NotInMicroMips] in {
 def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
 }
-def : MipsInstAlias<"jal $rs", (JALR RA, GPR32Opnd:$rs), 0>;
-def : MipsInstAlias<"jal $rd,$rs", (JALR GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
 def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>, ISA_MIPS32;
 def : MipsInstAlias<"not $rt, $rs",
                     (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
@@ -1557,7 +1588,9 @@ def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
 def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
 def : MipsInstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
 def : MipsInstAlias<"mtc2 $rt, $rd", (MTC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+let AdditionalPredicates = [NotInMicroMips] in {
 def : MipsInstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>;
+}
 def : MipsInstAlias<"bnez $rs,$offset",
                     (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
 def : MipsInstAlias<"beqz $rs,$offset",
@@ -1606,7 +1639,7 @@ def : MipsInstAlias<"sync",
 // Assembler Pseudo Instructions
 //===----------------------------------------------------------------------===//
 
-class LoadImm32< string instr_asm, Operand Od, RegisterOperand RO> :
+class LoadImm32<string instr_asm, Operand Od, RegisterOperand RO> :
   MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
                      !strconcat(instr_asm, "\t$rt, $imm32")> ;
 def LoadImm32Reg : LoadImm32<"li", uimm5, GPR32Opnd>;
@@ -1621,6 +1654,11 @@ class LoadAddressImm<string instr_asm, Operand Od, RegisterOperand RO> :
                      !strconcat(instr_asm, "\t$rt, $imm32")> ;
 def LoadAddr32Imm : LoadAddressImm<"la", uimm5, GPR32Opnd>;
 
+def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs),
+                      "jal\t$rd, $rs"> ;
+def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs),
+                      "jal\t$rs"> ;
+
 //===----------------------------------------------------------------------===//
 //  Arbitrary patterns that map to one or more instructions
 //===----------------------------------------------------------------------===//
@@ -1633,10 +1671,12 @@ class StoreRegImmPat<Instruction StoreInst, ValueType ValTy> :
   MipsPat<(store ValTy:$v, addrRegImm:$a), (StoreInst ValTy:$v, addrRegImm:$a)>;
 
 // Small immediates
+let AdditionalPredicates = [NotInMicroMips] in {
 def : MipsPat<(i32 immSExt16:$in),
               (ADDiu ZERO, imm:$in)>;
 def : MipsPat<(i32 immZExt16:$in),
               (ORi ZERO, imm:$in)>;
+}
 def : MipsPat<(i32 immLow16Zero:$in),
               (LUi (HI16 imm:$in))>;
 
@@ -1826,7 +1866,9 @@ def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>;
 let AddedComplexity = 40 in {
   def : LoadRegImmPat<LBu, i32, zextloadi8>;
   def : LoadRegImmPat<LH, i32, sextloadi16>;
+  let AdditionalPredicates = [NotInMicroMips] in {
   def : LoadRegImmPat<LW, i32, load>;
+  }
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index e44d6ee..90f8cc0 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -63,11 +63,9 @@ namespace {
   public:
     static char ID;
     MipsLongBranch(TargetMachine &tm)
-      : MachineFunctionPass(ID), TM(tm),
-        IsPIC(TM.getRelocationModel() == Reloc::PIC_),
-        ABI(TM.getSubtarget<MipsSubtarget>().getABI()),
-        LongBranchSeqSize(!IsPIC ? 2 : (ABI.IsN64() ? 10 :
-            (!TM.getSubtarget<MipsSubtarget>().isTargetNaCl() ? 9 : 10))) {}
+        : MachineFunctionPass(ID), TM(tm),
+          IsPIC(TM.getRelocationModel() == Reloc::PIC_),
+          ABI(static_cast<const MipsTargetMachine &>(TM).getABI()) {}
 
     const char *getPassName() const override {
       return "Mips Long Branch";
@@ -110,8 +108,7 @@ static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) {
       return MO.getMBB();
   }
 
-  assert(false && "This instruction does not have an MBB operand.");
-  return nullptr;
+  llvm_unreachable("This instruction does not have an MBB operand.");
 }
 
 // Traverse the list of instructions backwards until a non-debug instruction is
@@ -171,7 +168,7 @@ void MipsLongBranch::initMBBInfo() {
   MBBInfos.resize(MF->size());
 
   const MipsInstrInfo *TII =
-      static_cast<const MipsInstrInfo *>(TM.getSubtargetImpl()->getInstrInfo());
+      static_cast<const MipsInstrInfo *>(MF->getSubtarget().getInstrInfo());
   for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) {
     MachineBasicBlock *MBB = MF->getBlockNumbered(I);
 
@@ -217,8 +214,8 @@ int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
 // MachineBasicBlock operand MBBOpnd.
 void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
                                    DebugLoc DL, MachineBasicBlock *MBBOpnd) {
-  const MipsInstrInfo *TII =
-      static_cast<const MipsInstrInfo *>(TM.getSubtargetImpl()->getInstrInfo());
+  const MipsInstrInfo *TII = static_cast<const MipsInstrInfo *>(
+      MBB.getParent()->getSubtarget().getInstrInfo());
   unsigned NewOpc = TII->getOppositeBranchOpc(Br->getOpcode());
   const MCInstrDesc &NewDesc = TII->get(NewOpc);
 
@@ -237,15 +234,21 @@ void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
 
   MIB.addMBB(MBBOpnd);
 
-  // Bundle the instruction in the delay slot to the newly created branch
-  // and erase the original branch.
-  assert(Br->isBundledWithSucc());
-  MachineBasicBlock::instr_iterator II(Br);
-  MIBundleBuilder(&*MIB).append((++II)->removeFromBundle());
+  if (Br->hasDelaySlot()) {
+    // Bundle the instruction in the delay slot to the newly created branch
+    // and erase the original branch.
+    assert(Br->isBundledWithSucc());
+    MachineBasicBlock::instr_iterator II(Br);
+    MIBundleBuilder(&*MIB).append((++II)->removeFromBundle());
+  }
   Br->eraseFromParent();
 }
 
 // Expand branch instructions to long branches.
+// TODO: This function has to be fixed for beqz16 and bnez16, because it
+// currently assumes that all branches have 16-bit offsets, and will produce
+// wrong code if branches whose allowed offsets are [-128, -126, ..., 126]
+// are present.
 void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
   MachineBasicBlock::iterator Pos;
   MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br);
@@ -253,9 +256,10 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
   const BasicBlock *BB = MBB->getBasicBlock();
   MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB);
   MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB);
-
+  const MipsSubtarget &Subtarget =
+      static_cast<const MipsSubtarget &>(MF->getSubtarget());
   const MipsInstrInfo *TII =
-      static_cast<const MipsInstrInfo *>(TM.getSubtargetImpl()->getInstrInfo());
+      static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo());
 
   MF->insert(FallThroughMBB, LongBrMBB);
   MBB->removeSuccessor(TgtMBB);
@@ -270,8 +274,6 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
     // We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal
     // instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is an
     // pseudo-instruction wrapping BGEZAL).
-
-    const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
     unsigned BalOp = Subtarget.hasMips32r6() ? Mips::BAL : Mips::BAL_BR;
 
     if (!ABI.IsN64()) {
@@ -328,7 +330,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
       BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
         .addReg(Mips::SP).addImm(0);
 
-      if (!TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) {
+      if (!Subtarget.isTargetNaCl()) {
         MIBundleBuilder(*BalTgtMBB, Pos)
           .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
           .append(BuildMI(*MF, DL, TII->get(Mips::ADDiu), Mips::SP)
@@ -447,14 +449,17 @@ static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) {
 }
 
 bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
+  const MipsSubtarget &STI =
+      static_cast<const MipsSubtarget &>(F.getSubtarget());
   const MipsInstrInfo *TII =
-      static_cast<const MipsInstrInfo *>(TM.getSubtargetImpl()->getInstrInfo());
+      static_cast<const MipsInstrInfo *>(STI.getInstrInfo());
+  LongBranchSeqSize =
+      !IsPIC ? 2 : (ABI.IsN64() ? 10 : (!STI.isTargetNaCl() ? 9 : 10));
 
-  const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
   if (STI.inMips16Mode() || !STI.enableLongBranchPass())
     return false;
   if ((TM.getRelocationModel() == Reloc::PIC_) &&
-      TM.getSubtarget<MipsSubtarget>().isABI_O32() &&
+      static_cast<const MipsTargetMachine &>(TM).getABI().IsO32() &&
       F.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
     emitGPDisp(F, TII);
 
@@ -476,10 +481,10 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
       if (!I->Br || I->HasLongBranch)
         continue;
 
-      int ShVal = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode() ? 2 : 4;
+      int ShVal = STI.inMicroMipsMode() ? 2 : 4;
       int64_t Offset = computeOffset(I->Br) / ShVal;
 
-      if (TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) {
+      if (STI.isTargetNaCl()) {
         // The offset calculation does not include sandboxing instructions
         // that will be added later in the MC layer.  Since at this point we
         // don't know the exact amount of code that "sandboxing" will add, we
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index a89718a..30b93dc 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -7,10 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MipsMachineFunction.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
 #include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
@@ -78,15 +79,14 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() {
   if (GlobalBaseReg)
     return GlobalBaseReg;
 
-  const MipsSubtarget &ST = MF.getTarget().getSubtarget<MipsSubtarget>();
-
-  const TargetRegisterClass *RC;
-  if (ST.inMips16Mode())
-    RC=(const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
-  else
-    RC = ST.isABI_N64() ?
-      (const TargetRegisterClass*)&Mips::GPR64RegClass :
-      (const TargetRegisterClass*)&Mips::GPR32RegClass;
+  const TargetRegisterClass *RC =
+      static_cast<const MipsSubtarget &>(MF.getSubtarget()).inMips16Mode()
+          ? &Mips::CPU16RegsRegClass
+          : static_cast<const MipsTargetMachine &>(MF.getTarget())
+                    .getABI()
+                    .IsN64()
+                ? &Mips::GPR64RegClass
+                : &Mips::GPR32RegClass;
   return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
 }
 
@@ -98,16 +98,16 @@ unsigned MipsFunctionInfo::getMips16SPAliasReg() {
   if (Mips16SPAliasReg)
     return Mips16SPAliasReg;
 
-  const TargetRegisterClass *RC;
-  RC=(const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+  const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass;
   return Mips16SPAliasReg = MF.getRegInfo().createVirtualRegister(RC);
 }
 
 void MipsFunctionInfo::createEhDataRegsFI() {
   for (int I = 0; I < 4; ++I) {
-    const MipsSubtarget &ST = MF.getTarget().getSubtarget<MipsSubtarget>();
-    const TargetRegisterClass *RC = ST.isABI_N64() ?
-        &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+    const TargetRegisterClass *RC =
+        static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64()
+            ? &Mips::GPR64RegClass
+            : &Mips::GPR32RegClass;
 
     EhDataRegFI[I] = MF.getFrameInfo()->CreateStackObject(RC->getSize(),
         RC->getAlignment(), false);
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index 22c524e..7c940ee 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -174,7 +174,7 @@ void MBBInfo::postVisit() {
 
 // OptimizePICCall methods.
 bool OptimizePICCall::runOnMachineFunction(MachineFunction &F) {
-  if (F.getTarget().getSubtarget<MipsSubtarget>().inMips16Mode())
+  if (static_cast<const MipsSubtarget &>(F.getSubtarget()).inMips16Mode())
     return false;
 
   // Do a pre-order traversal of the dominator tree.
diff --git a/lib/Target/Mips/MipsOptionRecord.h b/lib/Target/Mips/MipsOptionRecord.h
index f82544a..dc29cbd 100644
--- a/lib/Target/Mips/MipsOptionRecord.h
+++ b/lib/Target/Mips/MipsOptionRecord.h
@@ -36,9 +36,8 @@ public:
 
 class MipsRegInfoRecord : public MipsOptionRecord {
 public:
-  MipsRegInfoRecord(MipsELFStreamer *S, MCContext &Context,
-                    const MCSubtargetInfo &STI)
-      : Streamer(S), Context(Context), STI(STI) {
+  MipsRegInfoRecord(MipsELFStreamer *S, MCContext &Context)
+      : Streamer(S), Context(Context) {
     ri_gprmask = 0;
     ri_cprmask[0] = ri_cprmask[1] = ri_cprmask[2] = ri_cprmask[3] = 0;
     ri_gp_value = 0;
@@ -61,7 +60,6 @@ public:
 private:
   MipsELFStreamer *Streamer;
   MCContext &Context;
-  const MCSubtargetInfo &STI;
   const MCRegisterClass *GPR32RegClass;
   const MCRegisterClass *GPR64RegClass;
   const MCRegisterClass *FGR32RegClass;
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 20ef3f3..2110c03 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -17,6 +17,7 @@
 #include "MipsInstrInfo.h"
 #include "MipsMachineFunction.h"
 #include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -62,7 +63,7 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   case Mips::GPR32RegClassID:
   case Mips::GPR64RegClassID:
   case Mips::DSPRRegClassID: {
-    const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+    const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
     return 28 - TFI->hasFP(MF);
   }
   case Mips::FGR32RegClassID:
@@ -167,7 +168,7 @@ getReservedRegs(const MachineFunction &MF) const {
       Reserved.set(*Reg);
   }
   // Reserve FP if this function should have a dedicated frame pointer register.
-  if (MF.getSubtarget().getFrameLowering()->hasFP(MF)) {
+  if (Subtarget.getFrameLowering()->hasFP(MF)) {
     if (Subtarget.inMips16Mode())
       Reserved.set(Mips::S0);
     else {
@@ -256,8 +257,9 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
 
 unsigned MipsRegisterInfo::
 getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
-  bool IsN64 = Subtarget.isABI_N64();
+  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+  bool IsN64 =
+      static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64();
 
   if (Subtarget.inMips16Mode())
     return TFI->hasFP(MF) ? Mips::S0 : Mips::SP;
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 42fe76b..7497a25 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -289,10 +289,28 @@ def GPR32 : GPR32Class<[i32]>;
 def DSPR  : GPR32Class<[v4i8, v2i16]>;
 
 def GPRMM16 : RegisterClass<"Mips", [i32], 32, (add
+  // Callee save
+  S0, S1,
   // Return Values and Arguments
-  V0, V1, A0, A1, A2, A3,
+  V0, V1, A0, A1, A2, A3)>;
+
+def GPRMM16Zero : RegisterClass<"Mips", [i32], 32, (add
+  // Reserved
+  ZERO,
   // Callee save
-  S0, S1)>;
+  S1,
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3)>;
+
+def GPRMM16MoveP : RegisterClass<"Mips", [i32], 32, (add
+  // Reserved
+  ZERO,
+  // Callee save
+  S1,
+  // Return Values and Arguments
+  V0, V1,
+  // Callee save
+  S0, S2, S3, S4)>;
 
 def GPR64 : RegisterClass<"Mips", [i64], 64, (add
 // Reserved
@@ -380,6 +398,8 @@ def MSA128W: RegisterClass<"Mips", [v4i32, v4f32], 128,
                            (sequence "W%u", 0, 31)>;
 def MSA128D: RegisterClass<"Mips", [v2i64, v2f64], 128,
                            (sequence "W%u", 0, 31)>;
+def MSA128WEvens: RegisterClass<"Mips", [v4i32, v4f32], 128,
+                                (decimate (sequence "W%u", 0, 31), 2)>;
 
 def MSACtrl: RegisterClass<"Mips", [i32], 32, (add
   MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap)>;
@@ -446,6 +466,16 @@ def GPRMM16AsmOperand : MipsAsmRegOperand {
   let PredicateMethod = "isMM16AsmReg";
 }
 
+def GPRMM16AsmOperandZero : MipsAsmRegOperand {
+  let Name = "GPRMM16AsmRegZero";
+  let PredicateMethod = "isMM16AsmRegZero";
+}
+
+def GPRMM16AsmOperandMoveP : MipsAsmRegOperand {
+  let Name = "GPRMM16AsmRegMoveP";
+  let PredicateMethod = "isMM16AsmRegMoveP";
+}
+
 def ACC64DSPAsmOperand : MipsAsmRegOperand {
   let Name = "ACC64DSPAsmReg";
   let PredicateMethod = "isACCAsmReg";
@@ -505,6 +535,14 @@ def GPRMM16Opnd : RegisterOperand<GPRMM16> {
   let ParserMatchClass = GPRMM16AsmOperand;
 }
 
+def GPRMM16OpndZero : RegisterOperand<GPRMM16Zero> {
+  let ParserMatchClass = GPRMM16AsmOperandZero;
+}
+
+def GPRMM16OpndMoveP : RegisterOperand<GPRMM16MoveP> {
+  let ParserMatchClass = GPRMM16AsmOperandMoveP;
+}
+
 def GPR64Opnd : RegisterOperand<GPR64> {
   let ParserMatchClass = GPR64AsmOperand;
 }
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 97d9edf..7c79c4c 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -71,11 +71,17 @@ private:
 
   MachineFunction &MF;
   MachineRegisterInfo &MRI;
+  const MipsSubtarget &Subtarget;
+  const MipsSEInstrInfo &TII;
+  const MipsRegisterInfo &RegInfo;
 };
 }
 
 ExpandPseudo::ExpandPseudo(MachineFunction &MF_)
-  : MF(MF_), MRI(MF.getRegInfo()) {}
+    : MF(MF_), MRI(MF.getRegInfo()),
+      Subtarget(static_cast<const MipsSubtarget &>(MF.getSubtarget())),
+      TII(*static_cast<const MipsSEInstrInfo *>(Subtarget.getInstrInfo())),
+      RegInfo(*Subtarget.getRegisterInfo()) {}
 
 bool ExpandPseudo::expand() {
   bool Expanded = false;
@@ -146,11 +152,6 @@ void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) {
 
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
-  const MipsSEInstrInfo &TII =
-      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
-                                        MF.getSubtarget().getRegisterInfo());
-
   const TargetRegisterClass *RC = RegInfo.intRegClass(4);
   unsigned VR = MRI.createVirtualRegister(RC);
   unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
@@ -166,11 +167,6 @@ void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) {
 
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
-  const MipsSEInstrInfo &TII =
-      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
-                                        MF.getSubtarget().getRegisterInfo());
-
   const TargetRegisterClass *RC = RegInfo.intRegClass(4);
   unsigned VR = MRI.createVirtualRegister(RC);
   unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
@@ -189,11 +185,6 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
 
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
-  const MipsSEInstrInfo &TII =
-      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
-                                        MF.getSubtarget().getRegisterInfo());
-
   const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
   unsigned VR0 = MRI.createVirtualRegister(RC);
   unsigned VR1 = MRI.createVirtualRegister(RC);
@@ -219,11 +210,6 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
 
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
-  const MipsSEInstrInfo &TII =
-      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
-                                        MF.getSubtarget().getRegisterInfo());
-
   const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
   unsigned VR0 = MRI.createVirtualRegister(RC);
   unsigned VR1 = MRI.createVirtualRegister(RC);
@@ -254,11 +240,6 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I,
   //  mfhi $vr1, src
   //  copy dst_hi, $vr1
 
-  const MipsSEInstrInfo &TII =
-      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
-                                        MF.getSubtarget().getRegisterInfo());
-
   unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
   unsigned VRegSize = RegInfo.getMinimalPhysRegClass(Dst)->getSize() / 2;
   const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize);
@@ -298,16 +279,8 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB,
   // register). Unfortunately, we have to make this decision before register
   // allocation so for now we use a spill/reload sequence for all
   // double-precision values in regardless of being an odd/even register.
-
-  const TargetMachine &TM = MF.getTarget();
-  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
   if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
       (FP64 && !Subtarget.useOddSPReg())) {
-    const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo *>(
-                                     TM.getSubtargetImpl()->getInstrInfo());
-    const MipsRegisterInfo &TRI = *static_cast<const MipsRegisterInfo *>(
-                                      TM.getSubtargetImpl()->getRegisterInfo());
-
     unsigned DstReg = I->getOperand(0).getReg();
     unsigned LoReg = I->getOperand(1).getReg();
     unsigned HiReg = I->getOperand(2).getReg();
@@ -327,11 +300,11 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB,
     int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC2);
     if (!Subtarget.isLittle())
       std::swap(LoReg, HiReg);
-    TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC, &TRI,
-                        0);
-    TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC, &TRI,
-                        4);
-    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &TRI, 0);
+    TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC,
+                        &RegInfo, 0);
+    TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC,
+                        &RegInfo, 4);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, 0);
     return true;
   }
 
@@ -359,15 +332,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
   // allocation so for now we use a spill/reload sequence for all
   // double-precision values in regardless of being an odd/even register.
 
-  const TargetMachine &TM = MF.getTarget();
-  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
   if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
       (FP64 && !Subtarget.useOddSPReg())) {
-    const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo *>(
-                                     TM.getSubtargetImpl()->getInstrInfo());
-    const MipsRegisterInfo &TRI = *static_cast<const MipsRegisterInfo *>(
-                                      TM.getSubtargetImpl()->getRegisterInfo());
-
     unsigned DstReg = I->getOperand(0).getReg();
     unsigned SrcReg = I->getOperand(1).getReg();
     unsigned N = I->getOperand(2).getImm();
@@ -386,9 +352,9 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
     // We re-use the same spill slot each time so that the stack frame doesn't
     // grow too much in functions with a large number of moves.
     int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC);
-    TII.storeRegToStack(MBB, I, SrcReg, I->getOperand(1).isKill(), FI, RC, &TRI,
-                        0);
-    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &TRI, Offset);
+    TII.storeRegToStack(MBB, I, SrcReg, I->getOperand(1).isKill(), FI, RC,
+                        &RegInfo, 0);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, Offset);
     return true;
   }
 
@@ -415,9 +381,9 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
 
   const MipsSEInstrInfo &TII =
-      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
-                                        MF.getSubtarget().getRegisterInfo());
+      *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+      *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
 
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -550,9 +516,9 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
 
   const MipsSEInstrInfo &TII =
-      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
-                                        MF.getSubtarget().getRegisterInfo());
+      *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+      *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
 
   DebugLoc dl = MBBI->getDebugLoc();
   unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
@@ -605,7 +571,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                           const TargetRegisterInfo *TRI) const {
   MachineFunction *MF = MBB.getParent();
   MachineBasicBlock *EntryBlock = MF->begin();
-  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
 
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
     // Add the callee-saved register as live-in. Do not add if the register is
@@ -646,7 +612,7 @@ void MipsSEFrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
   const MipsSEInstrInfo &TII =
-      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+      *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo());
 
   if (!hasReservedCallFrame(MF)) {
     int64_t Amount = I->getOperand(0).getImm();
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index f759905..0761ded 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -37,7 +37,7 @@ using namespace llvm;
 #define DEBUG_TYPE "mips-isel"
 
 bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
-  Subtarget = &TM.getSubtarget<MipsSubtarget>();
+  Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
   if (Subtarget->inMips16Mode())
     return false;
   return MipsDAGToDAGISel::runOnMachineFunction(MF);
@@ -130,20 +130,17 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
   MachineBasicBlock &MBB = MF.front();
   MachineBasicBlock::iterator I = MBB.begin();
   MachineRegisterInfo &RegInfo = MF.getRegInfo();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
   DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
   unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
   const TargetRegisterClass *RC;
-
-  if (Subtarget->isABI_N64())
-    RC = (const TargetRegisterClass*)&Mips::GPR64RegClass;
-  else
-    RC = (const TargetRegisterClass*)&Mips::GPR32RegClass;
+  const MipsABIInfo &ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
+  RC = (ABI.IsN64()) ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
 
   V0 = RegInfo.createVirtualRegister(RC);
   V1 = RegInfo.createVirtualRegister(RC);
 
-  if (Subtarget->isABI_N64()) {
+  if (ABI.IsN64()) {
     MF.getRegInfo().addLiveIn(Mips::T9_64);
     MBB.addLiveIn(Mips::T9_64);
 
@@ -175,7 +172,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
   MF.getRegInfo().addLiveIn(Mips::T9);
   MBB.addLiveIn(Mips::T9);
 
-  if (Subtarget->isABI_N32()) {
+  if (ABI.IsN32()) {
     // lui $v0, %hi(%neg(%gp_rel(fname)))
     // addu $v1, $v0, $t9
     // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
@@ -188,7 +185,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
     return;
   }
 
-  assert(Subtarget->isABI_O32());
+  assert(ABI.IsO32());
 
   // For O32 ABI, the following instruction sequence is emitted to initialize
   // the global base register:
@@ -239,13 +236,31 @@ SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
           (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
          "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
 
+  unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu;
+  if (Subtarget->isGP64bit()) {
+    SLTuOp = Mips::SLTu64;
+    ADDuOp = Mips::DADDu;
+  }
+
   SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
   SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
   EVT VT = LHS.getValueType();
 
-  SDNode *Carry = CurDAG->getMachineNode(Mips::SLTu, DL, VT, Ops);
-  SDNode *AddCarry = CurDAG->getMachineNode(Mips::ADDu, DL, VT,
+  SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops);
+
+  if (Subtarget->isGP64bit()) {
+    // On 64-bit targets, sltu produces an i64 but our backend currently says
+    // that SLTu64 produces an i32. We need to fix this in the long run but for
+    // now, just make the DAG type-correct by asserting the upper bits are zero.
+    Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT,
+                                   CurDAG->getTargetConstant(0, VT),
+                                   SDValue(Carry, 0),
+                                   CurDAG->getTargetConstant(Mips::sub_32, VT));
+  }
+
+  SDNode *AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT,
                                             SDValue(Carry, 0), RHS);
+
   return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS,
                               SDValue(AddCarry, 0));
 }
@@ -392,6 +407,28 @@ bool MipsSEDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
     selectAddrDefault(Addr, Base, Offset);
 }
 
+bool MipsSEDAGToDAGISel::selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+                                             SDValue &Offset) const {
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 7)) {
+    if (isa<FrameIndexSDNode>(Base))
+      return false;
+
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Offset)) {
+      unsigned CnstOff = CN->getZExtValue();
+      return (CnstOff == (CnstOff & 0x3c));
+    }
+
+    return false;
+  }
+
+  // For all other cases where "lw" would be selected, don't select "lw16"
+  // because it would result in additional instructions to prepare operands.
+  if (selectAddrRegImm(Addr, Base, Offset))
+    return false;
+
+  return selectAddrDefault(Addr, Base, Offset);
+}
+
 bool MipsSEDAGToDAGISel::selectIntAddrMSA(SDValue Addr, SDValue &Base,
                                           SDValue &Offset) const {
   if (selectAddrRegImm10(Addr, Base, Offset))
@@ -644,7 +681,8 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
 
   case ISD::SUBE: {
     SDValue InFlag = Node->getOperand(2);
-    Result = selectAddESubE(Mips::SUBu, InFlag, InFlag.getOperand(0), DL, Node);
+    unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu;
+    Result = selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node);
     return std::make_pair(true, Result);
   }
 
@@ -652,7 +690,8 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
     if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
       break;
     SDValue InFlag = Node->getOperand(2);
-    Result = selectAddESubE(Mips::ADDu, InFlag, InFlag.getValue(0), DL, Node);
+    unsigned Opc = Subtarget->isGP64bit() ? Mips::DADDu : Mips::ADDu;
+    Result = selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node);
     return std::make_pair(true, Result);
   }
 
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index 2e11fa7..2d24eb4 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -65,6 +65,9 @@ private:
   bool selectIntAddrMM(SDValue Addr, SDValue &Base,
                        SDValue &Offset) const override;
 
+  bool selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+                           SDValue &Offset) const override;
+
   bool selectIntAddrMSA(SDValue Addr, SDValue &Base,
                         SDValue &Offset) const override;
 
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 4a0ce09..09ff4f9 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -46,17 +46,13 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
 
   if (Subtarget.hasDSP() || Subtarget.hasMSA()) {
     // Expand all truncating stores and extending loads.
-    unsigned FirstVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-    unsigned LastVT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
-
-    for (unsigned VT0 = FirstVT; VT0 <= LastVT; ++VT0) {
-      for (unsigned VT1 = FirstVT; VT1 <= LastVT; ++VT1)
-        setTruncStoreAction((MVT::SimpleValueType)VT0,
-                            (MVT::SimpleValueType)VT1, Expand);
-
-      setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
-      setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
-      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+    for (MVT VT0 : MVT::vector_valuetypes()) {
+      for (MVT VT1 : MVT::vector_valuetypes()) {
+        setTruncStoreAction(VT0, VT1, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT0, VT1, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT0, VT1, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT0, VT1, Expand);
+      }
     }
   }
 
@@ -126,6 +122,8 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::MUL,              MVT::i64, Custom);
 
   if (Subtarget.isGP64bit()) {
+    setOperationAction(ISD::SMUL_LOHI,        MVT::i64, Custom);
+    setOperationAction(ISD::UMUL_LOHI,        MVT::i64, Custom);
     setOperationAction(ISD::MULHS,            MVT::i64, Custom);
     setOperationAction(ISD::MULHU,            MVT::i64, Custom);
     setOperationAction(ISD::SDIVREM,          MVT::i64, Custom);
@@ -204,6 +202,8 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
   if (Subtarget.hasMips64r6()) {
     // MIPS64r6 replaces the accumulator-based multiplies with a three register
     // instruction
+    setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
     setOperationAction(ISD::MUL, MVT::i64, Legal);
     setOperationAction(ISD::MULHS, MVT::i64, Legal);
     setOperationAction(ISD::MULHU, MVT::i64, Legal);
@@ -224,7 +224,7 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
   }
 
-  computeRegisterProperties();
+  computeRegisterProperties(Subtarget.getRegisterInfo());
 }
 
 const MipsTargetLowering *
@@ -1836,11 +1836,9 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::mips_fill_h:
   case Intrinsic::mips_fill_w:
   case Intrinsic::mips_fill_d: {
-    SmallVector<SDValue, 16> Ops;
     EVT ResTy = Op->getValueType(0);
-
-    for (unsigned i = 0; i < ResTy.getVectorNumElements(); ++i)
-      Ops.push_back(Op->getOperand(1));
+    SmallVector<SDValue, 16> Ops(ResTy.getVectorNumElements(),
+                                 Op->getOperand(1));
 
     // If ResTy is v2i64 then the type legalizer will break this node down into
     // an equivalent v4i32.
@@ -2291,9 +2289,9 @@ lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
 static bool isConstantOrUndef(const SDValue Op) {
   if (Op->getOpcode() == ISD::UNDEF)
     return true;
-  if (dyn_cast<ConstantSDNode>(Op))
+  if (isa<ConstantSDNode>(Op))
     return true;
-  if (dyn_cast<ConstantFPSDNode>(Op))
+  if (isa<ConstantFPSDNode>(Op))
     return true;
   return false;
 }
@@ -2747,8 +2745,7 @@ emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
   //  $vr0 = phi($vr2, $fbb, $vr1, $tbb)
 
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   const TargetRegisterClass *RC = &Mips::GPR32RegClass;
   DebugLoc DL = MI->getDebugLoc();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -2813,8 +2810,7 @@ emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
   //  $rd = phi($rd1, $fbb, $rd2, $tbb)
 
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   const TargetRegisterClass *RC = &Mips::GPR32RegClass;
   DebugLoc DL = MI->getDebugLoc();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -2875,18 +2871,28 @@ emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
 // for lane 1 because it would require FR=0 mode which isn't supported by MSA.
 MachineBasicBlock * MipsSETargetLowering::
 emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Fd = MI->getOperand(0).getReg();
   unsigned Ws = MI->getOperand(1).getReg();
   unsigned Lane = MI->getOperand(2).getImm();
 
-  if (Lane == 0)
-    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_lo);
-  else {
-    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+  if (Lane == 0) {
+    unsigned Wt = Ws;
+    if (!Subtarget.useOddSPReg()) {
+      // We must copy to an even-numbered MSA register so that the
+      // single-precision sub-register is also guaranteed to be even-numbered.
+      Wt = RegInfo.createVirtualRegister(&Mips::MSA128WEvensRegClass);
+
+      BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Wt).addReg(Ws);
+    }
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
+  } else {
+    unsigned Wt = RegInfo.createVirtualRegister(
+        Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
+                                  &Mips::MSA128WEvensRegClass);
 
     BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane);
     BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
@@ -2910,8 +2916,7 @@ MachineBasicBlock * MipsSETargetLowering::
 emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
   assert(Subtarget.isFP64bit());
 
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   unsigned Fd  = MI->getOperand(0).getReg();
   unsigned Ws  = MI->getOperand(1).getReg();
@@ -2940,15 +2945,16 @@ emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
 MachineBasicBlock *
 MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Wd = MI->getOperand(0).getReg();
   unsigned Wd_in = MI->getOperand(1).getReg();
   unsigned Lane = MI->getOperand(2).getImm();
   unsigned Fs = MI->getOperand(3).getReg();
-  unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+  unsigned Wt = RegInfo.createVirtualRegister(
+      Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
+                                &Mips::MSA128WEvensRegClass);
 
   BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
       .addImm(0)
@@ -2975,8 +2981,7 @@ MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
   assert(Subtarget.isFP64bit());
 
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Wd = MI->getOperand(0).getReg();
@@ -3024,8 +3029,7 @@ MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI,
                                          MachineBasicBlock *BB,
                                          unsigned EltSizeInBytes,
                                          bool IsFP) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Wd = MI->getOperand(0).getReg();
@@ -3135,8 +3139,7 @@ MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI,
 MachineBasicBlock *
 MipsSETargetLowering::emitFILL_FW(MachineInstr *MI,
                                   MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Wd = MI->getOperand(0).getReg();
@@ -3167,8 +3170,7 @@ MipsSETargetLowering::emitFILL_FD(MachineInstr *MI,
                                   MachineBasicBlock *BB) const {
   assert(Subtarget.isFP64bit());
 
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Wd = MI->getOperand(0).getReg();
@@ -3196,8 +3198,7 @@ MipsSETargetLowering::emitFILL_FD(MachineInstr *MI,
 MachineBasicBlock *
 MipsSETargetLowering::emitFEXP2_W_1(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   const TargetRegisterClass *RC = &Mips::MSA128WRegClass;
   unsigned Ws1 = RegInfo.createVirtualRegister(RC);
@@ -3226,8 +3227,7 @@ MipsSETargetLowering::emitFEXP2_W_1(MachineInstr *MI,
 MachineBasicBlock *
 MipsSETargetLowering::emitFEXP2_D_1(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   const TargetRegisterClass *RC = &Mips::MSA128DRegClass;
   unsigned Ws1 = RegInfo.createVirtualRegister(RC);
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index 16bea8b..74f291f 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -27,7 +27,7 @@ using namespace llvm;
 MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI)
     : MipsInstrInfo(STI, STI.getRelocationModel() == Reloc::PIC_ ? Mips::B
                                                                  : Mips::J),
-      RI(STI), IsN64(STI.isABI_N64()) {}
+      RI(STI) {}
 
 const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
   return RI;
@@ -38,9 +38,8 @@ const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
 /// the destination along with the FrameIndex of the loaded stack slot.  If
 /// not, return 0.  This predicate must return 0 if the instruction has
 /// any side effects other than loading from the stack slot.
-unsigned MipsSEInstrInfo::
-isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
-{
+unsigned MipsSEInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                              int &FrameIndex) const {
   unsigned Opc = MI->getOpcode();
 
   if ((Opc == Mips::LW)   || (Opc == Mips::LD)   ||
@@ -61,9 +60,8 @@ isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
 /// the source reg along with the FrameIndex of the loaded stack slot.  If
 /// not, return 0.  This predicate must return 0 if the instruction has
 /// any side effects other than storing to the stack slot.
-unsigned MipsSEInstrInfo::
-isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
-{
+unsigned MipsSEInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                             int &FrameIndex) const {
   unsigned Opc = MI->getOpcode();
 
   if ((Opc == Mips::SW)   || (Opc == Mips::SD)   ||
@@ -352,6 +350,8 @@ unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
   case Mips::BLEZ64: return Mips::BGTZ64;
   case Mips::BC1T:   return Mips::BC1F;
   case Mips::BC1F:   return Mips::BC1T;
+  case Mips::BEQZC_MM: return Mips::BNEZC_MM;
+  case Mips::BNEZC_MM: return Mips::BEQZC_MM;
   }
 }
 
@@ -422,7 +422,7 @@ unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
           Opc == Mips::BEQ64  || Opc == Mips::BNE64  || Opc == Mips::BGTZ64 ||
           Opc == Mips::BGEZ64 || Opc == Mips::BLTZ64 || Opc == Mips::BLEZ64 ||
           Opc == Mips::BC1T   || Opc == Mips::BC1F   || Opc == Mips::B      ||
-          Opc == Mips::J) ?
+          Opc == Mips::J || Opc == Mips::BEQZC_MM || Opc == Mips::BNEZC_MM) ?
          Opc : 0;
 }
 
@@ -620,18 +620,13 @@ void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
   // jr   $ra (via RetRA)
   const TargetMachine &TM = MBB.getParent()->getTarget();
   if (TM.getRelocationModel() == Reloc::PIC_)
-    BuildMI(MBB, I, I->getDebugLoc(),
-            TM.getSubtargetImpl()->getInstrInfo()->get(ADDU), T9)
+    BuildMI(MBB, I, I->getDebugLoc(), get(ADDU), T9)
         .addReg(TargetReg)
         .addReg(ZERO);
-  BuildMI(MBB, I, I->getDebugLoc(),
-          TM.getSubtargetImpl()->getInstrInfo()->get(ADDU), RA)
+  BuildMI(MBB, I, I->getDebugLoc(), get(ADDU), RA)
       .addReg(TargetReg)
       .addReg(ZERO);
-  BuildMI(MBB, I, I->getDebugLoc(),
-          TM.getSubtargetImpl()->getInstrInfo()->get(ADDU), SP)
-      .addReg(SP)
-      .addReg(OffsetReg);
+  BuildMI(MBB, I, I->getDebugLoc(), get(ADDU), SP).addReg(SP).addReg(OffsetReg);
   expandRetRA(MBB, I);
 }
 
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index b2d2301..d16fab2 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -21,7 +21,6 @@ namespace llvm {
 
 class MipsSEInstrInfo : public MipsInstrInfo {
   const MipsSERegisterInfo RI;
-  bool IsN64;
 
 public:
   explicit MipsSEInstrInfo(const MipsSubtarget &STI);
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 8768b12..26f39a2 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -33,120 +33,61 @@ using namespace llvm;
 
 // FIXME: Maybe this should be on by default when Mips16 is specified
 //
-static cl::opt<bool> Mixed16_32(
-  "mips-mixed-16-32",
-  cl::init(false),
-  cl::desc("Allow for a mixture of Mips16 "
-           "and Mips32 code in a single source file"),
-  cl::Hidden);
-
-static cl::opt<bool> Mips_Os16(
-  "mips-os16",
-  cl::init(false),
-  cl::desc("Compile all functions that don' use "
-           "floating point as Mips 16"),
-  cl::Hidden);
-
 static cl::opt<bool>
-Mips16HardFloat("mips16-hard-float", cl::NotHidden,
-                cl::desc("MIPS: mips16 hard float enable."),
-                cl::init(false));
+    Mixed16_32("mips-mixed-16-32", cl::init(false),
+               cl::desc("Allow for a mixture of Mips16 "
+                        "and Mips32 code in a single output file"),
+               cl::Hidden);
+
+static cl::opt<bool> Mips_Os16("mips-os16", cl::init(false),
+                               cl::desc("Compile all functions that don't use "
+                                        "floating point as Mips 16"),
+                               cl::Hidden);
+
+static cl::opt<bool> Mips16HardFloat("mips16-hard-float", cl::NotHidden,
+                                     cl::desc("Enable mips16 hard float."),
+                                     cl::init(false));
 
 static cl::opt<bool>
-Mips16ConstantIslands(
-  "mips16-constant-islands", cl::NotHidden,
-  cl::desc("MIPS: mips16 constant islands enable."),
-  cl::init(true));
+    Mips16ConstantIslands("mips16-constant-islands", cl::NotHidden,
+                          cl::desc("Enable mips16 constant islands."),
+                          cl::init(true));
 
 static cl::opt<bool>
-GPOpt("mgpopt", cl::Hidden,
-      cl::desc("MIPS: Enable gp-relative addressing of small data items"));
-
-/// Select the Mips CPU for the given triple and cpu name.
-/// FIXME: Merge with the copy in MipsMCTargetDesc.cpp
-static StringRef selectMipsCPU(Triple TT, StringRef CPU) {
-  if (CPU.empty() || CPU == "generic") {
-    if (TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel)
-      CPU = "mips32";
-    else
-      CPU = "mips64";
-  }
-  return CPU;
-}
+    GPOpt("mgpopt", cl::Hidden,
+          cl::desc("Enable gp-relative addressing of mips small data items"));
 
 void MipsSubtarget::anchor() { }
 
-static std::string computeDataLayout(const MipsSubtarget &ST) {
-  std::string Ret = "";
-
-  // There are both little and big endian mips.
-  if (ST.isLittle())
-    Ret += "e";
-  else
-    Ret += "E";
-
-  Ret += "-m:m";
-
-  // Pointers are 32 bit on some ABIs.
-  if (!ST.isABI_N64())
-    Ret += "-p:32:32";
-
-  // 8 and 16 bit integers only need no have natural alignment, but try to
-  // align them to 32 bits. 64 bit integers have natural alignment.
-  Ret += "-i8:8:32-i16:16:32-i64:64";
-
-  // 32 bit registers are always available and the stack is at least 64 bit
-  // aligned. On N64 64 bit registers are also available and the stack is
-  // 128 bit aligned.
-  if (ST.isABI_N64() || ST.isABI_N32())
-    Ret += "-n32:64-S128";
-  else
-    Ret += "-n32-S64";
-
-  return Ret;
-}
-
 MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
                              const std::string &FS, bool little,
-                             const MipsTargetMachine *_TM)
+                             const MipsTargetMachine &TM)
     : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault),
-      ABI(MipsABIInfo::Unknown()), IsLittle(little), IsSingleFloat(false),
-      IsFPXX(false), NoABICalls(false), IsFP64bit(false), UseOddSPReg(true),
-      IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false), HasCnMips(false),
-      IsLinux(true), HasMips3_32(false), HasMips3_32r2(false),
-      HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false),
-      InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
-      InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
-      AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
-      HasMSA(false), TM(_TM), TargetTriple(TT),
-      DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS, TM))),
-      TSInfo(DL), InstrInfo(MipsInstrInfo::create(*this)),
+      IsLittle(little), IsSingleFloat(false), IsFPXX(false), NoABICalls(false),
+      IsFP64bit(false), UseOddSPReg(true), IsNaN2008bit(false),
+      IsGP64bit(false), HasVFPU(false), HasCnMips(false), HasMips3_32(false),
+      HasMips3_32r2(false), HasMips4_32(false), HasMips4_32r2(false),
+      HasMips5_32r2(false), InMips16Mode(false),
+      InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
+      HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
+      HasMSA(false), TM(TM), TargetTriple(TT), TSInfo(*TM.getDataLayout()),
+      InstrInfo(
+          MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
       FrameLowering(MipsFrameLowering::create(*this)),
-      TLInfo(MipsTargetLowering::create(*TM, *this)) {
+      TLInfo(MipsTargetLowering::create(TM, *this)) {
 
   PreviousInMips16Mode = InMips16Mode;
 
   if (MipsArchVersion == MipsDefault)
     MipsArchVersion = Mips32;
 
-  // Don't even attempt to generate code for MIPS-I, MIPS-III and MIPS-V.
-  // They have not been tested and currently exist for the integrated
-  // assembler only.
+  // Don't even attempt to generate code for MIPS-I and MIPS-V. They have not
+  // been tested and currently exist for the integrated assembler only.
   if (MipsArchVersion == Mips1)
     report_fatal_error("Code generation for MIPS-I is not implemented", false);
-  if (MipsArchVersion == Mips3)
-    report_fatal_error("Code generation for MIPS-III is not implemented",
-                       false);
   if (MipsArchVersion == Mips5)
     report_fatal_error("Code generation for MIPS-V is not implemented", false);
 
-  // Assert exactly one ABI was chosen.
-  assert(ABI.IsKnown());
-  assert((((getFeatureBits() & Mips::FeatureO32) != 0) +
-          ((getFeatureBits() & Mips::FeatureEABI) != 0) +
-          ((getFeatureBits() & Mips::FeatureN32) != 0) +
-          ((getFeatureBits() & Mips::FeatureN64) != 0)) == 1);
-
   // Check if Architecture and ABI are compatible.
   assert(((!isGP64bit() && (isABI_O32() || isABI_EABI())) ||
           (isGP64bit() && (isABI_N32() || isABI_N64()))) &&
@@ -172,11 +113,7 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
       report_fatal_error(ISA + " is not compatible with the DSP ASE", false);
   }
 
-  // Is the target system Linux ?
-  if (TT.find("linux") == std::string::npos)
-    IsLinux = false;
-
-  if (NoABICalls && TM->getRelocationModel() == Reloc::PIC_)
+  if (NoABICalls && TM.getRelocationModel() == Reloc::PIC_)
     report_fatal_error("position-independent code requires '-mabicalls'");
 
   // Set UseSmallSection.
@@ -203,22 +140,22 @@ CodeGenOpt::Level MipsSubtarget::getOptLevelToEnablePostRAScheduler() const {
 
 MipsSubtarget &
 MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
-                                               const TargetMachine *TM) {
-  std::string CPUName = selectMipsCPU(TargetTriple, CPU);
-  
+                                               const TargetMachine &TM) {
+  std::string CPUName = MIPS_MC::selectMipsCPU(TM.getTargetTriple(), CPU);
+
   // Parse features string.
   ParseSubtargetFeatures(CPUName, FS);
   // Initialize scheduling itinerary for the specified CPU.
   InstrItins = getInstrItineraryForCPU(CPUName);
 
-  if (InMips16Mode && !TM->Options.UseSoftFloat)
+  if (InMips16Mode && !TM.Options.UseSoftFloat)
     InMips16HardFloat = true;
 
   return *this;
 }
 
 bool MipsSubtarget::abiUsesSoftFloat() const {
-  return TM->Options.UseSoftFloat && !InMips16HardFloat;
+  return TM.Options.UseSoftFloat && !InMips16HardFloat;
 }
 
 bool MipsSubtarget::useConstantIslands() {
@@ -227,5 +164,11 @@ bool MipsSubtarget::useConstantIslands() {
 }
 
 Reloc::Model MipsSubtarget::getRelocationModel() const {
-  return TM->getRelocationModel();
+  return TM.getRelocationModel();
 }
+
+bool MipsSubtarget::isABI_EABI() const { return getABI().IsEABI(); }
+bool MipsSubtarget::isABI_N64() const { return getABI().IsN64(); }
+bool MipsSubtarget::isABI_N32() const { return getABI().IsN32(); }
+bool MipsSubtarget::isABI_O32() const { return getABI().IsO32(); }
+const MipsABIInfo &MipsSubtarget::getABI() const { return TM.getABI(); }
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index bff9013..faded8a 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MIPSSUBTARGET_H
 #define LLVM_LIB_TARGET_MIPS_MIPSSUBTARGET_H
 
+#include "MCTargetDesc/MipsABIInfo.h"
 #include "MipsFrameLowering.h"
 #include "MipsISelLowering.h"
 #include "MipsInstrInfo.h"
@@ -22,7 +23,6 @@
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
-#include "MipsABIInfo.h"
 #include <string>
 
 #define GET_SUBTARGETINFO_HEADER
@@ -38,16 +38,13 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
 
   enum MipsArchEnum {
     MipsDefault,
-    Mips1, Mips2, Mips32, Mips32r2, Mips32r6, Mips3, Mips4, Mips5, Mips64,
-    Mips64r2, Mips64r6
+    Mips1, Mips2, Mips32, Mips32r2, Mips32r3, Mips32r5, Mips32r6, Mips32Max,
+    Mips3, Mips4, Mips5, Mips64, Mips64r2, Mips64r3, Mips64r5, Mips64r6
   };
 
   // Mips architecture version
   MipsArchEnum MipsArchVersion;
 
-  // Selected ABI
-  MipsABIInfo ABI;
-
   // IsLittle - The target is Little Endian
   bool IsLittle;
 
@@ -136,11 +133,10 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
   // as from the command line
   enum {NoOverride, Mips16Override, NoMips16Override} OverrideMode;
 
-  const MipsTargetMachine *TM;
+  const MipsTargetMachine &TM;
 
   Triple TargetTriple;
 
-  const DataLayout DL; // Calculates type size & alignment
   const MipsSelectionDAGInfo TSInfo;
   std::unique_ptr<const MipsInstrInfo> InstrInfo;
   std::unique_ptr<const MipsFrameLowering> FrameLowering;
@@ -153,18 +149,18 @@ public:
   CodeGenOpt::Level getOptLevelToEnablePostRAScheduler() const override;
 
   /// Only O32 and EABI supported right now.
-  bool isABI_EABI() const { return ABI.IsEABI(); }
-  bool isABI_N64() const { return ABI.IsN64(); }
-  bool isABI_N32() const { return ABI.IsN32(); }
-  bool isABI_O32() const { return ABI.IsO32(); }
+  bool isABI_EABI() const;
+  bool isABI_N64() const;
+  bool isABI_N32() const;
+  bool isABI_O32() const;
+  const MipsABIInfo &getABI() const;
   bool isABI_FPXX() const { return isABI_O32() && IsFPXX; }
-  const MipsABIInfo &getABI() const { return ABI; }
 
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   MipsSubtarget(const std::string &TT, const std::string &CPU,
                 const std::string &FS, bool little,
-                const MipsTargetMachine *TM);
+                const MipsTargetMachine &TM);
 
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
@@ -178,21 +174,30 @@ public:
   bool hasMips4_32() const { return HasMips4_32; }
   bool hasMips4_32r2() const { return HasMips4_32r2; }
   bool hasMips32() const {
-    return MipsArchVersion >= Mips32 && MipsArchVersion != Mips3 &&
-           MipsArchVersion != Mips4 && MipsArchVersion != Mips5;
+    return (MipsArchVersion >= Mips32 && MipsArchVersion < Mips32Max) ||
+           hasMips64();
   }
   bool hasMips32r2() const {
-    return MipsArchVersion == Mips32r2 || MipsArchVersion == Mips32r6 ||
-           MipsArchVersion == Mips64r2 || MipsArchVersion == Mips64r6;
+    return (MipsArchVersion >= Mips32r2 && MipsArchVersion < Mips32Max) ||
+           hasMips64r2();
+  }
+  bool hasMips32r3() const {
+    return (MipsArchVersion >= Mips32r3 && MipsArchVersion < Mips32Max) ||
+           hasMips64r2();
+  }
+  bool hasMips32r5() const {
+    return (MipsArchVersion >= Mips32r5 && MipsArchVersion < Mips32Max) ||
+           hasMips64r2();
   }
   bool hasMips32r6() const {
-    return MipsArchVersion == Mips32r6 || MipsArchVersion == Mips64r6;
+    return (MipsArchVersion >= Mips32r6 && MipsArchVersion < Mips32Max) ||
+           hasMips64r6();
   }
   bool hasMips64() const { return MipsArchVersion >= Mips64; }
-  bool hasMips64r2() const {
-    return MipsArchVersion == Mips64r2 || MipsArchVersion == Mips64r6;
-  }
-  bool hasMips64r6() const { return MipsArchVersion == Mips64r6; }
+  bool hasMips64r2() const { return MipsArchVersion >= Mips64r2; }
+  bool hasMips64r3() const { return MipsArchVersion >= Mips64r3; }
+  bool hasMips64r5() const { return MipsArchVersion >= Mips64r5; }
+  bool hasMips64r6() const { return MipsArchVersion >= Mips64r6; }
 
   bool hasCnMips() const { return HasCnMips; }
 
@@ -223,7 +228,6 @@ public:
   bool hasDSP() const { return HasDSP; }
   bool hasDSPR2() const { return HasDSPR2; }
   bool hasMSA() const { return HasMSA; }
-  bool isLinux() const { return IsLinux; }
   bool useSmallSection() const { return UseSmallSection; }
 
   bool hasStandardEncoding() const { return !inMips16Mode(); }
@@ -239,9 +243,9 @@ public:
   bool hasMTHC1() const { return hasMips32r2(); }
 
   bool allowMixed16_32() const { return inMips16ModeDefault() |
-                                        AllowMixed16_32;}
+                                        AllowMixed16_32; }
 
-  bool os16() const { return Os16;};
+  bool os16() const { return Os16; }
 
   bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
 
@@ -255,7 +259,7 @@ public:
   Reloc::Model getRelocationModel() const;
 
   MipsSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS,
-                                                 const TargetMachine *TM);
+                                                 const TargetMachine &TM);
 
   /// Does the system support unaligned memory access.
   ///
@@ -271,7 +275,6 @@ public:
   const MipsSelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
-  const DataLayout *getDataLayout() const override { return &DL; }
   const MipsInstrInfo *getInstrInfo() const override { return InstrInfo.get(); }
   const TargetFrameLowering *getFrameLowering() const override {
     return FrameLowering.get();
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 33280e3..86c8931 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -29,7 +29,7 @@
 #include "MipsTargetObjectFile.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
@@ -46,6 +46,36 @@ extern "C" void LLVMInitializeMipsTarget() {
   RegisterTargetMachine<MipselTargetMachine> B(TheMips64elTarget);
 }
 
+static std::string computeDataLayout(bool isLittle, MipsABIInfo &ABI) {
+  std::string Ret = "";
+
+  // There are both little and big endian mips.
+  if (isLittle)
+    Ret += "e";
+  else
+    Ret += "E";
+
+  Ret += "-m:m";
+
+  // Pointers are 32 bit on some ABIs.
+  if (!ABI.IsN64())
+    Ret += "-p:32:32";
+
+  // 8 and 16 bit integers only need no have natural alignment, but try to
+  // align them to 32 bits. 64 bit integers have natural alignment.
+  Ret += "-i8:8:32-i16:16:32-i64:64";
+
+  // 32 bit registers are always available and the stack is at least 64 bit
+  // aligned. On N64 64 bit registers are also available and the stack is
+  // 128 bit aligned.
+  if (ABI.IsN64() || ABI.IsN32())
+    Ret += "-n32:64-S128";
+  else
+    Ret += "-n32-S64";
+
+  return Ret;
+}
+
 // On function prologue, the stack is created by decrementing
 // its pointer. Once decremented, all references are done with positive
 // offset from the stack/frame pointer, using StackGrowsUp enables
@@ -57,14 +87,14 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, StringRef TT,
                                      Reloc::Model RM, CodeModel::Model CM,
                                      CodeGenOpt::Level OL, bool isLittle)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-      isLittle(isLittle),
-      TLOF(make_unique<MipsTargetObjectFile>()),
-      Subtarget(nullptr),
-      DefaultSubtarget(TT, CPU, FS, isLittle, this),
+      isLittle(isLittle), TLOF(make_unique<MipsTargetObjectFile>()),
+      ABI(MipsABIInfo::computeTargetABI(Triple(TT), CPU, Options.MCOptions)),
+      DL(computeDataLayout(isLittle, ABI)), Subtarget(nullptr),
+      DefaultSubtarget(TT, CPU, FS, isLittle, *this),
       NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16",
-                        isLittle, this),
+                        isLittle, *this),
       Mips16Subtarget(TT, CPU, FS.empty() ? "+mips16" : FS.str() + ",+mips16",
-                      isLittle, this) {
+                      isLittle, *this) {
   Subtarget = &DefaultSubtarget;
   initAsmInfo();
 }
@@ -91,11 +121,8 @@ MipselTargetMachine(const Target &T, StringRef TT,
 
 const MipsSubtarget *
 MipsTargetMachine::getSubtargetImpl(const Function &F) const {
-  AttributeSet FnAttrs = F.getAttributes();
-  Attribute CPUAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
-  Attribute FSAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
 
   std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
                         ? CPUAttr.getValueAsString().str()
@@ -104,19 +131,16 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const {
                        ? FSAttr.getValueAsString().str()
                        : TargetFS;
   bool hasMips16Attr =
-      !FnAttrs.getAttribute(AttributeSet::FunctionIndex, "mips16")
-           .hasAttribute(Attribute::None);
+      !F.getFnAttribute("mips16").hasAttribute(Attribute::None);
   bool hasNoMips16Attr =
-      !FnAttrs.getAttribute(AttributeSet::FunctionIndex, "nomips16")
-           .hasAttribute(Attribute::None);
+      !F.getFnAttribute("nomips16").hasAttribute(Attribute::None);
 
   // FIXME: This is related to the code below to reset the target options,
   // we need to know whether or not the soft float flag is set on the
   // function before we can generate a subtarget. We also need to use
   // it as a key for the subtarget since that can be the only difference
   // between two functions.
-  Attribute SFAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float");
+  Attribute SFAttr = F.getFnAttribute("use-soft-float");
   bool softFloat = !SFAttr.hasAttribute(Attribute::None)
                        ? SFAttr.getValueAsString() == "true"
                        : Options.UseSoftFloat;
@@ -133,7 +157,7 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const {
     // creation will depend on the TM and the code generation flags on the
     // function that reside in TargetOptions.
     resetTargetOptions(F);
-    I = llvm::make_unique<MipsSubtarget>(TargetTriple, CPU, FS, isLittle, this);
+    I = llvm::make_unique<MipsSubtarget>(TargetTriple, CPU, FS, isLittle, *this);
   }
   return I.get();
 }
@@ -170,9 +194,9 @@ public:
   void addIRPasses() override;
   bool addInstSelector() override;
   void addMachineSSAOptimization() override;
-  bool addPreEmitPass() override;
+  void addPreEmitPass() override;
 
-  bool addPreRegAlloc() override;
+  void addPreRegAlloc() override;
 
 };
 } // namespace
@@ -203,35 +227,30 @@ void MipsPassConfig::addMachineSSAOptimization() {
   TargetPassConfig::addMachineSSAOptimization();
 }
 
-bool MipsPassConfig::addPreRegAlloc() {
-  if (getOptLevel() == CodeGenOpt::None) {
+void MipsPassConfig::addPreRegAlloc() {
+  if (getOptLevel() == CodeGenOpt::None)
     addPass(createMipsOptimizePICCallPass(getMipsTargetMachine()));
-    return true;
-  }
-  else
-    return false;
 }
 
-void MipsTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  if (Subtarget->allowMixed16_32()) {
-    DEBUG(errs() << "No ");
-    //FIXME: The Basic Target Transform Info
-    // pass needs to become a function pass instead of
-    // being an immutable pass and then this method as it exists now
-    // would be unnecessary.
-    PM.add(createNoTargetTransformInfoPass());
-  } else
-    LLVMTargetMachine::addAnalysisPasses(PM);
-  DEBUG(errs() << "Target Transform Info Pass Added\n");
+TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](Function &F) {
+    if (Subtarget->allowMixed16_32()) {
+      DEBUG(errs() << "No Target Transform Info Pass Added\n");
+      // FIXME: This is no longer necessary as the TTI returned is per-function.
+      return TargetTransformInfo(getDataLayout());
+    }
+
+    DEBUG(errs() << "Target Transform Info Pass Added\n");
+    return TargetTransformInfo(BasicTTIImpl(this, F));
+  });
 }
 
 // Implemented by targets that want to run passes immediately before
 // machine code is emitted. return true if -print-machineinstrs should
 // print out the code after the passes.
-bool MipsPassConfig::addPreEmitPass() {
+void MipsPassConfig::addPreEmitPass() {
   MipsTargetMachine &TM = getMipsTargetMachine();
   addPass(createMipsDelaySlotFillerPass(TM));
   addPass(createMipsLongBranchPass(TM));
   addPass(createMipsConstantIslandPass(TM));
-  return true;
 }
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index 1349f82..afd0cea 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -14,7 +14,9 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETMACHINE_H
 #define LLVM_LIB_TARGET_MIPS_MIPSTARGETMACHINE_H
 
+#include "MCTargetDesc/MipsABIInfo.h"
 #include "MipsSubtarget.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -27,6 +29,9 @@ class MipsRegisterInfo;
 class MipsTargetMachine : public LLVMTargetMachine {
   bool isLittle;
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  // Selected ABI
+  MipsABIInfo ABI;
+  const DataLayout DL; // Calculates type size & alignment
   MipsSubtarget *Subtarget;
   MipsSubtarget DefaultSubtarget;
   MipsSubtarget NoMips16Subtarget;
@@ -40,8 +45,9 @@ public:
                     CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle);
   ~MipsTargetMachine() override;
 
-  void addAnalysisPasses(PassManagerBase &PM) override;
+  TargetIRAnalysis getTargetIRAnalysis() override;
 
+  const DataLayout *getDataLayout() const override { return &DL; }
   const MipsSubtarget *getSubtargetImpl() const override {
     if (Subtarget)
       return Subtarget;
@@ -59,6 +65,9 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
+
+  bool isLittleEndian() const { return isLittle; }
+  const MipsABIInfo &getABI() const { return ABI; }
 };
 
 /// MipsebTargetMachine - Mips32/64 big endian target machine.
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index b56c39b..c07693e 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -39,15 +39,11 @@ void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
   InitializeELF(TM.Options.UseInitArray);
 
-  SmallDataSection =
-    getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
-                               ELF::SHF_WRITE |ELF::SHF_ALLOC,
-                               SectionKind::getDataRel());
-
-  SmallBSSSection =
-    getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
-                               ELF::SHF_WRITE |ELF::SHF_ALLOC,
-                               SectionKind::getBSS());
+  SmallDataSection = getContext().getELFSection(
+      ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+
+  SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                                               ELF::SHF_WRITE | ELF::SHF_ALLOC);
   this->TM = &TM;
 }
 
@@ -109,8 +105,7 @@ IsGlobalInSmallSectionImpl(const GlobalValue *GV,
     return false;
 
   Type *Ty = GV->getType()->getElementType();
-  return IsInSmallSection(
-      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(Ty));
+  return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
 }
 
 const MCSection *MipsTargetObjectFile::
@@ -132,10 +127,9 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
 /// Return true if this constant should be placed into small data section.
 bool MipsTargetObjectFile::
 IsConstantInSmallSection(const Constant *CN, const TargetMachine &TM) const {
-  return (TM.getSubtarget<MipsSubtarget>().useSmallSection() &&
-          LocalSData &&
-          IsInSmallSection(TM.getSubtargetImpl()->getDataLayout()
-                           ->getTypeAllocSize(CN->getType())));
+  return (
+      TM.getSubtarget<MipsSubtarget>().useSmallSection() && LocalSData &&
+      IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(CN->getType())));
 }
 
 const MCSection *MipsTargetObjectFile::
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index c1f17933..b3b8296 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -10,10 +10,11 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETSTREAMER_H
 #define LLVM_LIB_TARGET_MIPS_MIPSTARGETSTREAMER_H
 
+#include "MCTargetDesc/MipsABIFlagsSection.h"
+#include "MCTargetDesc/MipsABIInfo.h"
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
-#include "MCTargetDesc/MipsABIFlagsSection.h"
 
 namespace llvm {
 
@@ -34,6 +35,7 @@ public:
   virtual void emitDirectiveSetMsa();
   virtual void emitDirectiveSetNoMsa();
   virtual void emitDirectiveSetAt();
+  virtual void emitDirectiveSetAtWithArg(unsigned RegNo);
   virtual void emitDirectiveSetNoAt();
   virtual void emitDirectiveEnd(StringRef Name);
 
@@ -57,9 +59,13 @@ public:
   virtual void emitDirectiveSetMips5();
   virtual void emitDirectiveSetMips32();
   virtual void emitDirectiveSetMips32R2();
+  virtual void emitDirectiveSetMips32R3();
+  virtual void emitDirectiveSetMips32R5();
   virtual void emitDirectiveSetMips32R6();
   virtual void emitDirectiveSetMips64();
   virtual void emitDirectiveSetMips64R2();
+  virtual void emitDirectiveSetMips64R3();
+  virtual void emitDirectiveSetMips64R5();
   virtual void emitDirectiveSetMips64R6();
   virtual void emitDirectiveSetDsp();
   virtual void emitDirectiveSetNoDsp();
@@ -95,12 +101,18 @@ public:
   // structure values.
   template <class PredicateLibrary>
   void updateABIInfo(const PredicateLibrary &P) {
+    ABI = &P.getABI();
     ABIFlagsSection.setAllFromPredicates(P);
   }
 
   MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; }
+  const MipsABIInfo &getABI() const {
+    assert(ABI && "ABI hasn't been set!");
+    return *ABI;
+  }
 
 protected:
+  const MipsABIInfo *ABI;
   MipsABIFlagsSection ABIFlagsSection;
 
   bool GPRInfoSet;
@@ -138,6 +150,7 @@ public:
   void emitDirectiveSetMsa() override;
   void emitDirectiveSetNoMsa() override;
   void emitDirectiveSetAt() override;
+  void emitDirectiveSetAtWithArg(unsigned RegNo) override;
   void emitDirectiveSetNoAt() override;
   void emitDirectiveEnd(StringRef Name) override;
 
@@ -161,9 +174,13 @@ public:
   void emitDirectiveSetMips5() override;
   void emitDirectiveSetMips32() override;
   void emitDirectiveSetMips32R2() override;
+  void emitDirectiveSetMips32R3() override;
+  void emitDirectiveSetMips32R5() override;
   void emitDirectiveSetMips32R6() override;
   void emitDirectiveSetMips64() override;
   void emitDirectiveSetMips64R2() override;
+  void emitDirectiveSetMips64R3() override;
+  void emitDirectiveSetMips64R5() override;
   void emitDirectiveSetMips64R6() override;
   void emitDirectiveSetDsp() override;
   void emitDirectiveSetNoDsp() override;
@@ -224,11 +241,6 @@ public:
   // ABI Flags
   void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override;
   void emitMipsAbiFlags() override;
-
-protected:
-  bool isO32() const { return STI.getFeatureBits() & Mips::FeatureO32; }
-  bool isN32() const { return STI.getFeatureBits() & Mips::FeatureN32; }
-  bool isN64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
 };
 }
 #endif
diff --git a/lib/Target/NVPTX/LLVMBuild.txt b/lib/Target/NVPTX/LLVMBuild.txt
index bc8d82e..6ea244a 100644
--- a/lib/Target/NVPTX/LLVMBuild.txt
+++ b/lib/Target/NVPTX/LLVMBuild.txt
@@ -28,5 +28,5 @@ has_asmprinter = 1
 type = Library
 name = NVPTXCodeGen
 parent = NVPTX
-required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXAsmPrinter NVPTXDesc NVPTXInfo Scalar SelectionDAG Support Target
+required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXAsmPrinter NVPTXDesc NVPTXInfo Scalar SelectionDAG Support Target TransformUtils
 add_to_library_groups = NVPTX
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index 4fd5bdd..11d737e 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -50,5 +50,6 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(StringRef TT) {
   AscizDirective = " .b8";
 
   // @TODO: Can we just disable this?
+  WeakDirective = "\t// .weak\t";
   GlobalDirective = "\t// .globl\t";
 }
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 13ba57e..382525d 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -59,9 +59,8 @@ inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) {
   llvm_unreachable("Unknown condition code");
 }
 
-ImmutablePass *createNVPTXTargetTransformInfoPass(const NVPTXTargetMachine *TM);
-FunctionPass *
-createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
+FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
+                                 llvm::CodeGenOpt::Level OptLevel);
 ModulePass *createNVPTXAssignValidGlobalNamesPass();
 ModulePass *createGenericToNVVMPass();
 FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass();
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
index 69fc86e..c343980 100644
--- a/lib/Target/NVPTX/NVPTXAllocaHoisting.h
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_NVPTX_NVPTXALLOCAHOISTING_H
 
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/StackProtector.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Pass.h"
 
@@ -32,8 +33,8 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DataLayoutPass>();
-    AU.addPreserved("stack-protector");
     AU.addPreserved<MachineFunctionAnalysis>();
+    AU.addPreserved<StackProtector>();
   }
 
   const char *getPassName() const override {
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 35ba4f1..833db04 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -17,8 +17,8 @@
 #include "MCTargetDesc/NVPTXMCAsmInfo.h"
 #include "NVPTX.h"
 #include "NVPTXInstrInfo.h"
-#include "NVPTXMachineFunctionInfo.h"
 #include "NVPTXMCExpr.h"
+#include "NVPTXMachineFunctionInfo.h"
 #include "NVPTXRegisterInfo.h"
 #include "NVPTXTargetMachine.h"
 #include "NVPTXUtilities.h"
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/DebugInfo.h"
@@ -45,6 +46,7 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TimeValue.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
 #include <sstream>
 using namespace llvm;
 
@@ -108,160 +110,6 @@ void VisitGlobalVariableForEmission(
 }
 }
 
-// @TODO: This is a copy from AsmPrinter.cpp.  The function is static, so we
-// cannot just link to the existing version.
-/// LowerConstant - Lower the specified LLVM Constant to an MCExpr.
-///
-using namespace nvptx;
-const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
-  MCContext &Ctx = AP.OutContext;
-
-  if (CV->isNullValue() || isa<UndefValue>(CV))
-    return MCConstantExpr::Create(0, Ctx);
-
-  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
-    return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
-
-  if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
-    return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
-
-  if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
-    return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
-
-  const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
-  if (!CE)
-    llvm_unreachable("Unknown constant value to lower!");
-
-  switch (CE->getOpcode()) {
-  default:
-    // If the code isn't optimized, there may be outstanding folding
-    // opportunities. Attempt to fold the expression using DataLayout as a
-    // last resort before giving up.
-    if (Constant *C = ConstantFoldConstantExpression(
-            CE, AP.TM.getSubtargetImpl()->getDataLayout()))
-      if (C != CE)
-        return LowerConstant(C, AP);
-
-    // Otherwise report the problem to the user.
-    {
-      std::string S;
-      raw_string_ostream OS(S);
-      OS << "Unsupported expression in static initializer: ";
-      CE->printAsOperand(OS, /*PrintType=*/ false,
-                         !AP.MF ? nullptr : AP.MF->getFunction()->getParent());
-      report_fatal_error(OS.str());
-    }
-  case Instruction::AddrSpaceCast: {
-    // Strip any addrspace(1)->addrspace(0) addrspace casts. These will be
-    // handled by the generic() logic in the MCExpr printer
-    PointerType *DstTy            = cast<PointerType>(CE->getType());
-    PointerType *SrcTy            = cast<PointerType>(CE->getOperand(0)->getType());
-    if (SrcTy->getAddressSpace() == 1 && DstTy->getAddressSpace() == 0) {
-      return LowerConstant(cast<const Constant>(CE->getOperand(0)), AP);
-    }
-    std::string S;
-    raw_string_ostream OS(S);
-    OS << "Unsupported expression in static initializer: ";
-    CE->printAsOperand(OS, /*PrintType=*/ false,
-                       !AP.MF ? nullptr : AP.MF->getFunction()->getParent());
-    report_fatal_error(OS.str());
-  }
-  case Instruction::GetElementPtr: {
-    const DataLayout &TD = *AP.TM.getSubtargetImpl()->getDataLayout();
-    // Generate a symbolic expression for the byte address
-    APInt OffsetAI(TD.getPointerSizeInBits(), 0);
-    cast<GEPOperator>(CE)->accumulateConstantOffset(TD, OffsetAI);
-
-    const MCExpr *Base = LowerConstant(CE->getOperand(0), AP);
-    if (!OffsetAI)
-      return Base;
-
-    int64_t Offset = OffsetAI.getSExtValue();
-    return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx),
-                                   Ctx);
-  }
-
-  case Instruction::Trunc:
-    // We emit the value and depend on the assembler to truncate the generated
-    // expression properly.  This is important for differences between
-    // blockaddress labels.  Since the two labels are in the same function, it
-    // is reasonable to treat their delta as a 32-bit value.
-  // FALL THROUGH.
-  case Instruction::BitCast:
-    return LowerConstant(CE->getOperand(0), AP);
-
-  case Instruction::IntToPtr: {
-    const DataLayout &TD = *AP.TM.getSubtargetImpl()->getDataLayout();
-    // Handle casts to pointers by changing them into casts to the appropriate
-    // integer type.  This promotes constant folding and simplifies this code.
-    Constant *Op = CE->getOperand(0);
-    Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()),
-                                      false /*ZExt*/);
-    return LowerConstant(Op, AP);
-  }
-
-  case Instruction::PtrToInt: {
-    const DataLayout &TD = *AP.TM.getSubtargetImpl()->getDataLayout();
-    // Support only foldable casts to/from pointers that can be eliminated by
-    // changing the pointer to the appropriately sized integer type.
-    Constant *Op = CE->getOperand(0);
-    Type *Ty = CE->getType();
-
-    const MCExpr *OpExpr = LowerConstant(Op, AP);
-
-    // We can emit the pointer value into this slot if the slot is an
-    // integer slot equal to the size of the pointer.
-    if (TD.getTypeAllocSize(Ty) == TD.getTypeAllocSize(Op->getType()))
-      return OpExpr;
-
-    // Otherwise the pointer is smaller than the resultant integer, mask off
-    // the high bits so we are sure to get a proper truncation if the input is
-    // a constant expr.
-    unsigned InBits = TD.getTypeAllocSizeInBits(Op->getType());
-    const MCExpr *MaskExpr =
-        MCConstantExpr::Create(~0ULL >> (64 - InBits), Ctx);
-    return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx);
-  }
-
-    // The MC library also has a right-shift operator, but it isn't consistently
-  // signed or unsigned between different targets.
-  case Instruction::Add:
-  case Instruction::Sub:
-  case Instruction::Mul:
-  case Instruction::SDiv:
-  case Instruction::SRem:
-  case Instruction::Shl:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    const MCExpr *LHS = LowerConstant(CE->getOperand(0), AP);
-    const MCExpr *RHS = LowerConstant(CE->getOperand(1), AP);
-    switch (CE->getOpcode()) {
-    default:
-      llvm_unreachable("Unknown binary operator constant cast expr");
-    case Instruction::Add:
-      return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
-    case Instruction::Sub:
-      return MCBinaryExpr::CreateSub(LHS, RHS, Ctx);
-    case Instruction::Mul:
-      return MCBinaryExpr::CreateMul(LHS, RHS, Ctx);
-    case Instruction::SDiv:
-      return MCBinaryExpr::CreateDiv(LHS, RHS, Ctx);
-    case Instruction::SRem:
-      return MCBinaryExpr::CreateMod(LHS, RHS, Ctx);
-    case Instruction::Shl:
-      return MCBinaryExpr::CreateShl(LHS, RHS, Ctx);
-    case Instruction::And:
-      return MCBinaryExpr::CreateAnd(LHS, RHS, Ctx);
-    case Instruction::Or:
-      return MCBinaryExpr::CreateOr(LHS, RHS, Ctx);
-    case Instruction::Xor:
-      return MCBinaryExpr::CreateXor(LHS, RHS, Ctx);
-    }
-  }
-  }
-}
-
 void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
   if (!EmitLineNumbers)
     return;
@@ -316,7 +164,7 @@ void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
 void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   SmallString<128> Str;
   raw_svector_ostream OS(Str);
-  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
+  if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == NVPTX::CUDA)
     emitLineNumberAsDotLoc(*MI);
 
   MCInst Inst;
@@ -389,8 +237,6 @@ void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) {
 
 void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
   OutMI.setOpcode(MI->getOpcode());
-  const NVPTXSubtarget &ST = TM.getSubtarget<NVPTXSubtarget>();
-
   // Special: Do not mangle symbol operand of CALL_PROTOTYPE
   if (MI->getOpcode() == NVPTX::CALL_PROTOTYPE) {
     const MachineOperand &MO = MI->getOperand(0);
@@ -403,7 +249,7 @@ void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
     const MachineOperand &MO = MI->getOperand(i);
 
     MCOperand MCOp;
-    if (!ST.hasImageHandles()) {
+    if (!nvptxSubtarget->hasImageHandles()) {
       if (lowerImageHandleOperand(MI, i, MCOp)) {
         OutMI.addOperand(MCOp);
         continue;
@@ -500,12 +346,12 @@ MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
 }
 
 void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
-  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
-  const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
+  const DataLayout *TD = TM.getDataLayout();
+  const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
 
   Type *Ty = F->getReturnType();
 
-  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
 
   if (Ty->getTypeID() == Type::VoidTyID)
     return;
@@ -528,17 +374,15 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
     } else if (isa<PointerType>(Ty)) {
       O << ".param .b" << TLI->getPointerTy().getSizeInBits()
         << " func_retval0";
-    } else {
-      if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) {
-        unsigned totalsz = TD->getTypeAllocSize(Ty);
-        unsigned retAlignment = 0;
-        if (!llvm::getAlign(*F, 0, retAlignment))
-          retAlignment = TD->getABITypeAlignment(Ty);
-        O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
-          << "]";
-      } else
-        assert(false && "Unknown return type");
-    }
+    } else if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) {
+       unsigned totalsz = TD->getTypeAllocSize(Ty);
+       unsigned retAlignment = 0;
+       if (!llvm::getAlign(*F, 0, retAlignment))
+         retAlignment = TD->getABITypeAlignment(Ty);
+       O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
+         << "]";
+    } else
+      llvm_unreachable("Unknown return type");
   } else {
     SmallVector<EVT, 16> vtparts;
     ComputeValueVTs(*TLI, Ty, vtparts);
@@ -574,6 +418,42 @@ void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF,
   printReturnValStr(F, O);
 }
 
+// Return true if MBB is the header of a loop marked with
+// llvm.loop.unroll.disable.
+// TODO: consider "#pragma unroll 1" which is equivalent to "#pragma nounroll".
+bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll(
+    const MachineBasicBlock &MBB) const {
+  MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
+  // TODO: isLoopHeader() should take "const MachineBasicBlock *".
+  // We insert .pragma "nounroll" only to the loop header.
+  if (!LI.isLoopHeader(const_cast<MachineBasicBlock *>(&MBB)))
+    return false;
+
+  // llvm.loop.unroll.disable is marked on the back edges of a loop. Therefore,
+  // we iterate through each back edge of the loop with header MBB, and check
+  // whether its metadata contains llvm.loop.unroll.disable.
+  for (auto I = MBB.pred_begin(); I != MBB.pred_end(); ++I) {
+    const MachineBasicBlock *PMBB = *I;
+    if (LI.getLoopFor(PMBB) != LI.getLoopFor(&MBB)) {
+      // Edges from other loops to MBB are not back edges.
+      continue;
+    }
+    if (const BasicBlock *PBB = PMBB->getBasicBlock()) {
+      if (MDNode *LoopID = PBB->getTerminator()->getMetadata("llvm.loop")) {
+        if (GetUnrollMetadata(LoopID, "llvm.loop.unroll.disable"))
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
+  AsmPrinter::EmitBasicBlockStart(MBB);
+  if (isLoopHeaderOfNoUnroll(MBB))
+    OutStreamer.EmitRawText(StringRef("\t.pragma \"nounroll\";\n"));
+}
+
 void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
   SmallString<128> Str;
   raw_svector_ostream O(Str);
@@ -624,14 +504,13 @@ void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
 
 void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
   unsigned RegNo = MI->getOperand(0).getReg();
-  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo *TRI = nvptxSubtarget->getRegisterInfo();
   if (TRI->isVirtualRegister(RegNo)) {
     OutStreamer.AddComment(Twine("implicit-def: ") +
                            getVirtualRegisterName(RegNo));
   } else {
-    OutStreamer.AddComment(
-        Twine("implicit-def: ") +
-        TM.getSubtargetImpl()->getRegisterInfo()->getName(RegNo));
+    OutStreamer.AddComment(Twine("implicit-def: ") +
+                           nvptxSubtarget->getRegisterInfo()->getName(RegNo));
   }
   OutStreamer.AddBlankLine();
 }
@@ -793,11 +672,6 @@ static bool usedInOneFunc(const User *U, Function const *&oneFunc) {
       return false;
   }
 
-  if (const MDNode *md = dyn_cast<MDNode>(U))
-    if (md->hasName() && ((md->getName().str() == "llvm.dbg.gv") ||
-                          (md->getName().str() == "llvm.dbg.sp")))
-      return true;
-
   for (const User *UU : U->users())
     if (usedInOneFunc(UU, oneFunc) == false)
       return false;
@@ -938,6 +812,14 @@ void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
 }
 
 bool NVPTXAsmPrinter::doInitialization(Module &M) {
+  // Construct a default subtarget off of the TargetMachine defaults. The
+  // rest of NVPTX isn't friendly to change subtargets per function and
+  // so the default TargetMachine will have all of the options.
+  StringRef TT = TM.getTargetTriple();
+  StringRef CPU = TM.getTargetCPU();
+  StringRef FS = TM.getTargetFeatureString();
+  const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
+  const NVPTXSubtarget STI(TT, CPU, FS, NTM);
 
   SmallString<128> Str1;
   raw_svector_ostream OS1(Str1);
@@ -952,10 +834,10 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
   const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
       .Initialize(OutContext, TM);
 
-  Mang = new Mangler(TM.getSubtargetImpl()->getDataLayout());
+  Mang = new Mangler(TM.getDataLayout());
 
   // Emit header before any dwarf directives are emitted below.
-  emitHeader(M, OS1);
+  emitHeader(M, OS1, STI);
   OutStreamer.EmitRawText(OS1.str());
 
   // Already commented out
@@ -971,7 +853,8 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
     OutStreamer.AddBlankLine();
   }
 
-  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
+  // If we're not NVCL we're CUDA, go ahead and emit filenames.
+  if (Triple(TM.getTargetTriple()).getOS() != Triple::NVCL)
     recordAndEmitFilenames(M);
 
   GlobalsEmitted = false;
@@ -1012,22 +895,24 @@ void NVPTXAsmPrinter::emitGlobals(const Module &M) {
   OutStreamer.EmitRawText(OS2.str());
 }
 
-void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) {
+void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
+                                 const NVPTXSubtarget &STI) {
   O << "//\n";
   O << "// Generated by LLVM NVPTX Back-End\n";
   O << "//\n";
   O << "\n";
 
-  unsigned PTXVersion = nvptxSubtarget.getPTXVersion();
+  unsigned PTXVersion = STI.getPTXVersion();
   O << ".version " << (PTXVersion / 10) << "." << (PTXVersion % 10) << "\n";
 
   O << ".target ";
-  O << nvptxSubtarget.getTargetName();
+  O << STI.getTargetName();
 
-  if (nvptxSubtarget.getDrvInterface() == NVPTX::NVCL)
+  const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
+  if (NTM.getDrvInterface() == NVPTX::NVCL)
     O << ", texmode_independent";
-  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) {
-    if (!nvptxSubtarget.hasDouble())
+  else {
+    if (!STI.hasDouble())
       O << ", map_f64_to_f32";
   }
 
@@ -1037,7 +922,7 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) {
   O << "\n";
 
   O << ".address_size ";
-  if (nvptxSubtarget.is64Bit())
+  if (NTM.is64Bit())
     O << "64";
   else
     O << "32";
@@ -1047,7 +932,6 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) {
 }
 
 bool NVPTXAsmPrinter::doFinalization(Module &M) {
-
   // If we did not emit any functions, then the global declarations have not
   // yet been emitted.
   if (!GlobalsEmitted) {
@@ -1109,7 +993,7 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
 
 void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
                                            raw_ostream &O) {
-  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) {
+  if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == NVPTX::CUDA) {
     if (V->hasExternalLinkage()) {
       if (isa<GlobalVariable>(V)) {
         const GlobalVariable *GVar = cast<GlobalVariable>(V);
@@ -1153,7 +1037,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
       GVar->getName().startswith("nvvm."))
     return;
 
-  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = TM.getDataLayout();
 
   // GlobalVariables are always constant pointers themselves.
   const PointerType *PTy = GVar->getType();
@@ -1287,7 +1171,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
   else
     O << " .align " << GVar->getAlignment();
 
-  if (ETy->isSingleValueType()) {
+  if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) {
     O << " .";
     // Special case: ABI requires that we use .u8 for predicates
     if (ETy->isIntegerTy(1))
@@ -1341,7 +1225,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
           AggBuffer aggBuffer(ElementSize, O, *this);
           bufferAggregateConstant(Initializer, &aggBuffer);
           if (aggBuffer.numSymbols) {
-            if (nvptxSubtarget.is64Bit()) {
+            if (static_cast<const NVPTXTargetMachine &>(TM).is64Bit()) {
               O << " .u64 " << *getSymbol(GVar) << "[";
               O << ElementSize / 8;
             } else {
@@ -1439,7 +1323,7 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const {
   case Type::DoubleTyID:
     return "f64";
   case Type::PointerTyID:
-    if (nvptxSubtarget.is64Bit())
+    if (static_cast<const NVPTXTargetMachine &>(TM).is64Bit())
       if (useB4PTR)
         return "b64";
       else
@@ -1456,7 +1340,7 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const {
 void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
                                             raw_ostream &O) {
 
-  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = TM.getDataLayout();
 
   // GlobalVariables are always constant pointers themselves.
   const PointerType *PTy = GVar->getType();
@@ -1469,7 +1353,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
   else
     O << " .align " << GVar->getAlignment();
 
-  if (ETy->isSingleValueType()) {
+  if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) {
     O << " .";
     O << getPTXFundamentalTypeStr(ETy);
     O << " ";
@@ -1508,17 +1392,6 @@ static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) {
   if (ATy)
     return getOpenCLAlignment(TD, ATy->getElementType());
 
-  const VectorType *VTy = dyn_cast<VectorType>(Ty);
-  if (VTy) {
-    Type *ETy = VTy->getElementType();
-    unsigned int numE = VTy->getNumElements();
-    unsigned int alignE = TD->getPrefTypeAlignment(ETy);
-    if (numE == 3)
-      return 4 * alignE;
-    else
-      return numE * alignE;
-  }
-
   const StructType *STy = dyn_cast<StructType>(Ty);
   if (STy) {
     unsigned int alignStruct = 1;
@@ -1541,50 +1414,22 @@ static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) {
 
 void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
                                      int paramIndex, raw_ostream &O) {
-  if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
-      (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA))
-    O << *getSymbol(I->getParent()) << "_param_" << paramIndex;
-  else {
-    std::string argName = I->getName();
-    const char *p = argName.c_str();
-    while (*p) {
-      if (*p == '.')
-        O << "_";
-      else
-        O << *p;
-      p++;
-    }
-  }
+  O << *getSymbol(I->getParent()) << "_param_" << paramIndex;
 }
 
 void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) {
-  Function::const_arg_iterator I, E;
-  int i = 0;
-
-  if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
-      (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) {
-    O << *CurrentFnSym << "_param_" << paramIndex;
-    return;
-  }
-
-  for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, i++) {
-    if (i == paramIndex) {
-      printParamName(I, paramIndex, O);
-      return;
-    }
-  }
-  llvm_unreachable("paramIndex out of bound");
+  O << *CurrentFnSym << "_param_" << paramIndex;
 }
 
 void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
-  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = TM.getDataLayout();
   const AttributeSet &PAL = F->getAttributes();
-  const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
+  const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
   Function::const_arg_iterator I, E;
   unsigned paramIndex = 0;
   bool first = true;
   bool isKernelFunc = llvm::isKernelFunction(*F);
-  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
   MVT thePointerTy = TLI->getPointerTy();
 
   O << "(\n";
@@ -1603,21 +1448,21 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
         if (isImage(*I)) {
           std::string sname = I->getName();
           if (isImageWriteOnly(*I) || isImageReadWrite(*I)) {
-            if (nvptxSubtarget.hasImageHandles())
+            if (nvptxSubtarget->hasImageHandles())
               O << "\t.param .u64 .ptr .surfref ";
             else
               O << "\t.param .surfref ";
             O << *CurrentFnSym << "_param_" << paramIndex;
           }
           else { // Default image is read_only
-            if (nvptxSubtarget.hasImageHandles())
+            if (nvptxSubtarget->hasImageHandles())
               O << "\t.param .u64 .ptr .texref ";
             else
               O << "\t.param .texref ";
             O << *CurrentFnSym << "_param_" << paramIndex;
           }
         } else {
-          if (nvptxSubtarget.hasImageHandles())
+          if (nvptxSubtarget->hasImageHandles())
             O << "\t.param .u64 .ptr .samplerref ";
           else
             O << "\t.param .samplerref ";
@@ -1650,7 +1495,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
           // Special handling for pointer arguments to kernel
           O << "\t.param .u" << thePointerTy.getSizeInBits() << " ";
 
-          if (nvptxSubtarget.getDrvInterface() != NVPTX::CUDA) {
+          if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() !=
+              NVPTX::CUDA) {
             Type *ETy = PTy->getElementType();
             int addrSpace = PTy->getAddressSpace();
             switch (addrSpace) {
@@ -1779,7 +1625,7 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
   if (NumBytes) {
     O << "\t.local .align " << MFI->getMaxAlignment() << " .b8 \t" << DEPOTNAME
       << getFunctionNumber() << "[" << NumBytes << "];\n";
-    if (nvptxSubtarget.is64Bit()) {
+    if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
       O << "\t.reg .b64 \t%SP;\n";
       O << "\t.reg .b64 \t%SPL;\n";
     } else {
@@ -1900,7 +1746,7 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
       }
       return;
     } else {
-      O << *LowerConstant(CPV, *this);
+      O << *lowerConstant(CPV);
       return;
     }
   }
@@ -1910,7 +1756,7 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
 void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
                                    AggBuffer *aggBuffer) {
 
-  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = TM.getDataLayout();
 
   if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
     int s = TD->getTypeAllocSize(CPV->getType());
@@ -2034,7 +1880,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
 
 void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
                                               AggBuffer *aggBuffer) {
-  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = TM.getDataLayout();
   int Bytes;
 
   // Old constants
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 83fa5d3..7e6b5e8 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -39,13 +39,6 @@
 // A better approach is to clone the MCAsmStreamer to a MCPTXAsmStreamer
 // (subclass of MCStreamer).
 
-// This is defined in AsmPrinter.cpp.
-// Used to process the constant expressions in initializers.
-namespace nvptx {
-const llvm::MCExpr *
-LowerConstant(const llvm::Constant *CV, llvm::AsmPrinter &AP);
-}
-
 namespace llvm {
 
 class LineReader {
@@ -145,7 +138,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
         unsigned int nSym = 0;
         unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
         unsigned int nBytes = 4;
-        if (AP.nvptxSubtarget.is64Bit())
+        if (static_cast<const NVPTXTargetMachine &>(AP.TM).is64Bit())
           nBytes = 8;
         for (pos = 0; pos < size; pos += nBytes) {
           if (pos)
@@ -167,7 +160,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
                 O << *Name;
               }
             } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
-              O << *nvptx::LowerConstant(Cexpr, AP);
+              O << *AP.lowerConstant(Cexpr);
             } else
               llvm_unreachable("symbol type unknown");
             nSym++;
@@ -194,6 +187,7 @@ private:
   const Function *F;
   std::string CurrentFnName;
 
+  void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
   void EmitFunctionEntryLabel() override;
   void EmitFunctionBodyStart() override;
   void EmitFunctionBodyEnd() override;
@@ -218,7 +212,7 @@ private:
   void printParamName(Function::const_arg_iterator I, int paramIndex,
                       raw_ostream &O);
   void emitGlobals(const Module &M);
-  void emitHeader(Module &M, raw_ostream &O);
+  void emitHeader(Module &M, raw_ostream &O, const NVPTXSubtarget &STI);
   void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
   void emitVirtualRegister(unsigned int vr, raw_ostream &);
   void emitFunctionExternParamList(const MachineFunction &MF);
@@ -254,8 +248,10 @@ private:
   typedef DenseMap<unsigned, unsigned> VRegMap;
   typedef DenseMap<const TargetRegisterClass *, VRegMap> VRegRCMap;
   VRegRCMap VRegMapping;
-  // cache the subtarget here.
-  const NVPTXSubtarget &nvptxSubtarget;
+
+  // Cache the subtarget here.
+  const NVPTXSubtarget *nvptxSubtarget;
+
   // Build the map between type name and ID based on module's type
   // symbol table.
   std::map<const Type *, std::string> TypeNameMap;
@@ -288,6 +284,8 @@ private:
                                MCOperand &MCOp);
   void lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp);
 
+  bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const;
+
   LineReader *reader;
   LineReader *getReader(std::string);
 
@@ -305,12 +303,12 @@ private:
   bool EmitGeneric;
 
 public:
-  NVPTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : AsmPrinter(TM, Streamer),
-        nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+  NVPTXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)),
+        EmitGeneric(static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() ==
+                    NVPTX::CUDA) {
     CurrentBankselLabelInBasicBlock = "";
     reader = nullptr;
-    EmitGeneric = (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA);
   }
 
   ~NVPTXAsmPrinter() {
@@ -318,6 +316,15 @@ public:
       delete reader;
   }
 
+  bool runOnMachineFunction(MachineFunction &F) override {
+    nvptxSubtarget = &F.getSubtarget<NVPTXSubtarget>();
+    return AsmPrinter::runOnMachineFunction(F);
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineLoopInfo>();
+    AsmPrinter::getAnalysisUsage(AU);
+  }
+
   bool ignoreLoc(const MachineInstr &);
 
   std::string getVirtualRegisterName(unsigned) const;
diff --git a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
index 962b123..7d4be8e 100644
--- a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
+++ b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -19,8 +19,8 @@
 
 #include "NVPTX.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/raw_ostream.h"
 #include <string>
 
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 314df38..34d3a66 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -26,9 +26,8 @@
 
 using namespace llvm;
 
-NVPTXFrameLowering::NVPTXFrameLowering(NVPTXSubtarget &STI)
-    : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0),
-      is64bit(STI.is64Bit()) {}
+NVPTXFrameLowering::NVPTXFrameLowering()
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0) {}
 
 bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
 
@@ -45,7 +44,7 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
 
     // mov %SPL, %depot;
     // cvta.local %SP, %SPL;
-    if (is64bit) {
+    if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
       unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass);
       MachineInstr *MI =
           BuildMI(MBB, MBBI, dl, MF.getSubtarget().getInstrInfo()->get(
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 0846b78..d1e0a5c 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -19,18 +19,16 @@
 namespace llvm {
 class NVPTXSubtarget;
 class NVPTXFrameLowering : public TargetFrameLowering {
-  bool is64bit;
-
 public:
-  explicit NVPTXFrameLowering(NVPTXSubtarget &STI);
+  explicit NVPTXFrameLowering();
 
   bool hasFP(const MachineFunction &MF) const override;
   void emitPrologue(MachineFunction &MF) const override;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
-  void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                  MachineBasicBlock &MBB,
-                                  MachineBasicBlock::iterator I) const override;
+  void
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 58fa95b..86d134b 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -22,10 +22,11 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/ValueMap.h"
-#include "llvm/PassManager.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 using namespace llvm;
 
@@ -54,8 +55,7 @@ private:
                                                 IRBuilder<> &Builder);
   Value *remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
                            IRBuilder<> &Builder);
-  void remapNamedMDNode(Module *M, NamedMDNode *N);
-  MDNode *remapMDNode(Module *M, MDNode *N);
+  void remapNamedMDNode(ValueToValueMapTy &VM, NamedMDNode *N);
 
   typedef ValueMap<GlobalVariable *, GlobalVariable *> GVMapTy;
   typedef ValueMap<Constant *, Value *> ConstantToValueMapTy;
@@ -125,12 +125,17 @@ bool GenericToNVVM::runOnModule(Module &M) {
     ConstantToValueMap.clear();
   }
 
+  // Copy GVMap over to a standard value map.
+  ValueToValueMapTy VM;
+  for (auto I = GVMap.begin(), E = GVMap.end(); I != E; ++I)
+    VM[I->first] = I->second;
+
   // Walk through the metadata section and update the debug information
   // associated with the global variables in the default address space.
   for (Module::named_metadata_iterator I = M.named_metadata_begin(),
                                        E = M.named_metadata_end();
        I != E; I++) {
-    remapNamedMDNode(&M, I);
+    remapNamedMDNode(VM, I);
   }
 
   // Walk through the global variable  initializers, and replace any use of
@@ -362,7 +367,7 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
   }
 }
 
-void GenericToNVVM::remapNamedMDNode(Module *M, NamedMDNode *N) {
+void GenericToNVVM::remapNamedMDNode(ValueToValueMapTy &VM, NamedMDNode *N) {
 
   bool OperandChanged = false;
   SmallVector<MDNode *, 16> NewOperands;
@@ -372,7 +377,7 @@ void GenericToNVVM::remapNamedMDNode(Module *M, NamedMDNode *N) {
   // converted to another value.
   for (unsigned i = 0; i < NumOperands; ++i) {
     MDNode *Operand = N->getOperand(i);
-    MDNode *NewOperand = remapMDNode(M, Operand);
+    MDNode *NewOperand = MapMetadata(Operand, VM);
     OperandChanged |= Operand != NewOperand;
     NewOperands.push_back(NewOperand);
   }
@@ -390,47 +395,3 @@ void GenericToNVVM::remapNamedMDNode(Module *M, NamedMDNode *N) {
     N->addOperand(*I);
   }
 }
-
-MDNode *GenericToNVVM::remapMDNode(Module *M, MDNode *N) {
-
-  bool OperandChanged = false;
-  SmallVector<Value *, 8> NewOperands;
-  unsigned NumOperands = N->getNumOperands();
-
-  // Check if any operand is or contains a global variable in  GVMap, and thus
-  // converted to another value.
-  for (unsigned i = 0; i < NumOperands; ++i) {
-    Value *Operand = N->getOperand(i);
-    Value *NewOperand = Operand;
-    if (Operand) {
-      if (isa<GlobalVariable>(Operand)) {
-        GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(Operand));
-        if (I != GVMap.end()) {
-          NewOperand = I->second;
-          if (++i < NumOperands) {
-            NewOperands.push_back(NewOperand);
-            // Address space of the global variable follows the global variable
-            // in the global variable debug info (see createGlobalVariable in
-            // lib/Analysis/DIBuilder.cpp).
-            NewOperand =
-                ConstantInt::get(Type::getInt32Ty(M->getContext()),
-                                 I->second->getType()->getAddressSpace());
-          }
-        }
-      } else if (isa<MDNode>(Operand)) {
-        NewOperand = remapMDNode(M, cast<MDNode>(Operand));
-      }
-    }
-    OperandChanged |= Operand != NewOperand;
-    NewOperands.push_back(NewOperand);
-  }
-
-  // If none of the operands has been modified, return N as it is.
-  if (!OperandChanged) {
-    return N;
-  }
-
-  // If any of the operands has been modified, create a new MDNode with the new
-  // operands.
-  return MDNode::get(M->getContext(), makeArrayRef(NewOperands));
-}
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index cd0422d..e01c780 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -50,11 +50,15 @@ FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
 
 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                      CodeGenOpt::Level OptLevel)
-    : SelectionDAGISel(tm, OptLevel),
-      Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
+    : SelectionDAGISel(tm, OptLevel), TM(tm) {
   doMulWide = (OptLevel > 0);
 }
 
+bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+    Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
+    return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
 int NVPTXDAGToDAGISel::getDivF32Level() const {
   if (UsePrecDivF32.getNumOccurrences() > 0) {
     // If nvptx-prec-div32=N is used on the command-line, always honor it
@@ -89,16 +93,14 @@ bool NVPTXDAGToDAGISel::useF32FTZ() const {
     const Function *F = MF->getFunction();
     // Otherwise, check for an nvptx-f32ftz attribute on the function
     if (F->hasFnAttribute("nvptx-f32ftz"))
-      return (F->getAttributes().getAttribute(AttributeSet::FunctionIndex,
-                                              "nvptx-f32ftz")
-                                              .getValueAsString() == "true");
+      return F->getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
     else
       return false;
   }
 }
 
 bool NVPTXDAGToDAGISel::allowFMA() const {
-  const NVPTXTargetLowering *TL = Subtarget.getTargetLowering();
+  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
   return TL->allowFMA(*MF, OptLevel);
 }
 
@@ -525,8 +527,7 @@ SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) {
   }
 }
 
-static unsigned int getCodeAddrSpace(MemSDNode *N,
-                                     const NVPTXSubtarget &Subtarget) {
+static unsigned int getCodeAddrSpace(MemSDNode *N) {
   const Value *Src = N->getMemOperand()->getValue();
 
   if (!Src)
@@ -579,20 +580,16 @@ SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
     switch (SrcAddrSpace) {
     default: report_fatal_error("Bad address space in addrspacecast");
     case ADDRESS_SPACE_GLOBAL:
-      Opc = Subtarget.is64Bit() ? NVPTX::cvta_global_yes_64
-                                : NVPTX::cvta_global_yes;
+      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
       break;
     case ADDRESS_SPACE_SHARED:
-      Opc = Subtarget.is64Bit() ? NVPTX::cvta_shared_yes_64
-                                : NVPTX::cvta_shared_yes;
+      Opc = TM.is64Bit() ? NVPTX::cvta_shared_yes_64 : NVPTX::cvta_shared_yes;
       break;
     case ADDRESS_SPACE_CONST:
-      Opc = Subtarget.is64Bit() ? NVPTX::cvta_const_yes_64
-                                : NVPTX::cvta_const_yes;
+      Opc = TM.is64Bit() ? NVPTX::cvta_const_yes_64 : NVPTX::cvta_const_yes;
       break;
     case ADDRESS_SPACE_LOCAL:
-      Opc = Subtarget.is64Bit() ? NVPTX::cvta_local_yes_64
-                                : NVPTX::cvta_local_yes;
+      Opc = TM.is64Bit() ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes;
       break;
     }
     return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
@@ -604,20 +601,20 @@ SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
     switch (DstAddrSpace) {
     default: report_fatal_error("Bad address space in addrspacecast");
     case ADDRESS_SPACE_GLOBAL:
-      Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_global_yes_64
-                                : NVPTX::cvta_to_global_yes;
+      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
+                         : NVPTX::cvta_to_global_yes;
       break;
     case ADDRESS_SPACE_SHARED:
-      Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_shared_yes_64
-                                : NVPTX::cvta_to_shared_yes;
+      Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_yes_64
+                         : NVPTX::cvta_to_shared_yes;
       break;
     case ADDRESS_SPACE_CONST:
-      Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_const_yes_64
-                                : NVPTX::cvta_to_const_yes;
+      Opc =
+          TM.is64Bit() ? NVPTX::cvta_to_const_yes_64 : NVPTX::cvta_to_const_yes;
       break;
     case ADDRESS_SPACE_LOCAL:
-      Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_local_yes_64
-                                : NVPTX::cvta_to_local_yes;
+      Opc =
+          TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
       break;
     }
     return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
@@ -638,7 +635,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
     return nullptr;
 
   // Address Space Setting
-  unsigned int codeAddrSpace = getCodeAddrSpace(LD, Subtarget);
+  unsigned int codeAddrSpace = getCodeAddrSpace(LD);
 
   // Volatile Setting
   // - .volatile is only availalble for .global and .shared
@@ -713,9 +710,8 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
                       getI32Imm(vecType), getI32Imm(fromType),
                       getI32Imm(fromTypeWidth), Addr, Chain };
     NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
-  } else if (Subtarget.is64Bit()
-                 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
-                 : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
+  } else if (TM.is64Bit() ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
+                          : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
     switch (TargetVT) {
     case MVT::i8:
       Opcode = NVPTX::LD_i8_asi;
@@ -742,10 +738,9 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
                       getI32Imm(vecType), getI32Imm(fromType),
                       getI32Imm(fromTypeWidth), Base, Offset, Chain };
     NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
-  } else if (Subtarget.is64Bit()
-                 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
-                 : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
-    if (Subtarget.is64Bit()) {
+  } else if (TM.is64Bit() ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
+                          : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
+    if (TM.is64Bit()) {
       switch (TargetVT) {
       case MVT::i8:
         Opcode = NVPTX::LD_i8_ari_64;
@@ -797,7 +792,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
                       getI32Imm(fromTypeWidth), Base, Offset, Chain };
     NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
   } else {
-    if (Subtarget.is64Bit()) {
+    if (TM.is64Bit()) {
       switch (TargetVT) {
       case MVT::i8:
         Opcode = NVPTX::LD_i8_areg_64;
@@ -874,7 +869,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
     return nullptr;
 
   // Address Space Setting
-  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget);
+  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
 
   // Volatile Setting
   // - .volatile is only availalble for .global and .shared
@@ -974,9 +969,8 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
                       getI32Imm(VecType), getI32Imm(FromType),
                       getI32Imm(FromTypeWidth), Addr, Chain };
     LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
-  } else if (Subtarget.is64Bit()
-                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
-                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
+  } else if (TM.is64Bit() ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
+                          : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
     switch (N->getOpcode()) {
     default:
       return nullptr;
@@ -1028,10 +1022,9 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
                       getI32Imm(VecType), getI32Imm(FromType),
                       getI32Imm(FromTypeWidth), Base, Offset, Chain };
     LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
-  } else if (Subtarget.is64Bit()
-                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
-                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
-    if (Subtarget.is64Bit()) {
+  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+    if (TM.is64Bit()) {
       switch (N->getOpcode()) {
       default:
         return nullptr;
@@ -1133,7 +1126,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
 
     LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
   } else {
-    if (Subtarget.is64Bit()) {
+    if (TM.is64Bit()) {
       switch (N->getOpcode()) {
       default:
         return nullptr;
@@ -1425,10 +1418,9 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
 
     SDValue Ops[] = { Addr, Chain };
     LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
-  } else if (Subtarget.is64Bit()
-                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
-                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
-    if (Subtarget.is64Bit()) {
+  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+    if (TM.is64Bit()) {
       switch (N->getOpcode()) {
       default:
         return nullptr;
@@ -1710,7 +1702,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
 
     LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
   } else {
-    if (Subtarget.is64Bit()) {
+    if (TM.is64Bit()) {
       switch (N->getOpcode()) {
       default:
         return nullptr;
@@ -2013,7 +2005,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
     return nullptr;
 
   // Address Space Setting
-  unsigned int codeAddrSpace = getCodeAddrSpace(ST, Subtarget);
+  unsigned int codeAddrSpace = getCodeAddrSpace(ST);
 
   // Volatile Setting
   // - .volatile is only availalble for .global and .shared
@@ -2083,9 +2075,8 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
                       getI32Imm(vecType), getI32Imm(toType),
                       getI32Imm(toTypeWidth), Addr, Chain };
     NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
-  } else if (Subtarget.is64Bit()
-                 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
-                 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+  } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+                          : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
     switch (SourceVT) {
     case MVT::i8:
       Opcode = NVPTX::ST_i8_asi;
@@ -2112,10 +2103,9 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
                       getI32Imm(vecType), getI32Imm(toType),
                       getI32Imm(toTypeWidth), Base, Offset, Chain };
     NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
-  } else if (Subtarget.is64Bit()
-                 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
-                 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
-    if (Subtarget.is64Bit()) {
+  } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+                          : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+    if (TM.is64Bit()) {
       switch (SourceVT) {
       case MVT::i8:
         Opcode = NVPTX::ST_i8_ari_64;
@@ -2167,7 +2157,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
                       getI32Imm(toTypeWidth), Base, Offset, Chain };
     NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
   } else {
-    if (Subtarget.is64Bit()) {
+    if (TM.is64Bit()) {
       switch (SourceVT) {
       case MVT::i8:
         Opcode = NVPTX::ST_i8_areg_64;
@@ -2241,7 +2231,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
   EVT StoreVT = MemSD->getMemoryVT();
 
   // Address Space Setting
-  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget);
+  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
 
   if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
     report_fatal_error("Cannot store to pointer that points to constant "
@@ -2344,9 +2334,8 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
       break;
     }
     StOps.push_back(Addr);
-  } else if (Subtarget.is64Bit()
-                 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
-                 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+  } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+                          : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
     switch (N->getOpcode()) {
     default:
       return nullptr;
@@ -2395,10 +2384,9 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
     }
     StOps.push_back(Base);
     StOps.push_back(Offset);
-  } else if (Subtarget.is64Bit()
-                 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
-                 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
-    if (Subtarget.is64Bit()) {
+  } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+                          : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+    if (TM.is64Bit()) {
       switch (N->getOpcode()) {
       default:
         return nullptr;
@@ -2496,7 +2484,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
     StOps.push_back(Base);
     StOps.push_back(Offset);
   } else {
-    if (Subtarget.is64Bit()) {
+    if (TM.is64Bit()) {
       switch (N->getOpcode()) {
       default:
         return nullptr;
@@ -4772,7 +4760,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
     }
 
     // How many bits are in our mask?
-    uint64_t NumBits = CountTrailingOnes_64(MaskVal);
+    uint64_t NumBits = countTrailingOnes(MaskVal);
     Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
 
     if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
@@ -4836,10 +4824,10 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
         NumZeros = 0;
         // The number of bits in the result bitfield will be the number of
         // trailing ones (the AND) minus the number of bits we shift off
-        NumBits = CountTrailingOnes_64(MaskVal) - ShiftAmt;
+        NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
       } else if (isShiftedMask_64(MaskVal)) {
         NumZeros = countTrailingZeros(MaskVal);
-        unsigned NumOnes = CountTrailingOnes_64(MaskVal >> NumZeros);
+        unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
         // The number of bits in the result bitfield will be the number of
         // trailing zeros plus the number of set bits in the mask minus the
         // number of bits we shift off
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 69afcd7..ca432b5 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -26,6 +26,7 @@ using namespace llvm;
 namespace {
 
 class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
+  const NVPTXTargetMachine &TM;
 
   // If true, generate mul.wide from sext and mul
   bool doMulWide;
@@ -43,8 +44,8 @@ public:
   const char *getPassName() const override {
     return "NVPTX DAG->DAG Pattern Instruction Selection";
   }
-
-  const NVPTXSubtarget &Subtarget;
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  const NVPTXSubtarget *Subtarget;
 
   bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                     char ConstraintCode,
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 0b0b536..1dc81f7 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -106,9 +106,9 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty,
 }
 
 // NVPTXTargetLowering Constructor.
-NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM)
-    : TargetLowering(TM), nvTM(&TM),
-      nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
+                                         const NVPTXSubtarget &STI)
+    : TargetLowering(TM), nvTM(&TM), STI(STI) {
 
   // always lower memset, memcpy, and memmove intrinsics to load/store
   // instructions, rather
@@ -167,14 +167,14 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM)
   setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
   setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
 
-  if (nvptxSubtarget.hasROT64()) {
+  if (STI.hasROT64()) {
     setOperationAction(ISD::ROTL, MVT::i64, Legal);
     setOperationAction(ISD::ROTR, MVT::i64, Legal);
   } else {
     setOperationAction(ISD::ROTL, MVT::i64, Expand);
     setOperationAction(ISD::ROTR, MVT::i64, Expand);
   }
-  if (nvptxSubtarget.hasROT32()) {
+  if (STI.hasROT32()) {
     setOperationAction(ISD::ROTL, MVT::i32, Legal);
     setOperationAction(ISD::ROTR, MVT::i32, Legal);
   } else {
@@ -203,8 +203,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM)
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
 
   // Turn FP extload into load/fextend
-  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
   // Turn FP truncstore into trunc + store.
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
@@ -214,12 +215,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM)
   setOperationAction(ISD::LOAD, MVT::i1, Custom);
   setOperationAction(ISD::STORE, MVT::i1, Custom);
 
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
-  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
-  setTruncStoreAction(MVT::i32, MVT::i1, Expand);
-  setTruncStoreAction(MVT::i16, MVT::i1, Expand);
-  setTruncStoreAction(MVT::i8, MVT::i1, Expand);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setTruncStoreAction(VT, MVT::i1, Expand);
+  }
 
   // This is legal in NVPTX
   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
@@ -232,9 +232,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM)
   setOperationAction(ISD::ADDE, MVT::i64, Expand);
 
   // Register custom handling for vector loads/stores
-  for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE;
-       ++i) {
-    MVT VT = (MVT::SimpleValueType) i;
+  for (MVT VT : MVT::vector_valuetypes()) {
     if (IsPTXVectorType(VT)) {
       setOperationAction(ISD::LOAD, VT, Custom);
       setOperationAction(ISD::STORE, VT, Custom);
@@ -261,6 +259,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM)
   setOperationAction(ISD::CTPOP, MVT::i32, Legal);
   setOperationAction(ISD::CTPOP, MVT::i64, Legal);
 
+  // PTX does not directly support SELP of i1, so promote to i32 first
+  setOperationAction(ISD::SELECT, MVT::i1, Custom);
+
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine(ISD::ADD);
   setTargetDAGCombine(ISD::AND);
@@ -270,7 +271,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM)
 
   // Now deduce the information based on the above mentioned
   // actions
-  computeRegisterProperties();
+  computeRegisterProperties(STI.getRegisterInfo());
 }
 
 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -878,7 +879,7 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
                                   unsigned retAlignment,
                                   const ImmutableCallSite *CS) const {
 
-  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  bool isABI = (STI.getSmVersion() >= 20);
   assert(isABI && "Non-ABI compilation is not supported");
   if (!isABI)
     return "";
@@ -905,16 +906,14 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
       O << ".param .b" << size << " _";
     } else if (isa<PointerType>(retTy)) {
       O << ".param .b" << getPointerTy().getSizeInBits() << " _";
+    } else if ((retTy->getTypeID() == Type::StructTyID) ||
+               isa<VectorType>(retTy)) {
+      O << ".param .align "
+        << retAlignment
+        << " .b8 _["
+        << getDataLayout()->getTypeAllocSize(retTy) << "]";
     } else {
-      if((retTy->getTypeID() == Type::StructTyID) ||
-         isa<VectorType>(retTy)) {
-        O << ".param .align "
-          << retAlignment
-          << " .b8 _["
-          << getDataLayout()->getTypeAllocSize(retTy) << "]";
-      } else {
-        assert(false && "Unknown return type");
-      }
+      llvm_unreachable("Unknown return type");
     }
     O << ") ";
   }
@@ -1045,7 +1044,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   Type *retTy = CLI.RetTy;
   ImmutableCallSite *CS = CLI.CS;
 
-  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  bool isABI = (STI.getSmVersion() >= 20);
   assert(isABI && "Non-ABI compilation is not supported");
   if (!isABI)
     return Chain;
@@ -1456,8 +1455,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       EVT ObjectVT = getValueType(retTy);
       unsigned NumElts = ObjectVT.getVectorNumElements();
       EVT EltVT = ObjectVT.getVectorElementType();
-      assert(nvTM->getSubtargetImpl()->getTargetLowering()->getNumRegisters(
-                 F->getContext(), ObjectVT) == NumElts &&
+      assert(STI.getTargetLowering()->getNumRegisters(F->getContext(),
+                                                      ObjectVT) == NumElts &&
              "Vector was not scalarized");
       unsigned sz = EltVT.getSizeInBits();
       bool needTruncate = sz < 8 ? true : false;
@@ -1475,11 +1474,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
           LoadRetVTs.push_back(EltVT);
         LoadRetVTs.push_back(MVT::Other);
         LoadRetVTs.push_back(MVT::Glue);
-        SmallVector<SDValue, 4> LoadRetOps;
-        LoadRetOps.push_back(Chain);
-        LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
-        LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
-        LoadRetOps.push_back(InFlag);
+        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, MVT::i32),
+                                DAG.getConstant(0, MVT::i32), InFlag};
         SDValue retval = DAG.getMemIntrinsicNode(
             NVPTXISD::LoadParam, dl,
             DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
@@ -1505,11 +1501,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         }
         LoadRetVTs.push_back(MVT::Other);
         LoadRetVTs.push_back(MVT::Glue);
-        SmallVector<SDValue, 4> LoadRetOps;
-        LoadRetOps.push_back(Chain);
-        LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
-        LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
-        LoadRetOps.push_back(InFlag);
+        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, MVT::i32),
+                                DAG.getConstant(0, MVT::i32), InFlag};
         SDValue retval = DAG.getMemIntrinsicNode(
             NVPTXISD::LoadParamV2, dl,
             DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
@@ -1551,11 +1544,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
           }
           LoadRetVTs.push_back(MVT::Other);
           LoadRetVTs.push_back(MVT::Glue);
-          SmallVector<SDValue, 4> LoadRetOps;
-          LoadRetOps.push_back(Chain);
-          LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
-          LoadRetOps.push_back(DAG.getConstant(Ofst, MVT::i32));
-          LoadRetOps.push_back(InFlag);
+          SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, MVT::i32),
+                                  DAG.getConstant(Ofst, MVT::i32), InFlag};
           SDValue retval = DAG.getMemIntrinsicNode(
               Opc, dl, DAG.getVTList(LoadRetVTs),
               LoadRetOps, EltVT, MachinePointerInfo());
@@ -1609,11 +1599,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         LoadRetVTs.push_back(MVT::Other);
         LoadRetVTs.push_back(MVT::Glue);
 
-        SmallVector<SDValue, 4> LoadRetOps;
-        LoadRetOps.push_back(Chain);
-        LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
-        LoadRetOps.push_back(DAG.getConstant(Offsets[i], MVT::i32));
-        LoadRetOps.push_back(InFlag);
+        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, MVT::i32),
+                                DAG.getConstant(Offsets[i], MVT::i32), InFlag};
         SDValue retval = DAG.getMemIntrinsicNode(
             NVPTXISD::LoadParam, dl,
             DAG.getVTList(LoadRetVTs), LoadRetOps,
@@ -1679,7 +1666,7 @@ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
   SDValue ShAmt  = Op.getOperand(2);
   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
 
-  if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+  if (VTBits == 32 && STI.getSmVersion() >= 35) {
 
     // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
     // {dHi, dLo} = {aHi, aLo} >> Amt
@@ -1739,7 +1726,7 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt  = Op.getOperand(2);
 
-  if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+  if (VTBits == 32 && STI.getSmVersion() >= 35) {
 
     // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
     // {dHi, dLo} = {aHi, aLo} << Amt
@@ -1807,11 +1794,29 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SRA_PARTS:
   case ISD::SRL_PARTS:
     return LowerShiftRightParts(Op, DAG);
+  case ISD::SELECT:
+    return LowerSelect(Op, DAG);
   default:
     llvm_unreachable("Custom lowering not defined for operation");
   }
 }
 
+SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Op0 = Op->getOperand(0);
+  SDValue Op1 = Op->getOperand(1);
+  SDValue Op2 = Op->getOperand(2);
+  SDLoc DL(Op.getNode());
+
+  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
+
+  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
+  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
+
+  return Trunc;
+}
+
 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   if (Op.getValueType() == MVT::i1)
     return LowerLOADi1(Op, DAG);
@@ -2033,13 +2038,13 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
 
   const Function *F = MF.getFunction();
   const AttributeSet &PAL = F->getAttributes();
-  const TargetLowering *TLI = DAG.getSubtarget().getTargetLowering();
+  const TargetLowering *TLI = STI.getTargetLowering();
 
   SDValue Root = DAG.getRoot();
   std::vector<SDValue> OutChains;
 
   bool isKernel = llvm::isKernelFunction(*F);
-  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  bool isABI = (STI.getSmVersion() >= 20);
   assert(isABI && "Non-ABI compilation is not supported");
   if (!isABI)
     return Chain;
@@ -2337,7 +2342,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   Type *RetTy = F->getReturnType();
   const DataLayout *TD = getDataLayout();
 
-  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  bool isABI = (STI.getSmVersion() >= 20);
   assert(isABI && "Non-ABI compilation is not supported");
   if (!isABI)
     return Chain;
@@ -3757,7 +3762,8 @@ NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
 }
 
 std::pair<unsigned, const TargetRegisterClass *>
-NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                  const std::string &Constraint,
                                                   MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
@@ -3778,7 +3784,7 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
       return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
     }
   }
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 
 /// getFunctionAlignment - Return the Log2 alignment of this function.
@@ -4200,7 +4206,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     default: break;
     case ISD::ADD:
     case ISD::FADD:
-      return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel);
+      return PerformADDCombine(N, DCI, STI, OptLevel);
     case ISD::MUL:
       return PerformMULCombine(N, DCI, OptLevel);
     case ISD::SHL:
@@ -4285,11 +4291,8 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   }
   }
 
-  SmallVector<SDValue, 8> OtherOps;
-
   // Copy regular operands
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-    OtherOps.push_back(N->getOperand(i));
+  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
 
   // The select routine does not have access to the LoadSDNode instance, so
   // pass along the extension information
@@ -4402,8 +4405,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
       OtherOps.push_back(Chain); // Chain
                                  // Skip operand 1 (intrinsic ID)
       // Others
-      for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i)
-        OtherOps.push_back(N->getOperand(i));
+      OtherOps.append(N->op_begin() + 2, N->op_end());
 
       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
 
@@ -4434,9 +4436,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
              "Custom handling of non-i8 ldu/ldg?");
 
       // Just copy all operands as-is
-      SmallVector<SDValue, 4> Ops;
-      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-        Ops.push_back(N->getOperand(i));
+      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
 
       // Force output to i16
       SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index d66d81a..1b4da2c 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -436,7 +436,8 @@ class NVPTXSubtarget;
 //===--------------------------------------------------------------------===//
 class NVPTXTargetLowering : public TargetLowering {
 public:
-  explicit NVPTXTargetLowering(const NVPTXTargetMachine &TM);
+  explicit NVPTXTargetLowering(const NVPTXTargetMachine &TM,
+                               const NVPTXSubtarget &STI);
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
   SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -469,7 +470,8 @@ public:
   ConstraintType
   getConstraintType(const std::string &Constraint) const override;
   std::pair<unsigned, const TargetRegisterClass *>
-  getRegForInlineAsmConstraint(const std::string &Constraint,
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               const std::string &Constraint,
                                MVT VT) const override;
 
   SDValue LowerFormalArguments(
@@ -507,8 +509,10 @@ public:
 
   bool isFMAFasterThanFMulAndFAdd(EVT) const override { return true; }
 
+  bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
+
 private:
-  const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
+  const NVPTXSubtarget &STI; // cache the subtarget here
 
   SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx,
                      EVT = MVT::i32) const;
@@ -527,6 +531,8 @@ private:
   SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerSelect(SDValue Op, SelectionDAG &DAG) const;
+
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index a98fb37..aa36b6b 100644
--- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -16,11 +16,11 @@
 
 #include "NVPTX.h"
 #include "NVPTXUtilities.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Analysis/ConstantFolding.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index b5b4fbe..dabc3be 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -14,11 +14,11 @@
 #include "NVPTX.h"
 #include "NVPTXInstrInfo.h"
 #include "NVPTXTargetMachine.h"
-#include "llvm/IR/Function.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
 
 using namespace llvm;
 
@@ -28,9 +28,7 @@ using namespace llvm;
 // Pin the vtable to this file.
 void NVPTXInstrInfo::anchor() {}
 
-// FIXME: Add the subtarget support on this constructor.
-NVPTXInstrInfo::NVPTXInstrInfo(NVPTXSubtarget &STI)
-    : NVPTXGenInstrInfo(), RegInfo(STI) {}
+NVPTXInstrInfo::NVPTXInstrInfo() : NVPTXGenInstrInfo(), RegInfo() {}
 
 void NVPTXInstrInfo::copyPhysReg(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index 6de7536..9b5d491 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -27,7 +27,7 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo {
   const NVPTXRegisterInfo RegInfo;
   virtual void anchor();
 public:
-  explicit NVPTXInstrInfo(NVPTXSubtarget &STI);
+  explicit NVPTXInstrInfo();
 
   const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
 
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 9900b8c..68f0d9f 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -117,24 +117,24 @@ def F32ConstOne : Operand<f32>, PatLeaf<(f32 fpimm)>, SDNodeXForm<fpimm, [{
 //===----------------------------------------------------------------------===//
 
 
-def hasAtomRedG32 : Predicate<"Subtarget.hasAtomRedG32()">;
-def hasAtomRedS32 : Predicate<"Subtarget.hasAtomRedS32()">;
-def hasAtomRedGen32 : Predicate<"Subtarget.hasAtomRedGen32()">;
+def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">;
+def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">;
+def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">;
 def useAtomRedG32forGen32 :
-  Predicate<"!Subtarget.hasAtomRedGen32() && Subtarget.hasAtomRedG32()">;
-def hasBrkPt : Predicate<"Subtarget.hasBrkPt()">;
-def hasAtomRedG64 : Predicate<"Subtarget.hasAtomRedG64()">;
-def hasAtomRedS64 : Predicate<"Subtarget.hasAtomRedS64()">;
-def hasAtomRedGen64 : Predicate<"Subtarget.hasAtomRedGen64()">;
+  Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">;
+def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">;
+def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">;
+def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">;
+def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
 def useAtomRedG64forGen64 :
-  Predicate<"!Subtarget.hasAtomRedGen64() && Subtarget.hasAtomRedG64()">;
-def hasAtomAddF32 : Predicate<"Subtarget.hasAtomAddF32()">;
-def hasVote : Predicate<"Subtarget.hasVote()">;
-def hasDouble : Predicate<"Subtarget.hasDouble()">;
-def reqPTX20 : Predicate<"Subtarget.reqPTX20()">;
-def hasLDG : Predicate<"Subtarget.hasLDG()">;
-def hasLDU : Predicate<"Subtarget.hasLDU()">;
-def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
+  Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
+def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
+def hasVote : Predicate<"Subtarget->hasVote()">;
+def hasDouble : Predicate<"Subtarget->hasDouble()">;
+def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
+def hasLDG : Predicate<"Subtarget->hasLDG()">;
+def hasLDU : Predicate<"Subtarget->hasLDU()">;
+def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">;
 
 def doF32FTZ : Predicate<"useF32FTZ()">;
 def doNoF32FTZ : Predicate<"!useF32FTZ()">;
@@ -150,12 +150,12 @@ def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
 def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
 def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
 
-def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
-def noHWROT32 : Predicate<"!Subtarget.hasHWROT32()">;
+def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
+def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
 
 def true : Predicate<"1">;
 
-def hasPTX31 : Predicate<"Subtarget.getPTXVersion() >= 31">;
+def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
 
 
 //===----------------------------------------------------------------------===//
@@ -296,7 +296,7 @@ multiclass F2<string OpcStr, SDNode OpNode> {
 // General Type Conversion
 //-----------------------------------
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 // Generate a cvt to the given type from all possible types.
 // Each instance takes a CvtMode immediate that defines the conversion mode to
 // use.  It can be CvtNONE to omit a conversion mode.
@@ -1356,11 +1356,6 @@ defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
 defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
 defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
 
-// Special select for predicate operands
-def : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)),
-              (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a),
-              (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>;
-
 //
 // Funnnel shift in clamp mode
 //
@@ -1659,12 +1654,12 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
             (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
 }
 
-defm FSetGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
-defm FSetLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
-defm FSetGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
-defm FSetLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
-defm FSetEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
-defm FSetNE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
+defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
+defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
+defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
+defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
+defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
+defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
 
 defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
 defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
@@ -1673,6 +1668,13 @@ defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
 defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
 defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
 
+defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
+defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
+defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
+defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
+defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
+defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
+
 defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
 defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
 
@@ -2094,7 +2096,7 @@ multiclass LD<NVPTXRegClass regclass> {
            "$fromWidth \t$dst, [$addr+$offset];"), []>;
 }
 
-let mayLoad=1, neverHasSideEffects=1 in {
+let mayLoad=1, hasSideEffects=0 in {
 defm LD_i8  : LD<Int16Regs>;
 defm LD_i16 : LD<Int16Regs>;
 defm LD_i32 : LD<Int32Regs>;
@@ -2136,7 +2138,7 @@ multiclass ST<NVPTXRegClass regclass> {
            " \t[$addr+$offset], $src;"), []>;
 }
 
-let mayStore=1, neverHasSideEffects=1 in {
+let mayStore=1, hasSideEffects=0 in {
 defm ST_i8  : ST<Int16Regs>;
 defm ST_i16 : ST<Int16Regs>;
 defm ST_i32 : ST<Int32Regs>;
@@ -2220,7 +2222,7 @@ multiclass LD_VEC<NVPTXRegClass regclass> {
                "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
                 []>;
 }
-let mayLoad=1, neverHasSideEffects=1 in {
+let mayLoad=1, hasSideEffects=0 in {
 defm LDV_i8  : LD_VEC<Int16Regs>;
 defm LDV_i16 : LD_VEC<Int16Regs>;
 defm LDV_i32 : LD_VEC<Int32Regs>;
@@ -2303,7 +2305,7 @@ multiclass ST_VEC<NVPTXRegClass regclass> {
                "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
     []>;
 }
-let mayStore=1, neverHasSideEffects=1 in {
+let mayStore=1, hasSideEffects=0 in {
 defm STV_i8  : ST_VEC<Int16Regs>;
 defm STV_i16 : ST_VEC<Int16Regs>;
 defm STV_i32 : ST_VEC<Int32Regs>;
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
index 8759406..da301d5 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -16,6 +16,7 @@
 #define LLVM_LIB_TARGET_NVPTX_NVPTXLOWERAGGRCOPIES_H
 
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/StackProtector.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Pass.h"
 
@@ -29,8 +30,8 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DataLayoutPass>();
-    AU.addPreserved("stack-protector");
     AU.addPreserved<MachineFunctionAnalysis>();
+    AU.addPreserved<StackProtector>();
   }
 
   bool runOnFunction(Function &F) override;
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index a1e1b9e..c1c67e3 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -48,9 +48,9 @@ MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() {
 char NVPTXPrologEpilogPass::ID = 0;
 
 bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
-  const TargetMachine &TM = MF.getTarget();
-  const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
-  const TargetRegisterInfo &TRI = *TM.getSubtargetImpl()->getRegisterInfo();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  const TargetFrameLowering &TFI = *STI.getFrameLowering();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
   bool Modified = false;
 
   calculateFrameObjectOffsets(MF);
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 358ccce..5ca96e4 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -71,8 +71,7 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
 }
 }
 
-NVPTXRegisterInfo::NVPTXRegisterInfo(const NVPTXSubtarget &st)
-    : NVPTXGenRegisterInfo(0), Is64Bit(st.is64Bit()) {}
+NVPTXRegisterInfo::NVPTXRegisterInfo() : NVPTXGenRegisterInfo(0) {}
 
 #define GET_REGINFO_TARGET_DESC
 #include "NVPTXGenRegisterInfo.inc"
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h
index d2e6733..75b8f15 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.h
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -22,19 +22,13 @@
 #include "NVPTXGenRegisterInfo.inc"
 
 namespace llvm {
-
-// Forward Declarations.
-class TargetInstrInfo;
-class NVPTXSubtarget;
-
 class NVPTXRegisterInfo : public NVPTXGenRegisterInfo {
 private:
-  bool Is64Bit;
   // Hold Strings that can be free'd all together with NVPTXRegisterInfo
   ManagedStringPool ManagedStrPool;
 
 public:
-  NVPTXRegisterInfo(const NVPTXSubtarget &st);
+  NVPTXRegisterInfo();
 
   //------------------------------------------------------
   // Pure virtual functions from TargetRegisterInfo
diff --git a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index 324420d..e83f735 100644
--- a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -16,11 +16,12 @@
 #include "NVPTX.h"
 #include "NVPTXMachineFunctionInfo.h"
 #include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/DenseSet.h"
 
 using namespace llvm;
 
@@ -142,8 +143,9 @@ findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
   case NVPTX::LD_i64_avar: {
     // The handle is a parameter value being loaded, replace with the
     // parameter symbol
-    const NVPTXSubtarget &ST = MF.getTarget().getSubtarget<NVPTXSubtarget>();
-    if (ST.getDrvInterface() == NVPTX::CUDA) {
+    const NVPTXTargetMachine &TM =
+        static_cast<const NVPTXTargetMachine &>(MF.getTarget());
+    if (TM.getDrvInterface() == NVPTX::CUDA) {
       // For CUDA, we preserve the param loads coming from function arguments
       return false;
     }
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 3d52532..069d6e1 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
 
 using namespace llvm;
 
@@ -25,17 +26,6 @@ using namespace llvm;
 // Pin the vtable to this file.
 void NVPTXSubtarget::anchor() {}
 
-static std::string computeDataLayout(bool is64Bit) {
-  std::string Ret = "e";
-
-  if (!is64Bit)
-    Ret += "-p:32:32";
-
-  Ret += "-i64:64-v16:16-v32:32-n16:32:64";
-
-  return Ret;
-}
-
 NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                                 StringRef FS) {
     // Provide the default CPU if we don't have one.
@@ -54,18 +44,18 @@ NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
 }
 
 NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
-                               const std::string &FS, const TargetMachine &TM,
-                               bool is64Bit)
-    : NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0),
-      SmVersion(20), DL(computeDataLayout(is64Bit)),
-      InstrInfo(initializeSubtargetDependencies(CPU, FS)),
-      TLInfo((const NVPTXTargetMachine &)TM), TSInfo(&DL),
-      FrameLowering(*this) {
-
-  Triple T(TT);
-
-  if (T.getOS() == Triple::NVCL)
-    drvInterface = NVPTX::NVCL;
-  else
-    drvInterface = NVPTX::CUDA;
+                               const std::string &FS,
+                               const NVPTXTargetMachine &TM)
+    : NVPTXGenSubtargetInfo(TT, CPU, FS), PTXVersion(0), SmVersion(20), TM(TM),
+      InstrInfo(), TLInfo(TM, initializeSubtargetDependencies(CPU, FS)),
+      TSInfo(TM.getDataLayout()), FrameLowering() {}
+
+bool NVPTXSubtarget::hasImageHandles() const {
+  // Enable handles for Kepler+, where CUDA supports indirect surfaces and
+  // textures
+  if (TM.getDrvInterface() == NVPTX::CUDA)
+    return (SmVersion >= 30);
+
+  // Disabled, otherwise
+  return false;
 }
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index fb2d404..e9833e5 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -32,8 +32,6 @@ namespace llvm {
 class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   virtual void anchor();
   std::string TargetName;
-  NVPTX::DrvInterface drvInterface;
-  bool Is64Bit;
 
   // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
   unsigned PTXVersion;
@@ -41,7 +39,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
   unsigned int SmVersion;
 
-  const DataLayout DL; // Calculates type size & alignment
+  const NVPTXTargetMachine &TM;
   NVPTXInstrInfo InstrInfo;
   NVPTXTargetLowering TLInfo;
   TargetSelectionDAGInfo TSInfo;
@@ -55,13 +53,12 @@ public:
   /// of the specified module.
   ///
   NVPTXSubtarget(const std::string &TT, const std::string &CPU,
-                 const std::string &FS, const TargetMachine &TM, bool is64Bit);
+                 const std::string &FS, const NVPTXTargetMachine &TM);
 
   const TargetFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
   const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
-  const DataLayout *getDataLayout() const override { return &DL; }
   const NVPTXRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
@@ -95,20 +92,9 @@ public:
   }
   inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
   inline bool hasROT64() const { return SmVersion >= 20; }
-
-  bool hasImageHandles() const {
-    // Enable handles for Kepler+, where CUDA supports indirect surfaces and
-    // textures
-    if (getDrvInterface() == NVPTX::CUDA)
-      return (SmVersion >= 30);
-
-    // Disabled, otherwise
-    return false;
-  }
-  bool is64Bit() const { return Is64Bit; }
+  bool hasImageHandles() const;
 
   unsigned int getSmVersion() const { return SmVersion; }
-  NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
   std::string getTargetName() const { return TargetName; }
 
   unsigned getPTXVersion() const { return PTXVersion; }
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index d87693f..1a267a6 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -17,6 +17,7 @@
 #include "NVPTXAllocaHoisting.h"
 #include "NVPTXLowerAggrCopies.h"
 #include "NVPTXTargetObjectFile.h"
+#include "NVPTXTargetTransformInfo.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
@@ -24,12 +25,12 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FormattedStream.h"
@@ -69,14 +70,29 @@ extern "C" void LLVMInitializeNVPTXTarget() {
   initializeNVPTXLowerStructArgsPass(*PassRegistry::getPassRegistry());
 }
 
+static std::string computeDataLayout(bool is64Bit) {
+  std::string Ret = "e";
+
+  if (!is64Bit)
+    Ret += "-p:32:32";
+
+  Ret += "-i64:64-v16:16-v32:32-n16:32:64";
+
+  return Ret;
+}
+
 NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT,
                                        StringRef CPU, StringRef FS,
                                        const TargetOptions &Options,
                                        Reloc::Model RM, CodeModel::Model CM,
                                        CodeGenOpt::Level OL, bool is64bit)
-    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), is64bit(is64bit),
       TLOF(make_unique<NVPTXTargetObjectFile>()),
-      Subtarget(TT, CPU, FS, *this, is64bit) {
+      DL(computeDataLayout(is64bit)), Subtarget(TT, CPU, FS, *this) {
+  if (Triple(TT).getOS() == Triple::NVCL)
+    drvInterface = NVPTX::NVCL;
+  else
+    drvInterface = NVPTX::CUDA;
   initAsmInfo();
 }
 
@@ -110,8 +126,7 @@ public:
 
   void addIRPasses() override;
   bool addInstSelector() override;
-  bool addPreRegAlloc() override;
-  bool addPostRegAlloc() override;
+  void addPostRegAlloc() override;
   void addMachineSSAOptimization() override;
 
   FunctionPass *createTargetRegisterAllocator(bool) override;
@@ -125,12 +140,9 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
   return PassConfig;
 }
 
-void NVPTXTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  // Add first the target-independent BasicTTI pass, then our NVPTX pass. This
-  // allows the NVPTX pass to delegate to the target independent layer when
-  // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(this));
-  PM.add(createNVPTXTargetTransformInfoPass(this));
+TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis(
+      [this](Function &) { return TargetTransformInfo(NVPTXTTIImpl(this)); });
 }
 
 void NVPTXPassConfig::addIRPasses() {
@@ -149,6 +161,7 @@ void NVPTXPassConfig::addIRPasses() {
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
   addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+  addPass(createStraightLineStrengthReducePass());
   addPass(createSeparateConstOffsetFromGEPPass());
   // The SeparateConstOffsetFromGEP pass creates variadic bases that can be used
   // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. GVN generates
@@ -183,10 +196,8 @@ bool NVPTXPassConfig::addInstSelector() {
   return false;
 }
 
-bool NVPTXPassConfig::addPreRegAlloc() { return false; }
-bool NVPTXPassConfig::addPostRegAlloc() {
-  addPass(createNVPTXPrologEpilogPass());
-  return false;
+void NVPTXPassConfig::addPostRegAlloc() {
+  addPass(createNVPTXPrologEpilogPass(), false);
 }
 
 FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index a726bd1..a81abfe 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -14,8 +14,8 @@
 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETMACHINE_H
 #define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETMACHINE_H
 
-#include "NVPTXSubtarget.h"
 #include "ManagedStringPool.h"
+#include "NVPTXSubtarget.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSelectionDAGInfo.h"
@@ -25,7 +25,10 @@ namespace llvm {
 /// NVPTXTargetMachine
 ///
 class NVPTXTargetMachine : public LLVMTargetMachine {
+  bool is64bit;
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  const DataLayout DL; // Calculates type size & alignment
+  NVPTX::DrvInterface drvInterface;
   NVPTXSubtarget Subtarget;
 
   // Hold Strings that can be free'd all together with NVPTXTargetMachine
@@ -37,9 +40,10 @@ public:
                      CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit);
 
   ~NVPTXTargetMachine() override;
-
+  const DataLayout *getDataLayout() const override { return &DL; }
   const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; }
-
+  bool is64Bit() const { return is64bit; }
+  NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
   ManagedStringPool *getManagedStrPool() const {
     return const_cast<ManagedStringPool *>(&ManagedStrPool);
   }
@@ -55,8 +59,7 @@ public:
     return TLOF.get();
   }
 
-  /// \brief Register NVPTX analysis passes with a pass manager.
-  void addAnalysisPasses(PassManagerBase &PM) override;
+  TargetIRAnalysis getTargetIRAnalysis() override;
 
 }; // NVPTXTargetMachine.
 
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index b09d0d4..b8af04d 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -1,4 +1,4 @@
-//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI pass ---------===//
+//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,19 +6,12 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// \file
-// This file implements a TargetTransformInfo analysis pass specific to the
-// NVPTX target machine. It uses the target's detailed information to provide
-// more precise answers to certain TTI queries, while letting the target
-// independent and default TTI implementations handle the rest.
-//
-//===----------------------------------------------------------------------===//
 
-#include "NVPTXTargetMachine.h"
+#include "NVPTXTargetTransformInfo.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
@@ -26,69 +19,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "NVPTXtti"
 
-// Declare the pass initialization routine locally as target-specific passes
-// don't have a target-wide initialization entry point, and so we rely on the
-// pass constructor initialization.
-namespace llvm {
-void initializeNVPTXTTIPass(PassRegistry &);
-}
-
-namespace {
-
-class NVPTXTTI final : public ImmutablePass, public TargetTransformInfo {
-  const NVPTXTargetLowering *TLI;
-public:
-  NVPTXTTI() : ImmutablePass(ID), TLI(nullptr) {
-    llvm_unreachable("This pass cannot be directly constructed");
-  }
-
-  NVPTXTTI(const NVPTXTargetMachine *TM)
-      : ImmutablePass(ID), TLI(TM->getSubtargetImpl()->getTargetLowering()) {
-    initializeNVPTXTTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override { pushTTIStack(this); }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    TargetTransformInfo::getAnalysisUsage(AU);
-  }
-
-  /// Pass identification.
-  static char ID;
-
-  /// Provide necessary pointer adjustments for the two base classes.
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo *)this;
-    return this;
-  }
-
-  bool hasBranchDivergence() const override;
-
-  unsigned getArithmeticInstrCost(
-      unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
-      OperandValueKind Opd2Info = OK_AnyValue,
-      OperandValueProperties Opd1PropInfo = OP_None,
-      OperandValueProperties Opd2PropInfo = OP_None) const override;
-};
-
-} // end anonymous namespace
-
-INITIALIZE_AG_PASS(NVPTXTTI, TargetTransformInfo, "NVPTXtti",
-                   "NVPTX Target Transform Info", true, true, false)
-char NVPTXTTI::ID = 0;
-
-ImmutablePass *
-llvm::createNVPTXTargetTransformInfoPass(const NVPTXTargetMachine *TM) {
-  return new NVPTXTTI(TM);
-}
-
-bool NVPTXTTI::hasBranchDivergence() const { return true; }
-
-unsigned NVPTXTTI::getArithmeticInstrCost(
-    unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
-    OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
-    OperandValueProperties Opd2PropInfo) const {
+unsigned NVPTXTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
   // Legalize the type.
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
 
@@ -96,8 +30,8 @@ unsigned NVPTXTTI::getArithmeticInstrCost(
 
   switch (ISD) {
   default:
-    return TargetTransformInfo::getArithmeticInstrCost(
-        Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
+    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                         Opd1PropInfo, Opd2PropInfo);
   case ISD::ADD:
   case ISD::MUL:
   case ISD::XOR:
@@ -109,7 +43,7 @@ unsigned NVPTXTTI::getArithmeticInstrCost(
     if (LT.second.SimpleTy == MVT::i64)
       return 2 * LT.first;
     // Delegate other cases to the basic TTI.
-    return TargetTransformInfo::getArithmeticInstrCost(
-        Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
+    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                         Opd1PropInfo, Opd2PropInfo);
   }
 }
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
new file mode 100644
index 0000000..bf21e88
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -0,0 +1,74 @@
+//===-- NVPTXTargetTransformInfo.h - NVPTX specific TTI ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// NVPTX target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H
+
+#include "NVPTX.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
+  typedef BasicTTIImplBase<NVPTXTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const NVPTXSubtarget *ST;
+  const NVPTXTargetLowering *TLI;
+
+  const NVPTXSubtarget *getST() const { return ST; };
+  const NVPTXTargetLowering *getTLI() const { return TLI; };
+
+public:
+  explicit NVPTXTTIImpl(const NVPTXTargetMachine *TM)
+      : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  NVPTXTTIImpl(const NVPTXTTIImpl &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+  NVPTXTTIImpl(NVPTXTTIImpl &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+        TLI(std::move(Arg.TLI)) {}
+  NVPTXTTIImpl &operator=(const NVPTXTTIImpl &RHS) {
+    BaseT::operator=(static_cast<const BaseT &>(RHS));
+    ST = RHS.ST;
+    TLI = RHS.TLI;
+    return *this;
+  }
+  NVPTXTTIImpl &operator=(NVPTXTTIImpl &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    ST = std::move(RHS.ST);
+    TLI = std::move(RHS.TLI);
+    return *this;
+  }
+
+  bool hasBranchDivergence() { return true; }
+
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 5caa8bd..cf1feac 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -15,16 +15,16 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MutexGuard.h"
 #include <algorithm>
 #include <cstring>
 #include <map>
 #include <string>
 #include <vector>
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/Support/MutexGuard.h"
 
 using namespace llvm;
 
@@ -52,7 +52,7 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
     assert(prop && "Annotation property not a string");
 
     // value
-    ConstantInt *Val = dyn_cast<ConstantInt>(md->getOperand(i + 1));
+    ConstantInt *Val = mdconst::dyn_extract<ConstantInt>(md->getOperand(i + 1));
     assert(Val && "Value operand not a constant int");
 
     std::string keyname = prop->getString().str();
@@ -75,7 +75,8 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
   for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
     const MDNode *elem = NMD->getOperand(i);
 
-    Value *entity = elem->getOperand(0);
+    GlobalValue *entity =
+        mdconst::dyn_extract_or_null<GlobalValue>(elem->getOperand(0));
     // entity may be null due to DCE
     if (!entity)
       continue;
@@ -322,7 +323,7 @@ bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) {
   if (MDNode *alignNode = I.getMetadata("callalign")) {
     for (int i = 0, n = alignNode->getNumOperands(); i < n; i++) {
       if (const ConstantInt *CI =
-              dyn_cast<ConstantInt>(alignNode->getOperand(i))) {
+              mdconst::dyn_extract<ConstantInt>(alignNode->getOperand(i))) {
         unsigned v = CI->getZExtValue();
         if ((v >> 16) == index) {
           align = v & 0xFFFF;
diff --git a/lib/Target/NVPTX/NVPTXVector.td b/lib/Target/NVPTX/NVPTXVector.td
index 775df19..85aa34e 100644
--- a/lib/Target/NVPTX/NVPTXVector.td
+++ b/lib/Target/NVPTX/NVPTXVector.td
@@ -661,7 +661,7 @@ class ShuffleAsmStr4<string type>
   string s  = !strconcat(t6, ShuffleOneLine<"4", "3", type>.s);
 }
 
-let neverHasSideEffects=1, VecInstType=isVecShuffle.Value in {
+let hasSideEffects=0, VecInstType=isVecShuffle.Value in {
 def VecShuffle_v4f32 : NVPTXVecInst<(outs V4F32Regs:$dst),
                        (ins  V4F32Regs:$src1, V4F32Regs:$src2,
                              i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
@@ -847,7 +847,7 @@ class Vec_Move<string asmstr, NVPTXRegClass vclass, NVPTXInst sop=NOP>
                    !strconcat(asmstr, "\t${dst:vecfull}, ${src:vecfull};"),
                    [], sop>;
 
-let isAsCheapAsAMove=1, neverHasSideEffects=1, IsSimpleMove=1,
+let isAsCheapAsAMove=1, hasSideEffects=0, IsSimpleMove=1,
   VecInstType=isVecOther.Value in {
 def V4f32Mov : Vec_Move<"mov.v4.f32", V4F32Regs, FMOV32rr>;
 def V2f32Mov : Vec_Move<"mov.v2.f32", V2F32Regs, FMOV32rr>;
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 06bb968..bf00e73 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -32,9 +32,7 @@
 
 using namespace llvm;
 
-namespace {
-
-static unsigned RRegs[32] = {
+static const MCPhysReg RRegs[32] = {
   PPC::R0,  PPC::R1,  PPC::R2,  PPC::R3,
   PPC::R4,  PPC::R5,  PPC::R6,  PPC::R7,
   PPC::R8,  PPC::R9,  PPC::R10, PPC::R11,
@@ -44,7 +42,7 @@ static unsigned RRegs[32] = {
   PPC::R24, PPC::R25, PPC::R26, PPC::R27,
   PPC::R28, PPC::R29, PPC::R30, PPC::R31
 };
-static unsigned RRegsNoR0[32] = {
+static const MCPhysReg RRegsNoR0[32] = {
   PPC::ZERO,
             PPC::R1,  PPC::R2,  PPC::R3,
   PPC::R4,  PPC::R5,  PPC::R6,  PPC::R7,
@@ -55,7 +53,7 @@ static unsigned RRegsNoR0[32] = {
   PPC::R24, PPC::R25, PPC::R26, PPC::R27,
   PPC::R28, PPC::R29, PPC::R30, PPC::R31
 };
-static unsigned XRegs[32] = {
+static const MCPhysReg XRegs[32] = {
   PPC::X0,  PPC::X1,  PPC::X2,  PPC::X3,
   PPC::X4,  PPC::X5,  PPC::X6,  PPC::X7,
   PPC::X8,  PPC::X9,  PPC::X10, PPC::X11,
@@ -65,7 +63,7 @@ static unsigned XRegs[32] = {
   PPC::X24, PPC::X25, PPC::X26, PPC::X27,
   PPC::X28, PPC::X29, PPC::X30, PPC::X31
 };
-static unsigned XRegsNoX0[32] = {
+static const MCPhysReg XRegsNoX0[32] = {
   PPC::ZERO8,
             PPC::X1,  PPC::X2,  PPC::X3,
   PPC::X4,  PPC::X5,  PPC::X6,  PPC::X7,
@@ -76,7 +74,7 @@ static unsigned XRegsNoX0[32] = {
   PPC::X24, PPC::X25, PPC::X26, PPC::X27,
   PPC::X28, PPC::X29, PPC::X30, PPC::X31
 };
-static unsigned FRegs[32] = {
+static const MCPhysReg FRegs[32] = {
   PPC::F0,  PPC::F1,  PPC::F2,  PPC::F3,
   PPC::F4,  PPC::F5,  PPC::F6,  PPC::F7,
   PPC::F8,  PPC::F9,  PPC::F10, PPC::F11,
@@ -86,7 +84,7 @@ static unsigned FRegs[32] = {
   PPC::F24, PPC::F25, PPC::F26, PPC::F27,
   PPC::F28, PPC::F29, PPC::F30, PPC::F31
 };
-static unsigned VRegs[32] = {
+static const MCPhysReg VRegs[32] = {
   PPC::V0,  PPC::V1,  PPC::V2,  PPC::V3,
   PPC::V4,  PPC::V5,  PPC::V6,  PPC::V7,
   PPC::V8,  PPC::V9,  PPC::V10, PPC::V11,
@@ -96,7 +94,7 @@ static unsigned VRegs[32] = {
   PPC::V24, PPC::V25, PPC::V26, PPC::V27,
   PPC::V28, PPC::V29, PPC::V30, PPC::V31
 };
-static unsigned VSRegs[64] = {
+static const MCPhysReg VSRegs[64] = {
   PPC::VSL0,  PPC::VSL1,  PPC::VSL2,  PPC::VSL3,
   PPC::VSL4,  PPC::VSL5,  PPC::VSL6,  PPC::VSL7,
   PPC::VSL8,  PPC::VSL9,  PPC::VSL10, PPC::VSL11,
@@ -115,7 +113,7 @@ static unsigned VSRegs[64] = {
   PPC::VSH24, PPC::VSH25, PPC::VSH26, PPC::VSH27,
   PPC::VSH28, PPC::VSH29, PPC::VSH30, PPC::VSH31
 };
-static unsigned VSFRegs[64] = {
+static const MCPhysReg VSFRegs[64] = {
   PPC::F0,  PPC::F1,  PPC::F2,  PPC::F3,
   PPC::F4,  PPC::F5,  PPC::F6,  PPC::F7,
   PPC::F8,  PPC::F9,  PPC::F10, PPC::F11,
@@ -134,7 +132,17 @@ static unsigned VSFRegs[64] = {
   PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
   PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
 };
-static unsigned CRBITRegs[32] = {
+static unsigned QFRegs[32] = {
+  PPC::QF0,  PPC::QF1,  PPC::QF2,  PPC::QF3,
+  PPC::QF4,  PPC::QF5,  PPC::QF6,  PPC::QF7,
+  PPC::QF8,  PPC::QF9,  PPC::QF10, PPC::QF11,
+  PPC::QF12, PPC::QF13, PPC::QF14, PPC::QF15,
+  PPC::QF16, PPC::QF17, PPC::QF18, PPC::QF19,
+  PPC::QF20, PPC::QF21, PPC::QF22, PPC::QF23,
+  PPC::QF24, PPC::QF25, PPC::QF26, PPC::QF27,
+  PPC::QF28, PPC::QF29, PPC::QF30, PPC::QF31
+};
+static const MCPhysReg CRBITRegs[32] = {
   PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN,
   PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN,
   PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
@@ -144,7 +152,7 @@ static unsigned CRBITRegs[32] = {
   PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN,
   PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN
 };
-static unsigned CRRegs[8] = {
+static const MCPhysReg CRRegs[8] = {
   PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3,
   PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7
 };
@@ -210,6 +218,8 @@ EvaluateCRExpr(const MCExpr *E) {
   llvm_unreachable("Invalid expression kind!");
 }
 
+namespace {
+
 struct PPCOperand;
 
 class PPCAsmParser : public MCTargetAsmParser {
@@ -429,6 +439,7 @@ public:
   bool isU8ImmX8() const { return Kind == Immediate &&
                                   isUInt<8>(getImm()) &&
                                   (getImm() & 7) == 0; }
+  bool isU12Imm() const { return Kind == Immediate && isUInt<12>(getImm()); }
   bool isU16Imm() const {
     switch (Kind) {
       case Expression:
@@ -564,6 +575,21 @@ public:
     Inst.addOperand(MCOperand::CreateReg(VSFRegs[getVSReg()]));
   }
 
+  void addRegQFRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(QFRegs[getReg()]));
+  }
+
+  void addRegQSRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(QFRegs[getReg()]));
+  }
+
+  void addRegQBRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(QFRegs[getReg()]));
+  }
+
   void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::CreateReg(CRBITRegs[getCRBit()]));
@@ -1053,7 +1079,6 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   MCInst Inst;
 
   switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
-  default: break;
   case Match_Success:
     // Post-process instructions (typically extended mnemonics)
     ProcessInstruction(Inst, Operands);
@@ -1063,7 +1088,7 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   case Match_MissingFeature:
     return Error(IDLoc, "instruction use requires an option to be enabled");
   case Match_MnemonicFail:
-      return Error(IDLoc, "unrecognized instruction mnemonic");
+    return Error(IDLoc, "unrecognized instruction mnemonic");
   case Match_InvalidOperand: {
     SMLoc ErrorLoc = IDLoc;
     if (ErrorInfo != ~0ULL) {
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index 47a9474..936ed7f 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -20,8 +20,11 @@ add_llvm_target(PowerPCCodeGen
   PPCInstrInfo.cpp
   PPCISelDAGToDAG.cpp
   PPCISelLowering.cpp
+  PPCEarlyReturn.cpp
   PPCFastISel.cpp
   PPCFrameLowering.cpp
+  PPCLoopDataPrefetch.cpp
+  PPCLoopPreIncPrep.cpp
   PPCMCInstLower.cpp
   PPCMachineFunctionInfo.cpp
   PPCRegisterInfo.cpp
@@ -30,6 +33,9 @@ add_llvm_target(PowerPCCodeGen
   PPCTargetObjectFile.cpp
   PPCTargetTransformInfo.cpp
   PPCSelectionDAGInfo.cpp
+  PPCTLSDynamicCall.cpp
+  PPCVSXCopy.cpp
+  PPCVSXFMAMutate.cpp
   )
 
 add_subdirectory(AsmParser)
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 5251b60..0ed0723 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -164,6 +164,17 @@ static const unsigned G8Regs[] = {
   PPC::X28, PPC::X29, PPC::X30, PPC::X31
 };
 
+static const unsigned QFRegs[] = {
+  PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3,
+  PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
+  PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11,
+  PPC::QF12, PPC::QF13, PPC::QF14, PPC::QF15,
+  PPC::QF16, PPC::QF17, PPC::QF18, PPC::QF19,
+  PPC::QF20, PPC::QF21, PPC::QF22, PPC::QF23,
+  PPC::QF24, PPC::QF25, PPC::QF26, PPC::QF27,
+  PPC::QF28, PPC::QF29, PPC::QF30, PPC::QF31
+};
+
 template <std::size_t N>
 static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
                                         const unsigned (&Regs)[N]) {
@@ -235,6 +246,15 @@ static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
 #define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass
 #define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass
 
+static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, QFRegs);
+}
+
+#define DecodeQSRCRegisterClass DecodeQFRCRegisterClass
+#define DecodeQBRCRegisterClass DecodeQFRCRegisterClass
+
 template<unsigned N>
 static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm,
                                       int64_t Address, const void *Decoder) {
@@ -335,6 +355,15 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   uint32_t Inst =
       (Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 8) | (Bytes[3] << 0);
 
+  if ((STI.getFeatureBits() & PPC::FeatureQPX) != 0) {
+    DecodeStatus result =
+      decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI);
+    if (result != MCDisassembler::Fail)
+      return result;
+
+    MI.clear();
+  }
+
   return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI);
 }
 
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 670c40a..c287fbe 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -34,7 +34,20 @@ FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false),
 #include "PPCGenAsmWriter.inc"
 
 void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
-  OS << getRegisterName(RegNo);
+  const char *RegName = getRegisterName(RegNo);
+  if (RegName[0] == 'q' /* QPX */) {
+    // The system toolchain on the BG/Q does not understand QPX register names
+    // in .cfi_* directives, so print the name of the floating-point
+    // subregister instead.
+    std::string RN(RegName);
+
+    RN[0] = 'f';
+    OS << RN;
+
+    return;
+  }
+
+  OS << RegName;
 }
 
 void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
@@ -236,6 +249,13 @@ void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo,
   O << (unsigned int)Value;
 }
 
+void PPCInstPrinter::printU12ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  unsigned short Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 4095 && "Invalid u12imm argument!");
+  O << (unsigned short)Value;
+}
+
 void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo,
                                         raw_ostream &O) {
   if (MI->getOperand(OpNo).isImm())
@@ -338,6 +358,7 @@ static const char *stripRegisterPrefix(const char *RegName) {
   switch (RegName[0]) {
   case 'r':
   case 'f':
+  case 'q': // for QPX
   case 'v':
     if (RegName[1] == 's')
       return RegName + 2;
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index b21aa22..6ead19b 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -48,6 +48,7 @@ public:
   void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU12ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/PowerPC/LLVMBuild.txt b/lib/Target/PowerPC/LLVMBuild.txt
index 9d173d6..fd5fa56 100644
--- a/lib/Target/PowerPC/LLVMBuild.txt
+++ b/lib/Target/PowerPC/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
 type = Library
 name = PowerPCCodeGen
 parent = PowerPC
-required_libraries = Analysis AsmPrinter CodeGen Core MC PowerPCAsmPrinter PowerPCDesc PowerPCInfo SelectionDAG Support Target TransformUtils
+required_libraries = Analysis AsmPrinter CodeGen Core MC PowerPCAsmPrinter PowerPCDesc PowerPCInfo Scalar SelectionDAG Support Target TransformUtils
 add_to_library_groups = PowerPC
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index c54d5e7..bea88a2 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -9,8 +9,8 @@
 
 #include "MCTargetDesc/PPCMCTargetDesc.h"
 #include "MCTargetDesc/PPCFixupKinds.h"
-#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCFixupKindInfo.h"
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 893aae3..2b4f2d8 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -74,9 +74,6 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
   AssemblerDialect = 1;           // New-Style mnemonics.
   LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
 
-  if (T.getOS() == llvm::Triple::FreeBSD ||
-      (T.getOS() == llvm::Triple::NetBSD && !is64Bit) ||
-      (T.getOS() == llvm::Triple::OpenBSD && !is64Bit))
-    UseIntegratedAssembler = true;
+  UseIntegratedAssembler = true;
 }
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 9f0294d..86ad385 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -21,7 +21,8 @@ namespace llvm {
 class Triple;
 
   class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin {
-    void anchor() override;
+    virtual void anchor();
+
   public:
     explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple&);
   };
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 786b7fe..06d380e 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -31,8 +31,8 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
 
 namespace {
 class PPCMCCodeEmitter : public MCCodeEmitter {
-  PPCMCCodeEmitter(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
-  void operator=(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete;
+  void operator=(const PPCMCCodeEmitter &) = delete;
 
   const MCInstrInfo &MCII;
   const MCContext &CTX;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 00be8f4..f2da389 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -184,6 +184,23 @@ public:
     if ((Flags & ELF::EF_PPC64_ABI) == 0)
       MCA.setELFHeaderEFlags(Flags | 2);
   }
+  void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override {
+    // When encoding an assignment to set symbol A to symbol B, also copy
+    // the st_other bits encoding the local entry point offset.
+    if (Value->getKind() != MCExpr::SymbolRef)
+      return;
+    const MCSymbol &RhsSym =
+        static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
+    MCSymbolData &Data = getStreamer().getOrCreateSymbolData(&RhsSym);
+    MCSymbolData &SymbolData = getStreamer().getOrCreateSymbolData(Symbol);
+    // The "other" values are stored in the last 6 bits of the second byte.
+    // The traditional defines for STO values assume the full byte and thus
+    // the shift to pack it.
+    unsigned Other = MCELF::getOther(SymbolData) << 2;
+    Other &= ~ELF::STO_PPC64_LOCAL_MASK;
+    Other |= (MCELF::getOther(Data) << 2) & ELF::STO_PPC64_LOCAL_MASK;
+    MCELF::setOther(SymbolData, Other >> 2);
+  }
 };
 
 class PPCTargetMachOStreamer : public PPCTargetStreamer {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index df2f14a..f7259b9 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -41,7 +41,7 @@ public:
       : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
                                  /*UseAggressiveSymbolFolding=*/Is64Bit) {}
 
-  void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+  void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
                         const MCAsmLayout &Layout, const MCFragment *Fragment,
                         const MCFixup &Fixup, MCValue Target,
                         uint64_t &FixedValue) override {
@@ -282,7 +282,7 @@ bool PPCMachObjectWriter::RecordScatteredRelocation(
     MachO::any_relocation_info MRE;
     makeScatteredRelocationInfo(MRE, other_half, MachO::GENERIC_RELOC_PAIR,
                                 Log2Size, IsPCRel, Value2);
-    Writer->addRelocation(Fragment->getParent(), MRE);
+    Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
   } else {
     // If the offset is more than 24-bits, it won't fit in a scattered
     // relocation offset field, so we fall back to using a non-scattered
@@ -296,7 +296,7 @@ bool PPCMachObjectWriter::RecordScatteredRelocation(
   }
   MachO::any_relocation_info MRE;
   makeScatteredRelocationInfo(MRE, FixupOffset, Type, Log2Size, IsPCRel, Value);
-  Writer->addRelocation(Fragment->getParent(), MRE);
+  Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
   return true;
 }
 
@@ -331,9 +331,9 @@ void PPCMachObjectWriter::RecordPPCRelocation(
   // See <reloc.h>.
   const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
   unsigned Index = 0;
-  unsigned IsExtern = 0;
   unsigned Type = RelocType;
 
+  const MCSymbolData *RelSymbol = nullptr;
   if (Target.isAbsolute()) { // constant
                              // SymbolNum of 0 indicates the absolute section.
                              //
@@ -355,8 +355,7 @@ void PPCMachObjectWriter::RecordPPCRelocation(
 
     // Check whether we need an external or internal relocation.
     if (Writer->doesSymbolRequireExternRelocation(SD)) {
-      IsExtern = 1;
-      Index = SD->getIndex();
+      RelSymbol = SD;
       // For external relocations, make sure to offset the fixup value to
       // compensate for the addend of the symbol address, if it was
       // undefined. This occurs with weak definitions, for example.
@@ -375,9 +374,8 @@ void PPCMachObjectWriter::RecordPPCRelocation(
 
   // struct relocation_info (8 bytes)
   MachO::any_relocation_info MRE;
-  makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, IsExtern,
-                     Type);
-  Writer->addRelocation(Fragment->getParent(), MRE);
+  makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, false, Type);
+  Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
 }
 
 MCObjectWriter *llvm::createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit,
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index 8fb33df..5e5a9b1 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -34,18 +34,17 @@ namespace llvm {
 #ifndef NDEBUG
   FunctionPass *createPPCCTRLoopsVerify();
 #endif
+  FunctionPass *createPPCLoopDataPrefetchPass();
+  FunctionPass *createPPCLoopPreIncPrepPass(PPCTargetMachine &TM);
   FunctionPass *createPPCEarlyReturnPass();
   FunctionPass *createPPCVSXCopyPass();
-  FunctionPass *createPPCVSXCopyCleanupPass();
   FunctionPass *createPPCVSXFMAMutatePass();
   FunctionPass *createPPCBranchSelectionPass();
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
+  FunctionPass *createPPCTLSDynamicCallPass();
   void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                     AsmPrinter &AP, bool isDarwin);
 
-  /// \brief Creates an PPC-specific Target Transformation Info pass.
-  ImmutablePass *createPPCTargetTransformInfoPass(const PPCTargetMachine *TM);
-
   void initializePPCVSXFMAMutatePass(PassRegistry&);
   extern char &PPCVSXFMAMutateID;
 
@@ -93,12 +92,7 @@ namespace llvm {
     MO_TOC_LO    = 7 << 4,
 
     // Symbol for VK_PPC_TLS fixup attached to an ADD instruction
-    MO_TLS       = 8 << 4,
-
-    // Symbols for VK_PPC_TLSGD and VK_PPC_TLSLD in __tls_get_addr
-    // call sequences.
-    MO_TLSLD     = 9 << 4,
-    MO_TLSGD     = 10 << 4
+    MO_TLS       = 8 << 4
   };
   } // end namespace PPCII
   
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index 46d56a4..f53add5 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -88,8 +88,13 @@ def FeaturePOPCNTD   : SubtargetFeature<"popcntd","HasPOPCNTD", "true",
                                         "Enable the popcnt[dw] instructions">;
 def FeatureLDBRX     : SubtargetFeature<"ldbrx","HasLDBRX", "true",
                                         "Enable the ldbrx instruction">;
+def FeatureCMPB      : SubtargetFeature<"cmpb", "HasCMPB", "true",
+                                        "Enable the cmpb instruction">;
+def FeatureICBT      : SubtargetFeature<"icbt","HasICBT", "true",
+                                        "Enable icbt instruction">;
 def FeatureBookE     : SubtargetFeature<"booke", "IsBookE", "true",
-                                        "Enable Book E instructions">;
+                                        "Enable Book E instructions",
+                                        [FeatureICBT]>;
 def FeatureMSYNC     : SubtargetFeature<"msync", "HasOnlyMSYNC", "true",
                               "Has only the msync instruction instead of sync",
                               [FeatureBookE]>;
@@ -104,9 +109,17 @@ def FeatureQPX       : SubtargetFeature<"qpx","HasQPX", "true",
 def FeatureVSX       : SubtargetFeature<"vsx","HasVSX", "true",
                                         "Enable VSX instructions",
                                         [FeatureAltivec]>;
+def FeatureP8Altivec : SubtargetFeature<"power8-altivec", "HasP8Altivec", "true",
+                                        "Enable POWER8 Altivec instructions",
+                                        [FeatureAltivec]>;
 def FeatureP8Vector  : SubtargetFeature<"power8-vector", "HasP8Vector", "true",
                                         "Enable POWER8 vector instructions",
-                                        [FeatureVSX, FeatureAltivec]>;
+                                        [FeatureVSX, FeatureP8Altivec]>;
+
+def FeatureInvariantFunctionDescriptors :
+  SubtargetFeature<"invariant-function-descriptors",
+                   "HasInvariantFunctionDescriptors", "true",
+                   "Assume function descriptors are invariant">;
 
 def DeprecatedMFTB   : SubtargetFeature<"", "DeprecatedMFTB", "true",
                                         "Treat mftb as deprecated">;
@@ -116,21 +129,10 @@ def DeprecatedDST    : SubtargetFeature<"", "DeprecatedDST", "true",
 // Note: Future features to add when support is extended to more
 // recent ISA levels:
 //
-// CMPB         p6, p6x, p7        cmpb
 // DFP          p6, p6x, p7        decimal floating-point instructions
 // POPCNTB      p5 through p7      popcntb and related instructions
 
 //===----------------------------------------------------------------------===//
-// ABI Selection                                                              //
-//===----------------------------------------------------------------------===//
-
-def FeatureELFv1 : SubtargetFeature<"elfv1", "TargetABI", "PPC_ABI_ELFv1",
-                                    "Use the ELFv1 ABI">;
-
-def FeatureELFv2 : SubtargetFeature<"elfv2", "TargetABI", "PPC_ABI_ELFv2",
-                                    "Use the ELFv2 ABI">;
-
-//===----------------------------------------------------------------------===//
 // Classes used for relation maps.
 //===----------------------------------------------------------------------===//
 // RecFormRel - Filter class used to relate non-record-form instructions with
@@ -201,12 +203,12 @@ include "PPCInstrInfo.td"
 def : Processor<"generic", G3Itineraries, [Directive32]>;
 def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL,
                                           FeatureFRES, FeatureFRSQRTE,
-                                          FeatureBookE, FeatureMSYNC,
-                                          DeprecatedMFTB]>;
+                                          FeatureICBT, FeatureBookE, 
+                                          FeatureMSYNC, DeprecatedMFTB]>;
 def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL,
                                           FeatureFRES, FeatureFRSQRTE,
-                                          FeatureBookE, FeatureMSYNC,
-                                          DeprecatedMFTB]>;
+                                          FeatureICBT, FeatureBookE, 
+                                          FeatureMSYNC, DeprecatedMFTB]>;
 def : Processor<"601", G3Itineraries, [Directive601]>;
 def : Processor<"602", G3Itineraries, [Directive602]>;
 def : Processor<"603", G3Itineraries, [Directive603,
@@ -233,6 +235,34 @@ def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec,
                                             FeatureFRES, FeatureFRSQRTE]>;
 def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec,
                                            FeatureFRES, FeatureFRSQRTE]>;
+
+/*  Since new processors generally contain a superset of features of those that
+    came before them, the idea is to make implementations of new processors
+    less error prone and easier to read.
+    Namely:
+        list<SubtargetFeature> Power8FeatureList = ...
+        list<SubtargetFeature> FutureProcessorSpecificFeatureList =
+            [ features that Power8 does not support ]
+        list<SubtargetFeature> FutureProcessorFeatureList =
+            !listconcat(Power8FeatureList, FutureProcessorSpecificFeatureList)
+
+    Makes it explicit and obvious what is new in FutureProcesor vs. Power8 as
+    well as providing a single point of definition if the feature set will be
+    used elsewhere.
+    
+*/
+def ProcessorFeatures {
+    list<SubtargetFeature> Power8FeatureList =
+        [DirectivePwr8, FeatureAltivec, FeatureP8Altivec, FeatureVSX, 
+        FeatureP8Vector, FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, 
+        FeatureFRE, FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+        FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+        FeatureFPRND, FeatureFPCVT, FeatureISEL,
+        FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX,
+        Feature64Bit /*, Feature64BitRegs */, FeatureICBT,
+        DeprecatedMFTB, DeprecatedDST];
+}
+
 def : ProcessorModel<"970", G5Model,
                   [Directive970, FeatureAltivec,
                    FeatureMFOCRF, FeatureFSqrt,
@@ -246,27 +276,27 @@ def : ProcessorModel<"g5", G5Model,
                    DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"e500mc", PPCE500mcModel,
                   [DirectiveE500mc, FeatureMFOCRF,
-                   FeatureSTFIWX, FeatureBookE, FeatureISEL,
-                   DeprecatedMFTB]>;
+                   FeatureSTFIWX, FeatureICBT, FeatureBookE, 
+                   FeatureISEL, DeprecatedMFTB]>;
 def : ProcessorModel<"e5500", PPCE5500Model,
                   [DirectiveE5500, FeatureMFOCRF, Feature64Bit,
-                   FeatureSTFIWX, FeatureBookE, FeatureISEL,
-                   DeprecatedMFTB]>;
+                   FeatureSTFIWX, FeatureICBT, FeatureBookE, 
+                   FeatureISEL, DeprecatedMFTB]>;
 def : ProcessorModel<"a2", PPCA2Model,
-                  [DirectiveA2, FeatureBookE, FeatureMFOCRF,
+                  [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF,
                    FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
                    FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, FeatureFPCVT, FeatureISEL,
-                   FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
+                   FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit
                /*, Feature64BitRegs */, DeprecatedMFTB]>;
 def : ProcessorModel<"a2q", PPCA2Model,
-                  [DirectiveA2, FeatureBookE, FeatureMFOCRF,
+                  [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF,
                    FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
                    FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, FeatureFPCVT, FeatureISEL,
-                   FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
+                   FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit
                /*, Feature64BitRegs */, FeatureQPX, DeprecatedMFTB]>;
 def : ProcessorModel<"pwr3", G5Model,
                   [DirectivePwr3, FeatureAltivec,
@@ -292,45 +322,33 @@ def : ProcessorModel<"pwr6", G5Model,
                   [DirectivePwr6, FeatureAltivec,
                    FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
                    FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
-                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB,
                    FeatureFPRND, Feature64Bit /*, Feature64BitRegs */,
                    DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"pwr6x", G5Model,
                   [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
                    FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
-                   FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB,
                    FeatureFPRND, Feature64Bit,
                    DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"pwr7", P7Model,
-                  [DirectivePwr7, FeatureAltivec,
+                  [DirectivePwr7, FeatureAltivec, FeatureVSX,
                    FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
                    FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
                    FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, FeatureFPCVT, FeatureISEL,
-                   FeaturePOPCNTD, FeatureLDBRX,
-                   Feature64Bit /*, Feature64BitRegs */,
-                   DeprecatedMFTB, DeprecatedDST]>;
-def : ProcessorModel<"pwr8", P7Model /* FIXME: Update to P8Model when available */,
-                  [DirectivePwr8, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
-                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
-                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
-                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
-                   FeaturePOPCNTD, FeatureLDBRX,
+                   FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX,
                    Feature64Bit /*, Feature64BitRegs */,
                    DeprecatedMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.Power8FeatureList>;
 def : Processor<"ppc", G3Itineraries, [Directive32]>;
 def : ProcessorModel<"ppc64", G5Model,
                   [Directive64, FeatureAltivec,
                    FeatureMFOCRF, FeatureFSqrt, FeatureFRES,
                    FeatureFRSQRTE, FeatureSTFIWX,
                    Feature64Bit /*, Feature64BitRegs */]>;
-def : ProcessorModel<"ppc64le", G5Model,
-                  [Directive64, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFSqrt, FeatureFRES,
-                   FeatureFRSQRTE, FeatureSTFIWX,
-                   Feature64Bit /*, Feature64BitRegs */]>;
+def : ProcessorModel<"ppc64le", P8Model, ProcessorFeatures.Power8FeatureList>;
 
 //===----------------------------------------------------------------------===//
 // Calling Conventions
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 5648873..1327290 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -18,9 +18,9 @@
 
 #include "PPC.h"
 #include "InstPrinter/PPCInstPrinter.h"
-#include "PPCMachineFunctionInfo.h"
 #include "MCTargetDesc/PPCMCExpr.h"
 #include "MCTargetDesc/PPCPredicates.h"
+#include "PPCMachineFunctionInfo.h"
 #include "PPCSubtarget.h"
 #include "PPCTargetMachine.h"
 #include "PPCTargetStreamer.h"
@@ -34,6 +34,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
@@ -67,12 +68,13 @@ namespace {
   class PPCAsmPrinter : public AsmPrinter {
   protected:
     MapVector<MCSymbol*, MCSymbol*> TOC;
-    const PPCSubtarget &Subtarget;
+    const PPCSubtarget *Subtarget;
     uint64_t TOCLabelID;
+    StackMaps SM;
   public:
-    explicit PPCAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : AsmPrinter(TM, Streamer),
-        Subtarget(TM.getSubtarget<PPCSubtarget>()), TOCLabelID(0) {}
+    explicit PPCAsmPrinter(TargetMachine &TM,
+                           std::unique_ptr<MCStreamer> Streamer)
+        : AsmPrinter(TM, std::move(Streamer)), TOCLabelID(0), SM(*this) {}
 
     const char *getPassName() const override {
       return "PowerPC Assembly Printer";
@@ -90,13 +92,26 @@ namespace {
     bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                                unsigned AsmVariant, const char *ExtraCode,
                                raw_ostream &O) override;
+
+    void EmitEndOfAsmFile(Module &M) override;
+
+    void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                       const MachineInstr &MI);
+    void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                         const MachineInstr &MI);
+    void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      Subtarget = &MF.getSubtarget<PPCSubtarget>();
+      return AsmPrinter::runOnMachineFunction(MF);
+    }
   };
 
   /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
   class PPCLinuxAsmPrinter : public PPCAsmPrinter {
   public:
-    explicit PPCLinuxAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : PPCAsmPrinter(TM, Streamer) {}
+    explicit PPCLinuxAsmPrinter(TargetMachine &TM,
+                                std::unique_ptr<MCStreamer> Streamer)
+        : PPCAsmPrinter(TM, std::move(Streamer)) {}
 
     const char *getPassName() const override {
       return "Linux PPC Assembly Printer";
@@ -115,8 +130,9 @@ namespace {
   /// OS X
   class PPCDarwinAsmPrinter : public PPCAsmPrinter {
   public:
-    explicit PPCDarwinAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : PPCAsmPrinter(TM, Streamer) {}
+    explicit PPCDarwinAsmPrinter(TargetMachine &TM,
+                                 std::unique_ptr<MCStreamer> Streamer)
+        : PPCAsmPrinter(TM, std::move(Streamer)) {}
 
     const char *getPassName() const override {
       return "Darwin PPC Assembly Printer";
@@ -135,6 +151,7 @@ static const char *stripRegisterPrefix(const char *RegName) {
   switch (RegName[0]) {
     case 'r':
     case 'f':
+    case 'q': // for QPX
     case 'v':
       if (RegName[1] == 's')
         return RegName + 2;
@@ -147,7 +164,7 @@ static const char *stripRegisterPrefix(const char *RegName) {
 
 void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
                                  raw_ostream &O) {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   const MachineOperand &MO = MI->getOperand(OpNo);
   
   switch (MO.getType()) {
@@ -155,7 +172,8 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
     const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg());
     // Linux assembler (Others?) does not take register mnemonics.
     // FIXME - What about special registers used in mfspr/mtspr?
-    if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
+    if (!Subtarget->isDarwin())
+      RegName = stripRegisterPrefix(RegName);
     O << RegName;
     return;
   }
@@ -270,7 +288,8 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
     case 'y': // A memory reference for an X-form instruction
       {
         const char *RegName = "r0";
-        if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
+        if (!Subtarget->isDarwin())
+          RegName = stripRegisterPrefix(RegName);
         O << RegName << ", ";
         printOperand(MI, OpNo, O);
         return false;
@@ -302,7 +321,7 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
 /// exists for it.  If not, create one.  Then return a symbol that references
 /// the TOC entry.
 MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   MCSymbol *&TOCEntry = TOC[Sym];
 
   // To avoid name clash check if the name already exists.
@@ -316,13 +335,120 @@ MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
   return TOCEntry;
 }
 
+void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  SM.serializeToStackMapSection();
+}
+
+void PPCAsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                                  const MachineInstr &MI) {
+  unsigned NumNOPBytes = MI.getOperand(1).getImm();
+
+  SM.recordStackMap(MI);
+  assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+
+  // Scan ahead to trim the shadow.
+  const MachineBasicBlock &MBB = *MI.getParent();
+  MachineBasicBlock::const_iterator MII(MI);
+  ++MII;
+  while (NumNOPBytes > 0) {
+    if (MII == MBB.end() || MII->isCall() ||
+        MII->getOpcode() == PPC::DBG_VALUE ||
+        MII->getOpcode() == TargetOpcode::PATCHPOINT ||
+        MII->getOpcode() == TargetOpcode::STACKMAP)
+      break;
+    ++MII;
+    NumNOPBytes -= 4;
+  }
+
+  // Emit nops.
+  for (unsigned i = 0; i < NumNOPBytes; i += 4)
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::NOP));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>
+void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                                    const MachineInstr &MI) {
+  SM.recordPatchPoint(MI);
+  PatchPointOpers Opers(&MI);
+
+  int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+  unsigned EncodedBytes = 0;
+  if (CallTarget) {
+    assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+           "High 16 bits of call target should be zero.");
+    unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+    EncodedBytes = 6*4;
+    // Materialize the jump address:
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8)
+                                    .addReg(ScratchReg)
+                                    .addImm((CallTarget >> 32) & 0xFFFF));
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC)
+                                    .addReg(ScratchReg)
+                                    .addReg(ScratchReg)
+                                    .addImm(32).addImm(16));
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8)
+                                    .addReg(ScratchReg)
+                                    .addReg(ScratchReg)
+                                    .addImm((CallTarget >> 16) & 0xFFFF));
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8)
+                                    .addReg(ScratchReg)
+                                    .addReg(ScratchReg)
+                                    .addImm(CallTarget & 0xFFFF));
+
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8).addReg(ScratchReg));
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8));
+  }
+
+  // Emit padding.
+  unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+  assert(NumBytes >= EncodedBytes &&
+         "Patchpoint can't request size less than the length of a call.");
+  assert((NumBytes - EncodedBytes) % 4 == 0 &&
+         "Invalid number of NOP bytes requested!");
+  for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::NOP));
+}
+
+/// EmitTlsCall -- Given a GETtls[ld]ADDR[32] instruction, print a
+/// call to __tls_get_addr to the current output stream.
+void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
+                                MCSymbolRefExpr::VariantKind VK) {
+  StringRef Name = "__tls_get_addr";
+  MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name);
+  MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+
+  assert(MI->getOperand(0).isReg() &&
+         ((Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::X3) ||
+          (!Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::R3)) &&
+         "GETtls[ld]ADDR[32] must define GPR3");
+  assert(MI->getOperand(1).isReg() &&
+         ((Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::X3) ||
+          (!Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::R3)) &&
+         "GETtls[ld]ADDR[32] must read GPR3");
+
+  if (!Subtarget->isPPC64() && !Subtarget->isDarwin() &&
+      TM.getRelocationModel() == Reloc::PIC_)
+    Kind = MCSymbolRefExpr::VK_PLT;
+  const MCSymbolRefExpr *TlsRef =
+    MCSymbolRefExpr::Create(TlsGetAddr, Kind, OutContext);
+  const MachineOperand &MO = MI->getOperand(2);
+  const GlobalValue *GValue = MO.getGlobal();
+  MCSymbol *MOSymbol = getSymbol(GValue);
+  const MCExpr *SymVar = MCSymbolRefExpr::Create(MOSymbol, VK, OutContext);
+  EmitToStreamer(OutStreamer,
+                 MCInstBuilder(Subtarget->isPPC64() ?
+                               PPC::BL8_NOP_TLS : PPC::BL_TLS)
+                 .addExpr(TlsRef)
+                 .addExpr(SymVar));
+}
 
 /// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to
 /// the current output stream.
 ///
 void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   MCInst TmpInst;
-  bool isPPC64 = Subtarget.isPPC64();
+  bool isPPC64 = Subtarget->isPPC64();
   bool isDarwin = Triple(TM.getTargetTriple()).isOSDarwin();
   const Module *M = MF->getFunction()->getParent();
   PICLevel::Level PL = M->getPICLevel();
@@ -332,6 +458,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   default: break;
   case TargetOpcode::DBG_VALUE:
     llvm_unreachable("Should be handled target independently");
+  case TargetOpcode::STACKMAP:
+    return LowerSTACKMAP(OutStreamer, SM, *MI);
+  case TargetOpcode::PATCHPOINT:
+    return LowerPATCHPOINT(OutStreamer, SM, *MI);
+
   case PPC::MoveGOTtoLR: {
     // Transform %LR = MoveGOTtoLR
     // Into this: bl _GLOBAL_OFFSET_TABLE_@local-4
@@ -602,7 +733,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   case PPC::ADDISgotTprelHA: {
     // Transform: %Xd = ADDISgotTprelHA %X2, <ga:@sym>
     // Into:      %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
-    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
@@ -611,7 +742,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
                               OutContext);
     EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                 .addReg(MI->getOperand(0).getReg())
-                                .addReg(PPC::X2)
+                                .addReg(MI->getOperand(1).getReg())
                                 .addExpr(SymGotTprel));
     return;
   }
@@ -681,7 +812,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   case PPC::ADDIStlsgdHA: {
     // Transform: %Xd = ADDIStlsgdHA %X2, <ga:@sym>
     // Into:      %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
-    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
@@ -690,7 +821,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
                               OutContext);
     EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                 .addReg(MI->getOperand(0).getReg())
-                                .addReg(PPC::X2)
+                                .addReg(MI->getOperand(1).getReg())
                                 .addExpr(SymGotTlsGD));
     return;
   }
@@ -703,22 +834,30 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymGotTlsGD =
-      MCSymbolRefExpr::Create(MOSymbol, Subtarget.isPPC64() ?
-                                         MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO :
-                                         MCSymbolRefExpr::VK_PPC_GOT_TLSGD,
-                              OutContext);
+    const MCExpr *SymGotTlsGD = MCSymbolRefExpr::Create(
+        MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO
+                                       : MCSymbolRefExpr::VK_PPC_GOT_TLSGD,
+        OutContext);
     EmitToStreamer(OutStreamer,
-                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                   MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
                    .addReg(MI->getOperand(0).getReg())
                    .addReg(MI->getOperand(1).getReg())
                    .addExpr(SymGotTlsGD));
     return;
   }
+  case PPC::GETtlsADDR:
+    // Transform: %X3 = GETtlsADDR %X3, <ga:@sym>
+    // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd)
+  case PPC::GETtlsADDR32: {
+    // Transform: %R3 = GETtlsADDR32 %R3, <ga:@sym>
+    // Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT
+    EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSGD);
+    return;
+  }
   case PPC::ADDIStlsldHA: {
     // Transform: %Xd = ADDIStlsldHA %X2, <ga:@sym>
     // Into:      %Xd = ADDIS8 %X2, sym@got@tlsld@ha
-    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
@@ -727,7 +866,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
                               OutContext);
     EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                 .addReg(MI->getOperand(0).getReg())
-                                .addReg(PPC::X2)
+                                .addReg(MI->getOperand(1).getReg())
                                 .addExpr(SymGotTlsLD));
     return;
   }
@@ -740,16 +879,24 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymGotTlsLD =
-      MCSymbolRefExpr::Create(MOSymbol, Subtarget.isPPC64() ?
-                                         MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO :
-                                         MCSymbolRefExpr::VK_PPC_GOT_TLSLD,
-                              OutContext);
+    const MCExpr *SymGotTlsLD = MCSymbolRefExpr::Create(
+        MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO
+                                       : MCSymbolRefExpr::VK_PPC_GOT_TLSLD,
+        OutContext);
     EmitToStreamer(OutStreamer,
-                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
-                   .addReg(MI->getOperand(0).getReg())
-                   .addReg(MI->getOperand(1).getReg())
-                   .addExpr(SymGotTlsLD));
+                   MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                       .addReg(MI->getOperand(0).getReg())
+                       .addReg(MI->getOperand(1).getReg())
+                       .addExpr(SymGotTlsLD));
+    return;
+  }
+  case PPC::GETtlsldADDR:
+    // Transform: %X3 = GETtlsldADDR %X3, <ga:@sym>
+    // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsld)
+  case PPC::GETtlsldADDR32: {
+    // Transform: %R3 = GETtlsldADDR32 %R3, <ga:@sym>
+    // Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT
+    EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSLD);
     return;
   }
   case PPC::ADDISdtprelHA:
@@ -764,11 +911,12 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const MCExpr *SymDtprel =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
                               OutContext);
-    EmitToStreamer(OutStreamer,
-                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDIS8 : PPC::ADDIS)
-                   .addReg(MI->getOperand(0).getReg())
-                   .addReg(Subtarget.isPPC64() ? PPC::X3 : PPC::R3)
-                   .addExpr(SymDtprel));
+    EmitToStreamer(
+        OutStreamer,
+        MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDIS8 : PPC::ADDIS)
+            .addReg(MI->getOperand(0).getReg())
+            .addReg(Subtarget->isPPC64() ? PPC::X3 : PPC::R3)
+            .addExpr(SymDtprel));
     return;
   }
   case PPC::ADDIdtprelL:
@@ -784,15 +932,15 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
                               OutContext);
     EmitToStreamer(OutStreamer,
-                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
-                   .addReg(MI->getOperand(0).getReg())
-                   .addReg(MI->getOperand(1).getReg())
-                   .addExpr(SymDtprel));
+                   MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                       .addReg(MI->getOperand(0).getReg())
+                       .addReg(MI->getOperand(1).getReg())
+                       .addExpr(SymDtprel));
     return;
   }
   case PPC::MFOCRF:
   case PPC::MFOCRF8:
-    if (!Subtarget.hasMFOCRF()) {
+    if (!Subtarget->hasMFOCRF()) {
       // Transform: %R3 = MFOCRF %CR7
       // Into:      %R3 = MFCR   ;; cr7
       unsigned NewOpcode =
@@ -806,7 +954,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     break;
   case PPC::MTOCRF:
   case PPC::MTOCRF8:
-    if (!Subtarget.hasMFOCRF()) {
+    if (!Subtarget->hasMFOCRF()) {
       // Transform: %CR7 = MTOCRF %R3
       // Into:      MTCRF mask, %R3 ;; cr7
       unsigned NewOpcode =
@@ -831,7 +979,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     // suite shows a handful of test cases that fail this check for
     // Darwin.  Those need to be investigated before this sanity test
     // can be enabled for those subtargets.
-    if (!Subtarget.isDarwin()) {
+    if (!Subtarget->isDarwin()) {
       unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
       const MachineOperand &MO = MI->getOperand(OpNum);
       if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
@@ -847,7 +995,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 }
 
 void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
-  if (Subtarget.isELFv2ABI()) {
+  if (static_cast<const PPCTargetMachine &>(TM).isELFv2ABI()) {
     PPCTargetStreamer *TS =
       static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer());
 
@@ -855,15 +1003,15 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
       TS->emitAbiVersion(2);
   }
 
-  if (Subtarget.isPPC64() || TM.getRelocationModel() != Reloc::PIC_)
+  if (static_cast<const PPCTargetMachine &>(TM).isPPC64() ||
+      TM.getRelocationModel() != Reloc::PIC_)
     return AsmPrinter::EmitStartOfAsmFile(M);
 
   if (M.getPICLevel() == PICLevel::Small)
     return AsmPrinter::EmitStartOfAsmFile(M);
 
-  OutStreamer.SwitchSection(OutContext.getELFSection(".got2",
-         ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
-         SectionKind::getReadOnly()));
+  OutStreamer.SwitchSection(OutContext.getELFSection(
+      ".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC));
 
   MCSymbol *TOCSym = OutContext.GetOrCreateSymbol(Twine(".LTOC"));
   MCSymbol *CurrentPos = OutContext.CreateTempSymbol();
@@ -884,12 +1032,12 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
 
 void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
   // linux/ppc32 - Normal entry label.
-  if (!Subtarget.isPPC64() && 
+  if (!Subtarget->isPPC64() && 
       (TM.getRelocationModel() != Reloc::PIC_ || 
        MF->getFunction()->getParent()->getPICLevel() == PICLevel::Small))
     return AsmPrinter::EmitFunctionEntryLabel();
 
-  if (!Subtarget.isPPC64()) {
+  if (!Subtarget->isPPC64()) {
     const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
   	if (PPCFI->usesPICBase()) {
       MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol();
@@ -910,14 +1058,13 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
   }
 
   // ELFv2 ABI - Normal entry label.
-  if (Subtarget.isELFv2ABI())
+  if (Subtarget->isELFv2ABI())
     return AsmPrinter::EmitFunctionEntryLabel();
 
   // Emit an official procedure descriptor.
   MCSectionSubPair Current = OutStreamer.getCurrentSection();
-  const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".opd",
-      ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
-      SectionKind::getReadOnly());
+  const MCSectionELF *Section = OutStreamer.getContext().getELFSection(
+      ".opd", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
   OutStreamer.SwitchSection(Section);
   OutStreamer.EmitLabel(CurrentFnSym);
   OutStreamer.EmitValueToAlignment(8);
@@ -944,7 +1091,7 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
 
 
 bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
-  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = TM.getDataLayout();
 
   bool isPPC64 = TD->getPointerSizeInBits() == 64;
 
@@ -955,13 +1102,11 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
     const MCSectionELF *Section;
     
     if (isPPC64)
-      Section = OutStreamer.getContext().getELFSection(".toc",
-        ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
-        SectionKind::getReadOnly());
-	else
-      Section = OutStreamer.getContext().getELFSection(".got2",
-        ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
-        SectionKind::getReadOnly());
+      Section = OutStreamer.getContext().getELFSection(
+          ".toc", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+        else
+          Section = OutStreamer.getContext().getELFSection(
+              ".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
     OutStreamer.SwitchSection(Section);
 
     for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(),
@@ -1015,7 +1160,7 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
   //
   // This ensures we have r2 set up correctly while executing the function
   // body, no matter which entry point is called.
-  if (Subtarget.isELFv2ABI()
+  if (Subtarget->isELFv2ABI()
       // Only do all that if the function uses r2 in the first place.
       && !MF->getRegInfo().use_empty(PPC::X2)) {
 
@@ -1070,7 +1215,7 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
   // FIXME: We should fill in the eight-byte mandatory fields as described in
   // the PPC64 ELF ABI (this is a low-priority item because GDB does not
   // currently make use of these fields).
-  if (Subtarget.isPPC64()) {
+  if (Subtarget->isPPC64()) {
     OutStreamer.EmitIntValue(0, 4/*size*/);
     OutStreamer.EmitIntValue(0, 8/*size*/);
   }
@@ -1101,13 +1246,21 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
     "ppc64le"
   };
 
-  unsigned Directive = Subtarget.getDarwinDirective();
-  if (Subtarget.hasMFOCRF() && Directive < PPC::DIR_970)
-    Directive = PPC::DIR_970;
-  if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400)
-    Directive = PPC::DIR_7400;
-  if (Subtarget.isPPC64() && Directive < PPC::DIR_64)
-    Directive = PPC::DIR_64;
+  // Get the numerically largest directive.
+  // FIXME: How should we merge darwin directives?
+  unsigned Directive = PPC::DIR_NONE;
+  for (const Function &F : M) {
+    const PPCSubtarget &STI = TM.getSubtarget<PPCSubtarget>(F);
+    unsigned FDir = STI.getDarwinDirective();
+    Directive = Directive > FDir ? FDir : STI.getDarwinDirective();
+    if (STI.hasMFOCRF() && Directive < PPC::DIR_970)
+      Directive = PPC::DIR_970;
+    if (STI.hasAltivec() && Directive < PPC::DIR_7400)
+      Directive = PPC::DIR_7400;
+    if (STI.isPPC64() && Directive < PPC::DIR_64)
+      Directive = PPC::DIR_64;
+  }
+
   assert(Directive <= PPC::DIR_64 && "Directive out of range.");
 
   assert(Directive < array_lengthof(CPUDirectives) &&
@@ -1150,10 +1303,18 @@ static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) {
 
 void PPCDarwinAsmPrinter::
 EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
-  bool isPPC64 =
-      TM.getSubtargetImpl()->getDataLayout()->getPointerSizeInBits() == 64;
-  bool isDarwin = Subtarget.isDarwin();
-  
+  bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
+
+  // Construct a local MCSubtargetInfo and shadow EmitToStreamer here.
+  // This is because the MachineFunction won't exist (but have not yet been
+  // freed) and since we're at the global level we can use the default
+  // constructed subtarget.
+  std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+      TM.getTargetTriple(), TM.getTargetCPU(), TM.getTargetFeatureString()));
+  auto EmitToStreamer = [&STI] (MCStreamer &S, const MCInst &Inst) {
+    S.EmitInstruction(Inst, *STI);
+  };
+
   const TargetLoweringObjectFileMachO &TLOFMacho = 
     static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
 
@@ -1192,7 +1353,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
       // mflr r11
       EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R11));
       // addis r11, r11, ha16(LazyPtr - AnonSymbol)
-      const MCExpr *SubHa16 = PPCMCExpr::CreateHa(Sub, isDarwin, OutContext);
+      const MCExpr *SubHa16 = PPCMCExpr::CreateHa(Sub, true, OutContext);
       EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS)
         .addReg(PPC::R11)
         .addReg(PPC::R11)
@@ -1202,7 +1363,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
 
       // ldu r12, lo16(LazyPtr - AnonSymbol)(r11)
       // lwzu r12, lo16(LazyPtr - AnonSymbol)(r11)
-      const MCExpr *SubLo16 = PPCMCExpr::CreateLo(Sub, isDarwin, OutContext);
+      const MCExpr *SubLo16 = PPCMCExpr::CreateLo(Sub, true, OutContext);
       EmitToStreamer(OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
         .addReg(PPC::R12)
         .addExpr(SubLo16).addExpr(SubLo16)
@@ -1248,7 +1409,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
 
     // lis r11, ha16(LazyPtr)
     const MCExpr *LazyPtrHa16 =
-      PPCMCExpr::CreateHa(LazyPtrExpr, isDarwin, OutContext);
+      PPCMCExpr::CreateHa(LazyPtrExpr, true, OutContext);
     EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LIS)
       .addReg(PPC::R11)
       .addExpr(LazyPtrHa16));
@@ -1256,7 +1417,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
     // ldu r12, lo16(LazyPtr)(r11)
     // lwzu r12, lo16(LazyPtr)(r11)
     const MCExpr *LazyPtrLo16 =
-      PPCMCExpr::CreateLo(LazyPtrExpr, isDarwin, OutContext);
+      PPCMCExpr::CreateLo(LazyPtrExpr, true, OutContext);
     EmitToStreamer(OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
       .addReg(PPC::R12)
       .addExpr(LazyPtrLo16).addExpr(LazyPtrLo16)
@@ -1287,8 +1448,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
 
 
 bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
-  bool isPPC64 =
-      TM.getSubtargetImpl()->getDataLayout()->getPointerSizeInBits() == 64;
+  bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
 
   // Darwin/PPC always uses mach-o.
   const TargetLoweringObjectFileMachO &TLOFMacho = 
@@ -1383,13 +1543,12 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
 /// for a MachineFunction to the given output stream, in a format that the
 /// Darwin assembler can deal with.
 ///
-static AsmPrinter *createPPCAsmPrinterPass(TargetMachine &tm,
-                                           MCStreamer &Streamer) {
-  const PPCSubtarget *Subtarget = &tm.getSubtarget<PPCSubtarget>();
-
-  if (Subtarget->isDarwin())
-    return new PPCDarwinAsmPrinter(tm, Streamer);
-  return new PPCLinuxAsmPrinter(tm, Streamer);
+static AsmPrinter *
+createPPCAsmPrinterPass(TargetMachine &tm,
+                        std::unique_ptr<MCStreamer> &&Streamer) {
+  if (Triple(tm.getTargetTriple()).isMacOSX())
+    return new PPCDarwinAsmPrinter(tm, std::move(Streamer));
+  return new PPCLinuxAsmPrinter(tm, std::move(Streamer));
 }
 
 // Force static initialization.
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index 41594be..940d55a 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -70,12 +70,37 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
   Fn.RenumberBlocks();
   BlockSizes.resize(Fn.getNumBlockIDs());
 
+  auto GetAlignmentAdjustment =
+    [TII](MachineBasicBlock &MBB, unsigned Offset) -> unsigned {
+    unsigned Align = MBB.getAlignment();
+    if (!Align)
+      return 0;
+
+    unsigned AlignAmt = 1 << Align;
+    unsigned ParentAlign = MBB.getParent()->getAlignment();
+
+    if (Align <= ParentAlign)
+      return OffsetToAlignment(Offset, AlignAmt);
+
+    // The alignment of this MBB is larger than the function's alignment, so we
+    // can't tell whether or not it will insert nops. Assume that it will.
+    return AlignAmt + OffsetToAlignment(Offset, AlignAmt);
+  };
+
   // Measure each MBB and compute a size for the entire function.
   unsigned FuncSize = 0;
   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
        ++MFI) {
     MachineBasicBlock *MBB = MFI;
 
+    // The end of the previous block may have extra nops if this block has an
+    // alignment requirement.
+    if (MBB->getNumber() > 0) {
+      unsigned AlignExtra = GetAlignmentAdjustment(*MBB, FuncSize);
+      BlockSizes[MBB->getNumber()-1] += AlignExtra;
+      FuncSize += AlignExtra;
+    }
+
     unsigned BlockSize = 0;
     for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
          MBBI != EE; ++MBBI)
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 5f3b176..5af8aab 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -30,6 +30,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
@@ -42,7 +43,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
@@ -94,8 +94,8 @@ namespace {
     bool runOnFunction(Function &F) override;
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<LoopInfo>();
-      AU.addPreserved<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<ScalarEvolution>();
@@ -146,7 +146,7 @@ namespace {
 INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
                     false, false)
@@ -168,12 +168,13 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() {
 #endif // NDEBUG
 
 bool PPCCTRLoops::runOnFunction(Function &F) {
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   SE = &getAnalysis<ScalarEvolution>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  LibInfo = getAnalysisIfAvailable<TargetLibraryInfo>();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
 
   bool MadeChange = false;
 
@@ -194,6 +195,21 @@ static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
   return false;
 }
 
+// Determining the address of a TLS variable results in a function call in
+// certain TLS models.
+static bool memAddrUsesCTR(const PPCTargetMachine *TM,
+                           const llvm::Value *MemAddr) {
+  const auto *GV = dyn_cast<GlobalValue>(MemAddr);
+  if (!GV)
+    return false;
+  if (!GV->isThreadLocal())
+    return false;
+  if (!TM)
+    return true;
+  TLSModel::Model Model = TM->getTLSModel(GV);
+  return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
+}
+
 bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
   for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
        J != JE; ++J) {
@@ -214,7 +230,8 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
 
       if (!TM)
         return true;
-      const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
+      const TargetLowering *TLI =
+          TM->getSubtargetImpl(*BB->getParent())->getTargetLowering();
 
       if (Function *F = CI->getCalledFunction()) {
         // Most intrinsics don't become function calls, but some might.
@@ -384,11 +401,15 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
       if (!TM)
         return true;
-      const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
+      const TargetLowering *TLI =
+          TM->getSubtargetImpl(*BB->getParent())->getTargetLowering();
 
       if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
         return true;
     }
+    for (Value *Operand : J->operands())
+      if (memAddrUsesCTR(TM, Operand))
+        return true;
   }
 
   return false;
diff --git a/lib/Target/PowerPC/PPCCallingConv.h b/lib/Target/PowerPC/PPCCallingConv.h
new file mode 100644
index 0000000..eb904a8
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCallingConv.h
@@ -0,0 +1,35 @@
+//=== PPCCallingConv.h - PPC Custom Calling Convention Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the PPC Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_PPC_PPCCALLINGCONV_H
+#define LLVM_LIB_TARGET_PPC_PPCCALLINGCONV_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+inline bool CC_PPC_AnyReg_Error(unsigned &, MVT &, MVT &,
+                                CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+                                CCState &) {
+  llvm_unreachable("The AnyReg calling convention is only supported by the " \
+                   "stackmap and patchpoint intrinsics.");
+  // gracefully fallback to PPC C calling convention on Release builds.
+  return false;
+}
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index cf8fee4..045fca3 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -28,8 +28,21 @@ class CCIfNotSubtarget<string F, CCAction A>
 // Return Value Calling Convention
 //===----------------------------------------------------------------------===//
 
+// PPC64 AnyReg return-value convention. No explicit register is specified for
+// the return-value. The register allocator is allowed and expected to choose
+// any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the PPC C calling convention.
+def RetCC_PPC64_AnyReg : CallingConv<[
+  CCCustom<"CC_PPC_AnyReg_Error">
+]>;
+
 // Return-value convention for PowerPC
 def RetCC_PPC : CallingConv<[
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
+
   // On PPC64, integer return values are always promoted to i64
   CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
   CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType<i32>>>,
@@ -42,15 +55,28 @@ def RetCC_PPC : CallingConv<[
   // only the ELFv2 ABI fully utilizes all these registers.
   CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
   CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
-  
+
+  // QPX vectors are returned in QF1 and QF2. 
+  CCIfType<[v4f64, v4f32, v4i1],
+           CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
+ 
   // Vector types returned as "direct" go into V2 .. V9; note that only the
   // ELFv2 ABI fully utilizes all these registers.
-  CCIfType<[v16i8, v8i16, v4i32, v4f32],
-           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>,
-  CCIfType<[v2f64, v2i64],
-           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>
+  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCIfSubtarget<"hasAltivec()",
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
+  CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()",
+           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>>
 ]>;
 
+// No explicit register is specified for the AnyReg calling convention. The
+// register allocator may assign the arguments to any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the PPC C calling convention.
+def CC_PPC64_AnyReg : CallingConv<[
+  CCCustom<"CC_PPC_AnyReg_Error">
+]>;
 
 // Note that we don't currently have calling conventions for 64-bit
 // PowerPC, but handle all the complexities of the ABI in the lowering
@@ -61,6 +87,8 @@ def RetCC_PPC : CallingConv<[
 // Only handle ints and floats.  All ints are promoted to i64.
 // Vector types and quadword ints are not handled.
 def CC_PPC64_ELF_FIS : CallingConv<[
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_PPC64_AnyReg>>,
+
   CCIfType<[i1],  CCPromoteToType<i64>>,
   CCIfType<[i8],  CCPromoteToType<i64>>,
   CCIfType<[i16], CCPromoteToType<i64>>,
@@ -74,6 +102,8 @@ def CC_PPC64_ELF_FIS : CallingConv<[
 // and multiple register returns are "supported" to avoid compile
 // errors, but none are handled by the fast selector.
 def RetCC_PPC64_ELF_FIS : CallingConv<[
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
+
   CCIfType<[i1],   CCPromoteToType<i64>>,
   CCIfType<[i8],   CCPromoteToType<i64>>,
   CCIfType<[i16],  CCPromoteToType<i64>>,
@@ -82,10 +112,12 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[
   CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
   CCIfType<[f32],  CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
   CCIfType<[f64],  CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
-  CCIfType<[v16i8, v8i16, v4i32, v4f32],
-           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>,
-  CCIfType<[v2f64, v2i64],
-           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>
+  CCIfType<[v4f64, v4f32, v4i1],
+           CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
+  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCIfSubtarget<"hasAltivec()",
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
+  CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()",
+           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>>
 ]>;
 
 //===----------------------------------------------------------------------===//
@@ -118,6 +150,9 @@ def CC_PPC32_SVR4_Common : CallingConv<[
   // alignment and size as doubles.
   CCIfType<[f32,f64], CCAssignToStack<8, 8>>,  
 
+  // QPX vectors that are stored in double precision need 32-byte alignment.
+  CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>,
+
   // Vectors get 16-byte stack slots that are 16-byte aligned.
   CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>
 ]>;
@@ -132,12 +167,17 @@ def CC_PPC32_SVR4_VarArg : CallingConv<[
 // In contrast to CC_PPC32_SVR4_VarArg, this calling convention first tries to
 // put vector arguments in vector registers before putting them on the stack.
 def CC_PPC32_SVR4 : CallingConv<[
+  // QPX vectors mirror the scalar FP convention.
+  CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()",
+    CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>,
+
   // The first 12 Vector arguments are passed in AltiVec registers.
-  CCIfType<[v16i8, v8i16, v4i32, v4f32],
-           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13]>>,
-  CCIfType<[v2f64, v2i64],
+  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCIfSubtarget<"hasAltivec()",
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9,
+                          V10, V11, V12, V13]>>>,
+  CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()",
            CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9,
-                          VSH10, VSH11, VSH12, VSH13]>>,
+                          VSH10, VSH11, VSH12, VSH13]>>>,
            
   CCDelegateTo<CC_PPC32_SVR4_Common>
 ]>;  
@@ -198,8 +238,23 @@ def CSR_SVR464   : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
                                         F27, F28, F29, F30, F31, CR2, CR3, CR4
                                    )>;
 
-
 def CSR_SVR464_Altivec : CalleeSavedRegs<(add CSR_SVR464, CSR_Altivec)>;
 
+def CSR_SVR464_R2 : CalleeSavedRegs<(add CSR_SVR464, X2)>;
+
+def CSR_SVR464_R2_Altivec : CalleeSavedRegs<(add CSR_SVR464_Altivec, X2)>;
+
 def CSR_NoRegs : CalleeSavedRegs<(add)>;
 
+def CSR_64_AllRegs: CalleeSavedRegs<(add X0, (sequence "X%u", 3, 10),
+                                             (sequence "X%u", 14, 31),
+                                             (sequence "F%u", 0, 31),
+                                             (sequence "CR%u", 0, 7))>;
+
+def CSR_64_AllRegs_Altivec : CalleeSavedRegs<(add CSR_64_AllRegs,
+                                             (sequence "V%u", 0, 31))>;
+
+def CSR_64_AllRegs_VSX : CalleeSavedRegs<(add CSR_64_AllRegs_Altivec,
+                                         (sequence "VSL%u", 0, 31),
+                                         (sequence "VSH%u", 0, 31))>;
+
diff --git a/lib/Target/PowerPC/PPCEarlyReturn.cpp b/lib/Target/PowerPC/PPCEarlyReturn.cpp
new file mode 100644
index 0000000..08673cc
--- /dev/null
+++ b/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -0,0 +1,201 @@
+//===------------- PPCEarlyReturn.cpp - Form Early Returns ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass that form early (predicated) returns. If-conversion handles some of
+// this, but this pass picks up some remaining cases.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-early-ret"
+STATISTIC(NumBCLR, "Number of early conditional returns");
+STATISTIC(NumBLR,  "Number of early returns");
+
+namespace llvm {
+  void initializePPCEarlyReturnPass(PassRegistry&);
+}
+
+namespace {
+  // PPCEarlyReturn pass - For simple functions without epilogue code, move
+  // returns up, and create conditional returns, to avoid unnecessary
+  // branch-to-blr sequences.
+  struct PPCEarlyReturn : public MachineFunctionPass {
+    static char ID;
+    PPCEarlyReturn() : MachineFunctionPass(ID) {
+      initializePPCEarlyReturnPass(*PassRegistry::getPassRegistry());
+    }
+
+    const TargetInstrInfo *TII;
+
+protected:
+    bool processBlock(MachineBasicBlock &ReturnMBB) {
+      bool Changed = false;
+
+      MachineBasicBlock::iterator I = ReturnMBB.begin();
+      I = ReturnMBB.SkipPHIsAndLabels(I);
+
+      // The block must be essentially empty except for the blr.
+      if (I == ReturnMBB.end() ||
+          (I->getOpcode() != PPC::BLR && I->getOpcode() != PPC::BLR8) ||
+          I != ReturnMBB.getLastNonDebugInstr())
+        return Changed;
+
+      SmallVector<MachineBasicBlock*, 8> PredToRemove;
+      for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(),
+           PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) {
+        bool OtherReference = false, BlockChanged = false;
+        for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) {
+          MachineInstrBuilder MIB;
+          if (J->getOpcode() == PPC::B) {
+            if (J->getOperand(0).getMBB() == &ReturnMBB) {
+              // This is an unconditional branch to the return. Replace the
+              // branch with a blr.
+              MIB =
+                BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode()));
+              MIB.copyImplicitOps(I);
+              MachineBasicBlock::iterator K = J--;
+              K->eraseFromParent();
+              BlockChanged = true;
+              ++NumBLR;
+              continue;
+            }
+          } else if (J->getOpcode() == PPC::BCC) {
+            if (J->getOperand(2).getMBB() == &ReturnMBB) {
+              // This is a conditional branch to the return. Replace the branch
+              // with a bclr.
+              MIB = BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
+                      .addImm(J->getOperand(0).getImm())
+                      .addReg(J->getOperand(1).getReg());
+              MIB.copyImplicitOps(I);
+              MachineBasicBlock::iterator K = J--;
+              K->eraseFromParent();
+              BlockChanged = true;
+              ++NumBCLR;
+              continue;
+            }
+          } else if (J->getOpcode() == PPC::BC || J->getOpcode() == PPC::BCn) {
+            if (J->getOperand(1).getMBB() == &ReturnMBB) {
+              // This is a conditional branch to the return. Replace the branch
+              // with a bclr.
+              MIB = BuildMI(**PI, J, J->getDebugLoc(),
+                            TII->get(J->getOpcode() == PPC::BC ?
+                                     PPC::BCLR : PPC::BCLRn))
+                      .addReg(J->getOperand(0).getReg());
+              MIB.copyImplicitOps(I);
+              MachineBasicBlock::iterator K = J--;
+              K->eraseFromParent();
+              BlockChanged = true;
+              ++NumBCLR;
+              continue;
+            }
+          } else if (J->isBranch()) {
+            if (J->isIndirectBranch()) {
+              if (ReturnMBB.hasAddressTaken())
+                OtherReference = true;
+            } else
+              for (unsigned i = 0; i < J->getNumOperands(); ++i)
+                if (J->getOperand(i).isMBB() &&
+                    J->getOperand(i).getMBB() == &ReturnMBB)
+                  OtherReference = true;
+          } else if (!J->isTerminator() && !J->isDebugValue())
+            break;
+
+          if (J == (*PI)->begin())
+            break;
+
+          --J;
+        }
+
+        if ((*PI)->canFallThrough() && (*PI)->isLayoutSuccessor(&ReturnMBB))
+          OtherReference = true;
+
+        // Predecessors are stored in a vector and can't be removed here.
+        if (!OtherReference && BlockChanged) {
+          PredToRemove.push_back(*PI);
+        }
+
+        if (BlockChanged)
+          Changed = true;
+      }
+
+      for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i)
+        PredToRemove[i]->removeSuccessor(&ReturnMBB);
+
+      if (Changed && !ReturnMBB.hasAddressTaken()) {
+        // We now might be able to merge this blr-only block into its
+        // by-layout predecessor.
+        if (ReturnMBB.pred_size() == 1 &&
+            (*ReturnMBB.pred_begin())->isLayoutSuccessor(&ReturnMBB)) {
+          // Move the blr into the preceding block.
+          MachineBasicBlock &PrevMBB = **ReturnMBB.pred_begin();
+          PrevMBB.splice(PrevMBB.end(), &ReturnMBB, I);
+          PrevMBB.removeSuccessor(&ReturnMBB);
+        }
+
+        if (ReturnMBB.pred_empty())
+          ReturnMBB.eraseFromParent();
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      TII = MF.getSubtarget().getInstrInfo();
+
+      bool Changed = false;
+
+      // If the function does not have at least two blocks, then there is
+      // nothing to do.
+      if (MF.size() < 2)
+        return Changed;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS(PPCEarlyReturn, DEBUG_TYPE,
+                "PowerPC Early-Return Creation", false, false)
+
+char PPCEarlyReturn::ID = 0;
+FunctionPass*
+llvm::createPPCEarlyReturnPass() { return new PPCEarlyReturn(); }
+
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 1149354..54532b5 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -15,7 +15,9 @@
 
 #include "PPC.h"
 #include "MCTargetDesc/PPCPredicates.h"
+#include "PPCCallingConv.h"
 #include "PPCISelLowering.h"
+#include "PPCMachineFunctionInfo.h"
 #include "PPCSubtarget.h"
 #include "PPCTargetMachine.h"
 #include "llvm/ADT/Optional.h"
@@ -84,18 +86,20 @@ typedef struct Address {
 class PPCFastISel final : public FastISel {
 
   const TargetMachine &TM;
+  const PPCSubtarget *PPCSubTarget;
+  PPCFunctionInfo *PPCFuncInfo;
   const TargetInstrInfo &TII;
   const TargetLowering &TLI;
-  const PPCSubtarget *PPCSubTarget;
   LLVMContext *Context;
 
   public:
     explicit PPCFastISel(FunctionLoweringInfo &FuncInfo,
                          const TargetLibraryInfo *LibInfo)
         : FastISel(FuncInfo, LibInfo), TM(FuncInfo.MF->getTarget()),
-          TII(*TM.getSubtargetImpl()->getInstrInfo()),
-          TLI(*TM.getSubtargetImpl()->getTargetLowering()),
-          PPCSubTarget(&TM.getSubtarget<PPCSubtarget>()),
+          PPCSubTarget(&FuncInfo.MF->getSubtarget<PPCSubtarget>()),
+          PPCFuncInfo(FuncInfo.MF->getInfo<PPCFunctionInfo>()),
+          TII(*PPCSubTarget->getInstrInfo()),
+          TLI(*PPCSubTarget->getTargetLowering()),
           Context(&FuncInfo.Fn->getContext()) {}
 
   // Backend specific FastISel code.
@@ -119,6 +123,8 @@ class PPCFastISel final : public FastISel {
                              unsigned Op0, bool Op0IsKill,
                              unsigned Op1, bool Op1IsKill);
 
+    bool fastLowerCall(CallLoweringInfo &CLI) override;
+
   // Instruction selection routines.
   private:
     bool SelectLoad(const Instruction *I);
@@ -130,7 +136,6 @@ class PPCFastISel final : public FastISel {
     bool SelectIToFP(const Instruction *I, bool IsSigned);
     bool SelectFPToI(const Instruction *I, bool IsSigned);
     bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
-    bool SelectCall(const Instruction *I);
     bool SelectRet(const Instruction *I);
     bool SelectTrunc(const Instruction *I);
     bool SelectIntExt(const Instruction *I);
@@ -139,6 +144,9 @@ class PPCFastISel final : public FastISel {
   private:
     bool isTypeLegal(Type *Ty, MVT &VT);
     bool isLoadTypeLegal(Type *Ty, MVT &VT);
+    bool isVSFRCRegister(unsigned Register) const {
+      return MRI.getRegClass(Register)->getID() == PPC::VSFRCRegClassID;
+    }
     bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
                     bool isZExt, unsigned DestReg);
     bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
@@ -171,9 +179,7 @@ class PPCFastISel final : public FastISel {
                          CallingConv::ID CC,
                          unsigned &NumBytes,
                          bool IsVarArg);
-    void finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
-                    const Instruction *I, CallingConv::ID CC,
-                    unsigned &NumBytes, bool IsVarArg);
+    bool finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes);
     CCAssignFn *usePPC32CCs(unsigned Flag);
 
   private:
@@ -482,6 +488,16 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
   // the indexed form.  Also handle stack pointers with special needs.
   unsigned IndexReg = 0;
   PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+
+  // If this is a potential VSX load with an offset of 0, a VSX indexed load can
+  // be used.
+  bool IsVSFRC = (ResultReg != 0) && isVSFRCRegister(ResultReg);
+  if (IsVSFRC && (Opc == PPC::LFD) && 
+      (Addr.BaseType != Address::FrameIndexBase) && UseOffset &&
+      (Addr.Offset == 0)) {
+    UseOffset = false;
+  }
+
   if (ResultReg == 0)
     ResultReg = createResultReg(UseRC);
 
@@ -489,6 +505,8 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
   // in range, as otherwise PPCSimplifyAddress would have converted it
   // into a RegBase.
   if (Addr.BaseType == Address::FrameIndexBase) {
+    // VSX only provides an indexed load.
+    if (IsVSFRC && Opc == PPC::LFD) return false;
 
     MachineMemOperand *MMO =
       FuncInfo.MF->getMachineMemOperand(
@@ -501,6 +519,8 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
 
   // Base reg with offset in range.
   } else if (UseOffset) {
+    // VSX only provides an indexed load.
+    if (IsVSFRC && Opc == PPC::LFD) return false;
 
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
       .addImm(Addr.Offset).addReg(Addr.Base.Reg);
@@ -524,7 +544,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
       case PPC::LWA_32: Opc = PPC::LWAX_32; break;
       case PPC::LD:     Opc = PPC::LDX;     break;
       case PPC::LFS:    Opc = PPC::LFSX;    break;
-      case PPC::LFD:    Opc = PPC::LFDX;    break;
+      case PPC::LFD:    Opc = IsVSFRC ? PPC::LXSDX : PPC::LFDX; break;
     }
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
       .addReg(Addr.Base.Reg).addReg(IndexReg);
@@ -602,10 +622,22 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
   unsigned IndexReg = 0;
   PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
 
+  // If this is a potential VSX store with an offset of 0, a VSX indexed store
+  // can be used.
+  bool IsVSFRC = isVSFRCRegister(SrcReg);
+  if (IsVSFRC && (Opc == PPC::STFD) && 
+      (Addr.BaseType != Address::FrameIndexBase) && UseOffset && 
+      (Addr.Offset == 0)) {
+    UseOffset = false;
+  }
+
   // Note: If we still have a frame index here, we know the offset is
   // in range, as otherwise PPCSimplifyAddress would have converted it
   // into a RegBase.
   if (Addr.BaseType == Address::FrameIndexBase) {
+    // VSX only provides an indexed store.
+    if (IsVSFRC && Opc == PPC::STFD) return false;
+
     MachineMemOperand *MMO =
       FuncInfo.MF->getMachineMemOperand(
         MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
@@ -619,12 +651,15 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
         .addMemOperand(MMO);
 
   // Base reg with offset in range.
-  } else if (UseOffset)
+  } else if (UseOffset) {
+    // VSX only provides an indexed store.
+    if (IsVSFRC && Opc == PPC::STFD) return false;
+    
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
       .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
 
   // Indexed form.
-  else {
+  } else {
     // Get the RR opcode corresponding to the RI one.  FIXME: It would be
     // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
     // is hard to get at.
@@ -638,7 +673,7 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
       case PPC::STW8: Opc = PPC::STWX8; break;
       case PPC::STD:  Opc = PPC::STDX;  break;
       case PPC::STFS: Opc = PPC::STFSX; break;
-      case PPC::STFD: Opc = PPC::STFDX; break;
+      case PPC::STFD: Opc = IsVSFRC ? PPC::STXSDX : PPC::STFDX; break;
     }
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
       .addReg(SrcReg).addReg(Addr.Base.Reg).addReg(IndexReg);
@@ -1202,9 +1237,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
   CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, *Context);
 
   // Reserve space for the linkage area on the stack.
-  bool isELFv2ABI = PPCSubTarget->isELFv2ABI();
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
-                                                          isELFv2ABI);
+  unsigned LinkageSize = PPCSubTarget->getFrameLowering()->getLinkageSize();
   CCInfo.AllocateStack(LinkageSize, 8);
 
   CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
@@ -1243,7 +1276,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
 
   // Prepare to assign register arguments.  Every argument uses up a
   // GPR protocol register even if it's passed in a floating-point
-  // register.
+  // register (unless we're using the fast calling convention).
   unsigned NextGPR = PPC::X3;
   unsigned NextFPR = PPC::F1;
 
@@ -1293,7 +1326,8 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
     unsigned ArgReg;
     if (ArgVT == MVT::f32 || ArgVT == MVT::f64) {
       ArgReg = NextFPR++;
-      ++NextGPR;
+      if (CC != CallingConv::Fast)
+        ++NextGPR;
     } else
       ArgReg = NextGPR++;
 
@@ -1307,9 +1341,9 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
 
 // For a call that we've determined we can fast-select, finish the
 // call sequence and generate a copy to obtain the return value (if any).
-void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
-                             const Instruction *I, CallingConv::ID CC,
-                             unsigned &NumBytes, bool IsVarArg) {
+bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes) {
+  CallingConv::ID CC = CLI.CallConv;
+
   // Issue CallSEQ_END.
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
           TII.get(TII.getCallFrameDestroyOpcode()))
@@ -1320,7 +1354,7 @@ void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
   // any real difficulties there.
   if (RetVT != MVT::isVoid) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, *Context);
+    CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
     CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
     CCValAssign &VA = RVLocs[0];
     assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
@@ -1365,39 +1399,35 @@ void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
     }
 
     assert(ResultReg && "ResultReg unset!");
-    UsedRegs.push_back(SourcePhysReg);
-    updateValueMap(I, ResultReg);
+    CLI.InRegs.push_back(SourcePhysReg);
+    CLI.ResultReg = ResultReg;
+    CLI.NumResultRegs = 1;
   }
+
+  return true;
 }
 
-// Attempt to fast-select a call instruction.
-bool PPCFastISel::SelectCall(const Instruction *I) {
-  const CallInst *CI = cast<CallInst>(I);
-  const Value *Callee = CI->getCalledValue();
+bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) {
+  CallingConv::ID CC  = CLI.CallConv;
+  bool IsTailCall     = CLI.IsTailCall;
+  bool IsVarArg       = CLI.IsVarArg;
+  const Value *Callee = CLI.Callee;
+  const char *SymName = CLI.SymName;
 
-  // Can't handle inline asm.
-  if (isa<InlineAsm>(Callee))
+  if (!Callee && !SymName)
     return false;
 
   // Allow SelectionDAG isel to handle tail calls.
-  if (CI->isTailCall())
+  if (IsTailCall)
     return false;
 
-  // Obtain calling convention.
-  ImmutableCallSite CS(CI);
-  CallingConv::ID CC = CS.getCallingConv();
-
-  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
-  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
-  bool IsVarArg = FTy->isVarArg();
-
-  // Not ready for varargs yet.
+  // Let SDISel handle vararg functions.
   if (IsVarArg)
     return false;
 
   // Handle simple calls for now, with legal return types and
   // those that can be extended.
-  Type *RetTy = I->getType();
+  Type *RetTy = CLI.RetTy;
   MVT RetVT;
   if (RetTy->isVoidTy())
     RetVT = MVT::isVoid;
@@ -1418,7 +1448,7 @@ bool PPCFastISel::SelectCall(const Instruction *I) {
 
   // Bail early if more than 8 arguments, as we only currently
   // handle arguments passed in registers.
-  unsigned NumArgs = CS.arg_size();
+  unsigned NumArgs = CLI.OutVals.size();
   if (NumArgs > 8)
     return false;
 
@@ -1433,28 +1463,16 @@ bool PPCFastISel::SelectCall(const Instruction *I) {
   ArgVTs.reserve(NumArgs);
   ArgFlags.reserve(NumArgs);
 
-  for (ImmutableCallSite::arg_iterator II = CS.arg_begin(), IE = CS.arg_end();
-       II != IE; ++II) {
-    // FIXME: ARM does something for intrinsic calls here, check into that.
-
-    unsigned AttrIdx = II - CS.arg_begin() + 1;
-    
+  for (unsigned i = 0, ie = NumArgs; i != ie; ++i) {
     // Only handle easy calls for now.  It would be reasonably easy
     // to handle <= 8-byte structures passed ByVal in registers, but we
     // have to ensure they are right-justified in the register.
-    if (CS.paramHasAttr(AttrIdx, Attribute::InReg) ||
-        CS.paramHasAttr(AttrIdx, Attribute::StructRet) ||
-        CS.paramHasAttr(AttrIdx, Attribute::Nest) ||
-        CS.paramHasAttr(AttrIdx, Attribute::ByVal))
+    ISD::ArgFlagsTy Flags = CLI.OutFlags[i];
+    if (Flags.isInReg() || Flags.isSRet() || Flags.isNest() || Flags.isByVal())
       return false;
 
-    ISD::ArgFlagsTy Flags;
-    if (CS.paramHasAttr(AttrIdx, Attribute::SExt))
-      Flags.setSExt();
-    if (CS.paramHasAttr(AttrIdx, Attribute::ZExt))
-      Flags.setZExt();
-
-    Type *ArgTy = (*II)->getType();
+    Value *ArgValue = CLI.OutVals[i];
+    Type *ArgTy = ArgValue->getType();
     MVT ArgVT;
     if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8)
       return false;
@@ -1462,14 +1480,11 @@ bool PPCFastISel::SelectCall(const Instruction *I) {
     if (ArgVT.isVector())
       return false;
 
-    unsigned Arg = getRegForValue(*II);
+    unsigned Arg = getRegForValue(ArgValue);
     if (Arg == 0)
       return false;
 
-    unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
-    Flags.setOrigAlign(OriginalAlignment);
-
-    Args.push_back(*II);
+    Args.push_back(ArgValue);
     ArgRegs.push_back(Arg);
     ArgVTs.push_back(ArgVT);
     ArgFlags.push_back(Flags);
@@ -1483,39 +1498,46 @@ bool PPCFastISel::SelectCall(const Instruction *I) {
                        RegArgs, CC, NumBytes, IsVarArg))
     return false;
 
+  MachineInstrBuilder MIB;
   // FIXME: No handling for function pointers yet.  This requires
   // implementing the function descriptor (OPD) setup.
   const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
-  if (!GV)
-    return false;
-
-  // Build direct call with NOP for TOC restore.
-  // FIXME: We can and should optimize away the NOP for local calls.
-  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                                    TII.get(PPC::BL8_NOP));
-  // Add callee.
-  MIB.addGlobalAddress(GV);
+  if (!GV) {
+    // patchpoints are a special case; they always dispatch to a pointer value.
+    // However, we don't actually want to generate the indirect call sequence
+    // here (that will be generated, as necessary, during asm printing), and
+    // the call we generate here will be erased by FastISel::selectPatchpoint,
+    // so don't try very hard...
+    if (CLI.IsPatchPoint)
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::NOP));
+    else
+      return false;
+  } else {
+    // Build direct call with NOP for TOC restore.
+    // FIXME: We can and should optimize away the NOP for local calls.
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                  TII.get(PPC::BL8_NOP));
+    // Add callee.
+    MIB.addGlobalAddress(GV);
+  }
 
   // Add implicit physical register uses to the call.
   for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II)
     MIB.addReg(RegArgs[II], RegState::Implicit);
 
-  // Direct calls in the ELFv2 ABI need the TOC register live into the call.
-  if (PPCSubTarget->isELFv2ABI())
-    MIB.addReg(PPC::X2, RegState::Implicit);
+  // Direct calls, in both the ELF V1 and V2 ABIs, need the TOC register live
+  // into the call.
+  PPCFuncInfo->setUsesTOCBasePtr();
+  MIB.addReg(PPC::X2, RegState::Implicit);
 
   // Add a register mask with the call-preserved registers.  Proper
   // defs for return values will be added by setPhysRegsDeadExcept().
   MIB.addRegMask(TRI.getCallPreservedMask(CC));
 
-  // Finish off the call including any return values.
-  SmallVector<unsigned, 4> UsedRegs;
-  finishCall(RetVT, UsedRegs, I, CC, NumBytes, IsVarArg);
-
-  // Set all unused physregs defs as dead.
-  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+  CLI.Call = MIB;
 
-  return true;
+  // Finish off the call including any return values.
+  return finishCall(RetVT, CLI, NumBytes);
 }
 
 // Attempt to fast-select a return instruction.
@@ -1626,7 +1648,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
   }
 
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                                    TII.get(PPC::BLR));
+                                    TII.get(PPC::BLR8));
 
   for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
     MIB.addReg(RetRegs[i], RegState::Implicit);
@@ -1805,9 +1827,7 @@ bool PPCFastISel::fastSelectInstruction(const Instruction *I) {
     case Instruction::Sub:
       return SelectBinaryIntOp(I, ISD::SUB);
     case Instruction::Call:
-      if (dyn_cast<IntrinsicInst>(I))
-        return false;
-      return SelectCall(I);
+      return selectCall(I);
     case Instruction::Ret:
       return SelectRet(I);
     case Instruction::Trunc:
@@ -1846,6 +1866,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
   unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
   unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
 
+  PPCFuncInfo->setUsesTOCBasePtr();
   // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)).
   if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocCPT),
@@ -1895,6 +1916,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
   if (GV->isThreadLocal())
     return 0;
 
+  PPCFuncInfo->setUsesTOCBasePtr();
   // For small code model, generate a simple TOC load.
   if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtoc),
@@ -2077,7 +2099,7 @@ unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) {
   else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
     return PPCMaterializeGV(GV, VT);
   else if (isa<ConstantInt>(C))
-    return PPCMaterializeInt(C, VT);
+    return PPCMaterializeInt(C, VT, VT != MVT::i1);
 
   return 0;
 }
@@ -2280,13 +2302,10 @@ namespace llvm {
   // Create the fast instruction selector for PowerPC64 ELF.
   FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo,
                                 const TargetLibraryInfo *LibInfo) {
-    const TargetMachine &TM = FuncInfo.MF->getTarget();
-
     // Only available on 64-bit ELF for now.
-    const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>();
-    if (Subtarget->isPPC64() && Subtarget->isSVR4ABI())
+    const PPCSubtarget &Subtarget = FuncInfo.MF->getSubtarget<PPCSubtarget>();
+    if (Subtarget.isPPC64() && Subtarget.isSVR4ABI())
       return new PPCFastISel(FuncInfo, LibInfo);
-
     return nullptr;
   }
 }
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index dc87a6c..f997fea 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -16,6 +16,7 @@
 #include "PPCInstrInfo.h"
 #include "PPCMachineFunctionInfo.h"
 #include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -36,10 +37,58 @@ static const uint16_t VRRegNo[] = {
  PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31
 };
 
+static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) {
+  if (STI.isDarwinABI())
+    return STI.isPPC64() ? 16 : 8;
+  // SVR4 ABI:
+  return STI.isPPC64() ? 16 : 4;
+}
+
+static unsigned computeTOCSaveOffset(const PPCSubtarget &STI) {
+  return STI.isELFv2ABI() ? 24 : 40;
+}
+
+static unsigned computeFramePointerSaveOffset(const PPCSubtarget &STI) {
+  // For the Darwin ABI:
+  // We cannot use the TOC save slot (offset +20) in the PowerPC linkage area
+  // for saving the frame pointer (if needed.)  While the published ABI has
+  // not used this slot since at least MacOSX 10.2, there is older code
+  // around that does use it, and that needs to continue to work.
+  if (STI.isDarwinABI())
+    return STI.isPPC64() ? -8U : -4U;
+
+  // SVR4 ABI: First slot in the general register save area.
+  return STI.isPPC64() ? -8U : -4U;
+}
+
+static unsigned computeLinkageSize(const PPCSubtarget &STI) {
+  if (STI.isDarwinABI() || STI.isPPC64())
+    return (STI.isELFv2ABI() ? 4 : 6) * (STI.isPPC64() ? 8 : 4);
+
+  // SVR4 ABI:
+  return 8;
+}
+
+static unsigned computeBasePointerSaveOffset(const PPCSubtarget &STI) {
+  if (STI.isDarwinABI())
+    return STI.isPPC64() ? -16U : -8U;
+
+  // SVR4 ABI: First slot in the general register save area.
+  return STI.isPPC64()
+             ? -16U
+             : (STI.getTargetMachine().getRelocationModel() == Reloc::PIC_)
+                   ? -12U
+                   : -8U;
+}
+
 PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI)
     : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
-                          (STI.hasQPX() || STI.isBGQ()) ? 32 : 16, 0),
-      Subtarget(STI) {}
+                          STI.getPlatformStackAlignment(), 0),
+      Subtarget(STI), ReturnSaveOffset(computeReturnSaveOffset(Subtarget)),
+      TOCSaveOffset(computeTOCSaveOffset(Subtarget)),
+      FramePointerSaveOffset(computeFramePointerSaveOffset(Subtarget)),
+      LinkageSize(computeLinkageSize(Subtarget)),
+      BasePointerSaveOffset(computeBasePointerSaveOffset(STI)) {}
 
 // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
 const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
@@ -355,6 +404,20 @@ static bool hasNonRISpills(const MachineFunction &MF) {
   return FuncInfo->hasNonRISpills();
 }
 
+/// MustSaveLR - Return true if this function requires that we save the LR
+/// register onto the stack in the prolog and restore it in the epilog of the
+/// function.
+static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
+  const PPCFunctionInfo *MFI = MF.getInfo<PPCFunctionInfo>();
+
+  // We need a save/restore of LR if there is any def of LR (which is
+  // defined by calls, including the PIC setup sequence), or if there is
+  // some use of the LR stack slot (e.g. for builtin_return_address).
+  // (LR comes in 32 and 64 bit versions.)
+  MachineRegisterInfo::def_iterator RI = MF.getRegInfo().def_begin(LR);
+  return RI !=MF.getRegInfo().def_end() || MFI->isLRStoreRequired();
+}
+
 /// determineFrameLayout - Determine the size of the frame and maximum call
 /// frame size.
 unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
@@ -372,15 +435,15 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
   unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
 
   const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
 
   // If we are a leaf function, and use up to 224 bytes of stack space,
   // don't have a frame pointer, calls, or dynamic alloca then we do not need
   // to adjust the stack pointer (we fit in the Red Zone).
   // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate
   // stackless code if all local vars are reg-allocated.
-  bool DisableRedZone = MF.getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoRedZone);
+  bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+  unsigned LR = RegInfo->getRARegister();
   if (!DisableRedZone &&
       (Subtarget.isPPC64() ||                      // 32-bit SVR4, no stack-
        !Subtarget.isSVR4ABI() ||                   //   allocated locals.
@@ -388,6 +451,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
       FrameSize <= 224 &&                          // Fits in red zone.
       !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
       !MFI->adjustsStack() &&                      // No calls.
+      !MustSaveLR(MF, LR) &&
       !RegInfo->hasBasePointer(MF)) { // No special alignment.
     // No need for frame
     if (UpdateMF)
@@ -399,9 +463,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
   unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
 
   // Maximum call frame needs to be at least big enough for linkage area.
-  unsigned minCallFrameSize = getLinkageSize(Subtarget.isPPC64(),
-                                             Subtarget.isDarwinABI(),
-                                             Subtarget.isELFv2ABI());
+  unsigned minCallFrameSize = getLinkageSize();
   maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize);
 
   // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
@@ -444,12 +506,12 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const {
 
   // Naked functions have no stack frame pushed, so we don't have a frame
   // pointer.
-  if (MF.getFunction()->getAttributes().hasAttribute(
-          AttributeSet::FunctionIndex, Attribute::Naked))
+  if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
     return false;
 
   return MF.getTarget().Options.DisableFramePointerElim(MF) ||
     MFI->hasVarSizedObjects() ||
+    MFI->hasStackMap() || MFI->hasPatchPoint() ||
     (MF.getTarget().Options.GuaranteedTailCallOpt &&
      MF.getInfo<PPCFunctionInfo>()->hasFastCall());
 }
@@ -460,7 +522,7 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
   unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1;
 
   const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
   bool HasBP = RegInfo->hasBasePointer(MF);
   unsigned BPReg  = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg;
   unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg;
@@ -498,24 +560,22 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const PPCInstrInfo &TII =
-      *static_cast<const PPCInstrInfo *>(MF.getSubtarget().getInstrInfo());
+      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
   const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
 
   MachineModuleInfo &MMI = MF.getMMI();
   const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
   DebugLoc dl;
-  bool needsFrameMoves = MMI.hasDebugInfo() ||
+  bool needsCFI = MMI.hasDebugInfo() ||
     MF.getFunction()->needsUnwindTableEntry();
-  bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_;
 
   // Get processor type.
   bool isPPC64 = Subtarget.isPPC64();
   // Get the ABI.
-  bool isDarwinABI = Subtarget.isDarwinABI();
   bool isSVR4ABI = Subtarget.isSVR4ABI();
   bool isELFv2ABI = Subtarget.isELFv2ABI();
-  assert((isDarwinABI || isSVR4ABI) &&
+  assert((Subtarget.isDarwinABI() || isSVR4ABI) &&
          "Currently only Darwin and SVR4 ABIs are supported for PowerPC.");
 
   // Scan the prolog, looking for an UPDATE_VRSAVE instruction.  If we find it,
@@ -581,7 +641,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
   assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) &&
          "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4.");
 
-  int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
+  int LROffset = getReturnSaveOffset();
 
   int FPOffset = 0;
   if (HasFP) {
@@ -591,8 +651,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       assert(FPIndex && "No Frame Pointer Save Slot!");
       FPOffset = FFI->getObjectOffset(FPIndex);
     } else {
-      FPOffset =
-          PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+      FPOffset = getFramePointerSaveOffset();
     }
   }
 
@@ -604,13 +663,18 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       assert(BPIndex && "No Base Pointer Save Slot!");
       BPOffset = FFI->getObjectOffset(BPIndex);
     } else {
-      BPOffset =
-        PPCFrameLowering::getBasePointerSaveOffset(isPPC64,
-                                                   isDarwinABI,
-                                                   isPIC);
+      BPOffset = getBasePointerSaveOffset();
     }
   }
 
+  int PBPOffset = 0;
+  if (FI->usesPICBase()) {
+    MachineFrameInfo *FFI = MF.getFrameInfo();
+    int PBPIndex = FI->getPICBasePointerSaveIndex();
+    assert(PBPIndex && "No PIC Base Pointer Save Slot!");
+    PBPOffset = FFI->getObjectOffset(PBPIndex);
+  }
+
   // Get stack alignments.
   unsigned MaxAlign = MFI->getMaxAlignment();
   if (HasBP && MaxAlign > 1)
@@ -644,6 +708,13 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       .addImm(FPOffset)
       .addReg(SPReg);
 
+  if (FI->usesPICBase())
+    // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+    BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(PPC::R30)
+      .addImm(PBPOffset)
+      .addReg(SPReg);
+
   if (HasBP)
     // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
     BuildMI(MBB, MBBI, dl, StoreInst)
@@ -726,17 +797,28 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       .addReg(ScratchReg);
   }
 
-  // Add the "machine moves" for the instructions we generated above, but in
-  // reverse order.
-  if (needsFrameMoves) {
-    // Show update of SP.
-    assert(NegFrameSize);
-    unsigned CFIIndex = MMI.addFrameInst(
-        MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize));
+  // Add Call Frame Information for the instructions we generated above.
+  if (needsCFI) {
+    unsigned CFIIndex;
+
+    if (HasBP) {
+      // Define CFA in terms of BP. Do this in preference to using FP/SP,
+      // because if the stack needed aligning then CFA won't be at a fixed
+      // offset from FP/SP.
+      unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+    } else {
+      // Adjust the definition of CFA to account for the change in SP.
+      assert(NegFrameSize);
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize));
+    }
     BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
         .addCFIIndex(CFIIndex);
 
     if (HasFP) {
+      // Describe where FP was saved, at a fixed offset from CFA.
       unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
       CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createOffset(nullptr, Reg, FPOffset));
@@ -744,7 +826,17 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
           .addCFIIndex(CFIIndex);
     }
 
+    if (FI->usesPICBase()) {
+      // Describe where FP was saved, at a fixed offset from CFA.
+      unsigned Reg = MRI->getDwarfRegNum(PPC::R30, true);
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, PBPOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+
     if (HasBP) {
+      // Describe where BP was saved, at a fixed offset from CFA.
       unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
       CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createOffset(nullptr, Reg, BPOffset));
@@ -753,6 +845,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
     }
 
     if (MustSaveLR) {
+      // Describe where LR was saved, at a fixed offset from CFA.
       unsigned Reg = MRI->getDwarfRegNum(LRReg, true);
       CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createOffset(nullptr, Reg, LROffset));
@@ -767,8 +860,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       .addReg(SPReg)
       .addReg(SPReg);
 
-    if (needsFrameMoves) {
-      // Mark effective beginning of when frame pointer is ready.
+    if (!HasBP && needsCFI) {
+      // Change the definition of CFA from SP+offset to FP+offset, because SP
+      // will change at every alloca.
       unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
@@ -778,8 +872,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
     }
   }
 
-  if (needsFrameMoves) {
-    // Add callee saved registers to move list.
+  if (needsCFI) {
+    // Describe where callee saved registers were saved, at fixed offsets from
+    // CFA.
     const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
     for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
       unsigned Reg = CSI[I].getReg();
@@ -824,14 +919,15 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   assert(MBBI != MBB.end() && "Returning block has no terminator");
   const PPCInstrInfo &TII =
-      *static_cast<const PPCInstrInfo *>(MF.getSubtarget().getInstrInfo());
+      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
   const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
 
   unsigned RetOpcode = MBBI->getOpcode();
   DebugLoc dl;
 
   assert((RetOpcode == PPC::BLR ||
+          RetOpcode == PPC::BLR8 ||
           RetOpcode == PPC::TCRETURNri ||
           RetOpcode == PPC::TCRETURNdi ||
           RetOpcode == PPC::TCRETURNai ||
@@ -849,9 +945,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
   // Get processor type.
   bool isPPC64 = Subtarget.isPPC64();
   // Get the ABI.
-  bool isDarwinABI = Subtarget.isDarwinABI();
   bool isSVR4ABI = Subtarget.isSVR4ABI();
-  bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_;
 
   // Check if the link register (LR) has been saved.
   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
@@ -879,7 +973,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
   const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8
                                                 : PPC::ADD4 );
 
-  int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
+  int LROffset = getReturnSaveOffset();
 
   int FPOffset = 0;
   if (HasFP) {
@@ -889,8 +983,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
       assert(FPIndex && "No Frame Pointer Save Slot!");
       FPOffset = FFI->getObjectOffset(FPIndex);
     } else {
-      FPOffset =
-          PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+      FPOffset = getFramePointerSaveOffset();
     }
   }
 
@@ -902,13 +995,18 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
       assert(BPIndex && "No Base Pointer Save Slot!");
       BPOffset = FFI->getObjectOffset(BPIndex);
     } else {
-      BPOffset =
-        PPCFrameLowering::getBasePointerSaveOffset(isPPC64,
-                                                   isDarwinABI,
-                                                   isPIC);
+      BPOffset = getBasePointerSaveOffset();
     }
   }
 
+  int PBPOffset = 0;
+  if (FI->usesPICBase()) {
+    MachineFrameInfo *FFI = MF.getFrameInfo();
+    int PBPIndex = FI->getPICBasePointerSaveIndex();
+    assert(PBPIndex && "No PIC Base Pointer Save Slot!");
+    PBPOffset = FFI->getObjectOffset(PBPIndex);
+  }
+
   bool UsesTCRet =  RetOpcode == PPC::TCRETURNri ||
     RetOpcode == PPC::TCRETURNdi ||
     RetOpcode == PPC::TCRETURNai ||
@@ -988,6 +1086,13 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
       .addImm(FPOffset)
       .addReg(SPReg);
 
+  if (FI->usesPICBase())
+    // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+    BuildMI(MBB, MBBI, dl, LoadInst)
+      .addReg(PPC::R30)
+      .addImm(PBPOffset)
+      .addReg(SPReg);
+
   if (HasBP)
     BuildMI(MBB, MBBI, dl, LoadInst, BPReg)
       .addImm(BPOffset)
@@ -1003,7 +1108,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
 
   // Callee pop calling convention. Pop parameter/linkage area. Used for tail
   // call optimization
-  if (MF.getTarget().Options.GuaranteedTailCallOpt && RetOpcode == PPC::BLR &&
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) &&
       MF.getFunction()->getCallingConv() == CallingConv::Fast) {
      PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
      unsigned CallerAllocatedAmt = FI->getMinReservedArea();
@@ -1051,25 +1157,11 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
   }
 }
 
-/// MustSaveLR - Return true if this function requires that we save the LR
-/// register onto the stack in the prolog and restore it in the epilog of the
-/// function.
-static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
-  const PPCFunctionInfo *MFI = MF.getInfo<PPCFunctionInfo>();
-
-  // We need a save/restore of LR if there is any def of LR (which is
-  // defined by calls, including the PIC setup sequence), or if there is
-  // some use of the LR stack slot (e.g. for builtin_return_address).
-  // (LR comes in 32 and 64 bit versions.)
-  MachineRegisterInfo::def_iterator RI = MF.getRegInfo().def_begin(LR);
-  return RI !=MF.getRegInfo().def_end() || MFI->isLRStoreRequired();
-}
-
 void
 PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                                    RegScavenger *) const {
   const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
 
   //  Save and clear the LR state.
   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
@@ -1082,13 +1174,12 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
   int FPSI = FI->getFramePointerSaveIndex();
   bool isPPC64 = Subtarget.isPPC64();
   bool isDarwinABI  = Subtarget.isDarwinABI();
-  bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_;
   MachineFrameInfo *MFI = MF.getFrameInfo();
 
   // If the frame pointer save index hasn't been defined yet.
   if (!FPSI && needsFP(MF)) {
     // Find out what the fix offset of the frame pointer save area.
-    int FPOffset = getFramePointerSaveOffset(isPPC64, isDarwinABI);
+    int FPOffset = getFramePointerSaveOffset();
     // Allocate the frame index for frame pointer save area.
     FPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
     // Save the result.
@@ -1097,13 +1188,21 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
 
   int BPSI = FI->getBasePointerSaveIndex();
   if (!BPSI && RegInfo->hasBasePointer(MF)) {
-    int BPOffset = getBasePointerSaveOffset(isPPC64, isDarwinABI, isPIC);
+    int BPOffset = getBasePointerSaveOffset();
     // Allocate the frame index for the base pointer save area.
     BPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, BPOffset, true);
     // Save the result.
     FI->setBasePointerSaveIndex(BPSI);
   }
 
+  // Reserve stack space for the PIC Base register (R30).
+  // Only used in SVR4 32-bit.
+  if (FI->usesPICBase()) {
+    int PBPSI = FI->getPICBasePointerSaveIndex();
+    PBPSI = MFI->CreateFixedObject(4, -8, true);
+    FI->setPICBasePointerSaveIndex(PBPSI);
+  }
+
   // Reserve stack space to move the linkage area to in case of a tail call.
   int TCSPDelta = 0;
   if (MF.getTarget().Options.GuaranteedTailCallOpt &&
@@ -1201,7 +1300,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
   }
 
   PPCFunctionInfo *PFI = MF.getInfo<PPCFunctionInfo>();
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
 
   int64_t LowerBound = 0;
 
@@ -1235,8 +1334,17 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
     FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
   }
 
+  if (PFI->usesPICBase()) {
+    HasGPSaveArea = true;
+
+    int FI = PFI->getPICBasePointerSaveIndex();
+    assert(FI && "No PIC Base Pointer Save Slot!");
+
+    FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+  }
+
   const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
   if (RegInfo->hasBasePointer(MF)) {
     HasGPSaveArea = true;
 
@@ -1384,7 +1492,7 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
 
   MachineFunction *MF = MBB.getParent();
   const PPCInstrInfo &TII =
-      *static_cast<const PPCInstrInfo *>(MF->getSubtarget().getInstrInfo());
+      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
   DebugLoc DL;
   bool CRSpilled = false;
   MachineInstrBuilder CRMIB;
@@ -1445,8 +1553,7 @@ restoreCRs(bool isPPC64, bool is31,
            const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) {
 
   MachineFunction *MF = MBB.getParent();
-  const PPCInstrInfo &TII =
-      *static_cast<const PPCInstrInfo *>(MF->getSubtarget().getInstrInfo());
+  const PPCInstrInfo &TII = *MF->getSubtarget<PPCSubtarget>().getInstrInfo();
   DebugLoc DL;
   unsigned RestoreOp, MoveReg;
 
@@ -1478,8 +1585,7 @@ restoreCRs(bool isPPC64, bool is31,
 void PPCFrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
-  const PPCInstrInfo &TII =
-      *static_cast<const PPCInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   if (MF.getTarget().Options.GuaranteedTailCallOpt &&
       I->getOpcode() == PPC::ADJCALLSTACKUP) {
     // Add (actually subtract) back the amount the callee popped on return.
@@ -1529,7 +1635,7 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
 
   MachineFunction *MF = MBB.getParent();
   const PPCInstrInfo &TII =
-      *static_cast<const PPCInstrInfo *>(MF->getSubtarget().getInstrInfo());
+      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
   bool CR2Spilled = false;
   bool CR3Spilled = false;
   bool CR4Spilled = false;
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index c482588..dddabb8 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -23,6 +23,11 @@ class PPCSubtarget;
 
 class PPCFrameLowering: public TargetFrameLowering {
   const PPCSubtarget &Subtarget;
+  const unsigned ReturnSaveOffset;
+  const unsigned TOCSaveOffset;
+  const unsigned FramePointerSaveOffset;
+  const unsigned LinkageSize;
+  const unsigned BasePointerSaveOffset;
 
 public:
   PPCFrameLowering(const PPCSubtarget &STI);
@@ -67,56 +72,23 @@ public:
 
   /// getReturnSaveOffset - Return the previous frame offset to save the
   /// return address.
-  static unsigned getReturnSaveOffset(bool isPPC64, bool isDarwinABI) {
-    if (isDarwinABI)
-      return isPPC64 ? 16 : 8;
-    // SVR4 ABI:
-    return isPPC64 ? 16 : 4;
-  }
+  unsigned getReturnSaveOffset() const { return ReturnSaveOffset; }
 
   /// getTOCSaveOffset - Return the previous frame offset to save the
   /// TOC register -- 64-bit SVR4 ABI only.
-  static unsigned getTOCSaveOffset(bool isELFv2ABI) {
-    return isELFv2ABI ? 24 : 40;
-  }
+  unsigned getTOCSaveOffset() const { return TOCSaveOffset; }
 
   /// getFramePointerSaveOffset - Return the previous frame offset to save the
   /// frame pointer.
-  static unsigned getFramePointerSaveOffset(bool isPPC64, bool isDarwinABI) {
-    // For the Darwin ABI:
-    // We cannot use the TOC save slot (offset +20) in the PowerPC linkage area
-    // for saving the frame pointer (if needed.)  While the published ABI has
-    // not used this slot since at least MacOSX 10.2, there is older code
-    // around that does use it, and that needs to continue to work.
-    if (isDarwinABI)
-      return isPPC64 ? -8U : -4U;
-
-    // SVR4 ABI: First slot in the general register save area.
-    return isPPC64 ? -8U : -4U;
-  }
+  unsigned getFramePointerSaveOffset() const { return FramePointerSaveOffset; }
 
   /// getBasePointerSaveOffset - Return the previous frame offset to save the
   /// base pointer.
-  static unsigned getBasePointerSaveOffset(bool isPPC64,
-                                           bool isDarwinABI,
-                                           bool isPIC) {
-    if (isDarwinABI)
-      return isPPC64 ? -16U : -8U;
-
-    // SVR4 ABI: First slot in the general register save area.
-    return isPPC64 ? -16U : isPIC ? -12U : -8U;
-  }
+  unsigned getBasePointerSaveOffset() const { return BasePointerSaveOffset; }
 
   /// getLinkageSize - Return the size of the PowerPC ABI linkage area.
   ///
-  static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI,
-                                 bool isELFv2ABI) {
-    if (isDarwinABI || isPPC64)
-      return (isELFv2ABI ? 4 : 6) * (isPPC64 ? 8 : 4);
-
-    // SVR4 ABI:
-    return 8;
-  }
+  unsigned getLinkageSize() const { return LinkageSize; }
 
   const SpillSlot *
   getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index d9b242c..7234e30 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -160,7 +160,7 @@ unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) {
   // new group.
   if (isLoadAfterStore(SU) && CurSlots < 6) {
     unsigned Directive =
-      DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+        DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
     // If we're using a special group-terminating nop, then we need only one.
     if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
         Directive == PPC::DIR_PWR8 )
@@ -220,7 +220,7 @@ void PPCDispatchGroupSBHazardRecognizer::Reset() {
 
 void PPCDispatchGroupSBHazardRecognizer::EmitNoop() {
   unsigned Directive =
-    DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+      DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
   // If the group has now filled all of its slots, or if we're using a special
   // group-terminating nop, the group is complete.
   if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 49ba58b..b10e854 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -42,6 +42,16 @@ using namespace llvm;
 cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
 cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
 
+static cl::opt<bool>
+    UseBitPermRewriter("ppc-use-bit-perm-rewriter", cl::init(true),
+                       cl::desc("use aggressive ppc isel for bit permutations"),
+                       cl::Hidden);
+static cl::opt<bool> BPermRewriterNoMasking(
+    "ppc-bit-perm-rewriter-stress-rotates",
+    cl::desc("stress rotate selection in aggressive ppc isel for "
+             "bit permutations"),
+    cl::Hidden);
+
 namespace llvm {
   void initializePPCDAGToDAGISelPass(PassRegistry&);
 }
@@ -53,22 +63,20 @@ namespace {
   ///
   class PPCDAGToDAGISel : public SelectionDAGISel {
     const PPCTargetMachine &TM;
-    const PPCTargetLowering *PPCLowering;
     const PPCSubtarget *PPCSubTarget;
+    const PPCTargetLowering *PPCLowering;
     unsigned GlobalBaseReg;
   public:
     explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
-        : SelectionDAGISel(tm), TM(tm),
-          PPCLowering(TM.getSubtargetImpl()->getTargetLowering()),
-          PPCSubTarget(TM.getSubtargetImpl()) {
+        : SelectionDAGISel(tm), TM(tm) {
       initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry());
     }
 
     bool runOnMachineFunction(MachineFunction &MF) override {
       // Make sure we re-emit a set of the global base reg if necessary
       GlobalBaseReg = 0;
-      PPCLowering = TM.getSubtargetImpl()->getTargetLowering();
-      PPCSubTarget = TM.getSubtargetImpl();
+      PPCSubTarget = &MF.getSubtarget<PPCSubtarget>();
+      PPCLowering = PPCSubTarget->getTargetLowering();
       SelectionDAGISel::runOnMachineFunction(MF);
 
       if (!PPCSubTarget->isSVR4ABI())
@@ -77,6 +85,7 @@ namespace {
       return true;
     }
 
+    void PreprocessISelDAG() override;
     void PostprocessISelDAG() override;
 
     /// getI32Imm - Return a target constant with the specified value, of type
@@ -112,11 +121,14 @@ namespace {
     /// base register.  Return the virtual register that holds this value.
     SDNode *getGlobalBaseReg();
 
+    SDNode *getFrameIndex(SDNode *SN, SDNode *N, unsigned Offset = 0);
+
     // Select - Convert the specified operand from a target-independent to a
     // target-specific node if it hasn't already been changed.
     SDNode *Select(SDNode *N) override;
 
     SDNode *SelectBitfieldInsert(SDNode *N);
+    SDNode *SelectBitPermutation(SDNode *N);
 
     /// SelectCC - Select a comparison of the specified values with the
     /// specified condition code, returning the CR# of the expression.
@@ -173,10 +185,20 @@ namespace {
     /// a register.  The case of adding a (possibly relocatable) constant to a
     /// register can be improved, but it is wrong to substitute Reg+Reg for
     /// Reg in an asm, because the load or store opcode would have to change.
-   bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                       char ConstraintCode,
                                       std::vector<SDValue> &OutOps) override {
-      OutOps.push_back(Op);
+      // We need to make sure that this one operand does not end up in r0
+      // (because we might end up lowering this as 0(%op)).
+      const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo();
+      const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF, /*Kind=*/1);
+      SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+      SDValue NewOp =
+        SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                       SDLoc(Op), Op.getValueType(),
+                                       Op, RC), 0);
+
+      OutOps.push_back(NewOp);
       return false;
     }
 
@@ -193,10 +215,16 @@ private:
     SDNode *SelectSETCC(SDNode *N);
 
     void PeepholePPC64();
+    void PeepholePPC64ZExt();
     void PeepholeCROps();
 
+    SDValue combineToCMPB(SDNode *N);
+    void foldBoolExts(SDValue &Res, SDNode *&N);
+
     bool AllUsersSelectZero(SDNode *N);
     void SwapAllSelectUsers(SDNode *N);
+
+    SDNode *transferMemOperands(SDNode *N, SDNode *Result);
   };
 }
 
@@ -234,7 +262,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
   unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
   unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
 
-  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
   MachineBasicBlock &EntryBB = *Fn.begin();
   DebugLoc dl;
   // Emit the following code into the entry block:
@@ -270,7 +298,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
 ///
 SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
   if (!GlobalBaseReg) {
-    const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+    const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
     // Insert the set of GlobalBaseReg into the first MBB of the function
     MachineBasicBlock &FirstMBB = MF->front();
     MachineBasicBlock::iterator MBBI = FirstMBB.begin();
@@ -283,12 +311,13 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
         if (M->getPICLevel() == PICLevel::Small) {
           BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MoveGOTtoLR));
           BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+          MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
         } else {
           BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
           BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
           unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
           BuildMI(FirstMBB, MBBI, dl,
-                  TII.get(PPC::UpdateGBR)).addReg(GlobalBaseReg)
+                  TII.get(PPC::UpdateGBR), GlobalBaseReg)
                   .addReg(TempReg, RegState::Define).addReg(GlobalBaseReg);
           MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
         }
@@ -363,6 +392,18 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
          && isInt32Immediate(N->getOperand(1).getNode(), Imm);
 }
 
+SDNode *PPCDAGToDAGISel::getFrameIndex(SDNode *SN, SDNode *N, unsigned Offset) {
+  SDLoc dl(SN);
+  int FI = cast<FrameIndexSDNode>(N)->getIndex();
+  SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
+  unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
+  if (SN->hasOneUse())
+    return CurDAG->SelectNodeTo(SN, Opc, N->getValueType(0), TFI,
+                                getSmallIPtrImm(Offset));
+  return CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
+                                getSmallIPtrImm(Offset));
+}
+
 bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
   if (!Val)
     return false;
@@ -507,6 +548,1401 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
   return nullptr;
 }
 
+// Predict the number of instructions that would be generated by calling
+// SelectInt64(N).
+static unsigned SelectInt64CountDirect(int64_t Imm) {
+  // Assume no remaining bits.
+  unsigned Remainder = 0;
+  // Assume no shift required.
+  unsigned Shift = 0;
+
+  // If it can't be represented as a 32 bit value.
+  if (!isInt<32>(Imm)) {
+    Shift = countTrailingZeros<uint64_t>(Imm);
+    int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+    // If the shifted value fits 32 bits.
+    if (isInt<32>(ImmSh)) {
+      // Go with the shifted value.
+      Imm = ImmSh;
+    } else {
+      // Still stuck with a 64 bit value.
+      Remainder = Imm;
+      Shift = 32;
+      Imm >>= 32;
+    }
+  }
+
+  // Intermediate operand.
+  unsigned Result = 0;
+
+  // Handle first 32 bits.
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+  // Simple value.
+  if (isInt<16>(Imm)) {
+    // Just the Lo bits.
+    ++Result;
+  } else if (Lo) {
+    // Handle the Hi bits and Lo bits.
+    Result += 2;
+  } else {
+    // Just the Hi bits.
+    ++Result;
+  }
+
+  // If no shift, we're done.
+  if (!Shift) return Result;
+
+  // Shift for next step if the upper 32-bits were not zero.
+  if (Imm)
+    ++Result;
+
+  // Add in the last bits as required.
+  if ((Hi = (Remainder >> 16) & 0xFFFF))
+    ++Result;
+  if ((Lo = Remainder & 0xFFFF))
+    ++Result;
+
+  return Result;
+}
+
+static uint64_t Rot64(uint64_t Imm, unsigned R) {
+  return (Imm << R) | (Imm >> (64 - R));
+}
+
+static unsigned SelectInt64Count(int64_t Imm) {
+  unsigned Count = SelectInt64CountDirect(Imm);
+  if (Count == 1)
+    return Count;
+
+  for (unsigned r = 1; r < 63; ++r) {
+    uint64_t RImm = Rot64(Imm, r);
+    unsigned RCount = SelectInt64CountDirect(RImm) + 1;
+    Count = std::min(Count, RCount);
+
+    // See comments in SelectInt64 for an explanation of the logic below.
+    unsigned LS = findLastSet(RImm);
+    if (LS != r-1)
+      continue;
+
+    uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
+    uint64_t RImmWithOnes = RImm | OnesMask;
+
+    RCount = SelectInt64CountDirect(RImmWithOnes) + 1;
+    Count = std::min(Count, RCount);
+  }
+
+  return Count;
+}
+
+// Select a 64-bit constant. For cost-modeling purposes, SelectInt64Count
+// (above) needs to be kept in sync with this function.
+static SDNode *SelectInt64Direct(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) {
+  // Assume no remaining bits.
+  unsigned Remainder = 0;
+  // Assume no shift required.
+  unsigned Shift = 0;
+
+  // If it can't be represented as a 32 bit value.
+  if (!isInt<32>(Imm)) {
+    Shift = countTrailingZeros<uint64_t>(Imm);
+    int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+    // If the shifted value fits 32 bits.
+    if (isInt<32>(ImmSh)) {
+      // Go with the shifted value.
+      Imm = ImmSh;
+    } else {
+      // Still stuck with a 64 bit value.
+      Remainder = Imm;
+      Shift = 32;
+      Imm >>= 32;
+    }
+  }
+
+  // Intermediate operand.
+  SDNode *Result;
+
+  // Handle first 32 bits.
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+  auto getI32Imm = [CurDAG](unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+  };
+
+  // Simple value.
+  if (isInt<16>(Imm)) {
+    // Just the Lo bits.
+    Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
+  } else if (Lo) {
+    // Handle the Hi bits.
+    unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
+    Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
+    // And Lo bits.
+    Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+                                    SDValue(Result, 0), getI32Imm(Lo));
+  } else {
+    // Just the Hi bits.
+    Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
+  }
+
+  // If no shift, we're done.
+  if (!Shift) return Result;
+
+  // Shift for next step if the upper 32-bits were not zero.
+  if (Imm) {
+    Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
+                                    SDValue(Result, 0),
+                                    getI32Imm(Shift),
+                                    getI32Imm(63 - Shift));
+  }
+
+  // Add in the last bits as required.
+  if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+    Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
+                                    SDValue(Result, 0), getI32Imm(Hi));
+  }
+  if ((Lo = Remainder & 0xFFFF)) {
+    Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+                                    SDValue(Result, 0), getI32Imm(Lo));
+  }
+
+  return Result;
+}
+
+static SDNode *SelectInt64(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) {
+  unsigned Count = SelectInt64CountDirect(Imm);
+  if (Count == 1)
+    return SelectInt64Direct(CurDAG, dl, Imm);
+
+  unsigned RMin = 0;
+
+  int64_t MatImm;
+  unsigned MaskEnd;
+
+  for (unsigned r = 1; r < 63; ++r) {
+    uint64_t RImm = Rot64(Imm, r);
+    unsigned RCount = SelectInt64CountDirect(RImm) + 1;
+    if (RCount < Count) {
+      Count = RCount;
+      RMin = r;
+      MatImm = RImm;
+      MaskEnd = 63;
+    }
+
+    // If the immediate to generate has many trailing zeros, it might be
+    // worthwhile to generate a rotated value with too many leading ones
+    // (because that's free with li/lis's sign-extension semantics), and then
+    // mask them off after rotation.
+
+    unsigned LS = findLastSet(RImm);
+    // We're adding (63-LS) higher-order ones, and we expect to mask them off
+    // after performing the inverse rotation by (64-r). So we need that:
+    //   63-LS == 64-r => LS == r-1
+    if (LS != r-1)
+      continue;
+
+    uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
+    uint64_t RImmWithOnes = RImm | OnesMask;
+
+    RCount = SelectInt64CountDirect(RImmWithOnes) + 1;
+    if (RCount < Count) {
+      Count = RCount;
+      RMin = r;
+      MatImm = RImmWithOnes;
+      MaskEnd = LS;
+    }
+  }
+
+  if (!RMin)
+    return SelectInt64Direct(CurDAG, dl, Imm);
+
+  auto getI32Imm = [CurDAG](unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+  };
+
+  SDValue Val = SDValue(SelectInt64Direct(CurDAG, dl, MatImm), 0);
+  return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val,
+                                getI32Imm(64 - RMin), getI32Imm(MaskEnd));
+}
+
+// Select a 64-bit constant.
+static SDNode *SelectInt64(SelectionDAG *CurDAG, SDNode *N) {
+  SDLoc dl(N);
+
+  // Get 64 bit value.
+  int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
+  return SelectInt64(CurDAG, dl, Imm);
+}
+
+namespace {
+class BitPermutationSelector {
+  struct ValueBit {
+    SDValue V;
+
+    // The bit number in the value, using a convention where bit 0 is the
+    // lowest-order bit.
+    unsigned Idx;
+
+    enum Kind {
+      ConstZero,
+      Variable
+    } K;
+
+    ValueBit(SDValue V, unsigned I, Kind K = Variable)
+      : V(V), Idx(I), K(K) {}
+    ValueBit(Kind K = Variable)
+      : V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {}
+
+    bool isZero() const {
+      return K == ConstZero;
+    }
+
+    bool hasValue() const {
+      return K == Variable;
+    }
+
+    SDValue getValue() const {
+      assert(hasValue() && "Cannot get the value of a constant bit");
+      return V;
+    }
+
+    unsigned getValueBitIndex() const {
+      assert(hasValue() && "Cannot get the value bit index of a constant bit");
+      return Idx;
+    }
+  };
+
+  // A bit group has the same underlying value and the same rotate factor.
+  struct BitGroup {
+    SDValue V;
+    unsigned RLAmt;
+    unsigned StartIdx, EndIdx;
+
+    // This rotation amount assumes that the lower 32 bits of the quantity are
+    // replicated in the high 32 bits by the rotation operator (which is done
+    // by rlwinm and friends in 64-bit mode).
+    bool Repl32;
+    // Did converting to Repl32 == true change the rotation factor? If it did,
+    // it decreased it by 32.
+    bool Repl32CR;
+    // Was this group coalesced after setting Repl32 to true?
+    bool Repl32Coalesced;
+
+    BitGroup(SDValue V, unsigned R, unsigned S, unsigned E)
+      : V(V), RLAmt(R), StartIdx(S), EndIdx(E), Repl32(false), Repl32CR(false),
+        Repl32Coalesced(false) {
+      DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R <<
+                      " [" << S << ", " << E << "]\n");
+    }
+  };
+
+  // Information on each (Value, RLAmt) pair (like the number of groups
+  // associated with each) used to choose the lowering method.
+  struct ValueRotInfo {
+    SDValue V;
+    unsigned RLAmt;
+    unsigned NumGroups;
+    unsigned FirstGroupStartIdx;
+    bool Repl32;
+
+    ValueRotInfo()
+      : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX),
+        Repl32(false) {}
+
+    // For sorting (in reverse order) by NumGroups, and then by
+    // FirstGroupStartIdx.
+    bool operator < (const ValueRotInfo &Other) const {
+      // We need to sort so that the non-Repl32 come first because, when we're
+      // doing masking, the Repl32 bit groups might be subsumed into the 64-bit
+      // masking operation.
+      if (Repl32 < Other.Repl32)
+        return true;
+      else if (Repl32 > Other.Repl32)
+        return false;
+      else if (NumGroups > Other.NumGroups)
+        return true;
+      else if (NumGroups < Other.NumGroups)
+        return false;
+      else if (FirstGroupStartIdx < Other.FirstGroupStartIdx)
+        return true;
+      return false;
+    }
+  };
+
+  // Return true if something interesting was deduced, return false if we're
+  // providing only a generic representation of V (or something else likewise
+  // uninteresting for instruction selection).
+  bool getValueBits(SDValue V, SmallVector<ValueBit, 64> &Bits) {
+    switch (V.getOpcode()) {
+    default: break;
+    case ISD::ROTL:
+      if (isa<ConstantSDNode>(V.getOperand(1))) {
+        unsigned RotAmt = V.getConstantOperandVal(1);
+
+        SmallVector<ValueBit, 64> LHSBits(Bits.size());
+        getValueBits(V.getOperand(0), LHSBits);
+
+        for (unsigned i = 0; i < Bits.size(); ++i)
+          Bits[i] = LHSBits[i < RotAmt ? i + (Bits.size() - RotAmt) : i - RotAmt];
+
+        return true;
+      }
+      break;
+    case ISD::SHL:
+      if (isa<ConstantSDNode>(V.getOperand(1))) {
+        unsigned ShiftAmt = V.getConstantOperandVal(1);
+
+        SmallVector<ValueBit, 64> LHSBits(Bits.size());
+        getValueBits(V.getOperand(0), LHSBits);
+
+        for (unsigned i = ShiftAmt; i < Bits.size(); ++i)
+          Bits[i] = LHSBits[i - ShiftAmt];
+
+        for (unsigned i = 0; i < ShiftAmt; ++i)
+          Bits[i] = ValueBit(ValueBit::ConstZero);
+
+        return true;
+      }
+      break;
+    case ISD::SRL:
+      if (isa<ConstantSDNode>(V.getOperand(1))) {
+        unsigned ShiftAmt = V.getConstantOperandVal(1);
+
+        SmallVector<ValueBit, 64> LHSBits(Bits.size());
+        getValueBits(V.getOperand(0), LHSBits);
+
+        for (unsigned i = 0; i < Bits.size() - ShiftAmt; ++i)
+          Bits[i] = LHSBits[i + ShiftAmt];
+
+        for (unsigned i = Bits.size() - ShiftAmt; i < Bits.size(); ++i)
+          Bits[i] = ValueBit(ValueBit::ConstZero);
+
+        return true;
+      }
+      break;
+    case ISD::AND:
+      if (isa<ConstantSDNode>(V.getOperand(1))) {
+        uint64_t Mask = V.getConstantOperandVal(1);
+
+        SmallVector<ValueBit, 64> LHSBits(Bits.size());
+        bool LHSTrivial = getValueBits(V.getOperand(0), LHSBits);
+
+        for (unsigned i = 0; i < Bits.size(); ++i)
+          if (((Mask >> i) & 1) == 1)
+            Bits[i] = LHSBits[i];
+          else
+            Bits[i] = ValueBit(ValueBit::ConstZero);
+
+        // Mark this as interesting, only if the LHS was also interesting. This
+        // prevents the overall procedure from matching a single immediate 'and'
+        // (which is non-optimal because such an and might be folded with other
+        // things if we don't select it here).
+        return LHSTrivial;
+      }
+      break;
+    case ISD::OR: {
+      SmallVector<ValueBit, 64> LHSBits(Bits.size()), RHSBits(Bits.size());
+      getValueBits(V.getOperand(0), LHSBits);
+      getValueBits(V.getOperand(1), RHSBits);
+
+      bool AllDisjoint = true;
+      for (unsigned i = 0; i < Bits.size(); ++i)
+        if (LHSBits[i].isZero())
+          Bits[i] = RHSBits[i];
+        else if (RHSBits[i].isZero())
+          Bits[i] = LHSBits[i];
+        else {
+          AllDisjoint = false;
+          break;
+        }
+
+      if (!AllDisjoint)
+        break;
+
+      return true;
+    }
+    }
+
+    for (unsigned i = 0; i < Bits.size(); ++i)
+      Bits[i] = ValueBit(V, i);
+
+    return false;
+  }
+
+  // For each value (except the constant ones), compute the left-rotate amount
+  // to get it from its original to final position.
+  void computeRotationAmounts() {
+    HasZeros = false;
+    RLAmt.resize(Bits.size());
+    for (unsigned i = 0; i < Bits.size(); ++i)
+      if (Bits[i].hasValue()) {
+        unsigned VBI = Bits[i].getValueBitIndex();
+        if (i >= VBI)
+          RLAmt[i] = i - VBI;
+        else
+          RLAmt[i] = Bits.size() - (VBI - i);
+      } else if (Bits[i].isZero()) {
+        HasZeros = true;
+        RLAmt[i] = UINT32_MAX;
+      } else {
+        llvm_unreachable("Unknown value bit type");
+      }
+  }
+
+  // Collect groups of consecutive bits with the same underlying value and
+  // rotation factor. If we're doing late masking, we ignore zeros, otherwise
+  // they break up groups.
+  void collectBitGroups(bool LateMask) {
+    BitGroups.clear();
+
+    unsigned LastRLAmt = RLAmt[0];
+    SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue();
+    unsigned LastGroupStartIdx = 0;
+    for (unsigned i = 1; i < Bits.size(); ++i) {
+      unsigned ThisRLAmt = RLAmt[i];
+      SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue();
+      if (LateMask && !ThisValue) {
+        ThisValue = LastValue;
+        ThisRLAmt = LastRLAmt;
+        // If we're doing late masking, then the first bit group always starts
+        // at zero (even if the first bits were zero).
+        if (BitGroups.empty())
+          LastGroupStartIdx = 0;
+      }
+
+      // If this bit has the same underlying value and the same rotate factor as
+      // the last one, then they're part of the same group.
+      if (ThisRLAmt == LastRLAmt && ThisValue == LastValue)
+        continue;
+
+      if (LastValue.getNode())
+        BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
+                                     i-1));
+      LastRLAmt = ThisRLAmt;
+      LastValue = ThisValue;
+      LastGroupStartIdx = i;
+    }
+    if (LastValue.getNode())
+      BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
+                                   Bits.size()-1));
+
+    if (BitGroups.empty())
+      return;
+
+    // We might be able to combine the first and last groups.
+    if (BitGroups.size() > 1) {
+      // If the first and last groups are the same, then remove the first group
+      // in favor of the last group, making the ending index of the last group
+      // equal to the ending index of the to-be-removed first group.
+      if (BitGroups[0].StartIdx == 0 &&
+          BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 &&
+          BitGroups[0].V == BitGroups[BitGroups.size()-1].V &&
+          BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) {
+        DEBUG(dbgs() << "\tcombining final bit group with inital one\n");
+        BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx;
+        BitGroups.erase(BitGroups.begin());
+      }
+    }
+  }
+
+  // Take all (SDValue, RLAmt) pairs and sort them by the number of groups
+  // associated with each. If there is a degeneracy, pick the one that occurs
+  // first (in the final value).
+  void collectValueRotInfo() {
+    ValueRots.clear();
+
+    for (auto &BG : BitGroups) {
+      unsigned RLAmtKey = BG.RLAmt + (BG.Repl32 ? 64 : 0);
+      ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, RLAmtKey)];
+      VRI.V = BG.V;
+      VRI.RLAmt = BG.RLAmt;
+      VRI.Repl32 = BG.Repl32;
+      VRI.NumGroups += 1;
+      VRI.FirstGroupStartIdx = std::min(VRI.FirstGroupStartIdx, BG.StartIdx);
+    }
+
+    // Now that we've collected the various ValueRotInfo instances, we need to
+    // sort them.
+    ValueRotsVec.clear();
+    for (auto &I : ValueRots) {
+      ValueRotsVec.push_back(I.second);
+    }
+    std::sort(ValueRotsVec.begin(), ValueRotsVec.end());
+  }
+
+  // In 64-bit mode, rlwinm and friends have a rotation operator that
+  // replicates the low-order 32 bits into the high-order 32-bits. The mask
+  // indices of these instructions can only be in the lower 32 bits, so they
+  // can only represent some 64-bit bit groups. However, when they can be used,
+  // the 32-bit replication can be used to represent, as a single bit group,
+  // otherwise separate bit groups. We'll convert to replicated-32-bit bit
+  // groups when possible. Returns true if any of the bit groups were
+  // converted.
+  void assignRepl32BitGroups() {
+    // If we have bits like this:
+    //
+    // Indices:    15 14 13 12 11 10 9 8  7  6  5  4  3  2  1  0
+    // V bits: ... 7  6  5  4  3  2  1 0 31 30 29 28 27 26 25 24
+    // Groups:    |      RLAmt = 8      |      RLAmt = 40       |
+    //
+    // But, making use of a 32-bit operation that replicates the low-order 32
+    // bits into the high-order 32 bits, this can be one bit group with a RLAmt
+    // of 8.
+
+    auto IsAllLow32 = [this](BitGroup & BG) {
+      if (BG.StartIdx <= BG.EndIdx) {
+        for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) {
+          if (!Bits[i].hasValue())
+            continue;
+          if (Bits[i].getValueBitIndex() >= 32)
+            return false;
+        }
+      } else {
+        for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) {
+          if (!Bits[i].hasValue())
+            continue;
+          if (Bits[i].getValueBitIndex() >= 32)
+            return false;
+        }
+        for (unsigned i = 0; i <= BG.EndIdx; ++i) {
+          if (!Bits[i].hasValue())
+            continue;
+          if (Bits[i].getValueBitIndex() >= 32)
+            return false;
+        }
+      }
+
+      return true;
+    };
+
+    for (auto &BG : BitGroups) {
+      if (BG.StartIdx < 32 && BG.EndIdx < 32) {
+        if (IsAllLow32(BG)) {
+          if (BG.RLAmt >= 32) {
+            BG.RLAmt -= 32;
+            BG.Repl32CR = true;
+          }
+
+          BG.Repl32 = true;
+
+          DEBUG(dbgs() << "\t32-bit replicated bit group for " <<
+                          BG.V.getNode() << " RLAmt = " << BG.RLAmt <<
+                          " [" << BG.StartIdx << ", " << BG.EndIdx << "]\n");
+        }
+      }
+    }
+
+    // Now walk through the bit groups, consolidating where possible.
+    for (auto I = BitGroups.begin(); I != BitGroups.end();) {
+      // We might want to remove this bit group by merging it with the previous
+      // group (which might be the ending group).
+      auto IP = (I == BitGroups.begin()) ?
+                std::prev(BitGroups.end()) : std::prev(I);
+      if (I->Repl32 && IP->Repl32 && I->V == IP->V && I->RLAmt == IP->RLAmt &&
+          I->StartIdx == (IP->EndIdx + 1) % 64 && I != IP) {
+
+        DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for " <<
+                        I->V.getNode() << " RLAmt = " << I->RLAmt <<
+                        " [" << I->StartIdx << ", " << I->EndIdx <<
+                        "] with group with range [" <<
+                        IP->StartIdx << ", " << IP->EndIdx << "]\n");
+
+        IP->EndIdx = I->EndIdx;
+        IP->Repl32CR = IP->Repl32CR || I->Repl32CR;
+        IP->Repl32Coalesced = true;
+        I = BitGroups.erase(I);
+        continue;
+      } else {
+        // There is a special case worth handling: If there is a single group
+        // covering the entire upper 32 bits, and it can be merged with both
+        // the next and previous groups (which might be the same group), then
+        // do so. If it is the same group (so there will be only one group in
+        // total), then we need to reverse the order of the range so that it
+        // covers the entire 64 bits.
+        if (I->StartIdx == 32 && I->EndIdx == 63) {
+          assert(std::next(I) == BitGroups.end() &&
+                 "bit group ends at index 63 but there is another?");
+          auto IN = BitGroups.begin();
+
+          if (IP->Repl32 && IN->Repl32 && I->V == IP->V && I->V == IN->V && 
+              (I->RLAmt % 32) == IP->RLAmt && (I->RLAmt % 32) == IN->RLAmt &&
+              IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP &&
+              IsAllLow32(*I)) {
+
+            DEBUG(dbgs() << "\tcombining bit group for " <<
+                            I->V.getNode() << " RLAmt = " << I->RLAmt <<
+                            " [" << I->StartIdx << ", " << I->EndIdx <<
+                            "] with 32-bit replicated groups with ranges [" <<
+                            IP->StartIdx << ", " << IP->EndIdx << "] and [" <<
+                            IN->StartIdx << ", " << IN->EndIdx << "]\n");
+
+            if (IP == IN) {
+              // There is only one other group; change it to cover the whole
+              // range (backward, so that it can still be Repl32 but cover the
+              // whole 64-bit range).
+              IP->StartIdx = 31;
+              IP->EndIdx = 30;
+              IP->Repl32CR = IP->Repl32CR || I->RLAmt >= 32;
+              IP->Repl32Coalesced = true;
+              I = BitGroups.erase(I);
+            } else {
+              // There are two separate groups, one before this group and one
+              // after us (at the beginning). We're going to remove this group,
+              // but also the group at the very beginning.
+              IP->EndIdx = IN->EndIdx;
+              IP->Repl32CR = IP->Repl32CR || IN->Repl32CR || I->RLAmt >= 32;
+              IP->Repl32Coalesced = true;
+              I = BitGroups.erase(I);
+              BitGroups.erase(BitGroups.begin());
+            }
+
+            // This must be the last group in the vector (and we might have
+            // just invalidated the iterator above), so break here.
+            break;
+          }
+        }
+      }
+
+      ++I;
+    }
+  }
+
+  SDValue getI32Imm(unsigned Imm) {
+    return CurDAG->getTargetConstant(Imm, MVT::i32);
+  }
+
+  uint64_t getZerosMask() {
+    uint64_t Mask = 0;
+    for (unsigned i = 0; i < Bits.size(); ++i) {
+      if (Bits[i].hasValue())
+        continue;
+      Mask |= (UINT64_C(1) << i);
+    }
+
+    return ~Mask;
+  }
+
+  // Depending on the number of groups for a particular value, it might be
+  // better to rotate, mask explicitly (using andi/andis), and then or the
+  // result. Select this part of the result first.
+  void SelectAndParts32(SDLoc dl, SDValue &Res, unsigned *InstCnt) {
+    if (BPermRewriterNoMasking)
+      return;
+
+    for (ValueRotInfo &VRI : ValueRotsVec) {
+      unsigned Mask = 0;
+      for (unsigned i = 0; i < Bits.size(); ++i) {
+        if (!Bits[i].hasValue() || Bits[i].getValue() != VRI.V)
+          continue;
+        if (RLAmt[i] != VRI.RLAmt)
+          continue;
+        Mask |= (1u << i);
+      }
+
+      // Compute the masks for andi/andis that would be necessary.
+      unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16;
+      assert((ANDIMask != 0 || ANDISMask != 0) &&
+             "No set bits in mask for value bit groups");
+      bool NeedsRotate = VRI.RLAmt != 0;
+
+      // We're trying to minimize the number of instructions. If we have one
+      // group, using one of andi/andis can break even.  If we have three
+      // groups, we can use both andi and andis and break even (to use both
+      // andi and andis we also need to or the results together). We need four
+      // groups if we also need to rotate. To use andi/andis we need to do more
+      // than break even because rotate-and-mask instructions tend to be easier
+      // to schedule.
+
+      // FIXME: We've biased here against using andi/andis, which is right for
+      // POWER cores, but not optimal everywhere. For example, on the A2,
+      // andi/andis have single-cycle latency whereas the rotate-and-mask
+      // instructions take two cycles, and it would be better to bias toward
+      // andi/andis in break-even cases.
+
+      unsigned NumAndInsts = (unsigned) NeedsRotate +
+                             (unsigned) (ANDIMask != 0) +
+                             (unsigned) (ANDISMask != 0) +
+                             (unsigned) (ANDIMask != 0 && ANDISMask != 0) +
+                             (unsigned) (bool) Res;
+
+      DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
+                      " RL: " << VRI.RLAmt << ":" <<
+                      "\n\t\t\tisel using masking: " << NumAndInsts <<
+                      " using rotates: " << VRI.NumGroups << "\n");
+
+      if (NumAndInsts >= VRI.NumGroups)
+        continue;
+
+      DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+
+      if (InstCnt) *InstCnt += NumAndInsts;
+
+      SDValue VRot;
+      if (VRI.RLAmt) {
+        SDValue Ops[] =
+          { VRI.V, getI32Imm(VRI.RLAmt), getI32Imm(0), getI32Imm(31) };
+        VRot = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                              Ops), 0);
+      } else {
+        VRot = VRI.V;
+      }
+
+      SDValue ANDIVal, ANDISVal;
+      if (ANDIMask != 0)
+        ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32,
+                            VRot, getI32Imm(ANDIMask)), 0);
+      if (ANDISMask != 0)
+        ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32,
+                             VRot, getI32Imm(ANDISMask)), 0);
+
+      SDValue TotalVal;
+      if (!ANDIVal)
+        TotalVal = ANDISVal;
+      else if (!ANDISVal)
+        TotalVal = ANDIVal;
+      else
+        TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
+                             ANDIVal, ANDISVal), 0);
+
+      if (!Res)
+        Res = TotalVal;
+      else
+        Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
+                        Res, TotalVal), 0);
+
+      // Now, remove all groups with this underlying value and rotation
+      // factor.
+      for (auto I = BitGroups.begin(); I != BitGroups.end();) {
+        if (I->V == VRI.V && I->RLAmt == VRI.RLAmt)
+          I = BitGroups.erase(I);
+        else
+          ++I;
+      }
+    }
+  }
+
+  // Instruction selection for the 32-bit case.
+  SDNode *Select32(SDNode *N, bool LateMask, unsigned *InstCnt) {
+    SDLoc dl(N);
+    SDValue Res;
+
+    if (InstCnt) *InstCnt = 0;
+
+    // Take care of cases that should use andi/andis first.
+    SelectAndParts32(dl, Res, InstCnt);
+
+    // If we've not yet selected a 'starting' instruction, and we have no zeros
+    // to fill in, select the (Value, RLAmt) with the highest priority (largest
+    // number of groups), and start with this rotated value.
+    if ((!HasZeros || LateMask) && !Res) {
+      ValueRotInfo &VRI = ValueRotsVec[0];
+      if (VRI.RLAmt) {
+        if (InstCnt) *InstCnt += 1;
+        SDValue Ops[] =
+          { VRI.V, getI32Imm(VRI.RLAmt), getI32Imm(0), getI32Imm(31) };
+        Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
+      } else {
+        Res = VRI.V;
+      }
+
+      // Now, remove all groups with this underlying value and rotation factor.
+      for (auto I = BitGroups.begin(); I != BitGroups.end();) {
+        if (I->V == VRI.V && I->RLAmt == VRI.RLAmt)
+          I = BitGroups.erase(I);
+        else
+          ++I;
+      }
+    }
+
+    if (InstCnt) *InstCnt += BitGroups.size();
+
+    // Insert the other groups (one at a time).
+    for (auto &BG : BitGroups) {
+      if (!Res) {
+        SDValue Ops[] =
+          { BG.V, getI32Imm(BG.RLAmt), getI32Imm(Bits.size() - BG.EndIdx - 1),
+            getI32Imm(Bits.size() - BG.StartIdx - 1) };
+        Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
+      } else {
+        SDValue Ops[] =
+          { Res, BG.V, getI32Imm(BG.RLAmt), getI32Imm(Bits.size() - BG.EndIdx - 1),
+            getI32Imm(Bits.size() - BG.StartIdx - 1) };
+        Res = SDValue(CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops), 0);
+      }
+    }
+
+    if (LateMask) {
+      unsigned Mask = (unsigned) getZerosMask();
+
+      unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16;
+      assert((ANDIMask != 0 || ANDISMask != 0) &&
+             "No set bits in zeros mask?");
+
+      if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) +
+                               (unsigned) (ANDISMask != 0) +
+                               (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+
+      SDValue ANDIVal, ANDISVal;
+      if (ANDIMask != 0)
+        ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32,
+                            Res, getI32Imm(ANDIMask)), 0);
+      if (ANDISMask != 0)
+        ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32,
+                             Res, getI32Imm(ANDISMask)), 0);
+
+      if (!ANDIVal)
+        Res = ANDISVal;
+      else if (!ANDISVal)
+        Res = ANDIVal;
+      else
+        Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
+                        ANDIVal, ANDISVal), 0);
+    }
+
+    return Res.getNode();
+  }
+
+  unsigned SelectRotMask64Count(unsigned RLAmt, bool Repl32,
+                                unsigned MaskStart, unsigned MaskEnd,
+                                bool IsIns) {
+    // In the notation used by the instructions, 'start' and 'end' are reversed
+    // because bits are counted from high to low order.
+    unsigned InstMaskStart = 64 - MaskEnd - 1,
+             InstMaskEnd   = 64 - MaskStart - 1;
+
+    if (Repl32)
+      return 1;
+
+    if ((!IsIns && (InstMaskEnd == 63 || InstMaskStart == 0)) ||
+        InstMaskEnd == 63 - RLAmt)
+      return 1;
+
+    return 2;
+  }
+
+  // For 64-bit values, not all combinations of rotates and masks are
+  // available. Produce one if it is available.
+  SDValue SelectRotMask64(SDValue V, SDLoc dl, unsigned RLAmt, bool Repl32,
+                          unsigned MaskStart, unsigned MaskEnd,
+                          unsigned *InstCnt = nullptr) {
+    // In the notation used by the instructions, 'start' and 'end' are reversed
+    // because bits are counted from high to low order.
+    unsigned InstMaskStart = 64 - MaskEnd - 1,
+             InstMaskEnd   = 64 - MaskStart - 1;
+
+    if (InstCnt) *InstCnt += 1;
+
+    if (Repl32) {
+      // This rotation amount assumes that the lower 32 bits of the quantity
+      // are replicated in the high 32 bits by the rotation operator (which is
+      // done by rlwinm and friends).
+      assert(InstMaskStart >= 32 && "Mask cannot start out of range");
+      assert(InstMaskEnd   >= 32 && "Mask cannot end out of range");
+      SDValue Ops[] =
+        { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart - 32),
+          getI32Imm(InstMaskEnd - 32) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64,
+                                            Ops), 0);
+    }
+
+    if (InstMaskEnd == 63) {
+      SDValue Ops[] =
+        { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Ops), 0);
+    }
+
+    if (InstMaskStart == 0) {
+      SDValue Ops[] =
+        { V, getI32Imm(RLAmt), getI32Imm(InstMaskEnd) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Ops), 0);
+    }
+
+    if (InstMaskEnd == 63 - RLAmt) {
+      SDValue Ops[] =
+        { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, Ops), 0);
+    }
+
+    // We cannot do this with a single instruction, so we'll use two. The
+    // problem is that we're not free to choose both a rotation amount and mask
+    // start and end independently. We can choose an arbitrary mask start and
+    // end, but then the rotation amount is fixed. Rotation, however, can be
+    // inverted, and so by applying an "inverse" rotation first, we can get the
+    // desired result.
+    if (InstCnt) *InstCnt += 1;
+
+    // The rotation mask for the second instruction must be MaskStart.
+    unsigned RLAmt2 = MaskStart;
+    // The first instruction must rotate V so that the overall rotation amount
+    // is RLAmt.
+    unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64;
+    if (RLAmt1)
+      V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63);
+    return SelectRotMask64(V, dl, RLAmt2, false, MaskStart, MaskEnd);
+  }
+
+  // For 64-bit values, not all combinations of rotates and masks are
+  // available. Produce a rotate-mask-and-insert if one is available.
+  SDValue SelectRotMaskIns64(SDValue Base, SDValue V, SDLoc dl, unsigned RLAmt,
+                             bool Repl32, unsigned MaskStart,
+                             unsigned MaskEnd, unsigned *InstCnt = nullptr) {
+    // In the notation used by the instructions, 'start' and 'end' are reversed
+    // because bits are counted from high to low order.
+    unsigned InstMaskStart = 64 - MaskEnd - 1,
+             InstMaskEnd   = 64 - MaskStart - 1;
+
+    if (InstCnt) *InstCnt += 1;
+
+    if (Repl32) {
+      // This rotation amount assumes that the lower 32 bits of the quantity
+      // are replicated in the high 32 bits by the rotation operator (which is
+      // done by rlwinm and friends).
+      assert(InstMaskStart >= 32 && "Mask cannot start out of range");
+      assert(InstMaskEnd   >= 32 && "Mask cannot end out of range");
+      SDValue Ops[] =
+        { Base, V, getI32Imm(RLAmt), getI32Imm(InstMaskStart - 32),
+          getI32Imm(InstMaskEnd - 32) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64,
+                                            Ops), 0);
+    }
+
+    if (InstMaskEnd == 63 - RLAmt) {
+      SDValue Ops[] =
+        { Base, V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops), 0);
+    }
+
+    // We cannot do this with a single instruction, so we'll use two. The
+    // problem is that we're not free to choose both a rotation amount and mask
+    // start and end independently. We can choose an arbitrary mask start and
+    // end, but then the rotation amount is fixed. Rotation, however, can be
+    // inverted, and so by applying an "inverse" rotation first, we can get the
+    // desired result.
+    if (InstCnt) *InstCnt += 1;
+
+    // The rotation mask for the second instruction must be MaskStart.
+    unsigned RLAmt2 = MaskStart;
+    // The first instruction must rotate V so that the overall rotation amount
+    // is RLAmt.
+    unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64;
+    if (RLAmt1)
+      V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63);
+    return SelectRotMaskIns64(Base, V, dl, RLAmt2, false, MaskStart, MaskEnd);
+  }
+
+  void SelectAndParts64(SDLoc dl, SDValue &Res, unsigned *InstCnt) {
+    if (BPermRewriterNoMasking)
+      return;
+
+    // The idea here is the same as in the 32-bit version, but with additional
+    // complications from the fact that Repl32 might be true. Because we
+    // aggressively convert bit groups to Repl32 form (which, for small
+    // rotation factors, involves no other change), and then coalesce, it might
+    // be the case that a single 64-bit masking operation could handle both
+    // some Repl32 groups and some non-Repl32 groups. If converting to Repl32
+    // form allowed coalescing, then we must use a 32-bit rotaton in order to
+    // completely capture the new combined bit group.
+
+    for (ValueRotInfo &VRI : ValueRotsVec) {
+      uint64_t Mask = 0;
+
+      // We need to add to the mask all bits from the associated bit groups.
+      // If Repl32 is false, we need to add bits from bit groups that have
+      // Repl32 true, but are trivially convertable to Repl32 false. Such a
+      // group is trivially convertable if it overlaps only with the lower 32
+      // bits, and the group has not been coalesced.
+      auto MatchingBG = [VRI](BitGroup &BG) {
+        if (VRI.V != BG.V)
+          return false;
+
+        unsigned EffRLAmt = BG.RLAmt;
+        if (!VRI.Repl32 && BG.Repl32) {
+          if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx <= BG.EndIdx &&
+              !BG.Repl32Coalesced) {
+            if (BG.Repl32CR)
+              EffRLAmt += 32;
+          } else {
+            return false;
+          }
+        } else if (VRI.Repl32 != BG.Repl32) {
+          return false;
+        }
+
+        if (VRI.RLAmt != EffRLAmt)
+          return false;
+
+        return true;
+      };
+
+      for (auto &BG : BitGroups) {
+        if (!MatchingBG(BG))
+          continue;
+
+        if (BG.StartIdx <= BG.EndIdx) {
+          for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i)
+            Mask |= (UINT64_C(1) << i);
+        } else {
+          for (unsigned i = BG.StartIdx; i < Bits.size(); ++i)
+            Mask |= (UINT64_C(1) << i);
+          for (unsigned i = 0; i <= BG.EndIdx; ++i)
+            Mask |= (UINT64_C(1) << i);
+        }
+      }
+
+      // We can use the 32-bit andi/andis technique if the mask does not
+      // require any higher-order bits. This can save an instruction compared
+      // to always using the general 64-bit technique.
+      bool Use32BitInsts = isUInt<32>(Mask);
+      // Compute the masks for andi/andis that would be necessary.
+      unsigned ANDIMask = (Mask & UINT16_MAX),
+               ANDISMask = (Mask >> 16) & UINT16_MAX;
+
+      bool NeedsRotate = VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask));
+
+      unsigned NumAndInsts = (unsigned) NeedsRotate +
+                             (unsigned) (bool) Res;
+      if (Use32BitInsts)
+        NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) +
+                       (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+      else
+        NumAndInsts += SelectInt64Count(Mask) + /* and */ 1;
+
+      unsigned NumRLInsts = 0;
+      bool FirstBG = true;
+      for (auto &BG : BitGroups) {
+        if (!MatchingBG(BG))
+          continue;
+        NumRLInsts +=
+          SelectRotMask64Count(BG.RLAmt, BG.Repl32, BG.StartIdx, BG.EndIdx,
+                               !FirstBG);
+        FirstBG = false;
+      }
+
+      DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
+                      " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":") <<
+                      "\n\t\t\tisel using masking: " << NumAndInsts <<
+                      " using rotates: " << NumRLInsts << "\n");
+
+      // When we'd use andi/andis, we bias toward using the rotates (andi only
+      // has a record form, and is cracked on POWER cores). However, when using
+      // general 64-bit constant formation, bias toward the constant form,
+      // because that exposes more opportunities for CSE.
+      if (NumAndInsts > NumRLInsts)
+        continue;
+      if (Use32BitInsts && NumAndInsts == NumRLInsts)
+        continue;
+
+      DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+
+      if (InstCnt) *InstCnt += NumAndInsts;
+
+      SDValue VRot;
+      // We actually need to generate a rotation if we have a non-zero rotation
+      // factor or, in the Repl32 case, if we care about any of the
+      // higher-order replicated bits. In the latter case, we generate a mask
+      // backward so that it actually includes the entire 64 bits.
+      if (VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask)))
+        VRot = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32,
+                               VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63);
+      else
+        VRot = VRI.V;
+
+      SDValue TotalVal;
+      if (Use32BitInsts) {
+        assert((ANDIMask != 0 || ANDISMask != 0) &&
+               "No set bits in mask when using 32-bit ands for 64-bit value");
+
+        SDValue ANDIVal, ANDISVal;
+        if (ANDIMask != 0)
+          ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
+                              VRot, getI32Imm(ANDIMask)), 0);
+        if (ANDISMask != 0)
+          ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
+                               VRot, getI32Imm(ANDISMask)), 0);
+
+        if (!ANDIVal)
+          TotalVal = ANDISVal;
+        else if (!ANDISVal)
+          TotalVal = ANDIVal;
+        else
+          TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+                               ANDIVal, ANDISVal), 0);
+      } else {
+        TotalVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0);
+        TotalVal =
+          SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                         VRot, TotalVal), 0);
+     }
+
+      if (!Res)
+        Res = TotalVal;
+      else
+        Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+                                             Res, TotalVal), 0);
+
+      // Now, remove all groups with this underlying value and rotation
+      // factor.
+      for (auto I = BitGroups.begin(); I != BitGroups.end();) {
+        if (MatchingBG(*I))
+          I = BitGroups.erase(I);
+        else
+          ++I;
+      }
+    }
+  }
+
+  // Instruction selection for the 64-bit case.
+  SDNode *Select64(SDNode *N, bool LateMask, unsigned *InstCnt) {
+    SDLoc dl(N);
+    SDValue Res;
+
+    if (InstCnt) *InstCnt = 0;
+
+    // Take care of cases that should use andi/andis first.
+    SelectAndParts64(dl, Res, InstCnt);
+
+    // If we've not yet selected a 'starting' instruction, and we have no zeros
+    // to fill in, select the (Value, RLAmt) with the highest priority (largest
+    // number of groups), and start with this rotated value.
+    if ((!HasZeros || LateMask) && !Res) {
+      // If we have both Repl32 groups and non-Repl32 groups, the non-Repl32
+      // groups will come first, and so the VRI representing the largest number
+      // of groups might not be first (it might be the first Repl32 groups).
+      unsigned MaxGroupsIdx = 0;
+      if (!ValueRotsVec[0].Repl32) {
+        for (unsigned i = 0, ie = ValueRotsVec.size(); i < ie; ++i)
+          if (ValueRotsVec[i].Repl32) {
+            if (ValueRotsVec[i].NumGroups > ValueRotsVec[0].NumGroups)
+              MaxGroupsIdx = i;
+            break;
+          }
+      }
+
+      ValueRotInfo &VRI = ValueRotsVec[MaxGroupsIdx];
+      bool NeedsRotate = false;
+      if (VRI.RLAmt) {
+        NeedsRotate = true;
+      } else if (VRI.Repl32) {
+        for (auto &BG : BitGroups) {
+          if (BG.V != VRI.V || BG.RLAmt != VRI.RLAmt ||
+              BG.Repl32 != VRI.Repl32)
+            continue;
+
+          // We don't need a rotate if the bit group is confined to the lower
+          // 32 bits.
+          if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx < BG.EndIdx)
+            continue;
+
+          NeedsRotate = true;
+          break;
+        }
+      }
+
+      if (NeedsRotate)
+        Res = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32,
+                              VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63,
+                              InstCnt);
+      else
+        Res = VRI.V;
+
+      // Now, remove all groups with this underlying value and rotation factor.
+      if (Res)
+        for (auto I = BitGroups.begin(); I != BitGroups.end();) {
+          if (I->V == VRI.V && I->RLAmt == VRI.RLAmt && I->Repl32 == VRI.Repl32)
+            I = BitGroups.erase(I);
+          else
+            ++I;
+        }
+    }
+
+    // Because 64-bit rotates are more flexible than inserts, we might have a
+    // preference regarding which one we do first (to save one instruction).
+    if (!Res)
+      for (auto I = BitGroups.begin(), IE = BitGroups.end(); I != IE; ++I) {
+        if (SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx,
+                                false) <
+            SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx,
+                                true)) {
+          if (I != BitGroups.begin()) {
+            BitGroup BG = *I;
+            BitGroups.erase(I);
+            BitGroups.insert(BitGroups.begin(), BG);
+          }
+
+          break;
+        }
+      }
+
+    // Insert the other groups (one at a time).
+    for (auto &BG : BitGroups) {
+      if (!Res)
+        Res = SelectRotMask64(BG.V, dl, BG.RLAmt, BG.Repl32, BG.StartIdx,
+                              BG.EndIdx, InstCnt);
+      else
+        Res = SelectRotMaskIns64(Res, BG.V, dl, BG.RLAmt, BG.Repl32,
+                                 BG.StartIdx, BG.EndIdx, InstCnt);
+    }
+
+    if (LateMask) {
+      uint64_t Mask = getZerosMask();
+
+      // We can use the 32-bit andi/andis technique if the mask does not
+      // require any higher-order bits. This can save an instruction compared
+      // to always using the general 64-bit technique.
+      bool Use32BitInsts = isUInt<32>(Mask);
+      // Compute the masks for andi/andis that would be necessary.
+      unsigned ANDIMask = (Mask & UINT16_MAX),
+               ANDISMask = (Mask >> 16) & UINT16_MAX;
+
+      if (Use32BitInsts) {
+        assert((ANDIMask != 0 || ANDISMask != 0) &&
+               "No set bits in mask when using 32-bit ands for 64-bit value");
+
+        if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) +
+                                 (unsigned) (ANDISMask != 0) +
+                                 (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+
+        SDValue ANDIVal, ANDISVal;
+        if (ANDIMask != 0)
+          ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
+                              Res, getI32Imm(ANDIMask)), 0);
+        if (ANDISMask != 0)
+          ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
+                               Res, getI32Imm(ANDISMask)), 0);
+
+        if (!ANDIVal)
+          Res = ANDISVal;
+        else if (!ANDISVal)
+          Res = ANDIVal;
+        else
+          Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+                          ANDIVal, ANDISVal), 0);
+      } else {
+        if (InstCnt) *InstCnt += SelectInt64Count(Mask) + /* and */ 1;
+
+        SDValue MaskVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0);
+        Res =
+          SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                         Res, MaskVal), 0);
+      }
+    }
+
+    return Res.getNode();
+  }
+
+  SDNode *Select(SDNode *N, bool LateMask, unsigned *InstCnt = nullptr) {
+    // Fill in BitGroups.
+    collectBitGroups(LateMask);
+    if (BitGroups.empty())
+      return nullptr;
+
+    // For 64-bit values, figure out when we can use 32-bit instructions.
+    if (Bits.size() == 64)
+      assignRepl32BitGroups();
+
+    // Fill in ValueRotsVec.
+    collectValueRotInfo();
+
+    if (Bits.size() == 32) {
+      return Select32(N, LateMask, InstCnt);
+    } else {
+      assert(Bits.size() == 64 && "Not 64 bits here?");
+      return Select64(N, LateMask, InstCnt);
+    }
+
+    return nullptr;
+  }
+
+  SmallVector<ValueBit, 64> Bits;
+
+  bool HasZeros;
+  SmallVector<unsigned, 64> RLAmt;
+
+  SmallVector<BitGroup, 16> BitGroups;
+
+  DenseMap<std::pair<SDValue, unsigned>, ValueRotInfo> ValueRots;
+  SmallVector<ValueRotInfo, 16> ValueRotsVec;
+
+  SelectionDAG *CurDAG;
+
+public:
+  BitPermutationSelector(SelectionDAG *DAG)
+    : CurDAG(DAG) {}
+
+  // Here we try to match complex bit permutations into a set of
+  // rotate-and-shift/shift/and/or instructions, using a set of heuristics
+  // known to produce optimial code for common cases (like i32 byte swapping).
+  SDNode *Select(SDNode *N) {
+    Bits.resize(N->getValueType(0).getSizeInBits());
+    if (!getValueBits(SDValue(N, 0), Bits))
+      return nullptr;
+
+    DEBUG(dbgs() << "Considering bit-permutation-based instruction"
+                    " selection for:    ");
+    DEBUG(N->dump(CurDAG));
+
+    // Fill it RLAmt and set HasZeros.
+    computeRotationAmounts();
+
+    if (!HasZeros)
+      return Select(N, false);
+
+    // We currently have two techniques for handling results with zeros: early
+    // masking (the default) and late masking. Late masking is sometimes more
+    // efficient, but because the structure of the bit groups is different, it
+    // is hard to tell without generating both and comparing the results. With
+    // late masking, we ignore zeros in the resulting value when inserting each
+    // set of bit groups, and then mask in the zeros at the end. With early
+    // masking, we only insert the non-zero parts of the result at every step.
+
+    unsigned InstCnt, InstCntLateMask;
+    DEBUG(dbgs() << "\tEarly masking:\n");
+    SDNode *RN = Select(N, false, &InstCnt);
+    DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n");
+
+    DEBUG(dbgs() << "\tLate masking:\n");
+    SDNode *RNLM = Select(N, true, &InstCntLateMask);
+    DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask <<
+                    " instructions\n");
+
+    if (InstCnt <= InstCntLateMask) {
+      DEBUG(dbgs() << "\tUsing early-masking for isel\n");
+      return RN;
+    }
+
+    DEBUG(dbgs() << "\tUsing late-masking for isel\n");
+    return RNLM;
+  }
+};
+} // anonymous namespace
+
+SDNode *PPCDAGToDAGISel::SelectBitPermutation(SDNode *N) {
+  if (N->getValueType(0) != MVT::i32 &&
+      N->getValueType(0) != MVT::i64)
+    return nullptr;
+
+  if (!UseBitPermRewriter)
+    return nullptr;
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::ROTL:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::AND:
+  case ISD::OR: {
+    BitPermutationSelector BPS(CurDAG);
+    return BPS.Select(N);
+  }
+  }
+
+  return nullptr;
+}
+
 /// SelectCC - Select a comparison of the specified values with the specified
 /// condition code, returning the CR# of the expression.
 SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
@@ -859,6 +2295,9 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
   // Altivec Vector compare instructions do not set any CR register by default and
   // vector compare operations return the same type as the operands.
   if (LHS.getValueType().isVector()) {
+    if (PPCSubTarget->hasQPX())
+      return nullptr;
+
     EVT VecVT = LHS.getValueType();
     bool Swap, Negate;
     unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC,
@@ -905,6 +2344,14 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
   return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1));
 }
 
+SDNode *PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+  return Result;
+}
+
 
 // Select - Convert the specified operand from a target-independent to a
 // target-specific node if it hasn't already been changed.
@@ -922,81 +2369,16 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
       N->getOperand(1).getOpcode() == ISD::TargetConstant)
     llvm_unreachable("Invalid ADD with TargetConstant operand");
 
+  // Try matching complex bit permutations before doing anything else.
+  if (SDNode *NN = SelectBitPermutation(N))
+    return NN;
+
   switch (N->getOpcode()) {
   default: break;
 
   case ISD::Constant: {
-    if (N->getValueType(0) == MVT::i64) {
-      // Get 64 bit value.
-      int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
-      // Assume no remaining bits.
-      unsigned Remainder = 0;
-      // Assume no shift required.
-      unsigned Shift = 0;
-
-      // If it can't be represented as a 32 bit value.
-      if (!isInt<32>(Imm)) {
-        Shift = countTrailingZeros<uint64_t>(Imm);
-        int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
-
-        // If the shifted value fits 32 bits.
-        if (isInt<32>(ImmSh)) {
-          // Go with the shifted value.
-          Imm = ImmSh;
-        } else {
-          // Still stuck with a 64 bit value.
-          Remainder = Imm;
-          Shift = 32;
-          Imm >>= 32;
-        }
-      }
-
-      // Intermediate operand.
-      SDNode *Result;
-
-      // Handle first 32 bits.
-      unsigned Lo = Imm & 0xFFFF;
-      unsigned Hi = (Imm >> 16) & 0xFFFF;
-
-      // Simple value.
-      if (isInt<16>(Imm)) {
-       // Just the Lo bits.
-        Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
-      } else if (Lo) {
-        // Handle the Hi bits.
-        unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
-        Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
-        // And Lo bits.
-        Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
-                                        SDValue(Result, 0), getI32Imm(Lo));
-      } else {
-       // Just the Hi bits.
-        Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
-      }
-
-      // If no shift, we're done.
-      if (!Shift) return Result;
-
-      // Shift for next step if the upper 32-bits were not zero.
-      if (Imm) {
-        Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
-                                        SDValue(Result, 0),
-                                        getI32Imm(Shift),
-                                        getI32Imm(63 - Shift));
-      }
-
-      // Add in the last bits as required.
-      if ((Hi = (Remainder >> 16) & 0xFFFF)) {
-        Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
-                                        SDValue(Result, 0), getI32Imm(Hi));
-      }
-      if ((Lo = Remainder & 0xFFFF)) {
-        Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
-                                        SDValue(Result, 0), getI32Imm(Lo));
-      }
-
-      return Result;
-    }
+    if (N->getValueType(0) == MVT::i64)
+      return SelectInt64(CurDAG, N);
     break;
   }
 
@@ -1009,16 +2391,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
   case PPCISD::GlobalBaseReg:
     return getGlobalBaseReg();
 
-  case ISD::FrameIndex: {
-    int FI = cast<FrameIndexSDNode>(N)->getIndex();
-    SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
-    unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
-    if (N->hasOneUse())
-      return CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), TFI,
-                                  getSmallIPtrImm(0));
-    return CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
-                                  getSmallIPtrImm(0));
-  }
+  case ISD::FrameIndex:
+    return getFrameIndex(N, N);
 
   case PPCISD::MFOCRF: {
     SDValue InFlag = N->getOperand(1);
@@ -1026,35 +2400,31 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
                                   N->getOperand(0), InFlag);
   }
 
-  case ISD::SDIV: {
-    // FIXME: since this depends on the setting of the carry flag from the srawi
-    //        we should really be making notes about that for the scheduler.
-    // FIXME: It sure would be nice if we could cheaply recognize the
-    //        srl/add/sra pattern the dag combiner will generate for this as
-    //        sra/addze rather than having to handle sdiv ourselves.  oh well.
-    unsigned Imm;
-    if (isInt32Immediate(N->getOperand(1), Imm)) {
-      SDValue N0 = N->getOperand(0);
-      if ((signed)Imm > 0 && isPowerOf2_32(Imm)) {
-        SDNode *Op =
-          CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
-                                 N0, getI32Imm(Log2_32(Imm)));
-        return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
-                                    SDValue(Op, 0), SDValue(Op, 1));
-      } else if ((signed)Imm < 0 && isPowerOf2_32(-Imm)) {
-        SDNode *Op =
-          CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
-                                 N0, getI32Imm(Log2_32(-Imm)));
-        SDValue PT =
-          SDValue(CurDAG->getMachineNode(PPC::ADDZE, dl, MVT::i32,
-                                         SDValue(Op, 0), SDValue(Op, 1)),
-                    0);
-        return CurDAG->SelectNodeTo(N, PPC::NEG, MVT::i32, PT);
-      }
-    }
+  case PPCISD::READ_TIME_BASE: {
+    return CurDAG->getMachineNode(PPC::ReadTB, dl, MVT::i32, MVT::i32,
+                                  MVT::Other, N->getOperand(0));
+  }
 
-    // Other cases are autogenerated.
-    break;
+  case PPCISD::SRA_ADDZE: {
+    SDValue N0 = N->getOperand(0);
+    SDValue ShiftAmt =
+      CurDAG->getTargetConstant(*cast<ConstantSDNode>(N->getOperand(1))->
+                                  getConstantIntValue(), N->getValueType(0));
+    if (N->getValueType(0) == MVT::i64) {
+      SDNode *Op =
+        CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, MVT::Glue,
+                               N0, ShiftAmt);
+      return CurDAG->SelectNodeTo(N, PPC::ADDZE8, MVT::i64,
+                                  SDValue(Op, 0), SDValue(Op, 1));
+    } else {
+      assert(N->getValueType(0) == MVT::i32 &&
+             "Expecting i64 or i32 in PPCISD::SRA_ADDZE");
+      SDNode *Op =
+        CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
+                               N0, ShiftAmt);
+      return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
+                                  SDValue(Op, 0), SDValue(Op, 1));
+    }
   }
 
   case ISD::LOAD: {
@@ -1100,9 +2470,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
       SDValue Chain = LD->getChain();
       SDValue Base = LD->getBasePtr();
       SDValue Ops[] = { Offset, Base, Chain };
-      return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
-                                    PPCLowering->getPointerTy(),
-                                    MVT::Other, Ops);
+      return transferMemOperands(N, CurDAG->getMachineNode(Opcode, dl,
+                                      LD->getValueType(0),
+                                      PPCLowering->getPointerTy(),
+                                      MVT::Other, Ops));
     } else {
       unsigned Opcode;
       bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
@@ -1111,6 +2482,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
         assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
         switch (LoadedVT.getSimpleVT().SimpleTy) {
           default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX
+          case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX
           case MVT::f64: Opcode = PPC::LFDUX; break;
           case MVT::f32: Opcode = PPC::LFSUX; break;
           case MVT::i32: Opcode = PPC::LWZUX; break;
@@ -1135,9 +2508,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
       SDValue Chain = LD->getChain();
       SDValue Base = LD->getBasePtr();
       SDValue Ops[] = { Base, Offset, Chain };
-      return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
-                                    PPCLowering->getPointerTy(),
-                                    MVT::Other, Ops);
+      return transferMemOperands(N, CurDAG->getMachineNode(Opcode, dl,
+                                      LD->getValueType(0),
+                                      PPCLowering->getPointerTy(),
+                                      MVT::Other, Ops));
     }
   }
 
@@ -1166,7 +2540,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
         isMask_64(Imm64)) {
       SDValue Val = N->getOperand(0);
-      MB = 64 - CountTrailingOnes_64(Imm64);
+      MB = 64 - countTrailingOnes(Imm64);
       SH = 0;
 
       // If the operand is a logical right shift, we can fold it into this
@@ -1207,13 +2581,34 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     // Other cases are autogenerated.
     break;
   }
-  case ISD::OR:
+  case ISD::OR: {
     if (N->getValueType(0) == MVT::i32)
       if (SDNode *I = SelectBitfieldInsert(N))
         return I;
 
+    short Imm;
+    if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
+        isIntS16Immediate(N->getOperand(1), Imm)) {
+      APInt LHSKnownZero, LHSKnownOne;
+      CurDAG->computeKnownBits(N->getOperand(0), LHSKnownZero, LHSKnownOne);
+
+      // If this is equivalent to an add, then we can fold it with the
+      // FrameIndex calculation.
+      if ((LHSKnownZero.getZExtValue()|~(uint64_t)Imm) == ~0ULL)
+        return getFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+    }
+
     // Other cases are autogenerated.
     break;
+  }
+  case ISD::ADD: {
+    short Imm;
+    if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
+        isIntS16Immediate(N->getOperand(1), Imm))
+      return getFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+
+    break;
+  }
   case ISD::SHL: {
     unsigned Imm, SH, MB, ME;
     if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
@@ -1333,6 +2728,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
         SelectCCOp = PPC::SELECT_CC_VSFRC;
       else
         SelectCCOp = PPC::SELECT_CC_F8;
+    else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f64)
+      SelectCCOp = PPC::SELECT_CC_QFRC;
+    else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f32)
+      SelectCCOp = PPC::SELECT_CC_QSRC;
+    else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4i1)
+      SelectCCOp = PPC::SELECT_CC_QBRC;
     else if (N->getValueType(0) == MVT::v2f64 ||
              N->getValueType(0) == MVT::v2i64)
       SelectCCOp = PPC::SELECT_CC_VSRC;
@@ -1365,6 +2766,15 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
         else
           DM[i] = 1;
 
+      // For little endian, we must swap the input operands and adjust
+      // the mask elements (reverse and invert them).
+      if (PPCSubTarget->isLittleEndian()) {
+        std::swap(Op1, Op2);
+        unsigned tmp = DM[0];
+        DM[0] = 1 - DM[1];
+        DM[1] = 1 - tmp;
+      }
+
       SDValue DMV = CurDAG->getTargetConstant(DM[1] | (DM[0] << 1), MVT::i32);
 
       if (Op1 == Op2 && DM[0] == 0 && DM[1] == 0 &&
@@ -1453,8 +2863,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
             "Only supported for 64-bit ABI and 32-bit SVR4");
     if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) {
       SDValue GA = N->getOperand(0);
-      return CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA,
-                                    N->getOperand(1));
+      return transferMemOperands(N, CurDAG->getMachineNode(PPC::LWZtoc, dl,
+                                      MVT::i32, GA, N->getOperand(1)));
     }
 
     // For medium and large code model, we generate two instructions as
@@ -1474,12 +2884,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     SDValue GA = N->getOperand(0);
     SDValue TOCbase = N->getOperand(1);
     SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
-                                        TOCbase, GA);
+                                         TOCbase, GA);
 
     if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA) ||
         CModel == CodeModel::Large)
-      return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
-                                    SDValue(Tmp, 0));
+      return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl,
+                                      MVT::i64, GA, SDValue(Tmp, 0)));
 
     if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
       const GlobalValue *GValue = G->getGlobal();
@@ -1487,8 +2897,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
            (GValue->isDeclaration() || GValue->isWeakForLinker())) ||
           GValue->isDeclaration() || GValue->hasCommonLinkage() ||
           GValue->hasAvailableExternallyLinkage())
-        return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
-                                      SDValue(Tmp, 0));
+        return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl,
+                                        MVT::i64, GA, SDValue(Tmp, 0)));
     }
 
     return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
@@ -1576,6 +2986,324 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
   return SelectCode(N);
 }
 
+// If the target supports the cmpb instruction, do the idiom recognition here.
+// We don't do this as a DAG combine because we don't want to do it as nodes
+// are being combined (because we might miss part of the eventual idiom). We
+// don't want to do it during instruction selection because we want to reuse
+// the logic for lowering the masking operations already part of the
+// instruction selector.
+SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
+  SDLoc dl(N);
+
+  assert(N->getOpcode() == ISD::OR &&
+         "Only OR nodes are supported for CMPB");
+
+  SDValue Res;
+  if (!PPCSubTarget->hasCMPB())
+    return Res;
+
+  if (N->getValueType(0) != MVT::i32 &&
+      N->getValueType(0) != MVT::i64)
+    return Res;
+
+  EVT VT = N->getValueType(0);
+
+  SDValue RHS, LHS;
+  bool BytesFound[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+  uint64_t Mask = 0, Alt = 0;
+
+  auto IsByteSelectCC = [this](SDValue O, unsigned &b,
+                               uint64_t &Mask, uint64_t &Alt,
+                               SDValue &LHS, SDValue &RHS) {
+    if (O.getOpcode() != ISD::SELECT_CC)
+      return false;
+    ISD::CondCode CC = cast<CondCodeSDNode>(O.getOperand(4))->get();
+
+    if (!isa<ConstantSDNode>(O.getOperand(2)) ||
+        !isa<ConstantSDNode>(O.getOperand(3)))
+      return false;
+
+    uint64_t PM = O.getConstantOperandVal(2);
+    uint64_t PAlt = O.getConstantOperandVal(3);
+    for (b = 0; b < 8; ++b) {
+      uint64_t Mask = UINT64_C(0xFF) << (8*b);
+      if (PM && (PM & Mask) == PM && (PAlt & Mask) == PAlt)
+        break;
+    }
+
+    if (b == 8)
+      return false;
+    Mask |= PM;
+    Alt  |= PAlt;
+
+    if (!isa<ConstantSDNode>(O.getOperand(1)) ||
+        O.getConstantOperandVal(1) != 0) {
+      SDValue Op0 = O.getOperand(0), Op1 = O.getOperand(1);
+      if (Op0.getOpcode() == ISD::TRUNCATE)
+        Op0 = Op0.getOperand(0);
+      if (Op1.getOpcode() == ISD::TRUNCATE)
+        Op1 = Op1.getOperand(0);
+
+      if (Op0.getOpcode() == ISD::SRL && Op1.getOpcode() == ISD::SRL &&
+          Op0.getOperand(1) == Op1.getOperand(1) && CC == ISD::SETEQ &&
+          isa<ConstantSDNode>(Op0.getOperand(1))) {
+
+        unsigned Bits = Op0.getValueType().getSizeInBits();
+        if (b != Bits/8-1)
+          return false;
+        if (Op0.getConstantOperandVal(1) != Bits-8)
+          return false;
+
+        LHS = Op0.getOperand(0);
+        RHS = Op1.getOperand(0);
+        return true;
+      }
+
+      // When we have small integers (i16 to be specific), the form present
+      // post-legalization uses SETULT in the SELECT_CC for the
+      // higher-order byte, depending on the fact that the
+      // even-higher-order bytes are known to all be zero, for example:
+      //   select_cc (xor $lhs, $rhs), 256, 65280, 0, setult
+      // (so when the second byte is the same, because all higher-order
+      // bits from bytes 3 and 4 are known to be zero, the result of the
+      // xor can be at most 255)
+      if (Op0.getOpcode() == ISD::XOR && CC == ISD::SETULT &&
+          isa<ConstantSDNode>(O.getOperand(1))) {
+
+        uint64_t ULim = O.getConstantOperandVal(1);
+        if (ULim != (UINT64_C(1) << b*8))
+          return false;
+
+        // Now we need to make sure that the upper bytes are known to be
+        // zero.
+        unsigned Bits = Op0.getValueType().getSizeInBits();
+        if (!CurDAG->MaskedValueIsZero(Op0,
+              APInt::getHighBitsSet(Bits, Bits - (b+1)*8)))
+          return false;
+        
+        LHS = Op0.getOperand(0);
+        RHS = Op0.getOperand(1);
+        return true;
+      }
+
+      return false;
+    }
+
+    if (CC != ISD::SETEQ)
+      return false;
+
+    SDValue Op = O.getOperand(0);
+    if (Op.getOpcode() == ISD::AND) {
+      if (!isa<ConstantSDNode>(Op.getOperand(1)))
+        return false;
+      if (Op.getConstantOperandVal(1) != (UINT64_C(0xFF) << (8*b)))
+        return false;
+
+      SDValue XOR = Op.getOperand(0);
+      if (XOR.getOpcode() == ISD::TRUNCATE)
+        XOR = XOR.getOperand(0);
+      if (XOR.getOpcode() != ISD::XOR)
+        return false;
+
+      LHS = XOR.getOperand(0);
+      RHS = XOR.getOperand(1);
+      return true;
+    } else if (Op.getOpcode() == ISD::SRL) {
+      if (!isa<ConstantSDNode>(Op.getOperand(1)))
+        return false;
+      unsigned Bits = Op.getValueType().getSizeInBits();
+      if (b != Bits/8-1)
+        return false;
+      if (Op.getConstantOperandVal(1) != Bits-8)
+        return false;
+
+      SDValue XOR = Op.getOperand(0);
+      if (XOR.getOpcode() == ISD::TRUNCATE)
+        XOR = XOR.getOperand(0);
+      if (XOR.getOpcode() != ISD::XOR)
+        return false;
+
+      LHS = XOR.getOperand(0);
+      RHS = XOR.getOperand(1);
+      return true;
+    }
+
+    return false;
+  };
+
+  SmallVector<SDValue, 8> Queue(1, SDValue(N, 0));
+  while (!Queue.empty()) {
+    SDValue V = Queue.pop_back_val();
+
+    for (const SDValue &O : V.getNode()->ops()) {
+      unsigned b;
+      uint64_t M = 0, A = 0;
+      SDValue OLHS, ORHS;
+      if (O.getOpcode() == ISD::OR) {
+        Queue.push_back(O);
+      } else if (IsByteSelectCC(O, b, M, A, OLHS, ORHS)) {
+        if (!LHS) {
+          LHS = OLHS;
+          RHS = ORHS;
+          BytesFound[b] = true;
+          Mask |= M;
+          Alt  |= A;
+        } else if ((LHS == ORHS && RHS == OLHS) ||
+                   (RHS == ORHS && LHS == OLHS)) {
+          BytesFound[b] = true;
+          Mask |= M;
+          Alt  |= A;
+        } else {
+          return Res;
+        }
+      } else {
+        return Res;
+      }
+    }
+  }
+
+  unsigned LastB = 0, BCnt = 0;
+  for (unsigned i = 0; i < 8; ++i)
+    if (BytesFound[LastB]) {
+      ++BCnt;
+      LastB = i;
+    }
+
+  if (!LastB || BCnt < 2)
+    return Res;
+
+  // Because we'll be zero-extending the output anyway if don't have a specific
+  // value for each input byte (via the Mask), we can 'anyext' the inputs.
+  if (LHS.getValueType() != VT) {
+    LHS = CurDAG->getAnyExtOrTrunc(LHS, dl, VT);
+    RHS = CurDAG->getAnyExtOrTrunc(RHS, dl, VT);
+  }
+
+  Res = CurDAG->getNode(PPCISD::CMPB, dl, VT, LHS, RHS);
+
+  bool NonTrivialMask = ((int64_t) Mask) != INT64_C(-1);
+  if (NonTrivialMask && !Alt) {
+    // Res = Mask & CMPB
+    Res = CurDAG->getNode(ISD::AND, dl, VT, Res, CurDAG->getConstant(Mask, VT));
+  } else if (Alt) {
+    // Res = (CMPB & Mask) | (~CMPB & Alt)
+    // Which, as suggested here:
+    //   https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge
+    // can be written as:
+    // Res = Alt ^ ((Alt ^ Mask) & CMPB)
+    // useful because the (Alt ^ Mask) can be pre-computed.
+    Res = CurDAG->getNode(ISD::AND, dl, VT, Res,
+                          CurDAG->getConstant(Mask ^ Alt, VT));
+    Res = CurDAG->getNode(ISD::XOR, dl, VT, Res, CurDAG->getConstant(Alt, VT));
+  }
+
+  return Res;
+}
+
+// When CR bit registers are enabled, an extension of an i1 variable to a i32
+// or i64 value is lowered in terms of a SELECT_I[48] operation, and thus
+// involves constant materialization of a 0 or a 1 or both. If the result of
+// the extension is then operated upon by some operator that can be constant
+// folded with a constant 0 or 1, and that constant can be materialized using
+// only one instruction (like a zero or one), then we should fold in those
+// operations with the select.
+void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
+  if (!PPCSubTarget->useCRBits())
+    return;
+
+  if (N->getOpcode() != ISD::ZERO_EXTEND &&
+      N->getOpcode() != ISD::SIGN_EXTEND &&
+      N->getOpcode() != ISD::ANY_EXTEND)
+    return;
+
+  if (N->getOperand(0).getValueType() != MVT::i1)
+    return;
+
+  if (!N->hasOneUse())
+    return;
+
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Cond = N->getOperand(0);
+  SDValue ConstTrue =
+    CurDAG->getConstant(N->getOpcode() == ISD::SIGN_EXTEND ? -1 : 1, VT);
+  SDValue ConstFalse = CurDAG->getConstant(0, VT);
+
+  do {
+    SDNode *User = *N->use_begin();
+    if (User->getNumOperands() != 2)
+      break;
+
+    auto TryFold = [this, N, User](SDValue Val) {
+      SDValue UserO0 = User->getOperand(0), UserO1 = User->getOperand(1);
+      SDValue O0 = UserO0.getNode() == N ? Val : UserO0;
+      SDValue O1 = UserO1.getNode() == N ? Val : UserO1;
+
+      return CurDAG->FoldConstantArithmetic(User->getOpcode(),
+                                            User->getValueType(0),
+                                            O0.getNode(), O1.getNode());
+    };
+
+    SDValue TrueRes = TryFold(ConstTrue);
+    if (!TrueRes)
+      break;
+    SDValue FalseRes = TryFold(ConstFalse);
+    if (!FalseRes)
+      break;
+
+    // For us to materialize these using one instruction, we must be able to
+    // represent them as signed 16-bit integers.
+    uint64_t True  = cast<ConstantSDNode>(TrueRes)->getZExtValue(),
+             False = cast<ConstantSDNode>(FalseRes)->getZExtValue();
+    if (!isInt<16>(True) || !isInt<16>(False))
+      break;
+
+    // We can replace User with a new SELECT node, and try again to see if we
+    // can fold the select with its user.
+    Res = CurDAG->getSelect(dl, User->getValueType(0), Cond, TrueRes, FalseRes);
+    N = User;
+    ConstTrue = TrueRes;
+    ConstFalse = FalseRes;
+  } while (N->hasOneUse());
+}
+
+void PPCDAGToDAGISel::PreprocessISelDAG() {
+  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+  ++Position;
+
+  bool MadeChange = false;
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = --Position;
+    if (N->use_empty())
+      continue;
+
+    SDValue Res;
+    switch (N->getOpcode()) {
+    default: break;
+    case ISD::OR:
+      Res = combineToCMPB(N);
+      break;
+    }
+
+    if (!Res)
+      foldBoolExts(Res, N);
+
+    if (Res) {
+      DEBUG(dbgs() << "PPC DAG preprocessing replacing:\nOld:    ");
+      DEBUG(N->dump(CurDAG));
+      DEBUG(dbgs() << "\nNew: ");
+      DEBUG(Res.getNode()->dump(CurDAG));
+      DEBUG(dbgs() << "\n");
+
+      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+      MadeChange = true;
+    }
+  }
+
+  if (MadeChange)
+    CurDAG->RemoveDeadNodes();
+}
+
 /// PostprocessISelDAG - Perform some late peephole optimizations
 /// on the DAG representation.
 void PPCDAGToDAGISel::PostprocessISelDAG() {
@@ -1586,6 +3314,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
 
   PeepholePPC64();
   PeepholeCROps();
+  PeepholePPC64ZExt();
 }
 
 // Check if all users of this node will become isel where the second operand
@@ -1700,6 +3429,9 @@ void PPCDAGToDAGISel::PeepholeCROps() {
       case PPC::SELECT_I8:
       case PPC::SELECT_F4:
       case PPC::SELECT_F8:
+      case PPC::SELECT_QFRC:
+      case PPC::SELECT_QSRC:
+      case PPC::SELECT_QBRC:
       case PPC::SELECT_VRRC:
       case PPC::SELECT_VSFRC:
       case PPC::SELECT_VSRC: {
@@ -2007,6 +3739,9 @@ void PPCDAGToDAGISel::PeepholeCROps() {
       case PPC::SELECT_I8:
       case PPC::SELECT_F4:
       case PPC::SELECT_F8:
+      case PPC::SELECT_QFRC:
+      case PPC::SELECT_QSRC:
+      case PPC::SELECT_QBRC:
       case PPC::SELECT_VRRC:
       case PPC::SELECT_VSFRC:
       case PPC::SELECT_VSRC:
@@ -2059,6 +3794,315 @@ void PPCDAGToDAGISel::PeepholeCROps() {
   } while (IsModified);
 }
 
+// Gather the set of 32-bit operations that are known to have their
+// higher-order 32 bits zero, where ToPromote contains all such operations.
+static bool PeepholePPC64ZExtGather(SDValue Op32,
+                                    SmallPtrSetImpl<SDNode *> &ToPromote) {
+  if (!Op32.isMachineOpcode())
+    return false;
+
+  // First, check for the "frontier" instructions (those that will clear the
+  // higher-order 32 bits.
+
+  // For RLWINM and RLWNM, we need to make sure that the mask does not wrap
+  // around. If it does not, then these instructions will clear the
+  // higher-order bits.
+  if ((Op32.getMachineOpcode() == PPC::RLWINM ||
+       Op32.getMachineOpcode() == PPC::RLWNM) &&
+      Op32.getConstantOperandVal(2) <= Op32.getConstantOperandVal(3)) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // SLW and SRW always clear the higher-order bits.
+  if (Op32.getMachineOpcode() == PPC::SLW ||
+      Op32.getMachineOpcode() == PPC::SRW) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // For LI and LIS, we need the immediate to be positive (so that it is not
+  // sign extended).
+  if (Op32.getMachineOpcode() == PPC::LI ||
+      Op32.getMachineOpcode() == PPC::LIS) {
+    if (!isUInt<15>(Op32.getConstantOperandVal(0)))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // LHBRX and LWBRX always clear the higher-order bits.
+  if (Op32.getMachineOpcode() == PPC::LHBRX ||
+      Op32.getMachineOpcode() == PPC::LWBRX) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // CNTLZW always produces a 64-bit value in [0,32], and so is zero extended.
+  if (Op32.getMachineOpcode() == PPC::CNTLZW) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // Next, check for those instructions we can look through.
+
+  // Assuming the mask does not wrap around, then the higher-order bits are
+  // taken directly from the first operand.
+  if (Op32.getMachineOpcode() == PPC::RLWIMI &&
+      Op32.getConstantOperandVal(3) <= Op32.getConstantOperandVal(4)) {
+    SmallPtrSet<SDNode *, 16> ToPromote1;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+    return true;
+  }
+
+  // For OR, the higher-order bits are zero if that is true for both operands.
+  // For SELECT_I4, the same is true (but the relevant operand numbers are
+  // shifted by 1).
+  if (Op32.getMachineOpcode() == PPC::OR ||
+      Op32.getMachineOpcode() == PPC::SELECT_I4) {
+    unsigned B = Op32.getMachineOpcode() == PPC::SELECT_I4 ? 1 : 0;
+    SmallPtrSet<SDNode *, 16> ToPromote1;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(B+0), ToPromote1))
+      return false;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(B+1), ToPromote1))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+    return true;
+  }
+
+  // For ORI and ORIS, we need the higher-order bits of the first operand to be
+  // zero, and also for the constant to be positive (so that it is not sign
+  // extended).
+  if (Op32.getMachineOpcode() == PPC::ORI ||
+      Op32.getMachineOpcode() == PPC::ORIS) {
+    SmallPtrSet<SDNode *, 16> ToPromote1;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
+      return false;
+    if (!isUInt<15>(Op32.getConstantOperandVal(1)))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+    return true;
+  }
+
+  // The higher-order bits of AND are zero if that is true for at least one of
+  // the operands.
+  if (Op32.getMachineOpcode() == PPC::AND) {
+    SmallPtrSet<SDNode *, 16> ToPromote1, ToPromote2;
+    bool Op0OK =
+      PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
+    bool Op1OK =
+      PeepholePPC64ZExtGather(Op32.getOperand(1), ToPromote2);
+    if (!Op0OK && !Op1OK)
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+
+    if (Op0OK)
+      ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+
+    if (Op1OK)
+      ToPromote.insert(ToPromote2.begin(), ToPromote2.end());
+
+    return true;
+  }
+
+  // For ANDI and ANDIS, the higher-order bits are zero if either that is true
+  // of the first operand, or if the second operand is positive (so that it is
+  // not sign extended).
+  if (Op32.getMachineOpcode() == PPC::ANDIo ||
+      Op32.getMachineOpcode() == PPC::ANDISo) {
+    SmallPtrSet<SDNode *, 16> ToPromote1;
+    bool Op0OK =
+      PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
+    bool Op1OK = isUInt<15>(Op32.getConstantOperandVal(1));
+    if (!Op0OK && !Op1OK)
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+
+    if (Op0OK)
+      ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+
+    return true;
+  }
+
+  return false;
+}
+
+void PPCDAGToDAGISel::PeepholePPC64ZExt() {
+  if (!PPCSubTarget->isPPC64())
+    return;
+
+  // When we zero-extend from i32 to i64, we use a pattern like this:
+  // def : Pat<(i64 (zext i32:$in)),
+  //           (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
+  //                   0, 32)>;
+  // There are several 32-bit shift/rotate instructions, however, that will
+  // clear the higher-order bits of their output, rendering the RLDICL
+  // unnecessary. When that happens, we remove it here, and redefine the
+  // relevant 32-bit operation to be a 64-bit operation.
+
+  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+  ++Position;
+
+  bool MadeChange = false;
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = --Position;
+    // Skip dead nodes and any non-machine opcodes.
+    if (N->use_empty() || !N->isMachineOpcode())
+      continue;
+
+    if (N->getMachineOpcode() != PPC::RLDICL)
+      continue;
+
+    if (N->getConstantOperandVal(1) != 0 ||
+        N->getConstantOperandVal(2) != 32)
+      continue;
+
+    SDValue ISR = N->getOperand(0);
+    if (!ISR.isMachineOpcode() ||
+        ISR.getMachineOpcode() != TargetOpcode::INSERT_SUBREG)
+      continue;
+
+    if (!ISR.hasOneUse())
+      continue;
+
+    if (ISR.getConstantOperandVal(2) != PPC::sub_32)
+      continue;
+
+    SDValue IDef = ISR.getOperand(0);
+    if (!IDef.isMachineOpcode() ||
+        IDef.getMachineOpcode() != TargetOpcode::IMPLICIT_DEF)
+      continue;
+
+    // We now know that we're looking at a canonical i32 -> i64 zext. See if we
+    // can get rid of it.
+
+    SDValue Op32 = ISR->getOperand(1);
+    if (!Op32.isMachineOpcode())
+      continue;
+
+    // There are some 32-bit instructions that always clear the high-order 32
+    // bits, there are also some instructions (like AND) that we can look
+    // through.
+    SmallPtrSet<SDNode *, 16> ToPromote;
+    if (!PeepholePPC64ZExtGather(Op32, ToPromote))
+      continue;
+
+    // If the ToPromote set contains nodes that have uses outside of the set
+    // (except for the original INSERT_SUBREG), then abort the transformation.
+    bool OutsideUse = false;
+    for (SDNode *PN : ToPromote) {
+      for (SDNode *UN : PN->uses()) {
+        if (!ToPromote.count(UN) && UN != ISR.getNode()) {
+          OutsideUse = true;
+          break;
+        }
+      }
+
+      if (OutsideUse)
+        break;
+    }
+    if (OutsideUse)
+      continue;
+
+    MadeChange = true;
+
+    // We now know that this zero extension can be removed by promoting to
+    // nodes in ToPromote to 64-bit operations, where for operations in the
+    // frontier of the set, we need to insert INSERT_SUBREGs for their
+    // operands.
+    for (SDNode *PN : ToPromote) {
+      unsigned NewOpcode;
+      switch (PN->getMachineOpcode()) {
+      default:
+        llvm_unreachable("Don't know the 64-bit variant of this instruction");
+      case PPC::RLWINM:    NewOpcode = PPC::RLWINM8; break;
+      case PPC::RLWNM:     NewOpcode = PPC::RLWNM8; break;
+      case PPC::SLW:       NewOpcode = PPC::SLW8; break;
+      case PPC::SRW:       NewOpcode = PPC::SRW8; break;
+      case PPC::LI:        NewOpcode = PPC::LI8; break;
+      case PPC::LIS:       NewOpcode = PPC::LIS8; break;
+      case PPC::LHBRX:     NewOpcode = PPC::LHBRX8; break;
+      case PPC::LWBRX:     NewOpcode = PPC::LWBRX8; break;
+      case PPC::CNTLZW:    NewOpcode = PPC::CNTLZW8; break;
+      case PPC::RLWIMI:    NewOpcode = PPC::RLWIMI8; break;
+      case PPC::OR:        NewOpcode = PPC::OR8; break;
+      case PPC::SELECT_I4: NewOpcode = PPC::SELECT_I8; break;
+      case PPC::ORI:       NewOpcode = PPC::ORI8; break;
+      case PPC::ORIS:      NewOpcode = PPC::ORIS8; break;
+      case PPC::AND:       NewOpcode = PPC::AND8; break;
+      case PPC::ANDIo:     NewOpcode = PPC::ANDIo8; break;
+      case PPC::ANDISo:    NewOpcode = PPC::ANDISo8; break;
+      }
+
+      // Note: During the replacement process, the nodes will be in an
+      // inconsistent state (some instructions will have operands with values
+      // of the wrong type). Once done, however, everything should be right
+      // again.
+
+      SmallVector<SDValue, 4> Ops;
+      for (const SDValue &V : PN->ops()) {
+        if (!ToPromote.count(V.getNode()) && V.getValueType() == MVT::i32 &&
+            !isa<ConstantSDNode>(V)) {
+          SDValue ReplOpOps[] = { ISR.getOperand(0), V, ISR.getOperand(2) };
+          SDNode *ReplOp =
+            CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(V),
+                                   ISR.getNode()->getVTList(), ReplOpOps);
+          Ops.push_back(SDValue(ReplOp, 0));
+        } else {
+          Ops.push_back(V);
+        }
+      }
+
+      // Because all to-be-promoted nodes only have users that are other
+      // promoted nodes (or the original INSERT_SUBREG), we can safely replace
+      // the i32 result value type with i64.
+
+      SmallVector<EVT, 2> NewVTs;
+      SDVTList VTs = PN->getVTList();
+      for (unsigned i = 0, ie = VTs.NumVTs; i != ie; ++i)
+        if (VTs.VTs[i] == MVT::i32)
+          NewVTs.push_back(MVT::i64);
+        else
+          NewVTs.push_back(VTs.VTs[i]);
+
+      DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld:    ");
+      DEBUG(PN->dump(CurDAG));
+
+      CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops);
+
+      DEBUG(dbgs() << "\nNew: ");
+      DEBUG(PN->dump(CurDAG));
+      DEBUG(dbgs() << "\n");
+    }
+
+    // Now we replace the original zero extend and its associated INSERT_SUBREG
+    // with the value feeding the INSERT_SUBREG (which has now been promoted to
+    // return an i64).
+
+    DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld:    ");
+    DEBUG(N->dump(CurDAG));
+    DEBUG(dbgs() << "\nNew: ");
+    DEBUG(Op32.getNode()->dump(CurDAG));
+    DEBUG(dbgs() << "\n");
+
+    ReplaceUses(N, Op32.getNode());
+  }
+
+  if (MadeChange)
+    CurDAG->RemoveDeadNodes();
+}
+
 void PPCDAGToDAGISel::PeepholePPC64() {
   // These optimizations are currently supported only for 64-bit SVR4.
   if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index e93bdaf..147e94b 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -13,6 +13,7 @@
 
 #include "PPCISelLowering.h"
 #include "MCTargetDesc/PPCPredicates.h"
+#include "PPCCallingConv.h"
 #include "PPCMachineFunctionInfo.h"
 #include "PPCPerfectShuffle.h"
 #include "PPCTargetMachine.h"
@@ -24,6 +25,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@@ -55,11 +57,9 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;
 
-PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
-    : TargetLowering(TM),
-      Subtarget(*TM.getSubtargetImpl()) {
-  setPow2SDivIsCheap();
-
+PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
+                                     const PPCSubtarget &STI)
+    : TargetLowering(TM), Subtarget(STI) {
   // Use _setjmp/_longjmp instead of setjmp/longjmp.
   setUseUnderscoreSetJmp(true);
   setUseUnderscoreLongJmp(true);
@@ -75,8 +75,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
   addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
 
   // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+  }
 
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 
@@ -86,11 +88,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
   setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
   setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
   setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
   setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
   setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
   setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
   setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
   setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
 
   if (Subtarget.useCRBits()) {
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
@@ -115,12 +121,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
     if (ANDIGlueBug)
       setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
 
-    setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
-    setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
-    setTruncStoreAction(MVT::i64, MVT::i1, Expand);
-    setTruncStoreAction(MVT::i32, MVT::i1, Expand);
-    setTruncStoreAction(MVT::i16, MVT::i1, Expand);
-    setTruncStoreAction(MVT::i8, MVT::i1, Expand);
+    for (MVT VT : MVT::integer_valuetypes()) {
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+      setTruncStoreAction(VT, MVT::i1, Expand);
+    }
 
     addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
   }
@@ -171,13 +176,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
 
   // If we're enabling GP optimizations, use hardware square root
   if (!Subtarget.hasFSQRT() &&
-      !(TM.Options.UnsafeFPMath &&
-        Subtarget.hasFRSQRTE() && Subtarget.hasFRE()))
+      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
+        Subtarget.hasFRE()))
     setOperationAction(ISD::FSQRT, MVT::f64, Expand);
 
   if (!Subtarget.hasFSQRT() &&
-      !(TM.Options.UnsafeFPMath &&
-        Subtarget.hasFRSQRTES() && Subtarget.hasFRES()))
+      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
+        Subtarget.hasFRES()))
     setOperationAction(ISD::FSQRT, MVT::f32, Expand);
 
   if (Subtarget.hasFCPSGN()) {
@@ -395,14 +400,21 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
   if (Subtarget.hasAltivec()) {
     // First set operation action for all vector types to expand. Then we
     // will selectively turn on ones that can be effectively codegen'd.
-    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
-      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
-
+    for (MVT VT : MVT::vector_valuetypes()) {
       // add/sub are legal for all supported vector VT's.
       setOperationAction(ISD::ADD , VT, Legal);
       setOperationAction(ISD::SUB , VT, Legal);
 
+      // Vector instructions introduced in P8
+      if (Subtarget.hasP8Altivec()) {
+        setOperationAction(ISD::CTPOP, VT, Legal);
+        setOperationAction(ISD::CTLZ, VT, Legal);
+      }
+      else {
+        setOperationAction(ISD::CTPOP, VT, Expand);
+        setOperationAction(ISD::CTLZ, VT, Expand);
+      }
+
       // We promote all shuffles to v16i8.
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
       AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
@@ -457,22 +469,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
       setOperationAction(ISD::FPOW, VT, Expand);
       setOperationAction(ISD::BSWAP, VT, Expand);
-      setOperationAction(ISD::CTPOP, VT, Expand);
-      setOperationAction(ISD::CTLZ, VT, Expand);
       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
       setOperationAction(ISD::CTTZ, VT, Expand);
       setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
       setOperationAction(ISD::VSELECT, VT, Expand);
       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
 
-      for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-           j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
-        MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j;
+      for (MVT InnerVT : MVT::vector_valuetypes()) {
         setTruncStoreAction(VT, InnerVT, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
       }
-      setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
-      setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
-      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
     }
 
     // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
@@ -597,12 +605,171 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
 
       addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
     }
+
+    if (Subtarget.hasP8Altivec()) 
+      addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
+  }
+
+  if (Subtarget.hasQPX()) {
+    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
+    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
+    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
+    setOperationAction(ISD::FREM, MVT::v4f64, Expand);
+
+    setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
+    setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);
+
+    setOperationAction(ISD::LOAD  , MVT::v4f64, Custom);
+    setOperationAction(ISD::STORE , MVT::v4f64, Custom);
+
+    setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);
+
+    if (!Subtarget.useCRBits())
+      setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
+    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal);
+    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal);
+    setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand);
+
+    setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal);
+    setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
+
+    setOperationAction(ISD::FNEG , MVT::v4f64, Legal);
+    setOperationAction(ISD::FABS , MVT::v4f64, Legal);
+    setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
+    setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
+    setOperationAction(ISD::FPOWI , MVT::v4f64, Expand);
+    setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
+    setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
+    setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
+    setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand);
+    setOperationAction(ISD::FEXP , MVT::v4f64, Expand);
+    setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand);
+
+    setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);
+
+    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
+    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);
+
+    addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);
+
+    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
+    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FREM, MVT::v4f32, Expand);
+
+    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
+    setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);
+
+    setOperationAction(ISD::LOAD  , MVT::v4f32, Custom);
+    setOperationAction(ISD::STORE , MVT::v4f32, Custom);
+
+    if (!Subtarget.useCRBits())
+      setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
+    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal);
+    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal);
+    setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand);
+
+    setOperationAction(ISD::FNEG , MVT::v4f32, Legal);
+    setOperationAction(ISD::FABS , MVT::v4f32, Legal);
+    setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
+    setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
+    setOperationAction(ISD::FPOWI , MVT::v4f32, Expand);
+    setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand);
+    setOperationAction(ISD::FEXP , MVT::v4f32, Expand);
+    setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand);
+
+    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+
+    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
+    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);
+
+    addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);
+
+    setOperationAction(ISD::AND , MVT::v4i1, Legal);
+    setOperationAction(ISD::OR , MVT::v4i1, Legal);
+    setOperationAction(ISD::XOR , MVT::v4i1, Legal);
+
+    if (!Subtarget.useCRBits())
+      setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
+    setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);
+
+    setOperationAction(ISD::LOAD  , MVT::v4i1, Custom);
+    setOperationAction(ISD::STORE , MVT::v4i1, Custom);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
+
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
+
+    addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);
+
+    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
+    setOperationAction(ISD::FCEIL,  MVT::v4f64, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
+    setOperationAction(ISD::FROUND, MVT::v4f64, Legal);
+
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+    setOperationAction(ISD::FCEIL,  MVT::v4f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
+
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
+
+    // These need to set FE_INEXACT, and so cannot be vectorized here.
+    setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
+    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
+
+    if (TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
+      setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
+
+      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+    } else {
+      setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
+      setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);
+
+      setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
+      setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
+    }
   }
 
-  if (Subtarget.has64BitSupport()) {
+  if (Subtarget.has64BitSupport())
     setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
-    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
-  }
+
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);
 
   if (!isPPC64) {
     setOperationAction(ISD::ATOMIC_LOAD,  MVT::i64, Expand);
@@ -610,8 +777,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
   }
 
   setBooleanContents(ZeroOrOneBooleanContent);
-  // Altivec instructions set fields to all zeros or all ones.
-  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  if (Subtarget.hasAltivec()) {
+    // Altivec instructions set fields to all zeros or all ones.
+    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+  }
 
   if (!isPPC64) {
     // These libcalls are not available in 32-bit.
@@ -632,6 +802,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
 
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::SINT_TO_FP);
+  if (Subtarget.hasFPCVT())
+    setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::BR_CC);
@@ -639,6 +811,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
     setTargetDAGCombine(ISD::BRCOND);
   setTargetDAGCombine(ISD::BSWAP);
   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
 
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
@@ -672,13 +846,33 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
 
   // With 32 condition bits, we don't need to sink (and duplicate) compares
   // aggressively in CodeGenPrep.
-  if (Subtarget.useCRBits())
+  if (Subtarget.useCRBits()) {
     setHasMultipleConditionRegisters();
+    setJumpIsExpensive();
+  }
 
   setMinFunctionAlignment(2);
   if (Subtarget.isDarwin())
     setPrefFunctionAlignment(4);
 
+  switch (Subtarget.getDarwinDirective()) {
+  default: break;
+  case PPC::DIR_970:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+  case PPC::DIR_PWR4:
+  case PPC::DIR_PWR5:
+  case PPC::DIR_PWR5X:
+  case PPC::DIR_PWR6:
+  case PPC::DIR_PWR6X:
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8:
+    setPrefFunctionAlignment(4);
+    setPrefLoopAlignment(4);
+    break;
+  }
+
   setInsertFencesForAtomic(true);
 
   if (Subtarget.enableMachineScheduler())
@@ -686,10 +880,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
   else
     setSchedulingPreference(Sched::Hybrid);
 
-  computeRegisterProperties();
+  computeRegisterProperties(STI.getRegisterInfo());
 
-  // The Freescale cores does better with aggressive inlining of memcpy and
-  // friends. Gcc uses same threshold of 128 bytes (= 32 word stores).
+  // The Freescale cores do better with aggressive inlining of memcpy and
+  // friends. GCC uses same threshold of 128 bytes (= 32 word stores).
   if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
       Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
     MaxStoresPerMemset = 32;
@@ -698,8 +892,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
     MaxStoresPerMemcpyOptSize = 8;
     MaxStoresPerMemmove = 32;
     MaxStoresPerMemmoveOptSize = 8;
-
-    setPrefFunctionAlignment(4);
   }
 }
 
@@ -751,19 +943,23 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   default: return nullptr;
   case PPCISD::FSEL:            return "PPCISD::FSEL";
   case PPCISD::FCFID:           return "PPCISD::FCFID";
+  case PPCISD::FCFIDU:          return "PPCISD::FCFIDU";
+  case PPCISD::FCFIDS:          return "PPCISD::FCFIDS";
+  case PPCISD::FCFIDUS:         return "PPCISD::FCFIDUS";
   case PPCISD::FCTIDZ:          return "PPCISD::FCTIDZ";
   case PPCISD::FCTIWZ:          return "PPCISD::FCTIWZ";
+  case PPCISD::FCTIDUZ:         return "PPCISD::FCTIDUZ";
+  case PPCISD::FCTIWUZ:         return "PPCISD::FCTIWUZ";
   case PPCISD::FRE:             return "PPCISD::FRE";
   case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
   case PPCISD::STFIWX:          return "PPCISD::STFIWX";
   case PPCISD::VMADDFP:         return "PPCISD::VMADDFP";
   case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
   case PPCISD::VPERM:           return "PPCISD::VPERM";
+  case PPCISD::CMPB:            return "PPCISD::CMPB";
   case PPCISD::Hi:              return "PPCISD::Hi";
   case PPCISD::Lo:              return "PPCISD::Lo";
   case PPCISD::TOC_ENTRY:       return "PPCISD::TOC_ENTRY";
-  case PPCISD::LOAD:            return "PPCISD::LOAD";
-  case PPCISD::LOAD_TOC:        return "PPCISD::LOAD_TOC";
   case PPCISD::DYNALLOC:        return "PPCISD::DYNALLOC";
   case PPCISD::GlobalBaseReg:   return "PPCISD::GlobalBaseReg";
   case PPCISD::SRL:             return "PPCISD::SRL";
@@ -771,11 +967,11 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::SHL:             return "PPCISD::SHL";
   case PPCISD::CALL:            return "PPCISD::CALL";
   case PPCISD::CALL_NOP:        return "PPCISD::CALL_NOP";
-  case PPCISD::CALL_TLS:        return "PPCISD::CALL_TLS";
-  case PPCISD::CALL_NOP_TLS:    return "PPCISD::CALL_NOP_TLS";
   case PPCISD::MTCTR:           return "PPCISD::MTCTR";
   case PPCISD::BCTRL:           return "PPCISD::BCTRL";
+  case PPCISD::BCTRL_LOAD_TOC:  return "PPCISD::BCTRL_LOAD_TOC";
   case PPCISD::RET_FLAG:        return "PPCISD::RET_FLAG";
+  case PPCISD::READ_TIME_BASE:  return "PPCISD::READ_TIME_BASE";
   case PPCISD::EH_SJLJ_SETJMP:  return "PPCISD::EH_SJLJ_SETJMP";
   case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
   case PPCISD::MFOCRF:          return "PPCISD::MFOCRF";
@@ -783,6 +979,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::VCMPo:           return "PPCISD::VCMPo";
   case PPCISD::LBRX:            return "PPCISD::LBRX";
   case PPCISD::STBRX:           return "PPCISD::STBRX";
+  case PPCISD::LFIWAX:          return "PPCISD::LFIWAX";
+  case PPCISD::LFIWZX:          return "PPCISD::LFIWZX";
   case PPCISD::LARX:            return "PPCISD::LARX";
   case PPCISD::STCX:            return "PPCISD::STCX";
   case PPCISD::COND_BRANCH:     return "PPCISD::COND_BRANCH";
@@ -793,27 +991,38 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::TC_RETURN:       return "PPCISD::TC_RETURN";
   case PPCISD::CR6SET:          return "PPCISD::CR6SET";
   case PPCISD::CR6UNSET:        return "PPCISD::CR6UNSET";
-  case PPCISD::ADDIS_TOC_HA:    return "PPCISD::ADDIS_TOC_HA";
-  case PPCISD::LD_TOC_L:        return "PPCISD::LD_TOC_L";
-  case PPCISD::ADDI_TOC_L:      return "PPCISD::ADDI_TOC_L";
   case PPCISD::PPC32_GOT:       return "PPCISD::PPC32_GOT";
   case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
   case PPCISD::LD_GOT_TPREL_L:  return "PPCISD::LD_GOT_TPREL_L";
   case PPCISD::ADD_TLS:         return "PPCISD::ADD_TLS";
   case PPCISD::ADDIS_TLSGD_HA:  return "PPCISD::ADDIS_TLSGD_HA";
   case PPCISD::ADDI_TLSGD_L:    return "PPCISD::ADDI_TLSGD_L";
+  case PPCISD::GET_TLS_ADDR:    return "PPCISD::GET_TLS_ADDR";
+  case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
   case PPCISD::ADDIS_TLSLD_HA:  return "PPCISD::ADDIS_TLSLD_HA";
   case PPCISD::ADDI_TLSLD_L:    return "PPCISD::ADDI_TLSLD_L";
+  case PPCISD::GET_TLSLD_ADDR:  return "PPCISD::GET_TLSLD_ADDR";
+  case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
   case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
   case PPCISD::ADDI_DTPREL_L:   return "PPCISD::ADDI_DTPREL_L";
   case PPCISD::VADD_SPLAT:      return "PPCISD::VADD_SPLAT";
   case PPCISD::SC:              return "PPCISD::SC";
+  case PPCISD::QVFPERM:         return "PPCISD::QVFPERM";
+  case PPCISD::QVGPCI:          return "PPCISD::QVGPCI";
+  case PPCISD::QVALIGNI:        return "PPCISD::QVALIGNI";
+  case PPCISD::QVESPLATI:       return "PPCISD::QVESPLATI";
+  case PPCISD::QBFLT:           return "PPCISD::QBFLT";
+  case PPCISD::QVLFSb:          return "PPCISD::QVLFSb";
   }
 }
 
-EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT PPCTargetLowering::getSetCCResultType(LLVMContext &C, EVT VT) const {
   if (!VT.isVector())
     return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
+
+  if (Subtarget.hasQPX())
+    return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
+
   return VT.changeVectorElementTypeToInteger();
 }
 
@@ -853,7 +1062,7 @@ static bool isConstantOrUndef(int Op, int Val) {
 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
 bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                                SelectionDAG &DAG) {
-  bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian();
+  bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian();
   if (ShuffleKind == 0) {
     if (IsLE)
       return false;
@@ -884,7 +1093,7 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
 bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                                SelectionDAG &DAG) {
-  bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian();
+  bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian();
   if (ShuffleKind == 0) {
     if (IsLE)
       return false;
@@ -939,7 +1148,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
 /// the input operands are swapped (see PPCInstrAltivec.td).
 bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                              unsigned ShuffleKind, SelectionDAG &DAG) {
-  if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) {
+  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
     if (ShuffleKind == 1) // unary
       return isVMerge(N, UnitSize, 0, 0);
     else if (ShuffleKind == 2) // swapped
@@ -964,7 +1173,7 @@ bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
 /// the input operands are swapped (see PPCInstrAltivec.td).
 bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                              unsigned ShuffleKind, SelectionDAG &DAG) {
-  if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) {
+  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
     if (ShuffleKind == 1) // unary
       return isVMerge(N, UnitSize, 8, 8);
     else if (ShuffleKind == 2) // swapped
@@ -1008,8 +1217,7 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
   if (ShiftAmt < i) return -1;
 
   ShiftAmt -= i;
-  bool isLE = DAG.getTarget().getSubtargetImpl()->getDataLayout()->
-    isLittleEndian();
+  bool isLE = DAG.getTarget().getDataLayout()->isLittleEndian();
 
   if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
     // Check the rest of the elements to see if they are consecutive.
@@ -1082,7 +1290,7 @@ unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
                                 SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   assert(isSplatShuffleMask(SVOp, EltSize));
-  if (DAG.getSubtarget().getDataLayout()->isLittleEndian())
+  if (DAG.getTarget().getDataLayout()->isLittleEndian())
     return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
   else
     return SVOp->getMaskElt(0) / EltSize;
@@ -1200,6 +1408,36 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
   return SDValue();
 }
 
+/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift
+/// amount, otherwise return -1.
+int PPC::isQVALIGNIShuffleMask(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1)
+    return -1;
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+
+  // Find the first non-undef value in the shuffle mask.
+  unsigned i;
+  for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i)
+    /*search*/;
+
+  if (i == 4) return -1;  // all undef.
+
+  // Otherwise, check to see if the rest of the elements are consecutively
+  // numbered from this value.
+  unsigned ShiftAmt = SVOp->getMaskElt(i);
+  if (ShiftAmt < i) return -1;
+  ShiftAmt -= i;
+
+  // Check the rest of the elements to see if they are consecutive.
+  for (++i; i != 4; ++i)
+    if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+      return -1;
+
+  return ShiftAmt;
+}
+
 //===----------------------------------------------------------------------===//
 //  Addressing Mode Selection
 //===----------------------------------------------------------------------===//
@@ -1459,9 +1697,16 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
   } else
     return false;
 
-  // PowerPC doesn't have preinc load/store instructions for vectors.
-  if (VT.isVector())
-    return false;
+  // PowerPC doesn't have preinc load/store instructions for vectors (except
+  // for QPX, which does have preinc r+r forms).
+  if (VT.isVector()) {
+    if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) {
+      return false;
+    } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) {
+      AM = ISD::PRE_INC;
+      return true;
+    }
+  }
 
   if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
 
@@ -1518,8 +1763,9 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
 
 /// GetLabelAccessInfo - Return true if we should reference labels using a
 /// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags.
-static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags,
-                               unsigned &LoOpFlags,
+static bool GetLabelAccessInfo(const TargetMachine &TM,
+                               const PPCSubtarget &Subtarget,
+                               unsigned &HiOpFlags, unsigned &LoOpFlags,
                                const GlobalValue *GV = nullptr) {
   HiOpFlags = PPCII::MO_HA;
   LoOpFlags = PPCII::MO_LO;
@@ -1534,7 +1780,7 @@ static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags,
 
   // If this is a reference to a global value that requires a non-lazy-ptr, make
   // sure that instruction lowering adds it.
-  if (GV && TM.getSubtarget<PPCSubtarget>().hasLazyResolverStub(GV, TM)) {
+  if (GV && Subtarget.hasLazyResolverStub(GV)) {
     HiOpFlags |= PPCII::MO_NLP_FLAG;
     LoOpFlags |= PPCII::MO_NLP_FLAG;
 
@@ -1566,6 +1812,28 @@ static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
   return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
 }
 
+static void setUsesTOCBasePtr(MachineFunction &MF) {
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setUsesTOCBasePtr();
+}
+
+static void setUsesTOCBasePtr(SelectionDAG &DAG) {
+  setUsesTOCBasePtr(DAG.getMachineFunction());
+}
+
+static SDValue getTOCEntry(SelectionDAG &DAG, SDLoc dl, bool Is64Bit,
+                           SDValue GA) {
+  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
+  SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) :
+                DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
+
+  SDValue Ops[] = { GA, Reg };
+  return DAG.getMemIntrinsicNode(PPCISD::TOC_ENTRY, dl,
+                                 DAG.getVTList(VT, MVT::Other), Ops, VT,
+                                 MachinePointerInfo::getGOT(), 0, false, true,
+                                 false, 0);
+}
+
 SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
                                              SelectionDAG &DAG) const {
   EVT PtrVT = Op.getValueType();
@@ -1575,20 +1843,19 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
   // 64-bit SVR4 ABI code is always position-independent.
   // The actual address of the GlobalValue is stored in the TOC.
   if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    setUsesTOCBasePtr(DAG);
     SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
-    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA,
-                       DAG.getRegister(PPC::X2, MVT::i64));
+    return getTOCEntry(DAG, SDLoc(CP), true, GA);
   }
 
   unsigned MOHiFlag, MOLoFlag;
-  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+  bool isPIC =
+      GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag);
 
   if (isPIC && Subtarget.isSVR4ABI()) {
     SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
                                            PPCII::MO_PIC_FLAG);
-    SDLoc DL(CP);
-    return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA,
-                       DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT));
+    return getTOCEntry(DAG, SDLoc(CP), false, GA);
   }
 
   SDValue CPIHi =
@@ -1605,20 +1872,19 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   // 64-bit SVR4 ABI code is always position-independent.
   // The actual address of the GlobalValue is stored in the TOC.
   if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    setUsesTOCBasePtr(DAG);
     SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
-    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA,
-                       DAG.getRegister(PPC::X2, MVT::i64));
+    return getTOCEntry(DAG, SDLoc(JT), true, GA);
   }
 
   unsigned MOHiFlag, MOLoFlag;
-  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+  bool isPIC =
+      GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag);
 
   if (isPIC && Subtarget.isSVR4ABI()) {
     SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
                                         PPCII::MO_PIC_FLAG);
-    SDLoc DL(GA);
-    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), PtrVT, GA,
-                       DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT));
+    return getTOCEntry(DAG, SDLoc(GA), false, GA);
   }
 
   SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
@@ -1635,39 +1901,19 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
   // 64-bit SVR4 ABI code is always position-independent.
   // The actual BlockAddress is stored in the TOC.
   if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    setUsesTOCBasePtr(DAG);
     SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
-    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(BASDN), MVT::i64, GA,
-                       DAG.getRegister(PPC::X2, MVT::i64));
+    return getTOCEntry(DAG, SDLoc(BASDN), true, GA);
   }
 
   unsigned MOHiFlag, MOLoFlag;
-  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+  bool isPIC =
+      GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag);
   SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
   SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
   return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
 }
 
-// Generate a call to __tls_get_addr for the given GOT entry Op.
-std::pair<SDValue,SDValue>
-PPCTargetLowering::lowerTLSCall(SDValue Op, SDLoc dl,
-                                SelectionDAG &DAG) const {
-
-  Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
-  Entry.Node = Op;
-  Entry.Ty = IntPtrTy;
-  Args.push_back(Entry);
-
-  TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(CallingConv::C, IntPtrTy,
-               DAG.getTargetExternalSymbol("__tls_get_addr", getPointerTy()),
-               std::move(Args), 0);
-
-  return LowerCallTo(CLI);
-}
-
 SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
 
@@ -1702,6 +1948,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                 PPCII::MO_TLS);
     SDValue GOTPtr;
     if (is64bit) {
+      setUsesTOCBasePtr(DAG);
       SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
       GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
                            PtrVT, GOTReg, TGA);
@@ -1713,10 +1960,10 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
   }
 
   if (Model == TLSModel::GeneralDynamic) {
-    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
-                                             PPCII::MO_TLSGD);
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
     SDValue GOTPtr;
     if (is64bit) {
+      setUsesTOCBasePtr(DAG);
       SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
       GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
                                    GOTReg, TGA);
@@ -1726,17 +1973,15 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
       else
         GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
     }
-    SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT,
-                                   GOTPtr, TGA);
-    std::pair<SDValue, SDValue> CallResult = lowerTLSCall(GOTEntry, dl, DAG);
-    return CallResult.first;
+    return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT,
+                       GOTPtr, TGA, TGA);
   }
 
   if (Model == TLSModel::LocalDynamic) {
-    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
-                                             PPCII::MO_TLSLD);
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
     SDValue GOTPtr;
     if (is64bit) {
+      setUsesTOCBasePtr(DAG);
       SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
       GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
                            GOTReg, TGA);
@@ -1746,13 +1991,10 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
       else
         GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
     }
-    SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT,
-                                   GOTPtr, TGA);
-    std::pair<SDValue, SDValue> CallResult = lowerTLSCall(GOTEntry, dl, DAG);
-    SDValue TLSAddr = CallResult.first;
-    SDValue Chain = CallResult.second;
-    SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT,
-                                      Chain, TLSAddr, TGA);
+    SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl,
+                                  PtrVT, GOTPtr, TGA, TGA);
+    SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl,
+                                      PtrVT, TLSAddr, TGA);
     return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
   }
 
@@ -1769,20 +2011,20 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
   // 64-bit SVR4 ABI code is always position-independent.
   // The actual address of the GlobalValue is stored in the TOC.
   if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    setUsesTOCBasePtr(DAG);
     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
-    return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i64, GA,
-                       DAG.getRegister(PPC::X2, MVT::i64));
+    return getTOCEntry(DAG, DL, true, GA);
   }
 
   unsigned MOHiFlag, MOLoFlag;
-  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag, GV);
+  bool isPIC =
+      GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag, GV);
 
   if (isPIC && Subtarget.isSVR4ABI()) {
     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
                                             GSDN->getOffset(),
                                             PPCII::MO_PIC_FLAG);
-    return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA,
-                       DAG.getNode(PPCISD::GlobalBaseReg, DL, MVT::i32));
+    return getTOCEntry(DAG, DL, false, GA);
   }
 
   SDValue GAHi =
@@ -2151,7 +2393,7 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
   };
   const unsigned NumArgRegs = array_lengthof(ArgRegs);
 
-  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs);
 
   // Skip one register if the first unallocated register has an even register
   // number and there are still argument registers available which have not been
@@ -2179,7 +2421,7 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
 
   const unsigned NumArgRegs = array_lengthof(ArgRegs);
 
-  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs);
 
   // If there is only one Floating-point register left we need to put both f64
   // values of a split ppc_fp128 value on the stack.
@@ -2205,6 +2447,17 @@ static const MCPhysReg *GetFPR() {
   return FPR;
 }
 
+/// GetQFPR - Get the set of QPX registers that should be allocated for
+/// arguments.
+static const MCPhysReg *GetQFPR() {
+  static const MCPhysReg QFPR[] = {
+    PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
+    PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13
+  };
+
+  return QFPR;
+}
+
 /// CalculateStackSlotSize - Calculates the size reserved for this argument on
 /// the stack.
 static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
@@ -2233,6 +2486,10 @@ static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
       ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
       ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
     Align = 16;
+  // QPX vector types stored in double-precision are padded to a 32 byte
+  // boundary.
+  else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
+    Align = 32;
 
   // ByVal parameters are aligned as requested.
   if (Flags.isByVal()) {
@@ -2271,7 +2528,7 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
                                    unsigned ParamAreaSize,
                                    unsigned &ArgOffset,
                                    unsigned &AvailableFPRs,
-                                   unsigned &AvailableVRs) {
+                                   unsigned &AvailableVRs, bool HasQPX) {
   bool UseMemory = false;
 
   // Respect alignment of argument on the stack.
@@ -2295,7 +2552,11 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
   // However, if the argument is actually passed in an FPR or a VR,
   // we don't use memory after all.
   if (!Flags.isByVal()) {
-    if (ArgVT == MVT::f32 || ArgVT == MVT::f64)
+    if (ArgVT == MVT::f32 || ArgVT == MVT::f64 ||
+        // QPX registers overlap with the scalar FP registers.
+        (HasQPX && (ArgVT == MVT::v4f32 ||
+                    ArgVT == MVT::v4f64 ||
+                    ArgVT == MVT::v4i1)))
       if (AvailableFPRs > 0) {
         --AvailableFPRs;
         return false;
@@ -2314,10 +2575,9 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
 
 /// EnsureStackAlignment - Round stack frame size up from NumBytes to
 /// ensure minimum alignment required for target.
-static unsigned EnsureStackAlignment(const TargetMachine &Target,
+static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
                                      unsigned NumBytes) {
-  unsigned TargetAlign =
-      Target.getSubtargetImpl()->getFrameLowering()->getStackAlignment();
+  unsigned TargetAlign = Lowering->getStackAlignment();
   unsigned AlignMask = TargetAlign - 1;
   NumBytes = (NumBytes + AlignMask) & ~AlignMask;
   return NumBytes;
@@ -2398,7 +2658,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
                  *DAG.getContext());
 
   // Reserve space for the linkage area on the stack.
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false);
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
   CCInfo.AllocateStack(LinkageSize, PtrByteSize);
 
   CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
@@ -2430,13 +2690,21 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
         case MVT::v16i8:
         case MVT::v8i16:
         case MVT::v4i32:
-        case MVT::v4f32:
           RC = &PPC::VRRCRegClass;
           break;
+        case MVT::v4f32:
+          RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass;
+          break;
         case MVT::v2f64:
         case MVT::v2i64:
           RC = &PPC::VSHRCRegClass;
           break;
+        case MVT::v4f64:
+          RC = &PPC::QFRCRegClass;
+          break;
+        case MVT::v4i1:
+          RC = &PPC::QBRCRegClass;
+          break;
       }
 
       // Transform the arguments stored in physical registers into virtual ones.
@@ -2484,7 +2752,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
   // call optimized function's reserved stack space needs to be aligned so that
   // taking the difference between two stack areas will result in an aligned
   // stack.
-  MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
+  MinReservedArea =
+      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
   FuncInfo->setMinReservedArea(MinReservedArea);
 
   SmallVector<SDValue, 8> MemOps;
@@ -2506,10 +2775,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
     if (DisablePPCFloatInVariadic)
       NumFPArgRegs = 0;
 
-    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs,
-                                                          NumGPArgRegs));
-    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs,
-                                                          NumFPArgRegs));
+    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
+    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));
 
     // Make room for NumGPArgRegs and NumFPArgRegs.
     int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
@@ -2599,14 +2866,15 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
   MachineFrameInfo *MFI = MF.getFrameInfo();
   PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
 
+  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
+         "fastcc not supported on varargs functions");
+
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   // Potential tail calls could cause overwriting of argument stack slots.
   bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                        (CallConv == CallingConv::Fast));
   unsigned PtrByteSize = 8;
-
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
-                                                          isELFv2ABI);
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
 
   static const MCPhysReg GPR[] = {
     PPC::X3, PPC::X4, PPC::X5, PPC::X6,
@@ -2624,9 +2892,12 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
   };
 
+  static const MCPhysReg *QFPR = GetQFPR();
+
   const unsigned Num_GPR_Regs = array_lengthof(GPR);
   const unsigned Num_FPR_Regs = 13;
   const unsigned Num_VR_Regs  = array_lengthof(VR);
+  const unsigned Num_QFPR_Regs = Num_FPR_Regs;
 
   // Do a first pass over the arguments to determine whether the ABI
   // guarantees that our caller has allocated the parameter save area
@@ -2642,7 +2913,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
   for (unsigned i = 0, e = Ins.size(); i != e; ++i)
     if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
                                PtrByteSize, LinkageSize, ParamAreaSize,
-                               NumBytes, AvailableFPRs, AvailableVRs))
+                               NumBytes, AvailableFPRs, AvailableVRs,
+                               Subtarget.hasQPX()))
       HasParameterArea = true;
 
   // Add DAG nodes to load the arguments or copy them out of registers.  On
@@ -2650,7 +2922,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
   // although the first ones are often in registers.
 
   unsigned ArgOffset = LinkageSize;
-  unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+  unsigned &QFPR_idx = FPR_idx;
   SmallVector<SDValue, 8> MemOps;
   Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
   unsigned CurArgIdx = 0;
@@ -2662,22 +2935,37 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     unsigned ObjSize = ObjectVT.getStoreSize();
     unsigned ArgSize = ObjSize;
     ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
-    std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[ArgNo].OrigArgIndex;
+    if (Ins[ArgNo].isOrigArg()) {
+      std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
+    }
+    // We re-align the argument offset for each argument, except when using the
+    // fast calling convention, when we need to make sure we do that only when
+    // we'll actually use a stack slot.
+    unsigned CurArgOffset, Align;
+    auto ComputeArgOffset = [&]() {
+      /* Respect alignment of argument on the stack.  */
+      Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
+      ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+      CurArgOffset = ArgOffset;
+    };
 
-    /* Respect alignment of argument on the stack.  */
-    unsigned Align =
-      CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
-    ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
-    unsigned CurArgOffset = ArgOffset;
+    if (CallConv != CallingConv::Fast) {
+      ComputeArgOffset();
 
-    /* Compute GPR index associated with argument offset.  */
-    GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
-    GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
+      /* Compute GPR index associated with argument offset.  */
+      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+      GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
+    }
 
     // FIXME the codegen can be much improved in some cases.
     // We do not have to keep everything in memory.
     if (Flags.isByVal()) {
+      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
+
+      if (CallConv == CallingConv::Fast)
+        ComputeArgOffset();
+
       // ObjSize is the true size, ArgSize rounded up to multiple of registers.
       ObjSize = Flags.getByValSize();
       ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
@@ -2721,7 +3009,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
         InVals.push_back(Arg);
 
         if (GPR_idx != Num_GPR_Regs) {
-          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
           SDValue Store;
 
@@ -2783,7 +3071,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
       // passed directly.  Clang may use those instead of "byval" aggregate
       // types to avoid forcing arguments to memory unnecessarily.
       if (GPR_idx != Num_GPR_Regs) {
-        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
 
         if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
@@ -2791,10 +3079,14 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
           // value to MVT::i64 and then truncate to the correct register size.
           ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
       } else {
+        if (CallConv == CallingConv::Fast)
+          ComputeArgOffset();
+
         needsLoad = true;
         ArgSize = PtrByteSize;
       }
-      ArgOffset += 8;
+      if (CallConv != CallingConv::Fast || needsLoad)
+        ArgOffset += 8;
       break;
 
     case MVT::f32:
@@ -2808,17 +3100,20 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
         if (ObjectVT == MVT::f32)
           VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
         else
-          VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ?
-                                            &PPC::VSFRCRegClass :
-                                            &PPC::F8RCRegClass);
+          VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()
+                                                ? &PPC::VSFRCRegClass
+                                                : &PPC::F8RCRegClass);
 
         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
         ++FPR_idx;
-      } else if (GPR_idx != Num_GPR_Regs) {
+      } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
+        // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
+        // once we support fp <-> gpr moves.
+
         // This can only ever happen in the presence of f32 array types,
         // since otherwise we never run out of FPRs before running out
         // of GPRs.
-        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
 
         if (ObjectVT == MVT::f32) {
@@ -2830,16 +3125,21 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
 
         ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
       } else {
+        if (CallConv == CallingConv::Fast)
+          ComputeArgOffset();
+
         needsLoad = true;
       }
 
       // When passing an array of floats, the array occupies consecutive
       // space in the argument area; only round up to the next doubleword
       // at the end of the array.  Otherwise, each float takes 8 bytes.
-      ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
-      ArgOffset += ArgSize;
-      if (Flags.isInConsecutiveRegsLast())
-        ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      if (CallConv != CallingConv::Fast || needsLoad) {
+        ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
+        ArgOffset += ArgSize;
+        if (Flags.isInConsecutiveRegsLast())
+          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      }
       break;
     case MVT::v4f32:
     case MVT::v4i32:
@@ -2847,6 +3147,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     case MVT::v16i8:
     case MVT::v2f64:
     case MVT::v2i64:
+      if (!Subtarget.hasQPX()) {
       // These can be scalar arguments or elements of a vector array type
       // passed directly.  The latter are used to implement ELFv2 homogenous
       // vector aggregates.
@@ -2857,9 +3158,43 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
         ++VR_idx;
       } else {
+        if (CallConv == CallingConv::Fast)
+          ComputeArgOffset();
+
         needsLoad = true;
       }
-      ArgOffset += 16;
+      if (CallConv != CallingConv::Fast || needsLoad)
+        ArgOffset += 16;
+      break;
+      } // not QPX
+
+      assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
+             "Invalid QPX parameter type");
+      /* fall through */
+
+    case MVT::v4f64:
+    case MVT::v4i1:
+      // QPX vectors are treated like their scalar floating-point subregisters
+      // (except that they're larger).
+      unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32;
+      if (QFPR_idx != Num_QFPR_Regs) {
+        const TargetRegisterClass *RC;
+        switch (ObjectVT.getSimpleVT().SimpleTy) {
+        case MVT::v4f64: RC = &PPC::QFRCRegClass; break;
+        case MVT::v4f32: RC = &PPC::QSRCRegClass; break;
+        default:         RC = &PPC::QBRCRegClass; break;
+        }
+
+        unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++QFPR_idx;
+      } else {
+        if (CallConv == CallingConv::Fast)
+          ComputeArgOffset();
+        needsLoad = true;
+      }
+      if (CallConv != CallingConv::Fast || needsLoad)
+        ArgOffset += Sz;
       break;
     }
 
@@ -2888,7 +3223,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
   // call optimized functions' reserved stack space needs to be aligned so that
   // taking the difference between two stack areas will result in an aligned
   // stack.
-  MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
+  MinReservedArea =
+      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
   FuncInfo->setMinReservedArea(MinReservedArea);
 
   // If the function takes variable number of arguments, make a frame index for
@@ -2942,9 +3278,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
   bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                        (CallConv == CallingConv::Fast));
   unsigned PtrByteSize = isPPC64 ? 8 : 4;
-
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true,
-                                                          false);
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
   unsigned ArgOffset = LinkageSize;
   // Area that is at least reserved in caller of this function.
   unsigned MinReservedArea = ArgOffset;
@@ -3038,9 +3372,10 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
     unsigned ObjSize = ObjectVT.getSizeInBits()/8;
     unsigned ArgSize = ObjSize;
     ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
-    std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[ArgNo].OrigArgIndex;
-
+    if (Ins[ArgNo].isOrigArg()) {
+      std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
+    }
     unsigned CurArgOffset = ArgOffset;
 
     // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
@@ -3061,6 +3396,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
     // FIXME the codegen can be much improved in some cases.
     // We do not have to keep everything in memory.
     if (Flags.isByVal()) {
+      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
+
       // ObjSize is the true size, ArgSize rounded up to multiple of registers.
       ObjSize = Flags.getByValSize();
       ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
@@ -3249,7 +3586,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
   // call optimized functions' reserved stack space needs to be aligned so that
   // taking the difference between two stack areas will result in an aligned
   // stack.
-  MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
+  MinReservedArea =
+      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
   FuncInfo->setMinReservedArea(MinReservedArea);
 
   // If the function takes variable number of arguments, make a frame index for
@@ -3404,8 +3742,9 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
   if (SPDiff) {
     // Calculate the new stack slot for the return address.
     int SlotSize = isPPC64 ? 8 : 4;
-    int NewRetAddrLoc = SPDiff + PPCFrameLowering::getReturnSaveOffset(isPPC64,
-                                                                   isDarwinABI);
+    const PPCFrameLowering *FL =
+        MF.getSubtarget<PPCSubtarget>().getFrameLowering();
+    int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
     int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize,
                                                           NewRetAddrLoc, true);
     EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
@@ -3417,8 +3756,7 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
     // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
     // slot as the FP is never overwritten.
     if (isDarwinABI) {
-      int NewFPLoc =
-        SPDiff + PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+      int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset();
       int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc,
                                                           true);
       SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
@@ -3548,12 +3886,27 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
   InFlag = Chain.getValue(1);
 }
 
+// Is this global address that of a function that can be called by name? (as
+// opposed to something that must hold a descriptor for an indirect call).
+static bool isFunctionGlobalAddress(SDValue Callee) {
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    if (Callee.getOpcode() == ISD::GlobalTLSAddress ||
+        Callee.getOpcode() == ISD::TargetGlobalTLSAddress)
+      return false;
+
+    return G->getGlobal()->getType()->getElementType()->isFunctionTy();
+  }
+
+  return false;
+}
+
 static
 unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
-                     SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall,
+                     SDValue &Chain, SDValue CallSeqStart, SDLoc dl, int SPDiff,
+                     bool isTailCall, bool IsPatchPoint,
                      SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass,
                      SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
-                     const PPCSubtarget &Subtarget) {
+                     ImmutableCallSite *CS, const PPCSubtarget &Subtarget) {
 
   bool isPPC64 = Subtarget.isPPC64();
   bool isSVR4ABI = Subtarget.isSVR4ABI();
@@ -3573,7 +3926,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
       needIndirectCall = false;
     }
 
-  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+  if (isFunctionGlobalAddress(Callee)) {
+    GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
+    // A call to a TLS address is actually an indirect call to a
+    // thread-specific pointer.
     unsigned OpFlags = 0;
     if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
          (Subtarget.getTargetTriple().isMacOSX() &&
@@ -3604,7 +3960,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
          (Subtarget.getTargetTriple().isMacOSX() &&
           Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) ||
         (Subtarget.isTargetELF() && !isPPC64 &&
-         DAG.getTarget().getRelocationModel() == Reloc::PIC_)	) {
+         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
       // PC-relative references to external symbols should go through $stub,
       // unless we're building with the leopard linker or later, which
       // automatically synthesizes these stubs.
@@ -3616,6 +3972,16 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
     needIndirectCall = false;
   }
 
+  if (IsPatchPoint) {
+    // We'll form an invalid direct call when lowering a patchpoint; the full
+    // sequence for an indirect call is complicated, and many of the
+    // instructions introduced might have side effects (and, thus, can't be
+    // removed later). The call itself will be removed as soon as the
+    // argument/return lowering is complete, so the fact that it has the wrong
+    // kind of operands should not really matter.
+    needIndirectCall = false;
+  }
+
   if (needIndirectCall) {
     // Otherwise, this is an indirect call.  We have to use a MTCTR/BCTRL pair
     // to do the call, we can't use PPCISD::CALL.
@@ -3641,50 +4007,51 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
       //   6. On return of the callee, the TOC of the caller needs to be
       //      restored (this is done in FinishCall()).
       //
-      // All those operations are flagged together to ensure that no other
+      // The loads are scheduled at the beginning of the call sequence, and the
+      // register copies are flagged together to ensure that no other
       // operations can be scheduled in between. E.g. without flagging the
-      // operations together, a TOC access in the caller could be scheduled
-      // between the load of the callee TOC and the branch to the callee, which
+      // copies together, a TOC access in the caller could be scheduled between
+      // the assignment of the callee TOC and the branch to the callee, which
       // results in the TOC access going through the TOC of the callee instead
       // of going through the TOC of the caller, which leads to incorrect code.
 
       // Load the address of the function entry point from the function
       // descriptor.
-      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other, MVT::Glue);
-      SDValue LoadFuncPtr = DAG.getNode(PPCISD::LOAD, dl, VTs,
-                              makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2));
-      Chain = LoadFuncPtr.getValue(1);
-      InFlag = LoadFuncPtr.getValue(2);
+      SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1);
+      if (LDChain.getValueType() == MVT::Glue)
+        LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2);
+
+      bool LoadsInv = Subtarget.hasInvariantFunctionDescriptors();
+
+      MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr);
+      SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI,
+                                        false, false, LoadsInv, 8);
 
       // Load environment pointer into r11.
-      // Offset of the environment pointer within the function descriptor.
       SDValue PtrOff = DAG.getIntPtrConstant(16);
-
       SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff);
-      SDValue LoadEnvPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, Chain, AddPtr,
-                                       InFlag);
-      Chain = LoadEnvPtr.getValue(1);
-      InFlag = LoadEnvPtr.getValue(2);
+      SDValue LoadEnvPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddPtr,
+                                       MPI.getWithOffset(16), false, false,
+                                       LoadsInv, 8);
+
+      SDValue TOCOff = DAG.getIntPtrConstant(8);
+      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff);
+      SDValue TOCPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddTOC,
+                                   MPI.getWithOffset(8), false, false,
+                                   LoadsInv, 8);
+
+      setUsesTOCBasePtr(DAG);
+      SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr,
+                                        InFlag);
+      Chain = TOCVal.getValue(0);
+      InFlag = TOCVal.getValue(1);
 
       SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr,
                                         InFlag);
+
       Chain = EnvVal.getValue(0);
       InFlag = EnvVal.getValue(1);
 
-      // Load TOC of the callee into r2. We are using a target-specific load
-      // with r2 hard coded, because the result of a target-independent load
-      // would never go directly into r2, since r2 is a reserved register (which
-      // prevents the register allocator from allocating it), resulting in an
-      // additional register being allocated and an unnecessary move instruction
-      // being generated.
-      VTs = DAG.getVTList(MVT::Other, MVT::Glue);
-      SDValue TOCOff = DAG.getIntPtrConstant(8);
-      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff);
-      SDValue LoadTOCPtr = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain,
-                                       AddTOC, InFlag);
-      Chain = LoadTOCPtr.getValue(0);
-      InFlag = LoadTOCPtr.getValue(1);
-
       MTCTROps[0] = Chain;
       MTCTROps[1] = LoadFuncPtr;
       MTCTROps[2] = InFlag;
@@ -3712,23 +4079,6 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
   if (Callee.getNode()) {
     Ops.push_back(Chain);
     Ops.push_back(Callee);
-
-    // If this is a call to __tls_get_addr, find the symbol whose address
-    // is to be taken and add it to the list.  This will be used to 
-    // generate __tls_get_addr(<sym>@tlsgd) or __tls_get_addr(<sym>@tlsld).
-    // We find the symbol by walking the chain to the CopyFromReg, walking
-    // back from the CopyFromReg to the ADDI_TLSGD_L or ADDI_TLSLD_L, and
-    // pulling the symbol from that node.
-    if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
-      if (!strcmp(S->getSymbol(), "__tls_get_addr")) {
-        assert(!needIndirectCall && "Indirect call to __tls_get_addr???");
-        SDNode *AddI = Chain.getNode()->getOperand(2).getNode();
-        SDValue TGTAddr = AddI->getOperand(1);
-        assert(TGTAddr.getNode()->getOpcode() == ISD::TargetGlobalTLSAddress &&
-               "Didn't find target global TLS address where we expected one");
-        Ops.push_back(TGTAddr);
-        CallOpc = PPCISD::CALL_TLS;
-      }
   }
   // If this is a tail call add stack pointer delta.
   if (isTailCall)
@@ -3740,9 +4090,12 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                   RegsToPass[i].second.getValueType()));
 
-  // Direct calls in the ELFv2 ABI need the TOC register live into the call.
-  if (Callee.getNode() && isELFv2ABI)
+  // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live
+  // into the call.
+  if (isSVR4ABI && isPPC64 && !IsPatchPoint) {
+    setUsesTOCBasePtr(DAG);
     Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));
+  }
 
   return CallOpc;
 }
@@ -3804,22 +4157,22 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
 SDValue
 PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
-                              bool isTailCall, bool isVarArg,
+                              bool isTailCall, bool isVarArg, bool IsPatchPoint,
                               SelectionDAG &DAG,
                               SmallVector<std::pair<unsigned, SDValue>, 8>
                                 &RegsToPass,
                               SDValue InFlag, SDValue Chain,
-                              SDValue &Callee,
+                              SDValue CallSeqStart, SDValue &Callee,
                               int SPDiff, unsigned NumBytes,
                               const SmallVectorImpl<ISD::InputArg> &Ins,
-                              SmallVectorImpl<SDValue> &InVals) const {
+                              SmallVectorImpl<SDValue> &InVals,
+                              ImmutableCallSite *CS) const {
 
-  bool isELFv2ABI = Subtarget.isELFv2ABI();
   std::vector<EVT> NodeTys;
   SmallVector<SDValue, 8> Ops;
-  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff,
-                                 isTailCall, RegsToPass, Ops, NodeTys,
-                                 Subtarget);
+  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
+                                 SPDiff, isTailCall, IsPatchPoint, RegsToPass,
+                                 Ops, NodeTys, CS, Subtarget);
 
   // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
   if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
@@ -3833,8 +4186,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
      getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;
 
   // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3863,8 +4215,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
   // stack frame. If caller and callee belong to the same module (and have the
   // same TOC), the NOP will remain unchanged.
 
-  bool needsTOCRestore = false;
-  if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64()) {
+  if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() &&
+      !IsPatchPoint) {
     if (CallOpc == PPCISD::BCTRL) {
       // This is a call through a function pointer.
       // Restore the caller TOC from the save area into R2.
@@ -3875,31 +4227,27 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
       // since r2 is a reserved register (which prevents the register allocator
       // from allocating it), resulting in an additional register being
       // allocated and an unnecessary move instruction being generated.
-      needsTOCRestore = true;
+      CallOpc = PPCISD::BCTRL_LOAD_TOC;
+
+      EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+      SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
+      unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
+      SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset);
+      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
+
+      // The address needs to go after the chain input but before the flag (or
+      // any other variadic arguments).
+      Ops.insert(std::next(Ops.begin()), AddTOC);
     } else if ((CallOpc == PPCISD::CALL) &&
                (!isLocalCall(Callee) ||
-                DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+                DAG.getTarget().getRelocationModel() == Reloc::PIC_))
       // Otherwise insert NOP for non-local calls.
       CallOpc = PPCISD::CALL_NOP;
-    } else if (CallOpc == PPCISD::CALL_TLS)
-      // For 64-bit SVR4, TLS calls are always non-local.
-      CallOpc = PPCISD::CALL_NOP_TLS;
   }
 
   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
-  if (needsTOCRestore) {
-    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
-    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
-    SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
-    unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
-    SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset);
-    SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
-    Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                              DAG.getIntPtrConstant(BytesCalleePops, true),
                              InFlag, dl);
@@ -3923,40 +4271,43 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   bool &isTailCall                      = CLI.IsTailCall;
   CallingConv::ID CallConv              = CLI.CallConv;
   bool isVarArg                         = CLI.IsVarArg;
+  bool IsPatchPoint                     = CLI.IsPatchPoint;
+  ImmutableCallSite *CS                 = CLI.CS;
 
   if (isTailCall)
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
                                                    Ins, DAG);
 
-  if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
+  if (!isTailCall && CS && CS->isMustTailCall())
     report_fatal_error("failed to perform tail call elimination on a call "
                        "site marked musttail");
 
   if (Subtarget.isSVR4ABI()) {
     if (Subtarget.isPPC64())
       return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
-                              isTailCall, Outs, OutVals, Ins,
-                              dl, DAG, InVals);
+                              isTailCall, IsPatchPoint, Outs, OutVals, Ins,
+                              dl, DAG, InVals, CS);
     else
       return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
-                              isTailCall, Outs, OutVals, Ins,
-                              dl, DAG, InVals);
+                              isTailCall, IsPatchPoint, Outs, OutVals, Ins,
+                              dl, DAG, InVals, CS);
   }
 
   return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
-                          isTailCall, Outs, OutVals, Ins,
-                          dl, DAG, InVals);
+                          isTailCall, IsPatchPoint, Outs, OutVals, Ins,
+                          dl, DAG, InVals, CS);
 }
 
 SDValue
 PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
                                     CallingConv::ID CallConv, bool isVarArg,
-                                    bool isTailCall,
+                                    bool isTailCall, bool IsPatchPoint,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
                                     SDLoc dl, SelectionDAG &DAG,
-                                    SmallVectorImpl<SDValue> &InVals) const {
+                                    SmallVectorImpl<SDValue> &InVals,
+                                    ImmutableCallSite *CS) const {
   // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
   // of the 32-bit SVR4 ABI stack frame layout.
 
@@ -3986,7 +4337,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
                  *DAG.getContext());
 
   // Reserve space for the linkage area on the stack.
-  CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false),
+  CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
                        PtrByteSize);
 
   if (isVarArg) {
@@ -4161,9 +4512,9 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
     PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
                     false, TailCallArguments);
 
-  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
-                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
-                    Ins, InVals);
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
+                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
+                    NumBytes, Ins, InVals, CS);
 }
 
 // Copy an argument into memory, being careful to do this outside the
@@ -4189,12 +4540,13 @@ PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
 SDValue
 PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                                     CallingConv::ID CallConv, bool isVarArg,
-                                    bool isTailCall,
+                                    bool isTailCall, bool IsPatchPoint,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
                                     SDLoc dl, SelectionDAG &DAG,
-                                    SmallVectorImpl<SDValue> &InVals) const {
+                                    SmallVectorImpl<SDValue> &InVals,
+                                    ImmutableCallSite *CS) const {
 
   bool isELFv2ABI = Subtarget.isELFv2ABI();
   bool isLittleEndian = Subtarget.isLittleEndian();
@@ -4214,13 +4566,43 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       CallConv == CallingConv::Fast)
     MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
 
+  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
+         "fastcc not supported on varargs functions");
+
   // Count how many bytes are to be pushed on the stack, including the linkage
   // area, and parameter passing area.  On ELFv1, the linkage area is 48 bytes
   // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
   // area is 32 bytes reserved space for [SP][CR][LR][TOC].
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
-                                                          isELFv2ABI);
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
   unsigned NumBytes = LinkageSize;
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+  unsigned &QFPR_idx = FPR_idx;
+
+  static const MCPhysReg GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg *FPR = GetFPR();
+
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  static const MCPhysReg VSRH[] = {
+    PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
+    PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
+  };
+
+  static const MCPhysReg *QFPR = GetQFPR();
+
+  const unsigned NumGPRs = array_lengthof(GPR);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs  = array_lengthof(VR);
+  const unsigned NumQFPRs = NumFPRs;
+
+  // When using the fast calling convention, we don't provide backing for
+  // arguments that will be in registers.
+  unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
 
   // Add up all the space actually used.
   for (unsigned i = 0; i != NumOps; ++i) {
@@ -4228,6 +4610,47 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
     EVT ArgVT = Outs[i].VT;
     EVT OrigVT = Outs[i].ArgVT;
 
+    if (CallConv == CallingConv::Fast) {
+      if (Flags.isByVal())
+        NumGPRsUsed += (Flags.getByValSize()+7)/8;
+      else
+        switch (ArgVT.getSimpleVT().SimpleTy) {
+        default: llvm_unreachable("Unexpected ValueType for argument!");
+        case MVT::i1:
+        case MVT::i32:
+        case MVT::i64:
+          if (++NumGPRsUsed <= NumGPRs)
+            continue;
+          break;
+        case MVT::v4i32:
+        case MVT::v8i16:
+        case MVT::v16i8:
+        case MVT::v2f64:
+        case MVT::v2i64:
+          if (++NumVRsUsed <= NumVRs)
+            continue;
+          break;
+        case MVT::v4f32:
+	  // When using QPX, this is handled like a FP register, otherwise, it
+	  // is an Altivec register.
+          if (Subtarget.hasQPX()) {
+            if (++NumFPRsUsed <= NumFPRs)
+              continue;
+          } else {
+            if (++NumVRsUsed <= NumVRs)
+              continue;
+          }
+          break;
+        case MVT::f32:
+        case MVT::f64:
+        case MVT::v4f64: // QPX
+        case MVT::v4i1:  // QPX
+          if (++NumFPRsUsed <= NumFPRs)
+            continue;
+          break;
+        }
+    }
+
     /* Respect alignment of argument on the stack.  */
     unsigned Align =
       CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
@@ -4251,7 +4674,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
   // Tail call needs the stack to be aligned.
   if (getTargetMachine().Options.GuaranteedTailCallOpt &&
       CallConv == CallingConv::Fast)
-    NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);
+    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
 
   // Calculate by how many bytes the stack has to be adjusted in case of tail
   // call optimization.
@@ -4284,26 +4707,6 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
   // must be stored to our stack, and loaded into integer regs as well, if
   // any integer regs are available for argument passing.
   unsigned ArgOffset = LinkageSize;
-  unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
-
-  static const MCPhysReg GPR[] = {
-    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
-    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
-  };
-  static const MCPhysReg *FPR = GetFPR();
-
-  static const MCPhysReg VR[] = {
-    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
-    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
-  };
-  static const MCPhysReg VSRH[] = {
-    PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
-    PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
-  };
-
-  const unsigned NumGPRs = array_lengthof(GPR);
-  const unsigned NumFPRs = 13;
-  const unsigned NumVRs  = array_lengthof(VR);
 
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
@@ -4315,22 +4718,31 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
     EVT ArgVT = Outs[i].VT;
     EVT OrigVT = Outs[i].ArgVT;
 
-    /* Respect alignment of argument on the stack.  */
-    unsigned Align =
-      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
-    ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
-
-    /* Compute GPR index associated with argument offset.  */
-    GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
-    GPR_idx = std::min(GPR_idx, NumGPRs);
-
     // PtrOff will be used to store the current argument to the stack if a
     // register cannot be found for it.
     SDValue PtrOff;
 
-    PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+    // We re-align the argument offset for each argument, except when using the
+    // fast calling convention, when we need to make sure we do that only when
+    // we'll actually use a stack slot.
+    auto ComputePtrOff = [&]() {
+      /* Respect alignment of argument on the stack.  */
+      unsigned Align =
+        CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+      ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
 
-    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+      PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+    };
+
+    if (CallConv != CallingConv::Fast) {
+      ComputePtrOff();
+
+      /* Compute GPR index associated with argument offset.  */
+      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+      GPR_idx = std::min(GPR_idx, NumGPRs);
+    }
 
     // Promote integers to 64-bit values.
     if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
@@ -4355,6 +4767,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       if (Size == 0)
         continue;
 
+      if (CallConv == CallingConv::Fast)
+        ComputePtrOff();
+
       // All aggregates smaller than 8 bytes must be passed right-justified.
       if (Size==1 || Size==2 || Size==4) {
         EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
@@ -4363,7 +4778,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                                         MachinePointerInfo(), VT,
                                         false, false, false, 0);
           MemOpChains.push_back(Load.getValue(1));
-          RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
 
           ArgOffset += PtrByteSize;
           continue;
@@ -4425,7 +4840,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                                    MachinePointerInfo(),
                                    false, false, false, 0);
         MemOpChains.push_back(Load.getValue(1));
-        RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
 
         // Done with this argument.
         ArgOffset += PtrByteSize;
@@ -4461,13 +4876,19 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       // passed directly.  Clang may use those instead of "byval" aggregate
       // types to avoid forcing arguments to memory unnecessarily.
       if (GPR_idx != NumGPRs) {
-        RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg));
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
       } else {
+        if (CallConv == CallingConv::Fast)
+          ComputePtrOff();
+
         LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                          true, isTailCall, false, MemOpChains,
                          TailCallArguments, dl);
+        if (CallConv == CallingConv::Fast)
+          ArgOffset += PtrByteSize;
       }
-      ArgOffset += PtrByteSize;
+      if (CallConv != CallingConv::Fast)
+        ArgOffset += PtrByteSize;
       break;
     case MVT::f32:
     case MVT::f64: {
@@ -4481,6 +4902,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       // then the parameter save area.  For now, put all arguments to vararg
       // routines always in both locations (FPR *and* GPR or stack slot).
       bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+      bool NeededLoad = false;
 
       // First load the argument into the next available FPR.
       if (FPR_idx != NumFPRs)
@@ -4489,7 +4911,10 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       // Next, load the argument into GPR or stack slot if needed.
       if (!NeedGPROrStack)
         ;
-      else if (GPR_idx != NumGPRs) {
+      else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) {
+        // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
+        // once we support fp <-> gpr moves.
+
         // In the non-vararg case, this can only ever happen in the
         // presence of f32 array types, since otherwise we never run
         // out of FPRs before running out of GPRs.
@@ -4528,8 +4953,11 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
           ArgVal = SDValue();
 
         if (ArgVal.getNode())
-          RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
       } else {
+        if (CallConv == CallingConv::Fast)
+          ComputePtrOff();
+
         // Single-precision floating-point values are mapped to the
         // second (rightmost) word of the stack doubleword.
         if (Arg.getValueType() == MVT::f32 &&
@@ -4541,14 +4969,18 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
         LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                          true, isTailCall, false, MemOpChains,
                          TailCallArguments, dl);
+
+        NeededLoad = true;
       }
       // When passing an array of floats, the array occupies consecutive
       // space in the argument area; only round up to the next doubleword
       // at the end of the array.  Otherwise, each float takes 8 bytes.
-      ArgOffset += (Arg.getValueType() == MVT::f32 &&
-                    Flags.isInConsecutiveRegs()) ? 4 : 8;
-      if (Flags.isInConsecutiveRegsLast())
-        ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      if (CallConv != CallingConv::Fast || NeededLoad) {
+        ArgOffset += (Arg.getValueType() == MVT::f32 &&
+                      Flags.isInConsecutiveRegs()) ? 4 : 8;
+        if (Flags.isInConsecutiveRegsLast())
+          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      }
       break;
     }
     case MVT::v4f32:
@@ -4557,6 +4989,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
     case MVT::v16i8:
     case MVT::v2f64:
     case MVT::v2i64:
+      if (!Subtarget.hasQPX()) {
       // These can be scalar arguments or elements of a vector array type
       // passed directly.  The latter are used to implement ELFv2 homogenous
       // vector aggregates.
@@ -4607,12 +5040,73 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
 
         RegsToPass.push_back(std::make_pair(VReg, Arg));
       } else {
+        if (CallConv == CallingConv::Fast)
+          ComputePtrOff();
+
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, true, MemOpChains,
+                         TailCallArguments, dl);
+        if (CallConv == CallingConv::Fast)
+          ArgOffset += 16;
+      }
+
+      if (CallConv != CallingConv::Fast)
+        ArgOffset += 16;
+      break;
+      } // not QPX
+
+      assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
+             "Invalid QPX parameter type");
+
+      /* fall through */
+    case MVT::v4f64:
+    case MVT::v4i1: {
+      bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
+      if (isVarArg) {
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
+                                     MachinePointerInfo(), false, false, 0);
+        MemOpChains.push_back(Store);
+        if (QFPR_idx != NumQFPRs) {
+          SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl,
+                                     Store, PtrOff, MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load));
+        }
+        ArgOffset += (IsF32 ? 16 : 32);
+        for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) {
+          if (GPR_idx == NumGPRs)
+            break;
+          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+                                  DAG.getConstant(i, PtrVT));
+          SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        }
+        break;
+      }
+
+      // Non-varargs QPX params go into registers or on the stack.
+      if (QFPR_idx != NumQFPRs) {
+        RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
+      } else {
+        if (CallConv == CallingConv::Fast)
+          ComputePtrOff();
+
         LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                          true, isTailCall, true, MemOpChains,
                          TailCallArguments, dl);
+        if (CallConv == CallingConv::Fast)
+          ArgOffset += (IsF32 ? 16 : 32);
       }
-      ArgOffset += 16;
+
+      if (CallConv != CallingConv::Fast)
+        ArgOffset += (IsF32 ? 16 : 32);
       break;
+      }
     }
   }
 
@@ -4625,21 +5119,23 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
   // Check if this is an indirect call (MTCTR/BCTRL).
   // See PrepareCall() for more information about calls through function
   // pointers in the 64-bit SVR4 ABI.
-  if (!isTailCall &&
-      !dyn_cast<GlobalAddressSDNode>(Callee) &&
-      !dyn_cast<ExternalSymbolSDNode>(Callee)) {
+  if (!isTailCall && !IsPatchPoint &&
+      !isFunctionGlobalAddress(Callee) &&
+      !isa<ExternalSymbolSDNode>(Callee)) {
     // Load r2 into a virtual register and store it to the TOC save area.
+    setUsesTOCBasePtr(DAG);
     SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
     // TOC save area offset.
-    unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
+    unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
     SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset);
     SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
-    Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
+    Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr,
+                         MachinePointerInfo::getStack(TOCSaveOffset),
                          false, false, 0);
     // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
     // This does not mean the MTCTR instruction must use R12; it's easier
     // to model this as an extra parameter, so do that.
-    if (isELFv2ABI)
+    if (isELFv2ABI && !IsPatchPoint)
       RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
   }
 
@@ -4656,20 +5152,21 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
     PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
                     FPOp, true, TailCallArguments);
 
-  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
-                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
-                    Ins, InVals);
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
+                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
+                    NumBytes, Ins, InVals, CS);
 }
 
 SDValue
 PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
                                     CallingConv::ID CallConv, bool isVarArg,
-                                    bool isTailCall,
+                                    bool isTailCall, bool IsPatchPoint,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
                                     SDLoc dl, SelectionDAG &DAG,
-                                    SmallVectorImpl<SDValue> &InVals) const {
+                                    SmallVectorImpl<SDValue> &InVals,
+                                    ImmutableCallSite *CS) const {
 
   unsigned NumOps = Outs.size();
 
@@ -4691,8 +5188,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
   // Count how many bytes are to be pushed on the stack, including the linkage
   // area, and parameter passing area.  We start with 24/48 bytes, which is
   // prereserved space for [SP][CR][LR][3 x unused].
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true,
-                                                          false);
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
   unsigned NumBytes = LinkageSize;
 
   // Add up all the space actually used.
@@ -4737,7 +5233,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
   // Tail call needs the stack to be aligned.
   if (getTargetMachine().Options.GuaranteedTailCallOpt &&
       CallConv == CallingConv::Fast)
-    NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);
+    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
 
   // Calculate by how many bytes the stack has to be adjusted in case of tail
   // call optimization.
@@ -5030,8 +5526,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
   // not mean the MTCTR instruction must use R12; it's easier to model this as
   // an extra parameter, so do that.
   if (!isTailCall &&
-      !dyn_cast<GlobalAddressSDNode>(Callee) &&
-      !dyn_cast<ExternalSymbolSDNode>(Callee) &&
+      !isFunctionGlobalAddress(Callee) &&
+      !isa<ExternalSymbolSDNode>(Callee) &&
       !isBLACompatibleAddress(Callee, DAG))
     RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
                                                    PPC::R12), Callee));
@@ -5049,9 +5545,9 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
     PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp,
                     FPOp, true, TailCallArguments);
 
-  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
-                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
-                    Ins, InVals);
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
+                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
+                    NumBytes, Ins, InVals, CS);
 }
 
 bool
@@ -5150,7 +5646,6 @@ SDValue
 PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   bool isPPC64 = Subtarget.isPPC64();
-  bool isDarwinABI = Subtarget.isDarwinABI();
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 
   // Get current frame pointer save index.  The users of this index will be
@@ -5161,9 +5656,9 @@ PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
   // If the frame pointer save index hasn't been defined yet.
   if (!RASI) {
     // Find out what the fix offset of the frame pointer save area.
-    int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
+    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
     // Allocate the frame index for frame pointer save area.
-    RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, true);
+    RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
     // Save the result.
     FI->setReturnAddrSaveIndex(RASI);
   }
@@ -5174,7 +5669,6 @@ SDValue
 PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   bool isPPC64 = Subtarget.isPPC64();
-  bool isDarwinABI = Subtarget.isDarwinABI();
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 
   // Get current frame pointer save index.  The users of this index will be
@@ -5185,9 +5679,7 @@ PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
   // If the frame pointer save index hasn't been defined yet.
   if (!FPSI) {
     // Find out what the fix offset of the frame pointer save area.
-    int FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64,
-                                                           isDarwinABI);
-
+    int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
     // Allocate the frame index for frame pointer save area.
     FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
     // Save the result.
@@ -5233,6 +5725,9 @@ SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
 }
 
 SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType().isVector())
+    return LowerVectorLoad(Op, DAG);
+
   assert(Op.getValueType() == MVT::i1 &&
          "Custom lowering only for i1 loads");
 
@@ -5254,6 +5749,9 @@ SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getOperand(1).getValueType().isVector())
+    return LowerVectorStore(Op, DAG);
+
   assert(Op.getOperand(1).getValueType() == MVT::i1 &&
          "Custom lowering only for i1 stores");
 
@@ -5381,9 +5879,9 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   return Op;
 }
 
-// FIXME: Split this code up when LegalizeDAGTypes lands.
-SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
-                                           SDLoc dl) const {
+void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
+                                               SelectionDAG &DAG,
+                                               SDLoc dl) const {
   assert(Op.getOperand(0).getValueType().isFloatingPoint());
   SDValue Src = Op.getOperand(0);
   if (Src.getValueType() == MVT::f32)
@@ -5393,10 +5891,11 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
   switch (Op.getSimpleValueType().SimpleTy) {
   default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
   case MVT::i32:
-    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
-                        (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ :
-                                                   PPCISD::FCTIDZ),
-                      dl, MVT::f64, Src);
+    Tmp = DAG.getNode(
+        Op.getOpcode() == ISD::FP_TO_SINT
+            ? PPCISD::FCTIWZ
+            : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
+        dl, MVT::f64, Src);
     break;
   case MVT::i64:
     assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
@@ -5432,16 +5931,119 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
   if (Op.getValueType() == MVT::i32 && !i32Stack) {
     FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
                         DAG.getConstant(4, FIPtr.getValueType()));
-    MPI = MachinePointerInfo();
+    MPI = MPI.getWithOffset(4);
   }
 
-  return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, MPI,
-                     false, false, false, 0);
+  RLI.Chain = Chain;
+  RLI.Ptr = FIPtr;
+  RLI.MPI = MPI;
+}
+
+SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+                                          SDLoc dl) const {
+  ReuseLoadInfo RLI;
+  LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
+
+  return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, false,
+                     false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo,
+                     RLI.Ranges);
+}
+
+// We're trying to insert a regular store, S, and then a load, L. If the
+// incoming value, O, is a load, we might just be able to have our load use the
+// address used by O. However, we don't know if anything else will store to
+// that address before we can load from it. To prevent this situation, we need
+// to insert our load, L, into the chain as a peer of O. To do this, we give L
+// the same chain operand as O, we create a token factor from the chain results
+// of O and L, and we replace all uses of O's chain result with that token
+// factor (see spliceIntoChain below for this last part).
+bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
+                                            ReuseLoadInfo &RLI,
+                                            SelectionDAG &DAG,
+                                            ISD::LoadExtType ET) const {
+  SDLoc dl(Op);
+  if (ET == ISD::NON_EXTLOAD &&
+      (Op.getOpcode() == ISD::FP_TO_UINT ||
+       Op.getOpcode() == ISD::FP_TO_SINT) &&
+      isOperationLegalOrCustom(Op.getOpcode(),
+                               Op.getOperand(0).getValueType())) {
+
+    LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
+    return true;
+  }
+
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
+  if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
+      LD->isNonTemporal())
+    return false;
+  if (LD->getMemoryVT() != MemVT)
+    return false;
+
+  RLI.Ptr = LD->getBasePtr();
+  if (LD->isIndexed() && LD->getOffset().getOpcode() != ISD::UNDEF) {
+    assert(LD->getAddressingMode() == ISD::PRE_INC &&
+           "Non-pre-inc AM on PPC?");
+    RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
+                          LD->getOffset());
+  }
+
+  RLI.Chain = LD->getChain();
+  RLI.MPI = LD->getPointerInfo();
+  RLI.IsInvariant = LD->isInvariant();
+  RLI.Alignment = LD->getAlignment();
+  RLI.AAInfo = LD->getAAInfo();
+  RLI.Ranges = LD->getRanges();
+
+  RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
+  return true;
+}
+
+// Given the head of the old chain, ResChain, insert a token factor containing
+// it and NewResChain, and make users of ResChain now be users of that token
+// factor.
+void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
+                                        SDValue NewResChain,
+                                        SelectionDAG &DAG) const {
+  if (!ResChain)
+    return;
+
+  SDLoc dl(NewResChain);
+
+  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                           NewResChain, DAG.getUNDEF(MVT::Other));
+  assert(TF.getNode() != NewResChain.getNode() &&
+         "A new TF really is required here");
+
+  DAG.ReplaceAllUsesOfValueWith(ResChain, TF);
+  DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
 }
 
 SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
-                                           SelectionDAG &DAG) const {
+                                          SelectionDAG &DAG) const {
   SDLoc dl(Op);
+
+  if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
+    if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
+      return SDValue();
+
+    SDValue Value = Op.getOperand(0);
+    // The values are now known to be -1 (false) or 1 (true). To convert this
+    // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
+    // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
+    Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
+  
+    SDValue FPHalfs = DAG.getConstantFP(0.5, MVT::f64);
+    FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64,
+                          FPHalfs, FPHalfs, FPHalfs, FPHalfs);
+  
+    Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
+
+    if (Op.getValueType() != MVT::v4f64)
+      Value = DAG.getNode(ISD::FP_ROUND, dl,
+                          Op.getValueType(), Value, DAG.getIntPtrConstant(1));
+    return Value;
+  }
+
   // Don't handle ppc_fp128 here; let it be lowered to a libcall.
   if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
     return SDValue();
@@ -5456,13 +6058,14 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
 
   // If we have FCFIDS, then use it when converting to single-precision.
   // Otherwise, convert to double-precision and then round.
-  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
-                   (Op.getOpcode() == ISD::UINT_TO_FP ?
-                    PPCISD::FCFIDUS : PPCISD::FCFIDS) :
-                   (Op.getOpcode() == ISD::UINT_TO_FP ?
-                    PPCISD::FCFIDU : PPCISD::FCFID);
-  MVT      FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
-                   MVT::f32 : MVT::f64;
+  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                       ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
+                                                            : PPCISD::FCFIDS)
+                       : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
+                                                            : PPCISD::FCFID);
+  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                  ? MVT::f32
+                  : MVT::f64;
 
   if (Op.getOperand(0).getValueType() == MVT::i64) {
     SDValue SINT = Op.getOperand(0);
@@ -5512,7 +6115,70 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
       SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
     }
 
-    SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
+    ReuseLoadInfo RLI;
+    SDValue Bits;
+
+    MachineFunction &MF = DAG.getMachineFunction();
+    if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
+      Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, false,
+                         false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo,
+                         RLI.Ranges);
+      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+    } else if (Subtarget.hasLFIWAX() &&
+               canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
+      MachineMemOperand *MMO =
+        MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                                RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+      Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
+                                     DAG.getVTList(MVT::f64, MVT::Other),
+                                     Ops, MVT::i32, MMO);
+      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+    } else if (Subtarget.hasFPCVT() &&
+               canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
+      MachineMemOperand *MMO =
+        MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                                RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+      Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,
+                                     DAG.getVTList(MVT::f64, MVT::Other),
+                                     Ops, MVT::i32, MMO);
+      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+    } else if (((Subtarget.hasLFIWAX() &&
+                 SINT.getOpcode() == ISD::SIGN_EXTEND) ||
+                (Subtarget.hasFPCVT() &&
+                 SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
+               SINT.getOperand(0).getValueType() == MVT::i32) {
+      MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+      EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+      int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
+      SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+      SDValue Store =
+        DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
+                     MachinePointerInfo::getFixedStack(FrameIdx),
+                     false, false, 0);
+
+      assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+             "Expected an i32 store");
+
+      RLI.Ptr = FIdx;
+      RLI.Chain = Store;
+      RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx);
+      RLI.Alignment = 4;
+
+      MachineMemOperand *MMO =
+        MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                                RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+      Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?
+                                     PPCISD::LFIWZX : PPCISD::LFIWAX,
+                                     dl, DAG.getVTList(MVT::f64, MVT::Other),
+                                     Ops, MVT::i32, MMO);
+    } else
+      Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
+
     SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
 
     if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
@@ -5533,23 +6199,36 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
 
   SDValue Ld;
   if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
-    int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
-    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
-
-    SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
-                                 MachinePointerInfo::getFixedStack(FrameIdx),
-                                 false, false, 0);
+    ReuseLoadInfo RLI;
+    bool ReusingLoad;
+    if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
+                                            DAG))) {
+      int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
+      SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+      SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+                                   MachinePointerInfo::getFixedStack(FrameIdx),
+                                   false, false, 0);
+
+      assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+             "Expected an i32 store");
+
+      RLI.Ptr = FIdx;
+      RLI.Chain = Store;
+      RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx);
+      RLI.Alignment = 4;
+    }
 
-    assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
-           "Expected an i32 store");
     MachineMemOperand *MMO =
-      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
-                              MachineMemOperand::MOLoad, 4, 4);
-    SDValue Ops[] = { Store, FIdx };
+      MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                              RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+    SDValue Ops[] = { RLI.Chain, RLI.Ptr };
     Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
                                    PPCISD::LFIWZX : PPCISD::LFIWAX,
                                  dl, DAG.getVTList(MVT::f64, MVT::Other),
                                  Ops, MVT::i32, MMO);
+    if (ReusingLoad)
+      spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
   } else {
     assert(Subtarget.isPPC64() &&
            "i32->FP without LFIWAX supported only on PPC64");
@@ -5816,6 +6495,127 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
   assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
 
+  if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
+    // We first build an i32 vector, load it into a QPX register,
+    // then convert it to a floating-point vector and compare it
+    // to a zero vector to get the boolean result.
+    MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+    int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
+    MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx);
+    EVT PtrVT = getPointerTy();
+    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+    assert(BVN->getNumOperands() == 4 &&
+      "BUILD_VECTOR for v4i1 does not have 4 operands");
+
+    bool IsConst = true;
+    for (unsigned i = 0; i < 4; ++i) {
+      if (BVN->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+      if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
+        IsConst = false;
+        break;
+      }
+    }
+
+    if (IsConst) {
+      Constant *One =
+        ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0);
+      Constant *NegOne =
+        ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);
+
+      SmallVector<Constant*, 4> CV(4, NegOne);
+      for (unsigned i = 0; i < 4; ++i) {
+        if (BVN->getOperand(i).getOpcode() == ISD::UNDEF)
+          CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
+        else if (cast<ConstantSDNode>(BVN->getOperand(i))->
+                   getConstantIntValue()->isZero())
+          continue;
+        else
+          CV[i] = One;
+      }
+
+      Constant *CP = ConstantVector::get(CV);
+      SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(),
+                      16 /* alignment */);
+ 
+      SmallVector<SDValue, 2> Ops;
+      Ops.push_back(DAG.getEntryNode());
+      Ops.push_back(CPIdx);
+
+      SmallVector<EVT, 2> ValueVTs;
+      ValueVTs.push_back(MVT::v4i1);
+      ValueVTs.push_back(MVT::Other); // chain
+      SDVTList VTs = DAG.getVTList(ValueVTs);
+
+      return DAG.getMemIntrinsicNode(PPCISD::QVLFSb,
+        dl, VTs, Ops, MVT::v4f32,
+        MachinePointerInfo::getConstantPool());
+    }
+
+    SmallVector<SDValue, 4> Stores;
+    for (unsigned i = 0; i < 4; ++i) {
+      if (BVN->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+
+      unsigned Offset = 4*i;
+      SDValue Idx = DAG.getConstant(Offset, FIdx.getValueType());
+      Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
+
+      unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
+      if (StoreSize > 4) {
+        Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl,
+                                           BVN->getOperand(i), Idx,
+                                           PtrInfo.getWithOffset(Offset),
+                                           MVT::i32, false, false, 0));
+      } else {
+        SDValue StoreValue = BVN->getOperand(i);
+        if (StoreSize < 4)
+          StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);
+
+        Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl,
+                                      StoreValue, Idx,
+                                      PtrInfo.getWithOffset(Offset),
+                                      false, false, 0));
+      }
+    }
+
+    SDValue StoreChain;
+    if (!Stores.empty())
+      StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+    else
+      StoreChain = DAG.getEntryNode();
+
+    // Now load from v4i32 into the QPX register; this will extend it to
+    // v4i64 but not yet convert it to a floating point. Nevertheless, this
+    // is typed as v4f64 because the QPX register integer states are not
+    // explicitly represented.
+
+    SmallVector<SDValue, 2> Ops;
+    Ops.push_back(StoreChain);
+    Ops.push_back(DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, MVT::i32));
+    Ops.push_back(FIdx);
+
+    SmallVector<EVT, 2> ValueVTs;
+    ValueVTs.push_back(MVT::v4f64);
+    ValueVTs.push_back(MVT::Other); // chain
+    SDVTList VTs = DAG.getVTList(ValueVTs);
+
+    SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
+      dl, VTs, Ops, MVT::v4i32, PtrInfo);
+    LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
+      DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, MVT::i32),
+      LoadedVect);
+
+    SDValue FPZeros = DAG.getConstantFP(0.0, MVT::f64);
+    FPZeros = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64,
+                          FPZeros, FPZeros, FPZeros, FPZeros);
+
+    return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
+  }
+
+  // All other QPX vectors are handled by generic code.
+  if (Subtarget.hasQPX())
+    return SDValue();
+
   // Check if this is a splat of a constant value.
   APInt APSplatBits, APSplatUndef;
   unsigned SplatBitSize;
@@ -6074,6 +6874,45 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   EVT VT = Op.getValueType();
   bool isLittleEndian = Subtarget.isLittleEndian();
 
+  if (Subtarget.hasQPX()) {
+    if (VT.getVectorNumElements() != 4)
+      return SDValue();
+
+    if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
+
+    int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp);
+    if (AlignIdx != -1) {
+      return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2,
+                         DAG.getConstant(AlignIdx, MVT::i32));
+    } else if (SVOp->isSplat()) {
+      int SplatIdx = SVOp->getSplatIndex();
+      if (SplatIdx >= 4) {
+        std::swap(V1, V2);
+        SplatIdx -= 4;
+      }
+
+      // FIXME: If SplatIdx == 0 and the input came from a load, then there is
+      // nothing to do.
+
+      return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
+                         DAG.getConstant(SplatIdx, MVT::i32));
+    }
+
+    // Lower this into a qvgpci/qvfperm pair.
+
+    // Compute the qvgpci literal
+    unsigned idx = 0;
+    for (unsigned i = 0; i < 4; ++i) {
+      int m = SVOp->getMaskElt(i);
+      unsigned mm = m >= 0 ? (unsigned) m : i;
+      idx |= mm << (3-i)*3;
+    }
+
+    SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64,
+                             DAG.getConstant(idx, MVT::i32));
+    return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3);
+  }
+
   // Cases that are handled by instructions that take permute immediates
   // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
   // selected by the instruction selector.
@@ -6356,6 +7195,302 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                      false, false, false, 0);
 }
 
+SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDNode *N = Op.getNode();
+
+  assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
+         "Unknown extract_vector_elt type");
+
+  SDValue Value = N->getOperand(0);
+
+  // The first part of this is like the store lowering except that we don't
+  // need to track the chain.
+
+  // The values are now known to be -1 (false) or 1 (true). To convert this
+  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
+  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
+  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
+
+  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
+  // understand how to form the extending load.
+  SDValue FPHalfs = DAG.getConstantFP(0.5, MVT::f64);
+  FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64,
+                        FPHalfs, FPHalfs, FPHalfs, FPHalfs);
+
+  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); 
+
+  // Now convert to an integer and store.
+  Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
+    DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, MVT::i32),
+    Value);
+
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx);
+  EVT PtrVT = getPointerTy();
+  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+  SDValue StoreChain = DAG.getEntryNode();
+  SmallVector<SDValue, 2> Ops;
+  Ops.push_back(StoreChain);
+  Ops.push_back(DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, MVT::i32));
+  Ops.push_back(Value);
+  Ops.push_back(FIdx);
+
+  SmallVector<EVT, 2> ValueVTs;
+  ValueVTs.push_back(MVT::Other); // chain
+  SDVTList VTs = DAG.getVTList(ValueVTs);
+
+  StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
+    dl, VTs, Ops, MVT::v4i32, PtrInfo);
+
+  // Extract the value requested.
+  unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  SDValue Idx = DAG.getConstant(Offset, FIdx.getValueType());
+  Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
+
+  SDValue IntVal = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
+                               PtrInfo.getWithOffset(Offset),
+                               false, false, false, 0);
+
+  if (!Subtarget.useCRBits())
+    return IntVal;
+
+  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal);
+}
+
+/// Lowering for QPX v4i1 loads
+SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
+  SDValue LoadChain = LN->getChain();
+  SDValue BasePtr = LN->getBasePtr();
+
+  if (Op.getValueType() == MVT::v4f64 ||
+      Op.getValueType() == MVT::v4f32) {
+    EVT MemVT = LN->getMemoryVT();
+    unsigned Alignment = LN->getAlignment();
+
+    // If this load is properly aligned, then it is legal.
+    if (Alignment >= MemVT.getStoreSize())
+      return Op;
+
+    EVT ScalarVT = Op.getValueType().getScalarType(),
+        ScalarMemVT = MemVT.getScalarType();
+    unsigned Stride = ScalarMemVT.getStoreSize();
+
+    SmallVector<SDValue, 8> Vals, LoadChains;
+    for (unsigned Idx = 0; Idx < 4; ++Idx) {
+      SDValue Load;
+      if (ScalarVT != ScalarMemVT)
+        Load =
+          DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
+                         BasePtr,
+                         LN->getPointerInfo().getWithOffset(Idx*Stride),
+                         ScalarMemVT, LN->isVolatile(), LN->isNonTemporal(),
+                         LN->isInvariant(), MinAlign(Alignment, Idx*Stride),
+                         LN->getAAInfo());
+      else
+        Load =
+          DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
+                       LN->getPointerInfo().getWithOffset(Idx*Stride),
+                       LN->isVolatile(), LN->isNonTemporal(),
+                       LN->isInvariant(), MinAlign(Alignment, Idx*Stride),
+                       LN->getAAInfo());
+
+      if (Idx == 0 && LN->isIndexed()) {
+        assert(LN->getAddressingMode() == ISD::PRE_INC &&
+               "Unknown addressing mode on vector load");
+        Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(),
+                                  LN->getAddressingMode());
+      }
+
+      Vals.push_back(Load);
+      LoadChains.push_back(Load.getValue(1));
+
+      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                            DAG.getConstant(Stride, BasePtr.getValueType()));
+    }
+
+    SDValue TF =  DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+    SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                   Op.getValueType(), Vals);
+
+    if (LN->isIndexed()) {
+      SDValue RetOps[] = { Value, Vals[0].getValue(1), TF };
+      return DAG.getMergeValues(RetOps, dl);
+    }
+
+    SDValue RetOps[] = { Value, TF };
+    return DAG.getMergeValues(RetOps, dl);
+  }
+
+  assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower");
+  assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported");
+
+  // To lower v4i1 from a byte array, we load the byte elements of the
+  // vector and then reuse the BUILD_VECTOR logic.
+
+  SmallVector<SDValue, 4> VectElmts, VectElmtChains;
+  for (unsigned i = 0; i < 4; ++i) {
+    SDValue Idx = DAG.getConstant(i, BasePtr.getValueType());
+    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
+
+    VectElmts.push_back(DAG.getExtLoad(ISD::EXTLOAD,
+                        dl, MVT::i32, LoadChain, Idx,
+                        LN->getPointerInfo().getWithOffset(i),
+                        MVT::i8 /* memory type */,
+                        LN->isVolatile(), LN->isNonTemporal(),
+                        LN->isInvariant(),
+                        1 /* alignment */, LN->getAAInfo()));
+    VectElmtChains.push_back(VectElmts[i].getValue(1));
+  }
+
+  LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
+  SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i1, VectElmts);
+
+  SDValue RVals[] = { Value, LoadChain };
+  return DAG.getMergeValues(RVals, dl);
+}
+
+/// Lowering for QPX v4i1 stores
+SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
+  SDValue StoreChain = SN->getChain();
+  SDValue BasePtr = SN->getBasePtr();
+  SDValue Value = SN->getValue();
+
+  if (Value.getValueType() == MVT::v4f64 ||
+      Value.getValueType() == MVT::v4f32) {
+    EVT MemVT = SN->getMemoryVT();
+    unsigned Alignment = SN->getAlignment();
+
+    // If this store is properly aligned, then it is legal.
+    if (Alignment >= MemVT.getStoreSize())
+      return Op;
+
+    EVT ScalarVT = Value.getValueType().getScalarType(),
+        ScalarMemVT = MemVT.getScalarType();
+    unsigned Stride = ScalarMemVT.getStoreSize();
+
+    SmallVector<SDValue, 8> Stores;
+    for (unsigned Idx = 0; Idx < 4; ++Idx) {
+      SDValue Ex =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
+                    DAG.getConstant(Idx, getVectorIdxTy()));
+      SDValue Store;
+      if (ScalarVT != ScalarMemVT)
+        Store =
+          DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
+                            SN->getPointerInfo().getWithOffset(Idx*Stride),
+                            ScalarMemVT, SN->isVolatile(), SN->isNonTemporal(),
+                            MinAlign(Alignment, Idx*Stride), SN->getAAInfo());
+      else
+        Store =
+          DAG.getStore(StoreChain, dl, Ex, BasePtr,
+                       SN->getPointerInfo().getWithOffset(Idx*Stride),
+                       SN->isVolatile(), SN->isNonTemporal(),
+                       MinAlign(Alignment, Idx*Stride), SN->getAAInfo());
+
+      if (Idx == 0 && SN->isIndexed()) {
+        assert(SN->getAddressingMode() == ISD::PRE_INC &&
+               "Unknown addressing mode on vector store");
+        Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(),
+                                    SN->getAddressingMode());
+      }
+
+      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                            DAG.getConstant(Stride, BasePtr.getValueType()));
+      Stores.push_back(Store);
+    }
+
+    SDValue TF =  DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+
+    if (SN->isIndexed()) {
+      SDValue RetOps[] = { TF, Stores[0].getValue(1) };
+      return DAG.getMergeValues(RetOps, dl);
+    }
+
+    return TF;
+  }
+
+  assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported");
+  assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower");
+
+  // The values are now known to be -1 (false) or 1 (true). To convert this
+  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
+  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
+  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
+
+  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
+  // understand how to form the extending load.
+  SDValue FPHalfs = DAG.getConstantFP(0.5, MVT::f64);
+  FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64,
+                        FPHalfs, FPHalfs, FPHalfs, FPHalfs);
+
+  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); 
+
+  // Now convert to an integer and store.
+  Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
+    DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, MVT::i32),
+    Value);
+
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx);
+  EVT PtrVT = getPointerTy();
+  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+  SmallVector<SDValue, 2> Ops;
+  Ops.push_back(StoreChain);
+  Ops.push_back(DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, MVT::i32));
+  Ops.push_back(Value);
+  Ops.push_back(FIdx);
+
+  SmallVector<EVT, 2> ValueVTs;
+  ValueVTs.push_back(MVT::Other); // chain
+  SDVTList VTs = DAG.getVTList(ValueVTs);
+
+  StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
+    dl, VTs, Ops, MVT::v4i32, PtrInfo);
+
+  // Move data into the byte array.
+  SmallVector<SDValue, 4> Loads, LoadChains;
+  for (unsigned i = 0; i < 4; ++i) {
+    unsigned Offset = 4*i;
+    SDValue Idx = DAG.getConstant(Offset, FIdx.getValueType());
+    Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
+
+    Loads.push_back(DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
+                                   PtrInfo.getWithOffset(Offset),
+                                   false, false, false, 0));
+    LoadChains.push_back(Loads[i].getValue(1));
+  }
+
+  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+
+  SmallVector<SDValue, 4> Stores;
+  for (unsigned i = 0; i < 4; ++i) {
+    SDValue Idx = DAG.getConstant(i, BasePtr.getValueType());
+    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
+
+    Stores.push_back(DAG.getTruncStore(StoreChain, dl, Loads[i], Idx,
+                                       SN->getPointerInfo().getWithOffset(i),
+                                       MVT::i8 /* memory type */,
+                                       SN->isNonTemporal(), SN->isVolatile(), 
+                                       1 /* alignment */, SN->getAAInfo()));
+  }
+
+  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+
+  return StoreChain;
+}
+
 SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   if (Op.getValueType() == MVT::v4i32) {
@@ -6462,7 +7597,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
   case ISD::FP_TO_UINT:
   case ISD::FP_TO_SINT:         return LowerFP_TO_INT(Op, DAG,
-                                                       SDLoc(Op));
+                                                      SDLoc(Op));
   case ISD::UINT_TO_FP:
   case ISD::SINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
@@ -6478,6 +7613,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::MUL:                return LowerMUL(Op, DAG);
 
   // For counter-based loop handling.
@@ -6492,11 +7628,19 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue>&Results,
                                            SelectionDAG &DAG) const {
-  const TargetMachine &TM = getTargetMachine();
   SDLoc dl(N);
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case ISD::READCYCLECOUNTER: {
+    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+    SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
+
+    Results.push_back(RTB);
+    Results.push_back(RTB.getValue(1));
+    Results.push_back(RTB.getValue(2));
+    break;
+  }
   case ISD::INTRINSIC_W_CHAIN: {
     if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
         Intrinsic::ppc_is_decremented_ctr_nonzero)
@@ -6514,8 +7658,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
     break;
   }
   case ISD::VAARG: {
-    if (!TM.getSubtarget<PPCSubtarget>().isSVR4ABI()
-        || TM.getSubtarget<PPCSubtarget>().isPPC64())
+    if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
       return;
 
     EVT VT = N->getValueType(0);
@@ -6597,8 +7740,7 @@ MachineBasicBlock *
 PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                     bool is64bit, unsigned BinOpcode) const {
   // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
 
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineFunction *F = BB->getParent();
@@ -6621,9 +7763,8 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
 
   MachineRegisterInfo &RegInfo = F->getRegInfo();
   unsigned TmpReg = (!BinOpcode) ? incr :
-    RegInfo.createVirtualRegister(
-       is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
-                 (const TargetRegisterClass *) &PPC::GPRCRegClass);
+    RegInfo.createVirtualRegister( is64bit ? &PPC::G8RCRegClass
+                                           : &PPC::GPRCRegClass);
 
   //  thisMBB:
   //   ...
@@ -6660,8 +7801,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
                                             bool is8bit,    // operation
                                             unsigned BinOpcode) const {
   // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   // In 64 bit mode we have to use 64 bits for addresses, even though the
   // lwarx/stwcx are 32 bits.  With the 32-bit atomics we can use address
   // registers without caring whether they're 32 or 64, but here we're
@@ -6689,9 +7829,8 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
   MachineRegisterInfo &RegInfo = F->getRegInfo();
-  const TargetRegisterClass *RC =
-    is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
-              (const TargetRegisterClass *) &PPC::GPRCRegClass;
+  const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+                                          : &PPC::GPRCRegClass;
   unsigned PtrReg = RegInfo.createVirtualRegister(RC);
   unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
   unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
@@ -6789,8 +7928,7 @@ llvm::MachineBasicBlock*
 PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
                                     MachineBasicBlock *MBB) const {
   DebugLoc DL = MI->getDebugLoc();
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
 
   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -6863,6 +8001,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   unsigned BufReg = MI->getOperand(1).getReg();
 
   if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) {
+    setUsesTOCBasePtr(*MBB->getParent());
     MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
             .addReg(PPC::X2)
             .addImm(TOCOffset)
@@ -6873,23 +8012,21 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   // Naked functions never have a base pointer, and so we use r1. For all
   // other functions, this decision must be delayed until during PEI.
   unsigned BaseReg;
-  if (MF->getFunction()->getAttributes().hasAttribute(
-          AttributeSet::FunctionIndex, Attribute::Naked))
+  if (MF->getFunction()->hasFnAttribute(Attribute::Naked))
     BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
   else
     BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
 
   MIB = BuildMI(*thisMBB, MI, DL,
                 TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
-          .addReg(BaseReg)
-          .addImm(BPOffset)
-          .addReg(BufReg);
+            .addReg(BaseReg)
+            .addImm(BPOffset)
+            .addReg(BufReg);
   MIB.setMemRefs(MMOBegin, MMOEnd);
 
   // Setup
   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
-  const PPCRegisterInfo *TRI =
-      getTargetMachine().getSubtarget<PPCSubtarget>().getRegisterInfo();
+  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
   MIB.addRegMask(TRI->getNoPreservedMask());
 
   BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
@@ -6903,8 +8040,9 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
 
   // mainMBB:
   //  mainDstReg = 0
-  MIB = BuildMI(mainMBB, DL,
-    TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);
+  MIB =
+      BuildMI(mainMBB, DL,
+              TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);
 
   // Store IP
   if (Subtarget.isPPC64()) {
@@ -6938,8 +8076,7 @@ MachineBasicBlock *
 PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
                                      MachineBasicBlock *MBB) const {
   DebugLoc DL = MI->getDebugLoc();
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
 
   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -6958,10 +8095,13 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
   // Since FP is only updated here but NOT referenced, it's treated as GPR.
   unsigned FP  = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
   unsigned SP  = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
-  unsigned BP  = (PVT == MVT::i64) ? PPC::X30 :
-                  (Subtarget.isSVR4ABI() &&
-                   MF->getTarget().getRelocationModel() == Reloc::PIC_ ?
-                     PPC::R29 : PPC::R30);
+  unsigned BP =
+      (PVT == MVT::i64)
+          ? PPC::X30
+          : (Subtarget.isSVR4ABI() &&
+                     MF->getTarget().getRelocationModel() == Reloc::PIC_
+                 ? PPC::R29
+                 : PPC::R30);
 
   MachineInstrBuilder MIB;
 
@@ -7024,6 +8164,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
 
   // Reload TOC
   if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
+    setUsesTOCBasePtr(*MBB->getParent());
     MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
             .addImm(TOCOffset)
             .addReg(BufReg);
@@ -7043,6 +8184,22 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
 MachineBasicBlock *
 PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB) const {
+  if (MI->getOpcode() == TargetOpcode::STACKMAP ||
+      MI->getOpcode() == TargetOpcode::PATCHPOINT) {
+    if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() &&
+        MI->getOpcode() == TargetOpcode::PATCHPOINT) {
+      // Call lowering should have added an r2 operand to indicate a dependence
+      // on the TOC base pointer value. It can't however, because there is no
+      // way to mark the dependence as implicit there, and so the stackmap code
+      // will confuse it with a regular operand. Instead, add the dependence
+      // here.
+      setUsesTOCBasePtr(*BB->getParent());
+      MI->addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
+    }
+
+    return emitPatchPoint(MI, BB);
+  }
+
   if (MI->getOpcode() == PPC::EH_SjLj_SetJmp32 ||
       MI->getOpcode() == PPC::EH_SjLj_SetJmp64) {
     return emitEHSjLjSetJmp(MI, BB);
@@ -7051,8 +8208,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     return emitEHSjLjLongJmp(MI, BB);
   }
 
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
 
   // To "insert" these instructions we actually have to insert their
   // control-flow patterns.
@@ -7063,9 +8219,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   MachineFunction *F = BB->getParent();
 
   if (Subtarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 ||
-                                 MI->getOpcode() == PPC::SELECT_CC_I8 ||
-                                 MI->getOpcode() == PPC::SELECT_I4 ||
-                                 MI->getOpcode() == PPC::SELECT_I8)) {
+                              MI->getOpcode() == PPC::SELECT_CC_I8 ||
+                              MI->getOpcode() == PPC::SELECT_I4 ||
+                              MI->getOpcode() == PPC::SELECT_I8)) {
     SmallVector<MachineOperand, 2> Cond;
     if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
         MI->getOpcode() == PPC::SELECT_CC_I8)
@@ -7075,8 +8231,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     Cond.push_back(MI->getOperand(1));
 
     DebugLoc dl = MI->getDebugLoc();
-    const TargetInstrInfo *TII =
-        getTargetMachine().getSubtargetImpl()->getInstrInfo();
     TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(),
                       Cond, MI->getOperand(2).getReg(),
                       MI->getOperand(3).getReg());
@@ -7084,6 +8238,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
              MI->getOpcode() == PPC::SELECT_CC_I8 ||
              MI->getOpcode() == PPC::SELECT_CC_F4 ||
              MI->getOpcode() == PPC::SELECT_CC_F8 ||
+             MI->getOpcode() == PPC::SELECT_CC_QFRC ||
+             MI->getOpcode() == PPC::SELECT_CC_QSRC ||
+             MI->getOpcode() == PPC::SELECT_CC_QBRC ||
              MI->getOpcode() == PPC::SELECT_CC_VRRC ||
              MI->getOpcode() == PPC::SELECT_CC_VSFRC ||
              MI->getOpcode() == PPC::SELECT_CC_VSRC ||
@@ -7091,6 +8248,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
              MI->getOpcode() == PPC::SELECT_I8 ||
              MI->getOpcode() == PPC::SELECT_F4 ||
              MI->getOpcode() == PPC::SELECT_F8 ||
+             MI->getOpcode() == PPC::SELECT_QFRC ||
+             MI->getOpcode() == PPC::SELECT_QSRC ||
+             MI->getOpcode() == PPC::SELECT_QBRC ||
              MI->getOpcode() == PPC::SELECT_VRRC ||
              MI->getOpcode() == PPC::SELECT_VSFRC ||
              MI->getOpcode() == PPC::SELECT_VSRC) {
@@ -7124,6 +8284,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
         MI->getOpcode() == PPC::SELECT_I8 ||
         MI->getOpcode() == PPC::SELECT_F4 ||
         MI->getOpcode() == PPC::SELECT_F8 ||
+        MI->getOpcode() == PPC::SELECT_QFRC ||
+        MI->getOpcode() == PPC::SELECT_QSRC ||
+        MI->getOpcode() == PPC::SELECT_QBRC ||
         MI->getOpcode() == PPC::SELECT_VRRC ||
         MI->getOpcode() == PPC::SELECT_VSFRC ||
         MI->getOpcode() == PPC::SELECT_VSRC) {
@@ -7151,6 +8314,51 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
             TII->get(PPC::PHI), MI->getOperand(0).getReg())
       .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
       .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+  } else if (MI->getOpcode() == PPC::ReadTB) {
+    // To read the 64-bit time-base register on a 32-bit target, we read the
+    // two halves. Should the counter have wrapped while it was being read, we
+    // need to try again.
+    // ...
+    // readLoop:
+    // mfspr Rx,TBU # load from TBU
+    // mfspr Ry,TB  # load from TB
+    // mfspr Rz,TBU # load from TBU
+    // cmpw crX,Rx,Rz # check if ‘old’=’new’
+    // bne readLoop   # branch if they're not equal
+    // ...
+
+    MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    DebugLoc dl = MI->getDebugLoc();
+    F->insert(It, readMBB);
+    F->insert(It, sinkMBB);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    sinkMBB->splice(sinkMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    BB->addSuccessor(readMBB);
+    BB = readMBB;
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+    unsigned LoReg = MI->getOperand(0).getReg();
+    unsigned HiReg = MI->getOperand(1).getReg();
+
+    BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
+    BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
+    BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);
+
+    unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
+
+    BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
+      .addReg(HiReg).addReg(ReadAgainReg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB);
+
+    BB->addSuccessor(readMBB);
+    BB->addSuccessor(sinkMBB);
   }
   else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
     BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
@@ -7309,9 +8517,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
     MachineRegisterInfo &RegInfo = F->getRegInfo();
-    const TargetRegisterClass *RC =
-      is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
-                (const TargetRegisterClass *) &PPC::GPRCRegClass;
+    const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+                                            : &PPC::GPRCRegClass;
     unsigned PtrReg = RegInfo.createVirtualRegister(RC);
     unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
     unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
@@ -7453,7 +8660,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);
 
     // Restore FPSCR value.
-    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg);
+    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
   } else if (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT ||
              MI->getOpcode() == PPC::ANDIo_1_GT_BIT ||
              MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
@@ -7493,9 +8700,11 @@ SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
                                             bool &UseOneConstNR) const {
   EVT VT = Operand.getValueType();
   if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
-      (VT == MVT::f64 && Subtarget.hasFRSQRTE())  ||
+      (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
       (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
-      (VT == MVT::v2f64 && Subtarget.hasVSX())) {
+      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
+      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
+      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
     // Convergence is quadratic, so we essentially double the number of digits
     // correct after every iteration. For both FRE and FRSQRTE, the minimum
     // architected relative accuracy is 2^-5. When hasRecipPrec(), this is
@@ -7514,9 +8723,11 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand,
                                             unsigned &RefinementSteps) const {
   EVT VT = Operand.getValueType();
   if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
-      (VT == MVT::f64 && Subtarget.hasFRE())  ||
+      (VT == MVT::f64 && Subtarget.hasFRE()) ||
       (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
-      (VT == MVT::v2f64 && Subtarget.hasVSX())) {
+      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
+      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
+      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
     // Convergence is quadratic, so we essentially double the number of digits
     // correct after every iteration. For both FRE and FRSQRTE, the minimum
     // architected relative accuracy is 2^-5. When hasRecipPrec(), this is
@@ -7529,6 +8740,28 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand,
   return SDValue();
 }
 
+bool PPCTargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+  // Note: This functionality is used only when unsafe-fp-math is enabled, and
+  // on cores with reciprocal estimates (which are used when unsafe-fp-math is
+  // enabled for division), this functionality is redundant with the default
+  // combiner logic (once the division -> reciprocal/multiply transformation
+  // has taken place). As a result, this matters more for older cores than for
+  // newer ones.
+
+  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+  // reciprocal if there are two or more FDIVs (for embedded cores with only
+  // one FP pipeline) for three or more FDIVs (for generic OOO cores).
+  switch (Subtarget.getDarwinDirective()) {
+  default:
+    return NumUsers > 2;
+  case PPC::DIR_440:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+    return NumUsers > 1;
+  }
+}
+
 static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
                             unsigned Bytes, int Dist,
                             SelectionDAG &DAG) {
@@ -7580,6 +8813,24 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
     EVT VT;
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
     default: return false;
+    case Intrinsic::ppc_qpx_qvlfd:
+    case Intrinsic::ppc_qpx_qvlfda:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfs:
+    case Intrinsic::ppc_qpx_qvlfsa:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcd:
+    case Intrinsic::ppc_qpx_qvlfcda:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcs:
+    case Intrinsic::ppc_qpx_qvlfcsa:
+      VT = MVT::v2f32;
+      break;
+    case Intrinsic::ppc_qpx_qvlfiwa:
+    case Intrinsic::ppc_qpx_qvlfiwz:
     case Intrinsic::ppc_altivec_lvx:
     case Intrinsic::ppc_altivec_lvxl:
     case Intrinsic::ppc_vsx_lxvw4x:
@@ -7606,6 +8857,24 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
     EVT VT;
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
     default: return false;
+    case Intrinsic::ppc_qpx_qvstfd:
+    case Intrinsic::ppc_qpx_qvstfda:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfs:
+    case Intrinsic::ppc_qpx_qvstfsa:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcd:
+    case Intrinsic::ppc_qpx_qvstfcda:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcs:
+    case Intrinsic::ppc_qpx_qvstfcsa:
+      VT = MVT::v2f32;
+      break;
+    case Intrinsic::ppc_qpx_qvstfiw:
+    case Intrinsic::ppc_qpx_qvstfiwa:
     case Intrinsic::ppc_altivec_stvx:
     case Intrinsic::ppc_altivec_stvxl:
     case Intrinsic::ppc_vsx_stxvw4x:
@@ -7704,8 +8973,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
   SelectionDAG &DAG = DCI.DAG;
   SDLoc dl(N);
 
-  assert(Subtarget.useCRBits() &&
-         "Expecting to be tracking CR bits");
+  assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
   // If we're tracking CR bits, we need to be careful that we don't have:
   //   trunc(binary-ops(zext(x), zext(y)))
   // or
@@ -8001,10 +9269,8 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
       N->getValueType(0) != MVT::i64)
     return SDValue();
 
-  if (!((N->getOperand(0).getValueType() == MVT::i1 &&
-        Subtarget.useCRBits()) ||
-       (N->getOperand(0).getValueType() == MVT::i32 &&
-        Subtarget.isPPC64())))
+  if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
+        (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
     return SDValue();
 
   if (N->getOperand(0).getOpcode() != ISD::AND &&
@@ -8053,6 +9319,10 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
     }
   }
 
+  // The operands of a select that must be truncated when the select is
+  // promoted because the operand is actually part of the to-be-promoted set.
+  DenseMap<SDNode *, EVT> SelectTruncOp[2];
+
   // Make sure that this is a self-contained cluster of operations (which
   // is not quite the same thing as saying that everything has only one
   // use).
@@ -8067,18 +9337,19 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
       if (User != N && !Visited.count(User))
         return SDValue();
 
-      // Make sure that we're not going to promote the non-output-value
-      // operand(s) or SELECT or SELECT_CC.
-      // FIXME: Although we could sometimes handle this, and it does occur in
-      // practice that one of the condition inputs to the select is also one of
-      // the outputs, we currently can't deal with this.
+      // If we're going to promote the non-output-value operand(s) or SELECT or
+      // SELECT_CC, record them for truncation.
       if (User->getOpcode() == ISD::SELECT) {
         if (User->getOperand(0) == Inputs[i])
-          return SDValue();
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                    User->getOperand(0).getValueType()));
       } else if (User->getOpcode() == ISD::SELECT_CC) {
-        if (User->getOperand(0) == Inputs[i] ||
-            User->getOperand(1) == Inputs[i])
-          return SDValue();
+        if (User->getOperand(0) == Inputs[i])
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                    User->getOperand(0).getValueType()));
+        if (User->getOperand(1) == Inputs[i])
+          SelectTruncOp[1].insert(std::make_pair(User,
+                                    User->getOperand(1).getValueType()));
       }
     }
   }
@@ -8091,18 +9362,19 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
       if (User != N && !Visited.count(User))
         return SDValue();
 
-      // Make sure that we're not going to promote the non-output-value
-      // operand(s) or SELECT or SELECT_CC.
-      // FIXME: Although we could sometimes handle this, and it does occur in
-      // practice that one of the condition inputs to the select is also one of
-      // the outputs, we currently can't deal with this.
+      // If we're going to promote the non-output-value operand(s) or SELECT or
+      // SELECT_CC, record them for truncation.
       if (User->getOpcode() == ISD::SELECT) {
         if (User->getOperand(0) == PromOps[i])
-          return SDValue();
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                    User->getOperand(0).getValueType()));
       } else if (User->getOpcode() == ISD::SELECT_CC) {
-        if (User->getOperand(0) == PromOps[i] ||
-            User->getOperand(1) == PromOps[i])
-          return SDValue();
+        if (User->getOperand(0) == PromOps[i])
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                    User->getOperand(0).getValueType()));
+        if (User->getOperand(1) == PromOps[i])
+          SelectTruncOp[1].insert(std::make_pair(User,
+                                    User->getOperand(1).getValueType()));
       }
     }
   }
@@ -8183,6 +9455,19 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
       continue;
     }
 
+    // For SELECT and SELECT_CC nodes, we do a similar check for any
+    // to-be-promoted comparison inputs.
+    if (PromOp.getOpcode() == ISD::SELECT ||
+        PromOp.getOpcode() == ISD::SELECT_CC) {
+      if ((SelectTruncOp[0].count(PromOp.getNode()) &&
+           PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
+          (SelectTruncOp[1].count(PromOp.getNode()) &&
+           PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
+        PromOps.insert(PromOps.begin(), PromOp);
+        continue;
+      }
+    }
+
     SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
                                 PromOp.getNode()->op_end());
 
@@ -8201,6 +9486,18 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
         Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
     }
 
+    // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
+    // truncate them again to the original value type.
+    if (PromOp.getOpcode() == ISD::SELECT ||
+        PromOp.getOpcode() == ISD::SELECT_CC) {
+      auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
+      if (SI0 != SelectTruncOp[0].end())
+        Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
+      auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
+      if (SI1 != SelectTruncOp[1].end())
+        Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
+    }
+
     DAG.ReplaceAllUsesOfValueWith(PromOp,
       DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
   }
@@ -8227,9 +9524,177 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
                                  N->getOperand(0), ShiftCst), ShiftCst);
 }
 
+SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  assert((N->getOpcode() == ISD::SINT_TO_FP ||
+          N->getOpcode() == ISD::UINT_TO_FP) &&
+         "Need an int -> FP conversion node here");
+
+  if (!Subtarget.has64BitSupport())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Op(N, 0);
+
+  // Don't handle ppc_fp128 here or i1 conversions.
+  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+    return SDValue();
+  if (Op.getOperand(0).getValueType() == MVT::i1)
+    return SDValue();
+
+  // For i32 intermediate values, unfortunately, the conversion functions
+  // leave the upper 32 bits of the value are undefined. Within the set of
+  // scalar instructions, we have no method for zero- or sign-extending the
+  // value. Thus, we cannot handle i32 intermediate values here.
+  if (Op.getOperand(0).getValueType() == MVT::i32)
+    return SDValue();
+
+  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
+         "UINT_TO_FP is supported only with FPCVT");
+
+  // If we have FCFIDS, then use it when converting to single-precision.
+  // Otherwise, convert to double-precision and then round.
+  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                       ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
+                                                            : PPCISD::FCFIDS)
+                       : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
+                                                            : PPCISD::FCFID);
+  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                  ? MVT::f32
+                  : MVT::f64;
+
+  // If we're converting from a float, to an int, and back to a float again,
+  // then we don't need the store/load pair at all.
+  if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
+       Subtarget.hasFPCVT()) ||
+      (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) {
+    SDValue Src = Op.getOperand(0).getOperand(0);
+    if (Src.getValueType() == MVT::f32) {
+      Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+      DCI.AddToWorklist(Src.getNode());
+    }
+
+    unsigned FCTOp =
+      Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
+                                                        PPCISD::FCTIDUZ;
+
+    SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
+    SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);
+
+    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
+      FP = DAG.getNode(ISD::FP_ROUND, dl,
+                       MVT::f32, FP, DAG.getIntPtrConstant(0));
+      DCI.AddToWorklist(FP.getNode());
+    }
+
+    return FP;
+  }
+
+  return SDValue();
+}
+
+// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
+// builtins) into loads with swaps.
+SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Chain;
+  SDValue Base;
+  MachineMemOperand *MMO;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode for little endian VSX load");
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    Chain = LD->getChain();
+    Base = LD->getBasePtr();
+    MMO = LD->getMemOperand();
+    // If the MMO suggests this isn't a load of a full vector, leave
+    // things alone.  For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MMO->getSize() < 16)
+      return SDValue();
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+    Chain = Intrin->getChain();
+    Base = Intrin->getBasePtr();
+    MMO = Intrin->getMemOperand();
+    break;
+  }
+  }
+
+  MVT VecTy = N->getValueType(0).getSimpleVT();
+  SDValue LoadOps[] = { Chain, Base };
+  SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
+                                         DAG.getVTList(VecTy, MVT::Other),
+                                         LoadOps, VecTy, MMO);
+  DCI.AddToWorklist(Load.getNode());
+  Chain = Load.getValue(1);
+  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
+                             DAG.getVTList(VecTy, MVT::Other), Chain, Load);
+  DCI.AddToWorklist(Swap.getNode());
+  return Swap;
+}
+
+// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
+// builtins) into stores with swaps.
+SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Chain;
+  SDValue Base;
+  unsigned SrcOpnd;
+  MachineMemOperand *MMO;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode for little endian VSX store");
+  case ISD::STORE: {
+    StoreSDNode *ST = cast<StoreSDNode>(N);
+    Chain = ST->getChain();
+    Base = ST->getBasePtr();
+    MMO = ST->getMemOperand();
+    SrcOpnd = 1;
+    // If the MMO suggests this isn't a store of a full vector, leave
+    // things alone.  For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MMO->getSize() < 16)
+      return SDValue();
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+    Chain = Intrin->getChain();
+    // Intrin->getBasePtr() oddly does not get what we want.
+    Base = Intrin->getOperand(3);
+    MMO = Intrin->getMemOperand();
+    SrcOpnd = 2;
+    break;
+  }
+  }
+
+  SDValue Src = N->getOperand(SrcOpnd);
+  MVT VecTy = Src.getValueType().getSimpleVT();
+  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
+                             DAG.getVTList(VecTy, MVT::Other), Chain, Src);
+  DCI.AddToWorklist(Swap.getNode());
+  Chain = Swap.getValue(1);
+  SDValue StoreOps[] = { Chain, Swap, Base };
+  SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
+                                          DAG.getVTList(MVT::Other),
+                                          StoreOps, VecTy, MMO);
+  DCI.AddToWorklist(Store.getNode());
+  return Store;
+}
+
 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
-  const TargetMachine &TM = getTargetMachine();
   SelectionDAG &DAG = DCI.DAG;
   SDLoc dl(N);
   switch (N->getOpcode()) {
@@ -8262,40 +9727,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SELECT_CC:
     return DAGCombineTruncBoolExt(N, DCI);
   case ISD::SINT_TO_FP:
-    if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
-      if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
-        // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores.
-        // We allow the src/dst to be either f32/f64, but the intermediate
-        // type must be i64.
-        if (N->getOperand(0).getValueType() == MVT::i64 &&
-            N->getOperand(0).getOperand(0).getValueType() != MVT::ppcf128) {
-          SDValue Val = N->getOperand(0).getOperand(0);
-          if (Val.getValueType() == MVT::f32) {
-            Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
-            DCI.AddToWorklist(Val.getNode());
-          }
-
-          Val = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Val);
-          DCI.AddToWorklist(Val.getNode());
-          Val = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Val);
-          DCI.AddToWorklist(Val.getNode());
-          if (N->getValueType(0) == MVT::f32) {
-            Val = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Val,
-                              DAG.getIntPtrConstant(0));
-            DCI.AddToWorklist(Val.getNode());
-          }
-          return Val;
-        } else if (N->getOperand(0).getValueType() == MVT::i32) {
-          // If the intermediate type is i32, we can avoid the load/store here
-          // too.
-        }
-      }
-    }
-    break;
-  case ISD::STORE:
+  case ISD::UINT_TO_FP:
+    return combineFPToIntToFP(N, DCI);
+  case ISD::STORE: {
     // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
-    if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
-        !cast<StoreSDNode>(N)->isTruncatingStore() &&
+    if (Subtarget.hasSTFIWX() && !cast<StoreSDNode>(N)->isTruncatingStore() &&
         N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
         N->getOperand(1).getValueType() == MVT::i32 &&
         N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
@@ -8326,8 +9762,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
         N->getOperand(1).getNode()->hasOneUse() &&
         (N->getOperand(1).getValueType() == MVT::i32 ||
          N->getOperand(1).getValueType() == MVT::i16 ||
-         (TM.getSubtarget<PPCSubtarget>().hasLDBRX() &&
-          TM.getSubtarget<PPCSubtarget>().isPPC64() &&
+         (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
           N->getOperand(1).getValueType() == MVT::i64))) {
       SDValue BSwapOp = N->getOperand(1).getOperand(0);
       // Do an any-extend to 32-bits if this is a half-word input.
@@ -8343,20 +9778,45 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                 Ops, cast<StoreSDNode>(N)->getMemoryVT(),
                                 cast<StoreSDNode>(N)->getMemOperand());
     }
+
+    // For little endian, VSX stores require generating xxswapd/lxvd2x.
+    EVT VT = N->getOperand(1).getValueType();
+    if (VT.isSimple()) {
+      MVT StoreVT = VT.getSimpleVT();
+      if (Subtarget.hasVSX() && Subtarget.isLittleEndian() &&
+          (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
+           StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
+        return expandVSXStoreForLE(N, DCI);
+    }
     break;
+  }
   case ISD::LOAD: {
     LoadSDNode *LD = cast<LoadSDNode>(N);
     EVT VT = LD->getValueType(0);
-    Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+
+    // For little endian, VSX loads require generating lxvd2x/xxswapd.
+    if (VT.isSimple()) {
+      MVT LoadVT = VT.getSimpleVT();
+      if (Subtarget.hasVSX() && Subtarget.isLittleEndian() &&
+          (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
+           LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
+        return expandVSXLoadForLE(N, DCI);
+    }
+
+    EVT MemVT = LD->getMemoryVT();
+    Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
     unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
-    if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
-        TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
-        // P8 and later hardware should just use LOAD.
-        !TM.getSubtarget<PPCSubtarget>().hasP8Vector() &&
-        (VT == MVT::v16i8 || VT == MVT::v8i16 ||
-         VT == MVT::v4i32 || VT == MVT::v4f32) &&
+    Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
+    unsigned ScalarABIAlignment = getDataLayout()->getABITypeAlignment(STy);
+    if (LD->isUnindexed() && VT.isVector() &&
+        ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
+          // P8 and later hardware should just use LOAD.
+          !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 ||
+                                       VT == MVT::v4i32 || VT == MVT::v4f32)) ||
+         (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
+          LD->getAlignment() >= ScalarABIAlignment)) &&
         LD->getAlignment() < ABIAlignment) {
-      // This is a type-legal unaligned Altivec load.
+      // This is a type-legal unaligned Altivec or QPX load.
       SDValue Chain = LD->getChain();
       SDValue Ptr = LD->getBasePtr();
       bool isLittleEndian = Subtarget.isLittleEndian();
@@ -8385,10 +9845,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       // a different base address offset from this one by an aligned amount.
       // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
       // optimization later.
-      Intrinsic::ID Intr = (isLittleEndian ?
-                            Intrinsic::ppc_altivec_lvsr :
-                            Intrinsic::ppc_altivec_lvsl);
-      SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);
+      Intrinsic::ID Intr, IntrLD, IntrPerm;
+      MVT PermCntlTy, PermTy, LDTy;
+      if (Subtarget.hasAltivec()) {
+        Intr = isLittleEndian ?  Intrinsic::ppc_altivec_lvsr :
+                                 Intrinsic::ppc_altivec_lvsl;
+        IntrLD = Intrinsic::ppc_altivec_lvx;
+        IntrPerm = Intrinsic::ppc_altivec_vperm;
+        PermCntlTy = MVT::v16i8;
+        PermTy = MVT::v4i32;
+        LDTy = MVT::v4i32;
+      } else {
+        Intr =   MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
+                                       Intrinsic::ppc_qpx_qvlpcls;
+        IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
+                                       Intrinsic::ppc_qpx_qvlfs;
+        IntrPerm = Intrinsic::ppc_qpx_qvfperm;
+        PermCntlTy = MVT::v4f64;
+        PermTy = MVT::v4f64;
+        LDTy = MemVT.getSimpleVT();
+      }
+
+      SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);
 
       // Create the new MMO for the new base load. It is like the original MMO,
       // but represents an area in memory almost twice the vector size centered
@@ -8397,18 +9875,16 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       // original unaligned load.
       MachineFunction &MF = DAG.getMachineFunction();
       MachineMemOperand *BaseMMO =
-        MF.getMachineMemOperand(LD->getMemOperand(),
-                                -LD->getMemoryVT().getStoreSize()+1,
-                                2*LD->getMemoryVT().getStoreSize()-1);
+        MF.getMachineMemOperand(LD->getMemOperand(), -MemVT.getStoreSize()+1,
+                                2*MemVT.getStoreSize()-1);
 
       // Create the new base load.
-      SDValue LDXIntID = DAG.getTargetConstant(Intrinsic::ppc_altivec_lvx,
-                                               getPointerTy());
+      SDValue LDXIntID = DAG.getTargetConstant(IntrLD, getPointerTy());
       SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
       SDValue BaseLoad =
         DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
-                                DAG.getVTList(MVT::v4i32, MVT::Other),
-                                BaseLoadOps, MVT::v4i32, BaseMMO);
+                                DAG.getVTList(PermTy, MVT::Other),
+                                BaseLoadOps, LDTy, BaseMMO);
 
       // Note that the value of IncOffset (which is provided to the next
       // load's pointer info offset value, and thus used to calculate the
@@ -8432,12 +9908,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
 
       MachineMemOperand *ExtraMMO =
         MF.getMachineMemOperand(LD->getMemOperand(),
-                                1, 2*LD->getMemoryVT().getStoreSize()-1);
+                                1, 2*MemVT.getStoreSize()-1);
       SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
       SDValue ExtraLoad =
         DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
-                                DAG.getVTList(MVT::v4i32, MVT::Other),
-                                ExtraLoadOps, MVT::v4i32, ExtraMMO);
+                                DAG.getVTList(PermTy, MVT::Other),
+                                ExtraLoadOps, LDTy, ExtraMMO);
 
       SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
         BaseLoad.getValue(1), ExtraLoad.getValue(1));
@@ -8449,14 +9925,19 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       // and ExtraLoad here.
       SDValue Perm;
       if (isLittleEndian)
-        Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+        Perm = BuildIntrinsicOp(IntrPerm,
                                 ExtraLoad, BaseLoad, PermCntl, DAG, dl);
       else
-        Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+        Perm = BuildIntrinsicOp(IntrPerm,
                                 BaseLoad, ExtraLoad, PermCntl, DAG, dl);
 
-      if (VT != MVT::v4i32)
-        Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
+      if (VT != PermTy)
+        Perm = Subtarget.hasAltivec() ?
+                 DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
+                 DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
+                               DAG.getTargetConstant(1, MVT::i64));
+                               // second argument is 1 because this rounding
+                               // is always exact.
 
       // The output of the permutation is our loaded result, the TokenFactor is
       // our new chain.
@@ -8465,43 +9946,96 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     }
     }
     break;
-  case ISD::INTRINSIC_WO_CHAIN: {
-    bool isLittleEndian = Subtarget.isLittleEndian();
-    Intrinsic::ID Intr = (isLittleEndian ?
-                          Intrinsic::ppc_altivec_lvsr :
-                          Intrinsic::ppc_altivec_lvsl);
-    if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() == Intr &&
+    case ISD::INTRINSIC_WO_CHAIN: {
+      bool isLittleEndian = Subtarget.isLittleEndian();
+      unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+      Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
+                                           : Intrinsic::ppc_altivec_lvsl);
+      if ((IID == Intr ||
+           IID == Intrinsic::ppc_qpx_qvlpcld  ||
+           IID == Intrinsic::ppc_qpx_qvlpcls) &&
         N->getOperand(1)->getOpcode() == ISD::ADD) {
-      SDValue Add = N->getOperand(1);
-
-      if (DAG.MaskedValueIsZero(Add->getOperand(1),
-            APInt::getAllOnesValue(4 /* 16 byte alignment */).zext(
-              Add.getValueType().getScalarType().getSizeInBits()))) {
-        SDNode *BasePtr = Add->getOperand(0).getNode();
-        for (SDNode::use_iterator UI = BasePtr->use_begin(),
-             UE = BasePtr->use_end(); UI != UE; ++UI) {
-          if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
-              cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
-                Intr) {
-            // We've found another LVSL/LVSR, and this address is an aligned
-            // multiple of that one. The results will be the same, so use the
-            // one we've just found instead.
-
-            return SDValue(*UI, 0);
+        SDValue Add = N->getOperand(1);
+
+        int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
+                   5 /* 32 byte alignment */ : 4 /* 16 byte alignment */;
+
+        if (DAG.MaskedValueIsZero(
+                Add->getOperand(1),
+                APInt::getAllOnesValue(Bits /* alignment */)
+                    .zext(
+                        Add.getValueType().getScalarType().getSizeInBits()))) {
+          SDNode *BasePtr = Add->getOperand(0).getNode();
+          for (SDNode::use_iterator UI = BasePtr->use_begin(),
+                                    UE = BasePtr->use_end();
+               UI != UE; ++UI) {
+            if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+                cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
+              // We've found another LVSL/LVSR, and this address is an aligned
+              // multiple of that one. The results will be the same, so use the
+              // one we've just found instead.
+
+              return SDValue(*UI, 0);
+            }
+          }
+        }
+
+        if (isa<ConstantSDNode>(Add->getOperand(1))) {
+          SDNode *BasePtr = Add->getOperand(0).getNode();
+          for (SDNode::use_iterator UI = BasePtr->use_begin(),
+               UE = BasePtr->use_end(); UI != UE; ++UI) {
+            if (UI->getOpcode() == ISD::ADD &&
+                isa<ConstantSDNode>(UI->getOperand(1)) &&
+                (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
+                 cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) %
+                (1ULL << Bits) == 0) {
+              SDNode *OtherAdd = *UI;
+              for (SDNode::use_iterator VI = OtherAdd->use_begin(),
+                   VE = OtherAdd->use_end(); VI != VE; ++VI) {
+                if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+                    cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) {
+                  return SDValue(*VI, 0);
+                }
+              }
+            }
           }
         }
       }
     }
-    }
 
     break;
+  case ISD::INTRINSIC_W_CHAIN: {
+    // For little endian, VSX loads require generating lxvd2x/xxswapd.
+    if (Subtarget.hasVSX() && Subtarget.isLittleEndian()) {
+      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+      default:
+        break;
+      case Intrinsic::ppc_vsx_lxvw4x:
+      case Intrinsic::ppc_vsx_lxvd2x:
+        return expandVSXLoadForLE(N, DCI);
+      }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    // For little endian, VSX stores require generating xxswapd/stxvd2x.
+    if (Subtarget.hasVSX() && Subtarget.isLittleEndian()) {
+      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+      default:
+        break;
+      case Intrinsic::ppc_vsx_stxvw4x:
+      case Intrinsic::ppc_vsx_stxvd2x:
+        return expandVSXStoreForLE(N, DCI);
+      }
+    }
+    break;
+  }
   case ISD::BSWAP:
     // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
     if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
         N->getOperand(0).hasOneUse() &&
         (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
-         (TM.getSubtarget<PPCSubtarget>().hasLDBRX() &&
-          TM.getSubtarget<PPCSubtarget>().isPPC64() &&
+         (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
           N->getValueType(0) == MVT::i64))) {
       SDValue Load = N->getOperand(0);
       LoadSDNode *LD = cast<LoadSDNode>(Load);
@@ -8705,6 +10239,38 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
   return SDValue();
 }
 
+SDValue
+PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                  SelectionDAG &DAG,
+                                  std::vector<SDNode *> *Created) const {
+  // fold (sdiv X, pow2)
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::i64 && !Subtarget.isPPC64())
+    return SDValue();
+  if ((VT != MVT::i32 && VT != MVT::i64) ||
+      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+
+  bool IsNegPow2 = (-Divisor).isPowerOf2();
+  unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros();
+  SDValue ShiftAmt = DAG.getConstant(Lg2, VT);
+
+  SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);
+  if (Created)
+    Created->push_back(Op.getNode());
+
+  if (IsNegPow2) {
+    Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), Op);
+    if (Created)
+      Created->push_back(Op.getNode());
+  }
+
+  return Op;
+}
+
 //===----------------------------------------------------------------------===//
 // Inline Assembly Support
 //===----------------------------------------------------------------------===//
@@ -8746,6 +10312,38 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   }
 }
 
+unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+  switch (Subtarget.getDarwinDirective()) {
+  default: break;
+  case PPC::DIR_970:
+  case PPC::DIR_PWR4:
+  case PPC::DIR_PWR5:
+  case PPC::DIR_PWR5X:
+  case PPC::DIR_PWR6:
+  case PPC::DIR_PWR6X:
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8: {
+    if (!ML)
+      break;
+
+    const PPCInstrInfo *TII = Subtarget.getInstrInfo();
+
+    // For small loops (between 5 and 8 instructions), align to a 32-byte
+    // boundary so that the entire loop fits in one instruction-cache line.
+    uint64_t LoopSize = 0;
+    for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
+      for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J)
+        LoopSize += TII->GetInstSizeInBytes(J);
+
+    if (LoopSize > 16 && LoopSize <= 32)
+      return 5;
+
+    break;
+  }
+  }
+
+  return TargetLowering::getPrefLoopAlignment(ML);
+}
 
 /// getConstraintType - Given a constraint, return the type of
 /// constraint it is for this target.
@@ -8833,8 +10431,9 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
   return weight;
 }
 
-std::pair<unsigned, const TargetRegisterClass*>
-PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+std::pair<unsigned, const TargetRegisterClass *>
+PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                const std::string &Constraint,
                                                 MVT VT) const {
   if (Constraint.size() == 1) {
     // GCC RS6000 Constraint Letters
@@ -8852,8 +10451,16 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
         return std::make_pair(0U, &PPC::F4RCRegClass);
       if (VT == MVT::f64 || VT == MVT::i64)
         return std::make_pair(0U, &PPC::F8RCRegClass);
+      if (VT == MVT::v4f64 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QFRCRegClass);
+      if (VT == MVT::v4f32 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QSRCRegClass);
       break;
     case 'v':
+      if (VT == MVT::v4f64 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QFRCRegClass);
+      if (VT == MVT::v4f32 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QSRCRegClass);
       return std::make_pair(0U, &PPC::VRRCRegClass);
     case 'y':   // crrc
       return std::make_pair(0U, &PPC::CRRCRegClass);
@@ -8867,8 +10474,8 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
     return std::make_pair(0U, &PPC::VSFRCRegClass);
   }
 
-  std::pair<unsigned, const TargetRegisterClass*> R =
-    TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  std::pair<unsigned, const TargetRegisterClass *> R =
+      TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 
   // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
   // (which we call X[0-9]+). If a 64-bit value has been requested, and a
@@ -8877,12 +10484,15 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
   // the AsmName field from *RegisterInfo.td, then this would not be necessary.
   if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
-      PPC::GPRCRegClass.contains(R.first)) {
-    const TargetRegisterInfo *TRI =
-        getTargetMachine().getSubtargetImpl()->getRegisterInfo();
+      PPC::GPRCRegClass.contains(R.first))
     return std::make_pair(TRI->getMatchingSuperReg(R.first,
                             PPC::sub_32, &PPC::G8RCRegClass),
                           &PPC::G8RCRegClass);
+
+  // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
+  if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
+    R.first = PPC::CR0;
+    R.second = &PPC::CRRCRegClass;
   }
 
   return R;
@@ -8913,37 +10523,42 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   case 'P': {
     ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
     if (!CST) return; // Must be an immediate to match.
-    unsigned Value = CST->getZExtValue();
+    int64_t Value = CST->getSExtValue();
+    EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
+                         // numbers are printed as such.
     switch (Letter) {
     default: llvm_unreachable("Unknown constraint letter!");
     case 'I':  // "I" is a signed 16-bit constant.
-      if ((short)Value == (int)Value)
-        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      if (isInt<16>(Value))
+        Result = DAG.getTargetConstant(Value, TCVT);
       break;
     case 'J':  // "J" is a constant with only the high-order 16 bits nonzero.
+      if (isShiftedUInt<16, 16>(Value))
+        Result = DAG.getTargetConstant(Value, TCVT);
+      break;
     case 'L':  // "L" is a signed 16-bit constant shifted left 16 bits.
-      if ((short)Value == 0)
-        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      if (isShiftedInt<16, 16>(Value))
+        Result = DAG.getTargetConstant(Value, TCVT);
       break;
     case 'K':  // "K" is a constant with only the low-order 16 bits nonzero.
-      if ((Value >> 16) == 0)
-        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      if (isUInt<16>(Value))
+        Result = DAG.getTargetConstant(Value, TCVT);
       break;
     case 'M':  // "M" is a constant that is greater than 31.
       if (Value > 31)
-        Result = DAG.getTargetConstant(Value, Op.getValueType());
+        Result = DAG.getTargetConstant(Value, TCVT);
       break;
     case 'N':  // "N" is a positive constant that is an exact power of two.
-      if ((int)Value > 0 && isPowerOf2_32(Value))
-        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      if (Value > 0 && isPowerOf2_64(Value))
+        Result = DAG.getTargetConstant(Value, TCVT);
       break;
     case 'O':  // "O" is the constant zero.
       if (Value == 0)
-        Result = DAG.getTargetConstant(Value, Op.getValueType());
+        Result = DAG.getTargetConstant(Value, TCVT);
       break;
     case 'P':  // "P" is a constant whose negation is a signed 16-bit constant.
-      if ((short)-Value == (int)-Value)
-        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      if (isInt<16>(-Value))
+        Result = DAG.getTargetConstant(Value, TCVT);
       break;
     }
     break;
@@ -8963,7 +10578,9 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 // by AM is legal for this target, for a load/store of the specified type.
 bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                               Type *Ty) const {
-  // FIXME: PPC does not allow r+i addressing modes for vectors!
+  // PPC does not allow r+i addressing modes for vectors!
+  if (Ty->isVectorTy() && AM.BaseOffs != 0)
+    return false;
 
   // PPC allows a sign-extended 16-bit immediate field.
   if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
@@ -9012,14 +10629,12 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
   PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
   FuncInfo->setLRStoreRequired();
   bool isPPC64 = Subtarget.isPPC64();
-  bool isDarwinABI = Subtarget.isDarwinABI();
 
   if (Depth > 0) {
     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
     SDValue Offset =
-
-      DAG.getConstant(PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI),
-                      isPPC64? MVT::i64 : MVT::i32);
+        DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(),
+                        isPPC64 ? MVT::i64 : MVT::i32);
     return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                        DAG.getNode(ISD::ADD, dl, getPointerTy(),
                                    FrameAddr, Offset),
@@ -9047,8 +10662,7 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
   // Naked functions never have a frame pointer, and so we use r1. For all
   // other functions, this decision must be delayed until during PEI.
   unsigned FrameReg;
-  if (MF.getFunction()->getAttributes().hasAttribute(
-        AttributeSet::FunctionIndex, Attribute::Naked))
+  if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
     FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
   else
     FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
@@ -9076,7 +10690,7 @@ unsigned PPCTargetLowering::getRegisterByName(const char* RegName,
   bool is64Bit = isPPC64 && VT == MVT::i64;
   unsigned Reg = StringSwitch<unsigned>(RegName)
                    .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
-                   .Case("r2", isDarwinABI ? 0 : (is64Bit ? PPC::X2 : PPC::R2))
+                   .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2)
                    .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
                                   (is64Bit ? PPC::X13 : PPC::R13))
                    .Default(0);
@@ -9097,6 +10711,12 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                            unsigned Intrinsic) const {
 
   switch (Intrinsic) {
+  case Intrinsic::ppc_qpx_qvlfd:
+  case Intrinsic::ppc_qpx_qvlfs:
+  case Intrinsic::ppc_qpx_qvlfcd:
+  case Intrinsic::ppc_qpx_qvlfcs:
+  case Intrinsic::ppc_qpx_qvlfiwa:
+  case Intrinsic::ppc_qpx_qvlfiwz:
   case Intrinsic::ppc_altivec_lvx:
   case Intrinsic::ppc_altivec_lvxl:
   case Intrinsic::ppc_altivec_lvebx:
@@ -9118,6 +10738,18 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     case Intrinsic::ppc_vsx_lxvd2x:
       VT = MVT::v2f64;
       break;
+    case Intrinsic::ppc_qpx_qvlfd:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfs:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcd:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcs:
+      VT = MVT::v2f32;
+      break;
     default:
       VT = MVT::v4i32;
       break;
@@ -9134,6 +10766,47 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.writeMem = false;
     return true;
   }
+  case Intrinsic::ppc_qpx_qvlfda:
+  case Intrinsic::ppc_qpx_qvlfsa:
+  case Intrinsic::ppc_qpx_qvlfcda:
+  case Intrinsic::ppc_qpx_qvlfcsa:
+  case Intrinsic::ppc_qpx_qvlfiwaa:
+  case Intrinsic::ppc_qpx_qvlfiwza: {
+    EVT VT;
+    switch (Intrinsic) {
+    case Intrinsic::ppc_qpx_qvlfda:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfsa:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcda:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcsa:
+      VT = MVT::v2f32;
+      break;
+    default:
+      VT = MVT::v4i32;
+      break;
+    }
+
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.size = VT.getStoreSize();
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::ppc_qpx_qvstfd:
+  case Intrinsic::ppc_qpx_qvstfs:
+  case Intrinsic::ppc_qpx_qvstfcd:
+  case Intrinsic::ppc_qpx_qvstfcs:
+  case Intrinsic::ppc_qpx_qvstfiw:
   case Intrinsic::ppc_altivec_stvx:
   case Intrinsic::ppc_altivec_stvxl:
   case Intrinsic::ppc_altivec_stvebx:
@@ -9155,6 +10828,18 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     case Intrinsic::ppc_vsx_stxvd2x:
       VT = MVT::v2f64;
       break;
+    case Intrinsic::ppc_qpx_qvstfd:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfs:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcd:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcs:
+      VT = MVT::v2f32;
+      break;
     default:
       VT = MVT::v4i32;
       break;
@@ -9171,6 +10856,41 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.writeMem = true;
     return true;
   }
+  case Intrinsic::ppc_qpx_qvstfda:
+  case Intrinsic::ppc_qpx_qvstfsa:
+  case Intrinsic::ppc_qpx_qvstfcda:
+  case Intrinsic::ppc_qpx_qvstfcsa:
+  case Intrinsic::ppc_qpx_qvstfiwa: {
+    EVT VT;
+    switch (Intrinsic) {
+    case Intrinsic::ppc_qpx_qvstfda:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfsa:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcda:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcsa:
+      VT = MVT::v2f32;
+      break;
+    default:
+      VT = MVT::v4i32;
+      break;
+    }
+
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.size = VT.getStoreSize();
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
   default:
     break;
   }
@@ -9229,6 +10949,31 @@ bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   return NumBits1 == 64 && NumBits2 == 32;
 }
 
+bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  // Generally speaking, zexts are not free, but they are free when they can be
+  // folded with other operations.
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
+    EVT MemVT = LD->getMemoryVT();
+    if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
+         (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
+        (LD->getExtensionType() == ISD::NON_EXTLOAD ||
+         LD->getExtensionType() == ISD::ZEXTLOAD))
+      return true;
+  }
+
+  // FIXME: Add other cases...
+  //  - 32-bit shifts with a zext to i64
+  //  - zext after ctlz, bswap, etc.
+  //  - zext after and by a constant mask
+
+  return TargetLowering::isZExtFree(Val, VT2);
+}
+
+bool PPCTargetLowering::isFPExtFree(EVT VT) const {
+  assert(VT.isFloatingPoint());
+  return true;
+}
+
 bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   return isInt<16>(Imm) || isUInt<16>(Imm);
 }
@@ -9289,12 +11034,30 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   return false;
 }
 
+const MCPhysReg *
+PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
+  // LR is a callee-save register, but we must treat it as clobbered by any call
+  // site. Hence we include LR in the scratch registers, which are in turn added
+  // as implicit-defs for stackmaps and patchpoints. The same reasoning applies
+  // to CTR, which is used by any indirect call.
+  static const MCPhysReg ScratchRegs[] = {
+    PPC::X12, PPC::LR8, PPC::CTR8, 0
+  };
+
+  return ScratchRegs;
+}
+
 bool
 PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
                      EVT VT , unsigned DefinedValues) const {
   if (VT == MVT::v2i64)
     return false;
 
+  if (Subtarget.hasQPX()) {
+    if (VT == MVT::v4f32 || VT == MVT::v4f64 || VT == MVT::v4i1)
+      return true;
+  }
+
   return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
 }
 
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index bb4d1f1..04afe88 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -61,6 +61,9 @@ namespace llvm {
       ///
       VPERM,
 
+      /// The CMPB instruction (takes two operands of i32 or i64).
+      CMPB,
+
       /// Hi/Lo - These represent the high and low 16-bit parts of a global
       /// address respectively.  These nodes have two operands, the first of
       /// which must be a TargetGlobalAddress, and the second of which must be a
@@ -68,18 +71,9 @@ namespace llvm {
       /// though these are usually folded into other nodes.
       Hi, Lo,
 
-      TOC_ENTRY,
-
       /// The following two target-specific nodes are used for calls through
       /// function pointers in the 64-bit SVR4 ABI.
 
-      /// Like a regular LOAD but additionally taking/producing a flag.
-      LOAD,
-
-      /// Like LOAD (taking/producing a flag), but using r2 as hard-coded
-      /// destination.
-      LOAD_TOC,
-
       /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
       /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
       /// compute an allocation on the stack.
@@ -94,15 +88,17 @@ namespace llvm {
       /// code.
       SRL, SRA, SHL,
 
+      /// The combination of sra[wd]i and addze used to implemented signed
+      /// integer division by a power of 2. The first operand is the dividend,
+      /// and the second is the constant shift amount (representing the
+      /// divisor).
+      SRA_ADDZE,
+
       /// CALL - A direct function call.
       /// CALL_NOP is a call with the special NOP which follows 64-bit
       /// SVR4 calls.
       CALL, CALL_NOP,
 
-      /// CALL_TLS and CALL_NOP_TLS - Versions of CALL and CALL_NOP used
-      /// to access TLS variables.
-      CALL_TLS, CALL_NOP_TLS,
-
       /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
       /// MTCTR instruction.
       MTCTR,
@@ -111,6 +107,10 @@ namespace llvm {
       /// BCTRL instruction.
       BCTRL,
 
+      /// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl
+      /// instruction and the TOC reload required on SVR4 PPC64.
+      BCTRL_LOAD_TOC,
+
       /// Return with a flag operand, matched by 'blr'
       RET_FLAG,
 
@@ -125,6 +125,10 @@ namespace llvm {
       /// implement truncation of i32 or i64 to i1.
       ANDIo_1_EQ_BIT, ANDIo_1_GT_BIT,
 
+      // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit
+      // target (returns (Lo, Hi)). It takes a chain operand.
+      READ_TIME_BASE,
+
       // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
       EH_SJLJ_SETJMP,
 
@@ -186,7 +190,7 @@ namespace llvm {
       PPC32_GOT,
 
       /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
-      /// local dynamic TLS  on PPC32.
+      /// local dynamic TLS on PPC32.
       PPC32_PICGOT,
 
       /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
@@ -213,26 +217,46 @@ namespace llvm {
       /// register to sym\@got\@tlsgd\@ha.
       ADDIS_TLSGD_HA,
 
-      /// G8RC = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
+      /// %X3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
       /// model, produces an ADDI8 instruction that adds G8RReg to
-      /// sym\@got\@tlsgd\@l.
+      /// sym\@got\@tlsgd\@l and stores the result in X3.  Hidden by
+      /// ADDIS_TLSGD_L_ADDR until after register assignment.
       ADDI_TLSGD_L,
 
+      /// %X3 = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS
+      /// model, produces a call to __tls_get_addr(sym\@tlsgd).  Hidden by
+      /// ADDIS_TLSGD_L_ADDR until after register assignment.
+      GET_TLS_ADDR,
+
+      /// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that
+      /// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following
+      /// register assignment.
+      ADDI_TLSGD_L_ADDR,
+
       /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS
       /// model, produces an ADDIS8 instruction that adds the GOT base
       /// register to sym\@got\@tlsld\@ha.
       ADDIS_TLSLD_HA,
 
-      /// G8RC = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
+      /// %X3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
       /// model, produces an ADDI8 instruction that adds G8RReg to
-      /// sym\@got\@tlsld\@l.
+      /// sym\@got\@tlsld\@l and stores the result in X3.  Hidden by
+      /// ADDIS_TLSLD_L_ADDR until after register assignment.
       ADDI_TLSLD_L,
 
-      /// G8RC = ADDIS_DTPREL_HA %X3, Symbol, Chain - For the
-      /// local-dynamic TLS model, produces an ADDIS8 instruction
-      /// that adds X3 to sym\@dtprel\@ha. The Chain operand is needed
-      /// to tie this in place following a copy to %X3 from the result
-      /// of a GET_TLSLD_ADDR.
+      /// %X3 = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS
+      /// model, produces a call to __tls_get_addr(sym\@tlsld).  Hidden by
+      /// ADDIS_TLSLD_L_ADDR until after register assignment.
+      GET_TLSLD_ADDR,
+
+      /// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that
+      /// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion
+      /// following register assignment.
+      ADDI_TLSLD_L_ADDR,
+
+      /// G8RC = ADDIS_DTPREL_HA %X3, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDIS8 instruction that adds X3 to
+      /// sym\@dtprel\@ha.
       ADDIS_DTPREL_HA,
 
       /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS
@@ -250,6 +274,29 @@ namespace llvm {
       /// operand identifies the operating system entry point.
       SC,
 
+      /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little
+      /// endian.  Maps to an xxswapd instruction that corrects an lxvd2x
+      /// or stxvd2x instruction.  The chain is necessary because the
+      /// sequence replaces a load and needs to provide the same number
+      /// of outputs.
+      XXSWAPD,
+
+      /// QVFPERM = This corresponds to the QPX qvfperm instruction.
+      QVFPERM,
+
+      /// QVGPCI = This corresponds to the QPX qvgpci instruction.
+      QVGPCI,
+
+      /// QVALIGNI = This corresponds to the QPX qvaligni instruction.
+      QVALIGNI,
+
+      /// QVESPLATI = This corresponds to the QPX qvesplati instruction.
+      QVESPLATI,
+
+      /// QBFLT = Access the underlying QPX floating-point boolean
+      /// representation.
+      QBFLT,
+
       /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
       /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
       /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
@@ -276,20 +323,24 @@ namespace llvm {
       /// destination 64-bit register.
       LFIWZX,
 
-      /// G8RC = ADDIS_TOC_HA %X2, Symbol - For medium and large code model,
-      /// produces an ADDIS8 instruction that adds the TOC base register to
-      /// sym\@toc\@ha.
-      ADDIS_TOC_HA,
+      /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
+      /// Maps directly to an lxvd2x instruction that will be followed by
+      /// an xxswapd.
+      LXVD2X,
 
-      /// G8RC = LD_TOC_L Symbol, G8RReg - For medium and large code model,
-      /// produces a LD instruction with base register G8RReg and offset
-      /// sym\@toc\@l. Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
-      LD_TOC_L,
+      /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
+      /// Maps directly to an stxvd2x instruction that will be preceded by
+      /// an xxswapd.
+      STXVD2X,
 
-      /// G8RC = ADDI_TOC_L G8RReg, Symbol - For medium code model, produces
-      /// an ADDI8 instruction that adds G8RReg to sym\@toc\@l.
-      /// Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
-      ADDI_TOC_L
+      /// QBRC, CHAIN = QVLFSb CHAIN, Ptr
+      /// The 4xf32 load used for v4i1 constants.
+      QVLFSb,
+
+      /// GPRC = TOC_ENTRY GA, TOC
+      /// Loads the entry for GA from the TOC, where the TOC base is given by
+      /// the last operand.
+      TOC_ENTRY
     };
   }
 
@@ -338,14 +389,18 @@ namespace llvm {
     /// size, return the constant being splatted.  The ByteSize field indicates
     /// the number of bytes of each element [124] -> [bhw].
     SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+
+    /// If this is a qvaligni shuffle mask, return the shift
+    /// amount, otherwise return -1.
+    int isQVALIGNIShuffleMask(SDNode *N);
   }
 
-  class PPCSubtarget;
   class PPCTargetLowering : public TargetLowering {
     const PPCSubtarget &Subtarget;
 
   public:
-    explicit PPCTargetLowering(const PPCTargetMachine &TM);
+    explicit PPCTargetLowering(const PPCTargetMachine &TM,
+                               const PPCSubtarget &STI);
 
     /// getTargetNodeName() - This method returns the name of a target specific
     /// DAG node.
@@ -353,6 +408,14 @@ namespace llvm {
 
     MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
 
+    bool isCheapToSpeculateCttz() const override {
+      return true;
+    }
+
+    bool isCheapToSpeculateCtlz() const override {
+      return true;
+    }
+
     /// getSetCCResultType - Return the ISD::SETCC ValueType
     EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
 
@@ -399,8 +462,14 @@ namespace llvm {
     void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
                             SelectionDAG &DAG) const override;
 
+    SDValue expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const;
+
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
+    SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+                          std::vector<SDNode *> *Created) const override;
+
     unsigned getRegisterByName(const char* RegName, EVT VT) const override;
 
     void computeKnownBitsForTargetNode(const SDValue Op,
@@ -409,6 +478,8 @@ namespace llvm {
                                        const SelectionDAG &DAG,
                                        unsigned Depth = 0) const override;
 
+    unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
+
     Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
                                   bool IsStore, bool IsLoad) const override;
     Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
@@ -438,9 +509,10 @@ namespace llvm {
     ConstraintWeight getSingleConstraintMatchWeight(
       AsmOperandInfo &info, const char *constraint) const override;
 
-    std::pair<unsigned, const TargetRegisterClass*>
-      getRegForInlineAsmConstraint(const std::string &Constraint,
-                                   MVT VT) const override;
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 const std::string &Constraint,
+                                 MVT VT) const override;
 
     /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
     /// function arguments in the caller parameter area.  This is the actual
@@ -476,6 +548,10 @@ namespace llvm {
     bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
     bool isTruncateFree(EVT VT1, EVT VT2) const override;
 
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+    bool isFPExtFree(EVT VT) const override;
+
     /// \brief Returns true if it is beneficial to convert a load of a constant
     /// to just the constant itself.
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
@@ -516,6 +592,8 @@ namespace llvm {
     /// expanded to fmul + fadd.
     bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
 
+    const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
     // Should we expand the build vector with shuffles?
     bool
     shouldExpandBuildVectorWithShuffles(EVT VT,
@@ -541,6 +619,29 @@ namespace llvm {
     }
 
   private:
+
+    struct ReuseLoadInfo {
+      SDValue Ptr;
+      SDValue Chain;
+      SDValue ResChain;
+      MachinePointerInfo MPI;
+      bool IsInvariant;
+      unsigned Alignment;
+      AAMDNodes AAInfo;
+      const MDNode *Ranges;
+
+      ReuseLoadInfo() : IsInvariant(false), Alignment(0), Ranges(nullptr) {}
+    };
+
+    bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI,
+                             SelectionDAG &DAG,
+                             ISD::LoadExtType ET = ISD::NON_EXTLOAD) const;
+    void spliceIntoChain(SDValue ResChain, SDValue NewResChain,
+                         SelectionDAG &DAG) const;
+
+    void LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
+                                SelectionDAG &DAG, SDLoc dl) const;
+
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
     SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
 
@@ -563,8 +664,6 @@ namespace llvm {
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
-    std::pair<SDValue,SDValue> lowerTLSCall(SDValue Op, SDLoc dl,
-                                            SelectionDAG &DAG) const;
     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
@@ -593,26 +692,31 @@ namespace llvm {
     SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
 
+    SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
+
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             SDLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const;
     SDValue FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall,
-                       bool isVarArg,
+                       bool isVarArg, bool IsPatchPoint,
                        SelectionDAG &DAG,
                        SmallVector<std::pair<unsigned, SDValue>, 8>
                          &RegsToPass,
-                       SDValue InFlag, SDValue Chain,
+                       SDValue InFlag, SDValue Chain, SDValue CallSeqStart,
                        SDValue &Callee,
                        int SPDiff, unsigned NumBytes,
                        const SmallVectorImpl<ISD::InputArg> &Ins,
-                       SmallVectorImpl<SDValue> &InVals) const;
+                       SmallVectorImpl<SDValue> &InVals,
+                       ImmutableCallSite *CS) const;
 
     SDValue
       LowerFormalArguments(SDValue Chain,
@@ -669,41 +773,46 @@ namespace llvm {
     SDValue
       LowerCall_Darwin(SDValue Chain, SDValue Callee,
                        CallingConv::ID CallConv,
-                       bool isVarArg, bool isTailCall,
+                       bool isVarArg, bool isTailCall, bool IsPatchPoint,
                        const SmallVectorImpl<ISD::OutputArg> &Outs,
                        const SmallVectorImpl<SDValue> &OutVals,
                        const SmallVectorImpl<ISD::InputArg> &Ins,
                        SDLoc dl, SelectionDAG &DAG,
-                       SmallVectorImpl<SDValue> &InVals) const;
+                       SmallVectorImpl<SDValue> &InVals,
+                       ImmutableCallSite *CS) const;
     SDValue
       LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                        CallingConv::ID CallConv,
-                       bool isVarArg, bool isTailCall,
+                       bool isVarArg, bool isTailCall, bool IsPatchPoint,
                        const SmallVectorImpl<ISD::OutputArg> &Outs,
                        const SmallVectorImpl<SDValue> &OutVals,
                        const SmallVectorImpl<ISD::InputArg> &Ins,
                        SDLoc dl, SelectionDAG &DAG,
-                       SmallVectorImpl<SDValue> &InVals) const;
+                       SmallVectorImpl<SDValue> &InVals,
+                       ImmutableCallSite *CS) const;
     SDValue
     LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
-                     bool isVarArg, bool isTailCall,
+                     bool isVarArg, bool isTailCall, bool IsPatchPoint,
                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                      const SmallVectorImpl<SDValue> &OutVals,
                      const SmallVectorImpl<ISD::InputArg> &Ins,
                      SDLoc dl, SelectionDAG &DAG,
-                     SmallVectorImpl<SDValue> &InVals) const;
+                     SmallVectorImpl<SDValue> &InVals,
+                     ImmutableCallSite *CS) const;
 
     SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
 
     SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
                              unsigned &RefinementSteps,
                              bool &UseOneConstNR) const override;
     SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
                              unsigned &RefinementSteps) const override;
+    bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
 
     CCAssignFn *useFastISelCCs(unsigned Flag) const;
   };
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 9a19abb..69c0d7d 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -81,6 +81,9 @@ def HI48_64 : SDNodeXForm<imm, [{
 
 let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
 let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+  let isReturn = 1, Uses = [LR8, RM] in
+    def BLR8 : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
+                            [(retflag)]>, Requires<[In64BitMode]>;
   let isBranch = 1, isIndirectBranch = 1, Uses = [CTR8] in {
     def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
                              []>,
@@ -167,6 +170,17 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
     }
   }
 }
+
+let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
+    Defs = [LR8, X2], Uses = [CTR8, RM], RST = 2 in {
+  def BCTRL8_LDinto_toc :
+    XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs),
+                              (ins memrix:$src),
+                              "bctrl\n\tld 2, $src", IIC_BrB,
+                              [(PPCbctrl_load_toc ixaddr:$src)]>,
+    Requires<[In64BitMode]>;
+}
+
 } // Interpretation64Bit
 
 // FIXME: Duplicating this for the asm parser should be unnecessary, but the
@@ -188,9 +202,6 @@ def : Pat<(PPCcall (i64 texternalsym:$dst)),
 def : Pat<(PPCcall_nop (i64 texternalsym:$dst)),
           (BL8_NOP texternalsym:$dst)>;
 
-def : Pat<(PPCcall_nop_tls texternalsym:$func, tglobaltlsaddr:$sym),
-          (BL8_NOP_TLS texternalsym:$func, tglobaltlsaddr:$sym)>;
-
 // Atomic operations
 let usesCustomInserter = 1 in {
   let Defs = [CR0] in {
@@ -282,7 +293,7 @@ def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
 
 // 64-bit CR instructions
 let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST),
                         "mtocrf $FXM, $ST", IIC_BrMCRX>,
             PPC970_DGroup_First, PPC970_Unit_CRU;
@@ -299,7 +310,7 @@ def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM),
 def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins),
                      "mfcr $rT", IIC_SprMFCR>,
                      PPC970_MicroCode, PPC970_Unit_CRU;
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
 
 let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
   let Defs = [CTR8] in
@@ -366,7 +377,7 @@ def MFLR8  : XFXForm_1_ext<31, 339, 8, (outs g8rc:$rT), (ins),
 
 let PPC970_Unit = 1 in {  // FXU Operations.
 let Interpretation64Bit = 1 in {
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 let isCodeGenOnly = 1 in {
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
@@ -517,7 +528,7 @@ defm MULHDU : XOForm_1r<31, 9, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
 }
 } // Interpretation64Bit
 
-let isCompare = 1, neverHasSideEffects = 1 in {
+let isCompare = 1, hasSideEffects = 0 in {
   def CMPD   : XForm_16_ext<31, 0, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
                             "cmpd $crD, $rA, $rB", IIC_IntCompare>, isPPC64;
   def CMPLD  : XForm_16_ext<31, 32, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
@@ -529,7 +540,7 @@ let isCompare = 1, neverHasSideEffects = 1 in {
                            IIC_IntCompare>, isPPC64;
 }
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 defm SLD  : XForm_6r<31,  27, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
                      "sld", "$rA, $rS, $rB", IIC_IntRotateD,
                      [(set i64:$rA, (PPCshl i64:$rS, i32:$rB))]>, isPPC64;
@@ -540,13 +551,21 @@ defm SRAD : XForm_6rc<31, 794, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
                       "srad", "$rA, $rS, $rB", IIC_IntRotateD,
                       [(set i64:$rA, (PPCsra i64:$rS, i32:$rB))]>, isPPC64;
 
-let Interpretation64Bit = 1, isCodeGenOnly = 1 in { 
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+defm CNTLZW8 : XForm_11r<31,  26, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "cntlzw", "$rA, $rS", IIC_IntGeneral, []>;
+
 defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS),
                         "extsb", "$rA, $rS", IIC_IntSimple,
                         [(set i64:$rA, (sext_inreg i64:$rS, i8))]>;
 defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
                         "extsh", "$rA, $rS", IIC_IntSimple,
                         [(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
+
+defm SLW8  : XForm_6r<31,  24, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                      "slw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
+defm SRW8  : XForm_6r<31, 536, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                      "srw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
 } // Interpretation64Bit
 
 // For fast-isel:
@@ -575,6 +594,11 @@ def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
                        "popcntd $rA, $rS", IIC_IntGeneral,
                        [(set i64:$rA, (ctpop i64:$rS))]>;
 
+let isCodeGenOnly = 1, isCommutable = 1 in
+def CMPB8 : XForm_6<31, 508, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                    "cmpb $rA, $rS, $rB", IIC_IntGeneral,
+                    [(set i64:$rA, (PPCcmpb i64:$rS, i64:$rB))]>;
+
 // popcntw also does a population count on the high 32 bits (storing the
 // results in the high 32-bits of the output). We'll ignore that here (which is
 // safe because we never separately use the high part of the 64-bit registers).
@@ -600,14 +624,12 @@ def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
                        [(set i64:$rD, (mul i64:$rA, imm64SExt16:$imm))]>;
 }
 
-let neverHasSideEffects = 1 in {
-let isCommutable = 1 in {
+let hasSideEffects = 0 in {
 defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA),
                         (ins g8rc:$rSi, g8rc:$rS, u6imm:$SH, u6imm:$MBE),
                         "rldimi", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
                         []>, isPPC64, RegConstraint<"$rSi = $rA">,
                         NoEncode<"$rSi">;
-}
 
 // Rotate instructions.
 defm RLDCL  : MDSForm_1r<30, 8,
@@ -645,7 +667,11 @@ defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA),
                         "rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
                         []>;
 
-let isCommutable = 1 in {
+defm RLWNM8  : MForm_2r<23, (outs g8rc:$rA),
+                        (ins g8rc:$rS, g8rc:$rB, u5imm:$MB, u5imm:$ME),
+                        "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
+                        []>;
+
 // RLWIMI can be commuted if the rotate amount is zero.
 let Interpretation64Bit = 1, isCodeGenOnly = 1 in
 defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
@@ -653,15 +679,14 @@ defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
                         u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
                         IIC_IntRotate, []>, PPC970_DGroup_Cracked,
                         RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
-}
 
 let isSelect = 1 in
 def ISEL8   : AForm_4<31, 15,
                      (outs g8rc:$rT), (ins g8rc_nox0:$rA, g8rc:$rB, crbitrc:$cond),
-                     "isel $rT, $rA, $rB, $cond", IIC_IntGeneral,
+                     "isel $rT, $rA, $rB, $cond", IIC_IntISEL,
                      []>;
 }  // Interpretation64Bit
-}  // neverHasSideEffects = 1
+}  // hasSideEffects = 0
 }  // End FXU Operations.
 
 
@@ -702,7 +727,7 @@ def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
 } // end fast-isel isCodeGenOnly
 
 // Update forms.
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0 in {
 let Interpretation64Bit = 1, isCodeGenOnly = 1 in
 def LHAU8 : DForm_1<43, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
                     (ins memri:$addr),
@@ -750,7 +775,7 @@ def LWZX8 : XForm_1<31,  23, (outs g8rc:$rD), (ins memrr:$src),
                    
                    
 // Update forms.
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0 in {
 def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
                     "lbzu $rD, $addr", IIC_LdStLoadUpd,
                     []>, RegConstraint<"$addr.reg = $ea_result">,
@@ -809,11 +834,6 @@ def LDtocBA: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
                   [(set i64:$rD,
                      (PPCtoc_entry tblockaddress:$disp, i64:$reg))]>, isPPC64;
 
-let hasSideEffects = 1, isCodeGenOnly = 1, RST = 2, Defs = [X2] in
-def LDinto_toc: DSForm_1<58, 0, (outs), (ins memrix:$src),
-                    "ld 2, $src", IIC_LdStLD,
-                    [(PPCload_toc ixaddr:$src)]>, isPPC64;
-
 def LDX  : XForm_1<31,  21, (outs g8rc:$rD), (ins memrr:$src),
                    "ldx $rD, $src", IIC_LdStLD,
                    [(set i64:$rD, (load xaddr:$src))]>, isPPC64;
@@ -821,7 +841,14 @@ def LDBRX : XForm_1<31,  532, (outs g8rc:$rD), (ins memrr:$src),
                    "ldbrx $rD, $src", IIC_LdStLoad,
                    [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
 
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0, isCodeGenOnly = 1 in {
+def LHBRX8 : XForm_1<31, 790, (outs g8rc:$rD), (ins memrr:$src),
+                   "lhbrx $rD, $src", IIC_LdStLoad, []>;
+def LWBRX8 : XForm_1<31,  534, (outs g8rc:$rD), (ins memrr:$src),
+                   "lwbrx $rD, $src", IIC_LdStLoad, []>;
+}
+
+let mayLoad = 1, hasSideEffects = 0 in {
 def LDU  : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memrix:$addr),
                     "ldu $rD, $addr", IIC_LdStLDU,
                     []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
@@ -835,25 +862,16 @@ def LDUX : XForm_1<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
 }
 }
 
-def : Pat<(PPCload ixaddr:$src),
-          (LD ixaddr:$src)>;
-def : Pat<(PPCload xaddr:$src),
-          (LDX xaddr:$src)>;
-
 // Support for medium and large code model.
+let hasSideEffects = 0 in {
 def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
-                       "#ADDIStocHA",
-                       [(set i64:$rD,
-                         (PPCaddisTocHA i64:$reg, tglobaladdr:$disp))]>,
-                       isPPC64;
+                       "#ADDIStocHA", []>, isPPC64;
+let mayLoad = 1 in
 def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
-                   "#LDtocL",
-                   [(set i64:$rD,
-                     (PPCldTocL tglobaladdr:$disp, i64:$reg))]>, isPPC64;
+                   "#LDtocL", []>, isPPC64;
 def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
-                     "#ADDItocL",
-                     [(set i64:$rD,
-                       (PPCaddiTocL i64:$reg, tglobaladdr:$disp))]>, isPPC64;
+                     "#ADDItocL", []>, isPPC64;
+}
 
 // Support for thread-local storage.
 def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
@@ -879,6 +897,28 @@ def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                        [(set i64:$rD,
                          (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
                  isPPC64;
+// LR8 is a true define, while the rest of the Defs are clobbers.  X3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+                        "#GETtlsADDR",
+                        [(set i64:$rD,
+                          (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+                 isPPC64;
+// Combined op for ADDItlsgdL and GETtlsADDR, late expanded.  X3 and LR8
+// are true defines while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7]
+    in
+def ADDItlsgdLADDR : Pseudo<(outs g8rc:$rD),
+                            (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym),
+                            "#ADDItlsgdLADDR",
+                            [(set i64:$rD,
+                              (PPCaddiTlsgdLAddr i64:$reg,
+                                                 tglobaltlsaddr:$disp,
+                                                 tglobaltlsaddr:$sym))]>,
+                     isPPC64;
 def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                          "#ADDIStlsldHA",
                          [(set i64:$rD,
@@ -889,6 +929,28 @@ def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                        [(set i64:$rD,
                          (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
                  isPPC64;
+// LR8 is a true define, while the rest of the Defs are clobbers.  X3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+                          "#GETtlsldADDR",
+                          [(set i64:$rD,
+                            (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+                   isPPC64;
+// Combined op for ADDItlsldL and GETtlsADDR, late expanded.  X3 and LR8
+// are true defines, while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7]
+    in
+def ADDItlsldLADDR : Pseudo<(outs g8rc:$rD),
+                            (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym),
+                            "#ADDItlsldLADDR",
+                            [(set i64:$rD,
+                              (PPCaddiTlsldLAddr i64:$reg,
+                                                 tglobaltlsaddr:$disp,
+                                                 tglobaltlsaddr:$sym))]>,
+                     isPPC64;
 def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                           "#ADDISdtprelHA",
                           [(set i64:$rD,
@@ -1006,7 +1068,7 @@ def : Pat<(pre_store i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
 //
 
 
-let PPC970_Unit = 3, neverHasSideEffects = 1,
+let PPC970_Unit = 3, hasSideEffects = 0,
     Uses = [RM] in {  // FPU Operations.
 defm FCFID  : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB),
                         "fcfid", "$frD, $frB", IIC_FPGeneral,
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 4ef08eb..f6acd6e 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -791,18 +791,27 @@ def : Pat<(store v4i32:$rS, xoaddr:$dst),
 def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>;
 def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>;
 def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VRRC:$src))), (v16i8 VRRC:$src)>;
 
 def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>;
 def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>;
 def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VRRC:$src))), (v8i16 VRRC:$src)>;
 
 def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>;
 def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>;
 def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VRRC:$src))), (v4i32 VRRC:$src)>;
 
 def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>;
 def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>;
 def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VRRC:$src))), (v4f32 VRRC:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v16i8 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VRRC:$src))), (v2i64 VRRC:$src)>;
 
 // Shuffles.
 
@@ -929,3 +938,58 @@ def : Pat<(v4f32 (fnearbyint v4f32:$vA)),
 
 } // end HasAltivec
 
+def HasP8Altivec : Predicate<"PPCSubTarget->hasP8Altivec()">;
+let Predicates = [HasP8Altivec] in {
+
+// Count Leading Zeros
+def VCLZB : VXForm_2<1794, (outs vrrc:$vD), (ins vrrc:$vB),
+                     "vclzb $vD, $vB", IIC_VecGeneral,
+                     [(set v16i8:$vD, (ctlz v16i8:$vB))]>;
+def VCLZH : VXForm_2<1858, (outs vrrc:$vD), (ins vrrc:$vB),
+                     "vclzh $vD, $vB", IIC_VecGeneral,
+                     [(set v8i16:$vD, (ctlz v8i16:$vB))]>;
+def VCLZW : VXForm_2<1922, (outs vrrc:$vD), (ins vrrc:$vB),
+                     "vclzw $vD, $vB", IIC_VecGeneral,
+                     [(set v4i32:$vD, (ctlz v4i32:$vB))]>;
+def VCLZD : VXForm_2<1986, (outs vrrc:$vD), (ins vrrc:$vB),
+                     "vclzd $vD, $vB", IIC_VecGeneral,
+                     [(set v2i64:$vD, (ctlz v2i64:$vB))]>;
+
+// Population Count
+def VPOPCNTB : VXForm_2<1795, (outs vrrc:$vD), (ins vrrc:$vB),
+                        "vpopcntb $vD, $vB", IIC_VecGeneral,
+                        [(set v16i8:$vD, (ctpop v16i8:$vB))]>;
+def VPOPCNTH : VXForm_2<1859, (outs vrrc:$vD), (ins vrrc:$vB),
+                        "vpopcnth $vD, $vB", IIC_VecGeneral,
+                        [(set v8i16:$vD, (ctpop v8i16:$vB))]>;
+def VPOPCNTW : VXForm_2<1923, (outs vrrc:$vD), (ins vrrc:$vB),
+                        "vpopcntw $vD, $vB", IIC_VecGeneral,
+                        [(set v4i32:$vD, (ctpop v4i32:$vB))]>;
+def VPOPCNTD : VXForm_2<1987, (outs vrrc:$vD), (ins vrrc:$vB),
+                        "vpopcntd $vD, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (ctpop v2i64:$vB))]>;
+
+let isCommutable = 1 in {
+// FIXME: Use AddedComplexity > 400 to ensure these patterns match before the 
+//        VSX equivalents. We need to fix this up at some point. Two possible
+//        solutions for this problem:
+//        1. Disable Altivec patterns that compete with VSX patterns using the
+//           !HasVSX predicate. This essentially favours VSX over Altivec, in 
+//           hopes of reducing register pressure (larger register set using VSX 
+//           instructions than VMX instructions)
+//        2. Employ a more disciplined use of AddedComplexity, which would provide
+//           more fine-grained control than option 1. This would be beneficial
+//           if we find situations where Altivec is really preferred over VSX. 
+def VEQV  : VXForm_1<1668, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                     "veqv $vD, $vA, $vB", IIC_VecGeneral,
+                     [(set v4i32:$vD, (vnot_ppc (xor v4i32:$vA, v4i32:$vB)))]>;
+def VNAND : VXForm_1<1412, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                     "vnand $vD, $vA, $vB", IIC_VecGeneral,
+                     [(set v4i32:$vD, (vnot_ppc (and v4i32:$vA, v4i32:$vB)))]>;
+} // isCommutable
+
+def VORC : VXForm_1<1348, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vorc $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v4i32:$vD, (or v4i32:$vA,
+                                           (vnot_ppc v4i32:$vB)))]>;
+} // end HasP8Altivec
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index aa68497..506a2d0 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -385,6 +385,12 @@ class XForm_tlb<bits<10> xo, dag OOL, dag IOL, string asmstr,
   let RST = 0;
 }
 
+class XForm_attn<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  let Inst{21-30} = xo;
+}
+
 // This is the same as XForm_base_r3xo, but the first two operands are swapped
 // when code is emitted.
 class XForm_base_r3xo_swapped
@@ -556,6 +562,47 @@ class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31}    = 0;
 }
 
+// Used for QPX
+class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRB;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_19<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_18<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRA = 0;
+}
+
+class XForm_20<bits<6> opcode, bits<6> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRB;
+  bits<4> tttt;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-24} = tttt;
+  let Inst{25-30} = xo;
+  let Inst{31}    = 0;
+}
+
 class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
                InstrItinClass itin, list<dag> pattern> 
   : I<opcode, OOL, IOL, asmstr, itin> {
@@ -939,6 +986,64 @@ class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31}    = 0;
 }
 
+class XLForm_4<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bit W;
+  bits<4> U;
+  
+  bit RC = 0;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-14} = 0;
+  let Inst{15}    = W;
+  let Inst{16-19} = U;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class XLForm_2_and_DSForm_1<bits<6> opcode1, bits<10> xo1, bit lk,
+                            bits<6> opcode2, bits<2> xo2,
+                            dag OOL, dag IOL, string asmstr,
+                            InstrItinClass itin, list<dag> pattern>
+        : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> {
+  bits<5> BO;
+  bits<5> BI;
+  bits<2> BH;
+
+  bits<5>  RST;
+  bits<19> DS_RA;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = BO;
+  let Inst{11-15} = BI;
+  let Inst{16-18} = 0;
+  let Inst{19-20} = BH;
+  let Inst{21-30} = xo1;
+  let Inst{31}    = lk;
+
+  let Inst{38-42} = RST;
+  let Inst{43-47} = DS_RA{18-14};  // Register #
+  let Inst{48-61} = DS_RA{13-0};   // Displacement.
+  let Inst{62-63} = xo2;
+}
+
+class XLForm_2_ext_and_DSForm_1<bits<6> opcode1, bits<10> xo1,
+                                bits<5> bo, bits<5> bi, bit lk,
+                                bits<6> opcode2, bits<2> xo2,
+                                dag OOL, dag IOL, string asmstr,
+                                InstrItinClass itin, list<dag> pattern>
+  : XLForm_2_and_DSForm_1<opcode1, xo1, lk, opcode2, xo2,
+                          OOL, IOL, asmstr, itin, pattern> {
+  let BO = bo;
+  let BI = bi;
+  let BH = 0;
+}
+
 // 1.7.8 XFX-Form
 class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
                 InstrItinClass itin>
@@ -1036,6 +1141,25 @@ class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31}    = RC;
 }
 
+class XFLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag>pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bit L;
+  bits<8> FLM;
+  bit W;
+  bits<5> FRB;
+
+  bit RC = 0;    // set by isDOT
+  let Pattern = pattern;
+
+  let Inst{6}     = L;
+  let Inst{7-14}  = FLM;
+  let Inst{15}    = W;
+  let Inst{16-20} = FRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
 // 1.7.10 XS-Form - SRADI.
 class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
                InstrItinClass itin, list<dag> pattern>
@@ -1132,6 +1256,14 @@ class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31}    = 0;
 }
 
+// Used for QPX
+class AForm_4a<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRA = 0;
+  let FRC = 0;
+}
+
 // 1.7.13 M-Form
 class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
               InstrItinClass itin, list<dag> pattern>
@@ -1356,6 +1488,49 @@ class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Inst{22-31} = xo;
 }
 
+// Z23-Form (used by QPX)
+class Z23Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRB;
+  bits<2> idx;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-22} = idx;
+  let Inst{23-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class Z23Form_2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : Z23Form_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRB = 0;
+}
+
+class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<12> idx;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FRT;
+  let Inst{11-22} = idx;
+  let Inst{23-30} = xo;
+  let Inst{31}    = RC;
+}
+
 //===----------------------------------------------------------------------===//
 class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
     : I<0, OOL, IOL, asmstr, NoItinerary> {
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index daf8790..fe9474a 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -51,9 +52,6 @@ opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
 static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt",
 cl::desc("Disable compare instruction optimization"), cl::Hidden);
 
-static cl::opt<bool> DisableVSXFMAMutate("disable-ppc-vsx-fma-mutation",
-cl::desc("Disable VSX FMA instruction mutation"), cl::Hidden);
-
 static cl::opt<bool> VSXSelfCopyCrash("crash-on-ppc-vsx-self-copy",
 cl::desc("Causes the backend to crash instead of generating a nop VSX copy"),
 cl::Hidden);
@@ -84,11 +82,11 @@ PPCInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
 
 /// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer
 /// to use for this target when scheduling the DAG.
-ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
-  const InstrItineraryData *II,
-  const ScheduleDAG *DAG) const {
+ScheduleHazardRecognizer *
+PPCInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                                 const ScheduleDAG *DAG) const {
   unsigned Directive =
-      DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+      DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
 
   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8)
     return new PPCDispatchGroupSBHazardRecognizer(II, DAG);
@@ -183,6 +181,9 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
   case PPC::RESTORE_CRBIT:
   case PPC::LVX:
   case PPC::LXVD2X:
+  case PPC::QVLFDX:
+  case PPC::QVLFSXs:
+  case PPC::QVLFDXb:
   case PPC::RESTORE_VRSAVE:
     // Check for the operands added by addFrameReference (the immediate is the
     // offset which defaults to 0).
@@ -209,6 +210,9 @@ unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
   case PPC::SPILL_CRBIT:
   case PPC::STVX:
   case PPC::STXVD2X:
+  case PPC::QVSTFDX:
+  case PPC::QVSTFSXs:
+  case PPC::QVSTFDXb:
   case PPC::SPILL_VRSAVE:
     // Check for the operands added by addFrameReference (the immediate is the
     // offset which defaults to 0).
@@ -230,10 +234,12 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
 
   // Normal instructions can be commuted the obvious way.
   if (MI->getOpcode() != PPC::RLWIMI &&
-      MI->getOpcode() != PPC::RLWIMIo &&
-      MI->getOpcode() != PPC::RLWIMI8 &&
-      MI->getOpcode() != PPC::RLWIMI8o)
+      MI->getOpcode() != PPC::RLWIMIo)
     return TargetInstrInfo::commuteInstruction(MI, NewMI);
+  // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
+  // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
+  // changing the relative order of the mask operands might change what happens
+  // to the high-bits of the mask (and, thus, the result).
 
   // Cannot commute if it has a non-zero rotate count.
   if (MI->getOperand(3).getImm() != 0)
@@ -699,7 +705,7 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   // legalization. Promote them here.
   const TargetRegisterInfo *TRI = &getRegisterInfo();
   if (PPC::F8RCRegClass.contains(DestReg) &&
-      PPC::VSLRCRegClass.contains(SrcReg)) {
+      PPC::VSRCRegClass.contains(SrcReg)) {
     unsigned SuperReg =
       TRI->getMatchingSuperReg(DestReg, PPC::sub_64, &PPC::VSRCRegClass);
 
@@ -708,7 +714,7 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
     DestReg = SuperReg;
   } else if (PPC::VRRCRegClass.contains(DestReg) &&
-             PPC::VSHRCRegClass.contains(SrcReg)) {
+             PPC::VSRCRegClass.contains(SrcReg)) {
     unsigned SuperReg =
       TRI->getMatchingSuperReg(DestReg, PPC::sub_128, &PPC::VSRCRegClass);
 
@@ -717,7 +723,7 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
     DestReg = SuperReg;
   } else if (PPC::F8RCRegClass.contains(SrcReg) &&
-             PPC::VSLRCRegClass.contains(DestReg)) {
+             PPC::VSRCRegClass.contains(DestReg)) {
     unsigned SuperReg =
       TRI->getMatchingSuperReg(SrcReg, PPC::sub_64, &PPC::VSRCRegClass);
 
@@ -726,7 +732,7 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
     SrcReg = SuperReg;
   } else if (PPC::VRRCRegClass.contains(SrcReg) &&
-             PPC::VSHRCRegClass.contains(DestReg)) {
+             PPC::VSRCRegClass.contains(DestReg)) {
     unsigned SuperReg =
       TRI->getMatchingSuperReg(SrcReg, PPC::sub_128, &PPC::VSRCRegClass);
 
@@ -759,6 +765,12 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     Opc = PPC::XXLOR;
   else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg))
     Opc = PPC::XXLORf;
+  else if (PPC::QFRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMR;
+  else if (PPC::QSRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMRs;
+  else if (PPC::QBRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMRb;
   else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
     Opc = PPC::CROR;
   else
@@ -844,6 +856,24 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
                                                getKillRegState(isKill)),
                                        FrameIdx));
     SpillsVRS = true;
+  } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFDX))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFSXs))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFDXb))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
   } else {
     llvm_unreachable("Unknown regclass!");
   }
@@ -939,6 +969,18 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
                                                DestReg),
                                        FrameIdx));
     SpillsVRS = true;
+  } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDX), DestReg),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFSXs), DestReg),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDXb), DestReg),
+                                       FrameIdx));
+    NonRI = true;
   } else {
     llvm_unreachable("Unknown regclass!");
   }
@@ -1111,7 +1153,7 @@ bool PPCInstrInfo::PredicateInstruction(
                      MachineInstr *MI,
                      const SmallVectorImpl<MachineOperand> &Pred) const {
   unsigned OpC = MI->getOpcode();
-  if (OpC == PPC::BLR) {
+  if (OpC == PPC::BLR || OpC == PPC::BLR8) {
     if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
       bool isPPC64 = Subtarget.isPPC64();
       MI->setDesc(get(Pred[0].getImm() ?
@@ -1275,6 +1317,7 @@ bool PPCInstrInfo::isPredicable(MachineInstr *MI) const {
     return false;
   case PPC::B:
   case PPC::BLR:
+  case PPC::BLR8:
   case PPC::BCTR:
   case PPC::BCTR8:
   case PPC::BCTRL:
@@ -1593,677 +1636,14 @@ unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
     const MachineFunction *MF = MI->getParent()->getParent();
     const char *AsmStr = MI->getOperand(0).getSymbolName();
     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  } else if (Opcode == TargetOpcode::STACKMAP) {
+    return MI->getOperand(1).getImm();
+  } else if (Opcode == TargetOpcode::PATCHPOINT) {
+    PatchPointOpers Opers(MI);
+    return Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
   } else {
     const MCInstrDesc &Desc = get(Opcode);
     return Desc.getSize();
   }
 }
 
-#undef DEBUG_TYPE
-#define DEBUG_TYPE "ppc-vsx-fma-mutate"
-
-namespace {
-  // PPCVSXFMAMutate pass - For copies between VSX registers and non-VSX registers
-  // (Altivec and scalar floating-point registers), we need to transform the
-  // copies into subregister copies with other restrictions.
-  struct PPCVSXFMAMutate : public MachineFunctionPass {
-    static char ID;
-    PPCVSXFMAMutate() : MachineFunctionPass(ID) {
-      initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
-    }
-
-    LiveIntervals *LIS;
-
-    const PPCTargetMachine *TM;
-    const PPCInstrInfo *TII;
-
-protected:
-    bool processBlock(MachineBasicBlock &MBB) {
-      bool Changed = false;
-
-      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-      const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
-      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
-           I != IE; ++I) {
-        MachineInstr *MI = I;
-
-        // The default (A-type) VSX FMA form kills the addend (it is taken from
-        // the target register, which is then updated to reflect the result of
-        // the FMA). If the instruction, however, kills one of the registers
-        // used for the product, then we can use the M-form instruction (which
-        // will take that value from the to-be-defined register).
-
-        int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
-        if (AltOpc == -1)
-          continue;
-
-        // This pass is run after register coalescing, and so we're looking for
-        // a situation like this:
-        //   ...
-        //   %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
-        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
-        //                         %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
-        //   ...
-        //   %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
-        //                         %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
-        //   ...
-        // Where we can eliminate the copy by changing from the A-type to the
-        // M-type instruction. Specifically, for this example, this means:
-        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
-        //                         %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
-        // is replaced by:
-        //   %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg18, %vreg9,
-        //                         %RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
-        // and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
-
-        SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
-
-        VNInfo *AddendValNo =
-          LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn();
-        MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def);
-
-        // The addend and this instruction must be in the same block.
-
-        if (!AddendMI || AddendMI->getParent() != MI->getParent())
-          continue;
-
-        // The addend must be a full copy within the same register class.
-
-        if (!AddendMI->isFullCopy())
-          continue;
-
-        unsigned AddendSrcReg = AddendMI->getOperand(1).getReg();
-        if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) {
-          if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) !=
-              MRI.getRegClass(AddendSrcReg))
-            continue;
-        } else {
-          // If AddendSrcReg is a physical register, make sure the destination
-          // register class contains it.
-          if (!MRI.getRegClass(AddendMI->getOperand(0).getReg())
-                ->contains(AddendSrcReg))
-            continue;
-        }
-
-        // In theory, there could be other uses of the addend copy before this
-        // fma.  We could deal with this, but that would require additional
-        // logic below and I suspect it will not occur in any relevant
-        // situations.  Additionally, check whether the copy source is killed
-        // prior to the fma.  In order to replace the addend here with the
-        // source of the copy, it must still be live here.  We can't use
-        // interval testing for a physical register, so as long as we're
-        // walking the MIs we may as well test liveness here.
-        bool OtherUsers = false, KillsAddendSrc = false;
-        for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
-             J != JE; --J) {
-          if (J->readsVirtualRegister(AddendMI->getOperand(0).getReg())) {
-            OtherUsers = true;
-            break;
-          }
-          if (J->modifiesRegister(AddendSrcReg, TRI) ||
-              J->killsRegister(AddendSrcReg, TRI)) {
-            KillsAddendSrc = true;
-            break;
-          }
-        }
-
-        if (OtherUsers || KillsAddendSrc)
-          continue;
-
-        // Find one of the product operands that is killed by this instruction.
-
-        unsigned KilledProdOp = 0, OtherProdOp = 0;
-        if (LIS->getInterval(MI->getOperand(2).getReg())
-                     .Query(FMAIdx).isKill()) {
-          KilledProdOp = 2;
-          OtherProdOp  = 3;
-        } else if (LIS->getInterval(MI->getOperand(3).getReg())
-                     .Query(FMAIdx).isKill()) {
-          KilledProdOp = 3;
-          OtherProdOp  = 2;
-        }
-
-        // If there are no killed product operands, then this transformation is
-        // likely not profitable.
-        if (!KilledProdOp)
-          continue;
-
-        // For virtual registers, verify that the addend source register
-        // is live here (as should have been assured above).
-        assert((!TargetRegisterInfo::isVirtualRegister(AddendSrcReg) ||
-                LIS->getInterval(AddendSrcReg).liveAt(FMAIdx)) &&
-               "Addend source register is not live!");
-
-        // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3.
-
-        unsigned AddReg = AddendMI->getOperand(1).getReg();
-        unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg();
-        unsigned OtherProdReg  = MI->getOperand(OtherProdOp).getReg();
-
-        unsigned AddSubReg = AddendMI->getOperand(1).getSubReg();
-        unsigned KilledProdSubReg = MI->getOperand(KilledProdOp).getSubReg();
-        unsigned OtherProdSubReg  = MI->getOperand(OtherProdOp).getSubReg();
-
-        bool AddRegKill = AddendMI->getOperand(1).isKill();
-        bool KilledProdRegKill = MI->getOperand(KilledProdOp).isKill();
-        bool OtherProdRegKill  = MI->getOperand(OtherProdOp).isKill();
-
-        bool AddRegUndef = AddendMI->getOperand(1).isUndef();
-        bool KilledProdRegUndef = MI->getOperand(KilledProdOp).isUndef();
-        bool OtherProdRegUndef  = MI->getOperand(OtherProdOp).isUndef();
-
-        unsigned OldFMAReg = MI->getOperand(0).getReg();
-
-        // The transformation doesn't work well with things like:
-        //    %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
-        // so leave such things alone.
-        if (OldFMAReg == KilledProdReg)
-          continue;
-
-        assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
-               "Addend copy not tied to old FMA output!");
-
-        DEBUG(dbgs() << "VSX FMA Mutation:\n    " << *MI;);
-
-        MI->getOperand(0).setReg(KilledProdReg);
-        MI->getOperand(1).setReg(KilledProdReg);
-        MI->getOperand(3).setReg(AddReg);
-        MI->getOperand(2).setReg(OtherProdReg);
-
-        MI->getOperand(0).setSubReg(KilledProdSubReg);
-        MI->getOperand(1).setSubReg(KilledProdSubReg);
-        MI->getOperand(3).setSubReg(AddSubReg);
-        MI->getOperand(2).setSubReg(OtherProdSubReg);
-
-        MI->getOperand(1).setIsKill(KilledProdRegKill);
-        MI->getOperand(3).setIsKill(AddRegKill);
-        MI->getOperand(2).setIsKill(OtherProdRegKill);
-
-        MI->getOperand(1).setIsUndef(KilledProdRegUndef);
-        MI->getOperand(3).setIsUndef(AddRegUndef);
-        MI->getOperand(2).setIsUndef(OtherProdRegUndef);
-
-        MI->setDesc(TII->get(AltOpc));
-
-        DEBUG(dbgs() << " -> " << *MI);
-
-        // The killed product operand was killed here, so we can reuse it now
-        // for the result of the fma.
-
-        LiveInterval &FMAInt = LIS->getInterval(OldFMAReg);
-        VNInfo *FMAValNo = FMAInt.getVNInfoAt(FMAIdx.getRegSlot());
-        for (auto UI = MRI.reg_nodbg_begin(OldFMAReg), UE = MRI.reg_nodbg_end();
-             UI != UE;) {
-          MachineOperand &UseMO = *UI;
-          MachineInstr *UseMI = UseMO.getParent();
-          ++UI;
-
-          // Don't replace the result register of the copy we're about to erase.
-          if (UseMI == AddendMI)
-            continue;
-
-          UseMO.setReg(KilledProdReg);
-          UseMO.setSubReg(KilledProdSubReg);
-        }
-
-        // Extend the live intervals of the killed product operand to hold the
-        // fma result.
-
-        LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg);
-        for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end();
-             AI != AE; ++AI) {
-          // Don't add the segment that corresponds to the original copy.
-          if (AI->valno == AddendValNo)
-            continue;
-
-          VNInfo *NewFMAValNo =
-            NewFMAInt.getNextValue(AI->start,
-                                   LIS->getVNInfoAllocator());
-
-          NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end,
-                                                     NewFMAValNo));
-        }
-        DEBUG(dbgs() << "  extended: " << NewFMAInt << '\n');
-
-        FMAInt.removeValNo(FMAValNo);
-        DEBUG(dbgs() << "  trimmed:  " << FMAInt << '\n');
-
-        // Remove the (now unused) copy.
-
-        DEBUG(dbgs() << "  removing: " << *AddendMI << '\n');
-        LIS->RemoveMachineInstrFromMaps(AddendMI);
-        AddendMI->eraseFromParent();
-
-        Changed = true;
-      }
-
-      return Changed;
-    }
-
-public:
-    bool runOnMachineFunction(MachineFunction &MF) override {
-      TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
-      // If we don't have VSX then go ahead and return without doing
-      // anything.
-      if (!TM->getSubtargetImpl()->hasVSX())
-        return false;
-
-      LIS = &getAnalysis<LiveIntervals>();
-
-      TII = TM->getSubtargetImpl()->getInstrInfo();
-
-      bool Changed = false;
-
-      if (DisableVSXFMAMutate)
-        return Changed;
-
-      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
-        MachineBasicBlock &B = *I++;
-        if (processBlock(B))
-          Changed = true;
-      }
-
-      return Changed;
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<LiveIntervals>();
-      AU.addPreserved<LiveIntervals>();
-      AU.addRequired<SlotIndexes>();
-      AU.addPreserved<SlotIndexes>();
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-  };
-}
-
-INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE,
-                      "PowerPC VSX FMA Mutation", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
-INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE,
-                    "PowerPC VSX FMA Mutation", false, false)
-
-char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID;
-
-char PPCVSXFMAMutate::ID = 0;
-FunctionPass*
-llvm::createPPCVSXFMAMutatePass() { return new PPCVSXFMAMutate(); }
-
-#undef DEBUG_TYPE
-#define DEBUG_TYPE "ppc-vsx-copy"
-
-namespace llvm {
-  void initializePPCVSXCopyPass(PassRegistry&);
-}
-
-namespace {
-  // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers
-  // (Altivec and scalar floating-point registers), we need to transform the
-  // copies into subregister copies with other restrictions.
-  struct PPCVSXCopy : public MachineFunctionPass {
-    static char ID;
-    PPCVSXCopy() : MachineFunctionPass(ID) {
-      initializePPCVSXCopyPass(*PassRegistry::getPassRegistry());
-    }
-
-    const PPCTargetMachine *TM;
-    const PPCInstrInfo *TII;
-
-    bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC,
-                      MachineRegisterInfo &MRI) {
-      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-        return RC->hasSubClassEq(MRI.getRegClass(Reg));
-      } else if (RC->contains(Reg)) {
-        return true;
-      }
-
-      return false;
-    }
-
-    bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) {
-      return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI);
-    }
-
-    bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) {
-      return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI);
-    }
-
-    bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) {
-      return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
-    }
-
-protected:
-    bool processBlock(MachineBasicBlock &MBB) {
-      bool Changed = false;
-
-      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
-           I != IE; ++I) {
-        MachineInstr *MI = I;
-        if (!MI->isFullCopy())
-          continue;
-
-        MachineOperand &DstMO = MI->getOperand(0);
-        MachineOperand &SrcMO = MI->getOperand(1);
-
-        if ( IsVSReg(DstMO.getReg(), MRI) &&
-            !IsVSReg(SrcMO.getReg(), MRI)) {
-          // This is a copy *to* a VSX register from a non-VSX register.
-          Changed = true;
-
-          const TargetRegisterClass *SrcRC =
-            IsVRReg(SrcMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
-                                           &PPC::VSLRCRegClass;
-          assert((IsF8Reg(SrcMO.getReg(), MRI) ||
-                  IsVRReg(SrcMO.getReg(), MRI)) &&
-                 "Unknown source for a VSX copy");
-
-          unsigned NewVReg = MRI.createVirtualRegister(SrcRC);
-          BuildMI(MBB, MI, MI->getDebugLoc(),
-                  TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
-            .addImm(1) // add 1, not 0, because there is no implicit clearing
-                       // of the high bits.
-            .addOperand(SrcMO)
-            .addImm(IsVRReg(SrcMO.getReg(), MRI) ? PPC::sub_128 :
-                                                   PPC::sub_64);
-
-          // The source of the original copy is now the new virtual register.
-          SrcMO.setReg(NewVReg);
-        } else if (!IsVSReg(DstMO.getReg(), MRI) &&
-                    IsVSReg(SrcMO.getReg(), MRI)) {
-          // This is a copy *from* a VSX register to a non-VSX register.
-          Changed = true;
-
-          const TargetRegisterClass *DstRC =
-            IsVRReg(DstMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
-                                           &PPC::VSLRCRegClass;
-          assert((IsF8Reg(DstMO.getReg(), MRI) ||
-                  IsVRReg(DstMO.getReg(), MRI)) &&
-                 "Unknown destination for a VSX copy");
-
-          // Copy the VSX value into a new VSX register of the correct subclass.
-          unsigned NewVReg = MRI.createVirtualRegister(DstRC);
-          BuildMI(MBB, MI, MI->getDebugLoc(),
-                  TII->get(TargetOpcode::COPY), NewVReg)
-            .addOperand(SrcMO);
-
-          // Transform the original copy into a subregister extraction copy.
-          SrcMO.setReg(NewVReg);
-          SrcMO.setSubReg(IsVRReg(DstMO.getReg(), MRI) ? PPC::sub_128 :
-                                                         PPC::sub_64);
-        }
-      }
-
-      return Changed;
-    }
-
-public:
-    bool runOnMachineFunction(MachineFunction &MF) override {
-      TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
-      // If we don't have VSX on the subtarget, don't do anything.
-      if (!TM->getSubtargetImpl()->hasVSX())
-        return false;
-      TII = TM->getSubtargetImpl()->getInstrInfo();
-
-      bool Changed = false;
-
-      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
-        MachineBasicBlock &B = *I++;
-        if (processBlock(B))
-          Changed = true;
-      }
-
-      return Changed;
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-  };
-}
-
-INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE,
-                "PowerPC VSX Copy Legalization", false, false)
-
-char PPCVSXCopy::ID = 0;
-FunctionPass*
-llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); }
-
-#undef DEBUG_TYPE
-#define DEBUG_TYPE "ppc-vsx-copy-cleanup"
-
-namespace llvm {
-  void initializePPCVSXCopyCleanupPass(PassRegistry&);
-}
-
-namespace {
-  // PPCVSXCopyCleanup pass - We sometimes end up generating self copies of VSX
-  // registers (mostly because the ABI code still places all values into the
-  // "traditional" floating-point and vector registers). Remove them here.
-  struct PPCVSXCopyCleanup : public MachineFunctionPass {
-    static char ID;
-    PPCVSXCopyCleanup() : MachineFunctionPass(ID) {
-      initializePPCVSXCopyCleanupPass(*PassRegistry::getPassRegistry());
-    }
-
-    const PPCTargetMachine *TM;
-    const PPCInstrInfo *TII;
-
-protected:
-    bool processBlock(MachineBasicBlock &MBB) {
-      bool Changed = false;
-
-      SmallVector<MachineInstr *, 4> ToDelete;
-      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
-           I != IE; ++I) {
-        MachineInstr *MI = I;
-        if (MI->getOpcode() == PPC::XXLOR &&
-            MI->getOperand(0).getReg() == MI->getOperand(1).getReg() &&
-            MI->getOperand(0).getReg() == MI->getOperand(2).getReg())
-          ToDelete.push_back(MI);
-      }
-
-      if (!ToDelete.empty())
-        Changed = true;
-
-      for (unsigned i = 0, ie = ToDelete.size(); i != ie; ++i) {
-        DEBUG(dbgs() << "Removing VSX self-copy: " << *ToDelete[i]);
-        ToDelete[i]->eraseFromParent();
-      }
-
-      return Changed;
-    }
-
-public:
-    bool runOnMachineFunction(MachineFunction &MF) override {
-      TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
-      // If we don't have VSX don't bother doing anything here.
-      if (!TM->getSubtargetImpl()->hasVSX())
-        return false;
-      TII = TM->getSubtargetImpl()->getInstrInfo();
-
-      bool Changed = false;
-
-      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
-        MachineBasicBlock &B = *I++;
-        if (processBlock(B))
-          Changed = true;
-      }
-
-      return Changed;
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-  };
-}
-
-INITIALIZE_PASS(PPCVSXCopyCleanup, DEBUG_TYPE,
-                "PowerPC VSX Copy Cleanup", false, false)
-
-char PPCVSXCopyCleanup::ID = 0;
-FunctionPass*
-llvm::createPPCVSXCopyCleanupPass() { return new PPCVSXCopyCleanup(); }
-
-#undef DEBUG_TYPE
-#define DEBUG_TYPE "ppc-early-ret"
-STATISTIC(NumBCLR, "Number of early conditional returns");
-STATISTIC(NumBLR,  "Number of early returns");
-
-namespace llvm {
-  void initializePPCEarlyReturnPass(PassRegistry&);
-}
-
-namespace {
-  // PPCEarlyReturn pass - For simple functions without epilogue code, move
-  // returns up, and create conditional returns, to avoid unnecessary
-  // branch-to-blr sequences.
-  struct PPCEarlyReturn : public MachineFunctionPass {
-    static char ID;
-    PPCEarlyReturn() : MachineFunctionPass(ID) {
-      initializePPCEarlyReturnPass(*PassRegistry::getPassRegistry());
-    }
-
-    const PPCTargetMachine *TM;
-    const PPCInstrInfo *TII;
-
-protected:
-    bool processBlock(MachineBasicBlock &ReturnMBB) {
-      bool Changed = false;
-
-      MachineBasicBlock::iterator I = ReturnMBB.begin();
-      I = ReturnMBB.SkipPHIsAndLabels(I);
-
-      // The block must be essentially empty except for the blr.
-      if (I == ReturnMBB.end() || I->getOpcode() != PPC::BLR ||
-          I != ReturnMBB.getLastNonDebugInstr())
-        return Changed;
-
-      SmallVector<MachineBasicBlock*, 8> PredToRemove;
-      for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(),
-           PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) {
-        bool OtherReference = false, BlockChanged = false;
-        for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) {
-          if (J->getOpcode() == PPC::B) {
-            if (J->getOperand(0).getMBB() == &ReturnMBB) {
-              // This is an unconditional branch to the return. Replace the
-              // branch with a blr.
-              BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BLR));
-              MachineBasicBlock::iterator K = J--;
-              K->eraseFromParent();
-              BlockChanged = true;
-              ++NumBLR;
-              continue;
-            }
-          } else if (J->getOpcode() == PPC::BCC) {
-            if (J->getOperand(2).getMBB() == &ReturnMBB) {
-              // This is a conditional branch to the return. Replace the branch
-              // with a bclr.
-              BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
-                .addImm(J->getOperand(0).getImm())
-                .addReg(J->getOperand(1).getReg());
-              MachineBasicBlock::iterator K = J--;
-              K->eraseFromParent();
-              BlockChanged = true;
-              ++NumBCLR;
-              continue;
-            }
-          } else if (J->getOpcode() == PPC::BC || J->getOpcode() == PPC::BCn) {
-            if (J->getOperand(1).getMBB() == &ReturnMBB) {
-              // This is a conditional branch to the return. Replace the branch
-              // with a bclr.
-              BuildMI(**PI, J, J->getDebugLoc(),
-                      TII->get(J->getOpcode() == PPC::BC ?
-                               PPC::BCLR : PPC::BCLRn))
-                .addReg(J->getOperand(0).getReg());
-              MachineBasicBlock::iterator K = J--;
-              K->eraseFromParent();
-              BlockChanged = true;
-              ++NumBCLR;
-              continue;
-            }
-          } else if (J->isBranch()) {
-            if (J->isIndirectBranch()) {
-              if (ReturnMBB.hasAddressTaken())
-                OtherReference = true;
-            } else
-              for (unsigned i = 0; i < J->getNumOperands(); ++i)
-                if (J->getOperand(i).isMBB() &&
-                    J->getOperand(i).getMBB() == &ReturnMBB)
-                  OtherReference = true;
-          } else if (!J->isTerminator() && !J->isDebugValue())
-            break;
-
-          if (J == (*PI)->begin())
-            break;
-
-          --J;
-        }
-
-        if ((*PI)->canFallThrough() && (*PI)->isLayoutSuccessor(&ReturnMBB))
-          OtherReference = true;
-
-        // Predecessors are stored in a vector and can't be removed here.
-        if (!OtherReference && BlockChanged) {
-          PredToRemove.push_back(*PI);
-        }
-
-        if (BlockChanged)
-          Changed = true;
-      }
-
-      for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i)
-        PredToRemove[i]->removeSuccessor(&ReturnMBB);
-
-      if (Changed && !ReturnMBB.hasAddressTaken()) {
-        // We now might be able to merge this blr-only block into its
-        // by-layout predecessor.
-        if (ReturnMBB.pred_size() == 1 &&
-            (*ReturnMBB.pred_begin())->isLayoutSuccessor(&ReturnMBB)) {
-          // Move the blr into the preceding block.
-          MachineBasicBlock &PrevMBB = **ReturnMBB.pred_begin();
-          PrevMBB.splice(PrevMBB.end(), &ReturnMBB, I);
-          PrevMBB.removeSuccessor(&ReturnMBB);
-        }
-
-        if (ReturnMBB.pred_empty())
-          ReturnMBB.eraseFromParent();
-      }
-
-      return Changed;
-    }
-
-public:
-    bool runOnMachineFunction(MachineFunction &MF) override {
-      TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
-      TII = TM->getSubtargetImpl()->getInstrInfo();
-
-      bool Changed = false;
-
-      // If the function does not have at least two blocks, then there is
-      // nothing to do.
-      if (MF.size() < 2)
-        return Changed;
-
-      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
-        MachineBasicBlock &B = *I++;
-        if (processBlock(B))
-          Changed = true;
-      }
-
-      return Changed;
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-  };
-}
-
-INITIALIZE_PASS(PPCEarlyReturn, DEBUG_TYPE,
-                "PowerPC Early-Return Creation", false, false)
-
-char PPCEarlyReturn::ID = 0;
-FunctionPass*
-llvm::createPPCEarlyReturnPass() { return new PPCEarlyReturn(); }
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 4d310fe..4add6f9 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -106,6 +106,15 @@ public:
                                               UseNode, UseIdx);
   }
 
+  bool hasLowDefLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr *DefMI,
+                        unsigned DefIdx) const override {
+    // Machine LICM should hoist all instructions in low-register-pressure
+    // situations; none are sufficiently free to justify leaving in a loop
+    // body.
+    return false;
+  }
+
   bool isCoalescableExtInstr(const MachineInstr &MI,
                              unsigned &SrcReg, unsigned &DstReg,
                              unsigned &SubIdx) const override;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 8c76c46..1a045b1 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -61,6 +61,27 @@ def tocentry32 : Operand<iPTR> {
   let MIOperandInfo = (ops i32imm:$imm);
 }
 
+def SDT_PPCqvfperm   : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVec<3>
+]>;
+def SDT_PPCqvgpci   : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisInt<1>
+]>;
+def SDT_PPCqvaligni   : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>
+]>;
+def SDT_PPCqvesplati   : SDTypeProfile<1, 2, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>
+]>;
+
+def SDT_PPCqbflt : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisVec<1>
+]>;
+
+def SDT_PPCqvlfsb : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisPtrTy<1>
+]>;
+
 //===----------------------------------------------------------------------===//
 // PowerPC specific DAG Nodes.
 //
@@ -98,7 +119,8 @@ def PPCfsel   : SDNode<"PPCISD::FSEL",
 
 def PPChi       : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
 def PPClo       : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
-def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad]>;
+def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp,
+                         [SDNPMayLoad, SDNPMemOperand]>;
 def PPCvmaddfp  : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
 def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;
 
@@ -110,14 +132,35 @@ def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp,
 def PPCaddTls     : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>;
 def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>;
 def PPCaddiTlsgdL   : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>;
+def PPCgetTlsAddr   : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>;
+def PPCaddiTlsgdLAddr : SDNode<"PPCISD::ADDI_TLSGD_L_ADDR",
+                               SDTypeProfile<1, 3, [
+                                 SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                 SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>;
 def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>;
 def PPCaddiTlsldL   : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>;
-def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp,
-                              [SDNPHasChain]>;
+def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>;
+def PPCaddiTlsldLAddr : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR",
+                               SDTypeProfile<1, 3, [
+                                 SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                 SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>;
+def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>;
 def PPCaddiDtprelL   : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
 
 def PPCvperm    : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
 
+def PPCqvfperm   : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>;
+def PPCqvgpci    : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>;
+def PPCqvaligni  : SDNode<"PPCISD::QVALIGNI", SDT_PPCqvaligni, []>;
+def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>;
+
+def PPCqbflt     : SDNode<"PPCISD::QBFLT", SDT_PPCqbflt, []>;
+
+def PPCqvlfsb    : SDNode<"PPCISD::QVLFSb", SDT_PPCqvlfsb,
+                          [SDNPHasChain, SDNPMayLoad]>;
+
+def PPCcmpb     : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>;
+
 // These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
 // amounts.  These nodes are generated by the multi-precision shift code.
 def PPCsrl        : SDNode<"PPCISD::SRL"       , SDTIntShiftOp>;
@@ -134,25 +177,18 @@ def SDT_PPCCall   : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
 def PPCcall  : SDNode<"PPCISD::CALL", SDT_PPCCall,
                       [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                        SDNPVariadic]>;
-def PPCcall_tls : SDNode<"PPCISD::CALL_TLS", SDT_PPCCall,
-                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
-                          SDNPVariadic]>;
 def PPCcall_nop  : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall,
                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                            SDNPVariadic]>;
-def PPCcall_nop_tls : SDNode<"PPCISD::CALL_NOP_TLS", SDT_PPCCall,
-                             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
-                              SDNPVariadic]>;
-def PPCload   : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>,
-                       [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>,
-                          [SDNPHasChain, SDNPSideEffect,
-                           SDNPInGlue, SDNPOutGlue]>;
 def PPCmtctr      : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone,
                       [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                        SDNPVariadic]>;
+def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC",
+                               SDTypeProfile<0, 1, []>,
+                               [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                SDNPVariadic]>;
 
 def retflag       : SDNode<"PPCISD::RET_FLAG", SDTNone,
                            [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@@ -195,12 +231,6 @@ def PPClarx      : SDNode<"PPCISD::LARX", SDT_PPClarx,
 def PPCstcx      : SDNode<"PPCISD::STCX", SDT_PPCstcx,
                           [SDNPHasChain, SDNPMayStore]>;
 
-// Instructions to support medium and large code model
-def PPCaddisTocHA : SDNode<"PPCISD::ADDIS_TOC_HA", SDTIntBinOp, []>;
-def PPCldTocL     : SDNode<"PPCISD::LD_TOC_L", SDTIntBinOp, [SDNPMayLoad]>;
-def PPCaddiTocL   : SDNode<"PPCISD::ADDI_TOC_L", SDTIntBinOp, []>;
-
-
 // Instructions to support dynamic alloca.
 def SDTDynOp  : SDTypeProfile<1, 2, []>;
 def PPCdynalloc   : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
@@ -460,6 +490,15 @@ def u6imm   : Operand<i32> {
   let ParserMatchClass = PPCU6ImmAsmOperand;
   let DecoderMethod = "decodeUImmOperand<6>";
 }
+def PPCU12ImmAsmOperand : AsmOperandClass {
+  let Name = "U12Imm"; let PredicateMethod = "isU12Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u12imm  : Operand<i32> {
+  let PrintMethod = "printU12ImmOperand";
+  let ParserMatchClass = PPCU12ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<12>";
+}
 def PPCS16ImmAsmOperand : AsmOperandClass {
   let Name = "S16Imm"; let PredicateMethod = "isS16Imm";
   let RenderMethod = "addS16ImmOperands";
@@ -675,6 +714,10 @@ def IsPPC4xx  : Predicate<"PPCSubTarget->isPPC4xx()">;
 def IsPPC6xx  : Predicate<"PPCSubTarget->isPPC6xx()">;
 def IsE500  : Predicate<"PPCSubTarget->isE500()">;
 def HasSPE  : Predicate<"PPCSubTarget->HasSPE()">;
+def HasICBT : Predicate<"PPCSubTarget->hasICBT()">;
+
+def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
+def NaNsFPMath   : Predicate<"!TM.Options.NoNaNsFPMath">;
 
 //===----------------------------------------------------------------------===//
 // PowerPC Multiclass Definitions.
@@ -1010,7 +1053,7 @@ def RESTORE_CRBIT : Pseudo<(outs crbitrc:$cond), (ins memri:$F),
 let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
   let isReturn = 1, Uses = [LR, RM] in
     def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
-                           [(retflag)]>;
+                           [(retflag)]>, Requires<[In32BitMode]>;
   let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in {
     def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
                             []>;
@@ -1313,14 +1356,14 @@ def DCBZL  : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst",
                       PPC970_DGroup_Single;
 
 def ICBT  : XForm_icbt<31, 22, (outs), (ins u4imm:$CT, memrr:$src),
-                       "icbt $CT, $src", IIC_LdStLoad>, Requires<[IsBookE]>;
+                       "icbt $CT, $src", IIC_LdStLoad>, Requires<[HasICBT]>;
 
 def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
           (DCBT xoaddr:$dst)>;   // data prefetch for loads
 def : Pat<(prefetch xoaddr:$dst, (i32 1), imm, (i32 1)),
           (DCBTST xoaddr:$dst)>; // data prefetch for stores
 def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)),
-          (ICBT 0, xoaddr:$dst)>; // inst prefetch (for read)
+          (ICBT 0, xoaddr:$dst)>, Requires<[HasICBT]>; // inst prefetch (for read)
 
 // Atomic operations
 let usesCustomInserter = 1 in {
@@ -1454,7 +1497,7 @@ def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
 
 
 // Unindexed (r+i) Loads with Update (preinc).
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0 in {
 def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
                    "lbzu $rD, $addr", IIC_LdStLoadUpd,
                    []>, RegConstraint<"$addr.reg = $ea_result">,
@@ -1797,7 +1840,7 @@ def NOP_GT_PWR7 : DForm_4_fixedreg_zero<24, 2, (outs), (ins),
                                         "ori 2, 2, 0", IIC_IntSimple, []>;
 }
 
-let isCompare = 1, neverHasSideEffects = 1 in {
+let isCompare = 1, hasSideEffects = 0 in {
   def CMPWI : DForm_5_ext<11, (outs crrc:$crD), (ins gprc:$rA, s16imm:$imm),
                           "cmpwi $crD, $rA, $imm", IIC_IntCompare>;
   def CMPLWI : DForm_6_ext<10, (outs crrc:$dst), (ins gprc:$src1, u16imm:$src2),
@@ -1805,7 +1848,7 @@ let isCompare = 1, neverHasSideEffects = 1 in {
 }
 }
 
-let PPC970_Unit = 1, neverHasSideEffects = 1 in {  // FXU Operations.
+let PPC970_Unit = 1, hasSideEffects = 0 in {  // FXU Operations.
 let isCommutable = 1 in {
 defm NAND : XForm_6r<31, 476, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
                      "nand", "$rA, $rS, $rB", IIC_IntSimple,
@@ -1848,7 +1891,7 @@ defm SRAW : XForm_6rc<31, 792, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
 }
 
 let PPC970_Unit = 1 in {  // FXU Operations.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 defm SRAWI : XForm_10rc<31, 824, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH),
                         "srawi", "$rA, $rS, $SH", IIC_IntShift,
                         [(set i32:$rA, (sra i32:$rS, (i32 imm:$SH)))]>;
@@ -1861,8 +1904,13 @@ defm EXTSB  : XForm_11r<31, 954, (outs gprc:$rA), (ins gprc:$rS),
 defm EXTSH  : XForm_11r<31, 922, (outs gprc:$rA), (ins gprc:$rS),
                         "extsh", "$rA, $rS", IIC_IntSimple,
                         [(set i32:$rA, (sext_inreg i32:$rS, i16))]>;
+
+let isCommutable = 1 in
+def CMPB : XForm_6<31, 508, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                   "cmpb $rA, $rS, $rB", IIC_IntGeneral,
+                   [(set i32:$rA, (PPCcmpb i32:$rS, i32:$rB))]>;
 }
-let isCompare = 1, neverHasSideEffects = 1 in {
+let isCompare = 1, hasSideEffects = 0 in {
   def CMPW   : XForm_16_ext<31, 0, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
                             "cmpw $crD, $rA, $rB", IIC_IntCompare>;
   def CMPLW  : XForm_16_ext<31, 32, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
@@ -1872,7 +1920,7 @@ let isCompare = 1, neverHasSideEffects = 1 in {
 let PPC970_Unit = 3 in {  // FPU Operations.
 //def FCMPO  : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
 //                      "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
-let isCompare = 1, neverHasSideEffects = 1 in {
+let isCompare = 1, hasSideEffects = 0 in {
   def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB),
                         "fcmpu $crD, $fA, $fB", IIC_FPCompare>;
   let Interpretation64Bit = 1, isCodeGenOnly = 1 in
@@ -1881,7 +1929,7 @@ let isCompare = 1, neverHasSideEffects = 1 in {
 }
 
 let Uses = [RM] in {
-  let neverHasSideEffects = 1 in {
+  let hasSideEffects = 0 in {
   defm FCTIW  : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
                           "fctiw", "$frD, $frB", IIC_FPGeneral,
                           []>;
@@ -1902,7 +1950,7 @@ let Uses = [RM] in {
                           [(set f32:$frD, (frnd f32:$frB))]>;
   }
 
-  let neverHasSideEffects = 1 in {
+  let hasSideEffects = 0 in {
   let Interpretation64Bit = 1, isCodeGenOnly = 1 in
   defm FRIPD  : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB),
                           "frip", "$frD, $frB", IIC_FPGeneral,
@@ -1939,13 +1987,13 @@ let Uses = [RM] in {
 /// often coalesced away and we don't want the dispatch group builder to think
 /// that they will fill slots (which could cause the load of a LSU reject to
 /// sneak into a d-group with a store).
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 defm FMR   : XForm_26r<63, 72, (outs f4rc:$frD), (ins f4rc:$frB),
                        "fmr", "$frD, $frB", IIC_FPGeneral,
                        []>,  // (set f32:$frD, f32:$frB)
                        PPC970_Unit_Pseudo;
 
-let PPC970_Unit = 3, neverHasSideEffects = 1 in {  // FPU Operations.
+let PPC970_Unit = 3, hasSideEffects = 0 in {  // FPU Operations.
 // These are artificially split into two different forms, for 4/8 byte FP.
 defm FABSS  : XForm_26r<63, 264, (outs f4rc:$frD), (ins f4rc:$frB),
                         "fabs", "$frD, $frB", IIC_FPGeneral,
@@ -1994,11 +2042,20 @@ defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB),
 
 // XL-Form instructions.  condition register logical ops.
 //
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def MCRF   : XLForm_3<19, 0, (outs crrc:$BF), (ins crrc:$BFA),
                       "mcrf $BF, $BFA", IIC_BrMCR>,
              PPC970_DGroup_First, PPC970_Unit_CRU;
 
+// FIXME: According to the ISA (section 2.5.1 of version 2.06), the
+// condition-register logical instructions have preferred forms. Specifically,
+// it is preferred that the bit specified by the BT field be in the same
+// condition register as that specified by the bit BB. We might want to account
+// for this via hinting the register allocator and anti-dep breakers, or we
+// could constrain the register class to force this constraint and then loosen
+// it during register allocation via convertToThreeAddress or some similar
+// mechanism.
+
 let isCommutable = 1 in {
 def CRAND  : XLForm_1<19, 257, (outs crbitrc:$CRD),
                                (ins crbitrc:$CRA, crbitrc:$CRB),
@@ -2072,6 +2129,12 @@ def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT),
 def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR),
                      "mftb $RT, $SPR", IIC_SprMFTB>, Deprecated<DeprecatedMFTB>;
 
+// A pseudo-instruction used to implement the read of the 64-bit cycle counter
+// on a 32-bit target.
+let hasSideEffects = 1, usesCustomInserter = 1 in
+def ReadTB : Pseudo<(outs gprc:$lo, gprc:$hi), (ins),
+                    "#ReadTB", []>;
+
 let Uses = [CTR] in {
 def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins),
                           "mfctr $rT", IIC_SprMFSPR>,
@@ -2133,7 +2196,7 @@ let mayLoad = 1 in
 def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
                      "#RESTORE_VRSAVE", []>;
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST),
                        "mtocrf $FXM, $ST", IIC_BrMCRX>,
             PPC970_DGroup_First, PPC970_Unit_CRU;
@@ -2150,7 +2213,7 @@ def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM),
 def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins),
                      "mfcr $rT", IIC_SprMFCR>,
                      PPC970_MicroCode, PPC970_Unit_CRU;
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
 
 // Pseudo instruction to perform FADD in round-to-zero mode.
 let usesCustomInserter = 1, Uses = [RM] in {
@@ -2167,19 +2230,24 @@ let Uses = [RM], Defs = [RM] in {
   def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
                         "mtfsb1 $FM", IIC_IntMTFSB0, []>,
                PPC970_DGroup_Single, PPC970_Unit_FPU;
-  def MTFSF  : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT),
-                       "mtfsf $FM, $rT", IIC_IntMTFSB0, []>,
-               PPC970_DGroup_Single, PPC970_Unit_FPU;
+  let isCodeGenOnly = 1 in
+  def MTFSFb  : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT),
+                        "mtfsf $FM, $rT", IIC_IntMTFSB0, []>,
+                PPC970_DGroup_Single, PPC970_Unit_FPU;
 }
 let Uses = [RM] in {
   def MFFS   : XForm_42<63, 583, (outs f8rc:$rT), (ins),
                          "mffs $rT", IIC_IntMFFS,
                          [(set f64:$rT, (PPCmffs))]>,
                PPC970_DGroup_Single, PPC970_Unit_FPU;
+
+  let Defs = [CR1] in
+  def MFFSo : XForm_42<63, 583, (outs f8rc:$rT), (ins),
+                      "mffs. $rT", IIC_IntMFFS, []>, isDOT;
 }
 
 
-let PPC970_Unit = 1, neverHasSideEffects = 1 in {  // FXU Operations.
+let PPC970_Unit = 1, hasSideEffects = 0 in {  // FXU Operations.
 // XO-Form instructions.  Arithmetic instructions that can set overflow bit
 let isCommutable = 1 in
 defm ADD4  : XOForm_1r<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
@@ -2250,7 +2318,7 @@ defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
 // A-Form instructions.  Most of the instructions executed in the FPU are of
 // this type.
 //
-let PPC970_Unit = 3, neverHasSideEffects = 1 in {  // FPU Operations.
+let PPC970_Unit = 3, hasSideEffects = 0 in {  // FPU Operations.
 let Uses = [RM] in {
 let isCommutable = 1 in {
   defm FMADD : AForm_1r<63, 29, 
@@ -2346,12 +2414,12 @@ let Uses = [RM] in {
   }
 }
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 let PPC970_Unit = 1 in {  // FXU Operations.
   let isSelect = 1 in
   def ISEL  : AForm_4<31, 15,
                      (outs gprc:$rT), (ins gprc_nor0:$rA, gprc:$rB, crbitrc:$cond),
-                     "isel $rT, $rA, $rB, $cond", IIC_IntGeneral,
+                     "isel $rT, $rA, $rB, $cond", IIC_IntISEL,
                      []>;
 }
 
@@ -2382,7 +2450,7 @@ defm RLWNM  : MForm_2r<23, (outs gprc:$rA),
                        "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
                        []>;
 }
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
 
 //===----------------------------------------------------------------------===//
 // PowerPC Instruction Patterns
@@ -2433,9 +2501,6 @@ def : Pat<(PPCcall (i32 tglobaladdr:$dst)),
 def : Pat<(PPCcall (i32 texternalsym:$dst)),
           (BL texternalsym:$dst)>;
 
-def : Pat<(PPCcall_tls texternalsym:$func, tglobaltlsaddr:$sym),
-          (BL_TLS texternalsym:$func, tglobaltlsaddr:$sym)>;
-
 def : Pat<(PPCtc_return (i32 tglobaladdr:$dst),  imm:$imm),
           (TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
 
@@ -2490,10 +2555,49 @@ def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
                          "#ADDItlsgdL32",
                          [(set i32:$rD,
                            (PPCaddiTlsgdL i32:$reg, tglobaltlsaddr:$disp))]>;
+// LR is a true define, while the rest of the Defs are clobbers.  R3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+                          "GETtlsADDR32",
+                          [(set i32:$rD,
+                            (PPCgetTlsAddr i32:$reg, tglobaltlsaddr:$sym))]>;
+// Combined op for ADDItlsgdL32 and GETtlsADDR32, late expanded.  R3 and LR
+// are true defines while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def ADDItlsgdLADDR32 : Pseudo<(outs gprc:$rD),
+                              (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
+                              "#ADDItlsgdLADDR32",
+                              [(set i32:$rD,
+                                (PPCaddiTlsgdLAddr i32:$reg,
+                                                   tglobaltlsaddr:$disp,
+                                                   tglobaltlsaddr:$sym))]>;
 def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
                           "#ADDItlsldL32",
                           [(set i32:$rD,
                             (PPCaddiTlsldL i32:$reg, tglobaltlsaddr:$disp))]>;
+// LR is a true define, while the rest of the Defs are clobbers.  R3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsldADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+                            "GETtlsldADDR32",
+                            [(set i32:$rD,
+                              (PPCgetTlsldAddr i32:$reg,
+                                               tglobaltlsaddr:$sym))]>;
+// Combined op for ADDItlsldL32 and GETtlsADDR32, late expanded.  R3 and LR
+// are true defines while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def ADDItlsldLADDR32 : Pseudo<(outs gprc:$rD),
+                              (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
+                              "#ADDItlsldLADDR32",
+                              [(set i32:$rD,
+                                (PPCaddiTlsldLAddr i32:$reg,
+                                                   tglobaltlsaddr:$disp,
+                                                   tglobaltlsaddr:$sym))]>;
 def ADDIdtprelL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
                            "#ADDIdtprelL32",
                            [(set i32:$rD,
@@ -2578,6 +2682,7 @@ include "PPCInstrAltivec.td"
 include "PPCInstrSPE.td"
 include "PPCInstr64Bit.td"
 include "PPCInstrVSX.td"
+include "PPCInstrQPX.td"
 
 def crnot : OutPatFrag<(ops node:$in),
                        (CRNOR $in, $in)>;
@@ -3108,7 +3213,8 @@ def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
 def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
                     "icbi $src", IIC_LdStICBI, []>;
 
-def EIEIO : XForm_24_eieio<31, 854, (outs), (ins),
+// We used to have EIEIO as value but E[0-9A-Z] is a reserved name
+def EnforceIEIO : XForm_24_eieio<31, 854, (outs), (ins),
                            "eieio", IIC_LdStLoad, []>;
 
 def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
@@ -3161,6 +3267,28 @@ def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
 def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L),
                     "mtmsrd $RS, $L", IIC_SprMTMSRD>;
 
+def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA),
+                     "mcrfs $BF, $BFA", IIC_BrMCR>;
+
+def MTFSFI : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
+                      "mtfsfi $BF, $U, $W", IIC_IntMFFS>;
+
+def MTFSFIo : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
+                       "mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isDOT;
+
+def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>;
+def : InstAlias<"mtfsfi. $BF, $U", (MTFSFIo crrc:$BF, i32imm:$U, 0)>;
+
+def MTFSF : XFLForm_1<63, 711, (outs),
+                      (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
+                      "mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>;
+def MTFSFo : XFLForm_1<63, 711, (outs),
+                       (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
+                       "mtfsf. $FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isDOT;
+
+def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>;
+def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSFo i32imm:$FLM, f8rc:$FRB, 0, 0)>;
+
 def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
                         "slbie $RB", IIC_SprSLBIE, []>;
 
@@ -3232,6 +3360,26 @@ def MFDCR : XFXForm_1<31, 323, (outs gprc:$RT), (ins i32imm:$SPR),
 def MTDCR : XFXForm_1<31, 451, (outs), (ins gprc:$RT, i32imm:$SPR),
                       "mtdcr $SPR, $RT", IIC_SprMTSPR>, Requires<[IsPPC4xx]>;
 
+def ATTN : XForm_attn<0, 256, (outs), (ins), "attn", IIC_BrB>;
+
+def LBZCIX : XForm_base_r3xo<31, 853, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+                             "lbzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LHZCIX : XForm_base_r3xo<31, 821, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+                             "lhzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LWZCIX : XForm_base_r3xo<31, 789, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+                             "lwzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LDCIX :  XForm_base_r3xo<31, 885, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+                             "ldcix $RST, $A, $B", IIC_LdStLoad, []>;
+
+def STBCIX : XForm_base_r3xo<31, 981, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+                             "stbcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STHCIX : XForm_base_r3xo<31, 949, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+                             "sthcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STWCIX : XForm_base_r3xo<31, 917, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+                             "stwcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STDCIX : XForm_base_r3xo<31, 1013, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+                             "stdcix $RST, $A, $B", IIC_LdStLoad, []>;
+
 //===----------------------------------------------------------------------===//
 // PowerPC Assembler Instruction Aliases
 //
@@ -3497,6 +3645,9 @@ def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0,
 def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
 def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
 
+def : InstAlias<"cntlz $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>;
+def : InstAlias<"cntlz. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>;
+
 def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b",
                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
 def EXTLDIo : PPCAsmPseudo<"extldi. $rA, $rS, $n, $b",
diff --git a/lib/Target/PowerPC/PPCInstrQPX.td b/lib/Target/PowerPC/PPCInstrQPX.td
new file mode 100644
index 0000000..c984d46
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrQPX.td
@@ -0,0 +1,1192 @@
+//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the QPX extension to the PowerPC instruction set.
+// Reference:
+// Book Q: QPX Architecture Definition. IBM (as updated in) 2011.
+//
+//===----------------------------------------------------------------------===//
+
+def PPCRegQFRCAsmOperand : AsmOperandClass {
+  let Name = "RegQFRC"; let PredicateMethod = "isRegNumber";
+}
+def qfrc : RegisterOperand<QFRC> {
+  let ParserMatchClass = PPCRegQFRCAsmOperand;
+}
+def PPCRegQSRCAsmOperand : AsmOperandClass {
+  let Name = "RegQSRC"; let PredicateMethod = "isRegNumber";
+}
+def qsrc : RegisterOperand<QSRC> {
+  let ParserMatchClass = PPCRegQSRCAsmOperand;
+}
+def PPCRegQBRCAsmOperand : AsmOperandClass {
+  let Name = "RegQBRC"; let PredicateMethod = "isRegNumber";
+}
+def qbrc : RegisterOperand<QBRC> {
+  let ParserMatchClass = PPCRegQBRCAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining instructions that directly correspond to intrinsics.
+
+// QPXA1_Int - A AForm_1 intrinsic definition.
+class QPXA1_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+              !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_FPFused,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
+// QPXA1s_Int - A AForm_1 intrinsic definition (simple instructions).
+class QPXA1s_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+              !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_VecPerm,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
+// QPXA2_Int - A AForm_2 intrinsic definition.
+class QPXA2_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_2<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+              !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPGeneral,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>;
+// QPXA3_Int - A AForm_3 intrinsic definition.
+class QPXA3_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_3<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC),
+              !strconcat(opc, " $FRT, $FRA, $FRC"), IIC_FPGeneral,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRC))]>;
+// QPXA4_Int - A AForm_4a intrinsic definition.
+class QPXA4_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_4a<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB),
+              !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRB))]>;
+// QPXX18_Int - A XForm_18 intrinsic definition.
+class QPXX18_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
+  : XForm_18<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+              !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPCompare,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>;
+// QPXX19_Int - A XForm_19 intrinsic definition.
+class QPXX19_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
+  : XForm_19<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB),
+              !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRB))]>;
+
+//===----------------------------------------------------------------------===//
+// Pattern Frags.
+
+def extloadv4f32 : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v4f32;
+}]>;
+
+def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
+}]>;
+def pre_truncstv4f32 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                               (pre_truncst node:$val,
+                                            node:$base, node:$offset), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
+}]>;
+
+def fround_inexact : PatFrag<(ops node:$val), (fround node:$val), [{
+  return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 0;
+}]>;
+
+def fround_exact : PatFrag<(ops node:$val), (fround node:$val), [{
+  return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 1;
+}]>;
+
+let FastIselShouldIgnore = 1 in // FastIsel should ignore all u12 instrs.
+  def u12 : ImmLeaf<i32, [{ return (Imm & 0xFFF) == Imm; }]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+
+def HasQPX : Predicate<"PPCSubTarget->hasQPX()">;
+let Predicates = [HasQPX] in {
+let DecoderNamespace = "QPX" in {
+let hasSideEffects = 0 in { // QPX instructions don't have side effects.
+let Uses = [RM] in {
+  // Add Instructions
+  let isCommutable = 1 in {
+    def QVFADD : AForm_2<4, 21,
+                        (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                        "qvfadd $FRT, $FRA, $FRB", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (fadd v4f64:$FRA, v4f64:$FRB))]>;
+    let isCodeGenOnly = 1 in
+      def QVFADDS : QPXA2_Int<0, 21, "qvfadds", int_ppc_qpx_qvfadds>;
+    def QVFADDSs : AForm_2<0, 21,
+                          (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                          "qvfadds $FRT, $FRA, $FRB", IIC_FPGeneral,
+                          [(set v4f32:$FRT, (fadd v4f32:$FRA, v4f32:$FRB))]>;
+  }
+  def QVFSUB : AForm_2<4, 20,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                      "qvfsub $FRT, $FRA, $FRB", IIC_FPGeneral,
+                      [(set v4f64:$FRT, (fsub v4f64:$FRA, v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFSUBS : QPXA2_Int<0, 20, "qvfsubs", int_ppc_qpx_qvfsubs>;
+  def QVFSUBSs : AForm_2<0, 20,
+                        (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                        "qvfsubs $FRT, $FRA, $FRB", IIC_FPGeneral,
+                        [(set v4f32:$FRT, (fsub v4f32:$FRA, v4f32:$FRB))]>;
+
+  // Estimate Instructions
+  def QVFRE : AForm_4a<4, 24, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                       "qvfre $FRT, $FRB", IIC_FPGeneral,
+                       [(set v4f64:$FRT, (PPCfre v4f64:$FRB))]>;
+  def QVFRES : QPXA4_Int<0, 24, "qvfres", int_ppc_qpx_qvfres>;
+  let isCodeGenOnly = 1 in
+  def QVFRESs : AForm_4a<0, 24, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                         "qvfres $FRT, $FRB", IIC_FPGeneral,
+                         [(set v4f32:$FRT, (PPCfre v4f32:$FRB))]>;
+
+  def QVFRSQRTE : AForm_4a<4, 26, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                           "qvfrsqrte $FRT, $FRB", IIC_FPGeneral,
+                           [(set v4f64:$FRT, (PPCfrsqrte v4f64:$FRB))]>;
+  def QVFRSQRTES : QPXA4_Int<0, 26, "qvfrsqrtes", int_ppc_qpx_qvfrsqrtes>;
+  let isCodeGenOnly = 1 in
+  def QVFRSQRTESs : AForm_4a<0, 26, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                             "qvfrsqrtes $FRT, $FRB", IIC_FPGeneral,
+                             [(set v4f32:$FRT, (PPCfrsqrte v4f32:$FRB))]>;
+
+  // Multiply Instructions
+  let isCommutable = 1 in {
+    def QVFMUL : AForm_3<4, 25,
+                        (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC),
+                        "qvfmul $FRT, $FRA, $FRC", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (fmul v4f64:$FRA, v4f64:$FRC))]>;
+    let isCodeGenOnly = 1 in
+      def QVFMULS : QPXA3_Int<0, 25, "qvfmuls", int_ppc_qpx_qvfmuls>;
+    def QVFMULSs : AForm_3<0, 25,
+                          (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC),
+                          "qvfmuls $FRT, $FRA, $FRC", IIC_FPGeneral,
+                          [(set v4f32:$FRT, (fmul v4f32:$FRA, v4f32:$FRC))]>;
+  }
+  def QVFXMUL : QPXA3_Int<4, 17, "qvfxmul", int_ppc_qpx_qvfxmul>;
+  def QVFXMULS : QPXA3_Int<0, 17, "qvfxmuls", int_ppc_qpx_qvfxmuls>;
+
+  // Multiply-add instructions
+  def QVFMADD : AForm_1<4, 29,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                      "qvfmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFMADDS : QPXA1_Int<0, 29, "qvfmadds", int_ppc_qpx_qvfmadds>;
+  def QVFMADDSs : AForm_1<0, 29,
+                        (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+                        "qvfmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                        [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, v4f32:$FRB))]>;
+  def QVFNMADD : AForm_1<4, 31,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                      "qvfnmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC,
+                                                   v4f64:$FRB)))]>;
+  let isCodeGenOnly = 1 in
+    def QVFNMADDS : QPXA1_Int<0, 31, "qvfnmadds", int_ppc_qpx_qvfnmadds>;
+  def QVFNMADDSs : AForm_1<0, 31,
+                        (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+                        "qvfnmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                        [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC,
+                                                     v4f32:$FRB)))]>;
+  def QVFMSUB : AForm_1<4, 28,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                      "qvfmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC,
+                                             (fneg v4f64:$FRB)))]>;
+  let isCodeGenOnly = 1 in
+    def QVFMSUBS : QPXA1_Int<0, 28, "qvfmsubs", int_ppc_qpx_qvfmsubs>;
+  def QVFMSUBSs : AForm_1<0, 28,
+                      (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+                      "qvfmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC,
+                                             (fneg v4f32:$FRB)))]>;
+  def QVFNMSUB : AForm_1<4, 30,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                      "qvfnmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC,
+                                              (fneg v4f64:$FRB))))]>;
+  let isCodeGenOnly = 1 in
+    def QVFNMSUBS : QPXA1_Int<0, 30, "qvfnmsubs", int_ppc_qpx_qvfnmsubs>;
+  def QVFNMSUBSs : AForm_1<0, 30,
+                      (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+                      "qvfnmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC,
+                                              (fneg v4f32:$FRB))))]>;
+  def QVFXMADD : QPXA1_Int<4, 9, "qvfxmadd", int_ppc_qpx_qvfxmadd>;
+  def QVFXMADDS : QPXA1_Int<0, 9, "qvfxmadds", int_ppc_qpx_qvfxmadds>;
+  def QVFXXNPMADD : QPXA1_Int<4, 11, "qvfxxnpmadd", int_ppc_qpx_qvfxxnpmadd>;
+  def QVFXXNPMADDS : QPXA1_Int<0, 11, "qvfxxnpmadds", int_ppc_qpx_qvfxxnpmadds>;
+  def QVFXXCPNMADD : QPXA1_Int<4, 3, "qvfxxcpnmadd", int_ppc_qpx_qvfxxcpnmadd>;
+  def QVFXXCPNMADDS : QPXA1_Int<0, 3, "qvfxxcpnmadds", int_ppc_qpx_qvfxxcpnmadds>;
+  def QVFXXMADD : QPXA1_Int<4, 1, "qvfxxmadd", int_ppc_qpx_qvfxxmadd>;
+  def QVFXXMADDS : QPXA1_Int<0, 1, "qvfxxmadds", int_ppc_qpx_qvfxxmadds>;
+
+  // Select Instruction
+  let isCodeGenOnly = 1 in
+    def QVFSEL : QPXA1s_Int<4, 23, "qvfsel", int_ppc_qpx_qvfsel>;
+  def QVFSELb : AForm_1<4, 23, (outs qfrc:$FRT),
+                        (ins qbrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                        "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
+                        [(set v4f64:$FRT, (vselect v4i1:$FRA,
+                                                   v4f64:$FRC, v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+  def QVFSELbs : AForm_1<4, 23, (outs qsrc:$FRT),
+                        (ins qbrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+                        "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
+                        [(set v4f32:$FRT, (vselect v4i1:$FRA,
+                                                   v4f32:$FRC, v4f32:$FRB))]>;
+  let isCodeGenOnly = 1 in
+  def QVFSELbb: AForm_1<4, 23, (outs qbrc:$FRT),
+                        (ins qbrc:$FRA, qbrc:$FRB, qbrc:$FRC),
+                        "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
+                        [(set v4i1:$FRT, (vselect v4i1:$FRA,
+                                                  v4i1:$FRC, v4i1:$FRB))]>;
+
+  // SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+  // instruction selection into a branch sequence.
+  let usesCustomInserter = 1 in {
+    def SELECT_CC_QFRC: Pseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F,
+                                i32imm:$BROPC), "#SELECT_CC_QFRC",
+                                []>;
+    def SELECT_CC_QSRC: Pseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F,
+                                i32imm:$BROPC), "#SELECT_CC_QSRC",
+                                []>;
+    def SELECT_CC_QBRC: Pseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F,
+                                i32imm:$BROPC), "#SELECT_CC_QBRC",
+                                []>;
+
+    // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
+    // register bit directly.
+    def SELECT_QFRC: Pseudo<(outs qfrc:$dst), (ins crbitrc:$cond,
+                            qfrc:$T, qfrc:$F), "#SELECT_QFRC",
+                            [(set v4f64:$dst,
+                                  (select i1:$cond, v4f64:$T, v4f64:$F))]>;
+    def SELECT_QSRC: Pseudo<(outs qsrc:$dst), (ins crbitrc:$cond,
+                            qsrc:$T, qsrc:$F), "#SELECT_QSRC",
+                            [(set v4f32:$dst,
+                                  (select i1:$cond, v4f32:$T, v4f32:$F))]>;
+    def SELECT_QBRC: Pseudo<(outs qbrc:$dst), (ins crbitrc:$cond,
+                            qbrc:$T, qbrc:$F), "#SELECT_QBRC",
+                            [(set v4i1:$dst,
+                                  (select i1:$cond, v4i1:$T, v4i1:$F))]>;
+  }
+
+  // Convert and Round Instructions
+  def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>;
+  let isCodeGenOnly = 1 in
+    def QVFCTIDb : XForm_19<4, 814, (outs qbrc:$FRT), (ins qbrc:$FRB),
+                            "qvfctid $FRT, $FRB", IIC_FPGeneral, []>;
+
+  def QVFCTIDU : QPXX19_Int<4, 942, "qvfctidu", int_ppc_qpx_qvfctidu>;
+  def QVFCTIDZ : QPXX19_Int<4, 815, "qvfctidz", int_ppc_qpx_qvfctidz>;
+  def QVFCTIDUZ : QPXX19_Int<4, 943, "qvfctiduz", int_ppc_qpx_qvfctiduz>;
+  def QVFCTIW : QPXX19_Int<4, 14, "qvfctiw", int_ppc_qpx_qvfctiw>;
+  def QVFCTIWU : QPXX19_Int<4, 142, "qvfctiwu", int_ppc_qpx_qvfctiwu>;
+  def QVFCTIWZ : QPXX19_Int<4, 15, "qvfctiwz", int_ppc_qpx_qvfctiwz>;
+  def QVFCTIWUZ : QPXX19_Int<4, 143, "qvfctiwuz", int_ppc_qpx_qvfctiwuz>;
+  def QVFCFID : QPXX19_Int<4, 846, "qvfcfid", int_ppc_qpx_qvfcfid>;
+  let isCodeGenOnly = 1 in
+    def QVFCFIDb : XForm_19<4, 846, (outs qbrc:$FRT), (ins qbrc:$FRB),
+                            "qvfcfid $FRT, $FRB", IIC_FPGeneral, []>;
+
+  def QVFCFIDU : QPXX19_Int<4, 974, "qvfcfidu", int_ppc_qpx_qvfcfidu>;
+  def QVFCFIDS : QPXX19_Int<0, 846, "qvfcfids", int_ppc_qpx_qvfcfids>;
+  def QVFCFIDUS : QPXX19_Int<0, 974, "qvfcfidus", int_ppc_qpx_qvfcfidus>;
+
+  let isCodeGenOnly = 1 in
+    def QVFRSP : QPXX19_Int<4, 12, "qvfrsp", int_ppc_qpx_qvfrsp>;
+  def QVFRSPs : XForm_19<4, 12,
+                      (outs qsrc:$FRT), (ins qfrc:$FRB),
+                      "qvfrsp $FRT, $FRB", IIC_FPGeneral,
+                      [(set v4f32:$FRT, (fround_inexact v4f64:$FRB))]>;
+
+  def QVFRIZ : XForm_19<4, 424, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                        "qvfriz $FRT, $FRB", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (ftrunc v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFRIZs : XForm_19<4, 424, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                           "qvfriz $FRT, $FRB", IIC_FPGeneral,
+                           [(set v4f32:$FRT, (ftrunc v4f32:$FRB))]>;
+
+  def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                        "qvfrin $FRT, $FRB", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (frnd v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                           "qvfrin $FRT, $FRB", IIC_FPGeneral,
+                           [(set v4f32:$FRT, (frnd v4f32:$FRB))]>;
+
+  def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                        "qvfrip $FRT, $FRB", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (fceil v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFRIPs : XForm_19<4, 456, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                           "qvfrip $FRT, $FRB", IIC_FPGeneral,
+                           [(set v4f32:$FRT, (fceil v4f32:$FRB))]>;
+
+  def QVFRIM : XForm_19<4, 488, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                        "qvfrim $FRT, $FRB", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (ffloor v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFRIMs : XForm_19<4, 488, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                           "qvfrim $FRT, $FRB", IIC_FPGeneral,
+                           [(set v4f32:$FRT, (ffloor v4f32:$FRB))]>;
+
+  // Move Instructions
+  def QVFMR : XForm_19<4, 72,
+                      (outs qfrc:$FRT), (ins qfrc:$FRB),
+                      "qvfmr $FRT, $FRB", IIC_VecPerm,
+                      [/* (set v4f64:$FRT, v4f64:$FRB) */]>;
+  let isCodeGenOnly = 1 in {
+    def QVFMRs : XForm_19<4, 72,
+                         (outs qsrc:$FRT), (ins qsrc:$FRB),
+                         "qvfmr $FRT, $FRB", IIC_VecPerm,
+                         [/* (set v4f32:$FRT, v4f32:$FRB) */]>;
+    def QVFMRb : XForm_19<4, 72,
+                         (outs qbrc:$FRT), (ins qbrc:$FRB),
+                         "qvfmr $FRT, $FRB", IIC_VecPerm,
+                         [/* (set v4i1:$FRT, v4i1:$FRB) */]>;
+  }
+  def QVFNEG : XForm_19<4, 40,
+                      (outs qfrc:$FRT), (ins qfrc:$FRB),
+                      "qvfneg $FRT, $FRB", IIC_VecPerm,
+                      [(set v4f64:$FRT, (fneg v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFNEGs : XForm_19<4, 40,
+                         (outs qsrc:$FRT), (ins qsrc:$FRB),
+                         "qvfneg $FRT, $FRB", IIC_VecPerm,
+                         [(set v4f32:$FRT, (fneg v4f32:$FRB))]>;
+  def QVFABS : XForm_19<4, 264,
+                      (outs qfrc:$FRT), (ins qfrc:$FRB),
+                      "qvfabs $FRT, $FRB", IIC_VecPerm,
+                      [(set v4f64:$FRT, (fabs v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFABSs : XForm_19<4, 264,
+                         (outs qsrc:$FRT), (ins qsrc:$FRB),
+                         "qvfabs $FRT, $FRB", IIC_VecPerm,
+                         [(set v4f32:$FRT, (fabs v4f32:$FRB))]>;
+  def QVFNABS : XForm_19<4, 136,
+                      (outs qfrc:$FRT), (ins qfrc:$FRB),
+                      "qvfnabs $FRT, $FRB", IIC_VecPerm,
+                      [(set v4f64:$FRT, (fneg (fabs v4f64:$FRB)))]>;
+  let isCodeGenOnly = 1 in
+    def QVFNABSs : XForm_19<4, 136,
+                         (outs qsrc:$FRT), (ins qsrc:$FRB),
+                         "qvfnabs $FRT, $FRB", IIC_VecPerm,
+                         [(set v4f32:$FRT, (fneg (fabs v4f32:$FRB)))]>;
+  def QVFCPSGN : XForm_18<4, 8,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                      "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm,
+                      [(set v4f64:$FRT, (fcopysign v4f64:$FRB, v4f64:$FRA))]>;
+  let isCodeGenOnly = 1 in
+    def QVFCPSGNs : XForm_18<4, 8,
+                         (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                         "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm,
+                         [(set v4f32:$FRT, (fcopysign v4f32:$FRB, v4f32:$FRA))]>;
+
+  def QVALIGNI : Z23Form_1<4, 5,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u2imm:$idx),
+                      "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
+                      [(set v4f64:$FRT,
+                            (PPCqvaligni v4f64:$FRA, v4f64:$FRB,
+                                         (i32 imm:$idx)))]>;
+  let isCodeGenOnly = 1 in
+     def QVALIGNIs : Z23Form_1<4, 5,
+                         (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, u2imm:$idx),
+                         "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
+                         [(set v4f32:$FRT,
+                               (PPCqvaligni v4f32:$FRA, v4f32:$FRB,
+                                            (i32 imm:$idx)))]>;
+  let isCodeGenOnly = 1 in
+     def QVALIGNIb : Z23Form_1<4, 5,
+                         (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u2imm:$idx),
+                         "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
+                         [(set v4i1:$FRT,
+                               (PPCqvaligni v4i1:$FRA, v4i1:$FRB,
+                                            (i32 imm:$idx)))]>;
+
+  def QVESPLATI : Z23Form_2<4, 37,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, u2imm:$idx),
+                      "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
+                      [(set v4f64:$FRT,
+                            (PPCqvesplati v4f64:$FRA, (i32 imm:$idx)))]>;
+  let isCodeGenOnly = 1 in
+     def QVESPLATIs : Z23Form_2<4, 37,
+                         (outs qsrc:$FRT), (ins qsrc:$FRA, u2imm:$idx),
+                         "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
+                         [(set v4f32:$FRT,
+                               (PPCqvesplati v4f32:$FRA, (i32 imm:$idx)))]>;
+  let isCodeGenOnly = 1 in
+     def QVESPLATIb : Z23Form_2<4, 37,
+                         (outs qbrc:$FRT), (ins qbrc:$FRA, u2imm:$idx),
+                         "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
+                         [(set v4i1:$FRT,
+                               (PPCqvesplati v4i1:$FRA, (i32 imm:$idx)))]>;
+
+  def QVFPERM : AForm_1<4, 6,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                      "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm,
+                      [(set v4f64:$FRT,
+                            (PPCqvfperm v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
+  let isCodeGenOnly = 1 in
+     def QVFPERMs : AForm_1<4, 6,
+                         (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qfrc:$FRC),
+                         "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm,
+                         [(set v4f32:$FRT,
+                               (PPCqvfperm v4f32:$FRA, v4f32:$FRB, v4f64:$FRC))]>;
+
+  let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+  def QVGPCI : Z23Form_3<4, 133,
+                      (outs qfrc:$FRT), (ins u12imm:$idx),
+                      "qvgpci $FRT, $idx", IIC_VecPerm,
+                      [(set v4f64:$FRT, (PPCqvgpci (u12:$idx)))]>;
+
+  // Compare Instruction
+  let isCodeGenOnly = 1 in
+    def QVFTSTNAN : QPXX18_Int<4, 64, "qvftstnan", int_ppc_qpx_qvftstnan>;
+  def QVFTSTNANb : XForm_18<4, 64, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                           "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare,
+                           [(set v4i1:$FRT,
+                                 (setcc v4f64:$FRA, v4f64:$FRB, SETUO))]>;
+  let isCodeGenOnly = 1 in
+  def QVFTSTNANbs : XForm_18<4, 64, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                            "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare,
+                            [(set v4i1:$FRT,
+                                  (setcc v4f32:$FRA, v4f32:$FRB, SETUO))]>;
+  let isCodeGenOnly = 1 in
+    def QVFCMPLT : QPXX18_Int<4, 96, "qvfcmplt", int_ppc_qpx_qvfcmplt>;
+  def QVFCMPLTb : XForm_18<4, 96, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                           "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare,
+                           [(set v4i1:$FRT,
+                                 (setcc v4f64:$FRA, v4f64:$FRB, SETOLT))]>;
+  let isCodeGenOnly = 1 in
+  def QVFCMPLTbs : XForm_18<4, 96, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                            "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare,
+                            [(set v4i1:$FRT,
+                                  (setcc v4f32:$FRA, v4f32:$FRB, SETOLT))]>;
+  let isCodeGenOnly = 1 in
+    def QVFCMPGT : QPXX18_Int<4, 32, "qvfcmpgt", int_ppc_qpx_qvfcmpgt>;
+  def QVFCMPGTb : XForm_18<4, 32, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                           "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare,
+                           [(set v4i1:$FRT,
+                                 (setcc v4f64:$FRA, v4f64:$FRB, SETOGT))]>;
+  let isCodeGenOnly = 1 in
+  def QVFCMPGTbs : XForm_18<4, 32, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                            "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare,
+                            [(set v4i1:$FRT,
+                                  (setcc v4f32:$FRA, v4f32:$FRB, SETOGT))]>;
+  let isCodeGenOnly = 1 in
+    def QVFCMPEQ : QPXX18_Int<4, 0, "qvfcmpeq", int_ppc_qpx_qvfcmpeq>;
+  def QVFCMPEQb : XForm_18<4, 0, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                           "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare,
+                           [(set v4i1:$FRT,
+                                 (setcc v4f64:$FRA, v4f64:$FRB, SETOEQ))]>;
+  let isCodeGenOnly = 1 in
+  def QVFCMPEQbs : XForm_18<4, 0, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                            "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare,
+                            [(set v4i1:$FRT,
+                                  (setcc v4f32:$FRA, v4f32:$FRB, SETOEQ))]>;
+
+  let isCodeGenOnly = 1 in
+  def QVFLOGICAL : XForm_20<4, 4,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u12imm:$tttt),
+                      "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
+  def QVFLOGICALb : XForm_20<4, 4,
+                      (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt),
+                      "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
+  let isCodeGenOnly = 1 in
+  def QVFLOGICALs : XForm_20<4, 4,
+                      (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt),
+                      "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
+
+  // Load indexed instructions
+  let mayLoad = 1, canFoldAsLoad = 1 in {
+    def QVLFDX : XForm_1<31, 583,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfdx $FRT, $src", IIC_LdStLFD,
+                        [(set v4f64:$FRT, (load xoaddr:$src))]>;
+    let isCodeGenOnly = 1 in
+    def QVLFDXb : XForm_1<31, 583,
+                        (outs qbrc:$FRT), (ins memrr:$src),
+                        "qvlfdx $FRT, $src", IIC_LdStLFD, []>;
+
+    let RC = 1 in
+    def QVLFDXA : XForm_1<31, 583,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfdxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFDUX : XForm_1<31, 615,
+                        (outs qfrc:$FRT, ptr_rc_nor0:$ea_result),
+                        (ins memrr:$src),
+                        "qvlfdux $FRT, $src", IIC_LdStLFDU, []>,
+                        RegConstraint<"$src.ptrreg = $ea_result">,
+                        NoEncode<"$ea_result">;
+    let RC = 1 in
+    def QVLFDUXA : XForm_1<31, 615,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfduxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFSX : XForm_1<31, 519,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfsx $FRT, $src", IIC_LdStLFD,
+                        [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>;
+
+    let isCodeGenOnly = 1 in
+    def QVLFSXb : XForm_1<31, 519,
+                        (outs qbrc:$FRT), (ins memrr:$src),
+                        "qvlfsx $FRT, $src", IIC_LdStLFD,
+                        [(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>;
+    let isCodeGenOnly = 1 in
+    def QVLFSXs : XForm_1<31, 519,
+                        (outs qsrc:$FRT), (ins memrr:$src),
+                        "qvlfsx $FRT, $src", IIC_LdStLFD,
+                        [(set v4f32:$FRT, (load xoaddr:$src))]>;
+
+    let RC = 1 in
+    def QVLFSXA : XForm_1<31, 519,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfsxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFSUX : XForm_1<31, 551,
+                        (outs qsrc:$FRT, ptr_rc_nor0:$ea_result),
+                        (ins memrr:$src),
+                        "qvlfsux $FRT, $src", IIC_LdStLFDU, []>,
+                        RegConstraint<"$src.ptrreg = $ea_result">,
+                        NoEncode<"$ea_result">;
+
+    let RC = 1 in
+    def QVLFSUXA : XForm_1<31, 551,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfsuxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFCDX : XForm_1<31, 71,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcdx $FRT, $src", IIC_LdStLFD, []>;
+    let RC = 1 in
+    def QVLFCDXA : XForm_1<31, 71,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcdxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFCDUX : XForm_1<31, 103,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcdux $FRT, $src", IIC_LdStLFD, []>;
+    let RC = 1 in
+    def QVLFCDUXA : XForm_1<31, 103,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcduxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFCSX : XForm_1<31, 7,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcsx $FRT, $src", IIC_LdStLFD, []>;
+    let isCodeGenOnly = 1 in
+    def QVLFCSXs : XForm_1<31, 7,
+                         (outs qsrc:$FRT), (ins memrr:$src),
+                         "qvlfcsx $FRT, $src", IIC_LdStLFD, []>;
+
+    let RC = 1 in
+    def QVLFCSXA : XForm_1<31, 7,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcsxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFCSUX : XForm_1<31, 39,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcsux $FRT, $src", IIC_LdStLFD, []>;
+    let RC = 1 in
+    def QVLFCSUXA : XForm_1<31, 39,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcsuxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFIWAX : XForm_1<31, 871,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfiwax $FRT, $src", IIC_LdStLFD, []>;
+    let RC = 1 in
+    def QVLFIWAXA : XForm_1<31, 871,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfiwaxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFIWZX : XForm_1<31, 839,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfiwzx $FRT, $src", IIC_LdStLFD, []>;
+    let RC = 1 in
+    def QVLFIWZXA : XForm_1<31, 839,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfiwzxa $FRT, $src", IIC_LdStLFD, []>;
+  }
+
+
+  def QVLPCLDX : XForm_1<31, 582,
+                      (outs qfrc:$FRT), (ins memrr:$src),
+                      "qvlpcldx $FRT, $src", IIC_LdStLFD, []>;
+  def QVLPCLSX : XForm_1<31, 518,
+                      (outs qfrc:$FRT), (ins memrr:$src),
+                      "qvlpclsx $FRT, $src", IIC_LdStLFD, []>;
+  let isCodeGenOnly = 1 in
+    def QVLPCLSXint : XForm_11<31, 518,
+                              (outs qfrc:$FRT), (ins G8RC:$src),
+                              "qvlpclsx $FRT, 0, $src", IIC_LdStLFD, []>;
+  def QVLPCRDX : XForm_1<31, 70,
+                      (outs qfrc:$FRT), (ins memrr:$src),
+                      "qvlpcrdx $FRT, $src", IIC_LdStLFD, []>;
+  def QVLPCRSX : XForm_1<31, 6,
+                      (outs qfrc:$FRT), (ins memrr:$src),
+                      "qvlpcrsx $FRT, $src", IIC_LdStLFD, []>;
+
+  // Store indexed instructions
+  let mayStore = 1 in {
+    def QVSTFDX : XForm_8<31, 711,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfdx $FRT, $dst", IIC_LdStSTFD,
+                        [(store qfrc:$FRT, xoaddr:$dst)]>;
+    let isCodeGenOnly = 1 in
+    def QVSTFDXb : XForm_8<31, 711,
+                        (outs), (ins qbrc:$FRT, memrr:$dst),
+                        "qvstfdx $FRT, $dst", IIC_LdStSTFD, []>;
+
+    let RC = 1 in
+    def QVSTFDXA : XForm_8<31, 711,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfdxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFDUX : XForm_8<31, 743, (outs ptr_rc_nor0:$ea_res),
+                           (ins qfrc:$FRT, memrr:$dst),
+                           "qvstfdux $FRT, $dst", IIC_LdStSTFDU, []>,
+                           RegConstraint<"$dst.ptrreg = $ea_res">,
+                           NoEncode<"$ea_res">;
+
+    let RC = 1 in
+    def QVSTFDUXA : XForm_8<31, 743,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfduxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFDXI : XForm_8<31, 709,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfdxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFDXIA : XForm_8<31, 709,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfdxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFDUXI : XForm_8<31, 741,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfduxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFDUXIA : XForm_8<31, 741,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFSX : XForm_8<31, 647,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsx $FRT, $dst", IIC_LdStSTFD,
+                        [(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>;
+    let isCodeGenOnly = 1 in
+    def QVSTFSXs : XForm_8<31, 647,
+                         (outs), (ins qsrc:$FRT, memrr:$dst),
+                         "qvstfsx $FRT, $dst", IIC_LdStSTFD,
+                         [(store qsrc:$FRT, xoaddr:$dst)]>;
+
+    let RC = 1 in
+    def QVSTFSXA : XForm_8<31, 647,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFSUX : XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res),
+                           (ins qsrc:$FRT, memrr:$dst),
+                           "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>,
+                           RegConstraint<"$dst.ptrreg = $ea_res">,
+                           NoEncode<"$ea_res">;
+    let isCodeGenOnly = 1 in
+    def QVSTFSUXs: XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res),
+                           (ins qfrc:$FRT, memrr:$dst),
+                           "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>,
+                           RegConstraint<"$dst.ptrreg = $ea_res">,
+                           NoEncode<"$ea_res">;
+
+    let RC = 1 in
+    def QVSTFSUXA : XForm_8<31, 679,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsuxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFSXI : XForm_8<31, 645,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFSXIA : XForm_8<31, 645,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFSUXI : XForm_8<31, 677,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsuxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFSUXIA : XForm_8<31, 677,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsuxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCDX : XForm_8<31, 199,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcdx $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCDXA : XForm_8<31, 199,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcdxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCSX : XForm_8<31, 135,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>;
+    let isCodeGenOnly = 1 in
+    def QVSTFCSXs : XForm_8<31, 135,
+                         (outs), (ins qsrc:$FRT, memrr:$dst),
+                         "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>;
+
+    let RC = 1 in
+    def QVSTFCSXA : XForm_8<31, 135,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCDUX : XForm_8<31, 231,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcdux $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCDUXA : XForm_8<31, 231,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcduxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCSUX : XForm_8<31, 167,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsux $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCSUXA : XForm_8<31, 167,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsuxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCDXI : XForm_8<31, 197,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcdxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCDXIA : XForm_8<31, 197,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcdxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCSXI : XForm_8<31, 133,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCSXIA : XForm_8<31, 133,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCDUXI : XForm_8<31, 229,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcduxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCDUXIA : XForm_8<31, 229,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcduxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCSUXI : XForm_8<31, 165,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsuxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCSUXIA : XForm_8<31, 165,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsuxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFIWX : XForm_8<31, 967,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfiwx $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFIWXA : XForm_8<31, 967,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfiwxa $FRT, $dst", IIC_LdStSTFD, []>;
+  }
+}
+
+} // neverHasSideEffects
+}
+
+def : InstAlias<"qvfclr $FRT",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 0)>;
+def : InstAlias<"qvfand $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 1)>;
+def : InstAlias<"qvfandc $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 4)>;
+def : InstAlias<"qvfctfb $FRT, $FRA",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 5)>;
+def : InstAlias<"qvfxor $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 6)>;
+def : InstAlias<"qvfor $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 7)>;
+def : InstAlias<"qvfnor $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 8)>;
+def : InstAlias<"qvfequ $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 9)>;
+def : InstAlias<"qvfnot $FRT, $FRA",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 10)>;
+def : InstAlias<"qvforc $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 13)>;
+def : InstAlias<"qvfnand $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 14)>;
+def : InstAlias<"qvfset $FRT",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 15)>;
+
+//===----------------------------------------------------------------------===//
+// Additional QPX Patterns
+//
+
+def : Pat<(v4f64 (scalar_to_vector f64:$A)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), $A, sub_64)>;
+def : Pat<(v4f32 (scalar_to_vector f32:$A)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>;
+
+def : Pat<(f64 (vector_extract v4f64:$S, 0)),
+          (EXTRACT_SUBREG $S, sub_64)>;
+def : Pat<(f32 (vector_extract v4f32:$S, 0)),
+          (EXTRACT_SUBREG $S, sub_64)>;
+
+def : Pat<(f64 (vector_extract v4f64:$S, 1)),
+          (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>;
+def : Pat<(f64 (vector_extract v4f64:$S, 2)),
+          (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>;
+def : Pat<(f64 (vector_extract v4f64:$S, 3)),
+          (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>;
+
+def : Pat<(f32 (vector_extract v4f32:$S, 1)),
+          (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>;
+def : Pat<(f32 (vector_extract v4f32:$S, 2)),
+          (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>;
+def : Pat<(f32 (vector_extract v4f32:$S, 3)),
+          (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>;
+
+def : Pat<(f64 (vector_extract v4f64:$S, i64:$F)),
+          (EXTRACT_SUBREG (QVFPERM $S, $S,
+                                   (QVLPCLSXint (RLDICR $F, 2,
+                                                        /* 63-2 = */ 61))),
+                          sub_64)>;
+def : Pat<(f32 (vector_extract v4f32:$S, i64:$F)),
+          (EXTRACT_SUBREG (QVFPERMs $S, $S,
+                                    (QVLPCLSXint (RLDICR $F, 2,
+                                                         /* 63-2 = */ 61))),
+                          sub_64)>;
+
+def : Pat<(int_ppc_qpx_qvfperm v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFPERM $A, $B, $C)>;
+
+def : Pat<(int_ppc_qpx_qvfcpsgn v4f64:$A, v4f64:$B),
+          (QVFCPSGN $A, $B)>;
+
+// FCOPYSIGN's operand types need not agree.
+def : Pat<(fcopysign v4f64:$frB, v4f32:$frA),
+          (QVFCPSGN (COPY_TO_REGCLASS $frA, QFRC), $frB)>;
+def : Pat<(fcopysign QSRC:$frB, QFRC:$frA),
+          (QVFCPSGNs (COPY_TO_REGCLASS $frA, QSRC), $frB)>;
+
+def : Pat<(int_ppc_qpx_qvfneg v4f64:$A), (QVFNEG $A)>;
+def : Pat<(int_ppc_qpx_qvfabs v4f64:$A), (QVFABS $A)>;
+def : Pat<(int_ppc_qpx_qvfnabs v4f64:$A), (QVFNABS $A)>;
+
+def : Pat<(int_ppc_qpx_qvfriz v4f64:$A), (QVFRIZ $A)>;
+def : Pat<(int_ppc_qpx_qvfrin v4f64:$A), (QVFRIN $A)>;
+def : Pat<(int_ppc_qpx_qvfrip v4f64:$A), (QVFRIP $A)>;
+def : Pat<(int_ppc_qpx_qvfrim v4f64:$A), (QVFRIM $A)>;
+
+def : Pat<(int_ppc_qpx_qvfre v4f64:$A), (QVFRE $A)>;
+def : Pat<(int_ppc_qpx_qvfrsqrte v4f64:$A), (QVFRSQRTE $A)>;
+
+def : Pat<(int_ppc_qpx_qvfadd v4f64:$A, v4f64:$B),
+          (QVFADD $A, $B)>;
+def : Pat<(int_ppc_qpx_qvfsub v4f64:$A, v4f64:$B),
+          (QVFSUB $A, $B)>;
+def : Pat<(int_ppc_qpx_qvfmul v4f64:$A, v4f64:$B),
+          (QVFMUL $A, $B)>;
+
+// Additional QVFNMSUB patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg v4f64:$A), v4f64:$C, v4f64:$B),
+          (QVFNMSUB $A, $B, $C)>;
+def : Pat<(fma v4f64:$A, (fneg v4f64:$C), v4f64:$B),
+          (QVFNMSUB $A, $B, $C)>;
+def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B),
+          (QVFNMSUBSs $A, $B, $C)>;
+def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B),
+          (QVFNMSUBSs $A, $B, $C)>;
+
+def : Pat<(int_ppc_qpx_qvfmadd v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFMADD $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfnmadd v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFNMADD $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfmsub v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFMSUB $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfnmsub v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFNMSUB $A, $B, $C)>;
+
+def : Pat<(int_ppc_qpx_qvlfd xoaddr:$src),
+          (QVLFDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src),
+          (QVLFDXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfs xoaddr:$src),
+          (QVLFSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src),
+          (QVLFSXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcda xoaddr:$src),
+          (QVLFCDXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcd xoaddr:$src),
+          (QVLFCDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcsa xoaddr:$src),
+          (QVLFCSXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcs xoaddr:$src),
+          (QVLFCSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src),
+          (QVLFDXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwaa xoaddr:$src),
+          (QVLFIWAXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwa xoaddr:$src),
+          (QVLFIWAX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwza xoaddr:$src),
+          (QVLFIWZXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwz xoaddr:$src),
+          (QVLFIWZX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src),
+          (QVLFSXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcld xoaddr:$src),
+          (QVLPCLDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcls xoaddr:$src),
+          (QVLPCLSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcrd xoaddr:$src),
+          (QVLPCRDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcrs xoaddr:$src),
+          (QVLPCRSX xoaddr:$src)>;
+
+def : Pat<(int_ppc_qpx_qvstfd v4f64:$T, xoaddr:$dst),
+          (QVSTFDX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfs v4f64:$T, xoaddr:$dst),
+          (QVSTFSX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcda v4f64:$T, xoaddr:$dst),
+          (QVSTFCDXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcd v4f64:$T, xoaddr:$dst),
+          (QVSTFCDX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcsa v4f64:$T, xoaddr:$dst),
+          (QVSTFCSXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcs v4f64:$T, xoaddr:$dst),
+          (QVSTFCSX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfda v4f64:$T, xoaddr:$dst),
+          (QVSTFDXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfiwa v4f64:$T, xoaddr:$dst),
+          (QVSTFIWXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfiw v4f64:$T, xoaddr:$dst),
+          (QVSTFIWX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfsa v4f64:$T, xoaddr:$dst),
+          (QVSTFSXA $T, xoaddr:$dst)>;
+
+def : Pat<(pre_store v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (QVSTFDUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store v4f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (QVSTFSUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncstv4f32 v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (QVSTFSUXs $rS, $ptrreg, $ptroff)>;
+
+def : Pat<(int_ppc_qpx_qvflogical  v4f64:$A, v4f64:$B, (i32 imm:$idx)),
+          (QVFLOGICAL $A, $B, imm:$idx)>;
+def : Pat<(int_ppc_qpx_qvgpci (u12:$idx)),
+          (QVGPCI imm:$idx)>;
+
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOGE),
+          (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOLE),
+          (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETONE),
+          (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETO),
+          (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUEQ),
+          (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGT),
+          (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGE),
+          (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+                       (QVFCMPLTb $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULT),
+          (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULE),
+          (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+                       (QVFCMPGTb $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUNE),
+          (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+                       (QVFCMPEQb $FRA, $FRB), (i32 13))>;
+
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETEQ),
+          (QVFCMPEQb $FRA, $FRB)>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGT),
+          (QVFCMPGTb $FRA, $FRB)>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGE),
+          (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+                       (QVFCMPLTb $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLT),
+          (QVFCMPLTb $FRA, $FRB)>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLE),
+          (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+                       (QVFCMPGTb $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETNE),
+          (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
+                       (QVFCMPEQb $FRA, $FRB), (i32 10))>;
+
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOGE),
+          (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOLE),
+          (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETONE),
+          (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETO),
+          (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUEQ),
+          (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGT),
+          (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGE),
+          (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+                       (QVFCMPLTbs $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULT),
+          (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULE),
+          (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+                       (QVFCMPGTbs $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUNE),
+          (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+                       (QVFCMPEQbs $FRA, $FRB), (i32 13))>;
+
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETEQ),
+          (QVFCMPEQbs $FRA, $FRB)>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGT),
+          (QVFCMPGTbs $FRA, $FRB)>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGE),
+          (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+                       (QVFCMPLTbs $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLT),
+          (QVFCMPLTbs $FRA, $FRB)>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLE),
+          (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+                       (QVFCMPGTbs $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETNE),
+          (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
+                       (QVFCMPEQbs $FRA, $FRB), (i32 10))>;
+
+def : Pat<(and v4i1:$FRA, (not v4i1:$FRB)),
+          (QVFLOGICALb $FRA, $FRB, (i32 4))>;
+def : Pat<(not (or v4i1:$FRA, v4i1:$FRB)),
+          (QVFLOGICALb $FRA, $FRB, (i32 8))>;
+def : Pat<(not (xor v4i1:$FRA, v4i1:$FRB)),
+          (QVFLOGICALb $FRA, $FRB, (i32 9))>;
+def : Pat<(or v4i1:$FRA, (not v4i1:$FRB)),
+          (QVFLOGICALb $FRA, $FRB, (i32 13))>;
+def : Pat<(not (and v4i1:$FRA, v4i1:$FRB)),
+          (QVFLOGICALb $FRA, $FRB, (i32 14))>;
+
+def : Pat<(and v4i1:$FRA, v4i1:$FRB),
+          (QVFLOGICALb $FRA, $FRB, (i32 1))>;
+def : Pat<(or v4i1:$FRA, v4i1:$FRB),
+          (QVFLOGICALb $FRA, $FRB, (i32 7))>;
+def : Pat<(xor v4i1:$FRA, v4i1:$FRB),
+          (QVFLOGICALb $FRA, $FRB, (i32 6))>;
+def : Pat<(not v4i1:$FRA),
+          (QVFLOGICALb $FRA, $FRA, (i32 10))>;
+
+def : Pat<(v4f64 (fextend v4f32:$src)),
+          (COPY_TO_REGCLASS $src, QFRC)>;
+
+def : Pat<(v4f32 (fround_exact v4f64:$src)),
+          (COPY_TO_REGCLASS $src, QSRC)>;
+
+// Extract the underlying floating-point values from the
+// QPX (-1.0, 1.0) boolean representation.
+def : Pat<(v4f64 (PPCqbflt v4i1:$src)),
+          (COPY_TO_REGCLASS $src, QFRC)>;
+
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLT)),
+          (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLE)),
+          (SELECT_QFRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETEQ)),
+          (SELECT_QFRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGE)),
+          (SELECT_QFRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGT)),
+          (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETNE)),
+          (SELECT_QFRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLT)),
+          (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLE)),
+          (SELECT_QSRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETEQ)),
+          (SELECT_QSRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGE)),
+          (SELECT_QSRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGT)),
+          (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETNE)),
+          (SELECT_QSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLT)),
+          (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLE)),
+          (SELECT_QBRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETEQ)),
+          (SELECT_QBRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGE)),
+          (SELECT_QBRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGT)),
+          (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETNE)),
+          (SELECT_QBRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+} // end HasQPX
+
+let Predicates = [HasQPX, NoNaNsFPMath] in {
+def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFCMPLTb $FRA, $FRB), $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFCMPGTb $FRA, $FRB), $FRB, $FRA)>;
+
+def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFCMPLTbs $FRA, $FRB), $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFCMPGTbs $FRA, $FRB), $FRB, $FRA)>;
+}
+
+let Predicates = [HasQPX, NaNsFPMath] in {
+// When either of these operands is NaN, we should return the other operand.
+// QVFCMPLT/QVFCMPGT return false is either operand is NaN, which means we need
+// to explicitly or with a NaN test on the second operand.
+def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+                                (QVFTSTNANb $FRB, $FRB), (i32 7)),
+                   $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+                                (QVFTSTNANb $FRB, $FRB), (i32 7)),
+                   $FRB, $FRA)>;
+
+def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+                                 (QVFTSTNANbs $FRB, $FRB), (i32 7)),
+                   $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+                                 (QVFTSTNANbs $FRB, $FRB), (i32 7)),
+                   $FRB, $FRA)>;
+}
+
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 2c8f998..d6cb3a0 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -25,6 +25,23 @@ def vsfrc : RegisterOperand<VSFRC> {
   let ParserMatchClass = PPCRegVSFRCAsmOperand;
 }
 
+// Little-endian-specific nodes.
+def SDT_PPClxvd2x : SDTypeProfile<1, 1, [
+  SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCstxvd2x : SDTypeProfile<0, 2, [
+  SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCxxswapd : SDTypeProfile<1, 1, [
+  SDTCisSameAs<0, 1>
+]>;
+
+def PPClxvd2x  : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
+                        [SDNPHasChain, SDNPMayLoad]>;
+def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x,
+                        [SDNPHasChain, SDNPMayStore]>;
+def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>;
+
 multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL,
                     string asmbase, string asmstr, InstrItinClass itin,
                     list<dag> pattern> {
@@ -40,9 +57,12 @@ multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL,
 }
 
 def HasVSX : Predicate<"PPCSubTarget->hasVSX()">;
+def IsLittleEndian : Predicate<"PPCSubTarget->isLittleEndian()">;
+def IsBigEndian : Predicate<"!PPCSubTarget->isLittleEndian()">;
+
 let Predicates = [HasVSX] in {
 let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
-let neverHasSideEffects = 1 in { // VSX instructions don't have side effects.
+let hasSideEffects = 0 in { // VSX instructions don't have side effects.
 let Uses = [RM] in {
 
   // Load indexed instructions
@@ -77,12 +97,12 @@ let Uses = [RM] in {
     def STXVD2X : XX1Form<31, 972,
                          (outs), (ins vsrc:$XT, memrr:$dst),
                          "stxvd2x $XT, $dst", IIC_LdStSTFD,
-                         [(int_ppc_vsx_stxvd2x v2f64:$XT, xoaddr:$dst)]>;
+                         [(store v2f64:$XT, xoaddr:$dst)]>;
 
     def STXVW4X : XX1Form<31, 908,
                          (outs), (ins vsrc:$XT, memrr:$dst),
                          "stxvw4x $XT, $dst", IIC_LdStSTFD,
-                         [(int_ppc_vsx_stxvw4x v4i32:$XT, xoaddr:$dst)]>;
+                         [(store v4i32:$XT, xoaddr:$dst)]>;
   }
 
   // Add/Mul Instructions
@@ -728,7 +748,7 @@ let Uses = [RM] in {
   def XXSPLTW : XX2Form_2<60, 164,
                        (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
                        "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
-} // neverHasSideEffects
+} // hasSideEffects
 
 // SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
 // instruction selection into a branch sequence.
@@ -773,6 +793,8 @@ def : InstAlias<"xxswapd $XT, $XB",
                 (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>;
 
 let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+
+let Predicates = [IsBigEndian] in {
 def : Pat<(v2f64 (scalar_to_vector f64:$A)),
           (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>;
 
@@ -780,6 +802,18 @@ def : Pat<(f64 (vector_extract v2f64:$S, 0)),
           (f64 (EXTRACT_SUBREG $S, sub_64))>;
 def : Pat<(f64 (vector_extract v2f64:$S, 1)),
           (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+}
+
+let Predicates = [IsLittleEndian] in {
+def : Pat<(v2f64 (scalar_to_vector f64:$A)),
+          (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64),
+                           (SUBREG_TO_REG (i64 1), $A, sub_64), 0))>;
+
+def : Pat<(f64 (vector_extract v2f64:$S, 0)),
+          (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+def : Pat<(f64 (vector_extract v2f64:$S, 1)),
+          (f64 (EXTRACT_SUBREG $S, sub_64))>;
+}
 
 // Additional fnmsub patterns: -a*c + b == -(a*c - b)
 def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
@@ -854,11 +888,21 @@ def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
 def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 
 // Stores.
-def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
+          (STXVD2X $rS, xoaddr:$dst)>;
 def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
-def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
+          (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+
+// Permutes.
+def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v2i64 (PPCxxswapd v2i64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>;
 
 // Selects.
 def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
@@ -896,3 +940,28 @@ def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B),
 } // AddedComplexity
 } // HasVSX
 
+// The following VSX instructions were introduced in Power ISA 2.07
+/* FIXME: if the operands are v2i64, these patterns will not match.
+   we should define new patterns or otherwise match the same patterns
+   when the elements are larger than i32.
+*/
+def HasP8Vector : Predicate<"PPCSubTarget->hasP8Vector()">;
+let Predicates = [HasP8Vector] in {
+let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+let isCommutable = 1 in {
+  def XXLEQV : XX3Form<60, 186,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxleqv $XT, $XA, $XB", IIC_VecGeneral,
+                       [(set v4i32:$XT, (vnot_ppc (xor v4i32:$XA, v4i32:$XB)))]>;
+  def XXLNAND : XX3Form<60, 178,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xxlnand $XT, $XA, $XB", IIC_VecGeneral,
+                        [(set v4i32:$XT, (vnot_ppc (and v4i32:$XA,
+                                                    v4i32:$XB)))]>;
+  } // isCommutable
+def XXLORC : XX3Form<60, 170,
+                     (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                     "xxlorc $XT, $XA, $XB", IIC_VecGeneral,
+                     [(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>;
+} // AddedComplexity = 500
+} // HasP8Vector
diff --git a/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp b/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp
new file mode 100644
index 0000000..efd2d92
--- /dev/null
+++ b/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp
@@ -0,0 +1,231 @@
+//===-------- PPCLoopDataPrefetch.cpp - Loop Data Prefetching Pass --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a Loop Data Prefetching Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-loop-data-prefetch"
+#include "PPC.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+// By default, we limit this to creating 16 PHIs (which is a little over half
+// of the allocatable register set).
+static cl::opt<bool>
+PrefetchWrites("ppc-loop-prefetch-writes", cl::Hidden, cl::init(false),
+               cl::desc("Prefetch write addresses"));
+
+// This seems like a reasonable default for the BG/Q (this pass is enabled, by
+// default, only on the BG/Q).
+static cl::opt<unsigned>
+PrefDist("ppc-loop-prefetch-distance", cl::Hidden, cl::init(300),
+         cl::desc("The loop prefetch distance"));
+
+static cl::opt<unsigned>
+CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
+              cl::desc("The loop prefetch cache line size"));
+
+namespace llvm {
+  void initializePPCLoopDataPrefetchPass(PassRegistry&);
+}
+
+namespace {
+
+  class PPCLoopDataPrefetch : public FunctionPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    PPCLoopDataPrefetch() : FunctionPass(ID) {
+      initializePPCLoopDataPrefetchPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addRequired<ScalarEvolution>();
+      // FIXME: For some reason, preserving SE here breaks LSR (even if
+      // this pass changes nothing).
+      // AU.addPreserved<ScalarEvolution>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
+    }
+
+    bool runOnFunction(Function &F) override;
+    bool runOnLoop(Loop *L);
+
+  private:
+    AssumptionCache *AC;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    const TargetTransformInfo *TTI;
+    const DataLayout *DL;
+  };
+}
+
+char PPCLoopDataPrefetch::ID = 0;
+INITIALIZE_PASS_BEGIN(PPCLoopDataPrefetch, "ppc-loop-data-prefetch",
+                      "PPC Loop Data Prefetch", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_END(PPCLoopDataPrefetch, "ppc-loop-data-prefetch",
+                    "PPC Loop Data Prefetch", false, false)
+
+FunctionPass *llvm::createPPCLoopDataPrefetchPass() { return new PPCLoopDataPrefetch(); }
+
+bool PPCLoopDataPrefetch::runOnFunction(Function &F) {
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolution>();
+  DL = F.getParent()->getDataLayout();
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  bool MadeChange = false;
+
+  for (LoopInfo::iterator I = LI->begin(), E = LI->end();
+       I != E; ++I) {
+    Loop *L = *I;
+    MadeChange |= runOnLoop(L);
+  }
+
+  return MadeChange;
+}
+
+bool PPCLoopDataPrefetch::runOnLoop(Loop *L) {
+  bool MadeChange = false;
+
+  // Only prefetch in the inner-most loop
+  if (!L->empty())
+    return MadeChange;
+
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+  // Calculate the number of iterations ahead to prefetch
+  CodeMetrics Metrics;
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I) {
+
+    // If the loop already has prefetches, then assume that the user knows
+    // what he or she is doing and don't add any more.
+    for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
+         J != JE; ++J)
+      if (CallInst *CI = dyn_cast<CallInst>(J))
+        if (Function *F = CI->getCalledFunction())
+          if (F->getIntrinsicID() == Intrinsic::prefetch)
+            return MadeChange;
+
+    Metrics.analyzeBasicBlock(*I, *TTI, EphValues);
+  }
+  unsigned LoopSize = Metrics.NumInsts;
+  if (!LoopSize)
+    LoopSize = 1;
+
+  unsigned ItersAhead = PrefDist/LoopSize;
+  if (!ItersAhead)
+    ItersAhead = 1;
+
+  SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I) {
+    for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
+        J != JE; ++J) {
+      Value *PtrValue;
+      Instruction *MemI;
+
+      if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) {
+        MemI = LMemI;
+        PtrValue = LMemI->getPointerOperand();
+      } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) {
+        if (!PrefetchWrites) continue;
+        MemI = SMemI;
+        PtrValue = SMemI->getPointerOperand();
+      } else continue;
+
+      unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+      if (PtrAddrSpace)
+        continue;
+
+      if (L->isLoopInvariant(PtrValue))
+        continue;
+
+      const SCEV *LSCEV = SE->getSCEV(PtrValue);
+      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+      if (!LSCEVAddRec)
+        continue;
+
+      // We don't want to double prefetch individual cache lines. If this load
+      // is known to be within one cache line of some other load that has
+      // already been prefetched, then don't prefetch this one as well.
+      bool DupPref = false;
+      for (SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>,
+             16>::iterator K = PrefLoads.begin(), KE = PrefLoads.end();
+           K != KE; ++K) {
+        const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, K->second);
+        if (const SCEVConstant *ConstPtrDiff =
+            dyn_cast<SCEVConstant>(PtrDiff)) {
+          int64_t PD = abs64(ConstPtrDiff->getValue()->getSExtValue());
+          if (PD < (int64_t) CacheLineSize) {
+            DupPref = true;
+            break;
+          }
+        }
+      }
+      if (DupPref)
+        continue;
+
+      const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
+        SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
+        LSCEVAddRec->getStepRecurrence(*SE)));
+      if (!isSafeToExpand(NextLSCEV, *SE))
+        continue;
+
+      PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));
+
+      Type *I8Ptr = Type::getInt8PtrTy((*I)->getContext(), PtrAddrSpace);
+      SCEVExpander SCEVE(*SE, "prefaddr");
+      Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);
+
+      IRBuilder<> Builder(MemI);
+      Module *M = (*I)->getParent()->getParent();
+      Type *I32 = Type::getInt32Ty((*I)->getContext());
+      Value *PrefetchFunc = Intrinsic::getDeclaration(M, Intrinsic::prefetch);
+      Builder.CreateCall4(PrefetchFunc, PrefPtrValue,
+        ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
+        ConstantInt::get(I32, 3), ConstantInt::get(I32, 1));
+
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
new file mode 100644
index 0000000..df65227
--- /dev/null
+++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -0,0 +1,382 @@
+//===------ PPCLoopPreIncPrep.cpp - Loop Pre-Inc. AM Prep. Pass -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to prepare loops for pre-increment addressing
+// modes. Additional PHIs are created for loop induction variables used by
+// load/store instructions so that the pre-increment forms can be used.
+// Generically, this means transforming loops like this:
+//   for (int i = 0; i < n; ++i)
+//     array[i] = c;
+// to look like this:
+//   T *p = array[-1];
+//   for (int i = 0; i < n; ++i)
+//     *++p = c;
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-loop-preinc-prep"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+// By default, we limit this to creating 16 PHIs (which is a little over half
+// of the allocatable register set).
+static cl::opt<unsigned> MaxVars("ppc-preinc-prep-max-vars",
+                                 cl::Hidden, cl::init(16),
+  cl::desc("Potential PHI threshold for PPC preinc loop prep"));
+
+namespace llvm {
+  void initializePPCLoopPreIncPrepPass(PassRegistry&);
+}
+
+namespace {
+
+  class PPCLoopPreIncPrep : public FunctionPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    PPCLoopPreIncPrep() : FunctionPass(ID), TM(nullptr) {
+      initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry());
+    }
+    PPCLoopPreIncPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
+      initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addRequired<ScalarEvolution>();
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    bool runOnLoop(Loop *L);
+    void simplifyLoopLatch(Loop *L);
+    bool rotateLoop(Loop *L);
+
+  private:
+    PPCTargetMachine *TM;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    const DataLayout *DL;
+  };
+}
+
+char PPCLoopPreIncPrep::ID = 0;
+static const char *name = "Prepare loop for pre-inc. addressing modes";
+INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false)
+
+FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) {
+  return new PPCLoopPreIncPrep(TM);
+}
+
+namespace {
+  struct SCEVLess : std::binary_function<const SCEV *, const SCEV *, bool>
+  {
+    SCEVLess(ScalarEvolution *SE) : SE(SE) {}
+
+    bool operator() (const SCEV *X, const SCEV *Y) const {
+      const SCEV *Diff = SE->getMinusSCEV(X, Y);
+      return cast<SCEVConstant>(Diff)->getValue()->getSExtValue() < 0;
+    }
+
+  protected:
+    ScalarEvolution *SE;
+  };
+}
+
+static bool IsPtrInBounds(Value *BasePtr) {
+  Value *StrippedBasePtr = BasePtr;
+  while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBasePtr))
+    StrippedBasePtr = BC->getOperand(0);
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(StrippedBasePtr))
+    return GEP->isInBounds();
+
+  return false;
+}
+
+static Value *GetPointerOperand(Value *MemI) {
+  if (LoadInst *LMemI = dyn_cast<LoadInst>(MemI)) {
+    return LMemI->getPointerOperand();
+  } else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) {
+    return SMemI->getPointerOperand();
+  } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) {
+    if (IMemI->getIntrinsicID() == Intrinsic::prefetch)
+      return IMemI->getArgOperand(0);
+  }
+
+  return 0;
+}
+
+bool PPCLoopPreIncPrep::runOnFunction(Function &F) {
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolution>();
+
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : 0;
+
+  bool MadeChange = false;
+
+  for (LoopInfo::iterator I = LI->begin(), E = LI->end();
+       I != E; ++I) {
+    Loop *L = *I;
+    MadeChange |= runOnLoop(L);
+  }
+
+  return MadeChange;
+}
+
+bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
+  bool MadeChange = false;
+
+  if (!DL)
+    return MadeChange;
+
+  // Only prep. the inner-most loop
+  if (!L->empty())
+    return MadeChange;
+
+  BasicBlock *Header = L->getHeader();
+
+  const PPCSubtarget *ST =
+    TM ? TM->getSubtargetImpl(*Header->getParent()) : nullptr;
+
+  unsigned HeaderLoopPredCount = 0;
+  for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+       PI != PE; ++PI) {
+    ++HeaderLoopPredCount;
+  }
+
+  // Collect buckets of comparable addresses used by loads and stores.
+  typedef std::multimap<const SCEV *, Instruction *, SCEVLess> Bucket;
+  SmallVector<Bucket, 16> Buckets;
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I) {
+    for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
+        J != JE; ++J) {
+      Value *PtrValue;
+      Instruction *MemI;
+
+      if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) {
+        MemI = LMemI;
+        PtrValue = LMemI->getPointerOperand();
+      } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) {
+        MemI = SMemI;
+        PtrValue = SMemI->getPointerOperand();
+      } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(J)) {
+        if (IMemI->getIntrinsicID() == Intrinsic::prefetch) {
+          MemI = IMemI;
+          PtrValue = IMemI->getArgOperand(0);
+        } else continue;
+      } else continue;
+
+      unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+      if (PtrAddrSpace)
+        continue;
+
+      // There are no update forms for Altivec vector load/stores.
+      if (ST && ST->hasAltivec() &&
+          PtrValue->getType()->getPointerElementType()->isVectorTy())
+        continue;
+
+      if (L->isLoopInvariant(PtrValue))
+        continue;
+
+      const SCEV *LSCEV = SE->getSCEV(PtrValue);
+      if (!isa<SCEVAddRecExpr>(LSCEV))
+        continue;
+
+      bool FoundBucket = false;
+      for (unsigned i = 0, e = Buckets.size(); i != e; ++i)
+        for (Bucket::iterator K = Buckets[i].begin(), KE = Buckets[i].end();
+             K != KE; ++K) {
+          const SCEV *Diff = SE->getMinusSCEV(K->first, LSCEV);
+          if (isa<SCEVConstant>(Diff)) {
+            Buckets[i].insert(std::make_pair(LSCEV, MemI));
+            FoundBucket = true;
+            break;
+          }
+        }
+
+      if (!FoundBucket) {
+        Buckets.push_back(Bucket(SCEVLess(SE)));
+        Buckets[Buckets.size()-1].insert(std::make_pair(LSCEV, MemI));
+      }
+    }
+  }
+
+  if (Buckets.empty() || Buckets.size() > MaxVars)
+    return MadeChange;
+
+  BasicBlock *LoopPredecessor = L->getLoopPredecessor();
+  // If there is no loop predecessor, or the loop predecessor's terminator
+  // returns a value (which might contribute to determining the loop's
+  // iteration space), insert a new preheader for the loop.
+  if (!LoopPredecessor ||
+      !LoopPredecessor->getTerminator()->getType()->isVoidTy())
+    LoopPredecessor = InsertPreheaderForLoop(L, this);
+  if (!LoopPredecessor)
+    return MadeChange;
+
+  SmallSet<BasicBlock *, 16> BBChanged;
+  for (unsigned i = 0, e = Buckets.size(); i != e; ++i) {
+    // The base address of each bucket is transformed into a phi and the others
+    // are rewritten as offsets of that variable.
+
+    const SCEVAddRecExpr *BasePtrSCEV =
+      cast<SCEVAddRecExpr>(Buckets[i].begin()->first);
+    if (!BasePtrSCEV->isAffine())
+      continue;
+
+    Instruction *MemI = Buckets[i].begin()->second;
+    Value *BasePtr = GetPointerOperand(MemI);
+    assert(BasePtr && "No pointer operand");
+
+    Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(),
+      BasePtr->getType()->getPointerAddressSpace());
+
+    const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart();
+    if (!SE->isLoopInvariant(BasePtrStartSCEV, L))
+      continue;
+
+    const SCEVConstant *BasePtrIncSCEV =
+      dyn_cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE));
+    if (!BasePtrIncSCEV)
+      continue;
+    BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV);
+    if (!isSafeToExpand(BasePtrStartSCEV, *SE))
+      continue;
+
+    PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount,
+      MemI->hasName() ? MemI->getName() + ".phi" : "",
+      Header->getFirstNonPHI());
+
+    SCEVExpander SCEVE(*SE, "pistart");
+    Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy,
+      LoopPredecessor->getTerminator());
+
+    // Note that LoopPredecessor might occur in the predecessor list multiple
+    // times, and we need to add it the right number of times.
+    for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+         PI != PE; ++PI) {
+      if (*PI != LoopPredecessor)
+        continue;
+
+      NewPHI->addIncoming(BasePtrStart, LoopPredecessor);
+    }
+
+    Instruction *InsPoint = Header->getFirstInsertionPt();
+    GetElementPtrInst *PtrInc =
+      GetElementPtrInst::Create(NewPHI, BasePtrIncSCEV->getValue(),
+        MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint);
+    PtrInc->setIsInBounds(IsPtrInBounds(BasePtr));
+    for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+         PI != PE; ++PI) {
+      if (*PI == LoopPredecessor)
+        continue;
+
+      NewPHI->addIncoming(PtrInc, *PI);
+    }
+
+    Instruction *NewBasePtr;
+    if (PtrInc->getType() != BasePtr->getType())
+      NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(),
+        PtrInc->hasName() ? PtrInc->getName() + ".cast" : "", InsPoint);
+    else
+      NewBasePtr = PtrInc;
+
+    if (Instruction *IDel = dyn_cast<Instruction>(BasePtr))
+      BBChanged.insert(IDel->getParent());
+    BasePtr->replaceAllUsesWith(NewBasePtr);
+    RecursivelyDeleteTriviallyDeadInstructions(BasePtr);
+
+    Value *LastNewPtr = NewBasePtr;
+    for (Bucket::iterator I = std::next(Buckets[i].begin()),
+         IE = Buckets[i].end(); I != IE; ++I) {
+      Value *Ptr = GetPointerOperand(I->second);
+      assert(Ptr && "No pointer operand");
+      if (Ptr == LastNewPtr)
+        continue;
+
+      Instruction *RealNewPtr;
+      const SCEVConstant *Diff =
+        cast<SCEVConstant>(SE->getMinusSCEV(I->first, BasePtrSCEV));
+      if (Diff->isZero()) {
+        RealNewPtr = NewBasePtr;
+      } else {
+        Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
+        if (PtrIP && isa<Instruction>(NewBasePtr) &&
+            cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
+          PtrIP = 0;
+        else if (isa<PHINode>(PtrIP))
+          PtrIP = PtrIP->getParent()->getFirstInsertionPt();
+        else if (!PtrIP)
+          PtrIP = I->second;
+  
+        GetElementPtrInst *NewPtr =
+          GetElementPtrInst::Create(PtrInc, Diff->getValue(),
+            I->second->hasName() ? I->second->getName() + ".off" : "", PtrIP);
+        if (!PtrIP)
+          NewPtr->insertAfter(cast<Instruction>(PtrInc));
+        NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
+        RealNewPtr = NewPtr;
+      }
+
+      if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
+        BBChanged.insert(IDel->getParent());
+
+      Instruction *ReplNewPtr;
+      if (Ptr->getType() != RealNewPtr->getType()) {
+        ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(),
+          Ptr->hasName() ? Ptr->getName() + ".cast" : "");
+        ReplNewPtr->insertAfter(RealNewPtr);
+      } else
+        ReplNewPtr = RealNewPtr;
+
+      Ptr->replaceAllUsesWith(ReplNewPtr);
+      RecursivelyDeleteTriviallyDeadInstructions(Ptr);
+
+      LastNewPtr = RealNewPtr;
+    }
+
+    MadeChange = true;
+  }
+
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I) {
+    if (BBChanged.count(*I))
+      DeleteDeadPHIs(*I);
+  }
+
+  return MadeChange;
+}
+
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index 880b520..819738b 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -13,8 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "PPC.h"
-#include "PPCSubtarget.h"
 #include "MCTargetDesc/PPCMCExpr.h"
+#include "PPCSubtarget.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
@@ -38,7 +38,7 @@ static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
 static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
   const TargetMachine &TM = AP.TM;
   Mangler *Mang = AP.Mang;
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   MCContext &Ctx = AP.OutContext;
   bool isDarwin = Triple(TM.getTargetTriple()).isOSDarwin();
 
@@ -137,12 +137,6 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
     case PPCII::MO_TLS:
       RefKind = MCSymbolRefExpr::VK_PPC_TLS;
       break;
-    case PPCII::MO_TLSGD:
-      RefKind = MCSymbolRefExpr::VK_PPC_TLSGD;
-      break;
-    case PPCII::MO_TLSLD:
-      RefKind = MCSymbolRefExpr::VK_PPC_TLSLD;
-      break;
   }
 
   if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && !isDarwin)
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index 4aff95a..dd896a9 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -18,7 +18,8 @@ using namespace llvm;
 void PPCFunctionInfo::anchor() { }
 
 MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const {
-  const DataLayout *DL = MF.getSubtarget().getDataLayout();
-  return MF.getContext().GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+
-    Twine(MF.getFunctionNumber())+"$poff");
+  const DataLayout *DL = MF.getTarget().getDataLayout();
+  return MF.getContext().GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix()) +
+                                           Twine(MF.getFunctionNumber()) +
+                                           "$poff");
 }
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 83de799..607cdf6 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -35,6 +35,9 @@ class PPCFunctionInfo : public MachineFunctionInfo {
   /// Frame index where the old base pointer is stored.
   int BasePointerSaveIndex;
 
+  /// Frame index where the old PIC base pointer is stored.
+  int PICBasePointerSaveIndex;
+
   /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
   /// function.  This is only valid after the initial scan of the function by
   /// PEI.
@@ -59,6 +62,9 @@ class PPCFunctionInfo : public MachineFunctionInfo {
   /// entry, even though LR may otherwise apparently not be used.
   bool LRStoreRequired;
 
+  /// This function makes use of the PPC64 ELF TOC base pointer (register r2).
+  bool UsesTOCBasePtr;
+
   /// MinReservedArea - This is the frame size that is at least reserved in a
   /// potential caller (parameter+linkage area).
   unsigned MinReservedArea;
@@ -103,11 +109,13 @@ public:
     : FramePointerSaveIndex(0),
       ReturnAddrSaveIndex(0),
       BasePointerSaveIndex(0),
+      PICBasePointerSaveIndex(0),
       HasSpills(false),
       HasNonRISpills(false),
       SpillsCR(false),
       SpillsVRSAVE(false),
       LRStoreRequired(false),
+      UsesTOCBasePtr(false),
       MinReservedArea(0),
       TailCallSPDelta(0),
       HasFastCall(false),
@@ -128,6 +136,9 @@ public:
   int getBasePointerSaveIndex() const { return BasePointerSaveIndex; }
   void setBasePointerSaveIndex(int Idx) { BasePointerSaveIndex = Idx; }
 
+  int getPICBasePointerSaveIndex() const { return PICBasePointerSaveIndex; }
+  void setPICBasePointerSaveIndex(int Idx) { PICBasePointerSaveIndex = Idx; }
+
   unsigned getMinReservedArea() const { return MinReservedArea; }
   void setMinReservedArea(unsigned size) { MinReservedArea = size; }
 
@@ -157,6 +168,9 @@ public:
   void setLRStoreRequired() { LRStoreRequired = true; }
   bool isLRStoreRequired() const { return LRStoreRequired; }
 
+  void setUsesTOCBasePtr()    { UsesTOCBasePtr = true; }
+  bool usesTOCBasePtr() const { return UsesTOCBasePtr; }
+
   void setHasFastCall() { HasFastCall = true; }
   bool hasFastCall() const { return HasFastCall;}
 
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 9b9966f..c9a9684 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -99,6 +99,14 @@ PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
 
 const MCPhysReg*
 PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) {
+    if (Subtarget.hasVSX())
+      return CSR_64_AllRegs_VSX_SaveList;
+    if (Subtarget.hasAltivec())
+      return CSR_64_AllRegs_Altivec_SaveList;
+    return CSR_64_AllRegs_SaveList;
+  }
+
   if (Subtarget.isDarwinABI())
     return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
                                   CSR_Darwin64_Altivec_SaveList :
@@ -107,9 +115,14 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
                                   CSR_Darwin32_Altivec_SaveList :
                                   CSR_Darwin32_SaveList);
 
+  // On PPC64, we might need to save r2 (but only if it is not reserved).
+  bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
+
   return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
-                                CSR_SVR464_Altivec_SaveList :
-                                CSR_SVR464_SaveList) :
+                                (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList :
+                                          CSR_SVR464_Altivec_SaveList) :
+                                (SaveR2 ? CSR_SVR464_R2_SaveList :
+                                          CSR_SVR464_SaveList)) :
                                (Subtarget.hasAltivec() ?
                                 CSR_SVR432_Altivec_SaveList :
                                 CSR_SVR432_SaveList);
@@ -117,6 +130,14 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 
 const uint32_t*
 PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  if (CC == CallingConv::AnyReg) {
+    if (Subtarget.hasVSX())
+      return CSR_64_AllRegs_VSX_RegMask;
+    if (Subtarget.hasAltivec())
+      return CSR_64_AllRegs_Altivec_RegMask;
+    return CSR_64_AllRegs_RegMask;
+  }
+
   if (Subtarget.isDarwinABI())
     return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
                                   CSR_Darwin64_Altivec_RegMask :
@@ -138,10 +159,18 @@ PPCRegisterInfo::getNoPreservedMask() const {
   return CSR_NoRegs_RegMask;
 }
 
+void PPCRegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
+  unsigned PseudoRegs[] = { PPC::ZERO, PPC::ZERO8, PPC::RM };
+  for (unsigned i = 0, ie = array_lengthof(PseudoRegs); i != ie; ++i) {
+    unsigned Reg = PseudoRegs[i];
+    Mask[Reg / 32] &= ~(1u << (Reg % 32));
+  }
+}
+
 BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
-  const PPCFrameLowering *PPCFI = static_cast<const PPCFrameLowering *>(
-      MF.getSubtarget().getFrameLowering());
+  const PPCFrameLowering *PPCFI =
+      static_cast<const PPCFrameLowering *>(Subtarget.getFrameLowering());
 
   // The ZERO register is not really a register, but the representation of r0
   // when used in instructions that treat r0 as the constant 0.
@@ -192,7 +221,16 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
 
     // The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
     if (Subtarget.isSVR4ABI()) {
-      Reserved.set(PPC::X2);
+      // We only reserve r2 if we need to use the TOC pointer. If we have no
+      // explicit uses of the TOC pointer (meaning we're a leaf function with
+      // no constant-pool loads, etc.) and we have no potential uses inside an
+      // inline asm block, then we can treat r2 has an ordinary callee-saved
+      // register.
+      const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+      if (FuncInfo->usesTOCBasePtr() || MF.hasInlineAsm())
+        Reserved.set(PPC::X2);
+      else
+        Reserved.reset(PPC::R2);
     }
   }
 
@@ -220,10 +258,9 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   return Reserved;
 }
 
-unsigned
-PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
-                                         MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                              MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
   const unsigned DefaultSafety = 1;
 
   switch (RC->getID()) {
@@ -238,6 +275,9 @@ PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
   case PPC::F8RCRegClassID:
   case PPC::F4RCRegClassID:
+  case PPC::QFRCRegClassID:
+  case PPC::QSRCRegClassID:
+  case PPC::QBRCRegClassID:
   case PPC::VRRCRegClassID:
   case PPC::VFRCRegClassID:
   case PPC::VSLRCRegClassID:
@@ -251,8 +291,8 @@ PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
 }
 
-const TargetRegisterClass*
-PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)const {
+const TargetRegisterClass *PPCRegisterInfo::getLargestLegalSuperClass(
+    const TargetRegisterClass *RC) const {
   if (Subtarget.hasVSX()) {
     // With VSX, we can inflate various sub-register classes to the full VSX
     // register set.
@@ -287,7 +327,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
   // Get the frame info.
   MachineFrameInfo *MFI = MF.getFrameInfo();
   // Get the instruction info.
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   // Determine whether 64-bit pointers are used.
   bool LP64 = Subtarget.isPPC64();
   DebugLoc dl = MI.getDebugLoc();
@@ -298,10 +338,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
   unsigned FrameSize = MFI->getStackSize();
   
   // Get stack alignments.
-  unsigned TargetAlign = MF.getTarget()
-                             .getSubtargetImpl()
-                             ->getFrameLowering()
-                             ->getStackAlignment();
+  unsigned TargetAlign = Subtarget.getFrameLowering()->getStackAlignment();
   unsigned MaxAlign = MFI->getMaxAlignment();
   assert((maxCallFrameSize & (MaxAlign-1)) == 0 &&
          "Maximum call-frame size not sufficiently aligned");
@@ -406,7 +443,7 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   bool LP64 = Subtarget.isPPC64();
@@ -450,7 +487,7 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   bool LP64 = Subtarget.isPPC64();
@@ -523,7 +560,7 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   bool LP64 = Subtarget.isPPC64();
@@ -566,7 +603,7 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II,
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   bool LP64 = Subtarget.isPPC64();
@@ -613,7 +650,7 @@ void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II,
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
@@ -638,7 +675,7 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II,
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
@@ -700,7 +737,10 @@ static unsigned getOffsetONFromFION(const MachineInstr &MI,
   // Take into account whether it's an add or mem instruction
   unsigned OffsetOperandNo = (FIOperandNum == 2) ? 1 : 2;
   if (MI.isInlineAsm())
-    OffsetOperandNo = FIOperandNum-1;
+    OffsetOperandNo = FIOperandNum - 1;
+  else if (MI.getOpcode() == TargetOpcode::STACKMAP ||
+           MI.getOpcode() == TargetOpcode::PATCHPOINT)
+    OffsetOperandNo = FIOperandNum + 1;
 
   return OffsetOperandNo;
 }
@@ -718,7 +758,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // Get the basic block's function.
   MachineFunction &MF = *MBB.getParent();
   // Get the instruction info.
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   // Get the frame info.
   MachineFrameInfo *MFI = MF.getFrameInfo();
   DebugLoc dl = MI.getDebugLoc();
@@ -772,7 +812,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
 
   // If the instruction is not present in ImmToIdxMap, then it has no immediate
   // form (and must be r+r).
-  bool noImmForm = !MI.isInlineAsm() && !ImmToIdxMap.count(OpC);
+  bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP &&
+                   OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC);
 
   // Now add the frame object offset to the offset from r1.
   int Offset = MFI->getObjectOffset(FrameIndex);
@@ -783,8 +824,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // to Offset to get the correct offset.
   // Naked functions have stack size 0, although getStackSize may not reflect that
   // because we didn't call all the pieces that compute it for naked functions.
-  if (!MF.getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::Naked)) {
+  if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) {
     if (!(hasBasePointer(MF) && FrameIndex < 0))
       Offset += MFI->getStackSize();
   }
@@ -796,8 +836,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // only "std" to a stack slot that is at least 4-byte aligned, but it can
   // happen in invalid code.
   assert(OpC != PPC::DBG_VALUE &&
-         "This should be handle in a target independent way");
-  if (!noImmForm && isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) {
+         "This should be handled in a target-independent way");
+  if (!noImmForm && ((isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) ||
+                     OpC == TargetOpcode::STACKMAP ||
+                     OpC == TargetOpcode::PATCHPOINT)) {
     MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
     return;
   }
@@ -843,7 +885,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
 }
 
 unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
 
   if (!Subtarget.isPPC64())
     return TFI->hasFP(MF) ? PPC::R31 : PPC::R1;
@@ -887,14 +929,9 @@ bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const {
 bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *F = MF.getFunction();
-  unsigned StackAlign = MF.getTarget()
-                            .getSubtargetImpl()
-                            ->getFrameLowering()
-                            ->getStackAlignment();
-  bool requiresRealignment =
-    ((MFI->getMaxAlignment() > StackAlign) ||
-     F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                     Attribute::StackAlignment));
+  unsigned StackAlign = Subtarget.getFrameLowering()->getStackAlignment();
+  bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
+                              F->hasFnAttribute(Attribute::StackAlignment));
 
   return requiresRealignment && canRealignStack(MF);
 }
@@ -928,8 +965,8 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   MachineBasicBlock &MBB = *MI->getParent();
   MachineFunction &MF = *MBB.getParent();
 
-  const PPCFrameLowering *PPCFI = static_cast<const PPCFrameLowering *>(
-      MF.getSubtarget().getFrameLowering());
+  const PPCFrameLowering *PPCFI =
+      static_cast<const PPCFrameLowering *>(Subtarget.getFrameLowering());
   unsigned StackEst =
     PPCFI->determineFrameLayout(MF, false, true);
 
@@ -963,7 +1000,7 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
     DL = Ins->getDebugLoc();
 
   const MachineFunction &MF = *MBB->getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   const MCInstrDesc &MCID = TII.get(ADDriOpc);
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
@@ -988,7 +1025,7 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
 
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   const MCInstrDesc &MCID = MI.getDesc();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   MRI.constrainRegClass(BaseReg,
@@ -1008,6 +1045,8 @@ bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
   Offset += MI->getOperand(OffsetOperandNo).getImm();
 
   return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
+         MI->getOpcode() == TargetOpcode::STACKMAP ||
+         MI->getOpcode() == TargetOpcode::PATCHPOINT ||
          (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0));
 }
 
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index c182f95..4c2ef90 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -49,6 +49,8 @@ public:
   const uint32_t *getCallPreservedMask(CallingConv::ID CC) const override;
   const uint32_t *getNoPreservedMask() const;
 
+  void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
+
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
   /// We require the register scavenger.
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index b3d145b..9a7df96 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -49,6 +49,13 @@ class FPR<bits<5> num, string n> : PPCReg<n> {
   let HWEncoding{4-0} = num;
 }
 
+// QFPR - One of the 32 256-bit floating-point vector registers (used for QPX)
+class QFPR<FPR SubReg, string n> : PPCReg<n> {
+  let HWEncoding = SubReg.HWEncoding;
+  let SubRegs = [SubReg];
+  let SubRegIndices = [sub_64];
+}
+
 // VF - One of the 32 64-bit floating-point subregisters of the vector
 // registers (used by VSX).
 class VF<bits<5> num, string n> : PPCReg<n> {
@@ -114,6 +121,12 @@ foreach Index = 0-31 in {
   def VF#Index : VF<Index, "vs" # !add(Index, 32)>;
 }
 
+// QPX Floating-point registers
+foreach Index = 0-31 in {
+  def QF#Index : QFPR<!cast<FPR>("F"#Index), "q"#Index>,
+                 DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
+}
+
 // Vector registers
 foreach Index = 0-31 in {
   def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
@@ -131,8 +144,8 @@ foreach Index = 0-31 in {
 }
 
 // The reprsentation of r0 when treated as the constant 0.
-def ZERO  : GPR<0, "0">;
-def ZERO8 : GP8<ZERO, "0">;
+def ZERO  : GPR<0, "0">,    DwarfRegAlias<R0>;
+def ZERO8 : GP8<ZERO, "0">, DwarfRegAlias<X0>;
 
 // Representations of the frame pointer used by ISD::FRAMEADDR.
 def FP   : GPR<0 /* arbitrary */, "**FRAME POINTER**">;
@@ -188,13 +201,6 @@ def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>;
 def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;
 }
 
-// The full condition-code register. This is not modeled fully, but defined
-// here primarily, for compatibility with gcc, to allow the inline asm "cc"
-// clobber specification to work.
-def CC : PPCReg<"cc">, DwarfRegAlias<CR0> {
-  let Aliases = [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7];
-}
-
 // Link register
 def LR  : SPR<8, "lr">, DwarfRegNum<[-2, 65]>;
 //let Aliases = [LR] in
@@ -210,7 +216,7 @@ def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>;
 // Carry bit.  In the architecture this is really bit 0 of the XER register
 // (which really is SPR register 1);  this is the only bit interesting to a
 // compiler.
-def CARRY: SPR<1, "ca">;
+def CARRY: SPR<1, "ca">, DwarfRegNum<[76]>;
 
 // FP rounding mode:  bits 30 and 31 of the FP status and control register
 // This is not allocated as a normal register; it appears only in
@@ -219,25 +225,57 @@ def CARRY: SPR<1, "ca">;
 // most registers, it has to be done in code; to make this work all the
 // return and call instructions are described as Uses of RM, so instructions
 // that do nothing but change RM will not get deleted.
-// Also, in the architecture it is not really a SPR; 512 is arbitrary.
-def RM: SPR<512, "**ROUNDING MODE**">;
+def RM: PPCReg<"**ROUNDING MODE**">;
 
 /// Register classes
 // Allocate volatiles first
 // then nonvolatiles in reverse order since stmw/lmw save from rN to r31
 def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12),
                                                 (sequence "R%u", 30, 13),
-                                                R31, R0, R1, FP, BP)>;
+                                                R31, R0, R1, FP, BP)> {
+  // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+  // put it at the end of the list.
+  let AltOrders = [(add (sub GPRC, R2), R2)];
+  let AltOrderSelect = [{
+    const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+    return S.isPPC64() && S.isSVR4ABI();
+  }];
+}
 
 def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12),
                                                 (sequence "X%u", 30, 14),
-                                                X31, X13, X0, X1, FP8, BP8)>;
+                                                X31, X13, X0, X1, FP8, BP8)> {
+  // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+  // put it at the end of the list.
+  let AltOrders = [(add (sub G8RC, X2), X2)];
+  let AltOrderSelect = [{
+    const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+    return S.isPPC64() && S.isSVR4ABI();
+  }];
+}
 
 // For some instructions r0 is special (representing the value 0 instead of
 // the value in the r0 register), and we use these register subclasses to
 // prevent r0 from being allocated for use by those instructions.
-def GPRC_NOR0 : RegisterClass<"PPC", [i32], 32, (add (sub GPRC, R0), ZERO)>;
-def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)>;
+def GPRC_NOR0 : RegisterClass<"PPC", [i32], 32, (add (sub GPRC, R0), ZERO)> {
+  // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+  // put it at the end of the list.
+  let AltOrders = [(add (sub GPRC_NOR0, R2), R2)];
+  let AltOrderSelect = [{
+    const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+    return S.isPPC64() && S.isSVR4ABI();
+  }];
+}
+
+def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)> {
+  // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+  // put it at the end of the list.
+  let AltOrders = [(add (sub G8RC_NOX0, X2), X2)];
+  let AltOrderSelect = [{
+    const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+    return S.isPPC64() && S.isSVR4ABI();
+  }];
+}
 
 // Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
 // ABI the size of the Floating-point register save area is determined by the
@@ -250,7 +288,7 @@ def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
                                                 (sequence "F%u", 31, 14))>;
 def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;
 
-def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v4f32], 128,
+def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v2i64,v4f32], 128,
                          (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
                              V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
                              V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
@@ -278,6 +316,16 @@ def VFRC :  RegisterClass<"PPC", [f64], 64,
                                VF22, VF21, VF20)>;
 def VSFRC : RegisterClass<"PPC", [f64], 64, (add F8RC, VFRC)>;
 
+// For QPX
+def QFRC : RegisterClass<"PPC", [v4f64], 256, (add (sequence "QF%u", 0, 13),
+                                                (sequence "QF%u", 31, 14))>;
+def QSRC : RegisterClass<"PPC", [v4f32], 128, (add QFRC)>;
+def QBRC : RegisterClass<"PPC", [v4i1], 256, (add QFRC)> {
+  // These are actually stored as floating-point values where a positive
+  // number is true and anything else (including NaN) is false.
+  let Size = 256;
+}
+
 def CRBITRC : RegisterClass<"PPC", [i1], 32,
   (add CR2LT, CR2GT, CR2EQ, CR2UN,
        CR3LT, CR3GT, CR3EQ, CR3UN,
@@ -308,7 +356,3 @@ def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
   let CopyCost = -1;
 }
 
-def CCRC : RegisterClass<"PPC", [i32], 32, (add CC)> {
-  let isAllocatable = 0;
-}
-
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index 7f80121..2f3a1f9 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -13,6 +13,7 @@
 def IIC_IntSimple    : InstrItinClass;
 def IIC_IntGeneral   : InstrItinClass;
 def IIC_IntCompare   : InstrItinClass;
+def IIC_IntISEL      : InstrItinClass;
 def IIC_IntDivD      : InstrItinClass;
 def IIC_IntDivW      : InstrItinClass;
 def IIC_IntMFFS      : InstrItinClass;
@@ -119,6 +120,7 @@ include "PPCScheduleG4.td"
 include "PPCScheduleG4Plus.td"
 include "PPCScheduleG5.td"
 include "PPCScheduleP7.td"
+include "PPCScheduleP8.td"
 include "PPCScheduleA2.td"
 include "PPCScheduleE500mc.td"
 include "PPCScheduleE5500.td"
@@ -216,6 +218,7 @@ include "PPCScheduleE5500.td"
 //    fsub       IIC_FPAddSub
 //    fsubs      IIC_FPGeneral
 //    icbi       IIC_LdStICBI
+//    isel       IIC_IntISEL
 //    isync      IIC_SprISYNC
 //    lbz        IIC_LdStLoad
 //    lbzu       IIC_LdStLoadUpd
diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td
index 218fed2..04a43bc 100644
--- a/lib/Target/PowerPC/PPCSchedule440.td
+++ b/lib/Target/PowerPC/PPCSchedule440.td
@@ -121,6 +121,14 @@ def PPC440Itineraries : ProcessorItineraries<
                                 [2, 0, 0],
                                 [P440_GPR_Bypass,
                                  P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntISEL,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass, NoBypass]>,
   InstrItinData<IIC_IntCompare, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
                                  InstrStage<1, [P440_IRACC, P440_LRACC]>,
                                  InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index 1447696..21a357a 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -29,6 +29,8 @@ def PPCA2Itineraries : ProcessorItineraries<
                                  [1, 0, 0]>,
   InstrItinData<IIC_IntGeneral,  [InstrStage<1, [A2_XU]>],
                                  [2, 0, 0]>,
+  InstrItinData<IIC_IntISEL,     [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
   InstrItinData<IIC_IntCompare,  [InstrStage<1, [A2_XU]>],
                                  [2, 0, 0]>,
   InstrItinData<IIC_IntDivW,     [InstrStage<1, [A2_XU]>],
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
index dab89e3..36b8517 100644
--- a/lib/Target/PowerPC/PPCScheduleE500mc.td
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -54,6 +54,12 @@ def PPCE500mcItineraries : ProcessorItineraries<
                                  [4, 1, 1], // Latency = 1
                                  [E500_GPR_Bypass,
                                   E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntISEL,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1, 1], // Latency = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass,
+                                  E500_CR_Bypass]>,
   InstrItinData<IIC_IntCompare,  [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
                                   InstrStage<1, [E500_SFX0, E500_SFX1]>],
                                  [5, 1, 1], // Latency = 1 or 2
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
index de097d9..7c2693e 100644
--- a/lib/Target/PowerPC/PPCScheduleE5500.td
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -58,6 +58,12 @@ def PPCE5500Itineraries : ProcessorItineraries<
                                  [5, 2, 2], // Latency = 1
                                  [E5500_GPR_Bypass,
                                   E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntISEL,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5, 2, 2, 2], // Latency = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass,
+                                  E5500_CR_Bypass]>,
   InstrItinData<IIC_IntCompare,  [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
                                   InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
                                  [6, 2, 2], // Latency = 1 or 2
diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td
index d3e4269..635d154 100644
--- a/lib/Target/PowerPC/PPCScheduleP7.td
+++ b/lib/Target/PowerPC/PPCScheduleP7.td
@@ -89,6 +89,10 @@ def P7Itineraries : ProcessorItineraries<
                                                   P7_DU3, P7_DU4], 0>,
                                    InstrStage<1, [P7_FX1, P7_FX2]>],
                                   [1, 1, 1]>,
+  InstrItinData<IIC_IntISEL,      [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2], 0>,
+                                   InstrStage<1, [P7_BRU]>],
+                                  [1, 1, 1, 1]>,
   InstrItinData<IIC_IntCompare  , [InstrStage<1, [P7_DU1, P7_DU2,
                                                   P7_DU3, P7_DU4], 0>,
                                    InstrStage<1, [P7_FX1, P7_FX2]>],
@@ -380,6 +384,9 @@ def P7Model : SchedMachineModel {
                        // Itineraries are queried instead.
   let MispredictPenalty = 16;
 
+  // Try to make sure we have at least 10 dispatch groups in a loop.
+  let LoopMicroOpBufferSize = 40;
+
   let Itineraries = P7Itineraries;
 }
 
diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td
new file mode 100644
index 0000000..020739b
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleP8.td
@@ -0,0 +1,401 @@
+//===-- PPCScheduleP8.td - PPC P8 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the POWER8 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// Scheduling for the P8 involves tracking two types of resources:
+//  1. The dispatch bundle slots
+//  2. The functional unit resources
+
+// Dispatch units:
+def P8_DU1    : FuncUnit;
+def P8_DU2    : FuncUnit;
+def P8_DU3    : FuncUnit;
+def P8_DU4    : FuncUnit;
+def P8_DU5    : FuncUnit;
+def P8_DU6    : FuncUnit;
+def P8_DU7    : FuncUnit; // Only branch instructions will use DU7,DU8
+def P8_DU8    : FuncUnit;
+
+// 10 insns per cycle (2-LU, 2-LSU, 2-FXU, 2-FPU, 1-CRU, 1-BRU).
+
+def P8_LU1     : FuncUnit; // Loads or fixed-point operations 1
+def P8_LU2     : FuncUnit; // Loads or fixed-point operations 2
+
+// Load/Store pipelines can handle Stores, fixed-point loads, and simple
+// fixed-point operations.
+def P8_LSU1    : FuncUnit; // Load/Store pipeline 1
+def P8_LSU2    : FuncUnit; // Load/Store pipeline 2
+
+// Fixed Point unit
+def P8_FXU1    : FuncUnit; // FX pipeline 1
+def P8_FXU2    : FuncUnit; // FX pipeline 2
+
+// The Floating-Point Unit (FPU) and Vector Media Extension (VMX) units
+// are combined on P7 and newer into a Vector Scalar Unit (VSU).
+// The P8 Instruction latency documents still refers to the unit as the
+// FPU, so keep in mind that FPU==VSU.
+// In contrast to the P7, the VMX units on P8 are symmetric, so no need to
+// split vector integer ops or 128-bit load/store/perms to the specific units.
+def P8_FPU1    : FuncUnit; // VS pipeline 1
+def P8_FPU2    : FuncUnit; // VS pipeline 2
+
+def P8_CRU    : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
+def P8_BRU    : FuncUnit; // BR unit
+
+def P8Itineraries : ProcessorItineraries<
+  [P8_DU1, P8_DU2, P8_DU3, P8_DU4, P8_DU5, P8_DU6, P8_DU7, P8_DU8,
+   P8_LU1, P8_LU2, P8_LSU1, P8_LSU2, P8_FXU1, P8_FXU2,
+   P8_FPU1, P8_FPU2, P8_CRU, P8_BRU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2,
+                                                  P8_LU1, P8_LU2,
+                                                  P8_LSU1, P8_LSU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2, P8_LU1,
+                                                  P8_LU2, P8_LSU1, P8_LSU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntISEL,      [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2], 0>,
+                                   InstrStage<1, [P8_BRU]>],
+                                  [1, 1, 1, 1]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntDivW     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<15, [P8_FXU1, P8_FXU2]>],
+                                  [15, 1, 1]>,
+  InstrItinData<IIC_IntDivD     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<23, [P8_FXU1, P8_FXU2]>],
+                                  [23, 1, 1]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                   [1, 1, 1]>,
+  InstrItinData<IIC_IntRotateD  , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                   [1, 1, 1]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [1, 1]>,
+  InstrItinData<IIC_IntTrapD    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [1, 1]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [P8_DU7, P8_DU8], 0>,
+                                   InstrStage<1, [P8_BRU]>],
+                                  [3, 1, 1]>,
+  // FIXME - the Br* groups below are not branch related, so should probably
+  // be renamed.
+  // IIC_BrCR consists of the cr* instructions.  (crand,crnor,creqv, etc).
+  // and should be 'First' in dispatch.
+  InstrItinData<IIC_BrCR        , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_CRU]>],
+                                  [3, 1, 1]>,
+  // IIC_BrMCR consists of the mcrf instruction.
+  InstrItinData<IIC_BrMCR       , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_CRU]>],
+                                  [3, 1, 1]>,
+  // IIC_BrMCRX consists of mcrxr (obsolete instruction) and mtcrf, which
+  // should be first in the dispatch group.
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 1]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2 ], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [2, 2, 1, 1]>,
+  // Update-Indexed form loads/stores are no longer first and last in the
+  // dispatch group.  They are simply cracked, so require DU1,DU2.
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLD      , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_LdStLDU     , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [2, 2, 1, 1]>,
+  InstrItinData<IIC_LdStLDUX    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLVecX   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 4, 1, 1]>,
+  // first+last in dispatch group.
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_DU5], 0>,
+                                   InstrStage<1, [P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 4, 1, 1]>,
+  InstrItinData<IIC_LdStLWA     , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLWARX,    [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [3, 1, 1]>,
+  // first+last
+  InstrItinData<IIC_LdStLDARX,    [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_DU5], 0>,
+                                   InstrStage<1, [P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLMW     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [2, 1, 1]>,
+// Stores are dual-issued from the issue queue, so may only take up one
+// dispatch slot.  The instruction will be broken into two IOPS. The agen
+// op is issued to the LSU, and the data op (register fetch) is issued
+// to either the LU (GPR store) or the VSU (FPR store).
+  InstrItinData<IIC_LdStStore   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2]>,
+                                   InstrStage<1, [P8_LU1, P8_LU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTD     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2,
+                                                  P8_LSU1, P8_LSU2]>]
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTDU    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2,
+                                                  P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [2, 1, 1, 1]>,
+  // First+last
+  InstrItinData<IIC_LdStSTDUX   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_DU5], 0>,
+                                   InstrStage<1, [P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [2, 1, 1, 1]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [2, 1, 1, 1]>,
+  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTDCX   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_DU5], 0>,
+                                   InstrStage<1, [P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_DU5], 0>,
+                                   InstrStage<1, [P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_SprMFCR     , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_CRU]>],
+                                  [6, 1]>,
+  InstrItinData<IIC_SprMFCRF    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_CRU]>],
+                                  [3, 1]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 1]>, // mtctr
+  InstrItinData<IIC_FPGeneral   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [5, 1, 1]>,
+  InstrItinData<IIC_FPCompare   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [8, 1, 1]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [33, 1, 1]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [27, 1, 1]>,
+  InstrItinData<IIC_FPSqrtD     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [44, 1, 1]>,
+  InstrItinData<IIC_FPSqrtS     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [32, 1, 1]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [5, 1, 1, 1]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [5, 1, 1]>,
+  InstrItinData<IIC_VecGeneral  , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecVSL      , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecVSR      , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecFP       , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecFPRound  , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecComplex  , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [7, 1, 1]>,
+  InstrItinData<IIC_VecPerm     , [InstrStage<1, [P8_DU1, P8_DU2], 0>,
+                                   InstrStage<1, [P8_FPU2, P8_FPU2]>],
+                                  [3, 1, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// P8 machine model for scheduling and other instruction cost heuristics.
+// P8 has an 8 insn dispatch group (6 non-branch, 2 branch) and can issue up
+// to 10 insns per cycle (2-LU, 2-LSU, 2-FXU, 2-FPU, 1-CRU, 1-BRU).
+
+def P8Model : SchedMachineModel {
+  let IssueWidth = 8;  // up to 8 instructions dispatched per cycle.
+                       // up to six non-branch instructions.
+                       // up to two branches in a dispatch group.
+
+  let MinLatency = 0;  // Out-of-order dispatch.
+  let LoadLatency = 3; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 16;
+
+  // Try to make sure we have at least 10 dispatch groups in a loop.
+  let LoopMicroOpBufferSize = 60;
+
+  let Itineraries = P8Itineraries;
+}
+
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 04e7ec6..c91428d 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -14,11 +14,13 @@
 #include "PPCSubtarget.h"
 #include "PPC.h"
 #include "PPCRegisterInfo.h"
+#include "PPCTargetMachine.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetMachine.h"
@@ -32,39 +34,12 @@ using namespace llvm;
 #define GET_SUBTARGETINFO_CTOR
 #include "PPCGenSubtargetInfo.inc"
 
-/// Return the datalayout string of a subtarget.
-static std::string getDataLayoutString(const Triple &T) {
-  bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
-  std::string Ret;
-
-  // Most PPC* platforms are big endian, PPC64LE is little endian.
-  if (T.getArch() == Triple::ppc64le)
-    Ret = "e";
-  else
-    Ret = "E";
-
-  Ret += DataLayout::getManglingComponent(T);
-
-  // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
-  // pointers.
-  if (!is64Bit || T.getOS() == Triple::Lv2)
-    Ret += "-p:32:32";
-
-  // Note, the alignment values for f64 and i64 on ppc64 in Darwin
-  // documentation are wrong; these are correct (i.e. "what gcc does").
-  if (is64Bit || !T.isOSDarwin())
-    Ret += "-i64:64";
-  else
-    Ret += "-f64:32:64";
-
-  // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
-  if (is64Bit)
-    Ret += "-n32:64";
-  else
-    Ret += "-n32";
-
-  return Ret;
-}
+static cl::opt<bool> UseSubRegLiveness("ppc-track-subreg-liveness",
+cl::desc("Enable subregister liveness tracking for PPC"), cl::Hidden);
+
+static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned",
+  cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"),
+  cl::Hidden);
 
 PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                             StringRef FS) {
@@ -76,12 +51,10 @@ PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
 PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
                            const std::string &FS, const PPCTargetMachine &TM)
     : PPCGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT),
-      DL(getDataLayoutString(TargetTriple)),
       IsPPC64(TargetTriple.getArch() == Triple::ppc64 ||
               TargetTriple.getArch() == Triple::ppc64le),
-      TargetABI(PPC_ABI_UNKNOWN),
-      FrameLowering(initializeSubtargetDependencies(CPU, FS)), InstrInfo(*this),
-      TLInfo(TM), TSInfo(&DL) {}
+      TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, FS)),
+      InstrInfo(*this), TLInfo(TM, *this), TSInfo(TM.getDataLayout()) {}
 
 void PPCSubtarget::initializeEnvironment() {
   StackAlignment = 16;
@@ -95,6 +68,7 @@ void PPCSubtarget::initializeEnvironment() {
   HasQPX = false;
   HasVSX = false;
   HasP8Vector = false;
+  HasP8Altivec = false;
   HasFCPSGN = false;
   HasFSQRT = false;
   HasFRE = false;
@@ -108,6 +82,7 @@ void PPCSubtarget::initializeEnvironment() {
   HasFPCVT = false;
   HasISEL = false;
   HasPOPCNTD = false;
+  HasCMPB = false;
   HasLDBRX = false;
   IsBookE = false;
   HasOnlyMSYNC = false;
@@ -117,13 +92,21 @@ void PPCSubtarget::initializeEnvironment() {
   DeprecatedMFTB = false;
   DeprecatedDST = false;
   HasLazyResolverStubs = false;
+  HasICBT = false;
+  HasInvariantFunctionDescriptors = false;
+  IsQPXStackUnaligned = false;
 }
 
 void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   // Determine default and user specified characteristics
   std::string CPUName = CPU;
-  if (CPUName.empty())
-    CPUName = "generic";
+  if (CPUName.empty()) {
+    // If cross-compiling with -march=ppc64le without -mcpu
+    if (TargetTriple.getArch() == Triple::ppc64le)
+      CPUName = "ppc64le";
+    else
+      CPUName = "generic";
+  }
 #if (defined(__APPLE__) || defined(__linux__)) && \
     (defined(__ppc__) || defined(__powerpc__))
   if (CPUName == "generic")
@@ -148,35 +131,18 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   // QPX requires a 32-byte aligned stack. Note that we need to do this if
   // we're compiling for a BG/Q system regardless of whether or not QPX
   // is enabled because external functions will assume this alignment.
-  if (hasQPX() || isBGQ())
-    StackAlignment = 32;
+  IsQPXStackUnaligned = QPXStackUnaligned;
+  StackAlignment = getPlatformStackAlignment();
 
   // Determine endianness.
+  // FIXME: Part of the TargetMachine.
   IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le);
-
-  // FIXME: For now, we disable VSX in little-endian mode until endian
-  // issues in those instructions can be addressed.
-  if (IsLittleEndian) {
-    HasVSX = false;
-    HasP8Vector = false;
-  }
-
-  // Determine default ABI.
-  if (TargetABI == PPC_ABI_UNKNOWN) {
-    if (!isDarwin() && IsPPC64) {
-      if (IsLittleEndian)
-        TargetABI = PPC_ABI_ELFv2;
-      else
-        TargetABI = PPC_ABI_ELFv1;
-    }
-  }
 }
 
 /// hasLazyResolverStub - Return true if accesses to the specified global have
 /// to go through a dyld lazy resolution stub.  This means that an extra load
 /// is required to get the address of the global.
-bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV,
-                                       const TargetMachine &TM) const {
+bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
   // We never have stubs if HasLazyResolverStubs=false or if in static mode.
   if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static)
     return false;
@@ -240,3 +206,9 @@ bool PPCSubtarget::useAA() const {
   return needsAggressiveScheduling(DarwinDirective);
 }
 
+bool PPCSubtarget::enableSubRegLiveness() const {
+  return UseSubRegLiveness;
+}
+
+bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); }
+bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); }
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 1df19c3..247a96d 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -15,8 +15,8 @@
 #define LLVM_LIB_TARGET_POWERPC_PPCSUBTARGET_H
 
 #include "PPCFrameLowering.h"
-#include "PPCInstrInfo.h"
 #include "PPCISelLowering.h"
+#include "PPCInstrInfo.h"
 #include "PPCSelectionDAGInfo.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/DataLayout.h"
@@ -68,9 +68,6 @@ protected:
   /// TargetTriple - What processor and OS we're targeting.
   Triple TargetTriple;
 
-  // Calculates type size & alignment
-  const DataLayout DL;
-
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned StackAlignment;
@@ -92,6 +89,7 @@ protected:
   bool HasQPX;
   bool HasVSX;
   bool HasP8Vector;
+  bool HasP8Altivec;
   bool HasFCPSGN;
   bool HasFSQRT;
   bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
@@ -102,6 +100,7 @@ protected:
   bool HasFPCVT;
   bool HasISEL;
   bool HasPOPCNTD;
+  bool HasCMPB;
   bool HasLDBRX;
   bool IsBookE;
   bool HasOnlyMSYNC;
@@ -112,13 +111,15 @@ protected:
   bool DeprecatedDST;
   bool HasLazyResolverStubs;
   bool IsLittleEndian;
+  bool HasICBT;
+  bool HasInvariantFunctionDescriptors;
 
-  enum {
-    PPC_ABI_UNKNOWN,
-    PPC_ABI_ELFv1,
-    PPC_ABI_ELFv2
-  } TargetABI;
+  /// When targeting QPX running a stock PPC64 Linux kernel where the stack
+  /// alignment has not been changed, we need to keep the 16-byte alignment
+  /// of the stack.
+  bool IsQPXStackUnaligned;
 
+  const PPCTargetMachine &TM;
   PPCFrameLowering FrameLowering;
   PPCInstrInfo InstrInfo;
   PPCTargetLowering TLInfo;
@@ -153,7 +154,6 @@ public:
   const PPCFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
-  const DataLayout *getDataLayout() const override { return &DL; }
   const PPCInstrInfo *getInstrInfo() const override { return &InstrInfo; }
   const PPCTargetLowering *getTargetLowering() const override {
     return &TLInfo;
@@ -164,6 +164,7 @@ public:
   const PPCRegisterInfo *getRegisterInfo() const override {
     return &getInstrInfo()->getRegisterInfo();
   }
+  const PPCTargetMachine &getTargetMachine() const { return TM; }
 
   /// initializeSubtargetDependencies - Initializes using a CPU and feature string
   /// so that we can use initializer lists for subtarget initialization.
@@ -176,7 +177,7 @@ private:
 public:
   /// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
   ///
-  bool isPPC64() const { return IsPPC64; }
+  bool isPPC64() const;
 
   /// has64BitSupport - Return true if the selected CPU supports 64-bit
   /// instructions, regardless of whether we are in 32-bit or 64-bit mode.
@@ -194,8 +195,7 @@ public:
   /// hasLazyResolverStub - Return true if accesses to the specified global have
   /// to go through a dyld lazy resolution stub.  This means that an extra load
   /// is required to get the address of the global.
-  bool hasLazyResolverStub(const GlobalValue *GV,
-                           const TargetMachine &TM) const;
+  bool hasLazyResolverStub(const GlobalValue *GV) const;
 
   // isLittleEndian - True if generating little-endian code
   bool isLittleEndian() const { return IsLittleEndian; }
@@ -217,9 +217,11 @@ public:
   bool hasQPX() const { return HasQPX; }
   bool hasVSX() const { return HasVSX; }
   bool hasP8Vector() const { return HasP8Vector; }
+  bool hasP8Altivec() const { return HasP8Altivec; }
   bool hasMFOCRF() const { return HasMFOCRF; }
   bool hasISEL() const { return HasISEL; }
   bool hasPOPCNTD() const { return HasPOPCNTD; }
+  bool hasCMPB() const { return HasCMPB; }
   bool hasLDBRX() const { return HasLDBRX; }
   bool isBookE() const { return IsBookE; }
   bool hasOnlyMSYNC() const { return HasOnlyMSYNC; }
@@ -228,6 +230,18 @@ public:
   bool isE500() const { return IsE500; }
   bool isDeprecatedMFTB() const { return DeprecatedMFTB; }
   bool isDeprecatedDST() const { return DeprecatedDST; }
+  bool hasICBT() const { return HasICBT; }
+  bool hasInvariantFunctionDescriptors() const {
+    return HasInvariantFunctionDescriptors;
+  }
+
+  bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; }
+  unsigned getPlatformStackAlignment() const {
+    if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned())
+      return 32;
+
+    return 16;
+  }
 
   const Triple &getTargetTriple() const { return TargetTriple; }
 
@@ -239,9 +253,9 @@ public:
   bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
   bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
 
-  bool isDarwinABI() const { return isDarwin(); }
-  bool isSVR4ABI() const { return !isDarwin(); }
-  bool isELFv2ABI() const { return TargetABI == PPC_ABI_ELFv2; }
+  bool isDarwinABI() const { return isTargetMachO() || isDarwin(); }
+  bool isSVR4ABI() const { return !isDarwinABI(); }
+  bool isELFv2ABI() const;
 
   bool enableEarlyIfConversion() const override { return hasISEL(); }
 
@@ -257,6 +271,8 @@ public:
                            MachineInstr *end,
                            unsigned NumRegionInstrs) const override;
   bool useAA() const override;
+
+  bool enableSubRegLiveness() const override;
 };
 } // End llvm namespace
 
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
new file mode 100644
index 0000000..270fc71
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -0,0 +1,168 @@
+//===---------- PPCTLSDynamicCall.cpp - TLS Dynamic Call Fixup ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands ADDItls{ld,gd}LADDR[32] machine instructions into
+// separate ADDItls[gd]L[32] and GETtlsADDR[32] instructions, both of
+// which define GPR3.  A copy is added from GPR3 to the target virtual
+// register of the original instruction.  The GETtlsADDR[32] is really
+// a call instruction, so its target register is constrained to be GPR3.
+// This is not true of ADDItls[gd]L[32], but there is a legacy linker
+// optimization bug that requires the target register of the addi of
+// a local- or general-dynamic TLS access sequence to be GPR3.
+//
+// This is done in a late pass so that TLS variable accesses can be
+// fully commoned by MachineCSE.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-tls-dynamic-call"
+
+namespace llvm {
+  void initializePPCTLSDynamicCallPass(PassRegistry&);
+}
+
+namespace {
+  struct PPCTLSDynamicCall : public MachineFunctionPass {
+    static char ID;
+    PPCTLSDynamicCall() : MachineFunctionPass(ID) {
+      initializePPCTLSDynamicCallPass(*PassRegistry::getPassRegistry());
+    }
+
+    const PPCInstrInfo *TII;
+    LiveIntervals *LIS;
+
+protected:
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+      bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64();
+
+      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+           I != IE; ++I) {
+        MachineInstr *MI = I;
+
+        if (MI->getOpcode() != PPC::ADDItlsgdLADDR &&
+            MI->getOpcode() != PPC::ADDItlsldLADDR &&
+            MI->getOpcode() != PPC::ADDItlsgdLADDR32 &&
+            MI->getOpcode() != PPC::ADDItlsldLADDR32)
+          continue;
+
+        DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n    " << *MI;);
+
+        unsigned OutReg = MI->getOperand(0).getReg();
+        unsigned InReg  = MI->getOperand(1).getReg();
+        DebugLoc DL = MI->getDebugLoc();
+        unsigned GPR3 = Is64Bit ? PPC::X3 : PPC::R3;
+        unsigned Opc1, Opc2;
+        SmallVector<unsigned, 4> OrigRegs;
+        OrigRegs.push_back(OutReg);
+        OrigRegs.push_back(InReg);
+        OrigRegs.push_back(GPR3);
+
+        switch (MI->getOpcode()) {
+        default:
+          llvm_unreachable("Opcode inconsistency error");
+        case PPC::ADDItlsgdLADDR:
+          Opc1 = PPC::ADDItlsgdL;
+          Opc2 = PPC::GETtlsADDR;
+          break;
+        case PPC::ADDItlsldLADDR:
+          Opc1 = PPC::ADDItlsldL;
+          Opc2 = PPC::GETtlsldADDR;
+          break;
+        case PPC::ADDItlsgdLADDR32:
+          Opc1 = PPC::ADDItlsgdL32;
+          Opc2 = PPC::GETtlsADDR32;
+          break;
+        case PPC::ADDItlsldLADDR32:
+          Opc1 = PPC::ADDItlsldL32;
+          Opc2 = PPC::GETtlsldADDR32;
+          break;
+        }
+
+        // Expand into two ops built prior to the existing instruction.
+        MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3)
+          .addReg(InReg);
+        Addi->addOperand(MI->getOperand(2));
+
+        // The ADDItls* instruction is the first instruction in the
+        // repair range.
+        MachineBasicBlock::iterator First = I;
+        --First;
+
+        MachineInstr *Call = (BuildMI(MBB, I, DL, TII->get(Opc2), GPR3)
+                              .addReg(GPR3));
+        Call->addOperand(MI->getOperand(3));
+
+        BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg)
+          .addReg(GPR3);
+
+        // The COPY is the last instruction in the repair range.
+        MachineBasicBlock::iterator Last = I;
+        --Last;
+
+        // Move past the original instruction and remove it.
+        ++I;
+        MI->removeFromParent();
+
+        // Repair the live intervals.
+        LIS->repairIntervalsInRange(&MBB, First, Last, OrigRegs);
+        Changed = true;
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo();
+      LIS = &getAnalysis<LiveIntervals>();
+
+      bool Changed = false;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LiveIntervals>();
+      AU.addPreserved<LiveIntervals>();
+      AU.addRequired<SlotIndexes>();
+      AU.addPreserved<SlotIndexes>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS_BEGIN(PPCTLSDynamicCall, DEBUG_TYPE,
+                      "PowerPC TLS Dynamic Call Fixup", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(PPCTLSDynamicCall, DEBUG_TYPE,
+                    "PowerPC TLS Dynamic Call Fixup", false, false)
+
+char PPCTLSDynamicCall::ID = 0;
+FunctionPass*
+llvm::createPPCTLSDynamicCallPass() { return new PPCTLSDynamicCall(); }
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index f15189c..b219e93 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -12,26 +12,42 @@
 //===----------------------------------------------------------------------===//
 
 #include "PPCTargetMachine.h"
-#include "PPCTargetObjectFile.h"
 #include "PPC.h"
+#include "PPCTargetObjectFile.h"
+#include "PPCTargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
 using namespace llvm;
 
 static cl::
 opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
                         cl::desc("Disable CTR loops for PPC"));
 
+static cl::
+opt<bool> DisablePreIncPrep("disable-ppc-preinc-prep", cl::Hidden,
+                            cl::desc("Disable PPC loop preinc prep"));
+
 static cl::opt<bool>
 VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early",
   cl::Hidden, cl::desc("Schedule VSX FMA instruction mutation early"));
 
+static cl::opt<bool>
+EnableGEPOpt("ppc-gep-opt", cl::Hidden,
+             cl::desc("Enable optimizations on complex GEPs"),
+             cl::init(true));
+
+static cl::opt<bool>
+EnablePrefetch("enable-ppc-prefetching",
+                  cl::desc("disable software prefetching on PPC"),
+                  cl::init(false), cl::Hidden);
+
 extern "C" void LLVMInitializePowerPCTarget() {
   // Register the targets
   RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
@@ -39,6 +55,40 @@ extern "C" void LLVMInitializePowerPCTarget() {
   RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget);
 }
 
+/// Return the datalayout string of a subtarget.
+static std::string getDataLayoutString(const Triple &T) {
+  bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
+  std::string Ret;
+
+  // Most PPC* platforms are big endian, PPC64LE is little endian.
+  if (T.getArch() == Triple::ppc64le)
+    Ret = "e";
+  else
+    Ret = "E";
+
+  Ret += DataLayout::getManglingComponent(T);
+
+  // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
+  // pointers.
+  if (!is64Bit || T.getOS() == Triple::Lv2)
+    Ret += "-p:32:32";
+
+  // Note, the alignment values for f64 and i64 on ppc64 in Darwin
+  // documentation are wrong; these are correct (i.e. "what gcc does").
+  if (is64Bit || !T.isOSDarwin())
+    Ret += "-i64:64";
+  else
+    Ret += "-f64:32:64";
+
+  // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
+  if (is64Bit)
+    Ret += "-n32:64";
+  else
+    Ret += "-n32";
+
+  return Ret;
+}
+
 static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL, StringRef TT) {
   std::string FullFS = FS;
   Triple TargetTriple(TT);
@@ -58,6 +108,14 @@ static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL, String
     else
       FullFS = "+crbits";
   }
+
+  if (OL != CodeGenOpt::None) {
+     if (!FullFS.empty())
+      FullFS = "+invariant-function-descriptors," + FullFS;
+    else
+      FullFS = "+invariant-function-descriptors";
+  }
+
   return FullFS;
 }
 
@@ -70,6 +128,30 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
   return make_unique<PPC64LinuxTargetObjectFile>();
 }
 
+static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
+                                                 const TargetOptions &Options) {
+  if (Options.MCOptions.getABIName().startswith("elfv1"))
+    return PPCTargetMachine::PPC_ABI_ELFv1;
+  else if (Options.MCOptions.getABIName().startswith("elfv2"))
+    return PPCTargetMachine::PPC_ABI_ELFv2;
+
+  assert(Options.MCOptions.getABIName().empty() &&
+	 "Unknown target-abi option!");
+
+  if (!TT.isMacOSX()) {
+    switch (TT.getArch()) {
+    case Triple::ppc64le:
+      return PPCTargetMachine::PPC_ABI_ELFv2;
+    case Triple::ppc64:
+      return PPCTargetMachine::PPC_ABI_ELFv1;
+    default:
+      // Fallthrough.
+      ;
+    }
+  }
+  return PPCTargetMachine::PPC_ABI_UNKNOWN;
+}
+
 // The FeatureString here is a little subtle. We are modifying the feature string
 // with what are (currently) non-function specific overrides as it goes into the
 // LLVMTargetMachine constructor and then using the stored value in the
@@ -81,7 +163,8 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU,
     : LLVMTargetMachine(T, TT, CPU, computeFSAdditions(FS, OL, TT), Options, RM,
                         CM, OL),
       TLOF(createTLOF(Triple(getTargetTriple()))),
-      Subtarget(TT, CPU, TargetFS, *this) {
+      TargetABI(computeTargetABI(Triple(TT), Options)),
+      DL(getDataLayoutString(Triple(TT))), Subtarget(TT, CPU, TargetFS, *this) {
   initAsmInfo();
 }
 
@@ -109,11 +192,8 @@ PPC64TargetMachine::PPC64TargetMachine(const Target &T, StringRef TT,
 
 const PPCSubtarget *
 PPCTargetMachine::getSubtargetImpl(const Function &F) const {
-  AttributeSet FnAttrs = F.getAttributes();
-  Attribute CPUAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
-  Attribute FSAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
 
   std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
                         ? CPUAttr.getValueAsString().str()
@@ -148,17 +228,13 @@ public:
     return getTM<PPCTargetMachine>();
   }
 
-  const PPCSubtarget &getPPCSubtarget() const {
-    return *getPPCTargetMachine().getSubtargetImpl();
-  }
-
   void addIRPasses() override;
   bool addPreISel() override;
   bool addILPOpts() override;
   bool addInstSelector() override;
-  bool addPreRegAlloc() override;
-  bool addPreSched2() override;
-  bool addPreEmitPass() override;
+  void addPreRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
 };
 } // namespace
 
@@ -168,10 +244,37 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
 
 void PPCPassConfig::addIRPasses() {
   addPass(createAtomicExpandPass(&getPPCTargetMachine()));
+
+  // For the BG/Q (or if explicitly requested), add explicit data prefetch
+  // intrinsics.
+  bool UsePrefetching =
+    Triple(TM->getTargetTriple()).getVendor() == Triple::BGQ &&           
+    getOptLevel() != CodeGenOpt::None;
+  if (EnablePrefetch.getNumOccurrences() > 0)
+    UsePrefetching = EnablePrefetch;
+  if (UsePrefetching)
+    addPass(createPPCLoopDataPrefetchPass());
+
+  if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
+    // Call SeparateConstOffsetFromGEP pass to extract constants within indices
+    // and lower a GEP with multiple indices to either arithmetic operations or
+    // multiple GEPs with single index.
+    addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+    // Call EarlyCSE pass to find and remove subexpressions in the lowered
+    // result.
+    addPass(createEarlyCSEPass());
+    // Do loop invariant code motion in case part of the lowered result is
+    // invariant.
+    addPass(createLICMPass());
+  }
+
   TargetPassConfig::addIRPasses();
 }
 
 bool PPCPassConfig::addPreISel() {
+  if (!DisablePreIncPrep && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
+
   if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
     addPass(createPPCCTRLoops(getPPCTargetMachine()));
 
@@ -196,35 +299,27 @@ bool PPCPassConfig::addInstSelector() {
   return false;
 }
 
-bool PPCPassConfig::addPreRegAlloc() {
+void PPCPassConfig::addPreRegAlloc() {
   initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
   insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
              &PPCVSXFMAMutateID);
-  return false;
+  if (getPPCTargetMachine().getRelocationModel() == Reloc::PIC_)
+    addPass(createPPCTLSDynamicCallPass());
 }
 
-bool PPCPassConfig::addPreSched2() {
-  addPass(createPPCVSXCopyCleanupPass());
-
+void PPCPassConfig::addPreSched2() {
   if (getOptLevel() != CodeGenOpt::None)
     addPass(&IfConverterID);
-
-  return true;
 }
 
-bool PPCPassConfig::addPreEmitPass() {
+void PPCPassConfig::addPreEmitPass() {
   if (getOptLevel() != CodeGenOpt::None)
-    addPass(createPPCEarlyReturnPass());
+    addPass(createPPCEarlyReturnPass(), false);
   // Must run branch selection immediately preceding the asm printer.
-  addPass(createPPCBranchSelectionPass());
-  return false;
+  addPass(createPPCBranchSelectionPass(), false);
 }
 
-void PPCTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  // Add first the target-independent BasicTTI pass, then our PPC pass. This
-  // allows the PPC pass to delegate to the target independent layer when
-  // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(this));
-  PM.add(createPPCTargetTransformInfoPass(this));
+TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis(
+      [this](Function &F) { return TargetTransformInfo(PPCTTIImpl(this, F)); });
 }
-
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 5095d73..6508484 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -24,30 +24,41 @@ namespace llvm {
 /// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets.
 ///
 class PPCTargetMachine : public LLVMTargetMachine {
+public:
+  enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 };
+private:
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  PPCABI TargetABI;
+  // Calculates type size & alignment
+  const DataLayout DL;
   PPCSubtarget Subtarget;
 
   mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap;
 
 public:
-  PPCTargetMachine(const Target &T, StringRef TT,
-                   StringRef CPU, StringRef FS, const TargetOptions &Options,
-                   Reloc::Model RM, CodeModel::Model CM,
-                   CodeGenOpt::Level OL);
+  PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+                   const TargetOptions &Options, Reloc::Model RM,
+                   CodeModel::Model CM, CodeGenOpt::Level OL);
 
   ~PPCTargetMachine() override;
 
+  const DataLayout *getDataLayout() const override { return &DL; }
   const PPCSubtarget *getSubtargetImpl() const override { return &Subtarget; }
   const PPCSubtarget *getSubtargetImpl(const Function &F) const override;
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
-  /// \brief Register PPC analysis passes with a pass manager.
-  void addAnalysisPasses(PassManagerBase &PM) override;
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
+  bool isELFv2ABI() const { return TargetABI == PPC_ABI_ELFv2; }
+  bool isPPC64() const {
+    Triple TT(getTargetTriple());
+    return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le);
+  };
 };
 
 /// PPC32TargetMachine - PowerPC 32-bit target machine.
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 37624ed..073bbb0 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1,4 +1,4 @@
-//===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===//
+//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,17 +6,10 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-/// \file
-/// This file implements a TargetTransformInfo analysis pass specific to the
-/// PPC target machine. It uses the target's detailed information to provide
-/// more precise answers to certain TTI queries, while letting the target
-/// independent and default TTI implementations handle the rest.
-///
-//===----------------------------------------------------------------------===//
 
-#include "PPC.h"
-#include "PPCTargetMachine.h"
+#include "PPCTargetTransformInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
@@ -28,115 +21,23 @@ using namespace llvm;
 static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
 cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
 
-// Declare the pass initialization routine locally as target-specific passes
-// don't have a target-wide initialization entry point, and so we rely on the
-// pass constructor initialization.
-namespace llvm {
-void initializePPCTTIPass(PassRegistry &);
-}
-
-namespace {
-
-class PPCTTI final : public ImmutablePass, public TargetTransformInfo {
-  const TargetMachine *TM;
-  const PPCSubtarget *ST;
-  const PPCTargetLowering *TLI;
-
-public:
-  PPCTTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
-    llvm_unreachable("This pass cannot be directly constructed");
-  }
-
-  PPCTTI(const PPCTargetMachine *TM)
-      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
-        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
-    initializePPCTTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override {
-    pushTTIStack(this);
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    TargetTransformInfo::getAnalysisUsage(AU);
-  }
-
-  /// Pass identification.
-  static char ID;
-
-  /// Provide necessary pointer adjustments for the two base classes.
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo*)this;
-    return this;
-  }
-
-  /// \name Scalar TTI Implementations
-  /// @{
-  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
-
-  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
-                         Type *Ty) const override;
-  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
-                         Type *Ty) const override;
-
-  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
-  void getUnrollingPreferences(const Function *F, Loop *L,
-                               UnrollingPreferences &UP) const override;
-
-  /// @}
-
-  /// \name Vector TTI Implementations
-  /// @{
-
-  unsigned getNumberOfRegisters(bool Vector) const override;
-  unsigned getRegisterBitWidth(bool Vector) const override;
-  unsigned getMaxInterleaveFactor() const override;
-  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
-                                  OperandValueKind, OperandValueProperties,
-                                  OperandValueProperties) const override;
-  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
-                          int Index, Type *SubTp) const override;
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
-                            Type *Src) const override;
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                              Type *CondTy) const override;
-  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
-                              unsigned Index) const override;
-  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) const override;
-
-  /// @}
-};
-
-} // end anonymous namespace
-
-INITIALIZE_AG_PASS(PPCTTI, TargetTransformInfo, "ppctti",
-                   "PPC Target Transform Info", true, true, false)
-char PPCTTI::ID = 0;
-
-ImmutablePass *
-llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) {
-  return new PPCTTI(TM);
-}
-
-
 //===----------------------------------------------------------------------===//
 //
 // PPC cost model.
 //
 //===----------------------------------------------------------------------===//
 
-PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
+TargetTransformInfo::PopcntSupportKind
+PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
   if (ST->hasPOPCNTD() && TyWidth <= 64)
-    return PSK_FastHardware;
-  return PSK_Software;
+    return TTI::PSK_FastHardware;
+  return TTI::PSK_Software;
 }
 
-unsigned PPCTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
   if (DisablePPCConstHoist)
-    return TargetTransformInfo::getIntImmCost(Imm, Ty);
+    return BaseT::getIntImmCost(Imm, Ty);
 
   assert(Ty->isIntegerTy());
 
@@ -145,28 +46,28 @@ unsigned PPCTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
     return ~0U;
 
   if (Imm == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;
 
   if (Imm.getBitWidth() <= 64) {
     if (isInt<16>(Imm.getSExtValue()))
-      return TCC_Basic;
+      return TTI::TCC_Basic;
 
     if (isInt<32>(Imm.getSExtValue())) {
       // A constant that can be materialized using lis.
       if ((Imm.getZExtValue() & 0xFFFF) == 0)
-        return TCC_Basic;
+        return TTI::TCC_Basic;
 
-      return 2 * TCC_Basic;
+      return 2 * TTI::TCC_Basic;
     }
   }
 
-  return 4 * TCC_Basic;
+  return 4 * TTI::TCC_Basic;
 }
 
-unsigned PPCTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
-                               const APInt &Imm, Type *Ty) const {
+unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                   const APInt &Imm, Type *Ty) {
   if (DisablePPCConstHoist)
-    return TargetTransformInfo::getIntImmCost(IID, Idx, Imm, Ty);
+    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);
 
   assert(Ty->isIntegerTy());
 
@@ -175,22 +76,32 @@ unsigned PPCTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
     return ~0U;
 
   switch (IID) {
-  default: return TCC_Free;
+  default:
+    return TTI::TCC_Free;
   case Intrinsic::sadd_with_overflow:
   case Intrinsic::uadd_with_overflow:
   case Intrinsic::ssub_with_overflow:
   case Intrinsic::usub_with_overflow:
     if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
-      return TCC_Free;
+      return TTI::TCC_Free;
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
     break;
   }
-  return PPCTTI::getIntImmCost(Imm, Ty);
+  return PPCTTIImpl::getIntImmCost(Imm, Ty);
 }
 
-unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
-                               Type *Ty) const {
+unsigned PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                   const APInt &Imm, Type *Ty) {
   if (DisablePPCConstHoist)
-    return TargetTransformInfo::getIntImmCost(Opcode, Idx, Imm, Ty);
+    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);
 
   assert(Ty->isIntegerTy());
 
@@ -202,14 +113,15 @@ unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
        ZeroFree = false;
   switch (Opcode) {
-  default: return TCC_Free;
+  default:
+    return TTI::TCC_Free;
   case Instruction::GetElementPtr:
     // Always hoist the base address of a GetElementPtr. This prevents the
     // creation of new constants for every base constant that gets constant
     // folded with the offset.
     if (Idx == 0)
-      return 2 * TCC_Basic;
-    return TCC_Free;
+      return 2 * TTI::TCC_Basic;
+    return TTI::TCC_Free;
   case Instruction::And:
     RunFree = true; // (for the rotate-and-mask instructions)
     // Fallthrough...
@@ -241,52 +153,54 @@ unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   }
 
   if (ZeroFree && Imm == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;
 
   if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
     if (isInt<16>(Imm.getSExtValue()))
-      return TCC_Free;
+      return TTI::TCC_Free;
 
     if (RunFree) {
       if (Imm.getBitWidth() <= 32 &&
           (isShiftedMask_32(Imm.getZExtValue()) ||
            isShiftedMask_32(~Imm.getZExtValue())))
-        return TCC_Free;
-
+        return TTI::TCC_Free;
 
       if (ST->isPPC64() &&
           (isShiftedMask_64(Imm.getZExtValue()) ||
            isShiftedMask_64(~Imm.getZExtValue())))
-        return TCC_Free;
+        return TTI::TCC_Free;
     }
 
     if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
-      return TCC_Free;
+      return TTI::TCC_Free;
 
     if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
-      return TCC_Free;
+      return TTI::TCC_Free;
   }
 
-  return PPCTTI::getIntImmCost(Imm, Ty);
+  return PPCTTIImpl::getIntImmCost(Imm, Ty);
 }
 
-void PPCTTI::getUnrollingPreferences(const Function *F, Loop *L,
-                                     UnrollingPreferences &UP) const {
-  if (TM->getSubtarget<PPCSubtarget>(F).getDarwinDirective() == PPC::DIR_A2) {
+void PPCTTIImpl::getUnrollingPreferences(Loop *L,
+                                         TTI::UnrollingPreferences &UP) {
+  if (ST->getDarwinDirective() == PPC::DIR_A2) {
     // The A2 is in-order with a deep pipeline, and concatenation unrolling
     // helps expose latency-hiding opportunities to the instruction scheduler.
     UP.Partial = UP.Runtime = true;
   }
+
+  BaseT::getUnrollingPreferences(L, UP);
 }
 
-unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
-  if (Vector && !ST->hasAltivec())
+unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
+  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
     return 0;
   return ST->hasVSX() ? 64 : 32;
 }
 
-unsigned PPCTTI::getRegisterBitWidth(bool Vector) const {
+unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
   if (Vector) {
+    if (ST->hasQPX()) return 256;
     if (ST->hasAltivec()) return 128;
     return 0;
   }
@@ -297,7 +211,7 @@ unsigned PPCTTI::getRegisterBitWidth(bool Vector) const {
 
 }
 
-unsigned PPCTTI::getMaxInterleaveFactor() const {
+unsigned PPCTTIImpl::getMaxInterleaveFactor() {
   unsigned Directive = ST->getDarwinDirective();
   // The 440 has no SIMD support, but floating-point instructions
   // have a 5-cycle latency, so unroll by 5x for latency hiding.
@@ -313,40 +227,46 @@ unsigned PPCTTI::getMaxInterleaveFactor() const {
   if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
     return 1;
 
+  // For P7 and P8, floating-point instructions have a 6-cycle latency and
+  // there are two execution units, so unroll by 12x for latency hiding.
+  if (Directive == PPC::DIR_PWR7 ||
+      Directive == PPC::DIR_PWR8)
+    return 12;
+
   // For most things, modern systems have two execution units (and
   // out-of-order execution).
   return 2;
 }
 
-unsigned PPCTTI::getArithmeticInstrCost(
-    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
-    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
-    OperandValueProperties Opd2PropInfo) const {
+unsigned PPCTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 
   // Fallback to the default implementation.
-  return TargetTransformInfo::getArithmeticInstrCost(
-      Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                       Opd1PropInfo, Opd2PropInfo);
 }
 
-unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
-                                Type *SubTp) const {
-  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+unsigned PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                                    Type *SubTp) {
+  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
-unsigned PPCTTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
+unsigned PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 
-  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+  return BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
-unsigned PPCTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                    Type *CondTy) const {
-  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+unsigned PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                        Type *CondTy) {
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
-unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                    unsigned Index) const {
+unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                        unsigned Index) {
   assert(Val->isVectorTy() && "This must be a vector type");
 
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -357,7 +277,13 @@ unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
     if (Index == 0)
       return 0;
 
-    return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
+    // Floating point scalars are already located in index #0.
+    if (Index == 0)
+      return 0;
+
+    return BaseT::getVectorInstrCost(Opcode, Val, Index);
   }
 
   // Estimated cost of a load-hit-store delay.  This was obtained
@@ -374,21 +300,20 @@ unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
   // these need to be estimated as very costly.
   if (ISD == ISD::EXTRACT_VECTOR_ELT ||
       ISD == ISD::INSERT_VECTOR_ELT)
-    return LHSPenalty +
-      TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);
 
-  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+  return BaseT::getVectorInstrCost(Opcode, Val, Index);
 }
 
-unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                 unsigned AddressSpace) const {
+unsigned PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                     unsigned Alignment,
+                                     unsigned AddressSpace) {
   // Legalize the type.
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
          "Invalid Opcode");
 
-  unsigned Cost =
-    TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+  unsigned Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
 
   // VSX loads/stores support unaligned access.
   if (ST->hasVSX()) {
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
new file mode 100644
index 0000000..cef7079
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -0,0 +1,103 @@
+//===-- PPCTargetTransformInfo.h - PPC specific TTI -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// PPC target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
+
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
+  typedef BasicTTIImplBase<PPCTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const PPCSubtarget *ST;
+  const PPCTargetLowering *TLI;
+
+  const PPCSubtarget *getST() const { return ST; }
+  const PPCTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit PPCTTIImpl(const PPCTargetMachine *TM, Function &F)
+      : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
+
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  PPCTTIImpl(const PPCTTIImpl &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+  PPCTTIImpl(PPCTTIImpl &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+        TLI(std::move(Arg.TLI)) {}
+  PPCTTIImpl &operator=(const PPCTTIImpl &RHS) {
+    BaseT::operator=(static_cast<const BaseT &>(RHS));
+    ST = RHS.ST;
+    TLI = RHS.TLI;
+    return *this;
+  }
+  PPCTTIImpl &operator=(PPCTTIImpl &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    ST = std::move(RHS.ST);
+    TLI = std::move(RHS.TLI);
+    return *this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  using BaseT::getIntImmCost;
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty);
+
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty);
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty);
+
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector);
+  unsigned getMaxInterleaveFactor();
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                          Type *SubTp);
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace);
+
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/PPCVSXCopy.cpp b/lib/Target/PowerPC/PPCVSXCopy.cpp
new file mode 100644
index 0000000..5e3ae2a
--- /dev/null
+++ b/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -0,0 +1,176 @@
+//===-------------- PPCVSXCopy.cpp - VSX Copy Legalization ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass which deals with the complexity of generating legal VSX register
+// copies to/from register classes which partially overlap with the VSX
+// register file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCHazardRecognizers.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-vsx-copy"
+
+namespace llvm {
+  void initializePPCVSXCopyPass(PassRegistry&);
+}
+
+namespace {
+  // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers
+  // (Altivec and scalar floating-point registers), we need to transform the
+  // copies into subregister copies with other restrictions.
+  struct PPCVSXCopy : public MachineFunctionPass {
+    static char ID;
+    PPCVSXCopy() : MachineFunctionPass(ID) {
+      initializePPCVSXCopyPass(*PassRegistry::getPassRegistry());
+    }
+
+    const TargetInstrInfo *TII;
+
+    bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC,
+                      MachineRegisterInfo &MRI) {
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        return RC->hasSubClassEq(MRI.getRegClass(Reg));
+      } else if (RC->contains(Reg)) {
+        return true;
+      }
+
+      return false;
+    }
+
+    bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+      return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI);
+    }
+
+    bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) {
+      return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI);
+    }
+
+    bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) {
+      return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
+    }
+
+protected:
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+
+      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+           I != IE; ++I) {
+        MachineInstr *MI = I;
+        if (!MI->isFullCopy())
+          continue;
+
+        MachineOperand &DstMO = MI->getOperand(0);
+        MachineOperand &SrcMO = MI->getOperand(1);
+
+        if ( IsVSReg(DstMO.getReg(), MRI) &&
+            !IsVSReg(SrcMO.getReg(), MRI)) {
+          // This is a copy *to* a VSX register from a non-VSX register.
+          Changed = true;
+
+          const TargetRegisterClass *SrcRC =
+            IsVRReg(SrcMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
+                                           &PPC::VSLRCRegClass;
+          assert((IsF8Reg(SrcMO.getReg(), MRI) ||
+                  IsVRReg(SrcMO.getReg(), MRI)) &&
+                 "Unknown source for a VSX copy");
+
+          unsigned NewVReg = MRI.createVirtualRegister(SrcRC);
+          BuildMI(MBB, MI, MI->getDebugLoc(),
+                  TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
+            .addImm(1) // add 1, not 0, because there is no implicit clearing
+                       // of the high bits.
+            .addOperand(SrcMO)
+            .addImm(IsVRReg(SrcMO.getReg(), MRI) ? PPC::sub_128 :
+                                                   PPC::sub_64);
+
+          // The source of the original copy is now the new virtual register.
+          SrcMO.setReg(NewVReg);
+        } else if (!IsVSReg(DstMO.getReg(), MRI) &&
+                    IsVSReg(SrcMO.getReg(), MRI)) {
+          // This is a copy *from* a VSX register to a non-VSX register.
+          Changed = true;
+
+          const TargetRegisterClass *DstRC =
+            IsVRReg(DstMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
+                                           &PPC::VSLRCRegClass;
+          assert((IsF8Reg(DstMO.getReg(), MRI) ||
+                  IsVRReg(DstMO.getReg(), MRI)) &&
+                 "Unknown destination for a VSX copy");
+
+          // Copy the VSX value into a new VSX register of the correct subclass.
+          unsigned NewVReg = MRI.createVirtualRegister(DstRC);
+          BuildMI(MBB, MI, MI->getDebugLoc(),
+                  TII->get(TargetOpcode::COPY), NewVReg)
+            .addOperand(SrcMO);
+
+          // Transform the original copy into a subregister extraction copy.
+          SrcMO.setReg(NewVReg);
+          SrcMO.setSubReg(IsVRReg(DstMO.getReg(), MRI) ? PPC::sub_128 :
+                                                         PPC::sub_64);
+        }
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      // If we don't have VSX on the subtarget, don't do anything.
+      const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+      if (!STI.hasVSX())
+        return false;
+      TII = STI.getInstrInfo();
+
+      bool Changed = false;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE,
+                "PowerPC VSX Copy Legalization", false, false)
+
+char PPCVSXCopy::ID = 0;
+FunctionPass*
+llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); }
+
diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
new file mode 100644
index 0000000..f352fa6
--- /dev/null
+++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -0,0 +1,335 @@
+//===--------------- PPCVSXFMAMutate.cpp - VSX FMA Mutation ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass mutates the form of VSX FMA instructions to avoid unnecessary
+// copies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisableVSXFMAMutate("disable-ppc-vsx-fma-mutation",
+cl::desc("Disable VSX FMA instruction mutation"), cl::Hidden);
+
+#define DEBUG_TYPE "ppc-vsx-fma-mutate"
+
+namespace llvm { namespace PPC {
+  int getAltVSXFMAOpcode(uint16_t Opcode);
+} }
+
+namespace {
+  // PPCVSXFMAMutate pass - For copies between VSX registers and non-VSX registers
+  // (Altivec and scalar floating-point registers), we need to transform the
+  // copies into subregister copies with other restrictions.
+  struct PPCVSXFMAMutate : public MachineFunctionPass {
+    static char ID;
+    PPCVSXFMAMutate() : MachineFunctionPass(ID) {
+      initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+    }
+
+    LiveIntervals *LIS;
+    const PPCInstrInfo *TII;
+
+protected:
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+
+      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+      const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+           I != IE; ++I) {
+        MachineInstr *MI = I;
+
+        // The default (A-type) VSX FMA form kills the addend (it is taken from
+        // the target register, which is then updated to reflect the result of
+        // the FMA). If the instruction, however, kills one of the registers
+        // used for the product, then we can use the M-form instruction (which
+        // will take that value from the to-be-defined register).
+
+        int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
+        if (AltOpc == -1)
+          continue;
+
+        // This pass is run after register coalescing, and so we're looking for
+        // a situation like this:
+        //   ...
+        //   %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+        //                         %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+        //   ...
+        //   %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
+        //                         %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
+        //   ...
+        // Where we can eliminate the copy by changing from the A-type to the
+        // M-type instruction. Specifically, for this example, this means:
+        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+        //                         %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+        // is replaced by:
+        //   %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg18, %vreg9,
+        //                         %RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
+        // and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+
+        SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
+
+        VNInfo *AddendValNo =
+          LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn();
+        MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def);
+
+        // The addend and this instruction must be in the same block.
+
+        if (!AddendMI || AddendMI->getParent() != MI->getParent())
+          continue;
+
+        // The addend must be a full copy within the same register class.
+
+        if (!AddendMI->isFullCopy())
+          continue;
+
+        unsigned AddendSrcReg = AddendMI->getOperand(1).getReg();
+        if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) {
+          if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) !=
+              MRI.getRegClass(AddendSrcReg))
+            continue;
+        } else {
+          // If AddendSrcReg is a physical register, make sure the destination
+          // register class contains it.
+          if (!MRI.getRegClass(AddendMI->getOperand(0).getReg())
+                ->contains(AddendSrcReg))
+            continue;
+        }
+
+        // In theory, there could be other uses of the addend copy before this
+        // fma.  We could deal with this, but that would require additional
+        // logic below and I suspect it will not occur in any relevant
+        // situations.  Additionally, check whether the copy source is killed
+        // prior to the fma.  In order to replace the addend here with the
+        // source of the copy, it must still be live here.  We can't use
+        // interval testing for a physical register, so as long as we're
+        // walking the MIs we may as well test liveness here.
+        bool OtherUsers = false, KillsAddendSrc = false;
+        for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
+             J != JE; --J) {
+          if (J->readsVirtualRegister(AddendMI->getOperand(0).getReg())) {
+            OtherUsers = true;
+            break;
+          }
+          if (J->modifiesRegister(AddendSrcReg, TRI) ||
+              J->killsRegister(AddendSrcReg, TRI)) {
+            KillsAddendSrc = true;
+            break;
+          }
+        }
+
+        if (OtherUsers || KillsAddendSrc)
+          continue;
+
+        // Find one of the product operands that is killed by this instruction.
+
+        unsigned KilledProdOp = 0, OtherProdOp = 0;
+        if (LIS->getInterval(MI->getOperand(2).getReg())
+                     .Query(FMAIdx).isKill()) {
+          KilledProdOp = 2;
+          OtherProdOp  = 3;
+        } else if (LIS->getInterval(MI->getOperand(3).getReg())
+                     .Query(FMAIdx).isKill()) {
+          KilledProdOp = 3;
+          OtherProdOp  = 2;
+        }
+
+        // If there are no killed product operands, then this transformation is
+        // likely not profitable.
+        if (!KilledProdOp)
+          continue;
+
+        // For virtual registers, verify that the addend source register
+        // is live here (as should have been assured above).
+        assert((!TargetRegisterInfo::isVirtualRegister(AddendSrcReg) ||
+                LIS->getInterval(AddendSrcReg).liveAt(FMAIdx)) &&
+               "Addend source register is not live!");
+
+        // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3.
+
+        unsigned AddReg = AddendMI->getOperand(1).getReg();
+        unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg();
+        unsigned OtherProdReg  = MI->getOperand(OtherProdOp).getReg();
+
+        unsigned AddSubReg = AddendMI->getOperand(1).getSubReg();
+        unsigned KilledProdSubReg = MI->getOperand(KilledProdOp).getSubReg();
+        unsigned OtherProdSubReg  = MI->getOperand(OtherProdOp).getSubReg();
+
+        bool AddRegKill = AddendMI->getOperand(1).isKill();
+        bool KilledProdRegKill = MI->getOperand(KilledProdOp).isKill();
+        bool OtherProdRegKill  = MI->getOperand(OtherProdOp).isKill();
+
+        bool AddRegUndef = AddendMI->getOperand(1).isUndef();
+        bool KilledProdRegUndef = MI->getOperand(KilledProdOp).isUndef();
+        bool OtherProdRegUndef  = MI->getOperand(OtherProdOp).isUndef();
+
+        unsigned OldFMAReg = MI->getOperand(0).getReg();
+
+        // The transformation doesn't work well with things like:
+        //    %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
+        // so leave such things alone.
+        if (OldFMAReg == KilledProdReg)
+          continue;
+
+        assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
+               "Addend copy not tied to old FMA output!");
+
+        DEBUG(dbgs() << "VSX FMA Mutation:\n    " << *MI;);
+
+        MI->getOperand(0).setReg(KilledProdReg);
+        MI->getOperand(1).setReg(KilledProdReg);
+        MI->getOperand(3).setReg(AddReg);
+        MI->getOperand(2).setReg(OtherProdReg);
+
+        MI->getOperand(0).setSubReg(KilledProdSubReg);
+        MI->getOperand(1).setSubReg(KilledProdSubReg);
+        MI->getOperand(3).setSubReg(AddSubReg);
+        MI->getOperand(2).setSubReg(OtherProdSubReg);
+
+        MI->getOperand(1).setIsKill(KilledProdRegKill);
+        MI->getOperand(3).setIsKill(AddRegKill);
+        MI->getOperand(2).setIsKill(OtherProdRegKill);
+
+        MI->getOperand(1).setIsUndef(KilledProdRegUndef);
+        MI->getOperand(3).setIsUndef(AddRegUndef);
+        MI->getOperand(2).setIsUndef(OtherProdRegUndef);
+
+        MI->setDesc(TII->get(AltOpc));
+
+        DEBUG(dbgs() << " -> " << *MI);
+
+        // The killed product operand was killed here, so we can reuse it now
+        // for the result of the fma.
+
+        LiveInterval &FMAInt = LIS->getInterval(OldFMAReg);
+        VNInfo *FMAValNo = FMAInt.getVNInfoAt(FMAIdx.getRegSlot());
+        for (auto UI = MRI.reg_nodbg_begin(OldFMAReg), UE = MRI.reg_nodbg_end();
+             UI != UE;) {
+          MachineOperand &UseMO = *UI;
+          MachineInstr *UseMI = UseMO.getParent();
+          ++UI;
+
+          // Don't replace the result register of the copy we're about to erase.
+          if (UseMI == AddendMI)
+            continue;
+
+          UseMO.setReg(KilledProdReg);
+          UseMO.setSubReg(KilledProdSubReg);
+        }
+
+        // Extend the live intervals of the killed product operand to hold the
+        // fma result.
+
+        LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg);
+        for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end();
+             AI != AE; ++AI) {
+          // Don't add the segment that corresponds to the original copy.
+          if (AI->valno == AddendValNo)
+            continue;
+
+          VNInfo *NewFMAValNo =
+            NewFMAInt.getNextValue(AI->start,
+                                   LIS->getVNInfoAllocator());
+
+          NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end,
+                                                     NewFMAValNo));
+        }
+        DEBUG(dbgs() << "  extended: " << NewFMAInt << '\n');
+
+        FMAInt.removeValNo(FMAValNo);
+        DEBUG(dbgs() << "  trimmed:  " << FMAInt << '\n');
+
+        // Remove the (now unused) copy.
+
+        DEBUG(dbgs() << "  removing: " << *AddendMI << '\n');
+        LIS->RemoveMachineInstrFromMaps(AddendMI);
+        AddendMI->eraseFromParent();
+
+        Changed = true;
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      // If we don't have VSX then go ahead and return without doing
+      // anything.
+      const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+      if (!STI.hasVSX())
+        return false;
+
+      LIS = &getAnalysis<LiveIntervals>();
+
+      TII = STI.getInstrInfo();
+
+      bool Changed = false;
+
+      if (DisableVSXFMAMutate)
+        return Changed;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LiveIntervals>();
+      AU.addPreserved<LiveIntervals>();
+      AU.addRequired<SlotIndexes>();
+      AU.addPreserved<SlotIndexes>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE,
+                      "PowerPC VSX FMA Mutation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE,
+                    "PowerPC VSX FMA Mutation", false, false)
+
+char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID;
+
+char PPCVSXFMAMutate::ID = 0;
+FunctionPass*
+llvm::createPPCVSXFMAMutatePass() { return new PPCVSXFMAMutate(); }
+
+
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
index 514f840..4132b04 100644
--- a/lib/Target/PowerPC/README.txt
+++ b/lib/Target/PowerPC/README.txt
@@ -5,38 +5,6 @@ TODO:
 
 ===-------------------------------------------------------------------------===
 
-On PPC64, this:
-
-long f2 (long x) { return 0xfffffff000000000UL; }
-long f3 (long x) { return 0x1ffffffffUL; }
-
-could compile into:
-
-_f2:
-	li r3,-1
-	rldicr r3,r3,0,27
-	blr
-_f3:
-	li r3,-1
-	rldicl r3,r3,0,31
-	blr
-
-we produce:
-
-_f2:
-	lis r2, 4095
-	ori r2, r2, 65535
-	sldi r3, r2, 36
-	blr 
-_f3:
-	li r2, 1
-	sldi r2, r2, 32
-	oris r2, r2, 65535
-	ori r3, r2, 65535
-	blr 
-
-===-------------------------------------------------------------------------===
-
 This code:
 
 unsigned add32carry(unsigned sum, unsigned x) {
@@ -63,40 +31,6 @@ Ick.
 
 ===-------------------------------------------------------------------------===
 
-Support 'update' load/store instructions.  These are cracked on the G5, but are
-still a codesize win.
-
-With preinc enabled, this:
-
-long *%test4(long *%X, long *%dest) {
-        %Y = getelementptr long* %X, int 4
-        %A = load long* %Y
-        store long %A, long* %dest
-        ret long* %Y
-}
-
-compiles to:
-
-_test4:
-        mr r2, r3
-        lwzu r5, 32(r2)
-        lwz r3, 36(r3)
-        stw r5, 0(r4)
-        stw r3, 4(r4)
-        mr r3, r2
-        blr 
-
-with -sched=list-burr, I get:
-
-_test4:
-        lwz r2, 36(r3)
-        lwzu r5, 32(r3)
-        stw r2, 4(r4)
-        stw r5, 0(r4)
-        blr 
-
-===-------------------------------------------------------------------------===
-
 We compile the hottest inner loop of viterbi to:
 
         li r6, 0
@@ -184,33 +118,6 @@ http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 
 ===-------------------------------------------------------------------------===
 
-Compile offsets from allocas:
-
-int *%test() {
-        %X = alloca { int, int }
-        %Y = getelementptr {int,int}* %X, int 0, uint 1
-        ret int* %Y
-}
-
-into a single add, not two:
-
-_test:
-        addi r2, r1, -8
-        addi r3, r2, 4
-        blr
-
---> important for C++.
-
-===-------------------------------------------------------------------------===
-
-No loads or stores of the constants should be needed:
-
-struct foo { double X, Y; };
-void xxx(struct foo F);
-void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
-
-===-------------------------------------------------------------------------===
-
 Darwin Stub removal:
 
 We still generate calls to foo$stub, and stubs, on Darwin.  This is not
@@ -269,57 +176,6 @@ just fastcc.
 
 ===-------------------------------------------------------------------------===
 
-Compile this:
-
-int foo(int a) {
-  int b = (a < 8);
-  if (b) {
-    return b * 3;     // ignore the fact that this is always 3.
-  } else {
-    return 2;
-  }
-}
-
-into something not this:
-
-_foo:
-1)      cmpwi cr7, r3, 8
-        mfcr r2, 1
-        rlwinm r2, r2, 29, 31, 31
-1)      cmpwi cr0, r3, 7
-        bgt cr0, LBB1_2 ; UnifiedReturnBlock
-LBB1_1: ; then
-        rlwinm r2, r2, 0, 31, 31
-        mulli r3, r2, 3
-        blr
-LBB1_2: ; UnifiedReturnBlock
-        li r3, 2
-        blr
-
-In particular, the two compares (marked 1) could be shared by reversing one.
-This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
-same operands (but backwards) exists.  In this case, this wouldn't save us 
-anything though, because the compares still wouldn't be shared.
-
-===-------------------------------------------------------------------------===
-
-We should custom expand setcc instead of pretending that we have it.  That
-would allow us to expose the access of the crbit after the mfcr, allowing
-that access to be trivially folded into other ops.  A simple example:
-
-int foo(int a, int b) { return (a < b) << 4; }
-
-compiles into:
-
-_foo:
-        cmpw cr7, r3, r4
-        mfcr r2, 1
-        rlwinm r2, r2, 29, 31, 31
-        slwi r3, r2, 4
-        blr
-
-===-------------------------------------------------------------------------===
-
 Fold add and sub with constant into non-extern, non-weak addresses so this:
 
 static int a;
@@ -347,48 +203,6 @@ _foo:
 
 ===-------------------------------------------------------------------------===
 
-We generate really bad code for this:
-
-int f(signed char *a, _Bool b, _Bool c) {
-   signed char t = 0;
-  if (b)  t = *a;
-  if (c)  *a = t;
-}
-
-===-------------------------------------------------------------------------===
-
-This:
-int test(unsigned *P) { return *P >> 24; }
-
-Should compile to:
-
-_test:
-        lbz r3,0(r3)
-        blr
-
-not:
-
-_test:
-        lwz r2, 0(r3)
-        srwi r3, r2, 24
-        blr
-
-===-------------------------------------------------------------------------===
-
-On the G5, logical CR operations are more expensive in their three
-address form: ops that read/write the same register are half as expensive as
-those that read from two registers that are different from their destination.
-
-We should model this with two separate instructions.  The isel should generate
-the "two address" form of the instructions.  When the register allocator 
-detects that it needs to insert a copy due to the two-addresness of the CR
-logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
-we can convert to the "three address" instruction, to save code space.
-
-This only matters when we start generating cr logical ops.
-
-===-------------------------------------------------------------------------===
-
 We should compile these two functions to the same thing:
 
 #include <stdlib.h>
@@ -474,27 +288,6 @@ http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 
 ===-------------------------------------------------------------------------===
 
-float foo(float X) { return (int)(X); }
-
-Currently produces:
-
-_foo:
-        fctiwz f0, f1
-        stfd f0, -8(r1)
-        lwz r2, -4(r1)
-        extsw r2, r2
-        std r2, -16(r1)
-        lfd f0, -16(r1)
-        fcfid f0, f0
-        frsp f1, f0
-        blr
-
-We could use a target dag combine to turn the lwz/extsw into an lwa when the 
-lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
-win only.
-
-===-------------------------------------------------------------------------===
-
 We generate ugly code for this:
 
 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
@@ -552,32 +345,6 @@ _foo:
 
 ===-------------------------------------------------------------------------===
 
-We compile:
-
-unsigned test6(unsigned x) { 
-  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
-}
-
-into:
-
-_test6:
-        lis r2, 255
-        rlwinm r3, r3, 16, 0, 31
-        ori r2, r2, 255
-        and r3, r3, r2
-        blr
-
-GCC gets it down to:
-
-_test6:
-        rlwinm r0,r3,16,8,15
-        rlwinm r3,r3,16,24,31
-        or r3,r3,r0
-        blr
-
-
-===-------------------------------------------------------------------------===
-
 Consider a function like this:
 
 float foo(float X) { return X + 1234.4123f; }
@@ -674,48 +441,6 @@ _bar:
 
 ===-------------------------------------------------------------------------===
 
-We currently compile 32-bit bswap:
-
-declare i32 @llvm.bswap.i32(i32 %A)
-define i32 @test(i32 %A) {
-        %B = call i32 @llvm.bswap.i32(i32 %A)
-        ret i32 %B
-}
-
-to:
-
-_test:
-        rlwinm r2, r3, 24, 16, 23
-        slwi r4, r3, 24
-        rlwimi r2, r3, 8, 24, 31
-        rlwimi r4, r3, 8, 8, 15
-        rlwimi r4, r2, 0, 16, 31
-        mr r3, r4
-        blr 
-
-it would be more efficient to produce:
-
-_foo:   mr r0,r3
-        rlwinm r3,r3,8,0xffffffff
-        rlwimi r3,r0,24,0,7
-        rlwimi r3,r0,24,16,23
-        blr
-
-===-------------------------------------------------------------------------===
-
-test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
-
-__ZNK4llvm5APInt17countLeadingZerosEv:
-        ld r2, 0(r3)
-        cntlzd r2, r2
-        or r2, r2, r2     <<-- silly.
-        addi r3, r2, -64
-        blr 
-
-The dead or is a 'truncate' from 64- to 32-bits.
-
-===-------------------------------------------------------------------------===
-
 We generate horrible ppc code for this:
 
 #define N  2000000
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 261075e..fb87cc5 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -38,6 +38,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
 // SI Passes
 FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
+FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
@@ -46,6 +47,10 @@ FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
 FunctionPass *createSIFixSGPRLiveRangesPass();
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIInsertWaits(TargetMachine &tm);
+FunctionPass *createSIPrepareScratchRegs();
+
+void initializeSIFoldOperandsPass(PassRegistry &);
+extern char &SIFoldOperandsID;
 
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
@@ -59,19 +64,20 @@ Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
 ModulePass *createAMDGPUAlwaysInlinePass();
 
-/// \brief Creates an AMDGPU-specific Target Transformation Info pass.
-ImmutablePass *
-createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM);
-
 void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
 extern char &SIFixSGPRLiveRangesID;
 
 
 extern Target TheAMDGPUTarget;
+extern Target TheGCNTarget;
 
 namespace AMDGPU {
 enum TargetIndex {
-  TI_CONSTDATA_START
+  TI_CONSTDATA_START,
+  TI_SCRATCH_RSRC_DWORD0,
+  TI_SCRATCH_RSRC_DWORD1,
+  TI_SCRATCH_RSRC_DWORD2,
+  TI_SCRATCH_RSRC_DWORD3
 };
 }
 
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 4cf1243..a7d48b3 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -48,6 +48,12 @@ def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
         "Enable double precision denormal handling",
         [FeatureFP64]>;
 
+def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
+        "FastFMAF32",
+        "true",
+        "Assuming f32 fma is at least as fast as mul + add",
+        []>;
+
 // Some instructions do not support denormals despite this flag. Using
 // fp32 denormals also causes instructions to run at the double
 // precision rate for the device.
@@ -92,6 +98,11 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
         "true",
         "Support flat address space">;
 
+def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
+        "EnableVGPRSpilling",
+        "true",
+        "Enable spilling of VGPRs to scratch memory">;
+
 class SubtargetFeatureFetchLimit <string Value> :
                           SubtargetFeature <"fetch"#Value,
         "TexVTXClauseSize",
@@ -147,10 +158,16 @@ def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
 def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
         [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
          FeatureWavefrontSize64, FeatureFlatAddressSpace]>;
+
+def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
+        [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
+         FeatureWavefrontSize64, FeatureFlatAddressSpace]>;
+
 //===----------------------------------------------------------------------===//
 
 def AMDGPUInstrInfo : InstrInfo {
   let guessInstructionProperties = 1;
+  let noNamedPositionallyEncodedOperands = 1;
 }
 
 def AMDGPUAsmParser : AsmParser {
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 5511d7c..92bc314 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -18,6 +18,7 @@
 
 #include "AMDGPUAsmPrinter.h"
 #include "AMDGPU.h"
+#include "AMDKernelCodeT.h"
 #include "AMDGPUSubtarget.h"
 #include "R600Defines.h"
 #include "R600MachineFunctionInfo.h"
@@ -57,7 +58,7 @@ using namespace llvm;
 // instructions to run at the double precision rate for the device so it's
 // probably best to just report no single precision denormals.
 static uint32_t getFPMode(const MachineFunction &F) {
-  const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>();
+  const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>();
   // TODO: Is there any real use for the flush in only / flush out only modes?
 
   uint32_t FP32Denormals =
@@ -72,19 +73,20 @@ static uint32_t getFPMode(const MachineFunction &F) {
          FP_DENORM_MODE_DP(FP64Denormals);
 }
 
-static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
-                                              MCStreamer &Streamer) {
-  return new AMDGPUAsmPrinter(tm, Streamer);
+static AsmPrinter *
+createAMDGPUAsmPrinterPass(TargetMachine &tm,
+                           std::unique_ptr<MCStreamer> &&Streamer) {
+  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
 }
 
 extern "C" void LLVMInitializeR600AsmPrinter() {
   TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
+  TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
 }
 
-AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer) {
-  DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode();
-}
+AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
+                                   std::unique_ptr<MCStreamer> Streamer)
+    : AsmPrinter(TM, std::move(Streamer)) {}
 
 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
 
@@ -106,14 +108,17 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   EmitFunctionHeader();
 
   MCContext &Context = getObjFileLowering().getContext();
-  const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
-                                              ELF::SHT_PROGBITS, 0,
-                                              SectionKind::getReadOnly());
+  const MCSectionELF *ConfigSection =
+      Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
   OutStreamer.SwitchSection(ConfigSection);
 
-  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
   SIProgramInfo KernelInfo;
-  if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+  if (STM.isAmdHsaOS()) {
+    getSIProgramInfo(KernelInfo, MF);
+    EmitAmdKernelCodeT(MF, KernelInfo);
+    OutStreamer.EmitCodeAlignment(2 << (MF.getAlignment() - 1));
+  } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
     getSIProgramInfo(KernelInfo, MF);
     EmitProgramInfoSI(MF, KernelInfo);
   } else {
@@ -128,10 +133,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   EmitFunctionBody();
 
   if (isVerbose()) {
-    const MCSectionELF *CommentSection
-      = Context.getELFSection(".AMDGPU.csdata",
-                              ELF::SHT_PROGBITS, 0,
-                              SectionKind::getReadOnly());
+    const MCSectionELF *CommentSection =
+        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
     OutStreamer.SwitchSection(CommentSection);
 
     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
@@ -156,22 +159,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   }
 
   if (STM.dumpCode()) {
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-    MF.dump();
-#endif
 
-    if (DisasmEnabled) {
-      OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
-                                                  ELF::SHT_NOTE, 0,
-                                                  SectionKind::getReadOnly()));
+    OutStreamer.SwitchSection(
+        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
 
-      for (size_t i = 0; i < DisasmLines.size(); ++i) {
-        std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
-        Comment += " ; " + HexLines[i] + "\n";
+    for (size_t i = 0; i < DisasmLines.size(); ++i) {
+      std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
+      Comment += " ; " + HexLines[i] + "\n";
 
-        OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
-        OutStreamer.EmitBytes(StringRef(Comment));
-      }
+      OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
+      OutStreamer.EmitBytes(StringRef(Comment));
     }
   }
 
@@ -181,10 +178,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
   unsigned MaxGPR = 0;
   bool killPixel = false;
-  const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>(
-      TM.getSubtargetImpl()->getRegisterInfo());
+  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+  const R600RegisterInfo *RI =
+      static_cast<const R600RegisterInfo *>(STM.getRegisterInfo());
   const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
@@ -240,13 +237,15 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
 
 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                         const MachineFunction &MF) const {
+  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   uint64_t CodeSize = 0;
   unsigned MaxSGPR = 0;
   unsigned MaxVGPR = 0;
   bool VCCUsed = false;
   bool FlatUsed = false;
-  const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>(
-      TM.getSubtargetImpl()->getRegisterInfo());
+  const SIRegisterInfo *RI =
+      static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
@@ -285,7 +284,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
         if (AMDGPU::SReg_32RegClass.contains(reg)) {
           isSGPR = true;
           width = 1;
-        } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
+        } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
           isSGPR = false;
           width = 1;
         } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
@@ -340,6 +339,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.NumVGPR = MaxVGPR + 1;
   ProgInfo.NumSGPR = MaxSGPR + 1;
 
+  ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
+  ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
   // register.
   ProgInfo.FloatMode = getFPMode(MF);
@@ -356,21 +357,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.FlatUsed = FlatUsed;
   ProgInfo.VCCUsed = VCCUsed;
   ProgInfo.CodeLen = CodeSize;
-}
-
-void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
-                                         const SIProgramInfo &KernelInfo) {
-  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
-  unsigned RsrcReg;
-  switch (MFI->getShaderType()) {
-  default: // Fall through
-  case ShaderType::COMPUTE:  RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break;
-  case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break;
-  case ShaderType::PIXEL:    RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break;
-  case ShaderType::VERTEX:   RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
-  }
 
   unsigned LDSAlignShift;
   if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
@@ -384,59 +370,203 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
   unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
                           MFI->getMaximumWorkGroupSize(MF);
 
-  unsigned LDSBlocks =
-     RoundUpToAlignment(MFI->LDSSize + LDSSpillSize,
-	                      1 << LDSAlignShift) >> LDSAlignShift;
+  ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
+  ProgInfo.LDSBlocks =
+     RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
 
   // Scratch is allocated in 256 dword blocks.
   unsigned ScratchAlignShift = 10;
   // We need to program the hardware with the amount of scratch memory that
-  // is used by the entire wave.  KernelInfo.ScratchSize is the amount of
+  // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
   // scratch memory used per thread.
-  unsigned ScratchBlocks =
-    RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
+  ProgInfo.ScratchBlocks =
+    RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(),
                        1 << ScratchAlignShift) >> ScratchAlignShift;
 
-  unsigned VGPRBlocks = (KernelInfo.NumVGPR - 1) / 4;
-  unsigned SGPRBlocks = (KernelInfo.NumSGPR - 1) / 8;
+  ProgInfo.ComputePGMRSrc1 =
+      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
+      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
+      S_00B848_PRIORITY(ProgInfo.Priority) |
+      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+      S_00B848_PRIV(ProgInfo.Priv) |
+      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
+      S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
+      S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
+
+  ProgInfo.ComputePGMRSrc2 =
+      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
+      S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
+      S_00B84C_TGID_X_EN(1) |
+      S_00B84C_TGID_Y_EN(1) |
+      S_00B84C_TGID_Z_EN(1) |
+      S_00B84C_TG_SIZE_EN(1) |
+      S_00B84C_TIDIG_COMP_CNT(2) |
+      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
+}
+
+static unsigned getRsrcReg(unsigned ShaderType) {
+  switch (ShaderType) {
+  default: // Fall through
+  case ShaderType::COMPUTE:  return R_00B848_COMPUTE_PGM_RSRC1;
+  case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
+  case ShaderType::PIXEL:    return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
+  case ShaderType::VERTEX:   return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
+  }
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
+                                         const SIProgramInfo &KernelInfo) {
+  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
 
   if (MFI->getShaderType() == ShaderType::COMPUTE) {
     OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
 
-    const uint32_t ComputePGMRSrc1 =
-      S_00B848_VGPRS(VGPRBlocks) |
-      S_00B848_SGPRS(SGPRBlocks) |
-      S_00B848_PRIORITY(KernelInfo.Priority) |
-      S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
-      S_00B848_PRIV(KernelInfo.Priv) |
-      S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) |
-      S_00B848_IEEE_MODE(KernelInfo.DebugMode) |
-      S_00B848_IEEE_MODE(KernelInfo.IEEEMode);
-
-    OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
+    OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
 
     OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
-    const uint32_t ComputePGMRSrc2 =
-      S_00B84C_LDS_SIZE(LDSBlocks) |
-      S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
-
-    OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
+    OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
 
     OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
-    OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
+    OutStreamer.EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
 
     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
     // 0" comment but I don't see a corresponding field in the register spec.
   } else {
     OutStreamer.EmitIntValue(RsrcReg, 4);
-    OutStreamer.EmitIntValue(S_00B028_VGPRS(VGPRBlocks) |
-                             S_00B028_SGPRS(SGPRBlocks), 4);
+    OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
+                             S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
+    if (STM.isVGPRSpillingEnabled(MFI)) {
+      OutStreamer.EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
+      OutStreamer.EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+    }
   }
 
   if (MFI->getShaderType() == ShaderType::PIXEL) {
     OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
-    OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
+    OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
     OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
     OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
   }
 }
+
+void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
+                                        const SIProgramInfo &KernelInfo) const {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+  amd_kernel_code_t header;
+
+  memset(&header, 0, sizeof(header));
+
+  header.amd_code_version_major = AMD_CODE_VERSION_MAJOR;
+  header.amd_code_version_minor = AMD_CODE_VERSION_MINOR;
+
+  header.struct_byte_size = sizeof(amd_kernel_code_t);
+
+  header.target_chip = STM.getAmdKernelCodeChipID();
+
+  header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment());
+
+  header.compute_pgm_resource_registers =
+      KernelInfo.ComputePGMRSrc1 |
+      (KernelInfo.ComputePGMRSrc2 << 32);
+
+  // Code Properties:
+  header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
+                           AMD_CODE_PROPERTY_IS_PTR64;
+
+  if (KernelInfo.FlatUsed)
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+  if (KernelInfo.ScratchBlocks)
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
+
+  header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+  header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
+
+  // MFI->ABIArgOffset is the number of bytes for the kernel arguments
+  // plus 36.  36 is the number of bytes reserved at the begining of the
+  // input buffer to store work-group size information.
+  // FIXME: We should be adding the size of the implicit arguments
+  // to this value.
+  header.kernarg_segment_byte_size = MFI->ABIArgOffset;
+
+  header.wavefront_sgpr_count = KernelInfo.NumSGPR;
+  header.workitem_vgpr_count = KernelInfo.NumVGPR;
+
+  // FIXME: What values do I put for these alignments
+  header.kernarg_segment_alignment = 0;
+  header.group_segment_alignment = 0;
+  header.private_segment_alignment = 0;
+
+  header.code_type = 1; // HSA_EXT_CODE_KERNEL
+
+  header.wavefront_size = STM.getWavefrontSize();
+
+  const MCSectionELF *VersionSection =
+      OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0);
+  OutStreamer.SwitchSection(VersionSection);
+  OutStreamer.EmitBytes(Twine("HSA Code Unit:" +
+                        Twine(header.hsail_version_major) + "." +
+                        Twine(header.hsail_version_minor) + ":" +
+                        "AMD:" +
+                        Twine(header.amd_code_version_major) + "." +
+                        Twine(header.amd_code_version_minor) +  ":" +
+                        "GFX8.1:0").str());
+
+  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+
+  if (isVerbose()) {
+    OutStreamer.emitRawComment("amd_code_version_major = " +
+                               Twine(header.amd_code_version_major), false);
+    OutStreamer.emitRawComment("amd_code_version_minor = " +
+                               Twine(header.amd_code_version_minor), false);
+    OutStreamer.emitRawComment("struct_byte_size = " +
+                               Twine(header.struct_byte_size), false);
+    OutStreamer.emitRawComment("target_chip = " +
+                               Twine(header.target_chip), false);
+    OutStreamer.emitRawComment(" compute_pgm_rsrc1: " +
+                               Twine::utohexstr(KernelInfo.ComputePGMRSrc1), false);
+    OutStreamer.emitRawComment(" compute_pgm_rsrc2: " +
+                               Twine::utohexstr(KernelInfo.ComputePGMRSrc2), false);
+    OutStreamer.emitRawComment("enable_sgpr_private_segment_buffer = " +
+      Twine((bool)(header.code_properties &
+                   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false);
+    OutStreamer.emitRawComment("enable_sgpr_kernarg_segment_ptr = " +
+      Twine((bool)(header.code_properties &
+                   AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false);
+    OutStreamer.emitRawComment("private_element_size = 2 ", false);
+    OutStreamer.emitRawComment("is_ptr64 = " +
+        Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false);
+    OutStreamer.emitRawComment("workitem_private_segment_byte_size = " +
+                               Twine(header.workitem_private_segment_byte_size),
+                               false);
+    OutStreamer.emitRawComment("workgroup_group_segment_byte_size = " +
+                               Twine(header.workgroup_group_segment_byte_size),
+                               false);
+    OutStreamer.emitRawComment("gds_segment_byte_size = " +
+                               Twine(header.gds_segment_byte_size), false);
+    OutStreamer.emitRawComment("kernarg_segment_byte_size = " +
+                               Twine(header.kernarg_segment_byte_size), false);
+    OutStreamer.emitRawComment("wavefront_sgpr_count = " +
+                               Twine(header.wavefront_sgpr_count), false);
+    OutStreamer.emitRawComment("workitem_vgpr_count = " +
+                               Twine(header.workitem_vgpr_count), false);
+    OutStreamer.emitRawComment("code_type = " + Twine(header.code_type), false);
+    OutStreamer.emitRawComment("wavefront_size = " +
+                               Twine((int)header.wavefront_size), false);
+    OutStreamer.emitRawComment("optimization_level = " +
+                               Twine(header.optimization_level), false);
+    OutStreamer.emitRawComment("hsail_profile = " +
+                               Twine(header.hsail_profile), false);
+    OutStreamer.emitRawComment("hsail_machine_model = " +
+                               Twine(header.hsail_machine_model), false);
+    OutStreamer.emitRawComment("hsail_version_major = " +
+                               Twine(header.hsail_version_major), false);
+    OutStreamer.emitRawComment("hsail_version_minor = " +
+                               Twine(header.hsail_version_minor), false);
+  }
+
+  OutStreamer.EmitBytes(StringRef((char*)&header, sizeof(header)));
+}
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index b9a0767..58ffb1e 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -24,8 +24,8 @@ class AMDGPUAsmPrinter : public AsmPrinter {
 private:
   struct SIProgramInfo {
     SIProgramInfo() :
-      NumVGPR(0),
-      NumSGPR(0),
+      VGPRBlocks(0),
+      SGPRBlocks(0),
       Priority(0),
       FloatMode(0),
       Priv(0),
@@ -33,13 +33,19 @@ private:
       DebugMode(0),
       IEEEMode(0),
       ScratchSize(0),
+      ComputePGMRSrc1(0),
+      LDSBlocks(0),
+      ScratchBlocks(0),
+      ComputePGMRSrc2(0),
+      NumVGPR(0),
+      NumSGPR(0),
       FlatUsed(false),
       VCCUsed(false),
       CodeLen(0) {}
 
     // Fields set in PGM_RSRC1 pm4 packet.
-    uint32_t NumVGPR;
-    uint32_t NumSGPR;
+    uint32_t VGPRBlocks;
+    uint32_t SGPRBlocks;
     uint32_t Priority;
     uint32_t FloatMode;
     uint32_t Priv;
@@ -48,6 +54,17 @@ private:
     uint32_t IEEEMode;
     uint32_t ScratchSize;
 
+    uint64_t ComputePGMRSrc1;
+
+    // Fields set in PGM_RSRC2 pm4 packet.
+    uint32_t LDSBlocks;
+    uint32_t ScratchBlocks;
+
+    uint64_t ComputePGMRSrc2;
+
+    uint32_t NumVGPR;
+    uint32_t NumSGPR;
+    uint32_t LDSSize;
     bool FlatUsed;
 
     // Bonus information for debugging.
@@ -64,9 +81,12 @@ private:
   /// can correctly setup the GPU state.
   void EmitProgramInfoR600(const MachineFunction &MF);
   void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
+  void EmitAmdKernelCodeT(const MachineFunction &MF,
+                          const SIProgramInfo &KernelInfo) const;
 
 public:
-  explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
+  explicit AMDGPUAsmPrinter(TargetMachine &TM,
+                            std::unique_ptr<MCStreamer> Streamer);
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -80,7 +100,6 @@ public:
   void EmitEndOfAsmFile(Module &M) override;
 
 protected:
-  bool DisasmEnabled;
   std::vector<std::string> DisasmLines, HexLines;
   size_t DisasmLineMaxLen;
 };
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index 90b6672..b5ab703 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -39,11 +39,11 @@ namespace {
 class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
   // make the right decision when generating code for different targets.
-  const AMDGPUSubtarget &Subtarget;
+  const AMDGPUSubtarget *Subtarget;
 public:
   AMDGPUDAGToDAGISel(TargetMachine &TM);
   virtual ~AMDGPUDAGToDAGISel();
-
+  bool runOnMachineFunction(MachineFunction &MF) override;
   SDNode *Select(SDNode *N) override;
   const char *getPassName() const override;
   void PostprocessISelDAG() override;
@@ -95,9 +95,9 @@ private:
                    SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
                    SDValue &TFE) const;
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
-                         SDValue &Offset) const;
+                         SDValue &SOffset, SDValue &Offset) const;
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
-                         SDValue &VAddr, SDValue &Offset,
+                         SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
                          SDValue &SLC) const;
   bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
                           SDValue &SOffset, SDValue &ImmOffset) const;
@@ -113,6 +113,9 @@ private:
 
   bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
                             SDValue &Omod) const;
+  bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
+                                 SDValue &Clamp,
+                                 SDValue &Omod) const;
 
   SDNode *SelectADD_SUB_I64(SDNode *N);
   SDNode *SelectDIV_SCALE(SDNode *N);
@@ -129,7 +132,11 @@ FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) {
 }
 
 AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
-  : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) {
+    : SelectionDAGISel(TM) {}
+
+bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget());
+  return SelectionDAGISel::runOnMachineFunction(MF);
 }
 
 AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
@@ -153,7 +160,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
   switch (N->getMachineOpcode()) {
   default: {
     const MCInstrDesc &Desc =
-        TM.getSubtargetImpl()->getInstrInfo()->get(N->getMachineOpcode());
+        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
     unsigned OpIdx = Desc.getNumDefs() + OpNo;
     if (OpIdx >= Desc.getNumOperands())
       return nullptr;
@@ -161,17 +168,17 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
     if (RegClass == -1)
       return nullptr;
 
-    return TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RegClass);
+    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
   }
   case AMDGPU::REG_SEQUENCE: {
     unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
     const TargetRegisterClass *SuperRC =
-        TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RCID);
+        Subtarget->getRegisterInfo()->getRegClass(RCID);
 
     SDValue SubRegOp = N->getOperand(OpNo + 1);
     unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
-    return TM.getSubtargetImpl()->getRegisterInfo()->getSubClassWithSubReg(
-        SuperRC, SubRegIdx);
+    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
+                                                              SubRegIdx);
   }
   }
 }
@@ -241,7 +248,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     return nullptr;   // Already selected.
   }
 
-  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
   switch (Opc) {
   default: break;
   // We are selecting i64 ADD here instead of custom lower it during
@@ -250,7 +256,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case ISD::ADD:
   case ISD::SUB: {
     if (N->getValueType(0) != MVT::i64 ||
-        ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+        Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
       break;
 
     return SelectADD_SUB_I64(N);
@@ -259,15 +265,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case AMDGPUISD::BUILD_VERTICAL_VECTOR:
   case ISD::BUILD_VECTOR: {
     unsigned RegClassID;
-    const AMDGPURegisterInfo *TRI = static_cast<const AMDGPURegisterInfo *>(
-        TM.getSubtargetImpl()->getRegisterInfo());
-    const SIRegisterInfo *SIRI = static_cast<const SIRegisterInfo *>(
-        TM.getSubtargetImpl()->getRegisterInfo());
+    const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
     EVT VT = N->getValueType(0);
     unsigned NumVectorElts = VT.getVectorNumElements();
     EVT EltVT = VT.getVectorElementType();
     assert(EltVT.bitsEq(MVT::i32));
-    if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
       bool UseVReg = true;
       for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
                                                     U != E; ++U) {
@@ -278,12 +281,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
         if (!RC) {
           continue;
         }
-        if (SIRI->isSGPRClass(RC)) {
+        if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) {
           UseVReg = false;
         }
       }
       switch(NumVectorElts) {
-      case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
+      case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID :
                                      AMDGPU::SReg_32RegClassID;
         break;
       case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
@@ -365,7 +368,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   }
   case ISD::BUILD_PAIR: {
     SDValue RC, SubReg0, SubReg1;
-    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
       break;
     }
     if (N->getValueType(0) == MVT::i128) {
@@ -387,8 +390,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
 
   case ISD::Constant:
   case ISD::ConstantFP: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+    if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
         N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
       break;
 
@@ -414,8 +416,55 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
                                   N->getValueType(0), Ops);
   }
 
+  case ISD::LOAD: {
+    // To simplify the TableGen patters, we replace all i64 loads with
+    // v2i32 loads.  Alternatively, we could promote i64 loads to v2i32
+    // during DAG legalization, however, so places (ExpandUnalignedLoad)
+    // in the DAG legalizer assume that if i64 is legal, so doing this
+    // promotion early can cause problems.
+    EVT VT = N->getValueType(0);
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
+      break;
+
+    SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
+                                     LD->getBasePtr(), LD->getMemOperand());
+    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+                                      MVT::i64, NewLoad);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
+    SelectCode(NewLoad.getNode());
+    N = BitCast.getNode();
+    break;
+  }
+
+  case ISD::STORE: {
+    // Handle i64 stores here for the same reason mentioned above for loads.
+    StoreSDNode *ST = cast<StoreSDNode>(N);
+    SDValue Value = ST->getValue();
+    if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
+      break;
+
+    SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+                                      MVT::v2i32, Value);
+    SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
+                                        ST->getBasePtr(), ST->getMemOperand());
+
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
+
+    if (NewValue.getOpcode() == ISD::BITCAST) {
+      Select(NewStore.getNode());
+      return SelectCode(NewValue.getNode());
+    }
+
+    // getNode() may fold the bitcast if its input was another bitcast.  If that
+    // happens we should only select the new store.
+    N = NewStore.getNode();
+    break;
+  }
+
   case AMDGPUISD::REGISTER_LOAD: {
-    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
     SDValue Addr, Offset;
 
@@ -431,7 +480,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
                                   Ops);
   }
   case AMDGPUISD::REGISTER_STORE: {
-    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
     SDValue Addr, Offset;
     SelectADDRIndirect(N->getOperand(2), Addr, Offset);
@@ -449,7 +498,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
 
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
-    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
       break;
 
     // There is a scalar version available, but unlike the vector version which
@@ -554,13 +603,11 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const {
 }
 
 bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
-  if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
-        N->getMemoryVT().bitsLT(MVT::i32)) {
+  if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
+    if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+        N->getMemoryVT().bitsLT(MVT::i32))
       return true;
-    }
-  }
+
   return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
 }
 
@@ -736,6 +783,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
   return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
 }
 
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
 SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
   SDLoc SL(N);
   EVT VT = N->getValueType(0);
@@ -745,30 +794,22 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
   unsigned Opc
     = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
 
-  const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
-  const SDValue False = CurDAG->getTargetConstant(0, MVT::i1);
-  SDValue Ops[] = {
-    Zero,             // src0_modifiers
-    N->getOperand(0), // src0
-    Zero,             // src1_modifiers
-    N->getOperand(1), // src1
-    Zero,             // src2_modifiers
-    N->getOperand(2), // src2
-    False,            // clamp
-    Zero              // omod
-  };
+  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+  SDValue Ops[8];
 
+  SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
+  SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
+  SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
   return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
 }
 
 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
                                          unsigned OffsetBits) const {
-  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
       (OffsetBits == 8 && !isUInt<8>(Offset)))
     return false;
 
-  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
     return true;
 
   // On Southern Islands instruction with a negative base value and an offset
@@ -879,26 +920,32 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
     SDValue N1 = Addr.getOperand(1);
     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
 
-    if (isLegalMUBUFImmOffset(C1)) {
-
-      if (N0.getOpcode() == ISD::ADD) {
-        // (add (add N2, N3), C1) -> addr64
-        SDValue N2 = N0.getOperand(0);
-        SDValue N3 = N0.getOperand(1);
-        Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
-        Ptr = N2;
-        VAddr = N3;
-        Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
-        return;
-      }
+    if (N0.getOpcode() == ISD::ADD) {
+      // (add (add N2, N3), C1) -> addr64
+      SDValue N2 = N0.getOperand(0);
+      SDValue N3 = N0.getOperand(1);
+      Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
+      Ptr = N2;
+      VAddr = N3;
+    } else {
 
       // (add N0, C1) -> offset
       VAddr = CurDAG->getTargetConstant(0, MVT::i32);
       Ptr = N0;
-      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+    }
+
+    if (isLegalMUBUFImmOffset(C1)) {
+        Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+        return;
+    } else if (isUInt<32>(C1->getZExtValue())) {
+      // Illegal offset, store it in soffset.
+      Offset = CurDAG->getTargetConstant(0, MVT::i16);
+      SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                   CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i32)), 0);
       return;
     }
   }
+
   if (Addr.getOpcode() == ISD::ADD) {
     // (add N0, N1) -> addr64
     SDValue N0 = Addr.getOperand(0);
@@ -918,9 +965,9 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
-                                           SDValue &VAddr,
+                                           SDValue &VAddr, SDValue &SOffset,
                                            SDValue &Offset) const {
-  SDValue Ptr, SOffset, Offen, Idxen, Addr64, GLC, SLC, TFE;
+  SDValue Ptr, Offen, Idxen, Addr64, GLC, SLC, TFE;
 
   SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
               GLC, SLC, TFE);
@@ -940,11 +987,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
-                                           SDValue &VAddr, SDValue &Offset,
-                                           SDValue &SLC) const {
+                                           SDValue &VAddr, SDValue &SOffset,
+					   SDValue &Offset,
+					   SDValue &SLC) const {
   SLC = CurDAG->getTargetConstant(0, MVT::i1);
 
-  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, Offset);
+  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset);
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
@@ -954,21 +1002,32 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
   SDLoc DL(Addr);
   MachineFunction &MF = CurDAG->getMachineFunction();
   const SIRegisterInfo *TRI =
-      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const SITargetLowering& Lowering =
     *static_cast<const SITargetLowering*>(getTargetLowering());
 
-  unsigned ScratchPtrReg =
-      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
   unsigned ScratchOffsetReg =
       TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
   Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
                                 ScratchOffsetReg, MVT::i32);
+  SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
+  SDValue ScratchRsrcDword0 =
+      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
+
+  SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
+  SDValue ScratchRsrcDword1 =
+      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
 
-  SDValue ScratchPtr =
-    CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
-                           MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64);
+  const SDValue RsrcOps[] = {
+      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+      ScratchRsrcDword0,
+      CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+      ScratchRsrcDword1,
+      CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
+  };
+  SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                              MVT::v2i32, RsrcOps), 0);
   Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
   SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
       MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
@@ -985,22 +1044,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
     }
   }
 
-  // (add FI, n0)
-  if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
-       isa<FrameIndexSDNode>(Addr.getOperand(0))) {
-    VAddr = Addr.getOperand(1);
-    ImmOffset = Addr.getOperand(0);
-    return true;
-  }
-
-  // (FI)
-  if (isa<FrameIndexSDNode>(Addr)) {
-    VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
-                                          CurDAG->getConstant(0, MVT::i32)), 0);
-    ImmOffset = Addr;
-    return true;
-  }
-
   // (node)
   VAddr = Addr;
   ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
@@ -1012,6 +1055,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                            SDValue &GLC, SDValue &SLC,
                                            SDValue &TFE) const {
   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
   SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
               GLC, SLC, TFE);
@@ -1019,7 +1064,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
       !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
-    uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT |
+    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                     APInt::getAllOnesValue(32).getZExtValue(); // Size
     SDLoc DL(Addr);
 
@@ -1045,7 +1090,7 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
   AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
   SDLoc DL(N);
 
-  assert(Subtarget.hasFlatAddressSpace() &&
+  assert(Subtarget->hasFlatAddressSpace() &&
          "addrspacecast only supported with flat address space!");
 
   assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
@@ -1081,7 +1126,9 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
   if (DestSize > SrcSize) {
     assert(SrcSize == 32 && DestSize == 64);
 
-    SDValue RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
+    // FIXME: This is probably wrong, we should never be defining
+    // a register class with both VGPRs and SGPRs
+    SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, MVT::i32);
 
     const SDValue Ops[] = {
       RC,
@@ -1141,6 +1188,14 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,
   return SelectVOP3Mods(In, Src, SrcMods);
 }
 
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
+                                                   SDValue &SrcMods,
+                                                   SDValue &Clamp,
+                                                   SDValue &Omod) const {
+  Clamp = Omod = CurDAG->getTargetConstant(0, MVT::i32);
+  return SelectVOP3Mods(In, Src, SrcMods);
+}
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 2f95b74..4707279 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -102,11 +102,9 @@ EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
 }
 
-AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
-  TargetLowering(TM) {
-
-  Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
-
+AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
+                                           const AMDGPUSubtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
   setOperationAction(ISD::Constant, MVT::i32, Legal);
   setOperationAction(ISD::Constant, MVT::i64, Legal);
   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -127,12 +125,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::FABS,   MVT::f32, Legal);
   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
-  setOperationAction(ISD::FROUND, MVT::f32, Legal);
   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 
+  setOperationAction(ISD::FROUND, MVT::f32, Custom);
+  setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
   setOperationAction(ISD::FREM, MVT::f32, Custom);
   setOperationAction(ISD::FREM, MVT::f64, Custom);
 
+  // v_mad_f32 does not support denormals according to some sources.
+  if (!Subtarget->hasFP32Denormals())
+    setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+  // Expand to fneg + fadd.
+  setOperationAction(ISD::FSUB, MVT::f64, Expand);
+
   // Lower floating point store/load to integer store/load to reduce the number
   // of patterns in tablegen.
   setOperationAction(ISD::STORE, MVT::f32, Promote);
@@ -141,9 +148,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
 
-  setOperationAction(ISD::STORE, MVT::i64, Promote);
-  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
-
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
@@ -162,9 +166,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   // Custom lowering of vector stores is required for local address space
   // stores.
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
-  // XXX: Native v2i32 local address space stores are possible, but not
-  // currently implemented.
-  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 
   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
@@ -187,9 +188,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
 
-  setOperationAction(ISD::LOAD, MVT::i64, Promote);
-  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
-
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
@@ -216,18 +214,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
 
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
+  // There are no 64-bit extloads. These should be done as a 32-bit extload and
+  // an extension to 64-bit.
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
+  }
+
+  for (MVT VT : MVT::integer_vector_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
+  }
 
   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
 
@@ -246,7 +254,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 
-  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 
@@ -382,6 +391,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setTargetDAGCombine(ISD::SELECT_CC);
   setTargetDAGCombine(ISD::STORE);
 
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::FSUB);
+
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
 
@@ -397,6 +412,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   // large sequence of instructions.
   setIntDivIsCheap(false);
   setPow2SDivIsCheap(false);
+  setFsqrtIsCheap(true);
 
   // FIXME: Need to really handle these.
   MaxStoresPerMemcpy  = 4096;
@@ -429,6 +445,29 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
 }
 
+bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
+                                                 ISD::LoadExtType,
+                                                 EVT NewVT) const {
+
+  unsigned NewSize = NewVT.getStoreSizeInBits();
+
+  // If we are reducing to a 32-bit load, this is always better.
+  if (NewSize == 32)
+    return true;
+
+  EVT OldVT = N->getValueType(0);
+  unsigned OldSize = OldVT.getStoreSizeInBits();
+
+  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
+  // extloads, so doing one requires using a buffer_load. In cases where we
+  // still couldn't use a scalar load, using the wider load shouldn't really
+  // hurt anything.
+
+  // If the old size already had to be an extload, there's no harm in continuing
+  // to reduce the width.
+  return (OldSize < 32);
+}
+
 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
                                                    EVT CastTy) const {
   if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
@@ -442,6 +481,18 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
           (LScalarSize < 32));
 }
 
+// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
+// profitable with the expansion for 64-bit since it's generally good to
+// speculate things.
+// FIXME: These should really have the size as a parameter.
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
+  return true;
+}
+
+bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
+  return true;
+}
+
 //===---------------------------------------------------------------------===//
 // Target Properties
 //===---------------------------------------------------------------------===//
@@ -560,6 +611,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
   case ISD::FRINT: return LowerFRINT(Op, DAG);
   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
+  case ISD::FROUND: return LowerFROUND(Op, DAG);
   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
@@ -619,7 +671,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                        const SDValue &InitPtr,
                                                        SDValue Chain,
                                                        SelectionDAG &DAG) const {
-  const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = getDataLayout();
   SDLoc DL(InitPtr);
   Type *InitTy = Init->getType();
 
@@ -707,7 +759,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                  SDValue Op,
                                                  SelectionDAG &DAG) const {
 
-  const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = getDataLayout();
   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = G->getGlobal();
 
@@ -810,8 +862,7 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
                                               SelectionDAG &DAG) const {
 
   MachineFunction &MF = DAG.getMachineFunction();
-  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
-      getTargetMachine().getSubtargetImpl()->getFrameLowering());
+  const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering();
 
   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
 
@@ -866,10 +917,9 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     }
 
     case Intrinsic::AMDGPU_div_fmas:
-      // FIXME: Dropping bool parameter. Work is needed to support the implicit
-      // read from VCC.
       return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
-                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                         Op.getOperand(4));
 
     case Intrinsic::AMDGPU_div_fixup:
       return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
@@ -889,7 +939,19 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 
     case Intrinsic::AMDGPU_rsq_clamped:
-      return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+      if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        Type *Type = VT.getTypeForEVT(*DAG.getContext());
+        APFloat Max = APFloat::getLargest(Type->getFltSemantics());
+        APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
+
+        SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+        SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
+                                  DAG.getConstantFP(Max, VT));
+        return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
+                           DAG.getConstantFP(Min, VT));
+      } else {
+        return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+      }
 
     case Intrinsic::AMDGPU_ldexp:
       return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1),
@@ -962,6 +1024,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     case AMDGPUIntrinsic::AMDGPU_brev:
       return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));
 
+  case Intrinsic::AMDGPU_class:
+    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
+                       Op.getOperand(1), Op.getOperand(2));
+
     case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
       return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
 
@@ -1000,17 +1066,21 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
 }
 
 /// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL,
-                                             EVT VT,
-                                             SDValue LHS,
-                                             SDValue RHS,
-                                             SDValue True,
-                                             SDValue False,
-                                             SDValue CC,
-                                             SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
+                                                   EVT VT,
+                                                   SDValue LHS,
+                                                   SDValue RHS,
+                                                   SDValue True,
+                                                   SDValue False,
+                                                   SDValue CC,
+                                                   DAGCombinerInfo &DCI) const {
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return SDValue();
+
   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
     return SDValue();
 
+  SelectionDAG &DAG = DCI.DAG;
   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   switch (CCOpcode) {
   case ISD::SETOEQ:
@@ -1027,27 +1097,47 @@ SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL,
   case ISD::SETO:
     break;
   case ISD::SETULE:
-  case ISD::SETULT:
+  case ISD::SETULT: {
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
+    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+  }
   case ISD::SETOLE:
   case ISD::SETOLT:
   case ISD::SETLE:
   case ISD::SETLT: {
+    // Ordered. Assume ordered for undefined.
+
+    // Only do this after legalization to avoid interfering with other combines
+    // which might occur.
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+        !DCI.isCalledByLegalizer())
+      return SDValue();
+
     // We need to permute the operands to get the correct NaN behavior. The
     // selected operand is the second one based on the failing compare with NaN,
     // so permute it based on the compare type the hardware uses.
     if (LHS == True)
-      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
-    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+  }
+  case ISD::SETUGE:
+  case ISD::SETUGT: {
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
   }
   case ISD::SETGT:
   case ISD::SETGE:
-  case ISD::SETUGE:
   case ISD::SETOGE:
-  case ISD::SETUGT:
   case ISD::SETOGT: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+        !DCI.isCalledByLegalizer())
+      return SDValue();
+
     if (LHS == True)
-      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
-    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
   }
   case ISD::SETCC_INVALID:
     llvm_unreachable("Invalid setcc condcode!");
@@ -1330,24 +1420,6 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   EVT MemVT = Load->getMemoryVT();
 
-  if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
-    // We can do the extload to 32-bits, and then need to separately extend to
-    // 64-bits.
-
-    SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
-                                       Load->getChain(),
-                                       Load->getBasePtr(),
-                                       MemVT,
-                                       Load->getMemOperand());
-
-    SDValue Ops[] = {
-      DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
-      ExtLoad32.getValue(1)
-    };
-
-    return DAG.getMergeValues(Ops, DL);
-  }
-
   if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
     assert(VT == MVT::i1 && "Only i1 non-extloads expected");
     // FIXME: Copied from PPC
@@ -1586,12 +1658,26 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
 
+  if (VT == MVT::i64 &&
+    DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
+    DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
+
+    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
+                              LHS_Lo, RHS_Lo);
+
+    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero);
+    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero);
+    Results.push_back(DIV);
+    Results.push_back(REM);
+    return;
+  }
+
   // Get Speculative values
   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
 
-  SDValue REM_Hi = zero;
   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
+  SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero);
 
   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
   SDValue DIV_Lo = zero;
@@ -1599,8 +1685,10 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
   const unsigned halfBitWidth = HalfVT.getSizeInBits();
 
   for (unsigned i = 0; i < halfBitWidth; ++i) {
-    SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
-    // Get Value of high bit
+    const unsigned bitPos = halfBitWidth - i - 1;
+    SDValue POS = DAG.getConstant(bitPos, HalfVT);
+    // Get value of high bit
+    // TODO: Remove the BFE part when the optimization is fixed
     SDValue HBit;
     if (halfBitWidth == 32 && Subtarget->hasBFE()) {
       HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
@@ -1608,33 +1696,23 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
       HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
       HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
     }
+    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
 
-    SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
-      DAG.getConstant(halfBitWidth - 1, HalfVT));
-    REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
-    REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
-
-    REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
-    REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
+    // Shift
+    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, VT));
+    // Add LHS high bit
+    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
 
-
-    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
-
-    SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
+    SDValue BIT = DAG.getConstant(1 << bitPos, HalfVT);
     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
 
     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
 
     // Update REM
-
     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
-
     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
-    REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
-    REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
   }
 
-  SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
   SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
   Results.push_back(DIV);
   Results.push_back(REM);
@@ -1655,8 +1733,8 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   SDValue Den = Op.getOperand(1);
 
   if (VT == MVT::i32) {
-    if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) &&
-        DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) {
+    if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
+        DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
       // TODO: We technically could do this for i64, but shouldn't that just be
       // handled by something generally reducing 64-bit division on 32-bit
       // values to 32-bit?
@@ -1768,19 +1846,31 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
 
-  if (VT == MVT::i32) {
-    if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 &&
-        DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) {
-      // TODO: We technically could do this for i64, but shouldn't that just be
-      // handled by something generally reducing 64-bit division on 32-bit
-      // values to 32-bit?
-      return LowerDIVREM24(Op, DAG, true);
-    }
-  }
-
   SDValue Zero = DAG.getConstant(0, VT);
   SDValue NegOne = DAG.getConstant(-1, VT);
 
+  if (VT == MVT::i32 &&
+      DAG.ComputeNumSignBits(LHS) > 8 &&
+      DAG.ComputeNumSignBits(RHS) > 8) {
+    return LowerDIVREM24(Op, DAG, true);
+  }
+  if (VT == MVT::i64 &&
+      DAG.ComputeNumSignBits(LHS) > 32 &&
+      DAG.ComputeNumSignBits(RHS) > 32) {
+    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+    //HiLo split
+    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
+    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
+    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
+                                 LHS_Lo, RHS_Lo);
+    SDValue Res[2] = {
+      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
+      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
+    };
+    return DAG.getMergeValues(Res, DL);
+  }
+
   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
@@ -1845,6 +1935,20 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
 }
 
+static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
+  const unsigned FractBits = 52;
+  const unsigned ExpBits = 11;
+
+  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+                                Hi,
+                                DAG.getConstant(FractBits - 32, MVT::i32),
+                                DAG.getConstant(ExpBits, MVT::i32));
+  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+                            DAG.getConstant(1023, MVT::i32));
+
+  return Exp;
+}
+
 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);
@@ -1860,16 +1964,9 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
   // exponent.
   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
 
-  const unsigned FractBits = 52;
-  const unsigned ExpBits = 11;
+  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
 
-  // Extract the exponent.
-  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
-                                Hi,
-                                DAG.getConstant(FractBits - 32, MVT::i32),
-                                DAG.getConstant(ExpBits, MVT::i32));
-  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
-                            DAG.getConstant(1023, MVT::i32));
+  const unsigned FractBits = 52;
 
   // Extract the sign bit.
   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
@@ -1932,6 +2029,99 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
 }
 
+// XXX - May require not supporting f32 denormals?
+SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+
+  SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
+
+  SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
+
+  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
+
+  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32);
+  const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
+  const SDValue Half = DAG.getConstantFP(0.5, MVT::f32);
+
+  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+
+  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
+
+  SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
+
+  return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+
+  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
+
+  const SDValue Zero = DAG.getConstant(0, MVT::i32);
+  const SDValue One = DAG.getConstant(1, MVT::i32);
+  const SDValue NegOne = DAG.getConstant(-1, MVT::i32);
+  const SDValue FiftyOne = DAG.getConstant(51, MVT::i32);
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+
+
+  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
+
+  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
+
+  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64);
+
+  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
+  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
+                          DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64),
+                          Exp);
+
+  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
+  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
+                              DAG.getConstant(0, MVT::i64), Tmp0,
+                              ISD::SETNE);
+
+  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
+                             D, DAG.getConstant(0, MVT::i64));
+  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
+
+  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
+  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
+
+  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
+
+  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
+                            ExpEqNegOne,
+                            DAG.getConstantFP(1.0, MVT::f64),
+                            DAG.getConstantFP(0.0, MVT::f64));
+
+  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
+
+  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
+  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
+
+  return K;
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::f32)
+    return LowerFROUND32(Op, DAG);
+
+  if (VT == MVT::f64)
+    return LowerFROUND64(Op, DAG);
+
+  llvm_unreachable("unhandled type");
+}
+
 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);
@@ -2155,7 +2345,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
   SDValue Value = SN->getValue();
   EVT VT = Value.getValueType();
 
-  if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode()))
+  if (isTypeLegal(VT) || SN->isVolatile() ||
+      !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8)
     return SDValue();
 
   LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
@@ -2231,27 +2422,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
       simplifyI24(N1, DCI);
       return SDValue();
     }
-  case ISD::SELECT_CC: {
-    SDLoc DL(N);
-    EVT VT = N->getValueType(0);
-
-    if (VT == MVT::f32 ||
-        (VT == MVT::f64 &&
-         Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) {
-      SDValue LHS = N->getOperand(0);
-      SDValue RHS = N->getOperand(1);
-      SDValue True = N->getOperand(2);
-      SDValue False = N->getOperand(3);
-      SDValue CC = N->getOperand(4);
-
-      return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
-    }
-
-    break;
-  }
   case ISD::SELECT: {
     SDValue Cond = N->getOperand(0);
-    if (Cond.getOpcode() == ISD::SETCC) {
+    if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) {
       SDLoc DL(N);
       EVT VT = N->getValueType(0);
       SDValue LHS = Cond.getOperand(0);
@@ -2261,11 +2434,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
       SDValue True = N->getOperand(1);
       SDValue False = N->getOperand(2);
 
-      if (VT == MVT::f32 ||
-          (VT == MVT::f64 &&
-           Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) {
-        return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
-      }
+      if (VT == MVT::f32)
+        return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
 
       // TODO: Implement min / max Evergreen instructions.
       if (VT == MVT::i32 &&
@@ -2451,7 +2621,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(DWORDADDR)
   NODE_NAME_CASE(FRACT)
   NODE_NAME_CASE(CLAMP)
-  NODE_NAME_CASE(MAD)
   NODE_NAME_CASE(FMAX_LEGACY)
   NODE_NAME_CASE(SMAX)
   NODE_NAME_CASE(UMAX)
@@ -2474,6 +2643,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(RSQ_LEGACY)
   NODE_NAME_CASE(RSQ_CLAMPED)
   NODE_NAME_CASE(LDEXP)
+  NODE_NAME_CASE(FP_CLASS)
   NODE_NAME_CASE(DOT4)
   NODE_NAME_CASE(BFE_U32)
   NODE_NAME_CASE(BFE_I32)
@@ -2505,6 +2675,46 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   }
 }
 
+SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
+                                               DAGCombinerInfo &DCI,
+                                               unsigned &RefinementSteps,
+                                               bool &UseOneConstNR) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = Operand.getValueType();
+
+  if (VT == MVT::f32) {
+    RefinementSteps = 0;
+    return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
+  }
+
+  // TODO: There is also f64 rsq instruction, but the documentation is less
+  // clear on its precision.
+
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
+                                               DAGCombinerInfo &DCI,
+                                               unsigned &RefinementSteps) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = Operand.getValueType();
+
+  if (VT == MVT::f32) {
+    // Reciprocal, < 1 ulp error.
+    //
+    // This reciprocal approximation converges to < 0.5 ulp error with one
+    // newton rhapson performed with two fused multiple adds (FMAs).
+
+    RefinementSteps = 0;
+    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
+  }
+
+  // TODO: There is also f64 rcp instruction, but the documentation is less
+  // clear on its precision.
+
+  return SDValue();
+}
+
 static void computeKnownBitsForMinMax(const SDValue Op0,
                                       const SDValue Op1,
                                       APInt &KnownZero,
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 36b4ee6..6bc6ca5 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -43,12 +43,15 @@ private:
   /// \brief Split a vector store into multiple scalar stores.
   /// \returns The resulting chain.
 
-  SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
@@ -86,6 +89,7 @@ protected:
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
   void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) const;
@@ -106,7 +110,7 @@ protected:
                               const SmallVectorImpl<ISD::InputArg> &Ins) const;
 
 public:
-  AMDGPUTargetLowering(TargetMachine &TM);
+  AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
 
   bool isFAbsFree(EVT VT) const override;
   bool isFNegFree(EVT VT) const override;
@@ -124,8 +128,14 @@ public:
 
   bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
   bool ShouldShrinkFPConstant(EVT VT) const override;
+  bool shouldReduceLoadWidth(SDNode *Load,
+                             ISD::LoadExtType ExtType,
+                             EVT ExtVT) const override;
 
   bool isLoadBitCastBeneficial(EVT, EVT) const override;
+  bool isCheapToSpeculateCttz() const override;
+  bool isCheapToSpeculateCtlz() const override;
+
   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                       bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -142,14 +152,14 @@ public:
 
   SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
-  SDValue CombineFMinMax(SDLoc DL,
-                         EVT VT,
-                         SDValue LHS,
-                         SDValue RHS,
-                         SDValue True,
-                         SDValue False,
-                         SDValue CC,
-                         SelectionDAG &DAG) const;
+  SDValue CombineFMinMaxLegacy(SDLoc DL,
+                               EVT VT,
+                               SDValue LHS,
+                               SDValue RHS,
+                               SDValue True,
+                               SDValue False,
+                               SDValue CC,
+                               DAGCombinerInfo &DCI) const;
   SDValue CombineIMinMax(SDLoc DL,
                          EVT VT,
                          SDValue LHS,
@@ -161,6 +171,14 @@ public:
 
   const char* getTargetNodeName(unsigned Opcode) const override;
 
+  SDValue getRsqrtEstimate(SDValue Operand,
+                           DAGCombinerInfo &DCI,
+                           unsigned &RefinementSteps,
+                           bool &UseOneConstNR) const override;
+  SDValue getRecipEstimate(SDValue Operand,
+                           DAGCombinerInfo &DCI,
+                           unsigned &RefinementSteps) const override;
+
   virtual SDNode *PostISelFolding(MachineSDNode *N,
                                   SelectionDAG &DAG) const {
     return N;
@@ -200,7 +218,6 @@ enum {
   DWORDADDR,
   FRACT,
   CLAMP,
-  MAD, // Multiply + add with same result as the separate operations.
 
   // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
   // Denormals handled on some parts.
@@ -231,6 +248,7 @@ enum {
   RSQ_LEGACY,
   RSQ_CLAMPED,
   LDEXP,
+  FP_CLASS,
   DOT4,
   BFE_U32, // Extract range of bits with zero extension to 32-bits.
   BFE_I32, // Extract range of bits with sign extension to 32-bits.
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index a8fc614..f4de2d6 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -319,10 +319,7 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
     return -1;
   }
 
-  Offset = MF.getTarget()
-               .getSubtargetImpl()
-               ->getFrameLowering()
-               ->getFrameIndexOffset(MF, -1);
+  Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1);
 
   return getIndirectIndexBegin(MF) + Offset;
 }
@@ -341,8 +338,39 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
 // instead.
 namespace llvm {
 namespace AMDGPU {
-int getMCOpcode(uint16_t Opcode, unsigned Gen) {
-  return getMCOpcode(Opcode);
+static int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+  return getMCOpcodeGen(Opcode, (enum Subtarget)Gen);
 }
 }
 }
+
+// This must be kept in sync with the SISubtarget class in SIInstrInfo.td
+enum SISubtarget {
+  SI = 0,
+  VI = 1
+};
+
+static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) {
+  switch (Gen) {
+  default:
+    return SI;
+  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+    return VI;
+  }
+}
+
+int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
+  int MCOp = AMDGPU::getMCOpcode(Opcode,
+                        AMDGPUSubtargetToSISubtarget(RI.ST.getGeneration()));
+
+  // -1 means that Opcode is already a native instruction.
+  if (MCOp == -1)
+    return Opcode;
+
+  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+  // no encoding in the given subtarget generation.
+  if (MCOp == (uint16_t)-1)
+    return -1;
+
+  return MCOp;
+}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index da9833d..202183c 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -135,6 +135,17 @@ public:
   bool isRegisterStore(const MachineInstr &MI) const;
   bool isRegisterLoad(const MachineInstr &MI) const;
 
+  /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
+  /// Return -1 if the target-specific opcode for the pseudo instruction does
+  /// not exist. If Opcode is not a pseudo instruction, this is identity.
+  int pseudoToMCOpcode(int Opcode) const;
+
+  /// \brief Return the descriptor of the target-specific machine instruction
+  /// that corresponds to the specified pseudo or native opcode.
+  const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
+    return get(pseudoToMCOpcode(Opcode));
+  }
+
 //===---------------------------------------------------------------------===//
 // Pure virtual funtions to be implemented by sub-classes.
 //===---------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index 4ee0f2b..901eb51 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -27,10 +27,19 @@ def AMDGPULdExpOp : SDTypeProfile<1, 2,
   [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
 >;
 
+def AMDGPUFPClassOp : SDTypeProfile<1, 2,
+  [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
+>;
+
 def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
   [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
 >;
 
+// float, float, float, vcc
+def AMDGPUFmasOp : SDTypeProfile<1, 4,
+  [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>]
+>;
+
 //===----------------------------------------------------------------------===//
 // AMDGPU DAG Nodes
 //
@@ -58,16 +67,17 @@ def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
 
 def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
 
+def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
+
 // out = max(a, b) a and b are floats, where a nan comparison fails.
 // This is not commutative because this gives the second operand:
 //   x < nan ? x : nan -> nan
 //   nan < x ? nan : x -> x
 def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp,
-  [SDNPAssociative]
+  []
 >;
 
 def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
-def AMDGPUmad : SDNode<"AMDGPUISD::MAD", SDTFPTernaryOp, []>;
 
 // out = max(a, b) a and b are signed ints
 def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
@@ -81,7 +91,7 @@ def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
 
 // out = min(a, b) a and b are floats, where a nan comparison fails.
 def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
-  [SDNPAssociative]
+  []
 >;
 
 // out = min(a, b) a and b are signed ints
@@ -147,7 +157,7 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
 
 //  Special case divide FMA with scale and flags (src0 = Quotient,
 //  src1 = Denominator, src2 = Numerator).
-def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
 
 // Single or double precision division fixup.
 // Special case divide fixup and flags(src0 = Quotient, src1 =
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index c215865..849b241 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -23,8 +23,6 @@ class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instructio
   let Pattern = pattern;
   let Itinerary = NullALU;
 
-  let isCodeGenOnly = 1;
-
   let TSFlags{63} = isRegisterLoad;
   let TSFlags{62} = isRegisterStore;
 }
@@ -73,6 +71,11 @@ def COND_OEQ : PatLeaf <
   [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}]
 >;
 
+def COND_ONE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}]
+>;
+
 def COND_OGT : PatLeaf <
   (cond),
   [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}]
@@ -93,23 +96,28 @@ def COND_OLE : PatLeaf <
   [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
 >;
 
-def COND_UNE : PatLeaf <
-  (cond),
-  [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
->;
 
 def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
 def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
 
 //===----------------------------------------------------------------------===//
-// PatLeafs for unsigned comparisons
+// PatLeafs for unsigned / unordered comparisons
 //===----------------------------------------------------------------------===//
 
+def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>;
+def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>;
 def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>;
 def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>;
 def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>;
 def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>;
 
+// XXX - For some reason R600 version is preferring to use unordered
+// for setne?
+def COND_UNE_NE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
+>;
+
 //===----------------------------------------------------------------------===//
 // PatLeafs for signed comparisons
 //===----------------------------------------------------------------------===//
@@ -154,10 +162,6 @@ class PrivateStore <SDPatternOperator op> : PrivateMemOp <
   (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
 >;
 
-def extloadi8_private : PrivateLoad <extloadi8>;
-def sextloadi8_private : PrivateLoad <sextloadi8>;
-def extloadi16_private : PrivateLoad <extloadi16>;
-def sextloadi16_private : PrivateLoad <sextloadi16>;
 def load_private : PrivateLoad <load>;
 
 def truncstorei8_private : PrivateStore <truncstorei8>;
@@ -221,6 +225,9 @@ def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
     return isLocalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def extloadi8_private : PrivateLoad <az_extloadi8>;
+def sextloadi8_private : PrivateLoad <sextloadi8>;
+
 def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
   return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
 }]>;
@@ -257,6 +264,9 @@ def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
     return isLocalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def extloadi16_private : PrivateLoad <az_extloadi16>;
+def sextloadi16_private : PrivateLoad <sextloadi16>;
+
 def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
   return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
 }]>;
@@ -403,11 +413,6 @@ def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
 // Misc Pattern Fragments
 //===----------------------------------------------------------------------===//
 
-def fmad : PatFrag <
-  (ops node:$src0, node:$src1, node:$src2),
-  (fadd (fmul node:$src0, node:$src1), node:$src2)
->;
-
 class Constants {
 int TWO_PI = 0x40c90fdb;
 int PI = 0x40490fdb;
@@ -428,6 +433,11 @@ def FP_ONE : PatLeaf <
   [{return N->isExactlyValue(1.0);}]
 >;
 
+def FP_HALF : PatLeaf <
+  (fpimm),
+  [{return N->isExactlyValue(0.5);}]
+>;
+
 let isCodeGenOnly = 1, isPseudo = 1 in {
 
 let usesCustomInserter = 1  in {
@@ -575,7 +585,7 @@ applied.
 
 def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>;
 def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}],
-                            SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>;
+                            SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(countTrailingOnes(N->getZExtValue()), MVT::i32);}]>>;
 
 class BFEPattern <Instruction BFE> : Pat <
   (and (srl i32:$x, legalshift32:$y), bfemask:$z),
@@ -593,6 +603,20 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat <
 // 24-bit arithmetic patterns
 def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>;
 
+// Special conversion patterns
+
+def cvt_rpi_i32_f32 : PatFrag <
+  (ops node:$src),
+  (fp_to_sint (ffloor (fadd $src, FP_HALF))),
+  [{ (void) N; return TM.Options.NoNaNsFPMath; }]
+>;
+
+def cvt_flr_i32_f32 : PatFrag <
+  (ops node:$src),
+  (fp_to_sint (ffloor $src)),
+  [{ (void)N; return TM.Options.NoNaNsFPMath; }]
+>;
+
 /*
 class UMUL24Pattern <Instruction UMUL24> : Pat <
   (mul U24:$x, U24:$y),
@@ -639,17 +663,10 @@ class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
   (RcpInst $src)
 >;
 
-multiclass RsqPat<Instruction RsqInst, ValueType vt> {
-  def : Pat <
-    (fdiv FP_ONE, (fsqrt vt:$src)),
-    (RsqInst $src)
-  >;
-
-  def : Pat <
-    (AMDGPUrcp (fsqrt vt:$src)),
-    (RsqInst $src)
-  >;
-}
+class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
+  (AMDGPUrcp (fsqrt vt:$src)),
+  (RsqInst $src)
+>;
 
 include "R600Instructions.td"
 include "R700Instructions.td"
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index bca027f..f047ed0 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
@@ -39,37 +40,23 @@ AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st):
   Ctx(ctx), ST(st)
 { }
 
-enum AMDGPUMCInstLower::SISubtarget
-AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned) const {
-  return AMDGPUMCInstLower::SI;
-}
-
-unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const {
+void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
 
-  int MCOpcode = AMDGPU::getMCOpcode(MIOpcode,
-                              AMDGPUSubtargetToSISubtarget(ST.getGeneration()));
-  if (MCOpcode == -1)
-    MCOpcode = MIOpcode;
+  int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
 
-  return MCOpcode;
-}
-
-void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+  if (MCOpcode == -1) {
+    LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+    C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
+                "a target-specific version: " + Twine(MI->getOpcode()));
+  }
 
-  OutMI.setOpcode(getMCOpcode(MI->getOpcode()));
+  OutMI.setOpcode(MCOpcode);
 
   for (const MachineOperand &MO : MI->explicit_operands()) {
     MCOperand MCOp;
     switch (MO.getType()) {
     default:
       llvm_unreachable("unknown operand type");
-    case MachineOperand::MO_FPImmediate: {
-      const APFloat &FloatValue = MO.getFPImm()->getValueAPF();
-      assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle &&
-             "Only floating point immediates are supported at the moment.");
-      MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat());
-      break;
-    }
     case MachineOperand::MO_Immediate:
       MCOp = MCOperand::CreateImm(MO.getImm());
       break;
@@ -93,18 +80,24 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
       MCOp = MCOperand::CreateExpr(Expr);
       break;
     }
+    case MachineOperand::MO_ExternalSymbol: {
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(MO.getSymbolName()));
+      const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      MCOp = MCOperand::CreateExpr(Expr);
+      break;
+    }
     }
     OutMI.addOperand(MCOp);
   }
 }
 
 void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  AMDGPUMCInstLower MCInstLowering(OutContext,
-                               MF->getTarget().getSubtarget<AMDGPUSubtarget>());
+  const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+  AMDGPUMCInstLower MCInstLowering(OutContext, STI);
 
 #ifdef _DEBUG
   StringRef Err;
-  if (!TM.getSubtargetImpl()->getInstrInfo()->verifyInstruction(MI, Err)) {
+  if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) {
     errs() << "Warning: Illegal instruction detected: " << Err << "\n";
     MI->dump();
   }
@@ -122,15 +115,15 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     MCInstLowering.lower(MI, TmpInst);
     EmitToStreamer(OutStreamer, TmpInst);
 
-    if (DisasmEnabled) {
+    if (STI.dumpCode()) {
       // Disassemble instruction/operands to text.
       DisasmLines.resize(DisasmLines.size() + 1);
       std::string &DisasmLine = DisasmLines.back();
       raw_string_ostream DisasmStream(DisasmLine);
 
       AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
-                                    *TM.getSubtargetImpl()->getInstrInfo(),
-                                    *TM.getSubtargetImpl()->getRegisterInfo());
+                                    *MF->getSubtarget().getInstrInfo(),
+                                    *MF->getSubtarget().getRegisterInfo());
       InstPrinter.printInst(&TmpInst, DisasmStream, StringRef());
 
       // Disassemble instruction/operands to hex representation.
@@ -141,7 +134,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       MCObjectStreamer &ObjStreamer = (MCObjectStreamer &)OutStreamer;
       MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
       InstEmitter.EncodeInstruction(TmpInst, CodeStream, Fixups,
-                                    TM.getSubtarget<MCSubtargetInfo>());
+                                    MF->getSubtarget<MCSubtargetInfo>());
       CodeStream.flush();
 
       HexLines.resize(HexLines.size() + 1);
diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h
index 00d1f1b..d322fe0 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.h
+++ b/lib/Target/R600/AMDGPUMCInstLower.h
@@ -19,22 +19,9 @@ class MCContext;
 class MCInst;
 
 class AMDGPUMCInstLower {
-
-  // This must be kept in sync with the SISubtarget class in SIInstrInfo.td
-  enum SISubtarget {
-    SI = 0
-  };
-
   MCContext &Ctx;
   const AMDGPUSubtarget &ST;
 
-  /// Convert a member of the AMDGPUSubtarget::Generation enum to the
-  /// SISubtarget enum.
-  enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) const;
-
-  /// Get the MC opcode for this MachineInstr.
-  unsigned getMCOpcode(unsigned MIOpcode) const;
-
 public:
   AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST);
 
diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp
index 0f3f9e2..21c7da6 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.cpp
+++ b/lib/Target/R600/AMDGPUMachineFunction.cpp
@@ -15,9 +15,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   LDSSize(0),
   ScratchSize(0),
   IsKernel(true) {
-  AttributeSet Set = MF.getFunction()->getAttributes();
-  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
-                                 ShaderTypeAttribute);
+  Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute);
 
   if (A.isStringAttribute()) {
     StringRef Str = A.getValueAsString();
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 3433280..57b054b 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -42,8 +42,7 @@ void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 }
 
 unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  assert(!"Subroutines not supported yet");
-  return 0;
+  return AMDGPU::NoRegister;
 }
 
 unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index 9d09a19..70c8525 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -16,11 +16,11 @@
 #include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
 #include "R600MachineScheduler.h"
-#include "SIInstrInfo.h"
 #include "SIISelLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/SmallString.h"
-
-#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 
 using namespace llvm;
 
@@ -31,22 +31,9 @@ using namespace llvm;
 #define GET_SUBTARGETINFO_CTOR
 #include "AMDGPUGenSubtargetInfo.inc"
 
-static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
-  std::string Ret = "e-p:32:32";
-
-  if (ST.is64bit()) {
-    // 32-bit private, local, and region pointers. 64-bit global and constant.
-    Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
-  }
-
-  Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
-         "-v512:512-v1024:1024-v2048:2048-n32:64";
-
-  return Ret;
-}
-
 AMDGPUSubtarget &
-AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) {
+AMDGPUSubtarget::initializeSubtargetDependencies(StringRef TT, StringRef GPU,
+                                                 StringRef FS) {
   // Determine default and user-specified characteristics
   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
   // enabled, but some instructions do not respect them and they run at the
@@ -59,6 +46,9 @@ AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) {
   SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
   FullFS += FS;
 
+  if (GPU == "" && Triple(TT).getArch() == Triple::amdgcn)
+    GPU = "SI";
+
   ParseSubtargetFeatures(GPU, FullFS);
 
   // FIXME: I don't think think Evergreen has any useful support for
@@ -76,21 +66,24 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS,
     : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false),
       DumpCode(false), R600ALUInst(false), HasVertexCache(false),
       TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
-      FP64Denormals(false), FP32Denormals(false), CaymanISA(false),
-      FlatAddressSpace(false), EnableIRStructurizer(true),
-      EnablePromoteAlloca(false), EnableIfCvt(true),
-      EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
-      DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))),
+      FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
+      CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
+      EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
+      WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
+      EnableVGPRSpilling(false),
       FrameLowering(TargetFrameLowering::StackGrowsUp,
                     64 * 16, // Maximum stack alignment (long16)
                     0),
-      InstrItins(getInstrItineraryForCPU(GPU)) {
+      InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
+
+  initializeSubtargetDependencies(TT, GPU, FS);
+
   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
     InstrInfo.reset(new R600InstrInfo(*this));
-    TLInfo.reset(new R600TargetLowering(TM));
+    TLInfo.reset(new R600TargetLowering(TM, *this));
   } else {
     InstrInfo.reset(new SIInstrInfo(*this));
-    TLInfo.reset(new SITargetLowering(TM));
+    TLInfo.reset(new SITargetLowering(TM, *this));
   }
 }
 
@@ -107,3 +100,33 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const {
     llvm_unreachable("Illegal wavefront size.");
   }
 }
+
+unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
+  switch(getGeneration()) {
+  default: llvm_unreachable("ChipID unknown");
+  case SEA_ISLANDS: return 12;
+  }
+}
+
+bool AMDGPUSubtarget::isVGPRSpillingEnabled(
+                                       const SIMachineFunctionInfo *MFI) const {
+  return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling;
+}
+
+void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                          MachineInstr *begin,
+                                          MachineInstr *end,
+                                          unsigned NumRegionInstrs) const {
+  if (getGeneration() >= SOUTHERN_ISLANDS) {
+
+    // Track register pressure so the scheduler can try to decrease
+    // pressure once register usage is above the threshold defined by
+    // SIRegisterInfo::getRegPressureSetLimit()
+    Policy.ShouldTrackPressure = true;
+
+    // Enabling both top down and bottom up scheduling seems to give us less
+    // register spills than just using one of these approaches on its own.
+    Policy.OnlyTopDown = false;
+    Policy.OnlyBottomUp = false;
+  }
+}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index f71d80a..1b0122c 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -20,7 +20,6 @@
 #include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "R600ISelLowering.h"
-#include "llvm/IR/DataLayout.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
@@ -30,6 +29,8 @@
 
 namespace llvm {
 
+class SIMachineFunctionInfo;
+
 class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
 
 public:
@@ -39,7 +40,8 @@ public:
     EVERGREEN,
     NORTHERN_ISLANDS,
     SOUTHERN_ISLANDS,
-    SEA_ISLANDS
+    SEA_ISLANDS,
+    VOLCANIC_ISLANDS,
   };
 
 private:
@@ -53,6 +55,7 @@ private:
   bool FP64;
   bool FP64Denormals;
   bool FP32Denormals;
+  bool FastFMAF32;
   bool CaymanISA;
   bool FlatAddressSpace;
   bool EnableIRStructurizer;
@@ -62,16 +65,18 @@ private:
   unsigned WavefrontSize;
   bool CFALUBug;
   int LocalMemorySize;
+  bool EnableVGPRSpilling;
 
-  const DataLayout DL;
   AMDGPUFrameLowering FrameLowering;
   std::unique_ptr<AMDGPUTargetLowering> TLInfo;
   std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
   InstrItineraryData InstrItins;
+  Triple TargetTriple;
 
 public:
   AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS, TargetMachine &TM);
-  AMDGPUSubtarget &initializeSubtargetDependencies(StringRef GPU, StringRef FS);
+  AMDGPUSubtarget &initializeSubtargetDependencies(StringRef TT, StringRef GPU,
+                                                   StringRef FS);
 
   const AMDGPUFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
@@ -85,7 +90,6 @@ public:
   AMDGPUTargetLowering *getTargetLowering() const override {
     return TLInfo.get();
   }
-  const DataLayout *getDataLayout() const override { return &DL; }
   const InstrItineraryData *getInstrItineraryData() const override {
     return &InstrItins;
   }
@@ -124,6 +128,10 @@ public:
     return FP64Denormals;
   }
 
+  bool hasFastFMAF32() const {
+    return FastFMAF32;
+  }
+
   bool hasFlatAddressSpace() const {
     return FlatAddressSpace;
   }
@@ -198,10 +206,16 @@ public:
     return LocalMemorySize;
   }
 
+  unsigned getAmdKernelCodeChipID() const;
+
   bool enableMachineScheduler() const override {
-    return getGeneration() <= NORTHERN_ISLANDS;
+    return true;
   }
 
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           MachineInstr *begin, MachineInstr *end,
+                           unsigned NumRegionInstrs) const override;
+
   // Helper functions to simplify if statements
   bool isTargetELF() const {
     return false;
@@ -217,6 +231,22 @@ public:
   bool r600ALUEncoding() const {
     return R600ALUInst;
   }
+  bool isAmdHsaOS() const {
+    return TargetTriple.getOS() == Triple::AMDHSA;
+  }
+  bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;
+
+  unsigned getMaxWavesPerCU() const {
+    if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      return 10;
+
+    // FIXME: Not sure what this is for other subtagets.
+    llvm_unreachable("do not know max waves per CU for this subtarget.");
+  }
+
+  bool enableSubRegLiveness() const override {
+    return false;
+  }
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index b2cd988..a862f3c 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -15,6 +15,7 @@
 
 #include "AMDGPUTargetMachine.h"
 #include "AMDGPU.h"
+#include "AMDGPUTargetTransformInfo.h"
 #include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
 #include "R600MachineScheduler.h"
@@ -27,7 +28,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_os_ostream.h"
 #include "llvm/Transforms/IPO.h"
@@ -38,7 +39,8 @@ using namespace llvm;
 
 extern "C" void LLVMInitializeR600Target() {
   // Register the target
-  RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget);
+  RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
+  RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);
 }
 
 static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
@@ -49,12 +51,28 @@ static MachineSchedRegistry
 SchedCustomRegistry("r600", "Run R600's custom scheduler",
                     createR600MachineScheduler);
 
+static std::string computeDataLayout(StringRef TT) {
+  Triple Triple(TT);
+  std::string Ret = "e-p:32:32";
+
+  if (Triple.getArch() == Triple::amdgcn) {
+    // 32-bit private, local, and region pointers. 64-bit global and constant.
+    Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
+  }
+
+  Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
+         "-v512:512-v1024:1024-v2048:2048-n32:64";
+
+  return Ret;
+}
+
 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
                                          StringRef CPU, StringRef FS,
                                          TargetOptions Options, Reloc::Model RM,
                                          CodeModel::Model CM,
                                          CodeGenOpt::Level OptLevel)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
+      DL(computeDataLayout(TT)),
       TLOF(new TargetLoweringObjectFileELF()),
       Subtarget(TT, CPU, FS, *this), IntrinsicInfo() {
   setRequiresStructuredCFG(true);
@@ -65,10 +83,33 @@ AMDGPUTargetMachine::~AMDGPUTargetMachine() {
   delete TLOF;
 }
 
+//===----------------------------------------------------------------------===//
+// R600 Target Machine (R600 -> Cayman)
+//===----------------------------------------------------------------------===//
+
+R600TargetMachine::R600TargetMachine(const Target &T, StringRef TT, StringRef FS,
+                    StringRef CPU, TargetOptions Options, Reloc::Model RM,
+                    CodeModel::Model CM, CodeGenOpt::Level OL) :
+    AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { }
+
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+)
+//===----------------------------------------------------------------------===//
+
+GCNTargetMachine::GCNTargetMachine(const Target &T, StringRef TT, StringRef FS,
+                    StringRef CPU, TargetOptions Options, Reloc::Model RM,
+                    CodeModel::Model CM, CodeGenOpt::Level OL) :
+    AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { }
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Pass Setup
+//===----------------------------------------------------------------------===//
+
 namespace {
 class AMDGPUPassConfig : public TargetPassConfig {
 public:
-  AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
+  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {}
 
   AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
@@ -85,29 +126,38 @@ public:
 
   void addIRPasses() override;
   void addCodeGenPrepare() override;
+  virtual bool addPreISel() override;
+  virtual bool addInstSelector() override;
+};
+
+class R600PassConfig : public AMDGPUPassConfig {
+public:
+  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
+    : AMDGPUPassConfig(TM, PM) { }
+
   bool addPreISel() override;
-  bool addInstSelector() override;
-  bool addPreRegAlloc() override;
-  bool addPostRegAlloc() override;
-  bool addPreSched2() override;
-  bool addPreEmitPass() override;
+  void addPreRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
 };
-} // End of anonymous namespace
 
-TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new AMDGPUPassConfig(this, PM);
-}
+class GCNPassConfig : public AMDGPUPassConfig {
+public:
+  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
+    : AMDGPUPassConfig(TM, PM) { }
+  bool addPreISel() override;
+  bool addInstSelector() override;
+  void addPreRegAlloc() override;
+  void addPostRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
 
-//===----------------------------------------------------------------------===//
-// AMDGPU Analysis Pass Setup
-//===----------------------------------------------------------------------===//
+} // End of anonymous namespace
 
-void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  // Add first the target-independent BasicTTI pass, then our AMDGPU pass. This
-  // allows the AMDGPU pass to delegate to the target independent layer when
-  // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(this));
-  PM.add(createAMDGPUTargetTransformInfoPass(this));
+TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis(
+      [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); });
 }
 
 void AMDGPUPassConfig::addIRPasses() {
@@ -129,7 +179,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
     addPass(createAMDGPUPromoteAlloca(ST));
     addPass(createSROAPass());
   }
-
   TargetPassConfig::addCodeGenPrepare();
 }
 
@@ -139,84 +188,96 @@ AMDGPUPassConfig::addPreISel() {
   addPass(createFlattenCFGPass());
   if (ST.IsIRStructurizerEnabled())
     addPass(createStructurizeCFGPass());
-  if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-    addPass(createSinkingPass());
-    addPass(createSITypeRewriter());
-    addPass(createSIAnnotateControlFlowPass());
-  } else {
-    addPass(createR600TextureIntrinsicsReplacer());
-  }
   return false;
 }
 
 bool AMDGPUPassConfig::addInstSelector() {
-  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-
   addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
+  return false;
+}
 
-  if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-    addPass(createSILowerI1CopiesPass());
-    addPass(createSIFixSGPRCopiesPass(*TM));
-  }
+//===----------------------------------------------------------------------===//
+// R600 Pass Setup
+//===----------------------------------------------------------------------===//
 
+bool R600PassConfig::addPreISel() {
+  AMDGPUPassConfig::addPreISel();
+  addPass(createR600TextureIntrinsicsReplacer());
   return false;
 }
 
-bool AMDGPUPassConfig::addPreRegAlloc() {
+void R600PassConfig::addPreRegAlloc() {
+  addPass(createR600VectorRegMerger(*TM));
+}
+
+void R600PassConfig::addPreSched2() {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+  addPass(createR600EmitClauseMarkers(), false);
+  if (ST.isIfCvtEnabled())
+    addPass(&IfConverterID, false);
+  addPass(createR600ClauseMergePass(*TM), false);
+}
 
-  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
-    addPass(createR600VectorRegMerger(*TM));
-  } else {
-     if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
-      // Don't do this with no optimizations since it throws away debug info by
-      // merging nonadjacent loads.
-
-      // This should be run after scheduling, but before register allocation. It
-      // also need extra copies to the address operand to be eliminated.
-      initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
-      insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
-    }
-
-    addPass(createSIShrinkInstructionsPass());
-    addPass(createSIFixSGPRLiveRangesPass());
-  }
-  return false;
+void R600PassConfig::addPreEmitPass() {
+  addPass(createAMDGPUCFGStructurizerPass(), false);
+  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
+  addPass(&FinalizeMachineBundlesID, false);
+  addPass(createR600Packetizer(*TM), false);
+  addPass(createR600ControlFlowFinalizer(*TM), false);
 }
 
-bool AMDGPUPassConfig::addPostRegAlloc() {
-  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new R600PassConfig(this, PM);
+}
 
-  addPass(createSIShrinkInstructionsPass());
-  if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
-    addPass(createSIInsertWaits(*TM));
-  }
+//===----------------------------------------------------------------------===//
+// GCN Pass Setup
+//===----------------------------------------------------------------------===//
+
+bool GCNPassConfig::addPreISel() {
+  AMDGPUPassConfig::addPreISel();
+  addPass(createSinkingPass());
+  addPass(createSITypeRewriter());
+  addPass(createSIAnnotateControlFlowPass());
   return false;
 }
 
-bool AMDGPUPassConfig::addPreSched2() {
-  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-
-  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
-    addPass(createR600EmitClauseMarkers());
-  if (ST.isIfCvtEnabled())
-    addPass(&IfConverterID);
-  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
-    addPass(createR600ClauseMergePass(*TM));
+bool GCNPassConfig::addInstSelector() {
+  AMDGPUPassConfig::addInstSelector();
+  addPass(createSILowerI1CopiesPass());
+  addPass(createSIFixSGPRCopiesPass(*TM));
+  addPass(createSIFoldOperandsPass());
   return false;
 }
 
-bool AMDGPUPassConfig::addPreEmitPass() {
+void GCNPassConfig::addPreRegAlloc() {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
-    addPass(createAMDGPUCFGStructurizerPass());
-    addPass(createR600ExpandSpecialInstrsPass(*TM));
-    addPass(&FinalizeMachineBundlesID);
-    addPass(createR600Packetizer(*TM));
-    addPass(createR600ControlFlowFinalizer(*TM));
-  } else {
-    addPass(createSILowerControlFlowPass(*TM));
+  if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
+  // Don't do this with no optimizations since it throws away debug info by
+  // merging nonadjacent loads.
+
+  // This should be run after scheduling, but before register allocation. It
+  // also need extra copies to the address operand to be eliminated.
+  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
+  insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
   }
+  addPass(createSIShrinkInstructionsPass(), false);
+  addPass(createSIFixSGPRLiveRangesPass(), false);
+}
 
-  return false;
+void GCNPassConfig::addPostRegAlloc() {
+  addPass(createSIPrepareScratchRegs(), false);
+  addPass(createSIShrinkInstructionsPass(), false);
+}
+
+void GCNPassConfig::addPreSched2() {
+  addPass(createSIInsertWaits(*TM), false);
+}
+
+void GCNPassConfig::addPreEmitPass() {
+  addPass(createSILowerControlFlowPass(*TM), false);
+}
+
+TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new GCNPassConfig(this, PM);
 }
diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h
index 1b3dbce..a691536 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.h
+++ b/lib/Target/R600/AMDGPUTargetMachine.h
@@ -24,7 +24,15 @@
 
 namespace llvm {
 
+//===----------------------------------------------------------------------===//
+// AMDGPU Target Machine (R600+)
+//===----------------------------------------------------------------------===//
+
 class AMDGPUTargetMachine : public LLVMTargetMachine {
+private:
+  const DataLayout DL;
+
+protected:
   TargetLoweringObjectFile *TLOF;
   AMDGPUSubtarget Subtarget;
   AMDGPUIntrinsicInfo IntrinsicInfo;
@@ -34,21 +42,52 @@ public:
                       StringRef CPU, TargetOptions Options, Reloc::Model RM,
                       CodeModel::Model CM, CodeGenOpt::Level OL);
   ~AMDGPUTargetMachine();
+  // FIXME: This is currently broken, the DataLayout needs to move to
+  // the target machine.
+  const DataLayout *getDataLayout() const override {
+    return &DL;
+  }
   const AMDGPUSubtarget *getSubtargetImpl() const override {
     return &Subtarget;
   }
   const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
     return &IntrinsicInfo;
   }
-  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  TargetIRAnalysis getTargetIRAnalysis() override;
 
-  /// \brief Register R600 analysis passes with a pass manager.
-  void addAnalysisPasses(PassManagerBase &PM) override;
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF;
   }
 };
 
+//===----------------------------------------------------------------------===//
+// R600 Target Machine (R600 -> Cayman)
+//===----------------------------------------------------------------------===//
+
+class R600TargetMachine : public AMDGPUTargetMachine {
+
+public:
+  R600TargetMachine(const Target &T, StringRef TT, StringRef FS,
+                    StringRef CPU, TargetOptions Options, Reloc::Model RM,
+                    CodeModel::Model CM, CodeGenOpt::Level OL);
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+};
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+)
+//===----------------------------------------------------------------------===//
+
+class GCNTargetMachine : public AMDGPUTargetMachine {
+
+public:
+  GCNTargetMachine(const Target &T, StringRef TT, StringRef FS,
+                    StringRef CPU, TargetOptions Options, Reloc::Model RM,
+                    CodeModel::Model CM, CodeGenOpt::Level OL);
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+};
+
 } // End namespace llvm
 
 #endif
diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
index e7bc006..68f4600 100644
--- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
@@ -15,11 +15,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUTargetTransformInfo.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
@@ -27,80 +27,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "AMDGPUtti"
 
-// Declare the pass initialization routine locally as target-specific passes
-// don't have a target-wide initialization entry point, and so we rely on the
-// pass constructor initialization.
-namespace llvm {
-void initializeAMDGPUTTIPass(PassRegistry &);
-}
-
-namespace {
-
-class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo {
-  const AMDGPUTargetMachine *TM;
-  const AMDGPUSubtarget *ST;
-  const AMDGPUTargetLowering *TLI;
-
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
-
-public:
-  AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
-    llvm_unreachable("This pass cannot be directly constructed");
-  }
-
-  AMDGPUTTI(const AMDGPUTargetMachine *TM)
-      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
-        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
-    initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override { pushTTIStack(this); }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    TargetTransformInfo::getAnalysisUsage(AU);
-  }
-
-  /// Pass identification.
-  static char ID;
-
-  /// Provide necessary pointer adjustments for the two base classes.
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo *)this;
-    return this;
-  }
-
-  bool hasBranchDivergence() const override;
-
-  void getUnrollingPreferences(const Function *F, Loop *L,
-                               UnrollingPreferences &UP) const override;
-
-  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override;
-
-  unsigned getNumberOfRegisters(bool Vector) const override;
-  unsigned getRegisterBitWidth(bool Vector) const override;
-  unsigned getMaxInterleaveFactor() const override;
-};
-
-} // end anonymous namespace
-
-INITIALIZE_AG_PASS(AMDGPUTTI, TargetTransformInfo, "AMDGPUtti",
-                   "AMDGPU Target Transform Info", true, true, false)
-char AMDGPUTTI::ID = 0;
-
-ImmutablePass *
-llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) {
-  return new AMDGPUTTI(TM);
-}
-
-bool AMDGPUTTI::hasBranchDivergence() const { return true; }
-
-void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L,
-                                        UnrollingPreferences &UP) const {
+void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
+                                            TTI::UnrollingPreferences &UP) {
   UP.Threshold = 300; // Twice the default.
-  UP.Count = UINT_MAX;
+  UP.MaxCount = UINT_MAX;
   UP.Partial = true;
 
   // TODO: Do we want runtime unrolling?
@@ -130,13 +60,7 @@ void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L,
   }
 }
 
-AMDGPUTTI::PopcntSupportKind
-AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const {
-  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
-  return ST->hasBCNT(TyWidth) ? PSK_FastHardware : PSK_Software;
-}
-
-unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const {
+unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
   if (Vec)
     return 0;
 
@@ -147,11 +71,9 @@ unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const {
   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
 }
 
-unsigned AMDGPUTTI::getRegisterBitWidth(bool) const {
-  return 32;
-}
+unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; }
 
-unsigned AMDGPUTTI::getMaxInterleaveFactor() const {
+unsigned AMDGPUTTIImpl::getMaxInterleaveFactor() {
   // Semi-arbitrary large amount.
   return 64;
 }
diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.h b/lib/Target/R600/AMDGPUTargetTransformInfo.h
new file mode 100644
index 0000000..4abbdf2
--- /dev/null
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.h
@@ -0,0 +1,78 @@
+//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// AMDGPU target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {
+  typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const AMDGPUSubtarget *ST;
+  const AMDGPUTargetLowering *TLI;
+
+  const AMDGPUSubtarget *getST() const { return ST; }
+  const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM)
+      : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+  AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+        TLI(std::move(Arg.TLI)) {}
+  AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) {
+    BaseT::operator=(static_cast<const BaseT &>(RHS));
+    ST = RHS.ST;
+    TLI = RHS.TLI;
+    return *this;
+  }
+  AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    ST = std::move(RHS.ST);
+    TLI = std::move(RHS.TLI);
+    return *this;
+  }
+
+  bool hasBranchDivergence() { return true; }
+
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
+    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+    return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software;
+  }
+
+  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector);
+  unsigned getMaxInterleaveFactor();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/R600/AMDKernelCodeT.h b/lib/Target/R600/AMDKernelCodeT.h
new file mode 100644
index 0000000..4d3041f
--- /dev/null
+++ b/lib/Target/R600/AMDKernelCodeT.h
@@ -0,0 +1,704 @@
+//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file AMDKernelCodeT.h
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDKERNELCODET_H
+#define AMDKERNELCODET_H
+
+#include <cstddef>
+#include <cstdint>
+
+//---------------------------------------------------------------------------//
+// AMD Kernel Code, and its dependencies                                     //
+//---------------------------------------------------------------------------//
+
+typedef uint8_t hsa_powertwo8_t;
+typedef uint32_t hsa_ext_code_kind_t;
+typedef uint8_t hsa_ext_brig_profile8_t;
+typedef uint8_t hsa_ext_brig_machine_model8_t;
+typedef uint64_t hsa_ext_control_directive_present64_t;
+typedef uint16_t hsa_ext_exception_kind16_t;
+typedef uint32_t hsa_ext_code_kind32_t;
+
+typedef struct hsa_dim3_s {
+  uint32_t x;
+  uint32_t y;
+  uint32_t z;
+} hsa_dim3_t;
+
+/// The version of the amd_*_code_t struct. Minor versions must be
+/// backward compatible.
+typedef uint32_t amd_code_version32_t;
+enum amd_code_version_t {
+  AMD_CODE_VERSION_MAJOR = 0,
+  AMD_CODE_VERSION_MINOR = 1
+};
+
+/// The values used to define the number of bytes to use for the
+/// swizzle element size.
+enum amd_element_byte_size_t {
+  AMD_ELEMENT_2_BYTES = 0,
+  AMD_ELEMENT_4_BYTES = 1,
+  AMD_ELEMENT_8_BYTES = 2,
+  AMD_ELEMENT_16_BYTES = 3
+};
+
+/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+/// COMPUTE_PGM_RSRC2 registers.
+typedef uint64_t amd_compute_pgm_resource_register64_t;
+
+/// Every amd_*_code_t has the following properties, which are composed of
+/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
+/// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount
+/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.
+///
+/// (Note that bit fields cannot be used as their layout is
+/// implementation defined in the C standard and so cannot be used to
+/// specify an ABI)
+typedef uint32_t amd_code_property32_t;
+enum amd_code_property_mask_t {
+
+  /// Enable the setup of the SGPR user data registers
+  /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
+  /// for initial register state.
+  ///
+  /// The total number of SGPRuser data registers requested must not
+  /// exceed 16. Any requests beyond 16 will be ignored.
+  ///
+  /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
+  /// SGPR user data registers enabled up to 16).
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
+
+  /// Control wave ID base counter for GDS ordered-append. Used to set
+  /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
+  /// ORDERED_APPEND_MODE also needs to be settable)
+  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10,
+  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
+
+  /// The interleave (swizzle) element size in bytes required by the
+  /// code for private memory. This must be 2, 4, 8 or 16. This value
+  /// is provided to the finalizer when it is invoked and is recorded
+  /// here. The hardware will interleave the memory requests of each
+  /// lane of a wavefront by this element size to ensure each
+  /// work-item gets a distinct memory memory location. Therefore, the
+  /// finalizer ensures that all load and store operations done to
+  /// private memory do not exceed this size. For example, if the
+  /// element size is 4 (32-bits or dword) and a 64-bit value must be
+  /// loaded, the finalizer will generate two 32-bit loads. This
+  /// ensures that the interleaving will get the the work-item
+  /// specific dword for both halves of the 64-bit value. If it just
+  /// did a 64-bit load then it would get one dword which belonged to
+  /// its own work-item, but the second dword would belong to the
+  /// adjacent lane work-item since the interleaving is in dwords.
+  ///
+  /// The value used must match the value that the runtime configures
+  /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
+  /// is generally DWORD.
+  ///
+  /// Use values from the amd_element_byte_size_t enum.
+  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11,
+  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
+  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
+
+  /// Are global memory addresses 64 bits. Must match
+  /// amd_kernel_code_t.hsail_machine_model ==
+  /// HSA_MACHINE_LARGE. Must also match
+  /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
+  /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
+  AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13,
+  AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
+  AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
+
+  /// Indicate if the generated ISA is using a dynamically sized call
+  /// stack. This can happen if calls are implemented using a call
+  /// stack and recursion, alloca or calls to indirect functions are
+  /// present. In these cases the Finalizer cannot compute the total
+  /// private segment size at compile time. In this case the
+  /// workitem_private_segment_byte_size only specifies the statically
+  /// know private segment size, and additional space must be added
+  /// for the call stack.
+  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14,
+  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
+  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
+
+  /// Indicate if code generated has support for debugging.
+  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15,
+  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
+  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT
+};
+
+/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
+/// control directives. These control how the finalizer generates code. This
+/// struct is used both as an argument to hsaFinalizeKernel to specify values for
+/// the control directives, and is used in HsaKernelCode to record the values of
+/// the control directives that the finalize used when generating the code which
+/// either came from the finalizer argument or explicit HSAIL control
+/// directives. See the definition of the control directives in HSA Programmer's
+/// Reference Manual which also defines how the values specified as finalizer
+/// arguments have to agree with the control directives in the HSAIL code.
+typedef struct hsa_ext_control_directives_s {
+  /// This is a bit set indicating which control directives have been
+  /// specified. If the value is 0 then there are no control directives specified
+  /// and the rest of the fields can be ignored. The bits are accessed using the
+  /// hsa_ext_control_directives_present_mask_t. Any control directive that is not
+  /// enabled in this bit set must have the value of all 0s.
+  hsa_ext_control_directive_present64_t enabled_control_directives;
+
+  /// If enableBreakExceptions is not enabled then must be 0, otherwise must be
+  /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK
+  /// policy enabled. If this set is not empty then the generated code may have
+  /// lower performance than if the set is empty. If the kernel being finalized
+  /// has any enablebreakexceptions control directives, then the values specified
+  /// by this argument are unioned with the values in these control
+  /// directives. If any of the functions the kernel calls have an
+  /// enablebreakexceptions control directive, then they must be equal or a
+  /// subset of, this union.
+  hsa_ext_exception_kind16_t enable_break_exceptions;
+
+  /// If enableDetectExceptions is not enabled then must be 0, otherwise must be
+  /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT
+  /// policy enabled. If this set is not empty then the generated code may have
+  /// lower performance than if the set is empty. However, an implementation
+  /// should endeavour to make the performance impact small. If the kernel being
+  /// finalized has any enabledetectexceptions control directives, then the
+  /// values specified by this argument are unioned with the values in these
+  /// control directives. If any of the functions the kernel calls have an
+  /// enabledetectexceptions control directive, then they must be equal or a
+  /// subset of, this union.
+  hsa_ext_exception_kind16_t enable_detect_exceptions;
+
+  /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of
+  /// dynamic group segment can be allocated for a dispatch, otherwise the value
+  /// specifies the maximum number of bytes of dynamic group segment that can be
+  /// allocated for a dispatch. If the kernel being finalized has any
+  /// maxdynamicsize control directives, then the values must be the same, and
+  /// must be the same as this argument if it is enabled. This value can be used
+  /// by the finalizer to determine the maximum number of bytes of group memory
+  /// used by each work-group by adding this value to the group memory required
+  /// for all group segment variables used by the kernel and all functions it
+  /// calls, and group memory used to implement other HSAIL features such as
+  /// fbarriers and the detect exception operations. This can allow the finalizer
+  /// to determine the expected number of work-groups that can be executed by a
+  /// compute unit and allow more resources to be allocated to the work-items if
+  /// it is known that fewer work-groups can be executed due to group memory
+  /// limitations.
+  uint32_t max_dynamic_group_size;
+
+  /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater
+  /// than 0. See HSA Programmer's Reference Manual description of
+  /// maxflatgridsize control directive.
+  uint32_t max_flat_grid_size;
+
+  /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be
+  /// greater than 0. See HSA Programmer's Reference Manual description of
+  /// maxflatworkgroupsize control directive.
+  uint32_t max_flat_workgroup_size;
+
+  /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the
+  /// finalizer is free to generate ISA that may result in any number of
+  /// work-groups executing on a single compute unit. Otherwise, the finalizer
+  /// should attempt to generate ISA that will allow the specified number of
+  /// work-groups to execute on a single compute unit. This is only a hint and
+  /// can be ignored by the finalizer. If the kernel being finalized, or any of
+  /// the functions it calls, has a requested control directive, then the values
+  /// must be the same. This can be used to determine the number of resources
+  /// that should be allocated to a single work-group and work-item. For example,
+  /// a low value may allow more resources to be allocated, resulting in higher
+  /// per work-item performance, as it is known there will never be more than the
+  /// specified number of work-groups actually executing on the compute
+  /// unit. Conversely, a high value may allocate fewer resources, resulting in
+  /// lower per work-item performance, which is offset by the fact it allows more
+  /// work-groups to actually execute on the compute unit.
+  uint32_t requested_workgroups_per_cu;
+
+  /// If not enabled then all elements for Dim3 must be 0, otherwise every
+  /// element must be greater than 0. See HSA Programmer's Reference Manual
+  /// description of requiredgridsize control directive.
+  hsa_dim3_t required_grid_size;
+
+  /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be
+  /// 0, and the produced code can be dispatched with any legal work-group range
+  /// consistent with the dispatch dimensions. Otherwise, the code produced must
+  /// always be dispatched with the specified work-group range. No element of the
+  /// specified range must be 0. It must be consistent with required_dimensions
+  /// and max_flat_workgroup_size. If the kernel being finalized, or any of the
+  /// functions it calls, has a requiredworkgroupsize control directive, then the
+  /// values must be the same. Specifying a value can allow the finalizer to
+  /// optimize work-group id operations, and if the number of work-items in the
+  /// work-group is less than the WAVESIZE then barrier operations can be
+  /// optimized to just a memory fence.
+  hsa_dim3_t required_workgroup_size;
+
+  /// If requiredDim is not enabled then must be 0 and the produced kernel code
+  /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is
+  /// 1..3 and the code produced must only be dispatched with a dimension that
+  /// matches. Other values are illegal. If the kernel being finalized, or any of
+  /// the functions it calls, has a requireddimsize control directive, then the
+  /// values must be the same. This can be used to optimize the code generated to
+  /// compute the absolute and flat work-group and work-item id, and the dim
+  /// HSAIL operations.
+  uint8_t required_dim;
+
+  /// Reserved. Must be 0.
+  uint8_t reserved[75];
+} hsa_ext_control_directives_t;
+
+/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel
+/// Code Object to set up the hardware to execute the kernel dispatch.
+///
+/// Initial Kernel Register State.
+///
+/// Initial kernel register state will be set up by CP/SPI prior to the start
+/// of execution of every wavefront. This is limited by the constraints of the
+/// current hardware.
+///
+/// The order of the SGPR registers is defined, but the Finalizer can specify
+/// which ones are actually setup in the amd_kernel_code_t object using the
+/// enable_sgpr_* bit fields. The register numbers used for enabled registers
+/// are dense starting at SGPR0: the first enabled register is SGPR0, the next
+/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR
+/// number.
+///
+/// The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and
+/// apply to all waves of the grid. It is possible to specify more than 16 User
+/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16
+/// are actually initialized. These are then immediately followed by the System
+/// SGPRs that are set up by ADC/SPI and can have different values for each wave
+/// of the grid dispatch.
+///
+/// SGPR register initial state is defined as follows:
+///
+/// Private Segment Buffer (enable_sgpr_private_segment_buffer):
+///   Number of User SGPR registers: 4. V# that can be used, together with
+///   Scratch Wave Offset as an offset, to access the Private/Spill/Arg
+///   segments using a segment address. It must be set as follows:
+///     - Base address: of the scratch memory area used by the dispatch. It
+///       does not include the scratch wave offset. It will be the per process
+///       SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for
+///       example there may be a per pipe offset, or per AQL Queue offset).
+///     - Stride + data_format: Element Size * Index Stride (???)
+///     - Cache swizzle: ???
+///     - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for
+///       scratch)
+///     - Num records: Flat Scratch Work Item Size / Element Size (???)
+///     - Dst_sel_*: ???
+///     - Num_format: ???
+///     - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must
+///       agree with amd_kernel_code_t.privateElementSize)
+///     - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must
+///       be number of wavefront lanes for scratch, must agree with
+///       amd_kernel_code_t.wavefrontSize)
+///     - Add tid enable: 1
+///     - ATC: from SH_MEM_CONFIG.PRIVATE_ATC,
+///     - Hash_enable: ???
+///     - Heap: ???
+///     - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE
+///     - Type: 0 (a buffer) (???)
+///
+/// Dispatch Ptr (enable_sgpr_dispatch_ptr):
+///   Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet
+///   for kernel actually executing.
+///
+/// Queue Ptr (enable_sgpr_queue_ptr):
+///   Number of User SGPR registers: 2. 64 bit address of AmdQueue object for
+///   AQL queue on which the dispatch packet was queued.
+///
+/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr):
+///   Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This
+///   is directly copied from the kernargPtr in the dispatch packet. Having CP
+///   load it once avoids loading it at the beginning of every wavefront.
+///
+/// Dispatch Id (enable_sgpr_dispatch_id):
+///   Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch
+///   packet being executed.
+///
+/// Flat Scratch Init (enable_sgpr_flat_scratch_init):
+///   Number of User SGPR registers: 2. This is 2 SGPRs.
+///
+///   For CI/VI:
+///     The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE
+///     to base of memory for scratch for this dispatch. This is the same offset
+///     used in computing the Scratch Segment Buffer base address. The value of
+///     Scratch Wave Offset must be added by the kernel code and moved to
+///     SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.
+///
+///     The second SGPR is 32 bit byte size of a single work-item�s scratch
+///     memory usage. This is directly loaded from the dispatch packet Private
+///     Segment Byte Size and rounded up to a multiple of DWORD.
+///
+///     \todo [Does CP need to round this to >4 byte alignment?]
+///
+///     The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in
+///     flat memory instructions. Having CP load it once avoids loading it at
+///     the beginning of every wavefront.
+///
+///   For PI:
+///     This is the 64 bit base address of the scratch backing memory for
+///     allocated by CP for this dispatch.
+///
+/// Private Segment Size (enable_sgpr_private_segment_size):
+///   Number of User SGPR registers: 1. The 32 bit byte size of a single
+///   work-item�s scratch memory allocation. This is the value from the dispatch
+///   packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD.
+///
+///   \todo [Does CP need to round this to >4 byte alignment?]
+///
+///   Having CP load it once avoids loading it at the beginning of every
+///   wavefront.
+///
+///   \todo [This will not be used for CI/VI since it is the same value as
+///   the second SGPR of Flat Scratch Init. However, it is need for PI which
+///   changes meaning of Flat Scratchg Init..]
+///
+/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x):
+///   Number of User SGPR registers: 1. 32 bit count of the number of
+///   work-groups in the X dimension for the grid being executed. Computed from
+///   the fields in the HsaDispatchPacket as
+///   ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).
+///
+/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y):
+///   Number of User SGPR registers: 1. 32 bit count of the number of
+///   work-groups in the Y dimension for the grid being executed. Computed from
+///   the fields in the HsaDispatchPacket as
+///   ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).
+///
+///   Only initialized if <16 previous SGPRs initialized.
+///
+/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z):
+///   Number of User SGPR registers: 1. 32 bit count of the number of
+///   work-groups in the Z dimension for the grid being executed. Computed
+///   from the fields in the HsaDispatchPacket as
+///   ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).
+///
+///   Only initialized if <16 previous SGPRs initialized.
+///
+/// Work-Group Id X (enable_sgpr_workgroup_id_x):
+///   Number of System SGPR registers: 1. 32 bit work group id in X dimension
+///   of grid for wavefront. Always present.
+///
+/// Work-Group Id Y (enable_sgpr_workgroup_id_y):
+///   Number of System SGPR registers: 1. 32 bit work group id in Y dimension
+///   of grid for wavefront.
+///
+/// Work-Group Id Z (enable_sgpr_workgroup_id_z):
+///   Number of System SGPR registers: 1. 32 bit work group id in Z dimension
+///   of grid for wavefront. If present then Work-group Id Y will also be
+///   present
+///
+/// Work-Group Info (enable_sgpr_workgroup_info):
+///   Number of System SGPR registers: 1. {first_wave, 14�b0000,
+///   ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}
+///
+/// Private Segment Wave Byte Offset
+/// (enable_sgpr_private_segment_wave_byte_offset):
+///   Number of System SGPR registers: 1. 32 bit byte offset from base of
+///   dispatch scratch base. Must be used as an offset with Private/Spill/Arg
+///   segment address when using Scratch Segment Buffer. It must be added to
+///   Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing.
+///
+///
+/// The order of the VGPR registers is defined, but the Finalizer can specify
+/// which ones are actually setup in the amd_kernel_code_t object using the
+/// enableVgpr*  bit fields. The register numbers used for enabled registers
+/// are dense starting at VGPR0: the first enabled register is VGPR0, the next
+/// enabled register is VGPR1 etc.; disabled registers do not have an VGPR
+/// number.
+///
+/// VGPR register initial state is defined as follows:
+///
+/// Work-Item Id X (always initialized):
+///   Number of registers: 1. 32 bit work item id in X dimension of work-group
+///   for wavefront lane.
+///
+/// Work-Item Id X (enable_vgpr_workitem_id > 0):
+///   Number of registers: 1. 32 bit work item id in Y dimension of work-group
+///   for wavefront lane.
+///
+/// Work-Item Id X (enable_vgpr_workitem_id > 0):
+///   Number of registers: 1. 32 bit work item id in Z dimension of work-group
+///   for wavefront lane.
+///
+///
+/// The setting of registers is being done by existing GPU hardware as follows:
+///   1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data
+///      registers.
+///   2) Work-group Id registers X, Y, Z are set by SPI which supports any
+///      combination including none.
+///   3) Scratch Wave Offset is also set by SPI which is why its value cannot
+///      be added into the value Flat Scratch Offset which would avoid the
+///      Finalizer generated prolog having to do the add.
+///   4) The VGPRs are set by SPI which only supports specifying either (X),
+///      (X, Y) or (X, Y, Z).
+///
+/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so
+/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and
+/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register.
+///
+/// The global segment can be accessed either using flat operations or buffer
+/// operations. If buffer operations are used then the Global Buffer used to
+/// access HSAIL Global/Readonly/Kernarg (which are combine) segments using a
+/// segment address is not passed into the kernel code by CP since its base
+/// address is always 0. Instead the Finalizer generates prolog code to
+/// initialize 4 SGPRs with a V# that has the following properties, and then
+/// uses that in the buffer instructions:
+///   - base address of 0
+///   - no swizzle
+///   - ATC=1
+///   - MTYPE set to support memory coherence specified in
+///     amd_kernel_code_t.globalMemoryCoherence
+///
+/// When the Global Buffer is used to access the Kernarg segment, must add the
+/// dispatch packet kernArgPtr to a kernarg segment address before using this V#.
+/// Alternatively scalar loads can be used if the kernarg offset is uniform, as
+/// the kernarg segment is constant for the duration of the kernel execution.
+///
+typedef struct amd_kernel_code_s {
+  /// The AMD major version of the Code Object. Must be the value
+  /// AMD_CODE_VERSION_MAJOR.
+  amd_code_version32_t amd_code_version_major;
+
+  /// The AMD minor version of the Code Object. Minor versions must be
+  /// backward compatible. Must be the value
+  /// AMD_CODE_VERSION_MINOR.
+  amd_code_version32_t amd_code_version_minor;
+
+  /// The byte size of this struct. Must be set to
+  /// sizeof(amd_kernel_code_t). Used for backward
+  /// compatibility.
+  uint32_t struct_byte_size;
+
+  /// The target chip instruction set for which code has been
+  /// generated. Values are from the E_SC_INSTRUCTION_SET enumeration
+  /// in sc/Interface/SCCommon.h.
+  uint32_t target_chip;
+
+  /// Byte offset (possibly negative) from start of amd_kernel_code_t
+  /// object to kernel's entry point instruction. The actual code for
+  /// the kernel is required to be 256 byte aligned to match hardware
+  /// requirements (SQ cache line is 16). The code must be position
+  /// independent code (PIC) for AMD devices to give runtime the
+  /// option of copying code to discrete GPU memory or APU L2
+  /// cache. The Finalizer should endeavour to allocate all kernel
+  /// machine code in contiguous memory pages so that a device
+  /// pre-fetcher will tend to only pre-fetch Kernel Code objects,
+  /// improving cache performance.
+  int64_t kernel_code_entry_byte_offset;
+
+  /// Range of bytes to consider prefetching expressed as an offset
+  /// and size. The offset is from the start (possibly negative) of
+  /// amd_kernel_code_t object. Set both to 0 if no prefetch
+  /// information is available.
+  ///
+  /// \todo ttye 11/15/2013 Is the prefetch definition we want? Did
+  /// not make the size a uint64_t as prefetching more than 4GiB seems
+  /// excessive.
+  int64_t kernel_code_prefetch_byte_offset;
+  uint64_t kernel_code_prefetch_byte_size;
+
+  /// Number of bytes of scratch backing memory required for full
+  /// occupancy of target chip. This takes into account the number of
+  /// bytes of scratch per work-item, the wavefront size, the maximum
+  /// number of wavefronts per CU, and the number of CUs. This is an
+  /// upper limit on scratch. If the grid being dispatched is small it
+  /// may only need less than this. If the kernel uses no scratch, or
+  /// the Finalizer has not computed this value, it must be 0.
+  uint64_t max_scratch_backing_memory_byte_size;
+
+  /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+  /// COMPUTE_PGM_RSRC2 registers.
+  amd_compute_pgm_resource_register64_t compute_pgm_resource_registers;
+
+  /// Code properties. See amd_code_property_mask_t for a full list of
+  /// properties.
+  amd_code_property32_t code_properties;
+
+  /// The amount of memory required for the combined private, spill
+  /// and arg segments for a work-item in bytes. If
+  /// is_dynamic_callstack is 1 then additional space must be added to
+  /// this value for the call stack.
+  uint32_t workitem_private_segment_byte_size;
+
+  /// The amount of group segment memory required by a work-group in
+  /// bytes. This does not include any dynamically allocated group
+  /// segment memory that may be added when the kernel is
+  /// dispatched.
+  uint32_t workgroup_group_segment_byte_size;
+
+  /// Number of byte of GDS required by kernel dispatch. Must be 0 if
+  /// not using GDS.
+  uint32_t gds_segment_byte_size;
+
+  /// The size in bytes of the kernarg segment that holds the values
+  /// of the arguments to the kernel. This could be used by CP to
+  /// prefetch the kernarg segment pointed to by the dispatch packet.
+  uint64_t kernarg_segment_byte_size;
+
+  /// Number of fbarrier's used in the kernel and all functions it
+  /// calls. If the implementation uses group memory to allocate the
+  /// fbarriers then that amount must already be included in the
+  /// workgroup_group_segment_byte_size total.
+  uint32_t workgroup_fbarrier_count;
+
+  /// Number of scalar registers used by a wavefront. This includes
+  /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
+  /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a
+  /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
+  uint16_t wavefront_sgpr_count;
+
+  /// Number of vector registers used by each work-item. Used to set
+  /// COMPUTE_PGM_RSRC1.VGPRS.
+  uint16_t workitem_vgpr_count;
+
+  /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
+  /// first fixed VGPR number reserved.
+  uint16_t reserved_vgpr_first;
+
+  /// The number of consecutive VGPRs reserved by the client. If
+  /// is_debug_supported then this count includes VGPRs reserved
+  /// for debugger use.
+  uint16_t reserved_vgpr_count;
+
+  /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
+  /// first fixed SGPR number reserved.
+  uint16_t reserved_sgpr_first;
+
+  /// The number of consecutive SGPRs reserved by the client. If
+  /// is_debug_supported then this count includes SGPRs reserved
+  /// for debugger use.
+  uint16_t reserved_sgpr_count;
+
+  /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+  /// fixed SGPR number used to hold the wave scratch offset for the
+  /// entire kernel execution, or uint16_t(-1) if the register is not
+  /// used or not known.
+  uint16_t debug_wavefront_private_segment_offset_sgpr;
+
+  /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+  /// fixed SGPR number of the first of 4 SGPRs used to hold the
+  /// scratch V# used for the entire kernel execution, or uint16_t(-1)
+  /// if the registers are not used or not known.
+  uint16_t debug_private_segment_buffer_sgpr;
+
+  /// The maximum byte alignment of variables used by the kernel in
+  /// the specified memory segment. Expressed as a power of two. Must
+  /// be at least HSA_POWERTWO_16.
+  hsa_powertwo8_t kernarg_segment_alignment;
+  hsa_powertwo8_t group_segment_alignment;
+  hsa_powertwo8_t private_segment_alignment;
+
+  uint8_t reserved3;
+
+  /// Type of code object.
+  hsa_ext_code_kind32_t code_type;
+
+  /// Reserved for code properties if any are defined in the future.
+  /// There are currently no code properties so this field must be 0.
+  uint32_t reserved4;
+
+  /// Wavefront size expressed as a power of two. Must be a power of 2
+  /// in range 1..64 inclusive. Used to support runtime query that
+  /// obtains wavefront size, which may be used by application to
+  /// allocated dynamic group memory and set the dispatch work-group
+  /// size.
+  hsa_powertwo8_t wavefront_size;
+
+  /// The optimization level specified when the kernel was
+  /// finalized.
+  uint8_t optimization_level;
+
+  /// The HSAIL profile defines which features are used. This
+  /// information is from the HSAIL version directive. If this
+  /// amd_kernel_code_t is not generated from an HSAIL compilation
+  /// unit then must be 0.
+  hsa_ext_brig_profile8_t hsail_profile;
+
+  /// The HSAIL machine model gives the address sizes used by the
+  /// code. This information is from the HSAIL version directive. If
+  /// not generated from an HSAIL compilation unit then must still
+  /// indicate for what machine mode the code is generated.
+  hsa_ext_brig_machine_model8_t hsail_machine_model;
+
+  /// The HSAIL major version. This information is from the HSAIL
+  /// version directive. If this amd_kernel_code_t is not
+  /// generated from an HSAIL compilation unit then must be 0.
+  uint32_t hsail_version_major;
+
+  /// The HSAIL minor version. This information is from the HSAIL
+  /// version directive. If this amd_kernel_code_t is not
+  /// generated from an HSAIL compilation unit then must be 0.
+  uint32_t hsail_version_minor;
+
+  /// Reserved for HSAIL target options if any are defined in the
+  /// future. There are currently no target options so this field
+  /// must be 0.
+  uint16_t reserved5;
+
+  /// Reserved. Must be 0.
+  uint16_t reserved6;
+
+  /// The values should be the actually values used by the finalizer
+  /// in generating the code. This may be the union of values
+  /// specified as finalizer arguments and explicit HSAIL control
+  /// directives. If the finalizer chooses to ignore a control
+  /// directive, and not generate constrained code, then the control
+  /// directive should not be marked as enabled even though it was
+  /// present in the HSAIL or finalizer argument. The values are
+  /// intended to reflect the constraints that the code actually
+  /// requires to correctly execute, not the values that were
+  /// actually specified at finalize time.
+  hsa_ext_control_directives_t control_directive;
+
+  /// The code can immediately follow the amd_kernel_code_t, or can
+  /// come after subsequent amd_kernel_code_t structs when there are
+  /// multiple kernels in the compilation unit.
+
+} amd_kernel_code_t;
+
+#endif // AMDKERNELCODET_H
diff --git a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
index 7ad815d..3b4ba1a 100644
--- a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
@@ -163,23 +163,22 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   MCInst Inst;
 
   switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
-    default: break;
-    case Match_Success:
-      Inst.setLoc(IDLoc);
-      Out.EmitInstruction(Inst, STI);
-      return false;
-    case Match_MissingFeature:
-      return Error(IDLoc, "instruction use requires an option to be enabled");
-    case Match_MnemonicFail:
-        return Error(IDLoc, "unrecognized instruction mnemonic");
-    case Match_InvalidOperand: {
-      if (ErrorInfo != ~0ULL) {
-        if (ErrorInfo >= Operands.size())
-          return Error(IDLoc, "too few operands for instruction");
-
-      }
-      return Error(IDLoc, "invalid operand for instruction");
+  case Match_Success:
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, STI);
+    return false;
+  case Match_MissingFeature:
+    return Error(IDLoc, "instruction use requires an option to be enabled");
+  case Match_MnemonicFail:
+    return Error(IDLoc, "unrecognized instruction mnemonic");
+  case Match_InvalidOperand: {
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
     }
+    return Error(IDLoc, "invalid operand for instruction");
+  }
   }
   llvm_unreachable("Implement any new match types added!");
 }
@@ -312,6 +311,7 @@ bool AMDGPUOperand::isSWaitCnt() const {
 /// Force static initialization.
 extern "C" void LLVMInitializeR600AsmParser() {
   RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget);
+  RegisterMCAsmParser<AMDGPUAsmParser> B(TheGCNTarget);
 }
 
 #define GET_REGISTER_MATCHER
diff --git a/lib/Target/R600/CIInstructions.td b/lib/Target/R600/CIInstructions.td
new file mode 100644
index 0000000..3ac7af8
--- /dev/null
+++ b/lib/Target/R600/CIInstructions.td
@@ -0,0 +1,42 @@
+//===-- CIInstructions.td - CI Instruction Defintions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+
+
+def isCIVI : Predicate <
+  "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
+  "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isCIVI in {
+
+defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64",
+  VOP_F64_F64, ftrunc
+>;
+defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64",
+  VOP_F64_F64, fceil
+>;
+defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64",
+  VOP_F64_F64, ffloor
+>;
+defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64",
+  VOP_F64_F64, frint
+>;
+defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32",
+  VOP_F32_F32
+>;
+defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32",
+  VOP_F32_F32
+>;
+} // End SubtargetPredicate = isCIVI
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index ed0a216..5a4bae2 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_target(R600CodeGen
   SIAnnotateControlFlow.cpp
   SIFixSGPRCopies.cpp
   SIFixSGPRLiveRanges.cpp
+  SIFoldOperands.cpp
   SIInsertWaits.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
@@ -50,6 +51,7 @@ add_llvm_target(R600CodeGen
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
   SIMachineFunctionInfo.cpp
+  SIPrepareScratchRegs.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td
index 58b5ce2..ba4df82 100644
--- a/lib/Target/R600/CaymanInstructions.td
+++ b/lib/Target/R600/CaymanInstructions.td
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def isCayman : Predicate<"Subtarget.hasCaymanISA()">;
+def isCayman : Predicate<"Subtarget->hasCaymanISA()">;
 
 //===----------------------------------------------------------------------===//
 // Cayman Instructions
@@ -46,7 +46,7 @@ def SIN_cm : SIN_Common<0x8D>;
 def COS_cm : COS_Common<0x8E>;
 } // End isVector = 1
 
-defm : RsqPat<RECIPSQRT_IEEE_cm, f32>;
+def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
 
 def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
 
diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td
index f24f76b..9f9472c 100644
--- a/lib/Target/R600/EvergreenInstructions.td
+++ b/lib/Target/R600/EvergreenInstructions.td
@@ -14,14 +14,14 @@
 //===----------------------------------------------------------------------===//
 
 def isEG : Predicate<
-  "Subtarget.getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
-  "Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
-  "!Subtarget.hasCaymanISA()"
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
+  "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+  "!Subtarget->hasCaymanISA()"
 >;
 
 def isEGorCayman : Predicate<
-  "Subtarget.getGeneration() == AMDGPUSubtarget::EVERGREEN ||"
-  "Subtarget.getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS"
+  "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||"
+  "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS"
 >;
 
 //===----------------------------------------------------------------------===//
@@ -69,7 +69,7 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
 def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
 def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
 def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
-defm : RsqPat<RECIPSQRT_IEEE_eg, f32>;
+def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
 def SIN_eg : SIN_Common<0x8D>;
 def COS_eg : COS_Common<0x8E>;
 
@@ -590,8 +590,6 @@ def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
 // SHA-256 Patterns
 def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
 
-def : FROUNDPat <CNDGE_eg, CNDGT_eg>;
-
 def EG_ExportSwz : ExportSwzInst {
   let Word1{19-16} = 0; // BURST_COUNT
   let Word1{20} = 0; // VALID_PIXEL_MODE
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 64fe726..b66ed10 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -9,11 +9,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUInstPrinter.h"
-#include "SIDefines.h"
-
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -74,7 +74,7 @@ void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
                                         raw_ostream &O) {
   if (MI->getOperand(OpNo).getImm()) {
     O << " offset:";
-    printU16ImmOperand(MI, OpNo, O);
+    printU16ImmDecOperand(MI, OpNo, O);
   }
 }
 
@@ -208,7 +208,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
   O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
 }
 
-void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) {
+void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) {
   int32_t SImm = static_cast<int32_t>(Imm);
   if (SImm >= -16 && SImm <= 64) {
     O << SImm;
@@ -233,9 +233,37 @@ void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) {
     O << "4.0";
   else if (Imm == FloatToBits(-4.0f))
     O << "-4.0";
-  else {
+  else
     O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) {
+  int64_t SImm = static_cast<int64_t>(Imm);
+  if (SImm >= -16 && SImm <= 64) {
+    O << SImm;
+    return;
   }
+
+  if (Imm == DoubleToBits(0.0))
+    O << "0.0";
+  else if (Imm == DoubleToBits(1.0))
+    O << "1.0";
+  else if (Imm == DoubleToBits(-1.0))
+    O << "-1.0";
+  else if (Imm == DoubleToBits(0.5))
+    O << "0.5";
+  else if (Imm == DoubleToBits(-0.5))
+    O << "-0.5";
+  else if (Imm == DoubleToBits(2.0))
+    O << "2.0";
+  else if (Imm == DoubleToBits(-2.0))
+    O << "-2.0";
+  else if (Imm == DoubleToBits(4.0))
+    O << "4.0";
+  else if (Imm == DoubleToBits(-4.0))
+    O << "-4.0";
+  else
+    llvm_unreachable("64-bit literal constants not supported");
 }
 
 void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -253,14 +281,39 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
       break;
     }
   } else if (Op.isImm()) {
-    printImmediate(Op.getImm(), O);
+    const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+    int RCID = Desc.OpInfo[OpNo].RegClass;
+    if (RCID != -1) {
+      const MCRegisterClass &ImmRC = MRI.getRegClass(RCID);
+      if (ImmRC.getSize() == 4)
+        printImmediate32(Op.getImm(), O);
+      else if (ImmRC.getSize() == 8)
+        printImmediate64(Op.getImm(), O);
+      else
+        llvm_unreachable("Invalid register class size");
+    } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) {
+      printImmediate32(Op.getImm(), O);
+    } else {
+      // We hit this for the immediate instruction bits that don't yet have a
+      // custom printer.
+      // TODO: Eventually this should be unnecessary.
+      O << formatDec(Op.getImm());
+    }
   } else if (Op.isFPImm()) {
-
     // We special case 0.0 because otherwise it will be printed as an integer.
     if (Op.getFPImm() == 0.0)
       O << "0.0";
-    else
-      printImmediate(FloatToBits(Op.getFPImm()), O);
+    else {
+      const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+      const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass);
+
+      if (ImmRC.getSize() == 4)
+        printImmediate32(FloatToBits(Op.getFPImm()), O);
+      else if (ImmRC.getSize() == 8)
+        printImmediate64(DoubleToBits(Op.getFPImm()), O);
+      else
+        llvm_unreachable("Invalid register class size");
+    }
   } else if (Op.isExpr()) {
     const MCExpr *Exp = Op.getExpr();
     Exp->print(O);
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 4c06ac0..1d43c7a 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -48,7 +48,8 @@ private:
   void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printRegOperand(unsigned RegNo, raw_ostream &O);
-  void printImmediate(uint32_t Imm, raw_ostream &O);
+  void printImmediate32(uint32_t I, raw_ostream &O);
+  void printImmediate64(uint64_t I, raw_ostream &O);
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index 5fb311b..d0c634f 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -29,7 +29,7 @@ public:
                                 const MCAsmLayout &Layout) override {
     //XXX: Implement if necessary.
   }
-  void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
+  void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
                         const MCFragment *Fragment, const MCFixup &Fixup,
                         MCValue Target, bool &IsPCRel,
                         uint64_t &FixedValue) override {
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 3c2b889..19d89fb 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -17,6 +17,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfoELF() {
   MaxInstLength = 16;
   SeparatorString = "\n";
   CommentString = ";";
+  PrivateLabelPrefix = "";
   InlineAsmStart = ";#ASMSTART";
   InlineAsmEnd = ";#ASMEND";
 
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 8731055..83403ba 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -15,6 +15,7 @@
 #include "AMDGPUMCTargetDesc.h"
 #include "AMDGPUMCAsmInfo.h"
 #include "InstPrinter/AMDGPUInstPrinter.h"
+#include "SIDefines.h"
 #include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -92,20 +93,29 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
 extern "C" void LLVMInitializeR600TargetMC() {
 
   RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget);
+  RegisterMCAsmInfo<AMDGPUMCAsmInfo> Z(TheGCNTarget);
 
   TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheGCNTarget, createAMDGPUMCCodeGenInfo);
 
   TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheGCNTarget, createAMDGPUMCInstrInfo);
 
   TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheGCNTarget, createAMDGPUMCRegisterInfo);
 
   TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheGCNTarget, createAMDGPUMCSubtargetInfo);
 
   TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheGCNTarget, createAMDGPUMCInstPrinter);
 
   TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createAMDGPUMCCodeEmitter);
 
   TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheGCNTarget, createAMDGPUAsmBackend);
 
   TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheGCNTarget, createMCStreamer);
 }
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
index c019766..bc8cd53 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -30,6 +30,7 @@ class Target;
 class raw_ostream;
 
 extern Target TheAMDGPUTarget;
+extern Target TheGCNTarget;
 
 MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
                                        const MCRegisterInfo &MRI,
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index dc1344f..8a555ff 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -30,8 +30,8 @@ using namespace llvm;
 namespace {
 
 class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
-  R600MCCodeEmitter(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION;
-  void operator=(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  R600MCCodeEmitter(const R600MCCodeEmitter &) = delete;
+  void operator=(const R600MCCodeEmitter &) = delete;
   const MCInstrInfo &MCII;
   const MCRegisterInfo &MRI;
 
diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
index 999fd0d..7e23772 100644
--- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -14,10 +14,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
-#include "SIDefines.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
 #include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCFixup.h"
@@ -31,15 +31,9 @@ using namespace llvm;
 
 namespace {
 
-/// \brief Helper type used in encoding
-typedef union {
-  int32_t I;
-  float F;
-} IntFloatUnion;
-
 class SIMCCodeEmitter : public  AMDGPUMCCodeEmitter {
-  SIMCCodeEmitter(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION;
-  void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  SIMCCodeEmitter(const SIMCCodeEmitter &) = delete;
+  void operator=(const SIMCCodeEmitter &) = delete;
   const MCInstrInfo &MCII;
   const MCRegisterInfo &MRI;
   MCContext &Ctx;
@@ -48,7 +42,7 @@ class SIMCCodeEmitter : public  AMDGPUMCCodeEmitter {
   bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
 
   /// \brief Encode an fp or int literal
-  uint32_t getLitEncoding(const MCOperand &MO) const;
+  uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const;
 
 public:
   SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
@@ -85,60 +79,107 @@ MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
 
 bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc,
                                    unsigned OpNo) const {
-  unsigned RegClass = Desc.OpInfo[OpNo].RegClass;
-  return (AMDGPU::SSrc_32RegClassID == RegClass) ||
-         (AMDGPU::SSrc_64RegClassID == RegClass) ||
-         (AMDGPU::VSrc_32RegClassID == RegClass) ||
-         (AMDGPU::VSrc_64RegClassID == RegClass) ||
-	 (AMDGPU::VCSrc_32RegClassID == RegClass) ||
-	 (AMDGPU::VCSrc_64RegClassID == RegClass);
+  unsigned OpType = Desc.OpInfo[OpNo].OperandType;
+
+  return OpType == AMDGPU::OPERAND_REG_IMM32 ||
+         OpType == AMDGPU::OPERAND_REG_INLINE_C;
 }
 
-uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const {
+// Returns the encoding value to use if the given integer is an integer inline
+// immediate value, or 0 if it is not.
+template <typename IntTy>
+static uint32_t getIntInlineImmEncoding(IntTy Imm) {
+  if (Imm >= 0 && Imm <= 64)
+    return 128 + Imm;
 
-  IntFloatUnion Imm;
-  if (MO.isImm())
-    Imm.I = MO.getImm();
-  else if (MO.isFPImm())
-    Imm.F = MO.getFPImm();
-  else if (MO.isExpr())
-    return 255;
-  else
-    return ~0;
+  if (Imm >= -16 && Imm <= -1)
+    return 192 + std::abs(Imm);
 
-  if (Imm.I >= 0 && Imm.I <= 64)
-    return 128 + Imm.I;
+  return 0;
+}
 
-  if (Imm.I >= -16 && Imm.I <= -1)
-    return 192 + abs(Imm.I);
+static uint32_t getLit32Encoding(uint32_t Val) {
+  uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val));
+  if (IntImm != 0)
+    return IntImm;
 
-  if (Imm.F == 0.5f)
+  if (Val == FloatToBits(0.5f))
     return 240;
 
-  if (Imm.F == -0.5f)
+  if (Val == FloatToBits(-0.5f))
     return 241;
 
-  if (Imm.F == 1.0f)
+  if (Val == FloatToBits(1.0f))
     return 242;
 
-  if (Imm.F == -1.0f)
+  if (Val == FloatToBits(-1.0f))
     return 243;
 
-  if (Imm.F == 2.0f)
+  if (Val == FloatToBits(2.0f))
     return 244;
 
-  if (Imm.F == -2.0f)
+  if (Val == FloatToBits(-2.0f))
     return 245;
 
-  if (Imm.F == 4.0f)
+  if (Val == FloatToBits(4.0f))
     return 246;
 
-  if (Imm.F == -4.0f)
+  if (Val == FloatToBits(-4.0f))
     return 247;
 
   return 255;
 }
 
+static uint32_t getLit64Encoding(uint64_t Val) {
+  uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val));
+  if (IntImm != 0)
+    return IntImm;
+
+  if (Val == DoubleToBits(0.5))
+    return 240;
+
+  if (Val == DoubleToBits(-0.5))
+    return 241;
+
+  if (Val == DoubleToBits(1.0))
+    return 242;
+
+  if (Val == DoubleToBits(-1.0))
+    return 243;
+
+  if (Val == DoubleToBits(2.0))
+    return 244;
+
+  if (Val == DoubleToBits(-2.0))
+    return 245;
+
+  if (Val == DoubleToBits(4.0))
+    return 246;
+
+  if (Val == DoubleToBits(-4.0))
+    return 247;
+
+  return 255;
+}
+
+uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
+                                         unsigned OpSize) const {
+  if (MO.isExpr())
+    return 255;
+
+  assert(!MO.isFPImm());
+
+  if (!MO.isImm())
+    return ~0;
+
+  if (OpSize == 4)
+    return getLit32Encoding(static_cast<uint32_t>(MO.getImm()));
+
+  assert(OpSize == 8);
+
+  return getLit64Encoding(static_cast<uint64_t>(MO.getImm()));
+}
+
 void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                                        SmallVectorImpl<MCFixup> &Fixups,
                                        const MCSubtargetInfo &STI) const {
@@ -161,25 +202,24 @@ void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     if (!isSrcOperand(Desc, i))
       continue;
 
+    int RCID = Desc.OpInfo[i].RegClass;
+    const MCRegisterClass &RC = MRI.getRegClass(RCID);
+
     // Is this operand a literal immediate?
     const MCOperand &Op = MI.getOperand(i);
-    if (getLitEncoding(Op) != 255)
+    if (getLitEncoding(Op, RC.getSize()) != 255)
       continue;
 
     // Yes! Encode it
-    IntFloatUnion Imm;
+    int64_t Imm = 0;
+
     if (Op.isImm())
-      Imm.I = Op.getImm();
-    else if (Op.isFPImm())
-      Imm.F = Op.getFPImm();
-    else {
-      assert(Op.isExpr());
-      // This will be replaced with a fixup value.
-      Imm.I = 0;
-    }
+      Imm = Op.getImm();
+    else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
+      llvm_unreachable("Must be immediate or expr");
 
     for (unsigned j = 0; j < 4; j++) {
-      OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff));
+      OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff));
     }
 
     // Only one literal value allowed
@@ -234,7 +274,10 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
 
   const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
   if (isSrcOperand(Desc, OpNo)) {
-    uint32_t Enc = getLitEncoding(MO);
+    int RCID = Desc.OpInfo[OpNo].RegClass;
+    const MCRegisterClass &RC = MRI.getRegClass(RCID);
+
+    uint32_t Enc = getLitEncoding(MO, RC.getSize());
     if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
       return Enc;
 
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index ce17d7c..fb5aa61 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -83,28 +83,44 @@ def : Proc<"cayman",     R600_VLIW4_Itin,
 // Southern Islands
 //===----------------------------------------------------------------------===//
 
-def : Proc<"SI",         SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"SI", SIFullSpeedModel,
+  [FeatureSouthernIslands, FeatureFastFMAF32]
+>;
 
-def : Proc<"tahiti",     SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"tahiti",   SIFullSpeedModel,
+  [FeatureSouthernIslands, FeatureFastFMAF32]
+>;
 
-def : Proc<"pitcairn",   SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
 
-def : Proc<"verde",      SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"verde",    SIQuarterSpeedModel, [FeatureSouthernIslands]>;
 
-def : Proc<"oland",      SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"oland",    SIQuarterSpeedModel, [FeatureSouthernIslands]>;
 
-def : Proc<"hainan",     SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"hainan",   SIQuarterSpeedModel, [FeatureSouthernIslands]>;
 
 //===----------------------------------------------------------------------===//
 // Sea Islands
 //===----------------------------------------------------------------------===//
 
-def : Proc<"bonaire",    SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"bonaire",    SIQuarterSpeedModel, [FeatureSeaIslands]>;
 
-def : Proc<"kabini",     SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kabini",     SIQuarterSpeedModel, [FeatureSeaIslands]>;
 
-def : Proc<"kaveri",     SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kaveri",     SIQuarterSpeedModel, [FeatureSeaIslands]>;
 
-def : Proc<"hawaii",     SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"hawaii", SIFullSpeedModel,
+  [FeatureSeaIslands, FeatureFastFMAF32]
+>;
 
-def : Proc<"mullins",    SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"mullins",    SIQuarterSpeedModel, [FeatureSeaIslands]>;
+
+//===----------------------------------------------------------------------===//
+// Volcanic Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"tonga",   SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
+
+def : ProcessorModel<"iceland", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
+
+def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index edaf278..c8f37f6 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -39,14 +39,14 @@ struct CFStack {
     FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
   };
 
-  const AMDGPUSubtarget &ST;
+  const AMDGPUSubtarget *ST;
   std::vector<StackItem> BranchStack;
   std::vector<StackItem> LoopStack;
   unsigned MaxStackSize;
   unsigned CurrentEntries;
   unsigned CurrentSubEntries;
 
-  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
+  CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st),
       // We need to reserve a stack entry for CALL_FS in vertex shaders.
       MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
       CurrentEntries(0), CurrentSubEntries(0) { }
@@ -76,11 +76,11 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) {
 }
 
 bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
-  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
+  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
       getLoopDepth() > 1)
     return true;
 
-  if (!ST.hasCFAluBug())
+  if (!ST->hasCFAluBug())
     return false;
 
   switch(Opcode) {
@@ -91,7 +91,7 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
   case AMDGPU::CF_ALU_CONTINUE:
     if (CurrentSubEntries == 0)
       return false;
-    if (ST.getWavefrontSize() == 64) {
+    if (ST->getWavefrontSize() == 64) {
       // We are being conservative here.  We only require this work-around if
       // CurrentSubEntries > 3 &&
       // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
@@ -102,7 +102,7 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
       // resources without any problems.
       return CurrentSubEntries > 3;
     } else {
-      assert(ST.getWavefrontSize() == 32);
+      assert(ST->getWavefrontSize() == 32);
       // We are being conservative here.  We only require the work-around if
       // CurrentSubEntries > 7 &&
       // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
@@ -118,8 +118,8 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
   default:
     return 0;
   case CFStack::FIRST_NON_WQM_PUSH:
-  assert(!ST.hasCaymanISA());
-  if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
+  assert(!ST->hasCaymanISA());
+  if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
     // +1 For the push operation.
     // +2 Extra space required.
     return 3;
@@ -132,7 +132,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
     return 2;
   }
   case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
-    assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+    assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
     // +1 For the push operation.
     // +1 Extra space required.
     return 2;
@@ -153,13 +153,14 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
   case AMDGPU::CF_PUSH_EG:
   case AMDGPU::CF_ALU_PUSH_BEFORE:
     if (!isWQM) {
-      if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
+      if (!ST->hasCaymanISA() &&
+          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
         Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on Evergreen/NI
                                              // See comment in
                                              // CFStack::getSubEntrySize()
       else if (CurrentEntries > 0 &&
-               ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
-               !ST.hasCaymanISA() &&
+               ST->getGeneration() > AMDGPUSubtarget::EVERGREEN &&
+               !ST->hasCaymanISA() &&
                !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
         Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
       else
@@ -219,7 +220,7 @@ private:
   const R600InstrInfo *TII;
   const R600RegisterInfo *TRI;
   unsigned MaxFetchInst;
-  const AMDGPUSubtarget &ST;
+  const AMDGPUSubtarget *ST;
 
   bool IsTrivialInst(MachineInstr *MI) const {
     switch (MI->getOpcode()) {
@@ -233,7 +234,7 @@ private:
 
   const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
     unsigned Opcode = 0;
-    bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+    bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
     switch (CFI) {
     case CF_TC:
       Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
@@ -266,7 +267,7 @@ private:
       Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
       break;
     case CF_END:
-      if (ST.hasCaymanISA()) {
+      if (ST->hasCaymanISA()) {
         Opcode = AMDGPU::CF_END_CM;
         break;
       }
@@ -467,17 +468,14 @@ private:
   }
 
 public:
-  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
-    TII (nullptr), TRI(nullptr),
-    ST(tm.getSubtarget<AMDGPUSubtarget>()) {
-      const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
-      MaxFetchInst = ST.getTexVTXClauseSize();
-  }
+  R600ControlFlowFinalizer(TargetMachine &tm)
+      : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override {
-    TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
-    TRI = static_cast<const R600RegisterInfo *>(
-        MF.getSubtarget().getRegisterInfo());
+    ST = &MF.getSubtarget<AMDGPUSubtarget>();
+    MaxFetchInst = ST->getTexVTXClauseSize();
+    TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo());
+    TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo());
     R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 
     CFStack CFStack(ST, MFI->getShaderType());
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index a214e53..c738611 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -30,9 +30,9 @@
 
 using namespace llvm;
 
-R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
-    AMDGPUTargetLowering(TM),
-    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
+R600TargetLowering::R600TargetLowering(TargetMachine &TM,
+                                       const AMDGPUSubtarget &STI)
+    : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
@@ -40,7 +40,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
 
-  computeRegisterProperties();
+  computeRegisterProperties(STI.getRegisterInfo());
 
   // Set condition code actions
   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
@@ -122,12 +122,19 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
 
   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
   // spaces, so it is custom lowered to handle those where it isn't.
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
+
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
+
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
+  }
 
   setOperationAction(ISD::STORE, MVT::i8, Custom);
   setOperationAction(ISD::STORE, MVT::i32, Custom);
@@ -181,8 +188,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
     setOperationAction(ISD::SUBE, VT, Expand);
   }
 
-  setBooleanContents(ZeroOrNegativeOneBooleanContent);
-  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
   setSchedulingPreference(Sched::Source);
 }
 
@@ -192,7 +197,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
   MachineRegisterInfo &MRI = MF->getRegInfo();
   MachineBasicBlock::iterator I = *MI;
   const R600InstrInfo *TII =
-      static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());
+      static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 
   switch (MI->getOpcode()) {
   default:
@@ -647,9 +652,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
       MachineSDNode *interp;
       if (ijb < 0) {
-        const MachineFunction &MF = DAG.getMachineFunction();
-        const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
-            MF.getSubtarget().getInstrInfo());
+        const R600InstrInfo *TII =
+            static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
         return DAG.getTargetExtractSubreg(
@@ -1115,6 +1119,13 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
   SDValue CC = Op.getOperand(4);
   SDValue Temp;
 
+  if (VT == MVT::f32) {
+    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
+    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
+    if (MinMax)
+      return MinMax;
+  }
+
   // LHS and RHS are guaranteed to be the same value type
   EVT CompareVT = LHS.getValueType();
 
@@ -1369,8 +1380,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   // Lowering for indirect addressing
 
   const MachineFunction &MF = DAG.getMachineFunction();
-  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
-      getTargetMachine().getSubtargetImpl()->getFrameLowering());
+  const AMDGPUFrameLowering *TFL =
+      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
   unsigned StackWidth = TFL->getStackWidth(MF);
 
   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1567,8 +1578,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
 
   // Lowering for indirect addressing
   const MachineFunction &MF = DAG.getMachineFunction();
-  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
-      getTargetMachine().getSubtargetImpl()->getFrameLowering());
+  const AMDGPUFrameLowering *TFL =
+      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
   unsigned StackWidth = TFL->getStackWidth(MF);
 
   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1682,7 +1693,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
     // XXX - I think PartOffset should give you this, but it seems to give the
     // size of the register which isn't useful.
 
-    unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
+    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
     unsigned PartOffset = VA.getLocMemOffset();
     unsigned Offset = 36 + VA.getLocMemOffset();
 
@@ -2172,9 +2183,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
   unsigned Opcode = Node->getMachineOpcode();
   SDValue FakeOp;
 
-  std::vector<SDValue> Ops;
-  for (const SDUse &I : Node->ops())
-    Ops.push_back(I);
+  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
 
   if (Opcode == AMDGPU::DOT_4) {
     int OperandIdx[] = {
@@ -2236,10 +2245,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
         AMDGPU::OpName::clamp);
     if (ClampIdx < 0)
       return Node;
-    std::vector<SDValue> Ops;
-    unsigned NumOp = Src.getNumOperands();
-    for(unsigned i = 0; i < NumOp; ++i)
-          Ops.push_back(Src.getOperand(i));
+    std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
         Node->getVTList(), Ops);
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index 10ebc10..c547195 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -23,7 +23,7 @@ class R600InstrInfo;
 
 class R600TargetLowering : public AMDGPUTargetLowering {
 public:
-  R600TargetLowering(TargetMachine &TM);
+  R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
   MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
       MachineBasicBlock * BB) const override;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index b6c00f8..291fb04 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -335,10 +335,11 @@ def load_param : LoadParamFrag<load>;
 def load_param_exti8 : LoadParamFrag<az_extloadi8>;
 def load_param_exti16 : LoadParamFrag<az_extloadi16>;
 
-def isR600 : Predicate<"Subtarget.getGeneration() <= AMDGPUSubtarget::R700">;
+def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">;
 
-def isR600toCayman : Predicate<
-                     "Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
+def isR600toCayman
+    : Predicate<
+          "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
 
 //===----------------------------------------------------------------------===//
 // R600 SDNodes
@@ -579,6 +580,7 @@ i32imm:$COUNT, i32imm:$Enabled),
   let ALT_CONST = 0;
   let WHOLE_QUAD_MODE = 0;
   let BARRIER = 1;
+  let isCodeGenOnly = 1;
   let UseNamedOperandTable = 1;
 
   let Inst{31-0} = Word0;
@@ -641,6 +643,7 @@ def FETCH_CLAUSE : AMDGPUInst <(outs),
   field bits<8> Inst;
   bits<8> num;
   let Inst = num;
+  let isCodeGenOnly = 1;
 }
 
 def ALU_CLAUSE : AMDGPUInst <(outs),
@@ -648,10 +651,13 @@ def ALU_CLAUSE : AMDGPUInst <(outs),
   field bits<8> Inst;
   bits<8> num;
   let Inst = num;
+  let isCodeGenOnly = 1;
 }
 
 def LITERALS : AMDGPUInst <(outs),
 (ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > {
+  let isCodeGenOnly = 1;
+
   field bits<64> Inst;
   bits<32> literal1;
   bits<32> literal2;
@@ -698,7 +704,7 @@ def SGE : R600_2OP <
 
 def SNE : R600_2OP <
   0xB, "SETNE",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE))]
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))]
 >;
 
 def SETE_DX10 : R600_2OP <
@@ -716,9 +722,10 @@ def SETGE_DX10 : R600_2OP <
   [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))]
 >;
 
+// FIXME: This should probably be COND_ONE
 def SETNE_DX10 : R600_2OP <
   0xF, "SETNE_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE))]
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))]
 >;
 
 def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
@@ -913,7 +920,7 @@ class MULADD_Common <bits<5> inst> : R600_3OP <
 
 class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
   inst, "MULADD_IEEE",
-  [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
+  [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))]
 >;
 
 class FMA_Common <bits<5> inst> : R600_3OP <
@@ -1141,16 +1148,6 @@ class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ie
   (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
 >;
 
-// FROUND pattern
-class FROUNDPat<Instruction CNDGE, Instruction CNDGT> : Pat <
-  (AMDGPUround f32:$x),
-  (CNDGE $x,
-  (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)),
-  (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
-  )
->;
-
-
 //===----------------------------------------------------------------------===//
 // R600 / R700 Instructions
 //===----------------------------------------------------------------------===//
@@ -1192,9 +1189,7 @@ let Predicates = [isR600] in {
   def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
 
   def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
-  defm : RsqPat<RECIPSQRT_IEEE_r600, f32>;
-
-  def : FROUNDPat <CNDGE_r600, CNDGT_r600>;
+  def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
 
   def R600_ExportSwz : ExportSwzInst {
     let Word1{20-17} = 0; // BURST_COUNT
@@ -1248,6 +1243,7 @@ let Predicates = [isR600] in {
   def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR),
   "PUSH_ELSE @$ADDR"> {
     let CNT = 0;
+    let POP_COUNT = 0; // FIXME?
   }
   def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
   "ELSE @$ADDR POP:$POP_COUNT"> {
@@ -1364,7 +1360,7 @@ def CONST_COPY : Instruction {
   let Pattern =
       [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
   let AsmString = "CONST_COPY";
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let isAsCheapAsAMove = 1;
   let Itinerary = NullALU;
 }
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index d782713..bcde5fb 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -16,7 +16,7 @@
 #include "AMDGPUSubtarget.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Pass.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -26,17 +26,16 @@ using namespace llvm;
 void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
   assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
   DAG = static_cast<ScheduleDAGMILive*>(dag);
+  const AMDGPUSubtarget &ST = DAG->MF.getSubtarget<AMDGPUSubtarget>();
   TII = static_cast<const R600InstrInfo*>(DAG->TII);
   TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
-  VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
+  VLIW5 = !ST.hasCaymanISA();
   MRI = &DAG->MRI;
   CurInstKind = IDOther;
   CurEmitted = 0;
   OccupedSlotsMask = 31;
   InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
   InstKindLimit[IDOther] = 32;
-
-  const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
   InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
   AluInstCount = 0;
   FetchInstCount = 0;
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index ddf68c9..deee5bc 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -153,7 +153,7 @@ public:
         TII(static_cast<const R600InstrInfo *>(
             MF.getSubtarget().getInstrInfo())),
         TRI(TII->getRegisterInfo()) {
-    VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
+    VLIW5 = !MF.getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
   }
 
   // initPacketizerState - initialize some internal flags.
diff --git a/lib/Target/R600/R700Instructions.td b/lib/Target/R600/R700Instructions.td
index 9aad85d..613a0d7 100644
--- a/lib/Target/R600/R700Instructions.td
+++ b/lib/Target/R600/R700Instructions.td
@@ -13,7 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def isR700 : Predicate<"Subtarget.getGeneration() == AMDGPUSubtarget::R700">;
+def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">;
 
 let Predicates = [isR700] in {
   def SIN_r700 : SIN_Common<0x6E>;
diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp
index 91eb60b..79f6532 100644
--- a/lib/Target/R600/SIAnnotateControlFlow.cpp
+++ b/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -14,6 +14,7 @@
 
 #include "AMDGPU.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
@@ -66,6 +67,8 @@ class SIAnnotateControlFlow : public FunctionPass {
   DominatorTree *DT;
   StackVector Stack;
 
+  LoopInfo *LI;
+
   bool isTopOfStack(BasicBlock *BB);
 
   Value *popSaved();
@@ -99,6 +102,7 @@ public:
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
@@ -277,10 +281,26 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
 
   Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
   push(Term->getSuccessor(0), Arg);
-}
-
-/// \brief Close the last opened control flow
+}/// \brief Close the last opened control flow
 void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
+  llvm::Loop *L = LI->getLoopFor(BB);
+
+  if (L && L->getHeader() == BB) {
+    // We can't insert an EndCF call into a loop header, because it will
+    // get executed on every iteration of the loop, when it should be
+    // executed only once before the loop.
+    SmallVector <BasicBlock*, 8> Latches;
+    L->getLoopLatches(Latches);
+
+    std::vector<BasicBlock*> Preds;
+    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+      if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end())
+        Preds.push_back(*PI);
+    }
+    BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT,
+                                      LI, false);
+  }
+
   CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt());
 }
 
@@ -288,6 +308,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
 /// recognize if/then/else and loops.
 bool SIAnnotateControlFlow::runOnFunction(Function &F) {
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 
   for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
        E = df_end(&F.getEntryBlock()); I != E; ++I) {
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 2e7dab6..b540140 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -8,25 +8,49 @@
 /// \file
 //===----------------------------------------------------------------------===//
 
+#include "llvm/MC/MCInstrDesc.h"
+
 #ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H
 #define LLVM_LIB_TARGET_R600_SIDEFINES_H
 
 namespace SIInstrFlags {
 // This needs to be kept in sync with the field bits in InstSI.
 enum {
-  MIMG = 1 << 3,
-  SMRD = 1 << 4,
-  VOP1 = 1 << 5,
-  VOP2 = 1 << 6,
-  VOP3 = 1 << 7,
-  VOPC = 1 << 8,
-  SALU = 1 << 9,
-  MUBUF = 1 << 10,
-  MTBUF = 1 << 11,
-  FLAT = 1 << 12
+  SALU = 1 << 3,
+  VALU = 1 << 4,
+
+  SOP1 = 1 << 5,
+  SOP2 = 1 << 6,
+  SOPC = 1 << 7,
+  SOPK = 1 << 8,
+  SOPP = 1 << 9,
+
+  VOP1 = 1 << 10,
+  VOP2 = 1 << 11,
+  VOP3 = 1 << 12,
+  VOPC = 1 << 13,
+
+  MUBUF = 1 << 14,
+  MTBUF = 1 << 15,
+  SMRD = 1 << 16,
+  DS = 1 << 17,
+  MIMG = 1 << 18,
+  FLAT = 1 << 19,
+  WQM = 1 << 20
 };
 }
 
+namespace llvm {
+namespace AMDGPU {
+  enum OperandType {
+    /// Operand with register or 32-bit immediate
+    OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET,
+    /// Operand with register or inline constant
+    OPERAND_REG_INLINE_C
+  };
+}
+}
+
 namespace SIInstrFlags {
   enum Flags {
     // First 4 bits are the instruction encoding
@@ -34,6 +58,21 @@ namespace SIInstrFlags {
     EXP_CNT = 1 << 1,
     LGKM_CNT = 1 << 2
   };
+
+  // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
+  // The result is true if any of these tests are true.
+  enum ClassFlags {
+    S_NAN = 1 << 0,        // Signaling NaN
+    Q_NAN = 1 << 1,        // Quiet NaN
+    N_INFINITY = 1 << 2,   // Negative infinity
+    N_NORMAL = 1 << 3,     // Negative normal
+    N_SUBNORMAL = 1 << 4,  // Negative subnormal
+    N_ZERO = 1 << 5,       // Negative zero
+    P_ZERO = 1 << 6,       // Positive zero
+    P_SUBNORMAL = 1 << 7,  // Positive subnormal
+    P_NORMAL = 1 << 8,     // Positive normal
+    P_INFINITY = 1 << 9    // Positive infinity
+  };
 }
 
 namespace SISrcMods {
@@ -61,7 +100,14 @@ namespace SIOutMods {
 #define   S_00B028_VGPRS(x)                                           (((x) & 0x3F) << 0)
 #define   S_00B028_SGPRS(x)                                           (((x) & 0x0F) << 6)
 #define R_00B84C_COMPUTE_PGM_RSRC2                                      0x00B84C
-#define   S_00B02C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
+#define   S_00B84C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
+#define   S_00B84C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
+#define   S_00B84C_TGID_X_EN(x)                                       (((x) & 0x1) << 7)
+#define   S_00B84C_TGID_Y_EN(x)                                       (((x) & 0x1) << 8)
+#define   S_00B84C_TGID_Z_EN(x)                                       (((x) & 0x1) << 9)
+#define   S_00B84C_TG_SIZE_EN(x)                                      (((x) & 0x1) << 10)
+#define   S_00B84C_TIDIG_COMP_CNT(x)                                  (((x) & 0x03) << 11)
+
 #define   S_00B84C_LDS_SIZE(x)                                        (((x) & 0x1FF) << 15)
 #define R_0286CC_SPI_PS_INPUT_ENA                                       0x0286CC
 
@@ -118,4 +164,8 @@ namespace SIOutMods {
 #define R_00B860_COMPUTE_TMPRING_SIZE                                   0x00B860
 #define   S_00B860_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
 
+#define R_0286E8_SPI_TMPRING_SIZE                                       0x0286E8
+#define   S_0286E8_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
+
+
 #endif
diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp
index d6f4b4c..cd1b3ac 100644
--- a/lib/Target/R600/SIFixSGPRCopies.cpp
+++ b/lib/Target/R600/SIFixSGPRCopies.cpp
@@ -136,12 +136,12 @@ const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses(
                                                  const MachineRegisterInfo &MRI,
                                                  unsigned Reg,
                                                  unsigned SubReg) const {
-  // The Reg parameter to the function must always be defined by either a PHI
-  // or a COPY, therefore it cannot be a physical register.
-  assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
-         "Reg cannot be a physical register");
 
-  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  const TargetRegisterClass *RC
+    = TargetRegisterInfo::isVirtualRegister(Reg) ?
+    MRI.getRegClass(Reg) :
+    TRI->getRegClass(Reg);
+
   RC = TRI->getSubRegClass(RC, SubReg);
   for (MachineRegisterInfo::use_instr_iterator
        I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) {
@@ -182,7 +182,12 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
   unsigned DstReg = Copy.getOperand(0).getReg();
   unsigned SrcReg = Copy.getOperand(1).getReg();
   unsigned SrcSubReg = Copy.getOperand(1).getSubReg();
-  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+
+  const TargetRegisterClass *DstRC
+    = TargetRegisterInfo::isVirtualRegister(DstReg) ?
+    MRI.getRegClass(DstReg) :
+    TRI->getRegClass(DstReg);
+
   const TargetRegisterClass *SrcRC;
 
   if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
@@ -217,20 +222,21 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
       switch (MI.getOpcode()) {
       default: continue;
       case AMDGPU::PHI: {
-        DEBUG(dbgs() << " Fixing PHI:\n");
-        DEBUG(MI.print(dbgs()));
+        DEBUG(dbgs() << "Fixing PHI: " << MI);
 
-        for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
-          unsigned Reg = MI.getOperand(i).getReg();
-          const TargetRegisterClass *RC = inferRegClassFromDef(TRI, MRI, Reg,
-                                                  MI.getOperand(0).getSubReg());
-          MRI.constrainRegClass(Reg, RC);
+        for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+          const MachineOperand &Op = MI.getOperand(i);
+          unsigned Reg = Op.getReg();
+          const TargetRegisterClass *RC
+            = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg());
+
+          MRI.constrainRegClass(Op.getReg(), RC);
         }
         unsigned Reg = MI.getOperand(0).getReg();
         const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg,
                                                   MI.getOperand(0).getSubReg());
-        if (TRI->getCommonSubClass(RC, &AMDGPU::VReg_32RegClass)) {
-          MRI.constrainRegClass(Reg, &AMDGPU::VReg_32RegClass);
+        if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) {
+          MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass);
         }
 
         if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
new file mode 100644
index 0000000..ae4b05d
--- /dev/null
+++ b/lib/Target/R600/SIFoldOperands.cpp
@@ -0,0 +1,287 @@
+//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-fold-operands"
+using namespace llvm;
+
+namespace {
+
+class SIFoldOperands : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFoldOperands() : MachineFunctionPass(ID) {
+    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI Fold Operands";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+struct FoldCandidate {
+  MachineInstr *UseMI;
+  unsigned UseOpNo;
+  MachineOperand *OpToFold;
+  uint64_t ImmToFold;
+
+  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
+                UseMI(MI), UseOpNo(OpNo) {
+
+    if (FoldOp->isImm()) {
+      OpToFold = nullptr;
+      ImmToFold = FoldOp->getImm();
+    } else {
+      assert(FoldOp->isReg());
+      OpToFold = FoldOp;
+    }
+  }
+
+  bool isImm() const {
+    return !OpToFold;
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
+                      "SI Fold Operands", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE,
+                    "SI Fold Operands", false, false)
+
+char SIFoldOperands::ID = 0;
+
+char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
+
+FunctionPass *llvm::createSIFoldOperandsPass() {
+  return new SIFoldOperands();
+}
+
+static bool isSafeToFold(unsigned Opcode) {
+  switch(Opcode) {
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B32_e64:
+  case AMDGPU::V_MOV_B64_PSEUDO:
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::COPY:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static bool updateOperand(FoldCandidate &Fold,
+                          const TargetRegisterInfo &TRI) {
+  MachineInstr *MI = Fold.UseMI;
+  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+  assert(Old.isReg());
+
+  if (Fold.isImm()) {
+    Old.ChangeToImmediate(Fold.ImmToFold);
+    return true;
+  }
+
+  MachineOperand *New = Fold.OpToFold;
+  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
+      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
+    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
+    return true;
+  }
+
+  // FIXME: Handle physical registers.
+
+  return false;
+}
+
+static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
+                             MachineInstr *MI, unsigned OpNo,
+                             MachineOperand *OpToFold,
+                             const SIInstrInfo *TII) {
+  if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {
+    // Operand is not legal, so try to commute the instruction to
+    // see if this makes it possible to fold.
+    unsigned CommuteIdx0;
+    unsigned CommuteIdx1;
+    bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);
+
+    if (CanCommute) {
+      if (CommuteIdx0 == OpNo)
+        OpNo = CommuteIdx1;
+      else if (CommuteIdx1 == OpNo)
+        OpNo = CommuteIdx0;
+    }
+
+    if (!CanCommute || !TII->commuteInstruction(MI))
+      return false;
+
+    if (!TII->isOperandLegal(MI, OpNo, OpToFold))
+      return false;
+  }
+
+  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+  return true;
+}
+
+bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      if (!isSafeToFold(MI.getOpcode()))
+        continue;
+
+      unsigned OpSize = TII->getOpSize(MI, 1);
+      MachineOperand &OpToFold = MI.getOperand(1);
+      bool FoldingImm = OpToFold.isImm();
+
+      // FIXME: We could also be folding things like FrameIndexes and
+      // TargetIndexes.
+      if (!FoldingImm && !OpToFold.isReg())
+        continue;
+
+      // Folding immediates with more than one use will increase program size.
+      // FIXME: This will also reduce register usage, which may be better
+      // in some cases.  A better heuristic is needed.
+      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
+          !MRI.hasOneUse(MI.getOperand(0).getReg()))
+        continue;
+
+      // FIXME: Fold operands with subregs.
+      if (OpToFold.isReg() &&
+          (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
+           OpToFold.getSubReg()))
+        continue;
+
+      std::vector<FoldCandidate> FoldList;
+      for (MachineRegisterInfo::use_iterator
+           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
+           Use != E; ++Use) {
+
+        MachineInstr *UseMI = Use->getParent();
+        const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());
+
+        // FIXME: Fold operands with subregs.
+        if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
+            UseOp.isImplicit())) {
+          continue;
+        }
+
+        APInt Imm;
+
+        if (FoldingImm) {
+          unsigned UseReg = UseOp.getReg();
+          const TargetRegisterClass *UseRC
+            = TargetRegisterInfo::isVirtualRegister(UseReg) ?
+            MRI.getRegClass(UseReg) :
+            TRI.getRegClass(UseReg);
+
+          Imm = APInt(64, OpToFold.getImm());
+
+          // Split 64-bit constants into 32-bits for folding.
+          if (UseOp.getSubReg()) {
+            if (UseRC->getSize() != 8)
+              continue;
+
+            if (UseOp.getSubReg() == AMDGPU::sub0) {
+              Imm = Imm.getLoBits(32);
+            } else {
+              assert(UseOp.getSubReg() == AMDGPU::sub1);
+              Imm = Imm.getHiBits(32);
+            }
+          }
+
+          // In order to fold immediates into copies, we need to change the
+          // copy to a MOV.
+          if (UseMI->getOpcode() == AMDGPU::COPY) {
+            unsigned DestReg = UseMI->getOperand(0).getReg();
+            const TargetRegisterClass *DestRC
+              = TargetRegisterInfo::isVirtualRegister(DestReg) ?
+              MRI.getRegClass(DestReg) :
+              TRI.getRegClass(DestReg);
+
+            unsigned MovOp = TII->getMovOpcode(DestRC);
+            if (MovOp == AMDGPU::COPY)
+              continue;
+
+            UseMI->setDesc(TII->get(MovOp));
+          }
+        }
+
+        const MCInstrDesc &UseDesc = UseMI->getDesc();
+
+        // Don't fold into target independent nodes.  Target independent opcodes
+        // don't have defined register classes.
+        if (UseDesc.isVariadic() ||
+            UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
+          continue;
+
+        if (FoldingImm) {
+          MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+          tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII);
+          continue;
+        }
+
+        tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII);
+
+        // FIXME: We could try to change the instruction from 64-bit to 32-bit
+        // to enable more folding opportunites.  The shrink operands pass
+        // already does this.
+      }
+
+      for (FoldCandidate &Fold : FoldList) {
+        if (updateOperand(Fold, TRI)) {
+          // Clear kill flags.
+          if (!Fold.isImm()) {
+            assert(Fold.OpToFold && Fold.OpToFold->isReg());
+            Fold.OpToFold->setIsKill(false);
+          }
+          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
+                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
+        }
+      }
+    }
+  }
+  return false;
+}
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 8d4164a..7d794b8 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -35,8 +35,9 @@
 
 using namespace llvm;
 
-SITargetLowering::SITargetLowering(TargetMachine &TM) :
-    AMDGPUTargetLowering(TM) {
+SITargetLowering::SITargetLowering(TargetMachine &TM,
+                                   const AMDGPUSubtarget &STI)
+    : AMDGPUTargetLowering(TM, STI) {
   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
 
@@ -44,7 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
 
   addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
-  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
 
   addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
@@ -59,22 +60,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
 
-  computeRegisterProperties();
-
-  // Condition Codes
-  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
-  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
-  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
-  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
-  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
-  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
-
-  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
-  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
-  setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
-  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
-  setCondCodeAction(ISD::SETULE, MVT::f64, Expand);
-  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+  computeRegisterProperties(STI.getRegisterInfo());
 
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
@@ -104,12 +90,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
 
   setOperationAction(ISD::STORE, MVT::i1, Custom);
-  setOperationAction(ISD::STORE, MVT::i32, Custom);
-  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 
-  setOperationAction(ISD::SELECT, MVT::f32, Promote);
-  AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32);
   setOperationAction(ISD::SELECT, MVT::i64, Custom);
   setOperationAction(ISD::SELECT, MVT::f64, Promote);
   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
@@ -147,26 +129,34 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
 
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
-
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
-
-  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-
-  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
-  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    if (VT == MVT::i64)
+      continue;
+
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
+
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
+  }
+
+  for (MVT VT : MVT::integer_vector_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand);
+  }
+
+  for (MVT VT : MVT::fp_valuetypes())
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
@@ -213,13 +203,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
     }
   }
 
-  for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) {
-    MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I);
-    setOperationAction(ISD::FTRUNC, VT, Expand);
-    setOperationAction(ISD::FCEIL, VT, Expand);
-    setOperationAction(ISD::FFLOOR, VT, Expand);
-  }
-
   if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
@@ -228,6 +211,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   }
 
   setOperationAction(ISD::FDIV, MVT::f32, Custom);
+  setOperationAction(ISD::FDIV, MVT::f64, Custom);
 
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
@@ -235,7 +219,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setTargetDAGCombine(ISD::FMAXNUM);
   setTargetDAGCombine(ISD::SELECT_CC);
   setTargetDAGCombine(ISD::SETCC);
-
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::UINT_TO_FP);
 
   // All memory operations. Some folding on the pointer operand is done to help
@@ -315,7 +300,7 @@ bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
   return true;
 }
 
-bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT  VT,
+bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                       unsigned AddrSpace,
                                                       unsigned Align,
                                                       bool *IsFast) const {
@@ -327,9 +312,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT  VT,
   if (!VT.isSimple() || VT == MVT::Other)
     return false;
 
-  // XXX - CI changes say "Support for unaligned memory accesses" but I don't
-  // see what for specifically. The wording everywhere else seems to be the
-  // same.
+  // TODO - CI+ supports unaligned memory accesses, but this requires driver
+  // support.
 
   // XXX - The only mention I see of this in the ISA manual is for LDS direct
   // reads the "byte address and must be dword aligned". Is it also true for the
@@ -341,12 +325,18 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT  VT,
     return Align % 4 == 0;
   }
 
+  // Smaller than dword value must be aligned.
+  // FIXME: This should be allowed on CI+
+  if (VT.bitsLT(MVT::i32))
+    return false;
+
   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
   // byte-address are ignored, thus forcing Dword alignment.
   // This applies to private, global, and constant memory.
   if (IsFast)
     *IsFast = true;
-  return VT.bitsGT(MVT::i32);
+
+  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
 }
 
 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
@@ -379,8 +369,8 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const {
 
 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
-      getTargetMachine().getSubtargetImpl()->getInstrInfo());
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
   return TII->isInlineConstant(Imm);
 }
 
@@ -413,16 +403,11 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
 }
 
 SDValue SITargetLowering::LowerFormalArguments(
-                                      SDValue Chain,
-                                      CallingConv::ID CallConv,
-                                      bool isVarArg,
-                                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      SDLoc DL, SelectionDAG &DAG,
-                                      SmallVectorImpl<SDValue> &InVals) const {
-
-  const TargetMachine &TM = getTargetMachine();
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
   const SIRegisterInfo *TRI =
-      static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo());
+      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
 
   MachineFunction &MF = DAG.getMachineFunction();
   FunctionType *FType = MF.getFunction()->getFunctionType();
@@ -461,7 +446,7 @@ SDValue SITargetLowering::LowerFormalArguments(
       // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
       // three or five element vertex only needs three or five registers,
       // NOT four or eigth.
-      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
       unsigned NumElements = ParamType->getVectorNumElements();
 
       for (unsigned j = 0; j != NumElements; ++j) {
@@ -489,7 +474,10 @@ SDValue SITargetLowering::LowerFormalArguments(
   // The pointer to the list of arguments is stored in SGPR0, SGPR1
 	// The pointer to the scratch buffer is stored in SGPR2, SGPR3
   if (Info->getShaderType() == ShaderType::COMPUTE) {
-    Info->NumUserSGPRs = 4;
+    if (Subtarget->isAmdHsaOS())
+      Info->NumUserSGPRs = 2;  // FIXME: Need to support scratch buffers.
+    else
+      Info->NumUserSGPRs = 4;
 
     unsigned InputPtrReg =
         TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
@@ -541,7 +529,7 @@ SDValue SITargetLowering::LowerFormalArguments(
                                    Offset, Ins[i].Flags.isSExt());
 
       const PointerType *ParamTy =
-          dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex));
+        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
           ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
         // On SI local pointers are just offsets into LDS, so they are always
@@ -576,7 +564,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     if (Arg.VT.isVector()) {
 
       // Build a vector from the registers
-      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
       unsigned NumElements = ParamType->getVectorNumElements();
 
       SmallVector<SDValue, 4> Regs;
@@ -589,8 +577,7 @@ SDValue SITargetLowering::LowerFormalArguments(
 
       // Fill up the missing vector elements
       NumElements = Arg.VT.getVectorNumElements() - NumElements;
-      for (unsigned j = 0; j != NumElements; ++j)
-        Regs.push_back(DAG.getUNDEF(VT));
+      Regs.append(NumElements, DAG.getUNDEF(VT));
 
       InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
       continue;
@@ -598,6 +585,12 @@ SDValue SITargetLowering::LowerFormalArguments(
 
     InVals.push_back(Val);
   }
+
+  if (Info->getShaderType() != ShaderType::COMPUTE) {
+    unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>(
+        AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
+    Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+  }
   return Chain;
 }
 
@@ -605,25 +598,14 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MachineInstr * MI, MachineBasicBlock * BB) const {
 
   MachineBasicBlock::iterator I = *MI;
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
-      getTargetMachine().getSubtargetImpl()->getInstrInfo());
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
   switch (MI->getOpcode()) {
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
-  case AMDGPU::BRANCH: return BB;
-  case AMDGPU::V_SUB_F64: {
-    unsigned DestReg = MI->getOperand(0).getReg();
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
-      .addImm(0)  // SRC0 modifiers
-      .addReg(MI->getOperand(1).getReg())
-      .addImm(1)  // SRC1 modifiers
-      .addReg(MI->getOperand(2).getReg())
-      .addImm(0)  // CLAMP
-      .addImm(0); // OMOD
-    MI->eraseFromParent();
-    break;
-  }
+  case AMDGPU::BRANCH:
+    return BB;
   case AMDGPU::SI_RegisterStorePseudo: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
     unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -640,17 +622,43 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
   return BB;
 }
 
-EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  // This currently forces unfolding various combinations of fsub into fma with
+  // free fneg'd operands. As long as we have fast FMA (controlled by
+  // isFMAFasterThanFMulAndFAdd), we should perform these.
+
+  // When fma is quarter rate, for f64 where add / sub are at best half rate,
+  // most of these combines appear to be cycle neutral but save on instruction
+  // count / code size.
+  return true;
+}
+
+EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
   if (!VT.isVector()) {
     return MVT::i1;
   }
-  return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
 }
 
 MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
   return MVT::i32;
 }
 
+// Answering this is somewhat tricky and depends on the specific device which
+// have different rates for fma or all f64 operations.
+//
+// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
+// regardless of which device (although the number of cycles differs between
+// devices), so it is always profitable for f64.
+//
+// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
+// only on full rate devices. Normally, we should prefer selecting v_mad_f32
+// which we can always do even without fused FP ops since it returns the same
+// result as the separate operations and since it is always full
+// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
+// however does not support denormals, so we do report fma as faster if we have
+// a fast fma device and require denormals.
+//
 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   VT = VT.getScalarType();
 
@@ -659,7 +667,11 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
 
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::f32:
-    return false; /* There is V_MAD_F32 for f32 */
+    // This is as fast on some subtargets. However, we always have full rate f32
+    // mad available which returns the same result as the separate operations
+    // which we should prefer over fma. We can't use this if we want to support
+    // denormals, so only report this in these cases.
+    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
   case MVT::f64:
     return true;
   default:
@@ -755,15 +767,12 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
 
   // Build the result and
-  SmallVector<EVT, 4> Res;
-  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
-    Res.push_back(Intr->getValueType(i));
+  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
 
   // operands of the new intrinsic call
   SmallVector<SDValue, 4> Ops;
   Ops.push_back(BRCOND.getOperand(0));
-  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
-    Ops.push_back(Intr->getOperand(i));
+  Ops.append(Intr->op_begin() + 1, Intr->op_end());
   Ops.push_back(Target);
 
   // build the new intrinsic call
@@ -839,7 +848,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   const SIRegisterInfo *TRI =
-      static_cast<const SIRegisterInfo*>(MF.getSubtarget().getRegisterInfo());
+      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
 
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
@@ -889,13 +898,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
       TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT);
   case Intrinsic::r600_read_tidig_x:
-    return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
       TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT);
   case Intrinsic::r600_read_tidig_y:
-    return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
       TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT);
   case Intrinsic::r600_read_tidig_z:
-    return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
       TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT);
   case AMDGPUIntrinsic::SI_load_const: {
     SDValue Ops[] = {
@@ -1090,7 +1099,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   const APFloat K1Val(BitsToFloat(0x2f800000));
   const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32);
 
-  const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32);
+  const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
 
   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
 
@@ -1108,7 +1117,70 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
-  return SDValue();
+  if (DAG.getTarget().Options.UnsafeFPMath)
+    return LowerFastFDIV(Op, DAG);
+
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+  SDValue Y = Op.getOperand(1);
+
+  const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
+
+  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
+
+  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
+
+  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
+
+  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
+
+  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
+
+  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
+
+  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
+
+  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
+
+  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
+
+  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
+                             NegDivScale0, Mul, DivScale1);
+
+  SDValue Scale;
+
+  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    // Workaround a hardware bug on SI where the condition output from div_scale
+    // is not usable.
+
+    const SDValue Hi = DAG.getConstant(1, MVT::i32);
+
+    // Figure out if the scale to use for div_fmas.
+    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
+    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
+    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
+
+    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
+    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
+
+    SDValue Scale0Hi
+      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
+    SDValue Scale1Hi
+      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
+
+    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
+    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
+    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
+  } else {
+    Scale = DivScale1.getValue(1);
+  }
+
+  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
+                             Fma4, Fma3, Mul, Scale);
+
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
 }
 
 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
@@ -1129,11 +1201,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Store->getMemoryVT();
 
   // These stores are legal.
-  if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
-      VT.isVector() && VT.getVectorNumElements() == 2 &&
-      VT.getVectorElementType() == MVT::i32)
-    return SDValue();
-
   if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
     if (VT.isVector() && VT.getVectorNumElements() > 4)
       return ScalarizeVectorStore(Op, DAG);
@@ -1177,7 +1244,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 //===----------------------------------------------------------------------===//
 
 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
-                                                     DAGCombinerInfo &DCI) {
+                                                     DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
   EVT ScalarVT = VT.getScalarType();
   if (ScalarVT != MVT::f32)
@@ -1225,8 +1292,21 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
     EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
     EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
     EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
-
     LoadSDNode *Load = cast<LoadSDNode>(Src);
+
+    unsigned AS = Load->getAddressSpace();
+    unsigned Align = Load->getAlignment();
+    Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
+    unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+
+    // Don't try to replace the load if we have to expand it due to alignment
+    // problems. Otherwise we will end up scalarizing the load, and trying to
+    // repack into the vector for no real reason.
+    if (Align < ABIAlignment &&
+        !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
+      return SDValue();
+    }
+
     SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
                                      Load->getChain(),
                                      Load->getBasePtr(),
@@ -1297,8 +1377,8 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
   if (!CAdd)
     return SDValue();
 
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
-      getTargetMachine().getSubtargetImpl()->getInstrInfo());
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
   // If the resulting offset is too large, we can't fold it into the addressing
   // mode offset.
@@ -1316,6 +1396,102 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
 }
 
+SDValue SITargetLowering::performAndCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+
+  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
+  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  if (LHS.getOpcode() == ISD::SETCC &&
+      RHS.getOpcode() == ISD::SETCC) {
+    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
+
+    SDValue X = LHS.getOperand(0);
+    SDValue Y = RHS.getOperand(0);
+    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
+      return SDValue();
+
+    if (LCC == ISD::SETO) {
+      if (X != LHS.getOperand(1))
+        return SDValue();
+
+      if (RCC == ISD::SETUNE) {
+        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
+        if (!C1 || !C1->isInfinity() || C1->isNegative())
+          return SDValue();
+
+        const uint32_t Mask = SIInstrFlags::N_NORMAL |
+                              SIInstrFlags::N_SUBNORMAL |
+                              SIInstrFlags::N_ZERO |
+                              SIInstrFlags::P_ZERO |
+                              SIInstrFlags::P_SUBNORMAL |
+                              SIInstrFlags::P_NORMAL;
+
+        static_assert(((~(SIInstrFlags::S_NAN |
+                          SIInstrFlags::Q_NAN |
+                          SIInstrFlags::N_INFINITY |
+                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
+                      "mask not equal");
+
+        return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
+                           X, DAG.getConstant(Mask, MVT::i32));
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performOrCombine(SDNode *N,
+                                           DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+  if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+      RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+    SDValue Src = LHS.getOperand(0);
+    if (Src != RHS.getOperand(0))
+      return SDValue();
+
+    const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    if (!CLHS || !CRHS)
+      return SDValue();
+
+    // Only 10 bits are used.
+    static const uint32_t MaxMask = 0x3ff;
+
+    uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+    return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
+                       Src, DAG.getConstant(NewMask, MVT::i32));
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performClassCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Mask = N->getOperand(1);
+
+  // fp_class x, 0 -> false
+  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
+    if (CMask->isNullValue())
+      return DAG.getConstant(0, MVT::i1);
+  }
+
+  return SDValue();
+}
+
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
   switch (Opc) {
   case ISD::FMAXNUM:
@@ -1371,33 +1547,47 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
   return SDValue();
 }
 
+SDValue SITargetLowering::performSetCCCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = LHS.getValueType();
+
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return SDValue();
+
+  // Match isinf pattern
+  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
+    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+    if (!CRHS)
+      return SDValue();
+
+    const APFloat &APF = CRHS->getValueAPF();
+    if (APF.isInfinity() && !APF.isNegative()) {
+      unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
+      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1,
+                         LHS.getOperand(0), DAG.getConstant(Mask, MVT::i32));
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
-  EVT VT = N->getValueType(0);
 
   switch (N->getOpcode()) {
-    default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
-    case ISD::SETCC: {
-      SDValue Arg0 = N->getOperand(0);
-      SDValue Arg1 = N->getOperand(1);
-      SDValue CC = N->getOperand(2);
-      ConstantSDNode * C = nullptr;
-      ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();
-
-      // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
-      if (VT == MVT::i1
-          && Arg0.getOpcode() == ISD::SIGN_EXTEND
-          && Arg0.getOperand(0).getValueType() == MVT::i1
-          && (C = dyn_cast<ConstantSDNode>(Arg1))
-          && C->isNullValue()
-          && CCOp == ISD::SETNE) {
-        return SimplifySetCC(VT, Arg0.getOperand(0),
-                             DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
-      }
-      break;
-    }
+  default:
+    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+  case ISD::SETCC:
+    return performSetCCCombine(N, DCI);
   case ISD::FMAXNUM: // TODO: What about fmax_legacy?
   case ISD::FMINNUM:
   case AMDGPUISD::SMAX:
@@ -1442,6 +1632,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     if (VT != MVT::f32)
       break;
 
+    // Only do this if we are not trying to support denormals. v_mad_f32 does
+    // not support denormals ever.
+    if (Subtarget->hasFP32Denormals())
+      break;
+
     SDValue LHS = N->getOperand(0);
     SDValue RHS = N->getOperand(1);
 
@@ -1452,8 +1647,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     if (LHS.getOpcode() == ISD::FADD) {
       SDValue A = LHS.getOperand(0);
       if (A == LHS.getOperand(1)) {
-        const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32);
-        return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS);
+        const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
+        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
       }
     }
 
@@ -1461,12 +1656,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     if (RHS.getOpcode() == ISD::FADD) {
       SDValue A = RHS.getOperand(0);
       if (A == RHS.getOperand(1)) {
-        const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32);
-        return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS);
+        const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
+        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
       }
     }
 
-    break;
+    return SDValue();
   }
   case ISD::FSUB: {
     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
@@ -1476,39 +1671,22 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
 
     // Try to get the fneg to fold into the source modifier. This undoes generic
     // DAG combines and folds them into the mad.
-    if (VT == MVT::f32) {
+    //
+    // Only do this if we are not trying to support denormals. v_mad_f32 does
+    // not support denormals ever.
+    if (VT == MVT::f32 &&
+        !Subtarget->hasFP32Denormals()) {
       SDValue LHS = N->getOperand(0);
       SDValue RHS = N->getOperand(1);
-
-      if (LHS.getOpcode() == ISD::FMUL) {
-        // (fsub (fmul a, b), c) -> mad a, b, (fneg c)
-
-        SDValue A = LHS.getOperand(0);
-        SDValue B = LHS.getOperand(1);
-        SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS);
-
-        return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
-      }
-
-      if (RHS.getOpcode() == ISD::FMUL) {
-        // (fsub c, (fmul a, b)) -> mad (fneg a), b, c
-
-        SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0));
-        SDValue B = RHS.getOperand(1);
-        SDValue C = LHS;
-
-        return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
-      }
-
       if (LHS.getOpcode() == ISD::FADD) {
         // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
 
         SDValue A = LHS.getOperand(0);
         if (A == LHS.getOperand(1)) {
-          const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32);
+          const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
           SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);
 
-          return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS);
+          return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS);
         }
       }
 
@@ -1517,10 +1695,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
 
         SDValue A = RHS.getOperand(0);
         if (A == RHS.getOperand(1)) {
-          const SDValue NegTwo = DAG.getTargetConstantFP(-2.0, MVT::f32);
-          return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS);
+          const SDValue NegTwo = DAG.getConstantFP(-2.0, MVT::f32);
+          return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS);
         }
       }
+
+      return SDValue();
     }
 
     break;
@@ -1554,9 +1734,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
       SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
       if (NewPtr) {
-        SmallVector<SDValue, 8> NewOps;
-        for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I)
-          NewOps.push_back(MemNode->getOperand(I));
+        SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end());
 
         NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
         return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
@@ -1564,287 +1742,44 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     }
     break;
   }
+  case ISD::AND:
+    return performAndCombine(N, DCI);
+  case ISD::OR:
+    return performOrCombine(N, DCI);
+  case AMDGPUISD::FP_CLASS:
+    return performClassCombine(N, DCI);
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
 
-/// \brief Test if RegClass is one of the VSrc classes
-static bool isVSrc(unsigned RegClass) {
-  switch(RegClass) {
-    default: return false;
-    case AMDGPU::VSrc_32RegClassID:
-    case AMDGPU::VCSrc_32RegClassID:
-    case AMDGPU::VSrc_64RegClassID:
-    case AMDGPU::VCSrc_64RegClassID:
-      return true;
-  }
-}
-
-/// \brief Test if RegClass is one of the SSrc classes
-static bool isSSrc(unsigned RegClass) {
-  return AMDGPU::SSrc_32RegClassID == RegClass ||
-         AMDGPU::SSrc_64RegClassID == RegClass;
-}
-
 /// \brief Analyze the possible immediate value Op
 ///
 /// Returns -1 if it isn't an immediate, 0 if it's and inline immediate
 /// and the immediate value if it's a literal immediate
 int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
 
-  union {
-    int32_t I;
-    float F;
-  } Imm;
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
   if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
-    if (Node->getZExtValue() >> 32) {
-        return -1;
-    }
-    Imm.I = Node->getSExtValue();
-  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
-    if (N->getValueType(0) != MVT::f32)
-      return -1;
-    Imm.F = Node->getValueAPF().convertToFloat();
-  } else
-    return -1; // It isn't an immediate
-
-  if ((Imm.I >= -16 && Imm.I <= 64) ||
-      Imm.F == 0.5f || Imm.F == -0.5f ||
-      Imm.F == 1.0f || Imm.F == -1.0f ||
-      Imm.F == 2.0f || Imm.F == -2.0f ||
-      Imm.F == 4.0f || Imm.F == -4.0f)
-    return 0; // It's an inline immediate
-
-  return Imm.I; // It's a literal immediate
-}
-
-/// \brief Try to fold an immediate directly into an instruction
-bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
-                               bool &ScalarSlotUsed) const {
-
-  MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
-      getTargetMachine().getSubtargetImpl()->getInstrInfo());
-  if (!Mov || !TII->isMov(Mov->getMachineOpcode()))
-    return false;
-
-  const SDValue &Op = Mov->getOperand(0);
-  int32_t Value = analyzeImmediate(Op.getNode());
-  if (Value == -1) {
-    // Not an immediate at all
-    return false;
-
-  } else if (Value == 0) {
-    // Inline immediates can always be fold
-    Operand = Op;
-    return true;
-
-  } else if (Value == Immediate) {
-    // Already fold literal immediate
-    Operand = Op;
-    return true;
-
-  } else if (!ScalarSlotUsed && !Immediate) {
-    // Fold this literal immediate
-    ScalarSlotUsed = true;
-    Immediate = Value;
-    Operand = Op;
-    return true;
+    if (TII->isInlineConstant(Node->getAPIntValue()))
+      return 0;
 
+    uint64_t Val = Node->getZExtValue();
+    return isUInt<32>(Val) ? Val : -1;
   }
 
-  return false;
-}
+  if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
+    if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt()))
+      return 0;
 
-const TargetRegisterClass *SITargetLowering::getRegClassForNode(
-                                   SelectionDAG &DAG, const SDValue &Op) const {
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
-      getTargetMachine().getSubtargetImpl()->getInstrInfo());
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
-
-  if (!Op->isMachineOpcode()) {
-    switch(Op->getOpcode()) {
-    case ISD::CopyFromReg: {
-      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
-      unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg();
-      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-        return MRI.getRegClass(Reg);
-      }
-      return TRI.getPhysRegClass(Reg);
-    }
-    default:  return nullptr;
-    }
-  }
-  const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
-  int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
-  if (OpClassID != -1) {
-    return TRI.getRegClass(OpClassID);
-  }
-  switch(Op.getMachineOpcode()) {
-  case AMDGPU::COPY_TO_REGCLASS:
-    // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
-    OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
-
-    // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
-    // class, then the register class for the value could be either a
-    // VReg or and SReg.  In order to get a more accurate
-    if (isVSrc(OpClassID))
-      return getRegClassForNode(DAG, Op.getOperand(0));
-
-    return TRI.getRegClass(OpClassID);
-  case AMDGPU::EXTRACT_SUBREG: {
-    int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-    const TargetRegisterClass *SuperClass =
-      getRegClassForNode(DAG, Op.getOperand(0));
-    return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
-  }
-  case AMDGPU::REG_SEQUENCE:
-    // Operand 0 is the register class id for REG_SEQUENCE instructions.
-    return TRI.getRegClass(
-      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
-  default:
-    return getRegClassFor(Op.getSimpleValueType());
-  }
-}
+    if (Node->getValueType(0) == MVT::f32)
+      return FloatToBits(Node->getValueAPF().convertToFloat());
 
-/// \brief Does "Op" fit into register class "RegClass" ?
-bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
-                                    unsigned RegClass) const {
-  const TargetRegisterInfo *TRI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
-  const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
-  if (!RC) {
-    return false;
+    return -1;
   }
-  return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
-}
 
-/// \returns true if \p Node's operands are different from the SDValue list
-/// \p Ops
-static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
-  for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
-    if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
-      return true;
-    }
-  }
-  return false;
-}
-
-/// TODO: This needs to be removed. It's current primary purpose is to fold
-/// immediates into operands when legal. The legalization parts are redundant
-/// with SIInstrInfo::legalizeOperands which is called in a post-isel hook.
-SDNode *SITargetLowering::legalizeOperands(MachineSDNode *Node,
-                                           SelectionDAG &DAG) const {
-  // Original encoding (either e32 or e64)
-  int Opcode = Node->getMachineOpcode();
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
-      getTargetMachine().getSubtargetImpl()->getInstrInfo());
-  const MCInstrDesc *Desc = &TII->get(Opcode);
-
-  unsigned NumDefs = Desc->getNumDefs();
-  unsigned NumOps = Desc->getNumOperands();
-
-  // Commuted opcode if available
-  int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
-  const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev);
-
-  assert(!DescRev || DescRev->getNumDefs() == NumDefs);
-  assert(!DescRev || DescRev->getNumOperands() == NumOps);
-
-  int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
-  bool HaveVSrc = false, HaveSSrc = false;
-
-  // First figure out what we already have in this instruction.
-  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
-       i != e && Op < NumOps; ++i, ++Op) {
-
-    unsigned RegClass = Desc->OpInfo[Op].RegClass;
-    if (isVSrc(RegClass))
-      HaveVSrc = true;
-    else if (isSSrc(RegClass))
-      HaveSSrc = true;
-    else
-      continue;
-
-    int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
-    if (Imm != -1 && Imm != 0) {
-      // Literal immediate
-      Immediate = Imm;
-    }
-  }
-
-  // If we neither have VSrc nor SSrc, it makes no sense to continue.
-  if (!HaveVSrc && !HaveSSrc)
-    return Node;
-
-  // No scalar allowed when we have both VSrc and SSrc
-  bool ScalarSlotUsed = HaveVSrc && HaveSSrc;
-
-  // If this instruction has an implicit use of VCC, then it can't use the
-  // constant bus.
-  for (unsigned i = 0, e = Desc->getNumImplicitUses(); i != e; ++i) {
-    if (Desc->ImplicitUses[i] == AMDGPU::VCC) {
-      ScalarSlotUsed = true;
-      break;
-    }
-  }
-
-  // Second go over the operands and try to fold them
-  std::vector<SDValue> Ops;
-  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
-       i != e && Op < NumOps; ++i, ++Op) {
-
-    const SDValue &Operand = Node->getOperand(i);
-    Ops.push_back(Operand);
-
-    // Already folded immediate?
-    if (isa<ConstantSDNode>(Operand.getNode()) ||
-        isa<ConstantFPSDNode>(Operand.getNode()))
-      continue;
-
-    // Is this a VSrc or SSrc operand?
-    unsigned RegClass = Desc->OpInfo[Op].RegClass;
-    if (isVSrc(RegClass) || isSSrc(RegClass)) {
-      // Try to fold the immediates. If this ends up with multiple constant bus
-      // uses, it will be legalized later.
-      foldImm(Ops[i], Immediate, ScalarSlotUsed);
-      continue;
-    }
-
-    if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
-
-      unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
-      assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));
-
-      // Test if it makes sense to swap operands
-      if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
-          (!fitsRegClass(DAG, Ops[1], RegClass) &&
-           fitsRegClass(DAG, Ops[1], OtherRegClass))) {
-
-        // Swap commutable operands
-        std::swap(Ops[0], Ops[1]);
-
-        Desc = DescRev;
-        DescRev = nullptr;
-        continue;
-      }
-    }
-  }
-
-  // Add optional chain and glue
-  for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
-    Ops.push_back(Node->getOperand(i));
-
-  // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
-  // this case a brand new node is always be created, even if the operands
-  // are the same as before.  So, manually check if anything has been changed.
-  if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
-    return Node;
-  }
-
-  // Create a complete new instruction
-  return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
+  return -1;
 }
 
 /// \brief Helper function for adjustWritemask
@@ -1904,14 +1839,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   // Adjust the writemask in the node
   std::vector<SDValue> Ops;
   Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
-  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
-    Ops.push_back(Node->getOperand(i));
+  Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end());
   Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
 
   // If we only got one lane, replace it with a copy
   // (if NewDmask has only one bit set...)
   if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
-    SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
+    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, MVT::i32);
     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                       SDLoc(), Users[Lane]->getValueType(0),
                                       SDValue(Node, 0), RC);
@@ -1963,9 +1897,8 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
 /// \brief Fold the instructions after selecting them.
 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                           SelectionDAG &DAG) const {
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
-      getTargetMachine().getSubtargetImpl()->getInstrInfo());
-  Node = AdjustRegClass(Node, DAG);
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
   if (TII->isMIMG(Node->getMachineOpcode()))
     adjustWritemask(Node, DAG);
@@ -1975,17 +1908,17 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
     legalizeTargetIndependentNode(Node, DAG);
     return Node;
   }
-
-  return legalizeOperands(Node, DAG);
+  return Node;
 }
 
 /// \brief Assign the register class depending on the number of
 /// bits set in the writemask
 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                      SDNode *Node) const {
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
-      getTargetMachine().getSubtargetImpl()->getInstrInfo());
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   TII->legalizeOperands(MI);
 
   if (TII->isMIMG(MI->getOpcode())) {
@@ -1998,14 +1931,13 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
     const TargetRegisterClass *RC;
     switch (BitsSet) {
     default: return;
-    case 1:  RC = &AMDGPU::VReg_32RegClass; break;
+    case 1:  RC = &AMDGPU::VGPR_32RegClass; break;
     case 2:  RC = &AMDGPU::VReg_64RegClass; break;
     case 3:  RC = &AMDGPU::VReg_96RegClass; break;
     }
 
     unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
     MI->setDesc(TII->get(NewOpcode));
-    MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
     MRI.setRegClass(VReg, RC);
     return;
   }
@@ -2030,6 +1962,8 @@ static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                 SDLoc DL,
                                                 SDValue Ptr) const {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 #if 1
     // XXX - Workaround for moveToVALU not handling different register class
     // inserts for REG_SEQUENCE.
@@ -2039,7 +1973,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
       DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32),
       buildSMovImm32(DAG, DL, 0),
       DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
-      buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32),
+      buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
       DAG.getTargetConstant(AMDGPU::sub1, MVT::i32)
     };
 
@@ -2063,7 +1997,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
       DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
       buildSMovImm32(DAG, DL, 0),
       DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
-      buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32),
+      buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32),
       DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
     };
 
@@ -2110,57 +2044,14 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
 MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
                                                   SDLoc DL,
                                                   SDValue Ptr) const {
-  uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+  uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
                   0xffffffff; // Size
 
   return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
 }
 
-MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
-                                                SelectionDAG &DAG) const {
-
-  SDLoc DL(N);
-  unsigned NewOpcode = N->getMachineOpcode();
-
-  switch (N->getMachineOpcode()) {
-  default: return N;
-  case AMDGPU::S_LOAD_DWORD_IMM:
-    NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
-    // Fall-through
-  case AMDGPU::S_LOAD_DWORDX2_SGPR:
-    if (NewOpcode == N->getMachineOpcode()) {
-      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
-    }
-    // Fall-through
-  case AMDGPU::S_LOAD_DWORDX4_IMM:
-  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
-    if (NewOpcode == N->getMachineOpcode()) {
-      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
-    }
-    if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
-      return N;
-    }
-    ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
-
-    const SDValue Zero64 = DAG.getTargetConstant(0, MVT::i64);
-    SDValue Ptr(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Zero64), 0);
-    MachineSDNode *RSrc = wrapAddr64Rsrc(DAG, DL, Ptr);
-
-    SmallVector<SDValue, 8> Ops;
-    Ops.push_back(SDValue(RSrc, 0));
-    Ops.push_back(N->getOperand(0));
-    Ops.push_back(DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32));
-
-    // Copy remaining operands so we keep any chain and glue nodes that follow
-    // the normal operands.
-    for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I)
-      Ops.push_back(N->getOperand(I));
-
-    return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
-  }
-  }
-}
-
 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                const TargetRegisterClass *RC,
                                                unsigned Reg, EVT VT) const {
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 7bf406e..92f5847 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -42,27 +42,22 @@ class SITargetLowering : public AMDGPUTargetLowering {
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
-  bool foldImm(SDValue &Operand, int32_t &Immediate,
-               bool &ScalarSlotUsed) const;
-  const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG,
-                                                const SDValue &Op) const;
-  bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
-                    unsigned RegClass) const;
-
-  SDNode *legalizeOperands(MachineSDNode *N, SelectionDAG &DAG) const;
   void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
-  MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
 
-  static SDValue performUCharToFloatCombine(SDNode *N,
-                                            DAGCombinerInfo &DCI);
+  SDValue performUCharToFloatCombine(SDNode *N,
+                                     DAGCombinerInfo &DCI) const;
   SDValue performSHLPtrCombine(SDNode *N,
                                unsigned AS,
                                DAGCombinerInfo &DCI) const;
+  SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
 public:
-  SITargetLowering(TargetMachine &tm);
+  SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
 
   bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
                           EVT /*VT*/) const override;
@@ -94,6 +89,7 @@ public:
 
   MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
                                       MachineBasicBlock * BB) const override;
+  bool enableAggressiveFMAFusion(EVT VT) const override;
   EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
   MVT getScalarShiftAmountTy(EVT VT) const override;
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 712d97d..50f20ac 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -41,6 +41,12 @@ typedef union {
 
 } Counters;
 
+typedef enum {
+  OTHER,
+  SMEM,
+  VMEM
+} InstType;
+
 typedef Counters RegCounters[512];
 typedef std::pair<unsigned, unsigned> RegInterval;
 
@@ -73,6 +79,11 @@ private:
   /// \brief Different export instruction types seen since last wait.
   unsigned ExpInstrTypesSeen;
 
+  /// \brief Type of the last opcode.
+  InstType LastOpcodeType;
+
+  bool LastInstWritesM0;
+
   /// \brief Get increment/decrement amount for this instruction.
   Counters getHwCounts(MachineInstr &MI);
 
@@ -83,7 +94,8 @@ private:
   RegInterval getRegInterval(MachineOperand &Op);
 
   /// \brief Handle instructions async components
-  void pushInstruction(MachineInstr &MI);
+  void pushInstruction(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I);
 
   /// \brief Insert the actual wait instruction
   bool insertWait(MachineBasicBlock &MBB,
@@ -96,6 +108,9 @@ private:
   /// \brief Resolve all operand dependencies to counter requirements
   Counters handleOperands(MachineInstr &MI);
 
+  /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
+  void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+
 public:
   SIInsertWaits(TargetMachine &tm) :
     MachineFunctionPass(ID),
@@ -176,6 +191,29 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
   if (!MI.getDesc().mayStore())
     return false;
 
+  // Check if this operand is the value being stored.
+  // Special case for DS instructions, since the address
+  // operand comes before the value operand and it may have
+  // multiple data operands.
+
+  if (TII->isDS(MI.getOpcode())) {
+    MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
+    if (Data && Op.isIdenticalTo(*Data))
+      return true;
+
+    MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+    if (Data0 && Op.isIdenticalTo(*Data0))
+      return true;
+
+    MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
+    if (Data1 && Op.isIdenticalTo(*Data1))
+      return true;
+
+    return false;
+  }
+
+  // NOTE: This assumes that the value operand is before the
+  // address operand, and that there is only one value operand.
   for (MachineInstr::mop_iterator I = MI.operands_begin(),
        E = MI.operands_end(); I != E; ++I) {
 
@@ -203,10 +241,11 @@ RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
   return Result;
 }
 
-void SIInsertWaits::pushInstruction(MachineInstr &MI) {
+void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I) {
 
   // Get the hardware counter increments and sum them up
-  Counters Increment = getHwCounts(MI);
+  Counters Increment = getHwCounts(*I);
   unsigned Sum = 0;
 
   for (unsigned i = 0; i < 3; ++i) {
@@ -215,17 +254,43 @@ void SIInsertWaits::pushInstruction(MachineInstr &MI) {
   }
 
   // If we don't increase anything then that's it
-  if (Sum == 0)
+  if (Sum == 0) {
+    LastOpcodeType = OTHER;
     return;
+  }
+
+  if (TRI->ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM
+    // or SMEM clause, respectively.
+    //
+    // The temporary workaround is to break the clauses with S_NOP.
+    //
+    // The proper solution would be to allocate registers such that all source
+    // and destination registers don't overlap, e.g. this is illegal:
+    //   r0 = load r2
+    //   r2 = load r0
+    if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) ||
+        (LastOpcodeType == VMEM && Increment.Named.VM)) {
+      // Insert a NOP to break the clause.
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
+          .addImm(0);
+      LastInstWritesM0 = false;
+    }
+
+    if (TII->isSMRD(I->getOpcode()))
+      LastOpcodeType = SMEM;
+    else if (Increment.Named.VM)
+      LastOpcodeType = VMEM;
+  }
 
   // Remember which export instructions we have seen
   if (Increment.Named.EXP) {
-    ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
+    ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
   }
 
-  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
 
-    MachineOperand &Op = MI.getOperand(i);
+    MachineOperand &Op = I->getOperand(i);
     if (!isOpRelevant(Op))
       continue;
 
@@ -302,6 +367,8 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
                   ((Counts.Named.EXP & 0x7) << 4) |
                   ((Counts.Named.LGKM & 0x7) << 8));
 
+  LastOpcodeType = OTHER;
+  LastInstWritesM0 = false;
   return true;
 }
 
@@ -343,6 +410,30 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
   return Result;
 }
 
+void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) {
+  if (TRI->ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return;
+
+  // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
+  if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
+    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
+    LastInstWritesM0 = false;
+    return;
+  }
+
+  // Set whether this instruction sets M0
+  LastInstWritesM0 = false;
+
+  unsigned NumOperands = I->getNumOperands();
+  for (unsigned i = 0; i < NumOperands; i++) {
+    const MachineOperand &Op = I->getOperand(i);
+
+    if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
+      LastInstWritesM0 = true;
+  }
+}
+
 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 // around other non-memory instructions.
 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@@ -356,6 +447,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
 
   WaitedOn = ZeroCounts;
   LastIssued = ZeroCounts;
+  LastOpcodeType = OTHER;
+  LastInstWritesM0 = false;
 
   memset(&UsedRegs, 0, sizeof(UsedRegs));
   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
@@ -367,8 +460,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E; ++I) {
 
-      Changes |= insertWait(MBB, I, handleOperands(*I));
-      pushInstruction(*I);
+      // Wait for everything before a barrier.
+      if (I->getOpcode() == AMDGPU::S_BARRIER)
+        Changes |= insertWait(MBB, I, LastIssued);
+      else
+        Changes |= insertWait(MBB, I, handleOperands(*I));
+
+      pushInstruction(MBB, I);
+      handleSendMsg(MBB, I);
     }
 
     // Wait for everything at the end of the MBB
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 10e0a3f..c90c741 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -17,65 +17,109 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
   field bits<1> VM_CNT = 0;
   field bits<1> EXP_CNT = 0;
   field bits<1> LGKM_CNT = 0;
-  field bits<1> MIMG = 0;
-  field bits<1> SMRD = 0;
+
+  field bits<1> SALU = 0;
+  field bits<1> VALU = 0;
+
+  field bits<1> SOP1 = 0;
+  field bits<1> SOP2 = 0;
+  field bits<1> SOPC = 0;
+  field bits<1> SOPK = 0;
+  field bits<1> SOPP = 0;
+
   field bits<1> VOP1 = 0;
   field bits<1> VOP2 = 0;
   field bits<1> VOP3 = 0;
   field bits<1> VOPC = 0;
-  field bits<1> SALU = 0;
+
   field bits<1> MUBUF = 0;
   field bits<1> MTBUF = 0;
+  field bits<1> SMRD = 0;
+  field bits<1> DS = 0;
+  field bits<1> MIMG = 0;
   field bits<1> FLAT = 0;
+  field bits<1> WQM = 0;
 
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = VM_CNT;
   let TSFlags{1} = EXP_CNT;
   let TSFlags{2} = LGKM_CNT;
-  let TSFlags{3} = MIMG;
-  let TSFlags{4} = SMRD;
-  let TSFlags{5} = VOP1;
-  let TSFlags{6} = VOP2;
-  let TSFlags{7} = VOP3;
-  let TSFlags{8} = VOPC;
-  let TSFlags{9} = SALU;
-  let TSFlags{10} = MUBUF;
-  let TSFlags{11} = MTBUF;
-  let TSFlags{12} = FLAT;
+
+  let TSFlags{3} = SALU;
+  let TSFlags{4} = VALU;
+
+  let TSFlags{5} = SOP1;
+  let TSFlags{6} = SOP2;
+  let TSFlags{7} = SOPC;
+  let TSFlags{8} = SOPK;
+  let TSFlags{9} = SOPP;
+
+  let TSFlags{10} = VOP1;
+  let TSFlags{11} = VOP2;
+  let TSFlags{12} = VOP3;
+  let TSFlags{13} = VOPC;
+
+  let TSFlags{14} = MUBUF;
+  let TSFlags{15} = MTBUF;
+  let TSFlags{16} = SMRD;
+  let TSFlags{17} = DS;
+  let TSFlags{18} = MIMG;
+  let TSFlags{19} = FLAT;
+  let TSFlags{20} = WQM;
 
   // Most instructions require adjustments after selection to satisfy
   // operand requirements.
   let hasPostISelHook = 1;
+  let SchedRW = [Write32Bit];
 }
 
 class Enc32 {
-
   field bits<32> Inst;
   int Size = 4;
 }
 
 class Enc64 {
-
   field bits<64> Inst;
   int Size = 8;
 }
 
-class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> :
+let Uses = [EXEC] in {
+
+class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
     InstSI <outs, ins, asm, pattern> {
+
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
   let UseNamedOperandTable = 1;
+  let VALU = 1;
+}
+
+class VOPCCommon <dag ins, string asm, list<dag> pattern> :
+    VOPAnyCommon <(outs VCCReg:$dst), ins, asm, pattern> {
+
+  let DisableEncoding = "$dst";
+  let VOPC = 1;
+  let Size = 4;
+}
+
+class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> :
+    VOPAnyCommon <outs, ins, asm, pattern> {
+
   let VOP1 = 1;
+  let Size = 4;
+}
+
+class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> :
+    VOPAnyCommon <outs, ins, asm, pattern> {
+
+  let VOP2 = 1;
+  let Size = 4;
 }
 
 class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
+    VOPAnyCommon <outs, ins, asm, pattern> {
 
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
   // Using complex patterns gives VOP3 patterns a very high complexity rating,
   // but standalone patterns are almost always prefered, so we need to adjust the
   // priority lower.  The goal is to use a high number to reduce complexity to
@@ -83,63 +127,58 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
   let AddedComplexity = -1000;
 
   let VOP3 = 1;
-
   int Size = 8;
-  let Uses = [EXEC];
 }
 
+} // End Uses = [EXEC]
+
 //===----------------------------------------------------------------------===//
 // Scalar operations
 //===----------------------------------------------------------------------===//
 
 class SOP1e <bits<8> op> : Enc32 {
+  bits<7> sdst;
+  bits<8> ssrc0;
 
-  bits<7> SDST;
-  bits<8> SSRC0;
-
-  let Inst{7-0} = SSRC0;
+  let Inst{7-0} = ssrc0;
   let Inst{15-8} = op;
-  let Inst{22-16} = SDST;
+  let Inst{22-16} = sdst;
   let Inst{31-23} = 0x17d; //encoding;
 }
 
 class SOP2e <bits<7> op> : Enc32 {
+  bits<7> sdst;
+  bits<8> ssrc0;
+  bits<8> ssrc1;
 
-  bits<7> SDST;
-  bits<8> SSRC0;
-  bits<8> SSRC1;
-
-  let Inst{7-0} = SSRC0;
-  let Inst{15-8} = SSRC1;
-  let Inst{22-16} = SDST;
+  let Inst{7-0} = ssrc0;
+  let Inst{15-8} = ssrc1;
+  let Inst{22-16} = sdst;
   let Inst{29-23} = op;
   let Inst{31-30} = 0x2; // encoding
 }
 
 class SOPCe <bits<7> op> : Enc32 {
+  bits<8> ssrc0;
+  bits<8> ssrc1;
 
-  bits<8> SSRC0;
-  bits<8> SSRC1;
-
-  let Inst{7-0} = SSRC0;
-  let Inst{15-8} = SSRC1;
+  let Inst{7-0} = ssrc0;
+  let Inst{15-8} = ssrc1;
   let Inst{22-16} = op;
   let Inst{31-23} = 0x17e;
 }
 
 class SOPKe <bits<5> op> : Enc32 {
+  bits <7> sdst;
+  bits <16> simm16;
 
-  bits <7> SDST;
-  bits <16> SIMM16;
-
-  let Inst{15-0} = SIMM16;
-  let Inst{22-16} = SDST;
+  let Inst{15-0} = simm16;
+  let Inst{22-16} = sdst;
   let Inst{27-23} = op;
   let Inst{31-28} = 0xb; //encoding
 }
 
 class SOPPe <bits<7> op> : Enc32 {
-
   bits <16> simm16;
 
   let Inst{15-0} = simm16;
@@ -148,35 +187,36 @@ class SOPPe <bits<7> op> : Enc32 {
 }
 
 class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
+  bits<7> sdst;
+  bits<7> sbase;
+  bits<8> offset;
 
-  bits<7> SDST;
-  bits<7> SBASE;
-  bits<8> OFFSET;
-
-  let Inst{7-0} = OFFSET;
+  let Inst{7-0} = offset;
   let Inst{8} = imm;
-  let Inst{14-9} = SBASE{6-1};
-  let Inst{21-15} = SDST;
+  let Inst{14-9} = sbase{6-1};
+  let Inst{21-15} = sdst;
   let Inst{26-22} = op;
   let Inst{31-27} = 0x18; //encoding
 }
 
-class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI<outs, ins, asm, pattern>, SOP1e <op> {
-
+let SchedRW = [WriteSALU] in {
+class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI<outs, ins, asm, pattern> {
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
   let SALU = 1;
+  let SOP1 = 1;
 }
 
-class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern>, SOP2e<op> {
+class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
 
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
   let SALU = 1;
+  let SOP2 = 1;
 
   let UseNamedOperandTable = 1;
 }
@@ -189,17 +229,19 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayStore = 0;
   let hasSideEffects = 0;
   let SALU = 1;
+  let SOPC = 1;
 
   let UseNamedOperandTable = 1;
 }
 
-class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
-   InstSI <outs, ins , asm, pattern>, SOPKe<op> {
+class SOPK <dag outs, dag ins, string asm, list<dag> pattern> :
+   InstSI <outs, ins , asm, pattern> {
 
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
   let SALU = 1;
+  let SOPK = 1;
 
   let UseNamedOperandTable = 1;
 }
@@ -210,12 +252,14 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
-  let isCodeGenOnly = 0;
   let SALU = 1;
+  let SOPP = 1;
 
   let UseNamedOperandTable = 1;
 }
 
+} // let SchedRW = [WriteSALU]
+
 class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
     InstSI<outs, ins, asm, pattern> {
 
@@ -225,6 +269,7 @@ class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 1;
   let hasSideEffects = 0;
   let UseNamedOperandTable = 1;
+  let SchedRW = [WriteSMEM];
 }
 
 //===----------------------------------------------------------------------===//
@@ -232,32 +277,44 @@ class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
 //===----------------------------------------------------------------------===//
 
 class VOP1e <bits<8> op> : Enc32 {
+  bits<8> vdst;
+  bits<9> src0;
 
-  bits<8> VDST;
-  bits<9> SRC0;
-
-  let Inst{8-0} = SRC0;
+  let Inst{8-0} = src0;
   let Inst{16-9} = op;
-  let Inst{24-17} = VDST;
+  let Inst{24-17} = vdst;
   let Inst{31-25} = 0x3f; //encoding
 }
 
 class VOP2e <bits<6> op> : Enc32 {
+  bits<8> vdst;
+  bits<9> src0;
+  bits<8> src1;
 
-  bits<8> VDST;
-  bits<9> SRC0;
-  bits<8> VSRC1;
-
-  let Inst{8-0} = SRC0;
-  let Inst{16-9} = VSRC1;
-  let Inst{24-17} = VDST;
+  let Inst{8-0} = src0;
+  let Inst{16-9} = src1;
+  let Inst{24-17} = vdst;
   let Inst{30-25} = op;
   let Inst{31} = 0x0; //encoding
 }
 
-class VOP3e <bits<9> op> : Enc64 {
+class VOP2_MADKe <bits<6> op> : Enc64 {
+
+  bits<8>  vdst;
+  bits<9>  src0;
+  bits<8>  vsrc1;
+  bits<32> src2;
 
-  bits<8> dst;
+  let Inst{8-0} = src0;
+  let Inst{16-9} = vsrc1;
+  let Inst{24-17} = vdst;
+  let Inst{30-25} = op;
+  let Inst{31} = 0x0; // encoding
+  let Inst{63-32} = src2;
+}
+
+class VOP3e <bits<9> op> : Enc64 {
+  bits<8> vdst;
   bits<2> src0_modifiers;
   bits<9> src0;
   bits<2> src1_modifiers;
@@ -267,7 +324,7 @@ class VOP3e <bits<9> op> : Enc64 {
   bits<1> clamp;
   bits<2> omod;
 
-  let Inst{7-0} = dst;
+  let Inst{7-0} = vdst;
   let Inst{8} = src0_modifiers{1};
   let Inst{9} = src1_modifiers{1};
   let Inst{10} = src2_modifiers{1};
@@ -284,8 +341,7 @@ class VOP3e <bits<9> op> : Enc64 {
 }
 
 class VOP3be <bits<9> op> : Enc64 {
-
-  bits<8> dst;
+  bits<8> vdst;
   bits<2> src0_modifiers;
   bits<9> src0;
   bits<2> src1_modifiers;
@@ -295,7 +351,7 @@ class VOP3be <bits<9> op> : Enc64 {
   bits<7> sdst;
   bits<2> omod;
 
-  let Inst{7-0} = dst;
+  let Inst{7-0} = vdst;
   let Inst{14-8} = sdst;
   let Inst{25-17} = op;
   let Inst{31-26} = 0x34; //encoding
@@ -309,33 +365,30 @@ class VOP3be <bits<9> op> : Enc64 {
 }
 
 class VOPCe <bits<8> op> : Enc32 {
+  bits<9> src0;
+  bits<8> vsrc1;
 
-  bits<9> SRC0;
-  bits<8> VSRC1;
-
-  let Inst{8-0} = SRC0;
-  let Inst{16-9} = VSRC1;
+  let Inst{8-0} = src0;
+  let Inst{16-9} = vsrc1;
   let Inst{24-17} = op;
   let Inst{31-25} = 0x3e;
 }
 
 class VINTRPe <bits<2> op> : Enc32 {
+  bits<8> vdst;
+  bits<8> vsrc;
+  bits<2> attrchan;
+  bits<6> attr;
 
-  bits<8> VDST;
-  bits<8> VSRC;
-  bits<2> ATTRCHAN;
-  bits<6> ATTR;
-
-  let Inst{7-0} = VSRC;
-  let Inst{9-8} = ATTRCHAN;
-  let Inst{15-10} = ATTR;
+  let Inst{7-0} = vsrc;
+  let Inst{9-8} = attrchan;
+  let Inst{15-10} = attr;
   let Inst{17-16} = op;
-  let Inst{25-18} = VDST;
+  let Inst{25-18} = vdst;
   let Inst{31-26} = 0x32; // encoding
 }
 
 class DSe <bits<8> op> : Enc64 {
-
   bits<8> vdst;
   bits<1> gds;
   bits<8> addr;
@@ -356,7 +409,6 @@ class DSe <bits<8> op> : Enc64 {
 }
 
 class MUBUFe <bits<7> op> : Enc64 {
-
   bits<12> offset;
   bits<1> offen;
   bits<1> idxen;
@@ -387,67 +439,65 @@ class MUBUFe <bits<7> op> : Enc64 {
 }
 
 class MTBUFe <bits<3> op> : Enc64 {
+  bits<8> vdata;
+  bits<12> offset;
+  bits<1> offen;
+  bits<1> idxen;
+  bits<1> glc;
+  bits<1> addr64;
+  bits<4> dfmt;
+  bits<3> nfmt;
+  bits<8> vaddr;
+  bits<7> srsrc;
+  bits<1> slc;
+  bits<1> tfe;
+  bits<8> soffset;
 
-  bits<8> VDATA;
-  bits<12> OFFSET;
-  bits<1> OFFEN;
-  bits<1> IDXEN;
-  bits<1> GLC;
-  bits<1> ADDR64;
-  bits<4> DFMT;
-  bits<3> NFMT;
-  bits<8> VADDR;
-  bits<7> SRSRC;
-  bits<1> SLC;
-  bits<1> TFE;
-  bits<8> SOFFSET;
-
-  let Inst{11-0} = OFFSET;
-  let Inst{12} = OFFEN;
-  let Inst{13} = IDXEN;
-  let Inst{14} = GLC;
-  let Inst{15} = ADDR64;
+  let Inst{11-0} = offset;
+  let Inst{12} = offen;
+  let Inst{13} = idxen;
+  let Inst{14} = glc;
+  let Inst{15} = addr64;
   let Inst{18-16} = op;
-  let Inst{22-19} = DFMT;
-  let Inst{25-23} = NFMT;
+  let Inst{22-19} = dfmt;
+  let Inst{25-23} = nfmt;
   let Inst{31-26} = 0x3a; //encoding
-  let Inst{39-32} = VADDR;
-  let Inst{47-40} = VDATA;
-  let Inst{52-48} = SRSRC{6-2};
-  let Inst{54} = SLC;
-  let Inst{55} = TFE;
-  let Inst{63-56} = SOFFSET;
+  let Inst{39-32} = vaddr;
+  let Inst{47-40} = vdata;
+  let Inst{52-48} = srsrc{6-2};
+  let Inst{54} = slc;
+  let Inst{55} = tfe;
+  let Inst{63-56} = soffset;
 }
 
 class MIMGe <bits<7> op> : Enc64 {
-
-  bits<8> VDATA;
-  bits<4> DMASK;
-  bits<1> UNORM;
-  bits<1> GLC;
-  bits<1> DA;
-  bits<1> R128;
-  bits<1> TFE;
-  bits<1> LWE;
-  bits<1> SLC;
-  bits<8> VADDR;
-  bits<7> SRSRC;
-  bits<7> SSAMP;
-
-  let Inst{11-8} = DMASK;
-  let Inst{12} = UNORM;
-  let Inst{13} = GLC;
-  let Inst{14} = DA;
-  let Inst{15} = R128;
-  let Inst{16} = TFE;
-  let Inst{17} = LWE;
+  bits<8> vdata;
+  bits<4> dmask;
+  bits<1> unorm;
+  bits<1> glc;
+  bits<1> da;
+  bits<1> r128;
+  bits<1> tfe;
+  bits<1> lwe;
+  bits<1> slc;
+  bits<8> vaddr;
+  bits<7> srsrc;
+  bits<7> ssamp;
+
+  let Inst{11-8} = dmask;
+  let Inst{12} = unorm;
+  let Inst{13} = glc;
+  let Inst{14} = da;
+  let Inst{15} = r128;
+  let Inst{16} = tfe;
+  let Inst{17} = lwe;
   let Inst{24-18} = op;
-  let Inst{25} = SLC;
+  let Inst{25} = slc;
   let Inst{31-26} = 0x3c;
-  let Inst{39-32} = VADDR;
-  let Inst{47-40} = VDATA;
-  let Inst{52-48} = SRSRC{6-2};
-  let Inst{57-53} = SSAMP{6-2};
+  let Inst{39-32} = vaddr;
+  let Inst{47-40} = vdata;
+  let Inst{52-48} = srsrc{6-2};
+  let Inst{57-53} = ssamp{6-2};
 }
 
 class FLATe<bits<7> op> : Enc64 {
@@ -471,26 +521,26 @@ class FLATe<bits<7> op> : Enc64 {
 }
 
 class EXPe : Enc64 {
-  bits<4> EN;
-  bits<6> TGT;
-  bits<1> COMPR;
-  bits<1> DONE;
-  bits<1> VM;
-  bits<8> VSRC0;
-  bits<8> VSRC1;
-  bits<8> VSRC2;
-  bits<8> VSRC3;
-
-  let Inst{3-0} = EN;
-  let Inst{9-4} = TGT;
-  let Inst{10} = COMPR;
-  let Inst{11} = DONE;
-  let Inst{12} = VM;
+  bits<4> en;
+  bits<6> tgt;
+  bits<1> compr;
+  bits<1> done;
+  bits<1> vm;
+  bits<8> vsrc0;
+  bits<8> vsrc1;
+  bits<8> vsrc2;
+  bits<8> vsrc3;
+
+  let Inst{3-0} = en;
+  let Inst{9-4} = tgt;
+  let Inst{10} = compr;
+  let Inst{11} = done;
+  let Inst{12} = vm;
   let Inst{31-26} = 0x3e;
-  let Inst{39-32} = VSRC0;
-  let Inst{47-40} = VSRC1;
-  let Inst{55-48} = VSRC2;
-  let Inst{63-56} = VSRC3;
+  let Inst{39-32} = vsrc0;
+  let Inst{47-40} = vsrc1;
+  let Inst{55-48} = vsrc2;
+  let Inst{63-56} = vsrc3;
 }
 
 let Uses = [EXEC] in {
@@ -500,34 +550,13 @@ class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
     VOP1e<op>;
 
 class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern>, VOP2e<op> {
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let VOP2 = 1;
-}
-
-class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    VOP3Common <outs, ins, asm, pattern>, VOP3e<op>;
-
-class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    VOP3Common <outs, ins, asm, pattern>, VOP3be<op>;
+    VOP2Common <outs, ins, asm, pattern>, VOP2e<op>;
 
 class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
-    InstSI <(outs VCCReg:$dst), ins, asm, pattern>, VOPCe <op> {
-
-  let DisableEncoding = "$dst";
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let VOPC = 1;
-}
+    VOPCCommon <ins, asm, pattern>, VOPCe <op>;
 
-class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern>, VINTRPe<op> {
+class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
   let mayLoad = 1;
   let mayStore = 0;
   let hasSideEffects = 0;
@@ -541,15 +570,18 @@ class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
 
 let Uses = [EXEC] in {
 
-class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> , DSe<op> {
+class DS <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
 
   let LGKM_CNT = 1;
+  let DS = 1;
   let UseNamedOperandTable = 1;
+  let DisableEncoding = "$m0";
+  let SchedRW = [WriteLDS];
 }
 
-class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI<outs, ins, asm, pattern>, MUBUFe <op> {
+class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI<outs, ins, asm, pattern> {
 
   let VM_CNT = 1;
   let EXP_CNT = 1;
@@ -557,6 +589,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
 
   let hasSideEffects = 0;
   let UseNamedOperandTable = 1;
+  let SchedRW = [WriteVMEM];
 }
 
 class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -566,8 +599,9 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
   let EXP_CNT = 1;
   let MTBUF = 1;
 
-  let neverHasSideEffects = 1;
+  let hasSideEffects = 0;
   let UseNamedOperandTable = 1;
+  let SchedRW = [WriteVMEM];
 }
 
 class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -596,5 +630,4 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
 }
 
 
-
 } // End Uses = [EXEC]
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 8343362..4f1e5ad 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -28,8 +28,7 @@
 using namespace llvm;
 
 SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
-  : AMDGPUInstrInfo(st),
-    RI(st) { }
+    : AMDGPUInstrInfo(st), RI(st) {}
 
 //===----------------------------------------------------------------------===//
 // TargetInstrInfo callbacks
@@ -326,26 +325,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   unsigned Opcode;
   const int16_t *SubIndices;
 
-  if (AMDGPU::M0 == DestReg) {
-    // Check if M0 isn't already set to this value
-    for (MachineBasicBlock::reverse_iterator E = MBB.rend(),
-      I = MachineBasicBlock::reverse_iterator(MI); I != E; ++I) {
-
-      if (!I->definesRegister(AMDGPU::M0))
-        continue;
-
-      unsigned Opc = I->getOpcode();
-      if (Opc != TargetOpcode::COPY && Opc != AMDGPU::S_MOV_B32)
-        break;
-
-      if (!I->readsRegister(SrcReg))
-        break;
-
-      // The copy isn't necessary
-      return;
-    }
-  }
-
   if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
@@ -353,6 +332,21 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
 
   } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
+    if (DestReg == AMDGPU::VCC) {
+      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+      } else {
+        // FIXME: Hack until VReg_1 removed.
+        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC)
+          .addImm(0)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+      }
+
+      return;
+    }
+
     assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
             .addReg(SrcReg, getKillRegState(KillSrc));
@@ -373,8 +367,8 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     Opcode = AMDGPU::S_MOV_B32;
     SubIndices = Sub0_15;
 
-  } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
-    assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
+  } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
+    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
            AMDGPU::SReg_32RegClass.contains(SrcReg));
     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
             .addReg(SrcReg, getKillRegState(KillSrc));
@@ -428,27 +422,30 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
   int NewOpc;
 
   // Try to map original to commuted opcode
-  if ((NewOpc = AMDGPU::getCommuteRev(Opcode)) != -1)
+  NewOpc = AMDGPU::getCommuteRev(Opcode);
+  // Check if the commuted (REV) opcode exists on the target.
+  if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1)
     return NewOpc;
 
   // Try to map commuted to original opcode
-  if ((NewOpc = AMDGPU::getCommuteOrig(Opcode)) != -1)
+  NewOpc = AMDGPU::getCommuteOrig(Opcode);
+  // Check if the original (non-REV) opcode exists on the target.
+  if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1)
     return NewOpc;
 
   return Opcode;
 }
 
-static bool shouldTryToSpillVGPRs(MachineFunction *MF) {
-
-  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
-  const TargetMachine &TM = MF->getTarget();
-
-  // FIXME: Even though it can cause problems, we need to enable
-  // spilling at -O0, since the fast register allocator always
-  // spills registers that are live at the end of blocks.
-  return MFI->getShaderType() == ShaderType::COMPUTE &&
-         TM.getOptLevel() == CodeGenOpt::None;
+unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
 
+  if (DstRC->getSize() == 4) {
+    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
+    return AMDGPU::S_MOV_B64;
+  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
+    return  AMDGPU::V_MOV_B64_PSEUDO;
+  }
+  return AMDGPU::COPY;
 }
 
 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -458,6 +455,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
   MachineFunction *MF = MBB.getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo *FrameInfo = MF->getFrameInfo();
   DebugLoc DL = MBB.findDebugLoc(MI);
   int Opcode = -1;
@@ -473,7 +471,9 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
       case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
       case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
     }
-  } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+  } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
+    MFI->setHasSpilledVGPRs();
+
     switch(RC->getSize() * 8) {
       case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
       case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
@@ -488,12 +488,16 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     FrameInfo->setObjectAlignment(FrameIndex, 4);
     BuildMI(MBB, MI, DL, get(Opcode))
             .addReg(SrcReg)
-            .addFrameIndex(FrameIndex);
+            .addFrameIndex(FrameIndex)
+            // Place-holder registers, these will be filled in by
+            // SIPrepareScratchRegs.
+            .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+            .addReg(AMDGPU::SGPR0, RegState::Undef);
   } else {
     LLVMContext &Ctx = MF->getFunction()->getContext();
     Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                   " spill register");
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0)
+    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
             .addReg(SrcReg);
   }
 }
@@ -504,6 +508,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
   MachineFunction *MF = MBB.getParent();
+  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo *FrameInfo = MF->getFrameInfo();
   DebugLoc DL = MBB.findDebugLoc(MI);
   int Opcode = -1;
@@ -516,7 +521,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
       case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
       case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
     }
-  } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+  } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
     switch(RC->getSize() * 8) {
       case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
       case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
@@ -530,13 +535,17 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   if (Opcode != -1) {
     FrameInfo->setObjectAlignment(FrameIndex, 4);
     BuildMI(MBB, MI, DL, get(Opcode), DestReg)
-            .addFrameIndex(FrameIndex);
+            .addFrameIndex(FrameIndex)
+            // Place-holder registers, these will be filled in by
+            // SIPrepareScratchRegs.
+            .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+            .addReg(AMDGPU::SGPR0, RegState::Undef);
+
   } else {
     LLVMContext &Ctx = MF->getFunction()->getContext();
     Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                   " restore register");
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
-            .addReg(AMDGPU::VGPR0);
+    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
   }
 }
 
@@ -548,7 +557,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
                                                unsigned Size) const {
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
-  const AMDGPUSubtarget &ST = MF->getTarget().getSubtarget<AMDGPUSubtarget>();
+  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
   const SIRegisterInfo *TRI =
       static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
   DebugLoc DL = MBB.findDebugLoc(MI);
@@ -561,7 +570,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
     MachineBasicBlock::iterator Insert = Entry.front();
     DebugLoc DL = Insert->getDebugLoc();
 
-    TIDReg = RI.findUnusedVGPR(MF->getRegInfo());
+    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
     if (TIDReg == AMDGPU::NoRegister)
       return TIDReg;
 
@@ -616,7 +625,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
               .addImm(-1)
               .addImm(0);
 
-      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32),
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
               TIDReg)
               .addImm(-1)
               .addReg(TIDReg);
@@ -682,12 +691,42 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
     // This is just a placeholder for register allocation.
     MI->eraseFromParent();
     break;
+
+  case AMDGPU::V_MOV_B64_PSEUDO: {
+    unsigned Dst = MI->getOperand(0).getReg();
+    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+
+    const MachineOperand &SrcOp = MI->getOperand(1);
+    // FIXME: Will this work for 64-bit floating point immediates?
+    assert(!SrcOp.isFPImm());
+    if (SrcOp.isImm()) {
+      APInt Imm(64, SrcOp.getImm());
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+              .addImm(Imm.getLoBits(32).getZExtValue())
+              .addReg(Dst, RegState::Implicit);
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+              .addImm(Imm.getHiBits(32).getZExtValue())
+              .addReg(Dst, RegState::Implicit);
+    } else {
+      assert(SrcOp.isReg());
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+              .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
+              .addReg(Dst, RegState::Implicit);
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+              .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
+              .addReg(Dst, RegState::Implicit);
+    }
+    MI->eraseFromParent();
+    break;
+  }
   }
   return true;
 }
 
 MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
                                               bool NewMI) const {
+
   if (MI->getNumOperands() < 3)
     return nullptr;
 
@@ -709,12 +748,13 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
   // Make sure it's legal to commute operands for VOP2.
   if (isVOP2(MI->getOpcode()) &&
       (!isOperandLegal(MI, Src0Idx, &Src1) ||
-       !isOperandLegal(MI, Src1Idx, &Src0)))
+       !isOperandLegal(MI, Src1Idx, &Src0))) {
     return nullptr;
+  }
 
   if (!Src1.isReg()) {
-    // Allow commuting instructions with Imm or FPImm operands.
-    if (NewMI || (!Src1.isImm() && !Src1.isFPImm()) ||
+    // Allow commuting instructions with Imm operands.
+    if (NewMI || !Src1.isImm() ||
        (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
       return nullptr;
     }
@@ -742,8 +782,6 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
     unsigned SubReg = Src0.getSubReg();
     if (Src1.isImm())
       Src0.ChangeToImmediate(Src1.getImm());
-    else if (Src1.isFPImm())
-      Src0.ChangeToFPImmediate(Src1.getFPImm());
     else
       llvm_unreachable("Should only have immediates");
 
@@ -821,6 +859,131 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
   return RC != &AMDGPU::EXECRegRegClass;
 }
 
+static void removeModOperands(MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
+                                              AMDGPU::OpName::src0_modifiers);
+  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
+                                              AMDGPU::OpName::src1_modifiers);
+  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
+                                              AMDGPU::OpName::src2_modifiers);
+
+  MI.RemoveOperand(Src2ModIdx);
+  MI.RemoveOperand(Src1ModIdx);
+  MI.RemoveOperand(Src0ModIdx);
+}
+
+bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+                                unsigned Reg, MachineRegisterInfo *MRI) const {
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  unsigned Opc = UseMI->getOpcode();
+  if (Opc == AMDGPU::V_MAD_F32) {
+    // Don't fold if we are using source modifiers. The new VOP2 instructions
+    // don't have them.
+    if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
+        hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
+        hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
+      return false;
+    }
+
+    MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
+    MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
+    MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);
+
+    // Multiplied part is the constant: Use v_madmk_f32
+    // We should only expect these to be on src0 due to canonicalizations.
+    if (Src0->isReg() && Src0->getReg() == Reg) {
+      if (!Src1->isReg() ||
+          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+        return false;
+
+      if (!Src2->isReg() ||
+          (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))))
+        return false;
+
+      // We need to do some weird looking operand shuffling since the madmk
+      // operands are out of the normal expected order with the multiplied
+      // constant as the last operand.
+      //
+      // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1
+      // src0 -> src2 K
+      // src1 -> src0
+      // src2 -> src1
+
+      const int64_t Imm = DefMI->getOperand(1).getImm();
+
+      // FIXME: This would be a lot easier if we could return a new instruction
+      // instead of having to modify in place.
+
+      // Remove these first since they are at the end.
+      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+                                                      AMDGPU::OpName::omod));
+      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+                                                      AMDGPU::OpName::clamp));
+
+      unsigned Src1Reg = Src1->getReg();
+      unsigned Src1SubReg = Src1->getSubReg();
+      unsigned Src2Reg = Src2->getReg();
+      unsigned Src2SubReg = Src2->getSubReg();
+      Src0->setReg(Src1Reg);
+      Src0->setSubReg(Src1SubReg);
+      Src1->setReg(Src2Reg);
+      Src1->setSubReg(Src2SubReg);
+
+      Src2->ChangeToImmediate(Imm);
+
+      removeModOperands(*UseMI);
+      UseMI->setDesc(get(AMDGPU::V_MADMK_F32));
+
+      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+      if (DeleteDef)
+        DefMI->eraseFromParent();
+
+      return true;
+    }
+
+    // Added part is the constant: Use v_madak_f32
+    if (Src2->isReg() && Src2->getReg() == Reg) {
+      // Not allowed to use constant bus for another operand.
+      // We can however allow an inline immediate as src0.
+      if (!Src0->isImm() &&
+          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
+        return false;
+
+      if (!Src1->isReg() ||
+          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+        return false;
+
+      const int64_t Imm = DefMI->getOperand(1).getImm();
+
+      // FIXME: This would be a lot easier if we could return a new instruction
+      // instead of having to modify in place.
+
+      // Remove these first since they are at the end.
+      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+                                                      AMDGPU::OpName::omod));
+      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+                                                      AMDGPU::OpName::clamp));
+
+      Src2->ChangeToImmediate(Imm);
+
+      // These come before src2.
+      removeModOperands(*UseMI);
+      UseMI->setDesc(get(AMDGPU::V_MADAK_F32));
+
+      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+      if (DeleteDef)
+        DefMI->eraseFromParent();
+
+      return true;
+    }
+  }
+
+  return false;
+}
+
 bool
 SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
                                          AliasAnalysis *AA) const {
@@ -915,63 +1078,24 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
   return false;
 }
 
-namespace llvm {
-namespace AMDGPU {
-// Helper function generated by tablegen.  We are wrapping this with
-// an SIInstrInfo function that returns bool rather than int.
-int isDS(uint16_t Opcode);
-}
-}
-
-bool SIInstrInfo::isDS(uint16_t Opcode) const {
-  return ::AMDGPU::isDS(Opcode) != -1;
-}
-
-bool SIInstrInfo::isMIMG(uint16_t Opcode) const {
-  return get(Opcode).TSFlags & SIInstrFlags::MIMG;
-}
-
-bool SIInstrInfo::isSMRD(uint16_t Opcode) const {
-  return get(Opcode).TSFlags & SIInstrFlags::SMRD;
-}
-
-bool SIInstrInfo::isMUBUF(uint16_t Opcode) const {
-  return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
-}
-
-bool SIInstrInfo::isMTBUF(uint16_t Opcode) const {
-  return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
-}
-
-bool SIInstrInfo::isFLAT(uint16_t Opcode) const {
-  return get(Opcode).TSFlags & SIInstrFlags::FLAT;
-}
-
-bool SIInstrInfo::isVOP1(uint16_t Opcode) const {
-  return get(Opcode).TSFlags & SIInstrFlags::VOP1;
-}
-
-bool SIInstrInfo::isVOP2(uint16_t Opcode) const {
-  return get(Opcode).TSFlags & SIInstrFlags::VOP2;
-}
-
-bool SIInstrInfo::isVOP3(uint16_t Opcode) const {
-  return get(Opcode).TSFlags & SIInstrFlags::VOP3;
-}
-
-bool SIInstrInfo::isVOPC(uint16_t Opcode) const {
-  return get(Opcode).TSFlags & SIInstrFlags::VOPC;
-}
-
-bool SIInstrInfo::isSALUInstr(const MachineInstr &MI) const {
-  return get(MI.getOpcode()).TSFlags & SIInstrFlags::SALU;
-}
-
 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
-  int32_t Val = Imm.getSExtValue();
-  if (Val >= -16 && Val <= 64)
+  int64_t SVal = Imm.getSExtValue();
+  if (SVal >= -16 && SVal <= 64)
     return true;
 
+  if (Imm.getBitWidth() == 64) {
+    uint64_t Val = Imm.getZExtValue();
+    return (DoubleToBits(0.0) == Val) ||
+           (DoubleToBits(1.0) == Val) ||
+           (DoubleToBits(-1.0) == Val) ||
+           (DoubleToBits(0.5) == Val) ||
+           (DoubleToBits(-0.5) == Val) ||
+           (DoubleToBits(2.0) == Val) ||
+           (DoubleToBits(-2.0) == Val) ||
+           (DoubleToBits(4.0) == Val) ||
+           (DoubleToBits(-4.0) == Val);
+  }
+
   // The actual type of the operand does not seem to matter as long
   // as the bits match one of the inline immediate values.  For example:
   //
@@ -980,32 +1104,38 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
   //
   // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
   // floating-point, so it is a legal inline immediate.
-
-  return (APInt::floatToBits(0.0f) == Imm) ||
-         (APInt::floatToBits(1.0f) == Imm) ||
-         (APInt::floatToBits(-1.0f) == Imm) ||
-         (APInt::floatToBits(0.5f) == Imm) ||
-         (APInt::floatToBits(-0.5f) == Imm) ||
-         (APInt::floatToBits(2.0f) == Imm) ||
-         (APInt::floatToBits(-2.0f) == Imm) ||
-         (APInt::floatToBits(4.0f) == Imm) ||
-         (APInt::floatToBits(-4.0f) == Imm);
-}
-
-bool SIInstrInfo::isInlineConstant(const MachineOperand &MO) const {
-  if (MO.isImm())
-    return isInlineConstant(APInt(32, MO.getImm(), true));
-
-  if (MO.isFPImm()) {
-    APFloat FpImm = MO.getFPImm()->getValueAPF();
-    return isInlineConstant(FpImm.bitcastToAPInt());
+  uint32_t Val = Imm.getZExtValue();
+
+  return (FloatToBits(0.0f) == Val) ||
+         (FloatToBits(1.0f) == Val) ||
+         (FloatToBits(-1.0f) == Val) ||
+         (FloatToBits(0.5f) == Val) ||
+         (FloatToBits(-0.5f) == Val) ||
+         (FloatToBits(2.0f) == Val) ||
+         (FloatToBits(-2.0f) == Val) ||
+         (FloatToBits(4.0f) == Val) ||
+         (FloatToBits(-4.0f) == Val);
+}
+
+bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
+                                   unsigned OpSize) const {
+  if (MO.isImm()) {
+    // MachineOperand provides no way to tell the true operand size, since it
+    // only records a 64-bit value. We need to know the size to determine if a
+    // 32-bit floating point immediate bit pattern is legal for an integer
+    // immediate. It would be for any 32-bit integer operand, but would not be
+    // for a 64-bit one.
+
+    unsigned BitSize = 8 * OpSize;
+    return isInlineConstant(APInt(BitSize, MO.getImm(), true));
   }
 
   return false;
 }
 
-bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const {
-  return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
+bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
+                                    unsigned OpSize) const {
+  return MO.isImm() && !isInlineConstant(MO, OpSize);
 }
 
 static bool compareMachineOp(const MachineOperand &Op0,
@@ -1018,8 +1148,6 @@ static bool compareMachineOp(const MachineOperand &Op0,
     return Op0.getReg() == Op1.getReg();
   case MachineOperand::MO_Immediate:
     return Op0.getImm() == Op1.getImm();
-  case MachineOperand::MO_FPImmediate:
-    return Op0.getFPImm() == Op1.getFPImm();
   default:
     llvm_unreachable("Didn't expect to be comparing these operand types");
   }
@@ -1029,7 +1157,7 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
                                  const MachineOperand &MO) const {
   const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
 
-  assert(MO.isImm() || MO.isFPImm() || MO.isTargetIndex() || MO.isFI());
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
 
   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
     return true;
@@ -1037,21 +1165,26 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
   if (OpInfo.RegClass < 0)
     return false;
 
-  if (isLiteralConstant(MO))
-    return RI.regClassCanUseLiteralConstant(OpInfo.RegClass);
+  unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
+  if (isLiteralConstant(MO, OpSize))
+    return RI.opCanUseLiteralConstant(OpInfo.OperandType);
 
-  return RI.regClassCanUseInlineConstant(OpInfo.RegClass);
+  return RI.opCanUseInlineConstant(OpInfo.OperandType);
 }
 
-bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) {
+bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) const {
   switch (AS) {
   case AMDGPUAS::GLOBAL_ADDRESS: {
     // MUBUF instructions a 12-bit offset in bytes.
     return isUInt<12>(OffsetSize);
   }
   case AMDGPUAS::CONSTANT_ADDRESS: {
-    // SMRD instructions have an 8-bit offset in dwords.
-    return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
+    // SMRD instructions have an 8-bit offset in dwords on SI and
+    // a 20-bit offset in bytes on VI.
+    if (RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+      return isUInt<20>(OffsetSize);
+    else
+      return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
   }
   case AMDGPUAS::LOCAL_ADDRESS:
   case AMDGPUAS::REGION_ADDRESS: {
@@ -1066,7 +1199,11 @@ bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) {
 }
 
 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
-  return AMDGPU::getVOPe32(Opcode) != -1;
+  int Op32 = AMDGPU::getVOPe32(Opcode);
+  if (Op32 == -1)
+    return false;
+
+  return pseudoToMCOpcode(Op32) != -1;
 }
 
 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
@@ -1084,9 +1221,10 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
 }
 
 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
-                                  const MachineOperand &MO) const {
+                                  const MachineOperand &MO,
+                                  unsigned OpSize) const {
   // Literal constants use the constant bus.
-  if (isLiteralConstant(MO))
+  if (isLiteralConstant(MO, OpSize))
     return true;
 
   if (!MO.isReg() || !MO.isUse())
@@ -1132,21 +1270,35 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
 
   // Make sure the register classes are correct
   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
+    if (MI->getOperand(i).isFPImm()) {
+      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
+                "all fp values to integers.";
+      return false;
+    }
+
+    int RegClass = Desc.OpInfo[i].RegClass;
+
     switch (Desc.OpInfo[i].OperandType) {
-    case MCOI::OPERAND_REGISTER: {
-      if ((MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm()) &&
-          !isImmOperandLegal(MI, i, MI->getOperand(i))) {
-          ErrInfo = "Illegal immediate value for operand.";
-          return false;
-        }
+    case MCOI::OPERAND_REGISTER:
+      if (MI->getOperand(i).isImm()) {
+        ErrInfo = "Illegal immediate value for operand.";
+        return false;
+      }
+      break;
+    case AMDGPU::OPERAND_REG_IMM32:
+      break;
+    case AMDGPU::OPERAND_REG_INLINE_C:
+      if (isLiteralConstant(MI->getOperand(i),
+                            RI.getRegClass(RegClass)->getSize())) {
+        ErrInfo = "Illegal immediate value for operand.";
+        return false;
       }
       break;
     case MCOI::OPERAND_IMMEDIATE:
       // Check if this operand is an immediate.
       // FrameIndex operands will be replaced by immediates, so they are
       // allowed.
-      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() &&
-          !MI->getOperand(i).isFI()) {
+      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
         ErrInfo = "Expected immediate, but got non-immediate";
         return false;
       }
@@ -1158,7 +1310,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
     if (!MI->getOperand(i).isReg())
       continue;
 
-    int RegClass = Desc.OpInfo[i].RegClass;
     if (RegClass != -1) {
       unsigned Reg = MI->getOperand(i).getReg();
       if (TargetRegisterInfo::isVirtualRegister(Reg))
@@ -1175,11 +1326,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
 
   // Verify VOP*
   if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
+    // Only look at the true operands. Only a real operand can use the constant
+    // bus, and we don't want to check pseudo-operands like the source modifier
+    // flags.
+    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
     unsigned ConstantBusCount = 0;
     unsigned SGPRUsed = AMDGPU::NoRegister;
-    for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
-      const MachineOperand &MO = MI->getOperand(i);
-      if (usesConstantBus(MRI, MO)) {
+    for (int OpIdx : OpIndices) {
+      if (OpIdx == -1)
+        break;
+      const MachineOperand &MO = MI->getOperand(OpIdx);
+      if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
         if (MO.isReg()) {
           if (MO.getReg() != SGPRUsed)
             ++ConstantBusCount;
@@ -1195,31 +1353,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
     }
   }
 
-  // Verify SRC1 for VOP2 and VOPC
-  if (Src1Idx != -1 && (isVOP2(Opcode) || isVOPC(Opcode))) {
-    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
-    if (Src1.isImm() || Src1.isFPImm()) {
-      ErrInfo = "VOP[2C] src1 cannot be an immediate.";
-      return false;
-    }
-  }
-
-  // Verify VOP3
-  if (isVOP3(Opcode)) {
-    if (Src0Idx != -1 && isLiteralConstant(MI->getOperand(Src0Idx))) {
-      ErrInfo = "VOP3 src0 cannot be a literal constant.";
-      return false;
-    }
-    if (Src1Idx != -1 && isLiteralConstant(MI->getOperand(Src1Idx))) {
-      ErrInfo = "VOP3 src1 cannot be a literal constant.";
-      return false;
-    }
-    if (Src2Idx != -1 && isLiteralConstant(MI->getOperand(Src2Idx))) {
-      ErrInfo = "VOP3 src2 cannot be a literal constant.";
-      return false;
-    }
-  }
-
   // Verify misc. restrictions on specific instructions.
   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
@@ -1287,7 +1420,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
-  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32;
+  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
   }
@@ -1302,8 +1435,13 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   const MCInstrDesc &Desc = get(MI.getOpcode());
   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
-      Desc.OpInfo[OpNo].RegClass == -1)
-    return MRI.getRegClass(MI.getOperand(OpNo).getReg());
+      Desc.OpInfo[OpNo].RegClass == -1) {
+    unsigned Reg = MI.getOperand(OpNo).getReg();
+
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      return MRI.getRegClass(Reg);
+    return RI.getPhysRegClass(Reg);
+  }
 
   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
   return RI.getRegClass(RCID);
@@ -1339,7 +1477,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
     VRC = &AMDGPU::VReg_64RegClass;
   else
-    VRC = &AMDGPU::VReg_32RegClass;
+    VRC = &AMDGPU::VGPR_32RegClass;
 
   unsigned Reg = MRI.createVirtualRegister(VRC);
   DebugLoc DL = MBB->findDebugLoc(I);
@@ -1428,6 +1566,14 @@ unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
   return Dst;
 }
 
+// Change the order of operands from (0, 1, 2) to (0, 2, 1)
+void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
+  assert(Inst->getNumExplicitOperands() == 3);
+  MachineOperand Op1 = Inst->getOperand(1);
+  Inst->RemoveOperand(1);
+  Inst->addOperand(Op1);
+}
+
 bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
                                  const MachineOperand *MO) const {
   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
@@ -1438,14 +1584,16 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
   if (!MO)
     MO = &MI->getOperand(OpIdx);
 
-  if (usesConstantBus(MRI, *MO)) {
+  if (isVALU(InstDesc.Opcode) &&
+      usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
     unsigned SGPRUsed =
         MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
       if (i == OpIdx)
         continue;
-      if (usesConstantBus(MRI, MI->getOperand(i)) &&
-          MI->getOperand(i).isReg() && MI->getOperand(i).getReg() != SGPRUsed) {
+      const MachineOperand &Op = MI->getOperand(i);
+      if (Op.isReg() && Op.getReg() != SGPRUsed &&
+          usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
         return false;
       }
     }
@@ -1463,12 +1611,13 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
     //
     // s_sendmsg 0, s0 ; Operand defined as m0reg
     //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
+
     return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
   }
 
 
   // Handle non-register types that are treated like immediates.
-  assert(MO->isImm() || MO->isFPImm() || MO->isTargetIndex() || MO->isFI());
+  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
 
   if (!DefinedRC) {
     // This operand expects an immediate.
@@ -1537,7 +1686,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
           // We can use one SGPR in each VOP3 instruction.
           continue;
         }
-      } else if (!isLiteralConstant(MO)) {
+      } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) {
         // If it is not a register and not a literal constant, then it must be
         // an inline constant which is always legal.
         continue;
@@ -1641,17 +1790,18 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
 
     // SRsrcPtrLo = srsrc:sub0
     unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
-        &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
+        &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
 
     // SRsrcPtrHi = srsrc:sub1
     unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
-        &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
+        &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
 
     // Create an empty resource descriptor
     unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
     unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
 
     // Zero64 = 0
     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
@@ -1661,12 +1811,12 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
     // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
             SRsrcFormatLo)
-            .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+            .addImm(RsrcDataFormat & 0xFFFFFFFF);
 
     // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
             SRsrcFormatHi)
-            .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+            .addImm(RsrcDataFormat >> 32);
 
     // NewSRsrc = {Zero64, SRsrcFormat}
     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
@@ -1685,8 +1835,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
     if (VAddr) {
       // This is already an ADDR64 instruction so we need to add the pointer
       // extracted from the resource descriptor to the current value of VAddr.
-      NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
-      NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+      NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 
       // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
@@ -1709,9 +1859,6 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
       MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
       MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
       MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
-      assert(SOffset->isImm() && SOffset->getImm() == 0 && "Legalizing MUBUF "
-             "with non-zero soffset is not implemented");
-      (void)SOffset;
 
       // Create the new instruction.
       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
@@ -1722,6 +1869,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
                   .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                               // This will be replaced later
                                               // with the new value of vaddr.
+                  .addOperand(*SOffset)
                   .addOperand(*Offset);
 
       MI->removeFromParent();
@@ -1764,27 +1912,30 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI,
       getNamedOperand(*MI, AMDGPU::OpName::offset);
   const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
 
+  // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
+  // on VI.
   if (OffOp) {
+    bool isVI = RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+    unsigned OffScale = isVI ? 1 : 4;
     // Handle the _IMM variant
-    unsigned LoOffset = OffOp->getImm();
-    unsigned HiOffset = LoOffset + (HalfSize / 4);
+    unsigned LoOffset = OffOp->getImm() * OffScale;
+    unsigned HiOffset = LoOffset + HalfSize;
     Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
                   .addOperand(*SBase)
-                  .addImm(LoOffset);
+                  .addImm(LoOffset / OffScale);
 
-    if (!isUInt<8>(HiOffset)) {
+    if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
       unsigned OffsetSGPR =
           MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
       BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
-              .addImm(HiOffset << 2);  // The immediate offset is in dwords,
-                                       // but offset in register is in bytes.
+              .addImm(HiOffset); // The offset in register is in bytes.
       Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
                     .addOperand(*SBase)
                     .addReg(OffsetSGPR);
     } else {
       Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
                      .addOperand(*SBase)
-                     .addImm(HiOffset);
+                     .addImm(HiOffset / OffScale);
     }
   } else {
     // Handle the _SGPR variant
@@ -1849,10 +2000,13 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
         ImmOffset = 0;
       } else {
         assert(MI->getOperand(2).isImm());
-        // SMRD instructions take a dword offsets and MUBUF instructions
-        // take a byte offset.
-        ImmOffset = MI->getOperand(2).getImm() << 2;
+        // SMRD instructions take a dword offsets on SI and byte offset on VI
+        // and MUBUF instructions always take a byte offset.
+        ImmOffset = MI->getOperand(2).getImm();
+        if (RI.ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+          ImmOffset <<= 2;
         RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
         if (isUInt<12>(ImmOffset)) {
           BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
                   RegOffset)
@@ -1870,13 +2024,14 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
       unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
       unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
       unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
 
       BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
               .addImm(0);
       BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
-              .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+              .addImm(RsrcDataFormat & 0xFFFFFFFF);
       BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
-              .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+              .addImm(RsrcDataFormat >> 32);
       BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
               .addReg(DWord0)
               .addImm(AMDGPU::sub0)
@@ -1893,6 +2048,7 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
         MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false);
       }
       MI->getOperand(1).setReg(SRsrc);
+      MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0));
       MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
 
       const TargetRegisterClass *NewDstRC =
@@ -2001,6 +2157,43 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
       continue;
     }
 
+    case AMDGPU::S_LSHL_B32:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_ASHR_I32:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_LSHR_B32:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_LSHL_B64:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHLREV_B64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_ASHR_I64:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_ASHRREV_I64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_LSHR_B64:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHRREV_B64;
+        swapOperands(Inst);
+      }
+      break;
+
     case AMDGPU::S_BFE_U64:
     case AMDGPU::S_BFM_B64:
       llvm_unreachable("Moving this op to VALU not implemented");
@@ -2107,7 +2300,7 @@ unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
 }
 
 const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
-  return &AMDGPU::VReg_32RegClass;
+  return &AMDGPU::VGPR_32RegClass;
 }
 
 void SIInstrInfo::splitScalar64BitUnaryOp(
@@ -2237,7 +2430,7 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
   MachineOperand &Dest = Inst->getOperand(0);
   MachineOperand &Src = Inst->getOperand(1);
 
-  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32);
+  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
   const TargetRegisterClass *SrcRC = Src.isReg() ?
     MRI.getRegClass(Src.getReg()) :
     &AMDGPU::SGPR_32RegClass;
@@ -2419,7 +2612,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
                                    unsigned ValueReg,
                                    unsigned Address, unsigned OffsetReg) const {
   const DebugLoc &DL = MBB->findDebugLoc(I);
-  unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+  unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
                                       getIndirectIndexBegin(*MBB->getParent()));
 
   return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
@@ -2437,7 +2630,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead(
                                    unsigned ValueReg,
                                    unsigned Address, unsigned OffsetReg) const {
   const DebugLoc &DL = MBB->findDebugLoc(I);
-  unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+  unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
                                       getIndirectIndexBegin(*MBB->getParent()));
 
   return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC))
@@ -2459,7 +2652,7 @@ void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
 
 
   for (int Index = Begin; Index <= End; ++Index)
-    Reserved.set(AMDGPU::VReg_32RegClass.getRegister(Index));
+    Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index));
 
   for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
     Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
@@ -2485,3 +2678,11 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
 
   return &MI.getOperand(Idx);
 }
+
+uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
+  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
+  if (ST.isAmdHsaOS())
+    RsrcDataFormat |= (1ULL << 56);
+
+  return RsrcDataFormat;
+}
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 3bdbc9b..12dc3f3 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -17,6 +17,7 @@
 #define LLVM_LIB_TARGET_R600_SIINSTRINFO_H
 
 #include "AMDGPUInstrInfo.h"
+#include "SIDefines.h"
 #include "SIRegisterInfo.h"
 
 namespace llvm {
@@ -44,6 +45,8 @@ private:
                          const TargetRegisterClass *RC,
                          const MachineOperand &Op) const;
 
+  void swapOperands(MachineBasicBlock::iterator Inst) const;
+
   void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
                                MachineInstr *Inst, unsigned Opcode) const;
 
@@ -107,6 +110,10 @@ public:
 
   bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
 
+  // \brief Returns an opcode that can be used to move a value to a \p DstRC
+  // register.  If there is no hardware instruction that can store to \p
+  // DstRC, then AMDGPU::COPY is returned.
+  unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
   unsigned commuteOpcode(unsigned Opcode) const;
 
   MachineInstr *commuteInstruction(MachineInstr *MI,
@@ -128,27 +135,92 @@ public:
   bool isMov(unsigned Opcode) const override;
 
   bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
-  bool isDS(uint16_t Opcode) const;
-  bool isMIMG(uint16_t Opcode) const;
-  bool isSMRD(uint16_t Opcode) const;
-  bool isMUBUF(uint16_t Opcode) const;
-  bool isMTBUF(uint16_t Opcode) const;
-  bool isFLAT(uint16_t Opcode) const;
-  bool isVOP1(uint16_t Opcode) const;
-  bool isVOP2(uint16_t Opcode) const;
-  bool isVOP3(uint16_t Opcode) const;
-  bool isVOPC(uint16_t Opcode) const;
+
+  bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+                     unsigned Reg, MachineRegisterInfo *MRI) const final;
+
+  bool isSALU(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SALU;
+  }
+
+  bool isVALU(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VALU;
+  }
+
+  bool isSOP1(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SOP1;
+  }
+
+  bool isSOP2(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SOP2;
+  }
+
+  bool isSOPC(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SOPC;
+  }
+
+  bool isSOPK(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SOPK;
+  }
+
+  bool isSOPP(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SOPP;
+  }
+
+  bool isVOP1(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VOP1;
+  }
+
+  bool isVOP2(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VOP2;
+  }
+
+  bool isVOP3(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VOP3;
+  }
+
+  bool isVOPC(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VOPC;
+  }
+
+  bool isMUBUF(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
+  }
+
+  bool isMTBUF(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
+  }
+
+  bool isSMRD(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SMRD;
+  }
+
+  bool isDS(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::DS;
+  }
+
+  bool isMIMG(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::MIMG;
+  }
+
+  bool isFLAT(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::FLAT;
+  }
+
+  bool isWQM(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::WQM;
+  }
 
   bool isInlineConstant(const APInt &Imm) const;
-  bool isInlineConstant(const MachineOperand &MO) const;
-  bool isLiteralConstant(const MachineOperand &MO) const;
+  bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
+  bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
 
   bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
                          const MachineOperand &MO) const;
 
   /// \brief Return true if the given offset Size in bytes can be folded into
   /// the immediate offsets of a memory instruction for the given address space.
-  static bool canFoldOffset(unsigned OffsetSize, unsigned AS) LLVM_READNONE;
+  bool canFoldOffset(unsigned OffsetSize, unsigned AS) const;
 
   /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
   /// This function will return false if you pass it a 32-bit instruction.
@@ -156,7 +228,8 @@ public:
 
   /// \brief Returns true if this operand uses the constant bus.
   bool usesConstantBus(const MachineRegisterInfo &MRI,
-                       const MachineOperand &MO) const;
+                       const MachineOperand &MO,
+                       unsigned OpSize) const;
 
   /// \brief Return true if this instruction has any modifiers.
   ///  e.g. src[012]_mod, omod, clamp.
@@ -168,7 +241,6 @@ public:
   bool verifyInstruction(const MachineInstr *MI,
                          StringRef &ErrInfo) const override;
 
-  bool isSALUInstr(const MachineInstr &MI) const;
   static unsigned getVALUOp(const MachineInstr &MI);
 
   bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
@@ -179,7 +251,27 @@ public:
   /// the register class of its machine operand.
   /// to infer the correct register class base on the other operands.
   const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
-                                           unsigned OpNo) const;\
+                                           unsigned OpNo) const;
+
+  /// \brief Return the size in bytes of the operand OpNo on the given
+  // instruction opcode.
+  unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
+    const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo];
+
+    if (OpInfo.RegClass == -1) {
+      // If this is an immediate operand, this must be a 32-bit literal.
+      assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE);
+      return 4;
+    }
+
+    return RI.getRegClass(OpInfo.RegClass)->getSize();
+  }
+
+  /// \brief This form should usually be preferred since it handles operands
+  /// with unknown register classes.
+  unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
+    return getOpRegClass(MI, OpNo)->getSize();
+  }
 
   /// \returns true if it is legal for the operand at index \p OpNo
   /// to read a VGPR.
@@ -250,6 +342,9 @@ public:
                                         unsigned OpName) const {
     return getNamedOperand(const_cast<MachineInstr &>(MI), OpName);
   }
+
+  uint64_t getDefaultRsrcDataFormat() const;
+
 };
 
 namespace AMDGPU {
@@ -258,7 +353,6 @@ namespace AMDGPU {
   int getVOPe32(uint16_t Opcode);
   int getCommuteRev(uint16_t Opcode);
   int getCommuteOrig(uint16_t Opcode);
-  int getMCOpcode(uint16_t Opcode, unsigned Gen);
   int getAddr64Inst(uint16_t Opcode);
   int getAtomicRetOp(uint16_t Opcode);
   int getAtomicNoRetOp(uint16_t Opcode);
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 713e84e..e2747dc 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -9,35 +9,65 @@
 
 class vop {
   field bits<9> SI3;
+  field bits<10> VI3;
 }
 
-class vopc <bits<8> si> : vop {
+class vopc <bits<8> si, bits<8> vi = !add(0x40, si)> : vop {
   field bits<8> SI = si;
+  field bits<8> VI = vi;
 
-  field bits<9> SI3 = {0, si{7-0}};
+  field bits<9>  SI3 = {0, si{7-0}};
+  field bits<10> VI3 = {0, 0, vi{7-0}};
 }
 
-class vop1 <bits<8> si> : vop {
-  field bits<8> SI  = si;
+class vop1 <bits<8> si, bits<8> vi = si> : vop {
+  field bits<8> SI = si;
+  field bits<8> VI = vi;
 
-  field bits<9> SI3 = {1, 1, si{6-0}};
+  field bits<9>  SI3 = {1, 1, si{6-0}};
+  field bits<10> VI3 = !add(0x140, vi);
 }
 
-class vop2 <bits<6> si> : vop {
+class vop2 <bits<6> si, bits<6> vi = si> : vop {
   field bits<6> SI = si;
+  field bits<6> VI = vi;
+
+  field bits<9>  SI3 = {1, 0, 0, si{5-0}};
+  field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}};
+}
 
-  field bits<9> SI3 = {1, 0, 0, si{5-0}};
+// Specify a VOP2 opcode for SI and VOP3 opcode for VI
+// that doesn't have VOP2 encoding on VI
+class vop23 <bits<6> si, bits<10> vi> : vop2 <si> {
+  let VI3 = vi;
 }
 
-class vop3 <bits<9> si> : vop {
-  field bits<9> SI3 = si;
+class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop {
+  let SI3 = si;
+  let VI3 = vi;
+}
+
+class sop1 <bits<8> si, bits<8> vi = si> {
+  field bits<8> SI = si;
+  field bits<8> VI = vi;
+}
+
+class sop2 <bits<7> si, bits<7> vi = si> {
+  field bits<7> SI = si;
+  field bits<7> VI = vi;
+}
+
+class sopk <bits<5> si, bits<5> vi = si> {
+  field bits<5> SI = si;
+  field bits<5> VI = vi;
 }
 
 // Execpt for the NONE field, this must be kept in sync with the SISubtarget enum
-// in AMDGPUMCInstLower.h
+// in AMDGPUInstrInfo.cpp
 def SISubtarget {
   int NONE = -1;
   int SI = 0;
+  int VI = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -131,6 +161,22 @@ def as_i32imm: SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32);
 }]>;
 
+def as_i64imm: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i64);
+}]>;
+
+// Copied from the AArch64 backend:
+def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+  N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32);
+}]>;
+
+// Copied from the AArch64 backend:
+def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+  N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64);
+}]>;
+
 def IMM8bit : PatLeaf <(imm),
   [{return isUInt<8>(N->getZExtValue());}]
 >;
@@ -143,6 +189,10 @@ def IMM16bit : PatLeaf <(imm),
   [{return isUInt<16>(N->getZExtValue());}]
 >;
 
+def IMM20bit : PatLeaf <(imm),
+  [{return isUInt<20>(N->getZExtValue());}]
+>;
+
 def IMM32bit : PatLeaf <(imm),
   [{return isUInt<32>(N->getZExtValue());}]
 >;
@@ -156,13 +206,16 @@ class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
   return isInlineImmediate(N);
 }]>;
 
+class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
+  return isInlineImmediate(N);
+}]>;
+
 class SGPRImm <dag frag> : PatLeaf<frag, [{
-  if (TM.getSubtarget<AMDGPUSubtarget>().getGeneration() <
-      AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
     return false;
   }
   const SIRegisterInfo *SIRI =
-                       static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo());
+      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
                                                 U != E; ++U) {
     if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) {
@@ -186,6 +239,7 @@ def sopp_brtarget : Operand<OtherVT> {
 }
 
 include "SIInstrFormats.td"
+include "VIInstrFormats.td"
 
 let OperandType = "OPERAND_IMMEDIATE" in {
 
@@ -238,14 +292,15 @@ def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;
 def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
 
 def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
-def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
-def MUBUFAddr64Atomic : ComplexPattern<i64, 4, "SelectMUBUFAddr64">;
+def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">;
+def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
 def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
 def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
 def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
 
 def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
 def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
+def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
 def VOP3Mods  : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
 
 //===----------------------------------------------------------------------===//
@@ -298,7 +353,7 @@ class SIMCInstr <string pseudo, int subtarget> {
 class EXPCommon : InstSI<
   (outs),
   (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
-       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
+       VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3),
   "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
   [] > {
 
@@ -308,60 +363,157 @@ class EXPCommon : InstSI<
 
 multiclass EXP_m {
 
-  let isPseudo = 1 in {
+  let isPseudo = 1, isCodeGenOnly = 1 in {
     def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ;
   }
 
   def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe;
+
+  def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi;
 }
 
 //===----------------------------------------------------------------------===//
 // Scalar classes
 //===----------------------------------------------------------------------===//
 
-class SOP1_32 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
-  op, (outs SReg_32:$dst), (ins SSrc_32:$src0),
-  opName#" $dst, $src0", pattern
+class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+  SOP1 <outs, ins, "", pattern>,
+  SIMCInstr<opName, SISubtarget.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> :
+  SOP1 <outs, ins, asm, []>,
+  SOP1e <op.SI>,
+  SIMCInstr<opName, SISubtarget.SI>;
+
+class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> :
+  SOP1 <outs, ins, asm, []>,
+  SOP1e <op.VI>,
+  SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm,
+                   list<dag> pattern> {
+
+  def "" : SOP1_Pseudo <opName, outs, ins, pattern>;
+
+  def _si : SOP1_Real_si <op, opName, outs, ins, asm>;
+
+  def _vi : SOP1_Real_vi <op, opName, outs, ins, asm>;
+
+}
+
+multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
+    op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0),
+    opName#" $dst, $src0", pattern
 >;
 
-class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
-  op, (outs SReg_64:$dst), (ins SSrc_64:$src0),
-  opName#" $dst, $src0", pattern
+multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
+    op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0),
+    opName#" $dst, $src0", pattern
 >;
 
+// no input, 64-bit output.
+multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> {
+  def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>;
+
+  def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins),
+    opName#" $dst"> {
+    let ssrc0 = 0;
+  }
+
+  def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins),
+    opName#" $dst"> {
+    let ssrc0 = 0;
+  }
+}
+
+// 64-bit input, no output
+multiclass SOP1_1 <sop1 op, string opName, list<dag> pattern> {
+  def "" : SOP1_Pseudo <opName, (outs), (ins SReg_64:$src0), pattern>;
+
+  def _si : SOP1_Real_si <op, opName, (outs), (ins SReg_64:$src0),
+    opName#" $src0"> {
+    let sdst = 0;
+  }
+
+  def _vi : SOP1_Real_vi <op, opName, (outs), (ins SReg_64:$src0),
+    opName#" $src0"> {
+    let sdst = 0;
+  }
+}
+
 // 64-bit input, 32-bit output.
-class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
-  op, (outs SReg_32:$dst), (ins SSrc_64:$src0),
-  opName#" $dst, $src0", pattern
+multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
+    op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0),
+    opName#" $dst, $src0", pattern
 >;
 
-class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
-  op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
-  opName#" $dst, $src0, $src1", pattern
->;
+class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> :
+  SOP2<outs, ins, "", pattern>,
+  SIMCInstr<opName, SISubtarget.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let Size = 4;
 
-class SOP2_SELECT_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
-  op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
-  opName#" $dst, $src0, $src1 [$scc]", pattern
->;
+  // Pseudo instructions have no encodings, but adding this field here allows
+  // us to do:
+  // let sdst = xxx in {
+  // for multiclasses that include both real and pseudo instructions.
+  field bits<7> sdst = 0;
+}
 
-class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
-  op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
-  opName#" $dst, $src0, $src1", pattern
->;
+class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> :
+  SOP2<outs, ins, asm, []>,
+  SOP2e<op.SI>,
+  SIMCInstr<opName, SISubtarget.SI>;
+
+class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> :
+  SOP2<outs, ins, asm, []>,
+  SOP2e<op.VI>,
+  SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> {
+  def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst),
+    (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>;
+
+  def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst),
+    (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
+    opName#" $dst, $src0, $src1 [$scc]">;
+
+  def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst),
+    (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
+    opName#" $dst, $src0, $src1 [$scc]">;
+}
+
+multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm,
+                   list<dag> pattern> {
+
+  def "" : SOP2_Pseudo <opName, outs, ins, pattern>;
+
+  def _si : SOP2_Real_si <op, opName, outs, ins, asm>;
 
-class SOP2_64_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
-  op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
-  opName#" $dst, $src0, $src1", pattern
+  def _vi : SOP2_Real_vi <op, opName, outs, ins, asm>;
+
+}
+
+multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
+    op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
+    opName#" $dst, $src0, $src1", pattern
 >;
 
-class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
-  op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
-  opName#" $dst, $src0, $src1", pattern
+multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
+    op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
+    opName#" $dst, $src0, $src1", pattern
 >;
 
+multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
+    op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
+    opName#" $dst, $src0, $src1", pattern
+>;
 
-class SOPC_Helper <bits<7> op, RegisterClass rc, ValueType vt,
+class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
                     string opName, PatLeaf cond> : SOPC <
   op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1),
   opName#" $dst, $src0, $src1", []>;
@@ -372,15 +524,44 @@ class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
 class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL>
   : SOPC_Helper<op, SSrc_64, i64, opName, cond>;
 
-class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK <
-  op, (outs SReg_32:$dst), (ins u16imm:$src0),
-  opName#" $dst, $src0", pattern
->;
+class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+  SOPK <outs, ins, "", pattern>,
+  SIMCInstr<opName, SISubtarget.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
 
-class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK <
-  op, (outs SReg_64:$dst), (ins u16imm:$src0),
-  opName#" $dst, $src0", pattern
->;
+class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> :
+  SOPK <outs, ins, asm, []>,
+  SOPKe <op.SI>,
+  SIMCInstr<opName, SISubtarget.SI>;
+
+class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> :
+  SOPK <outs, ins, asm, []>,
+  SOPKe <op.VI>,
+  SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> {
+  def "" : SOPK_Pseudo <opName, (outs SReg_32:$dst), (ins u16imm:$src0),
+    pattern>;
+
+  def _si : SOPK_Real_si <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0),
+    opName#" $dst, $src0">;
+
+  def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0),
+    opName#" $dst, $src0">;
+}
+
+multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> {
+  def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst),
+    (ins SReg_32:$src0, u16imm:$src1), pattern>;
+
+  def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst),
+    (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0">;
+
+  def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst),
+    (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0">;
+}
 
 //===----------------------------------------------------------------------===//
 // SMRD classes
@@ -390,6 +571,7 @@ class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
   SMRD <outs, ins, "", pattern>,
   SIMCInstr<opName, SISubtarget.NONE> {
   let isPseudo = 1;
+  let isCodeGenOnly = 1;
 }
 
 class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins,
@@ -398,6 +580,12 @@ class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins,
   SMRDe <op, imm>,
   SIMCInstr<opName, SISubtarget.SI>;
 
+class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins,
+                    string asm> :
+  SMRD <outs, ins, asm, []>,
+  SMEMe_vi <op, imm>,
+  SIMCInstr<opName, SISubtarget.VI>;
+
 multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins,
                    string asm, list<dag> pattern> {
 
@@ -405,6 +593,11 @@ multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins,
 
   def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>;
 
+  // glc is only applicable to scalar stores, which are not yet
+  // implemented.
+  let glc = 0 in {
+    def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>;
+  }
 }
 
 multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass,
@@ -444,44 +637,27 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> {
 // Returns the register class to use for the destination of VOP[123C]
 // instructions for the given VT.
 class getVALUDstForVT<ValueType VT> {
-  RegisterClass ret = !if(!eq(VT.Size, 32), VReg_32, VReg_64);
+  RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32,
+                          !if(!eq(VT.Size, 64), VReg_64,
+                            SReg_64)); // else VT == i1
 }
 
 // Returns the register class to use for source 0 of VOP[12C]
 // instructions for the given VT.
 class getVOPSrc0ForVT<ValueType VT> {
-  RegisterClass ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64);
+  RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64);
 }
 
 // Returns the register class to use for source 1 of VOP[12C] for the
 // given VT.
 class getVOPSrc1ForVT<ValueType VT> {
-  RegisterClass ret = !if(!eq(VT.Size, 32), VReg_32, VReg_64);
-}
-
-// Returns the register classes for the source arguments of a VOP[12C]
-// instruction for the given SrcVTs.
-class getInRC32 <list<ValueType> SrcVT> {
-  list<RegisterClass> ret = [
-    getVOPSrc0ForVT<SrcVT[0]>.ret,
-    getVOPSrc1ForVT<SrcVT[1]>.ret
-  ];
+  RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64);
 }
 
 // Returns the register class to use for sources of VOP3 instructions for the
 // given VT.
 class getVOP3SrcForVT<ValueType VT> {
-  RegisterClass ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64);
-}
-
-// Returns the register classes for the source arguments of a VOP3
-// instruction for the given SrcVTs.
-class getInRC64 <list<ValueType> SrcVT> {
-  list<RegisterClass> ret = [
-    getVOP3SrcForVT<SrcVT[0]>.ret,
-    getVOP3SrcForVT<SrcVT[1]>.ret,
-    getVOP3SrcForVT<SrcVT[2]>.ret
-  ];
+  RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64);
 }
 
 // Returns 1 if the source arguments have modifiers, 0 if they do not.
@@ -491,15 +667,15 @@ class hasModifiers<ValueType SrcVT> {
 }
 
 // Returns the input arguments for VOP[12C] instructions for the given SrcVT.
-class getIns32 <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
+class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
   dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0),               // VOP1
             !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2
                                     (ins)));
 }
 
 // Returns the input arguments for VOP3 instructions for the given SrcVT.
-class getIns64 <RegisterClass Src0RC, RegisterClass Src1RC,
-                RegisterClass Src2RC, int NumSrcArgs,
+class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
+                RegisterOperand Src2RC, int NumSrcArgs,
                 bit HasModifiers> {
 
   dag ret =
@@ -549,7 +725,7 @@ class getAsm32 <int NumSrcArgs> {
 // Returns the assembly string for the inputs and outputs of a VOP3
 // instruction.
 class getAsm64 <int NumSrcArgs, bit HasModifiers> {
-  string src0 = "$src0_modifiers,";
+  string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
   string src1 = !if(!eq(NumSrcArgs, 1), "",
                    !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
                                            " $src1_modifiers,"));
@@ -570,11 +746,11 @@ class VOPProfile <list<ValueType> _ArgVT> {
   field ValueType Src1VT = ArgVT[2];
   field ValueType Src2VT = ArgVT[3];
   field RegisterClass DstRC = getVALUDstForVT<DstVT>.ret;
-  field RegisterClass Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
+  field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
   field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret;
-  field RegisterClass Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
-  field RegisterClass Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
-  field RegisterClass Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
+  field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
+  field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
+  field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
 
   field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret;
   field bit HasModifiers = hasModifiers<Src0VT>.ret;
@@ -604,14 +780,31 @@ def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>;
 def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>;
 def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
 def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
+def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
 def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
 def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> {
   let Src0RC32 = VCSrc_32;
 }
+
+def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> {
+  let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+  let Asm64 = " $dst, $src0_modifiers, $src1";
+}
+
+def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> {
+  let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+  let Asm64 = " $dst, $src0_modifiers, $src1";
+}
+
 def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
+def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
 def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
 
 def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;
+def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> {
+  field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2);
+  field string Asm = " $dst, $src0, $vsrc1, $src2";
+}
 def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
 def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
 def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
@@ -633,8 +826,13 @@ class AtomicNoRet <string noRetOp, bit isRet> {
 
 class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
   VOP1Common <outs, ins, "", pattern>,
-  SIMCInstr<opName, SISubtarget.NONE> {
+  VOP <opName>,
+  SIMCInstr <opName#"_e32", SISubtarget.NONE> {
   let isPseudo = 1;
+  let isCodeGenOnly = 1;
+
+  field bits<8> vdst;
+  field bits<9> src0;
 }
 
 multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern,
@@ -642,32 +840,99 @@ multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern,
   def "" : VOP1_Pseudo <outs, ins, pattern, opName>;
 
   def _si : VOP1<op.SI, outs, ins, asm, []>,
-            SIMCInstr <opName, SISubtarget.SI>;
+            SIMCInstr <opName#"_e32", SISubtarget.SI>;
+  def _vi : VOP1<op.VI, outs, ins, asm, []>,
+            SIMCInstr <opName#"_e32", SISubtarget.VI>;
+}
+
+multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern,
+                   string opName> {
+  def "" : VOP1_Pseudo <outs, ins, pattern, opName>;
+
+  def _si : VOP1<op.SI, outs, ins, asm, []>,
+            SIMCInstr <opName#"_e32", SISubtarget.SI>;
+  // No VI instruction. This class is for SI only.
+}
+
+class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
+  VOP2Common <outs, ins, "", pattern>,
+  VOP <opName>,
+  SIMCInstr<opName#"_e32", SISubtarget.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern,
+                     string opName, string revOp> {
+  def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
+           VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+  def _si : VOP2 <op.SI, outs, ins, opName#asm, []>,
+            SIMCInstr <opName#"_e32", SISubtarget.SI>;
+}
+
+multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern,
+                   string opName, string revOp> {
+  def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
+           VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+  def _si : VOP2 <op.SI, outs, ins, opName#asm, []>,
+            SIMCInstr <opName#"_e32", SISubtarget.SI>;
+  def _vi : VOP2 <op.VI, outs, ins, opName#asm, []>,
+            SIMCInstr <opName#"_e32", SISubtarget.VI>;
 }
 
 class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> {
 
   bits<2> src0_modifiers = !if(HasModifiers, ?, 0);
   bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0);
-  bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ? ,0) ,0);
+  bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0);
   bits<2> omod = !if(HasModifiers, ?, 0);
   bits<1> clamp = !if(HasModifiers, ?, 0);
   bits<9> src1 = !if(HasSrc1, ?, 0);
   bits<9> src2 = !if(HasSrc2, ?, 0);
 }
 
+class VOP3DisableModFields <bit HasSrc0Mods,
+                            bit HasSrc1Mods = 0,
+                            bit HasSrc2Mods = 0,
+                            bit HasOutputMods = 0> {
+  bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0);
+  bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0);
+  bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0);
+  bits<2> omod = !if(HasOutputMods, ?, 0);
+  bits<1> clamp = !if(HasOutputMods, ?, 0);
+}
+
 class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
   VOP3Common <outs, ins, "", pattern>,
   VOP <opName>,
-  SIMCInstr<opName, SISubtarget.NONE> {
+  SIMCInstr<opName#"_e64", SISubtarget.NONE> {
   let isPseudo = 1;
+  let isCodeGenOnly = 1;
 }
 
 class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :
-  VOP3 <op, outs, ins, asm, []>,
-  SIMCInstr<opName, SISubtarget.SI>;
-
-multiclass VOP3_m <vop3 op, dag outs, dag ins, string asm, list<dag> pattern,
+  VOP3Common <outs, ins, asm, []>,
+  VOP3e <op>,
+  SIMCInstr<opName#"_e64", SISubtarget.SI>;
+
+class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> :
+  VOP3Common <outs, ins, asm, []>,
+  VOP3e_vi <op>,
+  SIMCInstr <opName#"_e64", SISubtarget.VI>;
+
+class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :
+  VOP3Common <outs, ins, asm, []>,
+  VOP3be <op>,
+  SIMCInstr<opName#"_e64", SISubtarget.SI>;
+
+class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> :
+  VOP3Common <outs, ins, asm, []>,
+  VOP3be_vi <op>,
+  SIMCInstr <opName#"_e64", SISubtarget.VI>;
+
+multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
                    string opName, int NumSrcArgs, bit HasMods = 1> {
 
   def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
@@ -676,7 +941,26 @@ multiclass VOP3_m <vop3 op, dag outs, dag ins, string asm, list<dag> pattern,
             VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
                               !if(!eq(NumSrcArgs, 2), 0, 1),
                               HasMods>;
+  def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+            VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
+                              !if(!eq(NumSrcArgs, 2), 0, 1),
+                              HasMods>;
+}
+
+// VOP3_m without source modifiers
+multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern,
+                   string opName, int NumSrcArgs, bit HasMods = 1> {
 
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+  let src0_modifiers = 0,
+      src1_modifiers = 0,
+      src2_modifiers = 0,
+      clamp = 0,
+      omod = 0 in {
+    def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>;
+    def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>;
+  }
 }
 
 multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm,
@@ -686,6 +970,19 @@ multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm,
 
   def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
             VOP3DisableFields<0, 0, HasMods>;
+
+  def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+            VOP3DisableFields<0, 0, HasMods>;
+}
+
+multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm,
+                     list<dag> pattern, string opName, bit HasMods = 1> {
+
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+  def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+            VOP3DisableFields<0, 0, HasMods>;
+  // No VI instruction. This class is for SI only.
 }
 
 multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
@@ -695,12 +992,28 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
   def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
            VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
 
-  def _si : VOP3_Real_si <op.SI3,
-              outs, ins, asm, opName>,
-            VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>,
+  def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+            VOP3DisableFields<1, 0, HasMods>;
+
+  def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+            VOP3DisableFields<1, 0, HasMods>;
+}
+
+multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm,
+                     list<dag> pattern, string opName, string revOp,
+                     bit HasMods = 1, bit UseFullOp = 0> {
+
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+           VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
+
+  def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
             VOP3DisableFields<1, 0, HasMods>;
+
+  // No VI instruction. This class is for SI only.
 }
 
+// XXX - Is v_div_scale_{f32|f64} only available in vop3b without
+// option of implicit vcc use?
 multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm,
                       list<dag> pattern, string opName, string revOp,
                       bit HasMods = 1, bit UseFullOp = 0> {
@@ -711,13 +1024,27 @@ multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm,
   // can write it into any SGPR. We currently don't use the carry out,
   // so for now hardcode it to VCC as well.
   let sdst = SIOperand.VCC, Defs = [VCC] in {
-    def _si : VOP3b <op.SI3, outs, ins, asm, pattern>,
-              VOP3DisableFields<1, 0, HasMods>,
-              SIMCInstr<opName, SISubtarget.SI>,
-              VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>;
+    def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>,
+              VOP3DisableFields<1, 0, HasMods>;
+
+    def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>,
+              VOP3DisableFields<1, 0, HasMods>;
   } // End sdst = SIOperand.VCC, Defs = [VCC]
 }
 
+multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm,
+                      list<dag> pattern, string opName, string revOp,
+                      bit HasMods = 1, bit UseFullOp = 0> {
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+
+  def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>,
+            VOP3DisableFields<1, 1, HasMods>;
+
+  def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>,
+            VOP3DisableFields<1, 1, HasMods>;
+}
+
 multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
                      list<dag> pattern, string opName,
                      bit HasMods, bit defExec> {
@@ -725,17 +1052,39 @@ multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
   def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
 
   def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
-              VOP3DisableFields<1, 0, HasMods> {
+            VOP3DisableFields<1, 0, HasMods> {
+    let Defs = !if(defExec, [EXEC], []);
+  }
+
+  def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+            VOP3DisableFields<1, 0, HasMods> {
     let Defs = !if(defExec, [EXEC], []);
   }
 }
 
+// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers.
+multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins,
+                         string asm, list<dag> pattern = []> {
+  let isPseudo = 1, isCodeGenOnly = 1 in {
+    def "" : VOPAnyCommon <outs, ins, "", pattern>,
+             SIMCInstr<opName, SISubtarget.NONE>;
+  }
+
+  def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>,
+            SIMCInstr <opName, SISubtarget.SI>;
+
+  def _vi : VOP3Common <outs, ins, asm, []>,
+            VOP3e_vi <op.VI3>,
+            VOP3DisableFields <1, 0, 0>,
+            SIMCInstr <opName, SISubtarget.VI>;
+}
+
 multiclass VOP1_Helper <vop1 op, string opName, dag outs,
                         dag ins32, string asm32, list<dag> pat32,
                         dag ins64, string asm64, list<dag> pat64,
                         bit HasMods> {
 
-  def _e32 : VOP1 <op.SI, outs, ins32, opName#asm32, pat32>, VOP<opName>;
+  defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>;
 
   defm _e64 : VOP3_1_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, HasMods>;
 }
@@ -752,17 +1101,24 @@ multiclass VOP1Inst <vop1 op, string opName, VOPProfile P,
   P.HasModifiers
 >;
 
-class VOP2_e32 <bits<6> op, string opName, dag outs, dag ins, string asm,
-                list<dag> pattern, string revOp> :
-  VOP2 <op, outs, ins, opName#asm, pattern>,
-  VOP <opName>,
-  VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
+multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P,
+                       SDPatternOperator node = null_frag> {
+
+  defm _e32 : VOP1SI_m <op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>;
+
+  defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64,
+    !if(P.HasModifiers,
+      [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+                                i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
+      [(set P.DstVT:$dst, (node P.Src0VT:$src0))]),
+    opName, P.HasModifiers>;
+}
 
 multiclass VOP2_Helper <vop2 op, string opName, dag outs,
                         dag ins32, string asm32, list<dag> pat32,
                         dag ins64, string asm64, list<dag> pat64,
                         string revOp, bit HasMods> {
-  def _e32 : VOP2_e32 <op.SI, opName, outs, ins32, asm32, pat32, revOp>;
+  defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>;
 
   defm _e64 : VOP3_2_m <op,
     outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods
@@ -784,12 +1140,27 @@ multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
   revOp, P.HasModifiers
 >;
 
+multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P,
+                       SDPatternOperator node = null_frag,
+                       string revOp = opName> {
+  defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>;
+
+  defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#"_e64"#P.Asm64,
+    !if(P.HasModifiers,
+        [(set P.DstVT:$dst,
+             (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+                                        i1:$clamp, i32:$omod)),
+                   (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+        [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+    opName, revOp, P.HasModifiers>;
+}
+
 multiclass VOP2b_Helper <vop2 op, string opName, dag outs,
                          dag ins32, string asm32, list<dag> pat32,
                          dag ins64, string asm64, list<dag> pat64,
                          string revOp, bit HasMods> {
 
-  def _e32 : VOP2_e32 <op.SI, opName, outs, ins32, asm32, pat32, revOp>;
+  defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>;
 
   defm _e64 : VOP3b_2_m <op,
     outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods
@@ -811,16 +1182,94 @@ multiclass VOP2bInst <vop2 op, string opName, VOPProfile P,
   revOp, P.HasModifiers
 >;
 
+// A VOP2 instruction that is VOP3-only on VI.
+multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs,
+                            dag ins32, string asm32, list<dag> pat32,
+                            dag ins64, string asm64, list<dag> pat64,
+                            string revOp, bit HasMods> {
+  defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>;
+
+  defm _e64 : VOP3_2_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName,
+                        revOp, HasMods>;
+}
+
+multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P,
+                          SDPatternOperator node = null_frag,
+                          string revOp = opName>
+                          : VOP2_VI3_Helper <
+  op, opName, P.Outs,
+  P.Ins32, P.Asm32, [],
+  P.Ins64, P.Asm64,
+  !if(P.HasModifiers,
+      [(set P.DstVT:$dst,
+           (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+                                      i1:$clamp, i32:$omod)),
+                 (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+      [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+  revOp, P.HasModifiers
+>;
+
+multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> {
+
+  def "" : VOP2_Pseudo <VOP_MADK.Outs, VOP_MADK.Ins, pattern, opName>;
+
+let isCodeGenOnly = 0 in {
+  def _si : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins,
+                        !strconcat(opName, VOP_MADK.Asm), []>,
+            SIMCInstr <opName#"_e32", SISubtarget.SI>,
+            VOP2_MADKe <op.SI>;
+
+  def _vi : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins,
+                        !strconcat(opName, VOP_MADK.Asm), []>,
+            SIMCInstr <opName#"_e32", SISubtarget.VI>,
+            VOP2_MADKe <op.VI>;
+} // End isCodeGenOnly = 0
+}
+
+class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
+  VOPCCommon <ins, "", pattern>,
+  VOP <opName>,
+  SIMCInstr<opName#"_e32", SISubtarget.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern,
+                   string opName, bit DefExec> {
+  def "" : VOPC_Pseudo <outs, ins, pattern, opName>;
+
+  def _si : VOPC<op.SI, ins, asm, []>,
+            SIMCInstr <opName#"_e32", SISubtarget.SI> {
+    let Defs = !if(DefExec, [EXEC], []);
+  }
+
+  def _vi : VOPC<op.VI, ins, asm, []>,
+            SIMCInstr <opName#"_e32", SISubtarget.VI> {
+    let Defs = !if(DefExec, [EXEC], []);
+  }
+}
+
 multiclass VOPC_Helper <vopc op, string opName,
                         dag ins32, string asm32, list<dag> pat32,
                         dag out64, dag ins64, string asm64, list<dag> pat64,
                         bit HasMods, bit DefExec> {
-  def _e32 : VOPC <op.SI, ins32, opName#asm32, pat32>, VOP <opName> {
-    let Defs = !if(DefExec, [EXEC], []);
-  }
+  defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>;
+
+  defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64,
+                        opName, HasMods, DefExec>;
+}
+
+// Special case for class instructions which only have modifiers on
+// the 1st source operand.
+multiclass VOPC_Class_Helper <vopc op, string opName,
+                             dag ins32, string asm32, list<dag> pat32,
+                             dag out64, dag ins64, string asm64, list<dag> pat64,
+                             bit HasMods, bit DefExec> {
+  defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>;
 
-  defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64, opName,
-                        HasMods, DefExec>;
+  defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64,
+                        opName, HasMods, DefExec>,
+                        VOP3DisableModFields<1, 0, 0>;
 }
 
 multiclass VOPCInst <vopc op, string opName,
@@ -839,6 +1288,19 @@ multiclass VOPCInst <vopc op, string opName,
   P.HasModifiers, DefExec
 >;
 
+multiclass VOPCClassInst <vopc op, string opName, VOPProfile P,
+                     bit DefExec = 0> : VOPC_Class_Helper <
+  op, opName,
+  P.Ins32, P.Asm32, [],
+  (outs SReg_64:$dst), P.Ins64, P.Asm64,
+  !if(P.HasModifiers,
+      [(set i1:$dst,
+          (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))],
+      [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
+  P.HasModifiers, DefExec
+>;
+
+
 multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
   VOPCInst <op, opName, VOP_F32_F32_F32, cond>;
 
@@ -873,6 +1335,18 @@ multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
     op, outs, ins, opName#asm, pat, opName, NumSrcArgs, HasMods
 >;
 
+multiclass VOPC_CLASS_F32 <vopc op, string opName> :
+  VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>;
+
+multiclass VOPCX_CLASS_F32 <vopc op, string opName> :
+  VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>;
+
+multiclass VOPC_CLASS_F64 <vopc op, string opName> :
+  VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>;
+
+multiclass VOPCX_CLASS_F64 <vopc op, string opName> :
+  VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>;
+
 multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
                      SDPatternOperator node = null_frag> : VOP3_Helper <
   op, opName, P.Outs, P.Ins64, P.Asm64,
@@ -901,9 +1375,31 @@ multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
   P.NumSrcArgs, P.HasModifiers
 >;
 
-multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterClass arc,
+// Special case for v_div_fmas_{f32|f64}, since it seems to be the
+// only VOP instruction that implicitly reads VCC.
+multiclass VOP3_VCC_Inst <vop3 op, string opName,
+                          VOPProfile P,
+                          SDPatternOperator node = null_frag> : VOP3_Helper <
+  op, opName,
+  P.Outs,
+  (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0,
+       InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1,
+       InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2,
+       ClampMod:$clamp,
+       omod:$omod),
+  " $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod",
+  [(set P.DstVT:$dst,
+            (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+                                       i1:$clamp, i32:$omod)),
+                  (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+                  (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)),
+                  (i1 VCC)))],
+  3, 1
+>;
+
+multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc,
                     string opName, list<dag> pattern> :
-  VOP3b_2_m <
+  VOP3b_3_m <
   op, (outs vrc:$vdst, SReg_64:$sdst),
       (ins InputModsNoDefault:$src0_modifiers, arc:$src0,
            InputModsNoDefault:$src1_modifiers, arc:$src1,
@@ -917,7 +1413,7 @@ multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> :
   VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
 
 multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> :
-  VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>;
+  VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>;
 
 
 class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
@@ -931,124 +1427,259 @@ class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
         i32:$omod)>;
 
 //===----------------------------------------------------------------------===//
+// Interpolation opcodes
+//===----------------------------------------------------------------------===//
+
+class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+  VINTRPCommon <outs, ins, "", pattern>,
+  SIMCInstr<opName, SISubtarget.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins,
+                      string asm> :
+  VINTRPCommon <outs, ins, asm, []>,
+  VINTRPe <op>,
+  SIMCInstr<opName, SISubtarget.SI>;
+
+class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins,
+                      string asm> :
+  VINTRPCommon <outs, ins, asm, []>,
+  VINTRPe_vi <op>,
+  SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass VINTRP_m <bits <2> op, string opName, dag outs, dag ins, string asm,
+                     string disableEncoding = "", string constraints = "",
+                     list<dag> pattern = []> {
+  let DisableEncoding = disableEncoding,
+      Constraints = constraints in {
+    def "" : VINTRP_Pseudo <opName, outs, ins, pattern>;
+
+    def _si : VINTRP_Real_si <op, opName, outs, ins, asm>;
+
+    def _vi : VINTRP_Real_vi <op, opName, outs, ins, asm>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
 // Vector I/O classes
 //===----------------------------------------------------------------------===//
 
-class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> :
-    DS <op, outs, ins, asm, pat> {
+class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+  DS <outs, ins, "", pattern>,
+  SIMCInstr <opName, SISubtarget.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
+  DS <outs, ins, asm, []>,
+  DSe <op>,
+  SIMCInstr <opName, SISubtarget.SI>;
+
+class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
+  DS <outs, ins, asm, []>,
+  DSe_vi <op>,
+  SIMCInstr <opName, SISubtarget.VI>;
+
+class DS_1A_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
+  DS <outs, ins, asm, []>,
+  DSe <op>,
+  SIMCInstr <opName, SISubtarget.SI> {
+
+  // Single load interpret the 2 i8imm operands as a single i16 offset.
   bits<16> offset;
+  let offset0 = offset{7-0};
+  let offset1 = offset{15-8};
+}
+
+class DS_1A_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
+  DS <outs, ins, asm, []>,
+  DSe_vi <op>,
+  SIMCInstr <opName, SISubtarget.VI> {
 
   // Single load interpret the 2 i8imm operands as a single i16 offset.
+  bits<16> offset;
   let offset0 = offset{7-0};
   let offset1 = offset{15-8};
+}
+
+multiclass DS_1A_Load_m <bits<8> op, string opName, dag outs, dag ins, string asm,
+                         list<dag> pat> {
+  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+    def "" : DS_Pseudo <opName, outs, ins, pat>;
 
-  let hasSideEffects = 0;
+    let data0 = 0, data1 = 0 in {
+      def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+      def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+    }
+  }
 }
 
-class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+multiclass DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass>
+    : DS_1A_Load_m <
   op,
+  asm,
   (outs regClass:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, ds_offset:$offset),
-  asm#" $vdst, $addr"#"$offset"#" [M0]",
-  []> {
-  let data0 = 0;
-  let data1 = 0;
-  let mayLoad = 1;
-  let mayStore = 0;
+  (ins i1imm:$gds, VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0),
+  asm#" $vdst, $addr"#"$offset",
+  []>;
+
+multiclass DS_Load2_m <bits<8> op, string opName, dag outs, dag ins, string asm,
+                       list<dag> pat> {
+  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+    def "" : DS_Pseudo <opName, outs, ins, pat>;
+
+    let data0 = 0, data1 = 0 in {
+      def _si : DS_Real_si <op, opName, outs, ins, asm>;
+      def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+    }
+  }
 }
 
-class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+multiclass DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass>
+    : DS_Load2_m <
   op,
+  asm,
   (outs regClass:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1),
-  asm#" $vdst, $addr"#"$offset0"#"$offset1 [M0]",
-  []> {
-  let data0 = 0;
-  let data1 = 0;
-  let mayLoad = 1;
-  let mayStore = 0;
-  let hasSideEffects = 0;
+  (ins i1imm:$gds, VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
+        M0Reg:$m0),
+  asm#" $vdst, $addr"#"$offset0"#"$offset1",
+  []>;
+
+multiclass DS_1A_Store_m <bits<8> op, string opName, dag outs, dag ins,
+                          string asm, list<dag> pat> {
+  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+    def "" : DS_Pseudo <opName, outs, ins, pat>;
+
+    let data1 = 0, vdst = 0 in {
+      def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+      def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+    }
+  }
 }
 
-class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+multiclass DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass>
+    : DS_1A_Store_m <
   op,
+  asm,
   (outs),
-  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, ds_offset:$offset),
-  asm#" $addr, $data0"#"$offset"#" [M0]",
-  []> {
-  let data1 = 0;
-  let mayStore = 1;
-  let mayLoad = 0;
-  let vdst = 0;
+  (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, ds_offset:$offset, M0Reg:$m0),
+  asm#" $addr, $data0"#"$offset",
+  []>;
+
+multiclass DS_Store_m <bits<8> op, string opName, dag outs, dag ins,
+                       string asm, list<dag> pat> {
+  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+    def "" : DS_Pseudo <opName, outs, ins, pat>;
+
+    let vdst = 0 in {
+      def _si : DS_Real_si <op, opName, outs, ins, asm>;
+      def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+    }
+  }
 }
 
-class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+multiclass DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass>
+    : DS_Store_m <
   op,
+  asm,
   (outs),
-  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, regClass:$data1,
-       ds_offset0:$offset0, ds_offset1:$offset1),
-  asm#" $addr, $data0, $data1"#"$offset0"#"$offset1 [M0]",
-  []> {
-  let mayStore = 1;
-  let mayLoad = 0;
-  let hasSideEffects = 0;
-  let vdst = 0;
-}
+  (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, regClass:$data1,
+       ds_offset0:$offset0, ds_offset1:$offset1, M0Reg:$m0),
+  asm#" $addr, $data0, $data1"#"$offset0"#"$offset1",
+  []>;
 
 // 1 address, 1 data.
-class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A <
-  op,
-  (outs rc:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset),
-  asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", []>,
-  AtomicNoRet<noRetOp, 1> {
+multiclass DS_1A1D_RET_m <bits<8> op, string opName, dag outs, dag ins,
+                          string asm, list<dag> pat, string noRetOp> {
+  let mayLoad = 1, mayStore = 1,
+      hasPostISelHook = 1 // Adjusted to no return version.
+      in {
+    def "" : DS_Pseudo <opName, outs, ins, pat>,
+             AtomicNoRet<noRetOp, 1>;
+
+    let data1 = 0 in {
+      def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+      def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+    }
+  }
+}
 
-  let data1 = 0;
-  let mayStore = 1;
-  let mayLoad = 1;
+multiclass DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc,
+                        string noRetOp = ""> : DS_1A1D_RET_m <
+  op, asm,
+  (outs rc:$vdst),
+  (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0),
+  asm#" $vdst, $addr, $data0"#"$offset", [], noRetOp>;
 
-  let hasPostISelHook = 1; // Adjusted to no return version.
+// 1 address, 2 data.
+multiclass DS_1A2D_RET_m <bits<8> op, string opName, dag outs, dag ins,
+                          string asm, list<dag> pat, string noRetOp> {
+  let mayLoad = 1, mayStore = 1,
+      hasPostISelHook = 1 // Adjusted to no return version.
+      in {
+    def "" : DS_Pseudo <opName, outs, ins, pat>,
+             AtomicNoRet<noRetOp, 1>;
+
+    def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+  }
 }
 
-// 1 address, 2 data.
-class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A <
-  op,
+multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc,
+                   string noRetOp = ""> : DS_1A2D_RET_m <
+  op, asm,
   (outs rc:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset),
-  asm#" $vdst, $addr, $data0, $data1"#"$offset"#" [M0]",
-  []>,
-  AtomicNoRet<noRetOp, 1> {
-  let mayStore = 1;
-  let mayLoad = 1;
-  let hasPostISelHook = 1; // Adjusted to no return version.
-}
+  (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0),
+  asm#" $vdst, $addr, $data0, $data1"#"$offset",
+  [], noRetOp>;
 
 // 1 address, 2 data.
-class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A <
-  op,
-  (outs),
-  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset),
-  asm#" $addr, $data0, $data1"#"$offset"#" [M0]",
-  []>,
-  AtomicNoRet<noRetOp, 0> {
-  let mayStore = 1;
-  let mayLoad = 1;
+multiclass DS_1A2D_NORET_m <bits<8> op, string opName, dag outs, dag ins,
+                            string asm, list<dag> pat, string noRetOp> {
+  let mayLoad = 1, mayStore = 1 in {
+    def "" : DS_Pseudo <opName, outs, ins, pat>,
+             AtomicNoRet<noRetOp, 0>;
+
+    let vdst = 0 in {
+      def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+      def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+    }
+  }
 }
 
-// 1 address, 1 data.
-class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A <
-  op,
+multiclass DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc,
+                     string noRetOp = asm> : DS_1A2D_NORET_m <
+  op, asm,
   (outs),
-  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset),
-  asm#" $addr, $data0"#"$offset"#" [M0]",
-  []>,
-  AtomicNoRet<noRetOp, 0> {
+  (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0),
+  asm#" $addr, $data0, $data1"#"$offset",
+  [], noRetOp>;
 
-  let data1 = 0;
-  let mayStore = 1;
-  let mayLoad = 1;
+// 1 address, 1 data.
+multiclass DS_1A1D_NORET_m <bits<8> op, string opName, dag outs, dag ins,
+                            string asm, list<dag> pat, string noRetOp> {
+  let mayLoad = 1, mayStore = 1 in {
+    def "" : DS_Pseudo <opName, outs, ins, pat>,
+             AtomicNoRet<noRetOp, 0>;
+
+    let data1 = 0, vdst = 0 in {
+      def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+      def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+    }
+  }
 }
 
+multiclass DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc,
+                          string noRetOp = asm> : DS_1A1D_NORET_m <
+  op, asm,
+  (outs),
+  (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0),
+  asm#" $addr, $data0"#"$offset",
+  [], noRetOp>;
+
 //===----------------------------------------------------------------------===//
 // MTBUF classes
 //===----------------------------------------------------------------------===//
@@ -1057,6 +1688,7 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
   MTBUF <outs, ins, "", pattern>,
   SIMCInstr<opName, SISubtarget.NONE> {
   let isPseudo = 1;
+  let isCodeGenOnly = 1;
 }
 
 class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins,
@@ -1065,6 +1697,11 @@ class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins,
   MTBUFe <op>,
   SIMCInstr<opName, SISubtarget.SI>;
 
+class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> :
+  MTBUF <outs, ins, asm, []>,
+  MTBUFe_vi <op>,
+  SIMCInstr <opName, SISubtarget.VI>;
+
 multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm,
                     list<dag> pattern> {
 
@@ -1072,6 +1709,8 @@ multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm,
 
   def _si : MTBUF_Real_si <op, opName, outs, ins, asm>;
 
+  def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>;
+
 }
 
 let mayStore = 1, mayLoad = 0 in {
@@ -1080,8 +1719,8 @@ multiclass MTBUF_Store_Helper <bits<3> op, string opName,
                                RegisterClass regClass> : MTBUF_m <
   op, opName, (outs),
   (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
-   i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
-   SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
+   i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr,
+   SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset),
   opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
         #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []
 >;
@@ -1094,43 +1733,124 @@ multiclass MTBUF_Load_Helper <bits<3> op, string opName,
                               RegisterClass regClass> : MTBUF_m <
   op, opName, (outs regClass:$dst),
   (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
-       i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc,
-       i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
+       i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc,
+       i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset),
   opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
         #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []
 >;
 
 } // mayLoad = 1, mayStore = 0
 
-class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
+//===----------------------------------------------------------------------===//
+// MUBUF classes
+//===----------------------------------------------------------------------===//
 
+class mubuf <bits<7> si, bits<7> vi = si> {
+  field bits<7> SI = si;
+  field bits<7> VI = vi;
+}
+
+class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
   bit IsAddr64 = is_addr64;
   string OpName = NAME # suffix;
 }
 
-class MUBUFAtomicAddr64 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern>
-    : MUBUF <op, outs, ins, asm, pattern> {
+class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+  MUBUF <outs, ins, "", pattern>,
+  SIMCInstr<opName, SISubtarget.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+
+  // dummy fields, so that we can use let statements around multiclasses
+  bits<1> offen;
+  bits<1> idxen;
+  bits<8> vaddr;
+  bits<1> glc;
+  bits<1> slc;
+  bits<1> tfe;
+  bits<8> soffset;
+}
+
+class MUBUF_Real_si <mubuf op, string opName, dag outs, dag ins,
+                     string asm> :
+  MUBUF <outs, ins, asm, []>,
+  MUBUFe <op.SI>,
+  SIMCInstr<opName, SISubtarget.SI> {
+  let lds = 0;
+}
 
-  let offen = 0;
-  let idxen = 0;
-  let addr64 = 1;
-  let tfe = 0;
+class MUBUF_Real_vi <mubuf op, string opName, dag outs, dag ins,
+                     string asm> :
+  MUBUF <outs, ins, asm, []>,
+  MUBUFe_vi <op.VI>,
+  SIMCInstr<opName, SISubtarget.VI> {
   let lds = 0;
-  let soffset = 128;
 }
 
-class MUBUFAtomicOffset <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern>
-    : MUBUF <op, outs, ins, asm, pattern> {
+multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm,
+                    list<dag> pattern> {
+
+  def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
+           MUBUFAddr64Table <0>;
 
-  let offen = 0;
-  let idxen = 0;
-  let addr64 = 0;
-  let tfe = 0;
+  let addr64 = 0 in {
+    def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
+  }
+
+  def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>;
+}
+
+multiclass MUBUFAddr64_m <mubuf op, string opName, dag outs,
+                          dag ins, string asm, list<dag> pattern> {
+
+  def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
+           MUBUFAddr64Table <1>;
+
+  let addr64 = 1 in {
+    def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
+  }
+
+  // There is no VI version. If the pseudo is selected, it should be lowered
+  // for VI appropriately.
+}
+
+class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+  MUBUF <outs, ins, asm, pattern>, MUBUFe <op> {
   let lds = 0;
-  let vaddr = 0;
 }
 
-multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc,
+multiclass MUBUFAtomicOffset_m <mubuf op, string opName, dag outs, dag ins,
+                                string asm, list<dag> pattern, bit is_return> {
+
+  def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
+           MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>,
+           AtomicNoRet<NAME#"_OFFSET", is_return>;
+
+  let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in {
+    let addr64 = 0 in {
+      def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
+    }
+
+    def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>;
+  }
+}
+
+multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins,
+                                string asm, list<dag> pattern, bit is_return> {
+
+  def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
+           MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>,
+           AtomicNoRet<NAME#"_ADDR64", is_return>;
+
+  let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in {
+    def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
+  }
+
+  // There is no VI version. If the pseudo is selected, it should be lowered
+  // for VI appropriately.
+}
+
+multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
                          ValueType vt, SDPatternOperator atomic> {
 
   let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in {
@@ -1138,174 +1858,149 @@ multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc,
     // No return variants
     let glc = 0 in {
 
-      def _ADDR64 : MUBUFAtomicAddr64 <
-        op, (outs),
+      defm _ADDR64 : MUBUFAtomicAddr64_m <
+        op, name#"_addr64", (outs),
         (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
-             mbuf_offset:$offset, slc:$slc),
-        name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#"$slc", []
-      >, MUBUFAddr64Table<1>, AtomicNoRet<NAME#"_ADDR64", 0>;
+             mbuf_offset:$offset, SCSrc_32:$soffset, slc:$slc),
+        name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0
+      >;
 
-      def _OFFSET : MUBUFAtomicOffset <
-        op, (outs),
+      defm _OFFSET : MUBUFAtomicOffset_m <
+        op, name#"_offset", (outs),
         (ins rc:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
-             SSrc_32:$soffset, slc:$slc),
-        name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", []
-      >, MUBUFAddr64Table<0>, AtomicNoRet<NAME#"_OFFSET", 0>;
+             SCSrc_32:$soffset, slc:$slc),
+        name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0
+      >;
     } // glc = 0
 
     // Variant that return values
     let glc = 1, Constraints = "$vdata = $vdata_in",
         DisableEncoding = "$vdata_in"  in {
 
-      def _RTN_ADDR64 : MUBUFAtomicAddr64 <
-        op, (outs rc:$vdata),
+      defm _RTN_ADDR64 : MUBUFAtomicAddr64_m <
+        op, name#"_rtn_addr64", (outs rc:$vdata),
         (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr,
-             mbuf_offset:$offset, slc:$slc),
-        name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#" glc"#"$slc",
+             mbuf_offset:$offset, SSrc_32:$soffset, slc:$slc),
+        name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc",
         [(set vt:$vdata,
-         (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i16:$offset,
-                                    i1:$slc), vt:$vdata_in))]
-      >, MUBUFAddr64Table<1, "_RTN">, AtomicNoRet<NAME#"_ADDR64", 1>;
+         (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+	                            i16:$offset, i1:$slc), vt:$vdata_in))], 1
+      >;
 
-      def _RTN_OFFSET : MUBUFAtomicOffset <
-        op, (outs rc:$vdata),
+      defm _RTN_OFFSET : MUBUFAtomicOffset_m <
+        op, name#"_rtn_offset", (outs rc:$vdata),
         (ins rc:$vdata_in, SReg_128:$srsrc, mbuf_offset:$offset,
-             SSrc_32:$soffset, slc:$slc),
+             SCSrc_32:$soffset, slc:$slc),
         name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc",
         [(set vt:$vdata,
          (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
-                                    i1:$slc), vt:$vdata_in))]
-      >, MUBUFAddr64Table<0, "_RTN">, AtomicNoRet<NAME#"_OFFSET", 1>;
+                                    i1:$slc), vt:$vdata_in))], 1
+      >;
 
     } // glc = 1
 
   } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1
 }
 
-multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
+multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
                               ValueType load_vt = i32,
                               SDPatternOperator ld = null_frag> {
 
-  let lds = 0, mayLoad = 1 in {
+  let mayLoad = 1, mayStore = 0 in {
+    let offen = 0, idxen = 0, vaddr = 0 in {
+      defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata),
+                           (ins SReg_128:$srsrc,
+                           mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc,
+                           slc:$slc, tfe:$tfe),
+                           name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+                           [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
+                                                     i32:$soffset, i16:$offset,
+                                                     i1:$glc, i1:$slc, i1:$tfe)))]>;
+    }
 
-    let addr64 = 0 in {
+    let offen = 1, idxen = 0  in {
+      defm _OFFEN  : MUBUF_m <op, name#"_offen", (outs regClass:$vdata),
+                           (ins SReg_128:$srsrc, VGPR_32:$vaddr,
+                           SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc,
+                           tfe:$tfe),
+                           name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+    }
+
+    let offen = 0, idxen = 1 in {
+      defm _IDXEN  : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata),
+                           (ins SReg_128:$srsrc, VGPR_32:$vaddr,
+                           mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc,
+                           slc:$slc, tfe:$tfe),
+                           name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+    }
 
-      let offen = 0, idxen = 0, vaddr = 0 in {
-        def _OFFSET : MUBUF <op, (outs regClass:$vdata),
-                             (ins SReg_128:$srsrc,
-                             mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc,
-                             slc:$slc, tfe:$tfe),
-                             asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
-                             [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
-                                                       i32:$soffset, i16:$offset,
-                                                       i1:$glc, i1:$slc, i1:$tfe)))]>,
-                     MUBUFAddr64Table<0>;
-      }
-
-      let offen = 1, idxen = 0  in {
-        def _OFFEN  : MUBUF <op, (outs regClass:$vdata),
-                             (ins SReg_128:$srsrc, VReg_32:$vaddr,
-                             SSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc,
-                             tfe:$tfe),
-                             asm#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
-      }
-
-      let offen = 0, idxen = 1 in {
-        def _IDXEN  : MUBUF <op, (outs regClass:$vdata),
-                             (ins SReg_128:$srsrc, VReg_32:$vaddr,
-                             mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc,
-                             slc:$slc, tfe:$tfe),
-                             asm#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
-      }
-
-      let offen = 1, idxen = 1 in {
-        def _BOTHEN : MUBUF <op, (outs regClass:$vdata),
-                             (ins SReg_128:$srsrc, VReg_64:$vaddr,
-                             SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
-                             asm#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>;
-      }
+    let offen = 1, idxen = 1 in {
+      defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata),
+                           (ins SReg_128:$srsrc, VReg_64:$vaddr,
+                           SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+                           name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
     }
 
-    let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in {
-      def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
-                           (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
-                           asm#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
+    let offen = 0, idxen = 0, glc = 0, slc = 0, tfe = 0 in {
+      defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata),
+                           (ins SReg_128:$srsrc, VReg_64:$vaddr,
+                                SCSrc_32:$soffset, mbuf_offset:$offset),
+                           name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset",
                            [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
-                                                  i64:$vaddr, i16:$offset)))]>, MUBUFAddr64Table<1>;
+                                                  i64:$vaddr, i32:$soffset,
+                                                  i16:$offset)))]>;
     }
   }
 }
 
-multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
+multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass,
                           ValueType store_vt, SDPatternOperator st> {
-
-  let addr64 = 0, lds = 0 in {
-
-    def "" : MUBUF <
-      op, (outs),
-      (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
-           mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc,
-           tfe:$tfe),
-      name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"#
-           "$glc"#"$slc"#"$tfe",
-      []
-    >;
+  let mayLoad = 0, mayStore = 1 in {
+    defm : MUBUF_m <op, name, (outs),
+                    (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset,
+                    mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc,
+                    tfe:$tfe),
+                    name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"#
+                    "$glc"#"$slc"#"$tfe", []>;
 
     let offen = 0, idxen = 0, vaddr = 0 in {
-      def _OFFSET : MUBUF <
-        op, (outs),
-        (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
-              SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
-        name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
-        [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
-                                           i16:$offset, i1:$glc, i1:$slc,
-                                           i1:$tfe))]
-      >, MUBUFAddr64Table<0>;
+      defm _OFFSET : MUBUF_m <op, name#"_offset",(outs),
+                              (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
+                              SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
+                              name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+                              [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+                                   i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>;
     } // offen = 0, idxen = 0, vaddr = 0
 
     let offen = 1, idxen = 0  in {
-      def _OFFEN  : MUBUF <
-        op, (outs),
-        (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
-             mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
-        name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#
-            "$glc"#"$slc"#"$tfe",
-        []
-      >;
+      defm _OFFEN : MUBUF_m <op, name#"_offen", (outs),
+                             (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset,
+                             mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+                             name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#
+                             "$glc"#"$slc"#"$tfe", []>;
     } // end offen = 1, idxen = 0
 
-  } // End addr64 = 0, lds = 0
-
-  def _ADDR64 : MUBUF <
-    op, (outs),
-    (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
-    name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
-    [(st store_vt:$vdata,
-     (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>, MUBUFAddr64Table<1>
-     {
-
-      let mayLoad = 0;
-      let mayStore = 1;
-
-      // Encoding
-      let offen = 0;
-      let idxen = 0;
-      let glc = 0;
-      let addr64 = 1;
-      let lds = 0;
-      let slc = 0;
-      let tfe = 0;
-      let soffset = 128; // ZERO
-   }
+    let offen = 0, idxen = 0, glc = 0, slc = 0, tfe = 0 in {
+      defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs),
+                                    (ins vdataClass:$vdata, SReg_128:$srsrc,
+                                         VReg_64:$vaddr, SCSrc_32:$soffset,
+                                         mbuf_offset:$offset),
+                                    name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset",
+                                    [(st store_vt:$vdata,
+                                      (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr,
+                                                   i32:$soffset, i16:$offset))]>;
+    }
+  } // End mayLoad = 0, mayStore = 1
 }
 
 class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> :
-      FLAT <op, (outs regClass:$data),
+      FLAT <op, (outs regClass:$vdst),
                 (ins VReg_64:$addr),
-            asm#" $data, $addr, [M0, FLAT_SCRATCH]", []> {
+            asm#" $vdst, $addr, [M0, FLAT_SCRATCH]", []> {
   let glc = 0;
   let slc = 0;
   let tfe = 0;
+  let data = 0;
   let mayLoad = 1;
 }
 
@@ -1321,6 +2016,7 @@ class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
   let glc = 0;
   let slc = 0;
   let tfe = 0;
+  let vdst = 0;
 }
 
 class MIMG_Mask <string op, int channels> {
@@ -1339,7 +2035,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm,
   asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
      #" $tfe, $lwe, $slc, $vaddr, $srsrc",
   []> {
-  let SSAMP = 0;
+  let ssamp = 0;
   let mayLoad = 1;
   let mayStore = 0;
   let hasPostISelHook = 1;
@@ -1348,7 +2044,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm,
 multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
                                       RegisterClass dst_rc,
                                       int channels> {
-  def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_32>,
+  def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32>,
             MIMG_Mask<asm#"_V1", channels>;
   def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
             MIMG_Mask<asm#"_V2", channels>;
@@ -1357,7 +2053,7 @@ multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
 }
 
 multiclass MIMG_NoSampler <bits<7> op, string asm> {
-  defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VReg_32, 1>;
+  defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
   defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
   defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
   defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
@@ -1365,7 +2061,7 @@ multiclass MIMG_NoSampler <bits<7> op, string asm> {
 
 class MIMG_Sampler_Helper <bits<7> op, string asm,
                            RegisterClass dst_rc,
-                           RegisterClass src_rc> : MIMG <
+                           RegisterClass src_rc, int wqm> : MIMG <
   op,
   (outs dst_rc:$vdata),
   (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
@@ -1377,33 +2073,41 @@ class MIMG_Sampler_Helper <bits<7> op, string asm,
   let mayLoad = 1;
   let mayStore = 0;
   let hasPostISelHook = 1;
+  let WQM = wqm;
 }
 
 multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
                                     RegisterClass dst_rc,
-                                    int channels> {
-  def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_32>,
+                                    int channels, int wqm> {
+  def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm>,
             MIMG_Mask<asm#"_V1", channels>;
-  def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64>,
+  def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>,
             MIMG_Mask<asm#"_V2", channels>;
-  def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128>,
+  def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>,
             MIMG_Mask<asm#"_V4", channels>;
-  def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256>,
+  def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>,
             MIMG_Mask<asm#"_V8", channels>;
-  def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512>,
+  def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>,
             MIMG_Mask<asm#"_V16", channels>;
 }
 
 multiclass MIMG_Sampler <bits<7> op, string asm> {
-  defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VReg_32, 1>;
-  defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2>;
-  defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3>;
-  defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>;
+  defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 0>;
+  defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 0>;
+  defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 0>;
+  defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 0>;
+}
+
+multiclass MIMG_Sampler_WQM <bits<7> op, string asm> {
+  defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 1>;
+  defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 1>;
+  defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 1>;
+  defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 1>;
 }
 
 class MIMG_Gather_Helper <bits<7> op, string asm,
                           RegisterClass dst_rc,
-                          RegisterClass src_rc> : MIMG <
+                          RegisterClass src_rc, int wqm> : MIMG <
   op,
   (outs dst_rc:$vdata),
   (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
@@ -1424,28 +2128,36 @@ class MIMG_Gather_Helper <bits<7> op, string asm,
   // Therefore, disable all code which updates DMASK by setting these two:
   let MIMG = 0;
   let hasPostISelHook = 0;
+  let WQM = wqm;
 }
 
 multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
                                     RegisterClass dst_rc,
-                                    int channels> {
-  def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_32>,
+                                    int channels, int wqm> {
+  def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>,
             MIMG_Mask<asm#"_V1", channels>;
-  def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>,
+  def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>,
             MIMG_Mask<asm#"_V2", channels>;
-  def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128>,
+  def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>,
             MIMG_Mask<asm#"_V4", channels>;
-  def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256>,
+  def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>,
             MIMG_Mask<asm#"_V8", channels>;
-  def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512>,
+  def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>,
             MIMG_Mask<asm#"_V16", channels>;
 }
 
 multiclass MIMG_Gather <bits<7> op, string asm> {
-  defm _V1 : MIMG_Gather_Src_Helper<op, asm, VReg_32, 1>;
-  defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>;
-  defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>;
-  defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>;
+  defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 0>;
+  defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 0>;
+  defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 0>;
+  defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 0>;
+}
+
+multiclass MIMG_Gather_WQM <bits<7> op, string asm> {
+  defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 1>;
+  defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 1>;
+  defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 1>;
+  defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 1>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1496,20 +2208,12 @@ def getCommuteOrig : InstrMapping {
   let ValueCols = [["1"]];
 }
 
-def isDS : InstrMapping {
-  let FilterClass = "DS";
-  let RowFields = ["Inst"];
-  let ColFields = ["Size"];
-  let KeyCol = ["8"];
-  let ValueCols = [["8"]];
-}
-
-def getMCOpcode : InstrMapping {
+def getMCOpcodeGen : InstrMapping {
   let FilterClass = "SIMCInstr";
   let RowFields = ["PseudoInstr"];
   let ColFields = ["Subtarget"];
   let KeyCol = [!cast<string>(SISubtarget.NONE)];
-  let ValueCols = [[!cast<string>(SISubtarget.SI)]];
+  let ValueCols = [[!cast<string>(SISubtarget.SI)],[!cast<string>(SISubtarget.VI)]];
 }
 
 def getAddr64Inst : InstrMapping {
@@ -1539,3 +2243,5 @@ def getAtomicNoRetOp : InstrMapping {
 }
 
 include "SIInstructions.td"
+include "CIInstructions.td"
+include "VIInstructions.td"
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 90da7a9..4f72e99 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -26,11 +26,18 @@ def SendMsgImm : Operand<i32> {
   let PrintMethod = "printSendMsg";
 }
 
-def isSI : Predicate<"Subtarget.getGeneration() "
+def isGCN : Predicate<"Subtarget->getGeneration() "
                       ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
-
-def isCI : Predicate<"Subtarget.getGeneration() "
+def isSICI : Predicate<
+  "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+  "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
+>;
+def isCI : Predicate<"Subtarget->getGeneration() "
                       ">= AMDGPUSubtarget::SEA_ISLANDS">;
+def isVI : Predicate <
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS"
+>;
+
 def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">;
 
 def SWaitMatchClass : AsmOperandClass {
@@ -43,7 +50,7 @@ def WAIT_FLAG : InstFlag<"printWaitFlag"> {
   let ParserMatchClass = SWaitMatchClass;
 }
 
-let SubtargetPredicate = isSI in {
+let SubtargetPredicate = isGCN in {
 
 //===----------------------------------------------------------------------===//
 // EXP Instructions
@@ -96,90 +103,99 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
 //===----------------------------------------------------------------------===//
 
 let isMoveImm = 1 in {
-def S_MOV_B32 : SOP1_32 <0x00000003, "s_mov_b32", []>;
-def S_MOV_B64 : SOP1_64 <0x00000004, "s_mov_b64", []>;
-def S_CMOV_B32 : SOP1_32 <0x00000005, "s_cmov_b32", []>;
-def S_CMOV_B64 : SOP1_64 <0x00000006, "s_cmov_b64", []>;
+  let isReMaterializable = 1 in {
+    defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>;
+    defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>;
+  } // let isRematerializeable = 1
+
+  let Uses = [SCC] in {
+    defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>;
+    defm S_CMOV_B64 : SOP1_64 <sop1<0x06, 0x03>, "s_cmov_b64", []>;
+  } // End Uses = [SCC]
 } // End isMoveImm = 1
 
-def S_NOT_B32 : SOP1_32 <0x00000007, "s_not_b32",
-  [(set i32:$dst, (not i32:$src0))]
->;
+let Defs = [SCC] in {
+  defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32",
+    [(set i32:$dst, (not i32:$src0))]
+  >;
 
-def S_NOT_B64 : SOP1_64 <0x00000008, "s_not_b64",
-  [(set i64:$dst, (not i64:$src0))]
->;
-def S_WQM_B32 : SOP1_32 <0x00000009, "s_wqm_b32", []>;
-def S_WQM_B64 : SOP1_64 <0x0000000a, "s_wqm_b64", []>;
-def S_BREV_B32 : SOP1_32 <0x0000000b, "s_brev_b32",
+  defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64",
+    [(set i64:$dst, (not i64:$src0))]
+  >;
+  defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>;
+  defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>;
+} // End Defs = [SCC]
+
+
+defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",
   [(set i32:$dst, (AMDGPUbrev i32:$src0))]
 >;
-def S_BREV_B64 : SOP1_64 <0x0000000c, "s_brev_b64", []>;
+defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;
 
-////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "s_bcnt0_i32_b32", []>;
-////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "s_bcnt0_i32_b64", []>;
-def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "s_bcnt1_i32_b32",
-  [(set i32:$dst, (ctpop i32:$src0))]
->;
-def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "s_bcnt1_i32_b64", []>;
+let Defs = [SCC] in {
+  defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>;
+  defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>;
+  defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32",
+    [(set i32:$dst, (ctpop i32:$src0))]
+  >;
+  defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>;
+} // End Defs = [SCC]
 
-////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "s_ff0_i32_b32", []>;
-////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "s_ff0_i32_b64", []>;
-def S_FF1_I32_B32 : SOP1_32 <0x00000013, "s_ff1_i32_b32",
+defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>;
+defm S_FF0_I32_B64 : SOP1_32_64 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>;
+defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32",
   [(set i32:$dst, (cttz_zero_undef i32:$src0))]
 >;
-////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "s_ff1_i32_b64", []>;
+defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>;
 
-def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "s_flbit_i32_b32",
+defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32",
   [(set i32:$dst, (ctlz_zero_undef i32:$src0))]
 >;
 
-//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "s_flbit_i32_b64", []>;
-def S_FLBIT_I32 : SOP1_32 <0x00000017, "s_flbit_i32", []>;
-//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "s_flbit_i32_i64", []>;
-def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "s_sext_i32_i8",
+defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
+defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", []>;
+defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>;
+defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8",
   [(set i32:$dst, (sext_inreg i32:$src0, i8))]
 >;
-def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "s_sext_i32_i16",
+defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16",
   [(set i32:$dst, (sext_inreg i32:$src0, i16))]
 >;
 
-////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "s_bitset0_b32", []>;
-////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "s_bitset0_b64", []>;
-////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "s_bitset1_b32", []>;
-////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "s_bitset1_b64", []>;
-def S_GETPC_B64 : SOP1 <
-  0x0000001f, (outs SReg_64:$dst), (ins), "s_getpc_b64 $dst", []
-> {
-  let SSRC0 = 0;
-}
-def S_SETPC_B64 : SOP1_64 <0x00000020, "s_setpc_b64", []>;
-def S_SWAPPC_B64 : SOP1_64 <0x00000021, "s_swappc_b64", []>;
-def S_RFE_B64 : SOP1_64 <0x00000022, "s_rfe_b64", []>;
-
-let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in {
-
-def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "s_and_saveexec_b64", []>;
-def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "s_or_saveexec_b64", []>;
-def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "s_xor_saveexec_b64", []>;
-def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "s_andn2_saveexec_b64", []>;
-def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "s_orn2_saveexec_b64", []>;
-def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "s_nand_saveexec_b64", []>;
-def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "s_nor_saveexec_b64", []>;
-def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "s_xnor_saveexec_b64", []>;
-
-} // End hasSideEffects = 1
-
-def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "s_quadmask_b32", []>;
-def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "s_quadmask_b64", []>;
-def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "s_movrels_b32", []>;
-def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "s_movrels_b64", []>;
-def S_MOVRELD_B32 : SOP1_32 <0x00000030, "s_movreld_b32", []>;
-def S_MOVRELD_B64 : SOP1_64 <0x00000031, "s_movreld_b64", []>;
-//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "s_cbranch_join", []>;
-def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "s_mov_regrd_b32", []>;
-def S_ABS_I32 : SOP1_32 <0x00000034, "s_abs_i32", []>;
-def S_MOV_FED_B32 : SOP1_32 <0x00000035, "s_mov_fed_b32", []>;
+defm S_BITSET0_B32 : SOP1_32 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>;
+defm S_BITSET0_B64 : SOP1_64 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>;
+defm S_BITSET1_B32 : SOP1_32 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>;
+defm S_BITSET1_B64 : SOP1_64 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>;
+defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>;
+defm S_SETPC_B64 : SOP1_64 <sop1<0x20, 0x1d>, "s_setpc_b64", []>;
+defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>;
+defm S_RFE_B64 : SOP1_64 <sop1<0x22, 0x1f>, "s_rfe_b64", []>;
+
+let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
+
+defm S_AND_SAVEEXEC_B64 : SOP1_64 <sop1<0x24, 0x20>, "s_and_saveexec_b64", []>;
+defm S_OR_SAVEEXEC_B64 : SOP1_64 <sop1<0x25, 0x21>, "s_or_saveexec_b64", []>;
+defm S_XOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x26, 0x22>, "s_xor_saveexec_b64", []>;
+defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x27, 0x23>, "s_andn2_saveexec_b64", []>;
+defm S_ORN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x28, 0x24>, "s_orn2_saveexec_b64", []>;
+defm S_NAND_SAVEEXEC_B64 : SOP1_64 <sop1<0x29, 0x25>, "s_nand_saveexec_b64", []>;
+defm S_NOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2a, 0x26>, "s_nor_saveexec_b64", []>;
+defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []>;
+
+} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC]
+
+defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>;
+defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>;
+defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>;
+defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>;
+defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>;
+defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>;
+defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>;
+defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>;
+let Defs = [SCC] in {
+  defm S_ABS_I32 : SOP1_32 <sop1<0x34, 0x30>, "s_abs_i32", []>;
+} // End Defs = [SCC]
+defm S_MOV_FED_B32 : SOP1_32 <sop1<0x35, 0x31>, "s_mov_fed_b32", []>;
 
 //===----------------------------------------------------------------------===//
 // SOP2 Instructions
@@ -187,119 +203,132 @@ def S_MOV_FED_B32 : SOP1_32 <0x00000035, "s_mov_fed_b32", []>;
 
 let Defs = [SCC] in { // Carry out goes to SCC
 let isCommutable = 1 in {
-def S_ADD_U32 : SOP2_32 <0x00000000, "s_add_u32", []>;
-def S_ADD_I32 : SOP2_32 <0x00000002, "s_add_i32",
+defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>;
+defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32",
   [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
 >;
 } // End isCommutable = 1
 
-def S_SUB_U32 : SOP2_32 <0x00000001, "s_sub_u32", []>;
-def S_SUB_I32 : SOP2_32 <0x00000003, "s_sub_i32",
+defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>;
+defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32",
   [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
 >;
 
 let Uses = [SCC] in { // Carry in comes from SCC
 let isCommutable = 1 in {
-def S_ADDC_U32 : SOP2_32 <0x00000004, "s_addc_u32",
+defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32",
   [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
 } // End isCommutable = 1
 
-def S_SUBB_U32 : SOP2_32 <0x00000005, "s_subb_u32",
+defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32",
   [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
 } // End Uses = [SCC]
-} // End Defs = [SCC]
 
-def S_MIN_I32 : SOP2_32 <0x00000006, "s_min_i32",
+defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32",
   [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
 >;
-def S_MIN_U32 : SOP2_32 <0x00000007, "s_min_u32",
+defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32",
   [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
 >;
-def S_MAX_I32 : SOP2_32 <0x00000008, "s_max_i32",
+defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32",
   [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
 >;
-def S_MAX_U32 : SOP2_32 <0x00000009, "s_max_u32",
+defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32",
   [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
 >;
+} // End Defs = [SCC]
 
-def S_CSELECT_B32 : SOP2_SELECT_32 <
-  0x0000000a, "s_cselect_b32",
-  []
->;
+defm S_CSELECT_B32 : SOP2_SELECT_32 <sop2<0x0a>, "s_cselect_b32", []>;
 
-def S_CSELECT_B64 : SOP2_64 <0x0000000b, "s_cselect_b64", []>;
+let Uses = [SCC] in {
+  defm S_CSELECT_B64 : SOP2_64 <sop2<0x0b>, "s_cselect_b64", []>;
+} // End Uses = [SCC]
 
-def S_AND_B32 : SOP2_32 <0x0000000e, "s_and_b32",
+let Defs = [SCC] in {
+defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32",
   [(set i32:$dst, (and i32:$src0, i32:$src1))]
 >;
 
-def S_AND_B64 : SOP2_64 <0x0000000f, "s_and_b64",
+defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64",
   [(set i64:$dst, (and i64:$src0, i64:$src1))]
 >;
 
-def S_OR_B32 : SOP2_32 <0x00000010, "s_or_b32",
+defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32",
   [(set i32:$dst, (or i32:$src0, i32:$src1))]
 >;
 
-def S_OR_B64 : SOP2_64 <0x00000011, "s_or_b64",
+defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64",
   [(set i64:$dst, (or i64:$src0, i64:$src1))]
 >;
 
-def S_XOR_B32 : SOP2_32 <0x00000012, "s_xor_b32",
+defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32",
   [(set i32:$dst, (xor i32:$src0, i32:$src1))]
 >;
 
-def S_XOR_B64 : SOP2_64 <0x00000013, "s_xor_b64",
+defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64",
   [(set i64:$dst, (xor i64:$src0, i64:$src1))]
 >;
-def S_ANDN2_B32 : SOP2_32 <0x00000014, "s_andn2_b32", []>;
-def S_ANDN2_B64 : SOP2_64 <0x00000015, "s_andn2_b64", []>;
-def S_ORN2_B32 : SOP2_32 <0x00000016, "s_orn2_b32", []>;
-def S_ORN2_B64 : SOP2_64 <0x00000017, "s_orn2_b64", []>;
-def S_NAND_B32 : SOP2_32 <0x00000018, "s_nand_b32", []>;
-def S_NAND_B64 : SOP2_64 <0x00000019, "s_nand_b64", []>;
-def S_NOR_B32 : SOP2_32 <0x0000001a, "s_nor_b32", []>;
-def S_NOR_B64 : SOP2_64 <0x0000001b, "s_nor_b64", []>;
-def S_XNOR_B32 : SOP2_32 <0x0000001c, "s_xnor_b32", []>;
-def S_XNOR_B64 : SOP2_64 <0x0000001d, "s_xnor_b64", []>;
+defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>;
+defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>;
+defm S_ORN2_B32 : SOP2_32 <sop2<0x16, 0x14>, "s_orn2_b32", []>;
+defm S_ORN2_B64 : SOP2_64 <sop2<0x17, 0x15>, "s_orn2_b64", []>;
+defm S_NAND_B32 : SOP2_32 <sop2<0x18, 0x16>, "s_nand_b32", []>;
+defm S_NAND_B64 : SOP2_64 <sop2<0x19, 0x17>, "s_nand_b64", []>;
+defm S_NOR_B32 : SOP2_32 <sop2<0x1a, 0x18>, "s_nor_b32", []>;
+defm S_NOR_B64 : SOP2_64 <sop2<0x1b, 0x19>, "s_nor_b64", []>;
+defm S_XNOR_B32 : SOP2_32 <sop2<0x1c, 0x1a>, "s_xnor_b32", []>;
+defm S_XNOR_B64 : SOP2_64 <sop2<0x1d, 0x1b>, "s_xnor_b64", []>;
+} // End Defs = [SCC]
 
 // Use added complexity so these patterns are preferred to the VALU patterns.
 let AddedComplexity = 1 in {
+let Defs = [SCC] in {
 
-def S_LSHL_B32 : SOP2_32 <0x0000001e, "s_lshl_b32",
+defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32",
   [(set i32:$dst, (shl i32:$src0, i32:$src1))]
 >;
-def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "s_lshl_b64",
+defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64",
   [(set i64:$dst, (shl i64:$src0, i32:$src1))]
 >;
-def S_LSHR_B32 : SOP2_32 <0x00000020, "s_lshr_b32",
+defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32",
   [(set i32:$dst, (srl i32:$src0, i32:$src1))]
 >;
-def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "s_lshr_b64",
+defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64",
   [(set i64:$dst, (srl i64:$src0, i32:$src1))]
 >;
-def S_ASHR_I32 : SOP2_32 <0x00000022, "s_ashr_i32",
+defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32",
   [(set i32:$dst, (sra i32:$src0, i32:$src1))]
 >;
-def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "s_ashr_i64",
+defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64",
   [(set i64:$dst, (sra i64:$src0, i32:$src1))]
 >;
+} // End Defs = [SCC]
 
-
-def S_BFM_B32 : SOP2_32 <0x00000024, "s_bfm_b32", []>;
-def S_BFM_B64 : SOP2_64 <0x00000025, "s_bfm_b64", []>;
-def S_MUL_I32 : SOP2_32 <0x00000026, "s_mul_i32",
+defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32", []>;
+defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>;
+defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32",
   [(set i32:$dst, (mul i32:$src0, i32:$src1))]
 >;
 
 } // End AddedComplexity = 1
 
-def S_BFE_U32 : SOP2_32 <0x00000027, "s_bfe_u32", []>;
-def S_BFE_I32 : SOP2_32 <0x00000028, "s_bfe_i32", []>;
-def S_BFE_U64 : SOP2_64 <0x00000029, "s_bfe_u64", []>;
-def S_BFE_I64 : SOP2_64_32 <0x0000002a, "s_bfe_i64", []>;
-//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "s_cbranch_g_fork", []>;
-def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "s_absdiff_i32", []>;
+let Defs = [SCC] in {
+defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>;
+defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>;
+defm S_BFE_U64 : SOP2_64 <sop2<0x29, 0x27>, "s_bfe_u64", []>;
+defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>;
+} // End Defs = [SCC]
+
+let sdst = 0 in {
+defm S_CBRANCH_G_FORK : SOP2_m <
+  sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs),
+  (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", []
+>;
+}
+
+let Defs = [SCC] in {
+defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>;
+} // End Defs = [SCC]
 
 //===----------------------------------------------------------------------===//
 // SOPC Instructions
@@ -328,9 +357,13 @@ def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">;
 //===----------------------------------------------------------------------===//
 
 let isReMaterializable = 1 in {
-def S_MOVK_I32 : SOPK_32 <0x00000000, "s_movk_i32", []>;
+defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>;
 } // End isReMaterializable = 1
-def S_CMOVK_I32 : SOPK_32 <0x00000002, "s_cmovk_i32", []>;
+let Uses = [SCC] in {
+  defm S_CMOVK_I32 : SOPK_32 <sopk<0x02, 0x01>, "s_cmovk_i32", []>;
+}
+
+let isCompare = 1 in {
 
 /*
 This instruction is disabled for now until we can figure out how to teach
@@ -344,38 +377,36 @@ SCC = S_CMPK_EQ_I32 SGPR0, imm
 VCC = COPY SCC
 VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
 
-def S_CMPK_EQ_I32 : SOPK <
-  0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1),
-  "s_cmpk_eq_i32",
+defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32",
   [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
 >;
 */
 
-let isCompare = 1, Defs = [SCC] in {
-def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "s_cmpk_lg_i32", []>;
-def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "s_cmpk_gt_i32", []>;
-def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "s_cmpk_ge_i32", []>;
-def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "s_cmpk_lt_i32", []>;
-def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "s_cmpk_le_i32", []>;
-def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "s_cmpk_eq_u32", []>;
-def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "s_cmpk_lg_u32", []>;
-def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "s_cmpk_gt_u32", []>;
-def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "s_cmpk_ge_u32", []>;
-def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "s_cmpk_lt_u32", []>;
-def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "s_cmpk_le_u32", []>;
-} // End isCompare = 1, Defs = [SCC]
-
-let Defs = [SCC], isCommutable = 1 in {
-  def S_ADDK_I32 : SOPK_32 <0x0000000f, "s_addk_i32", []>;
-  def S_MULK_I32 : SOPK_32 <0x00000010, "s_mulk_i32", []>;
+defm S_CMPK_LG_I32 : SOPK_SCC <sopk<0x04, 0x03>, "s_cmpk_lg_i32", []>;
+defm S_CMPK_GT_I32 : SOPK_SCC <sopk<0x05, 0x04>, "s_cmpk_gt_i32", []>;
+defm S_CMPK_GE_I32 : SOPK_SCC <sopk<0x06, 0x05>, "s_cmpk_ge_i32", []>;
+defm S_CMPK_LT_I32 : SOPK_SCC <sopk<0x07, 0x06>, "s_cmpk_lt_i32", []>;
+defm S_CMPK_LE_I32 : SOPK_SCC <sopk<0x08, 0x07>, "s_cmpk_le_i32", []>;
+defm S_CMPK_EQ_U32 : SOPK_SCC <sopk<0x09, 0x08>, "s_cmpk_eq_u32", []>;
+defm S_CMPK_LG_U32 : SOPK_SCC <sopk<0x0a, 0x09>, "s_cmpk_lg_u32", []>;
+defm S_CMPK_GT_U32 : SOPK_SCC <sopk<0x0b, 0x0a>, "s_cmpk_gt_u32", []>;
+defm S_CMPK_GE_U32 : SOPK_SCC <sopk<0x0c, 0x0b>, "s_cmpk_ge_u32", []>;
+defm S_CMPK_LT_U32 : SOPK_SCC <sopk<0x0d, 0x0c>, "s_cmpk_lt_u32", []>;
+defm S_CMPK_LE_U32 : SOPK_SCC <sopk<0x0e, 0x0d>, "s_cmpk_le_u32", []>;
+} // End isCompare = 1
+
+let isCommutable = 1 in {
+  let Defs = [SCC], isCommutable = 1 in {
+    defm S_ADDK_I32 : SOPK_32 <sopk<0x0f, 0x0e>, "s_addk_i32", []>;
+  }
+  defm S_MULK_I32 : SOPK_32 <sopk<0x10, 0x0f>, "s_mulk_i32", []>;
 }
 
-//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "s_cbranch_i_fork", []>;
-def S_GETREG_B32 : SOPK_32 <0x00000012, "s_getreg_b32", []>;
-def S_SETREG_B32 : SOPK_32 <0x00000013, "s_setreg_b32", []>;
-def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "s_getreg_regrd_b32", []>;
-//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "s_setreg_imm32_b32", []>;
-//def EXP : EXP_ <0x00000000, "exp", []>;
+//defm S_CBRANCH_I_FORK : SOPK_ <sopk<0x11, 0x10>, "s_cbranch_i_fork", []>;
+defm S_GETREG_B32 : SOPK_32 <sopk<0x12, 0x11>, "s_getreg_b32", []>;
+defm S_SETREG_B32 : SOPK_32 <sopk<0x13, 0x12>, "s_setreg_b32", []>;
+defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>;
+//defm S_SETREG_IMM32_B32 : SOPK_32 <sopk<0x15, 0x14>, "s_setreg_imm32_b32", []>;
 
 //===----------------------------------------------------------------------===//
 // SOPP Instructions
@@ -476,82 +507,84 @@ def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
 
 let isCompare = 1 in {
 
-defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0>, "v_cmp_f_f32">;
-defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1>, "v_cmp_lt_f32", COND_OLT>;
-defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2>, "v_cmp_eq_f32", COND_OEQ>;
-defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3>, "v_cmp_le_f32", COND_OLE>;
-defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4>, "v_cmp_gt_f32", COND_OGT>;
-defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5>, "v_cmp_lg_f32">;
-defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6>, "v_cmp_ge_f32", COND_OGE>;
-defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7>, "v_cmp_o_f32", COND_O>;
-defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8>, "v_cmp_u_f32", COND_UO>;
-defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9>, "v_cmp_nge_f32">;
-defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa>, "v_cmp_nlg_f32">;
-defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb>, "v_cmp_ngt_f32">;
-defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc>, "v_cmp_nle_f32">;
-defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd>, "v_cmp_neq_f32", COND_UNE>;
-defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe>, "v_cmp_nlt_f32">;
-defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf>, "v_cmp_tru_f32">;
+defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0, 0x40>, "v_cmp_f_f32">;
+defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT>;
+defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2, 0x42>, "v_cmp_eq_f32", COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE>;
+defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4, 0x44>, "v_cmp_gt_f32", COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5, 0x45>, "v_cmp_lg_f32", COND_ONE>;
+defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6, 0x46>, "v_cmp_ge_f32", COND_OGE>;
+defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7, 0x47>, "v_cmp_o_f32", COND_O>;
+defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8, 0x48>, "v_cmp_u_f32", COND_UO>;
+defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32",  COND_ULT>;
+defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa, 0x4a>, "v_cmp_nlg_f32", COND_UEQ>;
+defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE>;
+defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc, 0x4c>, "v_cmp_nle_f32", COND_UGT>;
+defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd, 0x4d>, "v_cmp_neq_f32", COND_UNE>;
+defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe, 0x4e>, "v_cmp_nlt_f32", COND_UGE>;
+defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf, 0x4f>, "v_cmp_tru_f32">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10>, "v_cmpx_f_f32">;
-defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11>, "v_cmpx_lt_f32">;
-defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12>, "v_cmpx_eq_f32">;
-defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13>, "v_cmpx_le_f32">;
-defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14>, "v_cmpx_gt_f32">;
-defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15>, "v_cmpx_lg_f32">;
-defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16>, "v_cmpx_ge_f32">;
-defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17>, "v_cmpx_o_f32">;
-defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18>, "v_cmpx_u_f32">;
-defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19>, "v_cmpx_nge_f32">;
-defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a>, "v_cmpx_nlg_f32">;
-defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b>, "v_cmpx_ngt_f32">;
-defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c>, "v_cmpx_nle_f32">;
-defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d>, "v_cmpx_neq_f32">;
-defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e>, "v_cmpx_nlt_f32">;
-defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f>, "v_cmpx_tru_f32">;
+defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10, 0x50>, "v_cmpx_f_f32">;
+defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32">;
+defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12, 0x52>, "v_cmpx_eq_f32">;
+defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32">;
+defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14, 0x54>, "v_cmpx_gt_f32">;
+defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15, 0x55>, "v_cmpx_lg_f32">;
+defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16, 0x56>, "v_cmpx_ge_f32">;
+defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17, 0x57>, "v_cmpx_o_f32">;
+defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18, 0x58>, "v_cmpx_u_f32">;
+defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19, 0x59>, "v_cmpx_nge_f32">;
+defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a, 0x5a>, "v_cmpx_nlg_f32">;
+defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b, 0x5b>, "v_cmpx_ngt_f32">;
+defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c, 0x5c>, "v_cmpx_nle_f32">;
+defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d, 0x5d>, "v_cmpx_neq_f32">;
+defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e, 0x5e>, "v_cmpx_nlt_f32">;
+defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f, 0x5f>, "v_cmpx_tru_f32">;
 
 } // End hasSideEffects = 1
 
-defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20>, "v_cmp_f_f64">;
-defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21>, "v_cmp_lt_f64", COND_OLT>;
-defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22>, "v_cmp_eq_f64", COND_OEQ>;
-defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23>, "v_cmp_le_f64", COND_OLE>;
-defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24>, "v_cmp_gt_f64", COND_OGT>;
-defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25>, "v_cmp_lg_f64">;
-defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26>, "v_cmp_ge_f64", COND_OGE>;
-defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27>, "v_cmp_o_f64", COND_O>;
-defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28>, "v_cmp_u_f64", COND_UO>;
-defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29>, "v_cmp_nge_f64">;
-defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a>, "v_cmp_nlg_f64">;
-defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b>, "v_cmp_ngt_f64">;
-defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c>, "v_cmp_nle_f64">;
-defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d>, "v_cmp_neq_f64", COND_UNE>;
-defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e>, "v_cmp_nlt_f64">;
-defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f>, "v_cmp_tru_f64">;
+defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20, 0x60>, "v_cmp_f_f64">;
+defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT>;
+defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22, 0x62>, "v_cmp_eq_f64", COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE>;
+defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24, 0x64>, "v_cmp_gt_f64", COND_OGT>;
+defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25, 0x65>, "v_cmp_lg_f64", COND_ONE>;
+defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26, 0x66>, "v_cmp_ge_f64", COND_OGE>;
+defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27, 0x67>, "v_cmp_o_f64", COND_O>;
+defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28, 0x68>, "v_cmp_u_f64", COND_UO>;
+defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT>;
+defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a, 0x6a>, "v_cmp_nlg_f64", COND_UEQ>;
+defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE>;
+defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c, 0x6c>, "v_cmp_nle_f64", COND_UGT>;
+defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d, 0x6d>, "v_cmp_neq_f64", COND_UNE>;
+defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e, 0x6e>, "v_cmp_nlt_f64", COND_UGE>;
+defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f, 0x6f>, "v_cmp_tru_f64">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30>, "v_cmpx_f_f64">;
-defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31>, "v_cmpx_lt_f64">;
-defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32>, "v_cmpx_eq_f64">;
-defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33>, "v_cmpx_le_f64">;
-defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34>, "v_cmpx_gt_f64">;
-defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35>, "v_cmpx_lg_f64">;
-defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36>, "v_cmpx_ge_f64">;
-defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37>, "v_cmpx_o_f64">;
-defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38>, "v_cmpx_u_f64">;
-defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39>, "v_cmpx_nge_f64">;
-defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a>, "v_cmpx_nlg_f64">;
-defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b>, "v_cmpx_ngt_f64">;
-defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c>, "v_cmpx_nle_f64">;
-defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d>, "v_cmpx_neq_f64">;
-defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e>, "v_cmpx_nlt_f64">;
-defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f>, "v_cmpx_tru_f64">;
+defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30, 0x70>, "v_cmpx_f_f64">;
+defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64">;
+defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32, 0x72>, "v_cmpx_eq_f64">;
+defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64">;
+defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34, 0x74>, "v_cmpx_gt_f64">;
+defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35, 0x75>, "v_cmpx_lg_f64">;
+defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36, 0x76>, "v_cmpx_ge_f64">;
+defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37, 0x77>, "v_cmpx_o_f64">;
+defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38, 0x78>, "v_cmpx_u_f64">;
+defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64">;
+defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a, 0x7a>, "v_cmpx_nlg_f64">;
+defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64">;
+defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c, 0x7c>, "v_cmpx_nle_f64">;
+defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d, 0x7d>, "v_cmpx_neq_f64">;
+defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e, 0x7e>, "v_cmpx_nlt_f64">;
+defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f, 0x7f>, "v_cmpx_tru_f64">;
 
 } // End hasSideEffects = 1
 
+let SubtargetPredicate = isSICI in {
+
 defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">;
 defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32">;
 defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">;
@@ -628,104 +661,106 @@ defm V_CMPSX_TRU_F64 : VOPC_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">;
 
 } // End hasSideEffects = 1, Defs = [EXEC]
 
-defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80>, "v_cmp_f_i32">;
-defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81>, "v_cmp_lt_i32", COND_SLT>;
-defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82>, "v_cmp_eq_i32", COND_EQ>;
-defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83>, "v_cmp_le_i32", COND_SLE>;
-defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84>, "v_cmp_gt_i32", COND_SGT>;
-defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85>, "v_cmp_ne_i32", COND_NE>;
-defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86>, "v_cmp_ge_i32", COND_SGE>;
-defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87>, "v_cmp_t_i32">;
+} // End SubtargetPredicate = isSICI
+
+defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80, 0xc0>, "v_cmp_f_i32">;
+defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT>;
+defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82, 0xc2>, "v_cmp_eq_i32", COND_EQ>;
+defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE>;
+defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84, 0xc4>, "v_cmp_gt_i32", COND_SGT>;
+defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85, 0xc5>, "v_cmp_ne_i32", COND_NE>;
+defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86, 0xc6>, "v_cmp_ge_i32", COND_SGE>;
+defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87, 0xc7>, "v_cmp_t_i32">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90>, "v_cmpx_f_i32">;
-defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91>, "v_cmpx_lt_i32">;
-defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92>, "v_cmpx_eq_i32">;
-defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93>, "v_cmpx_le_i32">;
-defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94>, "v_cmpx_gt_i32">;
-defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95>, "v_cmpx_ne_i32">;
-defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96>, "v_cmpx_ge_i32">;
-defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97>, "v_cmpx_t_i32">;
+defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90, 0xd0>, "v_cmpx_f_i32">;
+defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32">;
+defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92, 0xd2>, "v_cmpx_eq_i32">;
+defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32">;
+defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94, 0xd4>, "v_cmpx_gt_i32">;
+defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95, 0xd5>, "v_cmpx_ne_i32">;
+defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96, 0xd6>, "v_cmpx_ge_i32">;
+defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97, 0xd7>, "v_cmpx_t_i32">;
 
 } // End hasSideEffects = 1
 
-defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0>, "v_cmp_f_i64">;
-defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1>, "v_cmp_lt_i64", COND_SLT>;
-defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2>, "v_cmp_eq_i64", COND_EQ>;
-defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3>, "v_cmp_le_i64", COND_SLE>;
-defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4>, "v_cmp_gt_i64", COND_SGT>;
-defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5>, "v_cmp_ne_i64", COND_NE>;
-defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6>, "v_cmp_ge_i64", COND_SGE>;
-defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7>, "v_cmp_t_i64">;
+defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0, 0xe0>, "v_cmp_f_i64">;
+defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT>;
+defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2, 0xe2>, "v_cmp_eq_i64", COND_EQ>;
+defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE>;
+defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4, 0xe4>, "v_cmp_gt_i64", COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5, 0xe5>, "v_cmp_ne_i64", COND_NE>;
+defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6, 0xe6>, "v_cmp_ge_i64", COND_SGE>;
+defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7, 0xe7>, "v_cmp_t_i64">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0>, "v_cmpx_f_i64">;
-defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1>, "v_cmpx_lt_i64">;
-defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2>, "v_cmpx_eq_i64">;
-defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3>, "v_cmpx_le_i64">;
-defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4>, "v_cmpx_gt_i64">;
-defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5>, "v_cmpx_ne_i64">;
-defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6>, "v_cmpx_ge_i64">;
-defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7>, "v_cmpx_t_i64">;
+defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0, 0xf0>, "v_cmpx_f_i64">;
+defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64">;
+defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2, 0xf2>, "v_cmpx_eq_i64">;
+defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64">;
+defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4, 0xf4>, "v_cmpx_gt_i64">;
+defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5, 0xf5>, "v_cmpx_ne_i64">;
+defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6, 0xf6>, "v_cmpx_ge_i64">;
+defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7, 0xf7>, "v_cmpx_t_i64">;
 
 } // End hasSideEffects = 1
 
-defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0>, "v_cmp_f_u32">;
-defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1>, "v_cmp_lt_u32", COND_ULT>;
-defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2>, "v_cmp_eq_u32", COND_EQ>;
-defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3>, "v_cmp_le_u32", COND_ULE>;
-defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4>, "v_cmp_gt_u32", COND_UGT>;
-defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5>, "v_cmp_ne_u32", COND_NE>;
-defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6>, "v_cmp_ge_u32", COND_UGE>;
-defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7>, "v_cmp_t_u32">;
+defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0, 0xc8>, "v_cmp_f_u32">;
+defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT>;
+defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2, 0xca>, "v_cmp_eq_u32", COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE>;
+defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4, 0xcc>, "v_cmp_gt_u32", COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5, 0xcd>, "v_cmp_ne_u32", COND_NE>;
+defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6, 0xce>, "v_cmp_ge_u32", COND_UGE>;
+defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7, 0xcf>, "v_cmp_t_u32">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0>, "v_cmpx_f_u32">;
-defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1>, "v_cmpx_lt_u32">;
-defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2>, "v_cmpx_eq_u32">;
-defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3>, "v_cmpx_le_u32">;
-defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4>, "v_cmpx_gt_u32">;
-defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5>, "v_cmpx_ne_u32">;
-defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6>, "v_cmpx_ge_u32">;
-defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7>, "v_cmpx_t_u32">;
+defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0, 0xd8>, "v_cmpx_f_u32">;
+defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32">;
+defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2, 0xda>, "v_cmpx_eq_u32">;
+defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32">;
+defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4, 0xdc>, "v_cmpx_gt_u32">;
+defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5, 0xdd>, "v_cmpx_ne_u32">;
+defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6, 0xde>, "v_cmpx_ge_u32">;
+defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7, 0xdf>, "v_cmpx_t_u32">;
 
 } // End hasSideEffects = 1
 
-defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0>, "v_cmp_f_u64">;
-defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1>, "v_cmp_lt_u64", COND_ULT>;
-defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2>, "v_cmp_eq_u64", COND_EQ>;
-defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3>, "v_cmp_le_u64", COND_ULE>;
-defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4>, "v_cmp_gt_u64", COND_UGT>;
-defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5>, "v_cmp_ne_u64", COND_NE>;
-defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6>, "v_cmp_ge_u64", COND_UGE>;
-defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7>, "v_cmp_t_u64">;
+defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0, 0xe8>, "v_cmp_f_u64">;
+defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT>;
+defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2, 0xea>, "v_cmp_eq_u64", COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE>;
+defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4, 0xec>, "v_cmp_gt_u64", COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5, 0xed>, "v_cmp_ne_u64", COND_NE>;
+defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6, 0xee>, "v_cmp_ge_u64", COND_UGE>;
+defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7, 0xef>, "v_cmp_t_u64">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0>, "v_cmpx_f_u64">;
-defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1>, "v_cmpx_lt_u64">;
-defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2>, "v_cmpx_eq_u64">;
-defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3>, "v_cmpx_le_u64">;
-defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4>, "v_cmpx_gt_u64">;
-defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5>, "v_cmpx_ne_u64">;
-defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6>, "v_cmpx_ge_u64">;
-defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7>, "v_cmpx_t_u64">;
+defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0, 0xf8>, "v_cmpx_f_u64">;
+defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64">;
+defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2, 0xfa>, "v_cmpx_eq_u64">;
+defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64">;
+defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4, 0xfc>, "v_cmpx_gt_u64">;
+defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5, 0xfd>, "v_cmpx_ne_u64">;
+defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6, 0xfe>, "v_cmpx_ge_u64">;
+defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7, 0xff>, "v_cmpx_t_u64">;
 
 } // End hasSideEffects = 1
 
-defm V_CMP_CLASS_F32 : VOPC_F32 <vopc<0x88>, "v_cmp_class_f32">;
+defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <vopc<0x88, 0x10>, "v_cmp_class_f32">;
 
 let hasSideEffects = 1 in {
-defm V_CMPX_CLASS_F32 : VOPCX_F32 <vopc<0x98>, "v_cmpx_class_f32">;
+defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <vopc<0x98, 0x11>, "v_cmpx_class_f32">;
 } // End hasSideEffects = 1
 
-defm V_CMP_CLASS_F64 : VOPC_F64 <vopc<0xa8>, "v_cmp_class_f64">;
+defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <vopc<0xa8, 0x12>, "v_cmp_class_f64">;
 
 let hasSideEffects = 1 in {
-defm V_CMPX_CLASS_F64 : VOPCX_F64 <vopc<0xb8>, "v_cmpx_class_f64">;
+defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">;
 } // End hasSideEffects = 1
 
 } // End isCompare = 1
@@ -735,88 +770,88 @@ defm V_CMPX_CLASS_F64 : VOPCX_F64 <vopc<0xb8>, "v_cmpx_class_f64">;
 //===----------------------------------------------------------------------===//
 
 
-def DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VReg_32>;
-def DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VReg_32>;
-def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VReg_32>;
-def DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VReg_32>;
-def DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VReg_32>;
-def DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VReg_32>;
-def DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VReg_32>;
-def DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VReg_32>;
-def DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VReg_32>;
-def DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VReg_32>;
-def DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VReg_32>;
-def DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VReg_32>;
-def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VReg_32>;
-def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VReg_32>;
-def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VReg_32>;
-def DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VReg_32>;
-def DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VReg_32>;
-
-def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VReg_32, "ds_add_u32">;
-def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VReg_32, "ds_sub_u32">;
-def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VReg_32, "ds_rsub_u32">;
-def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VReg_32, "ds_inc_u32">;
-def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VReg_32, "ds_dec_u32">;
-def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VReg_32, "ds_min_i32">;
-def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VReg_32, "ds_max_i32">;
-def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VReg_32, "ds_min_u32">;
-def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VReg_32, "ds_max_u32">;
-def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VReg_32, "ds_and_b32">;
-def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VReg_32, "ds_or_b32">;
-def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VReg_32, "ds_xor_b32">;
-def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VReg_32, "ds_mskor_b32">;
-def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VReg_32>;
-//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VReg_32, "ds_wrxchg2_b32">;
-//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VReg_32, "ds_wrxchg2st64_b32">;
-def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VReg_32, "ds_cmpst_b32">;
-def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VReg_32, "ds_cmpst_f32">;
-def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VReg_32, "ds_min_f32">;
-def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VReg_32, "ds_max_f32">;
+defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>;
+defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>;
+defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>;
+defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>;
+defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>;
+defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>;
+defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>;
+defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>;
+defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>;
+defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>;
+defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>;
+defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>;
+defm DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VGPR_32>;
+defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>;
+defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>;
+defm DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VGPR_32>;
+defm DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VGPR_32>;
+
+defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">;
+defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">;
+defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">;
+defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">;
+defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">;
+defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">;
+defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">;
+defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">;
+defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">;
+defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">;
+defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">;
+defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">;
+defm DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
+defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>;
+//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2_b32">;
+//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2st64_b32">;
+defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
+defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
+defm DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
+defm DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
 
 let SubtargetPredicate = isCI in {
-def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VReg_32, "ds_wrap_f32">;
+defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">;
 } // End isCI
 
 
-def DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>;
-def DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>;
-def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>;
-def DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>;
-def DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>;
-def DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>;
-def DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>;
-def DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>;
-def DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>;
-def DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>;
-def DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>;
-def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
-def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
-def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
-def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
-def DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>;
-def DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>;
-
-def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">;
-def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
-def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
-def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
-def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
-def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">;
-def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">;
-def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">;
-def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">;
-def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">;
-def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">;
-def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
-def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
-def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">;
+defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>;
+defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>;
+defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>;
+defm DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>;
+defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>;
+defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>;
+defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>;
+defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>;
+defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>;
+defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>;
+defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>;
+defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
+defm DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
+defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
+defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
+defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>;
+defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>;
+
+defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">;
+defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
+defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
+defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
+defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
+defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">;
+defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">;
+defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">;
+defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">;
+defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">;
+defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">;
+defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
+defm DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
+defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">;
 //def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2_b64">;
 //def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2st64_b64">;
-def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
-def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
-def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_f64", VReg_64, "ds_min_f64">;
-def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_f64", VReg_64, "ds_max_f64">;
+defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
+defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
+defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">;
+defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">;
 
 //let SubtargetPredicate = isCI in {
 // DS_CONDXCHG32_RTN_B64
@@ -825,139 +860,140 @@ def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_f64", VReg_64, "ds_max_f64">;
 
 // TODO: _SRC2_* forms
 
-def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VReg_32>;
-def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "ds_write_b8", VReg_32>;
-def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "ds_write_b16", VReg_32>;
-def DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "ds_write_b64", VReg_64>;
+defm DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VGPR_32>;
+defm DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "ds_write_b8", VGPR_32>;
+defm DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "ds_write_b16", VGPR_32>;
+defm DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "ds_write_b64", VReg_64>;
 
-def DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VReg_32>;
-def DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VReg_32>;
-def DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VReg_32>;
-def DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VReg_32>;
-def DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VReg_32>;
-def DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>;
+defm DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VGPR_32>;
+defm DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VGPR_32>;
+defm DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VGPR_32>;
+defm DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VGPR_32>;
+defm DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VGPR_32>;
+defm DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>;
 
 // 2 forms.
-def DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VReg_32>;
-def DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VReg_32>;
-def DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>;
-def DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>;
+defm DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VGPR_32>;
+defm DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VGPR_32>;
+defm DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>;
+defm DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>;
 
-def DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>;
-def DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>;
-def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "ds_read2_b64", VReg_128>;
-def DS_READ2ST64_B64 : DS_Load2_Helper <0x00000076, "ds_read2st64_b64", VReg_128>;
+defm DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>;
+defm DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>;
+defm DS_READ2_B64 : DS_Load2_Helper <0x00000075, "ds_read2_b64", VReg_128>;
+defm DS_READ2ST64_B64 : DS_Load2_Helper <0x00000076, "ds_read2st64_b64", VReg_128>;
 
 //===----------------------------------------------------------------------===//
 // MUBUF Instructions
 //===----------------------------------------------------------------------===//
 
-//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "buffer_load_format_x", []>;
-//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "buffer_load_format_xy", []>;
-//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "buffer_load_format_xyz", []>;
-defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "buffer_load_format_xyzw", VReg_128>;
-//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "buffer_store_format_x", []>;
-//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "buffer_store_format_xy", []>;
-//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "buffer_store_format_xyz", []>;
-//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "buffer_store_format_xyzw", []>;
+//def BUFFER_LOAD_FORMAT_X : MUBUF_ <mubuf<0x00>, "buffer_load_format_x", []>;
+//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <mubuf<0x01>, "buffer_load_format_xy", []>;
+//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <mubuf<0x02>, "buffer_load_format_xyz", []>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <mubuf<0x03>, "buffer_load_format_xyzw", VReg_128>;
+//def BUFFER_STORE_FORMAT_X : MUBUF_ <mubuf<0x04>, "buffer_store_format_x", []>;
+//def BUFFER_STORE_FORMAT_XY : MUBUF_ <mubuf<0x05>, "buffer_store_format_xy", []>;
+//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <mubuf<0x06>, "buffer_store_format_xyz", []>;
+//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <mubuf<0x07>, "buffer_store_format_xyzw", []>;
 defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
-  0x00000008, "buffer_load_ubyte", VReg_32, i32, az_extloadi8_global
+  mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global
 >;
 defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
-  0x00000009, "buffer_load_sbyte", VReg_32, i32, sextloadi8_global
+  mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global
 >;
 defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
-  0x0000000a, "buffer_load_ushort", VReg_32, i32, az_extloadi16_global
+  mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global
 >;
 defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
-  0x0000000b, "buffer_load_sshort", VReg_32, i32, sextloadi16_global
+  mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
 >;
 defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
-  0x0000000c, "buffer_load_dword", VReg_32, i32, global_load
+  mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load
 >;
 defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
-  0x0000000d, "buffer_load_dwordx2", VReg_64, v2i32, global_load
+  mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load
 >;
 defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
-  0x0000000e, "buffer_load_dwordx4", VReg_128, v4i32, global_load
+  mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load
 >;
 
 defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
-  0x00000018, "buffer_store_byte", VReg_32, i32, truncstorei8_global
+  mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global
 >;
 
 defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
-  0x0000001a, "buffer_store_short", VReg_32, i32, truncstorei16_global
+  mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global
 >;
 
 defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
-  0x0000001c, "buffer_store_dword", VReg_32, i32, global_store
+  mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store
 >;
 
 defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
-  0x0000001d, "buffer_store_dwordx2", VReg_64, v2i32, global_store
+  mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store
 >;
 
 defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
-  0x0000001e, "buffer_store_dwordx4", VReg_128, v4i32, global_store
+  mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store
 >;
-//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "buffer_atomic_swap", []>;
+
 defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
-  0x00000030, "buffer_atomic_swap", VReg_32, i32, atomic_swap_global
+  mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
 >;
-//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "buffer_atomic_cmpswap", []>;
+//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", []>;
 defm BUFFER_ATOMIC_ADD : MUBUF_Atomic <
-  0x00000032, "buffer_atomic_add", VReg_32, i32, atomic_add_global
+  mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global
 >;
 defm BUFFER_ATOMIC_SUB : MUBUF_Atomic <
-  0x00000033, "buffer_atomic_sub", VReg_32, i32, atomic_sub_global
+  mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
 >;
-//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "buffer_atomic_rsub", []>;
+//def BUFFER_ATOMIC_RSUB : MUBUF_ <mubuf<0x34>, "buffer_atomic_rsub", []>; // isn't on CI & VI
 defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic <
-  0x00000035, "buffer_atomic_smin", VReg_32, i32, atomic_min_global
+  mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
 >;
 defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic <
-  0x00000036, "buffer_atomic_umin", VReg_32, i32, atomic_umin_global
+  mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
 >;
 defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic <
-  0x00000037, "buffer_atomic_smax", VReg_32, i32, atomic_max_global
+  mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
 >;
 defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic <
-  0x00000038, "buffer_atomic_umax", VReg_32, i32, atomic_umax_global
+  mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
 >;
 defm BUFFER_ATOMIC_AND : MUBUF_Atomic <
-  0x00000039, "buffer_atomic_and", VReg_32, i32, atomic_and_global
+  mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global
 >;
 defm BUFFER_ATOMIC_OR : MUBUF_Atomic <
-  0x0000003a, "buffer_atomic_or", VReg_32, i32, atomic_or_global
+  mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global
 >;
 defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
-  0x0000003b, "buffer_atomic_xor", VReg_32, i32, atomic_xor_global
->;
-//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "buffer_atomic_inc", []>;
-//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "buffer_atomic_dec", []>;
-//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "buffer_atomic_fcmpswap", []>;
-//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "buffer_atomic_fmin", []>;
-//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "buffer_atomic_fmax", []>;
-//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "buffer_atomic_swap_x2", []>;
-//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "buffer_atomic_cmpswap_x2", []>;
-//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "buffer_atomic_add_x2", []>;
-//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "buffer_atomic_sub_x2", []>;
-//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "buffer_atomic_rsub_x2", []>;
-//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "buffer_atomic_smin_x2", []>;
-//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "buffer_atomic_umin_x2", []>;
-//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "buffer_atomic_smax_x2", []>;
-//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "buffer_atomic_umax_x2", []>;
-//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "buffer_atomic_and_x2", []>;
-//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "buffer_atomic_or_x2", []>;
-//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "buffer_atomic_xor_x2", []>;
-//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "buffer_atomic_inc_x2", []>;
-//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "buffer_atomic_dec_x2", []>;
-//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "buffer_atomic_fcmpswap_x2", []>;
-//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "buffer_atomic_fmin_x2", []>;
-//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "buffer_atomic_fmax_x2", []>;
-//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "buffer_wbinvl1_sc", []>;
-//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "buffer_wbinvl1", []>;
+  mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
+>;
+//def BUFFER_ATOMIC_INC : MUBUF_ <mubuf<0x3c, 0x4b>, "buffer_atomic_inc", []>;
+//def BUFFER_ATOMIC_DEC : MUBUF_ <mubuf<0x3d, 0x4c>, "buffer_atomic_dec", []>;
+//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
+//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>;
+//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>;
+//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>;
+//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>;
+//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
+//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", []>;
+//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", []>;
+//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", []>;
+//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", []>;
+//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <mubuf<0x59, 0x68>, "buffer_atomic_and_x2", []>;
+//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", []>;
+//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", []>;
+//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", []>;
+//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", []>;
+//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI
+//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI
+//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI
+//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>;
 
 //===----------------------------------------------------------------------===//
 // MTBUF Instructions
@@ -967,7 +1003,7 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
 //def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>;
 //def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>;
 defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VReg_32>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>;
 defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>;
 defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>;
 defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>;
@@ -1004,63 +1040,63 @@ defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
 //def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>;
 //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>;
 //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>;
-defm IMAGE_SAMPLE           : MIMG_Sampler <0x00000020, "image_sample">;
-defm IMAGE_SAMPLE_CL        : MIMG_Sampler <0x00000021, "image_sample_cl">;
+defm IMAGE_SAMPLE           : MIMG_Sampler_WQM <0x00000020, "image_sample">;
+defm IMAGE_SAMPLE_CL        : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
 defm IMAGE_SAMPLE_D         : MIMG_Sampler <0x00000022, "image_sample_d">;
 defm IMAGE_SAMPLE_D_CL      : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
 defm IMAGE_SAMPLE_L         : MIMG_Sampler <0x00000024, "image_sample_l">;
-defm IMAGE_SAMPLE_B         : MIMG_Sampler <0x00000025, "image_sample_b">;
-defm IMAGE_SAMPLE_B_CL      : MIMG_Sampler <0x00000026, "image_sample_b_cl">;
+defm IMAGE_SAMPLE_B         : MIMG_Sampler_WQM <0x00000025, "image_sample_b">;
+defm IMAGE_SAMPLE_B_CL      : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">;
 defm IMAGE_SAMPLE_LZ        : MIMG_Sampler <0x00000027, "image_sample_lz">;
-defm IMAGE_SAMPLE_C         : MIMG_Sampler <0x00000028, "image_sample_c">;
-defm IMAGE_SAMPLE_C_CL      : MIMG_Sampler <0x00000029, "image_sample_c_cl">;
+defm IMAGE_SAMPLE_C         : MIMG_Sampler_WQM <0x00000028, "image_sample_c">;
+defm IMAGE_SAMPLE_C_CL      : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">;
 defm IMAGE_SAMPLE_C_D       : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
 defm IMAGE_SAMPLE_C_D_CL    : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
 defm IMAGE_SAMPLE_C_L       : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
-defm IMAGE_SAMPLE_C_B       : MIMG_Sampler <0x0000002d, "image_sample_c_b">;
-defm IMAGE_SAMPLE_C_B_CL    : MIMG_Sampler <0x0000002e, "image_sample_c_b_cl">;
+defm IMAGE_SAMPLE_C_B       : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">;
+defm IMAGE_SAMPLE_C_B_CL    : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">;
 defm IMAGE_SAMPLE_C_LZ      : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
-defm IMAGE_SAMPLE_O         : MIMG_Sampler <0x00000030, "image_sample_o">;
-defm IMAGE_SAMPLE_CL_O      : MIMG_Sampler <0x00000031, "image_sample_cl_o">;
+defm IMAGE_SAMPLE_O         : MIMG_Sampler_WQM <0x00000030, "image_sample_o">;
+defm IMAGE_SAMPLE_CL_O      : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">;
 defm IMAGE_SAMPLE_D_O       : MIMG_Sampler <0x00000032, "image_sample_d_o">;
 defm IMAGE_SAMPLE_D_CL_O    : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
 defm IMAGE_SAMPLE_L_O       : MIMG_Sampler <0x00000034, "image_sample_l_o">;
-defm IMAGE_SAMPLE_B_O       : MIMG_Sampler <0x00000035, "image_sample_b_o">;
-defm IMAGE_SAMPLE_B_CL_O    : MIMG_Sampler <0x00000036, "image_sample_b_cl_o">;
+defm IMAGE_SAMPLE_B_O       : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">;
+defm IMAGE_SAMPLE_B_CL_O    : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">;
 defm IMAGE_SAMPLE_LZ_O      : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
-defm IMAGE_SAMPLE_C_O       : MIMG_Sampler <0x00000038, "image_sample_c_o">;
-defm IMAGE_SAMPLE_C_CL_O    : MIMG_Sampler <0x00000039, "image_sample_c_cl_o">;
+defm IMAGE_SAMPLE_C_O       : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">;
+defm IMAGE_SAMPLE_C_CL_O    : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">;
 defm IMAGE_SAMPLE_C_D_O     : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
 defm IMAGE_SAMPLE_C_D_CL_O  : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
 defm IMAGE_SAMPLE_C_L_O     : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
-defm IMAGE_SAMPLE_C_B_O     : MIMG_Sampler <0x0000003d, "image_sample_c_b_o">;
-defm IMAGE_SAMPLE_C_B_CL_O  : MIMG_Sampler <0x0000003e, "image_sample_c_b_cl_o">;
+defm IMAGE_SAMPLE_C_B_O     : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">;
+defm IMAGE_SAMPLE_C_B_CL_O  : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">;
 defm IMAGE_SAMPLE_C_LZ_O    : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
-defm IMAGE_GATHER4          : MIMG_Gather <0x00000040, "image_gather4">;
-defm IMAGE_GATHER4_CL       : MIMG_Gather <0x00000041, "image_gather4_cl">;
+defm IMAGE_GATHER4          : MIMG_Gather_WQM <0x00000040, "image_gather4">;
+defm IMAGE_GATHER4_CL       : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">;
 defm IMAGE_GATHER4_L        : MIMG_Gather <0x00000044, "image_gather4_l">;
-defm IMAGE_GATHER4_B        : MIMG_Gather <0x00000045, "image_gather4_b">;
-defm IMAGE_GATHER4_B_CL     : MIMG_Gather <0x00000046, "image_gather4_b_cl">;
+defm IMAGE_GATHER4_B        : MIMG_Gather_WQM <0x00000045, "image_gather4_b">;
+defm IMAGE_GATHER4_B_CL     : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">;
 defm IMAGE_GATHER4_LZ       : MIMG_Gather <0x00000047, "image_gather4_lz">;
-defm IMAGE_GATHER4_C        : MIMG_Gather <0x00000048, "image_gather4_c">;
-defm IMAGE_GATHER4_C_CL     : MIMG_Gather <0x00000049, "image_gather4_c_cl">;
+defm IMAGE_GATHER4_C        : MIMG_Gather_WQM <0x00000048, "image_gather4_c">;
+defm IMAGE_GATHER4_C_CL     : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">;
 defm IMAGE_GATHER4_C_L      : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
-defm IMAGE_GATHER4_C_B      : MIMG_Gather <0x0000004d, "image_gather4_c_b">;
-defm IMAGE_GATHER4_C_B_CL   : MIMG_Gather <0x0000004e, "image_gather4_c_b_cl">;
+defm IMAGE_GATHER4_C_B      : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">;
+defm IMAGE_GATHER4_C_B_CL   : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">;
 defm IMAGE_GATHER4_C_LZ     : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
-defm IMAGE_GATHER4_O        : MIMG_Gather <0x00000050, "image_gather4_o">;
-defm IMAGE_GATHER4_CL_O     : MIMG_Gather <0x00000051, "image_gather4_cl_o">;
+defm IMAGE_GATHER4_O        : MIMG_Gather_WQM <0x00000050, "image_gather4_o">;
+defm IMAGE_GATHER4_CL_O     : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">;
 defm IMAGE_GATHER4_L_O      : MIMG_Gather <0x00000054, "image_gather4_l_o">;
-defm IMAGE_GATHER4_B_O      : MIMG_Gather <0x00000055, "image_gather4_b_o">;
+defm IMAGE_GATHER4_B_O      : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">;
 defm IMAGE_GATHER4_B_CL_O   : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
 defm IMAGE_GATHER4_LZ_O     : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
-defm IMAGE_GATHER4_C_O      : MIMG_Gather <0x00000058, "image_gather4_c_o">;
-defm IMAGE_GATHER4_C_CL_O   : MIMG_Gather <0x00000059, "image_gather4_c_cl_o">;
+defm IMAGE_GATHER4_C_O      : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">;
+defm IMAGE_GATHER4_C_CL_O   : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">;
 defm IMAGE_GATHER4_C_L_O    : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
-defm IMAGE_GATHER4_C_B_O    : MIMG_Gather <0x0000005d, "image_gather4_c_b_o">;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "image_gather4_c_b_cl_o">;
+defm IMAGE_GATHER4_C_B_O    : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
 defm IMAGE_GATHER4_C_LZ_O   : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
-defm IMAGE_GET_LOD          : MIMG_Sampler <0x00000060, "image_get_lod">;
+defm IMAGE_GET_LOD          : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
 defm IMAGE_SAMPLE_CD        : MIMG_Sampler <0x00000068, "image_sample_cd">;
 defm IMAGE_SAMPLE_CD_CL     : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
 defm IMAGE_SAMPLE_C_CD      : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
@@ -1077,25 +1113,25 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o"
 //===----------------------------------------------------------------------===//
 
 let Predicates = [HasFlatAddressSpace] in {
-def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "flat_load_ubyte", VReg_32>;
-def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "flat_load_sbyte", VReg_32>;
-def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "flat_load_ushort", VReg_32>;
-def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "flat_load_sshort", VReg_32>;
-def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "flat_load_dword", VReg_32>;
+def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "flat_load_ubyte", VGPR_32>;
+def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "flat_load_sbyte", VGPR_32>;
+def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "flat_load_ushort", VGPR_32>;
+def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "flat_load_sshort", VGPR_32>;
+def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "flat_load_dword", VGPR_32>;
 def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "flat_load_dwordx2", VReg_64>;
 def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "flat_load_dwordx4", VReg_128>;
 def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "flat_load_dwordx3", VReg_96>;
 
 def FLAT_STORE_BYTE : FLAT_Store_Helper <
-  0x00000018, "flat_store_byte", VReg_32
+  0x00000018, "flat_store_byte", VGPR_32
 >;
 
 def FLAT_STORE_SHORT : FLAT_Store_Helper <
-  0x0000001a, "flat_store_short", VReg_32
+  0x0000001a, "flat_store_short", VGPR_32
 >;
 
 def FLAT_STORE_DWORD : FLAT_Store_Helper <
-  0x0000001c, "flat_store_dword", VReg_32
+  0x0000001c, "flat_store_dword", VGPR_32
 >;
 
 def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
@@ -1150,7 +1186,9 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
 // VOP1 Instructions
 //===----------------------------------------------------------------------===//
 
-//def V_NOP : VOP1_ <0x00000000, "v_nop", []>;
+let vdst = 0, src0 = 0 in {
+defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">;
+}
 
 let isMoveImm = 1 in {
 defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>;
@@ -1158,16 +1196,20 @@ defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>;
 
 let Uses = [EXEC] in {
 
+// FIXME: Specify SchedRW for READFIRSTLANE_B32
+
 def V_READFIRSTLANE_B32 : VOP1 <
   0x00000002,
   (outs SReg_32:$vdst),
-  (ins VReg_32:$src0),
+  (ins VGPR_32:$src0),
   "v_readfirstlane_b32 $vdst, $src0",
   []
 >;
 
 }
 
+let SchedRW = [WriteQuarterRate32] in {
+
 defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64",
   VOP_I32_F64, fp_to_sint
 >;
@@ -1193,9 +1235,11 @@ defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32",
 defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16",
   VOP_F32_I32, f16_to_fp
 >;
-//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "v_cvt_rpi_i32_f32", []>;
-//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "v_cvt_flr_i32_f32", []>;
-//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "v_cvt_off_f32_i4", []>;
+defm V_CVT_RPI_I32_F32 : VOP1Inst <vop1<0xc>, "v_cvt_rpi_i32_f32",
+  VOP_I32_F32, cvt_rpi_i32_f32>;
+defm V_CVT_FLR_I32_F32 : VOP1Inst <vop1<0xd>, "v_cvt_flr_i32_f32",
+  VOP_I32_F32, cvt_flr_i32_f32>;
+defm V_CVT_OFF_F32_I4 : VOP1Inst  <vop1<0x0e>, "v_cvt_off_f32_i4", VOP_F32_I32>;
 defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64",
   VOP_F32_F64, fround
 >;
@@ -1221,493 +1265,580 @@ defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32",
   VOP_F64_I32, uint_to_fp
 >;
 
-defm V_FRACT_F32 : VOP1Inst <vop1<0x20>, "v_fract_f32",
+} // let SchedRW = [WriteQuarterRate32]
+
+defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32",
   VOP_F32_F32, AMDGPUfract
 >;
-defm V_TRUNC_F32 : VOP1Inst <vop1<0x21>, "v_trunc_f32",
+defm V_TRUNC_F32 : VOP1Inst <vop1<0x21, 0x1c>, "v_trunc_f32",
   VOP_F32_F32, ftrunc
 >;
-defm V_CEIL_F32 : VOP1Inst <vop1<0x22>, "v_ceil_f32",
+defm V_CEIL_F32 : VOP1Inst <vop1<0x22, 0x1d>, "v_ceil_f32",
   VOP_F32_F32, fceil
 >;
-defm V_RNDNE_F32 : VOP1Inst <vop1<0x23>, "v_rndne_f32",
+defm V_RNDNE_F32 : VOP1Inst <vop1<0x23, 0x1e>, "v_rndne_f32",
   VOP_F32_F32, frint
 >;
-defm V_FLOOR_F32 : VOP1Inst <vop1<0x24>, "v_floor_f32",
+defm V_FLOOR_F32 : VOP1Inst <vop1<0x24, 0x1f>, "v_floor_f32",
   VOP_F32_F32, ffloor
 >;
-defm V_EXP_F32 : VOP1Inst <vop1<0x25>, "v_exp_f32",
+defm V_EXP_F32 : VOP1Inst <vop1<0x25, 0x20>, "v_exp_f32",
   VOP_F32_F32, fexp2
 >;
-defm V_LOG_CLAMP_F32 : VOP1Inst <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>;
-defm V_LOG_F32 : VOP1Inst <vop1<0x27>, "v_log_f32",
+
+let SchedRW = [WriteQuarterRate32] in {
+
+defm V_LOG_F32 : VOP1Inst <vop1<0x27, 0x21>, "v_log_f32",
   VOP_F32_F32, flog2
 >;
-
-defm V_RCP_CLAMP_F32 : VOP1Inst <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>;
-defm V_RCP_LEGACY_F32 : VOP1Inst <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>;
-defm V_RCP_F32 : VOP1Inst <vop1<0x2a>, "v_rcp_f32",
+defm V_RCP_F32 : VOP1Inst <vop1<0x2a, 0x22>, "v_rcp_f32",
   VOP_F32_F32, AMDGPUrcp
 >;
-defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b>, "v_rcp_iflag_f32", VOP_F32_F32>;
-defm V_RSQ_CLAMP_F32 : VOP1Inst <vop1<0x2c>, "v_rsq_clamp_f32",
-  VOP_F32_F32, AMDGPUrsq_clamped
+defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b, 0x23>, "v_rcp_iflag_f32",
+  VOP_F32_F32
 >;
-defm V_RSQ_LEGACY_F32 : VOP1Inst <vop1<0x2d>, "v_rsq_legacy_f32",
-  VOP_F32_F32, AMDGPUrsq_legacy
->;
-defm V_RSQ_F32 : VOP1Inst <vop1<0x2e>, "v_rsq_f32",
+defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32",
   VOP_F32_F32, AMDGPUrsq
 >;
-defm V_RCP_F64 : VOP1Inst <vop1<0x2f>, "v_rcp_f64",
+
+} //let SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+
+defm V_RCP_F64 : VOP1Inst <vop1<0x2f, 0x25>, "v_rcp_f64",
   VOP_F64_F64, AMDGPUrcp
 >;
-defm V_RCP_CLAMP_F64 : VOP1Inst <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>;
-defm V_RSQ_F64 : VOP1Inst <vop1<0x31>, "v_rsq_f64",
+defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64",
   VOP_F64_F64, AMDGPUrsq
 >;
-defm V_RSQ_CLAMP_F64 : VOP1Inst <vop1<0x32>, "v_rsq_clamp_f64",
-  VOP_F64_F64, AMDGPUrsq_clamped
->;
-defm V_SQRT_F32 : VOP1Inst <vop1<0x33>, "v_sqrt_f32",
+
+} // let SchedRW = [WriteDouble];
+
+defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32",
   VOP_F32_F32, fsqrt
 >;
-defm V_SQRT_F64 : VOP1Inst <vop1<0x34>, "v_sqrt_f64",
+
+let SchedRW = [WriteDouble] in {
+
+defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64",
   VOP_F64_F64, fsqrt
 >;
-defm V_SIN_F32 : VOP1Inst <vop1<0x35>, "v_sin_f32",
+
+} // let SchedRW = [WriteDouble]
+
+defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32",
   VOP_F32_F32, AMDGPUsin
 >;
-defm V_COS_F32 : VOP1Inst <vop1<0x36>, "v_cos_f32",
+defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32",
   VOP_F32_F32, AMDGPUcos
 >;
-defm V_NOT_B32 : VOP1Inst <vop1<0x37>, "v_not_b32", VOP_I32_I32>;
-defm V_BFREV_B32 : VOP1Inst <vop1<0x38>, "v_bfrev_b32", VOP_I32_I32>;
-defm V_FFBH_U32 : VOP1Inst <vop1<0x39>, "v_ffbh_u32", VOP_I32_I32>;
-defm V_FFBL_B32 : VOP1Inst <vop1<0x3a>, "v_ffbl_b32", VOP_I32_I32>;
-defm V_FFBH_I32 : VOP1Inst <vop1<0x3b>, "v_ffbh_i32", VOP_I32_I32>;
-//defm V_FREXP_EXP_I32_F64 : VOPInst <0x0000003c, "v_frexp_exp_i32_f64", VOP_I32_F32>;
-defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d>, "v_frexp_mant_f64", VOP_F64_F64>;
-defm V_FRACT_F64 : VOP1Inst <vop1<0x3e>, "v_fract_f64", VOP_F64_F64>;
-//defm V_FREXP_EXP_I32_F32 : VOPInst <0x0000003f, "v_frexp_exp_i32_f32", VOP_I32_F32>;
-defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40>, "v_frexp_mant_f32", VOP_F32_F32>;
-//def V_CLREXCP : VOP1_ <0x00000041, "v_clrexcp", []>;
-defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42>, "v_movreld_b32", VOP_I32_I32>;
-defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43>, "v_movrels_b32", VOP_I32_I32>;
-defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44>, "v_movrelsd_b32", VOP_I32_I32>;
+defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>;
+defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>;
+defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>;
+defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>;
+defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>;
+defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64",
+  VOP_I32_F64
+>;
+defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64",
+  VOP_F64_F64
+>;
+defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>;
+defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32",
+  VOP_I32_F32
+>;
+defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32",
+  VOP_F32_F32
+>;
+let vdst = 0, src0 = 0 in {
+defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [],
+  "v_clrexcp"
+>;
+}
+defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>;
+defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>;
+defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>;
+
+// These instruction only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
+let SchedRW = [WriteQuarterRate32] in {
+
+defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>;
+defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>;
+defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>;
+defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32",
+  VOP_F32_F32, AMDGPUrsq_clamped
+>;
+defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32",
+  VOP_F32_F32, AMDGPUrsq_legacy
+>;
+
+} // End let SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+
+defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>;
+defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64",
+  VOP_F64_F64, AMDGPUrsq_clamped
+>;
+
+} // End SchedRW = [WriteDouble]
 
+} // End SubtargetPredicate = isSICI
 
 //===----------------------------------------------------------------------===//
 // VINTRP Instructions
 //===----------------------------------------------------------------------===//
 
-def V_INTERP_P1_F32 : VINTRP <
-  0x00000000,
-  (outs VReg_32:$dst),
-  (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+// FIXME: Specify SchedRW for VINTRP insturctions.
+defm V_INTERP_P1_F32 : VINTRP_m <
+  0x00000000, "v_interp_p1_f32",
+  (outs VGPR_32:$dst),
+  (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
   "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [$m0]",
-  []> {
-  let DisableEncoding = "$m0";
-}
+  "$m0">;
 
-def V_INTERP_P2_F32 : VINTRP <
-  0x00000001,
-  (outs VReg_32:$dst),
-  (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+defm V_INTERP_P2_F32 : VINTRP_m <
+  0x00000001, "v_interp_p2_f32",
+  (outs VGPR_32:$dst),
+  (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
   "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]",
-  []> {
-
-  let Constraints = "$src0 = $dst";
-  let DisableEncoding = "$src0,$m0";
+  "$src0,$m0",
+  "$src0 = $dst">;
 
-}
-
-def V_INTERP_MOV_F32 : VINTRP <
-  0x00000002,
-  (outs VReg_32:$dst),
+defm V_INTERP_MOV_F32 : VINTRP_m <
+  0x00000002, "v_interp_mov_f32",
+  (outs VGPR_32:$dst),
   (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
   "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [$m0]",
-  []> {
-  let DisableEncoding = "$m0";
-}
+  "$m0">;
 
 //===----------------------------------------------------------------------===//
 // VOP2 Instructions
 //===----------------------------------------------------------------------===//
 
-def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
-  (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc),
-  "v_cndmask_b32_e32 $dst, $src0, $src1, [$vcc]",
-  []
->{
-  let DisableEncoding = "$vcc";
-}
-
-def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
+defm V_CNDMASK_B32_e64 : VOP3_m_nomods <vop3<0x100>, (outs VGPR_32:$dst),
   (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2),
   "v_cndmask_b32_e64 $dst, $src0, $src1, $src2",
-  [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))]
-> {
-  let src0_modifiers = 0;
-  let src1_modifiers = 0;
-  let src2_modifiers = 0;
-}
-
-def V_READLANE_B32 : VOP2 <
-  0x00000001,
-  (outs SReg_32:$vdst),
-  (ins VReg_32:$src0, SSrc_32:$vsrc1),
-  "v_readlane_b32 $vdst, $src0, $vsrc1",
-  []
+  [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))],
+  "v_cndmask_b32_e64", 3
 >;
 
-def V_WRITELANE_B32 : VOP2 <
-  0x00000002,
-  (outs VReg_32:$vdst),
-  (ins SReg_32:$src0, SSrc_32:$vsrc1),
-  "v_writelane_b32 $vdst, $src0, $vsrc1",
-  []
->;
 
 let isCommutable = 1 in {
-defm V_ADD_F32 : VOP2Inst <vop2<0x3>, "v_add_f32",
+defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32",
   VOP_F32_F32_F32, fadd
 >;
 
-defm V_SUB_F32 : VOP2Inst <vop2<0x4>, "v_sub_f32", VOP_F32_F32_F32, fsub>;
-defm V_SUBREV_F32 : VOP2Inst <vop2<0x5>, "v_subrev_f32",
+defm V_SUB_F32 : VOP2Inst <vop2<0x4, 0x2>, "v_sub_f32", VOP_F32_F32_F32, fsub>;
+defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32",
   VOP_F32_F32_F32, null_frag, "v_sub_f32"
 >;
 } // End isCommutable = 1
 
 let isCommutable = 1 in {
 
-defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32",
-  VOP_F32_F32_F32
->;
-
-defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7>, "v_mul_legacy_f32",
+defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32",
   VOP_F32_F32_F32, int_AMDGPU_mul
 >;
 
-defm V_MUL_F32 : VOP2Inst <vop2<0x8>, "v_mul_f32",
+defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32",
   VOP_F32_F32_F32, fmul
 >;
 
-defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9>, "v_mul_i32_i24",
+defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9, 0x6>, "v_mul_i32_i24",
   VOP_I32_I32_I32, AMDGPUmul_i24
 >;
-//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "v_mul_hi_i32_i24", []>;
-defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb>, "v_mul_u32_u24",
-  VOP_I32_I32_I32, AMDGPUmul_u24
->;
-//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "v_mul_hi_u32_u24", []>;
-
 
-defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32",
-  VOP_F32_F32_F32, AMDGPUfmin_legacy
+defm V_MUL_HI_I32_I24 : VOP2Inst <vop2<0xa,0x7>, "v_mul_hi_i32_i24",
+  VOP_I32_I32_I32
 >;
 
-defm V_MAX_LEGACY_F32 : VOP2Inst <vop2<0xe>, "v_max_legacy_f32",
-  VOP_F32_F32_F32, AMDGPUfmax_legacy
+defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb, 0x8>, "v_mul_u32_u24",
+  VOP_I32_I32_I32, AMDGPUmul_u24
 >;
 
-defm V_MIN_F32 : VOP2Inst <vop2<0xf>, "v_min_f32", VOP_F32_F32_F32, fminnum>;
-defm V_MAX_F32 : VOP2Inst <vop2<0x10>, "v_max_f32", VOP_F32_F32_F32, fmaxnum>;
-defm V_MIN_I32 : VOP2Inst <vop2<0x11>, "v_min_i32", VOP_I32_I32_I32, AMDGPUsmin>;
-defm V_MAX_I32 : VOP2Inst <vop2<0x12>, "v_max_i32", VOP_I32_I32_I32, AMDGPUsmax>;
-defm V_MIN_U32 : VOP2Inst <vop2<0x13>, "v_min_u32", VOP_I32_I32_I32, AMDGPUumin>;
-defm V_MAX_U32 : VOP2Inst <vop2<0x14>, "v_max_u32", VOP_I32_I32_I32, AMDGPUumax>;
+defm V_MUL_HI_U32_U24 : VOP2Inst <vop2<0xc,0x9>, "v_mul_hi_u32_u24",
+ VOP_I32_I32_I32
+>;
 
-defm V_LSHR_B32 : VOP2Inst <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32, srl>;
+defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32,
+  fminnum>;
+defm V_MAX_F32 : VOP2Inst <vop2<0x10, 0xb>, "v_max_f32", VOP_F32_F32_F32,
+  fmaxnum>;
+defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32>;
+defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32>;
+defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32>;
+defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32>;
 
 defm V_LSHRREV_B32 : VOP2Inst <
-  vop2<0x16>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32"
+  vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag,
+    "v_lshr_b32"
 >;
 
-defm V_ASHR_I32 : VOP2Inst <vop2<0x17>, "v_ashr_i32",
-  VOP_I32_I32_I32, sra
->;
 defm V_ASHRREV_I32 : VOP2Inst <
-  vop2<0x18>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32"
+  vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag,
+    "v_ashr_i32"
 >;
 
-let hasPostISelHook = 1 in {
-
-defm V_LSHL_B32 : VOP2Inst <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32, shl>;
-
-}
 defm V_LSHLREV_B32 : VOP2Inst <
-  vop2<0x1a>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32"
+  vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag,
+    "v_lshl_b32"
 >;
 
-defm V_AND_B32 : VOP2Inst <vop2<0x1b>, "v_and_b32",
-  VOP_I32_I32_I32, and>;
-defm V_OR_B32 : VOP2Inst <vop2<0x1c>, "v_or_b32",
-  VOP_I32_I32_I32, or
->;
-defm V_XOR_B32 : VOP2Inst <vop2<0x1d>, "v_xor_b32",
-  VOP_I32_I32_I32, xor
->;
-
-} // End isCommutable = 1
-
-defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32",
-  VOP_I32_I32_I32, AMDGPUbfm>;
+defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>;
+defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>;
+defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>;
 
-let isCommutable = 1 in {
-defm V_MAC_F32 : VOP2Inst <vop2<0x1f>, "v_mac_f32", VOP_F32_F32_F32>;
+defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>;
 } // End isCommutable = 1
 
-defm V_MADMK_F32 : VOP2Inst <vop2<0x20>, "v_madmk_f32", VOP_F32_F32_F32>;
+defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">;
 
 let isCommutable = 1 in {
-defm V_MADAK_F32 : VOP2Inst <vop2<0x21>, "v_madak_f32", VOP_F32_F32_F32>;
+defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">;
 } // End isCommutable = 1
 
-
-defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
-defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32",
-
-  VOP_I32_I32_I32
->;
-defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32",
-  VOP_I32_I32_I32
->;
-
 let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
 // No patterns so that the scalar instructions are always selected.
 // The scalar versions will be replaced with vector when needed later.
-defm V_ADD_I32 : VOP2bInst <vop2<0x25>, "v_add_i32",
+
+// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI,
+// but the VI instructions behave the same as the SI versions.
+defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32",
   VOP_I32_I32_I32, add
 >;
-defm V_SUB_I32 : VOP2bInst <vop2<0x26>, "v_sub_i32",
-  VOP_I32_I32_I32, sub
->;
-defm V_SUBREV_I32 : VOP2bInst <vop2<0x27>, "v_subrev_i32",
+defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>;
+
+defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32",
   VOP_I32_I32_I32, null_frag, "v_sub_i32"
 >;
 
 let Uses = [VCC] in { // Carry-in comes from VCC
-defm V_ADDC_U32 : VOP2bInst <vop2<0x28>, "v_addc_u32",
-  VOP_I32_I32_I32_VCC, adde
+defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32",
+  VOP_I32_I32_I32_VCC
 >;
-defm V_SUBB_U32 : VOP2bInst <vop2<0x29>, "v_subb_u32",
-  VOP_I32_I32_I32_VCC, sube
+defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32",
+  VOP_I32_I32_I32_VCC
 >;
-defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a>, "v_subbrev_u32",
+defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32",
   VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32"
 >;
 
 } // End Uses = [VCC]
 } // End isCommutable = 1, Defs = [VCC]
 
-defm V_LDEXP_F32 : VOP2Inst <vop2<0x2b>, "v_ldexp_f32",
+defm V_READLANE_B32 : VOP2SI_3VI_m <
+  vop3 <0x001, 0x289>,
+  "v_readlane_b32",
+  (outs SReg_32:$vdst),
+  (ins VGPR_32:$src0, SCSrc_32:$src1),
+  "v_readlane_b32 $vdst, $src0, $src1"
+>;
+
+defm V_WRITELANE_B32 : VOP2SI_3VI_m <
+  vop3 <0x002, 0x28a>,
+  "v_writelane_b32",
+  (outs VGPR_32:$vdst),
+  (ins SReg_32:$src0, SCSrc_32:$src1),
+  "v_writelane_b32 $vdst, $src0, $src1"
+>;
+
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
+defm V_MIN_LEGACY_F32 : VOP2InstSI <vop2<0xd>, "v_min_legacy_f32",
+  VOP_F32_F32_F32, AMDGPUfmin_legacy
+>;
+defm V_MAX_LEGACY_F32 : VOP2InstSI <vop2<0xe>, "v_max_legacy_f32",
+  VOP_F32_F32_F32, AMDGPUfmax_legacy
+>;
+
+let isCommutable = 1 in {
+defm V_LSHR_B32 : VOP2InstSI <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32>;
+defm V_ASHR_I32 : VOP2InstSI <vop2<0x17>, "v_ashr_i32", VOP_I32_I32_I32>;
+defm V_LSHL_B32 : VOP2InstSI <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32>;
+} // End isCommutable = 1
+} // End let SubtargetPredicate = SICI
+
+let isCommutable = 1 in {
+defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst <vop23<0x6, 0x28e>, "v_mac_legacy_f32",
+  VOP_F32_F32_F32
+>;
+} // End isCommutable = 1
+
+defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", VOP_I32_I32_I32,
+  AMDGPUbfm
+>;
+defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32",
+  VOP_I32_I32_I32
+>;
+defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32",
+  VOP_I32_I32_I32
+>;
+defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32",
+  VOP_I32_I32_I32
+>;
+defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32",
   VOP_F32_F32_I32, AMDGPUldexp
 >;
-////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "v_cvt_pkaccum_u8_f32", []>;
-////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "v_cvt_pknorm_i16_f32", []>;
-////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "v_cvt_pknorm_u16_f32", []>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <vop2<0x2f>, "v_cvt_pkrtz_f16_f32",
- VOP_I32_F32_F32, int_SI_packf16
+
+
+defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst <vop23<0x2c, 0x1f0>, "v_cvt_pkaccum_u8_f32",
+  VOP_I32_F32_I32>; // TODO: set "Uses = dst"
+
+defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst <vop23<0x2d, 0x294>, "v_cvt_pknorm_i16_f32",
+  VOP_I32_F32_F32
+>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst <vop23<0x2e, 0x295>, "v_cvt_pknorm_u16_f32",
+  VOP_I32_F32_F32
+>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 0x296>, "v_cvt_pkrtz_f16_f32",
+  VOP_I32_F32_F32, int_SI_packf16
+>;
+defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst <vop23<0x30, 0x297>, "v_cvt_pk_u16_u32",
+  VOP_I32_I32_I32
+>;
+defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst <vop23<0x31, 0x298>, "v_cvt_pk_i16_i32",
+  VOP_I32_I32_I32
 >;
-////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "v_cvt_pk_u16_u32", []>;
-////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "v_cvt_pk_i16_i32", []>;
 
 //===----------------------------------------------------------------------===//
 // VOP3 Instructions
 //===----------------------------------------------------------------------===//
 
 let isCommutable = 1 in {
-defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140>, "v_mad_legacy_f32",
+defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140, 0x1c0>, "v_mad_legacy_f32",
   VOP_F32_F32_F32_F32
 >;
 
-defm V_MAD_F32 : VOP3Inst <vop3<0x141>, "v_mad_f32",
+defm V_MAD_F32 : VOP3Inst <vop3<0x141, 0x1c1>, "v_mad_f32",
   VOP_F32_F32_F32_F32, fmad
 >;
 
-defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142>, "v_mad_i32_i24",
+defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142, 0x1c2>, "v_mad_i32_i24",
   VOP_I32_I32_I32_I32, AMDGPUmad_i24
 >;
-defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143>, "v_mad_u32_u24",
+defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24",
   VOP_I32_I32_I32_I32, AMDGPUmad_u24
 >;
 } // End isCommutable = 1
 
-defm V_CUBEID_F32 : VOP3Inst <vop3<0x144>, "v_cubeid_f32",
+defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32",
   VOP_F32_F32_F32_F32
 >;
-defm V_CUBESC_F32 : VOP3Inst <vop3<0x145>, "v_cubesc_f32",
+defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32",
   VOP_F32_F32_F32_F32
 >;
-defm V_CUBETC_F32 : VOP3Inst <vop3<0x146>, "v_cubetc_f32",
+defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32",
   VOP_F32_F32_F32_F32
 >;
-defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147>, "v_cubema_f32",
+defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32",
   VOP_F32_F32_F32_F32
 >;
-defm V_BFE_U32 : VOP3Inst <vop3<0x148>, "v_bfe_u32",
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32",
   VOP_I32_I32_I32_I32, AMDGPUbfe_u32
 >;
-defm V_BFE_I32 : VOP3Inst <vop3<0x149>, "v_bfe_i32",
+defm V_BFE_I32 : VOP3Inst <vop3<0x149, 0x1c9>, "v_bfe_i32",
   VOP_I32_I32_I32_I32, AMDGPUbfe_i32
 >;
-defm V_BFI_B32 : VOP3Inst <vop3<0x14a>, "v_bfi_b32",
+}
+
+defm V_BFI_B32 : VOP3Inst <vop3<0x14a, 0x1ca>, "v_bfi_b32",
   VOP_I32_I32_I32_I32, AMDGPUbfi
 >;
 
 let isCommutable = 1 in {
-defm V_FMA_F32 : VOP3Inst <vop3<0x14b>, "v_fma_f32",
+defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32",
   VOP_F32_F32_F32_F32, fma
 >;
-defm V_FMA_F64 : VOP3Inst <vop3<0x14c>, "v_fma_f64",
+defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 0x1cc>, "v_fma_f64",
   VOP_F64_F64_F64_F64, fma
 >;
 } // End isCommutable = 1
 
 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>;
-defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "v_alignbit_b32",
+defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e, 0x1ce>, "v_alignbit_b32",
   VOP_I32_I32_I32_I32
 >;
-defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f>, "v_alignbyte_b32",
+defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32",
   VOP_I32_I32_I32_I32
 >;
-defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
-  VOP_F32_F32_F32_F32>;
-defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "v_min3_f32",
+
+defm V_MIN3_F32 : VOP3Inst <vop3<0x151, 0x1d0>, "v_min3_f32",
   VOP_F32_F32_F32_F32, AMDGPUfmin3>;
 
-defm V_MIN3_I32 : VOP3Inst <vop3<0x152>, "v_min3_i32",
+defm V_MIN3_I32 : VOP3Inst <vop3<0x152, 0x1d1>, "v_min3_i32",
   VOP_I32_I32_I32_I32, AMDGPUsmin3
 >;
-defm V_MIN3_U32 : VOP3Inst <vop3<0x153>, "v_min3_u32",
+defm V_MIN3_U32 : VOP3Inst <vop3<0x153, 0x1d2>, "v_min3_u32",
   VOP_I32_I32_I32_I32, AMDGPUumin3
 >;
-defm V_MAX3_F32 : VOP3Inst <vop3<0x154>, "v_max3_f32",
+defm V_MAX3_F32 : VOP3Inst <vop3<0x154, 0x1d3>, "v_max3_f32",
   VOP_F32_F32_F32_F32, AMDGPUfmax3
 >;
-defm V_MAX3_I32 : VOP3Inst <vop3<0x155>, "v_max3_i32",
+defm V_MAX3_I32 : VOP3Inst <vop3<0x155, 0x1d4>, "v_max3_i32",
   VOP_I32_I32_I32_I32, AMDGPUsmax3
 >;
-defm V_MAX3_U32 : VOP3Inst <vop3<0x156>, "v_max3_u32",
+defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32",
   VOP_I32_I32_I32_I32, AMDGPUumax3
 >;
-//def V_MED3_F32 : VOP3_MED3 <0x00000157, "v_med3_f32", []>;
-//def V_MED3_I32 : VOP3_MED3 <0x00000158, "v_med3_i32", []>;
-//def V_MED3_U32 : VOP3_MED3 <0x00000159, "v_med3_u32", []>;
+defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32",
+  VOP_F32_F32_F32_F32
+>;
+defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32",
+  VOP_I32_I32_I32_I32
+>;
+defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32",
+  VOP_I32_I32_I32_I32
+>;
+
 //def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>;
 //def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>;
 //def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>;
-defm V_SAD_U32 : VOP3Inst <vop3<0x15d>, "v_sad_u32",
+defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32",
   VOP_I32_I32_I32_I32
 >;
 ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>;
 defm V_DIV_FIXUP_F32 : VOP3Inst <
-  vop3<0x15f>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
+  vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
 >;
+
+let SchedRW = [WriteDouble] in {
+
 defm V_DIV_FIXUP_F64 : VOP3Inst <
-  vop3<0x160>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
+  vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
 >;
 
-defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
-  VOP_I64_I64_I32, shl
->;
-defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64",
-  VOP_I64_I64_I32, srl
->;
-defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
-  VOP_I64_I64_I32, sra
->;
+} // let SchedRW = [WriteDouble]
 
+let SchedRW = [WriteDouble] in {
 let isCommutable = 1 in {
 
-defm V_ADD_F64 : VOP3Inst <vop3<0x164>, "v_add_f64",
+defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64",
   VOP_F64_F64_F64, fadd
 >;
-defm V_MUL_F64 : VOP3Inst <vop3<0x165>, "v_mul_f64",
+defm V_MUL_F64 : VOP3Inst <vop3<0x165, 0x281>, "v_mul_f64",
   VOP_F64_F64_F64, fmul
 >;
 
-defm V_MIN_F64 : VOP3Inst <vop3<0x166>, "v_min_f64",
+defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64",
   VOP_F64_F64_F64, fminnum
 >;
-defm V_MAX_F64 : VOP3Inst <vop3<0x167>, "v_max_f64",
+defm V_MAX_F64 : VOP3Inst <vop3<0x167, 0x283>, "v_max_f64",
   VOP_F64_F64_F64, fmaxnum
 >;
 
 } // isCommutable = 1
 
-defm V_LDEXP_F64 : VOP3Inst <vop3<0x168>, "v_ldexp_f64",
+defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64",
   VOP_F64_F64_I32, AMDGPUldexp
 >;
 
-let isCommutable = 1 in {
+} // let SchedRW = [WriteDouble]
+
+let isCommutable = 1, SchedRW = [WriteQuarterRate32] in {
 
-defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169>, "v_mul_lo_u32",
+defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32",
   VOP_I32_I32_I32
 >;
-defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a>, "v_mul_hi_u32",
+defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32",
   VOP_I32_I32_I32
 >;
-defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b>, "v_mul_lo_i32",
+
+defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32",
   VOP_I32_I32_I32
 >;
-defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c>, "v_mul_hi_i32",
+defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32",
   VOP_I32_I32_I32
 >;
 
-} // isCommutable = 1
+} // isCommutable = 1, SchedRW = [WriteQuarterRate32]
 
-defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d>, "v_div_scale_f32", []>;
+let SchedRW = [WriteFloatFMA, WriteSALU] in {
+defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>;
+}
 
+let SchedRW = [WriteDouble, WriteSALU] in {
 // Double precision division pre-scale.
-defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "v_div_scale_f64", []>;
+defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>;
+} // let SchedRW = [WriteDouble]
 
-let isCommutable = 1 in {
-defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "v_div_fmas_f32",
+let isCommutable = 1, Uses = [VCC] in {
+
+// v_div_fmas_f32:
+//   result = src0 * src1 + src2
+//   if (vcc)
+//     result *= 2^32
+//
+defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32",
   VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
 >;
-defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "v_div_fmas_f64",
+
+let SchedRW = [WriteDouble] in {
+// v_div_fmas_f64:
+//   result = src0 * src1 + src2
+//   if (vcc)
+//     result *= 2^64
+//
+defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64",
   VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
 >;
+
+} // End SchedRW = [WriteDouble]
 } // End isCommutable = 1
 
 //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
 //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
 //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>;
 
+let SchedRW = [WriteDouble] in {
 defm V_TRIG_PREOP_F64 : VOP3Inst <
-  vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
+  vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
 >;
 
-//===----------------------------------------------------------------------===//
-// Pseudo Instructions
-//===----------------------------------------------------------------------===//
+} // let SchedRW = [WriteDouble]
 
-let isCodeGenOnly = 1, isPseudo = 1 in {
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
 
-def V_MOV_I1 : InstSI <
-  (outs VReg_1:$dst),
-  (ins i1imm:$src),
-  "", [(set i1:$dst, (imm:$src))]
->;
+defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", VOP_I64_I64_I32>;
+defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", VOP_I64_I64_I32>;
+defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", VOP_I64_I64_I32>;
 
-def V_AND_I1 : InstSI <
-   (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
-   [(set i1:$dst, (and i1:$src0, i1:$src1))]
->;
+defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
+  VOP_F32_F32_F32_F32>;
 
-def V_OR_I1 : InstSI <
-   (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
-   [(set i1:$dst, (or i1:$src0, i1:$src1))]
->;
+} // End SubtargetPredicate = isSICI
+
+let SubtargetPredicate = isVI in {
 
-def V_XOR_I1 : InstSI <
-  (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
-  [(set i1:$dst, (xor i1:$src0, i1:$src1))]
+defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64",
+  VOP_I64_I32_I64
+>;
+defm V_LSHRREV_B64 : VOP3Inst <vop3<0, 0x290>, "v_lshrrev_b64",
+  VOP_I64_I32_I64
+>;
+defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64",
+  VOP_I64_I32_I64
 >;
 
+} // End SubtargetPredicate = isVI
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+let isCodeGenOnly = 1, isPseudo = 1 in {
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+// 64-bit vector move instruction.  This is mainly used by the SIFoldOperands
+// pass to enable folding of inline immediates.
+def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>;
+} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0
+
 let hasSideEffects = 1 in {
 def SGPR_USE : InstSI <(outs),(ins), "", []>;
 }
@@ -1785,12 +1916,12 @@ def SI_KILL : InstSI <
 
 let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
 
-//defm SI_ : RegisterLoadStore <VReg_32, FRAMEri, ADDRIndirect>;
+//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>;
 
 let UseNamedOperandTable = 1 in {
 
 def SI_RegisterLoad : InstSI <
-  (outs VReg_32:$dst, SReg_64:$temp),
+  (outs VGPR_32:$dst, SReg_64:$temp),
   (ins FRAMEri32:$addr, i32imm:$chan),
   "", []
 > {
@@ -1800,7 +1931,7 @@ def SI_RegisterLoad : InstSI <
 
 class SIRegStore<dag outs> : InstSI <
   outs,
-  (ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan),
+  (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan),
   "", []
 > {
   let isRegisterStore = 1;
@@ -1816,7 +1947,7 @@ def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
 } // End UseNamedOperandTable = 1
 
 def SI_INDIRECT_SRC : InstSI <
-  (outs VReg_32:$dst, SReg_64:$temp),
+  (outs VGPR_32:$dst, SReg_64:$temp),
   (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
   "si_indirect_src $dst, $temp, $src, $idx, $off",
   []
@@ -1824,14 +1955,14 @@ def SI_INDIRECT_SRC : InstSI <
 
 class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
   (outs rc:$dst, SReg_64:$temp),
-  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val),
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val),
   "si_indirect_dst $dst, $temp, $src, $idx, $off, $val",
   []
 > {
   let Constraints = "$src = $dst";
 }
 
-def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VReg_32>;
+def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
 def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
 def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
 def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
@@ -1839,31 +1970,22 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
 
 } // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
 
-let usesCustomInserter = 1 in {
-
-def V_SUB_F64 : InstSI <
-  (outs VReg_64:$dst),
-  (ins VReg_64:$src0, VReg_64:$src1),
-  "v_sub_f64 $dst, $src0, $src1",
-  [(set f64:$dst, (fsub f64:$src0, f64:$src1))]
->;
-
-} // end usesCustomInserter
-
 multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
 
-  def _SAVE : InstSI <
-    (outs),
-    (ins sgpr_class:$src, i32imm:$frame_idx),
-    "", []
-  >;
-
-  def _RESTORE : InstSI <
-    (outs sgpr_class:$dst),
-    (ins i32imm:$frame_idx),
-    "", []
-  >;
-
+  let UseNamedOperandTable = 1 in {
+    def _SAVE : InstSI <
+      (outs),
+      (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
+           SReg_32:$scratch_offset),
+      "", []
+    >;
+
+    def _RESTORE : InstSI <
+      (outs sgpr_class:$dst),
+      (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
+      "", []
+    >;
+  } // End UseNamedOperandTable = 1
 }
 
 defm SI_SPILL_S32  : SI_SPILL_SGPR <SReg_32>;
@@ -1873,20 +1995,23 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
 defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
 
 multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
-  def _SAVE : InstSI <
-    (outs),
-    (ins vgpr_class:$src, i32imm:$frame_idx),
-    "", []
-  >;
-
-  def _RESTORE : InstSI <
-    (outs vgpr_class:$dst),
-    (ins i32imm:$frame_idx),
-    "", []
-  >;
+  let UseNamedOperandTable = 1 in {
+    def _SAVE : InstSI <
+      (outs),
+      (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
+           SReg_32:$scratch_offset),
+      "", []
+    >;
+
+    def _RESTORE : InstSI <
+      (outs vgpr_class:$dst),
+      (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
+      "", []
+    >;
+  } // End UseNamedOperandTable = 1
 }
 
-defm SI_SPILL_V32  : SI_SPILL_VGPR <VReg_32>;
+defm SI_SPILL_V32  : SI_SPILL_VGPR <VGPR_32>;
 defm SI_SPILL_V64  : SI_SPILL_VGPR <VReg_64>;
 defm SI_SPILL_V96  : SI_SPILL_VGPR <VReg_96>;
 defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
@@ -1905,9 +2030,9 @@ def SI_CONSTDATA_PTR : InstSI <
 
 } // end IsCodeGenOnly, isPseudo
 
-} // end SubtargetPredicate = SI
+} // end SubtargetPredicate = isGCN
 
-let Predicates = [isSI] in {
+let Predicates = [isGCN] in {
 
 def : Pat<
   (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2),
@@ -1941,7 +2066,7 @@ def : Pat <
 
 multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
 
-  // 1. Offset as 8bit DWORD immediate
+  // 1. SI-CI: Offset as 8bit DWORD immediate
   def : Pat <
     (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))),
     (vt (Instr_IMM $sbase, (as_dword_i32imm $offset)))
@@ -1960,6 +2085,28 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
   >;
 }
 
+multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
+
+  // 1. VI: Offset as 20bit immediate in bytes
+  def : Pat <
+    (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))),
+    (vt (Instr_IMM $sbase, (as_i32imm $offset)))
+  >;
+
+  // 2. Offset loaded in an 32bit SGPR
+  def : Pat <
+    (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
+    (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
+  >;
+
+  // 3. No offset at all
+  def : Pat <
+    (constant_load i64:$sbase),
+    (vt (Instr_IMM $sbase, 0))
+  >;
+}
+
+let Predicates = [isSICI] in {
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
 defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
@@ -1967,6 +2114,19 @@ defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
 defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
 defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
 defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
+} // End Predicates = [isSICI]
+
+let Predicates = [isVI] in {
+defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
+} // End Predicates = [isVI]
+
+let Predicates = [isSICI] in {
 
 // 1. Offset as 8bit DWORD immediate
 def : Pat <
@@ -1974,14 +2134,14 @@ def : Pat <
   (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
 >;
 
+} // End Predicates = [isSICI]
+
 // 2. Offset loaded in an 32bit SGPR
 def : Pat <
   (SIload_constant v4i32:$sbase, imm:$offset),
   (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
 >;
 
-} // Predicates = [isSI] in {
-
 //===----------------------------------------------------------------------===//
 // SOP1 Patterns
 //===----------------------------------------------------------------------===//
@@ -2004,8 +2164,6 @@ def : Pat <
   (S_ADD_U32 $src0, $src1)
 >;
 
-let  Predicates = [isSI] in {
-
 //===----------------------------------------------------------------------===//
 // SOPP Patterns
 //===----------------------------------------------------------------------===//
@@ -2020,9 +2178,13 @@ def : Pat <
 //===----------------------------------------------------------------------===//
 
 let Predicates = [UnsafeFPMath] in {
-def : RcpPat<V_RCP_F64_e32, f64>;
-defm : RsqPat<V_RSQ_F64_e32, f64>;
-defm : RsqPat<V_RSQ_F32_e32, f32>;
+
+//def : RcpPat<V_RCP_F64_e32, f64>;
+//defm : RsqPat<V_RSQ_F64_e32, f64>;
+//defm : RsqPat<V_RSQ_F32_e32, f32>;
+
+def : RsqPat<V_RSQ_F32_e32, f32>;
+def : RsqPat<V_RSQ_F64_e32, f64>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2369,10 +2531,10 @@ foreach Index = 0-15 in {
 }
 
 def : BitConvert <i32, f32, SReg_32>;
-def : BitConvert <i32, f32, VReg_32>;
+def : BitConvert <i32, f32, VGPR_32>;
 
 def : BitConvert <f32, i32, SReg_32>;
-def : BitConvert <f32, i32, VReg_32>;
+def : BitConvert <f32, i32, VGPR_32>;
 
 def : BitConvert <i64, f64, VReg_64>;
 
@@ -2475,7 +2637,7 @@ def : Pat <
 
 def : Pat <
   (SGPRImm<(f32 fpimm)>:$imm),
-  (S_MOV_B32 fpimm:$imm)
+  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
 def : Pat <
@@ -2485,7 +2647,7 @@ def : Pat <
 
 def : Pat <
   (f32 fpimm:$imm),
-  (V_MOV_B32_e32 fpimm:$imm)
+  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
 def : Pat <
@@ -2493,21 +2655,38 @@ def : Pat <
   (S_MOV_B64 InlineImm<i64>:$imm)
 >;
 
+// XXX - Should this use a s_cmp to set SCC?
+
+// Set to sign-extended 64-bit value (true = -1, false = 0)
+def : Pat <
+  (i1 imm:$imm),
+  (S_MOV_B64 (i64 (as_i64imm $imm)))
+>;
+
+def : Pat <
+  (f64 InlineFPImm<f64>:$imm),
+  (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm)))
+>;
+
 /********** ===================== **********/
 /********** Interpolation Paterns **********/
 /********** ===================== **********/
 
+// The value of $params is constant through out the entire kernel.
+// We need to use S_MOV_B32 $params, because CSE ignores copies, so
+// without it we end up with a lot of redundant moves.
+
 def : Pat <
   (int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params),
-  (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, $params)
+  (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, (S_MOV_B32 $params))
 >;
 
 def : Pat <
-  (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, v2i32:$ij),
+  (int_SI_fs_interp imm:$attr_chan, imm:$attr, i32:$params, v2i32:$ij),
   (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0),
-                                    imm:$attr_chan, imm:$attr, i32:$params),
+                                    imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)),
                    (EXTRACT_SUBREG $ij, sub1),
-                   imm:$attr_chan, imm:$attr, $params)
+                   imm:$attr_chan, imm:$attr, (S_MOV_B32 $params))
 >;
 
 /********** ================== **********/
@@ -2522,13 +2701,6 @@ def : Pat <
   (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1))
 >;
 
-def : Pat<
-  (fdiv f64:$src0, f64:$src1),
-  (V_MUL_F64 0 /* src0_modifiers */, $src0,
-             0 /* src1_modifiers */, (V_RCP_F64_e32 $src1),
-             0 /* clamp */, 0 /* omod */)
->;
-
 def : Pat <
   (int_AMDGPU_cube v4f32:$src),
   (REG_SEQUENCE VReg_128,
@@ -2579,7 +2751,7 @@ def : Pat <
 
 def : Pat <
   (int_SI_tid),
-  (V_MBCNT_HI_U32_B32_e32 0xffffffff,
+  (V_MBCNT_HI_U32_B32_e64 0xffffffff,
                           (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0))
 >;
 
@@ -2600,9 +2772,6 @@ def : Pat <
   (V_MUL_HI_I32 $src0, $src1)
 >;
 
-def : Vop3ModPat<V_MAD_F32, VOP_F32_F32_F32_F32, AMDGPUmad>;
-
-
 defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
 def : ROTRPattern <V_ALIGNBIT_B32>;
 
@@ -2612,7 +2781,7 @@ def : ROTRPattern <V_ALIGNBIT_B32>;
 
 class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
   (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
-  (inst (i1 0), $ptr, (as_i16imm $offset))
+  (inst (i1 0), $ptr, (as_i16imm $offset), (S_MOV_B32 -1))
 >;
 
 def : DSReadPat <DS_READ_I8,  i32, sextloadi8_local>;
@@ -2630,12 +2799,12 @@ def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>;
 def : Pat <
   (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
                                                     i8:$offset1))),
-  (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1)
+  (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1, (S_MOV_B32 -1))
 >;
 
 class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
   (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
-  (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+  (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1))
 >;
 
 def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
@@ -2651,12 +2820,13 @@ def : Pat <
   (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
                                                             i8:$offset1)),
   (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0),
-                        (EXTRACT_SUBREG $value, sub1), $offset0, $offset1)
+                        (EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
+                        (S_MOV_B32 -1))
 >;
 
 class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
   (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
-  (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+  (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1))
 >;
 
 // Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
@@ -2672,13 +2842,13 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
 class DSAtomicIncRetPat<DS inst, ValueType vt,
                         Instruction LoadImm, PatFrag frag> : Pat <
   (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
-  (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset))
+  (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (S_MOV_B32 -1))
 >;
 
 
 class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
   (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
-  (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset))
+  (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset), (S_MOV_B32 -1))
 >;
 
 
@@ -2728,11 +2898,12 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
 multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
                               PatFrag constant_ld> {
   def : Pat <
-     (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))),
-     (Instr_ADDR64 $srsrc, $vaddr, $offset)
+     (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))),
+     (Instr_ADDR64 $srsrc, $vaddr, $soffset, $offset)
   >;
 }
 
+let Predicates = [isSICI] in {
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
@@ -2740,6 +2911,7 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
+} // End Predicates = [isSICI]
 
 class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
   (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
@@ -2785,9 +2957,9 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
 
   def : Pat <
     (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
-                                  imm, 1, 1, imm:$glc, imm:$slc,
+                                  imm:$offset, 1, 1, imm:$glc, imm:$slc,
                                   imm:$tfe)),
-    (bothen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
+    (bothen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
             (as_i1imm $tfe))
   >;
 }
@@ -2817,11 +2989,13 @@ class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
   (Instr $value, $srsrc, $vaddr, $offset)
 >;
 
+let Predicates = [isSICI] in {
 def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>;
 def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>;
 def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>;
 def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>;
 def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>;
+} // End Predicates = [isSICI]
 
 */
 
@@ -2848,20 +3022,6 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
 
 let SubtargetPredicate = isCI in {
 
-// Sea island new arithmetic instructinos
-defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64",
-  VOP_F64_F64, ftrunc
->;
-defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64",
-  VOP_F64_F64, fceil
->;
-defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64",
-  VOP_F64_F64, ffloor
->;
-defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64",
-  VOP_F64_F64, frint
->;
-
 defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8",
   VOP_I32_I32_I32
 >;
@@ -2890,8 +3050,6 @@ defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
 // S_CBRANCH_CDBGSYS_OR_USER
 // S_CBRANCH_CDBGSYS_AND_USER
 // S_DCACHE_INV_VOL
-// V_EXP_LEGACY_F32
-// V_LOG_LEGACY_F32
 // DS_NOP
 // DS_GWS_SEMA_RELEASE_ALL
 // DS_WRAP_RTN_B32
@@ -2904,7 +3062,7 @@ defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
 // BUFFER_LOAD_DWORDX3
 // BUFFER_STORE_DWORDX3
 
-} // End iSCI
+} // End isCI
 
 //===----------------------------------------------------------------------===//
 // Flat Patterns
@@ -3038,6 +3196,27 @@ def : Pat <
     (V_CNDMASK_B32_e64 0, -1, $src), sub1)
 >;
 
+// If we need to perform a logical operation on i1 values, we need to
+// use vector comparisons since there is only one SCC register. Vector
+// comparisions still write to a pair of SGPRs, so treat these as
+// 64-bit comparisons. When legalizing SGPR copies, instructions
+// resulting in the copies from SCC to these instructions will be
+// moved to the VALU.
+def : Pat <
+  (i1 (and i1:$src0, i1:$src1)),
+  (S_AND_B64 $src0, $src1)
+>;
+
+def : Pat <
+  (i1 (or i1:$src0, i1:$src1)),
+  (S_OR_B64 $src0, $src1)
+>;
+
+def : Pat <
+  (i1 (xor i1:$src0, i1:$src1)),
+  (S_XOR_B64 $src0, $src1)
+>;
+
 def : Pat <
   (f32 (sint_to_fp i1:$src)),
   (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
@@ -3050,7 +3229,7 @@ def : Pat <
 
 def : Pat <
   (f64 (sint_to_fp i1:$src)),
-    (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+  (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
 >;
 
 def : Pat <
@@ -3073,16 +3252,27 @@ def : Pat <
 >;
 
 def : Pat <
+  (i1 (trunc i64:$a)),
+  (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1),
+                    (EXTRACT_SUBREG $a, sub0)), 1)
+>;
+
+def : Pat <
   (i32 (bswap i32:$a)),
   (V_BFI_B32 (S_MOV_B32 0x00ff00ff),
              (V_ALIGNBIT_B32 $a, $a, 24),
              (V_ALIGNBIT_B32 $a, $a, 8))
 >;
 
+def : Pat <
+  (f32 (select i1:$src2, f32:$src1, f32:$src0)),
+  (V_CNDMASK_B32_e64 $src0, $src1, $src2)
+>;
+
 //============================================================================//
 // Miscellaneous Optimization Patterns
 //============================================================================//
 
 def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
 
-} // End isSI predicate
+} // End isGCN predicate
diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp
index 4140196..46630d0 100644
--- a/lib/Target/R600/SILoadStoreOptimizer.cpp
+++ b/lib/Target/R600/SILoadStoreOptimizer.cpp
@@ -55,7 +55,6 @@ namespace {
 
 class SILoadStoreOptimizer : public MachineFunctionPass {
 private:
-  const TargetMachine *TM;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
@@ -86,20 +85,11 @@ private:
 public:
   static char ID;
 
-  SILoadStoreOptimizer() :
-    MachineFunctionPass(ID),
-    TM(nullptr),
-    TII(nullptr),
-    TRI(nullptr),
-    MRI(nullptr),
-    LIS(nullptr) {
+  SILoadStoreOptimizer()
+      : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
+        LIS(nullptr) {}
 
-  }
-
-  SILoadStoreOptimizer(const TargetMachine &TM_) :
-    MachineFunctionPass(ID),
-    TM(&TM_),
-    TII(static_cast<const SIInstrInfo*>(TM->getSubtargetImpl()->getInstrInfo())) {
+  SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
   }
 
@@ -222,6 +212,7 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
   // Be careful, since the addresses could be subregisters themselves in weird
   // cases, like vectors of pointers.
   const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+  const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
 
   unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
   unsigned DestReg1
@@ -262,6 +253,7 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
     .addOperand(*AddrReg) // addr
     .addImm(NewOffset0) // offset0
     .addImm(NewOffset1) // offset1
+    .addOperand(*M0Reg) // M0
     .addMemOperand(*I->memoperands_begin())
     .addMemOperand(*Paired->memoperands_begin());
 
@@ -280,6 +272,18 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
   LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
   LIS->shrinkToUses(&AddrRegLI);
 
+  LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg());
+  LIS->shrinkToUses(&M0RegLI);
+
+  // Currently m0 is treated as a register class with one member instead of an
+  // implicit physical register. We are using the virtual register for the first
+  // one, but we still need to update the live range of the now unused second m0
+  // virtual register to avoid verifier errors.
+  const MachineOperand *PairedM0Reg
+    = TII->getNamedOperand(*Paired, AMDGPU::OpName::m0);
+  LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg());
+  LIS->shrinkToUses(&PairedM0RegLI);
+
   LIS->getInterval(DestReg); // Create new LI
 
   DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
@@ -295,6 +299,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
   // sure we preserve the subregister index and any register flags set on them.
   const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+  const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
   const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
   const MachineOperand *Data1
     = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
@@ -333,11 +338,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     .addOperand(*Data1) // data1
     .addImm(NewOffset0) // offset0
     .addImm(NewOffset1) // offset1
+    .addOperand(*M0Reg)  // m0
     .addMemOperand(*I->memoperands_begin())
     .addMemOperand(*Paired->memoperands_begin());
 
   // XXX - How do we express subregisters here?
-  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
+  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(),
+                          M0Reg->getReg()};
 
   LIS->RemoveMachineInstrFromMaps(I);
   LIS->RemoveMachineInstrFromMaps(Paired);
@@ -397,9 +404,9 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
 }
 
 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
-  const TargetSubtargetInfo *STM = MF.getTarget().getSubtargetImpl();
-  TRI = static_cast<const SIRegisterInfo*>(STM->getRegisterInfo());
-  TII = static_cast<const SIInstrInfo*>(STM->getInstrInfo());
+  const TargetSubtargetInfo &STM = MF.getSubtarget();
+  TRI = static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
+  TII = static_cast<const SIInstrInfo *>(STM.getInstrInfo());
   MRI = &MF.getRegInfo();
 
   LIS = &getAnalysis<LiveIntervals>();
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 9702565..2e08c9f 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -88,7 +88,6 @@ private:
   void Kill(MachineInstr &MI);
   void Branch(MachineInstr &MI);
 
-  void InitM0ForLDS(MachineBasicBlock::iterator MI);
   void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
   void IndirectSrc(MachineInstr &MI);
   void IndirectDst(MachineInstr &MI);
@@ -309,10 +308,9 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
 #endif
 
   // Clear this thread from the exec mask if the operand is negative
-  if ((Op.isImm() || Op.isFPImm())) {
+  if ((Op.isImm())) {
     // Constant operand: Set exec mask to 0 or do nothing
-    if (Op.isImm() ? (Op.getImm() & 0x80000000) :
-        Op.getFPImm()->isNegative()) {
+    if (Op.getImm() & 0x80000000) {
       BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
               .addImm(0);
     }
@@ -325,14 +323,6 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-/// The m0 register stores the maximum allowable address for LDS reads and
-/// writes.  Its value must be at least the size in bytes of LDS allocated by
-/// the shader.  For simplicity, we set it to the maximum possible value.
-void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
-    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),  TII->get(AMDGPU::S_MOV_B32),
-            AMDGPU::M0).addImm(0xffffffff);
-}
-
 void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
 
   MachineBasicBlock &MBB = *MI.getParent();
@@ -349,7 +339,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
   } else {
 
     assert(AMDGPU::SReg_64RegClass.contains(Save));
-    assert(AMDGPU::VReg_32RegClass.contains(Idx));
+    assert(AMDGPU::VGPR_32RegClass.contains(Idx));
 
     // Save the EXEC mask
     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
@@ -391,12 +381,6 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
             .addReg(Save);
 
   }
-  // FIXME: Are there any values other than the LDS address clamp that need to
-  // be stored in the m0 register and may be live for more than a few
-  // instructions?  If so, we should save the m0 register at the beginning
-  // of this function and restore it here.
-  // FIXME: Add support for LDS direct loads.
-  InitM0ForLDS(&MI);
   MI.eraseFromParent();
 }
 
@@ -450,7 +434,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   bool HaveKill = false;
-  bool NeedM0 = false;
   bool NeedWQM = false;
   bool NeedFlat = false;
   unsigned Depth = 0;
@@ -464,16 +447,12 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
       Next = std::next(I);
 
       MachineInstr &MI = *I;
-      if (TII->isDS(MI.getOpcode())) {
-        NeedM0 = true;
+      if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode()))
         NeedWQM = true;
-      }
 
       // Flat uses m0 in case it needs to access LDS.
-      if (TII->isFLAT(MI.getOpcode())) {
-        NeedM0 = true;
+      if (TII->isFLAT(MI.getOpcode()))
         NeedFlat = true;
-      }
 
       switch (MI.getOpcode()) {
         default: break;
@@ -534,23 +513,10 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
         case AMDGPU::SI_INDIRECT_DST_V16:
           IndirectDst(MI);
           break;
-
-        case AMDGPU::V_INTERP_P1_F32:
-        case AMDGPU::V_INTERP_P2_F32:
-        case AMDGPU::V_INTERP_MOV_F32:
-          NeedWQM = true;
-          break;
       }
     }
   }
 
-  if (NeedM0) {
-    MachineBasicBlock &MBB = MF.front();
-    // Initialize M0 to a value that won't cause LDS access to be discarded
-    // due to offset clamping
-    InitM0ForLDS(MBB.getFirstNonPHI());
-  }
-
   if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
     MachineBasicBlock &MBB = MF.front();
     BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp
index 65b892c..67421e2 100644
--- a/lib/Target/R600/SILowerI1Copies.cpp
+++ b/lib/Target/R600/SILowerI1Copies.cpp
@@ -85,30 +85,6 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
-      if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
-        I1Defs.push_back(MI.getOperand(0).getReg());
-        MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
-        continue;
-      }
-
-      if (MI.getOpcode() == AMDGPU::V_AND_I1) {
-        I1Defs.push_back(MI.getOperand(0).getReg());
-        MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32));
-        continue;
-      }
-
-      if (MI.getOpcode() == AMDGPU::V_OR_I1) {
-        I1Defs.push_back(MI.getOperand(0).getReg());
-        MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32));
-        continue;
-      }
-
-      if (MI.getOpcode() == AMDGPU::V_XOR_I1) {
-        I1Defs.push_back(MI.getOperand(0).getReg());
-        MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e32));
-        continue;
-      }
-
       if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
         unsigned Reg = MI.getOperand(0).getReg();
         const TargetRegisterClass *RC = MRI.getRegClass(Reg);
@@ -117,39 +93,59 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
         continue;
       }
 
-      if (MI.getOpcode() != AMDGPU::COPY ||
-          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
-          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
+      if (MI.getOpcode() != AMDGPU::COPY)
         continue;
 
+      const MachineOperand &Dst = MI.getOperand(0);
+      const MachineOperand &Src = MI.getOperand(1);
+
+      if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
+          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+        continue;
 
-      const TargetRegisterClass *DstRC =
-          MRI.getRegClass(MI.getOperand(0).getReg());
-      const TargetRegisterClass *SrcRC =
-          MRI.getRegClass(MI.getOperand(1).getReg());
+      const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
+      const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
 
       if (DstRC == &AMDGPU::VReg_1RegClass &&
           TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
-        I1Defs.push_back(MI.getOperand(0).getReg());
-        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64))
-                .addOperand(MI.getOperand(0))
-                .addImm(0)
-                .addImm(-1)
-                .addOperand(MI.getOperand(1));
+        I1Defs.push_back(Dst.getReg());
+        DebugLoc DL = MI.getDebugLoc();
+
+        MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
+        if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
+          if (DefInst->getOperand(1).isImm()) {
+            I1Defs.push_back(Dst.getReg());
+
+            int64_t Val = DefInst->getOperand(1).getImm();
+            assert(Val == 0 || Val == -1);
+
+            BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
+              .addOperand(Dst)
+              .addImm(Val);
+            MI.eraseFromParent();
+            continue;
+          }
+        }
+
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
+          .addOperand(Dst)
+          .addImm(0)
+          .addImm(-1)
+          .addOperand(Src);
         MI.eraseFromParent();
       } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
                  SrcRC == &AMDGPU::VReg_1RegClass) {
         BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
-                .addOperand(MI.getOperand(0))
-                .addOperand(MI.getOperand(1))
-                .addImm(0);
+          .addOperand(Dst)
+          .addOperand(Src)
+          .addImm(0);
         MI.eraseFromParent();
       }
     }
   }
 
   for (unsigned Reg : I1Defs)
-    MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass);
+    MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
 
   return false;
 }
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index d58f31d..587ea63 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -29,6 +29,7 @@ void SIMachineFunctionInfo::anchor() {}
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
     TIDReg(AMDGPU::NoRegister),
+    HasSpilledVGPRs(false),
     PSInputAddr(0),
     NumUserSGPRs(0),
     LDSWaveSpillSize(0) { }
@@ -38,8 +39,8 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
                                                        unsigned FrameIndex,
                                                        unsigned SubIdx) {
   const MachineFrameInfo *FrameInfo = MF->getFrameInfo();
-  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(
-      MF->getTarget().getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
+      MF->getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
   MachineRegisterInfo &MRI = MF->getRegInfo();
   int64_t Offset = FrameInfo->getObjectOffset(FrameIndex);
   Offset += SubIdx * 4;
@@ -50,7 +51,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
   struct SpilledReg Spill;
 
   if (!LaneVGPRs.count(LaneVGPRIdx)) {
-    unsigned LaneVGPR = TRI->findUnusedVGPR(MRI);
+    unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
     LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
     MRI.setPhysRegUsed(LaneVGPR);
 
@@ -69,7 +70,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
 
 unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
                                               const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getTarget().getSubtarget<AMDGPUSubtarget>();
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
   // FIXME: We should get this information from kernel attributes if it
   // is available.
   return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize();
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 6bb8f9d..667da4c 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -29,6 +29,7 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
   void anchor() override;
 
   unsigned TIDReg;
+  bool HasSpilledVGPRs;
 
 public:
 
@@ -49,9 +50,12 @@ public:
   unsigned NumUserSGPRs;
   std::map<unsigned, unsigned> LaneVGPRs;
   unsigned LDSWaveSpillSize;
+  unsigned ScratchOffsetReg;
   bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
   unsigned getTIDReg() const { return TIDReg; };
   void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+  bool hasSpilledVGPRs() const { return HasSpilledVGPRs; }
+  void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; }
 
   unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
 };
diff --git a/lib/Target/R600/SIPrepareScratchRegs.cpp b/lib/Target/R600/SIPrepareScratchRegs.cpp
new file mode 100644
index 0000000..0a57a5b
--- /dev/null
+++ b/lib/Target/R600/SIPrepareScratchRegs.cpp
@@ -0,0 +1,208 @@
+//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This pass loads scratch pointer and scratch offset into a register or a
+/// frame index which can be used anywhere in the program.  These values will
+/// be used for spilling VGPRs.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+
+using namespace llvm;
+
+namespace {
+
+class SIPrepareScratchRegs : public MachineFunctionPass {
+
+private:
+  static char ID;
+
+public:
+  SIPrepareScratchRegs() : MachineFunctionPass(ID) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI prepare scratch registers";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SIPrepareScratchRegs::ID = 0;
+
+FunctionPass *llvm::createSIPrepareScratchRegs() {
+  return new SIPrepareScratchRegs();
+}
+
+bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+  MachineBasicBlock *Entry = MF.begin();
+  MachineBasicBlock::iterator I = Entry->begin();
+  DebugLoc DL = I->getDebugLoc();
+
+  // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to
+  // run this pass.
+  if (!MFI->hasSpilledVGPRs())
+    return false;
+
+  unsigned ScratchPtrPreloadReg =
+      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
+  unsigned ScratchOffsetPreloadReg =
+      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
+  if (!Entry->isLiveIn(ScratchPtrPreloadReg))
+    Entry->addLiveIn(ScratchPtrPreloadReg);
+
+  if (!Entry->isLiveIn(ScratchOffsetPreloadReg))
+    Entry->addLiveIn(ScratchOffsetPreloadReg);
+
+  // Load the scratch offset.
+  unsigned ScratchOffsetReg =
+      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass);
+  int ScratchOffsetFI = -1;
+
+  if (ScratchOffsetReg != AMDGPU::NoRegister) {
+    // Found an SGPR to use
+    MRI.setPhysRegUsed(ScratchOffsetReg);
+    BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
+            .addReg(ScratchOffsetPreloadReg);
+  } else {
+    // No SGPR is available, we must spill.
+    ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4);
+    BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE))
+            .addReg(ScratchOffsetPreloadReg)
+            .addFrameIndex(ScratchOffsetFI)
+            .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+            .addReg(AMDGPU::SGPR0, RegState::Undef);
+  }
+
+
+  // Now that we have the scratch pointer and offset values, we need to
+  // add them to all the SI_SPILL_V* instructions.
+
+  RegScavenger RS;
+  unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4);
+  RS.addScavengingFrameIndex(ScratchRsrcFI);
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    // Add the scratch offset reg as a live-in so that the register scavenger
+    // doesn't re-use it.
+    if (!MBB.isLiveIn(ScratchOffsetReg) &&
+        ScratchOffsetReg != AMDGPU::NoRegister)
+      MBB.addLiveIn(ScratchOffsetReg);
+    RS.enterBasicBlock(&MBB);
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      MachineInstr &MI = *I;
+      RS.forward(I);
+      DebugLoc DL = MI.getDebugLoc();
+      switch(MI.getOpcode()) {
+        default: break;
+        case AMDGPU::SI_SPILL_V512_SAVE:
+        case AMDGPU::SI_SPILL_V256_SAVE:
+        case AMDGPU::SI_SPILL_V128_SAVE:
+        case AMDGPU::SI_SPILL_V96_SAVE:
+        case AMDGPU::SI_SPILL_V64_SAVE:
+        case AMDGPU::SI_SPILL_V32_SAVE:
+        case AMDGPU::SI_SPILL_V32_RESTORE:
+        case AMDGPU::SI_SPILL_V64_RESTORE:
+        case AMDGPU::SI_SPILL_V128_RESTORE:
+        case AMDGPU::SI_SPILL_V256_RESTORE:
+        case AMDGPU::SI_SPILL_V512_RESTORE:
+
+          // Scratch resource
+          unsigned ScratchRsrcReg =
+              RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0);
+
+          uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+                          0xffffffff; // Size
+
+          unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+          unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+          unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+          unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+          BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
+                  .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+          BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
+                  .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+          BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
+                  .addImm(Rsrc & 0xffffffff)
+                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+          BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
+                  .addImm(Rsrc >> 32)
+                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+          // Scratch Offset
+          if (ScratchOffsetReg == AMDGPU::NoRegister) {
+            ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+            BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE),
+                    ScratchOffsetReg)
+                    .addFrameIndex(ScratchOffsetFI)
+                    .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+                    .addReg(AMDGPU::SGPR0, RegState::Undef);
+          } else if (!MBB.isLiveIn(ScratchOffsetReg)) {
+            MBB.addLiveIn(ScratchOffsetReg);
+          }
+
+          if (ScratchRsrcReg == AMDGPU::NoRegister ||
+              ScratchOffsetReg == AMDGPU::NoRegister) {
+            LLVMContext &Ctx = MF.getFunction()->getContext();
+            Ctx.emitError("ran out of SGPRs for spilling VGPRs");
+            ScratchRsrcReg = AMDGPU::SGPR0;
+            ScratchOffsetReg = AMDGPU::SGPR0;
+          }
+          MI.getOperand(2).setReg(ScratchRsrcReg);
+          MI.getOperand(2).setIsKill(true);
+          MI.getOperand(2).setIsUndef(false);
+          MI.getOperand(3).setReg(ScratchOffsetReg);
+          MI.getOperand(3).setIsUndef(false);
+          MI.getOperand(3).setIsKill(false);
+          MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true));
+          MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true));
+          MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true));
+          MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true));
+
+          break;
+      }
+    }
+  }
+  return true;
+}
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index cffea12..9224e14 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -40,6 +40,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
 
   Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
   Reserved.set(AMDGPU::FLAT_SCR);
+  Reserved.set(AMDGPU::FLAT_SCR_LO);
+  Reserved.set(AMDGPU::FLAT_SCR_HI);
 
   // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs
   Reserved.set(AMDGPU::VGPR255);
@@ -48,9 +50,32 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   return Reserved;
 }
 
-unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
-                                             MachineFunction &MF) const {
-  return RC->getNumRegs();
+unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
+
+  // FIXME: We should adjust the max number of waves based on LDS size.
+  unsigned SGPRLimit = getNumSGPRsAllowed(ST.getMaxWavesPerCU());
+  unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU());
+
+  for (regclass_iterator I = regclass_begin(), E = regclass_end();
+       I != E; ++I) {
+
+    unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1);
+    unsigned Limit;
+
+    if (isSGPRClass(*I)) {
+      Limit = SGPRLimit / NumSubRegs;
+    } else {
+      Limit = VGPRLimit / NumSubRegs;
+    }
+
+    const int *Sets = getRegClassPressureSets(*I);
+    assert(Sets);
+    for (unsigned i = 0; Sets[i] != -1; ++i) {
+	    if (Sets[i] == (int)Idx)
+        return Limit;
+    }
+  }
+  return 256;
 }
 
 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
@@ -92,6 +117,60 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
   }
 }
 
+void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
+                                           unsigned LoadStoreOp,
+                                           unsigned Value,
+                                           unsigned ScratchRsrcReg,
+                                           unsigned ScratchOffset,
+                                           int64_t Offset,
+                                           RegScavenger *RS) const {
+
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+  MachineBasicBlock *MBB = MI->getParent();
+  const MachineFunction *MF = MI->getParent()->getParent();
+  LLVMContext &Ctx = MF->getFunction()->getContext();
+  DebugLoc DL = MI->getDebugLoc();
+  bool IsLoad = TII->get(LoadStoreOp).mayLoad();
+
+  bool RanOutOfSGPRs = false;
+  unsigned SOffset = ScratchOffset;
+
+  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+  unsigned Size = NumSubRegs * 4;
+
+  if (!isUInt<12>(Offset + Size)) {
+    SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
+    if (SOffset == AMDGPU::NoRegister) {
+      RanOutOfSGPRs = true;
+      SOffset = AMDGPU::SGPR0;
+    }
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
+            .addReg(ScratchOffset)
+            .addImm(Offset);
+    Offset = 0;
+  }
+
+  if (RanOutOfSGPRs)
+    Ctx.emitError("Ran out of SGPRs for spilling VGPRS");
+
+  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
+    unsigned SubReg = NumSubRegs > 1 ?
+        getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
+        Value;
+    bool IsKill = (i == e - 1);
+
+    BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
+            .addReg(SubReg, getDefRegState(IsLoad))
+            .addReg(ScratchRsrcReg, getKillRegState(IsKill))
+            .addImm(Offset)
+            .addReg(SOffset)
+            .addImm(0) // glc
+            .addImm(0) // slc
+            .addImm(0) // tfe
+            .addReg(Value, RegState::Implicit | getDefRegState(IsLoad));
+  }
+}
+
 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
@@ -125,7 +204,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
            Ctx.emitError("Ran out of VGPRs for spilling SGPR");
         }
 
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
+        BuildMI(*MBB, MI, DL,
+                TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+                Spill.VGPR)
                 .addReg(SubReg)
                 .addImm(Spill.Lane);
 
@@ -154,13 +235,15 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
            Ctx.emitError("Ran out of VGPRs for spilling SGPR");
         }
 
-        if (isM0) {
+        if (isM0)
           SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
-        }
 
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
+        BuildMI(*MBB, MI, DL,
+                TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+                SubReg)
                 .addReg(Spill.VGPR)
-                .addImm(Spill.Lane);
+                .addImm(Spill.Lane)
+                .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
         if (isM0) {
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
                   .addReg(SubReg);
@@ -177,71 +260,25 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     case AMDGPU::SI_SPILL_V128_SAVE:
     case AMDGPU::SI_SPILL_V96_SAVE:
     case AMDGPU::SI_SPILL_V64_SAVE:
-    case AMDGPU::SI_SPILL_V32_SAVE: {
-      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
-      unsigned SrcReg = MI->getOperand(0).getReg();
-      int64_t Offset = FrameInfo->getObjectOffset(Index);
-      unsigned Size = NumSubRegs * 4;
-      unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-
-      for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
-        unsigned SubReg = NumSubRegs > 1 ?
-            getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) :
-            SrcReg;
-        Offset += (i * 4);
-        MFI->LDSWaveSpillSize = std::max((unsigned)Offset + 4, (unsigned)MFI->LDSWaveSpillSize);
-
-        unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg,
-                                                         Offset, Size);
-
-        if (AddrReg == AMDGPU::NoRegister) {
-           LLVMContext &Ctx = MF->getFunction()->getContext();
-           Ctx.emitError("Ran out of VGPRs for spilling VGPRS");
-           AddrReg = AMDGPU::VGPR0;
-        }
-
-        // Store the value in LDS
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_WRITE_B32))
-                .addImm(0) // gds
-                .addReg(AddrReg, RegState::Kill) // addr
-                .addReg(SubReg) // data0
-                .addImm(0); // offset
-      }
-
+    case AMDGPU::SI_SPILL_V32_SAVE:
+      buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+            TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
+             FrameInfo->getObjectOffset(Index), RS);
       MI->eraseFromParent();
       break;
-    }
     case AMDGPU::SI_SPILL_V32_RESTORE:
     case AMDGPU::SI_SPILL_V64_RESTORE:
+    case AMDGPU::SI_SPILL_V96_RESTORE:
     case AMDGPU::SI_SPILL_V128_RESTORE:
     case AMDGPU::SI_SPILL_V256_RESTORE:
     case AMDGPU::SI_SPILL_V512_RESTORE: {
-      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
-      unsigned DstReg = MI->getOperand(0).getReg();
-      int64_t Offset = FrameInfo->getObjectOffset(Index);
-      unsigned Size = NumSubRegs * 4;
-      unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-
-      // FIXME: We could use DS_READ_B64 here to optimize for larger registers.
-      for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
-        unsigned SubReg = NumSubRegs > 1 ?
-            getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) :
-            DstReg;
-
-        Offset += (i * 4);
-        unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg,
-                                                          Offset, Size);
-        if (AddrReg == AMDGPU::NoRegister) {
-           LLVMContext &Ctx = MF->getFunction()->getContext();
-           Ctx.emitError("Ran out of VGPRs for spilling VGPRs");
-           AddrReg = AMDGPU::VGPR0;
-        }
-
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_READ_B32), SubReg)
-                .addImm(0) // gds
-                .addReg(AddrReg, RegState::Kill) // addr
-                .addImm(0); //offset
-      }
+      buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+            TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
+            FrameInfo->getObjectOffset(Index), RS);
       MI->eraseFromParent();
       break;
     }
@@ -250,11 +287,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       int64_t Offset = FrameInfo->getObjectOffset(Index);
       FIOp.ChangeToImmediate(Offset);
       if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
-        unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj);
+        unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj);
         BuildMI(*MBB, MI, MI->getDebugLoc(),
                 TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
                 .addImm(Offset);
-        FIOp.ChangeToRegister(TmpReg, false);
+        FIOp.ChangeToRegister(TmpReg, false, false, true);
       }
     }
   }
@@ -264,7 +301,7 @@ const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
                                                                    MVT VT) const {
   switch(VT.SimpleTy) {
     default:
-    case MVT::i32: return &AMDGPU::VReg_32RegClass;
+    case MVT::i32: return &AMDGPU::VGPR_32RegClass;
   }
 }
 
@@ -276,7 +313,7 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
   assert(!TargetRegisterInfo::isVirtualRegister(Reg));
 
   static const TargetRegisterClass *BaseClasses[] = {
-    &AMDGPU::VReg_32RegClass,
+    &AMDGPU::VGPR_32RegClass,
     &AMDGPU::SReg_32RegClass,
     &AMDGPU::VReg_64RegClass,
     &AMDGPU::SReg_64RegClass,
@@ -297,7 +334,7 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
 }
 
 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
-  return getCommonSubClass(&AMDGPU::VReg_32RegClass, RC) ||
+  return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) ||
          getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
          getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
          getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
@@ -312,7 +349,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
     } else if (SRC == &AMDGPU::SCCRegRegClass) {
       return &AMDGPU::VCCRegRegClass;
     } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
-      return &AMDGPU::VReg_32RegClass;
+      return &AMDGPU::VGPR_32RegClass;
     } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
       return &AMDGPU::VReg_64RegClass;
     } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
@@ -388,40 +425,17 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
   return SubRC->getRegister(Index + Channel);
 }
 
-bool SIRegisterInfo::regClassCanUseLiteralConstant(int RCID) const {
-  switch (RCID) {
-  default: return false;
-  case AMDGPU::SSrc_32RegClassID:
-  case AMDGPU::SSrc_64RegClassID:
-  case AMDGPU::VSrc_32RegClassID:
-  case AMDGPU::VSrc_64RegClassID:
-    return true;
-  }
-}
-
-bool SIRegisterInfo::regClassCanUseLiteralConstant(
-                             const TargetRegisterClass *RC) const {
-  return regClassCanUseLiteralConstant(RC->getID());
+bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
+  return OpType == AMDGPU::OPERAND_REG_IMM32;
 }
 
-bool SIRegisterInfo::regClassCanUseInlineConstant(int RCID) const {
-  if (regClassCanUseLiteralConstant(RCID))
+bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
+  if (opCanUseLiteralConstant(OpType))
     return true;
 
-  switch (RCID) {
-  default: return false;
-  case AMDGPU::VCSrc_32RegClassID:
-  case AMDGPU::VCSrc_64RegClassID:
-    return true;
-  }
-}
-
-bool SIRegisterInfo::regClassCanUseInlineConstant(
-                            const TargetRegisterClass *RC) const {
-  return regClassCanUseInlineConstant(RC->getID());
+  return OpType == AMDGPU::OPERAND_REG_INLINE_C;
 }
 
-
 unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
                                            enum PreloadedValue Value) const {
 
@@ -434,6 +448,8 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
   case SIRegisterInfo::TGID_Z:
     return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
   case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
+    if (MFI->getShaderType() != ShaderType::COMPUTE)
+      return MFI->ScratchOffsetReg;
     return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
   case SIRegisterInfo::SCRATCH_PTR:
     return AMDGPU::SGPR2_SGPR3;
@@ -452,9 +468,8 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
 /// \brief Returns a register that is not used at any point in the function.
 ///        If all registers are used, then this function will return
 //         AMDGPU::NoRegister.
-unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const {
-
-  const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass;
+unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
+                                           const TargetRegisterClass *RC) const {
 
   for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
        I != E; ++I) {
@@ -464,3 +479,29 @@ unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const {
   return AMDGPU::NoRegister;
 }
 
+unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
+  switch(WaveCount) {
+    case 10: return 24;
+    case 9:  return 28;
+    case 8:  return 32;
+    case 7:  return 36;
+    case 6:  return 40;
+    case 5:  return 48;
+    case 4:  return 64;
+    case 3:  return 84;
+    case 2:  return 128;
+    default: return 256;
+  }
+}
+
+unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const {
+  switch(WaveCount) {
+    case 10: return 48;
+    case 9:  return 56;
+    case 8:  return 64;
+    case 7:  return 72;
+    case 6:  return 80;
+    case 5:  return 96;
+    default: return 103;
+  }
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index c7e54db..d908ffd 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -17,6 +17,7 @@
 #define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
 
 #include "AMDGPURegisterInfo.h"
+#include "llvm/Support/Debug.h"
 
 namespace llvm {
 
@@ -26,8 +27,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
-  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
-                               MachineFunction &MF) const override;
+  unsigned getRegPressureSetLimit(unsigned Idx) const override;
 
   bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
 
@@ -42,7 +42,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
   unsigned getHWRegIndex(unsigned Reg) const override;
 
   /// \brief Return the 'base' register class for this register.
-  /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc.
+  /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc.
   const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
 
   /// \returns true if this class contains only SGPR registers
@@ -80,22 +80,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
   unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC,
                             unsigned Channel) const;
 
-  /// \returns True if operands defined with this register class can accept
+  /// \returns True if operands defined with this operand type can accept
   /// a literal constant (i.e. any 32-bit immediate).
-  bool regClassCanUseLiteralConstant(int RCID) const;
+  bool opCanUseLiteralConstant(unsigned OpType) const;
 
-  /// \returns True if operands defined with this register class can accept
-  /// a literal constant (i.e. any 32-bit immediate).
-  bool regClassCanUseLiteralConstant(const TargetRegisterClass *RC) const;
-
-  /// \returns True if operands defined with this register class can accept
+  /// \returns True if operands defined with this operand type can accept
   /// an inline constant. i.e. An integer value in the range (-16, 64) or
   /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
-  bool regClassCanUseInlineConstant(int RCID) const;
-
-  /// \returns True if operands defined with this register class can accept
-  /// a literal constant. i.e. A value in the range (-16, 64).
-  bool regClassCanUseInlineConstant(const TargetRegisterClass *RC) const;
+  bool opCanUseInlineConstant(unsigned OpType) const;
 
   enum PreloadedValue {
     TGID_X,
@@ -113,7 +105,22 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
   unsigned getPreloadedValue(const MachineFunction &MF,
                              enum PreloadedValue Value) const;
 
-  unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) const;
+  /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount
+  ///        concurrent waves.
+  unsigned getNumVGPRsAllowed(unsigned WaveCount) const;
+
+  /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
+  ///        concurrent waves.
+  unsigned getNumSGPRsAllowed(unsigned WaveCount) const;
+
+  unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
+                              const TargetRegisterClass *RC) const;
+
+private:
+  void buildScratchLoadStore(MachineBasicBlock::iterator MI,
+                             unsigned LoadStoreOp, unsigned Value,
+                             unsigned ScratchRsrcReg, unsigned ScratchOffset,
+                             int64_t Offset, RegScavenger *RS) const;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 45c2b41..8b25e95 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -21,7 +21,7 @@ def VCC_LO : SIReg<"vcc_lo", 106>;
 def VCC_HI : SIReg<"vcc_hi", 107>;
 
 // VCC for 64-bit instructions
-def VCC : RegisterWithSubRegs<"VCC", [VCC_LO, VCC_HI]> {
+def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
   let Namespace = "AMDGPU";
   let SubRegIndices = [sub0, sub1];
   let HWEncoding = 106;
@@ -36,14 +36,14 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> {
   let HWEncoding = 126;
 }
 
-def SCC : SIReg<"SCC", 253>;
-def M0 : SIReg <"M0", 124>;
+def SCC : SIReg<"scc", 253>;
+def M0 : SIReg <"m0", 124>;
 
 def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes.
 def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes.
 
 // Pair to indicate location of scratch space for flat accesses.
-def FLAT_SCR : RegisterWithSubRegs <"FLAT_SCR", [FLAT_SCR_LO, FLAT_SCR_HI]> {
+def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> {
   let Namespace = "AMDGPU";
   let SubRegIndices = [sub0, sub1];
   let HWEncoding = 104;
@@ -184,9 +184,9 @@ def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
   (add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
 >;
 
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>;
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>;
 
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64,
   (add SGPR_64, VCCReg, EXECReg, FLAT_SCR)
 >;
 
@@ -197,8 +197,6 @@ def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256
 def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
 
 // Register class for all vector registers (VGPRs + Interploation Registers)
-def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
-
 def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
 
 def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
@@ -211,31 +209,53 @@ def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256
 
 def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
 
-def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>;
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
+  let Size = 32;
+}
+
+class RegImmOperand <RegisterClass rc> : RegisterOperand<rc> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_REG_IMM32";
+}
+
+class RegInlineOperand <RegisterClass rc> : RegisterOperand<rc> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_REG_INLINE_C";
+}
 
 //===----------------------------------------------------------------------===//
 //  SSrc_* Operands with an SGPR or a 32-bit immediate
 //===----------------------------------------------------------------------===//
 
-def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
+def SSrc_32 : RegImmOperand<SReg_32>;
 
-def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>;
+def SSrc_64 : RegImmOperand<SReg_64>;
+
+//===----------------------------------------------------------------------===//
+//  SCSrc_* Operands with an SGPR or a inline constant
+//===----------------------------------------------------------------------===//
+
+def SCSrc_32 : RegInlineOperand<SReg_32>;
 
 //===----------------------------------------------------------------------===//
 //  VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
 //===----------------------------------------------------------------------===//
 
-def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
+def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>;
+
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+
+def VSrc_32 : RegImmOperand<VS_32>;
 
-def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+def VSrc_64 : RegImmOperand<VS_64>;
 
 //===----------------------------------------------------------------------===//
 //  VCSrc_* Operands with an SGPR, VGPR or an inline constant
 //===----------------------------------------------------------------------===//
 
-def VCSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
+def VCSrc_32 : RegInlineOperand<VS_32>;
 
-def VCSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+def VCSrc_64 : RegInlineOperand<VS_64>;
 
 //===----------------------------------------------------------------------===//
 // SGPR and VGPR register classes
diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td
index 28b65b8..9b1f676 100644
--- a/lib/Target/R600/SISchedule.td
+++ b/lib/Target/R600/SISchedule.td
@@ -7,9 +7,85 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: This is just a place holder for now.
+// MachineModel definitions for Southern Islands (SI)
 //
 //===----------------------------------------------------------------------===//
 
+def WriteBranch : SchedWrite;
+def WriteExport : SchedWrite;
+def WriteLDS    : SchedWrite;
+def WriteSALU   : SchedWrite;
+def WriteSMEM   : SchedWrite;
+def WriteVMEM   : SchedWrite;
 
-def SI_Itin : ProcessorItineraries <[], [], []>;
+// Vector ALU instructions
+def Write32Bit         : SchedWrite;
+def WriteQuarterRate32 : SchedWrite;
+
+def WriteFloatFMA   : SchedWrite;
+
+def WriteDouble     : SchedWrite;
+def WriteDoubleAdd  : SchedWrite;
+
+def SIFullSpeedModel : SchedMachineModel;
+def SIQuarterSpeedModel : SchedMachineModel;
+
+// BufferSize = 0 means the processors are in-order.
+let BufferSize = 0 in {
+
+// XXX: Are the resource counts correct?
+def HWBranch : ProcResource<1>;
+def HWExport : ProcResource<7>;   // Taken from S_WAITCNT
+def HWLGKM   : ProcResource<31>;  // Taken from S_WAITCNT
+def HWSALU   : ProcResource<1>;
+def HWVMEM   : ProcResource<15>;  // Taken from S_WAITCNT
+def HWVALU   : ProcResource<1>;
+
+}
+
+class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
+                 int latency> : WriteRes<write, resources> {
+  let Latency = latency;
+}
+
+class HWVALUWriteRes<SchedWrite write, int latency> :
+  HWWriteRes<write, [HWVALU], latency>;
+
+
+// The latency numbers are taken from AMD Accelerated Parallel Processing
+// guide.  They may not be acurate.
+
+// The latency values are 1 / (operations / cycle) / 4.
+multiclass SICommonWriteRes {
+
+  def : HWWriteRes<WriteBranch,  [HWBranch], 100>; // XXX: Guessed ???
+  def : HWWriteRes<WriteExport,  [HWExport], 100>; // XXX: Guessed ???
+  def : HWWriteRes<WriteLDS,     [HWLGKM],    32>; // 2 - 64
+  def : HWWriteRes<WriteSALU,    [HWSALU],     1>;
+  def : HWWriteRes<WriteSMEM,    [HWLGKM],    10>; // XXX: Guessed ???
+  def : HWWriteRes<WriteVMEM,    [HWVMEM],   450>; // 300 - 600
+
+  def : HWVALUWriteRes<Write32Bit,         1>;
+  def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+}
+
+
+let SchedModel = SIFullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA,   1>;
+def : HWVALUWriteRes<WriteDouble,     4>;
+def : HWVALUWriteRes<WriteDoubleAdd,  2>;
+
+} // End SchedModel = SIFullSpeedModel
+
+let SchedModel = SIQuarterSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 16>;
+def : HWVALUWriteRes<WriteDouble,   16>;
+def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+
+}  // End SchedModel = SIQuarterSpeedModel
diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp
index 45e83f5..97bbd78 100644
--- a/lib/Target/R600/SIShrinkInstructions.cpp
+++ b/lib/Target/R600/SIShrinkInstructions.cpp
@@ -10,6 +10,7 @@
 //
 
 #include "AMDGPU.h"
+#include "AMDGPUMCInstLower.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "llvm/ADT/Statistic.h"
@@ -126,37 +127,32 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
          TII->isVOPC(MI.getOpcode()));
 
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
-  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+  MachineOperand &Src0 = MI.getOperand(Src0Idx);
 
   // Only one literal constant is allowed per instruction, so if src0 is a
   // literal constant then we can't do any folding.
-  if ((Src0->isImm() || Src0->isFPImm()) && TII->isLiteralConstant(*Src0))
+  if (Src0.isImm() &&
+      TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
     return;
 
-
   // Literal constants and SGPRs can only be used in Src0, so if Src0 is an
   // SGPR, we cannot commute the instruction, so we can't fold any literal
   // constants.
-  if (Src0->isReg() && !isVGPR(Src0, TRI, MRI))
+  if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI))
     return;
 
   // Try to fold Src0
-  if (Src0->isReg()) {
-    unsigned Reg = Src0->getReg();
+  if (Src0.isReg()) {
+    unsigned Reg = Src0.getReg();
     MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
     if (Def && Def->isMoveImmediate()) {
       MachineOperand &MovSrc = Def->getOperand(1);
       bool ConstantFolded = false;
 
       if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
-        Src0->ChangeToImmediate(MovSrc.getImm());
+        Src0.ChangeToImmediate(MovSrc.getImm());
         ConstantFolded = true;
-      } else if (MovSrc.isFPImm()) {
-        const ConstantFP *CFP = MovSrc.getFPImm();
-        if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle) {
-          Src0->ChangeToFPImmediate(CFP);
-          ConstantFolded = true;
-        }
       }
       if (ConstantFolded) {
         if (MRI.use_empty(Reg))
@@ -193,13 +189,12 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
         const MachineOperand &Src = MI.getOperand(1);
 
-        // TODO: Handle FPImm?
         if (Src.isImm()) {
-          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src)) {
+          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
             MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
-            continue;
-          }
         }
+
+        continue;
       }
 
       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
@@ -213,13 +208,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
           continue;
       }
 
-      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
-
-      // Op32 could be -1 here if we started with an instruction that had a
+      // getVOPe32 could be -1 here if we started with an instruction that had
       // a 32-bit encoding and then commuted it to an instruction that did not.
-      if (Op32 == -1)
+      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
         continue;
 
+      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
+
       if (TII->isVOPC(Op32)) {
         unsigned DstReg = MI.getOperand(0).getReg();
         if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
index 9318dc1..27bbf4f 100644
--- a/lib/Target/R600/SITypeRewriter.cpp
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -61,8 +61,7 @@ bool SITypeRewriter::doInitialization(Module &M) {
 }
 
 bool SITypeRewriter::runOnFunction(Function &F) {
-  AttributeSet Set = F.getAttributes();
-  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, "ShaderType");
+  Attribute A = F.getFnAttribute("ShaderType");
 
   unsigned ShaderType = ShaderType::COMPUTE;
   if (A.isStringAttribute()) {
diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
index f437564..d723d6e 100644
--- a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
@@ -16,11 +16,15 @@
 
 using namespace llvm;
 
-/// \brief The target for the AMDGPU backend
+/// \brief The target which suports all AMD GPUs.  This will eventually
+///         be deprecated and there will be a R600 target and a GCN target.
 Target llvm::TheAMDGPUTarget;
+/// \brief The target for GCN GPUs
+Target llvm::TheGCNTarget;
 
 /// \brief Extern function to initialize the targets for the AMDGPU backend
 extern "C" void LLVMInitializeR600TargetInfo() {
   RegisterTarget<Triple::r600, false>
     R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
+  RegisterTarget<Triple::amdgcn, false> GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs");
 }
diff --git a/lib/Target/R600/VIInstrFormats.td b/lib/Target/R600/VIInstrFormats.td
new file mode 100644
index 0000000..d8738f9
--- /dev/null
+++ b/lib/Target/R600/VIInstrFormats.td
@@ -0,0 +1,166 @@
+//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// VI Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class DSe_vi <bits<8> op> : Enc64 {
+  bits<8> vdst;
+  bits<1> gds;
+  bits<8> addr;
+  bits<8> data0;
+  bits<8> data1;
+  bits<8> offset0;
+  bits<8> offset1;
+
+  let Inst{7-0} = offset0;
+  let Inst{15-8} = offset1;
+  let Inst{16} = gds;
+  let Inst{24-17} = op;
+  let Inst{31-26} = 0x36; //encoding
+  let Inst{39-32} = addr;
+  let Inst{47-40} = data0;
+  let Inst{55-48} = data1;
+  let Inst{63-56} = vdst;
+}
+
+class MUBUFe_vi <bits<7> op> : Enc64 {
+  bits<12> offset;
+  bits<1> offen;
+  bits<1> idxen;
+  bits<1> glc;
+  bits<1> lds;
+  bits<8> vaddr;
+  bits<8> vdata;
+  bits<7> srsrc;
+  bits<1> slc;
+  bits<1> tfe;
+  bits<8> soffset;
+
+  let Inst{11-0} = offset;
+  let Inst{12} = offen;
+  let Inst{13} = idxen;
+  let Inst{14} = glc;
+  let Inst{16} = lds;
+  let Inst{17} = slc;
+  let Inst{24-18} = op;
+  let Inst{31-26} = 0x38; //encoding
+  let Inst{39-32} = vaddr;
+  let Inst{47-40} = vdata;
+  let Inst{52-48} = srsrc{6-2};
+  let Inst{55} = tfe;
+  let Inst{63-56} = soffset;
+}
+
+class MTBUFe_vi <bits<4> op> : Enc64 {
+  bits<12> offset;
+  bits<1>  offen;
+  bits<1>  idxen;
+  bits<1>  glc;
+  bits<4>  dfmt;
+  bits<3>  nfmt;
+  bits<8>  vaddr;
+  bits<8>  vdata;
+  bits<7>  srsrc;
+  bits<1>  slc;
+  bits<1>  tfe;
+  bits<8>  soffset;
+
+  let Inst{11-0}  = offset;
+  let Inst{12}    = offen;
+  let Inst{13}    = idxen;
+  let Inst{14}    = glc;
+  let Inst{18-15} = op;
+  let Inst{22-19} = dfmt;
+  let Inst{25-23} = nfmt;
+  let Inst{31-26} = 0x3a; //encoding
+  let Inst{39-32} = vaddr;
+  let Inst{47-40} = vdata;
+  let Inst{52-48} = srsrc{6-2};
+  let Inst{54}    = slc;
+  let Inst{55}    = tfe;
+  let Inst{63-56} = soffset;
+}
+
+class SMEMe_vi <bits<8> op, bit imm> : Enc64 {
+  bits<7>  sbase;
+  bits<7>  sdata;
+  bits<1>  glc;
+  bits<20> offset;
+
+  let Inst{5-0}   = sbase{6-1};
+  let Inst{12-6}  = sdata;
+  let Inst{16}    = glc;
+  let Inst{17}    = imm;
+  let Inst{25-18} = op;
+  let Inst{31-26} = 0x30; //encoding
+  let Inst{51-32} = offset;
+}
+
+class VOP3e_vi <bits<10> op> : Enc64 {
+  bits<8> vdst;
+  bits<2> src0_modifiers;
+  bits<9> src0;
+  bits<2> src1_modifiers;
+  bits<9> src1;
+  bits<2> src2_modifiers;
+  bits<9> src2;
+  bits<1> clamp;
+  bits<2> omod;
+
+  let Inst{7-0}   = vdst;
+  let Inst{8}     = src0_modifiers{1};
+  let Inst{9}     = src1_modifiers{1};
+  let Inst{10}    = src2_modifiers{1};
+  let Inst{15}    = clamp;
+  let Inst{25-16} = op;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = src0;
+  let Inst{49-41} = src1;
+  let Inst{58-50} = src2;
+  let Inst{60-59} = omod;
+  let Inst{61} = src0_modifiers{0};
+  let Inst{62} = src1_modifiers{0};
+  let Inst{63} = src2_modifiers{0};
+}
+
+class VOP3be_vi <bits<10> op> : Enc64 {
+  bits<8> vdst;
+  bits<2> src0_modifiers;
+  bits<9> src0;
+  bits<2> src1_modifiers;
+  bits<9> src1;
+  bits<2> src2_modifiers;
+  bits<9> src2;
+  bits<7> sdst;
+  bits<2> omod;
+  bits<1> clamp;
+
+  let Inst{7-0} = vdst;
+  let Inst{14-8} = sdst;
+  let Inst{15} = clamp;
+  let Inst{25-16} = op;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = src0;
+  let Inst{49-41} = src1;
+  let Inst{58-50} = src2;
+  let Inst{60-59} = omod;
+  let Inst{61} = src0_modifiers{0};
+  let Inst{62} = src1_modifiers{0};
+  let Inst{63} = src2_modifiers{0};
+}
+
+class EXPe_vi : EXPe {
+  let Inst{31-26} = 0x31; //encoding
+}
+
+class VINTRPe_vi <bits<2> op> : VINTRPe <op> {
+  let Inst{31-26} = 0x35; // encoding
+}
diff --git a/lib/Target/R600/VIInstructions.td b/lib/Target/R600/VIInstructions.td
new file mode 100644
index 0000000..4a6e933
--- /dev/null
+++ b/lib/Target/R600/VIInstructions.td
@@ -0,0 +1,25 @@
+//===-- VIInstructions.td - VI Instruction Defintions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for VI and newer.
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// SMEM Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isVI] in {
+
+// 1. Offset as 20bit DWORD immediate
+def : Pat <
+  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
+>;
+
+} // End Predicates = [isVI]
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index d0b362c..551189c 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -393,9 +393,6 @@ bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
                                               MatchingInlineAsm);
   switch (MatchResult) {
-  default:
-    break;
-
   case Match_Success: {
     Inst.setLoc(IDLoc);
     Out.EmitInstruction(Inst, STI);
@@ -422,7 +419,7 @@ bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   case Match_MnemonicFail:
     return Error(IDLoc, "invalid instruction mnemonic");
   }
-  return true;
+  llvm_unreachable("Implement any new match types added!");
 }
 
 bool SparcAsmParser::
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index 28369fd..38bff44 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -45,10 +45,7 @@ namespace {
     const SparcSubtarget *Subtarget;
 
     static char ID;
-    Filler(TargetMachine &tm)
-      : MachineFunctionPass(ID), TM(tm),
-        Subtarget(&TM.getSubtarget<SparcSubtarget>()) {
-    }
+    Filler(TargetMachine &tm) : MachineFunctionPass(ID), TM(tm) {}
 
     const char *getPassName() const override {
       return "SPARC Delay Slot Filler";
@@ -57,6 +54,7 @@ namespace {
     bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
     bool runOnMachineFunction(MachineFunction &F) override {
       bool Changed = false;
+      Subtarget = &F.getSubtarget<SparcSubtarget>();
 
       // This pass invalidates liveness information when it reorders
       // instructions to fill delay slot.
@@ -109,8 +107,8 @@ FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
 ///
 bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
   bool Changed = false;
-
-  const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo();
+  Subtarget = &MBB.getParent()->getSubtarget<SparcSubtarget>();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
 
   for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
     MachineBasicBlock::iterator MI = I;
@@ -187,7 +185,7 @@ Filler::findDelayInstr(MachineBasicBlock &MBB,
     if (J->getOpcode() == SP::RESTORErr
         || J->getOpcode() == SP::RESTOREri) {
       // change retl to ret.
-      slot->setDesc(TM.getSubtargetImpl()->getInstrInfo()->get(SP::RET));
+      slot->setDesc(Subtarget->getInstrInfo()->get(SP::RET));
       return J;
     }
   }
@@ -329,8 +327,7 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
 bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg)
 {
   // Check Reg and all aliased Registers.
-  for (MCRegAliasIterator AI(Reg, TM.getSubtargetImpl()->getRegisterInfo(),
-                             true);
+  for (MCRegAliasIterator AI(Reg, Subtarget->getRegisterInfo(), true);
        AI.isValid(); ++AI)
     if (RegSet.count(*AI))
       return true;
@@ -483,7 +480,7 @@ bool Filler::tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
   if (PrevInst->isBundledWithSucc())
     return false;
 
-  const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
 
   switch (PrevInst->getOpcode()) {
   default: break;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 3a9c987..6767e4b 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -42,9 +42,7 @@ SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) {
   SunStyleELFSectionSwitchSyntax = true;
   UsesELFSectionDirectiveForBSS = true;
 
-  if (TheTriple.getOS() == llvm::Triple::Solaris ||
-      TheTriple.getOS() == llvm::Triple::OpenBSD)
-    UseIntegratedAssembler = true;
+  UseIntegratedAssembler = true;
 }
 
 const MCExpr*
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index eea9626..5128843 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -31,8 +31,8 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
 
 namespace {
 class SparcMCCodeEmitter : public MCCodeEmitter {
-  SparcMCCodeEmitter(const SparcMCCodeEmitter &) LLVM_DELETED_FUNCTION;
-  void operator=(const SparcMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete;
+  void operator=(const SparcMCCodeEmitter &) = delete;
   MCContext &Ctx;
 
 public:
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index 6432003..0439f9d 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -43,8 +43,9 @@ namespace {
           *OutStreamer.getTargetStreamer());
     }
   public:
-    explicit SparcAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : AsmPrinter(TM, Streamer) {}
+    explicit SparcAsmPrinter(TargetMachine &TM,
+                             std::unique_ptr<MCStreamer> Streamer)
+        : AsmPrinter(TM, std::move(Streamer)) {}
 
     const char *getPassName() const override {
       return "Sparc Assembly Printer";
@@ -277,7 +278,7 @@ void SparcAsmPrinter::EmitInstruction(const MachineInstr *MI)
 }
 
 void SparcAsmPrinter::EmitFunctionBodyStart() {
-  if (!TM.getSubtarget<SparcSubtarget>().is64Bit())
+  if (!MF->getSubtarget<SparcSubtarget>().is64Bit())
     return;
 
   const MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -296,7 +297,7 @@ void SparcAsmPrinter::EmitFunctionBodyStart() {
 
 void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
                                    raw_ostream &O) {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   const MachineOperand &MO = MI->getOperand (opNum);
   SparcMCExpr::VariantKind TF = (SparcMCExpr::VariantKind) MO.getTargetFlags();
 
@@ -450,8 +451,7 @@ void SparcAsmPrinter::EmitEndOfAsmFile(Module &M) {
   MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
   if (!Stubs.empty()) {
     OutStreamer.SwitchSection(TLOFELF.getDataSection());
-    unsigned PtrSize =
-        TM.getSubtargetImpl()->getDataLayout()->getPointerSize(0);
+    unsigned PtrSize = TM.getDataLayout()->getPointerSize(0);
     for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
       OutStreamer.EmitLabel(Stubs[i].first);
       OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), PtrSize);
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 1b67b4b..a065d3a 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -103,9 +103,7 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
     SAVEri = SP::ADDri;
     SAVErr = SP::ADDrr;
   }
-  NumBytes =
-      -MF.getTarget().getSubtarget<SparcSubtarget>().getAdjustedFrameSize(
-          NumBytes);
+  NumBytes = -MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes);
   emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri);
 
   MachineModuleInfo &MMI = MF.getMMI();
@@ -168,8 +166,7 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
   if (NumBytes == 0)
     return;
 
-  NumBytes = MF.getTarget().getSubtarget<SparcSubtarget>().getAdjustedFrameSize(
-      NumBytes);
+  NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes);
   emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri);
 }
 
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index b3b029e..9f03b04 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -32,13 +32,13 @@ namespace {
 class SparcDAGToDAGISel : public SelectionDAGISel {
   /// Subtarget - Keep a pointer to the Sparc Subtarget around so that we can
   /// make the right decision when generating code for different targets.
-  const SparcSubtarget &Subtarget;
-  SparcTargetMachine &TM;
+  const SparcSubtarget *Subtarget;
 public:
-  explicit SparcDAGToDAGISel(SparcTargetMachine &tm)
-    : SelectionDAGISel(tm),
-      Subtarget(tm.getSubtarget<SparcSubtarget>()),
-      TM(tm) {
+  explicit SparcDAGToDAGISel(SparcTargetMachine &tm) : SelectionDAGISel(tm) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    Subtarget = &MF.getSubtarget<SparcSubtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
   }
 
   SDNode *Select(SDNode *N) override;
@@ -66,8 +66,7 @@ private:
 }  // end anonymous namespace
 
 SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
-  unsigned GlobalBaseReg =
-      TM.getSubtargetImpl()->getInstrInfo()->getGlobalBaseReg(MF);
+  unsigned GlobalBaseReg = Subtarget->getInstrInfo()->getGlobalBaseReg(MF);
   return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode();
 }
 
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index e6a69d2..6774977 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -57,7 +57,7 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
     SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
   };
   // Try to get first reg.
-  if (unsigned Reg = State.AllocateReg(RegList, 6)) {
+  if (unsigned Reg = State.AllocateReg(RegList)) {
     State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
   } else {
     // Assign whole thing in stack.
@@ -68,7 +68,7 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
   }
 
   // Try to get second reg.
-  if (unsigned Reg = State.AllocateReg(RegList, 6))
+  if (unsigned Reg = State.AllocateReg(RegList))
     State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
   else
     State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
@@ -497,7 +497,7 @@ LowerFormalArguments_32(SDValue Chain,
     static const MCPhysReg ArgRegs[] = {
       SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
     };
-    unsigned NumAllocated = CCInfo.getFirstUnallocated(ArgRegs, 6);
+    unsigned NumAllocated = CCInfo.getFirstUnallocated(ArgRegs);
     const MCPhysReg *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6;
     unsigned ArgOffset = CCInfo.getNextStackOffset();
     if (NumAllocated == 6)
@@ -914,8 +914,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
                                   RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  const SparcRegisterInfo *TRI =
-      getTargetMachine().getSubtarget<SparcSubtarget>().getRegisterInfo();
+  const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const uint32_t *Mask = ((hasReturnsTwice)
                           ? TRI->getRTCallPreservedMask(CallConv)
                           : TRI->getCallPreservedMask(CallConv));
@@ -1227,8 +1226,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
                                   RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  const SparcRegisterInfo *TRI =
-      getTargetMachine().getSubtarget<SparcSubtarget>().getRegisterInfo();
+  const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const uint32_t *Mask =
       ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv)
                          : TRI->getCallPreservedMask(CLI.CallConv));
@@ -1365,10 +1363,9 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
   }
 }
 
-SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
-  : TargetLowering(TM) {
-  Subtarget = &TM.getSubtarget<SparcSubtarget>();
-
+SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
+                                         const SparcSubtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
   // Set up the register classes.
   addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
   addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
@@ -1378,11 +1375,14 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
     addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
 
   // Turn FP extload into load/fextend
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+  for (MVT VT : MVT::fp_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
+  }
 
   // Sparc doesn't have i1 sign extending load
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  for (MVT VT : MVT::integer_valuetypes())
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 
   // Turn FP truncstore into trunc + store.
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -1669,7 +1669,7 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
 
   setMinFunctionAlignment(2);
 
-  computeRegisterProperties();
+  computeRegisterProperties(Subtarget->getRegisterInfo());
 }
 
 const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -1904,10 +1904,8 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
     Ops.push_back(Callee);
     Ops.push_back(Symbol);
     Ops.push_back(DAG.getRegister(SP::O0, PtrVT));
-    const uint32_t *Mask = getTargetMachine()
-                               .getSubtargetImpl()
-                               ->getRegisterInfo()
-                               ->getCallPreservedMask(CallingConv::C);
+    const uint32_t *Mask =
+        Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
     assert(Mask && "Missing call preserved mask for calling convention");
     Ops.push_back(DAG.getRegisterMask(Mask));
     Ops.push_back(InFlag);
@@ -2903,8 +2901,7 @@ MachineBasicBlock*
 SparcTargetLowering::expandSelectCC(MachineInstr *MI,
                                     MachineBasicBlock *BB,
                                     unsigned BROpcode) const {
-  const TargetInstrInfo &TII =
-      *getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   unsigned CC = (SPCC::CondCodes)MI->getOperand(3).getImm();
 
@@ -2965,8 +2962,7 @@ SparcTargetLowering::expandAtomicRMW(MachineInstr *MI,
                                      MachineBasicBlock *MBB,
                                      unsigned Opcode,
                                      unsigned CondCode) const {
-  const TargetInstrInfo &TII =
-      *getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -3134,8 +3130,9 @@ LowerAsmOperandForConstraint(SDValue Op,
   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
-std::pair<unsigned, const TargetRegisterClass*>
-SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+std::pair<unsigned, const TargetRegisterClass *>
+SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                  const std::string &Constraint,
                                                   MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
@@ -3160,11 +3157,12 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
       char regIdx = '0' + (intVal % 8);
       char tmp[] = { '{', regType, regIdx, '}', 0 };
       std::string newConstraint = std::string(tmp);
-      return TargetLowering::getRegForInlineAsmConstraint(newConstraint, VT);
+      return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint,
+                                                          VT);
     }
   }
 
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 
 bool
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index a62d569..8715326 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -54,7 +54,7 @@ namespace llvm {
   class SparcTargetLowering : public TargetLowering {
     const SparcSubtarget *Subtarget;
   public:
-    SparcTargetLowering(TargetMachine &TM);
+    SparcTargetLowering(TargetMachine &TM, const SparcSubtarget &STI);
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
     /// computeKnownBitsForTargetNode - Determine which of the bits specified
@@ -80,8 +80,10 @@ namespace llvm {
                                       std::string &Constraint,
                                       std::vector<SDValue> &Ops,
                                       SelectionDAG &DAG) const override;
-    std::pair<unsigned, const TargetRegisterClass*>
-    getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const override;
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 const std::string &Constraint,
+                                 MVT VT) const override;
 
     bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
     MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index c320239..17daeca 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -22,38 +22,38 @@ include "SparcInstrFormats.td"
 //===----------------------------------------------------------------------===//
 
 // True when generating 32-bit code.
-def Is32Bit : Predicate<"!Subtarget.is64Bit()">;
+def Is32Bit : Predicate<"!Subtarget->is64Bit()">;
 
 // True when generating 64-bit code. This also implies HasV9.
-def Is64Bit : Predicate<"Subtarget.is64Bit()">;
+def Is64Bit : Predicate<"Subtarget->is64Bit()">;
 
 // HasV9 - This predicate is true when the target processor supports V9
 // instructions.  Note that the machine may be running in 32-bit mode.
-def HasV9   : Predicate<"Subtarget.isV9()">,
+def HasV9   : Predicate<"Subtarget->isV9()">,
               AssemblerPredicate<"FeatureV9">;
 
 // HasNoV9 - This predicate is true when the target doesn't have V9
 // instructions.  Use of this is just a hack for the isel not having proper
 // costs for V8 instructions that are more expensive than their V9 ones.
-def HasNoV9 : Predicate<"!Subtarget.isV9()">;
+def HasNoV9 : Predicate<"!Subtarget->isV9()">;
 
 // HasVIS - This is true when the target processor has VIS extensions.
-def HasVIS : Predicate<"Subtarget.isVIS()">,
+def HasVIS : Predicate<"Subtarget->isVIS()">,
              AssemblerPredicate<"FeatureVIS">;
-def HasVIS2 : Predicate<"Subtarget.isVIS2()">,
+def HasVIS2 : Predicate<"Subtarget->isVIS2()">,
              AssemblerPredicate<"FeatureVIS2">;
-def HasVIS3 : Predicate<"Subtarget.isVIS3()">,
+def HasVIS3 : Predicate<"Subtarget->isVIS3()">,
              AssemblerPredicate<"FeatureVIS3">;
 
 // HasHardQuad - This is true when the target processor supports quad floating
 // point instructions.
-def HasHardQuad : Predicate<"Subtarget.hasHardQuad()">;
+def HasHardQuad : Predicate<"Subtarget->hasHardQuad()">;
 
 // UseDeprecatedInsts - This predicate is true when the target processor is a
 // V8, or when it is V9 but the V8 deprecated instructions are efficient enough
 // to use when appropriate.  In either of these cases, the instruction selector
 // will pick deprecated instructions.
-def UseDeprecatedInsts : Predicate<"Subtarget.useDeprecatedV8Instructions()">;
+def UseDeprecatedInsts : Predicate<"Subtarget->useDeprecatedV8Instructions()">;
 
 //===----------------------------------------------------------------------===//
 // Instruction Pattern Stuff
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index eea0c8c..ce1105f 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -26,32 +26,6 @@ using namespace llvm;
 
 void SparcSubtarget::anchor() { }
 
-static std::string computeDataLayout(const SparcSubtarget &ST) {
-  // Sparc is big endian.
-  std::string Ret = "E-m:e";
-
-  // Some ABIs have 32bit pointers.
-  if (!ST.is64Bit())
-    Ret += "-p:32:32";
-
-  // Alignments for 64 bit integers.
-  Ret += "-i64:64";
-
-  // On SparcV9 128 floats are aligned to 128 bits, on others only to 64.
-  // On SparcV9 registers can hold 64 or 32 bits, on others only 32.
-  if (ST.is64Bit())
-    Ret += "-n32:64";
-  else
-    Ret += "-f128:64-n32";
-
-  if (ST.is64Bit())
-    Ret += "-S128";
-  else
-    Ret += "-S64";
-
-  return Ret;
-}
-
 SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                                 StringRef FS) {
   IsV9 = false;
@@ -79,8 +53,8 @@ SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
                                const std::string &FS, TargetMachine &TM,
                                bool is64Bit)
     : SparcGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit),
-      DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))),
-      InstrInfo(*this), TLInfo(TM), TSInfo(DL), FrameLowering(*this) {}
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
+      TSInfo(*TM.getDataLayout()), FrameLowering(*this) {}
 
 int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
 
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index d503b2b..e6cf460 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -37,7 +37,6 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
   bool Is64Bit;
   bool HasHardQuad;
   bool UsePopc;
-  const DataLayout DL;       // Calculates type size & alignment
   SparcInstrInfo InstrInfo;
   SparcTargetLowering TLInfo;
   SparcSelectionDAGInfo TSInfo;
@@ -60,7 +59,6 @@ public:
   const SparcSelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
-  const DataLayout *getDataLayout() const override { return &DL; }
 
   bool isV9() const { return IsV9; }
   bool isVIS() const { return IsVIS; }
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 489bb69..1c423dc 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -14,7 +14,7 @@
 #include "SparcTargetObjectFile.h"
 #include "Sparc.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
@@ -24,6 +24,32 @@ extern "C" void LLVMInitializeSparcTarget() {
   RegisterTargetMachine<SparcV9TargetMachine> Y(TheSparcV9Target);
 }
 
+static std::string computeDataLayout(bool is64Bit) {
+  // Sparc is big endian.
+  std::string Ret = "E-m:e";
+
+  // Some ABIs have 32bit pointers.
+  if (!is64Bit)
+    Ret += "-p:32:32";
+
+  // Alignments for 64 bit integers.
+  Ret += "-i64:64";
+
+  // On SparcV9 128 floats are aligned to 128 bits, on others only to 64.
+  // On SparcV9 registers can hold 64 or 32 bits, on others only 32.
+  if (is64Bit)
+    Ret += "-n32:64";
+  else
+    Ret += "-f128:64-n32";
+
+  if (is64Bit)
+    Ret += "-S128";
+  else
+    Ret += "-S64";
+
+  return Ret;
+}
+
 /// SparcTargetMachine ctor - Create an ILP32 architecture model
 ///
 SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
@@ -34,6 +60,7 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
                                        bool is64bit)
   : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
     TLOF(make_unique<SparcELFTargetObjectFile>()),
+    DL(computeDataLayout(is64bit)),
     Subtarget(TT, CPU, FS, *this, is64bit) {
   initAsmInfo();
 }
@@ -53,7 +80,7 @@ public:
 
   void addIRPasses() override;
   bool addInstSelector() override;
-  bool addPreEmitPass() override;
+  void addPreEmitPass() override;
 };
 } // namespace
 
@@ -72,12 +99,8 @@ bool SparcPassConfig::addInstSelector() {
   return false;
 }
 
-/// addPreEmitPass - This pass may be implemented by targets that want to run
-/// passes immediately before machine code is emitted.  This should return
-/// true if -print-machineinstrs should print out the code after the passes.
-bool SparcPassConfig::addPreEmitPass(){
+void SparcPassConfig::addPreEmitPass(){
   addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine()));
-  return true;
 }
 
 void SparcV8TargetMachine::anchor() { }
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index 096e7c8..4f93980 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -22,6 +22,7 @@ namespace llvm {
 
 class SparcTargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  const DataLayout DL;
   SparcSubtarget Subtarget;
 public:
   SparcTargetMachine(const Target &T, StringRef TT,
@@ -30,6 +31,7 @@ public:
                      CodeGenOpt::Level OL, bool is64bit);
   ~SparcTargetMachine() override;
 
+  const DataLayout *getDataLayout() const override { return &DL; }
   const SparcSubtarget *getSubtargetImpl() const override { return &Subtarget; }
 
   // Pass Pipeline Configuration
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 0955f4a..9181ff7 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -57,6 +57,7 @@ private:
     KindReg,
     KindAccessReg,
     KindImm,
+    KindImmTLS,
     KindMem
   };
 
@@ -96,11 +97,19 @@ private:
     const MCExpr *Length;
   };
 
+  // Imm is an immediate operand, and Sym is an optional TLS symbol
+  // for use with a __tls_get_offset marker relocation.
+  struct ImmTLSOp {
+    const MCExpr *Imm;
+    const MCExpr *Sym;
+  };
+
   union {
     TokenOp Token;
     RegOp Reg;
     unsigned AccessReg;
     const MCExpr *Imm;
+    ImmTLSOp ImmTLS;
     MemOp Mem;
   };
 
@@ -160,6 +169,14 @@ public:
     Op->Mem.Length = Length;
     return Op;
   }
+  static std::unique_ptr<SystemZOperand>
+  createImmTLS(const MCExpr *Imm, const MCExpr *Sym,
+               SMLoc StartLoc, SMLoc EndLoc) {
+    auto Op = make_unique<SystemZOperand>(KindImmTLS, StartLoc, EndLoc);
+    Op->ImmTLS.Imm = Imm;
+    Op->ImmTLS.Sym = Sym;
+    return Op;
+  }
 
   // Token operands
   bool isToken() const override {
@@ -200,6 +217,11 @@ public:
     return Imm;
   }
 
+  // Immediate operands with optional TLS symbol.
+  bool isImmTLS() const {
+    return Kind == KindImmTLS;
+  }
+
   // Memory operands.
   bool isMem() const override {
     return Kind == KindMem;
@@ -260,6 +282,13 @@ public:
     addExpr(Inst, Mem.Disp);
     addExpr(Inst, Mem.Length);
   }
+  void addImmTLSOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands");
+    assert(Kind == KindImmTLS && "Invalid operand type");
+    addExpr(Inst, ImmTLS.Imm);
+    if (ImmTLS.Sym)
+      addExpr(Inst, ImmTLS.Sym);
+  }
 
   // Used by the TableGen code to check for particular operand types.
   bool isGR32() const { return isReg(GR32Reg); }
@@ -325,6 +354,9 @@ private:
                                     const unsigned *Regs, RegisterKind RegKind,
                                     MemoryKind MemKind);
 
+  OperandMatchResultTy parsePCRel(OperandVector &Operands, int64_t MinVal,
+                                  int64_t MaxVal, bool AllowTLS);
+
   bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
 
 public:
@@ -395,13 +427,17 @@ public:
     return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDLMem);
   }
   OperandMatchResultTy parseAccessReg(OperandVector &Operands);
-  OperandMatchResultTy parsePCRel(OperandVector &Operands, int64_t MinVal,
-                                  int64_t MaxVal);
   OperandMatchResultTy parsePCRel16(OperandVector &Operands) {
-    return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1);
+    return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1, false);
   }
   OperandMatchResultTy parsePCRel32(OperandVector &Operands) {
-    return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1);
+    return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, false);
+  }
+  OperandMatchResultTy parsePCRelTLS16(OperandVector &Operands) {
+    return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1, true);
+  }
+  OperandMatchResultTy parsePCRelTLS32(OperandVector &Operands) {
+    return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, true);
   }
 };
 } // end anonymous namespace
@@ -685,7 +721,6 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
                                      MatchingInlineAsm);
   switch (MatchResult) {
-  default: break;
   case Match_Success:
     Inst.setLoc(IDLoc);
     Out.EmitInstruction(Inst, STI);
@@ -744,7 +779,7 @@ SystemZAsmParser::parseAccessReg(OperandVector &Operands) {
 
 SystemZAsmParser::OperandMatchResultTy
 SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
-                             int64_t MaxVal) {
+                             int64_t MaxVal, bool AllowTLS) {
   MCContext &Ctx = getContext();
   MCStreamer &Out = getStreamer();
   const MCExpr *Expr;
@@ -767,9 +802,54 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
     Expr = Value == 0 ? Base : MCBinaryExpr::CreateAdd(Base, Expr, Ctx);
   }
 
+  // Optionally match :tls_gdcall: or :tls_ldcall: followed by a TLS symbol.
+  const MCExpr *Sym = nullptr;
+  if (AllowTLS && getLexer().is(AsmToken::Colon)) {
+    Parser.Lex();
+
+    if (Parser.getTok().isNot(AsmToken::Identifier)) {
+      Error(Parser.getTok().getLoc(), "unexpected token");
+      return MatchOperand_ParseFail;
+    }
+
+    MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+    StringRef Name = Parser.getTok().getString();
+    if (Name == "tls_gdcall")
+      Kind = MCSymbolRefExpr::VK_TLSGD;
+    else if (Name == "tls_ldcall")
+      Kind = MCSymbolRefExpr::VK_TLSLDM;
+    else {
+      Error(Parser.getTok().getLoc(), "unknown TLS tag");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+
+    if (Parser.getTok().isNot(AsmToken::Colon)) {
+      Error(Parser.getTok().getLoc(), "unexpected token");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+
+    if (Parser.getTok().isNot(AsmToken::Identifier)) {
+      Error(Parser.getTok().getLoc(), "unexpected token");
+      return MatchOperand_ParseFail;
+    }
+
+    StringRef Identifier = Parser.getTok().getString();
+    Sym = MCSymbolRefExpr::Create(Ctx.GetOrCreateSymbol(Identifier),
+                                  Kind, Ctx);
+    Parser.Lex();
+  }
+
   SMLoc EndLoc =
     SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
-  Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+
+  if (AllowTLS)
+    Operands.push_back(SystemZOperand::createImmTLS(Expr, Sym,
+                                                    StartLoc, EndLoc));
+  else
+    Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+
   return MatchOperand_Success;
 }
 
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
index 41a614d..60a3912 100644
--- a/lib/Target/SystemZ/CMakeLists.txt
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_target(SystemZCodeGen
   SystemZISelDAGToDAG.cpp
   SystemZISelLowering.cpp
   SystemZInstrInfo.cpp
+  SystemZLDCleanup.cpp
   SystemZLongBranch.cpp
   SystemZMachineFunctionInfo.cpp
   SystemZMCInstLower.cpp
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
index d2ba9b6..996a492 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -10,6 +10,7 @@
 #include "SystemZInstPrinter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -124,6 +125,29 @@ void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum,
     O << *MO.getExpr();
 }
 
+void SystemZInstPrinter::printPCRelTLSOperand(const MCInst *MI, int OpNum,
+                                              raw_ostream &O) {
+  // Output the PC-relative operand.
+  printPCRelOperand(MI, OpNum, O);
+
+  // Output the TLS marker if present.
+  if ((unsigned)OpNum + 1 < MI->getNumOperands()) {
+    const MCOperand &MO = MI->getOperand(OpNum + 1);
+    const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*MO.getExpr());
+    switch (refExp.getKind()) {
+      case MCSymbolRefExpr::VK_TLSGD:
+        O << ":tls_gdcall:";
+        break;
+      case MCSymbolRefExpr::VK_TLSLDM:
+        O << ":tls_ldcall:";
+        break;
+      default:
+        llvm_unreachable("Unexpected symbol kind");
+    }
+    O << refExp.getSymbol().getName();
+  }
+}
+
 void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum,
                                       raw_ostream &O) {
   printOperand(MI->getOperand(OpNum), O);
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
index 753903c..732e5fa 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -56,6 +56,7 @@ private:
   void printS32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printPCRelTLSOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printAccessRegOperand(const MCInst *MI, int OpNum, raw_ostream &O);
 
   // Print the mnemonic for a condition-code mask ("ne", "lh", etc.)
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 6e7268d..b79b1d8 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -27,9 +27,10 @@ static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value) {
   switch (unsigned(Kind)) {
   case SystemZ::FK_390_PC16DBL:
   case SystemZ::FK_390_PC32DBL:
-  case SystemZ::FK_390_PLT16DBL:
-  case SystemZ::FK_390_PLT32DBL:
     return (int64_t)Value / 2;
+
+  case SystemZ::FK_390_TLS_CALL:
+    return 0;
   }
 
   llvm_unreachable("Unknown fixup kind!");
@@ -72,8 +73,7 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
   const static MCFixupKindInfo Infos[SystemZ::NumTargetFixupKinds] = {
     { "FK_390_PC16DBL",  0, 16, MCFixupKindInfo::FKF_IsPCRel },
     { "FK_390_PC32DBL",  0, 32, MCFixupKindInfo::FKF_IsPCRel },
-    { "FK_390_PLT16DBL", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
-    { "FK_390_PLT32DBL", 0, 32, MCFixupKindInfo::FKF_IsPCRel }
+    { "FK_390_TLS_CALL", 0, 0, 0 }
   };
 
   if (Kind < FirstTargetFixupKind)
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 35887fa..0161d62 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -24,4 +24,6 @@ SystemZMCAsmInfo::SystemZMCAsmInfo(StringRef TT) {
   UsesELFSectionDirectiveForBSS = true;
   SupportsDebugInformation = true;
   ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  UseIntegratedAssembler = true;
 }
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index 27b4bd8..d9bb916 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -74,20 +74,36 @@ private:
   // Operand OpNum of MI needs a PC-relative fixup of kind Kind at
   // Offset bytes from the start of MI.  Add the fixup to Fixups
   // and return the in-place addend, which since we're a RELA target
-  // is always 0.
+  // is always 0.  If AllowTLS is true and optional operand OpNum + 1
+  // is present, also emit a TLS call fixup for it.
   uint64_t getPCRelEncoding(const MCInst &MI, unsigned OpNum,
                             SmallVectorImpl<MCFixup> &Fixups,
-                            unsigned Kind, int64_t Offset) const;
+                            unsigned Kind, int64_t Offset,
+                            bool AllowTLS) const;
 
   uint64_t getPC16DBLEncoding(const MCInst &MI, unsigned OpNum,
                               SmallVectorImpl<MCFixup> &Fixups,
                               const MCSubtargetInfo &STI) const {
-    return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC16DBL, 2);
+    return getPCRelEncoding(MI, OpNum, Fixups,
+                            SystemZ::FK_390_PC16DBL, 2, false);
   }
   uint64_t getPC32DBLEncoding(const MCInst &MI, unsigned OpNum,
                               SmallVectorImpl<MCFixup> &Fixups,
                               const MCSubtargetInfo &STI) const {
-    return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC32DBL, 2);
+    return getPCRelEncoding(MI, OpNum, Fixups,
+                            SystemZ::FK_390_PC32DBL, 2, false);
+  }
+  uint64_t getPC16DBLTLSEncoding(const MCInst &MI, unsigned OpNum,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const {
+    return getPCRelEncoding(MI, OpNum, Fixups,
+                            SystemZ::FK_390_PC16DBL, 2, true);
+  }
+  uint64_t getPC32DBLTLSEncoding(const MCInst &MI, unsigned OpNum,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const {
+    return getPCRelEncoding(MI, OpNum, Fixups,
+                            SystemZ::FK_390_PC32DBL, 2, true);
   }
 };
 } // end anonymous namespace
@@ -181,7 +197,8 @@ getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
 uint64_t
 SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
                                        SmallVectorImpl<MCFixup> &Fixups,
-                                       unsigned Kind, int64_t Offset) const {
+                                       unsigned Kind, int64_t Offset,
+                                       bool AllowTLS) const {
   const MCOperand &MO = MI.getOperand(OpNum);
   const MCExpr *Expr;
   if (MO.isImm())
@@ -198,6 +215,13 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
     }
   }
   Fixups.push_back(MCFixup::Create(Offset, Expr, (MCFixupKind)Kind));
+
+  // Output the fixup for the TLS marker if present.
+  if (AllowTLS && OpNum + 1 < MI.getNumOperands()) {
+    const MCOperand &MOTLS = MI.getOperand(OpNum + 1);
+    Fixups.push_back(MCFixup::Create(0, MOTLS.getExpr(),
+                                     (MCFixupKind)SystemZ::FK_390_TLS_CALL));
+  }
   return 0;
 }
 
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
index 52a8d1d..229ab5d 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
@@ -18,8 +18,7 @@ enum FixupKind {
   // These correspond directly to R_390_* relocations.
   FK_390_PC16DBL = FirstTargetFixupKind,
   FK_390_PC32DBL,
-  FK_390_PLT16DBL,
-  FK_390_PLT32DBL,
+  FK_390_TLS_CALL,
 
   // Marker
   LastTargetFixupKind,
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index c6a1816..2632518 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -55,8 +55,6 @@ static unsigned getPCRelReloc(unsigned Kind) {
   case FK_Data_8:                return ELF::R_390_PC64;
   case SystemZ::FK_390_PC16DBL:  return ELF::R_390_PC16DBL;
   case SystemZ::FK_390_PC32DBL:  return ELF::R_390_PC32DBL;
-  case SystemZ::FK_390_PLT16DBL: return ELF::R_390_PLT16DBL;
-  case SystemZ::FK_390_PLT32DBL: return ELF::R_390_PLT32DBL;
   }
   llvm_unreachable("Unsupported PC-relative address");
 }
@@ -70,6 +68,35 @@ static unsigned getTLSLEReloc(unsigned Kind) {
   llvm_unreachable("Unsupported absolute address");
 }
 
+// Return the R_390_TLS_LDO* relocation type for MCFixupKind Kind.
+static unsigned getTLSLDOReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_4: return ELF::R_390_TLS_LDO32;
+  case FK_Data_8: return ELF::R_390_TLS_LDO64;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the R_390_TLS_LDM* relocation type for MCFixupKind Kind.
+static unsigned getTLSLDMReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_4: return ELF::R_390_TLS_LDM32;
+  case FK_Data_8: return ELF::R_390_TLS_LDM64;
+  case SystemZ::FK_390_TLS_CALL: return ELF::R_390_TLS_LDCALL;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the R_390_TLS_GD* relocation type for MCFixupKind Kind.
+static unsigned getTLSGDReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_4: return ELF::R_390_TLS_GD32;
+  case FK_Data_8: return ELF::R_390_TLS_GD64;
+  case SystemZ::FK_390_TLS_CALL: return ELF::R_390_TLS_GDCALL;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
 // Return the PLT relocation counterpart of MCFixupKind Kind.
 static unsigned getPLTReloc(unsigned Kind) {
   switch (Kind) {
@@ -94,6 +121,23 @@ unsigned SystemZObjectWriter::GetRelocType(const MCValue &Target,
     assert(!IsPCRel && "NTPOFF shouldn't be PC-relative");
     return getTLSLEReloc(Kind);
 
+  case MCSymbolRefExpr::VK_INDNTPOFF:
+    if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
+      return ELF::R_390_TLS_IEENT;
+    llvm_unreachable("Only PC-relative INDNTPOFF accesses are supported for now");
+
+  case MCSymbolRefExpr::VK_DTPOFF:
+    assert(!IsPCRel && "DTPOFF shouldn't be PC-relative");
+    return getTLSLDOReloc(Kind);
+
+  case MCSymbolRefExpr::VK_TLSLDM:
+    assert(!IsPCRel && "TLSLDM shouldn't be PC-relative");
+    return getTLSLDMReloc(Kind);
+
+  case MCSymbolRefExpr::VK_TLSGD:
+    assert(!IsPCRel && "TLSGD shouldn't be PC-relative");
+    return getTLSGDReloc(Kind);
+
   case MCSymbolRefExpr::VK_GOT:
     if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
       return ELF::R_390_GOTENT;
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
index c8b95b2..5f17edb 100644
--- a/lib/Target/SystemZ/SystemZ.h
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -111,6 +111,7 @@ FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
 FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
 } // end namespace llvm
 
 #endif
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index f4f3ec7..18e37e3 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -66,6 +66,20 @@ static MCInst lowerRIEfLow(const MachineInstr *MI, unsigned Opcode) {
     .addImm(MI->getOperand(5).getImm());
 }
 
+static const MCSymbolRefExpr *getTLSGetOffset(MCContext &Context) {
+  StringRef Name = "__tls_get_offset";
+  return MCSymbolRefExpr::Create(Context.GetOrCreateSymbol(Name),
+                                 MCSymbolRefExpr::VK_PLT,
+                                 Context);
+}
+
+static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) {
+  StringRef Name = "_GLOBAL_OFFSET_TABLE_";
+  return MCSymbolRefExpr::Create(Context.GetOrCreateSymbol(Name),
+                                 MCSymbolRefExpr::VK_None,
+                                 Context);
+}
+
 void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   SystemZMCInstLower Lower(MF->getContext(), *this);
   MCInst LoweredMI;
@@ -95,6 +109,26 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R1D);
     break;
 
+  case SystemZ::TLS_GDCALL:
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+      .addReg(SystemZ::R14D)
+      .addExpr(getTLSGetOffset(MF->getContext()))
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_TLSGD));
+    break;
+
+  case SystemZ::TLS_LDCALL:
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+      .addReg(SystemZ::R14D)
+      .addExpr(getTLSGetOffset(MF->getContext()))
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_TLSLDM));
+    break;
+
+  case SystemZ::GOT:
+    LoweredMI = MCInstBuilder(SystemZ::LARL)
+      .addReg(MI->getOperand(0).getReg())
+      .addExpr(getGlobalOffsetTable(MF->getContext()));
+    break;
+
   case SystemZ::IILF64:
     LoweredMI = MCInstBuilder(SystemZ::IILF)
       .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
@@ -152,7 +186,7 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 #undef LOWER_HIGH
 
   case SystemZ::Serialize:
-    if (Subtarget->hasFastSerialization())
+    if (MF->getSubtarget<SystemZSubtarget>().hasFastSerialization())
       LoweredMI = MCInstBuilder(SystemZ::AsmBCR)
         .addImm(14).addReg(SystemZ::R0D);
     else
@@ -172,6 +206,9 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 static MCSymbolRefExpr::VariantKind
 getModifierVariantKind(SystemZCP::SystemZCPModifier Modifier) {
   switch (Modifier) {
+  case SystemZCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD;
+  case SystemZCP::TLSLDM: return MCSymbolRefExpr::VK_TLSLDM;
+  case SystemZCP::DTPOFF: return MCSymbolRefExpr::VK_DTPOFF;
   case SystemZCP::NTPOFF: return MCSymbolRefExpr::VK_NTPOFF;
   }
   llvm_unreachable("Invalid SystemCPModifier!");
@@ -185,8 +222,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
     MCSymbolRefExpr::Create(getSymbol(ZCPV->getGlobalValue()),
                             getModifierVariantKind(ZCPV->getModifier()),
                             OutContext);
-  uint64_t Size =
-      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(ZCPV->getType());
+  uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType());
 
   OutStreamer.EmitValue(Expr, Size);
 }
@@ -220,7 +256,7 @@ bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 }
 
 void SystemZAsmPrinter::EmitEndOfAsmFile(Module &M) {
-  if (Subtarget->isTargetELF()) {
+  if (Triple(TM.getTargetTriple()).isOSBinFormatELF()) {
     auto &TLOFELF =
       static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
 
@@ -230,7 +266,7 @@ void SystemZAsmPrinter::EmitEndOfAsmFile(Module &M) {
     MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
     if (!Stubs.empty()) {
       OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
-      const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+      const DataLayout *TD = TM.getDataLayout();
 
       for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
         OutStreamer.EmitLabel(Stubs[i].first);
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.h b/lib/Target/SystemZ/SystemZAsmPrinter.h
index 6467279..a4d5b78 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -22,14 +22,9 @@ class Module;
 class raw_ostream;
 
 class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
-private:
-  const SystemZSubtarget *Subtarget;
-
 public:
-  SystemZAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer) {
-    Subtarget = &TM.getSubtarget<SystemZSubtarget>();
-  }
+  SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)) {}
 
   // Override AsmPrinter.
   const char *getPassName() const override {
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
index 19cec21..44ea1d2 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -28,6 +28,11 @@ SystemZConstantPoolValue::Create(const GlobalValue *GV,
 
 unsigned SystemZConstantPoolValue::getRelocationInfo() const {
   switch (Modifier) {
+  case SystemZCP::TLSGD:
+  case SystemZCP::TLSLDM:
+  case SystemZCP::DTPOFF:
+    // May require a dynamic relocation.
+    return 2;
   case SystemZCP::NTPOFF:
     // May require a relocation, but the relocations are always resolved
     // by the static linker.
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.h b/lib/Target/SystemZ/SystemZConstantPoolValue.h
index 0bd8c20..e5f1bb1 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.h
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.h
@@ -19,13 +19,17 @@ class GlobalValue;
 
 namespace SystemZCP {
 enum SystemZCPModifier {
+  TLSGD,
+  TLSLDM,
+  DTPOFF,
   NTPOFF
 };
 } // end namespace SystemZCP
 
 /// A SystemZ-specific constant pool value.  At present, the only
-/// defined constant pool values are offsets of thread-local variables
-/// (written x@NTPOFF).
+/// defined constant pool values are module IDs or offsets of
+/// thread-local variables (written x@TLSGD, x@TLSLDM, x@DTPOFF,
+/// or x@NTPOFF).
 class SystemZConstantPoolValue : public MachineConstantPoolValue {
   const GlobalValue *GV;
   SystemZCP::SystemZCPModifier Modifier;
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index ce99ee5..16f9adc 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -47,7 +47,7 @@ struct Reference {
     return *this;
   }
 
-  LLVM_EXPLICIT operator bool() const { return Def || Use; }
+  explicit operator bool() const { return Def || Use; }
 
   // True if the register is defined or used in some form, either directly or
   // via a sub- or super-register.
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 5f84624..b8b0db9 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -127,8 +127,7 @@ struct RxSBGOperands {
 };
 
 class SystemZDAGToDAGISel : public SelectionDAGISel {
-  const SystemZTargetLowering &Lowering;
-  const SystemZSubtarget &Subtarget;
+  const SystemZSubtarget *Subtarget;
 
   // Used by SystemZOperands.td to create integer constants.
   inline SDValue getImm(const SDNode *Node, uint64_t Imm) const {
@@ -140,7 +139,7 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   }
 
   const SystemZInstrInfo *getInstrInfo() const {
-    return getTargetMachine().getSubtargetImpl()->getInstrInfo();
+    return Subtarget->getInstrInfo();
   }
 
   // Try to fold more of the base or index of AM into AM, where IsBase
@@ -315,9 +314,12 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
 
 public:
   SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
-      : SelectionDAGISel(TM, OptLevel),
-        Lowering(*TM.getSubtargetImpl()->getTargetLowering()),
-        Subtarget(*TM.getSubtargetImpl()) {}
+      : SelectionDAGISel(TM, OptLevel) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    Subtarget = &MF.getSubtarget<SystemZSubtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
+  }
 
   // Override MachineFunctionPass.
   const char *getPassName() const override {
@@ -897,7 +899,7 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
 
   unsigned Opcode = SystemZ::RISBG;
   EVT OpcodeVT = MVT::i64;
-  if (VT == MVT::i32 && Subtarget.hasHighWord()) {
+  if (VT == MVT::i32 && Subtarget->hasHighWord()) {
     Opcode = SystemZ::RISBMux;
     OpcodeVT = MVT::i32;
     RISBG.Start &= 31;
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index b282fca..e96398d 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -80,9 +80,9 @@ static MachineOperand earlyUseOperand(MachineOperand Op) {
   return Op;
 }
 
-SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm)
-    : TargetLowering(tm),
-      Subtarget(tm.getSubtarget<SystemZSubtarget>()) {
+SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
+                                             const SystemZSubtarget &STI)
+    : TargetLowering(tm), Subtarget(STI) {
   MVT PtrVT = getPointerTy();
 
   // Set up the register classes.
@@ -96,7 +96,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm)
   addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
 
   // Compute derived properties from the register classes
-  computeRegisterProperties();
+  computeRegisterProperties(Subtarget.getRegisterInfo());
 
   // Set up special registers.
   setExceptionPointerRegister(SystemZ::R6D);
@@ -218,10 +218,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm)
   setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
 
   // We have native instructions for i8, i16 and i32 extensions, but not i1.
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::EXTLOAD,  VT, MVT::i1, Promote);
+  }
 
   // Handle the various types of symbolic address.
   setOperationAction(ISD::ConstantPool,     PtrVT, Custom);
@@ -275,7 +277,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm)
   // Needed so that we don't try to implement f128 constant loads using
   // a load-and-extend of a f80 constant (in cases where the constant
   // would fit in an f80).
-  setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
+  for (MVT VT : MVT::fp_valuetypes())
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
 
   // Floating-point truncation and stores need to be done separately.
   setTruncStoreAction(MVT::f64,  MVT::f32, Expand);
@@ -496,8 +499,10 @@ parseRegisterNumber(const std::string &Constraint,
   return std::make_pair(0U, nullptr);
 }
 
-std::pair<unsigned, const TargetRegisterClass *> SystemZTargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const {
+std::pair<unsigned, const TargetRegisterClass *>
+SystemZTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, const std::string &Constraint,
+    MVT VT) const {
   if (Constraint.size() == 1) {
     // GCC Constraint Letters
     switch (Constraint[0]) {
@@ -554,7 +559,7 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const {
                                  SystemZMC::FP64Regs);
     }
   }
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 
 void SystemZTargetLowering::
@@ -673,9 +678,9 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   SystemZMachineFunctionInfo *FuncInfo =
-    MF.getInfo<SystemZMachineFunctionInfo>();
-  auto *TFL = static_cast<const SystemZFrameLowering *>(
-      DAG.getSubtarget().getFrameLowering());
+      MF.getInfo<SystemZMachineFunctionInfo>();
+  auto *TFL =
+      static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
@@ -914,8 +919,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                   RegsToPass[I].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
@@ -1778,12 +1782,8 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
     }
   }
 
-  SmallVector<SDValue, 5> Ops;
-  Ops.push_back(TrueOp);
-  Ops.push_back(FalseOp);
-  Ops.push_back(DAG.getConstant(C.CCValid, MVT::i32));
-  Ops.push_back(DAG.getConstant(C.CCMask, MVT::i32));
-  Ops.push_back(Glue);
+  SDValue Ops[] = {TrueOp, FalseOp, DAG.getConstant(C.CCValid, MVT::i32),
+                   DAG.getConstant(C.CCMask, MVT::i32), Glue};
 
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
   return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
@@ -1828,6 +1828,52 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
   return Result;
 }
 
+SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
+                                                 SelectionDAG &DAG,
+                                                 unsigned Opcode,
+                                                 SDValue GOTOffset) const {
+  SDLoc DL(Node);
+  EVT PtrVT = getPointerTy();
+  SDValue Chain = DAG.getEntryNode();
+  SDValue Glue;
+
+  // __tls_get_offset takes the GOT offset in %r2 and the GOT in %r12.
+  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R12D, GOT, Glue);
+  Glue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R2D, GOTOffset, Glue);
+  Glue = Chain.getValue(1);
+
+  // The first call operand is the chain and the second is the TLS symbol.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
+                                           Node->getValueType(0),
+                                           0, 0));
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  Ops.push_back(DAG.getRegister(SystemZ::R2D, PtrVT));
+  Ops.push_back(DAG.getRegister(SystemZ::R12D, PtrVT));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallingConv::C);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  // Glue the call to the argument copies.
+  Ops.push_back(Glue);
+
+  // Emit the call.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  Chain = DAG.getNode(Opcode, DL, NodeTys, Ops);
+  Glue = Chain.getValue(1);
+
+  // Copy the return value from %r2.
+  return DAG.getCopyFromReg(Chain, DL, SystemZ::R2D, PtrVT, Glue);
+}
+
 SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
 						     SelectionDAG &DAG) const {
   SDLoc DL(Node);
@@ -1835,9 +1881,6 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
   EVT PtrVT = getPointerTy();
   TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
 
-  if (model != TLSModel::LocalExec)
-    llvm_unreachable("only local-exec TLS mode supported");
-
   // The high part of the thread pointer is in access register 0.
   SDValue TPHi = DAG.getNode(SystemZISD::EXTRACT_ACCESS, DL, MVT::i32,
                              DAG.getConstant(0, MVT::i32));
@@ -1853,15 +1896,79 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
 				    DAG.getConstant(32, PtrVT));
   SDValue TP = DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
 
-  // Get the offset of GA from the thread pointer.
-  SystemZConstantPoolValue *CPV =
-    SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
+  // Get the offset of GA from the thread pointer, based on the TLS model.
+  SDValue Offset;
+  switch (model) {
+    case TLSModel::GeneralDynamic: {
+      // Load the GOT offset of the tls_index (module ID / per-symbol offset).
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+
+      // Call __tls_get_offset to retrieve the offset.
+      Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
+      break;
+    }
+
+    case TLSModel::LocalDynamic: {
+      // Load the GOT offset of the module ID.
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+
+      // Call __tls_get_offset to retrieve the module base offset.
+      Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
+
+      // Note: The SystemZLDCleanupPass will remove redundant computations
+      // of the module base offset.  Count total number of local-dynamic
+      // accesses to trigger execution of that pass.
+      SystemZMachineFunctionInfo* MFI =
+        DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
+      MFI->incNumLocalDynamicTLSAccesses();
+
+      // Add the per-symbol offset.
+      CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
+
+      SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8);
+      DTPOffset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                              DTPOffset, MachinePointerInfo::getConstantPool(),
+                              false, false, false, 0);
 
-  // Force the offset into the constant pool and load it from there.
-  SDValue CPAddr = DAG.getConstantPool(CPV, PtrVT, 8);
-  SDValue Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
-			       CPAddr, MachinePointerInfo::getConstantPool(),
-			       false, false, false, 0);
+      Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset);
+      break;
+    }
+
+    case TLSModel::InitialExec: {
+      // Load the offset from the GOT.
+      Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                          SystemZII::MO_INDNTPOFF);
+      Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getGOT(),
+                           false, false, false, 0);
+      break;
+    }
+
+    case TLSModel::LocalExec: {
+      // Force the offset into the constant pool and load it from there.
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+                           Offset, MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+      break;
+    }
+  }
 
   // Add the base and offset together.
   return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
@@ -2611,8 +2718,8 @@ static unsigned forceReg(MachineInstr *MI, MachineOperand &Base,
 MachineBasicBlock *
 SystemZTargetLowering::emitSelect(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const {
-  const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(
-      MBB->getParent()->getSubtarget().getInstrInfo());
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
 
   unsigned DestReg  = MI->getOperand(0).getReg();
   unsigned TrueReg  = MI->getOperand(1).getReg();
@@ -2660,8 +2767,8 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI,
                                      MachineBasicBlock *MBB,
                                      unsigned StoreOpcode, unsigned STOCOpcode,
                                      bool Invert) const {
-  const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(
-      MBB->getParent()->getSubtarget().getInstrInfo());
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
 
   unsigned SrcReg     = MI->getOperand(0).getReg();
   MachineOperand Base = MI->getOperand(1);
@@ -2730,7 +2837,7 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
                                             bool Invert) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
   bool IsSubWord = (BitSize < 32);
 
@@ -2850,7 +2957,7 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
                                             unsigned BitSize) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
   bool IsSubWord = (BitSize < 32);
 
@@ -2962,7 +3069,7 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
                                           MachineBasicBlock *MBB) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
   // Extract the operands.  Base can be a register or a frame index.
@@ -3079,7 +3186,7 @@ SystemZTargetLowering::emitExt128(MachineInstr *MI,
                                   bool ClearEven, unsigned SubReg) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -3111,7 +3218,7 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
                                          unsigned Opcode) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -3281,7 +3388,7 @@ SystemZTargetLowering::emitStringWrapper(MachineInstr *MI,
                                          unsigned Opcode) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
 
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 887c236..a2b10b0 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -34,6 +34,11 @@ enum {
   CALL,
   SIBCALL,
 
+  // TLS calls.  Like regular calls, except operand 1 is the TLS symbol.
+  // (The call target is implicitly __tls_get_offset.)
+  TLS_GDCALL,
+  TLS_LDCALL,
+
   // Wraps a TargetGlobalAddress that should be loaded using PC-relative
   // accesses (LARL).  Operand 0 is the address.
   PCREL_WRAPPER,
@@ -198,7 +203,8 @@ class SystemZTargetMachine;
 
 class SystemZTargetLowering : public TargetLowering {
 public:
-  explicit SystemZTargetLowering(const TargetMachine &TM);
+  explicit SystemZTargetLowering(const TargetMachine &TM,
+                                 const SystemZSubtarget &STI);
 
   // Override TargetLowering.
   MVT getScalarShiftAmountTy(EVT LHSTy) const override {
@@ -215,8 +221,9 @@ public:
   bool isTruncateFree(EVT, EVT) const override;
   const char *getTargetNodeName(unsigned Opcode) const override;
   std::pair<unsigned, const TargetRegisterClass *>
-    getRegForInlineAsmConstraint(const std::string &Constraint,
-                                 MVT VT) const override;
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               const std::string &Constraint,
+                               MVT VT) const override;
   TargetLowering::ConstraintType
     getConstraintType(const std::string &Constraint) const override;
   TargetLowering::ConstraintWeight
@@ -257,6 +264,9 @@ private:
   SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGlobalAddress(GlobalAddressSDNode *Node,
                              SelectionDAG &DAG) const;
+  SDValue lowerTLSGetOffset(GlobalAddressSDNode *Node,
+                            SelectionDAG &DAG, unsigned Opcode,
+                            SDValue GOTOffset) const;
   SDValue lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
                                 SelectionDAG &DAG) const;
   SDValue lowerBlockAddress(BlockAddressSDNode *Node,
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index e8841e1..4a5582f 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -26,14 +26,14 @@ defm CondStoreF64 : CondStores<FP64, nonvolatile_store,
 //===----------------------------------------------------------------------===//
 
 // Load zero.
-let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1 in {
   def LZER : InherentRRE<"lzer", 0xB374, FP32,  (fpimm0)>;
   def LZDR : InherentRRE<"lzdr", 0xB375, FP64,  (fpimm0)>;
   def LZXR : InherentRRE<"lzxr", 0xB376, FP128, (fpimm0)>;
 }
 
 // Moves between two floating-point registers.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   def LER : UnaryRR <"le", 0x38,   null_frag, FP32,  FP32>;
   def LDR : UnaryRR <"ld", 0x28,   null_frag, FP64,  FP64>;
   def LXR : UnaryRRE<"lx", 0xB365, null_frag, FP128, FP128>;
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 8ff9553..8488ec8 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -633,7 +633,7 @@ struct LogicOp {
   LogicOp(unsigned regSize, unsigned immLSB, unsigned immSize)
     : RegSize(regSize), ImmLSB(immLSB), ImmSize(immSize) {}
 
-  LLVM_EXPLICIT operator bool() const { return RegSize; }
+  explicit operator bool() const { return RegSize; }
 
   unsigned RegSize, ImmLSB, ImmSize;
 };
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index d2e3f54..e711f89 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -56,10 +56,13 @@ static inline unsigned getCompareZeroCCMask(unsigned int Flags) {
 // SystemZ MachineOperand target flags.
 enum {
   // Masks out the bits for the access model.
-  MO_SYMBOL_MODIFIER = (1 << 0),
+  MO_SYMBOL_MODIFIER = (3 << 0),
 
   // @GOT (aka @GOTENT)
-  MO_GOT = (1 << 0)
+  MO_GOT = (1 << 0),
+
+  // @INDNTPOFF
+  MO_INDNTPOFF = (2 << 0)
 };
 // Classifies a branch.
 enum BranchType {
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index f4951ad..a7f7747 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -16,7 +16,7 @@ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
 def ADJCALLSTACKUP   : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
                               [(callseq_end timm:$amt1, timm:$amt2)]>;
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   // Takes as input the value of the stack pointer after a dynamic allocation
   // has been made.  Sets the output to the address of the dynamically-
   // allocated area itself, skipping the outgoing arguments.
@@ -249,11 +249,21 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
     def CallBR : Alias<2, (outs), (ins), [(z_sibcall R1D)]>;
 }
 
+// TLS calls.  These will be lowered into a call to __tls_get_offset,
+// with an extra relocation specifying the TLS symbol.
+let isCall = 1, Defs = [R14D, CC] in {
+  def TLS_GDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
+                         [(z_tls_gdcall tglobaltlsaddr:$I2)]>;
+  def TLS_LDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
+                         [(z_tls_ldcall tglobaltlsaddr:$I2)]>;
+}
+
 // Define the general form of the call instructions for the asm parser.
 // These instructions don't hard-code %r14 as the return address register.
-def BRAS  : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16:$I2),
+// Allow an optional TLS marker symbol to generate TLS call relocations.
+def BRAS  : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16tls:$I2),
                    "bras\t$R1, $I2", []>;
-def BRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32:$I2),
+def BRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32tls:$I2),
                     "brasl\t$R1, $I2", []>;
 def BASR  : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
                    "basr\t$R1, $R2", []>;
@@ -263,7 +273,7 @@ def BASR  : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
 //===----------------------------------------------------------------------===//
 
 // Register moves.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   // Expands to LR, RISBHG or RISBLG, depending on the choice of registers.
   def LRMux : UnaryRRPseudo<"l", null_frag, GRX32, GRX32>,
               Requires<[FeatureHighWord]>;
@@ -286,7 +296,7 @@ let Uses = [CC] in {
 }
 
 // Immediate moves.
-let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
     isReMaterializable = 1 in {
   // 16-bit sign-extended immediates.  LHIMux expands to LHI or IIHF,
   // deopending on the choice of register.
@@ -402,13 +412,13 @@ let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L] in
 //===----------------------------------------------------------------------===//
 
 // 32-bit extensions from registers.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   def LBR : UnaryRRE<"lb", 0xB926, sext8,  GR32, GR32>;
   def LHR : UnaryRRE<"lh", 0xB927, sext16, GR32, GR32>;
 }
 
 // 64-bit extensions from registers.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   def LGBR : UnaryRRE<"lgb", 0xB906, sext8,  GR64, GR64>;
   def LGHR : UnaryRRE<"lgh", 0xB907, sext16, GR64, GR64>;
   def LGFR : UnaryRRE<"lgf", 0xB914, sext32, GR64, GR32>;
@@ -452,7 +462,7 @@ let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
 //===----------------------------------------------------------------------===//
 
 // 32-bit extensions from registers.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   // Expands to LLCR or RISB[LH]G, depending on the choice of registers.
   def LLCRMux : UnaryRRPseudo<"llc", zext8, GRX32, GRX32>,
                 Requires<[FeatureHighWord]>;
@@ -464,7 +474,7 @@ let neverHasSideEffects = 1 in {
 }
 
 // 64-bit extensions from registers.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   def LLGCR : UnaryRRE<"llgc", 0xB984, zext8,  GR64, GR64>;
   def LLGHR : UnaryRRE<"llgh", 0xB985, zext16, GR64, GR64>;
   def LLGFR : UnaryRRE<"llgf", 0xB916, zext32, GR64, GR32>;
@@ -546,7 +556,7 @@ def STMG : StoreMultipleRSY<"stmg", 0xEB24, GR64>;
 //===----------------------------------------------------------------------===//
 
 // Byte-swapping register moves.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   def LRVR  : UnaryRRE<"lrv",  0xB91F, bswap, GR32, GR32>;
   def LRVGR : UnaryRRE<"lrvg", 0xB90F, bswap, GR64, GR64>;
 }
@@ -566,7 +576,7 @@ def STRVG : StoreRXY<"strvg", 0xE32F, storeu<bswap, nonvolatile_store>,
 //===----------------------------------------------------------------------===//
 
 // Load BDX-style addresses.
-let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isReMaterializable = 1,
     DispKey = "la" in {
   let DispSize = "12" in
     def LA : InstRX<0x41, (outs GR64:$R1), (ins laaddr12pair:$XBD2),
@@ -580,13 +590,19 @@ let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
 
 // Load a PC-relative address.  There's no version of this instruction
 // with a 16-bit offset, so there's no relaxation.
-let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
     isReMaterializable = 1 in {
   def LARL : InstRIL<0xC00, (outs GR64:$R1), (ins pcrel32:$I2),
                      "larl\t$R1, $I2",
                      [(set GR64:$R1, pcrel32:$I2)]>;
 }
 
+// Load the Global Offset Table address.  This will be lowered into a
+//     larl $R1, _GLOBAL_OFFSET_TABLE_
+// instruction.
+def GOT : Alias<6, (outs GR64:$R1), (ins),
+                [(set GR64:$R1, (global_offset_table))]>;
+
 //===----------------------------------------------------------------------===//
 // Absolute and Negation
 //===----------------------------------------------------------------------===//
@@ -1012,13 +1028,13 @@ def DLG  : BinaryRXY<"dlg",  0xE387, z_udivrem64, GR128, load, 8>;
 //===----------------------------------------------------------------------===//
 
 // Shift left.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
   def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
 }
 
 // Logical shift right.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
   def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
 }
@@ -1030,7 +1046,7 @@ let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
 }
 
 // Rotate left.
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   def RLL  : BinaryRSY<"rll",  0xEB1D, rotl, GR32>;
   def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>;
 }
diff --git a/lib/Target/SystemZ/SystemZLDCleanup.cpp b/lib/Target/SystemZ/SystemZLDCleanup.cpp
new file mode 100644
index 0000000..24165be
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -0,0 +1,143 @@
+//===-- SystemZLDCleanup.cpp - Clean up local-dynamic TLS accesses --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass combines multiple accesses to local-dynamic TLS variables so that
+// the TLS base address for the module is only fetched once per execution path
+// through the function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class SystemZLDCleanup : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZLDCleanup(const SystemZTargetMachine &tm)
+    : MachineFunctionPass(ID), TII(nullptr), MF(nullptr) {}
+
+  const char *getPassName() const override {
+    return "SystemZ Local Dynamic TLS Access Clean-up";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg);
+  MachineInstr *ReplaceTLSCall(MachineInstr *I, unsigned TLSBaseAddrReg);
+  MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg);
+
+  const SystemZInstrInfo *TII;
+  MachineFunction *MF;
+};
+
+char SystemZLDCleanup::ID = 0;
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZLDCleanupPass(SystemZTargetMachine &TM) {
+  return new SystemZLDCleanup(TM);
+}
+
+void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) {
+  TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
+  MF = &F;
+
+  SystemZMachineFunctionInfo* MFI = F.getInfo<SystemZMachineFunctionInfo>();
+  if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+    // No point folding accesses if there isn't at least two.
+    return false;
+  }
+
+  MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+  return VisitNode(DT->getRootNode(), 0);
+}
+
+// Visit the dominator subtree rooted at Node in pre-order.
+// If TLSBaseAddrReg is non-null, then use that to replace any
+// TLS_LDCALL instructions. Otherwise, create the register
+// when the first such instruction is seen, and then use it
+// as we encounter more instructions.
+bool SystemZLDCleanup::VisitNode(MachineDomTreeNode *Node,
+                                 unsigned TLSBaseAddrReg) {
+  MachineBasicBlock *BB = Node->getBlock();
+  bool Changed = false;
+
+  // Traverse the current block.
+  for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+    switch (I->getOpcode()) {
+      case SystemZ::TLS_LDCALL:
+        if (TLSBaseAddrReg)
+          I = ReplaceTLSCall(I, TLSBaseAddrReg);
+        else
+          I = SetRegister(I, &TLSBaseAddrReg);
+        Changed = true;
+        break;
+      default:
+        break;
+    }
+  }
+
+  // Visit the children of this block in the dominator tree.
+  for (auto I = Node->begin(), E = Node->end(); I != E; ++I)
+    Changed |= VisitNode(*I, TLSBaseAddrReg);
+
+  return Changed;
+}
+
+// Replace the TLS_LDCALL instruction I with a copy from TLSBaseAddrReg,
+// returning the new instruction.
+MachineInstr *SystemZLDCleanup::ReplaceTLSCall(MachineInstr *I,
+                                               unsigned TLSBaseAddrReg) {
+  // Insert a Copy from TLSBaseAddrReg to R2.
+  MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                               TII->get(TargetOpcode::COPY), SystemZ::R2D)
+                               .addReg(TLSBaseAddrReg);
+
+  // Erase the TLS_LDCALL instruction.
+  I->eraseFromParent();
+
+  return Copy;
+}
+
+// Create a virtal register in *TLSBaseAddrReg, and populate it by
+// inserting a copy instruction after I. Returns the new instruction.
+MachineInstr *SystemZLDCleanup::SetRegister(MachineInstr *I,
+                                            unsigned *TLSBaseAddrReg) {
+  // Create a virtual register for the TLS base address.
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  *TLSBaseAddrReg = RegInfo.createVirtualRegister(&SystemZ::GR64BitRegClass);
+
+  // Insert a copy from R2 to TLSBaseAddrReg.
+  MachineInstr *Next = I->getNextNode();
+  MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+                               TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+                               .addReg(SystemZ::R2D);
+
+  return Copy;
+}
+
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.cpp b/lib/Target/SystemZ/SystemZMCInstLower.cpp
index df561e2..6bb96f1 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.cpp
+++ b/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -22,6 +22,8 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned Flags) {
       return MCSymbolRefExpr::VK_None;
     case SystemZII::MO_GOT:
       return MCSymbolRefExpr::VK_GOT;
+    case SystemZII::MO_INDNTPOFF:
+      return MCSymbolRefExpr::VK_INDNTPOFF;
   }
   llvm_unreachable("Unrecognised MO_ACCESS_MODEL");
 }
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index 92c2ce7..34fc36d 100644
--- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -23,11 +23,13 @@ class SystemZMachineFunctionInfo : public MachineFunctionInfo {
   unsigned VarArgsFrameIndex;
   unsigned RegSaveFrameIndex;
   bool ManipulatesSP;
+  unsigned NumLocalDynamics;
 
 public:
   explicit SystemZMachineFunctionInfo(MachineFunction &MF)
     : LowSavedGPR(0), HighSavedGPR(0), VarArgsFirstGPR(0), VarArgsFirstFPR(0),
-      VarArgsFrameIndex(0), RegSaveFrameIndex(0), ManipulatesSP(false) {}
+      VarArgsFrameIndex(0), RegSaveFrameIndex(0), ManipulatesSP(false),
+      NumLocalDynamics(0) {}
 
   // Get and set the first call-saved GPR that should be saved and restored
   // by this function.  This is 0 if no GPRs need to be saved or restored.
@@ -61,6 +63,10 @@ public:
   // e.g. through STACKSAVE or STACKRESTORE.
   bool getManipulatesSP() const { return ManipulatesSP; }
   void setManipulatesSP(bool MSP) { ManipulatesSP = MSP; }
+
+  // Count number of local-dynamic TLS symbols used.
+  unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
 };
 
 } // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 7be81dc..1b5b7d7 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -16,6 +16,11 @@ class ImmediateAsmOperand<string name>
   let Name = name;
   let RenderMethod = "addImmOperands";
 }
+class ImmediateTLSAsmOperand<string name>
+  : AsmOperandClass {
+  let Name = name;
+  let RenderMethod = "addImmTLSOperands";
+}
 
 // Constructs both a DAG pattern and instruction operand for an immediate
 // of type VT.  PRED returns true if a node is acceptable and XFORM returns
@@ -34,6 +39,11 @@ class PCRelAsmOperand<string size> : ImmediateAsmOperand<"PCRel"##size> {
   let PredicateMethod = "isImm";
   let ParserMethod = "parsePCRel"##size;
 }
+class PCRelTLSAsmOperand<string size>
+  : ImmediateTLSAsmOperand<"PCRelTLS"##size> {
+  let PredicateMethod = "isImmTLS";
+  let ParserMethod = "parsePCRelTLS"##size;
+}
 
 // Constructs an operand for a PC-relative address with address type VT.
 // ASMOP is the associated asm operand.
@@ -41,6 +51,10 @@ class PCRelOperand<ValueType vt, AsmOperandClass asmop> : Operand<vt> {
   let PrintMethod = "printPCRelOperand";
   let ParserMatchClass = asmop;
 }
+class PCRelTLSOperand<ValueType vt, AsmOperandClass asmop> : Operand<vt> {
+  let PrintMethod = "printPCRelTLSOperand";
+  let ParserMatchClass = asmop;
+}
 
 // Constructs both a DAG pattern and instruction operand for a PC-relative
 // address with address size VT.  SELF is the name of the operand and
@@ -370,6 +384,8 @@ def fpimmneg0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(-0.0); }]>;
 // PC-relative asm operands.
 def PCRel16 : PCRelAsmOperand<"16">;
 def PCRel32 : PCRelAsmOperand<"32">;
+def PCRelTLS16 : PCRelTLSAsmOperand<"16">;
+def PCRelTLS32 : PCRelTLSAsmOperand<"32">;
 
 // PC-relative offsets of a basic block.  The offset is sign-extended
 // and multiplied by 2.
@@ -382,6 +398,20 @@ def brtarget32 : PCRelOperand<OtherVT, PCRel32> {
   let DecoderMethod = "decodePC32DBLOperand";
 }
 
+// Variants of brtarget16/32 with an optional additional TLS symbol.
+// These are used to annotate calls to __tls_get_offset.
+def tlssym : Operand<i64> { }
+def brtarget16tls : PCRelTLSOperand<OtherVT, PCRelTLS16> {
+  let MIOperandInfo = (ops brtarget16:$func, tlssym:$sym);
+  let EncoderMethod = "getPC16DBLTLSEncoding";
+  let DecoderMethod = "decodePC16DBLOperand";
+}
+def brtarget32tls : PCRelTLSOperand<OtherVT, PCRelTLS32> {
+  let MIOperandInfo = (ops brtarget32:$func, tlssym:$sym);
+  let EncoderMethod = "getPC32DBLTLSEncoding";
+  let DecoderMethod = "decodePC32DBLOperand";
+}
+
 // A PC-relative offset of a global value.  The offset is sign-extended
 // and multiplied by 2.
 def pcrel32 : PCRelAddress<i64, "pcrel32", PCRel32> {
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index c70e662..51ac5da 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -90,6 +90,7 @@ def callseq_start       : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
 def callseq_end         : SDNode<"ISD::CALLSEQ_END",   SDT_CallSeqEnd,
                                  [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue,
                                   SDNPOutGlue]>;
+def global_offset_table : SDNode<"ISD::GLOBAL_OFFSET_TABLE", SDTPtrLeaf>;
 
 // Nodes for SystemZISD::*.  See SystemZISelLowering.h for more details.
 def z_retflag           : SDNode<"SystemZISD::RET_FLAG", SDTNone,
@@ -100,6 +101,12 @@ def z_call              : SDNode<"SystemZISD::CALL", SDT_ZCall,
 def z_sibcall           : SDNode<"SystemZISD::SIBCALL", SDT_ZCall,
                                  [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
                                   SDNPVariadic]>;
+def z_tls_gdcall        : SDNode<"SystemZISD::TLS_GDCALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                                  SDNPVariadic]>;
+def z_tls_ldcall        : SDNode<"SystemZISD::TLS_LDCALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                                  SDNPVariadic]>;
 def z_pcrel_wrapper     : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
 def z_pcrel_offset      : SDNode<"SystemZISD::PCREL_OFFSET",
                                  SDT_ZWrapOffset, []>;
diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td
index e6b58f1..1594854 100644
--- a/lib/Target/SystemZ/SystemZProcessors.td
+++ b/lib/Target/SystemZ/SystemZProcessors.td
@@ -12,12 +12,12 @@
 //===----------------------------------------------------------------------===//
 
 class SystemZFeature<string extname, string intname, string desc>
-  : Predicate<"Subtarget.has"##intname##"()">,
+  : Predicate<"Subtarget->has"##intname##"()">,
     AssemblerPredicate<"Feature"##intname, extname>,
     SubtargetFeature<extname, "Has"##intname, "true", desc>;
 
 class SystemZMissingFeature<string intname>
-  : Predicate<"!Subtarget.has"##intname##"()">;
+  : Predicate<"!Subtarget->has"##intname##"()">;
 
 def FeatureDistinctOps : SystemZFeature<
   "distinct-ops", "DistinctOps",
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index a3cba64..12fc198 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -103,7 +103,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
       // we can move at most 2 halfwords.
       uint64_t ByteVal = CByte->getZExtValue();
       if (ByteVal == 0 || ByteVal == 255 ?
-          Bytes <= 16 && CountPopulation_64(Bytes) <= 2 :
+          Bytes <= 16 && countPopulation(Bytes) <= 2 :
           Bytes <= 4) {
         unsigned Size1 = Bytes == 16 ? 8 : 1 << findLastSet(Bytes);
         unsigned Size2 = Bytes - Size1;
@@ -222,12 +222,9 @@ EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
 
   // Now select between End and null, depending on whether the character
   // was found.
-  SmallVector<SDValue, 5> Ops;
-  Ops.push_back(End);
-  Ops.push_back(DAG.getConstant(0, PtrVT));
-  Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST, MVT::i32));
-  Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, MVT::i32));
-  Ops.push_back(Glue);
+  SDValue Ops[] = {End, DAG.getConstant(0, PtrVT),
+                   DAG.getConstant(SystemZ::CCMASK_SRST, MVT::i32),
+                   DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, MVT::i32), Glue};
   VTs = DAG.getVTList(PtrVT, MVT::Glue);
   End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
   return std::make_pair(End, Chain);
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index e160bc8..31a2bff 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -44,13 +44,8 @@ SystemZSubtarget::SystemZSubtarget(const std::string &TT,
     : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
       HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
       HasFastSerialization(false), HasInterlockedAccess1(false),
-      TargetTriple(TT),
-      // Make sure that global data has at least 16 bits of alignment by
-      // default, so that we can refer to it using LARL.  We don't have any
-      // special requirements for stack variables though.
-      DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"),
-      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
-      TSInfo(DL), FrameLowering() {}
+      TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+      TLInfo(TM, *this), TSInfo(*TM.getDataLayout()), FrameLowering() {}
 
 // Return true if GV binds locally under reloc model RM.
 static bool bindsLocally(const GlobalValue *GV, Reloc::Model RM) {
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index f881552..99cb1ad 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -43,7 +43,6 @@ protected:
 
 private:
   Triple TargetTriple;
-  const DataLayout DL;
   SystemZInstrInfo InstrInfo;
   SystemZTargetLowering TLInfo;
   SystemZSelectionDAGInfo TSInfo;
@@ -59,7 +58,6 @@ public:
     return &FrameLowering;
   }
   const SystemZInstrInfo *getInstrInfo() const override { return &InstrInfo; }
-  const DataLayout *getDataLayout() const override { return &DL; }
   const SystemZRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index d7c432e..73198b1 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -27,6 +27,10 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT,
                                            CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
       TLOF(make_unique<TargetLoweringObjectFileELF>()),
+      // Make sure that global data has at least 16 bits of alignment by
+      // default, so that we can refer to it using LARL.  We don't have any
+      // special requirements for stack variables though.
+      DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 }
@@ -46,8 +50,8 @@ public:
 
   void addIRPasses() override;
   bool addInstSelector() override;
-  bool addPreSched2() override;
-  bool addPreEmitPass() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
 };
 } // end anonymous namespace
 
@@ -57,17 +61,20 @@ void SystemZPassConfig::addIRPasses() {
 
 bool SystemZPassConfig::addInstSelector() {
   addPass(createSystemZISelDag(getSystemZTargetMachine(), getOptLevel()));
+
+ if (getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZLDCleanupPass(getSystemZTargetMachine()));
+
   return false;
 }
 
-bool SystemZPassConfig::addPreSched2() {
+void SystemZPassConfig::addPreSched2() {
   if (getOptLevel() != CodeGenOpt::None &&
       getSystemZTargetMachine().getSubtargetImpl()->hasLoadStoreOnCond())
     addPass(&IfConverterID);
-  return true;
 }
 
-bool SystemZPassConfig::addPreEmitPass() {
+void SystemZPassConfig::addPreEmitPass() {
   // We eliminate comparisons here rather than earlier because some
   // transformations can change the set of available CC values and we
   // generally want those transformations to have priority.  This is
@@ -92,11 +99,10 @@ bool SystemZPassConfig::addPreEmitPass() {
   // between the comparison and the branch, but it isn't clear whether
   // preventing that would be a win or not.
   if (getOptLevel() != CodeGenOpt::None)
-    addPass(createSystemZElimComparePass(getSystemZTargetMachine()));
+    addPass(createSystemZElimComparePass(getSystemZTargetMachine()), false);
   if (getOptLevel() != CodeGenOpt::None)
-    addPass(createSystemZShortenInstPass(getSystemZTargetMachine()));
+    addPass(createSystemZShortenInstPass(getSystemZTargetMachine()), false);
   addPass(createSystemZLongBranchPass(getSystemZTargetMachine()));
-  return true;
 }
 
 TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index 9fae5e4..52ccc5a 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -24,6 +24,7 @@ class TargetFrameLowering;
 
 class SystemZTargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  const DataLayout DL;
   SystemZSubtarget        Subtarget;
 
 public:
@@ -34,6 +35,7 @@ public:
   ~SystemZTargetMachine() override;
 
   // Override TargetMachine.
+  const DataLayout *getDataLayout() const override { return &DL; }
   const SystemZSubtarget *getSubtargetImpl() const override {
     return &Subtarget;
   }
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index 4b51b3f..5b7953d 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -18,24 +18,25 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Value.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/PassManager.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include <cstring>
 
 using namespace llvm;
 
-inline TargetLibraryInfo *unwrap(LLVMTargetLibraryInfoRef P) {
-  return reinterpret_cast<TargetLibraryInfo*>(P);
+inline TargetLibraryInfoImpl *unwrap(LLVMTargetLibraryInfoRef P) {
+  return reinterpret_cast<TargetLibraryInfoImpl*>(P);
 }
 
-inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfo *P) {
-  TargetLibraryInfo *X = const_cast<TargetLibraryInfo*>(P);
+inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfoImpl *P) {
+  TargetLibraryInfoImpl *X = const_cast<TargetLibraryInfoImpl*>(P);
   return reinterpret_cast<LLVMTargetLibraryInfoRef>(X);
 }
 
 void llvm::initializeTarget(PassRegistry &Registry) {
   initializeDataLayoutPassPass(Registry);
-  initializeTargetLibraryInfoPass(Registry);
+  initializeTargetLibraryInfoWrapperPassPass(Registry);
+  initializeTargetTransformInfoWrapperPassPass(Registry);
 }
 
 void LLVMInitializeTarget(LLVMPassRegistryRef R) {
@@ -54,7 +55,7 @@ void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
 
 void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
                               LLVMPassManagerRef PM) {
-  unwrap(PM)->add(new TargetLibraryInfo(*unwrap(TLI)));
+  unwrap(PM)->add(new TargetLibraryInfoWrapperPass(*unwrap(TLI)));
 }
 
 char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef TD) {
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
deleted file mode 100644
index bca56b5..0000000
--- a/lib/Target/TargetLibraryInfo.cpp
+++ /dev/null
@@ -1,753 +0,0 @@
-//===-- TargetLibraryInfo.cpp - Runtime library information ----------------==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the TargetLibraryInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/ADT/Triple.h"
-using namespace llvm;
-
-// Register the default implementation.
-INITIALIZE_PASS(TargetLibraryInfo, "targetlibinfo",
-                "Target Library Information", false, true)
-char TargetLibraryInfo::ID = 0;
-
-void TargetLibraryInfo::anchor() { }
-
-const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
-  {
-    "_IO_getc",
-    "_IO_putc",
-    "_ZdaPv",
-    "_ZdaPvRKSt9nothrow_t",
-    "_ZdaPvj",
-    "_ZdaPvm",
-    "_ZdlPv",
-    "_ZdlPvRKSt9nothrow_t",
-    "_ZdlPvj",
-    "_ZdlPvm",
-    "_Znaj",
-    "_ZnajRKSt9nothrow_t",
-    "_Znam",
-    "_ZnamRKSt9nothrow_t",
-    "_Znwj",
-    "_ZnwjRKSt9nothrow_t",
-    "_Znwm",
-    "_ZnwmRKSt9nothrow_t",
-    "__cospi",
-    "__cospif",
-    "__cxa_atexit",
-    "__cxa_guard_abort",
-    "__cxa_guard_acquire",
-    "__cxa_guard_release",
-    "__isoc99_scanf",
-    "__isoc99_sscanf",
-    "__memcpy_chk",
-    "__memmove_chk",
-    "__memset_chk",
-    "__sincospi_stret",
-    "__sincospif_stret",
-    "__sinpi",
-    "__sinpif",
-    "__sqrt_finite",
-    "__sqrtf_finite",
-    "__sqrtl_finite",
-    "__stpcpy_chk",
-    "__stpncpy_chk",
-    "__strcpy_chk",
-    "__strdup",
-    "__strncpy_chk",
-    "__strndup",
-    "__strtok_r",
-    "abs",
-    "access",
-    "acos",
-    "acosf",
-    "acosh",
-    "acoshf",
-    "acoshl",
-    "acosl",
-    "asin",
-    "asinf",
-    "asinh",
-    "asinhf",
-    "asinhl",
-    "asinl",
-    "atan",
-    "atan2",
-    "atan2f",
-    "atan2l",
-    "atanf",
-    "atanh",
-    "atanhf",
-    "atanhl",
-    "atanl",
-    "atof",
-    "atoi",
-    "atol",
-    "atoll",
-    "bcmp",
-    "bcopy",
-    "bzero",
-    "calloc",
-    "cbrt",
-    "cbrtf",
-    "cbrtl",
-    "ceil",
-    "ceilf",
-    "ceill",
-    "chmod",
-    "chown",
-    "clearerr",
-    "closedir",
-    "copysign",
-    "copysignf",
-    "copysignl",
-    "cos",
-    "cosf",
-    "cosh",
-    "coshf",
-    "coshl",
-    "cosl",
-    "ctermid",
-    "exp",
-    "exp10",
-    "exp10f",
-    "exp10l",
-    "exp2",
-    "exp2f",
-    "exp2l",
-    "expf",
-    "expl",
-    "expm1",
-    "expm1f",
-    "expm1l",
-    "fabs",
-    "fabsf",
-    "fabsl",
-    "fclose",
-    "fdopen",
-    "feof",
-    "ferror",
-    "fflush",
-    "ffs",
-    "ffsl",
-    "ffsll",
-    "fgetc",
-    "fgetpos",
-    "fgets",
-    "fileno",
-    "fiprintf",
-    "flockfile",
-    "floor",
-    "floorf",
-    "floorl",
-    "fmax",
-    "fmaxf",
-    "fmaxl",
-    "fmin",
-    "fminf",
-    "fminl",
-    "fmod",
-    "fmodf",
-    "fmodl",
-    "fopen",
-    "fopen64",
-    "fprintf",
-    "fputc",
-    "fputs",
-    "fread",
-    "free",
-    "frexp",
-    "frexpf",
-    "frexpl",
-    "fscanf",
-    "fseek",
-    "fseeko",
-    "fseeko64",
-    "fsetpos",
-    "fstat",
-    "fstat64",
-    "fstatvfs",
-    "fstatvfs64",
-    "ftell",
-    "ftello",
-    "ftello64",
-    "ftrylockfile",
-    "funlockfile",
-    "fwrite",
-    "getc",
-    "getc_unlocked",
-    "getchar",
-    "getenv",
-    "getitimer",
-    "getlogin_r",
-    "getpwnam",
-    "gets",
-    "gettimeofday",
-    "htonl",
-    "htons",
-    "iprintf",
-    "isascii",
-    "isdigit",
-    "labs",
-    "lchown",
-    "ldexp",
-    "ldexpf",
-    "ldexpl",
-    "llabs",
-    "log",
-    "log10",
-    "log10f",
-    "log10l",
-    "log1p",
-    "log1pf",
-    "log1pl",
-    "log2",
-    "log2f",
-    "log2l",
-    "logb",
-    "logbf",
-    "logbl",
-    "logf",
-    "logl",
-    "lstat",
-    "lstat64",
-    "malloc",
-    "memalign",
-    "memccpy",
-    "memchr",
-    "memcmp",
-    "memcpy",
-    "memmove",
-    "memrchr",
-    "memset",
-    "memset_pattern16",
-    "mkdir",
-    "mktime",
-    "modf",
-    "modff",
-    "modfl",
-    "nearbyint",
-    "nearbyintf",
-    "nearbyintl",
-    "ntohl",
-    "ntohs",
-    "open",
-    "open64",
-    "opendir",
-    "pclose",
-    "perror",
-    "popen",
-    "posix_memalign",
-    "pow",
-    "powf",
-    "powl",
-    "pread",
-    "printf",
-    "putc",
-    "putchar",
-    "puts",
-    "pwrite",
-    "qsort",
-    "read",
-    "readlink",
-    "realloc",
-    "reallocf",
-    "realpath",
-    "remove",
-    "rename",
-    "rewind",
-    "rint",
-    "rintf",
-    "rintl",
-    "rmdir",
-    "round",
-    "roundf",
-    "roundl",
-    "scanf",
-    "setbuf",
-    "setitimer",
-    "setvbuf",
-    "sin",
-    "sinf",
-    "sinh",
-    "sinhf",
-    "sinhl",
-    "sinl",
-    "siprintf",
-    "snprintf",
-    "sprintf",
-    "sqrt",
-    "sqrtf",
-    "sqrtl",
-    "sscanf",
-    "stat",
-    "stat64",
-    "statvfs",
-    "statvfs64",
-    "stpcpy",
-    "stpncpy",
-    "strcasecmp",
-    "strcat",
-    "strchr",
-    "strcmp",
-    "strcoll",
-    "strcpy",
-    "strcspn",
-    "strdup",
-    "strlen",
-    "strncasecmp",
-    "strncat",
-    "strncmp",
-    "strncpy",
-    "strndup",
-    "strnlen",
-    "strpbrk",
-    "strrchr",
-    "strspn",
-    "strstr",
-    "strtod",
-    "strtof",
-    "strtok",
-    "strtok_r",
-    "strtol",
-    "strtold",
-    "strtoll",
-    "strtoul",
-    "strtoull",
-    "strxfrm",
-    "system",
-    "tan",
-    "tanf",
-    "tanh",
-    "tanhf",
-    "tanhl",
-    "tanl",
-    "times",
-    "tmpfile",
-    "tmpfile64",
-    "toascii",
-    "trunc",
-    "truncf",
-    "truncl",
-    "uname",
-    "ungetc",
-    "unlink",
-    "unsetenv",
-    "utime",
-    "utimes",
-    "valloc",
-    "vfprintf",
-    "vfscanf",
-    "vprintf",
-    "vscanf",
-    "vsnprintf",
-    "vsprintf",
-    "vsscanf",
-    "write"
-  };
-
-static bool hasSinCosPiStret(const Triple &T) {
-  // Only Darwin variants have _stret versions of combined trig functions.
-  if (!T.isOSDarwin())
-    return false;
-
-  // The ABI is rather complicated on x86, so don't do anything special there.
-  if (T.getArch() == Triple::x86)
-    return false;
-
-  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 9))
-    return false;
-
-  if (T.isiOS() && T.isOSVersionLT(7, 0))
-    return false;
-
-  return true;
-}
-
-/// initialize - Initialize the set of available library functions based on the
-/// specified target triple.  This should be carefully written so that a missing
-/// target triple gets a sane set of defaults.
-static void initialize(TargetLibraryInfo &TLI, const Triple &T,
-                       const char **StandardNames) {
-  initializeTargetLibraryInfoPass(*PassRegistry::getPassRegistry());
-
-#ifndef NDEBUG
-  // Verify that the StandardNames array is in alphabetical order.
-  for (unsigned F = 1; F < LibFunc::NumLibFuncs; ++F) {
-    if (strcmp(StandardNames[F-1], StandardNames[F]) >= 0)
-      llvm_unreachable("TargetLibraryInfo function names must be sorted");
-  }
-#endif // !NDEBUG
-
-  // There are no library implementations of mempcy and memset for r600 and
-  // these can be difficult to lower in the backend.
-  if (T.getArch() == Triple::r600) {
-    TLI.setUnavailable(LibFunc::memcpy);
-    TLI.setUnavailable(LibFunc::memset);
-    TLI.setUnavailable(LibFunc::memset_pattern16);
-    return;
-  }
-
-  // memset_pattern16 is only available on iOS 3.0 and Mac OS X 10.5 and later.
-  if (T.isMacOSX()) {
-    if (T.isMacOSXVersionLT(10, 5))
-      TLI.setUnavailable(LibFunc::memset_pattern16);
-  } else if (T.isiOS()) {
-    if (T.isOSVersionLT(3, 0))
-      TLI.setUnavailable(LibFunc::memset_pattern16);
-  } else {
-    TLI.setUnavailable(LibFunc::memset_pattern16);
-  }
-
-  if (!hasSinCosPiStret(T)) {
-    TLI.setUnavailable(LibFunc::sinpi);
-    TLI.setUnavailable(LibFunc::sinpif);
-    TLI.setUnavailable(LibFunc::cospi);
-    TLI.setUnavailable(LibFunc::cospif);
-    TLI.setUnavailable(LibFunc::sincospi_stret);
-    TLI.setUnavailable(LibFunc::sincospif_stret);
-  }
-
-  if (T.isMacOSX() && T.getArch() == Triple::x86 &&
-      !T.isMacOSXVersionLT(10, 7)) {
-    // x86-32 OSX has a scheme where fwrite and fputs (and some other functions
-    // we don't care about) have two versions; on recent OSX, the one we want
-    // has a $UNIX2003 suffix. The two implementations are identical except
-    // for the return value in some edge cases.  However, we don't want to
-    // generate code that depends on the old symbols.
-    TLI.setAvailableWithName(LibFunc::fwrite, "fwrite$UNIX2003");
-    TLI.setAvailableWithName(LibFunc::fputs, "fputs$UNIX2003");
-  }
-
-  // iprintf and friends are only available on XCore and TCE.
-  if (T.getArch() != Triple::xcore && T.getArch() != Triple::tce) {
-    TLI.setUnavailable(LibFunc::iprintf);
-    TLI.setUnavailable(LibFunc::siprintf);
-    TLI.setUnavailable(LibFunc::fiprintf);
-  }
-
-  if (T.isOSWindows() && !T.isOSCygMing()) {
-    // Win32 does not support long double
-    TLI.setUnavailable(LibFunc::acosl);
-    TLI.setUnavailable(LibFunc::asinl);
-    TLI.setUnavailable(LibFunc::atanl);
-    TLI.setUnavailable(LibFunc::atan2l);
-    TLI.setUnavailable(LibFunc::ceill);
-    TLI.setUnavailable(LibFunc::copysignl);
-    TLI.setUnavailable(LibFunc::cosl);
-    TLI.setUnavailable(LibFunc::coshl);
-    TLI.setUnavailable(LibFunc::expl);
-    TLI.setUnavailable(LibFunc::fabsf); // Win32 and Win64 both lack fabsf
-    TLI.setUnavailable(LibFunc::fabsl);
-    TLI.setUnavailable(LibFunc::floorl);
-    TLI.setUnavailable(LibFunc::fmaxl);
-    TLI.setUnavailable(LibFunc::fminl);
-    TLI.setUnavailable(LibFunc::fmodl);
-    TLI.setUnavailable(LibFunc::frexpl);
-    TLI.setUnavailable(LibFunc::ldexpf);
-    TLI.setUnavailable(LibFunc::ldexpl);
-    TLI.setUnavailable(LibFunc::logl);
-    TLI.setUnavailable(LibFunc::modfl);
-    TLI.setUnavailable(LibFunc::powl);
-    TLI.setUnavailable(LibFunc::sinl);
-    TLI.setUnavailable(LibFunc::sinhl);
-    TLI.setUnavailable(LibFunc::sqrtl);
-    TLI.setUnavailable(LibFunc::tanl);
-    TLI.setUnavailable(LibFunc::tanhl);
-
-    // Win32 only has C89 math
-    TLI.setUnavailable(LibFunc::acosh);
-    TLI.setUnavailable(LibFunc::acoshf);
-    TLI.setUnavailable(LibFunc::acoshl);
-    TLI.setUnavailable(LibFunc::asinh);
-    TLI.setUnavailable(LibFunc::asinhf);
-    TLI.setUnavailable(LibFunc::asinhl);
-    TLI.setUnavailable(LibFunc::atanh);
-    TLI.setUnavailable(LibFunc::atanhf);
-    TLI.setUnavailable(LibFunc::atanhl);
-    TLI.setUnavailable(LibFunc::cbrt);
-    TLI.setUnavailable(LibFunc::cbrtf);
-    TLI.setUnavailable(LibFunc::cbrtl);
-    TLI.setUnavailable(LibFunc::exp2);
-    TLI.setUnavailable(LibFunc::exp2f);
-    TLI.setUnavailable(LibFunc::exp2l);
-    TLI.setUnavailable(LibFunc::expm1);
-    TLI.setUnavailable(LibFunc::expm1f);
-    TLI.setUnavailable(LibFunc::expm1l);
-    TLI.setUnavailable(LibFunc::log2);
-    TLI.setUnavailable(LibFunc::log2f);
-    TLI.setUnavailable(LibFunc::log2l);
-    TLI.setUnavailable(LibFunc::log1p);
-    TLI.setUnavailable(LibFunc::log1pf);
-    TLI.setUnavailable(LibFunc::log1pl);
-    TLI.setUnavailable(LibFunc::logb);
-    TLI.setUnavailable(LibFunc::logbf);
-    TLI.setUnavailable(LibFunc::logbl);
-    TLI.setUnavailable(LibFunc::nearbyint);
-    TLI.setUnavailable(LibFunc::nearbyintf);
-    TLI.setUnavailable(LibFunc::nearbyintl);
-    TLI.setUnavailable(LibFunc::rint);
-    TLI.setUnavailable(LibFunc::rintf);
-    TLI.setUnavailable(LibFunc::rintl);
-    TLI.setUnavailable(LibFunc::round);
-    TLI.setUnavailable(LibFunc::roundf);
-    TLI.setUnavailable(LibFunc::roundl);
-    TLI.setUnavailable(LibFunc::trunc);
-    TLI.setUnavailable(LibFunc::truncf);
-    TLI.setUnavailable(LibFunc::truncl);
-
-    // Win32 provides some C99 math with mangled names
-    TLI.setAvailableWithName(LibFunc::copysign, "_copysign");
-
-    if (T.getArch() == Triple::x86) {
-      // Win32 on x86 implements single-precision math functions as macros
-      TLI.setUnavailable(LibFunc::acosf);
-      TLI.setUnavailable(LibFunc::asinf);
-      TLI.setUnavailable(LibFunc::atanf);
-      TLI.setUnavailable(LibFunc::atan2f);
-      TLI.setUnavailable(LibFunc::ceilf);
-      TLI.setUnavailable(LibFunc::copysignf);
-      TLI.setUnavailable(LibFunc::cosf);
-      TLI.setUnavailable(LibFunc::coshf);
-      TLI.setUnavailable(LibFunc::expf);
-      TLI.setUnavailable(LibFunc::floorf);
-      TLI.setUnavailable(LibFunc::fminf);
-      TLI.setUnavailable(LibFunc::fmaxf);
-      TLI.setUnavailable(LibFunc::fmodf);
-      TLI.setUnavailable(LibFunc::logf);
-      TLI.setUnavailable(LibFunc::powf);
-      TLI.setUnavailable(LibFunc::sinf);
-      TLI.setUnavailable(LibFunc::sinhf);
-      TLI.setUnavailable(LibFunc::sqrtf);
-      TLI.setUnavailable(LibFunc::tanf);
-      TLI.setUnavailable(LibFunc::tanhf);
-    }
-
-    // Win32 does *not* provide provide these functions, but they are
-    // generally available on POSIX-compliant systems:
-    TLI.setUnavailable(LibFunc::access);
-    TLI.setUnavailable(LibFunc::bcmp);
-    TLI.setUnavailable(LibFunc::bcopy);
-    TLI.setUnavailable(LibFunc::bzero);
-    TLI.setUnavailable(LibFunc::chmod);
-    TLI.setUnavailable(LibFunc::chown);
-    TLI.setUnavailable(LibFunc::closedir);
-    TLI.setUnavailable(LibFunc::ctermid);
-    TLI.setUnavailable(LibFunc::fdopen);
-    TLI.setUnavailable(LibFunc::ffs);
-    TLI.setUnavailable(LibFunc::fileno);
-    TLI.setUnavailable(LibFunc::flockfile);
-    TLI.setUnavailable(LibFunc::fseeko);
-    TLI.setUnavailable(LibFunc::fstat);
-    TLI.setUnavailable(LibFunc::fstatvfs);
-    TLI.setUnavailable(LibFunc::ftello);
-    TLI.setUnavailable(LibFunc::ftrylockfile);
-    TLI.setUnavailable(LibFunc::funlockfile);
-    TLI.setUnavailable(LibFunc::getc_unlocked);
-    TLI.setUnavailable(LibFunc::getitimer);
-    TLI.setUnavailable(LibFunc::getlogin_r);
-    TLI.setUnavailable(LibFunc::getpwnam);
-    TLI.setUnavailable(LibFunc::gettimeofday);
-    TLI.setUnavailable(LibFunc::htonl);
-    TLI.setUnavailable(LibFunc::htons);
-    TLI.setUnavailable(LibFunc::lchown);
-    TLI.setUnavailable(LibFunc::lstat);
-    TLI.setUnavailable(LibFunc::memccpy);
-    TLI.setUnavailable(LibFunc::mkdir);
-    TLI.setUnavailable(LibFunc::ntohl);
-    TLI.setUnavailable(LibFunc::ntohs);
-    TLI.setUnavailable(LibFunc::open);
-    TLI.setUnavailable(LibFunc::opendir);
-    TLI.setUnavailable(LibFunc::pclose);
-    TLI.setUnavailable(LibFunc::popen);
-    TLI.setUnavailable(LibFunc::pread);
-    TLI.setUnavailable(LibFunc::pwrite);
-    TLI.setUnavailable(LibFunc::read);
-    TLI.setUnavailable(LibFunc::readlink);
-    TLI.setUnavailable(LibFunc::realpath);
-    TLI.setUnavailable(LibFunc::rmdir);
-    TLI.setUnavailable(LibFunc::setitimer);
-    TLI.setUnavailable(LibFunc::stat);
-    TLI.setUnavailable(LibFunc::statvfs);
-    TLI.setUnavailable(LibFunc::stpcpy);
-    TLI.setUnavailable(LibFunc::stpncpy);
-    TLI.setUnavailable(LibFunc::strcasecmp);
-    TLI.setUnavailable(LibFunc::strncasecmp);
-    TLI.setUnavailable(LibFunc::times);
-    TLI.setUnavailable(LibFunc::uname);
-    TLI.setUnavailable(LibFunc::unlink);
-    TLI.setUnavailable(LibFunc::unsetenv);
-    TLI.setUnavailable(LibFunc::utime);
-    TLI.setUnavailable(LibFunc::utimes);
-    TLI.setUnavailable(LibFunc::write);
-
-    // Win32 does *not* provide provide these functions, but they are
-    // specified by C99:
-    TLI.setUnavailable(LibFunc::atoll);
-    TLI.setUnavailable(LibFunc::frexpf);
-    TLI.setUnavailable(LibFunc::llabs);
-  }
-
-  switch (T.getOS()) {
-  case Triple::MacOSX:
-    // exp10 and exp10f are not available on OS X until 10.9 and iOS until 7.0
-    // and their names are __exp10 and __exp10f. exp10l is not available on
-    // OS X or iOS.
-    TLI.setUnavailable(LibFunc::exp10l);
-    if (T.isMacOSXVersionLT(10, 9)) {
-      TLI.setUnavailable(LibFunc::exp10);
-      TLI.setUnavailable(LibFunc::exp10f);
-    } else {
-      TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
-      TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
-    }
-    break;
-  case Triple::IOS:
-    TLI.setUnavailable(LibFunc::exp10l);
-    if (T.isOSVersionLT(7, 0)) {
-      TLI.setUnavailable(LibFunc::exp10);
-      TLI.setUnavailable(LibFunc::exp10f);
-    } else {
-      TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
-      TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
-    }
-    break;
-  case Triple::Linux:
-    // exp10, exp10f, exp10l is available on Linux (GLIBC) but are extremely
-    // buggy prior to glibc version 2.18. Until this version is widely deployed
-    // or we have a reasonable detection strategy, we cannot use exp10 reliably
-    // on Linux.
-    //
-    // Fall through to disable all of them.
-  default:
-    TLI.setUnavailable(LibFunc::exp10);
-    TLI.setUnavailable(LibFunc::exp10f);
-    TLI.setUnavailable(LibFunc::exp10l);
-  }
-
-  // ffsl is available on at least Darwin, Mac OS X, iOS, FreeBSD, and
-  // Linux (GLIBC):
-  // http://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man3/ffsl.3.html
-  // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsl.c
-  // http://www.gnu.org/software/gnulib/manual/html_node/ffsl.html
-  switch (T.getOS()) {
-  case Triple::Darwin:
-  case Triple::MacOSX:
-  case Triple::IOS:
-  case Triple::FreeBSD:
-  case Triple::Linux:
-    break;
-  default:
-    TLI.setUnavailable(LibFunc::ffsl);
-  }
-
-  // ffsll is available on at least FreeBSD and Linux (GLIBC):
-  // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsll.c
-  // http://www.gnu.org/software/gnulib/manual/html_node/ffsll.html
-  switch (T.getOS()) {
-  case Triple::FreeBSD:
-  case Triple::Linux:
-    break;
-  default:
-    TLI.setUnavailable(LibFunc::ffsll);
-  }
-
-  // The following functions are available on at least Linux:
-  if (!T.isOSLinux()) {
-    TLI.setUnavailable(LibFunc::dunder_strdup);
-    TLI.setUnavailable(LibFunc::dunder_strtok_r);
-    TLI.setUnavailable(LibFunc::dunder_isoc99_scanf);
-    TLI.setUnavailable(LibFunc::dunder_isoc99_sscanf);
-    TLI.setUnavailable(LibFunc::under_IO_getc);
-    TLI.setUnavailable(LibFunc::under_IO_putc);
-    TLI.setUnavailable(LibFunc::memalign);
-    TLI.setUnavailable(LibFunc::fopen64);
-    TLI.setUnavailable(LibFunc::fseeko64);
-    TLI.setUnavailable(LibFunc::fstat64);
-    TLI.setUnavailable(LibFunc::fstatvfs64);
-    TLI.setUnavailable(LibFunc::ftello64);
-    TLI.setUnavailable(LibFunc::lstat64);
-    TLI.setUnavailable(LibFunc::open64);
-    TLI.setUnavailable(LibFunc::stat64);
-    TLI.setUnavailable(LibFunc::statvfs64);
-    TLI.setUnavailable(LibFunc::tmpfile64);
-  }
-}
-
-
-TargetLibraryInfo::TargetLibraryInfo() : ImmutablePass(ID) {
-  // Default to everything being available.
-  memset(AvailableArray, -1, sizeof(AvailableArray));
-
-  initialize(*this, Triple(), StandardNames);
-}
-
-TargetLibraryInfo::TargetLibraryInfo(const Triple &T) : ImmutablePass(ID) {
-  // Default to everything being available.
-  memset(AvailableArray, -1, sizeof(AvailableArray));
-  
-  initialize(*this, T, StandardNames);
-}
-
-TargetLibraryInfo::TargetLibraryInfo(const TargetLibraryInfo &TLI)
-  : ImmutablePass(ID) {
-  memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray));
-  CustomNames = TLI.CustomNames;
-}
-
-namespace {
-struct StringComparator {
-  /// Compare two strings and return true if LHS is lexicographically less than
-  /// RHS. Requires that RHS doesn't contain any zero bytes.
-  bool operator()(const char *LHS, StringRef RHS) const {
-    // Compare prefixes with strncmp. If prefixes match we know that LHS is
-    // greater or equal to RHS as RHS can't contain any '\0'.
-    return std::strncmp(LHS, RHS.data(), RHS.size()) < 0;
-  }
-
-  // Provided for compatibility with MSVC's debug mode.
-  bool operator()(StringRef LHS, const char *RHS) const { return LHS < RHS; }
-  bool operator()(StringRef LHS, StringRef RHS) const { return LHS < RHS; }
-  bool operator()(const char *LHS, const char *RHS) const {
-    return std::strcmp(LHS, RHS) < 0;
-  }
-};
-}
-
-bool TargetLibraryInfo::getLibFunc(StringRef funcName,
-                                   LibFunc::Func &F) const {
-  const char **Start = &StandardNames[0];
-  const char **End = &StandardNames[LibFunc::NumLibFuncs];
-
-  // Filter out empty names and names containing null bytes, those can't be in
-  // our table.
-  if (funcName.empty() || funcName.find('\0') != StringRef::npos)
-    return false;
-
-  // Check for \01 prefix that is used to mangle __asm declarations and
-  // strip it if present.
-  if (funcName.front() == '\01')
-    funcName = funcName.substr(1);
-  const char **I = std::lower_bound(Start, End, funcName, StringComparator());
-  if (I != End && *I == funcName) {
-    F = (LibFunc::Func)(I - Start);
-    return true;
-  }
-  return false;
-}
-
-/// disableAllFunctions - This disables all builtins, which is used for options
-/// like -fno-builtin.
-void TargetLibraryInfo::disableAllFunctions() {
-  memset(AvailableArray, 0, sizeof(AvailableArray));
-}
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 01139fb..faa6fbe 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -43,7 +43,7 @@ using namespace llvm;
 void TargetLoweringObjectFile::Initialize(MCContext &ctx,
                                           const TargetMachine &TM) {
   Ctx = &ctx;
-  DL = TM.getSubtargetImpl()->getDataLayout();
+  DL = TM.getDataLayout();
   InitMCObjectFileInfo(TM.getTargetTriple(),
                        TM.getRelocationModel(), TM.getCodeModel(), *Ctx);
 }
@@ -200,12 +200,12 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
       // Otherwise, just drop it into a mergable constant section.  If we have
       // a section for this size, use it, otherwise use the arbitrary sized
       // mergable section.
-      switch (TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
-          C->getType())) {
+      switch (TM.getDataLayout()->getTypeAllocSize(C->getType())) {
       case 4:  return SectionKind::getMergeableConst4();
       case 8:  return SectionKind::getMergeableConst8();
       case 16: return SectionKind::getMergeableConst16();
-      default: return SectionKind::getMergeableConst();
+      default:
+        return SectionKind::getReadOnly();
       }
 
     case Constant::LocalRelocation:
@@ -270,11 +270,28 @@ SectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
   return SelectSectionForGlobal(GV, Kind, Mang, TM);
 }
 
-bool TargetLoweringObjectFile::isSectionAtomizableBySymbols(
-    const MCSection &Section) const {
-  return false;
+const MCSection *TargetLoweringObjectFile::getSectionForJumpTable(
+    const Function &F, Mangler &Mang, const TargetMachine &TM) const {
+  return getSectionForConstant(SectionKind::getReadOnly(), /*C=*/nullptr);
 }
 
+bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
+    bool UsesLabelDifference, const Function &F) const {
+  // In PIC mode, we need to emit the jump table to the same section as the
+  // function body itself, otherwise the label differences won't make sense.
+  // FIXME: Need a better predicate for this: what about custom entries?
+  if (UsesLabelDifference)
+    return true;
+
+  // We should also do if the section name is NULL or function is declared
+  // in discardable section
+  // FIXME: this isn't the right predicate, should be based on the MCSection
+  // for the function.
+  if (F.isWeakForLinker())
+    return true;
+
+  return false;
+}
 
 /// getSectionForConstant - Given a mergable constant with the
 /// specified size and relocation information, return a section that it
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 309e1bf..307e93c 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
@@ -21,8 +22,10 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCTargetOptions.h"
 #include "llvm/MC/SectionKind.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -51,10 +54,8 @@ TargetMachine::~TargetMachine() {
 void TargetMachine::resetTargetOptions(const Function &F) const {
 #define RESET_OPTION(X, Y)                                                     \
   do {                                                                         \
-    if (F.hasFnAttribute(Y))                                                  \
-      Options.X = (F.getAttributes()                                          \
-                       .getAttribute(AttributeSet::FunctionIndex, Y)           \
-                       .getValueAsString() == "true");                         \
+    if (F.hasFnAttribute(Y))                                                   \
+      Options.X = (F.getFnAttribute(Y).getValueAsString() == "true");          \
   } while (0)
 
   RESET_OPTION(NoFramePointerElim, "no-frame-pointer-elim");
@@ -145,28 +146,22 @@ void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const {
     CodeGenInfo->setOptLevel(Level);
 }
 
-bool TargetMachine::getAsmVerbosityDefault() const {
-  return Options.MCOptions.AsmVerbose;
+TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis(
+      [this](Function &) { return TargetTransformInfo(getDataLayout()); });
 }
 
-void TargetMachine::setAsmVerbosityDefault(bool V) {
-  Options.MCOptions.AsmVerbose = V;
-}
-
-bool TargetMachine::getFunctionSections() const {
-  return Options.FunctionSections;
-}
+static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo,
+                               const MCSection &Section) {
+  if (!AsmInfo.isSectionAtomizableBySymbols(Section))
+    return true;
 
-bool TargetMachine::getDataSections() const {
-  return Options.DataSections;
-}
-
-void TargetMachine::setFunctionSections(bool V) {
-  Options.FunctionSections = V;
-}
+  // If it is not dead stripped, it is safe to use private labels.
+  const MCSectionMachO &SMO = cast<MCSectionMachO>(Section);
+  if (SMO.hasAttribute(MachO::S_ATTR_NO_DEAD_STRIP))
+    return true;
 
-void TargetMachine::setDataSections(bool V) {
-  Options.DataSections = V;
+  return false;
 }
 
 void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
@@ -179,17 +174,15 @@ void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
     return;
   }
   SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, *this);
-  const TargetLoweringObjectFile &TLOF =
-      getSubtargetImpl()->getTargetLowering()->getObjFileLowering();
-  const MCSection *TheSection = TLOF.SectionForGlobal(GV, GVKind, Mang, *this);
-  bool CannotUsePrivateLabel = TLOF.isSectionAtomizableBySymbols(*TheSection);
+  const TargetLoweringObjectFile *TLOF = getObjFileLowering();
+  const MCSection *TheSection = TLOF->SectionForGlobal(GV, GVKind, Mang, *this);
+  bool CannotUsePrivateLabel = !canUsePrivateLabel(*AsmInfo, *TheSection);
   Mang.getNameWithPrefix(Name, GV, CannotUsePrivateLabel);
 }
 
 MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV, Mangler &Mang) const {
   SmallString<60> NameStr;
   getNameWithPrefix(NameStr, GV, Mang);
-  const TargetLoweringObjectFile &TLOF =
-      getSubtargetImpl()->getTargetLowering()->getObjFileLowering();
-  return TLOF.getContext().GetOrCreateSymbol(NameStr.str());
+  const TargetLoweringObjectFile *TLOF = getObjFileLowering();
+  return TLOF->getContext().GetOrCreateSymbol(NameStr.str());
 }
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index b3e07df..c7838a9 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -14,9 +14,10 @@
 #include "llvm-c/TargetMachine.h"
 #include "llvm-c/Core.h"
 #include "llvm-c/Target.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FormattedStream.h"
@@ -173,12 +174,12 @@ char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) {
 }
 
 LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) {
-  return wrap(unwrap(T)->getSubtargetImpl()->getDataLayout());
+  return wrap(unwrap(T)->getDataLayout());
 }
 
 void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T,
                                       LLVMBool VerboseAsm) {
-  unwrap(T)->setAsmVerbosityDefault(VerboseAsm);
+  unwrap(T)->Options.MCOptions.AsmVerbose = VerboseAsm;
 }
 
 static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
@@ -186,11 +187,11 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
   TargetMachine* TM = unwrap(T);
   Module* Mod = unwrap(M);
 
-  PassManager pass;
+  legacy::PassManager pass;
 
   std::string error;
 
-  const DataLayout *td = TM->getSubtargetImpl()->getDataLayout();
+  const DataLayout *td = TM->getDataLayout();
 
   if (!td) {
     error = "No DataLayout in TargetMachine";
@@ -255,5 +256,6 @@ char *LLVMGetDefaultTargetTriple(void) {
 }
 
 void LLVMAddAnalysisPasses(LLVMTargetMachineRef T, LLVMPassManagerRef PM) {
-  unwrap(T)->addAnalysisPasses(*unwrap(PM));
+  unwrap(PM)->add(
+      createTargetTransformInfoWrapperPass(unwrap(T)->getTargetIRAnalysis()));
 }
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk
index 861a41d..08646d0 100644
--- a/lib/Target/X86/Android.mk
+++ b/lib/Target/X86/Android.mk
@@ -12,6 +12,7 @@ x86_codegen_TBLGEN_TABLES := \
 
 x86_codegen_SRC_FILES := \
   X86AsmPrinter.cpp \
+  X86CallFrameOptimization.cpp \
   X86FastISel.cpp \
   X86FixupLEAs.cpp \
   X86FloatingPoint.cpp \
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index 9c49a11..543af8e 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -165,9 +165,9 @@ public:
     }
 
     unsigned ChooseFrameReg(MVT::SimpleValueType VT) const {
-      static const unsigned Candidates[] = { X86::RBP, X86::RAX, X86::RBX,
-                                             X86::RCX, X86::RDX, X86::RDI,
-                                             X86::RSI };
+      static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX,
+                                              X86::RCX, X86::RDX, X86::RDI,
+                                              X86::RSI };
       for (unsigned Reg : Candidates) {
         if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg))
           return convReg(Reg, VT);
@@ -261,6 +261,23 @@ protected:
                                               int64_t Displacement,
                                               MCContext &Ctx, int64_t *Residue);
 
+  bool is64BitMode() const {
+    return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
+  }
+  bool is32BitMode() const {
+    return (STI.getFeatureBits() & X86::Mode32Bit) != 0;
+  }
+  bool is16BitMode() const {
+    return (STI.getFeatureBits() & X86::Mode16Bit) != 0;
+  }
+
+  unsigned getPointerWidth() {
+    if (is16BitMode()) return 16;
+    if (is32BitMode()) return 32;
+    if (is64BitMode()) return 64;
+    llvm_unreachable("invalid mode");
+  }
+
   // True when previous instruction was actually REP prefix.
   bool RepPrefix;
 
@@ -301,7 +318,7 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
   {
     const MCExpr *Disp = MCConstantExpr::Create(0, Ctx);
     std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
-        0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc()));
+        getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc()));
     InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
                          Out);
   }
@@ -310,7 +327,8 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
   {
     const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx);
     std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
-        0, Disp, SrcReg, CntReg, AccessSize, SMLoc(), SMLoc()));
+        getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(),
+        SMLoc()));
     InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
                          Out);
   }
@@ -319,7 +337,7 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
   {
     const MCExpr *Disp = MCConstantExpr::Create(0, Ctx);
     std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
-        0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc()));
+        getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc()));
     InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
   }
 
@@ -327,7 +345,8 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
   {
     const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx);
     std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
-        0, Disp, DstReg, CntReg, AccessSize, SMLoc(), SMLoc()));
+        getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(),
+        SMLoc()));
     InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
   }
 
@@ -445,7 +464,8 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op,
     const MCConstantExpr *Disp =
         MCConstantExpr::Create(ApplyDisplacementBounds(Residue), Ctx);
     std::unique_ptr<X86Operand> DispOp =
-        X86Operand::CreateMem(0, Disp, Reg, 0, 1, SMLoc(), SMLoc());
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(),
+                              SMLoc());
     EmitLEA(*DispOp, VT, Reg, Out);
     Residue -= Disp->getValue();
   }
@@ -459,9 +479,10 @@ X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement,
   if (Displacement == 0 ||
       (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) {
     *Residue = Displacement;
-    return X86Operand::CreateMem(Op.getMemSegReg(), Op.getMemDisp(),
-                                 Op.getMemBaseReg(), Op.getMemIndexReg(),
-                                 Op.getMemScale(), SMLoc(), SMLoc());
+    return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(),
+                                 Op.getMemDisp(), Op.getMemBaseReg(),
+                                 Op.getMemIndexReg(), Op.getMemScale(),
+                                 SMLoc(), SMLoc());
   }
 
   int64_t OrigDisplacement =
@@ -474,9 +495,9 @@ X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement,
 
   *Residue = Displacement - NewDisplacement;
   const MCExpr *Disp = MCConstantExpr::Create(NewDisplacement, Ctx);
-  return X86Operand::CreateMem(Op.getMemSegReg(), Disp, Op.getMemBaseReg(),
-                               Op.getMemIndexReg(), Op.getMemScale(), SMLoc(),
-                               SMLoc());
+  return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp,
+                               Op.getMemBaseReg(), Op.getMemIndexReg(),
+                               Op.getMemScale(), SMLoc(), SMLoc());
 }
 
 class X86AddressSanitizer32 : public X86AddressSanitizer {
@@ -625,7 +646,8 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall(
     Inst.addOperand(MCOperand::CreateReg(ShadowRegI8));
     const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
     std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, ShadowRegI32, 0, 1, SMLoc(), SMLoc()));
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
+                              SMLoc(), SMLoc()));
     Op->addMemOperands(Inst, 5);
     EmitInstruction(Out, Inst);
   }
@@ -634,7 +656,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall(
       Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
   MCSymbol *DoneSym = Ctx.CreateTempSymbol();
   const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
-  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
 
   EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
                            AddressRegI32));
@@ -644,12 +666,14 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall(
                            .addImm(7));
 
   switch (AccessSize) {
+  default: llvm_unreachable("Incorrect access size");
   case 1:
     break;
   case 2: {
     const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
     std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc()));
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
+                              SMLoc(), SMLoc()));
     EmitLEA(*Op, MVT::i32, ScratchRegI32, Out);
     break;
   }
@@ -659,9 +683,6 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall(
                              .addReg(ScratchRegI32)
                              .addImm(3));
     break;
-  default:
-    assert(false && "Incorrect access size");
-    break;
   }
 
   EmitInstruction(
@@ -669,7 +690,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall(
       MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
   EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
                            ShadowRegI32));
-  EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr));
+  EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
 
   EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
   EmitLabel(Out, DoneSym);
@@ -692,26 +713,25 @@ void X86AddressSanitizer32::InstrumentMemOperandLarge(
   {
     MCInst Inst;
     switch (AccessSize) {
+    default: llvm_unreachable("Incorrect access size");
     case 8:
       Inst.setOpcode(X86::CMP8mi);
       break;
     case 16:
       Inst.setOpcode(X86::CMP16mi);
       break;
-    default:
-      assert(false && "Incorrect access size");
-      break;
     }
     const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
     std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, ShadowRegI32, 0, 1, SMLoc(), SMLoc()));
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
+                              SMLoc(), SMLoc()));
     Op->addMemOperands(Inst, 5);
     Inst.addOperand(MCOperand::CreateImm(0));
     EmitInstruction(Out, Inst);
   }
   MCSymbol *DoneSym = Ctx.CreateTempSymbol();
   const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
-  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
 
   EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
   EmitLabel(Out, DoneSym);
@@ -727,7 +747,7 @@ void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize,
   const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
   EmitInstruction(
       Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX));
-  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
 
   // Instrument first and last elements in src and dst range.
   InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */,
@@ -843,7 +863,8 @@ private:
   void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
     const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx);
     std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1,
+                              SMLoc(), SMLoc()));
     EmitLEA(*Op, MVT::i64, X86::RSP, Out);
     OrigSPOffset += Offset;
   }
@@ -896,7 +917,8 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall(
     Inst.addOperand(MCOperand::CreateReg(ShadowRegI8));
     const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
     std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, ShadowRegI64, 0, 1, SMLoc(), SMLoc()));
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
+                              SMLoc(), SMLoc()));
     Op->addMemOperands(Inst, 5);
     EmitInstruction(Out, Inst);
   }
@@ -905,7 +927,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall(
       Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
   MCSymbol *DoneSym = Ctx.CreateTempSymbol();
   const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
-  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
 
   EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
                            AddressRegI32));
@@ -915,12 +937,14 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall(
                            .addImm(7));
 
   switch (AccessSize) {
+  default: llvm_unreachable("Incorrect access size");
   case 1:
     break;
   case 2: {
     const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
     std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc()));
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
+                              SMLoc(), SMLoc()));
     EmitLEA(*Op, MVT::i32, ScratchRegI32, Out);
     break;
   }
@@ -930,9 +954,6 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall(
                              .addReg(ScratchRegI32)
                              .addImm(3));
     break;
-  default:
-    assert(false && "Incorrect access size");
-    break;
   }
 
   EmitInstruction(
@@ -940,7 +961,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall(
       MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
   EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
                            ShadowRegI32));
-  EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr));
+  EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
 
   EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
   EmitLabel(Out, DoneSym);
@@ -963,19 +984,18 @@ void X86AddressSanitizer64::InstrumentMemOperandLarge(
   {
     MCInst Inst;
     switch (AccessSize) {
+    default: llvm_unreachable("Incorrect access size");
     case 8:
       Inst.setOpcode(X86::CMP8mi);
       break;
     case 16:
       Inst.setOpcode(X86::CMP16mi);
       break;
-    default:
-      assert(false && "Incorrect access size");
-      break;
     }
     const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
     std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, ShadowRegI64, 0, 1, SMLoc(), SMLoc()));
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
+                              SMLoc(), SMLoc()));
     Op->addMemOperands(Inst, 5);
     Inst.addOperand(MCOperand::CreateImm(0));
     EmitInstruction(Out, Inst);
@@ -983,7 +1003,7 @@ void X86AddressSanitizer64::InstrumentMemOperandLarge(
 
   MCSymbol *DoneSym = Ctx.CreateTempSymbol();
   const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
-  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
 
   EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
   EmitLabel(Out, DoneSym);
@@ -999,7 +1019,7 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
   const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
   EmitInstruction(
       Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX));
-  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
 
   // Instrument first and last elements in src and dst range.
   InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */,
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 8ef2a55..0b6fb52 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -86,7 +86,7 @@ private:
     typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
     SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
     SmallVector<ICToken, 4> PostfixStack;
-    
+
   public:
     int64_t popOperand() {
       assert (!PostfixStack.empty() && "Poped an empty stack!");
@@ -100,7 +100,7 @@ private:
               "Unexpected operand!");
       PostfixStack.push_back(std::make_pair(Op, Val));
     }
-    
+
     void popOperator() { InfixOperatorStack.pop_back(); }
     void pushOperator(InfixCalculatorTok Op) {
       // Push the new operator if the stack is empty.
@@ -108,7 +108,7 @@ private:
         InfixOperatorStack.push_back(Op);
         return;
       }
-      
+
       // Push the new operator if it has a higher precedence than the operator
       // on the top of the stack or the operator on the top of the stack is a
       // left parentheses.
@@ -118,7 +118,7 @@ private:
         InfixOperatorStack.push_back(Op);
         return;
       }
-      
+
       // The operator on the top of the stack has higher precedence than the
       // new operator.
       unsigned ParenCount = 0;
@@ -126,17 +126,17 @@ private:
         // Nothing to process.
         if (InfixOperatorStack.empty())
           break;
-        
+
         Idx = InfixOperatorStack.size() - 1;
         StackOp = InfixOperatorStack[Idx];
         if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount))
           break;
-        
+
         // If we have an even parentheses count and we see a left parentheses,
         // then stop processing.
         if (!ParenCount && StackOp == IC_LPAREN)
           break;
-        
+
         if (StackOp == IC_RPAREN) {
           ++ParenCount;
           InfixOperatorStack.pop_back();
@@ -158,10 +158,10 @@ private:
         if (StackOp != IC_LPAREN && StackOp != IC_RPAREN)
           PostfixStack.push_back(std::make_pair(StackOp, 0));
       }
-      
+
       if (PostfixStack.empty())
         return 0;
-      
+
       SmallVector<ICToken, 16> OperandStack;
       for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) {
         ICToken Op = PostfixStack[i];
@@ -263,7 +263,7 @@ private:
       State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
       Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac),
       AddImmPrefix(addimmprefix) { Info.clear(); }
-    
+
     unsigned getBaseReg() { return BaseReg; }
     unsigned getIndexReg() { return IndexReg; }
     unsigned getScale() { return Scale; }
@@ -684,6 +684,7 @@ private:
   bool ParseDirectiveWord(unsigned Size, SMLoc L);
   bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
 
+  bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
   bool processInstruction(MCInst &Inst, const OperandVector &Ops);
 
   /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
@@ -711,13 +712,6 @@ private:
                                     uint64_t &ErrorInfo,
                                     bool MatchingInlineAsm);
 
-  unsigned getPointerSize() {
-    if (is16BitMode()) return 16;
-    if (is32BitMode()) return 32;
-    if (is64BitMode()) return 64;
-    llvm_unreachable("invalid mode");
-  }
-
   bool OmitRegisterFromClobberLists(unsigned RegNo) override;
 
   /// doSrcDstMatch - Returns true if operands are matching in their
@@ -977,16 +971,18 @@ std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
   unsigned basereg =
     is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI);
   const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
-  return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/basereg,
-                               /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0);
+  return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+                               /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1,
+                               Loc, Loc, 0);
 }
 
 std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
   unsigned basereg =
     is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI);
   const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
-  return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/basereg,
-                               /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0);
+  return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+                               /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1,
+                               Loc, Loc, 0);
 }
 
 std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
@@ -1027,8 +1023,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
 
     // Create an absolute memory reference in order to match against
     // instructions taking a PC relative operand.
-    return X86Operand::CreateMem(Disp, Start, End, Size, Identifier,
-                                 Info.OpDecl);
+    return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size,
+                                 Identifier, Info.OpDecl);
   }
 
   // We either have a direct symbol reference, or an offset from a symbol.  The
@@ -1050,8 +1046,9 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
   // if we don't know the actual value at this time.  This is necessary to
   // get the matching correct in some cases.
   BaseReg = BaseReg ? BaseReg : 1;
-  return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
-                               End, Size, Identifier, Info.OpDecl);
+  return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+                               IndexReg, Scale, Start, End, Size, Identifier,
+                               Info.OpDecl);
 }
 
 static void
@@ -1103,7 +1100,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
       (*I).Kind = AOK_Delete;
   }
   const char *SymLocPtr = SymName.data();
-  // Skip everything before the symbol.        
+  // Skip everything before the symbol.
   if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) {
     assert(Len > 0 && "Expected a non-negative length.");
     AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len));
@@ -1128,7 +1125,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
     // identifier.  Don't try an parse it as a register.
     if (Tok.getString().startswith("."))
       break;
-    
+
     // If we're parsing an immediate expression, we don't expect a '['.
     if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
       break;
@@ -1194,7 +1191,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
           MCSymbol *Sym =
               getContext().GetDirectionalLocalSymbol(IntVal, IDVal == "b");
           MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
-          const MCExpr *Val = 
+          const MCExpr *Val =
 	    MCSymbolRefExpr::Create(Sym, Variant, getContext());
           if (IDVal == "b" && Sym->isUndefined())
             return Error(Loc, "invalid reference to undefined symbol");
@@ -1279,7 +1276,7 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
     const MCExpr *NewDisp;
     if (ParseIntelDotOperator(Disp, NewDisp))
       return nullptr;
-    
+
     End = Tok.getEndLoc();
     Parser.Lex();  // Eat the field.
     Disp = NewDisp;
@@ -1292,17 +1289,17 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
     // handle [-42]
     if (!BaseReg && !IndexReg) {
       if (!SegReg)
-        return X86Operand::CreateMem(Disp, Start, End, Size);
-      else
-        return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size);
+        return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size);
+      return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+                                   Start, End, Size);
     }
     StringRef ErrMsg;
     if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
       Error(StartInBrac, ErrMsg);
       return nullptr;
     }
-    return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
-                                 End, Size);
+    return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+                                 IndexReg, Scale, Start, End, Size);
   }
 
   InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
@@ -1383,9 +1380,9 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
       // be followed by a bracketed expression.  If it isn't we know we have our
       // final segment override.
       const MCExpr *Disp = MCConstantExpr::Create(ImmDisp, getContext());
-      return X86Operand::CreateMem(SegReg, Disp, /*BaseReg=*/0, /*IndexReg=*/0,
-                                   /*Scale=*/1, Start, ImmDispToken.getEndLoc(),
-                                   Size);
+      return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
+                                   /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1,
+                                   Start, ImmDispToken.getEndLoc(), Size);
     }
   }
 
@@ -1398,7 +1395,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
     if (getParser().parsePrimaryExpr(Val, End))
       return ErrorOperand(Tok.getLoc(), "unknown token in expression");
 
-    return X86Operand::CreateMem(Val, Start, End, Size);
+    return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size);
   }
 
   InlineAsmIdentifierInfo Info;
@@ -1428,7 +1425,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp,
     if (getParser().parsePrimaryExpr(Val, End))
       return ErrorOperand(Tok.getLoc(), "unknown token in expression");
 
-    return X86Operand::CreateMem(Val, Start, End, Size);
+    return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size);
   }
 
   InlineAsmIdentifierInfo Info;
@@ -1466,9 +1463,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp,
   // BaseReg is non-zero to avoid assertions.  In the context of inline asm,
   // we're pointing to a local variable in memory, so the base register is
   // really the frame or stack pointer.
-  return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/1, /*IndexReg=*/0,
-                               /*Scale=*/1, Start, End, Size, Identifier,
-                               Info.OpDecl);
+  return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+                               /*BaseReg=*/1, /*IndexReg=*/0, /*Scale=*/1,
+                               Start, End, Size, Identifier, Info.OpDecl);
 }
 
 /// Parse the '.' operator.
@@ -1643,7 +1640,8 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
       // to the MCExpr with the directional local symbol and this is a
       // memory operand not an immediate operand.
       if (SM.getSym())
-        return X86Operand::CreateMem(SM.getSym(), Start, End, Size);
+        return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End,
+                                     Size);
 
       const MCExpr *ImmExpr = MCConstantExpr::Create(Imm, getContext());
       return X86Operand::CreateImm(ImmExpr, Start, End);
@@ -1802,8 +1800,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
     if (getLexer().isNot(AsmToken::LParen)) {
       // Unless we have a segment register, treat this as an immediate.
       if (SegReg == 0)
-        return X86Operand::CreateMem(Disp, MemStart, ExprEnd);
-      return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
+        return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, ExprEnd);
+      return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+                                   MemStart, ExprEnd);
     }
 
     // Eat the '('.
@@ -1829,8 +1828,10 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
       if (getLexer().isNot(AsmToken::LParen)) {
         // Unless we have a segment register, treat this as an immediate.
         if (SegReg == 0)
-          return X86Operand::CreateMem(Disp, LParenLoc, ExprEnd);
-        return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
+          return X86Operand::CreateMem(getPointerWidth(), Disp, LParenLoc,
+                                       ExprEnd);
+        return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+                                     MemStart, ExprEnd);
       }
 
       // Eat the '('.
@@ -1946,9 +1947,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
   }
 
   if (SegReg || BaseReg || IndexReg)
-    return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
-                                 MemStart, MemEnd);
-  return X86Operand::CreateMem(Disp, MemStart, MemEnd);
+    return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+                                 IndexReg, Scale, MemStart, MemEnd);
+  return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, MemEnd);
 }
 
 bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
@@ -1963,14 +1964,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     PatchedName = PatchedName.substr(0, Name.size()-1);
 
   // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
-  const MCExpr *ExtraImmOp = nullptr;
   if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
       (PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
        PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
     bool IsVCMP = PatchedName[0] == 'v';
-    unsigned SSECCIdx = IsVCMP ? 4 : 3;
-    unsigned SSEComparisonCode = StringSwitch<unsigned>(
-      PatchedName.slice(SSECCIdx, PatchedName.size() - 2))
+    unsigned CCIdx = IsVCMP ? 4 : 3;
+    unsigned ComparisonCode = StringSwitch<unsigned>(
+      PatchedName.slice(CCIdx, PatchedName.size() - 2))
       .Case("eq",       0x00)
       .Case("lt",       0x01)
       .Case("le",       0x02)
@@ -2005,27 +2005,75 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
       .Case("gt_oq",    0x1E)
       .Case("true_us",  0x1F)
       .Default(~0U);
-    if (SSEComparisonCode != ~0U && (IsVCMP || SSEComparisonCode < 8)) {
-      ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode,
-                                          getParser().getContext());
-      if (PatchedName.endswith("ss")) {
-        PatchedName = IsVCMP ? "vcmpss" : "cmpss";
-      } else if (PatchedName.endswith("sd")) {
-        PatchedName = IsVCMP ? "vcmpsd" : "cmpsd";
-      } else if (PatchedName.endswith("ps")) {
-        PatchedName = IsVCMP ? "vcmpps" : "cmpps";
-      } else {
-        assert(PatchedName.endswith("pd") && "Unexpected mnemonic!");
-        PatchedName = IsVCMP ? "vcmppd" : "cmppd";
-      }
+    if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) {
+
+      Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx),
+                                                 NameLoc));
+
+      const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode,
+                                                   getParser().getContext());
+      Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+      PatchedName = PatchedName.substr(PatchedName.size() - 2);
+    }
+  }
+
+  // FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}.
+  if (PatchedName.startswith("vpcmp") &&
+      (PatchedName.endswith("b") || PatchedName.endswith("w") ||
+       PatchedName.endswith("d") || PatchedName.endswith("q"))) {
+    unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+    unsigned ComparisonCode = StringSwitch<unsigned>(
+      PatchedName.slice(5, PatchedName.size() - CCIdx))
+      .Case("eq",    0x0) // Only allowed on unsigned. Checked below.
+      .Case("lt",    0x1)
+      .Case("le",    0x2)
+      //.Case("false", 0x3) // Not a documented alias.
+      .Case("neq",   0x4)
+      .Case("nlt",   0x5)
+      .Case("nle",   0x6)
+      //.Case("true",  0x7) // Not a documented alias.
+      .Default(~0U);
+    if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) {
+      Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc));
+
+      const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode,
+                                                   getParser().getContext());
+      Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+      PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
+    }
+  }
+
+  // FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}.
+  if (PatchedName.startswith("vpcom") &&
+      (PatchedName.endswith("b") || PatchedName.endswith("w") ||
+       PatchedName.endswith("d") || PatchedName.endswith("q"))) {
+    unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+    unsigned ComparisonCode = StringSwitch<unsigned>(
+      PatchedName.slice(5, PatchedName.size() - CCIdx))
+      .Case("lt",    0x0)
+      .Case("le",    0x1)
+      .Case("gt",    0x2)
+      .Case("ge",    0x3)
+      .Case("eq",    0x4)
+      .Case("neq",   0x5)
+      .Case("false", 0x6)
+      .Case("true",  0x7)
+      .Default(~0U);
+    if (ComparisonCode != ~0U) {
+      Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc));
+
+      const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode,
+                                                   getParser().getContext());
+      Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+      PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
     }
   }
 
   Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
 
-  if (ExtraImmOp && !isParsingIntelSyntax())
-    Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
-
   // Determine whether this is an instruction prefix.
   bool isPrefix =
     Name == "lock" || Name == "rep" ||
@@ -2071,9 +2119,6 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
       (isPrefix && getLexer().is(AsmToken::Slash)))
     Parser.Lex();
 
-  if (ExtraImmOp && isParsingIntelSyntax())
-    Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
-
   // This is a terrible hack to handle "out[bwl]? %al, (%dx)" ->
   // "outb %al, %dx".  Out doesn't take a memory form, but this is a widely
   // documented form in various unofficial manuals, so a lot of code uses it.
@@ -2272,6 +2317,22 @@ static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode,
   return convertToSExti8(Inst, Opcode, X86::RAX, isCmp);
 }
 
+bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
+  switch (Inst.getOpcode()) {
+  default: return true;
+  case X86::INT:
+    X86Operand &Op = static_cast<X86Operand &>(*Ops[1]);
+    assert(Op.isImm() && "expected immediate");
+    int64_t Res;
+    if (!Op.getImm()->EvaluateAsAbsolute(Res) || Res > 255) {
+      Error(Op.getStartLoc(), "interrupt vector must be in range [0-255]");
+      return false;
+    }
+    return true;
+  }
+  llvm_unreachable("handle the instruction appropriately");
+}
+
 bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
   switch (Inst.getOpcode()) {
   default: return false;
@@ -2432,8 +2493,11 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
   switch (MatchInstructionImpl(Operands, Inst,
                                ErrorInfo, MatchingInlineAsm,
                                isParsingIntelSyntax())) {
-  default: break;
+  default: llvm_unreachable("Unexpected match result!");
   case Match_Success:
+    if (!validateInstruction(Inst, Operands))
+      return true;
+
     // Some instructions need post-processing to, for example, tweak which
     // encoding is selected. Loop on it while changes happen so the
     // individual transformations can chain off each other.
@@ -2614,7 +2678,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
     static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"};
     for (const char *Instr : PtrSizedInstrs) {
       if (Mnemonic == Instr) {
-        UnsizedMemOp->Mem.Size = getPointerSize();
+        UnsizedMemOp->Mem.Size = getPointerWidth();
         break;
       }
     }
@@ -2626,7 +2690,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
   SmallVector<unsigned, 8> Match;
   uint64_t ErrorInfoMissingFeature = 0;
   if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) {
-    static const unsigned MopSizes[] = {8, 16, 32, 64, 80};
+    static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512};
     for (unsigned Size : MopSizes) {
       UnsizedMemOp->Mem.Size = Size;
       uint64_t ErrorInfoIgnore;
@@ -2648,7 +2712,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
   }
 
   // If we haven't matched anything yet, this is not a basic integer or FPU
-  // operation.  There shouldn't be any ambiguity in our mneumonic table, so try
+  // operation.  There shouldn't be any ambiguity in our mnemonic table, so try
   // matching with the unsized operand.
   if (Match.empty()) {
     Match.push_back(MatchInstructionImpl(Operands, Inst, ErrorInfo,
@@ -2677,6 +2741,9 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
   unsigned NumSuccessfulMatches =
       std::count(std::begin(Match), std::end(Match), Match_Success);
   if (NumSuccessfulMatches == 1) {
+    if (!validateInstruction(Inst, Operands))
+      return true;
+
     // Some instructions need post-processing to, for example, tweak which
     // encoding is selected. Loop on it while changes happen so the individual
     // transformations can chain off each other.
diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
index 72aeeaa..7610806 100644
--- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h
+++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -34,6 +34,11 @@ inline bool isImmSExti64i32Value(uint64_t Value) {
           (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
 }
 
+inline bool isImmUnsignedi8Value(uint64_t Value) {
+  return ((                                  Value <= 0x00000000000000FFULL)||
+          (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
 } // End of namespace llvm
 
 #endif
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index e0fab8d..d67e119 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -53,6 +53,7 @@ struct X86Operand : public MCParsedAsmOperand {
     unsigned IndexReg;
     unsigned Scale;
     unsigned Size;
+    unsigned ModeSize;
   };
 
   union {
@@ -120,6 +121,10 @@ struct X86Operand : public MCParsedAsmOperand {
     assert(Kind == Memory && "Invalid access!");
     return Mem.Scale;
   }
+  unsigned getMemModeSize() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.ModeSize;
+  }
 
   bool isToken() const override {return Kind == Token; }
 
@@ -182,6 +187,13 @@ struct X86Operand : public MCParsedAsmOperand {
     return isImmSExti64i32Value(CE->getValue());
   }
 
+  bool isImmUnsignedi8() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    return isImmUnsignedi8Value(CE->getValue());
+  }
+
   bool isOffsetOf() const override {
     return OffsetOfLoc.getPointer();
   }
@@ -249,6 +261,10 @@ struct X86Operand : public MCParsedAsmOperand {
       !getMemIndexReg() && getMemScale() == 1;
   }
 
+  bool isAbsMem16() const {
+    return isAbsMem() && Mem.ModeSize == 16;
+  }
+
   bool isSrcIdx() const {
     return !getMemIndexReg() && getMemScale() == 1 &&
       (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI ||
@@ -288,21 +304,43 @@ struct X86Operand : public MCParsedAsmOperand {
     return isMem64() && isDstIdx();
   }
 
-  bool isMemOffs8() const {
-    return Kind == Memory && !getMemBaseReg() &&
-      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 8);
+  bool isMemOffs() const {
+    return Kind == Memory && !getMemBaseReg() && !getMemIndexReg() &&
+      getMemScale() == 1;
+  }
+
+  bool isMemOffs16_8() const {
+    return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMemOffs16_16() const {
+    return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 16);
   }
-  bool isMemOffs16() const {
-    return Kind == Memory && !getMemBaseReg() &&
-      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 16);
+  bool isMemOffs16_32() const {
+    return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 32);
   }
-  bool isMemOffs32() const {
-    return Kind == Memory && !getMemBaseReg() &&
-      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 32);
+  bool isMemOffs32_8() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 8);
   }
-  bool isMemOffs64() const {
-    return Kind == Memory && !getMemBaseReg() &&
-      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 64);
+  bool isMemOffs32_16() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMemOffs32_32() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMemOffs32_64() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 64);
+  }
+  bool isMemOffs64_8() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMemOffs64_16() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMemOffs64_32() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMemOffs64_64() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 64);
   }
 
   bool isReg() const override { return Kind == Register; }
@@ -430,8 +468,9 @@ struct X86Operand : public MCParsedAsmOperand {
 
   /// Create an absolute memory operand.
   static std::unique_ptr<X86Operand>
-  CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0,
-            StringRef SymName = StringRef(), void *OpDecl = nullptr) {
+  CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
+            unsigned Size = 0, StringRef SymName = StringRef(),
+            void *OpDecl = nullptr) {
     auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
     Res->Mem.SegReg   = 0;
     Res->Mem.Disp     = Disp;
@@ -439,6 +478,7 @@ struct X86Operand : public MCParsedAsmOperand {
     Res->Mem.IndexReg = 0;
     Res->Mem.Scale    = 1;
     Res->Mem.Size     = Size;
+    Res->Mem.ModeSize = ModeSize;
     Res->SymName      = SymName;
     Res->OpDecl       = OpDecl;
     Res->AddressOf    = false;
@@ -447,9 +487,9 @@ struct X86Operand : public MCParsedAsmOperand {
 
   /// Create a generalized memory operand.
   static std::unique_ptr<X86Operand>
-  CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
-            unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
-            unsigned Size = 0, StringRef SymName = StringRef(),
+  CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp,
+            unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc,
+            SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(),
             void *OpDecl = nullptr) {
     // We should never just have a displacement, that should be parsed as an
     // absolute memory operand.
@@ -465,6 +505,7 @@ struct X86Operand : public MCParsedAsmOperand {
     Res->Mem.IndexReg = IndexReg;
     Res->Mem.Scale    = Scale;
     Res->Mem.Size     = Size;
+    Res->Mem.ModeSize = ModeSize;
     Res->SymName      = SymName;
     Res->OpDecl       = OpDecl;
     Res->AddressOf    = false;
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 1083fad..be61b47 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTableGen)
 
 set(sources
   X86AsmPrinter.cpp
+  X86CallFrameOptimization.cpp
   X86FastISel.cpp
   X86FloatingPoint.cpp
   X86FrameLowering.cpp
@@ -38,7 +39,7 @@ if( CMAKE_CL_64 )
   ADD_CUSTOM_COMMAND(
     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj
     MAIN_DEPENDENCY X86CompilationCallback_Win64.asm
-    COMMAND ${CMAKE_ASM_MASM_COMPILER} /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm
+    COMMAND ${CMAKE_ASM_MASM_COMPILER} /nologo /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm
    )
    set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj)
 endif()
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 5e8c2d6..99fb1ab 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -51,8 +51,8 @@ const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode,
 
 #define debug(s) DEBUG(Debug(__FILE__, __LINE__, s));
 
-namespace llvm {  
-  
+namespace llvm {
+
 // Fill-ins to make the compiler happy.  These constants are never actually
 //   assigned; they are just filler to make an automatically-generated switch
 //   statement work.
@@ -127,11 +127,11 @@ static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) {
 static void logger(void* arg, const char* log) {
   if (!arg)
     return;
-  
+
   raw_ostream &vStream = *(static_cast<raw_ostream*>(arg));
   vStream << log << "\n";
-}  
-  
+}
+
 //
 // Public interface for the disassembler
 //
@@ -184,7 +184,7 @@ static void translateRegister(MCInst &mcInst, Reg reg) {
 }
 
 /// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
-/// immediate Value in the MCInst. 
+/// immediate Value in the MCInst.
 ///
 /// @param Value      - The immediate Value, has had any PC adjustment made by
 ///                     the caller.
@@ -196,7 +196,7 @@ static void translateRegister(MCInst &mcInst, Reg reg) {
 /// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
 /// called then that function is called to get any symbolic information for the
 /// immediate in the instruction using the Address, Offset and Width.  If that
-/// returns non-zero then the symbolic information it returns is used to create 
+/// returns non-zero then the symbolic information it returns is used to create
 /// an MCExpr and that is added as an operand to the MCInst.  If getOpInfo()
 /// returns zero and isBranch is true then a symbol look up for immediate Value
 /// is done and if a symbol is found an MCExpr is created with that, else
@@ -204,8 +204,8 @@ static void translateRegister(MCInst &mcInst, Reg reg) {
 /// if it adds an operand to the MCInst and false otherwise.
 static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
                                      uint64_t Address, uint64_t Offset,
-                                     uint64_t Width, MCInst &MI, 
-                                     const MCDisassembler *Dis) {  
+                                     uint64_t Width, MCInst &MI,
+                                     const MCDisassembler *Dis) {
   return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
                                        Offset, Width);
 }
@@ -215,7 +215,7 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
 /// These can often be addresses in a literal pool.  The Address of the
 /// instruction and its immediate Value are used to determine the address
 /// being referenced in the literal pool entry.  The SymbolLookUp call back will
-/// return a pointer to a literal 'C' string if the referenced address is an 
+/// return a pointer to a literal 'C' string if the referenced address is an
 /// address into a section with 'C' string literals.
 static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
                                             const void *Decoder) {
@@ -287,7 +287,7 @@ static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) {
 static void translateImmediate(MCInst &mcInst, uint64_t immediate,
                                const OperandSpecifier &operand,
                                InternalInstruction &insn,
-                               const MCDisassembler *Dis) {  
+                               const MCDisassembler *Dis) {
   // Sign-extend the immediate if necessary.
 
   OperandType type = (OperandType)operand.type;
@@ -320,24 +320,12 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
   // By default sign-extend all X86 immediates based on their encoding.
   else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
            type == TYPE_IMM64 || type == TYPE_IMMv) {
-    uint32_t Opcode = mcInst.getOpcode();
     switch (operand.encoding) {
     default:
       break;
     case ENCODING_IB:
-      // Special case those X86 instructions that use the imm8 as a set of
-      // bits, bit count, etc. and are not sign-extend.
-      if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri &&
-          Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri &&
-          Opcode != X86::DPPSrri && Opcode != X86::DPPDrri &&
-          Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri &&
-          Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri &&
-          Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri &&
-          Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
-          Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
-          Opcode != X86::VINSERTPSrr)
-        if(immediate & 0x80)
-          immediate |= ~(0xffull);
+      if(immediate & 0x80)
+        immediate |= ~(0xffull);
       break;
     case ENCODING_IW:
       if(immediate & 0x8000)
@@ -350,6 +338,199 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
     case ENCODING_IO:
       break;
     }
+  } else if (type == TYPE_IMM3) {
+    // Check for immediates that printSSECC can't handle.
+    if (immediate >= 8) {
+      unsigned NewOpc;
+      switch (mcInst.getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case X86::CMPPDrmi:  NewOpc = X86::CMPPDrmi_alt;  break;
+      case X86::CMPPDrri:  NewOpc = X86::CMPPDrri_alt;  break;
+      case X86::CMPPSrmi:  NewOpc = X86::CMPPSrmi_alt;  break;
+      case X86::CMPPSrri:  NewOpc = X86::CMPPSrri_alt;  break;
+      case X86::CMPSDrm:   NewOpc = X86::CMPSDrm_alt;   break;
+      case X86::CMPSDrr:   NewOpc = X86::CMPSDrr_alt;   break;
+      case X86::CMPSSrm:   NewOpc = X86::CMPSSrm_alt;   break;
+      case X86::CMPSSrr:   NewOpc = X86::CMPSSrr_alt;   break;
+      case X86::VPCOMBri:  NewOpc = X86::VPCOMBri_alt;  break;
+      case X86::VPCOMBmi:  NewOpc = X86::VPCOMBmi_alt;  break;
+      case X86::VPCOMWri:  NewOpc = X86::VPCOMWri_alt;  break;
+      case X86::VPCOMWmi:  NewOpc = X86::VPCOMWmi_alt;  break;
+      case X86::VPCOMDri:  NewOpc = X86::VPCOMDri_alt;  break;
+      case X86::VPCOMDmi:  NewOpc = X86::VPCOMDmi_alt;  break;
+      case X86::VPCOMQri:  NewOpc = X86::VPCOMQri_alt;  break;
+      case X86::VPCOMQmi:  NewOpc = X86::VPCOMQmi_alt;  break;
+      case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break;
+      case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break;
+      case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break;
+      case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break;
+      case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break;
+      case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break;
+      case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break;
+      case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break;
+      }
+      // Switch opcode to the one that doesn't get special printing.
+      mcInst.setOpcode(NewOpc);
+    }
+  } else if (type == TYPE_IMM5) {
+    // Check for immediates that printAVXCC can't handle.
+    if (immediate >= 32) {
+      unsigned NewOpc;
+      switch (mcInst.getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case X86::VCMPPDrmi:  NewOpc = X86::VCMPPDrmi_alt;  break;
+      case X86::VCMPPDrri:  NewOpc = X86::VCMPPDrri_alt;  break;
+      case X86::VCMPPSrmi:  NewOpc = X86::VCMPPSrmi_alt;  break;
+      case X86::VCMPPSrri:  NewOpc = X86::VCMPPSrri_alt;  break;
+      case X86::VCMPSDrm:   NewOpc = X86::VCMPSDrm_alt;   break;
+      case X86::VCMPSDrr:   NewOpc = X86::VCMPSDrr_alt;   break;
+      case X86::VCMPSSrm:   NewOpc = X86::VCMPSSrm_alt;   break;
+      case X86::VCMPSSrr:   NewOpc = X86::VCMPSSrr_alt;   break;
+      case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break;
+      case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break;
+      case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break;
+      case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break;
+      case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break;
+      case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break;
+      case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break;
+      case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break;
+      case X86::VCMPSDZrm:  NewOpc = X86::VCMPSDZrmi_alt; break;
+      case X86::VCMPSDZrr:  NewOpc = X86::VCMPSDZrri_alt; break;
+      case X86::VCMPSSZrm:  NewOpc = X86::VCMPSSZrmi_alt; break;
+      case X86::VCMPSSZrr:  NewOpc = X86::VCMPSSZrri_alt; break;
+      }
+      // Switch opcode to the one that doesn't get special printing.
+      mcInst.setOpcode(NewOpc);
+    }
+  } else if (type == TYPE_AVX512ICC) {
+    if (immediate >= 8 || ((immediate & 0x3) == 3)) {
+      unsigned NewOpc;
+      switch (mcInst.getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case X86::VPCMPBZ128rmi:    NewOpc = X86::VPCMPBZ128rmi_alt;    break;
+      case X86::VPCMPBZ128rmik:   NewOpc = X86::VPCMPBZ128rmik_alt;   break;
+      case X86::VPCMPBZ128rri:    NewOpc = X86::VPCMPBZ128rri_alt;    break;
+      case X86::VPCMPBZ128rrik:   NewOpc = X86::VPCMPBZ128rrik_alt;   break;
+      case X86::VPCMPBZ256rmi:    NewOpc = X86::VPCMPBZ256rmi_alt;    break;
+      case X86::VPCMPBZ256rmik:   NewOpc = X86::VPCMPBZ256rmik_alt;   break;
+      case X86::VPCMPBZ256rri:    NewOpc = X86::VPCMPBZ256rri_alt;    break;
+      case X86::VPCMPBZ256rrik:   NewOpc = X86::VPCMPBZ256rrik_alt;   break;
+      case X86::VPCMPBZrmi:       NewOpc = X86::VPCMPBZrmi_alt;       break;
+      case X86::VPCMPBZrmik:      NewOpc = X86::VPCMPBZrmik_alt;      break;
+      case X86::VPCMPBZrri:       NewOpc = X86::VPCMPBZrri_alt;       break;
+      case X86::VPCMPBZrrik:      NewOpc = X86::VPCMPBZrrik_alt;      break;
+      case X86::VPCMPDZ128rmi:    NewOpc = X86::VPCMPDZ128rmi_alt;    break;
+      case X86::VPCMPDZ128rmib:   NewOpc = X86::VPCMPDZ128rmib_alt;   break;
+      case X86::VPCMPDZ128rmibk:  NewOpc = X86::VPCMPDZ128rmibk_alt;  break;
+      case X86::VPCMPDZ128rmik:   NewOpc = X86::VPCMPDZ128rmik_alt;   break;
+      case X86::VPCMPDZ128rri:    NewOpc = X86::VPCMPDZ128rri_alt;    break;
+      case X86::VPCMPDZ128rrik:   NewOpc = X86::VPCMPDZ128rrik_alt;   break;
+      case X86::VPCMPDZ256rmi:    NewOpc = X86::VPCMPDZ256rmi_alt;    break;
+      case X86::VPCMPDZ256rmib:   NewOpc = X86::VPCMPDZ256rmib_alt;   break;
+      case X86::VPCMPDZ256rmibk:  NewOpc = X86::VPCMPDZ256rmibk_alt;  break;
+      case X86::VPCMPDZ256rmik:   NewOpc = X86::VPCMPDZ256rmik_alt;   break;
+      case X86::VPCMPDZ256rri:    NewOpc = X86::VPCMPDZ256rri_alt;    break;
+      case X86::VPCMPDZ256rrik:   NewOpc = X86::VPCMPDZ256rrik_alt;   break;
+      case X86::VPCMPDZrmi:       NewOpc = X86::VPCMPDZrmi_alt;       break;
+      case X86::VPCMPDZrmib:      NewOpc = X86::VPCMPDZrmib_alt;      break;
+      case X86::VPCMPDZrmibk:     NewOpc = X86::VPCMPDZrmibk_alt;     break;
+      case X86::VPCMPDZrmik:      NewOpc = X86::VPCMPDZrmik_alt;      break;
+      case X86::VPCMPDZrri:       NewOpc = X86::VPCMPDZrri_alt;       break;
+      case X86::VPCMPDZrrik:      NewOpc = X86::VPCMPDZrrik_alt;      break;
+      case X86::VPCMPQZ128rmi:    NewOpc = X86::VPCMPQZ128rmi_alt;    break;
+      case X86::VPCMPQZ128rmib:   NewOpc = X86::VPCMPQZ128rmib_alt;   break;
+      case X86::VPCMPQZ128rmibk:  NewOpc = X86::VPCMPQZ128rmibk_alt;  break;
+      case X86::VPCMPQZ128rmik:   NewOpc = X86::VPCMPQZ128rmik_alt;   break;
+      case X86::VPCMPQZ128rri:    NewOpc = X86::VPCMPQZ128rri_alt;    break;
+      case X86::VPCMPQZ128rrik:   NewOpc = X86::VPCMPQZ128rrik_alt;   break;
+      case X86::VPCMPQZ256rmi:    NewOpc = X86::VPCMPQZ256rmi_alt;    break;
+      case X86::VPCMPQZ256rmib:   NewOpc = X86::VPCMPQZ256rmib_alt;   break;
+      case X86::VPCMPQZ256rmibk:  NewOpc = X86::VPCMPQZ256rmibk_alt;  break;
+      case X86::VPCMPQZ256rmik:   NewOpc = X86::VPCMPQZ256rmik_alt;   break;
+      case X86::VPCMPQZ256rri:    NewOpc = X86::VPCMPQZ256rri_alt;    break;
+      case X86::VPCMPQZ256rrik:   NewOpc = X86::VPCMPQZ256rrik_alt;   break;
+      case X86::VPCMPQZrmi:       NewOpc = X86::VPCMPQZrmi_alt;       break;
+      case X86::VPCMPQZrmib:      NewOpc = X86::VPCMPQZrmib_alt;      break;
+      case X86::VPCMPQZrmibk:     NewOpc = X86::VPCMPQZrmibk_alt;     break;
+      case X86::VPCMPQZrmik:      NewOpc = X86::VPCMPQZrmik_alt;      break;
+      case X86::VPCMPQZrri:       NewOpc = X86::VPCMPQZrri_alt;       break;
+      case X86::VPCMPQZrrik:      NewOpc = X86::VPCMPQZrrik_alt;      break;
+      case X86::VPCMPUBZ128rmi:   NewOpc = X86::VPCMPUBZ128rmi_alt;   break;
+      case X86::VPCMPUBZ128rmik:  NewOpc = X86::VPCMPUBZ128rmik_alt;  break;
+      case X86::VPCMPUBZ128rri:   NewOpc = X86::VPCMPUBZ128rri_alt;   break;
+      case X86::VPCMPUBZ128rrik:  NewOpc = X86::VPCMPUBZ128rrik_alt;  break;
+      case X86::VPCMPUBZ256rmi:   NewOpc = X86::VPCMPUBZ256rmi_alt;   break;
+      case X86::VPCMPUBZ256rmik:  NewOpc = X86::VPCMPUBZ256rmik_alt;  break;
+      case X86::VPCMPUBZ256rri:   NewOpc = X86::VPCMPUBZ256rri_alt;   break;
+      case X86::VPCMPUBZ256rrik:  NewOpc = X86::VPCMPUBZ256rrik_alt;  break;
+      case X86::VPCMPUBZrmi:      NewOpc = X86::VPCMPUBZrmi_alt;      break;
+      case X86::VPCMPUBZrmik:     NewOpc = X86::VPCMPUBZrmik_alt;     break;
+      case X86::VPCMPUBZrri:      NewOpc = X86::VPCMPUBZrri_alt;      break;
+      case X86::VPCMPUBZrrik:     NewOpc = X86::VPCMPUBZrrik_alt;     break;
+      case X86::VPCMPUDZ128rmi:   NewOpc = X86::VPCMPUDZ128rmi_alt;   break;
+      case X86::VPCMPUDZ128rmib:  NewOpc = X86::VPCMPUDZ128rmib_alt;  break;
+      case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break;
+      case X86::VPCMPUDZ128rmik:  NewOpc = X86::VPCMPUDZ128rmik_alt;  break;
+      case X86::VPCMPUDZ128rri:   NewOpc = X86::VPCMPUDZ128rri_alt;   break;
+      case X86::VPCMPUDZ128rrik:  NewOpc = X86::VPCMPUDZ128rrik_alt;  break;
+      case X86::VPCMPUDZ256rmi:   NewOpc = X86::VPCMPUDZ256rmi_alt;   break;
+      case X86::VPCMPUDZ256rmib:  NewOpc = X86::VPCMPUDZ256rmib_alt;  break;
+      case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break;
+      case X86::VPCMPUDZ256rmik:  NewOpc = X86::VPCMPUDZ256rmik_alt;  break;
+      case X86::VPCMPUDZ256rri:   NewOpc = X86::VPCMPUDZ256rri_alt;   break;
+      case X86::VPCMPUDZ256rrik:  NewOpc = X86::VPCMPUDZ256rrik_alt;  break;
+      case X86::VPCMPUDZrmi:      NewOpc = X86::VPCMPUDZrmi_alt;      break;
+      case X86::VPCMPUDZrmib:     NewOpc = X86::VPCMPUDZrmib_alt;     break;
+      case X86::VPCMPUDZrmibk:    NewOpc = X86::VPCMPUDZrmibk_alt;    break;
+      case X86::VPCMPUDZrmik:     NewOpc = X86::VPCMPUDZrmik_alt;     break;
+      case X86::VPCMPUDZrri:      NewOpc = X86::VPCMPUDZrri_alt;      break;
+      case X86::VPCMPUDZrrik:     NewOpc = X86::VPCMPUDZrrik_alt;     break;
+      case X86::VPCMPUQZ128rmi:   NewOpc = X86::VPCMPUQZ128rmi_alt;   break;
+      case X86::VPCMPUQZ128rmib:  NewOpc = X86::VPCMPUQZ128rmib_alt;  break;
+      case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break;
+      case X86::VPCMPUQZ128rmik:  NewOpc = X86::VPCMPUQZ128rmik_alt;  break;
+      case X86::VPCMPUQZ128rri:   NewOpc = X86::VPCMPUQZ128rri_alt;   break;
+      case X86::VPCMPUQZ128rrik:  NewOpc = X86::VPCMPUQZ128rrik_alt;  break;
+      case X86::VPCMPUQZ256rmi:   NewOpc = X86::VPCMPUQZ256rmi_alt;   break;
+      case X86::VPCMPUQZ256rmib:  NewOpc = X86::VPCMPUQZ256rmib_alt;  break;
+      case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break;
+      case X86::VPCMPUQZ256rmik:  NewOpc = X86::VPCMPUQZ256rmik_alt;  break;
+      case X86::VPCMPUQZ256rri:   NewOpc = X86::VPCMPUQZ256rri_alt;   break;
+      case X86::VPCMPUQZ256rrik:  NewOpc = X86::VPCMPUQZ256rrik_alt;  break;
+      case X86::VPCMPUQZrmi:      NewOpc = X86::VPCMPUQZrmi_alt;      break;
+      case X86::VPCMPUQZrmib:     NewOpc = X86::VPCMPUQZrmib_alt;     break;
+      case X86::VPCMPUQZrmibk:    NewOpc = X86::VPCMPUQZrmibk_alt;    break;
+      case X86::VPCMPUQZrmik:     NewOpc = X86::VPCMPUQZrmik_alt;     break;
+      case X86::VPCMPUQZrri:      NewOpc = X86::VPCMPUQZrri_alt;      break;
+      case X86::VPCMPUQZrrik:     NewOpc = X86::VPCMPUQZrrik_alt;     break;
+      case X86::VPCMPUWZ128rmi:   NewOpc = X86::VPCMPUWZ128rmi_alt;   break;
+      case X86::VPCMPUWZ128rmik:  NewOpc = X86::VPCMPUWZ128rmik_alt;  break;
+      case X86::VPCMPUWZ128rri:   NewOpc = X86::VPCMPUWZ128rri_alt;   break;
+      case X86::VPCMPUWZ128rrik:  NewOpc = X86::VPCMPUWZ128rrik_alt;  break;
+      case X86::VPCMPUWZ256rmi:   NewOpc = X86::VPCMPUWZ256rmi_alt;   break;
+      case X86::VPCMPUWZ256rmik:  NewOpc = X86::VPCMPUWZ256rmik_alt;  break;
+      case X86::VPCMPUWZ256rri:   NewOpc = X86::VPCMPUWZ256rri_alt;   break;
+      case X86::VPCMPUWZ256rrik:  NewOpc = X86::VPCMPUWZ256rrik_alt;  break;
+      case X86::VPCMPUWZrmi:      NewOpc = X86::VPCMPUWZrmi_alt;      break;
+      case X86::VPCMPUWZrmik:     NewOpc = X86::VPCMPUWZrmik_alt;     break;
+      case X86::VPCMPUWZrri:      NewOpc = X86::VPCMPUWZrri_alt;      break;
+      case X86::VPCMPUWZrrik:     NewOpc = X86::VPCMPUWZrrik_alt;     break;
+      case X86::VPCMPWZ128rmi:    NewOpc = X86::VPCMPWZ128rmi_alt;    break;
+      case X86::VPCMPWZ128rmik:   NewOpc = X86::VPCMPWZ128rmik_alt;   break;
+      case X86::VPCMPWZ128rri:    NewOpc = X86::VPCMPWZ128rri_alt;    break;
+      case X86::VPCMPWZ128rrik:   NewOpc = X86::VPCMPWZ128rrik_alt;   break;
+      case X86::VPCMPWZ256rmi:    NewOpc = X86::VPCMPWZ256rmi_alt;    break;
+      case X86::VPCMPWZ256rmik:   NewOpc = X86::VPCMPWZ256rmik_alt;   break;
+      case X86::VPCMPWZ256rri:    NewOpc = X86::VPCMPWZ256rri_alt;    break;
+      case X86::VPCMPWZ256rrik:   NewOpc = X86::VPCMPWZ256rrik_alt;   break;
+      case X86::VPCMPWZrmi:       NewOpc = X86::VPCMPWZrmi_alt;       break;
+      case X86::VPCMPWZrmik:      NewOpc = X86::VPCMPWZrmik_alt;      break;
+      case X86::VPCMPWZrri:       NewOpc = X86::VPCMPWZrri_alt;       break;
+      case X86::VPCMPWZrrik:      NewOpc = X86::VPCMPWZrrik_alt;      break;
+      }
+      // Switch opcode to the one that doesn't get special printing.
+      mcInst.setOpcode(NewOpc);
+    }
   }
 
   switch (type) {
@@ -407,7 +588,7 @@ static bool translateRMRegister(MCInst &mcInst,
     debug("A R/M register operand may not have a SIB byte");
     return true;
   }
-  
+
   switch (insn.eaBase) {
   default:
     debug("Unexpected EA base register");
@@ -427,7 +608,7 @@ static bool translateRMRegister(MCInst &mcInst,
   ALL_REGS
 #undef ENTRY
   }
-  
+
   return false;
 }
 
@@ -440,26 +621,26 @@ static bool translateRMRegister(MCInst &mcInst,
 ///                       from.
 /// @return             - 0 on success; nonzero otherwise
 static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
-                              const MCDisassembler *Dis) {  
+                              const MCDisassembler *Dis) {
   // Addresses in an MCInst are represented as five operands:
-  //   1. basereg       (register)  The R/M base, or (if there is a SIB) the 
+  //   1. basereg       (register)  The R/M base, or (if there is a SIB) the
   //                                SIB base
-  //   2. scaleamount   (immediate) 1, or (if there is a SIB) the specified 
+  //   2. scaleamount   (immediate) 1, or (if there is a SIB) the specified
   //                                scale amount
   //   3. indexreg      (register)  x86_registerNONE, or (if there is a SIB)
-  //                                the index (which is multiplied by the 
+  //                                the index (which is multiplied by the
   //                                scale amount)
   //   4. displacement  (immediate) 0, or the displacement if there is one
   //   5. segmentreg    (register)  x86_registerNONE for now, but could be set
   //                                if we have segment overrides
-  
+
   MCOperand baseReg;
   MCOperand scaleAmount;
   MCOperand indexReg;
   MCOperand displacement;
   MCOperand segmentReg;
   uint64_t pcrel = 0;
-  
+
   if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
     if (insn.sibBase != SIB_BASE_NONE) {
       switch (insn.sibBase) {
@@ -512,7 +693,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
                          (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
       SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 :
                            IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
-      insn.sibIndex = (SIBIndex)(IndexBase + 
+      insn.sibIndex = (SIBIndex)(IndexBase +
                            (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
     }
 
@@ -534,7 +715,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
     } else {
       indexReg = MCOperand::CreateReg(0);
     }
-    
+
     scaleAmount = MCOperand::CreateImm(insn.sibScale);
   } else {
     switch (insn.eaBase) {
@@ -553,7 +734,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
       }
       else
         baseReg = MCOperand::CreateReg(0);
-      
+
       indexReg = MCOperand::CreateReg(0);
       break;
     case EA_BASE_BX_SI:
@@ -584,7 +765,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
         //   placeholders to keep the compiler happy.
 #define ENTRY(x)                                        \
       case EA_BASE_##x:                                 \
-        baseReg = MCOperand::CreateReg(X86::x); break; 
+        baseReg = MCOperand::CreateReg(X86::x); break;
       ALL_EA_BASES
 #undef ENTRY
 #define ENTRY(x) case EA_REG_##x:
@@ -595,14 +776,14 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
         return true;
       }
     }
-    
+
     scaleAmount = MCOperand::CreateImm(1);
   }
-  
+
   displacement = MCOperand::CreateImm(insn.displacement);
 
   segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]);
-  
+
   mcInst.addOperand(baseReg);
   mcInst.addOperand(scaleAmount);
   mcInst.addOperand(indexReg);
@@ -623,7 +804,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
 ///                       from.
 /// @return             - 0 on success; nonzero otherwise
 static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
-                        InternalInstruction &insn, const MCDisassembler *Dis) {  
+                        InternalInstruction &insn, const MCDisassembler *Dis) {
   switch (operand.type) {
   default:
     debug("Unexpected type for a R/M operand");
@@ -633,8 +814,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
   case TYPE_R32:
   case TYPE_R64:
   case TYPE_Rv:
-  case TYPE_MM:
-  case TYPE_MM32:
   case TYPE_MM64:
   case TYPE_XMM:
   case TYPE_XMM32:
@@ -660,9 +839,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
   case TYPE_M32FP:
   case TYPE_M64FP:
   case TYPE_M80FP:
-  case TYPE_M16INT:
-  case TYPE_M32INT:
-  case TYPE_M64INT:
   case TYPE_M1616:
   case TYPE_M1632:
   case TYPE_M1664:
@@ -670,7 +846,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
     return translateRMMemory(mcInst, insn, Dis);
   }
 }
-  
+
 /// translateFPRegister - Translates a stack position on the FPU stack to its
 ///   LLVM form, and appends it to an MCInst.
 ///
@@ -698,7 +874,7 @@ static bool translateMaskRegister(MCInst &mcInst,
   return false;
 }
 
-/// translateOperand - Translates an operand stored in an internal instruction 
+/// translateOperand - Translates an operand stored in an internal instruction
 ///   to LLVM's format and appends it to an MCInst.
 ///
 /// @param mcInst       - The MCInst to append to.
@@ -707,7 +883,7 @@ static bool translateMaskRegister(MCInst &mcInst,
 /// @return             - false on success; true otherwise.
 static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
                              InternalInstruction &insn,
-                             const MCDisassembler *Dis) {  
+                             const MCDisassembler *Dis) {
   switch (operand.encoding) {
   default:
     debug("Unhandled operand encoding during translation");
@@ -761,7 +937,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
                             insn, Dis);
   }
 }
-  
+
 /// translateInstruction - Translates an internal instruction and all its
 ///   operands to an MCInst.
 ///
@@ -770,12 +946,12 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
 /// @return             - false on success; true otherwise.
 static bool translateInstruction(MCInst &mcInst,
                                 InternalInstruction &insn,
-                                const MCDisassembler *Dis) {  
+                                const MCDisassembler *Dis) {
   if (!insn.spec) {
     debug("Instruction has no specification");
     return true;
   }
-  
+
   mcInst.setOpcode(insn.instructionID);
   // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3
   // prefix bytes should be disassembled as xrelease and xacquire then set the
@@ -786,9 +962,9 @@ static bool translateInstruction(MCInst &mcInst,
     else if(mcInst.getOpcode() == X86::REPNE_PREFIX)
       mcInst.setOpcode(X86::XACQUIRE_PREFIX);
   }
-  
+
   insn.numImmediatesTranslated = 0;
-  
+
   for (const auto &Op : insn.operands) {
     if (Op.encoding != ENCODING_NONE) {
       if (translateOperand(mcInst, Op, insn, Dis)) {
@@ -796,7 +972,7 @@ static bool translateInstruction(MCInst &mcInst,
       }
     }
   }
-  
+
   return false;
 }
 
@@ -807,9 +983,9 @@ static MCDisassembler *createX86Disassembler(const Target &T,
   return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII));
 }
 
-extern "C" void LLVMInitializeX86Disassembler() { 
+extern "C" void LLVMInitializeX86Disassembler() {
   // Register the disassembler.
-  TargetRegistry::RegisterMCDisassembler(TheX86_32Target, 
+  TargetRegistry::RegisterMCDisassembler(TheX86_32Target,
                                          createX86Disassembler);
   TargetRegistry::RegisterMCDisassembler(TheX86_64Target,
                                          createX86Disassembler);
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 98b3440..619a0d4 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -975,27 +975,16 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
   if (insn->rexPrefix & 0x08)
     attrMask |= ATTR_REXW;
 
-  if (getIDWithAttrMask(&instructionID, insn, attrMask))
-    return -1;
-
   /*
    * JCXZ/JECXZ need special handling for 16-bit mode because the meaning
    * of the AdSize prefix is inverted w.r.t. 32-bit mode.
    */
-  if (insn->mode == MODE_16BIT && insn->opcode == 0xE3) {
-    const struct InstructionSpecifier *spec;
-    spec = specifierForUID(instructionID);
+  if (insn->mode == MODE_16BIT && insn->opcodeType == ONEBYTE &&
+      insn->opcode == 0xE3)
+    attrMask ^= ATTR_ADSIZE;
 
-    /*
-     * Check for Ii8PCRel instructions. We could alternatively do a
-     * string-compare on the names, but this is probably cheaper.
-     */
-    if (x86OperandSets[spec->operands][0].type == TYPE_REL8) {
-      attrMask ^= ATTR_ADSIZE;
-      if (getIDWithAttrMask(&instructionID, insn, attrMask))
-        return -1;
-    }
-  }
+  if (getIDWithAttrMask(&instructionID, insn, attrMask))
+    return -1;
 
   /* The following clauses compensate for limitations of the tables. */
 
@@ -1030,6 +1019,32 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
     }
   }
 
+  /*
+   * Absolute moves need special handling.
+   * -For 16-bit mode because the meaning of the AdSize and OpSize prefixes are
+   *  inverted w.r.t.
+   * -For 32-bit mode we need to ensure the ADSIZE prefix is observed in
+   *  any position.
+   */
+  if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) {
+    /* Make sure we observed the prefixes in any position. */
+    if (insn->prefixPresent[0x67])
+      attrMask |= ATTR_ADSIZE;
+    if (insn->prefixPresent[0x66])
+      attrMask |= ATTR_OPSIZE;
+
+    /* In 16-bit, invert the attributes. */
+    if (insn->mode == MODE_16BIT)
+      attrMask ^= ATTR_ADSIZE | ATTR_OPSIZE;
+
+    if (getIDWithAttrMask(&instructionID, insn, attrMask))
+      return -1;
+
+    insn->instructionID = instructionID;
+    insn->spec = specifierForUID(instructionID);
+    return 0;
+  }
+
   if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) &&
       !(attrMask & ATTR_OPSIZE)) {
     /*
@@ -1445,22 +1460,14 @@ static int readModRM(struct InternalInstruction* insn) {
     case TYPE_VK16:                                       \
       return prefix##_K0 + index;                         \
     case TYPE_MM64:                                       \
-    case TYPE_MM32:                                       \
-    case TYPE_MM:                                         \
-      if (index > 7)                                      \
-        *valid = 0;                                       \
-      return prefix##_MM0 + index;                        \
+      return prefix##_MM0 + (index & 0x7);                \
     case TYPE_SEGMENTREG:                                 \
       if (index > 5)                                      \
         *valid = 0;                                       \
       return prefix##_ES + index;                         \
     case TYPE_DEBUGREG:                                   \
-      if (index > 7)                                      \
-        *valid = 0;                                       \
       return prefix##_DR0 + index;                        \
     case TYPE_CONTROLREG:                                 \
-      if (index > 8)                                      \
-        *valid = 0;                                       \
       return prefix##_CR0 + index;                        \
     }                                                     \
   }
@@ -1737,12 +1744,6 @@ static int readOperands(struct InternalInstruction* insn) {
       }
       if (readImmediate(insn, 1))
         return -1;
-      if (Op.type == TYPE_IMM3 &&
-          insn->immediates[insn->numImmediatesConsumed - 1] > 7)
-        return -1;
-      if (Op.type == TYPE_IMM5 &&
-          insn->immediates[insn->numImmediatesConsumed - 1] > 31)
-        return -1;
       if (Op.type == TYPE_XMM128 ||
           Op.type == TYPE_XMM256)
         sawRegImm = 1;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 457b382..a79a923 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -341,7 +341,15 @@ namespace X86Disassembler {
   ENTRY(DR4)        \
   ENTRY(DR5)        \
   ENTRY(DR6)        \
-  ENTRY(DR7)
+  ENTRY(DR7)        \
+  ENTRY(DR8)        \
+  ENTRY(DR9)        \
+  ENTRY(DR10)       \
+  ENTRY(DR11)       \
+  ENTRY(DR12)       \
+  ENTRY(DR13)       \
+  ENTRY(DR14)       \
+  ENTRY(DR15)
 
 #define REGS_CONTROL  \
   ENTRY(CR0)          \
@@ -352,7 +360,14 @@ namespace X86Disassembler {
   ENTRY(CR5)          \
   ENTRY(CR6)          \
   ENTRY(CR7)          \
-  ENTRY(CR8)
+  ENTRY(CR8)          \
+  ENTRY(CR9)          \
+  ENTRY(CR10)         \
+  ENTRY(CR11)         \
+  ENTRY(CR12)         \
+  ENTRY(CR13)         \
+  ENTRY(CR14)         \
+  ENTRY(CR15)
 
 #define ALL_EA_BASES  \
   EA_BASES_16BIT      \
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index bec4f0e..70c6042 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -82,6 +82,7 @@ enum attributeBits {
                                         "operands change width")               \
   ENUM_ENTRY(IC_ADSIZE,             3,  "requires an ADSIZE prefix, so "       \
                                         "operands change width")               \
+  ENUM_ENTRY(IC_OPSIZE_ADSIZE,      4,  "requires ADSIZE and OPSIZE prefixes") \
   ENUM_ENTRY(IC_XD,                 2,  "may say something about the opcode "  \
                                         "but not the operands")                \
   ENUM_ENTRY(IC_XS,                 2,  "may say something about the opcode "  \
@@ -90,20 +91,24 @@ enum attributeBits {
                                         "operands change width")               \
   ENUM_ENTRY(IC_XS_OPSIZE,          3,  "requires an OPSIZE prefix, so "       \
                                         "operands change width")               \
-  ENUM_ENTRY(IC_64BIT_REXW,         4,  "requires a REX.W prefix, so operands "\
+  ENUM_ENTRY(IC_64BIT_REXW,         5,  "requires a REX.W prefix, so operands "\
                                         "change width; overrides IC_OPSIZE")   \
+  ENUM_ENTRY(IC_64BIT_REXW_ADSIZE,  6,  "requires a REX.W prefix and 0x67 "    \
+                                        "prefix")                              \
   ENUM_ENTRY(IC_64BIT_OPSIZE,       3,  "Just as meaningful as IC_OPSIZE")     \
   ENUM_ENTRY(IC_64BIT_ADSIZE,       3,  "Just as meaningful as IC_ADSIZE")     \
-  ENUM_ENTRY(IC_64BIT_XD,           5,  "XD instructions are SSE; REX.W is "   \
+  ENUM_ENTRY(IC_64BIT_OPSIZE_ADSIZE, 4, "Just as meaningful as IC_OPSIZE/"     \
+                                        "IC_ADSIZE")                           \
+  ENUM_ENTRY(IC_64BIT_XD,           6,  "XD instructions are SSE; REX.W is "   \
                                         "secondary")                           \
-  ENUM_ENTRY(IC_64BIT_XS,           5,  "Just as meaningful as IC_64BIT_XD")   \
+  ENUM_ENTRY(IC_64BIT_XS,           6,  "Just as meaningful as IC_64BIT_XD")   \
   ENUM_ENTRY(IC_64BIT_XD_OPSIZE,    3,  "Just as meaningful as IC_XD_OPSIZE")  \
   ENUM_ENTRY(IC_64BIT_XS_OPSIZE,    3,  "Just as meaningful as IC_XS_OPSIZE")  \
-  ENUM_ENTRY(IC_64BIT_REXW_XS,      6,  "OPSIZE could mean a different "       \
+  ENUM_ENTRY(IC_64BIT_REXW_XS,      7,  "OPSIZE could mean a different "       \
                                         "opcode")                              \
-  ENUM_ENTRY(IC_64BIT_REXW_XD,      6,  "Just as meaningful as "               \
+  ENUM_ENTRY(IC_64BIT_REXW_XD,      7,  "Just as meaningful as "               \
                                         "IC_64BIT_REXW_XS")                    \
-  ENUM_ENTRY(IC_64BIT_REXW_OPSIZE,  7,  "The Dynamic Duo!  Prefer over all "   \
+  ENUM_ENTRY(IC_64BIT_REXW_OPSIZE,  8,  "The Dynamic Duo!  Prefer over all "   \
                                         "else because this changes most "      \
                                         "operands' meaning")                   \
   ENUM_ENTRY(IC_VEX,                1,  "requires a VEX prefix")               \
@@ -401,6 +406,8 @@ enum OperandEncoding {
   ENUM_ENTRY(TYPE_IMM64,      "8-byte")                                        \
   ENUM_ENTRY(TYPE_IMM3,       "1-byte immediate operand between 0 and 7")      \
   ENUM_ENTRY(TYPE_IMM5,       "1-byte immediate operand between 0 and 31")     \
+  ENUM_ENTRY(TYPE_AVX512ICC,  "1-byte immediate operand for AVX512 icmp")      \
+  ENUM_ENTRY(TYPE_UIMM8,      "1-byte unsigned immediate operand")             \
   ENUM_ENTRY(TYPE_RM8,        "1-byte register or memory operand")             \
   ENUM_ENTRY(TYPE_RM16,       "2-byte")                                        \
   ENUM_ENTRY(TYPE_RM32,       "4-byte")                                        \
@@ -416,10 +423,6 @@ enum OperandEncoding {
   ENUM_ENTRY(TYPE_M1616,      "2+2-byte segment+offset address")               \
   ENUM_ENTRY(TYPE_M1632,      "2+4-byte")                                      \
   ENUM_ENTRY(TYPE_M1664,      "2+8-byte")                                      \
-  ENUM_ENTRY(TYPE_M16_32,     "2+4-byte two-part memory operand (LIDT, LGDT)") \
-  ENUM_ENTRY(TYPE_M16_16,     "2+2-byte (BOUND)")                              \
-  ENUM_ENTRY(TYPE_M32_32,     "4+4-byte (BOUND)")                              \
-  ENUM_ENTRY(TYPE_M16_64,     "2+8-byte (LIDT, LGDT)")                         \
   ENUM_ENTRY(TYPE_SRCIDX8,    "1-byte memory at source index")                 \
   ENUM_ENTRY(TYPE_SRCIDX16,   "2-byte memory at source index")                 \
   ENUM_ENTRY(TYPE_SRCIDX32,   "4-byte memory at source index")                 \
@@ -438,14 +441,8 @@ enum OperandEncoding {
   ENUM_ENTRY(TYPE_M32FP,      "32-bit IEE754 memory floating-point operand")   \
   ENUM_ENTRY(TYPE_M64FP,      "64-bit")                                        \
   ENUM_ENTRY(TYPE_M80FP,      "80-bit extended")                               \
-  ENUM_ENTRY(TYPE_M16INT,     "2-byte memory integer operand for use in "      \
-                              "floating-point instructions")                   \
-  ENUM_ENTRY(TYPE_M32INT,     "4-byte")                                        \
-  ENUM_ENTRY(TYPE_M64INT,     "8-byte")                                        \
   ENUM_ENTRY(TYPE_ST,         "Position on the floating-point stack")          \
-  ENUM_ENTRY(TYPE_MM,         "MMX register operand")                          \
-  ENUM_ENTRY(TYPE_MM32,       "4-byte MMX register or memory operand")         \
-  ENUM_ENTRY(TYPE_MM64,       "8-byte")                                        \
+  ENUM_ENTRY(TYPE_MM64,       "8-byte MMX register")                           \
   ENUM_ENTRY(TYPE_XMM,        "XMM register operand")                          \
   ENUM_ENTRY(TYPE_XMM32,      "4-byte XMM register or memory operand")         \
   ENUM_ENTRY(TYPE_XMM64,      "8-byte")                                        \
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index b72730c..65461af 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -72,35 +72,11 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   printAnnotation(OS, Annot);
 }
 
-void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
-                                   raw_ostream &O) {
-  int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
-  switch (Imm) {
-  default: llvm_unreachable("Invalid ssecc argument!");
-  case    0: O << "eq"; break;
-  case    1: O << "lt"; break;
-  case    2: O << "le"; break;
-  case    3: O << "unord"; break;
-  case    4: O << "neq"; break;
-  case    5: O << "nlt"; break;
-  case    6: O << "nle"; break;
-  case    7: O << "ord"; break;
-  case    8: O << "eq_uq"; break;
-  case    9: O << "nge"; break;
-  case  0xa: O << "ngt"; break;
-  case  0xb: O << "false"; break;
-  case  0xc: O << "neq_oq"; break;
-  case  0xd: O << "ge"; break;
-  case  0xe: O << "gt"; break;
-  case  0xf: O << "true"; break;
-  }
-}
-
-void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
-                                   raw_ostream &O) {
-  int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+void X86ATTInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
+                                      raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm();
   switch (Imm) {
-  default: llvm_unreachable("Invalid avxcc argument!");
+  default: llvm_unreachable("Invalid ssecc/avxcc argument!");
   case    0: O << "eq"; break;
   case    1: O << "lt"; break;
   case    2: O << "le"; break;
@@ -136,8 +112,24 @@ void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
   }
 }
 
-void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+void X86ATTInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
                                    raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm();
+  switch (Imm) {
+  default: llvm_unreachable("Invalid xopcc argument!");
+  case 0: O << "lt"; break;
+  case 1: O << "le"; break;
+  case 2: O << "gt"; break;
+  case 3: O << "ge"; break;
+  case 4: O << "eq"; break;
+  case 5: O << "neq"; break;
+  case 6: O << "false"; break;
+  case 7: O << "true"; break;
+  }
+}
+
+void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+                                            raw_ostream &O) {
   int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
   switch (Imm) {
   case 0: O << "{rn-sae}"; break;
@@ -163,8 +155,7 @@ void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
     int64_t Address;
     if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
       O << formatHex((uint64_t)Address);
-    }
-    else {
+    } else {
       // Otherwise, just print the expression.
       O << *Op.getExpr();
     }
@@ -295,3 +286,10 @@ void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
 
   O << markup(">");
 }
+
+void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  O << markup("<imm:")
+    << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff)
+    << markup(">");
+}
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 41be14b..f71cb81 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -45,18 +45,23 @@ public:
 
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
   void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
-  void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
-  void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
   void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
   void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
   void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
   void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
   void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS);
+
+  void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
 
   void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     printMemReference(MI, OpNo, O);
   }
-  
+
   void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     printMemReference(MI, OpNo, O);
   }
@@ -137,7 +142,7 @@ public:
 private:
   bool HasCustomInstComment;
 };
-  
+
 }
 
 #endif
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index a8f15e6..10a1482 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -1,724 +1,982 @@
-//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This defines functionality used to emit comments about X86 instructions to
-// an output stream for -fverbose-asm.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86InstComments.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
-#include "Utils/X86ShuffleDecode.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/CodeGen/MachineValueType.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-//===----------------------------------------------------------------------===//
-// Top Level Entrypoint
-//===----------------------------------------------------------------------===//
-
-/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
-/// newline terminated strings to the specified string if desired.  This
-/// information is shown in disassembly dumps when verbose assembly is enabled.
-bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
-                                  const char *(*getRegName)(unsigned)) {
-  // If this is a shuffle operation, the switch should fill in this state.
-  SmallVector<int, 8> ShuffleMask;
-  const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
-
-  switch (MI->getOpcode()) {
-  default:
-    // Not an instruction for which we can decode comments.
-    return false;
-
-  case X86::BLENDPDrri:
-  case X86::VBLENDPDrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::BLENDPDrmi:
-  case X86::VBLENDPDrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeBLENDMask(MVT::v2f64,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VBLENDPDYrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VBLENDPDYrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeBLENDMask(MVT::v4f64,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-
-  case X86::BLENDPSrri:
-  case X86::VBLENDPSrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::BLENDPSrmi:
-  case X86::VBLENDPSrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeBLENDMask(MVT::v4f32,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VBLENDPSYrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VBLENDPSYrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeBLENDMask(MVT::v8f32,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-
-  case X86::PBLENDWrri:
-  case X86::VPBLENDWrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::PBLENDWrmi:
-  case X86::VPBLENDWrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeBLENDMask(MVT::v8i16,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VPBLENDWYrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPBLENDWYrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeBLENDMask(MVT::v16i16,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-
-  case X86::VPBLENDDrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPBLENDDrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeBLENDMask(MVT::v4i32,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-
-  case X86::VPBLENDDYrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPBLENDDYrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeBLENDMask(MVT::v8i32,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-
-  case X86::INSERTPSrr:
-  case X86::VINSERTPSrr:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    if(MI->getOperand(3).isImm())
-      DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask);
-    break;
-
-  case X86::MOVLHPSrr:
-  case X86::VMOVLHPSrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeMOVLHPSMask(2, ShuffleMask);
-    break;
-
-  case X86::MOVHLPSrr:
-  case X86::VMOVHLPSrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeMOVHLPSMask(2, ShuffleMask);
-    break;
-
-  case X86::MOVSLDUPrr:
-  case X86::VMOVSLDUPrr:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::MOVSLDUPrm:
-  case X86::VMOVSLDUPrm:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask);
-    break;
-
-  case X86::VMOVSHDUPYrr:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::VMOVSHDUPYrm:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask);
-    break;
-
-  case X86::VMOVSLDUPYrr:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::VMOVSLDUPYrm:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask);
-    break;
-
-  case X86::MOVSHDUPrr:
-  case X86::VMOVSHDUPrr:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::MOVSHDUPrm:
-  case X86::VMOVSHDUPrm:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask);
-    break;
-
-  case X86::PSLLDQri:
-  case X86::VPSLLDQri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSLLDQMask(MVT::v16i8,
-                       MI->getOperand(MI->getNumOperands()-1).getImm(),
-                       ShuffleMask);
-    break;
-
-  case X86::VPSLLDQYri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSLLDQMask(MVT::v32i8,
-                       MI->getOperand(MI->getNumOperands()-1).getImm(),
-                       ShuffleMask);
-    break;
-
-  case X86::PSRLDQri:
-  case X86::VPSRLDQri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSRLDQMask(MVT::v16i8,
-                       MI->getOperand(MI->getNumOperands()-1).getImm(),
-                       ShuffleMask);
-    break;
-
-  case X86::VPSRLDQYri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSRLDQMask(MVT::v32i8,
-                       MI->getOperand(MI->getNumOperands()-1).getImm(),
-                       ShuffleMask);
-    break;
-
-  case X86::PALIGNR128rr:
-  case X86::VPALIGNR128rr:
-    Src1Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::PALIGNR128rm:
-  case X86::VPALIGNR128rm:
-    Src2Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePALIGNRMask(MVT::v16i8,
-                        MI->getOperand(MI->getNumOperands()-1).getImm(),
-                        ShuffleMask);
-    break;
-  case X86::VPALIGNR256rr:
-    Src1Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPALIGNR256rm:
-    Src2Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePALIGNRMask(MVT::v32i8,
-                        MI->getOperand(MI->getNumOperands()-1).getImm(),
-                        ShuffleMask);
-    break;
-
-  case X86::PSHUFDri:
-  case X86::VPSHUFDri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::PSHUFDmi:
-  case X86::VPSHUFDmi:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSHUFMask(MVT::v4i32,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    break;
-  case X86::VPSHUFDYri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::VPSHUFDYmi:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSHUFMask(MVT::v8i32,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    break;
-
-
-  case X86::PSHUFHWri:
-  case X86::VPSHUFHWri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::PSHUFHWmi:
-  case X86::VPSHUFHWmi:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSHUFHWMask(MVT::v8i16,
-                        MI->getOperand(MI->getNumOperands()-1).getImm(),
-                        ShuffleMask);
-    break;
-  case X86::VPSHUFHWYri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::VPSHUFHWYmi:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSHUFHWMask(MVT::v16i16,
-                        MI->getOperand(MI->getNumOperands()-1).getImm(),
-                        ShuffleMask);
-    break;
-  case X86::PSHUFLWri:
-  case X86::VPSHUFLWri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::PSHUFLWmi:
-  case X86::VPSHUFLWmi:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSHUFLWMask(MVT::v8i16,
-                        MI->getOperand(MI->getNumOperands()-1).getImm(),
-                        ShuffleMask);
-    break;
-  case X86::VPSHUFLWYri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::VPSHUFLWYmi:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSHUFLWMask(MVT::v16i16,
-                        MI->getOperand(MI->getNumOperands()-1).getImm(),
-                        ShuffleMask);
-    break;
-
-  case X86::PUNPCKHBWrr:
-  case X86::VPUNPCKHBWrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::PUNPCKHBWrm:
-  case X86::VPUNPCKHBWrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKHMask(MVT::v16i8, ShuffleMask);
-    break;
-  case X86::VPUNPCKHBWYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPUNPCKHBWYrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKHMask(MVT::v32i8, ShuffleMask);
-    break;
-  case X86::PUNPCKHWDrr:
-  case X86::VPUNPCKHWDrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::PUNPCKHWDrm:
-  case X86::VPUNPCKHWDrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKHMask(MVT::v8i16, ShuffleMask);
-    break;
-  case X86::VPUNPCKHWDYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPUNPCKHWDYrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKHMask(MVT::v16i16, ShuffleMask);
-    break;
-  case X86::PUNPCKHDQrr:
-  case X86::VPUNPCKHDQrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::PUNPCKHDQrm:
-  case X86::VPUNPCKHDQrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKHMask(MVT::v4i32, ShuffleMask);
-    break;
-  case X86::VPUNPCKHDQYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPUNPCKHDQYrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKHMask(MVT::v8i32, ShuffleMask);
-    break;
-  case X86::PUNPCKHQDQrr:
-  case X86::VPUNPCKHQDQrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::PUNPCKHQDQrm:
-  case X86::VPUNPCKHQDQrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKHMask(MVT::v2i64, ShuffleMask);
-    break;
-  case X86::VPUNPCKHQDQYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPUNPCKHQDQYrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKHMask(MVT::v4i64, ShuffleMask);
-    break;
-
-  case X86::PUNPCKLBWrr:
-  case X86::VPUNPCKLBWrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::PUNPCKLBWrm:
-  case X86::VPUNPCKLBWrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKLMask(MVT::v16i8, ShuffleMask);
-    break;
-  case X86::VPUNPCKLBWYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPUNPCKLBWYrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKLMask(MVT::v32i8, ShuffleMask);
-    break;
-  case X86::PUNPCKLWDrr:
-  case X86::VPUNPCKLWDrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::PUNPCKLWDrm:
-  case X86::VPUNPCKLWDrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKLMask(MVT::v8i16, ShuffleMask);
-    break;
-  case X86::VPUNPCKLWDYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPUNPCKLWDYrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKLMask(MVT::v16i16, ShuffleMask);
-    break;
-  case X86::PUNPCKLDQrr:
-  case X86::VPUNPCKLDQrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::PUNPCKLDQrm:
-  case X86::VPUNPCKLDQrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKLMask(MVT::v4i32, ShuffleMask);
-    break;
-  case X86::VPUNPCKLDQYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPUNPCKLDQYrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKLMask(MVT::v8i32, ShuffleMask);
-    break;
-  case X86::PUNPCKLQDQrr:
-  case X86::VPUNPCKLQDQrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::PUNPCKLQDQrm:
-  case X86::VPUNPCKLQDQrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKLMask(MVT::v2i64, ShuffleMask);
-    break;
-  case X86::VPUNPCKLQDQYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPUNPCKLQDQYrm:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeUNPCKLMask(MVT::v4i64, ShuffleMask);
-    break;
-
-  case X86::SHUFPDrri:
-  case X86::VSHUFPDrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::SHUFPDrmi:
-  case X86::VSHUFPDrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeSHUFPMask(MVT::v2f64,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VSHUFPDYrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VSHUFPDYrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeSHUFPMask(MVT::v4f64,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-
-  case X86::SHUFPSrri:
-  case X86::VSHUFPSrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::SHUFPSrmi:
-  case X86::VSHUFPSrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeSHUFPMask(MVT::v4f32,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VSHUFPSYrri:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VSHUFPSYrmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeSHUFPMask(MVT::v8f32,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-
-  case X86::UNPCKLPDrr:
-  case X86::VUNPCKLPDrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::UNPCKLPDrm:
-  case X86::VUNPCKLPDrm:
-    DecodeUNPCKLMask(MVT::v2f64, ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VUNPCKLPDYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VUNPCKLPDYrm:
-    DecodeUNPCKLMask(MVT::v4f64, ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::UNPCKLPSrr:
-  case X86::VUNPCKLPSrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::UNPCKLPSrm:
-  case X86::VUNPCKLPSrm:
-    DecodeUNPCKLMask(MVT::v4f32, ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VUNPCKLPSYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VUNPCKLPSYrm:
-    DecodeUNPCKLMask(MVT::v8f32, ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::UNPCKHPDrr:
-  case X86::VUNPCKHPDrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::UNPCKHPDrm:
-  case X86::VUNPCKHPDrm:
-    DecodeUNPCKHMask(MVT::v2f64, ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VUNPCKHPDYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VUNPCKHPDYrm:
-    DecodeUNPCKHMask(MVT::v4f64, ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::UNPCKHPSrr:
-  case X86::VUNPCKHPSrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::UNPCKHPSrm:
-  case X86::VUNPCKHPSrm:
-    DecodeUNPCKHMask(MVT::v4f32, ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VUNPCKHPSYrr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VUNPCKHPSYrm:
-    DecodeUNPCKHMask(MVT::v8f32, ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VPERMILPSri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::VPERMILPSmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSHUFMask(MVT::v4f32,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VPERMILPSYri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::VPERMILPSYmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSHUFMask(MVT::v8f32,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VPERMILPDri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::VPERMILPDmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSHUFMask(MVT::v2f64,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VPERMILPDYri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::VPERMILPDYmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodePSHUFMask(MVT::v4f64,
-                      MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VPERM2F128rr:
-  case X86::VPERM2I128rr:
-    Src2Name = getRegName(MI->getOperand(2).getReg());
-    // FALL THROUGH.
-  case X86::VPERM2F128rm:
-  case X86::VPERM2I128rm:
-    // For instruction comments purpose, assume the 256-bit vector is v4i64.
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeVPERM2X128Mask(MVT::v4i64,
-                           MI->getOperand(MI->getNumOperands()-1).getImm(),
-                           ShuffleMask);
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  case X86::VPERMQYri:
-  case X86::VPERMPDYri:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::VPERMQYmi:
-  case X86::VPERMPDYmi:
-    if(MI->getOperand(MI->getNumOperands()-1).isImm())
-      DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
-                      ShuffleMask);
-    DestName = getRegName(MI->getOperand(0).getReg());
-    break;
-  }
-
-  // The only comments we decode are shuffles, so give up if we were unable to
-  // decode a shuffle mask.
-  if (ShuffleMask.empty())
-    return false;
-
-  if (!DestName) DestName = Src1Name;
-  OS << (DestName ? DestName : "mem") << " = ";
-
-  // If the two sources are the same, canonicalize the input elements to be
-  // from the first src so that we get larger element spans.
-  if (Src1Name == Src2Name) {
-    for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
-      if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
-          ShuffleMask[i] >= (int)e)        // From second mask.
-        ShuffleMask[i] -= e;
-    }
-  }
-
-  // The shuffle mask specifies which elements of the src1/src2 fill in the
-  // destination, with a few sentinel values.  Loop through and print them
-  // out.
-  for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
-    if (i != 0)
-      OS << ',';
-    if (ShuffleMask[i] == SM_SentinelZero) {
-      OS << "zero";
-      continue;
-    }
-
-    // Otherwise, it must come from src1 or src2.  Print the span of elements
-    // that comes from this src.
-    bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
-    const char *SrcName = isSrc1 ? Src1Name : Src2Name;
-    OS << (SrcName ? SrcName : "mem") << '[';
-    bool IsFirst = true;
-    while (i != e && (int)ShuffleMask[i] != SM_SentinelZero &&
-           (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
-      if (!IsFirst)
-        OS << ',';
-      else
-        IsFirst = false;
-      if (ShuffleMask[i] == SM_SentinelUndef)
-        OS << "u";
-      else
-        OS << ShuffleMask[i] % ShuffleMask.size();
-      ++i;
-    }
-    OS << ']';
-    --i;  // For loop increments element #.
-  }
-  //MI->print(OS, 0);
-  OS << "\n";
-
-  // We successfully added a comment to this instruction.
-  return true;
-}
+//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstComments.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+/// \brief Extracts the src/dst types for a given zero extension instruction.
+/// \note While the number of elements in DstVT type correct, the
+/// number in the SrcVT type is expanded to fill the src xmm register and the
+/// upper elements may not be included in the dst xmm/ymm register.
+static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) {
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unknown zero extension instruction");
+  // i8 zero extension
+  case X86::PMOVZXBWrm:
+  case X86::PMOVZXBWrr:
+  case X86::VPMOVZXBWrm:
+  case X86::VPMOVZXBWrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v8i16;
+    break;
+  case X86::VPMOVZXBWYrm:
+  case X86::VPMOVZXBWYrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v16i16;
+    break;
+  case X86::PMOVZXBDrm:
+  case X86::PMOVZXBDrr:
+  case X86::VPMOVZXBDrm:
+  case X86::VPMOVZXBDrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v4i32;
+    break;
+  case X86::VPMOVZXBDYrm:
+  case X86::VPMOVZXBDYrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v8i32;
+    break;
+  case X86::PMOVZXBQrm:
+  case X86::PMOVZXBQrr:
+  case X86::VPMOVZXBQrm:
+  case X86::VPMOVZXBQrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v2i64;
+    break;
+  case X86::VPMOVZXBQYrm:
+  case X86::VPMOVZXBQYrr:
+    SrcVT = MVT::v16i8;
+    DstVT = MVT::v4i64;
+    break;
+  // i16 zero extension
+  case X86::PMOVZXWDrm:
+  case X86::PMOVZXWDrr:
+  case X86::VPMOVZXWDrm:
+  case X86::VPMOVZXWDrr:
+    SrcVT = MVT::v8i16;
+    DstVT = MVT::v4i32;
+    break;
+  case X86::VPMOVZXWDYrm:
+  case X86::VPMOVZXWDYrr:
+    SrcVT = MVT::v8i16;
+    DstVT = MVT::v8i32;
+    break;
+  case X86::PMOVZXWQrm:
+  case X86::PMOVZXWQrr:
+  case X86::VPMOVZXWQrm:
+  case X86::VPMOVZXWQrr:
+    SrcVT = MVT::v8i16;
+    DstVT = MVT::v2i64;
+    break;
+  case X86::VPMOVZXWQYrm:
+  case X86::VPMOVZXWQYrr:
+    SrcVT = MVT::v8i16;
+    DstVT = MVT::v4i64;
+    break;
+  // i32 zero extension
+  case X86::PMOVZXDQrm:
+  case X86::PMOVZXDQrr:
+  case X86::VPMOVZXDQrm:
+  case X86::VPMOVZXDQrr:
+    SrcVT = MVT::v4i32;
+    DstVT = MVT::v2i64;
+    break;
+  case X86::VPMOVZXDQYrm:
+  case X86::VPMOVZXDQYrr:
+    SrcVT = MVT::v4i32;
+    DstVT = MVT::v4i64;
+    break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Entrypoint
+//===----------------------------------------------------------------------===//
+
+/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
+/// newline terminated strings to the specified string if desired.  This
+/// information is shown in disassembly dumps when verbose assembly is enabled.
+bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                                  const char *(*getRegName)(unsigned)) {
+  // If this is a shuffle operation, the switch should fill in this state.
+  SmallVector<int, 8> ShuffleMask;
+  const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
+
+  switch (MI->getOpcode()) {
+  default:
+    // Not an instruction for which we can decode comments.
+    return false;
+
+  case X86::BLENDPDrri:
+  case X86::VBLENDPDrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::BLENDPDrmi:
+  case X86::VBLENDPDrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v2f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VBLENDPDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VBLENDPDYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v4f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::BLENDPSrri:
+  case X86::VBLENDPSrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::BLENDPSrmi:
+  case X86::VBLENDPSrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v4f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VBLENDPSYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VBLENDPSYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v8f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::PBLENDWrri:
+  case X86::VPBLENDWrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PBLENDWrmi:
+  case X86::VPBLENDWrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v8i16,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPBLENDWYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPBLENDWYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v16i16,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::VPBLENDDrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPBLENDDrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v4i32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::VPBLENDDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPBLENDDYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v8i32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::INSERTPSrm:
+  case X86::VINSERTPSrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeINSERTPSMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+                         ShuffleMask);
+    break;
+
+  case X86::MOVLHPSrr:
+  case X86::VMOVLHPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVLHPSMask(2, ShuffleMask);
+    break;
+
+  case X86::MOVHLPSrr:
+  case X86::VMOVHLPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVHLPSMask(2, ShuffleMask);
+    break;
+
+  case X86::MOVSLDUPrr:
+  case X86::VMOVSLDUPrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVSLDUPrm:
+  case X86::VMOVSLDUPrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask);
+    break;
+
+  case X86::VMOVSHDUPYrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VMOVSHDUPYrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask);
+    break;
+
+  case X86::VMOVSLDUPYrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VMOVSLDUPYrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask);
+    break;
+
+  case X86::MOVSHDUPrr:
+  case X86::VMOVSHDUPrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVSHDUPrm:
+  case X86::VMOVSHDUPrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask);
+    break;
+
+  case X86::VMOVDDUPYrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VMOVDDUPYrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVDDUPMask(MVT::v4f64, ShuffleMask);
+    break;
+
+  case X86::MOVDDUPrr:
+  case X86::VMOVDDUPrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVDDUPrm:
+  case X86::VMOVDDUPrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVDDUPMask(MVT::v2f64, ShuffleMask);
+    break;
+
+  case X86::PSLLDQri:
+  case X86::VPSLLDQri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSLLDQMask(MVT::v16i8,
+                       MI->getOperand(MI->getNumOperands()-1).getImm(),
+                       ShuffleMask);
+    break;
+
+  case X86::VPSLLDQYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSLLDQMask(MVT::v32i8,
+                       MI->getOperand(MI->getNumOperands()-1).getImm(),
+                       ShuffleMask);
+    break;
+
+  case X86::PSRLDQri:
+  case X86::VPSRLDQri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSRLDQMask(MVT::v16i8,
+                       MI->getOperand(MI->getNumOperands()-1).getImm(),
+                       ShuffleMask);
+    break;
+
+  case X86::VPSRLDQYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSRLDQMask(MVT::v32i8,
+                       MI->getOperand(MI->getNumOperands()-1).getImm(),
+                       ShuffleMask);
+    break;
+
+  case X86::PALIGNR128rr:
+  case X86::VPALIGNR128rr:
+    Src1Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PALIGNR128rm:
+  case X86::VPALIGNR128rm:
+    Src2Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePALIGNRMask(MVT::v16i8,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+  case X86::VPALIGNR256rr:
+    Src1Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPALIGNR256rm:
+    Src2Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePALIGNRMask(MVT::v32i8,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+
+  case X86::PSHUFDri:
+  case X86::VPSHUFDri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::PSHUFDmi:
+  case X86::VPSHUFDmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v4i32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    break;
+  case X86::VPSHUFDYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPSHUFDYmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v8i32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    break;
+
+
+  case X86::PSHUFHWri:
+  case X86::VPSHUFHWri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::PSHUFHWmi:
+  case X86::VPSHUFHWmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFHWMask(MVT::v8i16,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+  case X86::VPSHUFHWYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPSHUFHWYmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFHWMask(MVT::v16i16,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+  case X86::PSHUFLWri:
+  case X86::VPSHUFLWri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::PSHUFLWmi:
+  case X86::VPSHUFLWmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFLWMask(MVT::v8i16,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+  case X86::VPSHUFLWYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPSHUFLWYmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFLWMask(MVT::v16i16,
+                        MI->getOperand(MI->getNumOperands()-1).getImm(),
+                        ShuffleMask);
+    break;
+
+  case X86::PUNPCKHBWrr:
+  case X86::VPUNPCKHBWrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHBWrm:
+  case X86::VPUNPCKHBWrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v16i8, ShuffleMask);
+    break;
+  case X86::VPUNPCKHBWYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHBWYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v32i8, ShuffleMask);
+    break;
+  case X86::PUNPCKHWDrr:
+  case X86::VPUNPCKHWDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHWDrm:
+  case X86::VPUNPCKHWDrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v8i16, ShuffleMask);
+    break;
+  case X86::VPUNPCKHWDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHWDYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v16i16, ShuffleMask);
+    break;
+  case X86::PUNPCKHDQrr:
+  case X86::VPUNPCKHDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHDQrm:
+  case X86::VPUNPCKHDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v4i32, ShuffleMask);
+    break;
+  case X86::VPUNPCKHDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v8i32, ShuffleMask);
+    break;
+  case X86::VPUNPCKHDQZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHDQZrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v16i32, ShuffleMask);
+    break;
+  case X86::PUNPCKHQDQrr:
+  case X86::VPUNPCKHQDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHQDQrm:
+  case X86::VPUNPCKHQDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v2i64, ShuffleMask);
+    break;
+  case X86::VPUNPCKHQDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHQDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v4i64, ShuffleMask);
+    break;
+  case X86::VPUNPCKHQDQZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHQDQZrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v8i64, ShuffleMask);
+    break;
+
+  case X86::PUNPCKLBWrr:
+  case X86::VPUNPCKLBWrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLBWrm:
+  case X86::VPUNPCKLBWrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v16i8, ShuffleMask);
+    break;
+  case X86::VPUNPCKLBWYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLBWYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v32i8, ShuffleMask);
+    break;
+  case X86::PUNPCKLWDrr:
+  case X86::VPUNPCKLWDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLWDrm:
+  case X86::VPUNPCKLWDrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v8i16, ShuffleMask);
+    break;
+  case X86::VPUNPCKLWDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLWDYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v16i16, ShuffleMask);
+    break;
+  case X86::PUNPCKLDQrr:
+  case X86::VPUNPCKLDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLDQrm:
+  case X86::VPUNPCKLDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v4i32, ShuffleMask);
+    break;
+  case X86::VPUNPCKLDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v8i32, ShuffleMask);
+    break;
+  case X86::VPUNPCKLDQZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLDQZrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v16i32, ShuffleMask);
+    break;
+  case X86::PUNPCKLQDQrr:
+  case X86::VPUNPCKLQDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLQDQrm:
+  case X86::VPUNPCKLQDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v2i64, ShuffleMask);
+    break;
+  case X86::VPUNPCKLQDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLQDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v4i64, ShuffleMask);
+    break;
+  case X86::VPUNPCKLQDQZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLQDQZrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v8i64, ShuffleMask);
+    break;
+
+  case X86::SHUFPDrri:
+  case X86::VSHUFPDrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::SHUFPDrmi:
+  case X86::VSHUFPDrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeSHUFPMask(MVT::v2f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VSHUFPDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VSHUFPDYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeSHUFPMask(MVT::v4f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::SHUFPSrri:
+  case X86::VSHUFPSrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::SHUFPSrmi:
+  case X86::VSHUFPSrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeSHUFPMask(MVT::v4f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VSHUFPSYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VSHUFPSYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeSHUFPMask(MVT::v8f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::UNPCKLPDrr:
+  case X86::VUNPCKLPDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKLPDrm:
+  case X86::VUNPCKLPDrm:
+    DecodeUNPCKLMask(MVT::v2f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKLPDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKLPDYrm:
+    DecodeUNPCKLMask(MVT::v4f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKLPDZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKLPDZrm:
+    DecodeUNPCKLMask(MVT::v8f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::UNPCKLPSrr:
+  case X86::VUNPCKLPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKLPSrm:
+  case X86::VUNPCKLPSrm:
+    DecodeUNPCKLMask(MVT::v4f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKLPSYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKLPSYrm:
+    DecodeUNPCKLMask(MVT::v8f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKLPSZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKLPSZrm:
+    DecodeUNPCKLMask(MVT::v16f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::UNPCKHPDrr:
+  case X86::VUNPCKHPDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKHPDrm:
+  case X86::VUNPCKHPDrm:
+    DecodeUNPCKHMask(MVT::v2f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKHPDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPDYrm:
+    DecodeUNPCKHMask(MVT::v4f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKHPDZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPDZrm:
+    DecodeUNPCKHMask(MVT::v8f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::UNPCKHPSrr:
+  case X86::VUNPCKHPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKHPSrm:
+  case X86::VUNPCKHPSrm:
+    DecodeUNPCKHMask(MVT::v4f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKHPSYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPSYrm:
+    DecodeUNPCKHMask(MVT::v8f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKHPSZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPSZrm:
+    DecodeUNPCKHMask(MVT::v16f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPSri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPSmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v4f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPSYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPSYmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v8f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPDri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPDmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v2f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPDYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPDYmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSHUFMask(MVT::v4f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERM2F128rr:
+  case X86::VPERM2I128rr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPERM2F128rm:
+  case X86::VPERM2I128rm:
+    // For instruction comments purpose, assume the 256-bit vector is v4i64.
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeVPERM2X128Mask(MVT::v4i64,
+                           MI->getOperand(MI->getNumOperands()-1).getImm(),
+                           ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMQYri:
+  case X86::VPERMPDYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMQYmi:
+  case X86::VPERMPDYmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::MOVSDrr:
+  case X86::VMOVSDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVSDrm:
+  case X86::VMOVSDrm:
+    DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::MOVSSrr:
+  case X86::VMOVSSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVSSrm:
+  case X86::VMOVSSrm:
+    DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::MOVPQI2QIrr:
+  case X86::MOVZPQILo2PQIrr:
+  case X86::VMOVPQI2QIrr:
+  case X86::VMOVZPQILo2PQIrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+  // FALL THROUGH.
+  case X86::MOVQI2PQIrm:
+  case X86::MOVZQI2PQIrm:
+  case X86::MOVZPQILo2PQIrm:
+  case X86::VMOVQI2PQIrm:
+  case X86::VMOVZQI2PQIrm:
+  case X86::VMOVZPQILo2PQIrm:
+    DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::MOVDI2PDIrm:
+  case X86::VMOVDI2PDIrm:
+    DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::PMOVZXBWrr:
+  case X86::PMOVZXBDrr:
+  case X86::PMOVZXBQrr:
+  case X86::PMOVZXWDrr:
+  case X86::PMOVZXWQrr:
+  case X86::PMOVZXDQrr:
+  case X86::VPMOVZXBWrr:
+  case X86::VPMOVZXBDrr:
+  case X86::VPMOVZXBQrr:
+  case X86::VPMOVZXWDrr:
+  case X86::VPMOVZXWQrr:
+  case X86::VPMOVZXDQrr:
+  case X86::VPMOVZXBWYrr:
+  case X86::VPMOVZXBDYrr:
+  case X86::VPMOVZXBQYrr:
+  case X86::VPMOVZXWDYrr:
+  case X86::VPMOVZXWQYrr:
+  case X86::VPMOVZXDQYrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+  // FALL THROUGH.
+  case X86::PMOVZXBWrm:
+  case X86::PMOVZXBDrm:
+  case X86::PMOVZXBQrm:
+  case X86::PMOVZXWDrm:
+  case X86::PMOVZXWQrm:
+  case X86::PMOVZXDQrm:
+  case X86::VPMOVZXBWrm:
+  case X86::VPMOVZXBDrm:
+  case X86::VPMOVZXBQrm:
+  case X86::VPMOVZXWDrm:
+  case X86::VPMOVZXWQrm:
+  case X86::VPMOVZXDQrm:
+  case X86::VPMOVZXBWYrm:
+  case X86::VPMOVZXBDYrm:
+  case X86::VPMOVZXBQYrm:
+  case X86::VPMOVZXWDYrm:
+  case X86::VPMOVZXWQYrm:
+  case X86::VPMOVZXDQYrm: {
+    MVT SrcVT, DstVT;
+    getZeroExtensionTypes(MI, SrcVT, DstVT);
+    DecodeZeroExtendMask(SrcVT, DstVT, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+  } break;
+  }
+
+  // The only comments we decode are shuffles, so give up if we were unable to
+  // decode a shuffle mask.
+  if (ShuffleMask.empty())
+    return false;
+
+  if (!DestName) DestName = Src1Name;
+  OS << (DestName ? DestName : "mem") << " = ";
+
+  // If the two sources are the same, canonicalize the input elements to be
+  // from the first src so that we get larger element spans.
+  if (Src1Name == Src2Name) {
+    for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+      if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
+          ShuffleMask[i] >= (int)e)        // From second mask.
+        ShuffleMask[i] -= e;
+    }
+  }
+
+  // The shuffle mask specifies which elements of the src1/src2 fill in the
+  // destination, with a few sentinel values.  Loop through and print them
+  // out.
+  for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+    if (i != 0)
+      OS << ',';
+    if (ShuffleMask[i] == SM_SentinelZero) {
+      OS << "zero";
+      continue;
+    }
+
+    // Otherwise, it must come from src1 or src2.  Print the span of elements
+    // that comes from this src.
+    bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
+    const char *SrcName = isSrc1 ? Src1Name : Src2Name;
+    OS << (SrcName ? SrcName : "mem") << '[';
+    bool IsFirst = true;
+    while (i != e && (int)ShuffleMask[i] != SM_SentinelZero &&
+           (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
+      if (!IsFirst)
+        OS << ',';
+      else
+        IsFirst = false;
+      if (ShuffleMask[i] == SM_SentinelUndef)
+        OS << "u";
+      else
+        OS << ShuffleMask[i] % ShuffleMask.size();
+      ++i;
+    }
+    OS << ']';
+    --i;  // For loop increments element #.
+  }
+  //MI->print(OS, 0);
+  OS << "\n";
+
+  // We successfully added a comment to this instruction.
+  return true;
+}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 1c8466b..91d1828 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -50,33 +50,9 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
     EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
 }
 
-void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
-                                     raw_ostream &O) {
-  int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
-  switch (Imm) {
-  default: llvm_unreachable("Invalid ssecc argument!");
-  case    0: O << "eq"; break;
-  case    1: O << "lt"; break;
-  case    2: O << "le"; break;
-  case    3: O << "unord"; break;
-  case    4: O << "neq"; break;
-  case    5: O << "nlt"; break;
-  case    6: O << "nle"; break;
-  case    7: O << "ord"; break;
-  case    8: O << "eq_uq"; break;
-  case    9: O << "nge"; break;
-  case  0xa: O << "ngt"; break;
-  case  0xb: O << "false"; break;
-  case  0xc: O << "neq_oq"; break;
-  case  0xd: O << "ge"; break;
-  case  0xe: O << "gt"; break;
-  case  0xf: O << "true"; break;
-  }
-}
-
-void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
-                                     raw_ostream &O) {
-  int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+void X86IntelInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
+                                        raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm();
   switch (Imm) {
   default: llvm_unreachable("Invalid avxcc argument!");
   case    0: O << "eq"; break;
@@ -114,8 +90,24 @@ void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
   }
 }
 
+void X86IntelInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
+                                     raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm();
+  switch (Imm) {
+  default: llvm_unreachable("Invalid xopcc argument!");
+  case 0: O << "lt"; break;
+  case 1: O << "le"; break;
+  case 2: O << "gt"; break;
+  case 3: O << "ge"; break;
+  case 4: O << "eq"; break;
+  case 5: O << "neq"; break;
+  case 6: O << "false"; break;
+  case 7: O << "true"; break;
+  }
+}
+
 void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
-                                   raw_ostream &O) {
+                                               raw_ostream &O) {
   int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
   switch (Imm) {
   case 0: O << "{rn-sae}"; break;
@@ -168,21 +160,21 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
   const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
   const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
   const MCOperand &SegReg   = MI->getOperand(Op+X86::AddrSegmentReg);
-  
+
   // If this has a segment register, print it.
   if (SegReg.getReg()) {
     printOperand(MI, Op+X86::AddrSegmentReg, O);
     O << ':';
   }
-  
+
   O << '[';
-  
+
   bool NeedPlus = false;
   if (BaseReg.getReg()) {
     printOperand(MI, Op+X86::AddrBaseReg, O);
     NeedPlus = true;
   }
-  
+
   if (IndexReg.getReg()) {
     if (NeedPlus) O << " + ";
     if (ScaleVal != 1)
@@ -209,7 +201,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
       O << formatImm(DispVal);
     }
   }
-  
+
   O << ']';
 }
 
@@ -257,3 +249,8 @@ void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
 
   O << ']';
 }
+
+void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+                                     raw_ostream &O) {
+  O << formatImm(MI->getOperand(Op).getImm() & 0xff);
+}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index d082f0b..2150144 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -36,19 +36,24 @@ public:
 
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
-  void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O);
-  void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O);
   void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O);
+
+  void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
 
   void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     O << "opaque ptr ";
     printMemReference(MI, OpNo, O);
   }
-  
+
   void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     O << "byte ptr ";
     printMemReference(MI, OpNo, O);
@@ -152,7 +157,7 @@ public:
     printMemOffset(MI, OpNo, O);
   }
 };
-  
+
 }
 
 #endif
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index befa6c2..719b761 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -512,7 +512,7 @@ protected:
         // Defines a new offset for the CFA. E.g.
         //
         //  With frame:
-        //  
+        //
         //     pushq %rbp
         //  L0:
         //     .cfi_def_cfa_offset 16
@@ -682,7 +682,7 @@ private:
     //     4       3
     //     5       3
     //
-    for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
+    for (unsigned i = 0; i < RegCount; ++i) {
       int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
       if (CUReg == -1) return ~0U;
       SavedRegs[i] = CUReg;
@@ -777,39 +777,6 @@ public:
                                      MachO::CPU_TYPE_X86_64, Subtype);
   }
 
-  bool doesSectionRequireSymbols(const MCSection &Section) const override {
-    // Temporary labels in the string literals sections require symbols. The
-    // issue is that the x86_64 relocation format does not allow symbol +
-    // offset, and so the linker does not have enough information to resolve the
-    // access to the appropriate atom unless an external relocation is used. For
-    // non-cstring sections, we expect the compiler to use a non-temporary label
-    // for anything that could have an addend pointing outside the symbol.
-    //
-    // See <rdar://problem/4765733>.
-    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
-    return SMO.getType() == MachO::S_CSTRING_LITERALS;
-  }
-
-  bool isSectionAtomizable(const MCSection &Section) const override {
-    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
-    // Fixed sized data sections are uniqued, they cannot be diced into atoms.
-    switch (SMO.getType()) {
-    default:
-      return true;
-
-    case MachO::S_4BYTE_LITERALS:
-    case MachO::S_8BYTE_LITERALS:
-    case MachO::S_16BYTE_LITERALS:
-    case MachO::S_LITERAL_POINTERS:
-    case MachO::S_NON_LAZY_SYMBOL_POINTERS:
-    case MachO::S_LAZY_SYMBOL_POINTERS:
-    case MachO::S_MOD_INIT_FUNC_POINTERS:
-    case MachO::S_MOD_TERM_FUNC_POINTERS:
-    case MachO::S_INTERPOSING:
-      return false;
-    }
-  }
-
   /// \brief Generate the compact unwind encoding for the CFI instructions.
   uint32_t generateCompactUnwindEncoding(
                              ArrayRef<MCCFIInstruction> Instrs) const override {
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 365cf0c..d4698bf 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -302,19 +302,21 @@ namespace X86II {
 
     //// MRM_XX - A mod/rm byte of exactly 0xXX.
     MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35,
-    MRM_C4 = 36, MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39,
-    MRM_CB = 40, MRM_CF = 41, MRM_D0 = 42, MRM_D1 = 43,
-    MRM_D4 = 44, MRM_D5 = 45, MRM_D6 = 46, MRM_D7 = 47,
-    MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50, MRM_DB = 51,
-    MRM_DC = 52, MRM_DD = 53, MRM_DE = 54, MRM_DF = 55,
-    MRM_E0 = 56, MRM_E1 = 57, MRM_E2 = 58, MRM_E3 = 59,
-    MRM_E4 = 60, MRM_E5 = 61, MRM_E8 = 62, MRM_E9 = 63,
-    MRM_EA = 64, MRM_EB = 65, MRM_EC = 66, MRM_ED = 67,
-    MRM_EE = 68, MRM_F0 = 69, MRM_F1 = 70, MRM_F2 = 71,
-    MRM_F3 = 72, MRM_F4 = 73, MRM_F5 = 74, MRM_F6 = 75,
-    MRM_F7 = 76, MRM_F8 = 77, MRM_F9 = 78, MRM_FA = 79,
-    MRM_FB = 80, MRM_FC = 81, MRM_FD = 82, MRM_FE = 83,
-    MRM_FF = 84,
+    MRM_C4 = 36, MRM_C5 = 37, MRM_C6 = 38, MRM_C7 = 39,
+    MRM_C8 = 40, MRM_C9 = 41, MRM_CA = 42, MRM_CB = 43,
+    MRM_CC = 44, MRM_CD = 45, MRM_CE = 46, MRM_CF = 47,
+    MRM_D0 = 48, MRM_D1 = 49, MRM_D2 = 50, MRM_D3 = 51,
+    MRM_D4 = 52, MRM_D5 = 53, MRM_D6 = 54, MRM_D7 = 55,
+    MRM_D8 = 56, MRM_D9 = 57, MRM_DA = 58, MRM_DB = 59,
+    MRM_DC = 60, MRM_DD = 61, MRM_DE = 62, MRM_DF = 63,
+    MRM_E0 = 64, MRM_E1 = 65, MRM_E2 = 66, MRM_E3 = 67,
+    MRM_E4 = 68, MRM_E5 = 69, MRM_E6 = 70, MRM_E7 = 71,
+    MRM_E8 = 72, MRM_E9 = 73, MRM_EA = 74, MRM_EB = 75,
+    MRM_EC = 76, MRM_ED = 77, MRM_EE = 78, MRM_EF = 79,
+    MRM_F0 = 80, MRM_F1 = 81, MRM_F2 = 82, MRM_F3 = 83,
+    MRM_F4 = 84, MRM_F5 = 85, MRM_F6 = 86, MRM_F7 = 87,
+    MRM_F8 = 88, MRM_F9 = 89, MRM_FA = 90, MRM_FB = 91,
+    MRM_FC = 92, MRM_FD = 93, MRM_FE = 94, MRM_FF = 95,
 
     FormMask       = 127,
 
@@ -328,21 +330,28 @@ namespace X86II {
     OpSizeShift = 7,
     OpSizeMask = 0x3 << OpSizeShift,
 
-    OpSize16 = 1 << OpSizeShift,
-    OpSize32 = 2 << OpSizeShift,
+    OpSizeFixed = 0 << OpSizeShift,
+    OpSize16    = 1 << OpSizeShift,
+    OpSize32    = 2 << OpSizeShift,
 
-    // AsSize - Set if this instruction requires an operand size prefix (0x67),
-    // which most often indicates that the instruction address 16 bit address
-    // instead of 32 bit address (or 32 bit address in 64 bit mode).
+    // AsSize - AdSizeX implies this instruction determines its need of 0x67
+    // prefix from a normal ModRM memory operand. The other types indicate that
+    // an operand is encoded with a specific width and a prefix is needed if
+    // it differs from the current mode.
     AdSizeShift = OpSizeShift + 2,
-    AdSize      = 1 << AdSizeShift,
+    AdSizeMask  = 0x3 << AdSizeShift,
+
+    AdSizeX  = 1 << AdSizeShift,
+    AdSize16 = 1 << AdSizeShift,
+    AdSize32 = 2 << AdSizeShift,
+    AdSize64 = 3 << AdSizeShift,
 
     //===------------------------------------------------------------------===//
     // OpPrefix - There are several prefix bytes that are used as opcode
     // extensions. These are 0x66, 0xF3, and 0xF2. If this field is 0 there is
     // no prefix.
     //
-    OpPrefixShift = AdSizeShift + 1,
+    OpPrefixShift = AdSizeShift + 2,
     OpPrefixMask  = 0x7 << OpPrefixShift,
 
     // PS, PD - Prefix code for packed single and double precision vector
@@ -669,19 +678,10 @@ namespace X86II {
        return -1;
     case X86II::MRMDestMem:
       return 0;
-    case X86II::MRMSrcMem: {
-      unsigned FirstMemOp = 1;
-      if (HasVEX_4V)
-        ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
-      if (HasMemOp4)
-        ++FirstMemOp;// Skip the register source (which is encoded in I8IMM).
-      if (HasEVEX_K)
-        ++FirstMemOp;// Skip the mask register
-      // FIXME: Maybe lea should have its own form?  This is a horrible hack.
-      //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
-      //    Opcode == X86::LEA16r || Opcode == X86::LEA32r)
-      return FirstMemOp;
-    }
+    case X86II::MRMSrcMem:
+      // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
+      // mask register.
+      return 1 + HasVEX_4V + HasMemOp4 + HasEVEX_K;
     case X86II::MRMXr:
     case X86II::MRM0r: case X86II::MRM1r:
     case X86II::MRM2r: case X86II::MRM3r:
@@ -692,15 +692,9 @@ namespace X86II {
     case X86II::MRM0m: case X86II::MRM1m:
     case X86II::MRM2m: case X86II::MRM3m:
     case X86II::MRM4m: case X86II::MRM5m:
-    case X86II::MRM6m: case X86II::MRM7m: {
-      bool HasVEX_4V = TSFlags & X86II::VEX_4V;
-      unsigned FirstMemOp = 0;
-      if (HasVEX_4V)
-        ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
-      if (HasEVEX_K)
-        ++FirstMemOp;// Skip the mask register
-      return FirstMemOp;
-    }
+    case X86II::MRM6m: case X86II::MRM7m:
+      // Start from 0, skip registers encoded in VEX_VVVV or a mask register.
+      return 0 + HasVEX_4V + HasEVEX_K;
     case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
     case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
     case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
@@ -759,7 +753,7 @@ namespace X86II {
             (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31));
   }
 
-  
+
   inline bool isX86_64NonExtLowByteReg(unsigned reg) {
     return (reg == X86::SPL || reg == X86::BPL ||
             reg == X86::SIL || reg == X86::DIL);
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index be6a8e4..e8b0b4c 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -222,6 +222,9 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
         case MCSymbolRefExpr::VK_GOT:
           Type = ELF::R_386_GOT32;
           break;
+        case MCSymbolRefExpr::VK_PLT:
+          Type = ELF::R_386_PLT32;
+          break;
         case MCSymbolRefExpr::VK_GOTOFF:
           Type = ELF::R_386_GOTOFF;
           break;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 5679d63..e64b963 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -108,12 +108,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
   // Exceptions handling
   ExceptionsType = ExceptionHandling::DwarfCFI;
 
-  // OpenBSD and Bitrig have buggy support for .quad in 32-bit mode, just split
-  // into two .words.
-  if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) &&
-       T.getArch() == Triple::x86)
-    Data64bitsDirective = nullptr;
-
   // Always enable the integrated assembler by default.
   // Clang also enabled it when the OS is Solaris but that is redundant here.
   UseIntegratedAssembler = true;
@@ -135,9 +129,10 @@ void X86MCAsmInfoMicrosoft::anchor() { }
 X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
   if (Triple.getArch() == Triple::x86_64) {
     PrivateGlobalPrefix = ".L";
+    PrivateLabelPrefix = ".L";
     PointerSize = 8;
     WinEHEncodingType = WinEH::EncodingType::Itanium;
-    ExceptionsType = ExceptionHandling::ItaniumWinEH;
+    ExceptionsType = ExceptionHandling::WinEH;
   }
 
   AssemblerDialect = AsmWriterFlavor;
@@ -155,9 +150,10 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
   assert(Triple.isOSWindows() && "Windows is the only supported COFF target");
   if (Triple.getArch() == Triple::x86_64) {
     PrivateGlobalPrefix = ".L";
+    PrivateLabelPrefix = ".L";
     PointerSize = 8;
     WinEHEncodingType = WinEH::EncodingType::Itanium;
-    ExceptionsType = ExceptionHandling::ItaniumWinEH;
+    ExceptionsType = ExceptionHandling::WinEH;
   } else {
     ExceptionsType = ExceptionHandling::DwarfCFI;
   }
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index f2f06c3..deaad2a 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -23,7 +23,8 @@ namespace llvm {
   class Triple;
 
   class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
-    void anchor() override;
+    virtual void anchor();
+
   public:
     explicit X86MCAsmInfoDarwin(const Triple &Triple);
   };
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 31b8e2d..3ad8ab1 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -30,8 +30,8 @@ using namespace llvm;
 
 namespace {
 class X86MCCodeEmitter : public MCCodeEmitter {
-  X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
-  void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
+  void operator=(const X86MCCodeEmitter &) = delete;
   const MCInstrInfo &MCII;
   MCContext &Ctx;
 public:
@@ -590,6 +590,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
                                            int MemOperand, const MCInst &MI,
                                            const MCInstrDesc &Desc,
                                            raw_ostream &OS) const {
+  assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX.");
+
   uint64_t Encoding = TSFlags & X86II::EncodingMask;
   bool HasEVEX_K = TSFlags & X86II::EVEX_K;
   bool HasVEX_4V = TSFlags & X86II::VEX_4V;
@@ -721,7 +723,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     //  MemAddr, src1(VEX_4V), src2(ModR/M)
     //  MemAddr, src1(ModR/M), imm8
     //
-    if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + 
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
                                                  X86::AddrBaseReg).getReg()))
       VEX_B = 0x0;
     if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
@@ -863,7 +865,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
         EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3;
       }
       EncodeRC = true;
-    }      
+    }
     break;
   case X86II::MRMDestReg:
     // MRMDestReg instructions forms:
@@ -1109,6 +1111,10 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
                                                          : X86II::OpSize16))
     EmitByte(0x66, CurByte, OS);
 
+  // Emit the LOCK opcode prefix.
+  if (TSFlags & X86II::LOCK)
+    EmitByte(0xF0, CurByte, OS);
+
   switch (TSFlags & X86II::OpPrefixMask) {
   case X86II::PD:   // 66
     EmitByte(0x66, CurByte, OS);
@@ -1182,10 +1188,6 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
   if (MemoryOperand != -1) MemoryOperand += CurOp;
 
-  // Emit the lock opcode prefix as needed.
-  if (TSFlags & X86II::LOCK)
-    EmitByte(0xF0, CurByte, OS);
-
   // Emit segment override opcode prefix as needed.
   if (MemoryOperand >= 0)
     EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg,
@@ -1197,16 +1199,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
 
   // Emit the address size opcode prefix as needed.
   bool need_address_override;
-  // The AdSize prefix is only for 32-bit and 64-bit modes. Hm, perhaps we
-  // should introduce an AdSize16 bit instead of having seven special cases?
-  if ((!is16BitMode(STI) && TSFlags & X86II::AdSize) ||
-      (is16BitMode(STI) && (MI.getOpcode() == X86::JECXZ_32 ||
-                         MI.getOpcode() == X86::MOV8o8a ||
-                         MI.getOpcode() == X86::MOV16o16a ||
-                         MI.getOpcode() == X86::MOV32o32a ||
-                         MI.getOpcode() == X86::MOV8ao8 ||
-                         MI.getOpcode() == X86::MOV16ao16 ||
-                         MI.getOpcode() == X86::MOV32ao32))) {
+  uint64_t AdSize = TSFlags & X86II::AdSizeMask;
+  if ((is16BitMode(STI) && AdSize == X86II::AdSize32) ||
+      (is32BitMode(STI) && AdSize == X86II::AdSize16) ||
+      (is64BitMode(STI) && AdSize == X86II::AdSize32)) {
     need_address_override = true;
   } else if (MemoryOperand < 0) {
     need_address_override = false;
@@ -1430,83 +1426,31 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     break;
   }
   case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
-  case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
+  case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
+  case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
   case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+  case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
   case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
-  case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6:
-  case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9:
-  case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC:
-  case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF:
-  case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2:
-  case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5:
-  case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA:
-  case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED:
-  case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1:
-  case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4:
-  case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7:
-  case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA:
-  case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD:
-  case X86II::MRM_FE: case X86II::MRM_FF:
+  case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
+  case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
+  case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
+  case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
+  case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
+  case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
+  case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
+  case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
+  case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+  case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
+  case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
+  case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
+  case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
+  case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
+  case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
+  case X86II::MRM_FF:
     EmitByte(BaseOpcode, CurByte, OS);
 
-    unsigned char MRM;
-    switch (TSFlags & X86II::FormMask) {
-    default: llvm_unreachable("Invalid Form");
-    case X86II::MRM_C0: MRM = 0xC0; break;
-    case X86II::MRM_C1: MRM = 0xC1; break;
-    case X86II::MRM_C2: MRM = 0xC2; break;
-    case X86II::MRM_C3: MRM = 0xC3; break;
-    case X86II::MRM_C4: MRM = 0xC4; break;
-    case X86II::MRM_C8: MRM = 0xC8; break;
-    case X86II::MRM_C9: MRM = 0xC9; break;
-    case X86II::MRM_CA: MRM = 0xCA; break;
-    case X86II::MRM_CB: MRM = 0xCB; break;
-    case X86II::MRM_CF: MRM = 0xCF; break;
-    case X86II::MRM_D0: MRM = 0xD0; break;
-    case X86II::MRM_D1: MRM = 0xD1; break;
-    case X86II::MRM_D4: MRM = 0xD4; break;
-    case X86II::MRM_D5: MRM = 0xD5; break;
-    case X86II::MRM_D6: MRM = 0xD6; break;
-    case X86II::MRM_D7: MRM = 0xD7; break;
-    case X86II::MRM_D8: MRM = 0xD8; break;
-    case X86II::MRM_D9: MRM = 0xD9; break;
-    case X86II::MRM_DA: MRM = 0xDA; break;
-    case X86II::MRM_DB: MRM = 0xDB; break;
-    case X86II::MRM_DC: MRM = 0xDC; break;
-    case X86II::MRM_DD: MRM = 0xDD; break;
-    case X86II::MRM_DE: MRM = 0xDE; break;
-    case X86II::MRM_DF: MRM = 0xDF; break;
-    case X86II::MRM_E0: MRM = 0xE0; break;
-    case X86II::MRM_E1: MRM = 0xE1; break;
-    case X86II::MRM_E2: MRM = 0xE2; break;
-    case X86II::MRM_E3: MRM = 0xE3; break;
-    case X86II::MRM_E4: MRM = 0xE4; break;
-    case X86II::MRM_E5: MRM = 0xE5; break;
-    case X86II::MRM_E8: MRM = 0xE8; break;
-    case X86II::MRM_E9: MRM = 0xE9; break;
-    case X86II::MRM_EA: MRM = 0xEA; break;
-    case X86II::MRM_EB: MRM = 0xEB; break;
-    case X86II::MRM_EC: MRM = 0xEC; break;
-    case X86II::MRM_ED: MRM = 0xED; break;
-    case X86II::MRM_EE: MRM = 0xEE; break;
-    case X86II::MRM_F0: MRM = 0xF0; break;
-    case X86II::MRM_F1: MRM = 0xF1; break;
-    case X86II::MRM_F2: MRM = 0xF2; break;
-    case X86II::MRM_F3: MRM = 0xF3; break;
-    case X86II::MRM_F4: MRM = 0xF4; break;
-    case X86II::MRM_F5: MRM = 0xF5; break;
-    case X86II::MRM_F6: MRM = 0xF6; break;
-    case X86II::MRM_F7: MRM = 0xF7; break;
-    case X86II::MRM_F8: MRM = 0xF8; break;
-    case X86II::MRM_F9: MRM = 0xF9; break;
-    case X86II::MRM_FA: MRM = 0xFA; break;
-    case X86II::MRM_FB: MRM = 0xFB; break;
-    case X86II::MRM_FC: MRM = 0xFC; break;
-    case X86II::MRM_FD: MRM = 0xFD; break;
-    case X86II::MRM_FE: MRM = 0xFE; break;
-    case X86II::MRM_FF: MRM = 0xFF; break;
-    }
-    EmitByte(MRM, CurByte, OS);
+    uint64_t Form = TSFlags & X86II::FormMask;
+    EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS);
     break;
   }
 
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 5a9181d..0e7b4e5 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -134,18 +134,13 @@ bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
             "c" (subleaf));
     return false;
   #elif defined(_MSC_VER)
-    // __cpuidex was added in MSVC++ 9.0 SP1
-    #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
-      int registers[4];
-      __cpuidex(registers, value, subleaf);
-      *rEAX = registers[0];
-      *rEBX = registers[1];
-      *rECX = registers[2];
-      *rEDX = registers[3];
-      return false;
-    #else
-      return true;
-    #endif
+    int registers[4];
+    __cpuidex(registers, value, subleaf);
+    *rEAX = registers[0];
+    *rEBX = registers[1];
+    *rECX = registers[2];
+    *rEDX = registers[3];
+    return false;
   #else
     return true;
   #endif
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index aef9571..d8320b9 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -40,8 +40,8 @@ namespace DWARFFlavour {
   enum {
     X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
   };
-} 
-  
+}
+
 /// N86 namespace - Native X86 register numbers
 ///
 namespace N86 {
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 5685a7f..7a83f4c 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -10,6 +10,7 @@
 #include "MCTargetDesc/X86MCTargetDesc.h"
 #include "MCTargetDesc/X86FixupKinds.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
@@ -47,23 +48,21 @@ class X86MachObjectWriter : public MCMachObjectTargetWriter {
                               const MCFixup &Fixup,
                               MCValue Target,
                               uint64_t &FixedValue);
-  void RecordX86_64Relocation(MachObjectWriter *Writer,
-                              const MCAssembler &Asm,
+  void RecordX86_64Relocation(MachObjectWriter *Writer, MCAssembler &Asm,
                               const MCAsmLayout &Layout,
-                              const MCFragment *Fragment,
-                              const MCFixup &Fixup,
-                              MCValue Target,
-                              uint64_t &FixedValue);
+                              const MCFragment *Fragment, const MCFixup &Fixup,
+                              MCValue Target, uint64_t &FixedValue);
+
 public:
   X86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
                       uint32_t CPUSubtype)
     : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
                                /*UseAggressiveSymbolFolding=*/Is64Bit) {}
 
-  void RecordRelocation(MachObjectWriter *Writer,
-                        const MCAssembler &Asm, const MCAsmLayout &Layout,
-                        const MCFragment *Fragment, const MCFixup &Fixup,
-                        MCValue Target, uint64_t &FixedValue) override {
+  void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override {
     if (Writer->is64Bit())
       RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
                              FixedValue);
@@ -97,13 +96,10 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
   }
 }
 
-void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
-                                                 const MCAssembler &Asm,
-                                                 const MCAsmLayout &Layout,
-                                                 const MCFragment *Fragment,
-                                                 const MCFixup &Fixup,
-                                                 MCValue Target,
-                                                 uint64_t &FixedValue) {
+void X86MachObjectWriter::RecordX86_64Relocation(
+    MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    uint64_t &FixedValue) {
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
   unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind());
   unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
@@ -117,6 +113,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
   unsigned Index = 0;
   unsigned IsExtern = 0;
   unsigned Type = 0;
+  const MCSymbolData *RelSymbol = nullptr;
 
   Value = Target.getConstant();
 
@@ -132,7 +129,6 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
   if (Target.isAbsolute()) { // constant
     // SymbolNum of 0 indicates the absolute section.
     Type = MachO::X86_64_RELOC_UNSIGNED;
-    Index = 0;
 
     // FIXME: I believe this is broken, I don't think the linker can understand
     // it. I think it would require a local relocation, but I'm not sure if that
@@ -184,7 +180,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
     if (A->isUndefined() || B->isUndefined()) {
       StringRef Name = A->isUndefined() ? A->getName() : B->getName();
       Asm.getContext().FatalError(Fixup.getLoc(),
-        "unsupported relocation with subtraction expression, symbol '" + 
+        "unsupported relocation with subtraction expression, symbol '" +
         Name + "' can not be undefined in a subtraction expression");
     }
 
@@ -193,38 +189,30 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
     Value -= Writer->getSymbolAddress(&B_SD, Layout) -
       (!B_Base ? 0 : Writer->getSymbolAddress(B_Base, Layout));
 
-    if (A_Base) {
-      Index = A_Base->getIndex();
-      IsExtern = 1;
-    }
-    else {
+    if (!A_Base)
       Index = A_SD.getFragment()->getParent()->getOrdinal() + 1;
-      IsExtern = 0;
-    }
     Type = MachO::X86_64_RELOC_UNSIGNED;
 
     MachO::any_relocation_info MRE;
     MRE.r_word0 = FixupOffset;
-    MRE.r_word1 = ((Index     <<  0) |
-                   (IsPCRel   << 24) |
-                   (Log2Size  << 25) |
-                   (IsExtern  << 27) |
-                   (Type      << 28));
-    Writer->addRelocation(Fragment->getParent(), MRE);
-
-    if (B_Base) {
-      Index = B_Base->getIndex();
-      IsExtern = 1;
-    }
-    else {
+    MRE.r_word1 =
+        (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+    Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
+
+    if (B_Base)
+      RelSymbol = B_Base;
+    else
       Index = B_SD.getFragment()->getParent()->getOrdinal() + 1;
-      IsExtern = 0;
-    }
     Type = MachO::X86_64_RELOC_SUBTRACTOR;
   } else {
     const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+    if (Symbol->isTemporary() && Value) {
+      const MCSection &Sec = Symbol->getSection();
+      if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
+        Asm.addLocalUsedInReloc(*Symbol);
+    }
     const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
-    const MCSymbolData *Base = Asm.getAtom(&SD);
+    RelSymbol = Asm.getAtom(&SD);
 
     // Relocations inside debug sections always use local relocations when
     // possible. This seems to be done because the debugger doesn't fully
@@ -234,23 +222,20 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
       const MCSectionMachO &Section = static_cast<const MCSectionMachO&>(
         Fragment->getParent()->getSection());
       if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
-        Base = nullptr;
+        RelSymbol = nullptr;
     }
 
     // x86_64 almost always uses external relocations, except when there is no
     // symbol to use as a base address (a local symbol with no preceding
     // non-local symbol).
-    if (Base) {
-      Index = Base->getIndex();
-      IsExtern = 1;
-
+    if (RelSymbol) {
       // Add the local offset, if needed.
-      if (Base != &SD)
-        Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
+      if (RelSymbol != &SD)
+        Value +=
+            Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(RelSymbol);
     } else if (Symbol->isInSection() && !Symbol->isVariable()) {
       // The index is the section ordinal (1-based).
       Index = SD.getFragment()->getParent()->getOrdinal() + 1;
-      IsExtern = 0;
       Value += Writer->getSymbolAddress(&SD, Layout);
 
       if (IsPCRel)
@@ -349,12 +334,9 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
   // struct relocation_info (8 bytes)
   MachO::any_relocation_info MRE;
   MRE.r_word0 = FixupOffset;
-  MRE.r_word1 = ((Index     <<  0) |
-                 (IsPCRel   << 24) |
-                 (Log2Size  << 25) |
-                 (IsExtern  << 27) |
-                 (Type      << 28));
-  Writer->addRelocation(Fragment->getParent(), MRE);
+  MRE.r_word1 = (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+                (IsExtern << 27) | (Type << 28);
+  Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
 }
 
 bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
@@ -426,7 +408,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
                    (IsPCRel                   << 30) |
                    MachO::R_SCATTERED);
     MRE.r_word1 = Value2;
-    Writer->addRelocation(Fragment->getParent(), MRE);
+    Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
   } else {
     // If the offset is more than 24-bits, it won't fit in a scattered
     // relocation offset field, so we fall back to using a non-scattered
@@ -448,7 +430,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
                  (IsPCRel     << 30) |
                  MachO::R_SCATTERED);
   MRE.r_word1 = Value;
-  Writer->addRelocation(Fragment->getParent(), MRE);
+  Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
   return true;
 }
 
@@ -469,7 +451,6 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
 
   // Get the symbol data.
   const MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol());
-  unsigned Index = SD_A->getIndex();
 
   // We're only going to have a second symbol in pic mode and it'll be a
   // subtraction from the picbase. For 32-bit pic the addend is the difference
@@ -492,12 +473,9 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
   // struct relocation_info (8 bytes)
   MachO::any_relocation_info MRE;
   MRE.r_word0 = Value;
-  MRE.r_word1 = ((Index                    <<  0) |
-                 (IsPCRel                  << 24) |
-                 (Log2Size                 << 25) |
-                 (1                        << 27) | // r_extern
-                 (MachO::GENERIC_RELOC_TLV << 28)); // r_type
-  Writer->addRelocation(Fragment->getParent(), MRE);
+  MRE.r_word1 =
+      (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28);
+  Writer->addRelocation(SD_A, Fragment->getParent(), MRE);
 }
 
 void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
@@ -548,8 +526,8 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
   // See <reloc.h>.
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
   unsigned Index = 0;
-  unsigned IsExtern = 0;
   unsigned Type = 0;
+  const MCSymbolData *RelSymbol = nullptr;
 
   if (Target.isAbsolute()) { // constant
     // SymbolNum of 0 indicates the absolute section.
@@ -570,8 +548,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
 
     // Check whether we need an external or internal relocation.
     if (Writer->doesSymbolRequireExternRelocation(SD)) {
-      IsExtern = 1;
-      Index = SD->getIndex();
+      RelSymbol = SD;
       // For external relocations, make sure to offset the fixup value to
       // compensate for the addend of the symbol address, if it was
       // undefined. This occurs with weak definitions, for example.
@@ -593,12 +570,9 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
   // struct relocation_info (8 bytes)
   MachO::any_relocation_info MRE;
   MRE.r_word0 = FixupOffset;
-  MRE.r_word1 = ((Index     <<  0) |
-                 (IsPCRel   << 24) |
-                 (Log2Size  << 25) |
-                 (IsExtern  << 27) |
-                 (Type      << 28));
-  Writer->addRelocation(Fragment->getParent(), MRE);
+  MRE.r_word1 =
+      (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+  Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
 }
 
 MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 40af822..e1df5c2 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -28,7 +28,8 @@ namespace {
     virtual ~X86WinCOFFObjectWriter();
 
     unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
-                          bool IsCrossSection) const override;
+                          bool IsCrossSection,
+                          const MCAsmBackend &MAB) const override;
   };
 }
 
@@ -40,7 +41,8 @@ X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
 
 unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
                                               const MCFixup &Fixup,
-                                              bool IsCrossSection) const {
+                                              bool IsCrossSection,
+                                              const MCAsmBackend &MAB) const {
   unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind();
 
   MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
index 1ea8798..fceb083 100644
--- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -13,7 +13,7 @@ using namespace llvm;
 
 Target llvm::TheX86_32Target, llvm::TheX86_64Target;
 
-extern "C" void LLVMInitializeX86TargetInfo() { 
+extern "C" void LLVMInitializeX86TargetInfo() {
   RegisterTarget<Triple::x86, /*HasJIT=*/true>
     X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above");
 
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index ba6cbc8..a7101e4 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -1,395 +1,434 @@
-//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Define several functions to decode x86 specific shuffle semantics into a
-// generic vector mask.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86ShuffleDecode.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/CodeGen/MachineValueType.h"
-
-//===----------------------------------------------------------------------===//
-//  Vector Mask Decoding
-//===----------------------------------------------------------------------===//
-
-namespace llvm {
-
-void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
-  // Defaults the copying the dest value.
-  ShuffleMask.push_back(0);
-  ShuffleMask.push_back(1);
-  ShuffleMask.push_back(2);
-  ShuffleMask.push_back(3);
-
-  // Decode the immediate.
-  unsigned ZMask = Imm & 15;
-  unsigned CountD = (Imm >> 4) & 3;
-  unsigned CountS = (Imm >> 6) & 3;
-
-  // CountS selects which input element to use.
-  unsigned InVal = 4+CountS;
-  // CountD specifies which element of destination to update.
-  ShuffleMask[CountD] = InVal;
-  // ZMask zaps values, potentially overriding the CountD elt.
-  if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero;
-  if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero;
-  if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero;
-  if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
-}
-
-// <3,1> or <6,7,2,3>
-void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
-  for (unsigned i = NElts/2; i != NElts; ++i)
-    ShuffleMask.push_back(NElts+i);
-
-  for (unsigned i = NElts/2; i != NElts; ++i)
-    ShuffleMask.push_back(i);
-}
-
-// <0,2> or <0,1,4,5>
-void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
-  for (unsigned i = 0; i != NElts/2; ++i)
-    ShuffleMask.push_back(i);
-
-  for (unsigned i = 0; i != NElts/2; ++i)
-    ShuffleMask.push_back(NElts+i);
-}
-
-void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
-  unsigned NumElts = VT.getVectorNumElements();
-  for (int i = 0, e = NumElts / 2; i < e; ++i) {
-    ShuffleMask.push_back(2 * i);
-    ShuffleMask.push_back(2 * i);
-  }
-}
-
-void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
-  unsigned NumElts = VT.getVectorNumElements();
-  for (int i = 0, e = NumElts / 2; i < e; ++i) {
-    ShuffleMask.push_back(2 * i + 1);
-    ShuffleMask.push_back(2 * i + 1);
-  }
-}
-
-void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
-  unsigned VectorSizeInBits = VT.getSizeInBits();
-  unsigned NumElts = VectorSizeInBits / 8;
-  unsigned NumLanes = VectorSizeInBits / 128;
-  unsigned NumLaneElts = NumElts / NumLanes;
-
-  for (unsigned l = 0; l < NumElts; l += NumLaneElts)
-    for (unsigned i = 0; i < NumLaneElts; ++i) {
-      int M = SM_SentinelZero;
-      if (i >= Imm) M = i - Imm + l;
-      ShuffleMask.push_back(M);
-    }
-}
-
-void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
-  unsigned VectorSizeInBits = VT.getSizeInBits();
-  unsigned NumElts = VectorSizeInBits / 8;
-  unsigned NumLanes = VectorSizeInBits / 128;
-  unsigned NumLaneElts = NumElts / NumLanes;
-
-  for (unsigned l = 0; l < NumElts; l += NumLaneElts)
-    for (unsigned i = 0; i < NumLaneElts; ++i) {
-      unsigned Base = i + Imm;
-      int M = Base + l;
-      if (Base >= NumLaneElts) M = SM_SentinelZero;
-      ShuffleMask.push_back(M);
-    }
-}
-
-void DecodePALIGNRMask(MVT VT, unsigned Imm,
-                       SmallVectorImpl<int> &ShuffleMask) {
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8);
-
-  unsigned NumLanes = VT.getSizeInBits() / 128;
-  unsigned NumLaneElts = NumElts / NumLanes;
-
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0; i != NumLaneElts; ++i) {
-      unsigned Base = i + Offset;
-      // if i+offset is out of this lane then we actually need the other source
-      if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
-      ShuffleMask.push_back(Base + l);
-    }
-  }
-}
-
-/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
-void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  unsigned NumLanes = VT.getSizeInBits() / 128;
-  unsigned NumLaneElts = NumElts / NumLanes;
-
-  unsigned NewImm = Imm;
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0; i != NumLaneElts; ++i) {
-      ShuffleMask.push_back(NewImm % NumLaneElts + l);
-      NewImm /= NumLaneElts;
-    }
-    if (NumLaneElts == 4) NewImm = Imm; // reload imm
-  }
-}
-
-void DecodePSHUFHWMask(MVT VT, unsigned Imm,
-                       SmallVectorImpl<int> &ShuffleMask) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  for (unsigned l = 0; l != NumElts; l += 8) {
-    unsigned NewImm = Imm;
-    for (unsigned i = 0, e = 4; i != e; ++i) {
-      ShuffleMask.push_back(l + i);
-    }
-    for (unsigned i = 4, e = 8; i != e; ++i) {
-      ShuffleMask.push_back(l + 4 + (NewImm & 3));
-      NewImm >>= 2;
-    }
-  }
-}
-
-void DecodePSHUFLWMask(MVT VT, unsigned Imm,
-                       SmallVectorImpl<int> &ShuffleMask) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  for (unsigned l = 0; l != NumElts; l += 8) {
-    unsigned NewImm = Imm;
-    for (unsigned i = 0, e = 4; i != e; ++i) {
-      ShuffleMask.push_back(l + (NewImm & 3));
-      NewImm >>= 2;
-    }
-    for (unsigned i = 4, e = 8; i != e; ++i) {
-      ShuffleMask.push_back(l + i);
-    }
-  }
-}
-
-/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
-/// the type of the vector allowing it to handle different datatypes and vector
-/// widths.
-void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  unsigned NumLanes = VT.getSizeInBits() / 128;
-  unsigned NumLaneElts = NumElts / NumLanes;
-
-  unsigned NewImm = Imm;
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    // each half of a lane comes from different source
-    for (unsigned s = 0; s != NumElts*2; s += NumElts) {
-      for (unsigned i = 0; i != NumLaneElts/2; ++i) {
-        ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
-        NewImm /= NumLaneElts;
-      }
-    }
-    if (NumLaneElts == 4) NewImm = Imm; // reload imm
-  }
-}
-
-/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
-/// and punpckh*. VT indicates the type of the vector allowing it to handle
-/// different datatypes and vector widths.
-void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits() / 128;
-  if (NumLanes == 0 ) NumLanes = 1;  // Handle MMX
-  unsigned NumLaneElts = NumElts / NumLanes;
-
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) {
-      ShuffleMask.push_back(i);          // Reads from dest/src1
-      ShuffleMask.push_back(i+NumElts);  // Reads from src/src2
-    }
-  }
-}
-
-/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
-/// and punpckl*. VT indicates the type of the vector allowing it to handle
-/// different datatypes and vector widths.
-void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits() / 128;
-  if (NumLanes == 0 ) NumLanes = 1;  // Handle MMX
-  unsigned NumLaneElts = NumElts / NumLanes;
-
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) {
-      ShuffleMask.push_back(i);          // Reads from dest/src1
-      ShuffleMask.push_back(i+NumElts);  // Reads from src/src2
-    }
-  }
-}
-
-void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
-                          SmallVectorImpl<int> &ShuffleMask) {
-  if (Imm & 0x88)
-    return; // Not a shuffle
-
-  unsigned HalfSize = VT.getVectorNumElements()/2;
-
-  for (unsigned l = 0; l != 2; ++l) {
-    unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize;
-    for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i)
-      ShuffleMask.push_back(i);
-  }
-}
-
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
-  assert(MaskTy->getVectorElementType()->isIntegerTy(8) &&
-         "Expected i8 constant mask elements!");
-  int NumElements = MaskTy->getVectorNumElements();
-  // FIXME: Add support for AVX-512.
-  assert((NumElements == 16 || NumElements == 32) &&
-         "Only 128-bit and 256-bit vectors supported!");
-  ShuffleMask.reserve(NumElements);
-
-  if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
-    assert((unsigned)NumElements == CDS->getNumElements() &&
-           "Constant mask has a different number of elements!");
-
-    for (int i = 0; i < NumElements; ++i) {
-      // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
-      // lane of the vector we're inside.
-      int Base = i < 16 ? 0 : 16;
-      uint64_t Element = CDS->getElementAsInteger(i);
-      // If the high bit (7) of the byte is set, the element is zeroed.
-      if (Element & (1 << 7))
-        ShuffleMask.push_back(SM_SentinelZero);
-      else {
-        // Only the least significant 4 bits of the byte are used.
-        int Index = Base + (Element & 0xf);
-        ShuffleMask.push_back(Index);
-      }
-    }
-  } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
-    assert((unsigned)NumElements == CV->getNumOperands() &&
-           "Constant mask has a different number of elements!");
-
-    for (int i = 0; i < NumElements; ++i) {
-      // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
-      // lane of the vector we're inside.
-      int Base = i < 16 ? 0 : 16;
-      Constant *COp = CV->getOperand(i);
-      if (isa<UndefValue>(COp)) {
-        ShuffleMask.push_back(SM_SentinelUndef);
-        continue;
-      }
-      uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
-      // If the high bit (7) of the byte is set, the element is zeroed.
-      if (Element & (1 << 7))
-        ShuffleMask.push_back(SM_SentinelZero);
-      else {
-        // Only the least significant 4 bits of the byte are used.
-        int Index = Base + (Element & 0xf);
-        ShuffleMask.push_back(Index);
-      }
-    }
-  }
-}
-
-void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
-                      SmallVectorImpl<int> &ShuffleMask) {
-  for (int i = 0, e = RawMask.size(); i < e; ++i) {
-    uint64_t M = RawMask[i];
-    if (M == (uint64_t)SM_SentinelUndef) {
-      ShuffleMask.push_back(M);
-      continue;
-    }
-    // For AVX vectors with 32 bytes the base of the shuffle is the half of
-    // the vector we're inside.
-    int Base = i < 16 ? 0 : 16;
-    // If the high bit (7) of the byte is set, the element is zeroed.
-    if (M & (1 << 7))
-      ShuffleMask.push_back(SM_SentinelZero);
-    else {
-      // Only the least significant 4 bits of the byte are used.
-      int Index = Base + (M & 0xf);
-      ShuffleMask.push_back(Index);
-    }
-  }
-}
-
-void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
-  int ElementBits = VT.getScalarSizeInBits();
-  int NumElements = VT.getVectorNumElements();
-  for (int i = 0; i < NumElements; ++i) {
-    // If there are more than 8 elements in the vector, then any immediate blend
-    // mask applies to each 128-bit lane. There can never be more than
-    // 8 elements in a 128-bit lane with an immediate blend.
-    int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
-    assert(Bit < 8 &&
-           "Immediate blends only operate over 8 elements at a time!");
-    ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
-  }
-}
-
-/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
-/// No VT provided since it only works on 256-bit, 4 element vectors.
-void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
-  for (unsigned i = 0; i != 4; ++i) {
-    ShuffleMask.push_back((Imm >> (2*i)) & 3);
-  }
-}
-
-void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
-  assert(MaskTy->getVectorElementType()->isIntegerTy() &&
-         "Expected integer constant mask elements!");
-  int ElementBits = MaskTy->getScalarSizeInBits();
-  int NumElements = MaskTy->getVectorNumElements();
-  assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
-         "Unexpected number of vector elements.");
-  ShuffleMask.reserve(NumElements);
-  if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
-    assert((unsigned)NumElements == CDS->getNumElements() &&
-           "Constant mask has a different number of elements!");
-
-    for (int i = 0; i < NumElements; ++i) {
-      int Base = (i * ElementBits / 128) * (128 / ElementBits);
-      uint64_t Element = CDS->getElementAsInteger(i);
-      // Only the least significant 2 bits of the integer are used.
-      int Index = Base + (Element & 0x3);
-      ShuffleMask.push_back(Index);
-    }
-  } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
-    assert((unsigned)NumElements == C->getNumOperands() &&
-           "Constant mask has a different number of elements!");
-
-    for (int i = 0; i < NumElements; ++i) {
-      int Base = (i * ElementBits / 128) * (128 / ElementBits);
-      Constant *COp = CV->getOperand(i);
-      if (isa<UndefValue>(COp)) {
-        ShuffleMask.push_back(SM_SentinelUndef);
-        continue;
-      }
-      uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
-      // Only the least significant 2 bits of the integer are used.
-      int Index = Base + (Element & 0x3);
-      ShuffleMask.push_back(Index);
-    }
-  }
-}
-
-} // llvm namespace
+//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecode.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/CodeGen/MachineValueType.h"
+
+//===----------------------------------------------------------------------===//
+//  Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  // Defaults the copying the dest value.
+  ShuffleMask.push_back(0);
+  ShuffleMask.push_back(1);
+  ShuffleMask.push_back(2);
+  ShuffleMask.push_back(3);
+
+  // Decode the immediate.
+  unsigned ZMask = Imm & 15;
+  unsigned CountD = (Imm >> 4) & 3;
+  unsigned CountS = (Imm >> 6) & 3;
+
+  // CountS selects which input element to use.
+  unsigned InVal = 4+CountS;
+  // CountD specifies which element of destination to update.
+  ShuffleMask[CountD] = InVal;
+  // ZMask zaps values, potentially overriding the CountD elt.
+  if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero;
+  if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero;
+  if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero;
+  if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
+}
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = NElts/2; i != NElts; ++i)
+    ShuffleMask.push_back(NElts+i);
+
+  for (unsigned i = NElts/2; i != NElts; ++i)
+    ShuffleMask.push_back(i);
+}
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = 0; i != NElts/2; ++i)
+    ShuffleMask.push_back(i);
+
+  for (unsigned i = 0; i != NElts/2; ++i)
+    ShuffleMask.push_back(NElts+i);
+}
+
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  for (int i = 0, e = NumElts / 2; i < e; ++i) {
+    ShuffleMask.push_back(2 * i);
+    ShuffleMask.push_back(2 * i);
+  }
+}
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  for (int i = 0, e = NumElts / 2; i < e; ++i) {
+    ShuffleMask.push_back(2 * i + 1);
+    ShuffleMask.push_back(2 * i + 1);
+  }
+}
+
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned VectorSizeInBits = VT.getSizeInBits();
+  unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VectorSizeInBits / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+  unsigned NumLaneSubElts = 64 / ScalarSizeInBits;
+
+  for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+    for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts)
+      for (unsigned s = 0; s != NumLaneSubElts; s++)
+        ShuffleMask.push_back(l + s);
+}
+
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned VectorSizeInBits = VT.getSizeInBits();
+  unsigned NumElts = VectorSizeInBits / 8;
+  unsigned NumLanes = VectorSizeInBits / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+    for (unsigned i = 0; i < NumLaneElts; ++i) {
+      int M = SM_SentinelZero;
+      if (i >= Imm) M = i - Imm + l;
+      ShuffleMask.push_back(M);
+    }
+}
+
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned VectorSizeInBits = VT.getSizeInBits();
+  unsigned NumElts = VectorSizeInBits / 8;
+  unsigned NumLanes = VectorSizeInBits / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+    for (unsigned i = 0; i < NumLaneElts; ++i) {
+      unsigned Base = i + Imm;
+      int M = Base + l;
+      if (Base >= NumLaneElts) M = SM_SentinelZero;
+      ShuffleMask.push_back(M);
+    }
+}
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8);
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      unsigned Base = i + Offset;
+      // if i+offset is out of this lane then we actually need the other source
+      if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
+      ShuffleMask.push_back(Base + l);
+    }
+  }
+}
+
+/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  unsigned NewImm = Imm;
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      ShuffleMask.push_back(NewImm % NumLaneElts + l);
+      NewImm /= NumLaneElts;
+    }
+    if (NumLaneElts == 4) NewImm = Imm; // reload imm
+  }
+}
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    unsigned NewImm = Imm;
+    for (unsigned i = 0, e = 4; i != e; ++i) {
+      ShuffleMask.push_back(l + i);
+    }
+    for (unsigned i = 4, e = 8; i != e; ++i) {
+      ShuffleMask.push_back(l + 4 + (NewImm & 3));
+      NewImm >>= 2;
+    }
+  }
+}
+
+void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    unsigned NewImm = Imm;
+    for (unsigned i = 0, e = 4; i != e; ++i) {
+      ShuffleMask.push_back(l + (NewImm & 3));
+      NewImm >>= 2;
+    }
+    for (unsigned i = 4, e = 8; i != e; ++i) {
+      ShuffleMask.push_back(l + i);
+    }
+  }
+}
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  unsigned NewImm = Imm;
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    // each half of a lane comes from different source
+    for (unsigned s = 0; s != NumElts*2; s += NumElts) {
+      for (unsigned i = 0; i != NumLaneElts/2; ++i) {
+        ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
+        NewImm /= NumLaneElts;
+      }
+    }
+    if (NumLaneElts == 4) NewImm = Imm; // reload imm
+  }
+}
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  if (NumLanes == 0 ) NumLanes = 1;  // Handle MMX
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) {
+      ShuffleMask.push_back(i);          // Reads from dest/src1
+      ShuffleMask.push_back(i+NumElts);  // Reads from src/src2
+    }
+  }
+}
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  if (NumLanes == 0 ) NumLanes = 1;  // Handle MMX
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) {
+      ShuffleMask.push_back(i);          // Reads from dest/src1
+      ShuffleMask.push_back(i+NumElts);  // Reads from src/src2
+    }
+  }
+}
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+                          SmallVectorImpl<int> &ShuffleMask) {
+  if (Imm & 0x88)
+    return; // Not a shuffle
+
+  unsigned HalfSize = VT.getVectorNumElements()/2;
+
+  for (unsigned l = 0; l != 2; ++l) {
+    unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize;
+    for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i)
+      ShuffleMask.push_back(i);
+  }
+}
+
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  // It is not an error for the PSHUFB mask to not be a vector of i8 because the
+  // constant pool uniques constants by their bit representation.
+  // e.g. the following take up the same space in the constant pool:
+  //   i128 -170141183420855150465331762880109871104
+  //
+  //   <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
+  //
+  //   <4 x i32> <i32 -2147483648, i32 -2147483648,
+  //              i32 -2147483648, i32 -2147483648>
+
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+
+  if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512.
+    return;
+
+  // This is a straightforward byte vector.
+  if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) {
+    int NumElements = MaskTy->getVectorNumElements();
+    ShuffleMask.reserve(NumElements);
+
+    for (int i = 0; i < NumElements; ++i) {
+      // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
+      // lane of the vector we're inside.
+      int Base = i < 16 ? 0 : 16;
+      Constant *COp = C->getAggregateElement(i);
+      if (!COp) {
+        ShuffleMask.clear();
+        return;
+      } else if (isa<UndefValue>(COp)) {
+        ShuffleMask.push_back(SM_SentinelUndef);
+        continue;
+      }
+      uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+      // If the high bit (7) of the byte is set, the element is zeroed.
+      if (Element & (1 << 7))
+        ShuffleMask.push_back(SM_SentinelZero);
+      else {
+        // Only the least significant 4 bits of the byte are used.
+        int Index = Base + (Element & 0xf);
+        ShuffleMask.push_back(Index);
+      }
+    }
+  }
+  // TODO: Handle funny-looking vectors too.
+}
+
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  for (int i = 0, e = RawMask.size(); i < e; ++i) {
+    uint64_t M = RawMask[i];
+    if (M == (uint64_t)SM_SentinelUndef) {
+      ShuffleMask.push_back(M);
+      continue;
+    }
+    // For AVX vectors with 32 bytes the base of the shuffle is the half of
+    // the vector we're inside.
+    int Base = i < 16 ? 0 : 16;
+    // If the high bit (7) of the byte is set, the element is zeroed.
+    if (M & (1 << 7))
+      ShuffleMask.push_back(SM_SentinelZero);
+    else {
+      // Only the least significant 4 bits of the byte are used.
+      int Index = Base + (M & 0xf);
+      ShuffleMask.push_back(Index);
+    }
+  }
+}
+
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  int ElementBits = VT.getScalarSizeInBits();
+  int NumElements = VT.getVectorNumElements();
+  for (int i = 0; i < NumElements; ++i) {
+    // If there are more than 8 elements in the vector, then any immediate blend
+    // mask applies to each 128-bit lane. There can never be more than
+    // 8 elements in a 128-bit lane with an immediate blend.
+    int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
+    assert(Bit < 8 &&
+           "Immediate blends only operate over 8 elements at a time!");
+    ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
+  }
+}
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = 0; i != 4; ++i) {
+    ShuffleMask.push_back((Imm >> (2*i)) & 3);
+  }
+}
+
+void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
+  assert(MaskTy->getVectorElementType()->isIntegerTy() &&
+         "Expected integer constant mask elements!");
+  int ElementBits = MaskTy->getScalarSizeInBits();
+  int NumElements = MaskTy->getVectorNumElements();
+  assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+         "Unexpected number of vector elements.");
+  ShuffleMask.reserve(NumElements);
+  if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+    assert((unsigned)NumElements == CDS->getNumElements() &&
+           "Constant mask has a different number of elements!");
+
+    for (int i = 0; i < NumElements; ++i) {
+      int Base = (i * ElementBits / 128) * (128 / ElementBits);
+      uint64_t Element = CDS->getElementAsInteger(i);
+      // Only the least significant 2 bits of the integer are used.
+      int Index = Base + (Element & 0x3);
+      ShuffleMask.push_back(Index);
+    }
+  } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
+    assert((unsigned)NumElements == C->getNumOperands() &&
+           "Constant mask has a different number of elements!");
+
+    for (int i = 0; i < NumElements; ++i) {
+      int Base = (i * ElementBits / 128) * (128 / ElementBits);
+      Constant *COp = CV->getOperand(i);
+      if (isa<UndefValue>(COp)) {
+        ShuffleMask.push_back(SM_SentinelUndef);
+        continue;
+      }
+      uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+      // Only the least significant 2 bits of the integer are used.
+      int Index = Base + (Element & 0x3);
+      ShuffleMask.push_back(Index);
+    }
+  }
+}
+
+void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
+  unsigned NumDstElts = DstVT.getVectorNumElements();
+  unsigned SrcScalarBits = SrcVT.getScalarSizeInBits();
+  unsigned DstScalarBits = DstVT.getScalarSizeInBits();
+  unsigned Scale = DstScalarBits / SrcScalarBits;
+  assert(SrcScalarBits < DstScalarBits &&
+         "Expected zero extension mask to increase scalar size");
+  assert(SrcVT.getVectorNumElements() >= NumDstElts &&
+         "Too many zero extension lanes");
+
+  for (unsigned i = 0; i != NumDstElts; i++) {
+    Mask.push_back(i);
+    for (unsigned j = 1; j != Scale; j++)
+      Mask.push_back(SM_SentinelZero);
+  }
+}
+
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  ShuffleMask.push_back(0);
+  for (unsigned i = 1; i < NumElts; i++)
+    ShuffleMask.push_back(SM_SentinelZero);
+}
+
+void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
+  // First element comes from the first element of second source.
+  // Remaining elements: Load zero extends / Move copies from first source.
+  unsigned NumElts = VT.getVectorNumElements();
+  Mask.push_back(NumElts);
+  for (unsigned i = 1; i < NumElts; i++)
+    Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
+}
+} // llvm namespace
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 6ba3c64..5c9a8cf 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -1,93 +1,105 @@
-//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Define several functions to decode x86 specific shuffle semantics into a
-// generic vector mask.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
-#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/ArrayRef.h"
-
-//===----------------------------------------------------------------------===//
-//  Vector Mask Decoding
-//===----------------------------------------------------------------------===//
-
-namespace llvm {
-class Constant;
-class MVT;
-
-enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
-
-void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-
-// <3,1> or <6,7,2,3>
-void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
-
-// <0,2> or <0,1,4,5>
-void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
-
-void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
-
-void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
-
-void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-
-void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-
-void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-
-void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-
-void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-
-void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-
-/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
-/// the type of the vector allowing it to handle different datatypes and vector
-/// widths.
-void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-
-/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
-/// and punpckh*. VT indicates the type of the vector allowing it to handle
-/// different datatypes and vector widths.
-void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
-
-/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
-/// and punpckl*. VT indicates the type of the vector allowing it to handle
-/// different datatypes and vector widths.
-void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
-
-/// \brief Decode a PSHUFB mask from an IR-level vector constant.
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
-
-/// \brief Decode a PSHUFB mask from a raw array of constants such as from
-/// BUILD_VECTOR.
-void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
-                      SmallVectorImpl<int> &ShuffleMask);
-
-/// \brief Decode a BLEND immediate mask into a shuffle mask.
-void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-
-void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
-                          SmallVectorImpl<int> &ShuffleMask);
-
-/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
-/// No VT provided since it only works on 256-bit, 4 element vectors.
-void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-
-/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
-void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
-
-} // llvm namespace
-
-#endif
+//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ArrayRef.h"
+
+//===----------------------------------------------------------------------===//
+//  Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class Constant;
+class MVT;
+
+enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a PSHUFB mask from an IR-level vector constant.
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a PSHUFB mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a BLEND immediate mask into a shuffle mask.
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+                          SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
+void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a zero extension instruction as a shuffle mask.
+void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT,
+                          SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a move lower and zero upper instruction as a shuffle mask.
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a scalar float move instruction as a shuffle mask.
+void DecodeScalarMoveMask(MVT VT, bool IsLoad,
+                          SmallVectorImpl<int> &ShuffleMask);
+} // llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 8bd5817..8b0a4cf 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -55,9 +55,6 @@ FunctionPass *createX86IssueVZeroUpperPass();
 ///
 FunctionPass *createEmitX86CodeToMemory();
 
-/// \brief Creates an X86-specific Target Transformation Info pass.
-ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM);
-
 /// createX86PadShortFunctions - Return a pass that pads short functions
 /// with NOOPs. This will prevent a stall when returning on the Atom.
 FunctionPass *createX86PadShortFunctions();
@@ -67,6 +64,11 @@ FunctionPass *createX86PadShortFunctions();
 /// to eliminate execution delays in some Atom processors.
 FunctionPass *createX86FixupLEAs();
 
+/// createX86CallFrameOptimization - Return a pass that optimizes
+/// the code-size of x86 call sequences. This is done by replacing
+/// esp-relative movs with pushes.
+FunctionPass *createX86CallFrameOptimization();
+
 } // End llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 83f55d3..4f9836d 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -79,9 +79,16 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
                                        "Bit testing of memory is slow">;
 def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
                                        "SHLD instruction is slow">;
+// FIXME: This is a 16-byte (SSE/AVX) feature; we should rename it to make that
+// explicit. Also, it seems this would be the default state for most chips
+// going forward, so it would probably be better to negate the logic and
+// match the 32-byte "slow mem" feature below.
 def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
                                         "IsUAMemFast", "true",
                                         "Fast unaligned memory access">;
+def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
+                            "IsUAMem32Slow", "true",
+                            "Slow unaligned 32-byte memory access">;
 def FeatureSSE4A   : SubtargetFeature<"sse4a", "HasSSE4A", "true",
                                       "Support SSE 4a instructions",
                                       [FeatureSSE3]>;
@@ -125,9 +132,9 @@ def FeatureFMA4    : SubtargetFeature<"fma4", "HasFMA4", "true",
 def FeatureXOP     : SubtargetFeature<"xop", "HasXOP", "true",
                                       "Enable XOP instructions",
                                       [FeatureFMA4]>;
-def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
-                                          "HasVectorUAMem", "true",
-                 "Allow unaligned memory operands on vector/SIMD instructions">;
+def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
+                                          "HasSSEUnalignedMem", "true",
+                      "Allow unaligned memory operands with SSE instructions">;
 def FeatureAES     : SubtargetFeature<"aes", "HasAES", "true",
                                       "Enable AES instructions",
                                       [FeatureSSE2]>;
@@ -157,19 +164,18 @@ def FeatureADX     : SubtargetFeature<"adx", "HasADX", "true",
 def FeatureSHA     : SubtargetFeature<"sha", "HasSHA", "true",
                                       "Enable SHA instructions",
                                       [FeatureSSE2]>;
-def FeatureSGX     : SubtargetFeature<"sgx", "HasSGX", "true",
-                                      "Support SGX instructions">;
 def FeaturePRFCHW  : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
                                       "Support PRFCHW instructions">;
 def FeatureRDSEED  : SubtargetFeature<"rdseed", "HasRDSEED", "true",
                                       "Support RDSEED instruction">;
-def FeatureSMAP    : SubtargetFeature<"smap", "HasSMAP", "true",
-                                      "Support SMAP instructions">;
 def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
                                      "Use LEA for adjusting the stack pointer">;
-def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
-                                     "HasSlowDivide", "true",
-                                     "Use small divide for positive values less than 256">;
+def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
+                                     "HasSlowDivide32", "true",
+                                     "Use 8-bit divide for positive values less than 256">;
+def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
+                                     "HasSlowDivide64", "true",
+                                     "Use 16-bit divide for positive values less than 65536">;
 def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
                                      "PadShortFunctions", "true",
                                      "Pad short functions">;
@@ -230,86 +236,166 @@ def : ProcessorModel<"core2", SandyBridgeModel,
 def : ProcessorModel<"penryn", SandyBridgeModel,
                      [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
 
-// Atom.
-def : ProcessorModel<"atom", AtomModel,
-                     [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
-                      FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
-                      FeatureSlowDivide,
-                      FeatureCallRegIndirect,
-                      FeatureLEAUsesAG,
-                      FeaturePadShortFunctions]>;
-
-// Atom Silvermont.
-def : ProcessorModel<"slm",  SLMModel, [ProcIntelSLM,
-                               FeatureSSE42, FeatureCMPXCHG16B,
-                               FeatureMOVBE, FeaturePOPCNT,
-                               FeaturePCLMUL, FeatureAES,
-                               FeatureCallRegIndirect,
-                               FeaturePRFCHW,
-                               FeatureSlowLEA, FeatureSlowIncDec,
-                               FeatureSlowBTMem, FeatureFastUAMem]>;
+// Atom CPUs.
+class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
+                                   ProcIntelAtom,
+                                   FeatureSSSE3,
+                                   FeatureCMPXCHG16B,
+                                   FeatureMOVBE,
+                                   FeatureSlowBTMem,
+                                   FeatureLeaForSP,
+                                   FeatureSlowDivide32,
+                                   FeatureSlowDivide64,
+                                   FeatureCallRegIndirect,
+                                   FeatureLEAUsesAG,
+                                   FeaturePadShortFunctions
+                                 ]>;
+def : BonnellProc<"bonnell">;
+def : BonnellProc<"atom">; // Pin the generic name to the baseline.
+
+class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
+                                      ProcIntelSLM,
+                                      FeatureSSE42,
+                                      FeatureCMPXCHG16B,
+                                      FeatureMOVBE,
+                                      FeaturePOPCNT,
+                                      FeaturePCLMUL,
+                                      FeatureAES,
+                                      FeatureSlowDivide64,
+                                      FeatureCallRegIndirect,
+                                      FeaturePRFCHW,
+                                      FeatureSlowLEA,
+                                      FeatureSlowIncDec,
+                                      FeatureSlowBTMem,
+                                      FeatureFastUAMem
+                                    ]>;
+def : SilvermontProc<"silvermont">;
+def : SilvermontProc<"slm">; // Legacy alias.
+
 // "Arrandale" along with corei3 and corei5
-def : ProcessorModel<"corei7", SandyBridgeModel,
-                     [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
-                      FeatureFastUAMem, FeaturePOPCNT, FeatureAES]>;
+class NehalemProc<string Name, list<SubtargetFeature> AdditionalFeatures>
+    : ProcessorModel<Name, SandyBridgeModel, !listconcat([
+                                                           FeatureSSE42,
+                                                           FeatureCMPXCHG16B,
+                                                           FeatureSlowBTMem,
+                                                           FeatureFastUAMem,
+                                                           FeaturePOPCNT
+                                                         ],
+                                                         AdditionalFeatures)>;
+def : NehalemProc<"nehalem", []>;
+def : NehalemProc<"corei7", [FeatureAES]>;
 
-def : ProcessorModel<"nehalem", SandyBridgeModel,
-                     [FeatureSSE42,  FeatureCMPXCHG16B, FeatureSlowBTMem,
-                      FeatureFastUAMem, FeaturePOPCNT]>;
 // Westmere is a similar machine to nehalem with some additional features.
 // Westmere is the corei3/i5/i7 path from nehalem to sandybridge
-def : ProcessorModel<"westmere", SandyBridgeModel,
-                     [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
-                      FeatureFastUAMem, FeaturePOPCNT, FeatureAES,
-                      FeaturePCLMUL]>;
-// Sandy Bridge
+class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+                                    FeatureSSE42,
+                                    FeatureCMPXCHG16B,
+                                    FeatureSlowBTMem,
+                                    FeatureFastUAMem,
+                                    FeaturePOPCNT,
+                                    FeatureAES,
+                                    FeaturePCLMUL
+                                  ]>;
+def : WestmereProc<"westmere">;
+
 // SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
 // rather than a superset.
-def : ProcessorModel<"corei7-avx", SandyBridgeModel,
-                     [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
-                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
-// Ivy Bridge
-def : ProcessorModel<"core-avx-i", SandyBridgeModel,
-                     [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
-                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
-                      FeatureF16C, FeatureFSGSBase]>;
-
-// Haswell
-def : ProcessorModel<"core-avx2", HaswellModel,
-                     [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem,
-                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
-                      FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT,
-                      FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM,
-                      FeatureHLE, FeatureSlowIncDec]>;
-
-// Broadwell
-def : ProcessorModel<"broadwell", HaswellModel,
-                     [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem,
-                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
-                      FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT,
-                      FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM,
-                      FeatureHLE, FeatureADX, FeatureRDSEED, FeatureSMAP,
-                      FeatureSlowIncDec]>;
-// KNL
+class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+                                       FeatureAVX,
+                                       FeatureCMPXCHG16B,
+                                       FeatureFastUAMem,
+                                       FeatureSlowUAMem32,
+                                       FeaturePOPCNT,
+                                       FeatureAES,
+                                       FeaturePCLMUL
+                                     ]>;
+def : SandyBridgeProc<"sandybridge">;
+def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
+
+class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+                                     FeatureAVX,
+                                     FeatureCMPXCHG16B,
+                                     FeatureFastUAMem,
+                                     FeatureSlowUAMem32,
+                                     FeaturePOPCNT,
+                                     FeatureAES,
+                                     FeaturePCLMUL,
+                                     FeatureRDRAND,
+                                     FeatureF16C,
+                                     FeatureFSGSBase
+                                   ]>;
+def : IvyBridgeProc<"ivybridge">;
+def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
+
+class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [
+                                   FeatureAVX2,
+                                   FeatureCMPXCHG16B,
+                                   FeatureFastUAMem,
+                                   FeaturePOPCNT,
+                                   FeatureAES,
+                                   FeaturePCLMUL,
+                                   FeatureRDRAND,
+                                   FeatureF16C,
+                                   FeatureFSGSBase,
+                                   FeatureMOVBE,
+                                   FeatureLZCNT,
+                                   FeatureBMI,
+                                   FeatureBMI2,
+                                   FeatureFMA,
+                                   FeatureRTM,
+                                   FeatureHLE,
+                                   FeatureSlowIncDec
+                                 ]>;
+def : HaswellProc<"haswell">;
+def : HaswellProc<"core-avx2">; // Legacy alias.
+
+class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [
+                                     FeatureAVX2,
+                                     FeatureCMPXCHG16B,
+                                     FeatureFastUAMem,
+                                     FeaturePOPCNT,
+                                     FeatureAES,
+                                     FeaturePCLMUL,
+                                     FeatureRDRAND,
+                                     FeatureF16C,
+                                     FeatureFSGSBase,
+                                     FeatureMOVBE,
+                                     FeatureLZCNT,
+                                     FeatureBMI,
+                                     FeatureBMI2,
+                                     FeatureFMA,
+                                     FeatureRTM,
+                                     FeatureHLE,
+                                     FeatureADX,
+                                     FeatureRDSEED,
+                                     FeatureSlowIncDec
+                                   ]>;
+def : BroadwellProc<"broadwell">;
+
 // FIXME: define KNL model
-def : ProcessorModel<"knl", HaswellModel,
+class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel,
                      [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI,
                       FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
                       FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
                       FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
                       FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
                       FeatureSlowIncDec]>;
+def : KnightsLandingProc<"knl">;
 
-// SKX
 // FIXME: define SKX model
-def : ProcessorModel<"skx", HaswellModel,
+class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel,
                      [FeatureAVX512, FeatureCDI,
                       FeatureDQI, FeatureBWI, FeatureVLX,
                       FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
                       FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
                       FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
                       FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
-                      FeatureSlowIncDec, FeatureSGX]>;
+                      FeatureSlowIncDec]>;
+def : SkylakeProc<"skylake">;
+def : SkylakeProc<"skx">; // Legacy alias.
+
+
+// AMD CPUs.
 
 def : Proc<"k6",              [FeatureMMX]>;
 def : Proc<"k6-2",            [Feature3DNow]>;
@@ -318,7 +404,7 @@ def : Proc<"athlon",          [Feature3DNowA, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
 def : Proc<"athlon-tbird",    [Feature3DNowA, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
-def : Proc<"athlon-4",        [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem, 
+def : Proc<"athlon-4",        [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
 def : Proc<"athlon-xp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
@@ -342,6 +428,10 @@ def : Proc<"amdfam10",        [FeatureSSE4A,
                                Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
                                FeaturePOPCNT, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
+def : Proc<"barcelona",       [FeatureSSE4A,
+                               Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
 // Bobcat
 def : Proc<"btver1",          [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
                                FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT,
@@ -352,8 +442,10 @@ def : ProcessorModel<"btver2", BtVer2Model,
                      [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
                       FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                       FeatureBMI, FeatureF16C, FeatureMOVBE,
-                      FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD,
-                      FeatureUseSqrtEst, FeatureUseRecipEst]>;
+                      FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
+                      FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
+
+// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
 
 // Bulldozer
 def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
@@ -394,7 +486,7 @@ def : Proc<"c3-2",            [FeatureSSE1]>;
 // be good for modern chips without enabling instruction set encodings past the
 // basic SSE2 and 64-bit ones. It disables slow things from any mainstream and
 // modern 64-bit x86 chip, and enables features that are generally beneficial.
-// 
+//
 // We currently use the Sandy Bridge model as the default scheduling model as
 // we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which
 // covers a huge swath of x86 processors. If there are specific scheduling
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 4e5b7b8..bb0b9ce 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -47,6 +47,8 @@ using namespace llvm;
 /// runOnMachineFunction - Emit the function body.
 ///
 bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<X86Subtarget>();
+
   SMShadowTracker.startFunction(MF);
 
   SetupMachineFunction(MF);
@@ -505,13 +507,15 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 }
 
 void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
-  if (Subtarget->isTargetMacho())
+  Triple TT(TM.getTargetTriple());
+
+  if (TT.isOSBinFormatMachO())
     OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
 
-  if (Subtarget->isTargetCOFF()) {
+  if (TT.isOSBinFormatCOFF()) {
     // Emit an absolute @feat.00 symbol.  This appears to be some kind of
     // compiler features bitfield read by link.exe.
-    if (!Subtarget->is64Bit()) {
+    if (TT.getArch() == Triple::x86) {
       MCSymbol *S = MMI->getContext().GetOrCreateSymbol(StringRef("@feat.00"));
       OutStreamer.BeginCOFFSymbolDef(S);
       OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
@@ -558,8 +562,7 @@ MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const {
     const MachineConstantPoolEntry &CPE =
         MF->getConstantPool()->getConstants()[CPID];
     if (!CPE.isMachineConstantPoolEntry()) {
-      SectionKind Kind =
-          CPE.getSectionKind(TM.getSubtargetImpl()->getDataLayout());
+      SectionKind Kind = CPE.getSectionKind(TM.getDataLayout());
       const Constant *C = CPE.Val.ConstVal;
       if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
             getObjFileLowering().getSectionForConstant(Kind, C))) {
@@ -579,20 +582,21 @@ void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) {
   SmallString<128> Directive;
   raw_svector_ostream OS(Directive);
   StringRef Name = Sym->getName();
+  Triple TT(TM.getTargetTriple());
 
-  if (Subtarget->isTargetKnownWindowsMSVC())
+  if (TT.isKnownWindowsMSVCEnvironment())
     OS << " /EXPORT:";
   else
     OS << " -export:";
 
-  if ((Subtarget->isTargetWindowsGNU() || Subtarget->isTargetWindowsCygwin()) &&
+  if ((TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) &&
       (Name[0] == getDataLayout().getGlobalPrefix()))
     Name = Name.drop_front();
 
   OS << Name;
 
   if (IsData) {
-    if (Subtarget->isTargetKnownWindowsMSVC())
+    if (TT.isKnownWindowsMSVCEnvironment())
       OS << ",DATA";
     else
       OS << ",data";
@@ -603,10 +607,12 @@ void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) {
 }
 
 void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
-  if (Subtarget->isTargetMacho()) {
+  Triple TT(TM.getTargetTriple());
+
+  if (TT.isOSBinFormatMachO()) {
     // All darwin targets use mach-o.
     MachineModuleInfoMachO &MMIMacho =
-      MMI->getObjFileInfo<MachineModuleInfoMachO>();
+        MMI->getObjFileInfo<MachineModuleInfoMachO>();
 
     // Output stubs for dynamically-linked functions.
     MachineModuleInfoMachO::SymbolListTy Stubs;
@@ -677,22 +683,23 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
     OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
   }
 
-  if (Subtarget->isTargetKnownWindowsMSVC() && MMI->usesVAFloatArgument()) {
-    StringRef SymbolName = Subtarget->is64Bit() ? "_fltused" : "__fltused";
+  if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) {
+    StringRef SymbolName =
+        (TT.getArch() == Triple::x86_64) ? "_fltused" : "__fltused";
     MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName);
     OutStreamer.EmitSymbolAttribute(S, MCSA_Global);
   }
 
-  if (Subtarget->isTargetCOFF()) {
+  if (TT.isOSBinFormatCOFF()) {
     // Necessary for dllexport support
     std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals;
 
     for (const auto &Function : M)
-      if (Function.hasDLLExportStorageClass())
+      if (Function.hasDLLExportStorageClass() && !Function.isDeclaration())
         DLLExportedFns.push_back(getSymbol(&Function));
 
     for (const auto &Global : M.globals())
-      if (Global.hasDLLExportStorageClass())
+      if (Global.hasDLLExportStorageClass() && !Global.isDeclaration())
         DLLExportedGlobals.push_back(getSymbol(&Global));
 
     for (const auto &Alias : M.aliases()) {
@@ -719,7 +726,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
     }
   }
 
-  if (Subtarget->isTargetELF()) {
+  if (TT.isOSBinFormatELF()) {
     const TargetLoweringObjectFileELF &TLOFELF =
       static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
 
@@ -729,7 +736,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
     MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
     if (!Stubs.empty()) {
       OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
-      const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+      const DataLayout *TD = TM.getDataLayout();
 
       for (const auto &Stub : Stubs) {
         OutStreamer.EmitLabel(Stub.first);
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 748b948..d101b8c 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -57,6 +57,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
     void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI);
   private:
     TargetMachine &TM;
+    const MachineFunction *MF;
     std::unique_ptr<MCCodeEmitter> CodeEmitter;
     bool InShadow;
 
@@ -85,10 +86,9 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
   void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI);
 
  public:
-  explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer), SM(*this), SMShadowTracker(TM) {
-    Subtarget = &TM.getSubtarget<X86Subtarget>();
-  }
+   explicit X86AsmPrinter(TargetMachine &TM,
+                          std::unique_ptr<MCStreamer> Streamer)
+       : AsmPrinter(TM, std::move(Streamer)), SM(*this), SMShadowTracker(TM) {}
 
   const char *getPassName() const override {
     return "X86 Assembly / Object Emitter";
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
new file mode 100644
index 0000000..5e8d374
--- /dev/null
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -0,0 +1,480 @@
+//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that optimizes call sequences on x86.
+// Currently, it converts movs of function parameters onto the stack into
+// pushes. This is beneficial for two main reasons:
+// 1) The push instruction encoding is much smaller than an esp-relative mov
+// 2) It is possible to push memory arguments directly. So, if the
+//    the transformation is preformed pre-reg-alloc, it can help relieve
+//    register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cf-opt"
+
+static cl::opt<bool>
+    NoX86CFOpt("no-x86-call-frame-opt",
+               cl::desc("Avoid optimizing x86 call frames for size"),
+               cl::init(false), cl::Hidden);
+
+namespace {
+class X86CallFrameOptimization : public MachineFunctionPass {
+public:
+  X86CallFrameOptimization() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  // Information we know about a particular call site
+  struct CallContext {
+    CallContext()
+        : Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
+          MovVector(4, nullptr), NoStackParams(false), UsePush(false){};
+
+    // Actuall call instruction
+    MachineInstr *Call;
+
+    // A copy of the stack pointer
+    MachineInstr *SPCopy;
+
+    // The total displacement of all passed parameters
+    int64_t ExpectedDist;
+
+    // The sequence of movs used to pass the parameters
+    SmallVector<MachineInstr *, 4> MovVector;
+
+    // True if this call site has no stack parameters
+    bool NoStackParams;
+
+    // True of this callsite can use push instructions
+    bool UsePush;
+  };
+
+  typedef DenseMap<MachineInstr *, CallContext> ContextMap;
+
+  bool isLegal(MachineFunction &MF);
+  
+  bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap);
+
+  void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I, CallContext &Context);
+
+  bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock::iterator I,
+                          const CallContext &Context);
+
+  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
+                                   unsigned Reg);
+
+  const char *getPassName() const override { return "X86 Optimize Call Frame"; }
+
+  const TargetInstrInfo *TII;
+  const TargetFrameLowering *TFL;
+  const MachineRegisterInfo *MRI;
+  static char ID;
+};
+
+char X86CallFrameOptimization::ID = 0;
+}
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+  return new X86CallFrameOptimization();
+}
+
+// This checks whether the transformation is legal. 
+// Also returns false in cases where it's potentially legal, but
+// we don't even want to try.
+bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
+  if (NoX86CFOpt.getValue())
+    return false;
+
+  // We currently only support call sequences where *all* parameters.
+  // are passed on the stack.
+  // No point in running this in 64-bit mode, since some arguments are
+  // passed in-register in all common calling conventions, so the pattern
+  // we're looking for will never match.
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  if (STI.is64Bit())
+    return false;
+
+  // You would expect straight-line code between call-frame setup and
+  // call-frame destroy. You would be wrong. There are circumstances (e.g.
+  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
+  // end up with the setup and the destroy in different basic blocks.
+  // This is bad, and breaks SP adjustment.
+  // So, check that all of the frames in the function are closed inside
+  // the same block, and, for good measure, that there are no nested frames.
+  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+  for (MachineBasicBlock &BB : MF) {
+    bool InsideFrameSequence = false;
+    for (MachineInstr &MI : BB) {
+      if (MI.getOpcode() == FrameSetupOpcode) {
+        if (InsideFrameSequence)
+          return false;
+        InsideFrameSequence = true;
+      } else if (MI.getOpcode() == FrameDestroyOpcode) {
+        if (!InsideFrameSequence)
+          return false;
+        InsideFrameSequence = false;
+      }
+    }
+
+    if (InsideFrameSequence)
+      return false;
+  }
+
+  return true;
+}
+
+// Check whether this trasnformation is profitable for a particular
+// function - in terms of code size.
+bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, 
+  ContextMap &CallSeqMap) {
+  // This transformation is always a win when we do not expect to have
+  // a reserved call frame. Under other circumstances, it may be either
+  // a win or a loss, and requires a heuristic.
+  bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
+  if (CannotReserveFrame)
+    return true;
+
+  // Don't do this when not optimizing for size.
+  bool OptForSize =
+      MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) ||
+      MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+
+  if (!OptForSize)
+    return false;
+
+
+  unsigned StackAlign = TFL->getStackAlignment();
+  
+  int64_t Advantage = 0;
+  for (auto CC : CallSeqMap) {
+    // Call sites where no parameters are passed on the stack
+    // do not affect the cost, since there needs to be no
+    // stack adjustment.
+    if (CC.second.NoStackParams)
+      continue;
+
+    if (!CC.second.UsePush) {
+      // If we don't use pushes for a particular call site,
+      // we pay for not having a reserved call frame with an
+      // additional sub/add esp pair. The cost is ~3 bytes per instruction,
+      // depending on the size of the constant.
+      // TODO: Callee-pop functions should have a smaller penalty, because
+      // an add is needed even with a reserved call frame.
+      Advantage -= 6;
+    } else {
+      // We can use pushes. First, account for the fixed costs.
+      // We'll need a add after the call.
+      Advantage -= 3;
+      // If we have to realign the stack, we'll also need and sub before
+      if (CC.second.ExpectedDist % StackAlign)
+        Advantage -= 3;
+      // Now, for each push, we save ~3 bytes. For small constants, we actually,
+      // save more (up to 5 bytes), but 3 should be a good approximation.
+      Advantage += (CC.second.ExpectedDist / 4) * 3;
+    }
+  }
+
+  return (Advantage >= 0);
+}
+
+
+bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getSubtarget().getInstrInfo();
+  TFL = MF.getSubtarget().getFrameLowering();
+  MRI = &MF.getRegInfo();
+
+  if (!isLegal(MF))
+    return false;
+
+  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+
+  bool Changed = false;
+
+  ContextMap CallSeqMap;
+
+  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+      if (I->getOpcode() == FrameSetupOpcode) {
+        CallContext &Context = CallSeqMap[I];
+        collectCallInfo(MF, *BB, I, Context);
+      }
+
+  if (!isProfitable(MF, CallSeqMap))
+    return false;
+
+  for (auto CC : CallSeqMap)
+    if (CC.second.UsePush)
+      Changed |= adjustCallSequence(MF, CC.first, CC.second);
+
+  return Changed;
+}
+
+void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
+                                               MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator I,
+                                               CallContext &Context) {
+  // Check that this particular call sequence is amenable to the
+  // transformation.
+  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
+                                       MF.getSubtarget().getRegisterInfo());
+  unsigned StackPtr = RegInfo.getStackRegister();
+  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+
+  // We expect to enter this at the beginning of a call sequence
+  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
+  MachineBasicBlock::iterator FrameSetup = I++;
+
+  // How much do we adjust the stack? This puts an upper bound on
+  // the number of parameters actually passed on it.
+  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;  
+  
+  // A zero adjustment means no stack parameters
+  if (!MaxAdjust) {
+    Context.NoStackParams = true;
+    return;
+  }
+
+  // For globals in PIC mode, we can have some LEAs here.
+  // Ignore them, they don't bother us.
+  // TODO: Extend this to something that covers more cases.
+  while (I->getOpcode() == X86::LEA32r)
+    ++I;
+
+  // We expect a copy instruction here.
+  // TODO: The copy instruction is a lowering artifact.
+  //       We should also support a copy-less version, where the stack
+  //       pointer is used directly.
+  if (!I->isCopy() || !I->getOperand(0).isReg())
+    return;
+  Context.SPCopy = I++;
+  StackPtr = Context.SPCopy->getOperand(0).getReg();
+
+  // Scan the call setup sequence for the pattern we're looking for.
+  // We only handle a simple case - a sequence of MOV32mi or MOV32mr
+  // instructions, that push a sequence of 32-bit values onto the stack, with
+  // no gaps between them.
+  if (MaxAdjust > 4)
+    Context.MovVector.resize(MaxAdjust, nullptr);
+
+  do {
+    int Opcode = I->getOpcode();
+    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
+      break;
+
+    // We only want movs of the form:
+    // movl imm/r32, k(%esp)
+    // If we run into something else, bail.
+    // Note that AddrBaseReg may, counter to its name, not be a register,
+    // but rather a frame index.
+    // TODO: Support the fi case. This should probably work now that we
+    // have the infrastructure to track the stack pointer within a call
+    // sequence.
+    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
+        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+        !I->getOperand(X86::AddrScaleAmt).isImm() ||
+        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+        !I->getOperand(X86::AddrDisp).isImm())
+      return;
+
+    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+    assert(StackDisp >= 0 &&
+           "Negative stack displacement when passing parameters");
+
+    // We really don't want to consider the unaligned case.
+    if (StackDisp % 4)
+      return;
+    StackDisp /= 4;
+
+    assert((size_t)StackDisp < Context.MovVector.size() &&
+           "Function call has more parameters than the stack is adjusted for.");
+
+    // If the same stack slot is being filled twice, something's fishy.
+    if (Context.MovVector[StackDisp] != nullptr)
+      return;
+    Context.MovVector[StackDisp] = I;
+
+    ++I;
+  } while (I != MBB.end());
+
+  // We now expect the end of the sequence - a call and a stack adjust.
+  if (I == MBB.end())
+    return;
+
+  // For PCrel calls, we expect an additional COPY of the basereg.
+  // If we find one, skip it.
+  if (I->isCopy()) {
+    if (I->getOperand(1).getReg() ==
+        MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
+      ++I;
+    else
+      return;
+  }
+
+  if (!I->isCall())
+    return;
+
+  Context.Call = I;
+  if ((++I)->getOpcode() != FrameDestroyOpcode)
+    return;
+
+  // Now, go through the vector, and see that we don't have any gaps,
+  // but only a series of 32-bit MOVs.
+  auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end();
+  for (; MMI != MME; ++MMI, Context.ExpectedDist += 4)
+    if (*MMI == nullptr)
+      break;
+
+  // If the call had no parameters, do nothing
+  if (MMI == Context.MovVector.begin())
+    return;
+
+  // We are either at the last parameter, or a gap.
+  // Make sure it's not a gap
+  for (; MMI != MME; ++MMI)
+    if (*MMI != nullptr)
+      return;
+
+  Context.UsePush = true;
+  return;
+}
+
+bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
+                                                  MachineBasicBlock::iterator I,
+                                                  const CallContext &Context) {
+  // Ok, we can in fact do the transformation for this call.
+  // Do not remove the FrameSetup instruction, but adjust the parameters.
+  // PEI will end up finalizing the handling of this.
+  MachineBasicBlock::iterator FrameSetup = I;
+  MachineBasicBlock &MBB = *(I->getParent());
+  FrameSetup->getOperand(1).setImm(Context.ExpectedDist);
+
+  DebugLoc DL = I->getDebugLoc();
+  // Now, iterate through the vector in reverse order, and replace the movs
+  // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
+  // replace uses.
+  for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
+    MachineBasicBlock::iterator MOV = *Context.MovVector[Idx];
+    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+    if (MOV->getOpcode() == X86::MOV32mi) {
+      unsigned PushOpcode = X86::PUSHi32;
+      // If the operand is a small (8-bit) immediate, we can use a
+      // PUSH instruction with a shorter encoding.
+      // Note that isImm() may fail even though this is a MOVmi, because
+      // the operand can also be a symbol.
+      if (PushOp.isImm()) {
+        int64_t Val = PushOp.getImm();
+        if (isInt<8>(Val))
+          PushOpcode = X86::PUSH32i8;
+      }
+      BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
+    } else {
+      unsigned int Reg = PushOp.getReg();
+
+      // If PUSHrmm is not slow on this target, try to fold the source of the
+      // push into the instruction.
+      const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+      bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
+
+      // Check that this is legal to fold. Right now, we're extremely
+      // conservative about that.
+      MachineInstr *DefMov = nullptr;
+      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
+        MachineInstr *Push =
+            BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm));
+
+        unsigned NumOps = DefMov->getDesc().getNumOperands();
+        for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+          Push->addOperand(DefMov->getOperand(i));
+
+        DefMov->eraseFromParent();
+      } else {
+        BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r))
+            .addReg(Reg)
+            .getInstr();
+      }
+    }
+
+    MBB.erase(MOV);
+  }
+
+  // The stack-pointer copy is no longer used in the call sequences.
+  // There should not be any other users, but we can't commit to that, so:
+  if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
+    Context.SPCopy->eraseFromParent();
+
+  // Once we've done this, we need to make sure PEI doesn't assume a reserved
+  // frame.
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  FuncInfo->setHasPushSequences(true);
+
+  return true;
+}
+
+MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
+    MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
+  // Do an extremely restricted form of load folding.
+  // ISel will often create patterns like:
+  // movl    4(%edi), %eax
+  // movl    8(%edi), %ecx
+  // movl    12(%edi), %edx
+  // movl    %edx, 8(%esp)
+  // movl    %ecx, 4(%esp)
+  // movl    %eax, (%esp)
+  // call
+  // Get rid of those with prejudice.
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return nullptr;
+
+  // Make sure this is the only use of Reg.
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return nullptr;
+
+  MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
+
+  // Make sure the def is a MOV from memory.
+  // If the def is an another block, give up.
+  if (DefMI->getOpcode() != X86::MOV32rm ||
+      DefMI->getParent() != FrameSetup->getParent())
+    return nullptr;
+
+  // Now, make sure everything else up until the ADJCALLSTACK is a sequence
+  // of MOVs. To be less conservative would require duplicating a lot of the
+  // logic from PeepholeOptimizer.
+  // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
+  // to be smarter about folding into pushes.
+  for (auto I = DefMI; I != FrameSetup; ++I)
+    if (I->getOpcode() != X86::MOV32rm)
+      return nullptr;
+
+  return DefMI;
+}
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 75a2ec0..41c759a 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -461,6 +461,10 @@ def CC_X86_32_Common : CallingConv<[
                 CCIfSubtarget<"hasFp256()",
                 CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
 
+  // The first 4 AVX 512-bit vector arguments are passed in ZMM registers.
+  CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+                CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
+
   // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
   CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
 
@@ -468,6 +472,10 @@ def CC_X86_32_Common : CallingConv<[
   CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
            CCAssignToStack<32, 32>>,
 
+  // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToStack<64, 64>>,
+
   // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
   // passed in the parameter area.
   CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>;
@@ -626,6 +634,9 @@ def CC_Intel_OCL_BI : CallingConv<[
   CCIfType<[v16f32, v8f64, v16i32, v8i64],
            CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
 
+  // Pass masks in mask registers
+  CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>,
+
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
   CCIfSubtarget<"is64Bit()",       CCDelegateTo<CC_X86_64_C>>,
   CCDelegateTo<CC_X86_32_C>
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 95cb718..a17f052 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -37,6 +37,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
@@ -58,8 +59,8 @@ class X86FastISel final : public FastISel {
 public:
   explicit X86FastISel(FunctionLoweringInfo &funcInfo,
                        const TargetLibraryInfo *libInfo)
-    : FastISel(funcInfo, libInfo) {
-    Subtarget = &TM.getSubtarget<X86Subtarget>();
+      : FastISel(funcInfo, libInfo) {
+    Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
     X86ScalarSSEf64 = Subtarget->hasSSE2();
     X86ScalarSSEf32 = Subtarget->hasSSE1();
   }
@@ -80,7 +81,7 @@ public:
 #include "X86GenFastISel.inc"
 
 private:
-  bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT);
+  bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL);
 
   bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO,
                        unsigned &ResultReg);
@@ -123,11 +124,15 @@ private:
 
   bool X86SelectTrunc(const Instruction *I);
 
+  bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
+                               const TargetRegisterClass *RC);
+
   bool X86SelectFPExt(const Instruction *I);
   bool X86SelectFPTrunc(const Instruction *I);
+  bool X86SelectSIToFP(const Instruction *I);
 
   const X86InstrInfo *getInstrInfo() const {
-    return getTargetMachine()->getSubtargetImpl()->getInstrInfo();
+    return Subtarget->getInstrInfo();
   }
   const X86TargetMachine *getTargetMachine() const {
     return static_cast<const X86TargetMachine *>(&TM);
@@ -137,7 +142,7 @@ private:
 
   unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
   unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
-  unsigned X86MaterializeGV(const GlobalValue *GV,MVT VT);
+  unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
   unsigned fastMaterializeConstant(const Constant *C) override;
 
   unsigned fastMaterializeAlloca(const AllocaInst *C) override;
@@ -544,7 +549,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
 
       // Ok, we need to do a load from a stub.  If we've already loaded from
       // this stub, reuse the loaded pointer, otherwise emit the load now.
-      DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
+      DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
       unsigned LoadReg;
       if (I != LocalValueMap.end() && I->second != 0) {
         LoadReg = I->second;
@@ -655,7 +660,7 @@ redo_gep:
   case Instruction::Alloca: {
     // Do static allocas.
     const AllocaInst *A = cast<AllocaInst>(V);
-    DenseMap<const AllocaInst*, int>::iterator SI =
+    DenseMap<const AllocaInst *, int>::iterator SI =
       FuncInfo.StaticAllocaMap.find(A);
     if (SI != FuncInfo.StaticAllocaMap.end()) {
       AM.BaseType = X86AddressMode::FrameIndexBase;
@@ -903,7 +908,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
 
   unsigned Alignment = S->getAlignment();
   unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
-  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+  if (Alignment == 0) // Ensure that codegen never sees alignment 0
     Alignment = ABIAlignment;
   bool Aligned = Alignment >= ABIAlignment;
 
@@ -1009,12 +1014,12 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
 
     // Make the copy.
     unsigned DstReg = VA.getLocReg();
-    const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
+    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
     // Avoid a cross-class copy. This is very unlikely.
     if (!SrcRC->contains(DstReg))
       return false;
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
-            DstReg).addReg(SrcReg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
 
     // Add register to return instruction.
     RetRegs.push_back(VA.getLocReg());
@@ -1030,14 +1035,15 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
     assert(Reg &&
            "SRetReturnReg should have been set in LowerFormalArguments()!");
     unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
-            RetReg).addReg(Reg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
     RetRegs.push_back(RetReg);
   }
 
   // Now emit the RET.
   MachineInstrBuilder MIB =
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
   for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
     MIB.addReg(RetRegs[i], RegState::Implicit);
   return true;
@@ -1108,7 +1114,7 @@ static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
 }
 
 bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
-                                     EVT VT) {
+                                     EVT VT, DebugLoc CurDbgLoc) {
   unsigned Op0Reg = getRegForValue(Op0);
   if (Op0Reg == 0) return false;
 
@@ -1121,7 +1127,7 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
   // CMPri, otherwise use CMPrr.
   if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
     if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareImmOpc))
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
         .addReg(Op0Reg)
         .addImm(Op1C->getSExtValue());
       return true;
@@ -1133,7 +1139,7 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
 
   unsigned Op1Reg = getRegForValue(Op1);
   if (Op1Reg == 0) return false;
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareOpc))
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
     .addReg(Op0Reg)
     .addReg(Op1Reg);
 
@@ -1201,7 +1207,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
 
   ResultReg = createResultReg(&X86::GR8RegClass);
   if (SETFOpc) {
-    if (!X86FastEmitCompare(LHS, RHS, VT))
+    if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
       return false;
 
     unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
@@ -1226,7 +1232,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
     std::swap(LHS, RHS);
 
   // Emit a compare of LHS/RHS.
-  if (!X86FastEmitCompare(LHS, RHS, VT))
+  if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
     return false;
 
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
@@ -1284,7 +1290,6 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
   return true;
 }
 
-
 bool X86FastISel::X86SelectBranch(const Instruction *I) {
   // Unconditional branches are selected by tablegen-generated code.
   // Handle a conditional branch.
@@ -1353,7 +1358,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
         std::swap(CmpLHS, CmpRHS);
 
       // Emit a compare of the LHS and RHS, setting the flags.
-      if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT))
+      if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
         return false;
 
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
@@ -1362,7 +1367,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
       // X86 requires a second branch to handle UNE (and OEQ, which is mapped
       // to UNE above).
       if (NeedExtraBranch) {
-        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_4))
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
           .addMBB(TrueMBB);
       }
 
@@ -1399,10 +1404,10 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
           .addReg(OpReg).addImm(1);
 
-        unsigned JmpOpc = X86::JNE_4;
+        unsigned JmpOpc = X86::JNE_1;
         if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
           std::swap(TrueMBB, FalseMBB);
-          JmpOpc = X86::JE_4;
+          JmpOpc = X86::JE_1;
         }
 
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
@@ -1444,7 +1449,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
 
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
     .addReg(OpReg).addImm(1);
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4))
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
     .addMBB(TrueMBB);
   fastEmitBranch(FalseMBB, DbgLoc);
   uint32_t BranchWeight = 0;
@@ -1632,8 +1637,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
               TII.get(X86::MOV32r0), Zero32);
 
       // Copy the zero into the appropriate sub/super/identical physical
-      // register. Unfortunately the operations needed are not uniform enough to
-      // fit neatly into the table above.
+      // register. Unfortunately the operations needed are not uniform enough
+      // to fit neatly into the table above.
       if (VT.SimpleTy == MVT::i16) {
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(Copy), TypeEntry.HighInReg)
@@ -1740,8 +1745,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
 
     EVT CmpVT = TLI.getValueType(CmpLHS->getType());
     // Emit a compare of the LHS and RHS, setting the flags.
-    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT))
-     return false;
+    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
+      return false;
 
     if (SETFOpc) {
       unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
@@ -1820,7 +1825,7 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
 
   if (I->getType() != CI->getOperand(0)->getType() ||
       !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
-        (Subtarget->hasSSE2() && RetVT == MVT::f64)    ))
+        (Subtarget->hasSSE2() && RetVT == MVT::f64)))
     return false;
 
   const Value *CmpLHS = CI->getOperand(0);
@@ -1924,7 +1929,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
       std::swap(CmpLHS, CmpRHS);
 
     EVT CmpVT = TLI.getValueType(CmpLHS->getType());
-    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT))
+    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
       return false;
   } else {
     unsigned CondReg = getRegForValue(Cond);
@@ -2001,41 +2006,91 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) {
   return false;
 }
 
+bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
+  if (!I->getOperand(0)->getType()->isIntegerTy(32))
+    return false;
+
+  // Select integer to float/double conversion.
+  unsigned OpReg = getRegForValue(I->getOperand(0));
+  if (OpReg == 0)
+    return false;
+
+  bool HasAVX = Subtarget->hasAVX();
+  const TargetRegisterClass *RC = nullptr;
+  unsigned Opcode;
+
+  if (I->getType()->isDoubleTy() && X86ScalarSSEf64) {
+    // sitofp int -> double
+    Opcode = HasAVX ? X86::VCVTSI2SDrr : X86::CVTSI2SDrr;
+    RC = &X86::FR64RegClass;
+  } else if (I->getType()->isFloatTy() && X86ScalarSSEf32) {
+    // sitofp int -> float
+    Opcode = HasAVX ? X86::VCVTSI2SSrr : X86::CVTSI2SSrr;
+    RC = &X86::FR32RegClass;
+  } else
+    return false;
+
+
+  unsigned ImplicitDefReg = 0;
+  if (HasAVX) {
+    ImplicitDefReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+  }
+
+  const MCInstrDesc &II = TII.get(Opcode);
+  OpReg = constrainOperandRegClass(II, OpReg, (HasAVX ? 2 : 1));
+  
+  unsigned ResultReg = createResultReg(RC);
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg);
+  if (ImplicitDefReg)
+    MIB.addReg(ImplicitDefReg, RegState::Kill);
+  MIB.addReg(OpReg);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Helper method used by X86SelectFPExt and X86SelectFPTrunc.
+bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
+                                          unsigned TargetOpc,
+                                          const TargetRegisterClass *RC) {
+  assert((I->getOpcode() == Instruction::FPExt ||
+          I->getOpcode() == Instruction::FPTrunc) &&
+         "Instruction must be an FPExt or FPTrunc!");
+
+  unsigned OpReg = getRegForValue(I->getOperand(0));
+  if (OpReg == 0)
+    return false;
+
+  unsigned ResultReg = createResultReg(RC);
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
+                ResultReg);
+  if (Subtarget->hasAVX())
+    MIB.addReg(OpReg);
+  MIB.addReg(OpReg);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
 bool X86FastISel::X86SelectFPExt(const Instruction *I) {
-  // fpext from float to double.
-  if (X86ScalarSSEf64 &&
-      I->getType()->isDoubleTy()) {
-    const Value *V = I->getOperand(0);
-    if (V->getType()->isFloatTy()) {
-      unsigned OpReg = getRegForValue(V);
-      if (OpReg == 0) return false;
-      unsigned ResultReg = createResultReg(&X86::FR64RegClass);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(X86::CVTSS2SDrr), ResultReg)
-        .addReg(OpReg);
-      updateValueMap(I, ResultReg);
-      return true;
-    }
+  if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
+      I->getOperand(0)->getType()->isFloatTy()) {
+    // fpext from float to double.
+    unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
+    return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR64RegClass);
   }
 
   return false;
 }
 
 bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
-  if (X86ScalarSSEf64) {
-    if (I->getType()->isFloatTy()) {
-      const Value *V = I->getOperand(0);
-      if (V->getType()->isDoubleTy()) {
-        unsigned OpReg = getRegForValue(V);
-        if (OpReg == 0) return false;
-        unsigned ResultReg = createResultReg(&X86::FR32RegClass);
-        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                TII.get(X86::CVTSD2SSrr), ResultReg)
-          .addReg(OpReg);
-        updateValueMap(I, ResultReg);
-        return true;
-      }
-    }
+  if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
+      I->getOperand(0)->getType()->isDoubleTy()) {
+    // fptrunc from double to float.
+    unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
+    return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR32RegClass);
   }
 
   return false;
@@ -2065,12 +2120,11 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
   if (!Subtarget->is64Bit()) {
     // If we're on x86-32; we can't extract an i8 from a general register.
     // First issue a copy to GR16_ABCD or GR32_ABCD.
-    const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ?
-      (const TargetRegisterClass*)&X86::GR16_ABCDRegClass :
-      (const TargetRegisterClass*)&X86::GR32_ABCDRegClass;
+    const TargetRegisterClass *CopyRC =
+      (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass;
     unsigned CopyReg = createResultReg(CopyRC);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
-            CopyReg).addReg(InputReg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg);
     InputReg = CopyReg;
   }
 
@@ -2107,9 +2161,8 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
       VT = MVT::i32;
     else if (Len >= 2)
       VT = MVT::i16;
-    else {
+    else
       VT = MVT::i8;
-    }
 
     unsigned Reg;
     bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
@@ -2129,7 +2182,73 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
   // FIXME: Handle more intrinsics.
   switch (II->getIntrinsicID()) {
   default: return false;
+  case Intrinsic::convert_from_fp16:
+  case Intrinsic::convert_to_fp16: {
+    if (TM.Options.UseSoftFloat || !Subtarget->hasF16C())
+      return false;
+
+    const Value *Op = II->getArgOperand(0);
+    unsigned InputReg = getRegForValue(Op);
+    if (InputReg == 0)
+      return false;
+
+    // F16C only allows converting from float to half and from half to float.
+    bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;
+    if (IsFloatToHalf) {
+      if (!Op->getType()->isFloatTy())
+        return false;
+    } else {
+      if (!II->getType()->isFloatTy())
+        return false;
+    }
+
+    unsigned ResultReg = 0;
+    const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);
+    if (IsFloatToHalf) {
+      // 'InputReg' is implicitly promoted from register class FR32 to
+      // register class VR128 by method 'constrainOperandRegClass' which is
+      // directly called by 'fastEmitInst_ri'.
+      // Instruction VCVTPS2PHrr takes an extra immediate operand which is
+      // used to provide rounding control.
+      InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 0);
+
+      // Move the lower 32-bits of ResultReg to another register of class GR32.
+      ResultReg = createResultReg(&X86::GR32RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(X86::VMOVPDI2DIrr), ResultReg)
+          .addReg(InputReg, RegState::Kill);
+      
+      // The result value is in the lower 16-bits of ResultReg.
+      unsigned RegIdx = X86::sub_16bit;
+      ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx);
+    } else {
+      assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
+      // Explicitly sign-extend the input to 32-bit.
+      InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg,
+                            /*Kill=*/false);
+
+      // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
+      InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
+                            InputReg, /*Kill=*/true);
+
+      InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true);
+
+      // The result value is in the lower 32-bits of ResultReg.
+      // Emit an explicit copy from register class VR128 to register class FR32.
+      ResultReg = createResultReg(&X86::FR32RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+          .addReg(InputReg, RegState::Kill);
+    }
+
+    updateValueMap(II, ResultReg);
+    return true;
+  }
   case Intrinsic::frameaddress: {
+    MachineFunction *MF = FuncInfo.MF;
+    if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
+      return false;
+
     Type *RetTy = II->getCalledFunction()->getReturnType();
 
     MVT VT;
@@ -2145,14 +2264,13 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
     }
 
-    // This needs to be set before we call getFrameRegister, otherwise we get
-    // the wrong frame register.
-    MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
+    // This needs to be set before we call getPtrSizedFrameRegister, otherwise
+    // we get the wrong frame register.
+    MachineFrameInfo *MFI = MF->getFrameInfo();
     MFI->setFrameAddressIsTaken(true);
 
-    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-        TM.getSubtargetImpl()->getRegisterInfo());
-    unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF));
+    const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+    unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
     assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
             (FrameReg == X86::EBP && VT == MVT::i32)) &&
            "Invalid Frame Register!");
@@ -2372,19 +2490,16 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     unsigned ResultReg = 0;
     // Check if we have an immediate version.
     if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
-      static const unsigned Opc[2][2][4] = {
-        { { X86::INC8r, X86::INC16r,    X86::INC32r,    X86::INC64r },
-          { X86::DEC8r, X86::DEC16r,    X86::DEC32r,    X86::DEC64r }  },
-        { { X86::INC8r, X86::INC64_16r, X86::INC64_32r, X86::INC64r },
-          { X86::DEC8r, X86::DEC64_16r, X86::DEC64_32r, X86::DEC64r }  }
+      static const unsigned Opc[2][4] = {
+        { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
+        { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
       };
 
       if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) {
         ResultReg = createResultReg(TLI.getRegClassFor(VT));
-        bool Is64Bit = Subtarget->is64Bit();
         bool IsDec = BaseOpc == X86ISD::DEC;
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                TII.get(Opc[Is64Bit][IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
+                TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
           .addReg(LHSReg, getKillRegState(LHSIsKill));
       } else
         ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
@@ -2529,7 +2644,7 @@ bool X86FastISel::fastLowerArguments() {
 
   if (!Subtarget->is64Bit())
     return false;
-  
+
   // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
   unsigned GPRCnt = 0;
   unsigned FPRCnt = 0;
@@ -2674,6 +2789,9 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
                        TM.Options.GuaranteedTailCallOpt))
     return false;
 
+  SmallVector<MVT, 16> OutVTs;
+  SmallVector<unsigned, 16> ArgRegs;
+
   // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
   // instruction. This is safe because it is common to all FastISel supported
   // calling conventions on x86.
@@ -2691,28 +2809,34 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
 
     // Passing bools around ends up doing a trunc to i1 and passing it.
     // Codegen this as an argument + "and 1".
-    if (auto *TI = dyn_cast<TruncInst>(Val)) {
-      if (TI->getType()->isIntegerTy(1) && CLI.CS &&
-          (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
-          TI->hasOneUse()) {
-        Val = cast<TruncInst>(Val)->getOperand(0);
-        unsigned ResultReg = getRegForValue(Val);
-
-        if (!ResultReg)
-          return false;
-
-        MVT ArgVT;
-        if (!isTypeLegal(Val->getType(), ArgVT))
-          return false;
+    MVT VT;
+    auto *TI = dyn_cast<TruncInst>(Val);
+    unsigned ResultReg;
+    if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
+              (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
+              TI->hasOneUse()) {
+      Value *PrevVal = TI->getOperand(0);
+      ResultReg = getRegForValue(PrevVal);
+
+      if (!ResultReg)
+        return false;
 
-        ResultReg =
-          fastEmit_ri(ArgVT, ArgVT, ISD::AND, ResultReg, Val->hasOneUse(), 1);
+      if (!isTypeLegal(PrevVal->getType(), VT))
+        return false;
 
-        if (!ResultReg)
-          return false;
-        updateValueMap(Val, ResultReg);
-      }
+      ResultReg =
+        fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
+    } else {
+      if (!isTypeLegal(Val->getType(), VT))
+        return false;
+      ResultReg = getRegForValue(Val);
     }
+
+    if (!ResultReg)
+      return false;
+
+    ArgRegs.push_back(ResultReg);
+    OutVTs.push_back(VT);
   }
 
   // Analyze operands of the call, assigning locations to each operand.
@@ -2723,13 +2847,6 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
   if (IsWin64)
     CCInfo.AllocateStack(32, 8);
 
-  SmallVector<MVT, 16> OutVTs;
-  for (auto *Val : OutVals) {
-    MVT VT;
-    if (!isTypeLegal(Val->getType(), VT))
-      return false;
-    OutVTs.push_back(VT);
-  }
   CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -2738,11 +2855,10 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
   // Issue CALLSEQ_START
   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
-    .addImm(NumBytes);
+    .addImm(NumBytes).addImm(0);
 
   // Walk the register/memloc assignments, inserting copies/loads.
-  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-      TM.getSubtargetImpl()->getRegisterInfo());
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign const &VA = ArgLocs[i];
     const Value *ArgVal = OutVals[VA.getValNo()];
@@ -2751,9 +2867,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
     if (ArgVT == MVT::x86mmx)
       return false;
 
-    unsigned ArgReg = getRegForValue(ArgVal);
-    if (!ArgReg)
-      return false;
+    unsigned ArgReg = ArgRegs[VA.getValNo()];
 
     // Promote the value if needed.
     switch (VA.getLocInfo()) {
@@ -2875,7 +2989,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
     };
-    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
     assert((Subtarget->hasSSE1() || !NumXMMRegs)
            && "SSE registers cannot be used when SSE is disabled");
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
@@ -3049,6 +3163,8 @@ X86FastISel::fastSelectInstruction(const Instruction *I)  {
     return X86SelectFPExt(I);
   case Instruction::FPTrunc:
     return X86SelectFPTrunc(I);
+  case Instruction::SIToFP:
+    return X86SelectSIToFP(I);
   case Instruction::IntToPtr: // Deliberate fall-through.
   case Instruction::PtrToInt: {
     EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
@@ -3194,8 +3310,8 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
                                       TII.get(Opc), ResultReg);
     addDirectMem(MIB, AddrReg);
     MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
-      MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
-      TM.getSubtargetImpl()->getDataLayout()->getPointerSize(), Align);
+        MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
+        TM.getDataLayout()->getPointerSize(), Align);
     MIB->addMemOperand(*FuncInfo.MF, MMO);
     return ResultReg;
   }
@@ -3229,7 +3345,10 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
               ResultReg)
         .addGlobalAddress(GV);
     } else {
-      unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
+      unsigned Opc = TLI.getPointerTy() == MVT::i32
+                     ? (Subtarget->isTarget64BitILP32()
+                        ? X86::LEA64_32r : X86::LEA32r)
+                     : X86::LEA64r;
       addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                              TII.get(Opc), ResultReg), AM);
     }
@@ -3271,7 +3390,10 @@ unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
   X86AddressMode AM;
   if (!X86SelectAddress(C, AM))
     return 0;
-  unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
+  unsigned Opc = TLI.getPointerTy() == MVT::i32
+                 ? (Subtarget->isTarget64BitILP32()
+                    ? X86::LEA64_32r : X86::LEA32r)
+                 : X86::LEA64r;
   const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
   unsigned ResultReg = createResultReg(RC);
   addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -3325,7 +3447,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
   if (!X86SelectAddress(Ptr, AM))
     return false;
 
-  const X86InstrInfo &XII = (const X86InstrInfo&)TII;
+  const X86InstrInfo &XII = (const X86InstrInfo &)TII;
 
   unsigned Size = DL.getTypeAllocSize(LI->getType());
   unsigned Alignment = LI->getAlignment();
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 02736ac..b39c5ab 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -88,7 +88,6 @@ public:
 
 private:
   MachineFunction *MF;
-  const TargetMachine *TM;
   const X86InstrInfo *TII; // Machine instruction info.
 };
 char FixupLEAPass::ID = 0;
@@ -150,13 +149,11 @@ FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
 
 bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
-  TM = &Func.getTarget();
-  const X86Subtarget &ST = TM->getSubtarget<X86Subtarget>();
+  const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
   if (!ST.LEAusesAG() && !ST.slowLEA())
     return false;
 
-  TII =
-      static_cast<const X86InstrInfo *>(TM->getSubtargetImpl()->getInstrInfo());
+  TII = ST.getInstrInfo();
 
   DEBUG(dbgs() << "Start X86FixupLEAs\n";);
   // Process all basic blocks.
@@ -219,7 +216,7 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
       return CurInst;
     }
     InstrDistance += TII->getInstrLatency(
-        TM->getSubtargetImpl()->getInstrItineraryData(), CurInst);
+        MF->getSubtarget().getInstrItineraryData(), CurInst);
     Found = getPreviousInstr(CurInst, MFI);
   }
   return nullptr;
@@ -283,6 +280,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
     return;
   int addrr_opcode, addri_opcode;
   switch (opcode) {
+  default: llvm_unreachable("Unexpected LEA instruction");
   case X86::LEA16r:
     addrr_opcode = X86::ADD16rr;
     addri_opcode = X86::ADD16ri;
@@ -296,8 +294,6 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
     addrr_opcode = X86::ADD64rr;
     addri_opcode = X86::ADD64ri32;
     break;
-  default:
-    assert(false && "Unexpected LEA instruction");
   }
   DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
   DEBUG(dbgs() << "FixLEA: Replaced by: ";);
@@ -334,7 +330,7 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
                                      MachineFunction::iterator MFI) {
 
   for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
-    if (TM->getSubtarget<X86Subtarget>().isSLM())
+    if (MF.getSubtarget<X86Subtarget>().isSLM())
       processInstructionForSLM(I, MFI);
     else
       processInstruction(I, MFI);
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 6189109..c8e5f64 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -898,7 +898,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
 
   // Now we should have the correct registers live.
   DEBUG(dumpStack());
-  assert(StackTop == CountPopulation_32(Mask) && "Live count mismatch");
+  assert(StackTop == countPopulation(Mask) && "Live count mismatch");
 }
 
 /// shuffleStackTop - emit fxch instructions before I to shuffle the top
@@ -943,7 +943,7 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) {
     }
   }
 
-  unsigned N = CountTrailingOnes_32(STReturns);
+  unsigned N = countTrailingOnes(STReturns);
 
   // FP registers used for function return must be consecutive starting at
   // FP0.
@@ -1420,14 +1420,14 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
 
     if (STUses && !isMask_32(STUses))
       MI->emitError("fixed input regs must be last on the x87 stack");
-    unsigned NumSTUses = CountTrailingOnes_32(STUses);
+    unsigned NumSTUses = countTrailingOnes(STUses);
 
     // Defs must be contiguous from the stack top. ST0-STn.
     if (STDefs && !isMask_32(STDefs)) {
       MI->emitError("output regs must be last on the x87 stack");
       STDefs = NextPowerOf2(STDefs) - 1;
     }
-    unsigned NumSTDefs = CountTrailingOnes_32(STDefs);
+    unsigned NumSTDefs = countTrailingOnes(STDefs);
 
     // So must the clobbered stack slots. ST0-STm, m >= n.
     if (STClobbers && !isMask_32(STDefs | STClobbers))
@@ -1437,7 +1437,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
     unsigned STPopped = STUses & (STDefs | STClobbers);
     if (STPopped && !isMask_32(STPopped))
       MI->emitError("implicitly popped regs must be last on the x87 stack");
-    unsigned NumSTPopped = CountTrailingOnes_32(STPopped);
+    unsigned NumSTPopped = countTrailingOnes(STPopped);
 
     DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
                  << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index b9920b1..cead099 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -38,7 +38,34 @@ using namespace llvm;
 extern cl::opt<bool> ForceStackAlign;
 
 bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
-  return !MF.getFrameInfo()->hasVarSizedObjects();
+  return !MF.getFrameInfo()->hasVarSizedObjects() &&
+         !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified.  Having a FP, as in the default
+/// implementation, is not sufficient here since we can't always use it.
+/// Use a more nuanced condition.
+bool
+X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+  const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>
+                               (MF.getSubtarget().getRegisterInfo());
+  return hasReservedCallFrame(MF) ||
+         (hasFP(MF) && !TRI->needsStackRealignment(MF))
+         || TRI->hasBasePointer(MF);
+}
+
+// needsFrameIndexResolution - Do we need to perform FI resolution for
+// this function. Normally, this is required only when the function
+// has any stack objects. However, FI resolution actually has another job,
+// not apparent from the title - it resolves callframesetup/destroy 
+// that were not simplified earlier.
+// So, this is required for x86 functions that have push sequences even
+// when there are no stack objects.
+bool
+X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+  return MF.getFrameInfo()->hasStackObjects() ||
+         MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
 }
 
 /// hasFP - Return true if the specified function should have a dedicated frame
@@ -82,6 +109,14 @@ static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
   }
 }
 
+static unsigned getSUBrrOpcode(unsigned isLP64) {
+  return isLP64 ? X86::SUB64rr : X86::SUB32rr;
+}
+
+static unsigned getADDrrOpcode(unsigned isLP64) {
+  return isLP64 ? X86::ADD64rr : X86::ADD32rr;
+}
+
 static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
   if (IsLP64) {
     if (isInt<8>(Imm))
@@ -155,6 +190,18 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
   return 0;
 }
 
+static bool isEAXLiveIn(MachineFunction &MF) {
+  for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
+       EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
+    unsigned Reg = II->first;
+
+    if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
+        Reg == X86::AH || Reg == X86::AL)
+      return true;
+  }
+
+  return false;
+}
 
 /// emitSPUpdate - Emit a series of instructions to increment / decrement the
 /// stack pointer by a constant value.
@@ -177,7 +224,33 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
   DebugLoc DL = MBB.findDebugLoc(MBBI);
 
   while (Offset) {
-    uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+    if (Offset > Chunk) {
+      // Rather than emit a long series of instructions for large offsets,
+      // load the offset into a register and do one sub/add
+      unsigned Reg = 0;
+
+      if (isSub && !isEAXLiveIn(*MBB.getParent()))
+        Reg = (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX);
+      else
+        Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget);
+
+      if (Reg) {
+        Opc = Is64BitTarget ? X86::MOV64ri : X86::MOV32ri;
+        BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
+          .addImm(Offset);
+        Opc = isSub
+          ? getSUBrrOpcode(Is64BitTarget)
+          : getADDrrOpcode(Is64BitTarget);
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+          .addReg(StackPtr)
+          .addReg(Reg);
+        MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+        Offset = 0;
+        continue;
+      }
+    }
+
+    uint64_t ThisVal = std::min(Offset, Chunk);
     if (ThisVal == (Is64BitTarget ? 8 : 4)) {
       // Use push / pop instead.
       unsigned Reg = isSub
@@ -239,38 +312,6 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
   }
 }
 
-/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower
-/// iterator.
-static
-void mergeSPUpdatesDown(MachineBasicBlock &MBB,
-                        MachineBasicBlock::iterator &MBBI,
-                        unsigned StackPtr, uint64_t *NumBytes = nullptr) {
-  // FIXME:  THIS ISN'T RUN!!!
-  return;
-
-  if (MBBI == MBB.end()) return;
-
-  MachineBasicBlock::iterator NI = std::next(MBBI);
-  if (NI == MBB.end()) return;
-
-  unsigned Opc = NI->getOpcode();
-  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
-       Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
-      NI->getOperand(0).getReg() == StackPtr) {
-    if (NumBytes)
-      *NumBytes -= NI->getOperand(2).getImm();
-    MBB.erase(NI);
-    MBBI = NI;
-  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
-              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
-             NI->getOperand(0).getReg() == StackPtr) {
-    if (NumBytes)
-      *NumBytes += NI->getOperand(2).getImm();
-    MBB.erase(NI);
-    MBBI = NI;
-  }
-}
-
 /// mergeSPUpdates - Checks the instruction before/after the passed
 /// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and
 /// the stack adjustment is returned as a positive value for ADD/LEA and a
@@ -306,19 +347,6 @@ static int mergeSPUpdates(MachineBasicBlock &MBB,
   return Offset;
 }
 
-static bool isEAXLiveIn(MachineFunction &MF) {
-  for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
-       EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
-    unsigned Reg = II->first;
-
-    if (Reg == X86::EAX || Reg == X86::AX ||
-        Reg == X86::AH || Reg == X86::AL)
-      return true;
-  }
-
-  return false;
-}
-
 void
 X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
                                             MachineBasicBlock::iterator MBBI,
@@ -365,12 +393,23 @@ static bool usesTheStack(const MachineFunction &MF) {
   return false;
 }
 
-void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI,
-                                             unsigned &CallOp,
-                                             const char *&Symbol) {
-  CallOp = STI.is64Bit() ? X86::W64ALLOCA : X86::CALLpcrel32;
+void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
+                                          MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          DebugLoc DL) {
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  bool Is64Bit = STI.is64Bit();
+  bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
 
-  if (STI.is64Bit()) {
+  unsigned CallOp;
+  if (Is64Bit)
+    CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
+  else
+    CallOp = X86::CALLpcrel32;
+
+  const char *Symbol;
+  if (Is64Bit) {
     if (STI.isTargetCygMing()) {
       Symbol = "___chkstk_ms";
     } else {
@@ -380,6 +419,66 @@ void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI,
     Symbol = "_alloca";
   else
     Symbol = "_chkstk";
+
+  MachineInstrBuilder CI;
+
+  // All current stack probes take AX and SP as input, clobber flags, and
+  // preserve all registers. x86_64 probes leave RSP unmodified.
+  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
+    // For the large code model, we have to call through a register. Use R11,
+    // as it is scratch in all supported calling conventions.
+    BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
+        .addExternalSymbol(Symbol);
+    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
+  } else {
+    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
+  }
+
+  unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
+  unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
+  CI.addReg(AX, RegState::Implicit)
+      .addReg(SP, RegState::Implicit)
+      .addReg(AX, RegState::Define | RegState::Implicit)
+      .addReg(SP, RegState::Define | RegState::Implicit)
+      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+
+  if (Is64Bit) {
+    // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
+    // themselves. It also does not clobber %rax so we can reuse it when
+    // adjusting %rsp.
+    BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+        .addReg(X86::RSP)
+        .addReg(X86::RAX);
+  }
+}
+
+static unsigned calculateSetFPREG(uint64_t SPAdjust) {
+  // Win64 ABI has a less restrictive limitation of 240; 128 works equally well
+  // and might require smaller successive adjustments.
+  const uint64_t Win64MaxSEHOffset = 128;
+  uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset);
+  // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode.
+  return SEHFrameOffset & -16;
+}
+
+// If we're forcing a stack realignment we can't rely on just the frame
+// info, we need to know the ABI stack alignment as well in case we
+// have a call out.  Otherwise just make sure we have some alignment - we'll
+// go with the minimum SlotSize.
+static uint64_t calculateMaxStackAlign(const MachineFunction &MF) {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment.
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const X86RegisterInfo *RegInfo = STI.getRegisterInfo();
+  unsigned SlotSize = RegInfo->getSlotSize();
+  unsigned StackAlign = STI.getFrameLowering()->getStackAlignment();
+  if (ForceStackAlign) {
+    if (MFI->hasCalls())
+      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
+    else if (MaxAlign < SlotSize)
+      MaxAlign = SlotSize;
+  }
+  return MaxAlign;
 }
 
 /// emitPrologue - Push callee-saved registers onto the stack, which
@@ -448,6 +547,8 @@ void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI,
 
   [if needs base pointer]
       mov  %rsp, %rbx
+      [if needs to restore base pointer]
+          mov %rsp, -MMM(%rbp)
 
   ; Emit CFI info
   [if needs FP]
@@ -469,67 +570,65 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *Fn = MF.getFunction();
-  const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const X86RegisterInfo *RegInfo = STI.getRegisterInfo();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
-  uint64_t MaxAlign  = MFI->getMaxAlignment(); // Desired stack alignment.
+  uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
   uint64_t StackSize = MFI->getStackSize();    // Number of bytes to allocate.
   bool HasFP = hasFP(MF);
-  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool Is64Bit = STI.is64Bit();
   // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
   const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
   bool IsWin64 = STI.isTargetWin64();
   // Not necessarily synonymous with IsWin64.
-  bool IsWinEH = MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() ==
-                 ExceptionHandling::ItaniumWinEH;
+  bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
   bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry();
   bool NeedsDwarfCFI =
       !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
   bool UseLEA = STI.useLeaForSP();
-  unsigned StackAlign = getStackAlignment();
   unsigned SlotSize = RegInfo->getSlotSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
-  const unsigned MachineFramePtr = STI.isTarget64BitILP32() ?
-                 getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr;
+  const unsigned MachineFramePtr =
+      STI.isTarget64BitILP32()
+          ? getX86SubSuperRegister(FramePtr, MVT::i64, false)
+          : FramePtr;
   unsigned StackPtr = RegInfo->getStackRegister();
   unsigned BasePtr = RegInfo->getBaseRegister();
   DebugLoc DL;
 
-  // If we're forcing a stack realignment we can't rely on just the frame
-  // info, we need to know the ABI stack alignment as well in case we
-  // have a call out.  Otherwise just make sure we have some alignment - we'll
-  // go with the minimum SlotSize.
-  if (ForceStackAlign) {
-    if (MFI->hasCalls())
-      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
-    else if (MaxAlign < SlotSize)
-      MaxAlign = SlotSize;
-  }
-
   // Add RETADDR move area to callee saved frame size.
   int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+  if (TailCallReturnAddrDelta && IsWinEH)
+    report_fatal_error("Can't handle guaranteed tail call under win64 yet");
+
   if (TailCallReturnAddrDelta < 0)
     X86FI->setCalleeSavedFrameSize(
       X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
 
-  bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMacho());
-  
+  bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
+
+  // The default stack probe size is 4096 if the function has no stackprobesize
+  // attribute.
+  unsigned StackProbeSize = 4096;
+  if (Fn->hasFnAttribute("stack-probe-size"))
+    Fn->getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+
   // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
   // function, and use up to 128 bytes of stack space, don't have a frame
   // pointer, calls, or dynamic alloca then we do not need to adjust the
   // stack pointer (we fit in the Red Zone). We also check that we don't
   // push and pop from the stack.
-  if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                                   Attribute::NoRedZone) &&
+  if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) &&
       !RegInfo->needsStackRealignment(MF) &&
-      !MFI->hasVarSizedObjects() &&                     // No dynamic alloca.
-      !MFI->adjustsStack() &&                           // No calls.
-      !IsWin64 &&                                       // Win64 has no Red Zone
-      !usesTheStack(MF) &&                              // Don't push and pop.
-      !MF.shouldSplitStack()) {                         // Regular stack
+      !MFI->hasVarSizedObjects() && // No dynamic alloca.
+      !MFI->adjustsStack() &&       // No calls.
+      !IsWin64 &&                   // Win64 has no Red Zone
+      !usesTheStack(MF) &&          // Don't push and pop.
+      !MF.shouldSplitStack()) {     // Regular stack
     uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
     if (HasFP) MinSize += SlotSize;
     StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
@@ -570,14 +669,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   if (HasFP) {
     // Calculate required stack adjustment.
     uint64_t FrameSize = StackSize - SlotSize;
-    if (RegInfo->needsStackRealignment(MF)) {
-      // Callee-saved registers are pushed on stack before the stack
-      // is realigned.
-      FrameSize -= X86FI->getCalleeSavedFrameSize();
-      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
-    } else {
-      NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
-    }
+    // If required, include space for extra hidden slot for stashing base pointer.
+    if (X86FI->getRestoreBasePointer())
+      FrameSize += SlotSize;
+
+    NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+
+    // Callee-saved registers are pushed on stack before the stack is realigned.
+    if (RegInfo->needsStackRealignment(MF) && !IsWinEH)
+      NumBytes = RoundUpToAlignment(NumBytes, MaxAlign);
 
     // Get the offset of the stack slot for the EBP register, which is
     // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
@@ -613,11 +713,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
           .setMIFlag(MachineInstr::FrameSetup);
     }
 
-    // Update EBP with the new base value.
-    BuildMI(MBB, MBBI, DL,
-            TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr)
-        .addReg(StackPtr)
-        .setMIFlag(MachineInstr::FrameSetup);
+    if (!IsWinEH) {
+      // Update EBP with the new base value.
+      BuildMI(MBB, MBBI, DL,
+              TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
+              FramePtr)
+          .addReg(StackPtr)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
 
     if (NeedsDwarfCFI) {
       // Mark effective beginning of when frame pointer becomes valid.
@@ -666,15 +769,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
 
   // Realign stack after we pushed callee-saved registers (so that we'll be
   // able to calculate their offsets from the frame pointer).
-  if (RegInfo->needsStackRealignment(MF)) {
+  // Don't do this for Win64, it needs to realign the stack after the prologue.
+  if (!IsWinEH && RegInfo->needsStackRealignment(MF)) {
     assert(HasFP && "There should be a frame pointer if stack is realigned.");
     uint64_t Val = -MaxAlign;
     MachineInstr *MI =
-      BuildMI(MBB, MBBI, DL,
-              TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr)
-      .addReg(StackPtr)
-      .addImm(Val)
-      .setMIFlag(MachineInstr::FrameSetup);
+        BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)),
+                StackPtr)
+            .addReg(StackPtr)
+            .addImm(Val)
+            .setMIFlag(MachineInstr::FrameSetup);
 
     // The EFLAGS implicit def is dead.
     MI->getOperand(3).setIsDead();
@@ -685,14 +789,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   // the callee has more arguments then the caller.
   NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
 
-  // If there is an ADD32ri or SUB32ri of ESP immediately after this
-  // instruction, merge the two instructions.
-  mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
-
   // Adjust stack pointer: ESP -= numbytes.
 
-  static const size_t PageSize = 4096;
-
   // Windows and cygwin/mingw require a prologue helper routine when allocating
   // more than 4K bytes on the stack.  Windows uses __chkstk and cygwin/mingw
   // uses __alloca.  __alloca and the 32-bit version of __chkstk will probe the
@@ -701,12 +799,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   // responsible for adjusting the stack pointer.  Touching the stack at 4K
   // increments is necessary to ensure that the guard pages used by the OS
   // virtual memory manager are allocated in correct sequence.
-  if (NumBytes >= PageSize && UseStackProbe) {
-    const char *StackProbeSymbol;
-    unsigned CallOp;
-
-    getStackProbeFunction(STI, CallOp, StackProbeSymbol);
-
+  uint64_t AlignedNumBytes = NumBytes;
+  if (IsWinEH && RegInfo->needsStackRealignment(MF))
+    AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign);
+  if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
     // Check whether EAX is livein for this function.
     bool isEAXAlive = isEAXLiveIn(MF);
 
@@ -724,9 +820,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     if (Is64Bit) {
       // Handle the 64-bit Windows ABI case where we need to call __chkstk.
       // Function prologue is responsible for adjusting the stack pointer.
-      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
-        .addImm(NumBytes)
-        .setMIFlag(MachineInstr::FrameSetup);
+      if (isUInt<32>(NumBytes)) {
+        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+            .addImm(NumBytes)
+            .setMIFlag(MachineInstr::FrameSetup);
+      } else if (isInt<32>(NumBytes)) {
+        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
+            .addImm(NumBytes)
+            .setMIFlag(MachineInstr::FrameSetup);
+      } else {
+        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+            .addImm(NumBytes)
+            .setMIFlag(MachineInstr::FrameSetup);
+      }
     } else {
       // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
       // We'll also use 4 already allocated bytes for EAX.
@@ -735,22 +841,17 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
         .setMIFlag(MachineInstr::FrameSetup);
     }
 
-    BuildMI(MBB, MBBI, DL,
-            TII.get(CallOp))
-      .addExternalSymbol(StackProbeSymbol)
-      .addReg(StackPtr,    RegState::Define | RegState::Implicit)
-      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
-      .setMIFlag(MachineInstr::FrameSetup);
+    // Save a pointer to the MI where we set AX.
+    MachineBasicBlock::iterator SetRAX = MBBI;
+    --SetRAX;
+
+    // Call __chkstk, __chkstk_ms, or __alloca.
+    emitStackProbeCall(MF, MBB, MBBI, DL);
+
+    // Apply the frame setup flag to all inserted instrs.
+    for (; SetRAX != MBBI; ++SetRAX)
+      SetRAX->setFlag(MachineInstr::FrameSetup);
 
-    if (Is64Bit) {
-      // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
-      // themself. It also does not clobber %rax so we can reuse it when
-      // adjusting %rsp.
-      BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr)
-        .addReg(StackPtr)
-        .addReg(X86::RAX)
-        .setMIFlag(MachineInstr::FrameSetup);
-    }
     if (isEAXAlive) {
       // Restore EAX
       MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
@@ -764,68 +865,66 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
                  UseLEA, TII, *RegInfo);
   }
 
+  if (NeedsWinEH && NumBytes)
+    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+        .addImm(NumBytes)
+        .setMIFlag(MachineInstr::FrameSetup);
+
   int SEHFrameOffset = 0;
-  if (NeedsWinEH) {
-    if (HasFP) {
-      // We need to set frame base offset low enough such that all saved
-      // register offsets would be positive relative to it, but we can't
-      // just use NumBytes, because .seh_setframe offset must be <=240.
-      // So we pretend to have only allocated enough space to spill the
-      // non-volatile registers.
-      // We don't care about the rest of stack allocation, because unwinder
-      // will restore SP to (BP - SEHFrameOffset)
-      for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
-        int offset = MFI->getObjectOffset(Info.getFrameIdx());
-        SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset));
-      }
-      SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant
-
-      // This only needs to account for XMM spill slots, GPR slots
-      // are covered by the .seh_pushreg's emitted above.
-      unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize();
-      if (Size) {
-        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
-            .addImm(Size)
-            .setMIFlag(MachineInstr::FrameSetup);
-      }
+  if (IsWinEH && HasFP) {
+    SEHFrameOffset = calculateSetFPREG(NumBytes);
+    if (SEHFrameOffset)
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
+                   StackPtr, false, SEHFrameOffset);
+    else
+      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr);
 
+    if (NeedsWinEH)
       BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
           .addImm(FramePtr)
           .addImm(SEHFrameOffset)
           .setMIFlag(MachineInstr::FrameSetup);
-    } else {
-      // SP will be the base register for restoring XMMs
-      if (NumBytes) {
-        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
-            .addImm(NumBytes)
-            .setMIFlag(MachineInstr::FrameSetup);
-      }
-    }
   }
 
-  // Skip the rest of register spilling code
-  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
+    const MachineInstr *FrameInstr = &*MBBI;
     ++MBBI;
 
-  // Emit SEH info for non-GPRs
-  if (NeedsWinEH) {
-    for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
-      unsigned Reg = Info.getReg();
-      if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
-        continue;
-      assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class");
-
-      int Offset = getFrameIndexOffset(MF, Info.getFrameIdx());
-      Offset += SEHFrameOffset;
-
-      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
-          .addImm(Reg)
-          .addImm(Offset)
-          .setMIFlag(MachineInstr::FrameSetup);
+    if (NeedsWinEH) {
+      int FI;
+      if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
+        if (X86::FR64RegClass.contains(Reg)) {
+          int Offset = getFrameIndexOffset(MF, FI);
+          Offset += SEHFrameOffset;
+
+          BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
+              .addImm(Reg)
+              .addImm(Offset)
+              .setMIFlag(MachineInstr::FrameSetup);
+        }
+      }
     }
+  }
 
+  if (NeedsWinEH)
     BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
         .setMIFlag(MachineInstr::FrameSetup);
+
+  // Realign stack after we spilled callee-saved registers (so that we'll be
+  // able to calculate their offsets from the frame pointer).
+  // Win64 requires aligning the stack after the prologue.
+  if (IsWinEH && RegInfo->needsStackRealignment(MF)) {
+    assert(HasFP && "There should be a frame pointer if stack is realigned.");
+    uint64_t Val = -MaxAlign;
+    MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)),
+                StackPtr)
+            .addReg(StackPtr)
+            .addImm(Val)
+            .setMIFlag(MachineInstr::FrameSetup);
+
+    // The EFLAGS implicit def is dead.
+    MI->getOperand(3).setIsDead();
   }
 
   // If we need a base pointer, set it up here. It's whatever the value
@@ -838,6 +937,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
       .addReg(StackPtr)
       .setMIFlag(MachineInstr::FrameSetup);
+    if (X86FI->getRestoreBasePointer()) {
+      // Stash value of base pointer.  Saving RSP instead of EBP shortens dependence chain.
+      unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
+                   FramePtr, true, X86FI->getRestoreBasePointerOffset())
+        .addReg(StackPtr)
+        .setMIFlag(MachineInstr::FrameSetup);
+    }
   }
 
   if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
@@ -863,33 +970,45 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
                                     MachineBasicBlock &MBB) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
-  const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const X86RegisterInfo *RegInfo = STI.getRegisterInfo();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   assert(MBBI != MBB.end() && "Returning block has no instructions");
   unsigned RetOpcode = MBBI->getOpcode();
   DebugLoc DL = MBBI->getDebugLoc();
-  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool Is64Bit = STI.is64Bit();
   // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
   const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+  bool HasFP = hasFP(MF);
   const bool Is64BitILP32 = STI.isTarget64BitILP32();
-  bool UseLEA = STI.useLeaForSP();
-  unsigned StackAlign = getStackAlignment();
   unsigned SlotSize = RegInfo->getSlotSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
-  unsigned MachineFramePtr = Is64BitILP32 ?
-             getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr;
+  unsigned MachineFramePtr =
+      Is64BitILP32 ? getX86SubSuperRegister(FramePtr, MVT::i64, false)
+                   : FramePtr;
   unsigned StackPtr = RegInfo->getStackRegister();
 
-  bool IsWinEH = MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() ==
-                 ExceptionHandling::ItaniumWinEH;
+  bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
   bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry();
+  bool UseLEAForSP = false;
+
+  // We can't use LEA instructions for adjusting the stack pointer if this is a
+  // leaf function in the Win64 ABI.  Only ADD instructions may be used to
+  // deallocate the stack.
+  if (STI.useLeaForSP()) {
+    if (!IsWinEH) {
+      // We *aren't* using the Win64 ABI which means we are free to use LEA.
+      UseLEAForSP = true;
+    } else if (HasFP) {
+      // We *have* a frame pointer which means we are permitted to use LEA.
+      UseLEAForSP = true;
+    }
+  }
 
   switch (RetOpcode) {
   default:
-    llvm_unreachable("Can only insert epilog into returning blocks");
+    llvm_unreachable("Can only insert epilogue into returning blocks");
   case X86::RETQ:
   case X86::RETL:
   case X86::RETIL:
@@ -907,32 +1026,19 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
 
   // Get the number of bytes to allocate from the FrameInfo.
   uint64_t StackSize = MFI->getStackSize();
-  uint64_t MaxAlign  = MFI->getMaxAlignment();
+  uint64_t MaxAlign = calculateMaxStackAlign(MF);
   unsigned CSSize = X86FI->getCalleeSavedFrameSize();
   uint64_t NumBytes = 0;
 
-  // If we're forcing a stack realignment we can't rely on just the frame
-  // info, we need to know the ABI stack alignment as well in case we
-  // have a call out.  Otherwise just make sure we have some alignment - we'll
-  // go with the minimum.
-  if (ForceStackAlign) {
-    if (MFI->hasCalls())
-      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
-    else
-      MaxAlign = MaxAlign ? MaxAlign : 4;
-  }
-
   if (hasFP(MF)) {
     // Calculate required stack adjustment.
     uint64_t FrameSize = StackSize - SlotSize;
-    if (RegInfo->needsStackRealignment(MF)) {
-      // Callee-saved registers were pushed on stack before the stack
-      // was realigned.
-      FrameSize -= CSSize;
-      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
-    } else {
-      NumBytes = FrameSize - CSSize;
-    }
+    NumBytes = FrameSize - CSSize;
+
+    // Callee-saved registers were pushed on stack before the stack was
+    // realigned.
+    if (RegInfo->needsStackRealignment(MF) && !IsWinEH)
+      NumBytes = RoundUpToAlignment(FrameSize, MaxAlign);
 
     // Pop EBP.
     BuildMI(MBB, MBBI, DL,
@@ -940,6 +1046,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   } else {
     NumBytes = StackSize - CSSize;
   }
+  uint64_t SEHStackAllocAmt = NumBytes;
 
   // Skip the callee-saved pop instructions.
   while (MBBI != MBB.begin()) {
@@ -967,10 +1074,20 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) {
     if (RegInfo->needsStackRealignment(MF))
       MBBI = FirstCSPop;
-    if (CSSize != 0) {
+    unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
+    uint64_t LEAAmount = IsWinEH ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;
+
+    // There are only two legal forms of epilogue:
+    // - add SEHAllocationSize, %rsp
+    // - lea SEHAllocationSize(%FramePtr), %rsp
+    //
+    // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence.
+    // However, we may use this sequence if we have a frame pointer because the
+    // effects of the prologue can safely be undone.
+    if (LEAAmount != 0) {
       unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
       addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
-                   FramePtr, false, -CSSize);
+                   FramePtr, false, LEAAmount);
       --MBBI;
     } else {
       unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
@@ -980,8 +1097,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
     }
   } else if (NumBytes) {
     // Adjust stack pointer back: ESP += numbytes.
-    emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA,
-                 TII, *RegInfo);
+    emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr,
+                 UseLEAForSP, TII, *RegInfo);
     --MBBI;
   }
 
@@ -1027,14 +1144,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
       // Check for possible merge with preceding ADD instruction.
       Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
       emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr,
-                   UseLEA, TII, *RegInfo);
+                   UseLEAForSP, TII, *RegInfo);
     }
 
     // Jump to label or value in register.
+    bool IsWin64 = STI.isTargetWin64();
     if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) {
-      MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi)
-                                       ? X86::TAILJMPd : X86::TAILJMPd64));
+      unsigned Op = (RetOpcode == X86::TCRETURNdi)
+                        ? X86::TAILJMPd
+                        : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64);
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op));
       if (JumpTarget.isGlobal())
         MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
                              JumpTarget.getTargetFlags());
@@ -1044,14 +1163,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
                               JumpTarget.getTargetFlags());
       }
     } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) {
-      MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi)
-                                       ? X86::TAILJMPm : X86::TAILJMPm64));
+      unsigned Op = (RetOpcode == X86::TCRETURNmi)
+                        ? X86::TAILJMPm
+                        : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op));
       for (unsigned i = 0; i != 5; ++i)
         MIB.addOperand(MBBI->getOperand(i));
     } else if (RetOpcode == X86::TCRETURNri64) {
-      BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)).
-        addReg(JumpTarget.getReg(), RegState::Kill);
+      BuildMI(MBB, MBBI, DL,
+              TII.get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
+          .addReg(JumpTarget.getReg(), RegState::Kill);
     } else {
       BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)).
         addReg(JumpTarget.getReg(), RegState::Kill);
@@ -1071,24 +1192,58 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
 
     // Check for possible merge with preceding ADD instruction.
     delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
-    emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII,
-                 *RegInfo);
+    emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr,
+                 UseLEAForSP, TII, *RegInfo);
   }
 }
 
 int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
                                           int FI) const {
   const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+      MF.getSubtarget<X86Subtarget>().getRegisterInfo();
   const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Offset will hold the offset from the stack pointer at function entry to the
+  // object.
+  // We need to factor in additional offsets applied during the prologue to the
+  // frame, base, and stack pointer depending on which is used.
   int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
+  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
   uint64_t StackSize = MFI->getStackSize();
+  unsigned SlotSize = RegInfo->getSlotSize();
+  bool HasFP = hasFP(MF);
+  bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+  int64_t FPDelta = 0;
+
+  if (IsWinEH) {
+    assert(!MFI->hasCalls() || (StackSize % 16) == 8);
+
+    // Calculate required stack adjustment.
+    uint64_t FrameSize = StackSize - SlotSize;
+    // If required, include space for extra hidden slot for stashing base pointer.
+    if (X86FI->getRestoreBasePointer())
+      FrameSize += SlotSize;
+    uint64_t NumBytes = FrameSize - CSSize;
+
+    uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);
+    if (FI && FI == X86FI->getFAIndex())
+      return -SEHFrameOffset;
+
+    // FPDelta is the offset from the "traditional" FP location of the old base
+    // pointer followed by return address and the location required by the
+    // restricted Win64 prologue.
+    // Add FPDelta to all offsets below that go through the frame pointer.
+    FPDelta = FrameSize - SEHFrameOffset;
+    assert((!MFI->hasCalls() || (FPDelta % 16) == 0) &&
+           "FPDelta isn't aligned per the Win64 ABI!");
+  }
+
 
   if (RegInfo->hasBasePointer(MF)) {
-    assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!");
+    assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
     if (FI < 0) {
       // Skip the saved EBP.
-      return Offset + RegInfo->getSlotSize();
+      return Offset + SlotSize + FPDelta;
     } else {
       assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
       return Offset + StackSize;
@@ -1096,33 +1251,32 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
   } else if (RegInfo->needsStackRealignment(MF)) {
     if (FI < 0) {
       // Skip the saved EBP.
-      return Offset + RegInfo->getSlotSize();
+      return Offset + SlotSize + FPDelta;
     } else {
       assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
       return Offset + StackSize;
     }
     // FIXME: Support tail calls
   } else {
-    if (!hasFP(MF))
+    if (!HasFP)
       return Offset + StackSize;
 
     // Skip the saved EBP.
-    Offset += RegInfo->getSlotSize();
+    Offset += SlotSize;
 
     // Skip the RETADDR move area
-    const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
     int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
     if (TailCallReturnAddrDelta < 0)
       Offset -= TailCallReturnAddrDelta;
   }
 
-  return Offset;
+  return Offset + FPDelta;
 }
 
 int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                              unsigned &FrameReg) const {
   const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+      MF.getSubtarget<X86Subtarget>().getRegisterInfo();
   // We can't calculate offset from frame pointer if the stack is realigned,
   // so enforce usage of stack/base pointer.  The base pointer is used when we
   // have dynamic allocas in addition to dynamic realignment.
@@ -1135,12 +1289,85 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
   return getFrameIndexOffset(MF, FI);
 }
 
+// Simplified from getFrameIndexOffset keeping only StackPointer cases
+int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Does not include any dynamic realign.
+  const uint64_t StackSize = MFI->getStackSize();
+  {
+#ifndef NDEBUG
+    const X86RegisterInfo *RegInfo =
+        MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+    // Note: LLVM arranges the stack as:
+    // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP)
+    //      > "Stack Slots" (<--SP)
+    // We can always address StackSlots from RSP.  We can usually (unless
+    // needsStackRealignment) address CSRs from RSP, but sometimes need to
+    // address them from RBP.  FixedObjects can be placed anywhere in the stack
+    // frame depending on their specific requirements (i.e. we can actually
+    // refer to arguments to the function which are stored in the *callers*
+    // frame).  As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs
+    // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject.
+
+    assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case");
+
+    // We don't handle tail calls, and shouldn't be seeing them
+    // either.
+    int TailCallReturnAddrDelta =
+        MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta();
+    assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!");
+#endif
+  }
+
+  // This is how the math works out:
+  //
+  //  %rsp grows (i.e. gets lower) left to right. Each box below is
+  //  one word (eight bytes).  Obj0 is the stack slot we're trying to
+  //  get to.
+  //
+  //    ----------------------------------
+  //    | BP | Obj0 | Obj1 | ... | ObjN |
+  //    ----------------------------------
+  //    ^    ^      ^                   ^
+  //    A    B      C                   E
+  //
+  // A is the incoming stack pointer.
+  // (B - A) is the local area offset (-8 for x86-64) [1]
+  // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2]
+  //
+  // |(E - B)| is the StackSize (absolute value, positive).  For a
+  // stack that grown down, this works out to be (B - E). [3]
+  //
+  // E is also the value of %rsp after stack has been set up, and we
+  // want (C - E) -- the value we can add to %rsp to get to Obj0.  Now
+  // (C - E) == (C - A) - (B - A) + (B - E)
+  //            { Using [1], [2] and [3] above }
+  //         == getObjectOffset - LocalAreaOffset + StackSize
+  //
+
+  // Get the Offset from the StackPointer
+  int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
+
+  return Offset + StackSize;
+}
+// Simplified from getFrameIndexReference keeping only StackPointer cases
+int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
+                                                   int FI,
+                                                   unsigned &FrameReg) const {
+  const X86RegisterInfo *RegInfo =
+      MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+  assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case");
+
+  FrameReg = RegInfo->getStackRegister();
+  return getFrameIndexOffsetFromSP(MF, FI);
+}
+
 bool X86FrameLowering::assignCalleeSavedSpillSlots(
     MachineFunction &MF, const TargetRegisterInfo *TRI,
     std::vector<CalleeSavedInfo> &CSI) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+      MF.getSubtarget<X86Subtarget>().getRegisterInfo();
   unsigned SlotSize = RegInfo->getSlotSize();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 
@@ -1207,8 +1434,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
   DebugLoc DL = MBB.findDebugLoc(MI);
 
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
 
   // Push GPRs. It increases frame size.
   unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
@@ -1228,8 +1455,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
   // It can be done by spilling XMMs to stack frame.
   for (unsigned i = CSI.size(); i != 0; --i) {
     unsigned Reg = CSI[i-1].getReg();
-    if (X86::GR64RegClass.contains(Reg) ||
-        X86::GR32RegClass.contains(Reg))
+    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
       continue;
     // Add the callee-saved register as live-in. It's killed at the spill.
     MBB.addLiveIn(Reg);
@@ -1255,8 +1481,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
   DebugLoc DL = MBB.findDebugLoc(MI);
 
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
 
   // Reload XMMs from stack frame.
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
@@ -1287,7 +1513,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                                        RegScavenger *RS) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+      MF.getSubtarget<X86Subtarget>().getRegisterInfo();
   unsigned SlotSize = RegInfo->getSlotSize();
 
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
@@ -1368,9 +1594,9 @@ void
 X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   MachineBasicBlock &prologueMBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
   uint64_t StackSize;
-  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool Is64Bit = STI.is64Bit();
   const bool IsLP64 = STI.isTarget64BitLP64();
   unsigned TlsReg, TlsOffset;
@@ -1382,8 +1608,9 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
 
   if (MF.getFunction()->isVarArg())
     report_fatal_error("Segmented stacks do not support vararg functions.");
-  if (!STI.isTargetLinux() && !STI.isTargetDarwin() &&
-      !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD())
+  if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
+      !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
+      !STI.isTargetDragonFly())
     report_fatal_error("Segmented stacks not supported on this platform.");
 
   // Eventually StackSize will be calculated by a link-time pass; which will
@@ -1437,6 +1664,9 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
     } else if (STI.isTargetFreeBSD()) {
       TlsReg = X86::FS;
       TlsOffset = 0x18;
+    } else if (STI.isTargetDragonFly()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x20; // use tls_tcb.tcb_segstack
     } else {
       report_fatal_error("Segmented stacks not supported on this platform.");
     }
@@ -1459,6 +1689,9 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
     } else if (STI.isTargetWin32()) {
       TlsReg = X86::FS;
       TlsOffset = 0x14; // pvArbitrary, reserved for application use
+    } else if (STI.isTargetDragonFly()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x10; // use tls_tcb.tcb_segstack
     } else if (STI.isTargetFreeBSD()) {
       report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
     } else {
@@ -1471,7 +1704,8 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
       BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
         .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
 
-    if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64()) {
+    if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
+        STI.isTargetDragonFly()) {
       BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
         .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
     } else if (STI.isTargetDarwin()) {
@@ -1515,7 +1749,7 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
 
   // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
   // It jumps to normal execution of the function body.
-  BuildMI(checkMBB, DL, TII.get(X86::JA_4)).addMBB(&prologueMBB);
+  BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&prologueMBB);
 
   // On 32 bit we first push the arguments size and then the frame size. On 64
   // bit, we pass the stack frame size in r10 and the argument size in r11.
@@ -1546,12 +1780,36 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   }
 
   // __morestack is in libgcc
-  if (Is64Bit)
-    BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
-      .addExternalSymbol("__morestack");
-  else
-    BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
-      .addExternalSymbol("__morestack");
+  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
+    // Under the large code model, we cannot assume that __morestack lives
+    // within 2^31 bytes of the call site, so we cannot use pc-relative
+    // addressing. We cannot perform the call via a temporary register,
+    // as the rax register may be used to store the static chain, and all
+    // other suitable registers may be either callee-save or used for
+    // parameter passing. We cannot use the stack at this point either
+    // because __morestack manipulates the stack directly.
+    //
+    // To avoid these issues, perform an indirect call via a read-only memory
+    // location containing the address.
+    //
+    // This solution is not perfect, as it assumes that the .rodata section
+    // is laid out within 2^31 bytes of each function body, but this seems
+    // to be sufficient for JIT.
+    BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
+        .addReg(X86::RIP)
+        .addImm(0)
+        .addReg(0)
+        .addExternalSymbol("__morestack_addr")
+        .addReg(0);
+    MF.getMMI().setUsesMorestackAddr(true);
+  } else {
+    if (Is64Bit)
+      BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
+        .addExternalSymbol("__morestack");
+    else
+      BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
+        .addExternalSymbol("__morestack");
+  }
 
   if (IsNested)
     BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
@@ -1584,12 +1842,10 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
 ///       temp0 = sp - MaxStack
 ///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
 void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  const unsigned SlotSize =
-      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo())
-          ->getSlotSize();
-  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+  const unsigned SlotSize = STI.getRegisterInfo()->getSlotSize();
   const bool Is64Bit = STI.is64Bit();
   const bool IsLP64 = STI.isTarget64BitLP64();
   DebugLoc DL;
@@ -1695,7 +1951,7 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
     // SPLimitOffset is in a fixed heap location (pointed by BP).
     addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
                  .addReg(ScratchReg), PReg, false, SPLimitOffset);
-    BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_4)).addMBB(&prologueMBB);
+    BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&prologueMBB);
 
     // Create new MBB for IncStack:
     BuildMI(incStackMBB, DL, TII.get(CALLop)).
@@ -1704,7 +1960,7 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
                  SPReg, false, -MaxStack);
     addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
                  .addReg(ScratchReg), PReg, false, SPLimitOffset);
-    BuildMI(incStackMBB, DL, TII.get(X86::JLE_4)).addMBB(incStackMBB);
+    BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB);
 
     stackCheckMBB->addSuccessor(&prologueMBB, 99);
     stackCheckMBB->addSuccessor(incStackMBB, 1);
@@ -1719,50 +1975,45 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
 void X86FrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
-                                       MF.getSubtarget().getRegisterInfo());
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const X86RegisterInfo &RegInfo = *STI.getRegisterInfo();
   unsigned StackPtr = RegInfo.getStackRegister();
-  bool reseveCallFrame = hasReservedCallFrame(MF);
+  bool reserveCallFrame = hasReservedCallFrame(MF);
   int Opcode = I->getOpcode();
   bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
-  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool IsLP64 = STI.isTarget64BitLP64();
   DebugLoc DL = I->getDebugLoc();
-  uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0;
-  uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
+  uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
+  uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
   I = MBB.erase(I);
 
-  if (!reseveCallFrame) {
+  if (!reserveCallFrame) {
     // If the stack pointer can be changed after prologue, turn the
     // adjcallstackup instruction into a 'sub ESP, <amt>' and the
     // adjcallstackdown instruction into 'add ESP, <amt>'
-    // TODO: consider using push / pop instead of sub + store / add
     if (Amount == 0)
       return;
 
     // We need to keep the stack aligned properly.  To do this, we round the
     // amount of space needed for the outgoing arguments up to the next
     // alignment boundary.
-    unsigned StackAlign = MF.getTarget()
-                              .getSubtargetImpl()
-                              ->getFrameLowering()
-                              ->getStackAlignment();
-    Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
+    unsigned StackAlign = getStackAlignment();
+    Amount = RoundUpToAlignment(Amount, StackAlign);
 
     MachineInstr *New = nullptr;
-    if (Opcode == TII.getCallFrameSetupOpcode()) {
-      New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
-                    StackPtr)
-        .addReg(StackPtr)
-        .addImm(Amount);
-    } else {
-      assert(Opcode == TII.getCallFrameDestroyOpcode());
 
-      // Factor out the amount the callee already popped.
-      Amount -= CalleeAmt;
+    // Factor out the amount that gets handled inside the sequence
+    // (Pushes of argument for frame setup, callee pops for frame destroy)
+    Amount -= InternalAmt;
+
+    if (Amount) {
+      if (Opcode == TII.getCallFrameSetupOpcode()) {
+        New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr)
+          .addReg(StackPtr).addImm(Amount);
+      } else {
+        assert(Opcode == TII.getCallFrameDestroyOpcode());
 
-      if (Amount) {
         unsigned Opc = getADDriOpcode(IsLP64, Amount);
         New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
           .addReg(StackPtr).addImm(Amount);
@@ -1780,13 +2031,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
     return;
   }
 
-  if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
+  if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) {
     // If we are performing frame pointer elimination and if the callee pops
     // something off the stack pointer, add it back.  We do this until we have
     // more advanced stack pointer tracking ability.
-    unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
+    unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt);
     MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
-      .addReg(StackPtr).addImm(CalleeAmt);
+      .addReg(StackPtr).addImm(InternalAmt);
 
     // The EFLAGS implicit def is dead.
     New->getOperand(3).setIsDead();
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 7740c3a..542bbbc 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -18,18 +18,16 @@
 
 namespace llvm {
 
-class MCSymbol;
-class X86TargetMachine;
-class X86Subtarget;
-
 class X86FrameLowering : public TargetFrameLowering {
 public:
   explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO)
     : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {}
 
-  static void getStackProbeFunction(const X86Subtarget &STI,
-                                    unsigned &CallOp,
-                                    const char *&Symbol);
+  /// Emit a call to the target's stack probe function. This is required for all
+  /// large stack allocations on Windows. The caller is required to materialize
+  /// the number of bytes to probe in RAX/EAX.
+  static void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI, DebugLoc DL);
 
   void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
@@ -64,14 +62,30 @@ public:
 
   bool hasFP(const MachineFunction &MF) const override;
   bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+  bool needsFrameIndexResolution(const MachineFunction &MF) const override;
 
   int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
   int getFrameIndexReference(const MachineFunction &MF, int FI,
                              unsigned &FrameReg) const override;
 
+  int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const;
+  int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
+                                   unsigned &FrameReg) const override;
+
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                  MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI) const override;
+
+private:
+  /// convertArgMovsToPushes - This method tries to convert a call sequence
+  /// that uses sub and mov instructions to put the argument onto the stack
+  /// into a series of pushes.
+  /// Returns true if the transformation succeeded, false if not.
+  bool convertArgMovsToPushes(MachineFunction &MF, 
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I, 
+                              uint64_t Amount) const;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 3ef7b2c..8d50ae1 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -156,9 +156,7 @@ namespace {
 
   public:
     explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
-      : SelectionDAGISel(tm, OptLevel),
-        Subtarget(&tm.getSubtarget<X86Subtarget>()),
-        OptForSize(false) {}
+        : SelectionDAGISel(tm, OptLevel), OptForSize(false) {}
 
     const char *getPassName() const override {
       return "X86 DAG->DAG Instruction Selection";
@@ -166,7 +164,7 @@ namespace {
 
     bool runOnMachineFunction(MachineFunction &MF) override {
       // Reset the subtarget each time through.
-      Subtarget = &TM.getSubtarget<X86Subtarget>();
+      Subtarget = &MF.getSubtarget<X86Subtarget>();
       SelectionDAGISel::runOnMachineFunction(MF);
       return true;
     }
@@ -233,7 +231,7 @@ namespace {
                                       char ConstraintCode,
                                       std::vector<SDValue> &OutOps) override;
 
-    void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
+    void EmitSpecialCodeForMain();
 
     inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base,
                                    SDValue &Scale, SDValue &Index,
@@ -298,7 +296,7 @@ namespace {
     /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
     /// to the target-specific type.
     const X86InstrInfo *getInstrInfo() const {
-      return getTargetMachine().getSubtargetImpl()->getInstrInfo();
+      return Subtarget->getInstrInfo();
     }
 
     /// \brief Address-mode matching performs shift-of-and to and-of-shift
@@ -395,17 +393,14 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
     Ops.clear();
     Ops.push_back(NewChain);
   }
-  for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i)
-    Ops.push_back(OrigChain.getOperand(i));
+  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
   CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
   CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                              Load.getOperand(1), Load.getOperand(2));
 
-  unsigned NumOps = Call.getNode()->getNumOperands();
   Ops.clear();
   Ops.push_back(SDValue(Load.getNode(), 1));
-  for (unsigned i = 1, e = NumOps; i != e; ++i)
-    Ops.push_back(Call.getOperand(i));
+  Ops.append(Call->op_begin() + 1, Call->op_end());
   CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
 }
 
@@ -453,8 +448,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
 
 void X86DAGToDAGISel::PreprocessISelDAG() {
   // OptForSize is used in pattern predicates that isel is matching.
-  OptForSize = MF->getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  OptForSize = MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
 
   for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
        E = CurDAG->allnodes_end(); I != E; ) {
@@ -571,14 +565,18 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
 
 /// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
 /// the main function.
-void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
-                                             MachineFrameInfo *MFI) {
-  const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo();
+void X86DAGToDAGISel::EmitSpecialCodeForMain() {
   if (Subtarget->isTargetCygMing()) {
-    unsigned CallOp =
-      Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
-    BuildMI(BB, DebugLoc(),
-            TII->get(CallOp)).addExternalSymbol("__main");
+    TargetLowering::ArgListTy Args;
+
+    TargetLowering::CallLoweringInfo CLI(*CurDAG);
+    CLI.setChain(CurDAG->getRoot())
+        .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
+                   CurDAG->getExternalSymbol("__main", TLI->getPointerTy()),
+                   std::move(Args), 0);
+    const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
+    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+    CurDAG->setRoot(Result.second);
   }
 }
 
@@ -586,7 +584,7 @@ void X86DAGToDAGISel::EmitFunctionEntryCode() {
   // If this is main, emit special code for main.
   if (const Function *Fn = MF->getFunction())
     if (Fn->hasExternalLinkage() && Fn->getName() == "main")
-      EmitSpecialCodeForMain(MF->begin(), MF->getFrameInfo());
+      EmitSpecialCodeForMain();
 }
 
 static bool isDispSafeForFrameIndex(int64_t Val) {
@@ -918,7 +916,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
   if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
 
   // We also need to ensure that mask is a continuous run of bits.
-  if (CountTrailingOnes_64(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
+  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
 
   // Scale the leading zero count down based on the actual size of the value.
   // Also scale it down based on the size of the shift.
@@ -1891,8 +1889,8 @@ static bool HasNoSignedComparisonUses(SDNode *N) {
       case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
       case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
       case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
-      case X86::JA_4: case X86::JAE_4: case X86::JB_4: case X86::JBE_4:
-      case X86::JE_4: case X86::JNE_4: case X86::JP_4: case X86::JNP_4:
+      case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1:
+      case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1:
       case X86::CMOVA16rr: case X86::CMOVA16rm:
       case X86::CMOVA32rr: case X86::CMOVA32rm:
       case X86::CMOVA64rr: case X86::CMOVA64rm:
@@ -2504,7 +2502,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
           SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
       } else {
         // Zero out the high part, effectively zero extending the input.
-        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);       
+        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
         switch (NVT.SimpleTy) {
         case MVT::i16:
           ClrNode =
@@ -2612,26 +2610,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     SDValue N1 = Node->getOperand(1);
 
     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
-        HasNoSignedComparisonUses(Node)) {
-      // Look for (X86cmp (truncate $op, i1), 0) and try to convert to a
-      // smaller encoding
-      if (Opcode == X86ISD::CMP && N0.getValueType() == MVT::i1 &&
-          X86::isZeroNode(N1)) {
-        SDValue Reg = N0.getOperand(0);
-        SDValue Imm = CurDAG->getTargetConstant(1, MVT::i8);
-
-        // Emit testb
-        if (Reg.getScalarValueSizeInBits() > 8)
-          Reg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Reg);
-        // Emit a testb.
-        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
-                                                Reg, Imm);
-        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
-        return nullptr;
-      }
-
+        HasNoSignedComparisonUses(Node))
       N0 = N0.getOperand(0);
-    }
+
     // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
     // use a smaller encoding.
     // Look past the truncate if CMP is the only use of it.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f05b6c6..6866be7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15,6 +15,7 @@
 #include "X86ISelLowering.h"
 #include "Utils/X86ShuffleDecode.h"
 #include "X86CallingConv.h"
+#include "X86FrameLowering.h"
 #include "X86InstrBuilder.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86TargetMachine.h"
@@ -66,11 +67,6 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
              "rather than promotion."),
     cl::Hidden);
 
-static cl::opt<bool> ExperimentalVectorShuffleLowering(
-    "x86-experimental-vector-shuffle-lowering", cl::init(true),
-    cl::desc("Enable an experimental vector shuffle lowering code path."),
-    cl::Hidden);
-
 static cl::opt<int> ReciprocalEstimateRefinementSteps(
     "x86-recip-refinement-steps", cl::init(1),
     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
@@ -107,21 +103,18 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
   // If the input is a buildvector just emit a smaller one.
   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
-                       makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+                       makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
                                     ElemsPerChunk));
 
   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
-  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
-                               VecIdx);
-
-  return Result;
-
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
 }
+
 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
 /// instructions or a simple subregister reference. Idx is an index in the
-/// 128 bits we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
 /// lowering EXTRACT_VECTOR_ELT operations easier.
 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                    SelectionDAG &DAG, SDLoc dl) {
@@ -158,25 +151,23 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
                                * ElemsPerChunk);
 
   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
-  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
-                     VecIdx);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
 }
+
 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
 /// simple superregister reference.  Idx is an index in the 128 bits
-/// we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// we want.  It need not be aligned to a 128-bit boundary.  That makes
 /// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
-                                  unsigned IdxVal, SelectionDAG &DAG,
-                                  SDLoc dl) {
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                  SelectionDAG &DAG,SDLoc dl) {
   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 }
 
-static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
-                                  unsigned IdxVal, SelectionDAG &DAG,
-                                  SDLoc dl) {
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                  SelectionDAG &DAG, SDLoc dl) {
   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 }
@@ -199,44 +190,23 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
 }
 
-// FIXME: This should stop caching the target machine as soon as
-// we can remove resetOperationActions et al.
-X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
-    : TargetLowering(TM) {
-  Subtarget = &TM.getSubtarget<X86Subtarget>();
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
+                                     const X86Subtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
   X86ScalarSSEf64 = Subtarget->hasSSE2();
   X86ScalarSSEf32 = Subtarget->hasSSE1();
   TD = getDataLayout();
 
-  resetOperationActions();
-}
-
-void X86TargetLowering::resetOperationActions() {
-  const TargetMachine &TM = getTargetMachine();
-  static bool FirstTimeThrough = true;
-
-  // If none of the target options have changed, then we don't need to reset the
-  // operation actions.
-  if (!FirstTimeThrough && TO == TM.Options) return;
-
-  if (!FirstTimeThrough) {
-    // Reinitialize the actions.
-    initActions();
-    FirstTimeThrough = false;
-  }
-
-  TO = TM.Options;
-
   // Set up the TargetLowering object.
   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
 
-  // X86 is weird, it always uses i8 for shift amounts and setcc results.
+  // X86 is weird. It always uses i8 for shift amounts and setcc results.
   setBooleanContents(ZeroOrOneBooleanContent);
   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
-  // For 64-bit since we have so many registers use the ILP scheduler, for
-  // 32-bit code use the register pressure specific scheduling.
+  // For 64-bit, since we have so many registers, use the ILP scheduler.
+  // For 32-bit, use the register pressure specific scheduling.
   // For Atom, always use ILP scheduling.
   if (Subtarget->isAtom())
     setSchedulingPreference(Sched::ILP);
@@ -244,14 +214,14 @@ void X86TargetLowering::resetOperationActions() {
     setSchedulingPreference(Sched::ILP);
   else
     setSchedulingPreference(Sched::RegPressure);
-  const X86RegisterInfo *RegInfo =
-      TM.getSubtarget<X86Subtarget>().getRegisterInfo();
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 
-  // Bypass expensive divides on Atom when compiling with O2
-  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
-    addBypassSlowDiv(32, 8);
-    if (Subtarget->is64Bit())
+  // Bypass expensive divides on Atom when compiling with O2.
+  if (TM.getOptLevel() >= CodeGenOpt::Default) {
+    if (Subtarget->hasSlowDivide32())
+      addBypassSlowDiv(32, 8);
+    if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
       addBypassSlowDiv(64, 16);
   }
 
@@ -296,7 +266,8 @@ void X86TargetLowering::resetOperationActions() {
   if (Subtarget->is64Bit())
     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  for (MVT VT : MVT::integer_valuetypes())
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 
   // We don't accept any truncstore of integer registers.
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
@@ -521,7 +492,9 @@ void X86TargetLowering::resetOperationActions() {
   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 
-  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
@@ -805,9 +778,7 @@ void X86TargetLowering::resetOperationActions() {
   // First set operation action for all vector types to either promote
   // (for widening) or expand (for scalarization). Then we will selectively
   // turn on ones that can be effectively codegen'd.
-  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
-           i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
-    MVT VT = (MVT::SimpleValueType)i;
+  for (MVT VT : MVT::vector_valuetypes()) {
     setOperationAction(ISD::ADD , VT, Expand);
     setOperationAction(ISD::SUB , VT, Expand);
     setOperationAction(ISD::FADD, VT, Expand);
@@ -876,18 +847,19 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
     setOperationAction(ISD::VSELECT, VT, Expand);
     setOperationAction(ISD::SELECT_CC, VT, Expand);
-    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
-             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
-      setTruncStoreAction(VT,
-                          (MVT::SimpleValueType)InnerVT, Expand);
-    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
-    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+    for (MVT InnerVT : MVT::vector_valuetypes()) {
+      setTruncStoreAction(InnerVT, VT, Expand);
+
+      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 
-    // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
-    // we have to deal with them whether we ask for Expansion or not. Setting
-    // Expand causes its own optimisation problems though, so leave them legal.
-    if (VT.getVectorElementType() == MVT::i1)
-      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
+      // types, we have to deal with them whether we ask for Expansion or not.
+      // Setting Expand causes its own optimisation problems though, so leave
+      // them legal.
+      if (VT.getVectorElementType() == MVT::i1)
+        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+    }
   }
 
   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
@@ -942,6 +914,7 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
@@ -991,6 +964,14 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 
+    // Only provide customized ctpop vector bit twiddling for vector types we
+    // know to perform better than using the popcnt instructions on each vector
+    // element. If popcnt isn't supported, always provide the custom version.
+    if (!Subtarget->hasPOPCNT()) {
+      setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
+      setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
+    }
+
     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
       MVT VT = (MVT::SimpleValueType)i;
@@ -1002,6 +983,7 @@ void X86TargetLowering::resetOperationActions() {
         continue;
       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::VSELECT,            VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
 
@@ -1009,20 +991,24 @@ void X86TargetLowering::resetOperationActions() {
     // memory vector types which we can load as a scalar (or sequence of
     // scalars) and extend in-register to a legal 128-bit vector type. For sext
     // loads these must work with a single scalar load.
-    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
-    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
-    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
+    for (MVT VT : MVT::integer_vector_valuetypes()) {
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
+    }
 
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
 
@@ -1070,7 +1056,8 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
 
-    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
+    for (MVT VT : MVT::fp_vector_valuetypes())
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
 
     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
@@ -1103,20 +1090,32 @@ void X86TargetLowering::resetOperationActions() {
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
 
-    setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
-    setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
-    setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
-    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
-    setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
-    // There is no BLENDI for byte vectors. We don't need to custom lower
-    // some vselects for now.
+    // We directly match byte blends in the backend as they match the VSELECT
+    // condition form.
     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
 
     // SSE41 brings specific instructions for doing vector sign extend even in
     // cases where we don't have SRA.
-    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
-    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
-    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
+    for (MVT VT : MVT::integer_vector_valuetypes()) {
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
+    }
+
+    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
 
     // i8 and i16 vectors are custom because the source register and source
     // source memory operand types are not the same width.  f32 vectors are
@@ -1212,7 +1211,8 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
 
-    setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
+    for (MVT VT : MVT::fp_vector_valuetypes())
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
 
     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
@@ -1232,11 +1232,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
 
-    setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
-    setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
-    setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
-    setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
-
     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
@@ -1280,12 +1275,34 @@ void X86TargetLowering::resetOperationActions() {
       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
 
-      setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
-      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
-
       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
       // when we have a 256bit-wide blend with immediate.
       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+
+      // Only provide customized ctpop vector bit twiddling for vector types we
+      // know to perform better than using the popcnt instructions on each
+      // vector element. If popcnt isn't supported, always provide the custom
+      // version.
+      if (!Subtarget->hasPOPCNT())
+        setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
+
+      // Custom CTPOP always performs better on natively supported v8i32
+      setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
+
+      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
+
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
     } else {
       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
@@ -1314,21 +1331,23 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
 
     // Custom lower several nodes for 256-bit types.
-    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
-             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
-      MVT VT = (MVT::SimpleValueType)i;
-
+    for (MVT VT : MVT::vector_valuetypes()) {
+      if (VT.getScalarSizeInBits() >= 32) {
+        setOperationAction(ISD::MLOAD,  VT, Legal);
+        setOperationAction(ISD::MSTORE, VT, Legal);
+      }
       // Extract subvector is special because the value type
       // (result) is 128-bit but the source is 256-bit wide.
-      if (VT.is128BitVector())
+      if (VT.is128BitVector()) {
         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+      }
       // Do not attempt to custom lower other non-256-bit vectors
       if (!VT.is256BitVector())
         continue;
 
       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::VSELECT,            VT, Custom);
       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
@@ -1336,6 +1355,10 @@ void X86TargetLowering::resetOperationActions() {
       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
     }
 
+    if (Subtarget->hasInt256())
+      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
+
+
     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
       MVT VT = (MVT::SimpleValueType)i;
@@ -1367,12 +1390,14 @@ void X86TargetLowering::resetOperationActions() {
     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
 
+    for (MVT VT : MVT::fp_vector_valuetypes())
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
+
     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
     setOperationAction(ISD::AND,                MVT::i1,    Legal);
-    setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
@@ -1434,6 +1459,17 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
 
+    setOperationAction(ISD::FFLOOR,             MVT::v16f32, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v8f64, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v16f32, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v8f64, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v16f32, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v8f64, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v16f32, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v8f64, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v16f32, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v8f64, Legal);
+
     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
@@ -1486,16 +1522,13 @@ void X86TargetLowering::resetOperationActions() {
     }
 
     // Custom lower several nodes.
-    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
-             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
-      MVT VT = (MVT::SimpleValueType)i;
-
+    for (MVT VT : MVT::vector_valuetypes()) {
       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
       // Extract subvector is special because the value type
       // (result) is 256/128-bit but the source is 512-bit wide.
-      if (VT.is128BitVector() || VT.is256BitVector())
+      if (VT.is128BitVector() || VT.is256BitVector()) {
         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+      }
       if (VT.getVectorElementType() == MVT::i1)
         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
 
@@ -1511,12 +1544,14 @@ void X86TargetLowering::resetOperationActions() {
         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
+        setOperationAction(ISD::MLOAD,               VT, Legal);
+        setOperationAction(ISD::MSTORE,              VT, Legal);
       }
     }
     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
       MVT VT = (MVT::SimpleValueType)i;
 
-      // Do not attempt to promote non-256-bit vectors
+      // Do not attempt to promote non-512-bit vectors.
       if (!VT.is512BitVector())
         continue;
 
@@ -1536,17 +1571,22 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
+    setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
+    setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
+    setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
 
     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
       const MVT VT = (MVT::SimpleValueType)i;
 
       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
 
-      // Do not attempt to promote non-256-bit vectors
+      // Do not attempt to promote non-512-bit vectors.
       if (!VT.is512BitVector())
         continue;
 
-      if ( EltSize < 32) {
+      if (EltSize < 32) {
         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
         setOperationAction(ISD::VSELECT,             VT, Legal);
       }
@@ -1560,14 +1600,13 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
-  }
 
-  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
-  // of this type with custom code.
-  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
-           VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
-    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
-                       Custom);
+    setOperationAction(ISD::AND,                MVT::v8i32, Legal);
+    setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
+    setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
+    setOperationAction(ISD::AND,                MVT::v4i32, Legal);
+    setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
+    setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
   }
 
   // We want to custom lower some of our intrinsics.
@@ -1607,9 +1646,8 @@ void X86TargetLowering::resetOperationActions() {
     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
     setLibcallName(RTLIB::SINCOS_F64, "sincos");
     if (Subtarget->isTargetDarwin()) {
-      // For MacOSX, we don't want to the normal expansion of a libcall to
-      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
-      // traffic.
+      // For MacOSX, we don't want the normal expansion of a libcall to sincos.
+      // We want to issue a libcall to __sincos_stret to avoid memory traffic.
       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
     }
@@ -1627,6 +1665,7 @@ void X86TargetLowering::resetOperationActions() {
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::BITCAST);
   setTargetDAGCombine(ISD::VSELECT);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SHL);
@@ -1640,7 +1679,9 @@ void X86TargetLowering::resetOperationActions() {
   setTargetDAGCombine(ISD::FMA);
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::MLOAD);
   setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::MSTORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
@@ -1650,11 +1691,10 @@ void X86TargetLowering::resetOperationActions() {
   setTargetDAGCombine(ISD::SETCC);
   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
   setTargetDAGCombine(ISD::BUILD_VECTOR);
-  if (Subtarget->is64Bit())
-    setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::XOR);
 
-  computeRegisterProperties();
+  computeRegisterProperties(Subtarget->getRegisterInfo());
 
   // On Darwin, -Os means optimize for size without hurting performance,
   // do not reduce the limit.
@@ -1668,7 +1708,7 @@ void X86TargetLowering::resetOperationActions() {
 
   // Predictable cmov don't hurt on atom because it's in-order.
   PredictableSelectIsExpensive = !Subtarget->isAtom();
-
+  EnableExtLdPromotion = true;
   setPrefFunctionAlignment(4); // 2^4 bytes.
 
   verifyIntrinsicTables();
@@ -1676,8 +1716,7 @@ void X86TargetLowering::resetOperationActions() {
 
 // This has so far only been implemented for 64-bit MachO.
 bool X86TargetLowering::useLoadStackGuardNode() const {
-  return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO &&
-         Subtarget->is64Bit();
+  return Subtarget->isTargetMachO() && Subtarget->is64Bit();
 }
 
 TargetLoweringBase::LegalizeTypeAction
@@ -1733,7 +1772,7 @@ EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   return VT.changeVectorElementTypeToInteger();
 }
 
-/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// Helper for getByValTypeAlignment to determine
 /// the desired ByVal argument alignment.
 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   if (MaxAlign == 16)
@@ -1758,7 +1797,7 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   }
 }
 
-/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// Return the desired alignment for ByVal aggregate
 /// function arguments in the caller parameter area. For X86, aggregates
 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
 /// are at 4-byte boundaries.
@@ -1777,7 +1816,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
   return Align;
 }
 
-/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// Returns the target specific optimal type for load
 /// and store operations as a result of memset, memcpy, and memmove
 /// lowering. If DstAlign is zero that means it's safe to destination
 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
@@ -1796,8 +1835,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                        MachineFunction &MF) const {
   const Function *F = MF.getFunction();
   if ((!IsMemset || ZeroMemset) &&
-      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat)) {
+      !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
     if (Size >= 16 &&
         (Subtarget->isUnalignedMemAccessFast() ||
          ((DstAlign == 0 || DstAlign >= 16) &&
@@ -1843,7 +1881,7 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   return true;
 }
 
-/// getJumpTableEncoding - Return the entry encoding for a jump table in the
+/// Return the entry encoding for a jump table in the
 /// current function.  The returned value is a member of the
 /// MachineJumpTableInfo::JTEntryKind enum.
 unsigned X86TargetLowering::getJumpTableEncoding() const {
@@ -1869,8 +1907,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
 }
 
-/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
-/// jumptable.
+/// Returns relocation base for the given PIC jumptable.
 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                     SelectionDAG &DAG) const {
   if (!Subtarget->is64Bit())
@@ -1880,9 +1917,8 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
   return Table;
 }
 
-/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
-/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
-/// MCExpr.
+/// This returns the relocation base for the given PIC jumptable,
+/// the same as getPICJumpTableRelocBase, but as an MCExpr.
 const MCExpr *X86TargetLowering::
 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                              MCContext &Ctx) const {
@@ -1894,14 +1930,14 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
 }
 
-// FIXME: Why this routine is here? Move to RegInfo!
-std::pair<const TargetRegisterClass*, uint8_t>
-X86TargetLowering::findRepresentativeClass(MVT VT) const{
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                           MVT VT) const {
   const TargetRegisterClass *RRC = nullptr;
   uint8_t Cost = 1;
   switch (VT.SimpleTy) {
   default:
-    return TargetLowering::findRepresentativeClass(VT);
+    return TargetLowering::findRepresentativeClass(TRI, VT);
   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
     break;
@@ -1994,7 +2030,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
     SDValue ValToCopy = OutVals[i];
     EVT ValVT = ValToCopy.getValueType();
 
-    // Promote values to the appropriate types
+    // Promote values to the appropriate types.
     if (VA.getLocInfo() == CCValAssign::SExt)
       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
     else if (VA.getLocInfo() == CCValAssign::ZExt)
@@ -2005,7 +2041,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
 
     assert(VA.getLocInfo() != CCValAssign::FPExt &&
-           "Unexpected FP-extend for return value.");  
+           "Unexpected FP-extend for return value.");
 
     // If this is x86-64, and we disabled SSE, we can't return FP values,
     // or SSE or MMX vectors.
@@ -2060,14 +2096,15 @@ X86TargetLowering::LowerReturn(SDValue Chain,
   // Win32 requires us to put the sret argument to %eax as well.
   // We saved the argument into a virtual register in the entry block,
   // so now we copy the value out and into %rax/%eax.
-  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
-      (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
-    MachineFunction &MF = DAG.getMachineFunction();
-    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
-    unsigned Reg = FuncInfo->getSRetReturnReg();
-    assert(Reg &&
-           "SRetReturnReg should have been set in LowerFormalArguments().");
-    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+  //
+  // Checking Function.hasStructRetAttr() here is insufficient because the IR
+  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
+  // false, then an sret argument may be implicitly inserted in the SelDAG. In
+  // either case FuncInfo->setSRetReturnReg() will have been called.
+  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+    assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
+           "No need for an sret register");
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
 
     unsigned RetValReg
         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
@@ -2141,7 +2178,7 @@ X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
   return VT.bitsLT(MinVT) ? MinVT : VT;
 }
 
-/// LowerCallResult - Lower the result values of a call into the
+/// Lower the result values of a call into the
 /// appropriate copies out of appropriate physical registers.
 ///
 SDValue
@@ -2221,8 +2258,7 @@ callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
   return StackStructReturn;
 }
 
-/// ArgsAreStructReturn - Determines whether a function uses struct
-/// return semantics.
+/// Determines whether a function uses struct return semantics.
 static StructReturnType
 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
   if (Ins.empty())
@@ -2236,10 +2272,9 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
   return StackStructReturn;
 }
 
-/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
-/// by "Src" to address "Dst" with size and alignment information specified by
-/// the specific parameter attribute. The copy will be passed as a byval
-/// function parameter.
+/// Make a copy of an aggregate at address specified by "Src" to address
+/// "Dst" with size and alignment information specified by the specific
+/// parameter attribute. The copy will be passed as a byval function parameter.
 static SDValue
 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
@@ -2251,7 +2286,7 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                        MachinePointerInfo(), MachinePointerInfo());
 }
 
-/// IsTailCallConvention - Return true if the calling convention is one that
+/// Return true if the calling convention is one that
 /// supports tail call optimization.
 static bool IsTailCallConvention(CallingConv::ID CC) {
   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
@@ -2276,7 +2311,7 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   return true;
 }
 
-/// FuncIsMadeTailCallSafe - Return true if the function is being made into
+/// Return true if the function is being made into
 /// a tailcall target by changing its ABI.
 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
                                    bool GuaranteedTailCallOpt) {
@@ -2356,8 +2391,7 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
   }
 
   const Function *Fn = MF.getFunction();
-  bool NoImplicitFloatOps = Fn->getAttributes().
-      hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
          "SSE register cannot be used when SSE is disabled!");
   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
@@ -2523,18 +2557,19 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
         MFI->CreateFixedObject(1, StackSize, true));
   }
 
+  // Figure out if XMM registers are in use.
+  assert(!(MF.getTarget().Options.UseSoftFloat &&
+           Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
+         "SSE register cannot be used when SSE is disabled!");
+
   // 64-bit calling conventions support varargs and register parameters, so we
-  // have to do extra work to spill them in the prologue or forward them to
-  // musttail calls.
-  if (Is64Bit && isVarArg &&
-      (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
+  // have to do extra work to spill them in the prologue.
+  if (Is64Bit && isVarArg && MFI->hasVAStart()) {
     // Find the first unallocated argument registers.
     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
-    unsigned NumIntRegs =
-        CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
-    unsigned NumXMMRegs =
-        CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
+    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
            "SSE register cannot be used when SSE is disabled!");
 
@@ -2557,90 +2592,99 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
       }
     }
 
-    // Store them to the va_list returned by va_start.
-    if (MFI->hasVAStart()) {
-      if (IsWin64) {
-        const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
-        // Get to the caller-allocated home save location.  Add 8 to account
-        // for the return address.
-        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
-        FuncInfo->setRegSaveFrameIndex(
+    if (IsWin64) {
+      const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+      // Get to the caller-allocated home save location.  Add 8 to account
+      // for the return address.
+      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+      FuncInfo->setRegSaveFrameIndex(
           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
-        // Fixup to set vararg frame on shadow area (4 x i64).
-        if (NumIntRegs < 4)
-          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
-      } else {
-        // For X86-64, if there are vararg parameters that are passed via
-        // registers, then we must store them to their spots on the stack so
-        // they may be loaded by deferencing the result of va_next.
-        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
-        FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
-        FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
-            ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
-      }
-
-      // Store the integer parameter registers.
-      SmallVector<SDValue, 8> MemOps;
-      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
-                                        getPointerTy());
-      unsigned Offset = FuncInfo->getVarArgsGPOffset();
-      for (SDValue Val : LiveGPRs) {
-        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
-                                  DAG.getIntPtrConstant(Offset));
-        SDValue Store =
-          DAG.getStore(Val.getValue(1), dl, Val, FIN,
-                       MachinePointerInfo::getFixedStack(
-                         FuncInfo->getRegSaveFrameIndex(), Offset),
-                       false, false, 0);
-        MemOps.push_back(Store);
-        Offset += 8;
-      }
-
-      if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
-        // Now store the XMM (fp + vector) parameter registers.
-        SmallVector<SDValue, 12> SaveXMMOps;
-        SaveXMMOps.push_back(Chain);
-        SaveXMMOps.push_back(ALVal);
-        SaveXMMOps.push_back(DAG.getIntPtrConstant(
-                               FuncInfo->getRegSaveFrameIndex()));
-        SaveXMMOps.push_back(DAG.getIntPtrConstant(
-                               FuncInfo->getVarArgsFPOffset()));
-        SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
-                          LiveXMMRegs.end());
-        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
-                                     MVT::Other, SaveXMMOps));
-      }
-
-      if (!MemOps.empty())
-        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+      // Fixup to set vararg frame on shadow area (4 x i64).
+      if (NumIntRegs < 4)
+        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
     } else {
-      // Add all GPRs, al, and XMMs to the list of forwards.  We will add then
-      // to the liveout set on a musttail call.
-      assert(MFI->hasMustTailInVarArgFunc());
-      auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
-      typedef X86MachineFunctionInfo::Forward Forward;
-
-      for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
-        unsigned VReg =
-            MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
-        Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
-        Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
-      }
-
-      if (!ArgXMMs.empty()) {
-        unsigned ALVReg =
-            MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
-        Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
-        Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
-
-        for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
-          unsigned VReg =
-              MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
-          Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
-          Forwards.push_back(
-              Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
-        }
-      }
+      // For X86-64, if there are vararg parameters that are passed via
+      // registers, then we must store them to their spots on the stack so
+      // they may be loaded by deferencing the result of va_next.
+      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+      FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
+          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
+    }
+
+    // Store the integer parameter registers.
+    SmallVector<SDValue, 8> MemOps;
+    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+                                      getPointerTy());
+    unsigned Offset = FuncInfo->getVarArgsGPOffset();
+    for (SDValue Val : LiveGPRs) {
+      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+                                DAG.getIntPtrConstant(Offset));
+      SDValue Store =
+        DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                     MachinePointerInfo::getFixedStack(
+                       FuncInfo->getRegSaveFrameIndex(), Offset),
+                     false, false, 0);
+      MemOps.push_back(Store);
+      Offset += 8;
+    }
+
+    if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
+      // Now store the XMM (fp + vector) parameter registers.
+      SmallVector<SDValue, 12> SaveXMMOps;
+      SaveXMMOps.push_back(Chain);
+      SaveXMMOps.push_back(ALVal);
+      SaveXMMOps.push_back(DAG.getIntPtrConstant(
+                             FuncInfo->getRegSaveFrameIndex()));
+      SaveXMMOps.push_back(DAG.getIntPtrConstant(
+                             FuncInfo->getVarArgsFPOffset()));
+      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+                        LiveXMMRegs.end());
+      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+                                   MVT::Other, SaveXMMOps));
+    }
+
+    if (!MemOps.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+  }
+
+  if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
+    // Find the largest legal vector type.
+    MVT VecVT = MVT::Other;
+    // FIXME: Only some x86_32 calling conventions support AVX512.
+    if (Subtarget->hasAVX512() &&
+        (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
+                     CallConv == CallingConv::Intel_OCL_BI)))
+      VecVT = MVT::v16f32;
+    else if (Subtarget->hasAVX())
+      VecVT = MVT::v8f32;
+    else if (Subtarget->hasSSE2())
+      VecVT = MVT::v4f32;
+
+    // We forward some GPRs and some vector types.
+    SmallVector<MVT, 2> RegParmTypes;
+    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
+    RegParmTypes.push_back(IntVT);
+    if (VecVT != MVT::Other)
+      RegParmTypes.push_back(VecVT);
+
+    // Compute the set of forwarded registers. The rest are scratch.
+    SmallVectorImpl<ForwardedRegister> &Forwards =
+        FuncInfo->getForwardedMustTailRegParms();
+    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+    // Conservatively forward AL on x86_64, since it might be used for varargs.
+    if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
+      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+    }
+
+    // Copy all forwards from physical to virtual registers.
+    for (ForwardedRegister &F : Forwards) {
+      // FIXME: Can we use a less constrained schedule?
+      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+      F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
+      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
     }
   }
 
@@ -2688,7 +2732,7 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                       false, false, 0);
 }
 
-/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
+/// Emit a load of return address if tail call
 /// optimization is performed and it is required.
 SDValue
 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
@@ -2705,7 +2749,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   return SDValue(OutRetAddr.getNode(), 1);
 }
 
-/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
+/// Emit a store of the return address if tail call
 /// optimization is performed and it is required (FPDiff!=0).
 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
                                         SDValue Chain, SDValue RetAddrFrIdx,
@@ -2838,8 +2882,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   // of tail call optimization arguments are handle later.
-  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-      DAG.getSubtarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     // Skip inalloca arguments, they have already been written.
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -2952,7 +2995,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
     };
-    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
     assert((Subtarget->hasSSE1() || !NumXMMRegs)
            && "SSE registers cannot be used when SSE is disabled");
 
@@ -2960,7 +3003,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   }
 
-  if (Is64Bit && isVarArg && IsMustTail) {
+  if (isVarArg && IsMustTail) {
     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
     for (const auto &F : Forwards) {
       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
@@ -3044,10 +3087,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // through a register, since the call instruction's 32-bit
     // pc-relative offset may not be large enough to hold the whole
     // address.
-  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+  } else if (Callee->getOpcode() == ISD::GlobalAddress) {
     // If the callee is a GlobalAddress node (quite common, every direct call
     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
     // it.
+    GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
 
     // We should use extra load for direct calls to dllimported functions in
     // non-JIT mode.
@@ -3073,11 +3117,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         // unless we're building with the leopard linker or later, which
         // automatically synthesizes these stubs.
         OpFlags = X86II::MO_DARWIN_STUB;
-      } else if (Subtarget->isPICStyleRIPRel() &&
-                 isa<Function>(GV) &&
-                 cast<Function>(GV)->getAttributes().
-                   hasAttribute(AttributeSet::FunctionIndex,
-                                Attribute::NonLazyBind)) {
+      } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
+                 cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
         // If the function is marked as non-lazy, generate an indirect call
         // which loads from the GOT directly. This avoids runtime overhead
         // at the cost of eager binding (and one extra byte of encoding).
@@ -3117,7 +3158,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                          OpFlags);
-  } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
+  } else if (Subtarget->isTarget64BitILP32() &&
+             Callee->getValueType(0) == MVT::i32) {
     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
   }
@@ -3146,7 +3188,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                   RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3235,11 +3277,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 unsigned
 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                                SelectionDAG& DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  const TargetMachine &TM = MF.getTarget();
-  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-      TM.getSubtargetImpl()->getRegisterInfo());
-  const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+  const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   unsigned StackAlignment = TFI.getStackAlignment();
   uint64_t AlignMask = StackAlignment - 1;
   int64_t Offset = StackSize;
@@ -3276,7 +3315,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
         return false;
     } else {
       unsigned Opcode = Def->getOpcode();
-      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+           Opcode == X86::LEA64_32r) &&
           Def->getOperand(1).isFI()) {
         FI = Def->getOperand(1).getIndex();
         Bytes = Flags.getByValSize();
@@ -3341,6 +3381,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
 
+  // Win64 functions have extra shadow space for argument homing. Don't do the
+  // sibcall if the caller and callee have mismatched expectations for this
+  // space.
+  if (IsCalleeWin64 != IsCallerWin64)
+    return false;
+
   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
     if (IsTailCallConvention(CalleeCC) && CCMatch)
       return true;
@@ -3352,8 +3398,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
 
   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   // emit a special epilogue.
-  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-      DAG.getSubtarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   if (RegInfo->needsStackRealignment(MF))
     return false;
 
@@ -3465,8 +3510,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
       // the caller's fixed stack objects.
       MachineFrameInfo *MFI = MF.getFrameInfo();
       const MachineRegisterInfo *MRI = &MF.getRegInfo();
-      const X86InstrInfo *TII =
-          static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
+      const X86InstrInfo *TII = Subtarget->getInstrInfo();
       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
         CCValAssign &VA = ArgLocs[i];
         SDValue Arg = OutVals[i];
@@ -3494,7 +3538,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
       // In PIC we need an extra register to formulate the address computation
       // for the callee.
       unsigned MaxInRegs =
-	(DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+        (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
 
       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
         CCValAssign &VA = ArgLocs[i];
@@ -3563,17 +3607,6 @@ static bool isTargetShuffle(unsigned Opcode) {
 }
 
 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
-                                    SDValue V1, SelectionDAG &DAG) {
-  switch(Opc) {
-  default: llvm_unreachable("Unknown x86 shuffle node");
-  case X86ISD::MOVSHDUP:
-  case X86ISD::MOVSLDUP:
-  case X86ISD::MOVDDUP:
-    return DAG.getNode(Opc, dl, VT, V1);
-  }
-}
-
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
                                     SDValue V1, unsigned TargetMask,
                                     SelectionDAG &DAG) {
   switch(Opc) {
@@ -3588,20 +3621,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
 }
 
 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
-                                    SDValue V1, SDValue V2, unsigned TargetMask,
-                                    SelectionDAG &DAG) {
-  switch(Opc) {
-  default: llvm_unreachable("Unknown x86 shuffle node");
-  case X86ISD::PALIGNR:
-  case X86ISD::VALIGN:
-  case X86ISD::SHUFP:
-  case X86ISD::VPERM2X128:
-    return DAG.getNode(Opc, dl, VT, V1, V2,
-                       DAG.getConstant(TargetMask, MVT::i8));
-  }
-}
-
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
   switch(Opc) {
   default: llvm_unreachable("Unknown x86 shuffle node");
@@ -3620,8 +3639,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
 
 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
-  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-      DAG.getSubtarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   int ReturnAddrIndex = FuncInfo->getRAIndex();
 
@@ -3661,7 +3679,7 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
   // For kernel code model we know that all object resist in the negative half
   // of 32bits address space. We may not accept negative offsets, since they may
   // be just off and we may accept pretty large positive ones.
-  if (M == CodeModel::Kernel && Offset > 0)
+  if (M == CodeModel::Kernel && Offset >= 0)
     return true;
 
   return false;
@@ -3823,6 +3841,18 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   return false;
 }
 
+bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
+                                              ISD::LoadExtType ExtTy,
+                                              EVT NewVT) const {
+  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
+  // relocation target a movq or addq instruction: don't let the load shrink.
+  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
+  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
+    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
+      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+  return true;
+}
+
 /// \brief Returns true if it is beneficial to convert a load of a constant
 /// to just the constant itself.
 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
@@ -3835,6 +3865,24 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   return true;
 }
 
+bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
+                                                unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
+bool X86TargetLowering::isCheapToSpeculateCttz() const {
+  // Speculate cttz only if we can directly use TZCNT.
+  return Subtarget->hasBMI();
+}
+
+bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+  // Speculate ctlz only if we can directly use LZCNT.
+  return Subtarget->hasLZCNT();
+}
+
 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
 /// the specified range (L, H].
 static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -3849,7 +3897,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
 
 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
 /// from position Pos and ending in Pos+Size, falls within the specified
-/// sequential range (L, L+Pos]. or is undef.
+/// sequential range (Low, Low+Size]. or is undef.
 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                        unsigned Pos, unsigned Size, int Low) {
   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
@@ -3858,176 +3906,6 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
   return true;
 }
 
-/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFD. That is, it doesn't reference the other
-/// operand - by default will match for first operand.
-static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
-                         bool TestSecondOperand = false) {
-  if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
-      VT != MVT::v2f64 && VT != MVT::v2i64)
-    return false;
-
-  unsigned NumElems = VT.getVectorNumElements();
-  unsigned Lo = TestSecondOperand ? NumElems : 0;
-  unsigned Hi = Lo + NumElems;
-
-  for (unsigned i = 0; i < NumElems; ++i)
-    if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
-      return false;
-
-  return true;
-}
-
-/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
-  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
-    return false;
-
-  // Lower quadword copied in order or undef.
-  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
-    return false;
-
-  // Upper quadword shuffled.
-  for (unsigned i = 4; i != 8; ++i)
-    if (!isUndefOrInRange(Mask[i], 4, 8))
-      return false;
-
-  if (VT == MVT::v16i16) {
-    // Lower quadword copied in order or undef.
-    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
-      return false;
-
-    // Upper quadword shuffled.
-    for (unsigned i = 12; i != 16; ++i)
-      if (!isUndefOrInRange(Mask[i], 12, 16))
-        return false;
-  }
-
-  return true;
-}
-
-/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
-  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
-    return false;
-
-  // Upper quadword copied in order.
-  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
-    return false;
-
-  // Lower quadword shuffled.
-  for (unsigned i = 0; i != 4; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, 4))
-      return false;
-
-  if (VT == MVT::v16i16) {
-    // Upper quadword copied in order.
-    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
-      return false;
-
-    // Lower quadword shuffled.
-    for (unsigned i = 8; i != 12; ++i)
-      if (!isUndefOrInRange(Mask[i], 8, 12))
-        return false;
-  }
-
-  return true;
-}
-
-/// \brief Return true if the mask specifies a shuffle of elements that is
-/// suitable for input to intralane (palignr) or interlane (valign) vector
-/// right-shift.
-static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
-
-  // Do not handle 64-bit element shuffles with palignr.
-  if (NumLaneElts == 2)
-    return false;
-
-  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
-    unsigned i;
-    for (i = 0; i != NumLaneElts; ++i) {
-      if (Mask[i+l] >= 0)
-        break;
-    }
-
-    // Lane is all undef, go to next lane
-    if (i == NumLaneElts)
-      continue;
-
-    int Start = Mask[i+l];
-
-    // Make sure its in this lane in one of the sources
-    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
-        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
-      return false;
-
-    // If not lane 0, then we must match lane 0
-    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
-      return false;
-
-    // Correct second source to be contiguous with first source
-    if (Start >= (int)NumElts)
-      Start -= NumElts - NumLaneElts;
-
-    // Make sure we're shifting in the right direction.
-    if (Start <= (int)(i+l))
-      return false;
-
-    Start -= i;
-
-    // Check the rest of the elements to see if they are consecutive.
-    for (++i; i != NumLaneElts; ++i) {
-      int Idx = Mask[i+l];
-
-      // Make sure its in this lane
-      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
-          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
-        return false;
-
-      // If not lane 0, then we must match lane 0
-      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
-        return false;
-
-      if (Idx >= (int)NumElts)
-        Idx -= NumElts - NumLaneElts;
-
-      if (!isUndefOrEqual(Idx, Start+i))
-        return false;
-
-    }
-  }
-
-  return true;
-}
-
-/// \brief Return true if the node specifies a shuffle of elements that is
-/// suitable for input to PALIGNR.
-static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
-                          const X86Subtarget *Subtarget) {
-  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
-      (VT.is256BitVector() && !Subtarget->hasInt256()) ||
-      VT.is512BitVector())
-    // FIXME: Add AVX512BW.
-    return false;
-
-  return isAlignrMask(Mask, VT, false);
-}
-
-/// \brief Return true if the node specifies a shuffle of elements that is
-/// suitable for input to VALIGN.
-static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
-                          const X86Subtarget *Subtarget) {
-  // FIXME: Add AVX512VL.
-  if (!VT.is512BitVector() || !Subtarget->hasAVX512())
-    return false;
-  return isAlignrMask(Mask, VT, true);
-}
-
 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
 /// the two vector operands have swapped position.
 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
@@ -4043,664 +3921,6 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
   }
 }
 
-/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 128/256-bit
-/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
-/// reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
-
-  unsigned NumElems = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElems = NumElems/NumLanes;
-
-  if (NumLaneElems != 2 && NumLaneElems != 4)
-    return false;
-
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-  bool symetricMaskRequired =
-    (VT.getSizeInBits() >= 256) && (EltSize == 32);
-
-  // VSHUFPSY divides the resulting vector into 4 chunks.
-  // The sources are also splitted into 4 chunks, and each destination
-  // chunk must come from a different source chunk.
-  //
-  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
-  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
-  //
-  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
-  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
-  //
-  // VSHUFPDY divides the resulting vector into 4 chunks.
-  // The sources are also splitted into 4 chunks, and each destination
-  // chunk must come from a different source chunk.
-  //
-  //  SRC1 =>      X3       X2       X1       X0
-  //  SRC2 =>      Y3       Y2       Y1       Y0
-  //
-  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
-  //
-  SmallVector<int, 4> MaskVal(NumLaneElems, -1);
-  unsigned HalfLaneElems = NumLaneElems/2;
-  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
-    for (unsigned i = 0; i != NumLaneElems; ++i) {
-      int Idx = Mask[i+l];
-      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
-      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
-        return false;
-      // For VSHUFPSY, the mask of the second half must be the same as the
-      // first but with the appropriate offsets. This works in the same way as
-      // VPERMILPS works with masks.
-      if (!symetricMaskRequired || Idx < 0)
-        continue;
-      if (MaskVal[i] < 0) {
-        MaskVal[i] = Idx - l;
-        continue;
-      }
-      if ((signed)(Idx - l) != MaskVal[i])
-        return false;
-    }
-  }
-
-  return true;
-}
-
-/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
-static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
-  if (!VT.is128BitVector())
-    return false;
-
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if (NumElems != 4)
-    return false;
-
-  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
-  return isUndefOrEqual(Mask[0], 6) &&
-         isUndefOrEqual(Mask[1], 7) &&
-         isUndefOrEqual(Mask[2], 2) &&
-         isUndefOrEqual(Mask[3], 3);
-}
-
-/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
-/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
-/// <2, 3, 2, 3>
-static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
-  if (!VT.is128BitVector())
-    return false;
-
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if (NumElems != 4)
-    return false;
-
-  return isUndefOrEqual(Mask[0], 2) &&
-         isUndefOrEqual(Mask[1], 3) &&
-         isUndefOrEqual(Mask[2], 2) &&
-         isUndefOrEqual(Mask[3], 3);
-}
-
-/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
-static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
-  if (!VT.is128BitVector())
-    return false;
-
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if (NumElems != 2 && NumElems != 4)
-    return false;
-
-  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
-    if (!isUndefOrEqual(Mask[i], i + NumElems))
-      return false;
-
-  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
-    if (!isUndefOrEqual(Mask[i], i))
-      return false;
-
-  return true;
-}
-
-/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
-static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
-  if (!VT.is128BitVector())
-    return false;
-
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if (NumElems != 2 && NumElems != 4)
-    return false;
-
-  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
-    if (!isUndefOrEqual(Mask[i], i))
-      return false;
-
-  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
-    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
-      return false;
-
-  return true;
-}
-
-/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to INSERTPS.
-/// i. e: If all but one element come from the same vector.
-static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
-  // TODO: Deal with AVX's VINSERTPS
-  if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
-    return false;
-
-  unsigned CorrectPosV1 = 0;
-  unsigned CorrectPosV2 = 0;
-  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
-    if (Mask[i] == -1) {
-      ++CorrectPosV1;
-      ++CorrectPosV2;
-      continue;
-    }
-
-    if (Mask[i] == i)
-      ++CorrectPosV1;
-    else if (Mask[i] == i + 4)
-      ++CorrectPosV2;
-  }
-
-  if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
-    // We have 3 elements (undefs count as elements from any vector) from one
-    // vector, and one from another.
-    return true;
-
-  return false;
-}
-
-//
-// Some special combinations that can be optimized.
-//
-static
-SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
-                               SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  SDLoc dl(SVOp);
-
-  if (VT != MVT::v8i32 && VT != MVT::v8f32)
-    return SDValue();
-
-  ArrayRef<int> Mask = SVOp->getMask();
-
-  // These are the special masks that may be optimized.
-  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
-  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
-  bool MatchEvenMask = true;
-  bool MatchOddMask  = true;
-  for (int i=0; i<8; ++i) {
-    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
-      MatchEvenMask = false;
-    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
-      MatchOddMask = false;
-  }
-
-  if (!MatchEvenMask && !MatchOddMask)
-    return SDValue();
-
-  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
-
-  SDValue Op0 = SVOp->getOperand(0);
-  SDValue Op1 = SVOp->getOperand(1);
-
-  if (MatchEvenMask) {
-    // Shift the second operand right to 32 bits.
-    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
-    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
-  } else {
-    // Shift the first operand left to 32 bits.
-    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
-    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
-  }
-  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
-  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
-}
-
-/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to UNPCKL.
-static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
-                         bool HasInt256, bool V2IsSplat = false) {
-
-  assert(VT.getSizeInBits() >= 128 &&
-         "Unsupported vector type for unpckl");
-
-  unsigned NumElts = VT.getVectorNumElements();
-  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
-    return false;
-
-  assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
-         "Unsupported vector type for unpckh");
-
-  // AVX defines UNPCK* to operate independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
-
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[l+i];
-      int BitI1 = Mask[l+i+1];
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (V2IsSplat) {
-        if (!isUndefOrEqual(BitI1, NumElts))
-          return false;
-      } else {
-        if (!isUndefOrEqual(BitI1, j + NumElts))
-          return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
-                         bool HasInt256, bool V2IsSplat = false) {
-  assert(VT.getSizeInBits() >= 128 &&
-         "Unsupported vector type for unpckh");
-
-  unsigned NumElts = VT.getVectorNumElements();
-  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
-    return false;
-
-  assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
-         "Unsupported vector type for unpckh");
-
-  // AVX defines UNPCK* to operate independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
-
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[l+i];
-      int BitI1 = Mask[l+i+1];
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (V2IsSplat) {
-        if (isUndefOrEqual(BitI1, NumElts))
-          return false;
-      } else {
-        if (!isUndefOrEqual(BitI1, j+NumElts))
-          return false;
-      }
-    }
-  }
-  return true;
-}
-
-/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
-/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
-/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
-  unsigned NumElts = VT.getVectorNumElements();
-  bool Is256BitVec = VT.is256BitVector();
-
-  if (VT.is512BitVector())
-    return false;
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "Unsupported vector type for unpckh");
-
-  if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
-    return false;
-
-  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
-  // FIXME: Need a better way to get rid of this, there's no latency difference
-  // between UNPCKLPD and MOVDDUP, the later should always be checked first and
-  // the former later. We should also remove the "_undef" special mask.
-  if (NumElts == 4 && Is256BitVec)
-    return false;
-
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
-
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[l+i];
-      int BitI1 = Mask[l+i+1];
-
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (!isUndefOrEqual(BitI1, j))
-        return false;
-    }
-  }
-
-  return true;
-}
-
-/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
-/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
-/// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  if (VT.is512BitVector())
-    return false;
-
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "Unsupported vector type for unpckh");
-
-  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
-    return false;
-
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
-
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[l+i];
-      int BitI1 = Mask[l+i+1];
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (!isUndefOrEqual(BitI1, j))
-        return false;
-    }
-  }
-  return true;
-}
-
-// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
-// (src1[0], src0[1]), manipulation with 256-bit sub-vectors
-static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
-  if (!VT.is512BitVector())
-    return false;
-
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned HalfSize = NumElts/2;
-  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
-    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
-      *Imm = 1;
-      return true;
-    }
-  }
-  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
-    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
-      *Imm = 0;
-      return true;
-    }
-  }
-  return false;
-}
-
-/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVSS,
-/// MOVSD, and MOVD, i.e. setting the lowest element.
-static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
-  if (VT.getVectorElementType().getSizeInBits() < 32)
-    return false;
-  if (!VT.is128BitVector())
-    return false;
-
-  unsigned NumElts = VT.getVectorNumElements();
-
-  if (!isUndefOrEqual(Mask[0], NumElts))
-    return false;
-
-  for (unsigned i = 1; i != NumElts; ++i)
-    if (!isUndefOrEqual(Mask[i], i))
-      return false;
-
-  return true;
-}
-
-/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
-/// as permutations between 128-bit chunks or halves. As an example: this
-/// shuffle bellow:
-///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
-/// The first half comes from the second half of V1 and the second half from the
-/// the second half of V2.
-static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
-  if (!HasFp256 || !VT.is256BitVector())
-    return false;
-
-  // The shuffle result is divided into half A and half B. In total the two
-  // sources have 4 halves, namely: C, D, E, F. The final values of A and
-  // B must come from C, D, E or F.
-  unsigned HalfSize = VT.getVectorNumElements()/2;
-  bool MatchA = false, MatchB = false;
-
-  // Check if A comes from one of C, D, E, F.
-  for (unsigned Half = 0; Half != 4; ++Half) {
-    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
-      MatchA = true;
-      break;
-    }
-  }
-
-  // Check if B comes from one of C, D, E, F.
-  for (unsigned Half = 0; Half != 4; ++Half) {
-    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
-      MatchB = true;
-      break;
-    }
-  }
-
-  return MatchA && MatchB;
-}
-
-/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
-static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
-  MVT VT = SVOp->getSimpleValueType(0);
-
-  unsigned HalfSize = VT.getVectorNumElements()/2;
-
-  unsigned FstHalf = 0, SndHalf = 0;
-  for (unsigned i = 0; i < HalfSize; ++i) {
-    if (SVOp->getMaskElt(i) > 0) {
-      FstHalf = SVOp->getMaskElt(i)/HalfSize;
-      break;
-    }
-  }
-  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
-    if (SVOp->getMaskElt(i) > 0) {
-      SndHalf = SVOp->getMaskElt(i)/HalfSize;
-      break;
-    }
-  }
-
-  return (FstHalf | (SndHalf << 4));
-}
-
-// Symetric in-lane mask. Each lane has 4 elements (for imm8)
-static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-  if (EltSize < 32)
-    return false;
-
-  unsigned NumElts = VT.getVectorNumElements();
-  Imm8 = 0;
-  if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
-    for (unsigned i = 0; i != NumElts; ++i) {
-      if (Mask[i] < 0)
-        continue;
-      Imm8 |= Mask[i] << (i*2);
-    }
-    return true;
-  }
-
-  unsigned LaneSize = 4;
-  SmallVector<int, 4> MaskVal(LaneSize, -1);
-
-  for (unsigned l = 0; l != NumElts; l += LaneSize) {
-    for (unsigned i = 0; i != LaneSize; ++i) {
-      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
-        return false;
-      if (Mask[i+l] < 0)
-        continue;
-      if (MaskVal[i] < 0) {
-        MaskVal[i] = Mask[i+l] - l;
-        Imm8 |= MaskVal[i] << (i*2);
-        continue;
-      }
-      if (Mask[i+l] != (signed)(MaskVal[i]+l))
-        return false;
-    }
-  }
-  return true;
-}
-
-/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
-/// Note that VPERMIL mask matching is different depending whether theunderlying
-/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
-/// to the same elements of the low, but to the higher half of the source.
-/// In VPERMILPD the two lanes could be shuffled independently of each other
-/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-  if (VT.getSizeInBits() < 256 || EltSize < 32)
-    return false;
-  bool symetricMaskRequired = (EltSize == 32);
-  unsigned NumElts = VT.getVectorNumElements();
-
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned LaneSize = NumElts/NumLanes;
-  // 2 or 4 elements in one lane
-
-  SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
-  for (unsigned l = 0; l != NumElts; l += LaneSize) {
-    for (unsigned i = 0; i != LaneSize; ++i) {
-      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
-        return false;
-      if (symetricMaskRequired) {
-        if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
-          ExpectedMaskVal[i] = Mask[i+l] - l;
-          continue;
-        }
-        if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
-          return false;
-      }
-    }
-  }
-  return true;
-}
-
-/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
-/// of what x86 movss want. X86 movs requires the lowest  element to be lowest
-/// element of vector 2 and the other elements to come from vector 1 in order.
-static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
-                               bool V2IsSplat = false, bool V2IsUndef = false) {
-  if (!VT.is128BitVector())
-    return false;
-
-  unsigned NumOps = VT.getVectorNumElements();
-  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
-    return false;
-
-  if (!isUndefOrEqual(Mask[0], 0))
-    return false;
-
-  for (unsigned i = 1; i != NumOps; ++i)
-    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
-          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
-          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
-      return false;
-
-  return true;
-}
-
-/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
-/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
-static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
-                           const X86Subtarget *Subtarget) {
-  if (!Subtarget->hasSSE3())
-    return false;
-
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if ((VT.is128BitVector() && NumElems != 4) ||
-      (VT.is256BitVector() && NumElems != 8) ||
-      (VT.is512BitVector() && NumElems != 16))
-    return false;
-
-  // "i+1" is the value the indexed mask element must have
-  for (unsigned i = 0; i != NumElems; i += 2)
-    if (!isUndefOrEqual(Mask[i], i+1) ||
-        !isUndefOrEqual(Mask[i+1], i+1))
-      return false;
-
-  return true;
-}
-
-/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
-/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
-static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
-                           const X86Subtarget *Subtarget) {
-  if (!Subtarget->hasSSE3())
-    return false;
-
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if ((VT.is128BitVector() && NumElems != 4) ||
-      (VT.is256BitVector() && NumElems != 8) ||
-      (VT.is512BitVector() && NumElems != 16))
-    return false;
-
-  // "i" is the value the indexed mask element must have
-  for (unsigned i = 0; i != NumElems; i += 2)
-    if (!isUndefOrEqual(Mask[i], i) ||
-        !isUndefOrEqual(Mask[i+1], i))
-      return false;
-
-  return true;
-}
-
-/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 256-bit
-/// version of MOVDDUP.
-static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
-  if (!HasFp256 || !VT.is256BitVector())
-    return false;
-
-  unsigned NumElts = VT.getVectorNumElements();
-  if (NumElts != 4)
-    return false;
-
-  for (unsigned i = 0; i != NumElts/2; ++i)
-    if (!isUndefOrEqual(Mask[i], 0))
-      return false;
-  for (unsigned i = NumElts/2; i != NumElts; ++i)
-    if (!isUndefOrEqual(Mask[i], NumElts/2))
-      return false;
-  return true;
-}
-
-/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 128-bit
-/// version of MOVDDUP.
-static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
-  if (!VT.is128BitVector())
-    return false;
-
-  unsigned e = VT.getVectorNumElements() / 2;
-  for (unsigned i = 0; i != e; ++i)
-    if (!isUndefOrEqual(Mask[i], i))
-      return false;
-  for (unsigned i = 0; i != e; ++i)
-    if (!isUndefOrEqual(Mask[e+i], i))
-      return false;
-  return true;
-}
-
 /// isVEXTRACTIndex - Return true if the specified
 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
 /// suitable for instruction that extract 128 or 256 bit vectors
@@ -4754,125 +3974,6 @@ bool X86::isVEXTRACT256Index(SDNode *N) {
   return isVEXTRACTIndex(N, 256);
 }
 
-/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
-/// Handles 128-bit and 256-bit.
-static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getSimpleValueType(0);
-
-  assert((VT.getSizeInBits() >= 128) &&
-         "Unsupported vector type for PSHUF/SHUFP");
-
-  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
-  // independently on 128-bit lanes.
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
-
-  assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
-         "Only supports 2, 4 or 8 elements per lane");
-
-  unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
-  unsigned Mask = 0;
-  for (unsigned i = 0; i != NumElts; ++i) {
-    int Elt = N->getMaskElt(i);
-    if (Elt < 0) continue;
-    Elt &= NumLaneElts - 1;
-    unsigned ShAmt = (i << Shift) % 8;
-    Mask |= Elt << ShAmt;
-  }
-
-  return Mask;
-}
-
-/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
-static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getSimpleValueType(0);
-
-  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
-         "Unsupported vector type for PSHUFHW");
-
-  unsigned NumElts = VT.getVectorNumElements();
-
-  unsigned Mask = 0;
-  for (unsigned l = 0; l != NumElts; l += 8) {
-    // 8 nodes per lane, but we only care about the last 4.
-    for (unsigned i = 0; i < 4; ++i) {
-      int Elt = N->getMaskElt(l+i+4);
-      if (Elt < 0) continue;
-      Elt &= 0x3; // only 2-bits.
-      Mask |= Elt << (i * 2);
-    }
-  }
-
-  return Mask;
-}
-
-/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
-static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getSimpleValueType(0);
-
-  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
-         "Unsupported vector type for PSHUFHW");
-
-  unsigned NumElts = VT.getVectorNumElements();
-
-  unsigned Mask = 0;
-  for (unsigned l = 0; l != NumElts; l += 8) {
-    // 8 nodes per lane, but we only care about the first 4.
-    for (unsigned i = 0; i < 4; ++i) {
-      int Elt = N->getMaskElt(l+i);
-      if (Elt < 0) continue;
-      Elt &= 0x3; // only 2-bits
-      Mask |= Elt << (i * 2);
-    }
-  }
-
-  return Mask;
-}
-
-/// \brief Return the appropriate immediate to shuffle the specified
-/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
-/// VALIGN (if Interlane is true) instructions.
-static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
-                                           bool InterLane) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  unsigned EltSize = InterLane ? 1 :
-    VT.getVectorElementType().getSizeInBits() >> 3;
-
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
-
-  int Val = 0;
-  unsigned i;
-  for (i = 0; i != NumElts; ++i) {
-    Val = SVOp->getMaskElt(i);
-    if (Val >= 0)
-      break;
-  }
-  if (Val >= (int)NumElts)
-    Val -= NumElts - NumLaneElts;
-
-  assert(Val - i > 0 && "PALIGNR imm should be positive");
-  return (Val - i) * EltSize;
-}
-
-/// \brief Return the appropriate immediate to shuffle the specified
-/// VECTOR_SHUFFLE mask with the PALIGNR instruction.
-static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
-  return getShuffleAlignrImmediate(SVOp, false);
-}
-
-/// \brief Return the appropriate immediate to shuffle the specified
-/// VECTOR_SHUFFLE mask with the VALIGN instruction.
-static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
-  return getShuffleAlignrImmediate(SVOp, true);
-}
-
-
 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
@@ -4947,119 +4048,6 @@ bool X86::isZeroNode(SDValue Elt) {
   return false;
 }
 
-/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
-/// match movhlps. The lower half elements should come from upper half of
-/// V1 (and in order), and the upper half elements should come from the upper
-/// half of V2 (and in order).
-static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
-  if (!VT.is128BitVector())
-    return false;
-  if (VT.getVectorNumElements() != 4)
-    return false;
-  for (unsigned i = 0, e = 2; i != e; ++i)
-    if (!isUndefOrEqual(Mask[i], i+2))
-      return false;
-  for (unsigned i = 2; i != 4; ++i)
-    if (!isUndefOrEqual(Mask[i], i+4))
-      return false;
-  return true;
-}
-
-/// isScalarLoadToVector - Returns true if the node is a scalar load that
-/// is promoted to a vector. It also returns the LoadSDNode by reference if
-/// required.
-static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
-  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
-    return false;
-  N = N->getOperand(0).getNode();
-  if (!ISD::isNON_EXTLoad(N))
-    return false;
-  if (LD)
-    *LD = cast<LoadSDNode>(N);
-  return true;
-}
-
-// Test whether the given value is a vector value which will be legalized
-// into a load.
-static bool WillBeConstantPoolLoad(SDNode *N) {
-  if (N->getOpcode() != ISD::BUILD_VECTOR)
-    return false;
-
-  // Check for any non-constant elements.
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-    switch (N->getOperand(i).getNode()->getOpcode()) {
-    case ISD::UNDEF:
-    case ISD::ConstantFP:
-    case ISD::Constant:
-      break;
-    default:
-      return false;
-    }
-
-  // Vectors of all-zeros and all-ones are materialized with special
-  // instructions rather than being loaded.
-  return !ISD::isBuildVectorAllZeros(N) &&
-         !ISD::isBuildVectorAllOnes(N);
-}
-
-/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
-/// match movlp{s|d}. The lower half elements should come from lower half of
-/// V1 (and in order), and the upper half elements should come from the upper
-/// half of V2 (and in order). And since V1 will become the source of the
-/// MOVLP, it must be either a vector load or a scalar load to vector.
-static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
-                               ArrayRef<int> Mask, MVT VT) {
-  if (!VT.is128BitVector())
-    return false;
-
-  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
-    return false;
-  // Is V2 is a vector load, don't do this transformation. We will try to use
-  // load folding shufps op.
-  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
-    return false;
-
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if (NumElems != 2 && NumElems != 4)
-    return false;
-  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
-    if (!isUndefOrEqual(Mask[i], i))
-      return false;
-  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
-    if (!isUndefOrEqual(Mask[i], i+NumElems))
-      return false;
-  return true;
-}
-
-/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
-/// to an zero vector.
-/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
-static bool isZeroShuffle(ShuffleVectorSDNode *N) {
-  SDValue V1 = N->getOperand(0);
-  SDValue V2 = N->getOperand(1);
-  unsigned NumElems = N->getValueType(0).getVectorNumElements();
-  for (unsigned i = 0; i != NumElems; ++i) {
-    int Idx = N->getMaskElt(i);
-    if (Idx >= (int)NumElems) {
-      unsigned Opc = V2.getOpcode();
-      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
-        continue;
-      if (Opc != ISD::BUILD_VECTOR ||
-          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
-        return false;
-    } else if (Idx >= 0) {
-      unsigned Opc = V1.getOpcode();
-      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
-        continue;
-      if (Opc != ISD::BUILD_VECTOR ||
-          !X86::isZeroNode(V1.getOperand(Idx)))
-        return false;
-    }
-  }
-  return true;
-}
-
 /// getZeroVector - Returns a vector of specified type with all zero elements.
 ///
 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
@@ -5131,16 +4119,6 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
 }
 
-/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
-/// that point to V2 points to its first element.
-static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
-  for (unsigned i = 0; i != NumElems; ++i) {
-    if (Mask[i] > (int)NumElems) {
-      Mask[i] = NumElems;
-    }
-  }
-}
-
 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
 /// operation of specified width.
 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
@@ -5177,92 +4155,6 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
 }
 
-// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
-// a generic shuffle instruction because the target has no such instructions.
-// Generate shuffles which repeat i16 and i8 several times until they can be
-// represented by v4f32 and then be manipulated by target suported shuffles.
-static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
-  MVT VT = V.getSimpleValueType();
-  int NumElems = VT.getVectorNumElements();
-  SDLoc dl(V);
-
-  while (NumElems > 4) {
-    if (EltNo < NumElems/2) {
-      V = getUnpackl(DAG, dl, VT, V, V);
-    } else {
-      V = getUnpackh(DAG, dl, VT, V, V);
-      EltNo -= NumElems/2;
-    }
-    NumElems >>= 1;
-  }
-  return V;
-}
-
-/// getLegalSplat - Generate a legal splat with supported x86 shuffles
-static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
-  MVT VT = V.getSimpleValueType();
-  SDLoc dl(V);
-
-  if (VT.is128BitVector()) {
-    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
-    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
-    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
-                             &SplatMask[0]);
-  } else if (VT.is256BitVector()) {
-    // To use VPERMILPS to splat scalars, the second half of indicies must
-    // refer to the higher part, which is a duplication of the lower one,
-    // because VPERMILPS can only handle in-lane permutations.
-    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
-                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
-
-    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
-    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
-                             &SplatMask[0]);
-  } else
-    llvm_unreachable("Vector size not supported");
-
-  return DAG.getNode(ISD::BITCAST, dl, VT, V);
-}
-
-/// PromoteSplat - Splat is promoted to target supported vector shuffles.
-static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
-  MVT SrcVT = SV->getSimpleValueType(0);
-  SDValue V1 = SV->getOperand(0);
-  SDLoc dl(SV);
-
-  int EltNo = SV->getSplatIndex();
-  int NumElems = SrcVT.getVectorNumElements();
-  bool Is256BitVec = SrcVT.is256BitVector();
-
-  assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
-         "Unknown how to promote splat for type");
-
-  // Extract the 128-bit part containing the splat element and update
-  // the splat element index when it refers to the higher register.
-  if (Is256BitVec) {
-    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
-    if (EltNo >= NumElems/2)
-      EltNo -= NumElems/2;
-  }
-
-  // All i16 and i8 vector types can't be used directly by a generic shuffle
-  // instruction because the target has no such instruction. Generate shuffles
-  // which repeat i16 and i8 several times until they fit in i32, and then can
-  // be manipulated by target suported shuffles.
-  MVT EltVT = SrcVT.getVectorElementType();
-  if (EltVT == MVT::i8 || EltVT == MVT::i16)
-    V1 = PromoteSplati8i16(V1, DAG, EltNo);
-
-  // Recreate the 256-bit vector and place the same 128-bit vector
-  // into the low and high part. This is necessary because we want
-  // to use VPERM* to shuffle the vectors
-  if (Is256BitVec) {
-    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
-  }
-
-  return getLegalSplat(DAG, V1, EltNo);
-}
-
 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
 /// vector of zero or undef vector.  This produces a shuffle where the low
 /// element of V2 is swizzled into the zero/undef vector, landing at element
@@ -5394,13 +4286,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
       return false;
 
     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
-      // FIXME: Support AVX-512 here.
-      Type *Ty = C->getType();
-      if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
-                                Ty->getVectorNumElements() != 32))
-        return false;
-
       DecodePSHUFBMask(C, Mask);
+      if (Mask.empty())
+        return false;
       break;
     }
 
@@ -5412,16 +4300,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
     IsUnary = true;
     break;
   case X86ISD::MOVSS:
-  case X86ISD::MOVSD: {
-    // The index 0 always comes from the first element of the second source,
-    // this is why MOVSS and MOVSD are used in the first place. The other
-    // elements come from the other positions of the first source vector
-    Mask.push_back(NumElems);
-    for (unsigned i = 1; i != NumElems; ++i) {
-      Mask.push_back(i);
-    }
+  case X86ISD::MOVSD:
+    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
     break;
-  }
   case X86ISD::VPERM2X128:
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
@@ -5429,11 +4310,16 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
     break;
   case X86ISD::MOVSLDUP:
     DecodeMOVSLDUPMask(VT, Mask);
+    IsUnary = true;
     break;
   case X86ISD::MOVSHDUP:
     DecodeMOVSHDUPMask(VT, Mask);
+    IsUnary = true;
     break;
   case X86ISD::MOVDDUP:
+    DecodeMOVDDUPMask(VT, Mask);
+    IsUnary = true;
+    break;
   case X86ISD::MOVLHPD:
   case X86ISD::MOVLPD:
   case X86ISD::MOVLPS:
@@ -5517,148 +4403,6 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// getNumOfConsecutiveZeros - Return the number of elements of a vector
-/// shuffle operation which come from a consecutively from a zero. The
-/// search can start in two different directions, from left or right.
-/// We count undefs as zeros until PreferredNum is reached.
-static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
-                                         unsigned NumElems, bool ZerosFromLeft,
-                                         SelectionDAG &DAG,
-                                         unsigned PreferredNum = -1U) {
-  unsigned NumZeros = 0;
-  for (unsigned i = 0; i != NumElems; ++i) {
-    unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
-    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
-    if (!Elt.getNode())
-      break;
-
-    if (X86::isZeroNode(Elt))
-      ++NumZeros;
-    else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
-      NumZeros = std::min(NumZeros + 1, PreferredNum);
-    else
-      break;
-  }
-
-  return NumZeros;
-}
-
-/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
-/// correspond consecutively to elements from one of the vector operands,
-/// starting from its index OpIdx. Also tell OpNum which source vector operand.
-static
-bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
-                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
-                              unsigned NumElems, unsigned &OpNum) {
-  bool SeenV1 = false;
-  bool SeenV2 = false;
-
-  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
-    int Idx = SVOp->getMaskElt(i);
-    // Ignore undef indicies
-    if (Idx < 0)
-      continue;
-
-    if (Idx < (int)NumElems)
-      SeenV1 = true;
-    else
-      SeenV2 = true;
-
-    // Only accept consecutive elements from the same vector
-    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
-      return false;
-  }
-
-  OpNum = SeenV1 ? 0 : 1;
-  return true;
-}
-
-/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
-/// logical left shift of a vector.
-static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
-                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
-  unsigned NumElems =
-    SVOp->getSimpleValueType(0).getVectorNumElements();
-  unsigned NumZeros = getNumOfConsecutiveZeros(
-      SVOp, NumElems, false /* check zeros from right */, DAG,
-      SVOp->getMaskElt(0));
-  unsigned OpSrc;
-
-  if (!NumZeros)
-    return false;
-
-  // Considering the elements in the mask that are not consecutive zeros,
-  // check if they consecutively come from only one of the source vectors.
-  //
-  //               V1 = {X, A, B, C}     0
-  //                         \  \  \    /
-  //   vector_shuffle V1, V2 <1, 2, 3, X>
-  //
-  if (!isShuffleMaskConsecutive(SVOp,
-            0,                   // Mask Start Index
-            NumElems-NumZeros,   // Mask End Index(exclusive)
-            NumZeros,            // Where to start looking in the src vector
-            NumElems,            // Number of elements in vector
-            OpSrc))              // Which source operand ?
-    return false;
-
-  isLeft = false;
-  ShAmt = NumZeros;
-  ShVal = SVOp->getOperand(OpSrc);
-  return true;
-}
-
-/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
-/// logical left shift of a vector.
-static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
-                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
-  unsigned NumElems =
-    SVOp->getSimpleValueType(0).getVectorNumElements();
-  unsigned NumZeros = getNumOfConsecutiveZeros(
-      SVOp, NumElems, true /* check zeros from left */, DAG,
-      NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
-  unsigned OpSrc;
-
-  if (!NumZeros)
-    return false;
-
-  // Considering the elements in the mask that are not consecutive zeros,
-  // check if they consecutively come from only one of the source vectors.
-  //
-  //                           0    { A, B, X, X } = V2
-  //                          / \    /  /
-  //   vector_shuffle V1, V2 <X, X, 4, 5>
-  //
-  if (!isShuffleMaskConsecutive(SVOp,
-            NumZeros,     // Mask Start Index
-            NumElems,     // Mask End Index(exclusive)
-            0,            // Where to start looking in the src vector
-            NumElems,     // Number of elements in vector
-            OpSrc))       // Which source operand ?
-    return false;
-
-  isLeft = true;
-  ShAmt = NumZeros;
-  ShVal = SVOp->getOperand(OpSrc);
-  return true;
-}
-
-/// isVectorShift - Returns true if the shuffle can be implemented as a
-/// logical left or right shift of a vector.
-static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
-                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
-  // Although the logic below support any bitwidth size, there are no
-  // shift instructions which handle more than 128-bit vectors.
-  if (!SVOp->getSimpleValueType(0).is128BitVector())
-    return false;
-
-  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
-      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
-    return true;
-
-  return false;
-}
-
 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
 ///
 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
@@ -5744,19 +4488,19 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget,
                                      const TargetLowering &TLI) {
   // Find all zeroable elements.
-  bool Zeroable[4];
+  std::bitset<4> Zeroable;
   for (int i=0; i < 4; ++i) {
     SDValue Elt = Op->getOperand(i);
     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
   }
-  assert(std::count_if(&Zeroable[0], &Zeroable[4],
-                       [](bool M) { return !M; }) > 1 &&
+  assert(Zeroable.size() - Zeroable.count() > 1 &&
          "We expect at least two non-zero elements!");
 
   // We only know how to deal with build_vector nodes where elements are either
   // zeroable or extract_vector_elt with constant index.
   SDValue FirstNonZero;
-  for (int i=0; i < 4; ++i) {
+  unsigned FirstNonZeroIdx;
+  for (unsigned i=0; i < 4; ++i) {
     if (Zeroable[i])
       continue;
     SDValue Elt = Op->getOperand(i);
@@ -5767,8 +4511,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
     MVT VT = Elt.getOperand(0).getSimpleValueType();
     if (!VT.is128BitVector())
       return SDValue();
-    if (!FirstNonZero.getNode())
+    if (!FirstNonZero.getNode()) {
       FirstNonZero = Elt;
+      FirstNonZeroIdx = i;
+    }
   }
 
   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
@@ -5807,14 +4553,14 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
     return SDValue();
 
   SDValue V2 = Elt.getOperand(0);
-  if (Elt == FirstNonZero)
+  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
     V1 = SDValue();
 
   bool CanFold = true;
   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
     if (Zeroable[i])
       continue;
-    
+
     SDValue Current = Op->getOperand(i);
     SDValue SrcVector = Current->getOperand(0);
     if (!V1.getNode())
@@ -5833,10 +4579,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
 
   // Ok, we can emit an INSERTPS instruction.
-  unsigned ZMask = 0;
-  for (int i = 0; i < 4; ++i)
-    if (Zeroable[i])
-      ZMask |= 1 << i;
+  unsigned ZMask = Zeroable.to_ulong();
 
   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
@@ -5845,19 +4588,19 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
 }
 
-/// getVShift - Return a vector logical shift node.
-///
+/// Return a vector logical shift node.
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                          unsigned NumBits, SelectionDAG &DAG,
                          const TargetLowering &TLI, SDLoc dl) {
   assert(VT.is128BitVector() && "Unknown type for VShift");
-  EVT ShVT = MVT::v2i64;
+  MVT ShVT = MVT::v2i64;
   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
+  MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
+  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
+  SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
   return DAG.getNode(ISD::BITCAST, dl, VT,
-                     DAG.getNode(Opc, dl, ShVT, SrcOp,
-                             DAG.getConstant(NumBits,
-                                  TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
+                     DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
 }
 
 static SDValue
@@ -5924,9 +4667,7 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
                              LD->getPointerInfo().getWithOffset(StartOffset),
                              false, false, false, 0);
 
-    SmallVector<int, 8> Mask;
-    for (unsigned i = 0; i != NumElems; ++i)
-      Mask.push_back(EltNo);
+    SmallVector<int, 8> Mask(NumElems, EltNo);
 
     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
   }
@@ -5934,19 +4675,18 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
   return SDValue();
 }
 
-/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
-/// vector of type 'VT', see if the elements can be replaced by a single large
-/// load which has the same value as a build_vector whose operands are 'elts'.
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
 ///
 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
 ///
 /// FIXME: we'd also like to handle the case where the last elements are zero
 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
 /// There's even a handy isZeroNode for that purpose.
-static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                         SDLoc &DL, SelectionDAG &DAG,
                                         bool isAfterLegalize) {
-  EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = Elts.size();
 
   LoadSDNode *LDBase = nullptr;
@@ -5957,7 +4697,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
   // non-consecutive, bail out.
   for (unsigned i = 0; i < NumElems; ++i) {
     SDValue Elt = Elts[i];
-
+    // Look through a bitcast.
+    if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+      Elt = Elt.getOperand(0);
     if (!Elt.getNode() ||
         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
       return SDValue();
@@ -5972,7 +4714,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
       continue;
 
     LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+    EVT LdVT = Elt.getValueType();
+    // Each loaded element must be the correct fractional portion of the
+    // requested vector load.
+    if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
+      return SDValue();
+    if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
       return SDValue();
     LastLoadedElt = i;
   }
@@ -5981,6 +4728,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
   // load of the entire vector width starting at the base pointer.  If we found
   // consecutive loads for the low half, generate a vzext_load node.
   if (LastLoadedElt == NumElems - 1) {
+    assert(LDBase && "Did not find base load for merging consecutive loads");
+    EVT EltVT = LDBase->getValueType(0);
+    // Ensure that the input vector size for the merged loads matches the
+    // cumulative size of the input elements.
+    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+      return SDValue();
 
     if (isAfterLegalize &&
         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
@@ -5988,15 +4741,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
 
     SDValue NewLd = SDValue();
 
-    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
-      NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
-                          LDBase->getPointerInfo(),
-                          LDBase->isVolatile(), LDBase->isNonTemporal(),
-                          LDBase->isInvariant(), 0);
     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
-                        LDBase->getPointerInfo(),
-                        LDBase->isVolatile(), LDBase->isNonTemporal(),
-                        LDBase->isInvariant(), LDBase->getAlignment());
+                        LDBase->getPointerInfo(), LDBase->isVolatile(),
+                        LDBase->isNonTemporal(), LDBase->isInvariant(),
+                        LDBase->getAlignment());
 
     if (LDBase->hasAnyUseOfValue(1)) {
       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
@@ -6009,7 +4757,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
 
     return NewLd;
   }
-  if (NumElems == 4 && LastLoadedElt == 1 &&
+
+  //TODO: The code below fires only for for loading the low v2i32 / v2f32
+  //of a v4i32 / v4f32. It's probably worth generalizing.
+  EVT EltVT = VT.getVectorElementType();
+  if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
@@ -6134,8 +4886,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
   // it may be detrimental to overall size. There needs to be a way to detect
   // that condition to know if this is truly a size win.
   const Function *F = DAG.getMachineFunction().getFunction();
-  bool OptForSize = F->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
 
   // Handle broadcasting a single constant scalar from the constant pool
   // into a vector.
@@ -6183,7 +4934,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
   if (!IsLoad)
     return SDValue();
 
-  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
+  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+      (Subtarget->hasVLX() && ScalarSize == 64))
     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
 
   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
@@ -6339,8 +5091,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
       AllContants = false;
       NonConstIdx = idx;
       NumNonConsts++;
-    }
-    else {
+    } else {
       NumConsts++;
       if (cast<ConstantSDNode>(In)->getZExtValue())
       Immediate |= (1ULL << idx);
@@ -6363,7 +5114,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
                                          MVT::getIntegerVT(VT.getSizeInBits()));
       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
     }
-    else 
+    else
       DstVec = DAG.getUNDEF(VT);
     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                        Op.getOperand(NonConstIdx),
@@ -6386,7 +5137,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
 
 /// \brief Return true if \p N implements a horizontal binop and return the
 /// operands for the horizontal binop into V0 and V1.
-/// 
+///
 /// This is a helper function of PerformBUILD_VECTORCombine.
 /// This function checks that the build_vector \p N in input implements a
 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
@@ -6407,7 +5158,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
          "Invalid Vector in input!");
-  
+
   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
   bool CanFold = true;
   unsigned ExpectedVExtractIdx = BaseIdx;
@@ -6476,13 +5227,13 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
 }
 
 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
-/// a concat_vector. 
+/// a concat_vector.
 ///
 /// This is a helper function of PerformBUILD_VECTORCombine.
 /// This function expects two 256-bit vectors called V0 and V1.
 /// At first, each vector is split into two separate 128-bit vectors.
 /// Then, the resulting 128-bit vectors are used to implement two
-/// horizontal binary operations. 
+/// horizontal binary operations.
 ///
 /// The kind of horizontal binary operation is defined by \p X86Opcode.
 ///
@@ -6566,7 +5317,7 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
   bool AddFound = false;
   bool SubFound = false;
 
-  for (unsigned i = 0, e = NumElts; i != e; i++) {
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
     SDValue Op = BV->getOperand(i);
 
     // Skip 'undef' values.
@@ -6676,18 +5427,18 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
     // Try to match an SSE3 float HADD/HSUB.
     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
-    
+
     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
     // Try to match an SSSE3 integer HADD/HSUB.
     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
-    
+
     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
   }
-  
+
   if (!Subtarget->hasAVX())
     return SDValue();
 
@@ -6738,7 +5489,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
       // Do this only if the target has AVX2.
       if (Subtarget->hasAVX2())
         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
- 
+
       // Do not try to expand this build_vector into a pair of horizontal
       // add/sub if we can emit a pair of scalar add/sub.
       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
@@ -6863,32 +5614,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
         // Handle SSE only.
         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
         EVT VecVT = MVT::v4i32;
-        unsigned VecElts = 4;
 
         // Truncate the value (which may itself be a constant) to i32, and
         // convert it to a vector with movd (S2V+shuffle to zero extend).
         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
-
-        // If using the new shuffle lowering, just directly insert this.
-        if (ExperimentalVectorShuffleLowering)
-          return DAG.getNode(
-              ISD::BITCAST, dl, VT,
-              getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
-
-        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
-
-        // Now we have our 32-bit value zero extended in the low element of
-        // a vector.  If Idx != 0, swizzle it into place.
-        if (Idx != 0) {
-          SmallVector<int, 4> Mask;
-          Mask.push_back(Idx);
-          for (unsigned i = 1; i != VecElts; ++i)
-            Mask.push_back(i);
-          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
-                                      &Mask[0]);
-        }
-        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+        return DAG.getNode(
+            ISD::BITCAST, dl, VT,
+            getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
       }
     }
 
@@ -6948,17 +5681,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     // place.
     if (EVTBits == 32) {
       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
-
-      // If using the new shuffle lowering, just directly insert this.
-      if (ExperimentalVectorShuffleLowering)
-        return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
-
-      // Turn it into a shuffle of zero and zero-extended scalar to vector.
-      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
-      SmallVector<int, 8> MaskVec;
-      for (unsigned i = 0; i != NumElems; ++i)
-        MaskVec.push_back(i == Idx ? 0 : 1);
-      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
+      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
     }
   }
 
@@ -6982,12 +5705,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (IsAllConstants)
     return SDValue();
 
-  // For AVX-length vectors, build the individual 128-bit pieces and use
+  // For AVX-length vectors, see if we can use a vector load to get all of the
+  // elements, otherwise build the individual 128-bit pieces and use
   // shuffles to put them in place.
   if (VT.is256BitVector() || VT.is512BitVector()) {
-    SmallVector<SDValue, 64> V;
-    for (unsigned i = 0; i != NumElems; ++i)
-      V.push_back(Op.getOperand(i));
+    SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
+
+    // Check for a build vector of consecutive loads.
+    if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
+      return LD;
 
     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
 
@@ -7091,7 +5817,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       return Sh;
 
     // For SSE 4.1, use insertps to put the high elements into the low element.
-    if (getSubtarget()->hasSSE41()) {
+    if (Subtarget->hasSSE41()) {
       SDValue Result;
       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
@@ -7271,38 +5997,40 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   return true;
 }
 
-// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
-// 2013 will allow us to use it as a non-type template parameter.
-namespace {
-
-/// \brief Implementation of the \c isShuffleEquivalent variadic functor.
-///
-/// See its documentation for details.
-bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
-  if (Mask.size() != Args.size())
-    return false;
-  for (int i = 0, e = Mask.size(); i < e; ++i) {
-    assert(*Args[i] >= 0 && "Arguments must be positive integers!");
-    if (Mask[i] != -1 && Mask[i] != *Args[i])
-      return false;
-  }
-  return true;
-}
-
-} // namespace
-
 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
 /// arguments.
 ///
 /// This is a fast way to test a shuffle mask against a fixed pattern:
 ///
-///   if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
+///   if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
 ///
 /// It returns true if the mask is exactly as wide as the argument list, and
 /// each element of the mask is either -1 (signifying undef) or the value given
 /// in the argument.
-static const VariadicFunction1<
-    bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
+static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
+                                ArrayRef<int> ExpectedMask) {
+  if (Mask.size() != ExpectedMask.size())
+    return false;
+
+  int Size = Mask.size();
+
+  // If the values are build vectors, we can look through them to find
+  // equivalent inputs that make the shuffles equivalent.
+  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
+  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
+
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
+      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
+      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
+      if (!MaskBV || !ExpectedBV ||
+          MaskBV->getOperand(Mask[i] % Size) !=
+              ExpectedBV->getOperand(ExpectedMask[i] % Size))
+        return false;
+    }
+
+  return true;
+}
 
 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
 ///
@@ -7328,6 +6056,37 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
   return DAG.getConstant(Imm, MVT::i8);
 }
 
+/// \brief Try to emit a blend instruction for a shuffle using bit math.
+///
+/// This is used as a fallback approach when first class blend instructions are
+/// unavailable. Currently it is only suitable for integer vectors, but could
+/// be generalized for floating point vectors if desirable.
+static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Mask,
+                                            SelectionDAG &DAG) {
+  assert(VT.isInteger() && "Only supports integer vector types!");
+  MVT EltVT = VT.getScalarType();
+  int NumEltBits = EltVT.getSizeInBits();
+  SDValue Zero = DAG.getConstant(0, EltVT);
+  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT);
+  SmallVector<SDValue, 16> MaskOps;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
+      return SDValue(); // Shuffled input!
+    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
+  }
+
+  SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
+  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
+  // We have to cast V2 around.
+  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+  V2 = DAG.getNode(ISD::BITCAST, DL, VT,
+                   DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
+                               DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
+                               DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
+  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
+}
+
 /// \brief Try to emit a blend instruction for a shuffle.
 ///
 /// This doesn't do any checks for the availability of instructions for blending
@@ -7338,7 +6097,6 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          const X86Subtarget *Subtarget,
                                          SelectionDAG &DAG) {
-
   unsigned BlendMask = 0;
   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
     if (Mask[i] >= Size) {
@@ -7415,11 +6173,17 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
     }
   }
     // FALLTHROUGH
+  case MVT::v16i8:
   case MVT::v32i8: {
-    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+    assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
+           "256-bit byte-blends require AVX2 support!");
+
     // Scale the blend by the number of bytes per element.
-    int Scale =  VT.getScalarSizeInBits() / 8;
-    assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+    int Scale = VT.getScalarSizeInBits() / 8;
+
+    // This form of blend is always done on bytes. Compute the byte vector
+    // type.
+    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
 
     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
     // mix of LLVM's code generator and the x86 backend. We tell the code
@@ -7432,19 +6196,19 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
     // the LLVM model for boolean values in vector elements gets the relevant
     // bit set, it is set backwards and over constrained relative to x86's
     // actual model.
-    SDValue VSELECTMask[32];
+    SmallVector<SDValue, 32> VSELECTMask;
     for (int i = 0, Size = Mask.size(); i < Size; ++i)
       for (int j = 0; j < Scale; ++j)
-        VSELECTMask[Scale * i + j] =
+        VSELECTMask.push_back(
             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
-                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
+                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
 
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+    V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
     return DAG.getNode(
         ISD::BITCAST, DL, VT,
-        DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
-                    DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
+        DAG.getNode(ISD::VSELECT, DL, BlendVT,
+                    DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
                     V1, V2));
   }
 
@@ -7453,12 +6217,45 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
   }
 }
 
-/// \brief Generic routine to lower a shuffle and blend as a decomposed set of
-/// unblended shuffles followed by an unshuffled blend.
+/// \brief Try to lower as a blend of elements from two inputs followed by
+/// a single-input permutation.
+///
+/// This matches the pattern where we can blend elements from two inputs and
+/// then reduce the shuffle to a single-input permutation.
+static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
+                                                   SDValue V2,
+                                                   ArrayRef<int> Mask,
+                                                   SelectionDAG &DAG) {
+  // We build up the blend mask while checking whether a blend is a viable way
+  // to reduce the shuffle.
+  SmallVector<int, 32> BlendMask(Mask.size(), -1);
+  SmallVector<int, 32> PermuteMask(Mask.size(), -1);
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+
+    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
+
+    if (BlendMask[Mask[i] % Size] == -1)
+      BlendMask[Mask[i] % Size] = Mask[i];
+    else if (BlendMask[Mask[i] % Size] != Mask[i])
+      return SDValue(); // Can't blend in the needed input!
+
+    PermuteMask[i] = Mask[i] % Size;
+  }
+
+  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
+}
+
+/// \brief Generic routine to decompose a shuffle and blend into indepndent
+/// blends and permutes.
 ///
 /// This matches the extremely common pattern for handling combined
 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
-/// operations.
+/// operations. It will try to pick the best arrangement of shuffles and
+/// blends.
 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
                                                           SDValue V1,
                                                           SDValue V2,
@@ -7478,6 +6275,16 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
       BlendMask[i] = i + Size;
     }
 
+  // Try to lower with the simpler initial blend strategy unless one of the
+  // input shuffles would be a no-op. We prefer to shuffle inputs as the
+  // shuffle may be able to fold with a load or other benefit. However, when
+  // we'll have to do 2x as many shuffles in order to achieve this, blending
+  // first is a better strategy.
+  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
+    if (SDValue BlendPerm =
+            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+      return BlendPerm;
+
   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
@@ -7492,15 +6299,13 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
 /// does not check for the profitability of lowering either as PALIGNR or
 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
 /// This matches shuffle vectors that look like:
-/// 
+///
 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
-/// 
+///
 /// Essentially it concatenates V1 and V2, shifts right by some number of
 /// elements, and takes the low elements as the result. Note that while this is
 /// specified as a *right shift* because x86 is little-endian, it is a *left
 /// rotate* of the vector lanes.
-///
-/// Note that this only handles 128-bit vector widths currently.
 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
                                               SDValue V2,
                                               ArrayRef<int> Mask,
@@ -7508,6 +6313,10 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
                                               SelectionDAG &DAG) {
   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
 
+  int NumElts = Mask.size();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumLaneElts = NumElts / NumLanes;
+
   // We need to detect various ways of spelling a rotation:
   //   [11, 12, 13, 14, 15,  0,  1,  2]
   //   [-1, 12, 13, 14, -1, -1,  1, -1]
@@ -7517,44 +6326,52 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
   //   [-1,  4,  5,  6, -1, -1, -1, -1]
   int Rotation = 0;
   SDValue Lo, Hi;
-  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
-    if (Mask[i] == -1)
-      continue;
-    assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
+  for (int l = 0; l < NumElts; l += NumLaneElts) {
+    for (int i = 0; i < NumLaneElts; ++i) {
+      if (Mask[l + i] == -1)
+        continue;
+      assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
 
-    // Based on the mod-Size value of this mask element determine where
-    // a rotated vector would have started.
-    int StartIdx = i - (Mask[i] % Size);
-    if (StartIdx == 0)
-      // The identity rotation isn't interesting, stop.
-      return SDValue();
+      // Get the mod-Size index and lane correct it.
+      int LaneIdx = (Mask[l + i] % NumElts) - l;
+      // Make sure it was in this lane.
+      if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
+        return SDValue();
 
-    // If we found the tail of a vector the rotation must be the missing
-    // front. If we found the head of a vector, it must be how much of the head.
-    int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
+      // Determine where a rotated vector would have started.
+      int StartIdx = i - LaneIdx;
+      if (StartIdx == 0)
+        // The identity rotation isn't interesting, stop.
+        return SDValue();
 
-    if (Rotation == 0)
-      Rotation = CandidateRotation;
-    else if (Rotation != CandidateRotation)
-      // The rotations don't match, so we can't match this mask.
-      return SDValue();
+      // If we found the tail of a vector the rotation must be the missing
+      // front. If we found the head of a vector, it must be how much of the
+      // head.
+      int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
 
-    // Compute which value this mask is pointing at.
-    SDValue MaskV = Mask[i] < Size ? V1 : V2;
-
-    // Compute which of the two target values this index should be assigned to.
-    // This reflects whether the high elements are remaining or the low elements
-    // are remaining.
-    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
-
-    // Either set up this value if we've not encountered it before, or check
-    // that it remains consistent.
-    if (!TargetV)
-      TargetV = MaskV;
-    else if (TargetV != MaskV)
-      // This may be a rotation, but it pulls from the inputs in some
-      // unsupported interleaving.
-      return SDValue();
+      if (Rotation == 0)
+        Rotation = CandidateRotation;
+      else if (Rotation != CandidateRotation)
+        // The rotations don't match, so we can't match this mask.
+        return SDValue();
+
+      // Compute which value this mask is pointing at.
+      SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
+
+      // Compute which of the two target values this index should be assigned
+      // to. This reflects whether the high elements are remaining or the low
+      // elements are remaining.
+      SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
+
+      // Either set up this value if we've not encountered it before, or check
+      // that it remains consistent.
+      if (!TargetV)
+        TargetV = MaskV;
+      else if (TargetV != MaskV)
+        // This may be a rotation, but it pulls from the inputs in some
+        // unsupported interleaving.
+        return SDValue();
+    }
   }
 
   // Check that we successfully analyzed the mask, and normalize the results.
@@ -7565,26 +6382,27 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
   else if (!Hi)
     Hi = Lo;
 
-  assert(VT.getSizeInBits() == 128 &&
-         "Rotate-based lowering only supports 128-bit lowering!");
-  assert(Mask.size() <= 16 &&
-         "Can shuffle at most 16 bytes in a 128-bit vector!");
-
   // The actual rotate instruction rotates bytes, so we need to scale the
-  // rotation based on how many bytes are in the vector.
-  int Scale = 16 / Mask.size();
+  // rotation based on how many bytes are in the vector lane.
+  int Scale = 16 / NumLaneElts;
 
-  // SSSE3 targets can use the palignr instruction
+  // SSSE3 targets can use the palignr instruction.
   if (Subtarget->hasSSSE3()) {
-    // Cast the inputs to v16i8 to match PALIGNR.
-    Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
-    Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
+    // Cast the inputs to i8 vector of correct length to match PALIGNR.
+    MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
+    Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
+    Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
 
     return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
+                       DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
   }
 
+  assert(VT.getSizeInBits() == 128 &&
+         "Rotate-based lowering only supports 128-bit lowering!");
+  assert(Mask.size() <= 16 &&
+         "Can shuffle at most 16 bytes in a 128-bit vector!");
+
   // Default SSE2 implementation
   int LoByteShift = 16 - Rotation * Scale;
   int HiByteShift = Rotation * Scale;
@@ -7594,9 +6412,9 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
 
   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
-                                DAG.getConstant(8 * LoByteShift, MVT::i8));
+                                DAG.getConstant(LoByteShift, MVT::i8));
   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
-                                DAG.getConstant(8 * HiByteShift, MVT::i8));
+                                DAG.getConstant(HiByteShift, MVT::i8));
   return DAG.getNode(ISD::BITCAST, DL, VT,
                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
 }
@@ -7613,6 +6431,11 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
                                                      SDValue V1, SDValue V2) {
   SmallBitVector Zeroable(Mask.size(), false);
 
+  while (V1.getOpcode() == ISD::BITCAST)
+    V1 = V1->getOperand(0);
+  while (V2.getOpcode() == ISD::BITCAST)
+    V2 = V2->getOperand(0);
+
   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
 
@@ -7624,10 +6447,10 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
       continue;
     }
 
-    // If this is an index into a build_vector node, dig out the input value and
-    // use it.
+    // If this is an index into a build_vector node (which has the same number
+    // of elements), dig out the input value and use it.
     SDValue V = M < Size ? V1 : V2;
-    if (V.getOpcode() != ISD::BUILD_VECTOR)
+    if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
       continue;
 
     SDValue Input = V.getOperand(M % Size);
@@ -7640,85 +6463,133 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
   return Zeroable;
 }
 
-/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
-///
-/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
-/// byte-shift instructions. The mask must consist of a shifted sequential
-/// shuffle from one of the input vectors and zeroable elements for the
-/// remaining 'shifted in' elements.
+/// \brief Try to emit a bitmask instruction for a shuffle.
 ///
-/// Note that this only handles 128-bit vector widths currently.
-static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
-                                             SDValue V2, ArrayRef<int> Mask,
-                                             SelectionDAG &DAG) {
-  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+                                           SDValue V2, ArrayRef<int> Mask,
+                                           SelectionDAG &DAG) {
+  MVT EltVT = VT.getScalarType();
+  int NumEltBits = EltVT.getSizeInBits();
+  MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
+  SDValue Zero = DAG.getConstant(0, IntEltVT);
+  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
+  if (EltVT.isFloatingPoint()) {
+    Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
+    AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
+  }
+  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  SDValue V;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Zeroable[i])
+      continue;
+    if (Mask[i] % Size != i)
+      return SDValue(); // Not a blend.
+    if (!V)
+      V = Mask[i] < Size ? V1 : V2;
+    else if (V != (Mask[i] < Size ? V1 : V2))
+      return SDValue(); // Can only let one input through the mask.
+
+    VMaskOps[i] = AllOnes;
+  }
+  if (!V)
+    return SDValue(); // No non-zeroable elements!
+
+  SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+  V = DAG.getNode(VT.isFloatingPoint()
+                  ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
+                  DL, VT, V, VMask);
+  return V;
+}
 
+/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
+/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
+/// matches elements from one of the input vectors shuffled to the left or
+/// right with zeroable elements 'shifted in'. It handles both the strictly
+/// bit-wise element shifts and the byte shift across an entire 128-bit double
+/// quad word lane.
+///
+/// PSHL : (little-endian) left bit shift.
+/// [ zz, 0, zz,  2 ]
+/// [ -1, 4, zz, -1 ]
+/// PSRL : (little-endian) right bit shift.
+/// [  1, zz,  3, zz]
+/// [ -1, -1,  7, zz]
+/// PSLLDQ : (little-endian) left byte shift
+/// [ zz,  0,  1,  2,  3,  4,  5,  6]
+/// [ zz, zz, -1, -1,  2,  3,  4, -1]
+/// [ zz, zz, zz, zz, zz, zz, -1,  1]
+/// PSRLDQ : (little-endian) right byte shift
+/// [  5, 6,  7, zz, zz, zz, zz, zz]
+/// [ -1, 5,  6,  7, zz, zz, zz, zz]
+/// [  1, 2, -1, -1, -1, -1, zz, zz]
+static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
+                                         SDValue V2, ArrayRef<int> Mask,
+                                         SelectionDAG &DAG) {
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
 
   int Size = Mask.size();
-  int Scale = 16 / Size;
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
+    for (int i = 0; i < Size; i += Scale)
+      for (int j = 0; j < Shift; ++j)
+        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
+          return false;
 
-  auto isSequential = [](int Base, int StartIndex, int EndIndex, int MaskOffset,
-                         ArrayRef<int> Mask) {
-    for (int i = StartIndex; i < EndIndex; i++) {
-      if (Mask[i] < 0)
-        continue;
-      if (i + Base != Mask[i] - MaskOffset)
-        return false;
-    }
     return true;
   };
 
-  for (int Shift = 1; Shift < Size; Shift++) {
-    int ByteShift = Shift * Scale;
-
-    // PSRLDQ : (little-endian) right byte shift
-    // [ 5,  6,  7, zz, zz, zz, zz, zz]
-    // [ -1, 5,  6,  7, zz, zz, zz, zz]
-    // [  1, 2, -1, -1, -1, -1, zz, zz]
-    bool ZeroableRight = true;
-    for (int i = Size - Shift; i < Size; i++) {
-      ZeroableRight &= Zeroable[i];
-    }
-
-    if (ZeroableRight) {
-      bool ValidShiftRight1 = isSequential(Shift, 0, Size - Shift, 0, Mask);
-      bool ValidShiftRight2 = isSequential(Shift, 0, Size - Shift, Size, Mask);
-
-      if (ValidShiftRight1 || ValidShiftRight2) {
-        // Cast the inputs to v2i64 to match PSRLDQ.
-        SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
+  auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
+    for (int i = 0; i != Size; i += Scale) {
+      unsigned Pos = Left ? i + Shift : i;
+      unsigned Low = Left ? i : i + Shift;
+      unsigned Len = Scale - Shift;
+      if (!isSequentialOrUndefInRange(Mask, Pos, Len,
+                                      Low + (V == V1 ? 0 : Size)))
+        return SDValue();
     }
 
-    // PSLLDQ : (little-endian) left byte shift
-    // [ zz,  0,  1,  2,  3,  4,  5,  6]
-    // [ zz, zz, -1, -1,  2,  3,  4, -1]
-    // [ zz, zz, zz, zz, zz, zz, -1,  1]
-    bool ZeroableLeft = true;
-    for (int i = 0; i < Shift; i++) {
-      ZeroableLeft &= Zeroable[i];
-    }
+    int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
+    bool ByteShift = ShiftEltBits > 64;
+    unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
+                           : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
+    int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
 
-    if (ZeroableLeft) {
-      bool ValidShiftLeft1 = isSequential(-Shift, Shift, Size, 0, Mask);
-      bool ValidShiftLeft2 = isSequential(-Shift, Shift, Size, Size, Mask);
+    // Normalize the scale for byte shifts to still produce an i64 element
+    // type.
+    Scale = ByteShift ? Scale / 2 : Scale;
 
-      if (ValidShiftLeft1 || ValidShiftLeft2) {
-        // Cast the inputs to v2i64 to match PSLLDQ.
-        SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
-    }
-  }
+    // We need to round trip through the appropriate type for the shift.
+    MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
+    MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+    assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+           "Illegal integer vector type");
+    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
 
+    V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
+    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+  };
+
+  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+  // keep doubling the size of the integer elements up to that. We can
+  // then shift the elements of the integer vector by whole multiples of
+  // their width within the elements of the larger integer vector. Test each
+  // multiple to see if we can find a match with the moved element indices
+  // and that the shifted in elements are all zeroable.
+  for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
+    for (int Shift = 1; Shift != Scale; ++Shift)
+      for (bool Left : {true, false})
+        if (CheckZeros(Shift, Scale, Left))
+          for (SDValue V : {V1, V2})
+            if (SDValue Match = MatchShift(Shift, Scale, Left, V))
+              return Match;
+
+  // no match
   return SDValue();
 }
 
@@ -7728,10 +6599,11 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
 /// stride, produce either a zero or any extension based on the available
 /// features of the subtarget.
 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
-    SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
+    SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   assert(Scale > 1 && "Need a scale to extend.");
-  int EltBits = VT.getSizeInBits() / NumElements;
+  int NumElements = VT.getVectorNumElements();
+  int EltBits = VT.getScalarSizeInBits();
   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
          "Only 8, 16, and 32 bit elements can be extended.");
   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
@@ -7739,10 +6611,8 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   // Found a valid zext mask! Try various lowering strategies based on the
   // input type and available ISA extensions.
   if (Subtarget->hasSSE41()) {
-    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                  NumElements / Scale);
-    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
     return DAG.getNode(ISD::BITCAST, DL, VT,
                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
   }
@@ -7800,7 +6670,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
 }
 
-/// \brief Try to lower a vector shuffle as a zero extension on any micrarch.
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
 ///
 /// This routine will try to do everything in its power to cleverly lower
 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
@@ -7818,7 +6688,10 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
 
   int Bits = VT.getSizeInBits();
-  int NumElements = Mask.size();
+  int NumElements = VT.getVectorNumElements();
+  assert(VT.getScalarSizeInBits() <= 32 &&
+         "Exceeds 32-bit integer zero extension limit");
+  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
 
   // Define a helper function to check a particular ext-scale and lower to it if
   // valid.
@@ -7829,11 +6702,11 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
       if (Mask[i] == -1)
         continue; // Valid anywhere but doesn't tell us anything.
       if (i % Scale != 0) {
-        // Each of the extend elements needs to be zeroable.
+        // Each of the extended elements need to be zeroable.
         if (!Zeroable[i])
           return SDValue();
 
-        // We no lorger are in the anyext case.
+        // We no longer are in the anyext case.
         AnyExt = false;
         continue;
       }
@@ -7847,7 +6720,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
         return SDValue(); // Flip-flopping inputs.
 
       if (Mask[i] % NumElements != i / Scale)
-        return SDValue(); // Non-consecutive strided elemenst.
+        return SDValue(); // Non-consecutive strided elements.
     }
 
     // If we fail to find an input, we have a zero-shuffle which should always
@@ -7857,7 +6730,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
       return SDValue();
 
     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
-        DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
+        DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
   };
 
   // The widest scale possible for extending is to a 64-bit integer.
@@ -7869,11 +6742,34 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
   // many elements.
   for (; NumExtElements < NumElements; NumExtElements *= 2) {
     assert(NumElements % NumExtElements == 0 &&
-           "The input vector size must be divisble by the extended size.");
+           "The input vector size must be divisible by the extended size.");
     if (SDValue V = Lower(NumElements / NumExtElements))
       return V;
   }
 
+  // General extends failed, but 128-bit vectors may be able to use MOVQ.
+  if (Bits != 128)
+    return SDValue();
+
+  // Returns one of the source operands if the shuffle can be reduced to a
+  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
+  auto CanZExtLowHalf = [&]() {
+    for (int i = NumElements / 2; i != NumElements; ++i)
+      if (!Zeroable[i])
+        return SDValue();
+    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
+      return V1;
+    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
+      return V2;
+    return SDValue();
+  };
+
+  if (SDValue V = CanZExtLowHalf()) {
+    V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
+    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
+    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+  }
+
   // No viable ext lowering found.
   return SDValue();
 }
@@ -7916,7 +6812,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
 /// This is a common pattern that we have especially efficient patterns to lower
 /// across all subtarget feature sets.
 static SDValue lowerVectorShuffleAsElementInsertion(
-    MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   MVT ExtVT = VT;
@@ -7983,6 +6879,10 @@ static SDValue lowerVectorShuffleAsElementInsertion(
                        ExtVT, V1, V2);
   }
 
+  // This lowering only works for the low element with floating point vectors.
+  if (VT.isFloatingPoint() && V2Index != 0)
+    return SDValue();
+
   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
   if (ExtVT != VT)
     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
@@ -8001,7 +6901,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
       V2 = DAG.getNode(
           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
           DAG.getConstant(
-              V2Index * EltVT.getSizeInBits(),
+              V2Index * EltVT.getSizeInBits()/8,
               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
     }
@@ -8014,7 +6914,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
 /// For convenience, this code also bundles all of the subtarget feature set
 /// filtering. While a little annoying to re-dispatch on type here, there isn't
 /// a convenient way to factor it out.
-static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget *Subtarget,
                                              SelectionDAG &DAG) {
@@ -8086,6 +6986,199 @@ static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
 }
 
+// Check for whether we can use INSERTPS to perform the shuffle. We only use
+// INSERTPS when the V1 elements are already in the correct locations
+// because otherwise we can just always use two SHUFPS instructions which
+// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
+// perform INSERTPS if a single V1 element is out of place and all V2
+// elements are zeroable.
+static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
+                                            ArrayRef<int> Mask,
+                                            SelectionDAG &DAG) {
+  assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+  unsigned ZMask = 0;
+  int V1DstIndex = -1;
+  int V2DstIndex = -1;
+  bool V1UsedInPlace = false;
+
+  for (int i = 0; i < 4; ++i) {
+    // Synthesize a zero mask from the zeroable elements (includes undefs).
+    if (Zeroable[i]) {
+      ZMask |= 1 << i;
+      continue;
+    }
+
+    // Flag if we use any V1 inputs in place.
+    if (i == Mask[i]) {
+      V1UsedInPlace = true;
+      continue;
+    }
+
+    // We can only insert a single non-zeroable element.
+    if (V1DstIndex != -1 || V2DstIndex != -1)
+      return SDValue();
+
+    if (Mask[i] < 4) {
+      // V1 input out of place for insertion.
+      V1DstIndex = i;
+    } else {
+      // V2 input for insertion.
+      V2DstIndex = i;
+    }
+  }
+
+  // Don't bother if we have no (non-zeroable) element for insertion.
+  if (V1DstIndex == -1 && V2DstIndex == -1)
+    return SDValue();
+
+  // Determine element insertion src/dst indices. The src index is from the
+  // start of the inserted vector, not the start of the concatenated vector.
+  unsigned V2SrcIndex = 0;
+  if (V1DstIndex != -1) {
+    // If we have a V1 input out of place, we use V1 as the V2 element insertion
+    // and don't use the original V2 at all.
+    V2SrcIndex = Mask[V1DstIndex];
+    V2DstIndex = V1DstIndex;
+    V2 = V1;
+  } else {
+    V2SrcIndex = Mask[V2DstIndex] - 4;
+  }
+
+  // If no V1 inputs are used in place, then the result is created only from
+  // the zero mask and the V2 insertion - so remove V1 dependency.
+  if (!V1UsedInPlace)
+    V1 = DAG.getUNDEF(MVT::v4f32);
+
+  unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+  // Insert the V2 element into the desired position.
+  SDLoc DL(Op);
+  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                     DAG.getConstant(InsertPSMask, MVT::i8));
+}
+
+/// \brief Try to lower a shuffle as a permute of the inputs followed by an
+/// UNPCK instruction.
+///
+/// This specifically targets cases where we end up with alternating between
+/// the two inputs, and so can permute them into something that feeds a single
+/// UNPCK instruction. Note that this routine only targets integer vectors
+/// because for floating point vectors we have a generalized SHUFPS lowering
+/// strategy that handles everything that doesn't *exactly* match an unpack,
+/// making this clever lowering unnecessary.
+static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1,
+                                          SDValue V2, ArrayRef<int> Mask,
+                                          SelectionDAG &DAG) {
+  assert(!VT.isFloatingPoint() &&
+         "This routine only supports integer vectors.");
+  assert(!isSingleInputShuffleMask(Mask) &&
+         "This routine should only be used when blending two inputs.");
+  assert(Mask.size() >= 2 && "Single element masks are invalid.");
+
+  int Size = Mask.size();
+
+  int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
+    return M >= 0 && M % Size < Size / 2;
+  });
+  int NumHiInputs = std::count_if(
+      Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
+
+  bool UnpackLo = NumLoInputs >= NumHiInputs;
+
+  auto TryUnpack = [&](MVT UnpackVT, int Scale) {
+    SmallVector<int, 32> V1Mask(Mask.size(), -1);
+    SmallVector<int, 32> V2Mask(Mask.size(), -1);
+
+    for (int i = 0; i < Size; ++i) {
+      if (Mask[i] < 0)
+        continue;
+
+      // Each element of the unpack contains Scale elements from this mask.
+      int UnpackIdx = i / Scale;
+
+      // We only handle the case where V1 feeds the first slots of the unpack.
+      // We rely on canonicalization to ensure this is the case.
+      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
+        return SDValue();
+
+      // Setup the mask for this input. The indexing is tricky as we have to
+      // handle the unpack stride.
+      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
+      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
+          Mask[i] % Size;
+    }
+
+    // If we will have to shuffle both inputs to use the unpack, check whether
+    // we can just unpack first and shuffle the result. If so, skip this unpack.
+    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
+        !isNoopShuffleMask(V2Mask))
+      return SDValue();
+
+    // Shuffle the inputs into place.
+    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+
+    // Cast the inputs to the type we will use to unpack them.
+    V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2);
+
+    // Unpack the inputs and cast the result back to the desired type.
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
+                                   DL, UnpackVT, V1, V2));
+  };
+
+  // We try each unpack from the largest to the smallest to try and find one
+  // that fits this mask.
+  int OrigNumElements = VT.getVectorNumElements();
+  int OrigScalarSize = VT.getScalarSizeInBits();
+  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
+    int Scale = ScalarSize / OrigScalarSize;
+    int NumElements = OrigNumElements / Scale;
+    MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
+    if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
+      return Unpack;
+  }
+
+  // If none of the unpack-rooted lowerings worked (or were profitable) try an
+  // initial unpack.
+  if (NumLoInputs == 0 || NumHiInputs == 0) {
+    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
+           "We have to have *some* inputs!");
+    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
+
+    // FIXME: We could consider the total complexity of the permute of each
+    // possible unpacking. Or at the least we should consider how many
+    // half-crossings are created.
+    // FIXME: We could consider commuting the unpacks.
+
+    SmallVector<int, 32> PermMask;
+    PermMask.assign(Size, -1);
+    for (int i = 0; i < Size; ++i) {
+      if (Mask[i] < 0)
+        continue;
+
+      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
+
+      PermMask[i] =
+          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
+    }
+    return DAG.getVectorShuffle(
+        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
+                            DL, VT, V1, V2),
+        DAG.getUNDEF(VT), PermMask);
+  }
+
+  return SDValue();
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -8105,6 +7198,11 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
 
   if (isSingleInputShuffleMask(Mask)) {
+    // Use low duplicate instructions for masks that match their pattern.
+    if (Subtarget->hasSSE3())
+      if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
+        return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
+
     // Straight shuffle of a single input vector. Simulate this by using the
     // single input as both of the "inputs" to this instruction..
     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
@@ -8122,29 +7220,24 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
-
   // If we have a single input, insert that into V1 if we can do so cheaply.
   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
+            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
       return Insertion;
     // Try inverting the insertion since for v2 masks it is easy to do and we
     // can't reliably sort the mask one way or the other.
     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
+            DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
       return Insertion;
   }
 
   // Try to use one of the special instruction patterns to handle two common
   // blend patterns if a zero-blend above didn't work.
-  if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
       // We can either use a special instruction to load over the low double or
       // to move just the low double.
@@ -8158,6 +7251,12 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                   Subtarget, DAG))
       return Blend;
 
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+
   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
                      DAG.getConstant(SHUFPDMask, MVT::i8));
@@ -8182,7 +7281,7 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   if (isSingleInputShuffleMask(Mask)) {
     // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
                                                           Mask, Subtarget, DAG))
       return Broadcast;
 
@@ -8198,37 +7297,60 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
   }
+  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
+  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
+  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
+  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+
+  // If we have a blend of two PACKUS operations an the blend aligns with the
+  // low and half halves, we can just merge the PACKUS operations. This is
+  // particularly important as it lets us merge shuffles that this routine itself
+  // creates.
+  auto GetPackNode = [](SDValue V) {
+    while (V.getOpcode() == ISD::BITCAST)
+      V = V.getOperand(0);
 
-  // If we have a single input from V2 insert that into V1 if we can do so
-  // cheaply.
-  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
-    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
-      return Insertion;
-    // Try inverting the insertion since for v2 masks it is easy to do and we
-    // can't reliably sort the mask one way or the other.
-    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
-                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
-    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
-      return Insertion;
-  }
-
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
+  };
+  if (SDValue V1Pack = GetPackNode(V1))
+    if (SDValue V2Pack = GetPackNode(V2))
+      return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                         DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
+                                     Mask[0] == 0 ? V1Pack.getOperand(0)
+                                                  : V1Pack.getOperand(1),
+                                     Mask[1] == 2 ? V2Pack.getOperand(0)
+                                                  : V2Pack.getOperand(1)));
+
+  // Try to use shift instructions.
+  if (SDValue Shift =
+          lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
+    return Shift;
 
-  if (Subtarget->hasSSE41())
+  // When loading a scalar and then shuffling it into a vector we can often do
+  // the insertion cheaply.
+  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+          DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+    return Insertion;
+  // Try inverting the insertion since for v2 masks it is easy to do and we
+  // can't reliably sort the mask one way or the other.
+  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
+  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+          DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
+    return Insertion;
+
+  // We have different paths for blend lowering, but they all must use the
+  // *exact* same predicate.
+  bool IsBlendSupported = Subtarget->hasSSE41();
+  if (IsBlendSupported)
     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                   Subtarget, DAG))
       return Blend;
 
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v2i64, V1, V2, Mask, DAG))
-    return Shift;
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
 
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
@@ -8237,6 +7359,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
       return Rotate;
 
+  // If we have direct support for blends, we should lower by decomposing into
+  // a permute. That will be faster than the domain cross.
+  if (IsBlendSupported)
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
+                                                      Mask, DAG);
+
   // We implement this with SHUFPD which is pretty lame because it will likely
   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
   // However, all the alternatives are still more cycles and newer chips don't
@@ -8247,6 +7375,24 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
 }
 
+/// \brief Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+  // This routine only handles 128-bit shufps.
+  assert(Mask.size() == 4 && "Unsupported mask size!");
+
+  // To lower with a single SHUFPS we need to have the low half and high half
+  // each requiring a single input.
+  if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
+    return false;
+  if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
+    return false;
+
+  return true;
+}
+
 /// \brief Lower a vector shuffle using the SHUFPS instruction.
 ///
 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
@@ -8358,10 +7504,18 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   if (NumV2Elements == 0) {
     // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
                                                           Mask, Subtarget, DAG))
       return Broadcast;
 
+    // Use even/odd duplicate instructions for masks that match their pattern.
+    if (Subtarget->hasSSE3()) {
+      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
+      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
+        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
+    }
+
     if (Subtarget->hasAVX()) {
       // If we have AVX, we can use VPERMILPS which will allow folding a load
       // into the shuffle.
@@ -8375,70 +7529,41 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
   }
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
-  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
-
   // There are special ways we can lower some single-element blends. However, we
   // have custom ways we can lower more complex single-element blends below that
   // we defer to if both this and BLENDPS fail to match, so restrict this to
   // when the V2 input is targeting element 0 of the mask -- that is the fast
   // case here.
   if (NumV2Elements == 1 && Mask[0] >= 4)
-    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
                                                          Mask, Subtarget, DAG))
       return V;
 
-  if (Subtarget->hasSSE41())
+  if (Subtarget->hasSSE41()) {
     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                                   Subtarget, DAG))
       return Blend;
 
-  // Check for whether we can use INSERTPS to perform the blend. We only use
-  // INSERTPS when the V1 elements are already in the correct locations
-  // because otherwise we can just always use two SHUFPS instructions which
-  // are much smaller to encode than a SHUFPS and an INSERTPS.
-  if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
-    int V2Index =
-        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
-        Mask.begin();
-
-    // When using INSERTPS we can zero any lane of the destination. Collect
-    // the zero inputs into a mask and drop them from the lanes of V1 which
-    // actually need to be present as inputs to the INSERTPS.
-    SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
-    // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
-    bool InsertNeedsShuffle = false;
-    unsigned ZMask = 0;
-    for (int i = 0; i < 4; ++i)
-      if (i != V2Index) {
-        if (Zeroable[i]) {
-          ZMask |= 1 << i;
-        } else if (Mask[i] != i) {
-          InsertNeedsShuffle = true;
-          break;
-        }
-      }
-
-    // We don't want to use INSERTPS or other insertion techniques if it will
-    // require shuffling anyways.
-    if (!InsertNeedsShuffle) {
-      // If all of V1 is zeroable, replace it with undef.
-      if ((ZMask | 1 << V2Index) == 0xF)
-        V1 = DAG.getUNDEF(MVT::v4f32);
-
-      unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
-      assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+    // Use INSERTPS if we can complete the shuffle efficiently.
+    if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
+      return V;
 
-      // Insert the V2 element into the desired position.
-      return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
-                         DAG.getConstant(InsertPSMask, MVT::i8));
-    }
+    if (!isSingleSHUFPSMask(Mask))
+      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+              DL, MVT::v4f32, V1, V2, Mask, DAG))
+        return BlendPerm;
   }
 
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
+  if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
+
   // Otherwise fall back to a SHUFPS lowering strategy.
   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
 }
@@ -8470,7 +7595,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   if (NumV2Elements == 0) {
     // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
                                                           Mask, Subtarget, DAG))
       return Broadcast;
 
@@ -8481,36 +7606,47 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     // so prevents folding a load into this instruction or making a copy.
     const int UnpackLoMask[] = {0, 0, 1, 1};
     const int UnpackHiMask[] = {2, 2, 3, 3};
-    if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
       Mask = UnpackLoMask;
-    else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
+    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
       Mask = UnpackHiMask;
 
     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
   }
 
+  // Try to use shift instructions.
+  if (SDValue Shift =
+          lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Shift;
+
   // There are special ways we can lower some single-element blends.
   if (NumV2Elements == 1)
-    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
                                                          Mask, Subtarget, DAG))
       return V;
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
-  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
-
-  if (Subtarget->hasSSE41())
+  // We have different paths for blend lowering, but they all must use the
+  // *exact* same predicate.
+  bool IsBlendSupported = Subtarget->hasSSE41();
+  if (IsBlendSupported)
     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                   Subtarget, DAG))
       return Blend;
 
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v4i32, V1, V2, Mask, DAG))
-    return Shift;
+  if (SDValue Masked =
+          lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Masked;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
+  if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
 
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
@@ -8519,6 +7655,17 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
       return Rotate;
 
+  // If we have direct support for blends, we should lower by decomposing into
+  // a permute. That will be faster than the domain cross.
+  if (IsBlendSupported)
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
+                                                      Mask, DAG);
+
+  // Try to lower by permuting the inputs into an unpack instruction.
+  if (SDValue Unpack =
+          lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Unpack;
+
   // We implement this with SHUFPS because it can blend from two vectors.
   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
   // up the inputs, bypassing domain shift penalties that we would encur if we
@@ -8542,7 +7689,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 /// The exact breakdown of how to form these dword pairs and align them on the
 /// correct sides is really tricky. See the comments within the function for
 /// more of the details.
-static SDValue lowerV8I16SingleInputVectorShuffle(
+static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
@@ -8570,27 +7717,6 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
 
-  // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
-                                                        Mask, Subtarget, DAG))
-    return Broadcast;
-
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
-  if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
-
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v8i16, V, V, Mask, DAG))
-    return Shift;
-
-  // Try to use byte rotation instructions.
-  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
-          DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
-    return Rotate;
-
   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
   // such inputs we can swap two of the dwords across the half mark and end up
   // with <=2 inputs to each half in each half. Once there, we can fall through
@@ -8993,158 +8119,56 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
   return V;
 }
 
-/// \brief Detect whether the mask pattern should be lowered through
-/// interleaving.
-///
-/// This essentially tests whether viewing the mask as an interleaving of two
-/// sub-sequences reduces the cross-input traffic of a blend operation. If so,
-/// lowering it through interleaving is a significantly better strategy.
-static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
-  int NumEvenInputs[2] = {0, 0};
-  int NumOddInputs[2] = {0, 0};
-  int NumLoInputs[2] = {0, 0};
-  int NumHiInputs[2] = {0, 0};
-  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
-    if (Mask[i] < 0)
-      continue;
-
-    int InputIdx = Mask[i] >= Size;
-
-    if (i < Size / 2)
-      ++NumLoInputs[InputIdx];
-    else
-      ++NumHiInputs[InputIdx];
-
-    if ((i % 2) == 0)
-      ++NumEvenInputs[InputIdx];
-    else
-      ++NumOddInputs[InputIdx];
-  }
-
-  // The minimum number of cross-input results for both the interleaved and
-  // split cases. If interleaving results in fewer cross-input results, return
-  // true.
-  int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
-                                    NumEvenInputs[0] + NumOddInputs[1]);
-  int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
-                              NumLoInputs[0] + NumHiInputs[1]);
-  return InterleavedCrosses < SplitCrosses;
-}
-
-/// \brief Blend two v8i16 vectors using a naive unpack strategy.
-///
-/// This strategy only works when the inputs from each vector fit into a single
-/// half of that vector, and generally there are not so many inputs as to leave
-/// the in-place shuffles required highly constrained (and thus expensive). It
-/// shifts all the inputs into a single side of both input vectors and then
-/// uses an unpack to interleave these inputs in a single vector. At that
-/// point, we will fall back on the generic single input shuffle lowering.
-static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
-                                                 SDValue V2,
-                                                 MutableArrayRef<int> Mask,
-                                                 const X86Subtarget *Subtarget,
-                                                 SelectionDAG &DAG) {
-  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
-  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
-  SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
-  for (int i = 0; i < 8; ++i)
-    if (Mask[i] >= 0 && Mask[i] < 4)
-      LoV1Inputs.push_back(i);
-    else if (Mask[i] >= 4 && Mask[i] < 8)
-      HiV1Inputs.push_back(i);
-    else if (Mask[i] >= 8 && Mask[i] < 12)
-      LoV2Inputs.push_back(i);
-    else if (Mask[i] >= 12)
-      HiV2Inputs.push_back(i);
-
-  int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
-  int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
-  (void)NumV1Inputs;
-  (void)NumV2Inputs;
-  assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
-  assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
-  assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
-
-  bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
-                     HiV1Inputs.size() + HiV2Inputs.size();
-
-  auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
-                              ArrayRef<int> HiInputs, bool MoveToLo,
-                              int MaskOffset) {
-    ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
-    ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
-    if (BadInputs.empty())
-      return V;
-
-    int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
-    int MoveOffset = MoveToLo ? 0 : 4;
+/// \brief Helper to form a PSHUFB-based shuffle+blend.
+static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
+                                          SDValue V2, ArrayRef<int> Mask,
+                                          SelectionDAG &DAG, bool &V1InUse,
+                                          bool &V2InUse) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  SDValue V1Mask[16];
+  SDValue V2Mask[16];
+  V1InUse = false;
+  V2InUse = false;
 
-    if (GoodInputs.empty()) {
-      for (int BadInput : BadInputs) {
-        MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
-        Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
-      }
+  int Size = Mask.size();
+  int Scale = 16 / Size;
+  for (int i = 0; i < 16; ++i) {
+    if (Mask[i / Scale] == -1) {
+      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
     } else {
-      if (GoodInputs.size() == 2) {
-        // If the low inputs are spread across two dwords, pack them into
-        // a single dword.
-        MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
-        MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
-        Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
-        Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
-      } else {
-        // Otherwise pin the good inputs.
-        for (int GoodInput : GoodInputs)
-          MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
-      }
-
-      if (BadInputs.size() == 2) {
-        // If we have two bad inputs then there may be either one or two good
-        // inputs fixed in place. Find a fixed input, and then find the *other*
-        // two adjacent indices by using modular arithmetic.
-        int GoodMaskIdx =
-            std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
-                         [](int M) { return M >= 0; }) -
-            std::begin(MoveMask);
-        int MoveMaskIdx =
-            ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
-        assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
-        assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
-        MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
-        MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
-        Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
-        Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
-      } else {
-        assert(BadInputs.size() == 1 && "All sizes handled");
-        int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
-                                    std::end(MoveMask), -1) -
-                          std::begin(MoveMask);
-        MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
-        Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
-      }
-    }
-
-    return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
-                                MoveMask);
-  };
-  V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
-                        /*MaskOffset*/ 0);
-  V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
-                        /*MaskOffset*/ 8);
-
-  // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
-  // cross-half traffic in the final shuffle.
+      const int ZeroMask = 0x80;
+      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
+                                          : ZeroMask;
+      int V2Idx = Mask[i / Scale] < Size
+                      ? ZeroMask
+                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
+      if (Zeroable[i / Scale])
+        V1Idx = V2Idx = ZeroMask;
+      V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
+      V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
+      V1InUse |= (ZeroMask != V1Idx);
+      V2InUse |= (ZeroMask != V2Idx);
+    }
+  }
+
+  if (V1InUse)
+    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+                     DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1),
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+  if (V2InUse)
+    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+                     DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2),
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
 
-  // Munge the mask to be a single-input mask after the unpack merges the
-  // results.
-  for (int &M : Mask)
-    if (M != -1)
-      M = 2 * (M % 4) + (M / 8);
+  // If we need shuffled inputs from both, blend the two.
+  SDValue V;
+  if (V1InUse && V2InUse)
+    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+  else
+    V = V1InUse ? V1 : V2;
 
-  return DAG.getVectorShuffle(
-      MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
-                                  DL, MVT::v8i16, V1, V2),
-      DAG.getUNDEF(MVT::v8i16), Mask);
+  // Cast the result back to the correct type.
+  return DAG.getNode(ISD::BITCAST, DL, VT, V);
 }
 
 /// \brief Generic lowering of 8-lane i16 shuffles.
@@ -9181,85 +8205,95 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return ZExt;
 
   auto isV1 = [](int M) { return M >= 0 && M < 8; };
+  (void)isV1;
   auto isV2 = [](int M) { return M >= 8; };
 
-  int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
 
-  if (NumV2Inputs == 0)
-    return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
+  if (NumV2Inputs == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
+    // Try to use shift instructions.
+    if (SDValue Shift =
+            lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
+      return Shift;
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
+      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
+    if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
+      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
+
+    // Try to use byte rotation instructions.
+    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
+                                                        Mask, Subtarget, DAG))
+      return Rotate;
+
+    return lowerV8I16GeneralSingleInputVectorShuffle(DL, V1, Mask, Subtarget,
+                                                     DAG);
+  }
 
-  assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
-                            "to be V1-input shuffles.");
+  assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
+         "All single-input shuffles should be canonicalized to be V1-input "
+         "shuffles.");
+
+  // Try to use shift instructions.
+  if (SDValue Shift =
+          lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Shift;
 
   // There are special ways we can lower some single-element blends.
   if (NumV2Inputs == 1)
-    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
                                                          Mask, Subtarget, DAG))
       return V;
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
-  if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
-
-  if (Subtarget->hasSSE41())
+  // We have different paths for blend lowering, but they all must use the
+  // *exact* same predicate.
+  bool IsBlendSupported = Subtarget->hasSSE41();
+  if (IsBlendSupported)
     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                                   Subtarget, DAG))
       return Blend;
 
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v8i16, V1, V2, Mask, DAG))
-    return Shift;
+  if (SDValue Masked =
+          lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Masked;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
 
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
     return Rotate;
 
-  if (NumV1Inputs + NumV2Inputs <= 4)
-    return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
-
-  // Check whether an interleaving lowering is likely to be more efficient.
-  // This isn't perfect but it is a strong heuristic that tends to work well on
-  // the kinds of shuffles that show up in practice.
-  //
-  // FIXME: Handle 1x, 2x, and 4x interleaving.
-  if (shouldLowerAsInterleaving(Mask)) {
-    // FIXME: Figure out whether we should pack these into the low or high
-    // halves.
+  if (SDValue BitBlend =
+          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return BitBlend;
 
-    int EMask[8], OMask[8];
-    for (int i = 0; i < 4; ++i) {
-      EMask[i] = Mask[2*i];
-      OMask[i] = Mask[2*i + 1];
-      EMask[i + 4] = -1;
-      OMask[i + 4] = -1;
-    }
+  if (SDValue Unpack =
+          lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Unpack;
 
-    SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
-    SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
-
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
-  }
-
-  int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
-  int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
-
-  for (int i = 0; i < 4; ++i) {
-    LoBlendMask[i] = Mask[i];
-    HiBlendMask[i] = Mask[i + 4];
+  // If we can't directly blend but can use PSHUFB, that will be better as it
+  // can both shuffle and set up the inefficient blend.
+  if (!IsBlendSupported && Subtarget->hasSSSE3()) {
+    bool V1InUse, V2InUse;
+    return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
+                                      V1InUse, V2InUse);
   }
 
-  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
-  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
-  LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
-  HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
-
-  return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                     DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
+  // We can always bit-blend if we have to so the fallback strategy is to
+  // decompose into single-input permutes and blends.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
+                                                      Mask, DAG);
 }
 
 /// \brief Check whether a compaction lowering can be done by dropping even
@@ -9345,40 +8379,31 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  ArrayRef<int> OrigMask = SVOp->getMask();
-  assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+  // Try to use shift instructions.
+  if (SDValue Shift =
+          lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
     return Shift;
 
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
-          DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
     return Rotate;
 
   // Try to use a zext lowering.
   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
-          DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
     return ZExt;
 
-  int MaskStorage[16] = {
-      OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
-      OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
-      OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
-      OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
-  MutableArrayRef<int> Mask(MaskStorage);
-  MutableArrayRef<int> LoMask = Mask.slice(0, 8);
-  MutableArrayRef<int> HiMask = Mask.slice(8, 8);
-
   int NumV2Elements =
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
 
   // For single-input shuffles, there are some nicer lowering tricks we can use.
   if (NumV2Elements == 0) {
     // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
                                                           Mask, Subtarget, DAG))
       return Broadcast;
 
@@ -9475,36 +8500,17 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       return V;
   }
 
-  // Check whether an interleaving lowering is likely to be more efficient.
-  // This isn't perfect but it is a strong heuristic that tends to work well on
-  // the kinds of shuffles that show up in practice.
-  //
-  // FIXME: We need to handle other interleaving widths (i16, i32, ...).
-  if (shouldLowerAsInterleaving(Mask)) {
-    int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
-      return (M >= 0 && M < 8) || (M >= 16 && M < 24);
-    });
-    int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
-      return (M >= 8 && M < 16) || M >= 24;
-    });
-    int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
-                     -1, -1, -1, -1, -1, -1, -1, -1};
-    int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
-                     -1, -1, -1, -1, -1, -1, -1, -1};
-    bool UnpackLo = NumLoHalf >= NumHiHalf;
-    MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
-    MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
-    for (int i = 0; i < 8; ++i) {
-      TargetEMask[i] = Mask[2 * i];
-      TargetOMask[i] = Mask[2 * i + 1];
-    }
-
-    SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
-    SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
-
-    return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
-                       MVT::v16i8, Evens, Odds);
-  }
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
+                                         0, 16, 1, 17, 2, 18, 3, 19,
+                                         // High half.
+                                         4, 20, 5, 21, 6, 22, 7, 23}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
+                                         8, 24, 9, 25, 10, 26, 11, 27,
+                                         // High half.
+                                         12, 28, 13, 29, 14, 30, 15, 31}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
 
   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
   // with PSHUFB. It is important to do this before we attempt to generate any
@@ -9520,33 +8526,47 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   // interleavings with direct instructions supporting them. We currently don't
   // handle those well here.
   if (Subtarget->hasSSSE3()) {
-    SDValue V1Mask[16];
-    SDValue V2Mask[16];
-    for (int i = 0; i < 16; ++i)
-      if (Mask[i] == -1) {
-        V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
-      } else {
-        V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
-        V2Mask[i] =
-            DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8);
-      }
-    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
-                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
-    if (isSingleInputShuffleMask(Mask))
-      return V1; // Single inputs are easy.
+    bool V1InUse = false;
+    bool V2InUse = false;
 
-    // Otherwise, blend the two.
-    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
-                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
-    return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+    SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
+                                                DAG, V1InUse, V2InUse);
+
+    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
+    // do so. This avoids using them to handle blends-with-zero which is
+    // important as a single pshufb is significantly faster for that.
+    if (V1InUse && V2InUse) {
+      if (Subtarget->hasSSE41())
+        if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
+                                                      Mask, Subtarget, DAG))
+          return Blend;
+
+      // We can use an unpack to do the blending rather than an or in some
+      // cases. Even though the or may be (very minorly) more efficient, we
+      // preference this lowering because there are common cases where part of
+      // the complexity of the shuffles goes away when we do the final blend as
+      // an unpack.
+      // FIXME: It might be worth trying to detect if the unpack-feeding
+      // shuffles will both be pshufb, in which case we shouldn't bother with
+      // this.
+      if (SDValue Unpack =
+              lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG))
+        return Unpack;
+    }
+
+    return PSHUFB;
   }
 
   // There are special ways we can lower some single-element blends.
   if (NumV2Elements == 1)
-    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
                                                          Mask, Subtarget, DAG))
       return V;
 
+  if (SDValue BitBlend =
+          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
+    return BitBlend;
+
   // Check whether a compaction lowering can be done. This handles shuffles
   // which take every Nth element for some even N. See the helper function for
   // details.
@@ -9585,72 +8605,58 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Result;
   }
 
-  int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
-  int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
-  int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
-  int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  // Handle multi-input cases by blending single-input shuffles.
+  if (NumV2Elements > 0)
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
+                                                      Mask, DAG);
 
-  auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
-                            MutableArrayRef<int> V1HalfBlendMask,
-                            MutableArrayRef<int> V2HalfBlendMask) {
-    for (int i = 0; i < 8; ++i)
-      if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
-        V1HalfBlendMask[i] = HalfMask[i];
-        HalfMask[i] = i;
-      } else if (HalfMask[i] >= 16) {
-        V2HalfBlendMask[i] = HalfMask[i] - 16;
-        HalfMask[i] = i + 8;
-      }
-  };
-  buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
-  buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
+  // The fallback path for single-input shuffles widens this into two v8i16
+  // vectors with unpacks, shuffles those, and then pulls them back together
+  // with a pack.
+  SDValue V = V1;
 
-  SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+  int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  for (int i = 0; i < 16; ++i)
+    if (Mask[i] >= 0)
+      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
 
-  auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
-                             MutableArrayRef<int> HiBlendMask) {
-    SDValue V1, V2;
-    // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
-    // them out and avoid using UNPCK{L,H} to extract the elements of V as
-    // i16s.
-    if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
-                     [](int M) { return M >= 0 && M % 2 == 1; }) &&
-        std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
-                     [](int M) { return M >= 0 && M % 2 == 1; })) {
-      // Use a mask to drop the high bytes.
-      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
-      V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
-                       DAG.getConstant(0x00FF, MVT::v8i16));
-
-      // This will be a single vector shuffle instead of a blend so nuke V2.
-      V2 = DAG.getUNDEF(MVT::v8i16);
-
-      // Squash the masks to point directly into V1.
-      for (int &M : LoBlendMask)
-        if (M >= 0)
-          M /= 2;
-      for (int &M : HiBlendMask)
-        if (M >= 0)
-          M /= 2;
-    } else {
-      // Otherwise just unpack the low half of V into V1 and the high half into
-      // V2 so that we can blend them as i16s.
-      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
-      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                       DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
-    }
+  SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
 
-    SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
-    SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
-    return std::make_pair(BlendedLo, BlendedHi);
-  };
-  SDValue V1Lo, V1Hi, V2Lo, V2Hi;
-  std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
-  std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
+  SDValue VLoHalf, VHiHalf;
+  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+  // them out and avoid using UNPCK{L,H} to extract the elements of V as
+  // i16s.
+  if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
+                   [](int M) { return M >= 0 && M % 2 == 1; }) &&
+      std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
+                   [](int M) { return M >= 0 && M % 2 == 1; })) {
+    // Use a mask to drop the high bytes.
+    VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
+                     DAG.getConstant(0x00FF, MVT::v8i16));
+
+    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
+    VHiHalf = DAG.getUNDEF(MVT::v8i16);
+
+    // Squash the masks to point directly into VLoHalf.
+    for (int &M : LoBlendMask)
+      if (M >= 0)
+        M /= 2;
+    for (int &M : HiBlendMask)
+      if (M >= 0)
+        M /= 2;
+  } else {
+    // Otherwise just unpack the low half of V into VLoHalf and the high half into
+    // VHiHalf so that we can blend them as i16s.
+    VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                     DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+    VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                     DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+  }
 
-  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
-  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
+  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
+  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
 
   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
 }
@@ -9736,7 +8742,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
   return true;
 }
 
-/// \brief Generic routine to split ector shuffle into half-sized shuffles.
+/// \brief Generic routine to split vector shuffle into half-sized shuffles.
 ///
 /// This routine just extracts two subvectors, shuffles them independently, and
 /// then concatenates them back together. This should work effectively with all
@@ -9757,14 +8763,43 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
   MVT ScalarVT = VT.getScalarType();
   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
 
-  SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
-                             DAG.getIntPtrConstant(0));
-  SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
-                             DAG.getIntPtrConstant(SplitNumElements));
-  SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
-                             DAG.getIntPtrConstant(0));
-  SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
-                             DAG.getIntPtrConstant(SplitNumElements));
+  // Rather than splitting build-vectors, just build two narrower build
+  // vectors. This helps shuffling with splats and zeros.
+  auto SplitVector = [&](SDValue V) {
+    while (V.getOpcode() == ISD::BITCAST)
+      V = V->getOperand(0);
+
+    MVT OrigVT = V.getSimpleValueType();
+    int OrigNumElements = OrigVT.getVectorNumElements();
+    int OrigSplitNumElements = OrigNumElements / 2;
+    MVT OrigScalarVT = OrigVT.getScalarType();
+    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
+
+    SDValue LoV, HiV;
+
+    auto *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV) {
+      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+                        DAG.getIntPtrConstant(0));
+      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+                        DAG.getIntPtrConstant(OrigSplitNumElements));
+    } else {
+
+      SmallVector<SDValue, 16> LoOps, HiOps;
+      for (int i = 0; i < OrigSplitNumElements; ++i) {
+        LoOps.push_back(BV->getOperand(i));
+        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
+      }
+      LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
+      HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
+    }
+    return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
+                          DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
+  };
+
+  SDValue LoV1, HiV1, LoV2, HiV2;
+  std::tie(LoV1, HiV1) = SplitVector(V1);
+  std::tie(LoV2, HiV2) = SplitVector(V2);
 
   // Now create two 4-way blends of these half-width vectors.
   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
@@ -9960,15 +8995,15 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                                VT.getVectorNumElements() / 2);
   // Check for patterns which can be matched with a single insert of a 128-bit
   // subvector.
-  if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
-      isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}) ||
+      isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                               DAG.getIntPtrConstant(0));
     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                               Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
   }
-  if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 6, 7})) {
     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                               DAG.getIntPtrConstant(0));
     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
@@ -9983,6 +9018,104 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                      DAG.getConstant(PermMask, MVT::i8));
 }
 
+/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
+/// shuffling each lane.
+///
+/// This will only succeed when the result of fixing the 128-bit lanes results
+/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
+/// each 128-bit lanes. This handles many cases where we can quickly blend away
+/// the lane crosses early and then use simpler shuffles within each lane.
+///
+/// FIXME: It might be worthwhile at some point to support this without
+/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
+/// in x86 only floating point has interesting non-repeating shuffles, and even
+/// those are still *marginally* more expensive.
+static SDValue lowerVectorShuffleByMerging128BitLanes(
+    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  assert(!isSingleInputShuffleMask(Mask) &&
+         "This is only useful with multiple inputs.");
+
+  int Size = Mask.size();
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  int NumLanes = Size / LaneSize;
+  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
+
+  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
+  // check whether the in-128-bit lane shuffles share a repeating pattern.
+  SmallVector<int, 4> Lanes;
+  Lanes.resize(NumLanes, -1);
+  SmallVector<int, 4> InLaneMask;
+  InLaneMask.resize(LaneSize, -1);
+  for (int i = 0; i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+
+    int j = i / LaneSize;
+
+    if (Lanes[j] < 0) {
+      // First entry we've seen for this lane.
+      Lanes[j] = Mask[i] / LaneSize;
+    } else if (Lanes[j] != Mask[i] / LaneSize) {
+      // This doesn't match the lane selected previously!
+      return SDValue();
+    }
+
+    // Check that within each lane we have a consistent shuffle mask.
+    int k = i % LaneSize;
+    if (InLaneMask[k] < 0) {
+      InLaneMask[k] = Mask[i] % LaneSize;
+    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
+      // This doesn't fit a repeating in-lane mask.
+      return SDValue();
+    }
+  }
+
+  // First shuffle the lanes into place.
+  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
+                                VT.getSizeInBits() / 64);
+  SmallVector<int, 8> LaneMask;
+  LaneMask.resize(NumLanes * 2, -1);
+  for (int i = 0; i < NumLanes; ++i)
+    if (Lanes[i] >= 0) {
+      LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
+      LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
+    }
+
+  V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
+  V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
+  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
+
+  // Cast it back to the type we actually want.
+  LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
+
+  // Now do a simple shuffle that isn't lane crossing.
+  SmallVector<int, 8> NewMask;
+  NewMask.resize(Size, -1);
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0)
+      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
+  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
+         "Must not introduce lane crosses at this point!");
+
+  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
+}
+
+/// \brief Test whether the specified input (0 or 1) is in-place blended by the
+/// given mask.
+///
+/// This returns true if the elements from a particular input are already in the
+/// slot required by the given mask and require no permutation.
+static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
+  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
+      return false;
+
+  return true;
+}
+
 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
 ///
 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -10004,10 +9137,14 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   if (isSingleInputShuffleMask(Mask)) {
     // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
                                                           Mask, Subtarget, DAG))
       return Broadcast;
 
+    // Use low duplicate instructions for masks that match their pattern.
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
+
     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
       // Non-half-crossing single input shuffles can be lowerid with an
       // interleaved permutation.
@@ -10029,10 +9166,14 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // X86 has dedicated unpack instructions that can handle specific blend
   // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+  if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
+  if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
 
   // If we have a single input to the zero element, insert that into V1 if we
   // can do so cheaply.
@@ -10040,7 +9181,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
   if (NumV2Elements == 1 && Mask[0] >= 4)
     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
+            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return Insertion;
 
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
@@ -10067,6 +9208,16 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                        DAG.getConstant(SHUFPDMask, MVT::i8));
   }
 
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle. However, if we have AVX2 and either inputs are already in place,
+  // we will be able to shuffle even across lanes the other input in a single
+  // instruction so skip this pattern.
+  if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+                                 isShuffleMaskInputInPlace(1, Mask))))
+    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+      return Result;
+
   // If we have AVX2 then we always want to lower with a blend because an v4 we
   // can fully permute the elements.
   if (Subtarget->hasAVX2())
@@ -10102,7 +9253,7 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
                                                         Mask, Subtarget, DAG))
     return Broadcast;
 
@@ -10123,12 +9274,6 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
     }
-
-    // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
-      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
-    if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
-      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
   }
 
   // AVX2 provides a direct instruction for permuting a single input across
@@ -10137,6 +9282,31 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
 
+  // Try to use shift instructions.
+  if (SDValue Shift =
+          lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
+  if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
+
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle. However, if we have AVX2 and either inputs are already in place,
+  // we will be able to shuffle even across lanes the other input in a single
+  // instruction so skip this pattern.
+  if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+                                 isShuffleMaskInputInPlace(1, Mask))))
+    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+      return Result;
+
   // Otherwise fall back on generic blend lowering.
   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
                                                     Mask, DAG);
@@ -10161,7 +9331,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
                                                         Mask, Subtarget, DAG))
     return Broadcast;
 
@@ -10171,15 +9341,26 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
     assert(RepeatedMask.size() == 4 &&
            "Repeated masks must be half the mask width!");
+
+    // Use even/odd duplicate instructions for masks that match their pattern.
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
+    if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
+      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
+
     if (isSingleInputShuffleMask(Mask))
       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
 
     // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
-    if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+    if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
+    if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
+      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
+    if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
+      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
 
     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
     // have already handled any direct blends. We also need to squash the
@@ -10214,6 +9395,12 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                    DAG);
   }
 
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
   if (Subtarget->hasAVX2())
@@ -10239,12 +9426,19 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
 
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
+                                                         Mask, Subtarget, DAG))
+    return ZExt;
+
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
                                                         Mask, Subtarget, DAG))
     return Broadcast;
 
@@ -10259,12 +9453,25 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
 
     // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
-    if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+    if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
+    if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
+      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
+    if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
+      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
   }
 
+  // Try to use shift instructions.
+  if (SDValue Shift =
+          lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
+    return Shift;
+
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
+
   // If the shuffle patterns aren't repeated but it is a single input, directly
   // generate a cross-lane VPERMD instruction.
   if (isSingleInputShuffleMask(Mask)) {
@@ -10277,6 +9484,12 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
   }
 
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
   // Otherwise fall back on generic blend lowering.
   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
                                                     Mask, DAG);
@@ -10297,36 +9510,53 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
 
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
+                                                         Mask, Subtarget, DAG))
+    return ZExt;
+
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
                                                         Mask, Subtarget, DAG))
     return Broadcast;
 
-  // There are no generalized cross-lane shuffle operations available on i16
-  // element types.
-  if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
-    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
-                                                   Mask, DAG);
-
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask,
-                          // First 128-bit lane:
-                          0, 16, 1, 17, 2, 18, 3, 19,
-                          // Second 128-bit lane:
-                          8, 24, 9, 25, 10, 26, 11, 27))
+  if (isShuffleEquivalent(V1, V2, Mask,
+                          {// First 128-bit lane:
+                           0, 16, 1, 17, 2, 18, 3, 19,
+                           // Second 128-bit lane:
+                           8, 24, 9, 25, 10, 26, 11, 27}))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
-  if (isShuffleEquivalent(Mask,
-                          // First 128-bit lane:
-                          4, 20, 5, 21, 6, 22, 7, 23,
-                          // Second 128-bit lane:
-                          12, 28, 13, 29, 14, 30, 15, 31))
+  if (isShuffleEquivalent(V1, V2, Mask,
+                          {// First 128-bit lane:
+                           4, 20, 5, 21, 6, 22, 7, 23,
+                           // Second 128-bit lane:
+                           12, 28, 13, 29, 14, 30, 15, 31}))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
 
+  // Try to use shift instructions.
+  if (SDValue Shift =
+          lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
+
   if (isSingleInputShuffleMask(Mask)) {
+    // There are no generalized cross-lane shuffle operations available on i16
+    // element types.
+    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
+                                                     Mask, DAG);
+
     SDValue PSHUFBMask[32];
     for (int i = 0; i < 16; ++i) {
       if (Mask[i] == -1) {
@@ -10347,6 +9577,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
   }
 
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
 }
@@ -10366,17 +9602,18 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
 
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
+                                                         Mask, Subtarget, DAG))
+    return ZExt;
+
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
                                                         Mask, Subtarget, DAG))
     return Broadcast;
 
-  // There are no generalized cross-lane shuffle operations available on i8
-  // element types.
-  if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
-    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
-                                                   Mask, DAG);
-
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
@@ -10385,21 +9622,37 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
   // 256-bit lanes.
   if (isShuffleEquivalent(
-          Mask,
-          // First 128-bit lane:
-          0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
-          // Second 128-bit lane:
-          16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
+          V1, V2, Mask,
+          {// First 128-bit lane:
+           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+           // Second 128-bit lane:
+           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55}))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
   if (isShuffleEquivalent(
-          Mask,
-          // First 128-bit lane:
-          8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
-          // Second 128-bit lane:
-          24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
+          V1, V2, Mask,
+          {// First 128-bit lane:
+           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+           // Second 128-bit lane:
+           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63}))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
 
+  // Try to use shift instructions.
+  if (SDValue Shift =
+          lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
+
   if (isSingleInputShuffleMask(Mask)) {
+    // There are no generalized cross-lane shuffle operations available on i8
+    // element types.
+    if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
+                                                     Mask, DAG);
+
     SDValue PSHUFBMask[32];
     for (int i = 0; i < 32; ++i)
       PSHUFBMask[i] =
@@ -10412,6 +9665,12 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
   }
 
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
 }
@@ -10478,6 +9737,13 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
+  // X86 has dedicated unpack instructions that can handle specific blend
+  // operations: UNPCKH and UNPCKL.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
 }
@@ -10493,6 +9759,20 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask,
+                          {// First 128-bit lane.
+                           0, 16, 1, 17, 4, 20, 5, 21,
+                           // Second 128-bit lane.
+                           8, 24, 9, 25, 12, 28, 13, 29}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask,
+                          {// First 128-bit lane.
+                           2, 18, 3, 19, 6, 22, 7, 23,
+                           // Second 128-bit lane.
+                           10, 26, 11, 27, 14, 30, 15, 31}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
 }
@@ -10508,6 +9788,13 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
+  // X86 has dedicated unpack instructions that can handle specific blend
+  // operations: UNPCKH and UNPCKL.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
+
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
 }
@@ -10523,6 +9810,20 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask,
+                          {// First 128-bit lane.
+                           0, 16, 1, 17, 4, 20, 5, 21,
+                           // Second 128-bit lane.
+                           8, 24, 9, 25, 12, 28, 13, 29}))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask,
+                          {// First 128-bit lane.
+                           2, 18, 3, 19, 6, 22, 7, 23,
+                           // Second 128-bit lane.
+                           10, 26, 11, 27, 14, 30, 15, 31}))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
+
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
 }
@@ -10574,8 +9875,8 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
          "Cannot lower 512-bit vectors w/ basic ISA!");
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
-                                                        Mask, Subtarget, DAG))
+  if (SDValue Broadcast =
+          lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
     return Broadcast;
 
   // Dispatch to each element type for lowering. If we don't have supprot for
@@ -10651,6 +9952,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
       }
 
+  // We actually see shuffles that are entirely re-arrangements of a set of
+  // zero inputs. This mostly happens while decomposing complex shuffles into
+  // simple ones. Directly lower these as a buildvector of zeros.
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  if (Zeroable.all())
+    return getZeroVector(VT, Subtarget, DAG, dl);
+
   // Try to collapse shuffles into using a vector type with fewer elements but
   // wider element types. We cap this to not form integers or floating point
   // elements wider than 64 bits, but it might be interesting to form i128
@@ -10690,7 +9998,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
   // When the number of V1 and V2 elements are the same, try to minimize the
   // number of uses of V2 in the low half of the vector. When that is tied,
   // ensure that the sum of indices for V1 is equal to or lower than the sum
-  // indices for V2.
+  // indices for V2. When those are equal, try to ensure that the number of odd
+  // indices for V1 is lower than the number of odd indices for V2.
   if (NumV1Elements == NumV2Elements) {
     int LowV1Elements = 0, LowV2Elements = 0;
     for (int M : SVOp->getMask().slice(0, NumElements / 2))
@@ -10707,8 +10016,18 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
           SumV2Indices += i;
         else if (SVOp->getMask()[i] >= 0)
           SumV1Indices += i;
-      if (SumV2Indices < SumV1Indices)
+      if (SumV2Indices < SumV1Indices) {
         return DAG.getCommutedVectorShuffle(*SVOp);
+      } else if (SumV2Indices == SumV1Indices) {
+        int NumV1OddIndices = 0, NumV2OddIndices = 0;
+        for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+          if (SVOp->getMask()[i] >= NumElements)
+            NumV2OddIndices += i % 2;
+          else if (SVOp->getMask()[i] >= 0)
+            NumV1OddIndices += i % 2;
+        if (NumV2OddIndices < NumV1OddIndices)
+          return DAG.getCommutedVectorShuffle(*SVOp);
+      }
     }
   }
 
@@ -10727,1586 +10046,6 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
   llvm_unreachable("Unimplemented!");
 }
 
-
-//===----------------------------------------------------------------------===//
-// Legacy vector shuffle lowering
-//
-// This code is the legacy code handling vector shuffles until the above
-// replaces its functionality and performance.
-//===----------------------------------------------------------------------===//
-
-static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
-                        bool hasInt256, unsigned *MaskOut = nullptr) {
-  MVT EltVT = VT.getVectorElementType();
-
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return false;
-
-  if (!hasSSE41 || EltVT == MVT::i8)
-    return false;
-  if (!hasInt256 && VT == MVT::v16i16)
-    return false;
-
-  unsigned MaskValue = 0;
-  unsigned NumElems = VT.getVectorNumElements();
-  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
-  unsigned NumLanes = (NumElems - 1) / 8 + 1;
-  unsigned NumElemsInLane = NumElems / NumLanes;
-
-  // Blend for v16i16 should be symetric for the both lanes.
-  for (unsigned i = 0; i < NumElemsInLane; ++i) {
-
-    int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
-    int EltIdx = MaskVals[i];
-
-    if ((EltIdx < 0 || EltIdx == (int)i) &&
-        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
-      continue;
-
-    if (((unsigned)EltIdx == (i + NumElems)) &&
-        (SndLaneEltIdx < 0 ||
-         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
-      MaskValue |= (1 << i);
-    else
-      return false;
-  }
-
-  if (MaskOut)
-    *MaskOut = MaskValue;
-  return true;
-}
-
-// Try to lower a shuffle node into a simple blend instruction.
-// This function assumes isBlendMask returns true for this
-// SuffleVectorSDNode
-static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
-                                          unsigned MaskValue,
-                                          const X86Subtarget *Subtarget,
-                                          SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  MVT EltVT = VT.getVectorElementType();
-  assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
-                     Subtarget->hasInt256() && "Trying to lower a "
-                                               "VECTOR_SHUFFLE to a Blend but "
-                                               "with the wrong mask"));
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // Convert i32 vectors to floating point if it is not AVX2.
-  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
-  MVT BlendVT = VT;
-  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
-    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
-                               NumElems);
-    V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
-    V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
-  }
-
-  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
-                            DAG.getConstant(MaskValue, MVT::i32));
-  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
-}
-
-/// In vector type \p VT, return true if the element at index \p InputIdx
-/// falls on a different 128-bit lane than \p OutputIdx.
-static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
-                                     unsigned OutputIdx) {
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-  return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
-}
-
-/// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
-/// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
-/// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
-/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
-/// zero.
-static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
-                         SelectionDAG &DAG) {
-  MVT VT = V1.getSimpleValueType();
-  assert(VT.is128BitVector() || VT.is256BitVector());
-
-  MVT EltVT = VT.getVectorElementType();
-  unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
-  unsigned NumElts = VT.getVectorNumElements();
-
-  SmallVector<SDValue, 32> PshufbMask;
-  for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
-    int InputIdx = MaskVals[OutputIdx];
-    unsigned InputByteIdx;
-
-    if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
-      InputByteIdx = 0x80;
-    else {
-      // Cross lane is not allowed.
-      if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
-        return SDValue();
-      InputByteIdx = InputIdx * EltSizeInBytes;
-      // Index is an byte offset within the 128-bit lane.
-      InputByteIdx &= 0xf;
-    }
-
-    for (unsigned j = 0; j < EltSizeInBytes; ++j) {
-      PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
-      if (InputByteIdx != 0x80)
-        ++InputByteIdx;
-    }
-  }
-
-  MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
-  if (ShufVT != VT)
-    V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
-  return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
-                     DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
-}
-
-// v8i16 shuffles - Prefer shuffles in the following order:
-// 1. [all]   pshuflw, pshufhw, optional move
-// 2. [ssse3] 1 x pshufb
-// 3. [ssse3] 2 x pshufb + 1 x por
-// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
-static SDValue
-LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
-                         SelectionDAG &DAG) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
-  SmallVector<int, 8> MaskVals;
-
-  // Determine if more than 1 of the words in each of the low and high quadwords
-  // of the result come from the same quadword of one of the two inputs.  Undef
-  // mask values count as coming from any quadword, for better codegen.
-  //
-  // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
-  // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
-  unsigned LoQuad[] = { 0, 0, 0, 0 };
-  unsigned HiQuad[] = { 0, 0, 0, 0 };
-  // Indices of quads used.
-  std::bitset<4> InputQuads;
-  for (unsigned i = 0; i < 8; ++i) {
-    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
-    int EltIdx = SVOp->getMaskElt(i);
-    MaskVals.push_back(EltIdx);
-    if (EltIdx < 0) {
-      ++Quad[0];
-      ++Quad[1];
-      ++Quad[2];
-      ++Quad[3];
-      continue;
-    }
-    ++Quad[EltIdx / 4];
-    InputQuads.set(EltIdx / 4);
-  }
-
-  int BestLoQuad = -1;
-  unsigned MaxQuad = 1;
-  for (unsigned i = 0; i < 4; ++i) {
-    if (LoQuad[i] > MaxQuad) {
-      BestLoQuad = i;
-      MaxQuad = LoQuad[i];
-    }
-  }
-
-  int BestHiQuad = -1;
-  MaxQuad = 1;
-  for (unsigned i = 0; i < 4; ++i) {
-    if (HiQuad[i] > MaxQuad) {
-      BestHiQuad = i;
-      MaxQuad = HiQuad[i];
-    }
-  }
-
-  // For SSSE3, If all 8 words of the result come from only 1 quadword of each
-  // of the two input vectors, shuffle them into one input vector so only a
-  // single pshufb instruction is necessary. If there are more than 2 input
-  // quads, disable the next transformation since it does not help SSSE3.
-  bool V1Used = InputQuads[0] || InputQuads[1];
-  bool V2Used = InputQuads[2] || InputQuads[3];
-  if (Subtarget->hasSSSE3()) {
-    if (InputQuads.count() == 2 && V1Used && V2Used) {
-      BestLoQuad = InputQuads[0] ? 0 : 1;
-      BestHiQuad = InputQuads[2] ? 2 : 3;
-    }
-    if (InputQuads.count() > 2) {
-      BestLoQuad = -1;
-      BestHiQuad = -1;
-    }
-  }
-
-  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
-  // the shuffle mask.  If a quad is scored as -1, that means that it contains
-  // words from all 4 input quadwords.
-  SDValue NewV;
-  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
-    int MaskV[] = {
-      BestLoQuad < 0 ? 0 : BestLoQuad,
-      BestHiQuad < 0 ? 1 : BestHiQuad
-    };
-    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
-                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
-                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
-    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
-
-    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
-    // source words for the shuffle, to aid later transformations.
-    bool AllWordsInNewV = true;
-    bool InOrder[2] = { true, true };
-    for (unsigned i = 0; i != 8; ++i) {
-      int idx = MaskVals[i];
-      if (idx != (int)i)
-        InOrder[i/4] = false;
-      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
-        continue;
-      AllWordsInNewV = false;
-      break;
-    }
-
-    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
-    if (AllWordsInNewV) {
-      for (int i = 0; i != 8; ++i) {
-        int idx = MaskVals[i];
-        if (idx < 0)
-          continue;
-        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
-        if ((idx != i) && idx < 4)
-          pshufhw = false;
-        if ((idx != i) && idx > 3)
-          pshuflw = false;
-      }
-      V1 = NewV;
-      V2Used = false;
-      BestLoQuad = 0;
-      BestHiQuad = 1;
-    }
-
-    // If we've eliminated the use of V2, and the new mask is a pshuflw or
-    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
-    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
-      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
-      unsigned TargetMask = 0;
-      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
-                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
-      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
-      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
-                             getShufflePSHUFLWImmediate(SVOp);
-      V1 = NewV.getOperand(0);
-      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
-    }
-  }
-
-  // Promote splats to a larger type which usually leads to more efficient code.
-  // FIXME: Is this true if pshufb is available?
-  if (SVOp->isSplat())
-    return PromoteSplat(SVOp, DAG);
-
-  // If we have SSSE3, and all words of the result are from 1 input vector,
-  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
-  // is present, fall back to case 4.
-  if (Subtarget->hasSSSE3()) {
-    SmallVector<SDValue,16> pshufbMask;
-
-    // If we have elements from both input vectors, set the high bit of the
-    // shuffle mask element to zero out elements that come from V2 in the V1
-    // mask, and elements that come from V1 in the V2 mask, so that the two
-    // results can be OR'd together.
-    bool TwoInputs = V1Used && V2Used;
-    V1 = getPSHUFB(MaskVals, V1, dl, DAG);
-    if (!TwoInputs)
-      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
-
-    // Calculate the shuffle mask for the second input, shuffle it, and
-    // OR it with the first shuffled input.
-    CommuteVectorShuffleMask(MaskVals, 8);
-    V2 = getPSHUFB(MaskVals, V2, dl, DAG);
-    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
-    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
-  }
-
-  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
-  // and update MaskVals with new element order.
-  std::bitset<8> InOrder;
-  if (BestLoQuad >= 0) {
-    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
-    for (int i = 0; i != 4; ++i) {
-      int idx = MaskVals[i];
-      if (idx < 0) {
-        InOrder.set(i);
-      } else if ((idx / 4) == BestLoQuad) {
-        MaskV[i] = idx & 3;
-        InOrder.set(i);
-      }
-    }
-    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
-                                &MaskV[0]);
-
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
-      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
-      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
-                                  NewV.getOperand(0),
-                                  getShufflePSHUFLWImmediate(SVOp), DAG);
-    }
-  }
-
-  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
-  // and update MaskVals with the new element order.
-  if (BestHiQuad >= 0) {
-    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
-    for (unsigned i = 4; i != 8; ++i) {
-      int idx = MaskVals[i];
-      if (idx < 0) {
-        InOrder.set(i);
-      } else if ((idx / 4) == BestHiQuad) {
-        MaskV[i] = (idx & 3) + 4;
-        InOrder.set(i);
-      }
-    }
-    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
-                                &MaskV[0]);
-
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
-      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
-      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
-                                  NewV.getOperand(0),
-                                  getShufflePSHUFHWImmediate(SVOp), DAG);
-    }
-  }
-
-  // In case BestHi & BestLo were both -1, which means each quadword has a word
-  // from each of the four input quadwords, calculate the InOrder bitvector now
-  // before falling through to the insert/extract cleanup.
-  if (BestLoQuad == -1 && BestHiQuad == -1) {
-    NewV = V1;
-    for (int i = 0; i != 8; ++i)
-      if (MaskVals[i] < 0 || MaskVals[i] == i)
-        InOrder.set(i);
-  }
-
-  // The other elements are put in the right place using pextrw and pinsrw.
-  for (unsigned i = 0; i != 8; ++i) {
-    if (InOrder[i])
-      continue;
-    int EltIdx = MaskVals[i];
-    if (EltIdx < 0)
-      continue;
-    SDValue ExtOp = (EltIdx < 8) ?
-      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
-                  DAG.getIntPtrConstant(EltIdx)) :
-      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
-                  DAG.getIntPtrConstant(EltIdx - 8));
-    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
-                       DAG.getIntPtrConstant(i));
-  }
-  return NewV;
-}
-
-/// \brief v16i16 shuffles
-///
-/// FIXME: We only support generation of a single pshufb currently.  We can
-/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
-/// well (e.g 2 x pshufb + 1 x por).
-static SDValue
-LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
-
-  if (V2.getOpcode() != ISD::UNDEF)
-    return SDValue();
-
-  SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
-  return getPSHUFB(MaskVals, V1, dl, DAG);
-}
-
-// v16i8 shuffles - Prefer shuffles in the following order:
-// 1. [ssse3] 1 x pshufb
-// 2. [ssse3] 2 x pshufb + 1 x por
-// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
-static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
-                                        const X86Subtarget* Subtarget,
-                                        SelectionDAG &DAG) {
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
-  ArrayRef<int> MaskVals = SVOp->getMask();
-
-  // Promote splats to a larger type which usually leads to more efficient code.
-  // FIXME: Is this true if pshufb is available?
-  if (SVOp->isSplat())
-    return PromoteSplat(SVOp, DAG);
-
-  // If we have SSSE3, case 1 is generated when all result bytes come from
-  // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
-  // present, fall back to case 3.
-
-  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
-  if (Subtarget->hasSSSE3()) {
-    SmallVector<SDValue,16> pshufbMask;
-
-    // If all result elements are from one input vector, then only translate
-    // undef mask values to 0x80 (zero out result) in the pshufb mask.
-    //
-    // Otherwise, we have elements from both input vectors, and must zero out
-    // elements that come from V2 in the first mask, and V1 in the second mask
-    // so that we can OR them together.
-    for (unsigned i = 0; i != 16; ++i) {
-      int EltIdx = MaskVals[i];
-      if (EltIdx < 0 || EltIdx >= 16)
-        EltIdx = 0x80;
-      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
-    }
-    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
-                     DAG.getNode(ISD::BUILD_VECTOR, dl,
-                                 MVT::v16i8, pshufbMask));
-
-    // As PSHUFB will zero elements with negative indices, it's safe to ignore
-    // the 2nd operand if it's undefined or zero.
-    if (V2.getOpcode() == ISD::UNDEF ||
-        ISD::isBuildVectorAllZeros(V2.getNode()))
-      return V1;
-
-    // Calculate the shuffle mask for the second input, shuffle it, and
-    // OR it with the first shuffled input.
-    pshufbMask.clear();
-    for (unsigned i = 0; i != 16; ++i) {
-      int EltIdx = MaskVals[i];
-      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
-      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
-    }
-    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
-                     DAG.getNode(ISD::BUILD_VECTOR, dl,
-                                 MVT::v16i8, pshufbMask));
-    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
-  }
-
-  // No SSSE3 - Calculate in place words and then fix all out of place words
-  // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
-  // the 16 different words that comprise the two doublequadword input vectors.
-  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
-  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
-  SDValue NewV = V1;
-  for (int i = 0; i != 8; ++i) {
-    int Elt0 = MaskVals[i*2];
-    int Elt1 = MaskVals[i*2+1];
-
-    // This word of the result is all undef, skip it.
-    if (Elt0 < 0 && Elt1 < 0)
-      continue;
-
-    // This word of the result is already in the correct place, skip it.
-    if ((Elt0 == i*2) && (Elt1 == i*2+1))
-      continue;
-
-    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
-    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
-    SDValue InsElt;
-
-    // If Elt0 and Elt1 are defined, are consecutive, and can be load
-    // using a single extract together, load it and store it.
-    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
-      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
-                           DAG.getIntPtrConstant(Elt1 / 2));
-      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
-                        DAG.getIntPtrConstant(i));
-      continue;
-    }
-
-    // If Elt1 is defined, extract it from the appropriate source.  If the
-    // source byte is not also odd, shift the extracted word left 8 bits
-    // otherwise clear the bottom 8 bits if we need to do an or.
-    if (Elt1 >= 0) {
-      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
-                           DAG.getIntPtrConstant(Elt1 / 2));
-      if ((Elt1 & 1) == 0)
-        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
-                             DAG.getConstant(8,
-                                  TLI.getShiftAmountTy(InsElt.getValueType())));
-      else if (Elt0 >= 0)
-        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
-                             DAG.getConstant(0xFF00, MVT::i16));
-    }
-    // If Elt0 is defined, extract it from the appropriate source.  If the
-    // source byte is not also even, shift the extracted word right 8 bits. If
-    // Elt1 was also defined, OR the extracted values together before
-    // inserting them in the result.
-    if (Elt0 >= 0) {
-      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
-                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
-      if ((Elt0 & 1) != 0)
-        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
-                              DAG.getConstant(8,
-                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
-      else if (Elt1 >= 0)
-        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
-                             DAG.getConstant(0x00FF, MVT::i16));
-      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
-                         : InsElt0;
-    }
-    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
-                       DAG.getIntPtrConstant(i));
-  }
-  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
-}
-
-// v32i8 shuffles - Translate to VPSHUFB if possible.
-static
-SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
-                                 const X86Subtarget *Subtarget,
-                                 SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
-  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
-
-  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
-  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
-  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
-
-  // VPSHUFB may be generated if
-  // (1) one of input vector is undefined or zeroinitializer.
-  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
-  // And (2) the mask indexes don't cross the 128-bit lane.
-  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
-      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
-    return SDValue();
-
-  if (V1IsAllZero && !V2IsAllZero) {
-    CommuteVectorShuffleMask(MaskVals, 32);
-    V1 = V2;
-  }
-  return getPSHUFB(MaskVals, V1, dl, DAG);
-}
-
-/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
-/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
-/// done when every pair / quad of shuffle mask elements point to elements in
-/// the right sequence. e.g.
-/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
-static
-SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
-                                 SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  SDLoc dl(SVOp);
-  unsigned NumElems = VT.getVectorNumElements();
-  MVT NewVT;
-  unsigned Scale;
-  switch (VT.SimpleTy) {
-  default: llvm_unreachable("Unexpected!");
-  case MVT::v2i64:
-  case MVT::v2f64:
-           return SDValue(SVOp, 0);
-  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
-  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
-  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
-  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
-  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
-  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
-  }
-
-  SmallVector<int, 8> MaskVec;
-  for (unsigned i = 0; i != NumElems; i += Scale) {
-    int StartIdx = -1;
-    for (unsigned j = 0; j != Scale; ++j) {
-      int EltIdx = SVOp->getMaskElt(i+j);
-      if (EltIdx < 0)
-        continue;
-      if (StartIdx < 0)
-        StartIdx = (EltIdx / Scale);
-      if (EltIdx != (int)(StartIdx*Scale + j))
-        return SDValue();
-    }
-    MaskVec.push_back(StartIdx);
-  }
-
-  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
-  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
-  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
-}
-
-/// getVZextMovL - Return a zero-extending vector move low node.
-///
-static SDValue getVZextMovL(MVT VT, MVT OpVT,
-                            SDValue SrcOp, SelectionDAG &DAG,
-                            const X86Subtarget *Subtarget, SDLoc dl) {
-  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
-    LoadSDNode *LD = nullptr;
-    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
-      LD = dyn_cast<LoadSDNode>(SrcOp);
-    if (!LD) {
-      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
-      // instead.
-      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
-      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
-          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
-          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
-        // PR2108
-        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
-        return DAG.getNode(ISD::BITCAST, dl, VT,
-                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
-                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                                   OpVT,
-                                                   SrcOp.getOperand(0)
-                                                          .getOperand(0))));
-      }
-    }
-  }
-
-  return DAG.getNode(ISD::BITCAST, dl, VT,
-                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
-                                 DAG.getNode(ISD::BITCAST, dl,
-                                             OpVT, SrcOp)));
-}
-
-/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
-/// which could not be matched by any known target speficic shuffle
-static SDValue
-LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
-
-  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
-  if (NewOp.getNode())
-    return NewOp;
-
-  MVT VT = SVOp->getSimpleValueType(0);
-
-  unsigned NumElems = VT.getVectorNumElements();
-  unsigned NumLaneElems = NumElems / 2;
-
-  SDLoc dl(SVOp);
-  MVT EltVT = VT.getVectorElementType();
-  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
-  SDValue Output[2];
-
-  SmallVector<int, 16> Mask;
-  for (unsigned l = 0; l < 2; ++l) {
-    // Build a shuffle mask for the output, discovering on the fly which
-    // input vectors to use as shuffle operands (recorded in InputUsed).
-    // If building a suitable shuffle vector proves too hard, then bail
-    // out with UseBuildVector set.
-    bool UseBuildVector = false;
-    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
-    unsigned LaneStart = l * NumLaneElems;
-    for (unsigned i = 0; i != NumLaneElems; ++i) {
-      // The mask element.  This indexes into the input.
-      int Idx = SVOp->getMaskElt(i+LaneStart);
-      if (Idx < 0) {
-        // the mask element does not index into any input vector.
-        Mask.push_back(-1);
-        continue;
-      }
-
-      // The input vector this mask element indexes into.
-      int Input = Idx / NumLaneElems;
-
-      // Turn the index into an offset from the start of the input vector.
-      Idx -= Input * NumLaneElems;
-
-      // Find or create a shuffle vector operand to hold this input.
-      unsigned OpNo;
-      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
-        if (InputUsed[OpNo] == Input)
-          // This input vector is already an operand.
-          break;
-        if (InputUsed[OpNo] < 0) {
-          // Create a new operand for this input vector.
-          InputUsed[OpNo] = Input;
-          break;
-        }
-      }
-
-      if (OpNo >= array_lengthof(InputUsed)) {
-        // More than two input vectors used!  Give up on trying to create a
-        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
-        UseBuildVector = true;
-        break;
-      }
-
-      // Add the mask index for the new shuffle vector.
-      Mask.push_back(Idx + OpNo * NumLaneElems);
-    }
-
-    if (UseBuildVector) {
-      SmallVector<SDValue, 16> SVOps;
-      for (unsigned i = 0; i != NumLaneElems; ++i) {
-        // The mask element.  This indexes into the input.
-        int Idx = SVOp->getMaskElt(i+LaneStart);
-        if (Idx < 0) {
-          SVOps.push_back(DAG.getUNDEF(EltVT));
-          continue;
-        }
-
-        // The input vector this mask element indexes into.
-        int Input = Idx / NumElems;
-
-        // Turn the index into an offset from the start of the input vector.
-        Idx -= Input * NumElems;
-
-        // Extract the vector element by hand.
-        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
-                                    SVOp->getOperand(Input),
-                                    DAG.getIntPtrConstant(Idx)));
-      }
-
-      // Construct the output using a BUILD_VECTOR.
-      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
-    } else if (InputUsed[0] < 0) {
-      // No input vectors were used! The result is undefined.
-      Output[l] = DAG.getUNDEF(NVT);
-    } else {
-      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
-                                        (InputUsed[0] % 2) * NumLaneElems,
-                                        DAG, dl);
-      // If only one input was used, use an undefined vector for the other.
-      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
-        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
-                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
-      // At least one input vector was used. Create a new shuffle vector.
-      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
-    }
-
-    Mask.clear();
-  }
-
-  // Concatenate the result back
-  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
-}
-
-/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
-/// 4 elements, and match them with several different shuffle types.
-static SDValue
-LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
-  MVT VT = SVOp->getSimpleValueType(0);
-
-  assert(VT.is128BitVector() && "Unsupported vector size");
-
-  std::pair<int, int> Locs[4];
-  int Mask1[] = { -1, -1, -1, -1 };
-  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
-
-  unsigned NumHi = 0;
-  unsigned NumLo = 0;
-  for (unsigned i = 0; i != 4; ++i) {
-    int Idx = PermMask[i];
-    if (Idx < 0) {
-      Locs[i] = std::make_pair(-1, -1);
-    } else {
-      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
-      if (Idx < 4) {
-        Locs[i] = std::make_pair(0, NumLo);
-        Mask1[NumLo] = Idx;
-        NumLo++;
-      } else {
-        Locs[i] = std::make_pair(1, NumHi);
-        if (2+NumHi < 4)
-          Mask1[2+NumHi] = Idx;
-        NumHi++;
-      }
-    }
-  }
-
-  if (NumLo <= 2 && NumHi <= 2) {
-    // If no more than two elements come from either vector. This can be
-    // implemented with two shuffles. First shuffle gather the elements.
-    // The second shuffle, which takes the first shuffle as both of its
-    // vector operands, put the elements into the right order.
-    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
-
-    int Mask2[] = { -1, -1, -1, -1 };
-
-    for (unsigned i = 0; i != 4; ++i)
-      if (Locs[i].first != -1) {
-        unsigned Idx = (i < 2) ? 0 : 4;
-        Idx += Locs[i].first * 2 + Locs[i].second;
-        Mask2[i] = Idx;
-      }
-
-    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
-  }
-
-  if (NumLo == 3 || NumHi == 3) {
-    // Otherwise, we must have three elements from one vector, call it X, and
-    // one element from the other, call it Y.  First, use a shufps to build an
-    // intermediate vector with the one element from Y and the element from X
-    // that will be in the same half in the final destination (the indexes don't
-    // matter). Then, use a shufps to build the final vector, taking the half
-    // containing the element from Y from the intermediate, and the other half
-    // from X.
-    if (NumHi == 3) {
-      // Normalize it so the 3 elements come from V1.
-      CommuteVectorShuffleMask(PermMask, 4);
-      std::swap(V1, V2);
-    }
-
-    // Find the element from V2.
-    unsigned HiIndex;
-    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
-      int Val = PermMask[HiIndex];
-      if (Val < 0)
-        continue;
-      if (Val >= 4)
-        break;
-    }
-
-    Mask1[0] = PermMask[HiIndex];
-    Mask1[1] = -1;
-    Mask1[2] = PermMask[HiIndex^1];
-    Mask1[3] = -1;
-    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
-
-    if (HiIndex >= 2) {
-      Mask1[0] = PermMask[0];
-      Mask1[1] = PermMask[1];
-      Mask1[2] = HiIndex & 1 ? 6 : 4;
-      Mask1[3] = HiIndex & 1 ? 4 : 6;
-      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
-    }
-
-    Mask1[0] = HiIndex & 1 ? 2 : 0;
-    Mask1[1] = HiIndex & 1 ? 0 : 2;
-    Mask1[2] = PermMask[2];
-    Mask1[3] = PermMask[3];
-    if (Mask1[2] >= 0)
-      Mask1[2] += 4;
-    if (Mask1[3] >= 0)
-      Mask1[3] += 4;
-    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
-  }
-
-  // Break it into (shuffle shuffle_hi, shuffle_lo).
-  int LoMask[] = { -1, -1, -1, -1 };
-  int HiMask[] = { -1, -1, -1, -1 };
-
-  int *MaskPtr = LoMask;
-  unsigned MaskIdx = 0;
-  unsigned LoIdx = 0;
-  unsigned HiIdx = 2;
-  for (unsigned i = 0; i != 4; ++i) {
-    if (i == 2) {
-      MaskPtr = HiMask;
-      MaskIdx = 1;
-      LoIdx = 0;
-      HiIdx = 2;
-    }
-    int Idx = PermMask[i];
-    if (Idx < 0) {
-      Locs[i] = std::make_pair(-1, -1);
-    } else if (Idx < 4) {
-      Locs[i] = std::make_pair(MaskIdx, LoIdx);
-      MaskPtr[LoIdx] = Idx;
-      LoIdx++;
-    } else {
-      Locs[i] = std::make_pair(MaskIdx, HiIdx);
-      MaskPtr[HiIdx] = Idx;
-      HiIdx++;
-    }
-  }
-
-  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
-  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
-  int MaskOps[] = { -1, -1, -1, -1 };
-  for (unsigned i = 0; i != 4; ++i)
-    if (Locs[i].first != -1)
-      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
-  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
-}
-
-static bool MayFoldVectorLoad(SDValue V) {
-  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
-    V = V.getOperand(0);
-
-  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
-    V = V.getOperand(0);
-  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
-      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
-    // BUILD_VECTOR (load), undef
-    V = V.getOperand(0);
-
-  return MayFoldLoad(V);
-}
-
-static
-SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-
-  // Canonizalize to v2f64.
-  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
-  return DAG.getNode(ISD::BITCAST, dl, VT,
-                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
-                                          V1, DAG));
-}
-
-static
-SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
-                        bool HasSSE2) {
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getSimpleValueType();
-
-  assert(VT != MVT::v2i64 && "unsupported shuffle type");
-
-  if (HasSSE2 && VT == MVT::v2f64)
-    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
-
-  // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
-  return DAG.getNode(ISD::BITCAST, dl, VT,
-                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
-                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
-                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
-}
-
-static
-SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getSimpleValueType();
-
-  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
-         "unsupported shuffle type");
-
-  if (V2.getOpcode() == ISD::UNDEF)
-    V2 = V1;
-
-  // v4i32 or v4f32
-  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
-}
-
-static
-SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getSimpleValueType();
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
-  // operand of these instructions is only memory, so check if there's a
-  // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
-  // same masks.
-  bool CanFoldLoad = false;
-
-  // Trivial case, when V2 comes from a load.
-  if (MayFoldVectorLoad(V2))
-    CanFoldLoad = true;
-
-  // When V1 is a load, it can be folded later into a store in isel, example:
-  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
-  //    turns into:
-  //  (MOVLPSmr addr:$src1, VR128:$src2)
-  // So, recognize this potential and also use MOVLPS or MOVLPD
-  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
-    CanFoldLoad = true;
-
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  if (CanFoldLoad) {
-    if (HasSSE2 && NumElems == 2)
-      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
-
-    if (NumElems == 4)
-      // If we don't care about the second element, proceed to use movss.
-      if (SVOp->getMaskElt(1) != -1)
-        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
-  }
-
-  // movl and movlp will both match v2i64, but v2i64 is never matched by
-  // movl earlier because we make it strict to avoid messing with the movlp load
-  // folding logic (see the code above getMOVLP call). Match it here then,
-  // this is horrible, but will stay like this until we move all shuffle
-  // matching to x86 specific nodes. Note that for the 1st condition all
-  // types are matched with movsd.
-  if (HasSSE2) {
-    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
-    // as to remove this logic from here, as much as possible
-    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
-      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
-    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
-  }
-
-  assert(VT != MVT::v4i32 && "unsupported shuffle type");
-
-  // Invert the operand order and use SHUFPS to match it.
-  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
-                              getShuffleSHUFImmediate(SVOp), DAG);
-}
-
-static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
-                                         SelectionDAG &DAG) {
-  SDLoc dl(Load);
-  MVT VT = Load->getSimpleValueType(0);
-  MVT EVT = VT.getVectorElementType();
-  SDValue Addr = Load->getOperand(1);
-  SDValue NewAddr = DAG.getNode(
-      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-      DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
-
-  SDValue NewLoad =
-      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                  DAG.getMachineFunction().getMachineMemOperand(
-                      Load->getMemOperand(), 0, EVT.getStoreSize()));
-  return NewLoad;
-}
-
-// It is only safe to call this function if isINSERTPSMask is true for
-// this shufflevector mask.
-static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
-                           SelectionDAG &DAG) {
-  // Generate an insertps instruction when inserting an f32 from memory onto a
-  // v4f32 or when copying a member from one v4f32 to another.
-  // We also use it for transferring i32 from one register to another,
-  // since it simply copies the same bits.
-  // If we're transferring an i32 from memory to a specific element in a
-  // register, we output a generic DAG that will match the PINSRD
-  // instruction.
-  MVT VT = SVOp->getSimpleValueType(0);
-  MVT EVT = VT.getVectorElementType();
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  auto Mask = SVOp->getMask();
-  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
-         "unsupported vector type for insertps/pinsrd");
-
-  auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
-  auto FromV2Predicate = [](const int &i) { return i >= 4; };
-  int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
-
-  SDValue From;
-  SDValue To;
-  unsigned DestIndex;
-  if (FromV1 == 1) {
-    From = V1;
-    To = V2;
-    DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
-                Mask.begin();
-
-    // If we have 1 element from each vector, we have to check if we're
-    // changing V1's element's place. If so, we're done. Otherwise, we
-    // should assume we're changing V2's element's place and behave
-    // accordingly.
-    int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
-    assert(DestIndex <= INT32_MAX && "truncated destination index");
-    if (FromV1 == FromV2 &&
-        static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
-      From = V2;
-      To = V1;
-      DestIndex =
-          std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
-    }
-  } else {
-    assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
-           "More than one element from V1 and from V2, or no elements from one "
-           "of the vectors. This case should not have returned true from "
-           "isINSERTPSMask");
-    From = V2;
-    To = V1;
-    DestIndex =
-        std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
-  }
-
-  // Get an index into the source vector in the range [0,4) (the mask is
-  // in the range [0,8) because it can address V1 and V2)
-  unsigned SrcIndex = Mask[DestIndex] % 4;
-  if (MayFoldLoad(From)) {
-    // Trivial case, when From comes from a load and is only used by the
-    // shuffle. Make it use insertps from the vector that we need from that
-    // load.
-    SDValue NewLoad =
-        NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
-    if (!NewLoad.getNode())
-      return SDValue();
-
-    if (EVT == MVT::f32) {
-      // Create this as a scalar to vector to match the instruction pattern.
-      SDValue LoadScalarToVector =
-          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
-      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
-      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
-                         InsertpsMask);
-    } else { // EVT == MVT::i32
-      // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
-      // instruction, to match the PINSRD instruction, which loads an i32 to a
-      // certain vector element.
-      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
-                         DAG.getConstant(DestIndex, MVT::i32));
-    }
-  }
-
-  // Vector-element-to-vector
-  SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
-  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
-}
-
-// Reduce a vector shuffle to zext.
-static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
-                                    SelectionDAG &DAG) {
-  // PMOVZX is only available from SSE41.
-  if (!Subtarget->hasSSE41())
-    return SDValue();
-
-  MVT VT = Op.getSimpleValueType();
-
-  // Only AVX2 support 256-bit vector integer extending.
-  if (!Subtarget->hasInt256() && VT.is256BitVector())
-    return SDValue();
-
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  SDLoc DL(Op);
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // Extending is an unary operation and the element type of the source vector
-  // won't be equal to or larger than i64.
-  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
-      VT.getVectorElementType() == MVT::i64)
-    return SDValue();
-
-  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
-  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
-  while ((1U << Shift) < NumElems) {
-    if (SVOp->getMaskElt(1U << Shift) == 1)
-      break;
-    Shift += 1;
-    // The maximal ratio is 8, i.e. from i8 to i64.
-    if (Shift > 3)
-      return SDValue();
-  }
-
-  // Check the shuffle mask.
-  unsigned Mask = (1U << Shift) - 1;
-  for (unsigned i = 0; i != NumElems; ++i) {
-    int EltIdx = SVOp->getMaskElt(i);
-    if ((i & Mask) != 0 && EltIdx != -1)
-      return SDValue();
-    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
-      return SDValue();
-  }
-
-  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
-  MVT NeVT = MVT::getIntegerVT(NBits);
-  MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
-
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
-    return SDValue();
-
-  return DAG.getNode(ISD::BITCAST, DL, VT,
-                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
-}
-
-static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
-                                      SelectionDAG &DAG) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  MVT VT = Op.getSimpleValueType();
-  SDLoc dl(Op);
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-
-  if (isZeroShuffle(SVOp))
-    return getZeroVector(VT, Subtarget, DAG, dl);
-
-  // Handle splat operations
-  if (SVOp->isSplat()) {
-    // Use vbroadcast whenever the splat comes from a foldable load
-    SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
-    if (Broadcast.getNode())
-      return Broadcast;
-  }
-
-  // Check integer expanding shuffles.
-  SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
-  if (NewOp.getNode())
-    return NewOp;
-
-  // If the shuffle can be profitably rewritten as a narrower shuffle, then
-  // do it!
-  if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
-      VT == MVT::v32i8) {
-    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
-    if (NewOp.getNode())
-      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
-  } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
-    // FIXME: Figure out a cleaner way to do this.
-    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
-      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
-      if (NewOp.getNode()) {
-        MVT NewVT = NewOp.getSimpleValueType();
-        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
-                               NewVT, true, false))
-          return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
-                              dl);
-      }
-    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
-      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
-      if (NewOp.getNode()) {
-        MVT NewVT = NewOp.getSimpleValueType();
-        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
-          return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
-                              dl);
-      }
-    }
-  }
-  return SDValue();
-}
-
-SDValue
-X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getSimpleValueType();
-  SDLoc dl(Op);
-  unsigned NumElems = VT.getVectorNumElements();
-  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
-  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
-  bool V1IsSplat = false;
-  bool V2IsSplat = false;
-  bool HasSSE2 = Subtarget->hasSSE2();
-  bool HasFp256    = Subtarget->hasFp256();
-  bool HasInt256   = Subtarget->hasInt256();
-  MachineFunction &MF = DAG.getMachineFunction();
-  bool OptForSize = MF.getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
-
-  // Check if we should use the experimental vector shuffle lowering. If so,
-  // delegate completely to that code path.
-  if (ExperimentalVectorShuffleLowering)
-    return lowerVectorShuffle(Op, Subtarget, DAG);
-
-  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
-
-  if (V1IsUndef && V2IsUndef)
-    return DAG.getUNDEF(VT);
-
-  // When we create a shuffle node we put the UNDEF node to second operand,
-  // but in some cases the first operand may be transformed to UNDEF.
-  // In this case we should just commute the node.
-  if (V1IsUndef)
-    return DAG.getCommutedVectorShuffle(*SVOp);
-
-  // Vector shuffle lowering takes 3 steps:
-  //
-  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
-  //    narrowing and commutation of operands should be handled.
-  // 2) Matching of shuffles with known shuffle masks to x86 target specific
-  //    shuffle nodes.
-  // 3) Rewriting of unmatched masks into new generic shuffle operations,
-  //    so the shuffle can be broken into other shuffles and the legalizer can
-  //    try the lowering again.
-  //
-  // The general idea is that no vector_shuffle operation should be left to
-  // be matched during isel, all of them must be converted to a target specific
-  // node here.
-
-  // Normalize the input vectors. Here splats, zeroed vectors, profitable
-  // narrowing and commutation of operands should be handled. The actual code
-  // doesn't include all of those, work in progress...
-  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
-  if (NewOp.getNode())
-    return NewOp;
-
-  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
-
-  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
-  // unpckh_undef). Only use pshufd if speed is more important than size.
-  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
-  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
-
-  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
-      V2IsUndef && MayFoldVectorLoad(V1))
-    return getMOVDDup(Op, dl, V1, DAG);
-
-  if (isMOVHLPS_v_undef_Mask(M, VT))
-    return getMOVHighToLow(Op, dl, DAG);
-
-  // Use to match splats
-  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
-      (VT == MVT::v2f64 || VT == MVT::v2i64))
-    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
-
-  if (isPSHUFDMask(M, VT)) {
-    // The actual implementation will match the mask in the if above and then
-    // during isel it can match several different instructions, not only pshufd
-    // as its name says, sad but true, emulate the behavior for now...
-    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
-      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
-
-    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
-
-    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
-      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
-
-    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
-      return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
-                                  DAG);
-
-    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
-                                TargetMask, DAG);
-  }
-
-  if (isPALIGNRMask(M, VT, Subtarget))
-    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
-                                getShufflePALIGNRImmediate(SVOp),
-                                DAG);
-
-  if (isVALIGNMask(M, VT, Subtarget))
-    return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
-                                getShuffleVALIGNImmediate(SVOp),
-                                DAG);
-
-  // Check if this can be converted into a logical shift.
-  bool isLeft = false;
-  unsigned ShAmt = 0;
-  SDValue ShVal;
-  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
-  if (isShift && ShVal.hasOneUse()) {
-    // If the shifted value has multiple uses, it may be cheaper to use
-    // v_set0 + movlhps or movhlps, etc.
-    MVT EltVT = VT.getVectorElementType();
-    ShAmt *= EltVT.getSizeInBits();
-    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
-  }
-
-  if (isMOVLMask(M, VT)) {
-    if (ISD::isBuildVectorAllZeros(V1.getNode()))
-      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
-    if (!isMOVLPMask(M, VT)) {
-      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
-        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
-
-      if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
-    }
-  }
-
-  // FIXME: fold these into legal mask.
-  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
-    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
-
-  if (isMOVHLPSMask(M, VT))
-    return getMOVHighToLow(Op, dl, DAG);
-
-  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
-    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
-
-  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
-    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
-
-  if (isMOVLPMask(M, VT))
-    return getMOVLP(Op, dl, DAG, HasSSE2);
-
-  if (ShouldXformToMOVHLPS(M, VT) ||
-      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
-    return DAG.getCommutedVectorShuffle(*SVOp);
-
-  if (isShift) {
-    // No better options. Use a vshldq / vsrldq.
-    MVT EltVT = VT.getVectorElementType();
-    ShAmt *= EltVT.getSizeInBits();
-    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
-  }
-
-  bool Commuted = false;
-  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
-  // 1,1,1,1 -> v8i16 though.
-  BitVector UndefElements;
-  if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
-    if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
-      V1IsSplat = true;
-  if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
-    if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
-      V2IsSplat = true;
-
-  // Canonicalize the splat or undef, if present, to be on the RHS.
-  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
-    CommuteVectorShuffleMask(M, NumElems);
-    std::swap(V1, V2);
-    std::swap(V1IsSplat, V2IsSplat);
-    Commuted = true;
-  }
-
-  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
-    // Shuffling low element of v1 into undef, just return v1.
-    if (V2IsUndef)
-      return V1;
-    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
-    // the instruction selector will not match, so get a canonical MOVL with
-    // swapped operands to undo the commute.
-    return getMOVL(DAG, dl, VT, V2, V1);
-  }
-
-  if (isUNPCKLMask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
-
-  if (isUNPCKHMask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
-
-  if (V2IsSplat) {
-    // Normalize mask so all entries that point to V2 points to its first
-    // element then try to match unpck{h|l} again. If match, return a
-    // new vector_shuffle with the corrected mask.p
-    SmallVector<int, 8> NewMask(M.begin(), M.end());
-    NormalizeMask(NewMask, NumElems);
-    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
-      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
-    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
-      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
-  }
-
-  if (Commuted) {
-    // Commute is back and try unpck* again.
-    // FIXME: this seems wrong.
-    CommuteVectorShuffleMask(M, NumElems);
-    std::swap(V1, V2);
-    std::swap(V1IsSplat, V2IsSplat);
-
-    if (isUNPCKLMask(M, VT, HasInt256))
-      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
-
-    if (isUNPCKHMask(M, VT, HasInt256))
-      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
-  }
-
-  // Normalize the node to match x86 shuffle ops if needed
-  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
-    return DAG.getCommutedVectorShuffle(*SVOp);
-
-  // The checks below are all present in isShuffleMaskLegal, but they are
-  // inlined here right now to enable us to directly emit target specific
-  // nodes, and remove one by one until they don't return Op anymore.
-
-  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
-      SVOp->getSplatIndex() == 0 && V2IsUndef) {
-    if (VT == MVT::v2f64 || VT == MVT::v2i64)
-      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
-  }
-
-  if (isPSHUFHWMask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
-                                getShufflePSHUFHWImmediate(SVOp),
-                                DAG);
-
-  if (isPSHUFLWMask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
-                                getShufflePSHUFLWImmediate(SVOp),
-                                DAG);
-
-  unsigned MaskValue;
-  if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
-                  &MaskValue))
-    return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
-
-  if (isSHUFPMask(M, VT))
-    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
-                                getShuffleSHUFImmediate(SVOp), DAG);
-
-  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
-  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
-
-  //===--------------------------------------------------------------------===//
-  // Generate target specific nodes for 128 or 256-bit shuffles only
-  // supported in the AVX instruction set.
-  //
-
-  // Handle VMOVDDUPY permutations
-  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
-    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
-
-  // Handle VPERMILPS/D* permutations
-  if (isVPERMILPMask(M, VT)) {
-    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
-      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
-                                  getShuffleSHUFImmediate(SVOp), DAG);
-    return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
-                                getShuffleSHUFImmediate(SVOp), DAG);
-  }
-
-  unsigned Idx;
-  if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
-    return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
-                              Idx*(NumElems/2), DAG, dl);
-
-  // Handle VPERM2F128/VPERM2I128 permutations
-  if (isVPERM2X128Mask(M, VT, HasFp256))
-    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
-                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
-
-  if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
-    return getINSERTPS(SVOp, dl, DAG);
-
-  unsigned Imm8;
-  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
-    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
-
-  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
-      VT.is512BitVector()) {
-    MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
-    MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
-    SmallVector<SDValue, 16> permclMask;
-    for (unsigned i = 0; i != NumElems; ++i) {
-      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
-    }
-
-    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
-    if (V2IsUndef)
-      // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
-      return DAG.getNode(X86ISD::VPERMV, dl, VT,
-                          DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
-    return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
-                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
-  }
-
-  //===--------------------------------------------------------------------===//
-  // Since no target specific shuffle was selected for this generic one,
-  // lower it into other known shuffles. FIXME: this isn't true yet, but
-  // this is the plan.
-  //
-
-  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
-  if (VT == MVT::v8i16) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
-    if (NewOp.getNode())
-      return NewOp;
-  }
-
-  if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
-    if (NewOp.getNode())
-      return NewOp;
-  }
-
-  if (VT == MVT::v16i8) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
-    if (NewOp.getNode())
-      return NewOp;
-  }
-
-  if (VT == MVT::v32i8) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
-    if (NewOp.getNode())
-      return NewOp;
-  }
-
-  // Handle all 128-bit wide vectors with 4 elements, and match them with
-  // several different shuffle types.
-  if (NumElems == 4 && VT.is128BitVector())
-    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
-
-  // Handle general 256-bit shuffles
-  if (VT.is256BitVector())
-    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
-
-  return SDValue();
-}
-
 // This function assumes its argument is a BUILD_VECTOR of constants or
 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
 // true.
@@ -12344,48 +10083,29 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
   return true;
 }
 
-/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
-/// instruction.
-static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
-                                    SelectionDAG &DAG) {
+/// \brief Try to lower a VSELECT instruction to a vector shuffle.
+static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
+                                           const X86Subtarget *Subtarget,
+                                           SelectionDAG &DAG) {
   SDValue Cond = Op.getOperand(0);
   SDValue LHS = Op.getOperand(1);
   SDValue RHS = Op.getOperand(2);
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT.getVectorElementType();
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
-
-  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
-    return SDValue();
-  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
-    return SDValue();
 
   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
     return SDValue();
+  auto *CondBV = cast<BuildVectorSDNode>(Cond);
 
-  // Check the mask for BLEND and build the value.
-  unsigned MaskValue = 0;
-  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
-    return SDValue();
-
-  // Convert i32 vectors to floating point if it is not AVX2.
-  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
-  MVT BlendVT = VT;
-  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
-    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
-                               NumElems);
-    LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
-    RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+  // Only non-legal VSELECTs reach this lowering, convert those into generic
+  // shuffles and re-use the shuffle lowering path for blends.
+  SmallVector<int, 32> Mask;
+  for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
+    SDValue CondElt = CondBV->getOperand(i);
+    Mask.push_back(
+        isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1);
   }
-
-  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
-                            DAG.getConstant(MaskValue, MVT::i32));
-  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
 }
 
 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -12396,28 +10116,41 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
     return SDValue();
 
-  SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
+  // Try to lower this to a blend-style vector shuffle. This can handle all
+  // constant condition cases.
+  SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG);
   if (BlendOp.getNode())
     return BlendOp;
 
-  // Some types for vselect were previously set to Expand, not Legal or
-  // Custom. Return an empty SDValue so we fall-through to Expand, after
-  // the Custom lowering phase.
-  MVT VT = Op.getSimpleValueType();
-  switch (VT.SimpleTy) {
+  // Variable blends are only legal from SSE4.1 onward.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  // Only some types will be legal on some subtargets. If we can emit a legal
+  // VSELECT-matching blend, return Op, and but if we need to expand, return
+  // a null value.
+  switch (Op.getSimpleValueType().SimpleTy) {
   default:
-    break;
+    // Most of the vector types have blends past SSE4.1.
+    return Op;
+
+  case MVT::v32i8:
+    // The byte blends for AVX vectors were introduced only in AVX2.
+    if (Subtarget->hasAVX2())
+      return Op;
+
+    return SDValue();
+
   case MVT::v8i16:
   case MVT::v16i16:
+    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
     if (Subtarget->hasBWI() && Subtarget->hasVLX())
-      break;
+      return Op;
+
+    // FIXME: We should custom lower this by fixing the condition and using i8
+    // blends.
     return SDValue();
   }
-
-  // We couldn't create a "Blend with immediate" node.
-  // This node should still be legal, but we'll have to emit a blendv*
-  // instruction.
-  return Op;
 }
 
 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
@@ -12493,6 +10226,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
   MVT EltVT = Op.getSimpleValueType();
 
   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+  assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
+         "Unexpected vector type in ExtractBitFromMaskVector");
 
   // variable index can't be handled in mask registers,
   // extend vector to VR512
@@ -12506,6 +10241,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
 
   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   const TargetRegisterClass* rc = getRegClassFor(VecVT);
+  if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
+    rc = getRegClassFor(MVT::v16i1);
   unsigned MaxSift = rc->getSize()*8 - 1;
   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
@@ -12631,7 +10368,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 
 /// Insert one bit to mask vector, like v16i1 or v8i1.
 /// AVX-512 feature.
-SDValue 
+SDValue
 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   SDValue Vec = Op.getOperand(0);
@@ -12644,7 +10381,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
     // insert element and then truncate the result.
     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
-    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, 
+    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
@@ -12815,27 +10552,47 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
 // the upper bits of a vector.
 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
-  if (Subtarget->hasFp256()) {
-    SDLoc dl(Op.getNode());
-    SDValue Vec = Op.getNode()->getOperand(0);
-    SDValue SubVec = Op.getNode()->getOperand(1);
-    SDValue Idx = Op.getNode()->getOperand(2);
-
-    if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
-         Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
-        SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
-        isa<ConstantSDNode>(Idx)) {
-      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
-    }
+  if (!Subtarget->hasAVX())
+    return SDValue();
 
-    if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
-        SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
-        isa<ConstantSDNode>(Idx)) {
-      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-      return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue SubVec = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+
+  if (!isa<ConstantSDNode>(Idx))
+    return SDValue();
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  MVT OpVT = Op.getSimpleValueType();
+  MVT SubVecVT = SubVec.getSimpleValueType();
+
+  // Fold two 16-byte subvector loads into one 32-byte load:
+  // (insert_subvector (insert_subvector undef, (load addr), 0),
+  //                   (load addr + 16), Elts/2)
+  // --> load32 addr
+  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
+      !Subtarget->isUnalignedMem32Slow()) {
+    SDValue SubVec2 = Vec.getOperand(1);
+    if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
+      if (Idx2->getZExtValue() == 0) {
+        SDValue Ops[] = { SubVec2, SubVec };
+        SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
+        if (LD.getNode())
+          return LD;
+      }
     }
   }
+
+  if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+      SubVecVT.is128BitVector())
+    return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+  if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
+    return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
   return SDValue();
 }
 
@@ -13392,7 +11149,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
     }
     return SDValue();
   }
-  
+
   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
          "Unknown SINT_TO_FP to lower!");
 
@@ -14039,7 +11796,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
       InVT = ExtVT;
     }
-    
+
     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
     const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
     SDValue CP = DAG.getConstantPool(C, getPointerTy());
@@ -14233,7 +11990,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
     EltVT = VT.getVectorElementType();
     NumElts = VT.getVectorNumElements();
   }
-  
+
   unsigned EltBits = EltVT.getSizeInBits();
   LLVMContext *Context = DAG.getContext();
   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
@@ -14260,7 +12017,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
     return DAG.getNode(ISD::BITCAST, dl, VT,
                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
   }
-  
+
   // If not vector, then scalar.
   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
@@ -14290,19 +12047,17 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   // At this point the operands and the result should have the same
   // type, and that won't be f80 since that is not custom lowered.
 
-  // First get the sign bit of second operand.
-  SmallVector<Constant*,4> CV;
-  if (SrcVT == MVT::f64) {
-    const fltSemantics &Sem = APFloat::IEEEdouble;
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
-  } else {
-    const fltSemantics &Sem = APFloat::IEEEsingle;
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
-  }
+  const fltSemantics &Sem =
+      VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
+  const unsigned SizeInBits = VT.getSizeInBits();
+
+  SmallVector<Constant *, 4> CV(
+      VT == MVT::f64 ? 2 : 4,
+      ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
+
+  // First, clear all bits but the sign bit from the second operand (sign).
+  CV[0] = ConstantFP::get(*Context,
+                          APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
   Constant *C = ConstantVector::get(CV);
   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
@@ -14310,40 +12065,30 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
                               false, false, false, 16);
   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
 
-  // Shift sign bit right or left if the two operands have different types.
-  if (SrcVT.bitsGT(VT)) {
-    // Op0 is MVT::f32, Op1 is MVT::f64.
-    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
-    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
-                          DAG.getConstant(32, MVT::i32));
-    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
-    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
-                          DAG.getIntPtrConstant(0));
-  }
-
-  // Clear first operand sign bit.
-  CV.clear();
-  if (VT == MVT::f64) {
-    const fltSemantics &Sem = APFloat::IEEEdouble;
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
-                                                   APInt(64, ~(1ULL << 63)))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+  // Next, clear the sign bit from the first operand (magnitude).
+  // If it's a constant, we can clear it here.
+  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
+    APFloat APF = Op0CN->getValueAPF();
+    // If the magnitude is a positive zero, the sign bit alone is enough.
+    if (APF.isPosZero())
+      return SignBit;
+    APF.clearSign();
+    CV[0] = ConstantFP::get(*Context, APF);
   } else {
-    const fltSemantics &Sem = APFloat::IEEEsingle;
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
-                                                   APInt(32, ~(1U << 31)))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV[0] = ConstantFP::get(
+        *Context,
+        APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
   }
   C = ConstantVector::get(CV);
   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
-  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
-                              MachinePointerInfo::getConstantPool(),
-                              false, false, false, 16);
-  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
-
-  // Or the value with the sign bit.
+  SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                            MachinePointerInfo::getConstantPool(),
+                            false, false, false, 16);
+  // If the magnitude operand wasn't a constant, we need to AND out the sign.
+  if (!isa<ConstantFPSDNode>(Op0))
+    Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
+
+  // OR the magnitude value with the sign bit.
   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
 }
 
@@ -14473,11 +12218,11 @@ static bool hasNonFlagsUse(SDValue Op) {
 /// equivalent.
 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
                                     SelectionDAG &DAG) const {
-  if (Op.getValueType() == MVT::i1)
-    // KORTEST instruction should be selected
-    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
-                       DAG.getConstant(0, Op.getValueType()));
-
+  if (Op.getValueType() == MVT::i1) {
+    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
+                       DAG.getConstant(0, MVT::i8));
+  }
   // CF and OF aren't always set the way we want. Determine which
   // of these we need.
   bool NeedCF = false;
@@ -14697,9 +12442,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
                        DAG.getConstant(0, Op.getValueType()));
 
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-  SmallVector<SDValue, 4> Ops;
-  for (unsigned i = 0; i != NumOperands; ++i)
-    Ops.push_back(Op.getOperand(i));
+  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
 
   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
   DAG.ReplaceAllUsesWith(Op, New);
@@ -14717,16 +12460,16 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
      if (Op0.getValueType() == MVT::i1)
        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
   }
- 
+
   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
-    // Do the comparison at i32 if it's smaller, besides the Atom case. 
-    // This avoids subregister aliasing issues. Keep the smaller reference 
-    // if we're optimizing for size, however, as that'll allow better folding 
+    // Do the comparison at i32 if it's smaller, besides the Atom case.
+    // This avoids subregister aliasing issues. Keep the smaller reference
+    // if we're optimizing for size, however, as that'll allow better folding
     // of memory operations.
     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
-        !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
-             AttributeSet::FunctionIndex, Attribute::MinSize) &&
+        !DAG.getMachineFunction().getFunction()->hasFnAttribute(
+            Attribute::MinSize) &&
         !Subtarget->isAtom()) {
       unsigned ExtendOp =
           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
@@ -14780,7 +12523,7 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
     return SDValue();
 
   EVT VT = Op.getValueType();
-  
+
   // SSE1 has rsqrtss and rsqrtps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
@@ -14808,9 +12551,9 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
   // significant digits in the divisor.
   if (!Subtarget->useReciprocalEst())
     return SDValue();
-  
+
   EVT VT = Op.getValueType();
-  
+
   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
@@ -15307,8 +13050,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
       cast<ConstantSDNode>(Op1)->isNullValue() &&
       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
-    if (NewSetCC.getNode())
+    if (NewSetCC.getNode()) {
+      if (VT == MVT::i1)
+        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
       return NewSetCC;
+    }
   }
 
   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
@@ -15629,11 +13375,11 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget
 
        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
-    
+
        ((Subtarget->hasDQI() && VT.is512BitVector() &&
         VTElt.getSizeInBits() >= 32))))
     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
-    
+
   unsigned int NumElts = VT.getVectorNumElements();
 
   if (NumElts != 8 && NumElts != 16)
@@ -15718,6 +13464,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
 // may emit an illegal shuffle but the expansion is still better than scalar
 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
 // we'll emit a shuffle and a arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
 // TODO: It is possible to support ZExt by zeroing the undef values during
 // the shuffle phase or after the shuffle.
 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
@@ -15797,9 +13544,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
   // Attempt to load the original value using scalar loads.
   // Find the largest scalar type that divides the total loaded size.
   MVT SclrLoadTy = MVT::i8;
-  for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
-       tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
-    MVT Tp = (MVT::SimpleValueType)tp;
+  for (MVT Tp : MVT::integer_valuetypes()) {
     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
       SclrLoadTy = Tp;
     }
@@ -16232,7 +13977,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                            SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   bool SplitStack = MF.shouldSplitStack();
-  bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) ||
+  bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
                SplitStack;
   SDLoc dl(Op);
 
@@ -16258,7 +14003,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
     Chain = SP.getValue(1);
     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
-    const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
+    const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
     unsigned StackAlign = TFI.getStackAlignment();
     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
     if (Align > StackAlign)
@@ -16316,8 +14061,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
 
     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
 
-    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-        DAG.getSubtarget().getRegisterInfo());
+    const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
     unsigned SPReg = RegInfo->getStackRegister();
     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
     Chain = SP.getValue(1);
@@ -16427,21 +14171,16 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   if (ArgMode == 2) {
     // Sanity Check: Make sure using fp_offset makes sense.
     assert(!DAG.getTarget().Options.UseSoftFloat &&
-           !(DAG.getMachineFunction()
-                .getFunction()->getAttributes()
-                .hasAttribute(AttributeSet::FunctionIndex,
-                              Attribute::NoImplicitFloat)) &&
+           !(DAG.getMachineFunction().getFunction()->hasFnAttribute(
+               Attribute::NoImplicitFloat)) &&
            Subtarget->hasSSE1());
   }
 
   // Insert VAARG_64 node into the DAG
   // VAARG_64 returns two values: Variable Argument Address, Chain
-  SmallVector<SDValue, 11> InstOps;
-  InstOps.push_back(Chain);
-  InstOps.push_back(SrcPtr);
-  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
-  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
-  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
+  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, MVT::i32),
+                       DAG.getConstant(ArgMode, MVT::i8),
+                       DAG.getConstant(Align, MVT::i32)};
   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
                                           VTs, InstOps, MVT::i64,
@@ -16558,7 +14297,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
                                    SDValue SrcOp, SDValue ShAmt,
                                    SelectionDAG &DAG) {
-  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
+  MVT SVT = ShAmt.getSimpleValueType();
+  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
 
   // Catch shift-by-constant.
   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
@@ -16573,13 +14313,28 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
   }
 
-  // Need to build a vector containing shift amount
-  // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0
-  SDValue ShOps[4];
-  ShOps[0] = ShAmt;
-  ShOps[1] = DAG.getConstant(0, MVT::i32);
-  ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
-  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps);
+  const X86Subtarget &Subtarget =
+      static_cast<const X86Subtarget &>(DAG.getSubtarget());
+  if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
+      ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
+    // Let the shuffle legalizer expand this shift amount node.
+    SDValue Op0 = ShAmt.getOperand(0);
+    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
+    ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
+  } else {
+    // Need to build a vector containing shift amount.
+    // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
+    SmallVector<SDValue, 4> ShOps;
+    ShOps.push_back(ShAmt);
+    if (SVT == MVT::i32) {
+      ShOps.push_back(DAG.getConstant(0, SVT));
+      ShOps.push_back(DAG.getUNDEF(SVT));
+    }
+    ShOps.push_back(DAG.getUNDEF(SVT));
+
+    MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
+    ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
+  }
 
   // The return type has to be a 128-bit type with the same element
   // type as the input type.
@@ -16628,52 +14383,28 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
 }
 
-static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_fma_vfmadd_ps:
-    case Intrinsic::x86_fma_vfmadd_pd:
-    case Intrinsic::x86_fma_vfmadd_ps_256:
-    case Intrinsic::x86_fma_vfmadd_pd_256:
-    case Intrinsic::x86_fma_mask_vfmadd_ps_512:
-    case Intrinsic::x86_fma_mask_vfmadd_pd_512:
-      return X86ISD::FMADD;
-    case Intrinsic::x86_fma_vfmsub_ps:
-    case Intrinsic::x86_fma_vfmsub_pd:
-    case Intrinsic::x86_fma_vfmsub_ps_256:
-    case Intrinsic::x86_fma_vfmsub_pd_256:
-    case Intrinsic::x86_fma_mask_vfmsub_ps_512:
-    case Intrinsic::x86_fma_mask_vfmsub_pd_512:
-      return X86ISD::FMSUB;
-    case Intrinsic::x86_fma_vfnmadd_ps:
-    case Intrinsic::x86_fma_vfnmadd_pd:
-    case Intrinsic::x86_fma_vfnmadd_ps_256:
-    case Intrinsic::x86_fma_vfnmadd_pd_256:
-    case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
-    case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
-      return X86ISD::FNMADD;
-    case Intrinsic::x86_fma_vfnmsub_ps:
-    case Intrinsic::x86_fma_vfnmsub_pd:
-    case Intrinsic::x86_fma_vfnmsub_ps_256:
-    case Intrinsic::x86_fma_vfnmsub_pd_256:
-    case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
-    case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
-      return X86ISD::FNMSUB;
-    case Intrinsic::x86_fma_vfmaddsub_ps:
-    case Intrinsic::x86_fma_vfmaddsub_pd:
-    case Intrinsic::x86_fma_vfmaddsub_ps_256:
-    case Intrinsic::x86_fma_vfmaddsub_pd_256:
-    case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
-    case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
-      return X86ISD::FMADDSUB;
-    case Intrinsic::x86_fma_vfmsubadd_ps:
-    case Intrinsic::x86_fma_vfmsubadd_pd:
-    case Intrinsic::x86_fma_vfmsubadd_ps_256:
-    case Intrinsic::x86_fma_vfmsubadd_pd_256:
-    case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
-    case Intrinsic::x86_fma_mask_vfmsubadd_pd_512:
-      return X86ISD::FMSUBADD;
-    }
+/// \brief Creates an SDNode for a predicated scalar operation.
+/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
+/// The mask is comming as MVT::i8 and it should be truncated
+/// to MVT::i1 while lowering masking intrinsics.
+/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
+/// "X86select" instead of "vselect". We just can't create the "vselect" node for
+/// a scalar instruction.
+static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
+                                    SDValue PreservedSrc,
+                                    const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+    if (isAllOnes(Mask))
+      return Op;
+
+    EVT VT = Op.getValueType();
+    SDLoc dl(Op);
+    // The mask should be of type MVT::i1
+    SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+
+    if (PreservedSrc.getOpcode() == ISD::UNDEF)
+      PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+    return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
 }
 
 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
@@ -16701,7 +14432,73 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                               RoundingMode),
                                   Mask, Src0, Subtarget, DAG);
     }
-                                              
+    case INTR_TYPE_SCALAR_MASK_RM: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src0 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      // There are 2 kinds of intrinsics in this group:
+      // (1) With supress-all-exceptions (sae) - 6 operands
+      // (2) With rounding mode and sae - 7 operands.
+      if (Op.getNumOperands() == 6) {
+        SDValue Sae  = Op.getOperand(5);
+        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+                                                Sae),
+                                    Mask, Src0, Subtarget, DAG);
+      }
+      assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
+      SDValue RoundingMode  = Op.getOperand(5);
+      SDValue Sae  = Op.getOperand(6);
+      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+                                              RoundingMode, Sae),
+                                  Mask, Src0, Subtarget, DAG);
+    }
+    case INTR_TYPE_2OP_MASK: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue PassThru = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      // We specify 2 possible opcodes for intrinsics with rounding modes.
+      // First, we check if the intrinsic may have non-default rounding mode,
+      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(5);
+        unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+        if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                      dl, Op.getValueType(),
+                                      Src1, Src2, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+        }
+      }
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              Src1,Src2),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case FMA_OP_MASK: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      // We specify 2 possible opcodes for intrinsics with rounding modes.
+      // First, we check if the intrinsic may have non-default rounding mode,
+      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(5);
+        if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+            X86::STATIC_ROUNDING::CUR_DIRECTION)
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                                  dl, Op.getValueType(),
+                                                  Src1, Src2, Src3, Rnd),
+                                      Mask, Src1, Subtarget, DAG);
+      }
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+                                              dl, Op.getValueType(),
+                                              Src1, Src2, Src3),
+                                  Mask, Src1, Subtarget, DAG);
+    }
     case CMP_MASK:
     case CMP_MASK_CC: {
       // Comparison intrinsics with masks.
@@ -16751,9 +14548,45 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                  Op.getOperand(1), Op.getOperand(2), DAG);
     case VSHIFT_MASK:
-      return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
-                                                      Op.getOperand(1), Op.getOperand(2), DAG),
-                                  Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);;
+      return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
+                                                      Op.getSimpleValueType(),
+                                                      Op.getOperand(1),
+                                                      Op.getOperand(2), DAG),
+                                  Op.getOperand(4), Op.getOperand(3), Subtarget,
+                                  DAG);
+    case COMPRESS_EXPAND_IN_REG: {
+      SDValue Mask = Op.getOperand(3);
+      SDValue DataToCompress = Op.getOperand(1);
+      SDValue PassThru = Op.getOperand(2);
+      if (isAllOnes(Mask)) // return data as is
+        return Op.getOperand(1);
+      EVT VT = Op.getValueType();
+      EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                    VT.getVectorNumElements());
+      EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                       Mask.getValueType().getSizeInBits());
+      SDLoc dl(Op);
+      SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                                  DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+                                  DAG.getIntPtrConstant(0));
+
+      return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
+                         PassThru);
+    }
+    case BLEND: {
+      SDValue Mask = Op.getOperand(3);
+      EVT VT = Op.getValueType();
+      EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                    VT.getVectorNumElements());
+      EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                       Mask.getValueType().getSizeInBits());
+      SDLoc dl(Op);
+      SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                                  DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+                                  DAG.getIntPtrConstant(0));
+      return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
+                         Op.getOperand(2));
+    }
     default:
       break;
     }
@@ -16762,138 +14595,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
   switch (IntNo) {
   default: return SDValue();    // Don't custom lower most intrinsics.
 
-  // Arithmetic intrinsics.
-  case Intrinsic::x86_sse2_pmulu_dq:
-  case Intrinsic::x86_avx2_pmulu_dq:
-    return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  case Intrinsic::x86_sse41_pmuldq:
-  case Intrinsic::x86_avx2_pmul_dq:
-    return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  case Intrinsic::x86_sse2_pmulhu_w:
-  case Intrinsic::x86_avx2_pmulhu_w:
-    return DAG.getNode(ISD::MULHU, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  case Intrinsic::x86_sse2_pmulh_w:
-  case Intrinsic::x86_avx2_pmulh_w:
-    return DAG.getNode(ISD::MULHS, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  // SSE/SSE2/AVX floating point max/min intrinsics.
-  case Intrinsic::x86_sse_max_ps:
-  case Intrinsic::x86_sse2_max_pd:
-  case Intrinsic::x86_avx_max_ps_256:
-  case Intrinsic::x86_avx_max_pd_256:
-  case Intrinsic::x86_sse_min_ps:
-  case Intrinsic::x86_sse2_min_pd:
-  case Intrinsic::x86_avx_min_ps_256:
-  case Intrinsic::x86_avx_min_pd_256: {
-    unsigned Opcode;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_sse_max_ps:
-    case Intrinsic::x86_sse2_max_pd:
-    case Intrinsic::x86_avx_max_ps_256:
-    case Intrinsic::x86_avx_max_pd_256:
-      Opcode = X86ISD::FMAX;
-      break;
-    case Intrinsic::x86_sse_min_ps:
-    case Intrinsic::x86_sse2_min_pd:
-    case Intrinsic::x86_avx_min_ps_256:
-    case Intrinsic::x86_avx_min_pd_256:
-      Opcode = X86ISD::FMIN;
-      break;
-    }
-    return DAG.getNode(Opcode, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-  }
-
-  // AVX2 variable shift intrinsics
-  case Intrinsic::x86_avx2_psllv_d:
-  case Intrinsic::x86_avx2_psllv_q:
-  case Intrinsic::x86_avx2_psllv_d_256:
-  case Intrinsic::x86_avx2_psllv_q_256:
-  case Intrinsic::x86_avx2_psrlv_d:
-  case Intrinsic::x86_avx2_psrlv_q:
-  case Intrinsic::x86_avx2_psrlv_d_256:
-  case Intrinsic::x86_avx2_psrlv_q_256:
-  case Intrinsic::x86_avx2_psrav_d:
-  case Intrinsic::x86_avx2_psrav_d_256: {
-    unsigned Opcode;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_avx2_psllv_d:
-    case Intrinsic::x86_avx2_psllv_q:
-    case Intrinsic::x86_avx2_psllv_d_256:
-    case Intrinsic::x86_avx2_psllv_q_256:
-      Opcode = ISD::SHL;
-      break;
-    case Intrinsic::x86_avx2_psrlv_d:
-    case Intrinsic::x86_avx2_psrlv_q:
-    case Intrinsic::x86_avx2_psrlv_d_256:
-    case Intrinsic::x86_avx2_psrlv_q_256:
-      Opcode = ISD::SRL;
-      break;
-    case Intrinsic::x86_avx2_psrav_d:
-    case Intrinsic::x86_avx2_psrav_d_256:
-      Opcode = ISD::SRA;
-      break;
-    }
-    return DAG.getNode(Opcode, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-  }
-
-  case Intrinsic::x86_sse2_packssdw_128:
-  case Intrinsic::x86_sse2_packsswb_128:
-  case Intrinsic::x86_avx2_packssdw:
-  case Intrinsic::x86_avx2_packsswb:
-    return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  case Intrinsic::x86_sse2_packuswb_128:
-  case Intrinsic::x86_sse41_packusdw:
-  case Intrinsic::x86_avx2_packuswb:
-  case Intrinsic::x86_avx2_packusdw:
-    return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  case Intrinsic::x86_ssse3_pshuf_b_128:
-  case Intrinsic::x86_avx2_pshuf_b:
-    return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  case Intrinsic::x86_sse2_pshuf_d:
-    return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  case Intrinsic::x86_sse2_pshufl_w:
-    return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  case Intrinsic::x86_sse2_pshufh_w:
-    return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  case Intrinsic::x86_ssse3_psign_b_128:
-  case Intrinsic::x86_ssse3_psign_w_128:
-  case Intrinsic::x86_ssse3_psign_d_128:
-  case Intrinsic::x86_avx2_psign_b:
-  case Intrinsic::x86_avx2_psign_w:
-  case Intrinsic::x86_avx2_psign_d:
-    return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  case Intrinsic::x86_avx2_permd:
-  case Intrinsic::x86_avx2_permps:
-    // Operands intentionally swapped. Mask is last operand to intrinsic,
-    // but second operand for node/instruction.
-    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
-                       Op.getOperand(2), Op.getOperand(1));
-
   case Intrinsic::x86_avx512_mask_valign_q_512:
   case Intrinsic::x86_avx512_mask_valign_d_512:
     // Vector source operands are swapped.
@@ -17056,58 +14757,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
     return DAG.getNode(Opcode, dl, VTs, NewOps);
   }
-
-  case Intrinsic::x86_fma_mask_vfmadd_ps_512:
-  case Intrinsic::x86_fma_mask_vfmadd_pd_512:
-  case Intrinsic::x86_fma_mask_vfmsub_ps_512:
-  case Intrinsic::x86_fma_mask_vfmsub_pd_512:
-  case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
-  case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
-  case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
-  case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
-  case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
-  case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
-  case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
-  case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: {
-    auto *SAE = cast<ConstantSDNode>(Op.getOperand(5));
-    if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION)
-      return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo),
-                                              dl, Op.getValueType(),
-                                              Op.getOperand(1),
-                                              Op.getOperand(2),
-                                              Op.getOperand(3)),
-                                  Op.getOperand(4), Op.getOperand(1),
-                                  Subtarget, DAG);
-    else
-      return SDValue();
-  }
-
-  case Intrinsic::x86_fma_vfmadd_ps:
-  case Intrinsic::x86_fma_vfmadd_pd:
-  case Intrinsic::x86_fma_vfmsub_ps:
-  case Intrinsic::x86_fma_vfmsub_pd:
-  case Intrinsic::x86_fma_vfnmadd_ps:
-  case Intrinsic::x86_fma_vfnmadd_pd:
-  case Intrinsic::x86_fma_vfnmsub_ps:
-  case Intrinsic::x86_fma_vfnmsub_pd:
-  case Intrinsic::x86_fma_vfmaddsub_ps:
-  case Intrinsic::x86_fma_vfmaddsub_pd:
-  case Intrinsic::x86_fma_vfmsubadd_ps:
-  case Intrinsic::x86_fma_vfmsubadd_pd:
-  case Intrinsic::x86_fma_vfmadd_ps_256:
-  case Intrinsic::x86_fma_vfmadd_pd_256:
-  case Intrinsic::x86_fma_vfmsub_ps_256:
-  case Intrinsic::x86_fma_vfmsub_pd_256:
-  case Intrinsic::x86_fma_vfnmadd_ps_256:
-  case Intrinsic::x86_fma_vfnmadd_pd_256:
-  case Intrinsic::x86_fma_vfnmsub_ps_256:
-  case Intrinsic::x86_fma_vfnmsub_pd_256:
-  case Intrinsic::x86_fma_vfmaddsub_ps_256:
-  case Intrinsic::x86_fma_vfmaddsub_pd_256:
-  case Intrinsic::x86_fma_vfmsubadd_ps_256:
-  case Intrinsic::x86_fma_vfmsubadd_pd_256:
-    return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   }
 }
 
@@ -17305,7 +14954,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
   switch(IntrData->Type) {
   default:
     llvm_unreachable("Unknown Intrinsic Type");
-    break;    
+    break;
   case RDSEED:
   case RDRAND: {
     // Emit the node with the right value type.
@@ -17403,6 +15052,58 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
     Results.push_back(Store);
     return DAG.getMergeValues(Results, dl);
   }
+  case COMPRESS_TO_MEM: {
+    SDLoc dl(Op);
+    SDValue Mask = Op.getOperand(4);
+    SDValue DataToCompress = Op.getOperand(3);
+    SDValue Addr = Op.getOperand(2);
+    SDValue Chain = Op.getOperand(0);
+
+    if (isAllOnes(Mask)) // return just a store
+      return DAG.getStore(Chain, dl, DataToCompress, Addr,
+                          MachinePointerInfo(), false, false, 0);
+
+    EVT VT = DataToCompress.getValueType();
+    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                  VT.getVectorNumElements());
+    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     Mask.getValueType().getSizeInBits());
+    SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                                DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+                                DAG.getIntPtrConstant(0));
+
+    SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
+                                      DataToCompress, DAG.getUNDEF(VT));
+    return DAG.getStore(Chain, dl, Compressed, Addr,
+                        MachinePointerInfo(), false, false, 0);
+  }
+  case EXPAND_FROM_MEM: {
+    SDLoc dl(Op);
+    SDValue Mask = Op.getOperand(4);
+    SDValue PathThru = Op.getOperand(3);
+    SDValue Addr = Op.getOperand(2);
+    SDValue Chain = Op.getOperand(0);
+    EVT VT = Op.getValueType();
+
+    if (isAllOnes(Mask)) // return just a load
+      return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
+                         false, 0);
+    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                  VT.getVectorNumElements());
+    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     Mask.getValueType().getSizeInBits());
+    SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                                DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+                                DAG.getIntPtrConstant(0));
+
+    SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
+                                   false, false, false, 0);
+
+    SDValue Results[] = {
+        DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PathThru),
+        Chain};
+    return DAG.getMergeValues(Results, dl);
+  }
   }
 }
 
@@ -17420,8 +15121,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
 
   if (Depth > 0) {
     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
-    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-        DAG.getSubtarget().getRegisterInfo());
+    const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                        DAG.getNode(ISD::ADD, dl, PtrVT,
@@ -17436,15 +15136,33 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
 }
 
 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+  EVT VT = Op.getValueType();
+
   MFI->setFrameAddressIsTaken(true);
 
-  EVT VT = Op.getValueType();
+  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
+    // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
+    // is not possible to crawl up the stack without looking at the unwind codes
+    // simultaneously.
+    int FrameAddrIndex = FuncInfo->getFAIndex();
+    if (!FrameAddrIndex) {
+      // Set up a frame object for the return address.
+      unsigned SlotSize = RegInfo->getSlotSize();
+      FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
+          SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false);
+      FuncInfo->setFAIndex(FrameAddrIndex);
+    }
+    return DAG.getFrameIndex(FrameAddrIndex, VT);
+  }
+
+  unsigned FrameReg =
+      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
   SDLoc dl(Op);  // FIXME probably not meaningful
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-      DAG.getSubtarget().getRegisterInfo());
-  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
           (FrameReg == X86::EBP && VT == MVT::i32)) &&
          "Invalid Frame Register!");
@@ -17471,8 +15189,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName,
 
 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-      DAG.getSubtarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
 }
 
@@ -17483,8 +15200,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl      (Op);
 
   EVT PtrVT = getPointerTy();
-  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-      DAG.getSubtarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -17531,7 +15247,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   SDLoc dl (Op);
 
   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
-  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
 
   if (Subtarget->is64Bit()) {
     SDValue OutChains[6];
@@ -17694,8 +15410,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   */
 
   MachineFunction &MF = DAG.getMachineFunction();
-  const TargetMachine &TM = MF.getTarget();
-  const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+  const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   unsigned StackAlignment = TFI.getStackAlignment();
   MVT VT = Op.getSimpleValueType();
   SDLoc DL(Op);
@@ -18090,76 +15805,29 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                             DAG);
       }
 
-      if (VT == MVT::v16i8) {
-        if (Op.getOpcode() == ISD::SHL) {
-          // Make a large shift.
-          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
-                                                   MVT::v8i16, R, ShiftAmt,
-                                                   DAG);
-          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
-          // Zero out the rightmost bits.
-          SmallVector<SDValue, 16> V(16,
-                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
-                                                     MVT::i8));
-          return DAG.getNode(ISD::AND, dl, VT, SHL,
-                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
-        }
-        if (Op.getOpcode() == ISD::SRL) {
-          // Make a large shift.
-          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
-                                                   MVT::v8i16, R, ShiftAmt,
-                                                   DAG);
-          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
-          // Zero out the leftmost bits.
-          SmallVector<SDValue, 16> V(16,
-                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
-                                                     MVT::i8));
-          return DAG.getNode(ISD::AND, dl, VT, SRL,
-                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
-        }
-        if (Op.getOpcode() == ISD::SRA) {
-          if (ShiftAmt == 7) {
-            // R s>> 7  ===  R s< 0
-            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
-            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
-          }
-
-          // R s>> a === ((R u>> a) ^ m) - m
-          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
-          SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
-                                                         MVT::i8));
-          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
-          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
-          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
-          return Res;
-        }
-        llvm_unreachable("Unknown shift opcode.");
-      }
+      if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
+        unsigned NumElts = VT.getVectorNumElements();
+        MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
 
-      if (Subtarget->hasInt256() && VT == MVT::v32i8) {
         if (Op.getOpcode() == ISD::SHL) {
           // Make a large shift.
-          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
-                                                   MVT::v16i16, R, ShiftAmt,
-                                                   DAG);
+          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
+                                                   R, ShiftAmt, DAG);
           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
           // Zero out the rightmost bits.
-          SmallVector<SDValue, 32> V(32,
-                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
-                                                     MVT::i8));
+          SmallVector<SDValue, 32> V(
+              NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8));
           return DAG.getNode(ISD::AND, dl, VT, SHL,
                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
         }
         if (Op.getOpcode() == ISD::SRL) {
           // Make a large shift.
-          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
-                                                   MVT::v16i16, R, ShiftAmt,
-                                                   DAG);
+          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
+                                                   R, ShiftAmt, DAG);
           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
           // Zero out the leftmost bits.
-          SmallVector<SDValue, 32> V(32,
-                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
-                                                     MVT::i8));
+          SmallVector<SDValue, 32> V(
+              NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8));
           return DAG.getNode(ISD::AND, dl, VT, SRL,
                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
         }
@@ -18172,8 +15840,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
 
           // R s>> a === ((R u>> a) ^ m) - m
           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
-          SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
-                                                         MVT::i8));
+          SmallVector<SDValue, 32> V(NumElts,
+                                     DAG.getConstant(128 >> ShiftAmt, MVT::i8));
           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
@@ -18249,55 +15917,43 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
     SDValue BaseShAmt;
     EVT EltVT = VT.getVectorElementType();
 
-    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
-      unsigned NumElts = VT.getVectorNumElements();
-      unsigned i, j;
-      for (i = 0; i != NumElts; ++i) {
-        if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
-          continue;
-        break;
-      }
-      for (j = i; j != NumElts; ++j) {
-        SDValue Arg = Amt.getOperand(j);
-        if (Arg.getOpcode() == ISD::UNDEF) continue;
-        if (Arg != Amt.getOperand(i))
-          break;
-      }
-      if (i != NumElts && j == NumElts)
-        BaseShAmt = Amt.getOperand(i);
+    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
+      // Check if this build_vector node is doing a splat.
+      // If so, then set BaseShAmt equal to the splat value.
+      BaseShAmt = BV->getSplatValue();
+      if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
+        BaseShAmt = SDValue();
     } else {
       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
         Amt = Amt.getOperand(0);
-      if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
-               cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
+
+      ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
+      if (SVN && SVN->isSplat()) {
+        unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
         SDValue InVec = Amt.getOperand(0);
         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
-          unsigned NumElts = InVec.getValueType().getVectorNumElements();
-          unsigned i = 0;
-          for (; i != NumElts; ++i) {
-            SDValue Arg = InVec.getOperand(i);
-            if (Arg.getOpcode() == ISD::UNDEF) continue;
-            BaseShAmt = Arg;
-            break;
-          }
+          assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
+                 "Unexpected shuffle index found!");
+          BaseShAmt = InVec.getOperand(SplatIdx);
         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
            if (ConstantSDNode *C =
                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
-             unsigned SplatIdx =
-               cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
              if (C->getZExtValue() == SplatIdx)
                BaseShAmt = InVec.getOperand(1);
            }
         }
-        if (!BaseShAmt.getNode())
-          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
-                                  DAG.getIntPtrConstant(0));
+
+        if (!BaseShAmt)
+          // Avoid introducing an extract element from a shuffle.
+          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
+                                    DAG.getIntPtrConstant(SplatIdx));
       }
     }
 
     if (BaseShAmt.getNode()) {
-      if (EltVT.bitsGT(MVT::i32))
-        BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
+      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
+      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
+        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
       else if (EltVT.bitsLT(MVT::i32))
         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
 
@@ -18415,7 +16071,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
   // If possible, lower this packed shift into a vector multiply instead of
   // expanding it into a sequence of scalar shifts.
   // Do this only if the vector shift count is a constant build_vector.
-  if (Op.getOpcode() == ISD::SHL && 
+  if (Op.getOpcode() == ISD::SHL &&
       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
@@ -18507,15 +16163,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
           CanBeSimplified = Amt2 == Amt->getOperand(j);
       }
     }
-    
+
     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
         isa<ConstantSDNode>(Amt2)) {
       // Replace this node with two shifts followed by a MOVSS/MOVSD.
       EVT CastVT = MVT::v4i32;
-      SDValue Splat1 = 
+      SDValue Splat1 =
         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
-      SDValue Splat2 = 
+      SDValue Splat2 =
         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
       if (TargetOpcode == X86ISD::MOVSD)
@@ -18704,81 +16360,17 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
 }
 
-// Sign extension of the low part of vector elements. This may be used either
-// when sign extend instructions are not available or if the vector element
-// sizes already match the sign-extended size. If the vector elements are in
-// their pre-extended size and sign extend instructions are available, that will
-// be handled by LowerSIGN_EXTEND.
-SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
-                                                  SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
-  MVT VT = Op.getSimpleValueType();
-
-  if (!Subtarget->hasSSE2() || !VT.isVector())
-    return SDValue();
-
-  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
-                      ExtraVT.getScalarType().getSizeInBits();
-
-  switch (VT.SimpleTy) {
-    default: return SDValue();
-    case MVT::v8i32:
-    case MVT::v16i16:
-      if (!Subtarget->hasFp256())
-        return SDValue();
-      if (!Subtarget->hasInt256()) {
-        // needs to be split
-        unsigned NumElems = VT.getVectorNumElements();
-
-        // Extract the LHS vectors
-        SDValue LHS = Op.getOperand(0);
-        SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
-        SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
-
-        MVT EltVT = VT.getVectorElementType();
-        EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
-        EVT ExtraEltVT = ExtraVT.getVectorElementType();
-        unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
-        ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
-                                   ExtraNumElems/2);
-        SDValue Extra = DAG.getValueType(ExtraVT);
-
-        LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
-        LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
-
-        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
-      }
-      // fall through
-    case MVT::v4i32:
-    case MVT::v8i16: {
-      SDValue Op0 = Op.getOperand(0);
-
-      // This is a sign extension of some low part of vector elements without
-      // changing the size of the vector elements themselves:
-      // Shift-Left + Shift-Right-Algebraic.
-      SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
-                                               BitsDiff, DAG);
-      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
-                                        DAG);
-    }
-  }
-}
-
 /// Returns true if the operand type is exactly twice the native width, and
 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
-  const X86Subtarget &Subtarget =
-      getTargetMachine().getSubtarget<X86Subtarget>();
   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
 
   if (OpWidth == 64)
-    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
+    return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
   else if (OpWidth == 128)
-    return Subtarget.hasCmpxchg16b();
+    return Subtarget->hasCmpxchg16b();
   else
     return false;
 }
@@ -18795,9 +16387,7 @@ bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
 }
 
 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
-  const X86Subtarget &Subtarget =
-      getTargetMachine().getSubtarget<X86Subtarget>();
-  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+  unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
   const Type *MemType = AI->getType();
 
   // If the operand is too big, we must see if cmpxchg8/16b is available
@@ -18840,9 +16430,7 @@ static bool hasMFENCE(const X86Subtarget& Subtarget) {
 
 LoadInst *
 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
-  const X86Subtarget &Subtarget =
-      getTargetMachine().getSubtarget<X86Subtarget>();
-  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+  unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
   const Type *MemType = AI->getType();
   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
   // there is no benefit in turning such RMWs into loads, and it is actually
@@ -18878,7 +16466,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
     // the IR level, so we must wrap it in an intrinsic.
     return nullptr;
-  } else if (hasMFENCE(Subtarget)) {
+  } else if (hasMFENCE(*Subtarget)) {
     Function *MFence = llvm::Intrinsic::getDeclaration(M,
             Intrinsic::x86_sse2_mfence);
     Builder.CreateCall(MFence);
@@ -18997,9 +16585,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
                                  DAG.getIntPtrConstant(i)));
 
     // Explicitly mark the extra elements as Undef.
-    SDValue Undef = DAG.getUNDEF(SVT);
-    for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
-      Elts.push_back(Undef);
+    Elts.append(NumElts, DAG.getUNDEF(SVT));
 
     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
@@ -19025,6 +16611,139 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
   return SDValue();
 }
 
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+                          SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+
+  Op = Op.getOperand(0);
+  EVT VT = Op.getValueType();
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "CTPOP lowering only implemented for 128/256-bit wide vector types");
+
+  unsigned NumElts = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  unsigned Len = EltVT.getSizeInBits();
+
+  // This is the vectorized version of the "best" algorithm from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  // with a minor tweak to use a series of adds + shifts instead of vector
+  // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
+  //
+  //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
+  //  v8i32 => Always profitable
+  //
+  // FIXME: There a couple of possible improvements:
+  //
+  // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
+  // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
+  //
+  assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
+         "CTPOP not implemented for this vector element type.");
+
+  // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
+  // extra legalization.
+  bool NeedsBitcast = EltVT == MVT::i32;
+  MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
+
+  SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
+  SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
+  SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
+  SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
+  SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
+  if (NeedsBitcast)
+    Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
+
+  SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
+  SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
+  if (NeedsBitcast)
+    M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
+
+  SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
+  if (VT != And.getValueType())
+    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+  SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
+
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
+  SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
+  SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
+  SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
+
+  Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
+  if (NeedsBitcast) {
+    Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
+    M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
+    Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
+  }
+
+  SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
+  SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
+  if (VT != AndRHS.getValueType()) {
+    AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
+    AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
+  }
+  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
+
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
+  SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
+  Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
+  Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+
+  SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
+  SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
+  if (NeedsBitcast) {
+    Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
+    M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
+  }
+  And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
+  if (VT != And.getValueType())
+    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+
+  // The algorithm mentioned above uses:
+  //    v = (v * 0x01010101...) >> (Len - 8)
+  //
+  // Change it to use vector adds + vector shifts which yield faster results on
+  // Haswell than using vector integer multiplication.
+  //
+  // For i32 elements:
+  //    v = v + (v >> 8)
+  //    v = v + (v >> 16)
+  //
+  // For i64 elements:
+  //    v = v + (v >> 8)
+  //    v = v + (v >> 16)
+  //    v = v + (v >> 32)
+  //
+  Add = And;
+  SmallVector<SDValue, 8> Csts;
+  for (unsigned i = 8; i <= Len/2; i *= 2) {
+    Csts.assign(NumElts, DAG.getConstant(i, EltVT));
+    SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
+    Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
+    Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+    Csts.clear();
+  }
+
+  // The result is on the least significant 6-bits on i32 and 7-bits on i64.
+  SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
+  SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
+  SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
+  if (NeedsBitcast) {
+    Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
+    M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
+  }
+  And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
+  if (VT != And.getValueType())
+    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+
+  return And;
+}
+
 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
   SDNode *Node = Op.getNode();
   SDLoc dl(Node);
@@ -19148,15 +16867,15 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Should not custom lower this!");
-  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
     return LowerCMP_SWAP(Op, Subtarget, DAG);
+  case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
-  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
@@ -19243,6 +16962,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
+  case X86ISD::FMINC:
+  case X86ISD::FMIN:
+  case X86ISD::FMAXC:
+  case X86ISD::FMAX: {
+    EVT VT = N->getValueType(0);
+    if (VT != MVT::v2f32)
+      llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
+    SDValue UNDEF = DAG.getUNDEF(VT);
+    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+                              N->getOperand(0), UNDEF);
+    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+                              N->getOperand(1), UNDEF);
+    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
+    return;
+  }
   case ISD::SIGN_EXTEND_INREG:
   case ISD::ADDC:
   case ISD::ADDE:
@@ -19599,6 +17334,16 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
   case X86ISD::XTEST:              return "X86ISD::XTEST";
+  case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
+  case X86ISD::EXPAND:             return "X86ISD::EXPAND";
+  case X86ISD::SELECT:             return "X86ISD::SELECT";
+  case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
+  case X86ISD::RCP28:              return "X86ISD::RCP28";
+  case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
+  case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
+  case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
+  case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
+  case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
   }
 }
 
@@ -19747,6 +17492,8 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   return false;
 }
 
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+
 bool
 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
@@ -19783,68 +17530,20 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   if (!VT.isSimple())
     return false;
 
-  MVT SVT = VT.getSimpleVT();
-
   // Very little shuffling can be done for 64-bit vectors right now.
   if (VT.getSizeInBits() == 64)
     return false;
 
-  // If this is a single-input shuffle with no 128 bit lane crossings we can
-  // lower it into pshufb.
-  if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
-      (SVT.is256BitVector() && Subtarget->hasInt256())) {
-    bool isLegal = true;
-    for (unsigned I = 0, E = M.size(); I != E; ++I) {
-      if (M[I] >= (int)SVT.getVectorNumElements() ||
-          ShuffleCrosses128bitLane(SVT, I, M[I])) {
-        isLegal = false;
-        break;
-      }
-    }
-    if (isLegal)
-      return true;
-  }
-
-  // FIXME: blends, shifts.
-  return (SVT.getVectorNumElements() == 2 ||
-          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
-          isMOVLMask(M, SVT) ||
-          isMOVHLPSMask(M, SVT) ||
-          isSHUFPMask(M, SVT) ||
-          isSHUFPMask(M, SVT, /* Commuted */ true) ||
-          isPSHUFDMask(M, SVT) ||
-          isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
-          isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
-          isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
-          isPALIGNRMask(M, SVT, Subtarget) ||
-          isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
-          isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
-          isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
-          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
-          isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
-          (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
+  // We only care that the types being shuffled are legal. The lowering can
+  // handle any possible shuffle mask that results.
+  return isTypeLegal(VT.getSimpleVT());
 }
 
 bool
 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                           EVT VT) const {
-  if (!VT.isSimple())
-    return false;
-
-  MVT SVT = VT.getSimpleVT();
-  unsigned NumElts = SVT.getVectorNumElements();
-  // FIXME: This collection of masks seems suspect.
-  if (NumElts == 2)
-    return true;
-  if (NumElts == 4 && SVT.is128BitVector()) {
-    return (isMOVLMask(Mask, SVT)  ||
-            isCommutedMOVLMask(Mask, SVT, true) ||
-            isSHUFPMask(Mask, SVT) ||
-            isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
-            isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
-                        Subtarget->hasInt256()));
-  }
-  return false;
+  // Just delegate to the generic legality, clear masks aren't special.
+  return isShuffleMaskLegal(Mask, VT);
 }
 
 //===----------------------------------------------------------------------===//
@@ -19982,11 +17681,10 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
   return BB;
 }
 
-static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
-                                       const TargetInstrInfo *TII,
-                                       const X86Subtarget* Subtarget) {
+static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+                                      const X86Subtarget *Subtarget) {
   DebugLoc dl = MI->getDebugLoc();
-
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   // Address into RAX/EAX, other two args into ECX, EDX.
   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
@@ -20008,9 +17706,8 @@ static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
 }
 
 MachineBasicBlock *
-X86TargetLowering::EmitVAARG64WithCustomInserter(
-                   MachineInstr *MI,
-                   MachineBasicBlock *MBB) const {
+X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *MBB) const {
   // Emit va_arg instruction on X86-64.
 
   // Operands to this pseudo-instruction:
@@ -20040,7 +17737,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
 
   // Machine Information
-  const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
@@ -20192,7 +17889,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
       .setMemRefs(MMOBegin, MMOEnd);
 
     // Jump to endMBB
-    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
+    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
       .addMBB(endMBB);
   }
 
@@ -20296,7 +17993,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
   XMMSaveMBB->addSuccessor(EndMBB);
 
   // Now add the instructions.
-  const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   unsigned CountReg = MI->getOperand(0).getReg();
@@ -20306,7 +18003,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
   if (!Subtarget->isTargetWin64()) {
     // If %al is 0, branch around the XMM save block.
     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
-    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
+    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
     MBB->addSuccessor(EndMBB);
   }
 
@@ -20379,7 +18076,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                      MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   // To "insert" a SELECT_CC instruction, we actually have to insert the
@@ -20405,8 +18102,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
 
   // If the EFLAGS register isn't dead in the terminator, then claim that it's
   // live into the sink and copy blocks.
-  const TargetRegisterInfo *TRI =
-      BB->getParent()->getSubtarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
   if (!MI->killsRegister(X86::EFLAGS) &&
       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
     copy0MBB->addLiveIn(X86::EFLAGS);
@@ -20448,7 +18144,7 @@ MachineBasicBlock *
 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
                                         MachineBasicBlock *BB) const {
   MachineFunction *MF = BB->getParent();
-  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
 
@@ -20510,7 +18206,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
     .addReg(SPLimitVReg);
-  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
 
   // bumpMBB simply decreases the stack pointer, since we know the current
   // stacklet has enough space.
@@ -20518,13 +18214,11 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
     .addReg(SPLimitVReg);
   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
     .addReg(SPLimitVReg);
-  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
 
   // Calls into a routine in libgcc to allocate more space from the heap.
-  const uint32_t *RegMask = MF->getTarget()
-                                .getSubtargetImpl()
-                                ->getRegisterInfo()
-                                ->getCallPreservedMask(CallingConv::C);
+  const uint32_t *RegMask =
+      Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
   if (IsLP64) {
     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
       .addReg(sizeVReg);
@@ -20557,7 +18251,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
 
   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
     .addReg(IsLP64 ? X86::RAX : X86::EAX);
-  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
 
   // Set up the CFG correctly.
   BB->addSuccessor(bumpMBB);
@@ -20581,52 +18275,11 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                         MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
-  assert(!Subtarget->isTargetMacho());
-
-  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
-  // non-trivial part is impdef of ESP.
-
-  if (Subtarget->isTargetWin64()) {
-    if (Subtarget->isTargetCygMing()) {
-      // ___chkstk(Mingw64):
-      // Clobbers R10, R11, RAX and EFLAGS.
-      // Updates RSP.
-      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
-        .addExternalSymbol("___chkstk")
-        .addReg(X86::RAX, RegState::Implicit)
-        .addReg(X86::RSP, RegState::Implicit)
-        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
-        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
-        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
-    } else {
-      // __chkstk(MSVCRT): does not update stack pointer.
-      // Clobbers R10, R11 and EFLAGS.
-      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
-        .addExternalSymbol("__chkstk")
-        .addReg(X86::RAX, RegState::Implicit)
-        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
-      // RAX has the offset to be subtracted from RSP.
-      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
-        .addReg(X86::RSP)
-        .addReg(X86::RAX);
-    }
-  } else {
-    const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() ||
-                                    Subtarget->isTargetWindowsItanium())
-                                       ? "_chkstk"
-                                       : "_alloca";
+  assert(!Subtarget->isTargetMachO());
 
-    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
-      .addExternalSymbol(StackProbeSymbol)
-      .addReg(X86::EAX, RegState::Implicit)
-      .addReg(X86::ESP, RegState::Implicit)
-      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
-      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
-      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
-  }
+  X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
 
   MI->eraseFromParent();   // The pseudo instruction is gone now.
   return BB;
@@ -20640,8 +18293,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
   // or EAX and doing an indirect call.  The return value will then
   // be in the normal return register.
   MachineFunction *F = BB->getParent();
-  const X86InstrInfo *TII =
-      static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
+  const X86InstrInfo *TII = Subtarget->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
@@ -20650,10 +18302,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
   // Get a register mask for the lowered call.
   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
   // proper register mask.
-  const uint32_t *RegMask = F->getTarget()
-                                .getSubtargetImpl()
-                                ->getRegisterInfo()
-                                ->getCallPreservedMask(CallingConv::C);
+  const uint32_t *RegMask =
+      Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
   if (Subtarget->is64Bit()) {
     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                       TII->get(X86::MOV64rm), X86::RDI)
@@ -20698,7 +18348,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
                                     MachineBasicBlock *MBB) const {
   DebugLoc DL = MI->getDebugLoc();
   MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   MachineRegisterInfo &MRI = MF->getRegInfo();
 
   const BasicBlock *BB = MBB->getBasicBlock();
@@ -20739,6 +18389,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   //  v = phi(main, restore)
   //
   // restoreMBB:
+  //  if base pointer being used, load it from frame
   //  v_restore = 1
 
   MachineBasicBlock *thisMBB = MBB;
@@ -20804,8 +18455,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
           .addMBB(restoreMBB);
 
-  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-      MF->getSubtarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   MIB.addRegMask(RegInfo->getNoPreservedMask());
   thisMBB->addSuccessor(mainMBB);
   thisMBB->addSuccessor(restoreMBB);
@@ -20822,8 +18472,20 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
     .addReg(restoreDstReg).addMBB(restoreMBB);
 
   // restoreMBB:
+  if (RegInfo->hasBasePointer(*MF)) {
+    const bool Uses64BitFramePtr =
+        Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
+    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+    X86FI->setRestoreBasePointer(MF);
+    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
+    unsigned BasePtr = RegInfo->getBaseRegister();
+    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
+    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
+                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
-  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
+  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
   restoreMBB->addSuccessor(sinkMBB);
 
   MI->eraseFromParent();
@@ -20835,7 +18497,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
                                      MachineBasicBlock *MBB) const {
   DebugLoc DL = MI->getDebugLoc();
   MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   MachineRegisterInfo &MRI = MF->getRegInfo();
 
   // Memory Reference
@@ -20850,8 +18512,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   unsigned Tmp = MRI.createVirtualRegister(RC);
   // Since FP is only updated here but NOT referenced, it's treated as GPR.
-  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
-      MF->getSubtarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
   unsigned SP = RegInfo->getStackRegister();
 
@@ -20895,7 +18556,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
 
 // Replace 213-type (isel default) FMA3 instructions with 231-type for
 // accumulator loops. Writing back to the accumulator allows the coalescer
-// to remove extra copies in the loop.   
+// to remove extra copies in the loop.
 MachineBasicBlock *
 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
                                  MachineBasicBlock *MBB) const {
@@ -20970,7 +18631,7 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
         default: llvm_unreachable("Unrecognized FMA variant.");
       }
 
-      const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+      const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
       MachineInstrBuilder MIB =
         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
         .addOperand(MI->getOperand(0))
@@ -20993,6 +18654,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::TAILJMPd64:
   case X86::TAILJMPr64:
   case X86::TAILJMPm64:
+  case X86::TAILJMPd64_REX:
+  case X86::TAILJMPr64_REX:
+  case X86::TAILJMPm64_REX:
     llvm_unreachable("TAILJMP64 would not be touched here.");
   case X86::TCRETURNdi64:
   case X86::TCRETURNri64:
@@ -21035,7 +18699,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::FP80_TO_INT32_IN_MEM:
   case X86::FP80_TO_INT64_IN_MEM: {
     MachineFunction *F = BB->getParent();
-    const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
+    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
     DebugLoc DL = MI->getDebugLoc();
 
     // Change the floating point control register to use "round towards zero"
@@ -21119,7 +18783,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::VPCMPESTRM128MEM:
     assert(Subtarget->hasSSE42() &&
            "Target must have SSE4.2 or AVX features enabled");
-    return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+    return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
 
   // String/text processing lowering.
   case X86::PCMPISTRIREG:
@@ -21132,16 +18796,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::VPCMPESTRIMEM:
     assert(Subtarget->hasSSE42() &&
            "Target must have SSE4.2 or AVX features enabled");
-    return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+    return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
 
   // Thread synchronization.
   case X86::MONITOR:
-    return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
-                       Subtarget);
+    return EmitMonitor(MI, BB, Subtarget);
 
   // xbegin
   case X86::XBEGIN:
-    return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+    return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
 
   case X86::VASTART_SAVE_XMM_REGS:
     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
@@ -21157,6 +18820,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::EH_SjLj_LongJmp64:
     return emitEHSjLjLongJmp(MI, BB);
 
+  case TargetOpcode::STATEPOINT:
+    // As an implementation detail, STATEPOINT shares the STACKMAP format at
+    // this point in the process.  We diverge later.
+    return emitPatchPoint(MI, BB);
+
   case TargetOpcode::STACKMAP:
   case TargetOpcode::PATCHPOINT:
     return emitPatchPoint(MI, BB);
@@ -22118,9 +19786,9 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
 
   // We're looking for blends between FADD and FSUB nodes. We insist on these
   // nodes being lined up in a specific expected pattern.
-  if (!(isShuffleEquivalent(Mask, 0, 3) ||
-        isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
-        isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
+  if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+        isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
+        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
     return SDValue();
 
   // Only specific types are legal at this point, assert so we notice if and
@@ -22176,7 +19844,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
     EVT SVT = BC0.getValueType();
     unsigned Opcode = BC0.getOpcode();
     unsigned NumElts = VT.getVectorNumElements();
-    
+
     if (BC0.hasOneUse() && SVT.isVector() &&
         SVT.getVectorNumElements() * 2 == NumElts &&
         TLI.isOperationLegal(Opcode, VT)) {
@@ -22304,7 +19972,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                          : InVec.getOperand(1);
 
   // If inputs to shuffle are the same for both ops, then allow 2 uses
-  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+  unsigned AllowedUses = InVec.getNumOperands() > 1 &&
+                         InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
 
   if (LdNode.getOpcode() == ISD::BITCAST) {
     // Don't duplicate a load with other uses.
@@ -22349,9 +20018,30 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                      EltNo);
 }
 
+/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
+/// special and don't usually play with other vector types, it's better to
+/// handle them early to be sure we emit efficient code by avoiding
+/// store-load conversions.
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
+  if (N->getValueType(0) != MVT::x86mmx ||
+      N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
+      N->getOperand(0)->getValueType(0) != MVT::v2i32)
+    return SDValue();
+
+  SDValue V = N->getOperand(0);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
+  if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
+    return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
+                       N->getValueType(0), V.getOperand(0));
+
+  return SDValue();
+}
+
 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
 /// generation and convert it from being a bunch of shuffles and extracts
-/// to a simple store and scalar loads to extract the elements.
+/// into a somewhat faster sequence. For i686, the best sequence is apparently
+/// storing the value and loading scalars back, while for x64 we should
+/// use 64-bit extracts and shifts.
 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI) {
   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
@@ -22360,14 +20050,29 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
 
   SDValue InputVector = N->getOperand(0);
 
-  // Detect whether we are trying to convert from mmx to i32 and the bitcast
-  // from mmx to v2i32 has a single usage.
-  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
-      InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
-      InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
-    return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
-                       N->getValueType(0),
-                       InputVector.getNode()->getOperand(0));
+  // Detect mmx to i32 conversion through a v2i32 elt extract.
+  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+      N->getValueType(0) == MVT::i32 &&
+      InputVector.getValueType() == MVT::v2i32) {
+
+    // The bitcast source is a direct mmx result.
+    SDValue MMXSrc = InputVector.getNode()->getOperand(0);
+    if (MMXSrc.getValueType() == MVT::x86mmx)
+      return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+                         N->getValueType(0),
+                         InputVector.getNode()->getOperand(0));
+
+    // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
+    SDValue MMXSrcOp = MMXSrc.getOperand(0);
+    if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
+        MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
+        MMXSrcOp.getOpcode() == ISD::BITCAST &&
+        MMXSrcOp.getValueType() == MVT::v1i64 &&
+        MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+      return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+                         N->getValueType(0),
+                         MMXSrcOp.getOperand(0));
+  }
 
   // Only operate on vectors of 4 elements, where the alternative shuffling
   // gets to be more expensive.
@@ -22410,36 +20115,61 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   // Ok, we've now decided to do the transformation.
+  // If 64-bit shifts are legal, use the extract-shift sequence,
+  // otherwise bounce the vector off the cache.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Vals[4];
   SDLoc dl(InputVector);
 
-  // Store the value to a temporary stack slot.
-  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
-  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
-                            MachinePointerInfo(), false, false, 0);
+  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
+    SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
+    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
+    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+      DAG.getConstant(0, VecIdxTy));
+    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+      DAG.getConstant(1, VecIdxTy));
+
+    SDValue ShAmt = DAG.getConstant(32,
+      DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
+    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
+    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+      DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
+    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
+    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+      DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
+  } else {
+    // Store the value to a temporary stack slot.
+    SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+      MachinePointerInfo(), false, false, 0);
 
-  // Replace each use (extract) with a load of the appropriate element.
-  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
-       UE = Uses.end(); UI != UE; ++UI) {
-    SDNode *Extract = *UI;
+    EVT ElementType = InputVector.getValueType().getVectorElementType();
+    unsigned EltSize = ElementType.getSizeInBits() / 8;
 
-    // cOMpute the element's address.
-    SDValue Idx = Extract->getOperand(1);
-    unsigned EltSize =
-        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
-    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+    // Replace each use (extract) with a load of the appropriate element.
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t Offset = EltSize * i;
+      SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+
+      SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+                                       StackPtr, OffsetVal);
+
+      // Load the scalar.
+      Vals[i] = DAG.getLoad(ElementType, dl, Ch,
+                            ScalarAddr, MachinePointerInfo(),
+                            false, false, false, 0);
 
-    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
-                                     StackPtr, OffsetVal);
+    }
+  }
 
-    // Load the scalar.
-    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
-                                     ScalarAddr, MachinePointerInfo(),
-                                     false, false, false, 0);
+  // Replace the extracts
+  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+    UE = Uses.end(); UI != UE; ++UI) {
+    SDNode *Extract = *UI;
 
-    // Replace the exact with the load.
-    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+    SDValue Idx = Extract->getOperand(1);
+    uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
   }
 
   // The replacement was made in place; don't return anything.
@@ -22456,6 +20186,21 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
   bool NeedSplit = false;
   switch (VT.getSimpleVT().SimpleTy) {
   default: return std::make_pair(0, false);
+  case MVT::v4i64:
+  case MVT::v2i64:
+    if (!Subtarget->hasVLX())
+      return std::make_pair(0, false);
+    break;
+  case MVT::v64i8:
+  case MVT::v32i16:
+    if (!Subtarget->hasBWI())
+      return std::make_pair(0, false);
+    break;
+  case MVT::v16i32:
+  case MVT::v8i64:
+    if (!Subtarget->hasAVX512())
+      return std::make_pair(0, false);
+    break;
   case MVT::v32i8:
   case MVT::v16i16:
   case MVT::v8i32:
@@ -22522,7 +20267,7 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
 }
 
 static SDValue
-TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget *Subtarget) {
   SDLoc dl(N);
   SDValue Cond = N->getOperand(0);
@@ -22535,18 +20280,6 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
       Cond = CondSrc->getOperand(0);
   }
 
-  MVT VT = N->getSimpleValueType(0);
-  MVT EltVT = VT.getVectorElementType();
-  unsigned NumElems = VT.getVectorNumElements();
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
-
-  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
-    return SDValue();
-  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
-    return SDValue();
-
   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
     return SDValue();
 
@@ -22560,6 +20293,8 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
     return SDValue();
 
+  MVT VT = N->getSimpleValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> ShuffleMask(NumElems, -1);
   for (unsigned i = 0; i < NumElems; ++i) {
     // Be sure we emit undef where we can.
@@ -22569,6 +20304,9 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
   }
 
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
+    return SDValue();
   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
 }
 
@@ -22589,8 +20327,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   // instructions match the semantics of the common C idiom x<y?x:y but not
   // x<=y?x:y, because of how they handle negative zero (which can be
   // ignored in unsafe-math mode).
+  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
-      VT != MVT::f80 && TLI.isTypeLegal(VT) &&
+      VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
       (Subtarget->hasSSE2() ||
        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -23008,96 +20747,31 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // Try to fold this VSELECT into a MOVSS/MOVSD
-  if (N->getOpcode() == ISD::VSELECT &&
-      Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
-    if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
-        (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
-      bool CanFold = false;
-      unsigned NumElems = Cond.getNumOperands();
-      SDValue A = LHS;
-      SDValue B = RHS;
-      
-      if (isZero(Cond.getOperand(0))) {
-        CanFold = true;
-
-        // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
-        // fold (vselect <0,-1> -> (movsd A, B)
-        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
-          CanFold = isAllOnes(Cond.getOperand(i));
-      } else if (isAllOnes(Cond.getOperand(0))) {
-        CanFold = true;
-        std::swap(A, B);
-
-        // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
-        // fold (vselect <-1,0> -> (movsd B, A)
-        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
-          CanFold = isZero(Cond.getOperand(i));
-      }
-
-      if (CanFold) {
-        if (VT == MVT::v4i32 || VT == MVT::v4f32)
-          return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
-        return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
-      }
-
-      if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
-        // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
-        //      (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
-        //                             (v2i64 (bitcast B)))))
-        //
-        // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
-        //      (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
-        //                             (v2f64 (bitcast B)))))
-        //
-        // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
-        //      (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
-        //                             (v2i64 (bitcast A)))))
-        //
-        // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
-        //      (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
-        //                             (v2f64 (bitcast A)))))
-
-        CanFold = (isZero(Cond.getOperand(0)) &&
-                   isZero(Cond.getOperand(1)) &&
-                   isAllOnes(Cond.getOperand(2)) &&
-                   isAllOnes(Cond.getOperand(3)));
-
-        if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
-            isAllOnes(Cond.getOperand(1)) &&
-            isZero(Cond.getOperand(2)) &&
-            isZero(Cond.getOperand(3))) {
-          CanFold = true;
-          std::swap(LHS, RHS);
-        }
-
-        if (CanFold) {
-          EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
-          SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
-          SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
-          SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
-                                                NewB, DAG);
-          return DAG.getNode(ISD::BITCAST, DL, VT, Select);
-        }
-      }
-    }
+  // We should generate an X86ISD::BLENDI from a vselect if its argument
+  // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+  // constants. This specific pattern gets generated when we split a
+  // selector for a 512 bit vector in a machine without AVX512 (but with
+  // 256-bit vectors), during legalization:
+  //
+  // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+  //
+  // Iff we find this pattern and the build_vectors are built from
+  // constants, we translate the vselect into a shuffle_vector that we
+  // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+  if ((N->getOpcode() == ISD::VSELECT ||
+       N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+      !DCI.isBeforeLegalize()) {
+    SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
   }
 
-  // If we know that this node is legal then we know that it is going to be
-  // matched by one of the SSE/AVX BLEND instructions. These instructions only
-  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
-  // to simplify previous instructions.
+  // If this is a *dynamic* select (non-constant condition) and we can match
+  // this node with one of the variable blend instructions, restructure the
+  // condition so that the blends can use the high bit of each element and use
+  // SimplifyDemandedBits to simplify the condition operand.
   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
       !DCI.isBeforeLegalize() &&
-      // We explicitly check against v8i16 and v16i16 because, although
-      // they're marked as Custom, they might only be legal when Cond is a
-      // build_vector of constants. This will be taken care in a later
-      // condition.
-      (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
-       VT != MVT::v8i16) &&
-      // Don't optimize vector of constants. Those are handled by
-      // the generic code and all the bits must be properly set for
-      // the generic optimizer.
       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
 
@@ -23105,6 +20779,31 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     if (BitWidth == 1)
       return SDValue();
 
+    // We can only handle the cases where VSELECT is directly legal on the
+    // subtarget. We custom lower VSELECT nodes with constant conditions and
+    // this makes it hard to see whether a dynamic VSELECT will correctly
+    // lower, so we both check the operation's status and explicitly handle the
+    // cases where a *dynamic* blend will fail even though a constant-condition
+    // blend could be custom lowered.
+    // FIXME: We should find a better way to handle this class of problems.
+    // Potentially, we should combine constant-condition vselect nodes
+    // pre-legalization into shuffles and not mark as many types as custom
+    // lowered.
+    if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+      return SDValue();
+    // FIXME: We don't support i16-element blends currently. We could and
+    // should support them by making *all* the bits in the condition be set
+    // rather than just the high bit and using an i8-element blend.
+    if (VT.getScalarType() == MVT::i16)
+      return SDValue();
+    // Dynamic blending was only available from SSE4.1 onward.
+    if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
+      return SDValue();
+    // Byte blends are only available in AVX2
+    if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
+        !Subtarget->hasAVX2())
+      return SDValue();
+
     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
 
@@ -23153,25 +20852,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // We should generate an X86ISD::BLENDI from a vselect if its argument
-  // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
-  // constants. This specific pattern gets generated when we split a
-  // selector for a 512 bit vector in a machine without AVX512 (but with
-  // 256-bit vectors), during legalization:
-  //
-  // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
-  //
-  // Iff we find this pattern and the build_vectors are built from
-  // constants, we translate the vselect into a shuffle_vector that we
-  // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
-  if ((N->getOpcode() == ISD::VSELECT ||
-       N->getOpcode() == X86ISD::SHRUNKBLEND) &&
-      !DCI.isBeforeLegalize()) {
-    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
-    if (Shuffle.getNode())
-      return Shuffle;
-  }
-
   return SDValue();
 }
 
@@ -23524,7 +21204,7 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
     // fold (blend A, B, allOnes) -> B
     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
       return Op1;
-    
+
     // Simplify the case where the mask is a constant i32 value.
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
       if (C->isNullValue())
@@ -23590,7 +21270,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   EVT VT = N->getValueType(0);
-  if (VT != MVT::i64)
+  if (VT != MVT::i64 && VT != MVT::i32)
     return SDValue();
 
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
@@ -23948,24 +21628,118 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
   }
 }
 
+static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget *Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  // A vector zext_in_reg may be represented as a shuffle,
+  // feeding into a bitcast (this represents anyext) feeding into
+  // an and with a mask.
+  // We'd like to try to combine that into a shuffle with zero
+  // plus a bitcast, removing the and.
+  if (N0.getOpcode() != ISD::BITCAST || 
+      N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+
+  // The other side of the AND should be a splat of 2^C, where C
+  // is the number of bits in the source type.
+  if (N1.getOpcode() == ISD::BITCAST)
+    N1 = N1.getOperand(0);
+  if (N1.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+  BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
+
+  ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
+  EVT SrcType = Shuffle->getValueType(0);
+
+  // We expect a single-source shuffle
+  if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
+    return SDValue();
+
+  unsigned SrcSize = SrcType.getScalarSizeInBits();
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!Vector->isConstantSplat(SplatValue, SplatUndef,
+                                SplatBitSize, HasAnyUndefs))
+    return SDValue();
+
+  unsigned ResSize = N1.getValueType().getScalarSizeInBits();
+  // Make sure the splat matches the mask we expect
+  if (SplatBitSize > ResSize || 
+      (SplatValue + 1).exactLogBase2() != (int)SrcSize)
+    return SDValue();
+
+  // Make sure the input and output size make sense
+  if (SrcSize >= ResSize || ResSize % SrcSize)
+    return SDValue();
+
+  // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
+  // The number of u's between each two values depends on the ratio between
+  // the source and dest type.
+  unsigned ZextRatio = ResSize / SrcSize;
+  bool IsZext = true;
+  for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
+    if (i % ZextRatio) {
+      if (Shuffle->getMaskElt(i) > 0) {
+        // Expected undef
+        IsZext = false;
+        break;
+      }
+    } else {
+      if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
+        // Expected element number
+        IsZext = false;
+        break;
+      }
+    }
+  }
+
+  if (!IsZext)
+    return SDValue();
+
+  // Ok, perform the transformation - replace the shuffle with
+  // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
+  // (instead of undef) where the k elements come from the zero vector.
+  SmallVector<int, 8> Mask;
+  unsigned NumElems = SrcType.getVectorNumElements();
+  for (unsigned i = 0; i < NumElems; ++i)
+    if (i % ZextRatio)
+      Mask.push_back(NumElems);
+    else
+      Mask.push_back(i / ZextRatio);
+
+  SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
+    Shuffle->getOperand(0), DAG.getConstant(0, SrcType), Mask);
+  return DAG.getNode(ISD::BITCAST, DL,  N0.getValueType(), NewShuffle);
+}
+
 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
-  EVT VT = N->getValueType(0);
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget);
+  if (Zext.getNode())
+    return Zext;
+
   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
   if (R.getNode())
     return R;
 
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
   // Create BEXTR instructions
   // BEXTR is ((X >> imm) & (2**size-1))
   if (VT == MVT::i32 || VT == MVT::i64) {
-    SDValue N0 = N->getOperand(0);
-    SDValue N1 = N->getOperand(1);
-    SDLoc DL(N);
-
     // Check for BEXTR.
     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
@@ -23975,7 +21749,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
         uint64_t Mask = MaskNode->getZExtValue();
         uint64_t Shift = ShiftNode->getZExtValue();
         if (isMask_64(Mask)) {
-          uint64_t MaskSize = CountPopulation_64(Mask);
+          uint64_t MaskSize = countPopulation(Mask);
           if (Shift + MaskSize <= VT.getSizeInBits())
             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
                                DAG.getConstant(Shift | (MaskSize << 8), VT));
@@ -23993,10 +21767,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
   if (VT != MVT::v2i64 && VT != MVT::v4i64)
     return SDValue();
 
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDLoc DL(N);
-
   // Check LHS for vnot
   if (N0.getOpcode() == ISD::XOR &&
       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
@@ -24108,8 +21878,8 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
 
   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
   MachineFunction &MF = DAG.getMachineFunction();
-  bool OptForSize = MF.getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  bool OptForSize =
+      MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
 
   // SHLD/SHRD instructions have lower register pressure, but on some
   // platforms they have higher latency than the equivalent
@@ -24233,11 +22003,12 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   SDLoc dl(Ld);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
-  // On Sandybridge unaligned 256bit loads are inefficient.
+  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+  // into two 16-byte operations.
   ISD::LoadExtType Ext = Ld->getExtensionType();
   unsigned Alignment = Ld->getAlignment();
   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
-  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
     unsigned NumElems = RegVT.getVectorNumElements();
     if (NumElems < 2)
@@ -24270,6 +22041,166 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// PerformMLOADCombine - Resolve extending loads
+static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+  if (Mld->getExtensionType() != ISD::SEXTLOAD)
+    return SDValue();
+
+  EVT VT = Mld->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT LdVT = Mld->getMemoryVT();
+  SDLoc dl(Mld);
+
+  assert(LdVT != VT && "Cannot extend to the same type");
+  unsigned ToSz = VT.getVectorElementType().getSizeInBits();
+  unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
+  // From, To sizes and ElemCount must be pow of two
+  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+    "Unexpected size for extending masked load");
+
+  unsigned SizeRatio  = ToSz / FromSz;
+  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
+
+  // Create a type on which we perform the shuffle
+  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+          LdVT.getScalarType(), NumElems*SizeRatio);
+  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+  // Convert Src0 value
+  SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
+  if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
+    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+	    && "WideVecVT should be legal");
+    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
+                                    DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+  }
+  // Prepare the new mask
+  SDValue NewMask;
+  SDValue Mask = Mld->getMask();
+  if (Mask.getValueType() == VT) {
+    // Mask and original value have the same type
+    NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+      ShuffleVec[i] = NumElems*SizeRatio;
+    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+                                   DAG.getConstant(0, WideVecVT),
+                                   &ShuffleVec[0]);
+  }
+  else {
+    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+    unsigned WidenNumElts = NumElems*SizeRatio;
+    unsigned MaskNumElts = VT.getVectorNumElements();
+    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
+                                     WidenNumElts);
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;
+
+    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+  }
+
+  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
+                                     Mld->getBasePtr(), NewMask, WideSrc0,
+                                     Mld->getMemoryVT(), Mld->getMemOperand(),
+                                     ISD::NON_EXTLOAD);
+  SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+
+}
+/// PerformMSTORECombine - Resolve truncating stores
+static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
+                                    const X86Subtarget *Subtarget) {
+  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+  if (!Mst->isTruncatingStore())
+    return SDValue();
+
+  EVT VT = Mst->getValue().getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT StVT = Mst->getMemoryVT();
+  SDLoc dl(Mst);
+
+  assert(StVT != VT && "Cannot truncate to the same type");
+  unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+  unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+  // From, To sizes and ElemCount must be pow of two
+  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+    "Unexpected size for truncating masked store");
+  // We are going to use the original vector elt for storing.
+  // Accumulated smaller vector elements must be a multiple of the store size.
+  assert (((NumElems * FromSz) % ToSz) == 0 &&
+          "Unexpected ratio for truncating masked store");
+
+  unsigned SizeRatio  = FromSz / ToSz;
+  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+  // Create a type on which we perform the shuffle
+  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+          StVT.getScalarType(), NumElems*SizeRatio);
+
+  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+  SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
+  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+  for (unsigned i = 0; i != NumElems; ++i)
+    ShuffleVec[i] = i * SizeRatio;
+
+  // Can't shuffle using an illegal type.
+  assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+	  && "WideVecVT should be legal");
+
+  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+                                        DAG.getUNDEF(WideVecVT),
+                                        &ShuffleVec[0]);
+
+  SDValue NewMask;
+  SDValue Mask = Mst->getMask();
+  if (Mask.getValueType() == VT) {
+    // Mask and original value have the same type
+    NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+      ShuffleVec[i] = NumElems*SizeRatio;
+    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+                                   DAG.getConstant(0, WideVecVT),
+                                   &ShuffleVec[0]);
+  }
+  else {
+    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+    unsigned WidenNumElts = NumElems*SizeRatio;
+    unsigned MaskNumElts = VT.getVectorNumElements();
+    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
+                                     WidenNumElts);
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;
+
+    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+  }
+
+  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
+                            NewMask, StVT, Mst->getMemOperand(), false);
+}
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
@@ -24280,13 +22211,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   SDValue StoredVal = St->getOperand(1);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
-  // If we are saving a concatenation of two XMM registers, perform two stores.
-  // On Sandy Bridge, 256-bit memory operations are executed by two
-  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
-  // memory  operation.
+  // If we are saving a concatenation of two XMM registers and 32-byte stores
+  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
   unsigned Alignment = St->getAlignment();
   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
-  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
       StVT == VT && !IsAligned) {
     unsigned NumElems = VT.getVectorNumElements();
     if (NumElems < 2)
@@ -24352,9 +22281,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
 
     // Find the largest store unit
     MVT StoreType = MVT::i8;
-    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
-         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
-      MVT Tp = (MVT::SimpleValueType)tp;
+    for (MVT Tp : MVT::integer_valuetypes()) {
       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
         StoreType = Tp;
     }
@@ -24399,8 +22326,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   const Function *F = DAG.getMachineFunction().getFunction();
-  bool NoImplicitFloatOps = F->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
                      && Subtarget->hasSSE2();
   if ((VT.isVector() ||
@@ -24500,7 +22426,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// Return 'true' if this vector operation is "horizontal"
 /// and return the operands for the horizontal operation in LHS and RHS.  A
 /// horizontal operation performs the binary operation on successive elements
 /// of its first operand, then on successive elements of its second operand,
@@ -24626,7 +22552,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   return true;
 }
 
-/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+/// Do target-specific dag combines on floating point adds.
 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
   EVT VT = N->getValueType(0);
@@ -24641,7 +22567,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+/// Do target-specific dag combines on floating point subs.
 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
   EVT VT = N->getValueType(0);
@@ -24656,23 +22582,23 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
-/// X86ISD::FXOR nodes.
+/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+
   // F[X]OR(0.0, x) -> x
-  // F[X]OR(x, 0.0) -> x
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(1);
+
+  // F[X]OR(x, 0.0) -> x
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(0);
   return SDValue();
 }
 
-/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
-/// X86ISD::FMAX nodes.
+/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
 
@@ -24693,29 +22619,33 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
                      N->getOperand(0), N->getOperand(1));
 }
 
-/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
+/// Do target-specific dag combines on X86ISD::FAND nodes.
 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
   // FAND(0.0, x) -> 0.0
-  // FAND(x, 0.0) -> 0.0
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(0);
+
+  // FAND(x, 0.0) -> 0.0
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(1);
+  
   return SDValue();
 }
 
-/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes
+/// Do target-specific dag combines on X86ISD::FANDN nodes
 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
-  // FANDN(x, 0.0) -> 0.0
   // FANDN(0.0, x) -> x
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(1);
+
+  // FANDN(x, 0.0) -> 0.0
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(1);
+
   return SDValue();
 }
 
@@ -24978,6 +22908,23 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+                                         SelectionDAG &DAG) {
+  SDLoc dl(Load);
+  MVT VT = Load->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue Addr = Load->getOperand(1);
+  SDValue NewAddr = DAG.getNode(
+      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+      DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+
+  SDValue NewLoad =
+      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                  DAG.getMachineFunction().getMachineMemOperand(
+                      Load->getMemOperand(), 0, EVT.getStoreSize()));
+  return NewLoad;
+}
+
 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget *Subtarget) {
   SDLoc dl(N);
@@ -24989,20 +22936,47 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
   if (MayFoldLoad(Ld)) {
     // Extract the countS bits from the immediate so we can get the proper
     // address when narrowing the vector load to a specific element.
-    // When the second source op is a memory address, interps doesn't use
+    // When the second source op is a memory address, insertps doesn't use
     // countS and just gets an f32 from that address.
     unsigned DestIndex =
         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    
     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
-  } else
-    return SDValue();
 
-  // Create this as a scalar to vector to match the instruction pattern.
-  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
-  // countS bits are ignored when loading from memory on insertps, which
-  // means we don't need to explicitly set them to 0.
-  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
-                     LoadScalarToVector, N->getOperand(2));
+    // Create this as a scalar to vector to match the instruction pattern.
+    SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+    // countS bits are ignored when loading from memory on insertps, which
+    // means we don't need to explicitly set them to 0.
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+                       LoadScalarToVector, N->getOperand(2));
+  }
+  return SDValue();
+}
+
+static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue V0 = N->getOperand(0);
+  SDValue V1 = N->getOperand(1);
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+  // operands and changing the mask to 1. This saves us a bunch of
+  // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+  // x86InstrInfo knows how to commute this back after instruction selection
+  // if it would help register allocation.
+  
+  // TODO: If optimizing for size or a processor that doesn't suffer from
+  // partial register update stalls, this should be transformed into a MOVSD
+  // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+  if (VT == MVT::v2f64)
+    if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+      if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+        SDValue NewMask = DAG.getConstant(1, MVT::i8);
+        return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+      }
+
+  return SDValue();
 }
 
 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
@@ -25134,7 +23108,7 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
 }
 
 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
-                                        const X86TargetLowering *XTLI) {
+                                        const X86Subtarget *Subtarget) {
   // First try to optimize away the conversion entirely when it's
   // conditionally from a constant. Vectors only.
   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
@@ -25160,10 +23134,9 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
     EVT VT = Ld->getValueType(0);
     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
-        !XTLI->getSubtarget()->is64Bit() &&
-        VT == MVT::i64) {
-      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
-                                          Ld->getChain(), Op0, DAG);
+        !Subtarget->is64Bit() && VT == MVT::i64) {
+      SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
+          SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
       return FILDChain;
     }
@@ -25362,6 +23335,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SELECT:
   case X86ISD::SHRUNKBLEND:
     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+  case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
@@ -25374,8 +23348,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
+  case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
-  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
+  case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
+  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
@@ -25414,8 +23390,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
-  case X86ISD::INSERTPS:
-    return PerformINSERTPSCombine(N, DAG, Subtarget);
+  case X86ISD::INSERTPS: {
+    if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
+      return PerformINSERTPSCombine(N, DAG, Subtarget);
+    break;
+  }
+  case X86ISD::BLENDI:    return PerformBLENDICombine(N, DAG);
   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
   }
 
@@ -25841,6 +23821,23 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
       }
     }
     return;
+  case 'L':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
+          (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
+        Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'M':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 3) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
   case 'N':
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
       if (C->getZExtValue() <= 255) {
@@ -25849,6 +23846,14 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
       }
     }
     return;
+  case 'O':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 127) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
   case 'e': {
     // 32-bit signed value
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
@@ -25938,8 +23943,9 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
-std::pair<unsigned, const TargetRegisterClass*>
-X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+std::pair<unsigned, const TargetRegisterClass *>
+X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                const std::string &Constraint,
                                                 MVT VT) const {
   // First, see if this is a constraint that directly corresponds to an LLVM
   // register class.
@@ -26045,7 +24051,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   // Use the default implementation in TargetLowering to convert the register
   // constraint into a member of a register class.
   std::pair<unsigned, const TargetRegisterClass*> Res;
-  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 
   // Not found as a standard register?
   if (!Res.second) {
@@ -26193,7 +24199,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
   // "load" ports instead of the dedicated "store" port.
   // E.g., on Haswell:
   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
-  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.   
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
   if (isLegalAddressingMode(AM, Ty))
     // Scale represents reg2 * scale, thus account for 1
     // as soon as we use a second register.
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 7c6ffa2..4423015 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -158,6 +158,10 @@ namespace llvm {
       /// vector to a GPR.
       MMX_MOVD2W,
 
+      /// MMX_MOVW2D - Copies a GPR into the low 32-bit word of a MMX vector
+      /// and zero out the high word.
+      MMX_MOVW2D,
+
       /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
       /// i32, corresponds to X86::PEXTRB.
       PEXTRB,
@@ -197,7 +201,12 @@ namespace llvm {
 
       /// ADDSUB - Combined add and sub on an FP vector.
       ADDSUB,
-
+      //  FADD, FSUB, FMUL, FDIV, FMIN, FMAX - FP vector ops with rounding mode.
+      FADD_RND,
+      FSUB_RND,
+      FMUL_RND,
+      FDIV_RND,
+      
       // SUBUS - Integer sub with unsigned saturation.
       SUBUS,
 
@@ -378,6 +387,18 @@ namespace llvm {
       FNMSUB,
       FMADDSUB,
       FMSUBADD,
+      // FMA with rounding mode
+      FMADD_RND,
+      FNMADD_RND,
+      FMSUB_RND,
+      FNMSUB_RND,
+      FMADDSUB_RND,
+      FMSUBADD_RND,
+      RNDSCALE,
+
+      // Compress and expand
+      COMPRESS,
+      EXPAND,
 
       // Save xmm argument registers to the stack, according to %al. An operator
       // is needed so that this can be expanded with control flow.
@@ -543,7 +564,8 @@ namespace llvm {
   //  X86 Implementation of the TargetLowering interface
   class X86TargetLowering final : public TargetLowering {
   public:
-    explicit X86TargetLowering(const X86TargetMachine &TM);
+    explicit X86TargetLowering(const X86TargetMachine &TM,
+                               const X86Subtarget &STI);
 
     unsigned getJumpTableEncoding() const override;
 
@@ -629,6 +651,10 @@ namespace llvm {
     /// This method returns the name of a target specific DAG node.
     const char *getTargetNodeName(unsigned Opcode) const override;
 
+    bool isCheapToSpeculateCttz() const override;
+
+    bool isCheapToSpeculateCtlz() const override;
+
     /// Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
 
@@ -675,9 +701,10 @@ namespace llvm {
     /// (e.g. {edx}), return the register number and the register class for the
     /// register.  This should only be used for C_Register constraints.  On
     /// error, this returns a register number of 0.
-    std::pair<unsigned, const TargetRegisterClass*>
-      getRegForInlineAsmConstraint(const std::string &Constraint,
-                                   MVT VT) const override;
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 const std::string &Constraint,
+                                 MVT VT) const override;
 
     /// Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
@@ -724,6 +751,10 @@ namespace llvm {
     bool isZExtFree(EVT VT1, EVT VT2) const override;
     bool isZExtFree(SDValue Val, EVT VT2) const override;
 
+    /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+    /// extend node) is profitable.
+    bool isVectorLoadExtDesirable(SDValue) const override;
+
     /// Return true if an FMA operation is faster than a pair of fmul and fadd
     /// instructions. fmuladd intrinsics will be expanded to FMAs when this
     /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
@@ -762,9 +793,10 @@ namespace llvm {
       return !X86ScalarSSEf64 || VT == MVT::f80;
     }
 
-    const X86Subtarget* getSubtarget() const {
-      return Subtarget;
-    }
+    /// Return true if we believe it is correct and profitable to reduce the
+    /// load node to a smaller type.
+    bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+                               EVT NewVT) const override;
 
     /// Return true if the specified scalar FP type is computed in an SSE
     /// register, not on the X87 floating point stack.
@@ -787,6 +819,10 @@ namespace llvm {
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                            Type *Ty) const override;
 
+    /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+    /// with this index.
+    bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+
     /// Intel processors have a unified instruction and data cache
     const char * getClearCacheBuiltinName() const override {
       return nullptr; // nothing to do, move along.
@@ -810,16 +846,14 @@ namespace llvm {
 
     bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
 
-    /// \brief Reset the operation actions based on target options.
-    void resetOperationActions() override;
-
     bool useLoadStackGuardNode() const override;
     /// \brief Customize the preferred legalization strategy for certain types.
     LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
 
   protected:
-    std::pair<const TargetRegisterClass*, uint8_t>
-    findRepresentativeClass(MVT VT) const override;
+    std::pair<const TargetRegisterClass *, uint8_t>
+    findRepresentativeClass(const TargetRegisterInfo *TRI,
+                            MVT VT) const override;
 
   private:
     /// Keep a pointer to the X86Subtarget around so that we can
@@ -827,10 +861,6 @@ namespace llvm {
     const X86Subtarget *Subtarget;
     const DataLayout *TD;
 
-    /// Used to store the TargetOptions so that we don't waste time resetting
-    /// the operation actions unless we have to.
-    TargetOptions TO;
-
     /// Select between SSE or x87 floating point ops.
     /// When SSE is available, use it for f32 operations.
     /// When SSE2 is available, use it for f64 operations.
@@ -930,7 +960,6 @@ namespace llvm {
     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index b188cd5..4923bc5 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1,10 +1,27 @@
+//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 AVX512 instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
 // Group template arguments that can be derived from the vector type (EltNum x
 // EltVT).  These are things like the register class for the writemask, etc.
 // The idea is to pass one of these as the template argument rather than the
 // individual arguments.
-class X86VectorVTInfo<int numelts, ValueType EltVT, RegisterClass rc,
+// The template is also used for scalar types, in this case numelts is 1.
+class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
                       string suffix = ""> {
   RegisterClass RC = rc;
+  ValueType EltVT = eltvt;
   int NumElts = numelts;
 
   // Corresponding mask register class.
@@ -23,7 +40,13 @@ class X86VectorVTInfo<int numelts, ValueType EltVT, RegisterClass rc,
   // Suffix used in the instruction mnemonic.
   string Suffix = suffix;
 
-  string VTName = "v" # NumElts # EltVT;
+  // VTName is a string name for vector VT. For vector types it will be
+  // v # NumElts # EltVT, so for vector of 8 elements of i32 it will be v8i32
+  // It is a little bit complex for scalar types, where NumElts = 1.
+  // In this case we build v4f32 or v2f64
+  string VTName = "v" # !if (!eq (NumElts, 1),
+                        !if (!eq (EltVT.Size, 32), 4,
+                        !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT;
 
   // The vector VT.
   ValueType VT = !cast<ValueType>(VTName);
@@ -53,14 +76,6 @@ class X86VectorVTInfo<int numelts, ValueType EltVT, RegisterClass rc,
                                             VTName)), VTName));
   PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
 
-  // Load patterns used for memory operands.  We only have this defined in
-  // case of i64 element types for sub-512 integer vectors.  For now, keep
-  // MemOpFrag undefined in these cases.
-  PatFrag MemOpFrag =
-    !if (!eq (TypeVariantName, "f"), !cast<PatFrag>("memop" # VTName),
-    !if (!eq (EltTypeName, "i64"),   !cast<PatFrag>("memop" # VTName),
-    !if (!eq (VTName, "v16i32"),     !cast<PatFrag>("memop" # VTName), ?)));
-
   // The corresponding float type, e.g. v16f32 for v16i32
   // Note: For EltSize < 32, FloatVT is illegal and TableGen
   //       fails to compile, so we choose FloatVT = VT
@@ -86,6 +101,8 @@ class X86VectorVTInfo<int numelts, ValueType EltVT, RegisterClass rc,
                      !if (!eq (EltTypeName, "f64"), SSEPackedDouble,
                      SSEPackedInt));
 
+  RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
+
   // A vector type of the same width with element type i32.  This is used to
   // create the canonical constant zero node ImmAllZerosV.
   ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
@@ -114,6 +131,11 @@ def v2i64x_info  : X86VectorVTInfo<2,  i64, VR128X, "q">;
 def v4f32x_info  : X86VectorVTInfo<4,  f32, VR128X, "ps">;
 def v2f64x_info  : X86VectorVTInfo<2,  f64, VR128X, "pd">;
 
+// We map scalar types to the smallest (128-bit) vector type
+// with the appropriate element type. This allows to use the same masking logic.
+def f32x_info    : X86VectorVTInfo<1,  f32, VR128X, "ss">;
+def f64x_info    : X86VectorVTInfo<1,  f64, VR128X, "sd">;
+
 class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256,
                            X86VectorVTInfo i128> {
   X86VectorVTInfo info512 = i512;
@@ -183,7 +205,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
                                   string OpcodeStr,
                                   string AttSrcAsm, string IntelSrcAsm,
                                   dag RHS, dag MaskingRHS,
-                                  string Round = "",
+                                  SDNode Select = vselect, string Round = "",
                                   string MaskingConstraint = "",
                                   InstrItinClass itin = NoItinerary,
                                   bit IsCommutable = 0> :
@@ -192,11 +214,11 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
                          [(set _.RC:$dst, RHS)],
                          [(set _.RC:$dst, MaskingRHS)],
                          [(set _.RC:$dst,
-                               (vselect _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
+                               (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
                          Round, MaskingConstraint, NoItinerary, IsCommutable>;
 
 // This multiclass generates the unconditional/non-masking, the masking and
-// the zero-masking variant of the instruction.  In the masking case, the
+// the zero-masking variant of the vector instruction.  In the masking case, the
 // perserved vector elements come from a new dummy input operand tied to $dst.
 multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
                            dag Outs, dag Ins, string OpcodeStr,
@@ -208,8 +230,23 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
                           !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
                           !con((ins _.KRCWM:$mask), Ins),
                           OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
-                          (vselect _.KRCWM:$mask, RHS, _.RC:$src0), Round,
-                          "$src0 = $dst", itin, IsCommutable>;
+                          (vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect,
+                          Round, "$src0 = $dst", itin, IsCommutable>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the scalar instruction.
+multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
+                           dag Outs, dag Ins, string OpcodeStr,
+                           string AttSrcAsm, string IntelSrcAsm,
+                           dag RHS, string Round = "",
+                           InstrItinClass itin = NoItinerary,
+                           bit IsCommutable = 0> :
+   AVX512_maskable_common<O, F, _, Outs, Ins,
+                          !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+                          !con((ins _.KRCWM:$mask), Ins),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+                          (X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select,
+                          Round, "$src0 = $dst", itin, IsCommutable>;
 
 // Similar to AVX512_maskable but in this case one of the source operands
 // ($src1) is already tied to $dst so we just use that for the preserved
@@ -364,7 +401,7 @@ multiclass vinsert_for_size_no_alt<int Opcode,
                                    SDNodeXForm INSERT_get_vinsert_imm> {
   let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
     def rr : AVX512AIi8<Opcode, MRMSrcReg, (outs VR512:$dst),
-               (ins VR512:$src1, From.RC:$src2, i8imm:$src3),
+               (ins VR512:$src1, From.RC:$src2, u8imm:$src3),
                "vinsert" # From.EltTypeName # "x" # From.NumElts #
                                                 "\t{$src3, $src2, $src1, $dst|"
                                                    "$dst, $src1, $src2, $src3}",
@@ -375,7 +412,7 @@ multiclass vinsert_for_size_no_alt<int Opcode,
 
     let mayLoad = 1 in
     def rm : AVX512AIi8<Opcode, MRMSrcMem, (outs VR512:$dst),
-               (ins VR512:$src1, From.MemOp:$src2, i8imm:$src3),
+               (ins VR512:$src1, From.MemOp:$src2, u8imm:$src3),
                "vinsert" # From.EltTypeName # "x" # From.NumElts #
                                                 "\t{$src3, $src2, $src1, $dst|"
                                                    "$dst, $src1, $src2, $src3}",
@@ -437,12 +474,12 @@ defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>;
 
 // vinsertps - insert f32 to XMM
 def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
-      (ins VR128X:$src1, VR128X:$src2, i8imm:$src3),
+      (ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
       "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
       [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
       EVEX_4V;
 def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
-      (ins VR128X:$src1, f32mem:$src2, i8imm:$src3),
+      (ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
       "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
       [(set VR128X:$dst, (X86insertps VR128X:$src1,
                           (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
@@ -459,7 +496,7 @@ multiclass vextract_for_size<int Opcode,
                              SDNodeXForm EXTRACT_get_vextract_imm> {
   let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
     defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst),
-                (ins VR512:$src1, i8imm:$idx),
+                (ins VR512:$src1, u8imm:$idx),
                 "vextract" # To.EltTypeName # "x4",
                 "$idx, $src1", "$src1, $idx",
                 [(set To.RC:$dst, (vextract_extract:$idx (From.VT VR512:$src1),
@@ -467,7 +504,7 @@ multiclass vextract_for_size<int Opcode,
               AVX512AIi8Base, EVEX, EVEX_V512;
     let mayStore = 1 in
     def rm : AVX512AIi8<Opcode, MRMDestMem, (outs),
-            (ins To.MemOp:$dst, VR512:$src1, i8imm:$src2),
+            (ins To.MemOp:$dst, VR512:$src1, u8imm:$src2),
             "vextract" # To.EltTypeName # "x4\t{$src2, $src1, $dst|"
                                                "$dst, $src1, $src2}",
             []>, EVEX, EVEX_V512, EVEX_CD8<To.EltSize, CD8VT4>;
@@ -566,13 +603,13 @@ def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)),
 
 // vextractps - extract 32 bits from XMM
 def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
-      (ins VR128X:$src1, i32i8imm:$src2),
+      (ins VR128X:$src1, u8imm:$src2),
       "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
       EVEX;
 
 def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
-      (ins f32mem:$dst, VR128X:$src1, i32i8imm:$src2),
+      (ins f32mem:$dst, VR128X:$src1, u8imm:$src2),
       "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
                           addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>;
@@ -622,6 +659,45 @@ let ExeDomain = SSEPackedDouble in {
                               avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>;
 }
 
+// avx512_broadcast_pat introduces patterns for broadcast with a scalar argument.
+// Later, we can canonize broadcast instructions before ISel phase and 
+// eliminate additional patterns on ISel.
+// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar
+// representations of source
+multiclass avx512_broadcast_pat<string InstName, SDNode OpNode,
+                                X86VectorVTInfo _, RegisterClass SrcRC_v,
+                                RegisterClass SrcRC_s> {
+  def : Pat<(_.VT (OpNode  (_.EltVT SrcRC_s:$src))),
+            (!cast<Instruction>(InstName##"r")
+              (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>;
+
+  let AddedComplexity = 30 in {
+    def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                (OpNode (_.EltVT SrcRC_s:$src)), _.RC:$src0)),
+              (!cast<Instruction>(InstName##"rk") _.RC:$src0, _.KRCWM:$mask,
+                (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>;
+
+    def : Pat<(_.VT(vselect _.KRCWM:$mask,
+                (OpNode (_.EltVT SrcRC_s:$src)), _.ImmAllZerosV)),
+              (!cast<Instruction>(InstName##"rkz") _.KRCWM:$mask,
+                (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>;
+  }
+}
+
+defm : avx512_broadcast_pat<"VBROADCASTSSZ", X86VBroadcast, v16f32_info,
+                            VR128X, FR32X>;
+defm : avx512_broadcast_pat<"VBROADCASTSDZ", X86VBroadcast, v8f64_info,
+                            VR128X, FR64X>;
+
+let Predicates = [HasVLX] in {
+  defm : avx512_broadcast_pat<"VBROADCASTSSZ256", X86VBroadcast,
+                              v8f32x_info, VR128X, FR32X>;
+  defm : avx512_broadcast_pat<"VBROADCASTSSZ128", X86VBroadcast,
+                              v4f32x_info, VR128X, FR32X>;
+  defm : avx512_broadcast_pat<"VBROADCASTSDZ256", X86VBroadcast,
+                              v4f64x_info, VR128X, FR64X>;
+}
+
 def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
           (VBROADCASTSSZm addr:$src)>;
 def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
@@ -632,74 +708,84 @@ def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
 def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
           (VBROADCASTSDZm addr:$src)>;
 
-multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr,
-                          RegisterClass SrcRC, RegisterClass KRC> {
-  def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src),
-                   !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
-                   []>, EVEX, EVEX_V512;
-  def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), 
-                   (ins KRC:$mask, SrcRC:$src),
-                   !strconcat(OpcodeStr, 
-                        " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
-                   []>, EVEX, EVEX_V512, EVEX_KZ;
-}
-
-defm VPBROADCASTDr  : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>;
-defm VPBROADCASTQr  : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>,
-                                            VEX_W;
-                                            
+multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
+                                    RegisterClass SrcRC> {
+  defm r : AVX512_maskable_in_asm<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                           (ins SrcRC:$src),  "vpbroadcast"##_.Suffix,
+                           "$src", "$src", []>, T8PD, EVEX;
+}
+
+multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
+                                       RegisterClass SrcRC, Predicate prd> {
+  let Predicates = [prd] in
+    defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512;
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256;
+    defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128;
+  }
+}
+
+defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR32,
+                                                 HasBWI>;
+defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR32,
+                                                 HasBWI>;
+defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32,
+                                                 HasAVX512>;
+defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64,
+                                                 HasAVX512>, VEX_W;
+
 def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
-           (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
+           (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
 
 def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
-           (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
+           (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
 
 def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
-        (VPBROADCASTDrZrr GR32:$src)>;
+        (VPBROADCASTDrZr GR32:$src)>;
 def : Pat<(v16i32 (X86VBroadcastm VK16WM:$mask, (i32 GR32:$src))),
-        (VPBROADCASTDrZkrr VK16WM:$mask, GR32:$src)>;
+        (VPBROADCASTDrZrkz VK16WM:$mask, GR32:$src)>;
 def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
-        (VPBROADCASTQrZrr GR64:$src)>;
+        (VPBROADCASTQrZr GR64:$src)>;
 def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))),
-        (VPBROADCASTQrZkrr VK8WM:$mask, GR64:$src)>;
+        (VPBROADCASTQrZrkz VK8WM:$mask, GR64:$src)>;
 
 def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))),
-        (VPBROADCASTDrZrr GR32:$src)>;
+        (VPBROADCASTDrZr GR32:$src)>;
 def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))),
-        (VPBROADCASTQrZrr GR64:$src)>;
+        (VPBROADCASTQrZr GR64:$src)>;
 
 def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src),
                    (v16i32 immAllZerosV), (i16 GR16:$mask))),
-          (VPBROADCASTDrZkrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>;
+          (VPBROADCASTDrZrkz (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>;
 def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src),
                    (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))),
-          (VPBROADCASTQrZkrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>;
+          (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>;
 
 multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, PatFrag ld_frag,
                           RegisterClass DstRC, ValueType OpVT, ValueType SrcVT,
                           RegisterClass KRC> {
   def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src),
-                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set DstRC:$dst,
                     (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX;
   def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
                                                          VR128X:$src),
-                    !strconcat(OpcodeStr, 
-                    " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+                    !strconcat(OpcodeStr,
+                    "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
                     [(set DstRC:$dst,
                       (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>,
                     EVEX, EVEX_KZ;
   let mayLoad = 1 in {
   def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
-                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
-                  [(set DstRC:$dst, 
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set DstRC:$dst,
                     (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX;
   def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
                                                          x86memop:$src),
-                  !strconcat(OpcodeStr, 
-                      " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
-                  [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, 
+                  !strconcat(OpcodeStr,
+                      "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+                  [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask,
                                      (ld_frag addr:$src))))]>, EVEX, EVEX_KZ;
   }
 }
@@ -716,12 +802,12 @@ multiclass avx512_int_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
                           RegisterClass KRC> {
   let mayLoad = 1 in {
   def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86memop:$src),
-                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   []>, EVEX;
   def krm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask,
                                                          x86memop:$src),
                   !strconcat(OpcodeStr,
-                      " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+                      "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
                   []>, EVEX, EVEX_KZ;
   }
 }
@@ -752,7 +838,7 @@ def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))),
           (VBROADCASTSSZr VR128X:$src)>;
 def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))),
           (VBROADCASTSDZr VR128X:$src)>;
-    
+
 // Provide fallback in case the load node that is used in the patterns above
 // is used by additional users, which prevents the pattern selection.
 def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
@@ -763,7 +849,7 @@ def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
 
 let Predicates = [HasAVX512] in {
 def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))),
-           (EXTRACT_SUBREG 
+           (EXTRACT_SUBREG
               (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
                        addr:$src)), sub_ymm)>;
 }
@@ -775,15 +861,15 @@ multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
                        RegisterClass KRC> {
 let Predicates = [HasCDI] in
 def Zrr : AVX512XS8I<opc, MRMSrcReg, (outs VR512:$dst), (ins KRC:$src),
-                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   []>, EVEX, EVEX_V512;
-                  
+
 let Predicates = [HasCDI, HasVLX] in {
 def Z128rr : AVX512XS8I<opc, MRMSrcReg, (outs VR128:$dst), (ins KRC:$src),
-                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   []>, EVEX, EVEX_V128;
 def Z256rr : AVX512XS8I<opc, MRMSrcReg, (outs VR256:$dst), (ins KRC:$src),
-                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   []>, EVEX, EVEX_V256;
 }
 }
@@ -803,18 +889,18 @@ multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                            X86VectorVTInfo _> {
   let ExeDomain = _.ExeDomain in {
   def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst),
-                     (ins _.RC:$src1, i8imm:$src2),
+                     (ins _.RC:$src1, u8imm:$src2),
                      !strconcat(OpcodeStr,
-                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set _.RC:$dst,
                        (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>,
                      EVEX;
   def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst),
-                     (ins _.MemOp:$src1, i8imm:$src2),
+                     (ins _.MemOp:$src1, u8imm:$src2),
                      !strconcat(OpcodeStr,
-                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set _.RC:$dst,
-                       (_.VT (OpNode (_.MemOpFrag addr:$src1),
+                       (_.VT (OpNode (_.LdFrag addr:$src1),
                               (i8 imm:$src2))))]>,
            EVEX, EVEX_CD8<_.EltSize, CD8VF>;
 }
@@ -827,7 +913,7 @@ multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _,
     def rr : AVX5128I<OpcVar, MRMSrcReg, (outs _.RC:$dst),
                      (ins _.RC:$src1, _.RC:$src2),
                      !strconcat("vpermil" # _.Suffix,
-                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set _.RC:$dst,
                          (_.VT (X86VPermilpv _.RC:$src1,
                                   (Ctrl.VT Ctrl.RC:$src2))))]>,
@@ -835,10 +921,10 @@ multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _,
     def rm : AVX5128I<OpcVar, MRMSrcMem, (outs _.RC:$dst),
                      (ins _.RC:$src1, Ctrl.MemOp:$src2),
                      !strconcat("vpermil" # _.Suffix,
-                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set _.RC:$dst,
                          (_.VT (X86VPermilpv _.RC:$src1,
-                                  (Ctrl.VT (Ctrl.MemOpFrag addr:$src2)))))]>,
+                                  (Ctrl.VT (Ctrl.LdFrag addr:$src2)))))]>,
              EVEX_4V;
   }
 }
@@ -859,34 +945,34 @@ def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))),
           (VPERMILPDZri VR512:$src1, imm:$imm)>;
 
 // -- VPERM - register form --
-multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, 
+multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                      PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> {
 
   def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2),
                    !strconcat(OpcodeStr,
-                       " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                      (OpVT (X86VPermv RC:$src1, RC:$src2)))]>, EVEX_4V;
 
   def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
-                       " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                      (OpVT (X86VPermv RC:$src1, (mem_frag addr:$src2))))]>,
                      EVEX_4V;
 }
 
-defm VPERMDZ   : avx512_perm<0x36, "vpermd",  VR512,  memopv16i32, i512mem,
+defm VPERMDZ   : avx512_perm<0x36, "vpermd",  VR512,  loadv16i32, i512mem,
                            v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMQZ   : avx512_perm<0x36, "vpermq",  VR512,  memopv8i64,  i512mem, 
+defm VPERMQZ   : avx512_perm<0x36, "vpermq",  VR512,  loadv8i64,  i512mem,
                            v8i64>,  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
 let ExeDomain = SSEPackedSingle in
-defm VPERMPSZ  : avx512_perm<0x16, "vpermps", VR512,  memopv16f32, f512mem,
+defm VPERMPSZ  : avx512_perm<0x16, "vpermps", VR512,  loadv16f32, f512mem,
                            v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
 let ExeDomain = SSEPackedDouble in
-defm VPERMPDZ  : avx512_perm<0x16, "vpermpd", VR512,  memopv8f64, f512mem, 
+defm VPERMPDZ  : avx512_perm<0x16, "vpermpd", VR512,  loadv8f64, f512mem,
                            v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
 
 // -- VPERM2I - 3 source operands form --
@@ -897,7 +983,7 @@ let Constraints = "$src1 = $dst" in {
   def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, RC:$src3),
                    !strconcat(OpcodeStr,
-                       " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set RC:$dst,
                      (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
                     EVEX_4V;
@@ -905,7 +991,7 @@ let Constraints = "$src1 = $dst" in {
   def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3),
                    !strconcat(OpcodeStr,
-                       " \t{$src3, $src2, $dst {${mask}}|"
+                       "\t{$src3, $src2, $dst {${mask}}|"
                        "$dst {${mask}}, $src2, $src3}"),
                    [(set RC:$dst, (OpVT (vselect KRC:$mask,
                                            (OpNode RC:$src1, RC:$src2,
@@ -917,7 +1003,7 @@ let Constraints = "$src1 = $dst" in {
     def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3),
                    !strconcat(OpcodeStr,
-                       " \t{$src3, $src2, $dst {${mask}} {z} |",
+                       "\t{$src3, $src2, $dst {${mask}} {z} |",
                        "$dst {${mask}} {z}, $src2, $src3}"),
                    [(set RC:$dst, (OpVT (vselect KRC:$mask,
                                            (OpNode RC:$src1, RC:$src2,
@@ -929,7 +1015,7 @@ let Constraints = "$src1 = $dst" in {
   def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, x86memop:$src3),
                    !strconcat(OpcodeStr,
-                    " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set RC:$dst,
                      (OpVT (OpNode RC:$src1, RC:$src2,
                       (mem_frag addr:$src3))))]>, EVEX_4V;
@@ -937,7 +1023,7 @@ let Constraints = "$src1 = $dst" in {
   def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3),
                    !strconcat(OpcodeStr,
-                    " \t{$src3, $src2, $dst {${mask}}|"
+                    "\t{$src3, $src2, $dst {${mask}}|"
                     "$dst {${mask}}, $src2, $src3}"),
                    [(set RC:$dst,
                        (OpVT (vselect KRC:$mask,
@@ -950,7 +1036,7 @@ let Constraints = "$src1 = $dst" in {
     def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3),
                    !strconcat(OpcodeStr,
-                    " \t{$src3, $src2, $dst {${mask}} {z}|"
+                    "\t{$src3, $src2, $dst {${mask}} {z}|"
                     "$dst {${mask}} {z}, $src2, $src3}"),
                    [(set RC:$dst,
                      (OpVT (vselect KRC:$mask,
@@ -961,16 +1047,16 @@ let Constraints = "$src1 = $dst" in {
                     EVEX_4V, EVEX_KZ;
   }
 }
-defm VPERMI2D  : avx512_perm_3src<0x76, "vpermi2d",  VR512, memopv16i32,
+defm VPERMI2D  : avx512_perm_3src<0x76, "vpermi2d",  VR512, loadv16i32,
                                   i512mem, X86VPermiv3, v16i32, VK16WM>,
                  EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMI2Q  : avx512_perm_3src<0x76, "vpermi2q",  VR512, memopv8i64,
+defm VPERMI2Q  : avx512_perm_3src<0x76, "vpermi2q",  VR512, loadv8i64,
                                   i512mem, X86VPermiv3, v8i64, VK8WM>,
                  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps",  VR512, memopv16f32,
+defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps",  VR512, loadv16f32,
                                   i512mem, X86VPermiv3, v16f32, VK16WM>,
                  EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd",  VR512, memopv8f64,
+defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd",  VR512, loadv8f64,
                                   i512mem, X86VPermiv3, v8f64, VK8WM>,
                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
 
@@ -990,93 +1076,126 @@ multiclass avx512_perm_table_3src<bits<8> opc, string Suffix, RegisterClass RC,
               (MaskVT (COPY_TO_REGCLASS MRC:$mask, KRC)), VR512:$idx, VR512:$src2)>;
 }
 
-defm VPERMT2D  : avx512_perm_table_3src<0x7E, "d",  VR512, memopv16i32, i512mem,
+defm VPERMT2D  : avx512_perm_table_3src<0x7E, "d",  VR512, loadv16i32, i512mem,
                                X86VPermv3, v16i32, VK16WM, v16i1, GR16>,
                  EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMT2Q  : avx512_perm_table_3src<0x7E, "q",  VR512, memopv8i64, i512mem,
+defm VPERMT2Q  : avx512_perm_table_3src<0x7E, "q",  VR512, loadv8i64, i512mem,
                                X86VPermv3, v8i64, VK8WM, v8i1, GR8>,
                  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps",  VR512, memopv16f32, i512mem,
+defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps",  VR512, loadv16f32, i512mem,
                                X86VPermv3, v16f32, VK16WM, v16i1, GR16>,
                  EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd",  VR512, memopv8f64, i512mem,
+defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd",  VR512, loadv8f64, i512mem,
                                X86VPermv3, v8f64, VK8WM, v8i1, GR8>,
                  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
 
 //===----------------------------------------------------------------------===//
 // AVX-512 - BLEND using mask
 //
-multiclass avx512_blendmask<bits<8> opc, string OpcodeStr,
-                          RegisterClass KRC, RegisterClass RC,
-                          X86MemOperand x86memop, PatFrag mem_frag,
-                          SDNode OpNode, ValueType vt> {
-  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
-             (ins KRC:$mask, RC:$src1, RC:$src2),
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+             (ins _.RC:$src1, _.RC:$src2),
              !strconcat(OpcodeStr,
-             " \t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
-             [(set RC:$dst, (OpNode KRC:$mask, (vt RC:$src2),
-                 (vt RC:$src1)))]>, EVEX_4V, EVEX_K;
-  let mayLoad = 1 in
-  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
-             (ins KRC:$mask, RC:$src1, x86memop:$src2),
+             "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"),
+             []>, EVEX_4V;
+  def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+             (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+             !strconcat(OpcodeStr,
+             "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+             [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1),
+                 (_.VT _.RC:$src2)))]>, EVEX_4V, EVEX_K;
+  def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+             (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+             !strconcat(OpcodeStr,
+             "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
+             []>, EVEX_4V, EVEX_KZ;
+  let mayLoad = 1 in {
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+             (ins _.RC:$src1, _.MemOp:$src2),
              !strconcat(OpcodeStr,
-             " \t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
-             []>, EVEX_4V, EVEX_K;
+             "\t{$src2, $src1, ${dst} |${dst},  $src1, $src2}"),
+             []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+  def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+             (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+             !strconcat(OpcodeStr,
+             "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+             [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1),
+              (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+              EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
+  def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+             (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+             !strconcat(OpcodeStr,
+             "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
+             []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>;
+  }
+  }
 }
+multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+
+  def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+      (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
+       !strconcat(OpcodeStr,
+            "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+            "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+      [(set _.RC:$dst,(X86select _.KRCWM:$mask, (_.VT _.RC:$src1),
+                       (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>,
+      EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+
+  def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+      (ins _.RC:$src1, _.ScalarMemOp:$src2),
+       !strconcat(OpcodeStr,
+            "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
+            "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+      []>,  EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+
+}
+
+multiclass blendmask_dq <bits<8> opc, string OpcodeStr,
+                                 AVX512VLVectorVTInfo VTInfo> {
+  defm Z : avx512_blendmask      <opc, OpcodeStr, VTInfo.info512>,
+           avx512_blendmask_rmb  <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [HasVLX] in {
+    defm Z256 : avx512_blendmask<opc, OpcodeStr, VTInfo.info256>,
+                avx512_blendmask_rmb  <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+    defm Z128 : avx512_blendmask<opc, OpcodeStr, VTInfo.info128>,
+                avx512_blendmask_rmb  <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+  }
+}
+
+multiclass blendmask_bw <bits<8> opc, string OpcodeStr,
+                         AVX512VLVectorVTInfo VTInfo> {
+  let Predicates = [HasBWI] in
+    defm Z : avx512_blendmask    <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [HasBWI, HasVLX] in {
+    defm Z256 : avx512_blendmask <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+    defm Z128 : avx512_blendmask <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+  }
+}
+
+
+defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>;
+defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W;
+defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>;
+defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W;
+defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>;
+defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
 
-let ExeDomain = SSEPackedSingle in
-defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", 
-                              VK16WM, VR512, f512mem,
-                              memopv16f32, vselect, v16f32>, 
-                              EVEX_CD8<32, CD8VF>, EVEX_V512;
-let ExeDomain = SSEPackedDouble in
-defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", 
-                              VK8WM, VR512, f512mem,
-                              memopv8f64, vselect, v8f64>, 
-                              VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
-
-def : Pat<(v16f32 (int_x86_avx512_mask_blend_ps_512 (v16f32 VR512:$src1),
-                 (v16f32 VR512:$src2), (i16 GR16:$mask))),
-        (VBLENDMPSZrr (COPY_TO_REGCLASS GR16:$mask, VK16WM),
-         VR512:$src1, VR512:$src2)>;
-
-def : Pat<(v8f64 (int_x86_avx512_mask_blend_pd_512 (v8f64 VR512:$src1),
-                 (v8f64 VR512:$src2), (i8 GR8:$mask))),
-        (VBLENDMPDZrr (COPY_TO_REGCLASS GR8:$mask, VK8WM),
-         VR512:$src1, VR512:$src2)>;
-
-defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", 
-                              VK16WM, VR512, f512mem, 
-                              memopv16i32, vselect, v16i32>, 
-                              EVEX_CD8<32, CD8VF>, EVEX_V512;
-
-defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", 
-                              VK8WM, VR512, f512mem, 
-                              memopv8i64, vselect, v8i64>, 
-                              VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
-
-def : Pat<(v16i32 (int_x86_avx512_mask_blend_d_512 (v16i32 VR512:$src1),
-                 (v16i32 VR512:$src2), (i16 GR16:$mask))),
-        (VPBLENDMDZrr (COPY_TO_REGCLASS GR16:$mask, VK16),
-         VR512:$src1, VR512:$src2)>;
-
-def : Pat<(v8i64 (int_x86_avx512_mask_blend_q_512 (v8i64 VR512:$src1),
-                 (v8i64 VR512:$src2), (i8 GR8:$mask))),
-        (VPBLENDMQZrr (COPY_TO_REGCLASS GR8:$mask, VK8),
-         VR512:$src1, VR512:$src2)>;
 
 let Predicates = [HasAVX512] in {
 def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
                             (v8f32 VR256X:$src2))),
-            (EXTRACT_SUBREG 
-              (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), 
+            (EXTRACT_SUBREG
+              (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
             (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
             (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
 
 def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
                             (v8i32 VR256X:$src2))),
-            (EXTRACT_SUBREG 
-                (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), 
+            (EXTRACT_SUBREG
+                (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
             (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
             (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
 }
@@ -1086,35 +1205,40 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
 
 // avx512_cmp_scalar - AVX512 CMPSS and CMPSD
 multiclass avx512_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
-                            Operand CC, SDNode OpNode, ValueType VT,
-                            PatFrag ld_frag, string asm, string asm_alt> {
+                            SDNode OpNode, ValueType VT,
+                            PatFrag ld_frag, string Suffix> {
   def rr : AVX512Ii8<0xC2, MRMSrcReg,
-                (outs VK1:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+                (outs VK1:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc),
+                !strconcat("vcmp${cc}", Suffix,
+                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VK1:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
                 IIC_SSE_ALU_F32S_RR>, EVEX_4V;
   def rm : AVX512Ii8<0xC2, MRMSrcMem,
-                (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+                (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc),
+                !strconcat("vcmp${cc}", Suffix,
+                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VK1:$dst, (OpNode (VT RC:$src1),
                 (ld_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
   let isAsmParserOnly = 1, hasSideEffects = 0 in {
     def rri_alt : AVX512Ii8<0xC2, MRMSrcReg,
-               (outs VK1:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
-               asm_alt, [], IIC_SSE_ALU_F32S_RR>, EVEX_4V;
+               (outs VK1:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
+               !strconcat("vcmp", Suffix,
+                          "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+               [], IIC_SSE_ALU_F32S_RR>, EVEX_4V;
+    let mayLoad = 1 in
     def rmi_alt : AVX512Ii8<0xC2, MRMSrcMem,
-               (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
-               asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+               (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
+               !strconcat("vcmp", Suffix,
+                          "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
   }
 }
 
 let Predicates = [HasAVX512] in {
-defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, AVXCC, X86cmpms, f32, loadf32,
-                 "vcmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                 "vcmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
-                 XS;
-defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, AVXCC, X86cmpms, f64, loadf64,
-                 "vcmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                 "vcmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
-                 XD, VEX_W;
+defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, X86cmpms, f32, loadf32, "ss">,
+                                 XS;
+defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, X86cmpms, f64, loadf64, "sd">,
+                                 XD, VEX_W;
 }
 
 multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -1249,7 +1373,7 @@ def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
 multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
                           X86VectorVTInfo _> {
   def rri : AVX512AIi8<opc, MRMSrcReg,
-             (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
              !strconcat("vpcmp${cc}", Suffix,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
@@ -1257,7 +1381,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
              IIC_SSE_ALU_F32P_RR>, EVEX_4V;
   let mayLoad = 1 in
   def rmi : AVX512AIi8<opc, MRMSrcMem,
-             (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
              !strconcat("vpcmp${cc}", Suffix,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
@@ -1266,7 +1390,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
              IIC_SSE_ALU_F32P_RM>, EVEX_4V;
   def rrik : AVX512AIi8<opc, MRMSrcReg,
               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
-                                      AVXCC:$cc),
+                                      AVX512ICC:$cc),
               !strconcat("vpcmp${cc}", Suffix,
                          "\t{$src2, $src1, $dst {${mask}}|",
                          "$dst {${mask}}, $src1, $src2}"),
@@ -1277,7 +1401,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
   let mayLoad = 1 in
   def rmik : AVX512AIi8<opc, MRMSrcMem,
               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
-                                    AVXCC:$cc),
+                                    AVX512ICC:$cc),
               !strconcat("vpcmp${cc}", Suffix,
                          "\t{$src2, $src1, $dst {${mask}}|",
                          "$dst {${mask}}, $src1, $src2}"),
@@ -1290,25 +1414,27 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
   // Accept explicit immediate argument form instead of comparison code.
   let isAsmParserOnly = 1, hasSideEffects = 0 in {
     def rri_alt : AVX512AIi8<opc, MRMSrcReg,
-               (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, i8imm:$cc),
+               (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
                !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
                           "$dst, $src1, $src2, $cc}"),
                [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+    let mayLoad = 1 in
     def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
-               (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i8imm:$cc),
+               (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
                !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
                           "$dst, $src1, $src2, $cc}"),
                [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
     def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
                (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
-                                       i8imm:$cc),
+                                       u8imm:$cc),
                !strconcat("vpcmp", Suffix,
                           "\t{$cc, $src2, $src1, $dst {${mask}}|",
                           "$dst {${mask}}, $src1, $src2, $cc}"),
                [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+    let mayLoad = 1 in
     def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
                (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
-                                       i8imm:$cc),
+                                       u8imm:$cc),
                !strconcat("vpcmp", Suffix,
                           "\t{$cc, $src2, $src1, $dst {${mask}}|",
                           "$dst {${mask}}, $src1, $src2, $cc}"),
@@ -1319,10 +1445,9 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
 multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
                               X86VectorVTInfo _> :
            avx512_icmp_cc<opc, Suffix, OpNode, _> {
-  let mayLoad = 1 in {
   def rmib : AVX512AIi8<opc, MRMSrcMem,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
-                                     AVXCC:$cc),
+                                     AVX512ICC:$cc),
              !strconcat("vpcmp${cc}", Suffix,
                         "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
                         "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
@@ -1332,7 +1457,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
   def rmibk : AVX512AIi8<opc, MRMSrcMem,
               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
-                                       _.ScalarMemOp:$src2, AVXCC:$cc),
+                                       _.ScalarMemOp:$src2, AVX512ICC:$cc),
               !strconcat("vpcmp${cc}", Suffix,
                        "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
                        "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
@@ -1341,20 +1466,19 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
                                     (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
                                     imm:$cc)))],
               IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
-  }
 
   // Accept explicit immediate argument form instead of comparison code.
-  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+  let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
     def rmib_alt : AVX512AIi8<opc, MRMSrcMem,
                (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
-                                       i8imm:$cc),
+                                       u8imm:$cc),
                !strconcat("vpcmp", Suffix,
                    "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
                    "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
                [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
     def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
                (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
-                                       _.ScalarMemOp:$src2, i8imm:$cc),
+                                       _.ScalarMemOp:$src2, u8imm:$cc),
                !strconcat("vpcmp", Suffix,
                   "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
                   "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
@@ -1414,30 +1538,32 @@ multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
   def rri : AVX512PIi8<0xC2, MRMSrcReg,
              (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc),
              !strconcat("vcmp${cc}", suffix,
-                        " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set KRC:$dst, (X86cmpm (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>;
+  let hasSideEffects = 0 in
   def rrib: AVX512PIi8<0xC2, MRMSrcReg,
              (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc),
      !strconcat("vcmp${cc}", suffix,
-                " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"),
+                "\t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"),
                 [], d>, EVEX_B;
   def rmi : AVX512PIi8<0xC2, MRMSrcMem,
              (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc),
               !strconcat("vcmp${cc}", suffix,
-                         " \t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
              [(set KRC:$dst,
-              (X86cmpm (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>;
+              (X86cmpm (vt RC:$src1), (load addr:$src2), imm:$cc))], d>;
 
   // Accept explicit immediate argument form instead of comparison code.
   let isAsmParserOnly = 1, hasSideEffects = 0 in {
     def rri_alt : AVX512PIi8<0xC2, MRMSrcReg,
-               (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               (outs KRC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
               !strconcat("vcmp", suffix,
-                        " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
+                        "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
+    let mayLoad = 1 in
     def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem,
-               (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
               !strconcat("vcmp", suffix,
-                        " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
+                        "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
   }
 }
 
@@ -1465,25 +1591,25 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
             imm:$cc), VK8)>;
 
 def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1),
-                (v16f32 VR512:$src2), imm:$cc, (i16 -1),
+                (v16f32 VR512:$src2), i8immZExt5:$cc, (i16 -1),
                  FROUND_NO_EXC)),
           (COPY_TO_REGCLASS (VCMPPSZrrib VR512:$src1, VR512:$src2,
                              (I8Imm imm:$cc)), GR16)>;
-           
+
 def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1),
-                (v8f64 VR512:$src2), imm:$cc, (i8 -1),
+                (v8f64 VR512:$src2), i8immZExt5:$cc, (i8 -1),
                  FROUND_NO_EXC)),
           (COPY_TO_REGCLASS (VCMPPDZrrib VR512:$src1, VR512:$src2,
                              (I8Imm imm:$cc)), GR8)>;
 
 def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1),
-                (v16f32 VR512:$src2), imm:$cc, (i16 -1),
+                (v16f32 VR512:$src2), i8immZExt5:$cc, (i16 -1),
                 FROUND_CURRENT)),
           (COPY_TO_REGCLASS (VCMPPSZrri VR512:$src1, VR512:$src2,
                              (I8Imm imm:$cc)), GR16)>;
 
 def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1),
-                (v8f64 VR512:$src2), imm:$cc, (i8 -1),
+                (v8f64 VR512:$src2), i8immZExt5:$cc, (i8 -1),
                  FROUND_CURRENT)),
           (COPY_TO_REGCLASS (VCMPPDZrri VR512:$src1, VR512:$src2,
                              (I8Imm imm:$cc)), GR8)>;
@@ -1495,17 +1621,18 @@ def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1),
 //
 multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
                          string OpcodeStr, RegisterClass KRC,
-                         ValueType vvt, ValueType ivt, X86MemOperand x86memop> {
+                         ValueType vvt, X86MemOperand x86memop> {
   let hasSideEffects = 0 in {
     def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
-               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
     let mayLoad = 1 in
     def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
-               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
-               [(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>;
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+               [(set KRC:$dst, (vvt (load addr:$src)))]>;
     let mayStore = 1 in
     def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
-               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+               [(store KRC:$src, addr:$dst)]>;
   }
 }
 
@@ -1514,34 +1641,32 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
                              RegisterClass KRC, RegisterClass GRC> {
   let hasSideEffects = 0 in {
     def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
-               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
     def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
-               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
   }
 }
 
 let Predicates = [HasDQI] in
-  defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8,
-                               i8mem>,
+  defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
                avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
                VEX, PD;
 
 let Predicates = [HasAVX512] in
-  defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16,
-                               i16mem>,
+  defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
                avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
                VEX, PS;
 
 let Predicates = [HasBWI] in {
-  defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32,
-                               i32mem>, VEX, PD, VEX_W;
+  defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>,
+               VEX, PD, VEX_W;
   defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
                VEX, XD;
 }
 
 let Predicates = [HasBWI] in {
-  defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64,
-                               i64mem>, VEX, PS, VEX_W;
+  defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
+               VEX, PS, VEX_W;
   defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
                VEX, XD, VEX_W;
 }
@@ -1572,24 +1697,34 @@ let Predicates = [HasBWI] in {
 let Predicates = [HasDQI] in {
   def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
             (KMOVBmk addr:$dst, VK8:$src)>;
+  def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+            (KMOVBkm addr:$src)>;
+}
+let Predicates = [HasAVX512, NoDQI] in {
+  def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
+            (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
+  def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+            (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
 }
 let Predicates = [HasAVX512] in {
   def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
             (KMOVWmk addr:$dst, VK16:$src)>;
-  def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
-            (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
   def : Pat<(i1 (load addr:$src)),
             (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>;
-  def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
-            (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
+  def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
+            (KMOVWkm addr:$src)>;
 }
 let Predicates = [HasBWI] in {
   def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
             (KMOVDmk addr:$dst, VK32:$src)>;
+  def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
+            (KMOVDkm addr:$src)>;
 }
 let Predicates = [HasBWI] in {
   def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
             (KMOVQmk addr:$dst, VK64:$src)>;
+  def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
+            (KMOVQkm addr:$src)>;
 }
 
 let Predicates = [HasAVX512] in {
@@ -1666,7 +1801,7 @@ multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
                             Predicate prd> {
   let Predicates = [prd] in
     def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
-               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                [(set KRC:$dst, (OpNode KRC:$src))]>;
 }
 
@@ -1703,7 +1838,7 @@ let Predicates = [HasBWI] in
 def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), (KNOTQrr VK64:$src1)>;
 
 // KNL does not support KMOVB, 8-bit mask is promoted to 16-bit
-let Predicates = [HasAVX512] in {
+let Predicates = [HasAVX512, NoDQI] in {
 def : Pat<(xor VK8:$src1,  (v8i1 immAllOnesV)),
           (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>;
 
@@ -1720,7 +1855,7 @@ multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
   let Predicates = [prd] in
     def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
                !strconcat(OpcodeStr,
-                          " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
 }
 
@@ -1796,7 +1931,7 @@ multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr,
   let Predicates = [HasAVX512] in
     def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
                !strconcat(OpcodeStr,
-                          " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
 }
 
 multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> {
@@ -1825,35 +1960,50 @@ multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
                             SDNode OpNode> {
   let Predicates = [HasAVX512], Defs = [EFLAGS] in
     def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
-               !strconcat(OpcodeStr, " \t{$src2, $src1|$src1, $src2}"),
+               !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
 }
 
 multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> {
   defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
                             VEX, PS;
+  let Predicates = [HasDQI] in
+  defm B : avx512_mask_testop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode>,
+                            VEX, PD;
+  let Predicates = [HasBWI] in {
+  defm Q : avx512_mask_testop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode>,
+                            VEX, PS, VEX_W;
+  defm D : avx512_mask_testop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode>,
+                            VEX, PD, VEX_W;
+  }
 }
 
 defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
 
-def : Pat<(X86cmp VK1:$src1, (i1 0)),
-          (KORTESTWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
-           (COPY_TO_REGCLASS VK1:$src1, VK16))>;
-
 // Mask shift
 multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
                              SDNode OpNode> {
   let Predicates = [HasAVX512] in
-    def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm),
+    def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm),
                  !strconcat(OpcodeStr,
-                            " \t{$imm, $src, $dst|$dst, $src, $imm}"),
+                            "\t{$imm, $src, $dst|$dst, $src, $imm}"),
                             [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>;
 }
 
 multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
                                SDNode OpNode> {
   defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
-                             VEX, TAPD, VEX_W;
+                               VEX, TAPD, VEX_W;
+  let Predicates = [HasDQI] in
+  defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode>,
+                               VEX, TAPD;
+  let Predicates = [HasBWI] in {
+  defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode>,
+                               VEX, TAPD, VEX_W;
+  let Predicates = [HasDQI] in
+  defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>,
+                               VEX, TAPD;
+  }  
 }
 
 defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>;
@@ -1904,10 +2054,14 @@ let Predicates = [HasVLX] in {
 }
 
 def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))),
-          (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>;
+          (v8i1 (COPY_TO_REGCLASS
+                 (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16),
+                  (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>;
 
 def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))),
-          (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>;
+          (v8i1 (COPY_TO_REGCLASS
+                 (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16),
+                  (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>;
 //===----------------------------------------------------------------------===//
 // AVX-512 - Aligned and unaligned load and store
 //
@@ -2001,7 +2155,7 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, string ld_pat,
 multiclass avx512_store<bits<8> opc, string OpcodeStr, PatFrag st_frag,
                         ValueType OpVT, RegisterClass KRC, RegisterClass RC,
                         X86MemOperand memop, Domain d> {
-  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
   def rr_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins RC:$src),
               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], d>,
               EVEX;
@@ -2088,6 +2242,22 @@ def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr,
                  (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
        (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
 
+def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr,
+                (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
+       (VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
+
+def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr,
+                 (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
+       (VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
+
+def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr,
+                (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
+       (VMOVAPDZrm addr:$ptr)>;
+
+def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr,
+                 (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
+       (VMOVAPSZrm addr:$ptr)>;
+
 def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src),
           GR16:$mask),
          (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
@@ -2097,6 +2267,55 @@ def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src),
          (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
             VR512:$src)>;
 
+def: Pat<(int_x86_avx512_mask_store_ps_512 addr:$ptr, (v16f32 VR512:$src),
+          GR16:$mask),
+         (VMOVAPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
+            VR512:$src)>;
+def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src),
+          GR8:$mask),
+         (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
+            VR512:$src)>;
+
+def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)),
+         (VMOVUPSZmrk addr:$ptr,
+         (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)),
+         (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>;
+
+def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)),
+         (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz 
+          (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
+
+def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)),
+         (VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>;
+
+def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)),
+         (VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>;
+
+def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)),
+         (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>;
+
+def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask,
+                              (bc_v16f32 (v16i32 immAllZerosV)))),
+         (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>;
+
+def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))),
+         (VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>;
+
+def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)),
+         (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>;
+
+def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask,
+                             (bc_v8f64 (v16i32 immAllZerosV)))),
+         (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>;
+
+def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))),
+         (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>;
+
+def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))),
+         (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk
+         (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm),
+          (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
+
 defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32",
                                 "16", "8", "4", SSEPackedInt, HasAVX512>,
                  avx512_store_vl<0x7F, "vmovdqa32", "alignedstore",
@@ -2171,6 +2390,46 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
                   (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
 }
 
+def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))),
+         (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>;
+
+def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)),
+         (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>;
+
+def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))),
+         (VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>;
+
+def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask,
+                             (bc_v8i64 (v16i32 immAllZerosV)))),
+         (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>;
+
+def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, undef)),
+         (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>;
+
+def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))),
+         (VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>;
+
+def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)),
+         (VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>;
+
+def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)),
+         (VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>;
+
+// SKX replacement
+def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)),
+         (VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>;
+
+// KNL replacement
+def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)),
+         (VMOVDQU32Zmrk addr:$ptr,
+         (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)),
+         (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>;
+
+def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)),
+         (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz 
+          (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
+
+
 // Move Int Doubleword to Packed Double Int
 //
 def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
@@ -2277,12 +2536,12 @@ def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
 // AVX-512  MOVSS, MOVSD
 //===----------------------------------------------------------------------===//
 
-multiclass avx512_move_scalar <string asm, RegisterClass RC, 
+multiclass avx512_move_scalar <string asm, RegisterClass RC,
                               SDNode OpNode, ValueType vt,
                               X86MemOperand x86memop, PatFrag mem_pat> {
   let hasSideEffects = 0 in {
-  def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), 
-              !strconcat(asm, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+  def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2),
+              !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128X:$dst, (vt (OpNode VR128X:$src1,
                                       (scalar_to_vector RC:$src2))))],
               IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG;
@@ -2290,19 +2549,19 @@ multiclass avx512_move_scalar <string asm, RegisterClass RC,
   def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst),
               (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3),
               !strconcat(asm,
-                " \t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"),
+                "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"),
               [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K;
   def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-              !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
               [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>,
               EVEX, VEX_LIG;
   let mayStore = 1 in {
   def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
-             !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
+             !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
              EVEX, VEX_LIG;
   def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src),
-             !strconcat(asm, " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+             !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
              [], IIC_SSE_MOV_S_MR>,
              EVEX, VEX_LIG, EVEX_K;
   } // mayStore
@@ -2359,7 +2618,7 @@ let Predicates = [HasAVX512] in {
   // Move low f32 and clear high bits.
   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
             (SUBREG_TO_REG (i32 0),
-             (VMOVSSZrr (v4f32 (V_SET0)), 
+             (VMOVSSZrr (v4f32 (V_SET0)),
               (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
   def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
             (SUBREG_TO_REG (i32 0),
@@ -2488,7 +2747,7 @@ let AddedComplexity = 15 in
 def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
                                 (ins VR128X:$src),
                                 "vmovq\t{$src, $dst|$dst, $src}",
-                                [(set VR128X:$dst, (v2i64 (X86vzmovl 
+                                [(set VR128X:$dst, (v2i64 (X86vzmovl
                                                    (v2i64 VR128X:$src))))],
                                 IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
 
@@ -2510,7 +2769,7 @@ let Predicates = [HasAVX512] in {
               (VMOV64toPQIZrr GR64:$src)>;
     def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
               (VMOVDI2PDIZrr GR32:$src)>;
-              
+
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
               (VMOVDI2PDIZrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
@@ -2751,48 +3010,48 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, ValueType DstVT,
   {
     def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2),
-       !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, EVEX_4V;
     def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
                (ins KRC:$mask, RC:$src1, RC:$src2),
                !strconcat(OpcodeStr,
-                  " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+                  "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
                [], itins.rr>, EVEX_4V, EVEX_K;
     def rrkz : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
                 (ins KRC:$mask, RC:$src1, RC:$src2),
-                !strconcat(OpcodeStr, " \t{$src2, $src1, $dst {${mask}} {z}" ,
+                !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}} {z}" ,
                     "|$dst {${mask}} {z}, $src1, $src2}"),
                 [], itins.rr>, EVEX_4V, EVEX_KZ;
   }
   let mayLoad = 1 in {
     def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop:$src2),
-              !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               []>, EVEX_4V;
     def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
                (ins KRC:$mask, RC:$src1, x86memop:$src2),
                !strconcat(OpcodeStr,
-                   " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+                   "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
                [], itins.rm>, EVEX_4V, EVEX_K;
     def rmkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
                 (ins KRC:$mask, RC:$src1, x86memop:$src2),
                 !strconcat(OpcodeStr,
-                    " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
+                    "\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
                 [], itins.rm>, EVEX_4V, EVEX_KZ;
     def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
                (ins RC:$src1, x86scalar_mop:$src2),
-               !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+               !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
                           ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
                [], itins.rm>, EVEX_4V, EVEX_B;
     def rmbk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
                 (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2),
-                !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+                !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
                            ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}",
                            BrdcstStr, "}"),
                 [], itins.rm>, EVEX_4V, EVEX_B, EVEX_K;
     def rmbkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
                  (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2),
-                 !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+                 !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
                             ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}",
                             BrdcstStr, "}"),
                  [], itins.rm>, EVEX_4V, EVEX_B, EVEX_KZ;
@@ -2811,12 +3070,12 @@ defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul,
                                    SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD;
 
 defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512,
-                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                   loadv8i64, i512mem, loadi64, i64mem, "{1to8}",
                    SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512,
                    EVEX_CD8<64, CD8VF>, VEX_W;
 
 defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32, VK8WM, VR512,
-                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                   loadv8i64, i512mem, loadi64, i64mem, "{1to8}",
                    SSE_INTMUL_ITINS_P, 1>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
 
 def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))),
@@ -2902,16 +3161,16 @@ multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt,
                         d>, EVEX_4V;
 }
 
-defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, memopv8f64,
+defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, loadv8f64,
       VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, memopv8f64,
+defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, loadv8f64,
       VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, memopv8f64,
+defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, loadv8f64,
       VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, memopv8f64,
+defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, loadv8f64,
       VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
 
@@ -2920,52 +3179,52 @@ multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         X86MemOperand x86memop> {
   def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2),
-       !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], 
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))],
        IIC_SSE_UNPCK>, EVEX_4V;
   def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2),
-       !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1),
                                      (bitconvert (memop_frag addr:$src2)))))],
                                      IIC_SSE_UNPCK>, EVEX_4V;
 }
 defm VPUNPCKLDQZ  : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32,
-                                VR512, memopv16i32, i512mem>, EVEX_V512,
+                                VR512, loadv16i32, i512mem>, EVEX_V512,
                                 EVEX_CD8<32, CD8VF>;
 defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64,
-                                VR512, memopv8i64, i512mem>, EVEX_V512,
+                                VR512, loadv8i64, i512mem>, EVEX_V512,
                                 VEX_W, EVEX_CD8<64, CD8VF>;
 defm VPUNPCKHDQZ  : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32,
-                                VR512, memopv16i32, i512mem>, EVEX_V512,
+                                VR512, loadv16i32, i512mem>, EVEX_V512,
                                 EVEX_CD8<32, CD8VF>;
 defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64,
-                                VR512, memopv8i64, i512mem>, EVEX_V512,
+                                VR512, loadv8i64, i512mem>, EVEX_V512,
                                 VEX_W, EVEX_CD8<64, CD8VF>;
 //===----------------------------------------------------------------------===//
 // AVX-512 - PSHUFD
 //
 
 multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
-                         SDNode OpNode, PatFrag mem_frag, 
+                         SDNode OpNode, PatFrag mem_frag,
                          X86MemOperand x86memop, ValueType OpVT> {
   def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst),
-                     (ins RC:$src1, i8imm:$src2),
+                     (ins RC:$src1, u8imm:$src2),
                      !strconcat(OpcodeStr,
-                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set RC:$dst,
                        (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
                      EVEX;
   def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst),
-                     (ins x86memop:$src1, i8imm:$src2),
+                     (ins x86memop:$src1, u8imm:$src2),
                      !strconcat(OpcodeStr,
-                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set RC:$dst,
                        (OpVT (OpNode (mem_frag addr:$src1),
                               (i8 imm:$src2))))]>, EVEX;
 }
 
-defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32,
+defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, loadv16i32,
                       i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
 
 //===----------------------------------------------------------------------===//
@@ -3027,7 +3286,16 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
   }//let mayLoad = 1
 }
 
-multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+                            X86VectorVTInfo _, bit IsCommutable> {
+  defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
+                  "$rc, $src2, $src1", "$src1, $src2, $rc",
+                  (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>,
+                  EVEX_4V, EVEX_B, EVEX_RC;
+}
+
+multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, 
                              bit IsCommutable = 0> {
   defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
                               IsCommutable>, EVEX_V512, PS,
@@ -3053,12 +3321,23 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
   }
 }
 
-defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>;
-defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>;
+multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
+  defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info, 0>,
+                              EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info, 0>,
+                              EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+}
+
+defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>,
+            avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>;
+defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>,
+            avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>;
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, 
+            avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>;
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>,
+            avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>;
 defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>;
 defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>;
-defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>;
-defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>;
 
 def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1),
                    (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)),
@@ -3083,34 +3362,34 @@ def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1),
 // AVX-512  VPTESTM instructions
 //===----------------------------------------------------------------------===//
 
-multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC, 
-              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
               SDNode OpNode, ValueType vt> {
   def rr : AVX512PI<opc, MRMSrcReg,
-             (outs KRC:$dst), (ins RC:$src1, RC:$src2), 
-             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))],
              SSEPackedInt>, EVEX_4V;
   def rm : AVX512PI<opc, MRMSrcMem,
-             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), 
-             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set KRC:$dst, (OpNode (vt RC:$src1), 
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1),
               (bitconvert (memop_frag addr:$src2))))], SSEPackedInt>, EVEX_4V;
 }
 
 defm VPTESTMDZ  : avx512_vptest<0x27, "vptestmd", VK16, VR512,  f512mem,
-                              memopv16i32, X86testm, v16i32>, T8PD, EVEX_V512,
+                              loadv16i32, X86testm, v16i32>, T8PD, EVEX_V512,
                               EVEX_CD8<32, CD8VF>;
 defm VPTESTMQZ  : avx512_vptest<0x27, "vptestmq", VK8, VR512,  f512mem,
-                              memopv8i64, X86testm, v8i64>, T8PD, EVEX_V512, VEX_W,
+                              loadv8i64, X86testm, v8i64>, T8PD, EVEX_V512, VEX_W,
                               EVEX_CD8<64, CD8VF>;
 
 let Predicates = [HasCDI] in {
 defm VPTESTNMDZ  : avx512_vptest<0x27, "vptestnmd", VK16, VR512,  f512mem,
-                              memopv16i32, X86testnm, v16i32>, T8XS, EVEX_V512,
+                              loadv16i32, X86testnm, v16i32>, T8XS, EVEX_V512,
                               EVEX_CD8<32, CD8VF>;
 defm VPTESTNMQZ  : avx512_vptest<0x27, "vptestnmq", VK8, VR512,  f512mem,
-                              memopv8i64, X86testnm, v8i64>, T8XS, EVEX_V512, VEX_W,
+                              loadv8i64, X86testnm, v8i64>, T8XS, EVEX_V512, VEX_W,
                               EVEX_CD8<64, CD8VF>;
 }
 
@@ -3121,147 +3400,127 @@ def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1),
 def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1),
                  (v8i64 VR512:$src2), (i8 -1))),
                  (COPY_TO_REGCLASS (VPTESTMQZrr VR512:$src1, VR512:$src2), GR8)>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512  Shift instructions
 //===----------------------------------------------------------------------===//
 multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
-                         string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { 
+                         string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
   defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
-                   (ins _.RC:$src1, i8imm:$src2), OpcodeStr,
+                   (ins _.RC:$src1, u8imm:$src2), OpcodeStr,
                       "$src2, $src1", "$src1, $src2",
                    (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
                    " ",  SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V;
   defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
-                   (ins _.MemOp:$src1, i8imm:$src2), OpcodeStr,
+                   (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (_.VT (OpNode (_.MemOpFrag addr:$src1), (i8 imm:$src2))),
+                   (_.VT (OpNode (_.LdFrag addr:$src1), (i8 imm:$src2))),
                    " ",  SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V;
 }
 
 multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                          RegisterClass RC, ValueType vt, ValueType SrcVT,
-                          PatFrag bc_frag, RegisterClass KRC> {
-  // src2 is always 128-bit
-  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
-       (ins RC:$src1, VR128X:$src2),
-           !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-       [(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))],
-        SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
-  def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
-       (ins KRC:$mask, RC:$src1, VR128X:$src2),
-           !strconcat(OpcodeStr,
-                " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
-       [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
-  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
-       (ins RC:$src1, i128mem:$src2),
-           !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-       [(set RC:$dst, (vt (OpNode RC:$src1,
-                       (bc_frag (memopv2i64 addr:$src2)))))],
-                        SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
-  def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
-       (ins KRC:$mask, RC:$src1, i128mem:$src2),
-           !strconcat(OpcodeStr,
-                " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
-       [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
+                            ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> {
+   // src2 is always 128-bit
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, VR128X:$src2), OpcodeStr,
+                      "$src2, $src1", "$src1, $src2",
+                   (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))),
+                   " ",  SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V;
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, i128mem:$src2), OpcodeStr,
+                       "$src2, $src1", "$src1, $src2",
+                   (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
+                   " ",  SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, EVEX_4V;
+}
+
+multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> {
+  defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, _>, EVEX_V512;
+}
+
+multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, string OpcodeStr,
+                                 SDNode OpNode> {
+  defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32,
+                                 v16i32_info>, EVEX_CD8<32, CD8VQ>;
+  defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64,
+                                 v8i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W;
 }
 
 defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli,
                            v16i32_info>,
                            EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl,
-                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
-                           EVEX_CD8<32, CD8VQ>;
-                           
 defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli,
                            v8i64_info>, EVEX_V512,
                            EVEX_CD8<64, CD8VF>, VEX_W;
-defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl,
-                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
-                           EVEX_CD8<64, CD8VQ>, VEX_W;
 
 defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli,
                            v16i32_info>, EVEX_V512,
                            EVEX_CD8<32, CD8VF>;
-defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl,
-                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
-                           EVEX_CD8<32, CD8VQ>;
-                           
 defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli,
                            v8i64_info>, EVEX_V512,
                            EVEX_CD8<64, CD8VF>, VEX_W;
-defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl,
-                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
-                           EVEX_CD8<64, CD8VQ>, VEX_W;
 
 defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai,
                            v16i32_info>,
                            EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra,
-                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
-                           EVEX_CD8<32, CD8VQ>;
-                           
 defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai,
                            v8i64_info>, EVEX_V512,
                            EVEX_CD8<64, CD8VF>, VEX_W;
-defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra,
-                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
-                           EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSLL : avx512_shift_types<0xF2, 0xF3, "vpsll", X86vshl>;
+defm VPSRA : avx512_shift_types<0xE2, 0xE2, "vpsra", X86vsra>;
+defm VPSRL : avx512_shift_types<0xD2, 0xD3, "vpsrl", X86vsrl>;
 
 //===-------------------------------------------------------------------===//
 // Variable Bit Shifts
 //===-------------------------------------------------------------------===//
 multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                           RegisterClass RC, ValueType vt,
-                           X86MemOperand x86memop, PatFrag mem_frag> {
-  def rr  : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
-             (ins RC:$src1, RC:$src2),
-             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set RC:$dst,
-               (vt (OpNode RC:$src1, (vt RC:$src2))))]>,
-             EVEX_4V;
-  def rm  : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
-             (ins RC:$src1, x86memop:$src2),
-             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set RC:$dst,
-               (vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>,
-             EVEX_4V;
+                            X86VectorVTInfo _> {
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                      "$src2, $src1", "$src1, $src2",
+                   (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
+                   " ",  SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+                       "$src2, $src1", "$src1, $src2",
+                   (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2))),
+                   " ",  SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V;
 }
 
-defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32, 
-                               i512mem, memopv16i32>, EVEX_V512,
-                               EVEX_CD8<32, CD8VF>;
-defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64, 
-                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
-                               EVEX_CD8<64, CD8VF>;
-defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32, 
-                               i512mem, memopv16i32>, EVEX_V512,
-                               EVEX_CD8<32, CD8VF>;
-defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64, 
-                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
-                               EVEX_CD8<64, CD8VF>;
-defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32, 
-                               i512mem, memopv16i32>, EVEX_V512,
-                               EVEX_CD8<32, CD8VF>;
-defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64, 
-                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
-                               EVEX_CD8<64, CD8VF>;
+multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  AVX512VLVectorVTInfo _> {
+  defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+}
+
+multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
+                                 SDNode OpNode> {
+  defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode,
+                                 avx512vl_i32_info>, EVEX_CD8<32, CD8VQ>;
+  defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode,
+                                 avx512vl_i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W;
+}
+
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>;
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>;
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>;
 
 //===----------------------------------------------------------------------===//
 // AVX-512 - MOVDDUP
 //===----------------------------------------------------------------------===//
 
-multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, 
+multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT,
                         X86MemOperand x86memop, PatFrag memop_frag> {
 def rr  : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-                    !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
 def rm  : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-                    !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst,
                       (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
 }
 
-defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>,
+defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, loadv8f64>,
                  VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
 def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
           (VMOVDDUPZrm addr:$src)>;
@@ -3273,26 +3532,26 @@ multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                               ValueType vt, RegisterClass RC, PatFrag mem_frag,
                               X86MemOperand x86memop> {
   def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-                    !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                       [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX;
   let mayLoad = 1 in
   def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-                    !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                       [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX;
 }
 
 defm VMOVSHDUPZ  : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
-                       v16f32, VR512, memopv16f32, f512mem>, EVEX_V512,
+                       v16f32, VR512, loadv16f32, f512mem>, EVEX_V512,
                        EVEX_CD8<32, CD8VF>;
 defm VMOVSLDUPZ  : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
-                       v16f32, VR512, memopv16f32, f512mem>, EVEX_V512,
+                       v16f32, VR512, loadv16f32, f512mem>, EVEX_V512,
                        EVEX_CD8<32, CD8VF>;
 
 def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>;
-def : Pat<(v16i32 (X86Movshdup (memopv16i32 addr:$src))),
+def : Pat<(v16i32 (X86Movshdup (loadv16i32 addr:$src))),
            (VMOVSHDUPZrm addr:$src)>;
 def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>;
-def : Pat<(v16i32 (X86Movsldup (memopv16i32 addr:$src))),
+def : Pat<(v16i32 (X86Movsldup (loadv16i32 addr:$src))),
            (VMOVSLDUPZrm addr:$src)>;
 
 //===----------------------------------------------------------------------===//
@@ -3336,73 +3595,93 @@ multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
          AVX512FMA3Base;
 
   let mayLoad = 1 in
-  def m: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst),
-          (ins _.RC:$src1, _.RC:$src2, _.MemOp:$src3),
-          !strconcat(OpcodeStr, " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-          [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2,
-                                               (_.MemOpFrag addr:$src3))))]>;
-   def mb: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst),
-           (ins _.RC:$src1, _.RC:$src2, _.ScalarMemOp:$src3),
-           !strconcat(OpcodeStr, " \t{${src3}", _.BroadcastStr,
-            ", $src2, $dst|$dst, $src2, ${src3}", _.BroadcastStr, "}"),
-           [(set _.RC:$dst, (OpNode _.RC:$src1, _.RC:$src2,
-           (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))]>, EVEX_B;
-}
+  defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+            (ins _.RC:$src2, _.MemOp:$src3),
+            OpcodeStr, "$src3, $src2", "$src2, $src3",
+            (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
+            AVX512FMA3Base; 
+
+  defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+              (ins _.RC:$src2, _.ScalarMemOp:$src3),
+              OpcodeStr,   !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ),
+              (OpNode _.RC:$src1, _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,	
+              AVX512FMA3Base, EVEX_B;
+ }
+} // Constraints = "$src1 = $dst"
+
+let Constraints = "$src1 = $dst" in {
+// Omitting the parameter OpNode (= null_frag) disables ISel pattern matching.
+multiclass avx512_fma3_round_rrb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                           SDPatternOperator OpNode> {
+   defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+          OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+          (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>,
+          AVX512FMA3Base, EVEX_B, EVEX_RC;
+ }
 } // Constraints = "$src1 = $dst"
 
+multiclass avx512_fma3_round_forms<bits<8> opc213, string OpcodeStr,
+                              X86VectorVTInfo VTI, SDPatternOperator OpNode> {
+  defm v213r : avx512_fma3_round_rrb<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix),
+                              VTI, OpNode>, EVEX_CD8<VTI.EltSize, CD8VF>;
+}
+
 multiclass avx512_fma3p_forms<bits<8> opc213, bits<8> opc231,
                               string OpcodeStr, X86VectorVTInfo VTI,
                               SDPatternOperator OpNode> {
-  defm v213 : avx512_fma3p_rm<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix),
-                              VTI, OpNode>,
-              EVEX_V512, EVEX_CD8<VTI.EltSize, CD8VF>;
+  defm v213r : avx512_fma3p_rm<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix),
+                              VTI, OpNode>, EVEX_CD8<VTI.EltSize, CD8VF>;
 
-  defm v231 : avx512_fma3p_rm<opc231, !strconcat(OpcodeStr, "231", VTI.Suffix),
-                              VTI>,
-              EVEX_V512, EVEX_CD8<VTI.EltSize, CD8VF>;
+  defm v231r : avx512_fma3p_rm<opc231, !strconcat(OpcodeStr, "231", VTI.Suffix),
+                              VTI>, EVEX_CD8<VTI.EltSize, CD8VF>;
 }
 
+multiclass avx512_fma3p<bits<8> opc213, bits<8> opc231,
+                              string OpcodeStr,
+                              SDPatternOperator OpNode,
+                              SDPatternOperator OpNodeRnd> {
 let ExeDomain = SSEPackedSingle in {
-  defm VFMADDPSZ    : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd",
-                                         v16f32_info, X86Fmadd>;
-  defm VFMSUBPSZ    : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub",
-                                         v16f32_info, X86Fmsub>;
-  defm VFMADDSUBPSZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub",
-                                         v16f32_info, X86Fmaddsub>;
-  defm VFMSUBADDPSZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd",
-                                         v16f32_info, X86Fmsubadd>;
-  defm VFNMADDPSZ   : avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd",
-                                         v16f32_info, X86Fnmadd>;
-  defm VFNMSUBPSZ   : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub",
-                                         v16f32_info, X86Fnmsub>;
-}
+    defm NAME##PSZ      : avx512_fma3p_forms<opc213, opc231, OpcodeStr,
+                                             v16f32_info, OpNode>,
+                          avx512_fma3_round_forms<opc213, OpcodeStr,
+                                             v16f32_info, OpNodeRnd>, EVEX_V512;
+    defm NAME##PSZ256   : avx512_fma3p_forms<opc213, opc231, OpcodeStr,
+                                             v8f32x_info, OpNode>, EVEX_V256;
+    defm NAME##PSZ128   : avx512_fma3p_forms<opc213, opc231, OpcodeStr,
+                                             v4f32x_info, OpNode>, EVEX_V128;
+  }
 let ExeDomain = SSEPackedDouble in {
-  defm VFMADDPDZ    : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd",
-                                         v8f64_info, X86Fmadd>, VEX_W;
-  defm VFMSUBPDZ    : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub",
-                                         v8f64_info, X86Fmsub>, VEX_W;
-  defm VFMADDSUBPDZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub",
-                                         v8f64_info, X86Fmaddsub>, VEX_W;
-  defm VFMSUBADDPDZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd",
-                                         v8f64_info, X86Fmsubadd>, VEX_W;
-  defm VFNMADDPDZ :   avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd",
-                                         v8f64_info, X86Fnmadd>, VEX_W;
-  defm VFNMSUBPDZ :   avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub",
-                                         v8f64_info, X86Fnmsub>, VEX_W;
+    defm  NAME##PDZ     : avx512_fma3p_forms<opc213, opc231, OpcodeStr,
+                                             v8f64_info, OpNode>,
+                          avx512_fma3_round_forms<opc213, OpcodeStr,
+                                             v8f64_info, OpNodeRnd>, EVEX_V512, VEX_W;
+    defm  NAME##PDZ256  : avx512_fma3p_forms<opc213, opc231, OpcodeStr,
+                                             v4f64x_info, OpNode>, EVEX_V256, VEX_W;
+    defm  NAME##PDZ128  : avx512_fma3p_forms<opc213, opc231, OpcodeStr,
+                                             v2f64x_info, OpNode>, EVEX_V128, VEX_W;
+  }
 }
 
+defm VFMADD    : avx512_fma3p<0xA8, 0xB8, "vfmadd", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB    : avx512_fma3p<0xAA, 0xBA, "vfmsub", X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB : avx512_fma3p<0xA6, 0xB6, "vfmaddsub", X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD : avx512_fma3p<0xA7, 0xB7, "vfmsubadd", X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD   : avx512_fma3p<0xAC, 0xBC, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB   : avx512_fma3p<0xAE, 0xBE, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
+
 let Constraints = "$src1 = $dst" in {
 multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, SDNode OpNode,
                              X86VectorVTInfo _> {
   let mayLoad = 1 in
   def m: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst),
           (ins _.RC:$src1, _.RC:$src3, _.MemOp:$src2),
-          !strconcat(OpcodeStr, " \t{$src2, $src3, $dst|$dst, $src3, $src2}"),
-          [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (_.MemOpFrag addr:$src2),
+          !strconcat(OpcodeStr, "\t{$src2, $src3, $dst|$dst, $src3, $src2}"),
+          [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2),
                                                     _.RC:$src3)))]>;
    def mb: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst),
            (ins _.RC:$src1, _.RC:$src3, _.ScalarMemOp:$src2),
-           !strconcat(OpcodeStr, " \t{${src2}", _.BroadcastStr,
+           !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr,
             ", $src3, $dst|$dst, $src3, ${src2}", _.BroadcastStr, "}"),
            [(set _.RC:$dst,
                (OpNode _.RC:$src1, (_.VT (X86VBroadcast
@@ -3412,65 +3691,54 @@ multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, SDNode OpNode,
 } // Constraints = "$src1 = $dst"
 
 
+multiclass avx512_fma3p_m132_f<bits<8> opc,
+                              string OpcodeStr,
+                              SDNode OpNode> {
+
 let ExeDomain = SSEPackedSingle in {
-  defm VFMADD132PSZ    : avx512_fma3p_m132<0x98, "vfmadd132ps", X86Fmadd,
-                                           v16f32_info>,
-                         EVEX_V512, EVEX_CD8<32, CD8VF>;
-  defm VFMSUB132PSZ    : avx512_fma3p_m132<0x9A, "vfmsub132ps", X86Fmsub,
-                                           v16f32_info>,
-                         EVEX_V512, EVEX_CD8<32, CD8VF>;
-  defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", X86Fmaddsub,
-                                           v16f32_info>,
-                         EVEX_V512, EVEX_CD8<32, CD8VF>;
-  defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", X86Fmsubadd,
-                                           v16f32_info>,
-                         EVEX_V512, EVEX_CD8<32, CD8VF>;
-  defm VFNMADD132PSZ   : avx512_fma3p_m132<0x9C, "vfnmadd132ps", X86Fnmadd,
-                                           v16f32_info>,
-                         EVEX_V512, EVEX_CD8<32, CD8VF>;
-  defm VFNMSUB132PSZ   : avx512_fma3p_m132<0x9E, "vfnmsub132ps", X86Fnmsub,
-                                           v16f32_info>,
-                         EVEX_V512, EVEX_CD8<32, CD8VF>;
-}
+    defm NAME##PSZ      : avx512_fma3p_m132<opc, OpcodeStr##ps,
+                                             OpNode,v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+    defm NAME##PSZ256   : avx512_fma3p_m132<opc, OpcodeStr##ps,
+                                             OpNode, v8f32x_info>, EVEX_V256, EVEX_CD8<32, CD8VF>;
+    defm NAME##PSZ128   : avx512_fma3p_m132<opc, OpcodeStr##ps,
+                                             OpNode, v4f32x_info>, EVEX_V128, EVEX_CD8<32, CD8VF>;
+  }
 let ExeDomain = SSEPackedDouble in {
-  defm VFMADD132PDZ    : avx512_fma3p_m132<0x98, "vfmadd132pd", X86Fmadd,
-                                           v8f64_info>,
-                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-  defm VFMSUB132PDZ    : avx512_fma3p_m132<0x9A, "vfmsub132pd", X86Fmsub,
-                                           v8f64_info>,
-                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-  defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", X86Fmaddsub,
-                                           v8f64_info>,
-                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-  defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", X86Fmsubadd,
-                                           v8f64_info>,
-                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-  defm VFNMADD132PDZ :   avx512_fma3p_m132<0x9C, "vfnmadd132pd", X86Fnmadd,
-                                           v8f64_info>,
-                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-  defm VFNMSUB132PDZ :   avx512_fma3p_m132<0x9E, "vfnmsub132pd", X86Fnmsub,
-                                           v8f64_info>,
-                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+    defm  NAME##PDZ       : avx512_fma3p_m132<opc, OpcodeStr##pd,
+                                           OpNode, v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VF>;
+    defm  NAME##PDZ256    : avx512_fma3p_m132<opc, OpcodeStr##pd,
+                                           OpNode, v4f64x_info>, EVEX_V256, VEX_W, EVEX_CD8<32, CD8VF>;
+    defm  NAME##PDZ128    : avx512_fma3p_m132<opc, OpcodeStr##pd,
+                                           OpNode, v2f64x_info>, EVEX_V128, VEX_W, EVEX_CD8<32, CD8VF>;
+  }
 }
 
+defm VFMADD132    : avx512_fma3p_m132_f<0x98, "vfmadd132", X86Fmadd>;
+defm VFMSUB132    : avx512_fma3p_m132_f<0x9A, "vfmsub132", X86Fmsub>;
+defm VFMADDSUB132 : avx512_fma3p_m132_f<0x96, "vfmaddsub132", X86Fmaddsub>;
+defm VFMSUBADD132 : avx512_fma3p_m132_f<0x97, "vfmsubadd132", X86Fmsubadd>;
+defm VFNMADD132   : avx512_fma3p_m132_f<0x9C, "vfnmadd132", X86Fnmadd>;
+defm VFNMSUB132   : avx512_fma3p_m132_f<0x9E, "vfnmsub132", X86Fnmsub>;
+
+
 // Scalar FMA
 let Constraints = "$src1 = $dst" in {
-multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 
-                 RegisterClass RC, ValueType OpVT, 
-                 X86MemOperand x86memop, Operand memop, 
+multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                 RegisterClass RC, ValueType OpVT,
+                 X86MemOperand x86memop, Operand memop,
                  PatFrag mem_frag> {
   let isCommutable = 1 in
   def r     : AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, RC:$src3),
                    !strconcat(OpcodeStr,
-                              " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set RC:$dst,
                      (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
   let mayLoad = 1 in
   def m     : AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, f128mem:$src3),
                    !strconcat(OpcodeStr,
-                              " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set RC:$dst,
                      (OpVT (OpNode RC:$src2, RC:$src1,
                             (mem_frag addr:$src3))))]>;
@@ -3503,12 +3771,12 @@ multiclass avx512_vcvtsi<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                           X86MemOperand x86memop, string asm> {
 let hasSideEffects = 0 in {
   def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
-              !strconcat(asm," \t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
               EVEX_4V;
   let mayLoad = 1 in
   def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
               (ins DstRC:$src1, x86memop:$src),
-              !strconcat(asm," \t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
               EVEX_4V;
 } // hasSideEffects = 0
 }
@@ -3576,12 +3844,12 @@ multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstR
                           string asm> {
 let hasSideEffects = 0 in {
   def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
-              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG,
               Requires<[HasAVX512]>;
   let mayLoad = 1 in
   def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
-              !strconcat(asm," \t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG,
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG,
               Requires<[HasAVX512]>;
 } // hasSideEffects = 0
 }
@@ -3679,10 +3947,10 @@ multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                          string asm> {
   def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
-              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX;
   def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
-              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX;
 }
 
@@ -3755,21 +4023,21 @@ def : Pat<(extloadf32 addr:$src),
 def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>,
            Requires<[HasAVX512]>;
 
-multiclass avx512_vcvt_fp_with_rc<bits<8> opc, string asm, RegisterClass SrcRC, 
-               RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, 
+multiclass avx512_vcvt_fp_with_rc<bits<8> opc, string asm, RegisterClass SrcRC,
+               RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag,
                X86MemOperand x86memop, ValueType OpVT, ValueType InVT,
                Domain d> {
 let hasSideEffects = 0 in {
   def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
-              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst,
                 (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX;
   def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
-              !strconcat(asm," \t{$rc, $src, $dst|$dst, $src, $rc}"),
+              !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
               [], d>, EVEX, EVEX_B, EVEX_RC;
   let mayLoad = 1 in
   def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
-              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst,
                 (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX;
 } // hasSideEffects = 0
@@ -3781,29 +4049,29 @@ multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC,
                Domain d> {
 let hasSideEffects = 0 in {
   def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
-              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst,
                 (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX;
   let mayLoad = 1 in
   def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
-              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst,
                 (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX;
 } // hasSideEffects = 0
 }
 
 defm VCVTPD2PSZ : avx512_vcvt_fp_with_rc<0x5A, "vcvtpd2ps", VR512, VR256X, fround,
-                                memopv8f64, f512mem, v8f32, v8f64,
+                                loadv8f64, f512mem, v8f32, v8f64,
                                 SSEPackedSingle>, EVEX_V512, VEX_W, PD,
                                 EVEX_CD8<64, CD8VF>;
 
 defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend,
-                                memopv4f64, f256mem, v8f64, v8f32,
+                                loadv4f64, f256mem, v8f64, v8f32,
                                 SSEPackedDouble>, EVEX_V512, PS,
                                 EVEX_CD8<32, CD8VH>;
 def : Pat<(v8f64 (extloadv8f32 addr:$src)),
             (VCVTPS2PDZrm addr:$src)>;
-            
+
 def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src),
                    (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), (i32 FROUND_CURRENT))),
           (VCVTPD2PSZrr VR512:$src)>;
@@ -3817,27 +4085,27 @@ def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src),
 //===----------------------------------------------------------------------===//
 
 defm VCVTDQ2PSZ : avx512_vcvt_fp_with_rc<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp,
-                                memopv8i64, i512mem, v16f32, v16i32,
+                                loadv8i64, i512mem, v16f32, v16i32,
                                 SSEPackedSingle>, EVEX_V512, PS,
                                 EVEX_CD8<32, CD8VF>;
 
 defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp,
-                                memopv4i64, i256mem, v8f64, v8i32,
+                                loadv4i64, i256mem, v8f64, v8i32,
                                 SSEPackedDouble>, EVEX_V512, XS,
                                 EVEX_CD8<32, CD8VH>;
 
 defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint,
-                                 memopv16f32, f512mem, v16i32, v16f32,
+                                 loadv16f32, f512mem, v16i32, v16f32,
                                  SSEPackedSingle>, EVEX_V512, XS,
                                  EVEX_CD8<32, CD8VF>;
 
 defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint,
-                                 memopv8f64, f512mem, v8i32, v8f64, 
+                                 loadv8f64, f512mem, v8i32, v8f64,
                                  SSEPackedDouble>, EVEX_V512, PD, VEX_W,
                                  EVEX_CD8<64, CD8VF>;
 
 defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint,
-                                 memopv16f32, f512mem, v16i32, v16f32,
+                                 loadv16f32, f512mem, v16i32, v16f32,
                                  SSEPackedSingle>, EVEX_V512, PS,
                                  EVEX_CD8<32, CD8VF>;
 
@@ -3847,29 +4115,29 @@ def : Pat<(v16i32 (int_x86_avx512_mask_cvttps2udq_512 (v16f32 VR512:$src),
           (VCVTTPS2UDQZrr VR512:$src)>;
 
 defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint,
-                                 memopv8f64, f512mem, v8i32, v8f64,
+                                 loadv8f64, f512mem, v8i32, v8f64,
                                  SSEPackedDouble>, EVEX_V512, PS, VEX_W,
                                  EVEX_CD8<64, CD8VF>;
-                                 
+
 // cvttpd2udq (src, 0, mask-all-ones, sae-current)
 def : Pat<(v8i32 (int_x86_avx512_mask_cvttpd2udq_512 (v8f64 VR512:$src),
                    (v8i32 immAllZerosV), (i8 -1), FROUND_CURRENT)),
           (VCVTTPD2UDQZrr VR512:$src)>;
 
 defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp,
-                                 memopv4i64, f256mem, v8f64, v8i32,
+                                 loadv4i64, f256mem, v8f64, v8i32,
                                  SSEPackedDouble>, EVEX_V512, XS,
                                  EVEX_CD8<32, CD8VH>;
-                                 
+
 defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp,
-                                 memopv16i32, f512mem, v16f32, v16i32,
+                                 loadv16i32, f512mem, v16f32, v16i32,
                                  SSEPackedSingle>, EVEX_V512, XD,
                                  EVEX_CD8<32, CD8VF>;
 
 def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
-          (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr 
+          (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
-                                 
+
 def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
           (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
            (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
@@ -3877,7 +4145,7 @@ def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
 def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
           (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
-           
+
 def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
           (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
            (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
@@ -3904,23 +4172,23 @@ multiclass avx512_vcvt_fp2int<bits<8> opc, string asm, RegisterClass SrcRC,
                X86MemOperand x86memop, Domain d> {
 let hasSideEffects = 0 in {
   def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
-              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
               [], d>, EVEX;
   def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
-              !strconcat(asm," \t{$rc, $src, $dst|$dst, $src, $rc}"),
+              !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
               [], d>, EVEX, EVEX_B, EVEX_RC;
   let mayLoad = 1 in
   def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
-              !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
               [], d>, EVEX;
 } // hasSideEffects = 0
 }
 
 defm VCVTPS2DQZ : avx512_vcvt_fp2int<0x5B, "vcvtps2dq", VR512, VR512,
-                                 memopv16f32, f512mem, SSEPackedSingle>, PD,
+                                 loadv16f32, f512mem, SSEPackedSingle>, PD,
                                  EVEX_V512, EVEX_CD8<32, CD8VF>;
 defm VCVTPD2DQZ : avx512_vcvt_fp2int<0xE6, "vcvtpd2dq", VR512, VR256X,
-                                 memopv8f64, f512mem, SSEPackedDouble>, XD, VEX_W,
+                                 loadv8f64, f512mem, SSEPackedDouble>, XD, VEX_W,
                                  EVEX_V512, EVEX_CD8<64, CD8VF>;
 
 def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2dq_512 (v16f32 VR512:$src),
@@ -3932,10 +4200,10 @@ def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2dq_512 (v8f64 VR512:$src),
            (VCVTPD2DQZrrb VR512:$src, imm:$rc)>;
 
 defm VCVTPS2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtps2udq", VR512, VR512,
-                                 memopv16f32, f512mem, SSEPackedSingle>,
+                                 loadv16f32, f512mem, SSEPackedSingle>,
                                  PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
 defm VCVTPD2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtpd2udq", VR512, VR256X,
-                                 memopv8f64, f512mem, SSEPackedDouble>, VEX_W,
+                                 loadv8f64, f512mem, SSEPackedDouble>, VEX_W,
                                  PS, EVEX_V512, EVEX_CD8<64, CD8VF>;
 
 def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2udq_512 (v16f32 VR512:$src),
@@ -3969,13 +4237,13 @@ multiclass avx512_cvtph2ps<RegisterClass destRC, RegisterClass srcRC,
 multiclass avx512_cvtps2ph<RegisterClass destRC, RegisterClass srcRC,
                              X86MemOperand x86memop> {
   def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst),
-               (ins srcRC:$src1, i32i8imm:$src2),
-               "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               (ins srcRC:$src1, i32u8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                []>, EVEX;
   let hasSideEffects = 0, mayStore = 1 in
   def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
-               (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2),
-               "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX;
+               (ins x86memop:$dst, srcRC:$src1, i32u8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX;
 }
 
 defm VCVTPH2PSZ : avx512_cvtph2ps<VR512, VR256X, f256mem>, EVEX_V512,
@@ -4022,7 +4290,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
                               VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
   }
 }
-  
+
 /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
 multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                             X86MemOperand x86memop> {
@@ -4030,12 +4298,12 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
   def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
                (ins RC:$src1, RC:$src2),
                !strconcat(OpcodeStr,
-               " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
+               "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
   let mayLoad = 1 in {
   def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
                (ins RC:$src1, x86memop:$src2),
                !strconcat(OpcodeStr,
-               " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
+               "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
   }
 }
 }
@@ -4130,60 +4398,40 @@ def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src),
            (VRCP14PDZr VR512:$src)>;
 
 /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
-multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
-                            X86MemOperand x86memop> {
-  let hasSideEffects = 0, Predicates = [HasERI] in {
-  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
-               (ins RC:$src1, RC:$src2),
-               !strconcat(OpcodeStr,
-               " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
-  def rrb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
-               (ins RC:$src1, RC:$src2),
-               !strconcat(OpcodeStr,
-               " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"),
-               []>, EVEX_4V, EVEX_B;
-  let mayLoad = 1 in {
-  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
-               (ins RC:$src1, x86memop:$src2),
-               !strconcat(OpcodeStr,
-               " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
-  }
-}
-}
-
-defm VRCP28SS   : avx512_fp28_s<0xCB, "vrcp28ss", FR32X, f32mem>,
-                  EVEX_CD8<32, CD8VT1>;
-defm VRCP28SD   : avx512_fp28_s<0xCB, "vrcp28sd", FR64X, f64mem>,
-                  VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VRSQRT28SS   : avx512_fp28_s<0xCD, "vrsqrt28ss", FR32X, f32mem>,
-                  EVEX_CD8<32, CD8VT1>;
-defm VRSQRT28SD   : avx512_fp28_s<0xCD, "vrsqrt28sd", FR64X, f64mem>,
-                  VEX_W, EVEX_CD8<64, CD8VT1>;
+multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+                         SDNode OpNode> {
 
-def : Pat <(v4f32 (int_x86_avx512_rcp28_ss (v4f32 VR128X:$src1),
-              (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1),
-                   FROUND_NO_EXC)),
-           (COPY_TO_REGCLASS (VRCP28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X),
-                       (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
+  defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                           (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                           "$src2, $src1", "$src1, $src2",
+                           (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                           (i32 FROUND_CURRENT))>;
 
-def : Pat <(v2f64 (int_x86_avx512_rcp28_sd (v2f64 VR128X:$src1),
-              (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1),
-                   FROUND_NO_EXC)),
-           (COPY_TO_REGCLASS (VRCP28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X),
-                       (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
+  defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                            (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                            "$src2, $src1", "$src1, $src2",
+                            (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                            (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B;
 
-def : Pat <(v4f32 (int_x86_avx512_rsqrt28_ss (v4f32 VR128X:$src1),
-              (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1),
-                   FROUND_NO_EXC)),
-           (COPY_TO_REGCLASS (VRSQRT28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X),
-                       (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
+  defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+                         "$src2, $src1", "$src1, $src2",
+                         (OpNode (_.VT _.RC:$src1),
+                          (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+                         (i32 FROUND_CURRENT))>;
+}
 
-def : Pat <(v2f64 (int_x86_avx512_rsqrt28_sd (v2f64 VR128X:$src1),
-              (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1),
-                   FROUND_NO_EXC)),
-           (COPY_TO_REGCLASS (VRSQRT28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X),
-                       (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
+multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>,
+              EVEX_CD8<32, CD8VT1>;
+  defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>,
+              EVEX_CD8<64, CD8VT1>, VEX_W;
+}
 
+let hasSideEffects = 0, Predicates = [HasERI] in {
+  defm VRCP28   : avx512_eri_s<0xCB, "vrcp28",   X86rcp28s>,   T8PD, EVEX_4V;
+  defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V;
+}
 /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
 
 multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
@@ -4196,12 +4444,14 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
   defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                         (ins _.RC:$src), OpcodeStr,
                         "$src", "$src",
-                        (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B;
+                        (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)),
+                        "{sae}">, EVEX_B;
 
   defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
                          (OpNode (_.FloatVT
-                             (bitconvert (_.LdFrag addr:$src))), (i32 FROUND_CURRENT))>;
+                             (bitconvert (_.LdFrag addr:$src))),
+                          (i32 FROUND_CURRENT))>;
 
   defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
@@ -4218,7 +4468,7 @@ multiclass  avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> {
 }
 
 let Predicates = [HasERI], hasSideEffects = 0 in {
-  
+
  defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX, EVEX_V512, T8PD;
  defm VRCP28   : avx512_eri<0xCA, "vrcp28",   X86rcp28>,   EVEX, EVEX_V512, T8PD;
  defm VEXP2    : avx512_eri<0xC8, "vexp2",    X86exp2>,    EVEX, EVEX_V512, T8PD;
@@ -4257,7 +4507,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
                (ins VR128X:$src1, VR128X:$src2),
                !strconcat(OpcodeStr,
                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-               [(set VR128X:$dst, 
+               [(set VR128X:$dst,
                  (F32Int VR128X:$src1, VR128X:$src2))],
                itins_s.rr>, XS, EVEX_4V;
   let mayLoad = 1 in {
@@ -4271,7 +4521,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
                    (ins VR128X:$src1, ssmem:$src2),
                    !strconcat(OpcodeStr,
                  "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                   [(set VR128X:$dst, 
+                   [(set VR128X:$dst,
                      (F32Int VR128X:$src1, sse_load_f32:$src2))],
                    itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
   }
@@ -4285,7 +4535,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
                (ins VR128X:$src1, VR128X:$src2),
                !strconcat(OpcodeStr,
                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-               [(set VR128X:$dst, 
+               [(set VR128X:$dst,
                  (F64Int VR128X:$src1, VR128X:$src2))],
                itins_s.rr>, XD, EVEX_4V, VEX_W;
   let mayLoad = 1 in {
@@ -4299,8 +4549,8 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
                   (ins VR128X:$src1, sdmem:$src2),
                    !strconcat(OpcodeStr,
                   "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                  [(set VR128X:$dst, 
-                    (F64Int VR128X:$src1, sse_load_f64:$src2))]>, 
+                  [(set VR128X:$dst,
+                    (F64Int VR128X:$src1, sse_load_f64:$src2))]>,
                   XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
   }
 }
@@ -4332,8 +4582,8 @@ multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
 
 defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>;
 
-defm VSQRT  : avx512_sqrt_scalar<0x51, "sqrt", 
-                int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, 
+defm VSQRT  : avx512_sqrt_scalar<0x51, "sqrt",
+                int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd,
                 SSE_SQRTSS, SSE_SQRTSD>;
 
 let Predicates = [HasAVX512] in {
@@ -4343,7 +4593,7 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v8f64 (int_x86_avx512_sqrt_pd_512 (v8f64 VR512:$src1),
                     (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_CURRENT)),
                    (VSQRTPDZr VR512:$src1)>;
-  
+
   def : Pat<(f32 (fsqrt FR32X:$src)),
             (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
   def : Pat<(f32 (fsqrt (load addr:$src))),
@@ -4383,107 +4633,6 @@ let Predicates = [HasAVX512] in {
 }
 
 
-multiclass avx512_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
-                            X86MemOperand x86memop, RegisterClass RC,
-                            PatFrag mem_frag32, PatFrag mem_frag64,
-                            Intrinsic V4F32Int, Intrinsic V2F64Int,
-                            CD8VForm VForm> {
-let ExeDomain = SSEPackedSingle in {
-  // Intrinsic operation, reg.
-  // Vector intrinsic operation, reg
-  def PSr : AVX512AIi8<opcps, MRMSrcReg,
-                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
-                    !strconcat(OpcodeStr,
-                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>;
-
-  // Vector intrinsic operation, mem
-  def PSm : AVX512AIi8<opcps, MRMSrcMem,
-                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
-                    !strconcat(OpcodeStr,
-                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set RC:$dst,
-                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
-                    EVEX_CD8<32, VForm>;
-} // ExeDomain = SSEPackedSingle
-
-let ExeDomain = SSEPackedDouble in {
-  // Vector intrinsic operation, reg
-  def PDr : AVX512AIi8<opcpd, MRMSrcReg,
-                     (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
-                     !strconcat(OpcodeStr,
-                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                     [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>;
-
-  // Vector intrinsic operation, mem
-  def PDm : AVX512AIi8<opcpd, MRMSrcMem,
-                     (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
-                     !strconcat(OpcodeStr,
-                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                     [(set RC:$dst,
-                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
-                     EVEX_CD8<64, VForm>;
-} // ExeDomain = SSEPackedDouble
-}
-
-multiclass avx512_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
-                            string OpcodeStr,
-                            Intrinsic F32Int,
-                            Intrinsic F64Int> {
-let ExeDomain = GenericDomain in {
-  // Operation, reg.
-  let hasSideEffects = 0 in
-  def SSr : AVX512AIi8<opcss, MRMSrcReg,
-      (outs FR32X:$dst), (ins FR32X:$src1, FR32X:$src2, i32i8imm:$src3),
-      !strconcat(OpcodeStr,
-              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-      []>;
-
-  // Intrinsic operation, reg.
-  let isCodeGenOnly = 1 in
-  def SSr_Int : AVX512AIi8<opcss, MRMSrcReg,
-        (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
-        !strconcat(OpcodeStr,
-                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-        [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2, imm:$src3))]>;
-
-  // Intrinsic operation, mem.
-  def SSm : AVX512AIi8<opcss, MRMSrcMem, (outs VR128X:$dst),
-                     (ins VR128X:$src1, ssmem:$src2, i32i8imm:$src3),
-                     !strconcat(OpcodeStr,
-                   "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                     [(set VR128X:$dst, (F32Int VR128X:$src1, 
-                                         sse_load_f32:$src2, imm:$src3))]>,
-                     EVEX_CD8<32, CD8VT1>;
-
-  // Operation, reg.
-  let hasSideEffects = 0 in
-  def SDr : AVX512AIi8<opcsd, MRMSrcReg,
-        (outs FR64X:$dst), (ins FR64X:$src1, FR64X:$src2, i32i8imm:$src3),
-        !strconcat(OpcodeStr,
-                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-        []>, VEX_W;
-
-  // Intrinsic operation, reg.
-  let isCodeGenOnly = 1 in
-  def SDr_Int : AVX512AIi8<opcsd, MRMSrcReg,
-        (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
-        !strconcat(OpcodeStr,
-                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-        [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2, imm:$src3))]>,
-        VEX_W;
-
-  // Intrinsic operation, mem.
-  def SDm : AVX512AIi8<opcsd, MRMSrcMem,
-        (outs VR128X:$dst), (ins VR128X:$src1, sdmem:$src2, i32i8imm:$src3),
-        !strconcat(OpcodeStr,
-                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-        [(set VR128X:$dst,
-              (F64Int VR128X:$src1, sse_load_f64:$src2, imm:$src3))]>,
-        VEX_W, EVEX_CD8<64, CD8VT1>;
-} // ExeDomain = GenericDomain
-}
-
 multiclass avx512_rndscale<bits<8> opc, string OpcodeStr,
                             X86MemOperand x86memop, RegisterClass RC,
                             PatFrag mem_frag, Domain d> {
@@ -4491,23 +4640,22 @@ let ExeDomain = d in {
   // Intrinsic operation, reg.
   // Vector intrinsic operation, reg
   def r : AVX512AIi8<opc, MRMSrcReg,
-                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                     !strconcat(OpcodeStr,
-                    " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     []>, EVEX;
 
   // Vector intrinsic operation, mem
   def m : AVX512AIi8<opc, MRMSrcMem,
-                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                     !strconcat(OpcodeStr,
-                    " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     []>, EVEX;
 } // ExeDomain
 }
 
-
 defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512,
-                                memopv16f32, SSEPackedSingle>, EVEX_V512,
+                                loadv16f32, SSEPackedSingle>, EVEX_V512,
                                 EVEX_CD8<32, CD8VF>;
 
 def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1),
@@ -4517,7 +4665,7 @@ def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1),
 
 
 defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512,
-                                memopv8f64, SSEPackedDouble>, EVEX_V512,
+                                loadv8f64, SSEPackedDouble>, EVEX_V512,
                                 VEX_W, EVEX_CD8<64, CD8VF>;
 
 def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1),
@@ -4525,50 +4673,72 @@ def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1),
                   FROUND_CURRENT)),
                    (VRNDSCALEPDZr VR512:$src1, imm:$src2)>;
 
-multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
-                     Operand x86memop, RegisterClass RC, Domain d> {
-let ExeDomain = d in {
-  def r : AVX512AIi8<opc, MRMSrcReg,
-                    (outs RC:$dst), (ins RC:$src1, RC:$src2, i32i8imm:$src3),
-                    !strconcat(OpcodeStr,
-                    " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    []>, EVEX_4V;
+multiclass
+avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
 
-  def m : AVX512AIi8<opc, MRMSrcMem,
-                    (outs RC:$dst), (ins RC:$src1, x86memop:$src2,  i32i8imm:$src3),
-                    !strconcat(OpcodeStr,
-                    " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    []>, EVEX_4V;
-} // ExeDomain
+  let ExeDomain = _.ExeDomain in {
+  defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                           (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
+                           "$src3, $src2, $src1", "$src1, $src2, $src3",
+                           (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                            (i32 imm:$src3), (i32 FROUND_CURRENT)))>;
+
+  defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
+                         "$src3, $src2, $src1", "$src1, $src2, $src3",
+                         (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                         (i32 imm:$src3), (i32 FROUND_NO_EXC))), "{sae}">, EVEX_B;
+
+  let mayLoad = 1 in
+  defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr,
+                         "$src3, $src2, $src1", "$src1, $src2, $src3",
+                         (_.VT (X86RndScale (_.VT _.RC:$src1),
+                          (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+                          (i32 imm:$src3), (i32 FROUND_CURRENT)))>;
+  }
+  let Predicates = [HasAVX512] in {
+  def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>;
+  def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>;
+  def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>;
+  def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>;
+  def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>;
+
+  def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+             addr:$src, (i32 0x1))), _.FRC)>;
+  def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+             addr:$src, (i32 0x2))), _.FRC)>;
+  def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+             addr:$src, (i32 0x3))), _.FRC)>;
+  def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+             addr:$src, (i32 0x4))), _.FRC)>;
+  def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+             addr:$src, (i32 0xc))), _.FRC)>;
+  }
 }
 
-defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", ssmem, FR32X,
-                                SSEPackedSingle>, EVEX_CD8<32, CD8VT1>;
-                                
-defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", sdmem, FR64X,
-                                SSEPackedDouble>, EVEX_CD8<64, CD8VT1>;
-
-def : Pat<(ffloor FR32X:$src),
-          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x1))>;
-def : Pat<(f64 (ffloor FR64X:$src)),
-          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x1))>;
-def : Pat<(f32 (fnearbyint FR32X:$src)),
-          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0xC))>;
-def : Pat<(f64 (fnearbyint FR64X:$src)),
-          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0xC))>;
-def : Pat<(f32 (fceil FR32X:$src)),
-          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x2))>;
-def : Pat<(f64 (fceil FR64X:$src)),
-          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x2))>;
-def : Pat<(f32 (frint FR32X:$src)),
-          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x4))>;
-def : Pat<(f64 (frint FR64X:$src)),
-          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x4))>;
-def : Pat<(f32 (ftrunc FR32X:$src)),
-          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x3))>;
-def : Pat<(f64 (ftrunc FR64X:$src)),
-          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>;
+defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>,
+                                AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>;
 
+defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W,
+                                AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>;
+
+let Predicates = [HasAVX512] in {
 def : Pat<(v16f32 (ffloor VR512:$src)),
           (VRNDSCALEPSZr VR512:$src, (i32 0x1))>;
 def : Pat<(v16f32 (fnearbyint VR512:$src)),
@@ -4590,7 +4760,7 @@ def : Pat<(v8f64 (frint VR512:$src)),
           (VRNDSCALEPDZr VR512:$src, (i32 0x4))>;
 def : Pat<(v8f64 (ftrunc VR512:$src)),
           (VRNDSCALEPDZr VR512:$src, (i32 0x3))>;
-
+}
 //-------------------------------------------------
 // Integer truncate and extend operations
 //-------------------------------------------------
@@ -4600,32 +4770,32 @@ multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
                           RegisterClass KRC, X86MemOperand x86memop> {
   def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
                (ins srcRC:$src),
-               !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"),
+               !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
                []>, EVEX;
 
   def rrk : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
                (ins KRC:$mask, srcRC:$src),
                !strconcat(OpcodeStr,
-                 " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+                 "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
                []>, EVEX, EVEX_K;
 
   def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
                (ins KRC:$mask, srcRC:$src),
                !strconcat(OpcodeStr,
-                 " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+                 "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
                []>, EVEX, EVEX_KZ;
 
   def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src),
-               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                []>, EVEX;
 
   def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
                (ins x86memop:$dst, KRC:$mask, srcRC:$src),
-               !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"),
+               !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"),
                []>, EVEX, EVEX_K;
 
 }
-defm VPMOVQB    : avx512_trunc_sat<0x32, "vpmovqb",   VR128X, VR512, VK8WM, 
+defm VPMOVQB    : avx512_trunc_sat<0x32, "vpmovqb",   VR128X, VR512, VK8WM,
                                  i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
 defm VPMOVSQB   : avx512_trunc_sat<0x22, "vpmovsqb",  VR128X, VR512, VK8WM,
                                  i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
@@ -4679,151 +4849,158 @@ multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass KRC,
 
   def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
               (ins SrcRC:$src),
-              !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX;
 
   def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
               (ins KRC:$mask, SrcRC:$src),
-              !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"),
+              !strconcat(OpcodeStr, "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}"),
               []>, EVEX, EVEX_K;
 
   def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
               (ins KRC:$mask, SrcRC:$src),
-              !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+              !strconcat(OpcodeStr, "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
               []>, EVEX, EVEX_KZ;
 
   let mayLoad = 1 in {
     def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
               (ins x86memop:$src),
-              !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"),
+              !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst,
                 (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>,
               EVEX;
 
     def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
               (ins KRC:$mask, x86memop:$src),
-              !strconcat(OpcodeStr," \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"),
+              !strconcat(OpcodeStr,"\t{$src, $dst {${mask}} |$dst {${mask}}, $src}"),
               []>,
               EVEX, EVEX_K;
 
     def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
               (ins KRC:$mask, x86memop:$src),
-              !strconcat(OpcodeStr," \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+              !strconcat(OpcodeStr,"\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
               []>,
               EVEX, EVEX_KZ;
   }
 }
 
 defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VK16WM, VR512, VR128X, X86vzext,
-                             memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
+                             loadv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
                              EVEX_CD8<8, CD8VQ>;
 defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VK8WM, VR512, VR128X, X86vzext,
-                             memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
+                             loadv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
                              EVEX_CD8<8, CD8VO>;
 defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VK16WM, VR512, VR256X, X86vzext,
-                             memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
+                             loadv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
                              EVEX_CD8<16, CD8VH>;
 defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VK8WM, VR512, VR128X, X86vzext,
-                             memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
+                             loadv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
                              EVEX_CD8<16, CD8VQ>;
 defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VK8WM, VR512, VR256X, X86vzext,
-                             memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
+                             loadv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
                              EVEX_CD8<32, CD8VH>;
 
 defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VK16WM, VR512, VR128X, X86vsext,
-                             memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
+                             loadv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
                              EVEX_CD8<8, CD8VQ>;
 defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VK8WM, VR512, VR128X, X86vsext,
-                             memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
+                             loadv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
                              EVEX_CD8<8, CD8VO>;
 defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VK16WM, VR512, VR256X, X86vsext,
-                             memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
+                             loadv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
                              EVEX_CD8<16, CD8VH>;
 defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VK8WM, VR512, VR128X, X86vsext,
-                             memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
+                             loadv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
                              EVEX_CD8<16, CD8VQ>;
 defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VK8WM, VR512, VR256X, X86vsext,
-                             memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
+                             loadv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
                              EVEX_CD8<32, CD8VH>;
 
 //===----------------------------------------------------------------------===//
 // GATHER - SCATTER Operations
 
-multiclass avx512_gather<bits<8> opc, string OpcodeStr, RegisterClass KRC,
-                       RegisterClass RC, X86MemOperand memop> {
-let mayLoad = 1,
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         X86MemOperand memop, PatFrag GatherNode> {
+let mayLoad = 1, hasTwoExplicitDefs = 1,
   Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in
-  def rm  : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb),
-            (ins RC:$src1, KRC:$mask, memop:$src2),
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb),
+            (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2),
             !strconcat(OpcodeStr,
-            " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
-            []>, EVEX, EVEX_K;
+            "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+            [(set _.RC:$dst, _.KRCWM:$mask_wb,
+              (_.VT (GatherNode  (_.VT _.RC:$src1), _.KRCWM:$mask,
+                     vectoraddr:$src2)))]>, EVEX, EVEX_K,
+             EVEX_CD8<_.EltSize, CD8VT1>;
 }
 
 let ExeDomain = SSEPackedDouble in {
-defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>,
-                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>,
-                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", v8f64_info, vy64xmem,
+                                 mgatherv8i32>, EVEX_V512, VEX_W;
+defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", v8f64_info, vz64mem,
+                                 mgatherv8i64>, EVEX_V512, VEX_W;
 }
 
 let ExeDomain = SSEPackedSingle in {
-defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>,
-                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
-defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>,
-                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", v16f32_info, vz32mem,
+                                 mgatherv16i32>, EVEX_V512;
+defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", v8f32x_info, vz64mem,
+                                 mgatherv8i64>,  EVEX_V512;
 }
-  
-defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512,  vy64xmem>,
-                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>,
-                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
-
-defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", VK8WM, VR512,  vz64mem>,
-                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", VK8WM, VR256X,  vz64mem>,
-                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
-
-multiclass avx512_scatter<bits<8> opc, string OpcodeStr, RegisterClass KRC,
-                       RegisterClass RC, X86MemOperand memop> {
+
+defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", v8i64_info,  vy64xmem,
+                                 mgatherv8i32>, EVEX_V512, VEX_W;
+defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", v16i32_info, vz32mem,
+                                 mgatherv16i32>, EVEX_V512;
+
+defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", v8i64_info,  vz64mem,
+                                 mgatherv8i64>, EVEX_V512, VEX_W;
+defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", v8i32x_info,  vz64mem,
+                                 mgatherv8i64>, EVEX_V512;
+
+multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                          X86MemOperand memop, PatFrag ScatterNode> {
+
 let mayStore = 1, Constraints = "$mask = $mask_wb" in
-  def mr  : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb),
-            (ins memop:$dst, KRC:$mask, RC:$src2),
+
+  def mr  : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb),
+            (ins memop:$dst, _.KRCWM:$mask, _.RC:$src),
             !strconcat(OpcodeStr,
-            " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
-            []>, EVEX, EVEX_K;
+            "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+            [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src),
+                                     _.KRCWM:$mask,  vectoraddr:$dst))]>,
+            EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
 }
 
 let ExeDomain = SSEPackedDouble in {
-defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", VK8WM, VR512, vy64xmem>,
-                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", VK8WM, VR512, vz64mem>,
-                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", v8f64_info, vy64xmem,
+                                   mscatterv8i32>, EVEX_V512, VEX_W;
+defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", v8f64_info, vz64mem,
+                                   mscatterv8i64>, EVEX_V512, VEX_W;
 }
 
 let ExeDomain = SSEPackedSingle in {
-defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>,
-                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
-defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", VK8WM, VR256X, vz64mem>,
-                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", v16f32_info, vz32mem,
+                                   mscatterv16i32>, EVEX_V512;
+defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", v8f32x_info, vz64mem,
+                                   mscatterv8i64>, EVEX_V512;
 }
 
-defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", VK8WM, VR512, vy64xmem>,
-                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", VK16WM, VR512, vz32mem>,
-                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", v8i64_info, vy64xmem,
+                                   mscatterv8i32>, EVEX_V512, VEX_W;
+defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", v16i32_info, vz32mem,
+                                   mscatterv16i32>, EVEX_V512;
 
-defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>,
-                                  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>,
-                                  EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", v8i64_info, vz64mem,
+                                   mscatterv8i64>, EVEX_V512, VEX_W;
+defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", v8i32x_info, vz64mem,
+                                   mscatterv8i64>, EVEX_V512;
 
 // prefetch
 multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
                        RegisterClass KRC, X86MemOperand memop> {
   let Predicates = [HasPFI], hasSideEffects = 1 in
   def m  : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
-            !strconcat(OpcodeStr, " \t{$src {${mask}}|{${mask}}, $src}"),
+            !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"),
             []>, EVEX, EVEX_K;
 }
 
@@ -4838,7 +5015,7 @@ defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
 
 defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
                      VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
-                     
+
 defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
                      VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
 
@@ -4881,41 +5058,41 @@ multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop,
                       ValueType vt, string OpcodeStr, PatFrag mem_frag,
                       Domain d> {
   def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
-                   (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+                   (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                    !strconcat(OpcodeStr,
-                   " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                    [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                        (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
                    EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
   def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
-                   (ins RC:$src1, RC:$src2, i8imm:$src3),
+                   (ins RC:$src1, RC:$src2, u8imm:$src3),
                    !strconcat(OpcodeStr,
-                   " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                    [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                        (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
                    EVEX_4V, Sched<[WriteShuffle]>;
 }
 
-defm VSHUFPSZ  : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32,
+defm VSHUFPSZ  : avx512_shufp<VR512, f512mem, v16f32, "vshufps", loadv16f32,
                   SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VSHUFPDZ  : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64,
+defm VSHUFPDZ  : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", loadv8f64,
                   SSEPackedDouble>, PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
 
 def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
           (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>;
 def : Pat<(v16i32 (X86Shufp VR512:$src1,
-                    (memopv16i32 addr:$src2), (i8 imm:$imm))),
+                    (loadv16i32 addr:$src2), (i8 imm:$imm))),
           (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>;
 
 def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
           (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>;
 def : Pat<(v8i64 (X86Shufp VR512:$src1,
-                            (memopv8i64 addr:$src2), (i8 imm:$imm))),
+                            (loadv8i64 addr:$src2), (i8 imm:$imm))),
           (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
 
 multiclass avx512_valign<X86VectorVTInfo _> {
   defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst),
-                     (ins _.RC:$src1, _.RC:$src2, i8imm:$src3),
+                     (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
                      "valign"##_.Suffix,
                      "$src3, $src2, $src1", "$src1, $src2, $src3",
                      (_.VT (X86VAlign _.RC:$src2, _.RC:$src1,
@@ -4928,9 +5105,9 @@ multiclass avx512_valign<X86VectorVTInfo _> {
 
   let mayLoad = 1 in
   def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst),
-                     (ins _.RC:$src1, _.MemOp:$src2, i8imm:$src3),
+                     (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
                      !strconcat("valign"##_.Suffix,
-                     " \t{$src3, $src2, $src1, $dst|"
+                     "\t{$src3, $src2, $src1, $dst|"
                          "$dst, $src1, $src2, $src3}"),
                      []>, EVEX_4V;
 }
@@ -4946,43 +5123,43 @@ multiclass avx512_vpabs<bits<8> opc, string OpcodeStr, ValueType OpVT,
                         X86MemOperand x86memop, X86MemOperand x86scalar_mop,
                         string BrdcstStr> {
   def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-            !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
             []>, EVEX;
   def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src),
-             !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+             !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
              []>, EVEX, EVEX_K;
   def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src),
               !strconcat(OpcodeStr,
-                         " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+                         "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
               []>, EVEX, EVEX_KZ;
   let mayLoad = 1 in {
     def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
               (ins x86memop:$src),
-              !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
               []>, EVEX;
     def rmk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
                (ins KRC:$mask, x86memop:$src),
                !strconcat(OpcodeStr,
-                          " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+                          "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
                []>, EVEX, EVEX_K;
     def rmkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
                 (ins KRC:$mask, x86memop:$src),
                 !strconcat(OpcodeStr,
-                           " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+                           "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
                 []>, EVEX, EVEX_KZ;
     def rmb : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
                (ins x86scalar_mop:$src),
-               !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+               !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
                           ", $dst|$dst, ${src}", BrdcstStr, "}"),
                []>, EVEX, EVEX_B;
     def rmbk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
                 (ins KRC:$mask, x86scalar_mop:$src),
-                !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+                !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
                            ", $dst {${mask}}|$dst {${mask}}, ${src}", BrdcstStr, "}"),
                 []>, EVEX, EVEX_B, EVEX_K;
     def rmbkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
                  (ins KRC:$mask, x86scalar_mop:$src),
-                 !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+                 !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
                             ", $dst {${mask}} {z}|$dst {${mask}} {z}, ${src}",
                             BrdcstStr, "}"),
                  []>, EVEX, EVEX_B, EVEX_KZ;
@@ -5012,57 +5189,65 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pabs_q_512 (v8i64 VR512:$src),
                    (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
           (VPABSQZrr VR512:$src)>;
 
-multiclass avx512_conflict<bits<8> opc, string OpcodeStr, 
+multiclass avx512_conflict<bits<8> opc, string OpcodeStr,
                         RegisterClass RC, RegisterClass KRC,
                         X86MemOperand x86memop,
                         X86MemOperand x86scalar_mop, string BrdcstStr> {
+  let hasSideEffects = 0 in {
   def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src),
-       !strconcat(OpcodeStr, " \t{$src, ${dst} |${dst}, $src}"),
+       !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"),
        []>, EVEX;
+  let mayLoad = 1 in
   def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
        (ins x86memop:$src),
-       !strconcat(OpcodeStr, " \t{$src, ${dst}|${dst}, $src}"),
+       !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"),
        []>, EVEX;
+  let mayLoad = 1 in
   def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
        (ins x86scalar_mop:$src),
-       !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+       !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
                   ", ${dst}|${dst}, ${src}", BrdcstStr, "}"),
        []>, EVEX, EVEX_B;
   def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
        (ins KRC:$mask, RC:$src),
        !strconcat(OpcodeStr,
-                  " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+                  "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
        []>, EVEX, EVEX_KZ;
+  let mayLoad = 1 in
   def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
        (ins KRC:$mask, x86memop:$src),
        !strconcat(OpcodeStr,
-                  " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+                  "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
        []>, EVEX, EVEX_KZ;
+  let mayLoad = 1 in
   def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
        (ins KRC:$mask, x86scalar_mop:$src),
-       !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+       !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
                   ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}",
                   BrdcstStr, "}"),
        []>, EVEX, EVEX_KZ, EVEX_B;
-       
+
   let Constraints = "$src1 = $dst" in {
   def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, KRC:$mask, RC:$src2),
        !strconcat(OpcodeStr,
-                  " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+                  "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
        []>, EVEX, EVEX_K;
+  let mayLoad = 1 in
   def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, KRC:$mask, x86memop:$src2),
        !strconcat(OpcodeStr,
-                  " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+                  "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
        []>, EVEX, EVEX_K;
+  let mayLoad = 1 in
   def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2),
-       !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+       !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
                   ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"),
        []>, EVEX, EVEX_K, EVEX_B;
-   }
+  }
+  }
 }
 
 let Predicates = [HasCDI] in {
@@ -5109,11 +5294,11 @@ def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1,
           (VPLZCNTQrrk VR512:$src1,
            (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
 
-def : Pat<(v16i32 (ctlz (memopv16i32 addr:$src))),
+def : Pat<(v16i32 (ctlz (loadv16i32 addr:$src))),
           (VPLZCNTDrm addr:$src)>;
 def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))),
           (VPLZCNTDrr VR512:$src)>;
-def : Pat<(v8i64 (ctlz (memopv8i64 addr:$src))),
+def : Pat<(v8i64 (ctlz (loadv8i64 addr:$src))),
           (VPLZCNTQrm addr:$src)>;
 def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))),
           (VPLZCNTQrr VR512:$src)>;
@@ -5123,7 +5308,14 @@ def : Pat<(store (i1  1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
 def : Pat<(store (i1  0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
 
 def : Pat<(store VK1:$src, addr:$dst),
-          (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK16))>;
+          (MOV8mr addr:$dst,
+           (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
+            sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
+
+def : Pat<(store VK8:$src, addr:$dst),
+          (MOV8mr addr:$dst,
+           (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+            sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
 
 def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
                            (truncstore node:$val, node:$ptr), [{
@@ -5135,10 +5327,10 @@ def : Pat<(truncstorei1 GR8:$src, addr:$dst),
 
 multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
 def rr : AVX512XS8I<opc, MRMDestReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
-                  !strconcat(OpcodeStr##Vec.Suffix, " \t{$src, $dst|$dst, $src}"),
+                  !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
                   [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX;
 }
-          
+
 multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
                                  string OpcodeStr, Predicate prd> {
 let Predicates = [prd] in
@@ -5160,5 +5352,108 @@ multiclass avx512_convert_mask_to_vector<string OpcodeStr> {
   defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr,
                                        HasDQI>, VEX_W;
 }
-          
+
 defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - COMPRESS and EXPAND
+//
+multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _,
+                                 string OpcodeStr> {
+  def rrkz : AVX5128I<opc, MRMDestReg, (outs _.RC:$dst),
+              (ins _.KRCWM:$mask, _.RC:$src),
+              OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+              [(set _.RC:$dst, (_.VT (X86compress _.KRCWM:$mask, _.RC:$src,
+                                      _.ImmAllZerosV)))]>, EVEX_KZ;
+
+  let Constraints = "$src0 = $dst" in
+  def rrk : AVX5128I<opc, MRMDestReg, (outs _.RC:$dst),
+                    (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src),
+                    OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
+                    [(set _.RC:$dst, (_.VT (X86compress _.KRCWM:$mask, _.RC:$src,
+                                            _.RC:$src0)))]>, EVEX_K;
+
+  let mayStore = 1 in {
+  def mrk : AVX5128I<opc, MRMDestMem, (outs),
+              (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
+              OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
+              [(store (_.VT (X86compress _.KRCWM:$mask, _.RC:$src, undef)),
+                addr:$dst)]>,
+              EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+  }
+}
+
+multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
+                                 AVX512VLVectorVTInfo VTInfo> {
+  defm Z : compress_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+
+  let Predicates = [HasVLX] in {
+    defm Z256 : compress_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
+    defm Z128 : compress_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+  }
+}
+
+defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>,
+                                         EVEX;
+defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>,
+                                         EVEX, VEX_W;
+defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>,
+                                         EVEX;
+defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>,
+                                         EVEX, VEX_W;
+
+// expand
+multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
+                                 string OpcodeStr> {
+  def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+              (ins _.KRCWM:$mask, _.RC:$src),
+              OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+              [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, (_.VT _.RC:$src),
+                                      _.ImmAllZerosV)))]>, EVEX_KZ;
+
+  let Constraints = "$src0 = $dst" in
+  def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+                    (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src),
+                    OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
+                    [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask,
+                                      (_.VT _.RC:$src), _.RC:$src0)))]>, EVEX_K;
+
+  let mayLoad = 1, Constraints = "$src0 = $dst" in
+  def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+              (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src),
+              OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
+              [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask,
+                                      (_.VT (bitconvert
+                                              (_.LdFrag addr:$src))),
+                                      _.RC:$src0)))]>,
+              EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+  
+  let mayLoad = 1 in
+  def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+              (ins _.KRCWM:$mask, _.MemOp:$src),
+              OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+              [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask,
+                                      (_.VT (bitconvert (_.LdFrag addr:$src))),
+                                     _.ImmAllZerosV)))]>,
+              EVEX_KZ, EVEX_CD8<_.EltSize, CD8VT1>;
+  
+}
+
+multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
+                                 AVX512VLVectorVTInfo VTInfo> {
+  defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+
+  let Predicates = [HasVLX] in {
+    defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
+    defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+  }
+}
+
+defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>,
+                                         EVEX;
+defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>,
+                                         EVEX, VEX_W;
+defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>,
+                                         EVEX;
+defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
+                                         EVEX, VEX_W;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 25e1e80..78efc4d 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -15,13 +15,13 @@
 //===----------------------------------------------------------------------===//
 // LEA - Load Effective Address
 let SchedRW = [WriteLEA] in {
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def LEA16r   : I<0x8D, MRMSrcMem,
-                 (outs GR16:$dst), (ins i32mem:$src),
+                 (outs GR16:$dst), (ins anymem:$src),
                  "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16;
 let isReMaterializable = 1 in
 def LEA32r   : I<0x8D, MRMSrcMem,
-                 (outs GR32:$dst), (ins i32mem:$src),
+                 (outs GR32:$dst), (ins anymem:$src),
                  "lea{l}\t{$src|$dst}, {$dst|$src}",
                  [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
                  OpSize32, Requires<[Not64BitMode]>;
@@ -65,18 +65,18 @@ def MUL8r  : I<0xF6, MRM4r, (outs),  (ins GR8:$src), "mul{b}\t$src",
                [(set AL, (mul AL, GR8:$src)),
                 (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>;
 // AX,DX = AX*GR16
-let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
+let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in
 def MUL16r : I<0xF7, MRM4r, (outs),  (ins GR16:$src),
                "mul{w}\t$src",
                [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>;
 // EAX,EDX = EAX*GR32
-let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in
 def MUL32r : I<0xF7, MRM4r, (outs),  (ins GR32:$src),
                "mul{l}\t$src",
                [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
                IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>;
 // RAX,RDX = RAX*GR64
-let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in
 def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
                 "mul{q}\t$src",
                 [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
@@ -91,7 +91,7 @@ def MUL8m  : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
                [(set AL, (mul AL, (loadi8 addr:$src))),
                 (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>;
 // AX,DX = AX*[mem16]
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0 in {
 let Defs = [AX,DX,EFLAGS], Uses = [AX] in
 def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
                "mul{w}\t$src",
@@ -107,7 +107,7 @@ def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
                 "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>;
 }
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 // AL,AH = AL*GR8
 let Defs = [AL,EFLAGS,AX], Uses = [AL] in
 def IMUL8r  : I<0xF6, MRM5r, (outs),  (ins GR8:$src), "imul{b}\t$src", [],
@@ -145,7 +145,7 @@ let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
 def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
                  "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>;
 }
-} // neverHasSideEffects
+} // hasSideEffects
 
 
 let Defs = [EFLAGS] in {
@@ -456,64 +456,29 @@ def INC8r  : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
                "inc{b}\t$dst",
                [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))],
                IIC_UNARY_REG>;
-
-let isConvertibleToThreeAddress = 1, CodeSize = 1 in {  // Can xform into LEA.
-def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
+def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
                "inc{w}\t$dst",
-               [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>,
-             OpSize16, Requires<[Not64BitMode]>;
-def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+               [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
+               IIC_UNARY_REG>, OpSize16;
+def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
                "inc{l}\t$dst",
                [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
-               IIC_UNARY_REG>,
-             OpSize32, Requires<[Not64BitMode]>;
+               IIC_UNARY_REG>, OpSize32;
 def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
                 [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
                 IIC_UNARY_REG>;
-} // isConvertibleToThreeAddress = 1, CodeSize = 1
-
-
-// In 64-bit mode, single byte INC and DEC cannot be encoded.
-let isConvertibleToThreeAddress = 1, CodeSize = 2 in {
-// Can transform into LEA.
-def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
-                  "inc{w}\t$dst",
-                  [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
-                  IIC_UNARY_REG>,
-                OpSize16, Requires<[In64BitMode]>;
-def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
-                  "inc{l}\t$dst",
-                  [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
-                  IIC_UNARY_REG>,
-                OpSize32, Requires<[In64BitMode]>;
-def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
-                  "dec{w}\t$dst",
-                  [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
-                  IIC_UNARY_REG>,
-                OpSize16, Requires<[In64BitMode]>;
-def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
-                  "dec{l}\t$dst",
-                  [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
-                  IIC_UNARY_REG>,
-                OpSize32, Requires<[In64BitMode]>;
 } // isConvertibleToThreeAddress = 1, CodeSize = 2
 
-let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
-    CodeSize = 2 in {
-def INC32_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
-                  "inc{w}\t$dst", [], IIC_UNARY_REG>,
-                OpSize16, Requires<[Not64BitMode]>;
-def INC32_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
-                  "inc{l}\t$dst", [], IIC_UNARY_REG>,
-                OpSize32, Requires<[Not64BitMode]>;
-def DEC32_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
-                  "dec{w}\t$dst", [], IIC_UNARY_REG>,
-                OpSize16, Requires<[Not64BitMode]>;
-def DEC32_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
-                  "dec{l}\t$dst", [], IIC_UNARY_REG>,
-                OpSize32, Requires<[Not64BitMode]>;
-} // isCodeGenOnly = 1, ForceDisassemble = 1, HasSideEffects = 0, CodeSize = 2
-
+// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
+let CodeSize = 1, hasSideEffects = 0 in {
+def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+                   "inc{w}\t$dst", [], IIC_UNARY_REG>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+                   "inc{l}\t$dst", [], IIC_UNARY_REG>,
+                 OpSize32, Requires<[Not64BitMode]>;
+} // CodeSize = 1, hasSideEffects = 0
 } // Constraints = "$src1 = $dst", SchedRW
 
 let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
@@ -522,35 +487,13 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
                 (implicit EFLAGS)], IIC_UNARY_MEM>;
   def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
                [(store (add (loadi16 addr:$dst), 1), addr:$dst),
-                (implicit EFLAGS)], IIC_UNARY_MEM>,
-               OpSize16, Requires<[Not64BitMode]>;
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
   def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
                [(store (add (loadi32 addr:$dst), 1), addr:$dst),
-                (implicit EFLAGS)], IIC_UNARY_MEM>,
-               OpSize32, Requires<[Not64BitMode]>;
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
   def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
                   [(store (add (loadi64 addr:$dst), 1), addr:$dst),
                    (implicit EFLAGS)], IIC_UNARY_MEM>;
-
-// These are duplicates of their 32-bit counterparts. Only needed so X86 knows
-// how to unfold them.
-// FIXME: What is this for??
-def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
-                  [(store (add (loadi16 addr:$dst), 1), addr:$dst),
-                    (implicit EFLAGS)], IIC_UNARY_MEM>,
-                OpSize16, Requires<[In64BitMode]>;
-def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
-                  [(store (add (loadi32 addr:$dst), 1), addr:$dst),
-                    (implicit EFLAGS)], IIC_UNARY_MEM>,
-                OpSize32, Requires<[In64BitMode]>;
-def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
-                  [(store (add (loadi16 addr:$dst), -1), addr:$dst),
-                    (implicit EFLAGS)], IIC_UNARY_MEM>,
-                OpSize16, Requires<[In64BitMode]>;
-def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
-                  [(store (add (loadi32 addr:$dst), -1), addr:$dst),
-                    (implicit EFLAGS)], IIC_UNARY_MEM>,
-                OpSize32, Requires<[In64BitMode]>;
 } // CodeSize = 2, SchedRW
 
 let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
@@ -559,21 +502,29 @@ def DEC8r  : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
                "dec{b}\t$dst",
                [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
                IIC_UNARY_REG>;
-let isConvertibleToThreeAddress = 1, CodeSize = 1 in {   // Can xform into LEA.
-def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
+def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
                "dec{w}\t$dst",
                [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
-               IIC_UNARY_REG>,
-             OpSize16, Requires<[Not64BitMode]>;
-def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+               IIC_UNARY_REG>, OpSize16;
+def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
                "dec{l}\t$dst",
                [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
-               IIC_UNARY_REG>,
-             OpSize32, Requires<[Not64BitMode]>;
+               IIC_UNARY_REG>, OpSize32;
 def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
                 [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
                 IIC_UNARY_REG>;
-} // CodeSize = 2
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
+let CodeSize = 1, hasSideEffects = 0 in {
+def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+                   "dec{w}\t$dst", [], IIC_UNARY_REG>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+                   "dec{l}\t$dst", [], IIC_UNARY_REG>,
+                 OpSize32, Requires<[Not64BitMode]>;
+} // CodeSize = 1, hasSideEffects = 0
 } // Constraints = "$src1 = $dst", SchedRW
 
 
@@ -583,12 +534,10 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
                 (implicit EFLAGS)], IIC_UNARY_MEM>;
   def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
                [(store (add (loadi16 addr:$dst), -1), addr:$dst),
-                (implicit EFLAGS)], IIC_UNARY_MEM>,
-               OpSize16, Requires<[Not64BitMode]>;
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
   def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
                [(store (add (loadi32 addr:$dst), -1), addr:$dst),
-                (implicit EFLAGS)], IIC_UNARY_MEM>,
-               OpSize32, Requires<[Not64BitMode]>;
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
   def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
                   [(store (add (loadi64 addr:$dst), -1), addr:$dst),
                    (implicit EFLAGS)], IIC_UNARY_MEM>;
@@ -710,15 +659,6 @@ class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
         mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
     Sched<[WriteALU]>;
 
-// BinOpRR_R - Instructions like "add reg, reg, reg", where the pattern has
-// just a regclass (no eflags) as a result.
-class BinOpRR_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-                SDNode opnode>
-  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
-            [(set typeinfo.RegClass:$dst,
-                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
-                  IIC_BIN_NONMEM>;
-
 // BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has
 // just a EFLAGS as a result.
 class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -825,13 +765,6 @@ class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   let ImmT = typeinfo.ImmEncoding;
 }
 
-// BinOpRI_R - Instructions like "add reg, reg, imm".
-class BinOpRI_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-                SDNode opnode, Format f>
-  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
-            [(set typeinfo.RegClass:$dst,
-                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
-
 // BinOpRI_F - Instructions like "cmp reg, imm".
 class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
                 SDPatternOperator opnode, Format f>
@@ -864,30 +797,23 @@ class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   let ImmT = Imm8; // Always 8-bit immediate.
 }
 
-// BinOpRI8_R - Instructions like "add reg, reg, imm8".
-class BinOpRI8_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-                  SDNode opnode, Format f>
-  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
-             [(set typeinfo.RegClass:$dst,
-               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
-
 // BinOpRI8_F - Instructions like "cmp reg, imm8".
 class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-                  SDNode opnode, Format f>
+                  SDPatternOperator opnode, Format f>
   : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs),
              [(set EFLAGS,
                (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
 
 // BinOpRI8_RF - Instructions like "add reg, reg, imm8".
 class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-                  SDNode opnode, Format f>
+                  SDPatternOperator opnode, Format f>
   : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
              [(set typeinfo.RegClass:$dst, EFLAGS,
                (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
 
 // BinOpRI8_RFF - Instructions like "adc reg, reg, imm8".
 class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-                   SDNode opnode, Format f>
+                   SDPatternOperator opnode, Format f>
   : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
              [(set typeinfo.RegClass:$dst, EFLAGS,
                (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
@@ -923,8 +849,8 @@ class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
             [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>;
 
 // BinOpMI - Instructions like "add [mem], imm".
-class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
-              Format f, list<dag> pattern, bits<8> opcode = 0x80,
+class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              Format f, list<dag> pattern,
               InstrItinClass itin = IIC_BIN_MEM>
   : ITy<opcode, f, typeinfo,
         (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
@@ -934,27 +860,26 @@ class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
 }
 
 // BinOpMI_RMW - Instructions like "add [mem], imm".
-class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo,
+class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
                   SDNode opnode, Format f>
-  : BinOpMI<mnemonic, typeinfo, f,
+  : BinOpMI<opcode, mnemonic, typeinfo, f,
             [(store (opnode (typeinfo.VT (load addr:$dst)),
                             typeinfo.ImmOperator:$src), addr:$dst),
              (implicit EFLAGS)]>;
 // BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
-class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
-                  SDNode opnode, Format f>
-  : BinOpMI<mnemonic, typeinfo, f,
+class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                     SDNode opnode, Format f>
+  : BinOpMI<opcode, mnemonic, typeinfo, f,
             [(store (opnode (typeinfo.VT (load addr:$dst)),
                             typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
-             (implicit EFLAGS)], 0x80, IIC_BIN_CARRY_MEM>;
+             (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
 
 // BinOpMI_F - Instructions like "cmp [mem], imm".
-class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
-                SDPatternOperator opnode, Format f, bits<8> opcode = 0x80>
-  : BinOpMI<mnemonic, typeinfo, f,
+class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f>
+  : BinOpMI<opcode, mnemonic, typeinfo, f,
             [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)),
-                                               typeinfo.ImmOperator:$src))],
-            opcode>;
+                                               typeinfo.ImmOperator:$src))]>;
 
 // BinOpMI8 - Instructions like "add [mem], imm8".
 class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
@@ -969,7 +894,7 @@ class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
 
 // BinOpMI8_RMW - Instructions like "add [mem], imm8".
 class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo,
-                   SDNode opnode, Format f>
+                   SDPatternOperator opnode, Format f>
   : BinOpMI8<mnemonic, typeinfo, f,
              [(store (opnode (load addr:$dst),
                              typeinfo.Imm8Operator:$src), addr:$dst),
@@ -977,7 +902,7 @@ class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo,
 
 // BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8".
 class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
-                   SDNode opnode, Format f>
+                      SDPatternOperator opnode, Format f>
   : BinOpMI8<mnemonic, typeinfo, f,
              [(store (opnode (load addr:$dst),
                              typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
@@ -985,7 +910,7 @@ class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
 
 // BinOpMI8_F - Instructions like "cmp [mem], imm8".
 class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
-                 SDNode opnode, Format f>
+                 SDPatternOperator opnode, Format f>
   : BinOpMI8<mnemonic, typeinfo, f,
              [(set EFLAGS, (opnode (load addr:$dst),
                                    typeinfo.Imm8Operator:$src))]>;
@@ -1023,12 +948,13 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
                          bit CommutableRR, bit ConvertibleToThreeAddress> {
   let Defs = [EFLAGS] in {
     let Constraints = "$src1 = $dst" in {
-      let isCommutable = CommutableRR,
-          isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+      let isCommutable = CommutableRR in {
         def NAME#8rr  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
-        def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
-        def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
-        def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+        let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+          def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
+          def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
+          def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+        } // isConvertibleToThreeAddress
       } // isCommutable
 
       def NAME#8rr_REV  : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
@@ -1041,6 +967,8 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
       def NAME#32rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
       def NAME#64rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
 
+      def NAME#8ri   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+
       let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
         // NOTE: These are order specific, we want the ri8 forms to be listed
         // first so that they are slightly preferred to the ri forms.
@@ -1048,7 +976,6 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
         def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>;
         def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>;
 
-        def NAME#8ri   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
         def NAME#16ri  : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>;
         def NAME#32ri  : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>;
         def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>;
@@ -1066,10 +993,20 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
     def NAME#32mi8  : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>;
     def NAME#64mi8  : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>;
 
-    def NAME#8mi    : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>;
-    def NAME#16mi   : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>;
-    def NAME#32mi   : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>;
-    def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>;
+    def NAME#8mi    : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+    // These are for the disassembler since 0x82 opcode behaves like 0x80, but
+    // not in 64-bit mode.
+    let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+        hasSideEffects = 0 in {
+      let Constraints = "$src1 = $dst" in
+        def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+      let mayLoad = 1, mayStore = 1 in
+        def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, null_frag, MemMRM>;
+    }
   } // Defs = [EFLAGS]
 
   def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
@@ -1094,12 +1031,13 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
                            bit ConvertibleToThreeAddress> {
   let Uses = [EFLAGS], Defs = [EFLAGS] in {
     let Constraints = "$src1 = $dst" in {
-      let isCommutable = CommutableRR,
-          isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+      let isCommutable = CommutableRR in {
         def NAME#8rr  : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>;
-        def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
-        def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
-        def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
+        let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+          def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
+          def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
+          def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
+        } // isConvertibleToThreeAddress
       } // isCommutable
 
       def NAME#8rr_REV  : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>;
@@ -1112,6 +1050,8 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
       def NAME#32rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>;
       def NAME#64rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>;
 
+      def NAME#8ri   : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+
       let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
         // NOTE: These are order specific, we want the ri8 forms to be listed
         // first so that they are slightly preferred to the ri forms.
@@ -1119,7 +1059,6 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
         def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>;
         def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>;
 
-        def NAME#8ri   : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
         def NAME#16ri  : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>;
         def NAME#32ri  : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>;
         def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>;
@@ -1137,10 +1076,20 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
     def NAME#32mi8  : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
     def NAME#64mi8  : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
 
-    def NAME#8mi    : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>;
-    def NAME#16mi   : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
-    def NAME#32mi   : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
-    def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+    def NAME#8mi    : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+    // These are for the disassembler since 0x82 opcode behaves like 0x80, but
+    // not in 64-bit mode.
+    let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+        hasSideEffects = 0 in {
+      let Constraints = "$src1 = $dst" in
+        def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+      let mayLoad = 1, mayStore = 1 in
+        def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, null_frag, MemMRM>;
+    }
   } // Uses = [EFLAGS], Defs = [EFLAGS]
 
   def NAME#8i8   : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL,
@@ -1162,12 +1111,13 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
                         SDNode opnode,
                         bit CommutableRR, bit ConvertibleToThreeAddress> {
   let Defs = [EFLAGS] in {
-    let isCommutable = CommutableRR,
-        isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+    let isCommutable = CommutableRR in {
       def NAME#8rr  : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
-      def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
-      def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
-      def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
+        def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
+        def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+      }
     } // isCommutable
 
     def NAME#8rr_REV  : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
@@ -1180,6 +1130,8 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
     def NAME#32rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>;
     def NAME#64rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
 
+    def NAME#8ri   : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+
     let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
       // NOTE: These are order specific, we want the ri8 forms to be listed
       // first so that they are slightly preferred to the ri forms.
@@ -1187,7 +1139,6 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
       def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>;
       def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>;
 
-      def NAME#8ri   : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
       def NAME#16ri  : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
       def NAME#32ri  : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
       def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
@@ -1204,10 +1155,19 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
     def NAME#32mi8  : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
     def NAME#64mi8  : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
 
-    def NAME#8mi    : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>;
-    def NAME#16mi   : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>;
-    def NAME#32mi   : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
-    def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
+    def NAME#8mi    : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+    // These are for the disassembler since 0x82 opcode behaves like 0x80, but
+    // not in 64-bit mode.
+    let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+        hasSideEffects = 0 in {
+      def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+      let mayLoad = 1 in
+        def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, null_frag, MemMRM>;
+    }
   } // Defs = [EFLAGS]
 
   def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
@@ -1272,15 +1232,15 @@ let isCompare = 1 in {
     def TEST32ri   : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
     def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
 
-    def TEST8mi    : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>;
-    def TEST16mi   : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>;
-    def TEST32mi   : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
-    def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
+    def TEST8mi    : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
+    def TEST16mi   : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>;
+    def TEST32mi   : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
+    def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
 
     // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
     // register class is constrained to GR8_NOREX. This pseudo is explicitly
     // marked side-effect free, since it doesn't have an isel pattern like
-    // other test instructions. 
+    // other test instructions.
     let isPseudo = 1, hasSideEffects = 0 in
     def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
                           "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
@@ -1332,7 +1292,7 @@ let Predicates = [HasBMI] in {
 // MULX Instruction
 //
 multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> {
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   let isCommutable = 1 in
   def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
              !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
@@ -1355,19 +1315,19 @@ let Predicates = [HasBMI2] in {
 //===----------------------------------------------------------------------===//
 // ADCX Instruction
 //
-let hasSideEffects = 0, Defs = [EFLAGS], Uses = [EFLAGS],
+let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS],
     Constraints = "$src0 = $dst", AddedComplexity = 10 in {
   let SchedRW = [WriteALU] in {
   def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst),
              (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
              [(set GR32:$dst, EFLAGS,
                  (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))],
-             IIC_BIN_CARRY_NONMEM>, T8PD, Requires<[HasADX]>;
+             IIC_BIN_CARRY_NONMEM>, T8PD;
   def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst),
              (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
              [(set GR64:$dst, EFLAGS,
                  (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))],
-             IIC_BIN_CARRY_NONMEM>, T8PD, Requires<[HasADX, In64BitMode]>;
+             IIC_BIN_CARRY_NONMEM>, T8PD;
   } // SchedRW
 
   let mayLoad = 1, SchedRW = [WriteALULd] in {
@@ -1375,37 +1335,34 @@ let hasSideEffects = 0, Defs = [EFLAGS], Uses = [EFLAGS],
              (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
              [(set GR32:$dst, EFLAGS,
                  (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))],
-             IIC_BIN_CARRY_MEM>, T8PD, Requires<[HasADX]>;
+             IIC_BIN_CARRY_MEM>, T8PD;
 
   def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst),
              (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
              [(set GR64:$dst, EFLAGS,
                  (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))],
-             IIC_BIN_CARRY_MEM>, T8PD, Requires<[HasADX, In64BitMode]>;
+             IIC_BIN_CARRY_MEM>, T8PD;
   }
 }
 
 //===----------------------------------------------------------------------===//
 // ADOX Instruction
 //
-let hasSideEffects = 0, Defs = [EFLAGS], Uses = [EFLAGS] in {
+let Predicates = [HasADX], hasSideEffects = 0, Defs = [EFLAGS],
+    Uses = [EFLAGS] in {
   let SchedRW = [WriteALU] in {
   def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
-             "adox{l}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_NONMEM>, T8XS, Requires<[HasADX]>;
+             "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
 
   def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
-             "adox{q}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_NONMEM>, T8XS, Requires<[HasADX, In64BitMode]>;
+             "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
   } // SchedRW
 
   let mayLoad = 1, SchedRW = [WriteALULd] in {
   def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
-             "adox{l}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_MEM>, T8XS, Requires<[HasADX]>;
+             "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
 
   def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
-             "adox{q}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_MEM>, T8XS, Requires<[HasADX, In64BitMode]>;
+             "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
   }
 }
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 117b6ff..18bbe5d 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -32,7 +32,7 @@ def GetLo8XForm : SDNodeXForm<imm, [{
 // PIC base construction.  This expands to code that looks like this:
 //     call  $next_inst
 //     popl %destreg"
-let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in
+let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
   def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
                       "", []>;
 
@@ -43,15 +43,18 @@ let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber EFLAGS.
 let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKDOWN",
-                           [(X86callseq_start timm:$amt)]>,
+                           []>,
                           Requires<[NotLP64]>;
 def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKUP",
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
                           Requires<[NotLP64]>;
 }
+def : Pat<(X86callseq_start timm:$amt1),
+          (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+
 
 // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
 // a stack adjustment and the codegen must know that they may modify the stack
@@ -59,16 +62,17 @@ def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber EFLAGS.
 let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKDOWN",
-                           [(X86callseq_start timm:$amt)]>,
+                           []>,
                           Requires<[IsLP64]>;
 def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKUP",
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
                           Requires<[IsLP64]>;
 }
-
+def : Pat<(X86callseq_start timm:$amt1),
+          (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
 
 
 // x86-64 va_start lowering magic.
@@ -259,7 +263,7 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
 // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
 // that would make it more difficult to rematerialize.
 let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
-    isCodeGenOnly = 1, neverHasSideEffects = 1 in
+    isCodeGenOnly = 1, hasSideEffects = 0 in
 def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src),
                      "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>;
 
@@ -471,59 +475,50 @@ def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
 //===----------------------------------------------------------------------===//
 // Conditional Move Pseudo Instructions
 
-// X86 doesn't have 8-bit conditional moves. Use a customInserter to
-// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
-// however that requires promoting the operands, and can induce additional
-// i8 register pressure.
-let usesCustomInserter = 1, Uses = [EFLAGS] in {
-def CMOV_GR8 : I<0, Pseudo,
-                 (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
-                 "#CMOV_GR8 PSEUDO!",
-                 [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
-                                          imm:$cond, EFLAGS))]>;
-
-let Predicates = [NoCMov] in {
-def CMOV_GR32 : I<0, Pseudo,
-                    (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
-                    "#CMOV_GR32* PSEUDO!",
-                    [(set GR32:$dst,
-                      (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
-def CMOV_GR16 : I<0, Pseudo,
-                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
-                    "#CMOV_GR16* PSEUDO!",
-                    [(set GR16:$dst,
-                      (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
-} // Predicates = [NoCMov]
-
-// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
-// SSE1.
-let Predicates = [FPStackf32] in
-def CMOV_RFP32 : I<0, Pseudo,
-                    (outs RFP32:$dst),
-                    (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
-                    "#CMOV_RFP32 PSEUDO!",
-                    [(set RFP32:$dst,
-                      (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
-                                                  EFLAGS))]>;
-// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
-// SSE2.
-let Predicates = [FPStackf64] in
-def CMOV_RFP64 : I<0, Pseudo,
-                    (outs RFP64:$dst),
-                    (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
-                    "#CMOV_RFP64 PSEUDO!",
-                    [(set RFP64:$dst,
-                      (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond,
-                                                  EFLAGS))]>;
-def CMOV_RFP80 : I<0, Pseudo,
-                    (outs RFP80:$dst),
-                    (ins RFP80:$src1, RFP80:$src2, i8imm:$cond),
-                    "#CMOV_RFP80 PSEUDO!",
-                    [(set RFP80:$dst,
-                      (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
-                                                  EFLAGS))]>;
-} // UsesCustomInserter = 1, Uses = [EFLAGS]
+// CMOV* - Used to implement the SELECT DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> {
+  def CMOV#NAME  : I<0, Pseudo,
+                    (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond),
+                    "#CMOV_"#NAME#" PSEUDO!",
+                    [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond,
+                                                EFLAGS)))]>;
+}
 
+let usesCustomInserter = 1, Uses = [EFLAGS] in {
+  // X86 doesn't have 8-bit conditional moves. Use a customInserter to
+  // emit control flow. An alternative to this is to mark i8 SELECT as Promote,
+  // however that requires promoting the operands, and can induce additional
+  // i8 register pressure.
+  defm _GR8 : CMOVrr_PSEUDO<GR8, i8>;
+
+  let Predicates = [NoCMov] in {
+    defm _GR32 : CMOVrr_PSEUDO<GR32, i32>;
+    defm _GR16 : CMOVrr_PSEUDO<GR16, i16>;
+  } // Predicates = [NoCMov]
+
+  // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
+  // SSE1/SSE2.
+  let Predicates = [FPStackf32] in
+    defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>;
+
+  let Predicates = [FPStackf64] in
+    defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>;
+
+  defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>;
+
+  defm _FR32   : CMOVrr_PSEUDO<FR32, f32>;
+  defm _FR64   : CMOVrr_PSEUDO<FR64, f64>;
+  defm _V4F32  : CMOVrr_PSEUDO<VR128, v4f32>;
+  defm _V2F64  : CMOVrr_PSEUDO<VR128, v2f64>;
+  defm _V2I64  : CMOVrr_PSEUDO<VR128, v2i64>;
+  defm _V8F32  : CMOVrr_PSEUDO<VR256, v8f32>;
+  defm _V4F64  : CMOVrr_PSEUDO<VR256, v4f64>;
+  defm _V4I64  : CMOVrr_PSEUDO<VR256, v4i64>;
+  defm _V8I64  : CMOVrr_PSEUDO<VR512, v8i64>;
+  defm _V8F64  : CMOVrr_PSEUDO<VR512, v8f64>;
+  defm _V16F32 : CMOVrr_PSEUDO<VR512, v16f32>;
+} // usesCustomInserter = 1, Uses = [EFLAGS]
 
 //===----------------------------------------------------------------------===//
 // Normal-Instructions-With-Lock-Prefix Pseudo Instructions
@@ -600,12 +595,12 @@ def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                                  "{$src2, $dst|$dst, $src2}"),
                       [], IIC_ALU_MEM>, OpSize32, LOCK;
 
-def NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
-                         ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
-                         ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
-                         !strconcat(mnemonic, "{q}\t",
-                                    "{$src2, $dst|$dst, $src2}"),
-                         [], IIC_ALU_MEM>, LOCK;
+def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                          ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                          ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
+                          !strconcat(mnemonic, "{q}\t",
+                                     "{$src2, $dst|$dst, $src2}"),
+                          [], IIC_ALU_MEM>, LOCK;
 
 def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
@@ -859,79 +854,6 @@ def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
 def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
                       "#ACQUIRE_MOV PSEUDO!",
                       [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
-//===----------------------------------------------------------------------===//
-// Conditional Move Pseudo Instructions.
-//===----------------------------------------------------------------------===//
-
-// CMOV* - Used to implement the SSE SELECT DAG operation.  Expanded after
-// instruction selection into a branch sequence.
-let Uses = [EFLAGS], usesCustomInserter = 1 in {
-  def CMOV_FR32 : I<0, Pseudo,
-                    (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
-                    "#CMOV_FR32 PSEUDO!",
-                    [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
-                                                  EFLAGS))]>;
-  def CMOV_FR64 : I<0, Pseudo,
-                    (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
-                    "#CMOV_FR64 PSEUDO!",
-                    [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
-                                                  EFLAGS))]>;
-  def CMOV_V4F32 : I<0, Pseudo,
-                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
-                    "#CMOV_V4F32 PSEUDO!",
-                    [(set VR128:$dst,
-                      (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
-                                          EFLAGS)))]>;
-  def CMOV_V2F64 : I<0, Pseudo,
-                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
-                    "#CMOV_V2F64 PSEUDO!",
-                    [(set VR128:$dst,
-                      (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
-                                          EFLAGS)))]>;
-  def CMOV_V2I64 : I<0, Pseudo,
-                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
-                    "#CMOV_V2I64 PSEUDO!",
-                    [(set VR128:$dst,
-                      (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
-                                          EFLAGS)))]>;
-  def CMOV_V8F32 : I<0, Pseudo,
-                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
-                    "#CMOV_V8F32 PSEUDO!",
-                    [(set VR256:$dst,
-                      (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond,
-                                          EFLAGS)))]>;
-  def CMOV_V4F64 : I<0, Pseudo,
-                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
-                    "#CMOV_V4F64 PSEUDO!",
-                    [(set VR256:$dst,
-                      (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
-                                          EFLAGS)))]>;
-  def CMOV_V4I64 : I<0, Pseudo,
-                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
-                    "#CMOV_V4I64 PSEUDO!",
-                    [(set VR256:$dst,
-                      (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
-                                          EFLAGS)))]>;
-  def CMOV_V8I64 : I<0, Pseudo,
-                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
-                    "#CMOV_V8I64 PSEUDO!",
-                    [(set VR512:$dst,
-                      (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
-                                          EFLAGS)))]>;
-  def CMOV_V8F64 : I<0, Pseudo,
-                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
-                    "#CMOV_V8F64 PSEUDO!",
-                    [(set VR512:$dst,
-                      (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
-                                          EFLAGS)))]>;
-  def CMOV_V16F32 : I<0, Pseudo,
-                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
-                    "#CMOV_V16F32 PSEUDO!",
-                    [(set VR512:$dst,
-                      (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond,
-                                          EFLAGS)))]>;
-}
-
 
 //===----------------------------------------------------------------------===//
 // DAG Pattern Matching Rules
@@ -1010,6 +932,9 @@ def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
           (MOV64mi32 addr:$dst, tblockaddress:$src)>,
           Requires<[NearData, IsStatic]>;
 
+def : Pat<(i32 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+def : Pat<(i64 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV64ri texternalsym:$dst)>;
+
 // Calls
 
 // tls has some funny stuff here...
@@ -1058,12 +983,12 @@ def : Pat<(X86tcret (load addr:$dst), imm:$off),
           Requires<[Not64BitMode, IsNotPIC]>;
 
 def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
-          (TCRETURNdi texternalsym:$dst, imm:$off)>,
-          Requires<[Not64BitMode]>;
+          (TCRETURNdi tglobaladdr:$dst, imm:$off)>,
+          Requires<[NotLP64]>;
 
 def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
           (TCRETURNdi texternalsym:$dst, imm:$off)>,
-          Requires<[Not64BitMode]>;
+          Requires<[NotLP64]>;
 
 def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
           (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
@@ -1077,11 +1002,11 @@ def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
 
 def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
           (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
-          Requires<[In64BitMode]>;
+          Requires<[IsLP64]>;
 
 def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
           (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
-          Requires<[In64BitMode]>;
+          Requires<[IsLP64]>;
 
 // Normal calls, with various flavors of addresses.
 def : Pat<(X86call (i32 tglobaladdr:$dst)),
@@ -1556,8 +1481,12 @@ def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
 def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
 
 // Helper imms that check if a mask doesn't change significant shift bits.
-def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>;
-def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>;
+def immShift32 : ImmLeaf<i8, [{
+  return countTrailingOnes<uint64_t>(Imm) >= 5;
+}]>;
+def immShift64 : ImmLeaf<i8, [{
+  return countTrailingOnes<uint64_t>(Imm) >= 6;
+}]>;
 
 // Shift amount is implicitly masked.
 multiclass MaskedShiftAmountPats<SDNode frag, string name> {
@@ -1724,35 +1653,18 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
 def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
           (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
 
-// Increment reg.
-// Do not make INC if it is slow
-def : Pat<(add GR8:$src, 1),
-          (INC8r GR8:$src)>, Requires<[NotSlowIncDec]>;
-def : Pat<(add GR16:$src, 1),
-          (INC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
-def : Pat<(add GR16:$src, 1),
-          (INC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
-def : Pat<(add GR32:$src, 1),
-          (INC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
-def : Pat<(add GR32:$src, 1),
-          (INC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
-def : Pat<(add GR64:$src, 1),
-          (INC64r GR64:$src)>, Requires<[NotSlowIncDec]>;
-
-// Decrement reg.
-// Do not make DEC if it is slow
-def : Pat<(add GR8:$src, -1),
-          (DEC8r GR8:$src)>, Requires<[NotSlowIncDec]>;
-def : Pat<(add GR16:$src, -1),
-          (DEC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
-def : Pat<(add GR16:$src, -1),
-          (DEC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
-def : Pat<(add GR32:$src, -1),
-          (DEC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
-def : Pat<(add GR32:$src, -1),
-          (DEC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
-def : Pat<(add GR64:$src, -1),
-          (DEC64r GR64:$src)>, Requires<[NotSlowIncDec]>;
+// Increment/Decrement reg.
+// Do not make INC/DEC if it is slow
+let Predicates = [NotSlowIncDec] in {
+  def : Pat<(add GR8:$src, 1),   (INC8r GR8:$src)>;
+  def : Pat<(add GR16:$src, 1),  (INC16r GR16:$src)>;
+  def : Pat<(add GR32:$src, 1),  (INC32r GR32:$src)>;
+  def : Pat<(add GR64:$src, 1),  (INC64r GR64:$src)>;
+  def : Pat<(add GR8:$src, -1),  (DEC8r GR8:$src)>;
+  def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>;
+  def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>;
+  def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
+}
 
 // or reg/reg.
 def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr  GR8 :$src1, GR8 :$src2)>;
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index 39ad395..6ab961f 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -57,33 +57,32 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
 
 // Unconditional branches.
 let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
-  def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
-                        "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>, OpSize32;
-  def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
-                        "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>, OpSize16,
-                        Requires<[In16BitMode]>;
-  let hasSideEffects = 0 in
   def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
-                       "jmp\t$dst", [], IIC_JMP_REL>;
+                       "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+  let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+    def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst),
+                          "jmp\t$dst", [], IIC_JMP_REL>, OpSize16;
+    def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst),
+                          "jmp\t$dst", [], IIC_JMP_REL>, OpSize32;
+  }
 }
 
 // Conditional Branches.
 let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
   multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
-    let hasSideEffects = 0 in
-    def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [],
-                       IIC_Jcc>;
-    def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
-                       [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, OpSize16,
-		       TB, Requires<[In16BitMode]>;
-    def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
-                       [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, TB,
-             OpSize32;
+    def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm,
+                       [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>;
+    let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+      def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm,
+                         [], IIC_Jcc>, OpSize16, TB;
+      def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm,
+                         [], IIC_Jcc>, TB, OpSize32;
+    }
   }
 }
 
 defm JO  : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
-defm JNO : ICBr<0x71, 0x81, "jno\t$dst" , X86_COND_NO>;
+defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>;
 defm JB  : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
 defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
 defm JE  : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
@@ -106,20 +105,14 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in
   // jecxz.
   let Uses = [CX] in
     def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
-                        "jcxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[Not64BitMode]>;
+                        "jcxz\t$dst", [], IIC_JCXZ>, AdSize16;
   let Uses = [ECX] in
-    def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
-                           "jecxz\t$dst", [], IIC_JCXZ>, Requires<[Not64BitMode]>;
+    def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                        "jecxz\t$dst", [], IIC_JCXZ>, AdSize32;
 
-  // J*CXZ instruction: 64-bit versions of this instruction for the asmparser.
-  // In 64-bit mode, the address size prefix is jecxz and the unprefixed version
-  // is jrcxz.
-  let Uses = [ECX] in
-    def JECXZ_64 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
-                            "jecxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In64BitMode]>;
   let Uses = [RCX] in
     def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
-                           "jrcxz\t$dst", [], IIC_JCXZ>, Requires<[In64BitMode]>;
+                           "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64;
 }
 
 // Indirect branches
@@ -145,14 +138,16 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
                      [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
                    Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;
 
-  def FARJMP16i  : Iseg16<0xEA, RawFrmImm16, (outs),
-                          (ins i16imm:$off, i16imm:$seg),
-                          "ljmp{w}\t{$seg, $off|$off, $seg}", [],
-                          IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
-  def FARJMP32i  : Iseg32<0xEA, RawFrmImm16, (outs),
-                          (ins i32imm:$off, i16imm:$seg),
-                          "ljmp{l}\t{$seg, $off|$off, $seg}", [],
-                          IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+  let Predicates = [Not64BitMode] in {
+    def FARJMP16i  : Iseg16<0xEA, RawFrmImm16, (outs),
+                            (ins i16imm:$off, i16imm:$seg),
+                            "ljmp{w}\t$seg, $off", [],
+                            IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+    def FARJMP32i  : Iseg32<0xEA, RawFrmImm16, (outs),
+                            (ins i32imm:$off, i16imm:$seg),
+                            "ljmp{l}\t$seg, $off", [],
+                            IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+  }
   def FARJMP64   : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
                       "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
                    Sched<[WriteJump]>;
@@ -186,10 +181,11 @@ let isCall = 1 in
                            (outs), (ins i32imm_pcrel:$dst),
                            "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
                       Requires<[Not64BitMode]>, Sched<[WriteJump]>;
-    def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
-                           (outs), (ins i16imm_pcrel:$dst),
-                           "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
-                      Sched<[WriteJump]>;
+    let hasSideEffects = 0 in
+      def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
+                             (outs), (ins i16imm_pcrel:$dst),
+                             "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
+                        Sched<[WriteJump]>;
     def CALL16r     : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
                         "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>,
                       OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
@@ -207,14 +203,16 @@ let isCall = 1 in
                       Requires<[Not64BitMode,FavorMemIndirectCall]>,
                       Sched<[WriteJumpLd]>;
 
-    def FARCALL16i  : Iseg16<0x9A, RawFrmImm16, (outs),
-                             (ins i16imm:$off, i16imm:$seg),
-                             "lcall{w}\t{$seg, $off|$off, $seg}", [],
-                             IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
-    def FARCALL32i  : Iseg32<0x9A, RawFrmImm16, (outs),
-                             (ins i32imm:$off, i16imm:$seg),
-                             "lcall{l}\t{$seg, $off|$off, $seg}", [],
-                             IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+    let Predicates = [Not64BitMode] in {
+      def FARCALL16i  : Iseg16<0x9A, RawFrmImm16, (outs),
+                               (ins i16imm:$off, i16imm:$seg),
+                               "lcall{w}\t$seg, $off", [],
+                               IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+      def FARCALL32i  : Iseg32<0x9A, RawFrmImm16, (outs),
+                               (ins i32imm:$off, i16imm:$seg),
+                               "lcall{l}\t$seg, $off", [],
+                               IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+    }
 
     def FARCALL16m  : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
                         "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
@@ -242,13 +240,13 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
   // mcinst.
   def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
                            (ins i32imm_pcrel:$dst),
-                           "jmp\t$dst  # TAILCALL",
+                           "jmp\t$dst",
                            [], IIC_JMP_REL>;
   def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
                    "", [], IIC_JMP_REG>;  // FIXME: Remove encoding when JIT is dead.
   let mayLoad = 1 in
   def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
-                   "jmp{l}\t{*}$dst  # TAILCALL", [], IIC_JMP_MEM>;
+                   "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>;
 }
 
 
@@ -280,17 +278,6 @@ let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
                        "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
 }
 
-let isCall = 1, isCodeGenOnly = 1 in
-  // __chkstk(MSVC):     clobber R10, R11 and EFLAGS.
-  // ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP.
-  let Defs = [RAX, R10, R11, RSP, EFLAGS],
-      Uses = [RSP] in {
-    def W64ALLOCA : Ii32PCRel<0xE8, RawFrm,
-                      (outs), (ins i64i32imm_pcrel:$dst),
-                      "call{q}\t$dst", [], IIC_CALL_RI>,
-                    Requires<[IsWin64]>, Sched<[WriteJump]>;
-  }
-
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
     isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
     SchedRW = [WriteJump] in {
@@ -303,13 +290,25 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
   def TCRETURNmi64 : PseudoI<(outs),
                        (ins i64mem_TC:$dst, i32imm:$offset), []>;
 
-  def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs),
-                                      (ins i64i32imm_pcrel:$dst),
-                   "jmp\t$dst  # TAILCALL", [], IIC_JMP_REL>;
+  def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
+                   "jmp\t$dst", [], IIC_JMP_REL>;
   def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
-                     "jmp{q}\t{*}$dst  # TAILCALL", [], IIC_JMP_MEM>;
+                     "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
 
   let mayLoad = 1 in
   def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
-                     "jmp{q}\t{*}$dst  # TAILCALL", [], IIC_JMP_MEM>;
+                     "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+
+  // Win64 wants jumps leaving the function to have a REX_W prefix.
+  let hasREX_WPrefix = 1 in {
+    def TAILJMPd64_REX : Ii32PCRel<0xE9, RawFrm, (outs),
+                                   (ins i64i32imm_pcrel:$dst),
+                                   "rex64 jmp\t$dst", [], IIC_JMP_REL>;
+    def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+                           "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+
+    let mayLoad = 1 in
+    def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
+                           "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+  }
 }
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index b38129a..c4b2d6d 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   let Defs = [AX], Uses = [AL] in
   def CBW : I<0x98, RawFrm, (outs), (ins),
               "{cbtw|cbw}", [], IIC_CBW>, OpSize16;  // AX = signext(AL)
@@ -39,7 +39,7 @@ let neverHasSideEffects = 1 in {
 
 
 // Sign/Zero extenders
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
                    "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
                    TB, OpSize16, Sched<[WriteALU]>;
@@ -47,7 +47,7 @@ let mayLoad = 1 in
 def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
                    "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
                    TB, OpSize16, Sched<[WriteALULd]>;
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
 def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
                    "movs{bl|x}\t{$src, $dst|$dst, $src}",
                    [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
@@ -65,7 +65,7 @@ def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
                    [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
                    OpSize32, TB, Sched<[WriteALULd]>;
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
                    "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
                    TB, OpSize16, Sched<[WriteALU]>;
@@ -73,7 +73,7 @@ let mayLoad = 1 in
 def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
                    "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
                    TB, OpSize16, Sched<[WriteALULd]>;
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
 def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
                    "movz{bl|x}\t{$src, $dst|$dst, $src}",
                    [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
@@ -94,7 +94,7 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
 // These are the same as the regular MOVZX32rr8 and MOVZX32rm8
 // except that they use GR32_NOREX for the output operand register class
 // instead of GR32. This allows them to operate on h registers on x86-64.
-let neverHasSideEffects = 1, isCodeGenOnly = 1 in {
+let hasSideEffects = 0, isCodeGenOnly = 1 in {
 def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
                          (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
                          "movz{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
@@ -139,11 +139,11 @@ def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
 def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
                     "movs{lq|xd}\t{$src, $dst|$dst, $src}",
                     [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>,
-                    Sched<[WriteALU]>;
+                    Sched<[WriteALU]>, Requires<[In64BitMode]>;
 def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
                     "movs{lq|xd}\t{$src, $dst|$dst, $src}",
                     [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>,
-                    Sched<[WriteALULd]>;
+                    Sched<[WriteALULd]>, Requires<[In64BitMode]>;
 
 // movzbq and movzwq encodings for the disassembler
 def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index c0a6864..2993e42 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -69,7 +69,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                        /* IsRVariantCommutable */ 1,
                        /* IsMVariantCommutable */ 1,
                        Op>;
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   defm r132 : fma3p_rm<opc132,
                        !strconcat(OpcodeStr, "132", PackTy),
                        MemFrag128, MemFrag256, OpTy128, OpTy256>;
@@ -81,7 +81,7 @@ let neverHasSideEffects = 1 in {
                        MemFrag128, MemFrag256, OpTy128, OpTy256,
                        /* IsRVariantCommutable */ 1,
                        /* IsMVariantCommutable */ 0>;
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
 }
 
 // Fused Multiply-Add
@@ -155,7 +155,7 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                        SDNode OpNode, RegisterClass RC, ValueType OpVT,
                        X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
                        ComplexPattern mem_cpat> {
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
                        x86memop, RC, OpVT, mem_frag>;
   // See the other defm of r231 for the explanation regarding the
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index d9f173e..6cd5e79 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -17,13 +17,13 @@
 // FPStack specific DAG Nodes.
 //===----------------------------------------------------------------------===//
 
-def SDTX86FpGet2    : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, 
+def SDTX86FpGet2    : SDTypeProfile<2, 0, [SDTCisVT<0, f80>,
                                            SDTCisVT<1, f80>]>;
 def SDTX86Fld       : SDTypeProfile<1, 2, [SDTCisFP<0>,
-                                           SDTCisPtrTy<1>, 
+                                           SDTCisPtrTy<1>,
                                            SDTCisVT<2, OtherVT>]>;
 def SDTX86Fst       : SDTypeProfile<0, 3, [SDTCisFP<0>,
-                                           SDTCisPtrTy<1>, 
+                                           SDTCisPtrTy<1>,
                                            SDTCisVT<2, OtherVT>]>;
 def SDTX86Fild      : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
                                            SDTCisVT<2, OtherVT>]>;
@@ -98,7 +98,7 @@ let usesCustomInserter = 1 in {  // Expanded after instruction selection.
 // All FP Stack operations are represented with four instructions here.  The
 // first three instructions, generated by the instruction selector, use "RFP32"
 // "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
-// 64-bit or 80-bit floating point values.  These sizes apply to the values, 
+// 64-bit or 80-bit floating point values.  These sizes apply to the values,
 // not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
 // copied to each other without losing information.  These instructions are all
 // pseudo instructions and use the "_Fp" suffix.
@@ -107,7 +107,7 @@ let usesCustomInserter = 1 in {  // Expanded after instruction selection.
 // The second instruction is defined with FPI, which is the actual instruction
 // emitted by the assembler.  These use "RST" registers, although frequently
 // the actual register(s) used are implicit.  These are always 80 bits.
-// The FP stackifier pass converts one to the other after register allocation 
+// The FP stackifier pass converts one to the other after register allocation
 // occurs.
 //
 // Note that the FpI instruction should have instruction selection info (e.g.
@@ -139,66 +139,66 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
 // These instructions cannot address 80-bit memory.
 multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
 // ST(0) = ST(0) + [mem]
-def _Fp32m  : FpIf32<(outs RFP32:$dst), 
+def _Fp32m  : FpIf32<(outs RFP32:$dst),
                      (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
-                  [(set RFP32:$dst, 
+                  [(set RFP32:$dst,
                     (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
-def _Fp64m  : FpIf64<(outs RFP64:$dst), 
+def _Fp64m  : FpIf64<(outs RFP64:$dst),
                      (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
-                  [(set RFP64:$dst, 
+                  [(set RFP64:$dst,
                     (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
-def _Fp64m32: FpIf64<(outs RFP64:$dst), 
+def _Fp64m32: FpIf64<(outs RFP64:$dst),
                      (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
-                  [(set RFP64:$dst, 
+                  [(set RFP64:$dst,
                     (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>;
-def _Fp80m32: FpI_<(outs RFP80:$dst), 
+def _Fp80m32: FpI_<(outs RFP80:$dst),
                    (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
-                  [(set RFP80:$dst, 
+                  [(set RFP80:$dst,
                     (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>;
-def _Fp80m64: FpI_<(outs RFP80:$dst), 
+def _Fp80m64: FpI_<(outs RFP80:$dst),
                    (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
-                  [(set RFP80:$dst, 
+                  [(set RFP80:$dst,
                     (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>;
-def _F32m  : FPI<0xD8, fp, (outs), (ins f32mem:$src), 
-                 !strconcat("f", asmstring, "{s}\t$src")> { 
-  let mayLoad = 1; 
+def _F32m  : FPI<0xD8, fp, (outs), (ins f32mem:$src),
+                 !strconcat("f", asmstring, "{s}\t$src")> {
+  let mayLoad = 1;
 }
-def _F64m  : FPI<0xDC, fp, (outs), (ins f64mem:$src), 
-                 !strconcat("f", asmstring, "{l}\t$src")> { 
-  let mayLoad = 1; 
+def _F64m  : FPI<0xDC, fp, (outs), (ins f64mem:$src),
+                 !strconcat("f", asmstring, "{l}\t$src")> {
+  let mayLoad = 1;
 }
 // ST(0) = ST(0) + [memint]
-def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), 
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2),
                        OneArgFPRW,
                     [(set RFP32:$dst, (OpNode RFP32:$src1,
                                        (X86fild addr:$src2, i16)))]>;
-def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), 
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2),
                        OneArgFPRW,
                     [(set RFP32:$dst, (OpNode RFP32:$src1,
                                        (X86fild addr:$src2, i32)))]>;
-def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), 
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2),
                        OneArgFPRW,
                     [(set RFP64:$dst, (OpNode RFP64:$src1,
                                        (X86fild addr:$src2, i16)))]>;
-def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), 
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2),
                        OneArgFPRW,
                     [(set RFP64:$dst, (OpNode RFP64:$src1,
                                        (X86fild addr:$src2, i32)))]>;
-def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), 
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2),
                        OneArgFPRW,
                     [(set RFP80:$dst, (OpNode RFP80:$src1,
                                        (X86fild addr:$src2, i16)))]>;
-def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), 
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2),
                        OneArgFPRW,
                     [(set RFP80:$dst, (OpNode RFP80:$src1,
                                        (X86fild addr:$src2, i32)))]>;
-def _FI16m  : FPI<0xDE, fp, (outs), (ins i16mem:$src), 
-                  !strconcat("fi", asmstring, "{s}\t$src")> { 
-  let mayLoad = 1; 
+def _FI16m  : FPI<0xDE, fp, (outs), (ins i16mem:$src),
+                  !strconcat("fi", asmstring, "{s}\t$src")> {
+  let mayLoad = 1;
 }
-def _FI32m  : FPI<0xDA, fp, (outs), (ins i32mem:$src), 
-                  !strconcat("fi", asmstring, "{l}\t$src")> { 
-  let mayLoad = 1; 
+def _FI32m  : FPI<0xDA, fp, (outs), (ins i32mem:$src),
+                  !strconcat("fi", asmstring, "{l}\t$src")> {
+  let mayLoad = 1;
 }
 }
 
@@ -282,7 +282,7 @@ defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">;
 defm SIN : FPUnary<fsin, MRM_FE, "fsin">;
 defm COS : FPUnary<fcos, MRM_FF, "fcos">;
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def TST_Fp32  : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
 def TST_Fp64  : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
 def TST_Fp80  : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
@@ -415,7 +415,7 @@ def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
                   [(truncstoref64 RFP80:$src, addr:$op)]>;
 // FST does not support 80-bit memory target; FSTP must be used.
 
-let mayStore = 1, neverHasSideEffects = 1 in {
+let mayStore = 1, hasSideEffects = 0 in {
 def ST_FpP32m    : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
 def ST_FpP64m32  : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
 def ST_FpP64m    : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
@@ -424,7 +424,7 @@ def ST_FpP80m64  : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
 }
 def ST_FpP80m    : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
                     [(store RFP80:$src, addr:$op)]>;
-let mayStore = 1, neverHasSideEffects = 1 in {
+let mayStore = 1, hasSideEffects = 0 in {
 def IST_Fp16m32  : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
 def IST_Fp32m32  : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
 def IST_Fp64m32  : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
@@ -500,7 +500,7 @@ def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
   IIC_FST>;
 def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
   IIC_FST>;
-def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), 
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
   "fisttp{ll}\t$dst", IIC_FST>;
 }
 
@@ -636,12 +636,12 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
 def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
                "fxsave\t$dst", [], IIC_FXSAVE>, TB;
 def FXSAVE64 : RI<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
-                  "fxsave{q|64}\t$dst", [], IIC_FXSAVE>, TB, 
+                  "fxsave64\t$dst", [], IIC_FXSAVE>, TB,
                   Requires<[In64BitMode]>;
 def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
                 "fxrstor\t$src", [], IIC_FXRSTOR>, TB;
 def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
-                  "fxrstor{q|64}\t$src", [], IIC_FXRSTOR>, TB,
+                  "fxrstor64\t$src", [], IIC_FXRSTOR>, TB,
                   Requires<[In64BitMode]>;
 } // SchedRW
 
@@ -656,12 +656,12 @@ def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
 
 // Required for CALL which return f32 / f64 / f80 values.
 def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
-def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, 
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op,
                                                           RFP64:$src)>;
 def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
-def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, 
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op,
                                                           RFP80:$src)>;
-def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, 
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op,
                                                           RFP80:$src)>;
 def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
                                                          RFP80:$src)>;
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index fe4ead1..56043fb 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -34,23 +34,27 @@ def MRM0m  : Format<24>; def MRM1m  : Format<25>; def MRM2m  : Format<26>;
 def MRM3m  : Format<27>; def MRM4m  : Format<28>; def MRM5m  : Format<29>;
 def MRM6m  : Format<30>; def MRM7m  : Format<31>;
 def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>;
-def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C8 : Format<37>;
-def MRM_C9 : Format<38>; def MRM_CA : Format<39>; def MRM_CB : Format<40>;
-def MRM_CF : Format<41>; def MRM_D0 : Format<42>; def MRM_D1 : Format<43>;
-def MRM_D4 : Format<44>; def MRM_D5 : Format<45>; def MRM_D6 : Format<46>;
-def MRM_D7 : Format<47>; def MRM_D8 : Format<48>; def MRM_D9 : Format<49>;
-def MRM_DA : Format<50>; def MRM_DB : Format<51>; def MRM_DC : Format<52>;
-def MRM_DD : Format<53>; def MRM_DE : Format<54>; def MRM_DF : Format<55>;
-def MRM_E0 : Format<56>; def MRM_E1 : Format<57>; def MRM_E2 : Format<58>;
-def MRM_E3 : Format<59>; def MRM_E4 : Format<60>; def MRM_E5 : Format<61>;
-def MRM_E8 : Format<62>; def MRM_E9 : Format<63>; def MRM_EA : Format<64>;
-def MRM_EB : Format<65>; def MRM_EC : Format<66>; def MRM_ED : Format<67>;
-def MRM_EE : Format<68>; def MRM_F0 : Format<69>; def MRM_F1 : Format<70>;
-def MRM_F2 : Format<71>; def MRM_F3 : Format<72>; def MRM_F4 : Format<73>;
-def MRM_F5 : Format<74>; def MRM_F6 : Format<75>; def MRM_F7 : Format<76>;
-def MRM_F8 : Format<77>; def MRM_F9 : Format<78>; def MRM_FA : Format<79>;
-def MRM_FB : Format<80>; def MRM_FC : Format<81>; def MRM_FD : Format<82>;
-def MRM_FE : Format<83>; def MRM_FF : Format<84>;
+def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C5 : Format<37>;
+def MRM_C6 : Format<38>; def MRM_C7 : Format<39>; def MRM_C8 : Format<40>;
+def MRM_C9 : Format<41>; def MRM_CA : Format<42>; def MRM_CB : Format<43>;
+def MRM_CC : Format<44>; def MRM_CD : Format<45>; def MRM_CE : Format<46>;
+def MRM_CF : Format<47>; def MRM_D0 : Format<48>; def MRM_D1 : Format<49>;
+def MRM_D2 : Format<50>; def MRM_D3 : Format<51>; def MRM_D4 : Format<52>;
+def MRM_D5 : Format<53>; def MRM_D6 : Format<54>; def MRM_D7 : Format<55>;
+def MRM_D8 : Format<56>; def MRM_D9 : Format<57>; def MRM_DA : Format<58>;
+def MRM_DB : Format<59>; def MRM_DC : Format<60>; def MRM_DD : Format<61>;
+def MRM_DE : Format<62>; def MRM_DF : Format<63>; def MRM_E0 : Format<64>;
+def MRM_E1 : Format<65>; def MRM_E2 : Format<66>; def MRM_E3 : Format<67>;
+def MRM_E4 : Format<68>; def MRM_E5 : Format<69>; def MRM_E6 : Format<70>;
+def MRM_E7 : Format<71>; def MRM_E8 : Format<72>; def MRM_E9 : Format<73>;
+def MRM_EA : Format<74>; def MRM_EB : Format<75>; def MRM_EC : Format<76>;
+def MRM_ED : Format<77>; def MRM_EE : Format<78>; def MRM_EF : Format<79>;
+def MRM_F0 : Format<80>; def MRM_F1 : Format<81>; def MRM_F2 : Format<82>;
+def MRM_F3 : Format<83>; def MRM_F4 : Format<84>; def MRM_F5 : Format<85>;
+def MRM_F6 : Format<86>; def MRM_F7 : Format<87>; def MRM_F8 : Format<88>;
+def MRM_F9 : Format<89>; def MRM_FA : Format<90>; def MRM_FB : Format<91>;
+def MRM_FC : Format<92>; def MRM_FD : Format<93>; def MRM_FE : Format<94>;
+def MRM_FF : Format<95>;
 
 // ImmType - This specifies the immediate type used by an instruction. This is
 // part of the ad-hoc solution used to emit machine instruction encodings by our
@@ -146,11 +150,22 @@ def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
 def OpSize16    : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
 def OpSize32    : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
 
+// Address size for encodings that change based on mode.
+class AddressSize<bits<2> val> {
+  bits<2> Value = val;
+}
+def AdSizeX  : AddressSize<0>; // Address size determined using addr operand.
+def AdSize16 : AddressSize<1>; // Encodes a 16-bit address.
+def AdSize32 : AddressSize<2>; // Encodes a 32-bit address.
+def AdSize64 : AddressSize<3>; // Encodes a 64-bit address.
+
 // Prefix byte classes which are used to indicate to the ad-hoc machine code
 // emitter that various prefix bytes are required.
 class OpSize16 { OperandSize OpSize = OpSize16; }
 class OpSize32 { OperandSize OpSize = OpSize32; }
-class AdSize { bit hasAdSizePrefix = 1; }
+class AdSize16 { AddressSize AdSize = AdSize16; }
+class AdSize32 { AddressSize AdSize = AdSize32; }
+class AdSize64 { AddressSize AdSize = AdSize64; }
 class REX_W  { bit hasREX_WPrefix = 1; }
 class LOCK   { bit hasLockPrefix = 1; }
 class REP    { bit hasREPPrefix = 1; }
@@ -231,9 +246,11 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
                             // AsmString from the parser, but still disassemble.
 
   OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change
-                                    // based on operand size of the mode
+                                    // based on operand size of the mode?
   bits<2> OpSizeBits = OpSize.Value;
-  bit hasAdSizePrefix = 0;  // Does this inst have a 0x67 prefix?
+  AddressSize AdSize = AdSizeX; // Does this instruction's encoding change
+                                // based on address size of the mode?
+  bits<2> AdSizeBits = AdSize.Value;
 
   Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have?
   bits<3> OpPrefixBits = OpPrefix.Value;
@@ -284,35 +301,35 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
                                      CD8_EltSize,
                                      !srl(VectSize, CD8_Form{1-0}))), 0);
 
-  // TSFlags layout should be kept in sync with X86InstrInfo.h.
+  // TSFlags layout should be kept in sync with X86BaseInfo.h.
   let TSFlags{6-0}   = FormBits;
   let TSFlags{8-7}   = OpSizeBits;
-  let TSFlags{9}     = hasAdSizePrefix;
-  let TSFlags{12-10} = OpPrefixBits;
-  let TSFlags{15-13} = OpMapBits;
-  let TSFlags{16}    = hasREX_WPrefix;
-  let TSFlags{20-17} = ImmT.Value;
-  let TSFlags{23-21} = FPForm.Value;
-  let TSFlags{24}    = hasLockPrefix;
-  let TSFlags{25}    = hasREPPrefix;
-  let TSFlags{27-26} = ExeDomain.Value;
-  let TSFlags{29-28} = OpEncBits;
-  let TSFlags{37-30} = Opcode;
-  let TSFlags{38}    = hasVEX_WPrefix;
-  let TSFlags{39}    = hasVEX_4V;
-  let TSFlags{40}    = hasVEX_4VOp3;
-  let TSFlags{41}    = hasVEX_i8ImmReg;
-  let TSFlags{42}    = hasVEX_L;
-  let TSFlags{43}    = ignoresVEX_L;
-  let TSFlags{44}    = hasEVEX_K;
-  let TSFlags{45}    = hasEVEX_Z;
-  let TSFlags{46}    = hasEVEX_L2;
-  let TSFlags{47}    = hasEVEX_B;
+  let TSFlags{10-9}  = AdSizeBits;
+  let TSFlags{13-11} = OpPrefixBits;
+  let TSFlags{16-14} = OpMapBits;
+  let TSFlags{17}    = hasREX_WPrefix;
+  let TSFlags{21-18} = ImmT.Value;
+  let TSFlags{24-22} = FPForm.Value;
+  let TSFlags{25}    = hasLockPrefix;
+  let TSFlags{26}    = hasREPPrefix;
+  let TSFlags{28-27} = ExeDomain.Value;
+  let TSFlags{30-29} = OpEncBits;
+  let TSFlags{38-31} = Opcode;
+  let TSFlags{39}    = hasVEX_WPrefix;
+  let TSFlags{40}    = hasVEX_4V;
+  let TSFlags{41}    = hasVEX_4VOp3;
+  let TSFlags{42}    = hasVEX_i8ImmReg;
+  let TSFlags{43}    = hasVEX_L;
+  let TSFlags{44}    = ignoresVEX_L;
+  let TSFlags{45}    = hasEVEX_K;
+  let TSFlags{46}    = hasEVEX_Z;
+  let TSFlags{47}    = hasEVEX_L2;
+  let TSFlags{48}    = hasEVEX_B;
   // If we run out of TSFlags bits, it's possible to encode this in 3 bits.
-  let TSFlags{54-48} = CD8_Scale;
-  let TSFlags{55}    = has3DNow0F0FOpcode;
-  let TSFlags{56}    = hasMemOp4Prefix;
-  let TSFlags{57}    = hasEVEX_RC;
+  let TSFlags{55-49} = CD8_Scale;
+  let TSFlags{56}    = has3DNow0F0FOpcode;
+  let TSFlags{57}    = hasMemOp4Prefix;
+  let TSFlags{58}    = hasEVEX_RC;
 }
 
 class PseudoI<dag oops, dag iops, list<dag> pattern>
@@ -327,26 +344,26 @@ class I<bits<8> o, Format f, dag outs, dag ins, string asm,
   let Pattern = pattern;
   let CodeSize = 3;
 }
-class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary,
            Domain d = GenericDomain>
   : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> {
   let Pattern = pattern;
   let CodeSize = 3;
 }
-class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, 
+class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
                list<dag> pattern, InstrItinClass itin = NoItinerary>
   : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
   let Pattern = pattern;
   let CodeSize = 3;
 }
-class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, 
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
   : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
   let Pattern = pattern;
   let CodeSize = 3;
 }
-class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, 
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
   : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
   let Pattern = pattern;
@@ -359,14 +376,14 @@ class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm,
   let CodeSize = 3;
 }
 
-class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, 
+class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
            : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> {
   let Pattern = pattern;
   let CodeSize = 3;
 }
 
-class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, 
+class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
   : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> {
   let Pattern = pattern;
@@ -393,14 +410,14 @@ class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
 //   Iseg16 - 16-bit segment selector, 16-bit offset
 //   Iseg32 - 16-bit segment selector, 32-bit offset
 
-class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
               list<dag> pattern, InstrItinClass itin = NoItinerary>
       : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
   let Pattern = pattern;
   let CodeSize = 3;
 }
 
-class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
               list<dag> pattern, InstrItinClass itin = NoItinerary>
       : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
   let Pattern = pattern;
@@ -409,8 +426,9 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
 
 // SI - SSE 1 & 2 scalar instructions
 class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
-         list<dag> pattern, InstrItinClass itin = NoItinerary>
-      : I<o, F, outs, ins, asm, pattern, itin> {
+         list<dag> pattern, InstrItinClass itin = NoItinerary,
+         Domain d = GenericDomain>
+      : I<o, F, outs, ins, asm, pattern, itin, d> {
   let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
                    !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
                    !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
@@ -478,7 +496,7 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
 }
 
 // SSE1 Instruction Templates:
-// 
+//
 //   SSI   - SSE1 instructions with XS prefix.
 //   PSI   - SSE1 instructions with PS prefix.
 //   PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix.
@@ -509,7 +527,7 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
         Requires<[HasAVX]>;
 
 // SSE2 Instruction Templates:
-// 
+//
 //   SDI    - SSE2 instructions with XD prefix.
 //   SDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix.
 //   S2SI   - SSE2 instructions with XS prefix.
@@ -573,16 +591,16 @@ class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
       : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
 
 // SSE3 Instruction Templates:
-// 
+//
 //   S3I   - SSE3 instructions with PD prefixes.
 //   S3SI  - SSE3 instructions with XS prefix.
 //   S3DI  - SSE3 instructions with XD prefix.
 
-class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
         Requires<[UseSSE3]>;
-class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
         Requires<[UseSSE3]>;
@@ -593,7 +611,7 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
 
 
 // SSSE3 Instruction Templates:
-// 
+//
 //   SS38I - SSSE3 instructions with T8 prefix.
 //   SS3AI - SSSE3 instructions with TA prefix.
 //   MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
@@ -621,7 +639,7 @@ class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
         Requires<[HasSSSE3]>;
 
 // SSE4.1 Instruction Templates:
-// 
+//
 //   SS48I - SSE 4.1 instructions with T8 prefix.
 //   SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
 //
@@ -635,7 +653,7 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
         Requires<[UseSSE41]>;
 
 // SSE4.2 Instruction Templates:
-// 
+//
 //   SS428I - SSE 4.2 instructions with T8 prefix.
 class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = NoItinerary>
@@ -699,6 +717,9 @@ class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
         Requires<[HasAVX512]>;
+class AVX5128IBase : T8PD {
+  Domain ExeDomain = SSEPackedInt;
+}
 class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS,
@@ -868,27 +889,27 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
 // MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix.
 // MMXID  - MMX instructions with XD prefix.
 // MMXIS  - MMX instructions with XS prefix.
-class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
-class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, 
+class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>;
-class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, 
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>;
-class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>;
-class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, 
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>;
-class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, 
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
              list<dag> pattern, InstrItinClass itin = NoItinerary>
       : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
-class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, 
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = NoItinerary>
       : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
-class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, 
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = NoItinerary>
       : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 1c7215c..bf515a8 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -12,10 +12,23 @@
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
+// MMX specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// Low word of MMX to GPR.
+def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>;
+// GPR to low word of MMX.
+def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, x86mmx>, SDTCisVT<1, i32>]>>;
+
+//===----------------------------------------------------------------------===//
 // MMX Pattern Fragments
 //===----------------------------------------------------------------------===//
 
 def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
+def load_mvmmx : PatFrag<(ops node:$ptr),
+                         (x86mmx (MMX_X86movw2d (load node:$ptr)))>;
 def bc_mmx  : PatFrag<(ops node:$in), (x86mmx  (bitconvert node:$in))>;
 
 //===----------------------------------------------------------------------===//
@@ -201,10 +214,19 @@ def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>;
 def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                              SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;
 
+def SDTFPBinOpRound : SDTypeProfile<1, 3, [      // fadd_round, fmul_round, etc.
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>;
+
 def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
                            SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
+                           SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>;
 def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>,
                            SDTCisVec<0>, SDTCisInt<2>]>;
+def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
+                           SDTCisVec<0>, SDTCisInt<3>]>;
+def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
+                           SDTCisVec<0>, SDTCisInt<3>, SDTCisInt<4>]>;
 
 def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
 def X86VAlign  : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
@@ -256,6 +278,11 @@ def X86Blendi    : SDNode<"X86ISD::BLENDI",   SDTBlend>;
 
 def X86Addsub    : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
 
+def X86faddRnd   : SDNode<"X86ISD::FADD_RND",  SDTFPBinOpRound>;
+def X86fsubRnd   : SDNode<"X86ISD::FSUB_RND",  SDTFPBinOpRound>;
+def X86fmulRnd   : SDNode<"X86ISD::FMUL_RND",  SDTFPBinOpRound>;
+def X86fdivRnd   : SDNode<"X86ISD::FDIV_RND",  SDTFPBinOpRound>;
+
 def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFma>;
 def X86Fnmadd    : SDNode<"X86ISD::FNMADD",    SDTFma>;
 def X86Fmsub     : SDNode<"X86ISD::FMSUB",     SDTFma>;
@@ -263,9 +290,22 @@ def X86Fnmsub    : SDNode<"X86ISD::FNMSUB",    SDTFma>;
 def X86Fmaddsub  : SDNode<"X86ISD::FMADDSUB",  SDTFma>;
 def X86Fmsubadd  : SDNode<"X86ISD::FMSUBADD",  SDTFma>;
 
+def X86FmaddRnd     : SDNode<"X86ISD::FMADD_RND",     SDTFmaRound>;
+def X86FnmaddRnd    : SDNode<"X86ISD::FNMADD_RND",    SDTFmaRound>;
+def X86FmsubRnd     : SDNode<"X86ISD::FMSUB_RND",     SDTFmaRound>;
+def X86FnmsubRnd    : SDNode<"X86ISD::FNMSUB_RND",    SDTFmaRound>;
+def X86FmaddsubRnd  : SDNode<"X86ISD::FMADDSUB_RND",  SDTFmaRound>;
+def X86FmsubaddRnd  : SDNode<"X86ISD::FMSUBADD_RND",  SDTFmaRound>;
+
 def X86rsqrt28   : SDNode<"X86ISD::RSQRT28",  STDFp1SrcRm>;
 def X86rcp28     : SDNode<"X86ISD::RCP28",    STDFp1SrcRm>;
-def X86exp2      : SDNode<"X86ISD::EXP2",  STDFp1SrcRm>;
+def X86exp2      : SDNode<"X86ISD::EXP2",     STDFp1SrcRm>;
+
+def X86rsqrt28s  : SDNode<"X86ISD::RSQRT28",  STDFp2SrcRm>;
+def X86rcp28s    : SDNode<"X86ISD::RCP28",    STDFp2SrcRm>;
+def X86RndScale  : SDNode<"X86ISD::RNDSCALE", STDFp3SrcRm>;
+def X86mgather   : SDNode<"X86ISD::GATHER", SDTypeProfile<1, 3, 
+                          [SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>]>>;
 
 def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                          SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
@@ -278,6 +318,13 @@ def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
 def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
 def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
 
+def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3,
+                              [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,
+                               SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>;
+def X86expand  : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3,
+                              [SDTCisSameAs<0, 3>,
+                               SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>;
+
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
 //===----------------------------------------------------------------------===//
@@ -334,6 +381,15 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
 def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
 def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
 
+// These are needed to match a scalar load that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a vector to a scalar.
+def loadf32_128 : PatFrag<(ops node:$ptr),
+  (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>;
+def loadf64_128 : PatFrag<(ops node:$ptr),
+  (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>;
+
 // Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{
@@ -412,20 +468,10 @@ def alignedloadv8i64  : PatFrag<(ops node:$ptr),
 // setting a feature bit in the processor (on startup, for example).
 // Opteron 10h and later implement such a feature.
 def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return    Subtarget->hasVectorUAMem()
+  return    Subtarget->hasSSEUnalignedMem()
          || cast<LoadSDNode>(N)->getAlignment() >= 16;
 }]>;
 
-def memop4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return    Subtarget->hasVectorUAMem()
-         || cast<LoadSDNode>(N)->getAlignment() >= 4;
-}]>;
-
-def memop8 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return    Subtarget->hasVectorUAMem()
-         || cast<LoadSDNode>(N)->getAlignment() >= 8;
-}]>;
-
 def memopfsf32 : PatFrag<(ops node:$ptr), (f32   (memop node:$ptr))>;
 def memopfsf64 : PatFrag<(ops node:$ptr), (f64   (memop node:$ptr))>;
 
@@ -435,17 +481,15 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
 def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
 def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
 
-// 256-bit memop pattern fragments
-// NOTE: all 256-bit integer vector loads are promoted to v4i64
-def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
-def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
-def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
+// These are needed to match a scalar memop that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a vector to a scalar.
+def memopfsf32_128 : PatFrag<(ops node:$ptr),
+  (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>;
+def memopfsf64_128 : PatFrag<(ops node:$ptr),
+  (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>;
 
-// 512-bit memop pattern fragments
-def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>;
-def memopv8f64  : PatFrag<(ops node:$ptr), (v8f64  (memop8 node:$ptr))>;
-def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>;
-def memopv8i64  : PatFrag<(ops node:$ptr), (v8i64  (memop8 node:$ptr))>;
 
 // SSSE3 uses MMX registers for some instructions. They aren't aligned on a
 // 16-byte boundary.
@@ -482,6 +526,58 @@ def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
   return false;
 }]>;
 
+def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+  //  return (Mgt->getIndex().getValueType() == MVT::v8i32 ||
+  //          Mgt->getBasePtr().getValueType() == MVT::v8i32);
+  //return false;
+  return N != 0;
+}]>;
+
+def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+  //  return (Mgt->getIndex().getValueType() == MVT::v8i64 ||
+  //          Mgt->getBasePtr().getValueType() == MVT::v8i64);
+  //return false;
+  return N != 0;
+}]>;
+def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+  //  return (Mgt->getIndex().getValueType() == MVT::v16i32 ||
+  //          Mgt->getBasePtr().getValueType() == MVT::v16i32);
+  //return false;
+  return N != 0;
+}]>;
+
+def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+  //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+  //  return (Sc->getIndex().getValueType() == MVT::v8i32 ||
+  //          Sc->getBasePtr().getValueType() == MVT::v8i32);
+  //return false;
+  return N != 0;
+}]>;
+
+def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+  //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+  //  return (Sc->getIndex().getValueType() == MVT::v8i64 ||
+  //          Sc->getBasePtr().getValueType() == MVT::v8i64);
+  //return false;
+  return N != 0;
+}]>;
+def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+  //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+  //  return (Sc->getIndex().getValueType() == MVT::v16i32 ||
+  //          Sc->getBasePtr().getValueType() == MVT::v16i32);
+  //return false;
+  return N != 0;
+}]>;
+
 // 128-bit bitconvert pattern fragments
 def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
 def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 7f87bdd..f5b9680 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -65,6 +65,7 @@ enum {
   TB_INDEX_1    = 1,
   TB_INDEX_2    = 2,
   TB_INDEX_3    = 3,
+  TB_INDEX_4    = 4,
   TB_INDEX_MASK = 0xf,
 
   // Do not insert the reverse map (MemOp -> RegOp) into the table.
@@ -90,7 +91,7 @@ enum {
   TB_ALIGN_MASK  = 0xff << TB_ALIGN_SHIFT
 };
 
-struct X86OpTblEntry {
+struct X86MemoryFoldTableEntry {
   uint16_t RegOp;
   uint16_t MemOp;
   uint16_t Flags;
@@ -105,7 +106,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
           (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
       Subtarget(STI), RI(STI) {
 
-  static const X86OpTblEntry OpTbl2Addr[] = {
+  static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
     { X86::ADC32ri,     X86::ADC32mi,    0 },
     { X86::ADC32ri8,    X86::ADC32mi8,   0 },
     { X86::ADC32rr,     X86::ADC32mr,    0 },
@@ -145,14 +146,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::AND8rr,      X86::AND8mr,     0 },
     { X86::DEC16r,      X86::DEC16m,     0 },
     { X86::DEC32r,      X86::DEC32m,     0 },
-    { X86::DEC64_16r,   X86::DEC64_16m,  0 },
-    { X86::DEC64_32r,   X86::DEC64_32m,  0 },
     { X86::DEC64r,      X86::DEC64m,     0 },
     { X86::DEC8r,       X86::DEC8m,      0 },
     { X86::INC16r,      X86::INC16m,     0 },
     { X86::INC32r,      X86::INC32m,     0 },
-    { X86::INC64_16r,   X86::INC64_16m,  0 },
-    { X86::INC64_32r,   X86::INC64_32m,  0 },
     { X86::INC64r,      X86::INC64m,     0 },
     { X86::INC8r,       X86::INC8m,      0 },
     { X86::NEG16r,      X86::NEG16m,     0 },
@@ -272,17 +269,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::XOR8rr,      X86::XOR8mr,     0 }
   };
 
-  for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
-    unsigned RegOp = OpTbl2Addr[i].RegOp;
-    unsigned MemOp = OpTbl2Addr[i].MemOp;
-    unsigned Flags = OpTbl2Addr[i].Flags;
+  for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2Addr); i != e; ++i) {
+    unsigned RegOp = MemoryFoldTable2Addr[i].RegOp;
+    unsigned MemOp = MemoryFoldTable2Addr[i].MemOp;
+    unsigned Flags = MemoryFoldTable2Addr[i].Flags;
     AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
                   RegOp, MemOp,
                   // Index 0, folded load and store, no alignment requirement.
                   Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
   }
 
-  static const X86OpTblEntry OpTbl0[] = {
+  static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
     { X86::BT16ri8,     X86::BT16mi8,       TB_FOLDED_LOAD },
     { X86::BT32ri8,     X86::BT32mi8,       TB_FOLDED_LOAD },
     { X86::BT64ri8,     X86::BT64mi8,       TB_FOLDED_LOAD },
@@ -336,6 +333,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::MUL32r,      X86::MUL32m,        TB_FOLDED_LOAD },
     { X86::MUL64r,      X86::MUL64m,        TB_FOLDED_LOAD },
     { X86::MUL8r,       X86::MUL8m,         TB_FOLDED_LOAD },
+    { X86::PEXTRDrr,    X86::PEXTRDmr,      TB_FOLDED_STORE },
+    { X86::PEXTRQrr,    X86::PEXTRQmr,      TB_FOLDED_STORE },
     { X86::SETAEr,      X86::SETAEm,        TB_FOLDED_STORE },
     { X86::SETAr,       X86::SETAm,         TB_FOLDED_STORE },
     { X86::SETBEr,      X86::SETBEm,        TB_FOLDED_STORE },
@@ -354,10 +353,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::SETSr,       X86::SETSm,         TB_FOLDED_STORE },
     { X86::TAILJMPr,    X86::TAILJMPm,      TB_FOLDED_LOAD },
     { X86::TAILJMPr64,  X86::TAILJMPm64,    TB_FOLDED_LOAD },
+    { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
     { X86::TEST16ri,    X86::TEST16mi,      TB_FOLDED_LOAD },
     { X86::TEST32ri,    X86::TEST32mi,      TB_FOLDED_LOAD },
     { X86::TEST64ri32,  X86::TEST64mi32,    TB_FOLDED_LOAD },
     { X86::TEST8ri,     X86::TEST8mi,       TB_FOLDED_LOAD },
+
     // AVX 128-bit versions of foldable instructions
     { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr,  TB_FOLDED_STORE  },
     { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -370,6 +371,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVSS2DIrr, X86::VMOVSS2DImr,   TB_FOLDED_STORE },
     { X86::VMOVUPDrr,   X86::VMOVUPDmr,     TB_FOLDED_STORE },
     { X86::VMOVUPSrr,   X86::VMOVUPSmr,     TB_FOLDED_STORE },
+    { X86::VPEXTRDrr,   X86::VPEXTRDmr,     TB_FOLDED_STORE },
+    { X86::VPEXTRQrr,   X86::VPEXTRQmr,     TB_FOLDED_STORE },
+
     // AVX 256-bit foldable instructions
     { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVAPDYrr,  X86::VMOVAPDYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -377,6 +381,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVDQAYrr,  X86::VMOVDQAYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVUPDYrr,  X86::VMOVUPDYmr,    TB_FOLDED_STORE },
     { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE },
+
     // AVX-512 foldable instructions
     { X86::VMOVPDI2DIZrr,   X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
     { X86::VMOVAPDZrr,      X86::VMOVAPDZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
@@ -389,6 +394,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVDQU16Zrr,    X86::VMOVDQU16Zmr,  TB_FOLDED_STORE },
     { X86::VMOVDQU32Zrr,    X86::VMOVDQU32Zmr,  TB_FOLDED_STORE },
     { X86::VMOVDQU64Zrr,    X86::VMOVDQU64Zmr,  TB_FOLDED_STORE },
+
     // AVX-512 foldable instructions (256-bit versions)
     { X86::VMOVAPDZ256rr,      X86::VMOVAPDZ256mr,    TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVAPSZ256rr,      X86::VMOVAPSZ256mr,    TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -400,6 +406,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVDQU16Z256rr,    X86::VMOVDQU16Z256mr,  TB_FOLDED_STORE },
     { X86::VMOVDQU32Z256rr,    X86::VMOVDQU32Z256mr,  TB_FOLDED_STORE },
     { X86::VMOVDQU64Z256rr,    X86::VMOVDQU64Z256mr,  TB_FOLDED_STORE },
+
     // AVX-512 foldable instructions (128-bit versions)
     { X86::VMOVAPDZ128rr,      X86::VMOVAPDZ128mr,    TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVAPSZ128rr,      X86::VMOVAPSZ128mr,    TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -410,18 +417,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVDQU8Z128rr,     X86::VMOVDQU8Z128mr,   TB_FOLDED_STORE },
     { X86::VMOVDQU16Z128rr,    X86::VMOVDQU16Z128mr,  TB_FOLDED_STORE },
     { X86::VMOVDQU32Z128rr,    X86::VMOVDQU32Z128mr,  TB_FOLDED_STORE },
-    { X86::VMOVDQU64Z128rr,    X86::VMOVDQU64Z128mr,  TB_FOLDED_STORE }
+    { X86::VMOVDQU64Z128rr,    X86::VMOVDQU64Z128mr,  TB_FOLDED_STORE },
+
+    // F16C foldable instructions
+    { X86::VCVTPS2PHrr,        X86::VCVTPS2PHmr,      TB_FOLDED_STORE },
+    { X86::VCVTPS2PHYrr,       X86::VCVTPS2PHYmr,     TB_FOLDED_STORE }
   };
 
-  for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
-    unsigned RegOp      = OpTbl0[i].RegOp;
-    unsigned MemOp      = OpTbl0[i].MemOp;
-    unsigned Flags      = OpTbl0[i].Flags;
+  for (unsigned i = 0, e = array_lengthof(MemoryFoldTable0); i != e; ++i) {
+    unsigned RegOp      = MemoryFoldTable0[i].RegOp;
+    unsigned MemOp      = MemoryFoldTable0[i].MemOp;
+    unsigned Flags      = MemoryFoldTable0[i].Flags;
     AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
                   RegOp, MemOp, TB_INDEX_0 | Flags);
   }
 
-  static const X86OpTblEntry OpTbl1[] = {
+  static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
     { X86::CMP16rr,         X86::CMP16rm,             0 },
     { X86::CMP32rr,         X86::CMP32rm,             0 },
     { X86::CMP64rr,         X86::CMP64rm,             0 },
@@ -448,9 +459,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::CVTSD2SIrr,      X86::CVTSD2SIrm,          0 },
     { X86::CVTSS2SI64rr,    X86::CVTSS2SI64rm,        0 },
     { X86::CVTSS2SIrr,      X86::CVTSS2SIrm,          0 },
+    { X86::CVTDQ2PDrr,      X86::CVTDQ2PDrm,          TB_ALIGN_16 },
     { X86::CVTDQ2PSrr,      X86::CVTDQ2PSrm,          TB_ALIGN_16 },
     { X86::CVTPD2DQrr,      X86::CVTPD2DQrm,          TB_ALIGN_16 },
+    { X86::CVTPD2PSrr,      X86::CVTPD2PSrm,          TB_ALIGN_16 },
     { X86::CVTPS2DQrr,      X86::CVTPS2DQrm,          TB_ALIGN_16 },
+    { X86::CVTPS2PDrr,      X86::CVTPS2PDrm,          TB_ALIGN_16 },
     { X86::CVTTPD2DQrr,     X86::CVTTPD2DQrm,         TB_ALIGN_16 },
     { X86::CVTTPS2DQrr,     X86::CVTTPS2DQrm,         TB_ALIGN_16 },
     { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm,  0 },
@@ -490,11 +504,31 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::PABSBrr128,      X86::PABSBrm128,          TB_ALIGN_16 },
     { X86::PABSDrr128,      X86::PABSDrm128,          TB_ALIGN_16 },
     { X86::PABSWrr128,      X86::PABSWrm128,          TB_ALIGN_16 },
+    { X86::PCMPESTRIrr,     X86::PCMPESTRIrm,         TB_ALIGN_16 },
+    { X86::PCMPESTRM128rr,  X86::PCMPESTRM128rm,      TB_ALIGN_16 },
+    { X86::PCMPISTRIrr,     X86::PCMPISTRIrm,         TB_ALIGN_16 },
+    { X86::PCMPISTRM128rr,  X86::PCMPISTRM128rm,      TB_ALIGN_16 },
+    { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128,     TB_ALIGN_16 },
+    { X86::PMOVSXBDrr,      X86::PMOVSXBDrm,          TB_ALIGN_16 },
+    { X86::PMOVSXBQrr,      X86::PMOVSXBQrm,          TB_ALIGN_16 },
+    { X86::PMOVSXBWrr,      X86::PMOVSXBWrm,          TB_ALIGN_16 },
+    { X86::PMOVSXDQrr,      X86::PMOVSXDQrm,          TB_ALIGN_16 },
+    { X86::PMOVSXWDrr,      X86::PMOVSXWDrm,          TB_ALIGN_16 },
+    { X86::PMOVSXWQrr,      X86::PMOVSXWQrm,          TB_ALIGN_16 },
+    { X86::PMOVZXBDrr,      X86::PMOVZXBDrm,          TB_ALIGN_16 },
+    { X86::PMOVZXBQrr,      X86::PMOVZXBQrm,          TB_ALIGN_16 },
+    { X86::PMOVZXBWrr,      X86::PMOVZXBWrm,          TB_ALIGN_16 },
+    { X86::PMOVZXDQrr,      X86::PMOVZXDQrm,          TB_ALIGN_16 },
+    { X86::PMOVZXWDrr,      X86::PMOVZXWDrm,          TB_ALIGN_16 },
+    { X86::PMOVZXWQrr,      X86::PMOVZXWQrm,          TB_ALIGN_16 },
     { X86::PSHUFDri,        X86::PSHUFDmi,            TB_ALIGN_16 },
     { X86::PSHUFHWri,       X86::PSHUFHWmi,           TB_ALIGN_16 },
     { X86::PSHUFLWri,       X86::PSHUFLWmi,           TB_ALIGN_16 },
+    { X86::PTESTrr,         X86::PTESTrm,             TB_ALIGN_16 },
     { X86::RCPPSr,          X86::RCPPSm,              TB_ALIGN_16 },
     { X86::RCPPSr_Int,      X86::RCPPSm_Int,          TB_ALIGN_16 },
+    { X86::ROUNDPDr,        X86::ROUNDPDm,            TB_ALIGN_16 },
+    { X86::ROUNDPSr,        X86::ROUNDPSm,            TB_ALIGN_16 },
     { X86::RSQRTPSr,        X86::RSQRTPSm,            TB_ALIGN_16 },
     { X86::RSQRTPSr_Int,    X86::RSQRTPSm_Int,        TB_ALIGN_16 },
     { X86::RSQRTSSr,        X86::RSQRTSSm,            0 },
@@ -512,6 +546,19 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
     { X86::UCOMISDrr,       X86::UCOMISDrm,           0 },
     { X86::UCOMISSrr,       X86::UCOMISSrm,           0 },
+
+    // MMX version of foldable instructions
+    { X86::MMX_CVTPD2PIirr,   X86::MMX_CVTPD2PIirm,   0 },
+    { X86::MMX_CVTPI2PDirr,   X86::MMX_CVTPI2PDirm,   0 },
+    { X86::MMX_CVTPS2PIirr,   X86::MMX_CVTPS2PIirm,   0 },
+    { X86::MMX_CVTTPD2PIirr,  X86::MMX_CVTTPD2PIirm,  0 },
+    { X86::MMX_CVTTPS2PIirr,  X86::MMX_CVTTPS2PIirm,  0 },
+    { X86::MMX_MOVD64to64rr,  X86::MMX_MOVQ64rm,      0 },
+    { X86::MMX_PABSBrr64,     X86::MMX_PABSBrm64,     0 },
+    { X86::MMX_PABSDrr64,     X86::MMX_PABSDrm64,     0 },
+    { X86::MMX_PABSWrr64,     X86::MMX_PABSWrm64,     0 },
+    { X86::MMX_PSHUFWri,      X86::MMX_PSHUFWmi,      0 },
+
     // AVX 128-bit versions of foldable instructions
     { X86::Int_VCOMISDrr,   X86::Int_VCOMISDrm,       0 },
     { X86::Int_VCOMISSrr,   X86::Int_VCOMISSrm,       0 },
@@ -529,9 +576,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VCVTSD2SIrr,     X86::VCVTSD2SIrm,         0 },
     { X86::VCVTSS2SI64rr,   X86::VCVTSS2SI64rm,       0 },
     { X86::VCVTSS2SIrr,     X86::VCVTSS2SIrm,         0 },
+    { X86::VCVTDQ2PDrr,     X86::VCVTDQ2PDrm,         0 },
     { X86::VCVTDQ2PSrr,     X86::VCVTDQ2PSrm,         0 },
     { X86::VCVTPD2DQrr,     X86::VCVTPD2DQXrm,        0 },
+    { X86::VCVTPD2PSrr,     X86::VCVTPD2PSXrm,        0 },
     { X86::VCVTPS2DQrr,     X86::VCVTPS2DQrm,         0 },
+    { X86::VCVTPS2PDrr,     X86::VCVTPS2PDrm,         0 },
     { X86::VCVTTPD2DQrr,    X86::VCVTTPD2DQXrm,       0 },
     { X86::VCVTTPS2DQrr,    X86::VCVTTPS2DQrm,        0 },
     { X86::VMOV64toPQIrr,   X86::VMOVQI2PQIrm,        0 },
@@ -542,8 +592,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVDI2PDIrr,    X86::VMOVDI2PDIrm,        0 },
     { X86::VMOVDI2SSrr,     X86::VMOVDI2SSrm,         0 },
     { X86::VMOVDQArr,       X86::VMOVDQArm,           TB_ALIGN_16 },
-    { X86::VMOVSLDUPrr,     X86::VMOVSLDUPrm,         TB_ALIGN_16 },
-    { X86::VMOVSHDUPrr,     X86::VMOVSHDUPrm,         TB_ALIGN_16 },
+    { X86::VMOVSLDUPrr,     X86::VMOVSLDUPrm,         0 },
+    { X86::VMOVSHDUPrr,     X86::VMOVSHDUPrm,         0 },
     { X86::VMOVUPDrr,       X86::VMOVUPDrm,           0 },
     { X86::VMOVUPSrr,       X86::VMOVUPSrm,           0 },
     { X86::VMOVZQI2PQIrr,   X86::VMOVZQI2PQIrm,       0 },
@@ -551,50 +601,151 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPABSBrr128,     X86::VPABSBrm128,         0 },
     { X86::VPABSDrr128,     X86::VPABSDrm128,         0 },
     { X86::VPABSWrr128,     X86::VPABSWrm128,         0 },
+    { X86::VPCMPESTRIrr,    X86::VPCMPESTRIrm,        0 },
+    { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm,     0 },
+    { X86::VPCMPISTRIrr,    X86::VPCMPISTRIrm,        0 },
+    { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm,     0 },
+    { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128,   0 },
     { X86::VPERMILPDri,     X86::VPERMILPDmi,         0 },
     { X86::VPERMILPSri,     X86::VPERMILPSmi,         0 },
+    { X86::VPMOVSXBDrr,     X86::VPMOVSXBDrm,         0 },
+    { X86::VPMOVSXBQrr,     X86::VPMOVSXBQrm,         0 },
+    { X86::VPMOVSXBWrr,     X86::VPMOVSXBWrm,         0 },
+    { X86::VPMOVSXDQrr,     X86::VPMOVSXDQrm,         0 },
+    { X86::VPMOVSXWDrr,     X86::VPMOVSXWDrm,         0 },
+    { X86::VPMOVSXWQrr,     X86::VPMOVSXWQrm,         0 },
+    { X86::VPMOVZXBDrr,     X86::VPMOVZXBDrm,         0 },
+    { X86::VPMOVZXBQrr,     X86::VPMOVZXBQrm,         0 },
+    { X86::VPMOVZXBWrr,     X86::VPMOVZXBWrm,         0 },
+    { X86::VPMOVZXDQrr,     X86::VPMOVZXDQrm,         0 },
+    { X86::VPMOVZXWDrr,     X86::VPMOVZXWDrm,         0 },
+    { X86::VPMOVZXWQrr,     X86::VPMOVZXWQrm,         0 },
     { X86::VPSHUFDri,       X86::VPSHUFDmi,           0 },
     { X86::VPSHUFHWri,      X86::VPSHUFHWmi,          0 },
     { X86::VPSHUFLWri,      X86::VPSHUFLWmi,          0 },
+    { X86::VPTESTrr,        X86::VPTESTrm,            0 },
     { X86::VRCPPSr,         X86::VRCPPSm,             0 },
     { X86::VRCPPSr_Int,     X86::VRCPPSm_Int,         0 },
+    { X86::VROUNDPDr,       X86::VROUNDPDm,           0 },
+    { X86::VROUNDPSr,       X86::VROUNDPSm,           0 },
     { X86::VRSQRTPSr,       X86::VRSQRTPSm,           0 },
     { X86::VRSQRTPSr_Int,   X86::VRSQRTPSm_Int,       0 },
     { X86::VSQRTPDr,        X86::VSQRTPDm,            0 },
     { X86::VSQRTPSr,        X86::VSQRTPSm,            0 },
+    { X86::VTESTPDrr,       X86::VTESTPDrm,           0 },
+    { X86::VTESTPSrr,       X86::VTESTPSrm,           0 },
     { X86::VUCOMISDrr,      X86::VUCOMISDrm,          0 },
     { X86::VUCOMISSrr,      X86::VUCOMISSrm,          0 },
-    { X86::VBROADCASTSSrr,  X86::VBROADCASTSSrm,      TB_NO_REVERSE },
 
     // AVX 256-bit foldable instructions
+    { X86::VCVTDQ2PDYrr,    X86::VCVTDQ2PDYrm,        0 },
     { X86::VCVTDQ2PSYrr,    X86::VCVTDQ2PSYrm,        0 },
     { X86::VCVTPD2DQYrr,    X86::VCVTPD2DQYrm,        0 },
+    { X86::VCVTPD2PSYrr,    X86::VCVTPD2PSYrm,        0 },
     { X86::VCVTPS2DQYrr,    X86::VCVTPS2DQYrm,        0 },
+    { X86::VCVTPS2PDYrr,    X86::VCVTPS2PDYrm,        0 },
     { X86::VCVTTPD2DQYrr,   X86::VCVTTPD2DQYrm,       0 },
     { X86::VCVTTPS2DQYrr,   X86::VCVTTPS2DQYrm,       0 },
     { X86::VMOVAPDYrr,      X86::VMOVAPDYrm,          TB_ALIGN_32 },
     { X86::VMOVAPSYrr,      X86::VMOVAPSYrm,          TB_ALIGN_32 },
+    { X86::VMOVDDUPYrr,     X86::VMOVDDUPYrm,         0 },
     { X86::VMOVDQAYrr,      X86::VMOVDQAYrm,          TB_ALIGN_32 },
+    { X86::VMOVSLDUPYrr,    X86::VMOVSLDUPYrm,        0 },
+    { X86::VMOVSHDUPYrr,    X86::VMOVSHDUPYrm,        0 },
     { X86::VMOVUPDYrr,      X86::VMOVUPDYrm,          0 },
     { X86::VMOVUPSYrr,      X86::VMOVUPSYrm,          0 },
     { X86::VPERMILPDYri,    X86::VPERMILPDYmi,        0 },
     { X86::VPERMILPSYri,    X86::VPERMILPSYmi,        0 },
+    { X86::VPTESTYrr,       X86::VPTESTYrm,           0 },
     { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
     { X86::VRCPPSYr_Int,    X86::VRCPPSYm_Int,        0 },
+    { X86::VROUNDYPDr,      X86::VROUNDYPDm,          0 },
+    { X86::VROUNDYPSr,      X86::VROUNDYPSm,          0 },
     { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
+    { X86::VRSQRTPSYr_Int,  X86::VRSQRTPSYm_Int,      0 },
     { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
     { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
-    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
-    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
+    { X86::VTESTPDYrr,      X86::VTESTPDYrm,          0 },
+    { X86::VTESTPSYrr,      X86::VTESTPSYrm,          0 },
 
     // AVX2 foldable instructions
+
+    // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the
+    // VBROADCASTS{SD}rm memory instructions were available from AVX1.
+    // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction
+    // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions
+    // so they don't need an equivalent limitation.
+    { X86::VBROADCASTSSrr,  X86::VBROADCASTSSrm,      TB_NO_REVERSE },
+    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
     { X86::VPABSBrr256,     X86::VPABSBrm256,         0 },
     { X86::VPABSDrr256,     X86::VPABSDrm256,         0 },
     { X86::VPABSWrr256,     X86::VPABSWrm256,         0 },
+    { X86::VPBROADCASTBrr,  X86::VPBROADCASTBrm,      0 },
+    { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm,     0 },
+    { X86::VPBROADCASTDrr,  X86::VPBROADCASTDrm,      0 },
+    { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm,     0 },
+    { X86::VPBROADCASTQrr,  X86::VPBROADCASTQrm,      0 },
+    { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm,     0 },
+    { X86::VPBROADCASTWrr,  X86::VPBROADCASTWrm,      0 },
+    { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm,     0 },
+    { X86::VPERMPDYri,      X86::VPERMPDYmi,          0 },
+    { X86::VPERMQYri,       X86::VPERMQYmi,           0 },
+    { X86::VPMOVSXBDYrr,    X86::VPMOVSXBDYrm,        0 },
+    { X86::VPMOVSXBQYrr,    X86::VPMOVSXBQYrm,        0 },
+    { X86::VPMOVSXBWYrr,    X86::VPMOVSXBWYrm,        0 },
+    { X86::VPMOVSXDQYrr,    X86::VPMOVSXDQYrm,        0 },
+    { X86::VPMOVSXWDYrr,    X86::VPMOVSXWDYrm,        0 },
+    { X86::VPMOVSXWQYrr,    X86::VPMOVSXWQYrm,        0 },
+    { X86::VPMOVZXBDYrr,    X86::VPMOVZXBDYrm,        0 },
+    { X86::VPMOVZXBQYrr,    X86::VPMOVZXBQYrm,        0 },
+    { X86::VPMOVZXBWYrr,    X86::VPMOVZXBWYrm,        0 },
+    { X86::VPMOVZXDQYrr,    X86::VPMOVZXDQYrm,        0 },
+    { X86::VPMOVZXWDYrr,    X86::VPMOVZXWDYrm,        0 },
+    { X86::VPMOVZXWQYrr,    X86::VPMOVZXWQYrm,        0 },
     { X86::VPSHUFDYri,      X86::VPSHUFDYmi,          0 },
     { X86::VPSHUFHWYri,     X86::VPSHUFHWYmi,         0 },
     { X86::VPSHUFLWYri,     X86::VPSHUFLWYmi,         0 },
 
+    // XOP foldable instructions
+    { X86::VFRCZPDrr,          X86::VFRCZPDrm,        0 },
+    { X86::VFRCZPDrrY,         X86::VFRCZPDrmY,       0 },
+    { X86::VFRCZPSrr,          X86::VFRCZPSrm,        0 },
+    { X86::VFRCZPSrrY,         X86::VFRCZPSrmY,       0 },
+    { X86::VFRCZSDrr,          X86::VFRCZSDrm,        0 },
+    { X86::VFRCZSSrr,          X86::VFRCZSSrm,        0 },
+    { X86::VPHADDBDrr,         X86::VPHADDBDrm,       0 },
+    { X86::VPHADDBQrr,         X86::VPHADDBQrm,       0 },
+    { X86::VPHADDBWrr,         X86::VPHADDBWrm,       0 },
+    { X86::VPHADDDQrr,         X86::VPHADDDQrm,       0 },
+    { X86::VPHADDWDrr,         X86::VPHADDWDrm,       0 },
+    { X86::VPHADDWQrr,         X86::VPHADDWQrm,       0 },
+    { X86::VPHADDUBDrr,        X86::VPHADDUBDrm,      0 },
+    { X86::VPHADDUBQrr,        X86::VPHADDUBQrm,      0 },
+    { X86::VPHADDUBWrr,        X86::VPHADDUBWrm,      0 },
+    { X86::VPHADDUDQrr,        X86::VPHADDUDQrm,      0 },
+    { X86::VPHADDUWDrr,        X86::VPHADDUWDrm,      0 },
+    { X86::VPHADDUWQrr,        X86::VPHADDUWQrm,      0 },
+    { X86::VPHSUBBWrr,         X86::VPHSUBBWrm,       0 },
+    { X86::VPHSUBDQrr,         X86::VPHSUBDQrm,       0 },
+    { X86::VPHSUBWDrr,         X86::VPHSUBWDrm,       0 },
+    { X86::VPROTBri,           X86::VPROTBmi,         0 },
+    { X86::VPROTBrr,           X86::VPROTBmr,         0 },
+    { X86::VPROTDri,           X86::VPROTDmi,         0 },
+    { X86::VPROTDrr,           X86::VPROTDmr,         0 },
+    { X86::VPROTQri,           X86::VPROTQmi,         0 },
+    { X86::VPROTQrr,           X86::VPROTQmr,         0 },
+    { X86::VPROTWri,           X86::VPROTWmi,         0 },
+    { X86::VPROTWrr,           X86::VPROTWmr,         0 },
+    { X86::VPSHABrr,           X86::VPSHABmr,         0 },
+    { X86::VPSHADrr,           X86::VPSHADmr,         0 },
+    { X86::VPSHAQrr,           X86::VPSHAQmr,         0 },
+    { X86::VPSHAWrr,           X86::VPSHAWmr,         0 },
+    { X86::VPSHLBrr,           X86::VPSHLBmr,         0 },
+    { X86::VPSHLDrr,           X86::VPSHLDmr,         0 },
+    { X86::VPSHLQrr,           X86::VPSHLQmr,         0 },
+    { X86::VPSHLWrr,           X86::VPSHLWmr,         0 },
+
     // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
     { X86::BEXTR32rr,       X86::BEXTR32rm,           0 },
     { X86::BEXTR64rr,       X86::BEXTR64rm,           0 },
@@ -659,6 +810,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVUPSZrr,      X86::VMOVUPSZrm,          0 },
     { X86::VPABSDZrr,       X86::VPABSDZrm,           0 },
     { X86::VPABSQZrr,       X86::VPABSQZrm,           0 },
+    { X86::VBROADCASTSSZr,  X86::VBROADCASTSSZm,      TB_NO_REVERSE },
+    { X86::VBROADCASTSDZr,  X86::VBROADCASTSDZm,      TB_NO_REVERSE },
+
     // AVX-512 foldable instructions (256-bit versions)
     { X86::VMOVAPDZ256rr,      X86::VMOVAPDZ256rm,          TB_ALIGN_32 },
     { X86::VMOVAPSZ256rr,      X86::VMOVAPSZ256rm,          TB_ALIGN_32 },
@@ -670,6 +824,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVDQU64Z256rr,    X86::VMOVDQU64Z256rm,        0 },
     { X86::VMOVUPDZ256rr,      X86::VMOVUPDZ256rm,          0 },
     { X86::VMOVUPSZ256rr,      X86::VMOVUPSZ256rm,          0 },
+    { X86::VBROADCASTSSZ256r,  X86::VBROADCASTSSZ256m,      TB_NO_REVERSE },
+    { X86::VBROADCASTSDZ256r,  X86::VBROADCASTSDZ256m,      TB_NO_REVERSE },
+
     // AVX-512 foldable instructions (256-bit versions)
     { X86::VMOVAPDZ128rr,      X86::VMOVAPDZ128rm,          TB_ALIGN_16 },
     { X86::VMOVAPSZ128rr,      X86::VMOVAPSZ128rm,          TB_ALIGN_16 },
@@ -681,25 +838,30 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVDQU64Z128rr,    X86::VMOVDQU64Z128rm,        0 },
     { X86::VMOVUPDZ128rr,      X86::VMOVUPDZ128rm,          0 },
     { X86::VMOVUPSZ128rr,      X86::VMOVUPSZ128rm,          0 },
+    { X86::VBROADCASTSSZ128r,  X86::VBROADCASTSSZ128m,      TB_NO_REVERSE },
+
+    // F16C foldable instructions
+    { X86::VCVTPH2PSrr,        X86::VCVTPH2PSrm,            0 },
+    { X86::VCVTPH2PSYrr,       X86::VCVTPH2PSYrm,           0 },
 
     // AES foldable instructions
     { X86::AESIMCrr,              X86::AESIMCrm,              TB_ALIGN_16 },
     { X86::AESKEYGENASSIST128rr,  X86::AESKEYGENASSIST128rm,  TB_ALIGN_16 },
-    { X86::VAESIMCrr,             X86::VAESIMCrm,             TB_ALIGN_16 },
-    { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 }
+    { X86::VAESIMCrr,             X86::VAESIMCrm,             0 },
+    { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
   };
 
-  for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
-    unsigned RegOp = OpTbl1[i].RegOp;
-    unsigned MemOp = OpTbl1[i].MemOp;
-    unsigned Flags = OpTbl1[i].Flags;
+  for (unsigned i = 0, e = array_lengthof(MemoryFoldTable1); i != e; ++i) {
+    unsigned RegOp = MemoryFoldTable1[i].RegOp;
+    unsigned MemOp = MemoryFoldTable1[i].MemOp;
+    unsigned Flags = MemoryFoldTable1[i].Flags;
     AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
                   RegOp, MemOp,
                   // Index 1, folded load
                   Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
   }
 
-  static const X86OpTblEntry OpTbl2[] = {
+  static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
     { X86::ADC32rr,         X86::ADC32rm,       0 },
     { X86::ADC64rr,         X86::ADC64rm,       0 },
     { X86::ADD16rr,         X86::ADD16rm,       0 },
@@ -712,7 +874,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::ADDPDrr,         X86::ADDPDrm,       TB_ALIGN_16 },
     { X86::ADDPSrr,         X86::ADDPSrm,       TB_ALIGN_16 },
     { X86::ADDSDrr,         X86::ADDSDrm,       0 },
+    { X86::ADDSDrr_Int,     X86::ADDSDrm_Int,   0 },
     { X86::ADDSSrr,         X86::ADDSSrm,       0 },
+    { X86::ADDSSrr_Int,     X86::ADDSSrm_Int,   0 },
     { X86::ADDSUBPDrr,      X86::ADDSUBPDrm,    TB_ALIGN_16 },
     { X86::ADDSUBPSrr,      X86::ADDSUBPSrm,    TB_ALIGN_16 },
     { X86::AND16rr,         X86::AND16rm,       0 },
@@ -782,7 +946,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::DIVPDrr,         X86::DIVPDrm,       TB_ALIGN_16 },
     { X86::DIVPSrr,         X86::DIVPSrm,       TB_ALIGN_16 },
     { X86::DIVSDrr,         X86::DIVSDrm,       0 },
+    { X86::DIVSDrr_Int,     X86::DIVSDrm_Int,   0 },
     { X86::DIVSSrr,         X86::DIVSSrm,       0 },
+    { X86::DIVSSrr_Int,     X86::DIVSSrm_Int,   0 },
+    { X86::DPPDrri,         X86::DPPDrmi,       TB_ALIGN_16 },
+    { X86::DPPSrri,         X86::DPPSrmi,       TB_ALIGN_16 },
+
+    // FIXME: We should not be folding Fs* scalar loads into vector
+    // instructions because the vector instructions require vector-sized
+    // loads. Lowering should create vector-sized instructions (the Fv*
+    // variants below) to allow load folding.
     { X86::FsANDNPDrr,      X86::FsANDNPDrm,    TB_ALIGN_16 },
     { X86::FsANDNPSrr,      X86::FsANDNPSrm,    TB_ALIGN_16 },
     { X86::FsANDPDrr,       X86::FsANDPDrm,     TB_ALIGN_16 },
@@ -791,6 +964,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::FsORPSrr,        X86::FsORPSrm,      TB_ALIGN_16 },
     { X86::FsXORPDrr,       X86::FsXORPDrm,     TB_ALIGN_16 },
     { X86::FsXORPSrr,       X86::FsXORPSrm,     TB_ALIGN_16 },
+
+    { X86::FvANDNPDrr,      X86::FvANDNPDrm,    TB_ALIGN_16 },
+    { X86::FvANDNPSrr,      X86::FvANDNPSrm,    TB_ALIGN_16 },
+    { X86::FvANDPDrr,       X86::FvANDPDrm,     TB_ALIGN_16 },
+    { X86::FvANDPSrr,       X86::FvANDPSrm,     TB_ALIGN_16 },
+    { X86::FvORPDrr,        X86::FvORPDrm,      TB_ALIGN_16 },
+    { X86::FvORPSrr,        X86::FvORPSrm,      TB_ALIGN_16 },
+    { X86::FvXORPDrr,       X86::FvXORPDrm,     TB_ALIGN_16 },
+    { X86::FvXORPSrr,       X86::FvXORPSrm,     TB_ALIGN_16 },
     { X86::HADDPDrr,        X86::HADDPDrm,      TB_ALIGN_16 },
     { X86::HADDPSrr,        X86::HADDPSrm,      TB_ALIGN_16 },
     { X86::HSUBPDrr,        X86::HSUBPDrm,      TB_ALIGN_16 },
@@ -809,16 +991,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::MAXPDrr,         X86::MAXPDrm,       TB_ALIGN_16 },
     { X86::MAXPSrr,         X86::MAXPSrm,       TB_ALIGN_16 },
     { X86::MAXSDrr,         X86::MAXSDrm,       0 },
+    { X86::MAXSDrr_Int,     X86::MAXSDrm_Int,   0 },
     { X86::MAXSSrr,         X86::MAXSSrm,       0 },
+    { X86::MAXSSrr_Int,     X86::MAXSSrm_Int,   0 },
     { X86::MINPDrr,         X86::MINPDrm,       TB_ALIGN_16 },
     { X86::MINPSrr,         X86::MINPSrm,       TB_ALIGN_16 },
     { X86::MINSDrr,         X86::MINSDrm,       0 },
+    { X86::MINSDrr_Int,     X86::MINSDrm_Int,   0 },
     { X86::MINSSrr,         X86::MINSSrm,       0 },
+    { X86::MINSSrr_Int,     X86::MINSSrm_Int,   0 },
     { X86::MPSADBWrri,      X86::MPSADBWrmi,    TB_ALIGN_16 },
     { X86::MULPDrr,         X86::MULPDrm,       TB_ALIGN_16 },
     { X86::MULPSrr,         X86::MULPSrm,       TB_ALIGN_16 },
     { X86::MULSDrr,         X86::MULSDrm,       0 },
+    { X86::MULSDrr_Int,     X86::MULSDrm_Int,   0 },
     { X86::MULSSrr,         X86::MULSSrm,       0 },
+    { X86::MULSSrr_Int,     X86::MULSSrm_Int,   0 },
     { X86::OR16rr,          X86::OR16rm,        0 },
     { X86::OR32rr,          X86::OR32rm,        0 },
     { X86::OR64rr,          X86::OR64rm,        0 },
@@ -842,7 +1030,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::PANDrr,          X86::PANDrm,        TB_ALIGN_16 },
     { X86::PAVGBrr,         X86::PAVGBrm,       TB_ALIGN_16 },
     { X86::PAVGWrr,         X86::PAVGWrm,       TB_ALIGN_16 },
+    { X86::PBLENDVBrr0,     X86::PBLENDVBrm0,   TB_ALIGN_16 },
     { X86::PBLENDWrri,      X86::PBLENDWrmi,    TB_ALIGN_16 },
+    { X86::PCLMULQDQrr,     X86::PCLMULQDQrm,   TB_ALIGN_16 },
     { X86::PCMPEQBrr,       X86::PCMPEQBrm,     TB_ALIGN_16 },
     { X86::PCMPEQDrr,       X86::PCMPEQDrm,     TB_ALIGN_16 },
     { X86::PCMPEQQrr,       X86::PCMPEQQrm,     TB_ALIGN_16 },
@@ -857,7 +1047,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::PHSUBDrr,        X86::PHSUBDrm,      TB_ALIGN_16 },
     { X86::PHSUBSWrr128,    X86::PHSUBSWrm128,  TB_ALIGN_16 },
     { X86::PHSUBWrr,        X86::PHSUBWrm,      TB_ALIGN_16 },
-    { X86::PINSRWrri,       X86::PINSRWrmi,     TB_ALIGN_16 },
+    { X86::PINSRBrr,        X86::PINSRBrm,      0 },
+    { X86::PINSRDrr,        X86::PINSRDrm,      0 },
+    { X86::PINSRQrr,        X86::PINSRQrm,      0 },
+    { X86::PINSRWrri,       X86::PINSRWrmi,     0 },
     { X86::PMADDUBSWrr128,  X86::PMADDUBSWrm128, TB_ALIGN_16 },
     { X86::PMADDWDrr,       X86::PMADDWDrm,     TB_ALIGN_16 },
     { X86::PMAXSWrr,        X86::PMAXSWrm,      TB_ALIGN_16 },
@@ -895,8 +1088,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::PSRLWrr,         X86::PSRLWrm,       TB_ALIGN_16 },
     { X86::PSUBBrr,         X86::PSUBBrm,       TB_ALIGN_16 },
     { X86::PSUBDrr,         X86::PSUBDrm,       TB_ALIGN_16 },
+    { X86::PSUBQrr,         X86::PSUBQrm,       TB_ALIGN_16 },
     { X86::PSUBSBrr,        X86::PSUBSBrm,      TB_ALIGN_16 },
     { X86::PSUBSWrr,        X86::PSUBSWrm,      TB_ALIGN_16 },
+    { X86::PSUBUSBrr,       X86::PSUBUSBrm,     TB_ALIGN_16 },
+    { X86::PSUBUSWrr,       X86::PSUBUSWrm,     TB_ALIGN_16 },
     { X86::PSUBWrr,         X86::PSUBWrm,       TB_ALIGN_16 },
     { X86::PUNPCKHBWrr,     X86::PUNPCKHBWrm,   TB_ALIGN_16 },
     { X86::PUNPCKHDQrr,     X86::PUNPCKHDQrm,   TB_ALIGN_16 },
@@ -918,7 +1114,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::SUBPDrr,         X86::SUBPDrm,       TB_ALIGN_16 },
     { X86::SUBPSrr,         X86::SUBPSrm,       TB_ALIGN_16 },
     { X86::SUBSDrr,         X86::SUBSDrm,       0 },
+    { X86::SUBSDrr_Int,     X86::SUBSDrm_Int,   0 },
     { X86::SUBSSrr,         X86::SUBSSrm,       0 },
+    { X86::SUBSSrr_Int,     X86::SUBSSrm_Int,   0 },
     // FIXME: TEST*rr -> swapped operand of TEST*mr.
     { X86::UNPCKHPDrr,      X86::UNPCKHPDrm,    TB_ALIGN_16 },
     { X86::UNPCKHPSrr,      X86::UNPCKHPSrm,    TB_ALIGN_16 },
@@ -930,6 +1128,79 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::XOR8rr,          X86::XOR8rm,        0 },
     { X86::XORPDrr,         X86::XORPDrm,       TB_ALIGN_16 },
     { X86::XORPSrr,         X86::XORPSrm,       TB_ALIGN_16 },
+
+    // MMX version of foldable instructions
+    { X86::MMX_CVTPI2PSirr,   X86::MMX_CVTPI2PSirm,   0 },
+    { X86::MMX_PACKSSDWirr,   X86::MMX_PACKSSDWirm,   0 },
+    { X86::MMX_PACKSSWBirr,   X86::MMX_PACKSSWBirm,   0 },
+    { X86::MMX_PACKUSWBirr,   X86::MMX_PACKUSWBirm,   0 },
+    { X86::MMX_PADDBirr,      X86::MMX_PADDBirm,      0 },
+    { X86::MMX_PADDDirr,      X86::MMX_PADDDirm,      0 },
+    { X86::MMX_PADDQirr,      X86::MMX_PADDQirm,      0 },
+    { X86::MMX_PADDSBirr,     X86::MMX_PADDSBirm,     0 },
+    { X86::MMX_PADDSWirr,     X86::MMX_PADDSWirm,     0 },
+    { X86::MMX_PADDUSBirr,    X86::MMX_PADDUSBirm,    0 },
+    { X86::MMX_PADDUSWirr,    X86::MMX_PADDUSWirm,    0 },
+    { X86::MMX_PADDWirr,      X86::MMX_PADDWirm,      0 },
+    { X86::MMX_PALIGNR64irr,  X86::MMX_PALIGNR64irm,  0 },
+    { X86::MMX_PANDNirr,      X86::MMX_PANDNirm,      0 },
+    { X86::MMX_PANDirr,       X86::MMX_PANDirm,       0 },
+    { X86::MMX_PAVGBirr,      X86::MMX_PAVGBirm,      0 },
+    { X86::MMX_PAVGWirr,      X86::MMX_PAVGWirm,      0 },
+    { X86::MMX_PCMPEQBirr,    X86::MMX_PCMPEQBirm,    0 },
+    { X86::MMX_PCMPEQDirr,    X86::MMX_PCMPEQDirm,    0 },
+    { X86::MMX_PCMPEQWirr,    X86::MMX_PCMPEQWirm,    0 },
+    { X86::MMX_PCMPGTBirr,    X86::MMX_PCMPGTBirm,    0 },
+    { X86::MMX_PCMPGTDirr,    X86::MMX_PCMPGTDirm,    0 },
+    { X86::MMX_PCMPGTWirr,    X86::MMX_PCMPGTWirm,    0 },
+    { X86::MMX_PHADDSWrr64,   X86::MMX_PHADDSWrm64,   0 },
+    { X86::MMX_PHADDWrr64,    X86::MMX_PHADDWrm64,    0 },
+    { X86::MMX_PHADDrr64,     X86::MMX_PHADDrm64,     0 },
+    { X86::MMX_PHSUBDrr64,    X86::MMX_PHSUBDrm64,    0 },
+    { X86::MMX_PHSUBSWrr64,   X86::MMX_PHSUBSWrm64,   0 },
+    { X86::MMX_PHSUBWrr64,    X86::MMX_PHSUBWrm64,    0 },
+    { X86::MMX_PINSRWirri,    X86::MMX_PINSRWirmi,    0 },
+    { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 },
+    { X86::MMX_PMADDWDirr,    X86::MMX_PMADDWDirm,    0 },
+    { X86::MMX_PMAXSWirr,     X86::MMX_PMAXSWirm,     0 },
+    { X86::MMX_PMAXUBirr,     X86::MMX_PMAXUBirm,     0 },
+    { X86::MMX_PMINSWirr,     X86::MMX_PMINSWirm,     0 },
+    { X86::MMX_PMINUBirr,     X86::MMX_PMINUBirm,     0 },
+    { X86::MMX_PMULHRSWrr64,  X86::MMX_PMULHRSWrm64,  0 },
+    { X86::MMX_PMULHUWirr,    X86::MMX_PMULHUWirm,    0 },
+    { X86::MMX_PMULHWirr,     X86::MMX_PMULHWirm,     0 },
+    { X86::MMX_PMULLWirr,     X86::MMX_PMULLWirm,     0 },
+    { X86::MMX_PMULUDQirr,    X86::MMX_PMULUDQirm,    0 },
+    { X86::MMX_PORirr,        X86::MMX_PORirm,        0 },
+    { X86::MMX_PSADBWirr,     X86::MMX_PSADBWirm,     0 },
+    { X86::MMX_PSHUFBrr64,    X86::MMX_PSHUFBrm64,    0 },
+    { X86::MMX_PSIGNBrr64,    X86::MMX_PSIGNBrm64,    0 },
+    { X86::MMX_PSIGNDrr64,    X86::MMX_PSIGNDrm64,    0 },
+    { X86::MMX_PSIGNWrr64,    X86::MMX_PSIGNWrm64,    0 },
+    { X86::MMX_PSLLDrr,       X86::MMX_PSLLDrm,       0 },
+    { X86::MMX_PSLLQrr,       X86::MMX_PSLLQrm,       0 },
+    { X86::MMX_PSLLWrr,       X86::MMX_PSLLWrm,       0 },
+    { X86::MMX_PSRADrr,       X86::MMX_PSRADrm,       0 },
+    { X86::MMX_PSRAWrr,       X86::MMX_PSRAWrm,       0 },
+    { X86::MMX_PSRLDrr,       X86::MMX_PSRLDrm,       0 },
+    { X86::MMX_PSRLQrr,       X86::MMX_PSRLQrm,       0 },
+    { X86::MMX_PSRLWrr,       X86::MMX_PSRLWrm,       0 },
+    { X86::MMX_PSUBBirr,      X86::MMX_PSUBBirm,      0 },
+    { X86::MMX_PSUBDirr,      X86::MMX_PSUBDirm,      0 },
+    { X86::MMX_PSUBQirr,      X86::MMX_PSUBQirm,      0 },
+    { X86::MMX_PSUBSBirr,     X86::MMX_PSUBSBirm,     0 },
+    { X86::MMX_PSUBSWirr,     X86::MMX_PSUBSWirm,     0 },
+    { X86::MMX_PSUBUSBirr,    X86::MMX_PSUBUSBirm,    0 },
+    { X86::MMX_PSUBUSWirr,    X86::MMX_PSUBUSWirm,    0 },
+    { X86::MMX_PSUBWirr,      X86::MMX_PSUBWirm,      0 },
+    { X86::MMX_PUNPCKHBWirr,  X86::MMX_PUNPCKHBWirm,  0 },
+    { X86::MMX_PUNPCKHDQirr,  X86::MMX_PUNPCKHDQirm,  0 },
+    { X86::MMX_PUNPCKHWDirr,  X86::MMX_PUNPCKHWDirm,  0 },
+    { X86::MMX_PUNPCKLBWirr,  X86::MMX_PUNPCKLBWirm,  0 },
+    { X86::MMX_PUNPCKLDQirr,  X86::MMX_PUNPCKLDQirm,  0 },
+    { X86::MMX_PUNPCKLWDirr,  X86::MMX_PUNPCKLWDirm,  0 },
+    { X86::MMX_PXORirr,       X86::MMX_PXORirm,       0 },
+
     // AVX 128-bit versions of foldable instructions
     { X86::VCVTSD2SSrr,       X86::VCVTSD2SSrm,        0 },
     { X86::Int_VCVTSD2SSrr,   X86::Int_VCVTSD2SSrm,    0 },
@@ -943,13 +1214,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::Int_VCVTSI2SSrr,   X86::Int_VCVTSI2SSrm,    0 },
     { X86::VCVTSS2SDrr,       X86::VCVTSS2SDrm,        0 },
     { X86::Int_VCVTSS2SDrr,   X86::Int_VCVTSS2SDrm,    0 },
+    { X86::VRCPSSr,           X86::VRCPSSm,            0 },
     { X86::VRSQRTSSr,         X86::VRSQRTSSm,          0 },
     { X86::VSQRTSDr,          X86::VSQRTSDm,           0 },
     { X86::VSQRTSSr,          X86::VSQRTSSm,           0 },
     { X86::VADDPDrr,          X86::VADDPDrm,           0 },
     { X86::VADDPSrr,          X86::VADDPSrm,           0 },
     { X86::VADDSDrr,          X86::VADDSDrm,           0 },
+    { X86::VADDSDrr_Int,      X86::VADDSDrm_Int,       0 },
     { X86::VADDSSrr,          X86::VADDSSrm,           0 },
+    { X86::VADDSSrr_Int,      X86::VADDSSrm_Int,       0 },
     { X86::VADDSUBPDrr,       X86::VADDSUBPDrm,        0 },
     { X86::VADDSUBPSrr,       X86::VADDSUBPSrm,        0 },
     { X86::VANDNPDrr,         X86::VANDNPDrm,          0 },
@@ -967,15 +1241,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VDIVPDrr,          X86::VDIVPDrm,           0 },
     { X86::VDIVPSrr,          X86::VDIVPSrm,           0 },
     { X86::VDIVSDrr,          X86::VDIVSDrm,           0 },
+    { X86::VDIVSDrr_Int,      X86::VDIVSDrm_Int,       0 },
     { X86::VDIVSSrr,          X86::VDIVSSrm,           0 },
-    { X86::VFsANDNPDrr,       X86::VFsANDNPDrm,        TB_ALIGN_16 },
-    { X86::VFsANDNPSrr,       X86::VFsANDNPSrm,        TB_ALIGN_16 },
-    { X86::VFsANDPDrr,        X86::VFsANDPDrm,         TB_ALIGN_16 },
-    { X86::VFsANDPSrr,        X86::VFsANDPSrm,         TB_ALIGN_16 },
-    { X86::VFsORPDrr,         X86::VFsORPDrm,          TB_ALIGN_16 },
-    { X86::VFsORPSrr,         X86::VFsORPSrm,          TB_ALIGN_16 },
-    { X86::VFsXORPDrr,        X86::VFsXORPDrm,         TB_ALIGN_16 },
-    { X86::VFsXORPSrr,        X86::VFsXORPSrm,         TB_ALIGN_16 },
+    { X86::VDIVSSrr_Int,      X86::VDIVSSrm_Int,       0 },
+    { X86::VDPPDrri,          X86::VDPPDrmi,           0 },
+    { X86::VDPPSrri,          X86::VDPPSrmi,           0 },
+    // Do not fold VFs* loads because there are no scalar load variants for
+    // these instructions. When folded, the load is required to be 128-bits, so
+    // the load size would not match.
+    { X86::VFvANDNPDrr,       X86::VFvANDNPDrm,        0 },
+    { X86::VFvANDNPSrr,       X86::VFvANDNPSrm,        0 },
+    { X86::VFvANDPDrr,        X86::VFvANDPDrm,         0 },
+    { X86::VFvANDPSrr,        X86::VFvANDPSrm,         0 },
+    { X86::VFvORPDrr,         X86::VFvORPDrm,          0 },
+    { X86::VFvORPSrr,         X86::VFvORPSrm,          0 },
+    { X86::VFvXORPDrr,        X86::VFvXORPDrm,         0 },
+    { X86::VFvXORPSrr,        X86::VFvXORPSrm,         0 },
     { X86::VHADDPDrr,         X86::VHADDPDrm,          0 },
     { X86::VHADDPSrr,         X86::VHADDPSrm,          0 },
     { X86::VHSUBPDrr,         X86::VHSUBPDrm,          0 },
@@ -985,16 +1266,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMAXPDrr,          X86::VMAXPDrm,           0 },
     { X86::VMAXPSrr,          X86::VMAXPSrm,           0 },
     { X86::VMAXSDrr,          X86::VMAXSDrm,           0 },
+    { X86::VMAXSDrr_Int,      X86::VMAXSDrm_Int,       0 },
     { X86::VMAXSSrr,          X86::VMAXSSrm,           0 },
+    { X86::VMAXSSrr_Int,      X86::VMAXSSrm_Int,       0 },
     { X86::VMINPDrr,          X86::VMINPDrm,           0 },
     { X86::VMINPSrr,          X86::VMINPSrm,           0 },
     { X86::VMINSDrr,          X86::VMINSDrm,           0 },
+    { X86::VMINSDrr_Int,      X86::VMINSDrm_Int,       0 },
     { X86::VMINSSrr,          X86::VMINSSrm,           0 },
+    { X86::VMINSSrr_Int,      X86::VMINSSrm_Int,       0 },
     { X86::VMPSADBWrri,       X86::VMPSADBWrmi,        0 },
     { X86::VMULPDrr,          X86::VMULPDrm,           0 },
     { X86::VMULPSrr,          X86::VMULPSrm,           0 },
     { X86::VMULSDrr,          X86::VMULSDrm,           0 },
+    { X86::VMULSDrr_Int,      X86::VMULSDrm_Int,       0 },
     { X86::VMULSSrr,          X86::VMULSSrm,           0 },
+    { X86::VMULSSrr_Int,      X86::VMULSSrm_Int,       0 },
     { X86::VORPDrr,           X86::VORPDrm,            0 },
     { X86::VORPSrr,           X86::VORPSrm,            0 },
     { X86::VPACKSSDWrr,       X86::VPACKSSDWrm,        0 },
@@ -1014,7 +1301,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDrr,           X86::VPANDrm,            0 },
     { X86::VPAVGBrr,          X86::VPAVGBrm,           0 },
     { X86::VPAVGWrr,          X86::VPAVGWrm,           0 },
+    { X86::VPBLENDVBrr,       X86::VPBLENDVBrm,        0 },
     { X86::VPBLENDWrri,       X86::VPBLENDWrmi,        0 },
+    { X86::VPCLMULQDQrr,      X86::VPCLMULQDQrm,       0 },
     { X86::VPCMPEQBrr,        X86::VPCMPEQBrm,         0 },
     { X86::VPCMPEQDrr,        X86::VPCMPEQDrm,         0 },
     { X86::VPCMPEQQrr,        X86::VPCMPEQQrm,         0 },
@@ -1031,6 +1320,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPHSUBWrr,         X86::VPHSUBWrm,          0 },
     { X86::VPERMILPDrr,       X86::VPERMILPDrm,        0 },
     { X86::VPERMILPSrr,       X86::VPERMILPSrm,        0 },
+    { X86::VPINSRBrr,         X86::VPINSRBrm,          0 },
+    { X86::VPINSRDrr,         X86::VPINSRDrm,          0 },
+    { X86::VPINSRQrr,         X86::VPINSRQrm,          0 },
     { X86::VPINSRWrri,        X86::VPINSRWrmi,         0 },
     { X86::VPMADDUBSWrr128,   X86::VPMADDUBSWrm128,    0 },
     { X86::VPMADDWDrr,        X86::VPMADDWDrm,         0 },
@@ -1069,8 +1361,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSRLWrr,          X86::VPSRLWrm,           0 },
     { X86::VPSUBBrr,          X86::VPSUBBrm,           0 },
     { X86::VPSUBDrr,          X86::VPSUBDrm,           0 },
+    { X86::VPSUBQrr,          X86::VPSUBQrm,           0 },
     { X86::VPSUBSBrr,         X86::VPSUBSBrm,          0 },
     { X86::VPSUBSWrr,         X86::VPSUBSWrm,          0 },
+    { X86::VPSUBUSBrr,        X86::VPSUBUSBrm,         0 },
+    { X86::VPSUBUSWrr,        X86::VPSUBUSWrm,         0 },
     { X86::VPSUBWrr,          X86::VPSUBWrm,           0 },
     { X86::VPUNPCKHBWrr,      X86::VPUNPCKHBWrm,       0 },
     { X86::VPUNPCKHDQrr,      X86::VPUNPCKHDQrm,       0 },
@@ -1086,13 +1381,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VSUBPDrr,          X86::VSUBPDrm,           0 },
     { X86::VSUBPSrr,          X86::VSUBPSrm,           0 },
     { X86::VSUBSDrr,          X86::VSUBSDrm,           0 },
+    { X86::VSUBSDrr_Int,      X86::VSUBSDrm_Int,       0 },
     { X86::VSUBSSrr,          X86::VSUBSSrm,           0 },
+    { X86::VSUBSSrr_Int,      X86::VSUBSSrm_Int,       0 },
     { X86::VUNPCKHPDrr,       X86::VUNPCKHPDrm,        0 },
     { X86::VUNPCKHPSrr,       X86::VUNPCKHPSrm,        0 },
     { X86::VUNPCKLPDrr,       X86::VUNPCKLPDrm,        0 },
     { X86::VUNPCKLPSrr,       X86::VUNPCKLPSrm,        0 },
     { X86::VXORPDrr,          X86::VXORPDrm,           0 },
     { X86::VXORPSrr,          X86::VXORPSrm,           0 },
+
     // AVX 256-bit foldable instructions
     { X86::VADDPDYrr,         X86::VADDPDYrm,          0 },
     { X86::VADDPSYrr,         X86::VADDPSYrm,          0 },
@@ -1110,6 +1408,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VCMPPSYrri,        X86::VCMPPSYrmi,         0 },
     { X86::VDIVPDYrr,         X86::VDIVPDYrm,          0 },
     { X86::VDIVPSYrr,         X86::VDIVPSYrm,          0 },
+    { X86::VDPPSYrri,         X86::VDPPSYrmi,          0 },
     { X86::VHADDPDYrr,        X86::VHADDPDYrm,         0 },
     { X86::VHADDPSYrr,        X86::VHADDPSYrm,         0 },
     { X86::VHSUBPDYrr,        X86::VHSUBPDYrm,         0 },
@@ -1136,6 +1435,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VUNPCKLPSYrr,      X86::VUNPCKLPSYrm,       0 },
     { X86::VXORPDYrr,         X86::VXORPDYrm,          0 },
     { X86::VXORPSYrr,         X86::VXORPSYrm,          0 },
+
     // AVX2 foldable instructions
     { X86::VINSERTI128rr,     X86::VINSERTI128rm,      0 },
     { X86::VPACKSSDWYrr,      X86::VPACKSSDWYrm,       0 },
@@ -1157,6 +1457,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPAVGWYrr,         X86::VPAVGWYrm,          0 },
     { X86::VPBLENDDrri,       X86::VPBLENDDrmi,        0 },
     { X86::VPBLENDDYrri,      X86::VPBLENDDYrmi,       0 },
+    { X86::VPBLENDVBYrr,      X86::VPBLENDVBYrm,       0 },
     { X86::VPBLENDWYrri,      X86::VPBLENDWYrmi,       0 },
     { X86::VPCMPEQBYrr,       X86::VPCMPEQBYrm,        0 },
     { X86::VPCMPEQDYrr,       X86::VPCMPEQDYrm,        0 },
@@ -1168,9 +1469,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPCMPGTWYrr,       X86::VPCMPGTWYrm,        0 },
     { X86::VPERM2I128rr,      X86::VPERM2I128rm,       0 },
     { X86::VPERMDYrr,         X86::VPERMDYrm,          0 },
-    { X86::VPERMPDYri,        X86::VPERMPDYmi,         0 },
     { X86::VPERMPSYrr,        X86::VPERMPSYrm,         0 },
-    { X86::VPERMQYri,         X86::VPERMQYmi,          0 },
     { X86::VPHADDDYrr,        X86::VPHADDDYrm,         0 },
     { X86::VPHADDSWrr256,     X86::VPHADDSWrm256,      0 },
     { X86::VPHADDWYrr,        X86::VPHADDWYrm,         0 },
@@ -1225,8 +1524,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSRLVQYrr,        X86::VPSRLVQYrm,         0 },
     { X86::VPSUBBYrr,         X86::VPSUBBYrm,          0 },
     { X86::VPSUBDYrr,         X86::VPSUBDYrm,          0 },
+    { X86::VPSUBQYrr,         X86::VPSUBQYrm,          0 },
     { X86::VPSUBSBYrr,        X86::VPSUBSBYrm,         0 },
     { X86::VPSUBSWYrr,        X86::VPSUBSWYrm,         0 },
+    { X86::VPSUBUSBYrr,       X86::VPSUBUSBYrm,        0 },
+    { X86::VPSUBUSWYrr,       X86::VPSUBUSWYrm,        0 },
     { X86::VPSUBWYrr,         X86::VPSUBWYrm,          0 },
     { X86::VPUNPCKHBWYrr,     X86::VPUNPCKHBWYrm,      0 },
     { X86::VPUNPCKHDQYrr,     X86::VPUNPCKHDQYrm,      0 },
@@ -1237,41 +1539,81 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPUNPCKLQDQYrr,    X86::VPUNPCKLQDQYrm,     0 },
     { X86::VPUNPCKLWDYrr,     X86::VPUNPCKLWDYrm,      0 },
     { X86::VPXORYrr,          X86::VPXORYrm,           0 },
-    // FIXME: add AVX 256-bit foldable instructions
 
     // FMA4 foldable patterns
-    { X86::VFMADDSS4rr,       X86::VFMADDSS4mr,        0           },
-    { X86::VFMADDSD4rr,       X86::VFMADDSD4mr,        0           },
-    { X86::VFMADDPS4rr,       X86::VFMADDPS4mr,        TB_ALIGN_16 },
-    { X86::VFMADDPD4rr,       X86::VFMADDPD4mr,        TB_ALIGN_16 },
-    { X86::VFMADDPS4rrY,      X86::VFMADDPS4mrY,       TB_ALIGN_32 },
-    { X86::VFMADDPD4rrY,      X86::VFMADDPD4mrY,       TB_ALIGN_32 },
-    { X86::VFNMADDSS4rr,      X86::VFNMADDSS4mr,       0           },
-    { X86::VFNMADDSD4rr,      X86::VFNMADDSD4mr,       0           },
-    { X86::VFNMADDPS4rr,      X86::VFNMADDPS4mr,       TB_ALIGN_16 },
-    { X86::VFNMADDPD4rr,      X86::VFNMADDPD4mr,       TB_ALIGN_16 },
-    { X86::VFNMADDPS4rrY,     X86::VFNMADDPS4mrY,      TB_ALIGN_32 },
-    { X86::VFNMADDPD4rrY,     X86::VFNMADDPD4mrY,      TB_ALIGN_32 },
-    { X86::VFMSUBSS4rr,       X86::VFMSUBSS4mr,        0           },
-    { X86::VFMSUBSD4rr,       X86::VFMSUBSD4mr,        0           },
-    { X86::VFMSUBPS4rr,       X86::VFMSUBPS4mr,        TB_ALIGN_16 },
-    { X86::VFMSUBPD4rr,       X86::VFMSUBPD4mr,        TB_ALIGN_16 },
-    { X86::VFMSUBPS4rrY,      X86::VFMSUBPS4mrY,       TB_ALIGN_32 },
-    { X86::VFMSUBPD4rrY,      X86::VFMSUBPD4mrY,       TB_ALIGN_32 },
-    { X86::VFNMSUBSS4rr,      X86::VFNMSUBSS4mr,       0           },
-    { X86::VFNMSUBSD4rr,      X86::VFNMSUBSD4mr,       0           },
-    { X86::VFNMSUBPS4rr,      X86::VFNMSUBPS4mr,       TB_ALIGN_16 },
-    { X86::VFNMSUBPD4rr,      X86::VFNMSUBPD4mr,       TB_ALIGN_16 },
-    { X86::VFNMSUBPS4rrY,     X86::VFNMSUBPS4mrY,      TB_ALIGN_32 },
-    { X86::VFNMSUBPD4rrY,     X86::VFNMSUBPD4mrY,      TB_ALIGN_32 },
-    { X86::VFMADDSUBPS4rr,    X86::VFMADDSUBPS4mr,     TB_ALIGN_16 },
-    { X86::VFMADDSUBPD4rr,    X86::VFMADDSUBPD4mr,     TB_ALIGN_16 },
-    { X86::VFMADDSUBPS4rrY,   X86::VFMADDSUBPS4mrY,    TB_ALIGN_32 },
-    { X86::VFMADDSUBPD4rrY,   X86::VFMADDSUBPD4mrY,    TB_ALIGN_32 },
-    { X86::VFMSUBADDPS4rr,    X86::VFMSUBADDPS4mr,     TB_ALIGN_16 },
-    { X86::VFMSUBADDPD4rr,    X86::VFMSUBADDPD4mr,     TB_ALIGN_16 },
-    { X86::VFMSUBADDPS4rrY,   X86::VFMSUBADDPS4mrY,    TB_ALIGN_32 },
-    { X86::VFMSUBADDPD4rrY,   X86::VFMSUBADDPD4mrY,    TB_ALIGN_32 },
+    { X86::VFMADDSS4rr,       X86::VFMADDSS4mr,        0 },
+    { X86::VFMADDSD4rr,       X86::VFMADDSD4mr,        0 },
+    { X86::VFMADDPS4rr,       X86::VFMADDPS4mr,        0 },
+    { X86::VFMADDPD4rr,       X86::VFMADDPD4mr,        0 },
+    { X86::VFMADDPS4rrY,      X86::VFMADDPS4mrY,       0 },
+    { X86::VFMADDPD4rrY,      X86::VFMADDPD4mrY,       0 },
+    { X86::VFNMADDSS4rr,      X86::VFNMADDSS4mr,       0 },
+    { X86::VFNMADDSD4rr,      X86::VFNMADDSD4mr,       0 },
+    { X86::VFNMADDPS4rr,      X86::VFNMADDPS4mr,       0 },
+    { X86::VFNMADDPD4rr,      X86::VFNMADDPD4mr,       0 },
+    { X86::VFNMADDPS4rrY,     X86::VFNMADDPS4mrY,      0 },
+    { X86::VFNMADDPD4rrY,     X86::VFNMADDPD4mrY,      0 },
+    { X86::VFMSUBSS4rr,       X86::VFMSUBSS4mr,        0 },
+    { X86::VFMSUBSD4rr,       X86::VFMSUBSD4mr,        0 },
+    { X86::VFMSUBPS4rr,       X86::VFMSUBPS4mr,        0 },
+    { X86::VFMSUBPD4rr,       X86::VFMSUBPD4mr,        0 },
+    { X86::VFMSUBPS4rrY,      X86::VFMSUBPS4mrY,       0 },
+    { X86::VFMSUBPD4rrY,      X86::VFMSUBPD4mrY,       0 },
+    { X86::VFNMSUBSS4rr,      X86::VFNMSUBSS4mr,       0 },
+    { X86::VFNMSUBSD4rr,      X86::VFNMSUBSD4mr,       0 },
+    { X86::VFNMSUBPS4rr,      X86::VFNMSUBPS4mr,       0 },
+    { X86::VFNMSUBPD4rr,      X86::VFNMSUBPD4mr,       0 },
+    { X86::VFNMSUBPS4rrY,     X86::VFNMSUBPS4mrY,      0 },
+    { X86::VFNMSUBPD4rrY,     X86::VFNMSUBPD4mrY,      0 },
+    { X86::VFMADDSUBPS4rr,    X86::VFMADDSUBPS4mr,     0 },
+    { X86::VFMADDSUBPD4rr,    X86::VFMADDSUBPD4mr,     0 },
+    { X86::VFMADDSUBPS4rrY,   X86::VFMADDSUBPS4mrY,    0 },
+    { X86::VFMADDSUBPD4rrY,   X86::VFMADDSUBPD4mrY,    0 },
+    { X86::VFMSUBADDPS4rr,    X86::VFMSUBADDPS4mr,     0 },
+    { X86::VFMSUBADDPD4rr,    X86::VFMSUBADDPD4mr,     0 },
+    { X86::VFMSUBADDPS4rrY,   X86::VFMSUBADDPS4mrY,    0 },
+    { X86::VFMSUBADDPD4rrY,   X86::VFMSUBADDPD4mrY,    0 },
+
+    // XOP foldable instructions
+    { X86::VPCMOVrr,          X86::VPCMOVmr,            0 },
+    { X86::VPCMOVrrY,         X86::VPCMOVmrY,           0 },
+    { X86::VPCOMBri,          X86::VPCOMBmi,            0 },
+    { X86::VPCOMDri,          X86::VPCOMDmi,            0 },
+    { X86::VPCOMQri,          X86::VPCOMQmi,            0 },
+    { X86::VPCOMWri,          X86::VPCOMWmi,            0 },
+    { X86::VPCOMUBri,         X86::VPCOMUBmi,           0 },
+    { X86::VPCOMUDri,         X86::VPCOMUDmi,           0 },
+    { X86::VPCOMUQri,         X86::VPCOMUQmi,           0 },
+    { X86::VPCOMUWri,         X86::VPCOMUWmi,           0 },
+    { X86::VPERMIL2PDrr,      X86::VPERMIL2PDmr,        0 },
+    { X86::VPERMIL2PDrrY,     X86::VPERMIL2PDmrY,       0 },
+    { X86::VPERMIL2PSrr,      X86::VPERMIL2PSmr,        0 },
+    { X86::VPERMIL2PSrrY,     X86::VPERMIL2PSmrY,       0 },
+    { X86::VPMACSDDrr,        X86::VPMACSDDrm,          0 },
+    { X86::VPMACSDQHrr,       X86::VPMACSDQHrm,         0 },
+    { X86::VPMACSDQLrr,       X86::VPMACSDQLrm,         0 },
+    { X86::VPMACSSDDrr,       X86::VPMACSSDDrm,         0 },
+    { X86::VPMACSSDQHrr,      X86::VPMACSSDQHrm,        0 },
+    { X86::VPMACSSDQLrr,      X86::VPMACSSDQLrm,        0 },
+    { X86::VPMACSSWDrr,       X86::VPMACSSWDrm,         0 },
+    { X86::VPMACSSWWrr,       X86::VPMACSSWWrm,         0 },
+    { X86::VPMACSWDrr,        X86::VPMACSWDrm,          0 },
+    { X86::VPMACSWWrr,        X86::VPMACSWWrm,          0 },
+    { X86::VPMADCSSWDrr,      X86::VPMADCSSWDrm,        0 },
+    { X86::VPMADCSWDrr,       X86::VPMADCSWDrm,         0 },
+    { X86::VPPERMrr,          X86::VPPERMmr,            0 },
+    { X86::VPROTBrr,          X86::VPROTBrm,            0 },
+    { X86::VPROTDrr,          X86::VPROTDrm,            0 },
+    { X86::VPROTQrr,          X86::VPROTQrm,            0 },
+    { X86::VPROTWrr,          X86::VPROTWrm,            0 },
+    { X86::VPSHABrr,          X86::VPSHABrm,            0 },
+    { X86::VPSHADrr,          X86::VPSHADrm,            0 },
+    { X86::VPSHAQrr,          X86::VPSHAQrm,            0 },
+    { X86::VPSHAWrr,          X86::VPSHAWrm,            0 },
+    { X86::VPSHLBrr,          X86::VPSHLBrm,            0 },
+    { X86::VPSHLDrr,          X86::VPSHLDrm,            0 },
+    { X86::VPSHLQrr,          X86::VPSHLQrm,            0 },
+    { X86::VPSHLWrr,          X86::VPSHLWrm,            0 },
 
     // BMI/BMI2 foldable instructions
     { X86::ANDN32rr,          X86::ANDN32rm,            0 },
@@ -1321,16 +1663,29 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VALIGNQrri,        X86::VALIGNQrmi,          0 },
     { X86::VALIGNDrri,        X86::VALIGNDrmi,          0 },
     { X86::VPMULUDQZrr,       X86::VPMULUDQZrm,         0 },
+    { X86::VBROADCASTSSZrkz,  X86::VBROADCASTSSZmkz,    TB_NO_REVERSE },
+    { X86::VBROADCASTSDZrkz,  X86::VBROADCASTSDZmkz,    TB_NO_REVERSE },
+
+    // AVX-512{F,VL} foldable instructions
+    { X86::VBROADCASTSSZ256rkz,  X86::VBROADCASTSSZ256mkz,      TB_NO_REVERSE },
+    { X86::VBROADCASTSDZ256rkz,  X86::VBROADCASTSDZ256mkz,      TB_NO_REVERSE },
+    { X86::VBROADCASTSSZ128rkz,  X86::VBROADCASTSSZ128mkz,      TB_NO_REVERSE },
+
+    // AVX-512{F,VL} foldable instructions
+    { X86::VADDPDZ128rr,      X86::VADDPDZ128rm,        0 },
+    { X86::VADDPDZ256rr,      X86::VADDPDZ256rm,        0 },
+    { X86::VADDPSZ128rr,      X86::VADDPSZ128rm,        0 },
+    { X86::VADDPSZ256rr,      X86::VADDPSZ256rm,        0 },
 
     // AES foldable instructions
     { X86::AESDECLASTrr,      X86::AESDECLASTrm,        TB_ALIGN_16 },
     { X86::AESDECrr,          X86::AESDECrm,            TB_ALIGN_16 },
     { X86::AESENCLASTrr,      X86::AESENCLASTrm,        TB_ALIGN_16 },
     { X86::AESENCrr,          X86::AESENCrm,            TB_ALIGN_16 },
-    { X86::VAESDECLASTrr,     X86::VAESDECLASTrm,       TB_ALIGN_16 },
-    { X86::VAESDECrr,         X86::VAESDECrm,           TB_ALIGN_16 },
-    { X86::VAESENCLASTrr,     X86::VAESENCLASTrm,       TB_ALIGN_16 },
-    { X86::VAESENCrr,         X86::VAESENCrm,           TB_ALIGN_16 },
+    { X86::VAESDECLASTrr,     X86::VAESDECLASTrm,       0 },
+    { X86::VAESDECrr,         X86::VAESDECrm,           0 },
+    { X86::VAESENCLASTrr,     X86::VAESENCLASTrm,       0 },
+    { X86::VAESENCrr,         X86::VAESENCrm,           0 },
 
     // SHA foldable instructions
     { X86::SHA1MSG1rr,        X86::SHA1MSG1rm,          TB_ALIGN_16 },
@@ -1339,20 +1694,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::SHA1RNDS4rri,      X86::SHA1RNDS4rmi,        TB_ALIGN_16 },
     { X86::SHA256MSG1rr,      X86::SHA256MSG1rm,        TB_ALIGN_16 },
     { X86::SHA256MSG2rr,      X86::SHA256MSG2rm,        TB_ALIGN_16 },
-    { X86::SHA256RNDS2rr,     X86::SHA256RNDS2rm,       TB_ALIGN_16 },
+    { X86::SHA256RNDS2rr,     X86::SHA256RNDS2rm,       TB_ALIGN_16 }
   };
 
-  for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
-    unsigned RegOp = OpTbl2[i].RegOp;
-    unsigned MemOp = OpTbl2[i].MemOp;
-    unsigned Flags = OpTbl2[i].Flags;
+  for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2); i != e; ++i) {
+    unsigned RegOp = MemoryFoldTable2[i].RegOp;
+    unsigned MemOp = MemoryFoldTable2[i].MemOp;
+    unsigned Flags = MemoryFoldTable2[i].Flags;
     AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
                   RegOp, MemOp,
                   // Index 2, folded load
                   Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
   }
 
-  static const X86OpTblEntry OpTbl3[] = {
+  static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
     // FMA foldable instructions
     { X86::VFMADDSSr231r,         X86::VFMADDSSr231m,         TB_ALIGN_NONE },
     { X86::VFMADDSDr231r,         X86::VFMADDSDr231m,         TB_ALIGN_NONE },
@@ -1493,6 +1848,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VFMSUBADDPD4rr,        X86::VFMSUBADDPD4rm,        TB_ALIGN_16 },
     { X86::VFMSUBADDPS4rrY,       X86::VFMSUBADDPS4rmY,       TB_ALIGN_32 },
     { X86::VFMSUBADDPD4rrY,       X86::VFMSUBADDPD4rmY,       TB_ALIGN_32 },
+
+    // XOP foldable instructions
+    { X86::VPCMOVrr,              X86::VPCMOVrm,              0 },
+    { X86::VPCMOVrrY,             X86::VPCMOVrmY,             0 },
+    { X86::VPERMIL2PDrr,          X86::VPERMIL2PDrm,          0 },
+    { X86::VPERMIL2PDrrY,         X86::VPERMIL2PDrmY,         0 },
+    { X86::VPERMIL2PSrr,          X86::VPERMIL2PSrm,          0 },
+    { X86::VPERMIL2PSrrY,         X86::VPERMIL2PSrmY,         0 },
+    { X86::VPPERMrr,              X86::VPPERMrm,              0 },
+
     // AVX-512 VPERMI instructions with 3 source operands.
     { X86::VPERMI2Drr,            X86::VPERMI2Drm,            0 },
     { X86::VPERMI2Qrr,            X86::VPERMI2Qrm,            0 },
@@ -1501,19 +1866,114 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VBLENDMPDZrr,          X86::VBLENDMPDZrm,          0 },
     { X86::VBLENDMPSZrr,          X86::VBLENDMPSZrm,          0 },
     { X86::VPBLENDMDZrr,          X86::VPBLENDMDZrm,          0 },
-    { X86::VPBLENDMQZrr,          X86::VPBLENDMQZrm,          0 }
+    { X86::VPBLENDMQZrr,          X86::VPBLENDMQZrm,          0 },
+    { X86::VBROADCASTSSZrk,       X86::VBROADCASTSSZmk,       TB_NO_REVERSE },
+    { X86::VBROADCASTSDZrk,       X86::VBROADCASTSDZmk,       TB_NO_REVERSE },
+    { X86::VBROADCASTSSZ256rk,    X86::VBROADCASTSSZ256mk,    TB_NO_REVERSE },
+    { X86::VBROADCASTSDZ256rk,    X86::VBROADCASTSDZ256mk,    TB_NO_REVERSE },
+    { X86::VBROADCASTSSZ128rk,    X86::VBROADCASTSSZ128mk,    TB_NO_REVERSE },
+     // AVX-512 arithmetic instructions
+    { X86::VADDPSZrrkz,           X86::VADDPSZrmkz,           0 },
+    { X86::VADDPDZrrkz,           X86::VADDPDZrmkz,           0 },
+    { X86::VSUBPSZrrkz,           X86::VSUBPSZrmkz,           0 },
+    { X86::VSUBPDZrrkz,           X86::VSUBPDZrmkz,           0 },
+    { X86::VMULPSZrrkz,           X86::VMULPSZrmkz,           0 },
+    { X86::VMULPDZrrkz,           X86::VMULPDZrmkz,           0 },
+    { X86::VDIVPSZrrkz,           X86::VDIVPSZrmkz,           0 },
+    { X86::VDIVPDZrrkz,           X86::VDIVPDZrmkz,           0 },
+    { X86::VMINPSZrrkz,           X86::VMINPSZrmkz,           0 },
+    { X86::VMINPDZrrkz,           X86::VMINPDZrmkz,           0 },
+    { X86::VMAXPSZrrkz,           X86::VMAXPSZrmkz,           0 },
+    { X86::VMAXPDZrrkz,           X86::VMAXPDZrmkz,           0 },
+    // AVX-512{F,VL} arithmetic instructions 256-bit
+    { X86::VADDPSZ256rrkz,        X86::VADDPSZ256rmkz,        0 },
+    { X86::VADDPDZ256rrkz,        X86::VADDPDZ256rmkz,        0 },
+    { X86::VSUBPSZ256rrkz,        X86::VSUBPSZ256rmkz,        0 },
+    { X86::VSUBPDZ256rrkz,        X86::VSUBPDZ256rmkz,        0 },
+    { X86::VMULPSZ256rrkz,        X86::VMULPSZ256rmkz,        0 },
+    { X86::VMULPDZ256rrkz,        X86::VMULPDZ256rmkz,        0 },
+    { X86::VDIVPSZ256rrkz,        X86::VDIVPSZ256rmkz,        0 },
+    { X86::VDIVPDZ256rrkz,        X86::VDIVPDZ256rmkz,        0 },
+    { X86::VMINPSZ256rrkz,        X86::VMINPSZ256rmkz,        0 },
+    { X86::VMINPDZ256rrkz,        X86::VMINPDZ256rmkz,        0 },
+    { X86::VMAXPSZ256rrkz,        X86::VMAXPSZ256rmkz,        0 },
+    { X86::VMAXPDZ256rrkz,        X86::VMAXPDZ256rmkz,        0 },
+    // AVX-512{F,VL} arithmetic instructions 128-bit
+    { X86::VADDPSZ128rrkz,        X86::VADDPSZ128rmkz,        0 },
+    { X86::VADDPDZ128rrkz,        X86::VADDPDZ128rmkz,        0 },
+    { X86::VSUBPSZ128rrkz,        X86::VSUBPSZ128rmkz,        0 },
+    { X86::VSUBPDZ128rrkz,        X86::VSUBPDZ128rmkz,        0 },
+    { X86::VMULPSZ128rrkz,        X86::VMULPSZ128rmkz,        0 },
+    { X86::VMULPDZ128rrkz,        X86::VMULPDZ128rmkz,        0 },
+    { X86::VDIVPSZ128rrkz,        X86::VDIVPSZ128rmkz,        0 },
+    { X86::VDIVPDZ128rrkz,        X86::VDIVPDZ128rmkz,        0 },
+    { X86::VMINPSZ128rrkz,        X86::VMINPSZ128rmkz,        0 },
+    { X86::VMINPDZ128rrkz,        X86::VMINPDZ128rmkz,        0 },
+    { X86::VMAXPSZ128rrkz,        X86::VMAXPSZ128rmkz,        0 },
+    { X86::VMAXPDZ128rrkz,        X86::VMAXPDZ128rmkz,        0 }
   };
 
-  for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
-    unsigned RegOp = OpTbl3[i].RegOp;
-    unsigned MemOp = OpTbl3[i].MemOp;
-    unsigned Flags = OpTbl3[i].Flags;
+  for (unsigned i = 0, e = array_lengthof(MemoryFoldTable3); i != e; ++i) {
+    unsigned RegOp = MemoryFoldTable3[i].RegOp;
+    unsigned MemOp = MemoryFoldTable3[i].MemOp;
+    unsigned Flags = MemoryFoldTable3[i].Flags;
     AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
                   RegOp, MemOp,
                   // Index 3, folded load
                   Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
   }
 
+  static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
+     // AVX-512 foldable instructions
+    { X86::VADDPSZrrk,         X86::VADDPSZrmk,           0 },
+    { X86::VADDPDZrrk,         X86::VADDPDZrmk,           0 },
+    { X86::VSUBPSZrrk,         X86::VSUBPSZrmk,           0 },
+    { X86::VSUBPDZrrk,         X86::VSUBPDZrmk,           0 },
+    { X86::VMULPSZrrk,         X86::VMULPSZrmk,           0 },
+    { X86::VMULPDZrrk,         X86::VMULPDZrmk,           0 },
+    { X86::VDIVPSZrrk,         X86::VDIVPSZrmk,           0 },
+    { X86::VDIVPDZrrk,         X86::VDIVPDZrmk,           0 },
+    { X86::VMINPSZrrk,         X86::VMINPSZrmk,           0 },
+    { X86::VMINPDZrrk,         X86::VMINPDZrmk,           0 },
+    { X86::VMAXPSZrrk,         X86::VMAXPSZrmk,           0 },
+    { X86::VMAXPDZrrk,         X86::VMAXPDZrmk,           0 },
+    // AVX-512{F,VL} foldable instructions 256-bit
+    { X86::VADDPSZ256rrk,      X86::VADDPSZ256rmk,        0 },
+    { X86::VADDPDZ256rrk,      X86::VADDPDZ256rmk,        0 },
+    { X86::VSUBPSZ256rrk,      X86::VSUBPSZ256rmk,        0 },
+    { X86::VSUBPDZ256rrk,      X86::VSUBPDZ256rmk,        0 },
+    { X86::VMULPSZ256rrk,      X86::VMULPSZ256rmk,        0 },
+    { X86::VMULPDZ256rrk,      X86::VMULPDZ256rmk,        0 },
+    { X86::VDIVPSZ256rrk,      X86::VDIVPSZ256rmk,        0 },
+    { X86::VDIVPDZ256rrk,      X86::VDIVPDZ256rmk,        0 },
+    { X86::VMINPSZ256rrk,      X86::VMINPSZ256rmk,        0 },
+    { X86::VMINPDZ256rrk,      X86::VMINPDZ256rmk,        0 },
+    { X86::VMAXPSZ256rrk,      X86::VMAXPSZ256rmk,        0 },
+    { X86::VMAXPDZ256rrk,      X86::VMAXPDZ256rmk,        0 },
+    // AVX-512{F,VL} foldable instructions 128-bit
+    { X86::VADDPSZ128rrk,      X86::VADDPSZ128rmk,        0 },
+    { X86::VADDPDZ128rrk,      X86::VADDPDZ128rmk,        0 },
+    { X86::VSUBPSZ128rrk,      X86::VSUBPSZ128rmk,        0 },
+    { X86::VSUBPDZ128rrk,      X86::VSUBPDZ128rmk,        0 },
+    { X86::VMULPSZ128rrk,      X86::VMULPSZ128rmk,        0 },
+    { X86::VMULPDZ128rrk,      X86::VMULPDZ128rmk,        0 },
+    { X86::VDIVPSZ128rrk,      X86::VDIVPSZ128rmk,        0 },
+    { X86::VDIVPDZ128rrk,      X86::VDIVPDZ128rmk,        0 },
+    { X86::VMINPSZ128rrk,      X86::VMINPSZ128rmk,        0 },
+    { X86::VMINPDZ128rrk,      X86::VMINPDZ128rmk,        0 },
+    { X86::VMAXPSZ128rrk,      X86::VMAXPSZ128rmk,        0 },
+    { X86::VMAXPDZ128rrk,      X86::VMAXPDZ128rmk,        0 }
+  };
+
+  for (unsigned i = 0, e = array_lengthof(MemoryFoldTable4); i != e; ++i) {
+    unsigned RegOp = MemoryFoldTable4[i].RegOp;
+    unsigned MemOp = MemoryFoldTable4[i].MemOp;
+    unsigned Flags = MemoryFoldTable4[i].Flags;
+    AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 4, folded load
+                  Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
+  }
 }
 
 void
@@ -1579,7 +2039,59 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
   return false;
 }
 
-/// isFrameOperand - Return true and the FrameIndex if the specified
+int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+
+  if (MI->getOpcode() == getCallFrameSetupOpcode() ||
+      MI->getOpcode() == getCallFrameDestroyOpcode()) {
+    unsigned StackAlign = TFI->getStackAlignment();
+    int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign *
+                 StackAlign;
+
+    SPAdj -= MI->getOperand(1).getImm();
+
+    if (MI->getOpcode() == getCallFrameSetupOpcode())
+      return SPAdj;
+    else
+      return -SPAdj;
+  }
+
+  // To know whether a call adjusts the stack, we need information
+  // that is bound to the following ADJCALLSTACKUP pseudo.
+  // Look for the next ADJCALLSTACKUP that follows the call.
+  if (MI->isCall()) {
+    const MachineBasicBlock* MBB = MI->getParent();
+    auto I = ++MachineBasicBlock::const_iterator(MI);
+    for (auto E = MBB->end(); I != E; ++I) {
+      if (I->getOpcode() == getCallFrameDestroyOpcode() ||
+          I->isCall())
+        break;
+    }
+
+    // If we could not find a frame destroy opcode, then it has already
+    // been simplified, so we don't care.
+    if (I->getOpcode() != getCallFrameDestroyOpcode())
+      return 0;
+
+    return -(I->getOperand(1).getImm());
+  }
+
+  // Currently handle only PUSHes we can reasonably expect to see
+  // in call sequences
+  switch (MI->getOpcode()) {
+  default:
+    return 0;
+  case X86::PUSH32i8:
+  case X86::PUSH32r:
+  case X86::PUSH32rmm:
+  case X86::PUSH32rmr:
+  case X86::PUSHi32:
+    return 4;
+  }
+}
+
+/// Return true and the FrameIndex if the specified
 /// operand and follow operands form a reference to the stack frame.
 bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
                                   int &FrameIndex) const {
@@ -1706,8 +2218,7 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
   return 0;
 }
 
-/// regIsPICBase - Return true if register is PIC base (i.e.g defined by
-/// X86::MOVPC32r.
+/// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r.
 static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
   // Don't waste compile time scanning use-def chains of physregs.
   if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
@@ -1903,8 +2414,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
   NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
 }
 
-/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
-/// is not marked dead.
+/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
 static bool hasLiveCondCodeDef(MachineInstr *MI) {
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     MachineOperand &MO = MI->getOperand(i);
@@ -1916,8 +2426,7 @@ static bool hasLiveCondCodeDef(MachineInstr *MI) {
   return false;
 }
 
-/// getTruncatedShiftCount - check whether the shift count for a machine operand
-/// is non-zero.
+/// Check whether the shift count for a machine operand is non-zero.
 inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
                                               unsigned ShiftAmtOperandIdx) {
   // The shift count is six bits with the REX.W prefix and five bits without.
@@ -1926,7 +2435,7 @@ inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
   return Imm & ShiftCountMask;
 }
 
-/// isTruncatedShiftCountForLEA - check whether the given shift count is appropriate
+/// Check whether the given shift count is appropriate
 /// can be represented by a LEA instruction.
 inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
   // Left shift instructions can be transformed into load-effective-address
@@ -2008,10 +2517,9 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
   return true;
 }
 
-/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
-/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting
-/// to a 32-bit superregister and then truncating back down to a 16-bit
-/// subregister.
+/// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit
+/// LEA to form 3-address code by promoting to a 32-bit superregister and then
+/// truncating back down to a 16-bit subregister.
 MachineInstr *
 X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
                                            MachineFunction::iterator &MFI,
@@ -2058,11 +2566,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
     break;
   }
   case X86::INC16r:
-  case X86::INC64_16r:
     addRegOffset(MIB, leaInReg, true, 1);
     break;
   case X86::DEC16r:
-  case X86::DEC64_16r:
     addRegOffset(MIB, leaInReg, true, -1);
     break;
   case X86::ADD16ri:
@@ -2120,7 +2626,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
   return ExtMI;
 }
 
-/// convertToThreeAddress - This method must be implemented by targets that
+/// This method must be implemented by targets that
 /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
 /// may be able to convert a two-address instruction into a true
 /// three-address instruction on demand.  This allows the X86 target (for
@@ -2156,6 +2662,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
 
   unsigned MIOpc = MI->getOpcode();
   switch (MIOpc) {
+  default: return nullptr;
   case X86::SHL64ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
@@ -2210,185 +2717,175 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
     break;
   }
-  default: {
+  case X86::INC64r:
+  case X86::INC32r: {
+    assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+    unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
+      : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+    bool isKill, isUndef;
+    unsigned SrcReg;
+    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+                        SrcReg, isKill, isUndef, ImplicitOp))
+      return nullptr;
 
-    switch (MIOpc) {
-    default: return nullptr;
-    case X86::INC64r:
-    case X86::INC32r:
-    case X86::INC64_32r: {
-      assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
-      unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
-        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
-      bool isKill, isUndef;
-      unsigned SrcReg;
-      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
-      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
-                          SrcReg, isKill, isUndef, ImplicitOp))
-        return nullptr;
+    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+        .addOperand(Dest)
+        .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
+    if (ImplicitOp.getReg() != 0)
+      MIB.addOperand(ImplicitOp);
 
-      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
-          .addOperand(Dest)
-          .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
-      if (ImplicitOp.getReg() != 0)
-        MIB.addOperand(ImplicitOp);
+    NewMI = addOffset(MIB, 1);
+    break;
+  }
+  case X86::INC16r:
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                     : nullptr;
+    assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+    NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                      .addOperand(Dest).addOperand(Src), 1);
+    break;
+  case X86::DEC64r:
+  case X86::DEC32r: {
+    assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+    unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+      : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
 
-      NewMI = addOffset(MIB, 1);
-      break;
-    }
-    case X86::INC16r:
-    case X86::INC64_16r:
-      if (DisableLEA16)
-        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
-                       : nullptr;
-      assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
-      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                        .addOperand(Dest).addOperand(Src), 1);
-      break;
-    case X86::DEC64r:
-    case X86::DEC32r:
-    case X86::DEC64_32r: {
-      assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
-      unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
-        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
-
-      bool isKill, isUndef;
-      unsigned SrcReg;
-      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
-      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
-                          SrcReg, isKill, isUndef, ImplicitOp))
-        return nullptr;
+    bool isKill, isUndef;
+    unsigned SrcReg;
+    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+                        SrcReg, isKill, isUndef, ImplicitOp))
+      return nullptr;
 
-      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
-          .addOperand(Dest)
-          .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
-      if (ImplicitOp.getReg() != 0)
-        MIB.addOperand(ImplicitOp);
+    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+        .addOperand(Dest)
+        .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+    if (ImplicitOp.getReg() != 0)
+      MIB.addOperand(ImplicitOp);
 
-      NewMI = addOffset(MIB, -1);
+    NewMI = addOffset(MIB, -1);
 
-      break;
-    }
-    case X86::DEC16r:
-    case X86::DEC64_16r:
-      if (DisableLEA16)
-        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
-                       : nullptr;
-      assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
-      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                        .addOperand(Dest).addOperand(Src), -1);
-      break;
-    case X86::ADD64rr:
-    case X86::ADD64rr_DB:
-    case X86::ADD32rr:
-    case X86::ADD32rr_DB: {
-      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      unsigned Opc;
-      if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
-        Opc = X86::LEA64r;
-      else
-        Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+    break;
+  }
+  case X86::DEC16r:
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                     : nullptr;
+    assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+    NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                      .addOperand(Dest).addOperand(Src), -1);
+    break;
+  case X86::ADD64rr:
+  case X86::ADD64rr_DB:
+  case X86::ADD32rr:
+  case X86::ADD32rr_DB: {
+    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+    unsigned Opc;
+    if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
+      Opc = X86::LEA64r;
+    else
+      Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
-      bool isKill, isUndef;
-      unsigned SrcReg;
-      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
-      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
-                          SrcReg, isKill, isUndef, ImplicitOp))
-        return nullptr;
+    bool isKill, isUndef;
+    unsigned SrcReg;
+    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+                        SrcReg, isKill, isUndef, ImplicitOp))
+      return nullptr;
 
-      const MachineOperand &Src2 = MI->getOperand(2);
-      bool isKill2, isUndef2;
-      unsigned SrcReg2;
-      MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
-      if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
-                          SrcReg2, isKill2, isUndef2, ImplicitOp2))
-        return nullptr;
+    const MachineOperand &Src2 = MI->getOperand(2);
+    bool isKill2, isUndef2;
+    unsigned SrcReg2;
+    MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
+                        SrcReg2, isKill2, isUndef2, ImplicitOp2))
+      return nullptr;
 
-      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
-        .addOperand(Dest);
-      if (ImplicitOp.getReg() != 0)
-        MIB.addOperand(ImplicitOp);
-      if (ImplicitOp2.getReg() != 0)
-        MIB.addOperand(ImplicitOp2);
+    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+      .addOperand(Dest);
+    if (ImplicitOp.getReg() != 0)
+      MIB.addOperand(ImplicitOp);
+    if (ImplicitOp2.getReg() != 0)
+      MIB.addOperand(ImplicitOp2);
 
-      NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
+    NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
 
-      // Preserve undefness of the operands.
-      NewMI->getOperand(1).setIsUndef(isUndef);
-      NewMI->getOperand(3).setIsUndef(isUndef2);
+    // Preserve undefness of the operands.
+    NewMI->getOperand(1).setIsUndef(isUndef);
+    NewMI->getOperand(3).setIsUndef(isUndef2);
 
-      if (LV && Src2.isKill())
-        LV->replaceKillInstruction(SrcReg2, MI, NewMI);
-      break;
-    }
-    case X86::ADD16rr:
-    case X86::ADD16rr_DB: {
-      if (DisableLEA16)
-        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
-                       : nullptr;
-      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      unsigned Src2 = MI->getOperand(2).getReg();
-      bool isKill2 = MI->getOperand(2).isKill();
-      NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                        .addOperand(Dest),
-                        Src.getReg(), Src.isKill(), Src2, isKill2);
-
-      // Preserve undefness of the operands.
-      bool isUndef = MI->getOperand(1).isUndef();
-      bool isUndef2 = MI->getOperand(2).isUndef();
-      NewMI->getOperand(1).setIsUndef(isUndef);
-      NewMI->getOperand(3).setIsUndef(isUndef2);
-
-      if (LV && isKill2)
-        LV->replaceKillInstruction(Src2, MI, NewMI);
-      break;
-    }
-    case X86::ADD64ri32:
-    case X86::ADD64ri8:
-    case X86::ADD64ri32_DB:
-    case X86::ADD64ri8_DB:
-      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
-                        .addOperand(Dest).addOperand(Src),
-                        MI->getOperand(2).getImm());
-      break;
-    case X86::ADD32ri:
-    case X86::ADD32ri8:
-    case X86::ADD32ri_DB:
-    case X86::ADD32ri8_DB: {
-      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
-
-      bool isKill, isUndef;
-      unsigned SrcReg;
-      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
-      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
-                          SrcReg, isKill, isUndef, ImplicitOp))
-        return nullptr;
+    if (LV && Src2.isKill())
+      LV->replaceKillInstruction(SrcReg2, MI, NewMI);
+    break;
+  }
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB: {
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                     : nullptr;
+    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+    unsigned Src2 = MI->getOperand(2).getReg();
+    bool isKill2 = MI->getOperand(2).isKill();
+    NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                      .addOperand(Dest),
+                      Src.getReg(), Src.isKill(), Src2, isKill2);
+
+    // Preserve undefness of the operands.
+    bool isUndef = MI->getOperand(1).isUndef();
+    bool isUndef2 = MI->getOperand(2).isUndef();
+    NewMI->getOperand(1).setIsUndef(isUndef);
+    NewMI->getOperand(3).setIsUndef(isUndef2);
+
+    if (LV && isKill2)
+      LV->replaceKillInstruction(Src2, MI, NewMI);
+    break;
+  }
+  case X86::ADD64ri32:
+  case X86::ADD64ri8:
+  case X86::ADD64ri32_DB:
+  case X86::ADD64ri8_DB:
+    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+    NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+                      .addOperand(Dest).addOperand(Src),
+                      MI->getOperand(2).getImm());
+    break;
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+  case X86::ADD32ri_DB:
+  case X86::ADD32ri8_DB: {
+    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
-      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
-          .addOperand(Dest)
-          .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
-      if (ImplicitOp.getReg() != 0)
-        MIB.addOperand(ImplicitOp);
+    bool isKill, isUndef;
+    unsigned SrcReg;
+    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+                        SrcReg, isKill, isUndef, ImplicitOp))
+      return nullptr;
 
-      NewMI = addOffset(MIB, MI->getOperand(2).getImm());
-      break;
-    }
-    case X86::ADD16ri:
-    case X86::ADD16ri8:
-    case X86::ADD16ri_DB:
-    case X86::ADD16ri8_DB:
-      if (DisableLEA16)
-        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
-                       : nullptr;
-      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                        .addOperand(Dest).addOperand(Src),
-                        MI->getOperand(2).getImm());
-      break;
-    }
+    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+        .addOperand(Dest)
+        .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+    if (ImplicitOp.getReg() != 0)
+      MIB.addOperand(ImplicitOp);
+
+    NewMI = addOffset(MIB, MI->getOperand(2).getImm());
+    break;
   }
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri_DB:
+  case X86::ADD16ri8_DB:
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                     : nullptr;
+    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+    NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                      .addOperand(Dest).addOperand(Src),
+                      MI->getOperand(2).getImm());
+    break;
   }
 
   if (!NewMI) return nullptr;
@@ -2404,8 +2901,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   return NewMI;
 }
 
-/// commuteInstruction - We have a few instructions that must be hacked on to
-/// commute them.
+/// We have a few instructions that must be hacked on to commute them.
 ///
 MachineInstr *
 X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
@@ -2473,6 +2969,71 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
     MI->getOperand(3).setImm(Mask ^ Imm);
     return TargetInstrInfo::commuteInstruction(MI, NewMI);
   }
+  case X86::PCLMULQDQrr:
+  case X86::VPCLMULQDQrr:{
+    // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
+    // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
+    unsigned Imm = MI->getOperand(3).getImm();
+    unsigned Src1Hi = Imm & 0x01;
+    unsigned Src2Hi = Imm & 0x10;
+    if (NewMI) {
+      MachineFunction &MF = *MI->getParent()->getParent();
+      MI = MF.CloneMachineInstr(MI);
+      NewMI = false;
+    }
+    MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
+    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+  }
+  case X86::CMPPDrri:
+  case X86::CMPPSrri:
+  case X86::VCMPPDrri:
+  case X86::VCMPPSrri:
+  case X86::VCMPPDYrri:
+  case X86::VCMPPSYrri: {
+    // Float comparison can be safely commuted for
+    // Ordered/Unordered/Equal/NotEqual tests
+    unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+    switch (Imm) {
+    case 0x00: // EQUAL
+    case 0x03: // UNORDERED
+    case 0x04: // NOT EQUAL
+    case 0x07: // ORDERED
+      if (NewMI) {
+        MachineFunction &MF = *MI->getParent()->getParent();
+        MI = MF.CloneMachineInstr(MI);
+        NewMI = false;
+      }
+      return TargetInstrInfo::commuteInstruction(MI, NewMI);
+    default:
+      return nullptr;
+    }
+  }
+  case X86::VPCOMBri: case X86::VPCOMUBri:
+  case X86::VPCOMDri: case X86::VPCOMUDri:
+  case X86::VPCOMQri: case X86::VPCOMUQri:
+  case X86::VPCOMWri: case X86::VPCOMUWri: {
+    // Flip comparison mode immediate (if necessary).
+    unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+    switch (Imm) {
+    case 0x00: Imm = 0x02; break; // LT -> GT
+    case 0x01: Imm = 0x03; break; // LE -> GE
+    case 0x02: Imm = 0x00; break; // GT -> LT
+    case 0x03: Imm = 0x01; break; // GE -> LE
+    case 0x04: // EQ
+    case 0x05: // NE
+    case 0x06: // FALSE
+    case 0x07: // TRUE
+    default:
+      break;
+    }
+    if (NewMI) {
+      MachineFunction &MF = *MI->getParent()->getParent();
+      MI = MF.CloneMachineInstr(MI);
+      NewMI = false;
+    }
+    MI->getOperand(3).setImm(Imm);
+    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+  }
   case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
   case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
   case X86::CMOVE16rr:  case X86::CMOVE32rr:  case X86::CMOVE64rr:
@@ -2557,20 +3118,26 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
 bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                                          unsigned &SrcOpIdx2) const {
   switch (MI->getOpcode()) {
-    case X86::BLENDPDrri:
-    case X86::BLENDPSrri:
-    case X86::PBLENDWrri:
-    case X86::VBLENDPDrri:
-    case X86::VBLENDPSrri:
-    case X86::VBLENDPDYrri:
-    case X86::VBLENDPSYrri:
-    case X86::VPBLENDDrri:
-    case X86::VPBLENDDYrri:
-    case X86::VPBLENDWrri:
-    case X86::VPBLENDWYrri:
-      SrcOpIdx1 = 1;
-      SrcOpIdx2 = 2;
-      return true;
+    case X86::CMPPDrri:
+    case X86::CMPPSrri:
+    case X86::VCMPPDrri:
+    case X86::VCMPPSrri:
+    case X86::VCMPPDYrri:
+    case X86::VCMPPSYrri: {
+      // Float comparison can be safely commuted for
+      // Ordered/Unordered/Equal/NotEqual tests
+      unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+      switch (Imm) {
+      case 0x00: // EQUAL
+      case 0x03: // UNORDERED
+      case 0x04: // NOT EQUAL
+      case 0x07: // ORDERED
+        SrcOpIdx1 = 1;
+        SrcOpIdx2 = 2;
+        return true;
+      }
+      return false;
+    }
     case X86::VFMADDPDr231r:
     case X86::VFMADDPSr231r:
     case X86::VFMADDSDr231r:
@@ -2606,26 +3173,26 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
 static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
   switch (BrOpc) {
   default: return X86::COND_INVALID;
-  case X86::JE_4:  return X86::COND_E;
-  case X86::JNE_4: return X86::COND_NE;
-  case X86::JL_4:  return X86::COND_L;
-  case X86::JLE_4: return X86::COND_LE;
-  case X86::JG_4:  return X86::COND_G;
-  case X86::JGE_4: return X86::COND_GE;
-  case X86::JB_4:  return X86::COND_B;
-  case X86::JBE_4: return X86::COND_BE;
-  case X86::JA_4:  return X86::COND_A;
-  case X86::JAE_4: return X86::COND_AE;
-  case X86::JS_4:  return X86::COND_S;
-  case X86::JNS_4: return X86::COND_NS;
-  case X86::JP_4:  return X86::COND_P;
-  case X86::JNP_4: return X86::COND_NP;
-  case X86::JO_4:  return X86::COND_O;
-  case X86::JNO_4: return X86::COND_NO;
+  case X86::JE_1:  return X86::COND_E;
+  case X86::JNE_1: return X86::COND_NE;
+  case X86::JL_1:  return X86::COND_L;
+  case X86::JLE_1: return X86::COND_LE;
+  case X86::JG_1:  return X86::COND_G;
+  case X86::JGE_1: return X86::COND_GE;
+  case X86::JB_1:  return X86::COND_B;
+  case X86::JBE_1: return X86::COND_BE;
+  case X86::JA_1:  return X86::COND_A;
+  case X86::JAE_1: return X86::COND_AE;
+  case X86::JS_1:  return X86::COND_S;
+  case X86::JNS_1: return X86::COND_NS;
+  case X86::JP_1:  return X86::COND_P;
+  case X86::JNP_1: return X86::COND_NP;
+  case X86::JO_1:  return X86::COND_O;
+  case X86::JNO_1: return X86::COND_NO;
   }
 }
 
-/// getCondFromSETOpc - return condition code of a SET opcode.
+/// Return condition code of a SET opcode.
 static X86::CondCode getCondFromSETOpc(unsigned Opc) {
   switch (Opc) {
   default: return X86::COND_INVALID;
@@ -2648,7 +3215,7 @@ static X86::CondCode getCondFromSETOpc(unsigned Opc) {
   }
 }
 
-/// getCondFromCmovOpc - return condition code of a CMov opcode.
+/// Return condition code of a CMov opcode.
 X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
   switch (Opc) {
   default: return X86::COND_INVALID;
@@ -2706,26 +3273,26 @@ X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
 unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
   switch (CC) {
   default: llvm_unreachable("Illegal condition code!");
-  case X86::COND_E:  return X86::JE_4;
-  case X86::COND_NE: return X86::JNE_4;
-  case X86::COND_L:  return X86::JL_4;
-  case X86::COND_LE: return X86::JLE_4;
-  case X86::COND_G:  return X86::JG_4;
-  case X86::COND_GE: return X86::JGE_4;
-  case X86::COND_B:  return X86::JB_4;
-  case X86::COND_BE: return X86::JBE_4;
-  case X86::COND_A:  return X86::JA_4;
-  case X86::COND_AE: return X86::JAE_4;
-  case X86::COND_S:  return X86::JS_4;
-  case X86::COND_NS: return X86::JNS_4;
-  case X86::COND_P:  return X86::JP_4;
-  case X86::COND_NP: return X86::JNP_4;
-  case X86::COND_O:  return X86::JO_4;
-  case X86::COND_NO: return X86::JNO_4;
+  case X86::COND_E:  return X86::JE_1;
+  case X86::COND_NE: return X86::JNE_1;
+  case X86::COND_L:  return X86::JL_1;
+  case X86::COND_LE: return X86::JLE_1;
+  case X86::COND_G:  return X86::JG_1;
+  case X86::COND_GE: return X86::JGE_1;
+  case X86::COND_B:  return X86::JB_1;
+  case X86::COND_BE: return X86::JBE_1;
+  case X86::COND_A:  return X86::JA_1;
+  case X86::COND_AE: return X86::JAE_1;
+  case X86::COND_S:  return X86::JS_1;
+  case X86::COND_NS: return X86::JNS_1;
+  case X86::COND_P:  return X86::JP_1;
+  case X86::COND_NP: return X86::JNP_1;
+  case X86::COND_O:  return X86::JO_1;
+  case X86::COND_NO: return X86::JNO_1;
   }
 }
 
-/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// Return the inverse of the specified condition,
 /// e.g. turning COND_E to COND_NE.
 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
   switch (CC) {
@@ -2749,9 +3316,8 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
   }
 }
 
-/// getSwappedCondition - assume the flags are set by MI(a,b), return
-/// the condition code if we modify the instructions such that flags are
-/// set by MI(b,a).
+/// Assuming the flags are set by MI(a,b), return the condition code if we
+/// modify the instructions such that flags are set by MI(b,a).
 static X86::CondCode getSwappedCondition(X86::CondCode CC) {
   switch (CC) {
   default: return X86::COND_INVALID;
@@ -2768,7 +3334,7 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) {
   }
 }
 
-/// getSETFromCond - Return a set opcode for the given condition and
+/// Return a set opcode for the given condition and
 /// whether it has memory operand.
 unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
   static const uint16_t Opc[16][2] = {
@@ -2794,7 +3360,7 @@ unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
   return Opc[CC][HasMemoryOperand ? 1 : 0];
 }
 
-/// getCMovFromCond - Return a cmov opcode for the given condition,
+/// Return a cmov opcode for the given condition,
 /// register size in bytes, and operand type.
 unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
                               bool HasMemoryOperand) {
@@ -2879,7 +3445,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       return true;
 
     // Handle unconditional branches.
-    if (I->getOpcode() == X86::JMP_4) {
+    if (I->getOpcode() == X86::JMP_1) {
       UnCondBrIter = I;
 
       if (!AllowModify) {
@@ -2941,7 +3507,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
 
         BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
           .addMBB(UnCondBrIter->getOperand(0).getMBB());
-        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4))
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
           .addMBB(TargetBB);
 
         OldInst->eraseFromParent();
@@ -3006,7 +3572,7 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
     --I;
     if (I->isDebugValue())
       continue;
-    if (I->getOpcode() != X86::JMP_4 &&
+    if (I->getOpcode() != X86::JMP_1 &&
         getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
       break;
     // Remove the branch.
@@ -3031,7 +3597,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
   if (Cond.empty()) {
     // Unconditional branch?
     assert(!FBB && "Unconditional branch with multiple successors!");
-    BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
     return 1;
   }
 
@@ -3041,16 +3607,16 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
   switch (CC) {
   case X86::COND_NP_OR_E:
     // Synthesize NP_OR_E with two branches.
-    BuildMI(&MBB, DL, get(X86::JNP_4)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
     ++Count;
-    BuildMI(&MBB, DL, get(X86::JE_4)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB);
     ++Count;
     break;
   case X86::COND_NE_OR_P:
     // Synthesize NE_OR_P with two branches.
-    BuildMI(&MBB, DL, get(X86::JNE_4)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
     ++Count;
-    BuildMI(&MBB, DL, get(X86::JP_4)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
     ++Count;
     break;
   default: {
@@ -3061,7 +3627,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
   }
   if (FBB) {
     // Two-way Conditional branch. Insert the second branch.
-    BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(FBB);
+    BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
     ++Count;
   }
   return Count;
@@ -3117,7 +3683,7 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
    BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
 }
 
-/// isHReg - Test if the given register is a physical h register.
+/// Test if the given register is a physical h register.
 static bool isHReg(unsigned Reg) {
   return X86::GR8_ABCD_HRegClass.contains(Reg);
 }
@@ -3389,11 +3955,9 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
          "Stack slot too small for store");
   unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
-  bool isAligned = (MF.getTarget()
-                        .getSubtargetImpl()
-                        ->getFrameLowering()
-                        ->getStackAlignment() >= Alignment) ||
-                   RI.canRealignStack(MF);
+  bool isAligned =
+      (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+      RI.canRealignStack(MF);
   unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
   DebugLoc DL = MBB.findDebugLoc(MI);
   addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
@@ -3428,11 +3992,9 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                         const TargetRegisterInfo *TRI) const {
   const MachineFunction &MF = *MBB.getParent();
   unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
-  bool isAligned = (MF.getTarget()
-                        .getSubtargetImpl()
-                        ->getFrameLowering()
-                        ->getStackAlignment() >= Alignment) ||
-                   RI.canRealignStack(MF);
+  bool isAligned =
+      (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+      RI.canRealignStack(MF);
   unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
   DebugLoc DL = MBB.findDebugLoc(MI);
   addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
@@ -3528,7 +4090,7 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
   return false;
 }
 
-/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// Check whether the first instruction, whose only
 /// purpose is to update flags, can be made redundant.
 /// CMPrr can be made redundant by SUBrr if the operands are the same.
 /// This function can be extended later on.
@@ -3571,7 +4133,7 @@ inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
   return false;
 }
 
-/// isDefConvertible - check whether the definition can be converted
+/// Check whether the definition can be converted
 /// to remove a comparison against zero.
 inline static bool isDefConvertible(MachineInstr *MI) {
   switch (MI->getOpcode()) {
@@ -3601,14 +4163,12 @@ inline static bool isDefConvertible(MachineInstr *MI) {
   case X86::SUB16rr:   case X86::SUB8rr:   case X86::SUB64rm:
   case X86::SUB32rm:   case X86::SUB16rm:  case X86::SUB8rm:
   case X86::DEC64r:    case X86::DEC32r:   case X86::DEC16r: case X86::DEC8r:
-  case X86::DEC64_32r: case X86::DEC64_16r:
   case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
   case X86::ADD32ri8:  case X86::ADD16ri:  case X86::ADD16ri8:
   case X86::ADD8ri:    case X86::ADD64rr:  case X86::ADD32rr:
   case X86::ADD16rr:   case X86::ADD8rr:   case X86::ADD64rm:
   case X86::ADD32rm:   case X86::ADD16rm:  case X86::ADD8rm:
   case X86::INC64r:    case X86::INC32r:   case X86::INC16r: case X86::INC8r:
-  case X86::INC64_32r: case X86::INC64_16r:
   case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
   case X86::AND32ri8:  case X86::AND16ri:  case X86::AND16ri8:
   case X86::AND8ri:    case X86::AND64rr:  case X86::AND32rr:
@@ -3659,8 +4219,7 @@ inline static bool isDefConvertible(MachineInstr *MI) {
   }
 }
 
-/// isUseDefConvertible - check whether the use can be converted
-/// to remove a comparison against zero.
+/// Check whether the use can be converted to remove a comparison against zero.
 static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
   switch (MI->getOpcode()) {
   default: return X86::COND_INVALID;
@@ -3679,7 +4238,7 @@ static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
   }
 }
 
-/// optimizeCompareInstr - Check if there exists an earlier instruction that
+/// Check if there exists an earlier instruction that
 /// operates on the same source operands and sets flags in the same way as
 /// Compare; remove Compare if possible.
 bool X86InstrInfo::
@@ -3970,7 +4529,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
   return true;
 }
 
-/// optimizeLoadInstr - Try to remove the load by folding it to a register
+/// Try to remove the load by folding it to a register
 /// operand at the use. We fold the load instructions if load defines a virtual
 /// register, the virtual register is used once in the same BB, and the
 /// instructions in-between do not load or store, and have no side effects.
@@ -4025,9 +4584,9 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
   return nullptr;
 }
 
-/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
-/// instruction with two undef reads of the register being defined.  This is
-/// used for mapping:
+/// Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined.
+/// This is used for mapping:
 ///   %xmm4 = V_SET0
 /// to:
 ///   %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
@@ -4099,7 +4658,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   case X86::TEST8ri_NOREX:
     MI->setDesc(get(X86::TEST8ri));
     return true;
-  case X86::KSET0B: 
+  case X86::KSET0B:
   case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr));
   case X86::KSET1B:
   case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr));
@@ -4179,7 +4738,7 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
 
 MachineInstr*
 X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                    MachineInstr *MI, unsigned i,
+                                    MachineInstr *MI, unsigned OpNum,
                                     const SmallVectorImpl<MachineOperand> &MOs,
                                     unsigned Size, unsigned Align,
                                     bool AllowCommute) const {
@@ -4188,12 +4747,11 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   bool isCallRegIndirect = Subtarget.callRegIndirect();
   bool isTwoAddrFold = false;
 
-  // Atom favors register form of call. So, we do not fold loads into calls
-  // when X86Subtarget is Atom.
+  // For CPUs that favor the register form of a call,
+  // do not fold loads into calls.
   if (isCallRegIndirect &&
-    (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) {
+    (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r))
     return nullptr;
-  }
 
   unsigned NumOps = MI->getDesc().getNumOperands();
   bool isTwoAddr = NumOps > 1 &&
@@ -4209,13 +4767,13 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   // Folding a memory location into the two-address part of a two-address
   // instruction is different than folding it other places.  It requires
   // replacing the *two* registers with the memory location.
-  if (isTwoAddr && NumOps >= 2 && i < 2 &&
+  if (isTwoAddr && NumOps >= 2 && OpNum < 2 &&
       MI->getOperand(0).isReg() &&
       MI->getOperand(1).isReg() &&
       MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
     OpcodeTablePtr = &RegOp2MemOpTable2Addr;
     isTwoAddrFold = true;
-  } else if (i == 0) { // If operand 0
+  } else if (OpNum == 0) {
     if (MI->getOpcode() == X86::MOV32r0) {
       NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
       if (NewMI)
@@ -4223,12 +4781,14 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     }
 
     OpcodeTablePtr = &RegOp2MemOpTable0;
-  } else if (i == 1) {
+  } else if (OpNum == 1) {
     OpcodeTablePtr = &RegOp2MemOpTable1;
-  } else if (i == 2) {
+  } else if (OpNum == 2) {
     OpcodeTablePtr = &RegOp2MemOpTable2;
-  } else if (i == 3) {
+  } else if (OpNum == 3) {
     OpcodeTablePtr = &RegOp2MemOpTable3;
+  } else if (OpNum == 4) {
+    OpcodeTablePtr = &RegOp2MemOpTable4;
   }
 
   // If table selected...
@@ -4243,7 +4803,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
         return nullptr;
       bool NarrowToMOV32rm = false;
       if (Size) {
-        unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize();
+        unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
         if (Size < RCSize) {
           // Check if it's safe to fold the load. If the size of the object is
           // narrower than the load width, then it's not.
@@ -4262,7 +4822,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       if (isTwoAddrFold)
         NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
       else
-        NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this);
+        NewMI = FuseInst(MF, Opcode, OpNum, MOs, MI, *this);
 
       if (NarrowToMOV32rm) {
         // If this is the special case where we use a MOV32rm to load a 32-bit
@@ -4281,7 +4841,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   // If the instruction and target operand are commutable, commute the
   // instruction and try again.
   if (AllowCommute) {
-    unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2;
+    unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2;
     if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
       bool HasDef = MI->getDesc().getNumDefs();
       unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
@@ -4339,11 +4899,11 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
 
   // No fusion
   if (PrintFailedFusing && !MI->isCopy())
-    dbgs() << "We failed to fuse operand " << i << " in " << *MI;
+    dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI;
   return nullptr;
 }
 
-/// hasPartialRegUpdate - Return true for all instructions that only update
+/// Return true for all instructions that only update
 /// the first 32 or 64-bits of the destination register and leave the rest
 /// unmodified. This can be used to avoid folding loads if the instructions
 /// only update part of the destination register, and the non-updated part is
@@ -4362,30 +4922,50 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
 static bool hasPartialRegUpdate(unsigned Opcode) {
   switch (Opcode) {
   case X86::CVTSI2SSrr:
+  case X86::CVTSI2SSrm:
   case X86::CVTSI2SS64rr:
+  case X86::CVTSI2SS64rm:
   case X86::CVTSI2SDrr:
+  case X86::CVTSI2SDrm:
   case X86::CVTSI2SD64rr:
+  case X86::CVTSI2SD64rm:
   case X86::CVTSD2SSrr:
+  case X86::CVTSD2SSrm:
   case X86::Int_CVTSD2SSrr:
+  case X86::Int_CVTSD2SSrm:
   case X86::CVTSS2SDrr:
+  case X86::CVTSS2SDrm:
   case X86::Int_CVTSS2SDrr:
+  case X86::Int_CVTSS2SDrm:
   case X86::RCPSSr:
+  case X86::RCPSSm:
   case X86::RCPSSr_Int:
+  case X86::RCPSSm_Int:
   case X86::ROUNDSDr:
+  case X86::ROUNDSDm:
   case X86::ROUNDSDr_Int:
   case X86::ROUNDSSr:
+  case X86::ROUNDSSm:
   case X86::ROUNDSSr_Int:
   case X86::RSQRTSSr:
+  case X86::RSQRTSSm:
   case X86::RSQRTSSr_Int:
+  case X86::RSQRTSSm_Int:
   case X86::SQRTSSr:
+  case X86::SQRTSSm:
   case X86::SQRTSSr_Int:
+  case X86::SQRTSSm_Int:
+  case X86::SQRTSDr:
+  case X86::SQRTSDm:
+  case X86::SQRTSDr_Int:
+  case X86::SQRTSDm_Int:
     return true;
   }
 
   return false;
 }
 
-/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle
+/// Inform the ExeDepsFix pass how many idle
 /// instructions we would like before a partial register update.
 unsigned X86InstrInfo::
 getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
@@ -4415,28 +4995,52 @@ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
 static bool hasUndefRegUpdate(unsigned Opcode) {
   switch (Opcode) {
   case X86::VCVTSI2SSrr:
+  case X86::VCVTSI2SSrm:
   case X86::Int_VCVTSI2SSrr:
+  case X86::Int_VCVTSI2SSrm:
   case X86::VCVTSI2SS64rr:
+  case X86::VCVTSI2SS64rm:
   case X86::Int_VCVTSI2SS64rr:
+  case X86::Int_VCVTSI2SS64rm:
   case X86::VCVTSI2SDrr:
+  case X86::VCVTSI2SDrm:
   case X86::Int_VCVTSI2SDrr:
+  case X86::Int_VCVTSI2SDrm:
   case X86::VCVTSI2SD64rr:
+  case X86::VCVTSI2SD64rm:
   case X86::Int_VCVTSI2SD64rr:
+  case X86::Int_VCVTSI2SD64rm:
   case X86::VCVTSD2SSrr:
+  case X86::VCVTSD2SSrm:
   case X86::Int_VCVTSD2SSrr:
+  case X86::Int_VCVTSD2SSrm:
   case X86::VCVTSS2SDrr:
+  case X86::VCVTSS2SDrm:
   case X86::Int_VCVTSS2SDrr:
+  case X86::Int_VCVTSS2SDrm:
   case X86::VRCPSSr:
+  case X86::VRCPSSm:
+  case X86::VRCPSSm_Int:
   case X86::VROUNDSDr:
+  case X86::VROUNDSDm:
   case X86::VROUNDSDr_Int:
   case X86::VROUNDSSr:
+  case X86::VROUNDSSm:
   case X86::VROUNDSSr_Int:
   case X86::VRSQRTSSr:
+  case X86::VRSQRTSSm:
+  case X86::VRSQRTSSm_Int:
   case X86::VSQRTSSr:
-
-  // AVX-512
+  case X86::VSQRTSSm:
+  case X86::VSQRTSSm_Int:
+  case X86::VSQRTSDr:
+  case X86::VSQRTSDm:
+  case X86::VSQRTSDm_Int:
+    // AVX-512
   case X86::VCVTSD2SSZrr:
+  case X86::VCVTSD2SSZrm:
   case X86::VCVTSS2SDZrr:
+  case X86::VCVTSS2SDZrm:
     return true;
   }
 
@@ -4509,8 +5113,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
 
   // Unless optimizing for size, don't fold to avoid partial
   // register update stalls
-  if (!MF.getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+  if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
       hasPartialRegUpdate(MI->getOpcode()))
     return nullptr;
 
@@ -4520,10 +5123,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
   // If the function stack isn't realigned we don't want to fold instructions
   // that need increased alignment.
   if (!RI.needsStackRealignment(MF))
-    Alignment = std::min(Alignment, MF.getTarget()
-                                        .getSubtargetImpl()
-                                        ->getFrameLowering()
-                                        ->getStackAlignment());
+    Alignment =
+        std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment());
   if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
     unsigned NewOpc = 0;
     unsigned RCSize = 0;
@@ -4587,8 +5188,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
 
   // Unless optimizing for size, don't fold to avoid partial
   // register update stalls
-  if (!MF.getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+  if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
       hasPartialRegUpdate(MI->getOpcode()))
     return nullptr;
 
@@ -4743,7 +5343,7 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
                  std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
   if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
     OpcodeTablePtr = &RegOp2MemOpTable2Addr;
-  } else if (OpNum == 0) { // If operand 0
+  } else if (OpNum == 0) {
     if (Opc == X86::MOV32r0)
       return true;
 
@@ -4986,7 +5586,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
     NewNodes.push_back(Store);
 
     // Preserve memory reference information.
-    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+    cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
   }
 
   return true;
@@ -5181,26 +5781,26 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
   switch(Second->getOpcode()) {
   default:
     return false;
-  case X86::JE_4:
-  case X86::JNE_4:
-  case X86::JL_4:
-  case X86::JLE_4:
-  case X86::JG_4:
-  case X86::JGE_4:
+  case X86::JE_1:
+  case X86::JNE_1:
+  case X86::JL_1:
+  case X86::JLE_1:
+  case X86::JG_1:
+  case X86::JGE_1:
     FuseKind = FuseInc;
     break;
-  case X86::JB_4:
-  case X86::JBE_4:
-  case X86::JA_4:
-  case X86::JAE_4:
+  case X86::JB_1:
+  case X86::JBE_1:
+  case X86::JA_1:
+  case X86::JAE_1:
     FuseKind = FuseCmp;
     break;
-  case X86::JS_4:
-  case X86::JNS_4:
-  case X86::JP_4:
-  case X86::JNP_4:
-  case X86::JO_4:
-  case X86::JNO_4:
+  case X86::JS_1:
+  case X86::JNS_1:
+  case X86::JP_1:
+  case X86::JNP_1:
+  case X86::JO_1:
+  case X86::JNO_1:
     FuseKind = FuseTest;
     break;
   }
@@ -5313,14 +5913,10 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
     return FuseKind == FuseCmp || FuseKind == FuseInc;
   case X86::INC16r:
   case X86::INC32r:
-  case X86::INC64_16r:
-  case X86::INC64_32r:
   case X86::INC64r:
   case X86::INC8r:
   case X86::DEC16r:
   case X86::DEC32r:
-  case X86::DEC64_16r:
-  case X86::DEC64_32r:
   case X86::DEC64r:
   case X86::DEC8r:
     return FuseKind == FuseInc;
@@ -5345,7 +5941,7 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
            RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
 }
 
-/// getGlobalBaseReg - Return a virtual register initialized with the
+/// Return a virtual register initialized with the
 /// the global base register value. Output instructions required to
 /// initialize the register in the function entry block, if necessary.
 ///
@@ -5478,7 +6074,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
   MI->setDesc(get(table[Domain-1]));
 }
 
-/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+/// Return the noop instruction to use for a noop.
 void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
   NopInst.setOpcode(X86::NOOP);
 }
@@ -5489,7 +6085,7 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
 // getUnconditionalBranch and getTrap.
 void X86InstrInfo::getUnconditionalBranch(
     MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
-  Branch.setOpcode(X86::JMP_4);
+  Branch.setOpcode(X86::JMP_1);
   Branch.addOperand(MCOperand::CreateExpr(BranchTarget));
 }
 
@@ -5595,7 +6191,7 @@ hasHighOperandLatency(const InstrItineraryData *ItinData,
 }
 
 namespace {
-  /// CGBR - Create Global Base Reg pass. This initializes the PIC
+  /// Create Global Base Reg pass. This initializes the PIC
   /// global base register for x86-32.
   struct CGBR : public MachineFunctionPass {
     static char ID;
@@ -5604,10 +6200,11 @@ namespace {
     bool runOnMachineFunction(MachineFunction &MF) override {
       const X86TargetMachine *TM =
         static_cast<const X86TargetMachine *>(&MF.getTarget());
+      const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
 
       // Don't do anything if this is 64-bit as 64-bit PIC
       // uses RIP relative addressing.
-      if (TM->getSubtarget<X86Subtarget>().is64Bit())
+      if (STI.is64Bit())
         return false;
 
       // Only emit a global base reg in PIC mode.
@@ -5626,10 +6223,10 @@ namespace {
       MachineBasicBlock::iterator MBBI = FirstMBB.begin();
       DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
       MachineRegisterInfo &RegInfo = MF.getRegInfo();
-      const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
+      const X86InstrInfo *TII = STI.getInstrInfo();
 
       unsigned PC;
-      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
+      if (STI.isPICStyleGOT())
         PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
       else
         PC = GlobalBaseReg;
@@ -5640,7 +6237,7 @@ namespace {
 
       // If we're using vanilla 'GOT' PIC style, we should use relative addressing
       // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
-      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+      if (STI.isPICStyleGOT()) {
         // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
         BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
           .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
@@ -5721,10 +6318,9 @@ namespace {
     MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
                                          unsigned TLSBaseAddrReg) {
       MachineFunction *MF = I->getParent()->getParent();
-      const X86TargetMachine *TM =
-          static_cast<const X86TargetMachine *>(&MF->getTarget());
-      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
-      const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
+      const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+      const bool is64Bit = STI.is64Bit();
+      const X86InstrInfo *TII = STI.getInstrInfo();
 
       // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
       MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
@@ -5742,10 +6338,9 @@ namespace {
     // inserting a copy instruction after I. Returns the new instruction.
     MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
       MachineFunction *MF = I->getParent()->getParent();
-      const X86TargetMachine *TM =
-          static_cast<const X86TargetMachine *>(&MF->getTarget());
-      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
-      const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
+      const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+      const bool is64Bit = STI.is64Bit();
+      const X86InstrInfo *TII = STI.getInstrInfo();
 
       // Create a virtual register for the TLS base address.
       MachineRegisterInfo &RegInfo = MF->getRegInfo();
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 57b1958..4d15467 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -152,6 +152,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
   RegOp2MemOpTableType RegOp2MemOpTable1;
   RegOp2MemOpTableType RegOp2MemOpTable2;
   RegOp2MemOpTableType RegOp2MemOpTable3;
+  RegOp2MemOpTableType RegOp2MemOpTable4;
 
   /// MemOp2RegOpTable - Load / store unfolding opcode map.
   ///
@@ -174,6 +175,11 @@ public:
   ///
   const X86RegisterInfo &getRegisterInfo() const { return RI; }
 
+  /// getSPAdjust - This returns the stack pointer adjustment made by
+  /// this instruction. For x86, we need to handle more complex call
+  /// sequences involving PUSHes.
+  int getSPAdjust(const MachineInstr *MI) const override;
+
   /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
   /// extension instruction. That is, it's like a copy where it's legal for the
   /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 3dbf819..9881caf 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -32,7 +32,8 @@ def SDTX86Cmov    : SDTypeProfile<1, 4,
 
 // Unary and binary operator instructions that set EFLAGS as a side-effect.
 def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
-                                           [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+                                           [SDTCisSameAs<0, 2>,
+                                            SDTCisInt<0>, SDTCisVT<1, i32>]>;
 
 def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
                                             [SDTCisSameAs<0, 2>,
@@ -188,11 +189,15 @@ def X86rdtsc   : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
 def X86rdtscp  : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
                         [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
 def X86rdpmc   : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void,
-                        [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; 
+                        [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
 
 def X86Wrapper    : SDNode<"X86ISD::Wrapper",     SDTX86Wrapper>;
 def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP",  SDTX86Wrapper>;
 
+def X86RecoverFrameAlloc : SDNode<"ISD::FRAME_ALLOC_RECOVER",
+                                  SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+                                                       SDTCisInt<1>]>>;
+
 def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 
@@ -261,121 +266,75 @@ def ptr_rc_nosp : PointerLikeRegClass<1>;
 def X86MemAsmOperand : AsmOperandClass {
  let Name = "Mem";
 }
-def X86Mem8AsmOperand : AsmOperandClass {
-  let Name = "Mem8"; let RenderMethod = "addMemOperands";
-}
-def X86Mem16AsmOperand : AsmOperandClass {
-  let Name = "Mem16"; let RenderMethod = "addMemOperands";
-}
-def X86Mem32AsmOperand : AsmOperandClass {
-  let Name = "Mem32"; let RenderMethod = "addMemOperands";
-}
-def X86Mem64AsmOperand : AsmOperandClass {
-  let Name = "Mem64"; let RenderMethod = "addMemOperands";
-}
-def X86Mem80AsmOperand : AsmOperandClass {
-  let Name = "Mem80"; let RenderMethod = "addMemOperands";
-}
-def X86Mem128AsmOperand : AsmOperandClass {
-  let Name = "Mem128"; let RenderMethod = "addMemOperands";
-}
-def X86Mem256AsmOperand : AsmOperandClass {
-  let Name = "Mem256"; let RenderMethod = "addMemOperands";
-}
-def X86Mem512AsmOperand : AsmOperandClass {
-  let Name = "Mem512"; let RenderMethod = "addMemOperands";
-}
-
-// Gather mem operands
-def X86MemVX32Operand : AsmOperandClass {
-  let Name = "MemVX32"; let RenderMethod = "addMemOperands";
-}
-def X86MemVY32Operand : AsmOperandClass {
-  let Name = "MemVY32"; let RenderMethod = "addMemOperands";
-}
-def X86MemVZ32Operand : AsmOperandClass {
-  let Name = "MemVZ32"; let RenderMethod = "addMemOperands";
-}
-def X86MemVX64Operand : AsmOperandClass {
-  let Name = "MemVX64"; let RenderMethod = "addMemOperands";
-}
-def X86MemVY64Operand : AsmOperandClass {
-  let Name = "MemVY64"; let RenderMethod = "addMemOperands";
-}
-def X86MemVZ64Operand : AsmOperandClass {
-  let Name = "MemVZ64"; let RenderMethod = "addMemOperands";
+let RenderMethod = "addMemOperands" in {
+  def X86Mem8AsmOperand   : AsmOperandClass { let Name = "Mem8"; }
+  def X86Mem16AsmOperand  : AsmOperandClass { let Name = "Mem16"; }
+  def X86Mem32AsmOperand  : AsmOperandClass { let Name = "Mem32"; }
+  def X86Mem64AsmOperand  : AsmOperandClass { let Name = "Mem64"; }
+  def X86Mem80AsmOperand  : AsmOperandClass { let Name = "Mem80"; }
+  def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; }
+  def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; }
+  def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; }
+  // Gather mem operands
+  def X86MemVX32Operand : AsmOperandClass { let Name = "MemVX32"; }
+  def X86MemVY32Operand : AsmOperandClass { let Name = "MemVY32"; }
+  def X86MemVZ32Operand : AsmOperandClass { let Name = "MemVZ32"; }
+  def X86MemVX64Operand : AsmOperandClass { let Name = "MemVX64"; }
+  def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; }
+  def X86MemVZ64Operand : AsmOperandClass { let Name = "MemVZ64"; }
 }
 
 def X86AbsMemAsmOperand : AsmOperandClass {
   let Name = "AbsMem";
   let SuperClasses = [X86MemAsmOperand];
 }
-class X86MemOperand<string printMethod> : Operand<iPTR> {
+
+class X86MemOperand<string printMethod,
+          AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> {
   let PrintMethod = printMethod;
   let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
-  let ParserMatchClass = X86MemAsmOperand;
+  let ParserMatchClass = parserMatchClass;
+  let OperandType = "OPERAND_MEMORY";
 }
 
-let OperandType = "OPERAND_MEMORY" in {
+// Gather mem operands
+class X86VMemOperand<RegisterClass RC, string printMethod,
+                     AsmOperandClass parserMatchClass>
+    : X86MemOperand<printMethod, parserMatchClass> {
+  let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, i8imm);
+}
+
+def anymem : X86MemOperand<"printanymem">;
+
 def opaque32mem : X86MemOperand<"printopaquemem">;
 def opaque48mem : X86MemOperand<"printopaquemem">;
 def opaque80mem : X86MemOperand<"printopaquemem">;
 def opaque512mem : X86MemOperand<"printopaquemem">;
 
-def i8mem   : X86MemOperand<"printi8mem"> {
-  let ParserMatchClass = X86Mem8AsmOperand; }
-def i16mem  : X86MemOperand<"printi16mem"> {
-  let ParserMatchClass = X86Mem16AsmOperand; }
-def i32mem  : X86MemOperand<"printi32mem"> {
-  let ParserMatchClass = X86Mem32AsmOperand; }
-def i64mem  : X86MemOperand<"printi64mem"> {
-  let ParserMatchClass = X86Mem64AsmOperand; }
-def i128mem : X86MemOperand<"printi128mem"> {
-  let ParserMatchClass = X86Mem128AsmOperand; }
-def i256mem : X86MemOperand<"printi256mem"> {
-  let ParserMatchClass = X86Mem256AsmOperand; }
-def i512mem : X86MemOperand<"printi512mem"> {
-  let ParserMatchClass = X86Mem512AsmOperand; }
-def f32mem  : X86MemOperand<"printf32mem"> {
-  let ParserMatchClass = X86Mem32AsmOperand; }
-def f64mem  : X86MemOperand<"printf64mem"> {
-  let ParserMatchClass = X86Mem64AsmOperand; }
-def f80mem  : X86MemOperand<"printf80mem"> {
-  let ParserMatchClass = X86Mem80AsmOperand; }
-def f128mem : X86MemOperand<"printf128mem"> {
-  let ParserMatchClass = X86Mem128AsmOperand; }
-def f256mem : X86MemOperand<"printf256mem">{
-  let ParserMatchClass = X86Mem256AsmOperand; }
-def f512mem : X86MemOperand<"printf512mem">{
-  let ParserMatchClass = X86Mem512AsmOperand; }
-def v512mem : Operand<iPTR> {
-  let PrintMethod = "printf512mem";
-  let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm);
-  let ParserMatchClass = X86Mem512AsmOperand; }
+def i8mem   : X86MemOperand<"printi8mem",   X86Mem8AsmOperand>;
+def i16mem  : X86MemOperand<"printi16mem",  X86Mem16AsmOperand>;
+def i32mem  : X86MemOperand<"printi32mem",  X86Mem32AsmOperand>;
+def i64mem  : X86MemOperand<"printi64mem",  X86Mem64AsmOperand>;
+def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>;
+def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>;
+def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>;
+def f32mem  : X86MemOperand<"printf32mem",  X86Mem32AsmOperand>;
+def f64mem  : X86MemOperand<"printf64mem",  X86Mem64AsmOperand>;
+def f80mem  : X86MemOperand<"printf80mem",  X86Mem80AsmOperand>;
+def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>;
+def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>;
+def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>;
+
+def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>;
 
 // Gather mem operands
-def vx32mem : X86MemOperand<"printi32mem">{
-  let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
-  let ParserMatchClass = X86MemVX32Operand; }
-def vy32mem : X86MemOperand<"printi32mem">{
-  let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
-  let ParserMatchClass = X86MemVY32Operand; }
-def vx64mem : X86MemOperand<"printi64mem">{
-  let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
-  let ParserMatchClass = X86MemVX64Operand; }
-def vy64mem : X86MemOperand<"printi64mem">{
-  let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
-  let ParserMatchClass = X86MemVY64Operand; }
-def vy64xmem : X86MemOperand<"printi64mem">{
-  let MIOperandInfo = (ops ptr_rc, i8imm, VR256X, i32imm, i8imm);
-  let ParserMatchClass = X86MemVY64Operand; }
-def vz32mem : X86MemOperand<"printi32mem">{
-  let MIOperandInfo = (ops ptr_rc, i16imm, VR512, i32imm, i8imm);
-  let ParserMatchClass = X86MemVZ32Operand; }
-def vz64mem : X86MemOperand<"printi64mem">{
-  let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm);
-  let ParserMatchClass = X86MemVZ64Operand; }
-}
+def vx32mem  : X86VMemOperand<VR128,  "printi32mem", X86MemVX32Operand>;
+def vy32mem  : X86VMemOperand<VR256,  "printi32mem", X86MemVY32Operand>;
+def vx64mem  : X86VMemOperand<VR128,  "printi64mem", X86MemVX64Operand>;
+def vy64mem  : X86VMemOperand<VR256,  "printi64mem", X86MemVY64Operand>;
+def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64Operand>;
+def vz32mem  : X86VMemOperand<VR512,  "printi32mem", X86MemVZ32Operand>;
+def vz64mem  : X86VMemOperand<VR512,  "printi64mem", X86MemVZ64Operand>;
 
 // A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
 // plain GR64, so that it doesn't potentially require a REX prefix.
@@ -424,125 +383,180 @@ def brtarget8 : Operand<OtherVT>;
 
 }
 
-def X86SrcIdx8Operand : AsmOperandClass {
-  let Name = "SrcIdx8";
-  let RenderMethod = "addSrcIdxOperands";
-  let SuperClasses = [X86Mem8AsmOperand];
-}
-def X86SrcIdx16Operand : AsmOperandClass {
-  let Name = "SrcIdx16";
-  let RenderMethod = "addSrcIdxOperands";
-  let SuperClasses = [X86Mem16AsmOperand];
-}
-def X86SrcIdx32Operand : AsmOperandClass {
-  let Name = "SrcIdx32";
-  let RenderMethod = "addSrcIdxOperands";
-  let SuperClasses = [X86Mem32AsmOperand];
-}
-def X86SrcIdx64Operand : AsmOperandClass {
-  let Name = "SrcIdx64";
-  let RenderMethod = "addSrcIdxOperands";
-  let SuperClasses = [X86Mem64AsmOperand];
-}
-def X86DstIdx8Operand : AsmOperandClass {
-  let Name = "DstIdx8";
-  let RenderMethod = "addDstIdxOperands";
-  let SuperClasses = [X86Mem8AsmOperand];
-}
-def X86DstIdx16Operand : AsmOperandClass {
-  let Name = "DstIdx16";
-  let RenderMethod = "addDstIdxOperands";
-  let SuperClasses = [X86Mem16AsmOperand];
-}
-def X86DstIdx32Operand : AsmOperandClass {
-  let Name = "DstIdx32";
-  let RenderMethod = "addDstIdxOperands";
-  let SuperClasses = [X86Mem32AsmOperand];
-}
-def X86DstIdx64Operand : AsmOperandClass {
-  let Name = "DstIdx64";
-  let RenderMethod = "addDstIdxOperands";
-  let SuperClasses = [X86Mem64AsmOperand];
-}
-def X86MemOffs8AsmOperand : AsmOperandClass {
-  let Name = "MemOffs8";
-  let RenderMethod = "addMemOffsOperands";
-  let SuperClasses = [X86Mem8AsmOperand];
-}
-def X86MemOffs16AsmOperand : AsmOperandClass {
-  let Name = "MemOffs16";
-  let RenderMethod = "addMemOffsOperands";
-  let SuperClasses = [X86Mem16AsmOperand];
-}
-def X86MemOffs32AsmOperand : AsmOperandClass {
-  let Name = "MemOffs32";
-  let RenderMethod = "addMemOffsOperands";
-  let SuperClasses = [X86Mem32AsmOperand];
-}
-def X86MemOffs64AsmOperand : AsmOperandClass {
-  let Name = "MemOffs64";
-  let RenderMethod = "addMemOffsOperands";
-  let SuperClasses = [X86Mem64AsmOperand];
-}
-let OperandType = "OPERAND_MEMORY" in {
-def srcidx8 : Operand<iPTR> {
-  let ParserMatchClass = X86SrcIdx8Operand;
-  let MIOperandInfo = (ops ptr_rc, i8imm);
-  let PrintMethod = "printSrcIdx8"; }
-def srcidx16 : Operand<iPTR> {
-  let ParserMatchClass = X86SrcIdx16Operand;
-  let MIOperandInfo = (ops ptr_rc, i8imm);
-  let PrintMethod = "printSrcIdx16"; }
-def srcidx32 : Operand<iPTR> {
-  let ParserMatchClass = X86SrcIdx32Operand;
-  let MIOperandInfo = (ops ptr_rc, i8imm);
-  let PrintMethod = "printSrcIdx32"; }
-def srcidx64 : Operand<iPTR> {
-  let ParserMatchClass = X86SrcIdx64Operand;
+// Special parser to detect 16-bit mode to select 16-bit displacement.
+def X86AbsMem16AsmOperand : AsmOperandClass {
+  let Name = "AbsMem16";
+  let RenderMethod = "addAbsMemOperands";
+  let SuperClasses = [X86AbsMemAsmOperand];
+}
+
+// Branch targets have OtherVT type and print as pc-relative values.
+let OperandType = "OPERAND_PCREL",
+    PrintMethod = "printPCRelImm" in {
+let ParserMatchClass = X86AbsMem16AsmOperand in
+  def brtarget16 : Operand<OtherVT>;
+let ParserMatchClass = X86AbsMemAsmOperand in
+  def brtarget32 : Operand<OtherVT>;
+}
+
+let RenderMethod = "addSrcIdxOperands" in {
+  def X86SrcIdx8Operand : AsmOperandClass {
+    let Name = "SrcIdx8";
+    let SuperClasses = [X86Mem8AsmOperand];
+  }
+  def X86SrcIdx16Operand : AsmOperandClass {
+    let Name = "SrcIdx16";
+    let SuperClasses = [X86Mem16AsmOperand];
+  }
+  def X86SrcIdx32Operand : AsmOperandClass {
+    let Name = "SrcIdx32";
+    let SuperClasses = [X86Mem32AsmOperand];
+  }
+  def X86SrcIdx64Operand : AsmOperandClass {
+    let Name = "SrcIdx64";
+    let SuperClasses = [X86Mem64AsmOperand];
+  }
+} // RenderMethod = "addSrcIdxOperands"
+
+let RenderMethod = "addDstIdxOperands" in {
+ def X86DstIdx8Operand : AsmOperandClass {
+   let Name = "DstIdx8";
+   let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86DstIdx16Operand : AsmOperandClass {
+   let Name = "DstIdx16";
+   let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86DstIdx32Operand : AsmOperandClass {
+   let Name = "DstIdx32";
+   let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86DstIdx64Operand : AsmOperandClass {
+   let Name = "DstIdx64";
+   let SuperClasses = [X86Mem64AsmOperand];
+ }
+} // RenderMethod = "addDstIdxOperands"
+
+let RenderMethod = "addMemOffsOperands" in {
+  def X86MemOffs16_8AsmOperand : AsmOperandClass {
+    let Name = "MemOffs16_8";
+    let SuperClasses = [X86Mem8AsmOperand];
+  }
+  def X86MemOffs16_16AsmOperand : AsmOperandClass {
+    let Name = "MemOffs16_16";
+    let SuperClasses = [X86Mem16AsmOperand];
+  }
+  def X86MemOffs16_32AsmOperand : AsmOperandClass {
+    let Name = "MemOffs16_32";
+    let SuperClasses = [X86Mem32AsmOperand];
+  }
+  def X86MemOffs32_8AsmOperand : AsmOperandClass {
+    let Name = "MemOffs32_8";
+    let SuperClasses = [X86Mem8AsmOperand];
+  }
+  def X86MemOffs32_16AsmOperand : AsmOperandClass {
+    let Name = "MemOffs32_16";
+    let SuperClasses = [X86Mem16AsmOperand];
+  }
+  def X86MemOffs32_32AsmOperand : AsmOperandClass {
+    let Name = "MemOffs32_32";
+    let SuperClasses = [X86Mem32AsmOperand];
+  }
+  def X86MemOffs32_64AsmOperand : AsmOperandClass {
+    let Name = "MemOffs32_64";
+    let SuperClasses = [X86Mem64AsmOperand];
+  }
+  def X86MemOffs64_8AsmOperand : AsmOperandClass {
+    let Name = "MemOffs64_8";
+    let SuperClasses = [X86Mem8AsmOperand];
+  }
+  def X86MemOffs64_16AsmOperand : AsmOperandClass {
+    let Name = "MemOffs64_16";
+    let SuperClasses = [X86Mem16AsmOperand];
+  }
+  def X86MemOffs64_32AsmOperand : AsmOperandClass {
+    let Name = "MemOffs64_32";
+    let SuperClasses = [X86Mem32AsmOperand];
+  }
+  def X86MemOffs64_64AsmOperand : AsmOperandClass {
+    let Name = "MemOffs64_64";
+    let SuperClasses = [X86Mem64AsmOperand];
+  }
+} // RenderMethod = "addMemOffsOperands"
+
+class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
+    : X86MemOperand<printMethod, parserMatchClass> {
   let MIOperandInfo = (ops ptr_rc, i8imm);
-  let PrintMethod = "printSrcIdx64"; }
-def dstidx8 : Operand<iPTR> {
-  let ParserMatchClass = X86DstIdx8Operand;
-  let MIOperandInfo = (ops ptr_rc);
-  let PrintMethod = "printDstIdx8"; }
-def dstidx16 : Operand<iPTR> {
-  let ParserMatchClass = X86DstIdx16Operand;
-  let MIOperandInfo = (ops ptr_rc);
-  let PrintMethod = "printDstIdx16"; }
-def dstidx32 : Operand<iPTR> {
-  let ParserMatchClass = X86DstIdx32Operand;
-  let MIOperandInfo = (ops ptr_rc);
-  let PrintMethod = "printDstIdx32"; }
-def dstidx64 : Operand<iPTR> {
-  let ParserMatchClass = X86DstIdx64Operand;
+}
+
+class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
+    : X86MemOperand<printMethod, parserMatchClass> {
   let MIOperandInfo = (ops ptr_rc);
-  let PrintMethod = "printDstIdx64"; }
-def offset8 : Operand<iPTR> {
-  let ParserMatchClass = X86MemOffs8AsmOperand;
-  let MIOperandInfo = (ops i64imm, i8imm);
-  let PrintMethod = "printMemOffs8"; }
-def offset16 : Operand<iPTR> {
-  let ParserMatchClass = X86MemOffs16AsmOperand;
-  let MIOperandInfo = (ops i64imm, i8imm);
-  let PrintMethod = "printMemOffs16"; }
-def offset32 : Operand<iPTR> {
-  let ParserMatchClass = X86MemOffs32AsmOperand;
-  let MIOperandInfo = (ops i64imm, i8imm);
-  let PrintMethod = "printMemOffs32"; }
-def offset64 : Operand<iPTR> {
-  let ParserMatchClass = X86MemOffs64AsmOperand;
-  let MIOperandInfo = (ops i64imm, i8imm);
-  let PrintMethod = "printMemOffs64"; }
 }
 
+def srcidx8  : X86SrcIdxOperand<"printSrcIdx8",  X86SrcIdx8Operand>;
+def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>;
+def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>;
+def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>;
+def dstidx8  : X86DstIdxOperand<"printDstIdx8",  X86DstIdx8Operand>;
+def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>;
+def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>;
+def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>;
+
+class X86MemOffsOperand<Operand immOperand, string printMethod,
+                        AsmOperandClass parserMatchClass>
+    : X86MemOperand<printMethod, parserMatchClass> {
+  let MIOperandInfo = (ops immOperand, i8imm);
+}
+
+def offset16_8  : X86MemOffsOperand<i16imm, "printMemOffs8",
+                                    X86MemOffs16_8AsmOperand>;
+def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16",
+                                    X86MemOffs16_16AsmOperand>;
+def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32",
+                                    X86MemOffs16_32AsmOperand>;
+def offset32_8  : X86MemOffsOperand<i32imm, "printMemOffs8",
+                                    X86MemOffs32_8AsmOperand>;
+def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16",
+                                    X86MemOffs32_16AsmOperand>;
+def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32",
+                                    X86MemOffs32_32AsmOperand>;
+def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64",
+                                    X86MemOffs32_64AsmOperand>;
+def offset64_8  : X86MemOffsOperand<i64imm, "printMemOffs8",
+                                    X86MemOffs64_8AsmOperand>;
+def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16",
+                                    X86MemOffs64_16AsmOperand>;
+def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32",
+                                    X86MemOffs64_32AsmOperand>;
+def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64",
+                                    X86MemOffs64_64AsmOperand>;
 
 def SSECC : Operand<i8> {
-  let PrintMethod = "printSSECC";
+  let PrintMethod = "printSSEAVXCC";
   let OperandType = "OPERAND_IMMEDIATE";
 }
 
+def i8immZExt3 : ImmLeaf<i8, [{
+  return Imm >= 0 && Imm < 8;
+}]>;
+
 def AVXCC : Operand<i8> {
-  let PrintMethod = "printAVXCC";
+  let PrintMethod = "printSSEAVXCC";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def i8immZExt5 : ImmLeaf<i8, [{
+  return Imm >= 0 && Imm < 32;
+}]>;
+
+def AVX512ICC : Operand<i8> {
+  let PrintMethod = "printSSEAVXCC";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def XOPCC : Operand<i8> {
+  let PrintMethod = "printXOPCC";
   let OperandType = "OPERAND_IMMEDIATE";
 }
 
@@ -599,6 +613,14 @@ def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
                       ImmSExti64i32AsmOperand];
 }
 
+// Unsigned immediate used by SSE/AVX instructions
+// [0, 0xFF]
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmUnsignedi8AsmOperand : AsmOperandClass {
+  let Name = "ImmUnsignedi8";
+  let RenderMethod = "addImmOperands";
+}
+
 // A couple of more descriptive operand definitions.
 // 16-bits but only 8 bits are significant.
 def i16i8imm  : Operand<i16> {
@@ -617,6 +639,27 @@ def i64i32imm  : Operand<i64> {
   let OperandType = "OPERAND_IMMEDIATE";
 }
 
+// 64-bits but only 8 bits are significant.
+def i64i8imm   : Operand<i64> {
+  let ParserMatchClass = ImmSExti64i8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// Unsigned 8-bit immediate used by SSE/AVX instructions.
+def u8imm : Operand<i8> {
+  let PrintMethod = "printU8Imm";
+  let ParserMatchClass = ImmUnsignedi8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 32-bit immediate but only 8-bits are significant and they are unsigned.
+// Used by some SSE/AVX instructions that use intrinsics.
+def i32u8imm : Operand<i32> {
+  let PrintMethod = "printU8Imm";
+  let ParserMatchClass = ImmUnsignedi8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
 // 64-bits but only 32 bits are significant, and those bits are treated as being
 // pc relative.
 def i64i32imm_pcrel : Operand<i64> {
@@ -625,21 +668,15 @@ def i64i32imm_pcrel : Operand<i64> {
   let OperandType = "OPERAND_PCREL";
 }
 
-// 64-bits but only 8 bits are significant.
-def i64i8imm   : Operand<i64> {
-  let ParserMatchClass = ImmSExti64i8AsmOperand;
-  let OperandType = "OPERAND_IMMEDIATE";
-}
-
 def lea64_32mem : Operand<i32> {
-  let PrintMethod = "printi32mem";
+  let PrintMethod = "printanymem";
   let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
   let ParserMatchClass = X86MemAsmOperand;
 }
 
 // Memory operands that use 64-bit pointers in both ILP32 and LP64.
 def lea64mem : Operand<i64> {
-  let PrintMethod = "printi64mem";
+  let PrintMethod = "printanymem";
   let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
   let ParserMatchClass = X86MemAsmOperand;
 }
@@ -676,6 +713,9 @@ def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
 def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
                                [tglobaltlsaddr], []>;
 
+def vectoraddr : ComplexPattern<iPTR, 5, "SelectAddr", [],[SDNPWantParent]>;
+//def vectoraddr : ComplexPattern<iPTR, 5, "SelectVectorAddr", [],[SDNPWantParent]>;
+
 //===----------------------------------------------------------------------===//
 // X86 Instruction Predicate Definitions.
 def HasCMov      : Predicate<"Subtarget->hasCMov()">;
@@ -706,14 +746,19 @@ def HasAVX512    : Predicate<"Subtarget->hasAVX512()">,
 def UseAVX       : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
 def UseAVX2      : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
 def NoAVX512     : Predicate<"!Subtarget->hasAVX512()">;
-def HasCDI       : Predicate<"Subtarget->hasCDI()">;
-def HasPFI       : Predicate<"Subtarget->hasPFI()">;
-def HasERI       : Predicate<"Subtarget->hasERI()">;
-def HasDQI       : Predicate<"Subtarget->hasDQI()">;
+def HasCDI       : Predicate<"Subtarget->hasCDI()">,
+                     AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">;
+def HasPFI       : Predicate<"Subtarget->hasPFI()">,
+                     AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">;
+def HasERI       : Predicate<"Subtarget->hasERI()">,
+                     AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">;
+def HasDQI       : Predicate<"Subtarget->hasDQI()">,
+                     AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">;
 def NoDQI        : Predicate<"!Subtarget->hasDQI()">;
-def HasBWI       : Predicate<"Subtarget->hasBWI()">;
+def HasBWI       : Predicate<"Subtarget->hasBWI()">,
+                     AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">;
 def HasVLX       : Predicate<"Subtarget->hasVLX()">,
-                     AssemblerPredicate<"FeatureVLX", "AVX-512 VLX ISA">;
+                     AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">;
 def NoVLX        : Predicate<"!Subtarget->hasVLX()">;
 
 def HasPOPCNT    : Predicate<"Subtarget->hasPOPCNT()">;
@@ -736,10 +781,8 @@ def HasHLE       : Predicate<"Subtarget->hasHLE()">;
 def HasTSX       : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
 def HasADX       : Predicate<"Subtarget->hasADX()">;
 def HasSHA       : Predicate<"Subtarget->hasSHA()">;
-def HasSGX       : Predicate<"Subtarget->hasSGX()">;
 def HasPRFCHW    : Predicate<"Subtarget->hasPRFCHW()">;
 def HasRDSEED    : Predicate<"Subtarget->hasRDSEED()">;
-def HasSMAP      : Predicate<"Subtarget->hasSMAP()">;
 def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
 def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
 def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
@@ -757,6 +800,9 @@ def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
 def In32BitMode  : Predicate<"Subtarget->is32Bit()">,
                              AssemblerPredicate<"Mode32Bit", "32-bit mode">;
 def IsWin64      : Predicate<"Subtarget->isTargetWin64()">;
+def NotWin64     : Predicate<"!Subtarget->isTargetWin64()">;
+def IsPS4        : Predicate<"Subtarget->isTargetPS4()">;
+def NotPS4       : Predicate<"!Subtarget->isTargetPS4()">;
 def IsNaCl       : Predicate<"Subtarget->isTargetNaCl()">;
 def NotNaCl      : Predicate<"!Subtarget->isTargetNaCl()">;
 def SmallCode    : Predicate<"TM.getCodeModel() == CodeModel::Small">;
@@ -773,6 +819,7 @@ def FastBTMem    : Predicate<"!Subtarget->isBTMemSlow()">;
 def CallImmAddr  : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
 def FavorMemIndirectCall  : Predicate<"!Subtarget->callRegIndirect()">;
 def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
+def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
 
 //===----------------------------------------------------------------------===//
 // X86 Instruction Format Definitions.
@@ -803,6 +850,11 @@ def X86_COND_O   : PatLeaf<(i8 13)>;
 def X86_COND_P   : PatLeaf<(i8 14)>; // alt. COND_PE
 def X86_COND_S   : PatLeaf<(i8 15)>;
 
+// Predicate used to help when pattern matching LZCNT/TZCNT.
+def X86_COND_E_OR_NE : ImmLeaf<i8, [{
+  return (Imm == X86::COND_E) || (Imm == X86::COND_NE);
+}]>;
+
 let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs.
   def i16immSExt8  : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>;
   def i32immSExt8  : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>;
@@ -905,7 +957,7 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
 //
 
 // Nop
-let neverHasSideEffects = 1, SchedRW = [WriteZero] in {
+let hasSideEffects = 0, SchedRW = [WriteZero] in {
   def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
   def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero),
                 "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
@@ -919,12 +971,12 @@ def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
                  "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>;
 
 let SchedRW = [WriteALU] in {
-let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in
 def LEAVE    : I<0xC9, RawFrm,
                  (outs), (ins), "leave", [], IIC_LEAVE>,
                  Requires<[Not64BitMode]>;
 
-let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in
 def LEAVE64  : I<0xC9, RawFrm,
                  (outs), (ins), "leave", [], IIC_LEAVE>,
                  Requires<[In64BitMode]>;
@@ -934,7 +986,7 @@ def LEAVE64  : I<0xC9, RawFrm,
 //  Miscellaneous Instructions.
 //
 
-let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
+let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in {
 let mayLoad = 1, SchedRW = [WriteLoad] in {
 def POP16r  : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
                 IIC_POP_REG16>, OpSize16;
@@ -948,11 +1000,6 @@ def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
                 IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
 def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
                 IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>;
-
-def POPF16   : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>,
-                OpSize16;
-def POPF32   : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
-                OpSize32, Requires<[Not64BitMode]>;
 } // mayLoad, SchedRW
 
 let mayStore = 1, SchedRW = [WriteStore] in {
@@ -981,16 +1028,26 @@ def PUSHi16  : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
 def PUSHi32  : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
                    "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
                    Requires<[Not64BitMode]>;
+} // mayStore, SchedRW
+}
+
+let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0,
+    SchedRW = [WriteLoad] in {
+def POPF16   : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>,
+                OpSize16;
+def POPF32   : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
+                OpSize32, Requires<[Not64BitMode]>;
+}
 
+let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0,
+    SchedRW = [WriteStore] in {
 def PUSHF16  : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
                  OpSize16;
 def PUSHF32  : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
                OpSize32, Requires<[Not64BitMode]>;
-
-} // mayStore, SchedRW
 }
 
-let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
+let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in {
 let mayLoad = 1, SchedRW = [WriteLoad] in {
 def POP64r   : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
                  IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
@@ -1009,7 +1066,7 @@ def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
 } // mayStore, SchedRW
 }
 
-let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1,
+let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1,
     SchedRW = [WriteStore] in {
 def PUSH64i8   : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
                     "push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>;
@@ -1021,22 +1078,22 @@ def PUSH64i32  : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
                     Requires<[In64BitMode]>;
 }
 
-let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in
+let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in
 def POPF64   : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
                OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>;
-let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
+let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in
 def PUSHF64    : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
                  OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>;
 
 let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
-    mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+    mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in {
 def POPA32   : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>,
                OpSize32, Requires<[Not64BitMode]>;
 def POPA16   : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>,
                OpSize16, Requires<[Not64BitMode]>;
 }
 let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
-    mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
+    mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
 def PUSHA32  : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>,
                OpSize32, Requires<[Not64BitMode]>;
 def PUSHA16  : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>,
@@ -1166,7 +1223,7 @@ def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
 //  Move Instructions.
 //
 let SchedRW = [WriteMove] in {
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def MOV8rr  : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
                 "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
 def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
@@ -1225,62 +1282,67 @@ def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
 
 let hasSideEffects = 0 in {
 
-/// moffs8, moffs16 and moffs32 versions of moves.  The immediate is a
-/// 32-bit offset from the segment base. These are only valid in x86-32 mode.
+/// Memory offset versions of moves. The immediate is an address mode sized
+/// offset from the segment base.
 let SchedRW = [WriteALU] in {
 let mayLoad = 1 in {
 let Defs = [AL] in
-def MOV8o8a : Ii32 <0xA0, RawFrmMemOffs, (outs), (ins offset8:$src),
-                   "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
-                   Requires<[In32BitMode]>;
+def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src),
+                    "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
+                    AdSize32;
 let Defs = [AX] in
-def MOV16o16a : Ii32 <0xA1, RawFrmMemOffs, (outs), (ins offset16:$src),
-                      "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
-                      OpSize16, Requires<[In32BitMode]>;
+def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src),
+                     "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+                     OpSize16, AdSize32;
 let Defs = [EAX] in
-def MOV32o32a : Ii32 <0xA1, RawFrmMemOffs, (outs), (ins offset32:$src),
-                      "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
-                      OpSize32, Requires<[In32BitMode]>;
+def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src),
+                     "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+                     OpSize32, AdSize32;
+let Defs = [RAX] in
+def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src),
+                      "mov{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>,
+                      AdSize32;
 
 let Defs = [AL] in
-def MOV8o8a_16 : Ii16 <0xA0, RawFrmMemOffs, (outs), (ins offset8:$src),
-                   "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
-                   AdSize, Requires<[In16BitMode]>;
+def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src),
+                    "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize16;
 let Defs = [AX] in
-def MOV16o16a_16 : Ii16 <0xA1, RawFrmMemOffs, (outs), (ins offset16:$src),
-                      "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
-                      OpSize16, AdSize, Requires<[In16BitMode]>;
+def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src),
+                     "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+                     OpSize16, AdSize16;
 let Defs = [EAX] in
-def MOV32o32a_16 : Ii16 <0xA1, RawFrmMemOffs, (outs), (ins offset32:$src),
-                      "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
-                      AdSize, OpSize32, Requires<[In16BitMode]>;
+def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src),
+                     "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+                     AdSize16, OpSize32;
 }
 let mayStore = 1 in {
 let Uses = [AL] in
-def MOV8ao8 : Ii32 <0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins),
-                   "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
-                  Requires<[In32BitMode]>;
+def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs offset32_8:$dst), (ins),
+                    "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32;
 let Uses = [AX] in
-def MOV16ao16 : Ii32 <0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins),
-                      "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
-                      OpSize16, Requires<[In32BitMode]>;
+def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_16:$dst), (ins),
+                     "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+                     OpSize16, AdSize32;
 let Uses = [EAX] in
-def MOV32ao32 : Ii32 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins),
-                      "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
-                     OpSize32, Requires<[In32BitMode]>;
+def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_32:$dst), (ins),
+                     "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+                     OpSize32, AdSize32;
+let Uses = [RAX] in
+def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs offset32_64:$dst), (ins),
+                      "mov{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>,
+                      AdSize32;
 
 let Uses = [AL] in
-def MOV8ao8_16 : Ii16 <0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins),
-                   "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
-                  AdSize, Requires<[In16BitMode]>;
+def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs offset16_8:$dst), (ins),
+                    "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16;
 let Uses = [AX] in
-def MOV16ao16_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins),
-                      "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
-                      OpSize16, AdSize, Requires<[In16BitMode]>;
+def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_16:$dst), (ins),
+                     "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+                     OpSize16, AdSize16;
 let Uses = [EAX] in
-def MOV32ao32_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins),
-                      "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
-                     OpSize32, AdSize, Requires<[In16BitMode]>;
+def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_32:$dst), (ins),
+                     "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+                     OpSize32, AdSize16;
 }
 }
 
@@ -1288,40 +1350,34 @@ def MOV32ao32_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins),
 // and use the movabs mnemonic to indicate this specific form.
 let mayLoad = 1 in {
 let Defs = [AL] in
-def MOV64o8a : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset8:$src),
-                     "movabs{b}\t{$src, %al|al, $src}", []>,
-                     Requires<[In64BitMode]>;
+def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src),
+                     "movabs{b}\t{$src, %al|al, $src}", []>, AdSize64;
 let Defs = [AX] in
-def MOV64o16a : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset16:$src),
-                     "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16,
-                     Requires<[In64BitMode]>;
+def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
+                     "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, AdSize64;
 let Defs = [EAX] in
-def MOV64o32a : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset32:$src),
+def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
                      "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32,
-                     Requires<[In64BitMode]>;
+                     AdSize64;
 let Defs = [RAX] in
-def MOV64o64a : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64:$src),
-                     "movabs{q}\t{$src, %rax|rax, $src}", []>,
-                     Requires<[In64BitMode]>;
+def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src),
+                     "movabs{q}\t{$src, %rax|rax, $src}", []>, AdSize64;
 }
 
 let mayStore = 1 in {
 let Uses = [AL] in
-def MOV64ao8 : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins),
-                     "movabs{b}\t{%al, $dst|$dst, al}", []>,
-                     Requires<[In64BitMode]>;
+def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset64_8:$dst), (ins),
+                     "movabs{b}\t{%al, $dst|$dst, al}", []>, AdSize64;
 let Uses = [AX] in
-def MOV64ao16 : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins),
-                     "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16,
-                     Requires<[In64BitMode]>;
+def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_16:$dst), (ins),
+                     "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, AdSize64;
 let Uses = [EAX] in
-def MOV64ao32 : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins),
+def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_32:$dst), (ins),
                      "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32,
-                     Requires<[In64BitMode]>;
+                     AdSize64;
 let Uses = [RAX] in
-def MOV64ao64 : RIi64<0xA3, RawFrmMemOffs, (outs offset64:$dst), (ins),
-                     "movabs{q}\t{%rax, $dst|$dst, rax}", []>,
-                     Requires<[In64BitMode]>;
+def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs offset64_64:$dst), (ins),
+                     "movabs{q}\t{%rax, $dst|$dst, rax}", []>, AdSize64;
 }
 } // hasSideEffects = 0
 
@@ -1371,17 +1427,17 @@ def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
 // that they can be used for copying and storing h registers, which can't be
 // encoded when a REX prefix is present.
 let isCodeGenOnly = 1 in {
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def MOV8rr_NOREX : I<0x88, MRMDestReg,
                      (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
                      "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [], IIC_MOV>,
                    Sched<[WriteMove]>;
-let mayStore = 1, neverHasSideEffects = 1 in
+let mayStore = 1, hasSideEffects = 0 in
 def MOV8mr_NOREX : I<0x88, MRMDestMem,
                      (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
                      "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [],
                      IIC_MOV_MEM>, Sched<[WriteStore]>;
-let mayLoad = 1, neverHasSideEffects = 1,
+let mayLoad = 1, hasSideEffects = 0,
     canFoldAsLoad = 1, isReMaterializable = 1 in
 def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
                      (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
@@ -1395,7 +1451,7 @@ let SchedRW = [WriteALU] in {
 let Defs = [EFLAGS], Uses = [AH] in
 def SAHF     : I<0x9E, RawFrm, (outs),  (ins), "sahf",
                  [(set EFLAGS, (X86sahf AH))], IIC_AHF>;
-let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
+let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in
 def LAHF     : I<0x9F, RawFrm, (outs),  (ins), "lahf", [],
                 IIC_AHF>;  // AH = flags
 } // SchedRW
@@ -1981,42 +2037,42 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
 }
 
 let Predicates = [HasLZCNT] in {
-  def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E),
-              (X86cmp GR16:$src, (i16 0))), 
+  def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E_OR_NE),
+              (X86cmp GR16:$src, (i16 0))),
             (LZCNT16rr GR16:$src)>;
-  def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E),
+  def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E_OR_NE),
               (X86cmp GR32:$src, (i32 0))),
             (LZCNT32rr GR32:$src)>;
-  def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E),
+  def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E_OR_NE),
               (X86cmp GR64:$src, (i64 0))),
             (LZCNT64rr GR64:$src)>;
-  def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E),
+  def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E_OR_NE),
               (X86cmp GR16:$src, (i16 0))),
             (LZCNT16rr GR16:$src)>;
-  def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E),
+  def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E_OR_NE),
               (X86cmp GR32:$src, (i32 0))),
             (LZCNT32rr GR32:$src)>;
-  def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E),
+  def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E_OR_NE),
               (X86cmp GR64:$src, (i64 0))),
             (LZCNT64rr GR64:$src)>;
 
-  def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E),
-              (X86cmp (loadi16 addr:$src), (i16 0))), 
+  def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE),
+              (X86cmp (loadi16 addr:$src), (i16 0))),
             (LZCNT16rm addr:$src)>;
-  def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E),
-              (X86cmp (loadi32 addr:$src), (i32 0))), 
+  def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE),
+              (X86cmp (loadi32 addr:$src), (i32 0))),
             (LZCNT32rm addr:$src)>;
-  def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E),
-              (X86cmp (loadi64 addr:$src), (i64 0))), 
+  def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE),
+              (X86cmp (loadi64 addr:$src), (i64 0))),
             (LZCNT64rm addr:$src)>;
-  def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E),
-              (X86cmp (loadi16 addr:$src), (i16 0))), 
+  def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E_OR_NE),
+              (X86cmp (loadi16 addr:$src), (i16 0))),
             (LZCNT16rm addr:$src)>;
-  def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E),
-              (X86cmp (loadi32 addr:$src), (i32 0))), 
+  def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E_OR_NE),
+              (X86cmp (loadi32 addr:$src), (i32 0))),
             (LZCNT32rm addr:$src)>;
-  def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E),
-              (X86cmp (loadi64 addr:$src), (i64 0))), 
+  def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E_OR_NE),
+              (X86cmp (loadi64 addr:$src), (i64 0))),
             (LZCNT64rm addr:$src)>;
 }
 
@@ -2097,42 +2153,42 @@ let Predicates = [HasBMI] in {
 }
 
 let Predicates = [HasBMI] in {
-  def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E),
+  def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E_OR_NE),
               (X86cmp GR16:$src, (i16 0))),
             (TZCNT16rr GR16:$src)>;
-  def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E),
+  def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E_OR_NE),
               (X86cmp GR32:$src, (i32 0))),
             (TZCNT32rr GR32:$src)>;
-  def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E),
+  def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E_OR_NE),
               (X86cmp GR64:$src, (i64 0))),
             (TZCNT64rr GR64:$src)>;
-  def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E),
+  def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E_OR_NE),
               (X86cmp GR16:$src, (i16 0))),
             (TZCNT16rr GR16:$src)>;
-  def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E),
+  def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E_OR_NE),
               (X86cmp GR32:$src, (i32 0))),
             (TZCNT32rr GR32:$src)>;
-  def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E),
+  def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E_OR_NE),
               (X86cmp GR64:$src, (i64 0))),
             (TZCNT64rr GR64:$src)>;
 
-  def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E),
-              (X86cmp (loadi16 addr:$src), (i16 0))), 
+  def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE),
+              (X86cmp (loadi16 addr:$src), (i16 0))),
             (TZCNT16rm addr:$src)>;
-  def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E),
-              (X86cmp (loadi32 addr:$src), (i32 0))), 
+  def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE),
+              (X86cmp (loadi32 addr:$src), (i32 0))),
             (TZCNT32rm addr:$src)>;
-  def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E),
-              (X86cmp (loadi64 addr:$src), (i64 0))), 
+  def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE),
+              (X86cmp (loadi64 addr:$src), (i64 0))),
             (TZCNT64rm addr:$src)>;
-  def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E),
-              (X86cmp (loadi16 addr:$src), (i16 0))), 
+  def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E_OR_NE),
+              (X86cmp (loadi16 addr:$src), (i16 0))),
             (TZCNT16rm addr:$src)>;
-  def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E),
-              (X86cmp (loadi32 addr:$src), (i32 0))), 
+  def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E_OR_NE),
+              (X86cmp (loadi32 addr:$src), (i32 0))),
             (TZCNT32rm addr:$src)>;
-  def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E),
-              (X86cmp (loadi64 addr:$src), (i64 0))), 
+  def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E_OR_NE),
+              (X86cmp (loadi64 addr:$src), (i64 0))),
             (TZCNT64rm addr:$src)>;
 }
 
@@ -2167,11 +2223,11 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in {
 
 def CountTrailingOnes : SDNodeXForm<imm, [{
   // Count the trailing ones in the immediate.
-  return getI8Imm(CountTrailingOnes_64(N->getZExtValue()));
+  return getI8Imm(countTrailingOnes(N->getZExtValue()));
 }]>;
 
 def BZHIMask : ImmLeaf<i64, [{
-  return isMask_64(Imm) && (CountTrailingOnes_64(Imm) > 32);
+  return isMask_64(Imm) && (countTrailingOnes<uint64_t>(Imm) > 32);
 }]>;
 
 let Predicates = [HasBMI2] in {
@@ -2361,6 +2417,16 @@ let Predicates = [HasTBM] in {
 } // HasTBM
 
 //===----------------------------------------------------------------------===//
+// Memory Instructions
+//
+
+def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+                   "clflushopt\t$src", []>, PD;
+def CLWB       : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD;
+def PCOMMIT    : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD;
+
+
+//===----------------------------------------------------------------------===//
 // Subsystems.
 //===----------------------------------------------------------------------===//
 
@@ -2513,6 +2579,12 @@ def : MnemonicAlias<"fnstsww",  "fnstsw",   "att">;
 def : MnemonicAlias<"fucomip",  "fucompi",  "att">;
 def : MnemonicAlias<"fwait",    "wait">;
 
+def : MnemonicAlias<"fxsaveq",   "fxsave64",   "att">;
+def : MnemonicAlias<"fxrstorq",  "fxrstor64",  "att">;
+def : MnemonicAlias<"xsaveq",    "xsave64",    "att">;
+def : MnemonicAlias<"xrstorq",   "xrstor64",   "att">;
+def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">;
+
 
 class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond,
                     string VariantName>
@@ -2700,28 +2772,28 @@ def : InstAlias<"fnstsw"     , (FNSTSW16r)>;
 // this is compatible with what GAS does.
 def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
 def : InstAlias<"ljmp $seg, $off",  (FARJMP32i  i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
-def : InstAlias<"lcall *$dst",      (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
-def : InstAlias<"ljmp *$dst",       (FARJMP32m  opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall {*}$dst",    (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp {*}$dst",     (FARJMP32m  opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
 def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
 def : InstAlias<"ljmp $seg, $off",  (FARJMP16i  i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
-def : InstAlias<"lcall *$dst",      (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
-def : InstAlias<"ljmp *$dst",       (FARJMP16m  opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall {*}$dst",    (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp {*}$dst",     (FARJMP16m  opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
 
-def : InstAlias<"call *$dst",       (CALL64m i16mem:$dst), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"jmp *$dst",        (JMP64m  i16mem:$dst), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"call *$dst",       (CALL32m i16mem:$dst), 0>, Requires<[In32BitMode]>;
-def : InstAlias<"jmp *$dst",        (JMP32m  i16mem:$dst), 0>, Requires<[In32BitMode]>;
-def : InstAlias<"call *$dst",       (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
-def : InstAlias<"jmp *$dst",        (JMP16m  i16mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"call {*}$dst",     (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"jmp {*}$dst",      (JMP64m  i64mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"call {*}$dst",     (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp {*}$dst",      (JMP32m  i32mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"call {*}$dst",     (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp {*}$dst",      (JMP16m  i16mem:$dst), 0>, Requires<[In16BitMode]>;
 
 
 // "imul <imm>, B" is an alias for "imul <imm>, B, B".
-def : InstAlias<"imulw $imm, $r", (IMUL16rri  GR16:$r, GR16:$r, i16imm:$imm)>;
-def : InstAlias<"imulw $imm, $r", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm)>;
-def : InstAlias<"imull $imm, $r", (IMUL32rri  GR32:$r, GR32:$r, i32imm:$imm)>;
-def : InstAlias<"imull $imm, $r", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm)>;
-def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>;
-def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>;
+def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri  GR16:$r, GR16:$r, i16imm:$imm), 0>;
+def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>;
+def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri  GR32:$r, GR32:$r, i32imm:$imm), 0>;
+def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>;
+def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>;
+def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>;
 
 // inb %dx -> inb %al, %dx
 def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
@@ -2745,34 +2817,34 @@ def : InstAlias<"jmpl $seg, $off",  (FARJMP32i  i32imm:$off, i16imm:$seg)>;
 // Force mov without a suffix with a segment and mem to prefer the 'l' form of
 // the move.  All segment/mem forms are equivalent, this has the shortest
 // encoding.
-def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>;
-def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>;
+def : InstAlias<"mov {$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>;
+def : InstAlias<"mov {$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>;
 
 // Match 'movq <largeimm>, <reg>' as an alias for movabsq.
-def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
+def : InstAlias<"movq {$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
 
 // Match 'movq GR64, MMX' as an alias for movd.
-def : InstAlias<"movq $src, $dst",
+def : InstAlias<"movq {$src, $dst|$dst, $src}",
                 (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
-def : InstAlias<"movq $src, $dst",
+def : InstAlias<"movq {$src, $dst|$dst, $src}",
                 (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
 
 // movsx aliases
-def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
-def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
-def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
-def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
-def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
-def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
-def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
 
 // movzx aliases
-def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
-def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
-def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
-def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
-def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>;
-def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>;
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>;
 // Note: No GR32->GR64 movzx form.
 
 // outb %dx -> outb %al, %dx
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 9001fba..eaa7894 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -125,9 +125,9 @@ let Constraints = "$src1 = $dst" in {
                                     (bitconvert (load_mmx addr:$src2))))],
                   itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
     def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
-                                   (ins VR64:$src1, i32i8imm:$src2),
+                                   (ins VR64:$src1, i32u8imm:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-           [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>,
+           [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))], itins.ri>,
            Sched<[WriteVecShift]>;
   }
 }
@@ -170,12 +170,12 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
 /// PALIGN MMX instructions (require SSSE3).
 multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
   def R64irr  : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
-      (ins VR64:$src1, VR64:$src2, i8imm:$src3),
-      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 
+      (ins VR64:$src1, VR64:$src2, u8imm:$src3),
+      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
       [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>,
       Sched<[WriteShuffle]>;
   def R64irm  : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
-      (ins VR64:$src1, i64mem:$src2, i8imm:$src3),
+      (ins VR64:$src1, i64mem:$src2, u8imm:$src3),
       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
       [(set VR64:$dst, (IntId VR64:$src1,
                        (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>,
@@ -220,23 +220,29 @@ def MMX_EMMS  : MMXI<0x77, RawFrm, (outs), (ins), "emms",
 // Data Transfer Instructions
 def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
-                        [(set VR64:$dst, 
+                        [(set VR64:$dst,
                          (x86mmx (scalar_to_vector GR32:$src)))],
                         IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
-let canFoldAsLoad = 1 in
 def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst,
                         (x86mmx (scalar_to_vector (loadi32 addr:$src))))],
                         IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+
+let Predicates = [HasMMX] in {
+  let AddedComplexity = 15 in
+    def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
+              (MMX_MOVD64rr GR32:$src)>;
+  let AddedComplexity = 20 in
+    def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
+              (MMX_MOVD64rm addr:$src)>;
+}
+
 let mayStore = 1 in
 def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
                         "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>,
                    Sched<[WriteStore]>;
 
-// Low word of MMX to GPR.
-def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
-                            [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>;
 def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
                          "movd\t{$src, $dst|$dst, $src}",
                          [(set GR32:$dst,
@@ -248,16 +254,21 @@ def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
                              [(set VR64:$dst, (bitconvert GR64:$src))],
                              IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
 
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst),
+                             (ins i64mem:$src), "movd\t{$src, $dst|$dst, $src}",
+                             [], IIC_MMX_MOVQ_RM>, Sched<[WriteLoad]>;
+
 // These are 64 bit moves, but since the OS X assembler doesn't
 // recognize a register-register movq, we write them as
 // movd.
 let SchedRW = [WriteMove] in {
 def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
                                (outs GR64:$dst), (ins VR64:$src),
-                               "movd\t{$src, $dst|$dst, $src}", 
+                               "movd\t{$src, $dst|$dst, $src}",
                              [(set GR64:$dst,
                               (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
                         "movq\t{$src, $dst|$dst, $src}", [],
                         IIC_MMX_MOVQ_RR>;
@@ -268,6 +279,12 @@ def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
 }
 } // SchedRW
 
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem,
+                               (outs i64mem:$dst), (ins VR64:$src),
+                               "movd\t{$src, $dst|$dst, $src}",
+                               [], IIC_MMX_MOV_REG_MM>, Sched<[WriteStore]>;
+
 let SchedRW = [WriteLoad] in {
 let canFoldAsLoad = 1 in
 def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
@@ -453,6 +470,13 @@ defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
                                     int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
                                     MMX_SHIFT_ITINS>;
 
+def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSRLWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSRLDrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSRLQrm VR64:$src1, addr:$src2)>;
+
 defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
                                     int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
                                     MMX_SHIFT_ITINS>;
@@ -463,6 +487,13 @@ defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
                                     int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
                                     MMX_SHIFT_ITINS>;
 
+def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSLLWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSLLDrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSLLQrm VR64:$src1, addr:$src2)>;
+
 defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
                                     int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
                                     MMX_SHIFT_ITINS>;
@@ -470,6 +501,11 @@ defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
                                     int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
                                     MMX_SHIFT_ITINS>;
 
+def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSRAWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSRADrm VR64:$src1, addr:$src2)>;
+
 // Comparison Instructions
 defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
                                      MMX_INTALU_ITINS>;
@@ -486,19 +522,19 @@ defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
                                      MMX_INTALU_ITINS>;
 
 // -- Unpack Instructions
-defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", 
+defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw",
                                        int_x86_mmx_punpckhbw,
                                        MMX_UNPCK_H_ITINS>;
-defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", 
+defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd",
                                        int_x86_mmx_punpckhwd,
                                        MMX_UNPCK_H_ITINS>;
-defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", 
+defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq",
                                        int_x86_mmx_punpckhdq,
                                        MMX_UNPCK_H_ITINS>;
-defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", 
+defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw",
                                        int_x86_mmx_punpcklbw,
                                        MMX_UNPCK_L_ITINS>;
-defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", 
+defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd",
                                        int_x86_mmx_punpcklwd,
                                        MMX_UNPCK_L_ITINS>;
 defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
@@ -518,13 +554,13 @@ defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
                                        MMX_PSHUF_ITINS>;
 
 def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
-                          (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2),
+                          (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2),
                           "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                           [(set VR64:$dst,
                              (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
                           IIC_MMX_PSHUF>, Sched<[WriteShuffle]>;
 def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
-                          (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
+                          (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2),
                           "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                           [(set VR64:$dst,
                              (int_x86_sse_pshuf_w (load_mmx addr:$src1),
@@ -559,27 +595,27 @@ let Constraints = "$src1 = $dst" in {
 
 // Extract / Insert
 def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
-                       (outs GR32orGR64:$dst), (ins VR64:$src1, i32i8imm:$src2),
+                       (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
                        "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
-                                         (iPTR imm:$src2)))],
+                                               imm:$src2))],
                        IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
 let Constraints = "$src1 = $dst" in {
   def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
-                      (outs VR64:$dst), 
-                      (ins VR64:$src1, GR32orGR64:$src2, i32i8imm:$src3),
+                      (outs VR64:$dst),
+                      (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
                       "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                       [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
-                                        GR32orGR64:$src2, (iPTR imm:$src3)))],
+                                        GR32orGR64:$src2, imm:$src3))],
                       IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
 
   def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
                      (outs VR64:$dst),
-                     (ins VR64:$src1, i16mem:$src2, i32i8imm:$src3),
+                     (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3),
                      "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
                                          (i32 (anyext (loadi16 addr:$src2))),
-                                       (iPTR imm:$src3)))],
+                                       imm:$src3))],
                      IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
diff --git a/lib/Target/X86/X86InstrSGX.td b/lib/Target/X86/X86InstrSGX.td
index 47c5dc5..84119ad 100644
--- a/lib/Target/X86/X86InstrSGX.td
+++ b/lib/Target/X86/X86InstrSGX.td
@@ -17,8 +17,8 @@
 
 // ENCLS - Execute an Enclave System Function of Specified Leaf Number
 def ENCLS : I<0x01, MRM_CF, (outs), (ins),
-             "encls", []>, TB, Requires<[HasSGX]>;
+             "encls", []>, TB;
 
 // ENCLU - Execute an Enclave User Function of Specified Leaf Number
 def ENCLU : I<0x01, MRM_D7, (outs), (ins),
-             "enclu", []>, TB, Requires<[HasSGX]>;
+             "enclu", []>, TB;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index cc896f0..d2929d2 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -548,13 +548,13 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
 
 multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
                          X86MemOperand x86memop, string base_opc,
-                         string asm_opr> {
+                         string asm_opr, Domain d = GenericDomain> {
   def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, RC:$src2),
               !strconcat(base_opc, asm_opr),
               [(set VR128:$dst, (vt (OpNode VR128:$src1,
                                  (scalar_to_vector RC:$src2))))],
-              IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
+              IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;
 
   // For the disassembler
   let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
@@ -565,49 +565,55 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
 }
 
 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
-                      X86MemOperand x86memop, string OpcodeStr> {
+                      X86MemOperand x86memop, string OpcodeStr,
+                      Domain d = GenericDomain> {
   // AVX
   defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
-                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
                               VEX_4V, VEX_LIG;
 
   def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                      VEX, VEX_LIG, Sched<[WriteStore]>;
   // SSE1 & 2
   let Constraints = "$src1 = $dst" in {
     defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
-                              "\t{$src2, $dst|$dst, $src2}">;
+                              "\t{$src2, $dst|$dst, $src2}", d>;
   }
 
   def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                   Sched<[WriteStore]>;
 }
 
 // Loading from memory automatically zeroing upper bits.
 multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
-                         PatFrag mem_pat, string OpcodeStr> {
+                         PatFrag mem_pat, string OpcodeStr,
+                         Domain d = GenericDomain> {
   def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (mem_pat addr:$src))],
-                     IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>;
+                     IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
   def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (mem_pat addr:$src))],
-                     IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>;
+                     IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
 }
 
-defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS;
-defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD;
+defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
+                        SSEPackedSingle>, XS;
+defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
+                        SSEPackedDouble>, XD;
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
-  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;
+  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
+                             SSEPackedSingle>, XS;
 
   let AddedComplexity = 20 in
-    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
+    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
+                               SSEPackedDouble>, XD;
 }
 
 // Patterns
@@ -809,7 +815,7 @@ multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                             string asm, Domain d,
                             OpndItins itins,
                             bit IsReMaterializable = 1> {
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
   def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
               !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
            Sched<[WriteFShuffle]>;
@@ -1332,6 +1338,8 @@ let Predicates = [HasAVX] in {
                  (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
             (VMOVHPSrm VR128:$src1, addr:$src2)>;
 
+  // VMOVHPD patterns
+
   // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
   // is during lowering, where it's not possible to recognize the load fold
   // cause it has two uses through a bitcast. One use disappears at isel time
@@ -1344,6 +1352,11 @@ let Predicates = [HasAVX] in {
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                       (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
             (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(store (f64 (vector_extract
+                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
+                          (iPTR 0))), addr:$dst),
+            (VMOVHPDmr addr:$dst, VR128:$src)>;
 }
 
 let Predicates = [UseSSE1] in {
@@ -1357,6 +1370,8 @@ let Predicates = [UseSSE1] in {
 }
 
 let Predicates = [UseSSE2] in {
+  // MOVHPD patterns
+
   // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
   // is during lowering, where it's not possible to recognize the load fold
   // cause it has two uses through a bitcast. One use disappears at isel time
@@ -1369,6 +1384,11 @@ let Predicates = [UseSSE2] in {
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                       (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
             (MOVHPDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(store (f64 (vector_extract
+                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
+                          (iPTR 0))), addr:$dst),
+            (MOVHPDmr addr:$dst, VR128:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1477,7 +1497,7 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
 multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                        X86MemOperand x86memop, string asm, Domain d,
                        OpndItins itins> {
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
              [], itins.rr, d>, Sched<[itins.Sched]>;
   let mayLoad = 1 in
@@ -1488,7 +1508,7 @@ let neverHasSideEffects = 1 in {
 
 multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                           X86MemOperand x86memop, string asm> {
-let neverHasSideEffects = 1, Predicates = [UseAVX] in {
+let hasSideEffects = 0, Predicates = [UseAVX] in {
   def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
               !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
            Sched<[WriteCvtI2F]>;
@@ -1497,7 +1517,7 @@ let neverHasSideEffects = 1, Predicates = [UseAVX] in {
               (ins DstRC:$src1, x86memop:$src),
               !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
            Sched<[WriteCvtI2FLd, ReadAfterLd]>;
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
 }
 
 let Predicates = [UseAVX] in {
@@ -1804,7 +1824,7 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
 /// SSE 2 Only
 
 // Convert scalar double to scalar single
-let neverHasSideEffects = 1, Predicates = [UseAVX] in {
+let hasSideEffects = 0, Predicates = [UseAVX] in {
 def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                        (ins FR64:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
@@ -1869,7 +1889,7 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
 
 // Convert scalar single to scalar double
 // SSE2 instructions with XS prefix
-let neverHasSideEffects = 1, Predicates = [UseAVX] in {
+let hasSideEffects = 0, Predicates = [UseAVX] in {
 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                     (ins FR32:$src1, FR32:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -2191,7 +2211,7 @@ def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
 
 // Convert Packed DW Integers to Packed Double FP
 let Predicates = [HasAVX] in {
-let neverHasSideEffects = 1, mayLoad = 1 in
+let hasSideEffects = 0, mayLoad = 1 in
 def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                      []>, VEX, Sched<[WriteCvtI2FLd]>;
@@ -2213,7 +2233,7 @@ def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     Sched<[WriteCvtI2F]>;
 }
 
-let neverHasSideEffects = 1, mayLoad = 1 in
+let hasSideEffects = 0, mayLoad = 1 in
 def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
@@ -2319,26 +2339,26 @@ let Predicates = [UseSSE2] in {
 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                             Operand CC, SDNode OpNode, ValueType VT,
                             PatFrag ld_frag, string asm, string asm_alt,
-                            OpndItins itins> {
+                            OpndItins itins, ImmLeaf immLeaf> {
   def rr : SIi8<0xC2, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
-                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
+                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
                 itins.rr>, Sched<[itins.Sched]>;
   def rm : SIi8<0xC2, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                 [(set RC:$dst, (OpNode (VT RC:$src1),
-                                         (ld_frag addr:$src2), imm:$cc))],
+                                         (ld_frag addr:$src2), immLeaf:$cc))],
                                          itins.rm>,
            Sched<[itins.Sched.Folded, ReadAfterLd]>;
 
   // Accept explicit immediate argument form instead of comparison code.
   let isAsmParserOnly = 1, hasSideEffects = 0 in {
     def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
-                      (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
+                      (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
                       IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
     let mayLoad = 1 in
     def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
-                      (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [],
+                      (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
                       IIC_SSE_ALU_F32S_RM>,
                       Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
@@ -2347,38 +2367,37 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-                 SSE_ALU_F32S>,
-                 XS, VEX_4V, VEX_LIG;
+                 SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG;
 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-                 SSE_ALU_F32S>, // same latency as 32 bit compare
+                 SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
                  XD, VEX_4V, VEX_LIG;
 
 let Constraints = "$src1 = $dst" in {
   defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
                   "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
-                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
-                  XS;
+                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
+                  i8immZExt3>, XS;
   defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
                   "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                   "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                  SSE_ALU_F64S>,
-                  XD;
+                  SSE_ALU_F64S, i8immZExt3>, XD;
 }
 
 multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
-                         Intrinsic Int, string asm, OpndItins itins> {
+                         Intrinsic Int, string asm, OpndItins itins,
+                         ImmLeaf immLeaf> {
   def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                       (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                         [(set VR128:$dst, (Int VR128:$src1,
-                                               VR128:$src, imm:$cc))],
+                                               VR128:$src, immLeaf:$cc))],
                                                itins.rr>,
            Sched<[itins.Sched]>;
   def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                       (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
                         [(set VR128:$dst, (Int VR128:$src1,
-                                               (load addr:$src), imm:$cc))],
+                                               (load addr:$src), immLeaf:$cc))],
                                                itins.rm>,
            Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
@@ -2387,19 +2406,19 @@ let isCodeGenOnly = 1 in {
   // Aliases to match intrinsics which expect XMM operand(s).
   defm Int_VCMPSS  : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
                        "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
-                       SSE_ALU_F32S>,
+                       SSE_ALU_F32S, i8immZExt5>,
                        XS, VEX_4V;
   defm Int_VCMPSD  : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
                        "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
-                       SSE_ALU_F32S>, // same latency as f32
+                       SSE_ALU_F32S, i8immZExt5>, // same latency as f32
                        XD, VEX_4V;
   let Constraints = "$src1 = $dst" in {
     defm Int_CMPSS  : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
                          "cmp${cc}ss\t{$src, $dst|$dst, $src}",
-                         SSE_ALU_F32S>, XS;
+                         SSE_ALU_F32S, i8immZExt3>, XS;
     defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
                          "cmp${cc}sd\t{$src, $dst|$dst, $src}",
-                         SSE_ALU_F64S>,
+                         SSE_ALU_F64S, i8immZExt3>,
                          XD;
 }
 }
@@ -2473,26 +2492,28 @@ let Defs = [EFLAGS] in {
 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                             Operand CC, Intrinsic Int, string asm,
-                            string asm_alt, Domain d,
-                            OpndItins itins = SSE_ALU_F32P> {
+                            string asm_alt, Domain d, ImmLeaf immLeaf,
+                            PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
+  let isCommutable = 1 in
   def rri : PIi8<0xC2, MRMSrcReg,
              (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
-             [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
+             [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
              itins.rr, d>,
             Sched<[WriteFAdd]>;
   def rmi : PIi8<0xC2, MRMSrcMem,
              (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
-             [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
+             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
              itins.rm, d>,
             Sched<[WriteFAddLd, ReadAfterLd]>;
 
   // Accept explicit immediate argument form instead of comparison code.
   let isAsmParserOnly = 1, hasSideEffects = 0 in {
     def rri_alt : PIi8<0xC2, MRMSrcReg,
-               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
                asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
+    let mayLoad = 1 in
     def rmi_alt : PIi8<0xC2, MRMSrcMem,
-               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
                asm_alt, [], itins.rm, d>,
                Sched<[WriteFAddLd, ReadAfterLd]>;
   }
@@ -2501,61 +2522,61 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
                "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-               SSEPackedSingle>, PS, VEX_4V;
+               SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
 defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
                "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-               SSEPackedDouble>, PD, VEX_4V;
+               SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
                "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-               SSEPackedSingle>, PS, VEX_4V, VEX_L;
+               SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
                "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-               SSEPackedDouble>, PD, VEX_4V, VEX_L;
+               SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
 let Constraints = "$src1 = $dst" in {
   defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                  "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                  "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                 SSEPackedSingle, SSE_ALU_F32P>, PS;
+                 SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
   defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                  "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                  "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                 SSEPackedDouble, SSE_ALU_F64P>, PD;
+                 SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
 }
 
 let Predicates = [HasAVX] in {
 def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
           (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
           (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
 def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
           (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
           (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
 
 def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
           (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
-def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
+def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
           (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
 def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
           (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
-def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
+def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
           (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
 }
 
 let Predicates = [UseSSE1] in {
 def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
           (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
           (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
 }
 
 let Predicates = [UseSSE2] in {
 def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
           (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
           (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
 }
 
@@ -2568,12 +2589,12 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                          ValueType vt, string asm, PatFrag mem_frag,
                          Domain d> {
   def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
-                   (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
+                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                    [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                        (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
             Sched<[WriteFShuffleLd, ReadAfterLd]>;
   def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
-                 (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
+                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                  [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                      (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
             Sched<[WriteFShuffle]>;
@@ -2729,24 +2750,6 @@ let Predicates = [HasAVX1Only] in {
             (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
 }
 
-let Predicates = [HasAVX] in {
-  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
-  // problem is during lowering, where it's not possible to recognize the load
-  // fold cause it has two uses through a bitcast. One use disappears at isel
-  // time and the fold opportunity reappears.
-  def : Pat<(v2f64 (X86Movddup VR128:$src)),
-            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
-}
-
-let Predicates = [UseSSE2] in {
-  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
-  // problem is during lowering, where it's not possible to recognize the load
-  // fold cause it has two uses through a bitcast. One use disappears at isel
-  // time and the fold opportunity reappears.
-  def : Pat<(v2f64 (X86Movddup VR128:$src)),
-            (UNPCKLPDrr VR128:$src, VR128:$src)>;
-}
-
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Extract Floating-Point Sign mask
 //===----------------------------------------------------------------------===//
@@ -2838,7 +2841,7 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                          ValueType OpVT128, ValueType OpVT256,
                          OpndItins itins, bit IsCommutable = 0> {
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoVLX] in
   defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                     VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
 
@@ -2846,7 +2849,7 @@ let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                            memopv2i64, i128mem, itins, IsCommutable, 1>;
 
-let Predicates = [HasAVX2] in
+let Predicates = [HasAVX2, NoVLX] in
   defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                                OpVT256, VR256, loadv4i64, i256mem, itins,
                                IsCommutable, 0>, VEX_4V, VEX_L;
@@ -2867,40 +2870,73 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
 // SSE 1 & 2 - Logical Instructions
 //===----------------------------------------------------------------------===//
 
-/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops
-///
-multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
-                                       SDNode OpNode, OpndItins itins> {
+// Multiclass for scalars using the X86 logical operation aliases for FP.
+multiclass sse12_fp_packed_scalar_logical_alias<
+    bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
+  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+                FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
+                PS, VEX_4V;
+
+  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+                FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
+                PD, VEX_4V;
+
+  let Constraints = "$src1 = $dst" in {
+    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
+                f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;
+
+    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
+                f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
+  }
+}
+
+let isCodeGenOnly = 1 in {
+  defm FsAND  : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand,
+                SSE_BIT_ITINS_P>;
+  defm FsOR   : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for,
+                SSE_BIT_ITINS_P>;
+  defm FsXOR  : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor,
+                SSE_BIT_ITINS_P>;
+
+  let isCommutable = 0 in
+    defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn,
+                  SSE_BIT_ITINS_P>;
+}
+
+// Multiclass for vectors using the X86 logical operation aliases for FP.
+multiclass sse12_fp_packed_vector_logical_alias<
+    bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
+  let Predicates = [HasAVX, NoVLX] in {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
+              VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>,
               PS, VEX_4V;
 
   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
+        VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
         PD, VEX_4V;
+  }
 
   let Constraints = "$src1 = $dst" in {
-    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
-                f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
+    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+                v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>,
                 PS;
 
-    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
-                f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
+    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+                v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>,
                 PD;
   }
 }
 
-// Alias bitwise logical operations using SSE logical ops on packed FP values.
 let isCodeGenOnly = 1 in {
-  defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
+  defm FvAND  : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand,
                 SSE_BIT_ITINS_P>;
-  defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
+  defm FvOR   : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for,
                 SSE_BIT_ITINS_P>;
-  defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+  defm FvXOR  : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor,
                 SSE_BIT_ITINS_P>;
 
   let isCommutable = 0 in
-    defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
+    defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn,
                   SSE_BIT_ITINS_P>;
 }
 
@@ -2908,6 +2944,7 @@ let isCodeGenOnly = 1 in {
 ///
 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                    SDNode OpNode> {
+  let Predicates = [HasAVX, NoVLX] in {
   defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f256mem,
         [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
@@ -2938,6 +2975,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
        [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                  (loadv2i64 addr:$src2)))], 0>,
                                                  PD, VEX_4V;
+  }
 
   let Constraints = "$src1 = $dst" in {
     defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
@@ -2993,6 +3031,7 @@ let Predicates = [HasAVX1Only] in {
 /// classes below
 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode, SizeItins itins> {
+  let Predicates = [HasAVX, NoVLX] in {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                                VR128, v4f32, f128mem, loadv4f32,
                                SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
@@ -3006,6 +3045,7 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
   defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                         OpNode, VR256, v4f64, f256mem, loadv4f64,
                         SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
+  }
 
   let Constraints = "$src1 = $dst" in {
     defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
@@ -3081,10 +3121,9 @@ let isCodeGenOnly = 1 in {
 }
 
 // Patterns used to select SSE scalar fp arithmetic instructions from
-// a scalar fp operation followed by a blend.
+// either:
 //
-// These patterns know, for example, how to select an ADDSS from a
-// float add plus vector insert.
+// (1) a scalar fp operation followed by a blend
 //
 // The effect is that the backend no longer emits unnecessary vector
 // insert instructions immediately after SSE scalar fp instructions
@@ -3096,218 +3135,14 @@ let isCodeGenOnly = 1 in {
 //     return A;
 //   }
 //
-// previously we generated:
+// Previously we generated:
 //   addss %xmm0, %xmm1
 //   movss %xmm1, %xmm0
-// 
-// we now generate:
+//
+// We now generate:
 //   addss %xmm1, %xmm0
-
-let Predicates = [UseSSE1] in {
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))))),
-            (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))))),
-            (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))))),
-            (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))))),
-            (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-}
-
-let Predicates = [UseSSE2] in {
-  // SSE2 patterns to select scalar double-precision fp arithmetic instructions
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))))),
-            (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))))),
-            (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))))),
-            (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))))),
-            (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-}
-
-let Predicates = [UseSSE41] in {
-  // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is
-  // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When
-  // selecting SSE scalar single-precision fp arithmetic instructions, make
-  // sure that we correctly match them.
-
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-                  (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                    FR32:$src))), (iPTR 0))),
-            (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-                  (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                    FR32:$src))), (iPTR 0))),
-            (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-                  (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                    FR32:$src))), (iPTR 0))),
-            (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-                  (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                    FR32:$src))), (iPTR 0))),
-            (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))), (i8 1))),
-            (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))), (i8 1))),
-            (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))), (i8 1))),
-            (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))), (i8 1))),
-            (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (i8 1))),
-            (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (i8 1))),
-            (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (i8 1))),
-            (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (i8 1))),
-            (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-}
-
-let Predicates = [HasAVX] in {
-  // The following patterns select AVX Scalar single/double precision fp
-  // arithmetic instructions.
-
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))))),
-            (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))))),
-            (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))))),
-            (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))))),
-            (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-                 (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                       FR32:$src))), (iPTR 0))),
-            (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-                 (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                       FR32:$src))), (iPTR 0))),
-            (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-                 (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                       FR32:$src))), (iPTR 0))),
-            (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-                 (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                       FR32:$src))), (iPTR 0))),
-            (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))), (i8 1))),
-            (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))), (i8 1))),
-            (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))), (i8 1))),
-            (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
-                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-                      FR32:$src))), (i8 1))),
-            (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (i8 1))),
-            (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (i8 1))),
-            (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (i8 1))),
-            (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (i8 1))),
-            (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv
-                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-            (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-}
-
-// Patterns used to select SSE scalar fp arithmetic instructions from
-// a vector packed single/double fp operation followed by a vector insert.
+//
+// (2) a vector packed single/double fp operation followed by a vector insert
 //
 // The effect is that the backend converts the packed fp instruction
 // followed by a vector insert into a single SSE scalar fp instruction.
@@ -3318,160 +3153,151 @@ let Predicates = [HasAVX] in {
 //     return (__m128) {c[0], a[1], a[2], a[3]};
 //   }
 //
-// previously we generated:
+// Previously we generated:
 //   addps %xmm0, %xmm1
 //   movss %xmm1, %xmm0
-// 
-// we now generate:
+//
+// We now generate:
 //   addss %xmm1, %xmm0
 
-let Predicates = [UseSSE1] in {
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 
-                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 
-                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
-}
+// TODO: Some canonicalization in lowering would simplify the number of
+// patterns we have to try to match.
+multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
+  let Predicates = [UseSSE1] in {
+    // extracted scalar math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))))),
+      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // vector math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+  }
+  
+  // With SSE 4.1, insertps/blendi are preferred to movsd, so match those too.
+  let Predicates = [UseSSE41] in {
+    // extracted scalar math op with insert via insertps
+    def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))), (iPTR 0))),
+      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // extracted scalar math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))), (i8 1))),
+      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // vector math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+      (!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>;
 
-let Predicates = [UseSSE2] in {
-  // SSE2 patterns to select scalar double-precision fp arithmetic instructions
-  // from a packed double-precision fp instruction plus movsd.
-
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-}
+  }
 
-let Predicates = [UseSSE41] in {
-  // With SSE4.1 we may see these operations using X86Blendi rather than
-  // X86Movs{s,d}.
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
-                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 
-                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
-                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 
-                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
-
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-
-  def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                              (v2f64 VR128:$dst), (i8 2))),
-            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                   (v2f64 VR128:$dst), (i8 2))),
-            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                   (v2f64 VR128:$dst), (i8 2))),
-            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                   (v2f64 VR128:$dst), (i8 2))),
-            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+  // Repeat everything for AVX, except for the movss + scalar combo...
+  // because that one shouldn't occur with AVX codegen?
+  let Predicates = [HasAVX] in {
+    // extracted scalar math op with insert via insertps
+    def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))), (iPTR 0))),
+      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ 
+    // extracted scalar math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // vector math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+
+    // vector math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+  }
 }
 
-let Predicates = [HasAVX] in {
-  // The following patterns select AVX Scalar single/double precision fp
-  // arithmetic instructions from a packed single precision fp instruction
-  // plus movss/movsd.
-
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
-                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
-            (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
-                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
-            (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-
-  // Also handle X86Blendi-based patterns.
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
-                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 
-                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
-                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 
-                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-            (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
-
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-            (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-
-  def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                              (v2f64 VR128:$dst), (i8 2))),
-            (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                   (v2f64 VR128:$dst), (i8 2))),
-            (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                   (v2f64 VR128:$dst), (i8 2))),
-            (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
-  def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                   (v2f64 VR128:$dst), (i8 2))),
-            (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+defm : scalar_math_f32_patterns<fadd, "ADD">;
+defm : scalar_math_f32_patterns<fsub, "SUB">;
+defm : scalar_math_f32_patterns<fmul, "MUL">;
+defm : scalar_math_f32_patterns<fdiv, "DIV">;
+
+multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
+  let Predicates = [UseSSE2] in {
+    // extracted scalar math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))))),
+      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // vector math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+  }
+
+  // With SSE 4.1, blendi is preferred to movsd, so match those too.
+  let Predicates = [UseSSE41] in {
+    // extracted scalar math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))), (i8 1))),
+      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+          
+    // vector math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+  }
+
+  // Repeat everything for AVX.
+  let Predicates = [HasAVX] in {
+    // extracted scalar math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))))),
+      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // extracted scalar math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // vector math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+
+    // vector math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+  }
 }
 
+defm : scalar_math_f64_patterns<fadd, "ADD">;
+defm : scalar_math_f64_patterns<fsub, "SUB">;
+defm : scalar_math_f64_patterns<fmul, "MUL">;
+defm : scalar_math_f64_patterns<fdiv, "DIV">;
+
+
 /// Unop Arithmetic
 /// In addition, we also have a special variant of the scalar form here to
 /// represent the associated intrinsic operation.  This form is unlike the
@@ -3518,103 +3344,106 @@ def SSE_RCPS : OpndItins<
 >;
 }
 
-/// sse1_fp_unop_s - SSE1 unops in scalar form.
-multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
-                          SDNode OpNode, Intrinsic F32Int, OpndItins itins> {
-let Predicates = [HasAVX], hasSideEffects = 0 in {
-  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
-                      (ins FR32:$src1, FR32:$src2),
-                      !strconcat("v", OpcodeStr,
-                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
-  let mayLoad = 1 in {
-  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
-                      (ins FR32:$src1,f32mem:$src2),
-                      !strconcat("v", OpcodeStr,
-                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG,
-                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
-  let isCodeGenOnly = 1 in
-  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
-                      (ins VR128:$src1, ssmem:$src2),
-                      !strconcat("v", OpcodeStr,
-                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG,
-                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+/// sse_fp_unop_s - SSE1 unops in scalar form
+/// For the non-AVX defs, we need $src1 to be tied to $dst because
+/// the HW instructions are 2 operand / destructive.
+multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                          ValueType vt, ValueType ScalarVT,
+                          X86MemOperand x86memop, Operand vec_memop,
+                          ComplexPattern mem_cpat, Intrinsic Intr,
+                          SDNode OpNode, OpndItins itins, Predicate target,
+                          string Suffix> {
+  let hasSideEffects = 0 in {
+  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
+              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
+            [(set RC:$dst, (OpNode RC:$src1))], itins.rr>, Sched<[itins.Sched]>,
+            Requires<[target]>;
+  let mayLoad = 1 in
+  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
+            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
+            [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm>,
+            Sched<[itins.Sched.Folded, ReadAfterLd]>,
+            Requires<[target, OptForSize]>;
+
+  let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
+  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+              !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+            []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  let mayLoad = 1 in
+  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2),
+              !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+            []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
   }
-}
 
-  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
-                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
-                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
-  // For scalar unary operations, fold a load into the operation
-  // only in OptForSize mode. It eliminates an instruction, but it also
-  // eliminates a whole-register clobber (the load), so it introduces a
-  // partial register update condition.
-  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
-                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
-                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
-            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
-let isCodeGenOnly = 1 in {
-  def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>,
-                Sched<[itins.Sched]>;
-  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
-                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>,
-                Sched<[itins.Sched.Folded]>;
-}
-}
-
-/// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand.
-multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                           OpndItins itins> {
-let Predicates = [HasAVX], hasSideEffects = 0 in {
-  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
-                       (ins FR32:$src1, FR32:$src2),
-                       !strconcat("v", OpcodeStr,
-                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
-  let mayLoad = 1 in {
-  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
-                      (ins FR32:$src1,f32mem:$src2),
-                      !strconcat("v", OpcodeStr,
-                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG,
-                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
-  let isCodeGenOnly = 1 in
-  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
-                      (ins VR128:$src1, ssmem:$src2),
-                      !strconcat("v", OpcodeStr,
-                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG,
-                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  let Predicates = [target] in {
+  def : Pat<(vt (OpNode mem_cpat:$src)),
+            (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int)
+                 (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>;
+  // These are unary operations, but they are modeled as having 2 source operands
+  // because the high elements of the destination are unchanged in SSE.
+  def : Pat<(Intr VR128:$src),
+            (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
+  def : Pat<(Intr (load addr:$src)), 
+            (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m)
+                                      addr:$src), VR128))>;
+   def : Pat<(Intr mem_cpat:$src),
+             (!cast<Instruction>(NAME#Suffix##m_Int)
+                    (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
   }
 }
 
-  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
-                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
-                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
-  // For scalar unary operations, fold a load into the operation
-  // only in OptForSize mode. It eliminates an instruction, but it also
-  // eliminates a whole-register clobber (the load), so it introduces a
-  // partial register update condition.
-  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
-                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
-                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
-            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
-  let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
-    def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
-                      (ins VR128:$src1, VR128:$src2),
-                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
-                      [], itins.rr>, Sched<[itins.Sched]>;
-    let mayLoad = 1, hasSideEffects = 0 in
-    def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
-                      (ins VR128:$src1, ssmem:$src2),
-                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
-                      [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                          ValueType vt, ValueType ScalarVT,
+                          X86MemOperand x86memop, Operand vec_memop,
+                          ComplexPattern mem_cpat,
+                          Intrinsic Intr, SDNode OpNode, OpndItins itins,
+                          Predicate target, string Suffix> {
+  let hasSideEffects = 0 in {
+  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [], itins.rr>, Sched<[itins.Sched]>;
+  let mayLoad = 1 in 
+  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  let isCodeGenOnly = 1 in {
+  // todo: uncomment when all r_Int forms will be added to X86InstrInfo.cpp
+  //def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), 
+  //              (ins VR128:$src1, VR128:$src2),
+  //           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+  //          []>, Sched<[itins.Sched.Folded]>;
+  let mayLoad = 1 in
+  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), 
+                (ins VR128:$src1, vec_memop:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
   }
+
+  let Predicates = [target] in {
+   def : Pat<(OpNode RC:$src),  (!cast<Instruction>("V"#NAME#Suffix##r)
+                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
+
+   def : Pat<(vt (OpNode mem_cpat:$src)), 
+             (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)),
+                                  mem_cpat:$src)>;
+
+   // todo: use r_Int form when it will be ready
+   //def : Pat<(Intr VR128:$src), (!cast<Instruction>("V"#NAME#Suffix##r_Int)
+   //                 (VT (IMPLICIT_DEF)), VR128:$src)>;
+   def : Pat<(Intr VR128:$src),
+             (vt (COPY_TO_REGCLASS(
+             !cast<Instruction>("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), 
+                    (ScalarVT (COPY_TO_REGCLASS VR128:$src, RC))), VR128))>;
+   def : Pat<(Intr mem_cpat:$src),
+             (!cast<Instruction>("V"#NAME#Suffix##m_Int)
+                    (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
+  }
+  let Predicates = [target, OptForSize] in
+  def : Pat<(ScalarVT (OpNode (load addr:$src))), 
+            (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
+             addr:$src)>;
 }
 
 /// sse1_fp_unop_p - SSE1 unops in packed form.
@@ -3693,53 +3522,6 @@ let Predicates = [HasAVX] in {
 } // isCodeGenOnly = 1
 }
 
-/// sse2_fp_unop_s - SSE2 unops in scalar form.
-multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
-                          SDNode OpNode, Intrinsic F64Int, OpndItins itins> {
-let Predicates = [HasAVX], hasSideEffects = 0 in {
-  def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst),
-                      (ins FR64:$src1, FR64:$src2),
-                      !strconcat("v", OpcodeStr,
-                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
-  let mayLoad = 1 in {
-  def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
-                      (ins FR64:$src1,f64mem:$src2),
-                      !strconcat("v", OpcodeStr,
-                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG,
-                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
-  let isCodeGenOnly = 1 in
-  def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
-                      (ins VR128:$src1, sdmem:$src2),
-                      !strconcat("v", OpcodeStr,
-                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                      []>, VEX_4V, VEX_LIG,
-                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
-  }
-}
-
-  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
-                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
-                [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>,
-            Sched<[itins.Sched]>;
-  // See the comments in sse1_fp_unop_s for why this is OptForSize.
-  def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
-                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
-                [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
-            Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>;
-let isCodeGenOnly = 1 in {
-  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>,
-                Sched<[itins.Sched]>;
-  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
-                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
-                    [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>,
-                Sched<[itins.Sched.Folded]>;
-}
-}
-
 /// sse2_fp_unop_p - SSE2 unops in vector forms.
 multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                           SDNode OpNode, OpndItins itins> {
@@ -3776,90 +3558,47 @@ let Predicates = [HasAVX] in {
             Sched<[itins.Sched.Folded]>;
 }
 
+multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          OpndItins itins> {
+  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
+                      ssmem, sse_load_f32,
+                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
+                      itins, UseSSE1, "SS">, XS;
+  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
+                      f32mem, ssmem, sse_load_f32,
+                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
+                      itins, HasAVX, "SS">, XS, VEX_4V, VEX_LIG;
+}
+
+multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          OpndItins itins> {
+  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
+                         sdmem, sse_load_f64,
+                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
+                         OpNode, itins, UseSSE2, "SD">, XD;
+  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
+                         f64mem, sdmem, sse_load_f64,
+                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
+                         OpNode, itins, HasAVX, "SD">, XD, VEX_4V, VEX_LIG;
+}
+
 // Square root.
-defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss,
-                            SSE_SQRTSS>,
+defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
              sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>,
-             sse2_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_sd,
-                            SSE_SQRTSD>,
+             sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
              sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
 
 // Reciprocal approximations. Note that these typically require refinement
 // in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
              sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>,
              sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
                                 int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>;
-defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
+defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
              sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
              sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
                                 int_x86_avx_rcp_ps_256, SSE_RCPP>;
 
-let Predicates = [UseAVX] in {
-  def : Pat<(f32 (fsqrt FR32:$src)),
-            (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-  def : Pat<(f32 (fsqrt (load addr:$src))),
-            (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
-            Requires<[HasAVX, OptForSize]>;
-  def : Pat<(f64 (fsqrt FR64:$src)),
-            (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
-  def : Pat<(f64 (fsqrt (load addr:$src))),
-            (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
-            Requires<[HasAVX, OptForSize]>;
-
-  def : Pat<(f32 (X86frsqrt FR32:$src)),
-            (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-  def : Pat<(f32 (X86frsqrt (load addr:$src))),
-            (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
-            Requires<[HasAVX, OptForSize]>;
-
-  def : Pat<(f32 (X86frcp FR32:$src)),
-            (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-  def : Pat<(f32 (X86frcp (load addr:$src))),
-            (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
-            Requires<[HasAVX, OptForSize]>;
-}
-let Predicates = [UseAVX] in {
-  def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
-            (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
-                                        (COPY_TO_REGCLASS VR128:$src, FR32)),
-                              VR128)>;
-  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
-            (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
-
-  def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
-            (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)),
-                                        (COPY_TO_REGCLASS VR128:$src, FR64)),
-                              VR128)>;
-  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
-            (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
-}
-
-let Predicates = [HasAVX] in {
-  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
-            (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
-                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
-                              VR128)>;
-  def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
-            (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
-
-  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
-            (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)),
-                                       (COPY_TO_REGCLASS VR128:$src, FR32)),
-                              VR128)>;
-  def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
-            (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
-}
-
-// Reciprocal approximations. Note that these typically require refinement
-// in order to obtain suitable precision.
-let Predicates = [UseSSE1] in {
-  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
-            (RSQRTSSr_Int VR128:$src, VR128:$src)>;
-  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
-            (RCPSSr_Int VR128:$src, VR128:$src)>;
-}
-
 // There is no f64 version of the reciprocal approximation instructions.
 
 //===----------------------------------------------------------------------===//
@@ -3974,14 +3713,14 @@ let SchedRW = [WriteLoad] in {
 // Flush cache
 def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
                "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
-               IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>;
+               IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>;
 }
 
 let SchedRW = [WriteNop] in {
 // Pause. This "instruction" is encoded as "rep; nop", so even though it
 // was introduced with SSE2, it's backward compatible.
-def PAUSE : I<0x90, RawFrm, (outs), (ins),  
-              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, 
+def PAUSE : I<0x90, RawFrm, (outs), (ins),
+              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
               OBXS, Requires<[HasSSE2]>;
 }
 
@@ -3989,7 +3728,7 @@ let SchedRW = [WriteFence] in {
 // Load, store, and memory fence
 def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
                "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
-               TB, Requires<[HasSSE1]>;
+               PS, Requires<[HasSSE1]>;
 def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
                "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
                TB, Requires<[HasSSE2]>;
@@ -4013,12 +3752,14 @@ def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                   "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                   IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
 
-def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
-                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
-                  IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>;
-def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
-                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
-                  IIC_SSE_STMXCSR>, Sched<[WriteStore]>;
+let Predicates = [UseSSE1] in {
+def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
+                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+                IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
+def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+                "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+                IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
+}
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
@@ -4026,7 +3767,7 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
 
 let ExeDomain = SSEPackedInt in { // SSE integer instructions
 
-let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
+let hasSideEffects = 0, SchedRW = [WriteMove] in {
 def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                     VEX;
@@ -4061,7 +3802,7 @@ def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
 }
 
 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
-    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+    hasSideEffects = 0, SchedRW = [WriteLoad] in {
 def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                    VEX;
@@ -4078,7 +3819,7 @@ let Predicates = [HasAVX] in {
 }
 }
 
-let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
 def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i128mem:$dst, VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
@@ -4098,7 +3839,7 @@ def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
 }
 
 let SchedRW = [WriteMove] in {
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
 
@@ -4119,7 +3860,7 @@ def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
 } // SchedRW
 
 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
-    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+    hasSideEffects = 0, SchedRW = [WriteLoad] in {
 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "movdqa\t{$src, $dst|$dst, $src}",
                    [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
@@ -4131,7 +3872,7 @@ def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                  XS, Requires<[UseSSE2]>;
 }
 
-let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}",
                    [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
@@ -4211,7 +3952,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                          string OpcodeStr, SDNode OpNode,
                          SDNode OpNode2, RegisterClass RC,
                          ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
-                         ShiftOpndItins itins,
+                         PatFrag ld_frag, ShiftOpndItins itins,
                          bit Is2Addr = 1> {
   // src2 is always 128-bit
   def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
@@ -4227,10 +3968,10 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode RC:$src1,
-                       (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
+                       (bc_frag (ld_frag addr:$src2)))))], itins.rm>,
       Sched<[WriteVecShiftLd, ReadAfterLd]>;
   def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
-       (ins RC:$src1, i8imm:$src2),
+       (ins RC:$src1, u8imm:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
@@ -4338,45 +4079,45 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
 
 let Predicates = [HasAVX] in {
 defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
-                            VR128, v8i16, v8i16, bc_v8i16,
+                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
 defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
-                            VR128, v4i32, v4i32, bc_v4i32,
+                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
 defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
-                            VR128, v2i64, v2i64, bc_v2i64,
+                            VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
 
 defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
-                            VR128, v8i16, v8i16, bc_v8i16,
+                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
 defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
-                            VR128, v4i32, v4i32, bc_v4i32,
+                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
 defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
-                            VR128, v2i64, v2i64, bc_v2i64,
+                            VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
 
 defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
-                            VR128, v8i16, v8i16, bc_v8i16,
+                            VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
 defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
-                            VR128, v4i32, v4i32, bc_v4i32,
+                            VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
 
 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
   // 128-bit logical shifts.
   def VPSLLDQri : PDIi8<0x73, MRM7r,
-                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                     "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR128:$dst,
-                      (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>,
+                      (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>,
                     VEX_4V;
   def VPSRLDQri : PDIi8<0x73, MRM3r,
-                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                     "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR128:$dst,
-                      (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>,
+                      (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
                     VEX_4V;
   // PSRADQri doesn't exist in SSE[1-3].
 }
@@ -4384,45 +4125,45 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
 
 let Predicates = [HasAVX2] in {
 defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
-                             VR256, v16i16, v8i16, bc_v8i16,
+                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
                              SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
-                             VR256, v8i32, v4i32, bc_v4i32,
+                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
                              SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
-                             VR256, v4i64, v2i64, bc_v2i64,
+                             VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
                              SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 
 defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
-                             VR256, v16i16, v8i16, bc_v8i16,
+                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
                              SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
-                             VR256, v8i32, v4i32, bc_v4i32,
+                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
                              SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
-                             VR256, v4i64, v2i64, bc_v2i64,
+                             VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
                              SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 
 defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
-                             VR256, v16i16, v8i16, bc_v8i16,
+                             VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
                              SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
-                             VR256, v8i32, v4i32, bc_v4i32,
+                             VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
                              SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
 
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
   // 256-bit logical shifts.
   def VPSLLDQYri : PDIi8<0x73, MRM7r,
-                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
+                    (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                     "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR256:$dst,
-                      (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
+                      (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>,
                     VEX_4V, VEX_L;
   def VPSRLDQYri : PDIi8<0x73, MRM3r,
-                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
+                    (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                     "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR256:$dst,
-                      (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
+                      (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
                     VEX_4V, VEX_L;
   // PSRADQYri doesn't exist in SSE[1-3].
 }
@@ -4430,85 +4171,58 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
 
 let Constraints = "$src1 = $dst" in {
 defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
-                           VR128, v8i16, v8i16, bc_v8i16,
+                           VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
                            SSE_INTSHIFT_ITINS_P>;
 defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
-                           VR128, v4i32, v4i32, bc_v4i32,
+                           VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
                            SSE_INTSHIFT_ITINS_P>;
 defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
-                           VR128, v2i64, v2i64, bc_v2i64,
+                           VR128, v2i64, v2i64, bc_v2i64, memopv2i64,
                            SSE_INTSHIFT_ITINS_P>;
 
 defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
-                           VR128, v8i16, v8i16, bc_v8i16,
+                           VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
                            SSE_INTSHIFT_ITINS_P>;
 defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
-                           VR128, v4i32, v4i32, bc_v4i32,
+                           VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
                            SSE_INTSHIFT_ITINS_P>;
 defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
-                           VR128, v2i64, v2i64, bc_v2i64,
+                           VR128, v2i64, v2i64, bc_v2i64, memopv2i64,
                            SSE_INTSHIFT_ITINS_P>;
 
 defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
-                           VR128, v8i16, v8i16, bc_v8i16,
+                           VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
                            SSE_INTSHIFT_ITINS_P>;
 defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
-                           VR128, v4i32, v4i32, bc_v4i32,
+                           VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
                            SSE_INTSHIFT_ITINS_P>;
 
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
   // 128-bit logical shifts.
   def PSLLDQri : PDIi8<0x73, MRM7r,
-                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                       (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                        "pslldq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
-                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))],
-                         IIC_SSE_INTSHDQ_P_RI>;
+                         (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))],
+                       IIC_SSE_INTSHDQ_P_RI>;
   def PSRLDQri : PDIi8<0x73, MRM3r,
-                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                       (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                        "psrldq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
-                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))],
-                         IIC_SSE_INTSHDQ_P_RI>;
+                         (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))],
+                       IIC_SSE_INTSHDQ_P_RI>;
   // PSRADQri doesn't exist in SSE[1-3].
 }
 } // Constraints = "$src1 = $dst"
 
 let Predicates = [HasAVX] in {
-  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
-            (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
-  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
-            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
             (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
-
-  // Shift up / down and insert zero's.
-  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
-            (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
-  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
-            (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
-}
-
-let Predicates = [HasAVX2] in {
-  def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2),
-            (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
-  def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
-            (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
 }
 
 let Predicates = [UseSSE2] in {
-  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
-            (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
-  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
-            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
             (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
-
-  // Shift up / down and insert zero's.
-  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
-            (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
-  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
-            (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -4537,14 +4251,14 @@ multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                          SDNode OpNode> {
 let Predicates = [HasAVX] in {
   def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
-                      (ins VR128:$src1, i8imm:$src2),
+                      (ins VR128:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR128:$dst,
                         (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                       IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
   def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
-                      (ins i128mem:$src1, i8imm:$src2),
+                      (ins i128mem:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
@@ -4555,14 +4269,14 @@ let Predicates = [HasAVX] in {
 
 let Predicates = [HasAVX2] in {
   def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
-                       (ins VR256:$src1, i8imm:$src2),
+                       (ins VR256:$src1, u8imm:$src2),
                        !strconcat("v", OpcodeStr,
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                        [(set VR256:$dst,
                          (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
                        IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
   def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
-                       (ins i256mem:$src1, i8imm:$src2),
+                       (ins i256mem:$src1, u8imm:$src2),
                        !strconcat("v", OpcodeStr,
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
@@ -4573,14 +4287,14 @@ let Predicates = [HasAVX2] in {
 
 let Predicates = [UseSSE2] in {
   def ri : Ii8<0x70, MRMSrcReg,
-               (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VR128:$dst,
                   (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                 IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
   def mi : Ii8<0x70, MRMSrcMem,
-               (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VR128:$dst,
@@ -4616,7 +4330,7 @@ let Predicates = [UseSSE2] in {
 let ExeDomain = SSEPackedInt in {
 multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                      ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
-                     bit Is2Addr = 1> {
+                     PatFrag ld_frag, bit Is2Addr = 1> {
   def rr : PDI<opc, MRMSrcReg,
                (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                !if(Is2Addr,
@@ -4634,7 +4348,7 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set VR128:$dst,
                      (OutVT (OpNode VR128:$src1,
-                                    (bc_frag (memopv2i64 addr:$src2)))))]>,
+                                    (bc_frag (ld_frag addr:$src2)))))]>,
                Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
@@ -4653,13 +4367,13 @@ multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VR256:$dst,
                       (OutVT (OpNode VR256:$src1,
-                                     (bc_frag (memopv4i64 addr:$src2)))))]>,
+                                     (bc_frag (loadv4i64 addr:$src2)))))]>,
                 Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
 multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                      ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
-                     bit Is2Addr = 1> {
+                     PatFrag ld_frag, bit Is2Addr = 1> {
   def rr : SS48I<opc, MRMSrcReg,
                  (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                  !if(Is2Addr,
@@ -4677,7 +4391,7 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set VR128:$dst,
                        (OutVT (OpNode VR128:$src1,
-                                      (bc_frag (memopv2i64 addr:$src2)))))]>,
+                                      (bc_frag (ld_frag addr:$src2)))))]>,
                  Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
@@ -4696,20 +4410,20 @@ multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                         (OutVT (OpNode VR256:$src1,
-                                       (bc_frag (memopv4i64 addr:$src2)))))]>,
+                                       (bc_frag (loadv4i64 addr:$src2)))))]>,
                   Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
 let Predicates = [HasAVX] in {
   defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
-                             bc_v8i16, 0>, VEX_4V;
+                             bc_v8i16, loadv2i64, 0>, VEX_4V;
   defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
-                             bc_v4i32, 0>, VEX_4V;
+                             bc_v4i32, loadv2i64, 0>, VEX_4V;
 
   defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
-                             bc_v8i16, 0>, VEX_4V;
+                             bc_v8i16, loadv2i64, 0>, VEX_4V;
   defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
-                             bc_v4i32, 0>, VEX_4V;
+                             bc_v4i32, loadv2i64, 0>, VEX_4V;
 }
 
 let Predicates = [HasAVX2] in {
@@ -4726,16 +4440,16 @@ let Predicates = [HasAVX2] in {
 
 let Constraints = "$src1 = $dst" in {
   defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
-                            bc_v8i16>;
+                            bc_v8i16, memopv2i64>;
   defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
-                            bc_v4i32>;
+                            bc_v4i32, memopv2i64>;
 
   defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
-                            bc_v8i16>;
+                            bc_v8i16, memopv2i64>;
 
   let Predicates = [HasSSE41] in
   defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
-                            bc_v4i32>;
+                            bc_v4i32, memopv2i64>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -4745,7 +4459,8 @@ let Constraints = "$src1 = $dst" in {
 
 let ExeDomain = SSEPackedInt in {
 multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
-                       SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> {
+                       SDNode OpNode, PatFrag bc_frag, PatFrag ld_frag,
+                       bit Is2Addr = 1> {
   def rr : PDI<opc, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
@@ -4759,8 +4474,7 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
           !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (OpNode VR128:$src1,
-                                  (bc_frag (memopv2i64
-                                               addr:$src2))))],
+                                  (bc_frag (ld_frag addr:$src2))))],
                                                IIC_SSE_UNPCK>,
       Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
@@ -4776,28 +4490,28 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
       (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (OpNode VR256:$src1,
-                                  (bc_frag (memopv4i64 addr:$src2))))]>,
+                                  (bc_frag (loadv4i64 addr:$src2))))]>,
       Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
 let Predicates = [HasAVX] in {
   defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
-                                 bc_v16i8, 0>, VEX_4V;
+                                 bc_v16i8, loadv2i64, 0>, VEX_4V;
   defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
-                                 bc_v8i16, 0>, VEX_4V;
+                                 bc_v8i16, loadv2i64, 0>, VEX_4V;
   defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
-                                 bc_v4i32, 0>, VEX_4V;
+                                 bc_v4i32, loadv2i64, 0>, VEX_4V;
   defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
-                                 bc_v2i64, 0>, VEX_4V;
+                                 bc_v2i64, loadv2i64, 0>, VEX_4V;
 
   defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
-                                 bc_v16i8, 0>, VEX_4V;
+                                 bc_v16i8, loadv2i64, 0>, VEX_4V;
   defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
-                                 bc_v8i16, 0>, VEX_4V;
+                                 bc_v8i16, loadv2i64, 0>, VEX_4V;
   defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
-                                 bc_v4i32, 0>, VEX_4V;
+                                 bc_v4i32, loadv2i64, 0>, VEX_4V;
   defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
-                                 bc_v2i64, 0>, VEX_4V;
+                                 bc_v2i64, loadv2i64, 0>, VEX_4V;
 }
 
 let Predicates = [HasAVX2] in {
@@ -4822,22 +4536,22 @@ let Predicates = [HasAVX2] in {
 
 let Constraints = "$src1 = $dst" in {
   defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
-                                bc_v16i8>;
+                                bc_v16i8, memopv2i64>;
   defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
-                                bc_v8i16>;
+                                bc_v8i16, memopv2i64>;
   defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
-                                bc_v4i32>;
+                                bc_v4i32, memopv2i64>;
   defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
-                                bc_v2i64>;
+                                bc_v2i64, memopv2i64>;
 
   defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
-                                bc_v16i8>;
+                                bc_v16i8, memopv2i64>;
   defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
-                                bc_v8i16>;
+                                bc_v8i16, memopv2i64>;
   defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
-                                bc_v4i32>;
+                                bc_v4i32, memopv2i64>;
   defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
-                                bc_v2i64>;
+                                bc_v2i64, memopv2i64>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -4849,7 +4563,7 @@ let ExeDomain = SSEPackedInt in {
 multiclass sse2_pinsrw<bit Is2Addr = 1> {
   def rri : Ii8<0xC4, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1,
-        GR32orGR64:$src2, i32i8imm:$src3),
+        GR32orGR64:$src2, u8imm:$src3),
        !if(Is2Addr,
            "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -4858,7 +4572,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
        IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
   def rmi : Ii8<0xC4, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1,
-                        i16mem:$src2, i32i8imm:$src3),
+                        i16mem:$src2, u8imm:$src3),
        !if(Is2Addr,
            "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -4871,13 +4585,13 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
 // Extract
 let Predicates = [HasAVX] in
 def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
-                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                     "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                             imm:$src2))]>, PD, VEX,
                 Sched<[WriteShuffle]>;
 def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
-                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                             imm:$src2))], IIC_SSE_PEXTRW>,
@@ -4974,6 +4688,10 @@ def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                         [(set VR128:$dst,
                           (v2i64 (scalar_to_vector GR64:$src)))],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>;
 let isCodeGenOnly = 1 in
 def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
@@ -4995,6 +4713,10 @@ def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                         [(set VR128:$dst,
                           (v2i64 (scalar_to_vector GR64:$src)))],
                           IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
 let isCodeGenOnly = 1 in
 def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
@@ -5081,6 +4803,15 @@ def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                                                          IIC_SSE_MOVD_ToGP>;
 } //SchedRW
 
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs i64mem:$dst),
+                          (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}",
+                          [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs i64mem:$dst), (ins VR128:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+
 //===---------------------------------------------------------------------===//
 // Bitcast FR64 <-> GR64
 //
@@ -5213,7 +4944,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
 // Move Quadword Int to Packed Quadword Int
 //
 
-let SchedRW = [WriteLoad] in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in {
 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
@@ -5225,12 +4956,12 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
                       IIC_SSE_MOVDQ>, XS,
                     Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
-} // SchedRW
+} // ExeDomain, SchedRW
 
 //===---------------------------------------------------------------------===//
 // Move Packed Quadword Int to Quadword Int
 //
-let SchedRW = [WriteStore] in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
 def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(store (i64 (vector_extract (v2i64 VR128:$src),
@@ -5241,7 +4972,7 @@ def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                       [(store (i64 (vector_extract (v2i64 VR128:$src),
                                     (iPTR 0))), addr:$dst)],
                                     IIC_SSE_MOVDQ>;
-} // SchedRW
+} // ExeDomain, SchedRW
 
 // For disassembler only
 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
@@ -5262,7 +4993,7 @@ let Predicates = [UseSSE2] in
 def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
           (MOVPQI2QImr addr:$dst, VR128:$src)>;
 
-let isCodeGenOnly = 1, AddedComplexity = 20 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in {
 def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "vmovq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
@@ -5278,7 +5009,7 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                                                  (loadi64 addr:$src))))))],
                                                  IIC_SSE_MOVDQ>,
                      XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
-}
+} // ExeDomain, isCodeGenOnly, AddedComplexity
 
 let Predicates = [UseAVX], AddedComplexity = 20 in {
   def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
@@ -5304,7 +5035,7 @@ def : Pat<(v4i64 (X86vzload addr:$src)),
 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
 // IA32 document. movq xmm1, xmm2 does clear the high bits.
 //
-let SchedRW = [WriteVecLogic] in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
 let AddedComplexity = 15 in
 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
@@ -5317,9 +5048,9 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                     IIC_SSE_MOVQ_RR>,
                       XS, Requires<[UseSSE2]>;
-} // SchedRW
+} // ExeDomain, SchedRW
 
-let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
 let AddedComplexity = 20 in
 def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
@@ -5335,7 +5066,7 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                                              IIC_SSE_MOVDQ>,
                       XS, Requires<[UseSSE2]>;
 }
-} // isCodeGenOnly, SchedRW
+} // ExeDomain, isCodeGenOnly, SchedRW
 
 let AddedComplexity = 20 in {
   let Predicates = [UseAVX] in {
@@ -5414,10 +5145,10 @@ let Predicates = [UseSSE3] in {
 //===---------------------------------------------------------------------===//
 
 multiclass sse3_replicate_dfp<string OpcodeStr> {
-let neverHasSideEffects = 1 in
 def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                    [], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
+                    IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
 def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
@@ -5514,7 +5245,7 @@ def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
 
 multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
                        X86MemOperand x86memop, OpndItins itins,
-                       bit Is2Addr = 1> {
+                       PatFrag ld_frag, bit Is2Addr = 1> {
   def rr : I<0xD0, MRMSrcReg,
        (outs RC:$dst), (ins RC:$src1, RC:$src2),
        !if(Is2Addr,
@@ -5527,62 +5258,62 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>,
+       [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rr>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
 let Predicates = [HasAVX] in {
   let ExeDomain = SSEPackedSingle in {
     defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
-                                 f128mem, SSE_ALU_F32P, 0>, XD, VEX_4V;
+                               f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
     defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
-                               f256mem, SSE_ALU_F32P, 0>, XD, VEX_4V, VEX_L;
+                        f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
   }
   let ExeDomain = SSEPackedDouble in {
     defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
-                                 f128mem, SSE_ALU_F64P, 0>, PD, VEX_4V;
+                               f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
     defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
-                           f256mem, SSE_ALU_F64P, 0>, PD, VEX_4V, VEX_L;
+                        f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
   }
 }
 let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
   let ExeDomain = SSEPackedSingle in
   defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
-                              f128mem, SSE_ALU_F32P>, XD;
+                              f128mem, SSE_ALU_F32P, memopv4f32>, XD;
   let ExeDomain = SSEPackedDouble in
   defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
-                              f128mem, SSE_ALU_F64P>, PD;
+                              f128mem, SSE_ALU_F64P, memopv2f64>, PD;
 }
 
 // Patterns used to select 'addsub' instructions.
 let Predicates = [HasAVX] in {
   def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
             (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
-  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))),
+  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))),
             (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
   def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
             (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
-  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))),
+  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))),
             (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
 
   def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
             (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
-  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 (memop addr:$rhs)))),
+  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))),
             (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
   def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
             (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
-  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 (memop addr:$rhs)))),
+  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))),
             (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
 }
 
 let Predicates = [UseSSE3] in {
   def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
             (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
-  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))),
+  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))),
             (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
   def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
             (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
-  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))),
+  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))),
             (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
 }
 
@@ -5592,7 +5323,8 @@ let Predicates = [UseSSE3] in {
 
 // Horizontal ops
 multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
-                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
+                   X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
+                   bit Is2Addr = 1> {
   def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
@@ -5604,11 +5336,12 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
+      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
         IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
 }
 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
-                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
+                  X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
+                  bit Is2Addr = 1> {
   def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
@@ -5620,41 +5353,45 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
+      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
         IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
 }
 
 let Predicates = [HasAVX] in {
   let ExeDomain = SSEPackedSingle in {
     defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
-                            X86fhadd, 0>, VEX_4V;
+                            X86fhadd, loadv4f32, 0>, VEX_4V;
     defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
-                            X86fhsub, 0>, VEX_4V;
+                            X86fhsub, loadv4f32, 0>, VEX_4V;
     defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
-                            X86fhadd, 0>, VEX_4V, VEX_L;
+                            X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
     defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
-                            X86fhsub, 0>, VEX_4V, VEX_L;
+                            X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
   }
   let ExeDomain = SSEPackedDouble in {
     defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
-                            X86fhadd, 0>, VEX_4V;
+                            X86fhadd, loadv2f64, 0>, VEX_4V;
     defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
-                            X86fhsub, 0>, VEX_4V;
+                            X86fhsub, loadv2f64, 0>, VEX_4V;
     defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
-                            X86fhadd, 0>, VEX_4V, VEX_L;
+                            X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
     defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
-                            X86fhsub, 0>, VEX_4V, VEX_L;
+                            X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
   }
 }
 
 let Constraints = "$src1 = $dst" in {
   let ExeDomain = SSEPackedSingle in {
-    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
-    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
+    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
+                          memopv4f32>;
+    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
+                          memopv4f32>;
   }
   let ExeDomain = SSEPackedDouble in {
-    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
-    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
+    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
+                         memopv2f64>;
+    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
+                         memopv2f64>;
   }
 }
 
@@ -5664,8 +5401,8 @@ let Constraints = "$src1 = $dst" in {
 
 
 /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
-multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
-                            Intrinsic IntId128> {
+multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
+                            PatFrag ld_frag> {
   def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -5677,7 +5414,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
                       (IntId128
-                       (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>,
+                       (bitconvert (ld_frag addr:$src))))], IIC_SSE_PABS_RM>,
                     Sched<[WriteVecALULd]>;
 }
 
@@ -5695,7 +5432,7 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst,
                       (IntId256
-                       (bitconvert (memopv4i64 addr:$src))))]>,
+                       (bitconvert (loadv4i64 addr:$src))))]>,
                     Sched<[WriteVecALULd]>;
 }
 
@@ -5710,12 +5447,12 @@ def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
 def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
 
 let Predicates = [HasAVX] in {
-  defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb",
-                                  int_x86_ssse3_pabs_b_128>, VEX;
-  defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw",
-                                  int_x86_ssse3_pabs_w_128>, VEX;
-  defm VPABSD  : SS3I_unop_rm_int<0x1E, "vpabsd",
-                                  int_x86_ssse3_pabs_d_128>, VEX;
+  defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128,
+                                  loadv2i64>, VEX;
+  defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128,
+                                  loadv2i64>, VEX;
+  defm VPABSD  : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128,
+                                  loadv2i64>, VEX;
 
   def : Pat<(xor
             (bc_v2i64 (v16i1sextv16i8)),
@@ -5753,12 +5490,12 @@ let Predicates = [HasAVX2] in {
             (VPABSDrr256 VR256:$src)>;
 }
 
-defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
-                              int_x86_ssse3_pabs_b_128>;
-defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw",
-                              int_x86_ssse3_pabs_w_128>;
-defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd",
-                              int_x86_ssse3_pabs_d_128>;
+defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128,
+                              memopv2i64>;
+defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128,
+                              memopv2i64>;
+defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128,
+                              memopv2i64>;
 
 let Predicates = [HasSSSE3] in {
   def : Pat<(xor
@@ -5830,7 +5567,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, OpndItins itins,
-                             bit Is2Addr = 1> {
+                             PatFrag ld_frag, bit Is2Addr = 1> {
   let isCommutable = 1 in
   def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
        (ins VR128:$src1, VR128:$src2),
@@ -5846,7 +5583,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set VR128:$dst,
          (IntId128 VR128:$src1,
-          (bitconvert (memopv2i64 addr:$src2))))]>,
+          (bitconvert (ld_frag addr:$src2))))]>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
@@ -5895,17 +5632,17 @@ let isCommutable = 0 in {
                                   SSE_PSHUFB, 0>, VEX_4V;
   defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                       int_x86_ssse3_phadd_sw_128,
-                                      SSE_PHADDSUBSW, 0>, VEX_4V;
+                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
   defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                       int_x86_ssse3_phsub_sw_128,
-                                      SSE_PHADDSUBSW, 0>, VEX_4V;
+                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
   defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
                                       int_x86_ssse3_pmadd_ub_sw_128,
-                                      SSE_PMADD, 0>, VEX_4V;
+                                      SSE_PMADD, loadv2i64, 0>, VEX_4V;
 }
 defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
                                       int_x86_ssse3_pmul_hr_sw_128,
-                                      SSE_PMULHRSW, 0>, VEX_4V;
+                                      SSE_PMULHRSW, loadv2i64, 0>, VEX_4V;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX2] in {
@@ -5970,16 +5707,17 @@ let isCommutable = 0 in {
                                  memopv2i64, i128mem, SSE_PSHUFB>;
   defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                      int_x86_ssse3_phadd_sw_128,
-                                     SSE_PHADDSUBSW>;
+                                     SSE_PHADDSUBSW, memopv2i64>;
   defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                      int_x86_ssse3_phsub_sw_128,
-                                     SSE_PHADDSUBSW>;
+                                     SSE_PHADDSUBSW, memopv2i64>;
   defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
-                                     int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>;
+                                     int_x86_ssse3_pmadd_ub_sw_128,
+                                     SSE_PMADD, memopv2i64>;
 }
 defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw_128,
-                                     SSE_PMULHRSW>;
+                                     SSE_PMULHRSW, memopv2i64>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -5987,9 +5725,9 @@ defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
 //===---------------------------------------------------------------------===//
 
 multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
-  let neverHasSideEffects = 1 in {
+  let hasSideEffects = 0 in {
   def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
-      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -5997,7 +5735,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
       [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
   let mayLoad = 1 in
   def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
-      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+      (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -6007,15 +5745,15 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
 }
 
 multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
-  let neverHasSideEffects = 1 in {
+  let hasSideEffects = 0 in {
   def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
-      (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+      (ins VR256:$src1, VR256:$src2, u8imm:$src3),
       !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       []>, Sched<[WriteShuffle]>;
   let mayLoad = 1 in
   def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
-      (ins VR256:$src1, i256mem:$src2, i8imm:$src3),
+      (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
       !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
@@ -6094,552 +5832,271 @@ def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
 // SSE4.1 - Packed Move with Sign/Zero Extend
 //===----------------------------------------------------------------------===//
 
-multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId,
-                               OpndItins itins = DEFAULT_ITINS> {
-  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>,
-                 Sched<[itins.Sched]>;
-
-  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
-                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-       [(set VR128:$dst,
-         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))],
-         itins.rm>, Sched<[itins.Sched.Folded]>;
-}
-
-multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
-                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
-  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
-                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                  [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
-
-  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
-                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                  [(set VR256:$dst, (IntId (load addr:$src)))]>,
-                  Sched<[Sched.Folded]>;
-}
-
-let Predicates = [HasAVX] in {
-defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw",
-                                     int_x86_sse41_pmovsxbw,
-                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
-defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd",
-                                     int_x86_sse41_pmovsxwd,
-                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
-defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq",
-                                     int_x86_sse41_pmovsxdq,
-                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
-defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw",
-                                     int_x86_sse41_pmovzxbw,
-                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
-defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd",
-                                     int_x86_sse41_pmovzxwd,
-                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
-defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq",
-                                     int_x86_sse41_pmovzxdq,
-                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
-}
-
-let Predicates = [HasAVX2] in {
-defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
-                                        int_x86_avx2_pmovsxbw,
-                                        WriteShuffle>, VEX, VEX_L;
-defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
-                                        int_x86_avx2_pmovsxwd,
-                                        WriteShuffle>, VEX, VEX_L;
-defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
-                                        int_x86_avx2_pmovsxdq,
-                                        WriteShuffle>, VEX, VEX_L;
-defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
-                                        int_x86_avx2_pmovzxbw,
-                                        WriteShuffle>, VEX, VEX_L;
-defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
-                                        int_x86_avx2_pmovzxwd,
-                                        WriteShuffle>, VEX, VEX_L;
-defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
-                                        int_x86_avx2_pmovzxdq,
-                                        WriteShuffle>, VEX, VEX_L;
-}
-
-defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw,
-                                      SSE_INTALU_ITINS_SHUFF_P>;
-defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd,
-                                      SSE_INTALU_ITINS_SHUFF_P>;
-defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq,
-                                      SSE_INTALU_ITINS_SHUFF_P>;
-defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw,
-                                      SSE_INTALU_ITINS_SHUFF_P>;
-defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd,
-                                      SSE_INTALU_ITINS_SHUFF_P>;
-defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq,
-                                      SSE_INTALU_ITINS_SHUFF_P>;
-
-let Predicates = [HasAVX] in {
-  // Common patterns involving scalar load.
-  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
-            (VPMOVSXBWrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
-            (VPMOVSXBWrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
-            (VPMOVSXBWrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
-            (VPMOVSXWDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
-            (VPMOVSXWDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
-            (VPMOVSXWDrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
-            (VPMOVSXDQrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
-            (VPMOVSXDQrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
-            (VPMOVSXDQrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
-            (VPMOVZXBWrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
-            (VPMOVZXBWrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
-            (VPMOVZXBWrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
-            (VPMOVZXWDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
-            (VPMOVZXWDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
-            (VPMOVZXWDrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
-            (VPMOVZXDQrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
-            (VPMOVZXDQrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
-            (VPMOVZXDQrm addr:$src)>;
-}
-
-let Predicates = [UseSSE41] in {
-  // Common patterns involving scalar load.
-  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
-            (PMOVSXBWrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
-            (PMOVSXBWrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
-            (PMOVSXBWrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
-            (PMOVSXWDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
-            (PMOVSXWDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
-            (PMOVSXWDrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
-            (PMOVSXDQrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
-            (PMOVSXDQrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
-            (PMOVSXDQrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
-            (PMOVZXBWrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
-            (PMOVZXBWrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
-            (PMOVZXBWrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
-            (PMOVZXWDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
-            (PMOVZXWDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
-            (PMOVZXWDrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
-            (PMOVZXDQrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
-            (PMOVZXDQrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
-            (PMOVZXDQrm addr:$src)>;
-}
-
-multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId,
-                               OpndItins itins = DEFAULT_ITINS> {
-  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
+                          RegisterClass OutRC, RegisterClass InRC,
+                          OpndItins itins> {
+  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>,
+                 [], itins.rr>,
                  Sched<[itins.Sched]>;
 
-  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
-                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-       [(set VR128:$dst,
-         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))],
-         itins.rm>, Sched<[itins.Sched.Folded]>;
-}
-
-multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr,
-                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
-  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
-                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                  [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
-
-  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src),
-                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-       [(set VR256:$dst,
-         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
-         Sched<[Sched.Folded]>;
-}
-
-let Predicates = [HasAVX] in {
-defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd,
-                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
-defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq,
-                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
-defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd,
-                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
-defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq,
-                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
-}
-
-let Predicates = [HasAVX2] in {
-defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
-                                       int_x86_avx2_pmovsxbd, WriteShuffle>,
-                                       VEX, VEX_L;
-defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
-                                       int_x86_avx2_pmovsxwq, WriteShuffle>,
-                                       VEX, VEX_L;
-defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
-                                       int_x86_avx2_pmovzxbd, WriteShuffle>,
-                                       VEX, VEX_L;
-defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
-                                       int_x86_avx2_pmovzxwq, WriteShuffle>,
-                                       VEX, VEX_L;
-}
-
-defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd,
-                                      SSE_INTALU_ITINS_SHUFF_P>;
-defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq,
-                                      SSE_INTALU_ITINS_SHUFF_P>;
-defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd,
-                                      SSE_INTALU_ITINS_SHUFF_P>;
-defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq,
-                                      SSE_INTALU_ITINS_SHUFF_P>;
-
-let Predicates = [HasAVX] in {
-  // Common patterns involving scalar load
-  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
-            (VPMOVSXBDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
-            (VPMOVSXWQrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
-            (VPMOVZXBDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
-            (VPMOVZXWQrm addr:$src)>;
-}
-
-let Predicates = [UseSSE41] in {
-  // Common patterns involving scalar load
-  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
-            (PMOVSXBDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
-            (PMOVSXWQrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
-            (PMOVZXBDrm addr:$src)>;
-  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
-            (PMOVZXWQrm addr:$src)>;
-}
-
-multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId,
-                               X86FoldableSchedWrite Sched> {
-  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                 [(set VR128:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
-
-  // Expecting a i16 load any extended to i32 value.
-  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
+  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                 [(set VR128:$dst, (IntId (bitconvert
-                     (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
-                 Sched<[Sched.Folded]>;
+                 [],
+                 itins.rm>, Sched<[itins.Sched.Folded]>;
 }
 
-multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr,
-                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
-  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
-                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                 [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
-
-  // Expecting a i16 load any extended to i32 value.
-  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i16mem:$src),
-                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                  [(set VR256:$dst, (IntId (bitconvert
-                      (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
-                 Sched<[Sched.Folded]>;
-}
-
-let Predicates = [HasAVX] in {
-defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq,
-                                     WriteShuffle>, VEX;
-defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq,
-                                     WriteShuffle>, VEX;
-}
-let Predicates = [HasAVX2] in {
-defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", int_x86_avx2_pmovsxbq,
-                                       WriteShuffle>, VEX, VEX_L;
-defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", int_x86_avx2_pmovzxbq,
-                                       WriteShuffle>, VEX, VEX_L;
+multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
+                          X86MemOperand MemOp, X86MemOperand MemYOp,
+                          OpndItins SSEItins, OpndItins AVXItins,
+                          OpndItins AVX2Itins> {
+  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
+  let Predicates = [HasAVX] in
+    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
+                                     VR128, VR128, AVXItins>, VEX;
+  let Predicates = [HasAVX2] in
+    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
+                                     VR256, VR128, AVX2Itins>, VEX, VEX_L;
+}
+
+multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr,
+                                X86MemOperand MemOp, X86MemOperand MemYOp> {
+  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
+                                        MemOp, MemYOp,
+                                        SSE_INTALU_ITINS_SHUFF_P,
+                                        DEFAULT_ITINS_SHUFFLESCHED,
+                                        DEFAULT_ITINS_SHUFFLESCHED>;
+  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
+                                        !strconcat("pmovzx", OpcodeStr),
+                                        MemOp, MemYOp,
+                                        SSE_INTALU_ITINS_SHUFF_P,
+                                        DEFAULT_ITINS_SHUFFLESCHED,
+                                        DEFAULT_ITINS_SHUFFLESCHED>;
+}
+
+defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>;
+defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>;
+defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>;
+
+defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>;
+defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>;
+
+defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>;
+
+// AVX2 Patterns
+multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
+  // Register-Register patterns
+  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
+  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
+  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
+
+  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
+            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
+  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
+            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
+            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
+
+  // On AVX2, we also support 256bit inputs.
+  // FIXME: remove these patterns when the old shuffle lowering goes away.
+  def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))),
+            (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))),
+            (!cast<I>(OpcPrefix#BDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (ExtOp (v32i8 VR256:$src))),
+            (!cast<I>(OpcPrefix#BQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v8i32 (ExtOp (v16i16 VR256:$src))),
+            (!cast<I>(OpcPrefix#WDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (ExtOp (v16i16 VR256:$src))),
+            (!cast<I>(OpcPrefix#WQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))),
+            (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  // Simple Register-Memory patterns
+  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+
+  // AVX2 Register-Memory patterns
+  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+
+  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
 }
-defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq,
-                                      WriteShuffle>;
-defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq,
-                                      WriteShuffle>;
 
 let Predicates = [HasAVX2] in {
-  def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
-  def : Pat<(v8i32  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>;
-  def : Pat<(v4i64  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>;
-
-  def : Pat<(v8i32  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
-  def : Pat<(v4i64  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>;
-
-  def : Pat<(v4i64  (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
-
-  def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))),
-            (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-  def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))),
-            (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-  def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))),
-            (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-
-  def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))),
-            (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-  def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))),
-            (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-
-  def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))),
-            (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-
-  def : Pat<(v8i32 (X86vsext (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
-            (VPMOVSXWDYrm addr:$src)>;
-  def : Pat<(v4i64 (X86vsext (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
-            (VPMOVSXDQYrm addr:$src)>;
-
-  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64 
-                    (scalar_to_vector (loadi64 addr:$src))))))),
-            (VPMOVSXBDYrm addr:$src)>;
-  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64 
-                    (scalar_to_vector (loadf64 addr:$src))))))),
-            (VPMOVSXBDYrm addr:$src)>;
-
-  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64 
-                    (scalar_to_vector (loadi64 addr:$src))))))),
-            (VPMOVSXWQYrm addr:$src)>;
-  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64 
-                    (scalar_to_vector (loadf64 addr:$src))))))),
-            (VPMOVSXWQYrm addr:$src)>;
-
-  def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32 
-                    (scalar_to_vector (loadi32 addr:$src))))))),
-            (VPMOVSXBQYrm addr:$src)>;
+  defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
+  defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
+}
+
+// SSE4.1/AVX patterns.
+multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
+                                SDNode ExtOp, PatFrag ExtLoad16> {
+  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
+  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
+  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
+
+  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
+            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
+  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
+            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
+
+  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
+            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
+
+  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+
+  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+
+  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+
+  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+
+  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
 }
 
 let Predicates = [HasAVX] in {
-  // Common patterns involving scalar load
-  def : Pat<(int_x86_sse41_pmovsxbq
-              (bitconvert (v4i32 (X86vzmovl
-                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
-            (VPMOVSXBQrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovzxbq
-              (bitconvert (v4i32 (X86vzmovl
-                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
-            (VPMOVZXBQrm addr:$src)>;
+  defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
+  defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
 }
 
 let Predicates = [UseSSE41] in {
-  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
-  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>;
-  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>;
-
-  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
-  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>;
-
-  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
-
-  // Common patterns involving scalar load
-  def : Pat<(int_x86_sse41_pmovsxbq
-              (bitconvert (v4i32 (X86vzmovl
-                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
-            (PMOVSXBQrm addr:$src)>;
-
-  def : Pat<(int_x86_sse41_pmovzxbq
-              (bitconvert (v4i32 (X86vzmovl
-                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
-            (PMOVZXBQrm addr:$src)>;
-
-  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
-                    (scalar_to_vector (loadi64 addr:$src))))))),
-            (PMOVSXWDrm addr:$src)>;
-  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
-                    (scalar_to_vector (loadf64 addr:$src))))))),
-            (PMOVSXWDrm addr:$src)>;
-  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
-                    (scalar_to_vector (loadi32 addr:$src))))))),
-            (PMOVSXBDrm addr:$src)>;
-  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
-                    (scalar_to_vector (loadi32 addr:$src))))))),
-            (PMOVSXWQrm addr:$src)>;
-  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
-                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
-            (PMOVSXBQrm addr:$src)>;
-  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
-                    (scalar_to_vector (loadi64 addr:$src))))))),
-            (PMOVSXDQrm addr:$src)>;
-  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
-                    (scalar_to_vector (loadf64 addr:$src))))))),
-            (PMOVSXDQrm addr:$src)>;
-  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
-                    (scalar_to_vector (loadi64 addr:$src))))))),
-            (PMOVSXBWrm addr:$src)>;
-  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
-                    (scalar_to_vector (loadf64 addr:$src))))))),
-            (PMOVSXBWrm addr:$src)>;
-}
-
-let Predicates = [HasAVX2] in {
-  def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>;
-  def : Pat<(v8i32  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>;
-  def : Pat<(v4i64  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>;
-
-  def : Pat<(v8i32  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>;
-  def : Pat<(v4i64  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>;
-
-  def : Pat<(v4i64  (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>;
-
-  def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))),
-            (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-  def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))),
-            (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-  def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))),
-            (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-
-  def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))),
-            (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-  def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))),
-            (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-
-  def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))),
-            (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-}
-
-let Predicates = [HasAVX] in {
-  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>;
-  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>;
-  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>;
-
-  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>;
-  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>;
-
-  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>;
-
-  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
-            (VPMOVZXBWrm addr:$src)>;
-  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
-            (VPMOVZXBWrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
-            (VPMOVZXBDrm addr:$src)>;
-  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
-            (VPMOVZXBQrm addr:$src)>;
-
-  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
-            (VPMOVZXWDrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
-            (VPMOVZXWDrm addr:$src)>;
-  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
-            (VPMOVZXWQrm addr:$src)>;
-
-  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
-            (VPMOVZXDQrm addr:$src)>;
-  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
-            (VPMOVZXDQrm addr:$src)>;
-  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
-            (VPMOVZXDQrm addr:$src)>;
-
-  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
-  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>;
-  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>;
-
-  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
-  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>;
-
-  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
-
-  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
-                    (scalar_to_vector (loadi64 addr:$src))))))),
-            (VPMOVSXWDrm addr:$src)>;
-  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
-                    (scalar_to_vector (loadi64 addr:$src))))))),
-            (VPMOVSXDQrm addr:$src)>;
-  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
-                    (scalar_to_vector (loadf64 addr:$src))))))),
-            (VPMOVSXWDrm addr:$src)>;
-  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
-                    (scalar_to_vector (loadf64 addr:$src))))))),
-            (VPMOVSXDQrm addr:$src)>;
-  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
-                    (scalar_to_vector (loadi64 addr:$src))))))),
-            (VPMOVSXBWrm addr:$src)>;
-  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
-                    (scalar_to_vector (loadf64 addr:$src))))))),
-            (VPMOVSXBWrm addr:$src)>;
-
-  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
-                    (scalar_to_vector (loadi32 addr:$src))))))),
-            (VPMOVSXBDrm addr:$src)>;
-  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
-                    (scalar_to_vector (loadi32 addr:$src))))))),
-            (VPMOVSXWQrm addr:$src)>;
-  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
-                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
-            (VPMOVSXBQrm addr:$src)>;
-}
-
-let Predicates = [UseSSE41] in {
-  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>;
-  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>;
-  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>;
-
-  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>;
-  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>;
-
-  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>;
-
-  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
-            (PMOVZXBWrm addr:$src)>;
-  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
-            (PMOVZXBWrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
-            (PMOVZXBDrm addr:$src)>;
-  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
-            (PMOVZXBQrm addr:$src)>;
-
-  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
-            (PMOVZXWDrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
-            (PMOVZXWDrm addr:$src)>;
-  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
-            (PMOVZXWQrm addr:$src)>;
-
-  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
-            (PMOVZXDQrm addr:$src)>;
-  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
-            (PMOVZXDQrm addr:$src)>;
-  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
-            (PMOVZXDQrm addr:$src)>;
+  defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
+  defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -6649,20 +6106,20 @@ let Predicates = [UseSSE41] in {
 /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
   def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
-                 (ins VR128:$src1, i32i8imm:$src2),
+                 (ins VR128:$src1, u8imm:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                          imm:$src2))]>,
                   Sched<[WriteShuffle]>;
-  let neverHasSideEffects = 1, mayStore = 1,
+  let hasSideEffects = 0, mayStore = 1,
       SchedRW = [WriteShuffleLd, WriteRMW] in
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
-                 (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
-						 imm:$src2)))), addr:$dst)]>;
+                                                 imm:$src2)))), addr:$dst)]>;
 }
 
 let Predicates = [HasAVX] in
@@ -6675,19 +6132,19 @@ defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
 multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
   let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
   def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
-                   (ins VR128:$src1, i32i8imm:$src2),
+                   (ins VR128:$src1, u8imm:$src2),
                    !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[WriteShuffle]>;
 
-  let neverHasSideEffects = 1, mayStore = 1,
+  let hasSideEffects = 0, mayStore = 1,
       SchedRW = [WriteShuffleLd, WriteRMW] in
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
-                 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
-						  imm:$src2)))), addr:$dst)]>;
+                                                  imm:$src2)))), addr:$dst)]>;
 }
 
 let Predicates = [HasAVX] in
@@ -6699,7 +6156,7 @@ defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
 multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
   def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
-                 (ins VR128:$src1, i32i8imm:$src2),
+                 (ins VR128:$src1, u8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set GR32:$dst,
@@ -6707,7 +6164,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
                   Sched<[WriteShuffle]>;
   let SchedRW = [WriteShuffleLd, WriteRMW] in
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
-                 (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
@@ -6722,7 +6179,7 @@ defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
 multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
   def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
-                 (ins VR128:$src1, i32i8imm:$src2),
+                 (ins VR128:$src1, u8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set GR64:$dst,
@@ -6730,7 +6187,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
                   Sched<[WriteShuffle]>, REX_W;
   let SchedRW = [WriteShuffleLd, WriteRMW] in
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
-                 (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
@@ -6747,7 +6204,7 @@ defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
 multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
                             OpndItins itins = DEFAULT_ITINS> {
   def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
-                 (ins VR128:$src1, i32i8imm:$src2),
+                 (ins VR128:$src1, u8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set GR32orGR64:$dst,
@@ -6755,7 +6212,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
                     itins.rr>, Sched<[WriteFBlend]>;
   let SchedRW = [WriteFBlendLd, WriteRMW] in
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
-                 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
@@ -6786,7 +6243,7 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
 
 multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
   def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
-      (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3),
+      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -6795,7 +6252,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
         (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
       Sched<[WriteShuffle]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
-      (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
+      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -6812,7 +6269,7 @@ let Constraints = "$src1 = $dst" in
 
 multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
   def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
-      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -6821,7 +6278,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
         (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
       Sched<[WriteShuffle]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
-      (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
+      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -6838,7 +6295,7 @@ let Constraints = "$src1 = $dst" in
 
 multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
   def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
-      (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
+      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -6847,7 +6304,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
         (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
       Sched<[WriteShuffle]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
-      (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
+      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -6869,7 +6326,7 @@ let Constraints = "$src1 = $dst" in
 multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
                            OpndItins itins = DEFAULT_ITINS> {
   def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
-      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -6878,7 +6335,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
         (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
       Sched<[WriteFShuffle]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
-      (ins VR128:$src1, f32mem:$src2, i8imm:$src3),
+      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -6932,7 +6389,7 @@ let ExeDomain = SSEPackedSingle in {
   // Intrinsic operation, reg.
   // Vector intrinsic operation, reg
   def PSr : SS4AIi8<opcps, MRMSrcReg,
-                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                     !strconcat(OpcodeStr,
                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
@@ -6940,7 +6397,7 @@ let ExeDomain = SSEPackedSingle in {
 
   // Vector intrinsic operation, mem
   def PSm : SS4AIi8<opcps, MRMSrcMem,
-                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                     !strconcat(OpcodeStr,
                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
@@ -6951,7 +6408,7 @@ let ExeDomain = SSEPackedSingle in {
 let ExeDomain = SSEPackedDouble in {
   // Vector intrinsic operation, reg
   def PDr : SS4AIi8<opcpd, MRMSrcReg,
-                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                     !strconcat(OpcodeStr,
                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
@@ -6959,7 +6416,7 @@ let ExeDomain = SSEPackedDouble in {
 
   // Vector intrinsic operation, mem
   def PDm : SS4AIi8<opcpd, MRMSrcMem,
-                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                     !strconcat(OpcodeStr,
                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
@@ -6976,7 +6433,7 @@ let ExeDomain = GenericDomain in {
   // Operation, reg.
   let hasSideEffects = 0 in
   def SSr : SS4AIi8<opcss, MRMSrcReg,
-      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3),
+      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
       !if(Is2Addr,
           !strconcat(OpcodeStr,
               "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -6987,7 +6444,7 @@ let ExeDomain = GenericDomain in {
   // Intrinsic operation, reg.
   let isCodeGenOnly = 1 in
   def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
-        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
         !if(Is2Addr,
             !strconcat(OpcodeStr,
                 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -6998,7 +6455,7 @@ let ExeDomain = GenericDomain in {
 
   // Intrinsic operation, mem.
   def SSm : SS4AIi8<opcss, MRMSrcMem,
-        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
+        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
         !if(Is2Addr,
             !strconcat(OpcodeStr,
                 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -7011,7 +6468,7 @@ let ExeDomain = GenericDomain in {
   // Operation, reg.
   let hasSideEffects = 0 in
   def SDr : SS4AIi8<opcsd, MRMSrcReg,
-        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3),
+        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
         !if(Is2Addr,
             !strconcat(OpcodeStr,
                 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -7022,7 +6479,7 @@ let ExeDomain = GenericDomain in {
   // Intrinsic operation, reg.
   let isCodeGenOnly = 1 in
   def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
-        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
         !if(Is2Addr,
             !strconcat(OpcodeStr,
                 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -7033,7 +6490,7 @@ let ExeDomain = GenericDomain in {
 
   // Intrinsic operation, mem.
   def SDm : SS4AIi8<opcsd, MRMSrcMem,
-        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
+        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
         !if(Is2Addr,
             !strconcat(OpcodeStr,
                 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -7059,7 +6516,9 @@ let Predicates = [HasAVX] in {
   defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
                                   int_x86_sse41_round_ss,
                                   int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+}
 
+let Predicates = [UseAVX] in {
   def : Pat<(ffloor FR32:$src),
             (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
   def : Pat<(f64 (ffloor FR64:$src)),
@@ -7080,7 +6539,9 @@ let Predicates = [HasAVX] in {
             (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
   def : Pat<(f64 (ftrunc FR64:$src)),
             (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+}
 
+let Predicates = [HasAVX] in {
   def : Pat<(v4f32 (ffloor VR128:$src)),
             (VROUNDPSr VR128:$src, (i32 0x1))>;
   def : Pat<(v4f32 (fnearbyint VR128:$src)),
@@ -7284,7 +6745,7 @@ let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
 
 // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
 multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
-                                 Intrinsic IntId128,
+                                 Intrinsic IntId128, PatFrag ld_frag,
                                  X86FoldableSchedWrite Sched> {
   def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src),
@@ -7295,7 +6756,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                      (ins i128mem:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set VR128:$dst,
-                       (IntId128 (bitconvert (memopv2i64 addr:$src))))]>,
+                       (IntId128 (bitconvert (ld_frag addr:$src))))]>,
                     Sched<[Sched.Folded]>;
 }
 
@@ -7303,53 +6764,12 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
 // model, although the naming is misleading.
 let Predicates = [HasAVX] in
 defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
-                                         int_x86_sse41_phminposuw,
+                                         int_x86_sse41_phminposuw, loadv2i64,
                                          WriteVecIMul>, VEX;
 defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
-                                         int_x86_sse41_phminposuw,
+                                         int_x86_sse41_phminposuw, memopv2i64,
                                          WriteVecIMul>;
 
-/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
-multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
-                              Intrinsic IntId128, bit Is2Addr = 1,
-                              OpndItins itins = DEFAULT_ITINS> {
-  let isCommutable = 1 in
-  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
-       (ins VR128:$src1, VR128:$src2),
-       !if(Is2Addr,
-           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))],
-       itins.rr>, Sched<[itins.Sched]>;
-  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
-       (ins VR128:$src1, i128mem:$src2),
-       !if(Is2Addr,
-           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set VR128:$dst,
-         (IntId128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))],
-       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
-}
-
-/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
-multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
-                                Intrinsic IntId256,
-                                X86FoldableSchedWrite Sched> {
-  let isCommutable = 1 in
-  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
-       (ins VR256:$src1, VR256:$src2),
-       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
-       Sched<[Sched]>;
-  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
-       (ins VR256:$src1, i256mem:$src2),
-       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-       [(set VR256:$dst,
-         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
-       Sched<[Sched.Folded, ReadAfterLd]>;
-}
-
-
 /// SS48I_binop_rm - Simple SSE41 binary operator.
 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
@@ -7398,7 +6818,7 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
   let isCommutable = 0 in
   defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
@@ -7429,7 +6849,7 @@ let Predicates = [HasAVX] in {
                                    SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
 }
 
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX] in {
   let isCommutable = 0 in
   defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
@@ -7483,7 +6903,7 @@ let Constraints = "$src1 = $dst" in {
                                   SSE_INTMUL_ITINS_P, 1>;
 }
 
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
   defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                  memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
                                  VEX_4V;
@@ -7493,10 +6913,10 @@ let Predicates = [HasAVX] in {
 }
 let Predicates = [HasAVX2] in {
   defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
-                                  memopv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
+                                  loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
                                   VEX_4V, VEX_L;
   defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
-                                  memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                   VEX_4V, VEX_L;
 }
 
@@ -7514,7 +6934,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                  OpndItins itins = DEFAULT_ITINS> {
   let isCommutable = 1 in
   def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
-        (ins RC:$src1, RC:$src2, i8imm:$src3),
+        (ins RC:$src1, RC:$src2, u8imm:$src3),
         !if(Is2Addr,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -7523,7 +6943,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
         [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
         Sched<[itins.Sched]>;
   def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
-        (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
         !if(Is2Addr,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -7580,13 +7000,13 @@ let Predicates = [HasAVX] in {
 
 let Predicates = [HasAVX2] in {
   let isCommutable = 0 in {
-  defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
-                                  VR256, loadv4i64, i256mem, 0,
-                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
   defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                   VR256, loadv4i64, i256mem, 0,
                                   DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
   }
+  defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
+                                  VR256, loadv4i64, i256mem, 0,
+                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -7734,7 +7154,7 @@ let Predicates = [UseAVX] in {
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
             (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (VBLENDPSrri (v4i32 (V_SET0)), VR128:$src, (i8 1))>;
+            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
             (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
 
@@ -7769,7 +7189,7 @@ let Predicates = [UseSSE41] in {
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
             (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
   def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
             (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>;
 }
@@ -7909,141 +7329,149 @@ let Constraints = "$src1 = $dst" in
 //===----------------------------------------------------------------------===//
 
 // Packed Compare Implicit Length Strings, Return Mask
-multiclass pseudo_pcmpistrm<string asm> {
+multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
   def REG : PseudoI<(outs VR128:$dst),
-                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
     [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
                                                   imm:$src3))]>;
   def MEM : PseudoI<(outs VR128:$dst),
-                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
     [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
-                       (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
+                       (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
 }
 
 let Defs = [EFLAGS], usesCustomInserter = 1 in {
-  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
-  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
+  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
+                         Requires<[HasAVX]>;
+  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
+                         Requires<[UseSSE42]>;
 }
 
 multiclass pcmpistrm_SS42AI<string asm> {
   def rr : SS42AI<0x62, MRMSrcReg, (outs),
-    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
     !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
     []>, Sched<[WritePCmpIStrM]>;
   let mayLoad = 1 in
   def rm :SS42AI<0x62, MRMSrcMem, (outs),
-    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
     !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
     []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
 }
 
-let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
+let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
   let Predicates = [HasAVX] in
   defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
   defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm"> ;
 }
 
 // Packed Compare Explicit Length Strings, Return Mask
-multiclass pseudo_pcmpestrm<string asm> {
+multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
   def REG : PseudoI<(outs VR128:$dst),
-                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
     [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                        VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
   def MEM : PseudoI<(outs VR128:$dst),
-                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
     [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
-                       (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
+                       (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
 }
 
 let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
-  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
-  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
+  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
+                         Requires<[HasAVX]>;
+  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
+                         Requires<[UseSSE42]>;
 }
 
 multiclass SS42AI_pcmpestrm<string asm> {
   def rr : SS42AI<0x60, MRMSrcReg, (outs),
-    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
     !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
     []>, Sched<[WritePCmpEStrM]>;
   let mayLoad = 1 in
   def rm : SS42AI<0x60, MRMSrcMem, (outs),
-    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
     !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
     []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
 }
 
-let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
   let Predicates = [HasAVX] in
   defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
   defm PCMPESTRM128 :  SS42AI_pcmpestrm<"pcmpestrm">;
 }
 
 // Packed Compare Implicit Length Strings, Return Index
-multiclass pseudo_pcmpistri<string asm> {
+multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
   def REG : PseudoI<(outs GR32:$dst),
-                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
     [(set GR32:$dst, EFLAGS,
       (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
   def MEM : PseudoI<(outs GR32:$dst),
-                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
     [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
-                              (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
+                              (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
 }
 
 let Defs = [EFLAGS], usesCustomInserter = 1 in {
-  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
-  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
+  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
+                      Requires<[HasAVX]>;
+  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
+                      Requires<[UseSSE42]>;
 }
 
 multiclass SS42AI_pcmpistri<string asm> {
   def rr : SS42AI<0x63, MRMSrcReg, (outs),
-    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
     !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
     []>, Sched<[WritePCmpIStrI]>;
   let mayLoad = 1 in
   def rm : SS42AI<0x63, MRMSrcMem, (outs),
-    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
     !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
     []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
 }
 
-let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
+let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
   let Predicates = [HasAVX] in
   defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
   defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
 }
 
 // Packed Compare Explicit Length Strings, Return Index
-multiclass pseudo_pcmpestri<string asm> {
+multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
   def REG : PseudoI<(outs GR32:$dst),
-                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
     [(set GR32:$dst, EFLAGS,
       (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
   def MEM : PseudoI<(outs GR32:$dst),
-                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
     [(set GR32:$dst, EFLAGS,
-      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
+      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
        imm:$src5))]>;
 }
 
 let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
-  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
-  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
+  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
+                      Requires<[HasAVX]>;
+  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
+                      Requires<[UseSSE42]>;
 }
 
 multiclass SS42AI_pcmpestri<string asm> {
   def rr : SS42AI<0x61, MRMSrcReg, (outs),
-    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
     !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
     []>, Sched<[WritePCmpEStrI]>;
   let mayLoad = 1 in
   def rm : SS42AI<0x61, MRMSrcMem, (outs),
-    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
     !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
     []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
 }
 
-let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
   let Predicates = [HasAVX] in
   defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
   defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
@@ -8123,13 +7551,13 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
 
 let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
   def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
-                         (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                          "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                            (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                             (i8 imm:$src3)))]>, TA;
   def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
-                         (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                          "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                            (int_x86_sha1rnds4 VR128:$src1,
@@ -8157,8 +7585,8 @@ def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
 // AES-NI Instructions
 //===----------------------------------------------------------------------===//
 
-multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
-                              Intrinsic IntId128, bit Is2Addr = 1> {
+multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
+                             PatFrag ld_frag, bit Is2Addr = 1> {
   def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
        (ins VR128:$src1, VR128:$src2),
        !if(Is2Addr,
@@ -8172,31 +7600,31 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set VR128:$dst,
-         (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>,
+         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
        Sched<[WriteAESDecEncLd, ReadAfterLd]>;
 }
 
 // Perform One Round of an AES Encryption/Decryption Flow
 let Predicates = [HasAVX, HasAES] in {
   defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
-                         int_x86_aesni_aesenc, 0>, VEX_4V;
+                         int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V;
   defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
-                         int_x86_aesni_aesenclast, 0>, VEX_4V;
+                         int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V;
   defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
-                         int_x86_aesni_aesdec, 0>, VEX_4V;
+                         int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V;
   defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
-                         int_x86_aesni_aesdeclast, 0>, VEX_4V;
+                         int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
-                         int_x86_aesni_aesenc>;
+                         int_x86_aesni_aesenc, memopv2i64>;
   defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
-                         int_x86_aesni_aesenclast>;
+                         int_x86_aesni_aesenclast, memopv2i64>;
   defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
-                         int_x86_aesni_aesdec>;
+                         int_x86_aesni_aesdec, memopv2i64>;
   defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
-                         int_x86_aesni_aesdeclast>;
+                         int_x86_aesni_aesdeclast, memopv2i64>;
 }
 
 // Perform the AES InvMixColumn Transformation
@@ -8227,26 +7655,26 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
 // AES Round Key Generation Assist
 let Predicates = [HasAVX, HasAES] in {
   def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
-      (ins VR128:$src1, i8imm:$src2),
+      (ins VR128:$src1, u8imm:$src2),
       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set VR128:$dst,
         (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
       Sched<[WriteAESKeyGen]>, VEX;
   def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
-      (ins i128mem:$src1, i8imm:$src2),
+      (ins i128mem:$src1, u8imm:$src2),
       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set VR128:$dst,
         (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
       Sched<[WriteAESKeyGenLd]>, VEX;
 }
 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
-  (ins VR128:$src1, i8imm:$src2),
+  (ins VR128:$src1, u8imm:$src2),
   "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
   [(set VR128:$dst,
     (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
   Sched<[WriteAESKeyGen]>;
 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
-  (ins i128mem:$src1, i8imm:$src2),
+  (ins i128mem:$src1, u8imm:$src2),
   "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
   [(set VR128:$dst,
     (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
@@ -8257,15 +7685,16 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
 //===----------------------------------------------------------------------===//
 
 // AVX carry-less Multiplication instructions
+let isCommutable = 1 in
 def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
-           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set VR128:$dst,
              (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
            Sched<[WriteCLMul]>;
 
 def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
-           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                               (loadv2i64 addr:$src2), imm:$src3))]>,
@@ -8273,15 +7702,16 @@ def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
 
 // Carry-less Multiplication instructions
 let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in
 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
-           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
            "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            [(set VR128:$dst,
              (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
              IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
 
 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
-           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
            "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                               (memopv2i64 addr:$src2), imm:$src3))],
@@ -8320,7 +7750,7 @@ let Predicates = [HasSSE4A] in {
 
 let Constraints = "$src = $dst" in {
 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
-                 (ins VR128:$src, i8imm:$len, i8imm:$idx),
+                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
                  "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                  [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
                                     imm:$idx))]>, PD;
@@ -8331,7 +7761,7 @@ def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                                  VR128:$mask))]>, PD;
 
 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
-                   (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
+                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                    "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                    [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
                                       VR128:$src2, imm:$len, imm:$idx))]>, XD;
@@ -8422,14 +7852,14 @@ def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
 //===----------------------------------------------------------------------===//
 // VINSERTF128 - Insert packed floating-point values
 //
-let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
-          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
+          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
           "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
 let mayLoad = 1 in
 def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
-          (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
+          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
           "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
 }
@@ -8496,14 +7926,14 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
 //===----------------------------------------------------------------------===//
 // VEXTRACTF128 - Extract packed floating-point values
 //
-let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
-          (ins VR256:$src1, i8imm:$src2),
+          (ins VR256:$src1, u8imm:$src2),
           "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
           []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
 let mayStore = 1 in
 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
-          (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
+          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
           "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
           []>, Sched<[WriteStore]>, VEX, VEX_L;
 }
@@ -8624,15 +8054,15 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
              Sched<[WriteFShuffleLd, ReadAfterLd]>;
 
   def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
-             (ins RC:$src1, i8imm:$src2),
+             (ins RC:$src1, u8imm:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
              Sched<[WriteFShuffle]>;
   def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
-             (ins x86memop_f:$src1, i8imm:$src2),
+             (ins x86memop_f:$src1, u8imm:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set RC:$dst,
-               (vt (X86VPermilpi (memop addr:$src1), (i8 imm:$src2))))]>, VEX,
+               (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
              Sched<[WriteFShuffleLd]>;
 }
 
@@ -8689,13 +8119,13 @@ def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
 //
 let ExeDomain = SSEPackedSingle in {
 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
-          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                               (i8 imm:$src3))))]>, VEX_4V, VEX_L,
           Sched<[WriteFShuffle]>;
 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
-          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
+          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
                              (i8 imm:$src3)))]>, VEX_4V, VEX_L,
@@ -8756,7 +8186,7 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
              "vcvtph2ps\t{$src, $dst|$dst, $src}",
              [(set RC:$dst, (Int VR128:$src))]>,
              T8PD, VEX, Sched<[WriteCvtF2F]>;
-  let neverHasSideEffects = 1, mayLoad = 1 in
+  let hasSideEffects = 0, mayLoad = 1 in
   def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
              Sched<[WriteCvtF2FLd]>;
@@ -8764,14 +8194,14 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
 
 multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
   def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
-               (ins RC:$src1, i32i8imm:$src2),
+               (ins RC:$src1, i32u8imm:$src2),
                "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
                TAPD, VEX, Sched<[WriteCvtF2F]>;
-  let neverHasSideEffects = 1, mayStore = 1,
+  let hasSideEffects = 0, mayStore = 1,
       SchedRW = [WriteCvtF2FLd, WriteRMW] in
   def mr : Ii8<0x1D, MRMDestMem, (outs),
-               (ins x86memop:$dst, RC:$src1, i32i8imm:$src2),
+               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
                "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                TAPD, VEX;
 }
@@ -8814,13 +8244,13 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
                  X86MemOperand x86memop> {
   let isCommutable = 1 in
   def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
-        (ins RC:$src1, RC:$src2, i8imm:$src3),
+        (ins RC:$src1, RC:$src2, u8imm:$src3),
         !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
         [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
         Sched<[WriteBlend]>, VEX_4V;
   def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
-        (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
         !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
         [(set RC:$dst,
@@ -9061,14 +8491,14 @@ defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                          ValueType OpVT, X86FoldableSchedWrite Sched> {
   def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
-                     (ins VR256:$src1, i8imm:$src2),
+                     (ins VR256:$src1, u8imm:$src2),
                      !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                      Sched<[Sched]>, VEX, VEX_L;
   def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
-                     (ins i256mem:$src1, i8imm:$src2),
+                     (ins i256mem:$src1, u8imm:$src2),
                      !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
@@ -9087,13 +8517,13 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
 //
 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
-          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                             (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
           VEX_4V, VEX_L;
 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
-          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
+          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                              (i8 imm:$src3)))]>,
@@ -9122,14 +8552,14 @@ def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
 //===----------------------------------------------------------------------===//
 // VINSERTI128 - Insert packed integer values
 //
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
-          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
+          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
           "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
 let mayLoad = 1 in
 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
-          (ins VR256:$src1, i128mem:$src2, i8imm:$src3),
+          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
           "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
 }
@@ -9177,14 +8607,14 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
 // VEXTRACTI128 - Extract packed integer values
 //
 def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
-          (ins VR256:$src1, i8imm:$src2),
+          (ins VR256:$src1, u8imm:$src2),
           "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
           [(set VR128:$dst,
             (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
           Sched<[WriteShuffle256]>, VEX, VEX_L;
-let neverHasSideEffects = 1, mayStore = 1 in
+let hasSideEffects = 0, mayStore = 1 in
 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
-          (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
+          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
           "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
           Sched<[WriteStore]>, VEX, VEX_L;
 
@@ -9260,6 +8690,115 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                 int_x86_avx2_maskstore_q,
                                 int_x86_avx2_maskstore_q_256>, VEX_W;
 
+def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
+         (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+
+def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
+         (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+
+def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
+         (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
+         (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
+         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
+                             (bc_v8f32 (v8i32 immAllZerosV)))),
+         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
+         (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr),
+                       VR256:$mask)>;
+
+def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
+         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))),
+         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))),
+         (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
+                       VR256:$mask)>;
+
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
+         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask),
+                             (bc_v4f32 (v4i32 immAllZerosV)))),
+         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))),
+         (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr),
+                       VR128:$mask)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
+         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))),
+         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))),
+         (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr),
+                       VR128:$mask)>;
+
+def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
+         (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+
+def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
+         (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
+
+def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
+         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
+                             (v4f64 immAllZerosV))),
+         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
+         (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr),
+                       VR256:$mask)>;
+
+def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
+         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
+                             (bc_v4i64 (v8i32 immAllZerosV)))),
+         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
+
+def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))),
+         (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
+                       VR256:$mask)>;
+
+def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
+         (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
+         (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
+         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
+                             (v2f64 immAllZerosV))),
+         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))),
+         (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr),
+                       VR128:$mask)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
+         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
+                             (bc_v2i64 (v4i32 immAllZerosV)))),
+         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
+
+def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))),
+         (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr),
+                       VR128:$mask)>;
 
 //===----------------------------------------------------------------------===//
 // Variable Bit Shifts
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index d0bb523..c706d43 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -49,6 +49,7 @@ def SHL64ri  : RIi8<0xC1, MRM4r, (outs GR64:$dst),
                     "shl{q}\t{$src2, $dst|$dst, $src2}",
                     [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
                     IIC_SR>;
+} // isConvertibleToThreeAddress = 1
 
 // NOTE: We don't include patterns for shifts of a register by one, because
 // 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one).
@@ -62,7 +63,6 @@ def SHL32r1  : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
 def SHL64r1  : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
                  "shl{q}\t$dst", [], IIC_SR>;
 } // hasSideEffects = 0
-} // isConvertibleToThreeAddress = 1
 } // Constraints = "$src = $dst", SchedRW
 
 
@@ -289,11 +289,11 @@ def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
                  "sar{w}\t{%cl, $dst|$dst, cl}",
                  [(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
                  IIC_SR>, OpSize16;
-def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), 
+def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
                  "sar{l}\t{%cl, $dst|$dst, cl}",
                  [(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
                  IIC_SR>, OpSize32;
-def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), 
+def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
                  "sar{q}\t{%cl, $dst|$dst, cl}",
                  [(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
                  IIC_SR>;
@@ -347,7 +347,7 @@ def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
 let Uses = [CL] in
 def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
                 "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
-  
+
 def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
                 "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
 def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
@@ -381,7 +381,7 @@ def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
 let Uses = [CL] in
 def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
                 "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
-  
+
 def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
                 "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
 def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
@@ -397,7 +397,7 @@ def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
 let Uses = [CL] in
 def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
                  "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
-                 
+
 def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
                  "rcr{q}\t$dst", [], IIC_SR>;
 def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
@@ -493,7 +493,7 @@ def ROL32ri  : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
                    "rol{l}\t{$src2, $dst|$dst, $src2}",
                    [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
                    IIC_SR>, OpSize32;
-def ROL64ri  : RIi8<0xC1, MRM0r, (outs GR64:$dst), 
+def ROL64ri  : RIi8<0xC1, MRM0r, (outs GR64:$dst),
                     (ins GR64:$src1, i8imm:$src2),
                     "rol{q}\t{$src2, $dst|$dst, $src2}",
                     [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))],
@@ -600,7 +600,7 @@ def ROR32ri  : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
                    "ror{l}\t{$src2, $dst|$dst, $src2}",
                    [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))],
                    IIC_SR>, OpSize32;
-def ROR64ri  : RIi8<0xC1, MRM1r, (outs GR64:$dst), 
+def ROR64ri  : RIi8<0xC1, MRM1r, (outs GR64:$dst),
                     (ins GR64:$src1, i8imm:$src2),
                     "ror{q}\t{$src2, $dst|$dst, $src2}",
                     [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))],
@@ -635,11 +635,11 @@ def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
                  "ror{w}\t{%cl, $dst|$dst, cl}",
                  [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
                  IIC_SR>, OpSize16;
-def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), 
+def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
                  "ror{l}\t{%cl, $dst|$dst, cl}",
                  [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
                  IIC_SR>, OpSize32;
-def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), 
+def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
                   "ror{q}\t{%cl, $dst|$dst, cl}",
                   [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
                   IIC_SR>;
@@ -688,19 +688,19 @@ def ROR64m1  : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
 let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
 
 let Uses = [CL] in {
-def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), 
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
                    (ins GR16:$src1, GR16:$src2),
                    "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                    [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
                     IIC_SHD16_REG_CL>,
                    TB, OpSize16;
-def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), 
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
                    (ins GR16:$src1, GR16:$src2),
                    "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                    [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
                     IIC_SHD16_REG_CL>,
                    TB, OpSize16;
-def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), 
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
                    (ins GR32:$src1, GR32:$src2),
                    "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                    [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
@@ -710,58 +710,58 @@ def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
                    "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                    [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
                    IIC_SHD32_REG_CL>, TB, OpSize32;
-def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), 
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
                     (ins GR64:$src1, GR64:$src2),
                     "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                     [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
-                    IIC_SHD64_REG_CL>, 
+                    IIC_SHD64_REG_CL>,
                     TB;
-def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), 
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
                     (ins GR64:$src1, GR64:$src2),
                     "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                     [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
-                    IIC_SHD64_REG_CL>, 
+                    IIC_SHD64_REG_CL>,
                     TB;
 }
 
 let isCommutable = 1 in {  // These instructions commute to each other.
 def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
-                     (outs GR16:$dst), 
+                     (outs GR16:$dst),
                      (ins GR16:$src1, GR16:$src2, i8imm:$src3),
                      "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
                                       (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
                      TB, OpSize16;
 def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
-                     (outs GR16:$dst), 
+                     (outs GR16:$dst),
                      (ins GR16:$src1, GR16:$src2, i8imm:$src3),
                      "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
                                       (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
                      TB, OpSize16;
 def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
-                     (outs GR32:$dst), 
+                     (outs GR32:$dst),
                      (ins GR32:$src1, GR32:$src2, i8imm:$src3),
                      "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
                                       (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
                  TB, OpSize32;
 def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
-                     (outs GR32:$dst), 
+                     (outs GR32:$dst),
                      (ins GR32:$src1, GR32:$src2, i8imm:$src3),
                      "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
                                       (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
                  TB, OpSize32;
 def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
-                      (outs GR64:$dst), 
+                      (outs GR64:$dst),
                       (ins GR64:$src1, GR64:$src2, i8imm:$src3),
                       "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                       [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
                                        (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
                  TB;
 def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
-                      (outs GR64:$dst), 
+                      (outs GR64:$dst),
                       (ins GR64:$src1, GR64:$src2, i8imm:$src3),
                       "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                       [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
@@ -789,7 +789,7 @@ def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
                   "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                   [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
                     addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
-                    
+
 def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
                     "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                     [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
@@ -807,7 +807,7 @@ def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
                                       (i8 imm:$src3)), addr:$dst)],
                                       IIC_SHD16_MEM_IM>,
                     TB, OpSize16;
-def SHRD16mri8 : Ii8<0xAC, MRMDestMem, 
+def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
                      (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
                      "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                     [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
@@ -822,7 +822,7 @@ def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
                                       (i8 imm:$src3)), addr:$dst)],
                                       IIC_SHD32_MEM_IM>,
                     TB, OpSize32;
-def SHRD32mri8 : Ii8<0xAC, MRMDestMem, 
+def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
                      (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
                      "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
@@ -837,7 +837,7 @@ def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
                                        (i8 imm:$src3)), addr:$dst)],
                                        IIC_SHD64_MEM_IM>,
                  TB;
-def SHRD64mri8 : RIi8<0xAC, MRMDestMem, 
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
                       (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
                       "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                       [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
@@ -859,7 +859,7 @@ def ROT64L2R_imm8  : SDNodeXForm<imm, [{
 }]>;
 
 multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2),
                !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                []>, TAXD, VEX, Sched<[WriteShift]>;
@@ -872,7 +872,7 @@ let neverHasSideEffects = 1 in {
 }
 
 multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> {
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
   def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
              VEX_4VOp3, Sched<[WriteShift]>;
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 8cabdd0..0350566 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -38,9 +38,6 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
               [(int_x86_int (i8 3))], IIC_INT3>;
 } // SchedRW
 
-def : Pat<(debugtrap),
-          (INT3)>;
-
 // The long form of "int $3" turns into int3 as a size optimization.
 // FIXME: This doesn't work because InstAlias can't match immediate constants.
 //def : InstAlias<"int\t$3", (INT3)>;
@@ -71,6 +68,10 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>,
              Requires<[In64BitMode]>;
 } // SchedRW
 
+def : Pat<(debugtrap),
+          (INT3)>, Requires<[NotPS4]>;
+def : Pat<(debugtrap),
+          (INT (i8 0x41))>, Requires<[IsPS4]>;
 
 //===----------------------------------------------------------------------===//
 //  Input/Output Instructions.
@@ -207,7 +208,7 @@ def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
 let SchedRW = [WriteSystem] in {
 def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
 
-def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), 
+def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                 "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
                 OpSize16;
 def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
@@ -215,14 +216,14 @@ def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                 OpSize16;
 
 // i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo.
-def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), 
+def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
                 "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
                 OpSize32;
 def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                 "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
                 OpSize32;
 // i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo.
-def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), 
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
                  "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
 def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
                  "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
@@ -240,7 +241,7 @@ def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                 "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
                 OpSize32;
 def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
-                 "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; 
+                 "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
 def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                  "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
 
@@ -260,7 +261,7 @@ def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
              "ltr{w}\t$src", [], IIC_LTR>, TB;
 def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
              "ltr{w}\t$src", [], IIC_LTR>, TB;
-             
+
 def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
                  "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>,
                  OpSize16, Requires<[Not64BitMode]>;
@@ -347,31 +348,31 @@ def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
                 "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16;
 def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
                 "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32;
-                
+
 def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
                 "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
 def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
                 "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
 def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
                  "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
-                
+
 def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
                 "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16;
 def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
                 "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32;
-                
+
 def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
                 "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
 def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
                 "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
 def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
                  "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
-                
+
 def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
                 "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
 def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
                 "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
-                
+
 def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
                  "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
 
@@ -408,7 +409,7 @@ def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins),
                 "sldt{w}\t$dst", [], IIC_SLDT>, TB;
 def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
                 "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB;
-                
+
 // LLDT is not interpreted specially in 64-bit mode because there is no sign
 //   extension.
 def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
@@ -437,19 +438,21 @@ def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
 //===----------------------------------------------------------------------===//
 // Specialized register support
 let SchedRW = [WriteSystem] in {
+let Uses = [EAX, ECX, EDX] in
 def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
+let Defs = [EAX, EDX], Uses = [ECX] in
 def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
 
 let Defs = [RAX, RDX], Uses = [ECX] in
   def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>,
               TB;
 
-def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), 
+def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
                 "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB;
-def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), 
+def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
                 "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB;
 // no m form encodable; use SMSW16m
-def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), 
+def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
                  "smsw{q}\t$dst", [], IIC_SMSW>, TB;
 
 // For memory operands, there is only a 16-bit form
@@ -485,15 +488,28 @@ let Uses = [RDX, RAX] in {
   def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
                "xsave\t$dst", []>, TB;
   def XSAVE64 : RI<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
-                 "xsave{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>;
+                 "xsave64\t$dst", []>, TB, Requires<[In64BitMode]>;
   def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
                "xrstor\t$dst", []>, TB;
   def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
-                 "xrstor{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>;
+                 "xrstor64\t$dst", []>, TB, Requires<[In64BitMode]>;
   def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
-                  "xsaveopt\t$dst", []>, TB;
+                  "xsaveopt\t$dst", []>, PS;
   def XSAVEOPT64 : RI<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
-                    "xsaveopt{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>;
+                    "xsaveopt64\t$dst", []>, PS, Requires<[In64BitMode]>;
+
+  def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+                "xrstors\t$dst", []>, TB;
+  def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+                  "xrstors64\t$dst", []>, TB, Requires<[In64BitMode]>;
+  def XSAVEC : I<0xC7, MRM4m, (outs opaque512mem:$dst), (ins),
+                "xsavec\t$dst", []>, TB;
+  def XSAVEC64 : RI<0xC7, MRM4m, (outs opaque512mem:$dst), (ins),
+                  "xsavec64\t$dst", []>, TB, Requires<[In64BitMode]>;
+  def XSAVES : I<0xC7, MRM5m, (outs opaque512mem:$dst), (ins),
+                "xsaves\t$dst", []>, TB;
+  def XSAVES64 : RI<0xC7, MRM5m, (outs opaque512mem:$dst), (ins),
+                  "xsaves64\t$dst", []>, TB, Requires<[In64BitMode]>;
 }
 } // SchedRW
 
@@ -559,7 +575,13 @@ def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
 
 //===----------------------------------------------------------------------===//
 // SMAP Instruction
-let Predicates = [HasSMAP], Defs = [EFLAGS] in {
+let Defs = [EFLAGS] in {
   def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
   def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
 }
+
+//===----------------------------------------------------------------------===//
+// SMX Instruction
+let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
+  def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
+}
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 4940efc..7267d75 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -23,9 +23,12 @@ def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
                "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
              Requires<[HasRTM]>;
 
-let isBranch = 1, isTerminator = 1, Defs = [EAX] in
-def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst),
-                         "xbegin\t$dst", []>, Requires<[HasRTM]>;
+let isBranch = 1, isTerminator = 1, Defs = [EAX] in {
+def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst),
+                         "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>;
+def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
+                         "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>;
+}
 
 def XEND : I<0x01, MRM_D5, (outs), (ins),
              "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 45e2ff0..8455b8d 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -20,21 +20,23 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
            [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
 }
 
-defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
-defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>;
-defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>;
-defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>;
-defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>;
-defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>;
-defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>;
-defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>;
-defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>;
-defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>;
-defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>;
-defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>;
-defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
-defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
-defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
+let ExeDomain = SSEPackedInt in {
+  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
+  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
+  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
+  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
+  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
+  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
+  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
+  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
+  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
+  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
+  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
+  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
+  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
+  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
+  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
+}
 
 // Scalar load 2 addr operand instructions
 multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
@@ -47,11 +49,6 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP;
 }
 
-defm VFRCZSS   : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
-                 ssmem, sse_load_f32>;
-defm VFRCZSD   : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
-                 sdmem, sse_load_f64>;
-
 multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
                      PatFrag memop> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -62,9 +59,6 @@ multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
 }
 
-defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
-defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
-
 multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
                      PatFrag memop> {
   def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
@@ -75,8 +69,19 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L;
 }
 
-defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, memopv8f32>;
-defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, memopv4f64>;
+let ExeDomain = SSEPackedSingle in {
+  defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+                           ssmem, sse_load_f32>;
+  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>;
+  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+  defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+                           sdmem, sse_load_f64>;
+  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>;
+  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
+}
 
 multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
@@ -87,28 +92,30 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
            (ins VR128:$src1, i128mem:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-              (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>,
+              (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2))))]>,
            XOP_4V, VEX_W;
   def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
            (ins i128mem:$src1, VR128:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-              (Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>,
+              (Int (bitconvert (loadv2i64 addr:$src1)), VR128:$src2))]>,
              XOP_4VOp3;
 }
 
-defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
-defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
-defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
-defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
-defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
-defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
-defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
-defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
-defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
-defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
-defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
-defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
+let ExeDomain = SSEPackedInt in {
+  defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
+  defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
+  defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
+  defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
+  defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
+  defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
+  defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
+  defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
+  defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
+  defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
+  defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
+  defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
+}
 
 multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
@@ -119,16 +126,19 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
            (ins i128mem:$src1, i8imm:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-             (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, XOP;
+             (Int (bitconvert (loadv2i64 addr:$src1)), imm:$src2))]>, XOP;
 }
 
-defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
-defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
-defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
-defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
+let ExeDomain = SSEPackedInt in {
+  defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
+  defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
+  defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
+  defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
+}
 
 // Instruction where second source can be memory, but third must be register
 multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  let isCommutable = 1 in
   def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src1, VR128:$src2, VR128:$src3),
            !strconcat(OpcodeStr,
@@ -140,48 +150,66 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR128:$dst,
-              (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+              (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
               VR128:$src3))]>, XOP_4V, VEX_I8IMM;
 }
 
-defm VPMADCSWD  : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
-defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
-defm VPMACSWW   : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
-defm VPMACSWD   : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
-defm VPMACSSWW  : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
-defm VPMACSSWD  : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
-defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
-defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
-defm VPMACSSDD  : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
-defm VPMACSDQL  : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
-defm VPMACSDQH  : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
-defm VPMACSDD   : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
+let ExeDomain = SSEPackedInt in {
+  defm VPMADCSWD  : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
+  defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
+  defm VPMACSWW   : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
+  defm VPMACSWD   : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
+  defm VPMACSSWW  : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
+  defm VPMACSSWD  : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
+  defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
+  defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
+  defm VPMACSSDD  : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
+  defm VPMACSDQL  : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
+  defm VPMACSDQH  : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
+  defm VPMACSDD   : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
+}
 
 // Instruction where second source can be memory, third must be imm8
-multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> {
+  let isCommutable = 1 in
   def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
-           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
-           !strconcat(OpcodeStr,
-           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-           [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+           (ins VR128:$src1, VR128:$src2, XOPCC:$cc),
+           !strconcat("vpcom${cc}", Suffix,
+           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, i8immZExt3:$cc))]>,
            XOP_4V;
   def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
-           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
-           !strconcat(OpcodeStr,
-           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           (ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
+           !strconcat("vpcom${cc}", Suffix,
+           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-             (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
-              imm:$src3))]>, XOP_4V;
+             (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+              i8immZExt3:$cc))]>, XOP_4V;
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+                 (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+                 !strconcat("vpcom", Suffix,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                 []>, XOP_4V;
+    let mayLoad = 1 in
+    def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+                 (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+                 !strconcat("vpcom", Suffix,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                 []>, XOP_4V;
+  }
 }
 
-defm VPCOMB  : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
-defm VPCOMW  : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
-defm VPCOMD  : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
-defm VPCOMQ  : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
-defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
-defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
-defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
-defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+  defm VPCOMB  : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>;
+  defm VPCOMW  : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>;
+  defm VPCOMD  : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>;
+  defm VPCOMQ  : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>;
+  defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>;
+  defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>;
+  defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>;
+  defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>;
+}
 
 // Instruction where either second or third source can be memory
 multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
@@ -197,20 +225,22 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR128:$dst,
              (Int VR128:$src1, VR128:$src2,
-              (bitconvert (memopv2i64 addr:$src3))))]>,
+              (bitconvert (loadv2i64 addr:$src3))))]>,
            XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
   def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, i128mem:$src2, VR128:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR128:$dst,
-             (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+             (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
               VR128:$src3))]>,
            XOP_4V, VEX_I8IMM;
 }
 
-defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
-defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
+let ExeDomain = SSEPackedInt in {
+  defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
+  defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
+}
 
 multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
@@ -225,19 +255,20 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR256:$dst,
              (Int VR256:$src1, VR256:$src2,
-              (bitconvert (memopv4i64 addr:$src3))))]>,
+              (bitconvert (loadv4i64 addr:$src3))))]>,
            XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
   def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
            (ins VR256:$src1, f256mem:$src2, VR256:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR256:$dst,
-             (Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)),
+             (Int VR256:$src1, (bitconvert (loadv4i64 addr:$src2)),
               VR256:$src3))]>,
            XOP_4V, VEX_I8IMM, VEX_L;
 }
 
-defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
+let ExeDomain = SSEPackedInt in
+  defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
 
 multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
                   Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
@@ -282,8 +313,11 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
         VEX_L;
 }
 
-defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
-                         int_x86_xop_vpermil2pd_256, memopv2f64, memopv4f64>;
-defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
-                         int_x86_xop_vpermil2ps_256, memopv4f32, memopv8f32>;
+let ExeDomain = SSEPackedDouble in
+  defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
+                           int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>;
+
+let ExeDomain = SSEPackedSingle in
+  defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
+                           int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>;
 
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index d252f72..e436811 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -20,8 +20,9 @@ enum IntrinsicType {
   INTR_NO_TYPE,
   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
   INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
-  CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, 
-  INTR_TYPE_1OP_MASK_RM
+  CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
+  INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK, INTR_TYPE_SCALAR_MASK_RM,
+  COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM, BLEND
 };
 
 struct IntrinsicData {
@@ -51,7 +52,7 @@ static const IntrinsicData IntrinsicsWithChain[] = {
   X86_INTRINSIC_DATA(addcarry_u64,  ADX, X86ISD::ADC, 0),
   X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
   X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
-  
+
   X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
   X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
   X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
@@ -60,7 +61,7 @@ static const IntrinsicData IntrinsicsWithChain[] = {
   X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
   X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
   X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
-  
+
   X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
                      X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
   X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH,
@@ -69,7 +70,55 @@ static const IntrinsicData IntrinsicsWithChain[] = {
                      X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm),
   X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
                      X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
-  
+
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
   X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
   X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
   X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
@@ -78,7 +127,7 @@ static const IntrinsicData IntrinsicsWithChain[] = {
   X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
   X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
   X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
-  
+
   X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH,
                      X86::VSCATTERPF0DPDm, X86::VSCATTERPF1DPDm),
   X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH,
@@ -87,7 +136,7 @@ static const IntrinsicData IntrinsicsWithChain[] = {
                      X86::VSCATTERPF0QPDm, X86::VSCATTERPF1QPDm),
   X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH,
                      X86::VSCATTERPF0QPSm, X86::VSCATTERPF1QPSm),
-  
+
   X86_INTRINSIC_DATA(rdpmc,     RDPMC,  X86ISD::RDPMC_DAG, 0),
   X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
   X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
@@ -97,7 +146,7 @@ static const IntrinsicData IntrinsicsWithChain[] = {
   X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0),
   X86_INTRINSIC_DATA(rdtsc,     RDTSC,  X86ISD::RDTSC_DAG, 0),
   X86_INTRINSIC_DATA(rdtscp,    RDTSC,  X86ISD::RDTSCP_DAG, 0),
-  
+
   X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0),
   X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0),
   X86_INTRINSIC_DATA(xtest,     XTEST,  X86ISD::XTEST,  0),
@@ -122,6 +171,12 @@ static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) {
  * the alphabetical order.
  */
 static const IntrinsicData  IntrinsicsWithoutChain[] = {
+  X86_INTRINSIC_DATA(avx2_packssdw,     INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx2_packsswb,     INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx2_packusdw,     INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_packuswb,     INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_permd,        INTR_TYPE_2OP, X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx2_permps,       INTR_TYPE_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_phadd_d,      INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(avx2_phadd_w,      INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(avx2_phsub_d,      INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -138,27 +193,79 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_pminu_b,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
   X86_INTRINSIC_DATA(avx2_pminu_d,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
   X86_INTRINSIC_DATA(avx2_pminu_w,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
+  X86_INTRINSIC_DATA(avx2_pmovsxbd,     INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmovsxbq,     INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmovsxbw,     INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmovsxdq,     INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmovsxwd,     INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmovsxwq,     INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmovzxbd,     INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmovzxbq,     INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmovzxbw,     INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmovzxdq,     INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmovzxwd,     INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmovzxwq,     INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(avx2_pmul_dq,      INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+  X86_INTRINSIC_DATA(avx2_pmulh_w,      INTR_TYPE_2OP, ISD::MULHS, 0),
+  X86_INTRINSIC_DATA(avx2_pmulhu_w,     INTR_TYPE_2OP, ISD::MULHU, 0),
+  X86_INTRINSIC_DATA(avx2_pmulu_dq,     INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+  X86_INTRINSIC_DATA(avx2_pshuf_b,      INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+  X86_INTRINSIC_DATA(avx2_psign_b,      INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+  X86_INTRINSIC_DATA(avx2_psign_d,      INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+  X86_INTRINSIC_DATA(avx2_psign_w,      INTR_TYPE_2OP, X86ISD::PSIGN, 0),
   X86_INTRINSIC_DATA(avx2_psll_d,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
   X86_INTRINSIC_DATA(avx2_psll_q,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
   X86_INTRINSIC_DATA(avx2_psll_w,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
   X86_INTRINSIC_DATA(avx2_pslli_d,      VSHIFT, X86ISD::VSHLI, 0),
   X86_INTRINSIC_DATA(avx2_pslli_q,      VSHIFT, X86ISD::VSHLI, 0),
   X86_INTRINSIC_DATA(avx2_pslli_w,      VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx2_psllv_d,      INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx2_psllv_d_256,  INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx2_psllv_q,      INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx2_psllv_q_256,  INTR_TYPE_2OP, ISD::SHL, 0),
   X86_INTRINSIC_DATA(avx2_psra_d,       INTR_TYPE_2OP, X86ISD::VSRA, 0),
   X86_INTRINSIC_DATA(avx2_psra_w,       INTR_TYPE_2OP, X86ISD::VSRA, 0),
   X86_INTRINSIC_DATA(avx2_psrai_d,      VSHIFT, X86ISD::VSRAI, 0),
   X86_INTRINSIC_DATA(avx2_psrai_w,      VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx2_psrav_d,      INTR_TYPE_2OP, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx2_psrav_d_256,  INTR_TYPE_2OP, ISD::SRA, 0),
   X86_INTRINSIC_DATA(avx2_psrl_d,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
   X86_INTRINSIC_DATA(avx2_psrl_q,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
   X86_INTRINSIC_DATA(avx2_psrl_w,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
   X86_INTRINSIC_DATA(avx2_psrli_d,      VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(avx2_psrli_q,      VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(avx2_psrli_w,      VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx2_psrlv_d,      INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrlv_d_256,  INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrlv_q,      INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrlv_q_256,  INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx2_psubus_b,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx2_psubus_w,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx2_vperm2i128,   INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
   X86_INTRINSIC_DATA(avx512_exp2_pd,    INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0),
   X86_INTRINSIC_DATA(avx512_exp2_ps,    INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0),
+  X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
+                     X86ISD::FADD_RND),
+  X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
+                     X86ISD::FADD_RND),
+  X86_INTRINSIC_DATA(avx512_mask_blend_b_128,  BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_b_256,  BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_b_512,  BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_d_128,  BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_d_256,  BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_d_512,  BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_pd_128, BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_pd_256, BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_pd_512, BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_ps_128, BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_ps_256, BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_ps_512, BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_q_128,  BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_q_256,  BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_q_512,  BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_w_128,  BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_w_256,  BLEND, X86ISD::SELECT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_blend_w_512,  BLEND, X86ISD::SELECT, 0),
   X86_INTRINSIC_DATA(avx512_mask_cmp_b_128,     CMP_MASK_CC,  X86ISD::CMPM, 0),
   X86_INTRINSIC_DATA(avx512_mask_cmp_b_256,     CMP_MASK_CC,  X86ISD::CMPM, 0),
   X86_INTRINSIC_DATA(avx512_mask_cmp_b_512,     CMP_MASK_CC,  X86ISD::CMPM, 0),
@@ -171,6 +278,64 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_cmp_w_128,     CMP_MASK_CC,  X86ISD::CMPM, 0),
   X86_INTRINSIC_DATA(avx512_mask_cmp_w_256,     CMP_MASK_CC,  X86ISD::CMPM, 0),
   X86_INTRINSIC_DATA(avx512_mask_cmp_w_512,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_d_128,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_d_256,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_d_512,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_q_128,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_q_256,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_q_512,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+
+  X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
+                     X86ISD::FDIV_RND),
+  X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
+                     X86ISD::FDIV_RND),
+  X86_INTRINSIC_DATA(avx512_mask_expand_d_128,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_d_256,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_d_512,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_q_128,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_q_256,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_q_512,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  
+  X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
+                     X86ISD::FMUL_RND),
+  X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
+                     X86ISD::FMUL_RND),
   X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128,  CMP_MASK,  X86ISD::PCMPEQM, 0),
   X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256,  CMP_MASK,  X86ISD::PCMPEQM, 0),
   X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512,  CMP_MASK,  X86ISD::PCMPEQM, 0),
@@ -195,12 +360,32 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128,  CMP_MASK,  X86ISD::PCMPGTM, 0),
   X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256,  CMP_MASK,  X86ISD::PCMPGTM, 0),
   X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psll_d,        INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psll_q,        INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
   X86_INTRINSIC_DATA(avx512_mask_pslli_d,       VSHIFT_MASK, X86ISD::VSHLI, 0),
   X86_INTRINSIC_DATA(avx512_mask_pslli_q,       VSHIFT_MASK, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psllv_d,       INTR_TYPE_2OP_MASK, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psllv_q,       INTR_TYPE_2OP_MASK, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psra_d,        INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psra_q,        INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
   X86_INTRINSIC_DATA(avx512_mask_psrai_d,       VSHIFT_MASK, X86ISD::VSRAI, 0),
   X86_INTRINSIC_DATA(avx512_mask_psrai_q,       VSHIFT_MASK, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrav_d,       INTR_TYPE_2OP_MASK, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrav_q,       INTR_TYPE_2OP_MASK, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrl_d,        INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrl_q,        INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
   X86_INTRINSIC_DATA(avx512_mask_psrli_d,       VSHIFT_MASK, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(avx512_mask_psrli_q,       VSHIFT_MASK, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrlv_d,       INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrlv_q,       INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_rndscale_sd,   INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::RNDSCALE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_rndscale_ss,   INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::RNDSCALE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
+                     X86ISD::FSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
+                     X86ISD::FSUB_RND),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
@@ -215,27 +400,118 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_rcp28_pd,   INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
   X86_INTRINSIC_DATA(avx512_rcp28_ps,   INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
+  X86_INTRINSIC_DATA(avx512_rcp28_sd,   INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
+  X86_INTRINSIC_DATA(avx512_rcp28_ss,   INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
   X86_INTRINSIC_DATA(avx_hadd_pd_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
   X86_INTRINSIC_DATA(avx_hadd_ps_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
   X86_INTRINSIC_DATA(avx_hsub_pd_256,   INTR_TYPE_2OP, X86ISD::FHSUB, 0),
   X86_INTRINSIC_DATA(avx_hsub_ps_256,   INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+  X86_INTRINSIC_DATA(avx_max_pd_256,    INTR_TYPE_2OP, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(avx_max_ps_256,    INTR_TYPE_2OP, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(avx_min_pd_256,    INTR_TYPE_2OP, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(avx_min_ps_256,    INTR_TYPE_2OP, X86ISD::FMIN, 0),
   X86_INTRINSIC_DATA(avx_sqrt_pd_256,   INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(avx_sqrt_ps_256,   INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
   X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
   X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_128,    FMA_OP_MASK, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_256,    FMA_OP_MASK, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_512,    FMA_OP_MASK, X86ISD::FMADD,
+                     X86ISD::FMADD_RND),
+  X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_128,    FMA_OP_MASK, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_256,    FMA_OP_MASK, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_512,    FMA_OP_MASK, X86ISD::FMADD,
+                     X86ISD::FMADD_RND),
+  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
+                     X86ISD::FMADDSUB_RND),
+  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB,
+                     X86ISD::FMADDSUB_RND),
+  X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_128,    FMA_OP_MASK, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_256,    FMA_OP_MASK, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_512,    FMA_OP_MASK, X86ISD::FMSUB,
+                     X86ISD::FMSUB_RND),
+  X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_128,    FMA_OP_MASK, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_256,    FMA_OP_MASK, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_512,    FMA_OP_MASK, X86ISD::FMSUB,
+                     X86ISD::FMSUB_RND),
+  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_512, FMA_OP_MASK, X86ISD::FMSUBADD,
+                     X86ISD::FMSUBADD_RND),
+  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_512, FMA_OP_MASK, X86ISD::FMSUBADD,
+                     X86ISD::FMSUBADD_RND),
+  X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_128,   FMA_OP_MASK, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_256,   FMA_OP_MASK, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_512,   FMA_OP_MASK, X86ISD::FNMADD,
+                     X86ISD::FNMADD_RND),
+  X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_128,   FMA_OP_MASK, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_256,   FMA_OP_MASK, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_512,   FMA_OP_MASK, X86ISD::FNMADD,
+                     X86ISD::FNMADD_RND),
+  X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_128,   FMA_OP_MASK, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_256,   FMA_OP_MASK, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_512,   FMA_OP_MASK, X86ISD::FNMSUB,
+                     X86ISD::FNMSUB_RND),
+  X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_128,   FMA_OP_MASK, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_256,   FMA_OP_MASK, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_512,   FMA_OP_MASK, X86ISD::FNMSUB,
+                     X86ISD::FNMSUB_RND),
+  X86_INTRINSIC_DATA(fma_vfmadd_pd,        INTR_TYPE_3OP, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmadd_pd_256,    INTR_TYPE_3OP, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmadd_ps,        INTR_TYPE_3OP, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmadd_ps_256,    INTR_TYPE_3OP, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmaddsub_pd,     INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmaddsub_ps,     INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmsub_pd,        INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmsub_pd_256,    INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmsub_ps,        INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmsub_ps_256,    INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmsubadd_pd,     INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmsubadd_ps,     INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_vfnmadd_pd,       INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfnmadd_pd_256,   INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfnmadd_ps,       INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfnmadd_ps_256,   INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfnmsub_pd,       INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfnmsub_pd_256,   INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfnmsub_ps,       INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfnmsub_ps_256,   INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
   X86_INTRINSIC_DATA(sse2_comieq_sd,    COMI, X86ISD::COMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_comige_sd,    COMI, X86ISD::COMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse2_comigt_sd,    COMI, X86ISD::COMI, ISD::SETGT),
   X86_INTRINSIC_DATA(sse2_comile_sd,    COMI, X86ISD::COMI, ISD::SETLE),
   X86_INTRINSIC_DATA(sse2_comilt_sd,    COMI, X86ISD::COMI, ISD::SETLT),
   X86_INTRINSIC_DATA(sse2_comineq_sd,   COMI, X86ISD::COMI, ISD::SETNE),
+  X86_INTRINSIC_DATA(sse2_max_pd,       INTR_TYPE_2OP, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(sse2_min_pd,       INTR_TYPE_2OP, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(sse2_pmaxs_w,      INTR_TYPE_2OP, X86ISD::SMAX, 0),
   X86_INTRINSIC_DATA(sse2_pmaxu_b,      INTR_TYPE_2OP, X86ISD::UMAX, 0),
   X86_INTRINSIC_DATA(sse2_pmins_w,      INTR_TYPE_2OP, X86ISD::SMIN, 0),
   X86_INTRINSIC_DATA(sse2_pminu_b,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
+  X86_INTRINSIC_DATA(sse2_pmulh_w,      INTR_TYPE_2OP, ISD::MULHS, 0),
+  X86_INTRINSIC_DATA(sse2_pmulhu_w,     INTR_TYPE_2OP, ISD::MULHU, 0),
+  X86_INTRINSIC_DATA(sse2_pmulu_dq,     INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+  X86_INTRINSIC_DATA(sse2_pshuf_d,      INTR_TYPE_2OP, X86ISD::PSHUFD, 0),
+  X86_INTRINSIC_DATA(sse2_pshufh_w,     INTR_TYPE_2OP, X86ISD::PSHUFHW, 0),
+  X86_INTRINSIC_DATA(sse2_pshufl_w,     INTR_TYPE_2OP, X86ISD::PSHUFLW, 0),
   X86_INTRINSIC_DATA(sse2_psll_d,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
   X86_INTRINSIC_DATA(sse2_psll_q,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
   X86_INTRINSIC_DATA(sse2_psll_w,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
@@ -266,6 +542,7 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse3_hsub_pd,      INTR_TYPE_2OP, X86ISD::FHSUB, 0),
   X86_INTRINSIC_DATA(sse3_hsub_ps,      INTR_TYPE_2OP, X86ISD::FHSUB, 0),
   X86_INTRINSIC_DATA(sse41_insertps,    INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
+  X86_INTRINSIC_DATA(sse41_packusdw,    INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(sse41_pmaxsb,      INTR_TYPE_2OP, X86ISD::SMAX, 0),
   X86_INTRINSIC_DATA(sse41_pmaxsd,      INTR_TYPE_2OP, X86ISD::SMAX, 0),
   X86_INTRINSIC_DATA(sse41_pmaxud,      INTR_TYPE_2OP, X86ISD::UMAX, 0),
@@ -274,12 +551,27 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse41_pminsd,      INTR_TYPE_2OP, X86ISD::SMIN, 0),
   X86_INTRINSIC_DATA(sse41_pminud,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
   X86_INTRINSIC_DATA(sse41_pminuw,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
+  X86_INTRINSIC_DATA(sse41_pmovsxbd,    INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmovsxbq,    INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmovsxbw,    INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmovsxdq,    INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmovsxwd,    INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmovsxwq,    INTR_TYPE_1OP, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmovzxbd,    INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmovzxbq,    INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmovzxbw,    INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmovzxdq,    INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmovzxwd,    INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmovzxwq,    INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+  X86_INTRINSIC_DATA(sse41_pmuldq,      INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
   X86_INTRINSIC_DATA(sse_comieq_ss,     COMI, X86ISD::COMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_comige_ss,     COMI, X86ISD::COMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse_comigt_ss,     COMI, X86ISD::COMI, ISD::SETGT),
   X86_INTRINSIC_DATA(sse_comile_ss,     COMI, X86ISD::COMI, ISD::SETLE),
   X86_INTRINSIC_DATA(sse_comilt_ss,     COMI, X86ISD::COMI, ISD::SETLT),
   X86_INTRINSIC_DATA(sse_comineq_ss,    COMI, X86ISD::COMI, ISD::SETNE),
+  X86_INTRINSIC_DATA(sse_max_ps,        INTR_TYPE_2OP, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(sse_min_ps,        INTR_TYPE_2OP, X86ISD::FMIN, 0),
   X86_INTRINSIC_DATA(sse_sqrt_ps,       INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(sse_ucomieq_ss,    COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_ucomige_ss,    COMI, X86ISD::UCOMI, ISD::SETGE),
@@ -290,7 +582,11 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
-  X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0)
+  X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+  X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+  X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+  X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+  X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0)
 };
 
 /*
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 4e0d594..6af59d4 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -74,11 +74,11 @@ namespace llvm {
   X86AsmPrinter::StackMapShadowTracker::~StackMapShadowTracker() {}
 
   void
-  X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &MF) {
+  X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &F) {
+    MF = &F;
     CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
-        *TM.getSubtargetImpl()->getInstrInfo(),
-        *TM.getSubtargetImpl()->getRegisterInfo(), *TM.getSubtargetImpl(),
-        MF.getContext()));
+        *MF->getSubtarget().getInstrInfo(), *MF->getSubtarget().getRegisterInfo(),
+        MF->getSubtarget(), MF->getContext()));
   }
 
   void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
@@ -100,7 +100,7 @@ namespace llvm {
     if (InShadow && CurrentShadowSize < RequiredShadowSize) {
       InShadow = false;
       EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
-               TM.getSubtarget<X86Subtarget>().is64Bit(), STI);
+               MF->getSubtarget<X86Subtarget>().is64Bit(), STI);
     }
   }
 
@@ -112,8 +112,8 @@ namespace llvm {
 
 X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
                                X86AsmPrinter &asmprinter)
-: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()),
-  MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {}
+    : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()),
+      AsmPrinter(asmprinter) {}
 
 MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
   return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
@@ -124,7 +124,7 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
 /// operand to an MCSymbol.
 MCSymbol *X86MCInstLower::
 GetSymbolFromOperand(const MachineOperand &MO) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
 
   SmallString<128> Name;
@@ -390,9 +390,8 @@ static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
   Inst.addOperand(Seg);
 }
 
-static unsigned getRetOpcode(const X86Subtarget &Subtarget)
-{
-	return Subtarget.is64Bit() ? X86::RETQ : X86::RETL;
+static unsigned getRetOpcode(const X86Subtarget &Subtarget) {
+  return Subtarget.is64Bit() ? X86::RETQ : X86::RETL;
 }
 
 void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
@@ -510,6 +509,7 @@ ReSimplify:
   // inputs modeled as normal uses instead of implicit uses.  As such, truncate
   // off all but the first operand (the callee).  FIXME: Change isel.
   case X86::TAILJMPr64:
+  case X86::TAILJMPr64_REX:
   case X86::CALL64r:
   case X86::CALL64pcrel32: {
     unsigned Opcode = OutMI.getOpcode();
@@ -546,6 +546,24 @@ ReSimplify:
     break;
   }
 
+  case X86::DEC16r:
+  case X86::DEC32r:
+  case X86::INC16r:
+  case X86::INC32r:
+    // If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions.
+    if (!AsmPrinter.getSubtarget().is64Bit()) {
+      unsigned Opcode;
+      switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
+      case X86::DEC16r: Opcode = X86::DEC16r_alt; break;
+      case X86::DEC32r: Opcode = X86::DEC32r_alt; break;
+      case X86::INC16r: Opcode = X86::INC16r_alt; break;
+      case X86::INC32r: Opcode = X86::INC32r_alt; break;
+      }
+      OutMI.setOpcode(Opcode);
+    }
+    break;
+
   // These are pseudo-ops for OR to help with the OR->ADD transformation.  We do
   // this with an ugly goto in case the resultant OR uses EAX and needs the
   // short form.
@@ -559,28 +577,6 @@ ReSimplify:
   case X86::ADD32ri8_DB:  OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
   case X86::ADD64ri8_DB:  OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
 
-  // The assembler backend wants to see branches in their small form and relax
-  // them to their large form.  The JIT can only handle the large form because
-  // it does not do relaxation.  For now, translate the large form to the
-  // small one here.
-  case X86::JMP_4: OutMI.setOpcode(X86::JMP_1); break;
-  case X86::JO_4:  OutMI.setOpcode(X86::JO_1); break;
-  case X86::JNO_4: OutMI.setOpcode(X86::JNO_1); break;
-  case X86::JB_4:  OutMI.setOpcode(X86::JB_1); break;
-  case X86::JAE_4: OutMI.setOpcode(X86::JAE_1); break;
-  case X86::JE_4:  OutMI.setOpcode(X86::JE_1); break;
-  case X86::JNE_4: OutMI.setOpcode(X86::JNE_1); break;
-  case X86::JBE_4: OutMI.setOpcode(X86::JBE_1); break;
-  case X86::JA_4:  OutMI.setOpcode(X86::JA_1); break;
-  case X86::JS_4:  OutMI.setOpcode(X86::JS_1); break;
-  case X86::JNS_4: OutMI.setOpcode(X86::JNS_1); break;
-  case X86::JP_4:  OutMI.setOpcode(X86::JP_1); break;
-  case X86::JNP_4: OutMI.setOpcode(X86::JNP_1); break;
-  case X86::JL_4:  OutMI.setOpcode(X86::JL_1); break;
-  case X86::JGE_4: OutMI.setOpcode(X86::JGE_1); break;
-  case X86::JLE_4: OutMI.setOpcode(X86::JLE_1); break;
-  case X86::JG_4:  OutMI.setOpcode(X86::JG_1); break;
-
   // Atomic load and store require a separate pseudo-inst because Acquire
   // implies mayStore and Release implies mayLoad; fix these to regular MOV
   // instructions here
@@ -625,13 +621,13 @@ ReSimplify:
   // MOV64ao8, MOV64o8a
   // XCHG16ar, XCHG32ar, XCHG64ar
   case X86::MOV8mr_NOREX:
-  case X86::MOV8mr:     SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao8); break;
+  case X86::MOV8mr:     SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o32a); break;
   case X86::MOV8rm_NOREX:
-  case X86::MOV8rm:     SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o8a); break;
-  case X86::MOV16mr:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao16); break;
-  case X86::MOV16rm:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o16a); break;
-  case X86::MOV32mr:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break;
-  case X86::MOV32rm:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break;
+  case X86::MOV8rm:     SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao32); break;
+  case X86::MOV16mr:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o32a); break;
+  case X86::MOV16rm:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao32); break;
+  case X86::MOV32mr:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break;
+  case X86::MOV32rm:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break;
 
   case X86::ADC8ri:     SimplifyShortImmForm(OutMI, X86::ADC8i8);    break;
   case X86::ADC16ri:    SimplifyShortImmForm(OutMI, X86::ADC16i16);  break;
@@ -808,6 +804,58 @@ static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSu
   } // while (NumBytes)
 }
 
+static void LowerSTATEPOINT(MCStreamer &OS, StackMaps &SM,
+                            const MachineInstr &MI, bool Is64Bit,
+                            const TargetMachine& TM,
+                            const MCSubtargetInfo& STI,
+                            X86MCInstLower &MCInstLowering) {
+  assert(Is64Bit && "Statepoint currently only supports X86-64");
+
+  // Lower call target and choose correct opcode
+  const MachineOperand &call_target = StatepointOpers(&MI).getCallTarget();
+  MCOperand call_target_mcop;
+  unsigned call_opcode;
+  switch (call_target.getType()) {
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+    call_target_mcop = MCInstLowering.LowerSymbolOperand(
+      call_target,
+      MCInstLowering.GetSymbolFromOperand(call_target));
+    call_opcode = X86::CALL64pcrel32;
+    // Currently, we only support relative addressing with statepoints.
+    // Otherwise, we'll need a scratch register to hold the target
+    // address.  You'll fail asserts during load & relocation if this
+    // symbol is to far away. (TODO: support non-relative addressing)
+    break;
+  case MachineOperand::MO_Immediate:
+    call_target_mcop = MCOperand::CreateImm(call_target.getImm());
+    call_opcode = X86::CALL64pcrel32;
+    // Currently, we only support relative addressing with statepoints.
+    // Otherwise, we'll need a scratch register to hold the target
+    // immediate.  You'll fail asserts during load & relocation if this
+    // address is to far away. (TODO: support non-relative addressing)
+    break;
+  case MachineOperand::MO_Register:
+    call_target_mcop = MCOperand::CreateReg(call_target.getReg());
+    call_opcode = X86::CALL64r;
+    break;
+  default:
+    llvm_unreachable("Unsupported operand type in statepoint call target");
+    break;
+  }
+
+  // Emit call
+  MCInst call_inst;
+  call_inst.setOpcode(call_opcode);
+  call_inst.addOperand(call_target_mcop);
+  OS.EmitInstruction(call_inst, STI);
+
+  // Record our statepoint node in the same section used by STACKMAP
+  // and PATCHPOINT
+  SM.recordStatepoint(MI);
+}
+
+
 // Lower a stackmap of the form:
 // <id>, <shadowBytes>, ...
 void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
@@ -941,8 +989,7 @@ static std::string getShuffleComment(const MachineOperand &DstOp,
 
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   X86MCInstLower MCInstLowering(*MF, *this);
-  const X86RegisterInfo *RI = static_cast<const X86RegisterInfo *>(
-      TM.getSubtargetImpl()->getRegisterInfo());
+  const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();
 
   switch (MI->getOpcode()) {
   case TargetOpcode::DBG_VALUE:
@@ -963,8 +1010,14 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     break;
   }
   case X86::TAILJMPr:
+  case X86::TAILJMPm:
   case X86::TAILJMPd:
+  case X86::TAILJMPr64:
+  case X86::TAILJMPm64:
   case X86::TAILJMPd64:
+  case X86::TAILJMPr64_REX:
+  case X86::TAILJMPm64_REX:
+  case X86::TAILJMPd64_REX:
     // Lower these as normal, but add some comments.
     OutStreamer.AddComment("TAILCALL");
     break;
@@ -1030,6 +1083,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
       .addExpr(DotExpr));
     return;
   }
+  case TargetOpcode::STATEPOINT:
+    return LowerSTATEPOINT(OutStreamer, SM, *MI, Subtarget->is64Bit(), TM,
+                           getSubtargetInfo(), MCInstLowering);
 
   case TargetOpcode::STACKMAP:
     return LowerSTACKMAP(*MI);
diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp
index 568dc22..ac2cdc8 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -8,7 +8,26 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
 void X86MachineFunctionInfo::anchor() { }
+
+void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
+  if (!RestoreBasePointerOffset) {
+    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      MF->getSubtarget().getRegisterInfo());
+    unsigned SlotSize = RegInfo->getSlotSize();
+    for (const MCPhysReg *CSR =
+      RegInfo->X86RegisterInfo::getCalleeSavedRegs(MF);
+      unsigned Reg = *CSR;
+       ++CSR)
+    {
+      if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+        RestoreBasePointerOffset -= SlotSize;
+    }
+  }
+}
+
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index 79a51b3..d598b55 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
 #define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
 
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include <vector>
@@ -31,6 +32,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   /// contains stack pointer re-alignment code which requires FP.
   bool ForceFramePointer;
 
+  /// RestoreBasePointerOffset - Non-zero if the function has base pointer
+  /// and makes call to llvm.eh.sjlj.setjmp. When non-zero, the value is a
+  /// displacement from the frame pointer to a slot where the base pointer
+  /// is stashed.
+  signed char RestoreBasePointerOffset;
+
   /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
   /// stack frame in bytes.
   unsigned CalleeSavedFrameSize;
@@ -43,6 +50,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   /// ReturnAddrIndex - FrameIndex for return slot.
   int ReturnAddrIndex;
 
+  /// \brief FrameIndex for return slot.
+  int FrameAddrIndex;
+
   /// TailCallReturnAddrDelta - The number of bytes by which return address
   /// stack slot is moved as the result of tail call optimization.
   int TailCallReturnAddrDelta;
@@ -70,28 +80,22 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   unsigned ArgumentStackSize;
   /// NumLocalDynamics - Number of local-dynamic TLS accesses.
   unsigned NumLocalDynamics;
-
-public:
-  /// Describes a register that needs to be forwarded from the prologue to a
-  /// musttail call.
-  struct Forward {
-    Forward(unsigned VReg, MCPhysReg PReg, MVT VT)
-        : VReg(VReg), PReg(PReg), VT(VT) {}
-    unsigned VReg;
-    MCPhysReg PReg;
-    MVT VT;
-  };
+  /// HasPushSequences - Keeps track of whether this function uses sequences
+  /// of pushes to pass function parameters.
+  bool HasPushSequences;
 
 private:
   /// ForwardedMustTailRegParms - A list of virtual and physical registers
   /// that must be forwarded to every musttail call.
-  std::vector<Forward> ForwardedMustTailRegParms;
+  SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
 
 public:
   X86MachineFunctionInfo() : ForceFramePointer(false),
+                             RestoreBasePointerOffset(0),
                              CalleeSavedFrameSize(0),
                              BytesToPopOnReturn(0),
                              ReturnAddrIndex(0),
+                             FrameAddrIndex(0),
                              TailCallReturnAddrDelta(0),
                              SRetReturnReg(0),
                              GlobalBaseReg(0),
@@ -100,13 +104,16 @@ public:
                              VarArgsGPOffset(0),
                              VarArgsFPOffset(0),
                              ArgumentStackSize(0),
-                             NumLocalDynamics(0) {}
+                             NumLocalDynamics(0),
+                             HasPushSequences(false) {}
 
   explicit X86MachineFunctionInfo(MachineFunction &MF)
     : ForceFramePointer(false),
+      RestoreBasePointerOffset(0),
       CalleeSavedFrameSize(0),
       BytesToPopOnReturn(0),
       ReturnAddrIndex(0),
+      FrameAddrIndex(0),
       TailCallReturnAddrDelta(0),
       SRetReturnReg(0),
       GlobalBaseReg(0),
@@ -115,11 +122,19 @@ public:
       VarArgsGPOffset(0),
       VarArgsFPOffset(0),
       ArgumentStackSize(0),
-      NumLocalDynamics(0) {}
+      NumLocalDynamics(0),
+      HasPushSequences(false) {}
 
   bool getForceFramePointer() const { return ForceFramePointer;}
   void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
 
+  bool getHasPushSequences() const { return HasPushSequences; }
+  void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
+
+  bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; }
+  void setRestoreBasePointer(const MachineFunction *MF);
+  int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
+
   unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
   void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
 
@@ -129,6 +144,9 @@ public:
   int getRAIndex() const { return ReturnAddrIndex; }
   void setRAIndex(int Index) { ReturnAddrIndex = Index; }
 
+  int getFAIndex() const { return FrameAddrIndex; }
+  void setFAIndex(int Index) { FrameAddrIndex = Index; }
+
   int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
   void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
 
@@ -156,7 +174,7 @@ public:
   unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
   void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
 
-  std::vector<Forward> &getForwardedMustTailRegParms() {
+  SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
     return ForwardedMustTailRegParms;
   }
 };
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index adc05b2..143e70b 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -51,7 +51,7 @@ namespace {
   struct PadShortFunc : public MachineFunctionPass {
     static char ID;
     PadShortFunc() : MachineFunctionPass(ID)
-                   , Threshold(4), TM(nullptr), TII(nullptr) {}
+                   , Threshold(4), STI(nullptr), TII(nullptr) {}
 
     bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -79,7 +79,7 @@ namespace {
     // VisitedBBs - Cache of previously visited BBs.
     DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs;
 
-    const TargetMachine *TM;
+    const X86Subtarget *STI;
     const TargetInstrInfo *TII;
   };
 
@@ -93,19 +93,16 @@ FunctionPass *llvm::createX86PadShortFunctions() {
 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
 /// NOOP instructions before early exits.
 bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
-  const AttributeSet &FnAttrs = MF.getFunction()->getAttributes();
-  if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                           Attribute::OptimizeForSize) ||
-      FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                           Attribute::MinSize)) {
+  if (MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) ||
+      MF.getFunction()->hasFnAttribute(Attribute::MinSize)) {
     return false;
   }
 
-  TM = &MF.getTarget();
-  if (!TM->getSubtarget<X86Subtarget>().padShortFunctions())
+  STI = &MF.getSubtarget<X86Subtarget>();
+  if (!STI->padShortFunctions())
     return false;
 
-  TII = TM->getSubtargetImpl()->getInstrInfo();
+  TII = STI->getInstrInfo();
 
   // Search through basic blocks and mark the ones that have early returns
   ReturnBBs.clear();
@@ -195,8 +192,7 @@ bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
       return true;
     }
 
-    CyclesToEnd += TII->getInstrLatency(
-        TM->getSubtargetImpl()->getInstrItineraryData(), MI);
+    CyclesToEnd += TII->getInstrLatency(STI->getInstrItineraryData(), MI);
   }
 
   VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd);
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index a4a366d..cab7ce8 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -66,21 +66,22 @@ X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI)
   Is64Bit = Subtarget.is64Bit();
   IsWin64 = Subtarget.isTargetWin64();
 
+  // Use a callee-saved register as the base pointer.  These registers must
+  // not conflict with any ABI requirements.  For example, in 32-bit mode PIC
+  // requires GOT in the EBX register before function calls via PLT GOT pointer.
   if (Is64Bit) {
     SlotSize = 8;
-    StackPtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ?
-        X86::RSP : X86::ESP;
-    FramePtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ?
-        X86::RBP : X86::EBP;
+    bool Use64BitReg = 
+      Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+    StackPtr = Use64BitReg ? X86::RSP : X86::ESP;
+    FramePtr = Use64BitReg ? X86::RBP : X86::EBP;
+    BasePtr = Use64BitReg ? X86::RBX : X86::EBX;
   } else {
     SlotSize = 4;
     StackPtr = X86::ESP;
     FramePtr = X86::EBP;
+    BasePtr = X86::ESI;
   }
-  // Use a callee-saved register as the base pointer.  These registers must
-  // not conflict with any ABI requirements.  For example, in 32-bit mode PIC
-  // requires GOT in the EBX register before function calls via PLT GOT pointer.
-  BasePtr = Is64Bit ? X86::RBX : X86::ESI;
 }
 
 bool
@@ -354,7 +355,9 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
         "Stack realignment in presence of dynamic allocas is not supported with"
         "this calling convention.");
 
-    for (MCSubRegIterator I(getBaseRegister(), this, /*IncludeSelf=*/true);
+    unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), MVT::i64,
+                                              false);
+    for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true);
          I.isValid(); ++I)
       Reserved.set(*I);
   }
@@ -445,10 +448,8 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
   const Function *F = MF.getFunction();
   unsigned StackAlign =
     MF.getSubtarget().getFrameLowering()->getStackAlignment();
-  bool requiresRealignment =
-    ((MFI->getMaxAlignment() > StackAlign) ||
-     F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                     Attribute::StackAlignment));
+  bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
+                              F->hasFnAttribute(Attribute::StackAlignment));
 
   // If we've requested that we force align the stack do so now.
   if (ForceStackAlign)
@@ -468,8 +469,6 @@ void
 X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                      int SPAdj, unsigned FIOperandNum,
                                      RegScavenger *RS) const {
-  assert(SPAdj == 0 && "Unexpected");
-
   MachineInstr &MI = *II;
   MachineFunction &MF = *MI.getParent()->getParent();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
@@ -506,6 +505,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   } else
     FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
 
+  if (BasePtr == StackPtr)
+    FIOffset += SPAdj;
+
   // The frame index format for stackmaps and patchpoints is different from the
   // X86 format. It only has a FI and an offset.
   if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
@@ -535,6 +537,14 @@ unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return TFI->hasFP(MF) ? FramePtr : StackPtr;
 }
 
+unsigned X86RegisterInfo::getPtrSizedFrameRegister(
+    const MachineFunction &MF) const {
+  unsigned FrameReg = getFrameRegister(MF);
+  if (Subtarget.isTarget64BitILP32())
+    FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false);
+  return FrameReg;
+}
+
 namespace llvm {
 unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
                                 bool High) {
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index cc0a7b2..406b1fc 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -122,6 +122,7 @@ public:
 
   // Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const override;
+  unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const;
   unsigned getStackRegister() const { return StackPtr; }
   unsigned getBaseRegister() const { return BasePtr; }
   // FIXME: Move to FrameInfok
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 311a717..2e735fa 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -263,14 +263,22 @@ def FS : X86Reg<"fs", 4>;
 def GS : X86Reg<"gs", 5>;
 
 // Debug registers
-def DR0 : X86Reg<"dr0", 0>;
-def DR1 : X86Reg<"dr1", 1>;
-def DR2 : X86Reg<"dr2", 2>;
-def DR3 : X86Reg<"dr3", 3>;
-def DR4 : X86Reg<"dr4", 4>;
-def DR5 : X86Reg<"dr5", 5>;
-def DR6 : X86Reg<"dr6", 6>;
-def DR7 : X86Reg<"dr7", 7>;
+def DR0  : X86Reg<"dr0",   0>;
+def DR1  : X86Reg<"dr1",   1>;
+def DR2  : X86Reg<"dr2",   2>;
+def DR3  : X86Reg<"dr3",   3>;
+def DR4  : X86Reg<"dr4",   4>;
+def DR5  : X86Reg<"dr5",   5>;
+def DR6  : X86Reg<"dr6",   6>;
+def DR7  : X86Reg<"dr7",   7>;
+def DR8  : X86Reg<"dr8",   8>;
+def DR9  : X86Reg<"dr9",   9>;
+def DR10 : X86Reg<"dr10", 10>;
+def DR11 : X86Reg<"dr11", 11>;
+def DR12 : X86Reg<"dr12", 12>;
+def DR13 : X86Reg<"dr13", 13>;
+def DR14 : X86Reg<"dr14", 14>;
+def DR15 : X86Reg<"dr15", 15>;
 
 // Control registers
 def CR0  : X86Reg<"cr0",   0>;
@@ -317,7 +325,7 @@ def GR8 : RegisterClass<"X86", [i8],  8,
                              R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> {
   let AltOrders = [(sub GR8, AH, BH, CH, DH)];
   let AltOrderSelect = [{
-    return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit();
+    return MF.getSubtarget<X86Subtarget>().is64Bit();
   }];
 }
 
@@ -369,7 +377,7 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8,
                               (add AL, CL, DL, AH, CH, DH, BL, BH)> {
   let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)];
   let AltOrderSelect = [{
-    return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit();
+    return MF.getSubtarget<X86Subtarget>().is64Bit();
   }];
 }
 // GR16_NOREX - GR16 registers which do not require a REX prefix.
@@ -461,18 +469,18 @@ def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
                           256, (sequence "YMM%u", 0, 31)>;
 
 // Mask registers
-def VK1     : RegisterClass<"X86", [i1],    16, (sequence "K%u", 0, 7)> {let Size = 16;}
-def VK2     : RegisterClass<"X86", [v2i1],  16, (add VK1)> {let Size = 16;}
-def VK4     : RegisterClass<"X86", [v4i1],  16, (add VK2)> {let Size = 16;}
-def VK8     : RegisterClass<"X86", [v8i1],  16, (add VK4)> {let Size = 16;}
+def VK1     : RegisterClass<"X86", [i1],    8,  (sequence "K%u", 0, 7)> {let Size = 8;}
+def VK2     : RegisterClass<"X86", [v2i1],  8,  (add VK1)> {let Size = 8;}
+def VK4     : RegisterClass<"X86", [v4i1],  8,  (add VK2)> {let Size = 8;}
+def VK8     : RegisterClass<"X86", [v8i1],  8,  (add VK4)> {let Size = 8;}
 def VK16    : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
 def VK32    : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
 def VK64    : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
 
-def VK1WM   : RegisterClass<"X86", [i1],    16, (sub VK1, K0)> {let Size = 16;}
-def VK2WM   : RegisterClass<"X86", [v2i1],  16, (sub VK2, K0)> {let Size = 16;}
-def VK4WM   : RegisterClass<"X86", [v4i1],  16, (sub VK4, K0)> {let Size = 16;}
-def VK8WM   : RegisterClass<"X86", [v8i1],  16, (sub VK8, K0)> {let Size = 16;}
+def VK1WM   : RegisterClass<"X86", [i1],    8,  (sub VK1, K0)> {let Size = 8;}
+def VK2WM   : RegisterClass<"X86", [v2i1],  8,  (sub VK2, K0)> {let Size = 8;}
+def VK4WM   : RegisterClass<"X86", [v4i1],  8,  (sub VK4, K0)> {let Size = 8;}
+def VK8WM   : RegisterClass<"X86", [v8i1],  8,  (sub VK8, K0)> {let Size = 8;}
 def VK16WM  : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>   {let Size = 16;}
 def VK32WM  : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
 def VK64WM  : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 73a3230..61c0600 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -1895,7 +1895,7 @@ def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
 
 // x,m / v,v,m.
 def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> {
-  let Latency = 4;
+  let Latency = 9;
   let NumMicroOps = 2;
   let ResourceCycles = [1, 1];
 }
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 821044f..7feabf6 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -57,7 +57,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
                                              bool isVolatile,
                                          MachinePointerInfo DstPtrInfo) const {
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>();
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
 
 #ifndef NDEBUG
   // If the base register might conflict with our physical registers, bail out.
@@ -199,17 +200,15 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
   return Chain;
 }
 
-SDValue
-X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
-                                        SDValue Chain, SDValue Dst, SDValue Src,
-                                        SDValue Size, unsigned Align,
-                                        bool isVolatile, bool AlwaysInline,
-                                         MachinePointerInfo DstPtrInfo,
-                                         MachinePointerInfo SrcPtrInfo) const {
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
   // This requires the copy size to be a constant, preferably
   // within a subtarget-specific limit.
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>();
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
   if (!ConstantSize)
     return SDValue();
   uint64_t SizeVal = ConstantSize->getZExtValue();
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 9d877c9..de30c75 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -257,17 +257,17 @@ void X86Subtarget::initializeEnvironment() {
   HasVLX = false;
   HasADX = false;
   HasSHA = false;
-  HasSGX = false;
   HasPRFCHW = false;
   HasRDSEED = false;
-  HasSMAP = false;
   IsBTMemSlow = false;
   IsSHLDSlow = false;
   IsUAMemFast = false;
-  HasVectorUAMem = false;
+  IsUAMem32Slow = false;
+  HasSSEUnalignedMem = false;
   HasCmpxchg16b = false;
   UseLeaForSP = false;
-  HasSlowDivide = false;
+  HasSlowDivide32 = false;
+  HasSlowDivide64 = false;
   PadShortFunctions = false;
   CallRegIndirect = false;
   LEAUsesAG = false;
@@ -280,46 +280,6 @@ void X86Subtarget::initializeEnvironment() {
   MaxInlineSizeThreshold = 128;
 }
 
-static std::string computeDataLayout(const Triple &TT) {
-  // X86 is little endian
-  std::string Ret = "e";
-
-  Ret += DataLayout::getManglingComponent(TT);
-  // X86 and x32 have 32 bit pointers.
-  if ((TT.isArch64Bit() &&
-       (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) ||
-      !TT.isArch64Bit())
-    Ret += "-p:32:32";
-
-  // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
-  if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
-    Ret += "-i64:64";
-  else
-    Ret += "-f64:32:64";
-
-  // Some ABIs align long double to 128 bits, others to 32.
-  if (TT.isOSNaCl())
-    ; // No f80
-  else if (TT.isArch64Bit() || TT.isOSDarwin())
-    Ret += "-f80:128";
-  else
-    Ret += "-f80:32";
-
-  // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
-  if (TT.isArch64Bit())
-    Ret += "-n8:16:32:64";
-  else
-    Ret += "-n8:16:32";
-
-  // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
-  if (!TT.isArch64Bit() && TT.isOSWindows())
-    Ret += "-S32";
-  else
-    Ret += "-S128";
-
-  return Ret;
-}
-
 X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
                                                             StringRef FS) {
   initializeEnvironment();
@@ -332,16 +292,16 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
                            unsigned StackAlignOverride)
     : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
       PICStyle(PICStyles::None), TargetTriple(TT),
-      DL(computeDataLayout(TargetTriple)),
       StackAlignOverride(StackAlignOverride),
       In64BitMode(TargetTriple.getArch() == Triple::x86_64),
       In32BitMode(TargetTriple.getArch() == Triple::x86 &&
                   TargetTriple.getEnvironment() != Triple::CODE16),
       In16BitMode(TargetTriple.getArch() == Triple::x86 &&
                   TargetTriple.getEnvironment() == Triple::CODE16),
-      TSInfo(DL), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
-      TLInfo(TM), FrameLowering(TargetFrameLowering::StackGrowsDown,
-                                getStackAlignment(), is64Bit() ? -8 : -4) {
+      TSInfo(*TM.getDataLayout()),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
+      FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(),
+                    is64Bit() ? -8 : -4) {
   // Determine the PICStyle based on the target selected.
   if (TM.getRelocationModel() == Reloc::Static) {
     // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 091b6c4..4c31f78 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -31,7 +31,7 @@ class GlobalValue;
 class StringRef;
 class TargetMachine;
 
-/// PICStyles - The X86 backend supports a number of different styles of PIC.
+/// The X86 backend supports a number of different styles of PIC.
 ///
 namespace PICStyles {
 enum Style {
@@ -58,138 +58,136 @@ protected:
     Others, IntelAtom, IntelSLM
   };
 
-  /// X86ProcFamily - X86 processor family: Intel Atom, and others
+  /// X86 processor family: Intel Atom, and others
   X86ProcFamilyEnum X86ProcFamily;
 
-  /// PICStyle - Which PIC style to use
-  ///
+  /// Which PIC style to use
   PICStyles::Style PICStyle;
 
-  /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or
-  /// none supported.
+  /// MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
   X86SSEEnum X86SSELevel;
 
-  /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
-  ///
+  /// 3DNow, 3DNow Athlon, or none supported.
   X863DNowEnum X863DNowLevel;
 
-  /// HasCMov - True if this processor has conditional move instructions
+  /// True if this processor has conditional move instructions
   /// (generally pentium pro+).
   bool HasCMov;
 
-  /// HasX86_64 - True if the processor supports X86-64 instructions.
-  ///
+  /// True if the processor supports X86-64 instructions.
   bool HasX86_64;
 
-  /// HasPOPCNT - True if the processor supports POPCNT.
+  /// True if the processor supports POPCNT.
   bool HasPOPCNT;
 
-  /// HasSSE4A - True if the processor supports SSE4A instructions.
+  /// True if the processor supports SSE4A instructions.
   bool HasSSE4A;
 
-  /// HasAES - Target has AES instructions
+  /// Target has AES instructions
   bool HasAES;
 
-  /// HasPCLMUL - Target has carry-less multiplication
+  /// Target has carry-less multiplication
   bool HasPCLMUL;
 
-  /// HasFMA - Target has 3-operand fused multiply-add
+  /// Target has 3-operand fused multiply-add
   bool HasFMA;
 
-  /// HasFMA4 - Target has 4-operand fused multiply-add
+  /// Target has 4-operand fused multiply-add
   bool HasFMA4;
 
-  /// HasXOP - Target has XOP instructions
+  /// Target has XOP instructions
   bool HasXOP;
 
-  /// HasTBM - Target has TBM instructions.
+  /// Target has TBM instructions.
   bool HasTBM;
 
-  /// HasMOVBE - True if the processor has the MOVBE instruction.
+  /// True if the processor has the MOVBE instruction.
   bool HasMOVBE;
 
-  /// HasRDRAND - True if the processor has the RDRAND instruction.
+  /// True if the processor has the RDRAND instruction.
   bool HasRDRAND;
 
-  /// HasF16C - Processor has 16-bit floating point conversion instructions.
+  /// Processor has 16-bit floating point conversion instructions.
   bool HasF16C;
 
-  /// HasFSGSBase - Processor has FS/GS base insturctions.
+  /// Processor has FS/GS base insturctions.
   bool HasFSGSBase;
 
-  /// HasLZCNT - Processor has LZCNT instruction.
+  /// Processor has LZCNT instruction.
   bool HasLZCNT;
 
-  /// HasBMI - Processor has BMI1 instructions.
+  /// Processor has BMI1 instructions.
   bool HasBMI;
 
-  /// HasBMI2 - Processor has BMI2 instructions.
+  /// Processor has BMI2 instructions.
   bool HasBMI2;
 
-  /// HasRTM - Processor has RTM instructions.
+  /// Processor has RTM instructions.
   bool HasRTM;
 
-  /// HasHLE - Processor has HLE.
+  /// Processor has HLE.
   bool HasHLE;
 
-  /// HasADX - Processor has ADX instructions.
+  /// Processor has ADX instructions.
   bool HasADX;
 
-  /// HasSHA - Processor has SHA instructions.
+  /// Processor has SHA instructions.
   bool HasSHA;
 
-  /// HasSGX - Processor has SGX instructions.
-  bool HasSGX;
-
-  /// HasPRFCHW - Processor has PRFCHW instructions.
+  /// Processor has PRFCHW instructions.
   bool HasPRFCHW;
 
-  /// HasRDSEED - Processor has RDSEED instructions.
+  /// Processor has RDSEED instructions.
   bool HasRDSEED;
 
-  /// HasSMAP - Processor has SMAP instructions.
-  bool HasSMAP;
-
-  /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
+  /// True if BT (bit test) of memory instructions are slow.
   bool IsBTMemSlow;
 
-  /// IsSHLDSlow - True if SHLD instructions are slow.
+  /// True if SHLD instructions are slow.
   bool IsSHLDSlow;
 
-  /// IsUAMemFast - True if unaligned memory access is fast.
+  /// True if unaligned memory access is fast.
   bool IsUAMemFast;
 
-  /// HasVectorUAMem - True if SIMD operations can have unaligned memory
-  /// operands. This may require setting a feature bit in the processor.
-  bool HasVectorUAMem;
+  /// True if unaligned 32-byte memory accesses are slow.
+  bool IsUAMem32Slow;
+
+  /// True if SSE operations can have unaligned memory operands.
+  /// This may require setting a configuration bit in the processor.
+  bool HasSSEUnalignedMem;
 
-  /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction;
+  /// True if this processor has the CMPXCHG16B instruction;
   /// this is true for most x86-64 chips, but not the first AMD chips.
   bool HasCmpxchg16b;
 
-  /// UseLeaForSP - True if the LEA instruction should be used for adjusting
+  /// True if the LEA instruction should be used for adjusting
   /// the stack pointer. This is an optimization for Intel Atom processors.
   bool UseLeaForSP;
 
-  /// HasSlowDivide - True if smaller divides are significantly faster than
-  /// full divides and should be used when possible.
-  bool HasSlowDivide;
+  /// True if 8-bit divisions are significantly faster than
+  /// 32-bit divisions and should be used when possible.
+  bool HasSlowDivide32;
+
+  /// True if 16-bit divides are significantly faster than
+  /// 64-bit divisions and should be used when possible.
+  bool HasSlowDivide64;
 
-  /// PadShortFunctions - True if the short functions should be padded to prevent
+  /// True if the short functions should be padded to prevent
   /// a stall when returning too early.
   bool PadShortFunctions;
 
-  /// CallRegIndirect - True if the Calls with memory reference should be converted
+  /// True if the Calls with memory reference should be converted
   /// to a register-based indirect call.
   bool CallRegIndirect;
-  /// LEAUsesAG - True if the LEA instruction inputs have to be ready at
-  ///             address generation (AG) time.
+
+  /// True if the LEA instruction inputs have to be ready at address generation
+  /// (AG) time.
   bool LEAUsesAG;
 
-  /// SlowLEA - True if the LEA instruction with certain arguments is slow
+  /// True if the LEA instruction with certain arguments is slow
   bool SlowLEA;
 
-  /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags
+  /// True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
   /// Use the RSQRT* instructions to optimize square root calculations.
@@ -201,7 +199,7 @@ protected:
   /// For this to be profitable, the cost of FDIV must be
   /// substantially higher than normal FP ops like FADD and FMUL.
   bool UseReciprocalEst;
-  
+
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
 
@@ -220,7 +218,7 @@ protected:
   /// Processor has AVX-512 Vector Length eXtenstions
   bool HasVLX;
 
-  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned stackAlignment;
 
@@ -228,26 +226,24 @@ protected:
   ///
   unsigned MaxInlineSizeThreshold;
 
-  /// TargetTriple - What processor and OS we're targeting.
+  /// What processor and OS we're targeting.
   Triple TargetTriple;
 
   /// Instruction itineraries for scheduling
   InstrItineraryData InstrItins;
 
 private:
-  // Calculates type size & alignment
-  const DataLayout DL;
 
-  /// StackAlignOverride - Override the stack alignment.
+  /// Override the stack alignment.
   unsigned StackAlignOverride;
 
-  /// In64BitMode - True if compiling for 64-bit, false for 16-bit or 32-bit.
+  /// True if compiling for 64-bit, false for 16-bit or 32-bit.
   bool In64BitMode;
 
-  /// In32BitMode - True if compiling for 32-bit, false for 16-bit or 64-bit.
+  /// True if compiling for 32-bit, false for 16-bit or 64-bit.
   bool In32BitMode;
 
-  /// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit.
+  /// True if compiling for 16-bit, false for 32-bit or 64-bit.
   bool In16BitMode;
 
   X86SelectionDAGInfo TSInfo;
@@ -269,7 +265,6 @@ public:
     return &TLInfo;
   }
   const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
-  const DataLayout *getDataLayout() const override { return &DL; }
   const X86FrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
@@ -280,12 +275,12 @@ public:
     return &getInstrInfo()->getRegisterInfo();
   }
 
-  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// Returns the minimum alignment known to hold of the
   /// stack frame on entry to the function and which must be maintained by every
   /// function for this subtarget.
   unsigned getStackAlignment() const { return stackAlignment; }
 
-  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
   unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
 
@@ -294,7 +289,7 @@ public:
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
 private:
-  /// \brief Initialize the full set of dependencies so we can use an initializer
+  /// Initialize the full set of dependencies so we can use an initializer
   /// list for X86Subtarget.
   X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
   void initializeEnvironment();
@@ -316,13 +311,13 @@ public:
   /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
   bool isTarget64BitILP32() const {
     return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 ||
-                           TargetTriple.getOS() == Triple::NaCl);
+                           TargetTriple.isOSNaCl());
   }
 
   /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
   bool isTarget64BitLP64() const {
     return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 &&
-                           TargetTriple.getOS() != Triple::NaCl);
+                           !TargetTriple.isOSNaCl());
   }
 
   PICStyles::Style getPICStyle() const { return PICStyle; }
@@ -363,17 +358,17 @@ public:
   bool hasHLE() const { return HasHLE; }
   bool hasADX() const { return HasADX; }
   bool hasSHA() const { return HasSHA; }
-  bool hasSGX() const { return HasSGX; }
   bool hasPRFCHW() const { return HasPRFCHW; }
   bool hasRDSEED() const { return HasRDSEED; }
-  bool hasSMAP() const { return HasSMAP; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
   bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
-  bool hasVectorUAMem() const { return HasVectorUAMem; }
+  bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
+  bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
-  bool hasSlowDivide() const { return HasSlowDivide; }
+  bool hasSlowDivide32() const { return HasSlowDivide32; }
+  bool hasSlowDivide64() const { return HasSlowDivide64; }
   bool padShortFunctions() const { return PadShortFunctions; }
   bool callRegIndirect() const { return CallRegIndirect; }
   bool LEAusesAG() const { return LEAUsesAG; }
@@ -394,16 +389,14 @@ public:
   const Triple &getTargetTriple() const { return TargetTriple; }
 
   bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
-  bool isTargetFreeBSD() const {
-    return TargetTriple.getOS() == Triple::FreeBSD;
-  }
-  bool isTargetSolaris() const {
-    return TargetTriple.getOS() == Triple::Solaris;
-  }
+  bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); }
+  bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); }
+  bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); }
+  bool isTargetPS4() const { return TargetTriple.isPS4(); }
 
   bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
   bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
-  bool isTargetMacho() const { return TargetTriple.isOSBinFormatMachO(); }
+  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
 
   bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
   bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
@@ -469,13 +462,11 @@ public:
   unsigned char ClassifyGlobalReference(const GlobalValue *GV,
                                         const TargetMachine &TM)const;
 
-  /// ClassifyBlockAddressReference - Classify a blockaddress reference for the
-  /// current subtarget according to how we should reference it in a non-pcrel
-  /// context.
+  /// Classify a blockaddress reference for the current subtarget according to
+  /// how we should reference it in a non-pcrel context.
   unsigned char ClassifyBlockAddressReference() const;
 
-  /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
-  /// to immediate address.
+  /// Return true if the subtarget allows calls to immediate address.
   bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const;
 
   /// This function returns the name of a function which has an interface
@@ -494,8 +485,7 @@ public:
 
   bool enableEarlyIfConversion() const override;
 
-  /// getInstrItins = Return the instruction itineraries based on the
-  /// subtarget selection.
+  /// Return the instruction itineraries based on the subtarget selection.
   const InstrItineraryData *getInstrItineraryData() const override {
     return &InstrItins;
   }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 8802feb..4bde053 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -14,9 +14,10 @@
 #include "X86TargetMachine.h"
 #include "X86.h"
 #include "X86TargetObjectFile.h"
+#include "X86TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Function.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -47,6 +48,46 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
   llvm_unreachable("unknown subtarget type");
 }
 
+static std::string computeDataLayout(const Triple &TT) {
+  // X86 is little endian
+  std::string Ret = "e";
+
+  Ret += DataLayout::getManglingComponent(TT);
+  // X86 and x32 have 32 bit pointers.
+  if ((TT.isArch64Bit() &&
+       (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) ||
+      !TT.isArch64Bit())
+    Ret += "-p:32:32";
+
+  // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
+  if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
+    Ret += "-i64:64";
+  else
+    Ret += "-f64:32:64";
+
+  // Some ABIs align long double to 128 bits, others to 32.
+  if (TT.isOSNaCl())
+    ; // No f80
+  else if (TT.isArch64Bit() || TT.isOSDarwin())
+    Ret += "-f80:128";
+  else
+    Ret += "-f80:32";
+
+  // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
+  if (TT.isArch64Bit())
+    Ret += "-n8:16:32:64";
+  else
+    Ret += "-n8:16:32";
+
+  // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
+  if (!TT.isArch64Bit() && TT.isOSWindows())
+    Ret += "-S32";
+  else
+    Ret += "-S128";
+
+  return Ret;
+}
+
 /// X86TargetMachine ctor - Create an X86 target.
 ///
 X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
@@ -55,6 +96,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
                                    CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
       TLOF(createTLOF(Triple(getTargetTriple()))),
+      DL(computeDataLayout(Triple(TT))),
       Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) {
   // default to hard float ABI
   if (Options.FloatABIType == FloatABI::Default)
@@ -74,11 +116,8 @@ X86TargetMachine::~X86TargetMachine() {}
 
 const X86Subtarget *
 X86TargetMachine::getSubtargetImpl(const Function &F) const {
-  AttributeSet FnAttrs = F.getAttributes();
-  Attribute CPUAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
-  Attribute FSAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
 
   std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
                         ? CPUAttr.getValueAsString().str()
@@ -92,8 +131,7 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
   // function before we can generate a subtarget. We also need to use
   // it as a key for the subtarget since that can be the only difference
   // between two functions.
-  Attribute SFAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float");
+  Attribute SFAttr = F.getFnAttribute("use-soft-float");
   bool SoftFloat = !SFAttr.hasAttribute(Attribute::None)
                        ? SFAttr.getValueAsString() == "true"
                        : Options.UseSoftFloat;
@@ -120,15 +158,12 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
   cl::init(true));
 
 //===----------------------------------------------------------------------===//
-// X86 Analysis Pass Setup
+// X86 TTI query.
 //===----------------------------------------------------------------------===//
 
-void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  // Add first the target-independent BasicTTI pass, then our X86 pass. This
-  // allows the X86 pass to delegate to the target independent layer when
-  // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(this));
-  PM.add(createX86TargetTransformInfoPass(this));
+TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis(
+      [this](Function &F) { return TargetTransformInfo(X86TTIImpl(this, F)); });
 }
 
 
@@ -147,16 +182,12 @@ public:
     return getTM<X86TargetMachine>();
   }
 
-  const X86Subtarget &getX86Subtarget() const {
-    return *getX86TargetMachine().getSubtargetImpl();
-  }
-
   void addIRPasses() override;
   bool addInstSelector() override;
   bool addILPOpts() override;
-  bool addPreRegAlloc() override;
-  bool addPostRegAlloc() override;
-  bool addPreEmitPass() override;
+  void addPreRegAlloc() override;
+  void addPostRegAlloc() override;
+  void addPreEmitPass() override;
 };
 } // namespace
 
@@ -175,7 +206,8 @@ bool X86PassConfig::addInstSelector() {
   addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
 
   // For ELF, cleanup any local-dynamic TLS accesses.
-  if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
+  if (Triple(TM->getTargetTriple()).isOSBinFormatELF() &&
+      getOptLevel() != CodeGenOpt::None)
     addPass(createCleanupLocalDynamicTLSPass());
 
   addPass(createX86GlobalBaseRegPass());
@@ -188,32 +220,23 @@ bool X86PassConfig::addILPOpts() {
   return true;
 }
 
-bool X86PassConfig::addPreRegAlloc() {
-  return false;  // -print-machineinstr shouldn't print after this.
+void X86PassConfig::addPreRegAlloc() {
+  addPass(createX86CallFrameOptimization());
 }
 
-bool X86PassConfig::addPostRegAlloc() {
+void X86PassConfig::addPostRegAlloc() {
   addPass(createX86FloatingPointStackifierPass());
-  return true;  // -print-machineinstr should print after this.
 }
 
-bool X86PassConfig::addPreEmitPass() {
-  bool ShouldPrint = false;
-  if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) {
+void X86PassConfig::addPreEmitPass() {
+  if (getOptLevel() != CodeGenOpt::None)
     addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));
-    ShouldPrint = true;
-  }
 
-  if (UseVZeroUpper) {
+  if (UseVZeroUpper)
     addPass(createX86IssueVZeroUpperPass());
-    ShouldPrint = true;
-  }
 
   if (getOptLevel() != CodeGenOpt::None) {
     addPass(createX86PadShortFunctions());
     addPass(createX86FixupLEAs());
-    ShouldPrint = true;
   }
-
-  return ShouldPrint;
 }
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 916278c..283858d 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -24,22 +24,22 @@ class StringRef;
 
 class X86TargetMachine final : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
-  X86Subtarget       Subtarget;
+  // Calculates type size & alignment
+  const DataLayout DL;
+  X86Subtarget Subtarget;
 
   mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap;
 
 public:
-  X86TargetMachine(const Target &T, StringRef TT,
-                   StringRef CPU, StringRef FS, const TargetOptions &Options,
-                   Reloc::Model RM, CodeModel::Model CM,
-                   CodeGenOpt::Level OL);
+  X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+                   const TargetOptions &Options, Reloc::Model RM,
+                   CodeModel::Model CM, CodeGenOpt::Level OL);
   ~X86TargetMachine() override;
-
+  const DataLayout *getDataLayout() const override { return &DL; }
   const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; }
   const X86Subtarget *getSubtargetImpl(const Function &F) const override;
 
-  /// \brief Register X86 analysis passes with a pass manager.
-  void addAnalysisPasses(PassManagerBase &PM) override;
+  TargetIRAnalysis getTargetIRAnalysis() override;
 
   // Set up the pass pipeline.
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index f8bcd61..1d1c32e 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -21,6 +21,11 @@
 using namespace llvm;
 using namespace dwarf;
 
+X86_64MachoTargetObjectFile::X86_64MachoTargetObjectFile()
+  : TargetLoweringObjectFileMachO() {
+  SupportIndirectSymViaGOTPCRel = true;
+}
+
 const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
     const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
     const TargetMachine &TM, MachineModuleInfo *MMI,
@@ -46,6 +51,17 @@ MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol(
   return TM.getSymbol(GV, Mang);
 }
 
+const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
+    const MCSymbol *Sym, int64_t Offset) const {
+  // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry
+  // from a data section. In case there's an additional offset, then use
+  // foo@GOTPCREL+4+<offset>.
+  const MCExpr *Res =
+    MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+  const MCExpr *Off = MCConstantExpr::Create(Offset+4, getContext());
+  return MCBinaryExpr::CreateAdd(Res, Off, getContext());
+}
+
 void
 X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 6a6988a..f745538 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -19,6 +19,8 @@ namespace llvm {
   /// x86-64.
   class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
   public:
+    X86_64MachoTargetObjectFile();
+
     const MCExpr *
     getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding,
                             Mangler &Mang, const TargetMachine &TM,
@@ -30,6 +32,10 @@ namespace llvm {
     MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang,
                                       const TargetMachine &TM,
                                       MachineModuleInfo *MMI) const override;
+
+    const MCExpr *
+      getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+                                int64_t Offset) const override;
   };
 
   /// X86LinuxTargetObjectFile - This implementation is used for linux x86
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 2b70fd0..5136619 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -14,9 +14,9 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "X86.h"
-#include "X86TargetMachine.h"
+#include "X86TargetTransformInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
@@ -25,123 +25,22 @@ using namespace llvm;
 
 #define DEBUG_TYPE "x86tti"
 
-// Declare the pass initialization routine locally as target-specific passes
-// don't have a target-wide initialization entry point, and so we rely on the
-// pass constructor initialization.
-namespace llvm {
-void initializeX86TTIPass(PassRegistry &);
-}
-
-namespace {
-
-class X86TTI final : public ImmutablePass, public TargetTransformInfo {
-  const X86Subtarget *ST;
-  const X86TargetLowering *TLI;
-
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
-
-public:
-  X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
-    llvm_unreachable("This pass cannot be directly constructed");
-  }
-
-  X86TTI(const X86TargetMachine *TM)
-      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
-        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
-    initializeX86TTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override {
-    pushTTIStack(this);
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    TargetTransformInfo::getAnalysisUsage(AU);
-  }
-
-  /// Pass identification.
-  static char ID;
-
-  /// Provide necessary pointer adjustments for the two base classes.
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo*)this;
-    return this;
-  }
-
-  /// \name Scalar TTI Implementations
-  /// @{
-  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
-
-  /// @}
-
-  /// \name Vector TTI Implementations
-  /// @{
-
-  unsigned getNumberOfRegisters(bool Vector) const override;
-  unsigned getRegisterBitWidth(bool Vector) const override;
-  unsigned getMaxInterleaveFactor() const override;
-  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
-                                  OperandValueKind, OperandValueProperties,
-                                  OperandValueProperties) const override;
-  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
-                          int Index, Type *SubTp) const override;
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
-                            Type *Src) const override;
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                              Type *CondTy) const override;
-  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
-                              unsigned Index) const override;
-  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) const override;
-
-  unsigned getAddressComputationCost(Type *PtrTy,
-                                     bool IsComplex) const override;
-
-  unsigned getReductionCost(unsigned Opcode, Type *Ty,
-                            bool IsPairwiseForm) const override;
-
-  unsigned getIntImmCost(int64_t) const;
-
-  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
-
-  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
-                         Type *Ty) const override;
-  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
-                         Type *Ty) const override;
-
-  /// @}
-};
-
-} // end anonymous namespace
-
-INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti",
-                   "X86 Target Transform Info", true, true, false)
-char X86TTI::ID = 0;
-
-ImmutablePass *
-llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) {
-  return new X86TTI(TM);
-}
-
-
 //===----------------------------------------------------------------------===//
 //
 // X86 cost model.
 //
 //===----------------------------------------------------------------------===//
 
-X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
+TargetTransformInfo::PopcntSupportKind
+X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
   // TODO: Currently the __builtin_popcount() implementation using SSE3
   //   instructions is inefficient. Once the problem is fixed, we should
   //   call ST->hasSSE3() instead of ST->hasPOPCNT().
-  return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
+  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
 }
 
-unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
+unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
   if (Vector && !ST->hasSSE1())
     return 0;
 
@@ -153,7 +52,7 @@ unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
   return 8;
 }
 
-unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
+unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
   if (Vector) {
     if (ST->hasAVX512()) return 512;
     if (ST->hasAVX()) return 256;
@@ -167,7 +66,7 @@ unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
 
 }
 
-unsigned X86TTI::getMaxInterleaveFactor() const {
+unsigned X86TTIImpl::getMaxInterleaveFactor() {
   if (ST->isAtom())
     return 1;
 
@@ -179,10 +78,10 @@ unsigned X86TTI::getMaxInterleaveFactor() const {
   return 2;
 }
 
-unsigned X86TTI::getArithmeticInstrCost(
-    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
-    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
-    OperandValueProperties Opd2PropInfo) const {
+unsigned X86TTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
   // Legalize the type.
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
 
@@ -352,7 +251,7 @@ unsigned X86TTI::getArithmeticInstrCost(
     { ISD::SHL,  MVT::v8i16,  8*10 }, // Scalarized.
     { ISD::SHL,  MVT::v4i32,  2*5 }, // We optimized this using mul.
     { ISD::SHL,  MVT::v2i64,  2*10 }, // Scalarized.
-    { ISD::SHL,  MVT::v4i64,  4*10 }, // Scalarized. 
+    { ISD::SHL,  MVT::v4i64,  4*10 }, // Scalarized.
 
     { ISD::SRL,  MVT::v16i8,  16*10 }, // Scalarized.
     { ISD::SRL,  MVT::v8i16,  8*10 }, // Scalarized.
@@ -437,17 +336,16 @@ unsigned X86TTI::getArithmeticInstrCost(
     return LT.first * 6;
 
   // Fallback to the default implementation.
-  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
-                                                     Op2Info);
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
 }
 
-unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
-                                Type *SubTp) const {
+unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                                    Type *SubTp) {
   // We only estimate the cost of reverse and alternate shuffles.
-  if (Kind != SK_Reverse && Kind != SK_Alternate)
-    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
+    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  if (Kind == SK_Reverse) {
+  if (Kind == TTI::SK_Reverse) {
     std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
     unsigned Cost = 1;
     if (LT.second.getSizeInBits() > 128)
@@ -457,7 +355,7 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
     return Cost * LT.first;
   }
 
-  if (Kind == SK_Alternate) {
+  if (Kind == TTI::SK_Alternate) {
     // 64-bit packed float vectors (v2f32) are widened to type v4f32.
     // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
     std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
@@ -525,7 +423,7 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
       {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or
       {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}  // pshufb + pshufb + or
     };
- 
+
     if (ST->hasSSSE3()) {
       int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
       if (Idx != -1)
@@ -538,7 +436,7 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
 
       {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd
       {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd
- 
+
       // This is expanded into a long sequence of four extract + four insert.
       {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw.
 
@@ -546,17 +444,17 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
       {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
     };
 
-    // Fall-back (SSE3 and SSE2). 
+    // Fall-back (SSE3 and SSE2).
     int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
     if (Idx != -1)
       return LT.first * SSEAltShuffleTbl[Idx].Cost;
-    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
   }
 
-  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
-unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
+unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -638,7 +536,7 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
 
   // The function getSimpleVT only handles simple value types.
   if (!SrcTy.isSimple() || !DstTy.isSimple())
-    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+    return BaseT::getCastInstrCost(Opcode, Dst, Src);
 
   static const TypeConversionCostTblEntry<MVT::SimpleValueType>
   AVX2ConversionTbl[] = {
@@ -757,11 +655,11 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
       return AVXConversionTbl[Idx].Cost;
   }
 
-  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+  return BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
-unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                    Type *CondTy) const {
+unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                        Type *CondTy) {
   // Legalize the type.
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
 
@@ -827,11 +725,11 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
       return LT.first * SSE42CostTbl[Idx].Cost;
   }
 
-  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
-unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                    unsigned Index) const {
+unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                        unsigned Index) {
   assert(Val->isVectorTy() && "This must be a vector type");
 
   if (Index != -1U) {
@@ -851,26 +749,27 @@ unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
       return 0;
   }
 
-  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+  return BaseT::getVectorInstrCost(Opcode, Val, Index);
 }
 
-unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
-                                            bool Extract) const {
+unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
+                                              bool Extract) {
   assert (Ty->isVectorTy() && "Can only scalarize vectors");
   unsigned Cost = 0;
 
   for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
     if (Insert)
-      Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+      Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
     if (Extract)
-      Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+      Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
   }
 
   return Cost;
 }
 
-unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                 unsigned AddressSpace) const {
+unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                     unsigned Alignment,
+                                     unsigned AddressSpace) {
   // Handle non-power-of-two vectors such as <3 x float>
   if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
     unsigned NumElem = VTy->getVectorNumElements();
@@ -888,10 +787,8 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
 
     // Assume that all other non-power-of-two numbers are scalarized.
     if (!isPowerOf2_32(NumElem)) {
-      unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
-                                                           VTy->getScalarType(),
-                                                           Alignment,
-                                                           AddressSpace);
+      unsigned Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(),
+                                             Alignment, AddressSpace);
       unsigned SplitCost = getScalarizationOverhead(Src,
                                                     Opcode == Instruction::Load,
                                                     Opcode==Instruction::Store);
@@ -915,7 +812,60 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   return Cost;
 }
 
-unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
+  if (!SrcVTy)
+    // To calculate scalar take the regular cost, without mask
+    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
+
+  unsigned NumElem = SrcVTy->getVectorNumElements();
+  VectorType *MaskTy =
+    VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);
+  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) ||
+      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) ||
+      !isPowerOf2_32(NumElem)) {
+    // Scalarization
+    unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
+    unsigned ScalarCompareCost =
+      getCmpSelInstrCost(Instruction::ICmp,
+                         Type::getInt8Ty(getGlobalContext()), NULL);
+    unsigned BranchCost = getCFInstrCost(Instruction::Br);
+    unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
+
+    unsigned ValueSplitCost =
+      getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load,
+                               Opcode == Instruction::Store);
+    unsigned MemopCost =
+        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+                                         Alignment, AddressSpace);
+    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
+  }
+
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy);
+  unsigned Cost = 0;
+  if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() &&
+      LT.second.getVectorNumElements() == NumElem)
+    // Promotion requires expand/truncate for data and a shuffle for mask.
+    Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) +
+            getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0);
+
+  else if (LT.second.getVectorNumElements() > NumElem) {
+    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
+                                            LT.second.getVectorNumElements());
+    // Expanding requires fill mask with zeroes
+    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
+  }
+  if (!ST->hasAVX512())
+    return Cost + LT.first*4; // Each maskmov costs 4
+
+  // AVX-512 masked load/store is cheapper
+  return Cost+LT.first;
+}
+
+unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
@@ -925,22 +875,22 @@ unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
   if (Ty->isVectorTy() && IsComplex)
     return NumVectorInstToHideOverhead;
 
-  return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex);
+  return BaseT::getAddressComputationCost(Ty, IsComplex);
 }
 
-unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
-                                  bool IsPairwise) const {
-  
+unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
+                                      bool IsPairwise) {
+
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
-  
+
   MVT MTy = LT.second;
-  
+
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
- 
-  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput 
-  // and make it as the cost. 
- 
+
+  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
+  // and make it as the cost.
+
   static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
     { ISD::FADD,  MVT::v2f64,   2 },
     { ISD::FADD,  MVT::v4f32,   4 },
@@ -948,7 +898,7 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
     { ISD::ADD,   MVT::v8i16,   5 },
   };
- 
+
   static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
     { ISD::FADD,  MVT::v4f32,   4 },
     { ISD::FADD,  MVT::v4f64,   5 },
@@ -967,7 +917,7 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
     { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
   };
-  
+
   static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
     { ISD::FADD,  MVT::v4f32,   3 },
     { ISD::FADD,  MVT::v4f64,   3 },
@@ -978,14 +928,14 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
     { ISD::ADD,   MVT::v8i16,   4 },
     { ISD::ADD,   MVT::v8i32,   5 },
   };
-  
+
   if (IsPairwise) {
     if (ST->hasAVX()) {
       int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy);
       if (Idx != -1)
         return LT.first * AVX1CostTblPairWise[Idx].Cost;
     }
-  
+
     if (ST->hasSSE42()) {
       int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy);
       if (Idx != -1)
@@ -997,7 +947,7 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
       if (Idx != -1)
         return LT.first * AVX1CostTblNoPairWise[Idx].Cost;
     }
-    
+
     if (ST->hasSSE42()) {
       int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy);
       if (Idx != -1)
@@ -1005,23 +955,23 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
     }
   }
 
-  return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
+  return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);
 }
 
 /// \brief Calculate the cost of materializing a 64-bit value. This helper
 /// method might only calculate a fraction of a larger immediate. Therefore it
 /// is valid to return a cost of ZERO.
-unsigned X86TTI::getIntImmCost(int64_t Val) const {
+unsigned X86TTIImpl::getIntImmCost(int64_t Val) {
   if (Val == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;
 
   if (isInt<32>(Val))
-    return TCC_Basic;
+    return TTI::TCC_Basic;
 
-  return 2 * TCC_Basic;
+  return 2 * TTI::TCC_Basic;
 }
 
-unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -1033,10 +983,10 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
   // Fixme: Create a cost model for types larger than i128 once the codegen
   // issues have been fixed.
   if (BitSize > 128)
-    return TCC_Free;
+    return TTI::TCC_Free;
 
   if (Imm == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;
 
   // Sign-extend all constants to a multiple of 64-bit.
   APInt ImmVal = Imm;
@@ -1055,26 +1005,27 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
   return std::max(1U, Cost);
 }
 
-unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
-                               Type *Ty) const {
+unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                   const APInt &Imm, Type *Ty) {
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   // There is no cost model for constants with a bit size of 0. Return TCC_Free
   // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;
 
   unsigned ImmIdx = ~0U;
   switch (Opcode) {
-  default: return TCC_Free;
+  default:
+    return TTI::TCC_Free;
   case Instruction::GetElementPtr:
     // Always hoist the base address of a GetElementPtr. This prevents the
     // creation of new constants for every base constant that gets constant
     // folded with the offset.
     if (Idx == 0)
-      return 2 * TCC_Basic;
-    return TCC_Free;
+      return 2 * TTI::TCC_Basic;
+    return TTI::TCC_Free;
   case Instruction::Store:
     ImmIdx = 0;
     break;
@@ -1096,7 +1047,7 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   case Instruction::LShr:
   case Instruction::AShr:
     if (Idx == 1)
-      return TCC_Free;
+      return TTI::TCC_Free;
     break;
   case Instruction::Trunc:
   case Instruction::ZExt:
@@ -1114,27 +1065,28 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
 
   if (Idx == ImmIdx) {
     unsigned NumConstants = (BitSize + 63) / 64;
-    unsigned Cost = X86TTI::getIntImmCost(Imm, Ty);
-    return (Cost <= NumConstants * TCC_Basic)
-      ? static_cast<unsigned>(TCC_Free)
-      : Cost;
+    unsigned Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TTI::TCC_Basic)
+               ? static_cast<unsigned>(TTI::TCC_Free)
+               : Cost;
   }
 
-  return X86TTI::getIntImmCost(Imm, Ty);
+  return X86TTIImpl::getIntImmCost(Imm, Ty);
 }
 
-unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
-                               const APInt &Imm, Type *Ty) const {
+unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                   const APInt &Imm, Type *Ty) {
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   // There is no cost model for constants with a bit size of 0. Return TCC_Free
   // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;
 
   switch (IID) {
-  default: return TCC_Free;
+  default:
+    return TTI::TCC_Free;
   case Intrinsic::sadd_with_overflow:
   case Intrinsic::uadd_with_overflow:
   case Intrinsic::ssub_with_overflow:
@@ -1142,17 +1094,33 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
   case Intrinsic::smul_with_overflow:
   case Intrinsic::umul_with_overflow:
     if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
-      return TCC_Free;
+      return TTI::TCC_Free;
     break;
   case Intrinsic::experimental_stackmap:
     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
-      return TCC_Free;
+      return TTI::TCC_Free;
     break;
   case Intrinsic::experimental_patchpoint_void:
   case Intrinsic::experimental_patchpoint_i64:
     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
-      return TCC_Free;
+      return TTI::TCC_Free;
     break;
   }
-  return X86TTI::getIntImmCost(Imm, Ty);
+  return X86TTIImpl::getIntImmCost(Imm, Ty);
+}
+
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) {
+  int DataWidth = DataTy->getPrimitiveSizeInBits();
+  
+  // Todo: AVX512 allows gather/scatter, works with strided and random as well
+  if ((DataWidth < 32) || (Consecutive == 0))
+    return false;
+  if (ST->hasAVX512() || ST->hasAVX2()) 
+    return true;
+  return false;
 }
+
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) {
+  return isLegalMaskedLoad(DataType, Consecutive);
+}
+
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
new file mode 100644
index 0000000..9f0adcf
--- /dev/null
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -0,0 +1,112 @@
+//===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// X86 target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
+  typedef BasicTTIImplBase<X86TTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const X86Subtarget *ST;
+  const X86TargetLowering *TLI;
+
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+
+  const X86Subtarget *getST() const { return ST; }
+  const X86TargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit X86TTIImpl(const X86TargetMachine *TM, Function &F)
+      : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
+
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  X86TTIImpl(const X86TTIImpl &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+  X86TTIImpl(X86TTIImpl &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+        TLI(std::move(Arg.TLI)) {}
+  X86TTIImpl &operator=(const X86TTIImpl &RHS) {
+    BaseT::operator=(static_cast<const BaseT &>(RHS));
+    ST = RHS.ST;
+    TLI = RHS.TLI;
+    return *this;
+  }
+  X86TTIImpl &operator=(X86TTIImpl &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    ST = std::move(RHS.ST);
+    TLI = std::move(RHS.TLI);
+    return *this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector);
+  unsigned getMaxInterleaveFactor();
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                          Type *SubTp);
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace);
+  unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace);
+
+  unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex);
+
+  unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
+
+  unsigned getIntImmCost(int64_t);
+
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty);
+
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty);
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty);
+  bool isLegalMaskedLoad(Type *DataType, int Consecutive);
+  bool isLegalMaskedStore(Type *DataType, int Consecutive);
+
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index d93baeb..99ba4c0 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -9,7 +9,7 @@
 //
 // This file defines the pass which inserts x86 AVX vzeroupper instructions
 // before calls to SSE encoded functions. This avoids transition latency
-// penalty when tranfering control between AVX encoded instructions and old
+// penalty when transferring control between AVX encoded instructions and old
 // SSE encoding mode.
 //
 //===----------------------------------------------------------------------===//
@@ -171,7 +171,7 @@ void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
 }
 
 /// processBasicBlock - Loop over all of the instructions in the basic block,
-/// inserting vzero upper instructions before function calls.
+/// inserting vzeroupper instructions before function calls.
 void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
 
   // Start by assuming that the block PASS_THROUGH, which implies no unguarded
@@ -202,7 +202,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
     // If the call won't clobber any YMM register, skip it as well. It usually
     // happens on helper function calls (such as '_chkstk', '_ftol2') where
     // standard calling convention is not used (RegMask is not used to mark
-    // register clobbered and register usage (def/imp-def/use) is well-dfined
+    // register clobbered and register usage (def/imp-def/use) is well-defined
     // and explicitly specified.
     if (MI->isCall() && !callClobbersAnyYmmReg(MI))
       continue;
@@ -245,25 +245,29 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
 }
 
 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
-/// vzero upper instructions before function calls.
+/// vzeroupper instructions before function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
-  const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
   if (!ST.hasAVX() || ST.hasAVX512())
     return false;
-  TII = MF.getSubtarget().getInstrInfo();
+  TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   EverMadeChange = false;
 
+  bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
+
   // Fast check: if the function doesn't use any ymm registers, we don't need
   // to insert any VZEROUPPER instructions.  This is constant-time, so it is
   // cheap in the common case of no ymm use.
-  bool YMMUsed = false;
-  const TargetRegisterClass *RC = &X86::VR256RegClass;
-  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
-       i != e; i++) {
-    if (!MRI.reg_nodbg_empty(*i)) {
-      YMMUsed = true;
-      break;
+  bool YMMUsed = FnHasLiveInYmm;
+  if (!YMMUsed) {
+    const TargetRegisterClass *RC = &X86::VR256RegClass;
+    for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
+         i++) {
+      if (!MRI.reg_nodbg_empty(*i)) {
+        YMMUsed = true;
+        break;
+      }
     }
   }
   if (!YMMUsed) {
@@ -282,7 +286,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
 
   // If any YMM regs are live in to this function, add the entry block to the
   // DirtySuccessors list
-  if (checkFnHasLiveInYmm(MRI))
+  if (FnHasLiveInYmm)
     addDirtySuccessor(MF.front());
 
   // Re-visit all blocks that are successors of EXITS_DIRTY bsocks. Add
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index 5ad0754..0a609ef 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -22,7 +22,6 @@ add_llvm_target(XCoreCodeGen
   XCoreSubtarget.cpp
   XCoreTargetMachine.cpp
   XCoreTargetObjectFile.cpp
-  XCoreTargetTransformInfo.cpp
   XCoreSelectionDAGInfo.cpp
   XCoreFrameToArgsOffsetElim.cpp
   )
diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
index 140ba2a..ba6ca84 100644
--- a/lib/Target/XCore/XCore.h
+++ b/lib/Target/XCore/XCore.h
@@ -32,8 +32,6 @@ namespace llvm {
                                    CodeGenOpt::Level OptLevel);
   ModulePass *createXCoreLowerThreadLocalPass();
 
-  ImmutablePass *createXCoreTargetTransformInfoPass(const XCoreTargetMachine *TM);
-
 } // end namespace llvm;
 
 #endif
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 82e4e36..4f7a7e9 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -50,14 +50,13 @@ using namespace llvm;
 
 namespace {
   class XCoreAsmPrinter : public AsmPrinter {
-    const XCoreSubtarget &Subtarget;
     XCoreMCInstLower MCInstLowering;
     XCoreTargetStreamer &getTargetStreamer();
 
   public:
-    explicit XCoreAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget<XCoreSubtarget>()),
-        MCInstLowering(*this) {}
+    explicit XCoreAsmPrinter(TargetMachine &TM,
+                             std::unique_ptr<MCStreamer> Streamer)
+        : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(*this) {}
 
     const char *getPassName() const override {
       return "XCore Assembly Printer";
@@ -105,7 +104,6 @@ void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
                                                       OutContext));
     if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
         GV->hasCommonLinkage()) {
-      // TODO Use COMDAT groups for LinkOnceLinkage
       OutStreamer.EmitSymbolAttribute(SymGlob, MCSA_Weak);
     }
   }
@@ -117,7 +115,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
       EmitSpecialLLVMGlobal(GV))
     return;
 
-  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = TM.getDataLayout();
   OutStreamer.SwitchSection(
       getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
 
@@ -140,7 +138,6 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
     emitArrayBound(GVSym, GV);
     OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
 
-    // TODO Use COMDAT groups for LinkOnceLinkage
     if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
         GV->hasCommonLinkage())
       OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Weak);
@@ -210,7 +207,7 @@ printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
 
 void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
                                    raw_ostream &O) {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   const MachineOperand &MO = MI->getOperand(opNum);
   switch (MO.getType()) {
   case MachineOperand::MO_Register:
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index 7c74340..e0ac0e5 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -226,8 +226,7 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineModuleInfo *MMI = &MF.getMMI();
   const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
-  const XCoreInstrInfo &TII =
-      *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const XCoreInstrInfo &TII = *MF.getSubtarget<XCoreSubtarget>().getInstrInfo();
   XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
   // Debug location must be unknown since the first debug location is used
   // to determine the end of the prologue.
@@ -341,8 +340,7 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
                                      MachineBasicBlock &MBB) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
-  const XCoreInstrInfo &TII =
-      *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const XCoreInstrInfo &TII = *MF.getSubtarget<XCoreSubtarget>().getInstrInfo();
   XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
   DebugLoc dl = MBBI->getDebugLoc();
   unsigned RetOpcode = MBBI->getOpcode();
@@ -480,8 +478,7 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
 void XCoreFrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
-  const XCoreInstrInfo &TII =
-      *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const XCoreInstrInfo &TII = *MF.getSubtarget<XCoreSubtarget>().getInstrInfo();
   if (!hasReservedCallFrame(MF)) {
     // Turn the adjcallstackdown instruction into 'extsp <amt>' and the
     // adjcallstackup instruction into 'ldaw sp, sp[<amt>]'
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index 86bc6f2..f79b78b 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -37,12 +37,10 @@ using namespace llvm;
 ///
 namespace {
   class XCoreDAGToDAGISel : public SelectionDAGISel {
-    const XCoreSubtarget &Subtarget;
 
   public:
     XCoreDAGToDAGISel(XCoreTargetMachine &TM, CodeGenOpt::Level OptLevel)
-      : SelectionDAGISel(TM, OptLevel),
-        Subtarget(*TM.getSubtargetImpl()) { }
+      : SelectionDAGISel(TM, OptLevel) {}
 
     SDNode *Select(SDNode *N) override;
     SDNode *SelectBRIND(SDNode *N);
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 96c43ae..6e8a95a 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -68,15 +68,15 @@ getTargetNodeName(unsigned Opcode) const
   }
 }
 
-XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM)
-    : TargetLowering(TM), TM(TM),
-      Subtarget(TM.getSubtarget<XCoreSubtarget>()) {
+XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
+                                         const XCoreSubtarget &Subtarget)
+    : TargetLowering(TM), TM(TM), Subtarget(Subtarget) {
 
   // Set up the register classes.
   addRegisterClass(MVT::i32, &XCore::GRRegsRegClass);
 
   // Compute derived properties from the register classes
-  computeRegisterProperties();
+  computeRegisterProperties(Subtarget.getRegisterInfo());
 
   // Division is expensive
   setIntDivIsCheap(false);
@@ -127,12 +127,14 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM)
   setOperationAction(ISD::ConstantPool, MVT::i32,   Custom);
 
   // Loads
-  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Expand);
+  }
 
   // Custom expand misaligned loads / stores.
   setOperationAction(ISD::LOAD, MVT::i32, Custom);
@@ -805,8 +807,7 @@ SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
     return SDValue();
 
   MachineFunction &MF = DAG.getMachineFunction();
-  const TargetRegisterInfo *RegInfo =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op),
                             RegInfo->getFrameRegister(MF), MVT::i32);
 }
@@ -852,8 +853,7 @@ LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
   // Absolute SP = (FP + FrameToArgs) + Offset
-  const TargetRegisterInfo *RegInfo =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   SDValue Stack = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
                             RegInfo->getFrameRegister(MF), MVT::i32);
   SDValue FrameToArgs = DAG.getNode(XCoreISD::FRAME_TO_ARGS_OFFSET, dl,
@@ -1371,8 +1371,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
       XCore::R0, XCore::R1, XCore::R2, XCore::R3
     };
     XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
-    unsigned FirstVAReg = CCInfo.getFirstUnallocated(ArgRegs,
-                                                     array_lengthof(ArgRegs));
+    unsigned FirstVAReg = CCInfo.getFirstUnallocated(ArgRegs);
     if (FirstVAReg < array_lengthof(ArgRegs)) {
       int offset = 0;
       // Save remaining registers, storing higher register numbers at a higher
@@ -1548,8 +1547,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
 MachineBasicBlock *
 XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                  MachineBasicBlock *BB) const {
-  const TargetInstrInfo &TII =
-      *getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   assert((MI->getOpcode() == XCore::SELECT_CC) &&
          "Unexpected instr type to insert");
@@ -1922,7 +1920,7 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
   if (Ty->getTypeID() == Type::VoidTyID)
     return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
 
-  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = TM.getDataLayout();
   unsigned Size = TD->getTypeAllocSize(Ty);
   if (AM.BaseGV) {
     return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&
@@ -1959,10 +1957,10 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
 //                           XCore Inline Assembly Support
 //===----------------------------------------------------------------------===//
 
-std::pair<unsigned, const TargetRegisterClass*>
-XCoreTargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint,
-                             MVT VT) const {
+std::pair<unsigned, const TargetRegisterClass *>
+XCoreTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                  const std::string &Constraint,
+                                                  MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     default : break;
@@ -1972,5 +1970,5 @@ getRegForInlineAsmConstraint(const std::string &Constraint,
   }
   // Use the default implementation in TargetLowering to convert the register
   // constraint into a member of a register class.
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 13154c6..213ae4a 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -93,8 +93,8 @@ namespace llvm {
   class XCoreTargetLowering : public TargetLowering
   {
   public:
-
-    explicit XCoreTargetLowering(const TargetMachine &TM);
+    explicit XCoreTargetLowering(const TargetMachine &TM,
+                                 const XCoreSubtarget &Subtarget);
 
     using TargetLowering::isZExtFree;
     bool isZExtFree(SDValue Val, EVT VT2) const override;
@@ -172,8 +172,9 @@ namespace llvm {
     SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
 
     // Inline asm support
-    std::pair<unsigned, const TargetRegisterClass*>
-    getRegForInlineAsmConstraint(const std::string &Constraint,
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 const std::string &Constraint,
                                  MVT VT) const override;
 
     // Expand specifics
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index d34ed7a..8e9bb45 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -381,7 +381,7 @@ def Int_MemBarrier : PseudoInstXCore<(outs), (ins), "#MEMBARRIER",
 // Three operand short
 defm ADD : F3R_2RUS<0b00010, 0b10010, "add", add>;
 defm SUB : F3R_2RUS<0b00011, 0b10011, "sub", sub>;
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 defm EQ : F3R_2RUS_np<0b00110, 0b10110, "eq">;
 def LSS_3r : F3R_np<0b11000, "lss">;
 def LSU_3r : F3R_np<0b11001, "lsu">;
@@ -432,7 +432,7 @@ def LDAWF_l3r : _FL3R<0b000111100, (outs GRRegs:$dst),
                       [(set GRRegs:$dst,
                          (ldawf GRRegs:$addr, GRRegs:$offset))]>;
 
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def LDAWF_l2rus : _FL2RUS<0b100111100, (outs GRRegs:$dst),
                           (ins GRRegs:$addr, i32imm:$offset),
                           "ldaw $dst, $addr[$offset]", []>;
@@ -443,7 +443,7 @@ def LDAWB_l3r : _FL3R<0b001001100, (outs GRRegs:$dst),
                       [(set GRRegs:$dst,
                          (ldawb GRRegs:$addr, GRRegs:$offset))]>;
 
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def LDAWB_l2rus : _FL2RUS<0b101001100, (outs GRRegs:$dst),
                          (ins GRRegs:$addr, i32imm:$offset),
                          "ldaw $dst, $addr[-$offset]", []>;
@@ -538,7 +538,7 @@ def LMUL_l6r : _FL6R<
 // Register - U6
 
 //let Uses = [DP] in ...
-let neverHasSideEffects = 1, isReMaterializable = 1 in
+let hasSideEffects = 0, isReMaterializable = 1 in
 def LDAWDP_ru6: _FRU6<0b011000, (outs RRegs:$a), (ins i32imm:$b),
                       "ldaw $a, dp[$b]", []>;
 
@@ -564,7 +564,7 @@ def STWDP_lru6 : _FLRU6<0b010100, (outs), (ins RRegs:$a, i32imm:$b),
                         [(store RRegs:$a, (dprelwrapper tglobaladdr:$b))]>;
 
 //let Uses = [CP] in ..
-let mayLoad = 1, isReMaterializable = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, isReMaterializable = 1, hasSideEffects = 0 in {
 def LDWCP_ru6 : _FRU6<0b011011, (outs RRegs:$a), (ins i32imm:$b),
                       "ldw $a, cp[$b]", []>;
 def LDWCP_lru6: _FLRU6<0b011011, (outs RRegs:$a), (ins i32imm:$b),
@@ -593,7 +593,7 @@ def LDWSP_lru6 : _FLRU6<0b010111, (outs RRegs:$a), (ins i32imm:$b),
                         [(set RRegs:$a, (XCoreLdwsp immU16:$b))]>;
 }
 
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
 def LDAWSP_ru6 : _FRU6<0b011001, (outs RRegs:$a), (ins i32imm:$b),
                        "ldaw $a, sp[$b]", []>;
 
@@ -628,7 +628,7 @@ defm BRBF: FRU6_LRU6_backwards_branch<0b011111, "bf">;
 
 // U6
 let Defs = [SP], Uses = [SP] in {
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 defm EXTSP : FU6_LU6_np<0b0111011110, "extsp">;
 
 let mayStore = 1 in
@@ -639,7 +639,7 @@ defm RETSP : FU6_LU6<0b0111011111, "retsp", XCoreRetsp>;
 }
 }
 
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 defm EXTDP : FU6_LU6_np<0b0111001110, "extdp">;
 
 let Uses = [R11], isCall=1 in
@@ -656,7 +656,7 @@ def BRFU_lu6 : _FLU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>;
 }
 
 //let Uses = [CP] in ...
-let Defs = [R11], neverHasSideEffects = 1, isReMaterializable = 1 in
+let Defs = [R11], hasSideEffects = 0, isReMaterializable = 1 in
 def LDAWCP_u6: _FU6<0b0111111101, (outs), (ins i32imm:$a), "ldaw r11, cp[$a]",
                     []>;
 
@@ -690,17 +690,17 @@ defm KRESTSP : FU6_LU6_np<0b0111101111, "krestsp">;
 // U10
 
 let Defs = [R11], isReMaterializable = 1 in {
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def LDAPF_u10 : _FU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a", []>;
 
 def LDAPF_lu10 : _FLU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a",
                         [(set R11, (pcrelwrapper tglobaladdr:$a))]>;
 
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def LDAPB_u10 : _FU10<0b110111, (outs), (ins pcrel_imm_neg:$a), "ldap r11, $a",
                       []>;
 
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
 def LDAPB_lu10 : _FLU10<0b110111, (outs), (ins pcrel_imm_neg:$a),
                         "ldap r11, $a",
                         [(set R11, (pcrelwrapper tglobaladdr:$a))]>;
@@ -729,7 +729,7 @@ def BLRB_lu10 : _FLU10<0b110101, (outs), (ins pcrel_imm_neg:$a), "bl $a", []>;
 }
 
 let Defs = [R11], mayLoad = 1, isReMaterializable = 1,
-    neverHasSideEffects = 1 in {
+    hasSideEffects = 0 in {
 def LDWCP_u10 : _FU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]", []>;
 
 def LDWCP_lu10 : _FLU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]",
@@ -772,7 +772,7 @@ def ANDNOT_2r :
              [(set GRRegs:$dst, (and GRRegs:$src1, (not GRRegs:$src2)))]>;
 }
 
-let isReMaterializable = 1, neverHasSideEffects = 1 in
+let isReMaterializable = 1, hasSideEffects = 0 in
 def MKMSK_rus : _FRUSBitp<0b101001, (outs GRRegs:$dst), (ins i32imm:$size),
                           "mkmsk $dst, $size", []>;
 
@@ -972,13 +972,13 @@ def BR_JT32 : PseudoInstXCore<(outs), (ins InlineJT32:$t, GRRegs:$i),
 let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
 def BRU_1r : _F1R<0b001010, (outs), (ins GRRegs:$a), "bru $a", []>;
 
-let Defs=[SP], neverHasSideEffects=1 in
+let Defs=[SP], hasSideEffects=0 in
 def SETSP_1r : _F1R<0b001011, (outs), (ins GRRegs:$a), "set sp, $a", []>;
 
-let neverHasSideEffects=1 in
+let hasSideEffects=0 in
 def SETDP_1r : _F1R<0b001100, (outs), (ins GRRegs:$a), "set dp, $a", []>;
 
-let neverHasSideEffects=1 in
+let hasSideEffects=0 in
 def SETCP_1r : _F1R<0b001101, (outs), (ins GRRegs:$a), "set cp, $a", []>;
 
 let hasCtrlDep = 1 in 
diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index ac3bae5..b4c6a50 100644
--- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -137,7 +137,7 @@ static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
             if (PN->getIncomingValue(I) == CE) {
               BasicBlock *PredBB = PN->getIncomingBlock(I);
               if (PredBB->getTerminator()->getNumSuccessors() > 1)
-                PredBB = SplitEdge(PredBB, PN->getParent(), P);
+                PredBB = SplitEdge(PredBB, PN->getParent());
               Instruction *InsertPos = PredBB->getTerminator();
               Instruction *NewInst = createReplacementInstr(CE, InsertPos);
               PN->setOperand(I, NewInst);
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
index 7227411..7996020 100644
--- a/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -27,6 +27,5 @@ void XCoreSubtarget::anchor() { }
 
 XCoreSubtarget::XCoreSubtarget(const std::string &TT, const std::string &CPU,
                                const std::string &FS, const TargetMachine &TM)
-    : XCoreGenSubtargetInfo(TT, CPU, FS),
-      DL("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"),
-      InstrInfo(), FrameLowering(*this), TLInfo(TM), TSInfo(DL) {}
+    : XCoreGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
+      TLInfo(TM, *this), TSInfo(*TM.getDataLayout()) {}
diff --git a/lib/Target/XCore/XCoreSubtarget.h b/lib/Target/XCore/XCoreSubtarget.h
index 695578d..da51ef1 100644
--- a/lib/Target/XCore/XCoreSubtarget.h
+++ b/lib/Target/XCore/XCoreSubtarget.h
@@ -31,7 +31,6 @@ class StringRef;
 
 class XCoreSubtarget : public XCoreGenSubtargetInfo {
   virtual void anchor();
-  const DataLayout DL;       // Calculates type size & alignment
   XCoreInstrInfo InstrInfo;
   XCoreFrameLowering FrameLowering;
   XCoreTargetLowering TLInfo;
@@ -61,7 +60,6 @@ public:
   const TargetRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
-  const DataLayout *getDataLayout() const override { return &DL; }
 };
 } // End llvm namespace
 
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 0fa8c21..7998fc1 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -12,10 +12,11 @@
 
 #include "XCoreTargetMachine.h"
 #include "XCoreTargetObjectFile.h"
+#include "XCoreTargetTransformInfo.h"
 #include "XCore.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
@@ -28,6 +29,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
                                        CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
       TLOF(make_unique<XCoreTargetObjectFile>()),
+      DL("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 }
@@ -48,7 +50,7 @@ public:
   void addIRPasses() override;
   bool addPreISel() override;
   bool addInstSelector() override;
-  bool addPreEmitPass() override;
+  void addPreEmitPass() override;
 };
 } // namespace
 
@@ -72,9 +74,8 @@ bool XCorePassConfig::addInstSelector() {
   return false;
 }
 
-bool XCorePassConfig::addPreEmitPass() {
-  addPass(createXCoreFrameToArgsOffsetEliminationPass());
-  return false;
+void XCorePassConfig::addPreEmitPass() {
+  addPass(createXCoreFrameToArgsOffsetEliminationPass(), false);
 }
 
 // Force static initialization.
@@ -82,10 +83,7 @@ extern "C" void LLVMInitializeXCoreTarget() {
   RegisterTargetMachine<XCoreTargetMachine> X(TheXCoreTarget);
 }
 
-void XCoreTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  // Add first the target-independent BasicTTI pass, then our XCore pass. This
-  // allows the XCore pass to delegate to the target independent layer when
-  // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(this));
-  PM.add(createXCoreTargetTransformInfoPass(this));
+TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis(
+      [this](Function &) { return TargetTransformInfo(XCoreTTIImpl(this)); });
 }
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index 8ff9269..c5df07c 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -21,6 +21,7 @@ namespace llvm {
 
 class XCoreTargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  const DataLayout DL; // Calculates type size & alignment
   XCoreSubtarget Subtarget;
 public:
   XCoreTargetMachine(const Target &T, StringRef TT,
@@ -29,12 +30,13 @@ public:
                      CodeGenOpt::Level OL);
   ~XCoreTargetMachine() override;
 
+  const DataLayout *getDataLayout() const override { return &DL; }
   const XCoreSubtarget *getSubtargetImpl() const override { return &Subtarget; }
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
-  void addAnalysisPasses(PassManagerBase &PM) override;
+  TargetIRAnalysis getTargetIRAnalysis() override;
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.cpp b/lib/Target/XCore/XCoreTargetObjectFile.cpp
index 86d0de6..c435b36 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.cpp
+++ b/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -21,66 +21,43 @@ using namespace llvm;
 void XCoreTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
 
-  BSSSection =
-    Ctx.getELFSection(".dp.bss", ELF::SHT_NOBITS,
-                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
-                      ELF::XCORE_SHF_DP_SECTION,
-                      SectionKind::getBSS());
-  BSSSectionLarge =
-    Ctx.getELFSection(".dp.bss.large", ELF::SHT_NOBITS,
-                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
-                      ELF::XCORE_SHF_DP_SECTION,
-                      SectionKind::getBSS());
-  DataSection =
-    Ctx.getELFSection(".dp.data", ELF::SHT_PROGBITS,
-                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
-                      ELF::XCORE_SHF_DP_SECTION,
-                      SectionKind::getDataRel());
-  DataSectionLarge =
-    Ctx.getELFSection(".dp.data.large", ELF::SHT_PROGBITS,
-                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
-                      ELF::XCORE_SHF_DP_SECTION,
-                      SectionKind::getDataRel());
-  DataRelROSection =
-    Ctx.getELFSection(".dp.rodata", ELF::SHT_PROGBITS,
-                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
-                      ELF::XCORE_SHF_DP_SECTION,
-                      SectionKind::getReadOnlyWithRel());
-  DataRelROSectionLarge =
-    Ctx.getELFSection(".dp.rodata.large", ELF::SHT_PROGBITS,
-                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
-                      ELF::XCORE_SHF_DP_SECTION,
-                      SectionKind::getReadOnlyWithRel());
+  BSSSection = Ctx.getELFSection(".dp.bss", ELF::SHT_NOBITS,
+                                 ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                                     ELF::XCORE_SHF_DP_SECTION);
+  BSSSectionLarge = Ctx.getELFSection(".dp.bss.large", ELF::SHT_NOBITS,
+                                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                                          ELF::XCORE_SHF_DP_SECTION);
+  DataSection = Ctx.getELFSection(".dp.data", ELF::SHT_PROGBITS,
+                                  ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                                      ELF::XCORE_SHF_DP_SECTION);
+  DataSectionLarge = Ctx.getELFSection(".dp.data.large", ELF::SHT_PROGBITS,
+                                       ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                                           ELF::XCORE_SHF_DP_SECTION);
+  DataRelROSection = Ctx.getELFSection(".dp.rodata", ELF::SHT_PROGBITS,
+                                       ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                                           ELF::XCORE_SHF_DP_SECTION);
+  DataRelROSectionLarge = Ctx.getELFSection(
+      ".dp.rodata.large", ELF::SHT_PROGBITS,
+      ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::XCORE_SHF_DP_SECTION);
   ReadOnlySection =
-    Ctx.getELFSection(".cp.rodata", ELF::SHT_PROGBITS,
-                      ELF::SHF_ALLOC |
-                      ELF::XCORE_SHF_CP_SECTION,
-                      SectionKind::getReadOnlyWithRel());
+      Ctx.getELFSection(".cp.rodata", ELF::SHT_PROGBITS,
+                        ELF::SHF_ALLOC | ELF::XCORE_SHF_CP_SECTION);
   ReadOnlySectionLarge =
-    Ctx.getELFSection(".cp.rodata.large", ELF::SHT_PROGBITS,
-                      ELF::SHF_ALLOC |
-                      ELF::XCORE_SHF_CP_SECTION,
-                      SectionKind::getReadOnlyWithRel());
-  MergeableConst4Section = 
-    Ctx.getELFSection(".cp.rodata.cst4", ELF::SHT_PROGBITS,
-                      ELF::SHF_ALLOC | ELF::SHF_MERGE |
-                      ELF::XCORE_SHF_CP_SECTION,
-                      SectionKind::getMergeableConst4());
-  MergeableConst8Section = 
-    Ctx.getELFSection(".cp.rodata.cst8", ELF::SHT_PROGBITS,
-                      ELF::SHF_ALLOC | ELF::SHF_MERGE |
-                      ELF::XCORE_SHF_CP_SECTION,
-                      SectionKind::getMergeableConst8());
-  MergeableConst16Section = 
-    Ctx.getELFSection(".cp.rodata.cst16", ELF::SHT_PROGBITS,
-                      ELF::SHF_ALLOC | ELF::SHF_MERGE |
-                      ELF::XCORE_SHF_CP_SECTION,
-                      SectionKind::getMergeableConst16());
+      Ctx.getELFSection(".cp.rodata.large", ELF::SHT_PROGBITS,
+                        ELF::SHF_ALLOC | ELF::XCORE_SHF_CP_SECTION);
+  MergeableConst4Section = Ctx.getELFSection(
+      ".cp.rodata.cst4", ELF::SHT_PROGBITS,
+      ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 4, "");
+  MergeableConst8Section = Ctx.getELFSection(
+      ".cp.rodata.cst8", ELF::SHT_PROGBITS,
+      ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 8, "");
+  MergeableConst16Section = Ctx.getELFSection(
+      ".cp.rodata.cst16", ELF::SHT_PROGBITS,
+      ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 16, "");
   CStringSection =
-    Ctx.getELFSection(".cp.rodata.string", ELF::SHT_PROGBITS,
-                      ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::SHF_STRINGS |
-                      ELF::XCORE_SHF_CP_SECTION,
-                      SectionKind::getReadOnlyWithRel());
+      Ctx.getELFSection(".cp.rodata.string", ELF::SHT_PROGBITS,
+                        ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::SHF_STRINGS |
+                            ELF::XCORE_SHF_CP_SECTION);
   // TextSection       - see MObjectFileInfo.cpp
   // StaticCtorSection - see MObjectFileInfo.cpp
   // StaticDtorSection - see MObjectFileInfo.cpp
@@ -128,7 +105,7 @@ XCoreTargetObjectFile::getExplicitSectionGlobal(const GlobalValue *GV,
   if (IsCPRel && !Kind.isReadOnly())
     report_fatal_error("Using .cp. section for writeable object.");
   return getContext().getELFSection(SectionName, getXCoreSectionType(Kind),
-                                    getXCoreSectionFlags(Kind, IsCPRel), Kind);
+                                    getXCoreSectionFlags(Kind, IsCPRel));
 }
 
 const MCSection *XCoreTargetObjectFile::
@@ -146,8 +123,7 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
   }
   Type *ObjType = GV->getType()->getPointerElementType();
   if (TM.getCodeModel() == CodeModel::Small || !ObjType->isSized() ||
-      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(ObjType) <
-          CodeModelLargeSize) {
+      TM.getDataLayout()->getTypeAllocSize(ObjType) < CodeModelLargeSize) {
     if (Kind.isReadOnly())              return UseCPRel? ReadOnlySection
                                                        : DataRelROSection;
     if (Kind.isBSS() || Kind.isCommon())return BSSSection;
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.cpp b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
deleted file mode 100644
index da232da..0000000
--- a/lib/Target/XCore/XCoreTargetTransformInfo.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-//===-- XCoreTargetTransformInfo.cpp - XCore specific TTI pass ----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file implements a TargetTransformInfo analysis pass specific to the
-/// XCore target machine. It uses the target's detailed information to provide
-/// more precise answers to certain TTI queries, while letting the target
-/// independent and default TTI implementations handle the rest.
-///
-//===----------------------------------------------------------------------===//
-
-#include "XCore.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/CostTable.h"
-#include "llvm/Target/TargetLowering.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "xcoretti"
-
-// Declare the pass initialization routine locally as target-specific passes
-// don't have a target-wide initialization entry point, and so we rely on the
-// pass constructor initialization.
-namespace llvm {
-void initializeXCoreTTIPass(PassRegistry &);
-}
-
-namespace {
-
-class XCoreTTI final : public ImmutablePass, public TargetTransformInfo {
-public:
-  XCoreTTI() : ImmutablePass(ID) {
-    llvm_unreachable("This pass cannot be directly constructed");
-  }
-
-  XCoreTTI(const XCoreTargetMachine *TM)
-      : ImmutablePass(ID) {
-    initializeXCoreTTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override {
-    pushTTIStack(this);
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    TargetTransformInfo::getAnalysisUsage(AU);
-  }
-
-  static char ID;
-
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo*)this;
-    return this;
-  }
-
-  unsigned getNumberOfRegisters(bool Vector) const override {
-    if (Vector) {
-       return 0;
-    }
-    return 12;
-  }
-};
-
-} // end anonymous namespace
-
-INITIALIZE_AG_PASS(XCoreTTI, TargetTransformInfo, "xcoretti",
-                   "XCore Target Transform Info", true, true, false)
-char XCoreTTI::ID = 0;
-
-
-ImmutablePass *
-llvm::createXCoreTargetTransformInfoPass(const XCoreTargetMachine *TM) {
-  return new XCoreTTI(TM);
-}
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.h b/lib/Target/XCore/XCoreTargetTransformInfo.h
new file mode 100644
index 0000000..70b47df
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -0,0 +1,72 @@
+//===-- XCoreTargetTransformInfo.h - XCore specific TTI ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// XCore target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETTRANSFORMINFO_H
+
+#include "XCore.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> {
+  typedef BasicTTIImplBase<XCoreTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const XCoreSubtarget *ST;
+  const XCoreTargetLowering *TLI;
+
+  const XCoreSubtarget *getST() const { return ST; }
+  const XCoreTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit XCoreTTIImpl(const XCoreTargetMachine *TM)
+      : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  XCoreTTIImpl(const XCoreTTIImpl &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+  XCoreTTIImpl(XCoreTTIImpl &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+        TLI(std::move(Arg.TLI)) {}
+  XCoreTTIImpl &operator=(const XCoreTTIImpl &RHS) {
+    BaseT::operator=(static_cast<const BaseT &>(RHS));
+    ST = RHS.ST;
+    TLI = RHS.TLI;
+    return *this;
+  }
+  XCoreTTIImpl &operator=(XCoreTTIImpl &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    ST = std::move(RHS.ST);
+    TLI = std::move(RHS.TLI);
+    return *this;
+  }
+
+  unsigned getNumberOfRegisters(bool Vector) {
+    if (Vector) {
+      return 0;
+    }
+    return 12;
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Transforms/IPO/Android.mk b/lib/Transforms/IPO/Android.mk
index 1fe7d63..f08b0ad 100644
--- a/lib/Transforms/IPO/Android.mk
+++ b/lib/Transforms/IPO/Android.mk
@@ -16,6 +16,7 @@ transforms_ipo_SRC_FILES := \
   Inliner.cpp \
   Internalize.cpp \
   LoopExtractor.cpp \
+  LowerBitSets.cpp \
   MergeFunctions.cpp \
   PartialInlining.cpp \
   PassManagerBuilder.cpp \
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index c4706e8..7e48ce3 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -554,14 +554,14 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
     BasicBlock *BB = Load->getParent();
 
     AliasAnalysis::Location Loc = AA.getLocation(Load);
-    if (AA.canInstructionRangeModify(BB->front(), *Load, Loc))
+    if (AA.canInstructionRangeModRef(BB->front(), *Load, Loc,
+        AliasAnalysis::Mod))
       return false;  // Pointer is invalidated!
 
     // Now check every path from the entry block to the load for transparency.
     // To do this, we perform a depth first search on the inverse CFG from the
     // loading block.
-    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
-      BasicBlock *P = *PI;
+    for (BasicBlock *P : predecessors(BB)) {
       for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
         if (AA.canBasicBlockModify(*TranspBB, Loc))
           return false;
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 90c1c33..3df17b9 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -14,12 +14,17 @@ add_llvm_library(LLVMipo
   Inliner.cpp
   Internalize.cpp
   LoopExtractor.cpp
+  LowerBitSets.cpp
   MergeFunctions.cpp
   PartialInlining.cpp
   PassManagerBuilder.cpp
   PruneEH.cpp
   StripDeadPrototypes.cpp
   StripSymbols.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/IPO
   )
 
 add_dependencies(LLVMipo intrinsics_gen)
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 4045c09..4431311 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -146,7 +146,7 @@ namespace {
   private:
     Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses);
     Liveness SurveyUse(const Use *U, UseVector &MaybeLiveUses,
-                       unsigned RetValNum = 0);
+                       unsigned RetValNum = -1U);
     Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses);
 
     void SurveyFunction(const Function &F);
@@ -387,14 +387,32 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
 /// for void functions and 1 for functions not returning a struct. It returns
 /// the number of struct elements for functions returning a struct.
 static unsigned NumRetVals(const Function *F) {
-  if (F->getReturnType()->isVoidTy())
+  Type *RetTy = F->getReturnType();
+  if (RetTy->isVoidTy())
     return 0;
-  else if (StructType *STy = dyn_cast<StructType>(F->getReturnType()))
+  else if (StructType *STy = dyn_cast<StructType>(RetTy))
     return STy->getNumElements();
+  else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
+    return ATy->getNumElements();
   else
     return 1;
 }
 
+/// Returns the sub-type a function will return at a given Idx. Should
+/// correspond to the result type of an ExtractValue instruction executed with
+/// just that one Idx (i.e. only top-level structure is considered).
+static Type *getRetComponentType(const Function *F, unsigned Idx) {
+  Type *RetTy = F->getReturnType();
+  assert(!RetTy->isVoidTy() && "void type has no subtype");
+
+  if (StructType *STy = dyn_cast<StructType>(RetTy))
+    return STy->getElementType(Idx);
+  else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
+    return ATy->getElementType();
+  else
+    return RetTy;
+}
+
 /// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not
 /// live, it adds Use to the MaybeLiveUses argument. Returns the determined
 /// liveness of Use.
@@ -425,9 +443,24 @@ DAE::Liveness DAE::SurveyUse(const Use *U,
       // function's return value is live. We use RetValNum here, for the case
       // that U is really a use of an insertvalue instruction that uses the
       // original Use.
-      RetOrArg Use = CreateRet(RI->getParent()->getParent(), RetValNum);
-      // We might be live, depending on the liveness of Use.
-      return MarkIfNotLive(Use, MaybeLiveUses);
+      const Function *F = RI->getParent()->getParent();
+      if (RetValNum != -1U) {
+        RetOrArg Use = CreateRet(F, RetValNum);
+        // We might be live, depending on the liveness of Use.
+        return MarkIfNotLive(Use, MaybeLiveUses);
+      } else {
+        DAE::Liveness Result = MaybeLive;
+        for (unsigned i = 0; i < NumRetVals(F); ++i) {
+          RetOrArg Use = CreateRet(F, i);
+          // We might be live, depending on the liveness of Use. If any
+          // sub-value is live, then the entire value is considered live. This
+          // is a conservative choice, and better tracking is possible.
+          DAE::Liveness SubResult = MarkIfNotLive(Use, MaybeLiveUses);
+          if (Result != Live)
+            Result = SubResult;
+        }
+        return Result;
+      }
     }
     if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) {
       if (U->getOperandNo() != InsertValueInst::getAggregateOperandIndex()
@@ -541,7 +574,6 @@ void DAE::SurveyFunction(const Function &F) {
   // Keep track of the number of live retvals, so we can skip checks once all
   // of them turn out to be live.
   unsigned NumLiveRetVals = 0;
-  Type *STy = dyn_cast<StructType>(F.getReturnType());
   // Loop all uses of the function.
   for (const Use &U : F.uses()) {
     // If the function is PASSED IN as an argument, its address has been
@@ -563,34 +595,35 @@ void DAE::SurveyFunction(const Function &F) {
 
     // Now, check how our return value(s) is/are used in this caller. Don't
     // bother checking return values if all of them are live already.
-    if (NumLiveRetVals != RetCount) {
-      if (STy) {
-        // Check all uses of the return value.
-        for (const User *U : TheCall->users()) {
-          const ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U);
-          if (Ext && Ext->hasIndices()) {
-            // This use uses a part of our return value, survey the uses of
-            // that part and store the results for this index only.
-            unsigned Idx = *Ext->idx_begin();
-            if (RetValLiveness[Idx] != Live) {
-              RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]);
-              if (RetValLiveness[Idx] == Live)
-                NumLiveRetVals++;
-            }
-          } else {
-            // Used by something else than extractvalue. Mark all return
-            // values as live.
-            for (unsigned i = 0; i != RetCount; ++i )
-              RetValLiveness[i] = Live;
-            NumLiveRetVals = RetCount;
-            break;
-          }
+    if (NumLiveRetVals == RetCount)
+      continue;
+
+    // Check all uses of the return value.
+    for (const Use &U : TheCall->uses()) {
+      if (ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U.getUser())) {
+        // This use uses a part of our return value, survey the uses of
+        // that part and store the results for this index only.
+        unsigned Idx = *Ext->idx_begin();
+        if (RetValLiveness[Idx] != Live) {
+          RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]);
+          if (RetValLiveness[Idx] == Live)
+            NumLiveRetVals++;
         }
       } else {
-        // Single return value
-        RetValLiveness[0] = SurveyUses(TheCall, MaybeLiveRetUses[0]);
-        if (RetValLiveness[0] == Live)
+        // Used by something else than extractvalue. Survey, but assume that the
+        // result applies to all sub-values.
+        UseVector MaybeLiveAggregateUses;
+        if (SurveyUse(&U, MaybeLiveAggregateUses) == Live) {
           NumLiveRetVals = RetCount;
+          RetValLiveness.assign(RetCount, Live);
+          break;
+        } else {
+          for (unsigned i = 0; i != RetCount; ++i) {
+            if (RetValLiveness[i] != Live)
+              MaybeLiveRetUses[i].append(MaybeLiveAggregateUses.begin(),
+                                         MaybeLiveAggregateUses.end());
+          }
+        }
       }
     }
   }
@@ -775,39 +808,29 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   if (RetTy->isVoidTy() || HasLiveReturnedArg) {
     NRetTy = RetTy;
   } else {
-    StructType *STy = dyn_cast<StructType>(RetTy);
-    if (STy)
-      // Look at each of the original return values individually.
-      for (unsigned i = 0; i != RetCount; ++i) {
-        RetOrArg Ret = CreateRet(F, i);
-        if (LiveValues.erase(Ret)) {
-          RetTypes.push_back(STy->getElementType(i));
-          NewRetIdxs[i] = RetTypes.size() - 1;
-        } else {
-          ++NumRetValsEliminated;
-          DEBUG(dbgs() << "DAE - Removing return value " << i << " from "
-                << F->getName() << "\n");
-        }
-      }
-    else
-      // We used to return a single value.
-      if (LiveValues.erase(CreateRet(F, 0))) {
-        RetTypes.push_back(RetTy);
-        NewRetIdxs[0] = 0;
+    // Look at each of the original return values individually.
+    for (unsigned i = 0; i != RetCount; ++i) {
+      RetOrArg Ret = CreateRet(F, i);
+      if (LiveValues.erase(Ret)) {
+        RetTypes.push_back(getRetComponentType(F, i));
+        NewRetIdxs[i] = RetTypes.size() - 1;
       } else {
-        DEBUG(dbgs() << "DAE - Removing return value from " << F->getName()
-              << "\n");
         ++NumRetValsEliminated;
+        DEBUG(dbgs() << "DAE - Removing return value " << i << " from "
+              << F->getName() << "\n");
+      }
+    }
+    if (RetTypes.size() > 1) {
+      // More than one return type? Reduce it down to size.
+      if (StructType *STy = dyn_cast<StructType>(RetTy)) {
+        // Make the new struct packed if we used to return a packed struct
+        // already.
+        NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked());
+      } else {
+        assert(isa<ArrayType>(RetTy) && "unexpected multi-value return");
+        NRetTy = ArrayType::get(RetTypes[0], RetTypes.size());
       }
-    if (RetTypes.size() > 1)
-      // More than one return type? Return a struct with them. Also, if we used
-      // to return a struct and didn't change the number of return values,
-      // return a struct again. This prevents changing {something} into
-      // something and {} into void.
-      // Make the new struct packed if we used to return a packed struct
-      // already.
-      NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked());
-    else if (RetTypes.size() == 1)
+    } else if (RetTypes.size() == 1)
       // One return type? Just a simple value then, but only if we didn't use to
       // return a struct with that simple value before.
       NRetTy = RetTypes.front();
@@ -959,9 +982,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
         if (!Call->getType()->isX86_MMXTy())
           Call->replaceAllUsesWith(Constant::getNullValue(Call->getType()));
       } else {
-        assert(RetTy->isStructTy() &&
+        assert((RetTy->isStructTy() || RetTy->isArrayTy()) &&
                "Return type changed, but not into a void. The old return type"
-               " must have been a struct!");
+               " must have been a struct or an array!");
         Instruction *InsertPt = Call;
         if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
           BasicBlock::iterator IP = II->getNormalDest()->begin();
@@ -969,9 +992,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
           InsertPt = IP;
         }
 
-        // We used to return a struct. Instead of doing smart stuff with all the
-        // uses of this struct, we will just rebuild it using
-        // extract/insertvalue chaining and let instcombine clean that up.
+        // We used to return a struct or array. Instead of doing smart stuff
+        // with all the uses, we will just rebuild it using extract/insertvalue
+        // chaining and let instcombine clean that up.
         //
         // Start out building up our return value from undef
         Value *RetVal = UndefValue::get(RetTy);
@@ -1034,8 +1057,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
         if (NFTy->getReturnType()->isVoidTy()) {
           RetVal = nullptr;
         } else {
-          assert (RetTy->isStructTy());
-          // The original return value was a struct, insert
+          assert(RetTy->isStructTy() || RetTy->isArrayTy());
+          // The original return value was a struct or array, insert
           // extractvalue/insertvalue chains to extract only the values we need
           // to return and insert them into our new result.
           // This does generate messy code, but we'll let it to instcombine to
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 823ae53..8925e4c 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -31,7 +31,7 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "functionattrs"
@@ -124,7 +124,7 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       AU.addRequired<AliasAnalysis>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
       CallGraphSCCPass::getAnalysisUsage(AU);
     }
 
@@ -139,7 +139,7 @@ INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
 
@@ -1702,7 +1702,7 @@ bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) {
 
 bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
   AA = &getAnalysis<AliasAnalysis>();
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   bool Changed = annotateLibraryCalls(SCC);
   Changed |= AddReadAttrs(SCC);
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 705e929..0c844fe 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -219,6 +219,9 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
     if (F->hasPrefixData())
       MarkUsedGlobalsAsNeeded(F->getPrefixData());
 
+    if (F->hasPrologueData())
+      MarkUsedGlobalsAsNeeded(F->getPrologueData());
+
     for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
       for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
         for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U)
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 6e0ae83..45e04f1 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -38,7 +38,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/CtorUtils.h"
 #include "llvm/Transforms/Utils/GlobalStatus.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -68,7 +68,7 @@ STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
 namespace {
   struct GlobalOpt : public ModulePass {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
     static char ID; // Pass identification, replacement for typeid
     GlobalOpt() : ModulePass(ID) {
@@ -95,7 +95,7 @@ namespace {
 char GlobalOpt::ID = 0;
 INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt",
                 "Global Variable Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(GlobalOpt, "globalopt",
                 "Global Variable Optimizer", false, false)
 
@@ -3042,7 +3042,7 @@ bool GlobalOpt::runOnModule(Module &M) {
 
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   bool LocalChange = true;
   while (LocalChange) {
diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp
index b4d31d8..fcacec3 100644
--- a/lib/Transforms/IPO/IPO.cpp
+++ b/lib/Transforms/IPO/IPO.cpp
@@ -16,7 +16,7 @@
 #include "llvm-c/Initialization.h"
 #include "llvm-c/Transforms/IPO.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Transforms/IPO.h"
 
 using namespace llvm;
@@ -36,6 +36,7 @@ void llvm::initializeIPO(PassRegistry &Registry) {
   initializeLoopExtractorPass(Registry);
   initializeBlockExtractorPassPass(Registry);
   initializeSingleLoopExtractorPass(Registry);
+  initializeLowerBitSetsPass(Registry);
   initializeMergeFunctionsPass(Registry);
   initializePartialInlinerPass(Registry);
   initializePruneEHPass(Registry);
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index 819b2e0..dc56a02 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -15,7 +15,7 @@
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/IR/CallSite.h"
@@ -68,7 +68,7 @@ char AlwaysInliner::ID = 0;
 INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline",
                 "Inliner for always_inline functions", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
 INITIALIZE_PASS_END(AlwaysInliner, "always-inline",
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index d9a2b9e..9b01d81 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -13,7 +13,7 @@
 
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/IR/CallSite.h"
@@ -76,7 +76,7 @@ char SimpleInliner::ID = 0;
 INITIALIZE_PASS_BEGIN(SimpleInliner, "inline",
                 "Function Integration/Inlining", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
 INITIALIZE_PASS_END(SimpleInliner, "inline",
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 3abe7a8..305ad7a 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -17,7 +17,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/IR/CallSite.h"
@@ -29,7 +29,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
@@ -77,7 +77,7 @@ Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime)
 /// always explicitly call the implementation here.
 void Inliner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<AliasAnalysis>();
-  AU.addRequired<AssumptionTracker>();
+  AU.addRequired<AssumptionCacheTracker>();
   CallGraphSCCPass::getAnalysisUsage(AU);
 }
 
@@ -97,25 +97,17 @@ static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) {
   AttributeSet OldSSPAttr = AttributeSet::get(Caller->getContext(),
                                               AttributeSet::FunctionIndex,
                                               B);
-  AttributeSet CallerAttr = Caller->getAttributes(),
-               CalleeAttr = Callee->getAttributes();
 
-  if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
-                              Attribute::StackProtectReq)) {
+  if (Callee->hasFnAttribute(Attribute::StackProtectReq)) {
     Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
     Caller->addFnAttr(Attribute::StackProtectReq);
-  } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
-                                     Attribute::StackProtectStrong) &&
-             !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
-                                      Attribute::StackProtectReq)) {
+  } else if (Callee->hasFnAttribute(Attribute::StackProtectStrong) &&
+             !Caller->hasFnAttribute(Attribute::StackProtectReq)) {
     Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
     Caller->addFnAttr(Attribute::StackProtectStrong);
-  } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
-                                     Attribute::StackProtect) &&
-           !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
-                                    Attribute::StackProtectReq) &&
-           !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
-                                    Attribute::StackProtectStrong))
+  } else if (Callee->hasFnAttribute(Attribute::StackProtect) &&
+             !Caller->hasFnAttribute(Attribute::StackProtectReq) &&
+             !Caller->hasFnAttribute(Attribute::StackProtectStrong))
     Caller->addFnAttr(Attribute::StackProtect);
 }
 
@@ -273,8 +265,7 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const {
   // would decrease the threshold.
   Function *Caller = CS.getCaller();
   bool OptSize = Caller && !Caller->isDeclaration() &&
-    Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                         Attribute::OptimizeForSize);
+                 Caller->hasFnAttribute(Attribute::OptimizeForSize);
   if (!(InlineLimit.getNumOccurrences() > 0) && OptSize &&
       OptSizeThreshold < thres)
     thres = OptSizeThreshold;
@@ -283,17 +274,14 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const {
   // and the caller does not need to minimize its size.
   Function *Callee = CS.getCalledFunction();
   bool InlineHint = Callee && !Callee->isDeclaration() &&
-    Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                         Attribute::InlineHint);
-  if (InlineHint && HintThreshold > thres
-      && !Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                               Attribute::MinSize))
+                    Callee->hasFnAttribute(Attribute::InlineHint);
+  if (InlineHint && HintThreshold > thres &&
+      !Caller->hasFnAttribute(Attribute::MinSize))
     thres = HintThreshold;
 
   // Listen to the cold attribute when it would decrease the threshold.
   bool ColdCallee = Callee && !Callee->isDeclaration() &&
-    Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                         Attribute::Cold);
+                    Callee->hasFnAttribute(Attribute::Cold);
   // Command line argument for InlineLimit will override the default
   // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold,
   // do not use the default cold threshold even if it is smaller.
@@ -443,10 +431,11 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
 
 bool Inliner::runOnSCC(CallGraphSCC &SCC) {
   CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
-  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+  AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-  const TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
   AliasAnalysis *AA = &getAnalysis<AliasAnalysis>();
 
   SmallPtrSet<Function*, 8> SCCFunctions;
@@ -506,8 +495,8 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
 
   
   InlinedArrayAllocasTy InlinedArrayAllocas;
-  InlineFunctionInfo InlineInfo(&CG, DL, AA, AT);
-  
+  InlineFunctionInfo InlineInfo(&CG, DL, AA, ACT);
+
   // Now that we have all of the call sites, loop over them and inline them if
   // it looks profitable to do so.
   bool Changed = false;
@@ -658,9 +647,7 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
     // Handle the case when this function is called and we only want to care
     // about always-inline functions. This is a bit of a hack to share code
     // between here and the InlineAlways pass.
-    if (AlwaysInlineOnly &&
-        !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                         Attribute::AlwaysInline))
+    if (AlwaysInlineOnly && !F->hasFnAttribute(Attribute::AlwaysInline))
       continue;
 
     // If the only remaining users of the function are dead constants, remove
diff --git a/lib/Transforms/IPO/LLVMBuild.txt b/lib/Transforms/IPO/LLVMBuild.txt
index 77e0b22..575dce4 100644
--- a/lib/Transforms/IPO/LLVMBuild.txt
+++ b/lib/Transforms/IPO/LLVMBuild.txt
@@ -20,4 +20,4 @@ type = Library
 name = IPO
 parent = Transforms
 library_name = ipo
-required_libraries = Analysis Core IPA InstCombine Scalar Support Target TransformUtils Vectorize
+required_libraries = Analysis Core IPA InstCombine Scalar Support TransformUtils Vectorize
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
index 20414aa..41334ca 100644
--- a/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -242,7 +242,7 @@ void BlockExtractorPass::SplitLandingPadPreds(Function *F) {
     if (!Split) continue;
 
     SmallVector<BasicBlock*, 2> NewBBs;
-    SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", nullptr, NewBBs);
+    SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs);
   }
 }
 
diff --git a/lib/Transforms/IPO/LowerBitSets.cpp b/lib/Transforms/IPO/LowerBitSets.cpp
new file mode 100644
index 0000000..0a22a80
--- /dev/null
+++ b/lib/Transforms/IPO/LowerBitSets.cpp
@@ -0,0 +1,612 @@
+//===-- LowerBitSets.cpp - Bitset lowering pass ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers bitset metadata and calls to the llvm.bitset.test intrinsic.
+// See http://llvm.org/docs/LangRef.html#bitsets for more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/LowerBitSets.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lowerbitsets"
+
+STATISTIC(NumBitSetsCreated, "Number of bitsets created");
+STATISTIC(NumBitSetCallsLowered, "Number of bitset calls lowered");
+STATISTIC(NumBitSetDisjointSets, "Number of disjoint sets of bitsets");
+
+bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const {
+  if (Offset < ByteOffset)
+    return false;
+
+  if ((Offset - ByteOffset) % (uint64_t(1) << AlignLog2) != 0)
+    return false;
+
+  uint64_t BitOffset = (Offset - ByteOffset) >> AlignLog2;
+  if (BitOffset >= BitSize)
+    return false;
+
+  return (Bits[BitOffset / 8] >> (BitOffset % 8)) & 1;
+}
+
+bool BitSetInfo::containsValue(
+    const DataLayout *DL,
+    const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout, Value *V,
+    uint64_t COffset) const {
+  if (auto GV = dyn_cast<GlobalVariable>(V)) {
+    auto I = GlobalLayout.find(GV);
+    if (I == GlobalLayout.end())
+      return false;
+    return containsGlobalOffset(I->second + COffset);
+  }
+
+  if (auto GEP = dyn_cast<GEPOperator>(V)) {
+    APInt APOffset(DL->getPointerSizeInBits(0), 0);
+    bool Result = GEP->accumulateConstantOffset(*DL, APOffset);
+    if (!Result)
+      return false;
+    COffset += APOffset.getZExtValue();
+    return containsValue(DL, GlobalLayout, GEP->getPointerOperand(),
+                         COffset);
+  }
+
+  if (auto Op = dyn_cast<Operator>(V)) {
+    if (Op->getOpcode() == Instruction::BitCast)
+      return containsValue(DL, GlobalLayout, Op->getOperand(0), COffset);
+
+    if (Op->getOpcode() == Instruction::Select)
+      return containsValue(DL, GlobalLayout, Op->getOperand(1), COffset) &&
+             containsValue(DL, GlobalLayout, Op->getOperand(2), COffset);
+  }
+
+  return false;
+}
+
+BitSetInfo BitSetBuilder::build() {
+  if (Min > Max)
+    Min = 0;
+
+  // Normalize each offset against the minimum observed offset, and compute
+  // the bitwise OR of each of the offsets. The number of trailing zeros
+  // in the mask gives us the log2 of the alignment of all offsets, which
+  // allows us to compress the bitset by only storing one bit per aligned
+  // address.
+  uint64_t Mask = 0;
+  for (uint64_t &Offset : Offsets) {
+    Offset -= Min;
+    Mask |= Offset;
+  }
+
+  BitSetInfo BSI;
+  BSI.ByteOffset = Min;
+
+  BSI.AlignLog2 = 0;
+  // FIXME: Can probably do something smarter if all offsets are 0.
+  if (Mask != 0)
+    BSI.AlignLog2 = countTrailingZeros(Mask, ZB_Undefined);
+
+  // Build the compressed bitset while normalizing the offsets against the
+  // computed alignment.
+  BSI.BitSize = ((Max - Min) >> BSI.AlignLog2) + 1;
+  uint64_t ByteSize = (BSI.BitSize + 7) / 8;
+  BSI.Bits.resize(ByteSize);
+  for (uint64_t Offset : Offsets) {
+    Offset >>= BSI.AlignLog2;
+    BSI.Bits[Offset / 8] |= 1 << (Offset % 8);
+  }
+
+  return BSI;
+}
+
+void GlobalLayoutBuilder::addFragment(const std::set<uint64_t> &F) {
+  // Create a new fragment to hold the layout for F.
+  Fragments.emplace_back();
+  std::vector<uint64_t> &Fragment = Fragments.back();
+  uint64_t FragmentIndex = Fragments.size() - 1;
+
+  for (auto ObjIndex : F) {
+    uint64_t OldFragmentIndex = FragmentMap[ObjIndex];
+    if (OldFragmentIndex == 0) {
+      // We haven't seen this object index before, so just add it to the current
+      // fragment.
+      Fragment.push_back(ObjIndex);
+    } else {
+      // This index belongs to an existing fragment. Copy the elements of the
+      // old fragment into this one and clear the old fragment. We don't update
+      // the fragment map just yet, this ensures that any further references to
+      // indices from the old fragment in this fragment do not insert any more
+      // indices.
+      std::vector<uint64_t> &OldFragment = Fragments[OldFragmentIndex];
+      Fragment.insert(Fragment.end(), OldFragment.begin(), OldFragment.end());
+      OldFragment.clear();
+    }
+  }
+
+  // Update the fragment map to point our object indices to this fragment.
+  for (uint64_t ObjIndex : Fragment)
+    FragmentMap[ObjIndex] = FragmentIndex;
+}
+
+namespace {
+
+struct LowerBitSets : public ModulePass {
+  static char ID;
+  LowerBitSets() : ModulePass(ID) {
+    initializeLowerBitSetsPass(*PassRegistry::getPassRegistry());
+  }
+
+  const DataLayout *DL;
+  IntegerType *Int1Ty;
+  IntegerType *Int8Ty;
+  IntegerType *Int32Ty;
+  Type *Int32PtrTy;
+  IntegerType *Int64Ty;
+  Type *IntPtrTy;
+
+  // The llvm.bitsets named metadata.
+  NamedMDNode *BitSetNM;
+
+  // Mapping from bitset mdstrings to the call sites that test them.
+  DenseMap<MDString *, std::vector<CallInst *>> BitSetTestCallSites;
+
+  BitSetInfo
+  buildBitSet(MDString *BitSet,
+              const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout);
+  Value *createBitSetTest(IRBuilder<> &B, const BitSetInfo &BSI,
+                          GlobalVariable *BitSetGlobal, Value *BitOffset);
+  Value *
+  lowerBitSetCall(CallInst *CI, const BitSetInfo &BSI,
+                  GlobalVariable *BitSetGlobal, GlobalVariable *CombinedGlobal,
+                  const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout);
+  void buildBitSetsFromGlobals(Module &M,
+                               const std::vector<MDString *> &BitSets,
+                               const std::vector<GlobalVariable *> &Globals);
+  bool buildBitSets(Module &M);
+  bool eraseBitSetMetadata(Module &M);
+
+  bool doInitialization(Module &M) override;
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+INITIALIZE_PASS_BEGIN(LowerBitSets, "lowerbitsets",
+                "Lower bitset metadata", false, false)
+INITIALIZE_PASS_END(LowerBitSets, "lowerbitsets",
+                "Lower bitset metadata", false, false)
+char LowerBitSets::ID = 0;
+
+ModulePass *llvm::createLowerBitSetsPass() { return new LowerBitSets; }
+
+bool LowerBitSets::doInitialization(Module &M) {
+  DL = M.getDataLayout();
+  if (!DL)
+    report_fatal_error("Data layout required");
+
+  Int1Ty = Type::getInt1Ty(M.getContext());
+  Int8Ty = Type::getInt8Ty(M.getContext());
+  Int32Ty = Type::getInt32Ty(M.getContext());
+  Int32PtrTy = PointerType::getUnqual(Int32Ty);
+  Int64Ty = Type::getInt64Ty(M.getContext());
+  IntPtrTy = DL->getIntPtrType(M.getContext(), 0);
+
+  BitSetNM = M.getNamedMetadata("llvm.bitsets");
+
+  BitSetTestCallSites.clear();
+
+  return false;
+}
+
+/// Build a bit set for BitSet using the object layouts in
+/// GlobalLayout.
+BitSetInfo LowerBitSets::buildBitSet(
+    MDString *BitSet,
+    const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) {
+  BitSetBuilder BSB;
+
+  // Compute the byte offset of each element of this bitset.
+  if (BitSetNM) {
+    for (MDNode *Op : BitSetNM->operands()) {
+      if (Op->getOperand(0) != BitSet || !Op->getOperand(1))
+        continue;
+      auto OpGlobal = cast<GlobalVariable>(
+          cast<ConstantAsMetadata>(Op->getOperand(1))->getValue());
+      uint64_t Offset =
+          cast<ConstantInt>(cast<ConstantAsMetadata>(Op->getOperand(2))
+                                ->getValue())->getZExtValue();
+
+      Offset += GlobalLayout.find(OpGlobal)->second;
+
+      BSB.addOffset(Offset);
+    }
+  }
+
+  return BSB.build();
+}
+
+/// Build a test that bit BitOffset mod sizeof(Bits)*8 is set in
+/// Bits. This pattern matches to the bt instruction on x86.
+static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits,
+                                  Value *BitOffset) {
+  auto BitsType = cast<IntegerType>(Bits->getType());
+  unsigned BitWidth = BitsType->getBitWidth();
+
+  BitOffset = B.CreateZExtOrTrunc(BitOffset, BitsType);
+  Value *BitIndex =
+      B.CreateAnd(BitOffset, ConstantInt::get(BitsType, BitWidth - 1));
+  Value *BitMask = B.CreateShl(ConstantInt::get(BitsType, 1), BitIndex);
+  Value *MaskedBits = B.CreateAnd(Bits, BitMask);
+  return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0));
+}
+
+/// Build a test that bit BitOffset is set in BSI, where
+/// BitSetGlobal is a global containing the bits in BSI.
+Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, const BitSetInfo &BSI,
+                                      GlobalVariable *BitSetGlobal,
+                                      Value *BitOffset) {
+  if (BSI.Bits.size() <= 8) {
+    // If the bit set is sufficiently small, we can avoid a load by bit testing
+    // a constant.
+    IntegerType *BitsTy;
+    if (BSI.Bits.size() <= 4)
+      BitsTy = Int32Ty;
+    else
+      BitsTy = Int64Ty;
+
+    uint64_t Bits = 0;
+    for (auto I = BSI.Bits.rbegin(), E = BSI.Bits.rend(); I != E; ++I) {
+      Bits <<= 8;
+      Bits |= *I;
+    }
+    Constant *BitsConst = ConstantInt::get(BitsTy, Bits);
+    return createMaskedBitTest(B, BitsConst, BitOffset);
+  } else {
+    // TODO: We might want to use the memory variant of the bt instruction
+    // with the previously computed bit offset at -Os. This instruction does
+    // exactly what we want but has been benchmarked as being slower than open
+    // coding the load+bt.
+    Value *BitSetGlobalOffset =
+        B.CreateLShr(BitOffset, ConstantInt::get(IntPtrTy, 5));
+    Value *BitSetEntryAddr = B.CreateGEP(
+        ConstantExpr::getBitCast(BitSetGlobal, Int32PtrTy), BitSetGlobalOffset);
+    Value *BitSetEntry = B.CreateLoad(BitSetEntryAddr);
+
+    return createMaskedBitTest(B, BitSetEntry, BitOffset);
+  }
+}
+
+/// Lower a llvm.bitset.test call to its implementation. Returns the value to
+/// replace the call with.
+Value *LowerBitSets::lowerBitSetCall(
+    CallInst *CI, const BitSetInfo &BSI, GlobalVariable *BitSetGlobal,
+    GlobalVariable *CombinedGlobal,
+    const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) {
+  Value *Ptr = CI->getArgOperand(0);
+
+  if (BSI.containsValue(DL, GlobalLayout, Ptr))
+    return ConstantInt::getTrue(BitSetGlobal->getParent()->getContext());
+
+  Constant *GlobalAsInt = ConstantExpr::getPtrToInt(CombinedGlobal, IntPtrTy);
+  Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd(
+      GlobalAsInt, ConstantInt::get(IntPtrTy, BSI.ByteOffset));
+
+  BasicBlock *InitialBB = CI->getParent();
+
+  IRBuilder<> B(CI);
+
+  Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy);
+
+  if (BSI.isSingleOffset())
+    return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt);
+
+  Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt);
+
+  Value *BitOffset;
+  if (BSI.AlignLog2 == 0) {
+    BitOffset = PtrOffset;
+  } else {
+    // We need to check that the offset both falls within our range and is
+    // suitably aligned. We can check both properties at the same time by
+    // performing a right rotate by log2(alignment) followed by an integer
+    // comparison against the bitset size. The rotate will move the lower
+    // order bits that need to be zero into the higher order bits of the
+    // result, causing the comparison to fail if they are nonzero. The rotate
+    // also conveniently gives us a bit offset to use during the load from
+    // the bitset.
+    Value *OffsetSHR =
+        B.CreateLShr(PtrOffset, ConstantInt::get(IntPtrTy, BSI.AlignLog2));
+    Value *OffsetSHL = B.CreateShl(
+        PtrOffset, ConstantInt::get(IntPtrTy, DL->getPointerSizeInBits(0) -
+                                                  BSI.AlignLog2));
+    BitOffset = B.CreateOr(OffsetSHR, OffsetSHL);
+  }
+
+  Constant *BitSizeConst = ConstantInt::get(IntPtrTy, BSI.BitSize);
+  Value *OffsetInRange = B.CreateICmpULT(BitOffset, BitSizeConst);
+
+  // If the bit set is all ones, testing against it is unnecessary.
+  if (BSI.isAllOnes())
+    return OffsetInRange;
+
+  TerminatorInst *Term = SplitBlockAndInsertIfThen(OffsetInRange, CI, false);
+  IRBuilder<> ThenB(Term);
+
+  // Now that we know that the offset is in range and aligned, load the
+  // appropriate bit from the bitset.
+  Value *Bit = createBitSetTest(ThenB, BSI, BitSetGlobal, BitOffset);
+
+  // The value we want is 0 if we came directly from the initial block
+  // (having failed the range or alignment checks), or the loaded bit if
+  // we came from the block in which we loaded it.
+  B.SetInsertPoint(CI);
+  PHINode *P = B.CreatePHI(Int1Ty, 2);
+  P->addIncoming(ConstantInt::get(Int1Ty, 0), InitialBB);
+  P->addIncoming(Bit, ThenB.GetInsertBlock());
+  return P;
+}
+
+/// Given a disjoint set of bitsets and globals, layout the globals, build the
+/// bit sets and lower the llvm.bitset.test calls.
+void LowerBitSets::buildBitSetsFromGlobals(
+    Module &M,
+    const std::vector<MDString *> &BitSets,
+    const std::vector<GlobalVariable *> &Globals) {
+  // Build a new global with the combined contents of the referenced globals.
+  std::vector<Constant *> GlobalInits;
+  for (GlobalVariable *G : Globals) {
+    GlobalInits.push_back(G->getInitializer());
+    uint64_t InitSize = DL->getTypeAllocSize(G->getInitializer()->getType());
+
+    // Compute the amount of padding required to align the next element to the
+    // next power of 2.
+    uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize;
+
+    // Cap at 128 was found experimentally to have a good data/instruction
+    // overhead tradeoff.
+    if (Padding > 128)
+      Padding = RoundUpToAlignment(InitSize, 128) - InitSize;
+
+    GlobalInits.push_back(
+        ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding)));
+  }
+  if (!GlobalInits.empty())
+    GlobalInits.pop_back();
+  Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits);
+  auto CombinedGlobal =
+      new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true,
+                         GlobalValue::PrivateLinkage, NewInit);
+
+  const StructLayout *CombinedGlobalLayout =
+      DL->getStructLayout(cast<StructType>(NewInit->getType()));
+
+  // Compute the offsets of the original globals within the new global.
+  DenseMap<GlobalVariable *, uint64_t> GlobalLayout;
+  for (unsigned I = 0; I != Globals.size(); ++I)
+    // Multiply by 2 to account for padding elements.
+    GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2);
+
+  // For each bitset in this disjoint set...
+  for (MDString *BS : BitSets) {
+    // Build the bitset.
+    BitSetInfo BSI = buildBitSet(BS, GlobalLayout);
+
+    // Create a global in which to store it.
+    ++NumBitSetsCreated;
+    Constant *BitsConst = ConstantDataArray::get(M.getContext(), BSI.Bits);
+    auto BitSetGlobal = new GlobalVariable(
+        M, BitsConst->getType(), /*isConstant=*/true,
+        GlobalValue::PrivateLinkage, BitsConst, BS->getString() + ".bits");
+
+    // Lower each call to llvm.bitset.test for this bitset.
+    for (CallInst *CI : BitSetTestCallSites[BS]) {
+      ++NumBitSetCallsLowered;
+      Value *Lowered =
+          lowerBitSetCall(CI, BSI, BitSetGlobal, CombinedGlobal, GlobalLayout);
+      CI->replaceAllUsesWith(Lowered);
+      CI->eraseFromParent();
+    }
+  }
+
+  // Build aliases pointing to offsets into the combined global for each
+  // global from which we built the combined global, and replace references
+  // to the original globals with references to the aliases.
+  for (unsigned I = 0; I != Globals.size(); ++I) {
+    // Multiply by 2 to account for padding elements.
+    Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
+                                      ConstantInt::get(Int32Ty, I * 2)};
+    Constant *CombinedGlobalElemPtr =
+        ConstantExpr::getGetElementPtr(CombinedGlobal, CombinedGlobalIdxs);
+    GlobalAlias *GAlias = GlobalAlias::create(
+        Globals[I]->getType()->getElementType(),
+        Globals[I]->getType()->getAddressSpace(), Globals[I]->getLinkage(),
+        "", CombinedGlobalElemPtr, &M);
+    GAlias->takeName(Globals[I]);
+    Globals[I]->replaceAllUsesWith(GAlias);
+    Globals[I]->eraseFromParent();
+  }
+}
+
+/// Lower all bit sets in this module.
+bool LowerBitSets::buildBitSets(Module &M) {
+  Function *BitSetTestFunc =
+      M.getFunction(Intrinsic::getName(Intrinsic::bitset_test));
+  if (!BitSetTestFunc)
+    return false;
+
+  // Equivalence class set containing bitsets and the globals they reference.
+  // This is used to partition the set of bitsets in the module into disjoint
+  // sets.
+  typedef EquivalenceClasses<PointerUnion<GlobalVariable *, MDString *>>
+      GlobalClassesTy;
+  GlobalClassesTy GlobalClasses;
+
+  for (const Use &U : BitSetTestFunc->uses()) {
+    auto CI = cast<CallInst>(U.getUser());
+
+    auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+    if (!BitSetMDVal || !isa<MDString>(BitSetMDVal->getMetadata()))
+      report_fatal_error(
+          "Second argument of llvm.bitset.test must be metadata string");
+    auto BitSet = cast<MDString>(BitSetMDVal->getMetadata());
+
+    // Add the call site to the list of call sites for this bit set. We also use
+    // BitSetTestCallSites to keep track of whether we have seen this bit set
+    // before. If we have, we don't need to re-add the referenced globals to the
+    // equivalence class.
+    std::pair<DenseMap<MDString *, std::vector<CallInst *>>::iterator,
+              bool> Ins =
+        BitSetTestCallSites.insert(
+            std::make_pair(BitSet, std::vector<CallInst *>()));
+    Ins.first->second.push_back(CI);
+    if (!Ins.second)
+      continue;
+
+    // Add the bitset to the equivalence class.
+    GlobalClassesTy::iterator GCI = GlobalClasses.insert(BitSet);
+    GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
+
+    if (!BitSetNM)
+      continue;
+
+    // Verify the bitset metadata and add the referenced globals to the bitset's
+    // equivalence class.
+    for (MDNode *Op : BitSetNM->operands()) {
+      if (Op->getNumOperands() != 3)
+        report_fatal_error(
+            "All operands of llvm.bitsets metadata must have 3 elements");
+
+      if (Op->getOperand(0) != BitSet || !Op->getOperand(1))
+        continue;
+
+      auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1));
+      if (!OpConstMD)
+        report_fatal_error("Bit set element must be a constant");
+      auto OpGlobal = dyn_cast<GlobalVariable>(OpConstMD->getValue());
+      if (!OpGlobal)
+        report_fatal_error("Bit set element must refer to global");
+
+      auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2));
+      if (!OffsetConstMD)
+        report_fatal_error("Bit set element offset must be a constant");
+      auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue());
+      if (!OffsetInt)
+        report_fatal_error(
+            "Bit set element offset must be an integer constant");
+
+      CurSet = GlobalClasses.unionSets(
+          CurSet, GlobalClasses.findLeader(GlobalClasses.insert(OpGlobal)));
+    }
+  }
+
+  if (GlobalClasses.empty())
+    return false;
+
+  // For each disjoint set we found...
+  for (GlobalClassesTy::iterator I = GlobalClasses.begin(),
+                                 E = GlobalClasses.end();
+       I != E; ++I) {
+    if (!I->isLeader()) continue;
+
+    ++NumBitSetDisjointSets;
+
+    // Build the list of bitsets and referenced globals in this disjoint set.
+    std::vector<MDString *> BitSets;
+    std::vector<GlobalVariable *> Globals;
+    llvm::DenseMap<MDString *, uint64_t> BitSetIndices;
+    llvm::DenseMap<GlobalVariable *, uint64_t> GlobalIndices;
+    for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
+         MI != GlobalClasses.member_end(); ++MI) {
+      if ((*MI).is<MDString *>()) {
+        BitSetIndices[MI->get<MDString *>()] = BitSets.size();
+        BitSets.push_back(MI->get<MDString *>());
+      } else {
+        GlobalIndices[MI->get<GlobalVariable *>()] = Globals.size();
+        Globals.push_back(MI->get<GlobalVariable *>());
+      }
+    }
+
+    // For each bitset, build a set of indices that refer to globals referenced
+    // by the bitset.
+    std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size());
+    if (BitSetNM) {
+      for (MDNode *Op : BitSetNM->operands()) {
+        // Op = { bitset name, global, offset }
+        if (!Op->getOperand(1))
+          continue;
+        auto I = BitSetIndices.find(cast<MDString>(Op->getOperand(0)));
+        if (I == BitSetIndices.end())
+          continue;
+
+        auto OpGlobal = cast<GlobalVariable>(
+            cast<ConstantAsMetadata>(Op->getOperand(1))->getValue());
+        BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]);
+      }
+    }
+
+    // Order the sets of indices by size. The GlobalLayoutBuilder works best
+    // when given small index sets first.
+    std::stable_sort(
+        BitSetMembers.begin(), BitSetMembers.end(),
+        [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) {
+          return O1.size() < O2.size();
+        });
+
+    // Create a GlobalLayoutBuilder and provide it with index sets as layout
+    // fragments. The GlobalLayoutBuilder tries to lay out members of fragments
+    // as close together as possible.
+    GlobalLayoutBuilder GLB(Globals.size());
+    for (auto &&MemSet : BitSetMembers)
+      GLB.addFragment(MemSet);
+
+    // Build a vector of globals with the computed layout.
+    std::vector<GlobalVariable *> OrderedGlobals(Globals.size());
+    auto OGI = OrderedGlobals.begin();
+    for (auto &&F : GLB.Fragments)
+      for (auto &&Offset : F)
+        *OGI++ = Globals[Offset];
+
+    // Order bitsets by name for determinism.
+    std::sort(BitSets.begin(), BitSets.end(), [](MDString *S1, MDString *S2) {
+      return S1->getString() < S2->getString();
+    });
+
+    // Build the bitsets from this disjoint set.
+    buildBitSetsFromGlobals(M, BitSets, OrderedGlobals);
+  }
+
+  return true;
+}
+
+bool LowerBitSets::eraseBitSetMetadata(Module &M) {
+  if (!BitSetNM)
+    return false;
+
+  M.eraseNamedMetadata(BitSetNM);
+  return true;
+}
+
+bool LowerBitSets::runOnModule(Module &M) {
+  bool Changed = buildBitSets(M);
+  Changed |= eraseBitSetMetadata(M);
+  return Changed;
+}
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 76d6dfa..4a7cb7b 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -58,13 +58,13 @@ Function* PartialInliner::unswitchFunction(Function* F) {
   BasicBlock* returnBlock = nullptr;
   BasicBlock* nonReturnBlock = nullptr;
   unsigned returnCount = 0;
-  for (succ_iterator SI = succ_begin(entryBlock), SE = succ_end(entryBlock);
-       SI != SE; ++SI)
-    if (isa<ReturnInst>((*SI)->getTerminator())) {
-      returnBlock = *SI;
+  for (BasicBlock *BB : successors(entryBlock)) {
+    if (isa<ReturnInst>(BB->getTerminator())) {
+      returnBlock = BB;
       returnCount++;
     } else
-      nonReturnBlock = *SI;
+      nonReturnBlock = BB;
+  }
   
   if (returnCount != 1)
     return nullptr;
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index da85a91..9a75050 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -19,12 +19,11 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ManagedStatic.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Vectorize.h"
@@ -118,7 +117,7 @@ void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
 }
 
 void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
-                                           PassManagerBase &PM) const {
+                                           legacy::PassManagerBase &PM) const {
   for (unsigned i = 0, e = GlobalExtensions->size(); i != e; ++i)
     if ((*GlobalExtensions)[i].first == ETy)
       (*GlobalExtensions)[i].second(*this, PM);
@@ -127,8 +126,8 @@ void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
       Extensions[i].second(*this, PM);
 }
 
-void
-PassManagerBuilder::addInitialAliasAnalysisPasses(PassManagerBase &PM) const {
+void PassManagerBuilder::addInitialAliasAnalysisPasses(
+    legacy::PassManagerBase &PM) const {
   // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
   // BasicAliasAnalysis wins if they disagree. This is intended to help
   // support "obvious" type-punning idioms.
@@ -139,11 +138,13 @@ PassManagerBuilder::addInitialAliasAnalysisPasses(PassManagerBase &PM) const {
   PM.add(createBasicAliasAnalysisPass());
 }
 
-void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) {
+void PassManagerBuilder::populateFunctionPassManager(
+    legacy::FunctionPassManager &FPM) {
   addExtensionsToPM(EP_EarlyAsPossible, FPM);
 
   // Add LibraryInfo if we have some.
-  if (LibraryInfo) FPM.add(new TargetLibraryInfo(*LibraryInfo));
+  if (LibraryInfo)
+    FPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
 
   if (OptLevel == 0) return;
 
@@ -158,7 +159,8 @@ void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) {
   FPM.add(createLowerExpectIntrinsicPass());
 }
 
-void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
+void PassManagerBuilder::populateModulePassManager(
+    legacy::PassManagerBase &MPM) {
   // If all optimizations are disabled, just run the always-inline pass and,
   // if enabled, the function merging pass.
   if (OptLevel == 0) {
@@ -182,7 +184,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   }
 
   // Add LibraryInfo if we have some.
-  if (LibraryInfo) MPM.add(new TargetLibraryInfo(*LibraryInfo));
+  if (LibraryInfo)
+    MPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
 
   addInitialAliasAnalysisPasses(MPM);
 
@@ -228,7 +231,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
     MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
   MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
   MPM.add(createReassociatePass());           // Reassociate expressions
-  MPM.add(createLoopRotatePass());            // Rotate Loop
+  // Rotate Loop - disable header duplication at -Oz
+  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
   MPM.add(createLICMPass());                  // Hoist loop invariants
   MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
   MPM.add(createInstructionCombiningPass());
@@ -248,6 +252,11 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   MPM.add(createMemCpyOptPass());             // Remove memcpy / form memset
   MPM.add(createSCCPPass());                  // Constant prop with SCCP
 
+  // Delete dead bit computations (instcombine runs after to fold away the dead
+  // computations, and then ADCE will run later to exploit any new DCE
+  // opportunities that creates).
+  MPM.add(createBitTrackingDCEPass());        // Delete dead bit computations
+
   // Run instcombine after redundancy elimination to exploit opportunities
   // opened up by them.
   MPM.add(createInstructionCombiningPass());
@@ -255,6 +264,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   MPM.add(createJumpThreadingPass());         // Thread jumps
   MPM.add(createCorrelatedValuePropagationPass());
   MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
+  MPM.add(createLICMPass());
 
   addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
 
@@ -373,7 +383,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   addExtensionsToPM(EP_OptimizerLast, MPM);
 }
 
-void PassManagerBuilder::addLTOOptimizationPasses(PassManagerBase &PM) {
+void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
   // Provide AliasAnalysis services for optimizations.
   addInitialAliasAnalysisPasses(PM);
 
@@ -464,6 +474,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(PassManagerBase &PM) {
 
   PM.add(createJumpThreadingPass());
 
+  // Lower bitset metadata to bitsets.
+  PM.add(createLowerBitSetsPass());
+
   // Delete basic blocks, which optimization passes may have killed.
   PM.add(createCFGSimplificationPass());
 
@@ -476,15 +489,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(PassManagerBase &PM) {
     PM.add(createMergeFunctionsPass());
 }
 
-void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
-                                                TargetMachine *TM) {
-  if (TM) {
-    PM.add(new DataLayoutPass());
-    TM->addAnalysisPasses(PM);
-  }
-
+void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
   if (LibraryInfo)
-    PM.add(new TargetLibraryInfo(*LibraryInfo));
+    PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
 
   if (VerifyInput)
     PM.add(createVerifierPass());
@@ -567,7 +574,7 @@ void
 LLVMPassManagerBuilderPopulateFunctionPassManager(LLVMPassManagerBuilderRef PMB,
                                                   LLVMPassManagerRef PM) {
   PassManagerBuilder *Builder = unwrap(PMB);
-  FunctionPassManager *FPM = unwrap<FunctionPassManager>(PM);
+  legacy::FunctionPassManager *FPM = unwrap<legacy::FunctionPassManager>(PM);
   Builder->populateFunctionPassManager(*FPM);
 }
 
@@ -575,7 +582,7 @@ void
 LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB,
                                                 LLVMPassManagerRef PM) {
   PassManagerBuilder *Builder = unwrap(PMB);
-  PassManagerBase *MPM = unwrap(PM);
+  legacy::PassManagerBase *MPM = unwrap(PM);
   Builder->populateModulePassManager(*MPM);
 }
 
@@ -584,7 +591,7 @@ void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB,
                                                   LLVMBool Internalize,
                                                   LLVMBool RunInliner) {
   PassManagerBuilder *Builder = unwrap(PMB);
-  PassManagerBase *LPM = unwrap(PM);
+  legacy::PassManagerBase *LPM = unwrap(PM);
 
   // A small backwards compatibility hack. populateLTOPassManager used to take
   // an RunInliner option.
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index b2c4a09..1943b93 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -18,8 +18,10 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LibCallSemantics.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -175,7 +177,7 @@ bool PruneEH::SimplifyFunction(Function *F) {
   bool MadeChange = false;
   for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
     if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
-      if (II->doesNotThrow()) {
+      if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(II)) {
         SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
         // Insert a call instruction before the invoke.
         CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II);
@@ -200,7 +202,7 @@ bool PruneEH::SimplifyFunction(Function *F) {
         BB->getInstList().pop_back();
 
         // If the unwind block is now dead, nuke it.
-        if (pred_begin(UnwindBlock) == pred_end(UnwindBlock))
+        if (pred_empty(UnwindBlock))
           DeleteBasicBlock(UnwindBlock);  // Delete the new BB.
 
         ++NumRemoved;
@@ -234,7 +236,7 @@ bool PruneEH::SimplifyFunction(Function *F) {
 /// updating the callgraph to reflect any now-obsolete edges due to calls that
 /// exist in the BB.
 void PruneEH::DeleteBasicBlock(BasicBlock *BB) {
-  assert(pred_begin(BB) == pred_end(BB) && "BB is not dead!");
+  assert(pred_empty(BB) && "BB is not dead!");
   CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
 
   CallGraphNode *CGN = CG[BB->getParent()];
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 3412b9e..816978e 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -301,8 +301,8 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
   // For each compile unit, find the live set of global variables/functions and
   // replace the current list of potentially dead global variables/functions
   // with the live list.
-  SmallVector<Value *, 64> LiveGlobalVariables;
-  SmallVector<Value *, 64> LiveSubprograms;
+  SmallVector<Metadata *, 64> LiveGlobalVariables;
+  SmallVector<Metadata *, 64> LiveSubprograms;
   DenseSet<const MDNode *> VisitedSet;
 
   for (DICompileUnit DIC : F.compile_units()) {
diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt
index a25696e..0ed8e62 100644
--- a/lib/Transforms/InstCombine/CMakeLists.txt
+++ b/lib/Transforms/InstCombine/CMakeLists.txt
@@ -12,6 +12,10 @@ add_llvm_library(LLVMInstCombine
   InstCombineShifts.cpp
   InstCombineSimplifyDemanded.cpp
   InstCombineVectorOps.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/InstCombine
   )
 
 add_dependencies(LLVMInstCombine intrinsics_gen)
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
deleted file mode 100644
index d4b252b..0000000
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ /dev/null
@@ -1,432 +0,0 @@
-//===- InstCombine.h - Main InstCombine pass definition ---------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINE_H
-#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINE_H
-
-#include "InstCombineWorklist.h"
-#include "llvm/Analysis/AssumptionTracker.h"
-#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
-
-#define DEBUG_TYPE "instcombine"
-
-namespace llvm {
-class CallSite;
-class DataLayout;
-class DominatorTree;
-class TargetLibraryInfo;
-class DbgDeclareInst;
-class MemIntrinsic;
-class MemSetInst;
-
-/// SelectPatternFlavor - We can match a variety of different patterns for
-/// select operations.
-enum SelectPatternFlavor {
-  SPF_UNKNOWN = 0,
-  SPF_SMIN,
-  SPF_UMIN,
-  SPF_SMAX,
-  SPF_UMAX,
-  SPF_ABS,
-  SPF_NABS
-};
-
-/// getComplexity:  Assign a complexity or rank value to LLVM Values...
-///   0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst
-static inline unsigned getComplexity(Value *V) {
-  if (isa<Instruction>(V)) {
-    if (BinaryOperator::isNeg(V) || BinaryOperator::isFNeg(V) ||
-        BinaryOperator::isNot(V))
-      return 3;
-    return 4;
-  }
-  if (isa<Argument>(V))
-    return 3;
-  return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
-}
-
-/// AddOne - Add one to a Constant
-static inline Constant *AddOne(Constant *C) {
-  return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
-}
-/// SubOne - Subtract one from a Constant
-static inline Constant *SubOne(Constant *C) {
-  return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1));
-}
-
-/// InstCombineIRInserter - This is an IRBuilder insertion helper that works
-/// just like the normal insertion helper, but also adds any new instructions
-/// to the instcombine worklist.
-class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter
-    : public IRBuilderDefaultInserter<true> {
-  InstCombineWorklist &Worklist;
-  AssumptionTracker *AT;
-
-public:
-  InstCombineIRInserter(InstCombineWorklist &WL, AssumptionTracker *AT)
-    : Worklist(WL), AT(AT) {}
-
-  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
-                    BasicBlock::iterator InsertPt) const {
-    IRBuilderDefaultInserter<true>::InsertHelper(I, Name, BB, InsertPt);
-    Worklist.Add(I);
-
-    using namespace llvm::PatternMatch;
-    if (match(I, m_Intrinsic<Intrinsic::assume>()))
-      AT->registerAssumption(cast<CallInst>(I));
-  }
-};
-
-/// InstCombiner - The -instcombine pass.
-class LLVM_LIBRARY_VISIBILITY InstCombiner
-    : public FunctionPass,
-      public InstVisitor<InstCombiner, Instruction *> {
-  AssumptionTracker *AT;
-  const DataLayout *DL;
-  TargetLibraryInfo *TLI;
-  DominatorTree *DT; // not required
-  bool MadeIRChange;
-  LibCallSimplifier *Simplifier;
-  bool MinimizeSize;
-
-public:
-  /// Worklist - All of the instructions that need to be simplified.
-  InstCombineWorklist Worklist;
-
-  /// Builder - This is an IRBuilder that automatically inserts new
-  /// instructions into the worklist when they are created.
-  typedef IRBuilder<true, TargetFolder, InstCombineIRInserter> BuilderTy;
-  BuilderTy *Builder;
-
-  static char ID; // Pass identification, replacement for typeid
-  InstCombiner() : FunctionPass(ID), DL(nullptr), Builder(nullptr) {
-    MinimizeSize = false;
-    initializeInstCombinerPass(*PassRegistry::getPassRegistry());
-  }
-
-public:
-  bool runOnFunction(Function &F) override;
-
-  bool DoOneIteration(Function &F, unsigned ItNum);
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-  AssumptionTracker *getAssumptionTracker() const { return AT; }
-
-  const DataLayout *getDataLayout() const { return DL; }
-  
-  DominatorTree *getDominatorTree() const { return DT; }
-
-  TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; }
-
-  // Visitation implementation - Implement instruction combining for different
-  // instruction types.  The semantics are as follows:
-  // Return Value:
-  //    null        - No change was made
-  //     I          - Change was made, I is still valid, I may be dead though
-  //   otherwise    - Change was made, replace I with returned instruction
-  //
-  Instruction *visitAdd(BinaryOperator &I);
-  Instruction *visitFAdd(BinaryOperator &I);
-  Value *OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty);
-  Instruction *visitSub(BinaryOperator &I);
-  Instruction *visitFSub(BinaryOperator &I);
-  Instruction *visitMul(BinaryOperator &I);
-  Value *foldFMulConst(Instruction *FMulOrDiv, Constant *C,
-                       Instruction *InsertBefore);
-  Instruction *visitFMul(BinaryOperator &I);
-  Instruction *visitURem(BinaryOperator &I);
-  Instruction *visitSRem(BinaryOperator &I);
-  Instruction *visitFRem(BinaryOperator &I);
-  bool SimplifyDivRemOfSelect(BinaryOperator &I);
-  Instruction *commonRemTransforms(BinaryOperator &I);
-  Instruction *commonIRemTransforms(BinaryOperator &I);
-  Instruction *commonDivTransforms(BinaryOperator &I);
-  Instruction *commonIDivTransforms(BinaryOperator &I);
-  Instruction *visitUDiv(BinaryOperator &I);
-  Instruction *visitSDiv(BinaryOperator &I);
-  Instruction *visitFDiv(BinaryOperator &I);
-  Value *FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS);
-  Value *FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
-  Instruction *visitAnd(BinaryOperator &I);
-  Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI);
-  Value *FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
-  Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A,
-                                   Value *B, Value *C);
-  Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, Value *A,
-                                    Value *B, Value *C);
-  Instruction *visitOr(BinaryOperator &I);
-  Instruction *visitXor(BinaryOperator &I);
-  Instruction *visitShl(BinaryOperator &I);
-  Instruction *visitAShr(BinaryOperator &I);
-  Instruction *visitLShr(BinaryOperator &I);
-  Instruction *commonShiftTransforms(BinaryOperator &I);
-  Instruction *FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *LHSI,
-                                    Constant *RHSC);
-  Instruction *FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
-                                            GlobalVariable *GV, CmpInst &ICI,
-                                            ConstantInt *AndCst = nullptr);
-  Instruction *visitFCmpInst(FCmpInst &I);
-  Instruction *visitICmpInst(ICmpInst &I);
-  Instruction *visitICmpInstWithCastAndCast(ICmpInst &ICI);
-  Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI, Instruction *LHS,
-                                              ConstantInt *RHS);
-  Instruction *FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
-                              ConstantInt *DivRHS);
-  Instruction *FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *DivI,
-                              ConstantInt *DivRHS);
-  Instruction *FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A,
-                                 ConstantInt *CI1, ConstantInt *CI2);
-  Instruction *FoldICmpCstShlCst(ICmpInst &I, Value *Op, Value *A,
-                                 ConstantInt *CI1, ConstantInt *CI2);
-  Instruction *FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI,
-                                ICmpInst::Predicate Pred);
-  Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
-                           ICmpInst::Predicate Cond, Instruction &I);
-  Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1,
-                                   BinaryOperator &I);
-  Instruction *commonCastTransforms(CastInst &CI);
-  Instruction *commonPointerCastTransforms(CastInst &CI);
-  Instruction *visitTrunc(TruncInst &CI);
-  Instruction *visitZExt(ZExtInst &CI);
-  Instruction *visitSExt(SExtInst &CI);
-  Instruction *visitFPTrunc(FPTruncInst &CI);
-  Instruction *visitFPExt(CastInst &CI);
-  Instruction *visitFPToUI(FPToUIInst &FI);
-  Instruction *visitFPToSI(FPToSIInst &FI);
-  Instruction *visitUIToFP(CastInst &CI);
-  Instruction *visitSIToFP(CastInst &CI);
-  Instruction *visitPtrToInt(PtrToIntInst &CI);
-  Instruction *visitIntToPtr(IntToPtrInst &CI);
-  Instruction *visitBitCast(BitCastInst &CI);
-  Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
-  Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI);
-  Instruction *FoldSelectIntoOp(SelectInst &SI, Value *, Value *);
-  Instruction *FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
-                            Value *A, Value *B, Instruction &Outer,
-                            SelectPatternFlavor SPF2, Value *C);
-  Instruction *visitSelectInst(SelectInst &SI);
-  Instruction *visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
-  Instruction *visitCallInst(CallInst &CI);
-  Instruction *visitInvokeInst(InvokeInst &II);
-
-  Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
-  Instruction *visitPHINode(PHINode &PN);
-  Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
-  Instruction *visitAllocaInst(AllocaInst &AI);
-  Instruction *visitAllocSite(Instruction &FI);
-  Instruction *visitFree(CallInst &FI);
-  Instruction *visitLoadInst(LoadInst &LI);
-  Instruction *visitStoreInst(StoreInst &SI);
-  Instruction *visitBranchInst(BranchInst &BI);
-  Instruction *visitSwitchInst(SwitchInst &SI);
-  Instruction *visitReturnInst(ReturnInst &RI);
-  Instruction *visitInsertValueInst(InsertValueInst &IV);
-  Instruction *visitInsertElementInst(InsertElementInst &IE);
-  Instruction *visitExtractElementInst(ExtractElementInst &EI);
-  Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
-  Instruction *visitExtractValueInst(ExtractValueInst &EV);
-  Instruction *visitLandingPadInst(LandingPadInst &LI);
-
-  // visitInstruction - Specify what to return for unhandled instructions...
-  Instruction *visitInstruction(Instruction &I) { return nullptr; }
-
-private:
-  bool ShouldChangeType(Type *From, Type *To) const;
-  Value *dyn_castNegVal(Value *V) const;
-  Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const;
-  Type *FindElementAtOffset(Type *PtrTy, int64_t Offset,
-                            SmallVectorImpl<Value *> &NewIndices);
-  Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
-
-  /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
-  /// results in any code being generated and is interesting to optimize out. If
-  /// the cast can be eliminated by some other simple transformation, we prefer
-  /// to do the simplification first.
-  bool ShouldOptimizeCast(Instruction::CastOps opcode, const Value *V,
-                          Type *Ty);
-
-  Instruction *visitCallSite(CallSite CS);
-  Instruction *tryOptimizeCall(CallInst *CI, const DataLayout *DL);
-  bool transformConstExprCastCall(CallSite CS);
-  Instruction *transformCallThroughTrampoline(CallSite CS,
-                                              IntrinsicInst *Tramp);
-  Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
-                                 bool DoXform = true);
-  Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
-  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction *CxtI);
-  bool WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction *CxtI);
-  bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction *CxtI);
-  bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction *CxtI);
-  Value *EmitGEPOffset(User *GEP);
-  Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
-  Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
-
-public:
-  // InsertNewInstBefore - insert an instruction New before instruction Old
-  // in the program.  Add the new instruction to the worklist.
-  //
-  Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
-    assert(New && !New->getParent() &&
-           "New instruction already inserted into a basic block!");
-    BasicBlock *BB = Old.getParent();
-    BB->getInstList().insert(&Old, New); // Insert inst
-    Worklist.Add(New);
-    return New;
-  }
-
-  // InsertNewInstWith - same as InsertNewInstBefore, but also sets the
-  // debug loc.
-  //
-  Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
-    New->setDebugLoc(Old.getDebugLoc());
-    return InsertNewInstBefore(New, Old);
-  }
-
-  // ReplaceInstUsesWith - This method is to be used when an instruction is
-  // found to be dead, replacable with another preexisting expression.  Here
-  // we add all uses of I to the worklist, replace all uses of I with the new
-  // value, then return I, so that the inst combiner will know that I was
-  // modified.
-  //
-  Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
-    Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist.
-
-    // If we are replacing the instruction with itself, this must be in a
-    // segment of unreachable code, so just clobber the instruction.
-    if (&I == V)
-      V = UndefValue::get(I.getType());
-
-    DEBUG(dbgs() << "IC: Replacing " << I << "\n"
-                    "    with " << *V << '\n');
-
-    I.replaceAllUsesWith(V);
-    return &I;
-  }
-
-  // EraseInstFromFunction - When dealing with an instruction that has side
-  // effects or produces a void value, we can't rely on DCE to delete the
-  // instruction.  Instead, visit methods should return the value returned by
-  // this function.
-  Instruction *EraseInstFromFunction(Instruction &I) {
-    DEBUG(dbgs() << "IC: ERASE " << I << '\n');
-
-    assert(I.use_empty() && "Cannot erase instruction that is used!");
-    // Make sure that we reprocess all operands now that we reduced their
-    // use counts.
-    if (I.getNumOperands() < 8) {
-      for (User::op_iterator i = I.op_begin(), e = I.op_end(); i != e; ++i)
-        if (Instruction *Op = dyn_cast<Instruction>(*i))
-          Worklist.Add(Op);
-    }
-    Worklist.Remove(&I);
-    I.eraseFromParent();
-    MadeIRChange = true;
-    return nullptr; // Don't do anything with FI
-  }
-
-  void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
-                        unsigned Depth = 0, Instruction *CxtI = nullptr) const {
-    return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth,
-                                  AT, CxtI, DT);
-  }
-
-  bool MaskedValueIsZero(Value *V, const APInt &Mask,
-                         unsigned Depth = 0,
-                         Instruction *CxtI = nullptr) const {
-    return llvm::MaskedValueIsZero(V, Mask, DL, Depth, AT, CxtI, DT);
-  }
-  unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0,
-                              Instruction *CxtI = nullptr) const {
-    return llvm::ComputeNumSignBits(Op, DL, Depth, AT, CxtI, DT);
-  }
-
-private:
-  /// SimplifyAssociativeOrCommutative - This performs a few simplifications for
-  /// operators which are associative or commutative.
-  bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
-
-  /// SimplifyUsingDistributiveLaws - This tries to simplify binary operations
-  /// which some other binary operation distributes over either by factorizing
-  /// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this
-  /// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is
-  /// a win).  Returns the simplified value, or null if it didn't simplify.
-  Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
-
-  /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value
-  /// based on the demanded bits.
-  Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero,
-                                 APInt &KnownOne, unsigned Depth,
-                                 Instruction *CxtI = nullptr);
-  bool SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt &KnownZero,
-                            APInt &KnownOne, unsigned Depth = 0);
-  /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
-  /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
-  Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl,
-                                    APInt DemandedMask, APInt &KnownZero,
-                                    APInt &KnownOne);
-
-  /// SimplifyDemandedInstructionBits - Inst is an integer instruction that
-  /// SimplifyDemandedBits knows about.  See if the instruction has any
-  /// properties that allow us to simplify its operands.
-  bool SimplifyDemandedInstructionBits(Instruction &Inst);
-
-  Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
-                                    APInt &UndefElts, unsigned Depth = 0);
-
-  Value *SimplifyVectorOp(BinaryOperator &Inst);
-
-  // FoldOpIntoPhi - Given a binary operator, cast instruction, or select
-  // which has a PHI node as operand #0, see if we can fold the instruction
-  // into the PHI (which is only possible if all operands to the PHI are
-  // constants).
-  //
-  Instruction *FoldOpIntoPhi(Instruction &I);
-
-  // FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
-  // operator and they all are only used by the PHI, PHI together their
-  // inputs, and do the operation once, to the result of the PHI.
-  Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
-  Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);
-  Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);
-  Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN);
-
-  Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
-                        ConstantInt *AndRHS, BinaryOperator &TheAnd);
-
-  Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
-                            bool isSub, Instruction &I);
-  Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned,
-                         bool Inside);
-  Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
-  Instruction *MatchBSwap(BinaryOperator &I);
-  bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
-  Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
-  Instruction *SimplifyMemSet(MemSetInst *MI);
-
-  Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
-
-  /// Descale - Return a value X such that Val = X * Scale, or null if none.  If
-  /// the multiplication is known not to overflow then NoSignedWrap is set.
-  Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
-};
-
-} // end namespace llvm.
-
-#undef DEBUG_TYPE
-
-#endif
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 902b640..752f79d 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/DataLayout.h"
@@ -751,8 +751,7 @@ Value *FAddCombine::createNaryFAdd
   return LastVal;
 }
 
-Value *FAddCombine::createFSub
-  (Value *Opnd0, Value *Opnd1) {
+Value *FAddCombine::createFSub(Value *Opnd0, Value *Opnd1) {
   Value *V = Builder->CreateFSub(Opnd0, Opnd1);
   if (Instruction *I = dyn_cast<Instruction>(V))
     createInstPostProc(I);
@@ -760,15 +759,14 @@ Value *FAddCombine::createFSub
 }
 
 Value *FAddCombine::createFNeg(Value *V) {
-  Value *Zero = cast<Value>(ConstantFP::get(V->getType(), 0.0));
+  Value *Zero = cast<Value>(ConstantFP::getZeroValueForNegation(V->getType()));
   Value *NewV = createFSub(Zero, V);
   if (Instruction *I = dyn_cast<Instruction>(NewV))
     createInstPostProc(I, true); // fneg's don't receive instruction numbers.
   return NewV;
 }
 
-Value *FAddCombine::createFAdd
-  (Value *Opnd0, Value *Opnd1) {
+Value *FAddCombine::createFAdd(Value *Opnd0, Value *Opnd1) {
   Value *V = Builder->CreateFAdd(Opnd0, Opnd1);
   if (Instruction *I = dyn_cast<Instruction>(V))
     createInstPostProc(I);
@@ -789,8 +787,7 @@ Value *FAddCombine::createFDiv(Value *Opnd0, Value *Opnd1) {
   return V;
 }
 
-void FAddCombine::createInstPostProc(Instruction *NewInstr,
-                                     bool NoNumber) {
+void FAddCombine::createInstPostProc(Instruction *NewInstr, bool NoNumber) {
   NewInstr->setDebugLoc(Instr->getDebugLoc());
 
   // Keep track of the number of instruction created.
@@ -840,8 +837,7 @@ unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
 // <C, V>             "fmul V, C"      false
 //
 // NOTE: Keep this function in sync with FAddCombine::calcInstrNumber.
-Value *FAddCombine::createAddendVal
-  (const FAddend &Opnd, bool &NeedNeg) {
+Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
   const FAddendCoef &Coeff = Opnd.getCoef();
 
   if (Opnd.isConstant()) {
@@ -894,7 +890,6 @@ static bool checkRippleForAdd(const APInt &Op0KnownZero,
 ///    (sext (add LHS, RHS))  === (add (sext LHS), (sext RHS))
 /// This basically requires proving that the add in the original type would not
 /// overflow to change the sign bit or have a carry out.
-/// TODO: Handle this for Vectors.
 bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
                                             Instruction *CxtI) {
   // There are different heuristics we can use for this.  Here are some simple
@@ -918,42 +913,25 @@ bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
       ComputeNumSignBits(RHS, 0, CxtI) > 1)
     return true;
 
-  if (IntegerType *IT = dyn_cast<IntegerType>(LHS->getType())) {
-    int BitWidth = IT->getBitWidth();
-    APInt LHSKnownZero(BitWidth, 0);
-    APInt LHSKnownOne(BitWidth, 0);
-    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI);
-
-    APInt RHSKnownZero(BitWidth, 0);
-    APInt RHSKnownOne(BitWidth, 0);
-    computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI);
-
-    // Addition of two 2's compliment numbers having opposite signs will never
-    // overflow.
-    if ((LHSKnownOne[BitWidth - 1] && RHSKnownZero[BitWidth - 1]) ||
-        (LHSKnownZero[BitWidth - 1] && RHSKnownOne[BitWidth - 1]))
-      return true;
-
-    // Check if carry bit of addition will not cause overflow.
-    if (checkRippleForAdd(LHSKnownZero, RHSKnownZero))
-      return true;
-    if (checkRippleForAdd(RHSKnownZero, LHSKnownZero))
-      return true;
-  }
-  return false;
-}
+  unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
+  APInt LHSKnownZero(BitWidth, 0);
+  APInt LHSKnownOne(BitWidth, 0);
+  computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI);
 
-/// WillNotOverflowUnsignedAdd - Return true if we can prove that:
-///    (zext (add LHS, RHS))  === (add (zext LHS), (zext RHS))
-bool InstCombiner::WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS,
-                                              Instruction *CxtI) {
-  // There are different heuristics we can use for this. Here is a simple one.
-  // If the sign bit of LHS and that of RHS are both zero, no unsigned wrap.
-  bool LHSKnownNonNegative, LHSKnownNegative;
-  bool RHSKnownNonNegative, RHSKnownNegative;
-  ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, 0, AT, CxtI, DT);
-  ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, 0, AT, CxtI, DT);
-  if (LHSKnownNonNegative && RHSKnownNonNegative)
+  APInt RHSKnownZero(BitWidth, 0);
+  APInt RHSKnownOne(BitWidth, 0);
+  computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI);
+
+  // Addition of two 2's compliment numbers having opposite signs will never
+  // overflow.
+  if ((LHSKnownOne[BitWidth - 1] && RHSKnownZero[BitWidth - 1]) ||
+      (LHSKnownZero[BitWidth - 1] && RHSKnownOne[BitWidth - 1]))
+    return true;
+
+  // Check if carry bit of addition will not cause overflow.
+  if (checkRippleForAdd(LHSKnownZero, RHSKnownZero))
+    return true;
+  if (checkRippleForAdd(RHSKnownZero, LHSKnownZero))
     return true;
 
   return false;
@@ -972,24 +950,22 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
       ComputeNumSignBits(RHS, 0, CxtI) > 1)
     return true;
 
-  if (IntegerType *IT = dyn_cast<IntegerType>(LHS->getType())) {
-    unsigned BitWidth = IT->getBitWidth();
-    APInt LHSKnownZero(BitWidth, 0);
-    APInt LHSKnownOne(BitWidth, 0);
-    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI);
+  unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
+  APInt LHSKnownZero(BitWidth, 0);
+  APInt LHSKnownOne(BitWidth, 0);
+  computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI);
 
-    APInt RHSKnownZero(BitWidth, 0);
-    APInt RHSKnownOne(BitWidth, 0);
-    computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI);
+  APInt RHSKnownZero(BitWidth, 0);
+  APInt RHSKnownOne(BitWidth, 0);
+  computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI);
 
-    // Subtraction of two 2's compliment numbers having identical signs will
-    // never overflow.
-    if ((LHSKnownOne[BitWidth - 1] && RHSKnownOne[BitWidth - 1]) ||
-        (LHSKnownZero[BitWidth - 1] && RHSKnownZero[BitWidth - 1]))
-      return true;
+  // Subtraction of two 2's compliment numbers having identical signs will
+  // never overflow.
+  if ((LHSKnownOne[BitWidth - 1] && RHSKnownOne[BitWidth - 1]) ||
+      (LHSKnownZero[BitWidth - 1] && RHSKnownZero[BitWidth - 1]))
+    return true;
 
-    // TODO: implement logic similar to checkRippleForAdd
-  }
+  // TODO: implement logic similar to checkRippleForAdd
   return false;
 }
 
@@ -1000,8 +976,8 @@ bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,
   // If the LHS is negative and the RHS is non-negative, no unsigned wrap.
   bool LHSKnownNonNegative, LHSKnownNegative;
   bool RHSKnownNonNegative, RHSKnownNegative;
-  ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, 0, AT, CxtI, DT);
-  ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, 0, AT, CxtI, DT);
+  ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0, CxtI);
+  ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0, CxtI);
   if (LHSKnownNegative && RHSKnownNonNegative)
     return true;
 
@@ -1077,7 +1053,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
      return ReplaceInstUsesWith(I, V);
 
    if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(),
-                                  I.hasNoUnsignedWrap(), DL, TLI, DT, AT))
+                                  I.hasNoUnsignedWrap(), DL, TLI, DT, AC))
      return ReplaceInstUsesWith(I, V);
 
    // (A*B)+(A*C) -> A*(B+C) etc
@@ -1335,7 +1311,9 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
     Changed = true;
     I.setHasNoSignedWrap(true);
   }
-  if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedAdd(LHS, RHS, &I)) {
+  if (!I.hasNoUnsignedWrap() &&
+      computeOverflowForUnsignedAdd(LHS, RHS, &I) ==
+          OverflowResult::NeverOverflows) {
     Changed = true;
     I.setHasNoUnsignedWrap(true);
   }
@@ -1350,8 +1328,8 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL,
-                                  TLI, DT, AT))
+  if (Value *V =
+          SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   if (isa<Constant>(RHS)) {
@@ -1529,7 +1507,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
     return ReplaceInstUsesWith(I, V);
 
   if (Value *V = SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(),
-                                 I.hasNoUnsignedWrap(), DL, TLI, DT, AT))
+                                 I.hasNoUnsignedWrap(), DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   // (A*B)-(A*C) -> A*(B-C) etc
@@ -1717,10 +1695,18 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL,
-                                  TLI, DT, AT))
+  if (Value *V =
+          SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
+  // fsub nsz 0, X ==> fsub nsz -0.0, X
+  if (I.getFastMathFlags().noSignedZeros() && match(Op0, m_Zero())) {
+    // Subtraction from -0.0 is the canonical form of fneg.
+    Instruction *NewI = BinaryOperator::CreateFNeg(Op1);
+    NewI->copyFastMathFlags(&I);
+    return NewI;
+  }
+
   if (isa<Constant>(Op0))
     if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
       if (Instruction *NV = FoldOpIntoSelect(I, SI))
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 55ebced..863eeaf 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Intrinsics.h"
@@ -22,30 +22,12 @@ using namespace PatternMatch;
 
 #define DEBUG_TYPE "instcombine"
 
-/// isFreeToInvert - Return true if the specified value is free to invert (apply
-/// ~ to).  This happens in cases where the ~ can be eliminated.
-static inline bool isFreeToInvert(Value *V) {
-  // ~(~(X)) -> X.
-  if (BinaryOperator::isNot(V))
-    return true;
-
-  // Constants can be considered to be not'ed values.
-  if (isa<ConstantInt>(V))
-    return true;
-
-  // Compares can be inverted if they have a single use.
-  if (CmpInst *CI = dyn_cast<CmpInst>(V))
-    return CI->hasOneUse();
-
-  return false;
-}
-
 static inline Value *dyn_castNotVal(Value *V) {
   // If this is not(not(x)) don't return that this is a not: we want the two
   // not's to be folded first.
   if (BinaryOperator::isNot(V)) {
     Value *Operand = BinaryOperator::getNotArgument(V);
-    if (!isFreeToInvert(Operand))
+    if (!IsFreeToInvert(Operand, Operand->hasOneUse()))
       return Operand;
   }
 
@@ -117,6 +99,61 @@ static Value *getFCmpValue(bool isordered, unsigned code,
   return Builder->CreateFCmp(Pred, LHS, RHS);
 }
 
+/// \brief Transform BITWISE_OP(BSWAP(A),BSWAP(B)) to BSWAP(BITWISE_OP(A, B))
+/// \param I Binary operator to transform.
+/// \return Pointer to node that must replace the original binary operator, or
+///         null pointer if no transformation was made.
+Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) {
+  IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
+
+  // Can't do vectors.
+  if (I.getType()->isVectorTy()) return nullptr;
+
+  // Can only do bitwise ops.
+  unsigned Op = I.getOpcode();
+  if (Op != Instruction::And && Op != Instruction::Or &&
+      Op != Instruction::Xor)
+    return nullptr;
+
+  Value *OldLHS = I.getOperand(0);
+  Value *OldRHS = I.getOperand(1);
+  ConstantInt *ConstLHS = dyn_cast<ConstantInt>(OldLHS);
+  ConstantInt *ConstRHS = dyn_cast<ConstantInt>(OldRHS);
+  IntrinsicInst *IntrLHS = dyn_cast<IntrinsicInst>(OldLHS);
+  IntrinsicInst *IntrRHS = dyn_cast<IntrinsicInst>(OldRHS);
+  bool IsBswapLHS = (IntrLHS && IntrLHS->getIntrinsicID() == Intrinsic::bswap);
+  bool IsBswapRHS = (IntrRHS && IntrRHS->getIntrinsicID() == Intrinsic::bswap);
+
+  if (!IsBswapLHS && !IsBswapRHS)
+    return nullptr;
+
+  if (!IsBswapLHS && !ConstLHS)
+    return nullptr;
+
+  if (!IsBswapRHS && !ConstRHS)
+    return nullptr;
+
+  /// OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) )
+  /// OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) )
+  Value *NewLHS = IsBswapLHS ? IntrLHS->getOperand(0) :
+                  Builder->getInt(ConstLHS->getValue().byteSwap());
+
+  Value *NewRHS = IsBswapRHS ? IntrRHS->getOperand(0) :
+                  Builder->getInt(ConstRHS->getValue().byteSwap());
+
+  Value *BinOp = nullptr;
+  if (Op == Instruction::And)
+    BinOp = Builder->CreateAnd(NewLHS, NewRHS);
+  else if (Op == Instruction::Or)
+    BinOp = Builder->CreateOr(NewLHS, NewRHS);
+  else //if (Op == Instruction::Xor)
+    BinOp = Builder->CreateXor(NewLHS, NewRHS);
+
+  Module *M = I.getParent()->getParent()->getParent();
+  Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy);
+  return Builder->CreateCall(F, BinOp);
+}
+
 // OptAndOp - This handles expressions of the form ((val OP C1) & C2).  Where
 // the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.  Op is
 // guaranteed to be a binary operator.
@@ -785,6 +822,62 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
   return nullptr;
 }
 
+/// Try to fold a signed range checked with lower bound 0 to an unsigned icmp.
+/// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
+/// If \p Inverted is true then the check is for the inverted range, e.g.
+/// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
+Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
+                                        bool Inverted) {
+  // Check the lower range comparison, e.g. x >= 0
+  // InstCombine already ensured that if there is a constant it's on the RHS.
+  ConstantInt *RangeStart = dyn_cast<ConstantInt>(Cmp0->getOperand(1));
+  if (!RangeStart)
+    return nullptr;
+
+  ICmpInst::Predicate Pred0 = (Inverted ? Cmp0->getInversePredicate() :
+                               Cmp0->getPredicate());
+
+  // Accept x > -1 or x >= 0 (after potentially inverting the predicate).
+  if (!((Pred0 == ICmpInst::ICMP_SGT && RangeStart->isMinusOne()) ||
+        (Pred0 == ICmpInst::ICMP_SGE && RangeStart->isZero())))
+    return nullptr;
+
+  ICmpInst::Predicate Pred1 = (Inverted ? Cmp1->getInversePredicate() :
+                               Cmp1->getPredicate());
+
+  Value *Input = Cmp0->getOperand(0);
+  Value *RangeEnd;
+  if (Cmp1->getOperand(0) == Input) {
+    // For the upper range compare we have: icmp x, n
+    RangeEnd = Cmp1->getOperand(1);
+  } else if (Cmp1->getOperand(1) == Input) {
+    // For the upper range compare we have: icmp n, x
+    RangeEnd = Cmp1->getOperand(0);
+    Pred1 = ICmpInst::getSwappedPredicate(Pred1);
+  } else {
+    return nullptr;
+  }
+
+  // Check the upper range comparison, e.g. x < n
+  ICmpInst::Predicate NewPred;
+  switch (Pred1) {
+    case ICmpInst::ICMP_SLT: NewPred = ICmpInst::ICMP_ULT; break;
+    case ICmpInst::ICMP_SLE: NewPred = ICmpInst::ICMP_ULE; break;
+    default: return nullptr;
+  }
+
+  // This simplification is only valid if the upper range is not negative.
+  bool IsNegative, IsNotNegative;
+  ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1);
+  if (!IsNotNegative)
+    return nullptr;
+
+  if (Inverted)
+    NewPred = ICmpInst::getInversePredicate(NewPred);
+
+  return Builder->CreateICmp(NewPred, Input, RangeEnd);
+}
+
 /// FoldAndOfICmps - Fold (icmp)&(icmp) if possible.
 Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
@@ -807,6 +900,14 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
   if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder))
     return V;
 
+  // E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
+  if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/false))
+    return V;
+
+  // E.g. (icmp slt x, n) & (icmp sge x, 0) --> icmp ult x, n
+  if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false))
+    return V;
+
   // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
   Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
   ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
@@ -1108,7 +1209,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyAndInst(Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifyAndInst(Op0, Op1, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   // (A|B)&(A|C) -> A|(B&C) etc
@@ -1120,6 +1221,9 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
   if (SimplifyDemandedInstructionBits(I))
     return &I;
 
+  if (Value *V = SimplifyBSwap(I))
+    return ReplaceInstUsesWith(I, V);
+
   if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
     const APInt &AndRHSMask = AndRHS->getValue();
 
@@ -1605,15 +1709,15 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
       Value *Mask = nullptr;
       Value *Masked = nullptr;
       if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
-          isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, AT, CxtI, DT) &&
-          isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, AT, CxtI, DT)) {
+          isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, AC, CxtI, DT) &&
+          isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, AC, CxtI, DT)) {
         Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
         Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
       } else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
-                 isKnownToBeAPowerOfTwo(LAnd->getOperand(0),
-                                        false, 0, AT, CxtI, DT) &&
-                 isKnownToBeAPowerOfTwo(RAnd->getOperand(0),
-                                        false, 0, AT, CxtI, DT)) {
+                 isKnownToBeAPowerOfTwo(LAnd->getOperand(0), false, 0, AC, CxtI,
+                                        DT) &&
+                 isKnownToBeAPowerOfTwo(RAnd->getOperand(0), false, 0, AC, CxtI,
+                                        DT)) {
         Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
         Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
       }
@@ -1724,6 +1828,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
           Builder->CreateAdd(B, ConstantInt::getSigned(B->getType(), -1)), A);
   }
 
+  // E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
+  if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/true))
+    return V;
+
+  // E.g. (icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n
+  if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true))
+    return V;
+ 
   // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
   if (!LHSCst || !RHSCst) return nullptr;
 
@@ -2033,7 +2145,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyOrInst(Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifyOrInst(Op0, Op1, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   // (A&B)|(A&C) -> A&(B|C) etc
@@ -2045,6 +2157,9 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   if (SimplifyDemandedInstructionBits(I))
     return &I;
 
+  if (Value *V = SimplifyBSwap(I))
+    return ReplaceInstUsesWith(I, V);
+
   if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
     ConstantInt *C1 = nullptr; Value *X = nullptr;
     // (X & C1) | C2 --> (X | C2) & (C1|C2)
@@ -2305,11 +2420,34 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   if (SwappedForXor)
     std::swap(Op0, Op1);
 
-  if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
-    if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+  {
+    ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
+    ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
+    if (LHS && RHS)
       if (Value *Res = FoldOrOfICmps(LHS, RHS, &I))
         return ReplaceInstUsesWith(I, Res);
 
+    // TODO: Make this recursive; it's a little tricky because an arbitrary
+    // number of 'or' instructions might have to be created.
+    Value *X, *Y;
+    if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+      if (auto *Cmp = dyn_cast<ICmpInst>(X))
+        if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I))
+          return ReplaceInstUsesWith(I, Builder->CreateOr(Res, Y));
+      if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+        if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I))
+          return ReplaceInstUsesWith(I, Builder->CreateOr(Res, X));
+    }
+    if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+      if (auto *Cmp = dyn_cast<ICmpInst>(X))
+        if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I))
+          return ReplaceInstUsesWith(I, Builder->CreateOr(Res, Y));
+      if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+        if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I))
+          return ReplaceInstUsesWith(I, Builder->CreateOr(Res, X));
+    }
+  }
+
   // (fcmp uno x, c) | (fcmp uno y, c)  -> (fcmp uno x, y)
   if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
     if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
@@ -2394,7 +2532,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyXorInst(Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifyXorInst(Op0, Op1, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   // (A&B)^(A&C) -> A&(B^C) etc
@@ -2406,6 +2544,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
   if (SimplifyDemandedInstructionBits(I))
     return &I;
 
+  if (Value *V = SimplifyBSwap(I))
+    return ReplaceInstUsesWith(I, V);
+
   // Is this a ~ operation?
   if (Value *NotOp = dyn_castNotVal(&I)) {
     if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(NotOp)) {
@@ -2426,8 +2567,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
 
         // ~(X & Y) --> (~X | ~Y) - De Morgan's Law
         // ~(X | Y) === (~X & ~Y) - De Morgan's Law
-        if (isFreeToInvert(Op0I->getOperand(0)) &&
-            isFreeToInvert(Op0I->getOperand(1))) {
+        if (IsFreeToInvert(Op0I->getOperand(0),
+                           Op0I->getOperand(0)->hasOneUse()) &&
+            IsFreeToInvert(Op0I->getOperand(1),
+                           Op0I->getOperand(1)->hasOneUse())) {
           Value *NotX =
             Builder->CreateNot(Op0I->getOperand(0), "notlhs");
           Value *NotY =
@@ -2445,15 +2588,16 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
     }
   }
 
-
-  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
-    if (RHS->isOne() && Op0->hasOneUse())
+  if (Constant *RHS = dyn_cast<Constant>(Op1)) {
+    if (RHS->isAllOnesValue() && Op0->hasOneUse())
       // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
       if (CmpInst *CI = dyn_cast<CmpInst>(Op0))
         return CmpInst::Create(CI->getOpcode(),
                                CI->getInversePredicate(),
                                CI->getOperand(0), CI->getOperand(1));
+  }
 
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
     // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp).
     if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
       if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) {
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 87e49a1..05e7162 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -11,15 +11,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 using namespace llvm;
 using namespace PatternMatch;
 
@@ -59,8 +61,8 @@ static Type *reduceToSingleValueType(Type *T) {
 }
 
 Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
-  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, AT, MI, DT);
-  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, AT, MI, DT);
+  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, AC, MI, DT);
+  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, AC, MI, DT);
   unsigned MinAlign = std::min(DstAlign, SrcAlign);
   unsigned CopyAlign = MI->getAlignment();
 
@@ -118,15 +120,14 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
         // If the memcpy has metadata describing the members, see if we can
         // get the TBAA tag describing our copy.
         if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
-          if (M->getNumOperands() == 3 &&
-              M->getOperand(0) &&
-              isa<ConstantInt>(M->getOperand(0)) &&
-              cast<ConstantInt>(M->getOperand(0))->isNullValue() &&
+          if (M->getNumOperands() == 3 && M->getOperand(0) &&
+              mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
+              mdconst::extract<ConstantInt>(M->getOperand(0))->isNullValue() &&
               M->getOperand(1) &&
-              isa<ConstantInt>(M->getOperand(1)) &&
-              cast<ConstantInt>(M->getOperand(1))->getValue() == Size &&
-              M->getOperand(2) &&
-              isa<MDNode>(M->getOperand(2)))
+              mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
+              mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
+                  Size &&
+              M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
             CopyMD = cast<MDNode>(M->getOperand(2));
         }
       }
@@ -155,7 +156,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
 }
 
 Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
-  unsigned Alignment = getKnownAlignment(MI->getDest(), DL, AT, MI, DT);
+  unsigned Alignment = getKnownAlignment(MI->getDest(), DL, AC, MI, DT);
   if (MI->getAlignment() < Alignment) {
     MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
                                              Alignment, false));
@@ -352,48 +353,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     break;
   case Intrinsic::uadd_with_overflow: {
     Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
-    IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType());
-    uint32_t BitWidth = IT->getBitWidth();
-    APInt LHSKnownZero(BitWidth, 0);
-    APInt LHSKnownOne(BitWidth, 0);
-    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, II);
-    bool LHSKnownNegative = LHSKnownOne[BitWidth - 1];
-    bool LHSKnownPositive = LHSKnownZero[BitWidth - 1];
-
-    if (LHSKnownNegative || LHSKnownPositive) {
-      APInt RHSKnownZero(BitWidth, 0);
-      APInt RHSKnownOne(BitWidth, 0);
-      computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, II);
-      bool RHSKnownNegative = RHSKnownOne[BitWidth - 1];
-      bool RHSKnownPositive = RHSKnownZero[BitWidth - 1];
-      if (LHSKnownNegative && RHSKnownNegative) {
-        // The sign bit is set in both cases: this MUST overflow.
-        // Create a simple add instruction, and insert it into the struct.
-        Value *Add = Builder->CreateAdd(LHS, RHS);
-        Add->takeName(&CI);
-        Constant *V[] = {
-          UndefValue::get(LHS->getType()),
-          ConstantInt::getTrue(II->getContext())
-        };
-        StructType *ST = cast<StructType>(II->getType());
-        Constant *Struct = ConstantStruct::get(ST, V);
-        return InsertValueInst::Create(Struct, Add, 0);
-      }
-
-      if (LHSKnownPositive && RHSKnownPositive) {
-        // The sign bit is clear in both cases: this CANNOT overflow.
-        // Create a simple add instruction, and insert it into the struct.
-        Value *Add = Builder->CreateNUWAdd(LHS, RHS);
-        Add->takeName(&CI);
-        Constant *V[] = {
-          UndefValue::get(LHS->getType()),
-          ConstantInt::getFalse(II->getContext())
-        };
-        StructType *ST = cast<StructType>(II->getType());
-        Constant *Struct = ConstantStruct::get(ST, V);
-        return InsertValueInst::Create(Struct, Add, 0);
-      }
-    }
+    OverflowResult OR = computeOverflowForUnsignedAdd(LHS, RHS, II);
+    if (OR == OverflowResult::NeverOverflows)
+      return CreateOverflowTuple(II, Builder->CreateNUWAdd(LHS, RHS), false);
+    if (OR == OverflowResult::AlwaysOverflows)
+      return CreateOverflowTuple(II, Builder->CreateAdd(LHS, RHS), true);
   }
   // FALL THROUGH uadd into sadd
   case Intrinsic::sadd_with_overflow:
@@ -413,13 +377,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
       // X + 0 -> {X, false}
       if (RHS->isZero()) {
-        Constant *V[] = {
-          UndefValue::get(II->getArgOperand(0)->getType()),
-          ConstantInt::getFalse(II->getContext())
-        };
-        Constant *Struct =
-          ConstantStruct::get(cast<StructType>(II->getType()), V);
-        return InsertValueInst::Create(Struct, II->getArgOperand(0), 0);
+        return CreateOverflowTuple(II, II->getArgOperand(0), false,
+                                    /*ReUseName*/false);
       }
     }
 
@@ -428,65 +387,43 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow) {
       Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
       if (WillNotOverflowSignedAdd(LHS, RHS, II)) {
-        Value *Add = Builder->CreateNSWAdd(LHS, RHS);
-        Add->takeName(&CI);
-        Constant *V[] = {UndefValue::get(Add->getType()), Builder->getFalse()};
-        StructType *ST = cast<StructType>(II->getType());
-        Constant *Struct = ConstantStruct::get(ST, V);
-        return InsertValueInst::Create(Struct, Add, 0);
+        return CreateOverflowTuple(II, Builder->CreateNSWAdd(LHS, RHS), false);
       }
     }
 
     break;
   case Intrinsic::usub_with_overflow:
-  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::ssub_with_overflow: {
+    Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
     // undef - X -> undef
     // X - undef -> undef
-    if (isa<UndefValue>(II->getArgOperand(0)) ||
-        isa<UndefValue>(II->getArgOperand(1)))
+    if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS))
       return ReplaceInstUsesWith(CI, UndefValue::get(II->getType()));
 
-    if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
       // X - 0 -> {X, false}
-      if (RHS->isZero()) {
-        Constant *V[] = {
-          UndefValue::get(II->getArgOperand(0)->getType()),
-          ConstantInt::getFalse(II->getContext())
-        };
-        Constant *Struct =
-          ConstantStruct::get(cast<StructType>(II->getType()), V);
-        return InsertValueInst::Create(Struct, II->getArgOperand(0), 0);
+      if (ConstRHS->isZero()) {
+        return CreateOverflowTuple(II, LHS, false, /*ReUseName*/false);
+      }
+    }
+    if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow) {
+      if (WillNotOverflowSignedSub(LHS, RHS, II)) {
+        return CreateOverflowTuple(II, Builder->CreateNSWSub(LHS, RHS), false);
+      }
+    } else {
+      if (WillNotOverflowUnsignedSub(LHS, RHS, II)) {
+        return CreateOverflowTuple(II, Builder->CreateNUWSub(LHS, RHS), false);
       }
     }
     break;
+  }
   case Intrinsic::umul_with_overflow: {
     Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
-    unsigned BitWidth = cast<IntegerType>(LHS->getType())->getBitWidth();
-
-    APInt LHSKnownZero(BitWidth, 0);
-    APInt LHSKnownOne(BitWidth, 0);
-    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, II);
-    APInt RHSKnownZero(BitWidth, 0);
-    APInt RHSKnownOne(BitWidth, 0);
-    computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, II);
-
-    // Get the largest possible values for each operand.
-    APInt LHSMax = ~LHSKnownZero;
-    APInt RHSMax = ~RHSKnownZero;
-
-    // If multiplying the maximum values does not overflow then we can turn
-    // this into a plain NUW mul.
-    bool Overflow;
-    LHSMax.umul_ov(RHSMax, Overflow);
-    if (!Overflow) {
-      Value *Mul = Builder->CreateNUWMul(LHS, RHS, "umul_with_overflow");
-      Constant *V[] = {
-        UndefValue::get(LHS->getType()),
-        Builder->getFalse()
-      };
-      Constant *Struct = ConstantStruct::get(cast<StructType>(II->getType()),V);
-      return InsertValueInst::Create(Struct, Mul, 0);
-    }
+    OverflowResult OR = computeOverflowForUnsignedMul(LHS, RHS, II);
+    if (OR == OverflowResult::NeverOverflows)
+      return CreateOverflowTuple(II, Builder->CreateNUWMul(LHS, RHS), false);
+    if (OR == OverflowResult::AlwaysOverflows)
+      return CreateOverflowTuple(II, Builder->CreateMul(LHS, RHS), true);
   } // FALL THROUGH
   case Intrinsic::smul_with_overflow:
     // Canonicalize constants into the RHS.
@@ -509,13 +446,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
 
       // X * 1 -> {X, false}
       if (RHSI->equalsInt(1)) {
-        Constant *V[] = {
-          UndefValue::get(II->getArgOperand(0)->getType()),
-          ConstantInt::getFalse(II->getContext())
-        };
-        Constant *Struct =
-          ConstantStruct::get(cast<StructType>(II->getType()), V);
-        return InsertValueInst::Create(Struct, II->getArgOperand(0), 0);
+        return CreateOverflowTuple(II, II->getArgOperand(0), false,
+                                    /*ReUseName*/false);
+      }
+    }
+    if (II->getIntrinsicID() == Intrinsic::smul_with_overflow) {
+      Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
+      if (WillNotOverflowSignedMul(LHS, RHS, II)) {
+        return CreateOverflowTuple(II, Builder->CreateNSWMul(LHS, RHS), false);
       }
     }
     break;
@@ -606,8 +544,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   case Intrinsic::ppc_altivec_lvx:
   case Intrinsic::ppc_altivec_lvxl:
     // Turn PPC lvx -> load if the pointer is known aligned.
-    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16,
-                                   DL, AT, II, DT) >= 16) {
+    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, AC, II, DT) >=
+        16) {
       Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
                                          PointerType::getUnqual(II->getType()));
       return new LoadInst(Ptr);
@@ -623,8 +561,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   case Intrinsic::ppc_altivec_stvx:
   case Intrinsic::ppc_altivec_stvxl:
     // Turn stvx -> store if the pointer is known aligned.
-    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16,
-                                   DL, AT, II, DT) >= 16) {
+    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, AC, II, DT) >=
+        16) {
       Type *OpPtrTy =
         PointerType::getUnqual(II->getArgOperand(0)->getType());
       Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
@@ -638,12 +576,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
     return new StoreInst(II->getArgOperand(0), Ptr, false, 1);
   }
+  case Intrinsic::ppc_qpx_qvlfs:
+    // Turn PPC QPX qvlfs -> load if the pointer is known aligned.
+    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, AC, II, DT) >=
+        16) {
+      Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+                                         PointerType::getUnqual(II->getType()));
+      return new LoadInst(Ptr);
+    }
+    break;
+  case Intrinsic::ppc_qpx_qvlfd:
+    // Turn PPC QPX qvlfd -> load if the pointer is known aligned.
+    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, AC, II, DT) >=
+        32) {
+      Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+                                         PointerType::getUnqual(II->getType()));
+      return new LoadInst(Ptr);
+    }
+    break;
+  case Intrinsic::ppc_qpx_qvstfs:
+    // Turn PPC QPX qvstfs -> store if the pointer is known aligned.
+    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, AC, II, DT) >=
+        16) {
+      Type *OpPtrTy =
+        PointerType::getUnqual(II->getArgOperand(0)->getType());
+      Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+      return new StoreInst(II->getArgOperand(0), Ptr);
+    }
+    break;
+  case Intrinsic::ppc_qpx_qvstfd:
+    // Turn PPC QPX qvstfd -> store if the pointer is known aligned.
+    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, AC, II, DT) >=
+        32) {
+      Type *OpPtrTy =
+        PointerType::getUnqual(II->getArgOperand(0)->getType());
+      Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+      return new StoreInst(II->getArgOperand(0), Ptr);
+    }
+    break;
   case Intrinsic::x86_sse_storeu_ps:
   case Intrinsic::x86_sse2_storeu_pd:
   case Intrinsic::x86_sse2_storeu_dq:
     // Turn X86 storeu -> store if the pointer is known aligned.
-    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16,
-                                   DL, AT, II, DT) >= 16) {
+    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, AC, II, DT) >=
+        16) {
       Type *OpPtrTy =
         PointerType::getUnqual(II->getArgOperand(1)->getType());
       Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
@@ -774,7 +750,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     // TODO: eventually we should lower this intrinsic to IR
     if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
       if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) {
-        if (CIWidth->equalsInt(64) && CIStart->isZero()) {
+        unsigned Index = CIStart->getZExtValue();
+        // From AMD documentation: "a value of zero in the field length is
+        // defined as length of 64".
+        unsigned Length = CIWidth->equalsInt(0) ? 64 : CIWidth->getZExtValue();
+
+        // From AMD documentation: "If the sum of the bit index + length field
+        // is greater than 64, the results are undefined".
+
+        // Note that both field index and field length are 8-bit quantities.
+        // Since variables 'Index' and 'Length' are unsigned values
+        // obtained from zero-extending field index and field length
+        // respectively, their sum should never wrap around.
+        if ((Index + Length) > 64)
+          return ReplaceInstUsesWith(CI, UndefValue::get(II->getType()));
+
+        if (Length == 64 && Index == 0) {
           Value *Vec = II->getArgOperand(1);
           Value *Undef = UndefValue::get(Vec->getType());
           const uint32_t Mask[] = { 0, 2 };
@@ -988,7 +979,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   case Intrinsic::arm_neon_vst2lane:
   case Intrinsic::arm_neon_vst3lane:
   case Intrinsic::arm_neon_vst4lane: {
-    unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL, AT, II, DT);
+    unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL, AC, II, DT);
     unsigned AlignArg = II->getNumArgOperands() - 1;
     ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
     if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
@@ -1128,7 +1119,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
           cast<Constant>(RHS)->isNullValue()) {
         LoadInst* LI = cast<LoadInst>(LHS);
         if (isValidAssumeForContext(II, LI, DL, DT)) {
-          MDNode* MD = MDNode::get(II->getContext(), ArrayRef<Value*>());
+          MDNode *MD = MDNode::get(II->getContext(), None);
           LI->setMetadata(LLVMContext::MD_nonnull, MD);
           return EraseInstFromFunction(*II);
         }
@@ -1145,6 +1136,48 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
 
     break;
   }
+  case Intrinsic::experimental_gc_relocate: {
+    // Translate facts known about a pointer before relocating into
+    // facts about the relocate value, while being careful to
+    // preserve relocation semantics.
+    GCRelocateOperands Operands(II);
+    Value *DerivedPtr = Operands.derivedPtr();
+
+    // Remove the relocation if unused, note that this check is required
+    // to prevent the cases below from looping forever.
+    if (II->use_empty())
+      return EraseInstFromFunction(*II);
+
+    // Undef is undef, even after relocation.
+    // TODO: provide a hook for this in GCStrategy.  This is clearly legal for
+    // most practical collectors, but there was discussion in the review thread
+    // about whether it was legal for all possible collectors.
+    if (isa<UndefValue>(DerivedPtr))
+      return ReplaceInstUsesWith(*II, DerivedPtr);
+
+    // The relocation of null will be null for most any collector.
+    // TODO: provide a hook for this in GCStrategy.  There might be some weird
+    // collector this property does not hold for.
+    if (isa<ConstantPointerNull>(DerivedPtr))
+      return ReplaceInstUsesWith(*II, DerivedPtr);
+
+    // isKnownNonNull -> nonnull attribute
+    if (isKnownNonNull(DerivedPtr))
+      II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+
+    // isDereferenceablePointer -> deref attribute
+    if (DerivedPtr->isDereferenceablePointer(DL)) {
+      if (Argument *A = dyn_cast<Argument>(DerivedPtr)) {
+        uint64_t Bytes = A->getDereferenceableBytes();
+        II->addDereferenceableAttr(AttributeSet::ReturnIndex, Bytes);
+      }
+    }
+
+    // TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
+    // Canonicalize on the type from the uses to the defs
+
+    // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
+  }
   }
 
   return visitCallSite(II);
@@ -1165,6 +1198,14 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
   if (!CI->isLosslessCast())
     return false;
 
+  // If this is a GC intrinsic, avoid munging types.  We need types for
+  // statepoint reconstruction in SelectionDAG.
+  // TODO: This is probably something which should be expanded to all
+  // intrinsics since the entire point of intrinsics is that
+  // they are understandable by the optimizer.
+  if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS))
+    return false;
+
   // The size of ByVal or InAlloca arguments is derived from the type, so we
   // can't change to a type with a different size.  If the size were
   // passed explicitly we could avoid this check.
@@ -1188,7 +1229,11 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
 Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *DL) {
   if (!CI->getCalledFunction()) return nullptr;
 
-  if (Value *With = Simplifier->optimizeCall(CI)) {
+  auto InstCombineRAUW = [this](Instruction *From, Value *With) {
+    ReplaceInstUsesWith(*From, With);
+  };
+  LibCallSimplifier Simplifier(DL, TLI, InstCombineRAUW);
+  if (Value *With = Simplifier.optimizeCall(CI)) {
     ++NumSimplified;
     return CI->use_empty() ? CI : ReplaceInstUsesWith(*CI, With);
   }
@@ -1380,6 +1425,10 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
   if (!Callee)
     return false;
+  // The prototype of thunks are a lie, don't try to directly call such
+  // functions.
+  if (Callee->hasFnAttribute("thunk"))
+    return false;
   Instruction *Caller = CS.getInstruction();
   const AttributeSet &CallerPAL = CS.getAttributes();
 
@@ -1397,7 +1446,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     if (NewRetTy->isStructTy())
       return false; // TODO: Handle multiple return values.
 
-    if (!CastInst::isBitCastable(NewRetTy, OldRetTy)) {
+    if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) {
       if (Callee->isDeclaration())
         return false;   // Cannot transform this return value.
 
@@ -1432,12 +1481,21 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
   unsigned NumActualArgs = CS.arg_size();
   unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
 
+  // Prevent us turning:
+  // declare void @takes_i32_inalloca(i32* inalloca)
+  //  call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0)
+  //
+  // into:
+  //  call void @takes_i32_inalloca(i32* null)
+  if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca))
+    return false;
+
   CallSite::arg_iterator AI = CS.arg_begin();
   for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
     Type *ParamTy = FT->getParamType(i);
     Type *ActTy = (*AI)->getType();
 
-    if (!CastInst::isBitCastable(ActTy, ParamTy))
+    if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
       return false;   // Cannot transform this parameter value.
 
     if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1).
@@ -1532,7 +1590,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     if ((*AI)->getType() == ParamTy) {
       Args.push_back(*AI);
     } else {
-      Args.push_back(Builder->CreateBitCast(*AI, ParamTy));
+      Args.push_back(Builder->CreateBitOrPointerCast(*AI, ParamTy));
     }
 
     // Add any parameter attributes.
@@ -1603,7 +1661,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
   Value *NV = NC;
   if (OldRetTy != NV->getType() && !Caller->use_empty()) {
     if (!NV->getType()->isVoidTy()) {
-      NV = NC = CastInst::Create(CastInst::BitCast, NC, OldRetTy);
+      NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
       NC->setDebugLoc(Caller->getDebugLoc());
 
       // If this is an invoke instruction, we should insert it after the first
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index aba77bb..3e2b719 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -11,11 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/PatternMatch.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 using namespace llvm;
 using namespace PatternMatch;
 
@@ -1064,6 +1064,15 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
   Value *Src = CI.getOperand(0);
   Type *SrcTy = Src->getType(), *DestTy = CI.getType();
 
+  // If we know that the value being extended is positive, we can use a zext
+  // instead. 
+  bool KnownZero, KnownOne;
+  ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI);
+  if (KnownZero) {
+    Value *ZExt = Builder->CreateZExt(Src, DestTy);
+    return ReplaceInstUsesWith(CI, ZExt);
+  }
+
   // Attempt to extend the entire input expression tree to the destination
   // type.   Only do this if the dest type is a simple type, don't convert the
   // expression tree to something weird like i93 unless the source is also
@@ -1269,6 +1278,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
         // type of OpI doesn't enter into things at all.  We simply evaluate
         // in whichever source type is larger, then convert to the
         // destination type.
+        if (SrcWidth == OpWidth)
+          break;
         if (LHSWidth < SrcWidth)
           LHSOrig = Builder->CreateFPExt(LHSOrig, RHSOrig->getType());
         else if (RHSWidth <= SrcWidth)
@@ -1330,22 +1341,57 @@ Instruction *InstCombiner::visitFPExt(CastInst &CI) {
   return commonCastTransforms(CI);
 }
 
+// fpto{s/u}i({u/s}itofp(X)) --> X or zext(X) or sext(X) or trunc(X)
+// This is safe if the intermediate type has enough bits in its mantissa to
+// accurately represent all values of X.  For example, this won't work with
+// i64 -> float -> i64.
+Instruction *InstCombiner::FoldItoFPtoI(Instruction &FI) {
+  if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0)))
+    return nullptr;
+  Instruction *OpI = cast<Instruction>(FI.getOperand(0));
+
+  Value *SrcI = OpI->getOperand(0);
+  Type *FITy = FI.getType();
+  Type *OpITy = OpI->getType();
+  Type *SrcTy = SrcI->getType();
+  bool IsInputSigned = isa<SIToFPInst>(OpI);
+  bool IsOutputSigned = isa<FPToSIInst>(FI);
+
+  // We can safely assume the conversion won't overflow the output range,
+  // because (for example) (uint8_t)18293.f is undefined behavior.
+
+  // Since we can assume the conversion won't overflow, our decision as to
+  // whether the input will fit in the float should depend on the minimum
+  // of the input range and output range.
+
+  // This means this is also safe for a signed input and unsigned output, since
+  // a negative input would lead to undefined behavior.
+  int InputSize = (int)SrcTy->getScalarSizeInBits() - IsInputSigned;
+  int OutputSize = (int)FITy->getScalarSizeInBits() - IsOutputSigned;
+  int ActualSize = std::min(InputSize, OutputSize);
+
+  if (ActualSize <= OpITy->getFPMantissaWidth()) {
+    if (FITy->getScalarSizeInBits() > SrcTy->getScalarSizeInBits()) {
+      if (IsInputSigned && IsOutputSigned)
+        return new SExtInst(SrcI, FITy);
+      return new ZExtInst(SrcI, FITy);
+    }
+    if (FITy->getScalarSizeInBits() < SrcTy->getScalarSizeInBits())
+      return new TruncInst(SrcI, FITy);
+    if (SrcTy == FITy)
+      return ReplaceInstUsesWith(FI, SrcI);
+    return new BitCastInst(SrcI, FITy);
+  }
+  return nullptr;
+}
+
 Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) {
   Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
   if (!OpI)
     return commonCastTransforms(FI);
 
-  // fptoui(uitofp(X)) --> X
-  // fptoui(sitofp(X)) --> X
-  // This is safe if the intermediate type has enough bits in its mantissa to
-  // accurately represent all values of X.  For example, do not do this with
-  // i64->float->i64.  This is also safe for sitofp case, because any negative
-  // 'X' value would cause an undefined result for the fptoui.
-  if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
-      OpI->getOperand(0)->getType() == FI.getType() &&
-      (int)FI.getType()->getScalarSizeInBits() < /*extra bit for sign */
-                    OpI->getType()->getFPMantissaWidth())
-    return ReplaceInstUsesWith(FI, OpI->getOperand(0));
+  if (Instruction *I = FoldItoFPtoI(FI))
+    return I;
 
   return commonCastTransforms(FI);
 }
@@ -1355,17 +1401,8 @@ Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) {
   if (!OpI)
     return commonCastTransforms(FI);
 
-  // fptosi(sitofp(X)) --> X
-  // fptosi(uitofp(X)) --> X
-  // This is safe if the intermediate type has enough bits in its mantissa to
-  // accurately represent all values of X.  For example, do not do this with
-  // i64->float->i64.  This is also safe for sitofp case, because any negative
-  // 'X' value would cause an undefined result for the fptoui.
-  if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
-      OpI->getOperand(0)->getType() == FI.getType() &&
-      (int)FI.getType()->getScalarSizeInBits() <=
-                    OpI->getType()->getFPMantissaWidth())
-    return ReplaceInstUsesWith(FI, OpI->getOperand(0));
+  if (Instruction *I = FoldItoFPtoI(FI))
+    return I;
 
   return commonCastTransforms(FI);
 }
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 399f1c3..f48d89b 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -11,7 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
@@ -20,12 +22,20 @@
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+
 using namespace llvm;
 using namespace PatternMatch;
 
 #define DEBUG_TYPE "instcombine"
 
+// How many times is a select replaced by one of its operands?
+STATISTIC(NumSel, "Number of select opts");
+
+// Initialization Routines
+
 static ConstantInt *getOne(Constant *C) {
   return ConstantInt::get(cast<IntegerType>(C->getType()), 1);
 }
@@ -1921,14 +1931,17 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
   if (DL && LHSCI->getOpcode() == Instruction::PtrToInt &&
       DL->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) {
     Value *RHSOp = nullptr;
-    if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) {
+    if (PtrToIntOperator *RHSC = dyn_cast<PtrToIntOperator>(ICI.getOperand(1))) {
+      Value *RHSCIOp = RHSC->getOperand(0);
+      if (RHSCIOp->getType()->getPointerAddressSpace() ==
+          LHSCIOp->getType()->getPointerAddressSpace()) {
+        RHSOp = RHSC->getOperand(0);
+        // If the pointer types don't match, insert a bitcast.
+        if (LHSCIOp->getType() != RHSOp->getType())
+          RHSOp = Builder->CreateBitCast(RHSOp, LHSCIOp->getType());
+      }
+    } else if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1)))
       RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
-    } else if (PtrToIntInst *RHSC = dyn_cast<PtrToIntInst>(ICI.getOperand(1))) {
-      RHSOp = RHSC->getOperand(0);
-      // If the pointer types don't match, insert a bitcast.
-      if (LHSCIOp->getType() != RHSOp->getType())
-        RHSOp = Builder->CreateBitCast(RHSOp, LHSCIOp->getType());
-    }
 
     if (RHSOp)
       return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSOp);
@@ -2446,6 +2459,122 @@ static bool swapMayExposeCSEOpportunities(const Value * Op0,
   return GlobalSwapBenefits > 0;
 }
 
+/// \brief Check that one use is in the same block as the definition and all
+/// other uses are in blocks dominated by a given block
+///
+/// \param DI Definition
+/// \param UI Use
+/// \param DB Block that must dominate all uses of \p DI outside
+///           the parent block
+/// \return true when \p UI is the only use of \p DI in the parent block
+/// and all other uses of \p DI are in blocks dominated by \p DB.
+///
+bool InstCombiner::dominatesAllUses(const Instruction *DI,
+                                    const Instruction *UI,
+                                    const BasicBlock *DB) const {
+  assert(DI && UI && "Instruction not defined\n");
+  // ignore incomplete definitions
+  if (!DI->getParent())
+    return false;
+  // DI and UI must be in the same block
+  if (DI->getParent() != UI->getParent())
+    return false;
+  // Protect from self-referencing blocks
+  if (DI->getParent() == DB)
+    return false;
+  // DominatorTree available?
+  if (!DT)
+    return false;
+  for (const User *U : DI->users()) {
+    auto *Usr = cast<Instruction>(U);
+    if (Usr != UI && !DT->dominates(DB, Usr->getParent()))
+      return false;
+  }
+  return true;
+}
+
+///
+/// true when the instruction sequence within a block is select-cmp-br.
+///
+static bool isChainSelectCmpBranch(const SelectInst *SI) {
+  const BasicBlock *BB = SI->getParent();
+  if (!BB)
+    return false;
+  auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator());
+  if (!BI || BI->getNumSuccessors() != 2)
+    return false;
+  auto *IC = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI))
+    return false;
+  return true;
+}
+
+///
+/// \brief True when a select result is replaced by one of its operands
+/// in select-icmp sequence. This will eventually result in the elimination
+/// of the select.
+///
+/// \param SI    Select instruction
+/// \param Icmp  Compare instruction
+/// \param SIOpd Operand that replaces the select
+///
+/// Notes:
+/// - The replacement is global and requires dominator information
+/// - The caller is responsible for the actual replacement
+///
+/// Example:
+///
+/// entry:
+///  %4 = select i1 %3, %C* %0, %C* null
+///  %5 = icmp eq %C* %4, null
+///  br i1 %5, label %9, label %7
+///  ...
+///  ; <label>:7                                       ; preds = %entry
+///  %8 = getelementptr inbounds %C* %4, i64 0, i32 0
+///  ...
+///
+/// can be transformed to
+///
+///  %5 = icmp eq %C* %0, null
+///  %6 = select i1 %3, i1 %5, i1 true
+///  br i1 %6, label %9, label %7
+///  ...
+///  ; <label>:7                                       ; preds = %entry
+///  %8 = getelementptr inbounds %C* %0, i64 0, i32 0  // replace by %0!
+///
+/// Similar when the first operand of the select is a constant or/and
+/// the compare is for not equal rather than equal.
+///
+/// NOTE: The function is only called when the select and compare constants
+/// are equal, the optimization can work only for EQ predicates. This is not a
+/// major restriction since a NE compare should be 'normalized' to an equal
+/// compare, which usually happens in the combiner and test case
+/// select-cmp-br.ll
+/// checks for it.
+bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
+                                             const ICmpInst *Icmp,
+                                             const unsigned SIOpd) {
+  assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");
+  if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
+    BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
+    // The check for the unique predecessor is not the best that can be
+    // done. But it protects efficiently against cases like  when SI's
+    // home block has two successors, Succ and Succ1, and Succ1 predecessor
+    // of Succ. Then SI can't be replaced by SIOpd because the use that gets
+    // replaced can be reached on either path. So the uniqueness check
+    // guarantees that the path all uses of SI (outside SI's parent) are on
+    // is disjoint from all other paths out of SI. But that information
+    // is more expensive to compute, and the trade-off here is in favor
+    // of compile-time.
+    if (Succ->getUniquePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
+      NumSel++;
+      SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent());
+      return true;
+    }
+  }
+  return false;
+}
+
 Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
   bool Changed = false;
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -2463,7 +2592,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
     Changed = true;
   }
 
-  if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   // comparing -val or val with non-zero is the same as just comparing val
@@ -2560,11 +2689,33 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
         return Res;
     }
 
-    // (icmp ne/eq (sub A B) 0) -> (icmp ne/eq A, B)
-    if (I.isEquality() && CI->isZero() &&
-        match(Op0, m_Sub(m_Value(A), m_Value(B)))) {
-      // (icmp cond A B) if cond is equality
-      return new ICmpInst(I.getPredicate(), A, B);
+    // The following transforms are only 'worth it' if the only user of the
+    // subtraction is the icmp.
+    if (Op0->hasOneUse()) {
+      // (icmp ne/eq (sub A B) 0) -> (icmp ne/eq A, B)
+      if (I.isEquality() && CI->isZero() &&
+          match(Op0, m_Sub(m_Value(A), m_Value(B))))
+        return new ICmpInst(I.getPredicate(), A, B);
+
+      // (icmp sgt (sub nsw A B), -1) -> (icmp sge A, B)
+      if (I.getPredicate() == ICmpInst::ICMP_SGT && CI->isAllOnesValue() &&
+          match(Op0, m_NSWSub(m_Value(A), m_Value(B))))
+        return new ICmpInst(ICmpInst::ICMP_SGE, A, B);
+
+      // (icmp sgt (sub nsw A B), 0) -> (icmp sgt A, B)
+      if (I.getPredicate() == ICmpInst::ICMP_SGT && CI->isZero() &&
+          match(Op0, m_NSWSub(m_Value(A), m_Value(B))))
+        return new ICmpInst(ICmpInst::ICMP_SGT, A, B);
+
+      // (icmp slt (sub nsw A B), 0) -> (icmp slt A, B)
+      if (I.getPredicate() == ICmpInst::ICMP_SLT && CI->isZero() &&
+          match(Op0, m_NSWSub(m_Value(A), m_Value(B))))
+        return new ICmpInst(ICmpInst::ICMP_SLT, A, B);
+
+      // (icmp slt (sub nsw A B), 1) -> (icmp sle A, B)
+      if (I.getPredicate() == ICmpInst::ICMP_SLT && CI->isOne() &&
+          match(Op0, m_NSWSub(m_Value(A), m_Value(B))))
+        return new ICmpInst(ICmpInst::ICMP_SLE, A, B);
     }
 
     // If we have an icmp le or icmp ge instruction, turn it into the
@@ -2898,18 +3049,39 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
         // comparison into the select arms, which will cause one to be
         // constant folded and the select turned into a bitwise or.
         Value *Op1 = nullptr, *Op2 = nullptr;
-        if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1)))
+        ConstantInt *CI = 0;
+        if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
           Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
-        if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2)))
+          CI = dyn_cast<ConstantInt>(Op1);
+        }
+        if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
           Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+          CI = dyn_cast<ConstantInt>(Op2);
+        }
 
         // We only want to perform this transformation if it will not lead to
         // additional code. This is true if either both sides of the select
         // fold to a constant (in which case the icmp is replaced with a select
         // which will usually simplify) or this is the only user of the
         // select (in which case we are trading a select+icmp for a simpler
-        // select+icmp).
-        if ((Op1 && Op2) || (LHSI->hasOneUse() && (Op1 || Op2))) {
+        // select+icmp) or all uses of the select can be replaced based on
+        // dominance information ("Global cases").
+        bool Transform = false;
+        if (Op1 && Op2)
+          Transform = true;
+        else if (Op1 || Op2) {
+          // Local case
+          if (LHSI->hasOneUse())
+            Transform = true;
+          // Global cases
+          else if (CI && !CI->isZero())
+            // When Op1 is constant try replacing select with second operand.
+            // Otherwise Op2 is constant and try replacing select with first
+            // operand.
+            Transform = replacedSelectWithOperand(cast<SelectInst>(LHSI), &I,
+                                                  Op1 ? 2 : 1);
+        }
+        if (Transform) {
           if (!Op1)
             Op1 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(1),
                                       RHSC, I.getName());
@@ -3255,9 +3427,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
     // and       (A & ~B) != 0 --> (A & B) == 0
     // if A is a power of 2.
     if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
-        match(Op1, m_Zero()) && isKnownToBeAPowerOfTwo(A, false,
-                                                       0, AT, &I, DT) &&
-                                I.isEquality())
+        match(Op1, m_Zero()) &&
+        isKnownToBeAPowerOfTwo(A, false, 0, AC, &I, DT) && I.isEquality())
       return new ICmpInst(I.getInversePredicate(),
                           Builder->CreateAnd(A, B),
                           Op1);
@@ -3448,7 +3619,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
 }
 
 /// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible.
-///
 Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
                                                 Instruction *LHSI,
                                                 Constant *RHSC) {
@@ -3460,18 +3630,49 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
   int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
   if (MantissaWidth == -1) return nullptr;  // Unknown.
 
+  IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
+
   // Check to see that the input is converted from an integer type that is small
   // enough that preserves all bits.  TODO: check here for "known" sign bits.
   // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e.
-  unsigned InputSize = LHSI->getOperand(0)->getType()->getScalarSizeInBits();
+  unsigned InputSize = IntTy->getScalarSizeInBits();
 
   // If this is a uitofp instruction, we need an extra bit to hold the sign.
   bool LHSUnsigned = isa<UIToFPInst>(LHSI);
   if (LHSUnsigned)
     ++InputSize;
 
+  if (I.isEquality()) {
+    FCmpInst::Predicate P = I.getPredicate();
+    bool IsExact = false;
+    APSInt RHSCvt(IntTy->getBitWidth(), LHSUnsigned);
+    RHS.convertToInteger(RHSCvt, APFloat::rmNearestTiesToEven, &IsExact);
+
+    // If the floating point constant isn't an integer value, we know if we will
+    // ever compare equal / not equal to it.
+    if (!IsExact) {
+      // TODO: Can never be -0.0 and other non-representable values
+      APFloat RHSRoundInt(RHS);
+      RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven);
+      if (RHS.compare(RHSRoundInt) != APFloat::cmpEqual) {
+        if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ)
+          return ReplaceInstUsesWith(I, Builder->getFalse());
+
+        assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE);
+        return ReplaceInstUsesWith(I, Builder->getTrue());
+      }
+    }
+
+    // TODO: If the constant is exactly representable, is it always OK to do
+    // equality compares as integer?
+  }
+
+  // Comparisons with zero are a special case where we know we won't lose
+  // information.
+  bool IsCmpZero = RHS.isPosZero();
+
   // If the conversion would lose info, don't hack on this.
-  if ((int)InputSize > MantissaWidth)
+  if ((int)InputSize > MantissaWidth && !IsCmpZero)
     return nullptr;
 
   // Otherwise, we can potentially simplify the comparison.  We know that it
@@ -3512,8 +3713,6 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
     return ReplaceInstUsesWith(I, Builder->getFalse());
   }
 
-  IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
-
   // Now we know that the APFloat is a normal number, zero or inf.
 
   // See if the FP constant is too large for the integer.  For example,
@@ -3663,7 +3862,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
 
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
-  if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   // Simplify 'fcmp pred X, X'
@@ -3766,40 +3965,42 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
         }
         break;
       case Instruction::Call: {
+        if (!RHSC->isNullValue())
+          break;
+
         CallInst *CI = cast<CallInst>(LHSI);
-        LibFunc::Func Func;
+        const Function *F = CI->getCalledFunction();
+        if (!F)
+          break;
+
         // Various optimization for fabs compared with zero.
-        if (RHSC->isNullValue() && CI->getCalledFunction() &&
-            TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
-            TLI->has(Func)) {
-          if (Func == LibFunc::fabs || Func == LibFunc::fabsf ||
-              Func == LibFunc::fabsl) {
-            switch (I.getPredicate()) {
-            default: break;
+        LibFunc::Func Func;
+        if (F->getIntrinsicID() == Intrinsic::fabs ||
+            (TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
+             (Func == LibFunc::fabs || Func == LibFunc::fabsf ||
+              Func == LibFunc::fabsl))) {
+          switch (I.getPredicate()) {
+          default:
+            break;
             // fabs(x) < 0 --> false
-            case FCmpInst::FCMP_OLT:
-              return ReplaceInstUsesWith(I, Builder->getFalse());
+          case FCmpInst::FCMP_OLT:
+            return ReplaceInstUsesWith(I, Builder->getFalse());
             // fabs(x) > 0 --> x != 0
-            case FCmpInst::FCMP_OGT:
-              return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0),
-                                  RHSC);
+          case FCmpInst::FCMP_OGT:
+            return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), RHSC);
             // fabs(x) <= 0 --> x == 0
-            case FCmpInst::FCMP_OLE:
-              return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0),
-                                  RHSC);
+          case FCmpInst::FCMP_OLE:
+            return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0), RHSC);
             // fabs(x) >= 0 --> !isnan(x)
-            case FCmpInst::FCMP_OGE:
-              return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0),
-                                  RHSC);
+          case FCmpInst::FCMP_OGE:
+            return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0), RHSC);
             // fabs(x) == 0 --> x == 0
             // fabs(x) != 0 --> x != 0
-            case FCmpInst::FCMP_OEQ:
-            case FCmpInst::FCMP_UEQ:
-            case FCmpInst::FCMP_ONE:
-            case FCmpInst::FCMP_UNE:
-              return new FCmpInst(I.getPredicate(), CI->getArgOperand(0),
-                                  RHSC);
-            }
+          case FCmpInst::FCMP_OEQ:
+          case FCmpInst::FCMP_UEQ:
+          case FCmpInst::FCMP_ONE:
+          case FCmpInst::FCMP_UNE:
+            return new FCmpInst(I.getPredicate(), CI->getArgOperand(0), RHSC);
           }
         }
       }
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
new file mode 100644
index 0000000..2fd5318
--- /dev/null
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -0,0 +1,529 @@
+//===- InstCombineInternal.h - InstCombine pass internals -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file provides internal interfaces used to implement the InstCombine.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+
+#define DEBUG_TYPE "instcombine"
+
+namespace llvm {
+class CallSite;
+class DataLayout;
+class DominatorTree;
+class TargetLibraryInfo;
+class DbgDeclareInst;
+class MemIntrinsic;
+class MemSetInst;
+
+/// \brief Specific patterns of select instructions we can match.
+enum SelectPatternFlavor {
+  SPF_UNKNOWN = 0,
+  SPF_SMIN,
+  SPF_UMIN,
+  SPF_SMAX,
+  SPF_UMAX,
+  SPF_ABS,
+  SPF_NABS
+};
+
+/// \brief Assign a complexity or rank value to LLVM Values.
+///
+/// This routine maps IR values to various complexity ranks:
+///   0 -> undef
+///   1 -> Constants
+///   2 -> Other non-instructions
+///   3 -> Arguments
+///   3 -> Unary operations
+///   4 -> Other instructions
+static inline unsigned getComplexity(Value *V) {
+  if (isa<Instruction>(V)) {
+    if (BinaryOperator::isNeg(V) || BinaryOperator::isFNeg(V) ||
+        BinaryOperator::isNot(V))
+      return 3;
+    return 4;
+  }
+  if (isa<Argument>(V))
+    return 3;
+  return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
+}
+
+/// \brief Add one to a Constant
+static inline Constant *AddOne(Constant *C) {
+  return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
+}
+/// \brief Subtract one from a Constant
+static inline Constant *SubOne(Constant *C) {
+  return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1));
+}
+
+/// \brief Return true if the specified value is free to invert (apply ~ to).
+/// This happens in cases where the ~ can be eliminated.  If WillInvertAllUses
+/// is true, work under the assumption that the caller intends to remove all
+/// uses of V and only keep uses of ~V.
+///
+static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) {
+  // ~(~(X)) -> X.
+  if (BinaryOperator::isNot(V))
+    return true;
+
+  // Constants can be considered to be not'ed values.
+  if (isa<ConstantInt>(V))
+    return true;
+
+  // Compares can be inverted if all of their uses are being modified to use the
+  // ~V.
+  if (isa<CmpInst>(V))
+    return WillInvertAllUses;
+
+  // If `V` is of the form `A + Constant` then `-1 - V` can be folded into `(-1
+  // - Constant) - A` if we are willing to invert all of the uses.
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
+    if (BO->getOpcode() == Instruction::Add ||
+        BO->getOpcode() == Instruction::Sub)
+      if (isa<Constant>(BO->getOperand(0)) || isa<Constant>(BO->getOperand(1)))
+        return WillInvertAllUses;
+
+  return false;
+}
+
+/// \brief An IRBuilder inserter that adds new instructions to the instcombine
+/// worklist.
+class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter
+    : public IRBuilderDefaultInserter<true> {
+  InstCombineWorklist &Worklist;
+  AssumptionCache *AC;
+
+public:
+  InstCombineIRInserter(InstCombineWorklist &WL, AssumptionCache *AC)
+      : Worklist(WL), AC(AC) {}
+
+  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
+                    BasicBlock::iterator InsertPt) const {
+    IRBuilderDefaultInserter<true>::InsertHelper(I, Name, BB, InsertPt);
+    Worklist.Add(I);
+
+    using namespace llvm::PatternMatch;
+    if (match(I, m_Intrinsic<Intrinsic::assume>()))
+      AC->registerAssumption(cast<CallInst>(I));
+  }
+};
+
+/// \brief The core instruction combiner logic.
+///
+/// This class provides both the logic to recursively visit instructions and
+/// combine them, as well as the pass infrastructure for running this as part
+/// of the LLVM pass pipeline.
+class LLVM_LIBRARY_VISIBILITY InstCombiner
+    : public InstVisitor<InstCombiner, Instruction *> {
+  // FIXME: These members shouldn't be public.
+public:
+  /// \brief A worklist of the instructions that need to be simplified.
+  InstCombineWorklist &Worklist;
+
+  /// \brief An IRBuilder that automatically inserts new instructions into the
+  /// worklist.
+  typedef IRBuilder<true, TargetFolder, InstCombineIRInserter> BuilderTy;
+  BuilderTy *Builder;
+
+private:
+  // Mode in which we are running the combiner.
+  const bool MinimizeSize;
+
+  // Required analyses.
+  // FIXME: These can never be null and should be references.
+  AssumptionCache *AC;
+  TargetLibraryInfo *TLI;
+  DominatorTree *DT;
+
+  // Optional analyses. When non-null, these can both be used to do better
+  // combining and will be updated to reflect any changes.
+  const DataLayout *DL;
+  LoopInfo *LI;
+
+  bool MadeIRChange;
+
+public:
+  InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder,
+               bool MinimizeSize, AssumptionCache *AC, TargetLibraryInfo *TLI,
+               DominatorTree *DT, const DataLayout *DL, LoopInfo *LI)
+      : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
+        AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {}
+
+  /// \brief Run the combiner over the entire worklist until it is empty.
+  ///
+  /// \returns true if the IR is changed.
+  bool run();
+
+  AssumptionCache *getAssumptionCache() const { return AC; }
+
+  const DataLayout *getDataLayout() const { return DL; }
+
+  DominatorTree *getDominatorTree() const { return DT; }
+
+  LoopInfo *getLoopInfo() const { return LI; }
+
+  TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; }
+
+  // Visitation implementation - Implement instruction combining for different
+  // instruction types.  The semantics are as follows:
+  // Return Value:
+  //    null        - No change was made
+  //     I          - Change was made, I is still valid, I may be dead though
+  //   otherwise    - Change was made, replace I with returned instruction
+  //
+  Instruction *visitAdd(BinaryOperator &I);
+  Instruction *visitFAdd(BinaryOperator &I);
+  Value *OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty);
+  Instruction *visitSub(BinaryOperator &I);
+  Instruction *visitFSub(BinaryOperator &I);
+  Instruction *visitMul(BinaryOperator &I);
+  Value *foldFMulConst(Instruction *FMulOrDiv, Constant *C,
+                       Instruction *InsertBefore);
+  Instruction *visitFMul(BinaryOperator &I);
+  Instruction *visitURem(BinaryOperator &I);
+  Instruction *visitSRem(BinaryOperator &I);
+  Instruction *visitFRem(BinaryOperator &I);
+  bool SimplifyDivRemOfSelect(BinaryOperator &I);
+  Instruction *commonRemTransforms(BinaryOperator &I);
+  Instruction *commonIRemTransforms(BinaryOperator &I);
+  Instruction *commonDivTransforms(BinaryOperator &I);
+  Instruction *commonIDivTransforms(BinaryOperator &I);
+  Instruction *visitUDiv(BinaryOperator &I);
+  Instruction *visitSDiv(BinaryOperator &I);
+  Instruction *visitFDiv(BinaryOperator &I);
+  Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted);
+  Value *FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+  Value *FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
+  Instruction *visitAnd(BinaryOperator &I);
+  Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI);
+  Value *FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
+  Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A,
+                                   Value *B, Value *C);
+  Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, Value *A,
+                                    Value *B, Value *C);
+  Instruction *visitOr(BinaryOperator &I);
+  Instruction *visitXor(BinaryOperator &I);
+  Instruction *visitShl(BinaryOperator &I);
+  Instruction *visitAShr(BinaryOperator &I);
+  Instruction *visitLShr(BinaryOperator &I);
+  Instruction *commonShiftTransforms(BinaryOperator &I);
+  Instruction *FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *LHSI,
+                                    Constant *RHSC);
+  Instruction *FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
+                                            GlobalVariable *GV, CmpInst &ICI,
+                                            ConstantInt *AndCst = nullptr);
+  Instruction *visitFCmpInst(FCmpInst &I);
+  Instruction *visitICmpInst(ICmpInst &I);
+  Instruction *visitICmpInstWithCastAndCast(ICmpInst &ICI);
+  Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI, Instruction *LHS,
+                                              ConstantInt *RHS);
+  Instruction *FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
+                              ConstantInt *DivRHS);
+  Instruction *FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *DivI,
+                              ConstantInt *DivRHS);
+  Instruction *FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A,
+                                 ConstantInt *CI1, ConstantInt *CI2);
+  Instruction *FoldICmpCstShlCst(ICmpInst &I, Value *Op, Value *A,
+                                 ConstantInt *CI1, ConstantInt *CI2);
+  Instruction *FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI,
+                                ICmpInst::Predicate Pred);
+  Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
+                           ICmpInst::Predicate Cond, Instruction &I);
+  Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1,
+                                   BinaryOperator &I);
+  Instruction *commonCastTransforms(CastInst &CI);
+  Instruction *commonPointerCastTransforms(CastInst &CI);
+  Instruction *visitTrunc(TruncInst &CI);
+  Instruction *visitZExt(ZExtInst &CI);
+  Instruction *visitSExt(SExtInst &CI);
+  Instruction *visitFPTrunc(FPTruncInst &CI);
+  Instruction *visitFPExt(CastInst &CI);
+  Instruction *visitFPToUI(FPToUIInst &FI);
+  Instruction *visitFPToSI(FPToSIInst &FI);
+  Instruction *visitUIToFP(CastInst &CI);
+  Instruction *visitSIToFP(CastInst &CI);
+  Instruction *visitPtrToInt(PtrToIntInst &CI);
+  Instruction *visitIntToPtr(IntToPtrInst &CI);
+  Instruction *visitBitCast(BitCastInst &CI);
+  Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
+  Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI);
+  Instruction *FoldSelectIntoOp(SelectInst &SI, Value *, Value *);
+  Instruction *FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
+                            Value *A, Value *B, Instruction &Outer,
+                            SelectPatternFlavor SPF2, Value *C);
+  Instruction *FoldItoFPtoI(Instruction &FI);
+  Instruction *visitSelectInst(SelectInst &SI);
+  Instruction *visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
+  Instruction *visitCallInst(CallInst &CI);
+  Instruction *visitInvokeInst(InvokeInst &II);
+
+  Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
+  Instruction *visitPHINode(PHINode &PN);
+  Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
+  Instruction *visitAllocaInst(AllocaInst &AI);
+  Instruction *visitAllocSite(Instruction &FI);
+  Instruction *visitFree(CallInst &FI);
+  Instruction *visitLoadInst(LoadInst &LI);
+  Instruction *visitStoreInst(StoreInst &SI);
+  Instruction *visitBranchInst(BranchInst &BI);
+  Instruction *visitSwitchInst(SwitchInst &SI);
+  Instruction *visitReturnInst(ReturnInst &RI);
+  Instruction *visitInsertValueInst(InsertValueInst &IV);
+  Instruction *visitInsertElementInst(InsertElementInst &IE);
+  Instruction *visitExtractElementInst(ExtractElementInst &EI);
+  Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
+  Instruction *visitExtractValueInst(ExtractValueInst &EV);
+  Instruction *visitLandingPadInst(LandingPadInst &LI);
+
+  // visitInstruction - Specify what to return for unhandled instructions...
+  Instruction *visitInstruction(Instruction &I) { return nullptr; }
+
+  // True when DB dominates all uses of DI execpt UI.
+  // UI must be in the same block as DI.
+  // The routine checks that the DI parent and DB are different.
+  bool dominatesAllUses(const Instruction *DI, const Instruction *UI,
+                        const BasicBlock *DB) const;
+
+  // Replace select with select operand SIOpd in SI-ICmp sequence when possible
+  bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
+                                 const unsigned SIOpd);
+
+private:
+  bool ShouldChangeType(Type *From, Type *To) const;
+  Value *dyn_castNegVal(Value *V) const;
+  Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const;
+  Type *FindElementAtOffset(Type *PtrTy, int64_t Offset,
+                            SmallVectorImpl<Value *> &NewIndices);
+  Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
+
+  /// \brief Classify whether a cast is worth optimizing.
+  ///
+  /// Returns true if the cast from "V to Ty" actually results in any code
+  /// being generated and is interesting to optimize out. If the cast can be
+  /// eliminated by some other simple transformation, we prefer to do the
+  /// simplification first.
+  bool ShouldOptimizeCast(Instruction::CastOps opcode, const Value *V,
+                          Type *Ty);
+
+  Instruction *visitCallSite(CallSite CS);
+  Instruction *tryOptimizeCall(CallInst *CI, const DataLayout *DL);
+  bool transformConstExprCastCall(CallSite CS);
+  Instruction *transformCallThroughTrampoline(CallSite CS,
+                                              IntrinsicInst *Tramp);
+  Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
+                                 bool DoXform = true);
+  Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
+  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction *CxtI);
+  bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction *CxtI);
+  bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction *CxtI);
+  bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction *CxtI);
+  Value *EmitGEPOffset(User *GEP);
+  Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
+  Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
+
+public:
+  /// \brief Inserts an instruction \p New before instruction \p Old
+  ///
+  /// Also adds the new instruction to the worklist and returns \p New so that
+  /// it is suitable for use as the return from the visitation patterns.
+  Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
+    assert(New && !New->getParent() &&
+           "New instruction already inserted into a basic block!");
+    BasicBlock *BB = Old.getParent();
+    BB->getInstList().insert(&Old, New); // Insert inst
+    Worklist.Add(New);
+    return New;
+  }
+
+  /// \brief Same as InsertNewInstBefore, but also sets the debug loc.
+  Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
+    New->setDebugLoc(Old.getDebugLoc());
+    return InsertNewInstBefore(New, Old);
+  }
+
+  /// \brief A combiner-aware RAUW-like routine.
+  ///
+  /// This method is to be used when an instruction is found to be dead,
+  /// replacable with another preexisting expression. Here we add all uses of
+  /// I to the worklist, replace all uses of I with the new value, then return
+  /// I, so that the inst combiner will know that I was modified.
+  Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
+    Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist.
+
+    // If we are replacing the instruction with itself, this must be in a
+    // segment of unreachable code, so just clobber the instruction.
+    if (&I == V)
+      V = UndefValue::get(I.getType());
+
+    DEBUG(dbgs() << "IC: Replacing " << I << "\n"
+                 << "    with " << *V << '\n');
+
+    I.replaceAllUsesWith(V);
+    return &I;
+  }
+
+  /// Creates a result tuple for an overflow intrinsic \p II with a given
+  /// \p Result and a constant \p Overflow value. If \p ReUseName is true the
+  /// \p Result's name is taken from \p II.
+  Instruction *CreateOverflowTuple(IntrinsicInst *II, Value *Result,
+                                   bool Overflow, bool ReUseName = true) {
+    if (ReUseName)
+      Result->takeName(II);
+    Constant *V[] = {UndefValue::get(Result->getType()),
+                     Overflow ? Builder->getTrue() : Builder->getFalse()};
+    StructType *ST = cast<StructType>(II->getType());
+    Constant *Struct = ConstantStruct::get(ST, V);
+    return InsertValueInst::Create(Struct, Result, 0);
+  }
+
+  /// \brief Combiner aware instruction erasure.
+  ///
+  /// When dealing with an instruction that has side effects or produces a void
+  /// value, we can't rely on DCE to delete the instruction. Instead, visit
+  /// methods should return the value returned by this function.
+  Instruction *EraseInstFromFunction(Instruction &I) {
+    DEBUG(dbgs() << "IC: ERASE " << I << '\n');
+
+    assert(I.use_empty() && "Cannot erase instruction that is used!");
+    // Make sure that we reprocess all operands now that we reduced their
+    // use counts.
+    if (I.getNumOperands() < 8) {
+      for (User::op_iterator i = I.op_begin(), e = I.op_end(); i != e; ++i)
+        if (Instruction *Op = dyn_cast<Instruction>(*i))
+          Worklist.Add(Op);
+    }
+    Worklist.Remove(&I);
+    I.eraseFromParent();
+    MadeIRChange = true;
+    return nullptr; // Don't do anything with FI
+  }
+
+  void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
+                        unsigned Depth = 0, Instruction *CxtI = nullptr) const {
+    return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth, AC, CxtI,
+                                  DT);
+  }
+
+  bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0,
+                         Instruction *CxtI = nullptr) const {
+    return llvm::MaskedValueIsZero(V, Mask, DL, Depth, AC, CxtI, DT);
+  }
+  unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0,
+                              Instruction *CxtI = nullptr) const {
+    return llvm::ComputeNumSignBits(Op, DL, Depth, AC, CxtI, DT);
+  }
+  void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
+                      unsigned Depth = 0, Instruction *CxtI = nullptr) const {
+    return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, AC, CxtI,
+                                DT);
+  }
+  OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS,
+                                               const Instruction *CxtI) {
+    return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, AC, CxtI, DT);
+  }
+  OverflowResult computeOverflowForUnsignedAdd(Value *LHS, Value *RHS,
+                                               const Instruction *CxtI) {
+    return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, AC, CxtI, DT);
+  }
+
+private:
+  /// \brief Performs a few simplifications for operators which are associative
+  /// or commutative.
+  bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
+
+  /// \brief Tries to simplify binary operations which some other binary
+  /// operation distributes over.
+  ///
+  /// It does this by either by factorizing out common terms (eg "(A*B)+(A*C)"
+  /// -> "A*(B+C)") or expanding out if this results in simplifications (eg: "A
+  /// & (B | C) -> (A&B) | (A&C)" if this is a win).  Returns the simplified
+  /// value, or null if it didn't simplify.
+  Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
+
+  /// \brief Attempts to replace V with a simpler value based on the demanded
+  /// bits.
+  Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero,
+                                 APInt &KnownOne, unsigned Depth,
+                                 Instruction *CxtI = nullptr);
+  bool SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt &KnownZero,
+                            APInt &KnownOne, unsigned Depth = 0);
+  /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
+  /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
+  Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl,
+                                    APInt DemandedMask, APInt &KnownZero,
+                                    APInt &KnownOne);
+
+  /// \brief Tries to simplify operands to an integer instruction based on its
+  /// demanded bits.
+  bool SimplifyDemandedInstructionBits(Instruction &Inst);
+
+  Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+                                    APInt &UndefElts, unsigned Depth = 0);
+
+  Value *SimplifyVectorOp(BinaryOperator &Inst);
+  Value *SimplifyBSwap(BinaryOperator &Inst);
+
+  // FoldOpIntoPhi - Given a binary operator, cast instruction, or select
+  // which has a PHI node as operand #0, see if we can fold the instruction
+  // into the PHI (which is only possible if all operands to the PHI are
+  // constants).
+  //
+  Instruction *FoldOpIntoPhi(Instruction &I);
+
+  /// \brief Try to rotate an operation below a PHI node, using PHI nodes for
+  /// its operands.
+  Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
+  Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);
+  Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);
+  Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN);
+
+  Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
+                        ConstantInt *AndRHS, BinaryOperator &TheAnd);
+
+  Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
+                            bool isSub, Instruction &I);
+  Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned,
+                         bool Inside);
+  Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
+  Instruction *MatchBSwap(BinaryOperator &I);
+  bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+  Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
+  Instruction *SimplifyMemSet(MemSetInst *MI);
+
+  Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
+
+  /// \brief Returns a value X such that Val = X * Scale, or null if none.
+  ///
+  /// If the multiplication is known not to overflow then NoSignedWrap is set.
+  Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
+};
+
+} // end namespace llvm.
+
+#undef DEBUG_TYPE
+
+#endif
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index f3ac44c..b9eb986 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -11,12 +11,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
@@ -268,9 +269,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
     // is only subsequently read.
     SmallVector<Instruction *, 4> ToDelete;
     if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
-      unsigned SourceAlign = getOrEnforceKnownAlignment(Copy->getSource(),
-                                                        AI.getAlignment(),
-                                                        DL, AT, &AI, DT);
+      unsigned SourceAlign = getOrEnforceKnownAlignment(
+          Copy->getSource(), AI.getAlignment(), DL, AC, &AI, DT);
       if (AI.getAlignment() <= SourceAlign) {
         DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
         DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
@@ -310,6 +310,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
   LoadInst *NewLoad = IC.Builder->CreateAlignedLoad(
       IC.Builder->CreateBitCast(Ptr, NewTy->getPointerTo(AS)),
       LI.getAlignment(), LI.getName());
+  MDBuilder MDB(NewLoad->getContext());
   for (const auto &MDPair : MD) {
     unsigned ID = MDPair.first;
     MDNode *N = MDPair.second;
@@ -331,20 +332,86 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
     case LLVMContext::MD_noalias:
     case LLVMContext::MD_nontemporal:
     case LLVMContext::MD_mem_parallel_loop_access:
-    case LLVMContext::MD_nonnull:
       // All of these directly apply.
       NewLoad->setMetadata(ID, N);
       break;
 
+    case LLVMContext::MD_nonnull:
+      // This only directly applies if the new type is also a pointer.
+      if (NewTy->isPointerTy()) {
+        NewLoad->setMetadata(ID, N);
+        break;
+      }
+      // If it's integral now, translate it to !range metadata.
+      if (NewTy->isIntegerTy()) {
+        auto *ITy = cast<IntegerType>(NewTy);
+        auto *NullInt = ConstantExpr::getPtrToInt(
+            ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy);
+        auto *NonNullInt =
+            ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1));
+        NewLoad->setMetadata(LLVMContext::MD_range,
+                             MDB.createRange(NonNullInt, NullInt));
+      }
+      break;
+
     case LLVMContext::MD_range:
       // FIXME: It would be nice to propagate this in some way, but the type
-      // conversions make it hard.
+      // conversions make it hard. If the new type is a pointer, we could
+      // translate it to !nonnull metadata.
       break;
     }
   }
   return NewLoad;
 }
 
+/// \brief Combine a store to a new type.
+///
+/// Returns the newly created store instruction.
+static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value *V) {
+  Value *Ptr = SI.getPointerOperand();
+  unsigned AS = SI.getPointerAddressSpace();
+  SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+  SI.getAllMetadata(MD);
+
+  StoreInst *NewStore = IC.Builder->CreateAlignedStore(
+      V, IC.Builder->CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
+      SI.getAlignment());
+  for (const auto &MDPair : MD) {
+    unsigned ID = MDPair.first;
+    MDNode *N = MDPair.second;
+    // Note, essentially every kind of metadata should be preserved here! This
+    // routine is supposed to clone a store instruction changing *only its
+    // type*. The only metadata it makes sense to drop is metadata which is
+    // invalidated when the pointer type changes. This should essentially
+    // never be the case in LLVM, but we explicitly switch over only known
+    // metadata to be conservatively correct. If you are adding metadata to
+    // LLVM which pertains to stores, you almost certainly want to add it
+    // here.
+    switch (ID) {
+    case LLVMContext::MD_dbg:
+    case LLVMContext::MD_tbaa:
+    case LLVMContext::MD_prof:
+    case LLVMContext::MD_fpmath:
+    case LLVMContext::MD_tbaa_struct:
+    case LLVMContext::MD_alias_scope:
+    case LLVMContext::MD_noalias:
+    case LLVMContext::MD_nontemporal:
+    case LLVMContext::MD_mem_parallel_loop_access:
+      // All of these directly apply.
+      NewStore->setMetadata(ID, N);
+      break;
+
+    case LLVMContext::MD_invariant_load:
+    case LLVMContext::MD_nonnull:
+    case LLVMContext::MD_range:
+      // These don't apply for stores.
+      break;
+    }
+  }
+
+  return NewStore;
+}
+
 /// \brief Combine loads to match the type of value their uses after looking
 /// through intervening bitcasts.
 ///
@@ -371,6 +438,35 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
   if (LI.use_empty())
     return nullptr;
 
+  Type *Ty = LI.getType();
+
+  // Try to canonicalize loads which are only ever stored to operate over
+  // integers instead of any other type. We only do this when the loaded type
+  // is sized and has a size exactly the same as its store size and the store
+  // size is a legal integer type.
+  const DataLayout *DL = IC.getDataLayout();
+  if (!Ty->isIntegerTy() && Ty->isSized() && DL &&
+      DL->isLegalInteger(DL->getTypeStoreSizeInBits(Ty)) &&
+      DL->getTypeStoreSizeInBits(Ty) == DL->getTypeSizeInBits(Ty)) {
+    if (std::all_of(LI.user_begin(), LI.user_end(), [&LI](User *U) {
+          auto *SI = dyn_cast<StoreInst>(U);
+          return SI && SI->getPointerOperand() != &LI;
+        })) {
+      LoadInst *NewLoad = combineLoadToNewType(
+          IC, LI,
+          Type::getIntNTy(LI.getContext(), DL->getTypeStoreSizeInBits(Ty)));
+      // Replace all the stores with stores of the newly loaded value.
+      for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) {
+        auto *SI = cast<StoreInst>(*UI++);
+        IC.Builder->SetInsertPoint(SI);
+        combineStoreToNewValue(IC, *SI, NewLoad);
+        IC.EraseInstFromFunction(*SI);
+      }
+      assert(LI.use_empty() && "Failed to remove all users of the load!");
+      // Return the old load so the combiner can delete it safely.
+      return &LI;
+    }
+  }
 
   // Fold away bit casts of the loaded value by loading the desired type.
   if (LI.hasOneUse())
@@ -386,6 +482,181 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
   return nullptr;
 }
 
+// If we can determine that all possible objects pointed to by the provided
+// pointer value are, not only dereferenceable, but also definitively less than
+// or equal to the provided maximum size, then return true. Otherwise, return
+// false (constant global values and allocas fall into this category).
+//
+// FIXME: This should probably live in ValueTracking (or similar).
+static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
+                                     const DataLayout *DL) {
+  SmallPtrSet<Value *, 4> Visited;
+  SmallVector<Value *, 4> Worklist(1, V);
+
+  do {
+    Value *P = Worklist.pop_back_val();
+    P = P->stripPointerCasts();
+
+    if (!Visited.insert(P).second)
+      continue;
+
+    if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
+      Worklist.push_back(SI->getTrueValue());
+      Worklist.push_back(SI->getFalseValue());
+      continue;
+    }
+
+    if (PHINode *PN = dyn_cast<PHINode>(P)) {
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        Worklist.push_back(PN->getIncomingValue(i));
+      continue;
+    }
+
+    if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
+      if (GA->mayBeOverridden())
+        return false;
+      Worklist.push_back(GA->getAliasee());
+      continue;
+    }
+
+    // If we know how big this object is, and it is less than MaxSize, continue
+    // searching. Otherwise, return false.
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
+      if (!AI->getAllocatedType()->isSized())
+        return false;
+
+      ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
+      if (!CS)
+        return false;
+
+      uint64_t TypeSize = DL->getTypeAllocSize(AI->getAllocatedType());
+      // Make sure that, even if the multiplication below would wrap as an
+      // uint64_t, we still do the right thing.
+      if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize))
+        return false;
+      continue;
+    }
+
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+      if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
+        return false;
+
+      uint64_t InitSize = DL->getTypeAllocSize(GV->getType()->getElementType());
+      if (InitSize > MaxSize)
+        return false;
+      continue;
+    }
+
+    return false;
+  } while (!Worklist.empty());
+
+  return true;
+}
+
+// If we're indexing into an object of a known size, and the outer index is
+// not a constant, but having any value but zero would lead to undefined
+// behavior, replace it with zero.
+//
+// For example, if we have:
+// @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
+// ...
+// %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
+// ... = load i32* %arrayidx, align 4
+// Then we know that we can replace %x in the GEP with i64 0.
+//
+// FIXME: We could fold any GEP index to zero that would cause UB if it were
+// not zero. Currently, we only handle the first such index. Also, we could
+// also search through non-zero constant indices if we kept track of the
+// offsets those indices implied.
+static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
+                                     Instruction *MemI, unsigned &Idx) {
+  const DataLayout *DL = IC.getDataLayout();
+  if (GEPI->getNumOperands() < 2 || !DL)
+    return false;
+
+  // Find the first non-zero index of a GEP. If all indices are zero, return
+  // one past the last index.
+  auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
+    unsigned I = 1;
+    for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
+      Value *V = GEPI->getOperand(I);
+      if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
+        if (CI->isZero())
+          continue;
+
+      break;
+    }
+
+    return I;
+  };
+
+  // Skip through initial 'zero' indices, and find the corresponding pointer
+  // type. See if the next index is not a constant.
+  Idx = FirstNZIdx(GEPI);
+  if (Idx == GEPI->getNumOperands())
+    return false;
+  if (isa<Constant>(GEPI->getOperand(Idx)))
+    return false;
+
+  SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
+  Type *AllocTy =
+    GetElementPtrInst::getIndexedType(GEPI->getOperand(0)->getType(), Ops);
+  if (!AllocTy || !AllocTy->isSized())
+    return false;
+  uint64_t TyAllocSize = DL->getTypeAllocSize(AllocTy);
+
+  // If there are more indices after the one we might replace with a zero, make
+  // sure they're all non-negative. If any of them are negative, the overall
+  // address being computed might be before the base address determined by the
+  // first non-zero index.
+  auto IsAllNonNegative = [&]() {
+    for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
+      bool KnownNonNegative, KnownNegative;
+      IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative,
+                        KnownNegative, 0, MemI);
+      if (KnownNonNegative)
+        continue;
+      return false;
+    }
+
+    return true;
+  };
+
+  // FIXME: If the GEP is not inbounds, and there are extra indices after the
+  // one we'll replace, those could cause the address computation to wrap
+  // (rendering the IsAllNonNegative() check below insufficient). We can do
+  // better, ignoring zero indicies (and other indicies we can prove small
+  // enough not to wrap).
+  if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
+    return false;
+
+  // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
+  // also known to be dereferenceable.
+  return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
+         IsAllNonNegative();
+}
+
+// If we're indexing into an object with a variable index for the memory
+// access, but the object has only one element, we can assume that the index
+// will always be zero. If we replace the GEP, return it.
+template <typename T>
+static Instruction *replaceGEPIdxWithZero(InstCombiner &IC, Value *Ptr,
+                                          T &MemI) {
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
+    unsigned Idx;
+    if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
+      Instruction *NewGEPI = GEPI->clone();
+      NewGEPI->setOperand(Idx,
+        ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
+      NewGEPI->insertBefore(GEPI);
+      MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
+      return NewGEPI;
+    }
+  }
+
+  return nullptr;
+}
+
 Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
   Value *Op = LI.getOperand(0);
 
@@ -395,9 +666,8 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
 
   // Attempt to improve the alignment.
   if (DL) {
-    unsigned KnownAlign =
-      getOrEnforceKnownAlignment(Op, DL->getPrefTypeAlignment(LI.getType()),
-                                 DL, AT, &LI, DT);
+    unsigned KnownAlign = getOrEnforceKnownAlignment(
+        Op, DL->getPrefTypeAlignment(LI.getType()), DL, AC, &LI, DT);
     unsigned LoadAlign = LI.getAlignment();
     unsigned EffectiveLoadAlign = LoadAlign != 0 ? LoadAlign :
       DL->getABITypeAlignment(LI.getType());
@@ -408,6 +678,12 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
       LI.setAlignment(EffectiveLoadAlign);
   }
 
+  // Replace GEP indices if possible.
+  if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
+      Worklist.Add(NewGEPI);
+      return &LI;
+  }
+
   // None of the following transforms are legal for volatile/atomic loads.
   // FIXME: Some of it is okay for atomic loads; needs refactoring.
   if (!LI.isSimple()) return nullptr;
@@ -418,7 +694,8 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
   BasicBlock::iterator BBI = &LI;
   if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6))
     return ReplaceInstUsesWith(
-        LI, Builder->CreateBitCast(AvailableVal, LI.getType()));
+        LI, Builder->CreateBitOrPointerCast(AvailableVal, LI.getType(),
+                                            LI.getName() + ".cast"));
 
   // load(gep null, ...) -> unreachable
   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
@@ -473,119 +750,61 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
       }
 
       // load (select (cond, null, P)) -> load P
-      if (Constant *C = dyn_cast<Constant>(SI->getOperand(1)))
-        if (C->isNullValue()) {
-          LI.setOperand(0, SI->getOperand(2));
-          return &LI;
-        }
+      if (isa<ConstantPointerNull>(SI->getOperand(1)) && 
+          LI.getPointerAddressSpace() == 0) {
+        LI.setOperand(0, SI->getOperand(2));
+        return &LI;
+      }
 
       // load (select (cond, P, null)) -> load P
-      if (Constant *C = dyn_cast<Constant>(SI->getOperand(2)))
-        if (C->isNullValue()) {
-          LI.setOperand(0, SI->getOperand(1));
-          return &LI;
-        }
+      if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
+          LI.getPointerAddressSpace() == 0) {
+        LI.setOperand(0, SI->getOperand(1));
+        return &LI;
+      }
     }
   }
   return nullptr;
 }
 
-/// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P
-/// when possible.  This makes it generally easy to do alias analysis and/or
-/// SROA/mem2reg of the memory object.
-static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
-  User *CI = cast<User>(SI.getOperand(1));
-  Value *CastOp = CI->getOperand(0);
-
-  Type *DestPTy = CI->getType()->getPointerElementType();
-  PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
-  if (!SrcTy) return nullptr;
-
-  Type *SrcPTy = SrcTy->getElementType();
-
-  if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy())
-    return nullptr;
-
-  /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep"
-  /// to its first element.  This allows us to handle things like:
-  ///   store i32 xxx, (bitcast {foo*, float}* %P to i32*)
-  /// on 32-bit hosts.
-  SmallVector<Value*, 4> NewGEPIndices;
-
-  // If the source is an array, the code below will not succeed.  Check to
-  // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
-  // constants.
-  if (SrcPTy->isArrayTy() || SrcPTy->isStructTy()) {
-    // Index through pointer.
-    Constant *Zero = Constant::getNullValue(Type::getInt32Ty(SI.getContext()));
-    NewGEPIndices.push_back(Zero);
-
-    while (1) {
-      if (StructType *STy = dyn_cast<StructType>(SrcPTy)) {
-        if (!STy->getNumElements()) /* Struct can be empty {} */
-          break;
-        NewGEPIndices.push_back(Zero);
-        SrcPTy = STy->getElementType(0);
-      } else if (ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) {
-        NewGEPIndices.push_back(Zero);
-        SrcPTy = ATy->getElementType();
-      } else {
-        break;
-      }
-    }
-
-    SrcTy = PointerType::get(SrcPTy, SrcTy->getAddressSpace());
-  }
-
-  if (!SrcPTy->isIntegerTy() && !SrcPTy->isPointerTy())
-    return nullptr;
-
-  // If the pointers point into different address spaces don't do the
-  // transformation.
-  if (SrcTy->getAddressSpace() != CI->getType()->getPointerAddressSpace())
-    return nullptr;
-
-  // If the pointers point to values of different sizes don't do the
-  // transformation.
-  if (!IC.getDataLayout() ||
-      IC.getDataLayout()->getTypeSizeInBits(SrcPTy) !=
-      IC.getDataLayout()->getTypeSizeInBits(DestPTy))
-    return nullptr;
+/// \brief Combine stores to match the type of value being stored.
+///
+/// The core idea here is that the memory does not have any intrinsic type and
+/// where we can we should match the type of a store to the type of value being
+/// stored.
+///
+/// However, this routine must never change the width of a store or the number of
+/// stores as that would introduce a semantic change. This combine is expected to
+/// be a semantic no-op which just allows stores to more closely model the types
+/// of their incoming values.
+///
+/// Currently, we also refuse to change the precise type used for an atomic or
+/// volatile store. This is debatable, and might be reasonable to change later.
+/// However, it is risky in case some backend or other part of LLVM is relying
+/// on the exact type stored to select appropriate atomic operations.
+///
+/// \returns true if the store was successfully combined away. This indicates
+/// the caller must erase the store instruction. We have to let the caller erase
+/// the store instruction sas otherwise there is no way to signal whether it was
+/// combined or not: IC.EraseInstFromFunction returns a null pointer.
+static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) {
+  // FIXME: We could probably with some care handle both volatile and atomic
+  // stores here but it isn't clear that this is important.
+  if (!SI.isSimple())
+    return false;
 
-  // If the pointers point to pointers to different address spaces don't do the
-  // transformation. It is not safe to introduce an addrspacecast instruction in
-  // this case since, depending on the target, addrspacecast may not be a no-op
-  // cast.
-  if (SrcPTy->isPointerTy() && DestPTy->isPointerTy() &&
-      SrcPTy->getPointerAddressSpace() != DestPTy->getPointerAddressSpace())
-    return nullptr;
+  Value *V = SI.getValueOperand();
 
-  // Okay, we are casting from one integer or pointer type to another of
-  // the same size.  Instead of casting the pointer before
-  // the store, cast the value to be stored.
-  Value *NewCast;
-  Instruction::CastOps opcode = Instruction::BitCast;
-  Type* CastSrcTy = DestPTy;
-  Type* CastDstTy = SrcPTy;
-  if (CastDstTy->isPointerTy()) {
-    if (CastSrcTy->isIntegerTy())
-      opcode = Instruction::IntToPtr;
-  } else if (CastDstTy->isIntegerTy()) {
-    if (CastSrcTy->isPointerTy())
-      opcode = Instruction::PtrToInt;
+  // Fold away bit casts of the stored value by storing the original type.
+  if (auto *BC = dyn_cast<BitCastInst>(V)) {
+    V = BC->getOperand(0);
+    combineStoreToNewValue(IC, SI, V);
+    return true;
   }
 
-  // SIOp0 is a pointer to aggregate and this is a store to the first field,
-  // emit a GEP to index into its first field.
-  if (!NewGEPIndices.empty())
-    CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices);
-
-  Value *SIOp0 = SI.getOperand(0);
-  NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy,
-                                   SIOp0->getName()+".c");
-  SI.setOperand(0, NewCast);
-  SI.setOperand(1, CastOp);
-  return &SI;
+  // FIXME: We should also canonicalize loads of vectors when their elements are
+  // cast to other types.
+  return false;
 }
 
 /// equivalentAddressValues - Test if A and B will obviously have the same
@@ -621,11 +840,14 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
   Value *Val = SI.getOperand(0);
   Value *Ptr = SI.getOperand(1);
 
+  // Try to canonicalize the stored type.
+  if (combineStoreToValueType(*this, SI))
+    return EraseInstFromFunction(SI);
+
   // Attempt to improve the alignment.
   if (DL) {
-    unsigned KnownAlign =
-      getOrEnforceKnownAlignment(Ptr, DL->getPrefTypeAlignment(Val->getType()),
-                                 DL, AT, &SI, DT);
+    unsigned KnownAlign = getOrEnforceKnownAlignment(
+        Ptr, DL->getPrefTypeAlignment(Val->getType()), DL, AC, &SI, DT);
     unsigned StoreAlign = SI.getAlignment();
     unsigned EffectiveStoreAlign = StoreAlign != 0 ? StoreAlign :
       DL->getABITypeAlignment(Val->getType());
@@ -636,6 +858,12 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
       SI.setAlignment(EffectiveStoreAlign);
   }
 
+  // Replace GEP indices if possible.
+  if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
+      Worklist.Add(NewGEPI);
+      return &SI;
+  }
+
   // Don't hack volatile/atomic stores.
   // FIXME: Some bits are legal for atomic stores; needs refactoring.
   if (!SI.isSimple()) return nullptr;
@@ -712,17 +940,6 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
   if (isa<UndefValue>(Val))
     return EraseInstFromFunction(SI);
 
-  // If the pointer destination is a cast, see if we can fold the cast into the
-  // source instead.
-  if (isa<CastInst>(Ptr))
-    if (Instruction *Res = InstCombineStoreToCast(*this, SI))
-      return Res;
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
-    if (CE->isCast())
-      if (Instruction *Res = InstCombineStoreToCast(*this, SI))
-        return Res;
-
-
   // If this store is the last instruction in the basic block (possibly
   // excepting debug info instructions), and if the block ends with an
   // unconditional branch, try to move it to the successor block.
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 8c48dce..c48e3c9 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
@@ -46,10 +46,10 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
   // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
   // inexact.  Similarly for <<.
   if (BinaryOperator *I = dyn_cast<BinaryOperator>(V))
-    if (I->isLogicalShift() && isKnownToBeAPowerOfTwo(I->getOperand(0), false,
-                                                      0, IC.getAssumptionTracker(),
-                                                      CxtI,
-                                                      IC.getDominatorTree())) {
+    if (I->isLogicalShift() &&
+        isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0,
+                               IC.getAssumptionCache(), CxtI,
+                               IC.getDominatorTree())) {
       // We know that this is an exact/nuw shift and that the input is a
       // non-zero context as well.
       if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
@@ -123,6 +123,48 @@ static Constant *getLogBase2Vector(ConstantDataVector *CV) {
   return ConstantVector::get(Elts);
 }
 
+/// \brief Return true if we can prove that:
+///    (mul LHS, RHS)  === (mul nsw LHS, RHS)
+bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,
+                                            Instruction *CxtI) {
+  // Multiplying n * m significant bits yields a result of n + m significant
+  // bits. If the total number of significant bits does not exceed the
+  // result bit width (minus 1), there is no overflow.
+  // This means if we have enough leading sign bits in the operands
+  // we can guarantee that the result does not overflow.
+  // Ref: "Hacker's Delight" by Henry Warren
+  unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
+
+  // Note that underestimating the number of sign bits gives a more
+  // conservative answer.
+  unsigned SignBits = ComputeNumSignBits(LHS, 0, CxtI) +
+                      ComputeNumSignBits(RHS, 0, CxtI);
+
+  // First handle the easy case: if we have enough sign bits there's
+  // definitely no overflow.
+  if (SignBits > BitWidth + 1)
+    return true;
+
+  // There are two ambiguous cases where there can be no overflow:
+  //   SignBits == BitWidth + 1    and
+  //   SignBits == BitWidth
+  // The second case is difficult to check, therefore we only handle the
+  // first case.
+  if (SignBits == BitWidth + 1) {
+    // It overflows only when both arguments are negative and the true
+    // product is exactly the minimum negative number.
+    // E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000
+    // For simplicity we just check if at least one side is not negative.
+    bool LHSNonNegative, LHSNegative;
+    bool RHSNonNegative, RHSNegative;
+    ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, CxtI);
+    ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, CxtI);
+    if (LHSNonNegative || RHSNonNegative)
+      return true;
+  }
+  return false;
+}
+
 Instruction *InstCombiner::visitMul(BinaryOperator &I) {
   bool Changed = SimplifyAssociativeOrCommutative(I);
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -130,14 +172,19 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyMulInst(Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifyMulInst(Op0, Op1, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   if (Value *V = SimplifyUsingDistributiveLaws(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (match(Op1, m_AllOnes()))  // X * -1 == 0 - X
-    return BinaryOperator::CreateNeg(Op0, I.getName());
+  // X * -1 == 0 - X
+  if (match(Op1, m_AllOnes())) {
+    BinaryOperator *BO = BinaryOperator::CreateNeg(Op0, I.getName());
+    if (I.hasNoSignedWrap())
+      BO->setHasNoSignedWrap();
+    return BO;
+  }
 
   // Also allow combining multiply instructions on vectors.
   {
@@ -146,9 +193,18 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
     const APInt *IVal;
     if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)),
                         m_Constant(C1))) &&
-        match(C1, m_APInt(IVal)))
-      // ((X << C1)*C2) == (X * (C2 << C1))
-      return BinaryOperator::CreateMul(NewOp, ConstantExpr::getShl(C1, C2));
+        match(C1, m_APInt(IVal))) {
+      // ((X << C2)*C1) == (X * (C1 << C2))
+      Constant *Shl = ConstantExpr::getShl(C1, C2);
+      BinaryOperator *Mul = cast<BinaryOperator>(I.getOperand(0));
+      BinaryOperator *BO = BinaryOperator::CreateMul(NewOp, Shl);
+      if (I.hasNoUnsignedWrap() && Mul->hasNoUnsignedWrap())
+        BO->setHasNoUnsignedWrap();
+      if (I.hasNoSignedWrap() && Mul->hasNoSignedWrap() &&
+          Shl->isNotMinSignedValue())
+        BO->setHasNoSignedWrap();
+      return BO;
+    }
 
     if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) {
       Constant *NewCst = nullptr;
@@ -165,6 +221,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
 
         if (I.hasNoUnsignedWrap())
           Shl->setHasNoUnsignedWrap();
+        if (I.hasNoSignedWrap() && NewCst->isNotMinSignedValue())
+          Shl->setHasNoSignedWrap();
 
         return Shl;
       }
@@ -221,9 +279,16 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
     }
   }
 
-  if (Value *Op0v = dyn_castNegVal(Op0))     // -X * -Y = X*Y
-    if (Value *Op1v = dyn_castNegVal(Op1))
-      return BinaryOperator::CreateMul(Op0v, Op1v);
+  if (Value *Op0v = dyn_castNegVal(Op0)) {   // -X * -Y = X*Y
+    if (Value *Op1v = dyn_castNegVal(Op1)) {
+      BinaryOperator *BO = BinaryOperator::CreateMul(Op0v, Op1v);
+      if (I.hasNoSignedWrap() &&
+          match(Op0, m_NSWSub(m_Value(), m_Value())) &&
+          match(Op1, m_NSWSub(m_Value(), m_Value())))
+        BO->setHasNoSignedWrap();
+      return BO;
+    }
+  }
 
   // (X / Y) *  Y = X - (X % Y)
   // (X / Y) * -Y = (X % Y) - X
@@ -272,10 +337,22 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
   // (1 << Y)*X --> X << Y
   {
     Value *Y;
-    if (match(Op0, m_Shl(m_One(), m_Value(Y))))
-      return BinaryOperator::CreateShl(Op1, Y);
-    if (match(Op1, m_Shl(m_One(), m_Value(Y))))
-      return BinaryOperator::CreateShl(Op0, Y);
+    BinaryOperator *BO = nullptr;
+    bool ShlNSW = false;
+    if (match(Op0, m_Shl(m_One(), m_Value(Y)))) {
+      BO = BinaryOperator::CreateShl(Op1, Y);
+      ShlNSW = cast<ShlOperator>(Op0)->hasNoSignedWrap();
+    } else if (match(Op1, m_Shl(m_One(), m_Value(Y)))) {
+      BO = BinaryOperator::CreateShl(Op0, Y);
+      ShlNSW = cast<ShlOperator>(Op1)->hasNoSignedWrap();
+    }
+    if (BO) {
+      if (I.hasNoUnsignedWrap())
+        BO->setHasNoUnsignedWrap();
+      if (I.hasNoSignedWrap() && ShlNSW)
+        BO->setHasNoSignedWrap();
+      return BO;
+    }
   }
 
   // If one of the operands of the multiply is a cast from a boolean value, then
@@ -298,6 +375,18 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
     }
   }
 
+  if (!I.hasNoSignedWrap() && WillNotOverflowSignedMul(Op0, Op1, &I)) {
+    Changed = true;
+    I.setHasNoSignedWrap(true);
+  }
+
+  if (!I.hasNoUnsignedWrap() &&
+      computeOverflowForUnsignedMul(Op0, Op1, &I) ==
+          OverflowResult::NeverOverflows) {
+    Changed = true;
+    I.setHasNoUnsignedWrap(true);
+  }
+
   return Changed ? &I : nullptr;
 }
 
@@ -441,8 +530,8 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
   if (isa<Constant>(Op0))
     std::swap(Op0, Op1);
 
-  if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), DL, TLI,
-                                  DT, AT))
+  if (Value *V =
+          SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   bool AllowReassociate = I.hasUnsafeAlgebra();
@@ -946,7 +1035,7 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyUDivInst(Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifyUDivInst(Op0, Op1, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   // Handle the integer div common cases
@@ -961,9 +1050,14 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
         match(Op1, m_APInt(C2))) {
       bool Overflow;
       APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
-      if (!Overflow)
-        return BinaryOperator::CreateUDiv(
+      if (!Overflow) {
+        bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value()));
+        BinaryOperator *BO = BinaryOperator::CreateUDiv(
             X, ConstantInt::get(X->getType(), C2ShlC1));
+        if (IsExact)
+          BO->setIsExact();
+        return BO;
+      }
     }
   }
 
@@ -1014,7 +1108,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifySDivInst(Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifySDivInst(Op0, Op1, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   // Handle the integer div common cases
@@ -1041,10 +1135,12 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
       return new ZExtInst(Builder->CreateICmpEQ(Op0, Op1), I.getType());
 
     // -X/C  -->  X/-C  provided the negation doesn't overflow.
-    if (SubOperator *Sub = dyn_cast<SubOperator>(Op0))
-      if (match(Sub->getOperand(0), m_Zero()) && Sub->hasNoSignedWrap())
-        return BinaryOperator::CreateSDiv(Sub->getOperand(1),
-                                          ConstantExpr::getNeg(RHS));
+    Value *X;
+    if (match(Op0, m_NSWSub(m_Zero(), m_Value(X)))) {
+      auto *BO = BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(RHS));
+      BO->setIsExact(I.isExact());
+      return BO;
+    }
   }
 
   // If the sign bits of both operands are zero (i.e. we can prove they are
@@ -1054,15 +1150,19 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
     if (MaskedValueIsZero(Op0, Mask, 0, &I)) {
       if (MaskedValueIsZero(Op1, Mask, 0, &I)) {
         // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
-        return BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+        auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+        BO->setIsExact(I.isExact());
+        return BO;
       }
 
-      if (match(Op1, m_Shl(m_Power2(), m_Value()))) {
+      if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, AC, &I, DT)) {
         // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
         // Safe because the only negative value (1 << Y) can take on is
         // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
         // the sign bit set.
-        return BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+        auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+        BO->setIsExact(I.isExact());
+        return BO;
       }
     }
   }
@@ -1106,7 +1206,8 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyFDivInst(Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifyFDivInst(Op0, Op1, I.getFastMathFlags(),
+                                  DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   if (isa<Constant>(Op0))
@@ -1271,7 +1372,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyURemInst(Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifyURemInst(Op0, Op1, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   if (Instruction *common = commonIRemTransforms(I))
@@ -1284,7 +1385,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
                           I.getType());
 
   // X urem Y -> X and Y-1, where Y is a power of 2,
-  if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true, 0, AT, &I, DT)) {
+  if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, AC, &I, DT)) {
     Constant *N1 = Constant::getAllOnesValue(I.getType());
     Value *Add = Builder->CreateAdd(Op1, N1);
     return BinaryOperator::CreateAnd(Op0, Add);
@@ -1306,7 +1407,7 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifySRemInst(Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifySRemInst(Op0, Op1, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   // Handle the integer rem common cases
@@ -1381,7 +1482,8 @@ Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyFRemInst(Op0, Op1, DL, TLI, DT, AT))
+  if (Value *V = SimplifyFRemInst(Op0, Op1, I.getFastMathFlags(),
+                                  DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   // Handle cases involving: rem X, (select Cond, Y, Z)
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 794263a..0e73db8 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -788,7 +788,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
 // PHINode simplification
 //
 Instruction *InstCombiner::visitPHINode(PHINode &PN) {
-  if (Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AT))
+  if (Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(PN, V);
 
   // If all PHI operands are the same operation, pull them through the PHI,
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 079ae34..dd0e65f 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/PatternMatch.h"
@@ -314,8 +314,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
 static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
                                      const DataLayout *TD,
                                      const TargetLibraryInfo *TLI,
-                                     DominatorTree *DT,
-                                     AssumptionTracker *AT) {
+                                     DominatorTree *DT, AssumptionCache *AC) {
   // Trivial replacement.
   if (V == Op)
     return RepOp;
@@ -336,10 +335,10 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
   if (CmpInst *C = dyn_cast<CmpInst>(I)) {
     if (C->getOperand(0) == Op)
       return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), TD,
-                             TLI, DT, AT);
+                             TLI, DT, AC);
     if (C->getOperand(1) == Op)
       return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, TD,
-                             TLI, DT, AT);
+                             TLI, DT, AC);
   }
 
   // TODO: We could hand off more cases to instsimplify here.
@@ -389,15 +388,7 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
 /// 1. The icmp predicate is inverted
 /// 2. The select operands are reversed
 /// 3. The magnitude of C2 and C1 are flipped
-///
-/// This also tries to turn
-/// --- Single bit tests:
-/// if ((x & C) == 0) x |= C    to  x |= C
-/// if ((x & C) != 0) x ^= C    to  x &= ~C
-/// if ((x & C) == 0) x ^= C    to  x |= C
-/// if ((x & C) != 0) x &= ~C   to  x &= ~C
-/// if ((x & C) == 0) x &= ~C   to  nothing
-static Value *foldSelectICmpAndOr(SelectInst &SI, Value *TrueVal,
+static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
                                   Value *FalseVal,
                                   InstCombiner::BuilderTy *Builder) {
   const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
@@ -416,25 +407,6 @@ static Value *foldSelectICmpAndOr(SelectInst &SI, Value *TrueVal,
     return nullptr;
 
   const APInt *C2;
-  if (match(TrueVal, m_Specific(X))) {
-    // if ((X & C) != 0) X ^= C becomes X &= ~C
-    if (match(FalseVal, m_Xor(m_Specific(X), m_APInt(C2))) && C1 == C2)
-      return Builder->CreateAnd(X, ~(*C1));
-    // if ((X & C) != 0) X &= ~C becomes X &= ~C
-    if (match(FalseVal, m_And(m_Specific(X), m_APInt(C2))) && *C1 == ~(*C2))
-      return FalseVal;
-  } else if (match(FalseVal, m_Specific(X))) {
-    // if ((X & C) == 0) X ^= C becomes X |= C
-    if (match(TrueVal, m_Xor(m_Specific(X), m_APInt(C2))) && C1 == C2)
-      return Builder->CreateOr(X, *C1);
-    // if ((X & C) == 0) X &= ~C becomes nothing
-    if (match(TrueVal, m_And(m_Specific(X), m_APInt(C2))) && *C1 == ~(*C2))
-      return X;
-    // if ((X & C) == 0) X |= C becomes X |= C
-    if (match(TrueVal, m_Or(m_Specific(X), m_APInt(C2))) && C1 == C2)
-      return TrueVal;
-  }
-
   bool OrOnTrueVal = false;
   bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2)));
   if (!OrOnFalseVal)
@@ -465,6 +437,62 @@ static Value *foldSelectICmpAndOr(SelectInst &SI, Value *TrueVal,
   return Builder->CreateOr(V, Y);
 }
 
+/// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single
+/// call to cttz/ctlz with flag 'is_zero_undef' cleared.
+///
+/// For example, we can fold the following code sequence:
+/// \code
+///   %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+///   %1 = icmp ne i32 %x, 0
+///   %2 = select i1 %1, i32 %0, i32 32
+/// \code
+/// 
+/// into:
+///   %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
+                                  InstCombiner::BuilderTy *Builder) {
+  ICmpInst::Predicate Pred = ICI->getPredicate();
+  Value *CmpLHS = ICI->getOperand(0);
+  Value *CmpRHS = ICI->getOperand(1);
+
+  // Check if the condition value compares a value for equality against zero.
+  if (!ICI->isEquality() || !match(CmpRHS, m_Zero()))
+    return nullptr;
+
+  Value *Count = FalseVal;
+  Value *ValueOnZero = TrueVal;
+  if (Pred == ICmpInst::ICMP_NE)
+    std::swap(Count, ValueOnZero);
+
+  // Skip zero extend/truncate.
+  Value *V = nullptr;
+  if (match(Count, m_ZExt(m_Value(V))) ||
+      match(Count, m_Trunc(m_Value(V))))
+    Count = V;
+
+  // Check if the value propagated on zero is a constant number equal to the
+  // sizeof in bits of 'Count'.
+  unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits();
+  if (!match(ValueOnZero, m_SpecificInt(SizeOfInBits)))
+    return nullptr;
+
+  // Check that 'Count' is a call to intrinsic cttz/ctlz. Also check that the
+  // input to the cttz/ctlz is used as LHS for the compare instruction.
+  if (match(Count, m_Intrinsic<Intrinsic::cttz>(m_Specific(CmpLHS))) ||
+      match(Count, m_Intrinsic<Intrinsic::ctlz>(m_Specific(CmpLHS)))) {
+    IntrinsicInst *II = cast<IntrinsicInst>(Count);
+    IRBuilder<> Builder(II);
+    // Explicitly clear the 'undef_on_zero' flag.
+    IntrinsicInst *NewI = cast<IntrinsicInst>(II->clone());
+    Type *Ty = NewI->getArgOperand(1)->getType();
+    NewI->setArgOperand(1, Constant::getNullValue(Ty));
+    Builder.Insert(NewI);
+    return Builder.CreateZExtOrTrunc(NewI, ValueOnZero->getType());
+  }
+
+  return nullptr;
+}
+
 /// visitSelectInstWithICmp - Visit a SelectInst that has an
 /// ICmpInst as its first operand.
 ///
@@ -607,26 +635,26 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
   // arms of the select. See if substituting this value into the arm and
   // simplifying the result yields the same value as the other arm.
   if (Pred == ICmpInst::ICMP_EQ) {
-    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI,
-                               DT, AT) == TrueVal ||
-        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI,
-                               DT, AT) == TrueVal)
+    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) ==
+            TrueVal ||
+        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) ==
+            TrueVal)
       return ReplaceInstUsesWith(SI, FalseVal);
-    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI,
-                               DT, AT) == FalseVal ||
-        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI,
-                               DT, AT) == FalseVal)
+    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) ==
+            FalseVal ||
+        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) ==
+            FalseVal)
       return ReplaceInstUsesWith(SI, FalseVal);
   } else if (Pred == ICmpInst::ICMP_NE) {
-    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI,
-                               DT, AT) == FalseVal ||
-        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI,
-                               DT, AT) == FalseVal)
+    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) ==
+            FalseVal ||
+        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) ==
+            FalseVal)
       return ReplaceInstUsesWith(SI, TrueVal);
-    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI,
-                               DT, AT) == TrueVal ||
-        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI,
-                               DT, AT) == TrueVal)
+    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) ==
+            TrueVal ||
+        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) ==
+            TrueVal)
       return ReplaceInstUsesWith(SI, TrueVal);
   }
 
@@ -644,9 +672,58 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
     }
   }
 
+  if (unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits()) {
+    APInt MinSignedValue = APInt::getSignBit(BitWidth);
+    Value *X;
+    const APInt *Y, *C;
+    bool TrueWhenUnset;
+    bool IsBitTest = false;
+    if (ICmpInst::isEquality(Pred) &&
+        match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) &&
+        match(CmpRHS, m_Zero())) {
+      IsBitTest = true;
+      TrueWhenUnset = Pred == ICmpInst::ICMP_EQ;
+    } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) {
+      X = CmpLHS;
+      Y = &MinSignedValue;
+      IsBitTest = true;
+      TrueWhenUnset = false;
+    } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) {
+      X = CmpLHS;
+      Y = &MinSignedValue;
+      IsBitTest = true;
+      TrueWhenUnset = true;
+    }
+    if (IsBitTest) {
+      Value *V = nullptr;
+      // (X & Y) == 0 ? X : X ^ Y  --> X & ~Y
+      if (TrueWhenUnset && TrueVal == X &&
+          match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+        V = Builder->CreateAnd(X, ~(*Y));
+      // (X & Y) != 0 ? X ^ Y : X  --> X & ~Y
+      else if (!TrueWhenUnset && FalseVal == X &&
+               match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+        V = Builder->CreateAnd(X, ~(*Y));
+      // (X & Y) == 0 ? X ^ Y : X  --> X | Y
+      else if (TrueWhenUnset && FalseVal == X &&
+               match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+        V = Builder->CreateOr(X, *Y);
+      // (X & Y) != 0 ? X : X ^ Y  --> X | Y
+      else if (!TrueWhenUnset && TrueVal == X &&
+               match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+        V = Builder->CreateOr(X, *Y);
+
+      if (V)
+        return ReplaceInstUsesWith(SI, V);
+    }
+  }
+
   if (Value *V = foldSelectICmpAndOr(SI, TrueVal, FalseVal, Builder))
     return ReplaceInstUsesWith(SI, V);
 
+  if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder))
+    return ReplaceInstUsesWith(SI, V);
+
   return Changed ? &SI : nullptr;
 }
 
@@ -835,8 +912,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
   Value *TrueVal = SI.getTrueValue();
   Value *FalseVal = SI.getFalseValue();
 
-  if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal, DL, TLI,
-                                    DT, AT))
+  if (Value *V =
+          SimplifySelectInst(CondVal, TrueVal, FalseVal, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(SI, V);
 
   if (SI.getType()->isIntegerTy(1)) {
@@ -928,8 +1005,22 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
              !CFPf->getValueAPF().isZero()))
         return ReplaceInstUsesWith(SI, TrueVal);
       }
-      // NOTE: if we wanted to, this is where to detect MIN/MAX
 
+      // Canonicalize to use ordered comparisons by swapping the select
+      // operands.
+      //
+      // e.g.
+      // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X
+      if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
+        FCmpInst::Predicate InvPred = FCI->getInversePredicate();
+        Value *NewCond = Builder->CreateFCmp(InvPred, TrueVal, FalseVal,
+                                             FCI->getName() + ".inv");
+
+        return SelectInst::Create(NewCond, FalseVal, TrueVal,
+                                  SI.getName() + ".p");
+      }
+
+      // NOTE: if we wanted to, this is where to detect MIN/MAX
     } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){
       // Transform (X == Y) ? Y : X  -> X
       if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
@@ -955,6 +1046,21 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
              !CFPf->getValueAPF().isZero()))
           return ReplaceInstUsesWith(SI, TrueVal);
       }
+
+      // Canonicalize to use ordered comparisons by swapping the select
+      // operands.
+      //
+      // e.g.
+      // (X ugt Y) ? X : Y -> (X ole Y) ? X : Y
+      if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
+        FCmpInst::Predicate InvPred = FCI->getInversePredicate();
+        Value *NewCond = Builder->CreateFCmp(InvPred, FalseVal, TrueVal,
+                                             FCI->getName() + ".inv");
+
+        return SelectInst::Create(NewCond, FalseVal, TrueVal,
+                                  SI.getName() + ".p");
+      }
+
       // NOTE: if we wanted to, this is where to detect MIN/MAX
     }
     // NOTE: if we wanted to, this is where to detect ABS
@@ -1039,12 +1145,14 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
     if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal))
       return FoldI;
 
+    Value *LHS, *RHS, *LHS2, *RHS2;
+    SelectPatternFlavor SPF = MatchSelectPattern(&SI, LHS, RHS);
+
     // MAX(MAX(a, b), a) -> MAX(a, b)
     // MIN(MIN(a, b), a) -> MIN(a, b)
     // MAX(MIN(a, b), a) -> a
     // MIN(MAX(a, b), a) -> a
-    Value *LHS, *RHS, *LHS2, *RHS2;
-    if (SelectPatternFlavor SPF = MatchSelectPattern(&SI, LHS, RHS)) {
+    if (SPF) {
       if (SelectPatternFlavor SPF2 = MatchSelectPattern(LHS, LHS2, RHS2))
         if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2,
                                           SI, SPF, RHS))
@@ -1055,6 +1163,33 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
           return R;
     }
 
+    // MAX(~a, ~b) -> ~MIN(a, b)
+    if (SPF == SPF_SMAX || SPF == SPF_UMAX) {
+      if (IsFreeToInvert(LHS, LHS->hasNUses(2)) &&
+          IsFreeToInvert(RHS, RHS->hasNUses(2))) {
+
+        // This transform adds a xor operation and that extra cost needs to be
+        // justified.  We look for simplifications that will result from
+        // applying this rule:
+
+        bool Profitable =
+            (LHS->hasNUses(2) && match(LHS, m_Not(m_Value()))) ||
+            (RHS->hasNUses(2) && match(RHS, m_Not(m_Value()))) ||
+            (SI.hasOneUse() && match(*SI.user_begin(), m_Not(m_Value())));
+
+        if (Profitable) {
+          Value *NewLHS = Builder->CreateNot(LHS);
+          Value *NewRHS = Builder->CreateNot(RHS);
+          Value *NewCmp = SPF == SPF_SMAX
+                              ? Builder->CreateICmpSLT(NewLHS, NewRHS)
+                              : Builder->CreateICmpULT(NewLHS, NewRHS);
+          Value *NewSI =
+              Builder->CreateNot(Builder->CreateSelect(NewCmp, NewLHS, NewRHS));
+          return ReplaceInstUsesWith(SI, NewSI);
+        }
+      }
+    }
+
     // TODO.
     // ABS(-X) -> ABS(X)
   }
@@ -1068,20 +1203,38 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
         return NV;
 
   if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
+    // select(C, select(C, a, b), c) -> select(C, a, c)
     if (TrueSI->getCondition() == CondVal) {
       if (SI.getTrueValue() == TrueSI->getTrueValue())
         return nullptr;
       SI.setOperand(1, TrueSI->getTrueValue());
       return &SI;
     }
+    // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b)
+    // We choose this as normal form to enable folding on the And and shortening
+    // paths for the values (this helps GetUnderlyingObjects() for example).
+    if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) {
+      Value *And = Builder->CreateAnd(CondVal, TrueSI->getCondition());
+      SI.setOperand(0, And);
+      SI.setOperand(1, TrueSI->getTrueValue());
+      return &SI;
+    }
   }
   if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) {
+    // select(C, a, select(C, b, c)) -> select(C, a, c)
     if (FalseSI->getCondition() == CondVal) {
       if (SI.getFalseValue() == FalseSI->getFalseValue())
         return nullptr;
       SI.setOperand(2, FalseSI->getFalseValue());
       return &SI;
     }
+    // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b)
+    if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) {
+      Value *Or = Builder->CreateOr(CondVal, FalseSI->getCondition());
+      SI.setOperand(0, Or);
+      SI.setOperand(2, FalseSI->getFalseValue());
+      return &SI;
+    }
   }
 
   if (BinaryOperator::isNot(CondVal)) {
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index afa907a..b4976e0 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -693,9 +693,9 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1),
-                                 I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
-                                 DL, TLI, DT, AT))
+  if (Value *V =
+          SimplifyShlInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(),
+                          I.hasNoUnsignedWrap(), DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   if (Instruction *V = commonShiftTransforms(I))
@@ -735,8 +735,8 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1),
-                                  I.isExact(), DL, TLI, DT, AT))
+  if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
+                                  DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   if (Instruction *R = commonShiftTransforms(I))
@@ -779,8 +779,8 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1),
-                                  I.isExact(), DL, TLI, DT, AT))
+  if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
+                                  DL, TLI, DT, AC))
     return ReplaceInstUsesWith(I, V);
 
   if (Instruction *R = commonShiftTransforms(I))
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index ad6983a..c5603aa 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index cb16584..e07efb5 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -12,7 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/PatternMatch.h"
 using namespace llvm;
 using namespace PatternMatch;
@@ -853,10 +854,32 @@ static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask,
   }
 }
 
+// Returns true if the shuffle is extracting a contiguous range of values from
+// LHS, for example:
+//                 +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+//   Input:        |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
+//   Shuffles to:  |EE|FF|GG|HH|
+//                 +--+--+--+--+
+static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
+                                       SmallVector<int, 16> &Mask) {
+  unsigned LHSElems =
+      cast<VectorType>(SVI.getOperand(0)->getType())->getNumElements();
+  unsigned MaskElems = Mask.size();
+  unsigned BegIdx = Mask.front();
+  unsigned EndIdx = Mask.back();
+  if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
+    return false;
+  for (unsigned I = 0; I != MaskElems; ++I)
+    if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
+      return false;
+  return true;
+}
+
 Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   Value *LHS = SVI.getOperand(0);
   Value *RHS = SVI.getOperand(1);
   SmallVector<int, 16> Mask = SVI.getShuffleMask();
+  Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
 
   bool MadeChange = false;
 
@@ -892,18 +915,17 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
     SmallVector<Constant*, 16> Elts;
     for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) {
       if (Mask[i] < 0) {
-        Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
+        Elts.push_back(UndefValue::get(Int32Ty));
         continue;
       }
 
       if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) ||
           (Mask[i] <  (int)e && isa<UndefValue>(LHS))) {
         Mask[i] = -1;     // Turn into undef.
-        Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
+        Elts.push_back(UndefValue::get(Int32Ty));
       } else {
         Mask[i] = Mask[i] % e;  // Force to LHS.
-        Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()),
-                                        Mask[i]));
+        Elts.push_back(ConstantInt::get(Int32Ty, Mask[i]));
       }
     }
     SVI.setOperand(0, SVI.getOperand(1));
@@ -929,6 +951,96 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
     return ReplaceInstUsesWith(SVI, V);
   }
 
+  // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
+  // a non-vector type. We can instead bitcast the original vector followed by
+  // an extract of the desired element:
+  //
+  //   %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
+  //                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  //   %1 = bitcast <4 x i8> %sroa to i32
+  // Becomes:
+  //   %bc = bitcast <16 x i8> %in to <4 x i32>
+  //   %ext = extractelement <4 x i32> %bc, i32 0
+  //
+  // If the shuffle is extracting a contiguous range of values from the input
+  // vector then each use which is a bitcast of the extracted size can be
+  // replaced. This will work if the vector types are compatible, and the begin
+  // index is aligned to a value in the casted vector type. If the begin index
+  // isn't aligned then we can shuffle the original vector (keeping the same
+  // vector type) before extracting.
+  //
+  // This code will bail out if the target type is fundamentally incompatible
+  // with vectors of the source type.
+  //
+  // Example of <16 x i8>, target type i32:
+  // Index range [4,8):         v-----------v Will work.
+  //                +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+  //     <16 x i8>: |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
+  //     <4 x i32>: |           |           |           |           |
+  //                +-----------+-----------+-----------+-----------+
+  // Index range [6,10):              ^-----------^ Needs an extra shuffle.
+  // Target type i40:           ^--------------^ Won't work, bail.
+  if (isShuffleExtractingFromLHS(SVI, Mask)) {
+    Value *V = LHS;
+    unsigned MaskElems = Mask.size();
+    unsigned BegIdx = Mask.front();
+    VectorType *SrcTy = cast<VectorType>(V->getType());
+    unsigned VecBitWidth = SrcTy->getBitWidth();
+    unsigned SrcElemBitWidth =
+        SrcTy->getElementType()->getPrimitiveSizeInBits();
+    assert(SrcElemBitWidth && "vector elements must have a bitwidth");
+    unsigned SrcNumElems = SrcTy->getNumElements();
+    SmallVector<BitCastInst *, 8> BCs;
+    DenseMap<Type *, Value *> NewBCs;
+    for (User *U : SVI.users())
+      if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
+        if (!BC->use_empty())
+          // Only visit bitcasts that weren't previously handled.
+          BCs.push_back(BC);
+    for (BitCastInst *BC : BCs) {
+      Type *TgtTy = BC->getDestTy();
+      unsigned TgtElemBitWidth = TgtTy->getPrimitiveSizeInBits();
+      if (!TgtElemBitWidth)
+        continue;
+      unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
+      bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
+      bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
+      if (!VecBitWidthsEqual)
+        continue;
+      if (!VectorType::isValidElementType(TgtTy))
+        continue;
+      VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems);
+      if (!BegIsAligned) {
+        // Shuffle the input so [0,NumElements) contains the output, and
+        // [NumElems,SrcNumElems) is undef.
+        SmallVector<Constant *, 16> ShuffleMask(SrcNumElems,
+                                                UndefValue::get(Int32Ty));
+        for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
+          ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx);
+        V = Builder->CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                         ConstantVector::get(ShuffleMask),
+                                         SVI.getName() + ".extract");
+        BegIdx = 0;
+      }
+      unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
+      assert(SrcElemsPerTgtElem);
+      BegIdx /= SrcElemsPerTgtElem;
+      bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
+      auto *NewBC =
+          BCAlreadyExists
+              ? NewBCs[CastSrcTy]
+              : Builder->CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
+      if (!BCAlreadyExists)
+        NewBCs[CastSrcTy] = NewBC;
+      auto *Ext = Builder->CreateExtractElement(
+          NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
+      // The shufflevector isn't being replaced: the bitcast that used it
+      // is. InstCombine will visit the newly-created instructions.
+      ReplaceInstUsesWith(*BC, Ext);
+      MadeChange = true;
+    }
+  }
+
   // If the LHS is a shufflevector itself, see if we can combine it with this
   // one without producing an unusual shuffle.
   // Cases that might be simplified:
@@ -1099,7 +1211,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   // or is a splat, do the replacement.
   if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
     SmallVector<Constant*, 16> Elts;
-    Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
     for (unsigned i = 0, e = newMask.size(); i != e; ++i) {
       if (newMask[i] < 0) {
         Elts.push_back(UndefValue::get(Int32Ty));
diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h
deleted file mode 100644
index 8d857d0..0000000
--- a/lib/Transforms/InstCombine/InstCombineWorklist.h
+++ /dev/null
@@ -1,107 +0,0 @@
-//===- InstCombineWorklist.h - Worklist for InstCombine pass ----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H
-#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-#define DEBUG_TYPE "instcombine"
-
-namespace llvm {
-
-/// InstCombineWorklist - This is the worklist management logic for
-/// InstCombine.
-class LLVM_LIBRARY_VISIBILITY InstCombineWorklist {
-  SmallVector<Instruction*, 256> Worklist;
-  DenseMap<Instruction*, unsigned> WorklistMap;
-
-  void operator=(const InstCombineWorklist&RHS) LLVM_DELETED_FUNCTION;
-  InstCombineWorklist(const InstCombineWorklist&) LLVM_DELETED_FUNCTION;
-public:
-  InstCombineWorklist() {}
-
-  bool isEmpty() const { return Worklist.empty(); }
-
-  /// Add - Add the specified instruction to the worklist if it isn't already
-  /// in it.
-  void Add(Instruction *I) {
-    if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) {
-      DEBUG(dbgs() << "IC: ADD: " << *I << '\n');
-      Worklist.push_back(I);
-    }
-  }
-
-  void AddValue(Value *V) {
-    if (Instruction *I = dyn_cast<Instruction>(V))
-      Add(I);
-  }
-
-  /// AddInitialGroup - Add the specified batch of stuff in reverse order.
-  /// which should only be done when the worklist is empty and when the group
-  /// has no duplicates.
-  void AddInitialGroup(Instruction *const *List, unsigned NumEntries) {
-    assert(Worklist.empty() && "Worklist must be empty to add initial group");
-    Worklist.reserve(NumEntries+16);
-    WorklistMap.resize(NumEntries);
-    DEBUG(dbgs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n");
-    for (unsigned Idx = 0; NumEntries; --NumEntries) {
-      Instruction *I = List[NumEntries-1];
-      WorklistMap.insert(std::make_pair(I, Idx++));
-      Worklist.push_back(I);
-    }
-  }
-
-  // Remove - remove I from the worklist if it exists.
-  void Remove(Instruction *I) {
-    DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I);
-    if (It == WorklistMap.end()) return; // Not in worklist.
-
-    // Don't bother moving everything down, just null out the slot.
-    Worklist[It->second] = nullptr;
-
-    WorklistMap.erase(It);
-  }
-
-  Instruction *RemoveOne() {
-    Instruction *I = Worklist.pop_back_val();
-    WorklistMap.erase(I);
-    return I;
-  }
-
-  /// AddUsersToWorkList - When an instruction is simplified, add all users of
-  /// the instruction to the work lists because they might get more simplified
-  /// now.
-  ///
-  void AddUsersToWorkList(Instruction &I) {
-    for (User *U : I.users())
-      Add(cast<Instruction>(U));
-  }
-
-
-  /// Zap - check that the worklist is empty and nuke the backing store for
-  /// the map if it is large.
-  void Zap() {
-    assert(WorklistMap.empty() && "Worklist empty, but map not?");
-
-    // Do an explicit clear, this shrinks the map if needed.
-    WorklistMap.clear();
-  }
-};
-
-} // end namespace llvm.
-
-#undef DEBUG_TYPE
-
-#endif
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index e4a4fef..88fcd53 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -33,18 +33,20 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Scalar.h"
-#include "InstCombine.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "InstCombineInternal.h"
 #include "llvm-c/Initialization.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LibCallSemantics.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
@@ -55,7 +57,7 @@
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 #include <climits>
@@ -72,30 +74,6 @@ STATISTIC(NumExpand,    "Number of expansions");
 STATISTIC(NumFactor   , "Number of factorizations");
 STATISTIC(NumReassoc  , "Number of reassociations");
 
-// Initialization Routines
-void llvm::initializeInstCombine(PassRegistry &Registry) {
-  initializeInstCombinerPass(Registry);
-}
-
-void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
-  initializeInstCombine(*unwrap(R));
-}
-
-char InstCombiner::ID = 0;
-INITIALIZE_PASS_BEGIN(InstCombiner, "instcombine",
-                "Combine redundant instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
-INITIALIZE_PASS_END(InstCombiner, "instcombine",
-                "Combine redundant instructions", false, false)
-
-void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.setPreservesCFG();
-  AU.addRequired<AssumptionTracker>();
-  AU.addRequired<TargetLibraryInfo>();
-}
-
-
 Value *InstCombiner::EmitGEPOffset(User *GEP) {
   return llvm::EmitGEPOffset(Builder, *getDataLayout(), GEP);
 }
@@ -796,8 +774,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
     // If the incoming non-constant value is in I's block, we will remove one
     // instruction, but insert another equivalent one, leading to infinite
     // instcombine.
-    if (isPotentiallyReachable(I.getParent(), NonConstBB, DT,
-                               getAnalysisIfAvailable<LoopInfo>()))
+    if (isPotentiallyReachable(I.getParent(), NonConstBB, DT, LI))
       return nullptr;
   }
 
@@ -1316,7 +1293,7 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
 Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
   SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
 
-  if (Value *V = SimplifyGEPInst(Ops, DL, TLI, DT, AT))
+  if (Value *V = SimplifyGEPInst(Ops, DL, TLI, DT, AC))
     return ReplaceInstUsesWith(GEP, V);
 
   Value *PtrOp = GEP.getOperand(0);
@@ -1414,8 +1391,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
     if (DI == -1) {
       // All the GEPs feeding the PHI are identical. Clone one down into our
       // BB so that it can be merged with the current GEP.
-      GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(),
-                                            NewGEP);
+      GEP.getParent()->getInstList().insert(
+          GEP.getParent()->getFirstInsertionPt(), NewGEP);
     } else {
       // All the GEPs feeding the PHI differ at a single offset. Clone a GEP
       // into the current block so it can be merged, and create a new PHI to
@@ -1431,8 +1408,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
                            PN->getIncomingBlock(I));
 
       NewGEP->setOperand(DI, NewPN);
-      GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(),
-                                            NewGEP);
+      GEP.getParent()->getInstList().insert(
+          GEP.getParent()->getFirstInsertionPt(), NewGEP);
       NewGEP->setOperand(DI, NewPN);
     }
 
@@ -2092,7 +2069,10 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
   // the largest legal integer type. We need to be conservative here since
   // x86 generates redundant zero-extenstion instructions if the operand is
   // truncated to i8 or i16.
-  if (BitWidth > NewWidth && NewWidth >= DL->getLargestLegalIntTypeSize()) {
+  bool TruncCond = false;
+  if (DL && BitWidth > NewWidth &&
+      NewWidth >= DL->getLargestLegalIntTypeSize()) {
+    TruncCond = true;
     IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
     Builder->SetInsertPoint(&SI);
     Value *NewCond = Builder->CreateTrunc(SI.getCondition(), Ty, "trunc");
@@ -2111,8 +2091,12 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
         for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end();
              i != e; ++i) {
           ConstantInt* CaseVal = i.getCaseValue();
-          Constant* NewCaseVal = ConstantExpr::getSub(cast<Constant>(CaseVal),
-                                                      AddRHS);
+          Constant *LHS = CaseVal;
+          if (TruncCond)
+            LHS = LeadingKnownZeros
+                      ? ConstantExpr::getZExt(CaseVal, Cond->getType())
+                      : ConstantExpr::getSExt(CaseVal, Cond->getType());
+          Constant* NewCaseVal = ConstantExpr::getSub(LHS, AddRHS);
           assert(isa<ConstantInt>(NewCaseVal) &&
                  "Result of expression should be constant");
           i.setValue(cast<ConstantInt>(NewCaseVal));
@@ -2122,7 +2106,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
         return &SI;
       }
   }
-  return nullptr;
+
+  return TruncCond ? &SI : nullptr;
 }
 
 Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
@@ -2275,41 +2260,27 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
   return nullptr;
 }
 
-enum Personality_Type {
-  Unknown_Personality,
-  GNU_Ada_Personality,
-  GNU_CXX_Personality,
-  GNU_ObjC_Personality
-};
-
-/// RecognizePersonality - See if the given exception handling personality
-/// function is one that we understand.  If so, return a description of it;
-/// otherwise return Unknown_Personality.
-static Personality_Type RecognizePersonality(Value *Pers) {
-  Function *F = dyn_cast<Function>(Pers->stripPointerCasts());
-  if (!F)
-    return Unknown_Personality;
-  return StringSwitch<Personality_Type>(F->getName())
-    .Case("__gnat_eh_personality", GNU_Ada_Personality)
-    .Case("__gxx_personality_v0",  GNU_CXX_Personality)
-    .Case("__objc_personality_v0", GNU_ObjC_Personality)
-    .Default(Unknown_Personality);
-}
-
 /// isCatchAll - Return 'true' if the given typeinfo will match anything.
-static bool isCatchAll(Personality_Type Personality, Constant *TypeInfo) {
+static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
   switch (Personality) {
-  case Unknown_Personality:
+  case EHPersonality::GNU_C:
+    // The GCC C EH personality only exists to support cleanups, so it's not
+    // clear what the semantics of catch clauses are.
     return false;
-  case GNU_Ada_Personality:
+  case EHPersonality::Unknown:
+    return false;
+  case EHPersonality::GNU_Ada:
     // While __gnat_all_others_value will match any Ada exception, it doesn't
     // match foreign exceptions (or didn't, before gcc-4.7).
     return false;
-  case GNU_CXX_Personality:
-  case GNU_ObjC_Personality:
+  case EHPersonality::GNU_CXX:
+  case EHPersonality::GNU_ObjC:
+  case EHPersonality::MSVC_X86SEH:
+  case EHPersonality::MSVC_Win64SEH:
+  case EHPersonality::MSVC_CXX:
     return TypeInfo->isNullValue();
   }
-  llvm_unreachable("Unknown personality!");
+  llvm_unreachable("invalid enum");
 }
 
 static bool shorter_filter(const Value *LHS, const Value *RHS) {
@@ -2323,7 +2294,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
   // The logic here should be correct for any real-world personality function.
   // However if that turns out not to be true, the offending logic can always
   // be conditioned on the personality function, like the catch-all logic is.
-  Personality_Type Personality = RecognizePersonality(LI.getPersonalityFn());
+  EHPersonality Personality = classifyEHPersonality(LI.getPersonalityFn());
 
   // Simplify the list of clauses, eg by removing repeated catch clauses
   // (these are often created by inlining).
@@ -2614,9 +2585,6 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
   return nullptr;
 }
 
-
-
-
 /// TryToSinkInstruction - Try to move the specified instruction from its
 /// current block into the beginning of DestBlock, which can only happen if it's
 /// safe to move the instruction past all of the instructions between it and the
@@ -2649,6 +2617,135 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
   return true;
 }
 
+bool InstCombiner::run() {
+  while (!Worklist.isEmpty()) {
+    Instruction *I = Worklist.RemoveOne();
+    if (I == nullptr) continue;  // skip null values.
+
+    // Check to see if we can DCE the instruction.
+    if (isInstructionTriviallyDead(I, TLI)) {
+      DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
+      EraseInstFromFunction(*I);
+      ++NumDeadInst;
+      MadeIRChange = true;
+      continue;
+    }
+
+    // Instruction isn't dead, see if we can constant propagate it.
+    if (!I->use_empty() && isa<Constant>(I->getOperand(0)))
+      if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
+        DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
+
+        // Add operands to the worklist.
+        ReplaceInstUsesWith(*I, C);
+        ++NumConstProp;
+        EraseInstFromFunction(*I);
+        MadeIRChange = true;
+        continue;
+      }
+
+    // See if we can trivially sink this instruction to a successor basic block.
+    if (I->hasOneUse()) {
+      BasicBlock *BB = I->getParent();
+      Instruction *UserInst = cast<Instruction>(*I->user_begin());
+      BasicBlock *UserParent;
+
+      // Get the block the use occurs in.
+      if (PHINode *PN = dyn_cast<PHINode>(UserInst))
+        UserParent = PN->getIncomingBlock(*I->use_begin());
+      else
+        UserParent = UserInst->getParent();
+
+      if (UserParent != BB) {
+        bool UserIsSuccessor = false;
+        // See if the user is one of our successors.
+        for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+          if (*SI == UserParent) {
+            UserIsSuccessor = true;
+            break;
+          }
+
+        // If the user is one of our immediate successors, and if that successor
+        // only has us as a predecessors (we'd have to split the critical edge
+        // otherwise), we can keep going.
+        if (UserIsSuccessor && UserParent->getSinglePredecessor()) {
+          // Okay, the CFG is simple enough, try to sink this instruction.
+          if (TryToSinkInstruction(I, UserParent)) {
+            MadeIRChange = true;
+            // We'll add uses of the sunk instruction below, but since sinking
+            // can expose opportunities for it's *operands* add them to the
+            // worklist
+            for (Use &U : I->operands())
+              if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
+                Worklist.Add(OpI);
+          }
+        }
+      }
+    }
+
+    // Now that we have an instruction, try combining it to simplify it.
+    Builder->SetInsertPoint(I->getParent(), I);
+    Builder->SetCurrentDebugLocation(I->getDebugLoc());
+
+#ifndef NDEBUG
+    std::string OrigI;
+#endif
+    DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
+    DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
+
+    if (Instruction *Result = visit(*I)) {
+      ++NumCombined;
+      // Should we replace the old instruction with a new one?
+      if (Result != I) {
+        DEBUG(dbgs() << "IC: Old = " << *I << '\n'
+                     << "    New = " << *Result << '\n');
+
+        if (!I->getDebugLoc().isUnknown())
+          Result->setDebugLoc(I->getDebugLoc());
+        // Everything uses the new instruction now.
+        I->replaceAllUsesWith(Result);
+
+        // Move the name to the new instruction first.
+        Result->takeName(I);
+
+        // Push the new instruction and any users onto the worklist.
+        Worklist.Add(Result);
+        Worklist.AddUsersToWorkList(*Result);
+
+        // Insert the new instruction into the basic block...
+        BasicBlock *InstParent = I->getParent();
+        BasicBlock::iterator InsertPos = I;
+
+        // If we replace a PHI with something that isn't a PHI, fix up the
+        // insertion point.
+        if (!isa<PHINode>(Result) && isa<PHINode>(InsertPos))
+          InsertPos = InstParent->getFirstInsertionPt();
+
+        InstParent->getInstList().insert(InsertPos, Result);
+
+        EraseInstFromFunction(*I);
+      } else {
+#ifndef NDEBUG
+        DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
+                     << "    New = " << *I << '\n');
+#endif
+
+        // If the instruction was modified, it's possible that it is now dead.
+        // if so, remove it.
+        if (isInstructionTriviallyDead(I, TLI)) {
+          EraseInstFromFunction(*I);
+        } else {
+          Worklist.Add(I);
+          Worklist.AddUsersToWorkList(*I);
+        }
+      }
+      MadeIRChange = true;
+    }
+  }
+
+  Worklist.Zap();
+  return MadeIRChange;
+}
 
 /// AddReachableCodeToWorklist - Walk the function in depth-first order, adding
 /// all reachable code to the worklist.
@@ -2661,7 +2758,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
 ///
 static bool AddReachableCodeToWorklist(BasicBlock *BB,
                                        SmallPtrSetImpl<BasicBlock*> &Visited,
-                                       InstCombiner &IC,
+                                       InstCombineWorklist &ICWorklist,
                                        const DataLayout *DL,
                                        const TargetLibraryInfo *TLI) {
   bool MadeIRChange = false;
@@ -2759,244 +2856,183 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
   // of the function down.  This jives well with the way that it adds all uses
   // of instructions to the worklist after doing a transformation, thus avoiding
   // some N^2 behavior in pathological cases.
-  IC.Worklist.AddInitialGroup(&InstrsForInstCombineWorklist[0],
-                              InstrsForInstCombineWorklist.size());
+  ICWorklist.AddInitialGroup(&InstrsForInstCombineWorklist[0],
+                             InstrsForInstCombineWorklist.size());
 
   return MadeIRChange;
 }
 
-bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
-  MadeIRChange = false;
-
-  DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
-               << F.getName() << "\n");
-
-  {
-    // Do a depth-first traversal of the function, populate the worklist with
-    // the reachable instructions.  Ignore blocks that are not reachable.  Keep
-    // track of which blocks we visit.
-    SmallPtrSet<BasicBlock*, 64> Visited;
-    MadeIRChange |= AddReachableCodeToWorklist(F.begin(), Visited, *this, DL,
-                                               TLI);
-
-    // Do a quick scan over the function.  If we find any blocks that are
-    // unreachable, remove any instructions inside of them.  This prevents
-    // the instcombine code from having to deal with some bad special cases.
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      if (Visited.count(BB)) continue;
-
-      // Delete the instructions backwards, as it has a reduced likelihood of
-      // having to update as many def-use and use-def chains.
-      Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
-      while (EndInst != BB->begin()) {
-        // Delete the next to last instruction.
-        BasicBlock::iterator I = EndInst;
-        Instruction *Inst = --I;
-        if (!Inst->use_empty())
-          Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
-        if (isa<LandingPadInst>(Inst)) {
-          EndInst = Inst;
-          continue;
-        }
-        if (!isa<DbgInfoIntrinsic>(Inst)) {
-          ++NumDeadInst;
-          MadeIRChange = true;
-        }
-        Inst->eraseFromParent();
-      }
-    }
-  }
-
-  while (!Worklist.isEmpty()) {
-    Instruction *I = Worklist.RemoveOne();
-    if (I == nullptr) continue;  // skip null values.
+/// \brief Populate the IC worklist from a function, and prune any dead basic
+/// blocks discovered in the process.
+///
+/// This also does basic constant propagation and other forward fixing to make
+/// the combiner itself run much faster.
+static bool prepareICWorklistFromFunction(Function &F, const DataLayout *DL,
+                                          TargetLibraryInfo *TLI,
+                                          InstCombineWorklist &ICWorklist) {
+  bool MadeIRChange = false;
 
-    // Check to see if we can DCE the instruction.
-    if (isInstructionTriviallyDead(I, TLI)) {
-      DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
-      EraseInstFromFunction(*I);
-      ++NumDeadInst;
-      MadeIRChange = true;
+  // Do a depth-first traversal of the function, populate the worklist with
+  // the reachable instructions.  Ignore blocks that are not reachable.  Keep
+  // track of which blocks we visit.
+  SmallPtrSet<BasicBlock *, 64> Visited;
+  MadeIRChange |=
+      AddReachableCodeToWorklist(F.begin(), Visited, ICWorklist, DL, TLI);
+
+  // Do a quick scan over the function.  If we find any blocks that are
+  // unreachable, remove any instructions inside of them.  This prevents
+  // the instcombine code from having to deal with some bad special cases.
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (Visited.count(BB))
       continue;
-    }
 
-    // Instruction isn't dead, see if we can constant propagate it.
-    if (!I->use_empty() && isa<Constant>(I->getOperand(0)))
-      if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
-        DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
-
-        // Add operands to the worklist.
-        ReplaceInstUsesWith(*I, C);
-        ++NumConstProp;
-        EraseInstFromFunction(*I);
-        MadeIRChange = true;
+    // Delete the instructions backwards, as it has a reduced likelihood of
+    // having to update as many def-use and use-def chains.
+    Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
+    while (EndInst != BB->begin()) {
+      // Delete the next to last instruction.
+      BasicBlock::iterator I = EndInst;
+      Instruction *Inst = --I;
+      if (!Inst->use_empty())
+        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+      if (isa<LandingPadInst>(Inst)) {
+        EndInst = Inst;
         continue;
       }
-
-    // See if we can trivially sink this instruction to a successor basic block.
-    if (I->hasOneUse()) {
-      BasicBlock *BB = I->getParent();
-      Instruction *UserInst = cast<Instruction>(*I->user_begin());
-      BasicBlock *UserParent;
-
-      // Get the block the use occurs in.
-      if (PHINode *PN = dyn_cast<PHINode>(UserInst))
-        UserParent = PN->getIncomingBlock(*I->use_begin());
-      else
-        UserParent = UserInst->getParent();
-
-      if (UserParent != BB) {
-        bool UserIsSuccessor = false;
-        // See if the user is one of our successors.
-        for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
-          if (*SI == UserParent) {
-            UserIsSuccessor = true;
-            break;
-          }
-
-        // If the user is one of our immediate successors, and if that successor
-        // only has us as a predecessors (we'd have to split the critical edge
-        // otherwise), we can keep going.
-        if (UserIsSuccessor && UserParent->getSinglePredecessor()) {
-          // Okay, the CFG is simple enough, try to sink this instruction.
-          if (TryToSinkInstruction(I, UserParent)) {
-            MadeIRChange = true;
-            // We'll add uses of the sunk instruction below, but since sinking
-            // can expose opportunities for it's *operands* add them to the
-            // worklist
-            for (Use &U : I->operands())
-              if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
-                Worklist.Add(OpI);
-          }
-        }
+      if (!isa<DbgInfoIntrinsic>(Inst)) {
+        ++NumDeadInst;
+        MadeIRChange = true;
       }
+      Inst->eraseFromParent();
     }
+  }
 
-    // Now that we have an instruction, try combining it to simplify it.
-    Builder->SetInsertPoint(I->getParent(), I);
-    Builder->SetCurrentDebugLocation(I->getDebugLoc());
+  return MadeIRChange;
+}
 
-#ifndef NDEBUG
-    std::string OrigI;
-#endif
-    DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
-    DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
+static bool combineInstructionsOverFunction(
+    Function &F, InstCombineWorklist &Worklist, AssumptionCache &AC,
+    TargetLibraryInfo &TLI, DominatorTree &DT, const DataLayout *DL = nullptr,
+    LoopInfo *LI = nullptr) {
+  // Minimizing size?
+  bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize);
 
-    if (Instruction *Result = visit(*I)) {
-      ++NumCombined;
-      // Should we replace the old instruction with a new one?
-      if (Result != I) {
-        DEBUG(dbgs() << "IC: Old = " << *I << '\n'
-                     << "    New = " << *Result << '\n');
+  /// Builder - This is an IRBuilder that automatically inserts new
+  /// instructions into the worklist when they are created.
+  IRBuilder<true, TargetFolder, InstCombineIRInserter> Builder(
+      F.getContext(), TargetFolder(DL), InstCombineIRInserter(Worklist, &AC));
 
-        if (!I->getDebugLoc().isUnknown())
-          Result->setDebugLoc(I->getDebugLoc());
-        // Everything uses the new instruction now.
-        I->replaceAllUsesWith(Result);
+  // Lower dbg.declare intrinsics otherwise their value may be clobbered
+  // by instcombiner.
+  bool DbgDeclaresChanged = LowerDbgDeclare(F);
 
-        // Move the name to the new instruction first.
-        Result->takeName(I);
+  // Iterate while there is work to do.
+  int Iteration = 0;
+  for (;;) {
+    ++Iteration;
+    DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+                 << F.getName() << "\n");
 
-        // Push the new instruction and any users onto the worklist.
-        Worklist.Add(Result);
-        Worklist.AddUsersToWorkList(*Result);
+    bool Changed = false;
+    if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist))
+      Changed = true;
 
-        // Insert the new instruction into the basic block...
-        BasicBlock *InstParent = I->getParent();
-        BasicBlock::iterator InsertPos = I;
+    InstCombiner IC(Worklist, &Builder, MinimizeSize, &AC, &TLI, &DT, DL, LI);
+    if (IC.run())
+      Changed = true;
 
-        // If we replace a PHI with something that isn't a PHI, fix up the
-        // insertion point.
-        if (!isa<PHINode>(Result) && isa<PHINode>(InsertPos))
-          InsertPos = InstParent->getFirstInsertionPt();
+    if (!Changed)
+      break;
+  }
 
-        InstParent->getInstList().insert(InsertPos, Result);
+  return DbgDeclaresChanged || Iteration > 1;
+}
 
-        EraseInstFromFunction(*I);
-      } else {
-#ifndef NDEBUG
-        DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
-                     << "    New = " << *I << '\n');
-#endif
+PreservedAnalyses InstCombinePass::run(Function &F,
+                                       AnalysisManager<Function> *AM) {
+  auto *DL = F.getParent()->getDataLayout();
 
-        // If the instruction was modified, it's possible that it is now dead.
-        // if so, remove it.
-        if (isInstructionTriviallyDead(I, TLI)) {
-          EraseInstFromFunction(*I);
-        } else {
-          Worklist.Add(I);
-          Worklist.AddUsersToWorkList(*I);
-        }
-      }
-      MadeIRChange = true;
-    }
-  }
+  auto &AC = AM->getResult<AssumptionAnalysis>(F);
+  auto &DT = AM->getResult<DominatorTreeAnalysis>(F);
+  auto &TLI = AM->getResult<TargetLibraryAnalysis>(F);
 
-  Worklist.Zap();
-  return MadeIRChange;
+  auto *LI = AM->getCachedResult<LoopAnalysis>(F);
+
+  if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, DL, LI))
+    // No changes, all analyses are preserved.
+    return PreservedAnalyses::all();
+
+  // Mark all the analyses that instcombine updates as preserved.
+  // FIXME: Need a way to preserve CFG analyses here!
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  return PA;
 }
 
 namespace {
-class InstCombinerLibCallSimplifier final : public LibCallSimplifier {
-  InstCombiner *IC;
+/// \brief The legacy pass manager's instcombine pass.
+///
+/// This is a basic whole-function wrapper around the instcombine utility. It
+/// will try to combine all instructions in the function.
+class InstructionCombiningPass : public FunctionPass {
+  InstCombineWorklist Worklist;
+
 public:
-  InstCombinerLibCallSimplifier(const DataLayout *DL,
-                                const TargetLibraryInfo *TLI,
-                                InstCombiner *IC)
-    : LibCallSimplifier(DL, TLI) {
-    this->IC = IC;
-  }
+  static char ID; // Pass identification, replacement for typeid
 
-  /// replaceAllUsesWith - override so that instruction replacement
-  /// can be defined in terms of the instruction combiner framework.
-  void replaceAllUsesWith(Instruction *I, Value *With) const override {
-    IC->ReplaceInstUsesWith(*I, With);
+  InstructionCombiningPass() : FunctionPass(ID) {
+    initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
   }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnFunction(Function &F) override;
 };
 }
 
-bool InstCombiner::runOnFunction(Function &F) {
+void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<AssumptionCacheTracker>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+bool InstructionCombiningPass::runOnFunction(Function &F) {
   if (skipOptnoneFunction(F))
     return false;
 
-  AT = &getAnalysis<AssumptionTracker>();
-  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  // Required analyses.
+  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
-  DominatorTreeWrapperPass *DTWP =
-      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-  DT = DTWP ? &DTWP->getDomTree() : nullptr;
-
-  // Minimizing size?
-  MinimizeSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                                Attribute::MinSize);
-
-  /// Builder - This is an IRBuilder that automatically inserts new
-  /// instructions into the worklist when they are created.
-  IRBuilder<true, TargetFolder, InstCombineIRInserter>
-    TheBuilder(F.getContext(), TargetFolder(DL),
-               InstCombineIRInserter(Worklist, AT));
-  Builder = &TheBuilder;
+  // Optional analyses.
+  auto *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  auto *DL = DLP ? &DLP->getDataLayout() : nullptr;
+  auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+  auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
 
-  InstCombinerLibCallSimplifier TheSimplifier(DL, TLI, this);
-  Simplifier = &TheSimplifier;
+  return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, DL, LI);
+}
 
-  bool EverMadeChange = false;
+char InstructionCombiningPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",
+                      "Combine redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
+                    "Combine redundant instructions", false, false)
 
-  // Lower dbg.declare intrinsics otherwise their value may be clobbered
-  // by instcombiner.
-  EverMadeChange = LowerDbgDeclare(F);
-
-  // Iterate while there is work to do.
-  unsigned Iteration = 0;
-  while (DoOneIteration(F, Iteration++))
-    EverMadeChange = true;
+// Initialization Routines
+void llvm::initializeInstCombine(PassRegistry &Registry) {
+  initializeInstructionCombiningPassPass(Registry);
+}
 
-  Builder = nullptr;
-  return EverMadeChange;
+void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
+  initializeInstructionCombiningPassPass(*unwrap(R));
 }
 
 FunctionPass *llvm::createInstructionCombiningPass() {
-  return new InstCombiner();
+  return new InstructionCombiningPass();
 }
diff --git a/lib/Transforms/InstCombine/LLVMBuild.txt b/lib/Transforms/InstCombine/LLVMBuild.txt
index 62c6161..c26e0e3 100644
--- a/lib/Transforms/InstCombine/LLVMBuild.txt
+++ b/lib/Transforms/InstCombine/LLVMBuild.txt
@@ -19,4 +19,4 @@
 type = Library
 name = InstCombine
 parent = Transforms
-required_libraries = Analysis Core Support Target TransformUtils
+required_libraries = Analysis Core Support TransformUtils
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 38f587f..882aab0 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -27,6 +27,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
@@ -36,10 +37,12 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
+#include "llvm/MC/MCSectionMachO.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/SwapByteOrder.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -61,9 +64,11 @@ static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
 static const uint64_t kSmallX86_64ShadowOffset = 0x7FFF8000;  // < 2G.
 static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41;
 static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000;
-static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 36;
+static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
+static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
 static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
 static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
 
 static const size_t kMinStackMallocSize = 1 << 6;  // 64B
 static const size_t kMaxStackMallocSize = 1 << 16;  // 64K
@@ -81,7 +86,7 @@ static const char *const kAsanUnregisterGlobalsName =
     "__asan_unregister_globals";
 static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
 static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
-static const char *const kAsanInitName = "__asan_init_v4";
+static const char *const kAsanInitName = "__asan_init_v5";
 static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp";
 static const char *const kAsanPtrSub = "__sanitizer_ptr_sub";
 static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
@@ -105,6 +110,12 @@ static const int kAsanStackAfterReturnMagic = 0xf5;
 // Accesses sizes are powers of two: 1, 2, 4, 8, 16.
 static const size_t kNumberOfAccessSizes = 5;
 
+static const unsigned kAllocaRzSize = 32;
+static const unsigned kAsanAllocaLeftMagic = 0xcacacacaU;
+static const unsigned kAsanAllocaRightMagic = 0xcbcbcbcbU;
+static const unsigned kAsanAllocaPartialVal1 = 0xcbcbcb00U;
+static const unsigned kAsanAllocaPartialVal2 = 0x000000cbU;
+
 // Command-line flags.
 
 // This flag may need to be replaced with -f[no-]asan-reads.
@@ -152,19 +163,8 @@ static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
        "asan-memory-access-callback-prefix",
        cl::desc("Prefix for memory access callbacks"), cl::Hidden,
        cl::init("__asan_"));
-
-// This is an experimental feature that will allow to choose between
-// instrumented and non-instrumented code at link-time.
-// If this option is on, just before instrumenting a function we create its
-// clone; if the function is not changed by asan the clone is deleted.
-// If we end up with a clone, we put the instrumented function into a section
-// called "ASAN" and the uninstrumented function into a section called "NOASAN".
-//
-// This is still a prototype, we need to figure out a way to keep two copies of
-// a function so that the linker can easily choose one of them.
-static cl::opt<bool> ClKeepUninstrumented("asan-keep-uninstrumented-functions",
-       cl::desc("Keep uninstrumented copies of functions"),
-       cl::Hidden, cl::init(false));
+static cl::opt<bool> ClInstrumentAllocas("asan-instrument-allocas",
+       cl::desc("instrument dynamic allocas"), cl::Hidden, cl::init(false));
 
 // These flags allow to change the shadow mapping.
 // The shadow mapping looks like
@@ -186,6 +186,11 @@ static cl::opt<bool> ClCheckLifetime("asan-check-lifetime",
        cl::desc("Use llvm.lifetime intrinsics to insert extra checks"),
        cl::Hidden, cl::init(false));
 
+static cl::opt<bool> ClDynamicAllocaStack(
+    "asan-stack-dynamic-alloca",
+    cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden,
+    cl::init(true));
+
 // Debug flags.
 static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
                             cl::init(0));
@@ -200,6 +205,8 @@ static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"),
 
 STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
 STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumInstrumentedDynamicAllocas,
+          "Number of instrumented dynamic allocas");
 STATISTIC(NumOptimizedAccessesToGlobalArray,
           "Number of optimized accesses to global arrays");
 STATISTIC(NumOptimizedAccessesToGlobalVar,
@@ -220,8 +227,10 @@ struct LocationMetadata {
     assert(MDN->getNumOperands() == 3);
     MDString *MDFilename = cast<MDString>(MDN->getOperand(0));
     Filename = MDFilename->getString();
-    LineNo = cast<ConstantInt>(MDN->getOperand(1))->getLimitedValue();
-    ColumnNo = cast<ConstantInt>(MDN->getOperand(2))->getLimitedValue();
+    LineNo =
+        mdconst::extract<ConstantInt>(MDN->getOperand(1))->getLimitedValue();
+    ColumnNo =
+        mdconst::extract<ConstantInt>(MDN->getOperand(2))->getLimitedValue();
   }
 };
 
@@ -249,23 +258,22 @@ class GlobalsMetadata {
     for (auto MDN : Globals->operands()) {
       // Metadata node contains the global and the fields of "Entry".
       assert(MDN->getNumOperands() == 5);
-      Value *V = MDN->getOperand(0);
+      auto *GV = mdconst::extract_or_null<GlobalVariable>(MDN->getOperand(0));
       // The optimizer may optimize away a global entirely.
-      if (!V)
+      if (!GV)
         continue;
-      GlobalVariable *GV = cast<GlobalVariable>(V);
       // We can already have an entry for GV if it was merged with another
       // global.
       Entry &E = Entries[GV];
-      if (Value *Loc = MDN->getOperand(1))
-        E.SourceLoc.parse(cast<MDNode>(Loc));
-      if (Value *Name = MDN->getOperand(2)) {
-        MDString *MDName = cast<MDString>(Name);
-        E.Name = MDName->getString();
-      }
-      ConstantInt *IsDynInit = cast<ConstantInt>(MDN->getOperand(3));
+      if (auto *Loc = cast_or_null<MDNode>(MDN->getOperand(1)))
+        E.SourceLoc.parse(Loc);
+      if (auto *Name = cast_or_null<MDString>(MDN->getOperand(2)))
+        E.Name = Name->getString();
+      ConstantInt *IsDynInit =
+          mdconst::extract<ConstantInt>(MDN->getOperand(3));
       E.IsDynInit |= IsDynInit->isOne();
-      ConstantInt *IsBlacklisted = cast<ConstantInt>(MDN->getOperand(4));
+      ConstantInt *IsBlacklisted =
+          mdconst::extract<ConstantInt>(MDN->getOperand(4));
       E.IsBlacklisted |= IsBlacklisted->isOne();
     }
   }
@@ -289,12 +297,11 @@ struct ShadowMapping {
   bool OrShadowOffset;
 };
 
-static ShadowMapping getShadowMapping(const Module &M, int LongSize) {
-  llvm::Triple TargetTriple(M.getTargetTriple());
+static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize) {
   bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android;
   bool IsIOS = TargetTriple.isiOS();
-  bool IsFreeBSD = TargetTriple.getOS() == llvm::Triple::FreeBSD;
-  bool IsLinux = TargetTriple.getOS() == llvm::Triple::Linux;
+  bool IsFreeBSD = TargetTriple.isOSFreeBSD();
+  bool IsLinux = TargetTriple.isOSLinux();
   bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 ||
                  TargetTriple.getArch() == llvm::Triple::ppc64le;
   bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
@@ -302,6 +309,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize) {
                   TargetTriple.getArch() == llvm::Triple::mipsel;
   bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 ||
                   TargetTriple.getArch() == llvm::Triple::mips64el;
+  bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64;
+  bool IsWindows = TargetTriple.isOSWindows();
 
   ShadowMapping Mapping;
 
@@ -314,6 +323,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize) {
       Mapping.Offset = kFreeBSD_ShadowOffset32;
     else if (IsIOS)
       Mapping.Offset = kIOSShadowOffset32;
+    else if (IsWindows)
+      Mapping.Offset = kWindowsShadowOffset32;
     else
       Mapping.Offset = kDefaultShadowOffset32;
   } else {  // LongSize == 64
@@ -325,6 +336,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize) {
       Mapping.Offset = kSmallX86_64ShadowOffset;
     else if (IsMIPS64)
       Mapping.Offset = kMIPS64_ShadowOffset64;
+    else if (IsAArch64)
+      Mapping.Offset = kAArch64_ShadowOffset64;
     else
       Mapping.Offset = kDefaultShadowOffset64;
   }
@@ -350,10 +363,15 @@ static size_t RedzoneSizeForScale(int MappingScale) {
 
 /// AddressSanitizer: instrument the code in module to find memory bugs.
 struct AddressSanitizer : public FunctionPass {
-  AddressSanitizer() : FunctionPass(ID) {}
+  AddressSanitizer() : FunctionPass(ID) {
+    initializeAddressSanitizerPass(*PassRegistry::getPassRegistry());
+  }
   const char *getPassName() const override {
     return "AddressSanitizerFunctionPass";
   }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+  }
   void instrumentMop(Instruction *I, bool UseCalls);
   void instrumentPointerComparisonOrSubtraction(Instruction *I);
   void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
@@ -371,6 +389,8 @@ struct AddressSanitizer : public FunctionPass {
   bool doInitialization(Module &M) override;
   static char ID;  // Pass identification, replacement for typeid
 
+  DominatorTree &getDominatorTree() const { return *DT; }
+
  private:
   void initializeCallbacks(Module &M);
 
@@ -379,9 +399,11 @@ struct AddressSanitizer : public FunctionPass {
 
   LLVMContext *C;
   const DataLayout *DL;
+  Triple TargetTriple;
   int LongSize;
   Type *IntptrTy;
   ShadowMapping Mapping;
+  DominatorTree *DT;
   Function *AsanCtorFunction;
   Function *AsanInitFunction;
   Function *AsanHandleNoReturnFunc;
@@ -423,6 +445,7 @@ class AddressSanitizerModule : public ModulePass {
   Type *IntptrTy;
   LLVMContext *C;
   const DataLayout *DL;
+  Triple TargetTriple;
   ShadowMapping Mapping;
   Function *AsanPoisonGlobals;
   Function *AsanUnpoisonGlobals;
@@ -465,15 +488,36 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
   };
   SmallVector<AllocaPoisonCall, 8> AllocaPoisonCallVec;
 
+  // Stores left and right redzone shadow addresses for dynamic alloca
+  // and pointer to alloca instruction itself.
+  // LeftRzAddr is a shadow address for alloca left redzone.
+  // RightRzAddr is a shadow address for alloca right redzone.
+  struct DynamicAllocaCall {
+    AllocaInst *AI;
+    Value *LeftRzAddr;
+    Value *RightRzAddr;
+    bool Poison;
+    explicit DynamicAllocaCall(AllocaInst *AI,
+                      Value *LeftRzAddr = nullptr,
+                      Value *RightRzAddr = nullptr)
+      : AI(AI), LeftRzAddr(LeftRzAddr), RightRzAddr(RightRzAddr), Poison(true)
+    {}
+  };
+  SmallVector<DynamicAllocaCall, 1> DynamicAllocaVec;
+
   // Maps Value to an AllocaInst from which the Value is originated.
   typedef DenseMap<Value*, AllocaInst*> AllocaForValueMapTy;
   AllocaForValueMapTy AllocaForValue;
 
+  bool HasNonEmptyInlineAsm;
+  std::unique_ptr<CallInst> EmptyInlineAsm;
+
   FunctionStackPoisoner(Function &F, AddressSanitizer &ASan)
-      : F(F), ASan(ASan), DIB(*F.getParent()), C(ASan.C),
-        IntptrTy(ASan.IntptrTy), IntptrPtrTy(PointerType::get(IntptrTy, 0)),
-        Mapping(ASan.Mapping),
-        StackAlignment(1 << Mapping.Scale) {}
+      : F(F), ASan(ASan), DIB(*F.getParent(), /*AllowUnresolved*/ false),
+        C(ASan.C), IntptrTy(ASan.IntptrTy),
+        IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping),
+        StackAlignment(1 << Mapping.Scale), HasNonEmptyInlineAsm(false),
+        EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {}
 
   bool runOnFunction() {
     if (!ClStack) return false;
@@ -481,7 +525,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
     for (BasicBlock *BB : depth_first(&F.getEntryBlock()))
       visit(*BB);
 
-    if (AllocaVec.empty()) return false;
+    if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false;
 
     initializeCallbacks(*F.getParent());
 
@@ -493,7 +537,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
     return true;
   }
 
-  // Finds all static Alloca instructions and puts
+  // Finds all Alloca instructions and puts
   // poisoned red zones around all of them.
   // Then unpoison everything back before the function returns.
   void poisonStack();
@@ -504,12 +548,64 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
     RetVec.push_back(&RI);
   }
 
+  // Unpoison dynamic allocas redzones.
+  void unpoisonDynamicAlloca(DynamicAllocaCall &AllocaCall) {
+    if (!AllocaCall.Poison)
+      return;
+    for (auto Ret : RetVec) {
+      IRBuilder<> IRBRet(Ret);
+      PointerType *Int32PtrTy = PointerType::getUnqual(IRBRet.getInt32Ty());
+      Value *Zero = Constant::getNullValue(IRBRet.getInt32Ty());
+      Value *PartialRzAddr = IRBRet.CreateSub(AllocaCall.RightRzAddr,
+                                              ConstantInt::get(IntptrTy, 4));
+      IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(AllocaCall.LeftRzAddr,
+                                                     Int32PtrTy));
+      IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(PartialRzAddr,
+                                                     Int32PtrTy));
+      IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(AllocaCall.RightRzAddr,
+                                                     Int32PtrTy));
+    }
+  }
+
+  // Right shift for BigEndian and left shift for LittleEndian.
+  Value *shiftAllocaMagic(Value *Val, IRBuilder<> &IRB, Value *Shift) {
+    return ASan.DL->isLittleEndian() ? IRB.CreateShl(Val, Shift)
+                                     : IRB.CreateLShr(Val, Shift);
+  }
+
+  // Compute PartialRzMagic for dynamic alloca call. Since we don't know the
+  // size of requested memory until runtime, we should compute it dynamically.
+  // If PartialSize is 0, PartialRzMagic would contain kAsanAllocaRightMagic,
+  // otherwise it would contain the value that we will use to poison the
+  // partial redzone for alloca call.
+  Value *computePartialRzMagic(Value *PartialSize, IRBuilder<> &IRB);
+
+  // Deploy and poison redzones around dynamic alloca call. To do this, we
+  // should replace this call with another one with changed parameters and
+  // replace all its uses with new address, so
+  //   addr = alloca type, old_size, align
+  // is replaced by
+  //   new_size = (old_size + additional_size) * sizeof(type)
+  //   tmp = alloca i8, new_size, max(align, 32)
+  //   addr = tmp + 32 (first 32 bytes are for the left redzone).
+  // Additional_size is added to make new memory allocation contain not only
+  // requested memory, but also left, partial and right redzones.
+  // After that, we should poison redzones:
+  // (1) Left redzone with kAsanAllocaLeftMagic.
+  // (2) Partial redzone with the value, computed in runtime by
+  //     computePartialRzMagic function.
+  // (3) Right redzone with kAsanAllocaRightMagic.
+  void handleDynamicAllocaCall(DynamicAllocaCall &AllocaCall);
+
   /// \brief Collect Alloca instructions we want (and can) handle.
   void visitAllocaInst(AllocaInst &AI) {
     if (!isInterestingAlloca(AI)) return;
 
     StackAlignment = std::max(StackAlignment, AI.getAlignment());
-    AllocaVec.push_back(&AI);
+    if (isDynamicAlloca(AI))
+      DynamicAllocaVec.push_back(DynamicAllocaCall(&AI));
+    else
+      AllocaVec.push_back(&AI);
   }
 
   /// \brief Collect lifetime intrinsic calls to check for use-after-scope
@@ -538,13 +634,29 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
     AllocaPoisonCallVec.push_back(APC);
   }
 
+  void visitCallInst(CallInst &CI) {
+    HasNonEmptyInlineAsm |=
+        CI.isInlineAsm() && !CI.isIdenticalTo(EmptyInlineAsm.get());
+  }
+
   // ---------------------- Helpers.
   void initializeCallbacks(Module &M);
 
+  bool doesDominateAllExits(const Instruction *I) const {
+    for (auto Ret : RetVec) {
+      if (!ASan.getDominatorTree().dominates(I, Ret))
+        return false;
+    }
+    return true;
+  }
+
+  bool isDynamicAlloca(AllocaInst &AI) const {
+    return AI.isArrayAllocation() || !AI.isStaticAlloca();
+  }
+
   // Check if we want (and can) handle this alloca.
   bool isInterestingAlloca(AllocaInst &AI) const {
-    return (!AI.isArrayAllocation() && AI.isStaticAlloca() &&
-            AI.getAllocatedType()->isSized() &&
+    return (AI.getAllocatedType()->isSized() &&
             // alloca() may be called with 0 size, ignore it.
             getAllocaSizeInBytes(&AI) > 0);
   }
@@ -562,12 +674,20 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
 
   void SetShadowToStackAfterReturnInlined(IRBuilder<> &IRB, Value *ShadowBase,
                                           int Size);
+  Value *createAllocaForLayout(IRBuilder<> &IRB, const ASanStackFrameLayout &L,
+                               bool Dynamic);
+  PHINode *createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue,
+                     Instruction *ThenTerm, Value *ValueIfFalse);
 };
 
 }  // namespace
 
 char AddressSanitizer::ID = 0;
-INITIALIZE_PASS(AddressSanitizer, "asan",
+INITIALIZE_PASS_BEGIN(AddressSanitizer, "asan",
+    "AddressSanitizer: detects use-after-free and out-of-bounds bugs.",
+    false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AddressSanitizer, "asan",
     "AddressSanitizer: detects use-after-free and out-of-bounds bugs.",
     false, false)
 FunctionPass *llvm::createAddressSanitizerFunctionPass() {
@@ -951,37 +1071,47 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
 
   if (G->hasSection()) {
     StringRef Section(G->getSection());
-    // Ignore the globals from the __OBJC section. The ObjC runtime assumes
-    // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
-    // them.
-    if (Section.startswith("__OBJC,") ||
-        Section.startswith("__DATA, __objc_")) {
-      DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
-      return false;
-    }
-    // See http://code.google.com/p/address-sanitizer/issues/detail?id=32
-    // Constant CFString instances are compiled in the following way:
-    //  -- the string buffer is emitted into
-    //     __TEXT,__cstring,cstring_literals
-    //  -- the constant NSConstantString structure referencing that buffer
-    //     is placed into __DATA,__cfstring
-    // Therefore there's no point in placing redzones into __DATA,__cfstring.
-    // Moreover, it causes the linker to crash on OS X 10.7
-    if (Section.startswith("__DATA,__cfstring")) {
-      DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
-      return false;
-    }
-    // The linker merges the contents of cstring_literals and removes the
-    // trailing zeroes.
-    if (Section.startswith("__TEXT,__cstring,cstring_literals")) {
-      DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
-      return false;
-    }
-    if (Section.startswith("__TEXT,__objc_methname,cstring_literals")) {
-      DEBUG(dbgs() << "Ignoring objc_methname cstring global: " << *G << "\n");
-      return false;
-    }
 
+    if (TargetTriple.isOSBinFormatMachO()) {
+      StringRef ParsedSegment, ParsedSection;
+      unsigned TAA = 0, StubSize = 0;
+      bool TAAParsed;
+      std::string ErrorCode =
+        MCSectionMachO::ParseSectionSpecifier(Section, ParsedSegment,
+                                              ParsedSection, TAA, TAAParsed,
+                                              StubSize);
+      if (!ErrorCode.empty()) {
+        report_fatal_error("Invalid section specifier '" + ParsedSection +
+                           "': " + ErrorCode + ".");
+      }
+
+      // Ignore the globals from the __OBJC section. The ObjC runtime assumes
+      // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
+      // them.
+      if (ParsedSegment == "__OBJC" ||
+          (ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) {
+        DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
+        return false;
+      }
+      // See http://code.google.com/p/address-sanitizer/issues/detail?id=32
+      // Constant CFString instances are compiled in the following way:
+      //  -- the string buffer is emitted into
+      //     __TEXT,__cstring,cstring_literals
+      //  -- the constant NSConstantString structure referencing that buffer
+      //     is placed into __DATA,__cfstring
+      // Therefore there's no point in placing redzones into __DATA,__cfstring.
+      // Moreover, it causes the linker to crash on OS X 10.7
+      if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") {
+        DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
+        return false;
+      }
+      // The linker merges the contents of cstring_literals and removes the
+      // trailing zeroes.
+      if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) {
+        DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
+        return false;
+      }
+    }
 
     // Callbacks put into the CRT initializer/terminator sections
     // should not be instrumented.
@@ -1165,7 +1295,8 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
   C = &(M.getContext());
   int LongSize = DL->getPointerSizeInBits();
   IntptrTy = Type::getIntNTy(*C, LongSize);
-  Mapping = getShadowMapping(M, LongSize);
+  TargetTriple = Triple(M.getTargetTriple());
+  Mapping = getShadowMapping(TargetTriple, LongSize);
   initializeCallbacks(M);
 
   bool Changed = false;
@@ -1247,6 +1378,7 @@ bool AddressSanitizer::doInitialization(Module &M) {
   C = &(M.getContext());
   LongSize = DL->getPointerSizeInBits();
   IntptrTy = Type::getIntNTy(*C, LongSize);
+  TargetTriple = Triple(M.getTargetTriple());
 
   AsanCtorFunction = Function::Create(
       FunctionType::get(Type::getVoidTy(*C), false),
@@ -1259,7 +1391,7 @@ bool AddressSanitizer::doInitialization(Module &M) {
   AsanInitFunction->setLinkage(Function::ExternalLinkage);
   IRB.CreateCall(AsanInitFunction);
 
-  Mapping = getShadowMapping(M, LongSize);
+  Mapping = getShadowMapping(TargetTriple, LongSize);
 
   appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);
   return true;
@@ -1287,6 +1419,8 @@ bool AddressSanitizer::runOnFunction(Function &F) {
   DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
   initializeCallbacks(*F.getParent());
 
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
   // If needed, insert __asan_init before checking for SanitizeAddress attr.
   maybeInsertAsanInitAtFunctionEntry(F);
 
@@ -1345,17 +1479,6 @@ bool AddressSanitizer::runOnFunction(Function &F) {
     }
   }
 
-  Function *UninstrumentedDuplicate = nullptr;
-  bool LikelyToInstrument =
-      !NoReturnCalls.empty() || !ToInstrument.empty() || (NumAllocas > 0);
-  if (ClKeepUninstrumented && LikelyToInstrument) {
-    ValueToValueMapTy VMap;
-    UninstrumentedDuplicate = CloneFunction(&F, VMap, false);
-    UninstrumentedDuplicate->removeFnAttr(Attribute::SanitizeAddress);
-    UninstrumentedDuplicate->setName("NOASAN_" + F.getName());
-    F.getParent()->getFunctionList().push_back(UninstrumentedDuplicate);
-  }
-
   bool UseCalls = false;
   if (ClInstrumentationWithCallsThreshold >= 0 &&
       ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold)
@@ -1393,20 +1516,6 @@ bool AddressSanitizer::runOnFunction(Function &F) {
 
   DEBUG(dbgs() << "ASAN done instrumenting: " << res << " " << F << "\n");
 
-  if (ClKeepUninstrumented) {
-    if (!res) {
-      // No instrumentation is done, no need for the duplicate.
-      if (UninstrumentedDuplicate)
-        UninstrumentedDuplicate->eraseFromParent();
-    } else {
-      // The function was instrumented. We must have the duplicate.
-      assert(UninstrumentedDuplicate);
-      UninstrumentedDuplicate->setSection("NOASAN");
-      assert(!F.hasSection());
-      F.setSection("ASAN");
-    }
-  }
-
   return res;
 }
 
@@ -1426,12 +1535,11 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
   IRBuilder<> IRB(*C);
   for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) {
     std::string Suffix = itostr(i);
-    AsanStackMallocFunc[i] = checkInterfaceFunction(
-        M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy,
-                              IntptrTy, IntptrTy, nullptr));
-    AsanStackFreeFunc[i] = checkInterfaceFunction(M.getOrInsertFunction(
-        kAsanStackFreeNameTemplate + Suffix, IRB.getVoidTy(), IntptrTy,
-        IntptrTy, IntptrTy, nullptr));
+    AsanStackMallocFunc[i] = checkInterfaceFunction(M.getOrInsertFunction(
+        kAsanStackMallocNameTemplate + Suffix, IntptrTy, IntptrTy, nullptr));
+    AsanStackFreeFunc[i] = checkInterfaceFunction(
+        M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
+                              IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
   }
   AsanPoisonStackMemoryFunc = checkInterfaceFunction(
       M.getOrInsertFunction(kAsanPoisonStackMemoryName, IRB.getVoidTy(),
@@ -1503,11 +1611,52 @@ static DebugLoc getFunctionEntryDebugLocation(Function &F) {
   return DebugLoc();
 }
 
+PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
+                                          Value *ValueIfTrue,
+                                          Instruction *ThenTerm,
+                                          Value *ValueIfFalse) {
+  PHINode *PHI = IRB.CreatePHI(IntptrTy, 2);
+  BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent();
+  PHI->addIncoming(ValueIfFalse, CondBlock);
+  BasicBlock *ThenBlock = ThenTerm->getParent();
+  PHI->addIncoming(ValueIfTrue, ThenBlock);
+  return PHI;
+}
+
+Value *FunctionStackPoisoner::createAllocaForLayout(
+    IRBuilder<> &IRB, const ASanStackFrameLayout &L, bool Dynamic) {
+  AllocaInst *Alloca;
+  if (Dynamic) {
+    Alloca = IRB.CreateAlloca(IRB.getInt8Ty(),
+                              ConstantInt::get(IRB.getInt64Ty(), L.FrameSize),
+                              "MyAlloca");
+  } else {
+    Alloca = IRB.CreateAlloca(ArrayType::get(IRB.getInt8Ty(), L.FrameSize),
+                              nullptr, "MyAlloca");
+    assert(Alloca->isStaticAlloca());
+  }
+  assert((ClRealignStack & (ClRealignStack - 1)) == 0);
+  size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack);
+  Alloca->setAlignment(FrameAlignment);
+  return IRB.CreatePointerCast(Alloca, IntptrTy);
+}
+
 void FunctionStackPoisoner::poisonStack() {
+  assert(AllocaVec.size() > 0 || DynamicAllocaVec.size() > 0);
+
+  if (ClInstrumentAllocas) {
+    // Handle dynamic allocas.
+    for (auto &AllocaCall : DynamicAllocaVec) {
+      handleDynamicAllocaCall(AllocaCall);
+      unpoisonDynamicAlloca(AllocaCall);
+    }
+  }
+
+  if (AllocaVec.size() == 0) return;
+
   int StackMallocIdx = -1;
   DebugLoc EntryDebugLocation = getFunctionEntryDebugLocation(F);
 
-  assert(AllocaVec.size() > 0);
   Instruction *InsBefore = AllocaVec[0];
   IRBuilder<> IRB(InsBefore);
   IRB.SetCurrentDebugLocation(EntryDebugLocation);
@@ -1529,42 +1678,56 @@ void FunctionStackPoisoner::poisonStack() {
   uint64_t LocalStackSize = L.FrameSize;
   bool DoStackMalloc =
       ClUseAfterReturn && LocalStackSize <= kMaxStackMallocSize;
+  // Don't do dynamic alloca in presence of inline asm: too often it
+  // makes assumptions on which registers are available.
+  bool DoDynamicAlloca = ClDynamicAllocaStack && !HasNonEmptyInlineAsm;
 
-  Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize);
-  AllocaInst *MyAlloca =
-      new AllocaInst(ByteArrayTy, "MyAlloca", InsBefore);
-  MyAlloca->setDebugLoc(EntryDebugLocation);
-  assert((ClRealignStack & (ClRealignStack - 1)) == 0);
-  size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack);
-  MyAlloca->setAlignment(FrameAlignment);
-  assert(MyAlloca->isStaticAlloca());
-  Value *OrigStackBase = IRB.CreatePointerCast(MyAlloca, IntptrTy);
-  Value *LocalStackBase = OrigStackBase;
+  Value *StaticAlloca =
+      DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false);
+
+  Value *FakeStack;
+  Value *LocalStackBase;
 
   if (DoStackMalloc) {
-    // LocalStackBase = OrigStackBase
-    // if (__asan_option_detect_stack_use_after_return)
-    //   LocalStackBase = __asan_stack_malloc_N(LocalStackBase, OrigStackBase);
-    StackMallocIdx = StackMallocSizeClass(LocalStackSize);
-    assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
+    // void *FakeStack = __asan_option_detect_stack_use_after_return
+    //     ? __asan_stack_malloc_N(LocalStackSize)
+    //     : nullptr;
+    // void *LocalStackBase = (FakeStack) ? FakeStack : alloca(LocalStackSize);
     Constant *OptionDetectUAR = F.getParent()->getOrInsertGlobal(
         kAsanOptionDetectUAR, IRB.getInt32Ty());
-    Value *Cmp = IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR),
-                                  Constant::getNullValue(IRB.getInt32Ty()));
-    Instruction *Term = SplitBlockAndInsertIfThen(Cmp, InsBefore, false);
-    BasicBlock *CmpBlock = cast<Instruction>(Cmp)->getParent();
+    Value *UARIsEnabled =
+        IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR),
+                         Constant::getNullValue(IRB.getInt32Ty()));
+    Instruction *Term =
+        SplitBlockAndInsertIfThen(UARIsEnabled, InsBefore, false);
     IRBuilder<> IRBIf(Term);
     IRBIf.SetCurrentDebugLocation(EntryDebugLocation);
-    LocalStackBase = IRBIf.CreateCall2(
-        AsanStackMallocFunc[StackMallocIdx],
-        ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase);
-    BasicBlock *SetBlock = cast<Instruction>(LocalStackBase)->getParent();
+    StackMallocIdx = StackMallocSizeClass(LocalStackSize);
+    assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
+    Value *FakeStackValue =
+        IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx],
+                         ConstantInt::get(IntptrTy, LocalStackSize));
     IRB.SetInsertPoint(InsBefore);
     IRB.SetCurrentDebugLocation(EntryDebugLocation);
-    PHINode *Phi = IRB.CreatePHI(IntptrTy, 2);
-    Phi->addIncoming(OrigStackBase, CmpBlock);
-    Phi->addIncoming(LocalStackBase, SetBlock);
-    LocalStackBase = Phi;
+    FakeStack = createPHI(IRB, UARIsEnabled, FakeStackValue, Term,
+                          ConstantInt::get(IntptrTy, 0));
+
+    Value *NoFakeStack =
+        IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy));
+    Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false);
+    IRBIf.SetInsertPoint(Term);
+    IRBIf.SetCurrentDebugLocation(EntryDebugLocation);
+    Value *AllocaValue =
+        DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca;
+    IRB.SetInsertPoint(InsBefore);
+    IRB.SetCurrentDebugLocation(EntryDebugLocation);
+    LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack);
+  } else {
+    // void *FakeStack = nullptr;
+    // void *LocalStackBase = alloca(LocalStackSize);
+    FakeStack = ConstantInt::get(IntptrTy, 0);
+    LocalStackBase =
+        DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca;
   }
 
   // Insert poison calls for lifetime intrinsics for alloca.
@@ -1583,7 +1746,7 @@ void FunctionStackPoisoner::poisonStack() {
     Value *NewAllocaPtr = IRB.CreateIntToPtr(
         IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
         AI->getType());
-    replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB);
+    replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB, /*Deref=*/true);
     AI->replaceAllUsesWith(NewAllocaPtr);
   }
 
@@ -1621,17 +1784,18 @@ void FunctionStackPoisoner::poisonStack() {
                        BasePlus0);
     if (DoStackMalloc) {
       assert(StackMallocIdx >= 0);
-      // if LocalStackBase != OrigStackBase:
+      // if FakeStack != 0  // LocalStackBase == FakeStack
       //     // In use-after-return mode, poison the whole stack frame.
       //     if StackMallocIdx <= 4
       //         // For small sizes inline the whole thing:
       //         memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize);
-      //         **SavedFlagPtr(LocalStackBase) = 0
+      //         **SavedFlagPtr(FakeStack) = 0
       //     else
-      //         __asan_stack_free_N(LocalStackBase, OrigStackBase)
+      //         __asan_stack_free_N(FakeStack, LocalStackSize)
       // else
       //     <This is not a fake stack; unpoison the redzones>
-      Value *Cmp = IRBRet.CreateICmpNE(LocalStackBase, OrigStackBase);
+      Value *Cmp =
+          IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy));
       TerminatorInst *ThenTerm, *ElseTerm;
       SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm);
 
@@ -1641,7 +1805,7 @@ void FunctionStackPoisoner::poisonStack() {
         SetShadowToStackAfterReturnInlined(IRBPoison, ShadowBase,
                                            ClassSize >> Mapping.Scale);
         Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
-            LocalStackBase,
+            FakeStack,
             ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
         Value *SavedFlagPtr = IRBPoison.CreateLoad(
             IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
@@ -1650,9 +1814,8 @@ void FunctionStackPoisoner::poisonStack() {
             IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy()));
       } else {
         // For larger frames call __asan_stack_free_*.
-        IRBPoison.CreateCall3(AsanStackFreeFunc[StackMallocIdx], LocalStackBase,
-                              ConstantInt::get(IntptrTy, LocalStackSize),
-                              OrigStackBase);
+        IRBPoison.CreateCall2(AsanStackFreeFunc[StackMallocIdx], FakeStack,
+                              ConstantInt::get(IntptrTy, LocalStackSize));
       }
 
       IRBuilder<> IRBElse(ElseTerm);
@@ -1660,7 +1823,6 @@ void FunctionStackPoisoner::poisonStack() {
     } else if (HavePoisonedAllocas) {
       // If we poisoned some allocas in llvm.lifetime analysis,
       // unpoison whole stack frame now.
-      assert(LocalStackBase == OrigStackBase);
       poisonAlloca(LocalStackBase, LocalStackSize, IRBRet, false);
     } else {
       poisonRedZones(L.ShadowBytes, IRBRet, ShadowBase, false);
@@ -1722,3 +1884,140 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
     AllocaForValue[V] = Res;
   return Res;
 }
+
+// Compute PartialRzMagic for dynamic alloca call. PartialRzMagic is
+// constructed from two separate 32-bit numbers: PartialRzMagic = Val1 | Val2.
+// (1) Val1 is resposible for forming base value for PartialRzMagic, containing
+//     only 00 for fully addressable and 0xcb for fully poisoned bytes for each
+//     8-byte chunk of user memory respectively.
+// (2) Val2 forms the value for marking first poisoned byte in shadow memory
+//     with appropriate value (0x01 - 0x07 or 0xcb if Padding % 8 == 0).
+
+// Shift = Padding & ~7; // the number of bits we need to shift to access first
+//                          chunk in shadow memory, containing nonzero bytes.
+// Example:
+// Padding = 21                       Padding = 16
+// Shadow:  |00|00|05|cb|          Shadow:  |00|00|cb|cb|
+//                ^                               ^
+//                |                               |
+// Shift = 21 & ~7 = 16            Shift = 16 & ~7 = 16
+//
+// Val1 = 0xcbcbcbcb << Shift;
+// PartialBits = Padding ? Padding & 7 : 0xcb;
+// Val2 = PartialBits << Shift;
+// Result = Val1 | Val2;
+Value *FunctionStackPoisoner::computePartialRzMagic(Value *PartialSize,
+                                                    IRBuilder<> &IRB) {
+  PartialSize = IRB.CreateIntCast(PartialSize, IRB.getInt32Ty(), false);
+  Value *Shift = IRB.CreateAnd(PartialSize, IRB.getInt32(~7));
+  unsigned Val1Int = kAsanAllocaPartialVal1;
+  unsigned Val2Int = kAsanAllocaPartialVal2;
+  if (!ASan.DL->isLittleEndian()) {
+    Val1Int = sys::getSwappedBytes(Val1Int);
+    Val2Int = sys::getSwappedBytes(Val2Int);
+  }
+  Value *Val1 = shiftAllocaMagic(IRB.getInt32(Val1Int), IRB, Shift);
+  Value *PartialBits = IRB.CreateAnd(PartialSize, IRB.getInt32(7));
+  // For BigEndian get 0x000000YZ -> 0xYZ000000.
+  if (ASan.DL->isBigEndian())
+    PartialBits = IRB.CreateShl(PartialBits, IRB.getInt32(24));
+  Value *Val2 = IRB.getInt32(Val2Int);
+  Value *Cond =
+      IRB.CreateICmpNE(PartialBits, Constant::getNullValue(IRB.getInt32Ty()));
+  Val2 = IRB.CreateSelect(Cond, shiftAllocaMagic(PartialBits, IRB, Shift),
+                          shiftAllocaMagic(Val2, IRB, Shift));
+  return IRB.CreateOr(Val1, Val2);
+}
+
+void FunctionStackPoisoner::handleDynamicAllocaCall(
+    DynamicAllocaCall &AllocaCall) {
+  AllocaInst *AI = AllocaCall.AI;
+  if (!doesDominateAllExits(AI)) {
+    // We do not yet handle complex allocas
+    AllocaCall.Poison = false;
+    return;
+  }
+
+  IRBuilder<> IRB(AI);
+
+  PointerType *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
+  const unsigned Align = std::max(kAllocaRzSize, AI->getAlignment());
+  const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;
+
+  Value *Zero = Constant::getNullValue(IntptrTy);
+  Value *AllocaRzSize = ConstantInt::get(IntptrTy, kAllocaRzSize);
+  Value *AllocaRzMask = ConstantInt::get(IntptrTy, AllocaRedzoneMask);
+  Value *NotAllocaRzMask = ConstantInt::get(IntptrTy, ~AllocaRedzoneMask);
+
+  // Since we need to extend alloca with additional memory to locate
+  // redzones, and OldSize is number of allocated blocks with
+  // ElementSize size, get allocated memory size in bytes by
+  // OldSize * ElementSize.
+  unsigned ElementSize = ASan.DL->getTypeAllocSize(AI->getAllocatedType());
+  Value *OldSize = IRB.CreateMul(AI->getArraySize(),
+                                 ConstantInt::get(IntptrTy, ElementSize));
+
+  // PartialSize = OldSize % 32
+  Value *PartialSize = IRB.CreateAnd(OldSize, AllocaRzMask);
+
+  // Misalign = kAllocaRzSize - PartialSize;
+  Value *Misalign = IRB.CreateSub(AllocaRzSize, PartialSize);
+
+  // PartialPadding = Misalign != kAllocaRzSize ? Misalign : 0;
+  Value *Cond = IRB.CreateICmpNE(Misalign, AllocaRzSize);
+  Value *PartialPadding = IRB.CreateSelect(Cond, Misalign, Zero);
+
+  // AdditionalChunkSize = Align + PartialPadding + kAllocaRzSize
+  // Align is added to locate left redzone, PartialPadding for possible
+  // partial redzone and kAllocaRzSize for right redzone respectively.
+  Value *AdditionalChunkSize = IRB.CreateAdd(
+      ConstantInt::get(IntptrTy, Align + kAllocaRzSize), PartialPadding);
+
+  Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize);
+
+  // Insert new alloca with new NewSize and Align params.
+  AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize);
+  NewAlloca->setAlignment(Align);
+
+  // NewAddress = Address + Align
+  Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy),
+                                    ConstantInt::get(IntptrTy, Align));
+
+  Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType());
+
+  // LeftRzAddress = NewAddress - kAllocaRzSize
+  Value *LeftRzAddress = IRB.CreateSub(NewAddress, AllocaRzSize);
+
+  // Poisoning left redzone.
+  AllocaCall.LeftRzAddr = ASan.memToShadow(LeftRzAddress, IRB);
+  IRB.CreateStore(ConstantInt::get(IRB.getInt32Ty(), kAsanAllocaLeftMagic),
+                  IRB.CreateIntToPtr(AllocaCall.LeftRzAddr, Int32PtrTy));
+
+  // PartialRzAligned = PartialRzAddr & ~AllocaRzMask
+  Value *PartialRzAddr = IRB.CreateAdd(NewAddress, OldSize);
+  Value *PartialRzAligned = IRB.CreateAnd(PartialRzAddr, NotAllocaRzMask);
+
+  // Poisoning partial redzone.
+  Value *PartialRzMagic = computePartialRzMagic(PartialSize, IRB);
+  Value *PartialRzShadowAddr = ASan.memToShadow(PartialRzAligned, IRB);
+  IRB.CreateStore(PartialRzMagic,
+                  IRB.CreateIntToPtr(PartialRzShadowAddr, Int32PtrTy));
+
+  // RightRzAddress
+  //   =  (PartialRzAddr + AllocaRzMask) & ~AllocaRzMask
+  Value *RightRzAddress = IRB.CreateAnd(
+      IRB.CreateAdd(PartialRzAddr, AllocaRzMask), NotAllocaRzMask);
+
+  // Poisoning right redzone.
+  AllocaCall.RightRzAddr = ASan.memToShadow(RightRzAddress, IRB);
+  IRB.CreateStore(ConstantInt::get(IRB.getInt32Ty(), kAsanAllocaRightMagic),
+                  IRB.CreateIntToPtr(AllocaCall.RightRzAddr, Int32PtrTy));
+
+  // Replace all uses of AddessReturnedByAlloca with NewAddress.
+  AI->replaceAllUsesWith(NewAddressPtr);
+
+  // We are done. Erase old alloca and store left, partial and right redzones
+  // shadow addresses for future unpoisoning.
+  AI->eraseFromParent();
+  NumInstrumentedDynamicAllocas++;
+}
diff --git a/lib/Transforms/Instrumentation/Android.mk b/lib/Transforms/Instrumentation/Android.mk
index 1f21028..46f0281 100644
--- a/lib/Transforms/Instrumentation/Android.mk
+++ b/lib/Transforms/Instrumentation/Android.mk
@@ -4,8 +4,8 @@ instrumentation_SRC_FILES := \
   AddressSanitizer.cpp \
   BoundsChecking.cpp \
   DataFlowSanitizer.cpp \
-  DebugIR.cpp \
   GCOVProfiling.cpp \
+  InstrProfiling.cpp \
   Instrumentation.cpp \
   MemorySanitizer.cpp \
   SanitizerCoverage.cpp \
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 9a5cea8..2b5f39c 100644
--- a/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -24,7 +24,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "bounds-checking"
@@ -50,7 +50,7 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<DataLayoutPass>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
 
   private:
@@ -166,7 +166,7 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) {
 
 bool BoundsChecking::runOnFunction(Function &F) {
   DL = &getAnalysis<DataLayoutPass>().getDataLayout();
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   TrapBB = nullptr;
   BuilderTy TheBuilder(F.getContext(), TargetFolder(DL));
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 139e514..b2ff033 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -2,12 +2,15 @@ add_llvm_library(LLVMInstrumentation
   AddressSanitizer.cpp
   BoundsChecking.cpp
   DataFlowSanitizer.cpp
-  DebugIR.cpp
   GCOVProfiling.cpp
   MemorySanitizer.cpp
   Instrumentation.cpp
+  InstrProfiling.cpp
   SanitizerCoverage.cpp
   ThreadSanitizer.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
   )
 
 add_dependencies(LLVMInstrumentation intrinsics_gen)
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index c5a4860..6adf0d2 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -49,6 +49,7 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/DebugInfo.h"
@@ -82,14 +83,14 @@ static cl::opt<bool> ClPreserveAlignment(
     cl::desc("respect alignment requirements provided by input IR"), cl::Hidden,
     cl::init(false));
 
-// The ABI list file controls how shadow parameters are passed.  The pass treats
+// The ABI list files control how shadow parameters are passed. The pass treats
 // every function labelled "uninstrumented" in the ABI list file as conforming
 // to the "native" (i.e. unsanitized) ABI.  Unless the ABI list contains
 // additional annotations for those functions, a call to one of those functions
 // will produce a warning message, as the labelling behaviour of the function is
 // unknown.  The other supported annotations are "functional" and "discard",
 // which are described below under DataFlowSanitizer::WrapperKind.
-static cl::opt<std::string> ClABIListFile(
+static cl::list<std::string> ClABIListFiles(
     "dfsan-abilist",
     cl::desc("File listing native ABI functions and how the pass treats them"),
     cl::Hidden);
@@ -140,7 +141,9 @@ class DFSanABIList {
   std::unique_ptr<SpecialCaseList> SCL;
 
  public:
-  DFSanABIList(std::unique_ptr<SpecialCaseList> SCL) : SCL(std::move(SCL)) {}
+  DFSanABIList() {}
+
+  void set(std::unique_ptr<SpecialCaseList> List) { SCL = std::move(List); }
 
   /// Returns whether either this function or its source file are listed in the
   /// given category.
@@ -263,9 +266,9 @@ class DataFlowSanitizer : public ModulePass {
   Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName);
 
  public:
-  DataFlowSanitizer(StringRef ABIListFile = StringRef(),
-                    void *(*getArgTLS)() = nullptr,
-                    void *(*getRetValTLS)() = nullptr);
+  DataFlowSanitizer(
+      const std::vector<std::string> &ABIListFiles = std::vector<std::string>(),
+      void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr);
   static char ID;
   bool doInitialization(Module &M) override;
   bool runOnModule(Module &M) override;
@@ -350,25 +353,26 @@ char DataFlowSanitizer::ID;
 INITIALIZE_PASS(DataFlowSanitizer, "dfsan",
                 "DataFlowSanitizer: dynamic data flow analysis.", false, false)
 
-ModulePass *llvm::createDataFlowSanitizerPass(StringRef ABIListFile,
-                                              void *(*getArgTLS)(),
-                                              void *(*getRetValTLS)()) {
-  return new DataFlowSanitizer(ABIListFile, getArgTLS, getRetValTLS);
+ModulePass *
+llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles,
+                                  void *(*getArgTLS)(),
+                                  void *(*getRetValTLS)()) {
+  return new DataFlowSanitizer(ABIListFiles, getArgTLS, getRetValTLS);
 }
 
-DataFlowSanitizer::DataFlowSanitizer(StringRef ABIListFile,
-                                     void *(*getArgTLS)(),
-                                     void *(*getRetValTLS)())
-    : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS),
-      ABIList(SpecialCaseList::createOrDie(ABIListFile.empty() ? ClABIListFile
-                                                               : ABIListFile)) {
+DataFlowSanitizer::DataFlowSanitizer(
+    const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(),
+    void *(*getRetValTLS)())
+    : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS) {
+  std::vector<std::string> AllABIListFiles(std::move(ABIListFiles));
+  AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(),
+                         ClABIListFiles.end());
+  ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles));
 }
 
 FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
-  llvm::SmallVector<Type *, 4> ArgTypes;
-  std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes));
-  for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
-    ArgTypes.push_back(ShadowTy);
+  llvm::SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end());
+  ArgTypes.append(T->getNumParams(), ShadowTy);
   if (T->isVarArg())
     ArgTypes.push_back(ShadowPtrTy);
   Type *RetType = T->getReturnType();
@@ -381,9 +385,8 @@ FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
   assert(!T->isVarArg());
   llvm::SmallVector<Type *, 4> ArgTypes;
   ArgTypes.push_back(T->getPointerTo());
-  std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes));
-  for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
-    ArgTypes.push_back(ShadowTy);
+  ArgTypes.append(T->param_begin(), T->param_end());
+  ArgTypes.append(T->getNumParams(), ShadowTy);
   Type *RetType = T->getReturnType();
   if (!RetType->isVoidTy())
     ArgTypes.push_back(ShadowPtrTy);
@@ -414,6 +417,11 @@ FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
 }
 
 bool DataFlowSanitizer::doInitialization(Module &M) {
+  llvm::Triple TargetTriple(M.getTargetTriple());
+  bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
+  bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 ||
+                  TargetTriple.getArch() == llvm::Triple::mips64el;
+
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   if (!DLP)
     report_fatal_error("data layout missing");
@@ -425,8 +433,13 @@ bool DataFlowSanitizer::doInitialization(Module &M) {
   ShadowPtrTy = PointerType::getUnqual(ShadowTy);
   IntptrTy = DL->getIntPtrType(*Ctx);
   ZeroShadow = ConstantInt::getSigned(ShadowTy, 0);
-  ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
   ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8);
+  if (IsX86_64)
+    ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
+  else if (IsMIPS64)
+    ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL);
+  else
+    report_fatal_error("unsupported triple");
 
   Type *DFSanUnionArgs[2] = { ShadowTy, ShadowTy };
   DFSanUnionFnTy =
@@ -1521,7 +1534,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
         Next = II->getNormalDest()->begin();
       } else {
         BasicBlock *NewBB =
-            SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DFS);
+            SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT);
         Next = NewBB->begin();
       }
     } else {
diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp
deleted file mode 100644
index 5234341..0000000
--- a/lib/Transforms/Instrumentation/DebugIR.cpp
+++ /dev/null
@@ -1,617 +0,0 @@
-//===--- DebugIR.cpp - Transform debug metadata to allow debugging IR -----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// A Module transform pass that emits a succinct version of the IR and replaces
-// the source file metadata to allow debuggers to step through the IR.
-//
-// FIXME: instead of replacing debug metadata, this pass should allow for
-// additional metadata to be used to point capable debuggers to the IR file
-// without destroying the mapping to the original source file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/IR/ValueMap.h"
-#include "DebugIR.h"
-#include "llvm/IR/AssemblyAnnotationWriter.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include <string>
-
-#define STR_HELPER(x) #x
-#define STR(x) STR_HELPER(x)
-
-using namespace llvm;
-
-#define DEBUG_TYPE "debug-ir"
-
-namespace {
-
-/// Builds a map of Value* to line numbers on which the Value appears in a
-/// textual representation of the IR by plugging into the AssemblyWriter by
-/// masquerading as an AssemblyAnnotationWriter.
-class ValueToLineMap : public AssemblyAnnotationWriter {
-  ValueMap<const Value *, unsigned int> Lines;
-  typedef ValueMap<const Value *, unsigned int>::const_iterator LineIter;
-
-  void addEntry(const Value *V, formatted_raw_ostream &Out) {
-    Out.flush();
-    Lines.insert(std::make_pair(V, Out.getLine() + 1));
-  }
-
-public:
-
-  /// Prints Module to a null buffer in order to build the map of Value pointers
-  /// to line numbers.
-  ValueToLineMap(const Module *M) {
-    raw_null_ostream ThrowAway;
-    M->print(ThrowAway, this);
-  }
-
-  // This function is called after an Instruction, GlobalValue, or GlobalAlias
-  // is printed.
-  void printInfoComment(const Value &V, formatted_raw_ostream &Out) override {
-    addEntry(&V, Out);
-  }
-
-  void emitFunctionAnnot(const Function *F,
-                         formatted_raw_ostream &Out) override {
-    addEntry(F, Out);
-  }
-
-  /// If V appears on a line in the textual IR representation, sets Line to the
-  /// line number and returns true, otherwise returns false.
-  bool getLine(const Value *V, unsigned int &Line) const {
-    LineIter i = Lines.find(V);
-    if (i != Lines.end()) {
-      Line = i->second;
-      return true;
-    }
-    return false;
-  }
-};
-
-/// Removes debug intrisncs like llvm.dbg.declare and llvm.dbg.value.
-class DebugIntrinsicsRemover : public InstVisitor<DebugIntrinsicsRemover> {
-  void remove(Instruction &I) { I.eraseFromParent(); }
-
-public:
-  static void process(Module &M) {
-    DebugIntrinsicsRemover Remover;
-    Remover.visit(&M);
-  }
-  void visitDbgDeclareInst(DbgDeclareInst &I) { remove(I); }
-  void visitDbgValueInst(DbgValueInst &I) { remove(I); }
-  void visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { remove(I); }
-};
-
-/// Removes debug metadata (!dbg) nodes from all instructions, and optionally
-/// metadata named "llvm.dbg.cu" if RemoveNamedInfo is true.
-class DebugMetadataRemover : public InstVisitor<DebugMetadataRemover> {
-  bool RemoveNamedInfo;
-
-public:
-  static void process(Module &M, bool RemoveNamedInfo = true) {
-    DebugMetadataRemover Remover(RemoveNamedInfo);
-    Remover.run(&M);
-  }
-
-  DebugMetadataRemover(bool RemoveNamedInfo)
-      : RemoveNamedInfo(RemoveNamedInfo) {}
-
-  void visitInstruction(Instruction &I) {
-    if (I.getMetadata(LLVMContext::MD_dbg))
-      I.setMetadata(LLVMContext::MD_dbg, nullptr);
-  }
-
-  void run(Module *M) {
-    // Remove debug metadata attached to instructions
-    visit(M);
-
-    if (RemoveNamedInfo) {
-      // Remove CU named metadata (and all children nodes)
-      NamedMDNode *Node = M->getNamedMetadata("llvm.dbg.cu");
-      if (Node)
-        M->eraseNamedMetadata(Node);
-    }
-  }
-};
-
-/// Updates debug metadata in a Module:
-///   - changes Filename/Directory to values provided on construction
-///   - adds/updates line number (DebugLoc) entries associated with each
-///     instruction to reflect the instruction's location in an LLVM IR file
-class DIUpdater : public InstVisitor<DIUpdater> {
-  /// Builder of debug information
-  DIBuilder Builder;
-
-  /// Helper for type attributes/sizes/etc
-  DataLayout Layout;
-
-  /// Map of Value* to line numbers
-  const ValueToLineMap LineTable;
-
-  /// Map of Value* (in original Module) to Value* (in optional cloned Module)
-  const ValueToValueMapTy *VMap;
-
-  /// Directory of debug metadata
-  DebugInfoFinder Finder;
-
-  /// Source filename and directory
-  StringRef Filename;
-  StringRef Directory;
-
-  // CU nodes needed when creating DI subprograms
-  MDNode *FileNode;
-  MDNode *LexicalBlockFileNode;
-  const MDNode *CUNode;
-
-  ValueMap<const Function *, MDNode *> SubprogramDescriptors;
-  DenseMap<const Type *, MDNode *> TypeDescriptors;
-
-public:
-  DIUpdater(Module &M, StringRef Filename = StringRef(),
-            StringRef Directory = StringRef(), const Module *DisplayM = nullptr,
-            const ValueToValueMapTy *VMap = nullptr)
-      : Builder(M), Layout(&M), LineTable(DisplayM ? DisplayM : &M), VMap(VMap),
-        Finder(), Filename(Filename), Directory(Directory), FileNode(nullptr),
-        LexicalBlockFileNode(nullptr), CUNode(nullptr) {
-    Finder.processModule(M);
-    visit(&M);
-  }
-
-  ~DIUpdater() { Builder.finalize(); }
-
-  void visitModule(Module &M) {
-    if (Finder.compile_unit_count() > 1)
-      report_fatal_error("DebugIR pass supports only a signle compile unit per "
-                         "Module.");
-    createCompileUnit(Finder.compile_unit_count() == 1 ?
-                      (MDNode*)*Finder.compile_units().begin() : nullptr);
-  }
-
-  void visitFunction(Function &F) {
-    if (F.isDeclaration() || findDISubprogram(&F))
-      return;
-
-    StringRef MangledName = F.getName();
-    DICompositeType Sig = createFunctionSignature(&F);
-
-    // find line of function declaration
-    unsigned Line = 0;
-    if (!findLine(&F, Line)) {
-      DEBUG(dbgs() << "WARNING: No line for Function " << F.getName().str()
-                   << "\n");
-      return;
-    }
-
-    Instruction *FirstInst = F.begin()->begin();
-    unsigned ScopeLine = 0;
-    if (!findLine(FirstInst, ScopeLine)) {
-      DEBUG(dbgs() << "WARNING: No line for 1st Instruction in Function "
-                   << F.getName().str() << "\n");
-      return;
-    }
-
-    bool Local = F.hasInternalLinkage();
-    bool IsDefinition = !F.isDeclaration();
-    bool IsOptimized = false;
-
-    int FuncFlags = llvm::DIDescriptor::FlagPrototyped;
-    assert(CUNode && FileNode);
-    DISubprogram Sub = Builder.createFunction(
-        DICompileUnit(CUNode), F.getName(), MangledName, DIFile(FileNode), Line,
-        Sig, Local, IsDefinition, ScopeLine, FuncFlags, IsOptimized, &F);
-    assert(Sub.isSubprogram());
-    DEBUG(dbgs() << "create subprogram mdnode " << *Sub << ": "
-                 << "\n");
-
-    SubprogramDescriptors.insert(std::make_pair(&F, Sub));
-  }
-
-  void visitInstruction(Instruction &I) {
-    DebugLoc Loc(I.getDebugLoc());
-
-    /// If a ValueToValueMap is provided, use it to get the real instruction as
-    /// the line table was generated on a clone of the module on which we are
-    /// operating.
-    Value *RealInst = nullptr;
-    if (VMap)
-      RealInst = VMap->lookup(&I);
-
-    if (!RealInst)
-      RealInst = &I;
-
-    unsigned Col = 0; // FIXME: support columns
-    unsigned Line;
-    if (!LineTable.getLine(RealInst, Line)) {
-      // Instruction has no line, it may have been removed (in the module that
-      // will be passed to the debugger) so there is nothing to do here.
-      DEBUG(dbgs() << "WARNING: no LineTable entry for instruction " << RealInst
-                   << "\n");
-      DEBUG(RealInst->dump());
-      return;
-    }
-
-    DebugLoc NewLoc;
-    if (!Loc.isUnknown())
-      // I had a previous debug location: re-use the DebugLoc
-      NewLoc = DebugLoc::get(Line, Col, Loc.getScope(RealInst->getContext()),
-                             Loc.getInlinedAt(RealInst->getContext()));
-    else if (MDNode *scope = findScope(&I))
-      NewLoc = DebugLoc::get(Line, Col, scope, nullptr);
-    else {
-      DEBUG(dbgs() << "WARNING: no valid scope for instruction " << &I
-                   << ". no DebugLoc will be present."
-                   << "\n");
-      return;
-    }
-
-    addDebugLocation(I, NewLoc);
-  }
-
-private:
-
-  void createCompileUnit(MDNode *CUToReplace) {
-    std::string Flags;
-    bool IsOptimized = false;
-    StringRef Producer;
-    unsigned RuntimeVersion(0);
-    StringRef SplitName;
-
-    if (CUToReplace) {
-      // save fields from existing CU to re-use in the new CU
-      DICompileUnit ExistingCU(CUToReplace);
-      Producer = ExistingCU.getProducer();
-      IsOptimized = ExistingCU.isOptimized();
-      Flags = ExistingCU.getFlags();
-      RuntimeVersion = ExistingCU.getRunTimeVersion();
-      SplitName = ExistingCU.getSplitDebugFilename();
-    } else {
-      Producer =
-          "LLVM Version " STR(LLVM_VERSION_MAJOR) "." STR(LLVM_VERSION_MINOR);
-    }
-
-    CUNode =
-        Builder.createCompileUnit(dwarf::DW_LANG_C99, Filename, Directory,
-                                  Producer, IsOptimized, Flags, RuntimeVersion);
-
-    if (CUToReplace)
-      CUToReplace->replaceAllUsesWith(const_cast<MDNode *>(CUNode));
-
-    DICompileUnit CU(CUNode);
-    FileNode = Builder.createFile(Filename, Directory);
-    LexicalBlockFileNode = Builder.createLexicalBlockFile(CU, DIFile(FileNode));
-  }
-
-  /// Returns the MDNode* that represents the DI scope to associate with I
-  MDNode *findScope(const Instruction *I) {
-    const Function *F = I->getParent()->getParent();
-    if (MDNode *ret = findDISubprogram(F))
-      return ret;
-
-    DEBUG(dbgs() << "WARNING: Using fallback lexical block file scope "
-                 << LexicalBlockFileNode << " as scope for instruction " << I
-                 << "\n");
-    return LexicalBlockFileNode;
-  }
-
-  /// Returns the MDNode* that is the descriptor for F
-  MDNode *findDISubprogram(const Function *F) {
-    typedef ValueMap<const Function *, MDNode *>::const_iterator FuncNodeIter;
-    FuncNodeIter i = SubprogramDescriptors.find(F);
-    if (i != SubprogramDescriptors.end())
-      return i->second;
-
-    DEBUG(dbgs() << "searching for DI scope node for Function " << F
-                 << " in a list of " << Finder.subprogram_count()
-                 << " subprogram nodes"
-                 << "\n");
-
-    for (DISubprogram S : Finder.subprograms()) {
-      if (S.getFunction() == F) {
-        DEBUG(dbgs() << "Found DISubprogram " << S << " for function "
-                     << S.getFunction() << "\n");
-        return S;
-      }
-    }
-    DEBUG(dbgs() << "unable to find DISubprogram node for function "
-                 << F->getName().str() << "\n");
-    return nullptr;
-  }
-
-  /// Sets Line to the line number on which V appears and returns true. If a
-  /// line location for V is not found, returns false.
-  bool findLine(const Value *V, unsigned &Line) {
-    if (LineTable.getLine(V, Line))
-      return true;
-
-    if (VMap) {
-      Value *mapped = VMap->lookup(V);
-      if (mapped && LineTable.getLine(mapped, Line))
-        return true;
-    }
-    return false;
-  }
-
-  std::string getTypeName(Type *T) {
-    std::string TypeName;
-    raw_string_ostream TypeStream(TypeName);
-    if (T)
-      T->print(TypeStream);
-    else
-      TypeStream << "Printing <null> Type";
-    TypeStream.flush();
-    return TypeName;
-  }
-
-  /// Returns the MDNode that represents type T if it is already created, or 0
-  /// if it is not.
-  MDNode *getType(const Type *T) {
-    typedef DenseMap<const Type *, MDNode *>::const_iterator TypeNodeIter;
-    TypeNodeIter i = TypeDescriptors.find(T);
-    if (i != TypeDescriptors.end())
-      return i->second;
-    return nullptr;
-  }
-
-  /// Returns a DebugInfo type from an LLVM type T.
-  DIDerivedType getOrCreateType(Type *T) {
-    MDNode *N = getType(T);
-    if (N)
-      return DIDerivedType(N);
-    else if (T->isVoidTy())
-      return DIDerivedType(nullptr);
-    else if (T->isStructTy()) {
-      N = Builder.createStructType(
-          DIScope(LexicalBlockFileNode), T->getStructName(), DIFile(FileNode),
-          0, Layout.getTypeSizeInBits(T), Layout.getABITypeAlignment(T), 0,
-          DIType(nullptr), DIArray(nullptr)); // filled in later
-
-      // N is added to the map (early) so that element search below can find it,
-      // so as to avoid infinite recursion for structs that contain pointers to
-      // their own type.
-      TypeDescriptors[T] = N;
-      DICompositeType StructDescriptor(N);
-
-      SmallVector<Value *, 4> Elements;
-      for (unsigned i = 0; i < T->getStructNumElements(); ++i)
-        Elements.push_back(getOrCreateType(T->getStructElementType(i)));
-
-      // set struct elements
-      StructDescriptor.setArrays(Builder.getOrCreateArray(Elements));
-    } else if (T->isPointerTy()) {
-      Type *PointeeTy = T->getPointerElementType();
-      if (!(N = getType(PointeeTy)))
-        N = Builder.createPointerType(
-            getOrCreateType(PointeeTy), Layout.getPointerTypeSizeInBits(T),
-            Layout.getPrefTypeAlignment(T), getTypeName(T));
-    } else if (T->isArrayTy()) {
-      SmallVector<Value *, 1> Subrange;
-      Subrange.push_back(
-          Builder.getOrCreateSubrange(0, T->getArrayNumElements() - 1));
-
-      N = Builder.createArrayType(Layout.getTypeSizeInBits(T),
-                                  Layout.getPrefTypeAlignment(T),
-                                  getOrCreateType(T->getArrayElementType()),
-                                  Builder.getOrCreateArray(Subrange));
-    } else {
-      int encoding = llvm::dwarf::DW_ATE_signed;
-      if (T->isIntegerTy())
-        encoding = llvm::dwarf::DW_ATE_unsigned;
-      else if (T->isFloatingPointTy())
-        encoding = llvm::dwarf::DW_ATE_float;
-
-      N = Builder.createBasicType(getTypeName(T), T->getPrimitiveSizeInBits(),
-                                  0, encoding);
-    }
-    TypeDescriptors[T] = N;
-    return DIDerivedType(N);
-  }
-
-  /// Returns a DebugInfo type that represents a function signature for Func.
-  DICompositeType createFunctionSignature(const Function *Func) {
-    SmallVector<Value *, 4> Params;
-    DIDerivedType ReturnType(getOrCreateType(Func->getReturnType()));
-    Params.push_back(ReturnType);
-
-    const Function::ArgumentListType &Args(Func->getArgumentList());
-    for (Function::ArgumentListType::const_iterator i = Args.begin(),
-                                                    e = Args.end();
-         i != e; ++i) {
-      Type *T(i->getType());
-      Params.push_back(getOrCreateType(T));
-    }
-
-    DITypeArray ParamArray = Builder.getOrCreateTypeArray(Params);
-    return Builder.createSubroutineType(DIFile(FileNode), ParamArray);
-  }
-
-  /// Associates Instruction I with debug location Loc.
-  void addDebugLocation(Instruction &I, DebugLoc Loc) {
-    MDNode *MD = Loc.getAsMDNode(I.getContext());
-    I.setMetadata(LLVMContext::MD_dbg, MD);
-  }
-};
-
-/// Sets Filename/Directory from the Module identifier and returns true, or
-/// false if source information is not present.
-bool getSourceInfoFromModule(const Module &M, std::string &Directory,
-                             std::string &Filename) {
-  std::string PathStr(M.getModuleIdentifier());
-  if (PathStr.length() == 0 || PathStr == "<stdin>")
-    return false;
-
-  Filename = sys::path::filename(PathStr);
-  SmallVector<char, 16> Path(PathStr.begin(), PathStr.end());
-  sys::path::remove_filename(Path);
-  Directory = StringRef(Path.data(), Path.size());
-  return true;
-}
-
-// Sets Filename/Directory from debug information in M and returns true, or
-// false if no debug information available, or cannot be parsed.
-bool getSourceInfoFromDI(const Module &M, std::string &Directory,
-                         std::string &Filename) {
-  NamedMDNode *CUNode = M.getNamedMetadata("llvm.dbg.cu");
-  if (!CUNode || CUNode->getNumOperands() == 0)
-    return false;
-
-  DICompileUnit CU(CUNode->getOperand(0));
-  if (!CU.Verify())
-    return false;
-
-  Filename = CU.getFilename();
-  Directory = CU.getDirectory();
-  return true;
-}
-
-} // anonymous namespace
-
-namespace llvm {
-
-bool DebugIR::getSourceInfo(const Module &M) {
-  ParsedPath = getSourceInfoFromDI(M, Directory, Filename) ||
-               getSourceInfoFromModule(M, Directory, Filename);
-  return ParsedPath;
-}
-
-bool DebugIR::updateExtension(StringRef NewExtension) {
-  size_t dot = Filename.find_last_of(".");
-  if (dot == std::string::npos)
-    return false;
-
-  Filename.erase(dot);
-  Filename += NewExtension.str();
-  return true;
-}
-
-void DebugIR::generateFilename(std::unique_ptr<int> &fd) {
-  SmallVector<char, 16> PathVec;
-  fd.reset(new int);
-  sys::fs::createTemporaryFile("debug-ir", "ll", *fd, PathVec);
-  StringRef Path(PathVec.data(), PathVec.size());
-  Filename = sys::path::filename(Path);
-  sys::path::remove_filename(PathVec);
-  Directory = StringRef(PathVec.data(), PathVec.size());
-
-  GeneratedPath = true;
-}
-
-std::string DebugIR::getPath() {
-  SmallVector<char, 16> Path;
-  sys::path::append(Path, Directory, Filename);
-  Path.resize(Filename.size() + Directory.size() + 2);
-  Path[Filename.size() + Directory.size() + 1] = '\0';
-  return std::string(Path.data());
-}
-
-void DebugIR::writeDebugBitcode(const Module *M, int *fd) {
-  std::unique_ptr<raw_fd_ostream> Out;
-  std::error_code EC;
-
-  if (!fd) {
-    std::string Path = getPath();
-    Out.reset(new raw_fd_ostream(Path, EC, sys::fs::F_Text));
-    DEBUG(dbgs() << "WRITING debug bitcode from Module " << M << " to file "
-                 << Path << "\n");
-  } else {
-    DEBUG(dbgs() << "WRITING debug bitcode from Module " << M << " to fd "
-                 << *fd << "\n");
-    Out.reset(new raw_fd_ostream(*fd, true));
-  }
-
-  M->print(*Out, nullptr);
-  Out->close();
-}
-
-void DebugIR::createDebugInfo(Module &M, std::unique_ptr<Module> &DisplayM) {
-  if (M.getFunctionList().size() == 0)
-    // no functions -- no debug info needed
-    return;
-
-  std::unique_ptr<ValueToValueMapTy> VMap;
-
-  if (WriteSourceToDisk && (HideDebugIntrinsics || HideDebugMetadata)) {
-    VMap.reset(new ValueToValueMapTy);
-    DisplayM.reset(CloneModule(&M, *VMap));
-
-    if (HideDebugIntrinsics)
-      DebugIntrinsicsRemover::process(*DisplayM);
-
-    if (HideDebugMetadata)
-      DebugMetadataRemover::process(*DisplayM);
-  }
-
-  DIUpdater R(M, Filename, Directory, DisplayM.get(), VMap.get());
-}
-
-bool DebugIR::isMissingPath() { return Filename.empty() || Directory.empty(); }
-
-bool DebugIR::runOnModule(Module &M) {
-  std::unique_ptr<int> fd;
-
-  if (isMissingPath() && !getSourceInfo(M)) {
-    if (!WriteSourceToDisk)
-      report_fatal_error("DebugIR unable to determine file name in input. "
-                         "Ensure Module contains an identifier, a valid "
-                         "DICompileUnit, or construct DebugIR with "
-                         "non-empty Filename/Directory parameters.");
-    else
-      generateFilename(fd);
-  }
-
-  if (!GeneratedPath && WriteSourceToDisk)
-    updateExtension(".debug-ll");
-
-  // Clear line numbers. Keep debug info (if any) if we were able to read the
-  // file name from the DICompileUnit descriptor.
-  DebugMetadataRemover::process(M, !ParsedPath);
-
-  std::unique_ptr<Module> DisplayM;
-  createDebugInfo(M, DisplayM);
-  if (WriteSourceToDisk) {
-    Module *OutputM = DisplayM.get() ? DisplayM.get() : &M;
-    writeDebugBitcode(OutputM, fd.get());
-  }
-
-  DEBUG(M.dump());
-  return true;
-}
-
-bool DebugIR::runOnModule(Module &M, std::string &Path) {
-  bool result = runOnModule(M);
-  Path = getPath();
-  return result;
-}
-
-} // llvm namespace
-
-char DebugIR::ID = 0;
-INITIALIZE_PASS(DebugIR, "debug-ir", "Enable debugging IR", false, false)
-
-ModulePass *llvm::createDebugIRPass(bool HideDebugIntrinsics,
-                                    bool HideDebugMetadata, StringRef Directory,
-                                    StringRef Filename) {
-  return new DebugIR(HideDebugIntrinsics, HideDebugMetadata, Directory,
-                     Filename);
-}
-
-ModulePass *llvm::createDebugIRPass() { return new DebugIR(); }
diff --git a/lib/Transforms/Instrumentation/DebugIR.h b/lib/Transforms/Instrumentation/DebugIR.h
deleted file mode 100644
index 8d74a4d..0000000
--- a/lib/Transforms/Instrumentation/DebugIR.h
+++ /dev/null
@@ -1,98 +0,0 @@
-//===- llvm/Transforms/Instrumentation/DebugIR.h - Interface ----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interface of the DebugIR pass. For most users,
-// including Instrumentation.h and calling createDebugIRPass() is sufficient and
-// there is no need to include this file.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
-#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
-
-#include "llvm/Pass.h"
-
-namespace llvm {
-
-class DebugIR : public llvm::ModulePass {
-  /// If true, write a source file to disk.
-  bool WriteSourceToDisk;
-
-  /// Hide certain (non-essential) debug information (only relevant if
-  /// createSource is true.
-  bool HideDebugIntrinsics;
-  bool HideDebugMetadata;
-
-  /// The location of the source file.
-  std::string Directory;
-  std::string Filename;
-
-  /// True if a temporary file name was generated.
-  bool GeneratedPath;
-
-  /// True if the file name was read from the Module.
-  bool ParsedPath;
-
-public:
-  static char ID;
-
-  const char *getPassName() const override { return "DebugIR"; }
-
-  /// Generate a file on disk to be displayed in a debugger. If Filename and
-  /// Directory are empty, a temporary path will be generated.
-  DebugIR(bool HideDebugIntrinsics, bool HideDebugMetadata,
-          llvm::StringRef Directory, llvm::StringRef Filename)
-      : ModulePass(ID), WriteSourceToDisk(true),
-        HideDebugIntrinsics(HideDebugIntrinsics),
-        HideDebugMetadata(HideDebugMetadata), Directory(Directory),
-        Filename(Filename), GeneratedPath(false), ParsedPath(false) {}
-
-  /// Modify input in-place; do not generate additional files, and do not hide
-  /// any debug intrinsics/metadata that might be present.
-  DebugIR()
-      : ModulePass(ID), WriteSourceToDisk(false), HideDebugIntrinsics(false),
-        HideDebugMetadata(false), GeneratedPath(false), ParsedPath(false) {}
-
-  /// Run pass on M and set Path to the source file path in the output module.
-  bool runOnModule(llvm::Module &M, std::string &Path);
-  bool runOnModule(llvm::Module &M) override;
-
-private:
-
-  /// Returns the concatenated Directory + Filename, without error checking
-  std::string getPath();
-
-  /// Attempts to read source information from debug information in M, and if
-  /// that fails, from M's identifier. Returns true on success, false otherwise.
-  bool getSourceInfo(const llvm::Module &M);
-
-  /// Replace the extension of Filename with NewExtension, and return true if
-  /// successful. Return false if extension could not be found or Filename is
-  /// empty.
-  bool updateExtension(llvm::StringRef NewExtension);
-
-  /// Generate a temporary filename and open an fd
-  void generateFilename(std::unique_ptr<int> &fd);
-
-  /// Creates DWARF CU/Subroutine metadata
-  void createDebugInfo(llvm::Module &M,
-                       std::unique_ptr<llvm::Module> &DisplayM);
-
-  /// Returns true if either Directory or Filename is missing, false otherwise.
-  bool isMissingPath();
-
-  /// Write M to disk, optionally passing in an fd to an open file which is
-  /// closed by this function after writing. If no fd is specified, a new file
-  /// is opened, written, and closed.
-  void writeDebugBitcode(const llvm::Module *M, int *fd = nullptr);
-};
-
-} // llvm namespace
-
-#endif
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 220d7f8..cb965fb 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -285,6 +285,14 @@ namespace {
       DeleteContainerSeconds(LinesByFile);
     }
 
+    GCOVBlock(const GCOVBlock &RHS) : GCOVRecord(RHS), Number(RHS.Number) {
+      // Only allow copy before edges and lines have been added. After that,
+      // there are inter-block pointers (eg: edges) that won't take kindly to
+      // blocks being copied or moved around.
+      assert(LinesByFile.empty());
+      assert(OutEdges.empty());
+    }
+
    private:
     friend class GCOVFunction;
 
@@ -303,18 +311,22 @@ namespace {
   // object users can construct, the blocks and lines will be rooted here.
   class GCOVFunction : public GCOVRecord {
    public:
-    GCOVFunction(DISubprogram SP, raw_ostream *os, uint32_t Ident,
-                 bool UseCfgChecksum) :
-        SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0) {
+     GCOVFunction(DISubprogram SP, raw_ostream *os, uint32_t Ident,
+                  bool UseCfgChecksum)
+         : SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0),
+           ReturnBlock(1, os) {
       this->os = os;
 
       Function *F = SP.getFunction();
       DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
+
       uint32_t i = 0;
-      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-        Blocks[BB] = new GCOVBlock(i++, os);
+      for (auto &BB : *F) {
+        // Skip index 1 (0, 2, 3, 4, ...) because that's assigned to the
+        // ReturnBlock.
+        bool first = i == 0;
+        Blocks.insert(std::make_pair(&BB, GCOVBlock(i++ + !first, os)));
       }
-      ReturnBlock = new GCOVBlock(i++, os);
 
       std::string FunctionNameAndLine;
       raw_string_ostream FNLOS(FunctionNameAndLine);
@@ -323,17 +335,12 @@ namespace {
       FuncChecksum = hash_value(FunctionNameAndLine);
     }
 
-    ~GCOVFunction() {
-      DeleteContainerSeconds(Blocks);
-      delete ReturnBlock;
-    }
-
     GCOVBlock &getBlock(BasicBlock *BB) {
-      return *Blocks[BB];
+      return Blocks.find(BB)->second;
     }
 
     GCOVBlock &getReturnBlock() {
-      return *ReturnBlock;
+      return ReturnBlock;
     }
 
     std::string getEdgeDestinations() {
@@ -341,7 +348,7 @@ namespace {
       raw_string_ostream EDOS(EdgeDestinations);
       Function *F = Blocks.begin()->first->getParent();
       for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
-        GCOVBlock &Block = *Blocks[I];
+        GCOVBlock &Block = getBlock(I);
         for (int i = 0, e = Block.OutEdges.size(); i != e; ++i)
           EDOS << Block.OutEdges[i]->Number;
       }
@@ -383,7 +390,7 @@ namespace {
       if (Blocks.empty()) return;
       Function *F = Blocks.begin()->first->getParent();
       for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
-        GCOVBlock &Block = *Blocks[I];
+        GCOVBlock &Block = getBlock(I);
         if (Block.OutEdges.empty()) continue;
 
         writeBytes(EdgeTag, 4);
@@ -399,7 +406,7 @@ namespace {
 
       // Emit lines for each block.
       for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
-        Blocks[I]->writeOut();
+        getBlock(I).writeOut();
       }
     }
 
@@ -409,8 +416,8 @@ namespace {
     uint32_t FuncChecksum;
     bool UseCfgChecksum;
     uint32_t CfgChecksum;
-    DenseMap<BasicBlock *, GCOVBlock *> Blocks;
-    GCOVBlock *ReturnBlock;
+    DenseMap<BasicBlock *, GCOVBlock> Blocks;
+    GCOVBlock ReturnBlock;
   };
 }
 
diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp
new file mode 100644
index 0000000..b5a491f
--- /dev/null
+++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -0,0 +1,351 @@
+//===-- InstrProfiling.cpp - Frontend instrumentation based profiling -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers instrprof_increment intrinsics emitted by a frontend for
+// profiling. It also builds the data structures and initialization code needed
+// for updating execution counts and emitting the profile at runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation.h"
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "instrprof"
+
+namespace {
+
+class InstrProfiling : public ModulePass {
+public:
+  static char ID;
+
+  InstrProfiling() : ModulePass(ID) {}
+
+  InstrProfiling(const InstrProfOptions &Options)
+      : ModulePass(ID), Options(Options) {}
+
+  const char *getPassName() const override {
+    return "Frontend instrumentation-based coverage lowering";
+  }
+
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+  }
+
+private:
+  InstrProfOptions Options;
+  Module *M;
+  DenseMap<GlobalVariable *, GlobalVariable *> RegionCounters;
+  std::vector<Value *> UsedVars;
+
+  bool isMachO() const {
+    return Triple(M->getTargetTriple()).isOSBinFormatMachO();
+  }
+
+  /// Get the section name for the counter variables.
+  StringRef getCountersSection() const {
+    return isMachO() ? "__DATA,__llvm_prf_cnts" : "__llvm_prf_cnts";
+  }
+
+  /// Get the section name for the name variables.
+  StringRef getNameSection() const {
+    return isMachO() ? "__DATA,__llvm_prf_names" : "__llvm_prf_names";
+  }
+
+  /// Get the section name for the profile data variables.
+  StringRef getDataSection() const {
+    return isMachO() ? "__DATA,__llvm_prf_data" : "__llvm_prf_data";
+  }
+
+  /// Get the section name for the coverage mapping data.
+  StringRef getCoverageSection() const {
+    return isMachO() ? "__DATA,__llvm_covmap" : "__llvm_covmap";
+  }
+
+  /// Replace instrprof_increment with an increment of the appropriate value.
+  void lowerIncrement(InstrProfIncrementInst *Inc);
+
+  /// Set up the section and uses for coverage data and its references.
+  void lowerCoverageData(GlobalVariable *CoverageData);
+
+  /// Get the region counters for an increment, creating them if necessary.
+  ///
+  /// If the counter array doesn't yet exist, the profile data variables
+  /// referring to them will also be created.
+  GlobalVariable *getOrCreateRegionCounters(InstrProfIncrementInst *Inc);
+
+  /// Emit runtime registration functions for each profile data variable.
+  void emitRegistration();
+
+  /// Emit the necessary plumbing to pull in the runtime initialization.
+  void emitRuntimeHook();
+
+  /// Add uses of our data variables and runtime hook.
+  void emitUses();
+
+  /// Create a static initializer for our data, on platforms that need it.
+  void emitInitialization();
+};
+
+} // anonymous namespace
+
+char InstrProfiling::ID = 0;
+INITIALIZE_PASS(InstrProfiling, "instrprof",
+                "Frontend instrumentation-based coverage lowering.", false,
+                false)
+
+ModulePass *llvm::createInstrProfilingPass(const InstrProfOptions &Options) {
+  return new InstrProfiling(Options);
+}
+
+bool InstrProfiling::runOnModule(Module &M) {
+  bool MadeChange = false;
+
+  this->M = &M;
+  RegionCounters.clear();
+  UsedVars.clear();
+
+  for (Function &F : M)
+    for (BasicBlock &BB : F)
+      for (auto I = BB.begin(), E = BB.end(); I != E;)
+        if (auto *Inc = dyn_cast<InstrProfIncrementInst>(I++)) {
+          lowerIncrement(Inc);
+          MadeChange = true;
+        }
+  if (GlobalVariable *Coverage = M.getNamedGlobal("__llvm_coverage_mapping")) {
+    lowerCoverageData(Coverage);
+    MadeChange = true;
+  }
+  if (!MadeChange)
+    return false;
+
+  emitRegistration();
+  emitRuntimeHook();
+  emitUses();
+  emitInitialization();
+  return true;
+}
+
+void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
+  GlobalVariable *Counters = getOrCreateRegionCounters(Inc);
+
+  IRBuilder<> Builder(Inc->getParent(), *Inc);
+  uint64_t Index = Inc->getIndex()->getZExtValue();
+  llvm::Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index);
+  llvm::Value *Count = Builder.CreateLoad(Addr, "pgocount");
+  Count = Builder.CreateAdd(Count, Builder.getInt64(1));
+  Inc->replaceAllUsesWith(Builder.CreateStore(Count, Addr));
+  Inc->eraseFromParent();
+}
+
+void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageData) {
+  CoverageData->setSection(getCoverageSection());
+  CoverageData->setAlignment(8);
+
+  Constant *Init = CoverageData->getInitializer();
+  // We're expecting { i32, i32, i32, i32, [n x { i8*, i32, i32 }], [m x i8] }
+  // for some C. If not, the frontend's given us something broken.
+  assert(Init->getNumOperands() == 6 && "bad number of fields in coverage map");
+  assert(isa<ConstantArray>(Init->getAggregateElement(4)) &&
+         "invalid function list in coverage map");
+  ConstantArray *Records = cast<ConstantArray>(Init->getAggregateElement(4));
+  for (unsigned I = 0, E = Records->getNumOperands(); I < E; ++I) {
+    Constant *Record = Records->getOperand(I);
+    Value *V = const_cast<Value *>(Record->getOperand(0))->stripPointerCasts();
+
+    assert(isa<GlobalVariable>(V) && "Missing reference to function name");
+    GlobalVariable *Name = cast<GlobalVariable>(V);
+
+    // If we have region counters for this name, we've already handled it.
+    auto It = RegionCounters.find(Name);
+    if (It != RegionCounters.end())
+      continue;
+
+    // Move the name variable to the right section.
+    Name->setSection(getNameSection());
+    Name->setAlignment(1);
+  }
+}
+
+/// Get the name of a profiling variable for a particular function.
+static std::string getVarName(InstrProfIncrementInst *Inc, StringRef VarName) {
+  auto *Arr = cast<ConstantDataArray>(Inc->getName()->getInitializer());
+  StringRef Name = Arr->isCString() ? Arr->getAsCString() : Arr->getAsString();
+  return ("__llvm_profile_" + VarName + "_" + Name).str();
+}
+
+GlobalVariable *
+InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
+  GlobalVariable *Name = Inc->getName();
+  auto It = RegionCounters.find(Name);
+  if (It != RegionCounters.end())
+    return It->second;
+
+  // Move the name variable to the right section.
+  Name->setSection(getNameSection());
+  Name->setAlignment(1);
+
+  uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
+  LLVMContext &Ctx = M->getContext();
+  ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);
+
+  // Create the counters variable.
+  auto *Counters = new GlobalVariable(*M, CounterTy, false, Name->getLinkage(),
+                                      Constant::getNullValue(CounterTy),
+                                      getVarName(Inc, "counters"));
+  Counters->setVisibility(Name->getVisibility());
+  Counters->setSection(getCountersSection());
+  Counters->setAlignment(8);
+
+  RegionCounters[Inc->getName()] = Counters;
+
+  // Create data variable.
+  auto *NameArrayTy = Name->getType()->getPointerElementType();
+  auto *Int32Ty = Type::getInt32Ty(Ctx);
+  auto *Int64Ty = Type::getInt64Ty(Ctx);
+  auto *Int8PtrTy = Type::getInt8PtrTy(Ctx);
+  auto *Int64PtrTy = Type::getInt64PtrTy(Ctx);
+
+  Type *DataTypes[] = {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int64PtrTy};
+  auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes));
+  Constant *DataVals[] = {
+      ConstantInt::get(Int32Ty, NameArrayTy->getArrayNumElements()),
+      ConstantInt::get(Int32Ty, NumCounters),
+      ConstantInt::get(Int64Ty, Inc->getHash()->getZExtValue()),
+      ConstantExpr::getBitCast(Name, Int8PtrTy),
+      ConstantExpr::getBitCast(Counters, Int64PtrTy)};
+  auto *Data = new GlobalVariable(*M, DataTy, true, Name->getLinkage(),
+                                  ConstantStruct::get(DataTy, DataVals),
+                                  getVarName(Inc, "data"));
+  Data->setVisibility(Name->getVisibility());
+  Data->setSection(getDataSection());
+  Data->setAlignment(8);
+
+  // Mark the data variable as used so that it isn't stripped out.
+  UsedVars.push_back(Data);
+
+  return Counters;
+}
+
+void InstrProfiling::emitRegistration() {
+  // Don't do this for Darwin.  compiler-rt uses linker magic.
+  if (Triple(M->getTargetTriple()).isOSDarwin())
+    return;
+
+  // Construct the function.
+  auto *VoidTy = Type::getVoidTy(M->getContext());
+  auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext());
+  auto *RegisterFTy = FunctionType::get(VoidTy, false);
+  auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage,
+                                     "__llvm_profile_register_functions", M);
+  RegisterF->setUnnamedAddr(true);
+  if (Options.NoRedZone)
+    RegisterF->addFnAttr(Attribute::NoRedZone);
+
+  auto *RuntimeRegisterTy = llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
+  auto *RuntimeRegisterF =
+      Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage,
+                       "__llvm_profile_register_function", M);
+
+  IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF));
+  for (Value *Data : UsedVars)
+    IRB.CreateCall(RuntimeRegisterF, IRB.CreateBitCast(Data, VoidPtrTy));
+  IRB.CreateRetVoid();
+}
+
+void InstrProfiling::emitRuntimeHook() {
+  const char *const RuntimeVarName = "__llvm_profile_runtime";
+  const char *const RuntimeUserName = "__llvm_profile_runtime_user";
+
+  // If the module's provided its own runtime, we don't need to do anything.
+  if (M->getGlobalVariable(RuntimeVarName))
+    return;
+
+  // Declare an external variable that will pull in the runtime initialization.
+  auto *Int32Ty = Type::getInt32Ty(M->getContext());
+  auto *Var =
+      new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
+                         nullptr, RuntimeVarName);
+
+  // Make a function that uses it.
+  auto *User =
+      Function::Create(FunctionType::get(Int32Ty, false),
+                       GlobalValue::LinkOnceODRLinkage, RuntimeUserName, M);
+  User->addFnAttr(Attribute::NoInline);
+  if (Options.NoRedZone)
+    User->addFnAttr(Attribute::NoRedZone);
+  User->setVisibility(GlobalValue::HiddenVisibility);
+
+  IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User));
+  auto *Load = IRB.CreateLoad(Var);
+  IRB.CreateRet(Load);
+
+  // Mark the user variable as used so that it isn't stripped out.
+  UsedVars.push_back(User);
+}
+
+void InstrProfiling::emitUses() {
+  if (UsedVars.empty())
+    return;
+
+  GlobalVariable *LLVMUsed = M->getGlobalVariable("llvm.used");
+  std::vector<Constant*> MergedVars;
+  if (LLVMUsed) {
+    // Collect the existing members of llvm.used.
+    ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+    for (unsigned I = 0, E = Inits->getNumOperands(); I != E; ++I)
+      MergedVars.push_back(Inits->getOperand(I));
+    LLVMUsed->eraseFromParent();
+  }
+
+  Type *i8PTy = Type::getInt8PtrTy(M->getContext());
+  // Add uses for our data.
+  for (auto *Value : UsedVars)
+    MergedVars.push_back(
+        ConstantExpr::getBitCast(cast<llvm::Constant>(Value), i8PTy));
+
+  // Recreate llvm.used.
+  ArrayType *ATy = ArrayType::get(i8PTy, MergedVars.size());
+  LLVMUsed = new llvm::GlobalVariable(
+      *M, ATy, false, llvm::GlobalValue::AppendingLinkage,
+      llvm::ConstantArray::get(ATy, MergedVars), "llvm.used");
+
+  LLVMUsed->setSection("llvm.metadata");
+}
+
+void InstrProfiling::emitInitialization() {
+  Constant *RegisterF = M->getFunction("__llvm_profile_register_functions");
+  if (!RegisterF)
+    return;
+
+  // Create the initialization function.
+  auto *VoidTy = Type::getVoidTy(M->getContext());
+  auto *F =
+      Function::Create(FunctionType::get(VoidTy, false),
+                       GlobalValue::InternalLinkage, "__llvm_profile_init", M);
+  F->setUnnamedAddr(true);
+  F->addFnAttr(Attribute::NoInline);
+  if (Options.NoRedZone)
+    F->addFnAttr(Attribute::NoRedZone);
+
+  // Add the basic block and the necessary calls.
+  IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F));
+  IRB.CreateCall(RegisterF);
+  IRB.CreateRetVoid();
+
+  appendToGlobalCtors(*M, F, 0);
+}
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 8e95367..a91fc0e 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -25,6 +25,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeAddressSanitizerModulePass(Registry);
   initializeBoundsCheckingPass(Registry);
   initializeGCOVProfilerPass(Registry);
+  initializeInstrProfilingPass(Registry);
   initializeMemorySanitizerPass(Registry);
   initializeThreadSanitizerPass(Registry);
   initializeSanitizerCoverageModulePass(Registry);
diff --git a/lib/Transforms/Instrumentation/LLVMBuild.txt b/lib/Transforms/Instrumentation/LLVMBuild.txt
index 99e95df..14c1743 100644
--- a/lib/Transforms/Instrumentation/LLVMBuild.txt
+++ b/lib/Transforms/Instrumentation/LLVMBuild.txt
@@ -19,4 +19,4 @@
 type = Library
 name = Instrumentation
 parent = Transforms
-required_libraries = Analysis Core Support Target TransformUtils
+required_libraries = Analysis Core MC Support TransformUtils
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 1261259..4152679 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -120,10 +120,7 @@ using namespace llvm;
 
 #define DEBUG_TYPE "msan"
 
-static const uint64_t kShadowMask32 = 1ULL << 31;
-static const uint64_t kShadowMask64 = 1ULL << 46;
-static const uint64_t kOriginOffset32 = 1ULL << 30;
-static const uint64_t kOriginOffset64 = 1ULL << 45;
+static const unsigned kOriginSize = 4;
 static const unsigned kMinOriginAlignment = 4;
 static const unsigned kShadowTLSAlignment = 8;
 
@@ -187,18 +184,6 @@ static cl::opt<int> ClInstrumentationWithCallThreshold(
         "inline checks (-1 means never use callbacks)."),
     cl::Hidden, cl::init(3500));
 
-// Experimental. Wraps all indirect calls in the instrumented code with
-// a call to the given function. This is needed to assist the dynamic
-// helper tool (MSanDR) to regain control on transition between instrumented and
-// non-instrumented code.
-static cl::opt<std::string> ClWrapIndirectCalls("msan-wrap-indirect-calls",
-       cl::desc("Wrap indirect calls with a given function"),
-       cl::Hidden);
-
-static cl::opt<bool> ClWrapIndirectCallsFast("msan-wrap-indirect-calls-fast",
-       cl::desc("Do not wrap indirect calls with target in the same module"),
-       cl::Hidden, cl::init(true));
-
 // This is an experiment to enable handling of cases where shadow is a non-zero
 // compile-time constant. For some unexplainable reason they were silently
 // ignored in the instrumentation.
@@ -208,6 +193,77 @@ static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow",
 
 namespace {
 
+// Memory map parameters used in application-to-shadow address calculation.
+// Offset = (Addr & ~AndMask) ^ XorMask
+// Shadow = ShadowBase + Offset
+// Origin = OriginBase + Offset
+struct MemoryMapParams {
+  uint64_t AndMask;
+  uint64_t XorMask;
+  uint64_t ShadowBase;
+  uint64_t OriginBase;
+};
+
+struct PlatformMemoryMapParams {
+  const MemoryMapParams *bits32;
+  const MemoryMapParams *bits64;
+};
+
+// i386 Linux
+static const MemoryMapParams Linux_I386_MemoryMapParams = {
+  0x000080000000,  // AndMask
+  0,               // XorMask (not used)
+  0,               // ShadowBase (not used)
+  0x000040000000,  // OriginBase
+};
+
+// x86_64 Linux
+static const MemoryMapParams Linux_X86_64_MemoryMapParams = {
+  0x400000000000,  // AndMask
+  0,               // XorMask (not used)
+  0,               // ShadowBase (not used)
+  0x200000000000,  // OriginBase
+};
+
+// mips64 Linux
+static const MemoryMapParams Linux_MIPS64_MemoryMapParams = {
+  0x004000000000,  // AndMask
+  0,               // XorMask (not used)
+  0,               // ShadowBase (not used)
+  0x002000000000,  // OriginBase
+};
+
+// i386 FreeBSD
+static const MemoryMapParams FreeBSD_I386_MemoryMapParams = {
+  0x000180000000,  // AndMask
+  0x000040000000,  // XorMask
+  0x000020000000,  // ShadowBase
+  0x000700000000,  // OriginBase
+};
+
+// x86_64 FreeBSD
+static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = {
+  0xc00000000000,  // AndMask
+  0x200000000000,  // XorMask
+  0x100000000000,  // ShadowBase
+  0x380000000000,  // OriginBase
+};
+
+static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = {
+  &Linux_I386_MemoryMapParams,
+  &Linux_X86_64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = {
+  NULL,
+  &Linux_MIPS64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = {
+  &FreeBSD_I386_MemoryMapParams,
+  &FreeBSD_X86_64_MemoryMapParams,
+};
+
 /// \brief An instrumentation pass implementing detection of uninitialized
 /// reads.
 ///
@@ -219,8 +275,7 @@ class MemorySanitizer : public FunctionPass {
       : FunctionPass(ID),
         TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)),
         DL(nullptr),
-        WarningFn(nullptr),
-        WrapIndirectCalls(!ClWrapIndirectCalls.empty()) {}
+        WarningFn(nullptr) {}
   const char *getPassName() const override { return "MemorySanitizer"; }
   bool runOnFunction(Function &F) override;
   bool doInitialization(Module &M) override;
@@ -254,9 +309,6 @@ class MemorySanitizer : public FunctionPass {
   /// function.
   GlobalVariable *OriginTLS;
 
-  GlobalVariable *MsandrModuleStart;
-  GlobalVariable *MsandrModuleEnd;
-
   /// \brief The run-time callback to print a warning.
   Value *WarningFn;
   // These arrays are indexed by log2(AccessSize).
@@ -274,27 +326,18 @@ class MemorySanitizer : public FunctionPass {
   /// \brief MSan runtime replacements for memmove, memcpy and memset.
   Value *MemmoveFn, *MemcpyFn, *MemsetFn;
 
-  /// \brief Address mask used in application-to-shadow address calculation.
-  /// ShadowAddr is computed as ApplicationAddr & ~ShadowMask.
-  uint64_t ShadowMask;
-  /// \brief Offset of the origin shadow from the "normal" shadow.
-  /// OriginAddr is computed as (ShadowAddr + OriginOffset) & ~3ULL
-  uint64_t OriginOffset;
-  /// \brief Branch weights for error reporting.
+  /// \brief Memory map parameters used in application-to-shadow calculation.
+  const MemoryMapParams *MapParams;
+
   MDNode *ColdCallWeights;
   /// \brief Branch weights for origin store.
   MDNode *OriginStoreWeights;
   /// \brief An empty volatile inline asm that prevents callback merge.
   InlineAsm *EmptyAsm;
 
-  bool WrapIndirectCalls;
-  /// \brief Run-time wrapper for indirect calls.
-  Value *IndirectCallWrapperFn;
-  // Argument and return type of IndirectCallWrapperFn: void (*f)(void).
-  Type *AnyFunctionPtrTy;
-
   friend struct MemorySanitizerVisitor;
   friend struct VarArgAMD64Helper;
+  friend struct VarArgMIPS64Helper;
 };
 }  // namespace
 
@@ -400,24 +443,6 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
                             StringRef(""), StringRef(""),
                             /*hasSideEffects=*/true);
-
-  if (WrapIndirectCalls) {
-    AnyFunctionPtrTy =
-        PointerType::getUnqual(FunctionType::get(IRB.getVoidTy(), false));
-    IndirectCallWrapperFn = M.getOrInsertFunction(
-        ClWrapIndirectCalls, AnyFunctionPtrTy, AnyFunctionPtrTy, nullptr);
-  }
-
-  if (WrapIndirectCalls && ClWrapIndirectCallsFast) {
-    MsandrModuleStart = new GlobalVariable(
-        M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage,
-        nullptr, "__executable_start");
-    MsandrModuleStart->setVisibility(GlobalVariable::HiddenVisibility);
-    MsandrModuleEnd = new GlobalVariable(
-        M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage,
-        nullptr, "_end");
-    MsandrModuleEnd->setVisibility(GlobalVariable::HiddenVisibility);
-  }
 }
 
 /// \brief Module-level initialization.
@@ -429,22 +454,41 @@ bool MemorySanitizer::doInitialization(Module &M) {
     report_fatal_error("data layout missing");
   DL = &DLP->getDataLayout();
 
-  C = &(M.getContext());
-  unsigned PtrSize = DL->getPointerSizeInBits(/* AddressSpace */0);
-  switch (PtrSize) {
-    case 64:
-      ShadowMask = kShadowMask64;
-      OriginOffset = kOriginOffset64;
+  Triple TargetTriple(M.getTargetTriple());
+  switch (TargetTriple.getOS()) {
+    case Triple::FreeBSD:
+      switch (TargetTriple.getArch()) {
+        case Triple::x86_64:
+          MapParams = FreeBSD_X86_MemoryMapParams.bits64;
+          break;
+        case Triple::x86:
+          MapParams = FreeBSD_X86_MemoryMapParams.bits32;
+          break;
+        default:
+          report_fatal_error("unsupported architecture");
+      }
       break;
-    case 32:
-      ShadowMask = kShadowMask32;
-      OriginOffset = kOriginOffset32;
+    case Triple::Linux:
+      switch (TargetTriple.getArch()) {
+        case Triple::x86_64:
+          MapParams = Linux_X86_MemoryMapParams.bits64;
+          break;
+        case Triple::x86:
+          MapParams = Linux_X86_MemoryMapParams.bits32;
+          break;
+        case Triple::mips64:
+        case Triple::mips64el:
+          MapParams = Linux_MIPS_MemoryMapParams.bits64;
+          break;
+        default:
+          report_fatal_error("unsupported architecture");
+      }
       break;
     default:
-      report_fatal_error("unsupported pointer size");
-      break;
+      report_fatal_error("unsupported operating system");
   }
 
+  C = &(M.getContext());
   IRBuilder<> IRB(*C);
   IntptrTy = IRB.getIntPtrTy(DL);
   OriginTy = IRB.getInt32Ty();
@@ -537,12 +581,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   };
   SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
   SmallVector<Instruction*, 16> StoreList;
-  SmallVector<CallSite, 16> IndirectCallList;
 
   MemorySanitizerVisitor(Function &F, MemorySanitizer &MS)
       : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) {
-    bool SanitizeFunction = F.getAttributes().hasAttribute(
-        AttributeSet::FunctionIndex, Attribute::SanitizeMemory);
+    bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory);
     InsertChecks = SanitizeFunction;
     PropagateShadow = SanitizeFunction;
     PoisonStack = SanitizeFunction && ClPoisonStack;
@@ -561,18 +603,63 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     return IRB.CreateCall(MS.MsanChainOriginFn, V);
   }
 
+  Value *originToIntptr(IRBuilder<> &IRB, Value *Origin) {
+    unsigned IntptrSize = MS.DL->getTypeStoreSize(MS.IntptrTy);
+    if (IntptrSize == kOriginSize) return Origin;
+    assert(IntptrSize == kOriginSize * 2);
+    Origin = IRB.CreateIntCast(Origin, MS.IntptrTy, /* isSigned */ false);
+    return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8));
+  }
+
+  /// \brief Fill memory range with the given origin value.
+  void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr,
+                   unsigned Size, unsigned Alignment) {
+    unsigned IntptrAlignment = MS.DL->getABITypeAlignment(MS.IntptrTy);
+    unsigned IntptrSize = MS.DL->getTypeStoreSize(MS.IntptrTy);
+    assert(IntptrAlignment >= kMinOriginAlignment);
+    assert(IntptrSize >= kOriginSize);
+
+    unsigned Ofs = 0;
+    unsigned CurrentAlignment = Alignment;
+    if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) {
+      Value *IntptrOrigin = originToIntptr(IRB, Origin);
+      Value *IntptrOriginPtr =
+          IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0));
+      for (unsigned i = 0; i < Size / IntptrSize; ++i) {
+        Value *Ptr =
+            i ? IRB.CreateConstGEP1_32(IntptrOriginPtr, i) : IntptrOriginPtr;
+        IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment);
+        Ofs += IntptrSize / kOriginSize;
+        CurrentAlignment = IntptrAlignment;
+      }
+    }
+
+    for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) {
+      Value *GEP = i ? IRB.CreateConstGEP1_32(OriginPtr, i) : OriginPtr;
+      IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment);
+      CurrentAlignment = kMinOriginAlignment;
+    }
+  }
+
   void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin,
                    unsigned Alignment, bool AsCall) {
+    unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment);
+    unsigned StoreSize = MS.DL->getTypeStoreSize(Shadow->getType());
     if (isa<StructType>(Shadow->getType())) {
-      IRB.CreateAlignedStore(updateOrigin(Origin, IRB), getOriginPtr(Addr, IRB),
-                             Alignment);
+      paintOrigin(IRB, updateOrigin(Origin, IRB),
+                  getOriginPtr(Addr, IRB, Alignment), StoreSize,
+                  OriginAlignment);
     } else {
       Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
-      // TODO(eugenis): handle non-zero constant shadow by inserting an
-      // unconditional check (can not simply fail compilation as this could
-      // be in the dead code).
-      if (!ClCheckConstantShadow)
-        if (isa<Constant>(ConvertedShadow)) return;
+      Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow);
+      if (ConstantShadow) {
+        if (ClCheckConstantShadow && !ConstantShadow->isZeroValue())
+          paintOrigin(IRB, updateOrigin(Origin, IRB),
+                      getOriginPtr(Addr, IRB, Alignment), StoreSize,
+                      OriginAlignment);
+        return;
+      }
+
       unsigned TypeSizeInBits =
           MS.DL->getTypeSizeInBits(ConvertedShadow->getType());
       unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
@@ -589,8 +676,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         Instruction *CheckTerm = SplitBlockAndInsertIfThen(
             Cmp, IRB.GetInsertPoint(), false, MS.OriginStoreWeights);
         IRBuilder<> IRBNew(CheckTerm);
-        IRBNew.CreateAlignedStore(updateOrigin(Origin, IRBNew),
-                                  getOriginPtr(Addr, IRBNew), Alignment);
+        paintOrigin(IRBNew, updateOrigin(Origin, IRBNew),
+                    getOriginPtr(Addr, IRBNew, Alignment), StoreSize,
+                    OriginAlignment);
       }
     }
   }
@@ -614,11 +702,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
       if (SI.isAtomic()) SI.setOrdering(addReleaseOrdering(SI.getOrdering()));
 
-      if (MS.TrackOrigins) {
-        unsigned Alignment = std::max(kMinOriginAlignment, SI.getAlignment());
-        storeOrigin(IRB, Addr, Shadow, getOrigin(Val), Alignment,
+      if (MS.TrackOrigins && !SI.isAtomic())
+        storeOrigin(IRB, Addr, Shadow, getOrigin(Val), SI.getAlignment(),
                     InstrumentWithCalls);
-      }
     }
   }
 
@@ -628,9 +714,23 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     DEBUG(dbgs() << "  SHAD0 : " << *Shadow << "\n");
     Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
     DEBUG(dbgs() << "  SHAD1 : " << *ConvertedShadow << "\n");
-    // See the comment in storeOrigin().
-    if (!ClCheckConstantShadow)
-      if (isa<Constant>(ConvertedShadow)) return;
+
+    Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow);
+    if (ConstantShadow) {
+      if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) {
+        if (MS.TrackOrigins) {
+          IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0),
+                          MS.OriginTLS);
+        }
+        IRB.CreateCall(MS.WarningFn);
+        IRB.CreateCall(MS.EmptyAsm);
+        // FIXME: Insert UnreachableInst if !ClKeepGoing?
+        // This may invalidate some of the following checks and needs to be done
+        // at the very end.
+      }
+      return;
+    }
+
     unsigned TypeSizeInBits =
         MS.DL->getTypeSizeInBits(ConvertedShadow->getType());
     unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
@@ -669,47 +769,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     DEBUG(dbgs() << "DONE:\n" << F);
   }
 
-  void materializeIndirectCalls() {
-    for (auto &CS : IndirectCallList) {
-      Instruction *I = CS.getInstruction();
-      BasicBlock *B = I->getParent();
-      IRBuilder<> IRB(I);
-      Value *Fn0 = CS.getCalledValue();
-      Value *Fn = IRB.CreateBitCast(Fn0, MS.AnyFunctionPtrTy);
-
-      if (ClWrapIndirectCallsFast) {
-        // Check that call target is inside this module limits.
-        Value *Start =
-            IRB.CreateBitCast(MS.MsandrModuleStart, MS.AnyFunctionPtrTy);
-        Value *End = IRB.CreateBitCast(MS.MsandrModuleEnd, MS.AnyFunctionPtrTy);
-
-        Value *NotInThisModule = IRB.CreateOr(IRB.CreateICmpULT(Fn, Start),
-                                              IRB.CreateICmpUGE(Fn, End));
-
-        PHINode *NewFnPhi =
-            IRB.CreatePHI(Fn0->getType(), 2, "msandr.indirect_target");
-
-        Instruction *CheckTerm = SplitBlockAndInsertIfThen(
-            NotInThisModule, NewFnPhi,
-            /* Unreachable */ false, MS.ColdCallWeights);
-
-        IRB.SetInsertPoint(CheckTerm);
-        // Slow path: call wrapper function to possibly transform the call
-        // target.
-        Value *NewFn = IRB.CreateBitCast(
-            IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType());
-
-        NewFnPhi->addIncoming(Fn0, B);
-        NewFnPhi->addIncoming(NewFn, dyn_cast<Instruction>(NewFn)->getParent());
-        CS.setCalledFunction(NewFnPhi);
-      } else {
-        Value *NewFn = IRB.CreateBitCast(
-            IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType());
-        CS.setCalledFunction(NewFn);
-      }
-    }
-  }
-
   /// \brief Add MemorySanitizer instrumentation to a function.
   bool runOnFunction() {
     MS.initializeCallbacks(*F.getParent());
@@ -752,9 +811,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // Insert shadow value checks.
     materializeChecks(InstrumentWithCalls);
 
-    // Wrap indirect calls.
-    materializeIndirectCalls();
-
     return true;
   }
 
@@ -808,32 +864,57 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     return IRB.CreateBitCast(V, NoVecTy);
   }
 
+  /// \brief Compute the integer shadow offset that corresponds to a given
+  /// application address.
+  ///
+  /// Offset = (Addr & ~AndMask) ^ XorMask
+  Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) {
+    uint64_t AndMask = MS.MapParams->AndMask;
+    assert(AndMask != 0 && "AndMask shall be specified");
+    Value *OffsetLong =
+      IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy),
+                    ConstantInt::get(MS.IntptrTy, ~AndMask));
+
+    uint64_t XorMask = MS.MapParams->XorMask;
+    if (XorMask != 0)
+      OffsetLong = IRB.CreateXor(OffsetLong,
+                                 ConstantInt::get(MS.IntptrTy, XorMask));
+    return OffsetLong;
+  }
+
   /// \brief Compute the shadow address that corresponds to a given application
   /// address.
   ///
-  /// Shadow = Addr & ~ShadowMask.
+  /// Shadow = ShadowBase + Offset
   Value *getShadowPtr(Value *Addr, Type *ShadowTy,
                       IRBuilder<> &IRB) {
-    Value *ShadowLong =
-      IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy),
-                    ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask));
+    Value *ShadowLong = getShadowPtrOffset(Addr, IRB);
+    uint64_t ShadowBase = MS.MapParams->ShadowBase;
+    if (ShadowBase != 0)
+      ShadowLong =
+        IRB.CreateAdd(ShadowLong,
+                      ConstantInt::get(MS.IntptrTy, ShadowBase));
     return IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0));
   }
 
   /// \brief Compute the origin address that corresponds to a given application
   /// address.
   ///
-  /// OriginAddr = (ShadowAddr + OriginOffset) & ~3ULL
-  Value *getOriginPtr(Value *Addr, IRBuilder<> &IRB) {
-    Value *ShadowLong =
-      IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy),
-                    ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask));
-    Value *Add =
-      IRB.CreateAdd(ShadowLong,
-                    ConstantInt::get(MS.IntptrTy, MS.OriginOffset));
-    Value *SecondAnd =
-      IRB.CreateAnd(Add, ConstantInt::get(MS.IntptrTy, ~3ULL));
-    return IRB.CreateIntToPtr(SecondAnd, PointerType::get(IRB.getInt32Ty(), 0));
+  /// OriginAddr = (OriginBase + Offset) & ~3ULL
+  Value *getOriginPtr(Value *Addr, IRBuilder<> &IRB, unsigned Alignment) {
+    Value *OriginLong = getShadowPtrOffset(Addr, IRB);
+    uint64_t OriginBase = MS.MapParams->OriginBase;
+    if (OriginBase != 0)
+      OriginLong =
+        IRB.CreateAdd(OriginLong,
+                      ConstantInt::get(MS.IntptrTy, OriginBase));
+    if (Alignment < kMinOriginAlignment) {
+      uint64_t Mask = kMinOriginAlignment - 1;
+      OriginLong = IRB.CreateAnd(OriginLong,
+                                 ConstantInt::get(MS.IntptrTy, ~Mask));
+    }
+    return IRB.CreateIntToPtr(OriginLong,
+                              PointerType::get(IRB.getInt32Ty(), 0));
   }
 
   /// \brief Compute the shadow address for a given function argument.
@@ -1006,6 +1087,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
             Value *OriginPtr =
                 getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
             setOrigin(A, EntryIRB.CreateLoad(OriginPtr));
+          } else {
+            setOrigin(A, getCleanOrigin());
           }
         }
         ArgOffset += RoundUpToAlignment(Size, kShadowTLSAlignment);
@@ -1025,15 +1108,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   /// \brief Get the origin for a value.
   Value *getOrigin(Value *V) {
     if (!MS.TrackOrigins) return nullptr;
-    if (isa<Instruction>(V) || isa<Argument>(V)) {
-      Value *Origin = OriginMap[V];
-      if (!Origin) {
-        DEBUG(dbgs() << "NO ORIGIN: " << *V << "\n");
-        Origin = getCleanOrigin();
-      }
-      return Origin;
-    }
-    return getCleanOrigin();
+    if (!PropagateShadow) return getCleanOrigin();
+    if (isa<Constant>(V)) return getCleanOrigin();
+    assert((isa<Instruction>(V) || isa<Argument>(V)) &&
+           "Unexpected value type in getOrigin()");
+    Value *Origin = OriginMap[V];
+    assert(Origin && "Missing origin");
+    return Origin;
   }
 
   /// \brief Get the origin for i-th argument of the instruction I.
@@ -1121,7 +1202,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     IRBuilder<> IRB(I.getNextNode());
     Type *ShadowTy = getShadowTy(&I);
     Value *Addr = I.getPointerOperand();
-    if (PropagateShadow) {
+    if (PropagateShadow && !I.getMetadata("nosanitize")) {
       Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
       setShadow(&I,
                 IRB.CreateAlignedLoad(ShadowPtr, I.getAlignment(), "_msld"));
@@ -1137,9 +1218,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
     if (MS.TrackOrigins) {
       if (PropagateShadow) {
-        unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment());
-        setOrigin(&I,
-                  IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB), Alignment));
+        unsigned Alignment = I.getAlignment();
+        unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment);
+        setOrigin(&I, IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB, Alignment),
+                                            OriginAlignment));
       } else {
         setOrigin(&I, getCleanOrigin());
       }
@@ -1173,6 +1255,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     IRB.CreateStore(getCleanShadow(&I), ShadowPtr);
 
     setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
   }
 
   void visitAtomicRMWInst(AtomicRMWInst &I) {
@@ -1790,7 +1873,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // FIXME: use ClStoreCleanOrigin
     // FIXME: factor out common code from materializeStores
     if (MS.TrackOrigins)
-      IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB));
+      IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB, 1));
     return true;
   }
 
@@ -1817,7 +1900,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
     if (MS.TrackOrigins) {
       if (PropagateShadow)
-        setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB)));
+        setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB, 1)));
       else
         setOrigin(&I, getCleanOrigin());
     }
@@ -1981,6 +2064,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       setOrigin(&I, getOrigin(CopyOp));
     } else {
       setShadow(&I, getCleanShadow(&I));
+      setOrigin(&I, getCleanOrigin());
     }
   }
 
@@ -2179,15 +2263,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     case llvm::Intrinsic::x86_sse_cvttps2pi:
       handleVectorConvertIntrinsic(I, 2);
       break;
-    case llvm::Intrinsic::x86_avx512_psll_dq:
-    case llvm::Intrinsic::x86_avx512_psrl_dq:
     case llvm::Intrinsic::x86_avx2_psll_w:
     case llvm::Intrinsic::x86_avx2_psll_d:
     case llvm::Intrinsic::x86_avx2_psll_q:
     case llvm::Intrinsic::x86_avx2_pslli_w:
     case llvm::Intrinsic::x86_avx2_pslli_d:
     case llvm::Intrinsic::x86_avx2_pslli_q:
-    case llvm::Intrinsic::x86_avx2_psll_dq:
     case llvm::Intrinsic::x86_avx2_psrl_w:
     case llvm::Intrinsic::x86_avx2_psrl_d:
     case llvm::Intrinsic::x86_avx2_psrl_q:
@@ -2198,14 +2279,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     case llvm::Intrinsic::x86_avx2_psrli_q:
     case llvm::Intrinsic::x86_avx2_psrai_w:
     case llvm::Intrinsic::x86_avx2_psrai_d:
-    case llvm::Intrinsic::x86_avx2_psrl_dq:
     case llvm::Intrinsic::x86_sse2_psll_w:
     case llvm::Intrinsic::x86_sse2_psll_d:
     case llvm::Intrinsic::x86_sse2_psll_q:
     case llvm::Intrinsic::x86_sse2_pslli_w:
     case llvm::Intrinsic::x86_sse2_pslli_d:
     case llvm::Intrinsic::x86_sse2_pslli_q:
-    case llvm::Intrinsic::x86_sse2_psll_dq:
     case llvm::Intrinsic::x86_sse2_psrl_w:
     case llvm::Intrinsic::x86_sse2_psrl_d:
     case llvm::Intrinsic::x86_sse2_psrl_q:
@@ -2216,7 +2295,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     case llvm::Intrinsic::x86_sse2_psrli_q:
     case llvm::Intrinsic::x86_sse2_psrai_w:
     case llvm::Intrinsic::x86_sse2_psrai_d:
-    case llvm::Intrinsic::x86_sse2_psrl_dq:
     case llvm::Intrinsic::x86_mmx_psll_w:
     case llvm::Intrinsic::x86_mmx_psll_d:
     case llvm::Intrinsic::x86_mmx_psll_q:
@@ -2248,14 +2326,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleVectorShiftIntrinsic(I, /* Variable */ true);
       break;
 
-    // Byte shifts are not implemented.
-    // case llvm::Intrinsic::x86_avx512_psll_dq_bs:
-    // case llvm::Intrinsic::x86_avx512_psrl_dq_bs:
-    // case llvm::Intrinsic::x86_avx2_psll_dq_bs:
-    // case llvm::Intrinsic::x86_avx2_psrl_dq_bs:
-    // case llvm::Intrinsic::x86_sse2_psll_dq_bs:
-    // case llvm::Intrinsic::x86_sse2_psrl_dq_bs:
-
     case llvm::Intrinsic::x86_sse2_packsswb_128:
     case llvm::Intrinsic::x86_sse2_packssdw_128:
     case llvm::Intrinsic::x86_sse2_packuswb_128:
@@ -2337,9 +2407,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     }
     IRBuilder<> IRB(&I);
 
-    if (MS.WrapIndirectCalls && !CS.getCalledFunction())
-      IndirectCallList.push_back(CS);
-
     unsigned ArgOffset = 0;
     DEBUG(dbgs() << "  CallSite: " << I << "\n");
     for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
@@ -2448,6 +2515,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     IRBuilder<> IRB(&I);
     if (!PropagateShadow) {
       setShadow(&I, getCleanShadow(&I));
+      setOrigin(&I, getCleanOrigin());
       return;
     }
 
@@ -2461,6 +2529,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   void visitAllocaInst(AllocaInst &I) {
     setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
     IRBuilder<> IRB(I.getNextNode());
     uint64_t Size = MS.DL->getTypeAllocSize(I.getAllocatedType());
     if (PoisonStack && ClPoisonStackWithCall) {
@@ -2474,7 +2543,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     }
 
     if (PoisonStack && MS.TrackOrigins) {
-      setOrigin(&I, getCleanOrigin());
       SmallString<2048> StackDescriptionStorage;
       raw_svector_ostream StackDescription(StackDescriptionStorage);
       // We create a string with a description of the stack allocation and
@@ -2540,9 +2608,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       }
       // a = select b, c, d
       // Oa = Sb ? Ob : (b ? Oc : Od)
-      setOrigin(&I, IRB.CreateSelect(
-                        Sb, getOrigin(I.getCondition()),
-                        IRB.CreateSelect(B, getOrigin(C), getOrigin(D))));
+      setOrigin(
+          &I, IRB.CreateSelect(Sb, getOrigin(I.getCondition()),
+                               IRB.CreateSelect(B, getOrigin(I.getTrueValue()),
+                                                getOrigin(I.getFalseValue()))));
     }
   }
 
@@ -2776,6 +2845,106 @@ struct VarArgAMD64Helper : public VarArgHelper {
   }
 };
 
+/// \brief MIPS64-specific implementation of VarArgHelper.
+struct VarArgMIPS64Helper : public VarArgHelper {
+  Function &F;
+  MemorySanitizer &MS;
+  MemorySanitizerVisitor &MSV;
+  Value *VAArgTLSCopy;
+  Value *VAArgSize;
+
+  SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+  VarArgMIPS64Helper(Function &F, MemorySanitizer &MS,
+                    MemorySanitizerVisitor &MSV)
+    : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr),
+      VAArgSize(nullptr) {}
+
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+    unsigned VAArgOffset = 0;
+    for (CallSite::arg_iterator ArgIt = CS.arg_begin() + 1, End = CS.arg_end();
+         ArgIt != End; ++ArgIt) {
+      Value *A = *ArgIt;
+      Value *Base;
+      uint64_t ArgSize = MS.DL->getTypeAllocSize(A->getType());
+#if defined(__MIPSEB__) || defined(MIPSEB)
+      // Adjusting the shadow for argument with size < 8 to match the placement
+      // of bits in big endian system
+      if (ArgSize < 8)
+        VAArgOffset += (8 - ArgSize);
+#endif
+      Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset);
+      VAArgOffset += ArgSize;
+      VAArgOffset = RoundUpToAlignment(VAArgOffset, 8);
+      IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+    }
+
+    Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset);
+    // Here using VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creation of
+    // a new class member i.e. it is the total size of all VarArgs.
+    IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
+  }
+
+  /// \brief Compute the shadow address for a given va_arg.
+  Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+                                   int ArgOffset) {
+    Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+                              "_msarg");
+  }
+
+  void visitVAStartInst(VAStartInst &I) override {
+    IRBuilder<> IRB(&I);
+    VAStartInstrumentationList.push_back(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */8, /* alignment */8, false);
+  }
+
+  void visitVACopyInst(VACopyInst &I) override {
+    IRBuilder<> IRB(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+    // Unpoison the whole __va_list_tag.
+    // FIXME: magic ABI constants.
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */8, /* alignment */8, false);
+  }
+
+  void finalizeInstrumentation() override {
+    assert(!VAArgSize && !VAArgTLSCopy &&
+           "finalizeInstrumentation called twice");
+    IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+    VAArgSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+    Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
+                                    VAArgSize);
+
+    if (!VAStartInstrumentationList.empty()) {
+      // If there is a va_start in this function, make a backup copy of
+      // va_arg_tls somewhere in the function entry block.
+      VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+      IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+    }
+
+    // Instrument va_start.
+    // Copy va_list shadow from the backup copy of the TLS contents.
+    for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+      CallInst *OrigInst = VAStartInstrumentationList[i];
+      IRBuilder<> IRB(OrigInst->getNextNode());
+      Value *VAListTag = OrigInst->getArgOperand(0);
+      Value *RegSaveAreaPtrPtr =
+        IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                        Type::getInt64PtrTy(*MS.C));
+      Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
+      Value *RegSaveAreaShadowPtr =
+      MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+      IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, CopySize, 8);
+    }
+  }
+};
+
 /// \brief A no-op implementation of VarArgHelper.
 struct VarArgNoOpHelper : public VarArgHelper {
   VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
@@ -2797,6 +2966,9 @@ VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
   llvm::Triple TargetTriple(Func.getParent()->getTargetTriple());
   if (TargetTriple.getArch() == llvm::Triple::x86_64)
     return new VarArgAMD64Helper(Func, Msan, Visitor);
+  else if (TargetTriple.getArch() == llvm::Triple::mips64 ||
+           TargetTriple.getArch() == llvm::Triple::mips64el)
+    return new VarArgMIPS64Helper(Func, Msan, Visitor);
   else
     return new VarArgNoOpHelper(Func, Msan, Visitor);
 }
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index f882072..8c56e87 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -10,12 +10,11 @@
 // Coverage instrumentation that works with AddressSanitizer
 // and potentially with other Sanitizers.
 //
-// We create a Guard boolean variable with the same linkage
+// We create a Guard variable with the same linkage
 // as the function and inject this code into the entry block (CoverageLevel=1)
 // or all blocks (CoverageLevel>=2):
-// if (*Guard) {
-//    __sanitizer_cov();
-//    *Guard = 1;
+// if (Guard < 0) {
+//    __sanitizer_cov(&Guard);
 // }
 // The accesses to Guard are atomic. The rest of the logic is
 // in __sanitizer_cov (it's fine to call it more than once).
@@ -38,6 +37,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
@@ -55,11 +55,12 @@ using namespace llvm;
 
 static const char *const kSanCovModuleInitName = "__sanitizer_cov_module_init";
 static const char *const kSanCovName = "__sanitizer_cov";
+static const char *const kSanCovWithCheckName = "__sanitizer_cov_with_check";
 static const char *const kSanCovIndirCallName = "__sanitizer_cov_indir_call16";
 static const char *const kSanCovTraceEnter = "__sanitizer_cov_trace_func_enter";
 static const char *const kSanCovTraceBB = "__sanitizer_cov_trace_basic_block";
 static const char *const kSanCovModuleCtorName = "sancov.module_ctor";
-static const uint64_t    kSanCtorAndDtorPriority = 1;
+static const uint64_t    kSanCtorAndDtorPriority = 2;
 
 static cl::opt<int> ClCoverageLevel("sanitizer-coverage-level",
        cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
@@ -67,11 +68,11 @@ static cl::opt<int> ClCoverageLevel("sanitizer-coverage-level",
                 "4: above plus indirect calls"),
        cl::Hidden, cl::init(0));
 
-static cl::opt<int> ClCoverageBlockThreshold(
+static cl::opt<unsigned> ClCoverageBlockThreshold(
     "sanitizer-coverage-block-threshold",
-    cl::desc("Add coverage instrumentation only to the entry block if there "
-             "are more than this number of blocks."),
-    cl::Hidden, cl::init(1500));
+    cl::desc("Use a callback with a guard check inside it if there are"
+             " more than this number of blocks."),
+    cl::Hidden, cl::init(1000));
 
 static cl::opt<bool>
     ClExperimentalTracing("sanitizer-coverage-experimental-tracing",
@@ -102,15 +103,18 @@ class SanitizerCoverageModule : public ModulePass {
                                       ArrayRef<Instruction *> IndirCalls);
   bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks,
                       ArrayRef<Instruction *> IndirCalls);
-  bool InjectTracing(Function &F, ArrayRef<BasicBlock *> AllBlocks);
-  void InjectCoverageAtBlock(Function &F, BasicBlock &BB);
+  void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls);
   Function *SanCovFunction;
+  Function *SanCovWithCheckFunction;
   Function *SanCovIndirCallFunction;
   Function *SanCovModuleInit;
   Function *SanCovTraceEnter, *SanCovTraceBB;
+  InlineAsm *EmptyAsm;
   Type *IntptrTy;
   LLVMContext *C;
 
+  GlobalVariable *GuardArray;
+
   int CoverageLevel;
 };
 
@@ -132,6 +136,9 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
   DataLayoutPass *DLP = &getAnalysis<DataLayoutPass>();
   IntptrTy = Type::getIntNTy(*C, DLP->getDataLayout().getPointerSizeInBits());
   Type *VoidTy = Type::getVoidTy(*C);
+  IRBuilder<> IRB(*C);
+  Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
+  Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
 
   Function *CtorFunc =
       Function::Create(FunctionType::get(VoidTy, false),
@@ -139,37 +146,73 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
   ReturnInst::Create(*C, BasicBlock::Create(*C, "", CtorFunc));
   appendToGlobalCtors(M, CtorFunc, kSanCtorAndDtorPriority);
 
-  SanCovFunction =
-      checkInterfaceFunction(M.getOrInsertFunction(kSanCovName, VoidTy, nullptr));
+  SanCovFunction = checkInterfaceFunction(
+      M.getOrInsertFunction(kSanCovName, VoidTy, Int32PtrTy, nullptr));
+  SanCovWithCheckFunction = checkInterfaceFunction(
+      M.getOrInsertFunction(kSanCovWithCheckName, VoidTy, Int32PtrTy, nullptr));
   SanCovIndirCallFunction = checkInterfaceFunction(M.getOrInsertFunction(
       kSanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr));
-  SanCovModuleInit = checkInterfaceFunction(M.getOrInsertFunction(
-      kSanCovModuleInitName, Type::getVoidTy(*C), IntptrTy, nullptr));
+  SanCovModuleInit = checkInterfaceFunction(
+      M.getOrInsertFunction(kSanCovModuleInitName, Type::getVoidTy(*C),
+                            Int32PtrTy, IntptrTy, Int8PtrTy, nullptr));
   SanCovModuleInit->setLinkage(Function::ExternalLinkage);
+  // We insert an empty inline asm after cov callbacks to avoid callback merge.
+  EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
+                            StringRef(""), StringRef(""),
+                            /*hasSideEffects=*/true);
 
   if (ClExperimentalTracing) {
     SanCovTraceEnter = checkInterfaceFunction(
-        M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, IntptrTy, nullptr));
+        M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr));
     SanCovTraceBB = checkInterfaceFunction(
-        M.getOrInsertFunction(kSanCovTraceBB, VoidTy, IntptrTy, nullptr));
+        M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr));
   }
 
+  // At this point we create a dummy array of guards because we don't
+  // know how many elements we will need.
+  Type *Int32Ty = IRB.getInt32Ty();
+  GuardArray =
+      new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
+                         nullptr, "__sancov_gen_cov_tmp");
+
   for (auto &F : M)
     runOnFunction(F);
 
-  IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
-  IRB.CreateCall(SanCovModuleInit,
-                 ConstantInt::get(IntptrTy, SanCovFunction->getNumUses()));
+  // Now we know how many elements we need. Create an array of guards
+  // with one extra element at the beginning for the size.
+  Type *Int32ArrayNTy =
+      ArrayType::get(Int32Ty, SanCovFunction->getNumUses() + 1);
+  GlobalVariable *RealGuardArray = new GlobalVariable(
+      M, Int32ArrayNTy, false, GlobalValue::PrivateLinkage,
+      Constant::getNullValue(Int32ArrayNTy), "__sancov_gen_cov");
+
+  // Replace the dummy array with the real one.
+  GuardArray->replaceAllUsesWith(
+      IRB.CreatePointerCast(RealGuardArray, Int32PtrTy));
+  GuardArray->eraseFromParent();
+
+  // Create variable for module (compilation unit) name
+  Constant *ModNameStrConst =
+      ConstantDataArray::getString(M.getContext(), M.getName(), true);
+  GlobalVariable *ModuleName =
+      new GlobalVariable(M, ModNameStrConst->getType(), true,
+                         GlobalValue::PrivateLinkage, ModNameStrConst);
+
+  // Call __sanitizer_cov_module_init
+  IRB.SetInsertPoint(CtorFunc->getEntryBlock().getTerminator());
+  IRB.CreateCall3(SanCovModuleInit,
+                  IRB.CreatePointerCast(RealGuardArray, Int32PtrTy),
+                  ConstantInt::get(IntptrTy, SanCovFunction->getNumUses()),
+                  IRB.CreatePointerCast(ModuleName, Int8PtrTy));
   return true;
 }
 
 bool SanitizerCoverageModule::runOnFunction(Function &F) {
   if (F.empty()) return false;
-  // For now instrument only functions that will also be asan-instrumented.
-  if (!F.hasFnAttribute(Attribute::SanitizeAddress))
-    return false;
+  if (F.getName().find(".module_ctor") != std::string::npos)
+    return false;  // Should not instrument sanitizer init functions.
   if (CoverageLevel >= 3)
-    SplitAllCriticalEdges(F, this);
+    SplitAllCriticalEdges(F);
   SmallVector<Instruction*, 8> IndirCalls;
   SmallVector<BasicBlock*, 16> AllBlocks;
   for (auto &BB : F) {
@@ -182,25 +225,6 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
       }
   }
   InjectCoverage(F, AllBlocks, IndirCalls);
-  InjectTracing(F, AllBlocks);
-  return true;
-}
-
-// Experimental support for tracing.
-// Basicaly, insert a callback at the beginning of every basic block.
-// Every callback gets a pointer to a uniqie global for internal storage.
-bool SanitizerCoverageModule::InjectTracing(Function &F,
-                                            ArrayRef<BasicBlock *> AllBlocks) {
-  if (!ClExperimentalTracing) return false;
-  Type *Ty = ArrayType::get(IntptrTy, 1);  // May need to use more words later.
-  for (auto BB : AllBlocks) {
-    IRBuilder<> IRB(BB->getFirstInsertionPt());
-    GlobalVariable *TraceCache = new GlobalVariable(
-        *F.getParent(), Ty, false, GlobalValue::PrivateLinkage,
-        Constant::getNullValue(Ty), "__sancov_gen_trace_cache");
-    IRB.CreateCall(&F.getEntryBlock() == BB ? SanCovTraceEnter : SanCovTraceBB,
-                   IRB.CreatePointerCast(TraceCache, IntptrTy));
-  }
   return true;
 }
 
@@ -210,12 +234,12 @@ SanitizerCoverageModule::InjectCoverage(Function &F,
                                         ArrayRef<Instruction *> IndirCalls) {
   if (!CoverageLevel) return false;
 
-  if (CoverageLevel == 1 ||
-      (unsigned)ClCoverageBlockThreshold < AllBlocks.size()) {
-    InjectCoverageAtBlock(F, F.getEntryBlock());
+  if (CoverageLevel == 1) {
+    InjectCoverageAtBlock(F, F.getEntryBlock(), false);
   } else {
     for (auto BB : AllBlocks)
-      InjectCoverageAtBlock(F, *BB);
+      InjectCoverageAtBlock(F, *BB,
+                            ClCoverageBlockThreshold < AllBlocks.size());
   }
   InjectCoverageForIndirectCalls(F, IndirCalls);
   return true;
@@ -249,8 +273,8 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls(
   }
 }
 
-void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F,
-                                                    BasicBlock &BB) {
+void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
+                                                    bool UseCalls) {
   BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end();
   // Skip static allocas at the top of the entry block so they don't become
   // dynamic when we split the block.  If we used our optimized stack layout,
@@ -261,28 +285,41 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F,
       break;
   }
 
-  DebugLoc EntryLoc = &BB == &F.getEntryBlock()
-                          ? IP->getDebugLoc().getFnDebugLoc(*C)
-                          : IP->getDebugLoc();
+  bool IsEntryBB = &BB == &F.getEntryBlock();
+  DebugLoc EntryLoc =
+      IsEntryBB ? IP->getDebugLoc().getFnDebugLoc(*C) : IP->getDebugLoc();
   IRBuilder<> IRB(IP);
   IRB.SetCurrentDebugLocation(EntryLoc);
-  Type *Int8Ty = IRB.getInt8Ty();
-  GlobalVariable *Guard = new GlobalVariable(
-      *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage,
-      Constant::getNullValue(Int8Ty), "__sancov_gen_cov_" + F.getName());
-  LoadInst *Load = IRB.CreateLoad(Guard);
-  Load->setAtomic(Monotonic);
-  Load->setAlignment(1);
-  Value *Cmp = IRB.CreateICmpEQ(Constant::getNullValue(Int8Ty), Load);
-  Instruction *Ins = SplitBlockAndInsertIfThen(
-      Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
-  IRB.SetInsertPoint(Ins);
-  IRB.SetCurrentDebugLocation(EntryLoc);
-  // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC.
-  IRB.CreateCall(SanCovFunction);
-  StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard);
-  Store->setAtomic(Monotonic);
-  Store->setAlignment(1);
+  SmallVector<Value *, 1> Indices;
+  Value *GuardP = IRB.CreateAdd(
+      IRB.CreatePointerCast(GuardArray, IntptrTy),
+      ConstantInt::get(IntptrTy, (1 + SanCovFunction->getNumUses()) * 4));
+  Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
+  GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy);
+  if (UseCalls) {
+    IRB.CreateCall(SanCovWithCheckFunction, GuardP);
+  } else {
+    LoadInst *Load = IRB.CreateLoad(GuardP);
+    Load->setAtomic(Monotonic);
+    Load->setAlignment(4);
+    Load->setMetadata(F.getParent()->getMDKindID("nosanitize"),
+                      MDNode::get(*C, None));
+    Value *Cmp = IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load);
+    Instruction *Ins = SplitBlockAndInsertIfThen(
+        Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
+    IRB.SetInsertPoint(Ins);
+    IRB.SetCurrentDebugLocation(EntryLoc);
+    // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC.
+    IRB.CreateCall(SanCovFunction, GuardP);
+    IRB.CreateCall(EmptyAsm);  // Avoids callback merge.
+  }
+
+  if (ClExperimentalTracing) {
+    // Experimental support for tracing.
+    // Insert a callback with the same guard variable as used for coverage.
+    IRB.SetInsertPoint(IP);
+    IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP);
+  }
 }
 
 char SanitizerCoverageModule::ID = 0;
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 8a56a1f..e4a4911 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -19,6 +19,8 @@
 // The rest is handled by the run-time library.
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
@@ -68,6 +70,7 @@ STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads");
 STATISTIC(NumOmittedReadsFromConstantGlobals,
           "Number of reads from constant globals");
 STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads");
+STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing");
 
 namespace {
 
@@ -99,6 +102,8 @@ struct ThreadSanitizer : public FunctionPass {
   static const size_t kNumberOfAccessSizes = 5;
   Function *TsanRead[kNumberOfAccessSizes];
   Function *TsanWrite[kNumberOfAccessSizes];
+  Function *TsanUnalignedRead[kNumberOfAccessSizes];
+  Function *TsanUnalignedWrite[kNumberOfAccessSizes];
   Function *TsanAtomicLoad[kNumberOfAccessSizes];
   Function *TsanAtomicStore[kNumberOfAccessSizes];
   Function *TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1][kNumberOfAccessSizes];
@@ -150,6 +155,16 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {
     TsanWrite[i] = checkInterfaceFunction(M.getOrInsertFunction(
         WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
 
+    SmallString<64> UnalignedReadName("__tsan_unaligned_read" +
+        itostr(ByteSize));
+    TsanUnalignedRead[i] = checkInterfaceFunction(M.getOrInsertFunction(
+        UnalignedReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+
+    SmallString<64> UnalignedWriteName("__tsan_unaligned_write" +
+        itostr(ByteSize));
+    TsanUnalignedWrite[i] = checkInterfaceFunction(M.getOrInsertFunction(
+        UnalignedWriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+
     Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
     Type *PtrTy = Ty->getPointerTo();
     SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) +
@@ -260,6 +275,7 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
 // Instrumenting some of the accesses may be proven redundant.
 // Currently handled:
 //  - read-before-write (within same BB, no calls between)
+//  - not captured variables
 //
 // We do not handle some of the patterns that should not survive
 // after the classic compiler optimizations.
@@ -291,6 +307,17 @@ void ThreadSanitizer::chooseInstructionsToInstrument(
         continue;
       }
     }
+    Value *Addr = isa<StoreInst>(*I)
+        ? cast<StoreInst>(I)->getPointerOperand()
+        : cast<LoadInst>(I)->getPointerOperand();
+    if (isa<AllocaInst>(GetUnderlyingObject(Addr, nullptr)) &&
+        !PointerMayBeCaptured(Addr, true, true)) {
+      // The variable is addressable but not captured, so it cannot be
+      // referenced from a different thread and participate in a data race
+      // (see llvm/Analysis/CaptureTracking.h for details).
+      NumOmittedNonCaptured++;
+      continue;
+    }
     All.push_back(I);
   }
   Local.clear();
@@ -412,7 +439,16 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) {
     NumInstrumentedVtableReads++;
     return true;
   }
-  Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
+  const unsigned Alignment = IsWrite
+      ? cast<StoreInst>(I)->getAlignment()
+      : cast<LoadInst>(I)->getAlignment();
+  Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+  const uint32_t TypeSize = DL->getTypeStoreSizeInBits(OrigTy);
+  Value *OnAccessFunc = nullptr;
+  if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0)
+    OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
+  else
+    OnAccessFunc = IsWrite ? TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx];
   IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
   if (IsWrite) NumInstrumentedWrites++;
   else         NumInstrumentedReads++;
@@ -422,7 +458,7 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) {
 static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
   uint32_t v = 0;
   switch (ord) {
-    case NotAtomic:              assert(false);
+    case NotAtomic: llvm_unreachable("unexpected atomic ordering!");
     case Unordered:              // Fall-through.
     case Monotonic:              v = 0; break;
     // case Consume:                v = 1; break;  // Not specified yet.
diff --git a/lib/Transforms/ObjCARC/ARCInstKind.cpp b/lib/Transforms/ObjCARC/ARCInstKind.cpp
new file mode 100644
index 0000000..f1e9dce
--- /dev/null
+++ b/lib/Transforms/ObjCARC/ARCInstKind.cpp
@@ -0,0 +1,645 @@
+//===- ARCInstKind.cpp - ObjC ARC Optimization ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines several utility functions used by various ARC
+/// optimizations which are IMHO too big to be in a header file.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm/IR/Intrinsics.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS,
+                                       const ARCInstKind Class) {
+  switch (Class) {
+  case ARCInstKind::Retain:
+    return OS << "ARCInstKind::Retain";
+  case ARCInstKind::RetainRV:
+    return OS << "ARCInstKind::RetainRV";
+  case ARCInstKind::RetainBlock:
+    return OS << "ARCInstKind::RetainBlock";
+  case ARCInstKind::Release:
+    return OS << "ARCInstKind::Release";
+  case ARCInstKind::Autorelease:
+    return OS << "ARCInstKind::Autorelease";
+  case ARCInstKind::AutoreleaseRV:
+    return OS << "ARCInstKind::AutoreleaseRV";
+  case ARCInstKind::AutoreleasepoolPush:
+    return OS << "ARCInstKind::AutoreleasepoolPush";
+  case ARCInstKind::AutoreleasepoolPop:
+    return OS << "ARCInstKind::AutoreleasepoolPop";
+  case ARCInstKind::NoopCast:
+    return OS << "ARCInstKind::NoopCast";
+  case ARCInstKind::FusedRetainAutorelease:
+    return OS << "ARCInstKind::FusedRetainAutorelease";
+  case ARCInstKind::FusedRetainAutoreleaseRV:
+    return OS << "ARCInstKind::FusedRetainAutoreleaseRV";
+  case ARCInstKind::LoadWeakRetained:
+    return OS << "ARCInstKind::LoadWeakRetained";
+  case ARCInstKind::StoreWeak:
+    return OS << "ARCInstKind::StoreWeak";
+  case ARCInstKind::InitWeak:
+    return OS << "ARCInstKind::InitWeak";
+  case ARCInstKind::LoadWeak:
+    return OS << "ARCInstKind::LoadWeak";
+  case ARCInstKind::MoveWeak:
+    return OS << "ARCInstKind::MoveWeak";
+  case ARCInstKind::CopyWeak:
+    return OS << "ARCInstKind::CopyWeak";
+  case ARCInstKind::DestroyWeak:
+    return OS << "ARCInstKind::DestroyWeak";
+  case ARCInstKind::StoreStrong:
+    return OS << "ARCInstKind::StoreStrong";
+  case ARCInstKind::CallOrUser:
+    return OS << "ARCInstKind::CallOrUser";
+  case ARCInstKind::Call:
+    return OS << "ARCInstKind::Call";
+  case ARCInstKind::User:
+    return OS << "ARCInstKind::User";
+  case ARCInstKind::IntrinsicUser:
+    return OS << "ARCInstKind::IntrinsicUser";
+  case ARCInstKind::None:
+    return OS << "ARCInstKind::None";
+  }
+  llvm_unreachable("Unknown instruction class!");
+}
+
+ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) {
+  Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+
+  // No (mandatory) arguments.
+  if (AI == AE)
+    return StringSwitch<ARCInstKind>(F->getName())
+        .Case("objc_autoreleasePoolPush", ARCInstKind::AutoreleasepoolPush)
+        .Case("clang.arc.use", ARCInstKind::IntrinsicUser)
+        .Default(ARCInstKind::CallOrUser);
+
+  // One argument.
+  const Argument *A0 = AI++;
+  if (AI == AE)
+    // Argument is a pointer.
+    if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) {
+      Type *ETy = PTy->getElementType();
+      // Argument is i8*.
+      if (ETy->isIntegerTy(8))
+        return StringSwitch<ARCInstKind>(F->getName())
+            .Case("objc_retain", ARCInstKind::Retain)
+            .Case("objc_retainAutoreleasedReturnValue", ARCInstKind::RetainRV)
+            .Case("objc_retainBlock", ARCInstKind::RetainBlock)
+            .Case("objc_release", ARCInstKind::Release)
+            .Case("objc_autorelease", ARCInstKind::Autorelease)
+            .Case("objc_autoreleaseReturnValue", ARCInstKind::AutoreleaseRV)
+            .Case("objc_autoreleasePoolPop", ARCInstKind::AutoreleasepoolPop)
+            .Case("objc_retainedObject", ARCInstKind::NoopCast)
+            .Case("objc_unretainedObject", ARCInstKind::NoopCast)
+            .Case("objc_unretainedPointer", ARCInstKind::NoopCast)
+            .Case("objc_retain_autorelease",
+                  ARCInstKind::FusedRetainAutorelease)
+            .Case("objc_retainAutorelease", ARCInstKind::FusedRetainAutorelease)
+            .Case("objc_retainAutoreleaseReturnValue",
+                  ARCInstKind::FusedRetainAutoreleaseRV)
+            .Case("objc_sync_enter", ARCInstKind::User)
+            .Case("objc_sync_exit", ARCInstKind::User)
+            .Default(ARCInstKind::CallOrUser);
+
+      // Argument is i8**
+      if (PointerType *Pte = dyn_cast<PointerType>(ETy))
+        if (Pte->getElementType()->isIntegerTy(8))
+          return StringSwitch<ARCInstKind>(F->getName())
+              .Case("objc_loadWeakRetained", ARCInstKind::LoadWeakRetained)
+              .Case("objc_loadWeak", ARCInstKind::LoadWeak)
+              .Case("objc_destroyWeak", ARCInstKind::DestroyWeak)
+              .Default(ARCInstKind::CallOrUser);
+    }
+
+  // Two arguments, first is i8**.
+  const Argument *A1 = AI++;
+  if (AI == AE)
+    if (PointerType *PTy = dyn_cast<PointerType>(A0->getType()))
+      if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType()))
+        if (Pte->getElementType()->isIntegerTy(8))
+          if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) {
+            Type *ETy1 = PTy1->getElementType();
+            // Second argument is i8*
+            if (ETy1->isIntegerTy(8))
+              return StringSwitch<ARCInstKind>(F->getName())
+                  .Case("objc_storeWeak", ARCInstKind::StoreWeak)
+                  .Case("objc_initWeak", ARCInstKind::InitWeak)
+                  .Case("objc_storeStrong", ARCInstKind::StoreStrong)
+                  .Default(ARCInstKind::CallOrUser);
+            // Second argument is i8**.
+            if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1))
+              if (Pte1->getElementType()->isIntegerTy(8))
+                return StringSwitch<ARCInstKind>(F->getName())
+                    .Case("objc_moveWeak", ARCInstKind::MoveWeak)
+                    .Case("objc_copyWeak", ARCInstKind::CopyWeak)
+                    // Ignore annotation calls. This is important to stop the
+                    // optimizer from treating annotations as uses which would
+                    // make the state of the pointers they are attempting to
+                    // elucidate to be incorrect.
+                    .Case("llvm.arc.annotation.topdown.bbstart",
+                          ARCInstKind::None)
+                    .Case("llvm.arc.annotation.topdown.bbend",
+                          ARCInstKind::None)
+                    .Case("llvm.arc.annotation.bottomup.bbstart",
+                          ARCInstKind::None)
+                    .Case("llvm.arc.annotation.bottomup.bbend",
+                          ARCInstKind::None)
+                    .Default(ARCInstKind::CallOrUser);
+          }
+
+  // Anything else.
+  return ARCInstKind::CallOrUser;
+}
+
+/// \brief Determine what kind of construct V is.
+ARCInstKind llvm::objcarc::GetARCInstKind(const Value *V) {
+  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+    // Any instruction other than bitcast and gep with a pointer operand have a
+    // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer
+    // to a subsequent use, rather than using it themselves, in this sense.
+    // As a short cut, several other opcodes are known to have no pointer
+    // operands of interest. And ret is never followed by a release, so it's
+    // not interesting to examine.
+    switch (I->getOpcode()) {
+    case Instruction::Call: {
+      const CallInst *CI = cast<CallInst>(I);
+      // Check for calls to special functions.
+      if (const Function *F = CI->getCalledFunction()) {
+        ARCInstKind Class = GetFunctionClass(F);
+        if (Class != ARCInstKind::CallOrUser)
+          return Class;
+
+        // None of the intrinsic functions do objc_release. For intrinsics, the
+        // only question is whether or not they may be users.
+        switch (F->getIntrinsicID()) {
+        case Intrinsic::returnaddress:
+        case Intrinsic::frameaddress:
+        case Intrinsic::stacksave:
+        case Intrinsic::stackrestore:
+        case Intrinsic::vastart:
+        case Intrinsic::vacopy:
+        case Intrinsic::vaend:
+        case Intrinsic::objectsize:
+        case Intrinsic::prefetch:
+        case Intrinsic::stackprotector:
+        case Intrinsic::eh_return_i32:
+        case Intrinsic::eh_return_i64:
+        case Intrinsic::eh_typeid_for:
+        case Intrinsic::eh_dwarf_cfa:
+        case Intrinsic::eh_sjlj_lsda:
+        case Intrinsic::eh_sjlj_functioncontext:
+        case Intrinsic::init_trampoline:
+        case Intrinsic::adjust_trampoline:
+        case Intrinsic::lifetime_start:
+        case Intrinsic::lifetime_end:
+        case Intrinsic::invariant_start:
+        case Intrinsic::invariant_end:
+        // Don't let dbg info affect our results.
+        case Intrinsic::dbg_declare:
+        case Intrinsic::dbg_value:
+          // Short cut: Some intrinsics obviously don't use ObjC pointers.
+          return ARCInstKind::None;
+        default:
+          break;
+        }
+      }
+      return GetCallSiteClass(CI);
+    }
+    case Instruction::Invoke:
+      return GetCallSiteClass(cast<InvokeInst>(I));
+    case Instruction::BitCast:
+    case Instruction::GetElementPtr:
+    case Instruction::Select:
+    case Instruction::PHI:
+    case Instruction::Ret:
+    case Instruction::Br:
+    case Instruction::Switch:
+    case Instruction::IndirectBr:
+    case Instruction::Alloca:
+    case Instruction::VAArg:
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+    case Instruction::FDiv:
+    case Instruction::SRem:
+    case Instruction::URem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::SExt:
+    case Instruction::ZExt:
+    case Instruction::Trunc:
+    case Instruction::IntToPtr:
+    case Instruction::FCmp:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::InsertElement:
+    case Instruction::ExtractElement:
+    case Instruction::ShuffleVector:
+    case Instruction::ExtractValue:
+      break;
+    case Instruction::ICmp:
+      // Comparing a pointer with null, or any other constant, isn't an
+      // interesting use, because we don't care what the pointer points to, or
+      // about the values of any other dynamic reference-counted pointers.
+      if (IsPotentialRetainableObjPtr(I->getOperand(1)))
+        return ARCInstKind::User;
+      break;
+    default:
+      // For anything else, check all the operands.
+      // Note that this includes both operands of a Store: while the first
+      // operand isn't actually being dereferenced, it is being stored to
+      // memory where we can no longer track who might read it and dereference
+      // it, so we have to consider it potentially used.
+      for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end();
+           OI != OE; ++OI)
+        if (IsPotentialRetainableObjPtr(*OI))
+          return ARCInstKind::User;
+    }
+  }
+
+  // Otherwise, it's totally inert for ARC purposes.
+  return ARCInstKind::None;
+}
+
+/// \brief Test if the given class is a kind of user.
+bool llvm::objcarc::IsUser(ARCInstKind Class) {
+  switch (Class) {
+  case ARCInstKind::User:
+  case ARCInstKind::CallOrUser:
+  case ARCInstKind::IntrinsicUser:
+    return true;
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV:
+  case ARCInstKind::RetainBlock:
+  case ARCInstKind::Release:
+  case ARCInstKind::Autorelease:
+  case ARCInstKind::AutoreleaseRV:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::AutoreleasepoolPop:
+  case ARCInstKind::NoopCast:
+  case ARCInstKind::FusedRetainAutorelease:
+  case ARCInstKind::FusedRetainAutoreleaseRV:
+  case ARCInstKind::LoadWeakRetained:
+  case ARCInstKind::StoreWeak:
+  case ARCInstKind::InitWeak:
+  case ARCInstKind::LoadWeak:
+  case ARCInstKind::MoveWeak:
+  case ARCInstKind::CopyWeak:
+  case ARCInstKind::DestroyWeak:
+  case ARCInstKind::StoreStrong:
+  case ARCInstKind::Call:
+  case ARCInstKind::None:
+    return false;
+  }
+  llvm_unreachable("covered switch isn't covered?");
+}
+
+/// \brief Test if the given class is objc_retain or equivalent.
+bool llvm::objcarc::IsRetain(ARCInstKind Class) {
+  switch (Class) {
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV:
+    return true;
+  // I believe we treat retain block as not a retain since it can copy its
+  // block.
+  case ARCInstKind::RetainBlock:
+  case ARCInstKind::Release:
+  case ARCInstKind::Autorelease:
+  case ARCInstKind::AutoreleaseRV:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::AutoreleasepoolPop:
+  case ARCInstKind::NoopCast:
+  case ARCInstKind::FusedRetainAutorelease:
+  case ARCInstKind::FusedRetainAutoreleaseRV:
+  case ARCInstKind::LoadWeakRetained:
+  case ARCInstKind::StoreWeak:
+  case ARCInstKind::InitWeak:
+  case ARCInstKind::LoadWeak:
+  case ARCInstKind::MoveWeak:
+  case ARCInstKind::CopyWeak:
+  case ARCInstKind::DestroyWeak:
+  case ARCInstKind::StoreStrong:
+  case ARCInstKind::IntrinsicUser:
+  case ARCInstKind::CallOrUser:
+  case ARCInstKind::Call:
+  case ARCInstKind::User:
+  case ARCInstKind::None:
+    return false;
+  }
+  llvm_unreachable("covered switch isn't covered?");
+}
+
+/// \brief Test if the given class is objc_autorelease or equivalent.
+bool llvm::objcarc::IsAutorelease(ARCInstKind Class) {
+  switch (Class) {
+  case ARCInstKind::Autorelease:
+  case ARCInstKind::AutoreleaseRV:
+    return true;
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV:
+  case ARCInstKind::RetainBlock:
+  case ARCInstKind::Release:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::AutoreleasepoolPop:
+  case ARCInstKind::NoopCast:
+  case ARCInstKind::FusedRetainAutorelease:
+  case ARCInstKind::FusedRetainAutoreleaseRV:
+  case ARCInstKind::LoadWeakRetained:
+  case ARCInstKind::StoreWeak:
+  case ARCInstKind::InitWeak:
+  case ARCInstKind::LoadWeak:
+  case ARCInstKind::MoveWeak:
+  case ARCInstKind::CopyWeak:
+  case ARCInstKind::DestroyWeak:
+  case ARCInstKind::StoreStrong:
+  case ARCInstKind::IntrinsicUser:
+  case ARCInstKind::CallOrUser:
+  case ARCInstKind::Call:
+  case ARCInstKind::User:
+  case ARCInstKind::None:
+    return false;
+  }
+  llvm_unreachable("covered switch isn't covered?");
+}
+
+/// \brief Test if the given class represents instructions which return their
+/// argument verbatim.
+bool llvm::objcarc::IsForwarding(ARCInstKind Class) {
+  switch (Class) {
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV:
+  case ARCInstKind::Autorelease:
+  case ARCInstKind::AutoreleaseRV:
+  case ARCInstKind::NoopCast:
+    return true;
+  case ARCInstKind::RetainBlock:
+  case ARCInstKind::Release:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::AutoreleasepoolPop:
+  case ARCInstKind::FusedRetainAutorelease:
+  case ARCInstKind::FusedRetainAutoreleaseRV:
+  case ARCInstKind::LoadWeakRetained:
+  case ARCInstKind::StoreWeak:
+  case ARCInstKind::InitWeak:
+  case ARCInstKind::LoadWeak:
+  case ARCInstKind::MoveWeak:
+  case ARCInstKind::CopyWeak:
+  case ARCInstKind::DestroyWeak:
+  case ARCInstKind::StoreStrong:
+  case ARCInstKind::IntrinsicUser:
+  case ARCInstKind::CallOrUser:
+  case ARCInstKind::Call:
+  case ARCInstKind::User:
+  case ARCInstKind::None:
+    return false;
+  }
+  llvm_unreachable("covered switch isn't covered?");
+}
+
+/// \brief Test if the given class represents instructions which do nothing if
+/// passed a null pointer.
+bool llvm::objcarc::IsNoopOnNull(ARCInstKind Class) {
+  switch (Class) {
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV:
+  case ARCInstKind::Release:
+  case ARCInstKind::Autorelease:
+  case ARCInstKind::AutoreleaseRV:
+  case ARCInstKind::RetainBlock:
+    return true;
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::AutoreleasepoolPop:
+  case ARCInstKind::FusedRetainAutorelease:
+  case ARCInstKind::FusedRetainAutoreleaseRV:
+  case ARCInstKind::LoadWeakRetained:
+  case ARCInstKind::StoreWeak:
+  case ARCInstKind::InitWeak:
+  case ARCInstKind::LoadWeak:
+  case ARCInstKind::MoveWeak:
+  case ARCInstKind::CopyWeak:
+  case ARCInstKind::DestroyWeak:
+  case ARCInstKind::StoreStrong:
+  case ARCInstKind::IntrinsicUser:
+  case ARCInstKind::CallOrUser:
+  case ARCInstKind::Call:
+  case ARCInstKind::User:
+  case ARCInstKind::None:
+  case ARCInstKind::NoopCast:
+    return false;
+  }
+  llvm_unreachable("covered switch isn't covered?");
+}
+
+/// \brief Test if the given class represents instructions which are always safe
+/// to mark with the "tail" keyword.
+bool llvm::objcarc::IsAlwaysTail(ARCInstKind Class) {
+  // ARCInstKind::RetainBlock may be given a stack argument.
+  switch (Class) {
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV:
+  case ARCInstKind::AutoreleaseRV:
+    return true;
+  case ARCInstKind::Release:
+  case ARCInstKind::Autorelease:
+  case ARCInstKind::RetainBlock:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::AutoreleasepoolPop:
+  case ARCInstKind::FusedRetainAutorelease:
+  case ARCInstKind::FusedRetainAutoreleaseRV:
+  case ARCInstKind::LoadWeakRetained:
+  case ARCInstKind::StoreWeak:
+  case ARCInstKind::InitWeak:
+  case ARCInstKind::LoadWeak:
+  case ARCInstKind::MoveWeak:
+  case ARCInstKind::CopyWeak:
+  case ARCInstKind::DestroyWeak:
+  case ARCInstKind::StoreStrong:
+  case ARCInstKind::IntrinsicUser:
+  case ARCInstKind::CallOrUser:
+  case ARCInstKind::Call:
+  case ARCInstKind::User:
+  case ARCInstKind::None:
+  case ARCInstKind::NoopCast:
+    return false;
+  }
+  llvm_unreachable("covered switch isn't covered?");
+}
+
+/// \brief Test if the given class represents instructions which are never safe
+/// to mark with the "tail" keyword.
+bool llvm::objcarc::IsNeverTail(ARCInstKind Class) {
+  /// It is never safe to tail call objc_autorelease since by tail calling
+  /// objc_autorelease: fast autoreleasing causing our object to be potentially
+  /// reclaimed from the autorelease pool which violates the semantics of
+  /// __autoreleasing types in ARC.
+  switch (Class) {
+  case ARCInstKind::Autorelease:
+    return true;
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV:
+  case ARCInstKind::AutoreleaseRV:
+  case ARCInstKind::Release:
+  case ARCInstKind::RetainBlock:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::AutoreleasepoolPop:
+  case ARCInstKind::FusedRetainAutorelease:
+  case ARCInstKind::FusedRetainAutoreleaseRV:
+  case ARCInstKind::LoadWeakRetained:
+  case ARCInstKind::StoreWeak:
+  case ARCInstKind::InitWeak:
+  case ARCInstKind::LoadWeak:
+  case ARCInstKind::MoveWeak:
+  case ARCInstKind::CopyWeak:
+  case ARCInstKind::DestroyWeak:
+  case ARCInstKind::StoreStrong:
+  case ARCInstKind::IntrinsicUser:
+  case ARCInstKind::CallOrUser:
+  case ARCInstKind::Call:
+  case ARCInstKind::User:
+  case ARCInstKind::None:
+  case ARCInstKind::NoopCast:
+    return false;
+  }
+  llvm_unreachable("covered switch isn't covered?");
+}
+
+/// \brief Test if the given class represents instructions which are always safe
+/// to mark with the nounwind attribute.
+bool llvm::objcarc::IsNoThrow(ARCInstKind Class) {
+  // objc_retainBlock is not nounwind because it calls user copy constructors
+  // which could theoretically throw.
+  switch (Class) {
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV:
+  case ARCInstKind::Release:
+  case ARCInstKind::Autorelease:
+  case ARCInstKind::AutoreleaseRV:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::AutoreleasepoolPop:
+    return true;
+  case ARCInstKind::RetainBlock:
+  case ARCInstKind::FusedRetainAutorelease:
+  case ARCInstKind::FusedRetainAutoreleaseRV:
+  case ARCInstKind::LoadWeakRetained:
+  case ARCInstKind::StoreWeak:
+  case ARCInstKind::InitWeak:
+  case ARCInstKind::LoadWeak:
+  case ARCInstKind::MoveWeak:
+  case ARCInstKind::CopyWeak:
+  case ARCInstKind::DestroyWeak:
+  case ARCInstKind::StoreStrong:
+  case ARCInstKind::IntrinsicUser:
+  case ARCInstKind::CallOrUser:
+  case ARCInstKind::Call:
+  case ARCInstKind::User:
+  case ARCInstKind::None:
+  case ARCInstKind::NoopCast:
+    return false;
+  }
+  llvm_unreachable("covered switch isn't covered?");
+}
+
+/// Test whether the given instruction can autorelease any pointer or cause an
+/// autoreleasepool pop.
+///
+/// This means that it *could* interrupt the RV optimization.
+bool llvm::objcarc::CanInterruptRV(ARCInstKind Class) {
+  switch (Class) {
+  case ARCInstKind::AutoreleasepoolPop:
+  case ARCInstKind::CallOrUser:
+  case ARCInstKind::Call:
+  case ARCInstKind::Autorelease:
+  case ARCInstKind::AutoreleaseRV:
+  case ARCInstKind::FusedRetainAutorelease:
+  case ARCInstKind::FusedRetainAutoreleaseRV:
+    return true;
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV:
+  case ARCInstKind::Release:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::RetainBlock:
+  case ARCInstKind::LoadWeakRetained:
+  case ARCInstKind::StoreWeak:
+  case ARCInstKind::InitWeak:
+  case ARCInstKind::LoadWeak:
+  case ARCInstKind::MoveWeak:
+  case ARCInstKind::CopyWeak:
+  case ARCInstKind::DestroyWeak:
+  case ARCInstKind::StoreStrong:
+  case ARCInstKind::IntrinsicUser:
+  case ARCInstKind::User:
+  case ARCInstKind::None:
+  case ARCInstKind::NoopCast:
+    return false;
+  }
+  llvm_unreachable("covered switch isn't covered?");
+}
+
+bool llvm::objcarc::CanDecrementRefCount(ARCInstKind Kind) {
+  switch (Kind) {
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV:
+  case ARCInstKind::Autorelease:
+  case ARCInstKind::AutoreleaseRV:
+  case ARCInstKind::NoopCast:
+  case ARCInstKind::FusedRetainAutorelease:
+  case ARCInstKind::FusedRetainAutoreleaseRV:
+  case ARCInstKind::IntrinsicUser:
+  case ARCInstKind::User:
+  case ARCInstKind::None:
+    return false;
+
+  // The cases below are conservative.
+
+  // RetainBlock can result in user defined copy constructors being called
+  // implying releases may occur.
+  case ARCInstKind::RetainBlock:
+  case ARCInstKind::Release:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::AutoreleasepoolPop:
+  case ARCInstKind::LoadWeakRetained:
+  case ARCInstKind::StoreWeak:
+  case ARCInstKind::InitWeak:
+  case ARCInstKind::LoadWeak:
+  case ARCInstKind::MoveWeak:
+  case ARCInstKind::CopyWeak:
+  case ARCInstKind::DestroyWeak:
+  case ARCInstKind::StoreStrong:
+  case ARCInstKind::CallOrUser:
+  case ARCInstKind::Call:
+    return true;
+  }
+
+  llvm_unreachable("covered switch isn't covered?");
+}
diff --git a/lib/Transforms/ObjCARC/ARCInstKind.h b/lib/Transforms/ObjCARC/ARCInstKind.h
new file mode 100644
index 0000000..636c65c
--- /dev/null
+++ b/lib/Transforms/ObjCARC/ARCInstKind.h
@@ -0,0 +1,123 @@
+//===--- ARCInstKind.h - ARC instruction equivalence classes -*- C++ -*----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Function.h"
+
+namespace llvm {
+namespace objcarc {
+
+/// \enum ARCInstKind
+///
+/// \brief Equivalence classes of instructions in the ARC Model.
+///
+/// Since we do not have "instructions" to represent ARC concepts in LLVM IR,
+/// we instead operate on equivalence classes of instructions.
+///
+/// TODO: This should be split into two enums: a runtime entry point enum
+/// (possibly united with the ARCRuntimeEntrypoint class) and an enum that deals
+/// with effects of instructions in the ARC model (which would handle the notion
+/// of a User or CallOrUser).
+enum class ARCInstKind {
+  Retain,                   ///< objc_retain
+  RetainRV,                 ///< objc_retainAutoreleasedReturnValue
+  RetainBlock,              ///< objc_retainBlock
+  Release,                  ///< objc_release
+  Autorelease,              ///< objc_autorelease
+  AutoreleaseRV,            ///< objc_autoreleaseReturnValue
+  AutoreleasepoolPush,      ///< objc_autoreleasePoolPush
+  AutoreleasepoolPop,       ///< objc_autoreleasePoolPop
+  NoopCast,                 ///< objc_retainedObject, etc.
+  FusedRetainAutorelease,   ///< objc_retainAutorelease
+  FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue
+  LoadWeakRetained,         ///< objc_loadWeakRetained (primitive)
+  StoreWeak,                ///< objc_storeWeak (primitive)
+  InitWeak,                 ///< objc_initWeak (derived)
+  LoadWeak,                 ///< objc_loadWeak (derived)
+  MoveWeak,                 ///< objc_moveWeak (derived)
+  CopyWeak,                 ///< objc_copyWeak (derived)
+  DestroyWeak,              ///< objc_destroyWeak (derived)
+  StoreStrong,              ///< objc_storeStrong (derived)
+  IntrinsicUser,            ///< clang.arc.use
+  CallOrUser,               ///< could call objc_release and/or "use" pointers
+  Call,                     ///< could call objc_release
+  User,                     ///< could "use" a pointer
+  None                      ///< anything that is inert from an ARC perspective.
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const ARCInstKind Class);
+
+/// \brief Test if the given class is a kind of user.
+bool IsUser(ARCInstKind Class);
+
+/// \brief Test if the given class is objc_retain or equivalent.
+bool IsRetain(ARCInstKind Class);
+
+/// \brief Test if the given class is objc_autorelease or equivalent.
+bool IsAutorelease(ARCInstKind Class);
+
+/// \brief Test if the given class represents instructions which return their
+/// argument verbatim.
+bool IsForwarding(ARCInstKind Class);
+
+/// \brief Test if the given class represents instructions which do nothing if
+/// passed a null pointer.
+bool IsNoopOnNull(ARCInstKind Class);
+
+/// \brief Test if the given class represents instructions which are always safe
+/// to mark with the "tail" keyword.
+bool IsAlwaysTail(ARCInstKind Class);
+
+/// \brief Test if the given class represents instructions which are never safe
+/// to mark with the "tail" keyword.
+bool IsNeverTail(ARCInstKind Class);
+
+/// \brief Test if the given class represents instructions which are always safe
+/// to mark with the nounwind attribute.
+bool IsNoThrow(ARCInstKind Class);
+
+/// Test whether the given instruction can autorelease any pointer or cause an
+/// autoreleasepool pop.
+bool CanInterruptRV(ARCInstKind Class);
+
+/// \brief Determine if F is one of the special known Functions.  If it isn't,
+/// return ARCInstKind::CallOrUser.
+ARCInstKind GetFunctionClass(const Function *F);
+
+/// \brief Determine which objc runtime call instruction class V belongs to.
+///
+/// This is similar to GetARCInstKind except that it only detects objc
+/// runtime calls. This allows it to be faster.
+///
+static inline ARCInstKind GetBasicARCInstKind(const Value *V) {
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (const Function *F = CI->getCalledFunction())
+      return GetFunctionClass(F);
+    // Otherwise, be conservative.
+    return ARCInstKind::CallOrUser;
+  }
+
+  // Otherwise, be conservative.
+  return isa<InvokeInst>(V) ? ARCInstKind::CallOrUser : ARCInstKind::User;
+}
+
+/// Map V to its ARCInstKind equivalence class.
+ARCInstKind GetARCInstKind(const Value *V);
+
+/// Returns false if conservatively we can prove that any instruction mapped to
+/// this kind can not decrement ref counts. Returns true otherwise.
+bool CanDecrementRefCount(ARCInstKind Kind);
+
+} // end namespace objcarc
+} // end namespace llvm
+
+#endif
diff --git a/lib/Transforms/ObjCARC/Android.mk b/lib/Transforms/ObjCARC/Android.mk
index cf45a95..97c5a9d 100644
--- a/lib/Transforms/ObjCARC/Android.mk
+++ b/lib/Transforms/ObjCARC/Android.mk
@@ -1,6 +1,7 @@
 LOCAL_PATH:= $(call my-dir)
 
 transforms_objcarc_SRC_FILES := \
+  ARCInstKind.cpp \
   DependencyAnalysis.cpp \
   ObjCARCAliasAnalysis.cpp \
   ObjCARCAPElim.cpp \
@@ -8,7 +9,6 @@ transforms_objcarc_SRC_FILES := \
   ObjCARC.cpp \
   ObjCARCExpand.cpp \
   ObjCARCOpts.cpp \
-  ObjCARCUtil.cpp \
   ProvenanceAnalysis.cpp \
   ProvenanceAnalysisEvaluator.cpp
 
diff --git a/lib/Transforms/ObjCARC/CMakeLists.txt b/lib/Transforms/ObjCARC/CMakeLists.txt
index b449fac..2adea88 100644
--- a/lib/Transforms/ObjCARC/CMakeLists.txt
+++ b/lib/Transforms/ObjCARC/CMakeLists.txt
@@ -4,11 +4,14 @@ add_llvm_library(LLVMObjCARCOpts
   ObjCARCExpand.cpp
   ObjCARCAPElim.cpp
   ObjCARCAliasAnalysis.cpp
-  ObjCARCUtil.cpp
+  ARCInstKind.cpp
   ObjCARCContract.cpp
   DependencyAnalysis.cpp
   ProvenanceAnalysis.cpp
   ProvenanceAnalysisEvaluator.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
   )
 
 add_dependencies(LLVMObjCARCOpts intrinsics_gen)
diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index f6c236c..4985d0e 100644
--- a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -32,15 +32,14 @@ using namespace llvm::objcarc;
 
 /// Test whether the given instruction can result in a reference count
 /// modification (positive or negative) for the pointer's object.
-bool
-llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
-                                ProvenanceAnalysis &PA,
-                                InstructionClass Class) {
+bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+                                     ProvenanceAnalysis &PA,
+                                     ARCInstKind Class) {
   switch (Class) {
-  case IC_Autorelease:
-  case IC_AutoreleaseRV:
-  case IC_IntrinsicUser:
-  case IC_User:
+  case ARCInstKind::Autorelease:
+  case ARCInstKind::AutoreleaseRV:
+  case ARCInstKind::IntrinsicUser:
+  case ARCInstKind::User:
     // These operations never directly modify a reference count.
     return false;
   default: break;
@@ -67,13 +66,25 @@ llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
   return true;
 }
 
+bool llvm::objcarc::CanDecrementRefCount(const Instruction *Inst,
+                                         const Value *Ptr,
+                                         ProvenanceAnalysis &PA,
+                                         ARCInstKind Class) {
+  // First perform a quick check if Class can not touch ref counts.
+  if (!CanDecrementRefCount(Class))
+    return false;
+
+  // Otherwise, just use CanAlterRefCount for now.
+  return CanAlterRefCount(Inst, Ptr, PA, Class);
+}
+
 /// Test whether the given instruction can "use" the given pointer's object in a
 /// way that requires the reference count to be positive.
-bool
-llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
-                      ProvenanceAnalysis &PA, InstructionClass Class) {
-  // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers.
-  if (Class == IC_Call)
+bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
+                           ProvenanceAnalysis &PA, ARCInstKind Class) {
+  // ARCInstKind::Call operations (as opposed to
+  // ARCInstKind::CallOrUser) never "use" objc pointers.
+  if (Class == ARCInstKind::Call)
     return false;
 
   // Consider various instructions which may have pointer arguments which are
@@ -123,11 +134,11 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
 
   switch (Flavor) {
   case NeedsPositiveRetainCount: {
-    InstructionClass Class = GetInstructionClass(Inst);
+    ARCInstKind Class = GetARCInstKind(Inst);
     switch (Class) {
-    case IC_AutoreleasepoolPop:
-    case IC_AutoreleasepoolPush:
-    case IC_None:
+    case ARCInstKind::AutoreleasepoolPop:
+    case ARCInstKind::AutoreleasepoolPush:
+    case ARCInstKind::None:
       return false;
     default:
       return CanUse(Inst, Arg, PA, Class);
@@ -135,10 +146,10 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
   }
 
   case AutoreleasePoolBoundary: {
-    InstructionClass Class = GetInstructionClass(Inst);
+    ARCInstKind Class = GetARCInstKind(Inst);
     switch (Class) {
-    case IC_AutoreleasepoolPop:
-    case IC_AutoreleasepoolPush:
+    case ARCInstKind::AutoreleasepoolPop:
+    case ARCInstKind::AutoreleasepoolPush:
       // These mark the end and begin of an autorelease pool scope.
       return true;
     default:
@@ -148,13 +159,13 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
   }
 
   case CanChangeRetainCount: {
-    InstructionClass Class = GetInstructionClass(Inst);
+    ARCInstKind Class = GetARCInstKind(Inst);
     switch (Class) {
-    case IC_AutoreleasepoolPop:
+    case ARCInstKind::AutoreleasepoolPop:
       // Conservatively assume this can decrement any count.
       return true;
-    case IC_AutoreleasepoolPush:
-    case IC_None:
+    case ARCInstKind::AutoreleasepoolPush:
+    case ARCInstKind::None:
       return false;
     default:
       return CanAlterRefCount(Inst, Arg, PA, Class);
@@ -162,28 +173,28 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
   }
 
   case RetainAutoreleaseDep:
-    switch (GetBasicInstructionClass(Inst)) {
-    case IC_AutoreleasepoolPop:
-    case IC_AutoreleasepoolPush:
+    switch (GetBasicARCInstKind(Inst)) {
+    case ARCInstKind::AutoreleasepoolPop:
+    case ARCInstKind::AutoreleasepoolPush:
       // Don't merge an objc_autorelease with an objc_retain inside a different
       // autoreleasepool scope.
       return true;
-    case IC_Retain:
-    case IC_RetainRV:
+    case ARCInstKind::Retain:
+    case ARCInstKind::RetainRV:
       // Check for a retain of the same pointer for merging.
-      return GetObjCArg(Inst) == Arg;
+      return GetArgRCIdentityRoot(Inst) == Arg;
     default:
       // Nothing else matters for objc_retainAutorelease formation.
       return false;
     }
 
   case RetainAutoreleaseRVDep: {
-    InstructionClass Class = GetBasicInstructionClass(Inst);
+    ARCInstKind Class = GetBasicARCInstKind(Inst);
     switch (Class) {
-    case IC_Retain:
-    case IC_RetainRV:
+    case ARCInstKind::Retain:
+    case ARCInstKind::RetainRV:
       // Check for a retain of the same pointer for merging.
-      return GetObjCArg(Inst) == Arg;
+      return GetArgRCIdentityRoot(Inst) == Arg;
     default:
       // Anything that can autorelease interrupts
       // retainAutoreleaseReturnValue formation.
@@ -192,7 +203,7 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
   }
 
   case RetainRVDep:
-    return CanInterruptRV(GetBasicInstructionClass(Inst));
+    return CanInterruptRV(GetBasicARCInstKind(Inst));
   }
 
   llvm_unreachable("Invalid dependence flavor");
diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.h b/lib/Transforms/ObjCARC/DependencyAnalysis.h
index 7b5601a..8e042d4 100644
--- a/lib/Transforms/ObjCARC/DependencyAnalysis.h
+++ b/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -63,15 +63,24 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg,
 
 /// Test whether the given instruction can "use" the given pointer's object in a
 /// way that requires the reference count to be positive.
-bool
-CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
-       InstructionClass Class);
+bool CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
+            ARCInstKind Class);
 
 /// Test whether the given instruction can result in a reference count
 /// modification (positive or negative) for the pointer's object.
-bool
-CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
-                 ProvenanceAnalysis &PA, InstructionClass Class);
+bool CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+                      ProvenanceAnalysis &PA, ARCInstKind Class);
+
+/// Returns true if we can not conservatively prove that Inst can not decrement
+/// the reference count of Ptr. Returns false if we can.
+bool CanDecrementRefCount(const Instruction *Inst, const Value *Ptr,
+                          ProvenanceAnalysis &PA, ARCInstKind Class);
+
+static inline bool CanDecrementRefCount(const Instruction *Inst,
+                                        const Value *Ptr,
+                                        ProvenanceAnalysis &PA) {
+  return CanDecrementRefCount(Inst, Ptr, PA, GetARCInstKind(Inst));
+}
 
 } // namespace objcarc
 } // namespace llvm
diff --git a/lib/Transforms/ObjCARC/ObjCARC.h b/lib/Transforms/ObjCARC/ObjCARC.h
index 7a7eae8..df29f05 100644
--- a/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/lib/Transforms/ObjCARC/ObjCARC.h
@@ -33,6 +33,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Transforms/ObjCARC.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "ARCInstKind.h"
 
 namespace llvm {
 class raw_ostream;
@@ -68,160 +69,13 @@ static inline bool ModuleHasARC(const Module &M) {
     M.getNamedValue("clang.arc.use");
 }
 
-/// \enum InstructionClass
-/// \brief A simple classification for instructions.
-enum InstructionClass {
-  IC_Retain,              ///< objc_retain
-  IC_RetainRV,            ///< objc_retainAutoreleasedReturnValue
-  IC_RetainBlock,         ///< objc_retainBlock
-  IC_Release,             ///< objc_release
-  IC_Autorelease,         ///< objc_autorelease
-  IC_AutoreleaseRV,       ///< objc_autoreleaseReturnValue
-  IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush
-  IC_AutoreleasepoolPop,  ///< objc_autoreleasePoolPop
-  IC_NoopCast,            ///< objc_retainedObject, etc.
-  IC_FusedRetainAutorelease, ///< objc_retainAutorelease
-  IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue
-  IC_LoadWeakRetained,    ///< objc_loadWeakRetained (primitive)
-  IC_StoreWeak,           ///< objc_storeWeak (primitive)
-  IC_InitWeak,            ///< objc_initWeak (derived)
-  IC_LoadWeak,            ///< objc_loadWeak (derived)
-  IC_MoveWeak,            ///< objc_moveWeak (derived)
-  IC_CopyWeak,            ///< objc_copyWeak (derived)
-  IC_DestroyWeak,         ///< objc_destroyWeak (derived)
-  IC_StoreStrong,         ///< objc_storeStrong (derived)
-  IC_IntrinsicUser,       ///< clang.arc.use
-  IC_CallOrUser,          ///< could call objc_release and/or "use" pointers
-  IC_Call,                ///< could call objc_release
-  IC_User,                ///< could "use" a pointer
-  IC_None                 ///< anything else
-};
-
-raw_ostream &operator<<(raw_ostream &OS, const InstructionClass Class);
-
-/// \brief Test if the given class is a kind of user.
-inline static bool IsUser(InstructionClass Class) {
-  return Class == IC_User ||
-         Class == IC_CallOrUser ||
-         Class == IC_IntrinsicUser;
-}
-
-/// \brief Test if the given class is objc_retain or equivalent.
-static inline bool IsRetain(InstructionClass Class) {
-  return Class == IC_Retain ||
-         Class == IC_RetainRV;
-}
-
-/// \brief Test if the given class is objc_autorelease or equivalent.
-static inline bool IsAutorelease(InstructionClass Class) {
-  return Class == IC_Autorelease ||
-         Class == IC_AutoreleaseRV;
-}
-
-/// \brief Test if the given class represents instructions which return their
-/// argument verbatim.
-static inline bool IsForwarding(InstructionClass Class) {
-  return Class == IC_Retain ||
-         Class == IC_RetainRV ||
-         Class == IC_Autorelease ||
-         Class == IC_AutoreleaseRV ||
-         Class == IC_NoopCast;
-}
-
-/// \brief Test if the given class represents instructions which do nothing if
-/// passed a null pointer.
-static inline bool IsNoopOnNull(InstructionClass Class) {
-  return Class == IC_Retain ||
-         Class == IC_RetainRV ||
-         Class == IC_Release ||
-         Class == IC_Autorelease ||
-         Class == IC_AutoreleaseRV ||
-         Class == IC_RetainBlock;
-}
-
-/// \brief Test if the given class represents instructions which are always safe
-/// to mark with the "tail" keyword.
-static inline bool IsAlwaysTail(InstructionClass Class) {
-  // IC_RetainBlock may be given a stack argument.
-  return Class == IC_Retain ||
-         Class == IC_RetainRV ||
-         Class == IC_AutoreleaseRV;
-}
-
-/// \brief Test if the given class represents instructions which are never safe
-/// to mark with the "tail" keyword.
-static inline bool IsNeverTail(InstructionClass Class) {
-  /// It is never safe to tail call objc_autorelease since by tail calling
-  /// objc_autorelease, we also tail call -[NSObject autorelease] which supports
-  /// fast autoreleasing causing our object to be potentially reclaimed from the
-  /// autorelease pool which violates the semantics of __autoreleasing types in
-  /// ARC.
-  return Class == IC_Autorelease;
-}
-
-/// \brief Test if the given class represents instructions which are always safe
-/// to mark with the nounwind attribute.
-static inline bool IsNoThrow(InstructionClass Class) {
-  // objc_retainBlock is not nounwind because it calls user copy constructors
-  // which could theoretically throw.
-  return Class == IC_Retain ||
-         Class == IC_RetainRV ||
-         Class == IC_Release ||
-         Class == IC_Autorelease ||
-         Class == IC_AutoreleaseRV ||
-         Class == IC_AutoreleasepoolPush ||
-         Class == IC_AutoreleasepoolPop;
-}
-
-/// Test whether the given instruction can autorelease any pointer or cause an
-/// autoreleasepool pop.
-static inline bool
-CanInterruptRV(InstructionClass Class) {
-  switch (Class) {
-  case IC_AutoreleasepoolPop:
-  case IC_CallOrUser:
-  case IC_Call:
-  case IC_Autorelease:
-  case IC_AutoreleaseRV:
-  case IC_FusedRetainAutorelease:
-  case IC_FusedRetainAutoreleaseRV:
-    return true;
-  default:
-    return false;
-  }
-}
-
-/// \brief Determine if F is one of the special known Functions.  If it isn't,
-/// return IC_CallOrUser.
-InstructionClass GetFunctionClass(const Function *F);
-
-/// \brief Determine which objc runtime call instruction class V belongs to.
-///
-/// This is similar to GetInstructionClass except that it only detects objc
-/// runtime calls. This allows it to be faster.
-///
-static inline InstructionClass GetBasicInstructionClass(const Value *V) {
-  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
-    if (const Function *F = CI->getCalledFunction())
-      return GetFunctionClass(F);
-    // Otherwise, be conservative.
-    return IC_CallOrUser;
-  }
-
-  // Otherwise, be conservative.
-  return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User;
-}
-
-/// \brief Determine what kind of construct V is.
-InstructionClass GetInstructionClass(const Value *V);
-
 /// \brief This is a wrapper around getUnderlyingObject which also knows how to
 /// look through objc_retain and objc_autorelease calls, which we know to return
 /// their argument verbatim.
 static inline const Value *GetUnderlyingObjCPtr(const Value *V) {
   for (;;) {
     V = GetUnderlyingObject(V);
-    if (!IsForwarding(GetBasicInstructionClass(V)))
+    if (!IsForwarding(GetBasicARCInstKind(V)))
       break;
     V = cast<CallInst>(V)->getArgOperand(0);
   }
@@ -229,37 +83,44 @@ static inline const Value *GetUnderlyingObjCPtr(const Value *V) {
   return V;
 }
 
-/// \brief This is a wrapper around Value::stripPointerCasts which also knows
-/// how to look through objc_retain and objc_autorelease calls, which we know to
-/// return their argument verbatim.
-static inline const Value *StripPointerCastsAndObjCCalls(const Value *V) {
+/// The RCIdentity root of a value \p V is a dominating value U for which
+/// retaining or releasing U is equivalent to retaining or releasing V. In other
+/// words, ARC operations on \p V are equivalent to ARC operations on \p U.
+///
+/// We use this in the ARC optimizer to make it easier to match up ARC
+/// operations by always mapping ARC operations to RCIdentityRoots instead of
+/// pointers themselves.
+///
+/// The two ways that we see RCIdentical values in ObjC are via:
+///
+///   1. PointerCasts
+///   2. Forwarding Calls that return their argument verbatim.
+///
+/// Thus this function strips off pointer casts and forwarding calls. *NOTE*
+/// This implies that two RCIdentical values must alias.
+static inline const Value *GetRCIdentityRoot(const Value *V) {
   for (;;) {
     V = V->stripPointerCasts();
-    if (!IsForwarding(GetBasicInstructionClass(V)))
+    if (!IsForwarding(GetBasicARCInstKind(V)))
       break;
     V = cast<CallInst>(V)->getArgOperand(0);
   }
   return V;
 }
 
-/// \brief This is a wrapper around Value::stripPointerCasts which also knows
-/// how to look through objc_retain and objc_autorelease calls, which we know to
-/// return their argument verbatim.
-static inline Value *StripPointerCastsAndObjCCalls(Value *V) {
-  for (;;) {
-    V = V->stripPointerCasts();
-    if (!IsForwarding(GetBasicInstructionClass(V)))
-      break;
-    V = cast<CallInst>(V)->getArgOperand(0);
-  }
-  return V;
+/// Helper which calls const Value *GetRCIdentityRoot(const Value *V) and just
+/// casts away the const of the result. For documentation about what an
+/// RCIdentityRoot (and by extension GetRCIdentityRoot is) look at that
+/// function.
+static inline Value *GetRCIdentityRoot(Value *V) {
+  return const_cast<Value *>(GetRCIdentityRoot((const Value *)V));
 }
 
 /// \brief Assuming the given instruction is one of the special calls such as
-/// objc_retain or objc_release, return the argument value, stripped of no-op
-/// casts and forwarding calls.
-static inline Value *GetObjCArg(Value *Inst) {
-  return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0));
+/// objc_retain or objc_release, return the RCIdentity root of the argument of
+/// the call.
+static inline Value *GetArgRCIdentityRoot(Value *Inst) {
+  return GetRCIdentityRoot(cast<CallInst>(Inst)->getArgOperand(0));
 }
 
 static inline bool IsNullOrUndef(const Value *V) {
@@ -286,8 +147,8 @@ static inline void EraseInstruction(Instruction *CI) {
 
   if (!Unused) {
     // Replace the return value with the argument.
-    assert((IsForwarding(GetBasicInstructionClass(CI)) ||
-            (IsNoopOnNull(GetBasicInstructionClass(CI)) &&
+    assert((IsForwarding(GetBasicARCInstKind(CI)) ||
+            (IsNoopOnNull(GetBasicARCInstKind(CI)) &&
              isa<ConstantPointerNull>(OldArg))) &&
            "Can't delete non-forwarding instruction with users!");
     CI->replaceAllUsesWith(OldArg);
@@ -344,15 +205,15 @@ static inline bool IsPotentialRetainableObjPtr(const Value *Op,
   return true;
 }
 
-/// \brief Helper for GetInstructionClass. Determines what kind of construct CS
+/// \brief Helper for GetARCInstKind. Determines what kind of construct CS
 /// is.
-static inline InstructionClass GetCallSiteClass(ImmutableCallSite CS) {
+static inline ARCInstKind GetCallSiteClass(ImmutableCallSite CS) {
   for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
        I != E; ++I)
     if (IsPotentialRetainableObjPtr(*I))
-      return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser;
+      return CS.onlyReadsMemory() ? ARCInstKind::User : ARCInstKind::CallOrUser;
 
-  return CS.onlyReadsMemory() ? IC_None : IC_Call;
+  return CS.onlyReadsMemory() ? ARCInstKind::None : ARCInstKind::Call;
 }
 
 /// \brief Return true if this value refers to a distinct and identifiable
@@ -371,7 +232,7 @@ static inline bool IsObjCIdentifiedObject(const Value *V) {
 
   if (const LoadInst *LI = dyn_cast<LoadInst>(V)) {
     const Value *Pointer =
-      StripPointerCastsAndObjCCalls(LI->getPointerOperand());
+      GetRCIdentityRoot(LI->getPointerOperand());
     if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) {
       // A constant pointer can't be pointing to an object on the heap. It may
       // be reference-counted, but it won't be deleted.
diff --git a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
index 1a25391..d318643 100644
--- a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
@@ -97,11 +97,11 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) {
   Instruction *Push = nullptr;
   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
     Instruction *Inst = I++;
-    switch (GetBasicInstructionClass(Inst)) {
-    case IC_AutoreleasepoolPush:
+    switch (GetBasicARCInstKind(Inst)) {
+    case ARCInstKind::AutoreleasepoolPush:
       Push = Inst;
       break;
-    case IC_AutoreleasepoolPop:
+    case ARCInstKind::AutoreleasepoolPop:
       // If this pop matches a push and nothing in between can autorelease,
       // zap the pair.
       if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
@@ -115,7 +115,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) {
       }
       Push = nullptr;
       break;
-    case IC_CallOrUser:
+    case ARCInstKind::CallOrUser:
       if (MayAutorelease(ImmutableCallSite(Inst)))
         Push = nullptr;
       break;
diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
index c61b6b0..be291a0 100644
--- a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
@@ -59,8 +59,8 @@ ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) {
 
   // First, strip off no-ops, including ObjC-specific no-ops, and try making a
   // precise alias query.
-  const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr);
-  const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr);
+  const Value *SA = GetRCIdentityRoot(LocA.Ptr);
+  const Value *SB = GetRCIdentityRoot(LocB.Ptr);
   AliasResult Result =
     AliasAnalysis::alias(Location(SA, LocA.Size, LocA.AATags),
                          Location(SB, LocB.Size, LocB.AATags));
@@ -92,7 +92,7 @@ ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc,
 
   // First, strip off no-ops, including ObjC-specific no-ops, and try making
   // a precise alias query.
-  const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr);
+  const Value *S = GetRCIdentityRoot(Loc.Ptr);
   if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.AATags),
                                             OrLocal))
     return true;
@@ -120,7 +120,7 @@ ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) {
     return AliasAnalysis::getModRefBehavior(F);
 
   switch (GetFunctionClass(F)) {
-  case IC_NoopCast:
+  case ARCInstKind::NoopCast:
     return DoesNotAccessMemory;
   default:
     break;
@@ -134,15 +134,15 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) {
   if (!EnableARCOpts)
     return AliasAnalysis::getModRefInfo(CS, Loc);
 
-  switch (GetBasicInstructionClass(CS.getInstruction())) {
-  case IC_Retain:
-  case IC_RetainRV:
-  case IC_Autorelease:
-  case IC_AutoreleaseRV:
-  case IC_NoopCast:
-  case IC_AutoreleasepoolPush:
-  case IC_FusedRetainAutorelease:
-  case IC_FusedRetainAutoreleaseRV:
+  switch (GetBasicARCInstKind(CS.getInstruction())) {
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV:
+  case ARCInstKind::Autorelease:
+  case ARCInstKind::AutoreleaseRV:
+  case ARCInstKind::NoopCast:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::FusedRetainAutorelease:
+  case ARCInstKind::FusedRetainAutoreleaseRV:
     // These functions don't access any memory visible to the compiler.
     // Note that this doesn't include objc_retainBlock, because it updates
     // pointers when it copies block data.
diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index eb325eb..6473d3a 100644
--- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -44,6 +44,10 @@ using namespace llvm::objcarc;
 STATISTIC(NumPeeps,       "Number of calls peephole-optimized");
 STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
 
+//===----------------------------------------------------------------------===//
+//                                Declarations
+//===----------------------------------------------------------------------===//
+
 namespace {
   /// \brief Late ARC optimizations
   ///
@@ -68,17 +72,23 @@ namespace {
     /// "tail".
     SmallPtrSet<CallInst *, 8> StoreStrongCalls;
 
-    bool OptimizeRetainCall(Function &F, Instruction *Retain);
+    /// Returns true if we eliminated Inst.
+    bool tryToPeepholeInstruction(Function &F, Instruction *Inst,
+                                  inst_iterator &Iter,
+                                  SmallPtrSetImpl<Instruction *> &DepInsts,
+                                  SmallPtrSetImpl<const BasicBlock *> &Visited,
+                                  bool &TailOkForStoreStrong);
 
-    bool ContractAutorelease(Function &F, Instruction *Autorelease,
-                             InstructionClass Class,
-                             SmallPtrSetImpl<Instruction *>
-                               &DependingInstructions,
-                             SmallPtrSetImpl<const BasicBlock *>
-                               &Visited);
+    bool optimizeRetainCall(Function &F, Instruction *Retain);
 
-    void ContractRelease(Instruction *Release,
-                         inst_iterator &Iter);
+    bool
+    contractAutorelease(Function &F, Instruction *Autorelease,
+                        ARCInstKind Class,
+                        SmallPtrSetImpl<Instruction *> &DependingInstructions,
+                        SmallPtrSetImpl<const BasicBlock *> &Visited);
+
+    void tryToContractReleaseIntoStoreStrong(Instruction *Release,
+                                             inst_iterator &Iter);
 
     void getAnalysisUsage(AnalysisUsage &AU) const override;
     bool doInitialization(Module &M) override;
@@ -92,30 +102,15 @@ namespace {
   };
 }
 
-char ObjCARCContract::ID = 0;
-INITIALIZE_PASS_BEGIN(ObjCARCContract,
-                      "objc-arc-contract", "ObjC ARC contraction", false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ObjCARCContract,
-                    "objc-arc-contract", "ObjC ARC contraction", false, false)
-
-Pass *llvm::createObjCARCContractPass() {
-  return new ObjCARCContract();
-}
-
-void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<AliasAnalysis>();
-  AU.addRequired<DominatorTreeWrapperPass>();
-  AU.setPreservesCFG();
-}
+//===----------------------------------------------------------------------===//
+//                               Implementation
+//===----------------------------------------------------------------------===//
 
 /// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
 /// return value. We do this late so we do not disrupt the dataflow analysis in
 /// ObjCARCOpt.
-bool
-ObjCARCContract::OptimizeRetainCall(Function &F, Instruction *Retain) {
-  ImmutableCallSite CS(GetObjCArg(Retain));
+bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) {
+  ImmutableCallSite CS(GetArgRCIdentityRoot(Retain));
   const Instruction *Call = CS.getInstruction();
   if (!Call)
     return false;
@@ -147,19 +142,16 @@ ObjCARCContract::OptimizeRetainCall(Function &F, Instruction *Retain) {
 }
 
 /// Merge an autorelease with a retain into a fused call.
-bool
-ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease,
-                                     InstructionClass Class,
-                                     SmallPtrSetImpl<Instruction *>
-                                       &DependingInstructions,
-                                     SmallPtrSetImpl<const BasicBlock *>
-                                       &Visited) {
-  const Value *Arg = GetObjCArg(Autorelease);
+bool ObjCARCContract::contractAutorelease(
+    Function &F, Instruction *Autorelease, ARCInstKind Class,
+    SmallPtrSetImpl<Instruction *> &DependingInstructions,
+    SmallPtrSetImpl<const BasicBlock *> &Visited) {
+  const Value *Arg = GetArgRCIdentityRoot(Autorelease);
 
   // Check that there are no instructions between the retain and the autorelease
   // (such as an autorelease_pop) which may change the count.
   CallInst *Retain = nullptr;
-  if (Class == IC_AutoreleaseRV)
+  if (Class == ARCInstKind::AutoreleaseRV)
     FindDependencies(RetainAutoreleaseRVDep, Arg,
                      Autorelease->getParent(), Autorelease,
                      DependingInstructions, Visited, PA);
@@ -177,94 +169,208 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease,
   Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
   DependingInstructions.clear();
 
-  if (!Retain ||
-      GetBasicInstructionClass(Retain) != IC_Retain ||
-      GetObjCArg(Retain) != Arg)
+  if (!Retain || GetBasicARCInstKind(Retain) != ARCInstKind::Retain ||
+      GetArgRCIdentityRoot(Retain) != Arg)
     return false;
 
   Changed = true;
   ++NumPeeps;
 
-  DEBUG(dbgs() << "ObjCARCContract::ContractAutorelease: Fusing "
-                  "retain/autorelease. Erasing: " << *Autorelease << "\n"
-                  "                                      Old Retain: "
-               << *Retain << "\n");
+  DEBUG(dbgs() << "    Fusing retain/autorelease!\n"
+                  "        Autorelease:" << *Autorelease << "\n"
+                  "        Retain: " << *Retain << "\n");
 
-  Constant *Decl = EP.get(Class == IC_AutoreleaseRV ?
-                          ARCRuntimeEntryPoints::EPT_RetainAutoreleaseRV :
-                          ARCRuntimeEntryPoints::EPT_RetainAutorelease);
+  Constant *Decl = EP.get(Class == ARCInstKind::AutoreleaseRV
+                              ? ARCRuntimeEntryPoints::EPT_RetainAutoreleaseRV
+                              : ARCRuntimeEntryPoints::EPT_RetainAutorelease);
   Retain->setCalledFunction(Decl);
 
-  DEBUG(dbgs() << "                                      New Retain: "
-               << *Retain << "\n");
+  DEBUG(dbgs() << "        New RetainAutorelease: " << *Retain << "\n");
 
   EraseInstruction(Autorelease);
   return true;
 }
 
-/// Attempt to merge an objc_release with a store, load, and objc_retain to form
-/// an objc_storeStrong. This can be a little tricky because the instructions
-/// don't always appear in order, and there may be unrelated intervening
-/// instructions.
-void ObjCARCContract::ContractRelease(Instruction *Release,
-                                      inst_iterator &Iter) {
-  LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release));
-  if (!Load || !Load->isSimple()) return;
+static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
+                                                         Instruction *Release,
+                                                         ProvenanceAnalysis &PA,
+                                                         AliasAnalysis *AA) {
+  StoreInst *Store = nullptr;
+  bool SawRelease = false;
 
-  // For now, require everything to be in one basic block.
-  BasicBlock *BB = Release->getParent();
-  if (Load->getParent() != BB) return;
+  // Get the location associated with Load.
+  AliasAnalysis::Location Loc = AA->getLocation(Load);
 
   // Walk down to find the store and the release, which may be in either order.
-  BasicBlock::iterator I = Load, End = BB->end();
-  ++I;
-  AliasAnalysis::Location Loc = AA->getLocation(Load);
-  StoreInst *Store = nullptr;
-  bool SawRelease = false;
-  for (; !Store || !SawRelease; ++I) {
-    if (I == End)
-      return;
+  for (auto I = std::next(BasicBlock::iterator(Load)),
+            E = Load->getParent()->end();
+       I != E; ++I) {
+    // If we found the store we were looking for and saw the release,
+    // break. There is no more work to be done.
+    if (Store && SawRelease)
+      break;
 
-    Instruction *Inst = I;
+    // Now we know that we have not seen either the store or the release. If I
+    // is the the release, mark that we saw the release and continue.
+    Instruction *Inst = &*I;
     if (Inst == Release) {
       SawRelease = true;
       continue;
     }
 
-    InstructionClass Class = GetBasicInstructionClass(Inst);
+    // Otherwise, we check if Inst is a "good" store. Grab the instruction class
+    // of Inst.
+    ARCInstKind Class = GetBasicARCInstKind(Inst);
 
-    // Unrelated retains are harmless.
+    // If Inst is an unrelated retain, we don't care about it.
+    //
+    // TODO: This is one area where the optimization could be made more
+    // aggressive.
     if (IsRetain(Class))
       continue;
 
+    // If we have seen the store, but not the release...
     if (Store) {
-      // The store is the point where we're going to put the objc_storeStrong,
-      // so make sure there are no uses after it.
-      if (CanUse(Inst, Load, PA, Class))
-        return;
-    } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) {
-      // We are moving the load down to the store, so check for anything
-      // else which writes to the memory between the load and the store.
-      Store = dyn_cast<StoreInst>(Inst);
-      if (!Store || !Store->isSimple()) return;
-      if (Store->getPointerOperand() != Loc.Ptr) return;
+      // We need to make sure that it is safe to move the release from its
+      // current position to the store. This implies proving that any
+      // instruction in between Store and the Release conservatively can not use
+      // the RCIdentityRoot of Release. If we can prove we can ignore Inst, so
+      // continue...
+      if (!CanUse(Inst, Load, PA, Class)) {
+        continue;
+      }
+
+      // Otherwise, be conservative and return nullptr.
+      return nullptr;
     }
+
+    // Ok, now we know we have not seen a store yet. See if Inst can write to
+    // our load location, if it can not, just ignore the instruction.
+    if (!(AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod))
+      continue;
+
+    Store = dyn_cast<StoreInst>(Inst);
+
+    // If Inst can, then check if Inst is a simple store. If Inst is not a
+    // store or a store that is not simple, then we have some we do not
+    // understand writing to this memory implying we can not move the load
+    // over the write to any subsequent store that we may find.
+    if (!Store || !Store->isSimple())
+      return nullptr;
+
+    // Then make sure that the pointer we are storing to is Ptr. If so, we
+    // found our Store!
+    if (Store->getPointerOperand() == Loc.Ptr)
+      continue;
+
+    // Otherwise, we have an unknown store to some other ptr that clobbers
+    // Loc.Ptr. Bail!
+    return nullptr;
   }
 
-  Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand());
+  // If we did not find the store or did not see the release, fail.
+  if (!Store || !SawRelease)
+    return nullptr;
+
+  // We succeeded!
+  return Store;
+}
 
-  // Walk up to find the retain.
-  I = Store;
-  BasicBlock::iterator Begin = BB->begin();
-  while (I != Begin && GetBasicInstructionClass(I) != IC_Retain)
+static Instruction *
+findRetainForStoreStrongContraction(Value *New, StoreInst *Store,
+                                    Instruction *Release,
+                                    ProvenanceAnalysis &PA) {
+  // Walk up from the Store to find the retain.
+  BasicBlock::iterator I = Store;
+  BasicBlock::iterator Begin = Store->getParent()->begin();
+  while (I != Begin && GetBasicARCInstKind(I) != ARCInstKind::Retain) {
+    Instruction *Inst = &*I;
+
+    // It is only safe to move the retain to the store if we can prove
+    // conservatively that nothing besides the release can decrement reference
+    // counts in between the retain and the store.
+    if (CanDecrementRefCount(Inst, New, PA) && Inst != Release)
+      return nullptr;
     --I;
+  }
   Instruction *Retain = I;
-  if (GetBasicInstructionClass(Retain) != IC_Retain) return;
-  if (GetObjCArg(Retain) != New) return;
+  if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain)
+    return nullptr;
+  if (GetArgRCIdentityRoot(Retain) != New)
+    return nullptr;
+  return Retain;
+}
+
+/// Attempt to merge an objc_release with a store, load, and objc_retain to form
+/// an objc_storeStrong. An objc_storeStrong:
+///
+///   objc_storeStrong(i8** %old_ptr, i8* new_value)
+///
+/// is equivalent to the following IR sequence:
+///
+///   ; Load old value.
+///   %old_value = load i8** %old_ptr               (1)
+///
+///   ; Increment the new value and then release the old value. This must occur
+///   ; in order in case old_value releases new_value in its destructor causing
+///   ; us to potentially have a dangling ptr.
+///   tail call i8* @objc_retain(i8* %new_value)    (2)
+///   tail call void @objc_release(i8* %old_value)  (3)
+///
+///   ; Store the new_value into old_ptr
+///   store i8* %new_value, i8** %old_ptr           (4)
+///
+/// The safety of this optimization is based around the following
+/// considerations:
+///
+///  1. We are forming the store strong at the store. Thus to perform this
+///     optimization it must be safe to move the retain, load, and release to
+///     (4).
+///  2. We need to make sure that any re-orderings of (1), (2), (3), (4) are
+///     safe.
+void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
+                                                          inst_iterator &Iter) {
+  // See if we are releasing something that we just loaded.
+  auto *Load = dyn_cast<LoadInst>(GetArgRCIdentityRoot(Release));
+  if (!Load || !Load->isSimple())
+    return;
+
+  // For now, require everything to be in one basic block.
+  BasicBlock *BB = Release->getParent();
+  if (Load->getParent() != BB)
+    return;
+
+  // First scan down the BB from Load, looking for a store of the RCIdentityRoot
+  // of Load's
+  StoreInst *Store =
+      findSafeStoreForStoreStrongContraction(Load, Release, PA, AA);
+  // If we fail, bail.
+  if (!Store)
+    return;
+
+  // Then find what new_value's RCIdentity Root is.
+  Value *New = GetRCIdentityRoot(Store->getValueOperand());
+
+  // Then walk up the BB and look for a retain on New without any intervening
+  // instructions which conservatively might decrement ref counts.
+  Instruction *Retain =
+      findRetainForStoreStrongContraction(New, Store, Release, PA);
+
+  // If we fail, bail.
+  if (!Retain)
+    return;
 
   Changed = true;
   ++NumStoreStrongs;
 
+  DEBUG(
+      llvm::dbgs() << "    Contracting retain, release into objc_storeStrong.\n"
+                   << "        Old:\n"
+                   << "            Store:   " << *Store << "\n"
+                   << "            Release: " << *Release << "\n"
+                   << "            Retain:  " << *Retain << "\n"
+                   << "            Load:    " << *Load << "\n");
+
   LLVMContext &C = Release->getContext();
   Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
   Type *I8XX = PointerType::getUnqual(I8X);
@@ -284,6 +390,8 @@ void ObjCARCContract::ContractRelease(Instruction *Release,
   // we can set the tail flag once we know it's safe.
   StoreStrongCalls.insert(StoreStrong);
 
+  DEBUG(llvm::dbgs() << "        New Store Strong: " << *StoreStrong << "\n");
+
   if (&*Iter == Store) ++Iter;
   Store->eraseFromParent();
   Release->eraseFromParent();
@@ -292,85 +400,34 @@ void ObjCARCContract::ContractRelease(Instruction *Release,
     Load->eraseFromParent();
 }
 
-bool ObjCARCContract::doInitialization(Module &M) {
-  // If nothing in the Module uses ARC, don't do anything.
-  Run = ModuleHasARC(M);
-  if (!Run)
-    return false;
-
-  EP.Initialize(&M);
-
-  // Initialize RetainRVMarker.
-  RetainRVMarker = nullptr;
-  if (NamedMDNode *NMD =
-        M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker"))
-    if (NMD->getNumOperands() == 1) {
-      const MDNode *N = NMD->getOperand(0);
-      if (N->getNumOperands() == 1)
-        if (const MDString *S = dyn_cast<MDString>(N->getOperand(0)))
-          RetainRVMarker = S;
-    }
-
-  return false;
-}
-
-bool ObjCARCContract::runOnFunction(Function &F) {
-  if (!EnableARCOpts)
-    return false;
-
-  // If nothing in the Module uses ARC, don't do anything.
-  if (!Run)
-    return false;
-
-  Changed = false;
-  AA = &getAnalysis<AliasAnalysis>();
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
-  PA.setAA(&getAnalysis<AliasAnalysis>());
-
-  // Track whether it's ok to mark objc_storeStrong calls with the "tail"
-  // keyword. Be conservative if the function has variadic arguments.
-  // It seems that functions which "return twice" are also unsafe for the
-  // "tail" argument, because they are setjmp, which could need to
-  // return to an earlier stack state.
-  bool TailOkForStoreStrongs = !F.isVarArg() &&
-                               !F.callsFunctionThatReturnsTwice();
-
-  // For ObjC library calls which return their argument, replace uses of the
-  // argument with uses of the call return value, if it dominates the use. This
-  // reduces register pressure.
-  SmallPtrSet<Instruction *, 4> DependingInstructions;
-  SmallPtrSet<const BasicBlock *, 4> Visited;
-  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
-    Instruction *Inst = &*I++;
-
-    DEBUG(dbgs() << "ObjCARCContract: Visiting: " << *Inst << "\n");
-
+bool ObjCARCContract::tryToPeepholeInstruction(
+  Function &F, Instruction *Inst, inst_iterator &Iter,
+  SmallPtrSetImpl<Instruction *> &DependingInsts,
+  SmallPtrSetImpl<const BasicBlock *> &Visited,
+  bool &TailOkForStoreStrongs) {
     // Only these library routines return their argument. In particular,
     // objc_retainBlock does not necessarily return its argument.
-    InstructionClass Class = GetBasicInstructionClass(Inst);
+  ARCInstKind Class = GetBasicARCInstKind(Inst);
     switch (Class) {
-    case IC_FusedRetainAutorelease:
-    case IC_FusedRetainAutoreleaseRV:
-      break;
-    case IC_Autorelease:
-    case IC_AutoreleaseRV:
-      if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited))
-        continue;
-      break;
-    case IC_Retain:
+    case ARCInstKind::FusedRetainAutorelease:
+    case ARCInstKind::FusedRetainAutoreleaseRV:
+      return false;
+    case ARCInstKind::Autorelease:
+    case ARCInstKind::AutoreleaseRV:
+      return contractAutorelease(F, Inst, Class, DependingInsts, Visited);
+    case ARCInstKind::Retain:
       // Attempt to convert retains to retainrvs if they are next to function
       // calls.
-      if (!OptimizeRetainCall(F, Inst))
-        break;
+      if (!optimizeRetainCall(F, Inst))
+        return false;
       // If we succeed in our optimization, fall through.
       // FALLTHROUGH
-    case IC_RetainRV: {
+    case ARCInstKind::RetainRV: {
       // If we're compiling for a target which needs a special inline-asm
       // marker to do the retainAutoreleasedReturnValue optimization,
       // insert it now.
       if (!RetainRVMarker)
-        break;
+        return false;
       BasicBlock::iterator BBI = Inst;
       BasicBlock *InstParent = Inst->getParent();
 
@@ -388,8 +445,8 @@ bool ObjCARCContract::runOnFunction(Function &F) {
         --BBI;
       } while (IsNoopInstruction(BBI));
 
-      if (&*BBI == GetObjCArg(Inst)) {
-        DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for "
+      if (&*BBI == GetArgRCIdentityRoot(Inst)) {
+        DEBUG(dbgs() << "Adding inline asm marker for "
                         "retainAutoreleasedReturnValue optimization.\n");
         Changed = true;
         InlineAsm *IA =
@@ -400,9 +457,9 @@ bool ObjCARCContract::runOnFunction(Function &F) {
         CallInst::Create(IA, "", Inst);
       }
     decline_rv_optimization:
-      break;
+      return false;
     }
-    case IC_InitWeak: {
+    case ARCInstKind::InitWeak: {
       // objc_initWeak(p, null) => *p = null
       CallInst *CI = cast<CallInst>(Inst);
       if (IsNullOrUndef(CI->getArgOperand(1))) {
@@ -417,31 +474,80 @@ bool ObjCARCContract::runOnFunction(Function &F) {
         CI->replaceAllUsesWith(Null);
         CI->eraseFromParent();
       }
-      continue;
+      return true;
     }
-    case IC_Release:
-      ContractRelease(Inst, I);
-      continue;
-    case IC_User:
+    case ARCInstKind::Release:
+      // Try to form an objc store strong from our release. If we fail, there is
+      // nothing further to do below, so continue.
+      tryToContractReleaseIntoStoreStrong(Inst, Iter);
+      return true;
+    case ARCInstKind::User:
       // Be conservative if the function has any alloca instructions.
       // Technically we only care about escaping alloca instructions,
       // but this is sufficient to handle some interesting cases.
       if (isa<AllocaInst>(Inst))
         TailOkForStoreStrongs = false;
-      continue;
-    case IC_IntrinsicUser:
+      return true;
+    case ARCInstKind::IntrinsicUser:
       // Remove calls to @clang.arc.use(...).
       Inst->eraseFromParent();
-      continue;
+      return true;
     default:
-      continue;
+      return true;
     }
+}
+
+//===----------------------------------------------------------------------===//
+//                              Top Level Driver
+//===----------------------------------------------------------------------===//
+
+bool ObjCARCContract::runOnFunction(Function &F) {
+  if (!EnableARCOpts)
+    return false;
+
+  // If nothing in the Module uses ARC, don't do anything.
+  if (!Run)
+    return false;
 
-    DEBUG(dbgs() << "ObjCARCContract: Finished List.\n\n");
+  Changed = false;
+  AA = &getAnalysis<AliasAnalysis>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  PA.setAA(&getAnalysis<AliasAnalysis>());
+
+  DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n");
+
+  // Track whether it's ok to mark objc_storeStrong calls with the "tail"
+  // keyword. Be conservative if the function has variadic arguments.
+  // It seems that functions which "return twice" are also unsafe for the
+  // "tail" argument, because they are setjmp, which could need to
+  // return to an earlier stack state.
+  bool TailOkForStoreStrongs =
+      !F.isVarArg() && !F.callsFunctionThatReturnsTwice();
 
-    // Don't use GetObjCArg because we don't want to look through bitcasts
+  // For ObjC library calls which return their argument, replace uses of the
+  // argument with uses of the call return value, if it dominates the use. This
+  // reduces register pressure.
+  SmallPtrSet<Instruction *, 4> DependingInstructions;
+  SmallPtrSet<const BasicBlock *, 4> Visited;
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E;) {
+    Instruction *Inst = &*I++;
+
+    DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
+
+    // First try to peephole Inst. If there is nothing further we can do in
+    // terms of undoing objc-arc-expand, process the next inst.
+    if (tryToPeepholeInstruction(F, Inst, I, DependingInstructions, Visited,
+                                 TailOkForStoreStrongs))
+      continue;
+
+    // Otherwise, try to undo objc-arc-expand.
+
+    // Don't use GetArgRCIdentityRoot because we don't want to look through bitcasts
     // and such; to do the replacement, the argument must have type i8*.
     Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
+
+    // TODO: Change this to a do-while.
     for (;;) {
       // If we're compiling bugpointed code, don't get in trouble.
       if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
@@ -458,7 +564,7 @@ bool ObjCARCContract::runOnFunction(Function &F) {
         // reachability here because an unreachable call is considered to
         // trivially dominate itself, which would lead us to rewriting its
         // argument in terms of its return value, which would lead to
-        // infinite loops in GetObjCArg.
+        // infinite loops in GetArgRCIdentityRoot.
         if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) {
           Changed = true;
           Instruction *Replacement = Inst;
@@ -514,3 +620,45 @@ bool ObjCARCContract::runOnFunction(Function &F) {
 
   return Changed;
 }
+
+//===----------------------------------------------------------------------===//
+//                             Misc Pass Manager
+//===----------------------------------------------------------------------===//
+
+char ObjCARCContract::ID = 0;
+INITIALIZE_PASS_BEGIN(ObjCARCContract, "objc-arc-contract",
+                      "ObjC ARC contraction", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ObjCARCContract, "objc-arc-contract",
+                    "ObjC ARC contraction", false, false)
+
+void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<AliasAnalysis>();
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.setPreservesCFG();
+}
+
+Pass *llvm::createObjCARCContractPass() { return new ObjCARCContract(); }
+
+bool ObjCARCContract::doInitialization(Module &M) {
+  // If nothing in the Module uses ARC, don't do anything.
+  Run = ModuleHasARC(M);
+  if (!Run)
+    return false;
+
+  EP.Initialize(&M);
+
+  // Initialize RetainRVMarker.
+  RetainRVMarker = nullptr;
+  if (NamedMDNode *NMD =
+          M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker"))
+    if (NMD->getNumOperands() == 1) {
+      const MDNode *N = NMD->getOperand(0);
+      if (N->getNumOperands() == 1)
+        if (const MDString *S = dyn_cast<MDString>(N->getOperand(0)))
+          RetainRVMarker = S;
+    }
+
+  return false;
+}
diff --git a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
index bf9fcbb..53c19c3 100644
--- a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
@@ -99,13 +99,13 @@ bool ObjCARCExpand::runOnFunction(Function &F) {
 
     DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
 
-    switch (GetBasicInstructionClass(Inst)) {
-    case IC_Retain:
-    case IC_RetainRV:
-    case IC_Autorelease:
-    case IC_AutoreleaseRV:
-    case IC_FusedRetainAutorelease:
-    case IC_FusedRetainAutoreleaseRV: {
+    switch (GetBasicARCInstKind(Inst)) {
+    case ARCInstKind::Retain:
+    case ARCInstKind::RetainRV:
+    case ARCInstKind::Autorelease:
+    case ARCInstKind::AutoreleaseRV:
+    case ARCInstKind::FusedRetainAutorelease:
+    case ARCInstKind::FusedRetainAutoreleaseRV: {
       // These calls return their argument verbatim, as a low-level
       // optimization. However, this makes high-level optimizations
       // harder. Undo any uses of this optimization that the front-end
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 95c6674..f55b77f 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -144,7 +144,7 @@ namespace {
 /// \defgroup ARCUtilities Utility declarations/definitions specific to ARC.
 /// @{
 
-/// \brief This is similar to StripPointerCastsAndObjCCalls but it stops as soon
+/// \brief This is similar to GetRCIdentityRoot but it stops as soon
 /// as it finds a value with multiple uses.
 static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
   if (Arg->hasOneUse()) {
@@ -153,7 +153,7 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
     if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg))
       if (GEP->hasAllZeroIndices())
         return FindSingleUseIdentifiedObject(GEP->getPointerOperand());
-    if (IsForwarding(GetBasicInstructionClass(Arg)))
+    if (IsForwarding(GetBasicARCInstKind(Arg)))
       return FindSingleUseIdentifiedObject(
                cast<CallInst>(Arg)->getArgOperand(0));
     if (!IsObjCIdentifiedObject(Arg))
@@ -165,7 +165,7 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
   // trivial uses, we can still consider this to be a single-use value.
   if (IsObjCIdentifiedObject(Arg)) {
     for (const User *U : Arg->users())
-      if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg)
+      if (!U->use_empty() || GetRCIdentityRoot(U) != Arg)
          return nullptr;
 
     return Arg;
@@ -880,11 +880,9 @@ static void AppendMDNodeToInstForPtr(unsigned NodeId,
                                      Sequence OldSeq,
                                      Sequence NewSeq) {
   MDNode *Node = nullptr;
-  Value *tmp[3] = {PtrSourceMDNodeID,
-                   SequenceToMDString(Inst->getContext(),
-                                      OldSeq),
-                   SequenceToMDString(Inst->getContext(),
-                                      NewSeq)};
+  Metadata *tmp[3] = {PtrSourceMDNodeID,
+                      SequenceToMDString(Inst->getContext(), OldSeq),
+                      SequenceToMDString(Inst->getContext(), NewSeq)};
   Node = MDNode::get(Inst->getContext(), tmp);
 
   Inst->setMetadata(NodeId, Node);
@@ -1098,7 +1096,7 @@ namespace {
 
     bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
     void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
-                                   InstructionClass &Class);
+                                   ARCInstKind &Class);
     void OptimizeIndividualCalls(Function &F);
 
     void CheckForCFGHazards(const BasicBlock *BB,
@@ -1193,7 +1191,7 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const {
 bool
 ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
   // Check for the argument being from an immediately preceding call or invoke.
-  const Value *Arg = GetObjCArg(RetainRV);
+  const Value *Arg = GetArgRCIdentityRoot(RetainRV);
   ImmutableCallSite CS(Arg);
   if (const Instruction *Call = CS.getInstruction()) {
     if (Call->getParent() == RetainRV->getParent()) {
@@ -1218,8 +1216,8 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
   BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin();
   if (I != Begin) {
     do --I; while (I != Begin && IsNoopInstruction(I));
-    if (GetBasicInstructionClass(I) == IC_AutoreleaseRV &&
-        GetObjCArg(I) == Arg) {
+    if (GetBasicARCInstKind(I) == ARCInstKind::AutoreleaseRV &&
+        GetArgRCIdentityRoot(I) == Arg) {
       Changed = true;
       ++NumPeeps;
 
@@ -1250,17 +1248,17 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
 
 /// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not
 /// used as a return value.
-void
-ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
-                                      InstructionClass &Class) {
+void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
+                                           Instruction *AutoreleaseRV,
+                                           ARCInstKind &Class) {
   // Check for a return of the pointer value.
-  const Value *Ptr = GetObjCArg(AutoreleaseRV);
+  const Value *Ptr = GetArgRCIdentityRoot(AutoreleaseRV);
   SmallVector<const Value *, 2> Users;
   Users.push_back(Ptr);
   do {
     Ptr = Users.pop_back_val();
     for (const User *U : Ptr->users()) {
-      if (isa<ReturnInst>(U) || GetBasicInstructionClass(U) == IC_RetainRV)
+      if (isa<ReturnInst>(U) || GetBasicARCInstKind(U) == ARCInstKind::RetainRV)
         return;
       if (isa<BitCastInst>(U))
         Users.push_back(U);
@@ -1279,7 +1277,7 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
   Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Autorelease);
   AutoreleaseRVCI->setCalledFunction(NewDecl);
   AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
-  Class = IC_Autorelease;
+  Class = ARCInstKind::Autorelease;
 
   DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n");
 
@@ -1296,7 +1294,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
   for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
     Instruction *Inst = &*I++;
 
-    InstructionClass Class = GetBasicInstructionClass(Inst);
+    ARCInstKind Class = GetBasicARCInstKind(Inst);
 
     DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
 
@@ -1311,7 +1309,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
     // There are gray areas here, as the ability to cast reference-counted
     // pointers to raw void* and back allows code to break ARC assumptions,
     // however these are currently considered to be unimportant.
-    case IC_NoopCast:
+    case ARCInstKind::NoopCast:
       Changed = true;
       ++NumNoops;
       DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n");
@@ -1319,11 +1317,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
       continue;
 
     // If the pointer-to-weak-pointer is null, it's undefined behavior.
-    case IC_StoreWeak:
-    case IC_LoadWeak:
-    case IC_LoadWeakRetained:
-    case IC_InitWeak:
-    case IC_DestroyWeak: {
+    case ARCInstKind::StoreWeak:
+    case ARCInstKind::LoadWeak:
+    case ARCInstKind::LoadWeakRetained:
+    case ARCInstKind::InitWeak:
+    case ARCInstKind::DestroyWeak: {
       CallInst *CI = cast<CallInst>(Inst);
       if (IsNullOrUndef(CI->getArgOperand(0))) {
         Changed = true;
@@ -1340,8 +1338,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
       }
       break;
     }
-    case IC_CopyWeak:
-    case IC_MoveWeak: {
+    case ARCInstKind::CopyWeak:
+    case ARCInstKind::MoveWeak: {
       CallInst *CI = cast<CallInst>(Inst);
       if (IsNullOrUndef(CI->getArgOperand(0)) ||
           IsNullOrUndef(CI->getArgOperand(1))) {
@@ -1361,11 +1359,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
       }
       break;
     }
-    case IC_RetainRV:
+    case ARCInstKind::RetainRV:
       if (OptimizeRetainRVCall(F, Inst))
         continue;
       break;
-    case IC_AutoreleaseRV:
+    case ARCInstKind::AutoreleaseRV:
       OptimizeAutoreleaseRVCall(F, Inst, Class);
       break;
     }
@@ -1393,7 +1391,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
 
         EraseInstruction(Call);
         Inst = NewCall;
-        Class = IC_Release;
+        Class = ARCInstKind::Release;
       }
     }
 
@@ -1424,11 +1422,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
     }
 
     if (!IsNoopOnNull(Class)) {
-      UsedInThisFunction |= 1 << Class;
+      UsedInThisFunction |= 1 << unsigned(Class);
       continue;
     }
 
-    const Value *Arg = GetObjCArg(Inst);
+    const Value *Arg = GetArgRCIdentityRoot(Inst);
 
     // ARC calls with null are no-ops. Delete them.
     if (IsNullOrUndef(Arg)) {
@@ -1442,7 +1440,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
 
     // Keep track of which of retain, release, autorelease, and retain_block
     // are actually present in this function.
-    UsedInThisFunction |= 1 << Class;
+    UsedInThisFunction |= 1 << unsigned(Class);
 
     // If Arg is a PHI, and one or more incoming values to the
     // PHI are null, and the call is control-equivalent to the PHI, and there
@@ -1465,7 +1463,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
       bool HasCriticalEdges = false;
       for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
         Value *Incoming =
-          StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
+          GetRCIdentityRoot(PN->getIncomingValue(i));
         if (IsNullOrUndef(Incoming))
           HasNull = true;
         else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back())
@@ -1482,25 +1480,25 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
         // Check that there is nothing that cares about the reference
         // count between the call and the phi.
         switch (Class) {
-        case IC_Retain:
-        case IC_RetainBlock:
+        case ARCInstKind::Retain:
+        case ARCInstKind::RetainBlock:
           // These can always be moved up.
           break;
-        case IC_Release:
+        case ARCInstKind::Release:
           // These can't be moved across things that care about the retain
           // count.
           FindDependencies(NeedsPositiveRetainCount, Arg,
                            Inst->getParent(), Inst,
                            DependingInstructions, Visited, PA);
           break;
-        case IC_Autorelease:
+        case ARCInstKind::Autorelease:
           // These can't be moved across autorelease pool scope boundaries.
           FindDependencies(AutoreleasePoolBoundary, Arg,
                            Inst->getParent(), Inst,
                            DependingInstructions, Visited, PA);
           break;
-        case IC_RetainRV:
-        case IC_AutoreleaseRV:
+        case ARCInstKind::RetainRV:
+        case ARCInstKind::AutoreleaseRV:
           // Don't move these; the RV optimization depends on the autoreleaseRV
           // being tail called, and the retainRV being immediately after a call
           // (which might still happen if we get lucky with codegen layout, but
@@ -1519,7 +1517,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
           Type *ParamTy = CInst->getArgOperand(0)->getType();
           for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
             Value *Incoming =
-              StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
+              GetRCIdentityRoot(PN->getIncomingValue(i));
             if (!IsNullOrUndef(Incoming)) {
               CallInst *Clone = cast<CallInst>(CInst->clone());
               Value *Op = PN->getIncomingValue(i);
@@ -1713,14 +1711,14 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
                                      MapVector<Value *, RRInfo> &Retains,
                                      BBState &MyStates) {
   bool NestingDetected = false;
-  InstructionClass Class = GetInstructionClass(Inst);
+  ARCInstKind Class = GetARCInstKind(Inst);
   const Value *Arg = nullptr;
 
   DEBUG(dbgs() << "Class: " << Class << "\n");
 
   switch (Class) {
-  case IC_Release: {
-    Arg = GetObjCArg(Inst);
+  case ARCInstKind::Release: {
+    Arg = GetArgRCIdentityRoot(Inst);
 
     PtrState &S = MyStates.getPtrBottomUpState(Arg);
 
@@ -1747,14 +1745,14 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     S.SetKnownPositiveRefCount();
     break;
   }
-  case IC_RetainBlock:
+  case ARCInstKind::RetainBlock:
     // In OptimizeIndividualCalls, we have strength reduced all optimizable
     // objc_retainBlocks to objc_retains. Thus at this point any
     // objc_retainBlocks that we see are not optimizable.
     break;
-  case IC_Retain:
-  case IC_RetainRV: {
-    Arg = GetObjCArg(Inst);
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV: {
+    Arg = GetArgRCIdentityRoot(Inst);
 
     PtrState &S = MyStates.getPtrBottomUpState(Arg);
     S.SetKnownPositiveRefCount();
@@ -1771,9 +1769,10 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
         S.ClearReverseInsertPts();
       // FALL THROUGH
     case S_CanRelease:
-      // Don't do retain+release tracking for IC_RetainRV, because it's
+      // Don't do retain+release tracking for ARCInstKind::RetainRV,
+      // because it's
       // better to let it remain as the first instruction after a call.
-      if (Class != IC_RetainRV)
+      if (Class != ARCInstKind::RetainRV)
         Retains[Inst] = S.GetRRInfo();
       S.ClearSequenceProgress();
       break;
@@ -1786,15 +1785,15 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     // A retain moving bottom up can be a use.
     break;
   }
-  case IC_AutoreleasepoolPop:
+  case ARCInstKind::AutoreleasepoolPop:
     // Conservatively, clear MyStates for all known pointers.
     MyStates.clearBottomUpPointers();
     return NestingDetected;
-  case IC_AutoreleasepoolPush:
-  case IC_None:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::None:
     // These are irrelevant.
     return NestingDetected;
-  case IC_User:
+  case ARCInstKind::User:
     // If we have a store into an alloca of a pointer we are tracking, the
     // pointer has multiple owners implying that we must be more conservative.
     //
@@ -1810,7 +1809,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
       if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand())) {
         BBState::ptr_iterator I = MyStates.findPtrBottomUpState(
-          StripPointerCastsAndObjCCalls(SI->getValueOperand()));
+          GetRCIdentityRoot(SI->getValueOperand()));
         if (I != MyStates.bottom_up_ptr_end())
           MultiOwnersSet.insert(I->first);
       }
@@ -1969,24 +1968,25 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
                                     DenseMap<Value *, RRInfo> &Releases,
                                     BBState &MyStates) {
   bool NestingDetected = false;
-  InstructionClass Class = GetInstructionClass(Inst);
+  ARCInstKind Class = GetARCInstKind(Inst);
   const Value *Arg = nullptr;
 
   switch (Class) {
-  case IC_RetainBlock:
+  case ARCInstKind::RetainBlock:
     // In OptimizeIndividualCalls, we have strength reduced all optimizable
     // objc_retainBlocks to objc_retains. Thus at this point any
     // objc_retainBlocks that we see are not optimizable.
     break;
-  case IC_Retain:
-  case IC_RetainRV: {
-    Arg = GetObjCArg(Inst);
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV: {
+    Arg = GetArgRCIdentityRoot(Inst);
 
     PtrState &S = MyStates.getPtrTopDownState(Arg);
 
-    // Don't do retain+release tracking for IC_RetainRV, because it's
+    // Don't do retain+release tracking for ARCInstKind::RetainRV, because
+    // it's
     // better to let it remain as the first instruction after a call.
-    if (Class != IC_RetainRV) {
+    if (Class != ARCInstKind::RetainRV) {
       // If we see two retains in a row on the same pointer. If so, make
       // a note, and we'll cicle back to revisit it after we've
       // hopefully eliminated the second retain, which may allow us to
@@ -2009,8 +2009,8 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
     // code below.
     break;
   }
-  case IC_Release: {
-    Arg = GetObjCArg(Inst);
+  case ARCInstKind::Release: {
+    Arg = GetArgRCIdentityRoot(Inst);
 
     PtrState &S = MyStates.getPtrTopDownState(Arg);
     S.ClearKnownPositiveRefCount();
@@ -2041,12 +2041,12 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
     }
     break;
   }
-  case IC_AutoreleasepoolPop:
+  case ARCInstKind::AutoreleasepoolPop:
     // Conservatively, clear MyStates for all known pointers.
     MyStates.clearTopDownPointers();
     return NestingDetected;
-  case IC_AutoreleasepoolPush:
-  case IC_None:
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::None:
     // These are irrelevant.
     return NestingDetected;
   default:
@@ -2374,7 +2374,7 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
       const RRInfo &NewRetainRRI = It->second;
       KnownSafeTD &= NewRetainRRI.KnownSafe;
       MultipleOwners =
-        MultipleOwners || MultiOwnersSet.count(GetObjCArg(NewRetain));
+        MultipleOwners || MultiOwnersSet.count(GetArgRCIdentityRoot(NewRetain));
       for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
         DenseMap<Value *, RRInfo>::const_iterator Jt =
           Releases.find(NewRetainRelease);
@@ -2583,7 +2583,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
 
     DEBUG(dbgs() << "Visiting: " << *Retain << "\n");
 
-    Value *Arg = GetObjCArg(Retain);
+    Value *Arg = GetArgRCIdentityRoot(Retain);
 
     // If the object being released is in static or stack storage, we know it's
     // not being managed by ObjC reference counting, so we can delete pairs
@@ -2595,7 +2595,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
     if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
       if (const GlobalVariable *GV =
             dyn_cast<GlobalVariable>(
-              StripPointerCastsAndObjCCalls(LI->getPointerOperand())))
+              GetRCIdentityRoot(LI->getPointerOperand())))
         if (GV->isConstant())
           KnownSafe = true;
 
@@ -2642,12 +2642,13 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
 
     DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
 
-    InstructionClass Class = GetBasicInstructionClass(Inst);
-    if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained)
+    ARCInstKind Class = GetBasicARCInstKind(Inst);
+    if (Class != ARCInstKind::LoadWeak &&
+        Class != ARCInstKind::LoadWeakRetained)
       continue;
 
     // Delete objc_loadWeak calls with no users.
-    if (Class == IC_LoadWeak && Inst->use_empty()) {
+    if (Class == ARCInstKind::LoadWeak && Inst->use_empty()) {
       Inst->eraseFromParent();
       continue;
     }
@@ -2662,10 +2663,10 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
                               J = Current.getInstructionIterator();
          J != B; --J) {
       Instruction *EarlierInst = &*std::prev(J);
-      InstructionClass EarlierClass = GetInstructionClass(EarlierInst);
+      ARCInstKind EarlierClass = GetARCInstKind(EarlierInst);
       switch (EarlierClass) {
-      case IC_LoadWeak:
-      case IC_LoadWeakRetained: {
+      case ARCInstKind::LoadWeak:
+      case ARCInstKind::LoadWeakRetained: {
         // If this is loading from the same pointer, replace this load's value
         // with that one.
         CallInst *Call = cast<CallInst>(Inst);
@@ -2676,7 +2677,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
         case AliasAnalysis::MustAlias:
           Changed = true;
           // If the load has a builtin retain, insert a plain retain for it.
-          if (Class == IC_LoadWeakRetained) {
+          if (Class == ARCInstKind::LoadWeakRetained) {
             Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
             CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
             CI->setTailCall();
@@ -2693,8 +2694,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
         }
         break;
       }
-      case IC_StoreWeak:
-      case IC_InitWeak: {
+      case ARCInstKind::StoreWeak:
+      case ARCInstKind::InitWeak: {
         // If this is storing to the same pointer and has the same size etc.
         // replace this load's value with the stored value.
         CallInst *Call = cast<CallInst>(Inst);
@@ -2705,7 +2706,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
         case AliasAnalysis::MustAlias:
           Changed = true;
           // If the load has a builtin retain, insert a plain retain for it.
-          if (Class == IC_LoadWeakRetained) {
+          if (Class == ARCInstKind::LoadWeakRetained) {
             Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
             CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
             CI->setTailCall();
@@ -2722,14 +2723,14 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
         }
         break;
       }
-      case IC_MoveWeak:
-      case IC_CopyWeak:
+      case ARCInstKind::MoveWeak:
+      case ARCInstKind::CopyWeak:
         // TOOD: Grab the copied value.
         goto clobbered;
-      case IC_AutoreleasepoolPush:
-      case IC_None:
-      case IC_IntrinsicUser:
-      case IC_User:
+      case ARCInstKind::AutoreleasepoolPush:
+      case ARCInstKind::None:
+      case ARCInstKind::IntrinsicUser:
+      case ARCInstKind::User:
         // Weak pointers are only modified through the weak entry points
         // (and arbitrary calls, which could call the weak entry points).
         break;
@@ -2745,8 +2746,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
   // the alloca and all its users can be zapped.
   for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
     Instruction *Inst = &*I++;
-    InstructionClass Class = GetBasicInstructionClass(Inst);
-    if (Class != IC_DestroyWeak)
+    ARCInstKind Class = GetBasicARCInstKind(Inst);
+    if (Class != ARCInstKind::DestroyWeak)
       continue;
 
     CallInst *Call = cast<CallInst>(Inst);
@@ -2754,10 +2755,10 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
     if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
       for (User *U : Alloca->users()) {
         const Instruction *UserInst = cast<Instruction>(U);
-        switch (GetBasicInstructionClass(UserInst)) {
-        case IC_InitWeak:
-        case IC_StoreWeak:
-        case IC_DestroyWeak:
+        switch (GetBasicARCInstKind(UserInst)) {
+        case ARCInstKind::InitWeak:
+        case ARCInstKind::StoreWeak:
+        case ARCInstKind::DestroyWeak:
           continue;
         default:
           goto done;
@@ -2766,13 +2767,13 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
       Changed = true;
       for (auto UI = Alloca->user_begin(), UE = Alloca->user_end(); UI != UE;) {
         CallInst *UserInst = cast<CallInst>(*UI++);
-        switch (GetBasicInstructionClass(UserInst)) {
-        case IC_InitWeak:
-        case IC_StoreWeak:
+        switch (GetBasicARCInstKind(UserInst)) {
+        case ARCInstKind::InitWeak:
+        case ARCInstKind::StoreWeak:
           // These functions return their second argument.
           UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
           break;
-        case IC_DestroyWeak:
+        case ARCInstKind::DestroyWeak:
           // No return value.
           break;
         default:
@@ -2835,8 +2836,8 @@ HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain,
     return false;
 
   // Check that the call is a regular call.
-  InstructionClass Class = GetBasicInstructionClass(Call);
-  if (Class != IC_CallOrUser && Class != IC_Call)
+  ARCInstKind Class = GetBasicARCInstKind(Call);
+  if (Class != ARCInstKind::CallOrUser && Class != ARCInstKind::Call)
     return false;
 
   return true;
@@ -2860,9 +2861,8 @@ FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB,
     dyn_cast_or_null<CallInst>(*DepInsts.begin());
 
   // Check that we found a retain with the same argument.
-  if (!Retain ||
-      !IsRetain(GetBasicInstructionClass(Retain)) ||
-      GetObjCArg(Retain) != Arg) {
+  if (!Retain || !IsRetain(GetBasicARCInstKind(Retain)) ||
+      GetArgRCIdentityRoot(Retain) != Arg) {
     return nullptr;
   }
 
@@ -2887,10 +2887,10 @@ FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB,
     dyn_cast_or_null<CallInst>(*DepInsts.begin());
   if (!Autorelease)
     return nullptr;
-  InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease);
+  ARCInstKind AutoreleaseClass = GetBasicARCInstKind(Autorelease);
   if (!IsAutorelease(AutoreleaseClass))
     return nullptr;
-  if (GetObjCArg(Autorelease) != Arg)
+  if (GetArgRCIdentityRoot(Autorelease) != Arg)
     return nullptr;
 
   return Autorelease;
@@ -2921,7 +2921,7 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {
     if (!Ret)
       continue;
 
-    const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0));
+    const Value *Arg = GetRCIdentityRoot(Ret->getOperand(0));
 
     // Look for an ``autorelease'' instruction that is a predecessor of Ret and
     // dependent on Arg such that there are no instructions dependent on Arg
@@ -2976,13 +2976,13 @@ ObjCARCOpt::GatherStatistics(Function &F, bool AfterOptimization) {
 
   for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
     Instruction *Inst = &*I++;
-    switch (GetBasicInstructionClass(Inst)) {
+    switch (GetBasicARCInstKind(Inst)) {
     default:
       break;
-    case IC_Retain:
+    case ARCInstKind::Retain:
       ++NumRetains;
       break;
-    case IC_Release:
+    case ARCInstKind::Release:
       ++NumReleases;
       break;
     }
@@ -3054,27 +3054,27 @@ bool ObjCARCOpt::runOnFunction(Function &F) {
   OptimizeIndividualCalls(F);
 
   // Optimizations for weak pointers.
-  if (UsedInThisFunction & ((1 << IC_LoadWeak) |
-                            (1 << IC_LoadWeakRetained) |
-                            (1 << IC_StoreWeak) |
-                            (1 << IC_InitWeak) |
-                            (1 << IC_CopyWeak) |
-                            (1 << IC_MoveWeak) |
-                            (1 << IC_DestroyWeak)))
+  if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::LoadWeak)) |
+                            (1 << unsigned(ARCInstKind::LoadWeakRetained)) |
+                            (1 << unsigned(ARCInstKind::StoreWeak)) |
+                            (1 << unsigned(ARCInstKind::InitWeak)) |
+                            (1 << unsigned(ARCInstKind::CopyWeak)) |
+                            (1 << unsigned(ARCInstKind::MoveWeak)) |
+                            (1 << unsigned(ARCInstKind::DestroyWeak))))
     OptimizeWeakCalls(F);
 
   // Optimizations for retain+release pairs.
-  if (UsedInThisFunction & ((1 << IC_Retain) |
-                            (1 << IC_RetainRV) |
-                            (1 << IC_RetainBlock)))
-    if (UsedInThisFunction & (1 << IC_Release))
+  if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Retain)) |
+                            (1 << unsigned(ARCInstKind::RetainRV)) |
+                            (1 << unsigned(ARCInstKind::RetainBlock))))
+    if (UsedInThisFunction & (1 << unsigned(ARCInstKind::Release)))
       // Run OptimizeSequences until it either stops making changes or
       // no retain+release pair nesting is detected.
       while (OptimizeSequences(F)) {}
 
   // Optimizations if objc_autorelease is used.
-  if (UsedInThisFunction & ((1 << IC_Autorelease) |
-                            (1 << IC_AutoreleaseRV)))
+  if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Autorelease)) |
+                            (1 << unsigned(ARCInstKind::AutoreleaseRV))))
     OptimizeReturns(F);
 
   // Gather statistics after optimization.
diff --git a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
deleted file mode 100644
index 53c077e..0000000
--- a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-//===- ObjCARCUtil.cpp - ObjC ARC Optimization ----------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file defines several utility functions used by various ARC
-/// optimizations which are IMHO too big to be in a header file.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "ObjCARC.h"
-#include "llvm/IR/Intrinsics.h"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS,
-                                       const InstructionClass Class) {
-  switch (Class) {
-  case IC_Retain:
-    return OS << "IC_Retain";
-  case IC_RetainRV:
-    return OS << "IC_RetainRV";
-  case IC_RetainBlock:
-    return OS << "IC_RetainBlock";
-  case IC_Release:
-    return OS << "IC_Release";
-  case IC_Autorelease:
-    return OS << "IC_Autorelease";
-  case IC_AutoreleaseRV:
-    return OS << "IC_AutoreleaseRV";
-  case IC_AutoreleasepoolPush:
-    return OS << "IC_AutoreleasepoolPush";
-  case IC_AutoreleasepoolPop:
-    return OS << "IC_AutoreleasepoolPop";
-  case IC_NoopCast:
-    return OS << "IC_NoopCast";
-  case IC_FusedRetainAutorelease:
-    return OS << "IC_FusedRetainAutorelease";
-  case IC_FusedRetainAutoreleaseRV:
-    return OS << "IC_FusedRetainAutoreleaseRV";
-  case IC_LoadWeakRetained:
-    return OS << "IC_LoadWeakRetained";
-  case IC_StoreWeak:
-    return OS << "IC_StoreWeak";
-  case IC_InitWeak:
-    return OS << "IC_InitWeak";
-  case IC_LoadWeak:
-    return OS << "IC_LoadWeak";
-  case IC_MoveWeak:
-    return OS << "IC_MoveWeak";
-  case IC_CopyWeak:
-    return OS << "IC_CopyWeak";
-  case IC_DestroyWeak:
-    return OS << "IC_DestroyWeak";
-  case IC_StoreStrong:
-    return OS << "IC_StoreStrong";
-  case IC_CallOrUser:
-    return OS << "IC_CallOrUser";
-  case IC_Call:
-    return OS << "IC_Call";
-  case IC_User:
-    return OS << "IC_User";
-  case IC_IntrinsicUser:
-    return OS << "IC_IntrinsicUser";
-  case IC_None:
-    return OS << "IC_None";
-  }
-  llvm_unreachable("Unknown instruction class!");
-}
-
-InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) {
-  Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
-
-  // No (mandatory) arguments.
-  if (AI == AE)
-    return StringSwitch<InstructionClass>(F->getName())
-      .Case("objc_autoreleasePoolPush",  IC_AutoreleasepoolPush)
-      .Case("clang.arc.use", IC_IntrinsicUser)
-      .Default(IC_CallOrUser);
-
-  // One argument.
-  const Argument *A0 = AI++;
-  if (AI == AE)
-    // Argument is a pointer.
-    if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) {
-      Type *ETy = PTy->getElementType();
-      // Argument is i8*.
-      if (ETy->isIntegerTy(8))
-        return StringSwitch<InstructionClass>(F->getName())
-          .Case("objc_retain",                IC_Retain)
-          .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV)
-          .Case("objc_retainBlock",           IC_RetainBlock)
-          .Case("objc_release",               IC_Release)
-          .Case("objc_autorelease",           IC_Autorelease)
-          .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV)
-          .Case("objc_autoreleasePoolPop",    IC_AutoreleasepoolPop)
-          .Case("objc_retainedObject",        IC_NoopCast)
-          .Case("objc_unretainedObject",      IC_NoopCast)
-          .Case("objc_unretainedPointer",     IC_NoopCast)
-          .Case("objc_retain_autorelease",    IC_FusedRetainAutorelease)
-          .Case("objc_retainAutorelease",     IC_FusedRetainAutorelease)
-          .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV)
-          .Case("objc_sync_enter", IC_User)
-          .Case("objc_sync_exit", IC_User)
-          .Default(IC_CallOrUser);
-
-      // Argument is i8**
-      if (PointerType *Pte = dyn_cast<PointerType>(ETy))
-        if (Pte->getElementType()->isIntegerTy(8))
-          return StringSwitch<InstructionClass>(F->getName())
-            .Case("objc_loadWeakRetained",      IC_LoadWeakRetained)
-            .Case("objc_loadWeak",              IC_LoadWeak)
-            .Case("objc_destroyWeak",           IC_DestroyWeak)
-            .Default(IC_CallOrUser);
-    }
-
-  // Two arguments, first is i8**.
-  const Argument *A1 = AI++;
-  if (AI == AE)
-    if (PointerType *PTy = dyn_cast<PointerType>(A0->getType()))
-      if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType()))
-        if (Pte->getElementType()->isIntegerTy(8))
-          if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) {
-            Type *ETy1 = PTy1->getElementType();
-            // Second argument is i8*
-            if (ETy1->isIntegerTy(8))
-              return StringSwitch<InstructionClass>(F->getName())
-                .Case("objc_storeWeak",             IC_StoreWeak)
-                .Case("objc_initWeak",              IC_InitWeak)
-                .Case("objc_storeStrong",           IC_StoreStrong)
-                .Default(IC_CallOrUser);
-            // Second argument is i8**.
-            if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1))
-              if (Pte1->getElementType()->isIntegerTy(8))
-                return StringSwitch<InstructionClass>(F->getName())
-                  .Case("objc_moveWeak",              IC_MoveWeak)
-                  .Case("objc_copyWeak",              IC_CopyWeak)
-                  // Ignore annotation calls. This is important to stop the
-                  // optimizer from treating annotations as uses which would
-                  // make the state of the pointers they are attempting to
-                  // elucidate to be incorrect.
-                  .Case("llvm.arc.annotation.topdown.bbstart", IC_None)
-                  .Case("llvm.arc.annotation.topdown.bbend", IC_None)
-                  .Case("llvm.arc.annotation.bottomup.bbstart", IC_None)
-                  .Case("llvm.arc.annotation.bottomup.bbend", IC_None)
-                  .Default(IC_CallOrUser);
-          }
-
-  // Anything else.
-  return IC_CallOrUser;
-}
-
-/// \brief Determine what kind of construct V is.
-InstructionClass
-llvm::objcarc::GetInstructionClass(const Value *V) {
-  if (const Instruction *I = dyn_cast<Instruction>(V)) {
-    // Any instruction other than bitcast and gep with a pointer operand have a
-    // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer
-    // to a subsequent use, rather than using it themselves, in this sense.
-    // As a short cut, several other opcodes are known to have no pointer
-    // operands of interest. And ret is never followed by a release, so it's
-    // not interesting to examine.
-    switch (I->getOpcode()) {
-    case Instruction::Call: {
-      const CallInst *CI = cast<CallInst>(I);
-      // Check for calls to special functions.
-      if (const Function *F = CI->getCalledFunction()) {
-        InstructionClass Class = GetFunctionClass(F);
-        if (Class != IC_CallOrUser)
-          return Class;
-
-        // None of the intrinsic functions do objc_release. For intrinsics, the
-        // only question is whether or not they may be users.
-        switch (F->getIntrinsicID()) {
-        case Intrinsic::returnaddress: case Intrinsic::frameaddress:
-        case Intrinsic::stacksave: case Intrinsic::stackrestore:
-        case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend:
-        case Intrinsic::objectsize: case Intrinsic::prefetch:
-        case Intrinsic::stackprotector:
-        case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64:
-        case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa:
-        case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext:
-        case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline:
-        case Intrinsic::lifetime_start: case Intrinsic::lifetime_end:
-        case Intrinsic::invariant_start: case Intrinsic::invariant_end:
-        // Don't let dbg info affect our results.
-        case Intrinsic::dbg_declare: case Intrinsic::dbg_value:
-          // Short cut: Some intrinsics obviously don't use ObjC pointers.
-          return IC_None;
-        default:
-          break;
-        }
-      }
-      return GetCallSiteClass(CI);
-    }
-    case Instruction::Invoke:
-      return GetCallSiteClass(cast<InvokeInst>(I));
-    case Instruction::BitCast:
-    case Instruction::GetElementPtr:
-    case Instruction::Select: case Instruction::PHI:
-    case Instruction::Ret: case Instruction::Br:
-    case Instruction::Switch: case Instruction::IndirectBr:
-    case Instruction::Alloca: case Instruction::VAArg:
-    case Instruction::Add: case Instruction::FAdd:
-    case Instruction::Sub: case Instruction::FSub:
-    case Instruction::Mul: case Instruction::FMul:
-    case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv:
-    case Instruction::SRem: case Instruction::URem: case Instruction::FRem:
-    case Instruction::Shl: case Instruction::LShr: case Instruction::AShr:
-    case Instruction::And: case Instruction::Or: case Instruction::Xor:
-    case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc:
-    case Instruction::IntToPtr: case Instruction::FCmp:
-    case Instruction::FPTrunc: case Instruction::FPExt:
-    case Instruction::FPToUI: case Instruction::FPToSI:
-    case Instruction::UIToFP: case Instruction::SIToFP:
-    case Instruction::InsertElement: case Instruction::ExtractElement:
-    case Instruction::ShuffleVector:
-    case Instruction::ExtractValue:
-      break;
-    case Instruction::ICmp:
-      // Comparing a pointer with null, or any other constant, isn't an
-      // interesting use, because we don't care what the pointer points to, or
-      // about the values of any other dynamic reference-counted pointers.
-      if (IsPotentialRetainableObjPtr(I->getOperand(1)))
-        return IC_User;
-      break;
-    default:
-      // For anything else, check all the operands.
-      // Note that this includes both operands of a Store: while the first
-      // operand isn't actually being dereferenced, it is being stored to
-      // memory where we can no longer track who might read it and dereference
-      // it, so we have to consider it potentially used.
-      for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end();
-           OI != OE; ++OI)
-        if (IsPotentialRetainableObjPtr(*OI))
-          return IC_User;
-    }
-  }
-
-  // Otherwise, it's totally inert for ARC purposes.
-  return IC_None;
-}
diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
index 7820468..4b5f4d8 100644
--- a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -57,8 +57,8 @@ class ProvenanceAnalysis {
   bool relatedSelect(const SelectInst *A, const Value *B);
   bool relatedPHI(const PHINode *A, const Value *B);
 
-  void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
-  ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
+  void operator=(const ProvenanceAnalysis &) = delete;
+  ProvenanceAnalysis(const ProvenanceAnalysis &) = delete;
 
 public:
   ProvenanceAnalysis() {}
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index 3d91984..d6fc916 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -32,19 +32,18 @@ using namespace llvm;
 STATISTIC(NumRemoved, "Number of instructions removed");
 
 namespace {
-  struct ADCE : public FunctionPass {
-    static char ID; // Pass identification, replacement for typeid
-    ADCE() : FunctionPass(ID) {
-      initializeADCEPass(*PassRegistry::getPassRegistry());
-    }
-
-    bool runOnFunction(Function& F) override;
+struct ADCE : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  ADCE() : FunctionPass(ID) {
+    initializeADCEPass(*PassRegistry::getPassRegistry());
+  }
 
-    void getAnalysisUsage(AnalysisUsage& AU) const override {
-      AU.setPreservesCFG();
-    }
+  bool runOnFunction(Function& F) override;
 
-  };
+  void getAnalysisUsage(AnalysisUsage& AU) const override {
+    AU.setPreservesCFG();
+  }
+};
 }
 
 char ADCE::ID = 0;
@@ -54,46 +53,45 @@ bool ADCE::runOnFunction(Function& F) {
   if (skipOptnoneFunction(F))
     return false;
 
-  SmallPtrSet<Instruction*, 128> alive;
-  SmallVector<Instruction*, 128> worklist;
+  SmallPtrSet<Instruction*, 128> Alive;
+  SmallVector<Instruction*, 128> Worklist;
 
   // Collect the set of "root" instructions that are known live.
-  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
-    if (isa<TerminatorInst>(I.getInstructionIterator()) ||
-        isa<DbgInfoIntrinsic>(I.getInstructionIterator()) ||
-        isa<LandingPadInst>(I.getInstructionIterator()) ||
-        I->mayHaveSideEffects()) {
-      alive.insert(I.getInstructionIterator());
-      worklist.push_back(I.getInstructionIterator());
+  for (Instruction &I : inst_range(F)) {
+    if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) ||
+        isa<LandingPadInst>(I) || I.mayHaveSideEffects()) {
+      Alive.insert(&I);
+      Worklist.push_back(&I);
     }
+  }
 
   // Propagate liveness backwards to operands.
-  while (!worklist.empty()) {
-    Instruction* curr = worklist.pop_back_val();
-    for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end();
-         OI != OE; ++OI)
-      if (Instruction* Inst = dyn_cast<Instruction>(OI))
-        if (alive.insert(Inst).second)
-          worklist.push_back(Inst);
+  while (!Worklist.empty()) {
+    Instruction *Curr = Worklist.pop_back_val();
+    for (Use &OI : Curr->operands()) {
+      if (Instruction *Inst = dyn_cast<Instruction>(OI))
+        if (Alive.insert(Inst).second)
+          Worklist.push_back(Inst);
+    }
   }
 
   // The inverse of the live set is the dead set.  These are those instructions
   // which have no side effects and do not influence the control flow or return
   // value of the function, and may therefore be deleted safely.
-  // NOTE: We reuse the worklist vector here for memory efficiency.
-  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
-    if (!alive.count(I.getInstructionIterator())) {
-      worklist.push_back(I.getInstructionIterator());
-      I->dropAllReferences();
+  // NOTE: We reuse the Worklist vector here for memory efficiency.
+  for (Instruction &I : inst_range(F)) {
+    if (!Alive.count(&I)) {
+      Worklist.push_back(&I);
+      I.dropAllReferences();
     }
+  }
 
-  for (SmallVectorImpl<Instruction *>::iterator I = worklist.begin(),
-       E = worklist.end(); I != E; ++I) {
+  for (Instruction *&I : Worklist) {
     ++NumRemoved;
-    (*I)->eraseFromParent();
+    I->eraseFromParent();
   }
 
-  return !worklist.empty();
+  return !Worklist.empty();
 }
 
 FunctionPass *llvm::createAggressiveDCEPass() {
diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index 06c3dfd..5c74885 100644
--- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -21,7 +21,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -53,12 +53,12 @@ struct AlignmentFromAssumptions : public FunctionPass {
   bool runOnFunction(Function &F);
 
   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-    AU.addRequired<AssumptionTracker>();
+    AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<ScalarEvolution>();
     AU.addRequired<DominatorTreeWrapperPass>();
 
     AU.setPreservesCFG();
-    AU.addPreserved<LoopInfo>();
+    AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<ScalarEvolution>();
   }
@@ -69,7 +69,6 @@ struct AlignmentFromAssumptions : public FunctionPass {
   // another assumption later, then we may change the alignment at that point.
   DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments;
 
-  AssumptionTracker *AT;
   ScalarEvolution *SE;
   DominatorTree *DT;
   const DataLayout *DL;
@@ -84,7 +83,7 @@ char AlignmentFromAssumptions::ID = 0;
 static const char aip_name[] = "Alignment from assumptions";
 INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
                       aip_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
@@ -411,7 +410,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) {
 
 bool AlignmentFromAssumptions::runOnFunction(Function &F) {
   bool Changed = false;
-  AT = &getAnalysis<AssumptionTracker>();
+  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   SE = &getAnalysis<ScalarEvolution>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
@@ -420,8 +419,9 @@ bool AlignmentFromAssumptions::runOnFunction(Function &F) {
   NewDestAlignments.clear();
   NewSrcAlignments.clear();
 
-  for (auto &I : AT->assumptions(&F))
-    Changed |= processAssumption(I);
+  for (auto &AssumeVH : AC.assumptions())
+    if (AssumeVH)
+      Changed |= processAssumption(cast<CallInst>(AssumeVH));
 
   return Changed;
 }
diff --git a/lib/Transforms/Scalar/Android.mk b/lib/Transforms/Scalar/Android.mk
index 9028b42..ed803cd 100644
--- a/lib/Transforms/Scalar/Android.mk
+++ b/lib/Transforms/Scalar/Android.mk
@@ -2,6 +2,7 @@ LOCAL_PATH:= $(call my-dir)
 
 transforms_scalar_SRC_FILES := \
   ADCE.cpp \
+  BDCE.cpp \
   AlignmentFromAssumptions.cpp \
   ConstantProp.cpp \
   ConstantHoisting.cpp \
@@ -12,6 +13,7 @@ transforms_scalar_SRC_FILES := \
   FlattenCFGPass.cpp \
   GVN.cpp \
   IndVarSimplify.cpp \
+  InductiveRangeCheckElimination.cpp \
   JumpThreading.cpp \
   LICM.cpp \
   LoadCombine.cpp \
@@ -24,11 +26,14 @@ transforms_scalar_SRC_FILES := \
   LoopUnrollPass.cpp \
   LoopUnswitch.cpp \
   LowerAtomic.cpp \
+  LowerExpectIntrinsic.cpp \
   MemCpyOptimizer.cpp \
   MergedLoadStoreMotion.cpp \
   PartiallyInlineLibCalls.cpp \
+  PlaceSafepoints.cpp \
   Reassociate.cpp \
   Reg2Mem.cpp \
+  RewriteStatepointsForGC.cpp \
   SCCP.cpp \
   SROA.cpp \
   SampleProfile.cpp \
@@ -38,6 +43,7 @@ transforms_scalar_SRC_FILES := \
   SeparateConstOffsetFromGEP.cpp \
   SimplifyCFGPass.cpp \
   Sink.cpp \
+  StraightLineStrengthReduce.cpp \
   StructurizeCFG.cpp \
   TailRecursionElimination.cpp
 
diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp
new file mode 100644
index 0000000..c7bd79d
--- /dev/null
+++ b/lib/Transforms/Scalar/BDCE.cpp
@@ -0,0 +1,411 @@
+//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Bit-Tracking Dead Code Elimination pass. Some
+// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
+// We track these dead bits and remove instructions that compute only these
+// dead bits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bdce"
+
+STATISTIC(NumRemoved, "Number of instructions removed (unused)");
+STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+
+namespace {
+struct BDCE : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  BDCE() : FunctionPass(ID) {
+    initializeBDCEPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function& F) override;
+
+  void getAnalysisUsage(AnalysisUsage& AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+  }
+
+  void determineLiveOperandBits(const Instruction *UserI,
+                                const Instruction *I, unsigned OperandNo,
+                                const APInt &AOut, APInt &AB,
+                                APInt &KnownZero, APInt &KnownOne,
+                                APInt &KnownZero2, APInt &KnownOne2);
+
+  AssumptionCache *AC;
+  const DataLayout *DL;
+  DominatorTree *DT;
+};
+}
+
+char BDCE::ID = 0;
+INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
+                    false, false)
+
+static bool isAlwaysLive(Instruction *I) {
+  return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) ||
+         isa<LandingPadInst>(I) || I->mayHaveSideEffects();
+}
+
+void BDCE::determineLiveOperandBits(const Instruction *UserI,
+                                    const Instruction *I, unsigned OperandNo,
+                                    const APInt &AOut, APInt &AB,
+                                    APInt &KnownZero, APInt &KnownOne,
+                                    APInt &KnownZero2, APInt &KnownOne2) {
+  unsigned BitWidth = AB.getBitWidth();
+
+  // We're called once per operand, but for some instructions, we need to
+  // compute known bits of both operands in order to determine the live bits of
+  // either (when both operands are instructions themselves). We don't,
+  // however, want to do this twice, so we cache the result in APInts that live
+  // in the caller. For the two-relevant-operands case, both operand values are
+  // provided here.
+  auto ComputeKnownBits = [&](unsigned BitWidth, const Value *V1,
+                              const Value *V2) {
+    KnownZero = APInt(BitWidth, 0);
+    KnownOne =  APInt(BitWidth, 0);
+    computeKnownBits(const_cast<Value*>(V1), KnownZero, KnownOne, DL, 0, AC,
+                     UserI, DT);
+
+    if (V2) {
+      KnownZero2 = APInt(BitWidth, 0);
+      KnownOne2 =  APInt(BitWidth, 0);
+      computeKnownBits(const_cast<Value*>(V2), KnownZero2, KnownOne2, DL, 0, AC,
+                       UserI, DT);
+    }
+  };
+
+  switch (UserI->getOpcode()) {
+  default: break;
+  case Instruction::Call:
+  case Instruction::Invoke:
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI))
+      switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::bswap:
+        // The alive bits of the input are the swapped alive bits of
+        // the output.
+        AB = AOut.byteSwap();
+        break;
+      case Intrinsic::ctlz:
+        if (OperandNo == 0) {
+          // We need some output bits, so we need all bits of the
+          // input to the left of, and including, the leftmost bit
+          // known to be one.
+          ComputeKnownBits(BitWidth, I, nullptr);
+          AB = APInt::getHighBitsSet(BitWidth,
+                 std::min(BitWidth, KnownOne.countLeadingZeros()+1));
+        }
+        break;
+      case Intrinsic::cttz:
+        if (OperandNo == 0) {
+          // We need some output bits, so we need all bits of the
+          // input to the right of, and including, the rightmost bit
+          // known to be one.
+          ComputeKnownBits(BitWidth, I, nullptr);
+          AB = APInt::getLowBitsSet(BitWidth,
+                 std::min(BitWidth, KnownOne.countTrailingZeros()+1));
+        }
+        break;
+      }
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+    // Find the highest live output bit. We don't need any more input
+    // bits than that (adds, and thus subtracts, ripple only to the
+    // left).
+    AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits());
+    break;
+  case Instruction::Shl:
+    if (OperandNo == 0)
+      if (ConstantInt *CI =
+            dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+        uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+        AB = AOut.lshr(ShiftAmt);
+
+        // If the shift is nuw/nsw, then the high bits are not dead
+        // (because we've promised that they *must* be zero).
+        const ShlOperator *S = cast<ShlOperator>(UserI);
+        if (S->hasNoSignedWrap())
+          AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1);
+        else if (S->hasNoUnsignedWrap())
+          AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
+      }
+    break;
+  case Instruction::LShr:
+    if (OperandNo == 0)
+      if (ConstantInt *CI =
+            dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+        uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+        AB = AOut.shl(ShiftAmt);
+
+        // If the shift is exact, then the low bits are not dead
+        // (they must be zero).
+        if (cast<LShrOperator>(UserI)->isExact())
+          AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+      }
+    break;
+  case Instruction::AShr:
+    if (OperandNo == 0)
+      if (ConstantInt *CI =
+            dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+        uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+        AB = AOut.shl(ShiftAmt);
+        // Because the high input bit is replicated into the
+        // high-order bits of the result, if we need any of those
+        // bits, then we must keep the highest input bit.
+        if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt))
+            .getBoolValue())
+          AB.setBit(BitWidth-1);
+
+        // If the shift is exact, then the low bits are not dead
+        // (they must be zero).
+        if (cast<AShrOperator>(UserI)->isExact())
+          AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+      }
+    break;
+  case Instruction::And:
+    AB = AOut;
+
+    // For bits that are known zero, the corresponding bits in the
+    // other operand are dead (unless they're both zero, in which
+    // case they can't both be dead, so just mark the LHS bits as
+    // dead).
+    if (OperandNo == 0) {
+      ComputeKnownBits(BitWidth, I, UserI->getOperand(1));
+      AB &= ~KnownZero2;
+    } else {
+      if (!isa<Instruction>(UserI->getOperand(0)))
+        ComputeKnownBits(BitWidth, UserI->getOperand(0), I);
+      AB &= ~(KnownZero & ~KnownZero2);
+    }
+    break;
+  case Instruction::Or:
+    AB = AOut;
+
+    // For bits that are known one, the corresponding bits in the
+    // other operand are dead (unless they're both one, in which
+    // case they can't both be dead, so just mark the LHS bits as
+    // dead).
+    if (OperandNo == 0) {
+      ComputeKnownBits(BitWidth, I, UserI->getOperand(1));
+      AB &= ~KnownOne2;
+    } else {
+      if (!isa<Instruction>(UserI->getOperand(0)))
+        ComputeKnownBits(BitWidth, UserI->getOperand(0), I);
+      AB &= ~(KnownOne & ~KnownOne2);
+    }
+    break;
+  case Instruction::Xor:
+  case Instruction::PHI:
+    AB = AOut;
+    break;
+  case Instruction::Trunc:
+    AB = AOut.zext(BitWidth);
+    break;
+  case Instruction::ZExt:
+    AB = AOut.trunc(BitWidth);
+    break;
+  case Instruction::SExt:
+    AB = AOut.trunc(BitWidth);
+    // Because the high input bit is replicated into the
+    // high-order bits of the result, if we need any of those
+    // bits, then we must keep the highest input bit.
+    if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(),
+                                      AOut.getBitWidth() - BitWidth))
+        .getBoolValue())
+      AB.setBit(BitWidth-1);
+    break;
+  case Instruction::Select:
+    if (OperandNo != 0)
+      AB = AOut;
+    break;
+  }
+}
+
+bool BDCE::runOnFunction(Function& F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  DL = F.getParent()->getDataLayout();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  DenseMap<Instruction *, APInt> AliveBits;
+  SmallVector<Instruction*, 128> Worklist;
+
+  // The set of visited instructions (non-integer-typed only).
+  SmallPtrSet<Instruction*, 128> Visited;
+
+  // Collect the set of "root" instructions that are known live.
+  for (Instruction &I : inst_range(F)) {
+    if (!isAlwaysLive(&I))
+      continue;
+
+    DEBUG(dbgs() << "BDCE: Root: " << I << "\n");
+    // For integer-valued instructions, set up an initial empty set of alive
+    // bits and add the instruction to the work list. For other instructions
+    // add their operands to the work list (for integer values operands, mark
+    // all bits as live).
+    if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
+      if (!AliveBits.count(&I)) {
+        AliveBits[&I] = APInt(IT->getBitWidth(), 0);
+        Worklist.push_back(&I);
+      }
+
+      continue;
+    }
+
+    // Non-integer-typed instructions...
+    for (Use &OI : I.operands()) {
+      if (Instruction *J = dyn_cast<Instruction>(OI)) {
+        if (IntegerType *IT = dyn_cast<IntegerType>(J->getType()))
+          AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth());
+        Worklist.push_back(J);
+      }
+    }
+    // To save memory, we don't add I to the Visited set here. Instead, we
+    // check isAlwaysLive on every instruction when searching for dead
+    // instructions later (we need to check isAlwaysLive for the
+    // integer-typed instructions anyway).
+  }
+
+  // Propagate liveness backwards to operands.
+  while (!Worklist.empty()) {
+    Instruction *UserI = Worklist.pop_back_val();
+
+    DEBUG(dbgs() << "BDCE: Visiting: " << *UserI);
+    APInt AOut;
+    if (UserI->getType()->isIntegerTy()) {
+      AOut = AliveBits[UserI];
+      DEBUG(dbgs() << " Alive Out: " << AOut);
+    }
+    DEBUG(dbgs() << "\n");
+
+    if (!UserI->getType()->isIntegerTy())
+      Visited.insert(UserI);
+
+    APInt KnownZero, KnownOne, KnownZero2, KnownOne2;
+    // Compute the set of alive bits for each operand. These are anded into the
+    // existing set, if any, and if that changes the set of alive bits, the
+    // operand is added to the work-list.
+    for (Use &OI : UserI->operands()) {
+      if (Instruction *I = dyn_cast<Instruction>(OI)) {
+        if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) {
+          unsigned BitWidth = IT->getBitWidth();
+          APInt AB = APInt::getAllOnesValue(BitWidth);
+          if (UserI->getType()->isIntegerTy() && !AOut &&
+              !isAlwaysLive(UserI)) {
+            AB = APInt(BitWidth, 0);
+          } else {
+            // If all bits of the output are dead, then all bits of the input 
+            // Bits of each operand that are used to compute alive bits of the
+            // output are alive, all others are dead.
+            determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB,
+                                     KnownZero, KnownOne,
+                                     KnownZero2, KnownOne2);
+          }
+
+          // If we've added to the set of alive bits (or the operand has not
+          // been previously visited), then re-queue the operand to be visited
+          // again.
+          APInt ABPrev(BitWidth, 0);
+          auto ABI = AliveBits.find(I);
+          if (ABI != AliveBits.end())
+            ABPrev = ABI->second;
+
+          APInt ABNew = AB | ABPrev;
+          if (ABNew != ABPrev || ABI == AliveBits.end()) {
+            AliveBits[I] = std::move(ABNew);
+            Worklist.push_back(I);
+          }
+        } else if (!Visited.count(I)) {
+          Worklist.push_back(I);
+        }
+      }
+    }
+  }
+
+  bool Changed = false;
+  // The inverse of the live set is the dead set.  These are those instructions
+  // which have no side effects and do not influence the control flow or return
+  // value of the function, and may therefore be deleted safely.
+  // NOTE: We reuse the Worklist vector here for memory efficiency.
+  for (Instruction &I : inst_range(F)) {
+    // For live instructions that have all dead bits, first make them dead by
+    // replacing all uses with something else. Then, if they don't need to
+    // remain live (because they have side effects, etc.) we can remove them.
+    if (I.getType()->isIntegerTy()) {
+      auto ABI = AliveBits.find(&I);
+      if (ABI != AliveBits.end()) {
+        if (ABI->second.getBoolValue())
+          continue;
+
+        DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+        // FIXME: In theory we could substitute undef here instead of zero.
+        // This should be reconsidered once we settle on the semantics of
+        // undef, poison, etc.
+        Value *Zero = ConstantInt::get(I.getType(), 0);
+        ++NumSimplified;
+        I.replaceAllUsesWith(Zero);
+        Changed = true;
+      }
+    } else if (Visited.count(&I)) {
+      continue;
+    }
+
+    if (isAlwaysLive(&I))
+      continue;
+
+    Worklist.push_back(&I);
+    I.dropAllReferences();
+    Changed = true;
+  }
+
+  for (Instruction *&I : Worklist) {
+    ++NumRemoved;
+    I->eraseFromParent();
+  }
+
+  return Changed;
+}
+
+FunctionPass *llvm::createBitTrackingDCEPass() {
+  return new BDCE();
+}
+
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index b3ee11e..d297eb1 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_library(LLVMScalarOpts
   ADCE.cpp
   AlignmentFromAssumptions.cpp
+  BDCE.cpp
   ConstantHoisting.cpp
   ConstantProp.cpp
   CorrelatedValuePropagation.cpp
@@ -9,6 +10,7 @@ add_llvm_library(LLVMScalarOpts
   EarlyCSE.cpp
   FlattenCFGPass.cpp
   GVN.cpp
+  InductiveRangeCheckElimination.cpp
   IndVarSimplify.cpp
   JumpThreading.cpp
   LICM.cpp
@@ -22,11 +24,14 @@ add_llvm_library(LLVMScalarOpts
   LoopUnrollPass.cpp
   LoopUnswitch.cpp
   LowerAtomic.cpp
+  LowerExpectIntrinsic.cpp
   MemCpyOptimizer.cpp
   MergedLoadStoreMotion.cpp
   PartiallyInlineLibCalls.cpp
+  PlaceSafepoints.cpp
   Reassociate.cpp
   Reg2Mem.cpp
+  RewriteStatepointsForGC.cpp
   SCCP.cpp
   SROA.cpp
   SampleProfile.cpp
@@ -36,8 +41,13 @@ add_llvm_library(LLVMScalarOpts
   SeparateConstOffsetFromGEP.cpp
   SimplifyCFGPass.cpp
   Sink.cpp
+  StraightLineStrengthReduce.cpp
   StructurizeCFG.cpp
   TailRecursionElimination.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Scalar
   )
 
 add_dependencies(LLVMScalarOpts intrinsics_gen)
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index 27c177a..e3aab4b 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -131,14 +131,14 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<TargetTransformInfo>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
   }
 
 private:
   /// \brief Initialize the pass.
   void setup(Function &Fn) {
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    TTI = &getAnalysis<TargetTransformInfo>();
+    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
     Entry = &Fn.getEntryBlock();
   }
 
@@ -176,7 +176,7 @@ char ConstantHoisting::ID = 0;
 INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting",
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting",
                     false, false)
 
@@ -186,6 +186,9 @@ FunctionPass *llvm::createConstantHoistingPass() {
 
 /// \brief Perform the constant hoisting optimization for the given function.
 bool ConstantHoisting::runOnFunction(Function &Fn) {
+  if (skipOptnoneFunction(Fn))
+    return false;
+
   DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
   DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
 
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
index dd51ce1..29d4e05 100644
--- a/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -26,7 +26,7 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/Pass.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include <set>
 using namespace llvm;
 
@@ -45,7 +45,7 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
   };
 }
@@ -53,7 +53,7 @@ namespace {
 char ConstantPropagation::ID = 0;
 INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop",
                 "Simple constant propagation", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(ConstantPropagation, "constprop",
                 "Simple constant propagation", false, false)
 
@@ -70,7 +70,8 @@ bool ConstantPropagation::runOnFunction(Function &F) {
   bool Changed = false;
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   while (!WorkList.empty()) {
     Instruction *I = *WorkList.begin();
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index 99fac75..3b262a2 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -21,7 +21,7 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/Pass.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
@@ -42,7 +42,8 @@ namespace {
     bool runOnBasicBlock(BasicBlock &BB) override {
       if (skipOptnoneFunction(BB))
         return false;
-      TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+      auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+      TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
       bool Changed = false;
       for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
         Instruction *Inst = DI++;
@@ -95,7 +96,8 @@ bool DCE::runOnFunction(Function &F) {
   if (skipOptnoneFunction(F))
     return false;
 
-  TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
 
   // Start out with all of the instructions in the worklist...
   std::vector<Instruction*> WorkList;
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index a1ddc00..c2ce1d5 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -33,7 +33,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index cd2ecad..9309623 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -12,12 +12,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/ScopedHashTable.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
@@ -26,7 +27,8 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/RecyclingAllocator.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <deque>
 using namespace llvm;
@@ -40,49 +42,44 @@ STATISTIC(NumCSELoad,  "Number of load instructions CSE'd");
 STATISTIC(NumCSECall,  "Number of call instructions CSE'd");
 STATISTIC(NumDSE,      "Number of trivial dead stores removed");
 
-static unsigned getHash(const void *V) {
-  return DenseMapInfo<const void*>::getHashValue(V);
-}
-
 //===----------------------------------------------------------------------===//
 // SimpleValue
 //===----------------------------------------------------------------------===//
 
 namespace {
-  /// SimpleValue - Instances of this struct represent available values in the
-  /// scoped hash table.
-  struct SimpleValue {
-    Instruction *Inst;
+/// \brief Struct representing the available values in the scoped hash table.
+struct SimpleValue {
+  Instruction *Inst;
 
-    SimpleValue(Instruction *I) : Inst(I) {
-      assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
-    }
+  SimpleValue(Instruction *I) : Inst(I) {
+    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+  }
 
-    bool isSentinel() const {
-      return Inst == DenseMapInfo<Instruction*>::getEmptyKey() ||
-             Inst == DenseMapInfo<Instruction*>::getTombstoneKey();
-    }
+  bool isSentinel() const {
+    return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+           Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
 
-    static bool canHandle(Instruction *Inst) {
-      // This can only handle non-void readnone functions.
-      if (CallInst *CI = dyn_cast<CallInst>(Inst))
-        return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
-      return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) ||
-             isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) ||
-             isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
-             isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
-             isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
-    }
-  };
+  static bool canHandle(Instruction *Inst) {
+    // This can only handle non-void readnone functions.
+    if (CallInst *CI = dyn_cast<CallInst>(Inst))
+      return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
+    return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) ||
+           isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) ||
+           isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+           isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+           isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
+  }
+};
 }
 
 namespace llvm {
-template<> struct DenseMapInfo<SimpleValue> {
+template <> struct DenseMapInfo<SimpleValue> {
   static inline SimpleValue getEmptyKey() {
-    return DenseMapInfo<Instruction*>::getEmptyKey();
+    return DenseMapInfo<Instruction *>::getEmptyKey();
   }
   static inline SimpleValue getTombstoneKey() {
-    return DenseMapInfo<Instruction*>::getTombstoneKey();
+    return DenseMapInfo<Instruction *>::getTombstoneKey();
   }
   static unsigned getHashValue(SimpleValue Val);
   static bool isEqual(SimpleValue LHS, SimpleValue RHS);
@@ -92,7 +89,7 @@ template<> struct DenseMapInfo<SimpleValue> {
 unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
   Instruction *Inst = Val.Inst;
   // Hash in all of the operands as pointers.
-  if (BinaryOperator* BinOp = dyn_cast<BinaryOperator>(Inst)) {
+  if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) {
     Value *LHS = BinOp->getOperand(0);
     Value *RHS = BinOp->getOperand(1);
     if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
@@ -101,8 +98,9 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
     if (isa<OverflowingBinaryOperator>(BinOp)) {
       // Hash the overflow behavior
       unsigned Overflow =
-        BinOp->hasNoSignedWrap()   * OverflowingBinaryOperator::NoSignedWrap |
-        BinOp->hasNoUnsignedWrap() * OverflowingBinaryOperator::NoUnsignedWrap;
+          BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap |
+          BinOp->hasNoUnsignedWrap() *
+              OverflowingBinaryOperator::NoUnsignedWrap;
       return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS);
     }
 
@@ -135,12 +133,13 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
   assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) ||
           isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
           isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
-          isa<ShuffleVectorInst>(Inst)) && "Invalid/unknown instruction");
+          isa<ShuffleVectorInst>(Inst)) &&
+         "Invalid/unknown instruction");
 
   // Mix in the opcode.
-  return hash_combine(Inst->getOpcode(),
-                      hash_combine_range(Inst->value_op_begin(),
-                                         Inst->value_op_end()));
+  return hash_combine(
+      Inst->getOpcode(),
+      hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
 }
 
 bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
@@ -149,22 +148,24 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
   if (LHS.isSentinel() || RHS.isSentinel())
     return LHSI == RHSI;
 
-  if (LHSI->getOpcode() != RHSI->getOpcode()) return false;
-  if (LHSI->isIdenticalTo(RHSI)) return true;
+  if (LHSI->getOpcode() != RHSI->getOpcode())
+    return false;
+  if (LHSI->isIdenticalTo(RHSI))
+    return true;
 
   // If we're not strictly identical, we still might be a commutable instruction
   if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
     if (!LHSBinOp->isCommutative())
       return false;
 
-    assert(isa<BinaryOperator>(RHSI)
-           && "same opcode, but different instruction type?");
+    assert(isa<BinaryOperator>(RHSI) &&
+           "same opcode, but different instruction type?");
     BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
 
     // Check overflow attributes
     if (isa<OverflowingBinaryOperator>(LHSBinOp)) {
-      assert(isa<OverflowingBinaryOperator>(RHSBinOp)
-             && "same opcode, but different operator type?");
+      assert(isa<OverflowingBinaryOperator>(RHSBinOp) &&
+             "same opcode, but different operator type?");
       if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() ||
           LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap())
         return false;
@@ -172,16 +173,16 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
 
     // Commuted equality
     return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
-      LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
+           LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
   }
   if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
-    assert(isa<CmpInst>(RHSI)
-           && "same opcode, but different instruction type?");
+    assert(isa<CmpInst>(RHSI) &&
+           "same opcode, but different instruction type?");
     CmpInst *RHSCmp = cast<CmpInst>(RHSI);
     // Commuted equality
     return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
-      LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
-      LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
+           LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
+           LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
   }
 
   return false;
@@ -192,57 +193,52 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
 //===----------------------------------------------------------------------===//
 
 namespace {
-  /// CallValue - Instances of this struct represent available call values in
-  /// the scoped hash table.
-  struct CallValue {
-    Instruction *Inst;
+/// \brief Struct representing the available call values in the scoped hash
+/// table.
+struct CallValue {
+  Instruction *Inst;
 
-    CallValue(Instruction *I) : Inst(I) {
-      assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
-    }
+  CallValue(Instruction *I) : Inst(I) {
+    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+  }
 
-    bool isSentinel() const {
-      return Inst == DenseMapInfo<Instruction*>::getEmptyKey() ||
-             Inst == DenseMapInfo<Instruction*>::getTombstoneKey();
-    }
+  bool isSentinel() const {
+    return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+           Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
 
-    static bool canHandle(Instruction *Inst) {
-      // Don't value number anything that returns void.
-      if (Inst->getType()->isVoidTy())
-        return false;
+  static bool canHandle(Instruction *Inst) {
+    // Don't value number anything that returns void.
+    if (Inst->getType()->isVoidTy())
+      return false;
 
-      CallInst *CI = dyn_cast<CallInst>(Inst);
-      if (!CI || !CI->onlyReadsMemory())
-        return false;
-      return true;
-    }
-  };
+    CallInst *CI = dyn_cast<CallInst>(Inst);
+    if (!CI || !CI->onlyReadsMemory())
+      return false;
+    return true;
+  }
+};
 }
 
 namespace llvm {
-  template<> struct DenseMapInfo<CallValue> {
-    static inline CallValue getEmptyKey() {
-      return DenseMapInfo<Instruction*>::getEmptyKey();
-    }
-    static inline CallValue getTombstoneKey() {
-      return DenseMapInfo<Instruction*>::getTombstoneKey();
-    }
-    static unsigned getHashValue(CallValue Val);
-    static bool isEqual(CallValue LHS, CallValue RHS);
-  };
+template <> struct DenseMapInfo<CallValue> {
+  static inline CallValue getEmptyKey() {
+    return DenseMapInfo<Instruction *>::getEmptyKey();
+  }
+  static inline CallValue getTombstoneKey() {
+    return DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+  static unsigned getHashValue(CallValue Val);
+  static bool isEqual(CallValue LHS, CallValue RHS);
+};
 }
+
 unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
   Instruction *Inst = Val.Inst;
-  // Hash in all of the operands as pointers.
-  unsigned Res = 0;
-  for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) {
-    assert(!Inst->getOperand(i)->getType()->isMetadataTy() &&
-           "Cannot value number calls with metadata operands");
-    Res ^= getHash(Inst->getOperand(i)) << (i & 0xF);
-  }
-
-  // Mix in the opcode.
-  return (Res << 1) ^ Inst->getOpcode();
+  // Hash all of the operands as pointers and mix in the opcode.
+  return hash_combine(
+      Inst->getOpcode(),
+      hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
 }
 
 bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
@@ -252,103 +248,106 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
   return LHSI->isIdenticalTo(RHSI);
 }
 
-
 //===----------------------------------------------------------------------===//
-// EarlyCSE pass.
+// EarlyCSE implementation
 //===----------------------------------------------------------------------===//
 
 namespace {
-
-/// EarlyCSE - This pass does a simple depth-first walk over the dominator
-/// tree, eliminating trivially redundant instructions and using instsimplify
-/// to canonicalize things as it goes.  It is intended to be fast and catch
-/// obvious cases so that instcombine and other passes are more effective.  It
-/// is expected that a later pass of GVN will catch the interesting/hard
-/// cases.
-class EarlyCSE : public FunctionPass {
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSE {
 public:
+  Function &F;
   const DataLayout *DL;
-  const TargetLibraryInfo *TLI;
-  DominatorTree *DT;
-  AssumptionTracker *AT;
-  typedef RecyclingAllocator<BumpPtrAllocator,
-                      ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy;
-  typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>,
+  const TargetLibraryInfo &TLI;
+  const TargetTransformInfo &TTI;
+  DominatorTree &DT;
+  AssumptionCache &AC;
+  typedef RecyclingAllocator<
+      BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
+  typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
                           AllocatorTy> ScopedHTType;
 
-  /// AvailableValues - This scoped hash table contains the current values of
-  /// all of our simple scalar expressions.  As we walk down the domtree, we
-  /// look to see if instructions are in this: if so, we replace them with what
-  /// we find, otherwise we insert them so that dominated values can succeed in
-  /// their lookup.
-  ScopedHTType *AvailableValues;
-
-  /// AvailableLoads - This scoped hash table contains the current values
-  /// of loads.  This allows us to get efficient access to dominating loads when
-  /// we have a fully redundant load.  In addition to the most recent load, we
-  /// keep track of a generation count of the read, which is compared against
-  /// the current generation count.  The current generation count is
-  /// incremented after every possibly writing memory operation, which ensures
-  /// that we only CSE loads with other loads that have no intervening store.
-  typedef RecyclingAllocator<BumpPtrAllocator,
-    ScopedHashTableVal<Value*, std::pair<Value*, unsigned> > > LoadMapAllocator;
-  typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>,
-                          DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType;
-  LoadHTType *AvailableLoads;
-
-  /// AvailableCalls - This scoped hash table contains the current values
-  /// of read-only call values.  It uses the same generation count as loads.
-  typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType;
-  CallHTType *AvailableCalls;
-
-  /// CurrentGeneration - This is the current generation of the memory value.
+  /// \brief A scoped hash table of the current values of all of our simple
+  /// scalar expressions.
+  ///
+  /// As we walk down the domtree, we look to see if instructions are in this:
+  /// if so, we replace them with what we find, otherwise we insert them so
+  /// that dominated values can succeed in their lookup.
+  ScopedHTType AvailableValues;
+
+  /// \brief A scoped hash table of the current values of loads.
+  ///
+  /// This allows us to get efficient access to dominating loads when we have
+  /// a fully redundant load.  In addition to the most recent load, we keep
+  /// track of a generation count of the read, which is compared against the
+  /// current generation count.  The current generation count is incremented
+  /// after every possibly writing memory operation, which ensures that we only
+  /// CSE loads with other loads that have no intervening store.
+  typedef RecyclingAllocator<
+      BumpPtrAllocator,
+      ScopedHashTableVal<Value *, std::pair<Value *, unsigned>>>
+      LoadMapAllocator;
+  typedef ScopedHashTable<Value *, std::pair<Value *, unsigned>,
+                          DenseMapInfo<Value *>, LoadMapAllocator> LoadHTType;
+  LoadHTType AvailableLoads;
+
+  /// \brief A scoped hash table of the current values of read-only call
+  /// values.
+  ///
+  /// It uses the same generation count as loads.
+  typedef ScopedHashTable<CallValue, std::pair<Value *, unsigned>> CallHTType;
+  CallHTType AvailableCalls;
+
+  /// \brief This is the current generation of the memory value.
   unsigned CurrentGeneration;
 
-  static char ID;
-  explicit EarlyCSE() : FunctionPass(ID) {
-    initializeEarlyCSEPass(*PassRegistry::getPassRegistry());
+  /// \brief Set up the EarlyCSE runner for a particular function.
+  EarlyCSE(Function &F, const DataLayout *DL, const TargetLibraryInfo &TLI,
+           const TargetTransformInfo &TTI, DominatorTree &DT,
+           AssumptionCache &AC)
+      : F(F), DL(DL), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {
   }
 
-  bool runOnFunction(Function &F) override;
+  bool run();
 
 private:
-
-  // NodeScope - almost a POD, but needs to call the constructors for the
-  // scoped hash tables so that a new scope gets pushed on. These are RAII so
-  // that the scope gets popped when the NodeScope is destroyed.
+  // Almost a POD, but needs to call the constructors for the scoped hash
+  // tables so that a new scope gets pushed on. These are RAII so that the
+  // scope gets popped when the NodeScope is destroyed.
   class NodeScope {
-   public:
-    NodeScope(ScopedHTType *availableValues,
-              LoadHTType *availableLoads,
-              CallHTType *availableCalls) :
-        Scope(*availableValues),
-        LoadScope(*availableLoads),
-        CallScope(*availableCalls) {}
-
-   private:
-    NodeScope(const NodeScope&) LLVM_DELETED_FUNCTION;
-    void operator=(const NodeScope&) LLVM_DELETED_FUNCTION;
+  public:
+    NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+              CallHTType &AvailableCalls)
+        : Scope(AvailableValues), LoadScope(AvailableLoads),
+          CallScope(AvailableCalls) {}
+
+  private:
+    NodeScope(const NodeScope &) = delete;
+    void operator=(const NodeScope &) = delete;
 
     ScopedHTType::ScopeTy Scope;
     LoadHTType::ScopeTy LoadScope;
     CallHTType::ScopeTy CallScope;
   };
 
-  // StackNode - contains all the needed information to create a stack for
-  // doing a depth first tranversal of the tree. This includes scopes for
-  // values, loads, and calls as well as the generation. There is a child
-  // iterator so that the children do not need to be store spearately.
+  // Contains all the needed information to create a stack for doing a depth
+  // first tranversal of the tree. This includes scopes for values, loads, and
+  // calls as well as the generation. There is a child iterator so that the
+  // children do not need to be store spearately.
   class StackNode {
-   public:
-    StackNode(ScopedHTType *availableValues,
-              LoadHTType *availableLoads,
-              CallHTType *availableCalls,
-              unsigned cg, DomTreeNode *n,
-              DomTreeNode::iterator child, DomTreeNode::iterator end) :
-        CurrentGeneration(cg), ChildGeneration(cg), Node(n),
-        ChildIter(child), EndIter(end),
-        Scopes(availableValues, availableLoads, availableCalls),
-        Processed(false) {}
+  public:
+    StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+              CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
+              DomTreeNode::iterator child, DomTreeNode::iterator end)
+        : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
+          EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls),
+          Processed(false) {}
 
     // Accessors.
     unsigned currentGeneration() { return CurrentGeneration; }
@@ -365,9 +364,9 @@ private:
     bool isProcessed() { return Processed; }
     void process() { Processed = true; }
 
-   private:
-    StackNode(const StackNode&) LLVM_DELETED_FUNCTION;
-    void operator=(const StackNode&) LLVM_DELETED_FUNCTION;
+  private:
+    StackNode(const StackNode &) = delete;
+    void operator=(const StackNode &) = delete;
 
     // Members.
     unsigned CurrentGeneration;
@@ -379,31 +378,78 @@ private:
     bool Processed;
   };
 
+  /// \brief Wrapper class to handle memory instructions, including loads,
+  /// stores and intrinsic loads and stores defined by the target.
+  class ParseMemoryInst {
+  public:
+    ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
+        : Load(false), Store(false), Vol(false), MayReadFromMemory(false),
+          MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) {
+      MayReadFromMemory = Inst->mayReadFromMemory();
+      MayWriteToMemory = Inst->mayWriteToMemory();
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+        MemIntrinsicInfo Info;
+        if (!TTI.getTgtMemIntrinsic(II, Info))
+          return;
+        if (Info.NumMemRefs == 1) {
+          Store = Info.WriteMem;
+          Load = Info.ReadMem;
+          MatchingId = Info.MatchingId;
+          MayReadFromMemory = Info.ReadMem;
+          MayWriteToMemory = Info.WriteMem;
+          Vol = Info.Vol;
+          Ptr = Info.PtrVal;
+        }
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        Load = true;
+        Vol = !LI->isSimple();
+        Ptr = LI->getPointerOperand();
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        Store = true;
+        Vol = !SI->isSimple();
+        Ptr = SI->getPointerOperand();
+      }
+    }
+    bool isLoad() { return Load; }
+    bool isStore() { return Store; }
+    bool isVolatile() { return Vol; }
+    bool isMatchingMemLoc(const ParseMemoryInst &Inst) {
+      return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId;
+    }
+    bool isValid() { return Ptr != nullptr; }
+    int getMatchingId() { return MatchingId; }
+    Value *getPtr() { return Ptr; }
+    bool mayReadFromMemory() { return MayReadFromMemory; }
+    bool mayWriteToMemory() { return MayWriteToMemory; }
+
+  private:
+    bool Load;
+    bool Store;
+    bool Vol;
+    bool MayReadFromMemory;
+    bool MayWriteToMemory;
+    // For regular (non-intrinsic) loads/stores, this is set to -1. For
+    // intrinsic loads/stores, the id is retrieved from the corresponding
+    // field in the MemIntrinsicInfo structure.  That field contains
+    // non-negative values only.
+    int MatchingId;
+    Value *Ptr;
+  };
+
   bool processNode(DomTreeNode *Node);
 
-  // This transformation requires dominator postdominator info
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<AssumptionTracker>();
-    AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<TargetLibraryInfo>();
-    AU.setPreservesCFG();
+  Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+      return LI;
+    else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+      return SI->getValueOperand();
+    assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
+    return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
+                                                 ExpectedType);
   }
 };
 }
 
-char EarlyCSE::ID = 0;
-
-// createEarlyCSEPass - The public interface to this file.
-FunctionPass *llvm::createEarlyCSEPass() {
-  return new EarlyCSE();
-}
-
-INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
-INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false)
-
 bool EarlyCSE::processNode(DomTreeNode *Node) {
   BasicBlock *BB = Node->getBlock();
 
@@ -420,17 +466,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
   /// as long as there in no instruction that reads memory.  If we see a store
   /// to the same location, we delete the dead store.  This zaps trivial dead
   /// stores which can occur in bitfield code among other things.
-  StoreInst *LastStore = nullptr;
+  Instruction *LastStore = nullptr;
 
   bool Changed = false;
 
   // See if any instructions in the block can be eliminated.  If so, do it.  If
   // not, add them to AvailableValues.
-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
     Instruction *Inst = I++;
 
     // Dead instructions should just be removed.
-    if (isInstructionTriviallyDead(Inst, TLI)) {
+    if (isInstructionTriviallyDead(Inst, &TLI)) {
       DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
       Inst->eraseFromParent();
       Changed = true;
@@ -449,7 +495,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
 
     // If the instruction can be simplified (e.g. X+0 = X) then replace it with
     // its simpler value.
-    if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT, AT)) {
+    if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) {
       DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << "  to: " << *V << '\n');
       Inst->replaceAllUsesWith(V);
       Inst->eraseFromParent();
@@ -461,7 +507,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
     // If this is a simple instruction that we can value number, process it.
     if (SimpleValue::canHandle(Inst)) {
       // See if the instruction has an available value.  If so, use it.
-      if (Value *V = AvailableValues->lookup(Inst)) {
+      if (Value *V = AvailableValues.lookup(Inst)) {
         DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << "  to: " << *V << '\n');
         Inst->replaceAllUsesWith(V);
         Inst->eraseFromParent();
@@ -471,52 +517,66 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
       }
 
       // Otherwise, just remember that this value is available.
-      AvailableValues->insert(Inst, Inst);
+      AvailableValues.insert(Inst, Inst);
       continue;
     }
 
+    ParseMemoryInst MemInst(Inst, TTI);
     // If this is a non-volatile load, process it.
-    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+    if (MemInst.isValid() && MemInst.isLoad()) {
       // Ignore volatile loads.
-      if (!LI->isSimple()) {
+      if (MemInst.isVolatile()) {
         LastStore = nullptr;
+        // Don't CSE across synchronization boundaries.
+        if (Inst->mayWriteToMemory())
+          ++CurrentGeneration;
         continue;
       }
 
       // If we have an available version of this load, and if it is the right
       // generation, replace this instruction.
-      std::pair<Value*, unsigned> InVal =
-        AvailableLoads->lookup(Inst->getOperand(0));
+      std::pair<Value *, unsigned> InVal =
+          AvailableLoads.lookup(MemInst.getPtr());
       if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
-        DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << "  to: "
-              << *InVal.first << '\n');
-        if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first);
-        Inst->eraseFromParent();
-        Changed = true;
-        ++NumCSELoad;
-        continue;
+        Value *Op = getOrCreateResult(InVal.first, Inst->getType());
+        if (Op != nullptr) {
+          DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+                       << "  to: " << *InVal.first << '\n');
+          if (!Inst->use_empty())
+            Inst->replaceAllUsesWith(Op);
+          Inst->eraseFromParent();
+          Changed = true;
+          ++NumCSELoad;
+          continue;
+        }
       }
 
       // Otherwise, remember that we have this instruction.
-      AvailableLoads->insert(Inst->getOperand(0),
-                          std::pair<Value*, unsigned>(Inst, CurrentGeneration));
+      AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
+                                                  Inst, CurrentGeneration));
       LastStore = nullptr;
       continue;
     }
 
     // If this instruction may read from memory, forget LastStore.
-    if (Inst->mayReadFromMemory())
+    // Load/store intrinsics will indicate both a read and a write to
+    // memory.  The target may override this (e.g. so that a store intrinsic
+    // does not read  from memory, and thus will be treated the same as a
+    // regular store for commoning purposes).
+    if (Inst->mayReadFromMemory() &&
+        !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
       LastStore = nullptr;
 
     // If this is a read-only call, process it.
     if (CallValue::canHandle(Inst)) {
       // If we have an available version of this call, and if it is the right
       // generation, replace this instruction.
-      std::pair<Value*, unsigned> InVal = AvailableCalls->lookup(Inst);
+      std::pair<Value *, unsigned> InVal = AvailableCalls.lookup(Inst);
       if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
-        DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << "  to: "
-                     << *InVal.first << '\n');
-        if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first);
+        DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
+                     << "  to: " << *InVal.first << '\n');
+        if (!Inst->use_empty())
+          Inst->replaceAllUsesWith(InVal.first);
         Inst->eraseFromParent();
         Changed = true;
         ++NumCSECall;
@@ -524,8 +584,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
       }
 
       // Otherwise, remember that we have this instruction.
-      AvailableCalls->insert(Inst,
-                         std::pair<Value*, unsigned>(Inst, CurrentGeneration));
+      AvailableCalls.insert(
+          Inst, std::pair<Value *, unsigned>(Inst, CurrentGeneration));
       continue;
     }
 
@@ -535,17 +595,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
     if (Inst->mayWriteToMemory()) {
       ++CurrentGeneration;
 
-      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      if (MemInst.isValid() && MemInst.isStore()) {
         // We do a trivial form of DSE if there are two stores to the same
         // location with no intervening loads.  Delete the earlier store.
-        if (LastStore &&
-            LastStore->getPointerOperand() == SI->getPointerOperand()) {
-          DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << "  due to: "
-                       << *Inst << '\n');
-          LastStore->eraseFromParent();
-          Changed = true;
-          ++NumDSE;
-          LastStore = nullptr;
+        if (LastStore) {
+          ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+          if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
+            DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+                         << "  due to: " << *Inst << '\n');
+            LastStore->eraseFromParent();
+            Changed = true;
+            ++NumDSE;
+            LastStore = nullptr;
+          }
           // fallthrough - we can exploit information about this store
         }
 
@@ -554,12 +616,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
         // version of the pointer.  It is safe to forward from volatile stores
         // to non-volatile loads, so we don't have to check for volatility of
         // the store.
-        AvailableLoads->insert(SI->getPointerOperand(),
-         std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration));
+        AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
+                                                    Inst, CurrentGeneration));
 
         // Remember that this was the last store we saw for DSE.
-        if (SI->isSimple())
-          LastStore = SI;
+        if (!MemInst.isVolatile())
+          LastStore = Inst;
       }
     }
   }
@@ -567,40 +629,20 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
   return Changed;
 }
 
-
-bool EarlyCSE::runOnFunction(Function &F) {
-  if (skipOptnoneFunction(F))
-    return false;
-
-  // Note, deque is being used here because there is significant performance gains
-  // over vector when the container becomes very large due to the specific access
-  // patterns. For more information see the mailing list discussion on this:
+bool EarlyCSE::run() {
+  // Note, deque is being used here because there is significant performance
+  // gains over vector when the container becomes very large due to the
+  // specific access patterns. For more information see the mailing list
+  // discussion on this:
   // http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
   std::deque<StackNode *> nodesToProcess;
 
-  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = &getAnalysis<TargetLibraryInfo>();
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  AT = &getAnalysis<AssumptionTracker>();
-
-  // Tables that the pass uses when walking the domtree.
-  ScopedHTType AVTable;
-  AvailableValues = &AVTable;
-  LoadHTType LoadTable;
-  AvailableLoads = &LoadTable;
-  CallHTType CallTable;
-  AvailableCalls = &CallTable;
-
-  CurrentGeneration = 0;
   bool Changed = false;
 
   // Process the root node.
-  nodesToProcess.push_back(
-      new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
-                    CurrentGeneration, DT->getRootNode(),
-                    DT->getRootNode()->begin(),
-                    DT->getRootNode()->end()));
+  nodesToProcess.push_back(new StackNode(
+      AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
+      DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
 
   // Save the current generation.
   unsigned LiveOutGeneration = CurrentGeneration;
@@ -624,11 +666,9 @@ bool EarlyCSE::runOnFunction(Function &F) {
       // Push the next child onto the stack.
       DomTreeNode *child = NodeToProcess->nextChild();
       nodesToProcess.push_back(
-          new StackNode(AvailableValues,
-                        AvailableLoads,
-                        AvailableCalls,
-                        NodeToProcess->childGeneration(), child,
-                        child->begin(), child->end()));
+          new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
+                        NodeToProcess->childGeneration(), child, child->begin(),
+                        child->end()));
     } else {
       // It has been processed, and there are no more children to process,
       // so delete it and pop it off the stack.
@@ -642,3 +682,78 @@ bool EarlyCSE::runOnFunction(Function &F) {
 
   return Changed;
 }
+
+PreservedAnalyses EarlyCSEPass::run(Function &F,
+                                    AnalysisManager<Function> *AM) {
+  const DataLayout *DL = F.getParent()->getDataLayout();
+
+  auto &TLI = AM->getResult<TargetLibraryAnalysis>(F);
+  auto &TTI = AM->getResult<TargetIRAnalysis>(F);
+  auto &DT = AM->getResult<DominatorTreeAnalysis>(F);
+  auto &AC = AM->getResult<AssumptionAnalysis>(F);
+
+  EarlyCSE CSE(F, DL, TLI, TTI, DT, AC);
+
+  if (!CSE.run())
+    return PreservedAnalyses::all();
+
+  // CSE preserves the dominator tree because it doesn't mutate the CFG.
+  // FIXME: Bundle this with other CFG-preservation.
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  return PA;
+}
+
+namespace {
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSELegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  EarlyCSELegacyPass() : FunctionPass(ID) {
+    initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipOptnoneFunction(F))
+      return false;
+
+    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+    auto *DL = DLP ? &DLP->getDataLayout() : nullptr;
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+    EarlyCSE CSE(F, DL, TLI, TTI, DT, AC);
+
+    return CSE.run();
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+};
+}
+
+char EarlyCSELegacyPass::ID = 0;
+
+FunctionPass *llvm::createEarlyCSEPass() { return new EarlyCSELegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 7dba4e2..73a1f25 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -20,11 +20,12 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -44,7 +45,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -457,7 +458,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) {
   return e;
 }
 
-/// lookup - Returns the value number of the specified value. Fails if
+/// Returns the value number of the specified value. Fails if
 /// the value has not yet been numbered.
 uint32_t ValueTable::lookup(Value *V) const {
   DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
@@ -465,7 +466,7 @@ uint32_t ValueTable::lookup(Value *V) const {
   return VI->second;
 }
 
-/// lookup_or_add_cmp - Returns the value number of the given comparison,
+/// Returns the value number of the given comparison,
 /// assigning it a new number if it did not have one before.  Useful when
 /// we deduced the result of a comparison, but don't immediately have an
 /// instruction realizing that comparison to hand.
@@ -478,14 +479,14 @@ uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode,
   return e;
 }
 
-/// clear - Remove all entries from the ValueTable.
+/// Remove all entries from the ValueTable.
 void ValueTable::clear() {
   valueNumbering.clear();
   expressionNumbering.clear();
   nextValueNumber = 1;
 }
 
-/// erase - Remove a value from the value numbering.
+/// Remove a value from the value numbering.
 void ValueTable::erase(Value *V) {
   valueNumbering.erase(V);
 }
@@ -581,8 +582,8 @@ namespace {
       return cast<MemIntrinsic>(Val.getPointer());
     }
   
-    /// MaterializeAdjustedValue - Emit code into this block to adjust the value
-    /// defined here to the specified type.  This handles various coercion cases.
+    /// Emit code into this block to adjust the value defined here to the
+    /// specified type. This handles various coercion cases.
     Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const;
   };
 
@@ -592,12 +593,12 @@ namespace {
     DominatorTree *DT;
     const DataLayout *DL;
     const TargetLibraryInfo *TLI;
-    AssumptionTracker *AT;
+    AssumptionCache *AC;
     SetVector<BasicBlock *> DeadBlocks;
 
     ValueTable VN;
 
-    /// LeaderTable - A mapping from value numbers to lists of Value*'s that
+    /// A mapping from value numbers to lists of Value*'s that
     /// have that value number.  Use findLeader to query it.
     struct LeaderTableEntry {
       Value *Val;
@@ -622,7 +623,7 @@ namespace {
 
     bool runOnFunction(Function &F) override;
 
-    /// markInstructionForDeletion - This removes the specified instruction from
+    /// This removes the specified instruction from
     /// our various maps and marks it for deletion.
     void markInstructionForDeletion(Instruction *I) {
       VN.erase(I);
@@ -634,8 +635,7 @@ namespace {
     AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
     MemoryDependenceAnalysis &getMemDep() const { return *MD; }
   private:
-    /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for
-    /// its value number.
+    /// Push a new Value to the LeaderTable onto the list for its value number.
     void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) {
       LeaderTableEntry &Curr = LeaderTable[N];
       if (!Curr.Val) {
@@ -651,7 +651,7 @@ namespace {
       Curr.Next = Node;
     }
 
-    /// removeFromLeaderTable - Scan the list of values corresponding to a given
+    /// Scan the list of values corresponding to a given
     /// value number, and remove the given instruction if encountered.
     void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) {
       LeaderTableEntry* Prev = nullptr;
@@ -682,9 +682,9 @@ namespace {
 
     // This transformation requires dominator postdominator info
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<AssumptionTracker>();
+      AU.addRequired<AssumptionCacheTracker>();
       AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
       if (!NoLoads)
         AU.addRequired<MemoryDependenceAnalysis>();
       AU.addRequired<AliasAnalysis>();
@@ -709,6 +709,9 @@ namespace {
     void dump(DenseMap<uint32_t, Value*> &d);
     bool iterateOnFunction(Function &F);
     bool performPRE(Function &F);
+    bool performScalarPRE(Instruction *I);
+    bool performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+                                   unsigned int ValNo);
     Value *findLeader(const BasicBlock *BB, uint32_t num);
     void cleanupGlobalSets();
     void verifyRemoved(const Instruction *I) const;
@@ -725,16 +728,16 @@ namespace {
   char GVN::ID = 0;
 }
 
-// createGVNPass - The public interface to this file...
+// The public interface to this file...
 FunctionPass *llvm::createGVNPass(bool NoLoads) {
   return new GVN(NoLoads);
 }
 
 INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
 
@@ -750,7 +753,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
 }
 #endif
 
-/// IsValueFullyAvailableInBlock - Return true if we can prove that the value
+/// Return true if we can prove that the value
 /// we're analyzing is fully available in the specified block.  As we go, keep
 /// track of which blocks we know are fully alive in FullyAvailableBlocks.  This
 /// map is actually a tri-state map with the following values:
@@ -796,7 +799,7 @@ static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
 
   return true;
 
-// SpeculationFailure - If we get here, we found out that this is not, after
+// If we get here, we found out that this is not, after
 // all, a fully-available block.  We have a problem if we speculated on this and
 // used the speculation to mark other blocks as available.
 SpeculationFailure:
@@ -831,8 +834,7 @@ SpeculationFailure:
 }
 
 
-/// CanCoerceMustAliasedValueToLoad - Return true if
-/// CoerceAvailableValueToLoadType will succeed.
+/// Return true if CoerceAvailableValueToLoadType will succeed.
 static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
                                             Type *LoadTy,
                                             const DataLayout &DL) {
@@ -851,7 +853,7 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
   return true;
 }
 
-/// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and
+/// If we saw a store of a value to memory, and
 /// then a load from a must-aliased pointer of a different type, try to coerce
 /// the stored value.  LoadedTy is the type of the load we want to replace and
 /// InsertPt is the place to insert new instructions.
@@ -936,7 +938,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
   return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt);
 }
 
-/// AnalyzeLoadFromClobberingWrite - This function is called when we have a
+/// This function is called when we have a
 /// memdep query of a load that ends up being a clobbering memory write (store,
 /// memset, memcpy, memmove).  This means that the write *may* provide bits used
 /// by the load but we can't be sure because the pointers don't mustalias.
@@ -1016,7 +1018,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
   return LoadOffset-StoreOffset;
 }
 
-/// AnalyzeLoadFromClobberingStore - This function is called when we have a
+/// This function is called when we have a
 /// memdep query of a load that ends up being a clobbering store.
 static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
                                           StoreInst *DepSI,
@@ -1032,7 +1034,7 @@ static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
                                         StorePtr, StoreSize, DL);
 }
 
-/// AnalyzeLoadFromClobberingLoad - This function is called when we have a
+/// This function is called when we have a
 /// memdep query of a load that ends up being clobbered by another load.  See if
 /// the other load can feed into the second load.
 static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
@@ -1108,7 +1110,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
 }
 
 
-/// GetStoreValueForLoad - This function is called when we have a
+/// This function is called when we have a
 /// memdep query of a load that ends up being a clobbering store.  This means
 /// that the store provides bits used by the load but we the pointers don't
 /// mustalias.  Check this case to see if there is anything more we can do
@@ -1147,7 +1149,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
   return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, DL);
 }
 
-/// GetLoadValueForLoad - This function is called when we have a
+/// This function is called when we have a
 /// memdep query of a load that ends up being a clobbering load.  This means
 /// that the load *may* provide bits used by the load but we can't be sure
 /// because the pointers don't mustalias.  Check this case to see if there is
@@ -1210,7 +1212,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
 }
 
 
-/// GetMemInstValueForLoad - This function is called when we have a
+/// This function is called when we have a
 /// memdep query of a load that ends up being a clobbering mem intrinsic.
 static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
                                      Type *LoadTy, Instruction *InsertPt,
@@ -1267,7 +1269,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
 }
 
 
-/// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock,
+/// Given a set of loads specified by ValuesPerBlock,
 /// construct SSA form, allowing us to eliminate LI.  This returns the value
 /// that should be used at LI's definition site.
 static Value *ConstructSSAForLoadSet(LoadInst *LI,
@@ -1621,7 +1623,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
     // If all preds have a single successor, then we know it is safe to insert
     // the load on the pred (?!?), so we can insert code to materialize the
     // pointer if it is not available.
-    PHITransAddr Address(LI->getPointerOperand(), DL, AT);
+    PHITransAddr Address(LI->getPointerOperand(), DL, AC);
     Value *LoadPtr = nullptr;
     LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
                                                 *DT, NewInsts);
@@ -1702,13 +1704,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
   return true;
 }
 
-/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are
+/// Attempt to eliminate a load whose dependencies are
 /// non-local by performing PHI construction.
 bool GVN::processNonLocalLoad(LoadInst *LI) {
   // Step 1: Find the non-local dependencies of the load.
   LoadDepVect Deps;
-  AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI);
-  MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps);
+  MD->getNonLocalPointerDependency(LI, Deps);
 
   // If we had to process more than one hundred blocks to find the
   // dependencies, this load isn't worth worrying about.  Optimizing
@@ -1729,6 +1730,15 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
     return false;
   }
 
+  // If this load follows a GEP, see if we can PRE the indices before analyzing.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
+    for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
+                                        OE = GEP->idx_end();
+         OI != OE; ++OI)
+      if (Instruction *I = dyn_cast<Instruction>(OI->get()))
+        performScalarPRE(I);
+  }
+
   // Step 2: Analyze the availability of the load
   AvailValInBlkVect ValuesPerBlock;
   UnavailBlkVect UnavailableBlocks;
@@ -1807,7 +1817,7 @@ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
   I->replaceAllUsesWith(Repl);
 }
 
-/// processLoad - Attempt to eliminate a load, first by eliminating it
+/// Attempt to eliminate a load, first by eliminating it
 /// locally, and then attempting non-local elimination if that fails.
 bool GVN::processLoad(LoadInst *L) {
   if (!MD)
@@ -2006,7 +2016,7 @@ bool GVN::processLoad(LoadInst *L) {
   return false;
 }
 
-// findLeader - In order to find a leader for a given value number at a
+// In order to find a leader for a given value number at a
 // specific basic block, we first obtain the list of all Values for that number,
 // and then scan the list to find one whose block dominates the block in
 // question.  This is fast because dominator tree queries consist of only
@@ -2034,9 +2044,8 @@ Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
   return Val;
 }
 
-/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the
-/// use is dominated by the given basic block.  Returns the number of uses that
-/// were replaced.
+/// Replace all uses of 'From' with 'To' if the use is dominated by the given
+/// basic block.  Returns the number of uses that were replaced.
 unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To,
                                           const BasicBlockEdge &Root) {
   unsigned Count = 0;
@@ -2052,7 +2061,7 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To,
   return Count;
 }
 
-/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'.  Return
+/// There is an edge from 'Src' to 'Dst'.  Return
 /// true if every path from the entry block to 'Dst' passes via this edge.  In
 /// particular 'Dst' must not be reachable via another edge from 'Src'.
 static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
@@ -2069,7 +2078,7 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
   return Pred != nullptr;
 }
 
-/// propagateEquality - The given values are known to be equal in every block
+/// The given values are known to be equal in every block
 /// dominated by 'Root'.  Exploit this, for example by replacing 'LHS' with
 /// 'RHS' everywhere in the scope.  Returns whether a change was made.
 bool GVN::propagateEquality(Value *LHS, Value *RHS,
@@ -2096,15 +2105,15 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
       std::swap(LHS, RHS);
     assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
 
-    // If there is no obvious reason to prefer the left-hand side over the right-
-    // hand side, ensure the longest lived term is on the right-hand side, so the
-    // shortest lived term will be replaced by the longest lived.  This tends to
-    // expose more simplifications.
+    // If there is no obvious reason to prefer the left-hand side over the
+    // right-hand side, ensure the longest lived term is on the right-hand side,
+    // so the shortest lived term will be replaced by the longest lived.
+    // This tends to expose more simplifications.
     uint32_t LVN = VN.lookup_or_add(LHS);
     if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
         (isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
-      // Move the 'oldest' value to the right-hand side, using the value number as
-      // a proxy for age.
+      // Move the 'oldest' value to the right-hand side, using the value number
+      // as a proxy for age.
       uint32_t RVN = VN.lookup_or_add(RHS);
       if (LVN < RVN) {
         std::swap(LHS, RHS);
@@ -2133,10 +2142,10 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
       NumGVNEqProp += NumReplacements;
     }
 
-    // Now try to deduce additional equalities from this one.  For example, if the
-    // known equality was "(A != B)" == "false" then it follows that A and B are
-    // equal in the scope.  Only boolean equalities with an explicit true or false
-    // RHS are currently supported.
+    // Now try to deduce additional equalities from this one. For example, if
+    // the known equality was "(A != B)" == "false" then it follows that A and B
+    // are equal in the scope. Only boolean equalities with an explicit true or
+    // false RHS are currently supported.
     if (!RHS->getType()->isIntegerTy(1))
       // Not a boolean equality - bail out.
       continue;
@@ -2161,7 +2170,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
     // If we are propagating an equality like "(A == B)" == "true" then also
     // propagate the equality A == B.  When propagating a comparison such as
     // "(A >= B)" == "true", replace all instances of "A < B" with "false".
-    if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) {
+    if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) {
       Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
 
       // If "A == B" is known true, or "A != B" is known false, then replace
@@ -2170,12 +2179,28 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
           (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE))
         Worklist.push_back(std::make_pair(Op0, Op1));
 
+      // Handle the floating point versions of equality comparisons too.
+      if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) ||
+          (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) {
+
+        // Floating point -0.0 and 0.0 compare equal, so we can only
+        // propagate values if we know that we have a constant and that
+        // its value is non-zero.
+        
+        // FIXME: We should do this optimization if 'no signed zeros' is
+        // applicable via an instruction-level fast-math-flag or some other
+        // indicator that relaxed FP semantics are being used.
+
+        if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero())
+          Worklist.push_back(std::make_pair(Op0, Op1));
+      }
+ 
       // If "A >= B" is known true, replace "A < B" with false everywhere.
       CmpInst::Predicate NotPred = Cmp->getInversePredicate();
       Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
-      // Since we don't have the instruction "A < B" immediately to hand, work out
-      // the value number that it would have and use that to find an appropriate
-      // instruction (if any).
+      // Since we don't have the instruction "A < B" immediately to hand, work
+      // out the value number that it would have and use that to find an
+      // appropriate instruction (if any).
       uint32_t NextNum = VN.getNextUnusedValueNumber();
       uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1);
       // If the number we were assigned was brand new then there is no point in
@@ -2203,7 +2228,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
   return Changed;
 }
 
-/// processInstruction - When calculating availability, handle an instruction
+/// When calculating availability, handle an instruction
 /// by inserting it into the appropriate sets
 bool GVN::processInstruction(Instruction *I) {
   // Ignore dbg info intrinsics.
@@ -2214,7 +2239,7 @@ bool GVN::processInstruction(Instruction *I) {
   // to value numbering it.  Value numbering often exposes redundancies, for
   // example if it determines that %y is equal to %x then the instruction
   // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
-  if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AT)) {
+  if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
     I->replaceAllUsesWith(V);
     if (MD && V->getType()->getScalarType()->isPointerTy())
       MD->invalidateCachedPointerInfo(V);
@@ -2334,8 +2359,8 @@ bool GVN::runOnFunction(Function& F) {
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  AT = &getAnalysis<AssumptionTracker>();
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
   VN.setMemDep(MD);
   VN.setDomTree(DT);
@@ -2348,7 +2373,8 @@ bool GVN::runOnFunction(Function& F) {
   for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
     BasicBlock *BB = FI++;
 
-    bool removedBlock = MergeBlockIntoPredecessor(BB, this);
+    bool removedBlock = MergeBlockIntoPredecessor(
+        BB, DT, /* LoopInfo */ nullptr, VN.getAliasAnalysis(), MD);
     if (removedBlock) ++NumGVNBlocks;
 
     Changed |= removedBlock;
@@ -2431,175 +2457,204 @@ bool GVN::processBlock(BasicBlock *BB) {
   return ChangedFunction;
 }
 
-/// performPRE - Perform a purely local form of PRE that looks for diamond
-/// control flow patterns and attempts to perform simple PRE at the join point.
-bool GVN::performPRE(Function &F) {
-  bool Changed = false;
+// Instantiate an expression in a predecessor that lacked it.
+bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+                                    unsigned int ValNo) {
+  // Because we are going top-down through the block, all value numbers
+  // will be available in the predecessor by the time we need them.  Any
+  // that weren't originally present will have been instantiated earlier
+  // in this loop.
+  bool success = true;
+  for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) {
+    Value *Op = Instr->getOperand(i);
+    if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
+      continue;
+
+    if (Value *V = findLeader(Pred, VN.lookup(Op))) {
+      Instr->setOperand(i, V);
+    } else {
+      success = false;
+      break;
+    }
+  }
+
+  // Fail out if we encounter an operand that is not available in
+  // the PRE predecessor.  This is typically because of loads which
+  // are not value numbered precisely.
+  if (!success)
+    return false;
+
+  Instr->insertBefore(Pred->getTerminator());
+  Instr->setName(Instr->getName() + ".pre");
+  Instr->setDebugLoc(Instr->getDebugLoc());
+  VN.add(Instr, ValNo);
+
+  // Update the availability map to include the new instruction.
+  addToLeaderTable(ValNo, Instr, Pred);
+  return true;
+}
+
+bool GVN::performScalarPRE(Instruction *CurInst) {
   SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap;
-  for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
-    // Nothing to PRE in the entry block.
-    if (CurrentBlock == &F.getEntryBlock()) continue;
 
-    // Don't perform PRE on a landing pad.
-    if (CurrentBlock->isLandingPad()) continue;
+  if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) ||
+      isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
+      CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+      isa<DbgInfoIntrinsic>(CurInst))
+    return false;
 
-    for (BasicBlock::iterator BI = CurrentBlock->begin(),
-         BE = CurrentBlock->end(); BI != BE; ) {
-      Instruction *CurInst = BI++;
+  // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
+  // sinking the compare again, and it would force the code generator to
+  // move the i1 from processor flags or predicate registers into a general
+  // purpose register.
+  if (isa<CmpInst>(CurInst))
+    return false;
 
-      if (isa<AllocaInst>(CurInst) ||
-          isa<TerminatorInst>(CurInst) || isa<PHINode>(CurInst) ||
-          CurInst->getType()->isVoidTy() ||
-          CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
-          isa<DbgInfoIntrinsic>(CurInst))
-        continue;
+  // We don't currently value number ANY inline asm calls.
+  if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
+    if (CallI->isInlineAsm())
+      return false;
 
-      // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
-      // sinking the compare again, and it would force the code generator to
-      // move the i1 from processor flags or predicate registers into a general
-      // purpose register.
-      if (isa<CmpInst>(CurInst))
-        continue;
+  uint32_t ValNo = VN.lookup(CurInst);
+
+  // Look for the predecessors for PRE opportunities.  We're
+  // only trying to solve the basic diamond case, where
+  // a value is computed in the successor and one predecessor,
+  // but not the other.  We also explicitly disallow cases
+  // where the successor is its own predecessor, because they're
+  // more complicated to get right.
+  unsigned NumWith = 0;
+  unsigned NumWithout = 0;
+  BasicBlock *PREPred = nullptr;
+  BasicBlock *CurrentBlock = CurInst->getParent();
+  predMap.clear();
+
+  for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock);
+       PI != PE; ++PI) {
+    BasicBlock *P = *PI;
+    // We're not interested in PRE where the block is its
+    // own predecessor, or in blocks with predecessors
+    // that are not reachable.
+    if (P == CurrentBlock) {
+      NumWithout = 2;
+      break;
+    } else if (!DT->isReachableFromEntry(P)) {
+      NumWithout = 2;
+      break;
+    }
 
-      // We don't currently value number ANY inline asm calls.
-      if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
-        if (CallI->isInlineAsm())
-          continue;
+    Value *predV = findLeader(P, ValNo);
+    if (!predV) {
+      predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
+      PREPred = P;
+      ++NumWithout;
+    } else if (predV == CurInst) {
+      /* CurInst dominates this predecessor. */
+      NumWithout = 2;
+      break;
+    } else {
+      predMap.push_back(std::make_pair(predV, P));
+      ++NumWith;
+    }
+  }
 
-      uint32_t ValNo = VN.lookup(CurInst);
-
-      // Look for the predecessors for PRE opportunities.  We're
-      // only trying to solve the basic diamond case, where
-      // a value is computed in the successor and one predecessor,
-      // but not the other.  We also explicitly disallow cases
-      // where the successor is its own predecessor, because they're
-      // more complicated to get right.
-      unsigned NumWith = 0;
-      unsigned NumWithout = 0;
-      BasicBlock *PREPred = nullptr;
-      predMap.clear();
-
-      for (pred_iterator PI = pred_begin(CurrentBlock),
-           PE = pred_end(CurrentBlock); PI != PE; ++PI) {
-        BasicBlock *P = *PI;
-        // We're not interested in PRE where the block is its
-        // own predecessor, or in blocks with predecessors
-        // that are not reachable.
-        if (P == CurrentBlock) {
-          NumWithout = 2;
-          break;
-        } else if (!DT->isReachableFromEntry(P))  {
-          NumWithout = 2;
-          break;
-        }
+  // Don't do PRE when it might increase code size, i.e. when
+  // we would need to insert instructions in more than one pred.
+  if (NumWithout > 1 || NumWith == 0)
+    return false;
 
-        Value* predV = findLeader(P, ValNo);
-        if (!predV) {
-          predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
-          PREPred = P;
-          ++NumWithout;
-        } else if (predV == CurInst) {
-          /* CurInst dominates this predecessor. */
-          NumWithout = 2;
-          break;
-        } else {
-          predMap.push_back(std::make_pair(predV, P));
-          ++NumWith;
-        }
-      }
+  // We may have a case where all predecessors have the instruction,
+  // and we just need to insert a phi node. Otherwise, perform
+  // insertion.
+  Instruction *PREInstr = nullptr;
 
-      // Don't do PRE when it might increase code size, i.e. when
-      // we would need to insert instructions in more than one pred.
-      if (NumWithout != 1 || NumWith == 0)
-        continue;
+  if (NumWithout != 0) {
+    // Don't do PRE across indirect branch.
+    if (isa<IndirectBrInst>(PREPred->getTerminator()))
+      return false;
 
-      // Don't do PRE across indirect branch.
-      if (isa<IndirectBrInst>(PREPred->getTerminator()))
-        continue;
+    // We can't do PRE safely on a critical edge, so instead we schedule
+    // the edge to be split and perform the PRE the next time we iterate
+    // on the function.
+    unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
+    if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
+      toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
+      return false;
+    }
+    // We need to insert somewhere, so let's give it a shot
+    PREInstr = CurInst->clone();
+    if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) {
+      // If we failed insertion, make sure we remove the instruction.
+      DEBUG(verifyRemoved(PREInstr));
+      delete PREInstr;
+      return false;
+    }
+  }
 
-      // We can't do PRE safely on a critical edge, so instead we schedule
-      // the edge to be split and perform the PRE the next time we iterate
-      // on the function.
-      unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
-      if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
-        toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
-        continue;
-      }
+  // Either we should have filled in the PRE instruction, or we should
+  // not have needed insertions.
+  assert (PREInstr != nullptr || NumWithout == 0);
 
-      // Instantiate the expression in the predecessor that lacked it.
-      // Because we are going top-down through the block, all value numbers
-      // will be available in the predecessor by the time we need them.  Any
-      // that weren't originally present will have been instantiated earlier
-      // in this loop.
-      Instruction *PREInstr = CurInst->clone();
-      bool success = true;
-      for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) {
-        Value *Op = PREInstr->getOperand(i);
-        if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
-          continue;
+  ++NumGVNPRE;
 
-        if (Value *V = findLeader(PREPred, VN.lookup(Op))) {
-          PREInstr->setOperand(i, V);
-        } else {
-          success = false;
-          break;
-        }
-      }
+  // Create a PHI to make the value available in this block.
+  PHINode *Phi =
+      PHINode::Create(CurInst->getType(), predMap.size(),
+                      CurInst->getName() + ".pre-phi", CurrentBlock->begin());
+  for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
+    if (Value *V = predMap[i].first)
+      Phi->addIncoming(V, predMap[i].second);
+    else
+      Phi->addIncoming(PREInstr, PREPred);
+  }
+
+  VN.add(Phi, ValNo);
+  addToLeaderTable(ValNo, Phi, CurrentBlock);
+  Phi->setDebugLoc(CurInst->getDebugLoc());
+  CurInst->replaceAllUsesWith(Phi);
+  if (Phi->getType()->getScalarType()->isPointerTy()) {
+    // Because we have added a PHI-use of the pointer value, it has now
+    // "escaped" from alias analysis' perspective.  We need to inform
+    // AA of this.
+    for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) {
+      unsigned jj = PHINode::getOperandNumForIncomingValue(ii);
+      VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj));
+    }
 
-      // Fail out if we encounter an operand that is not available in
-      // the PRE predecessor.  This is typically because of loads which
-      // are not value numbered precisely.
-      if (!success) {
-        DEBUG(verifyRemoved(PREInstr));
-        delete PREInstr;
-        continue;
-      }
+    if (MD)
+      MD->invalidateCachedPointerInfo(Phi);
+  }
+  VN.erase(CurInst);
+  removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
 
-      PREInstr->insertBefore(PREPred->getTerminator());
-      PREInstr->setName(CurInst->getName() + ".pre");
-      PREInstr->setDebugLoc(CurInst->getDebugLoc());
-      VN.add(PREInstr, ValNo);
-      ++NumGVNPRE;
-
-      // Update the availability map to include the new instruction.
-      addToLeaderTable(ValNo, PREInstr, PREPred);
-
-      // Create a PHI to make the value available in this block.
-      PHINode* Phi = PHINode::Create(CurInst->getType(), predMap.size(),
-                                     CurInst->getName() + ".pre-phi",
-                                     CurrentBlock->begin());
-      for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
-        if (Value *V = predMap[i].first)
-          Phi->addIncoming(V, predMap[i].second);
-        else
-          Phi->addIncoming(PREInstr, PREPred);
-      }
+  DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+  if (MD)
+    MD->removeInstruction(CurInst);
+  DEBUG(verifyRemoved(CurInst));
+  CurInst->eraseFromParent();
+  ++NumGVNInstr;
+  
+  return true;
+}
 
-      VN.add(Phi, ValNo);
-      addToLeaderTable(ValNo, Phi, CurrentBlock);
-      Phi->setDebugLoc(CurInst->getDebugLoc());
-      CurInst->replaceAllUsesWith(Phi);
-      if (Phi->getType()->getScalarType()->isPointerTy()) {
-        // Because we have added a PHI-use of the pointer value, it has now
-        // "escaped" from alias analysis' perspective.  We need to inform
-        // AA of this.
-        for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee;
-             ++ii) {
-          unsigned jj = PHINode::getOperandNumForIncomingValue(ii);
-          VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj));
-        }
+/// Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
+bool GVN::performPRE(Function &F) {
+  bool Changed = false;
+  for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
+    // Nothing to PRE in the entry block.
+    if (CurrentBlock == &F.getEntryBlock())
+      continue;
 
-        if (MD)
-          MD->invalidateCachedPointerInfo(Phi);
-      }
-      VN.erase(CurInst);
-      removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+    // Don't perform PRE on a landing pad.
+    if (CurrentBlock->isLandingPad())
+      continue;
 
-      DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
-      if (MD) MD->removeInstruction(CurInst);
-      DEBUG(verifyRemoved(CurInst));
-      CurInst->eraseFromParent();
-      Changed = true;
+    for (BasicBlock::iterator BI = CurrentBlock->begin(),
+                              BE = CurrentBlock->end();
+         BI != BE;) {
+      Instruction *CurInst = BI++;
+      Changed = performScalarPRE(CurInst);
     }
   }
 
@@ -2612,50 +2667,48 @@ bool GVN::performPRE(Function &F) {
 /// Split the critical edge connecting the given two blocks, and return
 /// the block inserted to the critical edge.
 BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
-  BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this);
+  BasicBlock *BB = SplitCriticalEdge(
+      Pred, Succ, CriticalEdgeSplittingOptions(getAliasAnalysis(), DT));
   if (MD)
     MD->invalidateCachedPredecessors();
   return BB;
 }
 
-/// splitCriticalEdges - Split critical edges found during the previous
+/// Split critical edges found during the previous
 /// iteration that may enable further optimization.
 bool GVN::splitCriticalEdges() {
   if (toSplit.empty())
     return false;
   do {
     std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val();
-    SplitCriticalEdge(Edge.first, Edge.second, this);
+    SplitCriticalEdge(Edge.first, Edge.second,
+                      CriticalEdgeSplittingOptions(getAliasAnalysis(), DT));
   } while (!toSplit.empty());
   if (MD) MD->invalidateCachedPredecessors();
   return true;
 }
 
-/// iterateOnFunction - Executes one iteration of GVN
+/// Executes one iteration of GVN
 bool GVN::iterateOnFunction(Function &F) {
   cleanupGlobalSets();
 
   // Top-down walk of the dominator tree
   bool Changed = false;
-#if 0
-  // Needed for value numbering with phi construction to work.
-  ReversePostOrderTraversal<Function*> RPOT(&F);
-  for (ReversePostOrderTraversal<Function*>::rpo_iterator RI = RPOT.begin(),
-       RE = RPOT.end(); RI != RE; ++RI)
-    Changed |= processBlock(*RI);
-#else
   // Save the blocks this function have before transformation begins. GVN may
   // split critical edge, and hence may invalidate the RPO/DT iterator.
   //
   std::vector<BasicBlock *> BBVect;
   BBVect.reserve(256);
-  for (DomTreeNode *X : depth_first(DT->getRootNode()))
-    BBVect.push_back(X->getBlock());
+  // Needed for value numbering with phi construction to work.
+  ReversePostOrderTraversal<Function *> RPOT(&F);
+  for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(),
+                                                           RE = RPOT.end();
+       RI != RE; ++RI)
+    BBVect.push_back(*RI);
 
   for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
        I != E; I++)
     Changed |= processBlock(*I);
-#endif
 
   return Changed;
 }
@@ -2666,7 +2719,7 @@ void GVN::cleanupGlobalSets() {
   TableAllocator.Reset();
 }
 
-/// verifyRemoved - Verify that the specified instruction does not occur in our
+/// Verify that the specified instruction does not occur in our
 /// internal data structures.
 void GVN::verifyRemoved(const Instruction *Inst) const {
   VN.verifyRemoved(Inst);
@@ -2685,11 +2738,10 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
   }
 }
 
-// BB is declared dead, which implied other blocks become dead as well. This
-// function is to add all these blocks to "DeadBlocks". For the dead blocks'
-// live successors, update their phi nodes by replacing the operands
-// corresponding to dead blocks with UndefVal.
-//
+/// BB is declared dead, which implied other blocks become dead as well. This
+/// function is to add all these blocks to "DeadBlocks". For the dead blocks'
+/// live successors, update their phi nodes by replacing the operands
+/// corresponding to dead blocks with UndefVal.
 void GVN::addDeadBlock(BasicBlock *BB) {
   SmallVector<BasicBlock *, 4> NewDead;
   SmallSetVector<BasicBlock *, 4> DF;
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index c01f57f..f99ebbc 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -44,7 +44,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
@@ -91,7 +91,7 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
       AU.addRequired<ScalarEvolution>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addRequiredID(LCSSAID);
@@ -126,7 +126,7 @@ char IndVarSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",
                 "Induction Variable Simplification", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -1929,13 +1929,15 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   if (!L->isLoopSimplifyForm())
     return false;
 
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   SE = &getAnalysis<ScalarEvolution>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
-  TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  TLI = TLIP ? &TLIP->getTLI() : nullptr;
+  auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+  TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
 
   DeadInsts.clear();
   Changed = false;
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
new file mode 100644
index 0000000..8559e63
--- /dev/null
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -0,0 +1,1422 @@
+//===-- InductiveRangeCheckElimination.cpp - ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The InductiveRangeCheckElimination pass splits a loop's iteration space into
+// three disjoint ranges.  It does that in a way such that the loop running in
+// the middle loop provably does not need range checks. As an example, it will
+// convert
+//
+//   len = < known positive >
+//   for (i = 0; i < n; i++) {
+//     if (0 <= i && i < len) {
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//
+// to
+//
+//   len = < known positive >
+//   limit = smin(n, len)
+//   // no first segment
+//   for (i = 0; i < limit; i++) {
+//     if (0 <= i && i < len) { // this check is fully redundant
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//   for (i = limit; i < n; i++) {
+//     if (0 <= i && i < len) {
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Optional.h"
+
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+
+#include "llvm/Support/Debug.h"
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+
+#include "llvm/Pass.h"
+
+#include <array>
+
+using namespace llvm;
+
+static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
+                                        cl::init(64));
+
+static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
+                                       cl::init(false));
+
+static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
+                                          cl::Hidden, cl::init(10));
+
+#define DEBUG_TYPE "irce"
+
+namespace {
+
+/// An inductive range check is conditional branch in a loop with
+///
+///  1. a very cold successor (i.e. the branch jumps to that successor very
+///     rarely)
+///
+///  and
+///
+///  2. a condition that is provably true for some range of values taken by the
+///     containing loop's induction variable.
+///
+/// Currently all inductive range checks are branches conditional on an
+/// expression of the form
+///
+///   0 <= (Offset + Scale * I) < Length
+///
+/// where `I' is the canonical induction variable of a loop to which Offset and
+/// Scale are loop invariant, and Length is >= 0.  Currently the 'false' branch
+/// is considered cold, looking at profiling data to verify that is a TODO.
+
+class InductiveRangeCheck {
+  const SCEV *Offset;
+  const SCEV *Scale;
+  Value *Length;
+  BranchInst *Branch;
+
+  InductiveRangeCheck() :
+    Offset(nullptr), Scale(nullptr), Length(nullptr), Branch(nullptr) { }
+
+public:
+  const SCEV *getOffset() const { return Offset; }
+  const SCEV *getScale() const { return Scale; }
+  Value *getLength() const { return Length; }
+
+  void print(raw_ostream &OS) const {
+    OS << "InductiveRangeCheck:\n";
+    OS << "  Offset: ";
+    Offset->print(OS);
+    OS << "  Scale: ";
+    Scale->print(OS);
+    OS << "  Length: ";
+    Length->print(OS);
+    OS << "  Branch: ";
+    getBranch()->print(OS);
+    OS << "\n";
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void dump() {
+    print(dbgs());
+  }
+#endif
+
+  BranchInst *getBranch() const { return Branch; }
+
+  /// Represents an signed integer range [Range.getBegin(), Range.getEnd()).  If
+  /// R.getEnd() sle R.getBegin(), then R denotes the empty range.
+
+  class Range {
+    const SCEV *Begin;
+    const SCEV *End;
+
+  public:
+    Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
+      assert(Begin->getType() == End->getType() && "ill-typed range!");
+    }
+
+    Type *getType() const { return Begin->getType(); }
+    const SCEV *getBegin() const { return Begin; }
+    const SCEV *getEnd() const { return End; }
+  };
+
+  typedef SpecificBumpPtrAllocator<InductiveRangeCheck> AllocatorTy;
+
+  /// This is the value the condition of the branch needs to evaluate to for the
+  /// branch to take the hot successor (see (1) above).
+  bool getPassingDirection() { return true; }
+
+  /// Computes a range for the induction variable (IndVar) in which the range
+  /// check is redundant and can be constant-folded away.  The induction
+  /// variable is not required to be the canonical {0,+,1} induction variable.
+  Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
+                                            const SCEVAddRecExpr *IndVar,
+                                            IRBuilder<> &B) const;
+
+  /// Create an inductive range check out of BI if possible, else return
+  /// nullptr.
+  static InductiveRangeCheck *create(AllocatorTy &Alloc, BranchInst *BI,
+                                     Loop *L, ScalarEvolution &SE,
+                                     BranchProbabilityInfo &BPI);
+};
+
+class InductiveRangeCheckElimination : public LoopPass {
+  InductiveRangeCheck::AllocatorTy Allocator;
+
+public:
+  static char ID;
+  InductiveRangeCheckElimination() : LoopPass(ID) {
+    initializeInductiveRangeCheckEliminationPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<BranchProbabilityInfo>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+char InductiveRangeCheckElimination::ID = 0;
+}
+
+INITIALIZE_PASS(InductiveRangeCheckElimination, "irce",
+                "Inductive range check elimination", false, false)
+
+static bool IsLowerBoundCheck(Value *Check, Value *&IndexV) {
+  using namespace llvm::PatternMatch;
+
+  ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+  Value *LHS = nullptr, *RHS = nullptr;
+
+  if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS))))
+    return false;
+
+  switch (Pred) {
+  default:
+    return false;
+
+  case ICmpInst::ICMP_SLE:
+    std::swap(LHS, RHS);
+  // fallthrough
+  case ICmpInst::ICMP_SGE:
+    if (!match(RHS, m_ConstantInt<0>()))
+      return false;
+    IndexV = LHS;
+    return true;
+
+  case ICmpInst::ICMP_SLT:
+    std::swap(LHS, RHS);
+  // fallthrough
+  case ICmpInst::ICMP_SGT:
+    if (!match(RHS, m_ConstantInt<-1>()))
+      return false;
+    IndexV = LHS;
+    return true;
+  }
+}
+
+static bool IsUpperBoundCheck(Value *Check, Value *Index, Value *&UpperLimit) {
+  using namespace llvm::PatternMatch;
+
+  ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+  Value *LHS = nullptr, *RHS = nullptr;
+
+  if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS))))
+    return false;
+
+  switch (Pred) {
+  default:
+    return false;
+
+  case ICmpInst::ICMP_SGT:
+    std::swap(LHS, RHS);
+  // fallthrough
+  case ICmpInst::ICMP_SLT:
+    if (LHS != Index)
+      return false;
+    UpperLimit = RHS;
+    return true;
+
+  case ICmpInst::ICMP_UGT:
+    std::swap(LHS, RHS);
+  // fallthrough
+  case ICmpInst::ICMP_ULT:
+    if (LHS != Index)
+      return false;
+    UpperLimit = RHS;
+    return true;
+  }
+}
+
+/// Split a condition into something semantically equivalent to (0 <= I <
+/// Limit), both comparisons signed and Len loop invariant on L and positive.
+/// On success, return true and set Index to I and UpperLimit to Limit.  Return
+/// false on failure (we may still write to UpperLimit and Index on failure).
+/// It does not try to interpret I as a loop index.
+///
+static bool SplitRangeCheckCondition(Loop *L, ScalarEvolution &SE,
+                                     Value *Condition, const SCEV *&Index,
+                                     Value *&UpperLimit) {
+
+  // TODO: currently this catches some silly cases like comparing "%idx slt 1".
+  // Our transformations are still correct, but less likely to be profitable in
+  // those cases.  We have to come up with some heuristics that pick out the
+  // range checks that are more profitable to clone a loop for.  This function
+  // in general can be made more robust.
+
+  using namespace llvm::PatternMatch;
+
+  Value *A = nullptr;
+  Value *B = nullptr;
+  ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+
+  // In these early checks we assume that the matched UpperLimit is positive.
+  // We'll verify that fact later, before returning true.
+
+  if (match(Condition, m_And(m_Value(A), m_Value(B)))) {
+    Value *IndexV = nullptr;
+    Value *ExpectedUpperBoundCheck = nullptr;
+
+    if (IsLowerBoundCheck(A, IndexV))
+      ExpectedUpperBoundCheck = B;
+    else if (IsLowerBoundCheck(B, IndexV))
+      ExpectedUpperBoundCheck = A;
+    else
+      return false;
+
+    if (!IsUpperBoundCheck(ExpectedUpperBoundCheck, IndexV, UpperLimit))
+      return false;
+
+    Index = SE.getSCEV(IndexV);
+
+    if (isa<SCEVCouldNotCompute>(Index))
+      return false;
+
+  } else if (match(Condition, m_ICmp(Pred, m_Value(A), m_Value(B)))) {
+    switch (Pred) {
+    default:
+      return false;
+
+    case ICmpInst::ICMP_SGT:
+      std::swap(A, B);
+    // fall through
+    case ICmpInst::ICMP_SLT:
+      UpperLimit = B;
+      Index = SE.getSCEV(A);
+      if (isa<SCEVCouldNotCompute>(Index) || !SE.isKnownNonNegative(Index))
+        return false;
+      break;
+
+    case ICmpInst::ICMP_UGT:
+      std::swap(A, B);
+    // fall through
+    case ICmpInst::ICMP_ULT:
+      UpperLimit = B;
+      Index = SE.getSCEV(A);
+      if (isa<SCEVCouldNotCompute>(Index))
+        return false;
+      break;
+    }
+  } else {
+    return false;
+  }
+
+  const SCEV *UpperLimitSCEV = SE.getSCEV(UpperLimit);
+  if (isa<SCEVCouldNotCompute>(UpperLimitSCEV) ||
+      !SE.isKnownNonNegative(UpperLimitSCEV))
+    return false;
+
+  if (SE.getLoopDisposition(UpperLimitSCEV, L) !=
+      ScalarEvolution::LoopInvariant) {
+    DEBUG(dbgs() << " in function: " << L->getHeader()->getParent()->getName()
+                 << " ";
+          dbgs() << " UpperLimit is not loop invariant: "
+                 << UpperLimit->getName() << "\n";);
+    return false;
+  }
+
+  return true;
+}
+
+
+InductiveRangeCheck *
+InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI,
+                            Loop *L, ScalarEvolution &SE,
+                            BranchProbabilityInfo &BPI) {
+
+  if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
+    return nullptr;
+
+  BranchProbability LikelyTaken(15, 16);
+
+  if (BPI.getEdgeProbability(BI->getParent(), (unsigned) 0) < LikelyTaken)
+    return nullptr;
+
+  Value *Length = nullptr;
+  const SCEV *IndexSCEV = nullptr;
+
+  if (!SplitRangeCheckCondition(L, SE, BI->getCondition(), IndexSCEV, Length))
+    return nullptr;
+
+  assert(IndexSCEV && Length && "contract with SplitRangeCheckCondition!");
+
+  const SCEVAddRecExpr *IndexAddRec = dyn_cast<SCEVAddRecExpr>(IndexSCEV);
+  bool IsAffineIndex =
+      IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine();
+
+  if (!IsAffineIndex)
+    return nullptr;
+
+  InductiveRangeCheck *IRC = new (A.Allocate()) InductiveRangeCheck;
+  IRC->Length = Length;
+  IRC->Offset = IndexAddRec->getStart();
+  IRC->Scale = IndexAddRec->getStepRecurrence(SE);
+  IRC->Branch = BI;
+  return IRC;
+}
+
+namespace {
+
+// Keeps track of the structure of a loop.  This is similar to llvm::Loop,
+// except that it is more lightweight and can track the state of a loop through
+// changing and potentially invalid IR.  This structure also formalizes the
+// kinds of loops we can deal with -- ones that have a single latch that is also
+// an exiting block *and* have a canonical induction variable.
+struct LoopStructure {
+  const char *Tag;
+
+  BasicBlock *Header;
+  BasicBlock *Latch;
+
+  // `Latch's terminator instruction is `LatchBr', and it's `LatchBrExitIdx'th
+  // successor is `LatchExit', the exit block of the loop.
+  BranchInst *LatchBr;
+  BasicBlock *LatchExit;
+  unsigned LatchBrExitIdx;
+
+  Value *IndVarNext;
+  Value *IndVarStart;
+  Value *LoopExitAt;
+  bool IndVarIncreasing;
+
+  LoopStructure()
+      : Tag(""), Header(nullptr), Latch(nullptr), LatchBr(nullptr),
+        LatchExit(nullptr), LatchBrExitIdx(-1), IndVarNext(nullptr),
+        IndVarStart(nullptr), LoopExitAt(nullptr), IndVarIncreasing(false) {}
+
+  template <typename M> LoopStructure map(M Map) const {
+    LoopStructure Result;
+    Result.Tag = Tag;
+    Result.Header = cast<BasicBlock>(Map(Header));
+    Result.Latch = cast<BasicBlock>(Map(Latch));
+    Result.LatchBr = cast<BranchInst>(Map(LatchBr));
+    Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
+    Result.LatchBrExitIdx = LatchBrExitIdx;
+    Result.IndVarNext = Map(IndVarNext);
+    Result.IndVarStart = Map(IndVarStart);
+    Result.LoopExitAt = Map(LoopExitAt);
+    Result.IndVarIncreasing = IndVarIncreasing;
+    return Result;
+  }
+
+  static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &,
+                                                    BranchProbabilityInfo &BPI,
+                                                    Loop &,
+                                                    const char *&);
+};
+
+/// This class is used to constrain loops to run within a given iteration space.
+/// The algorithm this class implements is given a Loop and a range [Begin,
+/// End).  The algorithm then tries to break out a "main loop" out of the loop
+/// it is given in a way that the "main loop" runs with the induction variable
+/// in a subset of [Begin, End).  The algorithm emits appropriate pre and post
+/// loops to run any remaining iterations.  The pre loop runs any iterations in
+/// which the induction variable is < Begin, and the post loop runs any
+/// iterations in which the induction variable is >= End.
+///
+class LoopConstrainer {
+  // The representation of a clone of the original loop we started out with.
+  struct ClonedLoop {
+    // The cloned blocks
+    std::vector<BasicBlock *> Blocks;
+
+    // `Map` maps values in the clonee into values in the cloned version
+    ValueToValueMapTy Map;
+
+    // An instance of `LoopStructure` for the cloned loop
+    LoopStructure Structure;
+  };
+
+  // Result of rewriting the range of a loop.  See changeIterationSpaceEnd for
+  // more details on what these fields mean.
+  struct RewrittenRangeInfo {
+    BasicBlock *PseudoExit;
+    BasicBlock *ExitSelector;
+    std::vector<PHINode *> PHIValuesAtPseudoExit;
+    PHINode *IndVarEnd;
+
+    RewrittenRangeInfo()
+        : PseudoExit(nullptr), ExitSelector(nullptr), IndVarEnd(nullptr) {}
+  };
+
+  // Calculated subranges we restrict the iteration space of the main loop to.
+  // See the implementation of `calculateSubRanges' for more details on how
+  // these fields are computed.  `LowLimit` is None if there is no restriction
+  // on low end of the restricted iteration space of the main loop.  `HighLimit`
+  // is None if there is no restriction on high end of the restricted iteration
+  // space of the main loop.
+
+  struct SubRanges {
+    Optional<const SCEV *> LowLimit;
+    Optional<const SCEV *> HighLimit;
+  };
+
+  // A utility function that does a `replaceUsesOfWith' on the incoming block
+  // set of a `PHINode' -- replaces instances of `Block' in the `PHINode's
+  // incoming block list with `ReplaceBy'.
+  static void replacePHIBlock(PHINode *PN, BasicBlock *Block,
+                              BasicBlock *ReplaceBy);
+
+  // Compute a safe set of limits for the main loop to run in -- effectively the
+  // intersection of `Range' and the iteration space of the original loop.
+  // Return None if unable to compute the set of subranges.
+  //
+  Optional<SubRanges> calculateSubRanges() const;
+
+  // Clone `OriginalLoop' and return the result in CLResult.  The IR after
+  // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
+  // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
+  // but there is no such edge.
+  //
+  void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
+
+  // Rewrite the iteration space of the loop denoted by (LS, Preheader). The
+  // iteration space of the rewritten loop ends at ExitLoopAt.  The start of the
+  // iteration space is not changed.  `ExitLoopAt' is assumed to be slt
+  // `OriginalHeaderCount'.
+  //
+  // If there are iterations left to execute, control is made to jump to
+  // `ContinuationBlock', otherwise they take the normal loop exit.  The
+  // returned `RewrittenRangeInfo' object is populated as follows:
+  //
+  //  .PseudoExit is a basic block that unconditionally branches to
+  //      `ContinuationBlock'.
+  //
+  //  .ExitSelector is a basic block that decides, on exit from the loop,
+  //      whether to branch to the "true" exit or to `PseudoExit'.
+  //
+  //  .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value
+  //      for each PHINode in the loop header on taking the pseudo exit.
+  //
+  // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
+  // preheader because it is made to branch to the loop header only
+  // conditionally.
+  //
+  RewrittenRangeInfo
+  changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
+                          Value *ExitLoopAt,
+                          BasicBlock *ContinuationBlock) const;
+
+  // The loop denoted by `LS' has `OldPreheader' as its preheader.  This
+  // function creates a new preheader for `LS' and returns it.
+  //
+  BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
+                              const char *Tag) const;
+
+  // `ContinuationBlockAndPreheader' was the continuation block for some call to
+  // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'.
+  // This function rewrites the PHI nodes in `LS.Header' to start with the
+  // correct value.
+  void rewriteIncomingValuesForPHIs(
+      LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
+      const LoopConstrainer::RewrittenRangeInfo &RRI) const;
+
+  // Even though we do not preserve any passes at this time, we at least need to
+  // keep the parent loop structure consistent.  The `LPPassManager' seems to
+  // verify this after running a loop pass.  This function adds the list of
+  // blocks denoted by BBs to this loops parent loop if required.
+  void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
+
+  // Some global state.
+  Function &F;
+  LLVMContext &Ctx;
+  ScalarEvolution &SE;
+
+  // Information about the original loop we started out with.
+  Loop &OriginalLoop;
+  LoopInfo &OriginalLoopInfo;
+  const SCEV *LatchTakenCount;
+  BasicBlock *OriginalPreheader;
+
+  // The preheader of the main loop.  This may or may not be different from
+  // `OriginalPreheader'.
+  BasicBlock *MainLoopPreheader;
+
+  // The range we need to run the main loop in.
+  InductiveRangeCheck::Range Range;
+
+  // The structure of the main loop (see comment at the beginning of this class
+  // for a definition)
+  LoopStructure MainLoopStructure;
+
+public:
+  LoopConstrainer(Loop &L, LoopInfo &LI, const LoopStructure &LS,
+                  ScalarEvolution &SE, InductiveRangeCheck::Range R)
+      : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
+        SE(SE), OriginalLoop(L), OriginalLoopInfo(LI), LatchTakenCount(nullptr),
+        OriginalPreheader(nullptr), MainLoopPreheader(nullptr), Range(R),
+        MainLoopStructure(LS) {}
+
+  // Entry point for the algorithm.  Returns true on success.
+  bool run();
+};
+
+}
+
+void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block,
+                                      BasicBlock *ReplaceBy) {
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingBlock(i) == Block)
+      PN->setIncomingBlock(i, ReplaceBy);
+}
+
+static bool CanBeSMax(ScalarEvolution &SE, const SCEV *S) {
+  APInt SMax =
+      APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth());
+  return SE.getSignedRange(S).contains(SMax) &&
+         SE.getUnsignedRange(S).contains(SMax);
+}
+
+static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) {
+  APInt SMin =
+      APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth());
+  return SE.getSignedRange(S).contains(SMin) &&
+         SE.getUnsignedRange(S).contains(SMin);
+}
+
+Optional<LoopStructure>
+LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+                                  Loop &L, const char *&FailureReason) {
+  assert(L.isLoopSimplifyForm() && "should follow from addRequired<>");
+
+  BasicBlock *Latch = L.getLoopLatch();
+  if (!L.isLoopExiting(Latch)) {
+    FailureReason = "no loop latch";
+    return None;
+  }
+
+  BasicBlock *Header = L.getHeader();
+  BasicBlock *Preheader = L.getLoopPreheader();
+  if (!Preheader) {
+    FailureReason = "no preheader";
+    return None;
+  }
+
+  BranchInst *LatchBr = dyn_cast<BranchInst>(&*Latch->rbegin());
+  if (!LatchBr || LatchBr->isUnconditional()) {
+    FailureReason = "latch terminator not conditional branch";
+    return None;
+  }
+
+  unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
+
+  BranchProbability ExitProbability =
+    BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx);
+
+  if (ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
+    FailureReason = "short running loop, not profitable";
+    return None;
+  }
+
+  ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition());
+  if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) {
+    FailureReason = "latch terminator branch not conditional on integral icmp";
+    return None;
+  }
+
+  const SCEV *LatchCount = SE.getExitCount(&L, Latch);
+  if (isa<SCEVCouldNotCompute>(LatchCount)) {
+    FailureReason = "could not compute latch count";
+    return None;
+  }
+
+  ICmpInst::Predicate Pred = ICI->getPredicate();
+  Value *LeftValue = ICI->getOperand(0);
+  const SCEV *LeftSCEV = SE.getSCEV(LeftValue);
+  IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType());
+
+  Value *RightValue = ICI->getOperand(1);
+  const SCEV *RightSCEV = SE.getSCEV(RightValue);
+
+  // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence.
+  if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
+    if (isa<SCEVAddRecExpr>(RightSCEV)) {
+      std::swap(LeftSCEV, RightSCEV);
+      std::swap(LeftValue, RightValue);
+      Pred = ICmpInst::getSwappedPredicate(Pred);
+    } else {
+      FailureReason = "no add recurrences in the icmp";
+      return None;
+    }
+  }
+
+  auto IsInductionVar = [&SE](const SCEVAddRecExpr *AR, bool &IsIncreasing) {
+    if (!AR->isAffine())
+      return false;
+
+    IntegerType *Ty = cast<IntegerType>(AR->getType());
+    IntegerType *WideTy =
+        IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2);
+
+    // Currently we only work with induction variables that have been proved to
+    // not wrap.  This restriction can potentially be lifted in the future.
+
+    const SCEVAddRecExpr *ExtendAfterOp =
+        dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+    if (!ExtendAfterOp)
+      return false;
+
+    const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
+    const SCEV *ExtendedStep =
+        SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
+
+    bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
+                        ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
+
+    if (!NoSignedWrap)
+      return false;
+
+    if (const SCEVConstant *StepExpr =
+            dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) {
+      ConstantInt *StepCI = StepExpr->getValue();
+      if (StepCI->isOne() || StepCI->isMinusOne()) {
+        IsIncreasing = StepCI->isOne();
+        return true;
+      }
+    }
+
+    return false;
+  };
+
+  // `ICI` is interpreted as taking the backedge if the *next* value of the
+  // induction variable satisfies some constraint.
+
+  const SCEVAddRecExpr *IndVarNext = cast<SCEVAddRecExpr>(LeftSCEV);
+  bool IsIncreasing = false;
+  if (!IsInductionVar(IndVarNext, IsIncreasing)) {
+    FailureReason = "LHS in icmp not induction variable";
+    return None;
+  }
+
+  ConstantInt *One = ConstantInt::get(IndVarTy, 1);
+  // TODO: generalize the predicates here to also match their unsigned variants.
+  if (IsIncreasing) {
+    bool FoundExpectedPred =
+        (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) ||
+        (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0);
+
+    if (!FoundExpectedPred) {
+      FailureReason = "expected icmp slt semantically, found something else";
+      return None;
+    }
+
+    if (LatchBrExitIdx == 0) {
+      if (CanBeSMax(SE, RightSCEV)) {
+        // TODO: this restriction is easily removable -- we just have to
+        // remember that the icmp was an slt and not an sle.
+        FailureReason = "limit may overflow when coercing sle to slt";
+        return None;
+      }
+
+      IRBuilder<> B(&*Preheader->rbegin());
+      RightValue = B.CreateAdd(RightValue, One);
+    }
+
+  } else {
+    bool FoundExpectedPred =
+        (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
+        (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0);
+
+    if (!FoundExpectedPred) {
+      FailureReason = "expected icmp sgt semantically, found something else";
+      return None;
+    }
+
+    if (LatchBrExitIdx == 0) {
+      if (CanBeSMin(SE, RightSCEV)) {
+        // TODO: this restriction is easily removable -- we just have to
+        // remember that the icmp was an sgt and not an sge.
+        FailureReason = "limit may overflow when coercing sge to sgt";
+        return None;
+      }
+
+      IRBuilder<> B(&*Preheader->rbegin());
+      RightValue = B.CreateSub(RightValue, One);
+    }
+  }
+
+  const SCEV *StartNext = IndVarNext->getStart();
+  const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+  const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+
+  BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
+
+  assert(SE.getLoopDisposition(LatchCount, &L) ==
+             ScalarEvolution::LoopInvariant &&
+         "loop variant exit count doesn't make sense!");
+
+  assert(!L.contains(LatchExit) && "expected an exit block!");
+
+  Value *IndVarStartV = SCEVExpander(SE, "irce").expandCodeFor(
+      IndVarStart, IndVarTy, &*Preheader->rbegin());
+  IndVarStartV->setName("indvar.start");
+
+  LoopStructure Result;
+
+  Result.Tag = "main";
+  Result.Header = Header;
+  Result.Latch = Latch;
+  Result.LatchBr = LatchBr;
+  Result.LatchExit = LatchExit;
+  Result.LatchBrExitIdx = LatchBrExitIdx;
+  Result.IndVarStart = IndVarStartV;
+  Result.IndVarNext = LeftValue;
+  Result.IndVarIncreasing = IsIncreasing;
+  Result.LoopExitAt = RightValue;
+
+  FailureReason = nullptr;
+
+  return Result;
+}
+
+Optional<LoopConstrainer::SubRanges>
+LoopConstrainer::calculateSubRanges() const {
+  IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
+
+  if (Range.getType() != Ty)
+    return None;
+
+  LoopConstrainer::SubRanges Result;
+
+  // I think we can be more aggressive here and make this nuw / nsw if the
+  // addition that feeds into the icmp for the latch's terminating branch is nuw
+  // / nsw.  In any case, a wrapping 2's complement addition is safe.
+  ConstantInt *One = ConstantInt::get(Ty, 1);
+  const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart);
+  const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt);
+
+  bool Increasing = MainLoopStructure.IndVarIncreasing;
+  // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the
+  // range of values the induction variable takes.
+  const SCEV *Smallest =
+      Increasing ? Start : SE.getAddExpr(End, SE.getSCEV(One));
+  const SCEV *Greatest =
+      Increasing ? End : SE.getAddExpr(Start, SE.getSCEV(One));
+
+  auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
+    return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S));
+  };
+
+  // In some cases we can prove that we don't need a pre or post loop
+
+  bool ProvablyNoPreloop =
+      SE.isKnownPredicate(ICmpInst::ICMP_SLE, Range.getBegin(), Smallest);
+  if (!ProvablyNoPreloop)
+    Result.LowLimit = Clamp(Range.getBegin());
+
+  bool ProvablyNoPostLoop =
+      SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd());
+  if (!ProvablyNoPostLoop)
+    Result.HighLimit = Clamp(Range.getEnd());
+
+  return Result;
+}
+
+void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
+                                const char *Tag) const {
+  for (BasicBlock *BB : OriginalLoop.getBlocks()) {
+    BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F);
+    Result.Blocks.push_back(Clone);
+    Result.Map[BB] = Clone;
+  }
+
+  auto GetClonedValue = [&Result](Value *V) {
+    assert(V && "null values not in domain!");
+    auto It = Result.Map.find(V);
+    if (It == Result.Map.end())
+      return V;
+    return static_cast<Value *>(It->second);
+  };
+
+  Result.Structure = MainLoopStructure.map(GetClonedValue);
+  Result.Structure.Tag = Tag;
+
+  for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) {
+    BasicBlock *ClonedBB = Result.Blocks[i];
+    BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i];
+
+    assert(Result.Map[OriginalBB] == ClonedBB && "invariant!");
+
+    for (Instruction &I : *ClonedBB)
+      RemapInstruction(&I, Result.Map,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+    // Exit blocks will now have one more predecessor and their PHI nodes need
+    // to be edited to reflect that.  No phi nodes need to be introduced because
+    // the loop is in LCSSA.
+
+    for (auto SBBI = succ_begin(OriginalBB), SBBE = succ_end(OriginalBB);
+         SBBI != SBBE; ++SBBI) {
+
+      if (OriginalLoop.contains(*SBBI))
+        continue; // not an exit block
+
+      for (Instruction &I : **SBBI) {
+        if (!isa<PHINode>(&I))
+          break;
+
+        PHINode *PN = cast<PHINode>(&I);
+        Value *OldIncoming = PN->getIncomingValueForBlock(OriginalBB);
+        PN->addIncoming(GetClonedValue(OldIncoming), ClonedBB);
+      }
+    }
+  }
+}
+
+LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
+    const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt,
+    BasicBlock *ContinuationBlock) const {
+
+  // We start with a loop with a single latch:
+  //
+  //    +--------------------+
+  //    |                    |
+  //    |     preheader      |
+  //    |                    |
+  //    +--------+-----------+
+  //             |      ----------------\
+  //             |     /                |
+  //    +--------v----v------+          |
+  //    |                    |          |
+  //    |      header        |          |
+  //    |                    |          |
+  //    +--------------------+          |
+  //                                    |
+  //            .....                   |
+  //                                    |
+  //    +--------------------+          |
+  //    |                    |          |
+  //    |       latch        >----------/
+  //    |                    |
+  //    +-------v------------+
+  //            |
+  //            |
+  //            |   +--------------------+
+  //            |   |                    |
+  //            +--->   original exit    |
+  //                |                    |
+  //                +--------------------+
+  //
+  // We change the control flow to look like
+  //
+  //
+  //    +--------------------+
+  //    |                    |
+  //    |     preheader      >-------------------------+
+  //    |                    |                         |
+  //    +--------v-----------+                         |
+  //             |    /-------------+                  |
+  //             |   /              |                  |
+  //    +--------v--v--------+      |                  |
+  //    |                    |      |                  |
+  //    |      header        |      |   +--------+     |
+  //    |                    |      |   |        |     |
+  //    +--------------------+      |   |  +-----v-----v-----------+
+  //                                |   |  |                       |
+  //                                |   |  |     .pseudo.exit      |
+  //                                |   |  |                       |
+  //                                |   |  +-----------v-----------+
+  //                                |   |              |
+  //            .....               |   |              |
+  //                                |   |     +--------v-------------+
+  //    +--------------------+      |   |     |                      |
+  //    |                    |      |   |     |   ContinuationBlock  |
+  //    |       latch        >------+   |     |                      |
+  //    |                    |          |     +----------------------+
+  //    +---------v----------+          |
+  //              |                     |
+  //              |                     |
+  //              |     +---------------^-----+
+  //              |     |                     |
+  //              +----->    .exit.selector   |
+  //                    |                     |
+  //                    +----------v----------+
+  //                               |
+  //     +--------------------+    |
+  //     |                    |    |
+  //     |   original exit    <----+
+  //     |                    |
+  //     +--------------------+
+  //
+
+  RewrittenRangeInfo RRI;
+
+  auto BBInsertLocation = std::next(Function::iterator(LS.Latch));
+  RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector",
+                                        &F, BBInsertLocation);
+  RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
+                                      BBInsertLocation);
+
+  BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin());
+  bool Increasing = LS.IndVarIncreasing;
+
+  IRBuilder<> B(PreheaderJump);
+
+  // EnterLoopCond - is it okay to start executing this `LS'?
+  Value *EnterLoopCond = Increasing
+                             ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt)
+                             : B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt);
+
+  B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit);
+  PreheaderJump->eraseFromParent();
+
+  LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector);
+  B.SetInsertPoint(LS.LatchBr);
+  Value *TakeBackedgeLoopCond =
+      Increasing ? B.CreateICmpSLT(LS.IndVarNext, ExitSubloopAt)
+                 : B.CreateICmpSGT(LS.IndVarNext, ExitSubloopAt);
+  Value *CondForBranch = LS.LatchBrExitIdx == 1
+                             ? TakeBackedgeLoopCond
+                             : B.CreateNot(TakeBackedgeLoopCond);
+
+  LS.LatchBr->setCondition(CondForBranch);
+
+  B.SetInsertPoint(RRI.ExitSelector);
+
+  // IterationsLeft - are there any more iterations left, given the original
+  // upper bound on the induction variable?  If not, we branch to the "real"
+  // exit.
+  Value *IterationsLeft = Increasing
+                              ? B.CreateICmpSLT(LS.IndVarNext, LS.LoopExitAt)
+                              : B.CreateICmpSGT(LS.IndVarNext, LS.LoopExitAt);
+  B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
+
+  BranchInst *BranchToContinuation =
+      BranchInst::Create(ContinuationBlock, RRI.PseudoExit);
+
+  // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
+  // each of the PHI nodes in the loop header.  This feeds into the initial
+  // value of the same PHI nodes if/when we continue execution.
+  for (Instruction &I : *LS.Header) {
+    if (!isa<PHINode>(&I))
+      break;
+
+    PHINode *PN = cast<PHINode>(&I);
+
+    PHINode *NewPHI = PHINode::Create(PN->getType(), 2, PN->getName() + ".copy",
+                                      BranchToContinuation);
+
+    NewPHI->addIncoming(PN->getIncomingValueForBlock(Preheader), Preheader);
+    NewPHI->addIncoming(PN->getIncomingValueForBlock(LS.Latch),
+                        RRI.ExitSelector);
+    RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
+  }
+
+  RRI.IndVarEnd = PHINode::Create(LS.IndVarNext->getType(), 2, "indvar.end",
+                                  BranchToContinuation);
+  RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader);
+  RRI.IndVarEnd->addIncoming(LS.IndVarNext, RRI.ExitSelector);
+
+  // The latch exit now has a branch from `RRI.ExitSelector' instead of
+  // `LS.Latch'.  The PHI nodes need to be updated to reflect that.
+  for (Instruction &I : *LS.LatchExit) {
+    if (PHINode *PN = dyn_cast<PHINode>(&I))
+      replacePHIBlock(PN, LS.Latch, RRI.ExitSelector);
+    else
+      break;
+  }
+
+  return RRI;
+}
+
+void LoopConstrainer::rewriteIncomingValuesForPHIs(
+    LoopStructure &LS, BasicBlock *ContinuationBlock,
+    const LoopConstrainer::RewrittenRangeInfo &RRI) const {
+
+  unsigned PHIIndex = 0;
+  for (Instruction &I : *LS.Header) {
+    if (!isa<PHINode>(&I))
+      break;
+
+    PHINode *PN = cast<PHINode>(&I);
+
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
+      if (PN->getIncomingBlock(i) == ContinuationBlock)
+        PN->setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]);
+  }
+
+  LS.IndVarStart = RRI.IndVarEnd;
+}
+
+BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
+                                             BasicBlock *OldPreheader,
+                                             const char *Tag) const {
+
+  BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
+  BranchInst::Create(LS.Header, Preheader);
+
+  for (Instruction &I : *LS.Header) {
+    if (!isa<PHINode>(&I))
+      break;
+
+    PHINode *PN = cast<PHINode>(&I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
+      replacePHIBlock(PN, OldPreheader, Preheader);
+  }
+
+  return Preheader;
+}
+
+void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
+  Loop *ParentLoop = OriginalLoop.getParentLoop();
+  if (!ParentLoop)
+    return;
+
+  for (BasicBlock *BB : BBs)
+    ParentLoop->addBasicBlockToLoop(BB, OriginalLoopInfo);
+}
+
+bool LoopConstrainer::run() {
+  BasicBlock *Preheader = nullptr;
+  LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
+  Preheader = OriginalLoop.getLoopPreheader();
+  assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
+         "preconditions!");
+
+  OriginalPreheader = Preheader;
+  MainLoopPreheader = Preheader;
+
+  Optional<SubRanges> MaybeSR = calculateSubRanges();
+  if (!MaybeSR.hasValue()) {
+    DEBUG(dbgs() << "irce: could not compute subranges\n");
+    return false;
+  }
+
+  SubRanges SR = MaybeSR.getValue();
+  bool Increasing = MainLoopStructure.IndVarIncreasing;
+  IntegerType *IVTy =
+      cast<IntegerType>(MainLoopStructure.IndVarNext->getType());
+
+  SCEVExpander Expander(SE, "irce");
+  Instruction *InsertPt = OriginalPreheader->getTerminator();
+
+  // It would have been better to make `PreLoop' and `PostLoop'
+  // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy
+  // constructor.
+  ClonedLoop PreLoop, PostLoop;
+  bool NeedsPreLoop =
+      Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue();
+  bool NeedsPostLoop =
+      Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue();
+
+  Value *ExitPreLoopAt = nullptr;
+  Value *ExitMainLoopAt = nullptr;
+  const SCEVConstant *MinusOneS =
+      cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */));
+
+  if (NeedsPreLoop) {
+    const SCEV *ExitPreLoopAtSCEV = nullptr;
+
+    if (Increasing)
+      ExitPreLoopAtSCEV = *SR.LowLimit;
+    else {
+      if (CanBeSMin(SE, *SR.HighLimit)) {
+        DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+                     << "preloop exit limit.  HighLimit = " << *(*SR.HighLimit)
+                     << "\n");
+        return false;
+      }
+      ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+    }
+
+    ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt);
+    ExitPreLoopAt->setName("exit.preloop.at");
+  }
+
+  if (NeedsPostLoop) {
+    const SCEV *ExitMainLoopAtSCEV = nullptr;
+
+    if (Increasing)
+      ExitMainLoopAtSCEV = *SR.HighLimit;
+    else {
+      if (CanBeSMin(SE, *SR.LowLimit)) {
+        DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+                     << "mainloop exit limit.  LowLimit = " << *(*SR.LowLimit)
+                     << "\n");
+        return false;
+      }
+      ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+    }
+
+    ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt);
+    ExitMainLoopAt->setName("exit.mainloop.at");
+  }
+
+  // We clone these ahead of time so that we don't have to deal with changing
+  // and temporarily invalid IR as we transform the loops.
+  if (NeedsPreLoop)
+    cloneLoop(PreLoop, "preloop");
+  if (NeedsPostLoop)
+    cloneLoop(PostLoop, "postloop");
+
+  RewrittenRangeInfo PreLoopRRI;
+
+  if (NeedsPreLoop) {
+    Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header,
+                                                  PreLoop.Structure.Header);
+
+    MainLoopPreheader =
+        createPreheader(MainLoopStructure, Preheader, "mainloop");
+    PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader,
+                                         ExitPreLoopAt, MainLoopPreheader);
+    rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader,
+                                 PreLoopRRI);
+  }
+
+  BasicBlock *PostLoopPreheader = nullptr;
+  RewrittenRangeInfo PostLoopRRI;
+
+  if (NeedsPostLoop) {
+    PostLoopPreheader =
+        createPreheader(PostLoop.Structure, Preheader, "postloop");
+    PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader,
+                                          ExitMainLoopAt, PostLoopPreheader);
+    rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader,
+                                 PostLoopRRI);
+  }
+
+  BasicBlock *NewMainLoopPreheader =
+      MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr;
+  BasicBlock *NewBlocks[] = {PostLoopPreheader,        PreLoopRRI.PseudoExit,
+                             PreLoopRRI.ExitSelector,  PostLoopRRI.PseudoExit,
+                             PostLoopRRI.ExitSelector, NewMainLoopPreheader};
+
+  // Some of the above may be nullptr, filter them out before passing to
+  // addToParentLoopIfNeeded.
+  auto NewBlocksEnd =
+      std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr);
+
+  addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd));
+  addToParentLoopIfNeeded(PreLoop.Blocks);
+  addToParentLoopIfNeeded(PostLoop.Blocks);
+
+  return true;
+}
+
+/// Computes and returns a range of values for the induction variable (IndVar)
+/// in which the range check can be safely elided.  If it cannot compute such a
+/// range, returns None.
+Optional<InductiveRangeCheck::Range>
+InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE,
+                                               const SCEVAddRecExpr *IndVar,
+                                               IRBuilder<> &) const {
+  // IndVar is of the form "A + B * I" (where "I" is the canonical induction
+  // variable, that may or may not exist as a real llvm::Value in the loop) and
+  // this inductive range check is a range check on the "C + D * I" ("C" is
+  // getOffset() and "D" is getScale()).  We rewrite the value being range
+  // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA".
+  // Currently we support this only for "B" = "D" = { 1 or -1 }, but the code
+  // can be generalized as needed.
+  //
+  // The actual inequalities we solve are of the form
+  //
+  //   0 <= M + 1 * IndVar < L given L >= 0  (i.e. N == 1)
+  //
+  // The inequality is satisfied by -M <= IndVar < (L - M) [^1].  All additions
+  // and subtractions are twos-complement wrapping and comparisons are signed.
+  //
+  // Proof:
+  //
+  //   If there exists IndVar such that -M <= IndVar < (L - M) then it follows
+  //   that -M <= (-M + L) [== Eq. 1].  Since L >= 0, if (-M + L) sign-overflows
+  //   then (-M + L) < (-M).  Hence by [Eq. 1], (-M + L) could not have
+  //   overflown.
+  //
+  //   This means IndVar = t + (-M) for t in [0, L).  Hence (IndVar + M) = t.
+  //   Hence 0 <= (IndVar + M) < L
+
+  // [^1]: Note that the solution does _not_ apply if L < 0; consider values M =
+  // 127, IndVar = 126 and L = -2 in an i8 world.
+
+  if (!IndVar->isAffine())
+    return None;
+
+  const SCEV *A = IndVar->getStart();
+  const SCEVConstant *B = dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE));
+  if (!B)
+    return None;
+
+  const SCEV *C = getOffset();
+  const SCEVConstant *D = dyn_cast<SCEVConstant>(getScale());
+  if (D != B)
+    return None;
+
+  ConstantInt *ConstD = D->getValue();
+  if (!(ConstD->isMinusOne() || ConstD->isOne()))
+    return None;
+
+  const SCEV *M = SE.getMinusSCEV(C, A);
+
+  const SCEV *Begin = SE.getNegativeSCEV(M);
+  const SCEV *End = SE.getMinusSCEV(SE.getSCEV(getLength()), M);
+
+  return InductiveRangeCheck::Range(Begin, End);
+}
+
+static Optional<InductiveRangeCheck::Range>
+IntersectRange(ScalarEvolution &SE,
+               const Optional<InductiveRangeCheck::Range> &R1,
+               const InductiveRangeCheck::Range &R2, IRBuilder<> &B) {
+  if (!R1.hasValue())
+    return R2;
+  auto &R1Value = R1.getValue();
+
+  // TODO: we could widen the smaller range and have this work; but for now we
+  // bail out to keep things simple.
+  if (R1Value.getType() != R2.getType())
+    return None;
+
+  const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin());
+  const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd());
+
+  return InductiveRangeCheck::Range(NewBegin, NewEnd);
+}
+
+bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (L->getBlocks().size() >= LoopSizeCutoff) {
+    DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";);
+    return false;
+  }
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader) {
+    DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+    return false;
+  }
+
+  LLVMContext &Context = Preheader->getContext();
+  InductiveRangeCheck::AllocatorTy IRCAlloc;
+  SmallVector<InductiveRangeCheck *, 16> RangeChecks;
+  ScalarEvolution &SE = getAnalysis<ScalarEvolution>();
+  BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>();
+
+  for (auto BBI : L->getBlocks())
+    if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
+      if (InductiveRangeCheck *IRC =
+          InductiveRangeCheck::create(IRCAlloc, TBI, L, SE, BPI))
+        RangeChecks.push_back(IRC);
+
+  if (RangeChecks.empty())
+    return false;
+
+  DEBUG(dbgs() << "irce: looking at loop "; L->print(dbgs());
+        dbgs() << "irce: loop has " << RangeChecks.size()
+               << " inductive range checks: \n";
+        for (InductiveRangeCheck *IRC : RangeChecks)
+          IRC->print(dbgs());
+    );
+
+  const char *FailureReason = nullptr;
+  Optional<LoopStructure> MaybeLoopStructure =
+      LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason);
+  if (!MaybeLoopStructure.hasValue()) {
+    DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason
+                 << "\n";);
+    return false;
+  }
+  LoopStructure LS = MaybeLoopStructure.getValue();
+  bool Increasing = LS.IndVarIncreasing;
+  const SCEV *MinusOne =
+      SE.getConstant(LS.IndVarNext->getType(), Increasing ? -1 : 1, true);
+  const SCEVAddRecExpr *IndVar =
+      cast<SCEVAddRecExpr>(SE.getAddExpr(SE.getSCEV(LS.IndVarNext), MinusOne));
+
+  Optional<InductiveRangeCheck::Range> SafeIterRange;
+  Instruction *ExprInsertPt = Preheader->getTerminator();
+
+  SmallVector<InductiveRangeCheck *, 4> RangeChecksToEliminate;
+
+  IRBuilder<> B(ExprInsertPt);
+  for (InductiveRangeCheck *IRC : RangeChecks) {
+    auto Result = IRC->computeSafeIterationSpace(SE, IndVar, B);
+    if (Result.hasValue()) {
+      auto MaybeSafeIterRange =
+        IntersectRange(SE, SafeIterRange, Result.getValue(), B);
+      if (MaybeSafeIterRange.hasValue()) {
+        RangeChecksToEliminate.push_back(IRC);
+        SafeIterRange = MaybeSafeIterRange.getValue();
+      }
+    }
+  }
+
+  if (!SafeIterRange.hasValue())
+    return false;
+
+  LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LS,
+                     SE, SafeIterRange.getValue());
+  bool Changed = LC.run();
+
+  if (Changed) {
+    auto PrintConstrainedLoopInfo = [L]() {
+      dbgs() << "irce: in function ";
+      dbgs() << L->getHeader()->getParent()->getName() << ": ";
+      dbgs() << "constrained ";
+      L->print(dbgs());
+    };
+
+    DEBUG(PrintConstrainedLoopInfo());
+
+    if (PrintChangedLoops)
+      PrintConstrainedLoopInfo();
+
+    // Optimize away the now-redundant range checks.
+
+    for (InductiveRangeCheck *IRC : RangeChecksToEliminate) {
+      ConstantInt *FoldedRangeCheck = IRC->getPassingDirection()
+                                          ? ConstantInt::getTrue(Context)
+                                          : ConstantInt::getFalse(Context);
+      IRC->getBranch()->setCondition(FoldedRangeCheck);
+    }
+  }
+
+  return Changed;
+}
+
+Pass *llvm::createInductiveRangeCheckEliminationPass() {
+  return new InductiveRangeCheckElimination;
+}
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 60a4925..8b54abd 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -32,7 +32,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -115,7 +115,7 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<LazyValueInfo>();
       AU.addPreserved<LazyValueInfo>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
 
     void FindLoopHeaders(Function &F);
@@ -145,7 +145,7 @@ char JumpThreading::ID = 0;
 INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
                 "Jump Threading", false, false)
 INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(JumpThreading, "jump-threading",
                 "Jump Threading", false, false)
 
@@ -161,7 +161,7 @@ bool JumpThreading::runOnFunction(Function &F) {
   DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   LVI = &getAnalysis<LazyValueInfo>();
 
   // Remove unreachable blocks from function as they may result in infinite
@@ -188,7 +188,7 @@ bool JumpThreading::runOnFunction(Function &F) {
 
       // If the block is trivially dead, zap it.  This eliminates the successor
       // edges which simplifies the CFG.
-      if (pred_begin(BB) == pred_end(BB) &&
+      if (pred_empty(BB) &&
           BB != &BB->getParent()->getEntryBlock()) {
         DEBUG(dbgs() << "  JT: Deleting dead block '" << BB->getName()
               << "' with terminator: " << *BB->getTerminator() << '\n');
@@ -662,7 +662,7 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) {
 bool JumpThreading::ProcessBlock(BasicBlock *BB) {
   // If the block is trivially dead, just return and let the caller nuke it.
   // This simplifies other transformations.
-  if (pred_begin(BB) == pred_end(BB) &&
+  if (pred_empty(BB) &&
       BB != &BB->getParent()->getEntryBlock())
     return false;
 
@@ -797,7 +797,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
       }
 
     } else if (CondBr && CondConst && CondBr->isConditional()) {
-      // There might be an invairant in the same block with the conditional
+      // There might be an invariant in the same block with the conditional
       // that can determine the predicate.
 
       LazyValueInfo::Tristate Ret =
@@ -902,8 +902,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
     // only happen in dead loops.
     if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
     if (AvailableVal->getType() != LI->getType())
-      AvailableVal = CastInst::Create(CastInst::BitCast, AvailableVal,
-                                      LI->getType(), "", LI);
+      AvailableVal =
+          CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI);
     LI->replaceAllUsesWith(AvailableVal);
     LI->eraseFromParent();
     return true;
@@ -993,7 +993,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
 
     // Split them out to their own block.
     UnavailablePred =
-      SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split", this);
+      SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split");
   }
 
   // If the value isn't available in all predecessors, then there will be
@@ -1040,8 +1040,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
     // predecessor use the same bitcast.
     Value *&PredV = I->second;
     if (PredV->getType() != LI->getType())
-      PredV = CastInst::Create(CastInst::BitCast, PredV, LI->getType(), "",
-                               P->getTerminator());
+      PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "",
+                                               P->getTerminator());
 
     PN->addIncoming(PredV, I->first);
   }
@@ -1418,7 +1418,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
   else {
     DEBUG(dbgs() << "  Factoring out " << PredBBs.size()
           << " common predecessors.\n");
-    PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this);
+    PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm");
   }
 
   // And finally, do it!
@@ -1561,7 +1561,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
   else {
     DEBUG(dbgs() << "  Factoring out " << PredBBs.size()
           << " common predecessors.\n");
-    PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this);
+    PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm");
   }
 
   // Okay, we decided to do this!  Clone all the instructions in BB onto the end
@@ -1575,7 +1575,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
   BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
 
   if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
-    PredBB = SplitEdge(PredBB, BB, this);
+    PredBB = SplitEdge(PredBB, BB);
     OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
   }
 
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 5f00bb9..14af38b 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -52,7 +52,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -71,6 +71,27 @@ static cl::opt<bool>
 DisablePromotion("disable-licm-promotion", cl::Hidden,
                  cl::desc("Disable memory promotion in LICM pass"));
 
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
+static bool isNotUsedInLoop(Instruction &I, Loop *CurLoop);
+static bool hoist(Instruction &I, BasicBlock *Preheader);
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, 
+                 Loop *CurLoop, AliasSetTracker *CurAST );
+static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT, 
+                                  Loop *CurLoop, LICMSafetyInfo * SafetyInfo); 
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,DominatorTree *DT, 
+                                           const DataLayout *DL, Loop *CurLoop,
+                                           LICMSafetyInfo * SafetyInfo);
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+                                     const AAMDNodes &AAInfo, 
+                                     AliasSetTracker *CurAST);
+static Instruction *CloneInstructionInExitBlock(Instruction &I,
+                                                BasicBlock &ExitBlock,
+                                                PHINode &PN, LoopInfo *LI);
+static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, 
+                               DominatorTree *DT, const DataLayout *DL, 
+                               Loop *CurLoop, AliasSetTracker *CurAST, 
+                               LICMSafetyInfo * SafetyInfo);
+
 namespace {
   struct LICM : public LoopPass {
     static char ID; // Pass identification, replacement for typeid
@@ -86,7 +107,7 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
       AU.addRequiredID(LCSSAID);
@@ -94,7 +115,7 @@ namespace {
       AU.addRequired<AliasAnalysis>();
       AU.addPreserved<AliasAnalysis>();
       AU.addPreserved<ScalarEvolution>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
 
     using llvm::Pass::doFinalization;
@@ -117,9 +138,6 @@ namespace {
     BasicBlock *Preheader;   // The preheader block of the current loop...
     Loop *CurLoop;           // The current loop we are working on...
     AliasSetTracker *CurAST; // AliasSet information for the current loop...
-    bool MayThrow;           // The current loop contains an instruction which
-                             // may throw, thus preventing code motion of
-                             // instructions with side effects.
     DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;
 
     /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
@@ -132,88 +150,17 @@ namespace {
 
     /// Simple Analysis hook. Delete loop L from alias set map.
     void deleteAnalysisLoop(Loop *L) override;
-
-    /// SinkRegion - Walk the specified region of the CFG (defined by all blocks
-    /// dominated by the specified block, and that are in the current loop) in
-    /// reverse depth first order w.r.t the DominatorTree.  This allows us to
-    /// visit uses before definitions, allowing us to sink a loop body in one
-    /// pass without iteration.
-    ///
-    void SinkRegion(DomTreeNode *N);
-
-    /// HoistRegion - Walk the specified region of the CFG (defined by all
-    /// blocks dominated by the specified block, and that are in the current
-    /// loop) in depth first order w.r.t the DominatorTree.  This allows us to
-    /// visit definitions before uses, allowing us to hoist a loop body in one
-    /// pass without iteration.
-    ///
-    void HoistRegion(DomTreeNode *N);
-
-    /// inSubLoop - Little predicate that returns true if the specified basic
-    /// block is in a subloop of the current one, not the current one itself.
-    ///
-    bool inSubLoop(BasicBlock *BB) {
-      assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
-      return LI->getLoopFor(BB) != CurLoop;
-    }
-
-    /// sink - When an instruction is found to only be used outside of the loop,
-    /// this function moves it to the exit blocks and patches up SSA form as
-    /// needed.
-    ///
-    void sink(Instruction &I);
-
-    /// hoist - When an instruction is found to only use loop invariant operands
-    /// that is safe to hoist, this instruction is called to do the dirty work.
-    ///
-    void hoist(Instruction &I);
-
-    /// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it
-    /// is not a trapping instruction or if it is a trapping instruction and is
-    /// guaranteed to execute.
-    ///
-    bool isSafeToExecuteUnconditionally(Instruction &I);
-
-    /// isGuaranteedToExecute - Check that the instruction is guaranteed to
-    /// execute.
-    ///
-    bool isGuaranteedToExecute(Instruction &I);
-
-    /// pointerInvalidatedByLoop - Return true if the body of this loop may
-    /// store into the memory location pointed to by V.
-    ///
-    bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
-                                  const AAMDNodes &AAInfo) {
-      // Check to see if any of the basic blocks in CurLoop invalidate *V.
-      return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod();
-    }
-
-    bool canSinkOrHoistInst(Instruction &I);
-    bool isNotUsedInLoop(Instruction &I);
-
-    void PromoteAliasSet(AliasSet &AS,
-                         SmallVectorImpl<BasicBlock*> &ExitBlocks,
-                         SmallVectorImpl<Instruction*> &InsertPts,
-                         PredIteratorCache &PIC);
-
-    /// \brief Create a copy of the instruction in the exit block and patch up
-    /// SSA.
-    /// PN is a user of I in ExitBlock that can be used to get the number and
-    /// list of predecessors fast.
-    Instruction *CloneInstructionInExitBlock(Instruction &I,
-                                             BasicBlock &ExitBlock,
-                                             PHINode &PN);
   };
 }
 
 char LICM::ID = 0;
 INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false)
 
@@ -230,13 +177,13 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
   Changed = false;
 
   // Get our Loop and Alias Analysis information...
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   AA = &getAnalysis<AliasAnalysis>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
 
@@ -273,14 +220,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
       CurAST->add(*BB);                 // Incorporate the specified basic block
   }
 
-  MayThrow = false;
-  // TODO: We've already searched for instructions which may throw in subloops.
-  // We may want to reuse this information.
-  for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end();
-       (BB != BBE) && !MayThrow ; ++BB)
-    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
-         (I != E) && !MayThrow; ++I)
-      MayThrow |= I->mayThrow();
+  // Compute loop safety information.
+  LICMSafetyInfo SafetyInfo;
+  computeLICMSafetyInfo(&SafetyInfo, CurLoop);
 
   // We want to visit all of the instructions in this loop... that are not parts
   // of our subloops (they have already had their invariants hoisted out of
@@ -293,9 +235,11 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
   // instructions, we perform another pass to hoist them out of the loop.
   //
   if (L->hasDedicatedExits())
-    SinkRegion(DT->getNode(L->getHeader()));
+    Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI, 
+                          CurLoop, CurAST, &SafetyInfo);
   if (Preheader)
-    HoistRegion(DT->getNode(L->getHeader()));
+    Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI, 
+                           CurLoop, CurAST, &SafetyInfo);
 
   // Now that all loop invariants have been removed from the loop, promote any
   // memory references to scalars that we can.
@@ -307,7 +251,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
     // Loop over all of the alias sets in the tracker object.
     for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
          I != E; ++I)
-      PromoteAliasSet(*I, ExitBlocks, InsertPts, PIC);
+      Changed |= promoteLoopAccessesToScalars(*I, ExitBlocks, InsertPts, 
+                                              PIC, LI, DT, CurLoop, 
+                                              CurAST, &SafetyInfo);
 
     // Once we have promoted values across the loop body we have to recursively
     // reform LCSSA as any nested loop may now have values defined within the
@@ -316,7 +262,8 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
     // SSAUpdater strategy during promotion that was LCSSA aware and reformed
     // it as it went.
     if (Changed)
-      formLCSSARecursively(*L, *DT, getAnalysisIfAvailable<ScalarEvolution>());
+      formLCSSARecursively(*L, *DT, LI,
+                           getAnalysisIfAvailable<ScalarEvolution>());
   }
 
   // Check that neither this loop nor its parent have had LCSSA broken. LICM is
@@ -339,27 +286,36 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
   return Changed;
 }
 
-/// SinkRegion - Walk the specified region of the CFG (defined by all blocks
-/// dominated by the specified block, and that are in the current loop) in
-/// reverse depth first order w.r.t the DominatorTree.  This allows us to visit
-/// uses before definitions, allowing us to sink a loop body in one pass without
-/// iteration.
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in reverse depth 
+/// first order w.r.t the DominatorTree.  This allows us to visit uses before
+/// definitions, allowing us to sink a loop body in one pass without iteration.
 ///
-void LICM::SinkRegion(DomTreeNode *N) {
-  assert(N != nullptr && "Null dominator tree node?");
+bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, 
+                      DominatorTree *DT, const DataLayout *DL, 
+                      TargetLibraryInfo *TLI, Loop *CurLoop, 
+                      AliasSetTracker *CurAST, LICMSafetyInfo * SafetyInfo) { 
+
+  // Verify inputs.
+  assert(N != nullptr && AA != nullptr && LI != nullptr && 
+         DT != nullptr && CurLoop != nullptr && CurAST != nullptr && 
+         SafetyInfo != nullptr && "Unexpected input to sinkRegion");
+
+  // Set changed as false.
+  bool Changed = false;
+  // Get basic block
   BasicBlock *BB = N->getBlock();
-
   // If this subregion is not in the top level loop at all, exit.
-  if (!CurLoop->contains(BB)) return;
+  if (!CurLoop->contains(BB)) return Changed;
 
   // We are processing blocks in reverse dfo, so process children first.
   const std::vector<DomTreeNode*> &Children = N->getChildren();
   for (unsigned i = 0, e = Children.size(); i != e; ++i)
-    SinkRegion(Children[i]);
-
+    Changed |= sinkRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop, 
+                          CurAST, SafetyInfo);
   // Only need to process the contents of this block if it is not part of a
   // subloop (which would already have been processed).
-  if (inSubLoop(BB)) return;
+  if (inSubLoop(BB,CurLoop,LI)) return Changed;
 
   for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) {
     Instruction &I = *--II;
@@ -380,31 +336,39 @@ void LICM::SinkRegion(DomTreeNode *N) {
     // outside of the loop.  In this case, it doesn't even matter if the
     // operands of the instruction are loop invariant.
     //
-    if (isNotUsedInLoop(I) && canSinkOrHoistInst(I)) {
+    if (isNotUsedInLoop(I, CurLoop) && 
+        canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo)) {
       ++II;
-      sink(I);
+      Changed |= sink(I, LI, DT, CurLoop, CurAST);
     }
   }
+  return Changed;
 }
 
-/// HoistRegion - Walk the specified region of the CFG (defined by all blocks
-/// dominated by the specified block, and that are in the current loop) in depth
-/// first order w.r.t the DominatorTree.  This allows us to visit definitions
-/// before uses, allowing us to hoist a loop body in one pass without iteration.
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in depth first
+/// order w.r.t the DominatorTree.  This allows us to visit definitions before
+/// uses, allowing us to hoist a loop body in one pass without iteration.
 ///
-void LICM::HoistRegion(DomTreeNode *N) {
-  assert(N != nullptr && "Null dominator tree node?");
+bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, 
+                       DominatorTree *DT, const DataLayout *DL, 
+                       TargetLibraryInfo *TLI, Loop *CurLoop,
+                       AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { 
+  // Verify inputs.
+  assert(N != nullptr && AA != nullptr && LI != nullptr && 
+         DT != nullptr && CurLoop != nullptr && CurAST != nullptr && 
+         SafetyInfo != nullptr && "Unexpected input to hoistRegion");
+  // Set changed as false.
+  bool Changed = false;
+  // Get basic block
   BasicBlock *BB = N->getBlock();
-
   // If this subregion is not in the top level loop at all, exit.
-  if (!CurLoop->contains(BB)) return;
-
+  if (!CurLoop->contains(BB)) return Changed;
   // Only need to process the contents of this block if it is not part of a
   // subloop (which would already have been processed).
-  if (!inSubLoop(BB))
+  if (!inSubLoop(BB, CurLoop, LI))
     for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) {
       Instruction &I = *II++;
-
       // Try constant folding this instruction.  If all the operands are
       // constants, it is technically hoistable, but it would be better to just
       // fold it.
@@ -421,20 +385,49 @@ void LICM::HoistRegion(DomTreeNode *N) {
       // if all of the operands of the instruction are loop invariant and if it
       // is safe to hoist the instruction.
       //
-      if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I) &&
-          isSafeToExecuteUnconditionally(I))
-        hoist(I);
+      if (CurLoop->hasLoopInvariantOperands(&I) && 
+          canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo) &&
+          isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo))
+        Changed |= hoist(I, CurLoop->getLoopPreheader());
     }
 
   const std::vector<DomTreeNode*> &Children = N->getChildren();
   for (unsigned i = 0, e = Children.size(); i != e; ++i)
-    HoistRegion(Children[i]);
+    Changed |= hoistRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop, 
+                           CurAST, SafetyInfo); 
+  return Changed;
+}
+
+/// Computes loop safety information, checks loop body & header
+/// for the possiblity of may throw exception.
+///
+void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
+  assert(CurLoop != nullptr && "CurLoop cant be null");
+  BasicBlock *Header = CurLoop->getHeader();
+  // Setting default safety values.
+  SafetyInfo->MayThrow = false;
+  SafetyInfo->HeaderMayThrow = false;
+  // Iterate over header and compute dafety info.
+  for (BasicBlock::iterator I = Header->begin(), E = Header->end();
+       (I != E) && !SafetyInfo->HeaderMayThrow; ++I)
+    SafetyInfo->HeaderMayThrow |= I->mayThrow();
+  
+  SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
+  // Iterate over loop instructions and compute safety info. 
+  for (Loop::block_iterator BB = CurLoop->block_begin(), 
+       BBE = CurLoop->block_end(); (BB != BBE) && !SafetyInfo->MayThrow ; ++BB)
+    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
+         (I != E) && !SafetyInfo->MayThrow; ++I)
+      SafetyInfo->MayThrow |= I->mayThrow();
 }
 
 /// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
 /// instruction.
 ///
-bool LICM::canSinkOrHoistInst(Instruction &I) {
+bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, 
+                        DominatorTree *DT, const DataLayout *DL, 
+                        Loop *CurLoop, AliasSetTracker *CurAST, 
+                        LICMSafetyInfo * SafetyInfo) {
   // Loads have extra constraints we have to verify before we can hoist them.
   if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
     if (!LI->isUnordered())
@@ -455,7 +448,7 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
     AAMDNodes AAInfo;
     LI->getAAMetadata(AAInfo);
 
-    return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo);
+    return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST);
   } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
     // Don't sink or hoist dbg info; it's legal, but not useful.
     if (isa<DbgInfoIntrinsic>(I))
@@ -494,14 +487,14 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
       !isa<InsertValueInst>(I))
     return false;
 
-  return isSafeToExecuteUnconditionally(I);
+  return isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo);
 }
 
-/// \brief Returns true if a PHINode is a trivially replaceable with an
+/// Returns true if a PHINode is a trivially replaceable with an
 /// Instruction.
+/// This is true when all incoming values are that instruction.
+/// This pattern occurs most often with LCSSA PHI nodes.
 ///
-/// This is true when all incoming values are that instruction. This pattern
-/// occurs most often with LCSSA PHI nodes.
 static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) {
   for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
     if (PN.getIncomingValue(i) != &I)
@@ -510,11 +503,11 @@ static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) {
   return true;
 }
 
-/// isNotUsedInLoop - Return true if the only users of this instruction are
-/// outside of the loop.  If this is true, we can sink the instruction to the
-/// exit blocks of the loop.
+/// Return true if the only users of this instruction are outside of
+/// the loop. If this is true, we can sink the instruction to the exit
+/// blocks of the loop.
 ///
-bool LICM::isNotUsedInLoop(Instruction &I) {
+static bool isNotUsedInLoop(Instruction &I, Loop *CurLoop) {
   for (User *U : I.users()) {
     Instruction *UI = cast<Instruction>(U);
     if (PHINode *PN = dyn_cast<PHINode>(UI)) {
@@ -545,9 +538,9 @@ bool LICM::isNotUsedInLoop(Instruction &I) {
   return true;
 }
 
-Instruction *LICM::CloneInstructionInExitBlock(Instruction &I,
-                                               BasicBlock &ExitBlock,
-                                               PHINode &PN) {
+static Instruction *CloneInstructionInExitBlock(Instruction &I,
+                                                BasicBlock &ExitBlock,
+                                                PHINode &PN, LoopInfo *LI) {
   Instruction *New = I.clone();
   ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
   if (!I.getName().empty()) New->setName(I.getName() + ".le");
@@ -574,14 +567,15 @@ Instruction *LICM::CloneInstructionInExitBlock(Instruction &I,
   return New;
 }
 
-/// sink - When an instruction is found to only be used outside of the loop,
-/// this function moves it to the exit blocks and patches up SSA form as needed.
+/// When an instruction is found to only be used outside of the loop, this
+/// function moves it to the exit blocks and patches up SSA form as needed.
 /// This method is guaranteed to remove the original instruction from its
 /// position, and may either delete it or move it to outside of the loop.
 ///
-void LICM::sink(Instruction &I) {
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, 
+                 Loop *CurLoop, AliasSetTracker *CurAST ) {
   DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
-
+  bool Changed = false;
   if (isa<LoadInst>(I)) ++NumMovedLoads;
   else if (isa<CallInst>(I)) ++NumMovedCalls;
   ++NumSunk;
@@ -590,7 +584,8 @@ void LICM::sink(Instruction &I) {
 #ifndef NDEBUG
   SmallVector<BasicBlock *, 32> ExitBlocks;
   CurLoop->getUniqueExitBlocks(ExitBlocks);
-  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), 
+                                             ExitBlocks.end());
 #endif
 
   // Clones of this instruction. Don't create more than one per exit block!
@@ -618,7 +613,7 @@ void LICM::sink(Instruction &I) {
       New = It->second;
     else
       New = SunkCopies[ExitBlock] =
-          CloneInstructionInExitBlock(I, *ExitBlock, *PN);
+            CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI);
 
     PN->replaceAllUsesWith(New);
     PN->eraseFromParent();
@@ -626,44 +621,41 @@ void LICM::sink(Instruction &I) {
 
   CurAST->deleteValue(&I);
   I.eraseFromParent();
+  return Changed;
 }
 
-/// hoist - When an instruction is found to only use loop invariant operands
-/// that is safe to hoist, this instruction is called to do the dirty work.
+/// When an instruction is found to only use loop invariant operands that
+/// is safe to hoist, this instruction is called to do the dirty work.
 ///
-void LICM::hoist(Instruction &I) {
+static bool hoist(Instruction &I, BasicBlock *Preheader) {
   DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": "
         << I << "\n");
-
   // Move the new node to the Preheader, before its terminator.
   I.moveBefore(Preheader->getTerminator());
 
   if (isa<LoadInst>(I)) ++NumMovedLoads;
   else if (isa<CallInst>(I)) ++NumMovedCalls;
   ++NumHoisted;
-  Changed = true;
+  return true;
 }
 
-/// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it is
-/// not a trapping instruction or if it is a trapping instruction and is
-/// guaranteed to execute.
+/// Only sink or hoist an instruction if it is not a trapping instruction
+/// or if it is a trapping instruction and is guaranteed to execute.
 ///
-bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
+static bool isSafeToExecuteUnconditionally(Instruction &Inst, DominatorTree *DT,
+                                           const DataLayout *DL, Loop *CurLoop, 
+                                           LICMSafetyInfo * SafetyInfo) {
   // If it is not a trapping instruction, it is always safe to hoist.
   if (isSafeToSpeculativelyExecute(&Inst, DL))
     return true;
 
-  return isGuaranteedToExecute(Inst);
+  return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
 }
 
-bool LICM::isGuaranteedToExecute(Instruction &Inst) {
-
-  // Somewhere in this loop there is an instruction which may throw and make us
-  // exit the loop.
-  if (MayThrow)
-    return false;
+static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT, 
+                                  Loop *CurLoop, LICMSafetyInfo * SafetyInfo) {
 
-  // Otherwise we have to check to make sure that the instruction dominates all
+  // We have to check to make sure that the instruction dominates all
   // of the exit blocks.  If it doesn't, then there is a path out of the loop
   // which does not execute this instruction, so we can't hoist it.
 
@@ -671,7 +663,14 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) {
   // common), it is always guaranteed to dominate the exit blocks.  Since this
   // is a common case, and can save some work, check it now.
   if (Inst.getParent() == CurLoop->getHeader())
-    return true;
+    // If there's a throw in the header block, we can't guarantee we'll reach
+    // Inst.
+    return !SafetyInfo->HeaderMayThrow;
+
+  // Somewhere in this loop there is an instruction which may throw and make us
+  // exit the loop.
+  if (SafetyInfo->MayThrow)
+    return false;
 
   // Get the exit blocks for the current loop.
   SmallVector<BasicBlock*, 8> ExitBlocks;
@@ -768,25 +767,37 @@ namespace {
   };
 } // end anon namespace
 
-/// PromoteAliasSet - Try to promote memory values to scalars by sinking
-/// stores out of the loop and moving loads to before the loop.  We do this by
-/// looping over the stores in the loop, looking for stores to Must pointers
-/// which are loop invariant.
+/// Try to promote memory values to scalars by sinking stores out of the
+/// loop and moving loads to before the loop.  We do this by looping over
+/// the stores in the loop, looking for stores to Must pointers which are
+/// loop invariant.
 ///
-void LICM::PromoteAliasSet(AliasSet &AS,
-                           SmallVectorImpl<BasicBlock*> &ExitBlocks,
-                           SmallVectorImpl<Instruction*> &InsertPts,
-                           PredIteratorCache &PIC) {
+bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
+                                        SmallVectorImpl<BasicBlock*>&ExitBlocks,
+                                        SmallVectorImpl<Instruction*>&InsertPts,
+                                        PredIteratorCache &PIC, LoopInfo *LI, 
+                                        DominatorTree *DT, Loop *CurLoop, 
+                                        AliasSetTracker *CurAST, 
+                                        LICMSafetyInfo * SafetyInfo) { 
+  // Verify inputs.
+  assert(LI != nullptr && DT != nullptr && 
+         CurLoop != nullptr && CurAST != nullptr && 
+         SafetyInfo != nullptr && 
+         "Unexpected Input to promoteLoopAccessesToScalars");
+  // Initially set Changed status to false.
+  bool Changed = false;
   // We can promote this alias set if it has a store, if it is a "Must" alias
   // set, if the pointer is loop invariant, and if we are not eliminating any
   // volatile loads or stores.
   if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
       AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
-    return;
+    return Changed;
 
   assert(!AS.empty() &&
          "Must alias set should have at least one pointer element in it!");
+
   Value *SomePtr = AS.begin()->getValue();
+  BasicBlock * Preheader = CurLoop->getLoopPreheader();
 
   // It isn't safe to promote a load/store from the loop if the load/store is
   // conditional.  For example, turning:
@@ -810,6 +821,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
   // us to prove better alignment.
   unsigned Alignment = 1;
   AAMDNodes AATags;
+  bool HasDedicatedExits = CurLoop->hasDedicatedExits();
 
   // Check that all of the pointers in the alias set have the same type.  We
   // cannot (yet) promote a memory location that is loaded and stored in
@@ -822,7 +834,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
     // cannot (yet) promote a memory location that is loaded and stored in
     // different sizes.
     if (SomePtr->getType() != ASIV->getType())
-      return;
+      return Changed;
 
     for (User *U : ASIV->users()) {
       // Ignore instructions that are outside the loop.
@@ -835,7 +847,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
       if (LoadInst *load = dyn_cast<LoadInst>(UI)) {
         assert(!load->isVolatile() && "AST broken");
         if (!load->isSimple())
-          return;
+          return Changed;
       } else if (StoreInst *store = dyn_cast<StoreInst>(UI)) {
         // Stores *of* the pointer are not interesting, only stores *to* the
         // pointer.
@@ -843,7 +855,14 @@ void LICM::PromoteAliasSet(AliasSet &AS,
           continue;
         assert(!store->isVolatile() && "AST broken");
         if (!store->isSimple())
-          return;
+          return Changed;
+        // Don't sink stores from loops without dedicated block exits. Exits
+        // containing indirect branches are not transformed by loop simplify,
+        // make sure we catch that. An additional load may be generated in the
+        // preheader for SSA updater, so also avoid sinking when no preheader
+        // is available.
+        if (!HasDedicatedExits || !Preheader)
+          return Changed;
 
         // Note that we only check GuaranteedToExecute inside the store case
         // so that we do not introduce stores where they did not exist before
@@ -855,16 +874,17 @@ void LICM::PromoteAliasSet(AliasSet &AS,
         // Larger is better, with the exception of 0 being the best alignment.
         unsigned InstAlignment = store->getAlignment();
         if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0)
-          if (isGuaranteedToExecute(*UI)) {
+          if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
             GuaranteedToExecute = true;
             Alignment = InstAlignment;
           }
 
         if (!GuaranteedToExecute)
-          GuaranteedToExecute = isGuaranteedToExecute(*UI);
+          GuaranteedToExecute = isGuaranteedToExecute(*UI, DT, 
+                                                      CurLoop, SafetyInfo);
 
       } else
-        return; // Not a load or store.
+        return Changed; // Not a load or store.
 
       // Merge the AA tags.
       if (LoopUses.empty()) {
@@ -880,7 +900,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
 
   // If there isn't a guaranteed-to-execute instruction, we can't promote.
   if (!GuaranteedToExecute)
-    return;
+    return Changed;
 
   // Otherwise, this is safe to promote, lets do it!
   DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n');
@@ -925,10 +945,12 @@ void LICM::PromoteAliasSet(AliasSet &AS,
   // If the SSAUpdater didn't use the load in the preheader, just zap it now.
   if (PreheaderLoad->use_empty())
     PreheaderLoad->eraseFromParent();
-}
 
+  return Changed;
+}
 
-/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+/// Simple Analysis hook. Clone alias set info.
+///
 void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
   AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
   if (!AST)
@@ -937,8 +959,8 @@ void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
   AST->copyValue(From, To);
 }
 
-/// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
-/// set.
+/// Simple Analysis hook. Delete value V from alias set
+///
 void LICM::deleteAnalysisValue(Value *V, Loop *L) {
   AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
   if (!AST)
@@ -948,6 +970,7 @@ void LICM::deleteAnalysisValue(Value *V, Loop *L) {
 }
 
 /// Simple Analysis hook. Delete value L from alias set map.
+///
 void LICM::deleteAnalysisLoop(Loop *L) {
   AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
   if (!AST)
@@ -956,3 +979,23 @@ void LICM::deleteAnalysisLoop(Loop *L) {
   delete AST;
   LoopToAliasSetMap.erase(L);
 }
+
+
+/// Return true if the body of this loop may store into the memory
+/// location pointed to by V.
+///
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+                                     const AAMDNodes &AAInfo, 
+                                     AliasSetTracker *CurAST) {
+  // Check to see if any of the basic blocks in CurLoop invalidate *V.
+  return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod();
+}
+
+/// Little predicate that returns true if the specified basic block is in
+/// a subloop of the current one, not the current one itself.
+///
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) {
+  assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+  return LI->getLoopFor(BB) != CurLoop;
+}
+
diff --git a/lib/Transforms/Scalar/LLVMBuild.txt b/lib/Transforms/Scalar/LLVMBuild.txt
index 2bb49a3..deea9e2 100644
--- a/lib/Transforms/Scalar/LLVMBuild.txt
+++ b/lib/Transforms/Scalar/LLVMBuild.txt
@@ -20,4 +20,4 @@ type = Library
 name = Scalar
 parent = Transforms
 library_name = ScalarOpts
-required_libraries = Analysis Core InstCombine ProfileData Support Target TransformUtils
+required_libraries = Analysis Core InstCombine ProfileData Support TransformUtils
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 1d1f33a..98b068e 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -39,14 +39,14 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
       AU.addRequired<ScalarEvolution>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addRequiredID(LCSSAID);
 
       AU.addPreserved<ScalarEvolution>();
       AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addPreserved<LoopInfo>();
+      AU.addPreserved<LoopInfoWrapperPass>();
       AU.addPreservedID(LoopSimplifyID);
       AU.addPreservedID(LCSSAID);
     }
@@ -63,7 +63,7 @@ char LoopDeletion::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion",
                 "Delete dead loops", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -236,7 +236,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) {
 
   // Finally, the blocks from loopinfo.  This has to happen late because
   // otherwise our loop iterators won't work.
-  LoopInfo &loopInfo = getAnalysis<LoopInfo>();
+  LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   SmallPtrSet<BasicBlock*, 8> blocks;
   blocks.insert(L->block_begin(), L->block_end());
   for (BasicBlock *BB : blocks)
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index a12f5a7..243c624 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -56,7 +56,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
@@ -163,8 +163,8 @@ namespace {
     /// loop preheaders be inserted into the CFG.
     ///
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<LoopInfo>();
-      AU.addPreserved<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
       AU.addRequiredID(LCSSAID);
@@ -175,8 +175,8 @@ namespace {
       AU.addPreserved<ScalarEvolution>();
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<TargetLibraryInfo>();
-      AU.addRequired<TargetTransformInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
     }
 
     const DataLayout *getDataLayout() {
@@ -197,11 +197,16 @@ namespace {
     }
 
     TargetLibraryInfo *getTargetLibraryInfo() {
-      return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>());
+      if (!TLI)
+        TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+      return TLI;
     }
 
     const TargetTransformInfo *getTargetTransformInfo() {
-      return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>());
+      return TTI ? TTI
+                 : (TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+                        *CurLoop->getHeader()->getParent()));
     }
 
     Loop *getLoop() const { return CurLoop; }
@@ -215,14 +220,14 @@ namespace {
 char LoopIdiomRecognize::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
                       false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
                     false, false)
 
@@ -232,44 +237,13 @@ Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }
 /// and zero out all the operands of this instruction.  If any of them become
 /// dead, delete them and the computation tree that feeds them.
 ///
-static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE,
+static void deleteDeadInstruction(Instruction *I,
                                   const TargetLibraryInfo *TLI) {
-  SmallVector<Instruction*, 32> NowDeadInsts;
-
-  NowDeadInsts.push_back(I);
-
-  // Before we touch this instruction, remove it from SE!
-  do {
-    Instruction *DeadInst = NowDeadInsts.pop_back_val();
-
-    // This instruction is dead, zap it, in stages.  Start by removing it from
-    // SCEV.
-    SE.forgetValue(DeadInst);
-
-    for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
-      Value *Op = DeadInst->getOperand(op);
-      DeadInst->setOperand(op, nullptr);
-
-      // If this operand just became dead, add it to the NowDeadInsts list.
-      if (!Op->use_empty()) continue;
-
-      if (Instruction *OpI = dyn_cast<Instruction>(Op))
-        if (isInstructionTriviallyDead(OpI, TLI))
-          NowDeadInsts.push_back(OpI);
-    }
-
-    DeadInst->eraseFromParent();
-
-  } while (!NowDeadInsts.empty());
-}
-
-/// deleteIfDeadInstruction - If the specified value is a dead instruction,
-/// delete it and any recursively used instructions.
-static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
-                                    const TargetLibraryInfo *TLI) {
-  if (Instruction *I = dyn_cast<Instruction>(V))
-    if (isInstructionTriviallyDead(I, TLI))
-      deleteDeadInstruction(I, SE, TLI);
+  SmallVector<Value *, 16> Operands(I->value_op_begin(), I->value_op_end());
+  I->replaceAllUsesWith(UndefValue::get(I->getType()));
+  I->eraseFromParent();
+  for (Value *Op : Operands)
+    RecursivelyDeleteTriviallyDeadInstructions(Op, TLI);
 }
 
 //===----------------------------------------------------------------------===//
@@ -285,7 +259,7 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
 // the concern of breaking data dependence.
 bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
   if (BranchInst *Br = getBranch(BB)) {
-    return Br->isUnconditional() && BB->size() == 1;
+    return Br->isUnconditional() && Br == BB->begin();
   }
   return false;
 }
@@ -542,7 +516,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
       cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
     PreCond->replaceAllUsesWith(NewPreCond);
 
-    deleteDeadInstruction(PreCond, *SE, TLI);
+    RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
   }
 
   // Step 3: Note that the population count is exactly the trip count of the
@@ -592,15 +566,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
   // Step 4: All the references to the original population counter outside
   //  the loop are replaced with the NewCount -- the value returned from
   //  __builtin_ctpop().
-  {
-    SmallVector<Value *, 4> CntUses;
-    for (User *U : CntInst->users())
-      if (cast<Instruction>(U)->getParent() != Body)
-        CntUses.push_back(U);
-    for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) {
-      (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount);
-    }
-  }
+  CntInst->replaceUsesOutsideBlock(NewCount, Body);
 
   // step 5: Forget the "non-computable" trip-count SCEV associated with the
   //   loop. The loop would otherwise not be deleted even if it becomes empty.
@@ -666,8 +632,8 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
   // set DT
   (void)getDominatorTree();
 
-  LoopInfo &LI = getAnalysis<LoopInfo>();
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   // set TLI
   (void)getTargetLibraryInfo();
@@ -997,7 +963,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                             StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(BasePtr, *SE, TLI);
+    RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
     return false;
   }
 
@@ -1053,7 +1019,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
 
   // Okay, the memset has been formed.  Zap the original store and anything that
   // feeds into it.
-  deleteDeadInstruction(TheStore, *SE, TLI);
+  deleteDeadInstruction(TheStore, TLI);
   ++NumMemSet;
   return true;
 }
@@ -1094,7 +1060,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
                             getAnalysis<AliasAnalysis>(), SI)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
+    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
     return false;
   }
 
@@ -1109,8 +1075,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
                             StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(LoadBasePtr, *SE, TLI);
-    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
+    RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
+    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
     return false;
   }
 
@@ -1143,7 +1109,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
 
   // Okay, the memset has been formed.  Zap the original store and anything that
   // feeds into it.
-  deleteDeadInstruction(SI, *SE, TLI);
+  deleteDeadInstruction(SI, TLI);
   ++NumMemCpy;
   return true;
 }
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 8fd7c8f..6dc600e 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -14,15 +14,16 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
@@ -42,13 +43,13 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
-      AU.addRequired<AssumptionTracker>();
-      AU.addRequired<LoopInfo>();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<LoopInfoWrapperPass>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
       AU.addPreservedID(LCSSAID);
-      AU.addPreserved("scalar-evolution");
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
   };
 }
@@ -56,10 +57,10 @@ namespace {
 char LoopInstSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify",
                 "Simplify instructions in loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify",
                 "Simplify instructions in loops", false, false)
@@ -75,11 +76,13 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  LoopInfo *LI = &getAnalysis<LoopInfo>();
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-  const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
-  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+      *L->getHeader()->getParent());
 
   SmallVector<BasicBlock*, 8> ExitBlocks;
   L->getUniqueExitBlocks(ExitBlocks);
@@ -120,7 +123,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
 
         // Don't bother simplifying unused instructions.
         if (!I->use_empty()) {
-          Value *V = SimplifyInstruction(I, DL, TLI, DT, AT);
+          Value *V = SimplifyInstruction(I, DL, TLI, DT, &AC);
           if (V && LI->replacementPreservesLCSSAForm(I, V)) {
             // Mark all uses for resimplification next time round the loop.
             for (User *U : I->users())
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
index 8f12204..fdf7e3b 100644
--- a/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -12,7 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -28,7 +30,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
@@ -43,6 +45,12 @@ static cl::opt<unsigned>
 MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
   cl::desc("The maximum increment for loop rerolling"));
 
+static cl::opt<unsigned>
+NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
+                          cl::Hidden,
+                          cl::desc("The maximum number of failures to tolerate"
+                                   " during fuzzy matching. (default: 400)"));
+
 // This loop re-rolling transformation aims to transform loops like this:
 //
 // int foo(int a);
@@ -119,6 +127,16 @@ MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
 // br %cmp, header, exit
 
 namespace {
+  enum IterationLimits {
+    /// The maximum number of iterations that we'll try and reroll. This
+    /// has to be less than 25 in order to fit into a SmallBitVector.
+    IL_MaxRerollIterations = 16,
+    /// The bitvector index used by loop induction variables and other
+    /// instructions that belong to all iterations.
+    IL_All,
+    IL_End
+  };
+
   class LoopReroll : public LoopPass {
   public:
     static char ID; // Pass ID, replacement for typeid
@@ -130,15 +148,15 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<AliasAnalysis>();
-      AU.addRequired<LoopInfo>();
-      AU.addPreserved<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<ScalarEvolution>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
 
-protected:
+  protected:
     AliasAnalysis *AA;
     LoopInfo *LI;
     ScalarEvolution *SE;
@@ -311,26 +329,116 @@ protected:
       DenseSet<int> Reds;
     };
 
+    // A DAGRootSet models an induction variable being used in a rerollable
+    // loop. For example,
+    //
+    //   x[i*3+0] = y1
+    //   x[i*3+1] = y2
+    //   x[i*3+2] = y3
+    //
+    //   Base instruction -> i*3               
+    //                    +---+----+
+    //                   /    |     \
+    //               ST[y1]  +1     +2  <-- Roots
+    //                        |      |
+    //                      ST[y2] ST[y3]
+    //
+    // There may be multiple DAGRoots, for example:
+    //
+    //   x[i*2+0] = ...   (1)
+    //   x[i*2+1] = ...   (1)
+    //   x[i*2+4] = ...   (2)
+    //   x[i*2+5] = ...   (2)
+    //   x[(i+1234)*2+5678] = ... (3)
+    //   x[(i+1234)*2+5679] = ... (3)
+    //
+    // The loop will be rerolled by adding a new loop induction variable,
+    // one for the Base instruction in each DAGRootSet.
+    //
+    struct DAGRootSet {
+      Instruction *BaseInst;
+      SmallInstructionVector Roots;
+      // The instructions between IV and BaseInst (but not including BaseInst).
+      SmallInstructionSet SubsumedInsts;
+    };
+
+    // The set of all DAG roots, and state tracking of all roots
+    // for a particular induction variable.
+    struct DAGRootTracker {
+      DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
+                     ScalarEvolution *SE, AliasAnalysis *AA,
+                     TargetLibraryInfo *TLI, const DataLayout *DL)
+        : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI),
+          DL(DL), IV(IV) {
+      }
+
+      /// Stage 1: Find all the DAG roots for the induction variable.
+      bool findRoots();
+      /// Stage 2: Validate if the found roots are valid.
+      bool validate(ReductionTracker &Reductions);
+      /// Stage 3: Assuming validate() returned true, perform the
+      /// replacement.
+      /// @param IterCount The maximum iteration count of L.
+      void replace(const SCEV *IterCount);
+
+    protected:
+      typedef MapVector<Instruction*, SmallBitVector> UsesTy;
+
+      bool findRootsRecursive(Instruction *IVU,
+                              SmallInstructionSet SubsumedInsts);
+      bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
+      bool collectPossibleRoots(Instruction *Base,
+                                std::map<int64_t,Instruction*> &Roots);
+
+      bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
+      void collectInLoopUserSet(const SmallInstructionVector &Roots,
+                                const SmallInstructionSet &Exclude,
+                                const SmallInstructionSet &Final,
+                                DenseSet<Instruction *> &Users);
+      void collectInLoopUserSet(Instruction *Root,
+                                const SmallInstructionSet &Exclude,
+                                const SmallInstructionSet &Final,
+                                DenseSet<Instruction *> &Users);
+
+      UsesTy::iterator nextInstr(int Val, UsesTy &In,
+                                 const SmallInstructionSet &Exclude,
+                                 UsesTy::iterator *StartI=nullptr);
+      bool isBaseInst(Instruction *I);
+      bool isRootInst(Instruction *I);
+      bool instrDependsOn(Instruction *I,
+                          UsesTy::iterator Start,
+                          UsesTy::iterator End);
+
+      LoopReroll *Parent;
+
+      // Members of Parent, replicated here for brevity.
+      Loop *L;
+      ScalarEvolution *SE;
+      AliasAnalysis *AA;
+      TargetLibraryInfo *TLI;
+      const DataLayout *DL;
+
+      // The loop induction variable.
+      Instruction *IV;
+      // Loop step amount.
+      uint64_t Inc;
+      // Loop reroll count; if Inc == 1, this records the scaling applied
+      // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
+      // If Inc is not 1, Scale = Inc.
+      uint64_t Scale;
+      // The roots themselves.
+      SmallVector<DAGRootSet,16> RootSets;
+      // All increment instructions for IV.
+      SmallInstructionVector LoopIncs;
+      // Map of all instructions in the loop (in order) to the iterations
+      // they are used in (or specially, IL_All for instructions
+      // used in the loop increment mechanism).
+      UsesTy Uses;
+    };
+
     void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
     void collectPossibleReductions(Loop *L,
            ReductionTracker &Reductions);
-    void collectInLoopUserSet(Loop *L,
-           const SmallInstructionVector &Roots,
-           const SmallInstructionSet &Exclude,
-           const SmallInstructionSet &Final,
-           DenseSet<Instruction *> &Users);
-    void collectInLoopUserSet(Loop *L,
-           Instruction * Root,
-           const SmallInstructionSet &Exclude,
-           const SmallInstructionSet &Final,
-           DenseSet<Instruction *> &Users);
-    bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
-                          Instruction *&IV,
-                          SmallInstructionVector &LoopIncs);
-    bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV,
-                         SmallVector<SmallInstructionVector, 32> &Roots,
-                         SmallInstructionSet &AllRoots,
-                         SmallInstructionVector &LoopIncs);
     bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
                 ReductionTracker &Reductions);
   };
@@ -339,10 +447,10 @@ protected:
 char LoopReroll::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
 
 Pass *llvm::createLoopRerollPass() {
@@ -353,10 +461,10 @@ Pass *llvm::createLoopRerollPass() {
 // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
 // non-loop blocks to be outside the loop.
 static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
-  for (User *U : I->users())
+  for (User *U : I->users()) {
     if (!L->contains(cast<Instruction>(U)))
       return true;
-
+  }
   return false;
 }
 
@@ -403,6 +511,8 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) {
   // (including the PHI), except for the last value (which is used by the PHI
   // and also outside the loop).
   Instruction *C = Instructions.front();
+  if (C->user_empty())
+    return;
 
   do {
     C = cast<Instruction>(*C->user_begin());
@@ -424,11 +534,12 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) {
     return;
 
   // C is now the (potential) last instruction in the reduction chain.
-  for (User *U : C->users())
+  for (User *U : C->users()) {
     // The only in-loop user can be the initial PHI.
     if (L->contains(cast<Instruction>(U)))
       if (cast<Instruction>(U) != Instructions.front())
         return;
+  }
 
   Instructions.push_back(C);
   Valid = true;
@@ -467,7 +578,7 @@ void LoopReroll::collectPossibleReductions(Loop *L,
 //   if they are users, but their users are not added. This is used, for
 //   example, to prevent a reduction update from forcing all later reduction
 //   updates into the use set.
-void LoopReroll::collectInLoopUserSet(Loop *L,
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
   Instruction *Root, const SmallInstructionSet &Exclude,
   const SmallInstructionSet &Final,
   DenseSet<Instruction *> &Users) {
@@ -504,14 +615,14 @@ void LoopReroll::collectInLoopUserSet(Loop *L,
 
 // Collect all of the users of all of the provided root instructions (combined
 // into a single set).
-void LoopReroll::collectInLoopUserSet(Loop *L,
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
   const SmallInstructionVector &Roots,
   const SmallInstructionSet &Exclude,
   const SmallInstructionSet &Final,
   DenseSet<Instruction *> &Users) {
   for (SmallInstructionVector::const_iterator I = Roots.begin(),
        IE = Roots.end(); I != IE; ++I)
-    collectInLoopUserSet(L, *I, Exclude, Final, Users);
+    collectInLoopUserSet(*I, Exclude, Final, Users);
 }
 
 static bool isSimpleLoadStore(Instruction *I) {
@@ -524,289 +635,372 @@ static bool isSimpleLoadStore(Instruction *I) {
   return false;
 }
 
-// Recognize loops that are setup like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// %scaled.iv = mul %iv, scale
-// f(%scaled.iv)
-// %scaled.iv.1 = add %scaled.iv, 1
-// f(%scaled.iv.1)
-// %scaled.iv.2 = add %scaled.iv, 2
-// f(%scaled.iv.2)
-// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
-// f(%scaled.iv.scale_m_1)
-// ...
-// %iv.next = add %iv, 1
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs.
-bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
-                                  Instruction *&IV,
-                                  SmallInstructionVector &LoopIncs) {
-  // This is a special case: here we're looking for all uses (except for
-  // the increment) to be multiplied by a common factor. The increment must
-  // be by one. This is to capture loops like:
-  //   for (int i = 0; i < 500; ++i) {
-  //     foo(3*i); foo(3*i+1); foo(3*i+2);
-  //   }
-  if (RealIV->getNumUses() != 2)
-    return false;
-  const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV));
-  Instruction *User1 = cast<Instruction>(*RealIV->user_begin()),
-              *User2 = cast<Instruction>(*std::next(RealIV->user_begin()));
-  if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType()))
-    return false;
-  const SCEVAddRecExpr *User1SCEV =
-                         dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)),
-                       *User2SCEV =
-                         dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2));
-  if (!User1SCEV || !User1SCEV->isAffine() ||
-      !User2SCEV || !User2SCEV->isAffine())
+/// Return true if IVU is a "simple" arithmetic operation.
+/// This is used for narrowing the search space for DAGRoots; only arithmetic
+/// and GEPs can be part of a DAGRoot.
+static bool isSimpleArithmeticOp(User *IVU) {
+  if (Instruction *I = dyn_cast<Instruction>(IVU)) {
+    switch (I->getOpcode()) {
+    default: return false;
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::Shl:
+    case Instruction::AShr:
+    case Instruction::LShr:
+    case Instruction::GetElementPtr:
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool isLoopIncrement(User *U, Instruction *IV) {
+  BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
+  if (!BO || BO->getOpcode() != Instruction::Add)
     return false;
 
-  // We assume below that User1 is the scale multiply and User2 is the
-  // increment. If this can't be true, then swap them.
-  if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) {
-    std::swap(User1, User2);
-    std::swap(User1SCEV, User2SCEV);
+  for (auto *UU : BO->users()) {
+    PHINode *PN = dyn_cast<PHINode>(UU);
+    if (PN && PN == IV)
+      return true;
   }
+  return false;
+}
+
+bool LoopReroll::DAGRootTracker::
+collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
+  SmallInstructionVector BaseUsers;
+
+  for (auto *I : Base->users()) {
+    ConstantInt *CI = nullptr;
+
+    if (isLoopIncrement(I, IV)) {
+      LoopIncs.push_back(cast<Instruction>(I));
+      continue;
+    }
+
+    // The root nodes must be either GEPs, ORs or ADDs.
+    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+      if (BO->getOpcode() == Instruction::Add ||
+          BO->getOpcode() == Instruction::Or)
+        CI = dyn_cast<ConstantInt>(BO->getOperand(1));
+    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+      Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
+      CI = dyn_cast<ConstantInt>(LastOperand);
+    }
+
+    if (!CI) {
+      if (Instruction *II = dyn_cast<Instruction>(I)) {
+        BaseUsers.push_back(II);
+        continue;
+      } else {
+        DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n");
+        return false;
+      }
+    }
+
+    int64_t V = CI->getValue().getSExtValue();
+    if (Roots.find(V) != Roots.end())
+      // No duplicates, please.
+      return false;
 
-  if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE))
+    // FIXME: Add support for negative values.
+    if (V < 0) {
+      DEBUG(dbgs() << "LRR: Aborting due to negative value: " << V << "\n");
+      return false;
+    }
+
+    Roots[V] = cast<Instruction>(I);
+  }
+
+  if (Roots.empty())
     return false;
-  assert(User2SCEV->getStepRecurrence(*SE)->isOne() &&
-         "Invalid non-unit step for multiplicative scaling");
-  LoopIncs.push_back(User2);
-
-  if (const SCEVConstant *MulScale =
-      dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) {
-    // Make sure that both the start and step have the same multiplier.
-    if (RealIVSCEV->getStart()->getType() != MulScale->getType())
+
+  // If we found non-loop-inc, non-root users of Base, assume they are
+  // for the zeroth root index. This is because "add %a, 0" gets optimized
+  // away.
+  if (BaseUsers.size()) {
+    if (Roots.find(0) != Roots.end()) {
+      DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
       return false;
-    if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) !=
-        User1SCEV->getStart())
+    }
+    Roots[0] = Base;
+  }
+
+  // Calculate the number of users of the base, or lowest indexed, iteration.
+  unsigned NumBaseUses = BaseUsers.size();
+  if (NumBaseUses == 0)
+    NumBaseUses = Roots.begin()->second->getNumUses();
+  
+  // Check that every node has the same number of users.
+  for (auto &KV : Roots) {
+    if (KV.first == 0)
+      continue;
+    if (KV.second->getNumUses() != NumBaseUses) {
+      DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
+            << "#Base=" << NumBaseUses << ", #Root=" <<
+            KV.second->getNumUses() << "\n");
       return false;
+    }
+  }
+
+  return true; 
+}
 
-    ConstantInt *MulScaleCI = MulScale->getValue();
-    if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc))
+bool LoopReroll::DAGRootTracker::
+findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
+  // Does the user look like it could be part of a root set?
+  // All its users must be simple arithmetic ops.
+  if (I->getNumUses() > IL_MaxRerollIterations)
+    return false;
+
+  if ((I->getOpcode() == Instruction::Mul ||
+       I->getOpcode() == Instruction::PHI) &&
+      I != IV &&
+      findRootsBase(I, SubsumedInsts))
+    return true;
+
+  SubsumedInsts.insert(I);
+
+  for (User *V : I->users()) {
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (std::find(LoopIncs.begin(), LoopIncs.end(), I) != LoopIncs.end())
+      continue;
+
+    if (!I || !isSimpleArithmeticOp(I) ||
+        !findRootsRecursive(I, SubsumedInsts))
       return false;
-    Scale = MulScaleCI->getZExtValue();
-    IV = User1;
-  } else
+  }
+  return true;
+}
+
+bool LoopReroll::DAGRootTracker::
+findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
+
+  // The base instruction needs to be a multiply so
+  // that we can erase it.
+  if (IVU->getOpcode() != Instruction::Mul &&
+      IVU->getOpcode() != Instruction::PHI)
     return false;
 
-  DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n");
+  std::map<int64_t, Instruction*> V;
+  if (!collectPossibleRoots(IVU, V))
+    return false;
+
+  // If we didn't get a root for index zero, then IVU must be 
+  // subsumed.
+  if (V.find(0) == V.end())
+    SubsumedInsts.insert(IVU);
+
+  // Partition the vector into monotonically increasing indexes.
+  DAGRootSet DRS;
+  DRS.BaseInst = nullptr;
+
+  for (auto &KV : V) {
+    if (!DRS.BaseInst) {
+      DRS.BaseInst = KV.second;
+      DRS.SubsumedInsts = SubsumedInsts;
+    } else if (DRS.Roots.empty()) {
+      DRS.Roots.push_back(KV.second);
+    } else if (V.find(KV.first - 1) != V.end()) {
+      DRS.Roots.push_back(KV.second);
+    } else {
+      // Linear sequence terminated.
+      RootSets.push_back(DRS);
+      DRS.BaseInst = KV.second;
+      DRS.SubsumedInsts = SubsumedInsts;
+      DRS.Roots.clear();
+    }
+  }
+  RootSets.push_back(DRS);
+
   return true;
 }
 
-// Collect all root increments with respect to the provided induction variable
-// (normally the PHI, but sometimes a multiply). A root increment is an
-// instruction, normally an add, with a positive constant less than Scale. In a
-// rerollable loop, each of these increments is the root of an instruction
-// graph isomorphic to the others. Also, we collect the final induction
-// increment (the increment equal to the Scale), and its users in LoopIncs.
-bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale,
-                                 Instruction *IV,
-                                 SmallVector<SmallInstructionVector, 32> &Roots,
-                                 SmallInstructionSet &AllRoots,
-                                 SmallInstructionVector &LoopIncs) {
-  for (User *U : IV->users()) {
-    Instruction *UI = cast<Instruction>(U);
-    if (!SE->isSCEVable(UI->getType()))
-      continue;
-    if (UI->getType() != IV->getType())
-      continue;
-    if (!L->contains(UI))
-      continue;
-    if (hasUsesOutsideLoop(UI, L))
-      continue;
+bool LoopReroll::DAGRootTracker::findRoots() {
 
-    if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV(
-          SE->getSCEV(UI), SE->getSCEV(IV)))) {
-      uint64_t Idx = Diff->getValue()->getValue().getZExtValue();
-      if (Idx > 0 && Idx < Scale) {
-        Roots[Idx-1].push_back(UI);
-        AllRoots.insert(UI);
-      } else if (Idx == Scale && Inc > 1) {
-        LoopIncs.push_back(UI);
-      }
+  const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV));
+  Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))->
+    getValue()->getZExtValue();
+
+  assert(RootSets.empty() && "Unclean state!");
+  if (Inc == 1) {
+    for (auto *IVU : IV->users()) {
+      if (isLoopIncrement(IVU, IV))
+        LoopIncs.push_back(cast<Instruction>(IVU));
     }
+    if (!findRootsRecursive(IV, SmallInstructionSet()))
+      return false;
+    LoopIncs.push_back(IV);
+  } else {
+    if (!findRootsBase(IV, SmallInstructionSet()))
+      return false;
   }
 
-  if (Roots[0].empty())
+  // Ensure all sets have the same size.
+  if (RootSets.empty()) {
+    DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
     return false;
-  bool AllSame = true;
-  for (unsigned i = 1; i < Scale-1; ++i)
-    if (Roots[i].size() != Roots[0].size()) {
-      AllSame = false;
-      break;
+  }
+  for (auto &V : RootSets) {
+    if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
+      DEBUG(dbgs()
+            << "LRR: Aborting because not all root sets have the same size\n");
+      return false;
     }
+  }
 
-  if (!AllSame)
+  // And ensure all loop iterations are consecutive. We rely on std::map
+  // providing ordered traversal.
+  for (auto &V : RootSets) {
+    const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(V.BaseInst));
+    if (!ADR)
+      return false;
+
+    // Consider a DAGRootSet with N-1 roots (so N different values including
+    //   BaseInst).
+    // Define d = Roots[0] - BaseInst, which should be the same as
+    //   Roots[I] - Roots[I-1] for all I in [1..N).
+    // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
+    //   loop iteration J.
+    //
+    // Now, For the loop iterations to be consecutive:
+    //   D = d * N
+
+    unsigned N = V.Roots.size() + 1;
+    const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(V.Roots[0]), ADR);
+    const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
+    if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) {
+      DEBUG(dbgs() << "LRR: Aborting because iterations are not consecutive\n");
+      return false;
+    }
+  }
+  Scale = RootSets[0].Roots.size() + 1;
+
+  if (Scale > IL_MaxRerollIterations) {
+    DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+          << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations
+          << "\n");
     return false;
+  }
+
+  DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n");
 
   return true;
 }
 
-// Validate the selected reductions. All iterations must have an isomorphic
-// part of the reduction chain and, for non-associative reductions, the chain
-// entries must appear in order.
-bool LoopReroll::ReductionTracker::validateSelected() {
-  // For a non-associative reduction, the chain entries must appear in order.
-  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
-       RI != RIE; ++RI) {
-    int i = *RI;
-    int PrevIter = 0, BaseCount = 0, Count = 0;
-    for (Instruction *J : PossibleReds[i]) {
-      // Note that all instructions in the chain must have been found because
-      // all instructions in the function must have been assigned to some
-      // iteration.
-      int Iter = PossibleRedIter[J];
-      if (Iter != PrevIter && Iter != PrevIter + 1 &&
-          !PossibleReds[i].getReducedValue()->isAssociative()) {
-        DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
-                        J << "\n");
-        return false;
-      }
+bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
+  // Populate the MapVector with all instructions in the block, in order first,
+  // so we can iterate over the contents later in perfect order.
+  for (auto &I : *L->getHeader()) {
+    Uses[&I].resize(IL_End);
+  }
 
-      if (Iter != PrevIter) {
-        if (Count != BaseCount) {
-          DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
-                " reduction use count " << Count <<
-                " is not equal to the base use count " <<
-                BaseCount << "\n");
-          return false;
-        }
+  SmallInstructionSet Exclude;
+  for (auto &DRS : RootSets) {
+    Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+    Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+    Exclude.insert(DRS.BaseInst);
+  }
+  Exclude.insert(LoopIncs.begin(), LoopIncs.end());
 
-        Count = 0;
+  for (auto &DRS : RootSets) {
+    DenseSet<Instruction*> VBase;
+    collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
+    for (auto *I : VBase) {
+      Uses[I].set(0);
+    }
+
+    unsigned Idx = 1;
+    for (auto *Root : DRS.Roots) {
+      DenseSet<Instruction*> V;
+      collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
+
+      // While we're here, check the use sets are the same size.
+      if (V.size() != VBase.size()) {
+        DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+        return false;
       }
 
-      ++Count;
-      if (Iter == 0)
-        ++BaseCount;
+      for (auto *I : V) {
+        Uses[I].set(Idx);
+      }
+      ++Idx;
+    }
 
-      PrevIter = Iter;
+    // Make sure our subsumed instructions are remembered too.
+    for (auto *I : DRS.SubsumedInsts) {
+      Uses[I].set(IL_All);
     }
   }
 
-  return true;
-}
-
-// For all selected reductions, remove all parts except those in the first
-// iteration (and the PHI). Replace outside uses of the reduced value with uses
-// of the first-iteration reduced value (in other words, reroll the selected
-// reductions).
-void LoopReroll::ReductionTracker::replaceSelected() {
-  // Fixup reductions to refer to the last instruction associated with the
-  // first iteration (not the last).
-  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
-       RI != RIE; ++RI) {
-    int i = *RI;
-    int j = 0;
-    for (int e = PossibleReds[i].size(); j != e; ++j)
-      if (PossibleRedIter[PossibleReds[i][j]] != 0) {
-        --j;
-        break;
-      }
+  // Make sure the loop increments are also accounted for.
 
-    // Replace users with the new end-of-chain value.
-    SmallInstructionVector Users;
-    for (User *U : PossibleReds[i].getReducedValue()->users())
-      Users.push_back(cast<Instruction>(U));
+  Exclude.clear();
+  for (auto &DRS : RootSets) {
+    Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+    Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+    Exclude.insert(DRS.BaseInst);
+  }
 
-    for (SmallInstructionVector::iterator J = Users.begin(),
-         JE = Users.end(); J != JE; ++J)
-      (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
-                              PossibleReds[i][j]);
+  DenseSet<Instruction*> V;
+  collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
+  for (auto *I : V) {
+    Uses[I].set(IL_All);
   }
-}
 
-// Reroll the provided loop with respect to the provided induction variable.
-// Generally, we're looking for a loop like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// f(%iv)
-// %iv.1 = add %iv, 1                <-- a root increment
-// f(%iv.1)
-// %iv.2 = add %iv, 2                <-- a root increment
-// f(%iv.2)
-// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
-// f(%iv.scale_m_1)
-// ...
-// %iv.next = add %iv, scale
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
-// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
-// be intermixed with eachother. The restriction imposed by this algorithm is
-// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
-// etc. be the same.
-//
-// First, we collect the use set of %iv, excluding the other increment roots.
-// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
-// times, having collected the use set of f(%iv.(i+1)), during which we:
-//   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
-//     the next unmatched instruction in f(%iv.(i+1)).
-//   - Ensure that both matched instructions don't have any external users
-//     (with the exception of last-in-chain reduction instructions).
-//   - Track the (aliasing) write set, and other side effects, of all
-//     instructions that belong to future iterations that come before the matched
-//     instructions. If the matched instructions read from that write set, then
-//     f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
-//     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
-//     if any of these future instructions had side effects (could not be
-//     speculatively executed), and so do the matched instructions, when we
-//     cannot reorder those side-effect-producing instructions, and rerolling
-//     fails.
-//
-// Finally, we make sure that all loop instructions are either loop increment
-// roots, belong to simple latch code, parts of validated reductions, part of
-// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
-// have been validated), then we reroll the loop.
-bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
-                        const SCEV *IterCount,
-                        ReductionTracker &Reductions) {
-  const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV));
-  uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))->
-                   getValue()->getZExtValue();
-  // The collection of loop increment instructions.
-  SmallInstructionVector LoopIncs;
-  uint64_t Scale = Inc;
-
-  // The effective induction variable, IV, is normally also the real induction
-  // variable. When we're dealing with a loop like:
-  //   for (int i = 0; i < 500; ++i)
-  //     x[3*i] = ...;
-  //     x[3*i+1] = ...;
-  //     x[3*i+2] = ...;
-  // then the real IV is still i, but the effective IV is (3*i).
-  Instruction *RealIV = IV;
-  if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs))
-    return false;
+  return true;
 
-  assert(Scale <= MaxInc && "Scale is too large");
-  assert(Scale > 1 && "Scale must be at least 2");
+}
 
-  // The set of increment instructions for each increment value.
-  SmallVector<SmallInstructionVector, 32> Roots(Scale-1);
-  SmallInstructionSet AllRoots;
-  if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs))
-    return false;
+/// Get the next instruction in "In" that is a member of set Val.
+/// Start searching from StartI, and do not return anything in Exclude.
+/// If StartI is not given, start from In.begin().
+LoopReroll::DAGRootTracker::UsesTy::iterator
+LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
+                                      const SmallInstructionSet &Exclude,
+                                      UsesTy::iterator *StartI) {
+  UsesTy::iterator I = StartI ? *StartI : In.begin();
+  while (I != In.end() && (I->second.test(Val) == 0 ||
+                           Exclude.count(I->first) != 0))
+    ++I;
+  return I;
+}
 
-  DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
-                  *RealIV << "\n");
+bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
+  for (auto &DRS : RootSets) {
+    if (DRS.BaseInst == I)
+      return true;
+  }
+  return false;
+}
 
-  // An array of just the possible reductions for this scale factor. When we
-  // collect the set of all users of some root instructions, these reduction
-  // instructions are treated as 'final' (their uses are not considered).
-  // This is important because we don't want the root use set to search down
-  // the reduction chain.
-  SmallInstructionSet PossibleRedSet;
-  SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet;
-  Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet,
-                             PossibleRedLastSet);
+bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
+  for (auto &DRS : RootSets) {
+    if (std::find(DRS.Roots.begin(), DRS.Roots.end(), I) != DRS.Roots.end())
+      return true;
+  }
+  return false;
+}
 
+/// Return true if instruction I depends on any instruction between
+/// Start and End.
+bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
+                                                UsesTy::iterator Start,
+                                                UsesTy::iterator End) {
+  for (auto *U : I->users()) {
+    for (auto It = Start; It != End; ++It)
+      if (U == It->first)
+        return true;
+  }
+  return false;
+}
+
+bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
   // We now need to check for equivalence of the use graph of each root with
   // that of the primary induction variable (excluding the roots). Our goal
   // here is not to solve the full graph isomorphism problem, but rather to
@@ -815,121 +1009,167 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
   // is the same (although we will not make an assumption about how the
   // different iterations are intermixed). Note that while the order must be
   // the same, the instructions may not be in the same basic block.
-  SmallInstructionSet Exclude(AllRoots);
-  Exclude.insert(LoopIncs.begin(), LoopIncs.end());
 
-  DenseSet<Instruction *> BaseUseSet;
-  collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet);
+  // An array of just the possible reductions for this scale factor. When we
+  // collect the set of all users of some root instructions, these reduction
+  // instructions are treated as 'final' (their uses are not considered).
+  // This is important because we don't want the root use set to search down
+  // the reduction chain.
+  SmallInstructionSet PossibleRedSet;
+  SmallInstructionSet PossibleRedLastSet;
+  SmallInstructionSet PossibleRedPHISet;
+  Reductions.restrictToScale(Scale, PossibleRedSet,
+                             PossibleRedPHISet, PossibleRedLastSet);
 
-  DenseSet<Instruction *> AllRootUses;
-  std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1);
+  // Populate "Uses" with where each instruction is used.
+  if (!collectUsedInstructions(PossibleRedSet))
+    return false;
 
-  bool MatchFailed = false;
-  for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) {
-    DenseSet<Instruction *> &RootUseSet = RootUseSets[i];
-    collectInLoopUserSet(L, Roots[i], SmallInstructionSet(),
-                         PossibleRedSet, RootUseSet);
+  // Make sure we mark the reduction PHIs as used in all iterations.
+  for (auto *I : PossibleRedPHISet) {
+    Uses[I].set(IL_All);
+  }
 
-    DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() <<
-                    " vs. iteration increment " << (i+1) <<
-                    " use set size: " << RootUseSet.size() << "\n");
+  // Make sure all instructions in the loop are in one and only one
+  // set.
+  for (auto &KV : Uses) {
+    if (KV.second.count() != 1) {
+      DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+            << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+      return false;
+    }
+  }
 
-    if (BaseUseSet.size() != RootUseSet.size()) {
-      MatchFailed = true;
-      break;
+  DEBUG(
+    for (auto &KV : Uses) {
+      dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
     }
+    );
 
+  for (unsigned Iter = 1; Iter < Scale; ++Iter) {
     // In addition to regular aliasing information, we need to look for
     // instructions from later (future) iterations that have side effects
     // preventing us from reordering them past other instructions with side
     // effects.
     bool FutureSideEffects = false;
     AliasSetTracker AST(*AA);
-
     // The map between instructions in f(%iv.(i+1)) and f(%iv).
     DenseMap<Value *, Value *> BaseMap;
 
-    assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops");
-    for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(),
-         JE = Header->end(); J1 != JE && !MatchFailed; ++J1) {
-      if (cast<Instruction>(J1) == RealIV)
-        continue;
-      if (cast<Instruction>(J1) == IV)
-        continue;
-      if (!BaseUseSet.count(J1))
-        continue;
-      if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs.
-        continue;
-
-      while (J2 != JE && (!RootUseSet.count(J2) ||
-             std::find(Roots[i].begin(), Roots[i].end(), J2) !=
-               Roots[i].end())) {
-        // As we iterate through the instructions, instructions that don't
-        // belong to previous iterations (or the base case), must belong to
-        // future iterations. We want to track the alias set of writes from
-        // previous iterations.
-        if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) &&
-            !AllRootUses.count(J2)) {
-          if (J2->mayWriteToMemory())
-            AST.add(J2);
-
-          // Note: This is specifically guarded by a check on isa<PHINode>,
-          // which while a valid (somewhat arbitrary) micro-optimization, is
-          // needed because otherwise isSafeToSpeculativelyExecute returns
-          // false on PHI nodes.
-          if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL))
-            FutureSideEffects = true;
+    // Compare iteration Iter to the base.
+    SmallInstructionSet Visited;
+    auto BaseIt = nextInstr(0, Uses, Visited);
+    auto RootIt = nextInstr(Iter, Uses, Visited);
+    auto LastRootIt = Uses.begin();
+
+    while (BaseIt != Uses.end() && RootIt != Uses.end()) {
+      Instruction *BaseInst = BaseIt->first;
+      Instruction *RootInst = RootIt->first;
+
+      // Skip over the IV or root instructions; only match their users.
+      bool Continue = false;
+      if (isBaseInst(BaseInst)) {
+        Visited.insert(BaseInst);
+        BaseIt = nextInstr(0, Uses, Visited);
+        Continue = true;
+      }
+      if (isRootInst(RootInst)) {
+        LastRootIt = RootIt;
+        Visited.insert(RootInst);
+        RootIt = nextInstr(Iter, Uses, Visited);
+        Continue = true;
+      }
+      if (Continue) continue;
+
+      if (!BaseInst->isSameOperationAs(RootInst)) {
+        // Last chance saloon. We don't try and solve the full isomorphism
+        // problem, but try and at least catch the case where two instructions
+        // *of different types* are round the wrong way. We won't be able to
+        // efficiently tell, given two ADD instructions, which way around we
+        // should match them, but given an ADD and a SUB, we can at least infer
+        // which one is which.
+        //
+        // This should allow us to deal with a greater subset of the isomorphism
+        // problem. It does however change a linear algorithm into a quadratic
+        // one, so limit the number of probes we do.
+        auto TryIt = RootIt;
+        unsigned N = NumToleratedFailedMatches;
+        while (TryIt != Uses.end() &&
+               !BaseInst->isSameOperationAs(TryIt->first) &&
+               N--) {
+          ++TryIt;
+          TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
         }
 
-        ++J2;
+        if (TryIt == Uses.end() || TryIt == RootIt ||
+            instrDependsOn(TryIt->first, RootIt, TryIt)) {
+          DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                " vs. " << *RootInst << "\n");
+          return false;
+        }
+        
+        RootIt = TryIt;
+        RootInst = TryIt->first;
       }
 
-      if (!J1->isSameOperationAs(J2)) {
-        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                        " vs. " << *J2 << "\n");
-        MatchFailed = true;
-        break;
+      // All instructions between the last root and this root
+      // may belong to some other iteration. If they belong to a 
+      // future iteration, then they're dangerous to alias with.
+      // 
+      // Note that because we allow a limited amount of flexibility in the order
+      // that we visit nodes, LastRootIt might be *before* RootIt, in which
+      // case we've already checked this set of instructions so we shouldn't
+      // do anything.
+      for (; LastRootIt < RootIt; ++LastRootIt) {
+        Instruction *I = LastRootIt->first;
+        if (LastRootIt->second.find_first() < (int)Iter)
+          continue;
+        if (I->mayWriteToMemory())
+          AST.add(I);
+        // Note: This is specifically guarded by a check on isa<PHINode>,
+        // which while a valid (somewhat arbitrary) micro-optimization, is
+        // needed because otherwise isSafeToSpeculativelyExecute returns
+        // false on PHI nodes.
+        if (!isa<PHINode>(I) && !isSimpleLoadStore(I) &&
+            !isSafeToSpeculativelyExecute(I, DL))
+          // Intervening instructions cause side effects.
+          FutureSideEffects = true;
       }
 
       // Make sure that this instruction, which is in the use set of this
       // root instruction, does not also belong to the base set or the set of
-      // some previous root instruction.
-      if (BaseUseSet.count(J2) || AllRootUses.count(J2)) {
-        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                        " vs. " << *J2 << " (prev. case overlap)\n");
-        MatchFailed = true;
-        break;
+      // some other root instruction.
+      if (RootIt->second.count() > 1) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                        " vs. " << *RootInst << " (prev. case overlap)\n");
+        return false;
       }
 
       // Make sure that we don't alias with any instruction in the alias set
       // tracker. If we do, then we depend on a future iteration, and we
       // can't reroll.
-      if (J2->mayReadFromMemory()) {
-        for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end();
-             K != KE && !MatchFailed; ++K) {
-          if (K->aliasesUnknownInst(J2, *AA)) {
-            DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                            " vs. " << *J2 << " (depends on future store)\n");
-            MatchFailed = true;
-            break;
+      if (RootInst->mayReadFromMemory())
+        for (auto &K : AST) {
+          if (K.aliasesUnknownInst(RootInst, *AA)) {
+            DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                            " vs. " << *RootInst << " (depends on future store)\n");
+            return false;
           }
         }
-      }
 
       // If we've past an instruction from a future iteration that may have
       // side effects, and this instruction might also, then we can't reorder
       // them, and this matching fails. As an exception, we allow the alias
       // set tracker to handle regular (simple) load/store dependencies.
       if (FutureSideEffects &&
-            ((!isSimpleLoadStore(J1) &&
-              !isSafeToSpeculativelyExecute(J1, DL)) ||
-             (!isSimpleLoadStore(J2) &&
-              !isSafeToSpeculativelyExecute(J2, DL)))) {
-        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                        " vs. " << *J2 <<
+            ((!isSimpleLoadStore(BaseInst) &&
+              !isSafeToSpeculativelyExecute(BaseInst, DL)) ||
+             (!isSimpleLoadStore(RootInst) &&
+              !isSafeToSpeculativelyExecute(RootInst, DL)))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                        " vs. " << *RootInst <<
                         " (side effects prevent reordering)\n");
-        MatchFailed = true;
-        break;
+        return false;
       }
 
       // For instructions that are part of a reduction, if the operation is
@@ -942,42 +1182,46 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
       //   x += a[i]; x += b[i];
       //   x += a[i+1]; x += b[i+1];
       //   x += b[i+2]; x += a[i+2];
-      bool InReduction = Reductions.isPairInSame(J1, J2);
+      bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
 
-      if (!(InReduction && J1->isAssociative())) {
+      if (!(InReduction && BaseInst->isAssociative())) {
         bool Swapped = false, SomeOpMatched = false;
-        for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) {
-          Value *Op2 = J2->getOperand(j);
+        for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
+          Value *Op2 = RootInst->getOperand(j);
 
           // If this is part of a reduction (and the operation is not
           // associatve), then we match all operands, but not those that are
           // part of the reduction.
           if (InReduction)
             if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
-              if (Reductions.isPairInSame(J2, Op2I))
+              if (Reductions.isPairInSame(RootInst, Op2I))
                 continue;
 
           DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
-          if (BMI != BaseMap.end())
+          if (BMI != BaseMap.end()) {
             Op2 = BMI->second;
-          else if (std::find(Roots[i].begin(), Roots[i].end(),
-                             (Instruction*) Op2) != Roots[i].end())
-            Op2 = IV;
+          } else {
+            for (auto &DRS : RootSets) {
+              if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
+                Op2 = DRS.BaseInst;
+                break;
+              }
+            }
+          }
 
-          if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+          if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
             // If we've not already decided to swap the matched operands, and
             // we've not already matched our first operand (note that we could
             // have skipped matching the first operand because it is part of a
             // reduction above), and the instruction is commutative, then try
             // the swapped match.
-            if (!Swapped && J1->isCommutative() && !SomeOpMatched &&
-                J1->getOperand(!j) == Op2) {
+            if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
+                BaseInst->getOperand(!j) == Op2) {
               Swapped = true;
             } else {
-              DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                              " vs. " << *J2 << " (operand " << j << ")\n");
-              MatchFailed = true;
-              break;
+              DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+                    << " vs. " << *RootInst << " (operand " << j << ")\n");
+              return false;
             }
           }
 
@@ -985,81 +1229,41 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
         }
       }
 
-      if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) ||
-          (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) {
-        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                        " vs. " << *J2 << " (uses outside loop)\n");
-        MatchFailed = true;
-        break;
+      if ((!PossibleRedLastSet.count(BaseInst) &&
+           hasUsesOutsideLoop(BaseInst, L)) ||
+          (!PossibleRedLastSet.count(RootInst) &&
+           hasUsesOutsideLoop(RootInst, L))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                        " vs. " << *RootInst << " (uses outside loop)\n");
+        return false;
       }
 
-      if (!MatchFailed)
-        BaseMap.insert(std::pair<Value *, Value *>(J2, J1));
-
-      AllRootUses.insert(J2);
-      Reductions.recordPair(J1, J2, i+1);
+      Reductions.recordPair(BaseInst, RootInst, Iter);
+      BaseMap.insert(std::make_pair(RootInst, BaseInst));
 
-      ++J2;
+      LastRootIt = RootIt;
+      Visited.insert(BaseInst);
+      Visited.insert(RootInst);
+      BaseIt = nextInstr(0, Uses, Visited);
+      RootIt = nextInstr(Iter, Uses, Visited);
     }
+    assert (BaseIt == Uses.end() && RootIt == Uses.end() &&
+            "Mismatched set sizes!");
   }
 
-  if (MatchFailed)
-    return false;
-
   DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
-                  *RealIV << "\n");
-
-  DenseSet<Instruction *> LoopIncUseSet;
-  collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(),
-                       SmallInstructionSet(), LoopIncUseSet);
-  DEBUG(dbgs() << "LRR: Loop increment set size: " <<
-                  LoopIncUseSet.size() << "\n");
-
-  // Make sure that all instructions in the loop have been included in some
-  // use set.
-  for (BasicBlock::iterator J = Header->begin(), JE = Header->end();
-       J != JE; ++J) {
-    if (isa<DbgInfoIntrinsic>(J))
-      continue;
-    if (cast<Instruction>(J) == RealIV)
-      continue;
-    if (cast<Instruction>(J) == IV)
-      continue;
-    if (BaseUseSet.count(J) || AllRootUses.count(J) ||
-        (LoopIncUseSet.count(J) && (J->isTerminator() ||
-                                    isSafeToSpeculativelyExecute(J, DL))))
-      continue;
-
-    if (AllRoots.count(J))
-      continue;
-
-    if (Reductions.isSelectedPHI(J))
-      continue;
+                  *IV << "\n");
 
-    DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV <<
-                    " unprocessed instruction found: " << *J << "\n");
-    MatchFailed = true;
-    break;
-  }
-
-  if (MatchFailed)
-    return false;
-
-  DEBUG(dbgs() << "LRR: all instructions processed from " <<
-                  *RealIV << "\n");
-
-  if (!Reductions.validateSelected())
-    return false;
-
-  // At this point, we've validated the rerolling, and we're committed to
-  // making changes!
-
-  Reductions.replaceSelected();
+  return true;
+}
 
+void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
+  BasicBlock *Header = L->getHeader();
   // Remove instructions associated with non-base iterations.
   for (BasicBlock::reverse_iterator J = Header->rbegin();
        J != Header->rend();) {
-    if (AllRootUses.count(&*J)) {
+    unsigned I = Uses[&*J].find_first();
+    if (I > 0 && I < IL_All) {
       Instruction *D = &*J;
       DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
       D->eraseFromParent();
@@ -1069,57 +1273,198 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
     ++J;
   }
 
-  // Insert the new induction variable.
-  const SCEV *Start = RealIVSCEV->getStart();
-  if (Inc == 1)
-    Start = SE->getMulExpr(Start,
-                           SE->getConstant(Start->getType(), Scale));
-  const SCEVAddRecExpr *H =
-    cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start,
-                           SE->getConstant(RealIVSCEV->getType(), 1),
-                           L, SCEV::FlagAnyWrap));
-  { // Limit the lifetime of SCEVExpander.
-    SCEVExpander Expander(*SE, "reroll");
-    Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin());
-
-    for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(),
-         JE = BaseUseSet.end(); J != JE; ++J)
-      (*J)->replaceUsesOfWith(IV, NewIV);
-
-    if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
-      if (LoopIncUseSet.count(BI)) {
-        const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
-        if (Inc == 1)
-          ICSCEV =
-            SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale));
-        // Iteration count SCEV minus 1
-        const SCEV *ICMinus1SCEV =
-          SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1));
-
-        Value *ICMinus1; // Iteration count minus 1
-        if (isa<SCEVConstant>(ICMinus1SCEV)) {
-          ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI);
-        } else {
-          BasicBlock *Preheader = L->getLoopPreheader();
-          if (!Preheader)
-            Preheader = InsertPreheaderForLoop(L, this);
-
-          ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
-                                            Preheader->getTerminator());
-        }
+  // We need to create a new induction variable for each different BaseInst.
+  for (auto &DRS : RootSets) {
+    // Insert the new induction variable.
+    const SCEVAddRecExpr *RealIVSCEV =
+      cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+    const SCEV *Start = RealIVSCEV->getStart();
+    const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>
+      (SE->getAddRecExpr(Start,
+                         SE->getConstant(RealIVSCEV->getType(), 1),
+                         L, SCEV::FlagAnyWrap));
+    { // Limit the lifetime of SCEVExpander.
+      SCEVExpander Expander(*SE, "reroll");
+      Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin());
+
+      for (auto &KV : Uses) {
+        if (KV.second.find_first() == 0)
+          KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV);
+      }
 
-        Value *Cond =
+      if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
+        // FIXME: Why do we need this check?
+        if (Uses[BI].find_first() == IL_All) {
+          const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
+
+          // Iteration count SCEV minus 1
+          const SCEV *ICMinus1SCEV =
+            SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1));
+
+          Value *ICMinus1; // Iteration count minus 1
+          if (isa<SCEVConstant>(ICMinus1SCEV)) {
+            ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI);
+          } else {
+            BasicBlock *Preheader = L->getLoopPreheader();
+            if (!Preheader)
+              Preheader = InsertPreheaderForLoop(L, Parent);
+
+            ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
+                                              Preheader->getTerminator());
+          }
+
+          Value *Cond =
             new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond");
-        BI->setCondition(Cond);
+          BI->setCondition(Cond);
 
-        if (BI->getSuccessor(1) != Header)
-          BI->swapSuccessors();
+          if (BI->getSuccessor(1) != Header)
+            BI->swapSuccessors();
+        }
       }
     }
   }
 
   SimplifyInstructionsInBlock(Header, DL, TLI);
   DeleteDeadPHIs(Header, TLI);
+}
+
+// Validate the selected reductions. All iterations must have an isomorphic
+// part of the reduction chain and, for non-associative reductions, the chain
+// entries must appear in order.
+bool LoopReroll::ReductionTracker::validateSelected() {
+  // For a non-associative reduction, the chain entries must appear in order.
+  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+       RI != RIE; ++RI) {
+    int i = *RI;
+    int PrevIter = 0, BaseCount = 0, Count = 0;
+    for (Instruction *J : PossibleReds[i]) {
+      // Note that all instructions in the chain must have been found because
+      // all instructions in the function must have been assigned to some
+      // iteration.
+      int Iter = PossibleRedIter[J];
+      if (Iter != PrevIter && Iter != PrevIter + 1 &&
+          !PossibleReds[i].getReducedValue()->isAssociative()) {
+        DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
+                        J << "\n");
+        return false;
+      }
+
+      if (Iter != PrevIter) {
+        if (Count != BaseCount) {
+          DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
+                " reduction use count " << Count <<
+                " is not equal to the base use count " <<
+                BaseCount << "\n");
+          return false;
+        }
+
+        Count = 0;
+      }
+
+      ++Count;
+      if (Iter == 0)
+        ++BaseCount;
+
+      PrevIter = Iter;
+    }
+  }
+
+  return true;
+}
+
+// For all selected reductions, remove all parts except those in the first
+// iteration (and the PHI). Replace outside uses of the reduced value with uses
+// of the first-iteration reduced value (in other words, reroll the selected
+// reductions).
+void LoopReroll::ReductionTracker::replaceSelected() {
+  // Fixup reductions to refer to the last instruction associated with the
+  // first iteration (not the last).
+  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+       RI != RIE; ++RI) {
+    int i = *RI;
+    int j = 0;
+    for (int e = PossibleReds[i].size(); j != e; ++j)
+      if (PossibleRedIter[PossibleReds[i][j]] != 0) {
+        --j;
+        break;
+      }
+
+    // Replace users with the new end-of-chain value.
+    SmallInstructionVector Users;
+    for (User *U : PossibleReds[i].getReducedValue()->users()) {
+      Users.push_back(cast<Instruction>(U));
+    }
+
+    for (SmallInstructionVector::iterator J = Users.begin(),
+         JE = Users.end(); J != JE; ++J)
+      (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+                              PossibleReds[i][j]);
+  }
+}
+
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1                <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2                <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+// be intermixed with eachother. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+//   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+//     the next unmatched instruction in f(%iv.(i+1)).
+//   - Ensure that both matched instructions don't have any external users
+//     (with the exception of last-in-chain reduction instructions).
+//   - Track the (aliasing) write set, and other side effects, of all
+//     instructions that belong to future iterations that come before the matched
+//     instructions. If the matched instructions read from that write set, then
+//     f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+//     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+//     if any of these future instructions had side effects (could not be
+//     speculatively executed), and so do the matched instructions, when we
+//     cannot reorder those side-effect-producing instructions, and rerolling
+//     fails.
+//
+// Finally, we make sure that all loop instructions are either loop increment
+// roots, belong to simple latch code, parts of validated reductions, part of
+// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
+// have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+                        const SCEV *IterCount,
+                        ReductionTracker &Reductions) {
+  DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DL);
+
+  if (!DAGRoots.findRoots())
+    return false;
+  DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
+                  *IV << "\n");
+  
+  if (!DAGRoots.validate(Reductions))
+    return false;
+  if (!Reductions.validateSelected())
+    return false;
+  // At this point, we've validated the rerolling, and we're committed to
+  // making changes!
+
+  Reductions.replaceSelected();
+  DAGRoots.replace(IterCount);
+
   ++NumRerolledLoops;
   return true;
 }
@@ -1129,9 +1474,9 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
     return false;
 
   AA = &getAnalysis<AliasAnalysis>();
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   SE = &getAnalysis<ScalarEvolution>();
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index afd2eca..4d12349 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -13,7 +13,7 @@
 
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -54,16 +54,16 @@ namespace {
 
     // LCSSA form makes instruction renaming easier.
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<AssumptionTracker>();
+      AU.addRequired<AssumptionCacheTracker>();
       AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addRequired<LoopInfo>();
-      AU.addPreserved<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
       AU.addRequiredID(LCSSAID);
       AU.addPreservedID(LCSSAID);
       AU.addPreserved<ScalarEvolution>();
-      AU.addRequired<TargetTransformInfo>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
     }
 
     bool runOnLoop(Loop *L, LPPassManager &LPM) override;
@@ -74,15 +74,16 @@ namespace {
     unsigned MaxHeaderSize;
     LoopInfo *LI;
     const TargetTransformInfo *TTI;
-    AssumptionTracker *AT;
+    AssumptionCache *AC;
+    DominatorTree *DT;
   };
 }
 
 char LoopRotate::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
@@ -100,9 +101,13 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) {
   // Save the loop metadata.
   MDNode *LoopMD = L->getLoopID();
 
-  LI = &getAnalysis<LoopInfo>();
-  TTI = &getAnalysis<TargetTransformInfo>();
-  AT = &getAnalysis<AssumptionTracker>();
+  Function &F = *L->getHeader()->getParent();
+
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : nullptr;
 
   // Simplify the loop latch before attempting to rotate the header
   // upward. Rotation may not be needed if the loop tail can be folded into the
@@ -225,20 +230,17 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
     case Instruction::Shl:
     case Instruction::LShr:
     case Instruction::AShr: {
-      Value *IVOpnd = nullptr;
-      if (isa<ConstantInt>(I->getOperand(0)))
-        IVOpnd = I->getOperand(1);
-
-      if (isa<ConstantInt>(I->getOperand(1))) {
-        if (IVOpnd)
-          return false;
-
-        IVOpnd = I->getOperand(0);
-      }
+      Value *IVOpnd = !isa<Constant>(I->getOperand(0))
+                          ? I->getOperand(0)
+                          : !isa<Constant>(I->getOperand(1))
+                                ? I->getOperand(1)
+                                : nullptr;
+      if (!IVOpnd)
+        return false;
 
       // If increment operand is used outside of the loop, this speculation
       // could cause extra live range interference.
-      if (MultiExitLoop && IVOpnd) {
+      if (MultiExitLoop) {
         for (User *UseI : IVOpnd->users()) {
           auto *UserInst = cast<Instruction>(UseI);
           if (!L->contains(UserInst))
@@ -307,9 +309,8 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) {
   // Nuke the Latch block.
   assert(Latch->empty() && "unable to evacuate Latch");
   LI->removeBlock(Latch);
-  if (DominatorTreeWrapperPass *DTWP =
-          getAnalysisIfAvailable<DominatorTreeWrapperPass>())
-    DTWP->getDomTree().eraseNode(Latch);
+  if (DT)
+    DT->eraseNode(Latch);
   Latch->eraseFromParent();
   return true;
 }
@@ -356,7 +357,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
   // duplicate blocks inside it.
   {
     SmallPtrSet<const Value *, 32> EphValues;
-    CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
 
     CodeMetrics Metrics;
     Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
@@ -441,7 +442,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     // With the operands remapped, see if the instruction constant folds or is
     // otherwise simplifyable.  This commonly occurs because the entry from PHI
     // nodes allows icmps and other instructions to fold.
-    // FIXME: Provide DL, TLI, DT, AT to SimplifyInstruction.
+    // FIXME: Provide DL, TLI, DT, AC to SimplifyInstruction.
     Value *V = SimplifyInstruction(C);
     if (V && LI->replacementPreservesLCSSAForm(C, V)) {
       // If so, then delete the temporary instruction and stick the folded value
@@ -494,31 +495,31 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     // The conditional branch can't be folded, handle the general case.
     // Update DominatorTree to reflect the CFG change we just made.  Then split
     // edges as necessary to preserve LoopSimplify form.
-    if (DominatorTreeWrapperPass *DTWP =
-            getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
-      DominatorTree &DT = DTWP->getDomTree();
+    if (DT) {
       // Everything that was dominated by the old loop header is now dominated
       // by the original loop preheader. Conceptually the header was merged
       // into the preheader, even though we reuse the actual block as a new
       // loop latch.
-      DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader);
+      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
       SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
                                                    OrigHeaderNode->end());
-      DomTreeNode *OrigPreheaderNode = DT.getNode(OrigPreheader);
+      DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
       for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
-        DT.changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
+        DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
 
-      assert(DT.getNode(Exit)->getIDom() == OrigPreheaderNode);
-      assert(DT.getNode(NewHeader)->getIDom() == OrigPreheaderNode);
+      assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
+      assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
 
       // Update OrigHeader to be dominated by the new header block.
-      DT.changeImmediateDominator(OrigHeader, OrigLatch);
+      DT->changeImmediateDominator(OrigHeader, OrigLatch);
     }
 
     // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
     // thus is not a preheader anymore.
     // Split the edge to form a real preheader.
-    BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this);
+    BasicBlock *NewPH = SplitCriticalEdge(
+        OrigPreheader, NewHeader,
+        CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
     NewPH->setName(NewHeader->getName() + ".lr.ph");
 
     // Preserve canonical loop form, which means that 'Exit' should have only
@@ -534,8 +535,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       Loop *PredLoop = LI->getLoopFor(*PI);
       if (!PredLoop || PredLoop->contains(Exit))
         continue;
+      if (isa<IndirectBrInst>((*PI)->getTerminator()))
+        continue;
       SplitLatchEdge |= L->getLoopLatch() == *PI;
-      BasicBlock *ExitSplit = SplitCriticalEdge(*PI, Exit, this);
+      BasicBlock *ExitSplit = SplitCriticalEdge(
+          *PI, Exit, CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
       ExitSplit->moveBefore(Exit);
     }
     assert(SplitLatchEdge &&
@@ -549,17 +553,15 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     PHBI->eraseFromParent();
 
     // With our CFG finalized, update DomTree if it is available.
-    if (DominatorTreeWrapperPass *DTWP =
-            getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
-      DominatorTree &DT = DTWP->getDomTree();
+    if (DT) {
       // Update OrigHeader to be dominated by the new header block.
-      DT.changeImmediateDominator(NewHeader, OrigPreheader);
-      DT.changeImmediateDominator(OrigHeader, OrigLatch);
+      DT->changeImmediateDominator(NewHeader, OrigPreheader);
+      DT->changeImmediateDominator(OrigHeader, OrigLatch);
 
       // Brute force incremental dominator tree update. Call
       // findNearestCommonDominator on all CFG predecessors of each child of the
       // original header.
-      DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader);
+      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
       SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
                                                    OrigHeaderNode->end());
       bool Changed;
@@ -572,11 +574,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
           pred_iterator PI = pred_begin(BB);
           BasicBlock *NearestDom = *PI;
           for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
-            NearestDom = DT.findNearestCommonDominator(NearestDom, *PI);
+            NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
 
           // Remember if this changes the DomTree.
           if (Node->getIDom()->getBlock() != NearestDom) {
-            DT.changeImmediateDominator(BB, NearestDom);
+            DT->changeImmediateDominator(BB, NearestDom);
             Changed = true;
           }
         }
@@ -594,7 +596,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
   // the OrigHeader block into OrigLatch.  This will succeed if they are
   // connected by an unconditional branch.  This is just a cleanup so the
   // emitted code isn't too gross in this common case.
-  MergeBlockIntoPredecessor(OrigHeader, this);
+  MergeBlockIntoPredecessor(OrigHeader, DT, LI);
 
   DEBUG(dbgs() << "LoopRotation: into "; L->dump());
 
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 7b60373..318065e 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1327,11 +1327,9 @@ void LSRUse::DeleteFormula(Formula &F) {
 /// RecomputeRegs - Recompute the Regs field, and update RegUses.
 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
   // Now that we've filtered out some formulae, recompute the Regs set.
-  SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
+  SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
   Regs.clear();
-  for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
-       E = Formulae.end(); I != E; ++I) {
-    const Formula &F = *I;
+  for (const Formula &F : Formulae) {
     if (F.ScaledReg) Regs.insert(F.ScaledReg);
     Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
   }
@@ -4728,12 +4726,14 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
           // Split the critical edge.
           BasicBlock *NewBB = nullptr;
           if (!Parent->isLandingPad()) {
-            NewBB = SplitCriticalEdge(BB, Parent, P,
-                                      /*MergeIdenticalEdges=*/true,
-                                      /*DontDeleteUselessPhis=*/true);
+            NewBB = SplitCriticalEdge(BB, Parent,
+                                      CriticalEdgeSplittingOptions(&DT, &LI)
+                                          .setMergeIdenticalEdges()
+                                          .setDontDeleteUselessPHIs());
           } else {
             SmallVector<BasicBlock*, 2> NewBBs;
-            SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
+            SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs,
+                                        /*AliasAnalysis*/ nullptr, &DT, &LI);
             NewBB = NewBBs[0];
           }
           // If NewBB==NULL, then SplitCriticalEdge refused to split because all
@@ -4863,9 +4863,10 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
 LSRInstance::LSRInstance(Loop *L, Pass *P)
     : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()),
       DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()),
-      LI(P->getAnalysis<LoopInfo>()),
-      TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false),
-      IVIncInsertPos(nullptr) {
+      LI(P->getAnalysis<LoopInfoWrapperPass>().getLoopInfo()),
+      TTI(P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+          *L->getHeader()->getParent())),
+      L(L), Changed(false), IVIncInsertPos(nullptr) {
   // If LoopSimplify form is not available, stay out of trouble.
   if (!L->isLoopSimplifyForm())
     return;
@@ -5041,11 +5042,11 @@ private:
 char LoopStrengthReduce::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
                 "Loop Strength Reduction", false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(IVUsers)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
                 "Loop Strength Reduction", false, false)
@@ -5064,8 +5065,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   // many analyses if they are around.
   AU.addPreservedID(LoopSimplifyID);
 
-  AU.addRequired<LoopInfo>();
-  AU.addPreserved<LoopInfo>();
+  AU.addRequired<LoopInfoWrapperPass>();
+  AU.addPreserved<LoopInfoWrapperPass>();
   AU.addRequiredID(LoopSimplifyID);
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addPreserved<DominatorTreeWrapperPass>();
@@ -5076,7 +5077,7 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequiredID(LoopSimplifyID);
   AU.addRequired<IVUsers>();
   AU.addPreserved<IVUsers>();
-  AU.addRequired<TargetTransformInfo>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
 }
 
 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
@@ -5098,7 +5099,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
 #endif
     unsigned numFolded = Rewriter.replaceCongruentIVs(
         L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts,
-        &getAnalysis<TargetTransformInfo>());
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+            *L->getHeader()->getParent()));
     if (numFolded) {
       Changed = true;
       DeleteTriviallyDeadInstructions(DeadInsts);
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f60d990..924be16 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -13,11 +13,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/FunctionTargetTransformInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -28,6 +29,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include <climits>
 
 using namespace llvm;
@@ -38,6 +41,22 @@ static cl::opt<unsigned>
 UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden,
   cl::desc("The cut-off point for automatic loop unrolling"));
 
+static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
+    "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden,
+    cl::desc("Don't allow loop unrolling to simulate more than this number of"
+             "iterations when checking full unroll profitability"));
+
+static cl::opt<unsigned> UnrollMinPercentOfOptimized(
+    "unroll-percent-of-optimized-for-complete-unroll", cl::init(20), cl::Hidden,
+    cl::desc("If complete unrolling could trigger further optimizations, and, "
+             "by that, remove the given percent of instructions, perform the "
+             "complete unroll even if it's beyond the threshold"));
+
+static cl::opt<unsigned> UnrollAbsoluteThreshold(
+    "unroll-absolute-threshold", cl::init(2000), cl::Hidden,
+    cl::desc("Don't unroll if the unrolled size is bigger than this threshold,"
+             " even if we can remove big portion of instructions later."));
+
 static cl::opt<unsigned>
 UnrollCount("unroll-count", cl::init(0), cl::Hidden,
   cl::desc("Use this unroll count for all loops including those with "
@@ -63,11 +82,16 @@ namespace {
     static char ID; // Pass ID, replacement for typeid
     LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) {
       CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T);
+      CurrentAbsoluteThreshold = UnrollAbsoluteThreshold;
+      CurrentMinPercentOfOptimized = UnrollMinPercentOfOptimized;
       CurrentCount = (C == -1) ? UnrollCount : unsigned(C);
       CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P;
       CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R;
 
       UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0);
+      UserAbsoluteThreshold = (UnrollAbsoluteThreshold.getNumOccurrences() > 0);
+      UserPercentOfOptimized =
+          (UnrollMinPercentOfOptimized.getNumOccurrences() > 0);
       UserAllowPartial = (P != -1) ||
                          (UnrollAllowPartial.getNumOccurrences() > 0);
       UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0);
@@ -91,10 +115,16 @@ namespace {
 
     unsigned CurrentCount;
     unsigned CurrentThreshold;
+    unsigned CurrentAbsoluteThreshold;
+    unsigned CurrentMinPercentOfOptimized;
     bool     CurrentAllowPartial;
     bool     CurrentRuntime;
     bool     UserCount;            // CurrentCount is user-specified.
     bool     UserThreshold;        // CurrentThreshold is user-specified.
+    bool UserAbsoluteThreshold;    // CurrentAbsoluteThreshold is
+                                   // user-specified.
+    bool UserPercentOfOptimized;   // CurrentMinPercentOfOptimized is
+                                   // user-specified.
     bool     UserAllowPartial;     // CurrentAllowPartial is user-specified.
     bool     UserRuntime;          // CurrentRuntime is user-specified.
 
@@ -104,17 +134,16 @@ namespace {
     /// loop preheaders be inserted into the CFG...
     ///
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<AssumptionTracker>();
-      AU.addRequired<LoopInfo>();
-      AU.addPreserved<LoopInfo>();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
       AU.addRequiredID(LCSSAID);
       AU.addPreservedID(LCSSAID);
       AU.addRequired<ScalarEvolution>();
       AU.addPreserved<ScalarEvolution>();
-      AU.addRequired<TargetTransformInfo>();
-      AU.addRequired<FunctionTargetTransformInfo>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
       // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
       // If loop unroll does not preserve dom info then LCSSA pass on next
       // loop will receive invalid dom info.
@@ -124,9 +153,11 @@ namespace {
 
     // Fill in the UnrollingPreferences parameter with values from the
     // TargetTransformationInfo.
-    void getUnrollingPreferences(Loop *L, const FunctionTargetTransformInfo &FTTI,
+    void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI,
                                  TargetTransformInfo::UnrollingPreferences &UP) {
       UP.Threshold = CurrentThreshold;
+      UP.AbsoluteThreshold = CurrentAbsoluteThreshold;
+      UP.MinPercentOfOptimized = CurrentMinPercentOfOptimized;
       UP.OptSizeThreshold = OptSizeUnrollThreshold;
       UP.PartialThreshold = CurrentThreshold;
       UP.PartialOptSizeThreshold = OptSizeUnrollThreshold;
@@ -134,7 +165,7 @@ namespace {
       UP.MaxCount = UINT_MAX;
       UP.Partial = CurrentAllowPartial;
       UP.Runtime = CurrentRuntime;
-      FTTI.getUnrollingPreferences(L, UP);
+      TTI.getUnrollingPreferences(L, UP);
     }
 
     // Select and return an unroll count based on parameters from
@@ -153,18 +184,37 @@ namespace {
     // unrolled loops respectively.
     void selectThresholds(const Loop *L, bool HasPragma,
                           const TargetTransformInfo::UnrollingPreferences &UP,
-                          unsigned &Threshold, unsigned &PartialThreshold) {
+                          unsigned &Threshold, unsigned &PartialThreshold,
+                          unsigned NumberOfOptimizedInstructions) {
       // Determine the current unrolling threshold.  While this is
       // normally set from UnrollThreshold, it is overridden to a
       // smaller value if the current function is marked as
       // optimize-for-size, and the unroll threshold was not user
       // specified.
       Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
+
+      // If we are allowed to completely unroll if we can remove M% of
+      // instructions, and we know that with complete unrolling we'll be able
+      // to kill N instructions, then we can afford to completely unroll loops
+      // with unrolled size up to N*100/M.
+      // Adjust the threshold according to that:
+      unsigned PercentOfOptimizedForCompleteUnroll =
+          UserPercentOfOptimized ? CurrentMinPercentOfOptimized
+                                 : UP.MinPercentOfOptimized;
+      unsigned AbsoluteThreshold = UserAbsoluteThreshold
+                                       ? CurrentAbsoluteThreshold
+                                       : UP.AbsoluteThreshold;
+      if (PercentOfOptimizedForCompleteUnroll)
+        Threshold = std::max<unsigned>(Threshold,
+                                       NumberOfOptimizedInstructions * 100 /
+                                           PercentOfOptimizedForCompleteUnroll);
+      // But don't allow unrolling loops bigger than absolute threshold.
+      Threshold = std::min<unsigned>(Threshold, AbsoluteThreshold);
+
       PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold;
       if (!UserThreshold &&
-          L->getHeader()->getParent()->getAttributes().
-              hasAttribute(AttributeSet::FunctionIndex,
-                           Attribute::OptimizeForSize)) {
+          L->getHeader()->getParent()->hasFnAttribute(
+              Attribute::OptimizeForSize)) {
         Threshold = UP.OptSizeThreshold;
         PartialThreshold = UP.PartialOptSizeThreshold;
       }
@@ -185,10 +235,9 @@ namespace {
 
 char LoopUnroll::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(FunctionTargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
@@ -203,13 +252,333 @@ Pass *llvm::createSimpleLoopUnrollPass() {
   return llvm::createLoopUnrollPass(-1, -1, 0, 0);
 }
 
+static bool isLoadFromConstantInitializer(Value *V) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    if (GV->isConstant() && GV->hasDefinitiveInitializer())
+      return GV->getInitializer();
+  return false;
+}
+
+struct FindConstantPointers {
+  bool LoadCanBeConstantFolded;
+  bool IndexIsConstant;
+  APInt Step;
+  APInt StartValue;
+  Value *BaseAddress;
+  const Loop *L;
+  ScalarEvolution &SE;
+  FindConstantPointers(const Loop *loop, ScalarEvolution &SE)
+      : LoadCanBeConstantFolded(true), IndexIsConstant(true), L(loop), SE(SE) {}
+
+  bool follow(const SCEV *S) {
+    if (const SCEVUnknown *SC = dyn_cast<SCEVUnknown>(S)) {
+      // We've reached the leaf node of SCEV, it's most probably just a
+      // variable. Now it's time to see if it corresponds to a global constant
+      // global (in which case we can eliminate the load), or not.
+      BaseAddress = SC->getValue();
+      LoadCanBeConstantFolded =
+          IndexIsConstant && isLoadFromConstantInitializer(BaseAddress);
+      return false;
+    }
+    if (isa<SCEVConstant>(S))
+      return true;
+    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+      // If the current SCEV expression is AddRec, and its loop isn't the loop
+      // we are about to unroll, then we won't get a constant address after
+      // unrolling, and thus, won't be able to eliminate the load.
+      if (AR->getLoop() != L)
+        return IndexIsConstant = false;
+      // If the step isn't constant, we won't get constant addresses in unrolled
+      // version. Bail out.
+      if (const SCEVConstant *StepSE =
+              dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
+        Step = StepSE->getValue()->getValue();
+      else
+        return IndexIsConstant = false;
+
+      return IndexIsConstant;
+    }
+    // If Result is true, continue traversal.
+    // Otherwise, we have found something that prevents us from (possible) load
+    // elimination.
+    return IndexIsConstant;
+  }
+  bool isDone() const { return !IndexIsConstant; }
+};
+
+// This class is used to get an estimate of the optimization effects that we
+// could get from complete loop unrolling. It comes from the fact that some
+// loads might be replaced with concrete constant values and that could trigger
+// a chain of instruction simplifications.
+//
+// E.g. we might have:
+//   int a[] = {0, 1, 0};
+//   v = 0;
+//   for (i = 0; i < 3; i ++)
+//     v += b[i]*a[i];
+// If we completely unroll the loop, we would get:
+//   v = b[0]*a[0] + b[1]*a[1] + b[2]*a[2]
+// Which then will be simplified to:
+//   v = b[0]* 0 + b[1]* 1 + b[2]* 0
+// And finally:
+//   v = b[1]
+class UnrollAnalyzer : public InstVisitor<UnrollAnalyzer, bool> {
+  typedef InstVisitor<UnrollAnalyzer, bool> Base;
+  friend class InstVisitor<UnrollAnalyzer, bool>;
+
+  const Loop *L;
+  unsigned TripCount;
+  ScalarEvolution &SE;
+  const TargetTransformInfo &TTI;
+
+  DenseMap<Value *, Constant *> SimplifiedValues;
+  DenseMap<LoadInst *, Value *> LoadBaseAddresses;
+  SmallPtrSet<Instruction *, 32> CountedInstructions;
+
+  /// \brief Count the number of optimized instructions.
+  unsigned NumberOfOptimizedInstructions;
+
+  // Provide base case for our instruction visit.
+  bool visitInstruction(Instruction &I) { return false; };
+  // TODO: We should also visit ICmp, FCmp, GetElementPtr, Trunc, ZExt, SExt,
+  // FPTrunc, FPExt, FPToUI, FPToSI, UIToFP, SIToFP, BitCast, Select,
+  // ExtractElement, InsertElement, ShuffleVector, ExtractValue, InsertValue.
+  //
+  // Probaly it's worth to hoist the code for estimating the simplifications
+  // effects to a separate class, since we have a very similar code in
+  // InlineCost already.
+  bool visitBinaryOperator(BinaryOperator &I) {
+    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+    if (!isa<Constant>(LHS))
+      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+        LHS = SimpleLHS;
+    if (!isa<Constant>(RHS))
+      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+        RHS = SimpleRHS;
+    Value *SimpleV = nullptr;
+    if (auto FI = dyn_cast<FPMathOperator>(&I))
+      SimpleV =
+          SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags());
+    else
+      SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS);
+
+    if (SimpleV && CountedInstructions.insert(&I).second)
+      NumberOfOptimizedInstructions += TTI.getUserCost(&I);
+
+    if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
+      SimplifiedValues[&I] = C;
+      return true;
+    }
+    return false;
+  }
+
+  Constant *computeLoadValue(LoadInst *LI, unsigned Iteration) {
+    if (!LI)
+      return nullptr;
+    Value *BaseAddr = LoadBaseAddresses[LI];
+    if (!BaseAddr)
+      return nullptr;
+
+    auto GV = dyn_cast<GlobalVariable>(BaseAddr);
+    if (!GV)
+      return nullptr;
+
+    ConstantDataSequential *CDS =
+        dyn_cast<ConstantDataSequential>(GV->getInitializer());
+    if (!CDS)
+      return nullptr;
+
+    const SCEV *BaseAddrSE = SE.getSCEV(BaseAddr);
+    const SCEV *S = SE.getSCEV(LI->getPointerOperand());
+    const SCEV *OffSE = SE.getMinusSCEV(S, BaseAddrSE);
+
+    APInt StepC, StartC;
+    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffSE);
+    if (!AR)
+      return nullptr;
+
+    if (const SCEVConstant *StepSE =
+            dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
+      StepC = StepSE->getValue()->getValue();
+    else
+      return nullptr;
+
+    if (const SCEVConstant *StartSE = dyn_cast<SCEVConstant>(AR->getStart()))
+      StartC = StartSE->getValue()->getValue();
+    else
+      return nullptr;
+
+    unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
+    unsigned Start = StartC.getLimitedValue();
+    unsigned Step = StepC.getLimitedValue();
+
+    unsigned Index = (Start + Step * Iteration) / ElemSize;
+    if (Index >= CDS->getNumElements())
+      return nullptr;
+
+    Constant *CV = CDS->getElementAsConstant(Index);
+
+    return CV;
+  }
+
+public:
+  UnrollAnalyzer(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
+                 const TargetTransformInfo &TTI)
+      : L(L), TripCount(TripCount), SE(SE), TTI(TTI),
+        NumberOfOptimizedInstructions(0) {}
+
+  // Visit all loads the loop L, and for those that, after complete loop
+  // unrolling, would have a constant address and it will point to a known
+  // constant initializer, record its base address for future use.  It is used
+  // when we estimate number of potentially simplified instructions.
+  void findConstFoldableLoads() {
+    for (auto BB : L->getBlocks()) {
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+        if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+          if (!LI->isSimple())
+            continue;
+          Value *AddrOp = LI->getPointerOperand();
+          const SCEV *S = SE.getSCEV(AddrOp);
+          FindConstantPointers Visitor(L, SE);
+          SCEVTraversal<FindConstantPointers> T(Visitor);
+          T.visitAll(S);
+          if (Visitor.IndexIsConstant && Visitor.LoadCanBeConstantFolded) {
+            LoadBaseAddresses[LI] = Visitor.BaseAddress;
+          }
+        }
+      }
+    }
+  }
+
+  // Given a list of loads that could be constant-folded (LoadBaseAddresses),
+  // estimate number of optimized instructions after substituting the concrete
+  // values for the given Iteration. Also track how many instructions become
+  // dead through this process.
+  unsigned estimateNumberOfOptimizedInstructions(unsigned Iteration) {
+    // We keep a set vector for the worklist so that we don't wast space in the
+    // worklist queuing up the same instruction repeatedly. This can happen due
+    // to multiple operands being the same instruction or due to the same
+    // instruction being an operand of lots of things that end up dead or
+    // simplified.
+    SmallSetVector<Instruction *, 8> Worklist;
+
+    // Clear the simplified values and counts for this iteration.
+    SimplifiedValues.clear();
+    CountedInstructions.clear();
+    NumberOfOptimizedInstructions = 0;
+
+    // We start by adding all loads to the worklist.
+    for (auto &LoadDescr : LoadBaseAddresses) {
+      LoadInst *LI = LoadDescr.first;
+      SimplifiedValues[LI] = computeLoadValue(LI, Iteration);
+      if (CountedInstructions.insert(LI).second)
+        NumberOfOptimizedInstructions += TTI.getUserCost(LI);
+
+      for (User *U : LI->users())
+        Worklist.insert(cast<Instruction>(U));
+    }
+
+    // And then we try to simplify every user of every instruction from the
+    // worklist. If we do simplify a user, add it to the worklist to process
+    // its users as well.
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.pop_back_val();
+      if (!L->contains(I))
+        continue;
+      if (!visit(I))
+        continue;
+      for (User *U : I->users())
+        Worklist.insert(cast<Instruction>(U));
+    }
+
+    // Now that we know the potentially simplifed instructions, estimate number
+    // of instructions that would become dead if we do perform the
+    // simplification.
+
+    // The dead instructions are held in a separate set. This is used to
+    // prevent us from re-examining instructions and make sure we only count
+    // the benifit once. The worklist's internal set handles insertion
+    // deduplication.
+    SmallPtrSet<Instruction *, 16> DeadInstructions;
+
+    // Lambda to enque operands onto the worklist.
+    auto EnqueueOperands = [&](Instruction &I) {
+      for (auto *Op : I.operand_values())
+        if (auto *OpI = dyn_cast<Instruction>(Op))
+          if (!OpI->use_empty())
+            Worklist.insert(OpI);
+    };
+
+    // Start by initializing worklist with simplified instructions.
+    for (auto &FoldedKeyValue : SimplifiedValues)
+      if (auto *FoldedInst = dyn_cast<Instruction>(FoldedKeyValue.first)) {
+        DeadInstructions.insert(FoldedInst);
+
+        // Add each instruction operand of this dead instruction to the
+        // worklist.
+        EnqueueOperands(*FoldedInst);
+      }
+
+    // If a definition of an insn is only used by simplified or dead
+    // instructions, it's also dead. Check defs of all instructions from the
+    // worklist.
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.pop_back_val();
+      if (!L->contains(I))
+        continue;
+      if (DeadInstructions.count(I))
+        continue;
+
+      if (std::all_of(I->user_begin(), I->user_end(), [&](User *U) {
+            return DeadInstructions.count(cast<Instruction>(U));
+          })) {
+        NumberOfOptimizedInstructions += TTI.getUserCost(I);
+        DeadInstructions.insert(I);
+        EnqueueOperands(*I);
+      }
+    }
+    return NumberOfOptimizedInstructions;
+  }
+};
+
+// Complete loop unrolling can make some loads constant, and we need to know if
+// that would expose any further optimization opportunities.
+// This routine estimates this optimization effect and returns the number of
+// instructions, that potentially might be optimized away.
+static unsigned
+approximateNumberOfOptimizedInstructions(const Loop *L, ScalarEvolution &SE,
+                                         unsigned TripCount,
+                                         const TargetTransformInfo &TTI) {
+  if (!TripCount || !UnrollMaxIterationsCountToAnalyze)
+    return 0;
+
+  UnrollAnalyzer UA(L, TripCount, SE, TTI);
+  UA.findConstFoldableLoads();
+
+  // Estimate number of instructions, that could be simplified if we replace a
+  // load with the corresponding constant. Since the same load will take
+  // different values on different iterations, we have to go through all loop's
+  // iterations here. To limit ourselves here, we check only first N
+  // iterations, and then scale the found number, if necessary.
+  unsigned IterationsNumberForEstimate =
+      std::min<unsigned>(UnrollMaxIterationsCountToAnalyze, TripCount);
+  unsigned NumberOfOptimizedInstructions = 0;
+  for (unsigned i = 0; i < IterationsNumberForEstimate; ++i)
+    NumberOfOptimizedInstructions +=
+        UA.estimateNumberOfOptimizedInstructions(i);
+
+  NumberOfOptimizedInstructions *= TripCount / IterationsNumberForEstimate;
+
+  return NumberOfOptimizedInstructions;
+}
+
 /// ApproximateLoopSize - Approximate the size of the loop.
 static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
                                     bool &NotDuplicatable,
                                     const TargetTransformInfo &TTI,
-                                    AssumptionTracker *AT) {
+                                    AssumptionCache *AC) {
   SmallPtrSet<const Value *, 32> EphValues;
-  CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+  CodeMetrics::collectEphemeralValues(L, AC, EphValues);
 
   CodeMetrics Metrics;
   for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
@@ -222,8 +591,11 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
 
   // Don't allow an estimate of size zero.  This would allows unrolling of loops
   // with huge iteration counts, which is a compile time problem even if it's
-  // not a problem for code quality.
-  if (LoopSize == 0) LoopSize = 1;
+  // not a problem for code quality. Also, the code using this size may assume
+  // that each loop has at least three instructions (likely a conditional
+  // branch, a comparison feeding that branch, and some kind of loop increment
+  // feeding that comparison instruction).
+  LoopSize = std::max(LoopSize, 3u);
 
   return LoopSize;
 }
@@ -231,48 +603,31 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
 // Returns the loop hint metadata node with the given name (for example,
 // "llvm.loop.unroll.count").  If no such metadata node exists, then nullptr is
 // returned.
-static const MDNode *GetUnrollMetadata(const Loop *L, StringRef Name) {
-  MDNode *LoopID = L->getLoopID();
-  if (!LoopID)
-    return nullptr;
-
-  // First operand should refer to the loop id itself.
-  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
-  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
-  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
-    const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
-    if (!MD)
-      continue;
-
-    const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
-    if (!S)
-      continue;
-
-    if (Name.equals(S->getString()))
-      return MD;
-  }
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+  if (MDNode *LoopID = L->getLoopID())
+    return GetUnrollMetadata(LoopID, Name);
   return nullptr;
 }
 
 // Returns true if the loop has an unroll(full) pragma.
 static bool HasUnrollFullPragma(const Loop *L) {
-  return GetUnrollMetadata(L, "llvm.loop.unroll.full");
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
 }
 
 // Returns true if the loop has an unroll(disable) pragma.
 static bool HasUnrollDisablePragma(const Loop *L) {
-  return GetUnrollMetadata(L, "llvm.loop.unroll.disable");
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
 }
 
 // If loop has an unroll_count pragma return the (necessarily
 // positive) value from the pragma.  Otherwise return 0.
 static unsigned UnrollCountPragmaValue(const Loop *L) {
-  const MDNode *MD = GetUnrollMetadata(L, "llvm.loop.unroll.count");
+  MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
   if (MD) {
     assert(MD->getNumOperands() == 2 &&
            "Unroll count hint metadata should have two operands.");
-    unsigned Count = cast<ConstantInt>(MD->getOperand(1))->getZExtValue();
+    unsigned Count =
+        mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
     assert(Count >= 1 && "Unroll count must be positive.");
     return Count;
   }
@@ -288,9 +643,9 @@ static void SetLoopAlreadyUnrolled(Loop *L) {
   if (!LoopID) return;
 
   // First remove any existing loop unrolling metadata.
-  SmallVector<Value *, 4> Vals;
+  SmallVector<Metadata *, 4> MDs;
   // Reserve first location for self reference to the LoopID metadata node.
-  Vals.push_back(nullptr);
+  MDs.push_back(nullptr);
   for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
     bool IsUnrollMetadata = false;
     MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
@@ -298,17 +653,18 @@ static void SetLoopAlreadyUnrolled(Loop *L) {
       const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
       IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
     }
-    if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i));
+    if (!IsUnrollMetadata)
+      MDs.push_back(LoopID->getOperand(i));
   }
 
   // Add unroll(disable) metadata to disable future unrolling.
   LLVMContext &Context = L->getHeader()->getContext();
-  SmallVector<Value *, 1> DisableOperands;
+  SmallVector<Metadata *, 1> DisableOperands;
   DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
   MDNode *DisableNode = MDNode::get(Context, DisableOperands);
-  Vals.push_back(DisableNode);
+  MDs.push_back(DisableNode);
 
-  MDNode *NewLoopID = MDNode::get(Context, Vals);
+  MDNode *NewLoopID = MDNode::get(Context, MDs);
   // Set operand 0 to refer to the loop id itself.
   NewLoopID->replaceOperandWith(0, NewLoopID);
   L->setLoopID(NewLoopID);
@@ -358,12 +714,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   if (skipOptnoneFunction(L))
     return false;
 
-  LoopInfo *LI = &getAnalysis<LoopInfo>();
+  Function &F = *L->getHeader()->getParent();
+
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
-  const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
-  const FunctionTargetTransformInfo &FTTI =
-      getAnalysis<FunctionTargetTransformInfo>();
-  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+  const TargetTransformInfo &TTI =
+      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
 
   BasicBlock *Header = L->getHeader();
   DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName()
@@ -377,7 +734,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   bool HasPragma = PragmaFullUnroll || PragmaCount > 0;
 
   TargetTransformInfo::UnrollingPreferences UP;
-  getUnrollingPreferences(L, FTTI, UP);
+  getUnrollingPreferences(L, TTI, UP);
 
   // Find trip count and trip multiple if count is not available
   unsigned TripCount = 0;
@@ -402,9 +759,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   unsigned NumInlineCandidates;
   bool notDuplicatable;
   unsigned LoopSize =
-      ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, AT);
+      ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, &AC);
   DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
-  uint64_t UnrolledSize = (uint64_t)LoopSize * Count;
+
+  // When computing the unrolled size, note that the conditional branch on the
+  // backedge and the comparison feeding it are not replicated like the rest of
+  // the loop body (which is why 2 is subtracted).
+  uint64_t UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2;
   if (notDuplicatable) {
     DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable"
                  << " instructions.\n");
@@ -415,8 +776,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
     return false;
   }
 
+  unsigned NumberOfOptimizedInstructions =
+      approximateNumberOfOptimizedInstructions(L, *SE, TripCount, TTI);
+  DEBUG(dbgs() << "  Complete unrolling could save: "
+               << NumberOfOptimizedInstructions << "\n");
+
   unsigned Threshold, PartialThreshold;
-  selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold);
+  selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold,
+                   NumberOfOptimizedInstructions);
 
   // Given Count, TripCount and thresholds determine the type of
   // unrolling which is to be performed.
@@ -449,7 +816,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
     }
     if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) {
       // Reduce unroll count to be modulo of TripCount for partial unrolling.
-      Count = PartialThreshold / LoopSize;
+      Count = (std::max(PartialThreshold, 3u)-2) / (LoopSize-2);
       while (Count != 0 && TripCount % Count != 0)
         Count--;
     }
@@ -463,7 +830,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
     // the original count which satisfies the threshold limit.
     while (Count != 0 && UnrolledSize > PartialThreshold) {
       Count >>= 1;
-      UnrolledSize = LoopSize * Count;
+      UnrolledSize = (LoopSize-2) * Count + 2;
     }
     if (Count > UP.MaxCount)
       Count = UP.MaxCount;
@@ -509,7 +876,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
 
   // Unroll the loop.
   if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this,
-                  &LPM, AT))
+                  &LPM, &AC))
     return false;
 
   return true;
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index ef43483..987dc96 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -30,7 +30,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -105,7 +105,7 @@ namespace {
       // Analyze loop. Check its size, calculate is it possible to unswitch
       // it. Returns true if we can unswitch this loop.
       bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
-                     AssumptionTracker *AT);
+                     AssumptionCache *AC);
 
       // Clean all data related to given loop.
       void forgetLoop(const Loop *L);
@@ -128,7 +128,7 @@ namespace {
   class LoopUnswitch : public LoopPass {
     LoopInfo *LI;  // Loop information
     LPPassManager *LPM;
-    AssumptionTracker *AT;
+    AssumptionCache *AC;
 
     // LoopProcessWorklist - Used to check if second loop needs processing
     // after RewriteLoopBodyWithConditionConstant rewrites first loop.
@@ -167,16 +167,16 @@ namespace {
     /// loop preheaders be inserted into the CFG.
     ///
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<AssumptionTracker>();
+      AU.addRequired<AssumptionCacheTracker>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
-      AU.addRequired<LoopInfo>();
-      AU.addPreserved<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
       AU.addRequiredID(LCSSAID);
       AU.addPreservedID(LCSSAID);
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addPreserved<ScalarEvolution>();
-      AU.addRequired<TargetTransformInfo>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
     }
 
   private:
@@ -217,7 +217,7 @@ namespace {
 // Analyze loop. Check its size, calculate is it possible to unswitch
 // it. Returns true if we can unswitch this loop.
 bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
-                                AssumptionTracker *AT) {
+                                AssumptionCache *AC) {
 
   LoopPropsMapIt PropsIt;
   bool Inserted;
@@ -235,7 +235,7 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
     // This is a very ad-hoc heuristic.
 
     SmallPtrSet<const Value *, 32> EphValues;
-    CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
 
     // FIXME: This is overly conservative because it does not take into
     // consideration code simplification opportunities and code that can
@@ -333,10 +333,10 @@ void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
 char LoopUnswitch::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
                       false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
                       false, false)
@@ -385,8 +385,9 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
   if (skipOptnoneFunction(L))
     return false;
 
-  AT = &getAnalysis<AssumptionTracker>();
-  LI = &getAnalysis<LoopInfo>();
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+      *L->getHeader()->getParent());
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   LPM = &LPM_Ref;
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
@@ -431,8 +432,10 @@ bool LoopUnswitch::processCurrentLoop() {
 
   // Probably we reach the quota of branches for this loop. If so
   // stop unswitching.
-  if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>(),
-                              AT))
+  if (!BranchesInfo.countLoop(
+          currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+                           *currentLoop->getHeader()->getParent()),
+          AC))
     return false;
 
   // Loop over all of the basic blocks in the loop.  If we find an interior
@@ -654,9 +657,7 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) {
   // Check to see if it would be profitable to unswitch current loop.
 
   // Do not do non-trivial unswitch while optimizing for size.
-  if (OptimizeForSize ||
-      F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                      Attribute::OptimizeForSize))
+  if (OptimizeForSize || F->hasFnAttribute(Attribute::OptimizeForSize))
     return false;
 
   UnswitchNontrivialCondition(LoopCond, Val, currentLoop);
@@ -674,7 +675,7 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
   for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
        I != E; ++I)
     if (LI->getLoopFor(*I) == L)
-      New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), LI->getBase());
+      New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
 
   // Add all of the subloops to the new loop.
   for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
@@ -705,8 +706,9 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
 
   // If either edge is critical, split it. This helps preserve LoopSimplify
   // form for enclosing loops.
-  SplitCriticalEdge(BI, 0, this, false, false, true);
-  SplitCriticalEdge(BI, 1, this, false, false, true);
+  auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA();
+  SplitCriticalEdge(BI, 0, Options);
+  SplitCriticalEdge(BI, 1, Options);
 }
 
 /// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable
@@ -725,7 +727,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond,
   // First step, split the preheader, so that we know that there is a safe place
   // to insert the conditional branch.  We will change loopPreheader to have a
   // conditional branch on Cond.
-  BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, this);
+  BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, DT, LI);
 
   // Now that we have a place to insert the conditional branch, create a place
   // to branch to: this is the exit block out of the loop that we should
@@ -736,7 +738,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond,
   // without actually branching to it (the exit block should be dominated by the
   // loop header, not the preheader).
   assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
-  BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this);
+  BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), DT, LI);
 
   // Okay, now we have a position to branch from and a position to branch to,
   // insert the new conditional branch.
@@ -767,13 +769,9 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
 
     // Although SplitBlockPredecessors doesn't preserve loop-simplify in
     // general, if we call it on all predecessors of all exits then it does.
-    if (!ExitBlock->isLandingPad()) {
-      SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", this);
-    } else {
-      SmallVector<BasicBlock*, 2> NewBBs;
-      SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa",
-                                  this, NewBBs);
-    }
+    SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa",
+                           /*AliasAnalysis*/ nullptr, DT, LI,
+                           /*PreserveLCSSA*/ true);
   }
 }
 
@@ -796,7 +794,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
 
   // First step, split the preheader and exit blocks, and add these blocks to
   // the LoopBlocks list.
-  BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, this);
+  BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, DT, LI);
   LoopBlocks.push_back(NewPreheader);
 
   // We want the loop to come after the preheader, but before the exit blocks.
@@ -836,7 +834,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
 
   // FIXME: We could register any cloned assumptions instead of clearing the
   // whole function's cache.
-  AT->forgetCachedAssumptions(F);
+  AC->clear();
 
   // Now we create the new Loop object for the versioned loop.
   Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
@@ -849,14 +847,14 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
   if (ParentLoop) {
     // Make sure to add the cloned preheader and exit blocks to the parent loop
     // as well.
-    ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase());
+    ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
   }
 
   for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
     BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]);
     // The new exit block should be in the same loop as the old one.
     if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
-      ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase());
+      ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
 
     assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
            "Exit block should have been split to have one successor!");
@@ -1042,7 +1040,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
     // and hooked up so as to preserve the loop structure, because
     // trying to update it is complicated.  So instead we preserve the
     // loop structure and put the block on a dead code path.
-    SplitEdge(Switch, SISucc, this);
+    SplitEdge(Switch, SISucc, DT, LI);
     // Compute the successors instead of relying on the return value
     // of SplitEdge, since it may have split the switch successor
     // after PHI nodes.
diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
new file mode 100644
index 0000000..0c47cbd
--- /dev/null
+++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -0,0 +1,192 @@
+//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the 'expect' intrinsic to LLVM metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-expect-intrinsic"
+
+STATISTIC(ExpectIntrinsicsHandled,
+          "Number of 'expect' intrinsic instructions handled");
+
+static cl::opt<uint32_t>
+LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64),
+                   cl::desc("Weight of the branch likely to be taken (default = 64)"));
+static cl::opt<uint32_t>
+UnlikelyBranchWeight("unlikely-branch-weight", cl::Hidden, cl::init(4),
+                   cl::desc("Weight of the branch unlikely to be taken (default = 4)"));
+
+static bool handleSwitchExpect(SwitchInst &SI) {
+  CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
+  if (!CI)
+    return false;
+
+  Function *Fn = CI->getCalledFunction();
+  if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+    return false;
+
+  Value *ArgValue = CI->getArgOperand(0);
+  ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+  if (!ExpectedValue)
+    return false;
+
+  SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue);
+  unsigned n = SI.getNumCases(); // +1 for default case.
+  SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
+
+  if (Case == SI.case_default())
+    Weights[0] = LikelyBranchWeight;
+  else
+    Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
+
+  SI.setMetadata(LLVMContext::MD_prof,
+                 MDBuilder(CI->getContext()).createBranchWeights(Weights));
+
+  SI.setCondition(ArgValue);
+  return true;
+}
+
+static bool handleBranchExpect(BranchInst &BI) {
+  if (BI.isUnconditional())
+    return false;
+
+  // Handle non-optimized IR code like:
+  //   %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
+  //   %tobool = icmp ne i64 %expval, 0
+  //   br i1 %tobool, label %if.then, label %if.end
+  //
+  // Or the following simpler case:
+  //   %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
+  //   br i1 %expval, label %if.then, label %if.end
+
+  CallInst *CI;
+
+  ICmpInst *CmpI = dyn_cast<ICmpInst>(BI.getCondition());
+  if (!CmpI) {
+    CI = dyn_cast<CallInst>(BI.getCondition());
+  } else {
+    if (CmpI->getPredicate() != CmpInst::ICMP_NE)
+      return false;
+    CI = dyn_cast<CallInst>(CmpI->getOperand(0));
+  }
+
+  if (!CI)
+    return false;
+
+  Function *Fn = CI->getCalledFunction();
+  if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+    return false;
+
+  Value *ArgValue = CI->getArgOperand(0);
+  ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+  if (!ExpectedValue)
+    return false;
+
+  MDBuilder MDB(CI->getContext());
+  MDNode *Node;
+
+  // If expect value is equal to 1 it means that we are more likely to take
+  // branch 0, in other case more likely is branch 1.
+  if (ExpectedValue->isOne())
+    Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
+  else
+    Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
+
+  BI.setMetadata(LLVMContext::MD_prof, Node);
+
+  if (CmpI)
+    CmpI->setOperand(0, ArgValue);
+  else
+    BI.setCondition(ArgValue);
+  return true;
+}
+
+static bool lowerExpectIntrinsic(Function &F) {
+  bool Changed = false;
+
+  for (BasicBlock &BB : F) {
+    // Create "block_weights" metadata.
+    if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+      if (handleBranchExpect(*BI))
+        ExpectIntrinsicsHandled++;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
+      if (handleSwitchExpect(*SI))
+        ExpectIntrinsicsHandled++;
+    }
+
+    // remove llvm.expect intrinsics.
+    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+      CallInst *CI = dyn_cast<CallInst>(BI++);
+      if (!CI)
+        continue;
+
+      Function *Fn = CI->getCalledFunction();
+      if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
+        Value *Exp = CI->getArgOperand(0);
+        CI->replaceAllUsesWith(Exp);
+        CI->eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F) {
+  if (lowerExpectIntrinsic(F))
+    return PreservedAnalyses::none();
+
+  return PreservedAnalyses::all();
+}
+
+namespace {
+/// \brief Legacy pass for lowering expect intrinsics out of the IR.
+///
+/// When this pass is run over a function it uses expect intrinsics which feed
+/// branches and switches to provide branch weight metadata for those
+/// terminators. It then removes the expect intrinsics from the IR so the rest
+/// of the optimizer can ignore them.
+class LowerExpectIntrinsic : public FunctionPass {
+public:
+  static char ID;
+  LowerExpectIntrinsic() : FunctionPass(ID) {
+    initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); }
+};
+}
+
+char LowerExpectIntrinsic::ID = 0;
+INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect",
+                "Lower 'expect' Intrinsics", false, false)
+
+FunctionPass *llvm::createLowerExpectIntrinsicPass() {
+  return new LowerExpectIntrinsic();
+}
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index be524be..006b885 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -16,7 +16,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
@@ -28,7 +28,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <list>
 using namespace llvm;
@@ -330,11 +330,11 @@ namespace {
     // This transformation requires dominator postdominator info
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
-      AU.addRequired<AssumptionTracker>();
+      AU.addRequired<AssumptionCacheTracker>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<MemoryDependenceAnalysis>();
       AU.addRequired<AliasAnalysis>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
       AU.addPreserved<AliasAnalysis>();
       AU.addPreserved<MemoryDependenceAnalysis>();
     }
@@ -363,10 +363,10 @@ FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
 
 INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
                       false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
                     false, false)
@@ -750,6 +750,16 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
   // its dependence information by changing its parameter.
   MD->removeInstruction(C);
 
+  // Update AA metadata
+  // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
+  // handled here, but combineMetadata doesn't support them yet
+  unsigned KnownIDs[] = {
+    LLVMContext::MD_tbaa,
+    LLVMContext::MD_alias_scope,
+    LLVMContext::MD_noalias,
+  };
+  combineMetadata(C, cpy, KnownIDs);
+
   // Remove the memcpy.
   MD->removeInstruction(cpy);
   ++NumMemCpyInstr;
@@ -982,11 +992,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
 
   // If it is greater than the memcpy, then we check to see if we can force the
   // source of the memcpy to the alignment we need.  If we fail, we bail out.
-  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+  AssumptionCache &AC =
+      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+          *CS->getParent()->getParent());
   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   if (MDep->getAlignment() < ByValAlign &&
-      getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign,
-                                 DL, AT, CS.getInstruction(), &DT) < ByValAlign)
+      getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &AC,
+                                 CS.getInstruction(), &DT) < ByValAlign)
     return false;
 
   // Verify that the copied-from memory doesn't change in between the memcpy and
@@ -1067,7 +1079,7 @@ bool MemCpyOpt::runOnFunction(Function &F) {
   MD = &getAnalysis<MemoryDependenceAnalysis>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   // If we don't have at least memset and memcpy, there is little point of doing
   // anything here.  These are required by a freestanding implementation, so if
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 8281c59..8fad63f 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -86,7 +86,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <vector>
@@ -115,7 +115,7 @@ public:
 private:
   // This transformation requires dominator postdominator info
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetLibraryInfo>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<MemoryDependenceAnalysis>();
     AU.addRequired<AliasAnalysis>();
     AU.addPreserved<AliasAnalysis>();
@@ -143,7 +143,9 @@ private:
   // Routines for sinking stores
   StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
   PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
-  bool isStoreSinkBarrier(Instruction *Inst);
+  bool isStoreSinkBarrierInRange(const Instruction& Start,
+                                 const Instruction& End,
+                                 AliasAnalysis::Location Loc);
   bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
   bool mergeStores(BasicBlock *BB);
   // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
@@ -166,7 +168,7 @@ FunctionPass *llvm::createMergedLoadStoreMotionPass() {
 INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion",
                       "MergedLoadStoreMotion", false, false)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion",
                     "MergedLoadStoreMotion", false, false)
@@ -239,7 +241,7 @@ bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start,
                                                       const Instruction& End,
                                                       LoadInst* LI) {
   AliasAnalysis::Location Loc = AA->getLocation(LI);
-  return AA->canInstructionRangeModify(Start, End, Loc);
+  return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod);
 }
 
 ///
@@ -389,26 +391,19 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
 }
 
 ///
-/// \brief True when instruction is sink barrier for a store
-/// 
-bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) {
-  // FIXME: Conservatively let a load instruction block the store.
-  // Use alias analysis instead.
-  if (isa<LoadInst>(Inst))
-    return true;
-  if (isa<CallInst>(Inst))
-    return true;
-  if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst))
-    return true;
-  // Note: mayHaveSideEffects covers all instructions that could
-  // trigger a change to state. Eg. in-flight stores have to be executed
-  // before ordered loads or fences, calls could invoke functions that store
-  // data to memory etc.
-  if (!isa<StoreInst>(Inst) && Inst->mayHaveSideEffects()) {
-    return true;
-  }
-  DEBUG(dbgs() << "No Sink Barrier\n");
-  return false;
+/// \brief True when instruction is a sink barrier for a store
+/// located in Loc
+///
+/// Whenever an instruction could possibly read or modify the
+/// value being stored or protect against the store from
+/// happening it is considered a sink barrier.
+///
+
+bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction& Start,
+                                                      const Instruction& End,
+                                                      AliasAnalysis::Location
+                                                      Loc) {
+  return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef);
 }
 
 ///
@@ -416,27 +411,30 @@ bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) {
 ///
 /// \return The store in \p  when it is safe to sink. Otherwise return Null.
 ///
-StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB,
-                                                   StoreInst *SI) {
-  StoreInst *I = 0;
-  DEBUG(dbgs() << "can Sink? : "; SI->dump(); dbgs() << "\n");
-  for (BasicBlock::reverse_iterator RBI = BB->rbegin(), RBE = BB->rend();
+StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
+                                                   StoreInst *Store0) {
+  DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+  BasicBlock *BB0 = Store0->getParent();
+  for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend();
        RBI != RBE; ++RBI) {
     Instruction *Inst = &*RBI;
 
-    // Only move loads if they are used in the block.
-    if (isStoreSinkBarrier(Inst))
-      break;
-    if (isa<StoreInst>(Inst)) {
-      AliasAnalysis::Location LocSI = AA->getLocation(SI);
-      AliasAnalysis::Location LocInst = AA->getLocation((StoreInst *)Inst);
-      if (AA->isMustAlias(LocSI, LocInst)) {
-        I = (StoreInst *)Inst;
-        break;
-      }
+    if (!isa<StoreInst>(Inst))
+       continue;
+
+    StoreInst *Store1 = cast<StoreInst>(Inst);
+
+    AliasAnalysis::Location Loc0 = AA->getLocation(Store0);
+    AliasAnalysis::Location Loc1 = AA->getLocation(Store1);
+    if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
+      !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))),
+                                 BB1->back(), Loc1) &&
+      !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))),
+                                 BB0->back(), Loc0)) {
+      return Store1;
     }
   }
-  return I;
+  return nullptr;
 }
 
 ///
@@ -548,8 +546,7 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
 
     Instruction *I = &*RBI;
     ++RBI;
-    if (isStoreSinkBarrier(I))
-      break;
+
     // Sink move non-simple (atomic, volatile) stores
     if (!isa<StoreInst>(I))
       continue;
diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 5c8bed5..31d7df3 100644
--- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -18,7 +18,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
@@ -52,16 +52,18 @@ INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls",
                 "Partially inline calls to library functions", false, false)
 
 void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<TargetLibraryInfo>();
-  AU.addRequired<TargetTransformInfo>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
   FunctionPass::getAnalysisUsage(AU);
 }
 
 bool PartiallyInlineLibCalls::runOnFunction(Function &F) {
   bool Changed = false;
   Function::iterator CurrBB;
-  TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
-  const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>();
+  TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  const TargetTransformInfo *TTI =
+      &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
     CurrBB = BB++;
 
@@ -126,7 +128,7 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
 
   // Move all instructions following Call to newly created block JoinBB.
   // Create phi and replace all uses.
-  BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this);
+  BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
   IRBuilder<> Builder(JoinBB, JoinBB->begin());
   PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
   Call->replaceAllUsesWith(Phi);
diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp
new file mode 100644
index 0000000..944725a
--- /dev/null
+++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -0,0 +1,989 @@
+//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Place garbage collection safepoints at appropriate locations in the IR. This
+// does not make relocation semantics or variable liveness explicit.  That's
+// done by RewriteStatepointsForGC.
+//
+// Terminology:
+// - A call is said to be "parseable" if there is a stack map generated for the
+// return PC of the call.  A runtime can determine where values listed in the
+// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located
+// on the stack when the code is suspended inside such a call.  Every parse
+// point is represented by a call wrapped in an gc.statepoint intrinsic.  
+// - A "poll" is an explicit check in the generated code to determine if the
+// runtime needs the generated code to cooperate by calling a helper routine
+// and thus suspending its execution at a known state. The call to the helper
+// routine will be parseable.  The (gc & runtime specific) logic of a poll is
+// assumed to be provided in a function of the name "gc.safepoint_poll".
+//
+// We aim to insert polls such that running code can quickly be brought to a
+// well defined state for inspection by the collector.  In the current
+// implementation, this is done via the insertion of poll sites at method entry
+// and the backedge of most loops.  We try to avoid inserting more polls than
+// are neccessary to ensure a finite period between poll sites.  This is not
+// because the poll itself is expensive in the generated code; it's not.  Polls
+// do tend to impact the optimizer itself in negative ways; we'd like to avoid
+// perturbing the optimization of the method as much as we can.
+//
+// We also need to make most call sites parseable.  The callee might execute a
+// poll (or otherwise be inspected by the GC).  If so, the entire stack
+// (including the suspended frame of the current method) must be parseable.
+//
+// This pass will insert:
+// - Call parse points ("call safepoints") for any call which may need to
+// reach a safepoint during the execution of the callee function.
+// - Backedge safepoint polls and entry safepoint polls to ensure that
+// executing code reaches a safepoint poll in a finite amount of time.
+//
+// We do not currently support return statepoints, but adding them would not
+// be hard.  They are not required for correctness - entry safepoints are an
+// alternative - but some GCs may prefer them.  Patches welcome.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "safepoint-placement"
+STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
+STATISTIC(NumCallSafepoints, "Number of call safepoints inserted");
+STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
+
+STATISTIC(CallInLoop, "Number of loops w/o safepoints due to calls in loop");
+STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution");
+
+using namespace llvm;
+
+// Ignore oppurtunities to avoid placing safepoints on backedges, useful for
+// validation
+static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
+                                  cl::init(false));
+
+/// If true, do not place backedge safepoints in counted loops.
+static cl::opt<bool> SkipCounted("spp-counted", cl::Hidden, cl::init(true));
+
+// If true, split the backedge of a loop when placing the safepoint, otherwise
+// split the latch block itself.  Both are useful to support for
+// experimentation, but in practice, it looks like splitting the backedge
+// optimizes better.
+static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
+                                   cl::init(false));
+
+// Print tracing output
+static cl::opt<bool> TraceLSP("spp-trace", cl::Hidden, cl::init(false));
+
+namespace {
+
+/** An analysis pass whose purpose is to identify each of the backedges in
+    the function which require a safepoint poll to be inserted. */
+struct PlaceBackedgeSafepointsImpl : public LoopPass {
+  static char ID;
+
+  /// The output of the pass - gives a list of each backedge (described by
+  /// pointing at the branch) which need a poll inserted.
+  std::vector<TerminatorInst *> PollLocations;
+
+  /// True unless we're running spp-no-calls in which case we need to disable
+  /// the call dependend placement opts.
+  bool CallSafepointsEnabled;
+  PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
+      : LoopPass(ID), CallSafepointsEnabled(CallSafepoints) {
+    initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnLoop(Loop *, LPPassManager &LPM) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // needed for determining if the loop is finite
+    AU.addRequired<ScalarEvolution>();
+    // to ensure each edge has a single backedge
+    // TODO: is this still required?
+    AU.addRequiredID(LoopSimplifyID);
+
+    // We no longer modify the IR at all in this pass.  Thus all
+    // analysis are preserved.
+    AU.setPreservesAll();
+  }
+};
+}
+
+static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false));
+
+namespace {
+struct PlaceSafepoints : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+
+  PlaceSafepoints() : ModulePass(ID) {
+    initializePlaceSafepointsPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnModule(Module &M) override {
+    bool modified = false;
+    for (Function &F : M) {
+      modified |= runOnFunction(F);
+    }
+    return modified;
+  }
+  bool runOnFunction(Function &F);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // We modify the graph wholesale (inlining, block insertion, etc).  We
+    // preserve nothing at the moment.  We could potentially preserve dom tree
+    // if that was worth doing
+  }
+};
+}
+
+// Insert a safepoint poll immediately before the given instruction.  Does
+// not handle the parsability of state at the runtime call, that's the
+// callers job.
+static void
+InsertSafepointPoll(DominatorTree &DT, Instruction *after,
+                    std::vector<CallSite> &ParsePointsNeeded /*rval*/);
+
+static bool isGCLeafFunction(const CallSite &CS);
+
+static bool needsStatepoint(const CallSite &CS) {
+  if (isGCLeafFunction(CS))
+    return false;
+  if (CS.isCall()) {
+    CallInst *call = cast<CallInst>(CS.getInstruction());
+    if (call->isInlineAsm())
+      return false;
+  }
+  if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)) {
+    return false;
+  }
+  return true;
+}
+
+static Value *ReplaceWithStatepoint(const CallSite &CS, Pass *P);
+
+/// Returns true if this loop is known to contain a call safepoint which
+/// must unconditionally execute on any iteration of the loop which returns
+/// to the loop header via an edge from Pred.  Returns a conservative correct
+/// answer; i.e. false is always valid.
+static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
+                                               BasicBlock *Pred,
+                                               DominatorTree &DT) {
+  // In general, we're looking for any cut of the graph which ensures
+  // there's a call safepoint along every edge between Header and Pred.
+  // For the moment, we look only for the 'cuts' that consist of a single call
+  // instruction in a block which is dominated by the Header and dominates the
+  // loop latch (Pred) block.  Somewhat surprisingly, walking the entire chain
+  // of such dominating blocks gets substaintially more occurences than just
+  // checking the Pred and Header blocks themselves.  This may be due to the
+  // density of loop exit conditions caused by range and null checks.
+  // TODO: structure this as an analysis pass, cache the result for subloops,
+  // avoid dom tree recalculations
+  assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?");
+
+  BasicBlock *Current = Pred;
+  while (true) {
+    for (Instruction &I : *Current) {
+      if (CallSite CS = &I)
+        // Note: Technically, needing a safepoint isn't quite the right
+        // condition here.  We should instead be checking if the target method
+        // has an
+        // unconditional poll. In practice, this is only a theoretical concern
+        // since we don't have any methods with conditional-only safepoint
+        // polls.
+        if (needsStatepoint(CS))
+          return true;
+    }
+
+    if (Current == Header)
+      break;
+    Current = DT.getNode(Current)->getIDom()->getBlock();
+  }
+
+  return false;
+}
+
+/// Returns true if this loop is known to terminate in a finite number of
+/// iterations.  Note that this function may return false for a loop which
+/// does actual terminate in a finite constant number of iterations due to
+/// conservatism in the analysis.
+static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
+                                    BasicBlock *Pred) {
+  // Only used when SkipCounted is off
+  const unsigned upperTripBound = 8192;
+
+  // A conservative bound on the loop as a whole.
+  const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L);
+  if (MaxTrips != SE->getCouldNotCompute()) {
+    if (SE->getUnsignedRange(MaxTrips).getUnsignedMax().ult(upperTripBound))
+      return true;
+    if (SkipCounted &&
+        SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(32))
+      return true;
+  }
+
+  // If this is a conditional branch to the header with the alternate path
+  // being outside the loop, we can ask questions about the execution frequency
+  // of the exit block.
+  if (L->isLoopExiting(Pred)) {
+    // This returns an exact expression only.  TODO: We really only need an
+    // upper bound here, but SE doesn't expose that.
+    const SCEV *MaxExec = SE->getExitCount(L, Pred);
+    if (MaxExec != SE->getCouldNotCompute()) {
+      if (SE->getUnsignedRange(MaxExec).getUnsignedMax().ult(upperTripBound))
+        return true;
+      if (SkipCounted &&
+          SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(32))
+        return true;
+    }
+  }
+
+  return /* not finite */ false;
+}
+
+static void scanOneBB(Instruction *start, Instruction *end,
+                      std::vector<CallInst *> &calls,
+                      std::set<BasicBlock *> &seen,
+                      std::vector<BasicBlock *> &worklist) {
+  for (BasicBlock::iterator itr(start);
+       itr != start->getParent()->end() && itr != BasicBlock::iterator(end);
+       itr++) {
+    if (CallInst *CI = dyn_cast<CallInst>(&*itr)) {
+      calls.push_back(CI);
+    }
+    // FIXME: This code does not handle invokes
+    assert(!dyn_cast<InvokeInst>(&*itr) &&
+           "support for invokes in poll code needed");
+    // Only add the successor blocks if we reach the terminator instruction
+    // without encountering end first
+    if (itr->isTerminator()) {
+      BasicBlock *BB = itr->getParent();
+      for (BasicBlock *Succ : successors(BB)) {
+        if (seen.count(Succ) == 0) {
+          worklist.push_back(Succ);
+          seen.insert(Succ);
+        }
+      }
+    }
+  }
+}
+static void scanInlinedCode(Instruction *start, Instruction *end,
+                            std::vector<CallInst *> &calls,
+                            std::set<BasicBlock *> &seen) {
+  calls.clear();
+  std::vector<BasicBlock *> worklist;
+  seen.insert(start->getParent());
+  scanOneBB(start, end, calls, seen, worklist);
+  while (!worklist.empty()) {
+    BasicBlock *BB = worklist.back();
+    worklist.pop_back();
+    scanOneBB(&*BB->begin(), end, calls, seen, worklist);
+  }
+}
+
+bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L, LPPassManager &LPM) {
+  ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
+
+  // Loop through all predecessors of the loop header and identify all
+  // backedges.  We need to place a safepoint on every backedge (potentially).
+  // Note: Due to LoopSimplify there should only be one.  Assert?  Or can we
+  // relax this?
+  BasicBlock *header = L->getHeader();
+
+  // TODO: Use the analysis pass infrastructure for this.  There is no reason
+  // to recalculate this here.
+  DominatorTree DT;
+  DT.recalculate(*header->getParent());
+
+  bool modified = false;
+  for (BasicBlock *pred : predecessors(header)) {
+    if (!L->contains(pred)) {
+      // This is not a backedge, it's coming from outside the loop
+      continue;
+    }
+
+    // Make a policy decision about whether this loop needs a safepoint or
+    // not.  Note that this is about unburdening the optimizer in loops, not
+    // avoiding the runtime cost of the actual safepoint.
+    if (!AllBackedges) {
+      if (mustBeFiniteCountedLoop(L, SE, pred)) {
+        if (TraceLSP)
+          errs() << "skipping safepoint placement in finite loop\n";
+        FiniteExecution++;
+        continue;
+      }
+      if (CallSafepointsEnabled &&
+          containsUnconditionalCallSafepoint(L, header, pred, DT)) {
+        // Note: This is only semantically legal since we won't do any further
+        // IPO or inlining before the actual call insertion..  If we hadn't, we
+        // might latter loose this call safepoint.
+        if (TraceLSP)
+          errs() << "skipping safepoint placement due to unconditional call\n";
+        CallInLoop++;
+        continue;
+      }
+    }
+
+    // TODO: We can create an inner loop which runs a finite number of
+    // iterations with an outer loop which contains a safepoint.  This would
+    // not help runtime performance that much, but it might help our ability to
+    // optimize the inner loop.
+
+    // We're unconditionally going to modify this loop.
+    modified = true;
+
+    // Safepoint insertion would involve creating a new basic block (as the
+    // target of the current backedge) which does the safepoint (of all live
+    // variables) and branches to the true header
+    TerminatorInst *term = pred->getTerminator();
+
+    if (TraceLSP) {
+      errs() << "[LSP] terminator instruction: ";
+      term->dump();
+    }
+
+    PollLocations.push_back(term);
+  }
+
+  return modified;
+}
+
+static Instruction *findLocationForEntrySafepoint(Function &F,
+                                                  DominatorTree &DT) {
+
+  // Conceptually, this poll needs to be on method entry, but in
+  // practice, we place it as late in the entry block as possible.  We
+  // can place it as late as we want as long as it dominates all calls
+  // that can grow the stack.  This, combined with backedge polls,
+  // give us all the progress guarantees we need.
+
+  // Due to the way the frontend generates IR, we may have a couple of initial
+  // basic blocks before the first bytecode.  These will be single-entry
+  // single-exit blocks which conceptually are just part of the first 'real
+  // basic block'.  Since we don't have deopt state until the first bytecode,
+  // walk forward until we've found the first unconditional branch or merge.
+
+  // hasNextInstruction and nextInstruction are used to iterate
+  // through a "straight line" execution sequence.
+
+  auto hasNextInstruction = [](Instruction *I) {
+    if (!I->isTerminator()) {
+      return true;
+    }
+    BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
+    return nextBB && (nextBB->getUniquePredecessor() != nullptr);
+  };
+
+  auto nextInstruction = [&hasNextInstruction](Instruction *I) {
+    assert(hasNextInstruction(I) &&
+           "first check if there is a next instruction!");
+    if (I->isTerminator()) {
+      return I->getParent()->getUniqueSuccessor()->begin();
+    } else {
+      return std::next(BasicBlock::iterator(I));
+    }
+  };
+
+  Instruction *cursor = nullptr;
+  for (cursor = F.getEntryBlock().begin(); hasNextInstruction(cursor);
+       cursor = nextInstruction(cursor)) {
+
+    // We need to stop going forward as soon as we see a call that can
+    // grow the stack (i.e. the call target has a non-zero frame
+    // size).
+    if (CallSite CS = cursor) {
+      (void)CS; // Silence an unused variable warning by gcc 4.8.2
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(cursor)) {
+        // llvm.assume(...) are not really calls.
+        if (II->getIntrinsicID() == Intrinsic::assume) {
+          continue;
+        }
+      }
+      break;
+    }
+  }
+
+  assert((hasNextInstruction(cursor) || cursor->isTerminator()) &&
+         "either we stopped because of a call, or because of terminator");
+
+  if (cursor->isTerminator()) {
+    return cursor;
+  }
+
+  BasicBlock *BB = cursor->getParent();
+  SplitBlock(BB, cursor, nullptr);
+
+  // Note: SplitBlock modifies the DT.  Simply passing a Pass (which is a
+  // module pass) is not enough.
+  DT.recalculate(F);
+#ifndef NDEBUG
+  // SplitBlock updates the DT
+  DT.verifyDomTree();
+#endif
+
+  return BB->getTerminator();
+}
+
+/// Identify the list of call sites which need to be have parseable state
+static void findCallSafepoints(Function &F,
+                               std::vector<CallSite> &Found /*rval*/) {
+  assert(Found.empty() && "must be empty!");
+  for (Instruction &I : inst_range(F)) {
+    Instruction *inst = &I;
+    if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) {
+      CallSite CS(inst);
+
+      // No safepoint needed or wanted
+      if (!needsStatepoint(CS)) {
+        continue;
+      }
+
+      Found.push_back(CS);
+    }
+  }
+}
+
+/// Implement a unique function which doesn't require we sort the input
+/// vector.  Doing so has the effect of changing the output of a couple of
+/// tests in ways which make them less useful in testing fused safepoints.
+template <typename T> static void unique_unsorted(std::vector<T> &vec) {
+  std::set<T> seen;
+  std::vector<T> tmp;
+  vec.reserve(vec.size());
+  std::swap(tmp, vec);
+  for (auto V : tmp) {
+    if (seen.insert(V).second) {
+      vec.push_back(V);
+    }
+  }
+}
+
+static std::string GCSafepointPollName("gc.safepoint_poll");
+
+static bool isGCSafepointPoll(Function &F) {
+  return F.getName().equals(GCSafepointPollName);
+}
+
+/// Returns true if this function should be rewritten to include safepoint
+/// polls and parseable call sites.  The main point of this function is to be
+/// an extension point for custom logic. 
+static bool shouldRewriteFunction(Function &F) {
+  // TODO: This should check the GCStrategy
+  if (F.hasGC()) {
+    const std::string StatepointExampleName("statepoint-example");
+    return StatepointExampleName == F.getGC();
+  } else
+    return false;
+}
+
+// TODO: These should become properties of the GCStrategy, possibly with
+// command line overrides.
+static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
+static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
+static bool enableCallSafepoints(Function &F) { return !NoCall; }
+
+
+bool PlaceSafepoints::runOnFunction(Function &F) {
+  if (F.isDeclaration() || F.empty()) {
+    // This is a declaration, nothing to do.  Must exit early to avoid crash in
+    // dom tree calculation
+    return false;
+  }
+
+  if (isGCSafepointPoll(F)) {
+    // Given we're inlining this inside of safepoint poll insertion, this
+    // doesn't make any sense.  Note that we do make any contained calls
+    // parseable after we inline a poll.  
+    return false;
+  }
+
+  if (!shouldRewriteFunction(F))
+    return false;
+
+  bool modified = false;
+
+  // In various bits below, we rely on the fact that uses are reachable from
+  // defs.  When there are basic blocks unreachable from the entry, dominance
+  // and reachablity queries return non-sensical results.  Thus, we preprocess
+  // the function to ensure these properties hold.
+  modified |= removeUnreachableBlocks(F);
+
+  // STEP 1 - Insert the safepoint polling locations.  We do not need to
+  // actually insert parse points yet.  That will be done for all polls and
+  // calls in a single pass.
+
+  // Note: With the migration, we need to recompute this for each 'pass'.  Once
+  // we merge these, we'll do it once before the analysis
+  DominatorTree DT;
+
+  std::vector<CallSite> ParsePointNeeded;
+
+  if (enableBackedgeSafepoints(F)) {
+    // Construct a pass manager to run the LoopPass backedge logic.  We
+    // need the pass manager to handle scheduling all the loop passes
+    // appropriately.  Doing this by hand is painful and just not worth messing
+    // with for the moment.
+    legacy::FunctionPassManager FPM(F.getParent());
+    bool CanAssumeCallSafepoints = enableCallSafepoints(F);
+    PlaceBackedgeSafepointsImpl *PBS =
+      new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
+    FPM.add(PBS);
+    // Note: While the analysis pass itself won't modify the IR, LoopSimplify
+    // (which it depends on) may.  i.e. analysis must be recalculated after run
+    FPM.run(F);
+
+    // We preserve dominance information when inserting the poll, otherwise
+    // we'd have to recalculate this on every insert
+    DT.recalculate(F);
+
+    // Insert a poll at each point the analysis pass identified
+    for (size_t i = 0; i < PBS->PollLocations.size(); i++) {
+      // We are inserting a poll, the function is modified
+      modified = true;
+
+      // The poll location must be the terminator of a loop latch block.
+      TerminatorInst *Term = PBS->PollLocations[i];
+
+      std::vector<CallSite> ParsePoints;
+      if (SplitBackedge) {
+        // Split the backedge of the loop and insert the poll within that new
+        // basic block.  This creates a loop with two latches per original
+        // latch (which is non-ideal), but this appears to be easier to
+        // optimize in practice than inserting the poll immediately before the
+        // latch test.
+
+        // Since this is a latch, at least one of the successors must dominate
+        // it. Its possible that we have a) duplicate edges to the same header
+        // and b) edges to distinct loop headers.  We need to insert pools on
+        // each. (Note: This still relies on LoopSimplify.)
+        DenseSet<BasicBlock *> Headers;
+        for (unsigned i = 0; i < Term->getNumSuccessors(); i++) {
+          BasicBlock *Succ = Term->getSuccessor(i);
+          if (DT.dominates(Succ, Term->getParent())) {
+            Headers.insert(Succ);
+          }
+        }
+        assert(!Headers.empty() && "poll location is not a loop latch?");
+
+        // The split loop structure here is so that we only need to recalculate
+        // the dominator tree once.  Alternatively, we could just keep it up to
+        // date and use a more natural merged loop.
+        DenseSet<BasicBlock *> SplitBackedges;
+        for (BasicBlock *Header : Headers) {
+          BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, nullptr);
+          SplitBackedges.insert(NewBB);
+        }
+        DT.recalculate(F);
+        for (BasicBlock *NewBB : SplitBackedges) {
+          InsertSafepointPoll(DT, NewBB->getTerminator(), ParsePoints);
+          NumBackedgeSafepoints++;
+        }
+
+      } else {
+        // Split the latch block itself, right before the terminator.
+        InsertSafepointPoll(DT, Term, ParsePoints);
+        NumBackedgeSafepoints++;
+      }
+
+      // Record the parse points for later use
+      ParsePointNeeded.insert(ParsePointNeeded.end(), ParsePoints.begin(),
+                              ParsePoints.end());
+    }
+  }
+
+  if (enableEntrySafepoints(F)) {
+    DT.recalculate(F);
+    Instruction *term = findLocationForEntrySafepoint(F, DT);
+    if (!term) {
+      // policy choice not to insert?
+    } else {
+      std::vector<CallSite> RuntimeCalls;
+      InsertSafepointPoll(DT, term, RuntimeCalls);
+      modified = true;
+      NumEntrySafepoints++;
+      ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(),
+                              RuntimeCalls.end());
+    }
+  }
+
+  if (enableCallSafepoints(F)) {
+    DT.recalculate(F);
+    std::vector<CallSite> Calls;
+    findCallSafepoints(F, Calls);
+    NumCallSafepoints += Calls.size();
+    ParsePointNeeded.insert(ParsePointNeeded.end(), Calls.begin(), Calls.end());
+  }
+
+  // Unique the vectors since we can end up with duplicates if we scan the call
+  // site for call safepoints after we add it for entry or backedge.  The
+  // only reason we need tracking at all is that some functions might have
+  // polls but not call safepoints and thus we might miss marking the runtime
+  // calls for the polls. (This is useful in test cases!)
+  unique_unsorted(ParsePointNeeded);
+
+  // Any parse point (no matter what source) will be handled here
+  DT.recalculate(F); // Needed?
+
+  // We're about to start modifying the function
+  if (!ParsePointNeeded.empty())
+    modified = true;
+
+  // Now run through and insert the safepoints, but do _NOT_ update or remove
+  // any existing uses.  We have references to live variables that need to
+  // survive to the last iteration of this loop.
+  std::vector<Value *> Results;
+  Results.reserve(ParsePointNeeded.size());
+  for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
+    CallSite &CS = ParsePointNeeded[i];
+    Value *GCResult = ReplaceWithStatepoint(CS, nullptr);
+    Results.push_back(GCResult);
+  }
+  assert(Results.size() == ParsePointNeeded.size());
+
+  // Adjust all users of the old call sites to use the new ones instead
+  for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
+    CallSite &CS = ParsePointNeeded[i];
+    Value *GCResult = Results[i];
+    if (GCResult) {
+      // In case if we inserted result in a different basic block than the
+      // original safepoint (this can happen for invokes). We need to be sure
+      // that
+      // original result value was not used in any of the phi nodes at the
+      // beginning of basic block with gc result. Because we know that all such
+      // blocks will have single predecessor we can safely assume that all phi
+      // nodes have single entry (because of normalizeBBForInvokeSafepoint).
+      // Just remove them all here.
+      if (CS.isInvoke()) {
+        FoldSingleEntryPHINodes(cast<Instruction>(GCResult)->getParent(),
+                                nullptr);
+        assert(
+            !isa<PHINode>(cast<Instruction>(GCResult)->getParent()->begin()));
+      }
+
+      // Replace all uses with the new call
+      CS.getInstruction()->replaceAllUsesWith(GCResult);
+    }
+
+    // Now that we've handled all uses, remove the original call itself
+    // Note: The insert point can't be the deleted instruction!
+    CS.getInstruction()->eraseFromParent();
+  }
+  return modified;
+}
+
+char PlaceBackedgeSafepointsImpl::ID = 0;
+char PlaceSafepoints::ID = 0;
+
+ModulePass *llvm::createPlaceSafepointsPass() { return new PlaceSafepoints(); }
+
+INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,
+                      "place-backedge-safepoints-impl",
+                      "Place Backedge Safepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl,
+                    "place-backedge-safepoints-impl",
+                    "Place Backedge Safepoints", false, false)
+
+INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+                      false, false)
+INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+                    false, false)
+
+static bool isGCLeafFunction(const CallSite &CS) {
+  Instruction *inst = CS.getInstruction();
+  if (isa<IntrinsicInst>(inst)) {
+    // Most LLVM intrinsics are things which can never take a safepoint.
+    // As a result, we don't need to have the stack parsable at the
+    // callsite.  This is a highly useful optimization since intrinsic
+    // calls are fairly prevelent, particularly in debug builds.
+    return true;
+  }
+
+  // If this function is marked explicitly as a leaf call, we don't need to
+  // place a safepoint of it.  In fact, for correctness we *can't* in many
+  // cases.  Note: Indirect calls return Null for the called function,
+  // these obviously aren't runtime functions with attributes
+  // TODO: Support attributes on the call site as well.
+  const Function *F = CS.getCalledFunction();
+  bool isLeaf =
+      F &&
+      F->getFnAttribute("gc-leaf-function").getValueAsString().equals("true");
+  if (isLeaf) {
+    return true;
+  }
+  return false;
+}
+
+static void
+InsertSafepointPoll(DominatorTree &DT, Instruction *term,
+                    std::vector<CallSite> &ParsePointsNeeded /*rval*/) {
+  Module *M = term->getParent()->getParent()->getParent();
+  assert(M);
+
+  // Inline the safepoint poll implementation - this will get all the branch,
+  // control flow, etc..  Most importantly, it will introduce the actual slow
+  // path call - where we need to insert a safepoint (parsepoint).
+  FunctionType *ftype =
+      FunctionType::get(Type::getVoidTy(M->getContext()), false);
+  assert(ftype && "null?");
+  // Note: This cast can fail if there's a function of the same name with a
+  // different type inserted previously
+  Function *F =
+      dyn_cast<Function>(M->getOrInsertFunction("gc.safepoint_poll", ftype));
+  assert(F && "void @gc.safepoint_poll() must be defined");
+  assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
+  CallInst *poll = CallInst::Create(F, "", term);
+
+  // Record some information about the call site we're replacing
+  BasicBlock *OrigBB = term->getParent();
+  BasicBlock::iterator before(poll), after(poll);
+  bool isBegin(false);
+  if (before == term->getParent()->begin()) {
+    isBegin = true;
+  } else {
+    before--;
+  }
+  after++;
+  assert(after != poll->getParent()->end() && "must have successor");
+  assert(DT.dominates(before, after) && "trivially true");
+
+  // do the actual inlining
+  InlineFunctionInfo IFI;
+  bool inlineStatus = InlineFunction(poll, IFI);
+  assert(inlineStatus && "inline must succeed");
+  (void)inlineStatus; // suppress warning in release-asserts
+
+  // Check post conditions
+  assert(IFI.StaticAllocas.empty() && "can't have allocs");
+
+  std::vector<CallInst *> calls; // new calls
+  std::set<BasicBlock *> BBs;    // new BBs + insertee
+  // Include only the newly inserted instructions, Note: begin may not be valid
+  // if we inserted to the beginning of the basic block
+  BasicBlock::iterator start;
+  if (isBegin) {
+    start = OrigBB->begin();
+  } else {
+    start = before;
+    start++;
+  }
+
+  // If your poll function includes an unreachable at the end, that's not
+  // valid.  Bugpoint likes to create this, so check for it.
+  assert(isPotentiallyReachable(&*start, &*after, nullptr, nullptr) &&
+         "malformed poll function");
+
+  scanInlinedCode(&*(start), &*(after), calls, BBs);
+
+  // Recompute since we've invalidated cached data.  Conceptually we
+  // shouldn't need to do this, but implementation wise we appear to.  Needed
+  // so we can insert safepoints correctly.
+  // TODO: update more cheaply
+  DT.recalculate(*after->getParent()->getParent());
+
+  assert(!calls.empty() && "slow path not found for safepoint poll");
+
+  // Record the fact we need a parsable state at the runtime call contained in
+  // the poll function.  This is required so that the runtime knows how to
+  // parse the last frame when we actually take  the safepoint (i.e. execute
+  // the slow path)
+  assert(ParsePointsNeeded.empty());
+  for (size_t i = 0; i < calls.size(); i++) {
+
+    // No safepoint needed or wanted
+    if (!needsStatepoint(calls[i])) {
+      continue;
+    }
+
+    // These are likely runtime calls.  Should we assert that via calling
+    // convention or something?
+    ParsePointsNeeded.push_back(CallSite(calls[i]));
+  }
+  assert(ParsePointsNeeded.size() <= calls.size());
+}
+
+// Normalize basic block to make it ready to be target of invoke statepoint.
+// It means spliting it to have single predecessor. Return newly created BB
+// ready to be successor of invoke statepoint.
+static BasicBlock *normalizeBBForInvokeSafepoint(BasicBlock *BB,
+                                                 BasicBlock *InvokeParent) {
+  BasicBlock *ret = BB;
+
+  if (!BB->getUniquePredecessor()) {
+    ret = SplitBlockPredecessors(BB, InvokeParent, "");
+  }
+
+  // Another requirement for such basic blocks is to not have any phi nodes.
+  // Since we just ensured that new BB will have single predecessor,
+  // all phi nodes in it will have one value. Here it would be naturall place
+  // to
+  // remove them all. But we can not do this because we are risking to remove
+  // one of the values stored in liveset of another statepoint. We will do it
+  // later after placing all safepoints.
+
+  return ret;
+}
+
+/// Replaces the given call site (Call or Invoke) with a gc.statepoint
+/// intrinsic with an empty deoptimization arguments list.  This does
+/// NOT do explicit relocation for GC support.
+static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */
+                                    Pass *P) {
+  BasicBlock *BB = CS.getInstruction()->getParent();
+  Function *F = BB->getParent();
+  Module *M = F->getParent();
+  assert(M && "must be set");
+
+  // TODO: technically, a pass is not allowed to get functions from within a
+  // function pass since it might trigger a new function addition.  Refactor
+  // this logic out to the initialization of the pass.  Doesn't appear to
+  // matter in practice.
+
+  // Then go ahead and use the builder do actually do the inserts.  We insert
+  // immediately before the previous instruction under the assumption that all
+  // arguments will be available here.  We can't insert afterwards since we may
+  // be replacing a terminator.
+  Instruction *insertBefore = CS.getInstruction();
+  IRBuilder<> Builder(insertBefore);
+
+  // Note: The gc args are not filled in at this time, that's handled by
+  // RewriteStatepointsForGC (which is currently under review).
+
+  // Create the statepoint given all the arguments
+  Instruction *token = nullptr;
+  AttributeSet return_attributes;
+  if (CS.isCall()) {
+    CallInst *toReplace = cast<CallInst>(CS.getInstruction());
+    CallInst *Call = Builder.CreateGCStatepoint(
+        CS.getCalledValue(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None,
+        None, "safepoint_token");
+    Call->setTailCall(toReplace->isTailCall());
+    Call->setCallingConv(toReplace->getCallingConv());
+
+    // Before we have to worry about GC semantics, all attributes are legal
+    AttributeSet new_attrs = toReplace->getAttributes();
+    // In case if we can handle this set of sttributes - set up function attrs
+    // directly on statepoint and return attrs later for gc_result intrinsic.
+    Call->setAttributes(new_attrs.getFnAttributes());
+    return_attributes = new_attrs.getRetAttributes();
+    // TODO: handle param attributes
+
+    token = Call;
+
+    // Put the following gc_result and gc_relocate calls immediately after the
+    // the old call (which we're about to delete)
+    BasicBlock::iterator next(toReplace);
+    assert(BB->end() != next && "not a terminator, must have next");
+    next++;
+    Instruction *IP = &*(next);
+    Builder.SetInsertPoint(IP);
+    Builder.SetCurrentDebugLocation(IP->getDebugLoc());
+
+  } else if (CS.isInvoke()) {
+    // TODO: make CreateGCStatepoint return an Instruction that we can cast to a
+    // Call or Invoke, instead of doing this junk here.
+
+    // Fill in the one generic type'd argument (the function is also
+    // vararg)
+    std::vector<Type *> argTypes;
+    argTypes.push_back(CS.getCalledValue()->getType());
+
+    Function *gc_statepoint_decl = Intrinsic::getDeclaration(
+        M, Intrinsic::experimental_gc_statepoint, argTypes);
+
+    // First, create the statepoint (with all live ptrs as arguments).
+    std::vector<llvm::Value *> args;
+    // target, #call args, unused, ... call parameters, #deopt args, ... deopt
+    // parameters, ... gc parameters
+    Value *Target = CS.getCalledValue();
+    args.push_back(Target);
+    int callArgSize = CS.arg_size();
+    // #call args
+    args.push_back(Builder.getInt32(callArgSize));
+    // unused
+    args.push_back(Builder.getInt32(0));
+    // call parameters
+    args.insert(args.end(), CS.arg_begin(), CS.arg_end());
+    // #deopt args: 0
+    args.push_back(Builder.getInt32(0));
+
+    InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction());
+
+    // Insert the new invoke into the old block.  We'll remove the old one in a
+    // moment at which point this will become the new terminator for the
+    // original block.
+    InvokeInst *invoke = InvokeInst::Create(
+        gc_statepoint_decl, toReplace->getNormalDest(),
+        toReplace->getUnwindDest(), args, "", toReplace->getParent());
+    invoke->setCallingConv(toReplace->getCallingConv());
+
+    // Currently we will fail on parameter attributes and on certain
+    // function attributes.
+    AttributeSet new_attrs = toReplace->getAttributes();
+    // In case if we can handle this set of sttributes - set up function attrs
+    // directly on statepoint and return attrs later for gc_result intrinsic.
+    invoke->setAttributes(new_attrs.getFnAttributes());
+    return_attributes = new_attrs.getRetAttributes();
+
+    token = invoke;
+
+    // We'll insert the gc.result into the normal block
+    BasicBlock *normalDest = normalizeBBForInvokeSafepoint(
+        toReplace->getNormalDest(), invoke->getParent());
+    Instruction *IP = &*(normalDest->getFirstInsertionPt());
+    Builder.SetInsertPoint(IP);
+  } else {
+    llvm_unreachable("unexpect type of CallSite");
+  }
+  assert(token);
+
+  // Handle the return value of the original call - update all uses to use a
+  // gc_result hanging off the statepoint node we just inserted
+
+  // Only add the gc_result iff there is actually a used result
+  if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) {
+    std::string takenName =
+      CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
+    CallInst *gc_result =
+        Builder.CreateGCResult(token, CS.getType(), takenName);
+    gc_result->setAttributes(return_attributes);
+    return gc_result;
+  } else {
+    // No return value for the call.
+    return nullptr;
+  }
+}
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 1bbaaf3..98016b4 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -917,10 +917,13 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
 /// version of the value is returned, and BI is left pointing at the instruction
 /// that should be processed next by the reassociation pass.
 static Value *NegateValue(Value *V, Instruction *BI) {
-  if (ConstantFP *C = dyn_cast<ConstantFP>(V))
-    return ConstantExpr::getFNeg(C);
-  if (Constant *C = dyn_cast<Constant>(V))
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    if (C->getType()->isFPOrFPVectorTy()) {
+      return ConstantExpr::getFNeg(C);
+    }
     return ConstantExpr::getNeg(C);
+  }
+
 
   // We are trying to expose opportunity for reassociation.  One of the things
   // that we want to do to achieve this is to push a negation as deep into an
@@ -1512,7 +1515,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
         ++NumFound;
       } while (i != Ops.size() && Ops[i].Op == TheOp);
 
-      DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
+      DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
       ++NumFactor;
 
       // Insert a new multiply.
@@ -1650,7 +1653,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
 
   // If any factor occurred more than one time, we can pull it out.
   if (MaxOcc > 1) {
-    DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
+    DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
     ++NumFactor;
 
     // Create a new instruction that uses the MaxOccVal twice.  If we don't do
@@ -1988,7 +1991,7 @@ Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) {
   Constant *C = C0 ? C0 : C1;
   unsigned ConstIdx = C0 ? 0 : 1;
   if (auto *CI = dyn_cast<ConstantInt>(C)) {
-    if (!CI->isNegative())
+    if (!CI->isNegative() || CI->isMinValue(true))
       return nullptr;
   } else if (auto *CF = dyn_cast<ConstantFP>(C)) {
     if (!CF->isNegative())
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
index b6023e2..1b46727 100644
--- a/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -73,7 +73,7 @@ bool RegToMem::runOnFunction(Function &F) {
 
   // Insert all new allocas into entry block.
   BasicBlock *BBEntry = &F.getEntryBlock();
-  assert(pred_begin(BBEntry) == pred_end(BBEntry) &&
+  assert(pred_empty(BBEntry) &&
          "Entry block to function must not have predecessors!");
 
   // Find first non-alloca instruction and create insertion point. This is
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
new file mode 100644
index 0000000..ca9ab54
--- /dev/null
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -0,0 +1,1897 @@
+//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Rewrite an existing set of gc.statepoints such that they make potential
+// relocations performed by the garbage collector explicit in the IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+#define DEBUG_TYPE "rewrite-statepoints-for-gc"
+
+using namespace llvm;
+
+// Print tracing output
+static cl::opt<bool> TraceLSP("trace-rewrite-statepoints", cl::Hidden,
+                              cl::init(false));
+
+// Print the liveset found at the insert location
+static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
+                                  cl::init(false));
+static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size",
+                                      cl::Hidden, cl::init(false));
+// Print out the base pointers for debugging
+static cl::opt<bool> PrintBasePointers("spp-print-base-pointers",
+                                       cl::Hidden, cl::init(false));
+
+namespace {
+struct RewriteStatepointsForGC : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+
+  RewriteStatepointsForGC() : FunctionPass(ID) {
+    initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // We add and rewrite a bunch of instructions, but don't really do much
+    // else.  We could in theory preserve a lot more analyses here.
+    AU.addRequired<DominatorTreeWrapperPass>();
+  }
+};
+} // namespace
+
+char RewriteStatepointsForGC::ID = 0;
+
+FunctionPass *llvm::createRewriteStatepointsForGCPass() {
+  return new RewriteStatepointsForGC();
+}
+
+INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+                      "Make relocations explicit at statepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+                    "Make relocations explicit at statepoints", false, false)
+
+namespace {
+// The type of the internal cache used inside the findBasePointers family
+// of functions.  From the callers perspective, this is an opaque type and
+// should not be inspected.
+//
+// In the actual implementation this caches two relations:
+// - The base relation itself (i.e. this pointer is based on that one)
+// - The base defining value relation (i.e. before base_phi insertion)
+// Generally, after the execution of a full findBasePointer call, only the
+// base relation will remain.  Internally, we add a mixture of the two
+// types, then update all the second type to the first type
+typedef DenseMap<Value *, Value *> DefiningValueMapTy;
+typedef DenseSet<llvm::Value *> StatepointLiveSetTy;
+
+struct PartiallyConstructedSafepointRecord {
+  /// The set of values known to be live accross this safepoint
+  StatepointLiveSetTy liveset;
+
+  /// Mapping from live pointers to a base-defining-value
+  DenseMap<llvm::Value *, llvm::Value *> PointerToBase;
+
+  /// Any new values which were added to the IR during base pointer analysis
+  /// for this safepoint
+  DenseSet<llvm::Value *> NewInsertedDefs;
+
+  /// The *new* gc.statepoint instruction itself.  This produces the token
+  /// that normal path gc.relocates and the gc.result are tied to.
+  Instruction *StatepointToken;
+
+  /// Instruction to which exceptional gc relocates are attached
+  /// Makes it easier to iterate through them during relocationViaAlloca.
+  Instruction *UnwindToken;
+};
+}
+
+// TODO: Once we can get to the GCStrategy, this becomes
+// Optional<bool> isGCManagedPointer(const Value *V) const override {
+
+static bool isGCPointerType(const Type *T) {
+  if (const PointerType *PT = dyn_cast<PointerType>(T))
+    // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+    // GC managed heap.  We know that a pointer into this heap needs to be
+    // updated and that no other pointer does.
+    return (1 == PT->getAddressSpace());
+  return false;
+}
+
+/// Return true if the Value is a gc reference type which is potentially used
+/// after the instruction 'loc'.  This is only used with the edge reachability
+/// liveness code.  Note: It is assumed the V dominates loc.
+static bool isLiveGCReferenceAt(Value &V, Instruction *loc, DominatorTree &DT,
+                                LoopInfo *LI) {
+  if (!isGCPointerType(V.getType()))
+    return false;
+
+  if (V.use_empty())
+    return false;
+
+  // Given assumption that V dominates loc, this may be live
+  return true;
+}
+
+#ifndef NDEBUG
+static bool isAggWhichContainsGCPtrType(Type *Ty) {
+  if (VectorType *VT = dyn_cast<VectorType>(Ty))
+    return isGCPointerType(VT->getScalarType());
+  if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
+    return isGCPointerType(AT->getElementType()) ||
+           isAggWhichContainsGCPtrType(AT->getElementType());
+  if (StructType *ST = dyn_cast<StructType>(Ty))
+    return std::any_of(ST->subtypes().begin(), ST->subtypes().end(),
+                       [](Type *SubType) {
+                         return isGCPointerType(SubType) ||
+                                isAggWhichContainsGCPtrType(SubType);
+                       });
+  return false;
+}
+#endif
+
+// Conservatively identifies any definitions which might be live at the
+// given instruction. The  analysis is performed immediately before the
+// given instruction. Values defined by that instruction are not considered
+// live.  Values used by that instruction are considered live.
+//
+// preconditions: valid IR graph, term is either a terminator instruction or
+// a call instruction, pred is the basic block of term, DT, LI are valid
+//
+// side effects: none, does not mutate IR
+//
+//  postconditions: populates liveValues as discussed above
+static void findLiveGCValuesAtInst(Instruction *term, BasicBlock *pred,
+                                   DominatorTree &DT, LoopInfo *LI,
+                                   StatepointLiveSetTy &liveValues) {
+  liveValues.clear();
+
+  assert(isa<CallInst>(term) || isa<InvokeInst>(term) || term->isTerminator());
+
+  Function *F = pred->getParent();
+
+  auto is_live_gc_reference =
+      [&](Value &V) { return isLiveGCReferenceAt(V, term, DT, LI); };
+
+  // Are there any gc pointer arguments live over this point?  This needs to be
+  // special cased since arguments aren't defined in basic blocks.
+  for (Argument &arg : F->args()) {
+    assert(!isAggWhichContainsGCPtrType(arg.getType()) &&
+           "support for FCA unimplemented");
+
+    if (is_live_gc_reference(arg)) {
+      liveValues.insert(&arg);
+    }
+  }
+
+  // Walk through all dominating blocks - the ones which can contain
+  // definitions used in this block - and check to see if any of the values
+  // they define are used in locations potentially reachable from the
+  // interesting instruction.
+  BasicBlock *BBI = pred;
+  while (true) {
+    if (TraceLSP) {
+      errs() << "[LSP] Looking at dominating block " << pred->getName() << "\n";
+    }
+    assert(DT.dominates(BBI, pred));
+    assert(isPotentiallyReachable(BBI, pred, &DT) &&
+           "dominated block must be reachable");
+
+    // Walk through the instructions in dominating blocks and keep any
+    // that have a use potentially reachable from the block we're
+    // considering putting the safepoint in
+    for (Instruction &inst : *BBI) {
+      if (TraceLSP) {
+        errs() << "[LSP] Looking at instruction ";
+        inst.dump();
+      }
+
+      if (pred == BBI && (&inst) == term) {
+        if (TraceLSP) {
+          errs() << "[LSP] stopped because we encountered the safepoint "
+                    "instruction.\n";
+        }
+
+        // If we're in the block which defines the interesting instruction,
+        // we don't want to include any values as live which are defined
+        // _after_ the interesting line or as part of the line itself
+        // i.e. "term" is the call instruction for a call safepoint, the
+        // results of the call should not be considered live in that stackmap
+        break;
+      }
+
+      assert(!isAggWhichContainsGCPtrType(inst.getType()) &&
+             "support for FCA unimplemented");
+
+      if (is_live_gc_reference(inst)) {
+        if (TraceLSP) {
+          errs() << "[LSP] found live value for this safepoint ";
+          inst.dump();
+          term->dump();
+        }
+        liveValues.insert(&inst);
+      }
+    }
+    if (!DT.getNode(BBI)->getIDom()) {
+      assert(BBI == &F->getEntryBlock() &&
+             "failed to find a dominator for something other than "
+             "the entry block");
+      break;
+    }
+    BBI = DT.getNode(BBI)->getIDom()->getBlock();
+  }
+}
+
+static bool order_by_name(llvm::Value *a, llvm::Value *b) {
+  if (a->hasName() && b->hasName()) {
+    return -1 == a->getName().compare(b->getName());
+  } else if (a->hasName() && !b->hasName()) {
+    return true;
+  } else if (!a->hasName() && b->hasName()) {
+    return false;
+  } else {
+    // Better than nothing, but not stable
+    return a < b;
+  }
+}
+
+/// Find the initial live set. Note that due to base pointer
+/// insertion, the live set may be incomplete.
+static void
+analyzeParsePointLiveness(DominatorTree &DT, const CallSite &CS,
+                          PartiallyConstructedSafepointRecord &result) {
+  Instruction *inst = CS.getInstruction();
+
+  BasicBlock *BB = inst->getParent();
+  StatepointLiveSetTy liveset;
+  findLiveGCValuesAtInst(inst, BB, DT, nullptr, liveset);
+
+  if (PrintLiveSet) {
+    // Note: This output is used by several of the test cases
+    // The order of elemtns in a set is not stable, put them in a vec and sort
+    // by name
+    SmallVector<Value *, 64> temp;
+    temp.insert(temp.end(), liveset.begin(), liveset.end());
+    std::sort(temp.begin(), temp.end(), order_by_name);
+    errs() << "Live Variables:\n";
+    for (Value *V : temp) {
+      errs() << " " << V->getName(); // no newline
+      V->dump();
+    }
+  }
+  if (PrintLiveSetSize) {
+    errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n";
+    errs() << "Number live values: " << liveset.size() << "\n";
+  }
+  result.liveset = liveset;
+}
+
+/// True iff this value is the null pointer constant (of any pointer type)
+static bool LLVM_ATTRIBUTE_UNUSED isNullConstant(Value *V) {
+  return isa<Constant>(V) && isa<PointerType>(V->getType()) &&
+         cast<Constant>(V)->isNullValue();
+}
+
+/// Helper function for findBasePointer - Will return a value which either a)
+/// defines the base pointer for the input or b) blocks the simple search
+/// (i.e. a PHI or Select of two derived pointers)
+static Value *findBaseDefiningValue(Value *I) {
+  assert(I->getType()->isPointerTy() &&
+         "Illegal to ask for the base pointer of a non-pointer type");
+
+  // There are instructions which can never return gc pointer values.  Sanity
+  // check
+  // that this is actually true.
+  assert(!isa<InsertElementInst>(I) && !isa<ExtractElementInst>(I) &&
+         !isa<ShuffleVectorInst>(I) && "Vector types are not gc pointers");
+  assert((!isa<Instruction>(I) || isa<InvokeInst>(I) ||
+          !cast<Instruction>(I)->isTerminator()) &&
+         "With the exception of invoke terminators don't define values");
+  assert(!isa<StoreInst>(I) && !isa<FenceInst>(I) &&
+         "Can't be definitions to start with");
+  assert(!isa<ICmpInst>(I) && !isa<FCmpInst>(I) &&
+         "Comparisons don't give ops");
+  // There's a bunch of instructions which just don't make sense to apply to
+  // a pointer.  The only valid reason for this would be pointer bit
+  // twiddling which we're just not going to support.
+  assert((!isa<Instruction>(I) || !cast<Instruction>(I)->isBinaryOp()) &&
+         "Binary ops on pointer values are meaningless.  Unless your "
+         "bit-twiddling which we don't support");
+
+  if (Argument *Arg = dyn_cast<Argument>(I)) {
+    // An incoming argument to the function is a base pointer
+    // We should have never reached here if this argument isn't an gc value
+    assert(Arg->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return Arg;
+  }
+
+  if (GlobalVariable *global = dyn_cast<GlobalVariable>(I)) {
+    // base case
+    assert(global->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return global;
+  }
+
+  // inlining could possibly introduce phi node that contains
+  // undef if callee has multiple returns
+  if (UndefValue *undef = dyn_cast<UndefValue>(I)) {
+    assert(undef->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return undef; // utterly meaningless, but useful for dealing with
+                  // partially optimized code.
+  }
+
+  // Due to inheritance, this must be _after_ the global variable and undef
+  // checks
+  if (Constant *con = dyn_cast<Constant>(I)) {
+    assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) &&
+           "order of checks wrong!");
+    // Note: Finding a constant base for something marked for relocation
+    // doesn't really make sense.  The most likely case is either a) some
+    // screwed up the address space usage or b) your validating against
+    // compiled C++ code w/o the proper separation.  The only real exception
+    // is a null pointer.  You could have generic code written to index of
+    // off a potentially null value and have proven it null.  We also use
+    // null pointers in dead paths of relocation phis (which we might later
+    // want to find a base pointer for).
+    assert(con->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    assert(con->isNullValue() && "null is the only case which makes sense");
+    return con;
+  }
+
+  if (CastInst *CI = dyn_cast<CastInst>(I)) {
+    Value *def = CI->stripPointerCasts();
+    assert(def->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    // If we find a cast instruction here, it means we've found a cast which is
+    // not simply a pointer cast (i.e. an inttoptr).  We don't know how to
+    // handle int->ptr conversion.
+    assert(!isa<CastInst>(def) && "shouldn't find another cast here");
+    return findBaseDefiningValue(def);
+  }
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (LI->getType()->isPointerTy()) {
+      Value *Op = LI->getOperand(0);
+      (void)Op;
+      // Has to be a pointer to an gc object, or possibly an array of such?
+      assert(Op->getType()->isPointerTy());
+      return LI; // The value loaded is an gc base itself
+    }
+  }
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+    Value *Op = GEP->getOperand(0);
+    if (Op->getType()->isPointerTy()) {
+      return findBaseDefiningValue(Op); // The base of this GEP is the base
+    }
+  }
+
+  if (AllocaInst *alloc = dyn_cast<AllocaInst>(I)) {
+    // An alloca represents a conceptual stack slot.  It's the slot itself
+    // that the GC needs to know about, not the value in the slot.
+    assert(alloc->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return alloc;
+  }
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    default:
+      // fall through to general call handling
+      break;
+    case Intrinsic::experimental_gc_statepoint:
+    case Intrinsic::experimental_gc_result_float:
+    case Intrinsic::experimental_gc_result_int:
+      llvm_unreachable("these don't produce pointers");
+    case Intrinsic::experimental_gc_result_ptr:
+      // This is just a special case of the CallInst check below to handle a
+      // statepoint with deopt args which hasn't been rewritten for GC yet.
+      // TODO: Assert that the statepoint isn't rewritten yet.
+      return II;
+    case Intrinsic::experimental_gc_relocate: {
+      // Rerunning safepoint insertion after safepoints are already
+      // inserted is not supported.  It could probably be made to work,
+      // but why are you doing this?  There's no good reason.
+      llvm_unreachable("repeat safepoint insertion is not supported");
+    }
+    case Intrinsic::gcroot:
+      // Currently, this mechanism hasn't been extended to work with gcroot.
+      // There's no reason it couldn't be, but I haven't thought about the
+      // implications much.
+      llvm_unreachable(
+          "interaction with the gcroot mechanism is not supported");
+    }
+  }
+  // We assume that functions in the source language only return base
+  // pointers.  This should probably be generalized via attributes to support
+  // both source language and internal functions.
+  if (CallInst *call = dyn_cast<CallInst>(I)) {
+    assert(call->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return call;
+  }
+  if (InvokeInst *invoke = dyn_cast<InvokeInst>(I)) {
+    assert(invoke->getType()->isPointerTy() &&
+           "Base for pointer must be another pointer");
+    return invoke;
+  }
+
+  // I have absolutely no idea how to implement this part yet.  It's not
+  // neccessarily hard, I just haven't really looked at it yet.
+  assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
+
+  if (AtomicCmpXchgInst *cas = dyn_cast<AtomicCmpXchgInst>(I)) {
+    // A CAS is effectively a atomic store and load combined under a
+    // predicate.  From the perspective of base pointers, we just treat it
+    // like a load.  We loaded a pointer from a address in memory, that value
+    // had better be a valid base pointer.
+    return cas->getPointerOperand();
+  }
+  if (AtomicRMWInst *atomic = dyn_cast<AtomicRMWInst>(I)) {
+    assert(AtomicRMWInst::Xchg == atomic->getOperation() &&
+           "All others are binary ops which don't apply to base pointers");
+    // semantically, a load, store pair.  Treat it the same as a standard load
+    return atomic->getPointerOperand();
+  }
+
+  // The aggregate ops.  Aggregates can either be in the heap or on the
+  // stack, but in either case, this is simply a field load.  As a result,
+  // this is a defining definition of the base just like a load is.
+  if (ExtractValueInst *ev = dyn_cast<ExtractValueInst>(I)) {
+    return ev;
+  }
+
+  // We should never see an insert vector since that would require we be
+  // tracing back a struct value not a pointer value.
+  assert(!isa<InsertValueInst>(I) &&
+         "Base pointer for a struct is meaningless");
+
+  // The last two cases here don't return a base pointer.  Instead, they
+  // return a value which dynamically selects from amoung several base
+  // derived pointers (each with it's own base potentially).  It's the job of
+  // the caller to resolve these.
+  if (SelectInst *select = dyn_cast<SelectInst>(I)) {
+    return select;
+  }
+
+  return cast<PHINode>(I);
+}
+
+/// Returns the base defining value for this value.
+static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &cache) {
+  Value *&Cached = cache[I];
+  if (!Cached) {
+    Cached = findBaseDefiningValue(I);
+  }
+  assert(cache[I] != nullptr);
+
+  if (TraceLSP) {
+    errs() << "fBDV-cached: " << I->getName() << " -> " << Cached->getName()
+           << "\n";
+  }
+  return Cached;
+}
+
+/// Return a base pointer for this value if known.  Otherwise, return it's
+/// base defining value.
+static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &cache) {
+  Value *def = findBaseDefiningValueCached(I, cache);
+  auto Found = cache.find(def);
+  if (Found != cache.end()) {
+    // Either a base-of relation, or a self reference.  Caller must check.
+    return Found->second;
+  }
+  // Only a BDV available
+  return def;
+}
+
+/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
+/// is it known to be a base pointer?  Or do we need to continue searching.
+static bool isKnownBaseResult(Value *v) {
+  if (!isa<PHINode>(v) && !isa<SelectInst>(v)) {
+    // no recursion possible
+    return true;
+  }
+  if (cast<Instruction>(v)->getMetadata("is_base_value")) {
+    // This is a previously inserted base phi or select.  We know
+    // that this is a base value.
+    return true;
+  }
+
+  // We need to keep searching
+  return false;
+}
+
+// TODO: find a better name for this
+namespace {
+class PhiState {
+public:
+  enum Status { Unknown, Base, Conflict };
+
+  PhiState(Status s, Value *b = nullptr) : status(s), base(b) {
+    assert(status != Base || b);
+  }
+  PhiState(Value *b) : status(Base), base(b) {}
+  PhiState() : status(Unknown), base(nullptr) {}
+  PhiState(const PhiState &other) : status(other.status), base(other.base) {
+    assert(status != Base || base);
+  }
+
+  Status getStatus() const { return status; }
+  Value *getBase() const { return base; }
+
+  bool isBase() const { return getStatus() == Base; }
+  bool isUnknown() const { return getStatus() == Unknown; }
+  bool isConflict() const { return getStatus() == Conflict; }
+
+  bool operator==(const PhiState &other) const {
+    return base == other.base && status == other.status;
+  }
+
+  bool operator!=(const PhiState &other) const { return !(*this == other); }
+
+  void dump() {
+    errs() << status << " (" << base << " - "
+           << (base ? base->getName() : "nullptr") << "): ";
+  }
+
+private:
+  Status status;
+  Value *base; // non null only if status == base
+};
+
+typedef DenseMap<Value *, PhiState> ConflictStateMapTy;
+// Values of type PhiState form a lattice, and this is a helper
+// class that implementes the meet operation.  The meat of the meet
+// operation is implemented in MeetPhiStates::pureMeet
+class MeetPhiStates {
+public:
+  // phiStates is a mapping from PHINodes and SelectInst's to PhiStates.
+  explicit MeetPhiStates(const ConflictStateMapTy &phiStates)
+      : phiStates(phiStates) {}
+
+  // Destructively meet the current result with the base V.  V can
+  // either be a merge instruction (SelectInst / PHINode), in which
+  // case its status is looked up in the phiStates map; or a regular
+  // SSA value, in which case it is assumed to be a base.
+  void meetWith(Value *V) {
+    PhiState otherState = getStateForBDV(V);
+    assert((MeetPhiStates::pureMeet(otherState, currentResult) ==
+            MeetPhiStates::pureMeet(currentResult, otherState)) &&
+           "math is wrong: meet does not commute!");
+    currentResult = MeetPhiStates::pureMeet(otherState, currentResult);
+  }
+
+  PhiState getResult() const { return currentResult; }
+
+private:
+  const ConflictStateMapTy &phiStates;
+  PhiState currentResult;
+
+  /// Return a phi state for a base defining value.  We'll generate a new
+  /// base state for known bases and expect to find a cached state otherwise
+  PhiState getStateForBDV(Value *baseValue) {
+    if (isKnownBaseResult(baseValue)) {
+      return PhiState(baseValue);
+    } else {
+      return lookupFromMap(baseValue);
+    }
+  }
+
+  PhiState lookupFromMap(Value *V) {
+    auto I = phiStates.find(V);
+    assert(I != phiStates.end() && "lookup failed!");
+    return I->second;
+  }
+
+  static PhiState pureMeet(const PhiState &stateA, const PhiState &stateB) {
+    switch (stateA.getStatus()) {
+    case PhiState::Unknown:
+      return stateB;
+
+    case PhiState::Base:
+      assert(stateA.getBase() && "can't be null");
+      if (stateB.isUnknown())
+        return stateA;
+
+      if (stateB.isBase()) {
+        if (stateA.getBase() == stateB.getBase()) {
+          assert(stateA == stateB && "equality broken!");
+          return stateA;
+        }
+        return PhiState(PhiState::Conflict);
+      }
+      assert(stateB.isConflict() && "only three states!");
+      return PhiState(PhiState::Conflict);
+
+    case PhiState::Conflict:
+      return stateA;
+    }
+    llvm_unreachable("only three states!");
+  }
+};
+}
+/// For a given value or instruction, figure out what base ptr it's derived
+/// from.  For gc objects, this is simply itself.  On success, returns a value
+/// which is the base pointer.  (This is reliable and can be used for
+/// relocation.)  On failure, returns nullptr.
+static Value *findBasePointer(Value *I, DefiningValueMapTy &cache,
+                              DenseSet<llvm::Value *> &NewInsertedDefs) {
+  Value *def = findBaseOrBDV(I, cache);
+
+  if (isKnownBaseResult(def)) {
+    return def;
+  }
+
+  // Here's the rough algorithm:
+  // - For every SSA value, construct a mapping to either an actual base
+  //   pointer or a PHI which obscures the base pointer.
+  // - Construct a mapping from PHI to unknown TOP state.  Use an
+  //   optimistic algorithm to propagate base pointer information.  Lattice
+  //   looks like:
+  //   UNKNOWN
+  //   b1 b2 b3 b4
+  //   CONFLICT
+  //   When algorithm terminates, all PHIs will either have a single concrete
+  //   base or be in a conflict state.
+  // - For every conflict, insert a dummy PHI node without arguments.  Add
+  //   these to the base[Instruction] = BasePtr mapping.  For every
+  //   non-conflict, add the actual base.
+  //  - For every conflict, add arguments for the base[a] of each input
+  //   arguments.
+  //
+  // Note: A simpler form of this would be to add the conflict form of all
+  // PHIs without running the optimistic algorithm.  This would be
+  // analougous to pessimistic data flow and would likely lead to an
+  // overall worse solution.
+
+  ConflictStateMapTy states;
+  states[def] = PhiState();
+  // Recursively fill in all phis & selects reachable from the initial one
+  // for which we don't already know a definite base value for
+  // PERF: Yes, this is as horribly inefficient as it looks.
+  bool done = false;
+  while (!done) {
+    done = true;
+    for (auto Pair : states) {
+      Value *v = Pair.first;
+      assert(!isKnownBaseResult(v) && "why did it get added?");
+      if (PHINode *phi = dyn_cast<PHINode>(v)) {
+        assert(phi->getNumIncomingValues() > 0 &&
+               "zero input phis are illegal");
+        for (Value *InVal : phi->incoming_values()) {
+          Value *local = findBaseOrBDV(InVal, cache);
+          if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
+            states[local] = PhiState();
+            done = false;
+          }
+        }
+      } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) {
+        Value *local = findBaseOrBDV(sel->getTrueValue(), cache);
+        if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
+          states[local] = PhiState();
+          done = false;
+        }
+        local = findBaseOrBDV(sel->getFalseValue(), cache);
+        if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
+          states[local] = PhiState();
+          done = false;
+        }
+      }
+    }
+  }
+
+  if (TraceLSP) {
+    errs() << "States after initialization:\n";
+    for (auto Pair : states) {
+      Instruction *v = cast<Instruction>(Pair.first);
+      PhiState state = Pair.second;
+      state.dump();
+      v->dump();
+    }
+  }
+
+  // TODO: come back and revisit the state transitions around inputs which
+  // have reached conflict state.  The current version seems too conservative.
+
+  bool progress = true;
+  size_t oldSize = 0;
+  while (progress) {
+    oldSize = states.size();
+    progress = false;
+    for (auto Pair : states) {
+      MeetPhiStates calculateMeet(states);
+      Value *v = Pair.first;
+      assert(!isKnownBaseResult(v) && "why did it get added?");
+      if (SelectInst *select = dyn_cast<SelectInst>(v)) {
+        calculateMeet.meetWith(findBaseOrBDV(select->getTrueValue(), cache));
+        calculateMeet.meetWith(findBaseOrBDV(select->getFalseValue(), cache));
+      } else
+        for (Value *Val : cast<PHINode>(v)->incoming_values())
+          calculateMeet.meetWith(findBaseOrBDV(Val, cache));
+
+      PhiState oldState = states[v];
+      PhiState newState = calculateMeet.getResult();
+      if (oldState != newState) {
+        progress = true;
+        states[v] = newState;
+      }
+    }
+
+    assert(oldSize <= states.size());
+    assert(oldSize == states.size() || progress);
+  }
+
+  if (TraceLSP) {
+    errs() << "States after meet iteration:\n";
+    for (auto Pair : states) {
+      Instruction *v = cast<Instruction>(Pair.first);
+      PhiState state = Pair.second;
+      state.dump();
+      v->dump();
+    }
+  }
+
+  // Insert Phis for all conflicts
+  for (auto Pair : states) {
+    Instruction *v = cast<Instruction>(Pair.first);
+    PhiState state = Pair.second;
+    assert(!isKnownBaseResult(v) && "why did it get added?");
+    assert(!state.isUnknown() && "Optimistic algorithm didn't complete!");
+    if (state.isConflict()) {
+      if (isa<PHINode>(v)) {
+        int num_preds =
+            std::distance(pred_begin(v->getParent()), pred_end(v->getParent()));
+        assert(num_preds > 0 && "how did we reach here");
+        PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v);
+        NewInsertedDefs.insert(phi);
+        // Add metadata marking this as a base value
+        auto *const_1 = ConstantInt::get(
+            Type::getInt32Ty(
+                v->getParent()->getParent()->getParent()->getContext()),
+            1);
+        auto MDConst = ConstantAsMetadata::get(const_1);
+        MDNode *md = MDNode::get(
+            v->getParent()->getParent()->getParent()->getContext(), MDConst);
+        phi->setMetadata("is_base_value", md);
+        states[v] = PhiState(PhiState::Conflict, phi);
+      } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) {
+        // The undef will be replaced later
+        UndefValue *undef = UndefValue::get(sel->getType());
+        SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef,
+                                                 undef, "base_select", sel);
+        NewInsertedDefs.insert(basesel);
+        // Add metadata marking this as a base value
+        auto *const_1 = ConstantInt::get(
+            Type::getInt32Ty(
+                v->getParent()->getParent()->getParent()->getContext()),
+            1);
+        auto MDConst = ConstantAsMetadata::get(const_1);
+        MDNode *md = MDNode::get(
+            v->getParent()->getParent()->getParent()->getContext(), MDConst);
+        basesel->setMetadata("is_base_value", md);
+        states[v] = PhiState(PhiState::Conflict, basesel);
+      } else
+        llvm_unreachable("unknown conflict type");
+    }
+  }
+
+  // Fixup all the inputs of the new PHIs
+  for (auto Pair : states) {
+    Instruction *v = cast<Instruction>(Pair.first);
+    PhiState state = Pair.second;
+
+    assert(!isKnownBaseResult(v) && "why did it get added?");
+    assert(!state.isUnknown() && "Optimistic algorithm didn't complete!");
+    if (state.isConflict()) {
+      if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) {
+        PHINode *phi = cast<PHINode>(v);
+        unsigned NumPHIValues = phi->getNumIncomingValues();
+        for (unsigned i = 0; i < NumPHIValues; i++) {
+          Value *InVal = phi->getIncomingValue(i);
+          BasicBlock *InBB = phi->getIncomingBlock(i);
+
+          // If we've already seen InBB, add the same incoming value
+          // we added for it earlier.  The IR verifier requires phi
+          // nodes with multiple entries from the same basic block
+          // to have the same incoming value for each of those
+          // entries.  If we don't do this check here and basephi
+          // has a different type than base, we'll end up adding two
+          // bitcasts (and hence two distinct values) as incoming
+          // values for the same basic block.
+
+          int blockIndex = basephi->getBasicBlockIndex(InBB);
+          if (blockIndex != -1) {
+            Value *oldBase = basephi->getIncomingValue(blockIndex);
+            basephi->addIncoming(oldBase, InBB);
+#ifndef NDEBUG
+            Value *base = findBaseOrBDV(InVal, cache);
+            if (!isKnownBaseResult(base)) {
+              // Either conflict or base.
+              assert(states.count(base));
+              base = states[base].getBase();
+              assert(base != nullptr && "unknown PhiState!");
+              assert(NewInsertedDefs.count(base) &&
+                     "should have already added this in a prev. iteration!");
+            }
+
+            // In essense this assert states: the only way two
+            // values incoming from the same basic block may be
+            // different is by being different bitcasts of the same
+            // value.  A cleanup that remains TODO is changing
+            // findBaseOrBDV to return an llvm::Value of the correct
+            // type (and still remain pure).  This will remove the
+            // need to add bitcasts.
+            assert(base->stripPointerCasts() == oldBase->stripPointerCasts() &&
+                   "sanity -- findBaseOrBDV should be pure!");
+#endif
+            continue;
+          }
+
+          // Find either the defining value for the PHI or the normal base for
+          // a non-phi node
+          Value *base = findBaseOrBDV(InVal, cache);
+          if (!isKnownBaseResult(base)) {
+            // Either conflict or base.
+            assert(states.count(base));
+            base = states[base].getBase();
+            assert(base != nullptr && "unknown PhiState!");
+          }
+          assert(base && "can't be null");
+          // Must use original input BB since base may not be Instruction
+          // The cast is needed since base traversal may strip away bitcasts
+          if (base->getType() != basephi->getType()) {
+            base = new BitCastInst(base, basephi->getType(), "cast",
+                                   InBB->getTerminator());
+            NewInsertedDefs.insert(base);
+          }
+          basephi->addIncoming(base, InBB);
+        }
+        assert(basephi->getNumIncomingValues() == NumPHIValues);
+      } else if (SelectInst *basesel = dyn_cast<SelectInst>(state.getBase())) {
+        SelectInst *sel = cast<SelectInst>(v);
+        // Operand 1 & 2 are true, false path respectively. TODO: refactor to
+        // something more safe and less hacky.
+        for (int i = 1; i <= 2; i++) {
+          Value *InVal = sel->getOperand(i);
+          // Find either the defining value for the PHI or the normal base for
+          // a non-phi node
+          Value *base = findBaseOrBDV(InVal, cache);
+          if (!isKnownBaseResult(base)) {
+            // Either conflict or base.
+            assert(states.count(base));
+            base = states[base].getBase();
+            assert(base != nullptr && "unknown PhiState!");
+          }
+          assert(base && "can't be null");
+          // Must use original input BB since base may not be Instruction
+          // The cast is needed since base traversal may strip away bitcasts
+          if (base->getType() != basesel->getType()) {
+            base = new BitCastInst(base, basesel->getType(), "cast", basesel);
+            NewInsertedDefs.insert(base);
+          }
+          basesel->setOperand(i, base);
+        }
+      } else
+        llvm_unreachable("unexpected conflict type");
+    }
+  }
+
+  // Cache all of our results so we can cheaply reuse them
+  // NOTE: This is actually two caches: one of the base defining value
+  // relation and one of the base pointer relation!  FIXME
+  for (auto item : states) {
+    Value *v = item.first;
+    Value *base = item.second.getBase();
+    assert(v && base);
+    assert(!isKnownBaseResult(v) && "why did it get added?");
+
+    if (TraceLSP) {
+      std::string fromstr =
+          cache.count(v) ? (cache[v]->hasName() ? cache[v]->getName() : "")
+                         : "none";
+      errs() << "Updating base value cache"
+             << " for: " << (v->hasName() ? v->getName() : "")
+             << " from: " << fromstr
+             << " to: " << (base->hasName() ? base->getName() : "") << "\n";
+    }
+
+    assert(isKnownBaseResult(base) &&
+           "must be something we 'know' is a base pointer");
+    if (cache.count(v)) {
+      // Once we transition from the BDV relation being store in the cache to
+      // the base relation being stored, it must be stable
+      assert((!isKnownBaseResult(cache[v]) || cache[v] == base) &&
+             "base relation should be stable");
+    }
+    cache[v] = base;
+  }
+  assert(cache.find(def) != cache.end());
+  return cache[def];
+}
+
+// For a set of live pointers (base and/or derived), identify the base
+// pointer of the object which they are derived from.  This routine will
+// mutate the IR graph as needed to make the 'base' pointer live at the
+// definition site of 'derived'.  This ensures that any use of 'derived' can
+// also use 'base'.  This may involve the insertion of a number of
+// additional PHI nodes.
+//
+// preconditions: live is a set of pointer type Values
+//
+// side effects: may insert PHI nodes into the existing CFG, will preserve
+// CFG, will not remove or mutate any existing nodes
+//
+// post condition: PointerToBase contains one (derived, base) pair for every
+// pointer in live.  Note that derived can be equal to base if the original
+// pointer was a base pointer.
+static void findBasePointers(const StatepointLiveSetTy &live,
+                             DenseMap<llvm::Value *, llvm::Value *> &PointerToBase,
+                             DominatorTree *DT, DefiningValueMapTy &DVCache,
+                             DenseSet<llvm::Value *> &NewInsertedDefs) {
+  for (Value *ptr : live) {
+    Value *base = findBasePointer(ptr, DVCache, NewInsertedDefs);
+    assert(base && "failed to find base pointer");
+    PointerToBase[ptr] = base;
+    assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) ||
+            DT->dominates(cast<Instruction>(base)->getParent(),
+                          cast<Instruction>(ptr)->getParent())) &&
+           "The base we found better dominate the derived pointer");
+
+    // If you see this trip and like to live really dangerously, the code should
+    // be correct, just with idioms the verifier can't handle.  You can try
+    // disabling the verifier at your own substaintial risk.
+    assert(!isNullConstant(base) && "the relocation code needs adjustment to "
+                                    "handle the relocation of a null pointer "
+                                    "constant without causing false positives "
+                                    "in the safepoint ir verifier.");
+  }
+}
+
+/// Find the required based pointers (and adjust the live set) for the given
+/// parse point.
+static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
+                             const CallSite &CS,
+                             PartiallyConstructedSafepointRecord &result) {
+  DenseMap<llvm::Value *, llvm::Value *> PointerToBase;
+  DenseSet<llvm::Value *> NewInsertedDefs;
+  findBasePointers(result.liveset, PointerToBase, &DT, DVCache, NewInsertedDefs);
+
+  if (PrintBasePointers) {
+    errs() << "Base Pairs (w/o Relocation):\n";
+    for (auto Pair : PointerToBase) {
+      errs() << " derived %" << Pair.first->getName() << " base %"
+             << Pair.second->getName() << "\n";
+    }
+  }
+
+  result.PointerToBase = PointerToBase;
+  result.NewInsertedDefs = NewInsertedDefs;
+}
+
+/// Check for liveness of items in the insert defs and add them to the live
+/// and base pointer sets
+static void fixupLiveness(DominatorTree &DT, const CallSite &CS,
+                          const DenseSet<Value *> &allInsertedDefs,
+                          PartiallyConstructedSafepointRecord &result) {
+  Instruction *inst = CS.getInstruction();
+
+  auto liveset = result.liveset;
+  auto PointerToBase = result.PointerToBase;
+
+  auto is_live_gc_reference =
+      [&](Value &V) { return isLiveGCReferenceAt(V, inst, DT, nullptr); };
+
+  // For each new definition, check to see if a) the definition dominates the
+  // instruction we're interested in, and b) one of the uses of that definition
+  // is edge-reachable from the instruction we're interested in.  This is the
+  // same definition of liveness we used in the intial liveness analysis
+  for (Value *newDef : allInsertedDefs) {
+    if (liveset.count(newDef)) {
+      // already live, no action needed
+      continue;
+    }
+
+    // PERF: Use DT to check instruction domination might not be good for
+    // compilation time, and we could change to optimal solution if this
+    // turn to be a issue
+    if (!DT.dominates(cast<Instruction>(newDef), inst)) {
+      // can't possibly be live at inst
+      continue;
+    }
+
+    if (is_live_gc_reference(*newDef)) {
+      // Add the live new defs into liveset and PointerToBase
+      liveset.insert(newDef);
+      PointerToBase[newDef] = newDef;
+    }
+  }
+
+  result.liveset = liveset;
+  result.PointerToBase = PointerToBase;
+}
+
+static void fixupLiveReferences(
+    Function &F, DominatorTree &DT, Pass *P,
+    const DenseSet<llvm::Value *> &allInsertedDefs,
+    ArrayRef<CallSite> toUpdate,
+    MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    const CallSite &CS = toUpdate[i];
+    fixupLiveness(DT, CS, allInsertedDefs, info);
+  }
+}
+
+// Normalize basic block to make it ready to be target of invoke statepoint.
+// It means spliting it to have single predecessor. Return newly created BB
+// ready to be successor of invoke statepoint.
+static BasicBlock *normalizeBBForInvokeSafepoint(BasicBlock *BB,
+                                                 BasicBlock *InvokeParent,
+                                                 Pass *P) {
+  BasicBlock *ret = BB;
+
+  if (!BB->getUniquePredecessor()) {
+    ret = SplitBlockPredecessors(BB, InvokeParent, "");
+  }
+
+  // Another requirement for such basic blocks is to not have any phi nodes.
+  // Since we just ensured that new BB will have single predecessor,
+  // all phi nodes in it will have one value. Here it would be naturall place
+  // to
+  // remove them all. But we can not do this because we are risking to remove
+  // one of the values stored in liveset of another statepoint. We will do it
+  // later after placing all safepoints.
+
+  return ret;
+}
+
+static int find_index(ArrayRef<Value *> livevec, Value *val) {
+  auto itr = std::find(livevec.begin(), livevec.end(), val);
+  assert(livevec.end() != itr);
+  size_t index = std::distance(livevec.begin(), itr);
+  assert(index < livevec.size());
+  return index;
+}
+
+// Create new attribute set containing only attributes which can be transfered
+// from original call to the safepoint.
+static AttributeSet legalizeCallAttributes(AttributeSet AS) {
+  AttributeSet ret;
+
+  for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) {
+    unsigned index = AS.getSlotIndex(Slot);
+
+    if (index == AttributeSet::ReturnIndex ||
+        index == AttributeSet::FunctionIndex) {
+
+      for (auto it = AS.begin(Slot), it_end = AS.end(Slot); it != it_end;
+           ++it) {
+        Attribute attr = *it;
+
+        // Do not allow certain attributes - just skip them
+        // Safepoint can not be read only or read none.
+        if (attr.hasAttribute(Attribute::ReadNone) ||
+            attr.hasAttribute(Attribute::ReadOnly))
+          continue;
+
+        ret = ret.addAttributes(
+            AS.getContext(), index,
+            AttributeSet::get(AS.getContext(), index, AttrBuilder(attr)));
+      }
+    }
+
+    // Just skip parameter attributes for now
+  }
+
+  return ret;
+}
+
+/// Helper function to place all gc relocates necessary for the given
+/// statepoint.
+/// Inputs:
+///   liveVariables - list of variables to be relocated.
+///   liveStart - index of the first live variable.
+///   basePtrs - base pointers.
+///   statepointToken - statepoint instruction to which relocates should be
+///   bound.
+///   Builder - Llvm IR builder to be used to construct new calls.
+void CreateGCRelocates(ArrayRef<llvm::Value *> liveVariables,
+                       const int liveStart,
+                       ArrayRef<llvm::Value *> basePtrs,
+                       Instruction *statepointToken, IRBuilder<> Builder) {
+
+  SmallVector<Instruction *, 64> NewDefs;
+  NewDefs.reserve(liveVariables.size());
+
+  Module *M = statepointToken->getParent()->getParent()->getParent();
+
+  for (unsigned i = 0; i < liveVariables.size(); i++) {
+    // We generate a (potentially) unique declaration for every pointer type
+    // combination.  This results is some blow up the function declarations in
+    // the IR, but removes the need for argument bitcasts which shrinks the IR
+    // greatly and makes it much more readable.
+    SmallVector<Type *, 1> types;                    // one per 'any' type
+    types.push_back(liveVariables[i]->getType()); // result type
+    Value *gc_relocate_decl = Intrinsic::getDeclaration(
+        M, Intrinsic::experimental_gc_relocate, types);
+
+    // Generate the gc.relocate call and save the result
+    Value *baseIdx =
+        ConstantInt::get(Type::getInt32Ty(M->getContext()),
+                         liveStart + find_index(liveVariables, basePtrs[i]));
+    Value *liveIdx = ConstantInt::get(
+        Type::getInt32Ty(M->getContext()),
+        liveStart + find_index(liveVariables, liveVariables[i]));
+
+    // only specify a debug name if we can give a useful one
+    Value *reloc = Builder.CreateCall3(
+        gc_relocate_decl, statepointToken, baseIdx, liveIdx,
+        liveVariables[i]->hasName() ? liveVariables[i]->getName() + ".relocated"
+                                    : "");
+    // Trick CodeGen into thinking there are lots of free registers at this
+    // fake call.
+    cast<CallInst>(reloc)->setCallingConv(CallingConv::Cold);
+
+    NewDefs.push_back(cast<Instruction>(reloc));
+  }
+  assert(NewDefs.size() == liveVariables.size() &&
+         "missing or extra redefinition at safepoint");
+}
+
+static void
+makeStatepointExplicitImpl(const CallSite &CS, /* to replace */
+                           const SmallVectorImpl<llvm::Value *> &basePtrs,
+                           const SmallVectorImpl<llvm::Value *> &liveVariables,
+                           Pass *P,
+                           PartiallyConstructedSafepointRecord &result) {
+  assert(basePtrs.size() == liveVariables.size());
+  assert(isStatepoint(CS) &&
+         "This method expects to be rewriting a statepoint");
+
+  BasicBlock *BB = CS.getInstruction()->getParent();
+  assert(BB);
+  Function *F = BB->getParent();
+  assert(F && "must be set");
+  Module *M = F->getParent();
+  (void)M;
+  assert(M && "must be set");
+
+  // We're not changing the function signature of the statepoint since the gc
+  // arguments go into the var args section.
+  Function *gc_statepoint_decl = CS.getCalledFunction();
+
+  // Then go ahead and use the builder do actually do the inserts.  We insert
+  // immediately before the previous instruction under the assumption that all
+  // arguments will be available here.  We can't insert afterwards since we may
+  // be replacing a terminator.
+  Instruction *insertBefore = CS.getInstruction();
+  IRBuilder<> Builder(insertBefore);
+  // Copy all of the arguments from the original statepoint - this includes the
+  // target, call args, and deopt args
+  SmallVector<llvm::Value *, 64> args;
+  args.insert(args.end(), CS.arg_begin(), CS.arg_end());
+  // TODO: Clear the 'needs rewrite' flag
+
+  // add all the pointers to be relocated (gc arguments)
+  // Capture the start of the live variable list for use in the gc_relocates
+  const int live_start = args.size();
+  args.insert(args.end(), liveVariables.begin(), liveVariables.end());
+
+  // Create the statepoint given all the arguments
+  Instruction *token = nullptr;
+  AttributeSet return_attributes;
+  if (CS.isCall()) {
+    CallInst *toReplace = cast<CallInst>(CS.getInstruction());
+    CallInst *call =
+        Builder.CreateCall(gc_statepoint_decl, args, "safepoint_token");
+    call->setTailCall(toReplace->isTailCall());
+    call->setCallingConv(toReplace->getCallingConv());
+
+    // Currently we will fail on parameter attributes and on certain
+    // function attributes.
+    AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes());
+    // In case if we can handle this set of sttributes - set up function attrs
+    // directly on statepoint and return attrs later for gc_result intrinsic.
+    call->setAttributes(new_attrs.getFnAttributes());
+    return_attributes = new_attrs.getRetAttributes();
+
+    token = call;
+
+    // Put the following gc_result and gc_relocate calls immediately after the
+    // the old call (which we're about to delete)
+    BasicBlock::iterator next(toReplace);
+    assert(BB->end() != next && "not a terminator, must have next");
+    next++;
+    Instruction *IP = &*(next);
+    Builder.SetInsertPoint(IP);
+    Builder.SetCurrentDebugLocation(IP->getDebugLoc());
+
+  } else {
+    InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction());
+
+    // Insert the new invoke into the old block.  We'll remove the old one in a
+    // moment at which point this will become the new terminator for the
+    // original block.
+    InvokeInst *invoke = InvokeInst::Create(
+        gc_statepoint_decl, toReplace->getNormalDest(),
+        toReplace->getUnwindDest(), args, "", toReplace->getParent());
+    invoke->setCallingConv(toReplace->getCallingConv());
+
+    // Currently we will fail on parameter attributes and on certain
+    // function attributes.
+    AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes());
+    // In case if we can handle this set of sttributes - set up function attrs
+    // directly on statepoint and return attrs later for gc_result intrinsic.
+    invoke->setAttributes(new_attrs.getFnAttributes());
+    return_attributes = new_attrs.getRetAttributes();
+
+    token = invoke;
+
+    // Generate gc relocates in exceptional path
+    BasicBlock *unwindBlock = normalizeBBForInvokeSafepoint(
+        toReplace->getUnwindDest(), invoke->getParent(), P);
+
+    Instruction *IP = &*(unwindBlock->getFirstInsertionPt());
+    Builder.SetInsertPoint(IP);
+    Builder.SetCurrentDebugLocation(toReplace->getDebugLoc());
+
+    // Extract second element from landingpad return value. We will attach
+    // exceptional gc relocates to it.
+    const unsigned idx = 1;
+    Instruction *exceptional_token =
+        cast<Instruction>(Builder.CreateExtractValue(
+            unwindBlock->getLandingPadInst(), idx, "relocate_token"));
+    result.UnwindToken = exceptional_token;
+
+    // Just throw away return value. We will use the one we got for normal
+    // block.
+    (void)CreateGCRelocates(liveVariables, live_start, basePtrs,
+                            exceptional_token, Builder);
+
+    // Generate gc relocates and returns for normal block
+    BasicBlock *normalDest = normalizeBBForInvokeSafepoint(
+        toReplace->getNormalDest(), invoke->getParent(), P);
+
+    IP = &*(normalDest->getFirstInsertionPt());
+    Builder.SetInsertPoint(IP);
+
+    // gc relocates will be generated later as if it were regular call
+    // statepoint
+  }
+  assert(token);
+
+  // Take the name of the original value call if it had one.
+  token->takeName(CS.getInstruction());
+
+  // The GCResult is already inserted, we just need to find it
+#ifndef NDEBUG
+  Instruction *toReplace = CS.getInstruction();
+  assert((toReplace->hasNUses(0) || toReplace->hasNUses(1)) &&
+         "only valid use before rewrite is gc.result");
+  assert(!toReplace->hasOneUse() ||
+         isGCResult(cast<Instruction>(*toReplace->user_begin())));
+#endif
+
+  // Update the gc.result of the original statepoint (if any) to use the newly
+  // inserted statepoint.  This is safe to do here since the token can't be
+  // considered a live reference.
+  CS.getInstruction()->replaceAllUsesWith(token);
+
+  result.StatepointToken = token;
+
+  // Second, create a gc.relocate for every live variable
+  CreateGCRelocates(liveVariables, live_start, basePtrs, token, Builder);
+
+}
+
+namespace {
+struct name_ordering {
+  Value *base;
+  Value *derived;
+  bool operator()(name_ordering const &a, name_ordering const &b) {
+    return -1 == a.derived->getName().compare(b.derived->getName());
+  }
+};
+}
+static void stablize_order(SmallVectorImpl<Value *> &basevec,
+                           SmallVectorImpl<Value *> &livevec) {
+  assert(basevec.size() == livevec.size());
+
+  SmallVector<name_ordering, 64> temp;
+  for (size_t i = 0; i < basevec.size(); i++) {
+    name_ordering v;
+    v.base = basevec[i];
+    v.derived = livevec[i];
+    temp.push_back(v);
+  }
+  std::sort(temp.begin(), temp.end(), name_ordering());
+  for (size_t i = 0; i < basevec.size(); i++) {
+    basevec[i] = temp[i].base;
+    livevec[i] = temp[i].derived;
+  }
+}
+
+// Replace an existing gc.statepoint with a new one and a set of gc.relocates
+// which make the relocations happening at this safepoint explicit.
+// 
+// WARNING: Does not do any fixup to adjust users of the original live
+// values.  That's the callers responsibility.
+static void
+makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, Pass *P,
+                       PartiallyConstructedSafepointRecord &result) {
+  auto liveset = result.liveset;
+  auto PointerToBase = result.PointerToBase;
+
+  // Convert to vector for efficient cross referencing.
+  SmallVector<Value *, 64> basevec, livevec;
+  livevec.reserve(liveset.size());
+  basevec.reserve(liveset.size());
+  for (Value *L : liveset) {
+    livevec.push_back(L);
+
+    assert(PointerToBase.find(L) != PointerToBase.end());
+    Value *base = PointerToBase[L];
+    basevec.push_back(base);
+  }
+  assert(livevec.size() == basevec.size());
+
+  // To make the output IR slightly more stable (for use in diffs), ensure a
+  // fixed order of the values in the safepoint (by sorting the value name).
+  // The order is otherwise meaningless.
+  stablize_order(basevec, livevec);
+
+  // Do the actual rewriting and delete the old statepoint
+  makeStatepointExplicitImpl(CS, basevec, livevec, P, result);
+  CS.getInstruction()->eraseFromParent();
+}
+
+// Helper function for the relocationViaAlloca.
+// It receives iterator to the statepoint gc relocates and emits store to the
+// assigned
+// location (via allocaMap) for the each one of them.
+// Add visited values into the visitedLiveValues set we will later use them
+// for sanity check.
+static void
+insertRelocationStores(iterator_range<Value::user_iterator> gcRelocs,
+                       DenseMap<Value *, Value *> &allocaMap,
+                       DenseSet<Value *> &visitedLiveValues) {
+
+  for (User *U : gcRelocs) {
+    if (!isa<IntrinsicInst>(U))
+      continue;
+
+    IntrinsicInst *relocatedValue = cast<IntrinsicInst>(U);
+
+    // We only care about relocates
+    if (relocatedValue->getIntrinsicID() !=
+        Intrinsic::experimental_gc_relocate) {
+      continue;
+    }
+
+    GCRelocateOperands relocateOperands(relocatedValue);
+    Value *originalValue = const_cast<Value *>(relocateOperands.derivedPtr());
+    assert(allocaMap.count(originalValue));
+    Value *alloca = allocaMap[originalValue];
+
+    // Emit store into the related alloca
+    StoreInst *store = new StoreInst(relocatedValue, alloca);
+    store->insertAfter(relocatedValue);
+
+#ifndef NDEBUG
+    visitedLiveValues.insert(originalValue);
+#endif
+  }
+}
+
+/// do all the relocation update via allocas and mem2reg
+static void relocationViaAlloca(
+    Function &F, DominatorTree &DT, ArrayRef<Value *> live,
+    ArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+#ifndef NDEBUG
+  int initialAllocaNum = 0;
+
+  // record initial number of allocas
+  for (inst_iterator itr = inst_begin(F), end = inst_end(F); itr != end;
+       itr++) {
+    if (isa<AllocaInst>(*itr))
+      initialAllocaNum++;
+  }
+#endif
+
+  // TODO-PERF: change data structures, reserve
+  DenseMap<Value *, Value *> allocaMap;
+  SmallVector<AllocaInst *, 200> PromotableAllocas;
+  PromotableAllocas.reserve(live.size());
+
+  // emit alloca for each live gc pointer
+  for (unsigned i = 0; i < live.size(); i++) {
+    Value *liveValue = live[i];
+    AllocaInst *alloca = new AllocaInst(liveValue->getType(), "",
+                                        F.getEntryBlock().getFirstNonPHI());
+    allocaMap[liveValue] = alloca;
+    PromotableAllocas.push_back(alloca);
+  }
+
+  // The next two loops are part of the same conceptual operation.  We need to
+  // insert a store to the alloca after the original def and at each
+  // redefinition.  We need to insert a load before each use.  These are split
+  // into distinct loops for performance reasons.
+
+  // update gc pointer after each statepoint
+  // either store a relocated value or null (if no relocated value found for
+  // this gc pointer and it is not a gc_result)
+  // this must happen before we update the statepoint with load of alloca
+  // otherwise we lose the link between statepoint and old def
+  for (size_t i = 0; i < records.size(); i++) {
+    const struct PartiallyConstructedSafepointRecord &info = records[i];
+    Value *Statepoint = info.StatepointToken;
+
+    // This will be used for consistency check
+    DenseSet<Value *> visitedLiveValues;
+
+    // Insert stores for normal statepoint gc relocates
+    insertRelocationStores(Statepoint->users(), allocaMap, visitedLiveValues);
+
+    // In case if it was invoke statepoint
+    // we will insert stores for exceptional path gc relocates.
+    if (isa<InvokeInst>(Statepoint)) {
+      insertRelocationStores(info.UnwindToken->users(),
+                             allocaMap, visitedLiveValues);
+    }
+
+#ifndef NDEBUG
+    // As a debuging aid, pretend that an unrelocated pointer becomes null at
+    // the gc.statepoint.  This will turn some subtle GC problems into slightly
+    // easier to debug SEGVs
+    SmallVector<AllocaInst *, 64> ToClobber;
+    for (auto Pair : allocaMap) {
+      Value *Def = Pair.first;
+      AllocaInst *Alloca = cast<AllocaInst>(Pair.second);
+
+      // This value was relocated
+      if (visitedLiveValues.count(Def)) {
+        continue;
+      }
+      ToClobber.push_back(Alloca);
+    }
+
+    auto InsertClobbersAt = [&](Instruction *IP) {
+      for (auto *AI : ToClobber) {
+        auto AIType = cast<PointerType>(AI->getType());
+        auto PT = cast<PointerType>(AIType->getElementType());
+        Constant *CPN = ConstantPointerNull::get(PT);
+        StoreInst *store = new StoreInst(CPN, AI);
+        store->insertBefore(IP);
+      }
+    };
+
+    // Insert the clobbering stores.  These may get intermixed with the
+    // gc.results and gc.relocates, but that's fine.  
+    if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
+      InsertClobbersAt(II->getNormalDest()->getFirstInsertionPt());
+      InsertClobbersAt(II->getUnwindDest()->getFirstInsertionPt());
+    } else {
+      BasicBlock::iterator Next(cast<CallInst>(Statepoint));
+      Next++;
+      InsertClobbersAt(Next);
+    }
+#endif
+  }
+  // update use with load allocas and add store for gc_relocated
+  for (auto Pair : allocaMap) {
+    Value *def = Pair.first;
+    Value *alloca = Pair.second;
+
+    // we pre-record the uses of allocas so that we dont have to worry about
+    // later update
+    // that change the user information.
+    SmallVector<Instruction *, 20> uses;
+    // PERF: trade a linear scan for repeated reallocation
+    uses.reserve(std::distance(def->user_begin(), def->user_end()));
+    for (User *U : def->users()) {
+      if (!isa<ConstantExpr>(U)) {
+        // If the def has a ConstantExpr use, then the def is either a
+        // ConstantExpr use itself or null.  In either case
+        // (recursively in the first, directly in the second), the oop
+        // it is ultimately dependent on is null and this particular
+        // use does not need to be fixed up.
+        uses.push_back(cast<Instruction>(U));
+      }
+    }
+
+    std::sort(uses.begin(), uses.end());
+    auto last = std::unique(uses.begin(), uses.end());
+    uses.erase(last, uses.end());
+
+    for (Instruction *use : uses) {
+      if (isa<PHINode>(use)) {
+        PHINode *phi = cast<PHINode>(use);
+        for (unsigned i = 0; i < phi->getNumIncomingValues(); i++) {
+          if (def == phi->getIncomingValue(i)) {
+            LoadInst *load = new LoadInst(
+                alloca, "", phi->getIncomingBlock(i)->getTerminator());
+            phi->setIncomingValue(i, load);
+          }
+        }
+      } else {
+        LoadInst *load = new LoadInst(alloca, "", use);
+        use->replaceUsesOfWith(def, load);
+      }
+    }
+
+    // emit store for the initial gc value
+    // store must be inserted after load, otherwise store will be in alloca's
+    // use list and an extra load will be inserted before it
+    StoreInst *store = new StoreInst(def, alloca);
+    if (isa<Instruction>(def)) {
+      store->insertAfter(cast<Instruction>(def));
+    } else {
+      assert((isa<Argument>(def) || isa<GlobalVariable>(def) ||
+              (isa<Constant>(def) && cast<Constant>(def)->isNullValue())) &&
+             "Must be argument or global");
+      store->insertAfter(cast<Instruction>(alloca));
+    }
+  }
+
+  assert(PromotableAllocas.size() == live.size() &&
+         "we must have the same allocas with lives");
+  if (!PromotableAllocas.empty()) {
+    // apply mem2reg to promote alloca to SSA
+    PromoteMemToReg(PromotableAllocas, DT);
+  }
+
+#ifndef NDEBUG
+  for (inst_iterator itr = inst_begin(F), end = inst_end(F); itr != end;
+       itr++) {
+    if (isa<AllocaInst>(*itr))
+      initialAllocaNum--;
+  }
+  assert(initialAllocaNum == 0 && "We must not introduce any extra allocas");
+#endif
+}
+
+/// Implement a unique function which doesn't require we sort the input
+/// vector.  Doing so has the effect of changing the output of a couple of
+/// tests in ways which make them less useful in testing fused safepoints.
+template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
+  DenseSet<T> Seen;
+  SmallVector<T, 128> TempVec;
+  TempVec.reserve(Vec.size());
+  for (auto Element : Vec)
+    TempVec.push_back(Element);
+  Vec.clear();
+  for (auto V : TempVec) {
+    if (Seen.insert(V).second) {
+      Vec.push_back(V);
+    }
+  }
+}
+
+static Function *getUseHolder(Module &M) {
+  FunctionType *ftype =
+      FunctionType::get(Type::getVoidTy(M.getContext()), true);
+  Function *Func = cast<Function>(M.getOrInsertFunction("__tmp_use", ftype));
+  return Func;
+}
+
+/// Insert holders so that each Value is obviously live through the entire
+/// liftetime of the call.
+static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values,
+                                 SmallVectorImpl<CallInst *> &holders) {
+  Module *M = CS.getInstruction()->getParent()->getParent()->getParent();
+  Function *Func = getUseHolder(*M);
+  if (CS.isCall()) {
+    // For call safepoints insert dummy calls right after safepoint
+    BasicBlock::iterator next(CS.getInstruction());
+    next++;
+    CallInst *base_holder = CallInst::Create(Func, Values, "", next);
+    holders.push_back(base_holder);
+  } else if (CS.isInvoke()) {
+    // For invoke safepooints insert dummy calls both in normal and
+    // exceptional destination blocks
+    InvokeInst *invoke = cast<InvokeInst>(CS.getInstruction());
+    CallInst *normal_holder = CallInst::Create(
+        Func, Values, "", invoke->getNormalDest()->getFirstInsertionPt());
+    CallInst *unwind_holder = CallInst::Create(
+        Func, Values, "", invoke->getUnwindDest()->getFirstInsertionPt());
+    holders.push_back(normal_holder);
+    holders.push_back(unwind_holder);
+  } else
+    llvm_unreachable("unsupported call type");
+}
+
+static void findLiveReferences(
+    Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate,
+    MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    const CallSite &CS = toUpdate[i];
+    analyzeParsePointLiveness(DT, CS, info);
+  }
+}
+
+static void addBasesAsLiveValues(StatepointLiveSetTy &liveset,
+                                 DenseMap<Value *, Value *> &PointerToBase) {
+  // Identify any base pointers which are used in this safepoint, but not
+  // themselves relocated.  We need to relocate them so that later inserted
+  // safepoints can get the properly relocated base register.
+  DenseSet<Value *> missing;
+  for (Value *L : liveset) {
+    assert(PointerToBase.find(L) != PointerToBase.end());
+    Value *base = PointerToBase[L];
+    assert(base);
+    if (liveset.find(base) == liveset.end()) {
+      assert(PointerToBase.find(base) == PointerToBase.end());
+      // uniqued by set insert
+      missing.insert(base);
+    }
+  }
+
+  // Note that we want these at the end of the list, otherwise
+  // register placement gets screwed up once we lower to STATEPOINT
+  // instructions.  This is an utter hack, but there doesn't seem to be a
+  // better one.
+  for (Value *base : missing) {
+    assert(base);
+    liveset.insert(base);
+    PointerToBase[base] = base;
+  }
+  assert(liveset.size() == PointerToBase.size());
+}
+
+static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,
+                              SmallVectorImpl<CallSite> &toUpdate) {
+#ifndef NDEBUG
+  // sanity check the input
+  std::set<CallSite> uniqued;
+  uniqued.insert(toUpdate.begin(), toUpdate.end());
+  assert(uniqued.size() == toUpdate.size() && "no duplicates please!");
+
+  for (size_t i = 0; i < toUpdate.size(); i++) {
+    CallSite &CS = toUpdate[i];
+    assert(CS.getInstruction()->getParent()->getParent() == &F);
+    assert(isStatepoint(CS) && "expected to already be a deopt statepoint");
+  }
+#endif
+
+  // A list of dummy calls added to the IR to keep various values obviously
+  // live in the IR.  We'll remove all of these when done.
+  SmallVector<CallInst *, 64> holders;
+
+  // Insert a dummy call with all of the arguments to the vm_state we'll need
+  // for the actual safepoint insertion.  This ensures reference arguments in
+  // the deopt argument list are considered live through the safepoint (and
+  // thus makes sure they get relocated.)
+  for (size_t i = 0; i < toUpdate.size(); i++) {
+    CallSite &CS = toUpdate[i];
+    Statepoint StatepointCS(CS);
+
+    SmallVector<Value *, 64> DeoptValues;
+    for (Use &U : StatepointCS.vm_state_args()) {
+      Value *Arg = cast<Value>(&U);
+      if (isGCPointerType(Arg->getType()))
+        DeoptValues.push_back(Arg);
+    }
+    insertUseHolderAfter(CS, DeoptValues, holders);
+  }
+
+  SmallVector<struct PartiallyConstructedSafepointRecord, 64> records;
+  records.reserve(toUpdate.size());
+  for (size_t i = 0; i < toUpdate.size(); i++) {
+    struct PartiallyConstructedSafepointRecord info;
+    records.push_back(info);
+  }
+  assert(records.size() == toUpdate.size());
+
+  // A) Identify all gc pointers which are staticly live at the given call
+  // site.
+  findLiveReferences(F, DT, P, toUpdate, records);
+
+  // B) Find the base pointers for each live pointer
+  /* scope for caching */ {
+    // Cache the 'defining value' relation used in the computation and
+    // insertion of base phis and selects.  This ensures that we don't insert
+    // large numbers of duplicate base_phis.
+    DefiningValueMapTy DVCache;
+
+    for (size_t i = 0; i < records.size(); i++) {
+      struct PartiallyConstructedSafepointRecord &info = records[i];
+      CallSite &CS = toUpdate[i];
+      findBasePointers(DT, DVCache, CS, info);
+    }
+  } // end of cache scope
+
+  // The base phi insertion logic (for any safepoint) may have inserted new
+  // instructions which are now live at some safepoint.  The simplest such
+  // example is:
+  // loop:
+  //   phi a  <-- will be a new base_phi here
+  //   safepoint 1 <-- that needs to be live here
+  //   gep a + 1
+  //   safepoint 2
+  //   br loop
+  DenseSet<llvm::Value *> allInsertedDefs;
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    allInsertedDefs.insert(info.NewInsertedDefs.begin(),
+                           info.NewInsertedDefs.end());
+  }
+
+  // We insert some dummy calls after each safepoint to definitely hold live
+  // the base pointers which were identified for that safepoint.  We'll then
+  // ask liveness for _every_ base inserted to see what is now live.  Then we
+  // remove the dummy calls.
+  holders.reserve(holders.size() + records.size());
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    CallSite &CS = toUpdate[i];
+
+    SmallVector<Value *, 128> Bases;
+    for (auto Pair : info.PointerToBase) {
+      Bases.push_back(Pair.second);
+    }
+    insertUseHolderAfter(CS, Bases, holders);
+  }
+
+  // Add the bases explicitly to the live vector set.  This may result in a few
+  // extra relocations, but the base has to be available whenever a pointer
+  // derived from it is used.  Thus, we need it to be part of the statepoint's
+  // gc arguments list.  TODO: Introduce an explicit notion (in the following
+  // code) of the GC argument list as seperate from the live Values at a
+  // given statepoint.
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    addBasesAsLiveValues(info.liveset, info.PointerToBase);
+  }
+
+  // If we inserted any new values, we need to adjust our notion of what is
+  // live at a particular safepoint.
+  if (!allInsertedDefs.empty()) {
+    fixupLiveReferences(F, DT, P, allInsertedDefs, toUpdate, records);
+  }
+  if (PrintBasePointers) {
+    for (size_t i = 0; i < records.size(); i++) {
+      struct PartiallyConstructedSafepointRecord &info = records[i];
+      errs() << "Base Pairs: (w/Relocation)\n";
+      for (auto Pair : info.PointerToBase) {
+        errs() << " derived %" << Pair.first->getName() << " base %"
+               << Pair.second->getName() << "\n";
+      }
+    }
+  }
+  for (size_t i = 0; i < holders.size(); i++) {
+    holders[i]->eraseFromParent();
+    holders[i] = nullptr;
+  }
+  holders.clear();
+
+  // Now run through and replace the existing statepoints with new ones with
+  // the live variables listed.  We do not yet update uses of the values being
+  // relocated. We have references to live variables that need to
+  // survive to the last iteration of this loop.  (By construction, the
+  // previous statepoint can not be a live variable, thus we can and remove
+  // the old statepoint calls as we go.)
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    CallSite &CS = toUpdate[i];
+    makeStatepointExplicit(DT, CS, P, info);
+  }
+  toUpdate.clear(); // prevent accident use of invalid CallSites
+
+  // In case if we inserted relocates in a different basic block than the
+  // original safepoint (this can happen for invokes). We need to be sure that
+  // original values were not used in any of the phi nodes at the
+  // beginning of basic block containing them. Because we know that all such
+  // blocks will have single predecessor we can safely assume that all phi
+  // nodes have single entry (because of normalizeBBForInvokeSafepoint).
+  // Just remove them all here.
+  for (size_t i = 0; i < records.size(); i++) {
+    Instruction *I = records[i].StatepointToken;
+
+    if (InvokeInst *invoke = dyn_cast<InvokeInst>(I)) {
+      FoldSingleEntryPHINodes(invoke->getNormalDest());
+      assert(!isa<PHINode>(invoke->getNormalDest()->begin()));
+
+      FoldSingleEntryPHINodes(invoke->getUnwindDest());
+      assert(!isa<PHINode>(invoke->getUnwindDest()->begin()));
+    }
+  }
+
+  // Do all the fixups of the original live variables to their relocated selves
+  SmallVector<Value *, 128> live;
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    // We can't simply save the live set from the original insertion.  One of
+    // the live values might be the result of a call which needs a safepoint.
+    // That Value* no longer exists and we need to use the new gc_result.
+    // Thankfully, the liveset is embedded in the statepoint (and updated), so
+    // we just grab that.
+    Statepoint statepoint(info.StatepointToken);
+    live.insert(live.end(), statepoint.gc_args_begin(),
+                statepoint.gc_args_end());
+  }
+  unique_unsorted(live);
+
+#ifndef NDEBUG
+  // sanity check
+  for (auto ptr : live) {
+    assert(isGCPointerType(ptr->getType()) && "must be a gc pointer type");
+  }
+#endif
+
+  relocationViaAlloca(F, DT, live, records);
+  return !records.empty();
+}
+
+/// Returns true if this function should be rewritten by this pass.  The main
+/// point of this function is as an extension point for custom logic.
+static bool shouldRewriteStatepointsIn(Function &F) {
+  // TODO: This should check the GCStrategy
+  if (F.hasGC()) {
+    const std::string StatepointExampleName("statepoint-example");
+    return StatepointExampleName == F.getGC();
+  } else
+    return false;
+}
+
+bool RewriteStatepointsForGC::runOnFunction(Function &F) {
+  // Nothing to do for declarations.
+  if (F.isDeclaration() || F.empty())
+    return false;
+
+  // Policy choice says not to rewrite - the most common reason is that we're
+  // compiling code without a GCStrategy.
+  if (!shouldRewriteStatepointsIn(F))
+    return false;
+
+  // Gather all the statepoints which need rewritten.
+  SmallVector<CallSite, 64> ParsePointNeeded;
+  for (Instruction &I : inst_range(F)) {
+    // TODO: only the ones with the flag set!
+    if (isStatepoint(I))
+      ParsePointNeeded.push_back(CallSite(&I));
+  }
+
+  // Return early if no work to do.
+  if (ParsePointNeeded.empty())
+    return false;
+
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  return insertParsePoints(F, DT, this, ParsePointNeeded);
+}
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index cfc9a8e..05b9608 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -35,7 +35,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
@@ -1504,7 +1504,7 @@ namespace {
   ///
   struct SCCP : public FunctionPass {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
     static char ID; // Pass identification, replacement for typeid
     SCCP() : FunctionPass(ID) {
@@ -1563,7 +1563,8 @@ bool SCCP::runOnFunction(Function &F) {
   DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
   const DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-  const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   SCCPSolver Solver(DL, TLI);
 
   // Mark the first block of the function as being executable.
@@ -1637,7 +1638,7 @@ namespace {
   ///
   struct IPSCCP : public ModulePass {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
     static char ID;
     IPSCCP() : ModulePass(ID) {
@@ -1651,7 +1652,7 @@ char IPSCCP::ID = 0;
 INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp",
                 "Interprocedural Sparse Conditional Constant Propagation",
                 false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(IPSCCP, "ipsccp",
                 "Interprocedural Sparse Conditional Constant Propagation",
                 false, false)
@@ -1692,7 +1693,8 @@ static bool AddressIsTaken(const GlobalValue *GV) {
 bool IPSCCP::runOnModule(Module &M) {
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-  const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   SCCPSolver Solver(DL, TLI);
 
   // AddressTakenFunctions - This set keeps track of the address-taken functions
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 6135114..f69c750 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -28,7 +28,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/PtrUseVisitor.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -79,8 +79,8 @@ STATISTIC(NumVectorized, "Number of vectorized aggregates");
 
 /// Hidden option to force the pass to not use DomTree and mem2reg, instead
 /// forming SSA values through the SSAUpdater infrastructure.
-static cl::opt<bool>
-ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);
+static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false),
+                                     cl::Hidden);
 
 /// Hidden option to enable randomly shuffling the slices to help uncover
 /// instability in their order.
@@ -89,15 +89,15 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
 
 /// Hidden option to experiment with completely strict handling of inbounds
 /// GEPs.
-static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds",
-                                        cl::init(false), cl::Hidden);
+static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
+                                        cl::Hidden);
 
 namespace {
 /// \brief A custom IRBuilder inserter which prefixes all names if they are
 /// preserved.
 template <bool preserveNames = true>
-class IRBuilderPrefixedInserter :
-    public IRBuilderDefaultInserter<preserveNames> {
+class IRBuilderPrefixedInserter
+    : public IRBuilderDefaultInserter<preserveNames> {
   std::string Prefix;
 
 public:
@@ -113,19 +113,19 @@ protected:
 
 // Specialization for not preserving the name is trivial.
 template <>
-class IRBuilderPrefixedInserter<false> :
-    public IRBuilderDefaultInserter<false> {
+class IRBuilderPrefixedInserter<false>
+    : public IRBuilderDefaultInserter<false> {
 public:
   void SetNamePrefix(const Twine &P) {}
 };
 
 /// \brief Provide a typedef for IRBuilder that drops names in release builds.
 #ifndef NDEBUG
-typedef llvm::IRBuilder<true, ConstantFolder,
-                        IRBuilderPrefixedInserter<true> > IRBuilderTy;
+typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>>
+    IRBuilderTy;
 #else
-typedef llvm::IRBuilder<false, ConstantFolder,
-                        IRBuilderPrefixedInserter<false> > IRBuilderTy;
+typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>>
+    IRBuilderTy;
 #endif
 }
 
@@ -171,10 +171,14 @@ public:
   /// decreasing. Thus the spanning range comes first in a cluster with the
   /// same start position.
   bool operator<(const Slice &RHS) const {
-    if (beginOffset() < RHS.beginOffset()) return true;
-    if (beginOffset() > RHS.beginOffset()) return false;
-    if (isSplittable() != RHS.isSplittable()) return !isSplittable();
-    if (endOffset() > RHS.endOffset()) return true;
+    if (beginOffset() < RHS.beginOffset())
+      return true;
+    if (beginOffset() > RHS.beginOffset())
+      return false;
+    if (isSplittable() != RHS.isSplittable())
+      return !isSplittable();
+    if (endOffset() > RHS.endOffset())
+      return true;
     return false;
   }
 
@@ -198,9 +202,7 @@ public:
 
 namespace llvm {
 template <typename T> struct isPodLike;
-template <> struct isPodLike<Slice> {
-   static const bool value = true;
-};
+template <> struct isPodLike<Slice> { static const bool value = true; };
 }
 
 namespace {
@@ -235,6 +237,298 @@ public:
   const_iterator end() const { return Slices.end(); }
   /// @}
 
+  /// \brief Erase a range of slices.
+  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
+
+  /// \brief Insert new slices for this alloca.
+  ///
+  /// This moves the slices into the alloca's slices collection, and re-sorts
+  /// everything so that the usual ordering properties of the alloca's slices
+  /// hold.
+  void insert(ArrayRef<Slice> NewSlices) {
+    int OldSize = Slices.size();
+    std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices));
+    auto SliceI = Slices.begin() + OldSize;
+    std::sort(SliceI, Slices.end());
+    std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+  }
+
+  // Forward declare an iterator to befriend it.
+  class partition_iterator;
+
+  /// \brief A partition of the slices.
+  ///
+  /// An ephemeral representation for a range of slices which can be viewed as
+  /// a partition of the alloca. This range represents a span of the alloca's
+  /// memory which cannot be split, and provides access to all of the slices
+  /// overlapping some part of the partition.
+  ///
+  /// Objects of this type are produced by traversing the alloca's slices, but
+  /// are only ephemeral and not persistent.
+  class Partition {
+  private:
+    friend class AllocaSlices;
+    friend class AllocaSlices::partition_iterator;
+
+    /// \brief The begining and ending offsets of the alloca for this partition.
+    uint64_t BeginOffset, EndOffset;
+
+    /// \brief The start end end iterators of this partition.
+    iterator SI, SJ;
+
+    /// \brief A collection of split slice tails overlapping the partition.
+    SmallVector<Slice *, 4> SplitTails;
+
+    /// \brief Raw constructor builds an empty partition starting and ending at
+    /// the given iterator.
+    Partition(iterator SI) : SI(SI), SJ(SI) {}
+
+  public:
+    /// \brief The start offset of this partition.
+    ///
+    /// All of the contained slices start at or after this offset.
+    uint64_t beginOffset() const { return BeginOffset; }
+
+    /// \brief The end offset of this partition.
+    ///
+    /// All of the contained slices end at or before this offset.
+    uint64_t endOffset() const { return EndOffset; }
+
+    /// \brief The size of the partition.
+    ///
+    /// Note that this can never be zero.
+    uint64_t size() const {
+      assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
+      return EndOffset - BeginOffset;
+    }
+
+    /// \brief Test whether this partition contains no slices, and merely spans
+    /// a region occupied by split slices.
+    bool empty() const { return SI == SJ; }
+
+    /// \name Iterate slices that start within the partition.
+    /// These may be splittable or unsplittable. They have a begin offset >= the
+    /// partition begin offset.
+    /// @{
+    // FIXME: We should probably define a "concat_iterator" helper and use that
+    // to stitch together pointee_iterators over the split tails and the
+    // contiguous iterators of the partition. That would give a much nicer
+    // interface here. We could then additionally expose filtered iterators for
+    // split, unsplit, and unsplittable splices based on the usage patterns.
+    iterator begin() const { return SI; }
+    iterator end() const { return SJ; }
+    /// @}
+
+    /// \brief Get the sequence of split slice tails.
+    ///
+    /// These tails are of slices which start before this partition but are
+    /// split and overlap into the partition. We accumulate these while forming
+    /// partitions.
+    ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
+  };
+
+  /// \brief An iterator over partitions of the alloca's slices.
+  ///
+  /// This iterator implements the core algorithm for partitioning the alloca's
+  /// slices. It is a forward iterator as we don't support backtracking for
+  /// efficiency reasons, and re-use a single storage area to maintain the
+  /// current set of split slices.
+  ///
+  /// It is templated on the slice iterator type to use so that it can operate
+  /// with either const or non-const slice iterators.
+  class partition_iterator
+      : public iterator_facade_base<partition_iterator,
+                                    std::forward_iterator_tag, Partition> {
+    friend class AllocaSlices;
+
+    /// \brief Most of the state for walking the partitions is held in a class
+    /// with a nice interface for examining them.
+    Partition P;
+
+    /// \brief We need to keep the end of the slices to know when to stop.
+    AllocaSlices::iterator SE;
+
+    /// \brief We also need to keep track of the maximum split end offset seen.
+    /// FIXME: Do we really?
+    uint64_t MaxSplitSliceEndOffset;
+
+    /// \brief Sets the partition to be empty at given iterator, and sets the
+    /// end iterator.
+    partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
+        : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
+      // If not already at the end, advance our state to form the initial
+      // partition.
+      if (SI != SE)
+        advance();
+    }
+
+    /// \brief Advance the iterator to the next partition.
+    ///
+    /// Requires that the iterator not be at the end of the slices.
+    void advance() {
+      assert((P.SI != SE || !P.SplitTails.empty()) &&
+             "Cannot advance past the end of the slices!");
+
+      // Clear out any split uses which have ended.
+      if (!P.SplitTails.empty()) {
+        if (P.EndOffset >= MaxSplitSliceEndOffset) {
+          // If we've finished all splits, this is easy.
+          P.SplitTails.clear();
+          MaxSplitSliceEndOffset = 0;
+        } else {
+          // Remove the uses which have ended in the prior partition. This
+          // cannot change the max split slice end because we just checked that
+          // the prior partition ended prior to that max.
+          P.SplitTails.erase(
+              std::remove_if(
+                  P.SplitTails.begin(), P.SplitTails.end(),
+                  [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
+              P.SplitTails.end());
+          assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
+                             [&](Slice *S) {
+                               return S->endOffset() == MaxSplitSliceEndOffset;
+                             }) &&
+                 "Could not find the current max split slice offset!");
+          assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
+                             [&](Slice *S) {
+                               return S->endOffset() <= MaxSplitSliceEndOffset;
+                             }) &&
+                 "Max split slice end offset is not actually the max!");
+        }
+      }
+
+      // If P.SI is already at the end, then we've cleared the split tail and
+      // now have an end iterator.
+      if (P.SI == SE) {
+        assert(P.SplitTails.empty() && "Failed to clear the split slices!");
+        return;
+      }
+
+      // If we had a non-empty partition previously, set up the state for
+      // subsequent partitions.
+      if (P.SI != P.SJ) {
+        // Accumulate all the splittable slices which started in the old
+        // partition into the split list.
+        for (Slice &S : P)
+          if (S.isSplittable() && S.endOffset() > P.EndOffset) {
+            P.SplitTails.push_back(&S);
+            MaxSplitSliceEndOffset =
+                std::max(S.endOffset(), MaxSplitSliceEndOffset);
+          }
+
+        // Start from the end of the previous partition.
+        P.SI = P.SJ;
+
+        // If P.SI is now at the end, we at most have a tail of split slices.
+        if (P.SI == SE) {
+          P.BeginOffset = P.EndOffset;
+          P.EndOffset = MaxSplitSliceEndOffset;
+          return;
+        }
+
+        // If the we have split slices and the next slice is after a gap and is
+        // not splittable immediately form an empty partition for the split
+        // slices up until the next slice begins.
+        if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
+            !P.SI->isSplittable()) {
+          P.BeginOffset = P.EndOffset;
+          P.EndOffset = P.SI->beginOffset();
+          return;
+        }
+      }
+
+      // OK, we need to consume new slices. Set the end offset based on the
+      // current slice, and step SJ past it. The beginning offset of the
+      // parttion is the beginning offset of the next slice unless we have
+      // pre-existing split slices that are continuing, in which case we begin
+      // at the prior end offset.
+      P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
+      P.EndOffset = P.SI->endOffset();
+      ++P.SJ;
+
+      // There are two strategies to form a partition based on whether the
+      // partition starts with an unsplittable slice or a splittable slice.
+      if (!P.SI->isSplittable()) {
+        // When we're forming an unsplittable region, it must always start at
+        // the first slice and will extend through its end.
+        assert(P.BeginOffset == P.SI->beginOffset());
+
+        // Form a partition including all of the overlapping slices with this
+        // unsplittable slice.
+        while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+          if (!P.SJ->isSplittable())
+            P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+          ++P.SJ;
+        }
+
+        // We have a partition across a set of overlapping unsplittable
+        // partitions.
+        return;
+      }
+
+      // If we're starting with a splittable slice, then we need to form
+      // a synthetic partition spanning it and any other overlapping splittable
+      // splices.
+      assert(P.SI->isSplittable() && "Forming a splittable partition!");
+
+      // Collect all of the overlapping splittable slices.
+      while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
+             P.SJ->isSplittable()) {
+        P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+        ++P.SJ;
+      }
+
+      // Back upiP.EndOffset if we ended the span early when encountering an
+      // unsplittable slice. This synthesizes the early end offset of
+      // a partition spanning only splittable slices.
+      if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+        assert(!P.SJ->isSplittable());
+        P.EndOffset = P.SJ->beginOffset();
+      }
+    }
+
+  public:
+    bool operator==(const partition_iterator &RHS) const {
+      assert(SE == RHS.SE &&
+             "End iterators don't match between compared partition iterators!");
+
+      // The observed positions of partitions is marked by the P.SI iterator and
+      // the emptyness of the split slices. The latter is only relevant when
+      // P.SI == SE, as the end iterator will additionally have an empty split
+      // slices list, but the prior may have the same P.SI and a tail of split
+      // slices.
+      if (P.SI == RHS.P.SI &&
+          P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
+        assert(P.SJ == RHS.P.SJ &&
+               "Same set of slices formed two different sized partitions!");
+        assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
+               "Same slice position with differently sized non-empty split "
+               "slice tails!");
+        return true;
+      }
+      return false;
+    }
+
+    partition_iterator &operator++() {
+      advance();
+      return *this;
+    }
+
+    Partition &operator*() { return P; }
+  };
+
+  /// \brief A forward range over the partitions of the alloca's slices.
+  ///
+  /// This accesses an iterator range over the partitions of the alloca's
+  /// slices. It computes these partitions on the fly based on the overlapping
+  /// offsets of the slices and the ability to split them. It will visit "empty"
+  /// partitions to cover regions of the alloca only accessed via split
+  /// slices.
+  iterator_range<partition_iterator> partitions() {
+    return make_range(partition_iterator(begin(), end()),
+                      partition_iterator(end(), end()));
+  }
+
   /// \brief Access the dead users for this alloca.
   ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
 
@@ -308,7 +602,7 @@ static Value *foldSelectInst(SelectInst &SI) {
   // being selected between, fold the select. Yes this does (rarely) happen
   // early on.
   if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
-    return SI.getOperand(1+CI->isZero());
+    return SI.getOperand(1 + CI->isZero());
   if (SI.getOperand(1) == SI.getOperand(2))
     return SI.getOperand(1);
 
@@ -421,7 +715,8 @@ private:
           GEPOffset +=
               APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
         } else {
-          // For array or vector indices, scale the index by the size of the type.
+          // For array or vector indices, scale the index by the size of the
+          // type.
           APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
           GEPOffset += Index * APInt(Offset.getBitWidth(),
                                      DL.getTypeAllocSize(GTI.getIndexedType()));
@@ -440,16 +735,10 @@ private:
 
   void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                          uint64_t Size, bool IsVolatile) {
-    // We allow splitting of loads and stores where the type is an integer type
-    // and cover the entire alloca. This prevents us from splitting over
-    // eagerly.
-    // FIXME: In the great blue eventually, we should eagerly split all integer
-    // loads and stores, and then have a separate step that merges adjacent
-    // alloca partitions into a single partition suitable for integer widening.
-    // Or we should skip the merge step and rely on GVN and other passes to
-    // merge adjacent loads and stores that survive mem2reg.
-    bool IsSplittable =
-        Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;
+    // We allow splitting of non-volatile loads and stores where the type is an
+    // integer type. These may be used to implement 'memcpy' or other "transfer
+    // of bits" patterns.
+    bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
 
     insertUse(I, Offset, Size, IsSplittable);
   }
@@ -495,7 +784,6 @@ private:
     handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
   }
 
-
   void visitMemSetInst(MemSetInst &II) {
     assert(II.getRawDest() == *U && "Pointer use is not the destination?");
     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
@@ -507,9 +795,8 @@ private:
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
 
-    insertUse(II, Offset,
-              Length ? Length->getLimitedValue()
-                     : AllocSize - Offset.getLimitedValue(),
+    insertUse(II, Offset, Length ? Length->getLimitedValue()
+                                 : AllocSize - Offset.getLimitedValue(),
               (bool)Length);
   }
 
@@ -533,15 +820,15 @@ private:
     // FIXME: Yet another place we really should bypass this when
     // instrumenting for ASan.
     if (Offset.uge(AllocSize)) {
-      SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II);
+      SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
+          MemTransferSliceMap.find(&II);
       if (MTPI != MemTransferSliceMap.end())
         AS.Slices[MTPI->second].kill();
       return markAsDead(II);
     }
 
     uint64_t RawOffset = Offset.getLimitedValue();
-    uint64_t Size = Length ? Length->getLimitedValue()
-                           : AllocSize - RawOffset;
+    uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
 
     // Check for the special case where the same exact value is used for both
     // source and dest.
@@ -697,18 +984,12 @@ private:
     insertUse(I, Offset, Size);
   }
 
-  void visitPHINode(PHINode &PN) {
-    visitPHINodeOrSelectInst(PN);
-  }
+  void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
 
-  void visitSelectInst(SelectInst &SI) {
-    visitPHINodeOrSelectInst(SI);
-  }
+  void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
 
   /// \brief Disable SROA entirely if there are unhandled users of the alloca.
-  void visitInstruction(Instruction &I) {
-    PI.setAborted(&I);
-  }
+  void visitInstruction(Instruction &I) { PI.setAborted(&I); }
 };
 
 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
@@ -729,7 +1010,9 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
   }
 
   Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
-                              std::mem_fun_ref(&Slice::isDead)),
+                              [](const Slice &S) {
+                                return S.isDead();
+                              }),
                Slices.end());
 
 #if __cplusplus >= 201103L && !defined(NDEBUG)
@@ -749,6 +1032,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
 void AllocaSlices::print(raw_ostream &OS, const_iterator I,
                          StringRef Indent) const {
   printSlice(OS, I, Indent);
+  OS << "\n";
   printUse(OS, I, Indent);
 }
 
@@ -756,7 +1040,7 @@ void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
                               StringRef Indent) const {
   OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
      << " slice #" << (I - begin())
-     << (I->isSplittable() ? " (splittable)" : "") << "\n";
+     << (I->isSplittable() ? " (splittable)" : "");
 }
 
 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
@@ -804,15 +1088,17 @@ public:
                  AllocaInst &AI, DIBuilder &DIB)
       : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
 
-  void run(const SmallVectorImpl<Instruction*> &Insts) {
+  void run(const SmallVectorImpl<Instruction *> &Insts) {
     // Retain the debug information attached to the alloca for use when
     // rewriting loads and stores.
-    if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) {
-      for (User *U : DebugNode->users())
-        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
-          DDIs.push_back(DDI);
-        else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
-          DVIs.push_back(DVI);
+    if (auto *L = LocalAsMetadata::getIfExists(&AI)) {
+      if (auto *DebugNode = MetadataAsValue::getIfExists(AI.getContext(), L)) {
+        for (User *U : DebugNode->users())
+          if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+            DDIs.push_back(DDI);
+          else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+            DVIs.push_back(DVI);
+      }
     }
 
     LoadAndStorePromoter::run(Insts);
@@ -825,8 +1111,9 @@ public:
       DVIs.pop_back_val()->eraseFromParent();
   }
 
-  bool isInstInList(Instruction *I,
-                    const SmallVectorImpl<Instruction*> &Insts) const override {
+  bool
+  isInstInList(Instruction *I,
+               const SmallVectorImpl<Instruction *> &Insts) const override {
     Value *Ptr;
     if (LoadInst *LI = dyn_cast<LoadInst>(I))
       Ptr = LI->getOperand(0);
@@ -884,7 +1171,6 @@ public:
 };
 } // end anon namespace
 
-
 namespace {
 /// \brief An optimization pass providing Scalar Replacement of Aggregates.
 ///
@@ -910,7 +1196,7 @@ class SROA : public FunctionPass {
   LLVMContext *C;
   const DataLayout *DL;
   DominatorTree *DT;
-  AssumptionTracker *AT;
+  AssumptionCache *AC;
 
   /// \brief Worklist of alloca instructions to simplify.
   ///
@@ -919,12 +1205,12 @@ class SROA : public FunctionPass {
   /// directly promoted. Finally, each time we rewrite a use of an alloca other
   /// the one being actively rewritten, we add it back onto the list if not
   /// already present to ensure it is re-visited.
-  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist;
+  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist;
 
   /// \brief A collection of instructions to delete.
   /// We try to batch deletions to simplify code and make things a bit more
   /// efficient.
-  SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts;
+  SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts;
 
   /// \brief Post-promotion worklist.
   ///
@@ -934,7 +1220,7 @@ class SROA : public FunctionPass {
   ///
   /// Note that we have to be very careful to clear allocas out of this list in
   /// the event they are deleted.
-  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist;
+  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist;
 
   /// \brief A collection of alloca instructions we can directly promote.
   std::vector<AllocaInst *> PromotableAllocas;
@@ -944,7 +1230,7 @@ class SROA : public FunctionPass {
   /// All of these PHIs have been checked for the safety of speculation and by
   /// being speculated will allow promoting allocas currently in the promotable
   /// queue.
-  SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs;
+  SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs;
 
   /// \brief A worklist of select instructions to speculate prior to promoting
   /// allocas.
@@ -952,12 +1238,12 @@ class SROA : public FunctionPass {
   /// All of these select instructions have been checked for the safety of
   /// speculation and by being speculated will allow promoting allocas
   /// currently in the promotable queue.
-  SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects;
+  SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects;
 
 public:
   SROA(bool RequiresDomTree = true)
-      : FunctionPass(ID), RequiresDomTree(RequiresDomTree),
-        C(nullptr), DL(nullptr), DT(nullptr) {
+      : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr),
+        DL(nullptr), DT(nullptr) {
     initializeSROAPass(*PassRegistry::getPassRegistry());
   }
   bool runOnFunction(Function &F) override;
@@ -970,10 +1256,9 @@ private:
   friend class PHIOrSelectSpeculator;
   friend class AllocaSliceRewriter;
 
-  bool rewritePartition(AllocaInst &AI, AllocaSlices &AS,
-                        AllocaSlices::iterator B, AllocaSlices::iterator E,
-                        int64_t BeginOffset, int64_t EndOffset,
-                        ArrayRef<AllocaSlices::iterator> SplitUses);
+  bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
+  AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+                               AllocaSlices::Partition &P);
   bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
   bool runOnAlloca(AllocaInst &AI);
   void clobberUse(Use &U);
@@ -988,12 +1273,12 @@ FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
   return new SROA(RequiresDomTree);
 }
 
-INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
-                      false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
-                    false, false)
+INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
+                    false)
 
 /// Walk the range of a partitioning looking for a common type to cover this
 /// sequence of slices.
@@ -1064,8 +1349,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
 ///
 /// FIXME: This should be hoisted into a generic utility, likely in
 /// Transforms/Util/Local.h
-static bool isSafePHIToSpeculate(PHINode &PN,
-                                 const DataLayout *DL = nullptr) {
+static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
   // For now, we can only do this promotion if the load is in the same block
   // as the PHI, and if there are no stores between the phi and load.
   // TODO: Allow recursive phi users.
@@ -1325,7 +1609,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
                                        SmallVectorImpl<Value *> &Indices,
                                        Twine NamePrefix) {
   if (Offset == 0)
-    return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix);
+    return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
+                                 NamePrefix);
 
   // We can't recurse through pointer types.
   if (Ty->isPointerTy())
@@ -1433,8 +1718,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
 /// a single GEP as possible, thus making each GEP more independent of the
 /// surrounding code.
 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
-                             APInt Offset, Type *PointerTy,
-                             Twine NamePrefix) {
+                             APInt Offset, Type *PointerTy, Twine NamePrefix) {
   // Even though we don't look through PHI nodes, we could be called on an
   // instruction in an unreachable block, which may be on a cycle.
   SmallPtrSet<Value *, 4> Visited;
@@ -1443,8 +1727,9 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
 
   // We may end up computing an offset pointer that has the wrong type. If we
   // never are able to compute one directly that has the correct type, we'll
-  // fall back to it, so keep it around here.
+  // fall back to it, so keep it and the base it was computed from around here.
   Value *OffsetPtr = nullptr;
+  Value *OffsetBasePtr;
 
   // Remember any i8 pointer we come across to re-use if we need to do a raw
   // byte offset.
@@ -1469,16 +1754,19 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
     Indices.clear();
     if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
                                            Indices, NamePrefix)) {
-      if (P->getType() == PointerTy) {
-        // Zap any offset pointer that we ended up computing in previous rounds.
-        if (OffsetPtr && OffsetPtr->use_empty())
-          if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
-            I->eraseFromParent();
+      // If we have a new natural pointer at the offset, clear out any old
+      // offset pointer we computed. Unless it is the base pointer or
+      // a non-instruction, we built a GEP we don't need. Zap it.
+      if (OffsetPtr && OffsetPtr != OffsetBasePtr)
+        if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
+          assert(I->use_empty() && "Built a GEP with uses some how!");
+          I->eraseFromParent();
+        }
+      OffsetPtr = P;
+      OffsetBasePtr = Ptr;
+      // If we also found a pointer of the right type, we're done.
+      if (P->getType() == PointerTy)
         return P;
-      }
-      if (!OffsetPtr) {
-        OffsetPtr = P;
-      }
     }
 
     // Stash this pointer if we've found an i8*.
@@ -1508,9 +1796,10 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
       Int8PtrOffset = Offset;
     }
 
-    OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
-      IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
-                            NamePrefix + "sroa_raw_idx");
+    OffsetPtr = Int8PtrOffset == 0
+                    ? Int8Ptr
+                    : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
+                                            NamePrefix + "sroa_raw_idx");
   }
   Ptr = OffsetPtr;
 
@@ -1521,6 +1810,27 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
   return Ptr;
 }
 
+/// \brief Compute the adjusted alignment for a load or store from an offset.
+static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
+                                     const DataLayout &DL) {
+  unsigned Alignment;
+  Type *Ty;
+  if (auto *LI = dyn_cast<LoadInst>(I)) {
+    Alignment = LI->getAlignment();
+    Ty = LI->getType();
+  } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+    Alignment = SI->getAlignment();
+    Ty = SI->getValueOperand()->getType();
+  } else {
+    llvm_unreachable("Only loads and stores are allowed!");
+  }
+
+  if (!Alignment)
+    Alignment = DL.getABITypeAlignment(Ty);
+
+  return MinAlign(Alignment, Offset);
+}
+
 /// \brief Test whether we can convert a value from the old to the new type.
 ///
 /// This predicate should be used to guard calls to convertValue in order to
@@ -1614,19 +1924,19 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
 ///
 /// This function is called to test each entry in a partioning which is slated
 /// for a single slice.
-static bool
-isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset,
-                                uint64_t SliceEndOffset, VectorType *Ty,
-                                uint64_t ElementSize, const Slice &S) {
+static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P,
+                                            const Slice &S, VectorType *Ty,
+                                            uint64_t ElementSize,
+                                            const DataLayout &DL) {
   // First validate the slice offsets.
   uint64_t BeginOffset =
-      std::max(S.beginOffset(), SliceBeginOffset) - SliceBeginOffset;
+      std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
   uint64_t BeginIndex = BeginOffset / ElementSize;
   if (BeginIndex * ElementSize != BeginOffset ||
       BeginIndex >= Ty->getNumElements())
     return false;
   uint64_t EndOffset =
-      std::min(S.endOffset(), SliceEndOffset) - SliceBeginOffset;
+      std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
   uint64_t EndIndex = EndOffset / ElementSize;
   if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
     return false;
@@ -1658,7 +1968,7 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset,
     if (LI->isVolatile())
       return false;
     Type *LTy = LI->getType();
-    if (SliceBeginOffset > S.beginOffset() || SliceEndOffset < S.endOffset()) {
+    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
       assert(LTy->isIntegerTy());
       LTy = SplitIntTy;
     }
@@ -1668,7 +1978,7 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset,
     if (SI->isVolatile())
       return false;
     Type *STy = SI->getValueOperand()->getType();
-    if (SliceBeginOffset > S.beginOffset() || SliceEndOffset < S.endOffset()) {
+    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
       assert(STy->isIntegerTy());
       STy = SplitIntTy;
     }
@@ -1690,11 +2000,8 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset,
 /// SSA value. We only can ensure this for a limited set of operations, and we
 /// don't want to do the rewrites unless we are confident that the result will
 /// be promotable, so we have an early test here.
-static VectorType *
-isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy,
-                        uint64_t SliceBeginOffset, uint64_t SliceEndOffset,
-                        AllocaSlices::const_range Slices,
-                        ArrayRef<AllocaSlices::iterator> SplitUses) {
+static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P,
+                                           const DataLayout &DL) {
   // Collect the candidate types for vector-based promotion. Also track whether
   // we have different element types.
   SmallVector<VectorType *, 4> CandidateTys;
@@ -1709,11 +2016,10 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy,
         HaveCommonEltTy = false;
     }
   };
-  CheckCandidateType(AllocaTy);
   // Consider any loads or stores that are the exact size of the slice.
-  for (const auto &S : Slices)
-    if (S.beginOffset() == SliceBeginOffset &&
-        S.endOffset() == SliceEndOffset) {
+  for (const Slice &S : P)
+    if (S.beginOffset() == P.beginOffset() &&
+        S.endOffset() == P.endOffset()) {
       if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
         CheckCandidateType(LI->getType());
       else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
@@ -1780,14 +2086,12 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy,
            "vector size not a multiple of element size?");
     ElementSize /= 8;
 
-    for (const auto &S : Slices)
-      if (!isVectorPromotionViableForSlice(DL, SliceBeginOffset, SliceEndOffset,
-                                           VTy, ElementSize, S))
+    for (const Slice &S : P)
+      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
         return false;
 
-    for (const auto &SI : SplitUses)
-      if (!isVectorPromotionViableForSlice(DL, SliceBeginOffset, SliceEndOffset,
-                                           VTy, ElementSize, *SI))
+    for (const Slice *S : P.splitSliceTails())
+      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
         return false;
 
     return true;
@@ -1803,12 +2107,13 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy,
 ///
 /// This implements the necessary checking for the \c isIntegerWideningViable
 /// test below on a single slice of the alloca.
-static bool isIntegerWideningViableForSlice(const DataLayout &DL,
-                                            Type *AllocaTy,
+static bool isIntegerWideningViableForSlice(const Slice &S,
                                             uint64_t AllocBeginOffset,
-                                            uint64_t Size,
-                                            const Slice &S,
+                                            Type *AllocaTy,
+                                            const DataLayout &DL,
                                             bool &WholeAllocaOp) {
+  uint64_t Size = DL.getTypeStoreSize(AllocaTy);
+
   uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
   uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
 
@@ -1876,11 +2181,8 @@ static bool isIntegerWideningViableForSlice(const DataLayout &DL,
 /// This is a quick test to check whether we can rewrite the integer loads and
 /// stores to a particular alloca into wider loads and stores and be able to
 /// promote the resulting alloca.
-static bool
-isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy,
-                        uint64_t AllocBeginOffset,
-                        AllocaSlices::const_range Slices,
-                        ArrayRef<AllocaSlices::iterator> SplitUses) {
+static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy,
+                                    const DataLayout &DL) {
   uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
   // Don't create integer types larger than the maximum bitwidth.
   if (SizeInBits > IntegerType::MAX_INT_BITS)
@@ -1898,24 +2200,24 @@ isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy,
       !canConvertValue(DL, IntTy, AllocaTy))
     return false;
 
-  uint64_t Size = DL.getTypeStoreSize(AllocaTy);
-
   // While examining uses, we ensure that the alloca has a covering load or
   // store. We don't want to widen the integer operations only to fail to
   // promote due to some other unsplittable entry (which we may make splittable
   // later). However, if there are only splittable uses, go ahead and assume
   // that we cover the alloca.
+  // FIXME: We shouldn't consider split slices that happen to start in the
+  // partition here...
   bool WholeAllocaOp =
-      Slices.begin() != Slices.end() ? false : DL.isLegalInteger(SizeInBits);
+      P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);
 
-  for (const auto &S : Slices)
-    if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size,
-                                         S, WholeAllocaOp))
+  for (const Slice &S : P)
+    if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+                                         WholeAllocaOp))
       return false;
 
-  for (const auto &SI : SplitUses)
-    if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size,
-                                         *SI, WholeAllocaOp))
+  for (const Slice *S : P.splitSliceTails())
+    if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
+                                         WholeAllocaOp))
       return false;
 
   return WholeAllocaOp;
@@ -1928,9 +2230,9 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
   IntegerType *IntTy = cast<IntegerType>(V->getType());
   assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
          "Element extends past full value");
-  uint64_t ShAmt = 8*Offset;
+  uint64_t ShAmt = 8 * Offset;
   if (DL.isBigEndian())
-    ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+    ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
   if (ShAmt) {
     V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
     DEBUG(dbgs() << "     shifted: " << *V << "\n");
@@ -1957,9 +2259,9 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
   }
   assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
          "Element store outside of alloca store");
-  uint64_t ShAmt = 8*Offset;
+  uint64_t ShAmt = 8 * Offset;
   if (DL.isBigEndian())
-    ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+    ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
   if (ShAmt) {
     V = IRB.CreateShl(V, ShAmt, Name + ".shift");
     DEBUG(dbgs() << "     shifted: " << *V << "\n");
@@ -1975,9 +2277,8 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
   return V;
 }
 
-static Value *extractVector(IRBuilderTy &IRB, Value *V,
-                            unsigned BeginIndex, unsigned EndIndex,
-                            const Twine &Name) {
+static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
+                            unsigned EndIndex, const Twine &Name) {
   VectorType *VecTy = cast<VectorType>(V->getType());
   unsigned NumElements = EndIndex - BeginIndex;
   assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
@@ -1992,13 +2293,12 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V,
     return V;
   }
 
-  SmallVector<Constant*, 8> Mask;
+  SmallVector<Constant *, 8> Mask;
   Mask.reserve(NumElements);
   for (unsigned i = BeginIndex; i != EndIndex; ++i)
     Mask.push_back(IRB.getInt32(i));
   V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
-                              ConstantVector::get(Mask),
-                              Name + ".extract");
+                              ConstantVector::get(Mask), Name + ".extract");
   DEBUG(dbgs() << "     shuffle: " << *V << "\n");
   return V;
 }
@@ -2013,7 +2313,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
     // Single element to insert.
     V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
                                 Name + ".insert");
-    DEBUG(dbgs() <<  "     insert: " << *V << "\n");
+    DEBUG(dbgs() << "     insert: " << *V << "\n");
     return V;
   }
 
@@ -2029,7 +2329,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
   // use a shuffle vector to widen it with undef elements, and then
   // a second shuffle vector to select between the loaded vector and the
   // incoming vector.
-  SmallVector<Constant*, 8> Mask;
+  SmallVector<Constant *, 8> Mask;
   Mask.reserve(VecTy->getNumElements());
   for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
     if (i >= BeginIndex && i < EndIndex)
@@ -2037,8 +2337,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
     else
       Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
   V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
-                              ConstantVector::get(Mask),
-                              Name + ".expand");
+                              ConstantVector::get(Mask), Name + ".expand");
   DEBUG(dbgs() << "    shuffle: " << *V << "\n");
 
   Mask.clear();
@@ -2148,6 +2447,9 @@ public:
     IsSplittable = I->isSplittable();
     IsSplit =
         BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
+    DEBUG(dbgs() << "  rewriting " << (IsSplit ? "split " : ""));
+    DEBUG(AS.printSlice(dbgs(), I, ""));
+    DEBUG(dbgs() << "\n");
 
     // Compute the intersecting offset range.
     assert(BeginOffset < NewAllocaEndOffset);
@@ -2218,7 +2520,8 @@ private:
                           );
   }
 
-  /// \brief Compute suitable alignment to access this slice of the *new* alloca.
+  /// \brief Compute suitable alignment to access this slice of the *new*
+  /// alloca.
   ///
   /// You can optionally pass a type to this routine and if that type's ABI
   /// alignment is itself suitable, this will return zero.
@@ -2226,7 +2529,8 @@ private:
     unsigned NewAIAlign = NewAI.getAlignment();
     if (!NewAIAlign)
       NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
-    unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
+    unsigned Align =
+        MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
     return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align;
   }
 
@@ -2250,16 +2554,14 @@ private:
     unsigned EndIndex = getIndex(NewEndOffset);
     assert(EndIndex > BeginIndex && "Empty vector!");
 
-    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                     "load");
+    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
     return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
   }
 
   Value *rewriteIntegerLoad(LoadInst &LI) {
     assert(IntTy && "We cannot insert an integer to the alloca");
     assert(!LI.isVolatile());
-    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                     "load");
+    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
     V = convertValue(DL, IRB, V, IntTy);
     assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
@@ -2284,8 +2586,8 @@ private:
       V = rewriteIntegerLoad(LI);
     } else if (NewBeginOffset == NewAllocaBeginOffset &&
                canConvertValue(DL, NewAllocaTy, LI.getType())) {
-      V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                LI.isVolatile(), LI.getName());
+      V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(),
+                                LI.getName());
     } else {
       Type *LTy = TargetTy->getPointerTo();
       V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
@@ -2302,7 +2604,7 @@ private:
       assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
              "Split load isn't smaller than original load");
       assert(LI.getType()->getIntegerBitWidth() ==
-             DL.getTypeStoreSizeInBits(LI.getType()) &&
+                 DL.getTypeStoreSizeInBits(LI.getType()) &&
              "Non-byte-multiple bit width");
       // Move the insertion point just past the load so that we can refer to it.
       IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI)));
@@ -2310,9 +2612,9 @@ private:
       // basis for the new value. This allows us to replace the uses of LI with
       // the computed value, and then replace the placeholder with LI, leaving
       // LI only used for this computation.
-      Value *Placeholder
-        = new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
-      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset,
+      Value *Placeholder =
+          new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
                         "insert");
       LI.replaceAllUsesWith(V);
       Placeholder->replaceAllUsesWith(&LI);
@@ -2334,15 +2636,14 @@ private:
       assert(EndIndex > BeginIndex && "Empty vector!");
       unsigned NumElements = EndIndex - BeginIndex;
       assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
-      Type *SliceTy =
-          (NumElements == 1) ? ElementTy
-                             : VectorType::get(ElementTy, NumElements);
+      Type *SliceTy = (NumElements == 1)
+                          ? ElementTy
+                          : VectorType::get(ElementTy, NumElements);
       if (V->getType() != SliceTy)
         V = convertValue(DL, IRB, V, SliceTy);
 
       // Mix in the existing elements.
-      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                         "load");
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
       V = insertVector(IRB, Old, V, BeginIndex, "vec");
     }
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
@@ -2357,13 +2658,12 @@ private:
     assert(IntTy && "We cannot extract an integer from the alloca");
     assert(!SI.isVolatile());
     if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
-      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                         "oldload");
+      Value *Old =
+          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
       Old = convertValue(DL, IRB, Old, IntTy);
       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
-      V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset,
-                        "insert");
+      V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
     }
     V = convertValue(DL, IRB, V, NewAllocaTy);
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
@@ -2391,10 +2691,10 @@ private:
       assert(V->getType()->isIntegerTy() &&
              "Only integer type loads and stores are split");
       assert(V->getType()->getIntegerBitWidth() ==
-             DL.getTypeStoreSizeInBits(V->getType()) &&
+                 DL.getTypeStoreSizeInBits(V->getType()) &&
              "Non-byte-multiple bit width");
       IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
-      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset,
+      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
                          "extract");
     }
 
@@ -2439,14 +2739,14 @@ private:
     if (Size == 1)
       return V;
 
-    Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8);
-    V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"),
-                      ConstantExpr::getUDiv(
-                        Constant::getAllOnesValue(SplatIntTy),
-                        ConstantExpr::getZExt(
-                          Constant::getAllOnesValue(V->getType()),
-                          SplatIntTy)),
-                      "isplat");
+    Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
+    V = IRB.CreateMul(
+        IRB.CreateZExt(V, SplatIntTy, "zext"),
+        ConstantExpr::getUDiv(
+            Constant::getAllOnesValue(SplatIntTy),
+            ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
+                                  SplatIntTy)),
+        "isplat");
     return V;
   }
 
@@ -2483,12 +2783,11 @@ private:
     // If this doesn't map cleanly onto the alloca type, and that type isn't
     // a single value type, just emit a memset.
     if (!VecTy && !IntTy &&
-        (BeginOffset > NewAllocaBeginOffset ||
-         EndOffset < NewAllocaEndOffset ||
+        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
          SliceSize != DL.getTypeStoreSize(AllocaTy) ||
          !AllocaTy->isSingleValueType() ||
          !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
-         DL.getTypeSizeInBits(ScalarTy)%8 != 0)) {
+         DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) {
       Type *SizeTy = II.getLength()->getType();
       Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
       CallInst *New = IRB.CreateMemSet(
@@ -2522,8 +2821,8 @@ private:
       if (NumElements > 1)
         Splat = getVectorSplat(Splat, NumElements);
 
-      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                         "oldload");
+      Value *Old =
+          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
       V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
     } else if (IntTy) {
       // If this is a memset on an alloca where we can widen stores, insert the
@@ -2535,8 +2834,8 @@ private:
 
       if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
                     EndOffset != NewAllocaBeginOffset)) {
-        Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                           "oldload");
+        Value *Old =
+            IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
         Old = convertValue(DL, IRB, Old, IntTy);
         uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
         V = insertInteger(DL, IRB, Old, V, Offset, "insert");
@@ -2633,8 +2932,8 @@ private:
     // Strip all inbounds GEPs and pointer casts to try to dig out any root
     // alloca that should be re-examined after rewriting this instruction.
     Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
-    if (AllocaInst *AI
-          = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+    if (AllocaInst *AI =
+            dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
       assert(AI != &OldAI && AI != &NewAI &&
              "Splittable transfers cannot reach the same alloca on both ends.");
       Pass.Worklist.insert(AI);
@@ -2673,8 +2972,8 @@ private:
     unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
     unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
     unsigned NumElements = EndIndex - BeginIndex;
-    IntegerType *SubIntTy
-      = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : nullptr;
+    IntegerType *SubIntTy =
+        IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
 
     // Reset the other pointer type to match the register type we're going to
     // use, but using the address space of the original other pointer.
@@ -2703,27 +3002,25 @@ private:
 
     Value *Src;
     if (VecTy && !IsWholeAlloca && !IsDest) {
-      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                  "load");
+      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
       Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
     } else if (IntTy && !IsWholeAlloca && !IsDest) {
-      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                  "load");
+      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
       Src = convertValue(DL, IRB, Src, IntTy);
       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
       Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
     } else {
-      Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(),
-                                  "copyload");
+      Src =
+          IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
     }
 
     if (VecTy && !IsWholeAlloca && IsDest) {
-      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                         "oldload");
+      Value *Old =
+          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
       Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
     } else if (IntTy && !IsWholeAlloca && IsDest) {
-      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                         "oldload");
+      Value *Old =
+          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
       Old = convertValue(DL, IRB, Old, IntTy);
       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
       Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
@@ -2746,8 +3043,8 @@ private:
     // Record this instruction for deletion.
     Pass.DeadInsts.insert(&II);
 
-    ConstantInt *Size
-      = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+    ConstantInt *Size =
+        ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
                          NewEndOffset - NewBeginOffset);
     Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
     Value *New;
@@ -2814,7 +3111,6 @@ private:
     SelectUsers.insert(&SI);
     return true;
   }
-
 };
 }
 
@@ -2869,8 +3165,7 @@ private:
   bool visitInstruction(Instruction &I) { return false; }
 
   /// \brief Generic recursive split emission class.
-  template <typename Derived>
-  class OpSplitter {
+  template <typename Derived> class OpSplitter {
   protected:
     /// The builder used to form new instructions.
     IRBuilderTy IRB;
@@ -2887,7 +3182,7 @@ private:
     /// Initialize the splitter with an insertion point, Ptr and start with a
     /// single zero GEP index.
     OpSplitter(Instruction *InsertionPoint, Value *Ptr)
-      : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
+        : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
 
   public:
     /// \brief Generic recursive split emission routine.
@@ -2943,7 +3238,7 @@ private:
 
   struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
     LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
-      : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+        : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
 
     /// Emit a leaf load of a single value. This is called at the leaves of the
     /// recursive emission to actually load values.
@@ -2974,7 +3269,7 @@ private:
 
   struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
     StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
-      : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+        : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
 
     /// Emit a leaf store of a single value. This is called at the leaves of the
     /// recursive emission to actually produce stores.
@@ -2982,8 +3277,8 @@ private:
       assert(Ty->isSingleValueType());
       // Extract the single value and store it using the indices.
       Value *Store = IRB.CreateStore(
-        IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
-        IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+          IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
+          IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
       (void)Store;
       DEBUG(dbgs() << "          to: " << *Store << "\n");
     }
@@ -3069,8 +3364,8 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
 /// when the size or offset cause either end of type-based partition to be off.
 /// Also, this is a best-effort routine. It is reasonable to give up and not
 /// return a type if necessary.
-static Type *getTypePartition(const DataLayout &DL, Type *Ty,
-                              uint64_t Offset, uint64_t Size) {
+static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
+                              uint64_t Size) {
   if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
     return stripAggregateTypeWrapping(DL, Ty);
   if (Offset > DL.getTypeAllocSize(Ty) ||
@@ -3162,8 +3457,8 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty,
   }
 
   // Try to build up a sub-structure.
-  StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE),
-                                      STy->isPacked());
+  StructType *SubTy =
+      StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
   const StructLayout *SubSL = DL.getStructLayout(SubTy);
   if (Size != SubSL->getSizeInBytes())
     return nullptr; // The sub-struct doesn't have quite the size needed.
@@ -3171,6 +3466,494 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty,
   return SubTy;
 }
 
+/// \brief Pre-split loads and stores to simplify rewriting.
+///
+/// We want to break up the splittable load+store pairs as much as
+/// possible. This is important to do as a preprocessing step, as once we
+/// start rewriting the accesses to partitions of the alloca we lose the
+/// necessary information to correctly split apart paired loads and stores
+/// which both point into this alloca. The case to consider is something like
+/// the following:
+///
+///   %a = alloca [12 x i8]
+///   %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+///   %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+///   %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+///   %iptr1 = bitcast i8* %gep1 to i64*
+///   %iptr2 = bitcast i8* %gep2 to i64*
+///   %fptr1 = bitcast i8* %gep1 to float*
+///   %fptr2 = bitcast i8* %gep2 to float*
+///   %fptr3 = bitcast i8* %gep3 to float*
+///   store float 0.0, float* %fptr1
+///   store float 1.0, float* %fptr2
+///   %v = load i64* %iptr1
+///   store i64 %v, i64* %iptr2
+///   %f1 = load float* %fptr2
+///   %f2 = load float* %fptr3
+///
+/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
+/// promote everything so we recover the 2 SSA values that should have been
+/// there all along.
+///
+/// \returns true if any changes are made.
+bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+  DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+
+  // Track the loads and stores which are candidates for pre-splitting here, in
+  // the order they first appear during the partition scan. These give stable
+  // iteration order and a basis for tracking which loads and stores we
+  // actually split.
+  SmallVector<LoadInst *, 4> Loads;
+  SmallVector<StoreInst *, 4> Stores;
+
+  // We need to accumulate the splits required of each load or store where we
+  // can find them via a direct lookup. This is important to cross-check loads
+  // and stores against each other. We also track the slice so that we can kill
+  // all the slices that end up split.
+  struct SplitOffsets {
+    Slice *S;
+    std::vector<uint64_t> Splits;
+  };
+  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
+
+  // Track loads out of this alloca which cannot, for any reason, be pre-split.
+  // This is important as we also cannot pre-split stores of those loads!
+  // FIXME: This is all pretty gross. It means that we can be more aggressive
+  // in pre-splitting when the load feeding the store happens to come from
+  // a separate alloca. Put another way, the effectiveness of SROA would be
+  // decreased by a frontend which just concatenated all of its local allocas
+  // into one big flat alloca. But defeating such patterns is exactly the job
+  // SROA is tasked with! Sadly, to not have this discrepancy we would have
+  // change store pre-splitting to actually force pre-splitting of the load
+  // that feeds it *and all stores*. That makes pre-splitting much harder, but
+  // maybe it would make it more principled?
+  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
+
+  DEBUG(dbgs() << "  Searching for candidate loads and stores\n");
+  for (auto &P : AS.partitions()) {
+    for (Slice &S : P) {
+      Instruction *I = cast<Instruction>(S.getUse()->getUser());
+      if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) {
+        // If this was a load we have to track that it can't participate in any
+        // pre-splitting!
+        if (auto *LI = dyn_cast<LoadInst>(I))
+          UnsplittableLoads.insert(LI);
+        continue;
+      }
+      assert(P.endOffset() > S.beginOffset() &&
+             "Empty or backwards partition!");
+
+      // Determine if this is a pre-splittable slice.
+      if (auto *LI = dyn_cast<LoadInst>(I)) {
+        assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+        // The load must be used exclusively to store into other pointers for
+        // us to be able to arbitrarily pre-split it. The stores must also be
+        // simple to avoid changing semantics.
+        auto IsLoadSimplyStored = [](LoadInst *LI) {
+          for (User *LU : LI->users()) {
+            auto *SI = dyn_cast<StoreInst>(LU);
+            if (!SI || !SI->isSimple())
+              return false;
+          }
+          return true;
+        };
+        if (!IsLoadSimplyStored(LI)) {
+          UnsplittableLoads.insert(LI);
+          continue;
+        }
+
+        Loads.push_back(LI);
+      } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) {
+        if (!SI ||
+            S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+          continue;
+        auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+        if (!StoredLoad || !StoredLoad->isSimple())
+          continue;
+        assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+        Stores.push_back(SI);
+      } else {
+        // Other uses cannot be pre-split.
+        continue;
+      }
+
+      // Record the initial split.
+      DEBUG(dbgs() << "    Candidate: " << *I << "\n");
+      auto &Offsets = SplitOffsetsMap[I];
+      assert(Offsets.Splits.empty() &&
+             "Should not have splits the first time we see an instruction!");
+      Offsets.S = &S;
+      Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+    }
+
+    // Now scan the already split slices, and add a split for any of them which
+    // we're going to pre-split.
+    for (Slice *S : P.splitSliceTails()) {
+      auto SplitOffsetsMapI =
+          SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+      if (SplitOffsetsMapI == SplitOffsetsMap.end())
+        continue;
+      auto &Offsets = SplitOffsetsMapI->second;
+
+      assert(Offsets.S == S && "Found a mismatched slice!");
+      assert(!Offsets.Splits.empty() &&
+             "Cannot have an empty set of splits on the second partition!");
+      assert(Offsets.Splits.back() ==
+                 P.beginOffset() - Offsets.S->beginOffset() &&
+             "Previous split does not end where this one begins!");
+
+      // Record each split. The last partition's end isn't needed as the size
+      // of the slice dictates that.
+      if (S->endOffset() > P.endOffset())
+        Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
+    }
+  }
+
+  // We may have split loads where some of their stores are split stores. For
+  // such loads and stores, we can only pre-split them if their splits exactly
+  // match relative to their starting offset. We have to verify this prior to
+  // any rewriting.
+  Stores.erase(
+      std::remove_if(Stores.begin(), Stores.end(),
+                     [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+                       // Lookup the load we are storing in our map of split
+                       // offsets.
+                       auto *LI = cast<LoadInst>(SI->getValueOperand());
+                       // If it was completely unsplittable, then we're done,
+                       // and this store can't be pre-split.
+                       if (UnsplittableLoads.count(LI))
+                         return true;
+
+                       auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+                       if (LoadOffsetsI == SplitOffsetsMap.end())
+                         return false; // Unrelated loads are definitely safe.
+                       auto &LoadOffsets = LoadOffsetsI->second;
+
+                       // Now lookup the store's offsets.
+                       auto &StoreOffsets = SplitOffsetsMap[SI];
+
+                       // If the relative offsets of each split in the load and
+                       // store match exactly, then we can split them and we
+                       // don't need to remove them here.
+                       if (LoadOffsets.Splits == StoreOffsets.Splits)
+                         return false;
+
+                       DEBUG(dbgs()
+                             << "    Mismatched splits for load and store:\n"
+                             << "      " << *LI << "\n"
+                             << "      " << *SI << "\n");
+
+                       // We've found a store and load that we need to split
+                       // with mismatched relative splits. Just give up on them
+                       // and remove both instructions from our list of
+                       // candidates.
+                       UnsplittableLoads.insert(LI);
+                       return true;
+                     }),
+      Stores.end());
+  // Now we have to go *back* through all te stores, because a later store may
+  // have caused an earlier store's load to become unsplittable and if it is
+  // unsplittable for the later store, then we can't rely on it being split in
+  // the earlier store either.
+  Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
+                              [&UnsplittableLoads](StoreInst *SI) {
+                                auto *LI =
+                                    cast<LoadInst>(SI->getValueOperand());
+                                return UnsplittableLoads.count(LI);
+                              }),
+               Stores.end());
+  // Once we've established all the loads that can't be split for some reason,
+  // filter any that made it into our list out.
+  Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
+                             [&UnsplittableLoads](LoadInst *LI) {
+                               return UnsplittableLoads.count(LI);
+                             }),
+              Loads.end());
+
+
+  // If no loads or stores are left, there is no pre-splitting to be done for
+  // this alloca.
+  if (Loads.empty() && Stores.empty())
+    return false;
+
+  // From here on, we can't fail and will be building new accesses, so rig up
+  // an IR builder.
+  IRBuilderTy IRB(&AI);
+
+  // Collect the new slices which we will merge into the alloca slices.
+  SmallVector<Slice, 4> NewSlices;
+
+  // Track any allocas we end up splitting loads and stores for so we iterate
+  // on them.
+  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+  // At this point, we have collected all of the loads and stores we can
+  // pre-split, and the specific splits needed for them. We actually do the
+  // splitting in a specific order in order to handle when one of the loads in
+  // the value operand to one of the stores.
+  //
+  // First, we rewrite all of the split loads, and just accumulate each split
+  // load in a parallel structure. We also build the slices for them and append
+  // them to the alloca slices.
+  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+  std::vector<LoadInst *> SplitLoads;
+  for (LoadInst *LI : Loads) {
+    SplitLoads.clear();
+
+    IntegerType *Ty = cast<IntegerType>(LI->getType());
+    uint64_t LoadSize = Ty->getBitWidth() / 8;
+    assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+    auto &Offsets = SplitOffsetsMap[LI];
+    assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+           "Slice size should always match load size exactly!");
+    uint64_t BaseOffset = Offsets.S->beginOffset();
+    assert(BaseOffset + LoadSize > BaseOffset &&
+           "Cannot represent alloca access size using 64-bit integers!");
+
+    Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+    IRB.SetInsertPoint(BasicBlock::iterator(LI));
+
+    DEBUG(dbgs() << "  Splitting load: " << *LI << "\n");
+
+    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+    int Idx = 0, Size = Offsets.Splits.size();
+    for (;;) {
+      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+      auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+      LoadInst *PLoad = IRB.CreateAlignedLoad(
+          getAdjustedPtr(IRB, *DL, BasePtr,
+                         APInt(DL->getPointerSizeInBits(), PartOffset),
+                         PartPtrTy, BasePtr->getName() + "."),
+          getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+          LI->getName());
+
+      // Append this load onto the list of split loads so we can find it later
+      // to rewrite the stores.
+      SplitLoads.push_back(PLoad);
+
+      // Now build a new slice for the alloca.
+      NewSlices.push_back(
+          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+                &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+                /*IsSplittable*/ false));
+      DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
+                   << ", " << NewSlices.back().endOffset() << "): " << *PLoad
+                   << "\n");
+
+      // See if we've handled all the splits.
+      if (Idx >= Size)
+        break;
+
+      // Setup the next partition.
+      PartOffset = Offsets.Splits[Idx];
+      ++Idx;
+      PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+    }
+
+    // Now that we have the split loads, do the slow walk over all uses of the
+    // load and rewrite them as split stores, or save the split loads to use
+    // below if the store is going to be split there anyways.
+    bool DeferredStores = false;
+    for (User *LU : LI->users()) {
+      StoreInst *SI = cast<StoreInst>(LU);
+      if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+        DeferredStores = true;
+        DEBUG(dbgs() << "    Deferred splitting of store: " << *SI << "\n");
+        continue;
+      }
+
+      Value *StoreBasePtr = SI->getPointerOperand();
+      IRB.SetInsertPoint(BasicBlock::iterator(SI));
+
+      DEBUG(dbgs() << "    Splitting store of load: " << *SI << "\n");
+
+      for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+        LoadInst *PLoad = SplitLoads[Idx];
+        uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+        auto *PartPtrTy =
+            PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+        StoreInst *PStore = IRB.CreateAlignedStore(
+            PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+                                  APInt(DL->getPointerSizeInBits(), PartOffset),
+                                  PartPtrTy, StoreBasePtr->getName() + "."),
+            getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+        (void)PStore;
+        DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
+      }
+
+      // We want to immediately iterate on any allocas impacted by splitting
+      // this store, and we have to track any promotable alloca (indicated by
+      // a direct store) as needing to be resplit because it is no longer
+      // promotable.
+      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+        ResplitPromotableAllocas.insert(OtherAI);
+        Worklist.insert(OtherAI);
+      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+                     StoreBasePtr->stripInBoundsOffsets())) {
+        Worklist.insert(OtherAI);
+      }
+
+      // Mark the original store as dead.
+      DeadInsts.insert(SI);
+    }
+
+    // Save the split loads if there are deferred stores among the users.
+    if (DeferredStores)
+      SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+    // Mark the original load as dead and kill the original slice.
+    DeadInsts.insert(LI);
+    Offsets.S->kill();
+  }
+
+  // Second, we rewrite all of the split stores. At this point, we know that
+  // all loads from this alloca have been split already. For stores of such
+  // loads, we can simply look up the pre-existing split loads. For stores of
+  // other loads, we split those loads first and then write split stores of
+  // them.
+  for (StoreInst *SI : Stores) {
+    auto *LI = cast<LoadInst>(SI->getValueOperand());
+    IntegerType *Ty = cast<IntegerType>(LI->getType());
+    uint64_t StoreSize = Ty->getBitWidth() / 8;
+    assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+    auto &Offsets = SplitOffsetsMap[SI];
+    assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+           "Slice size should always match load size exactly!");
+    uint64_t BaseOffset = Offsets.S->beginOffset();
+    assert(BaseOffset + StoreSize > BaseOffset &&
+           "Cannot represent alloca access size using 64-bit integers!");
+
+    Value *LoadBasePtr = LI->getPointerOperand();
+    Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+    DEBUG(dbgs() << "  Splitting store: " << *SI << "\n");
+
+    // Check whether we have an already split load.
+    auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+    std::vector<LoadInst *> *SplitLoads = nullptr;
+    if (SplitLoadsMapI != SplitLoadsMap.end()) {
+      SplitLoads = &SplitLoadsMapI->second;
+      assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+             "Too few split loads for the number of splits in the store!");
+    } else {
+      DEBUG(dbgs() << "          of load: " << *LI << "\n");
+    }
+
+    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+    int Idx = 0, Size = Offsets.Splits.size();
+    for (;;) {
+      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+      auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+      // Either lookup a split load or create one.
+      LoadInst *PLoad;
+      if (SplitLoads) {
+        PLoad = (*SplitLoads)[Idx];
+      } else {
+        IRB.SetInsertPoint(BasicBlock::iterator(LI));
+        PLoad = IRB.CreateAlignedLoad(
+            getAdjustedPtr(IRB, *DL, LoadBasePtr,
+                           APInt(DL->getPointerSizeInBits(), PartOffset),
+                           PartPtrTy, LoadBasePtr->getName() + "."),
+            getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+            LI->getName());
+      }
+
+      // And store this partition.
+      IRB.SetInsertPoint(BasicBlock::iterator(SI));
+      StoreInst *PStore = IRB.CreateAlignedStore(
+          PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+                                APInt(DL->getPointerSizeInBits(), PartOffset),
+                                PartPtrTy, StoreBasePtr->getName() + "."),
+          getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+
+      // Now build a new slice for the alloca.
+      NewSlices.push_back(
+          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+                &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+                /*IsSplittable*/ false));
+      DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
+                   << ", " << NewSlices.back().endOffset() << "): " << *PStore
+                   << "\n");
+      if (!SplitLoads) {
+        DEBUG(dbgs() << "      of split load: " << *PLoad << "\n");
+      }
+
+      // See if we've finished all the splits.
+      if (Idx >= Size)
+        break;
+
+      // Setup the next partition.
+      PartOffset = Offsets.Splits[Idx];
+      ++Idx;
+      PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+    }
+
+    // We want to immediately iterate on any allocas impacted by splitting
+    // this load, which is only relevant if it isn't a load of this alloca and
+    // thus we didn't already split the loads above. We also have to keep track
+    // of any promotable allocas we split loads on as they can no longer be
+    // promoted.
+    if (!SplitLoads) {
+      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+        assert(OtherAI != &AI && "We can't re-split our own alloca!");
+        ResplitPromotableAllocas.insert(OtherAI);
+        Worklist.insert(OtherAI);
+      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+                     LoadBasePtr->stripInBoundsOffsets())) {
+        assert(OtherAI != &AI && "We can't re-split our own alloca!");
+        Worklist.insert(OtherAI);
+      }
+    }
+
+    // Mark the original store as dead now that we've split it up and kill its
+    // slice. Note that we leave the original load in place unless this store
+    // was its ownly use. It may in turn be split up if it is an alloca load
+    // for some other alloca, but it may be a normal load. This may introduce
+    // redundant loads, but where those can be merged the rest of the optimizer
+    // should handle the merging, and this uncovers SSA splits which is more
+    // important. In practice, the original loads will almost always be fully
+    // split and removed eventually, and the splits will be merged by any
+    // trivial CSE, including instcombine.
+    if (LI->hasOneUse()) {
+      assert(*LI->user_begin() == SI && "Single use isn't this store!");
+      DeadInsts.insert(LI);
+    }
+    DeadInsts.insert(SI);
+    Offsets.S->kill();
+  }
+
+  // Remove the killed slices that have ben pre-split.
+  AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
+    return S.isDead();
+  }), AS.end());
+
+  // Insert our new slices. This will sort and merge them into the sorted
+  // sequence.
+  AS.insert(NewSlices);
+
+  DEBUG(dbgs() << "  Pre-split slices:\n");
+#ifndef NDEBUG
+  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+    DEBUG(AS.print(dbgs(), I, "    "));
+#endif
+
+  // Finally, don't try to promote any allocas that new require re-splitting.
+  // They have already been added to the worklist above.
+  PromotableAllocas.erase(
+      std::remove_if(
+          PromotableAllocas.begin(), PromotableAllocas.end(),
+          [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
+      PromotableAllocas.end());
+
+  return true;
+}
+
 /// \brief Rewrite an alloca partition's users.
 ///
 /// This routine drives both of the rewriting goals of the SROA pass. It tries
@@ -3181,40 +3964,31 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty,
 /// appropriate new offsets. It also evaluates how successful the rewrite was
 /// at enabling promotion and if it was successful queues the alloca to be
 /// promoted.
-bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
-                            AllocaSlices::iterator B, AllocaSlices::iterator E,
-                            int64_t BeginOffset, int64_t EndOffset,
-                            ArrayRef<AllocaSlices::iterator> SplitUses) {
-  assert(BeginOffset < EndOffset);
-  uint64_t SliceSize = EndOffset - BeginOffset;
-
+AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+                                   AllocaSlices::Partition &P) {
   // Try to compute a friendly type for this partition of the alloca. This
   // won't always succeed, in which case we fall back to a legal integer type
   // or an i8 array of an appropriate size.
   Type *SliceTy = nullptr;
-  if (Type *CommonUseTy = findCommonType(B, E, EndOffset))
-    if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize)
+  if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
+    if (DL->getTypeAllocSize(CommonUseTy) >= P.size())
       SliceTy = CommonUseTy;
   if (!SliceTy)
     if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(),
-                                                 BeginOffset, SliceSize))
+                                                 P.beginOffset(), P.size()))
       SliceTy = TypePartitionTy;
   if ((!SliceTy || (SliceTy->isArrayTy() &&
                     SliceTy->getArrayElementType()->isIntegerTy())) &&
-      DL->isLegalInteger(SliceSize * 8))
-    SliceTy = Type::getIntNTy(*C, SliceSize * 8);
+      DL->isLegalInteger(P.size() * 8))
+    SliceTy = Type::getIntNTy(*C, P.size() * 8);
   if (!SliceTy)
-    SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize);
-  assert(DL->getTypeAllocSize(SliceTy) >= SliceSize);
+    SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
+  assert(DL->getTypeAllocSize(SliceTy) >= P.size());
 
-  bool IsIntegerPromotable = isIntegerWideningViable(
-      *DL, SliceTy, BeginOffset, AllocaSlices::const_range(B, E), SplitUses);
+  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL);
 
   VectorType *VecTy =
-      IsIntegerPromotable
-          ? nullptr
-          : isVectorPromotionViable(*DL, SliceTy, BeginOffset, EndOffset,
-                                    AllocaSlices::const_range(B, E), SplitUses);
+      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL);
   if (VecTy)
     SliceTy = VecTy;
 
@@ -3224,11 +3998,12 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   // perform phi and select speculation.
   AllocaInst *NewAI;
   if (SliceTy == AI.getAllocatedType()) {
-    assert(BeginOffset == 0 &&
+    assert(P.beginOffset() == 0 &&
            "Non-zero begin offset but same alloca type");
     NewAI = &AI;
     // FIXME: We should be able to bail at this point with "nothing changed".
     // FIXME: We might want to defer PHI speculation until after here.
+    // FIXME: return nullptr;
   } else {
     unsigned Alignment = AI.getAlignment();
     if (!Alignment) {
@@ -3237,20 +4012,20 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
       // type.
       Alignment = DL->getABITypeAlignment(AI.getAllocatedType());
     }
-    Alignment = MinAlign(Alignment, BeginOffset);
+    Alignment = MinAlign(Alignment, P.beginOffset());
     // If we will get at least this much alignment from the type alone, leave
     // the alloca's alignment unconstrained.
     if (Alignment <= DL->getABITypeAlignment(SliceTy))
       Alignment = 0;
-    NewAI =
-        new AllocaInst(SliceTy, nullptr, Alignment,
-                       AI.getName() + ".sroa." + Twine(B - AS.begin()), &AI);
+    NewAI = new AllocaInst(
+        SliceTy, nullptr, Alignment,
+        AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
     ++NumNewAllocas;
   }
 
   DEBUG(dbgs() << "Rewriting alloca partition "
-               << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI
-               << "\n");
+               << "[" << P.beginOffset() << "," << P.endOffset()
+               << ") to: " << *NewAI << "\n");
 
   // Track the high watermark on the worklist as it is only relevant for
   // promoted allocas. We will reset it to this point if the alloca is not in
@@ -3260,20 +4035,16 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   SmallPtrSet<PHINode *, 8> PHIUsers;
   SmallPtrSet<SelectInst *, 8> SelectUsers;
 
-  AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, BeginOffset,
-                               EndOffset, IsIntegerPromotable, VecTy, PHIUsers,
-                               SelectUsers);
+  AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(),
+                               P.endOffset(), IsIntegerPromotable, VecTy,
+                               PHIUsers, SelectUsers);
   bool Promotable = true;
-  for (auto & SplitUse : SplitUses) {
-    DEBUG(dbgs() << "  rewriting split ");
-    DEBUG(AS.printSlice(dbgs(), SplitUse, ""));
-    Promotable &= Rewriter.visit(SplitUse);
+  for (Slice *S : P.splitSliceTails()) {
+    Promotable &= Rewriter.visit(S);
     ++NumUses;
   }
-  for (AllocaSlices::iterator I = B; I != E; ++I) {
-    DEBUG(dbgs() << "  rewriting ");
-    DEBUG(AS.printSlice(dbgs(), I, ""));
-    Promotable &= Rewriter.visit(I);
+  for (Slice &S : P) {
+    Promotable &= Rewriter.visit(&S);
     ++NumUses;
   }
 
@@ -3328,32 +4099,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
       PostPromotionWorklist.pop_back();
   }
 
-  return true;
-}
-
-static void
-removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses,
-                        uint64_t &MaxSplitUseEndOffset, uint64_t Offset) {
-  if (Offset >= MaxSplitUseEndOffset) {
-    SplitUses.clear();
-    MaxSplitUseEndOffset = 0;
-    return;
-  }
-
-  size_t SplitUsesOldSize = SplitUses.size();
-  SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(),
-                                 [Offset](const AllocaSlices::iterator &I) {
-                    return I->endOffset() <= Offset;
-                  }),
-                  SplitUses.end());
-  if (SplitUsesOldSize == SplitUses.size())
-    return;
-
-  // Recompute the max. While this is linear, so is remove_if.
-  MaxSplitUseEndOffset = 0;
-  for (AllocaSlices::iterator SplitUse : SplitUses)
-    MaxSplitUseEndOffset =
-        std::max(SplitUse->endOffset(), MaxSplitUseEndOffset);
+  return NewAI;
 }
 
 /// \brief Walks the slices of an alloca and form partitions based on them,
@@ -3364,108 +4110,100 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
 
   unsigned NumPartitions = 0;
   bool Changed = false;
-  SmallVector<AllocaSlices::iterator, 4> SplitUses;
-  uint64_t MaxSplitUseEndOffset = 0;
-
-  uint64_t BeginOffset = AS.begin()->beginOffset();
-
-  for (AllocaSlices::iterator SI = AS.begin(), SJ = std::next(SI),
-                              SE = AS.end();
-       SI != SE; SI = SJ) {
-    uint64_t MaxEndOffset = SI->endOffset();
-
-    if (!SI->isSplittable()) {
-      // When we're forming an unsplittable region, it must always start at the
-      // first slice and will extend through its end.
-      assert(BeginOffset == SI->beginOffset());
-
-      // Form a partition including all of the overlapping slices with this
-      // unsplittable slice.
-      while (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
-        if (!SJ->isSplittable())
-          MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
-        ++SJ;
-      }
-    } else {
-      assert(SI->isSplittable()); // Established above.
-
-      // Collect all of the overlapping splittable slices.
-      while (SJ != SE && SJ->beginOffset() < MaxEndOffset &&
-             SJ->isSplittable()) {
-        MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
-        ++SJ;
-      }
-
-      // Back up MaxEndOffset and SJ if we ended the span early when
-      // encountering an unsplittable slice.
-      if (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
-        assert(!SJ->isSplittable());
-        MaxEndOffset = SJ->beginOffset();
-      }
-    }
-
-    // Check if we have managed to move the end offset forward yet. If so,
-    // we'll have to rewrite uses and erase old split uses.
-    if (BeginOffset < MaxEndOffset) {
-      // Rewrite a sequence of overlapping slices.
-      Changed |= rewritePartition(AI, AS, SI, SJ, BeginOffset, MaxEndOffset,
-                                  SplitUses);
-      ++NumPartitions;
-
-      removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset);
-    }
 
-    // Accumulate all the splittable slices from the [SI,SJ) region which
-    // overlap going forward.
-    for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK)
-      if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) {
-        SplitUses.push_back(SK);
-        MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset);
-      }
-
-    // If we're already at the end and we have no split uses, we're done.
-    if (SJ == SE && SplitUses.empty())
-      break;
+  // First try to pre-split loads and stores.
+  Changed |= presplitLoadsAndStores(AI, AS);
 
-    // If we have no split uses or no gap in offsets, we're ready to move to
-    // the next slice.
-    if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) {
-      BeginOffset = SJ->beginOffset();
+  // Now that we have identified any pre-splitting opportunities, mark any
+  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+  // to split these during pre-splitting, we want to force them to be
+  // rewritten into a partition.
+  bool IsSorted = true;
+  for (Slice &S : AS) {
+    if (!S.isSplittable())
       continue;
-    }
-
-    // Even if we have split slices, if the next slice is splittable and the
-    // split slices reach it, we can simply set up the beginning offset of the
-    // next iteration to bridge between them.
-    if (SJ != SE && SJ->isSplittable() &&
-        MaxSplitUseEndOffset > SJ->beginOffset()) {
-      BeginOffset = MaxEndOffset;
+    // FIXME: We currently leave whole-alloca splittable loads and stores. This
+    // used to be the only splittable loads and stores and we need to be
+    // confident that the above handling of splittable loads and stores is
+    // completely sufficient before we forcibly disable the remaining handling.
+    if (S.beginOffset() == 0 &&
+        S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType()))
       continue;
+    if (isa<LoadInst>(S.getUse()->getUser()) ||
+        isa<StoreInst>(S.getUse()->getUser())) {
+      S.makeUnsplittable();
+      IsSorted = false;
+    }
+  }
+  if (!IsSorted)
+    std::sort(AS.begin(), AS.end());
+
+  /// \brief Describes the allocas introduced by rewritePartition
+  /// in order to migrate the debug info.
+  struct Piece {
+    AllocaInst *Alloca;
+    uint64_t Offset;
+    uint64_t Size;
+    Piece(AllocaInst *AI, uint64_t O, uint64_t S)
+      : Alloca(AI), Offset(O), Size(S) {}
+  };
+  SmallVector<Piece, 4> Pieces;
+
+  // Rewrite each partition.
+  for (auto &P : AS.partitions()) {
+    if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
+      Changed = true;
+      if (NewAI != &AI) {
+        uint64_t SizeOfByte = 8;
+        uint64_t AllocaSize = DL->getTypeSizeInBits(NewAI->getAllocatedType());
+        // Don't include any padding.
+        uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
+        Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
+      }
     }
-
-    // Otherwise, we have a tail of split slices. Rewrite them with an empty
-    // range of slices.
-    uint64_t PostSplitEndOffset =
-        SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset();
-
-    Changed |= rewritePartition(AI, AS, SJ, SJ, MaxEndOffset,
-                                PostSplitEndOffset, SplitUses);
     ++NumPartitions;
-
-    if (SJ == SE)
-      break; // Skip the rest, we don't need to do any cleanup.
-
-    removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset,
-                            PostSplitEndOffset);
-
-    // Now just reset the begin offset for the next iteration.
-    BeginOffset = SJ->beginOffset();
   }
 
   NumAllocaPartitions += NumPartitions;
   MaxPartitionsPerAlloca =
       std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
 
+  // Migrate debug information from the old alloca to the new alloca(s)
+  // and the individial partitions.
+  if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
+    DIVariable Var(DbgDecl->getVariable());
+    DIExpression Expr(DbgDecl->getExpression());
+    DIBuilder DIB(*AI.getParent()->getParent()->getParent(),
+                  /*AllowUnresolved*/ false);
+    bool IsSplit = Pieces.size() > 1;
+    for (auto Piece : Pieces) {
+      // Create a piece expression describing the new partition or reuse AI's
+      // expression if there is only one partition.
+      DIExpression PieceExpr = Expr;
+      if (IsSplit || Expr.isBitPiece()) {
+        // If this alloca is already a scalar replacement of a larger aggregate,
+        // Piece.Offset describes the offset inside the scalar.
+        uint64_t Offset = Expr.isBitPiece() ? Expr.getBitPieceOffset() : 0;
+        uint64_t Start = Offset + Piece.Offset;
+        uint64_t Size = Piece.Size;
+        if (Expr.isBitPiece()) {
+          uint64_t AbsEnd = Expr.getBitPieceOffset() + Expr.getBitPieceSize();
+          if (Start >= AbsEnd)
+            // No need to describe a SROAed padding.
+            continue;
+          Size = std::min(Size, AbsEnd - Start);
+        }
+        PieceExpr = DIB.createBitPieceExpression(Start, Size);
+      }
+
+      // Remove any existing dbg.declare intrinsic describing the same alloca.
+      if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
+        OldDDI->eraseFromParent();
+
+      auto *NewDDI = DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, &AI);
+      NewDDI->setDebugLoc(DbgDecl->getDebugLoc());
+    }
+  }
   return Changed;
 }
 
@@ -3561,7 +4299,8 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
 ///
 /// We also record the alloca instructions deleted here so that they aren't
 /// subsequently handed to mem2reg to promote.
-void SROA::deleteDeadInstructions(SmallPtrSetImpl<AllocaInst*> &DeletedAllocas) {
+void SROA::deleteDeadInstructions(
+    SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
   while (!DeadInsts.empty()) {
     Instruction *I = DeadInsts.pop_back_val();
     DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
@@ -3576,8 +4315,11 @@ void SROA::deleteDeadInstructions(SmallPtrSetImpl<AllocaInst*> &DeletedAllocas)
           DeadInsts.insert(U);
       }
 
-    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
       DeletedAllocas.insert(AI);
+      if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI))
+        DbgDecl->eraseFromParent();
+    }
 
     ++NumDeleted;
     I->eraseFromParent();
@@ -3608,14 +4350,14 @@ bool SROA::promoteAllocas(Function &F) {
 
   if (DT && !ForceSSAUpdater) {
     DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
-    PromoteMemToReg(PromotableAllocas, *DT, nullptr, AT);
+    PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
     PromotableAllocas.clear();
     return true;
   }
 
   DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
   SSAUpdater SSA;
-  DIBuilder DIB(*F.getParent());
+  DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
   SmallVector<Instruction *, 64> Insts;
 
   // We need a worklist to walk the uses of each alloca.
@@ -3690,13 +4432,14 @@ bool SROA::runOnFunction(Function &F) {
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  AT = &getAnalysis<AssumptionTracker>();
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
 
   BasicBlock &EntryBB = F.getEntryBlock();
   for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
-       I != E; ++I)
+       I != E; ++I) {
     if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
       Worklist.insert(AI);
+  }
 
   bool Changed = false;
   // A set of deleted alloca instruction pointers which should be removed from
@@ -3711,9 +4454,7 @@ bool SROA::runOnFunction(Function &F) {
       // Remove the deleted allocas from various lists so that we don't try to
       // continue processing them.
       if (!DeletedAllocas.empty()) {
-        auto IsInSet = [&](AllocaInst *AI) {
-          return DeletedAllocas.count(AI);
-        };
+        auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
         Worklist.remove_if(IsInSet);
         PostPromotionWorklist.remove_if(IsInSet);
         PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
@@ -3734,7 +4475,7 @@ bool SROA::runOnFunction(Function &F) {
 }
 
 void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<AssumptionTracker>();
+  AU.addRequired<AssumptionCacheTracker>();
   if (RequiresDomTree)
     AU.addRequired<DominatorTreeWrapperPass>();
   AU.setPreservesCFG();
diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp
index 179bbf7..c7232a9 100644
--- a/lib/Transforms/Scalar/SampleProfile.cpp
+++ b/lib/Transforms/Scalar/SampleProfile.cpp
@@ -95,7 +95,7 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<LoopInfo>();
+    AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<PostDominatorTree>();
   }
@@ -731,7 +731,7 @@ INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile",
                       "Sample Profile loader", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AddDiscriminators)
 INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile",
                     "Sample Profile loader", false, false)
@@ -762,7 +762,7 @@ bool SampleProfileLoader::runOnFunction(Function &F) {
 
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   PDT = &getAnalysis<PostDominatorTree>();
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   Ctx = &F.getParent()->getContext();
   Samples = Reader->getSamplesFor(F);
   if (!Samples->empty())
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index a16e9e2..621633b 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -20,7 +20,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 
 using namespace llvm;
 
@@ -28,6 +28,7 @@ using namespace llvm;
 /// ScalarOpts library.
 void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCEPass(Registry);
+  initializeBDCEPass(Registry);
   initializeAlignmentFromAssumptionsPass(Registry);
   initializeSampleProfileLoaderPass(Registry);
   initializeConstantHoistingPass(Registry);
@@ -38,12 +39,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeScalarizerPass(Registry);
   initializeDSEPass(Registry);
   initializeGVNPass(Registry);
-  initializeEarlyCSEPass(Registry);
+  initializeEarlyCSELegacyPassPass(Registry);
   initializeFlattenCFGPassPass(Registry);
+  initializeInductiveRangeCheckEliminationPass(Registry);
   initializeIndVarSimplifyPass(Registry);
   initializeJumpThreadingPass(Registry);
   initializeLICMPass(Registry);
   initializeLoopDeletionPass(Registry);
+  initializeLoopAccessAnalysisPass(Registry);
   initializeLoopInstSimplifyPass(Registry);
   initializeLoopRotatePass(Registry);
   initializeLoopStrengthReducePass(Registry);
@@ -58,6 +61,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializePartiallyInlineLibCallsPass(Registry);
   initializeReassociatePass(Registry);
   initializeRegToMemPass(Registry);
+  initializeRewriteStatepointsForGCPass(Registry);
   initializeSCCPPass(Registry);
   initializeIPSCCPPass(Registry);
   initializeSROAPass(Registry);
@@ -68,7 +72,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeSinkingPass(Registry);
   initializeTailCallElimPass(Registry);
   initializeSeparateConstOffsetFromGEPPass(Registry);
+  initializeStraightLineStrengthReducePass(Registry);
   initializeLoadCombinePass(Registry);
+  initializePlaceBackedgeSafepointsImplPass(Registry);
+  initializePlaceSafepointsPass(Registry);
 }
 
 void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
@@ -79,6 +86,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createAggressiveDCEPass());
 }
 
+void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createBitTrackingDCEPass());
+}
+
 void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createAlignmentFromAssumptionsPass());
 }
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index f7fa917..5c49a55 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -23,7 +23,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CallSite.h"
@@ -198,7 +198,7 @@ namespace {
     // getAnalysisUsage - This pass does not require any passes, but we know it
     // will not alter the CFG, so say so.
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<AssumptionTracker>();
+      AU.addRequired<AssumptionCacheTracker>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.setPreservesCFG();
     }
@@ -216,7 +216,7 @@ namespace {
     // getAnalysisUsage - This pass does not require any passes, but we know it
     // will not alter the CFG, so say so.
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<AssumptionTracker>();
+      AU.addRequired<AssumptionCacheTracker>();
       AU.setPreservesCFG();
     }
   };
@@ -228,14 +228,14 @@ char SROA_SSAUp::ID = 0;
 
 INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
                 "Scalar Replacement of Aggregates (DT)", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
                 "Scalar Replacement of Aggregates (DT)", false, false)
 
 INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa",
                       "Scalar Replacement of Aggregates (SSAUp)", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
                     "Scalar Replacement of Aggregates (SSAUp)", false, false)
 
@@ -1068,12 +1068,14 @@ public:
   void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
     // Remember which alloca we're promoting (for isInstInList).
     this->AI = AI;
-    if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) {
-      for (User *U : DebugNode->users())
-        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
-          DDIs.push_back(DDI);
-        else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
-          DVIs.push_back(DVI);
+    if (auto *L = LocalAsMetadata::getIfExists(AI)) {
+      if (auto *DebugNode = MetadataAsValue::getIfExists(AI->getContext(), L)) {
+        for (User *U : DebugNode->users())
+          if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+            DDIs.push_back(DDI);
+          else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+            DVIs.push_back(DVI);
+      }
     }
 
     LoadAndStorePromoter::run(Insts);
@@ -1417,10 +1419,11 @@ bool SROA::performPromotion(Function &F) {
   DominatorTree *DT = nullptr;
   if (HasDomTree)
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+  AssumptionCache &AC =
+      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
 
   BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function
-  DIBuilder DIB(*F.getParent());
+  DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
   bool Changed = false;
   SmallVector<Instruction*, 64> Insts;
   while (1) {
@@ -1436,7 +1439,7 @@ bool SROA::performPromotion(Function &F) {
     if (Allocas.empty()) break;
 
     if (HasDomTree)
-      PromoteMemToReg(Allocas, *DT, nullptr, AT);
+      PromoteMemToReg(Allocas, *DT, nullptr, &AC);
     else {
       SSAUpdater SSA;
       for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 6157746..bffe8df 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -313,7 +313,8 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DataLayoutPass>();
-    AU.addRequired<TargetTransformInfo>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.setPreservesCFG();
   }
 
   bool doInitialization(Module &M) override {
@@ -384,7 +385,7 @@ INITIALIZE_PASS_BEGIN(
     SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
     "Split GEPs to a variadic base and a constant offset for better CSE", false,
     false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DataLayoutPass)
 INITIALIZE_PASS_END(
     SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
@@ -857,7 +858,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   // of variable indices. Therefore, we don't check for addressing modes in that
   // case.
   if (!LowerGEP) {
-    TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
+    TargetTransformInfo &TTI =
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+            *GEP->getParent()->getParent());
     if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),
                                    /*BaseGV=*/nullptr, AccumulativeByteOffset,
                                    /*HasBaseReg=*/true, /*Scale=*/0)) {
@@ -910,7 +913,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   if (LowerGEP) {
     // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
     // arithmetic operations if the target uses alias analysis in codegen.
-    if (TM && TM->getSubtarget<TargetSubtargetInfo>().useAA())
+    if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA())
       lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
     else
       lowerToArithmetics(GEP, AccumulativeByteOffset);
@@ -996,6 +999,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
 }
 
 bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   if (DisableSeparateConstOffsetFromGEP)
     return false;
 
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 046a7cb..fb8fe38 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -21,11 +21,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CFG.h"
@@ -37,6 +37,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Scalar.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "simplifycfg"
@@ -47,36 +48,6 @@ UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1),
 
 STATISTIC(NumSimpl, "Number of blocks simplified");
 
-namespace {
-struct CFGSimplifyPass : public FunctionPass {
-  static char ID; // Pass identification, replacement for typeid
-  unsigned BonusInstThreshold;
-  CFGSimplifyPass(int T = -1) : FunctionPass(ID) {
-    BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
-    initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
-  }
-  bool runOnFunction(Function &F) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<AssumptionTracker>();
-    AU.addRequired<TargetTransformInfo>();
-  }
-};
-}
-
-char CFGSimplifyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
-                      false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
-                    false)
-
-// Public interface to the CFGSimplification pass
-FunctionPass *llvm::createCFGSimplificationPass(int Threshold) {
-  return new CFGSimplifyPass(Threshold);
-}
-
 /// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
 /// node) return blocks, merge them together to promote recursive block merging.
 static bool mergeEmptyReturnBlocks(Function &F) {
@@ -156,8 +127,7 @@ static bool mergeEmptyReturnBlocks(Function &F) {
 /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
 /// iterating until no more changes are made.
 static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
-                                   const DataLayout *DL,
-                                   AssumptionTracker *AT,
+                                   const DataLayout *DL, AssumptionCache *AC,
                                    unsigned BonusInstThreshold) {
   bool Changed = false;
   bool LocalChange = true;
@@ -167,7 +137,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
     // Loop over all of the basic blocks and remove them if they are unneeded...
     //
     for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
-      if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AT)) {
+      if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AC)) {
         LocalChange = true;
         ++NumSimpl;
       }
@@ -177,20 +147,12 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
   return Changed;
 }
 
-// It is possible that we may require multiple passes over the code to fully
-// simplify the CFG.
-//
-bool CFGSimplifyPass::runOnFunction(Function &F) {
-  if (skipOptnoneFunction(F))
-    return false;
-
-  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
-  const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
-  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
+static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
+                                const DataLayout *DL, AssumptionCache *AC,
+                                int BonusInstThreshold) {
   bool EverChanged = removeUnreachableBlocks(F);
   EverChanged |= mergeEmptyReturnBlocks(F);
-  EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AT, BonusInstThreshold);
+  EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold);
 
   // If neither pass changed anything, we're done.
   if (!EverChanged) return false;
@@ -204,9 +166,69 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
     return true;
 
   do {
-    EverChanged = iterativelySimplifyCFG(F, TTI, DL, AT, BonusInstThreshold);
+    EverChanged = iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold);
     EverChanged |= removeUnreachableBlocks(F);
   } while (EverChanged);
 
   return true;
 }
+
+SimplifyCFGPass::SimplifyCFGPass()
+    : BonusInstThreshold(UserBonusInstThreshold) {}
+
+SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
+    : BonusInstThreshold(BonusInstThreshold) {}
+
+PreservedAnalyses SimplifyCFGPass::run(Function &F,
+                                       AnalysisManager<Function> *AM) {
+  auto *DL = F.getParent()->getDataLayout();
+  auto &TTI = AM->getResult<TargetIRAnalysis>(F);
+  auto &AC = AM->getResult<AssumptionAnalysis>(F);
+
+  if (!simplifyFunctionCFG(F, TTI, DL, &AC, BonusInstThreshold))
+    return PreservedAnalyses::none();
+
+  return PreservedAnalyses::all();
+}
+
+namespace {
+struct CFGSimplifyPass : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  unsigned BonusInstThreshold;
+  CFGSimplifyPass(int T = -1) : FunctionPass(ID) {
+    BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
+    initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override {
+    if (skipOptnoneFunction(F))
+      return false;
+
+    AssumptionCache *AC =
+        &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    const TargetTransformInfo &TTI =
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+    const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
+    return simplifyFunctionCFG(F, TTI, DL, AC, BonusInstThreshold);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+};
+}
+
+char CFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+                    false)
+
+// Public interface to the CFGSimplification pass
+FunctionPass *llvm::createCFGSimplificationPass(int Threshold) {
+  return new CFGSimplifyPass(Threshold);
+}
+
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 903b675..d0ee0a6 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -50,9 +50,9 @@ namespace {
       FunctionPass::getAnalysisUsage(AU);
       AU.addRequired<AliasAnalysis>();
       AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
       AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addPreserved<LoopInfo>();
+      AU.addPreserved<LoopInfoWrapperPass>();
     }
   private:
     bool ProcessBlock(BasicBlock &BB);
@@ -64,7 +64,7 @@ namespace {
 
 char Sinking::ID = 0;
 INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false)
@@ -98,7 +98,7 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
 
 bool Sinking::runOnFunction(Function &F) {
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   AA = &getAnalysis<AliasAnalysis>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
new file mode 100644
index 0000000..4edc86c
--- /dev/null
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -0,0 +1,274 @@
+//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled loop.
+// It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidate in the form of
+//
+// (B + i) * S
+//
+// where B and S are integer constants or variables, and i is a constant
+// integer. If we found two such candidates
+//
+// S1: X = (B + i) * S S2: Y = (B + i') * S
+//
+// and S1 dominates S2, we call S1 a basis of S2, and can replace S2 with
+//
+// Y = X + (i' - i) * S
+//
+// where (i' - i) * S is folded to the extent possible. When S2 has multiple
+// bases, we pick the one that is closest to S2, or S2's "immediate" basis.
+//
+// TODO:
+//
+// - Handle candidates in the form of B + i * S
+//
+// - Handle candidates in the form of pointer arithmetics. e.g., B[i * S]
+//
+// - Floating point arithmetics when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+//   sensitive to ILP may want to disable it. Having SLSR to consider ILP is
+//   left as future work.
+#include <vector>
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+namespace {
+
+class StraightLineStrengthReduce : public FunctionPass {
+ public:
+  // SLSR candidate. Such a candidate must be in the form of
+  //   (Base + Index) * Stride
+  struct Candidate : public ilist_node<Candidate> {
+    Candidate(Value *B = nullptr, ConstantInt *Idx = nullptr,
+              Value *S = nullptr, Instruction *I = nullptr)
+        : Base(B), Index(Idx), Stride(S), Ins(I), Basis(nullptr) {}
+    Value *Base;
+    ConstantInt *Index;
+    Value *Stride;
+    // The instruction this candidate corresponds to. It helps us to rewrite a
+    // candidate with respect to its immediate basis. Note that one instruction
+    // can corresponds to multiple candidates depending on how you associate the
+    // expression. For instance,
+    //
+    // (a + 1) * (b + 2)
+    //
+    // can be treated as
+    //
+    // <Base: a, Index: 1, Stride: b + 2>
+    //
+    // or
+    //
+    // <Base: b, Index: 2, Stride: a + 1>
+    Instruction *Ins;
+    // Points to the immediate basis of this candidate, or nullptr if we cannot
+    // find any basis for this candidate.
+    Candidate *Basis;
+  };
+
+  static char ID;
+
+  StraightLineStrengthReduce() : FunctionPass(ID), DT(nullptr) {
+    initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    // We do not modify the shape of the CFG.
+    AU.setPreservesCFG();
+  }
+
+  bool runOnFunction(Function &F) override;
+
+ private:
+  // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+  // share the same base and stride.
+  bool isBasisFor(const Candidate &Basis, const Candidate &C);
+  // Checks whether I is in a candidate form. If so, adds all the matching forms
+  // to Candidates, and tries to find the immediate basis for each of them.
+  void allocateCandidateAndFindBasis(Instruction *I);
+  // Given that I is in the form of "(B + Idx) * S", adds this form to
+  // Candidates, and finds its immediate basis.
+  void allocateCandidateAndFindBasis(Value *B, ConstantInt *Idx, Value *S,
+                                     Instruction *I);
+  // Rewrites candidate C with respect to Basis.
+  void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
+  DominatorTree *DT;
+  ilist<Candidate> Candidates;
+  // Temporarily holds all instructions that are unlinked (but not deleted) by
+  // rewriteCandidateWithBasis. These instructions will be actually removed
+  // after all rewriting finishes.
+  DenseSet<Instruction *> UnlinkedInstructions;
+};
+}  // anonymous namespace
+
+char StraightLineStrengthReduce::ID = 0;
+INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
+                      "Straight line strength reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
+                    "Straight line strength reduction", false, false)
+
+FunctionPass *llvm::createStraightLineStrengthReducePass() {
+  return new StraightLineStrengthReduce();
+}
+
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+                                            const Candidate &C) {
+  return (Basis.Ins != C.Ins && // skip the same instruction
+          // Basis must dominate C in order to rewrite C with respect to Basis.
+          DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
+          // They share the same base and stride.
+          Basis.Base == C.Base &&
+          Basis.Stride == C.Stride);
+}
+
+// TODO: We currently implement an algorithm whose time complexity is linear to
+// the number of existing candidates. However, a better algorithm exists. We
+// could depth-first search the dominator tree, and maintain a hash table that
+// contains all candidates that dominate the node being traversed.  This hash
+// table is indexed by the base and the stride of a candidate.  Therefore,
+// finding the immediate basis of a candidate boils down to one hash-table look
+// up.
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Value *B,
+                                                               ConstantInt *Idx,
+                                                               Value *S,
+                                                               Instruction *I) {
+  Candidate C(B, Idx, S, I);
+  // Try to compute the immediate basis of C.
+  unsigned NumIterations = 0;
+  // Limit the scan radius to avoid running forever.
+  static const unsigned MaxNumIterations = 50;
+  for (auto Basis = Candidates.rbegin();
+       Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+       ++Basis, ++NumIterations) {
+    if (isBasisFor(*Basis, C)) {
+      C.Basis = &(*Basis);
+      break;
+    }
+  }
+  // Regardless of whether we find a basis for C, we need to push C to the
+  // candidate list.
+  Candidates.push_back(C);
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Instruction *I) {
+  Value *B = nullptr;
+  ConstantInt *Idx = nullptr;
+  // "(Base + Index) * Stride" must be a Mul instruction at the first hand.
+  if (I->getOpcode() == Instruction::Mul) {
+    if (IntegerType *ITy = dyn_cast<IntegerType>(I->getType())) {
+      Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+      for (unsigned Swapped = 0; Swapped < 2; ++Swapped) {
+        // Only handle the canonical operand ordering.
+        if (match(LHS, m_Add(m_Value(B), m_ConstantInt(Idx)))) {
+          // If LHS is in the form of "Base + Index", then I is in the form of
+          // "(Base + Index) * RHS".
+          allocateCandidateAndFindBasis(B, Idx, RHS, I);
+        } else {
+          // Otherwise, at least try the form (LHS + 0) * RHS.
+          allocateCandidateAndFindBasis(LHS, ConstantInt::get(ITy, 0), RHS, I);
+        }
+        // Swap LHS and RHS so that we also cover the cases where LHS is the
+        // stride.
+        if (LHS == RHS)
+          break;
+        std::swap(LHS, RHS);
+      }
+    }
+  }
+}
+
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+    const Candidate &C, const Candidate &Basis) {
+  // An instruction can correspond to multiple candidates. Therefore, instead of
+  // simply deleting an instruction when we rewrite it, we mark its parent as
+  // nullptr (i.e. unlink it) so that we can skip the candidates whose
+  // instruction is already rewritten.
+  if (!C.Ins->getParent())
+    return;
+  assert(C.Base == Basis.Base && C.Stride == Basis.Stride);
+  // Basis = (B + i) * S
+  // C     = (B + i') * S
+  //   ==>
+  // C     = Basis + (i' - i) * S
+  IRBuilder<> Builder(C.Ins);
+  ConstantInt *IndexOffset = ConstantInt::get(
+      C.Ins->getContext(), C.Index->getValue() - Basis.Index->getValue());
+  Value *Reduced;
+  // TODO: preserve nsw/nuw in some cases.
+  if (IndexOffset->isOne()) {
+    // If (i' - i) is 1, fold C into Basis + S.
+    Reduced = Builder.CreateAdd(Basis.Ins, C.Stride);
+  } else if (IndexOffset->isMinusOne()) {
+    // If (i' - i) is -1, fold C into Basis - S.
+    Reduced = Builder.CreateSub(Basis.Ins, C.Stride);
+  } else {
+    Value *Bump = Builder.CreateMul(C.Stride, IndexOffset);
+    Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+  }
+  Reduced->takeName(C.Ins);
+  C.Ins->replaceAllUsesWith(Reduced);
+  C.Ins->dropAllReferences();
+  // Unlink C.Ins so that we can skip other candidates also corresponding to
+  // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+  C.Ins->removeFromParent();
+  UnlinkedInstructions.insert(C.Ins);
+}
+
+bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  // Traverse the dominator tree in the depth-first order. This order makes sure
+  // all bases of a candidate are in Candidates when we process it.
+  for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT);
+       node != GraphTraits<DominatorTree *>::nodes_end(DT); ++node) {
+    BasicBlock *B = node->getBlock();
+    for (auto I = B->begin(); I != B->end(); ++I) {
+      allocateCandidateAndFindBasis(I);
+    }
+  }
+
+  // Rewrite candidates in the reverse depth-first order. This order makes sure
+  // a candidate being rewritten is not a basis for any other candidate.
+  while (!Candidates.empty()) {
+    const Candidate &C = Candidates.back();
+    if (C.Basis != nullptr) {
+      rewriteCandidateWithBasis(C, *C.Basis);
+    }
+    Candidates.pop_back();
+  }
+
+  // Delete all unlink instructions.
+  for (auto I : UnlinkedInstructions) {
+    delete I;
+  }
+  bool Ret = !UnlinkedInstructions.empty();
+  UnlinkedInstructions.clear();
+  return Ret;
+}
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index b9673ed..aaf6f9a 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -10,11 +10,14 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/RegionPass.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 
 using namespace llvm;
@@ -166,6 +169,7 @@ class StructurizeCFG : public RegionPass {
   Region *ParentRegion;
 
   DominatorTree *DT;
+  LoopInfo *LI;
 
   RNVector Order;
   BBSet Visited;
@@ -247,6 +251,7 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequiredID(LowerSwitchID);
     AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     RegionPass::getAnalysisUsage(AU);
   }
@@ -278,11 +283,65 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
 
 /// \brief Build up the general order of nodes
 void StructurizeCFG::orderNodes() {
-  scc_iterator<Region *> I = scc_begin(ParentRegion);
-  for (Order.clear(); !I.isAtEnd(); ++I) {
-    const std::vector<RegionNode *> &Nodes = *I;
-    Order.append(Nodes.begin(), Nodes.end());
+  RNVector TempOrder;
+  ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
+  TempOrder.append(RPOT.begin(), RPOT.end());
+
+  std::map<Loop*, unsigned> LoopBlocks;
+
+
+  // The reverse post-order traversal of the list gives us an ordering close
+  // to what we want.  The only problem with it is that sometimes backedges
+  // for outer loops will be visited before backedges for inner loops.
+  for (RegionNode *RN : TempOrder) {
+    BasicBlock *BB = RN->getEntry();
+    Loop *Loop = LI->getLoopFor(BB);
+    if (!LoopBlocks.count(Loop)) {
+      LoopBlocks[Loop] = 1;
+      continue;
+    }
+    LoopBlocks[Loop]++;
   }
+
+  unsigned CurrentLoopDepth = 0;
+  Loop *CurrentLoop = nullptr;
+  BBSet TempVisited;
+  for (RNVector::iterator I = TempOrder.begin(), E = TempOrder.end(); I != E; ++I) {
+    BasicBlock *BB = (*I)->getEntry();
+    unsigned LoopDepth = LI->getLoopDepth(BB);
+
+    if (std::find(Order.begin(), Order.end(), *I) != Order.end())
+      continue;
+
+    if (LoopDepth < CurrentLoopDepth) {
+      // Make sure we have visited all blocks in this loop before moving back to
+      // the outer loop.
+
+      RNVector::iterator LoopI = I;
+      while(LoopBlocks[CurrentLoop]) {
+        LoopI++;
+        BasicBlock *LoopBB = (*LoopI)->getEntry();
+        if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+          LoopBlocks[CurrentLoop]--;
+          Order.push_back(*LoopI);
+        }
+      }
+    }
+
+    CurrentLoop = LI->getLoopFor(BB);
+    if (CurrentLoop) {
+      LoopBlocks[CurrentLoop]--;
+    }
+
+    CurrentLoopDepth = LoopDepth;
+    Order.push_back(*I);
+  }
+
+  // This pass originally used a post-order traversal and then operated on
+  // the list in reverse. Now that we are using a reverse post-order traversal
+  // rather than re-working the whole pass to operate on the list in order,
+  // we just reverse the list and continue to operate on it in reverse.
+  std::reverse(Order.begin(), Order.end());
 }
 
 /// \brief Determine the end of the loops
@@ -301,8 +360,9 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
     for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
       BasicBlock *Succ = Term->getSuccessor(i);
 
-      if (Visited.count(Succ))
+      if (Visited.count(Succ)) {
         Loops[Succ] = BB;
+      }
     }
   }
 }
@@ -437,6 +497,10 @@ void StructurizeCFG::collectInfos() {
   for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend();
        OI != OE; ++OI) {
 
+    DEBUG(dbgs() << "Visiting: " <<
+                    ((*OI)->isSubRegion() ? "SubRegion with entry: " : "") <<
+                    (*OI)->getEntry()->getName() << " Loop Depth: " << LI->getLoopDepth((*OI)->getEntry()) << "\n");
+
     // Analyze all the conditions leading to a node
     gatherPredicates(*OI);
 
@@ -862,6 +926,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
   ParentRegion = R;
 
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 
   orderNodes();
   collectInfos();
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index f3c3e30..715ddeb 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -126,7 +126,7 @@ namespace {
 char TailCallElim::ID = 0;
 INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim",
                       "Tail Call Elimination", false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(TailCallElim, "tailcallelim",
                     "Tail Call Elimination", false, false)
 
@@ -136,7 +136,7 @@ FunctionPass *llvm::createTailCallEliminationPass() {
 }
 
 void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<TargetTransformInfo>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
 }
 
 /// \brief Scan the specified function for alloca instructions.
@@ -386,7 +386,7 @@ bool TailCallElim::runTRE(Function &F) {
   // right, so don't even try to convert it...
   if (F.getFunctionType()->isVarArg()) return false;
 
-  TTI = &getAnalysis<TargetTransformInfo>();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   BasicBlock *OldEntry = nullptr;
   bool TailCallsAreMarkedTail = false;
   SmallVector<PHINode*, 8> ArgumentPHIs;
diff --git a/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/lib/Transforms/Utils/ASanStackFrameLayout.cpp
index cce016a..03c3a80 100644
--- a/lib/Transforms/Utils/ASanStackFrameLayout.cpp
+++ b/lib/Transforms/Utils/ASanStackFrameLayout.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/MathExtras.h"
 #include <algorithm>
 
 namespace llvm {
@@ -33,11 +34,6 @@ static inline bool CompareVars(const ASanStackVariableDescription &a,
 // with e.g. alignment 1 and alignment 16 do not get reordered by CompareVars.
 static const size_t kMinAlignment = 16;
 
-static size_t RoundUpTo(size_t X, size_t RoundTo) {
-  assert((RoundTo & (RoundTo - 1)) == 0);
-  return (X + RoundTo - 1) & ~(RoundTo - 1);
-}
-
 // The larger the variable Size the larger is the redzone.
 // The resulting frame size is a multiple of Alignment.
 static size_t VarAndRedzoneSize(size_t Size, size_t Alignment) {
@@ -48,7 +44,7 @@ static size_t VarAndRedzoneSize(size_t Size, size_t Alignment) {
   else if (Size <= 512) Res = Size + 64;
   else if (Size <= 4096) Res = Size + 128;
   else                   Res = Size + 256;
-  return RoundUpTo(Res, Alignment);
+  return RoundUpToAlignment(Res, Alignment);
 }
 
 void
diff --git a/lib/Transforms/Utils/AddDiscriminators.cpp b/lib/Transforms/Utils/AddDiscriminators.cpp
index f8e5af5..820544b 100644
--- a/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -167,7 +167,7 @@ bool AddDiscriminators::runOnFunction(Function &F) {
   bool Changed = false;
   Module *M = F.getParent();
   LLVMContext &Ctx = M->getContext();
-  DIBuilder Builder(*M);
+  DIBuilder Builder(*M, /*AllowUnresolved*/ false);
 
   // Traverse all the blocks looking for instructions in different
   // blocks that are at the same file:line location.
diff --git a/lib/Transforms/Utils/Android.mk b/lib/Transforms/Utils/Android.mk
index e20dc0a..4d24928 100644
--- a/lib/Transforms/Utils/Android.mk
+++ b/lib/Transforms/Utils/Android.mk
@@ -22,7 +22,6 @@ transforms_utils_SRC_FILES := \
   LoopSimplify.cpp \
   LoopUnroll.cpp \
   LoopUnrollRuntime.cpp \
-  LowerExpectIntrinsic.cpp \
   LowerInvoke.cpp \
   LowerSwitch.cpp \
   Mem2Reg.cpp \
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 983f025..b455257 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -65,16 +65,10 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {
 /// any single-entry PHI nodes in it, fold them away.  This handles the case
 /// when all entries to the PHI nodes in a block are guaranteed equal, such as
 /// when the block has exactly one predecessor.
-void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) {
+void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA,
+                                   MemoryDependenceAnalysis *MemDep) {
   if (!isa<PHINode>(BB->begin())) return;
 
-  AliasAnalysis *AA = nullptr;
-  MemoryDependenceAnalysis *MemDep = nullptr;
-  if (P) {
-    AA = P->getAnalysisIfAvailable<AliasAnalysis>();
-    MemDep = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>();
-  }
-
   while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
     if (PN->getIncomingValue(0) != PN)
       PN->replaceAllUsesWith(PN->getIncomingValue(0));
@@ -113,7 +107,9 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
 
 /// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor,
 /// if possible.  The return value indicates success or failure.
-bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
+bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
+                                     LoopInfo *LI, AliasAnalysis *AA,
+                                     MemoryDependenceAnalysis *MemDep) {
   // Don't merge away blocks who have their address taken.
   if (BB->hasAddressTaken()) return false;
 
@@ -149,7 +145,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
 
   // Begin by getting rid of unneeded PHIs.
   if (isa<PHINode>(BB->front()))
-    FoldSingleEntryPHINodes(BB, P);
+    FoldSingleEntryPHINodes(BB, AA, MemDep);
 
   // Delete the unconditional branch from the predecessor...
   PredBB->getInstList().pop_back();
@@ -166,28 +162,23 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
     PredBB->takeName(BB);
 
   // Finally, erase the old block and update dominator info.
-  if (P) {
-    if (DominatorTreeWrapperPass *DTWP =
-            P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
-      DominatorTree &DT = DTWP->getDomTree();
-      if (DomTreeNode *DTN = DT.getNode(BB)) {
-        DomTreeNode *PredDTN = DT.getNode(PredBB);
-        SmallVector<DomTreeNode*, 8> Children(DTN->begin(), DTN->end());
-        for (SmallVectorImpl<DomTreeNode *>::iterator DI = Children.begin(),
-             DE = Children.end(); DI != DE; ++DI)
-          DT.changeImmediateDominator(*DI, PredDTN);
-
-        DT.eraseNode(BB);
-      }
+  if (DT)
+    if (DomTreeNode *DTN = DT->getNode(BB)) {
+      DomTreeNode *PredDTN = DT->getNode(PredBB);
+      SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end());
+      for (SmallVectorImpl<DomTreeNode *>::iterator DI = Children.begin(),
+                                                    DE = Children.end();
+           DI != DE; ++DI)
+        DT->changeImmediateDominator(*DI, PredDTN);
+
+      DT->eraseNode(BB);
+    }
 
-      if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>())
-        LI->removeBlock(BB);
+  if (LI)
+    LI->removeBlock(BB);
 
-      if (MemoryDependenceAnalysis *MD =
-            P->getAnalysisIfAvailable<MemoryDependenceAnalysis>())
-        MD->invalidateCachedPredecessors();
-    }
-  }
+  if (MemDep)
+    MemDep->invalidateCachedPredecessors();
 
   BB->eraseFromParent();
   return true;
@@ -240,12 +231,14 @@ void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
 
 /// SplitEdge -  Split the edge connecting specified block. Pass P must
 /// not be NULL.
-BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
+BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
+                            LoopInfo *LI) {
   unsigned SuccNum = GetSuccessorNumber(BB, Succ);
 
   // If this is a critical edge, let SplitCriticalEdge do it.
   TerminatorInst *LatchTerm = BB->getTerminator();
-  if (SplitCriticalEdge(LatchTerm, SuccNum, P))
+  if (SplitCriticalEdge(LatchTerm, SuccNum, CriticalEdgeSplittingOptions(DT, LI)
+                                                .setPreserveLCSSA()))
     return LatchTerm->getSuccessor(SuccNum);
 
   // If the edge isn't critical, then BB has a single successor or Succ has a
@@ -255,23 +248,25 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
     // block.
     assert(SP == BB && "CFG broken");
     SP = nullptr;
-    return SplitBlock(Succ, Succ->begin(), P);
+    return SplitBlock(Succ, Succ->begin(), DT, LI);
   }
 
   // Otherwise, if BB has a single successor, split it at the bottom of the
   // block.
   assert(BB->getTerminator()->getNumSuccessors() == 1 &&
          "Should have a single succ!");
-  return SplitBlock(BB, BB->getTerminator(), P);
+  return SplitBlock(BB, BB->getTerminator(), DT, LI);
 }
 
-unsigned llvm::SplitAllCriticalEdges(Function &F, Pass *P) {
+unsigned
+llvm::SplitAllCriticalEdges(Function &F,
+                            const CriticalEdgeSplittingOptions &Options) {
   unsigned NumBroken = 0;
   for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
     TerminatorInst *TI = I->getTerminator();
     if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI))
       for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
-        if (SplitCriticalEdge(TI, i, P))
+        if (SplitCriticalEdge(TI, i, Options))
           ++NumBroken;
   }
   return NumBroken;
@@ -282,7 +277,8 @@ unsigned llvm::SplitAllCriticalEdges(Function &F, Pass *P) {
 /// to a new block.  The two blocks are joined by an unconditional branch and
 /// the loop info is updated.
 ///
-BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) {
+BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt,
+                             DominatorTree *DT, LoopInfo *LI) {
   BasicBlock::iterator SplitIt = SplitPt;
   while (isa<PHINode>(SplitIt) || isa<LandingPadInst>(SplitIt))
     ++SplitIt;
@@ -290,26 +286,23 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) {
 
   // The new block lives in whichever loop the old one did. This preserves
   // LCSSA as well, because we force the split point to be after any PHI nodes.
-  if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>())
+  if (LI)
     if (Loop *L = LI->getLoopFor(Old))
-      L->addBasicBlockToLoop(New, LI->getBase());
+      L->addBasicBlockToLoop(New, *LI);
 
-  if (DominatorTreeWrapperPass *DTWP =
-          P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
-    DominatorTree &DT = DTWP->getDomTree();
+  if (DT)
     // Old dominates New. New node dominates all other nodes dominated by Old.
-    if (DomTreeNode *OldNode = DT.getNode(Old)) {
+    if (DomTreeNode *OldNode = DT->getNode(Old)) {
       std::vector<DomTreeNode *> Children;
       for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end();
            I != E; ++I)
         Children.push_back(*I);
 
-      DomTreeNode *NewNode = DT.addNewBlock(New, Old);
+      DomTreeNode *NewNode = DT->addNewBlock(New, Old);
       for (std::vector<DomTreeNode *>::iterator I = Children.begin(),
              E = Children.end(); I != E; ++I)
-        DT.changeImmediateDominator(*I, NewNode);
+        DT->changeImmediateDominator(*I, NewNode);
     }
-  }
 
   return New;
 }
@@ -318,45 +311,46 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) {
 /// analysis information.
 static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
                                       ArrayRef<BasicBlock *> Preds,
-                                      Pass *P, bool &HasLoopExit) {
-  if (!P) return;
+                                      DominatorTree *DT, LoopInfo *LI,
+                                      bool PreserveLCSSA, bool &HasLoopExit) {
+  // Update dominator tree if available.
+  if (DT)
+    DT->splitBlock(NewBB);
+
+  // The rest of the logic is only relevant for updating the loop structures.
+  if (!LI)
+    return;
 
-  LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
-  Loop *L = LI ? LI->getLoopFor(OldBB) : nullptr;
+  Loop *L = LI->getLoopFor(OldBB);
 
   // If we need to preserve loop analyses, collect some information about how
   // this split will affect loops.
   bool IsLoopEntry = !!L;
   bool SplitMakesNewLoopHeader = false;
-  if (LI) {
-    bool PreserveLCSSA = P->mustPreserveAnalysisID(LCSSAID);
-    for (ArrayRef<BasicBlock*>::iterator
-           i = Preds.begin(), e = Preds.end(); i != e; ++i) {
-      BasicBlock *Pred = *i;
-
-      // If we need to preserve LCSSA, determine if any of the preds is a loop
-      // exit.
-      if (PreserveLCSSA)
-        if (Loop *PL = LI->getLoopFor(Pred))
-          if (!PL->contains(OldBB))
-            HasLoopExit = true;
-
-      // If we need to preserve LoopInfo, note whether any of the preds crosses
-      // an interesting loop boundary.
-      if (!L) continue;
-      if (L->contains(Pred))
-        IsLoopEntry = false;
-      else
-        SplitMakesNewLoopHeader = true;
-    }
+  for (ArrayRef<BasicBlock *>::iterator i = Preds.begin(), e = Preds.end();
+       i != e; ++i) {
+    BasicBlock *Pred = *i;
+
+    // If we need to preserve LCSSA, determine if any of the preds is a loop
+    // exit.
+    if (PreserveLCSSA)
+      if (Loop *PL = LI->getLoopFor(Pred))
+        if (!PL->contains(OldBB))
+          HasLoopExit = true;
+
+    // If we need to preserve LoopInfo, note whether any of the preds crosses
+    // an interesting loop boundary.
+    if (!L)
+      continue;
+    if (L->contains(Pred))
+      IsLoopEntry = false;
+    else
+      SplitMakesNewLoopHeader = true;
   }
 
-  // Update dominator tree if available.
-  if (DominatorTreeWrapperPass *DTWP =
-          P->getAnalysisIfAvailable<DominatorTreeWrapperPass>())
-    DTWP->getDomTree().splitBlock(NewBB);
-
-  if (!L) return;
+  // Unless we have a loop for OldBB, nothing else to do here.
+  if (!L)
+    return;
 
   if (IsLoopEntry) {
     // Add the new block to the nearest enclosing loop (and not an adjacent
@@ -382,9 +376,9 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
     }
 
     if (InnermostPredLoop)
-      InnermostPredLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+      InnermostPredLoop->addBasicBlockToLoop(NewBB, *LI);
   } else {
-    L->addBasicBlockToLoop(NewBB, LI->getBase());
+    L->addBasicBlockToLoop(NewBB, *LI);
     if (SplitMakesNewLoopHeader)
       L->moveToHeader(NewBB);
   }
@@ -393,10 +387,9 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
 /// UpdatePHINodes - Update the PHI nodes in OrigBB to include the values coming
 /// from NewBB. This also updates AliasAnalysis, if available.
 static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
-                           ArrayRef<BasicBlock*> Preds, BranchInst *BI,
-                           Pass *P, bool HasLoopExit) {
+                           ArrayRef<BasicBlock *> Preds, BranchInst *BI,
+                           AliasAnalysis *AA, bool HasLoopExit) {
   // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.
-  AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : nullptr;
   SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end());
   for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) {
     PHINode *PN = cast<PHINode>(I++);
@@ -461,11 +454,15 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
   }
 }
 
-/// SplitBlockPredecessors - This method transforms BB by introducing a new
-/// basic block into the function, and moving some of the predecessors of BB to
-/// be predecessors of the new block.  The new predecessors are indicated by the
-/// Preds array, which has NumPreds elements in it.  The new block is given a
-/// suffix of 'Suffix'.
+/// SplitBlockPredecessors - This method introduces at least one new basic block
+/// into the function and moves some of the predecessors of BB to be
+/// predecessors of the new block. The new predecessors are indicated by the
+/// Preds array. The new block is given a suffix of 'Suffix'. Returns new basic
+/// block to which predecessors from Preds are now pointing.
+///
+/// If BB is a landingpad block then additional basicblock might be introduced.
+/// It will have suffix of 'Suffix'+".split_lp".
+/// See SplitLandingPadPredecessors for more details on this case.
 ///
 /// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
 /// LoopInfo, and LCCSA but no other analyses. In particular, it does not
@@ -473,8 +470,21 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
 /// of the edges being split is an exit of a loop with other exits).
 ///
 BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
-                                         ArrayRef<BasicBlock*> Preds,
-                                         const char *Suffix, Pass *P) {
+                                         ArrayRef<BasicBlock *> Preds,
+                                         const char *Suffix, AliasAnalysis *AA,
+                                         DominatorTree *DT, LoopInfo *LI,
+                                         bool PreserveLCSSA) {
+  // For the landingpads we need to act a bit differently.
+  // Delegate this work to the SplitLandingPadPredecessors.
+  if (BB->isLandingPad()) {
+    SmallVector<BasicBlock*, 2> NewBBs;
+    std::string NewName = std::string(Suffix) + ".split-lp";
+
+    SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(),
+                                NewBBs, AA, DT, LI, PreserveLCSSA);
+    return NewBBs[0];
+  }
+
   // Create new basic block, insert right before the original block.
   BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix,
                                          BB->getParent(), BB);
@@ -505,10 +515,11 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
 
   // Update DominatorTree, LoopInfo, and LCCSA analysis information.
   bool HasLoopExit = false;
-  UpdateAnalysisInformation(BB, NewBB, Preds, P, HasLoopExit);
+  UpdateAnalysisInformation(BB, NewBB, Preds, DT, LI, PreserveLCSSA,
+                            HasLoopExit);
 
   // Update the PHI nodes in BB with the values coming from NewBB.
-  UpdatePHINodes(BB, NewBB, Preds, BI, P, HasLoopExit);
+  UpdatePHINodes(BB, NewBB, Preds, BI, AA, HasLoopExit);
   return NewBB;
 }
 
@@ -526,10 +537,11 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
 /// exits).
 ///
 void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
-                                       ArrayRef<BasicBlock*> Preds,
+                                       ArrayRef<BasicBlock *> Preds,
                                        const char *Suffix1, const char *Suffix2,
-                                       Pass *P,
-                                       SmallVectorImpl<BasicBlock*> &NewBBs) {
+                                       SmallVectorImpl<BasicBlock *> &NewBBs,
+                                       AliasAnalysis *AA, DominatorTree *DT,
+                                       LoopInfo *LI, bool PreserveLCSSA) {
   assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!");
 
   // Create a new basic block for OrigBB's predecessors listed in Preds. Insert
@@ -552,12 +564,12 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
     Preds[i]->getTerminator()->replaceUsesOfWith(OrigBB, NewBB1);
   }
 
-  // Update DominatorTree, LoopInfo, and LCCSA analysis information.
   bool HasLoopExit = false;
-  UpdateAnalysisInformation(OrigBB, NewBB1, Preds, P, HasLoopExit);
+  UpdateAnalysisInformation(OrigBB, NewBB1, Preds, DT, LI, PreserveLCSSA,
+                            HasLoopExit);
 
   // Update the PHI nodes in OrigBB with the values coming from NewBB1.
-  UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, P, HasLoopExit);
+  UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, AA, HasLoopExit);
 
   // Move the remaining edges from OrigBB to point to NewBB2.
   SmallVector<BasicBlock*, 8> NewBB2Preds;
@@ -589,10 +601,11 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
 
     // Update DominatorTree, LoopInfo, and LCCSA analysis information.
     HasLoopExit = false;
-    UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, P, HasLoopExit);
+    UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, DT, LI,
+                              PreserveLCSSA, HasLoopExit);
 
     // Update the PHI nodes in OrigBB with the values coming from NewBB2.
-    UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, P, HasLoopExit);
+    UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, AA, HasLoopExit);
   }
 
   LandingPadInst *LPad = OrigBB->getLandingPadInst();
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index eda22cf..7e83c9e 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/CFG.h"
@@ -41,14 +42,19 @@ namespace {
     }
 
     bool runOnFunction(Function &F) override {
-      unsigned N = SplitAllCriticalEdges(F, this);
+      auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+      auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+      auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+      auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+      unsigned N =
+          SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI));
       NumBroken += N;
       return N > 0;
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addPreserved<LoopInfo>();
+      AU.addPreserved<LoopInfoWrapperPass>();
 
       // No loop canonicalization guarantees are broken by this pass.
       AU.addPreservedID(LoopSimplifyID);
@@ -125,10 +131,9 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
 /// to.
 ///
 BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
-                                    Pass *P, bool MergeIdenticalEdges,
-                                    bool DontDeleteUselessPhis,
-                                    bool SplitLandingPads) {
-  if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return nullptr;
+                                    const CriticalEdgeSplittingOptions &Options) {
+  if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges))
+    return nullptr;
 
   assert(!isa<IndirectBrInst>(TI) &&
          "Cannot split critical edge from IndirectBrInst");
@@ -179,29 +184,22 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
   // If there are any other edges from TIBB to DestBB, update those to go
   // through the split block, making those edges non-critical as well (and
   // reducing the number of phi entries in the DestBB if relevant).
-  if (MergeIdenticalEdges) {
+  if (Options.MergeIdenticalEdges) {
     for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) {
       if (TI->getSuccessor(i) != DestBB) continue;
 
       // Remove an entry for TIBB from DestBB phi nodes.
-      DestBB->removePredecessor(TIBB, DontDeleteUselessPhis);
+      DestBB->removePredecessor(TIBB, Options.DontDeleteUselessPHIs);
 
       // We found another edge to DestBB, go to NewBB instead.
       TI->setSuccessor(i, NewBB);
     }
   }
 
-
-
-  // If we don't have a pass object, we can't update anything...
-  if (!P) return NewBB;
-
-  DominatorTreeWrapperPass *DTWP =
-      P->getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
-
   // If we have nothing to update, just return.
+  auto *AA = Options.AA;
+  auto *DT = Options.DT;
+  auto *LI = Options.LI;
   if (!DT && !LI)
     return NewBB;
 
@@ -268,13 +266,13 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
       if (Loop *DestLoop = LI->getLoopFor(DestBB)) {
         if (TIL == DestLoop) {
           // Both in the same loop, the NewBB joins loop.
-          DestLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+          DestLoop->addBasicBlockToLoop(NewBB, *LI);
         } else if (TIL->contains(DestLoop)) {
           // Edge from an outer loop to an inner loop.  Add to the outer loop.
-          TIL->addBasicBlockToLoop(NewBB, LI->getBase());
+          TIL->addBasicBlockToLoop(NewBB, *LI);
         } else if (DestLoop->contains(TIL)) {
           // Edge from an inner loop to an outer loop.  Add to the outer loop.
-          DestLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+          DestLoop->addBasicBlockToLoop(NewBB, *LI);
         } else {
           // Edge from two loops with no containment relation.  Because these
           // are natural loops, we know that the destination block must be the
@@ -283,19 +281,20 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
           assert(DestLoop->getHeader() == DestBB &&
                  "Should not create irreducible loops!");
           if (Loop *P = DestLoop->getParentLoop())
-            P->addBasicBlockToLoop(NewBB, LI->getBase());
+            P->addBasicBlockToLoop(NewBB, *LI);
         }
       }
+
       // If TIBB is in a loop and DestBB is outside of that loop, we may need
       // to update LoopSimplify form and LCSSA form.
-      if (!TIL->contains(DestBB) &&
-          P->mustPreserveAnalysisID(LoopSimplifyID)) {
+      if (!TIL->contains(DestBB)) {
         assert(!TIL->contains(NewBB) &&
                "Split point for loop exit is contained in loop!");
 
         // Update LCSSA form in the newly created exit block.
-        if (P->mustPreserveAnalysisID(LCSSAID))
+        if (Options.PreserveLCSSA) {
           createPHIsForSplitLoopExit(TIBB, NewBB, DestBB);
+        }
 
         // The only that we can break LoopSimplify form by splitting a critical
         // edge is if after the split there exists some edge from TIL to DestBB
@@ -322,20 +321,12 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
         if (!LoopPreds.empty()) {
           assert(!DestBB->isLandingPad() &&
                  "We don't split edges to landing pads!");
-          BasicBlock *NewExitBB =
-              SplitBlockPredecessors(DestBB, LoopPreds, "split", P);
-          if (P->mustPreserveAnalysisID(LCSSAID))
+          BasicBlock *NewExitBB = SplitBlockPredecessors(
+              DestBB, LoopPreds, "split", AA, DT, LI, Options.PreserveLCSSA);
+          if (Options.PreserveLCSSA)
             createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB);
         }
       }
-      // LCSSA form was updated above for the case where LoopSimplify is
-      // available, which means that all predecessors of loop exit blocks
-      // are within the loop. Without LoopSimplify form, it would be
-      // necessary to insert a new phi.
-      assert((!P->mustPreserveAnalysisID(LCSSAID) ||
-              P->mustPreserveAnalysisID(LoopSimplifyID)) &&
-             "SplitCriticalEdge doesn't know how to update LCCSA form "
-             "without LoopSimplify!");
     }
   }
 
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 112d26c..762a83f 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -21,7 +21,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 
 using namespace llvm;
 
@@ -486,135 +486,3 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
     CI->setCallingConv(Fn->getCallingConv());
   return CI;
 }
-
-SimplifyFortifiedLibCalls::~SimplifyFortifiedLibCalls() { }
-
-bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const DataLayout *TD,
-                                     const TargetLibraryInfo *TLI) {
-  // We really need DataLayout for later.
-  if (!TD) return false;
-  
-  this->CI = CI;
-  Function *Callee = CI->getCalledFunction();
-  StringRef Name = Callee->getName();
-  FunctionType *FT = Callee->getFunctionType();
-  LLVMContext &Context = CI->getParent()->getContext();
-  IRBuilder<> B(CI);
-
-  if (Name == "__memcpy_chk") {
-    // Check if this has the right signature.
-    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        FT->getParamType(2) != TD->getIntPtrType(Context) ||
-        FT->getParamType(3) != TD->getIntPtrType(Context))
-      return false;
-
-    if (isFoldable(3, 2, false)) {
-      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
-                     CI->getArgOperand(2), 1);
-      replaceCall(CI->getArgOperand(0));
-      return true;
-    }
-    return false;
-  }
-
-  // Should be similar to memcpy.
-  if (Name == "__mempcpy_chk") {
-    return false;
-  }
-
-  if (Name == "__memmove_chk") {
-    // Check if this has the right signature.
-    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        FT->getParamType(2) != TD->getIntPtrType(Context) ||
-        FT->getParamType(3) != TD->getIntPtrType(Context))
-      return false;
-
-    if (isFoldable(3, 2, false)) {
-      B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
-                      CI->getArgOperand(2), 1);
-      replaceCall(CI->getArgOperand(0));
-      return true;
-    }
-    return false;
-  }
-
-  if (Name == "__memset_chk") {
-    // Check if this has the right signature.
-    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isIntegerTy() ||
-        FT->getParamType(2) != TD->getIntPtrType(Context) ||
-        FT->getParamType(3) != TD->getIntPtrType(Context))
-      return false;
-
-    if (isFoldable(3, 2, false)) {
-      Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(),
-                                   false);
-      B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
-      replaceCall(CI->getArgOperand(0));
-      return true;
-    }
-    return false;
-  }
-
-  if (Name == "__strcpy_chk" || Name == "__stpcpy_chk") {
-    // Check if this has the right signature.
-    if (FT->getNumParams() != 3 ||
-        FT->getReturnType() != FT->getParamType(0) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
-        FT->getParamType(2) != TD->getIntPtrType(Context))
-      return 0;
-    
-    
-    // If a) we don't have any length information, or b) we know this will
-    // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our
-    // st[rp]cpy_chk call which may fail at runtime if the size is too long.
-    // TODO: It might be nice to get a maximum length out of the possible
-    // string lengths for varying.
-    if (isFoldable(2, 1, true)) {
-      Value *Ret = EmitStrCpy(CI->getArgOperand(0), CI->getArgOperand(1), B, TD,
-                              TLI, Name.substr(2, 6));
-      if (!Ret)
-        return false;
-      replaceCall(Ret);
-      return true;
-    }
-    return false;
-  }
-
-  if (Name == "__strncpy_chk" || Name == "__stpncpy_chk") {
-    // Check if this has the right signature.
-    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
-        !FT->getParamType(2)->isIntegerTy() ||
-        FT->getParamType(3) != TD->getIntPtrType(Context))
-      return false;
-
-    if (isFoldable(3, 2, false)) {
-      Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
-                               CI->getArgOperand(2), B, TD, TLI,
-                               Name.substr(2, 7));
-      if (!Ret)
-        return false;
-      replaceCall(Ret);
-      return true;
-    }
-    return false;
-  }
-
-  if (Name == "__strcat_chk") {
-    return false;
-  }
-
-  if (Name == "__strncat_chk") {
-    return false;
-  }
-
-  return false;
-}
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 6ce22b1..01e811f 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -21,7 +21,6 @@ add_llvm_library(LLVMTransformUtils
   LoopSimplify.cpp
   LoopUnroll.cpp
   LoopUnrollRuntime.cpp
-  LowerExpectIntrinsic.cpp
   LowerInvoke.cpp
   LowerSwitch.cpp
   Mem2Reg.cpp
@@ -37,6 +36,10 @@ add_llvm_library(LLVMTransformUtils
   UnifyFunctionExitNodes.cpp
   Utils.cpp
   ValueMapper.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Utils
   )
 
 add_dependencies(LLVMTransformUtils intrinsics_gen)
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 5c8f20d..09279b6 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -164,14 +164,13 @@ static MDNode* FindSubprogram(const Function *F, DebugInfoFinder &Finder) {
 
 // Add an operand to an existing MDNode. The new operand will be added at the
 // back of the operand list.
-static void AddOperand(MDNode *Node, Value *Operand) {
-  SmallVector<Value*, 16> Operands;
-  for (unsigned i = 0; i < Node->getNumOperands(); i++) {
-    Operands.push_back(Node->getOperand(i));
-  }
-  Operands.push_back(Operand);
-  MDNode *NewNode = MDNode::get(Node->getContext(), Operands);
-  Node->replaceAllUsesWith(NewNode);
+static void AddOperand(DICompileUnit CU, DIArray SPs, Metadata *NewSP) {
+  SmallVector<Metadata *, 16> NewSPs;
+  NewSPs.reserve(SPs->getNumOperands() + 1);
+  for (unsigned I = 0, E = SPs->getNumOperands(); I != E; ++I)
+    NewSPs.push_back(SPs->getOperand(I));
+  NewSPs.push_back(NewSP);
+  CU.replaceSubprograms(DIArray(MDNode::get(CU->getContext(), NewSPs)));
 }
 
 // Clone the module-level debug info associated with OldFunc. The cloned data
@@ -187,7 +186,7 @@ static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc,
   // Ensure that OldFunc appears in the map.
   // (if it's already there it must point to NewFunc anyway)
   VMap[OldFunc] = NewFunc;
-  DISubprogram NewSubprogram(MapValue(OldSubprogramMDNode, VMap));
+  DISubprogram NewSubprogram(MapMetadata(OldSubprogramMDNode, VMap));
 
   for (DICompileUnit CU : Finder.compile_units()) {
     DIArray Subprograms(CU.getSubprograms());
@@ -196,7 +195,8 @@ static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc,
     // also contain the new one.
     for (unsigned i = 0; i < Subprograms.getNumElements(); i++) {
       if ((MDNode*)Subprograms.getElement(i) == OldSubprogramMDNode) {
-        AddOperand(Subprograms, NewSubprogram);
+        AddOperand(CU, Subprograms, NewSubprogram);
+        break;
       }
     }
   }
@@ -260,21 +260,36 @@ namespace {
     const char *NameSuffix;
     ClonedCodeInfo *CodeInfo;
     const DataLayout *DL;
+    CloningDirector *Director;
+    ValueMapTypeRemapper *TypeMapper;
+    ValueMaterializer *Materializer;
+
   public:
     PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
                           ValueToValueMapTy &valueMap,
                           bool moduleLevelChanges,
                           const char *nameSuffix, 
                           ClonedCodeInfo *codeInfo,
-                          const DataLayout *DL)
+                          const DataLayout *DL,
+                          CloningDirector *Director)
     : NewFunc(newFunc), OldFunc(oldFunc),
       VMap(valueMap), ModuleLevelChanges(moduleLevelChanges),
-      NameSuffix(nameSuffix), CodeInfo(codeInfo), DL(DL) {
+      NameSuffix(nameSuffix), CodeInfo(codeInfo), DL(DL),
+      Director(Director) {
+      // These are optional components.  The Director may return null.
+      if (Director) {
+        TypeMapper = Director->getTypeRemapper();
+        Materializer = Director->getValueMaterializer();
+      } else {
+        TypeMapper = nullptr;
+        Materializer = nullptr;
+      }
     }
 
     /// CloneBlock - The specified block is found to be reachable, clone it and
     /// anything that it can reach.
-    void CloneBlock(const BasicBlock *BB,
+    void CloneBlock(const BasicBlock *BB, 
+                    BasicBlock::const_iterator StartingInst,
                     std::vector<const BasicBlock*> &ToClone);
   };
 }
@@ -282,6 +297,7 @@ namespace {
 /// CloneBlock - The specified block is found to be reachable, clone it and
 /// anything that it can reach.
 void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
+                                       BasicBlock::const_iterator StartingInst,
                                        std::vector<const BasicBlock*> &ToClone){
   WeakVH &BBEntry = VMap[BB];
 
@@ -307,21 +323,39 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
                                             const_cast<BasicBlock*>(BB));
     VMap[OldBBAddr] = BlockAddress::get(NewFunc, NewBB);
   }
-    
 
   bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
-  
+
   // Loop over all instructions, and copy them over, DCE'ing as we go.  This
   // loop doesn't include the terminator.
-  for (BasicBlock::const_iterator II = BB->begin(), IE = --BB->end();
+  for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end();
        II != IE; ++II) {
+    // If the "Director" remaps the instruction, don't clone it.
+    if (Director) {
+      CloningDirector::CloningAction Action 
+                              = Director->handleInstruction(VMap, II, NewBB);
+      // If the cloning director says stop, we want to stop everything, not
+      // just break out of the loop (which would cause the terminator to be
+      // cloned).  The cloning director is responsible for inserting a proper
+      // terminator into the new basic block in this case.
+      if (Action == CloningDirector::StopCloningBB)
+        return;
+      // If the cloning director says skip, continue to the next instruction.
+      // In this case, the cloning director is responsible for mapping the
+      // skipped instruction to some value that is defined in the new
+      // basic block.
+      if (Action == CloningDirector::SkipInstruction)
+        continue;
+    }
+
     Instruction *NewInst = II->clone();
 
     // Eagerly remap operands to the newly cloned instruction, except for PHI
     // nodes for which we defer processing until we update the CFG.
     if (!isa<PHINode>(NewInst)) {
       RemapInstruction(NewInst, VMap,
-                       ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+                       ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+                       TypeMapper, Materializer);
 
       // If we can simplify this instruction to some other value, simply add
       // a mapping to that value rather than inserting a new instruction into
@@ -354,6 +388,18 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
   // Finally, clone over the terminator.
   const TerminatorInst *OldTI = BB->getTerminator();
   bool TerminatorDone = false;
+  if (Director) {
+    CloningDirector::CloningAction Action 
+                           = Director->handleInstruction(VMap, OldTI, NewBB);
+    // If the cloning director says stop, we want to stop everything, not
+    // just break out of the loop (which would cause the terminator to be
+    // cloned).  The cloning director is responsible for inserting a proper
+    // terminator into the new basic block in this case.
+    if (Action == CloningDirector::StopCloningBB)
+      return;
+    assert(Action != CloningDirector::SkipInstruction && 
+           "SkipInstruction is not valid for terminators.");
+  }
   if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
     if (BI->isConditional()) {
       // If the condition was a known constant in the callee...
@@ -409,39 +455,55 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
   }
 }
 
-/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
-/// except that it does some simple constant prop and DCE on the fly.  The
-/// effect of this is to copy significantly less code in cases where (for
-/// example) a function call with constant arguments is inlined, and those
-/// constant arguments cause a significant amount of code in the callee to be
-/// dead.  Since this doesn't produce an exact copy of the input, it can't be
-/// used for things like CloneFunction or CloneModule.
-void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
+/// CloneAndPruneIntoFromInst - This works like CloneAndPruneFunctionInto, except
+/// that it does not clone the entire function. Instead it starts at an
+/// instruction provided by the caller and copies (and prunes) only the code 
+/// reachable from that instruction.
+void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
+                                     const Instruction *StartingInst,
                                      ValueToValueMapTy &VMap,
                                      bool ModuleLevelChanges,
-                                     SmallVectorImpl<ReturnInst*> &Returns,
+                                     SmallVectorImpl<ReturnInst *> &Returns,
                                      const char *NameSuffix, 
                                      ClonedCodeInfo *CodeInfo,
                                      const DataLayout *DL,
-                                     Instruction *TheCall) {
+                                     CloningDirector *Director) {
   assert(NameSuffix && "NameSuffix cannot be null!");
-  
+
+  ValueMapTypeRemapper *TypeMapper = nullptr;
+  ValueMaterializer *Materializer = nullptr;
+
+  if (Director) {
+    TypeMapper = Director->getTypeRemapper();
+    Materializer = Director->getValueMaterializer();
+  }
+
 #ifndef NDEBUG
-  for (Function::const_arg_iterator II = OldFunc->arg_begin(), 
-       E = OldFunc->arg_end(); II != E; ++II)
-    assert(VMap.count(II) && "No mapping from source argument specified!");
+  // If the cloning starts at the begining of the function, verify that
+  // the function arguments are mapped.
+  if (!StartingInst)
+    for (Function::const_arg_iterator II = OldFunc->arg_begin(),
+         E = OldFunc->arg_end(); II != E; ++II)
+      assert(VMap.count(II) && "No mapping from source argument specified!");
 #endif
 
   PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges,
-                            NameSuffix, CodeInfo, DL);
+                            NameSuffix, CodeInfo, DL, Director);
+  const BasicBlock *StartingBB;
+  if (StartingInst)
+    StartingBB = StartingInst->getParent();
+  else {
+    StartingBB = &OldFunc->getEntryBlock();
+    StartingInst = StartingBB->begin();
+  }
 
   // Clone the entry block, and anything recursively reachable from it.
   std::vector<const BasicBlock*> CloneWorklist;
-  CloneWorklist.push_back(&OldFunc->getEntryBlock());
+  PFC.CloneBlock(StartingBB, StartingInst, CloneWorklist);
   while (!CloneWorklist.empty()) {
     const BasicBlock *BB = CloneWorklist.back();
     CloneWorklist.pop_back();
-    PFC.CloneBlock(BB, CloneWorklist);
+    PFC.CloneBlock(BB, BB->begin(), CloneWorklist);
   }
   
   // Loop over all of the basic blocks in the old function.  If the block was
@@ -470,7 +532,8 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
     // Finally, remap the terminator instructions, as those can't be remapped
     // until all BBs are mapped.
     RemapInstruction(NewBB->getTerminator(), VMap,
-                     ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+                     ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+                     TypeMapper, Materializer);
   }
   
   // Defer PHI resolution until rest of function is resolved, PHI resolution
@@ -569,7 +632,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
   // and zap unconditional fall-through branches.  This happen all the time when
   // specializing code: code specialization turns conditional branches into
   // uncond branches, and this code folds them.
-  Function::iterator Begin = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]);
+  Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB]);
   Function::iterator I = Begin;
   while (I != NewFunc->end()) {
     // Check if this block has become dead during inlining or other
@@ -620,9 +683,30 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
   // Make a final pass over the basic blocks from theh old function to gather
   // any return instructions which survived folding. We have to do this here
   // because we can iteratively remove and merge returns above.
-  for (Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]),
+  for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB]),
                           E = NewFunc->end();
        I != E; ++I)
     if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator()))
       Returns.push_back(RI);
 }
+
+
+/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
+/// except that it does some simple constant prop and DCE on the fly.  The
+/// effect of this is to copy significantly less code in cases where (for
+/// example) a function call with constant arguments is inlined, and those
+/// constant arguments cause a significant amount of code in the callee to be
+/// dead.  Since this doesn't produce an exact copy of the input, it can't be
+/// used for things like CloneFunction or CloneModule.
+void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
+                                     ValueToValueMapTy &VMap,
+                                     bool ModuleLevelChanges,
+                                     SmallVectorImpl<ReturnInst*> &Returns,
+                                     const char *NameSuffix, 
+                                     ClonedCodeInfo *CodeInfo,
+                                     const DataLayout *DL,
+                                     Instruction *TheCall) {
+  CloneAndPruneIntoFromInst(NewFunc, OldFunc, OldFunc->front().begin(),
+                            VMap, ModuleLevelChanges, Returns, NameSuffix,
+                            CodeInfo, DL, nullptr);
+}
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index d078c96..fae9ff5 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -109,7 +109,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
        I != E; ++I) {
     GlobalAlias *GA = cast<GlobalAlias>(VMap[I]);
     if (const Constant *C = I->getAliasee())
-      GA->setAliasee(cast<GlobalObject>(MapValue(C, VMap)));
+      GA->setAliasee(MapValue(C, VMap));
   }
 
   // And named metadata....
@@ -118,7 +118,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
     const NamedMDNode &NMD = *I;
     NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());
     for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
-      NewNMD->addOperand(MapValue(NMD.getOperand(i), VMap));
+      NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap));
   }
 
   return New;
diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp
index 9972b22..003da58 100644
--- a/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -39,6 +39,19 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
                           F->getEntryBlock().begin());
   }
 
+  // We cannot demote invoke instructions to the stack if their normal edge
+  // is critical. Therefore, split the critical edge and create a basic block
+  // into which the store can be inserted.
+  if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
+    if (!II->getNormalDest()->getSinglePredecessor()) {
+      unsigned SuccNum = GetSuccessorNumber(II->getParent(), II->getNormalDest());
+      assert(isCriticalEdge(II, SuccNum) && "Expected a critical edge!");
+      BasicBlock *BB = SplitCriticalEdge(II, SuccNum);
+      assert(BB && "Unable to split critical edge.");
+      (void)BB;
+    }
+  }
+
   // Change all of the users of the instruction to read from the stack slot.
   while (!I.use_empty()) {
     Instruction *U = cast<Instruction>(I.user_back());
@@ -71,7 +84,6 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
     }
   }
 
-
   // Insert stores of the computed value into the stack slot. We have to be
   // careful if I is an invoke instruction, because we can't insert the store
   // AFTER the terminator instruction.
@@ -79,27 +91,13 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
   if (!isa<TerminatorInst>(I)) {
     InsertPt = &I;
     ++InsertPt;
+    for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt)
+      /* empty */;   // Don't insert before PHI nodes or landingpad instrs.
   } else {
     InvokeInst &II = cast<InvokeInst>(I);
-    if (II.getNormalDest()->getSinglePredecessor())
-      InsertPt = II.getNormalDest()->getFirstInsertionPt();
-    else {
-      // We cannot demote invoke instructions to the stack if their normal edge
-      // is critical.  Therefore, split the critical edge and insert the store
-      // in the newly created basic block.
-      unsigned SuccNum = GetSuccessorNumber(I.getParent(), II.getNormalDest());
-      TerminatorInst *TI = &cast<TerminatorInst>(I);
-      assert (isCriticalEdge(TI, SuccNum) &&
-              "Expected a critical edge!");
-      BasicBlock *BB = SplitCriticalEdge(TI, SuccNum);
-      assert (BB && "Unable to split critical edge.");
-      InsertPt = BB->getFirstInsertionPt();
-    }
+    InsertPt = II.getNormalDest()->getFirstInsertionPt();
   }
 
-  for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt)
-    /* empty */;   // Don't insert before PHI nodes or landingpad instrs.
-
   new StoreInst(&I, Slot, InsertPt);
   return Slot;
 }
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 2d0b7dc..c2ef1ac 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -18,7 +18,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -30,6 +30,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -308,7 +309,7 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
 
   // Walk the existing metadata, adding the complete (perhaps cyclic) chain to
   // the set.
-  SmallVector<const Value *, 16> Queue(MD.begin(), MD.end());
+  SmallVector<const Metadata *, 16> Queue(MD.begin(), MD.end());
   while (!Queue.empty()) {
     const MDNode *M = cast<MDNode>(Queue.pop_back_val());
     for (unsigned i = 0, ie = M->getNumOperands(); i != ie; ++i)
@@ -319,13 +320,12 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
 
   // Now we have a complete set of all metadata in the chains used to specify
   // the noalias scopes and the lists of those scopes.
-  SmallVector<MDNode *, 16> DummyNodes;
-  DenseMap<const MDNode *, TrackingVH<MDNode> > MDMap;
+  SmallVector<TempMDTuple, 16> DummyNodes;
+  DenseMap<const MDNode *, TrackingMDNodeRef> MDMap;
   for (SetVector<const MDNode *>::iterator I = MD.begin(), IE = MD.end();
        I != IE; ++I) {
-    MDNode *Dummy = MDNode::getTemporary(CalledFunc->getContext(), None);
-    DummyNodes.push_back(Dummy);
-    MDMap[*I] = Dummy;
+    DummyNodes.push_back(MDTuple::getTemporary(CalledFunc->getContext(), None));
+    MDMap[*I].reset(DummyNodes.back().get());
   }
 
   // Create new metadata nodes to replace the dummy nodes, replacing old
@@ -333,17 +333,18 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
   // node.
   for (SetVector<const MDNode *>::iterator I = MD.begin(), IE = MD.end();
        I != IE; ++I) {
-    SmallVector<Value *, 4> NewOps;
+    SmallVector<Metadata *, 4> NewOps;
     for (unsigned i = 0, ie = (*I)->getNumOperands(); i != ie; ++i) {
-      const Value *V = (*I)->getOperand(i);
+      const Metadata *V = (*I)->getOperand(i);
       if (const MDNode *M = dyn_cast<MDNode>(V))
         NewOps.push_back(MDMap[M]);
       else
-        NewOps.push_back(const_cast<Value *>(V));
+        NewOps.push_back(const_cast<Metadata *>(V));
     }
 
-    MDNode *NewM = MDNode::get(CalledFunc->getContext(), NewOps),
-           *TempM = MDMap[*I];
+    MDNode *NewM = MDNode::get(CalledFunc->getContext(), NewOps);
+    MDTuple *TempM = cast<MDTuple>(MDMap[*I]);
+    assert(TempM->isTemporary() && "Expected temporary node");
 
     TempM->replaceAllUsesWith(NewM);
   }
@@ -388,10 +389,6 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
         NI->setMetadata(LLVMContext::MD_noalias, M);
     }
   }
-
-  // Now that everything has been replaced, delete the dummy nodes.
-  for (unsigned i = 0, ie = DummyNodes.size(); i != ie; ++i)
-    MDNode::deleteTemporary(DummyNodes[i]);
 }
 
 /// AddAliasScopeMetadata - If the inlined function has noalias arguments, then
@@ -516,7 +513,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
       // need to go through several PHIs to see it, and thus could be
       // repeated in the Objects list.
       SmallPtrSet<const Value *, 4> ObjSet;
-      SmallVector<Value *, 4> Scopes, NoAliases;
+      SmallVector<Metadata *, 4> Scopes, NoAliases;
 
       SmallSetVector<const Argument *, 4> NAPtrArgs;
       for (unsigned i = 0, ie = PtrArgs.size(); i != ie; ++i) {
@@ -633,9 +630,10 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
   DominatorTree DT;
   bool DTCalculated = false;
 
-  const Function *CalledFunc = CS.getCalledFunction();
-  for (Function::const_arg_iterator I = CalledFunc->arg_begin(),
-       E = CalledFunc->arg_end(); I != E; ++I) {
+  Function *CalledFunc = CS.getCalledFunction();
+  for (Function::arg_iterator I = CalledFunc->arg_begin(),
+                              E = CalledFunc->arg_end();
+       I != E; ++I) {
     unsigned Align = I->getType()->isPointerTy() ? I->getParamAlignment() : 0;
     if (Align && !I->hasByValOrInAllocaAttr() && !I->hasNUses(0)) {
       if (!DTCalculated) {
@@ -647,8 +645,9 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
       // If we can already prove the asserted alignment in the context of the
       // caller, then don't bother inserting the assumption.
       Value *Arg = CS.getArgument(I->getArgNo());
-      if (getKnownAlignment(Arg, IFI.DL, IFI.AT, CS.getInstruction(),
-                            &DT) >= Align)
+      if (getKnownAlignment(Arg, IFI.DL,
+                            &IFI.ACT->getAssumptionCache(*CalledFunc),
+                            CS.getInstruction(), &DT) >= Align)
         continue;
 
       IRBuilder<>(CS.getInstruction()).CreateAlignmentAssumption(*IFI.DL, Arg,
@@ -748,6 +747,8 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
   PointerType *ArgTy = cast<PointerType>(Arg->getType());
   Type *AggTy = ArgTy->getElementType();
 
+  Function *Caller = TheCall->getParent()->getParent();
+
   // If the called function is readonly, then it could not mutate the caller's
   // copy of the byval'd memory.  In this case, it is safe to elide the copy and
   // temporary.
@@ -760,8 +761,9 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
 
     // If the pointer is already known to be sufficiently aligned, or if we can
     // round it up to a larger alignment, then we don't need a temporary.
-    if (getOrEnforceKnownAlignment(Arg, ByValAlignment,
-                                   IFI.DL, IFI.AT, TheCall) >= ByValAlignment)
+    if (getOrEnforceKnownAlignment(Arg, ByValAlignment, IFI.DL,
+                                   &IFI.ACT->getAssumptionCache(*Caller),
+                                   TheCall) >= ByValAlignment)
       return Arg;
     
     // Otherwise, we have to make a memcpy to get a safe alignment.  This is bad
@@ -778,8 +780,6 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
   // pointer inside the callee).
   Align = std::max(Align, ByValAlignment);
   
-  Function *Caller = TheCall->getParent()->getParent(); 
-  
   Value *NewAlloca = new AllocaInst(AggTy, nullptr, Align, Arg->getName(), 
                                     &*Caller->begin()->begin());
   IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
@@ -824,20 +824,42 @@ static bool hasLifetimeMarkers(AllocaInst *AI) {
   return false;
 }
 
-/// updateInlinedAtInfo - Helper function used by fixupLineNumbers to
-/// recursively update InlinedAtEntry of a DebugLoc.
-static DebugLoc updateInlinedAtInfo(const DebugLoc &DL, 
-                                    const DebugLoc &InlinedAtDL,
-                                    LLVMContext &Ctx) {
-  if (MDNode *IA = DL.getInlinedAt(Ctx)) {
-    DebugLoc NewInlinedAtDL 
-      = updateInlinedAtInfo(DebugLoc::getFromDILocation(IA), InlinedAtDL, Ctx);
-    return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx),
-                         NewInlinedAtDL.getAsMDNode(Ctx));
+/// Rebuild the entire inlined-at chain for this instruction so that the top of
+/// the chain now is inlined-at the new call site.
+static DebugLoc
+updateInlinedAtInfo(DebugLoc DL, MDLocation *InlinedAtNode,
+                    LLVMContext &Ctx,
+                    DenseMap<const MDLocation *, MDLocation *> &IANodes) {
+  SmallVector<MDLocation*, 3> InlinedAtLocations;
+  MDLocation *Last = InlinedAtNode;
+  DebugLoc CurInlinedAt = DL;
+
+  // Gather all the inlined-at nodes
+  while (MDLocation *IA =
+             cast_or_null<MDLocation>(CurInlinedAt.getInlinedAt(Ctx))) {
+    // Skip any we've already built nodes for
+    if (MDLocation *Found = IANodes[IA]) {
+      Last = Found;
+      break;
+    }
+
+    InlinedAtLocations.push_back(IA);
+    CurInlinedAt = DebugLoc::getFromDILocation(IA);
+  }
+
+  // Starting from the top, rebuild the nodes to point to the new inlined-at
+  // location (then rebuilding the rest of the chain behind it) and update the
+  // map of already-constructed inlined-at nodes.
+  for (auto I = InlinedAtLocations.rbegin(), E = InlinedAtLocations.rend();
+       I != E; ++I) {
+    const MDLocation *MD = *I;
+    Last = IANodes[MD] = MDLocation::getDistinct(
+        Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
   }
 
-  return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx),
-                       InlinedAtDL.getAsMDNode(Ctx));
+  // And finally create the normal location for this instruction, referring to
+  // the new inlined-at chain.
+  return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx), Last);
 }
 
 /// fixupLineNumbers - Update inlined instructions' line numbers to 
@@ -848,6 +870,20 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
   if (TheCallDL.isUnknown())
     return;
 
+  auto &Ctx = Fn->getContext();
+  auto *InlinedAtNode = cast<MDLocation>(TheCallDL.getAsMDNode(Ctx));
+
+  // Create a unique call site, not to be confused with any other call from the
+  // same location.
+  InlinedAtNode = MDLocation::getDistinct(
+      Ctx, InlinedAtNode->getLine(), InlinedAtNode->getColumn(),
+      InlinedAtNode->getScope(), InlinedAtNode->getInlinedAt());
+
+  // Cache the inlined-at nodes as they're built so they are reused, without
+  // this every instruction's inlined-at chain would become distinct from each
+  // other.
+  DenseMap<const MDLocation *, MDLocation *> IANodes;
+
   for (; FI != Fn->end(); ++FI) {
     for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
          BI != BE; ++BI) {
@@ -865,12 +901,19 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
 
         BI->setDebugLoc(TheCallDL);
       } else {
-        BI->setDebugLoc(updateInlinedAtInfo(DL, TheCallDL, BI->getContext()));
+        BI->setDebugLoc(updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes));
         if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(BI)) {
           LLVMContext &Ctx = BI->getContext();
           MDNode *InlinedAt = BI->getDebugLoc().getInlinedAt(Ctx);
-          DVI->setOperand(2, createInlinedVariable(DVI->getVariable(), 
-                                                   InlinedAt, Ctx));
+          DVI->setOperand(2, MetadataAsValue::get(
+                                 Ctx, createInlinedVariable(DVI->getVariable(),
+                                                            InlinedAt, Ctx)));
+        } else if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(BI)) {
+          LLVMContext &Ctx = BI->getContext();
+          MDNode *InlinedAt = BI->getDebugLoc().getInlinedAt(Ctx);
+          DDI->setOperand(1, MetadataAsValue::get(
+                                 Ctx, createInlinedVariable(DDI->getVariable(),
+                                                            InlinedAt, Ctx)));
         }
       }
     }
@@ -1026,8 +1069,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
 
     // FIXME: We could register any cloned assumptions instead of clearing the
     // whole function's cache.
-    if (IFI.AT)
-      IFI.AT->forgetCachedAssumptions(Caller);
+    if (IFI.ACT)
+      IFI.ACT->getAssumptionCache(*Caller).clear();
   }
 
   // If there are any alloca instructions in the block that used to be the entry
@@ -1069,6 +1112,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
                                                    FirstNewBlock->getInstList(),
                                                    AI, I);
     }
+    // Move any dbg.declares describing the allocas into the entry basic block.
+    DIBuilder DIB(*Caller->getParent());
+    for (auto &AI : IFI.StaticAllocas)
+      replaceDbgDeclareForAlloca(AI, AI, DIB, /*Deref=*/false);
   }
 
   bool InlinedMustTailCalls = false;
@@ -1398,7 +1445,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
   // the entries are the same or undef).  If so, remove the PHI so it doesn't
   // block other optimizations.
   if (PHI) {
-    if (Value *V = SimplifyInstruction(PHI, IFI.DL, nullptr, nullptr, IFI.AT)) {
+    if (Value *V = SimplifyInstruction(PHI, IFI.DL, nullptr, nullptr,
+                                       &IFI.ACT->getAssumptionCache(*Caller))) {
       PHI->replaceAllUsesWith(V);
       PHI->eraseFromParent();
     }
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index 51a3d9c..1cba367 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -61,7 +61,7 @@ static bool isExitBlock(BasicBlock *BB,
 /// uses.
 static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,
                                const SmallVectorImpl<BasicBlock *> &ExitBlocks,
-                               PredIteratorCache &PredCache) {
+                               PredIteratorCache &PredCache, LoopInfo *LI) {
   SmallVector<Use *, 16> UsesToRewrite;
 
   BasicBlock *InstBB = Inst.getParent();
@@ -94,6 +94,7 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,
   DomTreeNode *DomNode = DT.getNode(DomBB);
 
   SmallVector<PHINode *, 16> AddedPHIs;
+  SmallVector<PHINode *, 8> PostProcessPHIs;
 
   SSAUpdater SSAUpdate;
   SSAUpdate.Initialize(Inst.getType(), Inst.getName());
@@ -131,6 +132,18 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,
 
     // Remember that this phi makes the value alive in this block.
     SSAUpdate.AddAvailableValue(ExitBB, PN);
+
+    // LoopSimplify might fail to simplify some loops (e.g. when indirect
+    // branches are involved). In such situations, it might happen that an exit
+    // for Loop L1 is the header of a disjoint Loop L2. Thus, when we create
+    // PHIs in such an exit block, we are also inserting PHIs into L2's header.
+    // This could break LCSSA form for L2 because these inserted PHIs can also
+    // have uses outside of L2. Remember all PHIs in such situation as to
+    // revisit than later on. FIXME: Remove this if indirectbr support into
+    // LoopSimplify gets improved.
+    if (auto *OtherLoop = LI->getLoopFor(ExitBB))
+      if (!L.contains(OtherLoop))
+        PostProcessPHIs.push_back(PN);
   }
 
   // Rewrite all uses outside the loop in terms of the new PHIs we just
@@ -157,6 +170,25 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,
     SSAUpdate.RewriteUse(*UsesToRewrite[i]);
   }
 
+  // Post process PHI instructions that were inserted into another disjoint loop
+  // and update their exits properly.
+  for (auto *I : PostProcessPHIs) {
+    if (I->use_empty())
+      continue;
+
+    BasicBlock *PHIBB = I->getParent();
+    Loop *OtherLoop = LI->getLoopFor(PHIBB);
+    SmallVector<BasicBlock *, 8> EBs;
+    OtherLoop->getExitBlocks(EBs);
+    if (EBs.empty())
+      continue;
+
+    // Recurse and re-process each PHI instruction. FIXME: we should really
+    // convert this entire thing to a worklist approach where we process a
+    // vector of instructions...
+    processInstruction(*OtherLoop, *I, DT, EBs, PredCache, LI);
+  }
+
   // Remove PHI nodes that did not have any uses rewritten.
   for (unsigned i = 0, e = AddedPHIs.size(); i != e; ++i) {
     if (AddedPHIs[i]->use_empty())
@@ -180,7 +212,8 @@ blockDominatesAnExit(BasicBlock *BB,
   return false;
 }
 
-bool llvm::formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE) {
+bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,
+                     ScalarEvolution *SE) {
   bool Changed = false;
 
   // Get the set of exiting blocks.
@@ -212,7 +245,7 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE) {
            !isa<PHINode>(I->user_back())))
         continue;
 
-      Changed |= processInstruction(L, *I, DT, ExitBlocks, PredCache);
+      Changed |= processInstruction(L, *I, DT, ExitBlocks, PredCache, LI);
     }
   }
 
@@ -228,15 +261,15 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE) {
 }
 
 /// Process a loop nest depth first.
-bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT,
+bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
                                 ScalarEvolution *SE) {
   bool Changed = false;
 
   // Recurse depth-first through inner loops.
-  for (Loop::iterator LI = L.begin(), LE = L.end(); LI != LE; ++LI)
-    Changed |= formLCSSARecursively(**LI, DT, SE);
+  for (Loop::iterator I = L.begin(), E = L.end(); I != E; ++I)
+    Changed |= formLCSSARecursively(**I, DT, LI, SE);
 
-  Changed |= formLCSSA(L, DT, SE);
+  Changed |= formLCSSA(L, DT, LI, SE);
   return Changed;
 }
 
@@ -261,7 +294,7 @@ struct LCSSA : public FunctionPass {
     AU.setPreservesCFG();
 
     AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<LoopInfo>();
+    AU.addRequired<LoopInfoWrapperPass>();
     AU.addPreservedID(LoopSimplifyID);
     AU.addPreserved<AliasAnalysis>();
     AU.addPreserved<ScalarEvolution>();
@@ -275,7 +308,7 @@ private:
 char LCSSA::ID = 0;
 INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false)
 
 Pass *llvm::createLCSSAPass() { return new LCSSA(); }
@@ -285,13 +318,13 @@ char &llvm::LCSSAID = LCSSA::ID;
 /// Process all loops in the function, inner-most out.
 bool LCSSA::runOnFunction(Function &F) {
   bool Changed = false;
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   SE = getAnalysisIfAvailable<ScalarEvolution>();
 
   // Simplify each loop nest in the function.
   for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
-    Changed |= formLCSSARecursively(**I, *DT, SE);
+    Changed |= formLCSSARecursively(**I, *DT, LI, SE);
 
   return Changed;
 }
diff --git a/lib/Transforms/Utils/LLVMBuild.txt b/lib/Transforms/Utils/LLVMBuild.txt
index 88b2ffe..6b2d405 100644
--- a/lib/Transforms/Utils/LLVMBuild.txt
+++ b/lib/Transforms/Utils/LLVMBuild.txt
@@ -19,4 +19,4 @@
 type = Library
 name = TransformUtils
 parent = Transforms
-required_libraries = Analysis Core IPA Support Target
+required_libraries = Analysis Core IPA Support
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index c963c51..4830568 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LibCallSemantics.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -110,11 +111,17 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
   }
 
   if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
-    // If we are switching on a constant, we can convert the switch into a
-    // single branch instruction!
+    // If we are switching on a constant, we can convert the switch to an
+    // unconditional branch.
     ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition());
-    BasicBlock *TheOnlyDest = SI->getDefaultDest();
-    BasicBlock *DefaultDest = TheOnlyDest;
+    BasicBlock *DefaultDest = SI->getDefaultDest();
+    BasicBlock *TheOnlyDest = DefaultDest;
+
+    // If the default is unreachable, ignore it when searching for TheOnlyDest.
+    if (isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()) &&
+        SI->getNumCases() > 0) {
+      TheOnlyDest = SI->case_begin().getCaseSuccessor();
+    }
 
     // Figure out which case it goes to.
     for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
@@ -137,7 +144,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
           SmallVector<uint32_t, 8> Weights;
           for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
                ++MD_i) {
-            ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i));
+            ConstantInt *CI =
+                mdconst::dyn_extract<ConstantInt>(MD->getOperand(MD_i));
             assert(CI);
             Weights.push_back(CI->getValue().getZExtValue());
           }
@@ -208,8 +216,10 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
                                                SI->getDefaultDest());
       MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
       if (MD && MD->getNumOperands() == 3) {
-        ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
-        ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
+        ConstantInt *SICase =
+            mdconst::dyn_extract<ConstantInt>(MD->getOperand(2));
+        ConstantInt *SIDef =
+            mdconst::dyn_extract<ConstantInt>(MD->getOperand(1));
         assert(SICase && SIDef);
         // The TrueWeight should be the weight for the single case of SI.
         NewBr->setMetadata(LLVMContext::MD_prof,
@@ -486,7 +496,7 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
 /// between them, moving the instructions in the predecessor into DestBB and
 /// deleting the predecessor block.
 ///
-void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) {
+void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {
   // If BB has single-entry PHI nodes, fold them.
   while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
     Value *NewVal = PN->getIncomingValue(0);
@@ -522,14 +532,10 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) {
   if (PredBB == &DestBB->getParent()->getEntryBlock())
     DestBB->moveAfter(PredBB);
 
-  if (P) {
-    if (DominatorTreeWrapperPass *DTWP =
-            P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
-      DominatorTree &DT = DTWP->getDomTree();
-      BasicBlock *PredBBIDom = DT.getNode(PredBB)->getIDom()->getBlock();
-      DT.changeImmediateDominator(DestBB, PredBBIDom);
-      DT.eraseNode(PredBB);
-    }
+  if (DT) {
+    BasicBlock *PredBBIDom = DT->getNode(PredBB)->getIDom()->getBlock();
+    DT->changeImmediateDominator(DestBB, PredBBIDom);
+    DT->eraseNode(PredBB);
   }
   // Nuke BB.
   PredBB->eraseFromParent();
@@ -940,7 +946,7 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align,
 /// increase the alignment of the ultimate object, making this check succeed.
 unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
                                           const DataLayout *DL,
-                                          AssumptionTracker *AT,
+                                          AssumptionCache *AC,
                                           const Instruction *CxtI,
                                           const DominatorTree *DT) {
   assert(V->getType()->isPointerTy() &&
@@ -948,7 +954,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
   unsigned BitWidth = DL ? DL->getPointerTypeSizeInBits(V->getType()) : 64;
 
   APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-  computeKnownBits(V, KnownZero, KnownOne, DL, 0, AT, CxtI, DT);
+  computeKnownBits(V, KnownZero, KnownOne, DL, 0, AC, CxtI, DT);
   unsigned TrailZ = KnownZero.countTrailingOnes();
 
   // Avoid trouble with ridiculously large TrailZ values, such as
@@ -1048,7 +1054,7 @@ static bool isArray(AllocaInst *AI) {
 /// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set
 /// of llvm.dbg.value intrinsics.
 bool llvm::LowerDbgDeclare(Function &F) {
-  DIBuilder DIB(*F.getParent());
+  DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
   SmallVector<DbgDeclareInst *, 4> Dbgs;
   for (auto &FI : F)
     for (BasicBlock::iterator BI : FI)
@@ -1091,19 +1097,21 @@ bool llvm::LowerDbgDeclare(Function &F) {
 /// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the
 /// alloca 'V', if any.
 DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
-  if (MDNode *DebugNode = MDNode::getIfExists(V->getContext(), V))
-    for (User *U : DebugNode->users())
-      if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
-        return DDI;
+  if (auto *L = LocalAsMetadata::getIfExists(V))
+    if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
+      for (User *U : MDV->users())
+        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+          return DDI;
 
   return nullptr;
 }
 
 bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
-                                      DIBuilder &Builder) {
+                                      DIBuilder &Builder, bool Deref) {
   DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI);
   if (!DDI)
     return false;
+  DebugLoc Loc = DDI->getDebugLoc();
   DIVariable DIVar(DDI->getVariable());
   DIExpression DIExpr(DDI->getExpression());
   assert((!DIVar || DIVar.isVariable()) &&
@@ -1111,23 +1119,24 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
   if (!DIVar)
     return false;
 
-  // Create a copy of the original DIDescriptor for user variable, appending
-  // "deref" operation to a list of address elements, as new llvm.dbg.declare
-  // will take a value storing address of the memory for variable, not
-  // alloca itself.
-  SmallVector<int64_t, 4> NewDIExpr;
-  if (DIExpr) {
-    for (unsigned i = 0, n = DIExpr.getNumElements(); i < n; ++i) {
-      NewDIExpr.push_back(DIExpr.getElement(i));
-    }
+  if (Deref) {
+    // Create a copy of the original DIDescriptor for user variable, prepending
+    // "deref" operation to a list of address elements, as new llvm.dbg.declare
+    // will take a value storing address of the memory for variable, not
+    // alloca itself.
+    SmallVector<uint64_t, 4> NewDIExpr;
+    NewDIExpr.push_back(dwarf::DW_OP_deref);
+    if (DIExpr)
+      for (unsigned i = 0, n = DIExpr.getNumElements(); i < n; ++i)
+        NewDIExpr.push_back(DIExpr.getElement(i));
+    DIExpr = Builder.createExpression(NewDIExpr);
   }
-  NewDIExpr.push_back(dwarf::DW_OP_deref);
 
   // Insert llvm.dbg.declare in the same basic block as the original alloca,
   // and remove old llvm.dbg.declare.
   BasicBlock *BB = AI->getParent();
-  Builder.insertDeclare(NewAllocaAddress, DIVar,
-                        Builder.createExpression(NewDIExpr), BB);
+  Builder.insertDeclare(NewAllocaAddress, DIVar, DIExpr, BB)
+    ->setDebugLoc(Loc);
   DDI->eraseFromParent();
   return true;
 }
@@ -1252,7 +1261,7 @@ static bool markAliveBlocks(BasicBlock *BB,
       if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
         changeToUnreachable(II, true);
         Changed = true;
-      } else if (II->doesNotThrow()) {
+      } else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(II)) {
         if (II->use_empty() && II->onlyReadsMemory()) {
           // jump to the normal destination branch.
           BranchInst::Create(II->getNormalDest(), II);
@@ -1326,6 +1335,8 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsign
         K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
         break;
       case LLVMContext::MD_alias_scope:
+        K->setMetadata(Kind, MDNode::getMostGenericAliasScope(JMD, KMD));
+        break;
       case LLVMContext::MD_noalias:
         K->setMetadata(Kind, MDNode::intersect(JMD, KMD));
         break;
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index af0501f..a0f8268 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -44,7 +44,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -113,6 +113,14 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB,
 BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) {
   BasicBlock *Header = L->getHeader();
 
+  // Get analyses that we try to update.
+  auto *AA = PP->getAnalysisIfAvailable<AliasAnalysis>();
+  auto *DTWP = PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  auto *LIWP = PP->getAnalysisIfAvailable<LoopInfoWrapperPass>();
+  auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+  bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID);
+
   // Compute the set of predecessors of the loop that are not in the loop.
   SmallVector<BasicBlock*, 8> OutsideBlocks;
   for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
@@ -131,15 +139,8 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) {
 
   // Split out the loop pre-header.
   BasicBlock *PreheaderBB;
-  if (!Header->isLandingPad()) {
-    PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader",
-                                         PP);
-  } else {
-    SmallVector<BasicBlock*, 2> NewBBs;
-    SplitLandingPadPredecessors(Header, OutsideBlocks, ".preheader",
-                                ".split-lp", PP, NewBBs);
-    PreheaderBB = NewBBs[0];
-  }
+  PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader",
+                                       AA, DT, LI, PreserveLCSSA);
 
   PreheaderBB->getTerminator()->setDebugLoc(
                                       Header->getFirstNonPHI()->getDebugLoc());
@@ -157,7 +158,9 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) {
 ///
 /// This method is used to split exit blocks that have predecessors outside of
 /// the loop.
-static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, Pass *PP) {
+static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit,
+                                        AliasAnalysis *AA, DominatorTree *DT,
+                                        LoopInfo *LI, Pass *PP) {
   SmallVector<BasicBlock*, 8> LoopBlocks;
   for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) {
     BasicBlock *P = *I;
@@ -172,15 +175,10 @@ static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, Pass *PP) {
   assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?");
   BasicBlock *NewExitBB = nullptr;
 
-  if (Exit->isLandingPad()) {
-    SmallVector<BasicBlock*, 2> NewBBs;
-    SplitLandingPadPredecessors(Exit, LoopBlocks,
-                                ".loopexit", ".nonloopexit",
-                                PP, NewBBs);
-    NewExitBB = NewBBs[0];
-  } else {
-    NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", PP);
-  }
+  bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID);
+
+  NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", AA, DT,
+                                     LI, PreserveLCSSA);
 
   DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
                << NewExitBB->getName() << "\n");
@@ -210,11 +208,11 @@ static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
 /// us how to partition the loops.
 static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA,
                                         DominatorTree *DT,
-                                        AssumptionTracker *AT) {
+                                        AssumptionCache *AC) {
   for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
     PHINode *PN = cast<PHINode>(I);
     ++I;
-    if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AT)) {
+    if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AC)) {
       // This is a degenerate PHI already, don't modify it!
       PN->replaceAllUsesWith(V);
       if (AA) AA->deleteValue(PN);
@@ -254,7 +252,7 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA,
 static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
                                 AliasAnalysis *AA, DominatorTree *DT,
                                 LoopInfo *LI, ScalarEvolution *SE, Pass *PP,
-                                AssumptionTracker *AT) {
+                                AssumptionCache *AC) {
   // Don't try to separate loops without a preheader.
   if (!Preheader)
     return nullptr;
@@ -263,7 +261,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
   assert(!L->getHeader()->isLandingPad() &&
          "Can't insert backedge to landing pad");
 
-  PHINode *PN = findPHIToPartitionLoops(L, AA, DT, AT);
+  PHINode *PN = findPHIToPartitionLoops(L, AA, DT, AC);
   if (!PN) return nullptr;  // No known way to partition.
 
   // Pull out all predecessors that have varying values in the loop.  This
@@ -287,9 +285,11 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
   if (SE)
     SE->forgetLoop(L);
 
+  bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID);
+
   BasicBlock *Header = L->getHeader();
-  BasicBlock *NewBB =
-    SplitBlockPredecessors(Header, OuterLoopPreds,  ".outer", PP);
+  BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer",
+                                             AA, DT, LI, PreserveLCSSA);
 
   // Make sure that NewBB is put someplace intelligent, which doesn't mess up
   // code layout too horribly.
@@ -460,7 +460,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
 
   // Update Loop Information - we know that this block is now in the current
   // loop and all parent loops.
-  L->addBasicBlockToLoop(BEBlock, LI->getBase());
+  L->addBasicBlockToLoop(BEBlock, *LI);
 
   // Update dominator information
   DT->splitBlock(BEBlock);
@@ -476,8 +476,8 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
 /// explicit if they accepted the analysis directly and then updated it.
 static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
                             AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI,
-                            ScalarEvolution *SE, Pass *PP,
-                            const DataLayout *DL, AssumptionTracker *AT) {
+                            ScalarEvolution *SE, Pass *PP, const DataLayout *DL,
+                            AssumptionCache *AC) {
   bool Changed = false;
 ReprocessLoop:
 
@@ -567,7 +567,7 @@ ReprocessLoop:
       // Must be exactly this loop: no subloops, parent loops, or non-loop preds
       // allowed.
       if (!L->contains(*PI)) {
-        if (rewriteLoopExitBlock(L, ExitBlock, PP)) {
+        if (rewriteLoopExitBlock(L, ExitBlock, AA, DT, LI, PP)) {
           ++NumInserted;
           Changed = true;
         }
@@ -583,8 +583,8 @@ ReprocessLoop:
     // this for loops with a giant number of backedges, just factor them into a
     // common backedge instead.
     if (L->getNumBackEdges() < 8) {
-      if (Loop *OuterL = separateNestedLoop(L, Preheader, AA, DT, LI, SE,
-                                            PP, AT)) {
+      if (Loop *OuterL =
+              separateNestedLoop(L, Preheader, AA, DT, LI, SE, PP, AC)) {
         ++NumNested;
         // Enqueue the outer loop as it should be processed next in our
         // depth-first nest walk.
@@ -614,7 +614,7 @@ ReprocessLoop:
   PHINode *PN;
   for (BasicBlock::iterator I = L->getHeader()->begin();
        (PN = dyn_cast<PHINode>(I++)); )
-    if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AT)) {
+    if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AC)) {
       if (AA) AA->deleteValue(PN);
       if (SE) SE->forgetValue(PN);
       PN->replaceAllUsesWith(V);
@@ -714,7 +714,7 @@ ReprocessLoop:
 
 bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
                         AliasAnalysis *AA, ScalarEvolution *SE,
-                        const DataLayout *DL, AssumptionTracker *AT) {
+                        const DataLayout *DL, AssumptionCache *AC) {
   bool Changed = false;
 
   // Worklist maintains our depth-first queue of loops in this nest to process.
@@ -726,13 +726,12 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
   // order. We can use this simple process because loops form a tree.
   for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
     Loop *L2 = Worklist[Idx];
-    for (Loop::iterator I = L2->begin(), E = L2->end(); I != E; ++I)
-      Worklist.push_back(*I);
+    Worklist.append(L2->begin(), L2->end());
   }
 
   while (!Worklist.empty())
     Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI,
-                               SE, PP, DL, AT);
+                               SE, PP, DL, AC);
 
   return Changed;
 }
@@ -751,19 +750,19 @@ namespace {
     LoopInfo *LI;
     ScalarEvolution *SE;
     const DataLayout *DL;
-    AssumptionTracker *AT;
+    AssumptionCache *AC;
 
     bool runOnFunction(Function &F) override;
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<AssumptionTracker>();
+      AU.addRequired<AssumptionCacheTracker>();
 
       // We need loop information to identify the loops...
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addPreserved<DominatorTreeWrapperPass>();
 
-      AU.addRequired<LoopInfo>();
-      AU.addPreserved<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
 
       AU.addPreserved<AliasAnalysis>();
       AU.addPreserved<ScalarEvolution>();
@@ -779,9 +778,9 @@ namespace {
 char LoopSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
                 "Canonicalize natural loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
                 "Canonicalize natural loops", false, false)
 
@@ -795,16 +794,16 @@ Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
 bool LoopSimplify::runOnFunction(Function &F) {
   bool Changed = false;
   AA = getAnalysisIfAvailable<AliasAnalysis>();
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   SE = getAnalysisIfAvailable<ScalarEvolution>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
-  AT = &getAnalysis<AssumptionTracker>();
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
 
   // Simplify each loop nest in the function.
   for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
-    Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, DL, AT);
+    Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, DL, AC);
 
   return Changed;
 }
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index 0e1baa1..accb731 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -19,7 +19,7 @@
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -154,9 +154,8 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM,
 /// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are
 /// available from the Pass it must also preserve those analyses.
 bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
-                      bool AllowRuntime, unsigned TripMultiple,
-                      LoopInfo *LI, Pass *PP, LPPassManager *LPM,
-                      AssumptionTracker *AT) {
+                      bool AllowRuntime, unsigned TripMultiple, LoopInfo *LI,
+                      Pass *PP, LPPassManager *LPM, AssumptionCache *AC) {
   BasicBlock *Preheader = L->getLoopPreheader();
   if (!Preheader) {
     DEBUG(dbgs() << "  Can't unroll; loop preheader-insertion failed.\n");
@@ -312,7 +311,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
       // Tell LI about New.
       if (*BB == Header) {
         assert(LI->getLoopFor(*BB) == L && "Header should not be in a sub-loop");
-        L->addBasicBlockToLoop(New, LI->getBase());
+        L->addBasicBlockToLoop(New, *LI);
       } else {
         // Figure out which loop New is in.
         const Loop *OldLoop = LI->getLoopFor(*BB);
@@ -334,7 +333,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
           if (SE)
             SE->forgetLoop(OldLoop);
         }
-        NewLoop->addBasicBlockToLoop(New, LI->getBase());
+        NewLoop->addBasicBlockToLoop(New, *LI);
       }
 
       if (*BB == Header)
@@ -473,7 +472,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
 
   // FIXME: We could register any cloned assumptions instead of clearing the
   // whole function's cache.
-  AT->forgetCachedAssumptions(F);
+  AC->clear();
 
   DominatorTree *DT = nullptr;
   if (PP) {
@@ -534,7 +533,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
     if (OuterL) {
       DataLayoutPass *DLP = PP->getAnalysisIfAvailable<DataLayoutPass>();
       const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-      simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL, AT);
+      simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL, AC);
 
       // LCSSA must be performed on the outermost affected loop. The unrolled
       // loop's last loop latch is guaranteed to be in the outermost loop after
@@ -544,9 +543,32 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
         while (OuterL->getParentLoop() != LatchLoop)
           OuterL = OuterL->getParentLoop();
 
-      formLCSSARecursively(*OuterL, *DT, SE);
+      formLCSSARecursively(*OuterL, *DT, LI, SE);
     }
   }
 
   return true;
 }
+
+/// Given an llvm.loop loop id metadata node, returns the loop hint metadata
+/// node with the given name (for example, "llvm.loop.unroll.count"). If no
+/// such metadata node exists, then nullptr is returned.
+MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
+  // First operand should refer to the loop id itself.
+  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+    MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+    if (!MD)
+      continue;
+
+    MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+    if (!S)
+      continue;
+
+    if (Name.equals(S->getString()))
+      return MD;
+  }
+  return nullptr;
+}
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 3d91336..91b688c 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -23,14 +23,17 @@
 
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include <algorithm>
@@ -55,10 +58,11 @@ STATISTIC(NumRuntimeUnrolled,
 /// - Branch around the original loop if the trip count is less
 ///   than the unroll factor.
 ///
-static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count,
+static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
                           BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
                           BasicBlock *OrigPH, BasicBlock *NewPH,
-                          ValueToValueMapTy &VMap, Pass *P) {
+                          ValueToValueMapTy &VMap, AliasAnalysis *AA,
+                          DominatorTree *DT, LoopInfo *LI, Pass *P) {
   BasicBlock *Latch = L->getLoopLatch();
   assert(Latch && "Loop must have a latch");
 
@@ -105,23 +109,25 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count,
     }
   }
 
-  // Create a branch around the orignal loop, which is taken if the
-  // trip count is less than the unroll factor.
+  // Create a branch around the orignal loop, which is taken if there are no
+  // iterations remaining to be executed after running the prologue.
   Instruction *InsertPt = PrologEnd->getTerminator();
+
+  assert(Count != 0 && "nonsensical Count!");
+
+  // If BECount <u (Count - 1) then (BECount + 1) & (Count - 1) == (BECount + 1)
+  // (since Count is a power of 2).  This means %xtraiter is (BECount + 1) and
+  // and all of the iterations of this loop were executed by the prologue.  Note
+  // that if BECount <u (Count - 1) then (BECount + 1) cannot unsigned-overflow.
   Instruction *BrLoopExit =
-    new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount,
-                 ConstantInt::get(TripCount->getType(), Count));
+    new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, BECount,
+                 ConstantInt::get(BECount->getType(), Count - 1));
   BasicBlock *Exit = L->getUniqueExitBlock();
   assert(Exit && "Loop must have a single exit block only");
   // Split the exit to maintain loop canonicalization guarantees
   SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit));
-  if (!Exit->isLandingPad()) {
-    SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", P);
-  } else {
-    SmallVector<BasicBlock*, 2> NewBBs;
-    SplitLandingPadPredecessors(Exit, Preds, ".unr1-lcssa", ".unr2-lcssa",
-                                P, NewBBs);
-  }
+  SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", AA, DT, LI,
+                         P->mustPreserveAnalysisID(LCSSAID));
   // Add the branch to the exit block (around the unrolled loop)
   BranchInst::Create(Exit, NewPH, BrLoopExit, InsertPt);
   InsertPt->eraseFromParent();
@@ -160,9 +166,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
     NewBlocks.push_back(NewBB);
 
     if (NewLoop)
-      NewLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+      NewLoop->addBasicBlockToLoop(NewBB, *LI);
     else if (ParentLoop)
-      ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+      ParentLoop->addBasicBlockToLoop(NewBB, *LI);
 
     VMap[*BB] = NewBB;
     if (Header == *BB) {
@@ -217,9 +223,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
   }
   if (NewLoop) {
     // Add unroll disable metadata to disable future unrolling for this loop.
-    SmallVector<Value *, 4> Vals;
+    SmallVector<Metadata *, 4> MDs;
     // Reserve first location for self reference to the LoopID metadata node.
-    Vals.push_back(nullptr);
+    MDs.push_back(nullptr);
     MDNode *LoopID = NewLoop->getLoopID();
     if (LoopID) {
       // First remove any existing loop unrolling metadata.
@@ -230,17 +236,18 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
           const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
           IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
         }
-        if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i));
+        if (!IsUnrollMetadata)
+          MDs.push_back(LoopID->getOperand(i));
       }
     }
 
     LLVMContext &Context = NewLoop->getHeader()->getContext();
-    SmallVector<Value *, 1> DisableOperands;
+    SmallVector<Metadata *, 1> DisableOperands;
     DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
-    Vals.push_back(DisableNode);
+    MDs.push_back(DisableNode);
 
-    MDNode *NewLoopID = MDNode::get(Context, Vals);
+    MDNode *NewLoopID = MDNode::get(Context, MDs);
     // Set operand 0 to refer to the loop id itself.
     NewLoopID->replaceOperandWith(0, NewLoopID);
     NewLoop->setLoopID(NewLoopID);
@@ -291,23 +298,28 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
 
   // Only unroll loops with a computable trip count and the trip count needs
   // to be an int value (allowing a pointer type is a TODO item)
-  const SCEV *BECount = SE->getBackedgeTakenCount(L);
-  if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy())
+  const SCEV *BECountSC = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BECountSC) ||
+      !BECountSC->getType()->isIntegerTy())
     return false;
 
-  // If BECount is INT_MAX, we can't compute trip-count without overflow.
-  if (BECount->isAllOnesValue())
-    return false;
+  unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
 
   // Add 1 since the backedge count doesn't include the first loop iteration
   const SCEV *TripCountSC =
-    SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1));
+    SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
   if (isa<SCEVCouldNotCompute>(TripCountSC))
     return false;
 
   // We only handle cases when the unroll factor is a power of 2.
   // Count is the loop unroll factor, the number of extra copies added + 1.
-  if ((Count & (Count-1)) != 0)
+  if (!isPowerOf2_32(Count))
+    return false;
+
+  // This constraint lets us deal with an overflowing trip count easily; see the
+  // comment on ModVal below.  This check is equivalent to `Log2(Count) <
+  // BEWidth`.
+  if (static_cast<uint64_t>(Count) > (1ULL << BEWidth))
     return false;
 
   // If this loop is nested, then the loop unroller changes the code in
@@ -315,13 +327,17 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
   if (Loop *ParentLoop = L->getParentLoop())
     SE->forgetLoop(ParentLoop);
 
+  // Grab analyses that we preserve.
+  auto *DTWP = LPM->getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
   BasicBlock *PH = L->getLoopPreheader();
   BasicBlock *Header = L->getHeader();
   BasicBlock *Latch = L->getLoopLatch();
   // It helps to splits the original preheader twice, one for the end of the
   // prolog code and one for a new loop preheader
-  BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass());
-  BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass());
+  BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI);
+  BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI);
   BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator());
 
   // Compute the number of extra iterations required, which is:
@@ -329,16 +345,23 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
   SCEVExpander Expander(*SE, "loop-unroll");
   Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
                                             PreHeaderBR);
+  Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
+                                          PreHeaderBR);
 
   IRBuilder<> B(PreHeaderBR);
   Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
 
-  // Check if for no extra iterations, then jump to cloned/unrolled loop.
-  // We have to check that the trip count computation didn't overflow when
-  // adding one to the backedge taken count.
-  Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod");
-  Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow");
-  Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or");
+  // If ModVal is zero, we know that either
+  //  1. there are no iteration to be run in the prologue loop
+  // OR
+  //  2. the addition computing TripCount overflowed
+  //
+  // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the
+  // number of iterations that remain to be run in the original loop is a
+  // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
+  // explicitly check this above).
+
+  Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod");
 
   // Branch to either the extra iterations or the cloned/unrolled loop
   // We will fix up the true branch label when adding loop body copies
@@ -361,10 +384,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
   std::vector<BasicBlock *> NewBlocks;
   ValueToValueMapTy VMap;
 
-  // If unroll count is 2 and we can't overflow in tripcount computation (which
-  // is BECount + 1), then we don't need a loop for prologue, and we can unroll
-  // it. We can be sure that we don't overflow only if tripcount is a constant.
-  bool UnrollPrologue = (Count == 2 && isa<ConstantInt>(TripCount));
+  bool UnrollPrologue = Count == 2;
 
   // Clone all the basic blocks in the loop. If Count is 2, we don't clone
   // the loop, otherwise we create a cloned loop to execute the extra
@@ -390,8 +410,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
   // Connect the prolog code to the original loop and update the
   // PHI functions.
   BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
-  ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, VMap,
-                LPM->getAsPass());
+  ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap,
+                /*AliasAnalysis*/ nullptr, DT, LI, LPM->getAsPass());
   NumRuntimeUnrolled++;
   return true;
 }
diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
deleted file mode 100644
index ff89e74..0000000
--- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers the 'expect' intrinsic to LLVM metadata.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "lower-expect-intrinsic"
-
-STATISTIC(IfHandled, "Number of 'expect' intrinsic instructions handled");
-
-static cl::opt<uint32_t>
-LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64),
-                   cl::desc("Weight of the branch likely to be taken (default = 64)"));
-static cl::opt<uint32_t>
-UnlikelyBranchWeight("unlikely-branch-weight", cl::Hidden, cl::init(4),
-                   cl::desc("Weight of the branch unlikely to be taken (default = 4)"));
-
-namespace {
-
-  class LowerExpectIntrinsic : public FunctionPass {
-
-    bool HandleSwitchExpect(SwitchInst *SI);
-
-    bool HandleIfExpect(BranchInst *BI);
-
-  public:
-    static char ID;
-    LowerExpectIntrinsic() : FunctionPass(ID) {
-      initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
-    }
-
-    bool runOnFunction(Function &F) override;
-  };
-}
-
-
-bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) {
-  CallInst *CI = dyn_cast<CallInst>(SI->getCondition());
-  if (!CI)
-    return false;
-
-  Function *Fn = CI->getCalledFunction();
-  if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
-    return false;
-
-  Value *ArgValue = CI->getArgOperand(0);
-  ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
-  if (!ExpectedValue)
-    return false;
-
-  SwitchInst::CaseIt Case = SI->findCaseValue(ExpectedValue);
-  unsigned n = SI->getNumCases(); // +1 for default case.
-  std::vector<uint32_t> Weights(n + 1);
-
-  Weights[0] = Case == SI->case_default() ? LikelyBranchWeight
-                                          : UnlikelyBranchWeight;
-  for (unsigned i = 0; i != n; ++i)
-    Weights[i + 1] = i == Case.getCaseIndex() ? LikelyBranchWeight
-                                              : UnlikelyBranchWeight;
-
-  SI->setMetadata(LLVMContext::MD_prof,
-                  MDBuilder(CI->getContext()).createBranchWeights(Weights));
-
-  SI->setCondition(ArgValue);
-  return true;
-}
-
-
-bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) {
-  if (BI->isUnconditional())
-    return false;
-
-  // Handle non-optimized IR code like:
-  //   %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
-  //   %tobool = icmp ne i64 %expval, 0
-  //   br i1 %tobool, label %if.then, label %if.end
-  //
-  // Or the following simpler case:
-  //   %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
-  //   br i1 %expval, label %if.then, label %if.end
-
-  CallInst *CI;
-
-  ICmpInst *CmpI = dyn_cast<ICmpInst>(BI->getCondition());
-  if (!CmpI) {
-    CI = dyn_cast<CallInst>(BI->getCondition());
-  } else {
-    if (CmpI->getPredicate() != CmpInst::ICMP_NE)
-      return false;
-    CI = dyn_cast<CallInst>(CmpI->getOperand(0));
-  }
-
-  if (!CI)
-    return false;
-
-  Function *Fn = CI->getCalledFunction();
-  if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
-    return false;
-
-  Value *ArgValue = CI->getArgOperand(0);
-  ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
-  if (!ExpectedValue)
-    return false;
-
-  MDBuilder MDB(CI->getContext());
-  MDNode *Node;
-
-  // If expect value is equal to 1 it means that we are more likely to take
-  // branch 0, in other case more likely is branch 1.
-  if (ExpectedValue->isOne())
-    Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
-  else
-    Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
-
-  BI->setMetadata(LLVMContext::MD_prof, Node);
-
-  if (CmpI)
-    CmpI->setOperand(0, ArgValue);
-  else
-    BI->setCondition(ArgValue);
-  return true;
-}
-
-
-bool LowerExpectIntrinsic::runOnFunction(Function &F) {
-  for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
-    BasicBlock *BB = I++;
-
-    // Create "block_weights" metadata.
-    if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
-      if (HandleIfExpect(BI))
-        IfHandled++;
-    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
-      if (HandleSwitchExpect(SI))
-        IfHandled++;
-    }
-
-    // remove llvm.expect intrinsics.
-    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
-         BI != BE; ) {
-      CallInst *CI = dyn_cast<CallInst>(BI++);
-      if (!CI)
-        continue;
-
-      Function *Fn = CI->getCalledFunction();
-      if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
-        Value *Exp = CI->getArgOperand(0);
-        CI->replaceAllUsesWith(Exp);
-        CI->eraseFromParent();
-      }
-    }
-  }
-
-  return false;
-}
-
-
-char LowerExpectIntrinsic::ID = 0;
-INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect", "Lower 'expect' "
-                "Intrinsics", false, false)
-
-FunctionPass *llvm::createLowerExpectIntrinsicPass() {
-  return new LowerExpectIntrinsic();
-}
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index a0105c2..b3bdae4 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -32,6 +32,23 @@ using namespace llvm;
 #define DEBUG_TYPE "lower-switch"
 
 namespace {
+  struct IntRange {
+    int64_t Low, High;
+  };
+  // Return true iff R is covered by Ranges.
+  static bool IsInRanges(const IntRange &R,
+                         const std::vector<IntRange> &Ranges) {
+    // Note: Ranges must be sorted, non-overlapping and non-adjacent.
+
+    // Find the first range whose High field is >= R.High,
+    // then check if the Low field is <= R.Low. If so, we
+    // have a Range that covers R.
+    auto I = std::lower_bound(
+        Ranges.begin(), Ranges.end(), R,
+        [](const IntRange &A, const IntRange &B) { return A.High < B.High; });
+    return I != Ranges.end() && I->Low <= R.Low;
+  }
+
   /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch
   /// instructions.
   class LowerSwitch : public FunctionPass {
@@ -46,18 +63,16 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       // This is a cluster of orthogonal Transforms
       AU.addPreserved<UnifyFunctionExitNodes>();
-      AU.addPreserved("mem2reg");
       AU.addPreservedID(LowerInvokePassID);
     }
 
     struct CaseRange {
-      Constant* Low;
-      Constant* High;
+      ConstantInt* Low;
+      ConstantInt* High;
       BasicBlock* BB;
 
-      CaseRange(Constant *low = nullptr, Constant *high = nullptr,
-                BasicBlock *bb = nullptr) :
-        Low(low), High(high), BB(bb) { }
+      CaseRange(ConstantInt *low, ConstantInt *high, BasicBlock *bb)
+          : Low(low), High(high), BB(bb) {}
     };
 
     typedef std::vector<CaseRange> CaseVector;
@@ -68,7 +83,8 @@ namespace {
     BasicBlock *switchConvert(CaseItr Begin, CaseItr End,
                               ConstantInt *LowerBound, ConstantInt *UpperBound,
                               Value *Val, BasicBlock *Predecessor,
-                              BasicBlock *OrigBlock, BasicBlock *Default);
+                              BasicBlock *OrigBlock, BasicBlock *Default,
+                              const std::vector<IntRange> &UnreachableRanges);
     BasicBlock *newLeafBlock(CaseRange &Leaf, Value *Val, BasicBlock *OrigBlock,
                              BasicBlock *Default);
     unsigned Clusterify(CaseVector &Cases, SwitchInst *SI);
@@ -131,25 +147,39 @@ static raw_ostream& operator<<(raw_ostream &O,
   return O << "]";
 }
 
-/// \brief Update the first occurrence of the "switch statement" BB in the PHI
-/// node with the "new" BB. The other occurrences will be updated by subsequent
-/// calls to this function.
-///
-/// Switch statements may have more than one incoming edge into the same BB if
-/// they all have the same value. When the switch statement is converted these
-/// incoming edges are now coming from multiple BBs.
-static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB) {
-  for (BasicBlock::iterator I = SuccBB->begin(), E = SuccBB->getFirstNonPHI();
-       I != E; ++I) {
+// \brief Update the first occurrence of the "switch statement" BB in the PHI
+// node with the "new" BB. The other occurrences will:
+//
+// 1) Be updated by subsequent calls to this function.  Switch statements may
+// have more than one outcoming edge into the same BB if they all have the same
+// value. When the switch statement is converted these incoming edges are now
+// coming from multiple BBs.
+// 2) Removed if subsequent incoming values now share the same case, i.e.,
+// multiple outcome edges are condensed into one. This is necessary to keep the
+// number of phi values equal to the number of branches to SuccBB.
+static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB,
+                    unsigned NumMergedCases) {
+  for (BasicBlock::iterator I = SuccBB->begin(), IE = SuccBB->getFirstNonPHI();
+       I != IE; ++I) {
     PHINode *PN = cast<PHINode>(I);
 
     // Only update the first occurence.
-    for (unsigned Idx = 0, E = PN->getNumIncomingValues(); Idx != E; ++Idx) {
+    unsigned Idx = 0, E = PN->getNumIncomingValues();
+    unsigned LocalNumMergedCases = NumMergedCases;
+    for (; Idx != E; ++Idx) {
       if (PN->getIncomingBlock(Idx) == OrigBB) {
         PN->setIncomingBlock(Idx, NewBB);
         break;
       }
     }
+
+    // Remove additional occurences coming from condensed cases and keep the
+    // number of incoming values equal to the number of branches to SuccBB.
+    for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx)
+      if (PN->getIncomingBlock(Idx) == OrigBB) {
+        PN->removeIncomingValue(Idx);
+        LocalNumMergedCases--;
+      }
   }
 }
 
@@ -158,12 +188,12 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB) {
 // LowerBound and UpperBound are used to keep track of the bounds for Val
 // that have already been checked by a block emitted by one of the previous
 // calls to switchConvert in the call stack.
-BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
-                                       ConstantInt *LowerBound,
-                                       ConstantInt *UpperBound, Value *Val,
-                                       BasicBlock *Predecessor,
-                                       BasicBlock *OrigBlock,
-                                       BasicBlock *Default) {
+BasicBlock *
+LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
+                           ConstantInt *UpperBound, Value *Val,
+                           BasicBlock *Predecessor, BasicBlock *OrigBlock,
+                           BasicBlock *Default,
+                           const std::vector<IntRange> &UnreachableRanges) {
   unsigned Size = End - Begin;
 
   if (Size == 1) {
@@ -172,7 +202,11 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
     // emitting the code that checks if the value actually falls in the range
     // because the bounds already tell us so.
     if (Begin->Low == LowerBound && Begin->High == UpperBound) {
-      fixPhis(Begin->BB, OrigBlock, Predecessor);
+      unsigned NumMergedCases = 0;
+      if (LowerBound && UpperBound)
+        NumMergedCases =
+            UpperBound->getSExtValue() - LowerBound->getSExtValue();
+      fixPhis(Begin->BB, OrigBlock, Predecessor, NumMergedCases);
       return Begin->BB;
     }
     return newLeafBlock(*Begin, Val, OrigBlock, Default);
@@ -186,32 +220,32 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
 
   CaseRange &Pivot = *(Begin + Mid);
   DEBUG(dbgs() << "Pivot ==> "
-               << cast<ConstantInt>(Pivot.Low)->getValue()
-               << " -" << cast<ConstantInt>(Pivot.High)->getValue() << "\n");
+               << Pivot.Low->getValue()
+               << " -" << Pivot.High->getValue() << "\n");
 
   // NewLowerBound here should never be the integer minimal value.
   // This is because it is computed from a case range that is never
   // the smallest, so there is always a case range that has at least
   // a smaller value.
-  ConstantInt *NewLowerBound = cast<ConstantInt>(Pivot.Low);
-  ConstantInt *NewUpperBound;
-
-  // If we don't have a Default block then it means that we can never
-  // have a value outside of a case range, so set the UpperBound to the highest
-  // value in the LHS part of the case ranges.
-  if (Default != nullptr) {
-    // Because NewLowerBound is never the smallest representable integer
-    // it is safe here to subtract one.
-    NewUpperBound = ConstantInt::get(NewLowerBound->getContext(),
-                                     NewLowerBound->getValue() - 1);
-  } else {
-    CaseItr LastLHS = LHS.begin() + LHS.size() - 1;
-    NewUpperBound = cast<ConstantInt>(LastLHS->High);
+  ConstantInt *NewLowerBound = Pivot.Low;
+
+  // Because NewLowerBound is never the smallest representable integer
+  // it is safe here to subtract one.
+  ConstantInt *NewUpperBound = ConstantInt::get(NewLowerBound->getContext(),
+                                                NewLowerBound->getValue() - 1);
+
+  if (!UnreachableRanges.empty()) {
+    // Check if the gap between LHS's highest and NewLowerBound is unreachable.
+    int64_t GapLow = LHS.back().High->getSExtValue() + 1;
+    int64_t GapHigh = NewLowerBound->getSExtValue() - 1;
+    IntRange Gap = { GapLow, GapHigh };
+    if (GapHigh >= GapLow && IsInRanges(Gap, UnreachableRanges))
+      NewUpperBound = LHS.back().High;
   }
 
   DEBUG(dbgs() << "LHS Bounds ==> ";
         if (LowerBound) {
-          dbgs() << cast<ConstantInt>(LowerBound)->getSExtValue();
+          dbgs() << LowerBound->getSExtValue();
         } else {
           dbgs() << "NONE";
         }
@@ -219,7 +253,7 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
         dbgs() << "RHS Bounds ==> ";
         dbgs() << NewLowerBound->getSExtValue() << " - ";
         if (UpperBound) {
-          dbgs() << cast<ConstantInt>(UpperBound)->getSExtValue() << "\n";
+          dbgs() << UpperBound->getSExtValue() << "\n";
         } else {
           dbgs() << "NONE\n";
         });
@@ -234,10 +268,10 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
 
   BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound,
                                       NewUpperBound, Val, NewNode, OrigBlock,
-                                      Default);
+                                      Default, UnreachableRanges);
   BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound,
                                       UpperBound, Val, NewNode, OrigBlock,
-                                      Default);
+                                      Default, UnreachableRanges);
 
   Function::iterator FI = OrigBlock;
   F->getBasicBlockList().insert(++FI, NewNode);
@@ -270,11 +304,11 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
                         Leaf.Low, "SwitchLeaf");
   } else {
     // Make range comparison
-    if (cast<ConstantInt>(Leaf.Low)->isMinValue(true /*isSigned*/)) {
+    if (Leaf.Low->isMinValue(true /*isSigned*/)) {
       // Val >= Min && Val <= Hi --> Val <= Hi
       Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High,
                           "SwitchLeaf");
-    } else if (cast<ConstantInt>(Leaf.Low)->isZero()) {
+    } else if (Leaf.Low->isZero()) {
       // Val >= 0 && Val <= Hi --> Val <=u Hi
       Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High,
                           "SwitchLeaf");      
@@ -299,8 +333,8 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
   for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
     PHINode* PN = cast<PHINode>(I);
     // Remove all but one incoming entries from the cluster
-    uint64_t Range = cast<ConstantInt>(Leaf.High)->getSExtValue() -
-                     cast<ConstantInt>(Leaf.Low)->getSExtValue();    
+    uint64_t Range = Leaf.High->getSExtValue() -
+                     Leaf.Low->getSExtValue();
     for (uint64_t j = 0; j < Range; ++j) {
       PN->removeIncomingValue(OrigBlock);
     }
@@ -328,8 +362,8 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
   if (Cases.size()>=2)
     for (CaseItr I = Cases.begin(), J = std::next(Cases.begin());
          J != Cases.end();) {
-      int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue();
-      int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue();
+      int64_t nextValue = J->Low->getSExtValue();
+      int64_t currentValue = I->High->getSExtValue();
       BasicBlock* nextBB = J->BB;
       BasicBlock* currentBB = I->BB;
 
@@ -362,26 +396,102 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) {
   Value *Val = SI->getCondition();  // The value we are switching on...
   BasicBlock* Default = SI->getDefaultDest();
 
-  // If there is only the default destination, don't bother with the code below.
+  // If there is only the default destination, just branch.
   if (!SI->getNumCases()) {
-    BranchInst::Create(SI->getDefaultDest(), CurBlock);
-    CurBlock->getInstList().erase(SI);
+    BranchInst::Create(Default, CurBlock);
+    SI->eraseFromParent();
     return;
   }
 
-  const bool DefaultIsUnreachable =
-      Default->size() == 1 && isa<UnreachableInst>(Default->getTerminator());
+  // Prepare cases vector.
+  CaseVector Cases;
+  unsigned numCmps = Clusterify(Cases, SI);
+  DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
+               << ". Total compares: " << numCmps << "\n");
+  DEBUG(dbgs() << "Cases: " << Cases << "\n");
+  (void)numCmps;
+
+  ConstantInt *LowerBound = nullptr;
+  ConstantInt *UpperBound = nullptr;
+  std::vector<IntRange> UnreachableRanges;
+
+  if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) {
+    // Make the bounds tightly fitted around the case value range, becase we
+    // know that the value passed to the switch must be exactly one of the case
+    // values.
+    assert(!Cases.empty());
+    LowerBound = Cases.front().Low;
+    UpperBound = Cases.back().High;
+
+    DenseMap<BasicBlock *, unsigned> Popularity;
+    unsigned MaxPop = 0;
+    BasicBlock *PopSucc = nullptr;
+
+    IntRange R = { INT64_MIN, INT64_MAX };
+    UnreachableRanges.push_back(R);
+    for (const auto &I : Cases) {
+      int64_t Low = I.Low->getSExtValue();
+      int64_t High = I.High->getSExtValue();
+
+      IntRange &LastRange = UnreachableRanges.back();
+      if (LastRange.Low == Low) {
+        // There is nothing left of the previous range.
+        UnreachableRanges.pop_back();
+      } else {
+        // Terminate the previous range.
+        assert(Low > LastRange.Low);
+        LastRange.High = Low - 1;
+      }
+      if (High != INT64_MAX) {
+        IntRange R = { High + 1, INT64_MAX };
+        UnreachableRanges.push_back(R);
+      }
+
+      // Count popularity.
+      int64_t N = High - Low + 1;
+      unsigned &Pop = Popularity[I.BB];
+      if ((Pop += N) > MaxPop) {
+        MaxPop = Pop;
+        PopSucc = I.BB;
+      }
+    }
+#ifndef NDEBUG
+    /* UnreachableRanges should be sorted and the ranges non-adjacent. */
+    for (auto I = UnreachableRanges.begin(), E = UnreachableRanges.end();
+         I != E; ++I) {
+      assert(I->Low <= I->High);
+      auto Next = I + 1;
+      if (Next != E) {
+        assert(Next->Low > I->High);
+      }
+    }
+#endif
+
+    // Use the most popular block as the new default, reducing the number of
+    // cases.
+    assert(MaxPop > 0 && PopSucc);
+    Default = PopSucc;
+    for (CaseItr I = Cases.begin(); I != Cases.end();) {
+      if (I->BB == PopSucc)
+        I = Cases.erase(I);
+      else
+        ++I;
+    }
+
+    // If there are no cases left, just branch.
+    if (Cases.empty()) {
+      BranchInst::Create(Default, CurBlock);
+      SI->eraseFromParent();
+      return;
+    }
+  }
+
   // Create a new, empty default block so that the new hierarchy of
   // if-then statements go to this and the PHI nodes are happy.
-  // if the default block is set as an unreachable we avoid creating one
-  // because will never be a valid target.
-  BasicBlock *NewDefault = nullptr;
-  if (!DefaultIsUnreachable) {
-    NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
-    F->getBasicBlockList().insert(Default, NewDefault);
-
-    BranchInst::Create(Default, NewDefault);
-  }
+  BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
+  F->getBasicBlockList().insert(Default, NewDefault);
+  BranchInst::Create(Default, NewDefault);
+
   // If there is an entry in any PHI nodes for the default edge, make sure
   // to update them as well.
   for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) {
@@ -391,40 +501,18 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) {
     PN->setIncomingBlock((unsigned)BlockIdx, NewDefault);
   }
 
-  // Prepare cases vector.
-  CaseVector Cases;
-  unsigned numCmps = Clusterify(Cases, SI);
-
-  DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
-               << ". Total compares: " << numCmps << "\n");
-  DEBUG(dbgs() << "Cases: " << Cases << "\n");
-  (void)numCmps;
-  
-  ConstantInt *UpperBound = nullptr;
-  ConstantInt *LowerBound = nullptr;
-
-  // Optimize the condition where Default is an unreachable block. In this case
-  // we can make the bounds tightly fitted around the case value ranges,
-  // because we know that the value passed to the switch should always be
-  // exactly one of the case values.
-  if (DefaultIsUnreachable) {
-    CaseItr LastCase = Cases.begin() + Cases.size() - 1;
-    UpperBound = cast<ConstantInt>(LastCase->High);
-    LowerBound = cast<ConstantInt>(Cases.begin()->Low);
-  }
   BasicBlock *SwitchBlock =
       switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val,
-                    OrigBlock, OrigBlock, NewDefault);
+                    OrigBlock, OrigBlock, NewDefault, UnreachableRanges);
 
   // Branch to our shiny new if-then stuff...
   BranchInst::Create(SwitchBlock, OrigBlock);
 
   // We are now done with the switch instruction, delete it.
+  BasicBlock *OldDefault = SI->getDefaultDest();
   CurBlock->getInstList().erase(SI);
 
-  pred_iterator PI = pred_begin(Default), E = pred_end(Default);
-  // If the Default block has no more predecessors just remove it
-  if (PI == E) {
-    DeleteDeadBlock(Default);
-  }
+  // If the Default block has no more predecessors just remove it.
+  if (pred_begin(OldDefault) == pred_end(OldDefault))
+    DeleteDeadBlock(OldDefault);
 }
diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
index 477ee7a..00cf4e6 100644
--- a/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/lib/Transforms/Utils/Mem2Reg.cpp
@@ -14,7 +14,7 @@
 
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
@@ -39,7 +39,7 @@ namespace {
     bool runOnFunction(Function &F) override;
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<AssumptionTracker>();
+      AU.addRequired<AssumptionCacheTracker>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.setPreservesCFG();
       // This is a cluster of orthogonal Transforms
@@ -53,7 +53,7 @@ namespace {
 char PromotePass::ID = 0;
 INITIALIZE_PASS_BEGIN(PromotePass, "mem2reg", "Promote Memory to Register",
                 false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(PromotePass, "mem2reg", "Promote Memory to Register",
                 false, false)
@@ -66,7 +66,8 @@ bool PromotePass::runOnFunction(Function &F) {
   bool Changed  = false;
 
   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+  AssumptionCache &AC =
+      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
 
   while (1) {
     Allocas.clear();
@@ -80,7 +81,7 @@ bool PromotePass::runOnFunction(Function &F) {
 
     if (Allocas.empty()) break;
 
-    PromoteMemToReg(Allocas, DT, nullptr, AT);
+    PromoteMemToReg(Allocas, DT, nullptr, &AC);
     NumPromoted += Allocas.size();
     Changed = true;
   }
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 1fd7071..dabadb7 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -239,7 +239,7 @@ struct PromoteMem2Reg {
   AliasSetTracker *AST;
 
   /// A cache of @llvm.assume intrinsics used by SimplifyInstruction.
-  AssumptionTracker *AT;
+  AssumptionCache *AC;
 
   /// Reverse mapping of Allocas.
   DenseMap<AllocaInst *, unsigned> AllocaLookup;
@@ -282,9 +282,10 @@ struct PromoteMem2Reg {
 
 public:
   PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                 AliasSetTracker *AST, AssumptionTracker *AT)
+                 AliasSetTracker *AST, AssumptionCache *AC)
       : Allocas(Allocas.begin(), Allocas.end()), DT(DT),
-        DIB(*DT.getRoot()->getParent()->getParent()), AST(AST), AT(AT) {}
+        DIB(*DT.getRoot()->getParent()->getParent(), /*AllowUnresolved*/ false),
+        AST(AST), AC(AC) {}
 
   void run();
 
@@ -415,7 +416,8 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
   // Record debuginfo for the store and remove the declaration's
   // debuginfo.
   if (DbgDeclareInst *DDI = Info.DbgDeclare) {
-    DIBuilder DIB(*AI->getParent()->getParent()->getParent());
+    DIBuilder DIB(*AI->getParent()->getParent()->getParent(),
+                  /*AllowUnresolved*/ false);
     ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB);
     DDI->eraseFromParent();
     LBI.deleteValue(DDI);
@@ -498,7 +500,8 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
     StoreInst *SI = cast<StoreInst>(AI->user_back());
     // Record debuginfo for the store before removing it.
     if (DbgDeclareInst *DDI = Info.DbgDeclare) {
-      DIBuilder DIB(*AI->getParent()->getParent()->getParent());
+      DIBuilder DIB(*AI->getParent()->getParent()->getParent(),
+                    /*AllowUnresolved*/ false);
       ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
     }
     SI->eraseFromParent();
@@ -688,7 +691,7 @@ void PromoteMem2Reg::run() {
       PHINode *PN = I->second;
 
       // If this PHI node merges one value and/or undefs, get the value.
-      if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, &DT, AT)) {
+      if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, &DT, AC)) {
         if (AST && PN->getType()->isPointerTy())
           AST->deleteValue(PN);
         PN->replaceAllUsesWith(V);
@@ -1068,10 +1071,10 @@ NextIteration:
 }
 
 void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                           AliasSetTracker *AST, AssumptionTracker *AT) {
+                           AliasSetTracker *AST, AssumptionCache *AC) {
   // If there is nothing to do, bail out...
   if (Allocas.empty())
     return;
 
-  PromoteMem2Reg(Allocas, DT, AST, AT).run();
+  PromoteMem2Reg(Allocas, DT, AST, AC).run();
 }
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index 3fcb789..c057b06 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -150,8 +150,8 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
                                          ProtoName, &BB->front());
 
   // Fill in all the predecessors of the PHI.
-  for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
-    InsertedPHI->addIncoming(PredValues[i].second, PredValues[i].first);
+  for (const auto &PredValue : PredValues)
+    InsertedPHI->addIncoming(PredValue.second, PredValue.first);
 
   // See if the PHI node can be merged to a single value.  This can happen in
   // loop cases when we get a PHI of itself and one other value.
@@ -245,8 +245,7 @@ public:
     // but it is relatively slow.  If we already have PHI nodes in this
     // block, walk one of them to get the predecessor list instead.
     if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) {
-      for (unsigned PI = 0, E = SomePhi->getNumIncomingValues(); PI != E; ++PI)
-        Preds->push_back(SomePhi->getIncomingBlock(PI));
+      Preds->append(SomePhi->block_begin(), SomePhi->block_end());
     } else {
       for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
         Preds->push_back(*PI);
@@ -344,20 +343,17 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
   // This is important because we have to handle multiple defs/uses in a block
   // ourselves: SSAUpdater is purely for cross-block references.
   DenseMap<BasicBlock*, TinyPtrVector<Instruction*> > UsesByBlock;
-  
-  for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
-    Instruction *User = Insts[i];
+
+  for (Instruction *User : Insts)
     UsesByBlock[User->getParent()].push_back(User);
-  }
   
   // Okay, now we can iterate over all the blocks in the function with uses,
   // processing them.  Keep track of which loads are loading a live-in value.
   // Walk the uses in the use-list order to be determinstic.
   SmallVector<LoadInst*, 32> LiveInLoads;
   DenseMap<Value*, Value*> ReplacedLoads;
-  
-  for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
-    Instruction *User = Insts[i];
+
+  for (Instruction *User : Insts) {
     BasicBlock *BB = User->getParent();
     TinyPtrVector<Instruction*> &BlockUses = UsesByBlock[BB];
     
@@ -380,8 +376,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
     
     // Otherwise, check to see if this block is all loads.
     bool HasStore = false;
-    for (unsigned i = 0, e = BlockUses.size(); i != e; ++i) {
-      if (isa<StoreInst>(BlockUses[i])) {
+    for (Instruction *I : BlockUses) {
+      if (isa<StoreInst>(I)) {
         HasStore = true;
         break;
       }
@@ -391,8 +387,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
     // efficient way to tell which on is first in the block and don't want to
     // scan large blocks, so just add all loads as live ins.
     if (!HasStore) {
-      for (unsigned i = 0, e = BlockUses.size(); i != e; ++i)
-        LiveInLoads.push_back(cast<LoadInst>(BlockUses[i]));
+      for (Instruction *I : BlockUses)
+        LiveInLoads.push_back(cast<LoadInst>(I));
       BlockUses.clear();
       continue;
     }
@@ -403,8 +399,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
     // block is a load, then it uses the live in value.  The last store defines
     // the live out value.  We handle this by doing a linear scan of the block.
     Value *StoredValue = nullptr;
-    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
-      if (LoadInst *L = dyn_cast<LoadInst>(II)) {
+    for (Instruction &I : *BB) {
+      if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
         // If this is a load from an unrelated pointer, ignore it.
         if (!isInstInList(L, Insts)) continue;
         
@@ -419,8 +415,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
         }
         continue;
       }
-      
-      if (StoreInst *SI = dyn_cast<StoreInst>(II)) {
+
+      if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
         // If this is a store to an unrelated pointer, ignore it.
         if (!isInstInList(SI, Insts)) continue;
         updateDebugInfo(SI);
@@ -438,8 +434,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
   
   // Okay, now we rewrite all loads that use live-in values in the loop,
   // inserting PHI nodes as necessary.
-  for (unsigned i = 0, e = LiveInLoads.size(); i != e; ++i) {
-    LoadInst *ALoad = LiveInLoads[i];
+  for (LoadInst *ALoad : LiveInLoads) {
     Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent());
     replaceLoadWithValue(ALoad, NewVal);
 
@@ -454,9 +449,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
   
   // Now that everything is rewritten, delete the old instructions from the
   // function.  They should all be dead now.
-  for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
-    Instruction *User = Insts[i];
-    
+  for (Instruction *User : Insts) {
     // If this is a load that still has uses, then the load must have been added
     // as a live value in the SSAUpdate data structure for a block (e.g. because
     // the loaded value was stored later).  In this case, we need to recursively
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 92fd56a..3248a83 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -53,9 +53,13 @@ using namespace PatternMatch;
 
 #define DEBUG_TYPE "simplifycfg"
 
+// Chosen as 2 so as to be cheap, but still to have enough power to fold
+// a select, so the "clamp" idiom (of a min followed by a max) will be caught.
+// To catch this, we need to fold a compare and a select, hence '2' being the
+// minimum reasonable default.
 static cl::opt<unsigned>
-PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(1),
-   cl::desc("Control the amount of phi node folding to perform (default = 1)"));
+PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(2),
+   cl::desc("Control the amount of phi node folding to perform (default = 2)"));
 
 static cl::opt<bool>
 DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false),
@@ -73,6 +77,7 @@ STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping");
 STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
 STATISTIC(NumLookupTablesHoles, "Number of switch instructions turned into lookup tables (holes checked)");
+STATISTIC(NumTableCmpReuses, "Number of reused switch table lookup compares");
 STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block");
 STATISTIC(NumSpeculations, "Number of speculative executed instructions");
 
@@ -107,7 +112,7 @@ class SimplifyCFGOpt {
   const TargetTransformInfo &TTI;
   unsigned BonusInstThreshold;
   const DataLayout *const DL;
-  AssumptionTracker *AT;
+  AssumptionCache *AC;
   Value *isValueEqualityComparison(TerminatorInst *TI);
   BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
                                std::vector<ValueEqualityComparisonCase> &Cases);
@@ -127,8 +132,8 @@ class SimplifyCFGOpt {
 
 public:
   SimplifyCFGOpt(const TargetTransformInfo &TTI, unsigned BonusInstThreshold,
-                 const DataLayout *DL, AssumptionTracker *AT)
-      : TTI(TTI), BonusInstThreshold(BonusInstThreshold), DL(DL), AT(AT) {}
+                 const DataLayout *DL, AssumptionCache *AC)
+      : TTI(TTI), BonusInstThreshold(BonusInstThreshold), DL(DL), AC(AC) {}
   bool run(BasicBlock *BB);
 };
 }
@@ -215,45 +220,15 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
 }
 
 /// ComputeSpeculationCost - Compute an abstract "cost" of speculating the
-/// given instruction, which is assumed to be safe to speculate. 1 means
-/// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive.
-static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL) {
+/// given instruction, which is assumed to be safe to speculate. TCC_Free means
+/// cheap, TCC_Basic means less cheap, and TCC_Expensive means prohibitively
+/// expensive.
+static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL,
+                                       const TargetTransformInfo &TTI) {
   assert(isSafeToSpeculativelyExecute(I, DL) &&
          "Instruction is not safe to speculatively execute!");
-  switch (Operator::getOpcode(I)) {
-  default:
-    // In doubt, be conservative.
-    return UINT_MAX;
-  case Instruction::GetElementPtr:
-    // GEPs are cheap if all indices are constant.
-    if (!cast<GEPOperator>(I)->hasAllConstantIndices())
-      return UINT_MAX;
-    return 1;
-  case Instruction::ExtractValue:
-  case Instruction::Load:
-  case Instruction::Add:
-  case Instruction::Sub:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::ICmp:
-  case Instruction::Trunc:
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::BitCast:
-  case Instruction::ExtractElement:
-  case Instruction::InsertElement:
-    return 1; // These are all cheap.
-
-  case Instruction::Call:
-  case Instruction::Select:
-    return 2;
-  }
+  return TTI.getUserCost(I);
 }
-
 /// DominatesMergePoint - If we have a merge point of an "if condition" as
 /// accepted above, return true if the specified value dominates the block.  We
 /// don't handle the true generality of domination here, just a special case
@@ -274,7 +249,8 @@ static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL) {
 static bool DominatesMergePoint(Value *V, BasicBlock *BB,
                                 SmallPtrSetImpl<Instruction*> *AggressiveInsts,
                                 unsigned &CostRemaining,
-                                const DataLayout *DL) {
+                                const DataLayout *DL,
+                                const TargetTransformInfo &TTI) {
   Instruction *I = dyn_cast<Instruction>(V);
   if (!I) {
     // Non-instructions all dominate instructions, but not all constantexprs
@@ -310,7 +286,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
   if (!isSafeToSpeculativelyExecute(I, DL))
     return false;
 
-  unsigned Cost = ComputeSpeculationCost(I, DL);
+  unsigned Cost = ComputeSpeculationCost(I, DL, TTI);
 
   if (Cost > CostRemaining)
     return false;
@@ -320,7 +296,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
   // Okay, we can only really hoist these out if their operands do
   // not take us over the cost threshold.
   for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
-    if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, DL))
+    if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, DL, TTI))
       return false;
   // Okay, it's safe to do this!  Remember this instruction.
   AggressiveInsts->insert(I);
@@ -383,10 +359,9 @@ struct ConstantComparesGatherer {
   }
 
   /// Prevent copy
-  ConstantComparesGatherer(const ConstantComparesGatherer &)
-      LLVM_DELETED_FUNCTION;
+  ConstantComparesGatherer(const ConstantComparesGatherer &) = delete;
   ConstantComparesGatherer &
-  operator=(const ConstantComparesGatherer &) LLVM_DELETED_FUNCTION;
+  operator=(const ConstantComparesGatherer &) = delete;
 
 private:
 
@@ -712,8 +687,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
     if (HasWeight)
       for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
            ++MD_i) {
-        ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i));
-        assert(CI);
+        ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(MD_i));
         Weights.push_back(CI->getValue().getZExtValue());
       }
     for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
@@ -818,7 +792,7 @@ static void GetBranchWeights(TerminatorInst *TI,
   MDNode *MD = TI->getMetadata(LLVMContext::MD_prof);
   assert(MD);
   for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
-    ConstantInt *CI = cast<ConstantInt>(MD->getOperand(i));
+    ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(i));
     Weights.push_back(CI->getValue().getZExtValue());
   }
 
@@ -1079,7 +1053,8 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I);
 /// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and
 /// BB2, hoist any common code in the two blocks up into the branch block.  The
 /// caller of this function guarantees that BI's block dominates BB1 and BB2.
-static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL) {
+static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL,
+                                  const TargetTransformInfo &TTI) {
   // This does very trivial matching, with limited scanning, to find identical
   // instructions in the two blocks.  In particular, we don't want to get into
   // O(M*N) situations here where M and N are the sizes of BB1 and BB2.  As
@@ -1114,6 +1089,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL) {
     if (isa<TerminatorInst>(I1))
       goto HoistTerminator;
 
+    if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))
+      return Changed;
+
     // For a normal instruction, we just move one to right before the branch,
     // then replace all uses of the other with the first.  Finally, we remove
     // the now redundant second instruction.
@@ -1244,14 +1222,13 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
     return false;
 
   // Gather the PHI nodes in BBEnd.
-  std::map<Value*, std::pair<Value*, PHINode*> > MapValueFromBB1ToBB2;
+  SmallDenseMap<std::pair<Value *, Value *>, PHINode *> JointValueMap;
   Instruction *FirstNonPhiInBBEnd = nullptr;
-  for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end();
-       I != E; ++I) {
+  for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end(); I != E; ++I) {
     if (PHINode *PN = dyn_cast<PHINode>(I)) {
       Value *BB1V = PN->getIncomingValueForBlock(BB1);
       Value *BB2V = PN->getIncomingValueForBlock(BB2);
-      MapValueFromBB1ToBB2[BB1V] = std::make_pair(BB2V, PN);
+      JointValueMap[std::make_pair(BB1V, BB2V)] = PN;
     } else {
       FirstNonPhiInBBEnd = &*I;
       break;
@@ -1260,13 +1237,13 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
   if (!FirstNonPhiInBBEnd)
     return false;
 
-
   // This does very trivial matching, with limited scanning, to find identical
   // instructions in the two blocks.  We scan backward for obviously identical
   // instructions in an identical order.
   BasicBlock::InstListType::reverse_iterator RI1 = BB1->getInstList().rbegin(),
-      RE1 = BB1->getInstList().rend(), RI2 = BB2->getInstList().rbegin(),
-      RE2 = BB2->getInstList().rend();
+                                             RE1 = BB1->getInstList().rend(),
+                                             RI2 = BB2->getInstList().rbegin(),
+                                             RE2 = BB2->getInstList().rend();
   // Skip debug info.
   while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1;
   if (RI1 == RE1)
@@ -1289,6 +1266,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
       return Changed;
 
     Instruction *I1 = &*RI1, *I2 = &*RI2;
+    auto InstPair = std::make_pair(I1, I2);
     // I1 and I2 should have a single use in the same PHI node, and they
     // perform the same operation.
     // Cannot move control-flow-involving, volatile loads, vaarg, etc.
@@ -1299,11 +1277,11 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
         I1->mayHaveSideEffects() || I2->mayHaveSideEffects() ||
         I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() ||
         !I1->hasOneUse() || !I2->hasOneUse() ||
-        MapValueFromBB1ToBB2.find(I1) == MapValueFromBB1ToBB2.end() ||
-        MapValueFromBB1ToBB2[I1].first != I2)
+        !JointValueMap.count(InstPair))
       return Changed;
 
     // Check whether we should swap the operands of ICmpInst.
+    // TODO: Add support of communativity.
     ICmpInst *ICmp1 = dyn_cast<ICmpInst>(I1), *ICmp2 = dyn_cast<ICmpInst>(I2);
     bool SwapOpnds = false;
     if (ICmp1 && ICmp2 &&
@@ -1324,16 +1302,13 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
     // with a PHI node after sinking. We only handle the case where there is
     // a single pair of different operands.
     Value *DifferentOp1 = nullptr, *DifferentOp2 = nullptr;
-    unsigned Op1Idx = 0;
+    unsigned Op1Idx = ~0U;
     for (unsigned I = 0, E = I1->getNumOperands(); I != E; ++I) {
       if (I1->getOperand(I) == I2->getOperand(I))
         continue;
-      // Early exit if we have more-than one pair of different operands or
-      // the different operand is already in MapValueFromBB1ToBB2.
-      // Early exit if we need a PHI node to replace a constant.
-      if (DifferentOp1 ||
-          MapValueFromBB1ToBB2.find(I1->getOperand(I)) !=
-          MapValueFromBB1ToBB2.end() ||
+      // Early exit if we have more-than one pair of different operands or if
+      // we need a PHI node to replace a constant.
+      if (Op1Idx != ~0U ||
           isa<Constant>(I1->getOperand(I)) ||
           isa<Constant>(I2->getOperand(I))) {
         // If we can't sink the instructions, undo the swapping.
@@ -1346,24 +1321,27 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
       DifferentOp2 = I2->getOperand(I);
     }
 
-    // We insert the pair of different operands to MapValueFromBB1ToBB2 and
-    // remove (I1, I2) from MapValueFromBB1ToBB2.
-    if (DifferentOp1) {
-      PHINode *NewPN = PHINode::Create(DifferentOp1->getType(), 2,
-                                       DifferentOp1->getName() + ".sink",
-                                       BBEnd->begin());
-      MapValueFromBB1ToBB2[DifferentOp1] = std::make_pair(DifferentOp2, NewPN);
+    DEBUG(dbgs() << "SINK common instructions " << *I1 << "\n");
+    DEBUG(dbgs() << "                         " << *I2 << "\n");
+
+    // We insert the pair of different operands to JointValueMap and
+    // remove (I1, I2) from JointValueMap.
+    if (Op1Idx != ~0U) {
+      auto &NewPN = JointValueMap[std::make_pair(DifferentOp1, DifferentOp2)];
+      if (!NewPN) {
+        NewPN =
+            PHINode::Create(DifferentOp1->getType(), 2,
+                            DifferentOp1->getName() + ".sink", BBEnd->begin());
+        NewPN->addIncoming(DifferentOp1, BB1);
+        NewPN->addIncoming(DifferentOp2, BB2);
+        DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";);
+      }
       // I1 should use NewPN instead of DifferentOp1.
       I1->setOperand(Op1Idx, NewPN);
-      NewPN->addIncoming(DifferentOp1, BB1);
-      NewPN->addIncoming(DifferentOp2, BB2);
-      DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";);
     }
-    PHINode *OldPN = MapValueFromBB1ToBB2[I1].second;
-    MapValueFromBB1ToBB2.erase(I1);
+    PHINode *OldPN = JointValueMap[InstPair];
+    JointValueMap.erase(InstPair);
 
-    DEBUG(dbgs() << "SINK common instructions " << *I1 << "\n";);
-    DEBUG(dbgs() << "                         " << *I2 << "\n";);
     // We need to update RE1 and RE2 if we are going to sink the first
     // instruction in the basic block down.
     bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin());
@@ -1489,7 +1467,8 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
 ///
 /// \returns true if the conditional block is removed.
 static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
-                                   const DataLayout *DL) {
+                                   const DataLayout *DL,
+                                   const TargetTransformInfo &TTI) {
   // Be conservative for now. FP select instruction can often be expensive.
   Value *BrCond = BI->getCondition();
   if (isa<FCmpInst>(BrCond))
@@ -1538,7 +1517,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
                                                          EndBB))))
       return false;
     if (!SpeculatedStoreValue &&
-        ComputeSpeculationCost(I, DL) > PHINodeFoldingThreshold)
+        ComputeSpeculationCost(I, DL, TTI) > PHINodeFoldingThreshold *
+        TargetTransformInfo::TCC_Basic)
       return false;
 
     // Store the store speculation candidate.
@@ -1597,9 +1577,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
     if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE, DL)) ||
         (OrigCE && !isSafeToSpeculativelyExecute(OrigCE, DL)))
       return false;
-    unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, DL) : 0;
-    unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, DL) : 0;
-    if (OrigCost + ThenCost > 2 * PHINodeFoldingThreshold)
+    unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, DL, TTI) : 0;
+    unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, DL, TTI) : 0;
+    unsigned MaxCost = 2 * PHINodeFoldingThreshold *
+      TargetTransformInfo::TCC_Basic;
+    if (OrigCost + ThenCost > MaxCost)
       return false;
 
     // Account for the cost of an unfolded ConstantExpr which could end up
@@ -1804,7 +1786,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *DL) {
 
 /// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry
 /// PHI node, see if we can eliminate it.
-static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) {
+static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL,
+                                const TargetTransformInfo &TTI) {
   // Ok, this is a two entry PHI node.  Check to see if this is a simple "if
   // statement", which has a very simple dominance structure.  Basically, we
   // are trying to find the condition that is being branched on, which
@@ -1835,6 +1818,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) {
   SmallPtrSet<Instruction*, 4> AggressiveInsts;
   unsigned MaxCostVal0 = PHINodeFoldingThreshold,
            MaxCostVal1 = PHINodeFoldingThreshold;
+  MaxCostVal0 *= TargetTransformInfo::TCC_Basic;
+  MaxCostVal1 *= TargetTransformInfo::TCC_Basic;
 
   for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
     PHINode *PN = cast<PHINode>(II++);
@@ -1845,9 +1830,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) {
     }
 
     if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts,
-                             MaxCostVal0, DL) ||
+                             MaxCostVal0, DL, TTI) ||
         !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts,
-                             MaxCostVal1, DL))
+                             MaxCostVal1, DL, TTI))
       return false;
   }
 
@@ -2036,8 +2021,10 @@ static bool ExtractBranchMetadata(BranchInst *BI,
          "Looking for probabilities on unconditional branch?");
   MDNode *ProfileData = BI->getMetadata(LLVMContext::MD_prof);
   if (!ProfileData || ProfileData->getNumOperands() != 3) return false;
-  ConstantInt *CITrue = dyn_cast<ConstantInt>(ProfileData->getOperand(1));
-  ConstantInt *CIFalse = dyn_cast<ConstantInt>(ProfileData->getOperand(2));
+  ConstantInt *CITrue =
+      mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1));
+  ConstantInt *CIFalse =
+      mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2));
   if (!CITrue || !CIFalse) return false;
   ProbTrue = CITrue->getValue().getZExtValue();
   ProbFalse = CIFalse->getValue().getZExtValue();
@@ -2534,17 +2521,15 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
     // The weight to CommonDest should be PredCommon * SuccTotal +
     //                                    PredOther * SuccCommon.
     // The weight to OtherDest should be PredOther * SuccOther.
-    SmallVector<uint64_t, 2> NewWeights;
-    NewWeights.push_back(PredCommon * (SuccCommon + SuccOther) +
-                         PredOther * SuccCommon);
-    NewWeights.push_back(PredOther * SuccOther);
+    uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther) +
+                                  PredOther * SuccCommon,
+                              PredOther * SuccOther};
     // Halve the weights if any of them cannot fit in an uint32_t
     FitWeights(NewWeights);
 
-    SmallVector<uint32_t, 2> MDWeights(NewWeights.begin(),NewWeights.end());
     PBI->setMetadata(LLVMContext::MD_prof,
-                     MDBuilder(BI->getContext()).
-                     createBranchWeights(MDWeights));
+                     MDBuilder(BI->getContext())
+                         .createBranchWeights(NewWeights[0], NewWeights[1]));
   }
 
   // OtherDest may have phi nodes.  If so, add an entry from PBI's
@@ -2718,7 +2703,7 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
 /// the PHI, merging the third icmp into the switch.
 static bool TryToSimplifyUncondBranchWithICmpInIt(
     ICmpInst *ICI, IRBuilder<> &Builder, const TargetTransformInfo &TTI,
-    unsigned BonusInstThreshold, const DataLayout *DL, AssumptionTracker *AT) {
+    unsigned BonusInstThreshold, const DataLayout *DL, AssumptionCache *AC) {
   BasicBlock *BB = ICI->getParent();
 
   // If the block has any PHIs in it or the icmp has multiple uses, it is too
@@ -2751,7 +2736,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(
       ICI->eraseFromParent();
     }
     // BB is now empty, so it is likely to simplify away.
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
   }
 
   // Ok, the block is reachable from the default dest.  If the constant we're
@@ -2767,7 +2752,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(
     ICI->replaceAllUsesWith(V);
     ICI->eraseFromParent();
     // BB is now empty, so it is likely to simplify away.
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
   }
 
   // The use of the icmp has to be in the 'end' block, by the only PHI node in
@@ -2947,20 +2932,9 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
       return false;
 
   // Turn all invokes that unwind here into calls and delete the basic block.
-  bool InvokeRequiresTableEntry = false;
-  bool Changed = false;
   for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
     InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator());
-
-    if (II->hasFnAttr(Attribute::UWTable)) {
-      // Don't remove an `invoke' instruction if the ABI requires an entry into
-      // the table.
-      InvokeRequiresTableEntry = true;
-      continue;
-    }
-
     SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
-
     // Insert a call instruction before the invoke.
     CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II);
     Call->takeName(II);
@@ -2980,14 +2954,11 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
 
     // Finally, delete the invoke instruction!
     II->eraseFromParent();
-    Changed = true;
   }
 
-  if (!InvokeRequiresTableEntry)
-    // The landingpad is now unreachable.  Zap it.
-    BB->eraseFromParent();
-
-  return Changed;
+  // The landingpad is now unreachable.  Zap it.
+  BB->eraseFromParent();
+  return true;
 }
 
 bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
@@ -3018,7 +2989,7 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
     }
 
     // If we eliminated all predecessors of the block, delete the block now.
-    if (pred_begin(BB) == pred_end(BB))
+    if (pred_empty(BB))
       // We know there are no successors, so just nuke the block.
       BB->eraseFromParent();
 
@@ -3119,55 +3090,6 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
           --i; --e;
           Changed = true;
         }
-      // If the default value is unreachable, figure out the most popular
-      // destination and make it the default.
-      if (SI->getDefaultDest() == BB) {
-        std::map<BasicBlock*, std::pair<unsigned, unsigned> > Popularity;
-        for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
-             i != e; ++i) {
-          std::pair<unsigned, unsigned> &entry =
-              Popularity[i.getCaseSuccessor()];
-          if (entry.first == 0) {
-            entry.first = 1;
-            entry.second = i.getCaseIndex();
-          } else {
-            entry.first++;
-          }
-        }
-
-        // Find the most popular block.
-        unsigned MaxPop = 0;
-        unsigned MaxIndex = 0;
-        BasicBlock *MaxBlock = nullptr;
-        for (std::map<BasicBlock*, std::pair<unsigned, unsigned> >::iterator
-             I = Popularity.begin(), E = Popularity.end(); I != E; ++I) {
-          if (I->second.first > MaxPop ||
-              (I->second.first == MaxPop && MaxIndex > I->second.second)) {
-            MaxPop = I->second.first;
-            MaxIndex = I->second.second;
-            MaxBlock = I->first;
-          }
-        }
-        if (MaxBlock) {
-          // Make this the new default, allowing us to delete any explicit
-          // edges to it.
-          SI->setDefaultDest(MaxBlock);
-          Changed = true;
-
-          // If MaxBlock has phinodes in it, remove MaxPop-1 entries from
-          // it.
-          if (isa<PHINode>(MaxBlock->begin()))
-            for (unsigned i = 0; i != MaxPop-1; ++i)
-              MaxBlock->removePredecessor(SI->getParent());
-
-          for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
-               i != e; ++i)
-            if (i.getCaseSuccessor() == MaxBlock) {
-              SI->removeCase(i);
-              --i; --e;
-            }
-        }
-      }
     } else if (InvokeInst *II = dyn_cast<InvokeInst>(TI)) {
       if (II->getUnwindDest() == BB) {
         // Convert the invoke to a call instruction.  This would be a good
@@ -3191,7 +3113,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
   }
 
   // If this block is now dead, remove it.
-  if (pred_begin(BB) == pred_end(BB) &&
+  if (pred_empty(BB) &&
       BB != &BB->getParent()->getEntryBlock()) {
     // We know there are no successors, so just nuke the block.
     BB->eraseFromParent();
@@ -3201,70 +3123,122 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
   return Changed;
 }
 
-/// TurnSwitchRangeIntoICmp - Turns a switch with that contains only a
-/// integer range comparison into a sub, an icmp and a branch.
-static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
-  assert(SI->getNumCases() > 1 && "Degenerate switch?");
+static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
+  assert(Cases.size() >= 1);
 
-  // Make sure all cases point to the same destination and gather the values.
-  SmallVector<ConstantInt *, 16> Cases;
-  SwitchInst::CaseIt I = SI->case_begin();
-  Cases.push_back(I.getCaseValue());
-  SwitchInst::CaseIt PrevI = I++;
-  for (SwitchInst::CaseIt E = SI->case_end(); I != E; PrevI = I++) {
-    if (PrevI.getCaseSuccessor() != I.getCaseSuccessor())
+  array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate);
+  for (size_t I = 1, E = Cases.size(); I != E; ++I) {
+    if (Cases[I - 1]->getValue() != Cases[I]->getValue() + 1)
       return false;
-    Cases.push_back(I.getCaseValue());
   }
-  assert(Cases.size() == SI->getNumCases() && "Not all cases gathered");
+  return true;
+}
 
-  // Sort the case values, then check if they form a range we can transform.
-  array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate);
-  for (unsigned I = 1, E = Cases.size(); I != E; ++I) {
-    if (Cases[I-1]->getValue() != Cases[I]->getValue()+1)
-      return false;
+/// Turn a switch with two reachable destinations into an integer range
+/// comparison and branch.
+static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
+  assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
+  bool HasDefault =
+      !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+
+  // Partition the cases into two sets with different destinations.
+  BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr;
+  BasicBlock *DestB = nullptr;
+  SmallVector <ConstantInt *, 16> CasesA;
+  SmallVector <ConstantInt *, 16> CasesB;
+
+  for (SwitchInst::CaseIt I : SI->cases()) {
+    BasicBlock *Dest = I.getCaseSuccessor();
+    if (!DestA) DestA = Dest;
+    if (Dest == DestA) {
+      CasesA.push_back(I.getCaseValue());
+      continue;
+    }
+    if (!DestB) DestB = Dest;
+    if (Dest == DestB) {
+      CasesB.push_back(I.getCaseValue());
+      continue;
+    }
+    return false;  // More than two destinations.
   }
 
-  Constant *Offset = ConstantExpr::getNeg(Cases.back());
-  Constant *NumCases = ConstantInt::get(Offset->getType(), SI->getNumCases());
+  assert(DestA && DestB && "Single-destination switch should have been folded.");
+  assert(DestA != DestB);
+  assert(DestB != SI->getDefaultDest());
+  assert(!CasesB.empty() && "There must be non-default cases.");
+  assert(!CasesA.empty() || HasDefault);
+
+  // Figure out if one of the sets of cases form a contiguous range.
+  SmallVectorImpl<ConstantInt *> *ContiguousCases = nullptr;
+  BasicBlock *ContiguousDest = nullptr;
+  BasicBlock *OtherDest = nullptr;
+  if (!CasesA.empty() && CasesAreContiguous(CasesA)) {
+    ContiguousCases = &CasesA;
+    ContiguousDest = DestA;
+    OtherDest = DestB;
+  } else if (CasesAreContiguous(CasesB)) {
+    ContiguousCases = &CasesB;
+    ContiguousDest = DestB;
+    OtherDest = DestA;
+  } else
+    return false;
+
+  // Start building the compare and branch.
+
+  Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back());
+  Constant *NumCases = ConstantInt::get(Offset->getType(), ContiguousCases->size());
 
   Value *Sub = SI->getCondition();
   if (!Offset->isNullValue())
-    Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off");
+    Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off");
+
   Value *Cmp;
   // If NumCases overflowed, then all possible values jump to the successor.
-  if (NumCases->isNullValue() && SI->getNumCases() != 0)
+  if (NumCases->isNullValue() && !ContiguousCases->empty())
     Cmp = ConstantInt::getTrue(SI->getContext());
   else
     Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
-  BranchInst *NewBI = Builder.CreateCondBr(
-      Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest());
+  BranchInst *NewBI = Builder.CreateCondBr(Cmp, ContiguousDest, OtherDest);
 
   // Update weight for the newly-created conditional branch.
-  SmallVector<uint64_t, 8> Weights;
-  bool HasWeights = HasBranchWeights(SI);
-  if (HasWeights) {
+  if (HasBranchWeights(SI)) {
+    SmallVector<uint64_t, 8> Weights;
     GetBranchWeights(SI, Weights);
     if (Weights.size() == 1 + SI->getNumCases()) {
-      // Combine all weights for the cases to be the true weight of NewBI.
-      // We assume that the sum of all weights for a Terminator can fit into 32
-      // bits.
-      uint32_t NewTrueWeight = 0;
-      for (unsigned I = 1, E = Weights.size(); I != E; ++I)
-        NewTrueWeight += (uint32_t)Weights[I];
+      uint64_t TrueWeight = 0;
+      uint64_t FalseWeight = 0;
+      for (size_t I = 0, E = Weights.size(); I != E; ++I) {
+        if (SI->getSuccessor(I) == ContiguousDest)
+          TrueWeight += Weights[I];
+        else
+          FalseWeight += Weights[I];
+      }
+      while (TrueWeight > UINT32_MAX || FalseWeight > UINT32_MAX) {
+        TrueWeight /= 2;
+        FalseWeight /= 2;
+      }
       NewBI->setMetadata(LLVMContext::MD_prof,
-                         MDBuilder(SI->getContext()).
-                         createBranchWeights(NewTrueWeight,
-                                             (uint32_t)Weights[0]));
+                         MDBuilder(SI->getContext()).createBranchWeights(
+                             (uint32_t)TrueWeight, (uint32_t)FalseWeight));
     }
   }
 
-  // Prune obsolete incoming values off the successor's PHI nodes.
-  for (BasicBlock::iterator BBI = SI->case_begin().getCaseSuccessor()->begin();
-       isa<PHINode>(BBI); ++BBI) {
-    for (unsigned I = 0, E = SI->getNumCases()-1; I != E; ++I)
+  // Prune obsolete incoming values off the successors' PHI nodes.
+  for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) {
+    unsigned PreviousEdges = ContiguousCases->size();
+    if (ContiguousDest == SI->getDefaultDest()) ++PreviousEdges;
+    for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+      cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+  }
+  for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
+    unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size();
+    if (OtherDest == SI->getDefaultDest()) ++PreviousEdges;
+    for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
       cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
   }
+
+  // Drop the switch.
   SI->eraseFromParent();
 
   return true;
@@ -3273,11 +3247,11 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
 /// EliminateDeadSwitchCases - Compute masked bits for the condition of a switch
 /// and use it to remove dead cases.
 static bool EliminateDeadSwitchCases(SwitchInst *SI, const DataLayout *DL,
-                                     AssumptionTracker *AT) {
+                                     AssumptionCache *AC) {
   Value *Cond = SI->getCondition();
   unsigned Bits = Cond->getType()->getIntegerBitWidth();
   APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
-  computeKnownBits(Cond, KnownZero, KnownOne, DL, 0, AT, SI);
+  computeKnownBits(Cond, KnownZero, KnownOne, DL, 0, AC, SI);
 
   // Gather dead cases.
   SmallVector<ConstantInt*, 8> DeadCases;
@@ -3484,6 +3458,21 @@ GetCaseResults(SwitchInst *SI,
       continue;
     } else if (Constant *C = ConstantFold(I, ConstantPool, DL)) {
       // Instruction is side-effect free and constant.
+
+      // If the instruction has uses outside this block or a phi node slot for
+      // the block, it is not safe to bypass the instruction since it would then
+      // no longer dominate all its uses.
+      for (auto &Use : I->uses()) {
+        User *User = Use.getUser();
+        if (Instruction *I = dyn_cast<Instruction>(User))
+          if (I->getParent() == CaseDest)
+            continue;
+        if (PHINode *Phi = dyn_cast<PHINode>(User))
+          if (Phi->getIncomingBlock(Use) == CaseDest)
+            continue;
+        return false;
+      }
+
       ConstantPool.insert(std::make_pair(I, C));
     } else {
       break;
@@ -3509,12 +3498,6 @@ GetCaseResults(SwitchInst *SI,
     if (!ConstVal)
       return false;
 
-    // Note: If the constant comes from constant-propagating the case value
-    // through the CaseDest basic block, it will be safe to remove the
-    // instructions in that block. They cannot be used (except in the phi nodes
-    // we visit) outside CaseDest, because that block does not dominate its
-    // successor. If it did, we would not be in this phi node.
-
     // Be conservative about which kinds of constants we support.
     if (!ValidLookupTableConstant(ConstVal))
       return false;
@@ -3655,7 +3638,7 @@ static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI,
 /// phi nodes in a common successor block with only two different
 /// constant values, replace the switch with select.
 static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
-                           const DataLayout *DL, AssumptionTracker *AT) {
+                           const DataLayout *DL, AssumptionCache *AC) {
   Value *const Cond = SI->getCondition();
   PHINode *PHI = nullptr;
   BasicBlock *CommonDest = nullptr;
@@ -3982,6 +3965,89 @@ static bool ShouldBuildLookupTable(SwitchInst *SI,
   return SI->getNumCases() * 10 >= TableSize * 4;
 }
 
+/// Try to reuse the switch table index compare. Following pattern:
+/// \code
+///     if (idx < tablesize)
+///        r = table[idx]; // table does not contain default_value
+///     else
+///        r = default_value;
+///     if (r != default_value)
+///        ...
+/// \endcode
+/// Is optimized to:
+/// \code
+///     cond = idx < tablesize;
+///     if (cond)
+///        r = table[idx];
+///     else
+///        r = default_value;
+///     if (cond)
+///        ...
+/// \endcode
+/// Jump threading will then eliminate the second if(cond).
+static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock,
+          BranchInst *RangeCheckBranch, Constant *DefaultValue,
+          const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values) {
+
+  ICmpInst *CmpInst = dyn_cast<ICmpInst>(PhiUser);
+  if (!CmpInst)
+    return;
+
+  // We require that the compare is in the same block as the phi so that jump
+  // threading can do its work afterwards.
+  if (CmpInst->getParent() != PhiBlock)
+    return;
+
+  Constant *CmpOp1 = dyn_cast<Constant>(CmpInst->getOperand(1));
+  if (!CmpOp1)
+    return;
+
+  Value *RangeCmp = RangeCheckBranch->getCondition();
+  Constant *TrueConst = ConstantInt::getTrue(RangeCmp->getType());
+  Constant *FalseConst = ConstantInt::getFalse(RangeCmp->getType());
+
+  // Check if the compare with the default value is constant true or false.
+  Constant *DefaultConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
+                                                 DefaultValue, CmpOp1, true);
+  if (DefaultConst != TrueConst && DefaultConst != FalseConst)
+    return;
+
+  // Check if the compare with the case values is distinct from the default
+  // compare result.
+  for (auto ValuePair : Values) {
+    Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
+                              ValuePair.second, CmpOp1, true);
+    if (!CaseConst || CaseConst == DefaultConst)
+      return;
+    assert((CaseConst == TrueConst || CaseConst == FalseConst) &&
+           "Expect true or false as compare result.");
+  }
+ 
+  // Check if the branch instruction dominates the phi node. It's a simple
+  // dominance check, but sufficient for our needs.
+  // Although this check is invariant in the calling loops, it's better to do it
+  // at this late stage. Practically we do it at most once for a switch.
+  BasicBlock *BranchBlock = RangeCheckBranch->getParent();
+  for (auto PI = pred_begin(PhiBlock), E = pred_end(PhiBlock); PI != E; ++PI) {
+    BasicBlock *Pred = *PI;
+    if (Pred != BranchBlock && Pred->getUniquePredecessor() != BranchBlock)
+      return;
+  }
+
+  if (DefaultConst == FalseConst) {
+    // The compare yields the same result. We can replace it.
+    CmpInst->replaceAllUsesWith(RangeCmp);
+    ++NumTableCmpReuses;
+  } else {
+    // The compare yields the same result, just inverted. We can replace it.
+    Value *InvertedTableCmp = BinaryOperator::CreateXor(RangeCmp,
+                ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp",
+                RangeCheckBranch);
+    CmpInst->replaceAllUsesWith(InvertedTableCmp);
+    ++NumTableCmpReuses;
+  }
+}
+
 /// SwitchToLookupTable - If the switch is only used to initialize one or more
 /// phi nodes in a common successor block with different constant values,
 /// replace the switch with lookup tables.
@@ -4058,11 +4124,8 @@ static bool SwitchToLookupTable(SwitchInst *SI,
   // If the table has holes, we need a constant result for the default case
   // or a bitmask that fits in a register.
   SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList;
-  bool HasDefaultResults = false;
-  if (TableHasHoles) {
-    HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(),
+  bool HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(),
                                        &CommonDest, DefaultResultsList, DL);
-  }
 
   bool NeedMask = (TableHasHoles && !HasDefaultResults);
   if (NeedMask) {
@@ -4102,21 +4165,24 @@ static bool SwitchToLookupTable(SwitchInst *SI,
          "It is impossible for a switch to have more entries than the max "
          "representable value of its input integer type's size.");
 
-  // If we have a fully covered lookup table, unconditionally branch to the
-  // lookup table BB. Otherwise, check if the condition value is within the case
-  // range. If it is so, branch to the new BB. Otherwise branch to SI's default
-  // destination.
-  const bool GeneratingCoveredLookupTable = MaxTableSize == TableSize;
-  if (GeneratingCoveredLookupTable) {
+  // If the default destination is unreachable, or if the lookup table covers
+  // all values of the conditional variable, branch directly to the lookup table
+  // BB. Otherwise, check that the condition is within the case range.
+  const bool DefaultIsReachable =
+      !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+  const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);
+  BranchInst *RangeCheckBranch = nullptr;
+
+  if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
     Builder.CreateBr(LookupBB);
     // We cached PHINodes in PHIs, to avoid accessing deleted PHINodes later,
     // do not delete PHINodes here.
     SI->getDefaultDest()->removePredecessor(SI->getParent(),
-                                            true/*DontDeleteUselessPHIs*/);
+                                            /*DontDeleteUselessPHIs=*/true);
   } else {
     Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
                                        MinCaseVal->getType(), TableSize));
-    Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+    RangeCheckBranch = Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
   }
 
   // Populate the BB that does the lookups.
@@ -4167,11 +4233,11 @@ static bool SwitchToLookupTable(SwitchInst *SI,
   bool ReturnedEarly = false;
   for (size_t I = 0, E = PHIs.size(); I != E; ++I) {
     PHINode *PHI = PHIs[I];
+    const ResultListTy &ResultList = ResultLists[PHI];
 
     // If using a bitmask, use any value to fill the lookup table holes.
     Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI];
-    SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultLists[PHI],
-                            DV, DL);
+    SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL);
 
     Value *Result = Table.BuildLookup(TableIndex, Builder);
 
@@ -4184,6 +4250,16 @@ static bool SwitchToLookupTable(SwitchInst *SI,
       break;
     }
 
+    // Do a small peephole optimization: re-use the switch table compare if
+    // possible.
+    if (!TableHasHoles && HasDefaultResults && RangeCheckBranch) {
+      BasicBlock *PhiBlock = PHI->getParent();
+      // Search for compare instructions which use the phi.
+      for (auto *User : PHI->users()) {
+        reuseTableCompare(User, PhiBlock, RangeCheckBranch, DV, ResultList);
+      }
+    }
+
     PHI->addIncoming(Result, LookupBB);
   }
 
@@ -4214,12 +4290,12 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
     // see if that predecessor totally determines the outcome of this switch.
     if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
       if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
-        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
 
     Value *Cond = SI->getCondition();
     if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
       if (SimplifySwitchOnSelect(SI, Select))
-        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
 
     // If the block only contains the switch, see if we can fold the block
     // away into any preds.
@@ -4229,25 +4305,25 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
       ++BBI;
     if (SI == &*BBI)
       if (FoldValueComparisonIntoPredecessors(SI, Builder))
-        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
   }
 
   // Try to transform the switch into an icmp and a branch.
   if (TurnSwitchRangeIntoICmp(SI, Builder))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
 
   // Remove unreachable cases.
-  if (EliminateDeadSwitchCases(SI, DL, AT))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+  if (EliminateDeadSwitchCases(SI, DL, AC))
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
 
-  if (SwitchToSelect(SI, Builder, DL, AT))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+  if (SwitchToSelect(SI, Builder, DL, AC))
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
 
   if (ForwardSwitchConditionToPHI(SI))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
 
   if (SwitchToLookupTable(SI, Builder, TTI, DL))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
 
   return false;
 }
@@ -4284,7 +4360,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
 
   if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
     if (SimplifyIndirectBrOnSelect(IBI, SI))
-      return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+      return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
   }
   return Changed;
 }
@@ -4309,7 +4385,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
         ;
       if (I->isTerminator() &&
           TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI,
-                                                BonusInstThreshold, DL, AT))
+                                                BonusInstThreshold, DL, AC))
         return true;
     }
 
@@ -4318,7 +4394,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
   // predecessor and use logical operations to update the incoming value
   // for PHI nodes in common successor.
   if (FoldBranchToCommonDest(BI, DL, BonusInstThreshold))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
   return false;
 }
 
@@ -4333,7 +4409,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     // switch.
     if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
       if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
-        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
 
     // This block must be empty, except for the setcond inst, if it exists.
     // Ignore dbg intrinsics.
@@ -4343,14 +4419,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
       ++I;
     if (&*I == BI) {
       if (FoldValueComparisonIntoPredecessors(BI, Builder))
-        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
     } else if (&*I == cast<Instruction>(BI->getCondition())){
       ++I;
       // Ignore dbg intrinsics.
       while (isa<DbgInfoIntrinsic>(I))
         ++I;
       if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
-        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
     }
   }
 
@@ -4362,7 +4438,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   // branches to us and one of our successors, fold the comparison into the
   // predecessor and use logical operations to pick the right destination.
   if (FoldBranchToCommonDest(BI, DL, BonusInstThreshold))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
 
   // We have a conditional branch to two blocks that are only reachable
   // from BI.  We know that the condbr dominates the two blocks, so see if
@@ -4370,16 +4446,16 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   // can hoist it up to the branching block.
   if (BI->getSuccessor(0)->getSinglePredecessor()) {
     if (BI->getSuccessor(1)->getSinglePredecessor()) {
-      if (HoistThenElseCodeToIf(BI, DL))
-        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+      if (HoistThenElseCodeToIf(BI, DL, TTI))
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
       // execute Successor #0 if it branches to Successor #1.
       TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator();
       if (Succ0TI->getNumSuccessors() == 1 &&
           Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
-        if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), DL))
-          return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+        if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), DL, TTI))
+          return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
     }
   } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
     // If Successor #0 has multiple preds, we may be able to conditionally
@@ -4387,8 +4463,8 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator();
     if (Succ1TI->getNumSuccessors() == 1 &&
         Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
-      if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), DL))
-        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+      if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), DL, TTI))
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
   }
 
   // If this is a branch on a phi node in the current block, thread control
@@ -4396,14 +4472,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
     if (PN->getParent() == BI->getParent())
       if (FoldCondBranchOnPHI(BI, DL))
-        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
 
   // Scan predecessor blocks for conditional branches.
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
     if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
       if (PBI != BI && PBI->isConditional())
         if (SimplifyCondBranchToCondBranch(PBI, BI))
-          return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+          return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
 
   return false;
 }
@@ -4484,7 +4560,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
 
   // Remove basic blocks that have no predecessors (except the entry block)...
   // or that just have themself as a predecessor.  These are unreachable.
-  if ((pred_begin(BB) == pred_end(BB) &&
+  if ((pred_empty(BB) &&
        BB != &BB->getParent()->getEntryBlock()) ||
       BB->getSinglePredecessor() == BB) {
     DEBUG(dbgs() << "Removing BB: \n" << *BB);
@@ -4515,7 +4591,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
   // eliminate it, do so now.
   if (PHINode *PN = dyn_cast<PHINode>(BB->begin()))
     if (PN->getNumIncomingValues() == 2)
-      Changed |= FoldTwoEntryPHINode(PN, DL);
+      Changed |= FoldTwoEntryPHINode(PN, DL, TTI);
 
   Builder.SetInsertPoint(BB->getTerminator());
   if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
@@ -4547,7 +4623,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
 /// of the CFG.  It returns true if a modification was made.
 ///
 bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
-                       unsigned BonusInstThreshold,
-                       const DataLayout *DL, AssumptionTracker *AT) {
-  return SimplifyCFGOpt(TTI, BonusInstThreshold, DL, AT).run(BB);
+                       unsigned BonusInstThreshold, const DataLayout *DL,
+                       AssumptionCache *AC) {
+  return SimplifyCFGOpt(TTI, BonusInstThreshold, DL, AC).run(BB);
 }
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index a4fdd55..6a5d885 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -48,22 +48,15 @@ namespace {
     Loop             *L;
     LoopInfo         *LI;
     ScalarEvolution  *SE;
-    const DataLayout *DL; // May be NULL
 
     SmallVectorImpl<WeakVH> &DeadInsts;
 
     bool Changed;
 
   public:
-    SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LPPassManager *LPM,
-                   SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = nullptr) :
-      L(Loop),
-      LI(LPM->getAnalysisIfAvailable<LoopInfo>()),
-      SE(SE),
-      DeadInsts(Dead),
-      Changed(false) {
-      DataLayoutPass *DLP = LPM->getAnalysisIfAvailable<DataLayoutPass>();
-      DL = DLP ? &DLP->getDataLayout() : nullptr;
+    SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LoopInfo *LI,
+                   SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = nullptr)
+        : L(Loop), LI(LI), SE(SE), DeadInsts(Dead), Changed(false) {
       assert(LI && "IV simplification requires LoopInfo");
     }
 
@@ -80,6 +73,7 @@ namespace {
     void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
     void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand,
                               bool IsSigned);
+    bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand);
 
     Instruction *splitOverflowIntrinsic(Instruction *IVUser,
                                         const DominatorTree *DT);
@@ -271,6 +265,107 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
   return true;
 }
 
+/// Annotate BO with nsw / nuw if it provably does not signed-overflow /
+/// unsigned-overflow.  Returns true if anything changed, false otherwise.
+bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
+                                                    Value *IVOperand) {
+
+  // Currently we only handle instructions of the form "add <indvar> <value>"
+  unsigned Op = BO->getOpcode();
+  if (Op != Instruction::Add)
+    return false;
+
+  // If BO is already both nuw and nsw then there is nothing left to do
+  if (BO->hasNoUnsignedWrap() && BO->hasNoSignedWrap())
+    return false;
+
+  IntegerType *IT = cast<IntegerType>(IVOperand->getType());
+  Value *OtherOperand = nullptr;
+  if (BO->getOperand(0) == IVOperand) {
+    OtherOperand = BO->getOperand(1);
+  } else {
+    assert(BO->getOperand(1) == IVOperand && "only other use!");
+    OtherOperand = BO->getOperand(0);
+  }
+
+  bool Changed = false;
+  const SCEV *OtherOpSCEV = SE->getSCEV(OtherOperand);
+  if (OtherOpSCEV == SE->getCouldNotCompute())
+    return false;
+
+  const SCEV *IVOpSCEV = SE->getSCEV(IVOperand);
+  const SCEV *ZeroSCEV = SE->getConstant(IVOpSCEV->getType(), 0);
+
+  if (!BO->hasNoSignedWrap()) {
+    // Upgrade the add to an "add nsw" if we can prove that it will never
+    // sign-overflow or sign-underflow.
+
+    const SCEV *SignedMax =
+      SE->getConstant(APInt::getSignedMaxValue(IT->getBitWidth()));
+    const SCEV *SignedMin =
+      SE->getConstant(APInt::getSignedMinValue(IT->getBitWidth()));
+
+    // The addition "IVOperand + OtherOp" does not sign-overflow if the result
+    // is sign-representable in 2's complement in the given bit-width.
+    //
+    // If OtherOp is SLT 0, then for an IVOperand in [SignedMin - OtherOp,
+    // SignedMax], "IVOperand + OtherOp" is in [SignedMin, SignedMax + OtherOp].
+    // Everything in [SignedMin, SignedMax + OtherOp] is representable since
+    // SignedMax + OtherOp is at least -1.
+    //
+    // If OtherOp is SGE 0, then for an IVOperand in [SignedMin, SignedMax -
+    // OtherOp], "IVOperand + OtherOp" is in [SignedMin + OtherOp, SignedMax].
+    // Everything in [SignedMin + OtherOp, SignedMax] is representable since
+    // SignedMin + OtherOp is at most -1.
+    //
+    // It follows that for all values of IVOperand in [SignedMin - smin(0,
+    // OtherOp), SignedMax - smax(0, OtherOp)] the result of the add is
+    // representable (i.e. there is no sign-overflow).
+
+    const SCEV *UpperDelta = SE->getSMaxExpr(ZeroSCEV, OtherOpSCEV);
+    const SCEV *UpperLimit = SE->getMinusSCEV(SignedMax, UpperDelta);
+
+    bool NeverSignedOverflows =
+      SE->isKnownPredicate(ICmpInst::ICMP_SLE, IVOpSCEV, UpperLimit);
+
+    if (NeverSignedOverflows) {
+      const SCEV *LowerDelta = SE->getSMinExpr(ZeroSCEV, OtherOpSCEV);
+      const SCEV *LowerLimit = SE->getMinusSCEV(SignedMin, LowerDelta);
+
+      bool NeverSignedUnderflows =
+        SE->isKnownPredicate(ICmpInst::ICMP_SGE, IVOpSCEV, LowerLimit);
+      if (NeverSignedUnderflows) {
+        BO->setHasNoSignedWrap(true);
+        Changed = true;
+      }
+    }
+  }
+
+  if (!BO->hasNoUnsignedWrap()) {
+    // Upgrade the add computing "IVOperand + OtherOp" to an "add nuw" if we can
+    // prove that it will never unsigned-overflow (i.e. the result will always
+    // be representable in the given bit-width).
+    //
+    // "IVOperand + OtherOp" is unsigned-representable in 2's complement iff it
+    // does not produce a carry.  "IVOperand + OtherOp" produces no carry iff
+    // IVOperand ULE (UnsignedMax - OtherOp).
+
+    const SCEV *UnsignedMax =
+      SE->getConstant(APInt::getMaxValue(IT->getBitWidth()));
+    const SCEV *UpperLimit = SE->getMinusSCEV(UnsignedMax, OtherOpSCEV);
+
+    bool NeverUnsignedOverflows =
+        SE->isKnownPredicate(ICmpInst::ICMP_ULE, IVOpSCEV, UpperLimit);
+
+    if (NeverUnsignedOverflows) {
+      BO->setHasNoUnsignedWrap(true);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
 /// \brief Split sadd.with.overflow into add + sadd.with.overflow to allow
 /// analysis and optimization.
 ///
@@ -430,6 +525,16 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
       pushIVUsers(IVOperand, Simplified, SimpleIVUsers);
       continue;
     }
+
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseOper.first)) {
+      if (isa<OverflowingBinaryOperator>(BO) &&
+          strengthenOverflowingOperation(BO, IVOperand)) {
+        // re-queue uses of the now modified binary operator and fall
+        // through to the checks that remain.
+        pushIVUsers(IVOperand, Simplified, SimpleIVUsers);
+      }
+    }
+
     CastInst *Cast = dyn_cast<CastInst>(UseOper.first);
     if (V && Cast) {
       V->visitCast(Cast);
@@ -450,8 +555,8 @@ void IVVisitor::anchor() { }
 bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM,
                        SmallVectorImpl<WeakVH> &Dead, IVVisitor *V)
 {
-  LoopInfo *LI = &LPM->getAnalysis<LoopInfo>();
-  SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LPM, Dead);
+  LoopInfo *LI = &LPM->getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LI, Dead);
   SIV.simplifyUsers(CurrIV, V);
   return SIV.hasChanged();
 }
diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp
index 5632095..55a4455 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -18,14 +18,14 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
@@ -42,8 +42,8 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
-      AU.addRequired<AssumptionTracker>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
 
     /// runOnFunction - Remove instructions that simplify.
@@ -53,8 +53,10 @@ namespace {
       const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
       DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
       const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-      const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
-      AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+      const TargetLibraryInfo *TLI =
+          &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+      AssumptionCache *AC =
+          &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
       SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
       bool Changed = false;
 
@@ -71,7 +73,7 @@ namespace {
               continue;
             // Don't waste time simplifying unused instructions.
             if (!I->use_empty())
-              if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AT)) {
+              if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
                 // Mark all uses for resimplification next time round the loop.
                 for (User *U : I->users())
                   Next->insert(cast<Instruction>(U));
@@ -104,8 +106,8 @@ namespace {
 char InstSimplifier::ID = 0;
 INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify",
                       "Remove redundant instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(InstSimplifier, "instsimplify",
                     "Remove redundant instructions", false, false)
 char &llvm::InstructionSimplifierID = InstSimplifier::ID;
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index a39f128..fb1d83f 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -30,7 +30,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 
 using namespace llvm;
@@ -116,207 +116,68 @@ static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
   }
 }
 
-//===----------------------------------------------------------------------===//
-// Fortified Library Call Optimizations
-//===----------------------------------------------------------------------===//
-
-static bool isFortifiedCallFoldable(CallInst *CI, unsigned SizeCIOp, unsigned SizeArgOp,
-                       bool isString) {
-  if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp))
-    return true;
-  if (ConstantInt *SizeCI =
-          dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) {
-    if (SizeCI->isAllOnesValue())
-      return true;
-    if (isString) {
-      uint64_t Len = GetStringLength(CI->getArgOperand(SizeArgOp));
-      // If the length is 0 we don't know how long it is and so we can't
-      // remove the check.
-      if (Len == 0)
-        return false;
-      return SizeCI->getZExtValue() >= Len;
-    }
-    if (ConstantInt *Arg = dyn_cast<ConstantInt>(CI->getArgOperand(SizeArgOp)))
-      return SizeCI->getZExtValue() >= Arg->getZExtValue();
-  }
-  return false;
-}
-
-Value *LibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) {
-  Function *Callee = CI->getCalledFunction();
-  FunctionType *FT = Callee->getFunctionType();
-  LLVMContext &Context = CI->getContext();
-
-  // Check if this has the right signature.
-  if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-      !FT->getParamType(0)->isPointerTy() ||
-      !FT->getParamType(1)->isPointerTy() ||
-      FT->getParamType(2) != DL->getIntPtrType(Context) ||
-      FT->getParamType(3) != DL->getIntPtrType(Context))
-    return nullptr;
-
-  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
-    B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
-                   CI->getArgOperand(2), 1);
-    return CI->getArgOperand(0);
-  }
-  return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) {
-  Function *Callee = CI->getCalledFunction();
-  FunctionType *FT = Callee->getFunctionType();
-  LLVMContext &Context = CI->getContext();
-
-  // Check if this has the right signature.
-  if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-      !FT->getParamType(0)->isPointerTy() ||
-      !FT->getParamType(1)->isPointerTy() ||
-      FT->getParamType(2) != DL->getIntPtrType(Context) ||
-      FT->getParamType(3) != DL->getIntPtrType(Context))
-    return nullptr;
-
-  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
-    B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
-                    CI->getArgOperand(2), 1);
-    return CI->getArgOperand(0);
-  }
-  return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) {
-  Function *Callee = CI->getCalledFunction();
-  FunctionType *FT = Callee->getFunctionType();
-  LLVMContext &Context = CI->getContext();
-
-  // Check if this has the right signature.
-  if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-      !FT->getParamType(0)->isPointerTy() ||
-      !FT->getParamType(1)->isIntegerTy() ||
-      FT->getParamType(2) != DL->getIntPtrType(Context) ||
-      FT->getParamType(3) != DL->getIntPtrType(Context))
-    return nullptr;
-
-  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
-    Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
-    B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
-    return CI->getArgOperand(0);
-  }
-  return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrCpyChk(CallInst *CI, IRBuilder<> &B) {
-  Function *Callee = CI->getCalledFunction();
-  StringRef Name = Callee->getName();
-  FunctionType *FT = Callee->getFunctionType();
-  LLVMContext &Context = CI->getContext();
-
-  // Check if this has the right signature.
-  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-      FT->getParamType(0) != FT->getParamType(1) ||
-      FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
-      FT->getParamType(2) != DL->getIntPtrType(Context))
-    return nullptr;
-
-  Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
-  if (Dst == Src) // __strcpy_chk(x,x)  -> x
-    return Src;
-
-  // If a) we don't have any length information, or b) we know this will
-  // fit then just lower to a plain strcpy. Otherwise we'll keep our
-  // strcpy_chk call which may fail at runtime if the size is too long.
-  // TODO: It might be nice to get a maximum length out of the possible
-  // string lengths for varying.
-  if (isFortifiedCallFoldable(CI, 2, 1, true)) {
-    Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6));
-    return Ret;
-  } else {
-    // Maybe we can stil fold __strcpy_chk to __memcpy_chk.
-    uint64_t Len = GetStringLength(Src);
-    if (Len == 0)
-      return nullptr;
-
-    // This optimization require DataLayout.
-    if (!DL)
-      return nullptr;
-
-    Value *Ret = EmitMemCpyChk(
-        Dst, Src, ConstantInt::get(DL->getIntPtrType(Context), Len),
-        CI->getArgOperand(2), B, DL, TLI);
-    return Ret;
-  }
-  return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStpCpyChk(CallInst *CI, IRBuilder<> &B) {
-  Function *Callee = CI->getCalledFunction();
-  StringRef Name = Callee->getName();
-  FunctionType *FT = Callee->getFunctionType();
-  LLVMContext &Context = CI->getContext();
-
-  // Check if this has the right signature.
-  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-      FT->getParamType(0) != FT->getParamType(1) ||
-      FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
-      FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0)))
-    return nullptr;
+/// \brief Returns whether \p F matches the signature expected for the
+/// string/memory copying library function \p Func.
+/// Acceptable functions are st[rp][n]?cpy, memove, memcpy, and memset.
+/// Their fortified (_chk) counterparts are also accepted.
+static bool checkStringCopyLibFuncSignature(Function *F, LibFunc::Func Func,
+                                            const DataLayout *DL) {
+  FunctionType *FT = F->getFunctionType();
+  LLVMContext &Context = F->getContext();
+  Type *PCharTy = Type::getInt8PtrTy(Context);
+  Type *SizeTTy = DL ? DL->getIntPtrType(Context) : nullptr;
+  unsigned NumParams = FT->getNumParams();
 
-  Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
-  if (Dst == Src) { // stpcpy(x,x)  -> x+strlen(x)
-    Value *StrLen = EmitStrLen(Src, B, DL, TLI);
-    return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr;
-  }
-
-  // If a) we don't have any length information, or b) we know this will
-  // fit then just lower to a plain stpcpy. Otherwise we'll keep our
-  // stpcpy_chk call which may fail at runtime if the size is too long.
-  // TODO: It might be nice to get a maximum length out of the possible
-  // string lengths for varying.
-  if (isFortifiedCallFoldable(CI, 2, 1, true)) {
-    Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6));
-    return Ret;
-  } else {
-    // Maybe we can stil fold __stpcpy_chk to __memcpy_chk.
-    uint64_t Len = GetStringLength(Src);
-    if (Len == 0)
-      return nullptr;
-
-    // This optimization require DataLayout.
-    if (!DL)
-      return nullptr;
-
-    Type *PT = FT->getParamType(0);
-    Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len);
-    Value *DstEnd =
-        B.CreateGEP(Dst, ConstantInt::get(DL->getIntPtrType(PT), Len - 1));
-    if (!EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, DL, TLI))
-      return nullptr;
-    return DstEnd;
-  }
-  return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrNCpyChk(CallInst *CI, IRBuilder<> &B) {
-  Function *Callee = CI->getCalledFunction();
-  StringRef Name = Callee->getName();
-  FunctionType *FT = Callee->getFunctionType();
-  LLVMContext &Context = CI->getContext();
-
-  // Check if this has the right signature.
-  if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-      FT->getParamType(0) != FT->getParamType(1) ||
-      FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
-      !FT->getParamType(2)->isIntegerTy() ||
-      FT->getParamType(3) != DL->getIntPtrType(Context))
-    return nullptr;
+  // All string libfuncs return the same type as the first parameter.
+  if (FT->getReturnType() != FT->getParamType(0))
+    return false;
 
-  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
-    Value *Ret =
-        EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
-                    CI->getArgOperand(2), B, DL, TLI, Name.substr(2, 7));
-    return Ret;
-  }
-  return nullptr;
+  switch (Func) {
+  default:
+    llvm_unreachable("Can't check signature for non-string-copy libfunc.");
+  case LibFunc::stpncpy_chk:
+  case LibFunc::strncpy_chk:
+    --NumParams; // fallthrough
+  case LibFunc::stpncpy:
+  case LibFunc::strncpy: {
+    if (NumParams != 3 || FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != PCharTy || !FT->getParamType(2)->isIntegerTy())
+      return false;
+    break;
+  }
+  case LibFunc::strcpy_chk:
+  case LibFunc::stpcpy_chk:
+    --NumParams; // fallthrough
+  case LibFunc::stpcpy:
+  case LibFunc::strcpy: {
+    if (NumParams != 2 || FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != PCharTy)
+      return false;
+    break;
+  }
+  case LibFunc::memmove_chk:
+  case LibFunc::memcpy_chk:
+    --NumParams; // fallthrough
+  case LibFunc::memmove:
+  case LibFunc::memcpy: {
+    if (NumParams != 3 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() || FT->getParamType(2) != SizeTTy)
+      return false;
+    break;
+  }
+  case LibFunc::memset_chk:
+    --NumParams; // fallthrough
+  case LibFunc::memset: {
+    if (NumParams != 3 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isIntegerTy() || FT->getParamType(2) != SizeTTy)
+      return false;
+    break;
+  }
+  }
+  // If this is a fortified libcall, the last parameter is a size_t.
+  if (NumParams == FT->getNumParams() - 1)
+    return FT->getParamType(FT->getNumParams() - 1) == SizeTTy;
+  return true;
 }
 
 //===----------------------------------------------------------------------===//
@@ -600,11 +461,8 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) {
 
 Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
   Function *Callee = CI->getCalledFunction();
-  // Verify the "strcpy" function prototype.
-  FunctionType *FT = Callee->getFunctionType();
-  if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
-      FT->getParamType(0) != FT->getParamType(1) ||
-      FT->getParamType(0) != B.getInt8PtrTy())
+
+  if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strcpy, DL))
     return nullptr;
 
   Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
@@ -631,9 +489,8 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
   Function *Callee = CI->getCalledFunction();
   // Verify the "stpcpy" function prototype.
   FunctionType *FT = Callee->getFunctionType();
-  if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
-      FT->getParamType(0) != FT->getParamType(1) ||
-      FT->getParamType(0) != B.getInt8PtrTy())
+
+  if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy, DL))
     return nullptr;
 
   // These optimizations require DataLayout.
@@ -665,10 +522,8 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
 Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
   Function *Callee = CI->getCalledFunction();
   FunctionType *FT = Callee->getFunctionType();
-  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-      FT->getParamType(0) != FT->getParamType(1) ||
-      FT->getParamType(0) != B.getInt8PtrTy() ||
-      !FT->getParamType(2)->isIntegerTy())
+
+  if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy, DL))
     return nullptr;
 
   Value *Dst = CI->getArgOperand(0);
@@ -976,11 +831,7 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) {
   if (!DL)
     return nullptr;
 
-  FunctionType *FT = Callee->getFunctionType();
-  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-      !FT->getParamType(0)->isPointerTy() ||
-      !FT->getParamType(1)->isPointerTy() ||
-      FT->getParamType(2) != DL->getIntPtrType(CI->getContext()))
+  if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy, DL))
     return nullptr;
 
   // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
@@ -995,11 +846,7 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
   if (!DL)
     return nullptr;
 
-  FunctionType *FT = Callee->getFunctionType();
-  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-      !FT->getParamType(0)->isPointerTy() ||
-      !FT->getParamType(1)->isPointerTy() ||
-      FT->getParamType(2) != DL->getIntPtrType(CI->getContext()))
+  if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove, DL))
     return nullptr;
 
   // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
@@ -1014,11 +861,7 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
   if (!DL)
     return nullptr;
 
-  FunctionType *FT = Callee->getFunctionType();
-  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-      !FT->getParamType(0)->isPointerTy() ||
-      !FT->getParamType(1)->isIntegerTy() ||
-      FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0)))
+  if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset, DL))
     return nullptr;
 
   // memset(p, v, n) -> llvm.memset(p, v, n, 1)
@@ -1031,6 +874,28 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
 // Math Library Optimizations
 //===----------------------------------------------------------------------===//
 
+/// Return a variant of Val with float type.
+/// Currently this works in two cases: If Val is an FPExtension of a float
+/// value to something bigger, simply return the operand.
+/// If Val is a ConstantFP but can be converted to a float ConstantFP without
+/// loss of precision do so.
+static Value *valueHasFloatPrecision(Value *Val) {
+  if (FPExtInst *Cast = dyn_cast<FPExtInst>(Val)) {
+    Value *Op = Cast->getOperand(0);
+    if (Op->getType()->isFloatTy())
+      return Op;
+  }
+  if (ConstantFP *Const = dyn_cast<ConstantFP>(Val)) {
+    APFloat F = Const->getValueAPF();
+    bool losesInfo;
+    (void)F.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven,
+                    &losesInfo);
+    if (!losesInfo)
+      return ConstantFP::get(Const->getContext(), F);
+  }
+  return nullptr;
+}
+
 //===----------------------------------------------------------------------===//
 // Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
 
@@ -1052,12 +917,11 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
   }
 
   // If this is something like 'floor((double)floatval)', convert to floorf.
-  FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
-  if (!Cast || !Cast->getOperand(0)->getType()->isFloatTy())
+  Value *V = valueHasFloatPrecision(CI->getArgOperand(0));
+  if (V == nullptr)
     return nullptr;
 
   // floor((double)floatval) -> (double)floorf(floatval)
-  Value *V = Cast->getOperand(0);
   if (Callee->isIntrinsic()) {
     Module *M = CI->getParent()->getParent()->getParent();
     Intrinsic::ID IID = (Intrinsic::ID) Callee->getIntrinsicID();
@@ -1083,21 +947,19 @@ Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {
     return nullptr;
 
   // If this is something like 'fmin((double)floatval1, (double)floatval2)',
-  // we convert it to fminf.
-  FPExtInst *Cast1 = dyn_cast<FPExtInst>(CI->getArgOperand(0));
-  FPExtInst *Cast2 = dyn_cast<FPExtInst>(CI->getArgOperand(1));
-  if (!Cast1 || !Cast1->getOperand(0)->getType()->isFloatTy() || !Cast2 ||
-      !Cast2->getOperand(0)->getType()->isFloatTy())
+  // or fmin(1.0, (double)floatval), then we convert it to fminf.
+  Value *V1 = valueHasFloatPrecision(CI->getArgOperand(0));
+  if (V1 == nullptr)
+    return nullptr;
+  Value *V2 = valueHasFloatPrecision(CI->getArgOperand(1));
+  if (V2 == nullptr)
     return nullptr;
 
   // fmin((double)floatval1, (double)floatval2)
-  //                      -> (double)fmin(floatval1, floatval2)
-  Value *V = nullptr;
-  Value *V1 = Cast1->getOperand(0);
-  Value *V2 = Cast2->getOperand(0);
+  //                      -> (double)fminf(floatval1, floatval2)
   // TODO: Handle intrinsics in the same way as in optimizeUnaryDoubleFP().
-  V = EmitBinaryFloatFnCall(V1, V2, Callee->getName(), B,
-                            Callee->getAttributes());
+  Value *V = EmitBinaryFloatFnCall(V1, V2, Callee->getName(), B,
+                                   Callee->getAttributes());
   return B.CreateFPExt(V, B.getDoubleTy());
 }
 
@@ -1995,53 +1857,18 @@ bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
   return false;
 }
 
-Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
-  if (CI->isNoBuiltin())
-    return nullptr;
-
+Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
+                                                      IRBuilder<> &Builder) {
   LibFunc::Func Func;
   Function *Callee = CI->getCalledFunction();
   StringRef FuncName = Callee->getName();
-  IRBuilder<> Builder(CI);
-  bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C;
-
-  // Command-line parameter overrides function attribute.
-  if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
-    UnsafeFPShrink = EnableUnsafeFPShrink;
-  else if (Callee->hasFnAttribute("unsafe-fp-math")) {
-    // FIXME: This is the same problem as described in optimizeSqrt().
-    // If calls gain access to IR-level FMF, then use that instead of a
-    // function attribute.
 
-    // Check for unsafe-fp-math = true.
-    Attribute Attr = Callee->getFnAttribute("unsafe-fp-math");
-    if (Attr.getValueAsString() == "true")
-      UnsafeFPShrink = true;
-  }
-
-  // First, check for intrinsics.
-  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
-    if (!isCallingConvC)
-      return nullptr;
-    switch (II->getIntrinsicID()) {
-    case Intrinsic::pow:
-      return optimizePow(CI, Builder);
-    case Intrinsic::exp2:
-      return optimizeExp2(CI, Builder);
-    case Intrinsic::fabs:
-      return optimizeFabs(CI, Builder);
-    case Intrinsic::sqrt:
-      return optimizeSqrt(CI, Builder);
-    default:
-      return nullptr;
-    }
-  }
-
-  // Then check for known library functions.
+  // Check for string/memory library functions.
   if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) {
-    // We never change the calling convention.
-    if (!ignoreCallingConv(Func) && !isCallingConvC)
-      return nullptr;
+    // Make sure we never change the calling convention.
+    assert((ignoreCallingConv(Func) ||
+            CI->getCallingConv() == llvm::CallingConv::C) &&
+      "Optimizing string/memory libcall would change the calling convention");
     switch (Func) {
     case LibFunc::strcat:
       return optimizeStrCat(CI, Builder);
@@ -2087,6 +1914,77 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
       return optimizeMemMove(CI, Builder);
     case LibFunc::memset:
       return optimizeMemSet(CI, Builder);
+    default:
+      break;
+    }
+  }
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
+  if (CI->isNoBuiltin())
+    return nullptr;
+
+  LibFunc::Func Func;
+  Function *Callee = CI->getCalledFunction();
+  StringRef FuncName = Callee->getName();
+  IRBuilder<> Builder(CI);
+  bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C;
+
+  // Command-line parameter overrides function attribute.
+  if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
+    UnsafeFPShrink = EnableUnsafeFPShrink;
+  else if (Callee->hasFnAttribute("unsafe-fp-math")) {
+    // FIXME: This is the same problem as described in optimizeSqrt().
+    // If calls gain access to IR-level FMF, then use that instead of a
+    // function attribute.
+
+    // Check for unsafe-fp-math = true.
+    Attribute Attr = Callee->getFnAttribute("unsafe-fp-math");
+    if (Attr.getValueAsString() == "true")
+      UnsafeFPShrink = true;
+  }
+
+  // First, check for intrinsics.
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+    if (!isCallingConvC)
+      return nullptr;
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::pow:
+      return optimizePow(CI, Builder);
+    case Intrinsic::exp2:
+      return optimizeExp2(CI, Builder);
+    case Intrinsic::fabs:
+      return optimizeFabs(CI, Builder);
+    case Intrinsic::sqrt:
+      return optimizeSqrt(CI, Builder);
+    default:
+      return nullptr;
+    }
+  }
+
+  // Also try to simplify calls to fortified library functions.
+  if (Value *SimplifiedFortifiedCI = FortifiedSimplifier.optimizeCall(CI)) {
+    // Try to further simplify the result.
+    CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI);
+    if (SimplifiedCI && SimplifiedCI->getCalledFunction())
+      if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) {
+        // If we were able to further simplify, remove the now redundant call.
+        SimplifiedCI->replaceAllUsesWith(V);
+        SimplifiedCI->eraseFromParent();
+        return V;
+      }
+    return SimplifiedFortifiedCI;
+  }
+
+  // Then check for known library functions.
+  if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) {
+    // We never change the calling convention.
+    if (!ignoreCallingConv(Func) && !isCallingConvC)
+      return nullptr;
+    if (Value *V = optimizeStringMemoryLibCall(CI, Builder))
+      return V;
+    switch (Func) {
     case LibFunc::cosf:
     case LibFunc::cos:
     case LibFunc::cosl:
@@ -2177,40 +2075,32 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
       if (UnsafeFPShrink && hasFloatVersion(FuncName))
         return optimizeUnaryDoubleFP(CI, Builder, true);
       return nullptr;
+    case LibFunc::copysign:
     case LibFunc::fmin:
     case LibFunc::fmax:
       if (hasFloatVersion(FuncName))
         return optimizeBinaryDoubleFP(CI, Builder);
       return nullptr;
-    case LibFunc::memcpy_chk:
-      return optimizeMemCpyChk(CI, Builder);
-    case LibFunc::memmove_chk:
-      return optimizeMemMoveChk(CI, Builder);
-    case LibFunc::memset_chk:
-      return optimizeMemSetChk(CI, Builder);
-    case LibFunc::strcpy_chk:
-      return optimizeStrCpyChk(CI, Builder);
-    case LibFunc::stpcpy_chk:
-      return optimizeStpCpyChk(CI, Builder);
-    case LibFunc::stpncpy_chk:
-    case LibFunc::strncpy_chk:
-      return optimizeStrNCpyChk(CI, Builder);
     default:
       return nullptr;
     }
   }
-
   return nullptr;
 }
 
-LibCallSimplifier::LibCallSimplifier(const DataLayout *DL,
-                                     const TargetLibraryInfo *TLI) :
-                                     DL(DL),
-                                     TLI(TLI),
-                                     UnsafeFPShrink(false) {
+LibCallSimplifier::LibCallSimplifier(
+    const DataLayout *DL, const TargetLibraryInfo *TLI,
+    function_ref<void(Instruction *, Value *)> Replacer)
+    : FortifiedSimplifier(DL, TLI), DL(DL), TLI(TLI), UnsafeFPShrink(false),
+      Replacer(Replacer) {}
+
+void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
+  // Indirect through the replacer used in this instance.
+  Replacer(I, With);
 }
 
-void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const {
+/*static*/ void LibCallSimplifier::replaceAllUsesWithDefault(Instruction *I,
+                                                             Value *With) {
   I->replaceAllUsesWith(With);
   I->eraseFromParent();
 }
@@ -2262,3 +2152,184 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const {
 //   * trunc(cnst) -> cnst'
 //
 //
+
+//===----------------------------------------------------------------------===//
+// Fortified Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
+                                                         unsigned ObjSizeOp,
+                                                         unsigned SizeOp,
+                                                         bool isString) {
+  if (CI->getArgOperand(ObjSizeOp) == CI->getArgOperand(SizeOp))
+    return true;
+  if (ConstantInt *ObjSizeCI =
+          dyn_cast<ConstantInt>(CI->getArgOperand(ObjSizeOp))) {
+    if (ObjSizeCI->isAllOnesValue())
+      return true;
+    // If the object size wasn't -1 (unknown), bail out if we were asked to.
+    if (OnlyLowerUnknownSize)
+      return false;
+    if (isString) {
+      uint64_t Len = GetStringLength(CI->getArgOperand(SizeOp));
+      // If the length is 0 we don't know how long it is and so we can't
+      // remove the check.
+      if (Len == 0)
+        return false;
+      return ObjSizeCI->getZExtValue() >= Len;
+    }
+    if (ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getArgOperand(SizeOp)))
+      return ObjSizeCI->getZExtValue() >= SizeCI->getZExtValue();
+  }
+  return false;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+
+  if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk, DL))
+    return nullptr;
+
+  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+    B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                   CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+  return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+
+  if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk, DL))
+    return nullptr;
+
+  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+    B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+                    CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+  return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+
+  if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk, DL))
+    return nullptr;
+
+  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+    Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+    B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+  return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
+                                                      IRBuilder<> &B,
+                                                      LibFunc::Func Func) {
+  Function *Callee = CI->getCalledFunction();
+  StringRef Name = Callee->getName();
+
+  if (!checkStringCopyLibFuncSignature(Callee, Func, DL))
+    return nullptr;
+
+  Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1),
+        *ObjSize = CI->getArgOperand(2);
+
+  // __stpcpy_chk(x,x,...)  -> x+strlen(x)
+  if (Func == LibFunc::stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
+    Value *StrLen = EmitStrLen(Src, B, DL, TLI);
+    return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr;
+  }
+
+  // If a) we don't have any length information, or b) we know this will
+  // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our
+  // st[rp]cpy_chk call which may fail at runtime if the size is too long.
+  // TODO: It might be nice to get a maximum length out of the possible
+  // string lengths for varying.
+  if (isFortifiedCallFoldable(CI, 2, 1, true)) {
+    Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6));
+    return Ret;
+  } else if (!OnlyLowerUnknownSize) {
+    // Maybe we can stil fold __st[rp]cpy_chk to __memcpy_chk.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0)
+      return nullptr;
+
+    // This optimization requires DataLayout.
+    if (!DL)
+      return nullptr;
+
+    Type *SizeTTy = DL->getIntPtrType(CI->getContext());
+    Value *LenV = ConstantInt::get(SizeTTy, Len);
+    Value *Ret = EmitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
+    // If the function was an __stpcpy_chk, and we were able to fold it into
+    // a __memcpy_chk, we still need to return the correct end pointer.
+    if (Ret && Func == LibFunc::stpcpy_chk)
+      return B.CreateGEP(Dst, ConstantInt::get(SizeTTy, Len - 1));
+    return Ret;
+  }
+  return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
+                                                       IRBuilder<> &B,
+                                                       LibFunc::Func Func) {
+  Function *Callee = CI->getCalledFunction();
+  StringRef Name = Callee->getName();
+
+  if (!checkStringCopyLibFuncSignature(Callee, Func, DL))
+    return nullptr;
+  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+    Value *Ret =
+        EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                    CI->getArgOperand(2), B, DL, TLI, Name.substr(2, 7));
+    return Ret;
+  }
+  return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
+  if (CI->isNoBuiltin())
+    return nullptr;
+
+  LibFunc::Func Func;
+  Function *Callee = CI->getCalledFunction();
+  StringRef FuncName = Callee->getName();
+  IRBuilder<> Builder(CI);
+  bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C;
+
+  // First, check that this is a known library functions.
+  if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func))
+    return nullptr;
+
+  // We never change the calling convention.
+  if (!ignoreCallingConv(Func) && !isCallingConvC)
+    return nullptr;
+
+  switch (Func) {
+  case LibFunc::memcpy_chk:
+    return optimizeMemCpyChk(CI, Builder);
+  case LibFunc::memmove_chk:
+    return optimizeMemMoveChk(CI, Builder);
+  case LibFunc::memset_chk:
+    return optimizeMemSetChk(CI, Builder);
+  case LibFunc::stpcpy_chk:
+  case LibFunc::strcpy_chk:
+    return optimizeStrpCpyChk(CI, Builder, Func);
+  case LibFunc::stpncpy_chk:
+  case LibFunc::strncpy_chk:
+    return optimizeStrpNCpyChk(CI, Builder, Func);
+  default:
+    break;
+  }
+  return nullptr;
+}
+
+FortifiedLibCallSimplifier::
+FortifiedLibCallSimplifier(const DataLayout *DL, const TargetLibraryInfo *TLI,
+                           bool OnlyLowerUnknownSize)
+  : DL(DL), TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) {
+}
diff --git a/lib/Transforms/Utils/SymbolRewriter.cpp b/lib/Transforms/Utils/SymbolRewriter.cpp
index aacc945..b343cc4 100644
--- a/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -60,7 +60,7 @@
 #define DEBUG_TYPE "symbol-rewriter"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Pass.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -79,6 +79,19 @@ static cl::list<std::string> RewriteMapFiles("rewrite-map-file",
 
 namespace llvm {
 namespace SymbolRewriter {
+void rewriteComdat(Module &M, GlobalObject *GO, const std::string &Source,
+                   const std::string &Target) {
+  if (Comdat *CD = GO->getComdat()) {
+    auto &Comdats = M.getComdatSymbolTable();
+
+    Comdat *C = M.getOrInsertComdat(Target);
+    C->setSelectionKind(CD->getSelectionKind());
+    GO->setComdat(C);
+
+    Comdats.erase(Comdats.find(Source));
+  }
+}
+
 template <RewriteDescriptor::Type DT, typename ValueType,
           ValueType *(llvm::Module::*Get)(StringRef) const>
 class ExplicitRewriteDescriptor : public RewriteDescriptor {
@@ -102,10 +115,14 @@ template <RewriteDescriptor::Type DT, typename ValueType,
 bool ExplicitRewriteDescriptor<DT, ValueType, Get>::performOnModule(Module &M) {
   bool Changed = false;
   if (ValueType *S = (M.*Get)(Source)) {
+    if (GlobalObject *GO = dyn_cast<GlobalObject>(S))
+      rewriteComdat(M, GO, Source, Target);
+
     if (Value *T = (M.*Get)(Target))
       S->setValueName(T->getValueName());
     else
       S->setName(Target);
+
     Changed = true;
   }
   return Changed;
@@ -113,7 +130,8 @@ bool ExplicitRewriteDescriptor<DT, ValueType, Get>::performOnModule(Module &M) {
 
 template <RewriteDescriptor::Type DT, typename ValueType,
           ValueType *(llvm::Module::*Get)(StringRef) const,
-          iterator_range<typename iplist<ValueType>::iterator> (llvm::Module::*Iterator)()>
+          iterator_range<typename iplist<ValueType>::iterator>
+          (llvm::Module::*Iterator)()>
 class PatternRewriteDescriptor : public RewriteDescriptor {
 public:
   const std::string Pattern;
@@ -131,7 +149,8 @@ public:
 
 template <RewriteDescriptor::Type DT, typename ValueType,
           ValueType *(llvm::Module::*Get)(StringRef) const,
-          iterator_range<typename iplist<ValueType>::iterator> (llvm::Module::*Iterator)()>
+          iterator_range<typename iplist<ValueType>::iterator>
+          (llvm::Module::*Iterator)()>
 bool PatternRewriteDescriptor<DT, ValueType, Get, Iterator>::
 performOnModule(Module &M) {
   bool Changed = false;
@@ -143,6 +162,12 @@ performOnModule(Module &M) {
       report_fatal_error("unable to transforn " + C.getName() + " in " +
                          M.getModuleIdentifier() + ": " + Error);
 
+    if (C.getName() == Name)
+      continue;
+
+    if (GlobalObject *GO = dyn_cast<GlobalObject>(&C))
+      rewriteComdat(M, GO, C.getName(), Name);
+
     if (Value *V = (M.*Get)(Name))
       C.setValueName(V->getValueName());
     else
@@ -492,7 +517,7 @@ RewriteSymbols::RewriteSymbols() : ModulePass(ID) {
 
 RewriteSymbols::RewriteSymbols(SymbolRewriter::RewriteDescriptorList &DL)
     : ModulePass(ID) {
-  std::swap(Descriptors, DL);
+  Descriptors.splice(Descriptors.begin(), DL);
 }
 
 bool RewriteSymbols::runOnModule(Module &M) {
diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index 0c2fc0a..7e00a80 100644
--- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -35,7 +35,6 @@ void UnifyFunctionExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
   // We preserve the non-critical-edgeness property
   AU.addPreservedID(BreakCriticalEdgesID);
   // This is a cluster of orthogonal Transforms
-  AU.addPreserved("mem2reg");
   AU.addPreservedID(LowerSwitchID);
 }
 
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index a2f69d1..49c0902 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -40,7 +40,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
 
   // Global values do not need to be seeded into the VM if they
   // are using the identity mapping.
-  if (isa<GlobalValue>(V) || isa<MDString>(V))
+  if (isa<GlobalValue>(V))
     return VM[V] = const_cast<Value*>(V);
   
   if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
@@ -56,57 +56,24 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
     
     return VM[V] = const_cast<Value*>(V);
   }
-  
 
-  if (const MDNode *MD = dyn_cast<MDNode>(V)) {
+  if (const auto *MDV = dyn_cast<MetadataAsValue>(V)) {
+    const Metadata *MD = MDV->getMetadata();
     // If this is a module-level metadata and we know that nothing at the module
     // level is changing, then use an identity mapping.
-    if (!MD->isFunctionLocal() && (Flags & RF_NoModuleLevelChanges))
-      return VM[V] = const_cast<Value*>(V);
-    
-    // Create a dummy node in case we have a metadata cycle.
-    MDNode *Dummy = MDNode::getTemporary(V->getContext(), None);
-    VM[V] = Dummy;
-    
-    // Check all operands to see if any need to be remapped.
-    for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) {
-      Value *OP = MD->getOperand(i);
-      if (!OP) continue;
-      Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper, Materializer);
-      // Use identity map if Mapped_Op is null and we can ignore missing
-      // entries.
-      if (Mapped_OP == OP ||
-          (Mapped_OP == nullptr && (Flags & RF_IgnoreMissingEntries)))
-        continue;
-
-      // Ok, at least one operand needs remapping.  
-      SmallVector<Value*, 4> Elts;
-      Elts.reserve(MD->getNumOperands());
-      for (i = 0; i != e; ++i) {
-        Value *Op = MD->getOperand(i);
-        if (!Op)
-          Elts.push_back(nullptr);
-        else {
-          Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper, Materializer);
-          // Use identity map if Mapped_Op is null and we can ignore missing
-          // entries.
-          if (Mapped_Op == nullptr && (Flags & RF_IgnoreMissingEntries))
-            Mapped_Op = Op;
-          Elts.push_back(Mapped_Op);
-        }
-      }
-      MDNode *NewMD = MDNode::get(V->getContext(), Elts);
-      Dummy->replaceAllUsesWith(NewMD);
-      VM[V] = NewMD;
-      MDNode::deleteTemporary(Dummy);
-      return NewMD;
-    }
+    if (!isa<LocalAsMetadata>(MD) && (Flags & RF_NoModuleLevelChanges))
+      return VM[V] = const_cast<Value *>(V);
 
-    VM[V] = const_cast<Value*>(V);
-    MDNode::deleteTemporary(Dummy);
+    auto *MappedMD = MapMetadata(MD, VM, Flags, TypeMapper, Materializer);
+    if (MD == MappedMD || (!MappedMD && (Flags & RF_IgnoreMissingEntries)))
+      return VM[V] = const_cast<Value *>(V);
 
-    // No operands needed remapping.  Use an identity mapping.
-    return const_cast<Value*>(V);
+    // FIXME: This assert crashes during bootstrap, but I think it should be
+    // correct.  For now, just match behaviour from before the metadata/value
+    // split.
+    //
+    //    assert(MappedMD && "Referenced metadata value not in value map");
+    return VM[V] = MetadataAsValue::get(V->getContext(), MappedMD);
   }
 
   // Okay, this either must be a constant (which may or may not be mappable) or
@@ -177,6 +144,198 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
   return VM[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
 }
 
+static Metadata *mapToMetadata(ValueToValueMapTy &VM, const Metadata *Key,
+                     Metadata *Val) {
+  VM.MD()[Key].reset(Val);
+  return Val;
+}
+
+static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD) {
+  return mapToMetadata(VM, MD, const_cast<Metadata *>(MD));
+}
+
+static Metadata *MapMetadataImpl(const Metadata *MD,
+                                 SmallVectorImpl<MDNode *> &Cycles,
+                                 ValueToValueMapTy &VM, RemapFlags Flags,
+                                 ValueMapTypeRemapper *TypeMapper,
+                                 ValueMaterializer *Materializer);
+
+static Metadata *mapMetadataOp(Metadata *Op, SmallVectorImpl<MDNode *> &Cycles,
+                               ValueToValueMapTy &VM, RemapFlags Flags,
+                               ValueMapTypeRemapper *TypeMapper,
+                               ValueMaterializer *Materializer) {
+  if (!Op)
+    return nullptr;
+  if (Metadata *MappedOp =
+          MapMetadataImpl(Op, Cycles, VM, Flags, TypeMapper, Materializer))
+    return MappedOp;
+  // Use identity map if MappedOp is null and we can ignore missing entries.
+  if (Flags & RF_IgnoreMissingEntries)
+    return Op;
+
+  // FIXME: This assert crashes during bootstrap, but I think it should be
+  // correct.  For now, just match behaviour from before the metadata/value
+  // split.
+  //
+  //    llvm_unreachable("Referenced metadata not in value map!");
+  return nullptr;
+}
+
+/// \brief Remap nodes.
+///
+/// Insert \c NewNode in the value map, and then remap \c OldNode's operands.
+/// Assumes that \c NewNode is already a clone of \c OldNode.
+///
+/// \pre \c NewNode is a clone of \c OldNode.
+static bool remap(const MDNode *OldNode, MDNode *NewNode,
+                  SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM,
+                  RemapFlags Flags, ValueMapTypeRemapper *TypeMapper,
+                  ValueMaterializer *Materializer) {
+  assert(OldNode->getNumOperands() == NewNode->getNumOperands() &&
+         "Expected nodes to match");
+  assert(OldNode->isResolved() && "Expected resolved node");
+  assert(!NewNode->isUniqued() && "Expected non-uniqued node");
+
+  // Map the node upfront so it's available for cyclic references.
+  mapToMetadata(VM, OldNode, NewNode);
+  bool AnyChanged = false;
+  for (unsigned I = 0, E = OldNode->getNumOperands(); I != E; ++I) {
+    Metadata *Old = OldNode->getOperand(I);
+    assert(NewNode->getOperand(I) == Old &&
+           "Expected old operands to already be in place");
+
+    Metadata *New = mapMetadataOp(OldNode->getOperand(I), Cycles, VM, Flags,
+                                  TypeMapper, Materializer);
+    if (Old != New) {
+      AnyChanged = true;
+      NewNode->replaceOperandWith(I, New);
+    }
+  }
+
+  return AnyChanged;
+}
+
+/// \brief Map a distinct MDNode.
+///
+/// Distinct nodes are not uniqued, so they must always recreated.
+static Metadata *mapDistinctNode(const MDNode *Node,
+                                 SmallVectorImpl<MDNode *> &Cycles,
+                                 ValueToValueMapTy &VM, RemapFlags Flags,
+                                 ValueMapTypeRemapper *TypeMapper,
+                                 ValueMaterializer *Materializer) {
+  assert(Node->isDistinct() && "Expected distinct node");
+
+  MDNode *NewMD = MDNode::replaceWithDistinct(Node->clone());
+  remap(Node, NewMD, Cycles, VM, Flags, TypeMapper, Materializer);
+
+  // Track any cycles beneath this node.
+  for (Metadata *Op : NewMD->operands())
+    if (auto *Node = dyn_cast_or_null<MDNode>(Op))
+      if (!Node->isResolved())
+        Cycles.push_back(Node);
+
+  return NewMD;
+}
+
+/// \brief Map a uniqued MDNode.
+///
+/// Uniqued nodes may not need to be recreated (they may map to themselves).
+static Metadata *mapUniquedNode(const MDNode *Node,
+                                SmallVectorImpl<MDNode *> &Cycles,
+                                ValueToValueMapTy &VM, RemapFlags Flags,
+                                ValueMapTypeRemapper *TypeMapper,
+                                ValueMaterializer *Materializer) {
+  assert(Node->isUniqued() && "Expected uniqued node");
+
+  // Create a temporary node upfront in case we have a metadata cycle.
+  auto ClonedMD = Node->clone();
+  if (!remap(Node, ClonedMD.get(), Cycles, VM, Flags, TypeMapper, Materializer))
+    // No operands changed, so use the identity mapping.
+    return mapToSelf(VM, Node);
+
+  // At least one operand has changed, so uniquify the cloned node.
+  return mapToMetadata(VM, Node,
+                       MDNode::replaceWithUniqued(std::move(ClonedMD)));
+}
+
+static Metadata *MapMetadataImpl(const Metadata *MD,
+                                 SmallVectorImpl<MDNode *> &Cycles,
+                                 ValueToValueMapTy &VM, RemapFlags Flags,
+                                 ValueMapTypeRemapper *TypeMapper,
+                                 ValueMaterializer *Materializer) {
+  // If the value already exists in the map, use it.
+  if (Metadata *NewMD = VM.MD().lookup(MD).get())
+    return NewMD;
+
+  if (isa<MDString>(MD))
+    return mapToSelf(VM, MD);
+
+  if (isa<ConstantAsMetadata>(MD))
+    if ((Flags & RF_NoModuleLevelChanges))
+      return mapToSelf(VM, MD);
+
+  if (const auto *VMD = dyn_cast<ValueAsMetadata>(MD)) {
+    Value *MappedV =
+        MapValue(VMD->getValue(), VM, Flags, TypeMapper, Materializer);
+    if (VMD->getValue() == MappedV ||
+        (!MappedV && (Flags & RF_IgnoreMissingEntries)))
+      return mapToSelf(VM, MD);
+
+    // FIXME: This assert crashes during bootstrap, but I think it should be
+    // correct.  For now, just match behaviour from before the metadata/value
+    // split.
+    //
+    //    assert(MappedV && "Referenced metadata not in value map!");
+    if (MappedV)
+      return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV));
+    return nullptr;
+  }
+
+  const MDNode *Node = cast<MDNode>(MD);
+  assert(Node->isResolved() && "Unexpected unresolved node");
+
+  // If this is a module-level metadata and we know that nothing at the
+  // module level is changing, then use an identity mapping.
+  if (Flags & RF_NoModuleLevelChanges)
+    return mapToSelf(VM, MD);
+
+  if (Node->isDistinct())
+    return mapDistinctNode(Node, Cycles, VM, Flags, TypeMapper, Materializer);
+
+  return mapUniquedNode(Node, Cycles, VM, Flags, TypeMapper, Materializer);
+}
+
+Metadata *llvm::MapMetadata(const Metadata *MD, ValueToValueMapTy &VM,
+                            RemapFlags Flags, ValueMapTypeRemapper *TypeMapper,
+                            ValueMaterializer *Materializer) {
+  SmallVector<MDNode *, 8> Cycles;
+  Metadata *NewMD =
+      MapMetadataImpl(MD, Cycles, VM, Flags, TypeMapper, Materializer);
+
+  // Resolve cycles underneath MD.
+  if (NewMD && NewMD != MD) {
+    if (auto *N = dyn_cast<MDNode>(NewMD))
+      if (!N->isResolved())
+        N->resolveCycles();
+
+    for (MDNode *N : Cycles)
+      if (!N->isResolved())
+        N->resolveCycles();
+  } else {
+    // Shouldn't get unresolved cycles if nothing was remapped.
+    assert(Cycles.empty() && "Expected no unresolved cycles");
+  }
+
+  return NewMD;
+}
+
+MDNode *llvm::MapMetadata(const MDNode *MD, ValueToValueMapTy &VM,
+                          RemapFlags Flags, ValueMapTypeRemapper *TypeMapper,
+                          ValueMaterializer *Materializer) {
+  return cast<MDNode>(MapMetadata(static_cast<const Metadata *>(MD), VM, Flags,
+                                  TypeMapper, Materializer));
+}
+
 /// RemapInstruction - Convert the instruction operands from referencing the
 /// current values into those specified by VMap.
 ///
@@ -215,7 +374,7 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
            ME = MDs.end();
        MI != ME; ++MI) {
     MDNode *Old = MI->second;
-    MDNode *New = MapValue(Old, VMap, Flags, TypeMapper, Materializer);
+    MDNode *New = MapMetadata(Old, VMap, Flags, TypeMapper, Materializer);
     if (New != Old)
       I->setMetadata(MI->first, New);
   }
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index b4991bc..525c050 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -201,14 +201,16 @@ namespace {
       initializeBBVectorizePass(*PassRegistry::getPassRegistry());
     }
 
-    BBVectorize(Pass *P, const VectorizeConfig &C)
+    BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
       : BasicBlockPass(ID), Config(C) {
       AA = &P->getAnalysis<AliasAnalysis>();
       DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
       SE = &P->getAnalysis<ScalarEvolution>();
       DataLayoutPass *DLP = P->getAnalysisIfAvailable<DataLayoutPass>();
       DL = DLP ? &DLP->getDataLayout() : nullptr;
-      TTI = IgnoreTargetInfo ? nullptr : &P->getAnalysis<TargetTransformInfo>();
+      TTI = IgnoreTargetInfo
+                ? nullptr
+                : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
     }
 
     typedef std::pair<Value *, Value *> ValuePair;
@@ -442,7 +444,10 @@ namespace {
       SE = &getAnalysis<ScalarEvolution>();
       DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
       DL = DLP ? &DLP->getDataLayout() : nullptr;
-      TTI = IgnoreTargetInfo ? nullptr : &getAnalysis<TargetTransformInfo>();
+      TTI = IgnoreTargetInfo
+                ? nullptr
+                : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+                      *BB.getParent());
 
       return vectorizeBB(BB);
     }
@@ -452,7 +457,7 @@ namespace {
       AU.addRequired<AliasAnalysis>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<ScalarEvolution>();
-      AU.addRequired<TargetTransformInfo>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
       AU.addPreserved<AliasAnalysis>();
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addPreserved<ScalarEvolution>();
@@ -1277,7 +1282,7 @@ namespace {
             CostSavings, FixedOrder)) continue;
 
         // J is a candidate for merging with I.
-        if (!PairableInsts.size() ||
+        if (PairableInsts.empty() ||
              PairableInsts[PairableInsts.size()-1] != I) {
           PairableInsts.push_back(I);
         }
@@ -2609,7 +2614,6 @@ namespace {
                                                      true, o, 1));
           NewI1->insertBefore(IBeforeJ ? J : I);
           I1 = NewI1;
-          I1T = I2T;
           I1Elem = I2Elem;
         } else if (I1Elem > I2Elem) {
           std::vector<Constant *> Mask(I1Elem);
@@ -2626,8 +2630,6 @@ namespace {
                                                      true, o, 1));
           NewI2->insertBefore(IBeforeJ ? J : I);
           I2 = NewI2;
-          I2T = I1T;
-          I2Elem = I1Elem;
         }
 
         // Now that both I1 and I2 are the same length we can shuffle them
@@ -3195,7 +3197,7 @@ char BBVectorize::ID = 0;
 static const char bb_vectorize_name[] = "Basic-Block Vectorization";
 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
@@ -3206,7 +3208,7 @@ BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
 
 bool
 llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
-  BBVectorize BBVectorizer(P, C);
+  BBVectorize BBVectorizer(P, *BB.getParent(), C);
   return BBVectorizer.vectorizeBB(BB);
 }
 
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 07967d8..905c069 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -3,6 +3,9 @@ add_llvm_library(LLVMVectorize
   Vectorize.cpp
   LoopVectorize.cpp
   SLPVectorizer.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
   )
 
 add_dependencies(LLVMVectorize intrinsics_gen)
diff --git a/lib/Transforms/Vectorize/LLVMBuild.txt b/lib/Transforms/Vectorize/LLVMBuild.txt
index b57ce6c..be00294 100644
--- a/lib/Transforms/Vectorize/LLVMBuild.txt
+++ b/lib/Transforms/Vectorize/LLVMBuild.txt
@@ -20,4 +20,4 @@ type = Library
 name = Vectorize
 parent = Transforms
 library_name = Vectorize
-required_libraries = Analysis Core Support Target TransformUtils
+required_libraries = Analysis Core Support TransformUtils
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 35b2ecf..6142306 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -55,9 +55,10 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -105,15 +106,6 @@ using namespace llvm::PatternMatch;
 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 
-static cl::opt<unsigned>
-VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
-                    cl::desc("Sets the SIMD width. Zero is autoselect."));
-
-static cl::opt<unsigned>
-VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden,
-                    cl::desc("Sets the vectorization interleave count. "
-                             "Zero is autoselect."));
-
 static cl::opt<bool>
 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
@@ -144,13 +136,6 @@ static cl::opt<bool> EnableMemAccessVersioning(
 /// We don't unroll loops with a known constant trip count below this number.
 static const unsigned TinyTripCountUnrollThreshold = 128;
 
-/// When performing memory disambiguation checks at runtime do not make more
-/// than this number of comparisons.
-static const unsigned RuntimeMemoryCheckThreshold = 8;
-
-/// Maximum simd width.
-static const unsigned MaxVectorWidth = 64;
-
 static cl::opt<unsigned> ForceTargetNumScalarRegs(
     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's number of scalar registers."));
@@ -218,27 +203,19 @@ class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
 class LoopVectorizeHints;
 
-/// Optimization analysis message produced during vectorization. Messages inform
-/// the user why vectorization did not occur.
-class Report {
-  std::string Message;
-  raw_string_ostream Out;
-  Instruction *Instr;
-
+/// \brief This modifies LoopAccessReport to initialize message with
+/// loop-vectorizer-specific part.
+class VectorizationReport : public LoopAccessReport {
 public:
-  Report(Instruction *I = nullptr) : Out(Message), Instr(I) {
-    Out << "loop not vectorized: ";
-  }
-
-  template <typename A> Report &operator<<(const A &Value) {
-    Out << Value;
-    return *this;
-  }
-
-  Instruction *getInstr() { return Instr; }
-
-  std::string &str() { return Out.str(); }
-  operator Twine() { return Out.str(); }
+  VectorizationReport(Instruction *I = nullptr)
+      : LoopAccessReport("loop not vectorized: ", I) {}
+
+  /// \brief This allows promotion of the loop-access analysis report into the
+  /// loop-vectorizer report.  It modifies the message to add the
+  /// loop-vectorizer-specific part of the message.
+  explicit VectorizationReport(const LoopAccessReport &R)
+      : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
+                         R.getInstr()) {}
 };
 
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
@@ -293,13 +270,6 @@ protected:
   typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
                    VectorParts> EdgeMaskCache;
 
-  /// \brief Add code that checks at runtime if the accessed arrays overlap.
-  ///
-  /// Returns a pair of instructions where the first element is the first
-  /// instruction generated in possibly a sequence of instructions and the
-  /// second value is the final comparator value or NULL if no check is needed.
-  std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc);
-
   /// \brief Add checks for strides that where assumed to be 1.
   ///
   /// Returns the last check instruction and the first check instruction in the
@@ -355,10 +325,9 @@ protected:
   /// element.
   virtual Value *getBroadcastInstrs(Value *V);
 
-  /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
-  /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
-  /// The sequence starts at StartIndex.
-  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
+  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
+  /// to each vector element of Val. The sequence starts at StartIndex.
+  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
 
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
@@ -479,7 +448,7 @@ private:
                             bool IfPredicateStore = false) override;
   void vectorizeMemoryInstruction(Instruction *Instr) override;
   Value *getBroadcastInstrs(Value *V) override;
-  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override;
+  Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
   Value *reverseVector(Value *Vec) override;
 };
 
@@ -574,17 +543,14 @@ static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *F
 /// induction variable and the different reduction variables.
 class LoopVectorizationLegality {
 public:
-  unsigned NumLoads;
-  unsigned NumStores;
-  unsigned NumPredStores;
-
   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
                             DominatorTree *DT, TargetLibraryInfo *TLI,
-                            AliasAnalysis *AA, Function *F)
-      : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
-        DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr),
-        WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
-  }
+                            AliasAnalysis *AA, Function *F,
+                            const TargetTransformInfo *TTI,
+                            LoopAccessAnalysis *LAA)
+      : NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
+        TLI(TLI), TheFunction(F), TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
+        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -602,11 +568,9 @@ public:
 
   /// This enum represents the kinds of inductions that we support.
   enum InductionKind {
-    IK_NoInduction,         ///< Not an induction variable.
-    IK_IntInduction,        ///< Integer induction variable. Step = 1.
-    IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
-    IK_PtrInduction,        ///< Pointer induction var. Step = sizeof(elem).
-    IK_ReversePtrInduction  ///< Reverse ptr indvar. Step = - sizeof(elem).
+    IK_NoInduction,  ///< Not an induction variable.
+    IK_IntInduction, ///< Integer induction variable. Step = C.
+    IK_PtrInduction  ///< Pointer induction var. Step = C / sizeof(elem).
   };
 
   // This enum represents the kind of minmax reduction.
@@ -657,51 +621,69 @@ public:
     MinMaxReductionKind MinMaxKind;
   };
 
-  /// This struct holds information about the memory runtime legality
-  /// check that a group of pointers do not overlap.
-  struct RuntimePointerCheck {
-    RuntimePointerCheck() : Need(false) {}
-
-    /// Reset the state of the pointer runtime information.
-    void reset() {
-      Need = false;
-      Pointers.clear();
-      Starts.clear();
-      Ends.clear();
-      IsWritePtr.clear();
-      DependencySetId.clear();
-      AliasSetId.clear();
+  /// A struct for saving information about induction variables.
+  struct InductionInfo {
+    InductionInfo(Value *Start, InductionKind K, ConstantInt *Step)
+        : StartValue(Start), IK(K), StepValue(Step) {
+      assert(IK != IK_NoInduction && "Not an induction");
+      assert(StartValue && "StartValue is null");
+      assert(StepValue && !StepValue->isZero() && "StepValue is zero");
+      assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
+             "StartValue is not a pointer for pointer induction");
+      assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
+             "StartValue is not an integer for integer induction");
+      assert(StepValue->getType()->isIntegerTy() &&
+             "StepValue is not an integer");
+    }
+    InductionInfo()
+        : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {}
+
+    /// Get the consecutive direction. Returns:
+    ///   0 - unknown or non-consecutive.
+    ///   1 - consecutive and increasing.
+    ///  -1 - consecutive and decreasing.
+    int getConsecutiveDirection() const {
+      if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))
+        return StepValue->getSExtValue();
+      return 0;
     }
 
-    /// Insert a pointer and calculate the start and end SCEVs.
-    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
-                unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides);
-
-    /// This flag indicates if we need to add the runtime check.
-    bool Need;
-    /// Holds the pointers that we need to check.
-    SmallVector<TrackingVH<Value>, 2> Pointers;
-    /// Holds the pointer value at the beginning of the loop.
-    SmallVector<const SCEV*, 2> Starts;
-    /// Holds the pointer value at the end of the loop.
-    SmallVector<const SCEV*, 2> Ends;
-    /// Holds the information if this pointer is used for writing to memory.
-    SmallVector<bool, 2> IsWritePtr;
-    /// Holds the id of the set of pointers that could be dependent because of a
-    /// shared underlying object.
-    SmallVector<unsigned, 2> DependencySetId;
-    /// Holds the id of the disjoint alias set to which this pointer belongs.
-    SmallVector<unsigned, 2> AliasSetId;
-  };
+    /// Compute the transformed value of Index at offset StartValue using step
+    /// StepValue.
+    /// For integer induction, returns StartValue + Index * StepValue.
+    /// For pointer induction, returns StartValue[Index * StepValue].
+    /// FIXME: The newly created binary instructions should contain nsw/nuw
+    /// flags, which can be found from the original scalar operations.
+    Value *transform(IRBuilder<> &B, Value *Index) const {
+      switch (IK) {
+      case IK_IntInduction:
+        assert(Index->getType() == StartValue->getType() &&
+               "Index type does not match StartValue type");
+        if (StepValue->isMinusOne())
+          return B.CreateSub(StartValue, Index);
+        if (!StepValue->isOne())
+          Index = B.CreateMul(Index, StepValue);
+        return B.CreateAdd(StartValue, Index);
+
+      case IK_PtrInduction:
+        if (StepValue->isMinusOne())
+          Index = B.CreateNeg(Index);
+        else if (!StepValue->isOne())
+          Index = B.CreateMul(Index, StepValue);
+        return B.CreateGEP(StartValue, Index);
+
+      case IK_NoInduction:
+        return nullptr;
+      }
+      llvm_unreachable("invalid enum");
+    }
 
-  /// A struct for saving information about induction variables.
-  struct InductionInfo {
-    InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
-    InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {}
     /// Start value.
     TrackingVH<Value> StartValue;
     /// Induction kind.
     InductionKind IK;
+    /// Step value.
+    ConstantInt *StepValue;
   };
 
   /// ReductionList contains the reduction descriptors for all
@@ -753,13 +735,19 @@ public:
   bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
 
   /// Returns the information that we collected about runtime memory check.
-  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
+  const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const {
+    return LAI->getRuntimePointerCheck();
+  }
+
+  const LoopAccessInfo *getLAI() const {
+    return LAI;
+  }
 
   /// This function returns the identity element (or neutral element) for
   /// the operation K.
   static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
 
-  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
 
   bool hasStride(Value *V) { return StrideSet.count(V); }
   bool mustCheckStrides() { return !StrideSet.empty(); }
@@ -768,6 +756,30 @@ public:
   }
   SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
 
+  /// Returns true if the target machine supports masked store operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
+    return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
+  }
+  /// Returns true if the target machine supports masked load operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
+    return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
+  }
+  /// Returns true if vector representation of the instruction \p I
+  /// requires mask.
+  bool isMaskRequired(const Instruction* I) {
+    return (MaskedOp.count(I) != 0);
+  }
+  unsigned getNumStores() const {
+    return LAI->getNumStores();
+  }
+  unsigned getNumLoads() const {
+    return LAI->getNumLoads();
+  }
+  unsigned getNumPredStores() const {
+    return NumPredStores;
+  }
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -806,40 +818,45 @@ private:
   /// pattern corresponding to a min(X, Y) or max(X, Y).
   static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
                                                     ReductionInstDesc &Prev);
-  /// Returns the induction kind of Phi. This function may return NoInduction
-  /// if the PHI is not an induction variable.
-  InductionKind isInductionVariable(PHINode *Phi);
+  /// Returns the induction kind of Phi and record the step. This function may
+  /// return NoInduction if the PHI is not an induction variable.
+  InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue);
 
   /// \brief Collect memory access with loop invariant strides.
   ///
   /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
   /// invariant.
-  void collectStridedAcccess(Value *LoadOrStoreInst);
+  void collectStridedAccess(Value *LoadOrStoreInst);
 
   /// Report an analysis message to assist the user in diagnosing loops that are
-  /// not vectorized.
-  void emitAnalysis(Report &Message) {
-    DebugLoc DL = TheLoop->getStartLoc();
-    if (Instruction *I = Message.getInstr())
-      DL = I->getDebugLoc();
-    emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
-                                   *TheFunction, DL, Message.str());
+  /// not vectorized.  These are handled as LoopAccessReport rather than
+  /// VectorizationReport because the << operator of VectorizationReport returns
+  /// LoopAccessReport.
+  void emitAnalysis(const LoopAccessReport &Message) {
+    LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
   }
 
+  unsigned NumPredStores;
+
   /// The loop that we evaluate.
   Loop *TheLoop;
   /// Scev analysis.
   ScalarEvolution *SE;
   /// DataLayout analysis.
   const DataLayout *DL;
-  /// Dominators.
-  DominatorTree *DT;
   /// Target Library Info.
   TargetLibraryInfo *TLI;
-  /// Alias analysis.
-  AliasAnalysis *AA;
   /// Parent function
   Function *TheFunction;
+  /// Target Transform Info
+  const TargetTransformInfo *TTI;
+  /// Dominator Tree.
+  DominatorTree *DT;
+  // LoopAccess analysis.
+  LoopAccessAnalysis *LAA;
+  // And the loop-accesses info corresponding to this loop.  This pointer is
+  // null until canVectorizeMemory sets it up.
+  const LoopAccessInfo *LAI;
 
   //  ---  vectorization state --- //
 
@@ -861,16 +878,16 @@ private:
   /// This set holds the variables which are known to be uniform after
   /// vectorization.
   SmallPtrSet<Instruction*, 4> Uniforms;
-  /// We need to check that all of the pointers in this list are disjoint
-  /// at runtime.
-  RuntimePointerCheck PtrRtCheck;
+
   /// Can we assume the absence of NaNs.
   bool HasFunNoNaNAttr;
 
-  unsigned MaxSafeDepDistBytes;
-
   ValueToValueMap Strides;
   SmallPtrSet<Value *, 8> StrideSet;
+  
+  /// While vectorizing these instructions we have to generate a
+  /// call to the appropriate masked intrinsic
+  SmallPtrSet<const Instruction*, 8> MaskedOp;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -886,11 +903,11 @@ public:
                              LoopVectorizationLegality *Legal,
                              const TargetTransformInfo &TTI,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             AssumptionTracker *AT, const Function *F,
+                             AssumptionCache *AC, const Function *F,
                              const LoopVectorizeHints *Hints)
       : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI),
         TheFunction(F), Hints(Hints) {
-    CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
   }
 
   /// Information about vectorization costs
@@ -951,13 +968,11 @@ private:
   bool isConsecutiveLoadOrStore(Instruction *I);
 
   /// Report an analysis message to assist the user in diagnosing loops that are
-  /// not vectorized.
-  void emitAnalysis(Report &Message) {
-    DebugLoc DL = TheLoop->getStartLoc();
-    if (Instruction *I = Message.getInstr())
-      DL = I->getDebugLoc();
-    emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
-                                   *TheFunction, DL, Message.str());
+  /// not vectorized.  These are handled as LoopAccessReport rather than
+  /// VectorizationReport because the << operator of VectorizationReport returns
+  /// LoopAccessReport.
+  void emitAnalysis(const LoopAccessReport &Message) {
+    LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
   }
 
   /// Values used only by @llvm.assume calls.
@@ -1010,7 +1025,7 @@ class LoopVectorizeHints {
     bool validate(unsigned Val) {
       switch (Kind) {
       case HK_WIDTH:
-        return isPowerOf2_32(Val) && Val <= MaxVectorWidth;
+        return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
       case HK_UNROLL:
         return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
       case HK_FORCE:
@@ -1038,7 +1053,8 @@ public:
   };
 
   LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
-      : Width("vectorize.width", VectorizationFactor, HK_WIDTH),
+      : Width("vectorize.width", VectorizerParams::VectorizationFactor,
+              HK_WIDTH),
         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
         Force("vectorize.enable", FK_Undefined, HK_FORCE),
         TheLoop(L) {
@@ -1046,8 +1062,8 @@ public:
     getHintsFromMetadata();
 
     // force-vector-interleave overrides DisableInterleaving.
-    if (VectorizationInterleave.getNumOccurrences() > 0)
-      Interleave.Value = VectorizationInterleave;
+    if (VectorizerParams::isInterleaveForced())
+      Interleave.Value = VectorizerParams::VectorizationInterleave;
 
     DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
           << "LV: Interleaving disabled by the pass manager\n");
@@ -1062,7 +1078,7 @@ public:
 
   /// Dumps all the hint information.
   std::string emitRemark() const {
-    Report R;
+    VectorizationReport R;
     if (Force.Value == LoopVectorizeHints::FK_Disabled)
       R << "vectorization is explicitly disabled";
     else {
@@ -1097,7 +1113,7 @@ private:
 
     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
       const MDString *S = nullptr;
-      SmallVector<Value*, 4> Args;
+      SmallVector<Metadata *, 4> Args;
 
       // The expected hint is either a MDString or a MDNode with the first
       // operand a MDString.
@@ -1123,12 +1139,12 @@ private:
   }
 
   /// Checks string hint with one operand and set value if valid.
-  void setHint(StringRef Name, Value *Arg) {
+  void setHint(StringRef Name, Metadata *Arg) {
     if (!Name.startswith(Prefix()))
       return;
     Name = Name.substr(Prefix().size(), StringRef::npos);
 
-    const ConstantInt *C = dyn_cast<ConstantInt>(Arg);
+    const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
     if (!C) return;
     unsigned Val = C->getZExtValue();
 
@@ -1147,9 +1163,10 @@ private:
   /// Create a new hint from name / value pair.
   MDNode *createHintMetadata(StringRef Name, unsigned V) const {
     LLVMContext &Context = TheLoop->getHeader()->getContext();
-    Value *Vals[] = {MDString::get(Context, Name),
-                     ConstantInt::get(Type::getInt32Ty(Context), V)};
-    return MDNode::get(Context, Vals);
+    Metadata *MDs[] = {MDString::get(Context, Name),
+                       ConstantAsMetadata::get(
+                           ConstantInt::get(Type::getInt32Ty(Context), V))};
+    return MDNode::get(Context, MDs);
   }
 
   /// Matches metadata with hint name.
@@ -1170,7 +1187,7 @@ private:
       return;
 
     // Reserve the first element to LoopID (see below).
-    SmallVector<Value*, 4> Vals(1);
+    SmallVector<Metadata *, 4> MDs(1);
     // If the loop already has metadata, then ignore the existing operands.
     MDNode *LoopID = TheLoop->getLoopID();
     if (LoopID) {
@@ -1178,25 +1195,21 @@ private:
         MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
         // If node in update list, ignore old value.
         if (!matchesHintMetadataName(Node, HintTypes))
-          Vals.push_back(Node);
+          MDs.push_back(Node);
       }
     }
 
     // Now, add the missing hints.
     for (auto H : HintTypes)
-      Vals.push_back(
-          createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+      MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
 
     // Replace current metadata node with new one.
     LLVMContext &Context = TheLoop->getHeader()->getContext();
-    MDNode *NewLoopID = MDNode::get(Context, Vals);
+    MDNode *NewLoopID = MDNode::get(Context, MDs);
     // Set operand 0 to refer to the loop id itself.
     NewLoopID->replaceOperandWith(0, NewLoopID);
 
     TheLoop->setLoopID(NewLoopID);
-    if (LoopID)
-      LoopID->replaceAllUsesWith(NewLoopID);
-    LoopID = NewLoopID;
   }
 
   /// The loop these hints belong to.
@@ -1248,7 +1261,8 @@ struct LoopVectorize : public FunctionPass {
   BlockFrequencyInfo *BFI;
   TargetLibraryInfo *TLI;
   AliasAnalysis *AA;
-  AssumptionTracker *AT;
+  AssumptionCache *AC;
+  LoopAccessAnalysis *LAA;
   bool DisableUnrolling;
   bool AlwaysVectorize;
 
@@ -1258,13 +1272,15 @@ struct LoopVectorize : public FunctionPass {
     SE = &getAnalysis<ScalarEvolution>();
     DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
     DL = DLP ? &DLP->getDataLayout() : nullptr;
-    LI = &getAnalysis<LoopInfo>();
-    TTI = &getAnalysis<TargetTransformInfo>();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     BFI = &getAnalysis<BlockFrequencyInfo>();
-    TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+    TLI = TLIP ? &TLIP->getTLI() : nullptr;
     AA = &getAnalysis<AliasAnalysis>();
-    AT = &getAnalysis<AssumptionTracker>();
+    AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    LAA = &getAnalysis<LoopAccessAnalysis>();
 
     // Compute some weights outside of the loop over the loops. Compute this
     // using a BranchProbability to re-use its scaling math.
@@ -1375,7 +1391,7 @@ struct LoopVectorize : public FunctionPass {
     }
 
     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F);
+    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI, LAA);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
       emitMissedWarning(F, L, Hints);
@@ -1383,7 +1399,7 @@ struct LoopVectorize : public FunctionPass {
     }
 
     // Use the cost model.
-    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AT, F,
+    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F,
                                   &Hints);
 
     // Check the function attributes to find out if this function should be
@@ -1471,16 +1487,17 @@ struct LoopVectorize : public FunctionPass {
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<AssumptionTracker>();
+    AU.addRequired<AssumptionCacheTracker>();
     AU.addRequiredID(LoopSimplifyID);
     AU.addRequiredID(LCSSAID);
     AU.addRequired<BlockFrequencyInfo>();
     AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<LoopInfo>();
+    AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<ScalarEvolution>();
-    AU.addRequired<TargetTransformInfo>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addRequired<AliasAnalysis>();
-    AU.addPreserved<LoopInfo>();
+    AU.addRequired<LoopAccessAnalysis>();
+    AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<AliasAnalysis>();
   }
@@ -1494,65 +1511,6 @@ struct LoopVectorize : public FunctionPass {
 // LoopVectorizationCostModel.
 //===----------------------------------------------------------------------===//
 
-static Value *stripIntegerCast(Value *V) {
-  if (CastInst *CI = dyn_cast<CastInst>(V))
-    if (CI->getOperand(0)->getType()->isIntegerTy())
-      return CI->getOperand(0);
-  return V;
-}
-
-///\brief Replaces the symbolic stride in a pointer SCEV expression by one.
-///
-/// If \p OrigPtr is not null, use it to look up the stride value instead of
-/// \p Ptr.
-static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
-                                             ValueToValueMap &PtrToStride,
-                                             Value *Ptr, Value *OrigPtr = nullptr) {
-
-  const SCEV *OrigSCEV = SE->getSCEV(Ptr);
-
-  // If there is an entry in the map return the SCEV of the pointer with the
-  // symbolic stride replaced by one.
-  ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
-  if (SI != PtrToStride.end()) {
-    Value *StrideVal = SI->second;
-
-    // Strip casts.
-    StrideVal = stripIntegerCast(StrideVal);
-
-    // Replace symbolic stride by one.
-    Value *One = ConstantInt::get(StrideVal->getType(), 1);
-    ValueToValueMap RewriteMap;
-    RewriteMap[StrideVal] = One;
-
-    const SCEV *ByOne =
-        SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true);
-    DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne
-                 << "\n");
-    return ByOne;
-  }
-
-  // Otherwise, just return the SCEV of the original pointer.
-  return SE->getSCEV(Ptr);
-}
-
-void LoopVectorizationLegality::RuntimePointerCheck::insert(
-    ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
-    unsigned ASId, ValueToValueMap &Strides) {
-  // Get the stride replaced scev.
-  const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
-  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
-  assert(AR && "Invalid addrec expression");
-  const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
-  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
-  Pointers.push_back(Ptr);
-  Starts.push_back(AR->getStart());
-  Ends.push_back(ScEnd);
-  IsWritePtr.push_back(WritePtr);
-  DependencySetId.push_back(DepSetId);
-  AliasSetId.push_back(ASId);
-}
-
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   // We need to place the broadcast of invariant variables outside the loop.
   Instruction *Instr = dyn_cast<Instruction>(V);
@@ -1572,11 +1530,13 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   return Shuf;
 }
 
-Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
-                                                 bool Negate) {
+Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
+                                          Value *Step) {
   assert(Val->getType()->isVectorTy() && "Must be a vector");
   assert(Val->getType()->getScalarType()->isIntegerTy() &&
          "Elem must be an integer");
+  assert(Step->getType() == Val->getType()->getScalarType() &&
+         "Step has wrong type");
   // Create the types.
   Type *ITy = Val->getType()->getScalarType();
   VectorType *Ty = cast<VectorType>(Val->getType());
@@ -1584,15 +1544,18 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
   SmallVector<Constant*, 8> Indices;
 
   // Create a vector of consecutive numbers from zero to VF.
-  for (int i = 0; i < VLen; ++i) {
-    int64_t Idx = Negate ? (-i) : i;
-    Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
-  }
+  for (int i = 0; i < VLen; ++i)
+    Indices.push_back(ConstantInt::get(ITy, StartIdx + i));
 
   // Add the consecutive indices to the vector value.
   Constant *Cv = ConstantVector::get(Indices);
   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
-  return Builder.CreateAdd(Val, Cv, "induction");
+  Step = Builder.CreateVectorSplat(VLen, Step);
+  assert(Step->getType() == Val->getType() && "Invalid step vec");
+  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+  // which can be found from the original scalar operations.
+  Step = Builder.CreateMul(Cv, Step);
+  return Builder.CreateAdd(Val, Step, "induction");
 }
 
 /// \brief Find the operand of the GEP that should be checked for consecutive
@@ -1630,10 +1593,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
   if (Phi && Inductions.count(Phi)) {
     InductionInfo II = Inductions[Phi];
-    if (IK_PtrInduction == II.IK)
-      return 1;
-    else if (IK_ReversePtrInduction == II.IK)
-      return -1;
+    return II.getConsecutiveDirection();
   }
 
   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
@@ -1658,10 +1618,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
         return 0;
 
     InductionInfo II = Inductions[Phi];
-    if (IK_PtrInduction == II.IK)
-      return 1;
-    else if (IK_ReversePtrInduction == II.IK)
-      return -1;
+    return II.getConsecutiveDirection();
   }
 
   unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
@@ -1711,7 +1668,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
 }
 
 bool LoopVectorizationLegality::isUniform(Value *V) {
-  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+  return LAI->isUniform(V);
 }
 
 InnerLoopVectorizer::VectorParts&
@@ -1763,7 +1720,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
 
-  if (SI && Legal->blockNeedsPredication(SI->getParent()))
+  if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
+      !Legal->isMaskRequired(SI))
     return scalarizeInstruction(Instr, true);
 
   if (ScalarAllocatedSize != VectorElementSize)
@@ -1832,6 +1790,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
     Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
   }
 
+  VectorParts Mask = createBlockInMask(Instr->getParent());
   // Handle Stores:
   if (SI) {
     assert(!Legal->isUniform(SI->getPointerOperand()) &&
@@ -1840,7 +1799,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
     // We don't want to update the value in the map as it might be used in
     // another expression. So don't use a reference type for "StoredVal".
     VectorParts StoredVal = getVectorValue(SI->getValueOperand());
-
+    
     for (unsigned Part = 0; Part < UF; ++Part) {
       // Calculate the pointer for the specific unroll-part.
       Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
@@ -1853,12 +1812,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
         // wide store needs to start at the last vector element.
         PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
         PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+        Mask[Part] = reverseVector(Mask[Part]);
       }
 
       Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                             DataTy->getPointerTo(AddressSpace));
-      StoreInst *NewSI =
-        Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
+
+      Instruction *NewSI;
+      if (Legal->isMaskRequired(SI))
+        NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
+                                          Mask[Part]);
+      else 
+        NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
       propagateMetadata(NewSI, SI);
     }
     return;
@@ -1873,14 +1838,21 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
 
     if (Reverse) {
       // If the address is consecutive but reversed, then the
-      // wide store needs to start at the last vector element.
+      // wide load needs to start at the last vector element.
       PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
       PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+      Mask[Part] = reverseVector(Mask[Part]);
     }
 
+    Instruction* NewLI;
     Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                           DataTy->getPointerTo(AddressSpace));
-    LoadInst *NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+    if (Legal->isMaskRequired(LI))
+      NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
+                                       UndefValue::get(DataTy),
+                                       "wide.masked.load");
+    else
+      NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
     propagateMetadata(NewLI, LI);
     Entry[Part] = Reverse ? reverseVector(NewLI) :  NewLI;
   }
@@ -1958,7 +1930,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
         Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
         CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
         LoopVectorBody.push_back(CondBlock);
-        VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+        VectorLp->addBasicBlockToLoop(CondBlock, *LI);
         // Update Builder with newly created basic block.
         Builder.SetInsertPoint(InsertPt);
       }
@@ -1987,7 +1959,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
       if (IfPredicateStore) {
          BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
          LoopVectorBody.push_back(NewIfBlock);
-         VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+         VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
          Builder.SetInsertPoint(InsertPt);
          Instruction *OldBr = IfBlock->getTerminator();
          BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
@@ -2044,102 +2016,6 @@ InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
   return std::make_pair(FirstInst, TheCheck);
 }
 
-std::pair<Instruction *, Instruction *>
-InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) {
-  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
-  Legal->getRuntimePointerCheck();
-
-  Instruction *tnullptr = nullptr;
-  if (!PtrRtCheck->Need)
-    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
-
-  unsigned NumPointers = PtrRtCheck->Pointers.size();
-  SmallVector<TrackingVH<Value> , 2> Starts;
-  SmallVector<TrackingVH<Value> , 2> Ends;
-
-  LLVMContext &Ctx = Loc->getContext();
-  SCEVExpander Exp(*SE, "induction");
-  Instruction *FirstInst = nullptr;
-
-  for (unsigned i = 0; i < NumPointers; ++i) {
-    Value *Ptr = PtrRtCheck->Pointers[i];
-    const SCEV *Sc = SE->getSCEV(Ptr);
-
-    if (SE->isLoopInvariant(Sc, OrigLoop)) {
-      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
-            *Ptr <<"\n");
-      Starts.push_back(Ptr);
-      Ends.push_back(Ptr);
-    } else {
-      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
-      unsigned AS = Ptr->getType()->getPointerAddressSpace();
-
-      // Use this type for pointer arithmetic.
-      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
-
-      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
-      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
-      Starts.push_back(Start);
-      Ends.push_back(End);
-    }
-  }
-
-  IRBuilder<> ChkBuilder(Loc);
-  // Our instructions might fold to a constant.
-  Value *MemoryRuntimeCheck = nullptr;
-  for (unsigned i = 0; i < NumPointers; ++i) {
-    for (unsigned j = i+1; j < NumPointers; ++j) {
-      // No need to check if two readonly pointers intersect.
-      if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
-        continue;
-
-      // Only need to check pointers between two different dependency sets.
-      if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
-       continue;
-      // Only need to check pointers in the same alias set.
-      if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j])
-        continue;
-
-      unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
-      unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
-
-      assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
-             (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
-             "Trying to bounds check pointers with different address spaces");
-
-      Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
-      Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
-
-      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
-      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
-      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy1, "bc");
-      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy0, "bc");
-
-      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
-      FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
-      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
-      FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
-      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
-      FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
-      if (MemoryRuntimeCheck) {
-        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
-                                         "conflict.rdx");
-        FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
-      }
-      MemoryRuntimeCheck = IsConflict;
-    }
-  }
-
-  // We have to do this trickery because the IRBuilder might fold the check to a
-  // constant expression in which case there is no Instruction anchored in a
-  // the block.
-  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
-                                                 ConstantInt::getTrue(Ctx));
-  ChkBuilder.Insert(Check, "memcheck.conflict");
-  FirstInst = getFirstInst(FirstInst, Check, Loc);
-  return std::make_pair(FirstInst, Check);
-}
-
 void InnerLoopVectorizer::createEmptyLoop() {
   /*
    In this function we generate a new loop. The new loop will contain
@@ -2265,13 +2141,13 @@ void InnerLoopVectorizer::createEmptyLoop() {
   // before calling any utilities such as SCEV that require valid LoopInfo.
   if (ParentLoop) {
     ParentLoop->addChildLoop(Lp);
-    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
-    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
-    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
+    ParentLoop->addBasicBlockToLoop(VectorPH, *LI);
+    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
   } else {
     LI->addTopLevelLoop(Lp);
   }
-  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+  Lp->addBasicBlockToLoop(VecBody, *LI);
 
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
@@ -2326,7 +2202,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
     BasicBlock *CheckBlock =
       LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
     if (ParentLoop)
-      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
+      ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
     LoopBypassBlocks.push_back(CheckBlock);
     Instruction *OldTerm = LastBypassBlock->getTerminator();
     BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
@@ -2346,7 +2222,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
     BasicBlock *CheckBlock =
         LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
     if (ParentLoop)
-      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
+      ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
     LoopBypassBlocks.push_back(CheckBlock);
 
     // Replace the branch into the memory check block with a conditional branch
@@ -2364,13 +2240,13 @@ void InnerLoopVectorizer::createEmptyLoop() {
   // faster.
   Instruction *MemRuntimeCheck;
   std::tie(FirstCheckInst, MemRuntimeCheck) =
-      addRuntimeCheck(LastBypassBlock->getTerminator());
+    Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator());
   if (MemRuntimeCheck) {
     // Create a new block containing the memory check.
     BasicBlock *CheckBlock =
-        LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck");
+        LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck");
     if (ParentLoop)
-      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
+      ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
     LoopBypassBlocks.push_back(CheckBlock);
 
     // Replace the branch into the memory check block with a conditional branch
@@ -2461,33 +2337,13 @@ void InnerLoopVectorizer::createEmptyLoop() {
       Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
                                                    II.StartValue->getType(),
                                                    "cast.crd");
-      EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end");
-      break;
-    }
-    case LoopVectorizationLegality::IK_ReverseIntInduction: {
-      // Convert the CountRoundDown variable to the PHI size.
-      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
-                                                   II.StartValue->getType(),
-                                                   "cast.crd");
-      // Handle reverse integer induction counter.
-      EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end");
+      EndValue = II.transform(BypassBuilder, CRD);
+      EndValue->setName("ind.end");
       break;
     }
     case LoopVectorizationLegality::IK_PtrInduction: {
-      // For pointer induction variables, calculate the offset using
-      // the end index.
-      EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown,
-                                         "ptr.ind.end");
-      break;
-    }
-    case LoopVectorizationLegality::IK_ReversePtrInduction: {
-      // The value at the end of the loop for the reverse pointer is calculated
-      // by creating a GEP with a negative index starting from the start value.
-      Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
-      Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown,
-                                              "rev.ind.end");
-      EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx,
-                                         "rev.ptr.ind.end");
+      EndValue = II.transform(BypassBuilder, CountRoundDown);
+      EndValue->setName("ptr.ind.end");
       break;
     }
     }// end of case
@@ -2835,9 +2691,6 @@ void InnerLoopVectorizer::vectorizeLoop() {
     }
 
     // Fix the vector-loop phi.
-    // We created the induction variable so we know that the
-    // preheader is the first entry.
-    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
 
     // Reductions do not have to start at zero. They can start with
     // any loop invariant values.
@@ -2849,7 +2702,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
       // Make sure to add the reduction stat value only to the
       // first unroll part.
       Value *StartVal = (part == 0) ? VectorStart : Identity;
-      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
+      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
+                                                  LoopVectorPreHeader);
       cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
                                                   LoopVectorBody.back());
     }
@@ -3104,6 +2958,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
   LoopVectorizationLegality::InductionInfo II =
   Legal->getInductionVars()->lookup(P);
 
+  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+  // which can be found from the original scalar operations.
   switch (II.IK) {
     case LoopVectorizationLegality::IK_NoInduction:
       llvm_unreachable("Unknown induction");
@@ -3121,80 +2977,42 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
         Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
                                                  "normalized.idx");
         NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
-        Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
-                                        "offset.idx");
+        Broadcasted = II.transform(Builder, NormalizedIdx);
+        Broadcasted->setName("offset.idx");
       }
       Broadcasted = getBroadcastInstrs(Broadcasted);
       // After broadcasting the induction variable we need to make the vector
       // consecutive by adding 0, 1, 2, etc.
       for (unsigned part = 0; part < UF; ++part)
-        Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
+        Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue);
       return;
     }
-    case LoopVectorizationLegality::IK_ReverseIntInduction:
     case LoopVectorizationLegality::IK_PtrInduction:
-    case LoopVectorizationLegality::IK_ReversePtrInduction:
-      // Handle reverse integer and pointer inductions.
-      Value *StartIdx = ExtendedIdx;
-      // This is the normalized GEP that starts counting at zero.
-      Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
-                                               "normalized.idx");
-
-      // Handle the reverse integer induction variable case.
-      if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
-        IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
-        Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
-                                               "resize.norm.idx");
-        Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
-                                               "reverse.idx");
-
-        // This is a new value so do not hoist it out.
-        Value *Broadcasted = getBroadcastInstrs(ReverseInd);
-        // After broadcasting the induction variable we need to make the
-        // vector consecutive by adding  ... -3, -2, -1, 0.
-        for (unsigned part = 0; part < UF; ++part)
-          Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
-                                             true);
-        return;
-      }
-
       // Handle the pointer induction variable case.
       assert(P->getType()->isPointerTy() && "Unexpected type.");
-
-      // Is this a reverse induction ptr or a consecutive induction ptr.
-      bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
-                      II.IK);
-
+      // This is the normalized GEP that starts counting at zero.
+      Value *NormalizedIdx =
+          Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx");
       // This is the vector of results. Notice that we don't generate
       // vector geps because scalar geps result in better code.
       for (unsigned part = 0; part < UF; ++part) {
         if (VF == 1) {
-          int EltIndex = (part) * (Reverse ? -1 : 1);
+          int EltIndex = part;
           Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
-          Value *GlobalIdx;
-          if (Reverse)
-            GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
-          else
-            GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
-
-          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
-                                             "next.gep");
+          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
+          Value *SclrGep = II.transform(Builder, GlobalIdx);
+          SclrGep->setName("next.gep");
           Entry[part] = SclrGep;
           continue;
         }
 
         Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
         for (unsigned int i = 0; i < VF; ++i) {
-          int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
+          int EltIndex = i + part * VF;
           Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
-          Value *GlobalIdx;
-          if (!Reverse)
-            GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
-          else
-            GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
-
-          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
-                                             "next.gep");
+          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
+          Value *SclrGep = II.transform(Builder, GlobalIdx);
+          SclrGep->setName("next.gep");
           VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
                                                Builder.getInt32(i),
                                                "insert.gep");
@@ -3214,7 +3032,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
       // Nothing to do for PHIs and BR, since we already took care of the
       // loop control flow instructions.
       continue;
-    case Instruction::PHI:{
+    case Instruction::PHI: {
       // Vectorize PHINodes.
       widenPHIInstruction(it, Entry, UF, VF, PV);
       continue;
@@ -3335,8 +3153,12 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
         Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
                                                CI->getType());
         Value *Broadcasted = getBroadcastInstrs(ScalarCast);
+        LoopVectorizationLegality::InductionInfo II =
+            Legal->getInductionVars()->lookup(OldInduction);
+        Constant *Step =
+            ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue());
         for (unsigned Part = 0; Part < UF; ++Part)
-          Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
+          Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
         propagateMetadata(Entry, it);
         break;
       }
@@ -3452,7 +3274,7 @@ static bool canIfConvertPHINodes(BasicBlock *BB) {
 
 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   if (!EnableIfConversion) {
-    emitAnalysis(Report() << "if-conversion is disabled");
+    emitAnalysis(VectorizationReport() << "if-conversion is disabled");
     return false;
   }
 
@@ -3485,7 +3307,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
 
     // We don't support switch statements inside loops.
     if (!isa<BranchInst>(BB->getTerminator())) {
-      emitAnalysis(Report(BB->getTerminator())
+      emitAnalysis(VectorizationReport(BB->getTerminator())
                    << "loop contains a switch statement");
       return false;
     }
@@ -3493,12 +3315,12 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
     // We must be able to predicate all blocks that need to be predicated.
     if (blockNeedsPredication(BB)) {
       if (!blockCanBePredicated(BB, SafePointes)) {
-        emitAnalysis(Report(BB->getTerminator())
+        emitAnalysis(VectorizationReport(BB->getTerminator())
                      << "control flow cannot be substituted for a select");
         return false;
       }
     } else if (BB != Header && !canIfConvertPHINodes(BB)) {
-      emitAnalysis(Report(BB->getTerminator())
+      emitAnalysis(VectorizationReport(BB->getTerminator())
                    << "control flow cannot be substituted for a select");
       return false;
     }
@@ -3513,27 +3335,40 @@ bool LoopVectorizationLegality::canVectorize() {
   // be canonicalized.
   if (!TheLoop->getLoopPreheader()) {
     emitAnalysis(
-        Report() << "loop control flow is not understood by vectorizer");
+        VectorizationReport() <<
+        "loop control flow is not understood by vectorizer");
     return false;
   }
 
   // We can only vectorize innermost loops.
-  if (TheLoop->getSubLoopsVector().size()) {
-    emitAnalysis(Report() << "loop is not the innermost loop");
+  if (!TheLoop->getSubLoopsVector().empty()) {
+    emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
     return false;
   }
 
   // We must have a single backedge.
   if (TheLoop->getNumBackEdges() != 1) {
     emitAnalysis(
-        Report() << "loop control flow is not understood by vectorizer");
+        VectorizationReport() <<
+        "loop control flow is not understood by vectorizer");
     return false;
   }
 
   // We must have a single exiting block.
   if (!TheLoop->getExitingBlock()) {
     emitAnalysis(
-        Report() << "loop control flow is not understood by vectorizer");
+        VectorizationReport() <<
+        "loop control flow is not understood by vectorizer");
+    return false;
+  }
+
+  // We only handle bottom-tested loops, i.e. loop in which the condition is
+  // checked at the end of each iteration. With that we can assume that all
+  // instructions in the loop are executed the same number of times.
+  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+    emitAnalysis(
+        VectorizationReport() <<
+        "loop control flow is not understood by vectorizer");
     return false;
   }
 
@@ -3551,7 +3386,8 @@ bool LoopVectorizationLegality::canVectorize() {
   // ScalarEvolution needs to be able to find the exit count.
   const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
   if (ExitCount == SE->getCouldNotCompute()) {
-    emitAnalysis(Report() << "could not determine number of loop iterations");
+    emitAnalysis(VectorizationReport() <<
+                 "could not determine number of loop iterations");
     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
     return false;
   }
@@ -3572,7 +3408,8 @@ bool LoopVectorizationLegality::canVectorize() {
   collectLoopUniforms();
 
   DEBUG(dbgs() << "LV: We can vectorize this loop" <<
-        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
+        (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" :
+         "")
         <<"!\n");
 
   // Okay! We can vectorize. At this point we don't have any other mem analysis
@@ -3627,9 +3464,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
   // Look for the attribute signaling the absence of NaNs.
   Function &F = *Header->getParent();
   if (F.hasFnAttribute("no-nans-fp-math"))
-    HasFunNoNaNAttr = F.getAttributes().getAttribute(
-      AttributeSet::FunctionIndex,
-      "no-nans-fp-math").getValueAsString() == "true";
+    HasFunNoNaNAttr =
+        F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
 
   // For each block in the loop.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
@@ -3645,7 +3481,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         if (!PhiTy->isIntegerTy() &&
             !PhiTy->isFloatingPointTy() &&
             !PhiTy->isPointerTy()) {
-          emitAnalysis(Report(it)
+          emitAnalysis(VectorizationReport(it)
                        << "loop control flow is not understood by vectorizer");
           DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
           return false;
@@ -3659,14 +3495,15 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           // identified reduction value with an outside user.
           if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
             continue;
-          emitAnalysis(Report(it) << "value could not be identified as "
-                                     "an induction or reduction variable");
+          emitAnalysis(VectorizationReport(it) <<
+                       "value could not be identified as "
+                       "an induction or reduction variable");
           return false;
         }
 
-        // We only allow if-converted PHIs with more than two incoming values.
+        // We only allow if-converted PHIs with exactly two incoming values.
         if (Phi->getNumIncomingValues() != 2) {
-          emitAnalysis(Report(it)
+          emitAnalysis(VectorizationReport(it)
                        << "control flow not understood by vectorizer");
           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
           return false;
@@ -3674,8 +3511,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
         // This is the value coming from the preheader.
         Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+        ConstantInt *StepValue = nullptr;
         // Check if this is an induction variable.
-        InductionKind IK = isInductionVariable(Phi);
+        InductionKind IK = isInductionVariable(Phi, StepValue);
 
         if (IK_NoInduction != IK) {
           // Get the widest type.
@@ -3685,7 +3523,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
             WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
 
           // Int inductions are special because we only allow one IV.
-          if (IK == IK_IntInduction) {
+          if (IK == IK_IntInduction && StepValue->isOne()) {
             // Use the phi node with the widest type as induction. Use the last
             // one if there are multiple (no good reason for doing this other
             // than it is expedient).
@@ -3694,13 +3532,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           }
 
           DEBUG(dbgs() << "LV: Found an induction variable.\n");
-          Inductions[Phi] = InductionInfo(StartValue, IK);
+          Inductions[Phi] = InductionInfo(StartValue, IK, StepValue);
 
           // Until we explicitly handle the case of an induction variable with
           // an outside loop user we have to give up vectorizing this loop.
           if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
-            emitAnalysis(Report(it) << "use of induction value outside of the "
-                                       "loop is not handled by vectorizer");
+            emitAnalysis(VectorizationReport(it) <<
+                         "use of induction value outside of the "
+                         "loop is not handled by vectorizer");
             return false;
           }
 
@@ -3745,8 +3584,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }
 
-        emitAnalysis(Report(it) << "value that could not be identified as "
-                                   "reduction is used outside the loop");
+        emitAnalysis(VectorizationReport(it) <<
+                     "value that could not be identified as "
+                     "reduction is used outside the loop");
         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
         return false;
       }// end of PHI handling
@@ -3755,7 +3595,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // calls and we do handle certain intrinsic and libm functions.
       CallInst *CI = dyn_cast<CallInst>(it);
       if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
-        emitAnalysis(Report(it) << "call instruction cannot be vectorized");
+        emitAnalysis(VectorizationReport(it) <<
+                     "call instruction cannot be vectorized");
         DEBUG(dbgs() << "LV: Found a call site.\n");
         return false;
       }
@@ -3765,7 +3606,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if (CI &&
           hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
         if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
-          emitAnalysis(Report(it)
+          emitAnalysis(VectorizationReport(it)
                        << "intrinsic instruction cannot be vectorized");
           DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
           return false;
@@ -3776,7 +3617,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // Also, we can't vectorize extractelement instructions.
       if ((!VectorType::isValidElementType(it->getType()) &&
            !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
-        emitAnalysis(Report(it)
+        emitAnalysis(VectorizationReport(it)
                      << "instruction return type cannot be vectorized");
         DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
         return false;
@@ -3786,21 +3627,23 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
         Type *T = ST->getValueOperand()->getType();
         if (!VectorType::isValidElementType(T)) {
-          emitAnalysis(Report(ST) << "store instruction cannot be vectorized");
+          emitAnalysis(VectorizationReport(ST) <<
+                       "store instruction cannot be vectorized");
           return false;
         }
         if (EnableMemAccessVersioning)
-          collectStridedAcccess(ST);
+          collectStridedAccess(ST);
       }
 
       if (EnableMemAccessVersioning)
         if (LoadInst *LI = dyn_cast<LoadInst>(it))
-          collectStridedAcccess(LI);
+          collectStridedAccess(LI);
 
       // Reduction instructions are allowed to have exit users.
       // All other instructions must not have external users.
       if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
-        emitAnalysis(Report(it) << "value cannot be used outside the loop");
+        emitAnalysis(VectorizationReport(it) <<
+                     "value cannot be used outside the loop");
         return false;
       }
 
@@ -3811,7 +3654,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
   if (!Induction) {
     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
     if (Inductions.empty()) {
-      emitAnalysis(Report()
+      emitAnalysis(VectorizationReport()
                    << "loop induction variable could not be identified");
       return false;
     }
@@ -3933,7 +3776,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
   return Stride;
 }
 
-void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) {
+void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
   Value *Ptr = nullptr;
   if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
     Ptr = LI->getPointerOperand();
@@ -3971,7 +3814,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
       if (I->getType()->isPointerTy() && isConsecutivePtr(I))
         Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
 
-  while (Worklist.size()) {
+  while (!Worklist.empty()) {
     Instruction *I = dyn_cast<Instruction>(Worklist.back());
     Worklist.pop_back();
 
@@ -3989,962 +3832,12 @@ void LoopVectorizationLegality::collectLoopUniforms() {
   }
 }
 
-namespace {
-/// \brief Analyses memory accesses in a loop.
-///
-/// Checks whether run time pointer checks are needed and builds sets for data
-/// dependence checking.
-class AccessAnalysis {
-public:
-  /// \brief Read or write access location.
-  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
-  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
-
-  /// \brief Set of potential dependent memory accesses.
-  typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
-
-  AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
-    DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
-
-  /// \brief Register a load  and whether it is only read from.
-  void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
-    Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
-    Accesses.insert(MemAccessInfo(Ptr, false));
-    if (IsReadOnly)
-      ReadOnlyPtr.insert(Ptr);
-  }
-
-  /// \brief Register a store.
-  void addStore(AliasAnalysis::Location &Loc) {
-    Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
-    Accesses.insert(MemAccessInfo(Ptr, true));
-  }
-
-  /// \brief Check whether we can check the pointers at runtime for
-  /// non-intersection.
-  bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
-                       unsigned &NumComparisons, ScalarEvolution *SE,
-                       Loop *TheLoop, ValueToValueMap &Strides,
-                       bool ShouldCheckStride = false);
-
-  /// \brief Goes over all memory accesses, checks whether a RT check is needed
-  /// and builds sets of dependent accesses.
-  void buildDependenceSets() {
-    processMemAccesses();
-  }
-
-  bool isRTCheckNeeded() { return IsRTCheckNeeded; }
-
-  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
-  void resetDepChecks() { CheckDeps.clear(); }
-
-  MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
-
-private:
-  typedef SetVector<MemAccessInfo> PtrAccessSet;
-
-  /// \brief Go over all memory access and check whether runtime pointer checks
-  /// are needed /// and build sets of dependency check candidates.
-  void processMemAccesses();
-
-  /// Set of all accesses.
-  PtrAccessSet Accesses;
-
-  /// Set of accesses that need a further dependence check.
-  MemAccessInfoSet CheckDeps;
-
-  /// Set of pointers that are read only.
-  SmallPtrSet<Value*, 16> ReadOnlyPtr;
-
-  const DataLayout *DL;
-
-  /// An alias set tracker to partition the access set by underlying object and
-  //intrinsic property (such as TBAA metadata).
-  AliasSetTracker AST;
-
-  /// Sets of potentially dependent accesses - members of one set share an
-  /// underlying pointer. The set "CheckDeps" identfies which sets really need a
-  /// dependence check.
-  DepCandidates &DepCands;
-
-  bool IsRTCheckNeeded;
-};
-
-} // end anonymous namespace
-
-/// \brief Check whether a pointer can participate in a runtime bounds check.
-static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
-                                Value *Ptr) {
-  const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
-  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
-  if (!AR)
-    return false;
-
-  return AR->isAffine();
-}
-
-/// \brief Check the stride of the pointer and ensure that it does not wrap in
-/// the address space.
-static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
-                        const Loop *Lp, ValueToValueMap &StridesMap);
-
-bool AccessAnalysis::canCheckPtrAtRT(
-    LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
-    unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop,
-    ValueToValueMap &StridesMap, bool ShouldCheckStride) {
-  // Find pointers with computable bounds. We are going to use this information
-  // to place a runtime bound check.
-  bool CanDoRT = true;
-
-  bool IsDepCheckNeeded = isDependencyCheckNeeded();
-  NumComparisons = 0;
-
-  // We assign a consecutive id to access from different alias sets.
-  // Accesses between different groups doesn't need to be checked.
-  unsigned ASId = 1;
-  for (auto &AS : AST) {
-    unsigned NumReadPtrChecks = 0;
-    unsigned NumWritePtrChecks = 0;
-
-    // We assign consecutive id to access from different dependence sets.
-    // Accesses within the same set don't need a runtime check.
-    unsigned RunningDepId = 1;
-    DenseMap<Value *, unsigned> DepSetId;
-
-    for (auto A : AS) {
-      Value *Ptr = A.getValue();
-      bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
-      MemAccessInfo Access(Ptr, IsWrite);
-
-      if (IsWrite)
-        ++NumWritePtrChecks;
-      else
-        ++NumReadPtrChecks;
-
-      if (hasComputableBounds(SE, StridesMap, Ptr) &&
-          // When we run after a failing dependency check we have to make sure we
-          // don't have wrapping pointers.
-          (!ShouldCheckStride ||
-           isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
-        // The id of the dependence set.
-        unsigned DepId;
-
-        if (IsDepCheckNeeded) {
-          Value *Leader = DepCands.getLeaderValue(Access).getPointer();
-          unsigned &LeaderId = DepSetId[Leader];
-          if (!LeaderId)
-            LeaderId = RunningDepId++;
-          DepId = LeaderId;
-        } else
-          // Each access has its own dependence set.
-          DepId = RunningDepId++;
-
-        RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
-
-        DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
-      } else {
-        CanDoRT = false;
-      }
-    }
-
-    if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
-      NumComparisons += 0; // Only one dependence set.
-    else {
-      NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
-                                              NumWritePtrChecks - 1));
-    }
-
-    ++ASId;
-  }
-
-  // If the pointers that we would use for the bounds comparison have different
-  // address spaces, assume the values aren't directly comparable, so we can't
-  // use them for the runtime check. We also have to assume they could
-  // overlap. In the future there should be metadata for whether address spaces
-  // are disjoint.
-  unsigned NumPointers = RtCheck.Pointers.size();
-  for (unsigned i = 0; i < NumPointers; ++i) {
-    for (unsigned j = i + 1; j < NumPointers; ++j) {
-      // Only need to check pointers between two different dependency sets.
-      if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
-       continue;
-      // Only need to check pointers in the same alias set.
-      if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
-        continue;
-
-      Value *PtrI = RtCheck.Pointers[i];
-      Value *PtrJ = RtCheck.Pointers[j];
-
-      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
-      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
-      if (ASi != ASj) {
-        DEBUG(dbgs() << "LV: Runtime check would require comparison between"
-                       " different address spaces\n");
-        return false;
-      }
-    }
-  }
-
-  return CanDoRT;
-}
-
-void AccessAnalysis::processMemAccesses() {
-  // We process the set twice: first we process read-write pointers, last we
-  // process read-only pointers. This allows us to skip dependence tests for
-  // read-only pointers.
-
-  DEBUG(dbgs() << "LV: Processing memory accesses...\n");
-  DEBUG(dbgs() << "  AST: "; AST.dump());
-  DEBUG(dbgs() << "LV:   Accesses:\n");
-  DEBUG({
-    for (auto A : Accesses)
-      dbgs() << "\t" << *A.getPointer() << " (" <<
-                (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
-                                         "read-only" : "read")) << ")\n";
-  });
-
-  // The AliasSetTracker has nicely partitioned our pointers by metadata
-  // compatibility and potential for underlying-object overlap. As a result, we
-  // only need to check for potential pointer dependencies within each alias
-  // set.
-  for (auto &AS : AST) {
-    // Note that both the alias-set tracker and the alias sets themselves used
-    // linked lists internally and so the iteration order here is deterministic
-    // (matching the original instruction order within each set).
-
-    bool SetHasWrite = false;
-
-    // Map of pointers to last access encountered.
-    typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
-    UnderlyingObjToAccessMap ObjToLastAccess;
-
-    // Set of access to check after all writes have been processed.
-    PtrAccessSet DeferredAccesses;
-
-    // Iterate over each alias set twice, once to process read/write pointers,
-    // and then to process read-only pointers.
-    for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
-      bool UseDeferred = SetIteration > 0;
-      PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
-
-      for (auto A : AS) {
-        Value *Ptr = A.getValue();
-        bool IsWrite = S.count(MemAccessInfo(Ptr, true));
-
-        // If we're using the deferred access set, then it contains only reads.
-        bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
-        if (UseDeferred && !IsReadOnlyPtr)
-          continue;
-        // Otherwise, the pointer must be in the PtrAccessSet, either as a read
-        // or a write.
-        assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
-                 S.count(MemAccessInfo(Ptr, false))) &&
-               "Alias-set pointer not in the access set?");
-
-        MemAccessInfo Access(Ptr, IsWrite);
-        DepCands.insert(Access);
-
-        // Memorize read-only pointers for later processing and skip them in the
-        // first round (they need to be checked after we have seen all write
-        // pointers). Note: we also mark pointer that are not consecutive as
-        // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need
-        // the second check for "!IsWrite".
-        if (!UseDeferred && IsReadOnlyPtr) {
-          DeferredAccesses.insert(Access);
-          continue;
-        }
-
-        // If this is a write - check other reads and writes for conflicts.  If
-        // this is a read only check other writes for conflicts (but only if
-        // there is no other write to the ptr - this is an optimization to
-        // catch "a[i] = a[i] + " without having to do a dependence check).
-        if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
-          CheckDeps.insert(Access);
-          IsRTCheckNeeded = true;
-        }
-
-        if (IsWrite)
-          SetHasWrite = true;
-
-        // Create sets of pointers connected by a shared alias set and
-        // underlying object.
-        typedef SmallVector<Value *, 16> ValueVector;
-        ValueVector TempObjects;
-        GetUnderlyingObjects(Ptr, TempObjects, DL);
-        for (Value *UnderlyingObj : TempObjects) {
-          UnderlyingObjToAccessMap::iterator Prev =
-            ObjToLastAccess.find(UnderlyingObj);
-          if (Prev != ObjToLastAccess.end())
-            DepCands.unionSets(Access, Prev->second);
-
-          ObjToLastAccess[UnderlyingObj] = Access;
-        }
-      }
-    }
-  }
-}
-
-namespace {
-/// \brief Checks memory dependences among accesses to the same underlying
-/// object to determine whether there vectorization is legal or not (and at
-/// which vectorization factor).
-///
-/// This class works under the assumption that we already checked that memory
-/// locations with different underlying pointers are "must-not alias".
-/// We use the ScalarEvolution framework to symbolically evalutate access
-/// functions pairs. Since we currently don't restructure the loop we can rely
-/// on the program order of memory accesses to determine their safety.
-/// At the moment we will only deem accesses as safe for:
-///  * A negative constant distance assuming program order.
-///
-///      Safe: tmp = a[i + 1];     OR     a[i + 1] = x;
-///            a[i] = tmp;                y = a[i];
-///
-///   The latter case is safe because later checks guarantuee that there can't
-///   be a cycle through a phi node (that is, we check that "x" and "y" is not
-///   the same variable: a header phi can only be an induction or a reduction, a
-///   reduction can't have a memory sink, an induction can't have a memory
-///   source). This is important and must not be violated (or we have to
-///   resort to checking for cycles through memory).
-///
-///  * A positive constant distance assuming program order that is bigger
-///    than the biggest memory access.
-///
-///     tmp = a[i]        OR              b[i] = x
-///     a[i+2] = tmp                      y = b[i+2];
-///
-///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
-///
-///  * Zero distances and all accesses have the same size.
-///
-class MemoryDepChecker {
-public:
-  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
-  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
-
-  MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
-      : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
-        ShouldRetryWithRuntimeCheck(false) {}
-
-  /// \brief Register the location (instructions are given increasing numbers)
-  /// of a write access.
-  void addAccess(StoreInst *SI) {
-    Value *Ptr = SI->getPointerOperand();
-    Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
-    InstMap.push_back(SI);
-    ++AccessIdx;
-  }
-
-  /// \brief Register the location (instructions are given increasing numbers)
-  /// of a write access.
-  void addAccess(LoadInst *LI) {
-    Value *Ptr = LI->getPointerOperand();
-    Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
-    InstMap.push_back(LI);
-    ++AccessIdx;
-  }
-
-  /// \brief Check whether the dependencies between the accesses are safe.
-  ///
-  /// Only checks sets with elements in \p CheckDeps.
-  bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
-                   MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides);
-
-  /// \brief The maximum number of bytes of a vector register we can vectorize
-  /// the accesses safely with.
-  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
-
-  /// \brief In same cases when the dependency check fails we can still
-  /// vectorize the loop with a dynamic array access check.
-  bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
-
-private:
-  ScalarEvolution *SE;
-  const DataLayout *DL;
-  const Loop *InnermostLoop;
-
-  /// \brief Maps access locations (ptr, read/write) to program order.
-  DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
-
-  /// \brief Memory access instructions in program order.
-  SmallVector<Instruction *, 16> InstMap;
-
-  /// \brief The program order index to be used for the next instruction.
-  unsigned AccessIdx;
-
-  // We can access this many bytes in parallel safely.
-  unsigned MaxSafeDepDistBytes;
-
-  /// \brief If we see a non-constant dependence distance we can still try to
-  /// vectorize this loop with runtime checks.
-  bool ShouldRetryWithRuntimeCheck;
-
-  /// \brief Check whether there is a plausible dependence between the two
-  /// accesses.
-  ///
-  /// Access \p A must happen before \p B in program order. The two indices
-  /// identify the index into the program order map.
-  ///
-  /// This function checks  whether there is a plausible dependence (or the
-  /// absence of such can't be proved) between the two accesses. If there is a
-  /// plausible dependence but the dependence distance is bigger than one
-  /// element access it records this distance in \p MaxSafeDepDistBytes (if this
-  /// distance is smaller than any other distance encountered so far).
-  /// Otherwise, this function returns true signaling a possible dependence.
-  bool isDependent(const MemAccessInfo &A, unsigned AIdx,
-                   const MemAccessInfo &B, unsigned BIdx,
-                   ValueToValueMap &Strides);
-
-  /// \brief Check whether the data dependence could prevent store-load
-  /// forwarding.
-  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
-};
-
-} // end anonymous namespace
-
-static bool isInBoundsGep(Value *Ptr) {
-  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
-    return GEP->isInBounds();
-  return false;
-}
-
-/// \brief Check whether the access through \p Ptr has a constant stride.
-static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
-                        const Loop *Lp, ValueToValueMap &StridesMap) {
-  const Type *Ty = Ptr->getType();
-  assert(Ty->isPointerTy() && "Unexpected non-ptr");
-
-  // Make sure that the pointer does not point to aggregate types.
-  const PointerType *PtrTy = cast<PointerType>(Ty);
-  if (PtrTy->getElementType()->isAggregateType()) {
-    DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr <<
-          "\n");
-    return 0;
-  }
-
-  const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr);
-
-  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
-  if (!AR) {
-    DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "
-          << *Ptr << " SCEV: " << *PtrScev << "\n");
-    return 0;
-  }
-
-  // The accesss function must stride over the innermost loop.
-  if (Lp != AR->getLoop()) {
-    DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " <<
-          *Ptr << " SCEV: " << *PtrScev << "\n");
-  }
-
-  // The address calculation must not wrap. Otherwise, a dependence could be
-  // inverted.
-  // An inbounds getelementptr that is a AddRec with a unit stride
-  // cannot wrap per definition. The unit stride requirement is checked later.
-  // An getelementptr without an inbounds attribute and unit stride would have
-  // to access the pointer value "0" which is undefined behavior in address
-  // space 0, therefore we can also vectorize this case.
-  bool IsInBoundsGEP = isInBoundsGep(Ptr);
-  bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
-  bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
-  if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
-    DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
-          << *Ptr << " SCEV: " << *PtrScev << "\n");
-    return 0;
-  }
-
-  // Check the step is constant.
-  const SCEV *Step = AR->getStepRecurrence(*SE);
-
-  // Calculate the pointer stride and check if it is consecutive.
-  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
-  if (!C) {
-    DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr <<
-          " SCEV: " << *PtrScev << "\n");
-    return 0;
-  }
-
-  int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
-  const APInt &APStepVal = C->getValue()->getValue();
-
-  // Huge step value - give up.
-  if (APStepVal.getBitWidth() > 64)
-    return 0;
-
-  int64_t StepVal = APStepVal.getSExtValue();
-
-  // Strided access.
-  int64_t Stride = StepVal / Size;
-  int64_t Rem = StepVal % Size;
-  if (Rem)
-    return 0;
-
-  // If the SCEV could wrap but we have an inbounds gep with a unit stride we
-  // know we can't "wrap around the address space". In case of address space
-  // zero we know that this won't happen without triggering undefined behavior.
-  if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) &&
-      Stride != 1 && Stride != -1)
-    return 0;
-
-  return Stride;
-}
-
-bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
-                                                    unsigned TypeByteSize) {
-  // If loads occur at a distance that is not a multiple of a feasible vector
-  // factor store-load forwarding does not take place.
-  // Positive dependences might cause troubles because vectorizing them might
-  // prevent store-load forwarding making vectorized code run a lot slower.
-  //   a[i] = a[i-3] ^ a[i-8];
-  //   The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
-  //   hence on your typical architecture store-load forwarding does not take
-  //   place. Vectorizing in such cases does not make sense.
-  // Store-load forwarding distance.
-  const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
-  // Maximum vector factor.
-  unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize;
-  if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
-    MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
-
-  for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
-       vf *= 2) {
-    if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
-      MaxVFWithoutSLForwardIssues = (vf >>=1);
-      break;
-    }
-  }
-
-  if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) {
-    DEBUG(dbgs() << "LV: Distance " << Distance <<
-          " that could cause a store-load forwarding conflict\n");
-    return true;
-  }
-
-  if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
-      MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize)
-    MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
-  return false;
-}
-
-bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
-                                   const MemAccessInfo &B, unsigned BIdx,
-                                   ValueToValueMap &Strides) {
-  assert (AIdx < BIdx && "Must pass arguments in program order");
-
-  Value *APtr = A.getPointer();
-  Value *BPtr = B.getPointer();
-  bool AIsWrite = A.getInt();
-  bool BIsWrite = B.getInt();
-
-  // Two reads are independent.
-  if (!AIsWrite && !BIsWrite)
-    return false;
-
-  // We cannot check pointers in different address spaces.
-  if (APtr->getType()->getPointerAddressSpace() !=
-      BPtr->getType()->getPointerAddressSpace())
-    return true;
-
-  const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
-  const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);
-
-  int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides);
-  int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides);
-
-  const SCEV *Src = AScev;
-  const SCEV *Sink = BScev;
-
-  // If the induction step is negative we have to invert source and sink of the
-  // dependence.
-  if (StrideAPtr < 0) {
-    //Src = BScev;
-    //Sink = AScev;
-    std::swap(APtr, BPtr);
-    std::swap(Src, Sink);
-    std::swap(AIsWrite, BIsWrite);
-    std::swap(AIdx, BIdx);
-    std::swap(StrideAPtr, StrideBPtr);
-  }
-
-  const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
-
-  DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink
-        << "(Induction step: " << StrideAPtr <<  ")\n");
-  DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
-        << *InstMap[BIdx] << ": " << *Dist << "\n");
-
-  // Need consecutive accesses. We don't want to vectorize
-  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
-  // the address space.
-  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
-    DEBUG(dbgs() << "Non-consecutive pointer access\n");
-    return true;
-  }
-
-  const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
-  if (!C) {
-    DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n");
-    ShouldRetryWithRuntimeCheck = true;
-    return true;
-  }
-
-  Type *ATy = APtr->getType()->getPointerElementType();
-  Type *BTy = BPtr->getType()->getPointerElementType();
-  unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
-
-  // Negative distances are not plausible dependencies.
-  const APInt &Val = C->getValue()->getValue();
-  if (Val.isNegative()) {
-    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
-    if (IsTrueDataDependence &&
-        (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
-         ATy != BTy))
-      return true;
-
-    DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
-    return false;
-  }
-
-  // Write to the same location with the same size.
-  // Could be improved to assert type sizes are the same (i32 == float, etc).
-  if (Val == 0) {
-    if (ATy == BTy)
-      return false;
-    DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
-    return true;
-  }
-
-  assert(Val.isStrictlyPositive() && "Expect a positive value");
-
-  // Positive distance bigger than max vectorization factor.
-  if (ATy != BTy) {
-    DEBUG(dbgs() <<
-          "LV: ReadWrite-Write positive dependency with different types\n");
-    return false;
-  }
-
-  unsigned Distance = (unsigned) Val.getZExtValue();
-
-  // Bail out early if passed-in parameters make vectorization not feasible.
-  unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
-  unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1;
-
-  // The distance must be bigger than the size needed for a vectorized version
-  // of the operation and the size of the vectorized operation must not be
-  // bigger than the currrent maximum size.
-  if (Distance < 2*TypeByteSize ||
-      2*TypeByteSize > MaxSafeDepDistBytes ||
-      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
-    DEBUG(dbgs() << "LV: Failure because of Positive distance "
-        << Val.getSExtValue() << '\n');
-    return true;
-  }
-
-  MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
-    Distance : MaxSafeDepDistBytes;
-
-  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
-  if (IsTrueDataDependence &&
-      couldPreventStoreLoadForward(Distance, TypeByteSize))
-     return true;
-
-  DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
-        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
-
-  return false;
-}
-
-bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
-                                   MemAccessInfoSet &CheckDeps,
-                                   ValueToValueMap &Strides) {
-
-  MaxSafeDepDistBytes = -1U;
-  while (!CheckDeps.empty()) {
-    MemAccessInfo CurAccess = *CheckDeps.begin();
-
-    // Get the relevant memory access set.
-    EquivalenceClasses<MemAccessInfo>::iterator I =
-      AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
-
-    // Check accesses within this set.
-    EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
-    AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
-
-    // Check every access pair.
-    while (AI != AE) {
-      CheckDeps.erase(*AI);
-      EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
-      while (OI != AE) {
-        // Check every accessing instruction pair in program order.
-        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
-             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
-          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
-               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
-            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides))
-              return false;
-            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides))
-              return false;
-          }
-        ++OI;
-      }
-      AI++;
-    }
-  }
-  return true;
-}
-
 bool LoopVectorizationLegality::canVectorizeMemory() {
-
-  typedef SmallVector<Value*, 16> ValueVector;
-  typedef SmallPtrSet<Value*, 16> ValueSet;
-
-  // Holds the Load and Store *instructions*.
-  ValueVector Loads;
-  ValueVector Stores;
-
-  // Holds all the different accesses in the loop.
-  unsigned NumReads = 0;
-  unsigned NumReadWrites = 0;
-
-  PtrRtCheck.Pointers.clear();
-  PtrRtCheck.Need = false;
-
-  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
-  MemoryDepChecker DepChecker(SE, DL, TheLoop);
-
-  // For each block.
-  for (Loop::block_iterator bb = TheLoop->block_begin(),
-       be = TheLoop->block_end(); bb != be; ++bb) {
-
-    // Scan the BB and collect legal loads and stores.
-    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
-         ++it) {
-
-      // If this is a load, save it. If this instruction can read from memory
-      // but is not a load, then we quit. Notice that we don't handle function
-      // calls that read or write.
-      if (it->mayReadFromMemory()) {
-        // Many math library functions read the rounding mode. We will only
-        // vectorize a loop if it contains known function calls that don't set
-        // the flag. Therefore, it is safe to ignore this read from memory.
-        CallInst *Call = dyn_cast<CallInst>(it);
-        if (Call && getIntrinsicIDForCall(Call, TLI))
-          continue;
-
-        LoadInst *Ld = dyn_cast<LoadInst>(it);
-        if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
-          emitAnalysis(Report(Ld)
-                       << "read with atomic ordering or volatile read");
-          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
-          return false;
-        }
-        NumLoads++;
-        Loads.push_back(Ld);
-        DepChecker.addAccess(Ld);
-        continue;
-      }
-
-      // Save 'store' instructions. Abort if other instructions write to memory.
-      if (it->mayWriteToMemory()) {
-        StoreInst *St = dyn_cast<StoreInst>(it);
-        if (!St) {
-          emitAnalysis(Report(it) << "instruction cannot be vectorized");
-          return false;
-        }
-        if (!St->isSimple() && !IsAnnotatedParallel) {
-          emitAnalysis(Report(St)
-                       << "write with atomic ordering or volatile write");
-          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
-          return false;
-        }
-        NumStores++;
-        Stores.push_back(St);
-        DepChecker.addAccess(St);
-      }
-    } // Next instr.
-  } // Next block.
-
-  // Now we have two lists that hold the loads and the stores.
-  // Next, we find the pointers that they use.
-
-  // Check if we see any stores. If there are no stores, then we don't
-  // care if the pointers are *restrict*.
-  if (!Stores.size()) {
-    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
-    return true;
-  }
-
-  AccessAnalysis::DepCandidates DependentAccesses;
-  AccessAnalysis Accesses(DL, AA, DependentAccesses);
-
-  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
-  // multiple times on the same object. If the ptr is accessed twice, once
-  // for read and once for write, it will only appear once (on the write
-  // list). This is okay, since we are going to check for conflicts between
-  // writes and between reads and writes, but not between reads and reads.
-  ValueSet Seen;
-
-  ValueVector::iterator I, IE;
-  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
-    StoreInst *ST = cast<StoreInst>(*I);
-    Value* Ptr = ST->getPointerOperand();
-
-    if (isUniform(Ptr)) {
-      emitAnalysis(
-          Report(ST)
-          << "write to a loop invariant address could not be vectorized");
-      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
-      return false;
-    }
-
-    // If we did *not* see this pointer before, insert it to  the read-write
-    // list. At this phase it is only a 'write' list.
-    if (Seen.insert(Ptr).second) {
-      ++NumReadWrites;
-
-      AliasAnalysis::Location Loc = AA->getLocation(ST);
-      // The TBAA metadata could have a control dependency on the predication
-      // condition, so we cannot rely on it when determining whether or not we
-      // need runtime pointer checks.
-      if (blockNeedsPredication(ST->getParent()))
-        Loc.AATags.TBAA = nullptr;
-
-      Accesses.addStore(Loc);
-    }
-  }
-
-  if (IsAnnotatedParallel) {
-    DEBUG(dbgs()
-          << "LV: A loop annotated parallel, ignore memory dependency "
-          << "checks.\n");
-    return true;
-  }
-
-  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
-    LoadInst *LD = cast<LoadInst>(*I);
-    Value* Ptr = LD->getPointerOperand();
-    // If we did *not* see this pointer before, insert it to the
-    // read list. If we *did* see it before, then it is already in
-    // the read-write list. This allows us to vectorize expressions
-    // such as A[i] += x;  Because the address of A[i] is a read-write
-    // pointer. This only works if the index of A[i] is consecutive.
-    // If the address of i is unknown (for example A[B[i]]) then we may
-    // read a few words, modify, and write a few words, and some of the
-    // words may be written to the same address.
-    bool IsReadOnlyPtr = false;
-    if (Seen.insert(Ptr).second ||
-        !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
-      ++NumReads;
-      IsReadOnlyPtr = true;
-    }
-
-    AliasAnalysis::Location Loc = AA->getLocation(LD);
-    // The TBAA metadata could have a control dependency on the predication
-    // condition, so we cannot rely on it when determining whether or not we
-    // need runtime pointer checks.
-    if (blockNeedsPredication(LD->getParent()))
-      Loc.AATags.TBAA = nullptr;
-
-    Accesses.addLoad(Loc, IsReadOnlyPtr);
-  }
-
-  // If we write (or read-write) to a single destination and there are no
-  // other reads in this loop then is it safe to vectorize.
-  if (NumReadWrites == 1 && NumReads == 0) {
-    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
-    return true;
-  }
-
-  // Build dependence sets and check whether we need a runtime pointer bounds
-  // check.
-  Accesses.buildDependenceSets();
-  bool NeedRTCheck = Accesses.isRTCheckNeeded();
-
-  // Find pointers with computable bounds. We are going to use this information
-  // to place a runtime bound check.
-  unsigned NumComparisons = 0;
-  bool CanDoRT = false;
-  if (NeedRTCheck)
-    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop,
-                                       Strides);
-
-  DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
-        " pointer comparisons.\n");
-
-  // If we only have one set of dependences to check pointers among we don't
-  // need a runtime check.
-  if (NumComparisons == 0 && NeedRTCheck)
-    NeedRTCheck = false;
-
-  // Check that we did not collect too many pointers or found an unsizeable
-  // pointer.
-  if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
-    PtrRtCheck.reset();
-    CanDoRT = false;
-  }
-
-  if (CanDoRT) {
-    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
-  }
-
-  if (NeedRTCheck && !CanDoRT) {
-    emitAnalysis(Report() << "cannot identify array bounds");
-    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
-          "the array bounds.\n");
-    PtrRtCheck.reset();
-    return false;
-  }
-
-  PtrRtCheck.Need = NeedRTCheck;
-
-  bool CanVecMem = true;
-  if (Accesses.isDependencyCheckNeeded()) {
-    DEBUG(dbgs() << "LV: Checking memory dependencies\n");
-    CanVecMem = DepChecker.areDepsSafe(
-        DependentAccesses, Accesses.getDependenciesToCheck(), Strides);
-    MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
-
-    if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
-      DEBUG(dbgs() << "LV: Retrying with memory checks\n");
-      NeedRTCheck = true;
-
-      // Clear the dependency checks. We assume they are not needed.
-      Accesses.resetDepChecks();
-
-      PtrRtCheck.reset();
-      PtrRtCheck.Need = true;
-
-      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
-                                         TheLoop, Strides, true);
-      // Check that we did not collect too many pointers or found an unsizeable
-      // pointer.
-      if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
-        if (!CanDoRT && NumComparisons > 0)
-          emitAnalysis(Report()
-                       << "cannot check memory dependencies at runtime");
-        else
-          emitAnalysis(Report()
-                       << NumComparisons << " exceeds limit of "
-                       << RuntimeMemoryCheckThreshold
-                       << " dependent memory operations checked at runtime");
-        DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
-        PtrRtCheck.reset();
-        return false;
-      }
-
-      CanVecMem = true;
-    }
-  }
-
-  if (!CanVecMem)
-    emitAnalysis(Report() << "unsafe dependent memory operations in loop");
-
-  DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
-        " need a runtime memory check.\n");
-
-  return CanVecMem;
+  LAI = &LAA->getInfo(TheLoop, Strides);
+  auto &OptionalReport = LAI->getReport();
+  if (OptionalReport)
+    emitAnalysis(VectorizationReport(*OptionalReport));
+  return LAI->canVectorizeMemory();
 }
 
 static bool hasMultipleUsesOf(Instruction *I,
@@ -5236,7 +4129,8 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
 }
 
 LoopVectorizationLegality::InductionKind
-LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+LoopVectorizationLegality::isInductionVariable(PHINode *Phi,
+                                               ConstantInt *&StepValue) {
   Type *PhiTy = Phi->getType();
   // We only handle integer and pointer inductions variables.
   if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
@@ -5249,22 +4143,19 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
     DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
     return IK_NoInduction;
   }
-  const SCEV *Step = AR->getStepRecurrence(*SE);
-
-  // Integer inductions need to have a stride of one.
-  if (PhiTy->isIntegerTy()) {
-    if (Step->isOne())
-      return IK_IntInduction;
-    if (Step->isAllOnesValue())
-      return IK_ReverseIntInduction;
-    return IK_NoInduction;
-  }
 
+  const SCEV *Step = AR->getStepRecurrence(*SE);
   // Calculate the pointer stride and check if it is consecutive.
   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
   if (!C)
     return IK_NoInduction;
 
+  ConstantInt *CV = C->getValue();
+  if (PhiTy->isIntegerTy()) {
+    StepValue = CV;
+    return IK_IntInduction;
+  }
+
   assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
   Type *PointerElementType = PhiTy->getPointerElementType();
   // The pointer stride cannot be determined if the pointer element type is not
@@ -5272,13 +4163,12 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
   if (!PointerElementType->isSized())
     return IK_NoInduction;
 
-  uint64_t Size = DL->getTypeAllocSize(PointerElementType);
-  if (C->getValue()->equalsInt(Size))
-    return IK_PtrInduction;
-  else if (C->getValue()->equalsInt(0 - Size))
-    return IK_ReversePtrInduction;
-
-  return IK_NoInduction;
+  int64_t Size = static_cast<int64_t>(DL->getTypeAllocSize(PointerElementType));
+  int64_t CVSize = CV->getSExtValue();
+  if (CVSize % Size)
+    return IK_NoInduction;
+  StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size);
+  return IK_PtrInduction;
 }
 
 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
@@ -5291,21 +4181,32 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
 }
 
 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
-  assert(TheLoop->contains(BB) && "Unknown block used");
-
-  // Blocks that do not dominate the latch need predication.
-  BasicBlock* Latch = TheLoop->getLoopLatch();
-  return !DT->dominates(BB, Latch);
+  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 }
 
 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
                                            SmallPtrSetImpl<Value *> &SafePtrs) {
+  
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    // Check that we don't have a constant expression that can trap as operand.
+    for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
+         OI != OE; ++OI) {
+      if (Constant *C = dyn_cast<Constant>(*OI))
+        if (C->canTrap())
+          return false;
+    }
     // We might be able to hoist the load.
     if (it->mayReadFromMemory()) {
       LoadInst *LI = dyn_cast<LoadInst>(it);
-      if (!LI || !SafePtrs.count(LI->getPointerOperand()))
+      if (!LI)
         return false;
+      if (!SafePtrs.count(LI->getPointerOperand())) {
+        if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
+          MaskedOp.insert(LI);
+          continue;
+        }
+        return false;
+      }
     }
 
     // We don't predicate stores at the moment.
@@ -5313,22 +4214,30 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
       StoreInst *SI = dyn_cast<StoreInst>(it);
       // We only support predication of stores in basic blocks with one
       // predecessor.
-      if (!SI || ++NumPredStores > NumberOfStoresToPredicate ||
-          !SafePtrs.count(SI->getPointerOperand()) ||
-          !SI->getParent()->getSinglePredecessor())
+      if (!SI)
+        return false;
+
+      bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
+      bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
+      
+      if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
+          !isSinglePredecessor) {
+        // Build a masked store if it is legal for the target, otherwise scalarize
+        // the block.
+        bool isLegalMaskedOp =
+          isLegalMaskedStore(SI->getValueOperand()->getType(),
+                             SI->getPointerOperand());
+        if (isLegalMaskedOp) {
+          --NumPredStores;
+          MaskedOp.insert(SI);
+          continue;
+        }
         return false;
+      }
     }
     if (it->mayThrow())
       return false;
 
-    // Check that we don't have a constant expression that can trap as operand.
-    for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
-         OI != OE; ++OI) {
-      if (Constant *C = dyn_cast<Constant>(*OI))
-        if (C->canTrap())
-          return false;
-    }
-
     // The instructions below can trap.
     switch (it->getOpcode()) {
     default: continue;
@@ -5336,7 +4245,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
     case Instruction::SDiv:
     case Instruction::URem:
     case Instruction::SRem:
-             return false;
+      return false;
     }
   }
 
@@ -5348,13 +4257,17 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
   // Width 1 means no vectorize
   VectorizationFactor Factor = { 1U, 0U };
   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
-    emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os");
+    emitAnalysis(VectorizationReport() <<
+                 "runtime pointer checks needed. Enable vectorization of this "
+                 "loop with '#pragma clang loop vectorize(enable)' when "
+                 "compiling with -Os");
     DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
     return Factor;
   }
 
-  if (!EnableCondStoresVectorization && Legal->NumPredStores) {
-    emitAnalysis(Report() << "store that is conditionally executed prevents vectorization");
+  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
+    emitAnalysis(VectorizationReport() <<
+                 "store that is conditionally executed prevents vectorization");
     DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
     return Factor;
   }
@@ -5380,7 +4293,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
     MaxVectorSize = 1;
   }
 
-  assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
+  assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
          " into one vector!");
 
   unsigned VF = MaxVectorSize;
@@ -5389,7 +4302,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
   if (OptForSize) {
     // If we are unable to calculate the trip count then don't try to vectorize.
     if (TC < 2) {
-      emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow");
+      emitAnalysis
+        (VectorizationReport() <<
+         "unable to calculate the loop count due to complex control flow");
       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
       return Factor;
     }
@@ -5403,10 +4318,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
     // If the trip count that we found modulo the vectorization factor is not
     // zero then we require a tail.
     if (VF < 2) {
-      emitAnalysis(Report() << "cannot optimize for size and vectorize at the "
-                               "same time. Enable vectorization of this loop "
-                               "with '#pragma clang loop vectorize(enable)' "
-                               "when compiling with -Os");
+      emitAnalysis(VectorizationReport() <<
+                   "cannot optimize for size and vectorize at the "
+                   "same time. Enable vectorization of this loop "
+                   "with '#pragma clang loop vectorize(enable)' "
+                   "when compiling with -Os");
       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
       return Factor;
     }
@@ -5619,8 +4535,10 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
 
     // Unroll until store/load ports (estimated by max unroll factor) are
     // saturated.
-    unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
-    unsigned LoadsUF = UF /  (Legal->NumLoads ? Legal->NumLoads : 1);
+    unsigned NumStores = Legal->getNumStores();
+    unsigned NumLoads = Legal->getNumLoads();
+    unsigned StoresUF = UF / (NumStores ? NumStores : 1);
+    unsigned LoadsUF = UF /  (NumLoads ? NumLoads : 1);
 
     // If we have a scalar reduction (vector reductions are already dealt with
     // by this point), we can increase the critical path length if the loop
@@ -6008,7 +4926,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
 
     // Wide load/stores.
     unsigned Cost = TTI.getAddressComputationCost(VectorTy);
-    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+    if (Legal->isMaskRequired(I))
+      Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
+                                        AS);
+    else
+      Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
 
     if (Reverse)
       Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
@@ -6081,15 +5003,16 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
 char LoopVectorize::ID = 0;
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
 
 namespace llvm {
@@ -6186,7 +5109,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
                                ConstantInt::get(Cond[Part]->getType(), 1));
       CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
       LoopVectorBody.push_back(CondBlock);
-      VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+      VectorLp->addBasicBlockToLoop(CondBlock, *LI);
       // Update Builder with newly created basic block.
       Builder.SetInsertPoint(InsertPt);
     }
@@ -6212,7 +5135,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
       if (IfPredicateStore) {
         BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
         LoopVectorBody.push_back(NewIfBlock);
-        VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+        VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
         Builder.SetInsertPoint(InsertPt);
         Instruction *OldBr = IfBlock->getTerminator();
         BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
@@ -6237,11 +5160,10 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
   return V;
 }
 
-Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
-                                               bool Negate) {
+Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
   // When unrolling and the VF is 1, we only need to add a simple scalar.
   Type *ITy = Val->getType();
   assert(!ITy->isVectorTy() && "Val must be a scalar");
-  Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
-  return Builder.CreateAdd(Val, C, "induction");
+  Constant *C = ConstantInt::get(ITy, StartIdx);
+  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
 }
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 44bfea1..baf9741 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19,9 +19,10 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -74,6 +75,27 @@ static const unsigned MinVecRegSize = 128;
 
 static const unsigned RecursionMaxDepth = 12;
 
+// Limit the number of alias checks. The limit is chosen so that
+// it has no negative effect on the llvm benchmarks.
+static const unsigned AliasedCheckLimit = 10;
+
+// Another limit for the alias checks: The maximum distance between load/store
+// instructions where alias checks are done.
+// This limit is useful for very large basic blocks.
+static const unsigned MaxMemDepDistance = 160;
+
+/// \brief Predicate for the element types that the SLP vectorizer supports.
+///
+/// The most important thing to filter here are types which are invalid in LLVM
+/// vectors. We also filter target specific types which have absolutely no
+/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
+/// avoids spending time checking the cost model and realizing that they will
+/// be inevitably scalarized.
+static bool isValidElementType(Type *Ty) {
+  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+         !Ty->isPPC_FP128Ty();
+}
+
 /// \returns the parent basic block if all of the instructions in \p VL
 /// are in the same block or null otherwise.
 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
@@ -207,6 +229,8 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
         MD = MDNode::getMostGenericTBAA(MD, IMD);
         break;
       case LLVMContext::MD_alias_scope:
+        MD = MDNode::getMostGenericAliasScope(MD, IMD);
+        break;
       case LLVMContext::MD_noalias:
         MD = MDNode::intersect(MD, IMD);
         break;
@@ -263,104 +287,6 @@ static bool CanReuseExtract(ArrayRef<Value *> VL) {
   return true;
 }
 
-static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
-                                           SmallVectorImpl<Value *> &Left,
-                                           SmallVectorImpl<Value *> &Right) {
-
-  SmallVector<Value *, 16> OrigLeft, OrigRight;
-
-  bool AllSameOpcodeLeft = true;
-  bool AllSameOpcodeRight = true;
-  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
-    Instruction *I = cast<Instruction>(VL[i]);
-    Value *V0 = I->getOperand(0);
-    Value *V1 = I->getOperand(1);
-
-    OrigLeft.push_back(V0);
-    OrigRight.push_back(V1);
-
-    Instruction *I0 = dyn_cast<Instruction>(V0);
-    Instruction *I1 = dyn_cast<Instruction>(V1);
-
-    // Check whether all operands on one side have the same opcode. In this case
-    // we want to preserve the original order and not make things worse by
-    // reordering.
-    AllSameOpcodeLeft = I0;
-    AllSameOpcodeRight = I1;
-
-    if (i && AllSameOpcodeLeft) {
-      if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
-        if(P0->getOpcode() != I0->getOpcode())
-          AllSameOpcodeLeft = false;
-      } else
-        AllSameOpcodeLeft = false;
-    }
-    if (i && AllSameOpcodeRight) {
-      if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
-        if(P1->getOpcode() != I1->getOpcode())
-          AllSameOpcodeRight = false;
-      } else
-        AllSameOpcodeRight = false;
-    }
-
-    // Sort two opcodes. In the code below we try to preserve the ability to use
-    // broadcast of values instead of individual inserts.
-    // vl1 = load
-    // vl2 = phi
-    // vr1 = load
-    // vr2 = vr2
-    //    = vl1 x vr1
-    //    = vl2 x vr2
-    // If we just sorted according to opcode we would leave the first line in
-    // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
-    //    = vl1 x vr1
-    //    = vr2 x vl2
-    // Because vr2 and vr1 are from the same load we loose the opportunity of a
-    // broadcast for the packed right side in the backend: we have [vr1, vl2]
-    // instead of [vr1, vr2=vr1].
-    if (I0 && I1) {
-       if(!i && I0->getOpcode() > I1->getOpcode()) {
-         Left.push_back(I1);
-         Right.push_back(I0);
-       } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
-         // Try not to destroy a broad cast for no apparent benefit.
-         Left.push_back(I1);
-         Right.push_back(I0);
-       } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] ==  I0) {
-         // Try preserve broadcasts.
-         Left.push_back(I1);
-         Right.push_back(I0);
-       } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
-         // Try preserve broadcasts.
-         Left.push_back(I1);
-         Right.push_back(I0);
-       } else {
-         Left.push_back(I0);
-         Right.push_back(I1);
-       }
-       continue;
-    }
-    // One opcode, put the instruction on the right.
-    if (I0) {
-      Left.push_back(V1);
-      Right.push_back(I0);
-      continue;
-    }
-    Left.push_back(V0);
-    Right.push_back(V1);
-  }
-
-  bool LeftBroadcast = isSplat(Left);
-  bool RightBroadcast = isSplat(Right);
-
-  // Don't reorder if the operands where good to begin with.
-  if (!(LeftBroadcast || RightBroadcast) &&
-      (AllSameOpcodeRight || AllSameOpcodeLeft)) {
-    Left = OrigLeft;
-    Right = OrigRight;
-  }
-}
-
 /// \returns True if in-tree use also needs extract. This refers to
 /// possible scalar operand in vectorized instruction.
 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
@@ -388,6 +314,26 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
   }
 }
 
+/// \returns the AA location that is being access by the instruction.
+static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return AA->getLocation(SI);
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return AA->getLocation(LI);
+  return AliasAnalysis::Location();
+}
+
+/// \returns True if the instruction is not a volatile or atomic load/store.
+static bool isSimple(Instruction *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->isSimple();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isSimple();
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+    return !MI->isVolatile();
+  return true;
+}
+
 /// Bottom Up SLP Vectorizer.
 class BoUpSLP {
 public:
@@ -398,11 +344,11 @@ public:
 
   BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
           TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
-          LoopInfo *Li, DominatorTree *Dt, AssumptionTracker *AT)
-      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0),
-        F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
+          LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC)
+      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
+        SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
         Builder(Se->getContext()) {
-    CodeMetrics::collectEphemeralValues(F, AT, EphValues);
+    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
   }
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
@@ -494,6 +440,16 @@ private:
   /// be beneficial even the tree height is tiny.
   bool isFullyVectorizableTinyTree();
 
+  /// \reorder commutative operands in alt shuffle if they result in
+  ///  vectorized code.
+  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+                                 SmallVectorImpl<Value *> &Left,
+                                 SmallVectorImpl<Value *> &Right);
+  /// \reorder commutative operands to get better probability of
+  /// generating vectorized code.
+  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+                                      SmallVectorImpl<Value *> &Left,
+                                      SmallVectorImpl<Value *> &Right);
   struct TreeEntry {
     TreeEntry() : Scalars(), VectorizedValue(nullptr),
     NeedToGather(0) {}
@@ -555,6 +511,52 @@ private:
   };
   typedef SmallVector<ExternalUser, 16> UserList;
 
+  /// Checks if two instructions may access the same memory.
+  ///
+  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
+  /// is invariant in the calling loop.
+  bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1,
+                 Instruction *Inst2) {
+
+    // First check if the result is already in the cache.
+    AliasCacheKey key = std::make_pair(Inst1, Inst2);
+    Optional<bool> &result = AliasCache[key];
+    if (result.hasValue()) {
+      return result.getValue();
+    }
+    AliasAnalysis::Location Loc2 = getLocation(Inst2, AA);
+    bool aliased = true;
+    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
+      // Do the alias check.
+      aliased = AA->alias(Loc1, Loc2);
+    }
+    // Store the result in the cache.
+    result = aliased;
+    return aliased;
+  }
+
+  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
+
+  /// Cache for alias results.
+  /// TODO: consider moving this to the AliasAnalysis itself.
+  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+
+  /// Removes an instruction from its block and eventually deletes it.
+  /// It's like Instruction::eraseFromParent() except that the actual deletion
+  /// is delayed until BoUpSLP is destructed.
+  /// This is required to ensure that there are no incorrect collisions in the
+  /// AliasCache, which can happen if a new instruction is allocated at the
+  /// same address as a previously deleted instruction.
+  void eraseInstruction(Instruction *I) {
+    I->removeFromParent();
+    I->dropAllReferences();
+    DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
+  }
+
+  /// Temporary store for deleted instructions. Instructions will be deleted
+  /// eventually when the BoUpSLP is destructed.
+  SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
+
   /// A list of values that need to extracted out of the tree.
   /// This list holds pairs of (Internal Scalar : External User).
   UserList ExternalUses;
@@ -791,7 +793,7 @@ private:
     /// Checks if a bundle of instructions can be scheduled, i.e. has no
     /// cyclic dependencies. This is only a dry-run, no instructions are
     /// actually moved at this stage.
-    bool tryScheduleBundle(ArrayRef<Value *> VL, AliasAnalysis *AA);
+    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
 
     /// Un-bundles a group of instructions.
     void cancelScheduling(ArrayRef<Value *> VL);
@@ -808,7 +810,7 @@ private:
     /// Updates the dependency information of a bundle and of all instructions/
     /// bundles which depend on the original bundle.
     void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
-                               AliasAnalysis *AA);
+                               BoUpSLP *SLP);
 
     /// Sets all instruction in the scheduling region to un-scheduled.
     void resetSchedule();
@@ -857,7 +859,7 @@ private:
   };
 
   /// Attaches the BlockScheduling structures to basic blocks.
-  DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
 
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
@@ -1031,11 +1033,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
     }
   }
 
-  // If any of the scalars appears in the table OR it is marked as a value that
-  // needs to stat scalar then we need to gather the scalars.
+  // If any of the scalars is marked as a value that needs to stay scalar then
+  // we need to gather the scalars.
   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
-    if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
-      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
+    if (MustGather.count(VL[i])) {
+      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
       newTreeEntry(VL, false);
       return;
     }
@@ -1069,7 +1071,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   }
   BlockScheduling &BS = *BSRef.get();
 
-  if (!BS.tryScheduleBundle(VL, AA)) {
+  if (!BS.tryScheduleBundle(VL, this)) {
     DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
     BS.cancelScheduling(VL);
     newTreeEntry(VL, false);
@@ -1158,7 +1160,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       Type *SrcTy = VL0->getOperand(0)->getType();
       for (unsigned i = 0; i < VL.size(); ++i) {
         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
-        if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
+        if (Ty != SrcTy || !isValidElementType(Ty)) {
           BS.cancelScheduling(VL);
           newTreeEntry(VL, false);
           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
@@ -1381,6 +1383,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       }
       newTreeEntry(VL, true);
       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+
+      // Reorder operands if reordering would enable vectorization.
+      if (isa<BinaryOperator>(VL0)) {
+        ValueList Left, Right;
+        reorderAltShuffleOperands(VL, Left, Right);
+        buildTree_rec(Left, Depth + 1);
+        buildTree_rec(Right, Depth + 1);
+        return;
+      }
+
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
@@ -1704,7 +1716,7 @@ int BoUpSLP::getTreeCost() {
 
   // We only vectorize tiny trees if it is fully vectorizable.
   if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
-    if (!VectorizableTree.size()) {
+    if (VectorizableTree.empty()) {
       assert(!ExternalUses.size() && "We should not have any external users");
     }
     return INT_MAX;
@@ -1818,6 +1830,195 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
   return X == PtrSCEVB;
 }
 
+// Reorder commutative operations in alternate shuffle if the resulting vectors
+// are consecutive loads. This would allow us to vectorize the tree.
+// If we have something like-
+// load a[0] - load b[0]
+// load b[1] + load a[1]
+// load a[2] - load b[2]
+// load a[3] + load b[3]
+// Reordering the second load b[1]  load a[1] would allow us to vectorize this
+// code.
+void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
+                                        SmallVectorImpl<Value *> &Left,
+                                        SmallVectorImpl<Value *> &Right) {
+
+  // Push left and right operands of binary operation into Left and Right
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
+    Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
+  }
+
+  // Reorder if we have a commutative operation and consecutive access
+  // are on either side of the alternate instructions.
+  for (unsigned j = 0; j < VL.size() - 1; ++j) {
+    if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
+      if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
+        Instruction *VL1 = cast<Instruction>(VL[j]);
+        Instruction *VL2 = cast<Instruction>(VL[j + 1]);
+        if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
+          std::swap(Left[j], Right[j]);
+          continue;
+        } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
+          std::swap(Left[j + 1], Right[j + 1]);
+          continue;
+        }
+        // else unchanged
+      }
+    }
+    if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
+      if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
+        Instruction *VL1 = cast<Instruction>(VL[j]);
+        Instruction *VL2 = cast<Instruction>(VL[j + 1]);
+        if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
+          std::swap(Left[j], Right[j]);
+          continue;
+        } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
+          std::swap(Left[j + 1], Right[j + 1]);
+          continue;
+        }
+        // else unchanged
+      }
+    }
+  }
+}
+
+void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+                                             SmallVectorImpl<Value *> &Left,
+                                             SmallVectorImpl<Value *> &Right) {
+
+  SmallVector<Value *, 16> OrigLeft, OrigRight;
+
+  bool AllSameOpcodeLeft = true;
+  bool AllSameOpcodeRight = true;
+  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+    Instruction *I = cast<Instruction>(VL[i]);
+    Value *VLeft = I->getOperand(0);
+    Value *VRight = I->getOperand(1);
+
+    OrigLeft.push_back(VLeft);
+    OrigRight.push_back(VRight);
+
+    Instruction *ILeft = dyn_cast<Instruction>(VLeft);
+    Instruction *IRight = dyn_cast<Instruction>(VRight);
+
+    // Check whether all operands on one side have the same opcode. In this case
+    // we want to preserve the original order and not make things worse by
+    // reordering.
+    if (i && AllSameOpcodeLeft && ILeft) {
+      if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
+        if (PLeft->getOpcode() != ILeft->getOpcode())
+          AllSameOpcodeLeft = false;
+      } else
+        AllSameOpcodeLeft = false;
+    }
+    if (i && AllSameOpcodeRight && IRight) {
+      if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
+        if (PRight->getOpcode() != IRight->getOpcode())
+          AllSameOpcodeRight = false;
+      } else
+        AllSameOpcodeRight = false;
+    }
+
+    // Sort two opcodes. In the code below we try to preserve the ability to use
+    // broadcast of values instead of individual inserts.
+    // vl1 = load
+    // vl2 = phi
+    // vr1 = load
+    // vr2 = vr2
+    //    = vl1 x vr1
+    //    = vl2 x vr2
+    // If we just sorted according to opcode we would leave the first line in
+    // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
+    //    = vl1 x vr1
+    //    = vr2 x vl2
+    // Because vr2 and vr1 are from the same load we loose the opportunity of a
+    // broadcast for the packed right side in the backend: we have [vr1, vl2]
+    // instead of [vr1, vr2=vr1].
+    if (ILeft && IRight) {
+      if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
+        Left.push_back(IRight);
+        Right.push_back(ILeft);
+      } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
+                 Right[i - 1] != IRight) {
+        // Try not to destroy a broad cast for no apparent benefit.
+        Left.push_back(IRight);
+        Right.push_back(ILeft);
+      } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
+                 Right[i - 1] == ILeft) {
+        // Try preserve broadcasts.
+        Left.push_back(IRight);
+        Right.push_back(ILeft);
+      } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
+                 Left[i - 1] == IRight) {
+        // Try preserve broadcasts.
+        Left.push_back(IRight);
+        Right.push_back(ILeft);
+      } else {
+        Left.push_back(ILeft);
+        Right.push_back(IRight);
+      }
+      continue;
+    }
+    // One opcode, put the instruction on the right.
+    if (ILeft) {
+      Left.push_back(VRight);
+      Right.push_back(ILeft);
+      continue;
+    }
+    Left.push_back(VLeft);
+    Right.push_back(VRight);
+  }
+
+  bool LeftBroadcast = isSplat(Left);
+  bool RightBroadcast = isSplat(Right);
+
+  // If operands end up being broadcast return this operand order.
+  if (LeftBroadcast || RightBroadcast)
+    return;
+
+  // Don't reorder if the operands where good to begin.
+  if (AllSameOpcodeRight || AllSameOpcodeLeft) {
+    Left = OrigLeft;
+    Right = OrigRight;
+  }
+
+  // Finally check if we can get longer vectorizable chain by reordering
+  // without breaking the good operand order detected above.
+  // E.g. If we have something like-
+  // load a[0]  load b[0]
+  // load b[1]  load a[1]
+  // load a[2]  load b[2]
+  // load a[3]  load b[3]
+  // Reordering the second load b[1]  load a[1] would allow us to vectorize
+  // this code and we still retain AllSameOpcode property.
+  // FIXME: This load reordering might break AllSameOpcode in some rare cases
+  // such as-
+  // add a[0],c[0]  load b[0]
+  // add a[1],c[2]  load b[1]
+  // b[2]           load b[2]
+  // add a[3],c[3]  load b[3]
+  for (unsigned j = 0; j < VL.size() - 1; ++j) {
+    if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
+      if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
+        if (isConsecutiveAccess(L, L1)) {
+          std::swap(Left[j + 1], Right[j + 1]);
+          continue;
+        }
+      }
+    }
+    if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
+      if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
+        if (isConsecutiveAccess(L, L1)) {
+          std::swap(Left[j + 1], Right[j + 1]);
+          continue;
+        }
+      }
+    }
+    // else unchanged
+  }
+}
+
 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
   Instruction *VL0 = cast<Instruction>(VL[0]);
   BasicBlock::iterator NextInst = VL0;
@@ -2214,10 +2415,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     }
     case Instruction::ShuffleVector: {
       ValueList LHSVL, RHSVL;
-      for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
-        LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
-        RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
-      }
+      assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
+      reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
       setInsertPointAfterBundle(E->Scalars);
 
       Value *LHS = vectorizeTree(LHSVL);
@@ -2360,7 +2559,7 @@ Value *BoUpSLP::vectorizeTree() {
         Scalar->replaceAllUsesWith(Undef);
       }
       DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
-      cast<Instruction>(Scalar)->eraseFromParent();
+      eraseInstruction(cast<Instruction>(Scalar));
     }
   }
 
@@ -2442,7 +2641,7 @@ void BoUpSLP::optimizeGatherSequence() {
         if (In->isIdenticalTo(*v) &&
             DT->dominates((*v)->getParent(), In->getParent())) {
           In->replaceAllUsesWith(*v);
-          In->eraseFromParent();
+          eraseInstruction(In);
           In = nullptr;
           break;
         }
@@ -2460,7 +2659,7 @@ void BoUpSLP::optimizeGatherSequence() {
 // Groups the instructions to a bundle (which is then a single scheduling entity)
 // and schedules instructions until the bundle gets ready.
 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
-                                                 AliasAnalysis *AA) {
+                                                 BoUpSLP *SLP) {
   if (isa<PHINode>(VL[0]))
     return true;
 
@@ -2517,7 +2716,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
   DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
                << BB->getName() << "\n");
 
-  calculateDependencies(Bundle, true, AA);
+  calculateDependencies(Bundle, true, SLP);
 
   // Now try to schedule the new bundle. As soon as the bundle is "ready" it
   // means that there are no cyclic dependencies and we can schedule it.
@@ -2648,18 +2847,9 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
   }
 }
 
-/// \returns the AA location that is being access by the instruction.
-static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
-  if (StoreInst *SI = dyn_cast<StoreInst>(I))
-    return AA->getLocation(SI);
-  if (LoadInst *LI = dyn_cast<LoadInst>(I))
-    return AA->getLocation(LI);
-  return AliasAnalysis::Location();
-}
-
 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                      bool InsertInReadyList,
-                                                     AliasAnalysis *AA) {
+                                                     BoUpSLP *SLP) {
   assert(SD->isSchedulingEntity());
 
   SmallVector<ScheduleData *, 10> WorkList;
@@ -2704,26 +2894,60 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
         // Handle the memory dependencies.
         ScheduleData *DepDest = BundleMember->NextLoadStore;
         if (DepDest) {
-          AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA);
+          Instruction *SrcInst = BundleMember->Inst;
+          AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA);
           bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+          unsigned numAliased = 0;
+          unsigned DistToSrc = 1;
 
           while (DepDest) {
             assert(isInSchedulingRegion(DepDest));
-            if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) {
-              AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA);
-              if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) {
-                DepDest->MemoryDependencies.push_back(BundleMember);
-                BundleMember->Dependencies++;
-                ScheduleData *DestBundle = DepDest->FirstInBundle;
-                if (!DestBundle->IsScheduled) {
-                  BundleMember->incrementUnscheduledDeps(1);
-                }
-                if (!DestBundle->hasValidDependencies()) {
-                  WorkList.push_back(DestBundle);
-                }
+
+            // We have two limits to reduce the complexity:
+            // 1) AliasedCheckLimit: It's a small limit to reduce calls to
+            //    SLP->isAliased (which is the expensive part in this loop).
+            // 2) MaxMemDepDistance: It's for very large blocks and it aborts
+            //    the whole loop (even if the loop is fast, it's quadratic).
+            //    It's important for the loop break condition (see below) to
+            //    check this limit even between two read-only instructions.
+            if (DistToSrc >= MaxMemDepDistance ||
+                    ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+                     (numAliased >= AliasedCheckLimit ||
+                      SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+
+              // We increment the counter only if the locations are aliased
+              // (instead of counting all alias checks). This gives a better
+              // balance between reduced runtime and accurate dependencies.
+              numAliased++;
+
+              DepDest->MemoryDependencies.push_back(BundleMember);
+              BundleMember->Dependencies++;
+              ScheduleData *DestBundle = DepDest->FirstInBundle;
+              if (!DestBundle->IsScheduled) {
+                BundleMember->incrementUnscheduledDeps(1);
+              }
+              if (!DestBundle->hasValidDependencies()) {
+                WorkList.push_back(DestBundle);
               }
             }
             DepDest = DepDest->NextLoadStore;
+
+            // Example, explaining the loop break condition: Let's assume our
+            // starting instruction is i0 and MaxMemDepDistance = 3.
+            //
+            //                      +--------v--v--v
+            //             i0,i1,i2,i3,i4,i5,i6,i7,i8
+            //             +--------^--^--^
+            //
+            // MaxMemDepDistance let us stop alias-checking at i3 and we add
+            // dependencies from i0 to i3,i4,.. (even if they are not aliased).
+            // Previously we already added dependencies from i3 to i6,i7,i8
+            // (because of MaxMemDepDistance). As we added a dependency from
+            // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
+            // and we can abort this loop at i6.
+            if (DistToSrc >= 2 * MaxMemDepDistance)
+                break;
+            DistToSrc++;
           }
         }
       }
@@ -2779,7 +3003,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
         "scheduler and vectorizer have different opinion on what is a bundle");
     SD->FirstInBundle->SchedulingPriority = Idx++;
     if (SD->isSchedulingEntity()) {
-      BS->calculateDependencies(SD, false, AA);
+      BS->calculateDependencies(SD, false, this);
       NumToSchedule++;
     }
   }
@@ -2833,7 +3057,7 @@ struct SLPVectorizer : public FunctionPass {
   AliasAnalysis *AA;
   LoopInfo *LI;
   DominatorTree *DT;
-  AssumptionTracker *AT;
+  AssumptionCache *AC;
 
   bool runOnFunction(Function &F) override {
     if (skipOptnoneFunction(F))
@@ -2842,12 +3066,13 @@ struct SLPVectorizer : public FunctionPass {
     SE = &getAnalysis<ScalarEvolution>();
     DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
     DL = DLP ? &DLP->getDataLayout() : nullptr;
-    TTI = &getAnalysis<TargetTransformInfo>();
-    TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+    TLI = TLIP ? &TLIP->getTLI() : nullptr;
     AA = &getAnalysis<AliasAnalysis>();
-    LI = &getAnalysis<LoopInfo>();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    AT = &getAnalysis<AssumptionTracker>();
+    AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
 
     StoreRefs.clear();
     bool Changed = false;
@@ -2870,7 +3095,10 @@ struct SLPVectorizer : public FunctionPass {
 
     // Use the bottom up slp vectorizer to construct chains that start with
     // store instructions.
-    BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AT);
+    BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC);
+
+    // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
+    // delete instructions.
 
     // Scan the blocks in the function in post order.
     for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
@@ -2897,13 +3125,13 @@ struct SLPVectorizer : public FunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     FunctionPass::getAnalysisUsage(AU);
-    AU.addRequired<AssumptionTracker>();
+    AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<ScalarEvolution>();
     AU.addRequired<AliasAnalysis>();
-    AU.addRequired<TargetTransformInfo>();
-    AU.addRequired<LoopInfo>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addPreserved<LoopInfo>();
+    AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.setPreservesCFG();
   }
@@ -3078,7 +3306,7 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
 
     // Check that the pointer points to scalars.
     Type *Ty = SI->getValueOperand()->getType();
-    if (Ty->isAggregateType() || Ty->isVectorTy())
+    if (!isValidElementType(Ty))
       continue;
 
     // Find the base pointer.
@@ -3119,7 +3347,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
 
   for (int i = 0, e = VL.size(); i < e; ++i) {
     Type *Ty = VL[i]->getType();
-    if (Ty->isAggregateType() || Ty->isVectorTy())
+    if (!isValidElementType(Ty))
       return false;
     Instruction *Inst = dyn_cast<Instruction>(VL[i]);
     if (!Inst || Inst->getOpcode() != Opcode0)
@@ -3339,7 +3567,7 @@ public:
       return false;
 
     Type *Ty = B->getType();
-    if (Ty->isVectorTy())
+    if (!isValidElementType(Ty))
       return false;
 
     ReductionOpcode = B->getOpcode();
@@ -3502,11 +3730,10 @@ private:
   /// \brief Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
-    Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
     assert(isPowerOf2_32(ReduxWidth) &&
            "We only handle power-of-two reductions for now");
 
-    Value *TmpVec = ValToReduce;
+    Value *TmpVec = VectorizedValue;
     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
       if (IsPairwiseReduction) {
         Value *LeftMask =
@@ -3730,6 +3957,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
             // and the iterator may become invalid value.
             it = BB->begin();
             e = BB->end();
+            break;
           }
         }
       }
@@ -3786,8 +4014,8 @@ char SLPVectorizer::ID = 0;
 static const char lv_name[] = "SLP Vectorizer";
 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index d459bcf..6e002fd 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -19,7 +19,7 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 
 using namespace llvm;
 
diff --git a/shared_llvm.mk b/shared_llvm.mk
index e9bedb5..fff83dc 100644
--- a/shared_llvm.mk
+++ b/shared_llvm.mk
@@ -3,7 +3,8 @@ LOCAL_PATH:= $(call my-dir)
 llvm_pre_static_libraries := \
   libLLVMLinker \
   libLLVMipo \
-  libLLVMDebugInfo \
+  libLLVMDebugInfoDWARF \
+  libLLVMDebugInfoPDB \
   libLLVMIRReader \
   libLLVMBitWriter \
   libLLVMBitReader
@@ -68,7 +69,8 @@ llvm_post_static_libraries := \
 llvm_host_static_libraries := \
   libLLVMExecutionEngine \
   libLLVMRuntimeDyld \
-  libLLVMMCJIT
+  libLLVMMCJIT \
+  libLLVMOrcJIT
 
 ifeq (true,$(FORCE_BUILD_LLVM_COMPONENTS))
 # HOST LLVM shared library build
diff --git a/test/Analysis/AssumptionCache/basic.ll b/test/Analysis/AssumptionCache/basic.ll
new file mode 100644
index 0000000..bd4e7b6
--- /dev/null
+++ b/test/Analysis/AssumptionCache/basic.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -disable-output -passes='print<assumptions>' 2>&1 | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @llvm.assume(i1)
+
+define void @test1(i32 %a) {
+; CHECK-LABEL: Cached assumptions for function: test1
+; CHECK-NEXT: icmp ne i32 %{{.*}}, 0
+; CHECK-NEXT: icmp slt i32 %{{.*}}, 0
+; CHECK-NEXT: icmp sgt i32 %{{.*}}, 0
+
+entry:
+  %cond1 = icmp ne i32 %a, 0
+  call void @llvm.assume(i1 %cond1)
+  %cond2 = icmp slt i32 %a, 0
+  call void @llvm.assume(i1 %cond2)
+  %cond3 = icmp sgt i32 %a, 0
+  call void @llvm.assume(i1 %cond3)
+
+  ret void
+}
diff --git a/test/Analysis/BasicAA/2003-11-04-SimpleCases.ll b/test/Analysis/BasicAA/2003-11-04-SimpleCases.ll
index 768411e..f2b06cb 100644
--- a/test/Analysis/BasicAA/2003-11-04-SimpleCases.ll
+++ b/test/Analysis/BasicAA/2003-11-04-SimpleCases.ll
@@ -3,10 +3,12 @@
 
 ; RUN: opt < %s -basicaa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
 
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
 %T = type { i32, [10 x i8] }
 
 ; CHECK:     Function: test
-; CHECK-NOT:   May:
+; CHECK-NOT:   MayAlias:
 
 define void @test(%T* %P) {
   %A = getelementptr %T* %P, i64 0
diff --git a/test/Analysis/BasicAA/2003-12-11-ConstExprGEP.ll b/test/Analysis/BasicAA/2003-12-11-ConstExprGEP.ll
index b7bbf77..42512b8 100644
--- a/test/Analysis/BasicAA/2003-12-11-ConstExprGEP.ll
+++ b/test/Analysis/BasicAA/2003-12-11-ConstExprGEP.ll
@@ -3,12 +3,14 @@
 
 ; RUN: opt < %s -basicaa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
 
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
 %T = type { i32, [10 x i8] }
 
 @G = external global %T
 
 ; CHECK:     Function: test
-; CHECK-NOT:   May:
+; CHECK-NOT:   MayAlias:
 
 define void @test() {
   %D = getelementptr %T* @G, i64 0, i32 0
diff --git a/test/Analysis/BasicAA/2007-08-01-NoAliasAndGEP.ll b/test/Analysis/BasicAA/2007-08-01-NoAliasAndGEP.ll
index 4be793e..d11e75d 100644
--- a/test/Analysis/BasicAA/2007-08-01-NoAliasAndGEP.ll
+++ b/test/Analysis/BasicAA/2007-08-01-NoAliasAndGEP.ll
@@ -1,9 +1,11 @@
 ; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
 
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
 ; CHECK: Function: foo
-; CHECK:   MayAlias: i32* %Ipointer, i32* %Jpointer
+; CHECK:   PartialAlias: i32* %Ipointer, i32* %Jpointer
 ; CHECK: 9 no alias responses
-; CHECK: 6 may alias responses
+; CHECK: 6 partial alias responses
 
 define void @foo(i32* noalias %p, i32* noalias %q, i32 %i, i32 %j) {
   %Ipointer = getelementptr i32* %p, i32 %i
diff --git a/test/Analysis/BasicAA/constant-over-index.ll b/test/Analysis/BasicAA/constant-over-index.ll
index 232533c..aeb068b 100644
--- a/test/Analysis/BasicAA/constant-over-index.ll
+++ b/test/Analysis/BasicAA/constant-over-index.ll
@@ -1,10 +1,13 @@
 ; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info 2>&1 | FileCheck %s
 ; PR4267
 
-; CHECK: MayAlias: double* %p.0.i.0, double* %p3
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: PartialAlias: double* %p.0.i.0, double* %p3
 
 ; %p3 is equal to %p.0.i.0 on the second iteration of the loop,
-; so MayAlias is needed.
+; so MayAlias is needed.  In practice, basicaa returns PartialAlias
+; for GEPs to ignore TBAA.
 
 define void @foo([3 x [3 x double]]* noalias %p) {
 entry:
diff --git a/test/Analysis/BasicAA/full-store-partial-alias.ll b/test/Analysis/BasicAA/full-store-partial-alias.ll
index 4de2daf..9699d92 100644
--- a/test/Analysis/BasicAA/full-store-partial-alias.ll
+++ b/test/Analysis/BasicAA/full-store-partial-alias.ll
@@ -29,9 +29,9 @@ entry:
   ret i32 %tmp5.lobit
 }
 
-!0 = metadata !{metadata !4, metadata !4, i64 0}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !5, metadata !5, i64 0}
-!4 = metadata !{metadata !"double", metadata !1}
-!5 = metadata !{metadata !"int", metadata !1}
+!0 = !{!4, !4, i64 0}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA", null}
+!3 = !{!5, !5, i64 0}
+!4 = !{!"double", !1}
+!5 = !{!"int", !1}
diff --git a/test/Analysis/BasicAA/invariant_load.ll b/test/Analysis/BasicAA/invariant_load.ll
index 09b5401..bc629cd 100644
--- a/test/Analysis/BasicAA/invariant_load.ll
+++ b/test/Analysis/BasicAA/invariant_load.ll
@@ -23,4 +23,4 @@ entry:
 ; CHECK: %add = add nsw i32 %0, 1
 }
 
-!3 = metadata !{}
+!3 = !{}
diff --git a/test/Analysis/BasicAA/struct-geps.ll b/test/Analysis/BasicAA/struct-geps.ll
new file mode 100644
index 0000000..3764d48
--- /dev/null
+++ b/test/Analysis/BasicAA/struct-geps.ll
@@ -0,0 +1,164 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct = type { i32, i32, i32 }
+
+; CHECK-LABEL: test_simple
+
+; CHECK-DAG: PartialAlias: %struct* %st, i32* %x
+; CHECK-DAG: PartialAlias: %struct* %st, i32* %y
+; CHECK-DAG: PartialAlias: %struct* %st, i32* %z
+
+; CHECK-DAG: NoAlias: i32* %x, i32* %y
+; CHECK-DAG: NoAlias: i32* %x, i32* %z
+; CHECK-DAG: NoAlias: i32* %y, i32* %z
+
+; CHECK-DAG: PartialAlias: %struct* %st, %struct* %y_12
+; CHECK-DAG: PartialAlias: %struct* %y_12, i32* %x
+; CHECK-DAG: PartialAlias: i32* %x, i80* %y_10
+
+; CHECK-DAG: PartialAlias: %struct* %st, i64* %y_8
+; CHECK-DAG: PartialAlias: i32* %z, i64* %y_8
+; CHECK-DAG: NoAlias: i32* %x, i64* %y_8
+
+; CHECK-DAG: MustAlias: %struct* %y_12, i32* %y
+; CHECK-DAG: MustAlias: i32* %y, i64* %y_8
+; CHECK-DAG: MustAlias: i32* %y, i80* %y_10
+
+define void @test_simple(%struct* %st, i64 %i, i64 %j, i64 %k) {
+  %x = getelementptr %struct* %st, i64 %i, i32 0
+  %y = getelementptr %struct* %st, i64 %j, i32 1
+  %z = getelementptr %struct* %st, i64 %k, i32 2
+  %y_12 = bitcast i32* %y to %struct*
+  %y_10 = bitcast i32* %y to i80*
+  %y_8 = bitcast i32* %y to i64*
+  ret void
+}
+
+; CHECK-LABEL: test_in_array
+
+; CHECK-DAG: PartialAlias: [1 x %struct]* %st, i32* %x
+; CHECK-DAG: PartialAlias: [1 x %struct]* %st, i32* %y
+; CHECK-DAG: PartialAlias: [1 x %struct]* %st, i32* %z
+
+; CHECK-DAG: NoAlias: i32* %x, i32* %y
+; CHECK-DAG: NoAlias: i32* %x, i32* %z
+; CHECK-DAG: NoAlias: i32* %y, i32* %z
+
+; CHECK-DAG: PartialAlias: %struct* %y_12, [1 x %struct]* %st
+; CHECK-DAG: PartialAlias: %struct* %y_12, i32* %x
+; CHECK-DAG: PartialAlias: i32* %x, i80* %y_10
+
+; CHECK-DAG: PartialAlias: [1 x %struct]* %st, i64* %y_8
+; CHECK-DAG: PartialAlias: i32* %z, i64* %y_8
+; CHECK-DAG: NoAlias: i32* %x, i64* %y_8
+
+; CHECK-DAG: MustAlias: %struct* %y_12, i32* %y
+; CHECK-DAG: MustAlias: i32* %y, i64* %y_8
+; CHECK-DAG: MustAlias: i32* %y, i80* %y_10
+
+define void @test_in_array([1 x %struct]* %st, i64 %i, i64 %j, i64 %k, i64 %i1, i64 %j1, i64 %k1) {
+  %x = getelementptr [1 x %struct]* %st, i64 %i, i64 %i1, i32 0
+  %y = getelementptr [1 x %struct]* %st, i64 %j, i64 %j1, i32 1
+  %z = getelementptr [1 x %struct]* %st, i64 %k, i64 %k1, i32 2
+  %y_12 = bitcast i32* %y to %struct*
+  %y_10 = bitcast i32* %y to i80*
+  %y_8 = bitcast i32* %y to i64*
+  ret void
+}
+
+; CHECK-LABEL: test_in_3d_array
+
+; CHECK-DAG: PartialAlias: [1 x [1 x [1 x %struct]]]* %st, i32* %x
+; CHECK-DAG: PartialAlias: [1 x [1 x [1 x %struct]]]* %st, i32* %y
+; CHECK-DAG: PartialAlias: [1 x [1 x [1 x %struct]]]* %st, i32* %z
+
+; CHECK-DAG: NoAlias: i32* %x, i32* %y
+; CHECK-DAG: NoAlias: i32* %x, i32* %z
+; CHECK-DAG: NoAlias: i32* %y, i32* %z
+
+; CHECK-DAG: PartialAlias: %struct* %y_12, [1 x [1 x [1 x %struct]]]* %st
+; CHECK-DAG: PartialAlias: %struct* %y_12, i32* %x
+; CHECK-DAG: PartialAlias: i32* %x, i80* %y_10
+
+; CHECK-DAG: PartialAlias: [1 x [1 x [1 x %struct]]]* %st, i64* %y_8
+; CHECK-DAG: PartialAlias: i32* %z, i64* %y_8
+; CHECK-DAG: NoAlias: i32* %x, i64* %y_8
+
+; CHECK-DAG: MustAlias: %struct* %y_12, i32* %y
+; CHECK-DAG: MustAlias: i32* %y, i64* %y_8
+; CHECK-DAG: MustAlias: i32* %y, i80* %y_10
+
+define void @test_in_3d_array([1 x [1 x [1 x %struct]]]* %st, i64 %i, i64 %j, i64 %k, i64 %i1, i64 %j1, i64 %k1, i64 %i2, i64 %j2, i64 %k2, i64 %i3, i64 %j3, i64 %k3) {
+  %x = getelementptr [1 x [1 x [1 x %struct]]]* %st, i64 %i, i64 %i1, i64 %i2, i64 %i3, i32 0
+  %y = getelementptr [1 x [1 x [1 x %struct]]]* %st, i64 %j, i64 %j1, i64 %j2, i64 %j3, i32 1
+  %z = getelementptr [1 x [1 x [1 x %struct]]]* %st, i64 %k, i64 %k1, i64 %k2, i64 %k3, i32 2
+  %y_12 = bitcast i32* %y to %struct*
+  %y_10 = bitcast i32* %y to i80*
+  %y_8 = bitcast i32* %y to i64*
+  ret void
+}
+
+; CHECK-LABEL: test_same_underlying_object_same_indices
+
+; CHECK-DAG: NoAlias: i32* %x, i32* %x2
+; CHECK-DAG: NoAlias: i32* %y, i32* %y2
+; CHECK-DAG: NoAlias: i32* %z, i32* %z2
+
+; CHECK-DAG: PartialAlias: i32* %x, i32* %y2
+; CHECK-DAG: PartialAlias: i32* %x, i32* %z2
+
+; CHECK-DAG: PartialAlias: i32* %x2, i32* %y
+; CHECK-DAG: PartialAlias: i32* %y, i32* %z2
+
+; CHECK-DAG: PartialAlias: i32* %x2, i32* %z
+; CHECK-DAG: PartialAlias: i32* %y2, i32* %z
+
+define void @test_same_underlying_object_same_indices(%struct* %st, i64 %i, i64 %j, i64 %k) {
+  %st2 = getelementptr %struct* %st, i32 10
+  %x2 = getelementptr %struct* %st2, i64 %i, i32 0
+  %y2 = getelementptr %struct* %st2, i64 %j, i32 1
+  %z2 = getelementptr %struct* %st2, i64 %k, i32 2
+  %x = getelementptr %struct* %st, i64 %i, i32 0
+  %y = getelementptr %struct* %st, i64 %j, i32 1
+  %z = getelementptr %struct* %st, i64 %k, i32 2
+  ret void
+}
+
+; CHECK-LABEL: test_same_underlying_object_different_indices
+
+; CHECK-DAG: PartialAlias: i32* %x, i32* %x2
+; CHECK-DAG: PartialAlias: i32* %y, i32* %y2
+; CHECK-DAG: PartialAlias: i32* %z, i32* %z2
+
+; CHECK-DAG: PartialAlias: i32* %x, i32* %y2
+; CHECK-DAG: PartialAlias: i32* %x, i32* %z2
+
+; CHECK-DAG: PartialAlias: i32* %x2, i32* %y
+; CHECK-DAG: PartialAlias: i32* %y, i32* %z2
+
+; CHECK-DAG: PartialAlias: i32* %x2, i32* %z
+; CHECK-DAG: PartialAlias: i32* %y2, i32* %z
+
+define void @test_same_underlying_object_different_indices(%struct* %st, i64 %i1, i64 %j1, i64 %k1, i64 %i2, i64 %k2, i64 %j2) {
+  %st2 = getelementptr %struct* %st, i32 10
+  %x2 = getelementptr %struct* %st2, i64 %i2, i32 0
+  %y2 = getelementptr %struct* %st2, i64 %j2, i32 1
+  %z2 = getelementptr %struct* %st2, i64 %k2, i32 2
+  %x = getelementptr %struct* %st, i64 %i1, i32 0
+  %y = getelementptr %struct* %st, i64 %j1, i32 1
+  %z = getelementptr %struct* %st, i64 %k1, i32 2
+  ret void
+}
+
+
+%struct2 = type { [1 x { i32, i32 }], [2 x { i32 }] }
+
+; CHECK-LABEL: test_struct_in_array
+; CHECK-DAG: MustAlias: i32* %x, i32* %y
+define void @test_struct_in_array(%struct2* %st, i64 %i, i64 %j, i64 %k) {
+  %x = getelementptr %struct2* %st, i32 0, i32 1, i32 1, i32 0
+  %y = getelementptr %struct2* %st, i32 0, i32 0, i32 1, i32 1
+  ret void
+}
diff --git a/test/Analysis/BlockFrequencyInfo/bad_input.ll b/test/Analysis/BlockFrequencyInfo/bad_input.ll
index bcdc1e6..da62dca 100644
--- a/test/Analysis/BlockFrequencyInfo/bad_input.ll
+++ b/test/Analysis/BlockFrequencyInfo/bad_input.ll
@@ -23,7 +23,7 @@ for.end:
   ret void
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 0, i32 3}
+!0 = !{!"branch_weights", i32 0, i32 3}
 
 ; CHECK-LABEL: Printing analysis {{.*}} for function 'infinite_loop'
 ; CHECK-NEXT: block-frequency-info: infinite_loop
@@ -47,4 +47,4 @@ for.end:
   ret void
 }
 
-!1 = metadata !{metadata !"branch_weights", i32 1, i32 1}
+!1 = !{!"branch_weights", i32 1, i32 1}
diff --git a/test/Analysis/BlockFrequencyInfo/basic.ll b/test/Analysis/BlockFrequencyInfo/basic.ll
index 006e6ab..1c24176 100644
--- a/test/Analysis/BlockFrequencyInfo/basic.ll
+++ b/test/Analysis/BlockFrequencyInfo/basic.ll
@@ -47,7 +47,7 @@ exit:
   ret i32 %result
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!0 = !{!"branch_weights", i32 64, i32 4}
 
 define i32 @test3(i32 %i, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 ; CHECK-LABEL: Printing analysis {{.*}} for function 'test3':
@@ -90,7 +90,7 @@ exit:
   ret i32 %result
 }
 
-!1 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4}
+!1 = !{!"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4}
 
 define void @nested_loops(i32 %a) {
 ; CHECK-LABEL: Printing analysis {{.*}} for function 'nested_loops':
@@ -138,4 +138,4 @@ for.end13:
 
 declare void @g(i32)
 
-!2 = metadata !{metadata !"branch_weights", i32 1, i32 4000}
+!2 = !{!"branch_weights", i32 1, i32 4000}
diff --git a/test/Analysis/BlockFrequencyInfo/double_backedge.ll b/test/Analysis/BlockFrequencyInfo/double_backedge.ll
index df8217c..597bf83 100644
--- a/test/Analysis/BlockFrequencyInfo/double_backedge.ll
+++ b/test/Analysis/BlockFrequencyInfo/double_backedge.ll
@@ -23,5 +23,5 @@ exit:
 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
   ret void
 }
-!0 = metadata !{metadata !"branch_weights", i32 1, i32 9}
-!1 = metadata !{metadata !"branch_weights", i32 4, i32 5}
+!0 = !{!"branch_weights", i32 1, i32 9}
+!1 = !{!"branch_weights", i32 4, i32 5}
diff --git a/test/Analysis/BlockFrequencyInfo/double_exit.ll b/test/Analysis/BlockFrequencyInfo/double_exit.ll
index 75f664d..3063ba7 100644
--- a/test/Analysis/BlockFrequencyInfo/double_exit.ll
+++ b/test/Analysis/BlockFrequencyInfo/double_exit.ll
@@ -66,9 +66,9 @@ exit:
   ret i32 %Return.2
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 1, i32 3}
-!1 = metadata !{metadata !"branch_weights", i32 4, i32 1}
-!2 = metadata !{metadata !"branch_weights", i32 2, i32 1}
+!0 = !{!"branch_weights", i32 1, i32 3}
+!1 = !{!"branch_weights", i32 4, i32 1}
+!2 = !{!"branch_weights", i32 2, i32 1}
 
 declare i32 @c2(i32, i32)
 declare i32 @logic2(i32, i32, i32)
@@ -159,7 +159,7 @@ exit:
   ret i32 %Return.0
 }
 
-!3 = metadata !{metadata !"branch_weights", i32 1, i32 1}
+!3 = !{!"branch_weights", i32 1, i32 1}
 
 declare i32 @c3(i32, i32, i32)
 declare i32 @logic3(i32, i32, i32, i32)
diff --git a/test/Analysis/BlockFrequencyInfo/extremely-likely-loop-successor.ll b/test/Analysis/BlockFrequencyInfo/extremely-likely-loop-successor.ll
new file mode 100644
index 0000000..e55deaf
--- /dev/null
+++ b/test/Analysis/BlockFrequencyInfo/extremely-likely-loop-successor.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -analyze -block-freq | FileCheck %s
+
+; PR21622: Check for a crasher when the sum of exits to the same successor of a
+; loop overflows.
+
+; CHECK-LABEL: Printing analysis {{.*}} for function 'extremely_likely_loop_successor':
+; CHECK-NEXT: block-frequency-info: extremely_likely_loop_successor
+define void @extremely_likely_loop_successor() {
+; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
+entry:
+  br label %loop
+
+; CHECK-NEXT: loop: float = 1.0,
+loop:
+  %exit.1.cond = call i1 @foo()
+  br i1 %exit.1.cond, label %exit, label %loop.2, !prof !0
+
+; CHECK-NEXT: loop.2: float = 0.0000000
+loop.2:
+  %exit.2.cond = call i1 @foo()
+  br i1 %exit.2.cond, label %exit, label %loop.3, !prof !0
+
+; CHECK-NEXT: loop.3: float = 0.0000000
+loop.3:
+  %exit.3.cond = call i1 @foo()
+  br i1 %exit.3.cond, label %exit, label %loop.4, !prof !0
+
+; CHECK-NEXT: loop.4: float = 0.0,
+loop.4:
+  %exit.4.cond = call i1 @foo()
+  br i1 %exit.4.cond, label %exit, label %loop, !prof !0
+
+; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
+exit:
+  ret void
+}
+
+declare i1 @foo()
+
+!0 = !{!"branch_weights", i32 4294967295, i32 1}
diff --git a/test/Analysis/BlockFrequencyInfo/irreducible.ll b/test/Analysis/BlockFrequencyInfo/irreducible.ll
index af4ad15..b275aae 100644
--- a/test/Analysis/BlockFrequencyInfo/irreducible.ll
+++ b/test/Analysis/BlockFrequencyInfo/irreducible.ll
@@ -31,8 +31,8 @@ return:
   ret void
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 1, i32 7}
-!1 = metadata !{metadata !"branch_weights", i32 3, i32 4}
+!0 = !{!"branch_weights", i32 1, i32 7}
+!1 = !{!"branch_weights", i32 3, i32 4}
 
 ; Irreducible control flow
 ; ========================
@@ -112,7 +112,7 @@ exit:
   ret void
 }
 
-!2 = metadata !{metadata !"branch_weights", i32 3, i32 1}
+!2 = !{!"branch_weights", i32 3, i32 1}
 
 ; Testcase #2
 ; ===========
@@ -156,7 +156,7 @@ exit:
   ret void
 }
 
-!3 = metadata !{metadata !"branch_weights", i32 2, i32 2, i32 2}
+!3 = !{!"branch_weights", i32 2, i32 2, i32 2}
 
 ; A true loop with irreducible control flow inside.
 define void @loop_around_irreducible(i1 %x) {
@@ -186,8 +186,8 @@ exit:
 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
   ret void
 }
-!4 = metadata !{metadata !"branch_weights", i32 1, i32 1}
-!5 = metadata !{metadata !"branch_weights", i32 3, i32 1}
+!4 = !{!"branch_weights", i32 1, i32 1}
+!5 = !{!"branch_weights", i32 3, i32 1}
 
 ; Two unrelated irreducible SCCs.
 define void @two_sccs(i1 %x) {
@@ -225,9 +225,9 @@ exit:
 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
   ret void
 }
-!6 = metadata !{metadata !"branch_weights", i32 3, i32 1}
-!7 = metadata !{metadata !"branch_weights", i32 1, i32 1}
-!8 = metadata !{metadata !"branch_weights", i32 4, i32 1}
+!6 = !{!"branch_weights", i32 3, i32 1}
+!7 = !{!"branch_weights", i32 1, i32 1}
+!8 = !{!"branch_weights", i32 4, i32 1}
 
 ; A true loop inside irreducible control flow.
 define void @loop_inside_irreducible(i1 %x) {
@@ -257,9 +257,9 @@ exit:
 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
   ret void
 }
-!9 = metadata !{metadata !"branch_weights", i32 1, i32 1}
-!10 = metadata !{metadata !"branch_weights", i32 3, i32 1}
-!11 = metadata !{metadata !"branch_weights", i32 2, i32 1}
+!9 = !{!"branch_weights", i32 1, i32 1}
+!10 = !{!"branch_weights", i32 3, i32 1}
+!11 = !{!"branch_weights", i32 2, i32 1}
 
 ; Irreducible control flow in a branch that's in a true loop.
 define void @loop_around_branch_with_irreducible(i1 %x) {
@@ -301,8 +301,8 @@ exit:
 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
   ret void
 }
-!12 = metadata !{metadata !"branch_weights", i32 3, i32 1}
-!13 = metadata !{metadata !"branch_weights", i32 1, i32 1}
+!12 = !{!"branch_weights", i32 3, i32 1}
+!13 = !{!"branch_weights", i32 1, i32 1}
 
 ; Irreducible control flow between two true loops.
 define void @loop_around_branch_with_irreducible_around_loop(i1 %x) {
@@ -348,10 +348,10 @@ exit:
 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
   ret void
 }
-!14 = metadata !{metadata !"branch_weights", i32 2, i32 1}
-!15 = metadata !{metadata !"branch_weights", i32 1, i32 1}
-!16 = metadata !{metadata !"branch_weights", i32 3, i32 1}
-!17 = metadata !{metadata !"branch_weights", i32 4, i32 1}
+!14 = !{!"branch_weights", i32 2, i32 1}
+!15 = !{!"branch_weights", i32 1, i32 1}
+!16 = !{!"branch_weights", i32 3, i32 1}
+!17 = !{!"branch_weights", i32 4, i32 1}
 
 ; An irreducible SCC with a non-header.
 define void @nonheader(i1 %x) {
@@ -377,9 +377,9 @@ exit:
 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
   ret void
 }
-!18 = metadata !{metadata !"branch_weights", i32 1, i32 1}
-!19 = metadata !{metadata !"branch_weights", i32 1, i32 3}
-!20 = metadata !{metadata !"branch_weights", i32 3, i32 1}
+!18 = !{!"branch_weights", i32 1, i32 1}
+!19 = !{!"branch_weights", i32 1, i32 3}
+!20 = !{!"branch_weights", i32 3, i32 1}
 
 ; An irreducible SCC with an irreducible sub-SCC.  In the current version of
 ; -block-freq, this means an extra header.
@@ -416,6 +416,6 @@ exit:
 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
   ret void
 }
-!21 = metadata !{metadata !"branch_weights", i32 2, i32 1}
-!22 = metadata !{metadata !"branch_weights", i32 1, i32 1}
-!23 = metadata !{metadata !"branch_weights", i32 8, i32 1, i32 3, i32 12}
+!21 = !{!"branch_weights", i32 2, i32 1}
+!22 = !{!"branch_weights", i32 1, i32 1}
+!23 = !{!"branch_weights", i32 8, i32 1, i32 3, i32 12}
diff --git a/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll b/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll
index 9d27b6b..9a86564 100644
--- a/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll
+++ b/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll
@@ -40,5 +40,5 @@ exit:
 declare i1 @foo0(i32)
 declare i2 @foo1(i32)
 
-!0 = metadata !{metadata !"branch_weights", i32 1, i32 3}
-!1 = metadata !{metadata !"branch_weights", i32 1, i32 2, i32 3}
+!0 = !{!"branch_weights", i32 1, i32 3}
+!1 = !{!"branch_weights", i32 1, i32 2, i32 3}
diff --git a/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll b/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll
index d93ffce..19d1658 100644
--- a/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll
+++ b/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll
@@ -55,5 +55,5 @@ declare i1 @foo4(i32)
 declare i1 @foo5(i32)
 declare i1 @foo6(i32)
 
-!0 = metadata !{metadata !"branch_weights", i32 1, i32 3}
-!1 = metadata !{metadata !"branch_weights", i32 3, i32 1}
+!0 = !{!"branch_weights", i32 1, i32 3}
+!1 = !{!"branch_weights", i32 3, i32 1}
diff --git a/test/Analysis/BranchProbabilityInfo/basic.ll b/test/Analysis/BranchProbabilityInfo/basic.ll
index 05cb31d..5915ed1 100644
--- a/test/Analysis/BranchProbabilityInfo/basic.ll
+++ b/test/Analysis/BranchProbabilityInfo/basic.ll
@@ -43,7 +43,7 @@ exit:
   ret i32 %result
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!0 = !{!"branch_weights", i32 64, i32 4}
 
 define i32 @test3(i32 %i, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 ; CHECK: Printing analysis {{.*}} for function 'test3'
@@ -87,7 +87,7 @@ exit:
   ret i32 %result
 }
 
-!1 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4}
+!1 = !{!"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4}
 
 define i32 @test4(i32 %x) nounwind uwtable readnone ssp {
 ; CHECK: Printing analysis {{.*}} for function 'test4'
@@ -114,7 +114,7 @@ return:
   ret i32 %retval.0
 }
 
-!2 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64}
+!2 = !{!"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64}
 
 declare void @coldfunc() cold
 
diff --git a/test/Analysis/CFLAliasAnalysis/asm-global-bugfix.ll b/test/Analysis/CFLAliasAnalysis/asm-global-bugfix.ll
new file mode 100644
index 0000000..d8ee94b
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/asm-global-bugfix.ll
@@ -0,0 +1,16 @@
+; Test case for a bug where we would crash when we were requested to report
+; whether two values that didn't belong to a function (i.e. two globals, etc)
+; aliased.
+
+; RUN: opt < %s -cfl-aa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
+
+@G = private unnamed_addr constant [1 x i8] c"\00", align 1
+
+; CHECK: Function: test_no_crash
+; CHECK: 1 no alias responses
+define void @test_no_crash() #0 {
+entry:
+  call i8* asm "nop", "=r,r"(
+       i8* getelementptr inbounds ([1 x i8]* @G, i64 0, i64 0))
+  ret void
+}
diff --git a/test/Analysis/CFLAliasAnalysis/full-store-partial-alias.ll b/test/Analysis/CFLAliasAnalysis/full-store-partial-alias.ll
index 155fe13..21edfc2 100644
--- a/test/Analysis/CFLAliasAnalysis/full-store-partial-alias.ll
+++ b/test/Analysis/CFLAliasAnalysis/full-store-partial-alias.ll
@@ -2,8 +2,9 @@
 ; RUN: opt -S -tbaa -gvn < %s | FileCheck %s
 ; Adapted from the BasicAA full-store-partial-alias.ll test.
 
-; CFL AA should notice that the store stores to the entire %u object,
+; CFL AA could notice that the store stores to the entire %u object,
 ; so the %tmp5 load is PartialAlias with the store and suppress TBAA.
+; FIXME: However, right now, CFLAA cannot prove PartialAlias here
 ; Without CFL AA, TBAA should say that %tmp5 is NoAlias with the store.
 
 target datalayout = "e-p:64:64:64"
@@ -14,8 +15,9 @@ target datalayout = "e-p:64:64:64"
 @endianness_test = global i64 1, align 8
 
 define i32 @signbit(double %x) nounwind {
-; CFLAA: ret i32 %tmp5.lobit
-; CHECK:   ret i32 0
+; FIXME: This would be ret i32 %tmp5.lobit if CFLAA could prove PartialAlias
+; CFLAA: ret i32 0
+; CHECK: ret i32 0
 entry:
   %u = alloca %union.anon, align 8
   %tmp9 = getelementptr inbounds %union.anon* %u, i64 0, i32 0
@@ -29,9 +31,9 @@ entry:
   ret i32 %tmp5.lobit
 }
 
-!0 = metadata !{metadata !4, metadata !4, i64 0}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !5, metadata !5, i64 0}
-!4 = metadata !{metadata !"double", metadata !1}
-!5 = metadata !{metadata !"int", metadata !1}
+!0 = !{!4, !4, i64 0}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA", null}
+!3 = !{!5, !5, i64 0}
+!4 = !{!"double", !1}
+!5 = !{!"int", !1}
diff --git a/test/Analysis/CFLAliasAnalysis/gep-signed-arithmetic.ll b/test/Analysis/CFLAliasAnalysis/gep-signed-arithmetic.ll
index a0195d7..19d251c 100644
--- a/test/Analysis/CFLAliasAnalysis/gep-signed-arithmetic.ll
+++ b/test/Analysis/CFLAliasAnalysis/gep-signed-arithmetic.ll
@@ -3,9 +3,11 @@
 
 target datalayout = "e-p:32:32:32"
 
-; CHECK: 1 partial alias response
+; FIXME: This could be PartialAlias but CFLAA can't currently prove it
+; CHECK: 1 may alias response
 
-define i32 @test(i32* %tab, i32 %indvar) nounwind {
+define i32 @test(i32 %indvar) nounwind {
+  %tab = alloca i32, align 4
   %tmp31 = mul i32 %indvar, -2
   %tmp32 = add i32 %tmp31, 30
   %t.5 = getelementptr i32* %tab, i32 %tmp32
diff --git a/test/Analysis/CFLAliasAnalysis/must-and-partial.ll b/test/Analysis/CFLAliasAnalysis/must-and-partial.ll
index df7de38..163a6c3 100644
--- a/test/Analysis/CFLAliasAnalysis/must-and-partial.ll
+++ b/test/Analysis/CFLAliasAnalysis/must-and-partial.ll
@@ -1,14 +1,15 @@
 ; RUN: opt < %s -cfl-aa -aa-eval -print-all-alias-modref-info 2>&1 | FileCheck %s
-
 ; When merging MustAlias and PartialAlias, merge to PartialAlias
 ; instead of MayAlias.
 
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
-; CHECK: PartialAlias:  i16* %bigbase0, i8* %phi
-define i8 @test0(i8* %base, i1 %x) {
+; FIXME: This could be PartialAlias but CFLAA can't currently prove it
+; CHECK: MayAlias:  i16* %bigbase0, i8* %phi
+define i8 @test0(i1 %x) {
 entry:
+  %base = alloca i8, align 4
   %baseplusone = getelementptr i8* %base, i64 1
   br i1 %x, label %red, label %green
 red:
@@ -24,9 +25,11 @@ green:
   ret i8 %loaded
 }
 
-; CHECK: PartialAlias:  i16* %bigbase1, i8* %sel
-define i8 @test1(i8* %base, i1 %x) {
+; FIXME: This could be PartialAlias but CFLAA can't currently prove it
+; CHECK: MayAlias:  i16* %bigbase1, i8* %sel
+define i8 @test1(i1 %x) {
 entry:
+  %base = alloca i8, align 4
   %baseplusone = getelementptr i8* %base, i64 1
   %sel = select i1 %x, i8* %baseplusone, i8* %base
   store i8 0, i8* %sel
@@ -37,3 +40,15 @@ entry:
   %loaded = load i8* %sel
   ret i8 %loaded
 }
+
+; Incoming pointer arguments should not be PartialAlias because we do not know their initial state
+; even if they are nocapture
+; CHECK: MayAlias:  double* %A, double* %Index
+define void @testr2(double* nocapture readonly %A, double* nocapture readonly %Index) {
+  %arrayidx22 = getelementptr inbounds double* %Index, i64 2
+  %1 = load double* %arrayidx22
+  %arrayidx25 = getelementptr inbounds double* %A, i64 2
+  %2 = load double* %arrayidx25
+  %mul26 = fmul double %1, %2
+  ret void
+}
diff --git a/test/Analysis/CFLAliasAnalysis/stratified-attrs-indexing.ll b/test/Analysis/CFLAliasAnalysis/stratified-attrs-indexing.ll
new file mode 100644
index 0000000..8afedf2
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/stratified-attrs-indexing.ll
@@ -0,0 +1,33 @@
+; This testcase ensures that CFLAA doesn't try to access out of bounds indices
+; when given functions with large amounts of arguments (specifically, more
+; arguments than the StratifiedAttrs bitset can handle)
+;
+; Because the result on failure is effectively crashing the compiler, output
+; checking is minimal.
+
+; RUN: opt < %s -cfl-aa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
+
+; CHECK: Function: test
+define void @test(i1 %cond,
+                  i32* %arg1, i32* %arg2, i32* %arg3, i32* %arg4, i32* %arg5,
+                  i32* %arg6, i32* %arg7, i32* %arg8, i32* %arg9, i32* %arg10,
+                  i32* %arg11, i32* %arg12, i32* %arg13, i32* %arg14, i32* %arg15,
+                  i32* %arg16, i32* %arg17, i32* %arg18, i32* %arg19, i32* %arg20,
+                  i32* %arg21, i32* %arg22, i32* %arg23, i32* %arg24, i32* %arg25,
+                  i32* %arg26, i32* %arg27, i32* %arg28, i32* %arg29, i32* %arg30,
+                  i32* %arg31, i32* %arg32, i32* %arg33, i32* %arg34, i32* %arg35) {
+
+  ; CHECK: 946 Total Alias Queries Performed
+  ; CHECK: 810 no alias responses (85.6%)
+  %a = alloca i32, align 4
+  %b = select i1 %cond, i32* %arg35, i32* %arg34
+  %c = select i1 %cond, i32* %arg34, i32* %arg33
+  %d = select i1 %cond, i32* %arg33, i32* %arg32
+  %e = select i1 %cond, i32* %arg32, i32* %arg31
+  %f = select i1 %cond, i32* %arg31, i32* %arg30
+  %g = select i1 %cond, i32* %arg30, i32* %arg29
+  %h = select i1 %cond, i32* %arg29, i32* %arg28
+  %i = select i1 %cond, i32* %arg28, i32* %arg27
+
+  ret void
+}
diff --git a/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
new file mode 100644
index 0000000..4683c43
--- /dev/null
+++ b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -0,0 +1,89 @@
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s -check-prefix=AVX2
+
+
+; AVX2-LABEL: test1
+; AVX2: Found an estimated cost of 4 {{.*}}.masked
+define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
+  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+  %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
+  ret <2 x double> %res
+}
+
+; AVX2-LABEL: test2
+; AVX2: Found an estimated cost of 4 {{.*}}.masked
+define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
+  ret <4 x i32> %res
+}
+
+; AVX2-LABEL: test3
+; AVX2: Found an estimated cost of 4 {{.*}}.masked
+define void @test3(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test4
+; AVX2: Found an estimated cost of 4 {{.*}}.masked
+define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
+  ret <8 x float> %res
+}
+
+; AVX2-LABEL: test5
+; AVX2: Found an estimated cost of 5 {{.*}}.masked
+define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test6
+; AVX2: Found an estimated cost of 6 {{.*}}.masked
+define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test7
+; AVX2: Found an estimated cost of 5 {{.*}}.masked
+define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
+  ret <2 x float> %res
+}
+
+; AVX2-LABEL: test8
+; AVX2: Found an estimated cost of 6 {{.*}}.masked
+define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
+  ret <2 x i32> %res
+}
+
+
+declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
+declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
+declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
+
diff --git a/test/Analysis/CostModel/X86/vselect-cost.ll b/test/Analysis/CostModel/X86/vselect-cost.ll
index 2416777..b7e56ef 100644
--- a/test/Analysis/CostModel/X86/vselect-cost.ll
+++ b/test/Analysis/CostModel/X86/vselect-cost.ll
@@ -11,7 +11,7 @@
 
 define <2 x i64> @test_2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_2i64':
-; SSE2: Cost Model: {{.*}} 4 for instruction:   %sel = select <2 x i1>
+; SSE2: Cost Model: {{.*}} 1 for instruction:   %sel = select <2 x i1>
 ; SSE41: Cost Model: {{.*}} 1 for instruction:   %sel = select <2 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction:   %sel = select <2 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction:   %sel = select <2 x i1>
@@ -21,7 +21,7 @@ define <2 x i64> @test_2i64(<2 x i64> %a, <2 x i64> %b) {
 
 define <2 x double> @test_2double(<2 x double> %a, <2 x double> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_2double':
-; SSE2: Cost Model: {{.*}} 3 for instruction:   %sel = select <2 x i1>
+; SSE2: Cost Model: {{.*}} 1 for instruction:   %sel = select <2 x i1>
 ; SSE41: Cost Model: {{.*}} 1 for instruction:   %sel = select <2 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction:   %sel = select <2 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction:   %sel = select <2 x i1>
@@ -31,7 +31,7 @@ define <2 x double> @test_2double(<2 x double> %a, <2 x double> %b) {
 
 define <4 x i32> @test_4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4i32':
-; SSE2: Cost Model: {{.*}} 8 for instruction:   %sel = select <4 x i1>
+; SSE2: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
 ; SSE41: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
@@ -41,7 +41,7 @@ define <4 x i32> @test_4i32(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x float> @test_4float(<4 x float> %a, <4 x float> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4float':
-; SSE2: Cost Model: {{.*}} 7 for instruction:   %sel = select <4 x i1>
+; SSE2: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
 ; SSE41: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
@@ -51,7 +51,7 @@ define <4 x float> @test_4float(<4 x float> %a, <4 x float> %b) {
 
 define <16 x i8> @test_16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_16i8':
-; SSE2: Cost Model: {{.*}} 32 for instruction:   %sel = select <16 x i1>
+; SSE2: Cost Model: {{.*}} 1 for instruction:   %sel = select <16 x i1>
 ; SSE41: Cost Model: {{.*}} 1 for instruction:   %sel = select <16 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction:   %sel = select <16 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction:   %sel = select <16 x i1>
@@ -63,7 +63,7 @@ define <16 x i8> @test_16i8(<16 x i8> %a, <16 x i8> %b) {
 ; <8 x float>. Integers of the same size should also use those instructions.
 define <4 x i64> @test_4i64(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4i64':
-; SSE2: Cost Model: {{.*}} 8 for instruction:   %sel = select <4 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction:   %sel = select <4 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction:   %sel = select <4 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
@@ -73,7 +73,7 @@ define <4 x i64> @test_4i64(<4 x i64> %a, <4 x i64> %b) {
 
 define <4 x double> @test_4double(<4 x double> %a, <4 x double> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4double':
-; SSE2: Cost Model: {{.*}} 6 for instruction:   %sel = select <4 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction:   %sel = select <4 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction:   %sel = select <4 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction:   %sel = select <4 x i1>
@@ -83,7 +83,7 @@ define <4 x double> @test_4double(<4 x double> %a, <4 x double> %b) {
 
 define <8 x i32> @test_8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_8i32':
-; SSE2: Cost Model: {{.*}} 16 for instruction:   %sel = select <8 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction:   %sel = select <8 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction:   %sel = select <8 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction:   %sel = select <8 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction:   %sel = select <8 x i1>
@@ -93,7 +93,7 @@ define <8 x i32> @test_8i32(<8 x i32> %a, <8 x i32> %b) {
 
 define <8 x float> @test_8float(<8 x float> %a, <8 x float> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_8float':
-; SSE2: Cost Model: {{.*}} 14 for instruction:   %sel = select <8 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction:   %sel = select <8 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction:   %sel = select <8 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction:   %sel = select <8 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction:   %sel = select <8 x i1>
@@ -104,10 +104,9 @@ define <8 x float> @test_8float(<8 x float> %a, <8 x float> %b) {
 ; AVX2
 define <16 x i16> @test_16i16(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK:Printing analysis 'Cost Model Analysis' for function 'test_16i16':
-; SSE2: Cost Model: {{.*}} 32 for instruction:   %sel = select <16 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction:   %sel = select <16 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction:   %sel = select <16 x i1>
-;;; FIXME: This AVX cost is obviously wrong. We shouldn't be scalarizing.
-; AVX: Cost Model: {{.*}} 32 for instruction:   %sel = select <16 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction:   %sel = select <16 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction:   %sel = select <16 x i1>
   %sel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i16> %a, <16 x i16> %b
   ret <16 x i16> %sel
@@ -115,10 +114,9 @@ define <16 x i16> @test_16i16(<16 x i16> %a, <16 x i16> %b) {
 
 define <32 x i8> @test_32i8(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_32i8':
-; SSE2: Cost Model: {{.*}} 64 for instruction:   %sel = select <32 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction:   %sel = select <32 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction:   %sel = select <32 x i1>
-;;; FIXME: This AVX cost is obviously wrong. We shouldn't be scalarizing.
-; AVX: Cost Model: {{.*}} 64 for instruction:   %sel = select <32 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction:   %sel = select <32 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction:   %sel = select <32 x i1>
   %sel = select <32 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true>, <32 x i8> %a, <32 x i8> %b
   ret <32 x i8> %sel
diff --git a/test/Analysis/CostModel/no_info.ll b/test/Analysis/CostModel/no_info.ll
index f3f165b..5f3b56a 100644
--- a/test/Analysis/CostModel/no_info.ll
+++ b/test/Analysis/CostModel/no_info.ll
@@ -1,11 +1,12 @@
 ; RUN: opt < %s -cost-model -analyze | FileCheck %s
 
-; The cost model does not have any target information so it can't make a decision.
+; The cost model does not have any target information so it just makes boring
+; assumptions.
 
 ; -- No triple in this module --
 
-;CHECK: Unknown cost {{.*}} add
-;CHECK: Unknown cost {{.*}} ret
+;CHECK: cost of 1 {{.*}} add
+;CHECK: cost of 1 {{.*}} ret
 define i32 @no_info(i32 %arg) {
   %e = add i32 %arg, %arg
   ret i32 %e
diff --git a/test/Analysis/Dominators/basic.ll b/test/Analysis/Dominators/basic.ll
new file mode 100644
index 0000000..353c339
--- /dev/null
+++ b/test/Analysis/Dominators/basic.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -domtree -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OLDPM
+; RUN: opt < %s -disable-output -passes='print<domtree>' 2>&1 | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-NEWPM
+
+define void @test1() {
+; CHECK-OLDPM-LABEL: 'Dominator Tree Construction' for function 'test1':
+; CHECK-NEWPM-LABEL: DominatorTree for function: test1
+; CHECK:      [1] %entry
+; CHECK-NEXT:   [2] %a
+; CHECK-NEXT:   [2] %c
+; CHECK-NEXT:     [3] %d
+; CHECK-NEXT:     [3] %e
+; CHECK-NEXT:   [2] %b
+
+entry:
+  br i1 undef, label %a, label %b
+
+a:
+  br label %c
+
+b:
+  br label %c
+
+c:
+  br i1 undef, label %d, label %e
+
+d:
+  ret void
+
+e:
+  ret void
+}
+
+define void @test2() {
+; CHECK-OLDPM-LABEL: 'Dominator Tree Construction' for function 'test2':
+; CHECK-NEWPM-LABEL: DominatorTree for function: test2
+; CHECK:      [1] %entry
+; CHECK-NEXT:   [2] %a
+; CHECK-NEXT:     [3] %b
+; CHECK-NEXT:       [4] %c
+; CHECK-NEXT:         [5] %d
+; CHECK-NEXT:         [5] %ret
+
+entry:
+  br label %a
+
+a:
+  br label %b
+
+b:
+  br i1 undef, label %a, label %c
+
+c:
+  br i1 undef, label %d, label %ret
+
+d:
+  br i1 undef, label %a, label %ret
+
+ret:
+  ret void
+}
diff --git a/test/Analysis/Lint/cppeh-catch-intrinsics-clean.ll b/test/Analysis/Lint/cppeh-catch-intrinsics-clean.ll
new file mode 100644
index 0000000..e398d71
--- /dev/null
+++ b/test/Analysis/Lint/cppeh-catch-intrinsics-clean.ll
@@ -0,0 +1,109 @@
+; RUN: opt -lint -disable-output < %s
+
+; This test is meant to prove that the verifier does not report errors for correct
+; use of the llvm.eh.begincatch and llvm.eh.endcatch intrinsics.
+
+target triple = "x86_64-pc-windows-msvc"
+
+declare i8* @llvm.eh.begincatch(i8*)
+
+declare void @llvm.eh.endcatch()
+
+@_ZTIi = external constant i8*
+
+; Function Attrs: uwtable
+define void @test_ref_clean() {
+entry:
+  invoke void @_Z9may_throwv()
+          to label %try.cont unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %exn = extractvalue { i8*, i32 } %0, 0
+  %sel = extractvalue { i8*, i32 } %0, 1
+  %1 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %sel, %1
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  %2 = call i8* @llvm.eh.begincatch(i8* %exn)
+  call void @_Z10handle_intv()
+  br label %invoke.cont2
+
+invoke.cont2:                                     ; preds = %catch
+  call void @llvm.eh.endcatch()
+  br label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont2, %entry
+  ret void
+
+eh.resume:                                        ; preds = %catch.dispatch
+  resume { i8*, i32 } %0
+}
+
+; Function Attrs: uwtable
+define void @test_ref_clean_multibranch() {
+entry:
+  invoke void @_Z9may_throwv()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  invoke void @_Z9may_throwv()
+          to label %invoke.cont unwind label %lpad1
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %exn = extractvalue { i8*, i32 } %0, 0
+  %sel = extractvalue { i8*, i32 } %0, 1
+  %1 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %sel, %1
+  br i1 %matches, label %catch, label %eh.resume
+
+  invoke void @_Z9may_throwv()
+          to label %try.cont unwind label %lpad
+
+lpad1:                                            ; preds = %entry
+  %l1.0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+		  cleanup
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %exn1 = extractvalue { i8*, i32 } %l1.0, 0
+  %sel1 = extractvalue { i8*, i32 } %l1.0, 1
+  %l1.1 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matchesl1 = icmp eq i32 %sel1, %l1.1
+  br i1 %matchesl1, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad, %lpad1
+  %exn2 = phi i8* [%exn, %lpad], [%exn1, %lpad1]
+  %sel2 = phi i32 [%sel, %lpad], [%sel1, %lpad1]
+  %3 = call i8* @llvm.eh.begincatch(i8* %exn2)
+  call void @_Z10handle_intv()
+  %matches1 = icmp eq i32 %sel2, 0
+  br i1 %matches1, label %invoke.cont2, label %invoke.cont3
+
+invoke.cont2:                                     ; preds = %catch
+  call void @llvm.eh.endcatch()
+  br label %try.cont
+
+invoke.cont3:                                     ; preds = %catch
+  call void @llvm.eh.endcatch()
+  br label %eh.resume
+
+try.cont:                                         ; preds = %invoke.cont2, %entry
+  ret void
+
+eh.resume:                                        ; preds = %catch.dispatch
+  %lpad.val = insertvalue { i8*, i32 } undef, i32 0, 1
+  resume { i8*, i32 } %lpad.val
+}
+
+declare void @_Z9may_throwv()
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*)
+
+declare void @_Z10handle_intv()
+
diff --git a/test/Analysis/Lint/cppeh-catch-intrinsics.ll b/test/Analysis/Lint/cppeh-catch-intrinsics.ll
new file mode 100644
index 0000000..5ab73e35
--- /dev/null
+++ b/test/Analysis/Lint/cppeh-catch-intrinsics.ll
@@ -0,0 +1,278 @@
+; RUN: opt -lint -disable-output < %s 2>&1 | FileCheck %s
+
+; This test is meant to prove that the Verifier is able to identify a variety
+; of errors with the llvm.eh.begincatch and llvm.eh.endcatch intrinsics.
+; See cppeh-catch-intrinsics-clean for correct uses.
+
+target triple = "x86_64-pc-windows-msvc"
+
+declare i8* @llvm.eh.begincatch(i8*)
+
+declare void @llvm.eh.endcatch()
+
+@_ZTIi = external constant i8*
+
+; Function Attrs: uwtable
+define void @test_missing_endcatch() {
+; CHECK: Some paths from llvm.eh.begincatch may not reach llvm.eh.endcatch
+; CHECK-NEXT: %2 = call i8* @llvm.eh.begincatch(i8* %exn)
+entry:
+  invoke void @_Z9may_throwv()
+          to label %try.cont unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %exn = extractvalue { i8*, i32 } %0, 0
+  %sel = extractvalue { i8*, i32 } %0, 1
+  %1 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %sel, %1
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  %2 = call i8* @llvm.eh.begincatch(i8* %exn)
+  call void @_Z10handle_intv()
+  br label %invoke.cont2
+
+invoke.cont2:                                     ; preds = %catch
+  br label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont2, %entry
+  ret void
+
+eh.resume:                                        ; preds = %catch.dispatch
+  resume { i8*, i32 } %0
+}
+
+; Function Attrs: uwtable
+define void @test_missing_begincatch() {
+; CHECK: llvm.eh.endcatch may be reachable without passing llvm.eh.begincatch
+; CHECK-NEXT:  call void @llvm.eh.endcatch()
+entry:
+  invoke void @_Z9may_throwv()
+          to label %try.cont unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %exn = extractvalue { i8*, i32 } %0, 0
+  %sel = extractvalue { i8*, i32 } %0, 1
+  %1 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %sel, %1
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  call void @_Z10handle_intv()
+  br label %invoke.cont2
+
+invoke.cont2:                                     ; preds = %catch
+  call void @llvm.eh.endcatch()
+  br label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont2, %entry
+  ret void
+
+eh.resume:                                        ; preds = %catch.dispatch
+  resume { i8*, i32 } %0
+}
+
+; Function Attrs: uwtable
+define void @test_multiple_begin() {
+; CHECK: llvm.eh.begincatch may be called a second time before llvm.eh.endcatch
+; CHECK-NEXT:  %2 = call i8* @llvm.eh.begincatch(i8* %exn)
+; CHECK-NEXT:  %3 = call i8* @llvm.eh.begincatch(i8* %exn)
+entry:
+  invoke void @_Z9may_throwv()
+          to label %try.cont unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %exn = extractvalue { i8*, i32 } %0, 0
+  %sel = extractvalue { i8*, i32 } %0, 1
+  %1 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %sel, %1
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  %2 = call i8* @llvm.eh.begincatch(i8* %exn)
+  call void @_Z10handle_intv()
+  br label %invoke.cont2
+
+invoke.cont2:                                     ; preds = %catch
+  %3 = call i8* @llvm.eh.begincatch(i8* %exn)
+  call void @llvm.eh.endcatch()
+  br label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont2, %entry
+  ret void
+
+eh.resume:                                        ; preds = %catch.dispatch
+  resume { i8*, i32 } %0
+}
+
+; Function Attrs: uwtable
+define void @test_multiple_end() {
+; CHECK: llvm.eh.endcatch may be called a second time after llvm.eh.begincatch
+; CHECK-NEXT:  call void @llvm.eh.endcatch()
+; CHECK-NEXT:  call void @llvm.eh.endcatch()
+entry:
+  invoke void @_Z9may_throwv()
+          to label %try.cont unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %exn = extractvalue { i8*, i32 } %0, 0
+  %sel = extractvalue { i8*, i32 } %0, 1
+  %1 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %sel, %1
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  %2 = call i8* @llvm.eh.begincatch(i8* %exn)
+  call void @_Z10handle_intv()
+  call void @llvm.eh.endcatch()
+  br label %invoke.cont2
+
+invoke.cont2:                                     ; preds = %catch
+  call void @llvm.eh.endcatch()
+  br label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont2, %entry
+  ret void
+
+eh.resume:                                        ; preds = %catch.dispatch
+  resume { i8*, i32 } %0
+}
+
+
+; Function Attrs: uwtable
+define void @test_begincatch_without_lpad() {
+; CHECK: llvm.eh.begincatch may be reachable without passing a landingpad
+; CHECK-NEXT:  %0 = call i8* @llvm.eh.begincatch(i8* %exn)
+entry:
+  %exn = alloca i8
+  %0 = call i8* @llvm.eh.begincatch(i8* %exn)
+  call void @_Z10handle_intv()
+  br label %invoke.cont2
+
+invoke.cont2:                                     ; preds = %catch
+  call void @llvm.eh.endcatch()
+  br label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont2, %entry
+  ret void
+}
+
+; Function Attrs: uwtable
+define void @test_branch_to_begincatch_with_no_lpad(i32 %fake.sel) {
+; CHECK: llvm.eh.begincatch may be reachable without passing a landingpad
+; CHECK-NEXT:  %3 = call i8* @llvm.eh.begincatch(i8* %exn2)
+entry:
+  %fake.exn = alloca i8
+  invoke void @_Z9may_throwv()
+          to label %catch unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %exn = extractvalue { i8*, i32 } %0, 0
+  %sel = extractvalue { i8*, i32 } %0, 1
+  %1 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %sel, %1
+  br i1 %matches, label %catch, label %eh.resume
+
+  invoke void @_Z9may_throwv()
+          to label %try.cont unwind label %lpad
+
+catch:                                            ; preds = %lpad, %entry
+  %exn2 = phi i8* [%exn, %lpad], [%fake.exn, %entry]
+  %sel2 = phi i32 [%sel, %lpad], [%fake.sel, %entry]
+  %3 = call i8* @llvm.eh.begincatch(i8* %exn2)
+  call void @_Z10handle_intv()
+  %matches1 = icmp eq i32 %sel2, 0
+  br i1 %matches1, label %invoke.cont2, label %invoke.cont3
+
+invoke.cont2:                                     ; preds = %catch
+  call void @llvm.eh.endcatch()
+  br label %try.cont
+
+invoke.cont3:                                     ; preds = %catch
+  call void @llvm.eh.endcatch()
+  br label %eh.resume
+
+try.cont:                                         ; preds = %invoke.cont2
+  ret void
+
+eh.resume:                                        ; preds = %catch.dispatch
+  %lpad.val = insertvalue { i8*, i32 } undef, i32 0, 1
+  resume { i8*, i32 } %lpad.val
+}
+
+; Function Attrs: uwtable
+define void @test_branch_missing_endcatch() {
+; CHECK: Some paths from llvm.eh.begincatch may not reach llvm.eh.endcatch
+; CHECK-NEXT:  %3 = call i8* @llvm.eh.begincatch(i8* %exn2)
+entry:
+  invoke void @_Z9may_throwv()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  invoke void @_Z9may_throwv()
+          to label %invoke.cont unwind label %lpad1
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %exn = extractvalue { i8*, i32 } %0, 0
+  %sel = extractvalue { i8*, i32 } %0, 1
+  %1 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %sel, %1
+  br i1 %matches, label %catch, label %eh.resume
+
+  invoke void @_Z9may_throwv()
+          to label %try.cont unwind label %lpad
+
+lpad1:                                            ; preds = %entry
+  %l1.0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+		  cleanup
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %exn1 = extractvalue { i8*, i32 } %l1.0, 0
+  %sel1 = extractvalue { i8*, i32 } %l1.0, 1
+  %l1.1 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matchesl1 = icmp eq i32 %sel1, %l1.1
+  br i1 %matchesl1, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad, %lpad1
+  %exn2 = phi i8* [%exn, %lpad], [%exn1, %lpad1]
+  %sel2 = phi i32 [%sel, %lpad], [%sel1, %lpad1]
+  %3 = call i8* @llvm.eh.begincatch(i8* %exn2)
+  call void @_Z10handle_intv()
+  %matches1 = icmp eq i32 %sel2, 0
+  br i1 %matches1, label %invoke.cont2, label %invoke.cont3
+
+invoke.cont2:                                     ; preds = %catch
+  call void @llvm.eh.endcatch()
+  br label %try.cont
+
+invoke.cont3:                                     ; preds = %catch
+  br label %eh.resume
+
+try.cont:                                         ; preds = %invoke.cont2, %entry
+  ret void
+
+eh.resume:                                        ; preds = %catch.dispatch
+  %lpad.val = insertvalue { i8*, i32 } undef, i32 0, 1
+  resume { i8*, i32 } %lpad.val
+}
+
+declare void @_Z9may_throwv()
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*)
+
+declare void @_Z10handle_intv()
+
diff --git a/test/Analysis/LoopAccessAnalysis/backward-dep-different-types.ll b/test/Analysis/LoopAccessAnalysis/backward-dep-different-types.ll
new file mode 100644
index 0000000..f503a5c
--- /dev/null
+++ b/test/Analysis/LoopAccessAnalysis/backward-dep-different-types.ll
@@ -0,0 +1,50 @@
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+
+; In this loop just because we access A through different types (int, float)
+; we still have a dependence cycle:
+;
+;   for (i = 0; i < n; i++) {
+;    A_float = (float *) A;
+;    A_float[i + 1] = A[i] * B[i];
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK: Report: unsafe dependent memory operations in loop
+; CHECK-NOT: Memory dependences are safe
+
+@n = global i32 20, align 4
+@B = common global i32* null, align 8
+@A = common global i32* null, align 8
+
+define void @f() {
+entry:
+  %a = load i32** @A, align 8
+  %b = load i32** @B, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %storemerge3 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32* %a, i64 %storemerge3
+  %loadA = load i32* %arrayidxA, align 2
+
+  %arrayidxB = getelementptr inbounds i32* %b, i64 %storemerge3
+  %loadB = load i32* %arrayidxB, align 2
+
+  %mul = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %storemerge3, 1
+
+  %a_float = bitcast i32* %a to float*
+  %arrayidxA_plus_2 = getelementptr inbounds float* %a_float, i64 %add
+  %mul_float = sitofp i32 %mul to float
+  store float %mul_float, float* %arrayidxA_plus_2, align 2
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-no-dbg.ll b/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-no-dbg.ll
new file mode 100644
index 0000000..62291d5
--- /dev/null
+++ b/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-no-dbg.ll
@@ -0,0 +1,60 @@
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+
+; FIXME: This is the non-debug version of unsafe-and-rt-checks.ll not
+; requiring "asserts".  Once we can check memory dependences without -debug,
+; we should remove this test.
+
+; Analyze this loop:
+;   for (i = 0; i < n; i++)
+;    A[i + 1] = A[i] * B[i] * C[i];
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK: Report: unsafe dependent memory operations in loop
+
+; CHECK: Run-time memory checks:
+; CHECK-NEXT: 0:
+; CHECK-NEXT:   %arrayidxA_plus_2 = getelementptr inbounds i16* %a, i64 %add
+; CHECK-NEXT:   %arrayidxB = getelementptr inbounds i16* %b, i64 %storemerge3
+; CHECK-NEXT: 1:
+; CHECK-NEXT:   %arrayidxA_plus_2 = getelementptr inbounds i16* %a, i64 %add
+; CHECK-NEXT:   %arrayidxC = getelementptr inbounds i16* %c, i64 %storemerge3
+
+@n = global i32 20, align 4
+@B = common global i16* null, align 8
+@A = common global i16* null, align 8
+@C = common global i16* null, align 8
+
+define void @f() {
+entry:
+  %a = load i16** @A, align 8
+  %b = load i16** @B, align 8
+  %c = load i16** @C, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %storemerge3 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i16* %a, i64 %storemerge3
+  %loadA = load i16* %arrayidxA, align 2
+
+  %arrayidxB = getelementptr inbounds i16* %b, i64 %storemerge3
+  %loadB = load i16* %arrayidxB, align 2
+
+  %arrayidxC = getelementptr inbounds i16* %c, i64 %storemerge3
+  %loadC = load i16* %arrayidxC, align 2
+
+  %mul = mul i16 %loadB, %loadA
+  %mul1 = mul i16 %mul, %loadC
+
+  %add = add nuw nsw i64 %storemerge3, 1
+  %arrayidxA_plus_2 = getelementptr inbounds i16* %a, i64 %add
+  store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll b/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll
new file mode 100644
index 0000000..4769a3a
--- /dev/null
+++ b/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll
@@ -0,0 +1,61 @@
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+; RUN: opt -loop-accesses -analyze -debug-only=loop-accesses < %s 2>&1 | FileCheck %s --check-prefix=DEBUG
+; REQUIRES: asserts
+
+; Analyze this loop:
+;   for (i = 0; i < n; i++)
+;    A[i + 1] = A[i] * B[i] * C[i];
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK: Report: unsafe dependent memory operations in loop
+
+; DEBUG: LAA: Distance for   %loadA = load i16* %arrayidxA, align 2 to   store i16 %mul1, i16* %arrayidxA_plus_2, align 2: 2
+; DEBUG-NEXT: LAA: Failure because of Positive distance 2
+
+; CHECK: Run-time memory checks:
+; CHECK-NEXT: 0:
+; CHECK-NEXT:   %arrayidxA_plus_2 = getelementptr inbounds i16* %a, i64 %add
+; CHECK-NEXT:   %arrayidxB = getelementptr inbounds i16* %b, i64 %storemerge3
+; CHECK-NEXT: 1:
+; CHECK-NEXT:   %arrayidxA_plus_2 = getelementptr inbounds i16* %a, i64 %add
+; CHECK-NEXT:   %arrayidxC = getelementptr inbounds i16* %c, i64 %storemerge3
+
+@n = global i32 20, align 4
+@B = common global i16* null, align 8
+@A = common global i16* null, align 8
+@C = common global i16* null, align 8
+
+define void @f() {
+entry:
+  %a = load i16** @A, align 8
+  %b = load i16** @B, align 8
+  %c = load i16** @C, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %storemerge3 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i16* %a, i64 %storemerge3
+  %loadA = load i16* %arrayidxA, align 2
+
+  %arrayidxB = getelementptr inbounds i16* %b, i64 %storemerge3
+  %loadB = load i16* %arrayidxB, align 2
+
+  %arrayidxC = getelementptr inbounds i16* %c, i64 %storemerge3
+  %loadC = load i16* %arrayidxC, align 2
+
+  %mul = mul i16 %loadB, %loadA
+  %mul1 = mul i16 %mul, %loadC
+
+  %add = add nuw nsw i64 %storemerge3, 1
+  %arrayidxA_plus_2 = getelementptr inbounds i16* %a, i64 %add
+  store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll b/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll
index a87bab7..599b3e4 100644
--- a/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll
+++ b/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll
@@ -2,6 +2,7 @@
 ; not a child of the loopentry.6 loop.
 ;
 ; RUN: opt < %s -analyze -loops | FileCheck %s
+; RUN: opt < %s -passes='print<loops>' -disable-output 2>&1 | FileCheck %s
 
 ; CHECK: Loop at depth 4 containing: %loopentry.7<header><latch><exiting>
 
diff --git a/test/Analysis/ScalarEvolution/incorrect-nsw.ll b/test/Analysis/ScalarEvolution/incorrect-nsw.ll
new file mode 100644
index 0000000..dd981c4
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/incorrect-nsw.ll
@@ -0,0 +1,26 @@
+; RUN: opt -analyze -scalar-evolution -scalar-evolution < %s | FileCheck %s
+
+define void @bad.nsw() {
+; CHECK-LABEL: Classifying expressions for: @bad.nsw
+; CHECK-LABEL: Classifying expressions for: @bad.nsw
+ entry: 
+  br label %loop
+
+ loop:
+  %i = phi i8 [ -1, %entry ], [ %i.inc, %loop ]
+; CHECK:  %i = phi i8 [ -1, %entry ], [ %i.inc, %loop ]
+; CHECK-NEXT: -->  {-1,+,-128}<nw><%loop>
+; CHECK-NOT: -->  {-1,+,-128}<nsw><%loop>
+
+  %counter = phi i8 [ 0, %entry ], [ %counter.inc, %loop ]
+
+  %i.inc = add i8 %i, -128
+  %i.sext = sext i8 %i to i16
+
+  %counter.inc = add i8 %counter, 1
+  %continue = icmp eq i8 %counter, 1
+  br i1 %continue, label %exit, label %loop
+
+ exit:
+  ret void  
+}
diff --git a/test/Analysis/ScalarEvolution/infer-prestart-no-wrap.ll b/test/Analysis/ScalarEvolution/infer-prestart-no-wrap.ll
new file mode 100644
index 0000000..c9689f7
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/infer-prestart-no-wrap.ll
@@ -0,0 +1,101 @@
+; ; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s
+
+define void @infer.sext.0(i1* %c, i32 %start) {
+; CHECK-LABEL: Classifying expressions for: @infer.sext.0
+ entry:
+  br label %loop
+
+ loop:
+  %counter = phi i32 [ 0, %entry ], [ %counter.inc, %loop ]
+  %idx = phi i32 [ %start, %entry ], [ %idx.inc, %loop ]
+  %idx.inc = add nsw i32 %idx, 1
+  %idx.inc.sext = sext i32 %idx.inc to i64
+; CHECK: %idx.inc.sext = sext i32 %idx.inc to i64
+; CHECK-NEXT: -->  {(1 + (sext i32 %start to i64)),+,1}<nsw><%loop>
+  %condition = icmp eq i32 %counter, 1
+  %counter.inc = add i32 %counter, 1
+  br i1 %condition, label %exit, label %loop
+
+ exit:
+  ret void
+}
+
+define void @infer.zext.0(i1* %c, i32 %start) {
+; CHECK-LABEL: Classifying expressions for: @infer.zext.0
+ entry:
+  br label %loop
+
+ loop:
+  %counter = phi i32 [ 0, %entry ], [ %counter.inc, %loop ]
+  %idx = phi i32 [ %start, %entry ], [ %idx.inc, %loop ]
+  %idx.inc = add nuw i32 %idx, 1
+  %idx.inc.sext = zext i32 %idx.inc to i64
+; CHECK: %idx.inc.sext = zext i32 %idx.inc to i64
+; CHECK-NEXT: -->  {(1 + (zext i32 %start to i64)),+,1}<nuw><%loop>
+  %condition = icmp eq i32 %counter, 1
+  %counter.inc = add i32 %counter, 1
+  br i1 %condition, label %exit, label %loop
+
+ exit:
+  ret void
+}
+
+define void @infer.sext.1(i32 %start, i1* %c) {
+; CHECK-LABEL: Classifying expressions for: @infer.sext.1
+ entry:
+  %start.mul = mul i32 %start, 4
+  %start.real = add i32 %start.mul, 2
+  br label %loop
+
+ loop:
+  %idx = phi i32 [ %start.real, %entry ], [ %idx.inc, %loop ]
+  %idx.sext = sext i32 %idx to i64
+; CHECK: %idx.sext = sext i32 %idx to i64
+; CHECK-NEXT:  -->  {(2 + (sext i32 (4 * %start) to i64)),+,2}<nsw><%loop>
+  %idx.inc = add nsw i32 %idx, 2
+  %condition = load i1* %c
+  br i1 %condition, label %exit, label %loop
+
+ exit:
+  ret void
+}
+
+define void @infer.sext.2(i1* %c, i8 %start) {
+; CHECK-LABEL: Classifying expressions for: @infer.sext.2
+ entry:
+  %start.inc = add i8 %start, 1
+  %entry.condition = icmp slt i8 %start, 127
+  br i1 %entry.condition, label %loop, label %exit
+
+ loop:
+  %idx = phi i8 [ %start.inc, %entry ], [ %idx.inc, %loop ]
+  %idx.sext = sext i8 %idx to i16
+; CHECK: %idx.sext = sext i8 %idx to i16
+; CHECK-NEXT: -->  {(1 + (sext i8 %start to i16)),+,1}<nsw><%loop>
+  %idx.inc = add nsw i8 %idx, 1
+  %condition = load volatile i1* %c
+  br i1 %condition, label %exit, label %loop
+
+ exit:
+  ret void
+}
+
+define void @infer.zext.1(i1* %c, i8 %start) {
+; CHECK-LABEL: Classifying expressions for: @infer.zext.1
+ entry:
+  %start.inc = add i8 %start, 1
+  %entry.condition = icmp ult i8 %start, 255
+  br i1 %entry.condition, label %loop, label %exit
+
+ loop:
+  %idx = phi i8 [ %start.inc, %entry ], [ %idx.inc, %loop ]
+  %idx.zext = zext i8 %idx to i16
+; CHECK: %idx.zext = zext i8 %idx to i16
+; CHECK-NEXT: -->  {(1 + (zext i8 %start to i16)),+,1}<nuw><%loop>
+  %idx.inc = add nuw i8 %idx, 1
+  %condition = load volatile i1* %c
+  br i1 %condition, label %exit, label %loop
+
+ exit:
+  ret void
+}
diff --git a/test/Analysis/ScalarEvolution/load-with-range-metadata.ll b/test/Analysis/ScalarEvolution/load-with-range-metadata.ll
index 2f6dcd0..32c1074 100644
--- a/test/Analysis/ScalarEvolution/load-with-range-metadata.ll
+++ b/test/Analysis/ScalarEvolution/load-with-range-metadata.ll
@@ -34,4 +34,4 @@ define i32 @ult_trip_count_with_range(i32 *%ptr0, i32 *%ptr1) {
   ret i32 0
 }
 
-!0 = metadata !{i32 1, i32 100}
+!0 = !{i32 1, i32 100}
diff --git a/test/Analysis/ScalarEvolution/min-max-exprs.ll b/test/Analysis/ScalarEvolution/min-max-exprs.ll
new file mode 100644
index 0000000..3e0a35d
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/min-max-exprs.ll
@@ -0,0 +1,53 @@
+; RUN: opt -scalar-evolution -analyze < %s | FileCheck %s
+;
+; This checks if the min and max expressions are properly recognized by
+; ScalarEvolution even though they the ICmpInst and SelectInst have different
+; types.
+;
+;    #define max(a, b) (a > b ? a : b)
+;    #define min(a, b) (a < b ? a : b)
+;
+;    void f(int *A, int N) {
+;      for (int i = 0; i < N; i++) {
+;        A[max(0, i - 3)] = A[min(N, i + 3)] * 2;
+;      }
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32 %N) {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb2, %bb
+  %i.0 = phi i32 [ 0, %bb ], [ %tmp23, %bb2 ]
+  %i.0.1 = sext i32 %i.0 to i64
+  %tmp = icmp slt i32 %i.0, %N
+  br i1 %tmp, label %bb2, label %bb24
+
+bb2:                                              ; preds = %bb1
+  %tmp3 = add nuw nsw i32 %i.0, 3
+  %tmp4 = icmp slt i32 %tmp3, %N
+  %tmp5 = sext i32 %tmp3 to i64
+  %tmp6 = sext i32 %N to i64
+  %tmp9 = select i1 %tmp4, i64 %tmp5, i64 %tmp6
+;                  min(N, i+3)
+; CHECK:           select i1 %tmp4, i64 %tmp5, i64 %tmp6
+; CHECK-NEXT:  --> (-1 + (-1 * ((-1 + (-1 * (sext i32 {3,+,1}<nw><%bb1> to i64))) smax (-1 + (-1 * (sext i32 %N to i64))))))
+  %tmp11 = getelementptr inbounds i32* %A, i64 %tmp9
+  %tmp12 = load i32* %tmp11, align 4
+  %tmp13 = shl nsw i32 %tmp12, 1
+  %tmp14 = icmp sge i32 3, %i.0
+  %tmp17 = add nsw i64 %i.0.1, -3
+  %tmp19 = select i1 %tmp14, i64 0, i64 %tmp17
+;                  max(0, i - 3)
+; CHECK:           select i1 %tmp14, i64 0, i64 %tmp17
+; CHECK-NEXT: -->  (-3 + (3 smax {0,+,1}<nuw><nsw><%bb1>))
+  %tmp21 = getelementptr inbounds i32* %A, i64 %tmp19
+  store i32 %tmp13, i32* %tmp21, align 4
+  %tmp23 = add nuw nsw i32 %i.0, 1
+  br label %bb1
+
+bb24:                                             ; preds = %bb1
+  ret void
+}
diff --git a/test/Analysis/ScalarEvolution/nw-sub-is-not-nw-add.ll b/test/Analysis/ScalarEvolution/nw-sub-is-not-nw-add.ll
new file mode 100644
index 0000000..41b07d5
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/nw-sub-is-not-nw-add.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+; Check that SCEV does not assume sub nuw X Y == add nuw X, -Y
+define void @f(i32* %loc) {
+; CHECK-LABEL: @f
+ entry:
+  br label %loop
+
+ loop:
+  %idx = phi i32 [ 6, %entry ], [ %idx.dec, %loop ]
+  store i32 %idx, i32* %loc
+  %idx.dec = sub nuw i32 %idx, 1
+  %cond = icmp uge i32 %idx.dec, 5
+  br i1 %cond, label %loop, label %exit
+; CHECK-NOT: br i1 true, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+declare void @use_i1(i1)
+
+; Check that SCEV does not assume sub nsw X Y == add nsw X, -Y
+define void @g(i32 %lim) {
+; CHECK-LABEL: @g
+ entry:
+  br label %loop
+
+ loop:
+  %idx = phi i32 [ -1, %entry ], [ %idx.dec, %loop ]
+  %t = icmp sgt i32 %idx, 0
+; CHECK-NOT:   call void @use_i1(i1 false)
+; CHECK: call void @use_i1(i1 %t)
+  call void @use_i1(i1 %t)
+  %idx.dec = sub nsw i32 %idx, -2147483648
+  %cond = icmp eq i32 %idx.dec, %lim
+  br i1 %cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
diff --git a/test/Analysis/ScalarEvolution/pr22179.ll b/test/Analysis/ScalarEvolution/pr22179.ll
new file mode 100644
index 0000000..d9fb510
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/pr22179.ll
@@ -0,0 +1,28 @@
+; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s
+
+%struct.anon = type { i8 }
+%struct.S = type { i32 }
+
+@a = common global %struct.anon zeroinitializer, align 1
+@b = common global %struct.S zeroinitializer, align 4
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @main() {
+; CHECK-LABEL: Classifying expressions for: @main
+  store i8 0, i8* getelementptr inbounds (%struct.anon* @a, i64 0, i32 0), align 1
+  br label %loop
+
+loop:
+  %storemerge1 = phi i8 [ 0, %0 ], [ %inc, %loop ]
+  %m = load volatile i32* getelementptr inbounds (%struct.S* @b, i64 0, i32 0), align 4
+  %inc = add nuw i8 %storemerge1, 1
+; CHECK:   %inc = add nuw i8 %storemerge1, 1
+; CHECK-NEXT: -->  {1,+,1}<nuw><%loop>
+; CHECK-NOT: -->  {1,+,1}<nuw><nsw><%loop>
+  %exitcond = icmp eq i8 %inc, -128
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  store i8 -128, i8* getelementptr inbounds (%struct.anon* @a, i64 0, i32 0), align 1
+  ret i32 0
+}
diff --git a/test/Analysis/ScalarEvolution/pr22641.ll b/test/Analysis/ScalarEvolution/pr22641.ll
new file mode 100644
index 0000000..3b55afe
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/pr22641.ll
@@ -0,0 +1,25 @@
+; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s
+
+define i1 @main(i16 %a) {
+; CHECK-LABEL: Classifying expressions for: @main
+entry:
+  br label %body
+
+body:
+  %dec2 = phi i16 [ %a, %entry ], [ %dec, %cond ]
+  %dec = add i16 %dec2, -1
+  %conv2 = zext i16 %dec2 to i32
+  %conv = zext i16 %dec to i32
+; CHECK:   %conv = zext i16 %dec to i32
+; CHECK-NEXT: -->  {(zext i16 (-1 + %a) to i32),+,65535}<nuw><%body>
+; CHECK-NOT:  -->  {(65535 + (zext i16 %a to i32)),+,65535}<nuw><%body>
+
+  br label %cond
+
+cond:
+  br i1 false, label %body, label %exit
+
+exit:
+  %ret = icmp ne i32 %conv, 0
+  ret i1 %ret
+}
diff --git a/test/Analysis/ScalarEvolution/pr22674.ll b/test/Analysis/ScalarEvolution/pr22674.ll
new file mode 100644
index 0000000..7defcb9
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/pr22674.ll
@@ -0,0 +1,101 @@
+; RUN: opt -loop-reduce -S %s
+
+
+target datalayout = "e-m:e-p:32:32-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnux32"
+
+%"class.llvm::AttributeSetNode.230.2029.3828.6141.6912.7683.8454.9482.9996.10253.18506" = type { %"class.llvm::FoldingSetImpl::Node.1.1801.3600.5913.6684.7455.8226.9254.9768.10025.18505", i32 }
+%"class.llvm::FoldingSetImpl::Node.1.1801.3600.5913.6684.7455.8226.9254.9768.10025.18505" = type { i8* }
+%"struct.std::pair.241.2040.3839.6152.6923.7694.8465.9493.10007.10264.18507" = type { i32, %"class.llvm::AttributeSetNode.230.2029.3828.6141.6912.7683.8454.9482.9996.10253.18506"* }
+%"class.llvm::Attribute.222.2021.3820.6133.6904.7675.8446.9474.9988.10245.18509" = type { %"class.llvm::AttributeImpl.2.1802.3601.5914.6685.7456.8227.9255.9769.10026.18508"* }
+%"class.llvm::AttributeImpl.2.1802.3601.5914.6685.7456.8227.9255.9769.10026.18508" = type <{ i32 (...)**, %"class.llvm::FoldingSetImpl::Node.1.1801.3600.5913.6684.7455.8226.9254.9768.10025.18505", i8, [3 x i8] }>
+
+; Function Attrs: nounwind uwtable
+define void @_ZNK4llvm11AttrBuilder13hasAttributesENS_12AttributeSetEy() #0 align 2 {
+entry:
+  br i1 undef, label %cond.false, label %_ZNK4llvm12AttributeSet11getNumSlotsEv.exit
+
+_ZNK4llvm12AttributeSet11getNumSlotsEv.exit:      ; preds = %entry
+  br i1 undef, label %cond.false, label %for.body.lr.ph.for.body.lr.ph.split_crit_edge
+
+for.body.lr.ph.for.body.lr.ph.split_crit_edge:    ; preds = %_ZNK4llvm12AttributeSet11getNumSlotsEv.exit
+  br label %land.lhs.true.i
+
+land.lhs.true.i:                                  ; preds = %for.inc, %for.body.lr.ph.for.body.lr.ph.split_crit_edge
+  %I.099 = phi i32 [ 0, %for.body.lr.ph.for.body.lr.ph.split_crit_edge ], [ %inc, %for.inc ]
+  %cmp.i = icmp ugt i32 undef, %I.099
+  br i1 %cmp.i, label %_ZNK4llvm12AttributeSet12getSlotIndexEj.exit, label %cond.false.i.split
+
+cond.false.i.split:                               ; preds = %land.lhs.true.i
+  unreachable
+
+_ZNK4llvm12AttributeSet12getSlotIndexEj.exit:     ; preds = %land.lhs.true.i
+  br i1 undef, label %for.end, label %for.inc
+
+for.inc:                                          ; preds = %_ZNK4llvm12AttributeSet12getSlotIndexEj.exit
+  %inc = add i32 %I.099, 1
+  br i1 undef, label %cond.false, label %land.lhs.true.i
+
+for.end:                                          ; preds = %_ZNK4llvm12AttributeSet12getSlotIndexEj.exit
+  %I.099.lcssa129 = phi i32 [ %I.099, %_ZNK4llvm12AttributeSet12getSlotIndexEj.exit ]
+  br i1 undef, label %cond.false, label %_ZNK4llvm12AttributeSet3endEj.exit
+
+cond.false:                                       ; preds = %for.end, %for.inc, %_ZNK4llvm12AttributeSet11getNumSlotsEv.exit, %entry
+  unreachable
+
+_ZNK4llvm12AttributeSet3endEj.exit:               ; preds = %for.end
+  %second.i.i.i = getelementptr inbounds %"struct.std::pair.241.2040.3839.6152.6923.7694.8465.9493.10007.10264.18507"* undef, i32 %I.099.lcssa129, i32 1
+  %0 = load %"class.llvm::AttributeSetNode.230.2029.3828.6141.6912.7683.8454.9482.9996.10253.18506"** %second.i.i.i, align 4, !tbaa !2
+  %NumAttrs.i.i.i = getelementptr inbounds %"class.llvm::AttributeSetNode.230.2029.3828.6141.6912.7683.8454.9482.9996.10253.18506"* %0, i32 0, i32 1
+  %1 = load i32* %NumAttrs.i.i.i, align 4, !tbaa !8
+  %add.ptr.i.i.i55 = getelementptr inbounds %"class.llvm::Attribute.222.2021.3820.6133.6904.7675.8446.9474.9988.10245.18509"* undef, i32 %1
+  br i1 undef, label %return, label %for.body11
+
+for.cond9:                                        ; preds = %_ZNK4llvm9Attribute13getKindAsEnumEv.exit
+  %cmp10 = icmp eq %"class.llvm::Attribute.222.2021.3820.6133.6904.7675.8446.9474.9988.10245.18509"* %incdec.ptr, %add.ptr.i.i.i55
+  br i1 %cmp10, label %return, label %for.body11
+
+for.body11:                                       ; preds = %for.cond9, %_ZNK4llvm12AttributeSet3endEj.exit
+  %I5.096 = phi %"class.llvm::Attribute.222.2021.3820.6133.6904.7675.8446.9474.9988.10245.18509"* [ %incdec.ptr, %for.cond9 ], [ undef, %_ZNK4llvm12AttributeSet3endEj.exit ]
+  %2 = bitcast %"class.llvm::Attribute.222.2021.3820.6133.6904.7675.8446.9474.9988.10245.18509"* %I5.096 to i32*
+  %3 = load i32* %2, align 4, !tbaa !10
+  %tobool.i59 = icmp eq i32 %3, 0
+  br i1 %tobool.i59, label %cond.false21, label %_ZNK4llvm9Attribute15isEnumAttributeEv.exit
+
+_ZNK4llvm9Attribute15isEnumAttributeEv.exit:      ; preds = %for.body11
+  switch i8 undef, label %cond.false21 [
+    i8 0, label %_ZNK4llvm9Attribute13getKindAsEnumEv.exit
+    i8 1, label %_ZNK4llvm9Attribute13getKindAsEnumEv.exit
+    i8 2, label %_ZNK4llvm9Attribute15getKindAsStringEv.exit
+  ]
+
+_ZNK4llvm9Attribute13getKindAsEnumEv.exit:        ; preds = %_ZNK4llvm9Attribute15isEnumAttributeEv.exit, %_ZNK4llvm9Attribute15isEnumAttributeEv.exit
+  %incdec.ptr = getelementptr inbounds %"class.llvm::Attribute.222.2021.3820.6133.6904.7675.8446.9474.9988.10245.18509"* %I5.096, i32 1
+  br i1 undef, label %for.cond9, label %return
+
+cond.false21:                                     ; preds = %_ZNK4llvm9Attribute15isEnumAttributeEv.exit, %for.body11
+  unreachable
+
+_ZNK4llvm9Attribute15getKindAsStringEv.exit:      ; preds = %_ZNK4llvm9Attribute15isEnumAttributeEv.exit
+  unreachable
+
+return:                                           ; preds = %_ZNK4llvm9Attribute13getKindAsEnumEv.exit, %for.cond9, %_ZNK4llvm12AttributeSet3endEj.exit
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 3.7.0 (ssh://llvm@gnu-4.sc.intel.com/export/server/git/llvm/clang 4c31740d4f81614b6d278c7825cfdae5a1c78799) (llvm/llvm.git b693958bd09144aed90312709a7e2ccf7124eb53)"}
+!2 = !{!3, !7, i64 4}
+!3 = !{!"_ZTSSt4pairIjPN4llvm16AttributeSetNodeEE", !4, i64 0, !7, i64 4}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!"any pointer", !5, i64 0}
+!8 = !{!9, !4, i64 4}
+!9 = !{!"_ZTSN4llvm16AttributeSetNodeE", !4, i64 4}
+!10 = !{!7, !7, i64 0}
diff --git a/test/Analysis/ScalarEvolution/scev-expander-incorrect-nowrap.ll b/test/Analysis/ScalarEvolution/scev-expander-incorrect-nowrap.ll
new file mode 100644
index 0000000..012cad7
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/scev-expander-incorrect-nowrap.ll
@@ -0,0 +1,30 @@
+; RUN: opt -indvars -S < %s | FileCheck %s
+
+declare void @use(i32)
+declare void @use.i8(i8)
+
+define void @f() {
+; CHECK-LABEL: @f
+ entry:
+  br label %loop
+
+ loop:
+; The only use for idx.mirror is to induce an nuw for %idx.  It does
+; not induce an nuw for %idx.inc
+  %idx.mirror = phi i8 [ -6, %entry ], [ %idx.mirror.inc, %loop ]
+  %idx = phi i8 [ -5, %entry ], [ %idx.inc, %loop ]
+
+  %idx.sext = sext i8 %idx to i32
+  call void @use(i32 %idx.sext)
+
+  %idx.mirror.inc = add nuw i8 %idx.mirror, 1
+  call void @use.i8(i8 %idx.mirror.inc)
+
+  %idx.inc = add i8 %idx, 1
+; CHECK-NOT: %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %cmp = icmp ugt i8 %idx.inc, 0
+  br i1 %cmp, label %loop, label %exit
+
+ exit:
+  ret void
+}
diff --git a/test/Analysis/ScalarEvolution/scev-prestart-nowrap.ll b/test/Analysis/ScalarEvolution/scev-prestart-nowrap.ll
new file mode 100644
index 0000000..3ca32bd
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/scev-prestart-nowrap.ll
@@ -0,0 +1,82 @@
+; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s
+
+; An example run where SCEV(%postinc)->getStart() may overflow:
+;
+; %start = INT_SMAX
+; %low.limit = INT_SMIN
+; %high.limit = < not used >
+;
+; >> entry:
+;  %postinc.start = INT_SMIN
+;
+; >> loop:
+;  %idx = %start 
+;  %postinc = INT_SMIN
+;  %postinc.inc = INT_SMIN + 1
+;  %postinc.sext = sext(INT_SMIN) = i64 INT32_SMIN
+;  %break.early = INT_SMIN `slt` INT_SMIN = false
+;  br i1 false, ___,  %early.exit
+;
+; >> early.exit:
+;  ret i64 INT32_SMIN
+
+
+define i64 @bad.0(i32 %start, i32 %low.limit, i32 %high.limit) {
+; CHECK-LABEL: Classifying expressions for: @bad.0
+ entry:
+  %postinc.start = add i32 %start, 1
+  br label %loop
+
+ loop:
+  %idx = phi i32 [ %start, %entry ], [ %idx.inc, %continue ]
+  %postinc = phi i32 [ %postinc.start, %entry ], [ %postinc.inc, %continue ]
+  %postinc.inc = add nsw i32 %postinc, 1
+  %postinc.sext = sext i32 %postinc to i64
+; CHECK:  %postinc.sext = sext i32 %postinc to i64
+; CHECK-NEXT:  -->  {(sext i32 (1 + %start) to i64),+,1}<nsw><%loop>
+  %break.early = icmp slt i32 %postinc, %low.limit
+  br i1 %break.early, label %continue, label %early.exit
+
+ continue:
+  %idx.inc = add nsw i32 %idx, 1
+  %cmp = icmp slt i32 %idx.inc, %high.limit
+  br i1 %cmp, label %loop, label %exit
+
+ exit:
+  ret i64 0
+
+ early.exit:
+  ret i64 %postinc.sext
+}
+
+define i64 @bad.1(i32 %start, i32 %low.limit, i32 %high.limit, i1* %unknown) {
+; CHECK-LABEL: Classifying expressions for: @bad.1
+ entry:
+  %postinc.start = add i32 %start, 1
+  br label %loop
+
+ loop:
+  %idx = phi i32 [ %start, %entry ], [ %idx.inc, %continue ], [ %idx.inc, %continue.1 ]
+  %postinc = phi i32 [ %postinc.start, %entry ], [ %postinc.inc, %continue ], [ %postinc.inc, %continue.1 ]
+  %postinc.inc = add nsw i32 %postinc, 1
+  %postinc.sext = sext i32 %postinc to i64
+; CHECK:  %postinc.sext = sext i32 %postinc to i64
+; CHECK-NEXT:  -->  {(sext i32 (1 + %start) to i64),+,1}<nsw><%loop>
+  %break.early = icmp slt i32 %postinc, %low.limit
+  br i1 %break.early, label %continue.1, label %early.exit
+
+ continue.1:
+  %cond = load volatile i1* %unknown
+  %idx.inc = add nsw i32 %idx, 1
+  br i1 %cond, label %loop, label %continue
+
+ continue:
+  %cmp = icmp slt i32 %idx.inc, %high.limit
+  br i1 %cmp, label %loop, label %exit
+
+ exit:
+  ret i64 0
+
+ early.exit:
+  ret i64 %postinc.sext
+}
diff --git a/test/Analysis/ScalarEvolution/zext-signed-addrec.ll b/test/Analysis/ScalarEvolution/zext-signed-addrec.ll
index 27aed3b..4369820 100644
--- a/test/Analysis/ScalarEvolution/zext-signed-addrec.ll
+++ b/test/Analysis/ScalarEvolution/zext-signed-addrec.ll
@@ -43,7 +43,7 @@ if.end:                                           ; preds = %if.end, %for.cond1.
   %shl = and i32 %conv7, 510
   store i32 %shl, i32* @c, align 4
 
-; CHECK: %lsr.iv.next = add i32 %lsr.iv, -258
+; CHECK: %lsr.iv.next = add nsw i32 %lsr.iv, -258
   %dec = add i8 %2, -1
 
   %cmp2 = icmp sgt i8 %dec, -1
diff --git a/test/Analysis/ScopedNoAliasAA/basic-domains.ll b/test/Analysis/ScopedNoAliasAA/basic-domains.ll
index d88a496..7633a6d 100644
--- a/test/Analysis/ScopedNoAliasAA/basic-domains.ll
+++ b/test/Analysis/ScopedNoAliasAA/basic-domains.ll
@@ -22,25 +22,25 @@ entry:
 
 attributes #0 = { nounwind uwtable }
 
-!0 = metadata !{metadata !0, metadata !"some domain"}
-!1 = metadata !{metadata !1, metadata !"some other domain"}
+!0 = !{!0, !"some domain"}
+!1 = !{!1, !"some other domain"}
 
 ; Two scopes (which must be self-referential to avoid being "uniqued"):
-!2 = metadata !{metadata !2, metadata !0, metadata !"a scope in dom0"}
-!3 = metadata !{metadata !2}
+!2 = !{!2, !0, !"a scope in dom0"}
+!3 = !{!2}
 
-!4 = metadata !{metadata !4, metadata !0, metadata !"another scope in dom0"}
-!5 = metadata !{metadata !4}
+!4 = !{!4, !0, !"another scope in dom0"}
+!5 = !{!4}
 
 ; A list of the two scopes.
-!6 = metadata !{metadata !2, metadata !4}
+!6 = !{!2, !4}
 
 ; Another scope in the second domain
-!7 = metadata !{metadata !7, metadata !1, metadata !"another scope in dom1"}
-!8 = metadata !{metadata !7}
+!7 = !{!7, !1, !"another scope in dom1"}
+!8 = !{!7}
 
 ; A list of scopes from both domains.
-!9 = metadata !{metadata !2, metadata !4, metadata !7}
+!9 = !{!2, !4, !7}
 
 ; CHECK: NoAlias:   %0 = load float* %c, align 4, !alias.scope !0 <->   store float %0, float* %arrayidx.i, align 4, !noalias !6
 ; CHECK: NoAlias:   %0 = load float* %c, align 4, !alias.scope !0 <->   store float %1, float* %arrayidx.i2, align 4, !noalias !6
diff --git a/test/Analysis/ScopedNoAliasAA/basic.ll b/test/Analysis/ScopedNoAliasAA/basic.ll
index 73fe333..bb232b5 100644
--- a/test/Analysis/ScopedNoAliasAA/basic.ll
+++ b/test/Analysis/ScopedNoAliasAA/basic.ll
@@ -22,6 +22,6 @@ entry:
 
 attributes #0 = { nounwind uwtable }
 
-!0 = metadata !{metadata !0, metadata !"some domain"}
-!1 = metadata !{metadata !1, metadata !0, metadata !"some scope"}
+!0 = !{!0, !"some domain"}
+!1 = !{!1, !0, !"some scope"}
 
diff --git a/test/Analysis/ScopedNoAliasAA/basic2.ll b/test/Analysis/ScopedNoAliasAA/basic2.ll
index 37b0add..a154b13 100644
--- a/test/Analysis/ScopedNoAliasAA/basic2.ll
+++ b/test/Analysis/ScopedNoAliasAA/basic2.ll
@@ -32,10 +32,10 @@ entry:
 
 attributes #0 = { nounwind uwtable }
 
-!0 = metadata !{metadata !1, metadata !3}
-!1 = metadata !{metadata !1, metadata !2, metadata !"some scope"}
-!2 = metadata !{metadata !2, metadata !"some domain"}
-!3 = metadata !{metadata !3, metadata !2, metadata !"some other scope"}
-!4 = metadata !{metadata !1}
-!5 = metadata !{metadata !3}
+!0 = !{!1, !3}
+!1 = !{!1, !2, !"some scope"}
+!2 = !{!2, !"some domain"}
+!3 = !{!3, !2, !"some other scope"}
+!4 = !{!1}
+!5 = !{!3}
 
diff --git a/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll b/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
index 9051139..920d6f5 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
@@ -32,14 +32,14 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.4"}
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"any pointer", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
-!5 = metadata !{metadata !6, metadata !2, i64 8}
-!6 = metadata !{metadata !"_ZTSN12_GLOBAL__N_11RINS_1FIPi8TreeIterN1I1S1LENS_1KINS_1DIKS2_S3_EEEEE1GEPSD_EE", metadata !7, i64 8}
-!7 = metadata !{metadata !"_ZTSN12_GLOBAL__N_11FIPi8TreeIterN1I1S1LENS_1KINS_1DIKS1_S2_EEEEE1GE", metadata !8, i64 0}
-!8 = metadata !{metadata !"_ZTSN12_GLOBAL__N_11DIKPi8TreeIterEE", metadata !2, i64 0, metadata !9, i64 8}
-!9 = metadata !{metadata !"_ZTS8TreeIter", metadata !2, i64 8, metadata !10, i64 16}
-!10 = metadata !{metadata !"bool", metadata !3, i64 0}
+!0 = !{!"clang version 3.4"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"any pointer", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !2, i64 8}
+!6 = !{!"_ZTSN12_GLOBAL__N_11RINS_1FIPi8TreeIterN1I1S1LENS_1KINS_1DIKS2_S3_EEEEE1GEPSD_EE", !7, i64 8}
+!7 = !{!"_ZTSN12_GLOBAL__N_11FIPi8TreeIterN1I1S1LENS_1KINS_1DIKS1_S2_EEEEE1GE", !8, i64 0}
+!8 = !{!"_ZTSN12_GLOBAL__N_11DIKPi8TreeIterEE", !2, i64 0, !9, i64 8}
+!9 = !{!"_ZTS8TreeIter", !2, i64 8, !10, i64 16}
+!10 = !{!"bool", !3, i64 0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/aliastest.ll b/test/Analysis/TypeBasedAliasAnalysis/aliastest.ll
index 76a88c8..10da13a 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/aliastest.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/aliastest.ll
@@ -45,23 +45,23 @@ define i8 @test1_no(i8* %a, i8* %b) nounwind {
 }
 
 ; Root note.
-!0 = metadata !{ }
+!0 = !{ }
 ; Some type.
-!1 = metadata !{metadata !7, metadata !7, i64 0}
+!1 = !{!7, !7, i64 0}
 ; Some other non-aliasing type.
-!2 = metadata !{metadata !8, metadata !8, i64 0}
+!2 = !{!8, !8, i64 0}
 
 ; Some type.
-!3 = metadata !{metadata !9, metadata !9, i64 0}
+!3 = !{!9, !9, i64 0}
 ; Some type in a different type system.
-!4 = metadata !{metadata !10, metadata !10, i64 0}
+!4 = !{!10, !10, i64 0}
 
 ; Invariant memory.
-!5 = metadata !{metadata !11, metadata !11, i64 0, i1 1}
+!5 = !{!11, !11, i64 0, i1 1}
 ; Not invariant memory.
-!6 = metadata !{metadata !11, metadata !11, i64 0, i1 0}
-!7 = metadata !{ metadata !"foo", metadata !0 }
-!8 = metadata !{ metadata !"bar", metadata !0 }
-!9 = metadata !{ metadata !"foo", metadata !0 }
-!10 = metadata !{ metadata !"bar", metadata !"different" }
-!11 = metadata !{ metadata !"qux", metadata !0}
+!6 = !{!11, !11, i64 0, i1 0}
+!7 = !{ !"foo", !0 }
+!8 = !{ !"bar", !0 }
+!9 = !{ !"foo", !0 }
+!10 = !{ !"bar", !"different" }
+!11 = !{ !"qux", !0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/argument-promotion.ll b/test/Analysis/TypeBasedAliasAnalysis/argument-promotion.ll
index 14bbeac..31f775e 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/argument-promotion.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/argument-promotion.ll
@@ -32,8 +32,8 @@ define i32 @callercaller(i32* %Q) {
   ret i32 %X
 }
 
-!0 = metadata !{metadata !"test"}
-!1 = metadata !{metadata !3, metadata !3, i64 0}
-!2 = metadata !{metadata !4, metadata !4, i64 0}
-!3 = metadata !{metadata !"green", metadata !0}
-!4 = metadata !{metadata !"blue", metadata !0}
+!0 = !{!"test"}
+!1 = !{!3, !3, i64 0}
+!2 = !{!4, !4, i64 0}
+!3 = !{!"green", !0}
+!4 = !{!"blue", !0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/dse.ll b/test/Analysis/TypeBasedAliasAnalysis/dse.ll
index 9032fad..09f8feb 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/dse.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/dse.ll
@@ -50,23 +50,23 @@ define i8 @test1_no(i8* %a, i8* %b) nounwind {
 }
 
 ; Root note.
-!0 = metadata !{ }
+!0 = !{ }
 ; Some type.
-!1 = metadata !{metadata !7, metadata !7, i64 0}
+!1 = !{!7, !7, i64 0}
 ; Some other non-aliasing type.
-!2 = metadata !{metadata !8, metadata !8, i64 0}
+!2 = !{!8, !8, i64 0}
 
 ; Some type.
-!3 = metadata !{metadata !9, metadata !9, i64 0}
+!3 = !{!9, !9, i64 0}
 ; Some type in a different type system.
-!4 = metadata !{metadata !10, metadata !10, i64 0}
+!4 = !{!10, !10, i64 0}
 
 ; Invariant memory.
-!5 = metadata !{metadata !11, metadata !11, i64 0, i1 1}
+!5 = !{!11, !11, i64 0, i1 1}
 ; Not invariant memory.
-!6 = metadata !{metadata !11, metadata !11, i64 0, i1 0}
-!7 = metadata !{ metadata !"foo", metadata !0 }
-!8 = metadata !{ metadata !"bar", metadata !0 }
-!9 = metadata !{ metadata !"foo", metadata !0 }
-!10 = metadata !{ metadata !"bar", metadata !"different" }
-!11 = metadata !{ metadata !"qux", metadata !0}
+!6 = !{!11, !11, i64 0, i1 0}
+!7 = !{ !"foo", !0 }
+!8 = !{ !"bar", !0 }
+!9 = !{ !"foo", !0 }
+!10 = !{ !"bar", !"different" }
+!11 = !{ !"qux", !0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll b/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll
index 4dc4073..732f5d7 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll
@@ -123,15 +123,15 @@ for.end:                                          ; preds = %for.body
   ret float %tmp10
 }
 
-; CHECK: [[TAG]] = metadata !{metadata [[TYPE_LL:!.*]], metadata [[TYPE_LL]], i64 0}
-; CHECK: [[TYPE_LL]] = metadata !{metadata !"long long", metadata {{!.*}}}
-!0 = metadata !{metadata !6, metadata !6, i64 0}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !7, metadata !7, i64 0}
-!4 = metadata !{metadata !8, metadata !8, i64 0}
-!5 = metadata !{metadata !9, metadata !9, i64 0}
-!6 = metadata !{metadata !"short", metadata !1}
-!7 = metadata !{metadata !"long long", metadata !1}
-!8 = metadata !{metadata !"int", metadata !1}
-!9 = metadata !{metadata !"float", metadata !1}
+; CHECK: [[TAG]] = !{[[TYPE_LL:!.*]], [[TYPE_LL]], i64 0}
+; CHECK: [[TYPE_LL]] = !{!"long long", {{!.*}}}
+!0 = !{!6, !6, i64 0}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA", null}
+!3 = !{!7, !7, i64 0}
+!4 = !{!8, !8, i64 0}
+!5 = !{!9, !9, i64 0}
+!6 = !{!"short", !1}
+!7 = !{!"long long", !1}
+!8 = !{!"int", !1}
+!9 = !{!"float", !1}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll b/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll
index e9fb941..6c9439a 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll
@@ -77,10 +77,10 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) nounwind
 ; CHECK: attributes #2 = { nounwind readonly }
 
 ; Root note.
-!0 = metadata !{ }
+!0 = !{ }
 
 ; Invariant memory.
-!1 = metadata !{metadata !3, metadata !3, i64 0, i1 1 }
+!1 = !{!3, !3, i64 0, i1 1 }
 ; Not invariant memory.
-!2 = metadata !{metadata !3, metadata !3, i64 0, i1 0 }
-!3 = metadata !{ metadata !"foo", metadata !0 }
+!2 = !{!3, !3, i64 0, i1 0 }
+!3 = !{ !"foo", !0 }
diff --git a/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll b/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll
index 90e1abb..edea6d0 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll
@@ -46,12 +46,12 @@ entry:
   br i1 %c, label %if.else, label %if.then
 
 if.then:
-  %t = load i32* %p, !tbaa !4
+  %t = load i32* %p, !tbaa !3
   store i32 %t, i32* %q
   ret void
 
 if.else:
-  %u = load i32* %p, !tbaa !3
+  %u = load i32* %p, !tbaa !4
   store i32 %u, i32* %q
   ret void
 }
@@ -61,11 +61,11 @@ if.else:
 
 ; CHECK: @watch_out_for_another_type_change
 ; CHECK: if.then:
-; CHECK:   %t = load i32* %p
-; CHECK:   store i32 %t, i32* %q
+; CHECK:   store i32 0, i32* %q
 ; CHECK:   ret void
 ; CHECK: if.else:
-; CHECK:   store i32 0, i32* %q
+; CHECK:   %u = load i32* %p
+; CHECK:   store i32 %u, i32* %q
 
 define void @watch_out_for_another_type_change(i1 %c, i32* %p, i32* %p1, i32* %q) nounwind {
 entry:
@@ -74,22 +74,22 @@ entry:
   br i1 %c, label %if.else, label %if.then
 
 if.then:
-  %t = load i32* %p, !tbaa !3
+  %t = load i32* %p, !tbaa !4
   store i32 %t, i32* %q
   ret void
 
 if.else:
-  %u = load i32* %p, !tbaa !4
+  %u = load i32* %p, !tbaa !3
   store i32 %u, i32* %q
   ret void
 }
 
-!0 = metadata !{}
-!1 = metadata !{metadata !5, metadata !5, i64 0}
-!2 = metadata !{metadata !6, metadata !6, i64 0}
-!3 = metadata !{metadata !7, metadata !7, i64 0}
-!4 = metadata !{metadata !8, metadata !8, i64 0}
-!5 = metadata !{metadata !"red", metadata !0}
-!6 = metadata !{metadata !"blu", metadata !0}
-!7 = metadata !{metadata !"outer space"}
-!8 = metadata !{metadata !"brick red", metadata !5}
+!0 = !{}
+!1 = !{!5, !5, i64 0}
+!2 = !{!6, !6, i64 0}
+!3 = !{!7, !7, i64 0}
+!4 = !{!8, !8, i64 0}
+!5 = !{!"red", !0}
+!6 = !{!"blu", !0}
+!7 = !{!"outer space"}
+!8 = !{!"brick red", !5}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
index 93b8e50..0c12cac 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
@@ -25,8 +25,8 @@ declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
 ; CHECK: attributes #0 = { nounwind readonly }
 ; CHECK: attributes [[NUW]] = { nounwind }
 
-!0 = metadata !{metadata !"tbaa root", null}
-!1 = metadata !{metadata !3, metadata !3, i64 0}
-!2 = metadata !{metadata !4, metadata !4, i64 0}
-!3 = metadata !{metadata !"A", metadata !0}
-!4 = metadata !{metadata !"B", metadata !0}
+!0 = !{!"tbaa root", null}
+!1 = !{!3, !3, i64 0}
+!2 = !{!4, !4, i64 0}
+!3 = !{!"A", !0}
+!4 = !{!"B", !0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/licm.ll b/test/Analysis/TypeBasedAliasAnalysis/licm.ll
index e45fc85..0722a2c 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/licm.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/licm.ll
@@ -29,9 +29,9 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
-!0 = metadata !{metadata !"root", null}
-!1 = metadata !{metadata !6, metadata !6, i64 0}
-!2 = metadata !{metadata !7, metadata !7, i64 0}
+!0 = !{!"root", null}
+!1 = !{!6, !6, i64 0}
+!2 = !{!7, !7, i64 0}
 
 ; LICM shouldn't hoist anything here.
 
@@ -56,10 +56,10 @@ loop:
   br label %loop
 }
 
-!3 = metadata !{metadata !"pointer", metadata !8}
-!4 = metadata !{metadata !8, metadata !8, i64 0}
-!5 = metadata !{metadata !9, metadata !9, i64 0}
-!6 = metadata !{metadata !"pointer", metadata !0}
-!7 = metadata !{metadata !"double", metadata !0}
-!8 = metadata !{metadata !"char", metadata !9}
-!9 = metadata !{metadata !"root", null}
+!3 = !{!"pointer", !8}
+!4 = !{!8, !8, i64 0}
+!5 = !{!9, !9, i64 0}
+!6 = !{!"pointer", !0}
+!7 = !{!"double", !0}
+!8 = !{!"char", !9}
+!9 = !{!"root", null}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll b/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll
index cdf7281..9fc9e42 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll
@@ -18,10 +18,10 @@ define void @foo(i8* nocapture %p, i8* nocapture %q, i8* nocapture %s) nounwind
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
 
-; CHECK: [[TAGA]] = metadata !{metadata [[TYPEA:!.*]], metadata [[TYPEA]], i64 0}
-; CHECK: [[TYPEA]] = metadata !{metadata !"A", metadata !{{.*}}}
-!0 = metadata !{metadata !"tbaa root", null}
-!1 = metadata !{metadata !3, metadata !3, i64 0}
-!2 = metadata !{metadata !4, metadata !4, i64 0}
-!3 = metadata !{metadata !"A", metadata !0}
-!4 = metadata !{metadata !"B", metadata !0}
+; CHECK: [[TAGA]] = !{[[TYPEA:!.*]], [[TYPEA]], i64 0}
+; CHECK: [[TYPEA]] = !{!"A", !{{.*}}}
+!0 = !{!"tbaa root", null}
+!1 = !{!3, !3, i64 0}
+!2 = !{!4, !4, i64 0}
+!3 = !{!"A", !0}
+!4 = !{!"B", !0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll b/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll
index a027841..fd05dbe 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll
@@ -97,14 +97,14 @@ declare noalias i8* @_Znwm(i64)
 
 attributes #0 = { nounwind }
 
-!0 = metadata !{metadata !1, metadata !1, i64 0}
-!1 = metadata !{metadata !"int", metadata !2, i64 0}
-!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
-!3 = metadata !{metadata !"Simple C/C++ TBAA"}
-!4 = metadata !{metadata !5, metadata !5, i64 0}
-!5 = metadata !{metadata !"any pointer", metadata !2, i64 0}
-!6 = metadata !{metadata !7, metadata !8, i64 0}
-!7 = metadata !{metadata !"_ZTS3Foo", metadata !8, i64 0}
-!8 = metadata !{metadata !"long", metadata !2, i64 0}
-!9 = metadata !{metadata !10, metadata !5, i64 0}
-!10 = metadata !{metadata !"_ZTS3Bar", metadata !5, i64 0}
+!0 = !{!1, !1, i64 0}
+!1 = !{!"int", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"any pointer", !2, i64 0}
+!6 = !{!7, !8, i64 0}
+!7 = !{!"_ZTS3Foo", !8, i64 0}
+!8 = !{!"long", !2, i64 0}
+!9 = !{!10, !5, i64 0}
+!10 = !{!"_ZTS3Bar", !5, i64 0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/precedence.ll b/test/Analysis/TypeBasedAliasAnalysis/precedence.ll
index b219ef1..0b697b2 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/precedence.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/precedence.ll
@@ -39,12 +39,12 @@ entry:
   ret i64 %tmp3
 }
 
-!0 = metadata !{metadata !2, metadata !2, i64 0}
-!1 = metadata !{metadata !"simple"}
-!2 = metadata !{metadata !"int", metadata !1}
-!3 = metadata !{metadata !6, metadata !6, i64 0}
-!4 = metadata !{metadata !7, metadata !7, i64 0}
-!5 = metadata !{metadata !8, metadata !8, i64 0}
-!6 = metadata !{metadata !"float", metadata !1}
-!7 = metadata !{metadata !"long", metadata !1}
-!8 = metadata !{metadata !"small", metadata !1}
+!0 = !{!2, !2, i64 0}
+!1 = !{!"simple"}
+!2 = !{!"int", !1}
+!3 = !{!6, !6, i64 0}
+!4 = !{!7, !7, i64 0}
+!5 = !{!8, !8, i64 0}
+!6 = !{!"float", !1}
+!7 = !{!"long", !1}
+!8 = !{!"small", !1}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/sink.ll b/test/Analysis/TypeBasedAliasAnalysis/sink.ll
index 726da6c..1a124b8 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/sink.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/sink.ll
@@ -15,10 +15,10 @@ b:
   ret void
 }
 
-; CHECK: [[TAGA]] = metadata !{metadata [[TYPEA:!.*]], metadata [[TYPEA]], i64 0}
-; CHECK: [[TYPEA]] = metadata !{metadata !"A", metadata !{{.*}}}
-!0 = metadata !{metadata !3, metadata !3, i64 0}
-!1 = metadata !{metadata !4, metadata !4, i64 0}
-!2 = metadata !{metadata !"test"}
-!3 = metadata !{metadata !"A", metadata !2}
-!4 = metadata !{metadata !"B", metadata !2}
+; CHECK: [[TAGA]] = !{[[TYPEA:!.*]], [[TYPEA]], i64 0}
+; CHECK: [[TYPEA]] = !{!"A", !{{.*}}}
+!0 = !{!3, !3, i64 0}
+!1 = !{!4, !4, i64 0}
+!2 = !{!"test"}
+!3 = !{!"A", !2}
+!4 = !{!"B", !2}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll b/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
index 38bece7..3c035af 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
@@ -363,30 +363,30 @@ entry:
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
-!0 = metadata !{metadata !1, metadata !1, i64 0}
-!1 = metadata !{metadata !"any pointer", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA"}
-!4 = metadata !{metadata !5, metadata !5, i64 0}
-!5 = metadata !{metadata !"long long", metadata !2}
-!6 = metadata !{metadata !7, metadata !7, i64 0}
-!7 = metadata !{metadata !"int", metadata !2}
-!8 = metadata !{metadata !9, metadata !7, i64 4}
-!9 = metadata !{metadata !"_ZTS7StructA", metadata !10, i64 0, metadata !7, i64 4, metadata !10, i64 8, metadata !7, i64 12}
-!10 = metadata !{metadata !"short", metadata !2}
-!11 = metadata !{metadata !9, metadata !10, i64 0}
-!12 = metadata !{metadata !13, metadata !7, i64 8}
-!13 = metadata !{metadata !"_ZTS7StructB", metadata !10, i64 0, metadata !9, i64 4, metadata !7, i64 20}
-!14 = metadata !{metadata !13, metadata !10, i64 4}
-!15 = metadata !{metadata !13, metadata !7, i64 20}
-!16 = metadata !{metadata !13, metadata !7, i64 16}
-!17 = metadata !{metadata !18, metadata !7, i64 4}
-!18 = metadata !{metadata !"_ZTS7StructS", metadata !10, i64 0, metadata !7, i64 4}
-!19 = metadata !{metadata !18, metadata !10, i64 0}
-!20 = metadata !{metadata !21, metadata !7, i64 4}
-!21 = metadata !{metadata !"_ZTS8StructS2", metadata !10, i64 0, metadata !7, i64 4}
-!22 = metadata !{metadata !21, metadata !10, i64 0}
-!23 = metadata !{metadata !24, metadata !7, i64 12}
-!24 = metadata !{metadata !"_ZTS7StructC", metadata !10, i64 0, metadata !13, i64 4, metadata !7, i64 28}
-!25 = metadata !{metadata !26, metadata !7, i64 12}
-!26 = metadata !{metadata !"_ZTS7StructD", metadata !10, i64 0, metadata !13, i64 4, metadata !7, i64 28, metadata !2, i64 32}
+!0 = !{!1, !1, i64 0}
+!1 = !{!"any pointer", !2}
+!2 = !{!"omnipotent char", !3}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"long long", !2}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"int", !2}
+!8 = !{!9, !7, i64 4}
+!9 = !{!"_ZTS7StructA", !10, i64 0, !7, i64 4, !10, i64 8, !7, i64 12}
+!10 = !{!"short", !2}
+!11 = !{!9, !10, i64 0}
+!12 = !{!13, !7, i64 8}
+!13 = !{!"_ZTS7StructB", !10, i64 0, !9, i64 4, !7, i64 20}
+!14 = !{!13, !10, i64 4}
+!15 = !{!13, !7, i64 20}
+!16 = !{!13, !7, i64 16}
+!17 = !{!18, !7, i64 4}
+!18 = !{!"_ZTS7StructS", !10, i64 0, !7, i64 4}
+!19 = !{!18, !10, i64 0}
+!20 = !{!21, !7, i64 4}
+!21 = !{!"_ZTS8StructS2", !10, i64 0, !7, i64 4}
+!22 = !{!21, !10, i64 0}
+!23 = !{!24, !7, i64 12}
+!24 = !{!"_ZTS7StructC", !10, i64 0, !13, i64 4, !7, i64 28}
+!25 = !{!26, !7, i64 12}
+!26 = !{!"_ZTS7StructD", !10, i64 0, !13, i64 4, !7, i64 28, !2, i64 32}
diff --git a/test/Analysis/ValueTracking/memory-dereferenceable.ll b/test/Analysis/ValueTracking/memory-dereferenceable.ll
new file mode 100644
index 0000000..1c55efc
--- /dev/null
+++ b/test/Analysis/ValueTracking/memory-dereferenceable.ll
@@ -0,0 +1,34 @@
+; RUN: opt -print-memderefs -analyze -S <%s | FileCheck %s
+
+; Uses the print-deref (+ analyze to print) pass to run
+; isDereferenceablePointer() on many load instruction operands
+
+target datalayout = "e"
+
+declare zeroext i1 @return_i1()
+
+@globalstr = global [6 x i8] c"hello\00"
+
+define void @test(i32 addrspace(1)* dereferenceable(8) %dparam) {
+; CHECK: The following are dereferenceable:
+; CHECK: %globalptr
+; CHECK: %alloca
+; CHECK: %dparam
+; CHECK: %relocate
+; CHECK-NOT: %nparam
+entry:
+    %globalptr = getelementptr inbounds [6 x i8]* @globalstr, i32 0, i32 0
+    %load1 = load i8* %globalptr
+    %alloca = alloca i1
+    %load2 = load i1* %alloca
+    %load3 = load i32 addrspace(1)* %dparam
+    %tok = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 addrspace(1)* %dparam)
+    %relocate = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %tok, i32 4, i32 4)
+    %load4 = load i32 addrspace(1)* %relocate
+    %nparam = getelementptr i32 addrspace(1)* %dparam, i32 5
+    %load5 = load i32 addrspace(1)* %nparam
+    ret void
+}
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()*, i32, i32, ...)
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32, i32, i32)
diff --git a/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll b/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll
index 5cb869d..50ad32e 100644
--- a/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll
+++ b/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll
@@ -1,4 +1,4 @@
-; RUN: opt -O3 < %s | llvm-dis | not grep badref
+; RUN: opt -S -O3 < %s | FileCheck %s
 ; RUN: verify-uselistorder %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -12,7 +12,8 @@ target triple = "x86_64-apple-darwin10.2"
 
 define i32 @main() nounwind readonly {
   %diff1 = alloca i64                             ; <i64*> [#uses=2]
-  call void @llvm.dbg.declare(metadata !{i64* %diff1}, metadata !0, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.value(metadata i64 72,
+  call void @llvm.dbg.declare(metadata i64* %diff1, metadata !0, metadata !{!"0x102"})
   store i64 72, i64* %diff1, align 8
   %v1 = load %struct.test** @TestArrayPtr, align 8 ; <%struct.test*> [#uses=1]
   %v2 = ptrtoint %struct.test* %v1 to i64 ; <i64> [#uses=1]
@@ -23,13 +24,16 @@ define i32 @main() nounwind readonly {
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
-!7 = metadata !{metadata !1}
-!6 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 131941)\001\00\000\00\000", metadata !8, metadata !9, metadata !9, metadata !7, null, null} ; [ DW_TAG_compile_unit ]
-!0 = metadata !{metadata !"0x100\00c\002\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
-!1 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\00256\000\001", metadata !8, metadata !2, metadata !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !8, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !6} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"/d/j/debug-test.c", metadata !"/Volumes/Data/b"}
-!9 = metadata !{i32 0}
+!7 = !{!1}
+!6 = !{!"0x11\0012\00clang version 3.0 (trunk 131941)\001\00\000\00\000", !8, !9, !9, !7, null, null} ; [ DW_TAG_compile_unit ]
+!0 = !{!"0x100\00c\002\000", !1, !2, !5} ; [ DW_TAG_auto_variable ]
+!1 = !{!"0x2e\00main\00main\00\001\000\001\000\006\00256\000\001", !8, !2, !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !8} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !8, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !6} ; [ DW_TAG_base_type ]
+!8 = !{!"/d/j/debug-test.c", !"/Volumes/Data/b"}
+!9 = !{i32 0}
+
+!llvm.module.flags = !{!10}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Assembler/ConstantExprNoFold.ll b/test/Assembler/ConstantExprNoFold.ll
index 8d03e7a..83236d5 100644
--- a/test/Assembler/ConstantExprNoFold.ll
+++ b/test/Assembler/ConstantExprNoFold.ll
@@ -31,6 +31,17 @@ target datalayout = "p:32:32"
 @weak.gep = global i32* getelementptr (i32* @weak, i32 1)
 @weak = extern_weak global i32
 
+; An object with weak linkage cannot have it's identity determined at compile time.
+; CHECK: @F = global i1 icmp eq (i32* @weakany, i32* @glob)
+@F = global i1 icmp eq (i32* @weakany, i32* @glob)
+@weakany = weak global i32 0
+
+; Empty globals might end up anywhere, even on top of another global.
+; CHECK: @empty.cmp = global i1 icmp eq ([0 x i8]* @empty.1, [0 x i8]* @empty.2)
+@empty.1 = external global [0 x i8], align 1
+@empty.2 = external global [0 x i8], align 1
+@empty.cmp = global i1 icmp eq ([0 x i8]* @empty.1, [0 x i8]* @empty.2)
+
 ; Don't add an inbounds on @glob.a3, since it's not inbounds.
 ; CHECK: @glob.a3 = alias getelementptr (i32* @glob.a2, i32 1)
 @glob = global i32 0
diff --git a/test/Assembler/alloca-invalid-type-2.ll b/test/Assembler/alloca-invalid-type-2.ll
new file mode 100644
index 0000000..7b1cc62
--- /dev/null
+++ b/test/Assembler/alloca-invalid-type-2.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+; CHECK: invalid type for alloca
+
+define void @test() {
+entry:
+  alloca i32 (i32)
+  ret void
+}
diff --git a/test/Assembler/alloca-invalid-type.ll b/test/Assembler/alloca-invalid-type.ll
new file mode 100644
index 0000000..413bcbd
--- /dev/null
+++ b/test/Assembler/alloca-invalid-type.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+; CHECK: invalid type for alloca
+
+define void @test() {
+entry:
+  alloca metadata !{null}
+  ret void
+}
diff --git a/test/Assembler/call-invalid-1.ll b/test/Assembler/call-invalid-1.ll
new file mode 100644
index 0000000..4a12b14
--- /dev/null
+++ b/test/Assembler/call-invalid-1.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare void @f()
+
+define void @g() {
+  call void @f() align 8
+; CHECK: error: call instructions may not have an alignment
+  ret void
+}
diff --git a/test/Assembler/debug-info.ll b/test/Assembler/debug-info.ll
new file mode 100644
index 0000000..435b892
--- /dev/null
+++ b/test/Assembler/debug-info.ll
@@ -0,0 +1,72 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !0, !1, !2, !3, !4, !5, !6, !7, !8, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !27}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30}
+
+; CHECK:      !0 = !MDSubrange(count: 3)
+; CHECK-NEXT: !1 = !MDSubrange(count: 3, lowerBound: 4)
+; CHECK-NEXT: !2 = !MDSubrange(count: 3, lowerBound: -5)
+!0 = !MDSubrange(count: 3)
+!1 = !MDSubrange(count: 3, lowerBound: 0)
+
+!2 = !MDSubrange(count: 3, lowerBound: 4)
+!3 = !MDSubrange(count: 3, lowerBound: -5)
+
+; CHECK-NEXT: !3 = !MDEnumerator(name: "seven", value: 7)
+; CHECK-NEXT: !4 = !MDEnumerator(name: "negeight", value: -8)
+; CHECK-NEXT: !5 = !MDEnumerator(name: "", value: 0)
+!4 = !MDEnumerator(name: "seven", value: 7)
+!5 = !MDEnumerator(name: "negeight", value: -8)
+!6 = !MDEnumerator(name: "", value: 0)
+
+; CHECK-NEXT: !6 = !MDBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+; CHECK-NEXT: !7 = !MDBasicType(tag: DW_TAG_unspecified_type, name: "decltype(nullptr)")
+; CHECK-NEXT: !8 = !MDBasicType(tag: DW_TAG_base_type)
+!7 = !MDBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+!8 = !MDBasicType(tag: DW_TAG_unspecified_type, name: "decltype(nullptr)")
+!9 = !MDBasicType(tag: DW_TAG_base_type)
+!10 = !MDBasicType(tag: DW_TAG_base_type, name: "", size: 0, align: 0, encoding: 0)
+
+; CHECK-NEXT: !9 = distinct !{}
+; CHECK-NEXT: !10 = !MDFile(filename: "path/to/file", directory: "/path/to/dir")
+; CHECK-NEXT: !11 = distinct !{}
+; CHECK-NEXT: !12 = !MDFile(filename: "", directory: "")
+!11 = distinct !{}
+!12 = !MDFile(filename: "path/to/file", directory: "/path/to/dir")
+!13 = distinct !{}
+!14 = !MDFile(filename: "", directory: "")
+
+; CHECK-NEXT: !13 = !MDDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 32, align: 32)
+!15 = !MDDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 32, align: 32)
+
+; CHECK-NEXT: !14 = !MDCompositeType(tag: DW_TAG_structure_type, name: "MyType", file: !10, line: 2, size: 32, align: 32, identifier: "MangledMyType")
+; CHECK-NEXT: !15 = distinct !MDCompositeType(tag: DW_TAG_structure_type, name: "Base", file: !10, line: 3, scope: !14, size: 128, align: 32, offset: 64, flags: DIFlagPublic, elements: !16, runtimeLang: DW_LANG_C_plus_plus_11, vtableHolder: !15, templateParams: !18, identifier: "MangledBase")
+; CHECK-NEXT: !16 = !{!17}
+; CHECK-NEXT: !17 = !MDDerivedType(tag: DW_TAG_member, name: "field", file: !10, line: 4, scope: !15, baseType: !6, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+; CHECK-NEXT: !18 = !{!6}
+; CHECK-NEXT: !19 = !MDCompositeType(tag: DW_TAG_structure_type, name: "Derived", file: !10, line: 3, scope: !14, baseType: !15, size: 128, align: 32, offset: 64, flags: DIFlagPublic, elements: !20, runtimeLang: DW_LANG_C_plus_plus_11, vtableHolder: !15, templateParams: !18, identifier: "MangledBase")
+; CHECK-NEXT: !20 = !{!21}
+; CHECK-NEXT: !21 = !MDDerivedType(tag: DW_TAG_inheritance, scope: !19, baseType: !15)
+; CHECK-NEXT: !22 = !MDDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !6, size: 32, align: 32, extraData: !15)
+; CHECK-NEXT: !23 = !MDCompositeType(tag: DW_TAG_structure_type)
+; CHECK-NEXT: !24 = !MDCompositeType(tag: DW_TAG_structure_type, runtimeLang: DW_LANG_Cobol85)
+!16 = !MDCompositeType(tag: DW_TAG_structure_type, name: "MyType", file: !12, line: 2, size: 32, align: 32, identifier: "MangledMyType")
+!17 = !MDCompositeType(tag: DW_TAG_structure_type, name: "Base", file: !12, line: 3, scope: !16, size: 128, align: 32, offset: 64, flags: DIFlagPublic, elements: !18, runtimeLang: DW_LANG_C_plus_plus_11, vtableHolder: !17, templateParams: !20, identifier: "MangledBase")
+!18 = !{!19}
+!19 = !MDDerivedType(tag: DW_TAG_member, name: "field", file: !12, line: 4, scope: !17, baseType: !7, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+!20 = !{!7}
+!21 = !MDCompositeType(tag: DW_TAG_structure_type, name: "Derived", file: !12, line: 3, scope: !16, baseType: !17, size: 128, align: 32, offset: 64, flags: DIFlagPublic, elements: !22, runtimeLang: DW_LANG_C_plus_plus_11, vtableHolder: !17, templateParams: !20, identifier: "MangledBase")
+!22 = !{!23}
+!23 = !MDDerivedType(tag: DW_TAG_inheritance, scope: !21, baseType: !17)
+!24 = !MDDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !7, size: 32, align: 32, extraData: !17)
+!25 = !MDCompositeType(tag: DW_TAG_structure_type)
+!26 = !MDCompositeType(tag: DW_TAG_structure_type, runtimeLang: 6)
+
+; !25 = !{!7, !7}
+; !26 = !MDSubroutineType(flags: DIFlagPublic | DIFlagStaticMember, types: !25)
+; !27 = !MDSubroutineType(types: !25)
+!27 = !{!7, !7}
+!28 = !MDSubroutineType(flags: DIFlagPublic | DIFlagStaticMember, types: !27)
+!29 = !MDSubroutineType(flags: 0, types: !27)
+!30 = !MDSubroutineType(types: !27)
diff --git a/test/Assembler/distinct-mdnode.ll b/test/Assembler/distinct-mdnode.ll
new file mode 100644
index 0000000..abd7aea
--- /dev/null
+++ b/test/Assembler/distinct-mdnode.ll
@@ -0,0 +1,28 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10}
+
+!0 = !{}
+!1 = !{}   ; This should merge with !0.
+!2 = !{!0}
+!3 = !{!0} ; This should merge with !2.
+!4 = distinct !{}
+!5 = distinct !{}
+!6 = distinct !{!0}
+!7 = distinct !{!0}
+!8 = distinct !{!8}
+!9 = distinct !{!9}
+!10 = !{!10} ; This should become distinct.
+
+; CHECK: !named = !{!0, !0, !1, !1, !2, !3, !4, !5, !6, !7, !8}
+; CHECK:      !0 = !{}
+; CHECK-NEXT: !1 = !{!0}
+; CHECK-NEXT: !2 = distinct !{}
+; CHECK-NEXT: !3 = distinct !{}
+; CHECK-NEXT: !4 = distinct !{!0}
+; CHECK-NEXT: !5 = distinct !{!0}
+; CHECK-NEXT: !6 = distinct !{!6}
+; CHECK-NEXT: !7 = distinct !{!7}
+; CHECK-NEXT: !8 = distinct !{!8}
+; CHECK-NOT:  !
diff --git a/test/Assembler/drop-debug-info.ll b/test/Assembler/drop-debug-info.ll
new file mode 100644
index 0000000..5109b5e
--- /dev/null
+++ b/test/Assembler/drop-debug-info.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-as < %s -o %t.bc 2>&1 >/dev/null | FileCheck -check-prefix=WARN %s
+; RUN: llvm-dis < %t.bc | FileCheck %s
+; RUN: verify-uselistorder < %t.bc
+
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  ret i32 0, !dbg !12
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9}
+
+!0 = !{!"0x11\0012\00clang version 3.5 (trunk 195495) (llvm/trunk 195495:195504M)\000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/Users/manmanren/llvm_gmail/release/../llvm/tools/clang/test/CodeGen/debug-info-version.c] [DW_LANG_C99]
+!1 = !{!"../llvm/tools/clang/test/CodeGen/debug-info-version.c", !"/Users/manmanren/llvm_gmail/release"}
+!2 = !{i32 0}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\003\000\001\000\006\00256\000\003", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/Users/manmanren/llvm_gmail/release/../llvm/tools/clang/test/CodeGen/debug-info-version.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!12 = !MDLocation(line: 4, scope: !4)
+
+; WARN: warning: ignoring debug info with an invalid version (0)
+; CHECK-NOT: !dbg
+; CHECK-NOT: !llvm.dbg.cu
diff --git a/test/Assembler/extractvalue-no-idx.ll b/test/Assembler/extractvalue-no-idx.ll
new file mode 100644
index 0000000..b313209
--- /dev/null
+++ b/test/Assembler/extractvalue-no-idx.ll
@@ -0,0 +1,8 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+; CHECK: expected index
+
+define void @f1() {
+  extractvalue <{ i32, i32 }> undef, !dbg !0
+  ret void
+}
diff --git a/test/Assembler/functionlocal-metadata.ll b/test/Assembler/functionlocal-metadata.ll
index c46233a..517138d 100644
--- a/test/Assembler/functionlocal-metadata.ll
+++ b/test/Assembler/functionlocal-metadata.ll
@@ -3,32 +3,30 @@
 
 define void @Foo(i32 %a, i32 %b) {
 entry:
-  call void @llvm.dbg.value(metadata !{ i32* %1 }, i64 16, metadata !2, metadata !{metadata !"0x102"})
-; CHECK: call void @llvm.dbg.value(metadata !{i32* %1}, i64 16, metadata ![[ID2:[0-9]+]], metadata {{.*}})
+  call void @llvm.dbg.value(metadata i32* %1, i64 16, metadata !2, metadata !{!"0x102"})
+; CHECK: call void @llvm.dbg.value(metadata i32* %1, i64 16, metadata ![[ID2:[0-9]+]], metadata {{.*}})
   %0 = add i32 %a, 1                              ; <i32> [#uses=1]
   %two = add i32 %b, %0                           ; <i32> [#uses=0]
   %1 = alloca i32                                 ; <i32*> [#uses=1]
 
-  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !{i32* %1}, metadata !{metadata !"0x102"})
-; CHECK: call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !{i32* %1}, metadata {{.*}})
-  call void @llvm.dbg.declare(metadata !{i32 %two}, metadata !{i32 %0}, metadata !{metadata !"0x102"})
-; CHECK: call void @llvm.dbg.declare(metadata !{i32 %two}, metadata !{i32 %0}, metadata {{.*}})
-  call void @llvm.dbg.declare(metadata !{i32 %0}, metadata !{i32* %1, i32 %0}, metadata !{metadata !"0x102"})
-; CHECK: call void @llvm.dbg.declare(metadata !{i32 %0}, metadata !{i32* %1, i32 %0}, metadata {{.*}})
-  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !{i32 %b, i32 %0}, metadata !{metadata !"0x102"})
-; CHECK: call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !{i32 %b, i32 %0}, metadata {{.*}})
-  call void @llvm.dbg.declare(metadata !{i32 %a}, metadata !{i32 %a, metadata !"foo"}, metadata !{metadata !"0x102"})
-; CHECK: call void @llvm.dbg.declare(metadata !{i32 %a}, metadata !{i32 %a, metadata !"foo"}, metadata {{.*}})
-  call void @llvm.dbg.declare(metadata !{i32 %b}, metadata !{metadata !0, i32 %two}, metadata !{metadata !"0x102"})
-; CHECK: call void @llvm.dbg.declare(metadata !{i32 %b}, metadata !{metadata ![[ID0:[0-9]+]], i32 %two}, metadata {{.*}})
-
-  call void @llvm.dbg.value(metadata !{ i32 %a }, i64 0, metadata !1, metadata !{metadata !"0x102"})
-; CHECK: call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata ![[ID1:[0-9]+]], metadata {{.*}})
-  call void @llvm.dbg.value(metadata !{ i32 %0 }, i64 25, metadata !0, metadata !{metadata !"0x102"})
-; CHECK: call void @llvm.dbg.value(metadata !{i32 %0}, i64 25, metadata ![[ID0]], metadata {{.*}})
-  call void @llvm.dbg.value(metadata !{ i32* %1 }, i64 16, metadata !3, metadata !{metadata !"0x102"})
-; CHECK: call void @llvm.dbg.value(metadata !{i32* %1}, i64 16, metadata ![[ID3:[0-9]+]], metadata {{.*}})
-  call void @llvm.dbg.value(metadata !3, i64 12, metadata !2, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.declare(metadata i32* %1, metadata i32* %1, metadata !{!"0x102"})
+; CHECK: call void @llvm.dbg.declare(metadata i32* %1, metadata i32* %1, metadata {{.*}})
+  call void @llvm.dbg.declare(metadata i32 %two, metadata i32 %0, metadata !{!"0x102"})
+; CHECK: call void @llvm.dbg.declare(metadata i32 %two, metadata i32 %0, metadata {{.*}})
+  call void @llvm.dbg.declare(metadata i32* %1, metadata i32 %b, metadata !{!"0x102"})
+; CHECK: call void @llvm.dbg.declare(metadata i32* %1, metadata i32 %b, metadata {{.*}})
+  call void @llvm.dbg.declare(metadata i32 %a, metadata i32 %a, metadata !{!"0x102"})
+; CHECK: call void @llvm.dbg.declare(metadata i32 %a, metadata i32 %a, metadata {{.*}})
+  call void @llvm.dbg.declare(metadata i32 %b, metadata i32 %two, metadata !{!"0x102"})
+; CHECK: call void @llvm.dbg.declare(metadata i32 %b, metadata i32 %two, metadata {{.*}})
+
+  call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !1, metadata !{!"0x102"})
+; CHECK: call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata ![[ID1:[0-9]+]], metadata {{.*}})
+  call void @llvm.dbg.value(metadata i32 %0, i64 25, metadata !0, metadata !{!"0x102"})
+; CHECK: call void @llvm.dbg.value(metadata i32 %0, i64 25, metadata ![[ID0:[0-9]+]], metadata {{.*}})
+  call void @llvm.dbg.value(metadata i32* %1, i64 16, metadata !3, metadata !{!"0x102"})
+; CHECK: call void @llvm.dbg.value(metadata i32* %1, i64 16, metadata ![[ID3:[0-9]+]], metadata {{.*}})
+  call void @llvm.dbg.value(metadata !3, i64 12, metadata !2, metadata !{!"0x102"})
 ; CHECK: call void @llvm.dbg.value(metadata ![[ID3]], i64 12, metadata ![[ID2]], metadata {{.*}})
 
   ret void, !foo !0, !bar !1
@@ -37,11 +35,11 @@ entry:
 
 !llvm.module.flags = !{!4}
 
-!0 = metadata !{i32 662302, i32 26, metadata !1, null}
-!1 = metadata !{i32 4, metadata !"foo"}
-!2 = metadata !{metadata !"bar"}
-!3 = metadata !{metadata !"foo"}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !MDLocation(line: 662302, column: 26, scope: !1)
+!1 = !{i32 4, !"foo"}
+!2 = !{!"bar"}
+!3 = !{!"foo"}
+!4 = !{i32 1, !"Debug Info Version", i32 2}
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
@@ -51,7 +49,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 ; CHECK: !foo = !{![[FOO]]}
 ; CHECK: !bar = !{![[BAR]]}
-; CHECK: ![[ID0]] = metadata !{i32 662302, i32 26, metadata ![[ID1]], null}
-; CHECK: ![[ID1]] = metadata !{i32 4, metadata !"foo"}
-; CHECK: ![[ID2]] = metadata !{metadata !"bar"}
-; CHECK: ![[ID3]] = metadata !{metadata !"foo"}
+; CHECK: ![[ID0]] = !MDLocation(line: 662302, column: 26, scope: ![[ID1]])
+; CHECK: ![[ID1]] = !{i32 4, !"foo"}
+; CHECK: ![[ID2]] = !{!"bar"}
+; CHECK: ![[ID3]] = !{!"foo"}
diff --git a/test/Assembler/generic-debug-node.ll b/test/Assembler/generic-debug-node.ll
new file mode 100644
index 0000000..cb7222b
--- /dev/null
+++ b/test/Assembler/generic-debug-node.ll
@@ -0,0 +1,27 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !1, !1, !2, !2, !2, !2, !3, !4, !2}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9}
+
+; CHECK: !0 = !{}
+!0 = !{}
+
+; CHECK-NEXT: !1 = !GenericDebugNode(tag: DW_TAG_entry_point, header: "some\00header", operands: {!0, !2, !2})
+!1 = !GenericDebugNode(tag: 3, header: "some\00header", operands: {!0, !3, !4})
+!2 = !GenericDebugNode(tag: 3, header: "some\00header", operands: {!{}, !3, !4})
+
+; CHECK-NEXT: !2 = !GenericDebugNode(tag: DW_TAG_entry_point)
+!3 = !GenericDebugNode(tag: 3)
+!4 = !GenericDebugNode(tag: 3, header: "")
+!5 = !GenericDebugNode(tag: 3, operands: {})
+!6 = !GenericDebugNode(tag: 3, header: "", operands: {})
+
+; CHECK-NEXT: !3 = distinct !GenericDebugNode(tag: DW_TAG_entry_point)
+!7 = distinct !GenericDebugNode(tag: 3)
+
+; CHECK-NEXT: !4 = !GenericDebugNode(tag: 65535)
+!8 = !GenericDebugNode(tag: 65535)
+
+; CHECK-NOT: !
+!9 = !GenericDebugNode(tag: DW_TAG_entry_point)
diff --git a/test/Assembler/getelementptr.ll b/test/Assembler/getelementptr.ll
index e938ff4..bd583af 100644
--- a/test/Assembler/getelementptr.ll
+++ b/test/Assembler/getelementptr.ll
@@ -8,6 +8,12 @@
 @C = global i32* getelementptr ([2 x [3 x [5 x [7 x i32]]]]* @A, i64 3, i64 2, i64 0, i64 0, i64 7523)
 ; CHECK: @C = global i32* getelementptr ([2 x [3 x [5 x [7 x i32]]]]* @A, i64 39, i64 1, i64 1, i64 4, i64 5)
 
+; Verify that constant expression GEPs work with i84 indices.
+@D = external global [1 x i32]
+
+@E = global i32* getelementptr inbounds ([1 x i32]* @D, i84 0, i64 1)
+; CHECK: @E = global i32* getelementptr inbounds ([1 x i32]* @D, i84 1, i64 0)
+
 ; Verify that i16 indices work.
 @x = external global {i32, i32}
 @y = global i32* getelementptr ({ i32, i32 }* @x, i16 42, i32 0)
diff --git a/test/Assembler/getelementptr_vec_idx4.ll b/test/Assembler/getelementptr_vec_idx4.ll
new file mode 100644
index 0000000..08fe434
--- /dev/null
+++ b/test/Assembler/getelementptr_vec_idx4.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+; CHECK: getelementptr vector index has a wrong number of elements
+
+global <2 x i32*> getelementptr (<4 x [3 x {i32, i32}]*> zeroinitializer, <2 x i32> <i32 1, i32 2>, <2 x i32> <i32 2, i32 3>, <2 x i32> <i32 1, i32 1>)
diff --git a/test/Assembler/gv-invalid-type.ll b/test/Assembler/gv-invalid-type.ll
new file mode 100644
index 0000000..bde04da
--- /dev/null
+++ b/test/Assembler/gv-invalid-type.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@gv = global metadata undef
+; CHECK: invalid type for global variable
diff --git a/test/Assembler/inalloca.ll b/test/Assembler/inalloca.ll
index a8c47b4..f433066 100644
--- a/test/Assembler/inalloca.ll
+++ b/test/Assembler/inalloca.ll
@@ -13,5 +13,5 @@ entry:
   ret void
 }
 
-!0 = metadata !{i32 662302, null}
+!0 = !{i32 662302, null}
 !foo = !{ !0 }
diff --git a/test/Assembler/insertvalue-invalid-type-1.ll b/test/Assembler/insertvalue-invalid-type-1.ll
new file mode 100644
index 0000000..de9b782
--- /dev/null
+++ b/test/Assembler/insertvalue-invalid-type-1.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+; CHECK: insertvalue operand and field disagree in type: 'i32' instead of 'i64'
+
+define <{ i32 }> @test() {
+  ret <{ i32 }> insertvalue (<{ i64 }> zeroinitializer, i32 4, 0)
+}
diff --git a/test/Assembler/insertvalue-invalid-type.ll b/test/Assembler/insertvalue-invalid-type.ll
new file mode 100644
index 0000000..6be20e5
--- /dev/null
+++ b/test/Assembler/insertvalue-invalid-type.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+; CHECK: insertvalue operand and field disagree in type: 'i8*' instead of 'i32'
+
+define void @test() {
+entry:
+  insertvalue { i32, i32 } undef, i8* null, 0
+  ret void
+}
diff --git a/test/Assembler/invalid-attrgrp.ll b/test/Assembler/invalid-attrgrp.ll
new file mode 100644
index 0000000..bf1961a
--- /dev/null
+++ b/test/Assembler/invalid-attrgrp.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+attributes
+; CHECK: expected attribute group id
diff --git a/test/Assembler/invalid-comdat.ll b/test/Assembler/invalid-comdat.ll
index 987e1e1..7351999 100644
--- a/test/Assembler/invalid-comdat.ll
+++ b/test/Assembler/invalid-comdat.ll
@@ -1,4 +1,4 @@
 ; RUN: not llvm-as < %s 2>&1 | FileCheck %s
 
-@v = global i32 0, comdat $v
+@v = global i32 0, comdat($v)
 ; CHECK: use of undefined comdat '$v'
diff --git a/test/Assembler/invalid-datalayout1.ll b/test/Assembler/invalid-datalayout1.ll
new file mode 100644
index 0000000..d1befdc
--- /dev/null
+++ b/test/Assembler/invalid-datalayout1.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "^"
+; CHECK: Unknown specifier in datalayout string
diff --git a/test/Assembler/invalid-datalayout10.ll b/test/Assembler/invalid-datalayout10.ll
new file mode 100644
index 0000000..9f19688
--- /dev/null
+++ b/test/Assembler/invalid-datalayout10.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "m"
+; CHECK: Expected mangling specifier in datalayout string
diff --git a/test/Assembler/invalid-datalayout11.ll b/test/Assembler/invalid-datalayout11.ll
new file mode 100644
index 0000000..f8fed8f
--- /dev/null
+++ b/test/Assembler/invalid-datalayout11.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "m."
+; CHECK: Unexpected trailing characters after mangling specifier in datalayout string
diff --git a/test/Assembler/invalid-datalayout12.ll b/test/Assembler/invalid-datalayout12.ll
new file mode 100644
index 0000000..d79c196
--- /dev/null
+++ b/test/Assembler/invalid-datalayout12.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "f"
+; CHECK: Missing alignment specification in datalayout string
diff --git a/test/Assembler/invalid-datalayout13.ll b/test/Assembler/invalid-datalayout13.ll
new file mode 100644
index 0000000..5ac719d
--- /dev/null
+++ b/test/Assembler/invalid-datalayout13.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = ":32"
+; CHECK: Expected token before separator in datalayout string
diff --git a/test/Assembler/invalid-datalayout14.ll b/test/Assembler/invalid-datalayout14.ll
new file mode 100644
index 0000000..84634b5
--- /dev/null
+++ b/test/Assembler/invalid-datalayout14.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "i64:64:16"
+; CHECK: Preferred alignment cannot be less than the ABI alignment
diff --git a/test/Assembler/invalid-datalayout15.ll b/test/Assembler/invalid-datalayout15.ll
new file mode 100644
index 0000000..ea240b7
--- /dev/null
+++ b/test/Assembler/invalid-datalayout15.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "i64:16:16777216"
+; CHECK: Invalid preferred alignment, must be a 16bit integer
diff --git a/test/Assembler/invalid-datalayout16.ll b/test/Assembler/invalid-datalayout16.ll
new file mode 100644
index 0000000..0dd1abb
--- /dev/null
+++ b/test/Assembler/invalid-datalayout16.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "i64:16777216:16777216"
+; CHECK: Invalid ABI alignment, must be a 16bit integer
diff --git a/test/Assembler/invalid-datalayout17.ll b/test/Assembler/invalid-datalayout17.ll
new file mode 100644
index 0000000..519f5c1
--- /dev/null
+++ b/test/Assembler/invalid-datalayout17.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "i16777216:16:16"
+; CHECK: Invalid bit width, must be a 24bit integer
diff --git a/test/Assembler/invalid-datalayout18.ll b/test/Assembler/invalid-datalayout18.ll
new file mode 100644
index 0000000..b9956f9
--- /dev/null
+++ b/test/Assembler/invalid-datalayout18.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "p:32:32:16"
+; CHECK: Preferred alignment cannot be less than the ABI alignment
diff --git a/test/Assembler/invalid-datalayout2.ll b/test/Assembler/invalid-datalayout2.ll
new file mode 100644
index 0000000..a435612
--- /dev/null
+++ b/test/Assembler/invalid-datalayout2.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "m:v"
+; CHECK: Unknown mangling in datalayout string
diff --git a/test/Assembler/invalid-datalayout3.ll b/test/Assembler/invalid-datalayout3.ll
new file mode 100644
index 0000000..44535fd
--- /dev/null
+++ b/test/Assembler/invalid-datalayout3.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "n0"
+; CHECK: Zero width native integer type in datalayout string
diff --git a/test/Assembler/invalid-datalayout4.ll b/test/Assembler/invalid-datalayout4.ll
new file mode 100644
index 0000000..2d946d3
--- /dev/null
+++ b/test/Assembler/invalid-datalayout4.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "p16777216:64:64:64"
+; CHECK: Invalid address space, must be a 24bit integer
diff --git a/test/Assembler/invalid-datalayout5.ll b/test/Assembler/invalid-datalayout5.ll
new file mode 100644
index 0000000..3ce8791
--- /dev/null
+++ b/test/Assembler/invalid-datalayout5.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "a1:64"
+; CHECK: Sized aggregate specification in datalayout string
diff --git a/test/Assembler/invalid-datalayout6.ll b/test/Assembler/invalid-datalayout6.ll
new file mode 100644
index 0000000..425099f
--- /dev/null
+++ b/test/Assembler/invalid-datalayout6.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "a:"
+; CHECK: Trailing separator in datalayout string
diff --git a/test/Assembler/invalid-datalayout7.ll b/test/Assembler/invalid-datalayout7.ll
new file mode 100644
index 0000000..097227a
--- /dev/null
+++ b/test/Assembler/invalid-datalayout7.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "p:52"
+; CHECK: number of bits must be a byte width multiple
diff --git a/test/Assembler/invalid-datalayout8.ll b/test/Assembler/invalid-datalayout8.ll
new file mode 100644
index 0000000..28832ff
--- /dev/null
+++ b/test/Assembler/invalid-datalayout8.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "e-p"
+; CHECK: Missing size specification for pointer in datalayout string
diff --git a/test/Assembler/invalid-datalayout9.ll b/test/Assembler/invalid-datalayout9.ll
new file mode 100644
index 0000000..dfeac65
--- /dev/null
+++ b/test/Assembler/invalid-datalayout9.ll
@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+target datalayout = "e-p:64"
+; CHECK: Missing alignment specification for pointer in datalayout string
diff --git a/test/Assembler/invalid-debug-info-version.ll b/test/Assembler/invalid-debug-info-version.ll
new file mode 100644
index 0000000..94a8153
--- /dev/null
+++ b/test/Assembler/invalid-debug-info-version.ll
@@ -0,0 +1,5 @@
+; RUN: opt < %s -S | FileCheck %s
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"Debug Info Version", !""}
+; CHECK: !{i32 1, !"Debug Info Version", !""}
diff --git a/test/Assembler/invalid-fwdref2.ll b/test/Assembler/invalid-fwdref2.ll
new file mode 100644
index 0000000..d823481
--- /dev/null
+++ b/test/Assembler/invalid-fwdref2.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as %s -disable-output 2>&1 | grep "forward reference and definition of global have different types"
+
+@a2 = alias void ()* @g2
+@g2 = internal global i8 42
diff --git a/test/Assembler/invalid-generic-debug-node-tag-bad.ll b/test/Assembler/invalid-generic-debug-node-tag-bad.ll
new file mode 100644
index 0000000..98c10b6
--- /dev/null
+++ b/test/Assembler/invalid-generic-debug-node-tag-bad.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:29: error: invalid DWARF tag 'DW_TAG_badtag'
+!0 = !GenericDebugNode(tag: DW_TAG_badtag)
diff --git a/test/Assembler/invalid-generic-debug-node-tag-missing.ll b/test/Assembler/invalid-generic-debug-node-tag-missing.ll
new file mode 100644
index 0000000..5372cbb
--- /dev/null
+++ b/test/Assembler/invalid-generic-debug-node-tag-missing.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:47: error: missing required field 'tag'
+!0 = !GenericDebugNode(header: "some\00header")
diff --git a/test/Assembler/invalid-generic-debug-node-tag-overflow.ll b/test/Assembler/invalid-generic-debug-node-tag-overflow.ll
new file mode 100644
index 0000000..1722caa
--- /dev/null
+++ b/test/Assembler/invalid-generic-debug-node-tag-overflow.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK-NOT: error
+!0 = !GenericDebugNode(tag: 65535)
+
+; CHECK: <stdin>:[[@LINE+1]]:29: error: value for 'tag' too large, limit is 65535
+!1 = !GenericDebugNode(tag: 65536)
diff --git a/test/Assembler/invalid-generic-debug-node-tag-wrong-type.ll b/test/Assembler/invalid-generic-debug-node-tag-wrong-type.ll
new file mode 100644
index 0000000..fca24a7
--- /dev/null
+++ b/test/Assembler/invalid-generic-debug-node-tag-wrong-type.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:29: error: expected DWARF tag
+!0 = !GenericDebugNode(tag: "string")
diff --git a/test/Assembler/invalid-hexint.ll b/test/Assembler/invalid-hexint.ll
new file mode 100644
index 0000000..a11b8cd
--- /dev/null
+++ b/test/Assembler/invalid-hexint.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+global i64 u0x0p001
+; CHECK: expected value token
diff --git a/test/Assembler/invalid-mdbasictype-missing-tag.ll b/test/Assembler/invalid-mdbasictype-missing-tag.ll
new file mode 100644
index 0000000..4b3823d
--- /dev/null
+++ b/test/Assembler/invalid-mdbasictype-missing-tag.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:31: error: missing required field 'tag'
+!0 = !MDBasicType(name: "name")
diff --git a/test/Assembler/invalid-mdcompileunit-language-bad.ll b/test/Assembler/invalid-mdcompileunit-language-bad.ll
new file mode 100644
index 0000000..cf2da20
--- /dev/null
+++ b/test/Assembler/invalid-mdcompileunit-language-bad.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:31: error: invalid DWARF language 'DW_LANG_NoSuchLanguage'
+!0 = !MDCompileUnit(language: DW_LANG_NoSuchLanguage,
+                    file: !MDFile(filename: "a", directory: "b"))
diff --git a/test/Assembler/invalid-mdcompileunit-language-overflow.ll b/test/Assembler/invalid-mdcompileunit-language-overflow.ll
new file mode 100644
index 0000000..14dab17
--- /dev/null
+++ b/test/Assembler/invalid-mdcompileunit-language-overflow.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK-NOT: error
+!0 = !MDCompileUnit(language: 65535,
+                    file: !MDFile(filename: "a", directory: "b"))
+
+; CHECK: <stdin>:[[@LINE+1]]:31: error: value for 'language' too large, limit is 65535
+!1 = !MDCompileUnit(language: 65536,
+                    file: !MDFile(filename: "a", directory: "b"))
diff --git a/test/Assembler/invalid-mdcompileunit-missing-language.ll b/test/Assembler/invalid-mdcompileunit-missing-language.ll
new file mode 100644
index 0000000..57a9a3e
--- /dev/null
+++ b/test/Assembler/invalid-mdcompileunit-missing-language.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:65: error: missing required field 'language'
+!0 = !MDCompileUnit(file: !MDFile(filename: "a", directory: "b"))
diff --git a/test/Assembler/invalid-mdcompositetype-missing-tag.ll b/test/Assembler/invalid-mdcompositetype-missing-tag.ll
new file mode 100644
index 0000000..a3b1418
--- /dev/null
+++ b/test/Assembler/invalid-mdcompositetype-missing-tag.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:36: error: missing required field 'tag'
+!25 = !MDCompositeType(name: "Type")
diff --git a/test/Assembler/invalid-mdderivedtype-missing-basetype.ll b/test/Assembler/invalid-mdderivedtype-missing-basetype.ll
new file mode 100644
index 0000000..24fa585
--- /dev/null
+++ b/test/Assembler/invalid-mdderivedtype-missing-basetype.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:45: error: missing required field 'baseType'
+!0 = !MDDerivedType(tag: DW_TAG_pointer_type)
diff --git a/test/Assembler/invalid-mdderivedtype-missing-tag.ll b/test/Assembler/invalid-mdderivedtype-missing-tag.ll
new file mode 100644
index 0000000..3620628
--- /dev/null
+++ b/test/Assembler/invalid-mdderivedtype-missing-tag.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:34: error: missing required field 'tag'
+!0 = !MDDerivedType(baseType: !{})
diff --git a/test/Assembler/invalid-mdenumerator-missing-name.ll b/test/Assembler/invalid-mdenumerator-missing-name.ll
new file mode 100644
index 0000000..709c6a5
--- /dev/null
+++ b/test/Assembler/invalid-mdenumerator-missing-name.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:28: error: missing required field 'name'
+!0 = !MDEnumerator(value: 7)
diff --git a/test/Assembler/invalid-mdenumerator-missing-value.ll b/test/Assembler/invalid-mdenumerator-missing-value.ll
new file mode 100644
index 0000000..a850168
--- /dev/null
+++ b/test/Assembler/invalid-mdenumerator-missing-value.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:32: error: missing required field 'value'
+!0 = !MDEnumerator(name: "name")
diff --git a/test/Assembler/invalid-mdexpression-large.ll b/test/Assembler/invalid-mdexpression-large.ll
new file mode 100644
index 0000000..43b8ce0
--- /dev/null
+++ b/test/Assembler/invalid-mdexpression-large.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK-NOT: error
+!0 = !MDExpression(18446744073709551615)
+
+; CHECK: <stdin>:[[@LINE+1]]:20: error: element too large, limit is 18446744073709551615
+!1 = !MDExpression(18446744073709551616)
diff --git a/test/Assembler/invalid-mdexpression-verify.ll b/test/Assembler/invalid-mdexpression-verify.ll
new file mode 100644
index 0000000..e573ef3
--- /dev/null
+++ b/test/Assembler/invalid-mdexpression-verify.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as -disable-output < %s 2>&1 | FileCheck -check-prefix VERIFY %s
+; RUN: llvm-as -disable-verify < %s | llvm-dis | FileCheck -check-prefix NOVERIFY %s
+
+; NOVERIFY: !named = !{!0}
+!named = !{!0}
+
+; NOVERIFY: !0 = !MDExpression(0, 1, 9, 7, 2)
+; VERIFY: assembly parsed, but does not verify
+!0 = !MDExpression(0, 1, 9, 7, 2)
diff --git a/test/Assembler/invalid-mdfile-missing-directory.ll b/test/Assembler/invalid-mdfile-missing-directory.ll
new file mode 100644
index 0000000..825db08
--- /dev/null
+++ b/test/Assembler/invalid-mdfile-missing-directory.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:30: error: missing required field 'directory'
+!0 = !MDFile(filename: "file")
diff --git a/test/Assembler/invalid-mdfile-missing-filename.ll b/test/Assembler/invalid-mdfile-missing-filename.ll
new file mode 100644
index 0000000..0dd7117
--- /dev/null
+++ b/test/Assembler/invalid-mdfile-missing-filename.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:30: error: missing required field 'filename'
+!0 = !MDFile(directory: "dir")
diff --git a/test/Assembler/invalid-mdglobalvariable-missing-name.ll b/test/Assembler/invalid-mdglobalvariable-missing-name.ll
new file mode 100644
index 0000000..bc0f724
--- /dev/null
+++ b/test/Assembler/invalid-mdglobalvariable-missing-name.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:42: error: missing required field 'name'
+!0 = !MDGlobalVariable(linkageName: "foo")
diff --git a/test/Assembler/invalid-mdimportedentity-missing-parent.ll b/test/Assembler/invalid-mdimportedentity-missing-parent.ll
new file mode 100644
index 0000000..710a027
--- /dev/null
+++ b/test/Assembler/invalid-mdimportedentity-missing-parent.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:51: error: missing required field 'scope'
+!3 = !MDImportedEntity(tag: DW_TAG_imported_module)
diff --git a/test/Assembler/invalid-mdimportedentity-missing-tag.ll b/test/Assembler/invalid-mdimportedentity-missing-tag.ll
new file mode 100644
index 0000000..63cf0d2
--- /dev/null
+++ b/test/Assembler/invalid-mdimportedentity-missing-tag.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:33: error: missing required field 'tag'
+!3 = !MDImportedEntity(scope: !0)
diff --git a/test/Assembler/invalid-mdlexicalblock-missing-parent.ll b/test/Assembler/invalid-mdlexicalblock-missing-parent.ll
new file mode 100644
index 0000000..cdd12af
--- /dev/null
+++ b/test/Assembler/invalid-mdlexicalblock-missing-parent.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:29: error: missing required field 'scope'
+!0 = !MDLexicalBlock(line: 7)
diff --git a/test/Assembler/invalid-mdlexicalblockfile-missing-discriminator.ll b/test/Assembler/invalid-mdlexicalblockfile-missing-discriminator.ll
new file mode 100644
index 0000000..b71eed8
--- /dev/null
+++ b/test/Assembler/invalid-mdlexicalblockfile-missing-discriminator.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:36: error: missing required field 'discriminator'
+!0 = !MDLexicalBlockFile(scope: !{})
diff --git a/test/Assembler/invalid-mdlexicalblockfile-missing-parent.ll b/test/Assembler/invalid-mdlexicalblockfile-missing-parent.ll
new file mode 100644
index 0000000..1c901e2
--- /dev/null
+++ b/test/Assembler/invalid-mdlexicalblockfile-missing-parent.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:42: error: missing required field 'scope'
+!0 = !MDLexicalBlockFile(discriminator: 0)
diff --git a/test/Assembler/invalid-mdlocalvariable-missing-name.ll b/test/Assembler/invalid-mdlocalvariable-missing-name.ll
new file mode 100644
index 0000000..5b23600
--- /dev/null
+++ b/test/Assembler/invalid-mdlocalvariable-missing-name.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:29: error: missing required field 'tag'
+!0 = !MDLocalVariable(arg: 7)
diff --git a/test/Assembler/invalid-mdlocation-field-bad.ll b/test/Assembler/invalid-mdlocation-field-bad.ll
new file mode 100644
index 0000000..6ec7c64
--- /dev/null
+++ b/test/Assembler/invalid-mdlocation-field-bad.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:18: error: invalid field 'bad'
+!0 = !MDLocation(bad: 0)
diff --git a/test/Assembler/invalid-mdlocation-field-twice.ll b/test/Assembler/invalid-mdlocation-field-twice.ll
new file mode 100644
index 0000000..2335c93
--- /dev/null
+++ b/test/Assembler/invalid-mdlocation-field-twice.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+!0 = !{}
+
+; CHECK: <stdin>:[[@LINE+1]]:38: error: field 'line' cannot be specified more than once
+!1 = !MDLocation(line: 3, scope: !0, line: 3)
diff --git a/test/Assembler/invalid-mdlocation-missing-scope-2.ll b/test/Assembler/invalid-mdlocation-missing-scope-2.ll
new file mode 100644
index 0000000..3b267c9
--- /dev/null
+++ b/test/Assembler/invalid-mdlocation-missing-scope-2.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:25: error: missing required field 'scope'
+!0 = !MDLocation(line: 7)
diff --git a/test/Assembler/invalid-mdlocation-missing-scope.ll b/test/Assembler/invalid-mdlocation-missing-scope.ll
new file mode 100644
index 0000000..87f41b3
--- /dev/null
+++ b/test/Assembler/invalid-mdlocation-missing-scope.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:18: error: missing required field 'scope'
+!0 = !MDLocation()
diff --git a/test/Assembler/invalid-mdlocation-overflow-column.ll b/test/Assembler/invalid-mdlocation-overflow-column.ll
new file mode 100644
index 0000000..92ea661
--- /dev/null
+++ b/test/Assembler/invalid-mdlocation-overflow-column.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+!0 = !{}
+
+; CHECK-NOT: error
+!1 = !MDLocation(column: 65535, scope: !0)
+
+; CHECK: <stdin>:[[@LINE+1]]:26: error: value for 'column' too large, limit is 65535
+!2 = !MDLocation(column: 65536, scope: !0)
diff --git a/test/Assembler/invalid-mdlocation-overflow-line.ll b/test/Assembler/invalid-mdlocation-overflow-line.ll
new file mode 100644
index 0000000..535b4c9
--- /dev/null
+++ b/test/Assembler/invalid-mdlocation-overflow-line.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+!0 = !{}
+
+; CHECK-NOT: error
+!1 = !MDLocation(line: 4294967295, scope: !0)
+
+; CHECK: <stdin>:[[@LINE+1]]:24: error: value for 'line' too large, limit is 4294967295
+!2 = !MDLocation(line: 4294967296, scope: !0)
diff --git a/test/Assembler/invalid-mdnamespace-missing-namespace.ll b/test/Assembler/invalid-mdnamespace-missing-namespace.ll
new file mode 100644
index 0000000..da2f511
--- /dev/null
+++ b/test/Assembler/invalid-mdnamespace-missing-namespace.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:36: error: missing required field 'scope'
+!0 = !MDNamespace(name: "Namespace")
diff --git a/test/Assembler/invalid-mdnode-badref.ll b/test/Assembler/invalid-mdnode-badref.ll
new file mode 100644
index 0000000..cfa03e0
--- /dev/null
+++ b/test/Assembler/invalid-mdnode-badref.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+!named = !{!0}
+
+; CHECK: [[@LINE+1]]:14: error: use of undefined metadata '!1'
+!0 = !{!0, !1}
diff --git a/test/Assembler/invalid-mdnode-vector.ll b/test/Assembler/invalid-mdnode-vector.ll
new file mode 100644
index 0000000..65231c0
--- /dev/null
+++ b/test/Assembler/invalid-mdnode-vector.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+!0 = !
+; CHECK: expected '{' here
diff --git a/test/Assembler/invalid-mdnode-vector2.ll b/test/Assembler/invalid-mdnode-vector2.ll
new file mode 100644
index 0000000..f800795
--- /dev/null
+++ b/test/Assembler/invalid-mdnode-vector2.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+!0 = !{
+; CHECK: expected metadata operand
diff --git a/test/Assembler/invalid-mdobjcproperty-missing-name.ll b/test/Assembler/invalid-mdobjcproperty-missing-name.ll
new file mode 100644
index 0000000..b55cfa8
--- /dev/null
+++ b/test/Assembler/invalid-mdobjcproperty-missing-name.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:38: error: missing required field 'name'
+!0 = !MDObjCProperty(setter: "setFoo")
diff --git a/test/Assembler/invalid-mdsubprogram-missing-name.ll b/test/Assembler/invalid-mdsubprogram-missing-name.ll
new file mode 100644
index 0000000..54ded22
--- /dev/null
+++ b/test/Assembler/invalid-mdsubprogram-missing-name.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:38: error: missing required field 'name'
+!0 = !MDSubprogram(linkageName: "foo")
diff --git a/test/Assembler/invalid-mdsubrange-count-large.ll b/test/Assembler/invalid-mdsubrange-count-large.ll
new file mode 100644
index 0000000..0d150aa
--- /dev/null
+++ b/test/Assembler/invalid-mdsubrange-count-large.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK-NOT: error
+!0 = !MDSubrange(count: 9223372036854775807)
+
+; CHECK: <stdin>:[[@LINE+1]]:25: error: value for 'count' too large, limit is 9223372036854775807
+!1 = !MDSubrange(count: 9223372036854775808)
diff --git a/test/Assembler/invalid-mdsubrange-count-missing.ll b/test/Assembler/invalid-mdsubrange-count-missing.ll
new file mode 100644
index 0000000..bf9cb9a
--- /dev/null
+++ b/test/Assembler/invalid-mdsubrange-count-missing.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:32: error: missing required field 'count'
+!0 = !MDSubrange(lowerBound: -3)
diff --git a/test/Assembler/invalid-mdsubrange-count-negative.ll b/test/Assembler/invalid-mdsubrange-count-negative.ll
new file mode 100644
index 0000000..92c0b4e
--- /dev/null
+++ b/test/Assembler/invalid-mdsubrange-count-negative.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK-NOT: error
+!0 = !MDSubrange(count: -1)
+
+; CHECK: <stdin>:[[@LINE+1]]:25: error: value for 'count' too small, limit is -1
+!0 = !MDSubrange(count: -2)
diff --git a/test/Assembler/invalid-mdsubrange-lowerBound-max.ll b/test/Assembler/invalid-mdsubrange-lowerBound-max.ll
new file mode 100644
index 0000000..1c68e98
--- /dev/null
+++ b/test/Assembler/invalid-mdsubrange-lowerBound-max.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:41: error: value for 'lowerBound' too large, limit is 9223372036854775807
+!0 = !MDSubrange(count: 30, lowerBound: 9223372036854775808)
diff --git a/test/Assembler/invalid-mdsubrange-lowerBound-min.ll b/test/Assembler/invalid-mdsubrange-lowerBound-min.ll
new file mode 100644
index 0000000..b3b2a03
--- /dev/null
+++ b/test/Assembler/invalid-mdsubrange-lowerBound-min.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:41: error: value for 'lowerBound' too small, limit is -9223372036854775808
+!0 = !MDSubrange(count: 30, lowerBound: -9223372036854775809)
diff --git a/test/Assembler/invalid-mdsubroutinetype-missing-types.ll b/test/Assembler/invalid-mdsubroutinetype-missing-types.ll
new file mode 100644
index 0000000..7342417
--- /dev/null
+++ b/test/Assembler/invalid-mdsubroutinetype-missing-types.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:65: error: missing required field 'types'
+!29 = !MDSubroutineType(flags: DIFlagPublic | DIFlagStaticMember)
diff --git a/test/Assembler/invalid-mdtemplatetypeparameter-missing-type.ll b/test/Assembler/invalid-mdtemplatetypeparameter-missing-type.ll
new file mode 100644
index 0000000..62bee70
--- /dev/null
+++ b/test/Assembler/invalid-mdtemplatetypeparameter-missing-type.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:44: error: missing required field 'type'
+!0 = !MDTemplateTypeParameter(name: "param")
diff --git a/test/Assembler/invalid-mdtemplatevalueparameter-missing-tag.ll b/test/Assembler/invalid-mdtemplatevalueparameter-missing-tag.ll
new file mode 100644
index 0000000..fea218c
--- /dev/null
+++ b/test/Assembler/invalid-mdtemplatevalueparameter-missing-tag.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+1]]:55: error: missing required field 'tag'
+!0 = !MDTemplateValueParameter(type: !{}, value: i32 7)
diff --git a/test/Assembler/invalid-mdtemplatevalueparameter-missing-type.ll b/test/Assembler/invalid-mdtemplatevalueparameter-missing-type.ll
new file mode 100644
index 0000000..8ea3acc
--- /dev/null
+++ b/test/Assembler/invalid-mdtemplatevalueparameter-missing-type.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+2]]:44: error: missing required field 'type'
+!0 = !MDTemplateValueParameter(tag: DW_TAG_template_value_parameter,
+                               value: i32 7)
diff --git a/test/Assembler/invalid-mdtemplatevalueparameter-missing-value.ll b/test/Assembler/invalid-mdtemplatevalueparameter-missing-value.ll
new file mode 100644
index 0000000..e1e3f81
--- /dev/null
+++ b/test/Assembler/invalid-mdtemplatevalueparameter-missing-value.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: [[@LINE+2]]:41: error: missing required field 'value'
+!0 = !MDTemplateValueParameter(tag: DW_TAG_template_value_parameter,
+                               type: !{})
diff --git a/test/Assembler/invalid-metadata-attachment-has-type.ll b/test/Assembler/invalid-metadata-attachment-has-type.ll
new file mode 100644
index 0000000..74e4151
--- /dev/null
+++ b/test/Assembler/invalid-metadata-attachment-has-type.ll
@@ -0,0 +1,8 @@
+; RUN: not llvm-as -disable-output < %s 2>&1 | FileCheck %s
+; Check common error from old format.
+
+define void @foo() {
+; CHECK: {{.*}}:[[@LINE+1]]:{{[0-9]+}}: error: invalid metadata-value-metadata roundtrip
+  ret void, !bar !{metadata !0}
+}
+!0 = !{}
diff --git a/test/Assembler/invalid-metadata-function-local-attachments.ll b/test/Assembler/invalid-metadata-function-local-attachments.ll
new file mode 100644
index 0000000..20cb0a9
--- /dev/null
+++ b/test/Assembler/invalid-metadata-function-local-attachments.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+define void @foo(i32 %v) {
+entry:
+; CHECK: <stdin>:[[@LINE+1]]:{{[0-9]+}}: error: invalid use of function-local name
+  ret void, !foo !{i32 %v}
+}
diff --git a/test/Assembler/invalid-metadata-function-local-complex-1.ll b/test/Assembler/invalid-metadata-function-local-complex-1.ll
new file mode 100644
index 0000000..be1d16d
--- /dev/null
+++ b/test/Assembler/invalid-metadata-function-local-complex-1.ll
@@ -0,0 +1,10 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+define void @foo(i32 %v) {
+entry:
+; CHECK: <stdin>:[[@LINE+1]]:{{[0-9]+}}: error: invalid use of function-local name
+  call void @llvm.bar(metadata !{i32 %v, i32 0})
+  ret void
+}
+
+declare void @llvm.bar(metadata)
diff --git a/test/Assembler/invalid-metadata-function-local-complex-2.ll b/test/Assembler/invalid-metadata-function-local-complex-2.ll
new file mode 100644
index 0000000..72fa41a
--- /dev/null
+++ b/test/Assembler/invalid-metadata-function-local-complex-2.ll
@@ -0,0 +1,10 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+define void @foo(i32 %v) {
+entry:
+; CHECK: <stdin>:[[@LINE+1]]:{{[0-9]+}}: error: invalid use of function-local name
+  call void @llvm.bar(metadata !{i32 0, i32 %v})
+  ret void
+}
+
+declare void @llvm.bar(metadata)
diff --git a/test/Assembler/invalid-metadata-function-local-complex-3.ll b/test/Assembler/invalid-metadata-function-local-complex-3.ll
new file mode 100644
index 0000000..35ec763
--- /dev/null
+++ b/test/Assembler/invalid-metadata-function-local-complex-3.ll
@@ -0,0 +1,10 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+define void @foo(i32 %v) {
+entry:
+; CHECK: <stdin>:[[@LINE+1]]:{{[0-9]+}}: error: invalid use of function-local name
+  call void @llvm.bar(metadata !{i32 %v})
+  ret void
+}
+
+declare void @llvm.bar(metadata)
diff --git a/test/Assembler/invalid-metadata-has-type.ll b/test/Assembler/invalid-metadata-has-type.ll
new file mode 100644
index 0000000..647cb67
--- /dev/null
+++ b/test/Assembler/invalid-metadata-has-type.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as -disable-output < %s 2>&1 | FileCheck %s
+; Check common error from old format.
+
+; CHECK: {{.*}}:[[@LINE+1]]:{{[0-9]+}}: error: unexpected type in metadata definition
+!0 = metadata !{}
diff --git a/test/Assembler/invalid-name.ll b/test/Assembler/invalid-name.ll
index d9d7a11..0681ea5 100644
--- a/test/Assembler/invalid-name.ll
+++ b/test/Assembler/invalid-name.ll
diff --git a/test/Assembler/invalid-name2.ll b/test/Assembler/invalid-name2.ll
new file mode 100644
index 0000000..384dee6
--- /dev/null
+++ b/test/Assembler/invalid-name2.ll
diff --git a/test/Assembler/invalid-specialized-mdnode.ll b/test/Assembler/invalid-specialized-mdnode.ll
new file mode 100644
index 0000000..9a84abb
--- /dev/null
+++ b/test/Assembler/invalid-specialized-mdnode.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:6: error: expected metadata type
+!0 = !Invalid(field: 0)
diff --git a/test/Assembler/invalid_cast4.ll b/test/Assembler/invalid_cast4.ll
new file mode 100644
index 0000000..7056f84
--- /dev/null
+++ b/test/Assembler/invalid_cast4.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+; CHECK: invalid cast opcode for cast from 'i64' to 'i64'
+global i64* inttoptr (i64 0 to i64)
diff --git a/test/Assembler/large-comdat.ll b/test/Assembler/large-comdat.ll
new file mode 100644
index 0000000..cab164f
--- /dev/null
+++ b/test/Assembler/large-comdat.ll
@@ -0,0 +1,9 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+$_ZSt9make_pairIN4llvm16DenseMapIteratorINS0_14PointerIntPairIPKNS0_5ValueELj1ENS0_21PointerLikeTypeTraitsIS5_EEEENS0_19NonLocalPointerInfoENS0_12DenseMapInfoIS8_EENS0_12DenseMapPairIS9_EEEEbESt4pairINSt17__decay_and_stripIT_E6__typeENSG_IT0_E6__typeEESH_SK_ = comdat any
+
+; CHECK: $_ZSt9make_pairIN4llvm16DenseMapIteratorINS0_14PointerIntPairIPKNS0_5ValueELj1ENS0_21PointerLikeTypeTraitsIS5_EEEENS0_19NonLocalPointerInfoENS0_12DenseMapInfoIS8_EENS0_12DenseMapPairIS9_EEEEbESt4pairINSt17__decay_and_stripIT_E6__typeENSG_IT0_E6__typeEESH_SK_ = comdat any
+
+define void @_ZSt9make_pairIN4llvm16DenseMapIteratorINS0_14PointerIntPairIPKNS0_5ValueELj1ENS0_21PointerLikeTypeTraitsIS5_EEEENS0_19NonLocalPointerInfoENS0_12DenseMapInfoIS8_EENS0_12DenseMapPairIS9_EEEEbESt4pairINSt17__decay_and_stripIT_E6__typeENSG_IT0_E6__typeEESH_SK_() comdat {
+  ret void
+}
diff --git a/test/Assembler/mdcompileunit.ll b/test/Assembler/mdcompileunit.ll
new file mode 100644
index 0000000..ce00523
--- /dev/null
+++ b/test/Assembler/mdcompileunit.ll
@@ -0,0 +1,31 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !7, !8, !8}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10}
+
+!0 = distinct !{}
+!1 = !MDFile(filename: "path/to/file", directory: "/path/to/dir")
+!2 = distinct !{}
+!3 = distinct !{}
+!4 = distinct !{}
+!5 = distinct !{}
+!6 = distinct !{}
+
+; CHECK: !7 = !MDCompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, flags: "-O2", runtimeVersion: 2, splitDebugFilename: "abc.debug", emissionKind: 3, enums: !2, retainedTypes: !3, subprograms: !4, globals: !5, imports: !6)
+!7 = !MDCompileUnit(language: DW_LANG_C99, file: !1, producer: "clang",
+                    isOptimized: true, flags: "-O2", runtimeVersion: 2,
+                    splitDebugFilename: "abc.debug", emissionKind: 3,
+                    enums: !2, retainedTypes: !3, subprograms: !4,
+                    globals: !5, imports: !6)
+!8 = !MDCompileUnit(language: 12, file: !1, producer: "clang",
+                    isOptimized: true, flags: "-O2", runtimeVersion: 2,
+                    splitDebugFilename: "abc.debug", emissionKind: 3,
+                    enums: !2, retainedTypes: !3, subprograms: !4,
+                    globals: !5, imports: !6)
+
+; CHECK: !8 = !MDCompileUnit(language: DW_LANG_C99, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: 0)
+!9 = !MDCompileUnit(language: 12, file: !1, producer: "",
+                    isOptimized: false, flags: "", runtimeVersion: 0,
+                    splitDebugFilename: "", emissionKind: 0)
+!10 = !MDCompileUnit(language: 12, file: !1)
diff --git a/test/Assembler/mdexpression.ll b/test/Assembler/mdexpression.ll
new file mode 100644
index 0000000..42bbfe0
--- /dev/null
+++ b/test/Assembler/mdexpression.ll
@@ -0,0 +1,16 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !1, !2, !3, !4}
+!named = !{!0, !1, !2, !3, !4}
+
+; CHECK:      !0 = !MDExpression()
+; CHECK-NEXT: !1 = !MDExpression(DW_OP_deref)
+; CHECK-NEXT: !2 = !MDExpression(DW_OP_plus, 3)
+; CHECK-NEXT: !3 = !MDExpression(DW_OP_bit_piece, 3, 7)
+; CHECK-NEXT: !4 = !MDExpression(DW_OP_deref, DW_OP_plus, 3, DW_OP_bit_piece, 3, 7)
+!0 = !MDExpression()
+!1 = !MDExpression(DW_OP_deref)
+!2 = !MDExpression(DW_OP_plus, 3)
+!3 = !MDExpression(DW_OP_bit_piece, 3, 7)
+!4 = !MDExpression(DW_OP_deref, DW_OP_plus, 3, DW_OP_bit_piece, 3, 7)
diff --git a/test/Assembler/mdglobalvariable.ll b/test/Assembler/mdglobalvariable.ll
new file mode 100644
index 0000000..ef04f3e
--- /dev/null
+++ b/test/Assembler/mdglobalvariable.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+@foo = global i32 0
+
+; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6}
+!named = !{!0, !1, !2, !3, !4, !5, !6}
+
+!0 = distinct !{}
+!1 = distinct !{}
+!2 = !MDFile(filename: "path/to/file", directory: "/path/to/dir")
+!3 = distinct !{}
+!4 = distinct !{}
+
+; CHECK: !5 = !MDGlobalVariable(scope: !0, name: "foo", linkageName: "foo", file: !2, line: 7, type: !3, isLocal: true, isDefinition: false, variable: i32* @foo, declaration: !4)
+!5 = !MDGlobalVariable(scope: !0, name: "foo", linkageName: "foo",
+                       file: !2, line: 7, type: !3, isLocal: true,
+                       isDefinition: false, variable: i32* @foo,
+                       declaration: !4)
+
+; CHECK: !6 = !MDGlobalVariable(scope: null, name: "bar", isLocal: false, isDefinition: true)
+!6 = !MDGlobalVariable(name: "bar")
diff --git a/test/Assembler/mdimportedentity.ll b/test/Assembler/mdimportedentity.ll
new file mode 100644
index 0000000..a1ac48d
--- /dev/null
+++ b/test/Assembler/mdimportedentity.ll
@@ -0,0 +1,20 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !1, !2, !3, !3}
+!named = !{!0, !1, !2, !3, !4}
+
+; CHECK:      !0 = distinct !{}
+; CHECK-NEXT: !1 = distinct !{}
+!0 = distinct !{}
+!1 = distinct !{}
+
+; CHECK-NEXT: !2 = !MDImportedEntity(tag: DW_TAG_imported_module, scope: !0, entity: !1, line: 7, name: "foo")
+!2 = !MDImportedEntity(tag: DW_TAG_imported_module, scope: !0, entity: !1,
+                       line: 7, name: "foo")
+
+; CHECK-NEXT: !3 = !MDImportedEntity(tag: DW_TAG_imported_module, scope: !0, name: "")
+!3 = !MDImportedEntity(tag: DW_TAG_imported_module, scope: !0)
+!4 = !MDImportedEntity(tag: DW_TAG_imported_module, scope: !0, entity: null,
+                       line: 0, name: "")
+
diff --git a/test/Assembler/mdlexicalblock.ll b/test/Assembler/mdlexicalblock.ll
new file mode 100644
index 0000000..0a2c339
--- /dev/null
+++ b/test/Assembler/mdlexicalblock.ll
@@ -0,0 +1,25 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !1, !2, !3, !4, !4, !5, !6, !7, !7}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9}
+
+!0 = distinct !{}
+!1 = distinct !{}
+!2 = !MDFile(filename: "path/to/file", directory: "/path/to/dir")
+
+; CHECK: !3 = !MDLexicalBlock(scope: !0, file: !2, line: 7, column: 35)
+!3 = !MDLexicalBlock(scope: !0, file: !2, line: 7, column: 35)
+
+; CHECK: !4 = !MDLexicalBlock(scope: !0)
+!4 = !MDLexicalBlock(scope: !0)
+!5 = !MDLexicalBlock(scope: !0, file: null, line: 0, column: 0)
+
+; CHECK: !5 = !MDLexicalBlockFile(scope: !3, file: !2, discriminator: 0)
+; CHECK: !6 = !MDLexicalBlockFile(scope: !3, file: !2, discriminator: 1)
+!6 = !MDLexicalBlockFile(scope: !3, file: !2, discriminator: 0)
+!7 = !MDLexicalBlockFile(scope: !3, file: !2, discriminator: 1)
+
+; CHECK: !7 = !MDLexicalBlockFile(scope: !3, discriminator: 7)
+!8 = !MDLexicalBlockFile(scope: !3, discriminator: 7)
+!9 = !MDLexicalBlockFile(scope: !3, file: null, discriminator: 7)
diff --git a/test/Assembler/mdlocalvariable.ll b/test/Assembler/mdlocalvariable.ll
new file mode 100644
index 0000000..7a4185f
--- /dev/null
+++ b/test/Assembler/mdlocalvariable.ll
@@ -0,0 +1,26 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+@foo = global i32 0
+
+; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
+
+!0 = distinct !{}
+!1 = distinct !{}
+!2 = !MDFile(filename: "path/to/file", directory: "/path/to/dir")
+!3 = distinct !{}
+!4 = distinct !{}
+
+; CHECK: !5 = !MDLocalVariable(tag: DW_TAG_arg_variable, scope: !0, name: "foo", file: !2, line: 7, type: !3, arg: 3, flags: DIFlagArtificial, inlinedAt: !4)
+; CHECK: !6 = !MDLocalVariable(tag: DW_TAG_auto_variable, scope: !0, name: "foo", file: !2, line: 7, type: !3, flags: DIFlagArtificial, inlinedAt: !4)
+!5 = !MDLocalVariable(tag: DW_TAG_arg_variable, scope: !0, name: "foo",
+                      file: !2, line: 7, type: !3, arg: 3,
+                      flags: DIFlagArtificial, inlinedAt: !4)
+!6 = !MDLocalVariable(tag: DW_TAG_auto_variable, scope: !0, name: "foo",
+                      file: !2, line: 7, type: !3, flags: DIFlagArtificial, inlinedAt: !4)
+
+; CHECK: !7 = !MDLocalVariable(tag: DW_TAG_arg_variable, scope: null, name: "", arg: 0)
+; CHECK: !8 = !MDLocalVariable(tag: DW_TAG_auto_variable, scope: null, name: "")
+!7 = !MDLocalVariable(tag: DW_TAG_arg_variable)
+!8 = !MDLocalVariable(tag: DW_TAG_auto_variable)
diff --git a/test/Assembler/mdlocation.ll b/test/Assembler/mdlocation.ll
new file mode 100644
index 0000000..e095d90
--- /dev/null
+++ b/test/Assembler/mdlocation.ll
@@ -0,0 +1,23 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !1, !1, !2, !2, !3, !3, !4}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7}
+
+; CHECK: !0 = !{}
+!0 = !{}
+
+; CHECK-NEXT: !1 = !MDLocation(line: 3, column: 7, scope: !0)
+!1 = !MDLocation(line: 3, column: 7, scope: !0)
+!2 = !MDLocation(scope: !0, column: 7, line: 3)
+
+; CHECK-NEXT: !2 = !MDLocation(line: 3, column: 7, scope: !0, inlinedAt: !1)
+!3 = !MDLocation(scope: !0, inlinedAt: !1, column: 7, line: 3)
+!4 = !MDLocation(column: 7, line: 3, scope: !0, inlinedAt: !1)
+
+; CHECK-NEXT: !3 = !MDLocation(line: 0, scope: !0)
+!5 = !MDLocation(scope: !0)
+!6 = !MDLocation(scope: !0, column: 0, line: 0)
+
+; CHECK-NEXT: !4 = !MDLocation(line: 4294967295, column: 65535, scope: !0)
+!7 = !MDLocation(line: 4294967295, column: 65535, scope: !0)
diff --git a/test/Assembler/mdnamespace.ll b/test/Assembler/mdnamespace.ll
new file mode 100644
index 0000000..d7f6849
--- /dev/null
+++ b/test/Assembler/mdnamespace.ll
@@ -0,0 +1,16 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !1, !2, !3, !4, !4}
+!named = !{!0, !1, !2, !3, !4, !5}
+
+!0 = distinct !{}
+!1 = distinct !{}
+!2 = !MDFile(filename: "path/to/file", directory: "/path/to/dir")
+
+; CHECK: !3 = !MDNamespace(scope: !0, file: !2, name: "Namespace", line: 7)
+!3 = !MDNamespace(scope: !0, file: !2, name: "Namespace", line: 7)
+
+; CHECK: !4 = !MDNamespace(scope: !0)
+!4 = !MDNamespace(scope: !0, file: null, name: "", line: 0)
+!5 = !MDNamespace(scope: !0)
diff --git a/test/Assembler/mdobjcproperty.ll b/test/Assembler/mdobjcproperty.ll
new file mode 100644
index 0000000..8afe943
--- /dev/null
+++ b/test/Assembler/mdobjcproperty.ll
@@ -0,0 +1,20 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !1, !2, !3, !4, !4}
+!named = !{!0, !1, !2, !3, !4, !5}
+
+!0 = distinct !{}
+!1 = !MDFile(filename: "path/to/file", directory: "/path/to/dir")
+!2 = distinct !{}
+
+
+; CHECK: !2 = distinct !{}
+; CHECK-NEXT: !3 = !MDObjCProperty(name: "foo", file: !1, line: 7, setter: "setFoo", getter: "getFoo", attributes: 7, type: !2)
+!3 = !MDObjCProperty(name: "foo", file: !1, line: 7, setter: "setFoo",
+                     getter: "getFoo", attributes: 7, type: !2)
+
+; CHECK-NEXT: !4 = !MDObjCProperty(name: "foo")
+!4 = !MDObjCProperty(name: "foo", file: null, line: 0, setter: "", getter: "",
+                     attributes: 0, type: null)
+!5 = !MDObjCProperty(name: "foo")
diff --git a/test/Assembler/mdsubprogram.ll b/test/Assembler/mdsubprogram.ll
new file mode 100644
index 0000000..aecfefc
--- /dev/null
+++ b/test/Assembler/mdsubprogram.ll
@@ -0,0 +1,28 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+declare void @_Z3foov()
+
+; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9}
+
+!0 = distinct !{}
+!1 = distinct !{}
+!2 = !MDFile(filename: "path/to/file", directory: "/path/to/dir")
+!3 = distinct !{}
+!4 = distinct !{}
+!5 = distinct !{}
+!6 = distinct !{}
+!7 = distinct !{}
+
+; CHECK: !8 = !MDSubprogram(scope: !0, name: "foo", linkageName: "_Zfoov", file: !2, line: 7, type: !3, isLocal: true, isDefinition: false, scopeLine: 8, containingType: !4, virtuality: DW_VIRTUALITY_pure_virtual, virtualIndex: 10, flags: DIFlagPrototyped, isOptimized: true, function: void ()* @_Z3foov, templateParams: !5, declaration: !6, variables: !7)
+!8 = !MDSubprogram(scope: !0, name: "foo", linkageName: "_Zfoov",
+                   file: !2, line: 7, type: !3, isLocal: true,
+                   isDefinition: false, scopeLine: 8, containingType: !4,
+                   virtuality: DW_VIRTUALITY_pure_virtual, virtualIndex: 10,
+                   flags: DIFlagPrototyped, isOptimized: true, function: void ()* @_Z3foov,
+                   templateParams: !5, declaration: !6, variables: !7)
+
+; CHECK: !9 = !MDSubprogram(scope: null, name: "bar", isLocal: false, isDefinition: true, isOptimized: false)
+!9 = !MDSubprogram(name: "bar")
+
diff --git a/test/Assembler/mdsubrange-empty-array.ll b/test/Assembler/mdsubrange-empty-array.ll
new file mode 100644
index 0000000..fa05582
--- /dev/null
+++ b/test/Assembler/mdsubrange-empty-array.ll
@@ -0,0 +1,14 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !0, !1, !2}
+!named = !{!0, !1, !2, !3}
+
+; CHECK:      !0 = !MDSubrange(count: -1)
+; CHECK-NEXT: !1 = !MDSubrange(count: -1, lowerBound: 4)
+; CHECK-NEXT: !2 = !MDSubrange(count: -1, lowerBound: -5)
+!0 = !MDSubrange(count: -1)
+!1 = !MDSubrange(count: -1, lowerBound: 0)
+
+!2 = !MDSubrange(count: -1, lowerBound: 4)
+!3 = !MDSubrange(count: -1, lowerBound: -5)
diff --git a/test/Assembler/mdtemplateparameter.ll b/test/Assembler/mdtemplateparameter.ll
new file mode 100644
index 0000000..f005c08
--- /dev/null
+++ b/test/Assembler/mdtemplateparameter.ll
@@ -0,0 +1,24 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !1, !2, !3, !3, !4, !5, !5}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7}
+
+!0 = distinct !{}
+!1 = distinct !{}
+; CHECK: !1 = distinct !{}
+
+; CHECK-NEXT: !2 = !MDTemplateTypeParameter(name: "Ty", type: !1)
+; CHECK-NEXT: !3 = !MDTemplateTypeParameter(name: "", type: !1)
+!2 = !MDTemplateTypeParameter(name: "Ty", type: !1)
+!3 = !MDTemplateTypeParameter(type: !1)
+!4 = !MDTemplateTypeParameter(name: "", type: !1)
+
+; CHECK-NEXT: !4 = !MDTemplateValueParameter(tag: DW_TAG_template_value_parameter, name: "V", type: !1, value: i32 7)
+; CHECK-NEXT: !5 = !MDTemplateValueParameter(tag: DW_TAG_template_value_parameter, name: "", type: !1, value: i32 7)
+!5 = !MDTemplateValueParameter(tag: DW_TAG_template_value_parameter,
+                               name: "V", type: !1, value: i32 7)
+!6 = !MDTemplateValueParameter(tag: DW_TAG_template_value_parameter,
+                               type: !1, value: i32 7)
+!7 = !MDTemplateValueParameter(tag: DW_TAG_template_value_parameter,
+                               name: "", type: !1, value: i32 7)
diff --git a/test/Assembler/mdtype-large-values.ll b/test/Assembler/mdtype-large-values.ll
new file mode 100644
index 0000000..287e862
--- /dev/null
+++ b/test/Assembler/mdtype-large-values.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; CHECK: !named = !{!0, !1, !2}
+!named = !{!0, !1, !2}
+
+; CHECK:      !0 = !MDBasicType(tag: DW_TAG_base_type, name: "name", size: 18446744073709551615, align: 18446744073709551614, encoding: DW_ATE_unsigned_char)
+; CHECK-NEXT: !1 = !MDDerivedType(tag: DW_TAG_pointer_type, baseType: !0, size: 18446744073709551615, align: 18446744073709551614, offset: 18446744073709551613)
+; CHECK-NEXT: !2 = !MDCompositeType(tag: DW_TAG_array_type, baseType: !0, size: 18446744073709551615, align: 18446744073709551614, offset: 18446744073709551613)
+!0 = !MDBasicType(tag: DW_TAG_base_type, name: "name", size: 18446744073709551615, align: 18446744073709551614, encoding: DW_ATE_unsigned_char)
+!1 = !MDDerivedType(tag: DW_TAG_pointer_type, baseType: !0, size: 18446744073709551615, align: 18446744073709551614, offset: 18446744073709551613)
+!2 = !MDCompositeType(tag: DW_TAG_array_type, baseType: !0, size: 18446744073709551615, align: 18446744073709551614, offset: 18446744073709551613)
diff --git a/test/Assembler/metadata-null-operands.ll b/test/Assembler/metadata-null-operands.ll
new file mode 100644
index 0000000..acae1d4
--- /dev/null
+++ b/test/Assembler/metadata-null-operands.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+; Don't crash on null operands.  (If/when we add a verify check for these, we
+; should disable the verifier for this test and remove this comment; the test
+; is still important.)
+!named = !{!0, !1}
+!0 = !MDDerivedType(tag: DW_TAG_pointer_type, baseType: null)
+!1 = !MDCompileUnit(language: DW_LANG_C, file: null)
+
+; CHECK: !named = !{!0, !1}
+; CHECK: !0 = !MDDerivedType({{.*}}baseType: null{{.*}})
+; CHECK: !1 = !MDCompileUnit({{.*}}file: null{{.*}})
diff --git a/test/Assembler/metadata.ll b/test/Assembler/metadata.ll
index f6e619d..e2c5923 100644
--- a/test/Assembler/metadata.ll
+++ b/test/Assembler/metadata.ll
@@ -11,8 +11,8 @@ define void @test() {
   ret void, !foo !0, !bar !1
 }
 
-!0 = metadata !{i32 662302, i32 26, metadata !1, null}
-!1 = metadata !{i32 4, metadata !"foo"}
+!0 = !MDLocation(line: 662302, column: 26, scope: !1)
+!1 = !{i32 4, !"foo"}
 
 declare void @llvm.dbg.func.start(metadata) nounwind readnone
 
diff --git a/test/Assembler/named-metadata.ll b/test/Assembler/named-metadata.ll
index 954c189..9fa37a7 100644
--- a/test/Assembler/named-metadata.ll
+++ b/test/Assembler/named-metadata.ll
@@ -1,9 +1,9 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
 ; RUN: verify-uselistorder %s
 
-!0 = metadata !{metadata !"zero"}
-!1 = metadata !{metadata !"one"}
-!2 = metadata !{metadata !"two"}
+!0 = !{!"zero"}
+!1 = !{!"one"}
+!2 = !{!"two"}
 
 !foo = !{!0, !1, !2}
 ; CHECK: !foo = !{!0, !1, !2}
diff --git a/test/Assembler/short-hexpair.ll b/test/Assembler/short-hexpair.ll
new file mode 100644
index 0000000..067ea30
--- /dev/null
+++ b/test/Assembler/short-hexpair.ll
@@ -0,0 +1,4 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+@x = global fp128 0xL01
+; CHECK: @x = global fp128 0xL00000000000000000000000000000001
diff --git a/test/Assembler/unnamed-comdat.ll b/test/Assembler/unnamed-comdat.ll
new file mode 100644
index 0000000..8aa0f78c9
--- /dev/null
+++ b/test/Assembler/unnamed-comdat.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+; CHECK: comdat cannot be unnamed
+
+define void @0() comdat {
+  ret void
+}
diff --git a/test/Assembler/upgrade-loop-metadata.ll b/test/Assembler/upgrade-loop-metadata.ll
index 7c5a580..0852469 100644
--- a/test/Assembler/upgrade-loop-metadata.ll
+++ b/test/Assembler/upgrade-loop-metadata.ll
@@ -31,12 +31,12 @@ for.end:                                          ; preds = %for.cond
   ret void
 }
 
-; CHECK: !{metadata !"llvm.loop.interleave.count", i32 4}
-; CHECK: !{metadata !"llvm.loop.vectorize.width", i32 8}
-; CHECK: !{metadata !"llvm.loop.vectorize.enable", i1 true}
+; CHECK: !{!"llvm.loop.interleave.count", i32 4}
+; CHECK: !{!"llvm.loop.vectorize.width", i32 8}
+; CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
 
-!0 = metadata !{metadata !"clang version 3.5.0 (trunk 211528)"}
-!1 = metadata !{metadata !1, metadata !2, metadata !3, metadata !4, metadata !4}
-!2 = metadata !{metadata !"llvm.vectorizer.unroll", i32 4}
-!3 = metadata !{metadata !"llvm.vectorizer.width", i32 8}
-!4 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
+!0 = !{!"clang version 3.5.0 (trunk 211528)"}
+!1 = !{!1, !2, !3, !4, !4}
+!2 = !{!"llvm.vectorizer.unroll", i32 4}
+!3 = !{!"llvm.vectorizer.width", i32 8}
+!4 = !{!"llvm.vectorizer.enable", i1 true}
diff --git a/test/Bindings/Go/go.test b/test/Bindings/Go/go.test
index 3951483..14eb328 100644
--- a/test/Bindings/Go/go.test
+++ b/test/Bindings/Go/go.test
@@ -1,3 +1,3 @@
 ; RUN: llvm-go test llvm.org/llvm/bindings/go/llvm
 
-; REQUIRES: shell
+; REQUIRES: shell, not_ubsan
diff --git a/test/Bindings/OCaml/core.ml b/test/Bindings/OCaml/core.ml
index c08351e..c5e47e7 100644
--- a/test/Bindings/OCaml/core.ml
+++ b/test/Bindings/OCaml/core.ml
@@ -1146,7 +1146,7 @@ let test_builder () =
     (* CHECK: %dbg = add i32 %P1, %P2, !dbg !2
      * !2 is metadata emitted at EOF.
      *)
-    insist ((current_debug_location atentry) = None);
+    insist ((current_debug_location atentry) = Some (mdnode context [||]));
 
     let m_line = const_int i32_type 2 in
     let m_col = const_int i32_type 3 in
@@ -1428,15 +1428,24 @@ let test_builder () =
     add_incoming (p2, b2) phi;
     insist ([(p1, b1); (p2, b2)] = incoming phi);
 
+    (* CHECK: %PhiEmptyNode = phi i8
+     *)
+    let phi_empty = build_empty_phi i8_type "PhiEmptyNode" at_jb in
+    insist ([] = incoming phi_empty);
+
+    (* can't emit an empty phi to bitcode *)
+    add_incoming (const_int i8_type 1, b1) phi_empty;
+    add_incoming (const_int i8_type 2, b2) phi_empty;
+
     ignore (build_unreachable at_jb);
   end
 
 (* End-of-file checks for things like metdata and attributes.
  * CHECK: attributes #0 = {{.*}}uwtable{{.*}}
  * CHECK: !llvm.module.flags = !{!0}
- * CHECK: !0 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
- * CHECK: !1 = metadata !{i32 1, metadata !"metadata test"}
- * CHECK: !2 = metadata !{i32 2, i32 3, metadata !3, metadata !3}
+ * CHECK: !0 = !{i32 1, !"Debug Info Version", i32 2}
+ * CHECK: !1 = !{i32 1, !"metadata test"}
+ * CHECK: !2 = !MDLocation(line: 2, column: 3, scope: !3, inlinedAt: !3)
  *)
 
 (*===-- Pass Managers -----------------------------------------------------===*)
diff --git a/test/Bindings/OCaml/executionengine.ml b/test/Bindings/OCaml/executionengine.ml
index 893f988..1de2cfb 100644
--- a/test/Bindings/OCaml/executionengine.ml
+++ b/test/Bindings/OCaml/executionengine.ml
@@ -50,7 +50,10 @@ let test_executionengine () =
   let ee = create m in
 
   (* add plus *)
-  let plus = define_plus m in
+  ignore (define_plus m);
+
+  (* declare global variable *)
+  ignore (define_global "globvar" (const_int i32_type 23) m);
 
   (* add module *)
   let m2 = create_module (global_context ()) "test_module2" in
@@ -73,9 +76,13 @@ let test_executionengine () =
   (* run_static_ctors *)
   run_static_ctors ee;
 
+  (* get a handle on globvar *)
+  let varh    = get_global_value_address "globvar" int32_t ee in
+  if 23l <> varh then bomb "get_global_value_address didn't work";
+
   (* call plus *)
   let cplusty = Foreign.funptr (int32_t @-> int32_t @-> returning int32_t) in
-  let cplus   = get_pointer_to_global plus cplusty ee in
+  let cplus   = get_function_address "plus" cplusty ee in
   if 4l <> cplus 2l 2l then bomb "plus didn't work";
 
   (* call getglobal *)
diff --git a/test/Bindings/OCaml/linker.ml b/test/Bindings/OCaml/linker.ml
index 00064b0..1ea0be9 100644
--- a/test/Bindings/OCaml/linker.ml
+++ b/test/Bindings/OCaml/linker.ml
@@ -39,19 +39,19 @@ let test_linker () =
 
   let m1 = make_module "one"
   and m2 = make_module "two" in
-  link_modules m1 m2 Mode.PreserveSource;
+  link_modules m1 m2;
   dispose_module m1;
   dispose_module m2;
 
   let m1 = make_module "one"
   and m2 = make_module "two" in
-  link_modules m1 m2 Mode.DestroySource;
+  link_modules m1 m2;
   dispose_module m1;
 
   let m1 = make_module "one"
   and m2 = make_module "one" in
   try
-    link_modules m1 m2 Mode.PreserveSource;
+    link_modules m1 m2;
     failwith "must raise"
   with Error _ ->
     dispose_module m1;
diff --git a/test/Bindings/OCaml/lit.local.cfg b/test/Bindings/OCaml/lit.local.cfg
index bca5d39..7a83ca1 100644
--- a/test/Bindings/OCaml/lit.local.cfg
+++ b/test/Bindings/OCaml/lit.local.cfg
@@ -3,5 +3,5 @@ config.suffixes = ['.ml']
 if not 'ocaml' in config.root.llvm_bindings:
     config.unsupported = True
 
-if config.root.have_ocaml_ounit != '1':
+if config.root.have_ocaml_ounit not in ('1', 'TRUE'):
     config.unsupported = True
diff --git a/test/Bindings/OCaml/transform_utils.ml b/test/Bindings/OCaml/transform_utils.ml
new file mode 100644
index 0000000..6b46df1
--- /dev/null
+++ b/test/Bindings/OCaml/transform_utils.ml
@@ -0,0 +1,21 @@
+(* RUN: cp %s %T/transform_utils.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.transform_utils -linkpkg %T/transform_utils.ml -o %t
+ * RUN: %t
+ * RUN: %ocamlopt -g -warn-error A -package llvm.transform_utils -linkpkg %T/transform_utils.ml -o %t
+ * RUN: %t
+ * XFAIL: vg_leak
+ *)
+
+open Llvm
+open Llvm_transform_utils
+
+let context = global_context ()
+
+let test_clone_module () =
+  let m  = create_module context "mod" in
+  let m' = clone_module m in
+  if m == m' then failwith "m == m'";
+  if string_of_llmodule m <> string_of_llmodule m' then failwith "string_of m <> m'"
+
+let () =
+  test_clone_module ()
diff --git a/test/Bindings/llvm-c/Inputs/invalid.ll.bc b/test/Bindings/llvm-c/Inputs/invalid.ll.bc
new file mode 100644
index 0000000..a85c364
--- /dev/null
+++ b/test/Bindings/llvm-c/Inputs/invalid.ll.bc
diff --git a/test/Bindings/llvm-c/add_named_metadata_operand.ll b/test/Bindings/llvm-c/add_named_metadata_operand.ll
new file mode 100644
index 0000000..fcc833d
--- /dev/null
+++ b/test/Bindings/llvm-c/add_named_metadata_operand.ll
@@ -0,0 +1,2 @@
+; RUN: llvm-c-test --add-named-metadata-operand < /dev/null
+; This used to trigger an assertion
diff --git a/test/Bindings/llvm-c/invalid-bitcode.test b/test/Bindings/llvm-c/invalid-bitcode.test
new file mode 100644
index 0000000..6318a9b
--- /dev/null
+++ b/test/Bindings/llvm-c/invalid-bitcode.test
@@ -0,0 +1,3 @@
+; RUN: not llvm-c-test --module-dump < %S/Inputs/invalid.ll.bc 2>&1 | FileCheck %s
+
+CHECK: Error parsing bitcode: Unknown attribute kind (48)
diff --git a/test/Bindings/llvm-c/set_metadata.ll b/test/Bindings/llvm-c/set_metadata.ll
new file mode 100644
index 0000000..0a65b8c
--- /dev/null
+++ b/test/Bindings/llvm-c/set_metadata.ll
@@ -0,0 +1,2 @@
+; RUN: llvm-c-test --set-metadata < /dev/null
+; This used to trigger an assertion
diff --git a/test/Bitcode/Inputs/invalid-abbrev.bc b/test/Bitcode/Inputs/invalid-abbrev.bc
new file mode 100644
index 0000000..4e8f394
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-abbrev.bc
diff --git a/test/Bitcode/Inputs/invalid-align.bc b/test/Bitcode/Inputs/invalid-align.bc
new file mode 100644
index 0000000..e84fa6c
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-align.bc
diff --git a/test/Bitcode/Inputs/invalid-bad-abbrev-number.bc b/test/Bitcode/Inputs/invalid-bad-abbrev-number.bc
new file mode 100644
index 0000000..e4e1fb3
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-bad-abbrev-number.bc
@@ -0,0 +1 @@
+BC��!0000000000
+\ No newline at end of file
diff --git a/test/Bitcode/Inputs/invalid-bitwidth.bc b/test/Bitcode/Inputs/invalid-bitwidth.bc
new file mode 100644
index 0000000..e9028f7
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-bitwidth.bc
diff --git a/test/Bitcode/Inputs/invalid-extractval-array-idx.bc b/test/Bitcode/Inputs/invalid-extractval-array-idx.bc
new file mode 100644
index 0000000..7465df3
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-extractval-array-idx.bc
diff --git a/test/Bitcode/Inputs/invalid-extractval-struct-idx.bc b/test/Bitcode/Inputs/invalid-extractval-struct-idx.bc
new file mode 100644
index 0000000..ccb40f7
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-extractval-struct-idx.bc
diff --git a/test/Bitcode/Inputs/invalid-extractval-too-many-idxs.bc b/test/Bitcode/Inputs/invalid-extractval-too-many-idxs.bc
new file mode 100644
index 0000000..543a3ba
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-extractval-too-many-idxs.bc
diff --git a/test/Bitcode/Inputs/invalid-insertval-array-idx.bc b/test/Bitcode/Inputs/invalid-insertval-array-idx.bc
new file mode 100644
index 0000000..79c3c03
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-insertval-array-idx.bc
diff --git a/test/Bitcode/Inputs/invalid-insertval-struct-idx.bc b/test/Bitcode/Inputs/invalid-insertval-struct-idx.bc
new file mode 100644
index 0000000..ec70384
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-insertval-struct-idx.bc
diff --git a/test/Bitcode/Inputs/invalid-insertval-too-many-idxs.bc b/test/Bitcode/Inputs/invalid-insertval-too-many-idxs.bc
new file mode 100644
index 0000000..fd21ac2
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-insertval-too-many-idxs.bc
diff --git a/test/Bitcode/Inputs/invalid-pr20485.bc b/test/Bitcode/Inputs/invalid-pr20485.bc
new file mode 100644
index 0000000..b6211de
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-pr20485.bc
diff --git a/test/Bitcode/Inputs/invalid-type-table-forward-ref.bc b/test/Bitcode/Inputs/invalid-type-table-forward-ref.bc
new file mode 100644
index 0000000..4594efe
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-type-table-forward-ref.bc
diff --git a/test/Bitcode/Inputs/invalid-unexpected-eof.bc b/test/Bitcode/Inputs/invalid-unexpected-eof.bc
new file mode 100644
index 0000000..a487393
--- /dev/null
+++ b/test/Bitcode/Inputs/invalid-unexpected-eof.bc
@@ -0,0 +1 @@
+BC��!00000000000000000000
+\ No newline at end of file
diff --git a/test/Bitcode/calling-conventions.3.2.ll b/test/Bitcode/calling-conventions.3.2.ll
index f36e9f8..b60f1d7 100644
--- a/test/Bitcode/calling-conventions.3.2.ll
+++ b/test/Bitcode/calling-conventions.3.2.ll
@@ -15,7 +15,7 @@ declare coldcc void @coldcc()
 ; CHECK: declare coldcc void @coldcc
 
 declare cc10 void @cc10()
-; CHECK: declare cc10 void @cc10
+; CHECK: declare ghccc void @cc10
 
 declare spir_kernel void @spir_kernel()
 ; CHECK: declare spir_kernel void @spir_kernel
@@ -72,7 +72,7 @@ define void @call_coldcc() {
 }
 
 define void @call_cc10 () { 
-; CHECK: call cc10 void @cc10 
+; CHECK: call ghccc void @cc10
   call cc10 void @cc10 ()
   ret void
 }
diff --git a/test/Bitcode/drop-debug-info.3.5.ll b/test/Bitcode/drop-debug-info.3.5.ll
new file mode 100644
index 0000000..fde136d
--- /dev/null
+++ b/test/Bitcode/drop-debug-info.3.5.ll
@@ -0,0 +1,40 @@
+; RUN: llvm-dis < %s.bc -o %t.ll 2>&1 | FileCheck -check-prefix=WARN %s
+; RUN: FileCheck -input-file=%t.ll %s
+
+; The bitcode paired with this test was generated by passing this file to
+; llvm-dis-3.5.  This tests that llvm-dis warns correctly when reading old
+; bitcode.
+
+; CHECK-NOT: !llvm.dbg.cu
+; CHECK-NOT: !dbg
+; WARN: warning: ignoring debug info with an invalid version (1)
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  ret i32 0, !dbg !12
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.2 (230356)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/Users/dexonsmith/data/llvm/staging/test/Bitcode/t.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"t.c", metadata !"/Users/dexonsmith/data/llvm/staging/test/Bitcode"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/Users/dexonsmith/data/llvm/staging/test/Bitcode/t.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{metadata !"clang version 3.5.2 (230356)"}
+!12 = metadata !{i32 1, i32 14, metadata !4, null}
diff --git a/test/Bitcode/drop-debug-info.3.5.ll.bc b/test/Bitcode/drop-debug-info.3.5.ll.bc
new file mode 100644
index 0000000..c401989
--- /dev/null
+++ b/test/Bitcode/drop-debug-info.3.5.ll.bc
diff --git a/test/Bitcode/drop-debug-info.ll b/test/Bitcode/drop-debug-info.ll
deleted file mode 100644
index a2f5694..0000000
--- a/test/Bitcode/drop-debug-info.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llvm-as < %s -o %t.bc 2>&1 >/dev/null | FileCheck -check-prefix=WARN %s
-; RUN: llvm-dis < %t.bc | FileCheck %s
-; RUN: verify-uselistorder < %t.bc
-
-define i32 @main() {
-entry:
-  %retval = alloca i32, align 4
-  store i32 0, i32* %retval
-  ret i32 0, !dbg !12
-}
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!9}
-
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 (trunk 195495) (llvm/trunk 195495:195504M)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/manmanren/llvm_gmail/release/../llvm/tools/clang/test/CodeGen/debug-info-version.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"../llvm/tools/clang/test/CodeGen/debug-info-version.c", metadata !"/Users/manmanren/llvm_gmail/release"}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/Users/manmanren/llvm_gmail/release/../llvm/tools/clang/test/CodeGen/debug-info-version.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!12 = metadata !{i32 4, i32 0, metadata !4, null}
-
-; WARN: warning: ignoring debug info with an invalid version (0)
-; CHECK-NOT: !dbg
-; CHECK-NOT: !llvm.dbg.cu
diff --git a/test/Bitcode/function-encoding-rel-operands.ll b/test/Bitcode/function-encoding-rel-operands.ll
index 24d6d80..a96253b 100644
--- a/test/Bitcode/function-encoding-rel-operands.ll
+++ b/test/Bitcode/function-encoding-rel-operands.ll
@@ -35,9 +35,9 @@ define double @test_float_binops(i32 %a) nounwind {
 
 
 ; CHECK: FUNCTION_BLOCK
-; skip checking operands of INST_INBOUNDS_GEP since that depends on ordering
+; skip checking operands of INST_GEP since that depends on ordering
 ; between literals and the formal parameters.
-; CHECK: INST_INBOUNDS_GEP {{.*}}
+; CHECK: INST_GEP {{.*}}
 ; CHECK: INST_LOAD {{.*}}op0=1 {{.*}}
 ; CHECK: INST_CMP2 op0=1 {{.*}}
 ; CHECK: INST_RET {{.*}}op0=1
diff --git a/test/Bitcode/function-local-metadata.3.5.ll b/test/Bitcode/function-local-metadata.3.5.ll
new file mode 100644
index 0000000..5bd8296
--- /dev/null
+++ b/test/Bitcode/function-local-metadata.3.5.ll
@@ -0,0 +1,35 @@
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+; Check that function-local metadata is dropped correctly when it's not a
+; direct argument to a call instruction.
+;
+; Bitcode assembled by llvm-as v3.5.0.
+
+define void @foo(i32 %v) {
+; CHECK: entry:
+entry:
+; CHECK-NEXT: call void @llvm.bar(metadata i32 %v)
+  call void @llvm.bar(metadata !{i32 %v})
+
+; Note: these supposedly legal instructions fired an assertion in llvm-as:
+;
+; Assertion failed: (I != ValueMap.end() && "Value not in slotcalculator!"), function getValueID, file lib/Bitcode/Writer/ValueEnumerator.cpp, line 138.
+;
+; So, I didn't test them; it looks like bitcode compatability is irrelevant.
+  ; call void @llvm.bar(metadata !{i32 0, i32 %v})
+  ; call void @llvm.bar(metadata !{i32 %v, i32 0})
+  ; call void @llvm.bar(metadata !{metadata !{}, i32 %v})
+  ; call void @llvm.bar(metadata !{i32 %v, metadata !{}})
+
+; CHECK-NEXT: call void @llvm.bar(metadata !0)
+; CHECK-NEXT: call void @llvm.bar(metadata !0)
+  call void @llvm.bar(metadata !{i32 %v, i32 %v})
+  call void @llvm.bar(metadata !{metadata !{i32 %v}})
+
+; CHECK-NEXT: ret void{{$}}
+  ret void, !baz !{i32 %v}
+}
+
+declare void @llvm.bar(metadata)
+
+; CHECK: !0 = !{}
diff --git a/test/Bitcode/function-local-metadata.3.5.ll.bc b/test/Bitcode/function-local-metadata.3.5.ll.bc
new file mode 100644
index 0000000..6323ca4
--- /dev/null
+++ b/test/Bitcode/function-local-metadata.3.5.ll.bc
diff --git a/test/Bitcode/highLevelStructure.3.2.ll b/test/Bitcode/highLevelStructure.3.2.ll
index f9509eb..88fb340 100644
--- a/test/Bitcode/highLevelStructure.3.2.ll
+++ b/test/Bitcode/highLevelStructure.3.2.ll
@@ -75,11 +75,11 @@ entry:
 ; Named metadata Test
 ; CHECK: !name = !{!0, !1, !2}
 !name = !{!0, !1, !2}
-; CHECK: !0 = metadata !{metadata !"zero"}
+; CHECK: !0 = !{!"zero"}
 !0 = metadata !{metadata !"zero"}
-; CHECK: !1 = metadata !{metadata !"one"}
+; CHECK: !1 = !{!"one"}
 !1 = metadata !{metadata !"one"}
-; CHECK: !2 = metadata !{metadata !"two"}
+; CHECK: !2 = !{!"two"}
 !2 = metadata !{metadata !"two"}
 
 
diff --git a/test/Bitcode/invalid.ll b/test/Bitcode/invalid.ll
index 1d4a82b..df9fec8 100644
--- a/test/Bitcode/invalid.ll
+++ b/test/Bitcode/invalid.ll
@@ -1,6 +1,6 @@
 ; RUN:  not llvm-dis < %s.bc 2>&1 | FileCheck %s
 
-; CHECK: llvm-dis{{(\.EXE|\.exe)?}}: Invalid value
+; CHECK: llvm-dis{{(\.EXE|\.exe)?}}: error: Unknown attribute kind (48)
 
 ; invalid.ll.bc has an invalid attribute number.
 ; The test checks that LLVM reports the error and doesn't access freed memory
diff --git a/test/Bitcode/invalid.test b/test/Bitcode/invalid.test
new file mode 100644
index 0000000..fb81888
--- /dev/null
+++ b/test/Bitcode/invalid.test
@@ -0,0 +1,43 @@
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-pr20485.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=INVALID-ENCODING %s
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-abbrev.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=BAD-ABBREV %s
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-unexpected-eof.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=UNEXPECTED-EOF %s
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-bad-abbrev-number.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=BAD-ABBREV-NUMBER %s
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-type-table-forward-ref.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=BAD-TYPE-TABLE-FORWARD-REF %s
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-bitwidth.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=BAD-BITWIDTH %s
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-align.bc  2>&1 | \
+RUN:   FileCheck --check-prefix=BAD-ALIGN %s
+
+INVALID-ENCODING: Invalid encoding
+BAD-ABBREV: Abbreviation starts with an Array or a Blob
+UNEXPECTED-EOF: Unexpected end of file
+BAD-ABBREV-NUMBER: Invalid abbrev number
+BAD-TYPE-TABLE-FORWARD-REF: Invalid TYPE table: Only named structs can be forward referenced
+BAD-BITWIDTH: Bitwidth for integer type out of range
+BAD-ALIGN: Invalid alignment value
+
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-extractval-array-idx.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=EXTRACT-ARRAY %s
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-extractval-struct-idx.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=EXTRACT-STRUCT %s
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-extractval-too-many-idxs.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=EXTRACT-IDXS %s
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-insertval-array-idx.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=INSERT-ARRAY %s
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-insertval-struct-idx.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=INSERT-STRUCT %s
+RUN: not llvm-dis -disable-output %p/Inputs/invalid-insertval-too-many-idxs.bc 2>&1 | \
+RUN:   FileCheck --check-prefix=INSERT-IDXS %s
+
+
+EXTRACT-ARRAY: EXTRACTVAL: Invalid array index
+EXTRACT-STRUCT: EXTRACTVAL: Invalid struct index
+EXTRACT-IDXS: EXTRACTVAL: Invalid type
+INSERT-ARRAY: INSERTVAL: Invalid array index
+INSERT-STRUCT: INSERTVAL: Invalid struct index
+INSERT-IDXS: INSERTVAL: Invalid type
diff --git a/test/Bitcode/linkage-types-3.2.ll b/test/Bitcode/linkage-types-3.2.ll
index dc6c90c..fb6cc57 100644
--- a/test/Bitcode/linkage-types-3.2.ll
+++ b/test/Bitcode/linkage-types-3.2.ll
@@ -6,124 +6,124 @@
 ; older bitcode files.
 
 @common.var = common global i32 0
-; CHECK: @common.var = common global i32 0
+; CHECK: @common.var = common global i32 0{{$}}
 
 @appending.var = appending global [8 x i32] undef
-; CHECK: @appending.var = appending global [8 x i32] undef
+; CHECK: @appending.var = appending global [8 x i32] undef{{$}}
 
 @extern_weak.var = extern_weak global i32
-; CHECK: @extern_weak.var = extern_weak global i32
+; CHECK: @extern_weak.var = extern_weak global i32{{$}}
 
 @private.var = private constant i32 0
-; CHECK: @private.var = private constant i32 0
+; CHECK: @private.var = private constant i32 0{{$}}
 
 @linker_private.var = linker_private constant i32 0
-; CHECK: @linker_private.var = private constant i32 0
+; CHECK: @linker_private.var = private constant i32 0{{$}}
 
 @linker_private_weak.var = linker_private_weak constant i32 0
-; CHECK: @linker_private_weak.var = private constant i32 0
+; CHECK: @linker_private_weak.var = private constant i32 0{{$}}
 
 @linker_private_weak_def_auto.var = linker_private_weak_def_auto constant i32 0
-; CHECK: @linker_private_weak_def_auto.var = constant i32 0
+; CHECK: @linker_private_weak_def_auto.var = constant i32 0{{$}}
 
 @internal.var = internal constant i32 0
-; CHECK: @internal.var = internal constant i32 0
+; CHECK: @internal.var = internal constant i32 0{{$}}
 
 @available_externally.var = available_externally constant i32 0
-; CHECK: @available_externally.var = available_externally constant i32 0
+; CHECK: @available_externally.var = available_externally constant i32 0{{$}}
 
 @linkonce.var = linkonce constant i32 0
-; CHECK: @linkonce.var = linkonce constant i32 0
+; CHECK: @linkonce.var = linkonce constant i32 0, comdat{{$}}
 
 @weak.var = weak constant i32 0
-; CHECK: @weak.var = weak constant i32 0
+; CHECK: @weak.var = weak constant i32 0, comdat{{$}}
 
 @linkonce_odr.var = linkonce_odr constant i32 0
-; CHECK: @linkonce_odr.var = linkonce_odr constant i32 0
+; CHECK: @linkonce_odr.var = linkonce_odr constant i32 0, comdat{{$}}
 
 @linkonce_odr_auto_hide.var = linkonce_odr_auto_hide constant i32 0
-; CHECK: @linkonce_odr_auto_hide.var = constant i32 0
+; CHECK: @linkonce_odr_auto_hide.var = constant i32 0{{$}}
 
 @external.var = external constant i32
-; CHECK: @external.var = external constant i32
+; CHECK: @external.var = external constant i32{{$}}
 
 @dllexport.var = dllexport global i32 0
-; CHECK: @dllexport.var = dllexport global i32 0
+; CHECK: @dllexport.var = dllexport global i32 0{{$}}
 
 @dllimport.var = dllimport global i32
-; CHECK: @dllimport.var = external dllimport global i32
+; CHECK: @dllimport.var = external dllimport global i32{{$}}
 
 define private void @private()
-; CHECK: define private void @private
+; CHECK: define private void @private() {
 {
-  ret void;
+  ret void
 }
 
 define linker_private void @linker_private()
-; CHECK: define private void @linker_private
+; CHECK: define private void @linker_private() {
 {
-  ret void;
+  ret void
 }
 
 define linker_private_weak void @linker_private_weak()
-; CHECK: define private void @linker_private_weak
+; CHECK: define private void @linker_private_weak() {
 {
-  ret void;
+  ret void
 }
 
 define linker_private_weak_def_auto void @linker_private_weak_def_auto()
-; CHECK: define void @linker_private_weak_def_auto
+; CHECK: define void @linker_private_weak_def_auto() {
 {
-  ret void;
+  ret void
 }
 
 define internal void @internal()
-; CHECK: define internal void @internal
+; CHECK: define internal void @internal() {
 {
-  ret void;
+  ret void
 }
 
 define available_externally void @available_externally()
-; CHECK: define available_externally void @available_externally
+; CHECK: define available_externally void @available_externally() {
 {
-  ret void;
+  ret void
 }
 
 define linkonce void @linkonce()
-; CHECK: define linkonce void @linkonce
+; CHECK: define linkonce void @linkonce() comdat {
 {
-  ret void;
+  ret void
 }
 
 define weak void @weak()
-; CHECK: define weak void @weak
+; CHECK: define weak void @weak() comdat {
 {
-  ret void;
+  ret void
 }
 
 define linkonce_odr void @linkonce_odr()
-; CHECK: define linkonce_odr void @linkonce_odr
+; CHECK: define linkonce_odr void @linkonce_odr() comdat {
 {
-  ret void;
+  ret void
 }
 
 define linkonce_odr_auto_hide void @linkonce_odr_auto_hide()
-; CHECK: define void @linkonce_odr_auto_hide
+; CHECK: define void @linkonce_odr_auto_hide() {
 {
-  ret void;
+  ret void
 }
 
 define external void @external()
-; CHECK: define void @external
+; CHECK: define void @external() {
 {
-  ret void;
+  ret void
 }
 
 declare dllimport void @dllimport()
-; CHECK: declare dllimport void @dllimport
+; CHECK: declare dllimport void @dllimport(){{$}}
 
 define dllexport void @dllexport()
-; CHECK: define dllexport void @dllexport()
+; CHECK: define dllexport void @dllexport() {
 {
-  ret void;
+  ret void
 }
diff --git a/test/Bitcode/mdstring-high-bits.ll b/test/Bitcode/mdstring-high-bits.ll
new file mode 100644
index 0000000..0d8fdeb
--- /dev/null
+++ b/test/Bitcode/mdstring-high-bits.ll
@@ -0,0 +1,9 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+; PR21882: confirm we don't crash when high bits are set in a character in a
+; metadata string.
+
+; CHECK: !name = !{!0}
+!name = !{!0}
+; CHECK: !0 = !{!"\80"}
+!0 = !{!"\80"}
diff --git a/test/Bitcode/metadata-2.ll b/test/Bitcode/metadata-2.ll
index bb957a7..07371a3 100644
--- a/test/Bitcode/metadata-2.ll
+++ b/test/Bitcode/metadata-2.ll
@@ -84,5 +84,5 @@ moduleinfoCtorEntry:
 }
 !llvm.ldc.classinfo._D6Object7__ClassZ = !{!0}
 !llvm.ldc.classinfo._D10ModuleInfo7__ClassZ = !{!1}
-!0 = metadata !{%object.Object undef, i1 false, i1 false}
-!1 = metadata !{%object.ModuleInfo undef, i1 false, i1 false}
+!0 = !{%object.Object undef, i1 false, i1 false}
+!1 = !{%object.ModuleInfo undef, i1 false, i1 false}
diff --git a/test/Bitcode/metadata.3.5.ll b/test/Bitcode/metadata.3.5.ll
new file mode 100644
index 0000000..ae7b83a
--- /dev/null
+++ b/test/Bitcode/metadata.3.5.ll
@@ -0,0 +1,26 @@
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+; Check that metadata encoded in 3.5 is correctly understood going forward.
+;
+; Bitcode assembled by llvm-as v3.5.0.
+
+define void @foo(i32 %v) {
+; CHECK: entry:
+entry:
+; CHECK-NEXT: call void @llvm.bar(metadata !0)
+  call void @llvm.bar(metadata !0)
+
+; CHECK-NEXT: ret void, !baz !1
+  ret void, !baz !1
+}
+
+declare void @llvm.bar(metadata)
+
+@global = global i32 0
+
+; CHECK: !0 = !{!1, !2, i32* @global, null}
+; CHECK: !1 = !{!2, null}
+; CHECK: !2 = !{}
+!0 = metadata !{metadata !1, metadata !2, i32* @global, null}
+!1 = metadata !{metadata !2, null}
+!2 = metadata !{}
diff --git a/test/Bitcode/metadata.3.5.ll.bc b/test/Bitcode/metadata.3.5.ll.bc
new file mode 100644
index 0000000..1857465
--- /dev/null
+++ b/test/Bitcode/metadata.3.5.ll.bc
diff --git a/test/Bitcode/metadata.ll b/test/Bitcode/metadata.ll
index 955b48b..7d24a91 100644
--- a/test/Bitcode/metadata.ll
+++ b/test/Bitcode/metadata.ll
@@ -2,5 +2,5 @@
 ; RUN: verify-uselistorder < %s
 
 !llvm.foo = !{!0}
-!0 = metadata !{i32 42}
+!0 = !{i32 42}
 @my.str = internal constant [4 x i8] c"foo\00"
diff --git a/test/Bitcode/pr18704.ll b/test/Bitcode/pr18704.ll
index f05fe53..e57ce3c 100644
--- a/test/Bitcode/pr18704.ll
+++ b/test/Bitcode/pr18704.ll
@@ -1,6 +1,6 @@
 ; RUN:  not llvm-dis < %s.bc 2>&1 | FileCheck %s
 
-; CHECK: llvm-dis{{(\.EXE|\.exe)?}}: Never resolved value found in function
+; CHECK: llvm-dis{{(\.EXE|\.exe)?}}: error: Never resolved value found in function
 
 ; pr18704.ll.bc has an instruction referring to invalid type.
 ; The test checks that LLVM reports the error and doesn't access freed memory
diff --git a/test/Bitcode/upgrade-loop-metadata.ll b/test/Bitcode/upgrade-loop-metadata.ll
index cebc583..be2a99a 100644
--- a/test/Bitcode/upgrade-loop-metadata.ll
+++ b/test/Bitcode/upgrade-loop-metadata.ll
@@ -27,9 +27,9 @@ for.end:                                          ; preds = %for.cond
   ret void
 }
 
-; CHECK: !{metadata !"llvm.loop.interleave.count", i32 4}
-; CHECK: !{metadata !"llvm.loop.vectorize.width", i32 8}
-; CHECK: !{metadata !"llvm.loop.vectorize.enable", i1 true}
+; CHECK: !{!"llvm.loop.interleave.count", i32 4}
+; CHECK: !{!"llvm.loop.vectorize.width", i32 8}
+; CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
 
 !0 = metadata !{metadata !"clang version 3.5.0 (trunk 211528)"}
 !1 = metadata !{metadata !1, metadata !2, metadata !3, metadata !4, metadata !4}
diff --git a/test/Bitcode/upgrade-tbaa.ll b/test/Bitcode/upgrade-tbaa.ll
index 23b4d7d..c20c66a 100644
--- a/test/Bitcode/upgrade-tbaa.ll
+++ b/test/Bitcode/upgrade-tbaa.ll
@@ -4,7 +4,7 @@
 ; Function Attrs: nounwind
 define void @_Z4testPiPf(i32* nocapture %pI, float* nocapture %pF) #0 {
 entry:
-  store i32 0, i32* %pI, align 4, !tbaa !{metadata !"int", metadata !0}
+  store i32 0, i32* %pI, align 4, !tbaa !{!"int", !0}
   ; CHECK: store i32 0, i32* %pI, align 4, !tbaa [[TAG_INT:!.*]]
   store float 1.000000e+00, float* %pF, align 4, !tbaa !2
   ; CHECK: store float 1.000000e+00, float* %pF, align 4, !tbaa [[TAG_FLOAT:!.*]]
@@ -13,12 +13,12 @@ entry:
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
-!0 = metadata !{metadata !"omnipotent char", metadata !1}
-!1 = metadata !{metadata !"Simple C/C++ TBAA"}
-!2 = metadata !{metadata !"float", metadata !0}
+!0 = !{!"omnipotent char", !1}
+!1 = !{!"Simple C/C++ TBAA"}
+!2 = !{!"float", !0}
 
-; CHECK: [[TAG_INT]] = metadata !{metadata [[TYPE_INT:!.*]], metadata [[TYPE_INT]], i64 0}
-; CHECK: [[TYPE_INT]] = metadata !{metadata !"int", metadata [[TYPE_CHAR:!.*]]}
-; CHECK: [[TYPE_CHAR]] = metadata !{metadata !"omnipotent char", metadata !{{.*}}
-; CHECK: [[TAG_FLOAT]] = metadata !{metadata [[TYPE_FLOAT:!.*]], metadata [[TYPE_FLOAT]], i64 0}
-; CHECK: [[TYPE_FLOAT]] = metadata !{metadata !"float", metadata [[TYPE_CHAR]]}
+; CHECK: [[TAG_INT]] = !{[[TYPE_INT:!.*]], [[TYPE_INT]], i64 0}
+; CHECK: [[TYPE_INT]] = !{!"int", [[TYPE_CHAR:!.*]]}
+; CHECK: [[TYPE_CHAR]] = !{!"omnipotent char", !{{.*}}
+; CHECK: [[TAG_FLOAT]] = !{[[TYPE_FLOAT:!.*]], [[TYPE_FLOAT]], i64 0}
+; CHECK: [[TYPE_FLOAT]] = !{!"float", [[TYPE_CHAR]]}
diff --git a/test/Bitcode/weak-macho-3.5.ll b/test/Bitcode/weak-macho-3.5.ll
new file mode 100644
index 0000000..0c09fe4
--- /dev/null
+++ b/test/Bitcode/weak-macho-3.5.ll
@@ -0,0 +1,11 @@
+; RUN:  llvm-dis < %s.bc| FileCheck %s
+
+; weak-macho-3.5.ll.bc was generated by passing this file to llvm-as-3.5
+; The test checks that LLVM does not place weak GlobalVariables into Comdats for
+; macho object files, they don't support it.
+
+target triple = "x86_64-apple-macosx10.9.0"
+; CHECK: target triple = "x86_64-apple-macosx10.9.0"
+
+@x = weak global i32 0
+; CHECK: @x = weak global i32 0{{$}}
diff --git a/test/Bitcode/weak-macho-3.5.ll.bc b/test/Bitcode/weak-macho-3.5.ll.bc
new file mode 100644
index 0000000..ee66072
--- /dev/null
+++ b/test/Bitcode/weak-macho-3.5.ll.bc
diff --git a/test/BugPoint/metadata.ll b/test/BugPoint/metadata.ll
index 1c27a49..00015d1 100644
--- a/test/BugPoint/metadata.ll
+++ b/test/BugPoint/metadata.ll
@@ -5,11 +5,11 @@
 ; Bugpoint should keep the call's metadata attached to the call.
 
 ; CHECK: call void @foo(), !dbg ![[LOC:[0-9]+]], !attach ![[CALL:[0-9]+]]
-; CHECK: ![[LOC]] = metadata !{i32 104, i32 105, metadata ![[SCOPE:[0-9]+]], metadata ![[SCOPE]]}
-; CHECK: ![[SCOPE]] = metadata !{metadata !"0x11\000\00me\001\00\000\00\000", metadata ![[FILE:[0-9]+]], metadata ![[LIST:[0-9]+]], metadata ![[LIST]], null, null, null}
-; CHECK: ![[FILE]] = metadata !{metadata !"source.c", metadata !"/dir"}
-; CHECK: ![[LIST]] = metadata !{i32 0}
-; CHECK: ![[CALL]] = metadata !{metadata !"the call to foo"}
+; CHECK: ![[LOC]] = !MDLocation(line: 104, column: 105, scope: ![[SCOPE:[0-9]+]], inlinedAt: ![[SCOPE]])
+; CHECK: ![[SCOPE]] = !{!"0x11\000\00me\001\00\000\00\000", ![[FILE:[0-9]+]], ![[LIST:[0-9]+]], ![[LIST]], null, null, null}
+; CHECK: ![[FILE]] = !{!"source.c", !"/dir"}
+; CHECK: ![[LIST]] = !{i32 0}
+; CHECK: ![[CALL]] = !{!"the call to foo"}
 
 %rust_task = type {}
 define void @test(i32* %a, i8* %b) {
@@ -25,18 +25,18 @@ declare void @foo()
 
 !llvm.module.flags = !{!17}
 
-!0 = metadata !{metadata !"boring"}
-!1 = metadata !{metadata !"uninteresting"}
-!2 = metadata !{metadata !"the call to foo"}
-!3 = metadata !{metadata !"noise"}
-!4 = metadata !{metadata !"filler"}
+!0 = !{!"boring"}
+!1 = !{!"uninteresting"}
+!2 = !{!"the call to foo"}
+!3 = !{!"noise"}
+!4 = !{!"filler"}
 
-!9 = metadata !{metadata !"0x11\000\00me\001\00\000\00\000", metadata !15, metadata !16, metadata !16, null, null, null} ; [ DW_TAG_compile_unit ]
-!10 = metadata !{i32 100, i32 101, metadata !9, metadata !9}
-!11 = metadata !{i32 102, i32 103, metadata !9, metadata !9}
-!12 = metadata !{i32 104, i32 105, metadata !9, metadata !9}
-!13 = metadata !{i32 106, i32 107, metadata !9, metadata !9}
-!14 = metadata !{i32 108, i32 109, metadata !9, metadata !9}
-!15 = metadata !{metadata !"source.c", metadata !"/dir"}
-!16 = metadata !{i32 0}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!9 = !{!"0x11\000\00me\001\00\000\00\000", !15, !16, !16, null, null, null} ; [ DW_TAG_compile_unit ]
+!10 = !MDLocation(line: 100, column: 101, scope: !9, inlinedAt: !9)
+!11 = !MDLocation(line: 102, column: 103, scope: !9, inlinedAt: !9)
+!12 = !MDLocation(line: 104, column: 105, scope: !9, inlinedAt: !9)
+!13 = !MDLocation(line: 106, column: 107, scope: !9, inlinedAt: !9)
+!14 = !MDLocation(line: 108, column: 109, scope: !9, inlinedAt: !9)
+!15 = !{!"source.c", !"/dir"}
+!16 = !{i32 0}
+!17 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index bdb5d79..f85a1d9 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -30,6 +30,7 @@ set(LLVM_TEST_DEPENDS
           llvm-cov
           llvm-diff
           llvm-dis
+          llvm-dsymutil
           llvm-dwarfdump
           llvm-extract
           llvm-link
@@ -37,11 +38,12 @@ set(LLVM_TEST_DEPENDS
           llvm-mc
           llvm-mcmarkup
           llvm-nm
-          llvm-size
           llvm-objdump
           llvm-profdata
+          llvm-ranlib
           llvm-readobj
           llvm-rtdyld
+          llvm-size
           llvm-symbolizer
           llvm-tblgen
           llvm-vtabledump
@@ -68,6 +70,25 @@ if(TARGET llvm-go)
   set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS} llvm-go)
 endif()
 
+if(TARGET ocaml_llvm)
+  set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS}
+          ocaml_llvm
+          ocaml_llvm_all_backends
+          ocaml_llvm_analysis
+          ocaml_llvm_bitreader
+          ocaml_llvm_bitwriter
+          ocaml_llvm_executionengine
+          ocaml_llvm_irreader
+          ocaml_llvm_linker
+          ocaml_llvm_target
+          ocaml_llvm_ipo
+          ocaml_llvm_passmgr_builder
+          ocaml_llvm_scalar_opts
+          ocaml_llvm_transform_utils
+          ocaml_llvm_vectorize
+        )
+endif()
+
 add_lit_testsuite(check-llvm "Running the LLVM regression tests"
   ${CMAKE_CURRENT_BINARY_DIR}
   PARAMS llvm_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
diff --git a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
index 4da33a0..73ee522 100644
--- a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
+++ b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
@@ -16,7 +16,7 @@ for.body:                                         ; preds = %for.body, %entry
   %add53 = add nsw i64 %n1, 0, !dbg !52
   %add55 = add nsw i64 %n1, 0, !dbg !53
   %mul63 = mul nsw i64 %add53, -20995, !dbg !54
-  tail call void @llvm.dbg.value(metadata !{i64 %mul63}, i64 0, metadata !30, metadata !{metadata !"0x102"}), !dbg !55
+  tail call void @llvm.dbg.value(metadata i64 %mul63, i64 0, metadata !30, metadata !{!"0x102"}), !dbg !55
   %mul65 = mul nsw i64 %add55, -3196, !dbg !56
   %add67 = add nsw i64 0, %mul65, !dbg !57
   %add80 = add i64 0, 1024, !dbg !58
@@ -44,63 +44,63 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!36, !37}
 !llvm.ident = !{!38}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [] [] []
-!1 = metadata !{metadata !"test.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00\00\00\00140\000\001\000\006\00256\001\00141", metadata !1, metadata !5, metadata !6, null, void ()* @test, null, null, metadata !12} ; [ DW_TAG_subprogram ] [] [] [def] [scope 141] []
-!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ] [] []
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [] [] [from ]
-!7 = metadata !{null, metadata !8}
-!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [] [] []
-!9 = metadata !{metadata !"0x16\00\0030\000\000\000\000", metadata !10, null, metadata !11} ; [ DW_TAG_typedef ] [] [] [] [from int]
-!10 = metadata !{metadata !"", metadata !""}
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [] [int] []
-!12 = metadata !{metadata !13, metadata !14, metadata !18, metadata !19, metadata !20, metadata !21, metadata !22, metadata !23, metadata !24, metadata !25, metadata !26, metadata !27, metadata !28, metadata !29, metadata !30, metadata !31, metadata !32, metadata !33, metadata !34, metadata !35}
-!13 = metadata !{metadata !"0x101\00\0016777356\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [] [data] []
-!14 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
-!15 = metadata !{metadata !"0x16\00\00183\000\000\000\000", metadata !16, null, metadata !17} ; [ DW_TAG_typedef ] [] [INT32] [] [from long int]
-!16 = metadata !{metadata !"", metadata !""}
-!17 = metadata !{metadata !"0x24\00\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [] [long int] []
-!18 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
-!19 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
-!20 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
-!21 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
-!22 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
-!23 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
-!24 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
-!25 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
-!26 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
-!27 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
-!28 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
-!29 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
-!30 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
-!31 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
-!32 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
-!33 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
-!34 = metadata !{metadata !"0x100\00\00145\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [  ] [] []
-!35 = metadata !{metadata !"0x100\00\00146\000", metadata !4, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [  ] [] []
-!36 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!37 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!38 = metadata !{metadata !"clang version 3.6.0 "}
-!39 = metadata !{i32 154, i32 8, metadata !40, null}
-!40 = metadata !{metadata !"0xb\00154\008\002", metadata !1, metadata !41} ; [ DW_TAG_lexical_block ] [  ] []
-!41 = metadata !{metadata !"0xb\00154\008\001", metadata !1, metadata !42} ; [ DW_TAG_lexical_block ] [  ] []
-!42 = metadata !{metadata !"0xb\00154\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [  ] []
-!43 = metadata !{i32 157, i32 5, metadata !44, null}
-!44 = metadata !{metadata !"0xb\00154\0042\000", metadata !1, metadata !42} ; [ DW_TAG_lexical_block ] [  ] []
-!45 = metadata !{i32 159, i32 5, metadata !44, null}
-!46 = metadata !{metadata !47, metadata !47, i64 0}
-!47 = metadata !{metadata !"int", metadata !48, i64 0}
-!48 = metadata !{metadata !"omnipotent char", metadata !49, i64 0}
-!49 = metadata !{metadata !"Simple C/C++ TBAA"}
-!50 = metadata !{i32 160, i32 5, metadata !44, null}
-!51 = metadata !{i32 161, i32 5, metadata !44, null}
-!52 = metadata !{i32 188, i32 5, metadata !44, null}
-!53 = metadata !{i32 190, i32 5, metadata !44, null}
-!54 = metadata !{i32 198, i32 5, metadata !44, null}
-!55 = metadata !{i32 144, i32 13, metadata !4, null}
-!56 = metadata !{i32 200, i32 5, metadata !44, null}
-!57 = metadata !{i32 203, i32 5, metadata !44, null}
-!58 = metadata !{i32 207, i32 5, metadata !44, null}
-!59 = metadata !{i32 208, i32 5, metadata !44, null}
+!0 = !{!"0x11\0012\00clang version 3.6.0 \001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [] [] []
+!1 = !{!"test.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00\00\00\00140\000\001\000\006\00256\001\00141", !1, !5, !6, null, void ()* @test, null, null, !12} ; [ DW_TAG_subprogram ] [] [] [def] [scope 141] []
+!5 = !{!"0x29", !1} ; [ DW_TAG_file_type ] [] []
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [] [] [from ]
+!7 = !{null, !8}
+!8 = !{!"0xf\00\000\0064\0064\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [] [] []
+!9 = !{!"0x16\00\0030\000\000\000\000", !10, null, !11} ; [ DW_TAG_typedef ] [] [] [] [from int]
+!10 = !{!"", !""}
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [] [int] []
+!12 = !{!13, !14, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35}
+!13 = !{!"0x101\00\0016777356\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [] [data] []
+!14 = !{!"0x100\00\00142\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [] [] []
+!15 = !{!"0x16\00\00183\000\000\000\000", !16, null, !17} ; [ DW_TAG_typedef ] [] [INT32] [] [from long int]
+!16 = !{!"", !""}
+!17 = !{!"0x24\00\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [] [long int] []
+!18 = !{!"0x100\00\00142\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [] [] []
+!19 = !{!"0x100\00\00142\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [] [] []
+!20 = !{!"0x100\00\00142\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [] [] []
+!21 = !{!"0x100\00\00142\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [] [] []
+!22 = !{!"0x100\00\00142\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [] [] []
+!23 = !{!"0x100\00\00142\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [] [] []
+!24 = !{!"0x100\00\00142\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!25 = !{!"0x100\00\00143\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!26 = !{!"0x100\00\00143\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!27 = !{!"0x100\00\00143\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!28 = !{!"0x100\00\00143\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!29 = !{!"0x100\00\00144\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!30 = !{!"0x100\00\00144\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!31 = !{!"0x100\00\00144\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!32 = !{!"0x100\00\00144\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!33 = !{!"0x100\00\00144\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!34 = !{!"0x100\00\00145\000", !4, !5, !8} ; [ DW_TAG_auto_variable ] [  ] [] []
+!35 = !{!"0x100\00\00146\000", !4, !5, !11} ; [ DW_TAG_auto_variable ] [  ] [] []
+!36 = !{i32 2, !"Dwarf Version", i32 4}
+!37 = !{i32 2, !"Debug Info Version", i32 2}
+!38 = !{!"clang version 3.6.0 "}
+!39 = !MDLocation(line: 154, column: 8, scope: !40)
+!40 = !{!"0xb\00154\008\002", !1, !41} ; [ DW_TAG_lexical_block ] [  ] []
+!41 = !{!"0xb\00154\008\001", !1, !42} ; [ DW_TAG_lexical_block ] [  ] []
+!42 = !{!"0xb\00154\003\000", !1, !4} ; [ DW_TAG_lexical_block ] [  ] []
+!43 = !MDLocation(line: 157, column: 5, scope: !44)
+!44 = !{!"0xb\00154\0042\000", !1, !42} ; [ DW_TAG_lexical_block ] [  ] []
+!45 = !MDLocation(line: 159, column: 5, scope: !44)
+!46 = !{!47, !47, i64 0}
+!47 = !{!"int", !48, i64 0}
+!48 = !{!"omnipotent char", !49, i64 0}
+!49 = !{!"Simple C/C++ TBAA"}
+!50 = !MDLocation(line: 160, column: 5, scope: !44)
+!51 = !MDLocation(line: 161, column: 5, scope: !44)
+!52 = !MDLocation(line: 188, column: 5, scope: !44)
+!53 = !MDLocation(line: 190, column: 5, scope: !44)
+!54 = !MDLocation(line: 198, column: 5, scope: !44)
+!55 = !MDLocation(line: 144, column: 13, scope: !4)
+!56 = !MDLocation(line: 200, column: 5, scope: !44)
+!57 = !MDLocation(line: 203, column: 5, scope: !44)
+!58 = !MDLocation(line: 207, column: 5, scope: !44)
+!59 = !MDLocation(line: 208, column: 5, scope: !44)
diff --git a/test/CodeGen/AArch64/aarch64-2014-12-02-combine-soften.ll b/test/CodeGen/AArch64/aarch64-2014-12-02-combine-soften.ll
new file mode 100644
index 0000000..4553251
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-2014-12-02-combine-soften.ll
@@ -0,0 +1,16 @@
+;RUN: llc <%s -mattr=-neon  -mattr=-fp-armv8  | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+@t = common global i32 0, align 4
+@x = common global i32 0, align 4
+
+define void @foo() {
+entry:
+;CHECK-LABEL: foo:
+;CHECK: __floatsisf
+  %0 = load i32* @x, align 4
+  %conv = sitofp i32 %0 to float
+  store float %conv, float* bitcast (i32* @t to float*), align 4
+  ret void
+}
diff --git a/test/CodeGen/AArch64/addsub-shifted.ll b/test/CodeGen/AArch64/addsub-shifted.ll
index 0a93edd..1d963f4 100644
--- a/test/CodeGen/AArch64/addsub-shifted.ll
+++ b/test/CodeGen/AArch64/addsub-shifted.ll
@@ -190,7 +190,7 @@ define void @test_asr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
 ; CHECK: ret
 }
 
-define i32 @test_cmp(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
+define void @test_cmp(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64, i32 %v) {
 ; CHECK-LABEL: test_cmp:
 
   %shift1 = shl i32 %rhs32, 13
@@ -199,40 +199,46 @@ define i32 @test_cmp(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
 ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}, lsl #13
 
 t2:
+  store volatile i32 %v, i32* @var32
   %shift2 = lshr i32 %rhs32, 20
   %tst2 = icmp ne i32 %lhs32, %shift2
   br i1 %tst2, label %t3, label %end
 ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}, lsr #20
 
 t3:
+  store volatile i32 %v, i32* @var32
   %shift3 = ashr i32 %rhs32, 9
   %tst3 = icmp ne i32 %lhs32, %shift3
   br i1 %tst3, label %t4, label %end
 ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}, asr #9
 
 t4:
+  store volatile i32 %v, i32* @var32
   %shift4 = shl i64 %rhs64, 43
   %tst4 = icmp uge i64 %lhs64, %shift4
   br i1 %tst4, label %t5, label %end
 ; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}, lsl #43
 
 t5:
+  store volatile i32 %v, i32* @var32
   %shift5 = lshr i64 %rhs64, 20
   %tst5 = icmp ne i64 %lhs64, %shift5
   br i1 %tst5, label %t6, label %end
 ; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}, lsr #20
 
 t6:
+  store volatile i32 %v, i32* @var32
   %shift6 = ashr i64 %rhs64, 59
   %tst6 = icmp ne i64 %lhs64, %shift6
   br i1 %tst6, label %t7, label %end
 ; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}, asr #59
 
 t7:
-  ret i32 1
-end:
+  store volatile i32 %v, i32* @var32
+  br label %end
 
-  ret i32 0
+end:
+  ret void
 ; CHECK: ret
 }
 
diff --git a/test/CodeGen/AArch64/analyze-branch.ll b/test/CodeGen/AArch64/analyze-branch.ll
index 6616b27..932cd75 100644
--- a/test/CodeGen/AArch64/analyze-branch.ll
+++ b/test/CodeGen/AArch64/analyze-branch.ll
@@ -7,8 +7,8 @@ declare void @test_true()
 declare void @test_false()
 
 ; !0 corresponds to a branch being taken, !1 to not being takne.
-!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
-!1 = metadata !{metadata !"branch_weights", i32 4, i32 64}
+!0 = !{!"branch_weights", i32 64, i32 4}
+!1 = !{!"branch_weights", i32 4, i32 64}
 
 define void @test_Bcc_fallthrough_taken(i32 %in) nounwind {
 ; CHECK-LABEL: test_Bcc_fallthrough_taken:
diff --git a/test/CodeGen/AArch64/analyzecmp.ll b/test/CodeGen/AArch64/analyzecmp.ll
index 8962505..0b3bcd8 100644
--- a/test/CodeGen/AArch64/analyzecmp.ll
+++ b/test/CodeGen/AArch64/analyzecmp.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -O3 -mcpu=cortex-a57 < %s | FileCheck %s 
 
-; CHECK-LABLE: @test
-; CHECK: tst [[CMP:x[0-9]+]], #0x8000000000000000
-; CHECK: csel [[R0:x[0-9]+]], [[S0:x[0-9]+]], [[S1:x[0-9]+]], eq
-; CHECK: csel [[R1:x[0-9]+]], [[S2:x[0-9]+]], [[S3:x[0-9]+]], eq
+; CHECK-LABEL: @test
+; CHECK: and 
+; CHECK: csel
+; CHECK: csel
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "arm64--linux-gnueabi"
 
diff --git a/test/CodeGen/AArch64/argument-blocks.ll b/test/CodeGen/AArch64/argument-blocks.ll
new file mode 100644
index 0000000..f1dcfa6
--- /dev/null
+++ b/test/CodeGen/AArch64/argument-blocks.ll
@@ -0,0 +1,197 @@
+; RUN: llc -mtriple=aarch64-apple-ios7.0 -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DARWINPCS
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AAPCS
+
+declare void @callee(...)
+
+define float @test_hfa_regs(float, [2 x float] %in) {
+; CHECK-LABEL: test_hfa_regs:
+; CHECK: fadd s0, s1, s2
+
+  %lhs = extractvalue [2 x float] %in, 0
+  %rhs = extractvalue [2 x float] %in, 1
+  %sum = fadd float %lhs, %rhs
+  ret float %sum
+}
+
+; Check that the array gets allocated to a contiguous block on the stack (rather
+; than the default of 2 8-byte slots).
+define float @test_hfa_block([7 x float], [2 x float] %in) {
+; CHECK-LABEL: test_hfa_block:
+; CHECK: ldp [[LHS:s[0-9]+]], [[RHS:s[0-9]+]], [sp]
+; CHECK: fadd s0, [[LHS]], [[RHS]]
+
+  %lhs = extractvalue [2 x float] %in, 0
+  %rhs = extractvalue [2 x float] %in, 1
+  %sum = fadd float %lhs, %rhs
+  ret float %sum
+}
+
+; Check that an HFA prevents backfilling of VFP registers (i.e. %rhs must go on
+; the stack rather than in s7).
+define float @test_hfa_block_consume([7 x float], [2 x float] %in, float %rhs) {
+; CHECK-LABEL: test_hfa_block_consume:
+; CHECK-DAG: ldr [[LHS:s[0-9]+]], [sp]
+; CHECK-DAG: ldr [[RHS:s[0-9]+]], [sp, #8]
+; CHECK: fadd s0, [[LHS]], [[RHS]]
+
+  %lhs = extractvalue [2 x float] %in, 0
+  %sum = fadd float %lhs, %rhs
+  ret float %sum
+}
+
+define float @test_hfa_stackalign([8 x float], [1 x float], [2 x float] %in) {
+; CHECK-LABEL: test_hfa_stackalign:
+; CHECK-AAPCS: ldp [[LHS:s[0-9]+]], [[RHS:s[0-9]+]], [sp, #8]
+; CHECK-DARWINPCS: ldp [[LHS:s[0-9]+]], [[RHS:s[0-9]+]], [sp, #4]
+; CHECK: fadd s0, [[LHS]], [[RHS]]
+  %lhs = extractvalue [2 x float] %in, 0
+  %rhs = extractvalue [2 x float] %in, 1
+  %sum = fadd float %lhs, %rhs
+  ret float %sum
+}
+
+; An HFA that ends up on the stack should not have any effect on where
+; integer-based arguments go.
+define i64 @test_hfa_ignores_gprs([7 x float], [2 x float] %in, i64, i64 %res) {
+; CHECK-LABEL: test_hfa_ignores_gprs:
+; CHECK: mov x0, x1
+  ret i64 %res
+}
+
+; [2 x float] should not be promoted to double by the Darwin varargs handling,
+; but should go in an 8-byte aligned slot.
+define void @test_varargs_stackalign() {
+; CHECK-LABEL: test_varargs_stackalign:
+; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16]
+
+  call void(...)* @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
+  ret void
+}
+
+define i64 @test_smallstruct_block([7 x i64], [2 x i64] %in) {
+; CHECK-LABEL: test_smallstruct_block:
+; CHECK: ldp [[LHS:x[0-9]+]], [[RHS:x[0-9]+]], [sp]
+; CHECK: add x0, [[LHS]], [[RHS]]
+  %lhs = extractvalue [2 x i64] %in, 0
+  %rhs = extractvalue [2 x i64] %in, 1
+  %sum = add i64 %lhs, %rhs
+  ret i64 %sum
+}
+
+; Check that a small struct prevents backfilling of registers (i.e. %rhs
+; must go on the stack rather than in x7).
+define i64 @test_smallstruct_block_consume([7 x i64], [2 x i64] %in, i64 %rhs) {
+; CHECK-LABEL: test_smallstruct_block_consume:
+; CHECK-DAG: ldr [[LHS:x[0-9]+]], [sp]
+; CHECK-DAG: ldr [[RHS:x[0-9]+]], [sp, #16]
+; CHECK: add x0, [[LHS]], [[RHS]]
+
+  %lhs = extractvalue [2 x i64] %in, 0
+  %sum = add i64 %lhs, %rhs
+  ret i64 %sum
+}
+
+define <1 x i64> @test_v1i64_blocked([7 x double], [2 x <1 x i64>] %in) {
+; CHECK-LABEL: test_v1i64_blocked:
+; CHECK: ldr d0, [sp]
+  %val = extractvalue [2 x <1 x i64>] %in, 0
+  ret <1 x i64> %val
+}
+
+define <1 x double> @test_v1f64_blocked([7 x double], [2 x <1 x double>] %in) {
+; CHECK-LABEL: test_v1f64_blocked:
+; CHECK: ldr d0, [sp]
+  %val = extractvalue [2 x <1 x double>] %in, 0
+  ret <1 x double> %val
+}
+
+define <2 x i32> @test_v2i32_blocked([7 x double], [2 x <2 x i32>] %in) {
+; CHECK-LABEL: test_v2i32_blocked:
+; CHECK: ldr d0, [sp]
+  %val = extractvalue [2 x <2 x i32>] %in, 0
+  ret <2 x i32> %val
+}
+
+define <2 x float> @test_v2f32_blocked([7 x double], [2 x <2 x float>] %in) {
+; CHECK-LABEL: test_v2f32_blocked:
+; CHECK: ldr d0, [sp]
+  %val = extractvalue [2 x <2 x float>] %in, 0
+  ret <2 x float> %val
+}
+
+define <4 x i16> @test_v4i16_blocked([7 x double], [2 x <4 x i16>] %in) {
+; CHECK-LABEL: test_v4i16_blocked:
+; CHECK: ldr d0, [sp]
+  %val = extractvalue [2 x <4 x i16>] %in, 0
+  ret <4 x i16> %val
+}
+
+define <4 x half> @test_v4f16_blocked([7 x double], [2 x <4 x half>] %in) {
+; CHECK-LABEL: test_v4f16_blocked:
+; CHECK: ldr d0, [sp]
+  %val = extractvalue [2 x <4 x half>] %in, 0
+  ret <4 x half> %val
+}
+
+define <8 x i8> @test_v8i8_blocked([7 x double], [2 x <8 x i8>] %in) {
+; CHECK-LABEL: test_v8i8_blocked:
+; CHECK: ldr d0, [sp]
+  %val = extractvalue [2 x <8 x i8>] %in, 0
+  ret <8 x i8> %val
+}
+
+define <2 x i64> @test_v2i64_blocked([7 x double], [2 x <2 x i64>] %in) {
+; CHECK-LABEL: test_v2i64_blocked:
+; CHECK: ldr q0, [sp]
+  %val = extractvalue [2 x <2 x i64>] %in, 0
+  ret <2 x i64> %val
+}
+
+define <2 x double> @test_v2f64_blocked([7 x double], [2 x <2 x double>] %in) {
+; CHECK-LABEL: test_v2f64_blocked:
+; CHECK: ldr q0, [sp]
+  %val = extractvalue [2 x <2 x double>] %in, 0
+  ret <2 x double> %val
+}
+
+define <4 x i32> @test_v4i32_blocked([7 x double], [2 x <4 x i32>] %in) {
+; CHECK-LABEL: test_v4i32_blocked:
+; CHECK: ldr q0, [sp]
+  %val = extractvalue [2 x <4 x i32>] %in, 0
+  ret <4 x i32> %val
+}
+
+define <4 x float> @test_v4f32_blocked([7 x double], [2 x <4 x float>] %in) {
+; CHECK-LABEL: test_v4f32_blocked:
+; CHECK: ldr q0, [sp]
+  %val = extractvalue [2 x <4 x float>] %in, 0
+  ret <4 x float> %val
+}
+
+define <8 x i16> @test_v8i16_blocked([7 x double], [2 x <8 x i16>] %in) {
+; CHECK-LABEL: test_v8i16_blocked:
+; CHECK: ldr q0, [sp]
+  %val = extractvalue [2 x <8 x i16>] %in, 0
+  ret <8 x i16> %val
+}
+
+define <8 x half> @test_v8f16_blocked([7 x double], [2 x <8 x half>] %in) {
+; CHECK-LABEL: test_v8f16_blocked:
+; CHECK: ldr q0, [sp]
+  %val = extractvalue [2 x <8 x half>] %in, 0
+  ret <8 x half> %val
+}
+
+define <16 x i8> @test_v16i8_blocked([7 x double], [2 x <16 x i8>] %in) {
+; CHECK-LABEL: test_v16i8_blocked:
+; CHECK: ldr q0, [sp]
+  %val = extractvalue [2 x <16 x i8>] %in, 0
+  ret <16 x i8> %val
+}
+
+define half @test_f16_blocked([7 x double], [2 x half] %in) {
+; CHECK-LABEL: test_f16_blocked:
+; CHECK: ldr h0, [sp]
+  %val = extractvalue [2 x half] %in, 0
+  ret half %val
+}
diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
index e57a8c9..8b88c0b 100644
--- a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
+++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
@@ -11,7 +11,7 @@ if.then24:                                        ; preds = %entry
   unreachable
 
 if.else295:                                       ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
+  call void @llvm.dbg.declare(metadata i32* %do_tab_convert, metadata !16, metadata !{!"0x102"}), !dbg !18
   store i32 0, i32* %do_tab_convert, align 4, !dbg !19
   unreachable
 }
@@ -21,25 +21,25 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.gv = !{!0}
 !llvm.dbg.sp = !{!1, !7, !10, !11, !12}
 
-!0 = metadata !{metadata !"0x34\00vsplive\00vsplive\00\00617\001\001", metadata !1, metadata !2, metadata !6, null, null} ; [ DW_TAG_variable ]
-!1 = metadata !{metadata !"0x2e\00drt_vsprintf\00drt_vsprintf\00\00616\000\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\0012\00clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)\001\00\000\00\000", metadata !20, metadata !21, metadata !21, null, null, null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !5, i32 0} ; [ DW_TAG_subroutine_type ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3} ; [ DW_TAG_base_type ]
-!7 = metadata !{metadata !"0x2e\00putc_mem\00putc_mem\00\0030\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !8, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !9, i32 0} ; [ DW_TAG_subroutine_type ]
-!9 = metadata !{null}
-!10 = metadata !{metadata !"0x2e\00print_double\00print_double\00\00203\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!11 = metadata !{metadata !"0x2e\00print_number\00print_number\00\0075\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
-!12 = metadata !{metadata !"0x2e\00get_flags\00get_flags\00\00508\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !8, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!13 = metadata !{i32 653, i32 5, metadata !14, null}
-!14 = metadata !{metadata !"0xb\00652\0035\002", metadata !20, metadata !15} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{metadata !"0xb\00616\001\000", metadata !20, metadata !1} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{metadata !"0x100\00do_tab_convert\00853\000", metadata !17, metadata !2, metadata !6} ; [ DW_TAG_auto_variable ]
-!17 = metadata !{metadata !"0xb\00850\0012\0033", metadata !20, metadata !14} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{i32 853, i32 11, metadata !17, null}
-!19 = metadata !{i32 853, i32 29, metadata !17, null}
-!20 = metadata !{metadata !"print.i", metadata !"/Volumes/Ebi/echeng/radars/r9146594"}
-!21 = metadata !{i32 0}
+!0 = !{!"0x34\00vsplive\00vsplive\00\00617\001\001", !1, !2, !6, null, null} ; [ DW_TAG_variable ]
+!1 = !{!"0x2e\00drt_vsprintf\00drt_vsprintf\00\00616\000\001\000\006\00256\000\000", !20, !2, !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\0012\00clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)\001\00\000\00\000", !20, !21, !21, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !20, !2, null, !5, i32 0} ; [ DW_TAG_subroutine_type ]
+!5 = !{!6}
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !3} ; [ DW_TAG_base_type ]
+!7 = !{!"0x2e\00putc_mem\00putc_mem\00\0030\001\001\000\006\00256\000\000", !20, !2, !8, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!8 = !{!"0x15\00\000\000\000\000\000\000", !20, !2, null, !9, i32 0} ; [ DW_TAG_subroutine_type ]
+!9 = !{null}
+!10 = !{!"0x2e\00print_double\00print_double\00\00203\001\001\000\006\00256\000\000", !20, !2, !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = !{!"0x2e\00print_number\00print_number\00\0075\001\001\000\006\00256\000\000", !20, !2, !4, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
+!12 = !{!"0x2e\00get_flags\00get_flags\00\00508\001\001\000\006\00256\000\000", !20, !2, !8, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!13 = !MDLocation(line: 653, column: 5, scope: !14)
+!14 = !{!"0xb\00652\0035\002", !20, !15} ; [ DW_TAG_lexical_block ]
+!15 = !{!"0xb\00616\001\000", !20, !1} ; [ DW_TAG_lexical_block ]
+!16 = !{!"0x100\00do_tab_convert\00853\000", !17, !2, !6} ; [ DW_TAG_auto_variable ]
+!17 = !{!"0xb\00850\0012\0033", !20, !14} ; [ DW_TAG_lexical_block ]
+!18 = !MDLocation(line: 853, column: 11, scope: !17)
+!19 = !MDLocation(line: 853, column: 29, scope: !17)
+!20 = !{!"print.i", !"/Volumes/Ebi/echeng/radars/r9146594"}
+!21 = !{i32 0}
diff --git a/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll
index 4b037db..b5b1b70 100644
--- a/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll
+++ b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll
@@ -43,8 +43,8 @@ entry:
 
 !llvm.module.flags = !{!0, !1, !2, !3}
 
-!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
-!4 = metadata !{}
+!0 = !{i32 1, !"Objective-C Version", i32 2}
+!1 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
+!4 = !{}
diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
index 7d880f3..4db1f59 100644
--- a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
+++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
@@ -61,7 +61,7 @@ entry:
 
 !llvm.module.flags = !{!0, !1, !2, !3}
 
-!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
+!0 = !{i32 1, !"Objective-C Version", i32 2}
+!1 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
diff --git a/test/CodeGen/AArch64/arm64-aapcs-be.ll b/test/CodeGen/AArch64/arm64-aapcs-be.ll
index 77e2b0f..f27570a 100644
--- a/test/CodeGen/AArch64/arm64-aapcs-be.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs-be.ll
@@ -21,4 +21,20 @@ entry:
 ; CHECK-DAG: strh w{{[0-9]}}, [sp, #14]
 ; CHECK-DAG: strb w{{[0-9]}}, [sp, #7]
   ret i32 %call
-}
-\ No newline at end of file
+}
+
+define float @test_block_addr([8 x float], [1 x float] %in) {
+; CHECK-LABEL: test_block_addr:
+; CHECK: ldr s0, [sp]
+  %val = extractvalue [1 x float] %in, 0
+  ret float %val
+}
+
+define void @test_block_addr_callee() {
+; CHECK-LABEL: test_block_addr_callee:
+; CHECK: str {{[a-z0-9]+}}, [sp]
+; CHECK: bl test_block_addr
+  %val = insertvalue [1 x float] undef, float 0.0, 0
+  call float @test_block_addr([8 x float] undef, [1 x float] %val)
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll
index deb740e..e03d7fa 100644
--- a/test/CodeGen/AArch64/arm64-abi_align.ll
+++ b/test/CodeGen/AArch64/arm64-abi_align.ll
@@ -527,8 +527,8 @@ attributes #3 = { nounwind "fp-contract-model"="standard" "relocation-model"="pi
 attributes #4 = { nounwind }
 attributes #5 = { nobuiltin }
 
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"short", metadata !1}
-!4 = metadata !{i64 0, i64 4, metadata !0, i64 4, i64 2, metadata !3, i64 8, i64 4, metadata !0, i64 12, i64 2, metadata !3, i64 16, i64 4, metadata !0, i64 20, i64 2, metadata !3}
+!0 = !{!"int", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"short", !1}
+!4 = !{i64 0, i64 4, !0, i64 4, i64 2, !3, i64 8, i64 4, !0, i64 12, i64 2, !3, i64 16, i64 4, !0, i64 20, i64 2, !3}
diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll
index 3377849..642d72a 100644
--- a/test/CodeGen/AArch64/arm64-atomic-128.ll
+++ b/test/CodeGen/AArch64/arm64-atomic-128.ll
@@ -29,8 +29,7 @@ define void @fetch_and_nand(i128* %p, i128 %bits) {
 ; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
 ; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
 
-; CHECK-DAG: str    [[DEST_REGHI]]
-; CHECK-DAG: str    [[DEST_REGLO]]
+; CHECK-DAG: stp    [[DEST_REGLO]], [[DEST_REGHI]]
   %val = atomicrmw nand i128* %p, i128 %bits release
   store i128 %val, i128* @var, align 16
   ret void
@@ -45,8 +44,7 @@ define void @fetch_and_or(i128* %p, i128 %bits) {
 ; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
 ; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
 
-; CHECK-DAG: str    [[DEST_REGHI]]
-; CHECK-DAG: str    [[DEST_REGLO]]
+; CHECK-DAG: stp    [[DEST_REGLO]], [[DEST_REGHI]]
   %val = atomicrmw or i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -61,8 +59,7 @@ define void @fetch_and_add(i128* %p, i128 %bits) {
 ; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
 ; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
 
-; CHECK-DAG: str    [[DEST_REGHI]]
-; CHECK-DAG: str    [[DEST_REGLO]]
+; CHECK-DAG: stp    [[DEST_REGLO]], [[DEST_REGHI]]
   %val = atomicrmw add i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -77,8 +74,7 @@ define void @fetch_and_sub(i128* %p, i128 %bits) {
 ; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
 ; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
 
-; CHECK-DAG: str    [[DEST_REGHI]]
-; CHECK-DAG: str    [[DEST_REGLO]]
+; CHECK-DAG: stp    [[DEST_REGLO]], [[DEST_REGHI]]
   %val = atomicrmw sub i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -99,8 +95,7 @@ define void @fetch_and_min(i128* %p, i128 %bits) {
 ; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
 ; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
 
-; CHECK-DAG: str    [[DEST_REGHI]]
-; CHECK-DAG: str    [[DEST_REGLO]]
+; CHECK-DAG: stp    [[DEST_REGLO]], [[DEST_REGHI]]
   %val = atomicrmw min i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -121,8 +116,7 @@ define void @fetch_and_max(i128* %p, i128 %bits) {
 ; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
 ; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
 
-; CHECK-DAG: str    [[DEST_REGHI]]
-; CHECK-DAG: str    [[DEST_REGLO]]
+; CHECK-DAG: stp    [[DEST_REGLO]], [[DEST_REGHI]]
   %val = atomicrmw max i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -143,8 +137,7 @@ define void @fetch_and_umin(i128* %p, i128 %bits) {
 ; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
 ; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
 
-; CHECK-DAG: str    [[DEST_REGHI]]
-; CHECK-DAG: str    [[DEST_REGLO]]
+; CHECK-DAG: stp    [[DEST_REGLO]], [[DEST_REGHI]]
   %val = atomicrmw umin i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -165,8 +158,7 @@ define void @fetch_and_umax(i128* %p, i128 %bits) {
 ; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
 ; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
 
-; CHECK-DAG: str    [[DEST_REGHI]]
-; CHECK-DAG: str    [[DEST_REGLO]]
+; CHECK-DAG: stp    [[DEST_REGLO]], [[DEST_REGHI]]
   %val = atomicrmw umax i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
diff --git a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
index 664a26c..b032d9c 100644
--- a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
+++ b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
@@ -184,7 +184,7 @@ declare hidden fastcc i32 @Maze1Mech(i64, i64, i64, i64, i64, i32, i32) nounwind
 ; Materializable
 declare hidden fastcc void @CleanNet(i64) nounwind ssp
 
-!0 = metadata !{metadata !"long", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"any pointer", metadata !1}
+!0 = !{!"long", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"any pointer", !1}
diff --git a/test/CodeGen/AArch64/arm64-cse.ll b/test/CodeGen/AArch64/arm64-cse.ll
index b74ece8..508df7c 100644
--- a/test/CodeGen/AArch64/arm64-cse.ll
+++ b/test/CodeGen/AArch64/arm64-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 -aarch64-gep-opt=false | FileCheck %s
+; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 -aarch64-gep-opt=false -verify-machineinstrs | FileCheck %s
 target triple = "arm64-apple-ios"
 
 ; rdar://12462006
diff --git a/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll b/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll
index 8a744c5..a9b8024 100644
--- a/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll
+++ b/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll
@@ -19,6 +19,6 @@ define internal fastcc void @callee(i32* nocapture %p, i32 %a) nounwind optsize
   ret void
 }
 
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!0 = !{!"int", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll b/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
index e51c38b..e41e19e 100644
--- a/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
+++ b/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
@@ -6,7 +6,7 @@
 ; rdar://11855286
 define double @foo0(<2 x i64> %a) nounwind {
 ; CHECK:  scvtf.2d  [[REG:v[0-9]+]], v0, #9
-; CHECK-NEXT:  ins.d v0[0], [[REG]][1]
+; CHECK-NEXT:  mov  d0, [[REG]][1]
   %vecext = extractelement <2 x i64> %a, i32 1
   %fcvt_n = tail call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %vecext, i32 9)
   ret double %fcvt_n
diff --git a/test/CodeGen/AArch64/arm64-fold-address.ll b/test/CodeGen/AArch64/arm64-fold-address.ll
index 96cc3e9..1f0b918 100644
--- a/test/CodeGen/AArch64/arm64-fold-address.ll
+++ b/test/CodeGen/AArch64/arm64-fold-address.ll
@@ -72,8 +72,8 @@ entry:
 
 !llvm.module.flags = !{!0, !1, !2, !3}
 
-!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
-!4 = metadata !{}
+!0 = !{i32 1, !"Objective-C Version", i32 2}
+!1 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
+!4 = !{}
diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
index c118f10..917911a 100644
--- a/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
+++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
@@ -34,7 +34,7 @@ declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) #1
 attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"double", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"double", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/AArch64/arm64-ldp.ll b/test/CodeGen/AArch64/arm64-ldp.ll
index 5a98626..a9fa4ca 100644
--- a/test/CodeGen/AArch64/arm64-ldp.ll
+++ b/test/CodeGen/AArch64/arm64-ldp.ll
@@ -12,6 +12,18 @@ define i32 @ldp_int(i32* %p) nounwind {
   ret i32 %add
 }
 
+; CHECK: ldp_sext_int
+; CHECK: ldpsw
+define i64 @ldp_sext_int(i32* %p) nounwind {
+  %tmp = load i32* %p, align 4
+  %add.ptr = getelementptr inbounds i32* %p, i64 1
+  %tmp1 = load i32* %add.ptr, align 4
+  %sexttmp = sext i32 %tmp to i64
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %add = add nsw i64 %sexttmp1, %sexttmp
+  ret i64 %add
+}
+
 ; CHECK: ldp_long
 ; CHECK: ldp
 define i64 @ldp_long(i64* %p) nounwind {
@@ -56,6 +68,21 @@ define i32 @ldur_int(i32* %a) nounwind {
   ret i32 %tmp3
 }
 
+define i64 @ldur_sext_int(i32* %a) nounwind {
+; LDUR_CHK: ldur_sext_int
+; LDUR_CHK: ldpsw     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-8]
+; LDUR_CHK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i32* %a, i32 -1
+  %tmp1 = load i32* %p1, align 2
+  %p2 = getelementptr inbounds i32* %a, i32 -2
+  %tmp2 = load i32* %p2, align 2
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %sexttmp2 = sext i32 %tmp2 to i64
+  %tmp3 = add i64 %sexttmp1, %sexttmp2
+  ret i64 %tmp3
+}
+
 define i64 @ldur_long(i64* %a) nounwind ssp {
 ; LDUR_CHK: ldur_long
 ; LDUR_CHK: ldp     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16]
@@ -110,6 +137,22 @@ define i64 @pairUpBarelyIn(i64* %a) nounwind ssp {
   ret i64 %tmp3
 }
 
+define i64 @pairUpBarelyInSext(i32* %a) nounwind ssp {
+; LDUR_CHK: pairUpBarelyInSext
+; LDUR_CHK-NOT: ldur
+; LDUR_CHK: ldpsw     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256]
+; LDUR_CHK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i32* %a, i64 -63
+  %tmp1 = load i32* %p1, align 2
+  %p2 = getelementptr inbounds i32* %a, i64 -64
+  %tmp2 = load i32* %p2, align 2
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %sexttmp2 = sext i32 %tmp2 to i64
+  %tmp3 = add i64 %sexttmp1, %sexttmp2
+  ret i64 %tmp3
+}
+
 define i64 @pairUpBarelyOut(i64* %a) nounwind ssp {
 ; LDUR_CHK: pairUpBarelyOut
 ; LDUR_CHK-NOT: ldp
@@ -125,6 +168,23 @@ define i64 @pairUpBarelyOut(i64* %a) nounwind ssp {
   ret i64 %tmp3
 }
 
+define i64 @pairUpBarelyOutSext(i32* %a) nounwind ssp {
+; LDUR_CHK: pairUpBarelyOutSext
+; LDUR_CHK-NOT: ldp
+; Don't be fragile about which loads or manipulations of the base register
+; are used---just check that there isn't an ldp before the add
+; LDUR_CHK: add
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i32* %a, i64 -64
+  %tmp1 = load i32* %p1, align 2
+  %p2 = getelementptr inbounds i32* %a, i64 -65
+  %tmp2 = load i32* %p2, align 2
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %sexttmp2 = sext i32 %tmp2 to i64
+  %tmp3 = add i64 %sexttmp1, %sexttmp2
+  ret i64 %tmp3
+}
+
 define i64 @pairUpNotAligned(i64* %a) nounwind ssp {
 ; LDUR_CHK: pairUpNotAligned
 ; LDUR_CHK-NOT: ldp
@@ -147,3 +207,28 @@ define i64 @pairUpNotAligned(i64* %a) nounwind ssp {
   %tmp3 = add i64 %tmp1, %tmp2
   ret i64 %tmp3
 }
+
+define i64 @pairUpNotAlignedSext(i32* %a) nounwind ssp {
+; LDUR_CHK: pairUpNotAlignedSext
+; LDUR_CHK-NOT: ldp
+; LDUR_CHK: ldursw
+; LDUR_CHK-NEXT: ldursw
+; LDUR_CHK-NEXT: add
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i32* %a, i64 -18
+  %bp1 = bitcast i32* %p1 to i8*
+  %bp1p1 = getelementptr inbounds i8* %bp1, i64 1
+  %dp1 = bitcast i8* %bp1p1 to i32*
+  %tmp1 = load i32* %dp1, align 1
+
+  %p2 = getelementptr inbounds i32* %a, i64 -17
+  %bp2 = bitcast i32* %p2 to i8*
+  %bp2p1 = getelementptr inbounds i8* %bp2, i64 1
+  %dp2 = bitcast i8* %bp2p1 to i32*
+  %tmp2 = load i32* %dp2, align 1
+
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %sexttmp2 = sext i32 %tmp2 to i64
+  %tmp3 = add i64 %sexttmp1, %sexttmp2
+ ret i64 %tmp3
+}
diff --git a/test/CodeGen/AArch64/arm64-named-reg-alloc.ll b/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
index d86d2e6..0c56454 100644
--- a/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
+++ b/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
@@ -11,4 +11,4 @@ entry:
 
 declare i32 @llvm.read_register.i32(metadata) nounwind
 
-!0 = metadata !{metadata !"x5\00"}
+!0 = !{!"x5\00"}
diff --git a/test/CodeGen/AArch64/arm64-named-reg-notareg.ll b/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
index 3ca14c4..759bc15 100644
--- a/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
+++ b/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
@@ -10,4 +10,4 @@ entry:
 
 declare i32 @llvm.read_register.i32(metadata) nounwind
 
-!0 = metadata !{metadata !"notareg\00"}
+!0 = !{!"notareg\00"}
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index 1cfba82..4a92c3d 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -188,7 +188,7 @@ define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
 
 define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
 ; CHECK-LABEL: ins2f1:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK: mov {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   %tmp3 = extractelement <2 x double> %tmp1, i32 1
   %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
   ret <1 x double> %tmp4
diff --git a/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/test/CodeGen/AArch64/arm64-neon-select_cc.ll
index 95c582a..d334c08 100644
--- a/test/CodeGen/AArch64/arm64-neon-select_cc.ll
+++ b/test/CodeGen/AArch64/arm64-neon-select_cc.ll
@@ -204,3 +204,18 @@ define <2 x double> @test_select_cc_v2f64(double %a, double %b, <2 x double> %c,
   %e = select i1 %cmp31, <2 x double> %c, <2 x double> %d
   ret <2 x double> %e
 }
+
+; Special case: when the select condition is an icmp with i1 operands, don't
+; do the comparison on vectors.
+; Part of PR21549.
+define <2 x i32> @test_select_cc_v2i32_icmpi1(i1 %cc, <2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: test_select_cc_v2i32_icmpi1:
+; CHECK: tst   w0, #0x1
+; CHECK: csetm [[MASK:w[0-9]+]], ne
+; CHECK: dup   [[DUPMASK:v[0-9]+]].2s, [[MASK]]
+; CHECK: bsl   [[DUPMASK]].8b, v0.8b, v1.8b
+; CHECK: mov   v0.16b, [[DUPMASK]].16b
+  %cmp = icmp ne i1 %cc, 0
+  %e = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %e
+}
diff --git a/test/CodeGen/AArch64/arm64-platform-reg.ll b/test/CodeGen/AArch64/arm64-platform-reg.ll
index 651c793..b0d3ee0 100644
--- a/test/CodeGen/AArch64/arm64-platform-reg.ll
+++ b/test/CodeGen/AArch64/arm64-platform-reg.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18
+; RUN: llc -mtriple=arm64-freebsd-gnu -aarch64-reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18
 ; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
 
 ; x18 is reserved as a platform register on Darwin but not on other
@@ -16,11 +17,11 @@ define void @keep_live() {
 ; CHECK: ldr x18
 ; CHECK: str x18
 
-; CHECK-DARWIN-NOT: ldr fp
-; CHECK-DARWIN-NOT: ldr x18
-; CHECK-DARWIN: Spill
-; CHECK-DARWIN-NOT: ldr fp
-; CHECK-DARWIN-NOT: ldr x18
-; CHECK-DARWIN: ret
+; CHECK-RESERVE-X18-NOT: ldr fp
+; CHECK-RESERVE-X18-NOT: ldr x18
+; CHECK-RESERVE-X18: Spill
+; CHECK-RESERVE-X18-NOT: ldr fp
+; CHECK-RESERVE-X18-NOT: ldr x18
+; CHECK-RESERVE-X18: ret
   ret void
 }
diff --git a/test/CodeGen/AArch64/arm64-popcnt.ll b/test/CodeGen/AArch64/arm64-popcnt.ll
index 117ab3a..b0b529a 100644
--- a/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -4,7 +4,8 @@
 define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
-; CHECK: fmov	s0, w0
+; CHECK: ubfx	x{{[0-9]+}}
+; CHECK: fmov	d0, x{{[0-9]+}}
 ; CHECK: cnt.8b	v0, v0
 ; CHECK: uaddlv.8b	h0, v0
 ; CHECK: fmov w0, s0
@@ -15,7 +16,24 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
 ; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0x33333333
 ; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0xf0f0f0f
 ; CHECK-NONEON: mul
+}
 
+define i32 @cnt32_advsimd_2(<2 x i32> %x) {
+  %1 = extractelement <2 x i32> %x, i64 0
+  %2 = tail call i32 @llvm.ctpop.i32(i32 %1)
+  ret i32 %2
+; CHECK: fmov	w0, s0
+; CHECK: fmov	d0, x0
+; CHECK: cnt.8b	v0, v0
+; CHECK: uaddlv.8b	h0, v0
+; CHECK: fmov w0, s0
+; CHECK: ret
+; CHECK-NONEON-LABEL: cnt32_advsimd_2
+; CHECK-NONEON-NOT: 8b
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0x55555555
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0x33333333
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0xf0f0f0f
+; CHECK-NONEON: mul
 }
 
 define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
diff --git a/test/CodeGen/AArch64/arm64-prefetch.ll b/test/CodeGen/AArch64/arm64-prefetch.ll
index 9dc6301..aac3515 100644
--- a/test/CodeGen/AArch64/arm64-prefetch.ll
+++ b/test/CodeGen/AArch64/arm64-prefetch.ll
@@ -117,7 +117,7 @@ entry:
 
 declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind
 
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"any pointer", metadata !1}
+!0 = !{!"int", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"any pointer", !1}
diff --git a/test/CodeGen/AArch64/arm64-promote-const.ll b/test/CodeGen/AArch64/arm64-promote-const.ll
index 380ff55..5dd92a7 100644
--- a/test/CodeGen/AArch64/arm64-promote-const.ll
+++ b/test/CodeGen/AArch64/arm64-promote-const.ll
@@ -41,8 +41,7 @@ entry:
 ; PROMOTED-LABEL: test2:
 ; In stress mode, constant vector are promoted
 ; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1:__PromotedConst[0-9]+]]@PAGE
-; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
-; PROMOTED: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; PROMOTED: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTV1]]@PAGEOFF]
 ; Destination register is defined by ABI
 ; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
 ; PROMOTED-NEXT: mla.16b v0, v0, v[[REGNUM]]
@@ -64,51 +63,23 @@ entry:
   ret <16 x i8> %add.i9
 }
 
-; Two different uses of the sane constant in two different basic blocks,
+; Two different uses of the same constant in two different basic blocks,
 ; one dominates the other
 define <16 x i8> @test3(<16 x i8> %arg, i32 %path) {
 ; PROMOTED-LABEL: test3:
 ; In stress mode, constant vector are promoted
 ; Since, the constant is the same as the previous function,
 ; the same address must be used
-; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
-; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
-; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
-; Destination register is defined by ABI
-; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
-; PROMOTED-NEXT: cbnz w0, [[LABEL:LBB.*]]
-; Next BB
-; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV2:__PromotedConst[0-9]+]]@PAGE
-; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV2]]@PAGEOFF
-; PROMOTED-NEXT: ldr q[[REGNUM]], {{\[}}[[BASEADDR]]]
-; Next BB
-; PROMOTED-NEXT: [[LABEL]]:
-; PROMOTED-NEXT: mul.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
-; PROMOTED-NEXT: add.16b v0, v0, [[DESTV]]
-; PROMOTED-NEXT: ret
+; PROMOTED: ldr
+; PROMOTED: ldr
+; PROMOTED-NOT: ldr
+; PROMOTED: ret
 
 ; REGULAR-LABEL: test3:
-; Regular mode does not elimitate common sub expression by its own.
-; In other words, the same loads appears several times.
-; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL1:lCP.*]]@PAGE
-; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL1]]@PAGEOFF]
-; Destination register is defined by ABI
-; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]]
-; REGULAR-NEXT: cbz w0, [[LABELelse:LBB.*]]
-; Next BB
-; Redundant load
-; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL1]]@PAGE
-; REGULAR-NEXT: ldr q[[REGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL1]]@PAGEOFF]
-; REGULAR-NEXT: b [[LABELend:LBB.*]]
-; Next BB
-; REGULAR-NEXT: [[LABELelse]]
-; REGULAR-NEXT: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL2:lCP.*]]@PAGE
-; REGULAR-NEXT: ldr q[[REGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL2]]@PAGEOFF]
-; Next BB
-; REGULAR-NEXT: [[LABELend]]:
-; REGULAR-NEXT: mul.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
-; REGULAR-NEXT: add.16b v0, v0, [[DESTV]]
-; REGULAR-NEXT: ret
+; REGULAR: ldr
+; REGULAR: ldr
+; REGULAR-NOT: ldr
+; REGULAR: ret
 entry:
   %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
   %tobool = icmp eq i32 %path, 0
@@ -135,34 +106,14 @@ define <16 x i8> @test4(<16 x i8> %arg, i32 %path) {
 ; In stress mode, constant vector are promoted
 ; Since, the constant is the same as the previous function,
 ; the same address must be used
-; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
-; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
-; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
-; Destination register is defined by ABI
-; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
-; PROMOTED-NEXT: cbz w0, [[LABEL:LBB.*]]
-; Next BB
-; PROMOTED: mul.16b v0, v0, v[[REGNUM]]
-; Next BB
-; PROMOTED-NEXT: [[LABEL]]:
-; PROMOTED-NEXT: ret
-
+; PROMOTED: ldr
+; PROMOTED-NOT: ldr
+; PROMOTED: ret
 
 ; REGULAR-LABEL: test4:
-; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL3:lCP.*]]@PAGE
-; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL3]]@PAGEOFF]
-; Destination register is defined by ABI
-; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]]
-; REGULAR-NEXT: cbz w0, [[LABEL:LBB.*]]
-; Next BB
-; Redundant expression
-; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL3]]@PAGE
-; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL3]]@PAGEOFF]
-; Destination register is defined by ABI
-; REGULAR-NEXT: mul.16b v0, v0, v[[REGNUM]]
-; Next BB
-; REGULAR-NEXT: [[LABEL]]:
-; REGULAR-NEXT: ret
+; REGULAR: ldr
+; REGULAR-NOT: ldr
+; REGULAR: ret
 entry:
   %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
   %tobool = icmp eq i32 %path, 0
@@ -184,40 +135,13 @@ define <16 x i8> @test5(<16 x i8> %arg, i32 %path) {
 ; In stress mode, constant vector are promoted
 ; Since, the constant is the same as the previous function,
 ; the same address must be used
-; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
-; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
-; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
-; PROMOTED-NEXT: cbz w0, [[LABEL:LBB.*]]
-; Next BB
-; PROMOTED: add.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
-; PROMOTED-NEXT: mul.16b v[[REGNUM]], [[DESTV]], v[[REGNUM]]
-; Next BB
-; PROMOTED-NEXT: [[LABEL]]:
-; PROMOTED-NEXT: mul.16b [[TMP1:v[0-9]+]], v[[REGNUM]], v[[REGNUM]]
-; PROMOTED-NEXT: mul.16b [[TMP2:v[0-9]+]], [[TMP1]], [[TMP1]]
-; PROMOTED-NEXT: mul.16b [[TMP3:v[0-9]+]], [[TMP2]], [[TMP2]]
-; PROMOTED-NEXT: mul.16b v0, [[TMP3]], [[TMP3]]
-; PROMOTED-NEXT: ret
+; PROMOTED: ldr
+; PROMOTED-NOT: ldr
+; PROMOTED: ret
 
 ; REGULAR-LABEL: test5:
-; REGULAR: cbz w0, [[LABELelse:LBB.*]]
-; Next BB
-; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
-; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
-; REGULAR-NEXT: add.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
-; REGULAR-NEXT: mul.16b v[[DESTREGNUM:[0-9]+]], [[DESTV]], v[[REGNUM]]
-; REGULAR-NEXT: b [[LABELend:LBB.*]]
-; Next BB
-; REGULAR-NEXT: [[LABELelse]]
-; REGULAR-NEXT: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
-; REGULAR-NEXT: ldr q[[DESTREGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
-; Next BB
-; REGULAR-NEXT: [[LABELend]]:
-; REGULAR-NEXT: mul.16b [[TMP1:v[0-9]+]], v[[DESTREGNUM]], v[[DESTREGNUM]]
-; REGULAR-NEXT: mul.16b [[TMP2:v[0-9]+]], [[TMP1]], [[TMP1]]
-; REGULAR-NEXT: mul.16b [[TMP3:v[0-9]+]], [[TMP2]], [[TMP2]]
-; REGULAR-NEXT: mul.16b v0, [[TMP3]], [[TMP3]]
-; REGULAR-NEXT: ret
+; REGULAR: ldr
+; REGULAR: ret
 entry:
   %tobool = icmp eq i32 %path, 0
   br i1 %tobool, label %if.end, label %if.then
diff --git a/test/CodeGen/AArch64/arm64-st1.ll b/test/CodeGen/AArch64/arm64-st1.ll
index 4370484..76d52f4 100644
--- a/test/CodeGen/AArch64/arm64-st1.ll
+++ b/test/CodeGen/AArch64/arm64-st1.ll
@@ -8,6 +8,26 @@ define void @st1lane_16b(<16 x i8> %A, i8* %D) {
   ret void
 }
 
+define void @st1lane_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) {
+; CHECK-LABEL: st1lane_ro_16b
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.b { v0 }[1], [x[[XREG]]]
+  %ptr = getelementptr i8* %D, i64 %offset
+  %tmp = extractelement <16 x i8> %A, i32 1
+  store i8 %tmp, i8* %ptr
+  ret void
+}
+
+define void @st1lane0_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_16b
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.b { v0 }[0], [x[[XREG]]]
+  %ptr = getelementptr i8* %D, i64 %offset
+  %tmp = extractelement <16 x i8> %A, i32 0
+  store i8 %tmp, i8* %ptr
+  ret void
+}
+
 define void @st1lane_8h(<8 x i16> %A, i16* %D) {
 ; CHECK-LABEL: st1lane_8h
 ; CHECK: st1.h
@@ -16,6 +36,25 @@ define void @st1lane_8h(<8 x i16> %A, i16* %D) {
   ret void
 }
 
+define void @st1lane_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) {
+; CHECK-LABEL: st1lane_ro_8h
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.h { v0 }[1], [x[[XREG]]]
+  %ptr = getelementptr i16* %D, i64 %offset
+  %tmp = extractelement <8 x i16> %A, i32 1
+  store i16 %tmp, i16* %ptr
+  ret void
+}
+
+define void @st1lane0_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_8h
+; CHECK: str h0, [x0, x1, lsl #1]
+  %ptr = getelementptr i16* %D, i64 %offset
+  %tmp = extractelement <8 x i16> %A, i32 0
+  store i16 %tmp, i16* %ptr
+  ret void
+}
+
 define void @st1lane_4s(<4 x i32> %A, i32* %D) {
 ; CHECK-LABEL: st1lane_4s
 ; CHECK: st1.s
@@ -24,6 +63,25 @@ define void @st1lane_4s(<4 x i32> %A, i32* %D) {
   ret void
 }
 
+define void @st1lane_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) {
+; CHECK-LABEL: st1lane_ro_4s
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.s { v0 }[1], [x[[XREG]]]
+  %ptr = getelementptr i32* %D, i64 %offset
+  %tmp = extractelement <4 x i32> %A, i32 1
+  store i32 %tmp, i32* %ptr
+  ret void
+}
+
+define void @st1lane0_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_4s
+; CHECK: str s0, [x0, x1, lsl #2]
+  %ptr = getelementptr i32* %D, i64 %offset
+  %tmp = extractelement <4 x i32> %A, i32 0
+  store i32 %tmp, i32* %ptr
+  ret void
+}
+
 define void @st1lane_4s_float(<4 x float> %A, float* %D) {
 ; CHECK-LABEL: st1lane_4s_float
 ; CHECK: st1.s
@@ -32,6 +90,25 @@ define void @st1lane_4s_float(<4 x float> %A, float* %D) {
   ret void
 }
 
+define void @st1lane_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) {
+; CHECK-LABEL: st1lane_ro_4s_float
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.s { v0 }[1], [x[[XREG]]]
+  %ptr = getelementptr float* %D, i64 %offset
+  %tmp = extractelement <4 x float> %A, i32 1
+  store float %tmp, float* %ptr
+  ret void
+}
+
+define void @st1lane0_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_4s_float
+; CHECK: str s0, [x0, x1, lsl #2]
+  %ptr = getelementptr float* %D, i64 %offset
+  %tmp = extractelement <4 x float> %A, i32 0
+  store float %tmp, float* %ptr
+  ret void
+}
+
 define void @st1lane_2d(<2 x i64> %A, i64* %D) {
 ; CHECK-LABEL: st1lane_2d
 ; CHECK: st1.d
@@ -40,6 +117,25 @@ define void @st1lane_2d(<2 x i64> %A, i64* %D) {
   ret void
 }
 
+define void @st1lane_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) {
+; CHECK-LABEL: st1lane_ro_2d
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.d { v0 }[1], [x[[XREG]]]
+  %ptr = getelementptr i64* %D, i64 %offset
+  %tmp = extractelement <2 x i64> %A, i32 1
+  store i64 %tmp, i64* %ptr
+  ret void
+}
+
+define void @st1lane0_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_2d
+; CHECK: str d0, [x0, x1, lsl #3]
+  %ptr = getelementptr i64* %D, i64 %offset
+  %tmp = extractelement <2 x i64> %A, i32 0
+  store i64 %tmp, i64* %ptr
+  ret void
+}
+
 define void @st1lane_2d_double(<2 x double> %A, double* %D) {
 ; CHECK-LABEL: st1lane_2d_double
 ; CHECK: st1.d
@@ -48,6 +144,25 @@ define void @st1lane_2d_double(<2 x double> %A, double* %D) {
   ret void
 }
 
+define void @st1lane_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) {
+; CHECK-LABEL: st1lane_ro_2d_double
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.d { v0 }[1], [x[[XREG]]]
+  %ptr = getelementptr double* %D, i64 %offset
+  %tmp = extractelement <2 x double> %A, i32 1
+  store double %tmp, double* %ptr
+  ret void
+}
+
+define void @st1lane0_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_2d_double
+; CHECK: str d0, [x0, x1, lsl #3]
+  %ptr = getelementptr double* %D, i64 %offset
+  %tmp = extractelement <2 x double> %A, i32 0
+  store double %tmp, double* %ptr
+  ret void
+}
+
 define void @st1lane_8b(<8 x i8> %A, i8* %D) {
 ; CHECK-LABEL: st1lane_8b
 ; CHECK: st1.b
@@ -56,6 +171,26 @@ define void @st1lane_8b(<8 x i8> %A, i8* %D) {
   ret void
 }
 
+define void @st1lane_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) {
+; CHECK-LABEL: st1lane_ro_8b
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.b { v0 }[1], [x[[XREG]]]
+  %ptr = getelementptr i8* %D, i64 %offset
+  %tmp = extractelement <8 x i8> %A, i32 1
+  store i8 %tmp, i8* %ptr
+  ret void
+}
+
+define void @st1lane0_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_8b
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.b { v0 }[0], [x[[XREG]]]
+  %ptr = getelementptr i8* %D, i64 %offset
+  %tmp = extractelement <8 x i8> %A, i32 0
+  store i8 %tmp, i8* %ptr
+  ret void
+}
+
 define void @st1lane_4h(<4 x i16> %A, i16* %D) {
 ; CHECK-LABEL: st1lane_4h
 ; CHECK: st1.h
@@ -64,6 +199,25 @@ define void @st1lane_4h(<4 x i16> %A, i16* %D) {
   ret void
 }
 
+define void @st1lane_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) {
+; CHECK-LABEL: st1lane_ro_4h
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.h { v0 }[1], [x[[XREG]]]
+  %ptr = getelementptr i16* %D, i64 %offset
+  %tmp = extractelement <4 x i16> %A, i32 1
+  store i16 %tmp, i16* %ptr
+  ret void
+}
+
+define void @st1lane0_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_4h
+; CHECK: str h0, [x0, x1, lsl #1]
+  %ptr = getelementptr i16* %D, i64 %offset
+  %tmp = extractelement <4 x i16> %A, i32 0
+  store i16 %tmp, i16* %ptr
+  ret void
+}
+
 define void @st1lane_2s(<2 x i32> %A, i32* %D) {
 ; CHECK-LABEL: st1lane_2s
 ; CHECK: st1.s
@@ -72,6 +226,25 @@ define void @st1lane_2s(<2 x i32> %A, i32* %D) {
   ret void
 }
 
+define void @st1lane_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) {
+; CHECK-LABEL: st1lane_ro_2s
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.s { v0 }[1], [x[[XREG]]]
+  %ptr = getelementptr i32* %D, i64 %offset
+  %tmp = extractelement <2 x i32> %A, i32 1
+  store i32 %tmp, i32* %ptr
+  ret void
+}
+
+define void @st1lane0_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_2s
+; CHECK: str s0, [x0, x1, lsl #2]
+  %ptr = getelementptr i32* %D, i64 %offset
+  %tmp = extractelement <2 x i32> %A, i32 0
+  store i32 %tmp, i32* %ptr
+  ret void
+}
+
 define void @st1lane_2s_float(<2 x float> %A, float* %D) {
 ; CHECK-LABEL: st1lane_2s_float
 ; CHECK: st1.s
@@ -80,6 +253,25 @@ define void @st1lane_2s_float(<2 x float> %A, float* %D) {
   ret void
 }
 
+define void @st1lane_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) {
+; CHECK-LABEL: st1lane_ro_2s_float
+; CHECK: add x[[XREG:[0-9]+]], x0, x1
+; CHECK: st1.s { v0 }[1], [x[[XREG]]]
+  %ptr = getelementptr float* %D, i64 %offset
+  %tmp = extractelement <2 x float> %A, i32 1
+  store float %tmp, float* %ptr
+  ret void
+}
+
+define void @st1lane0_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_2s_float
+; CHECK: str s0, [x0, x1, lsl #2]
+  %ptr = getelementptr float* %D, i64 %offset
+  %tmp = extractelement <2 x float> %A, i32 0
+  store float %tmp, float* %ptr
+  ret void
+}
+
 define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) {
 ; CHECK-LABEL: st2lane_16b
 ; CHECK: st2.b
diff --git a/test/CodeGen/AArch64/arm64-stackmap-nops.ll b/test/CodeGen/AArch64/arm64-stackmap-nops.ll
new file mode 100644
index 0000000..5915b64
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-stackmap-nops.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+define void @test_shadow_optimization() {
+entry:
+; Expect 8 bytes worth of nops here rather than 16: With the shadow optimization
+; in place, 8 bytes will be consumed by the frame teardown and return instr.
+; CHECK-LABEL: test_shadow_optimization:
+; CHECK:      nop
+; CHECK-NEXT: nop
+; CHECK-NOT:  nop
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  0, i32  16)
+  ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
diff --git a/test/CodeGen/AArch64/arm64-stackpointer.ll b/test/CodeGen/AArch64/arm64-stackpointer.ll
index 581faf1..a33de8c 100644
--- a/test/CodeGen/AArch64/arm64-stackpointer.ll
+++ b/test/CodeGen/AArch64/arm64-stackpointer.ll
@@ -21,4 +21,4 @@ declare void @llvm.write_register.i64(metadata, i64) nounwind
 
 ; register unsigned long current_stack_pointer asm("sp");
 ; CHECK-NOT: .asciz  "sp"
-!0 = metadata !{metadata !"sp\00"}
+!0 = !{!"sp\00"}
diff --git a/test/CodeGen/AArch64/arm64-tls-dynamics.ll b/test/CodeGen/AArch64/arm64-tls-dynamics.ll
index e8a83fd..30ea63b 100644
--- a/test/CodeGen/AArch64/arm64-tls-dynamics.ll
+++ b/test/CodeGen/AArch64/arm64-tls-dynamics.ll
@@ -20,7 +20,7 @@ define i32 @test_generaldynamic() {
 ; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
 ; CHECK: ldr w0, [x[[TP]], x0]
 
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
 ; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
@@ -43,7 +43,7 @@ define i32* @test_generaldynamic_addr() {
 ; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
 ; CHECK: add x0, [[TP]], x0
 
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
 ; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
@@ -73,7 +73,7 @@ define i32 @test_localdynamic() {
 
 ; CHECK: ldr w0, [x[[TPIDR]], x[[TPREL]]]
 
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
 ; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
@@ -101,7 +101,7 @@ define i32* @test_localdynamic_addr() {
 
 ; CHECK: add x0, [[TPIDR]], [[TPREL]]
 
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
 ; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
diff --git a/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll b/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
index a7f5215..923742d 100644
--- a/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
@@ -22,10 +22,10 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.6.0 "}
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"any pointer", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
-!5 = metadata !{metadata !6, metadata !6, i64 0}
-!6 = metadata !{metadata !"int", metadata !3, i64 0}
+!0 = !{!"clang version 3.6.0 "}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"any pointer", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"int", !3, i64 0}
diff --git a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
index 36a7bfd..44f2af1 100644
--- a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -12,6 +12,7 @@ define void @test_simple(i32 %n, ...) {
 ; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]]
 
 ; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
 
 ; CHECK: stp x1, x2, [sp, #[[GR_BASE:[0-9]+]]]
 ; ... omit middle ones ...
@@ -21,11 +22,10 @@ define void @test_simple(i32 %n, ...) {
 ; ... omit middle ones ...
 ; CHECK: stp q6, q7, [sp, #
 
-; CHECK: str [[STACK_TOP]], [x[[VA_LIST_HI]], :lo12:var]
+; CHECK: str [[STACK_TOP]], [x[[VA_LIST]]]
 
 ; CHECK: add [[GR_TOPTMP:x[0-9]+]], sp, #[[GR_BASE]]
 ; CHECK: add [[GR_TOP:x[0-9]+]], [[GR_TOPTMP]], #56
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
 ; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
 
 ; CHECK: mov [[VR_TOPTMP:x[0-9]+]], sp
@@ -50,6 +50,7 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
 ; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]]
 
 ; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
 
 ; CHECK: stp x3, x4, [sp, #[[GR_BASE:[0-9]+]]]
 ; ... omit middle ones ...
@@ -59,11 +60,10 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
 ; ... omit middle ones ...
 ; CHECK: str q7, [sp, #
 
-; CHECK: str [[STACK_TOP]], [x[[VA_LIST_HI]], :lo12:var]
+; CHECK: str [[STACK_TOP]], [x[[VA_LIST]]]
 
 ; CHECK: add [[GR_TOPTMP:x[0-9]+]], sp, #[[GR_BASE]]
 ; CHECK: add [[GR_TOP:x[0-9]+]], [[GR_TOPTMP]], #40
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
 ; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
 
 ; CHECK: mov [[VR_TOPTMP:x[0-9]+]], sp
@@ -89,18 +89,20 @@ define void @test_nospare([8 x i64], [8 x float], ...) {
   call void @llvm.va_start(i8* %addr)
 ; CHECK-NOT: sub sp, sp
 ; CHECK: mov [[STACK:x[0-9]+]], sp
-; CHECK: str [[STACK]], [{{x[0-9]+}}, :lo12:var]
+; CHECK: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var
+; CHECK: str [[STACK]], [x[[VAR]]]
 
   ret void
 }
 
 ; If there are non-variadic arguments on the stack (here two i64s) then the
 ; __stack field should point just past them.
-define void @test_offsetstack([10 x i64], [3 x float], ...) {
+define void @test_offsetstack([8 x i64], [2 x i64], [3 x float], ...) {
 ; CHECK-LABEL: test_offsetstack:
 ; CHECK: sub sp, sp, #80
 ; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96
-; CHECK: str [[STACK_TOP]], [{{x[0-9]+}}, :lo12:var]
+; CHECK: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var
+; CHECK: str [[STACK_TOP]], [x[[VAR]]]
 
   %addr = bitcast %va_list* @var to i8*
   call void @llvm.va_start(i8* %addr)
diff --git a/test/CodeGen/AArch64/arm64-vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll
index 62fd961..75e0d80 100644
--- a/test/CodeGen/AArch64/arm64-vshuffle.ll
+++ b/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -29,14 +29,14 @@ entry:
 }
 
 ; CHECK: lCPI1_0:
-; CHECK:          .byte   2                       ; 0x2
+; CHECK:          .byte   0                       ; 0x0
 ; CHECK:          .byte   255                     ; 0xff
-; CHECK:          .byte   6                       ; 0x6
+; CHECK:          .byte   2                       ; 0x2
 ; CHECK:          .byte   255                     ; 0xff
 ; CHECK:          .byte   10                      ; 0xa
 ; CHECK:          .byte   12                      ; 0xc
 ; CHECK:          .byte   14                      ; 0xe
-; CHECK:          .byte   0                       ; 0x0
+; CHECK:          .byte   7                       ; 0x7
 ; CHECK: test2
 ; CHECK: ldr     d[[REG0:[0-9]+]], [{{.*}}, lCPI1_0@PAGEOFF]
 ; CHECK: adrp    x[[REG2:[0-9]+]], lCPI1_1@PAGE
@@ -82,22 +82,22 @@ bb:
   ret <16 x i1> %Shuff
 }
 ; CHECK: lCPI3_1:
-; CHECK:         .byte   2                       ; 0x2
-; CHECK:         .byte   1                       ; 0x1
-; CHECK:         .byte   6                       ; 0x6
-; CHECK:         .byte   18                      ; 0x12
-; CHECK:         .byte   10                      ; 0xa
-; CHECK:         .byte   12                      ; 0xc
-; CHECK:         .byte   14                      ; 0xe
 ; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   1                       ; 0x1
 ; CHECK:         .byte   2                       ; 0x2
-; CHECK:         .byte   31                      ; 0x1f
+; CHECK:         .byte   18                      ; 0x12
+; CHECK:         .byte   4                       ; 0x4
+; CHECK:         .byte   5                       ; 0x5
 ; CHECK:         .byte   6                       ; 0x6
-; CHECK:         .byte   30                      ; 0x1e
+; CHECK:         .byte   7                       ; 0x7
+; CHECK:         .byte   8                       ; 0x8
+; CHECK:         .byte   31                      ; 0x1f
 ; CHECK:         .byte   10                      ; 0xa
+; CHECK:         .byte   30                      ; 0x1e
 ; CHECK:         .byte   12                      ; 0xc
+; CHECK:         .byte   13                      ; 0xd
 ; CHECK:         .byte   14                      ; 0xe
-; CHECK:         .byte   0                       ; 0x0
+; CHECK:         .byte   15                      ; 0xf
 ; CHECK: _test4:
 ; CHECK:         ldr     q[[REG1:[0-9]+]]
 ; CHECK:         movi.2d v[[REG0:[0-9]+]], #0000000000000000
diff --git a/test/CodeGen/AArch64/bitcast-v2i8.ll b/test/CodeGen/AArch64/bitcast-v2i8.ll
new file mode 100644
index 0000000..4bdac64
--- /dev/null
+++ b/test/CodeGen/AArch64/bitcast-v2i8.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=aarch64-apple-ios | FileCheck %s
+
+; Part of PR21549: going through the stack isn't ideal but is correct.
+
+define i16 @test_bitcast_v2i8_to_i16(<2 x i8> %a) {
+; CHECK-LABEL: test_bitcast_v2i8_to_i16
+; CHECK:      mov.s   [[WREG_HI:w[0-9]+]], v0[1]
+; CHECK-NEXT: fmov    [[WREG_LO:w[0-9]+]], s0
+; CHECK-NEXT: strb    [[WREG_HI]], [sp, #15]
+; CHECK-NEXT: strb    [[WREG_LO]], [sp, #14]
+; CHECK-NEXT: ldrh    w0, [sp, #14]
+
+  %aa = bitcast <2 x i8> %a to i16
+  ret i16 %aa
+}
diff --git a/test/CodeGen/AArch64/br-to-eh-lpad.ll b/test/CodeGen/AArch64/br-to-eh-lpad.ll
new file mode 100644
index 0000000..20bffd9
--- /dev/null
+++ b/test/CodeGen/AArch64/br-to-eh-lpad.ll
@@ -0,0 +1,78 @@
+; RUN: llc < %s -mtriple=aarch64-apple-ios -verify-machineinstrs
+
+; This function tests that the machine verifier accepts an unconditional
+; branch from an invoke basic block, to its EH landing pad basic block.
+; The test is brittle and isn't ideally reduced, because in most cases the
+; branch would be removed (for instance, turned into a fallthrough), and in
+; that case, the machine verifier, which relies on analyzing branches for this
+; kind of verification, is unable to check anything, so accepts the CFG.
+
+define void @test_branch_to_landingpad() {
+entry:
+  br i1 undef, label %if.end50.thread, label %if.then6
+
+lpad:
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*)
+          catch %struct._objc_typeinfo.12.129.194.285.350.493.519.532.571.597.623.765* @"OBJC_EHTYPE_$_NSString"
+          catch %struct._objc_typeinfo.12.129.194.285.350.493.519.532.571.597.623.765* @OBJC_EHTYPE_id
+          catch i8* null
+  br i1 undef, label %invoke.cont33, label %catch.fallthrough
+
+catch.fallthrough:
+  %matches31 = icmp eq i32 undef, 0
+  br i1 %matches31, label %invoke.cont41, label %finally.catchall
+
+if.then6:
+  invoke void @objc_exception_throw()
+          to label %invoke.cont7 unwind label %lpad
+
+invoke.cont7:
+  unreachable
+
+if.end50.thread:
+  tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str1, i64 0, i64 0), i32 125)
+  tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str1, i64 0, i64 0), i32 128)
+  unreachable
+
+invoke.cont33:
+  tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str1, i64 0, i64 0), i32 119)
+  unreachable
+
+invoke.cont41:
+  invoke void @objc_exception_rethrow()
+          to label %invoke.cont43 unwind label %lpad40
+
+invoke.cont43:
+  unreachable
+
+lpad40:
+  %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*)
+          catch i8* null
+  br label %finally.catchall
+
+finally.catchall:
+  tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str1, i64 0, i64 0), i32 125)
+  unreachable
+}
+
+%struct._objc_typeinfo.12.129.194.285.350.493.519.532.571.597.623.765 = type { i8**, i8*, %struct._class_t.10.127.192.283.348.491.517.530.569.595.621.764* }
+%struct._class_t.10.127.192.283.348.491.517.530.569.595.621.764 = type { %struct._class_t.10.127.192.283.348.491.517.530.569.595.621.764*, %struct._class_t.10.127.192.283.348.491.517.530.569.595.621.764*, %struct._objc_cache.0.117.182.273.338.481.507.520.559.585.611.754*, i8* (i8*, i8*)**, %struct._class_ro_t.9.126.191.282.347.490.516.529.568.594.620.763* }
+%struct._objc_cache.0.117.182.273.338.481.507.520.559.585.611.754 = type opaque
+%struct._class_ro_t.9.126.191.282.347.490.516.529.568.594.620.763 = type { i32, i32, i32, i8*, i8*, %struct.__method_list_t.2.119.184.275.340.483.509.522.561.587.613.756*, %struct._objc_protocol_list.6.123.188.279.344.487.513.526.565.591.617.760*, %struct._ivar_list_t.8.125.190.281.346.489.515.528.567.593.619.762*, i8*, %struct._prop_list_t.4.121.186.277.342.485.511.524.563.589.615.758* }
+%struct.__method_list_t.2.119.184.275.340.483.509.522.561.587.613.756 = type { i32, i32, [0 x %struct._objc_method.1.118.183.274.339.482.508.521.560.586.612.755] }
+%struct._objc_method.1.118.183.274.339.482.508.521.560.586.612.755 = type { i8*, i8*, i8* }
+%struct._objc_protocol_list.6.123.188.279.344.487.513.526.565.591.617.760 = type { i64, [0 x %struct._protocol_t.5.122.187.278.343.486.512.525.564.590.616.759*] }
+%struct._protocol_t.5.122.187.278.343.486.512.525.564.590.616.759 = type { i8*, i8*, %struct._objc_protocol_list.6.123.188.279.344.487.513.526.565.591.617.760*, %struct.__method_list_t.2.119.184.275.340.483.509.522.561.587.613.756*, %struct.__method_list_t.2.119.184.275.340.483.509.522.561.587.613.756*, %struct.__method_list_t.2.119.184.275.340.483.509.522.561.587.613.756*, %struct.__method_list_t.2.119.184.275.340.483.509.522.561.587.613.756*, %struct._prop_list_t.4.121.186.277.342.485.511.524.563.589.615.758*, i32, i32, i8** }
+%struct._ivar_list_t.8.125.190.281.346.489.515.528.567.593.619.762 = type { i32, i32, [0 x %struct._ivar_t.7.124.189.280.345.488.514.527.566.592.618.761] }
+%struct._ivar_t.7.124.189.280.345.488.514.527.566.592.618.761 = type { i32*, i8*, i8*, i32, i32 }
+%struct._prop_list_t.4.121.186.277.342.485.511.524.563.589.615.758 = type { i32, i32, [0 x %struct._prop_t.3.120.185.276.341.484.510.523.562.588.614.757] }
+%struct._prop_t.3.120.185.276.341.484.510.523.562.588.614.757 = type { i8*, i8* }
+
+@.str1 = external unnamed_addr constant [17 x i8], align 1
+@OBJC_EHTYPE_id = external global %struct._objc_typeinfo.12.129.194.285.350.493.519.532.571.597.623.765
+@"OBJC_EHTYPE_$_NSString" = external global %struct._objc_typeinfo.12.129.194.285.350.493.519.532.571.597.623.765, section "__DATA,__datacoal_nt,coalesced", align 8
+
+declare void @objc_exception_throw()
+declare void @objc_exception_rethrow()
+declare i32 @__objc_personality_v0(...)
+declare void @printf(i8* nocapture readonly, ...)
diff --git a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
index df8dc87..3686a1f 100644
--- a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
+++ b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
@@ -366,7 +366,6 @@ define i32 @fcmpri(i32 %argc, i8** nocapture readonly %argv) {
 ; CHECK-LABEL-DAG: .LBB9_3
 ; CHECK: cmp w19, #0
 ; CHECK: fcmp d8, #0.0
-; CHECK: b.gt .LBB9_5
 ; CHECK-NOT: cmp w19, #1
 ; CHECK-NOT: b.ge .LBB9_5
 
diff --git a/test/CodeGen/AArch64/compiler-ident.ll b/test/CodeGen/AArch64/compiler-ident.ll
index 0350571..217340d 100644
--- a/test/CodeGen/AArch64/compiler-ident.ll
+++ b/test/CodeGen/AArch64/compiler-ident.ll
@@ -8,5 +8,5 @@ target triple = "aarch64--linux-gnu"
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"some LLVM version"}
+!0 = !{!"some LLVM version"}
 
diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll
index f0f36bd..1266842 100644
--- a/test/CodeGen/AArch64/cpus.ll
+++ b/test/CodeGen/AArch64/cpus.ll
@@ -4,6 +4,7 @@
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
 
 ; CHECK-NOT: {{.*}}  is not a recognized processor for this target
diff --git a/test/CodeGen/AArch64/dp-3source.ll b/test/CodeGen/AArch64/dp-3source.ll
index 22bd4a8..bd96ec7 100644
--- a/test/CodeGen/AArch64/dp-3source.ll
+++ b/test/CodeGen/AArch64/dp-3source.ll
@@ -161,3 +161,18 @@ define i64 @test_umnegl(i32 %lhs, i32 %rhs) {
 ; CHECK: umnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
   ret i64 %res
 }
+
+@a = common global i32 0, align 4
+@b = common global i32 0, align 4
+@c = common global i32 0, align 4
+
+define void @test_mneg(){
+; CHECK-LABEL: test_mneg:
+  %1 = load i32* @a, align 4
+  %2 = load i32* @b, align 4
+  %3 = sub i32 0, %1
+  %4 = mul i32 %2, %3
+  store i32 %4, i32* @c, align 4
+; CHECK: mneg {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+  ret void
+}
diff --git a/test/CodeGen/AArch64/f16-convert.ll b/test/CodeGen/AArch64/f16-convert.ll
index 12412d4..d1f49a91 100644
--- a/test/CodeGen/AArch64/f16-convert.ll
+++ b/test/CodeGen/AArch64/f16-convert.ll
@@ -133,7 +133,8 @@ define void @store0(i16* nocapture %a, float %val) nounwind {
 
 define void @store1(i16* nocapture %a, double %val) nounwind {
 ; CHECK-LABEL: store1:
-; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: fcvt s0, d0
+; CHECK-NEXT: fcvt h0, s0
 ; CHECK-NEXT: str  h0, [x0]
 ; CHECK-NEXT: ret
 
@@ -158,7 +159,8 @@ define void @store2(i16* nocapture %a, i32 %i, float %val) nounwind {
 
 define void @store3(i16* nocapture %a, i32 %i, double %val) nounwind {
 ; CHECK-LABEL: store3:
-; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: fcvt s0, d0
+; CHECK-NEXT: fcvt h0, s0
 ; CHECK-NEXT: str h0, [x0, w1, sxtw #1]
 ; CHECK-NEXT: ret
 
@@ -184,7 +186,8 @@ define void @store4(i16* nocapture %a, i64 %i, float %val) nounwind {
 
 define void @store5(i16* nocapture %a, i64 %i, double %val) nounwind {
 ; CHECK-LABEL: store5:
-; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: fcvt s0, d0
+; CHECK-NEXT: fcvt h0, s0
 ; CHECK-NEXT: str h0, [x0, x1, lsl #1]
 ; CHECK-NEXT: ret
 
@@ -209,7 +212,8 @@ define void @store6(i16* nocapture %a, float %val) nounwind {
 
 define void @store7(i16* nocapture %a, double %val) nounwind {
 ; CHECK-LABEL: store7:
-; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: fcvt s0, d0
+; CHECK-NEXT: fcvt h0, s0
 ; CHECK-NEXT: str h0, [x0, #20]
 ; CHECK-NEXT: ret
 
@@ -234,7 +238,8 @@ define void @store8(i16* nocapture %a, float %val) nounwind {
 
 define void @store9(i16* nocapture %a, double %val) nounwind {
 ; CHECK-LABEL: store9:
-; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: fcvt s0, d0
+; CHECK-NEXT: fcvt h0, s0
 ; CHECK-NEXT: stur h0, [x0, #-20]
 ; CHECK-NEXT: ret
 
diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll
new file mode 100644
index 0000000..bc4a210
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll
@@ -0,0 +1,42 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-label: test_or
+; CHECK:       cbnz w0, {{LBB[0-9]+_2}}
+; CHECK:       cbz w1, {{LBB[0-9]+_1}}
+define i64 @test_or(i32 %a, i32 %b) {
+bb1:
+  %0 = icmp eq i32 %a, 0
+  %1 = icmp eq i32 %b, 0
+  %or.cond = or i1 %0, %1
+  br i1 %or.cond, label %bb3, label %bb4, !prof !0
+
+bb3:
+  ret i64 0
+
+bb4:
+  %2 = call i64 @bar()
+  ret i64 %2
+}
+
+; CHECK-label: test_ans
+; CHECK:       cbz w0, {{LBB[0-9]+_2}}
+; CHECK:       cbnz w1, {{LBB[0-9]+_3}}
+define i64 @test_and(i32 %a, i32 %b) {
+bb1:
+  %0 = icmp ne i32 %a, 0
+  %1 = icmp ne i32 %b, 0
+  %or.cond = and i1 %0, %1
+  br i1 %or.cond, label %bb4, label %bb3, !prof !1
+
+bb3:
+  ret i64 0
+
+bb4:
+  %2 = call i64 @bar()
+  ret i64 %2
+}
+
+declare i64 @bar()
+
+!0 = !{!"branch_weights", i32 5128, i32 32}
+!1 = !{!"branch_weights", i32 1024, i32 4136}
diff --git a/test/CodeGen/AArch64/fast-isel-branch_weights.ll b/test/CodeGen/AArch64/fast-isel-branch_weights.ll
index 5b22476..70dbdf2 100644
--- a/test/CodeGen/AArch64/fast-isel-branch_weights.ll
+++ b/test/CodeGen/AArch64/fast-isel-branch_weights.ll
@@ -16,4 +16,4 @@ success:
   ret i64 0
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
+!0 = !{!"branch_weights", i32 0, i32 2147483647}
diff --git a/test/CodeGen/AArch64/fast-isel-memcpy.ll b/test/CodeGen/AArch64/fast-isel-memcpy.ll
new file mode 100644
index 0000000..9161dad
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-memcpy.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; Test that we don't segfault.
+; CHECK-LABEL: test
+; CHECK:       ldr [[REG1:x[0-9]+]], [x1]
+; CHECK-NEXT:  and [[REG2:x[0-9]+]], x0, #0x7fffffffffffffff
+; CHECK-NEXT:  str [[REG1]], {{\[}}[[REG2]]{{\]}}
+define void @test(i64 %a, i8* %b) {
+  %1 = and i64 %a, 9223372036854775807
+  %2 = inttoptr i64 %1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %b, i64 8, i32 8, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
diff --git a/test/CodeGen/AArch64/fast-isel-tbz.ll b/test/CodeGen/AArch64/fast-isel-tbz.ll
index d7f46b2..a5f02ff 100644
--- a/test/CodeGen/AArch64/fast-isel-tbz.ll
+++ b/test/CodeGen/AArch64/fast-isel-tbz.ll
@@ -1,5 +1,5 @@
-; RUN: llc                             -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
-; RUN: llc -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+; RUN: llc                             -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck --check-prefix=CHECK --check-prefix=FAST %s
 
 define i32 @icmp_eq_i8(i8 zeroext %a) {
 ; CHECK-LABEL: icmp_eq_i8
@@ -121,6 +121,160 @@ bb2:
   ret i32 0
 }
 
+define i32 @icmp_slt_i8(i8 zeroext %a) {
+; FAST-LABEL: icmp_slt_i8
+; FAST:       tbnz w0, #7, {{LBB.+_2}}
+  %1 = icmp slt i8 %a, 0
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_slt_i16(i16 zeroext %a) {
+; FAST-LABEL: icmp_slt_i16
+; FAST:       tbnz w0, #15, {{LBB.+_2}}
+  %1 = icmp slt i16 %a, 0
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_slt_i32(i32 %a) {
+; CHECK-LABEL: icmp_slt_i32
+; CHECK:       tbnz w0, #31, {{LBB.+_2}}
+  %1 = icmp slt i32 %a, 0
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_slt_i64(i64 %a) {
+; CHECK-LABEL: icmp_slt_i64
+; CHECK:       tbnz x0, #63, {{LBB.+_2}}
+  %1 = icmp slt i64 %a, 0
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_sge_i8(i8 zeroext %a) {
+; FAST-LABEL: icmp_sge_i8
+; FAST:       tbz w0, #7, {{LBB.+_2}}
+  %1 = icmp sge i8 %a, 0
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_sge_i16(i16 zeroext %a) {
+; FAST-LABEL: icmp_sge_i16
+; FAST:       tbz w0, #15, {{LBB.+_2}}
+  %1 = icmp sge i16 %a, 0
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_sle_i8(i8 zeroext %a) {
+; FAST-LABEL: icmp_sle_i8
+; FAST:       tbnz w0, #7, {{LBB.+_2}}
+  %1 = icmp sle i8 %a, -1
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_sle_i16(i16 zeroext %a) {
+; FAST-LABEL: icmp_sle_i16
+; FAST:       tbnz w0, #15, {{LBB.+_2}}
+  %1 = icmp sle i16 %a, -1
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_sle_i32(i32 %a) {
+; CHECK-LABEL: icmp_sle_i32
+; CHECK:       tbnz w0, #31, {{LBB.+_2}}
+  %1 = icmp sle i32 %a, -1
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_sle_i64(i64 %a) {
+; CHECK-LABEL: icmp_sle_i64
+; CHECK:       tbnz x0, #63, {{LBB.+_2}}
+  %1 = icmp sle i64 %a, -1
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_sgt_i8(i8 zeroext %a) {
+; FAST-LABEL: icmp_sgt_i8
+; FAST:       tbz w0, #7, {{LBB.+_2}}
+  %1 = icmp sgt i8 %a, -1
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_sgt_i16(i16 zeroext %a) {
+; FAST-LABEL: icmp_sgt_i16
+; FAST:       tbz w0, #15, {{LBB.+_2}}
+  %1 = icmp sgt i16 %a, -1
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_sgt_i32(i32 %a) {
+; CHECK-LABEL: icmp_sgt_i32
+; CHECK:       tbz w0, #31, {{LBB.+_2}}
+  %1 = icmp sgt i32 %a, -1
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_sgt_i64(i64 %a) {
+; FAST-LABEL: icmp_sgt_i64
+; FAST:       tbz x0, #63, {{LBB.+_2}}
+  %1 = icmp sgt i64 %a, -1
+  br i1 %1, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
 ; Test that we don't fold the 'and' instruction into the compare.
 define i32 @icmp_eq_and_i32(i32 %a, i1 %c) {
 ; CHECK-LABEL: icmp_eq_and_i32
@@ -137,5 +291,5 @@ bb2:
   ret i32 0
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
-!1 = metadata !{metadata !"branch_weights", i32 2147483647, i32 0}
+!0 = !{!"branch_weights", i32 0, i32 2147483647}
+!1 = !{!"branch_weights", i32 2147483647, i32 0}
diff --git a/test/CodeGen/AArch64/fdiv-combine.ll b/test/CodeGen/AArch64/fdiv-combine.ll
new file mode 100644
index 0000000..389eefd
--- /dev/null
+++ b/test/CodeGen/AArch64/fdiv-combine.ll
@@ -0,0 +1,94 @@
+; RUN: llc -march=aarch64 < %s | FileCheck %s
+
+; Following test cases check:
+;   a / D; b / D; c / D;
+;                =>
+;   recip = 1.0 / D; a * recip; b * recip; c * recip;
+define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
+; CHECK-LABEL: three_fdiv_float:
+; CHECK: fdiv
+; CHECK-NEXT-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv float %a, %D
+  %div1 = fdiv float %b, %D
+  %div2 = fdiv float %c, %D
+  tail call void @foo_3f(float %div, float %div1, float %div2)
+  ret void
+}
+
+define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
+; CHECK-LABEL: three_fdiv_double:
+; CHECK: fdiv
+; CHECK-NEXT-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv double %a, %D
+  %div1 = fdiv double %b, %D
+  %div2 = fdiv double %c, %D
+  tail call void @foo_3d(double %div, double %div1, double %div2)
+  ret void
+}
+
+define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+; CHECK-LABEL: three_fdiv_4xfloat:
+; CHECK: fdiv
+; CHECK-NEXT-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv <4 x float> %a, %D
+  %div1 = fdiv <4 x float> %b, %D
+  %div2 = fdiv <4 x float> %c, %D
+  tail call void @foo_3_4xf(<4 x float> %div, <4 x float> %div1, <4 x float> %div2)
+  ret void
+}
+
+define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
+; CHECK-LABEL: three_fdiv_2xdouble:
+; CHECK: fdiv
+; CHECK-NEXT-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv <2 x double> %a, %D
+  %div1 = fdiv <2 x double> %b, %D
+  %div2 = fdiv <2 x double> %c, %D
+  tail call void @foo_3_2xd(<2 x double> %div, <2 x double> %div1, <2 x double> %div2)
+  ret void
+}
+
+; Following test cases check we never combine two FDIVs if neither of them
+; calculates a reciprocal.
+define void @two_fdiv_float(float %D, float %a, float %b) #0 {
+; CHECK-LABEL: two_fdiv_float:
+; CHECK: fdiv
+; CHECK: fdiv
+; CHECK-NEXT-NOT: fmul
+  %div = fdiv float %a, %D
+  %div1 = fdiv float %b, %D
+  tail call void @foo_2f(float %div, float %div1)
+  ret void
+}
+
+define void @two_fdiv_double(double %D, double %a, double %b) #0 {
+; CHECK-LABEL: two_fdiv_double:
+; CHECK: fdiv
+; CHECK: fdiv
+; CHECK-NEXT-NOT: fmul
+  %div = fdiv double %a, %D
+  %div1 = fdiv double %b, %D
+  tail call void @foo_2d(double %div, double %div1)
+  ret void
+}
+
+declare void @foo_3f(float, float, float)
+declare void @foo_3d(double, double, double)
+declare void @foo_3_4xf(<4 x float>, <4 x float>, <4 x float>)
+declare void @foo_3_2xd(<2 x double>, <2 x double>, <2 x double>)
+declare void @foo_2f(float, float)
+declare void @foo_2d(double, double)
+
+attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AArch64/fp16-v8-instructions.ll b/test/CodeGen/AArch64/fp16-v8-instructions.ll
index 9ee2296..b75f160 100644
--- a/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -188,10 +188,10 @@ define <8 x half> @s_to_h(<8 x float> %a) {
 
 define <8 x half> @d_to_h(<8 x double> %a) {
 ; CHECK-LABEL: d_to_h:
-; CHECK-DAG: ins v{{[0-9]+}}.d
-; CHECK-DAG: ins v{{[0-9]+}}.d
-; CHECK-DAG: ins v{{[0-9]+}}.d
-; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: mov d{{[0-9]+}}, v{{[0-9]+}}.d[1]
+; CHECK-DAG: mov d{{[0-9]+}}, v{{[0-9]+}}.d[1]
+; CHECK-DAG: mov d{{[0-9]+}}, v{{[0-9]+}}.d[1]
+; CHECK-DAG: mov d{{[0-9]+}}, v{{[0-9]+}}.d[1]
 ; CHECK-DAG: fcvt h
 ; CHECK-DAG: fcvt h
 ; CHECK-DAG: fcvt h
diff --git a/test/CodeGen/AArch64/fpimm.ll b/test/CodeGen/AArch64/fpimm.ll
index e59520c..b7db918 100644
--- a/test/CodeGen/AArch64/fpimm.ll
+++ b/test/CodeGen/AArch64/fpimm.ll
@@ -1,4 +1,6 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu                                                  -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -code-model=large                             -verify-machineinstrs < %s | FileCheck %s --check-prefix=LARGE
+; RUN: llc -mtriple=aarch64-apple-darwin -code-model=large -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s --check-prefix=LARGE
 
 @varf32 = global float 0.0
 @varf64 = global double 0.0
@@ -34,3 +36,22 @@ define void @check_double() {
 ; CHECK: ret
   ret void
 }
+
+; LARGE-LABEL: check_float2
+; LARGE:       movz [[REG:w[0-9]+]], #0x4049, lsl #16
+; LARGE-NEXT:  movk [[REG]], #0xfdb
+; LARGE-NEXT:  fmov s0, [[REG]]
+define float @check_float2() {
+  ret float 3.14159274101257324218750
+}
+
+; LARGE-LABEL: check_double2
+; LARGE:       movz [[REG:x[0-9]+]], #0x4009, lsl #48
+; LARGE-NEXT:  movk [[REG]], #0x21fb, lsl #32
+; LARGE-NEXT:  movk [[REG]], #0x5444, lsl #16
+; LARGE-NEXT:  movk [[REG]], #0x2d18
+; LARGE-NEXT:  fmov d0, [[REG]]
+define double @check_double2() {
+  ret double 3.1415926535897931159979634685441851615905761718750
+}
+
diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll
index abb732c..9fc9a5f 100644
--- a/test/CodeGen/AArch64/func-argpassing.ll
+++ b/test/CodeGen/AArch64/func-argpassing.ll
@@ -96,10 +96,8 @@ define [2 x i64] @return_struct() {
     %addr = bitcast %myStruct* @varstruct to [2 x i64]*
     %val = load [2 x i64]* %addr
     ret [2 x i64] %val
-; CHECK-DAG: ldr x0, [{{x[0-9]+}}, {{#?}}:lo12:varstruct]
-    ; Odd register regex below disallows x0 which we want to be live now.
-; CHECK-DAG: add {{x[1-9][0-9]*}}, {{x[1-9][0-9]*}}, {{#?}}:lo12:varstruct
-; CHECK: ldr x1, [{{x[1-9][0-9]*}}, #8]
+; CHECK: add x[[VARSTRUCT:[0-9]+]], {{x[0-9]+}}, :lo12:varstruct
+; CHECK: ldp x0, x1, [x[[VARSTRUCT]]]
     ; Make sure epilogue immediately follows
 ; CHECK-NEXT: ret
 }
@@ -166,8 +164,8 @@ define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3,
 define i64 @check_i128_regalign(i32 %val0, i128 %val1, i64 %val2) {
 ; CHECK-LABEL: check_i128_regalign
     store i128 %val1, i128* @var128
-; CHECK-DAG: str x2, [{{x[0-9]+}}, {{#?}}:lo12:var128]
-; CHECK-DAG: str x3, [{{x[0-9]+}}, #8]
+; CHECK: add x[[VAR128:[0-9]+]], {{x[0-9]+}}, :lo12:var128
+; CHECK-DAG: stp x2, x3, [x[[VAR128]]]
 
     ret i64 %val2
 ; CHECK: mov x0, x4
diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll
index 51979f0..16157f8 100644
--- a/test/CodeGen/AArch64/func-calls.ll
+++ b/test/CodeGen/AArch64/func-calls.ll
@@ -62,8 +62,8 @@ define void @simple_rets() {
   %arr = call [2 x i64] @return_smallstruct()
   store [2 x i64] %arr, [2 x i64]* @varsmallstruct
 ; CHECK: bl return_smallstruct
-; CHECK: str x1, [{{x[0-9]+}}, #8]
-; CHECK: str x0, [{{x[0-9]+}}, {{#?}}:lo12:varsmallstruct]
+; CHECK: add x[[VARSMALLSTRUCT:[0-9]+]], {{x[0-9]+}}, :lo12:varsmallstruct
+; CHECK: stp x0, x1, [x[[VARSMALLSTRUCT]]]
 
   call void @return_large_struct(%myStruct* sret @varstruct)
 ; CHECK: add x8, {{x[0-9]+}}, {{#?}}:lo12:varstruct
@@ -128,12 +128,12 @@ define void @check_i128_align() {
   call void @check_i128_stackalign(i32 0, i32 1, i32 2, i32 3,
                                    i32 4, i32 5, i32 6, i32 7,
                                    i32 42, i128 %val)
-; CHECK: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:var128]
-; CHECK: ldr [[I128HI:x[0-9]+]], [{{x[0-9]+}}, #8]
+; CHECK: add x[[VAR128:[0-9]+]], {{x[0-9]+}}, :lo12:var128
+; CHECK: ldp [[I128LO:x[0-9]+]], [[I128HI:x[0-9]+]], [x[[VAR128]]]
 ; CHECK: stp [[I128LO]], [[I128HI]], [sp, #16]
 
-; CHECK-NONEON: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, :lo12:var128]
-; CHECK-NONEON: ldr [[I128HI:x[0-9]+]], [{{x[0-9]+}}, #8]
+; CHECK-NONEON: add x[[VAR128:[0-9]+]], {{x[0-9]+}}, :lo12:var128
+; CHECK-NONEON: ldp [[I128LO:x[0-9]+]], [[I128HI:x[0-9]+]], [x[[VAR128]]]
 ; CHECK-NONEON: stp [[I128LO]], [[I128HI]], [sp, #16]
 ; CHECK: bl check_i128_stackalign
 
diff --git a/test/CodeGen/AArch64/ghc-cc.ll b/test/CodeGen/AArch64/ghc-cc.ll
new file mode 100644
index 0000000..505bd5f
--- /dev/null
+++ b/test/CodeGen/AArch64/ghc-cc.ll
@@ -0,0 +1,89 @@
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; Check the GHC call convention works (aarch64)
+
+@base  = external global i64 ; assigned to register: r19
+@sp    = external global i64 ; assigned to register: r20
+@hp    = external global i64 ; assigned to register: r21
+@r1    = external global i64 ; assigned to register: r22
+@r2    = external global i64 ; assigned to register: r23
+@r3    = external global i64 ; assigned to register: r24
+@r4    = external global i64 ; assigned to register: r25
+@r5    = external global i64 ; assigned to register: r26
+@r6    = external global i64 ; assigned to register: r27
+@splim = external global i64 ; assigned to register: r28
+
+@f1 = external global float  ; assigned to register: s8
+@f2 = external global float  ; assigned to register: s9
+@f3 = external global float  ; assigned to register: s10
+@f4 = external global float  ; assigned to register: s11
+
+@d1 = external global double ; assigned to register: d12
+@d2 = external global double ; assigned to register: d13
+@d3 = external global double ; assigned to register: d14
+@d4 = external global double ; assigned to register: d15
+
+define ghccc i64 @addtwo(i64 %x, i64 %y) nounwind {
+entry:
+  ; CHECK-LABEL: addtwo
+  ; CHECK:       add      x0, x19, x20
+  ; CHECK-NEXT:  ret
+  %0 = add i64 %x, %y
+  ret i64 %0
+}
+
+define void @zap(i64 %a, i64 %b) nounwind {
+entry:
+  ; CHECK-LABEL: zap
+  ; CHECK-NOT:   mov   {{x[0-9]+}}, sp
+  ; CHECK:       bl    addtwo
+  ; CHECK-NEXT:  bl    foo
+  %0 = call ghccc i64 @addtwo(i64 %a, i64 %b)
+  call void @foo() nounwind
+  ret void
+}
+
+define ghccc void @foo_i64 () nounwind {
+entry:
+  ; CHECK-LABEL: foo_i64
+  ; CHECK:       adrp    {{x[0-9]+}}, base
+  ; CHECK-NEXT:  ldr     x19, [{{x[0-9]+}}, :lo12:base]
+  ; CHECK-NEXT:  bl      bar_i64
+  ; CHECK-NEXT:  ret
+
+  %0 = load i64* @base
+  tail call ghccc void @bar_i64( i64 %0 ) nounwind
+  ret void
+}
+
+define ghccc void @foo_float () nounwind {
+entry:
+  ; CHECK-LABEL: foo_float
+  ; CHECK:       adrp    {{x[0-9]+}}, f1
+  ; CHECK-NEXT:  ldr     s8, [{{x[0-9]+}}, :lo12:f1]
+  ; CHECK-NEXT:  bl      bar_float
+  ; CHECK-NEXT:  ret
+
+  %0 = load float* @f1
+  tail call ghccc void @bar_float( float %0 ) nounwind
+  ret void
+}
+
+define ghccc void @foo_double () nounwind {
+entry:
+  ; CHECK-LABEL: foo_double
+  ; CHECK:       adrp    {{x[0-9]+}}, d1
+  ; CHECK-NEXT:  ldr     d12, [{{x[0-9]+}}, :lo12:d1]
+  ; CHECK-NEXT:  bl      bar_double
+  ; CHECK-NEXT:  ret
+
+  %0 = load double* @d1
+  tail call ghccc void @bar_double( double %0 ) nounwind
+  ret void
+}
+
+declare ghccc void @foo ()
+
+declare ghccc void @bar_i64 (i64)
+declare ghccc void @bar_float (float)
+declare ghccc void @bar_double (double)
diff --git a/test/CodeGen/AArch64/global-merge-1.ll b/test/CodeGen/AArch64/global-merge-1.ll
index 68aba5e..7dc8da1 100644
--- a/test/CodeGen/AArch64/global-merge-1.ll
+++ b/test/CodeGen/AArch64/global-merge-1.ll
@@ -11,6 +11,7 @@
 @n = internal global i32 0, align 4
 
 define void @f1(i32 %a1, i32 %a2) {
+;CHECK-APPLE-IOS-NOT: adrp
 ;CHECK-APPLE-IOS: adrp	x8, __MergedGlobals@PAGE
 ;CHECK-APPLE-IOS-NOT: adrp
 ;CHECK-APPLE-IOS: add	x8, x8, __MergedGlobals@PAGEOFF
diff --git a/test/CodeGen/AArch64/global-merge-2.ll b/test/CodeGen/AArch64/global-merge-2.ll
index a773566..70b700c 100644
--- a/test/CodeGen/AArch64/global-merge-2.ll
+++ b/test/CodeGen/AArch64/global-merge-2.ll
@@ -8,6 +8,7 @@
 
 define void @f1(i32 %a1, i32 %a2) {
 ;CHECK-APPLE-IOS-LABEL: _f1:
+;CHECK-APPLE-IOS-NOT: adrp
 ;CHECK-APPLE-IOS: adrp	x8, __MergedGlobals_x@PAGE
 ;CHECK-APPLE-IOS: add	x8, x8, __MergedGlobals_x@PAGEOFF
 ;CHECK-APPLE-IOS-NOT: adrp
diff --git a/test/CodeGen/AArch64/implicit-sret.ll b/test/CodeGen/AArch64/implicit-sret.ll
new file mode 100644
index 0000000..264d519
--- /dev/null
+++ b/test/CodeGen/AArch64/implicit-sret.ll
@@ -0,0 +1,13 @@
+; RUN: llc %s -o - -mtriple=arm64-apple-ios7.0 | FileCheck %s
+;
+; Handle implicit sret arguments that are generated on-the-fly during lowering.
+; <rdar://19792160> Null pointer assertion in AArch64TargetLowering
+
+; CHECK-LABEL: big_retval
+; ... str or stp for the first 1024 bits
+; CHECK: strb wzr, [x8, #128]
+; CHECK: ret
+define i1032 @big_retval() {
+entry:
+  ret i1032 0
+}
diff --git a/test/CodeGen/AArch64/large_shift.ll b/test/CodeGen/AArch64/large_shift.ll
new file mode 100644
index 0000000..f72c97d
--- /dev/null
+++ b/test/CodeGen/AArch64/large_shift.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=aarch64 -o - %s
+target triple = "arm64-unknown-unknown"
+
+; Make sure we don't run into an assert in the aarch64 code selection when
+; DAGCombining fails.
+
+declare void @t()
+
+define void @foo() {
+  %c = bitcast i64 270458 to i64
+  %t0 = lshr i64 %c, 422383
+  %t1 = trunc i64 %t0 to i1
+  br i1 %t1, label %BB1, label %BB0
+
+BB0:
+  call void @t()
+  br label %BB1
+
+BB1:
+  ret void
+}
diff --git a/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll b/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll
new file mode 100644
index 0000000..e77824f
--- /dev/null
+++ b/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=aarch64-apple-ios -fast-isel -verify-machineinstrs | FileCheck %s
+
+; Check that the kill flag is cleared between CSE'd instructions on their
+; imp-def'd registers.
+; The verifier would complain otherwise.
+define i64 @csed-impdef-killflag(i64 %a) {
+; CHECK-LABEL: csed-impdef-killflag
+; CHECK-DAG:  mov    [[REG0:w[0-9]+]], wzr
+; CHECK-DAG:  orr    [[REG1:w[0-9]+]], wzr, #0x1
+; CHECK-DAG:  orr    [[REG2:x[0-9]+]], xzr, #0x2
+; CHECK-DAG:  orr    [[REG3:x[0-9]+]], xzr, #0x3
+; CHECK:      cmp    x0, #0
+; CHECK-DAG:  csel   w[[SELECT_WREG_1:[0-9]+]], [[REG0]], [[REG1]], ne
+; CHECK-DAG:  csel   [[SELECT_XREG_2:x[0-9]+]], [[REG2]], [[REG3]], ne
+; CHECK:      ubfx   [[SELECT_XREG_1:x[0-9]+]], x[[SELECT_WREG_1]], #0, #32
+; CHECK-NEXT: add    x0, [[SELECT_XREG_2]], [[SELECT_XREG_1]]
+; CHECK-NEXT: ret
+
+  %1 = icmp ne i64 %a, 0
+  %2 = select i1 %1, i32 0, i32 1
+  %3 = icmp ne i64 %a, 0
+  %4 = select i1 %3, i64 2, i64 3
+  %5 = zext i32 %2 to i64
+  %6 = add i64 %4, %5
+  ret i64 %6
+}
diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll
index 6afac31..3f77060 100644
--- a/test/CodeGen/AArch64/neon-scalar-copy.ll
+++ b/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -1,101 +1,145 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -asm-verbose=false < %s | FileCheck %s
 
-
-define float @test_dup_sv2S(<2 x float> %v) {
- ; CHECK-LABEL: test_dup_sv2S
- ; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+define float @test_dup_sv2S(<2 x float> %v) #0 {
+ ; CHECK-LABEL: test_dup_sv2S:
+ ; CHECK-NEXT: mov s{{[0-9]+}}, {{v[0-9]+}}.s[1]
+ ; CHECK-NEXT: ret
  %tmp1 = extractelement <2 x float> %v, i32 1
  ret float  %tmp1
 }
 
-define float @test_dup_sv2S_0(<2 x float> %v) {
- ; CHECK-LABEL: test_dup_sv2S_0
+define float @test_dup_sv2S_0(<2 x float> %v) #0 {
+ ; CHECK-LABEL: test_dup_sv2S_0:
  ; CHECK-NOT: dup {{[vsd][0-9]+}}
  ; CHECK-NOT: ins {{[vsd][0-9]+}}
- ; CHECK: ret
+ ; CHECK-NEXT: ret
  %tmp1 = extractelement <2 x float> %v, i32 0
  ret float  %tmp1
 }
 
-define float @test_dup_sv4S(<4 x float> %v) {
- ; CHECK-LABEL: test_dup_sv4S
+define float @test_dup_sv4S(<4 x float> %v) #0 {
+ ; CHECK-LABEL: test_dup_sv4S:
+ ; CHECK-NEXT: mov s{{[0-9]+}}, {{v[0-9]+}}.s[1]
+ ; CHECK-NEXT: ret
+ %tmp1 = extractelement <4 x float> %v, i32 1
+ ret float  %tmp1
+}
+
+define float @test_dup_sv4S_0(<4 x float> %v) #0 {
+ ; CHECK-LABEL: test_dup_sv4S_0:
  ; CHECK-NOT: dup {{[vsd][0-9]+}}
  ; CHECK-NOT: ins {{[vsd][0-9]+}}
- ; CHECK: ret
+ ; CHECK-NEXT: ret
  %tmp1 = extractelement <4 x float> %v, i32 0
  ret float  %tmp1
 }
 
-define double @test_dup_dvD(<1 x double> %v) {
- ; CHECK-LABEL: test_dup_dvD
+define double @test_dup_dvD(<1 x double> %v) #0 {
+ ; CHECK-LABEL: test_dup_dvD:
  ; CHECK-NOT: dup {{[vsd][0-9]+}}
  ; CHECK-NOT: ins {{[vsd][0-9]+}}
- ; CHECK: ret
+ ; CHECK-NEXT: ret
  %tmp1 = extractelement <1 x double> %v, i32 0
  ret double  %tmp1
 }
 
-define double @test_dup_dv2D(<2 x double> %v) {
- ; CHECK-LABEL: test_dup_dv2D
- ; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+define double @test_dup_dv2D(<2 x double> %v) #0 {
+ ; CHECK-LABEL: test_dup_dv2D:
+ ; CHECK-NEXT: mov d{{[0-9]+}}, {{v[0-9]+}}.d[1]
+ ; CHECK-NEXT: ret
  %tmp1 = extractelement <2 x double> %v, i32 1
  ret double  %tmp1
 }
 
-define double @test_dup_dv2D_0(<2 x double> %v) {
- ; CHECK-LABEL: test_dup_dv2D_0
- ; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
- ; CHECK: ret
- %tmp1 = extractelement <2 x double> %v, i32 1
+define double @test_dup_dv2D_0(<2 x double> %v) #0 {
+ ; CHECK-LABEL: test_dup_dv2D_0:
+ ; CHECK-NOT: dup {{[vsd][0-9]+}}
+ ; CHECK-NOT: ins {{[vsd][0-9]+}}
+ ; CHECK-NEXT: ret
+ %tmp1 = extractelement <2 x double> %v, i32 0
  ret double  %tmp1
 }
 
-define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) {
- ; CHECK-LABEL: test_vector_dup_bv16B
+define half @test_dup_hv8H(<8 x half> %v) #0 {
+ ; CHECK-LABEL: test_dup_hv8H:
+ ; CHECK-NEXT: mov h{{[0-9]+}}, {{v[0-9]+}}.h[1]
+ ; CHECK-NEXT: ret
+ %tmp1 = extractelement <8 x half> %v, i32 1
+ ret half  %tmp1
+}
+
+define half @test_dup_hv8H_0(<8 x half> %v) #0 {
+ ; CHECK-LABEL: test_dup_hv8H_0:
+ ; CHECK-NOT: dup {{[vsdh][0-9]+}}
+ ; CHECK-NOT: ins {{[vsdh][0-9]+}}
+ ; CHECK-NEXT: ret
+ %tmp1 = extractelement <8 x half> %v, i32 0
+ ret half  %tmp1
+}
+
+define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) #0 {
+ ; CHECK-LABEL: test_vector_dup_bv16B:
+ ; CHECK-NEXT: umov [[W:w[0-9]+]], v0.b[14]
+ ; CHECK-NEXT: fmov s0, [[W]]
+ ; CHECK-NEXT: ret
  %shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> <i32 14> 
  ret <1 x i8> %shuffle.i
 }
 
-define <1 x i8> @test_vector_dup_bv8B(<8 x i8> %v1) {
- ; CHECK-LABEL: test_vector_dup_bv8B
+define <1 x i8> @test_vector_dup_bv8B(<8 x i8> %v1) #0 {
+ ; CHECK-LABEL: test_vector_dup_bv8B:
+ ; CHECK-NEXT: dup v0.8b, v0.b[7]
+ ; CHECK-NEXT: ret
  %shuffle.i = shufflevector <8 x i8> %v1, <8 x i8> undef, <1 x i32> <i32 7> 
  ret <1 x i8> %shuffle.i
 }
 
-define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) {
- ; CHECK-LABEL: test_vector_dup_hv8H
+define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) #0 {
+ ; CHECK-LABEL: test_vector_dup_hv8H:
+ ; CHECK-NEXT: umov [[W:w[0-9]+]], v0.h[7]
+ ; CHECK-NEXT: fmov s0, [[W]]
+ ; CHECK-NEXT: ret
  %shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> <i32 7> 
  ret <1 x i16> %shuffle.i
 }
 
-define <1 x i16> @test_vector_dup_hv4H(<4 x i16> %v1) {
- ; CHECK-LABEL: test_vector_dup_hv4H
+define <1 x i16> @test_vector_dup_hv4H(<4 x i16> %v1) #0 {
+ ; CHECK-LABEL: test_vector_dup_hv4H:
+ ; CHECK-NEXT: dup v0.4h, v0.h[3]
+ ; CHECK-NEXT: ret
  %shuffle.i = shufflevector <4 x i16> %v1, <4 x i16> undef, <1 x i32> <i32 3> 
  ret <1 x i16> %shuffle.i
 }
 
-define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) {
- ; CHECK-LABEL: test_vector_dup_sv4S
+define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) #0 {
+ ; CHECK-LABEL: test_vector_dup_sv4S:
+ ; CHECK-NEXT: mov  [[W:w[0-9]+]], v0.s[3]
+ ; CHECK-NEXT: fmov s0, [[W]]
+ ; CHECK-NEXT: ret
  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> <i32 3> 
  ret <1 x i32> %shuffle
 }
 
-define <1 x i32> @test_vector_dup_sv2S(<2 x i32> %v1) {
- ; CHECK-LABEL: test_vector_dup_sv2S
+define <1 x i32> @test_vector_dup_sv2S(<2 x i32> %v1) #0 {
+ ; CHECK-LABEL: test_vector_dup_sv2S:
+ ; CHECK-NEXT: dup v0.2s, v0.s[1]
+ ; CHECK-NEXT: ret
  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <1 x i32> <i32 1> 
  ret <1 x i32> %shuffle
 }
 
-define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) {
- ; CHECK-LABEL: test_vector_dup_dv2D
- ; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8
+define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) #0 {
+ ; CHECK-LABEL: test_vector_dup_dv2D:
+ ; CHECK-NEXT: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8
+ ; CHECK-NEXT: ret
  %shuffle.i = shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> <i32 1> 
  ret <1 x i64> %shuffle.i
 }
 
-define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) {
-  ; CHECK-LABEL: test_vector_copy_dup_dv2D
-  ; CHECK: {{dup|mov}} {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) #0 {
+  ; CHECK-LABEL: test_vector_copy_dup_dv2D:
+  ; CHECK-NEXT: {{dup|mov}} {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+  ; CHECK-NEXT: ret
   %vget_lane = extractelement <2 x i64> %c, i32 1
   %vset_lane = insertelement <1 x i64> undef, i64 %vget_lane, i32 0
   ret <1 x i64> %vset_lane
@@ -118,3 +162,5 @@ define void @test_out_of_range_insert(<4 x i32> %vec, i32 %elt) {
   insertelement <4 x i32> %vec, i32 %elt, i32 4
   ret void
 }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/or-combine.ll b/test/CodeGen/AArch64/or-combine.ll
new file mode 100644
index 0000000..c6c343a
--- /dev/null
+++ b/test/CodeGen/AArch64/or-combine.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+define i32 @test_consts(i32 %in) {
+; CHECK-LABEL: test_consts:
+; CHECK-NOT: bfxil
+; CHECK-NOT: and
+; CHECK-NOT: orr
+; CHECK: ret
+
+  %lo = and i32 %in, 65535
+  %hi = and i32 %in, -65536
+  %res = or i32 %lo, %hi
+  ret i32 %res
+}
+
+define i32 @test_generic(i32 %in, i32 %mask1, i32 %mask2) {
+; CHECK-LABEL: test_generic:
+; CHECK: orr [[FULL_MASK:w[0-9]+]], w1, w2
+; CHECK: and w0, w0, [[FULL_MASK]]
+
+  %lo = and i32 %in, %mask1
+  %hi = and i32 %in, %mask2
+  %res = or i32 %lo, %hi
+  ret i32 %res
+}
+
+; In this case the transformation isn't profitable, since %lo and %hi
+; are used more than once.
+define [3 x i32] @test_reuse(i32 %in, i32 %mask1, i32 %mask2) {
+; CHECK-LABEL: test_reuse:
+; CHECK-DAG: and w1, w0, w1
+; CHECK-DAG: and w2, w0, w2
+; CHECK-DAG: orr w0, w1, w2
+
+  %lo = and i32 %in, %mask1
+  %hi = and i32 %in, %mask2
+  %recombine = or i32 %lo, %hi
+
+  %res.tmp0 = insertvalue [3 x i32] undef, i32 %recombine, 0
+  %res.tmp1 = insertvalue [3 x i32] %res.tmp0, i32 %lo, 1
+  %res = insertvalue [3 x i32] %res.tmp1, i32 %hi, 2
+
+  ret [3 x i32] %res
+}
diff --git a/test/CodeGen/AArch64/ragreedy-csr.ll b/test/CodeGen/AArch64/ragreedy-csr.ll
index de29b1b..31ff543 100644
--- a/test/CodeGen/AArch64/ragreedy-csr.ll
+++ b/test/CodeGen/AArch64/ragreedy-csr.ll
@@ -271,27 +271,27 @@ return:
   %retval.0 = phi i32 [ 0, %entry ], [ 1, %land.lhs.true52 ], [ 1, %land.lhs.true43 ], [ 0, %if.else123 ], [ 1, %while.cond59.preheader ], [ 1, %while.cond95.preheader ], [ 1, %while.cond130.preheader ], [ 1, %land.lhs.true28 ], [ 1, %if.then83 ], [ 0, %lor.lhs.false74 ], [ 1, %land.rhs ], [ 1, %if.then117 ], [ 0, %while.body104 ], [ 1, %land.rhs99 ], [ 1, %if.then152 ], [ 0, %while.body139 ], [ 1, %land.rhs134 ], [ 0, %while.body ]
   ret i32 %retval.0
 }
-!181 = metadata !{metadata !"branch_weights", i32 662038, i32 1}
-!988 = metadata !{metadata !"branch_weights", i32 12091450, i32 1916}
-!989 = metadata !{metadata !"branch_weights", i32 7564670, i32 4526781}
-!990 = metadata !{metadata !"branch_weights", i32 7484958, i32 13283499}
-!991 = metadata !{metadata !"branch_weights", i32 8677007, i32 4606493}
-!992 = metadata !{metadata !"branch_weights", i32 -1172426948, i32 145094705}
-!993 = metadata !{metadata !"branch_weights", i32 1468914, i32 5683688}
-!994 = metadata !{metadata !"branch_weights", i32 114025221, i32 -1217548794, i32 -1199521551, i32 87712616}
-!995 = metadata !{metadata !"branch_weights", i32 1853716452, i32 -444717951, i32 932776759}
-!996 = metadata !{metadata !"branch_weights", i32 1004870, i32 20259}
-!997 = metadata !{metadata !"branch_weights", i32 20071, i32 189}
-!998 = metadata !{metadata !"branch_weights", i32 -1020255939, i32 572177766}
-!999 = metadata !{metadata !"branch_weights", i32 2666513, i32 3466431}
-!1000 = metadata !{metadata !"branch_weights", i32 5117635, i32 1859780}
-!1001 = metadata !{metadata !"branch_weights", i32 354902465, i32 -1444604407}
-!1002 = metadata !{metadata !"branch_weights", i32 -1762419279, i32 1592770684}
-!1003 = metadata !{metadata !"branch_weights", i32 1435905930, i32 -1951930624}
-!1004 = metadata !{metadata !"branch_weights", i32 1, i32 504888}
-!1005 = metadata !{metadata !"branch_weights", i32 94662, i32 504888}
-!1006 = metadata !{metadata !"branch_weights", i32 -1897793104, i32 160196332}
-!1007 = metadata !{metadata !"branch_weights", i32 2074643678, i32 -29579071}
-!1008 = metadata !{metadata !"branch_weights", i32 1, i32 226163}
-!1009 = metadata !{metadata !"branch_weights", i32 58357, i32 226163}
-!1010 = metadata !{metadata !"branch_weights", i32 -2072848646, i32 92907517}
+!181 = !{!"branch_weights", i32 662038, i32 1}
+!988 = !{!"branch_weights", i32 12091450, i32 1916}
+!989 = !{!"branch_weights", i32 7564670, i32 4526781}
+!990 = !{!"branch_weights", i32 7484958, i32 13283499}
+!991 = !{!"branch_weights", i32 8677007, i32 4606493}
+!992 = !{!"branch_weights", i32 -1172426948, i32 145094705}
+!993 = !{!"branch_weights", i32 1468914, i32 5683688}
+!994 = !{!"branch_weights", i32 114025221, i32 -1217548794, i32 -1199521551, i32 87712616}
+!995 = !{!"branch_weights", i32 1853716452, i32 -444717951, i32 932776759}
+!996 = !{!"branch_weights", i32 1004870, i32 20259}
+!997 = !{!"branch_weights", i32 20071, i32 189}
+!998 = !{!"branch_weights", i32 -1020255939, i32 572177766}
+!999 = !{!"branch_weights", i32 2666513, i32 3466431}
+!1000 = !{!"branch_weights", i32 5117635, i32 1859780}
+!1001 = !{!"branch_weights", i32 354902465, i32 -1444604407}
+!1002 = !{!"branch_weights", i32 -1762419279, i32 1592770684}
+!1003 = !{!"branch_weights", i32 1435905930, i32 -1951930624}
+!1004 = !{!"branch_weights", i32 1, i32 504888}
+!1005 = !{!"branch_weights", i32 94662, i32 504888}
+!1006 = !{!"branch_weights", i32 -1897793104, i32 160196332}
+!1007 = !{!"branch_weights", i32 2074643678, i32 -29579071}
+!1008 = !{!"branch_weights", i32 1, i32 226163}
+!1009 = !{!"branch_weights", i32 58357, i32 226163}
+!1010 = !{!"branch_weights", i32 -2072848646, i32 92907517}
diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll
index 32b3ed2..8b3e6dd 100644
--- a/test/CodeGen/AArch64/remat.ll
+++ b/test/CodeGen/AArch64/remat.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a57 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s
 
 %X = type { i64, i64, i64 }
 declare void @f(%X*)
diff --git a/test/CodeGen/AArch64/setcc-type-mismatch.ll b/test/CodeGen/AArch64/setcc-type-mismatch.ll
new file mode 100644
index 0000000..86817fa
--- /dev/null
+++ b/test/CodeGen/AArch64/setcc-type-mismatch.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
+
+define void @test_mismatched_setcc(<4 x i22> %l, <4 x i22> %r, <4 x i1>* %addr) {
+; CHECK-LABEL: test_mismatched_setcc:
+; CHECK: cmeq [[CMP128:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: xtn {{v[0-9]+}}.4h, [[CMP128]].4s
+
+  %tst = icmp eq <4 x i22> %l, %r
+  store <4 x i1> %tst, <4 x i1>* %addr
+  ret void
+}
diff --git a/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll b/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll
index 4894116..37e41ec 100644
--- a/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll
+++ b/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll
@@ -1,6 +1,11 @@
-; RUN: llc < %s -march=arm -enable-tail-merge | grep bl.*baz | count 1
-; RUN: llc < %s -march=arm -enable-tail-merge | grep bl.*quux | count 1
+; RUN: llc < %s -march=arm  | FileCheck %s
+
 ; Check that calls to baz and quux are tail-merged.
+; CHECK: bl _baz
+; CHECK-NOT: bl _baz
+; CHECK: bl _quux
+; CHECK-NOT: bl _quux
+
 ; PR1628
 
 ; ModuleID = 'tail.c'
diff --git a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
index acbab8a..30ae723 100644
--- a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
+++ b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
@@ -1,10 +1,23 @@
-; RUN: llc < %s -march=arm | grep bl.*baz | count 1
-; RUN: llc < %s -march=arm | grep bl.*quux | count 1
-; RUN: llc < %s -march=arm -enable-tail-merge=0 | grep bl.*baz | count 2
-; RUN: llc < %s -march=arm -enable-tail-merge=0 | grep bl.*quux | count 2
-; Check that tail merging is the default on ARM, and that -enable-tail-merge=0 works.
+; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc < %s -march=arm -enable-tail-merge=0 | \
+; RUN:   FileCheck --check-prefix=NOMERGE %s
+
+; Check that tail merging is the default on ARM, and that -enable-tail-merge=0
+; works.
 ; PR1628
 
+; CHECK: bl _baz
+; CHECK-NOT: bl _baz
+
+; CHECK: bl _quux
+; CHECK-NOT: bl _quux
+
+; NOMERGE: bl _baz
+; NOMERGE: bl _baz
+
+; NOMERGE: bl _quux
+; NOMERGE: bl _quux
+
 ; ModuleID = 'tail.c'
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
 target triple = "i686-apple-darwin8"
diff --git a/test/CodeGen/ARM/2009-10-16-Scope.ll b/test/CodeGen/ARM/2009-10-16-Scope.ll
index b4e758d..de05644 100644
--- a/test/CodeGen/ARM/2009-10-16-Scope.ll
+++ b/test/CodeGen/ARM/2009-10-16-Scope.ll
@@ -9,7 +9,7 @@ entry:
   br label %do.body, !dbg !0
 
 do.body:                                          ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.declare(metadata i32* %count_, metadata !4, metadata !{!"0x102"})
   %conv = ptrtoint i32* %count_ to i32, !dbg !0   ; <i32> [#uses=1]
   %call = call i32 @foo(i32 %conv) ssp, !dbg !0   ; <i32> [#uses=0]
   br label %do.end, !dbg !0
@@ -22,13 +22,13 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i32 @foo(i32) ssp
 
-!0 = metadata !{i32 5, i32 2, metadata !1, null}
-!1 = metadata !{metadata !"0xb\001\001\000", null, metadata !2}; [DW_TAG_lexical_block ]
-!2 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", i32 0, metadata !3, null, null, null, null, null, null}; [DW_TAG_subprogram ]
-!3 = metadata !{metadata !"0x11\0012\00clang 1.1\001\00\000\00\000", metadata !8, null, metadata !9, null, null, null}; [DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x100\00count_\005\000", metadata !5, metadata !3, metadata !6}; [ DW_TAG_auto_variable ]
-!5 = metadata !{metadata !"0xb\001\001\000", null, metadata !1}; [DW_TAG_lexical_block ]
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3}; [DW_TAG_base_type ]
-!7 = metadata !{i32 6, i32 1, metadata !2, null}
-!8 = metadata !{metadata !"genmodes.i", metadata !"/Users/yash/Downloads"}
-!9 = metadata !{i32 0}
+!0 = !MDLocation(line: 5, column: 2, scope: !1)
+!1 = !{!"0xb\001\001\000", null, !2}; [DW_TAG_lexical_block ]
+!2 = !{!"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", i32 0, !3, null, null, null, null, null, null}; [DW_TAG_subprogram ]
+!3 = !{!"0x11\0012\00clang 1.1\001\00\000\00\000", !8, null, !9, null, null, null}; [DW_TAG_compile_unit ]
+!4 = !{!"0x100\00count_\005\000", !5, !3, !6}; [ DW_TAG_auto_variable ]
+!5 = !{!"0xb\001\001\000", null, !1}; [DW_TAG_lexical_block ]
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !3}; [DW_TAG_base_type ]
+!7 = !MDLocation(line: 6, column: 1, scope: !2)
+!8 = !{!"genmodes.i", !"/Users/yash/Downloads"}
+!9 = !{i32 0}
diff --git a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
index bce3120..6f7db93 100644
--- a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
+++ b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
@@ -5,7 +5,7 @@ target triple = "armv4t-apple-darwin10"
 
 define hidden i32 @__addvsi3(i32 %a, i32 %b) nounwind {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %b}, i64 0, metadata !0, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata i32 %b, i64 0, metadata !0, metadata !{!"0x102"})
   %0 = add nsw i32 %b, %a, !dbg !9                ; <i32> [#uses=1]
   ret i32 %0, !dbg !11
 }
@@ -14,19 +14,19 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!15}
-!0 = metadata !{metadata !"0x101\00b\0093\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00__addvsi3\00__addvsi3\00__addvsi3\0094\000\001\000\006\000\000\000", metadata !12, null, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
-!12 = metadata !{metadata !"libgcc2.c", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc"}
-!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)\001\00\000\00\000", metadata !12, metadata !13, metadata !13, metadata !14, null, null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6, metadata !6, metadata !6}
-!6 = metadata !{metadata !"0x16\00SItype\00152\000\000\000\000", metadata !12, null, metadata !8} ; [ DW_TAG_typedef ]
-!7 = metadata !{metadata !"0x29", metadata !"libgcc2.h", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc", metadata !3} ; [ DW_TAG_file_type ]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !2} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 95, i32 0, metadata !10, null}
-!10 = metadata !{metadata !"0xb\0094\000\000", metadata !12, metadata !1} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 100, i32 0, metadata !10, null}
-!13 = metadata !{i32 0}
-!14 = metadata !{metadata !1}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00b\0093\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00__addvsi3\00__addvsi3\00__addvsi3\0094\000\001\000\006\000\000\000", !12, null, !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !12} ; [ DW_TAG_file_type ]
+!12 = !{!"libgcc2.c", !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc"}
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)\001\00\000\00\000", !12, !13, !13, !14, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !12, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6, !6, !6}
+!6 = !{!"0x16\00SItype\00152\000\000\000\000", !12, null, !8} ; [ DW_TAG_typedef ]
+!7 = !{!"0x29", !"libgcc2.h", !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc", !3} ; [ DW_TAG_file_type ]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", !12, !2} ; [ DW_TAG_base_type ]
+!9 = !MDLocation(line: 95, scope: !10)
+!10 = !{!"0xb\0094\000\000", !12, !1} ; [ DW_TAG_lexical_block ]
+!11 = !MDLocation(line: 100, scope: !10)
+!13 = !{i32 0}
+!14 = !{!1}
+!15 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll
index efe1ab5..18b3be0 100644
--- a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll
+++ b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll
@@ -7,16 +7,16 @@ target triple = "thumbv7-apple-darwin3.0.0-iphoneos"
 
 define void @x0(i8* nocapture %buf, i32 %nbytes) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %buf}, i64 0, metadata !0, metadata !{metadata !"0x102"}), !dbg !15
-  tail call void @llvm.dbg.value(metadata !{i32 %nbytes}, i64 0, metadata !8, metadata !{metadata !"0x102"}), !dbg !16
+  tail call void @llvm.dbg.value(metadata i8* %buf, i64 0, metadata !0, metadata !{!"0x102"}), !dbg !15
+  tail call void @llvm.dbg.value(metadata i32 %nbytes, i64 0, metadata !8, metadata !{!"0x102"}), !dbg !16
   %tmp = load i32* @length, !dbg !17              ; <i32> [#uses=3]
   %cmp = icmp eq i32 %tmp, -1, !dbg !17           ; <i1> [#uses=1]
   %cmp.not = xor i1 %cmp, true                    ; <i1> [#uses=1]
   %cmp3 = icmp ult i32 %tmp, %nbytes, !dbg !17    ; <i1> [#uses=1]
   %or.cond = and i1 %cmp.not, %cmp3               ; <i1> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{i32 %tmp}, i64 0, metadata !8, metadata !{metadata !"0x102"}), !dbg !17
+  tail call void @llvm.dbg.value(metadata i32 %tmp, i64 0, metadata !8, metadata !{!"0x102"}), !dbg !17
   %nbytes.addr.0 = select i1 %or.cond, i32 %tmp, i32 %nbytes ; <i32> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !18, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !19
   br label %while.cond, !dbg !20
 
 while.cond:                                       ; preds = %while.body, %entry
@@ -47,30 +47,30 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.lv.fn = !{!0, !8, !10, !12}
 !llvm.dbg.gv = !{!14}
 
-!0 = metadata !{metadata !"0x101\00buf\004\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00x0\00x0\00x0\005\000\001\000\006\000\000\000", metadata !26, null, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\0012\00clang 2.0\001\00\00\00\00", metadata !26, null, null, null, null, null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !26, metadata !2, null, metadata !5, null} ; [ DW_TAG_subroutine_type ]
-!5 = metadata !{null}
-!6 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !26, metadata !2, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", metadata !26, metadata !2} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"0x101\00nbytes\004\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{metadata !"0x24\00unsigned long\000\0032\0032\000\000\007", metadata !26, metadata !2} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0x100\00nread\006\000", metadata !11, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{metadata !"0xb\005\001\000", metadata !26, metadata !1} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{metadata !"0x100\00c\007\000", metadata !11, metadata !2, metadata !13} ; [ DW_TAG_auto_variable ]
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !26, metadata !2} ; [ DW_TAG_base_type ]
-!14 = metadata !{metadata !"0x34\00length\00length\00length\001\000\001", metadata !2, metadata !2, metadata !13, i32* @length} ; [ DW_TAG_variable ]
-!15 = metadata !{i32 4, i32 24, metadata !1, null}
-!16 = metadata !{i32 4, i32 43, metadata !1, null}
-!17 = metadata !{i32 9, i32 2, metadata !11, null}
-!18 = metadata !{i32 0}
-!19 = metadata !{i32 10, i32 2, metadata !11, null}
-!20 = metadata !{i32 11, i32 2, metadata !11, null}
-!21 = metadata !{i32 12, i32 3, metadata !22, null}
-!22 = metadata !{metadata !"0xb\0011\0045\000", metadata !26, metadata !11} ; [ DW_TAG_lexical_block ]
-!23 = metadata !{i32 13, i32 3, metadata !22, null}
-!24 = metadata !{i32 14, i32 2, metadata !22, null}
-!25 = metadata !{i32 15, i32 1, metadata !11, null}
-!26 = metadata !{metadata !"t.c", metadata !"/private/tmp"}
+!0 = !{!"0x101\00buf\004\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00x0\00x0\00x0\005\000\001\000\006\000\000\000", !26, null, !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !26} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\0012\00clang 2.0\001\00\00\00\00", !26, null, null, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !26, !2, null, !5, null} ; [ DW_TAG_subroutine_type ]
+!5 = !{null}
+!6 = !{!"0xf\00\000\0032\0032\000\000", !26, !2, !7} ; [ DW_TAG_pointer_type ]
+!7 = !{!"0x24\00unsigned char\000\008\008\000\000\008", !26, !2} ; [ DW_TAG_base_type ]
+!8 = !{!"0x101\00nbytes\004\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!9 = !{!"0x24\00unsigned long\000\0032\0032\000\000\007", !26, !2} ; [ DW_TAG_base_type ]
+!10 = !{!"0x100\00nread\006\000", !11, !2, !9} ; [ DW_TAG_auto_variable ]
+!11 = !{!"0xb\005\001\000", !26, !1} ; [ DW_TAG_lexical_block ]
+!12 = !{!"0x100\00c\007\000", !11, !2, !13} ; [ DW_TAG_auto_variable ]
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", !26, !2} ; [ DW_TAG_base_type ]
+!14 = !{!"0x34\00length\00length\00length\001\000\001", !2, !2, !13, i32* @length} ; [ DW_TAG_variable ]
+!15 = !MDLocation(line: 4, column: 24, scope: !1)
+!16 = !MDLocation(line: 4, column: 43, scope: !1)
+!17 = !MDLocation(line: 9, column: 2, scope: !11)
+!18 = !{i32 0}
+!19 = !MDLocation(line: 10, column: 2, scope: !11)
+!20 = !MDLocation(line: 11, column: 2, scope: !11)
+!21 = !MDLocation(line: 12, column: 3, scope: !22)
+!22 = !{!"0xb\0011\0045\000", !26, !11} ; [ DW_TAG_lexical_block ]
+!23 = !MDLocation(line: 13, column: 3, scope: !22)
+!24 = !MDLocation(line: 14, column: 2, scope: !22)
+!25 = !MDLocation(line: 15, column: 1, scope: !11)
+!26 = !{!"t.c", !"/private/tmp"}
diff --git a/test/CodeGen/ARM/2010-08-04-StackVariable.ll b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
index f10408c..f71a6c9 100644
--- a/test/CodeGen/ARM/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
@@ -6,8 +6,8 @@
 define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !23, metadata !{!"0x102"}), !dbg !24
+  call void @llvm.dbg.value(metadata %struct.SVal* %location, i64 0, metadata !25, metadata !{!"0x102"}), !dbg !24
   %0 = icmp ne i32 %i, 0, !dbg !27                ; <i1> [#uses=1]
   br i1 %0, label %bb, label %bb1, !dbg !27
 
@@ -34,7 +34,7 @@ return:                                           ; preds = %bb2
 define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2  {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !34
+  call void @llvm.dbg.value(metadata %struct.SVal* %this, i64 0, metadata !31, metadata !{!"0x102"}), !dbg !34
   %0 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 0, !dbg !34 ; <i8**> [#uses=1]
   store i8* null, i8** %0, align 8, !dbg !34
   %1 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 1, !dbg !34 ; <i32*> [#uses=1]
@@ -52,7 +52,7 @@ entry:
   %0 = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=3]
   %v = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=4]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38, metadata !{metadata !"0x102"}), !dbg !41
+  call void @llvm.dbg.declare(metadata %struct.SVal* %v, metadata !38, metadata !{!"0x102"}), !dbg !41
   call void @_ZN4SValC1Ev(%struct.SVal* %v) nounwind, !dbg !41
   %1 = getelementptr inbounds %struct.SVal* %v, i32 0, i32 1, !dbg !42 ; <i32*> [#uses=1]
   store i32 1, i32* %1, align 8, !dbg !42
@@ -65,7 +65,7 @@ entry:
   %7 = load i32* %6, align 8, !dbg !43            ; <i32> [#uses=1]
   store i32 %7, i32* %5, align 8, !dbg !43
   %8 = call i32 @_Z3fooi4SVal(i32 2, %struct.SVal* noalias %0) nounwind, !dbg !43 ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44, metadata !{metadata !"0x102"}), !dbg !43
+  call void @llvm.dbg.value(metadata i32 %8, i64 0, metadata !44, metadata !{!"0x102"}), !dbg !43
   br label %return, !dbg !45
 
 return:                                           ; preds = %entry
@@ -77,53 +77,53 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!49}
 
-!0 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0011\000\000\000\006\000\000\000", metadata !48, metadata !1, metadata !14, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x13\00SVal\001\00128\0064\000\000\000", metadata !48, null, null, metadata !4, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
-!2 = metadata !{metadata !"0x29", metadata !48} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\001", metadata !48, metadata !47, metadata !47, metadata !46, metadata !47,  metadata !47} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9}
-!5 = metadata !{metadata !"0xd\00Data\007\0064\0064\000\000", metadata !48, metadata !1, metadata !6} ; [ DW_TAG_member ]
-!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !48, null, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{metadata !"0xd\00Kind\008\0032\0032\0064\000", metadata !48, metadata !1, metadata !8} ; [ DW_TAG_member ]
-!8 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !48, null} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00\0012\000\000\000\006\000\000\000", metadata !48, metadata !1, metadata !10, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{null, metadata !12, metadata !13}
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !48, null, metadata !1} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !48, null} ; [ DW_TAG_base_type ]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{null, metadata !12}
-!16 = metadata !{metadata !"0x2e\00SVal\00SVal\00_ZN4SValC1Ev\0011\000\001\000\006\000\000\000", metadata !48, metadata !1, metadata !14, null, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null} ; [ DW_TAG_subprogram ]
-!17 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi4SVal\0016\000\001\000\006\000\000\000", metadata !48, metadata !2, metadata !18, null, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null} ; [ DW_TAG_subprogram ]
-!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!19 = metadata !{metadata !13, metadata !13, metadata !1}
-!20 = metadata !{metadata !"0x2e\00main\00main\00main\0023\000\001\000\006\000\000\000", metadata !48, metadata !2, metadata !21, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
-!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!22 = metadata !{metadata !13}
-!23 = metadata !{metadata !"0x101\00i\0016\000", metadata !17, metadata !2, metadata !13} ; [ DW_TAG_arg_variable ]
-!24 = metadata !{i32 16, i32 0, metadata !17, null}
-!25 = metadata !{metadata !"0x101\00location\0016\000", metadata !17, metadata !2, metadata !26} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{metadata !"0x10\00SVal\000\0064\0064\000\000", metadata !48, metadata !2, metadata !1} ; [ DW_TAG_reference_type ]
-!27 = metadata !{i32 17, i32 0, metadata !28, null}
-!28 = metadata !{metadata !"0xb\0016\000\002", metadata !2, metadata !17} ; [ DW_TAG_lexical_block ]
-!29 = metadata !{i32 18, i32 0, metadata !28, null}
-!30 = metadata !{i32 20, i32 0, metadata !28, null}
-!31 = metadata !{metadata !"0x101\00this\0011\000", metadata !16, metadata !2, metadata !32} ; [ DW_TAG_arg_variable ]
-!32 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !48, metadata !2, metadata !33} ; [ DW_TAG_const_type ]
-!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !48, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ]
-!34 = metadata !{i32 11, i32 0, metadata !16, null}
-!35 = metadata !{i32 11, i32 0, metadata !36, null}
-!36 = metadata !{metadata !"0xb\0011\000\001", metadata !48, metadata !37} ; [ DW_TAG_lexical_block ]
-!37 = metadata !{metadata !"0xb\0011\000\000", metadata !48, metadata !16} ; [ DW_TAG_lexical_block ]
-!38 = metadata !{metadata !"0x100\00v\0024\000", metadata !39, metadata !2, metadata !1} ; [ DW_TAG_auto_variable ]
-!39 = metadata !{metadata !"0xb\0023\000\004", metadata !48, metadata !40} ; [ DW_TAG_lexical_block ]
-!40 = metadata !{metadata !"0xb\0023\000\003", metadata !48, metadata !20} ; [ DW_TAG_lexical_block ]
-!41 = metadata !{i32 24, i32 0, metadata !39, null}
-!42 = metadata !{i32 25, i32 0, metadata !39, null}
-!43 = metadata !{i32 26, i32 0, metadata !39, null}
-!44 = metadata !{metadata !"0x100\00k\0026\000", metadata !39, metadata !2, metadata !13} ; [ DW_TAG_auto_variable ]
-!45 = metadata !{i32 27, i32 0, metadata !39, null}
-!46 = metadata !{metadata !16, metadata !17, metadata !20}
-!47 = metadata !{}
-!48 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00SVal\00SVal\00\0011\000\000\000\006\000\000\000", !48, !1, !14, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x13\00SVal\001\00128\0064\000\000\000", !48, null, null, !4, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
+!2 = !{!"0x29", !48} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\001", !48, !47, !47, !46, !47,  !47} ; [ DW_TAG_compile_unit ]
+!4 = !{!5, !7, !0, !9}
+!5 = !{!"0xd\00Data\007\0064\0064\000\000", !48, !1, !6} ; [ DW_TAG_member ]
+!6 = !{!"0xf\00\000\0064\0064\000\000", !48, null, null} ; [ DW_TAG_pointer_type ]
+!7 = !{!"0xd\00Kind\008\0032\0032\0064\000", !48, !1, !8} ; [ DW_TAG_member ]
+!8 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", !48, null} ; [ DW_TAG_base_type ]
+!9 = !{!"0x2e\00~SVal\00~SVal\00\0012\000\000\000\006\000\000\000", !48, !1, !10, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = !{!"0x15\00\000\000\000\000\000\000", !48, null, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{null, !12, !13}
+!12 = !{!"0xf\00\000\0064\0064\000\0064", !48, null, !1} ; [ DW_TAG_pointer_type ]
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", !48, null} ; [ DW_TAG_base_type ]
+!14 = !{!"0x15\00\000\000\000\000\000\000", !48, null, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{null, !12}
+!16 = !{!"0x2e\00SVal\00SVal\00_ZN4SValC1Ev\0011\000\001\000\006\000\000\000", !48, !1, !14, null, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = !{!"0x2e\00foo\00foo\00_Z3fooi4SVal\0016\000\001\000\006\000\000\000", !48, !2, !18, null, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null} ; [ DW_TAG_subprogram ]
+!18 = !{!"0x15\00\000\000\000\000\000\000", !48, null, null, !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = !{!13, !13, !1}
+!20 = !{!"0x2e\00main\00main\00main\0023\000\001\000\006\000\000\000", !48, !2, !21, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!21 = !{!"0x15\00\000\000\000\000\000\000", !48, null, null, !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = !{!13}
+!23 = !{!"0x101\00i\0016\000", !17, !2, !13} ; [ DW_TAG_arg_variable ]
+!24 = !MDLocation(line: 16, scope: !17)
+!25 = !{!"0x101\00location\0016\000", !17, !2, !26} ; [ DW_TAG_arg_variable ]
+!26 = !{!"0x10\00SVal\000\0064\0064\000\000", !48, !2, !1} ; [ DW_TAG_reference_type ]
+!27 = !MDLocation(line: 17, scope: !28)
+!28 = !{!"0xb\0016\000\002", !2, !17} ; [ DW_TAG_lexical_block ]
+!29 = !MDLocation(line: 18, scope: !28)
+!30 = !MDLocation(line: 20, scope: !28)
+!31 = !{!"0x101\00this\0011\000", !16, !2, !32} ; [ DW_TAG_arg_variable ]
+!32 = !{!"0x26\00\000\0064\0064\000\0064", !48, !2, !33} ; [ DW_TAG_const_type ]
+!33 = !{!"0xf\00\000\0064\0064\000\000", !48, !2, !1} ; [ DW_TAG_pointer_type ]
+!34 = !MDLocation(line: 11, scope: !16)
+!35 = !MDLocation(line: 11, scope: !36)
+!36 = !{!"0xb\0011\000\001", !48, !37} ; [ DW_TAG_lexical_block ]
+!37 = !{!"0xb\0011\000\000", !48, !16} ; [ DW_TAG_lexical_block ]
+!38 = !{!"0x100\00v\0024\000", !39, !2, !1} ; [ DW_TAG_auto_variable ]
+!39 = !{!"0xb\0023\000\004", !48, !40} ; [ DW_TAG_lexical_block ]
+!40 = !{!"0xb\0023\000\003", !48, !20} ; [ DW_TAG_lexical_block ]
+!41 = !MDLocation(line: 24, scope: !39)
+!42 = !MDLocation(line: 25, scope: !39)
+!43 = !MDLocation(line: 26, scope: !39)
+!44 = !{!"0x100\00k\0026\000", !39, !2, !13} ; [ DW_TAG_auto_variable ]
+!45 = !MDLocation(line: 27, scope: !39)
+!46 = !{!16, !17, !20}
+!47 = !{}
+!48 = !{!"small.cc", !"/Users/manav/R8248330"}
+!49 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
index 7fbd3ba..67dda67 100644
--- a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
@@ -30,9 +30,9 @@ target triple = "thumbv7-apple-darwin10"
 
 define zeroext i8 @get1(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !30
   %0 = load i8* @x1, align 4, !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !11, metadata !{!"0x102"}), !dbg !30
   store i8 %a, i8* @x1, align 4, !dbg !30
   ret i8 %0, !dbg !31
 }
@@ -41,36 +41,36 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 define zeroext i8 @get2(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !32
+  tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !32
   %0 = load i8* @x2, align 4, !dbg !32
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !32
+  tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !19, metadata !{!"0x102"}), !dbg !32
   store i8 %a, i8* @x2, align 4, !dbg !32
   ret i8 %0, !dbg !33
 }
 
 define zeroext i8 @get3(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !34
+  tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !21, metadata !{!"0x102"}), !dbg !34
   %0 = load i8* @x3, align 4, !dbg !34
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !34
+  tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !22, metadata !{!"0x102"}), !dbg !34
   store i8 %a, i8* @x3, align 4, !dbg !34
   ret i8 %0, !dbg !35
 }
 
 define zeroext i8 @get4(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !24, metadata !{metadata !"0x102"}), !dbg !36
+  tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !24, metadata !{!"0x102"}), !dbg !36
   %0 = load i8* @x4, align 4, !dbg !36
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !36
+  tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !25, metadata !{!"0x102"}), !dbg !36
   store i8 %a, i8* @x4, align 4, !dbg !36
   ret i8 %0, !dbg !37
 }
 
 define zeroext i8 @get5(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !38
+  tail call void @llvm.dbg.value(metadata i8 %a, i64 0, metadata !27, metadata !{!"0x102"}), !dbg !38
   %0 = load i8* @x5, align 4, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !38
+  tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !28, metadata !{!"0x102"}), !dbg !38
   store i8 %a, i8* @x5, align 4, !dbg !38
   ret i8 %0, !dbg !39
 }
@@ -78,53 +78,53 @@ entry:
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!49}
 
-!0 = metadata !{metadata !"0x2e\00get1\00get1\00get1\004\000\001\000\006\00256\001\004", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get1, null, null, metadata !42} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !47} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)\001\00\000\00\000", metadata !47, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !48} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5, metadata !5}
-!5 = metadata !{metadata !"0x24\00_Bool\000\008\008\000\000\002", metadata !47, metadata !1} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00get2\00get2\00get2\007\000\001\000\006\00256\001\007", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get2, null, null, metadata !43} ; [ DW_TAG_subprogram ]
-!7 = metadata !{metadata !"0x2e\00get3\00get3\00get3\0010\000\001\000\006\00256\001\0010", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get3, null, null, metadata !44} ; [ DW_TAG_subprogram ]
-!8 = metadata !{metadata !"0x2e\00get4\00get4\00get4\0013\000\001\000\006\00256\001\0013", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get4, null, null, metadata !45} ; [ DW_TAG_subprogram ]
-!9 = metadata !{metadata !"0x2e\00get5\00get5\00get5\0016\000\001\000\006\00256\001\0016", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get5, null, null, metadata !46} ; [ DW_TAG_subprogram ]
-!10 = metadata !{metadata !"0x101\00a\004\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{metadata !"0x100\00b\004\000", metadata !12, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!12 = metadata !{metadata !"0xb\004\000\000", metadata !47, metadata !0} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{metadata !"0x34\00x1\00x1\00\003\001\001", metadata !1, metadata !1, metadata !5, i8* @x1, null} ; [ DW_TAG_variable ]
-!14 = metadata !{metadata !"0x34\00x2\00x2\00\006\001\001", metadata !1, metadata !1, metadata !5, i8* @x2, null} ; [ DW_TAG_variable ]
-!15 = metadata !{metadata !"0x34\00x3\00x3\00\009\001\001", metadata !1, metadata !1, metadata !5, i8* @x3, null} ; [ DW_TAG_variable ]
-!16 = metadata !{metadata !"0x34\00x4\00x4\00\0012\001\001", metadata !1, metadata !1, metadata !5, i8* @x4, null} ; [ DW_TAG_variable ]
-!17 = metadata !{metadata !"0x34\00x5\00x5\00\0015\000\001", metadata !1, metadata !1, metadata !5, i8* @x5, null} ; [ DW_TAG_variable ]
-!18 = metadata !{metadata !"0x101\00a\007\000", metadata !6, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{metadata !"0x100\00b\007\000", metadata !20, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!20 = metadata !{metadata !"0xb\007\000\001", metadata !47, metadata !6} ; [ DW_TAG_lexical_block ]
-!21 = metadata !{metadata !"0x101\00a\0010\000", metadata !7, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{metadata !"0x100\00b\0010\000", metadata !23, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!23 = metadata !{metadata !"0xb\0010\000\002", metadata !47, metadata !7} ; [ DW_TAG_lexical_block ]
-!24 = metadata !{metadata !"0x101\00a\0013\000", metadata !8, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!25 = metadata !{metadata !"0x100\00b\0013\000", metadata !26, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!26 = metadata !{metadata !"0xb\0013\000\003", metadata !47, metadata !8} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{metadata !"0x101\00a\0016\000", metadata !9, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!28 = metadata !{metadata !"0x100\00b\0016\000", metadata !29, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{metadata !"0xb\0016\000\004", metadata !47, metadata !9} ; [ DW_TAG_lexical_block ]
-!30 = metadata !{i32 4, i32 0, metadata !0, null}
-!31 = metadata !{i32 4, i32 0, metadata !12, null}
-!32 = metadata !{i32 7, i32 0, metadata !6, null}
-!33 = metadata !{i32 7, i32 0, metadata !20, null}
-!34 = metadata !{i32 10, i32 0, metadata !7, null}
-!35 = metadata !{i32 10, i32 0, metadata !23, null}
-!36 = metadata !{i32 13, i32 0, metadata !8, null}
-!37 = metadata !{i32 13, i32 0, metadata !26, null}
-!38 = metadata !{i32 16, i32 0, metadata !9, null}
-!39 = metadata !{i32 16, i32 0, metadata !29, null}
-!40 = metadata !{metadata !0, metadata !6, metadata !7, metadata !8, metadata !9}
-!41 = metadata !{metadata !13, metadata !14, metadata !15, metadata !16, metadata !17}
-!42 = metadata !{metadata !10, metadata !11}
-!43 = metadata !{metadata !18, metadata !19}
-!44 = metadata !{metadata !21, metadata !22}
-!45 = metadata !{metadata !24, metadata !25}
-!46 = metadata !{metadata !27, metadata !28}
-!47 = metadata !{metadata !"foo.c", metadata !"/tmp/"}
-!48 = metadata !{}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00get1\00get1\00get1\004\000\001\000\006\00256\001\004", !47, !1, !3, null, i8 (i8)* @get1, null, null, !42} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !47} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)\001\00\000\00\000", !47, !48, !48, !40, !41,  !48} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !47, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5, !5}
+!5 = !{!"0x24\00_Bool\000\008\008\000\000\002", !47, !1} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00get2\00get2\00get2\007\000\001\000\006\00256\001\007", !47, !1, !3, null, i8 (i8)* @get2, null, null, !43} ; [ DW_TAG_subprogram ]
+!7 = !{!"0x2e\00get3\00get3\00get3\0010\000\001\000\006\00256\001\0010", !47, !1, !3, null, i8 (i8)* @get3, null, null, !44} ; [ DW_TAG_subprogram ]
+!8 = !{!"0x2e\00get4\00get4\00get4\0013\000\001\000\006\00256\001\0013", !47, !1, !3, null, i8 (i8)* @get4, null, null, !45} ; [ DW_TAG_subprogram ]
+!9 = !{!"0x2e\00get5\00get5\00get5\0016\000\001\000\006\00256\001\0016", !47, !1, !3, null, i8 (i8)* @get5, null, null, !46} ; [ DW_TAG_subprogram ]
+!10 = !{!"0x101\00a\004\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!11 = !{!"0x100\00b\004\000", !12, !1, !5} ; [ DW_TAG_auto_variable ]
+!12 = !{!"0xb\004\000\000", !47, !0} ; [ DW_TAG_lexical_block ]
+!13 = !{!"0x34\00x1\00x1\00\003\001\001", !1, !1, !5, i8* @x1, null} ; [ DW_TAG_variable ]
+!14 = !{!"0x34\00x2\00x2\00\006\001\001", !1, !1, !5, i8* @x2, null} ; [ DW_TAG_variable ]
+!15 = !{!"0x34\00x3\00x3\00\009\001\001", !1, !1, !5, i8* @x3, null} ; [ DW_TAG_variable ]
+!16 = !{!"0x34\00x4\00x4\00\0012\001\001", !1, !1, !5, i8* @x4, null} ; [ DW_TAG_variable ]
+!17 = !{!"0x34\00x5\00x5\00\0015\000\001", !1, !1, !5, i8* @x5, null} ; [ DW_TAG_variable ]
+!18 = !{!"0x101\00a\007\000", !6, !1, !5} ; [ DW_TAG_arg_variable ]
+!19 = !{!"0x100\00b\007\000", !20, !1, !5} ; [ DW_TAG_auto_variable ]
+!20 = !{!"0xb\007\000\001", !47, !6} ; [ DW_TAG_lexical_block ]
+!21 = !{!"0x101\00a\0010\000", !7, !1, !5} ; [ DW_TAG_arg_variable ]
+!22 = !{!"0x100\00b\0010\000", !23, !1, !5} ; [ DW_TAG_auto_variable ]
+!23 = !{!"0xb\0010\000\002", !47, !7} ; [ DW_TAG_lexical_block ]
+!24 = !{!"0x101\00a\0013\000", !8, !1, !5} ; [ DW_TAG_arg_variable ]
+!25 = !{!"0x100\00b\0013\000", !26, !1, !5} ; [ DW_TAG_auto_variable ]
+!26 = !{!"0xb\0013\000\003", !47, !8} ; [ DW_TAG_lexical_block ]
+!27 = !{!"0x101\00a\0016\000", !9, !1, !5} ; [ DW_TAG_arg_variable ]
+!28 = !{!"0x100\00b\0016\000", !29, !1, !5} ; [ DW_TAG_auto_variable ]
+!29 = !{!"0xb\0016\000\004", !47, !9} ; [ DW_TAG_lexical_block ]
+!30 = !MDLocation(line: 4, scope: !0)
+!31 = !MDLocation(line: 4, scope: !12)
+!32 = !MDLocation(line: 7, scope: !6)
+!33 = !MDLocation(line: 7, scope: !20)
+!34 = !MDLocation(line: 10, scope: !7)
+!35 = !MDLocation(line: 10, scope: !23)
+!36 = !MDLocation(line: 13, scope: !8)
+!37 = !MDLocation(line: 13, scope: !26)
+!38 = !MDLocation(line: 16, scope: !9)
+!39 = !MDLocation(line: 16, scope: !29)
+!40 = !{!0, !6, !7, !8, !9}
+!41 = !{!13, !14, !15, !16, !17}
+!42 = !{!10, !11}
+!43 = !{!18, !19}
+!44 = !{!21, !22}
+!45 = !{!24, !25}
+!46 = !{!27, !28}
+!47 = !{!"foo.c", !"/tmp/"}
+!48 = !{}
+!49 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/2011-04-12-FastRegAlloc.ll b/test/CodeGen/ARM/2011-04-12-FastRegAlloc.ll
index eb23de0..e9a6793 100644
--- a/test/CodeGen/ARM/2011-04-12-FastRegAlloc.ll
+++ b/test/CodeGen/ARM/2011-04-12-FastRegAlloc.ll
@@ -12,4 +12,4 @@ entry:
   ret void
 }
 
-!0 = metadata !{i32 109}
+!0 = !{i32 109}
diff --git a/test/CodeGen/ARM/2011-05-04-MultipleLandingPadSuccs.ll b/test/CodeGen/ARM/2011-05-04-MultipleLandingPadSuccs.ll
index d3394b5..2af3e3e 100644
--- a/test/CodeGen/ARM/2011-05-04-MultipleLandingPadSuccs.ll
+++ b/test/CodeGen/ARM/2011-05-04-MultipleLandingPadSuccs.ll
@@ -81,8 +81,8 @@ declare void @_Unwind_SjLj_Resume_or_Rethrow(i8*)
 
 declare void @_ZSt9terminatev()
 
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"bool", metadata !1}
-!4 = metadata !{metadata !"int", metadata !1}
+!0 = !{!"any pointer", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA", null}
+!3 = !{!"bool", !1}
+!4 = !{!"int", !1}
diff --git a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
index ede936c..3edc946 100644
--- a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
@@ -29,41 +29,41 @@ target triple = "thumbv7-apple-macosx10.7.0"
 @x5 = global i32 0, align 4
 
 define i32 @get1(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !30
   %1 = load i32* @x1, align 4, !dbg !31
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !31
+  tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !11, metadata !{!"0x102"}), !dbg !31
   store i32 %a, i32* @x1, align 4, !dbg !31
   ret i32 %1, !dbg !31
 }
 
 define i32 @get2(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !32
+  tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !32
   %1 = load i32* @x2, align 4, !dbg !33
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !33
+  tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !33
   store i32 %a, i32* @x2, align 4, !dbg !33
   ret i32 %1, !dbg !33
 }
 
 define i32 @get3(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !34
+  tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !34
   %1 = load i32* @x3, align 4, !dbg !35
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !35
+  tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !35
   store i32 %a, i32* @x3, align 4, !dbg !35
   ret i32 %1, !dbg !35
 }
 
 define i32 @get4(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !36
+  tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !19, metadata !{!"0x102"}), !dbg !36
   %1 = load i32* @x4, align 4, !dbg !37
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !20, metadata !{metadata !"0x102"}), !dbg !37
+  tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !20, metadata !{!"0x102"}), !dbg !37
   store i32 %a, i32* @x4, align 4, !dbg !37
   ret i32 %1, !dbg !37
 }
 
 define i32 @get5(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !38
+  tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !27, metadata !{!"0x102"}), !dbg !38
   %1 = load i32* @x5, align 4, !dbg !39
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !39
+  tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !28, metadata !{!"0x102"}), !dbg !39
   store i32 %a, i32* @x5, align 4, !dbg !39
   ret i32 %1, !dbg !39
 }
@@ -73,50 +73,50 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!49}
 
-!0 = metadata !{metadata !"0x11\0012\00clang\001\00\000\00\001", metadata !47, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !48} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00get1\00get1\00\005\000\001\000\006\00256\001\005", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get1, null, null, metadata !42} ; [ DW_TAG_subprogram ] [line 5] [def] [get1]
-!2 = metadata !{metadata !"0x29", metadata !47} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00get2\00get2\00\008\000\001\000\006\00256\001\008", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get2, null, null, metadata !43} ; [ DW_TAG_subprogram ] [line 8] [def] [get2]
-!7 = metadata !{metadata !"0x2e\00get3\00get3\00\0011\000\001\000\006\00256\001\0011", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get3, null, null, metadata !44} ; [ DW_TAG_subprogram ] [line 11] [def] [get3]
-!8 = metadata !{metadata !"0x2e\00get4\00get4\00\0014\000\001\000\006\00256\001\0014", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get4, null, null, metadata !45} ; [ DW_TAG_subprogram ] [line 14] [def] [get4]
-!9 = metadata !{metadata !"0x2e\00get5\00get5\00\0017\000\001\000\006\00256\001\0017", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get5, null, null, metadata !46} ; [ DW_TAG_subprogram ] [line 17] [def] [get5]
-!10 = metadata !{metadata !"0x101\00a\0016777221\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{metadata !"0x100\00b\005\000", metadata !12, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
-!12 = metadata !{metadata !"0xb\005\0019\000", metadata !47, metadata !1} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{metadata !"0x101\00a\0016777224\000", metadata !6, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
-!14 = metadata !{metadata !"0x100\00b\008\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{metadata !"0xb\008\0017\001", metadata !47, metadata !6} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{metadata !"0x101\00a\0016777227\000", metadata !7, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{metadata !"0x100\00b\0011\000", metadata !18, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{metadata !"0xb\0011\0019\002", metadata !47, metadata !7} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{metadata !"0x101\00a\0016777230\000", metadata !8, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{metadata !"0x100\00b\0014\000", metadata !21, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
-!21 = metadata !{metadata !"0xb\0014\0019\003", metadata !47, metadata !8} ; [ DW_TAG_lexical_block ]
-!25 = metadata !{metadata !"0x34\00x1\00x1\00\004\001\001", metadata !0, metadata !2, metadata !5, i32* @x1, null} ; [ DW_TAG_variable ]
-!26 = metadata !{metadata !"0x34\00x2\00x2\00\007\001\001", metadata !0, metadata !2, metadata !5, i32* @x2, null} ; [ DW_TAG_variable ]
-!27 = metadata !{metadata !"0x101\00a\0016777233\000", metadata !9, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
-!28 = metadata !{metadata !"0x100\00b\0017\000", metadata !29, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{metadata !"0xb\0017\0019\004", metadata !47, metadata !9} ; [ DW_TAG_lexical_block ]
-!30 = metadata !{i32 5, i32 16, metadata !1, null}
-!31 = metadata !{i32 5, i32 32, metadata !12, null}
-!32 = metadata !{i32 8, i32 14, metadata !6, null}
-!33 = metadata !{i32 8, i32 29, metadata !15, null}
-!34 = metadata !{i32 11, i32 16, metadata !7, null}
-!35 = metadata !{i32 11, i32 32, metadata !18, null}
-!36 = metadata !{i32 14, i32 16, metadata !8, null}
-!37 = metadata !{i32 14, i32 32, metadata !21, null}
-!38 = metadata !{i32 17, i32 16, metadata !9, null}
-!39 = metadata !{i32 17, i32 32, metadata !29, null}
-!40 = metadata !{metadata !1, metadata !6, metadata !7, metadata !8, metadata !9}
-!41 = metadata !{metadata !25, metadata !26}
-!42 = metadata !{metadata !10, metadata !11}
-!43 = metadata !{metadata !13, metadata !14}
-!44 = metadata !{metadata !16, metadata !17}
-!45 = metadata !{metadata !19, metadata !20}
-!46 = metadata !{metadata !27, metadata !28}
-!47 = metadata !{metadata !"ss3.c", metadata !"/private/tmp"}
-!48 = metadata !{}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang\001\00\000\00\001", !47, !48, !48, !40, !41,  !48} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00get1\00get1\00\005\000\001\000\006\00256\001\005", !47, !2, !3, null, i32 (i32)* @get1, null, null, !42} ; [ DW_TAG_subprogram ] [line 5] [def] [get1]
+!2 = !{!"0x29", !47} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !47, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00get2\00get2\00\008\000\001\000\006\00256\001\008", !47, !2, !3, null, i32 (i32)* @get2, null, null, !43} ; [ DW_TAG_subprogram ] [line 8] [def] [get2]
+!7 = !{!"0x2e\00get3\00get3\00\0011\000\001\000\006\00256\001\0011", !47, !2, !3, null, i32 (i32)* @get3, null, null, !44} ; [ DW_TAG_subprogram ] [line 11] [def] [get3]
+!8 = !{!"0x2e\00get4\00get4\00\0014\000\001\000\006\00256\001\0014", !47, !2, !3, null, i32 (i32)* @get4, null, null, !45} ; [ DW_TAG_subprogram ] [line 14] [def] [get4]
+!9 = !{!"0x2e\00get5\00get5\00\0017\000\001\000\006\00256\001\0017", !47, !2, !3, null, i32 (i32)* @get5, null, null, !46} ; [ DW_TAG_subprogram ] [line 17] [def] [get5]
+!10 = !{!"0x101\00a\0016777221\000", !1, !2, !5} ; [ DW_TAG_arg_variable ]
+!11 = !{!"0x100\00b\005\000", !12, !2, !5} ; [ DW_TAG_auto_variable ]
+!12 = !{!"0xb\005\0019\000", !47, !1} ; [ DW_TAG_lexical_block ]
+!13 = !{!"0x101\00a\0016777224\000", !6, !2, !5} ; [ DW_TAG_arg_variable ]
+!14 = !{!"0x100\00b\008\000", !15, !2, !5} ; [ DW_TAG_auto_variable ]
+!15 = !{!"0xb\008\0017\001", !47, !6} ; [ DW_TAG_lexical_block ]
+!16 = !{!"0x101\00a\0016777227\000", !7, !2, !5} ; [ DW_TAG_arg_variable ]
+!17 = !{!"0x100\00b\0011\000", !18, !2, !5} ; [ DW_TAG_auto_variable ]
+!18 = !{!"0xb\0011\0019\002", !47, !7} ; [ DW_TAG_lexical_block ]
+!19 = !{!"0x101\00a\0016777230\000", !8, !2, !5} ; [ DW_TAG_arg_variable ]
+!20 = !{!"0x100\00b\0014\000", !21, !2, !5} ; [ DW_TAG_auto_variable ]
+!21 = !{!"0xb\0014\0019\003", !47, !8} ; [ DW_TAG_lexical_block ]
+!25 = !{!"0x34\00x1\00x1\00\004\001\001", !0, !2, !5, i32* @x1, null} ; [ DW_TAG_variable ]
+!26 = !{!"0x34\00x2\00x2\00\007\001\001", !0, !2, !5, i32* @x2, null} ; [ DW_TAG_variable ]
+!27 = !{!"0x101\00a\0016777233\000", !9, !2, !5} ; [ DW_TAG_arg_variable ]
+!28 = !{!"0x100\00b\0017\000", !29, !2, !5} ; [ DW_TAG_auto_variable ]
+!29 = !{!"0xb\0017\0019\004", !47, !9} ; [ DW_TAG_lexical_block ]
+!30 = !MDLocation(line: 5, column: 16, scope: !1)
+!31 = !MDLocation(line: 5, column: 32, scope: !12)
+!32 = !MDLocation(line: 8, column: 14, scope: !6)
+!33 = !MDLocation(line: 8, column: 29, scope: !15)
+!34 = !MDLocation(line: 11, column: 16, scope: !7)
+!35 = !MDLocation(line: 11, column: 32, scope: !18)
+!36 = !MDLocation(line: 14, column: 16, scope: !8)
+!37 = !MDLocation(line: 14, column: 32, scope: !21)
+!38 = !MDLocation(line: 17, column: 16, scope: !9)
+!39 = !MDLocation(line: 17, column: 32, scope: !29)
+!40 = !{!1, !6, !7, !8, !9}
+!41 = !{!25, !26}
+!42 = !{!10, !11}
+!43 = !{!13, !14}
+!44 = !{!16, !17}
+!45 = !{!19, !20}
+!46 = !{!27, !28}
+!47 = !{!"ss3.c", !"/private/tmp"}
+!48 = !{}
+!49 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll b/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll
index b3a7e34..69d72bd 100644
--- a/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll
+++ b/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll
@@ -65,7 +65,7 @@ declare i32 @__gxx_personality_sj0(...)
 
 !llvm.module.flags = !{!0, !1, !2, !3}
 
-!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
+!0 = !{i32 1, !"Objective-C Version", i32 2}
+!1 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
diff --git a/test/CodeGen/ARM/2012-08-04-DtripleSpillReload.ll b/test/CodeGen/ARM/2012-08-04-DtripleSpillReload.ll
index adb5c7e..70e3079 100644
--- a/test/CodeGen/ARM/2012-08-04-DtripleSpillReload.ll
+++ b/test/CodeGen/ARM/2012-08-04-DtripleSpillReload.ll
@@ -169,4 +169,4 @@ define arm_aapcs_vfpcc void @foo(float, i1 zeroext, i1 zeroext) nounwind uwtable
 
 declare arm_aapcs_vfpcc void @bar(%0*, float)
 
-!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!0 = !{!"branch_weights", i32 64, i32 4}
diff --git a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll
index 5235e9c..53860ea 100644
--- a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll
+++ b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll
@@ -8,4 +8,4 @@ define void @f() nounwind ssp {
   ret void
 }
 
-!0 = metadata !{i32 318437}
+!0 = !{i32 318437}
diff --git a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
index d389b5c..b47247c 100644
--- a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
+++ b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
@@ -8,4 +8,4 @@ define hidden void @f(i32* %corr, i32 %order) nounwind ssp {
   ret void
 }
 
-!0 = metadata !{i32 257}
+!0 = !{i32 257}
diff --git a/test/CodeGen/ARM/2014-08-04-muls-it.ll b/test/CodeGen/ARM/2014-08-04-muls-it.ll
index 4636bff..5ba1347 100644
--- a/test/CodeGen/ARM/2014-08-04-muls-it.ll
+++ b/test/CodeGen/ARM/2014-08-04-muls-it.ll
@@ -17,9 +17,7 @@ if.end:                                           ; preds = %if.then, %entry
 
 ; CHECK-LABEL: function
 ; CHECK: cmp r0, r1
-; CHECK: bne [[LABEL:[.*]]]
 ; CHECK-NOT: mulseq r0, r0, r0
-; CHECK: [[LABEL]]
-; CHECK: muls r0, r0, r0
+; CHECK: muleq r0, r0, r0
 ; CHECK: bx lr
 
diff --git a/test/CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll b/test/CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll
new file mode 100644
index 0000000..de2dead
--- /dev/null
+++ b/test/CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=thumbv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V4T
+; RUN: llc -mtriple=thumbv6m-none--eabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V6M
+
+; CHECK-LABEL: test1
+define i32 @test1(i32* %p) {
+
+; Offsets less than 8 can be generated in a single add
+; CHECK: adds [[NEWBASE:r[0-9]]], r0, #4
+  %1 = getelementptr inbounds i32* %p, i32 1
+  %2 = getelementptr inbounds i32* %p, i32 2
+  %3 = getelementptr inbounds i32* %p, i32 3
+  %4 = getelementptr inbounds i32* %p, i32 4
+
+; CHECK-NEXT: ldm [[NEWBASE]],
+  %5 = load i32* %1, align 4
+  %6 = load i32* %2, align 4
+  %7 = load i32* %3, align 4
+  %8 = load i32* %4, align 4
+
+  %9 = add nsw i32 %5, %6
+  %10 = add nsw i32 %9, %7
+  %11 = add nsw i32 %10, %8
+  ret i32 %11
+}
+
+; CHECK-LABEL: test2
+define i32 @test2(i32* %p) {
+
+; Offsets >=8 require a mov and an add
+; CHECK-V4T:  movs [[NEWBASE:r[0-9]]], r0
+; CHECK-V6M:  mov [[NEWBASE:r[0-9]]], r0
+; CHECK-NEXT: adds [[NEWBASE]], #8
+  %1 = getelementptr inbounds i32* %p, i32 2
+  %2 = getelementptr inbounds i32* %p, i32 3
+  %3 = getelementptr inbounds i32* %p, i32 4
+  %4 = getelementptr inbounds i32* %p, i32 5
+
+; CHECK-NEXT: ldm [[NEWBASE]],
+  %5 = load i32* %1, align 4
+  %6 = load i32* %2, align 4
+  %7 = load i32* %3, align 4
+  %8 = load i32* %4, align 4
+
+  %9 = add nsw i32 %5, %6
+  %10 = add nsw i32 %9, %7
+  %11 = add nsw i32 %10, %8
+  ret i32 %11
+}
diff --git a/test/CodeGen/ARM/Windows/read-only-data.ll b/test/CodeGen/ARM/Windows/read-only-data.ll
index 0ccb5ed..0438d68 100644
--- a/test/CodeGen/ARM/Windows/read-only-data.ll
+++ b/test/CodeGen/ARM/Windows/read-only-data.ll
@@ -10,6 +10,6 @@ entry:
   ret void
 }
 
-; CHECK: .section .rdata,"rd"
+; CHECK: .section .rdata,"dr"
 ; CHECK-NOT: .section ".rodata.str1.1"
 
diff --git a/test/CodeGen/ARM/Windows/stack-probe-non-default.ll b/test/CodeGen/ARM/Windows/stack-probe-non-default.ll
new file mode 100644
index 0000000..796bcdd
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/stack-probe-non-default.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple thumbv7-windows -mcpu cortex-a9 -o - %s \
+; RUN:     | FileCheck %s -check-prefix CHECK-DEFAULT-CODE-MODEL
+
+; RUN: llc -mtriple thumbv7-windows -mcpu cortex-a9 -code-model large -o - %s \
+; RUN:     | FileCheck %s -check-prefix CHECK-LARGE-CODE-MODEL
+
+declare dllimport arm_aapcs_vfpcc void @initialise(i8*)
+
+define dllexport arm_aapcs_vfpcc signext i8 @function(i32 %offset) #0 {
+entry:
+  %buffer = alloca [4096 x i8], align 1
+  %0 = getelementptr inbounds [4096 x i8]* %buffer, i32 0, i32 0
+  call arm_aapcs_vfpcc void @initialise(i8* %0)
+  %arrayidx = getelementptr inbounds [4096 x i8]* %buffer, i32 0, i32 %offset
+  %1 = load i8* %arrayidx, align 1
+  ret i8 %1
+}
+
+attributes #0 = { "stack-probe-size"="8096" }
+
+; CHECK-DEFAULT-CODE-MODEL-NOT: __chkstk
+; CHECK-DEFAULT-CODE-MODEL: sub.w sp, sp, #4096
+
+; CHECK-LARGE-CODE-MODEL-NOT: movw r12, :lower16:__chkstk
+; CHECK-LARGE-CODE-MODEL-NOT: movt r12, :upper16:__chkstk
+; CHECK-LARGE-CODE-MODEL: sub.w sp, sp, #4096
+
diff --git a/test/CodeGen/ARM/Windows/structors.ll b/test/CodeGen/ARM/Windows/structors.ll
index a1a9026..874b5bf 100644
--- a/test/CodeGen/ARM/Windows/structors.ll
+++ b/test/CodeGen/ARM/Windows/structors.ll
@@ -7,6 +7,6 @@ entry:
   ret void
 }
 
-; CHECK: .section .CRT$XCU,"rd"
+; CHECK: .section .CRT$XCU,"dr"
 ; CHECK: .long function
 
diff --git a/test/CodeGen/ARM/aggregate-padding.ll b/test/CodeGen/ARM/aggregate-padding.ll
new file mode 100644
index 0000000..bc46a9c
--- /dev/null
+++ b/test/CodeGen/ARM/aggregate-padding.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s
+
+; [2 x i64] should be contiguous when split (e.g. we shouldn't try to align all
+; i32 components to 64 bits). Also makes sure i64 based types are properly
+; aligned on the stack.
+define i64 @test_i64_contiguous_on_stack([8 x double], float, i32 %in, [2 x i64] %arg) nounwind {
+; CHECK-LABEL: test_i64_contiguous_on_stack:
+; CHECK-DAG: ldr [[LO0:r[0-9]+]], [sp, #8]
+; CHECK-DAG: ldr [[HI0:r[0-9]+]], [sp, #12]
+; CHECK-DAG: ldr [[LO1:r[0-9]+]], [sp, #16]
+; CHECK-DAG: ldr [[HI1:r[0-9]+]], [sp, #20]
+; CHECK: adds r0, [[LO0]], [[LO1]]
+; CHECK: adc r1, [[HI0]], [[HI1]]
+
+  %val1 = extractvalue [2 x i64] %arg, 0
+  %val2 = extractvalue [2 x i64] %arg, 1
+  %sum = add i64 %val1, %val2
+  ret i64 %sum
+}
+
+; [2 x i64] should try to use looks for 4 regs, not 8 (which might happen if the
+; i64 -> i32, i32 split wasn't handled correctly).
+define i64 @test_2xi64_uses_4_regs([8 x double], float, [2 x i64] %arg) nounwind {
+; CHECK-LABEL: test_2xi64_uses_4_regs:
+; CHECK-DAG: mov r0, r2
+; CHECK-DAG: mov r1, r3
+
+  %val = extractvalue [2 x i64] %arg, 1
+  ret i64 %val
+}
+
+; An aggregate should be able to split between registers and stack if there is
+; nothing else on the stack.
+define i32 @test_aggregates_split([8 x double], i32, [4 x i32] %arg) nounwind {
+; CHECK-LABEL: test_aggregates_split:
+; CHECK: ldr [[VAL3:r[0-9]+]], [sp]
+; CHECK: add r0, r1, [[VAL3]]
+
+  %val0 = extractvalue [4 x i32] %arg, 0
+  %val3 = extractvalue [4 x i32] %arg, 3
+  %sum = add i32 %val0, %val3
+  ret i32 %sum
+}
+
+; If an aggregate has to be moved entirely onto the stack, nothing should be
+; able to use r0-r3 any more. Also checks that [2 x i64] properly aligned when
+; it uses regs.
+define i32 @test_no_int_backfilling([8 x double], float, i32, [2 x i64], i32 %arg) nounwind {
+; CHECK-LABEL: test_no_int_backfilling:
+; CHECK: ldr r0, [sp, #24]
+  ret i32 %arg
+}
+
+; Even if the argument was successfully allocated as reg block, there should be
+; no backfillig to r1.
+define i32 @test_no_int_backfilling_regsonly(i32, [1 x i64], i32 %arg) {
+; CHECK-LABEL: test_no_int_backfilling_regsonly:
+; CHECK: ldr r0, [sp]
+  ret i32 %arg
+}
+
+; If an aggregate has to be moved entirely onto the stack, nothing should be
+; able to use r0-r3 any more.
+define float @test_no_float_backfilling([7 x double], [4 x i32], i32, [4 x double], float %arg) nounwind {
+; CHECK-LABEL: test_no_float_backfilling:
+; CHECK: vldr s0, [sp, #40]
+  ret float %arg
+}
+
+; They're a bit pointless, but types like [N x i8] should work as well.
+define i8 @test_i8_in_regs(i32, [3 x i8] %arg) {
+; CHECK-LABEL: test_i8_in_regs:
+; CHECK: add r0, r1, r3
+  %val0 = extractvalue [3 x i8] %arg, 0
+  %val2 = extractvalue [3 x i8] %arg, 2
+  %sum = add i8 %val0, %val2
+  ret i8 %sum
+}
+
+define i16 @test_i16_split(i32, i32, [3 x i16] %arg) {
+; CHECK-LABEL: test_i16_split:
+; CHECK: ldrh [[VAL2:r[0-9]+]], [sp]
+; CHECK: add r0, r2, [[VAL2]]
+  %val0 = extractvalue [3 x i16] %arg, 0
+  %val2 = extractvalue [3 x i16] %arg, 2
+  %sum = add i16 %val0, %val2
+  ret i16 %sum
+}
+
+; Beware: on the stack each i16 still gets a 32-bit slot, the array is not
+; packed.
+define i16 @test_i16_forced_stack([8 x double], double, i32, i32, [3 x i16] %arg) {
+; CHECK-LABEL: test_i16_forced_stack:
+; CHECK-DAG: ldrh [[VAL0:r[0-9]+]], [sp, #8]
+; CHECK-DAG: ldrh [[VAL2:r[0-9]+]], [sp, #16]
+; CHECK: add r0, [[VAL0]], [[VAL2]]
+  %val0 = extractvalue [3 x i16] %arg, 0
+  %val2 = extractvalue [3 x i16] %arg, 2
+  %sum = add i16 %val0, %val2
+  ret i16 %sum
+}
diff --git a/test/CodeGen/ARM/alloc-no-stack-realign.ll b/test/CodeGen/ARM/alloc-no-stack-realign.ll
index 6e6311d..5ad8719 100644
--- a/test/CodeGen/ARM/alloc-no-stack-realign.ll
+++ b/test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -8,21 +8,28 @@
 
 define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" {
 entry:
-; NO-REALIGN: test1
-; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #48
-; NO-REALIGN: vst1.64
-; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #32
-; NO-REALIGN: vst1.64
-; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #16
-; NO-REALIGN: vst1.64
-; NO-REALIGN: vst1.64
-; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #48
-; NO-REALIGN: vst1.64
-; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #32
-; NO-REALIGN: vst1.64
-; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #16
-; NO-REALIGN: vst1.64
-; NO-REALIGN: vst1.64
+; NO-REALIGN-LABEL: test1
+; NO-REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
+; NO-REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
+; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
+; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
+; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]!
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0:0]], #48
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0]], #32
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
  %retval = alloca <16 x float>, align 16
  %0 = load <16 x float>* @T3_retval, align 16
  store <16 x float> %0, <16 x float>* %retval
@@ -33,22 +40,31 @@ entry:
 
 define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp {
 entry:
-; REALIGN: test2
-; REALIGN: bic sp, sp, #63
-; REALIGN: orr [[R2:r[0-9]+]], [[R1:r[0-9]+]], #48
-; REALIGN: vst1.64
-; REALIGN: orr [[R2:r[0-9]+]], [[R1:r[0-9]+]], #32
-; REALIGN: vst1.64
-; REALIGN: orr [[R2:r[0-9]+]], [[R1:r[0-9]+]], #16
-; REALIGN: vst1.64
-; REALIGN: vst1.64
-; REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #48
-; REALIGN: vst1.64
-; REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #32
-; REALIGN: vst1.64
-; REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #16
-; REALIGN: vst1.64
-; REALIGN: vst1.64
+; REALIGN-LABEL: test2
+; REALIGN: bfc sp, #0, #6
+; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
+; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
+; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
+; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
+; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+
+
+; REALIGN: orr r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #32
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #16
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+
+; REALIGN: add r[[R1:[0-9]+]], r[[R0:0]], #48
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #32
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
  %retval = alloca <16 x float>, align 16
  %0 = load <16 x float>* @T3_retval, align 16
  store <16 x float> %0, <16 x float>* %retval
diff --git a/test/CodeGen/ARM/arm-abi-attr.ll b/test/CodeGen/ARM/arm-abi-attr.ll
index f3923ae..61cb6ce 100644
--- a/test/CodeGen/ARM/arm-abi-attr.ll
+++ b/test/CodeGen/ARM/arm-abi-attr.ll
@@ -1,13 +1,13 @@
-; RUN: llc -mtriple=arm-linux < %s | FileCheck %s --check-prefix=APCS
-; RUN: llc -mtriple=arm-linux -mattr=apcs < %s | \
+; RUN: llc -mtriple=arm-linux-gnu < %s | FileCheck %s --check-prefix=APCS
+; RUN: llc -mtriple=arm-linux-gnu -target-abi=apcs < %s | \
 ; RUN: FileCheck %s --check-prefix=APCS
-; RUN: llc -mtriple=arm-linux-gnueabi -mattr=apcs < %s | \
+; RUN: llc -mtriple=arm-linux-gnueabi -target-abi=apcs < %s | \
 ; RUN: FileCheck %s --check-prefix=APCS
 
 ; RUN: llc -mtriple=arm-linux-gnueabi < %s | FileCheck %s --check-prefix=AAPCS
-; RUN: llc -mtriple=arm-linux-gnueabi -mattr=aapcs < %s | \
+; RUN: llc -mtriple=arm-linux-gnueabi -target-abi=aapcs < %s | \
 ; RUN: FileCheck %s --check-prefix=AAPCS
-; RUN: llc -mtriple=arm-linux-gnu -mattr=aapcs < %s | \
+; RUN: llc -mtriple=arm-linux-gnu -target-abi=aapcs < %s | \
 ; RUN: FileCheck %s --check-prefix=AAPCS
 
 ; The stack is 8 byte aligned on AAPCS and 4 on APCS, so we should get a BIC
diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll
index 462c185..0c0769f 100644
--- a/test/CodeGen/ARM/atomic-64bit.ll
+++ b/test/CodeGen/ARM/atomic-64bit.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
 ; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-LE
-; RUN: llc < %s -mtriple=armebv7 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
+; RUN: llc < %s -mtriple=armebv7 -target-abi apcs | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
 ; RUN: llc < %s -mtriple=thumbebv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-BE
 
 define i64 @test1(i64* %ptr, i64 %val) {
diff --git a/test/CodeGen/ARM/atomic-ops-v8.ll b/test/CodeGen/ARM/atomic-ops-v8.ll
index 7072aaa..6ba1352 100644
--- a/test/CodeGen/ARM/atomic-ops-v8.ll
+++ b/test/CodeGen/ARM/atomic-ops-v8.ll
@@ -1296,7 +1296,7 @@ define void @test_atomic_store_monotonic_regoff_i8(i64 %base, i64 %off, i8 %val)
   %addr = inttoptr i64 %addr_int to i8*
 
   store atomic i8 %val, i8* %addr monotonic, align 1
-; CHECK-LE: ldrb{{(\.w)?}} [[VAL:r[0-9]+]], [sp]
+; CHECK-LE: ldr{{b?(\.w)?}} [[VAL:r[0-9]+]], [sp]
 ; CHECK-LE: strb [[VAL]], [r0, r2]
 ; CHECK-BE: ldrb{{(\.w)?}} [[VAL:r[0-9]+]], [sp, #3]
 ; CHECK-BE: strb [[VAL]], [r1, r3]
diff --git a/test/CodeGen/ARM/big-endian-neon-extend.ll b/test/CodeGen/ARM/big-endian-neon-extend.ll
index 931c6c3..1498356 100644
--- a/test/CodeGen/ARM/big-endian-neon-extend.ll
+++ b/test/CodeGen/ARM/big-endian-neon-extend.ll
@@ -2,10 +2,18 @@
 
 define void @vector_ext_2i8_to_2i64( <2 x i8>* %loadaddr, <2 x i64>* %storeaddr ) {
 ; CHECK-LABEL: vector_ext_2i8_to_2i64:
-; CHECK:       vld1.16 {[[REG:d[0-9]+]]
-; CHECK:       vmov.i64 {{q[0-9]+}}, #0xff
-; CHECK:       vrev16.8  [[REG]], [[REG]]
-; CHECK:       vmovl.u8  {{q[0-9]+}}, [[REG]]
+; CHECK:      vld1.16   {[[REG:d[0-9]+]][0]}, [r0:16]
+; CHECK-NEXT: vmov.i64  [[MASK:q[0-9]+]], #0xff
+; CHECK-NEXT: vrev64.32 [[MASK]], [[MASK]]
+; CHECK-NEXT: vrev16.8  [[REG]], [[REG]]
+; CHECK-NEXT: vmovl.u8  [[QREG:q[0-9]+]], [[REG]]
+; CHECK-NEXT: vmovl.u16 [[QREG]], [[REG]]
+; CHECK-NEXT: vmovl.u32 [[QREG]], [[REG]]
+; CHECK-NEXT: vrev64.32 [[QREG]], [[QREG]]
+; CHECK-NEXT: vand      [[QREG]], [[QREG]], [[MASK]]
+; CHECK-NEXT: vrev64.32 [[QREG]], [[QREG]]
+; CHECK-NEXT: vst1.64   {[[REG]], {{d[0-9]+}}}, [r1]
+; CHECK-NEXT: bx        lr
   %1 = load <2 x i8>* %loadaddr
   %2 = zext <2 x i8> %1 to <2 x i64>
   store <2 x i64> %2, <2 x i64>* %storeaddr
@@ -14,10 +22,17 @@ define void @vector_ext_2i8_to_2i64( <2 x i8>* %loadaddr, <2 x i64>* %storeaddr
 
 define void @vector_ext_2i16_to_2i64( <2 x i16>* %loadaddr, <2 x i64>* %storeaddr ) {
 ; CHECK-LABEL: vector_ext_2i16_to_2i64:
-; CHECK:       vld1.32 {[[REG:d[0-9]+]]
-; CHECK:       vmov.i64 {{q[0-9]+}}, #0xffff
-; CHECK:       vrev32.16  [[REG]], [[REG]]
-; CHECK:       vmovl.u16  {{q[0-9]+}}, [[REG]]
+; CHECK:      vld1.32   {[[REG:d[0-9]+]][0]}, [r0:32]
+; CHECK-NEXT: vmov.i64  [[MASK:q[0-9]+]], #0xffff
+; CHECK-NEXT: vrev64.32 [[MASK]], [[MASK]]
+; CHECK-NEXT: vrev32.16 [[REG]], [[REG]]
+; CHECK-NEXT: vmovl.u16 [[QREG:q[0-9]+]], [[REG]]
+; CHECK-NEXT: vmovl.u32 [[QREG]], [[REG]]
+; CHECK-NEXT: vrev64.32 [[QREG]], [[QREG]]
+; CHECK-NEXT: vand      [[QREG]], [[QREG]], [[MASK]]
+; CHECK-NEXT: vrev64.32 [[QREG]], [[QREG]]
+; CHECK-NEXT: vst1.64   {[[REG]], {{d[0-9]+}}}, [r1]
+; CHECK-NEXT: bx        lr
   %1 = load <2 x i16>* %loadaddr
   %2 = zext <2 x i16> %1 to <2 x i64>
   store <2 x i64> %2, <2 x i64>* %storeaddr
@@ -27,8 +42,13 @@ define void @vector_ext_2i16_to_2i64( <2 x i16>* %loadaddr, <2 x i64>* %storeadd
 
 define void @vector_ext_2i8_to_2i32( <2 x i8>* %loadaddr, <2 x i32>* %storeaddr ) {
 ; CHECK-LABEL: vector_ext_2i8_to_2i32:
-; CHECK:       vld1.16 {[[REG:d[0-9]+]]
-; CHECK:       vrev16.8  [[REG]], [[REG]]
+; CHECK:      vld1.16   {[[REG:d[0-9]+]][0]}, [r0:16]
+; CHECK-NEXT: vrev16.8  [[REG]], [[REG]]
+; CHECK-NEXT: vmovl.u8  [[QREG:q[0-9]+]], [[REG]]
+; CHECK-NEXT: vmovl.u16 [[QREG]], [[REG]]
+; CHECK-NEXT: vrev64.32 [[REG]], [[REG]]
+; CHECK-NEXT: vstr      [[REG]], [r1]
+; CHECK-NEXT: bx        lr
   %1 = load <2 x i8>* %loadaddr
   %2 = zext <2 x i8> %1 to <2 x i32>
   store <2 x i32> %2, <2 x i32>* %storeaddr
@@ -37,9 +57,12 @@ define void @vector_ext_2i8_to_2i32( <2 x i8>* %loadaddr, <2 x i32>* %storeaddr
 
 define void @vector_ext_2i16_to_2i32( <2 x i16>* %loadaddr, <2 x i32>* %storeaddr ) {
 ; CHECK-LABEL: vector_ext_2i16_to_2i32:
-; CHECK:       vld1.32 {[[REG:d[0-9]+]]
-; CHECK:       vrev32.16  [[REG]], [[REG]]
-; CHECK:       vmovl.u16  {{q[0-9]+}}, [[REG]]
+; CHECK:      vld1.32   {[[REG:d[0-9]+]][0]}, [r0:32]
+; CHECK-NEXT: vrev32.16 [[REG]], [[REG]]
+; CHECK-NEXT: vmovl.u16 [[QREG:q[0-9]+]], [[REG]]
+; CHECK-NEXT: vrev64.32 [[REG]], [[REG]]
+; CHECK-NEXT: vstr      [[REG]], [r1]
+; CHECK-NEXT: bx        lr
   %1 = load <2 x i16>* %loadaddr
   %2 = zext <2 x i16> %1 to <2 x i32>
   store <2 x i32> %2, <2 x i32>* %storeaddr
@@ -48,9 +71,15 @@ define void @vector_ext_2i16_to_2i32( <2 x i16>* %loadaddr, <2 x i32>* %storeadd
 
 define void @vector_ext_2i8_to_2i16( <2 x i8>* %loadaddr, <2 x i16>* %storeaddr ) {
 ; CHECK-LABEL: vector_ext_2i8_to_2i16:
-; CHECK:       vld1.16 {[[REG:d[0-9]+]]
-; CHECK:       vrev16.8  [[REG]], [[REG]]
-; CHECK:       vmovl.u8  {{q[0-9]+}}, [[REG]]
+; CHECK:      vld1.16   {[[REG:d[0-9]+]][0]}, [r0:16]
+; CHECK-NEXT: vrev16.8  [[REG]], [[REG]]
+; CHECK-NEXT: vmovl.u8  [[QREG:q[0-9]+]], [[REG]]
+; CHECK-NEXT: vmovl.u16 [[QREG]], [[REG]]
+; CHECK-NEXT: vrev32.16 [[REG]], [[REG]]
+; CHECK-NEXT: vuzp.16   [[REG]], {{d[0-9]+}}
+; CHECK-NEXT: vrev32.16 [[REG]], {{d[0-9]+}}
+; CHECK-NEXT: vst1.32   {[[REG]][0]}, [r1:32]
+; CHECK-NEXT: bx        lr
   %1 = load <2 x i8>* %loadaddr
   %2 = zext <2 x i8> %1 to <2 x i16>
   store <2 x i16> %2, <2 x i16>* %storeaddr
@@ -59,9 +88,13 @@ define void @vector_ext_2i8_to_2i16( <2 x i8>* %loadaddr, <2 x i16>* %storeaddr
 
 define void @vector_ext_4i8_to_4i32( <4 x i8>* %loadaddr, <4 x i32>* %storeaddr ) {
 ; CHECK-LABEL: vector_ext_4i8_to_4i32:
-; CHECK:       vld1.32 {[[REG:d[0-9]+]]
-; CHECK:       vrev32.8  [[REG]], [[REG]]
-; CHECK:       vmovl.u8  {{q[0-9]+}}, [[REG]]
+; CHECK:      vld1.32   {[[REG:d[0-9]+]][0]}, [r0:32]
+; CHECK-NEXT: vrev32.8  [[REG]], [[REG]]
+; CHECK-NEXT: vmovl.u8  [[QREG:q[0-9]+]], [[REG]]
+; CHECK-NEXT: vmovl.u16 [[QREG]], [[REG]]
+; CHECK-NEXT: vrev64.32 [[QREG]], [[QREG]]
+; CHECK-NEXT: vst1.64   {[[REG]], {{d[0-9]+}}}, [r1]
+; CHECK-NEXT: bx        lr
   %1 = load <4 x i8>* %loadaddr
   %2 = zext <4 x i8> %1 to <4 x i32>
   store <4 x i32> %2, <4 x i32>* %storeaddr
@@ -70,12 +103,14 @@ define void @vector_ext_4i8_to_4i32( <4 x i8>* %loadaddr, <4 x i32>* %storeaddr
 
 define void @vector_ext_4i8_to_4i16( <4 x i8>* %loadaddr, <4 x i16>* %storeaddr ) {
 ; CHECK-LABEL: vector_ext_4i8_to_4i16:
-; CHECK:       vld1.32 {[[REG:d[0-9]+]]
-; CHECK:       vrev32.8  [[REG]], [[REG]]
-; CHECK:       vmovl.u8  {{q[0-9]+}}, [[REG]]
+; CHECK:      vld1.32   {[[REG:d[0-9]+]][0]}, [r0:32]
+; CHECK-NEXT: vrev32.8  [[REG]], [[REG]]
+; CHECK-NEXT: vmovl.u8  [[QREG:q[0-9]+]], [[REG]]
+; CHECK-NEXT: vrev64.16 [[REG]], [[REG]]
+; CHECK-NEXT: vstr      [[REG]], [r1]
+; CHECK-NEXT: bx        lr
   %1 = load <4 x i8>* %loadaddr
   %2 = zext <4 x i8> %1 to <4 x i16>
   store <4 x i16> %2, <4 x i16>* %storeaddr
   ret void
 }
-
diff --git a/test/CodeGen/ARM/build-attributes-encoding.s b/test/CodeGen/ARM/build-attributes-encoding.s
index 34a1ad3..29f13f0 100644
--- a/test/CodeGen/ARM/build-attributes-encoding.s
+++ b/test/CodeGen/ARM/build-attributes-encoding.s
@@ -78,7 +78,7 @@
 // CHECK-NEXT:     EntrySize: 0
 // CHECK-NEXT:     SectionData (
 // CHECK-NEXT:       0000: 41460000 00616561 62690001 3C000000
-// CHECK-NEXT:       0010: 05434F52 5445582D 41380006 0A074108
+// CHECK-NEXT:       0010: 05636F72 7465782D 61380006 0A074108
 // CHECK-NEXT:       0020: 0109020A 030C0214 01150117 01180119
 // CHECK-NEXT:       0030: 011B001C 0124012A 012C0244 036EA001
 // CHECK-NEXT:       0040: 81013100 FA0101
diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll
index 99c2445..37c6a447 100644
--- a/test/CodeGen/ARM/build-attributes.ll
+++ b/test/CodeGen/ARM/build-attributes.ll
@@ -3,39 +3,106 @@
 
 ; RUN: llc < %s -mtriple=thumbv5-linux-gnueabi -mcpu=xscale | FileCheck %s --check-prefix=XSCALE
 ; RUN: llc < %s -mtriple=armv6-linux-gnueabi | FileCheck %s --check-prefix=V6
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6-FAST
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi | FileCheck %s --check-prefix=V6M
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6M-FAST
+; RUN: llc < %s -mtriple=thumbv6sm-linux-gnueabi | FileCheck %s --check-prefix=V6M
+; RUN: llc < %s -mtriple=thumbv6sm-linux-gnueabi  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6M-FAST
 ; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s | FileCheck %s --check-prefix=ARM1156T2F-S
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast  | FileCheck %s --check-prefix=ARM1156T2F-S-FAST
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi | FileCheck %s --check-prefix=V7M
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V7M-FAST
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=V7
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V7-FAST
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi | FileCheck %s --check-prefix=V8
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V8-FAST
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi | FileCheck %s --check-prefix=Vt8
+; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-neon,-crypto | FileCheck %s --check-prefix=V8-FPARMv8
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-fp-armv8,-crypto | FileCheck %s --check-prefix=V8-NEON
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-crypto | FileCheck %s --check-prefix=V8-FPARMv8-NEON
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi | FileCheck %s --check-prefix=V8-FPARMv8-NEON-CRYPTO
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 | FileCheck %s --check-prefix=CORTEX-A5-DEFAULT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A5-DEFAULT-FAST
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-neon,+d16 | FileCheck %s --check-prefix=CORTEX-A5-NONEON
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-A5-NOFPU
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-vfp2  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A5-NOFPU-FAST
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A9-SOFT-FAST
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A9-HARD
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A9-HARD-FAST
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 | FileCheck %s --check-prefix=CORTEX-A12-DEFAULT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A12-DEFAULT-FAST
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-A12-NOFPU
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -mattr=-vfp2  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A12-NOFPU-FAST
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 | FileCheck %s --check-prefix=CORTEX-A15
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A15-FAST
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 | FileCheck %s --check-prefix=CORTEX-A17-DEFAULT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A17-FAST
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-A17-NOFPU
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -mattr=-vfp2  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A17-NOFPU-FAST
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=CORTEX-M0
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M0-FAST
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus | FileCheck %s --check-prefix=CORTEX-M0PLUS
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M0PLUS-FAST
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1 | FileCheck %s --check-prefix=CORTEX-M1
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M1-FAST
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000 | FileCheck %s --check-prefix=SC000
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=SC000-FAST
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 | FileCheck %s --check-prefix=CORTEX-M3
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M3-FAST
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300 | FileCheck %s --check-prefix=SC300
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=SC300-FAST
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-M4-SOFT
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M4-SOFT-FAST
 ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-M4-HARD
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M4-HARD-FAST
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SOFT
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-vfp2  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M7-NOFPU-FAST
 ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=+fp-only-sp | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SINGLE
-; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-DOUBLE
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=+fp-only-sp  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M7-FAST
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 | FileCheck %s --check-prefix=CORTEX-M7-DOUBLE
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R5-FAST
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 | FileCheck %s --check-prefix=CORTEX-R7
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R7-FAST
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A53-FAST
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=CORTEX-A57
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A57-FAST
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=CORTEX-A72
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A72-FAST
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s  --check-prefix=CORTEX-A7-CHECK
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s  --check-prefix=CORTEX-A7-CHECK-FAST
 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2,-vfp3,-vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-NOFPU
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2,-vfp3,-vfp4,-neon  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A7-NOFPU-FAST
 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-neon  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A7-FPUV4-FAST
 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,,+d16,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4
 ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=pic | FileCheck %s --check-prefix=RELOC-PIC
 ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=static | FileCheck %s --check-prefix=RELOC-OTHER
@@ -49,6 +116,9 @@
 ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
 ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
 ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a72 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a72 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
 ; ARMv7a
 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
@@ -82,75 +152,185 @@
 ; XSCALE:      .eabi_attribute 8, 1
 ; XSCALE:      .eabi_attribute 9, 1
 
+; DYN-ROUNDING: .eabi_attribute 19, 1
+
 ; V6:   .eabi_attribute 6, 6
 ; V6:   .eabi_attribute 8, 1
+;; We assume round-to-nearest by default (matches GCC)
+; V6-NOT:   .eabi_attribute 19
+;; The default choice made by llc is for a V6 CPU without an FPU.
+;; This is not an interesting detail, but for such CPUs, the default intention is to use
+;; software floating-point support. The choice is not important for targets without
+;; FPU support!
+; V6:   .eabi_attribute 20, 1
+; V6:   .eabi_attribute 21, 1
+; V6-NOT:   .eabi_attribute 22
+; V6:   .eabi_attribute 23, 3
 ; V6:   .eabi_attribute 24, 1
 ; V6:   .eabi_attribute 25, 1
 ; V6-NOT:   .eabi_attribute 27
 ; V6-NOT:   .eabi_attribute 28
 ; V6-NOT:    .eabi_attribute 36
+; V6:    .eabi_attribute 38, 1
 ; V6-NOT:    .eabi_attribute 42
+; V6-NOT:  .eabi_attribute 44
 ; V6-NOT:    .eabi_attribute 68
 
+; V6-FAST-NOT:   .eabi_attribute 19
+;; Despite the V6 CPU having no FPU by default, we chose to flush to
+;; positive zero here. There's no hardware support doing this, but the
+;; fast maths software library might.
+; V6-FAST-NOT:   .eabi_attribute 20
+; V6-FAST-NOT:   .eabi_attribute 21
+; V6-FAST-NOT:   .eabi_attribute 22
+; V6-FAST:   .eabi_attribute 23, 1
+
+;; We emit 6, 12 for both v6-M and v6S-M, technically this is incorrect for
+;; V6-M, however we don't model the OS extension so this is fine.
 ; V6M:  .eabi_attribute 6, 12
 ; V6M-NOT:  .eabi_attribute 7
 ; V6M:  .eabi_attribute 8, 0
 ; V6M:  .eabi_attribute 9, 1
+; V6M-NOT:   .eabi_attribute 19
+;; The default choice made by llc is for a V6M CPU without an FPU.
+;; This is not an interesting detail, but for such CPUs, the default intention is to use
+;; software floating-point support. The choice is not important for targets without
+;; FPU support!
+; V6M:  .eabi_attribute 20, 1
+; V6M:   .eabi_attribute 21, 1
+; V6M-NOT:   .eabi_attribute 22
+; V6M:   .eabi_attribute 23, 3
 ; V6M:  .eabi_attribute 24, 1
 ; V6M:  .eabi_attribute 25, 1
 ; V6M-NOT:  .eabi_attribute 27
 ; V6M-NOT:  .eabi_attribute 28
 ; V6M-NOT:  .eabi_attribute 36
+; V6M:  .eabi_attribute 38, 1
 ; V6M-NOT:  .eabi_attribute 42
+; V6M-NOT:  .eabi_attribute 44
 ; V6M-NOT:  .eabi_attribute 68
 
+; V6M-FAST-NOT:   .eabi_attribute 19
+;; Despite the V6M CPU having no FPU by default, we chose to flush to
+;; positive zero here. There's no hardware support doing this, but the
+;; fast maths software library might.
+; V6M-FAST-NOT:  .eabi_attribute 20
+; V6M-FAST-NOT:   .eabi_attribute 21
+; V6M-FAST-NOT:   .eabi_attribute 22
+; V6M-FAST:   .eabi_attribute 23, 1
+
 ; ARM1156T2F-S: .cpu arm1156t2f-s
 ; ARM1156T2F-S: .eabi_attribute 6, 8
 ; ARM1156T2F-S: .eabi_attribute 8, 1
 ; ARM1156T2F-S: .eabi_attribute 9, 2
 ; ARM1156T2F-S: .fpu vfpv2
+; ARM1156T2F-S-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; ARM1156T2F-S: .eabi_attribute 20, 1
 ; ARM1156T2F-S: .eabi_attribute 21, 1
+; ARM1156T2F-S-NOT: .eabi_attribute 22
 ; ARM1156T2F-S: .eabi_attribute 23, 3
 ; ARM1156T2F-S: .eabi_attribute 24, 1
 ; ARM1156T2F-S: .eabi_attribute 25, 1
 ; ARM1156T2F-S-NOT: .eabi_attribute 27
 ; ARM1156T2F-S-NOT: .eabi_attribute 28
 ; ARM1156T2F-S-NOT: .eabi_attribute 36
+; ARM1156T2F-S: .eabi_attribute 38, 1
 ; ARM1156T2F-S-NOT:    .eabi_attribute 42
+; ARM1156T2F-S-NOT:    .eabi_attribute 44
 ; ARM1156T2F-S-NOT:    .eabi_attribute 68
 
+; ARM1156T2F-S-FAST-NOT:   .eabi_attribute 19
+;; V6 cores default to flush to positive zero (value 0). Note that value 2 is also equally
+;; valid for this core, it's an implementation defined question as to which of 0 and 2 you
+;; select. LLVM historically picks 0.
+; ARM1156T2F-S-FAST-NOT: .eabi_attribute 20
+; ARM1156T2F-S-FAST-NOT:   .eabi_attribute 21
+; ARM1156T2F-S-FAST-NOT:   .eabi_attribute 22
+; ARM1156T2F-S-FAST:   .eabi_attribute 23, 1
+
 ; V7M:  .eabi_attribute 6, 10
 ; V7M:  .eabi_attribute 7, 77
 ; V7M:  .eabi_attribute 8, 0
 ; V7M:  .eabi_attribute 9, 2
+; V7M-NOT:   .eabi_attribute 19
+;; The default choice made by llc is for a V7M CPU without an FPU.
+;; This is not an interesting detail, but for such CPUs, the default intention is to use
+;; software floating-point support. The choice is not important for targets without
+;; FPU support!
+; V7M:  .eabi_attribute 20, 1
+; V7M: .eabi_attribute 21, 1
+; V7M-NOT: .eabi_attribute 22
+; V7M: .eabi_attribute 23, 3
 ; V7M:  .eabi_attribute 24, 1
 ; V7M:  .eabi_attribute 25, 1
 ; V7M-NOT:  .eabi_attribute 27
 ; V7M-NOT:  .eabi_attribute 28
 ; V7M-NOT:  .eabi_attribute 36
+; V7M:  .eabi_attribute 38, 1
 ; V7M-NOT:  .eabi_attribute 42
 ; V7M-NOT:  .eabi_attribute 44
 ; V7M-NOT:  .eabi_attribute 68
 
+; V7M-FAST-NOT:   .eabi_attribute 19
+;; Despite the V7M CPU having no FPU by default, we chose to flush
+;; preserving sign. This matches what the hardware would do in the
+;; architecture revision were to exist on the current target.
+; V7M-FAST:  .eabi_attribute 20, 2
+; V7M-FAST-NOT:   .eabi_attribute 21
+; V7M-FAST-NOT:   .eabi_attribute 22
+; V7M-FAST:   .eabi_attribute 23, 1
+
 ; V7:      .syntax unified
 ; V7: .eabi_attribute 6, 10
+; V7-NOT:   .eabi_attribute 19
+;; In safe-maths mode we default to an IEEE 754 compliant choice.
 ; V7: .eabi_attribute 20, 1
 ; V7: .eabi_attribute 21, 1
+; V7-NOT: .eabi_attribute 22
 ; V7: .eabi_attribute 23, 3
 ; V7: .eabi_attribute 24, 1
 ; V7: .eabi_attribute 25, 1
 ; V7-NOT: .eabi_attribute 27
 ; V7-NOT: .eabi_attribute 28
 ; V7-NOT: .eabi_attribute 36
+; V7: .eabi_attribute 38, 1
 ; V7-NOT:    .eabi_attribute 42
+; V7-NOT:    .eabi_attribute 44
 ; V7-NOT:    .eabi_attribute 68
 
+; V7-FAST-NOT:   .eabi_attribute 19
+;; The default CPU does have an FPU and it must be VFPv3 or better, so it flushes
+;; denormals to zero preserving the sign.
+; V7-FAST: .eabi_attribute 20, 2
+; V7-FAST-NOT:   .eabi_attribute 21
+; V7-FAST-NOT:   .eabi_attribute 22
+; V7-FAST:   .eabi_attribute 23, 1
+
 ; V8:      .syntax unified
+; V8: .eabi_attribute 67, "2.09"
 ; V8: .eabi_attribute 6, 14
+; V8-NOT:   .eabi_attribute 19
+; V8: .eabi_attribute 20, 1
+; V8: .eabi_attribute 21, 1
+; V8-NOT: .eabi_attribute 22
+; V8: .eabi_attribute 23, 3
+; V8-NOT: .eabi_attribute 44
+
+; V8-FAST-NOT:   .eabi_attribute 19
+;; The default does have an FPU, and for V8-A, it flushes preserving sign.
+; V8-FAST: .eabi_attribute 20, 2
+; V8-FAST-NOT: .eabi_attribute 21
+; V8-FAST-NOT: .eabi_attribute 22
+; V8-FAST: .eabi_attribute 23, 1
 
 ; Vt8:     .syntax unified
 ; Vt8: .eabi_attribute 6, 14
+; Vt8-NOT:   .eabi_attribute 19
+; Vt8: .eabi_attribute 20, 1
+; Vt8: .eabi_attribute 21, 1
+; Vt8-NOT: .eabi_attribute 22
+; Vt8: .eabi_attribute 23, 3
 
 ; V8-FPARMv8:      .syntax unified
 ; V8-FPARMv8: .eabi_attribute 6, 14
@@ -175,74 +355,95 @@
 ; NO-STRICT-ALIGN: .eabi_attribute 34, 1
 ; STRICT-ALIGN: .eabi_attribute 34, 0
 
-; Tag_CPU_arch	'ARMv7'
-; CORTEX-A7-CHECK: .eabi_attribute	6, 10
-; CORTEX-A7-NOFPU: .eabi_attribute	6, 10
-; CORTEX-A7-FPUV4: .eabi_attribute	6, 10
+; Tag_CPU_arch  'ARMv7'
+; CORTEX-A7-CHECK: .eabi_attribute      6, 10
+; CORTEX-A7-NOFPU: .eabi_attribute      6, 10
+
+; CORTEX-A7-FPUV4: .eabi_attribute      6, 10
 
 ; Tag_CPU_arch_profile 'A'
-; CORTEX-A7-CHECK: .eabi_attribute	7, 65
-; CORTEX-A7-NOFPU: .eabi_attribute	7, 65
-; CORTEX-A7-FPUV4: .eabi_attribute	7, 65
+; CORTEX-A7-CHECK: .eabi_attribute      7, 65
+; CORTEX-A7-NOFPU: .eabi_attribute      7, 65
+; CORTEX-A7-FPUV4: .eabi_attribute      7, 65
 
 ; Tag_ARM_ISA_use
-; CORTEX-A7-CHECK: .eabi_attribute	8, 1
-; CORTEX-A7-NOFPU: .eabi_attribute	8, 1
-; CORTEX-A7-FPUV4: .eabi_attribute	8, 1
+; CORTEX-A7-CHECK: .eabi_attribute      8, 1
+; CORTEX-A7-NOFPU: .eabi_attribute      8, 1
+; CORTEX-A7-FPUV4: .eabi_attribute      8, 1
 
 ; Tag_THUMB_ISA_use
-; CORTEX-A7-CHECK: .eabi_attribute	9, 2
-; CORTEX-A7-NOFPU: .eabi_attribute	9, 2
-; CORTEX-A7-FPUV4: .eabi_attribute	9, 2
+; CORTEX-A7-CHECK: .eabi_attribute      9, 2
+; CORTEX-A7-NOFPU: .eabi_attribute      9, 2
+; CORTEX-A7-FPUV4: .eabi_attribute      9, 2
 
-; CORTEX-A7-CHECK: .fpu	neon-vfpv4
+; CORTEX-A7-CHECK: .fpu neon-vfpv4
 ; CORTEX-A7-NOFPU-NOT: .fpu
-; CORTEX-A7-FPUV4: .fpu	vfpv4
+; CORTEX-A7-FPUV4: .fpu vfpv4
 
+; CORTEX-A7-CHECK-NOT:   .eabi_attribute 19
 ; Tag_ABI_FP_denormal
-; CORTEX-A7-CHECK: .eabi_attribute	20, 1
-; CORTEX-A7-NOFPU: .eabi_attribute	20, 1
-; CORTEX-A7-FPUV4: .eabi_attribute	20, 1
+;; We default to IEEE 754 compliance
+; CORTEX-A7-CHECK: .eabi_attribute      20, 1
+;; The A7 has VFPv3 support by default, so flush preserving sign.
+; CORTEX-A7-CHECK-FAST: .eabi_attribute 20, 2
+; CORTEX-A7-NOFPU: .eabi_attribute      20, 1
+;; Despite there being no FPU, we chose to flush to zero preserving
+;; sign. This matches what the hardware would do for this architecture
+;; revision.
+; CORTEX-A7-NOFPU-FAST: .eabi_attribute 20, 2
+; CORTEX-A7-FPUV4: .eabi_attribute      20, 1
+;; The VFPv4 FPU flushes preserving sign.
+; CORTEX-A7-FPUV4-FAST: .eabi_attribute 20, 2
 
 ; Tag_ABI_FP_exceptions
-; CORTEX-A7-CHECK: .eabi_attribute	21, 1
-; CORTEX-A7-NOFPU: .eabi_attribute	21, 1
-; CORTEX-A7-FPUV4: .eabi_attribute	21, 1
+; CORTEX-A7-CHECK: .eabi_attribute      21, 1
+; CORTEX-A7-NOFPU: .eabi_attribute      21, 1
+; CORTEX-A7-FPUV4: .eabi_attribute      21, 1
+
+; Tag_ABI_FP_user_exceptions
+; CORTEX-A7-CHECK-NOT: .eabi_attribute      22
+; CORTEX-A7-NOFPU-NOT: .eabi_attribute      22
+; CORTEX-A7-FPUV4-NOT: .eabi_attribute      22
 
 ; Tag_ABI_FP_number_model
-; CORTEX-A7-CHECK: .eabi_attribute	23, 3
-; CORTEX-A7-NOFPU: .eabi_attribute	23, 3
-; CORTEX-A7-FPUV4: .eabi_attribute	23, 3
+; CORTEX-A7-CHECK: .eabi_attribute      23, 3
+; CORTEX-A7-NOFPU: .eabi_attribute      23, 3
+; CORTEX-A7-FPUV4: .eabi_attribute      23, 3
 
 ; Tag_ABI_align_needed
-; CORTEX-A7-CHECK: .eabi_attribute	24, 1
-; CORTEX-A7-NOFPU: .eabi_attribute	24, 1
-; CORTEX-A7-FPUV4: .eabi_attribute	24, 1
+; CORTEX-A7-CHECK: .eabi_attribute      24, 1
+; CORTEX-A7-NOFPU: .eabi_attribute      24, 1
+; CORTEX-A7-FPUV4: .eabi_attribute      24, 1
 
 ; Tag_ABI_align_preserved
-; CORTEX-A7-CHECK: .eabi_attribute	25, 1
-; CORTEX-A7-NOFPU: .eabi_attribute	25, 1
-; CORTEX-A7-FPUV4: .eabi_attribute	25, 1
+; CORTEX-A7-CHECK: .eabi_attribute      25, 1
+; CORTEX-A7-NOFPU: .eabi_attribute      25, 1
+; CORTEX-A7-FPUV4: .eabi_attribute      25, 1
 
 ; Tag_FP_HP_extension
-; CORTEX-A7-CHECK: .eabi_attribute	36, 1
-; CORTEX-A7-NOFPU: .eabi_attribute	36, 1
-; CORTEX-A7-FPUV4: .eabi_attribute	36, 1
+; CORTEX-A7-CHECK: .eabi_attribute      36, 1
+; CORTEX-A7-NOFPU: .eabi_attribute      36, 1
+; CORTEX-A7-FPUV4: .eabi_attribute      36, 1
+
+; Tag_FP_16bit_format
+; CORTEX-A7-CHECK: .eabi_attribute      38, 1
+; CORTEX-A7-NOFPU: .eabi_attribute      38, 1
+; CORTEX-A7-FPUV4: .eabi_attribute      38, 1
 
 ; Tag_MPextension_use
-; CORTEX-A7-CHECK: .eabi_attribute	42, 1
-; CORTEX-A7-NOFPU: .eabi_attribute	42, 1
-; CORTEX-A7-FPUV4: .eabi_attribute	42, 1
+; CORTEX-A7-CHECK: .eabi_attribute      42, 1
+; CORTEX-A7-NOFPU: .eabi_attribute      42, 1
+; CORTEX-A7-FPUV4: .eabi_attribute      42, 1
 
 ; Tag_DIV_use
-; CORTEX-A7-CHECK: .eabi_attribute	44, 2
-; CORTEX-A7-NOFPU: .eabi_attribute	44, 2
-; CORTEX-A7-FPUV4: .eabi_attribute	44, 2
+; CORTEX-A7-CHECK: .eabi_attribute      44, 2
+; CORTEX-A7-NOFPU: .eabi_attribute      44, 2
+; CORTEX-A7-FPUV4: .eabi_attribute      44, 2
 
 ; Tag_Virtualization_use
-; CORTEX-A7-CHECK: .eabi_attribute	68, 3
-; CORTEX-A7-NOFPU: .eabi_attribute	68, 3
-; CORTEX-A7-FPUV4: .eabi_attribute	68, 3
+; CORTEX-A7-CHECK: .eabi_attribute      68, 3
+; CORTEX-A7-NOFPU: .eabi_attribute      68, 3
+; CORTEX-A7-FPUV4: .eabi_attribute      68, 3
 
 ; CORTEX-A5-DEFAULT:        .cpu    cortex-a5
 ; CORTEX-A5-DEFAULT:        .eabi_attribute 6, 10
@@ -250,84 +451,146 @@
 ; CORTEX-A5-DEFAULT:        .eabi_attribute 8, 1
 ; CORTEX-A5-DEFAULT:        .eabi_attribute 9, 2
 ; CORTEX-A5-DEFAULT:        .fpu    neon-vfpv4
+; CORTEX-A5-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-A5-DEFAULT:        .eabi_attribute 20, 1
 ; CORTEX-A5-DEFAULT:        .eabi_attribute 21, 1
+; CORTEX-A5-DEFAULT-NOT:        .eabi_attribute 22
 ; CORTEX-A5-DEFAULT:        .eabi_attribute 23, 3
 ; CORTEX-A5-DEFAULT:        .eabi_attribute 24, 1
 ; CORTEX-A5-DEFAULT:        .eabi_attribute 25, 1
 ; CORTEX-A5-DEFAULT:        .eabi_attribute 42, 1
+; CORTEX-A5-DEFAULT-NOT:        .eabi_attribute 44
 ; CORTEX-A5-DEFAULT:        .eabi_attribute 68, 1
 
+; CORTEX-A5-DEFAULT-FAST-NOT:   .eabi_attribute 19
+;; The A5 defaults to a VFPv4 FPU, so it flushed preserving sign when -ffast-math
+;; is given.
+; CORTEX-A5-DEFAULT-FAST:        .eabi_attribute 20, 2
+; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 21
+; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 22
+; CORTEX-A5-DEFAULT-FAST: .eabi_attribute 23, 1
+
 ; CORTEX-A5-NONEON:        .cpu    cortex-a5
 ; CORTEX-A5-NONEON:        .eabi_attribute 6, 10
 ; CORTEX-A5-NONEON:        .eabi_attribute 7, 65
 ; CORTEX-A5-NONEON:        .eabi_attribute 8, 1
 ; CORTEX-A5-NONEON:        .eabi_attribute 9, 2
 ; CORTEX-A5-NONEON:        .fpu    vfpv4-d16
+;; We default to IEEE 754 compliance
 ; CORTEX-A5-NONEON:        .eabi_attribute 20, 1
 ; CORTEX-A5-NONEON:        .eabi_attribute 21, 1
+; CORTEX-A5-NONEON-NOT:    .eabi_attribute 22
 ; CORTEX-A5-NONEON:        .eabi_attribute 23, 3
 ; CORTEX-A5-NONEON:        .eabi_attribute 24, 1
 ; CORTEX-A5-NONEON:        .eabi_attribute 25, 1
 ; CORTEX-A5-NONEON:        .eabi_attribute 42, 1
 ; CORTEX-A5-NONEON:        .eabi_attribute 68, 1
 
+; CORTEX-A5-NONEON-FAST-NOT:   .eabi_attribute 19
+;; The A5 defaults to a VFPv4 FPU, so it flushed preserving sign when -ffast-math
+;; is given.
+; CORTEX-A5-NONEON-FAST:        .eabi_attribute 20, 2
+; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 21
+; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 22
+; CORTEX-A5-NONEON-FAST: .eabi_attribute 23, 1
+
 ; CORTEX-A5-NOFPU:        .cpu    cortex-a5
 ; CORTEX-A5-NOFPU:        .eabi_attribute 6, 10
 ; CORTEX-A5-NOFPU:        .eabi_attribute 7, 65
 ; CORTEX-A5-NOFPU:        .eabi_attribute 8, 1
 ; CORTEX-A5-NOFPU:        .eabi_attribute 9, 2
 ; CORTEX-A5-NOFPU-NOT:    .fpu
+; CORTEX-A5-NOFPU-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-A5-NOFPU:        .eabi_attribute 20, 1
 ; CORTEX-A5-NOFPU:        .eabi_attribute 21, 1
+; CORTEX-A5-NOFPU-NOT:    .eabi_attribute 22
 ; CORTEX-A5-NOFPU:        .eabi_attribute 23, 3
 ; CORTEX-A5-NOFPU:        .eabi_attribute 24, 1
 ; CORTEX-A5-NOFPU:        .eabi_attribute 25, 1
 ; CORTEX-A5-NOFPU:        .eabi_attribute 42, 1
 ; CORTEX-A5-NOFPU:        .eabi_attribute 68, 1
 
+; CORTEX-A5-NOFPU-FAST-NOT:   .eabi_attribute 19
+;; Despite there being no FPU, we chose to flush to zero preserving
+;; sign. This matches what the hardware would do for this architecture
+;; revision.
+; CORTEX-A5-NOFPU-FAST: .eabi_attribute 20, 2
+; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 21
+; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 22
+; CORTEX-A5-NOFPU-FAST: .eabi_attribute 23, 1
+
 ; CORTEX-A9-SOFT:  .cpu cortex-a9
 ; CORTEX-A9-SOFT:  .eabi_attribute 6, 10
 ; CORTEX-A9-SOFT:  .eabi_attribute 7, 65
 ; CORTEX-A9-SOFT:  .eabi_attribute 8, 1
 ; CORTEX-A9-SOFT:  .eabi_attribute 9, 2
 ; CORTEX-A9-SOFT:  .fpu neon
+; CORTEX-A9-SOFT-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-A9-SOFT:  .eabi_attribute 20, 1
 ; CORTEX-A9-SOFT:  .eabi_attribute 21, 1
+; CORTEX-A9-SOFT-NOT:  .eabi_attribute 22
 ; CORTEX-A9-SOFT:  .eabi_attribute 23, 3
 ; CORTEX-A9-SOFT:  .eabi_attribute 24, 1
 ; CORTEX-A9-SOFT:  .eabi_attribute 25, 1
 ; CORTEX-A9-SOFT-NOT:  .eabi_attribute 27
 ; CORTEX-A9-SOFT-NOT:  .eabi_attribute 28
 ; CORTEX-A9-SOFT:  .eabi_attribute 36, 1
+; CORTEX-A9-SOFT:  .eabi_attribute 38, 1
 ; CORTEX-A9-SOFT:  .eabi_attribute 42, 1
+; CORTEX-A9-SOFT-NOT:  .eabi_attribute 44
 ; CORTEX-A9-SOFT:  .eabi_attribute 68, 1
 
+; CORTEX-A9-SOFT-FAST-NOT:   .eabi_attribute 19
+;; The A9 defaults to a VFPv3 FPU, so it flushes preseving sign when
+;; -ffast-math is specified.
+; CORTEX-A9-SOFT-FAST:  .eabi_attribute 20, 2
+; CORTEX-A5-SOFT-FAST-NOT: .eabi_attribute 21
+; CORTEX-A5-SOFT-FAST-NOT: .eabi_attribute 22
+; CORTEX-A5-SOFT-FAST: .eabi_attribute 23, 1
+
 ; CORTEX-A9-HARD:  .cpu cortex-a9
 ; CORTEX-A9-HARD:  .eabi_attribute 6, 10
 ; CORTEX-A9-HARD:  .eabi_attribute 7, 65
 ; CORTEX-A9-HARD:  .eabi_attribute 8, 1
 ; CORTEX-A9-HARD:  .eabi_attribute 9, 2
 ; CORTEX-A9-HARD:  .fpu neon
+; CORTEX-A9-HARD-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-A9-HARD:  .eabi_attribute 20, 1
 ; CORTEX-A9-HARD:  .eabi_attribute 21, 1
+; CORTEX-A9-HARD-NOT:  .eabi_attribute 22
 ; CORTEX-A9-HARD:  .eabi_attribute 23, 3
 ; CORTEX-A9-HARD:  .eabi_attribute 24, 1
 ; CORTEX-A9-HARD:  .eabi_attribute 25, 1
 ; CORTEX-A9-HARD-NOT:  .eabi_attribute 27
 ; CORTEX-A9-HARD:  .eabi_attribute 28, 1
 ; CORTEX-A9-HARD:  .eabi_attribute 36, 1
+; CORTEX-A9-HARD:  .eabi_attribute 38, 1
 ; CORTEX-A9-HARD:  .eabi_attribute 42, 1
 ; CORTEX-A9-HARD:  .eabi_attribute 68, 1
 
+; CORTEX-A9-HARD-FAST-NOT:   .eabi_attribute 19
+;; The A9 defaults to a VFPv3 FPU, so it flushes preseving sign when
+;; -ffast-math is specified.
+; CORTEX-A9-HARD-FAST:  .eabi_attribute 20, 2
+; CORTEX-A9-HARD-FAST-NOT:  .eabi_attribute 21
+; CORTEX-A9-HARD-FAST-NOT:  .eabi_attribute 22
+; CORTEX-A9-HARD-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-A12-DEFAULT:  .cpu cortex-a12
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 6, 10
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 7, 65
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 8, 1
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 9, 2
 ; CORTEX-A12-DEFAULT:  .fpu neon-vfpv4
+; CORTEX-A12-DEFAULT-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 20, 1
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 21, 1
+; CORTEX-A12-DEFAULT-NOT:  .eabi_attribute 22
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 23, 3
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 24, 1
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 25, 1
@@ -335,14 +598,25 @@
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 44, 2
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 68, 3
 
+; CORTEX-A12-DEFAULT-FAST-NOT:   .eabi_attribute 19
+;; The A12 defaults to a VFPv3 FPU, so it flushes preseving sign when
+;; -ffast-math is specified.
+; CORTEX-A12-DEFAULT-FAST:  .eabi_attribute 20, 2
+; CORTEX-A12-HARD-FAST-NOT:  .eabi_attribute 21
+; CORTEX-A12-HARD-FAST-NOT:  .eabi_attribute 22
+; CORTEX-A12-HARD-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-A12-NOFPU:  .cpu cortex-a12
 ; CORTEX-A12-NOFPU:  .eabi_attribute 6, 10
 ; CORTEX-A12-NOFPU:  .eabi_attribute 7, 65
 ; CORTEX-A12-NOFPU:  .eabi_attribute 8, 1
 ; CORTEX-A12-NOFPU:  .eabi_attribute 9, 2
 ; CORTEX-A12-NOFPU-NOT:  .fpu
+; CORTEX-A12-NOFPU-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-A12-NOFPU:  .eabi_attribute 20, 1
 ; CORTEX-A12-NOFPU:  .eabi_attribute 21, 1
+; CORTEX-A12-NOFPU-NOT:  .eabi_attribute 22
 ; CORTEX-A12-NOFPU:  .eabi_attribute 23, 3
 ; CORTEX-A12-NOFPU:  .eabi_attribute 24, 1
 ; CORTEX-A12-NOFPU:  .eabi_attribute 25, 1
@@ -350,32 +624,56 @@
 ; CORTEX-A12-NOFPU:  .eabi_attribute 44, 2
 ; CORTEX-A12-NOFPU:  .eabi_attribute 68, 3
 
+; CORTEX-A12-NOFPU-FAST-NOT:   .eabi_attribute 19
+;; Despite there being no FPU, we chose to flush to zero preserving
+;; sign. This matches what the hardware would do for this architecture
+;; revision.
+; CORTEX-A12-NOFPU-FAST:  .eabi_attribute 20, 2
+; CORTEX-A12-NOFPU-FAST-NOT:  .eabi_attribute 21
+; CORTEX-A12-NOFPU-FAST-NOT:  .eabi_attribute 22
+; CORTEX-A12-NOFPU-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-A15: .cpu cortex-a15
 ; CORTEX-A15: .eabi_attribute 6, 10
 ; CORTEX-A15: .eabi_attribute 7, 65
 ; CORTEX-A15: .eabi_attribute 8, 1
 ; CORTEX-A15: .eabi_attribute 9, 2
 ; CORTEX-A15: .fpu neon-vfpv4
+; CORTEX-A15-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-A15: .eabi_attribute 20, 1
 ; CORTEX-A15: .eabi_attribute 21, 1
+; CORTEX-A15-NOT: .eabi_attribute 22
 ; CORTEX-A15: .eabi_attribute 23, 3
 ; CORTEX-A15: .eabi_attribute 24, 1
 ; CORTEX-A15: .eabi_attribute 25, 1
 ; CORTEX-A15-NOT: .eabi_attribute 27
 ; CORTEX-A15-NOT: .eabi_attribute 28
 ; CORTEX-A15: .eabi_attribute 36, 1
+; CORTEX-A15: .eabi_attribute 38, 1
 ; CORTEX-A15: .eabi_attribute 42, 1
 ; CORTEX-A15: .eabi_attribute 44, 2
 ; CORTEX-A15: .eabi_attribute 68, 3
 
+; CORTEX-A15-FAST-NOT:   .eabi_attribute 19
+;; The A15 defaults to a VFPv3 FPU, so it flushes preseving sign when
+;; -ffast-math is specified.
+; CORTEX-A15-FAST: .eabi_attribute 20, 2
+; CORTEX-A15-FAST-NOT:  .eabi_attribute 21
+; CORTEX-A15-FAST-NOT:  .eabi_attribute 22
+; CORTEX-A15-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-A17-DEFAULT:  .cpu cortex-a17
 ; CORTEX-A17-DEFAULT:  .eabi_attribute 6, 10
 ; CORTEX-A17-DEFAULT:  .eabi_attribute 7, 65
 ; CORTEX-A17-DEFAULT:  .eabi_attribute 8, 1
 ; CORTEX-A17-DEFAULT:  .eabi_attribute 9, 2
 ; CORTEX-A17-DEFAULT:  .fpu neon-vfpv4
+; CORTEX-A17-DEFAULT-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-A17-DEFAULT:  .eabi_attribute 20, 1
 ; CORTEX-A17-DEFAULT:  .eabi_attribute 21, 1
+; CORTEX-A17-DEFAULT-NOT:  .eabi_attribute 22
 ; CORTEX-A17-DEFAULT:  .eabi_attribute 23, 3
 ; CORTEX-A17-DEFAULT:  .eabi_attribute 24, 1
 ; CORTEX-A17-DEFAULT:  .eabi_attribute 25, 1
@@ -383,14 +681,25 @@
 ; CORTEX-A17-DEFAULT:  .eabi_attribute 44, 2
 ; CORTEX-A17-DEFAULT:  .eabi_attribute 68, 3
 
+; CORTEX-A17-FAST-NOT:   .eabi_attribute 19
+;; The A17 defaults to a VFPv3 FPU, so it flushes preseving sign when
+;; -ffast-math is specified.
+; CORTEX-A17-FAST:  .eabi_attribute 20, 2
+; CORTEX-A17-FAST-NOT:  .eabi_attribute 21
+; CORTEX-A17-FAST-NOT:  .eabi_attribute 22
+; CORTEX-A17-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-A17-NOFPU:  .cpu cortex-a17
 ; CORTEX-A17-NOFPU:  .eabi_attribute 6, 10
 ; CORTEX-A17-NOFPU:  .eabi_attribute 7, 65
 ; CORTEX-A17-NOFPU:  .eabi_attribute 8, 1
 ; CORTEX-A17-NOFPU:  .eabi_attribute 9, 2
 ; CORTEX-A17-NOFPU-NOT:  .fpu
+; CORTEX-A17-NOFPU-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-A17-NOFPU:  .eabi_attribute 20, 1
 ; CORTEX-A17-NOFPU:  .eabi_attribute 21, 1
+; CORTEX-A17-NOFPU-NOT:  .eabi_attribute 22
 ; CORTEX-A17-NOFPU:  .eabi_attribute 23, 3
 ; CORTEX-A17-NOFPU:  .eabi_attribute 24, 1
 ; CORTEX-A17-NOFPU:  .eabi_attribute 25, 1
@@ -398,72 +707,263 @@
 ; CORTEX-A17-NOFPU:  .eabi_attribute 44, 2
 ; CORTEX-A17-NOFPU:  .eabi_attribute 68, 3
 
+; CORTEX-A17-NOFPU-NOT:   .eabi_attribute 19
+;; Despite there being no FPU, we chose to flush to zero preserving
+;; sign. This matches what the hardware would do for this architecture
+;; revision.
+; CORTEX-A17-NOFPU-FAST:  .eabi_attribute 20, 2
+; CORTEX-A17-NOFPU-FAST-NOT:  .eabi_attribute 21
+; CORTEX-A17-NOFPU-FAST-NOT:  .eabi_attribute 22
+; CORTEX-A17-NOFPU-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-M0:  .cpu cortex-m0
 ; CORTEX-M0:  .eabi_attribute 6, 12
 ; CORTEX-M0-NOT:  .eabi_attribute 7
 ; CORTEX-M0:  .eabi_attribute 8, 0
 ; CORTEX-M0:  .eabi_attribute 9, 1
+; CORTEX-M0-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-M0:  .eabi_attribute 20, 1
+; CORTEX-M0:  .eabi_attribute 21, 1
+; CORTEX-M0-NOT:  .eabi_attribute 22
+; CORTEX-M0:  .eabi_attribute 23, 3
 ; CORTEX-M0:  .eabi_attribute 24, 1
 ; CORTEX-M0:  .eabi_attribute 25, 1
 ; CORTEX-M0-NOT:  .eabi_attribute 27
 ; CORTEX-M0-NOT:  .eabi_attribute 28
 ; CORTEX-M0-NOT:  .eabi_attribute 36
+; CORTEX-M0:  .eabi_attribute 38, 1
 ; CORTEX-M0-NOT:  .eabi_attribute 42
+; CORTEX-M0-NOT:  .eabi_attribute 44
 ; CORTEX-M0-NOT:  .eabi_attribute 68
 
+; CORTEX-M0-FAST-NOT:   .eabi_attribute 19
+;; Despite the M0 CPU having no FPU in this scenario, we chose to
+;; flush to positive zero here. There's no hardware support doing
+;; this, but the fast maths software library might and such behaviour
+;; would match hardware support on this architecture revision if it
+;; existed.
+; CORTEX-M0-FAST-NOT:  .eabi_attribute 20
+; CORTEX-M0-FAST-NOT:  .eabi_attribute 21
+; CORTEX-M0-FAST-NOT:  .eabi_attribute 22
+; CORTEX-M0-FAST:  .eabi_attribute 23, 1
+
+; CORTEX-M0PLUS:  .cpu cortex-m0plus
+; CORTEX-M0PLUS:  .eabi_attribute 6, 12
+; CORTEX-M0PLUS-NOT:  .eabi_attribute 7
+; CORTEX-M0PLUS:  .eabi_attribute 8, 0
+; CORTEX-M0PLUS:  .eabi_attribute 9, 1
+; CORTEX-M0PLUS-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-M0PLUS:  .eabi_attribute 20, 1
+; CORTEX-M0PLUS:  .eabi_attribute 21, 1
+; CORTEX-M0PLUS-NOT:  .eabi_attribute 22
+; CORTEX-M0PLUS:  .eabi_attribute 23, 3
+; CORTEX-M0PLUS:  .eabi_attribute 24, 1
+; CORTEX-M0PLUS:  .eabi_attribute 25, 1
+; CORTEX-M0PLUS-NOT:  .eabi_attribute 27
+; CORTEX-M0PLUS-NOT:  .eabi_attribute 28
+; CORTEX-M0PLUS-NOT:  .eabi_attribute 36
+; CORTEX-M0PLUS:  .eabi_attribute 38, 1
+; CORTEX-M0PLUS-NOT:  .eabi_attribute 42
+; CORTEX-M0PLUS-NOT:  .eabi_attribute 44
+; CORTEX-M0PLUS-NOT:  .eabi_attribute 68
+
+; CORTEX-M0PLUS-FAST-NOT:   .eabi_attribute 19
+;; Despite the M0+ CPU having no FPU in this scenario, we chose to
+;; flush to positive zero here. There's no hardware support doing
+;; this, but the fast maths software library might and such behaviour
+;; would match hardware support on this architecture revision if it
+;; existed.
+; CORTEX-M0PLUS-FAST-NOT:  .eabi_attribute 20
+; CORTEX-M0PLUS-FAST-NOT:  .eabi_attribute 21
+; CORTEX-M0PLUS-FAST-NOT:  .eabi_attribute 22
+; CORTEX-M0PLUS-FAST:  .eabi_attribute 23, 1
+
+; CORTEX-M1:  .cpu cortex-m1
+; CORTEX-M1:  .eabi_attribute 6, 12
+; CORTEX-M1-NOT:  .eabi_attribute 7
+; CORTEX-M1:  .eabi_attribute 8, 0
+; CORTEX-M1:  .eabi_attribute 9, 1
+; CORTEX-M1-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-M1:  .eabi_attribute 20, 1
+; CORTEX-M1:  .eabi_attribute 21, 1
+; CORTEX-M1-NOT:  .eabi_attribute 22
+; CORTEX-M1:  .eabi_attribute 23, 3
+; CORTEX-M1:  .eabi_attribute 24, 1
+; CORTEX-M1:  .eabi_attribute 25, 1
+; CORTEX-M1-NOT:  .eabi_attribute 27
+; CORTEX-M1-NOT:  .eabi_attribute 28
+; CORTEX-M1-NOT:  .eabi_attribute 36
+; CORTEX-M1:  .eabi_attribute 38, 1
+; CORTEX-M1-NOT:  .eabi_attribute 42
+; CORTEX-M1-NOT:  .eabi_attribute 44
+; CORTEX-M1-NOT:  .eabi_attribute 68
+
+; CORTEX-M1-FAST-NOT:   .eabi_attribute 19
+;; Despite the M1 CPU having no FPU in this scenario, we chose to
+;; flush to positive zero here. There's no hardware support doing
+;; this, but the fast maths software library might and such behaviour
+;; would match hardware support on this architecture revision if it
+;; existed.
+; CORTEX-M1-FAST-NOT:  .eabi_attribute 20
+; CORTEX-M1-FAST-NOT:  .eabi_attribute 21
+; CORTEX-M1-FAST-NOT:  .eabi_attribute 22
+; CORTEX-M1-FAST:  .eabi_attribute 23, 1
+
+; SC000:  .cpu sc000
+; SC000:  .eabi_attribute 6, 12
+; SC000-NOT:  .eabi_attribute 7
+; SC000:  .eabi_attribute 8, 0
+; SC000:  .eabi_attribute 9, 1
+; SC000-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; SC000:  .eabi_attribute 20, 1
+; SC000:  .eabi_attribute 21, 1
+; SC000-NOT:  .eabi_attribute 22
+; SC000:  .eabi_attribute 23, 3
+; SC000:  .eabi_attribute 24, 1
+; SC000:  .eabi_attribute 25, 1
+; SC000-NOT:  .eabi_attribute 27
+; SC000-NOT:  .eabi_attribute 28
+; SC000-NOT:  .eabi_attribute 36
+; SC000:  .eabi_attribute 38, 1
+; SC000-NOT:  .eabi_attribute 42
+; SC000-NOT:  .eabi_attribute 44
+; SC000-NOT:  .eabi_attribute 68
+
+; SC000-FAST-NOT:   .eabi_attribute 19
+;; Despite the SC000 CPU having no FPU in this scenario, we chose to
+;; flush to positive zero here. There's no hardware support doing
+;; this, but the fast maths software library might and such behaviour
+;; would match hardware support on this architecture revision if it
+;; existed.
+; SC000-FAST-NOT:  .eabi_attribute 20
+; SC000-FAST-NOT:  .eabi_attribute 21
+; SC000-FAST-NOT:  .eabi_attribute 22
+; SC000-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-M3:  .cpu cortex-m3
 ; CORTEX-M3:  .eabi_attribute 6, 10
 ; CORTEX-M3:  .eabi_attribute 7, 77
 ; CORTEX-M3:  .eabi_attribute 8, 0
 ; CORTEX-M3:  .eabi_attribute 9, 2
+; CORTEX-M3-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-M3:  .eabi_attribute 20, 1
 ; CORTEX-M3:  .eabi_attribute 21, 1
+; CORTEX-M3-NOT:  .eabi_attribute 22
 ; CORTEX-M3:  .eabi_attribute 23, 3
 ; CORTEX-M3:  .eabi_attribute 24, 1
 ; CORTEX-M3:  .eabi_attribute 25, 1
 ; CORTEX-M3-NOT:  .eabi_attribute 27
 ; CORTEX-M3-NOT:  .eabi_attribute 28
 ; CORTEX-M3-NOT:  .eabi_attribute 36
+; CORTEX-M3:  .eabi_attribute 38, 1
 ; CORTEX-M3-NOT:  .eabi_attribute 42
 ; CORTEX-M3-NOT:  .eabi_attribute 44
 ; CORTEX-M3-NOT:  .eabi_attribute 68
 
+; CORTEX-M3-FAST-NOT:   .eabi_attribute 19
+;; Despite there being no FPU, we chose to flush to zero preserving
+;; sign. This matches what the hardware would do for this architecture
+;; revision.
+; CORTEX-M3-FAST:  .eabi_attribute 20, 2
+; CORTEX-M3-FAST-NOT:  .eabi_attribute 21
+; CORTEX-M3-FAST-NOT:  .eabi_attribute 22
+; CORTEX-M3-FAST:  .eabi_attribute 23, 1
+
+; SC300:  .cpu sc300
+; SC300:  .eabi_attribute 6, 10
+; SC300:  .eabi_attribute 7, 77
+; SC300:  .eabi_attribute 8, 0
+; SC300:  .eabi_attribute 9, 2
+; SC300-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; SC300:  .eabi_attribute 20, 1
+; SC300:  .eabi_attribute 21, 1
+; SC300-NOT:  .eabi_attribute 22
+; SC300:  .eabi_attribute 23, 3
+; SC300:  .eabi_attribute 24, 1
+; SC300:  .eabi_attribute 25, 1
+; SC300-NOT:  .eabi_attribute 27
+; SC300-NOT:  .eabi_attribute 28
+; SC300-NOT:  .eabi_attribute 36
+; SC300:  .eabi_attribute 38, 1
+; SC300-NOT:  .eabi_attribute 42
+; SC300-NOT:  .eabi_attribute 44
+; SC300-NOT:  .eabi_attribute 68
+
+; SC300-FAST-NOT:   .eabi_attribute 19
+;; Despite there being no FPU, we chose to flush to zero preserving
+;; sign. This matches what the hardware would do for this architecture
+;; revision.
+; SC300-FAST:  .eabi_attribute 20, 2
+; SC300-FAST-NOT:  .eabi_attribute 21
+; SC300-FAST-NOT:  .eabi_attribute 22
+; SC300-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-M4-SOFT:  .cpu cortex-m4
 ; CORTEX-M4-SOFT:  .eabi_attribute 6, 13
 ; CORTEX-M4-SOFT:  .eabi_attribute 7, 77
 ; CORTEX-M4-SOFT:  .eabi_attribute 8, 0
 ; CORTEX-M4-SOFT:  .eabi_attribute 9, 2
 ; CORTEX-M4-SOFT:  .fpu vfpv4-d16
+; CORTEX-M4-SOFT-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-M4-SOFT:  .eabi_attribute 20, 1
 ; CORTEX-M4-SOFT:  .eabi_attribute 21, 1
+; CORTEX-M4-SOFT-NOT:  .eabi_attribute 22
 ; CORTEX-M4-SOFT:  .eabi_attribute 23, 3
 ; CORTEX-M4-SOFT:  .eabi_attribute 24, 1
 ; CORTEX-M4-SOFT:  .eabi_attribute 25, 1
 ; CORTEX-M4-SOFT:  .eabi_attribute 27, 1
 ; CORTEX-M4-SOFT-NOT:  .eabi_attribute 28
 ; CORTEX-M4-SOFT:  .eabi_attribute 36, 1
+; CORTEX-M4-SOFT:  .eabi_attribute 38, 1
 ; CORTEX-M4-SOFT-NOT:  .eabi_attribute 42
 ; CORTEX-M4-SOFT-NOT:  .eabi_attribute 44
 ; CORTEX-M4-SOFT-NOT:  .eabi_attribute 68
 
+; CORTEX-M4-SOFT-FAST-NOT:   .eabi_attribute 19
+;; The M4 defaults to a VFPv4 FPU, so it flushes preseving sign when
+;; -ffast-math is specified.
+; CORTEX-M4-SOFT-FAST:  .eabi_attribute 20, 2
+; CORTEX-M4-SOFT-FAST-NOT:  .eabi_attribute 21
+; CORTEX-M4-SOFT-FAST-NOT:  .eabi_attribute 22
+; CORTEX-M4-SOFT-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-M4-HARD:  .cpu cortex-m4
 ; CORTEX-M4-HARD:  .eabi_attribute 6, 13
 ; CORTEX-M4-HARD:  .eabi_attribute 7, 77
 ; CORTEX-M4-HARD:  .eabi_attribute 8, 0
 ; CORTEX-M4-HARD:  .eabi_attribute 9, 2
 ; CORTEX-M4-HARD:  .fpu vfpv4-d16
+; CORTEX-M4-HARD-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-M4-HARD:  .eabi_attribute 20, 1
 ; CORTEX-M4-HARD:  .eabi_attribute 21, 1
+; CORTEX-M4-HARD-NOT:  .eabi_attribute 22
 ; CORTEX-M4-HARD:  .eabi_attribute 23, 3
 ; CORTEX-M4-HARD:  .eabi_attribute 24, 1
 ; CORTEX-M4-HARD:  .eabi_attribute 25, 1
 ; CORTEX-M4-HARD:  .eabi_attribute 27, 1
 ; CORTEX-M4-HARD:  .eabi_attribute 28, 1
 ; CORTEX-M4-HARD:  .eabi_attribute 36, 1
+; CORTEX-M4-HARD:  .eabi_attribute 38, 1
 ; CORTEX-M4-HARD-NOT:  .eabi_attribute 42
 ; CORTEX-M4-HARD-NOT:  .eabi_attribute 44
 ; CORTEX-M4-HARD-NOT:  .eabi_attribute 68
 
+; CORTEX-M4-HARD-FAST-NOT:   .eabi_attribute 19
+;; The M4 defaults to a VFPv4 FPU, so it flushes preseving sign when
+;; -ffast-math is specified.
+; CORTEX-M4-HARD-FAST:  .eabi_attribute 20, 2
+; CORTEX-M4-HARD-FAST-NOT:  .eabi_attribute 21
+; CORTEX-M4-HARD-FAST-NOT:  .eabi_attribute 22
+; CORTEX-M4-HARD-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-M7:  .cpu    cortex-m7
 ; CORTEX-M7:  .eabi_attribute 6, 13
 ; CORTEX-M7:  .eabi_attribute 7, 77
@@ -473,8 +973,11 @@
 ; CORTEX-M7-SINGLE:  .fpu fpv5-d16
 ; CORTEX-M7-DOUBLE:  .fpu fpv5-d16
 ; CORTEX-M7:  .eabi_attribute 17, 1
+; CORTEX-M7-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-M7:  .eabi_attribute 20, 1
 ; CORTEX-M7:  .eabi_attribute 21, 1
+; CORTEX-M7-NOT:  .eabi_attribute 22
 ; CORTEX-M7:  .eabi_attribute 23, 3
 ; CORTEX-M7:  .eabi_attribute 24, 1
 ; CORTEX-M7:  .eabi_attribute 25, 1
@@ -482,26 +985,79 @@
 ; CORTEX-M7-SINGLE:  .eabi_attribute 27, 1
 ; CORTEX-M7-DOUBLE-NOT: .eabi_attribute 27
 ; CORTEX-M7:  .eabi_attribute 36, 1
+; CORTEX-M7:  .eabi_attribute 38, 1
+; CORTEX-M7-NOT:  .eabi_attribute 44
 ; CORTEX-M7:  .eabi_attribute 14, 0
 
+; CORTEX-M7-NOFPU-FAST-NOT:   .eabi_attribute 19
+;; The M7 has the ARMv8 FP unit, which always flushes preserving sign.
+; CORTEX-M7-FAST:  .eabi_attribute 20, 2
+;; Despite there being no FPU, we chose to flush to zero preserving
+;; sign. This matches what the hardware would do for this architecture
+;; revision.
+; CORTEX-M7-NOFPU-FAST: .eabi_attribute 20, 2
+; CORTEX-M7-NOFPU-FAST-NOT:  .eabi_attribute 21
+; CORTEX-M7-NOFPU-FAST-NOT:  .eabi_attribute 22
+; CORTEX-M7-NOFPU-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-R5:  .cpu cortex-r5
 ; CORTEX-R5:  .eabi_attribute 6, 10
 ; CORTEX-R5:  .eabi_attribute 7, 82
 ; CORTEX-R5:  .eabi_attribute 8, 1
 ; CORTEX-R5:  .eabi_attribute 9, 2
 ; CORTEX-R5:  .fpu vfpv3-d16
+; CORTEX-R5-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
 ; CORTEX-R5:  .eabi_attribute 20, 1
 ; CORTEX-R5:  .eabi_attribute 21, 1
+; CORTEX-R5-NOT:  .eabi_attribute 22
 ; CORTEX-R5:  .eabi_attribute 23, 3
 ; CORTEX-R5:  .eabi_attribute 24, 1
 ; CORTEX-R5:  .eabi_attribute 25, 1
 ; CORTEX-R5:  .eabi_attribute 27, 1
 ; CORTEX-R5-NOT:  .eabi_attribute 28
 ; CORTEX-R5-NOT:  .eabi_attribute 36
+; CORTEX-R5:  .eabi_attribute 38, 1
 ; CORTEX-R5-NOT:  .eabi_attribute 42
 ; CORTEX-R5:  .eabi_attribute 44, 2
 ; CORTEX-R5-NOT:  .eabi_attribute 68
 
+; CORTEX-R5-FAST-NOT:   .eabi_attribute 19
+;; The R5 has the VFPv3 FP unit, which always flushes preserving sign.
+; CORTEX-R5-FAST:  .eabi_attribute 20, 2
+; CORTEX-R5-FAST-NOT:  .eabi_attribute 21
+; CORTEX-R5-FAST-NOT:  .eabi_attribute 22
+; CORTEX-R5-FAST:  .eabi_attribute 23, 1
+
+; CORTEX-R7:  .cpu cortex-r7
+; CORTEX-R7:  .eabi_attribute 6, 10
+; CORTEX-R7:  .eabi_attribute 7, 82
+; CORTEX-R7:  .eabi_attribute 8, 1
+; CORTEX-R7:  .eabi_attribute 9, 2
+; CORTEX-R7:  .fpu vfpv3-d16
+; CORTEX-R7-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-R7:  .eabi_attribute 20, 1
+; CORTEX-R7:  .eabi_attribute 21, 1
+; CORTEX-R7-NOT:  .eabi_attribute 22
+; CORTEX-R7:  .eabi_attribute 23, 3
+; CORTEX-R7:  .eabi_attribute 24, 1
+; CORTEX-R7:  .eabi_attribute 25, 1
+; CORTEX-R7:  .eabi_attribute 27, 1
+; CORTEX-R7-NOT:  .eabi_attribute 28
+; CORTEX-R7-NOT:  .eabi_attribute 36
+; CORTEX-R7:  .eabi_attribute 38, 1
+; CORTEX-R7:  .eabi_attribute 42, 1
+; CORTEX-R7:  .eabi_attribute 44, 2
+; CORTEX-R7-NOT:  .eabi_attribute 68
+
+; CORTEX-R7-FAST-NOT:   .eabi_attribute 19
+;; The R7 has the VFPv3 FP unit, which always flushes preserving sign.
+; CORTEX-R7-FAST:  .eabi_attribute 20, 2
+; CORTEX-R7-FAST-NOT:  .eabi_attribute 21
+; CORTEX-R7-FAST-NOT:  .eabi_attribute 22
+; CORTEX-R7-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-A53:  .cpu cortex-a53
 ; CORTEX-A53:  .eabi_attribute 6, 14
 ; CORTEX-A53:  .eabi_attribute 7, 65
@@ -509,15 +1065,29 @@
 ; CORTEX-A53:  .eabi_attribute 9, 2
 ; CORTEX-A53:  .fpu crypto-neon-fp-armv8
 ; CORTEX-A53:  .eabi_attribute 12, 3
+; CORTEX-A53-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-A53:  .eabi_attribute 20, 1
+; CORTEX-A53:  .eabi_attribute 21, 1
+; CORTEX-A53-NOT:  .eabi_attribute 22
+; CORTEX-A53:  .eabi_attribute 23, 3
 ; CORTEX-A53:  .eabi_attribute 24, 1
 ; CORTEX-A53:  .eabi_attribute 25, 1
 ; CORTEX-A53-NOT:  .eabi_attribute 27
 ; CORTEX-A53-NOT:  .eabi_attribute 28
 ; CORTEX-A53:  .eabi_attribute 36, 1
+; CORTEX-A53:  .eabi_attribute 38, 1
 ; CORTEX-A53:  .eabi_attribute 42, 1
 ; CORTEX-A53-NOT:  .eabi_attribute 44
 ; CORTEX-A53:  .eabi_attribute 68, 3
 
+; CORTEX-A53-FAST-NOT:   .eabi_attribute 19
+;; The A53 has the ARMv8 FP unit, which always flushes preserving sign.
+; CORTEX-A53-FAST:  .eabi_attribute 20, 2
+; CORTEX-A53-FAST-NOT:  .eabi_attribute 21
+; CORTEX-A53-FAST-NOT:  .eabi_attribute 22
+; CORTEX-A53-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-A57:  .cpu cortex-a57
 ; CORTEX-A57:  .eabi_attribute 6, 14
 ; CORTEX-A57:  .eabi_attribute 7, 65
@@ -525,15 +1095,59 @@
 ; CORTEX-A57:  .eabi_attribute 9, 2
 ; CORTEX-A57:  .fpu crypto-neon-fp-armv8
 ; CORTEX-A57:  .eabi_attribute 12, 3
+; CORTEX-A57-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-A57:  .eabi_attribute 20, 1
+; CORTEX-A57:  .eabi_attribute 21, 1
+; CORTEX-A57-NOT:  .eabi_attribute 22
+; CORTEX-A57:  .eabi_attribute 23, 3
 ; CORTEX-A57:  .eabi_attribute 24, 1
 ; CORTEX-A57:  .eabi_attribute 25, 1
 ; CORTEX-A57-NOT:  .eabi_attribute 27
 ; CORTEX-A57-NOT:  .eabi_attribute 28
 ; CORTEX-A57:  .eabi_attribute 36, 1
+; CORTEX-A57:  .eabi_attribute 38, 1
 ; CORTEX-A57:  .eabi_attribute 42, 1
 ; CORTEX-A57-NOT:  .eabi_attribute 44
 ; CORTEX-A57:  .eabi_attribute 68, 3
 
+; CORTEX-A57-FAST-NOT:   .eabi_attribute 19
+;; The A57 has the ARMv8 FP unit, which always flushes preserving sign.
+; CORTEX-A57-FAST:  .eabi_attribute 20, 2
+; CORTEX-A57-FAST-NOT:  .eabi_attribute 21
+; CORTEX-A57-FAST-NOT:  .eabi_attribute 22
+; CORTEX-A57-FAST:  .eabi_attribute 23, 1
+
+; CORTEX-A72:  .cpu cortex-a72
+; CORTEX-A72:  .eabi_attribute 6, 14
+; CORTEX-A72:  .eabi_attribute 7, 65
+; CORTEX-A72:  .eabi_attribute 8, 1
+; CORTEX-A72:  .eabi_attribute 9, 2
+; CORTEX-A72:  .fpu crypto-neon-fp-armv8
+; CORTEX-A72:  .eabi_attribute 12, 3
+; CORTEX-A72-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-A72:  .eabi_attribute 20, 1
+; CORTEX-A72:  .eabi_attribute 21, 1
+; CORTEX-A72-NOT:  .eabi_attribute 22
+; CORTEX-A72:  .eabi_attribute 23, 3
+; CORTEX-A72:  .eabi_attribute 24, 1
+; CORTEX-A72:  .eabi_attribute 25, 1
+; CORTEX-A72-NOT:  .eabi_attribute 27
+; CORTEX-A72-NOT:  .eabi_attribute 28
+; CORTEX-A72:  .eabi_attribute 36, 1
+; CORTEX-A72:  .eabi_attribute 38, 1
+; CORTEX-A72:  .eabi_attribute 42, 1
+; CORTEX-A72-NOT:  .eabi_attribute 44
+; CORTEX-A72:  .eabi_attribute 68, 3
+
+; CORTEX-A72-FAST-NOT:   .eabi_attribute 19
+;; The A72 has the ARMv8 FP unit, which always flushes preserving sign.
+; CORTEX-A72-FAST:  .eabi_attribute 20, 2
+; CORTEX-A72-FAST-NOT:  .eabi_attribute 21
+; CORTEX-A72-FAST-NOT:  .eabi_attribute 22
+; CORTEX-A72-FAST:  .eabi_attribute 23, 1
+
 ; RELOC-PIC:  .eabi_attribute 15, 1
 ; RELOC-PIC:  .eabi_attribute 16, 1
 ; RELOC-PIC:  .eabi_attribute 17, 2
@@ -543,5 +1157,5 @@
 ; PCS-R9-RESERVE:  .eabi_attribute 14, 3
 
 define i32 @f(i64 %z) {
-	ret i32 0
+    ret i32 0
 }
diff --git a/test/CodeGen/ARM/coalesce-dbgvalue.ll b/test/CodeGen/ARM/coalesce-dbgvalue.ll
index 47d81a6..4e5fb5e 100644
--- a/test/CodeGen/ARM/coalesce-dbgvalue.ll
+++ b/test/CodeGen/ARM/coalesce-dbgvalue.ll
@@ -27,11 +27,11 @@ for.cond1:                                        ; preds = %for.end9, %for.cond
 
 for.body2:                                        ; preds = %for.cond1
   store i32 %storemerge11, i32* @b, align 4, !dbg !26
-  tail call void @llvm.dbg.value(metadata !27, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !28
+  tail call void @llvm.dbg.value(metadata i32* null, i64 0, metadata !11, metadata !{!"0x102"}), !dbg !28
   %0 = load i64* @a, align 8, !dbg !29
   %xor = xor i64 %0, %e.1.ph, !dbg !29
   %conv3 = trunc i64 %xor to i32, !dbg !29
-  tail call void @llvm.dbg.value(metadata !{i32 %conv3}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !29
+  tail call void @llvm.dbg.value(metadata i32 %conv3, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !29
   %tobool4 = icmp eq i32 %conv3, 0, !dbg !29
   br i1 %tobool4, label %land.end, label %land.rhs, !dbg !29
 
@@ -79,33 +79,33 @@ attributes #3 = { nounwind }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 182024) (llvm/trunk 182023)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !15, metadata !2} ; [ DW_TAG_compile_unit ] [/d/b/pr16110.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"pr16110.c", metadata !"/d/b"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00pr16110\00pr16110\00\007\000\001\000\006\000\001\007", metadata !1, metadata !5, metadata !6, null, i32 ()* @pr16110, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 7] [def] [pr16110]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/d/b/pr16110.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10, metadata !11}
-!10 = metadata !{metadata !"0x100\00e\008\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [e] [line 8]
-!11 = metadata !{metadata !"0x100\00f\0013\000", metadata !12, metadata !5, metadata !14} ; [ DW_TAG_auto_variable ] [f] [line 13]
-!12 = metadata !{metadata !"0xb\0012\000\002", metadata !1, metadata !13} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
-!13 = metadata !{metadata !"0xb\0012\000\001", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
-!14 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
-!15 = metadata !{metadata !16, metadata !18, metadata !19, metadata !20}
-!16 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !5, metadata !17, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
-!17 = metadata !{metadata !"0x24\00long long int\000\0064\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [long long int] [line 0, size 64, align 32, offset 0, enc DW_ATE_signed]
-!18 = metadata !{metadata !"0x34\00b\00b\00\002\000\001", null, metadata !5, metadata !8, i32* @b, null} ; [ DW_TAG_variable ] [b] [line 2] [def]
-!19 = metadata !{metadata !"0x34\00c\00c\00\003\000\001", null, metadata !5, metadata !8, i32* @c, null} ; [ DW_TAG_variable ] [c] [line 3] [def]
-!20 = metadata !{metadata !"0x34\00d\00d\00\004\000\001", null, metadata !5, metadata !8, i32* @d, null} ; [ DW_TAG_variable ] [d] [line 4] [def]
-!21 = metadata !{i32 10, i32 0, metadata !22, null}
-!22 = metadata !{metadata !"0xb\0010\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
-!26 = metadata !{i32 12, i32 0, metadata !13, null}
-!27 = metadata !{i32* null}
-!28 = metadata !{i32 13, i32 0, metadata !12, null}
-!29 = metadata !{i32 14, i32 0, metadata !12, null}
-!31 = metadata !{i32 16, i32 0, metadata !4, null}
-!32 = metadata !{i32 18, i32 0, metadata !4, null}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 (trunk 182024) (llvm/trunk 182023)\001\00\000\00\000", !1, !2, !2, !3, !15, !2} ; [ DW_TAG_compile_unit ] [/d/b/pr16110.c] [DW_LANG_C99]
+!1 = !{!"pr16110.c", !"/d/b"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00pr16110\00pr16110\00\007\000\001\000\006\000\001\007", !1, !5, !6, null, i32 ()* @pr16110, null, null, !9} ; [ DW_TAG_subprogram ] [line 7] [def] [pr16110]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/d/b/pr16110.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10, !11}
+!10 = !{!"0x100\00e\008\000", !4, !5, !8} ; [ DW_TAG_auto_variable ] [e] [line 8]
+!11 = !{!"0x100\00f\0013\000", !12, !5, !14} ; [ DW_TAG_auto_variable ] [f] [line 13]
+!12 = !{!"0xb\0012\000\002", !1, !13} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
+!13 = !{!"0xb\0012\000\001", !1, !4} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
+!14 = !{!"0xf\00\000\0032\0032\000\000", null, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
+!15 = !{!16, !18, !19, !20}
+!16 = !{!"0x34\00a\00a\00\001\000\001", null, !5, !17, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!17 = !{!"0x24\00long long int\000\0064\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [long long int] [line 0, size 64, align 32, offset 0, enc DW_ATE_signed]
+!18 = !{!"0x34\00b\00b\00\002\000\001", null, !5, !8, i32* @b, null} ; [ DW_TAG_variable ] [b] [line 2] [def]
+!19 = !{!"0x34\00c\00c\00\003\000\001", null, !5, !8, i32* @c, null} ; [ DW_TAG_variable ] [c] [line 3] [def]
+!20 = !{!"0x34\00d\00d\00\004\000\001", null, !5, !8, i32* @d, null} ; [ DW_TAG_variable ] [d] [line 4] [def]
+!21 = !MDLocation(line: 10, scope: !22)
+!22 = !{!"0xb\0010\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
+!26 = !MDLocation(line: 12, scope: !13)
+!27 = !{i32* null}
+!28 = !MDLocation(line: 13, scope: !12)
+!29 = !MDLocation(line: 14, scope: !12)
+!31 = !MDLocation(line: 16, scope: !4)
+!32 = !MDLocation(line: 18, scope: !4)
+!33 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/coalesce-subregs.ll b/test/CodeGen/ARM/coalesce-subregs.ll
index e7bd5f4..e4e3315 100644
--- a/test/CodeGen/ARM/coalesce-subregs.ll
+++ b/test/CodeGen/ARM/coalesce-subregs.ll
@@ -293,7 +293,6 @@ bb:
 ; CHECK: adjustCopiesBackFrom
 ; The shuffle in if.else3 must be preserved even though adjustCopiesBackFrom
 ; is tempted to remove it.
-; CHECK: %if.else3
 ; CHECK: vorr d
 define internal void @adjustCopiesBackFrom(<2 x i64>* noalias nocapture sret %agg.result, <2 x i64> %in) {
 entry:
diff --git a/test/CodeGen/ARM/crc32.ll b/test/CodeGen/ARM/crc32.ll
new file mode 100644
index 0000000..cc94330
--- /dev/null
+++ b/test/CodeGen/ARM/crc32.ll
@@ -0,0 +1,58 @@
+; RUN: llc -mtriple=thumbv8 -o - %s | FileCheck %s
+
+define i32 @test_crc32b(i32 %cur, i8 %next) {
+; CHECK-LABEL: test_crc32b:
+; CHECK: crc32b r0, r0, r1
+  %bits = zext i8 %next to i32
+  %val = call i32 @llvm.arm.crc32b(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32h(i32 %cur, i16 %next) {
+; CHECK-LABEL: test_crc32h:
+; CHECK: crc32h r0, r0, r1
+  %bits = zext i16 %next to i32
+  %val = call i32 @llvm.arm.crc32h(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32w(i32 %cur, i32 %next) {
+; CHECK-LABEL: test_crc32w:
+; CHECK: crc32w r0, r0, r1
+  %val = call i32 @llvm.arm.crc32w(i32 %cur, i32 %next)
+  ret i32 %val
+}
+
+define i32 @test_crc32cb(i32 %cur, i8 %next) {
+; CHECK-LABEL: test_crc32cb:
+; CHECK: crc32cb r0, r0, r1
+  %bits = zext i8 %next to i32
+  %val = call i32 @llvm.arm.crc32cb(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32ch(i32 %cur, i16 %next) {
+; CHECK-LABEL: test_crc32ch:
+; CHECK: crc32ch r0, r0, r1
+  %bits = zext i16 %next to i32
+  %val = call i32 @llvm.arm.crc32ch(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32cw(i32 %cur, i32 %next) {
+; CHECK-LABEL: test_crc32cw:
+; CHECK: crc32cw r0, r0, r1
+  %val = call i32 @llvm.arm.crc32cw(i32 %cur, i32 %next)
+  ret i32 %val
+}
+
+
+declare i32 @llvm.arm.crc32b(i32, i32)
+declare i32 @llvm.arm.crc32h(i32, i32)
+declare i32 @llvm.arm.crc32w(i32, i32)
+declare i32 @llvm.arm.crc32x(i32, i64)
+
+declare i32 @llvm.arm.crc32cb(i32, i32)
+declare i32 @llvm.arm.crc32ch(i32, i32)
+declare i32 @llvm.arm.crc32cw(i32, i32)
+declare i32 @llvm.arm.crc32cx(i32, i64)
diff --git a/test/CodeGen/ARM/cse-ldrlit.ll b/test/CodeGen/ARM/cse-ldrlit.ll
index ea8c0ca..3f5d4c2 100644
--- a/test/CodeGen/ARM/cse-ldrlit.ll
+++ b/test/CodeGen/ARM/cse-ldrlit.ll
@@ -33,8 +33,8 @@ false:
 ; CHECK-ARM-PIC-LABEL: foo:
 ; CHECK-ARM-PIC: ldr [[VAR_OFFSET:r[0-9]+]], LCPI0_0
 ; CHECK-ARM-PIC: LPC0_0:
-; CHECK-ARM-PIC-NEXT: ldr r0, [pc, [[VAR_OFFSET]]]
-; CHECK-ARM-PIC: ldr {{r[1-9][0-9]?}}, [r0, #4]
+; CHECK-ARM-PIC-NEXT: add r0, pc, [[VAR_OFFSET]]
+; CHECK-ARM-PIC: ldr {{r[0-9]+}}, [r0, #4]
 
 ; CHECK-ARM-PIC: LCPI0_0:
 ; CHECK-ARM-PIC-NEXT: .long _var-(LPC0_0+8)
diff --git a/test/CodeGen/ARM/cse-libcalls.ll b/test/CodeGen/ARM/cse-libcalls.ll
index 62b9e43..4f5b759 100644
--- a/test/CodeGen/ARM/cse-libcalls.ll
+++ b/test/CodeGen/ARM/cse-libcalls.ll
@@ -1,9 +1,13 @@
-; RUN: llc < %s -march=arm | grep "bl.*__ltdf" | count 1
+; RUN: llc < %s -march=arm | FileCheck %s
+
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin8"
 
 ; Without CSE of libcalls, there are two calls in the output instead of one.
 
+; CHECK: bl ___ltdf
+; CHECK-NOT: bl ___ltdf
+
 define double @u_f_nonbon(double %lambda) nounwind {
 entry:
 	%tmp19.i.i = load double* null, align 4		; <double> [#uses=2]
diff --git a/test/CodeGen/ARM/dagcombine-concatvector.ll b/test/CodeGen/ARM/dagcombine-concatvector.ll
index 62ed87f..80ef2ab 100644
--- a/test/CodeGen/ARM/dagcombine-concatvector.ll
+++ b/test/CodeGen/ARM/dagcombine-concatvector.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=thumbv7s-apple-ios3.0.0 -mcpu=generic | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
-; RUN: llc < %s -mtriple=thumbeb -mattr=v7,neon | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
+; RUN: llc < %s -mtriple=thumbeb -target-abi apcs -mattr=v7,neon | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
 
 ; PR15525
 ; CHECK-LABEL: test1:
diff --git a/test/CodeGen/ARM/debug-frame-vararg.ll b/test/CodeGen/ARM/debug-frame-vararg.ll
index ffc1a6a..65be2db 100644
--- a/test/CodeGen/ARM/debug-frame-vararg.ll
+++ b/test/CodeGen/ARM/debug-frame-vararg.ll
@@ -25,40 +25,40 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"var.c", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00sum\00sum\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, i32 (i32, ...)* @sum, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.5 "}
-!12 = metadata !{metadata !"0x101\00count\0016777221\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [count] [line 5]
-!13 = metadata !{i32 5, i32 0, metadata !4, null}
-!14 = metadata !{metadata !"0x100\00vl\006\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [vl] [line 6]
-!15 = metadata !{metadata !"0x16\00va_list\0030\000\000\000\000", metadata !16, null, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
-!16 = metadata !{metadata !"/linux-x86_64-high/gcc_4.7.2/dbg/llvm/bin/../lib/clang/3.5/include/stdarg.h", metadata !"/tmp"}
-!17 = metadata !{metadata !"0x16\00__builtin_va_list\006\000\000\000\000", metadata !1, null, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
-!18 = metadata !{metadata !"0x13\00__va_list\006\0032\0032\000\000\000", metadata !1, null, null, metadata !19, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
-!19 = metadata !{metadata !20}
-!20 = metadata !{metadata !"0xd\00__ap\006\0032\0032\000\000", metadata !1, metadata !18, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
-!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
-!22 = metadata !{i32 6, i32 0, metadata !4, null}
-!23 = metadata !{i32 7, i32 0, metadata !4, null}
-!24 = metadata !{metadata !"0x100\00sum\008\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [sum] [line 8]
-!25 = metadata !{i32 8, i32 0, metadata !4, null}
-!26 = metadata !{metadata !"0x100\00i\009\000", metadata !27, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 9]
-!27 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
-!28 = metadata !{i32 9, i32 0, metadata !27, null}
-!29 = metadata !{i32 10, i32 0, metadata !30, null}
-!30 = metadata !{metadata !"0xb\009\000\001", metadata !1, metadata !27} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
-!31 = metadata !{i32 11, i32 0, metadata !30, null}
-!32 = metadata !{i32 12, i32 0, metadata !4, null}
-!33 = metadata !{i32 13, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
+!1 = !{!"var.c", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00sum\00sum\00\005\000\001\000\006\00256\000\005", !1, !5, !6, null, i32 (i32, ...)* @sum, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.5 "}
+!12 = !{!"0x101\00count\0016777221\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [count] [line 5]
+!13 = !MDLocation(line: 5, scope: !4)
+!14 = !{!"0x100\00vl\006\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [vl] [line 6]
+!15 = !{!"0x16\00va_list\0030\000\000\000\000", !16, null, !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
+!16 = !{!"/linux-x86_64-high/gcc_4.7.2/dbg/llvm/bin/../lib/clang/3.5/include/stdarg.h", !"/tmp"}
+!17 = !{!"0x16\00__builtin_va_list\006\000\000\000\000", !1, null, !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
+!18 = !{!"0x13\00__va_list\006\0032\0032\000\000\000", !1, null, null, !19, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
+!19 = !{!20}
+!20 = !{!"0xd\00__ap\006\0032\0032\000\000", !1, !18, !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
+!21 = !{!"0xf\00\000\0032\0032\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
+!22 = !MDLocation(line: 6, scope: !4)
+!23 = !MDLocation(line: 7, scope: !4)
+!24 = !{!"0x100\00sum\008\000", !4, !5, !8} ; [ DW_TAG_auto_variable ] [sum] [line 8]
+!25 = !MDLocation(line: 8, scope: !4)
+!26 = !{!"0x100\00i\009\000", !27, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 9]
+!27 = !{!"0xb\009\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!28 = !MDLocation(line: 9, scope: !27)
+!29 = !MDLocation(line: 10, scope: !30)
+!30 = !{!"0xb\009\000\001", !1, !27} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!31 = !MDLocation(line: 11, scope: !30)
+!32 = !MDLocation(line: 12, scope: !4)
+!33 = !MDLocation(line: 13, scope: !4)
 
 ; CHECK-FP-LABEL: sum
 ; CHECK-FP: .cfi_startproc
@@ -88,24 +88,22 @@
 ; CHECK-THUMB-FP: .cfi_startproc
 ; CHECK-THUMB-FP: sub    sp, #16
 ; CHECK-THUMB-FP: .cfi_def_cfa_offset 16
-; CHECK-THUMB-FP: push   {r4, r5, r7, lr}
-; CHECK-THUMB-FP: .cfi_def_cfa_offset 32
+; CHECK-THUMB-FP: push   {r4, lr}
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 24
 ; CHECK-THUMB-FP: .cfi_offset lr, -20
-; CHECK-THUMB-FP: .cfi_offset r7, -24
-; CHECK-THUMB-FP: .cfi_offset r5, -28
-; CHECK-THUMB-FP: .cfi_offset r4, -32
+; CHECK-THUMB-FP: .cfi_offset r4, -24
 ; CHECK-THUMB-FP: sub    sp, #8
-; CHECK-THUMB-FP: .cfi_def_cfa_offset 40
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 32
 
 ; CHECK-THUMB-FP-ELIM-LABEL: sum
 ; CHECK-THUMB-FP-ELIM: .cfi_startproc
 ; CHECK-THUMB-FP-ELIM: sub    sp, #16
 ; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 16
-; CHECK-THUMB-FP-ELIM: push   {r4, r5, r7, lr}
+; CHECK-THUMB-FP-ELIM: push   {r4, r6, r7, lr}
 ; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 32
 ; CHECK-THUMB-FP-ELIM: .cfi_offset lr, -20
 ; CHECK-THUMB-FP-ELIM: .cfi_offset r7, -24
-; CHECK-THUMB-FP-ELIM: .cfi_offset r5, -28
+; CHECK-THUMB-FP-ELIM: .cfi_offset r6, -28
 ; CHECK-THUMB-FP-ELIM: .cfi_offset r4, -32
 ; CHECK-THUMB-FP-ELIM: add    r7, sp, #8
 ; CHECK-THUMB-FP-ELIM: .cfi_def_cfa r7, 24
diff --git a/test/CodeGen/ARM/debug-frame.ll b/test/CodeGen/ARM/debug-frame.ll
index c6243ec..16e2c4c 100644
--- a/test/CodeGen/ARM/debug-frame.ll
+++ b/test/CodeGen/ARM/debug-frame.ll
@@ -128,41 +128,41 @@ declare void @_ZSt9terminatev()
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/exp.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"exp.cpp", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00test\00test\00_Z4testiiiiiddddd\004\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, void (i32, i32, i32, i32, i32, double, double, double, double, double)* @_Z4testiiiiiddddd, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 5] [test]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/exp.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8, metadata !8, metadata !8, metadata !8, metadata !8, metadata !9, metadata !9, metadata !9, metadata !9, metadata !9}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!12 = metadata !{metadata !"clang version 3.5 "}
-!13 = metadata !{metadata !"0x101\00a\0016777220\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
-!14 = metadata !{i32 4, i32 0, metadata !4, null}
-!15 = metadata !{metadata !"0x101\00b\0033554436\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [b] [line 4]
-!16 = metadata !{metadata !"0x101\00c\0050331652\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [c] [line 4]
-!17 = metadata !{metadata !"0x101\00d\0067108868\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [d] [line 4]
-!18 = metadata !{metadata !"0x101\00e\0083886084\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [e] [line 4]
-!19 = metadata !{metadata !"0x101\00m\00100663301\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [m] [line 5]
-!20 = metadata !{i32 5, i32 0, metadata !4, null}
-!21 = metadata !{metadata !"0x101\00n\00117440517\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [n] [line 5]
-!22 = metadata !{metadata !"0x101\00p\00134217733\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [p] [line 5]
-!23 = metadata !{metadata !"0x101\00q\00150994949\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [q] [line 5]
-!24 = metadata !{metadata !"0x101\00r\00167772165\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [r] [line 5]
-!25 = metadata !{i32 7, i32 0, metadata !26, null}
-!26 = metadata !{metadata !"0xb\006\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
-!27 = metadata !{i32 8, i32 0, metadata !26, null}
-!28 = metadata !{i32 11, i32 0, metadata !26, null}
-!29 = metadata !{i32 9, i32 0, metadata !30, null}
-!30 = metadata !{metadata !"0xb\008\000\001", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
-!31 = metadata !{i32 10, i32 0, metadata !30, null}
-!32 = metadata !{i32 10, i32 0, metadata !4, null}
-!33 = metadata !{i32 11, i32 0, metadata !4, null}
-!34 = metadata !{i32 11, i32 0, metadata !30, null}
+!0 = !{!"0x11\004\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/exp.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"exp.cpp", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00test\00test\00_Z4testiiiiiddddd\004\000\001\000\006\00256\000\005", !1, !5, !6, null, void (i32, i32, i32, i32, i32, double, double, double, double, double)* @_Z4testiiiiiddddd, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 5] [test]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/exp.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8, !8, !8, !8, !8, !9, !9, !9, !9, !9}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 1, !"Debug Info Version", i32 2}
+!12 = !{!"clang version 3.5 "}
+!13 = !{!"0x101\00a\0016777220\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!14 = !MDLocation(line: 4, scope: !4)
+!15 = !{!"0x101\00b\0033554436\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [b] [line 4]
+!16 = !{!"0x101\00c\0050331652\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [c] [line 4]
+!17 = !{!"0x101\00d\0067108868\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [d] [line 4]
+!18 = !{!"0x101\00e\0083886084\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [e] [line 4]
+!19 = !{!"0x101\00m\00100663301\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [m] [line 5]
+!20 = !MDLocation(line: 5, scope: !4)
+!21 = !{!"0x101\00n\00117440517\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [n] [line 5]
+!22 = !{!"0x101\00p\00134217733\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [p] [line 5]
+!23 = !{!"0x101\00q\00150994949\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [q] [line 5]
+!24 = !{!"0x101\00r\00167772165\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [r] [line 5]
+!25 = !MDLocation(line: 7, scope: !26)
+!26 = !{!"0xb\006\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
+!27 = !MDLocation(line: 8, scope: !26)
+!28 = !MDLocation(line: 11, scope: !26)
+!29 = !MDLocation(line: 9, scope: !30)
+!30 = !{!"0xb\008\000\001", !1, !4} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
+!31 = !MDLocation(line: 10, scope: !30)
+!32 = !MDLocation(line: 10, scope: !4)
+!33 = !MDLocation(line: 11, scope: !4)
+!34 = !MDLocation(line: 11, scope: !30)
 
 ; CHECK-FP-LABEL: _Z4testiiiiiddddd:
 ; CHECK-FP:   .cfi_startproc
diff --git a/test/CodeGen/ARM/debug-info-arg.ll b/test/CodeGen/ARM/debug-info-arg.ll
index 34e9938..8679589 100644
--- a/test/CodeGen/ARM/debug-info-arg.ll
+++ b/test/CodeGen/ARM/debug-info-arg.ll
@@ -7,13 +7,13 @@ target triple = "thumbv7-apple-ios"
 %struct.tag_s = type { i32, i32, i32 }
 
 define void @foo(%struct.tag_s* nocapture %this, %struct.tag_s* %c, i64 %x, i64 %y, %struct.tag_s* nocapture %ptr1, %struct.tag_s* nocapture %ptr2) nounwind ssp {
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %this}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %c}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !21
-  tail call void @llvm.dbg.value(metadata !{i64 %x}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !22
-  tail call void @llvm.dbg.value(metadata !{i64 %y}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !23
+  tail call void @llvm.dbg.value(metadata %struct.tag_s* %this, i64 0, metadata !5, metadata !{!"0x102"}), !dbg !20
+  tail call void @llvm.dbg.value(metadata %struct.tag_s* %c, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata i64 %x, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !22
+  tail call void @llvm.dbg.value(metadata i64 %y, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !23
 ;CHECK:	@DEBUG_VALUE: foo:y <- [R7+8]
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %ptr1}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !24
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %ptr2}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !25
+  tail call void @llvm.dbg.value(metadata %struct.tag_s* %ptr1, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !24
+  tail call void @llvm.dbg.value(metadata %struct.tag_s* %ptr2, i64 0, metadata !19, metadata !{!"0x102"}), !dbg !25
   %1 = icmp eq %struct.tag_s* %c, null, !dbg !26
   br i1 %1, label %3, label %2, !dbg !26
 
@@ -32,37 +32,37 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)\001\00\000\00\001", metadata !32, metadata !4, metadata !4, metadata !30, null,  null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00\0011\000\001\000\006\00256\001\0011", metadata !2, metadata !2, metadata !3, null, void (%struct.tag_s*, %struct.tag_s*, i64, i64, %struct.tag_s*, %struct.tag_s*)* @foo, null, null, metadata !31} ; [ DW_TAG_subprogram ] [line 11] [def] [foo]
-!2 = metadata !{metadata !"0x29", metadata !32} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !32, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{metadata !"0x101\00this\0016777227\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!6 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{metadata !"0x13\00tag_s\005\0096\0032\000\000\000", metadata !32, metadata !0, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [tag_s] [line 5, size 96, align 32, offset 0] [def] [from ]
-!8 = metadata !{metadata !9, metadata !11, metadata !12}
-!9 = metadata !{metadata !"0xd\00x\006\0032\0032\000\000", metadata !32, metadata !7, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!11 = metadata !{metadata !"0xd\00y\007\0032\0032\0032\000", metadata !32, metadata !7, metadata !10} ; [ DW_TAG_member ]
-!12 = metadata !{metadata !"0xd\00z\008\0032\0032\0064\000", metadata !32, metadata !7, metadata !10} ; [ DW_TAG_member ]
-!13 = metadata !{metadata !"0x101\00c\0033554443\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!14 = metadata !{metadata !"0x101\00x\0050331659\000", metadata !1, metadata !2, metadata !15} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{metadata !"0x16\00UInt64\001\000\000\000\000", metadata !32, metadata !0, metadata !16} ; [ DW_TAG_typedef ]
-!16 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
-!17 = metadata !{metadata !"0x101\00y\0067108875\000", metadata !1, metadata !2, metadata !15} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{metadata !"0x101\00ptr1\0083886091\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{metadata !"0x101\00ptr2\00100663307\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{i32 11, i32 24, metadata !1, null}
-!21 = metadata !{i32 11, i32 44, metadata !1, null}
-!22 = metadata !{i32 11, i32 54, metadata !1, null}
-!23 = metadata !{i32 11, i32 64, metadata !1, null}
-!24 = metadata !{i32 11, i32 81, metadata !1, null}
-!25 = metadata !{i32 11, i32 101, metadata !1, null}
-!26 = metadata !{i32 12, i32 3, metadata !27, null}
-!27 = metadata !{metadata !"0xb\0011\00107\000", metadata !2, metadata !1} ; [ DW_TAG_lexical_block ]
-!28 = metadata !{i32 13, i32 5, metadata !27, null}
-!29 = metadata !{i32 14, i32 1, metadata !27, null}
-!30 = metadata !{metadata !1}
-!31 = metadata !{metadata !5, metadata !13, metadata !14, metadata !17, metadata !18, metadata!19}
-!32 = metadata !{metadata !"one.c", metadata !"/Volumes/Athwagate/R10048772"}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)\001\00\000\00\001", !32, !4, !4, !30, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00foo\00foo\00\0011\000\001\000\006\00256\001\0011", !2, !2, !3, null, void (%struct.tag_s*, %struct.tag_s*, i64, i64, %struct.tag_s*, %struct.tag_s*)* @foo, null, null, !31} ; [ DW_TAG_subprogram ] [line 11] [def] [foo]
+!2 = !{!"0x29", !32} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !32, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!5 = !{!"0x101\00this\0016777227\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!6 = !{!"0xf\00\000\0032\0032\000\000", null, !0, !7} ; [ DW_TAG_pointer_type ]
+!7 = !{!"0x13\00tag_s\005\0096\0032\000\000\000", !32, !0, null, !8, null, null, null} ; [ DW_TAG_structure_type ] [tag_s] [line 5, size 96, align 32, offset 0] [def] [from ]
+!8 = !{!9, !11, !12}
+!9 = !{!"0xd\00x\006\0032\0032\000\000", !32, !7, !10} ; [ DW_TAG_member ]
+!10 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!11 = !{!"0xd\00y\007\0032\0032\0032\000", !32, !7, !10} ; [ DW_TAG_member ]
+!12 = !{!"0xd\00z\008\0032\0032\0064\000", !32, !7, !10} ; [ DW_TAG_member ]
+!13 = !{!"0x101\00c\0033554443\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!14 = !{!"0x101\00x\0050331659\000", !1, !2, !15} ; [ DW_TAG_arg_variable ]
+!15 = !{!"0x16\00UInt64\001\000\000\000\000", !32, !0, !16} ; [ DW_TAG_typedef ]
+!16 = !{!"0x24\00long long unsigned int\000\0064\0032\000\000\007", null, !0} ; [ DW_TAG_base_type ]
+!17 = !{!"0x101\00y\0067108875\000", !1, !2, !15} ; [ DW_TAG_arg_variable ]
+!18 = !{!"0x101\00ptr1\0083886091\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!19 = !{!"0x101\00ptr2\00100663307\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!20 = !MDLocation(line: 11, column: 24, scope: !1)
+!21 = !MDLocation(line: 11, column: 44, scope: !1)
+!22 = !MDLocation(line: 11, column: 54, scope: !1)
+!23 = !MDLocation(line: 11, column: 64, scope: !1)
+!24 = !MDLocation(line: 11, column: 81, scope: !1)
+!25 = !MDLocation(line: 11, column: 101, scope: !1)
+!26 = !MDLocation(line: 12, column: 3, scope: !27)
+!27 = !{!"0xb\0011\00107\000", !2, !1} ; [ DW_TAG_lexical_block ]
+!28 = !MDLocation(line: 13, column: 5, scope: !27)
+!29 = !MDLocation(line: 14, column: 1, scope: !27)
+!30 = !{!1}
+!31 = !{!5, !13, !14, !17, !18, !19}
+!32 = !{!"one.c", !"/Volumes/Athwagate/R10048772"}
+!33 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-info-blocks.ll b/test/CodeGen/ARM/debug-info-blocks.ll
index 3623927..3bf6ad9 100644
--- a/test/CodeGen/ARM/debug-info-blocks.ll
+++ b/test/CodeGen/ARM/debug-info-blocks.ll
@@ -31,22 +31,22 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
   %1 = alloca %0*, align 4
   %bounds = alloca %struct.CR, align 4
   %data = alloca %struct.CR, align 4
-  call void @llvm.dbg.value(metadata !{i8* %.block_descriptor}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !129
+  call void @llvm.dbg.value(metadata i8* %.block_descriptor, i64 0, metadata !27, metadata !{!"0x102"}), !dbg !129
   store %0* %loadedMydata, %0** %1, align 4
-  call void @llvm.dbg.declare(metadata !{%0** %1}, metadata !130, metadata !{metadata !"0x102"}), !dbg !131
+  call void @llvm.dbg.declare(metadata %0** %1, metadata !130, metadata !{!"0x102"}), !dbg !131
   %2 = bitcast %struct.CR* %bounds to %1*
   %3 = getelementptr %1* %2, i32 0, i32 0
   store [4 x i32] %bounds.coerce0, [4 x i32]* %3
-  call void @llvm.dbg.declare(metadata !{%struct.CR* %bounds}, metadata !132, metadata !{metadata !"0x102"}), !dbg !133
+  call void @llvm.dbg.declare(metadata %struct.CR* %bounds, metadata !132, metadata !{!"0x102"}), !dbg !133
   %4 = bitcast %struct.CR* %data to %1*
   %5 = getelementptr %1* %4, i32 0, i32 0
   store [4 x i32] %data.coerce0, [4 x i32]* %5
-  call void @llvm.dbg.declare(metadata !{%struct.CR* %data}, metadata !134, metadata !{metadata !"0x102"}), !dbg !135
+  call void @llvm.dbg.declare(metadata %struct.CR* %data, metadata !134, metadata !{!"0x102"}), !dbg !135
   %6 = bitcast i8* %.block_descriptor to %2*
   %7 = getelementptr inbounds %2* %6, i32 0, i32 6
-  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !136, metadata !163), !dbg !137
-  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !138, metadata !164), !dbg !137
-  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !139, metadata !165), !dbg !140
+  call void @llvm.dbg.declare(metadata %2* %6, metadata !136, metadata !163), !dbg !137
+  call void @llvm.dbg.declare(metadata %2* %6, metadata !138, metadata !164), !dbg !137
+  call void @llvm.dbg.declare(metadata %2* %6, metadata !139, metadata !165), !dbg !140
   %8 = load %0** %1, align 4, !dbg !141
   %9 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_13", !dbg !141
   %10 = bitcast %0* %8 to i8*, !dbg !141
@@ -95,169 +95,169 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!162}
 
-!0 = metadata !{metadata !"0x11\0016\00Apple clang version 2.1\000\00\002\00\001", metadata !153, metadata !147, metadata !26, metadata !148, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x4\00\00248\0032\0032\000\000\000", metadata !160, metadata !0, null, metadata !3, null, null, null} ; [ DW_TAG_enumeration_type ] [line 248, size 32, align 32, offset 0] [def] [from ]
-!2 = metadata !{metadata !"0x29", metadata !160} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x28\00Ver1\000"} ; [ DW_TAG_enumerator ]
-!5 = metadata !{metadata !"0x4\00Mode\0079\0032\0032\000\000\000", metadata !160, metadata !0, null, metadata !7, null, null, null} ; [ DW_TAG_enumeration_type ] [Mode] [line 79, size 32, align 32, offset 0] [def] [from ]
-!6 = metadata !{metadata !"0x29", metadata !161} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x28\00One\000"} ; [ DW_TAG_enumerator ]
-!9 = metadata !{metadata !"0x4\00\0015\0032\0032\000\000\000", metadata !149, metadata !0, null, metadata !11, null, null, null} ; [ DW_TAG_enumeration_type ] [line 15, size 32, align 32, offset 0] [def] [from ]
-!10 = metadata !{metadata !"0x29", metadata !149} ; [ DW_TAG_file_type ]
-!11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{metadata !"0x28\00Unknown\000"} ; [ DW_TAG_enumerator ]
-!13 = metadata !{metadata !"0x28\00Known\001"} ; [ DW_TAG_enumerator ]
-!14 = metadata !{metadata !"0x4\00\0020\0032\0032\000\000\000", metadata !150, metadata !0, null, metadata !16, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
-!15 = metadata !{metadata !"0x29", metadata !150} ; [ DW_TAG_file_type ]
-!16 = metadata !{metadata !17, metadata !18}
-!17 = metadata !{metadata !"0x28\00Single\000"} ; [ DW_TAG_enumerator ]
-!18 = metadata !{metadata !"0x28\00Double\001"} ; [ DW_TAG_enumerator ]
-!19 = metadata !{metadata !"0x4\00\0014\0032\0032\000\000\000", metadata !151, metadata !0, null, metadata !21, null, null, null} ; [ DW_TAG_enumeration_type ] [line 14, size 32, align 32, offset 0] [def] [from ]
-!20 = metadata !{metadata !"0x29", metadata !151} ; [ DW_TAG_file_type ]
-!21 = metadata !{metadata !22}
-!22 = metadata !{metadata !"0x28\00Eleven\000"} ; [ DW_TAG_enumerator ]
-!23 = metadata !{metadata !"0x2e\00foobar_func_block_invoke_0\00foobar_func_block_invoke_0\00\00609\001\001\000\006\00256\000\00609", metadata !152, metadata !24, metadata !25, null, void (i8*, %0*, [4 x i32], [4 x i32])* @foobar_func_block_invoke_0, null, null, null} ; [ DW_TAG_subprogram ] [line 609] [local] [def] [foobar_func_block_invoke_0]
-!24 = metadata !{metadata !"0x29", metadata !152} ; [ DW_TAG_file_type ]
-!25 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !152, metadata !24, null, metadata !26, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!26 = metadata !{null}
-!27 = metadata !{metadata !"0x101\00.block_descriptor\0016777825\0064", metadata !23, metadata !24, metadata !28} ; [ DW_TAG_arg_variable ]
-!28 = metadata !{metadata !"0xf\00\000\0032\000\000\000", null, metadata !0, metadata !29} ; [ DW_TAG_pointer_type ]
-!29 = metadata !{metadata !"0x13\00__block_literal_14\00609\00256\0032\000\000\000", metadata !152, metadata !24, null, metadata !30, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_14] [line 609, size 256, align 32, offset 0] [def] [from ]
-!30 = metadata !{metadata !31, metadata !33, metadata !35, metadata !36, metadata !37, metadata !48, metadata !89, metadata !124}
-!31 = metadata !{metadata !"0xd\00__isa\00609\0032\0032\000\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
-!32 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, null} ; [ DW_TAG_pointer_type ]
-!33 = metadata !{metadata !"0xd\00__flags\00609\0032\0032\0032\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
-!34 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!35 = metadata !{metadata !"0xd\00__reserved\00609\0032\0032\0064\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
-!36 = metadata !{metadata !"0xd\00__FuncPtr\00609\0032\0032\0096\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
-!37 = metadata !{metadata !"0xd\00__descriptor\00609\0032\0032\00128\000", metadata !152, metadata !24, metadata !38} ; [ DW_TAG_member ]
-!38 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !39} ; [ DW_TAG_pointer_type ]
-!39 = metadata !{metadata !"0x13\00__block_descriptor_withcopydispose\00307\00128\0032\000\000\000", metadata !153, metadata !0, null, metadata !41, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 307, size 128, align 32, offset 0] [def] [from ]
-!40 = metadata !{metadata !"0x29", metadata !153} ; [ DW_TAG_file_type ]
-!41 = metadata !{metadata !42, metadata !44, metadata !45, metadata !47}
-!42 = metadata !{metadata !"0xd\00reserved\00307\0032\0032\000\000", metadata !153, metadata !40, metadata !43} ; [ DW_TAG_member ]
-!43 = metadata !{metadata !"0x24\00long unsigned int\000\0032\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
-!44 = metadata !{metadata !"0xd\00Size\00307\0032\0032\0032\000", metadata !153, metadata !40, metadata !43} ; [ DW_TAG_member ]
-!45 = metadata !{metadata !"0xd\00CopyFuncPtr\00307\0032\0032\0064\000", metadata !153, metadata !40, metadata !46} ; [ DW_TAG_member ]
-!46 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !32} ; [ DW_TAG_pointer_type ]
-!47 = metadata !{metadata !"0xd\00DestroyFuncPtr\00307\0032\0032\0096\000", metadata !153, metadata !40, metadata !46} ; [ DW_TAG_member ]
-!48 = metadata !{metadata !"0xd\00mydata\00609\0032\0032\00160\000", metadata !152, metadata !24, metadata !49} ; [ DW_TAG_member ]
-!49 = metadata !{metadata !"0xf\00\000\0032\000\000\000", null, metadata !0, metadata !50} ; [ DW_TAG_pointer_type ]
-!50 = metadata !{metadata !"0x13\00\000\00224\000\000\0016\000", metadata !152, metadata !24, null, metadata !51, null, null, null} ; [ DW_TAG_structure_type ] [line 0, size 224, align 0, offset 0] [def] [from ]
-!51 = metadata !{metadata !52, metadata !53, metadata !54, metadata !55, metadata !56, metadata !57, metadata !58}
-!52 = metadata !{metadata !"0xd\00__isa\000\0032\0032\000\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
-!53 = metadata !{metadata !"0xd\00__forwarding\000\0032\0032\0032\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
-!54 = metadata !{metadata !"0xd\00__flags\000\0032\0032\0064\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
-!55 = metadata !{metadata !"0xd\00__size\000\0032\0032\0096\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
-!56 = metadata !{metadata !"0xd\00__copy_helper\000\0032\0032\00128\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
-!57 = metadata !{metadata !"0xd\00__destroy_helper\000\0032\0032\00160\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
-!58 = metadata !{metadata !"0xd\00mydata\000\0032\0032\00192\000", metadata !152, metadata !24, metadata !59} ; [ DW_TAG_member ]
-!59 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !60} ; [ DW_TAG_pointer_type ]
-!60 = metadata !{metadata !"0x13\00UIMydata\0026\00128\0032\000\000\0016", metadata !154, metadata !24, null, metadata !62, null, null, null} ; [ DW_TAG_structure_type ] [UIMydata] [line 26, size 128, align 32, offset 0] [def] [from ]
-!61 = metadata !{metadata !"0x29", metadata !154} ; [ DW_TAG_file_type ]
-!62 = metadata !{metadata !63, metadata !71, metadata !75, metadata !79}
-!63 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !60, null, metadata !64} ; [ DW_TAG_inheritance ]
-!64 = metadata !{metadata !"0x13\00NSO\0066\0032\0032\000\000\0016", metadata !155, metadata !40, null, metadata !66, null, null, null} ; [ DW_TAG_structure_type ] [NSO] [line 66, size 32, align 32, offset 0] [def] [from ]
-!65 = metadata !{metadata !"0x29", metadata !155} ; [ DW_TAG_file_type ]
-!66 = metadata !{metadata !67}
-!67 = metadata !{metadata !"0xd\00isa\0067\0032\0032\000\002", metadata !155, metadata !65, metadata !68, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!68 = metadata !{metadata !"0x16\00Class\00197\000\000\000\000", metadata !153, metadata !0, metadata !69} ; [ DW_TAG_typedef ]
-!69 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !70} ; [ DW_TAG_pointer_type ]
-!70 = metadata !{metadata !"0x13\00objc_class\000\000\000\000\004\000", metadata !153, metadata !0, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!71 = metadata !{metadata !"0xd\00_mydataRef\0028\0032\0032\0032\000", metadata !154, metadata !61, metadata !72, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!72 = metadata !{metadata !"0x16\00CFTypeRef\00313\000\000\000\000", metadata !152, metadata !0, metadata !73} ; [ DW_TAG_typedef ]
-!73 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !74} ; [ DW_TAG_pointer_type ]
-!74 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, metadata !0, null} ; [ DW_TAG_const_type ]
-!75 = metadata !{metadata !"0xd\00_scale\0029\0032\0032\0064\000", metadata !154, metadata !61, metadata !76, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!76 = metadata !{metadata !"0x16\00Float\0089\000\000\000\000", metadata !156, metadata !0, metadata !78} ; [ DW_TAG_typedef ]
-!77 = metadata !{metadata !"0x29", metadata !156} ; [ DW_TAG_file_type ]
-!78 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !0} ; [ DW_TAG_base_type ]
-!79 = metadata !{metadata !"0xd\00_mydataFlags\0037\008\008\0096\000", metadata !154, metadata !61, metadata !80, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!80 = metadata !{metadata !"0x13\00\0030\008\008\000\000\000", metadata !154, metadata !0, null, metadata !81, null, null, null} ; [ DW_TAG_structure_type ] [line 30, size 8, align 8, offset 0] [def] [from ]
-!81 = metadata !{metadata !82, metadata !84, metadata !85, metadata !86, metadata !87, metadata !88}
-!82 = metadata !{metadata !"0xd\00named\0031\001\0032\000\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
-!83 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
-!84 = metadata !{metadata !"0xd\00mydataO\0032\003\0032\001\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
-!85 = metadata !{metadata !"0xd\00cached\0033\001\0032\004\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
-!86 = metadata !{metadata !"0xd\00hasBeenCached\0034\001\0032\005\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
-!87 = metadata !{metadata !"0xd\00hasPattern\0035\001\0032\006\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
-!88 = metadata !{metadata !"0xd\00isCIMydata\0036\001\0032\007\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
-!89 = metadata !{metadata !"0xd\00self\00609\0032\0032\00192\000", metadata !152, metadata !24, metadata !90} ; [ DW_TAG_member ]
-!90 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !91} ; [ DW_TAG_pointer_type ]
-!91 = metadata !{metadata !"0x13\00MyWork\0036\00384\0032\000\000\0016", metadata !152, metadata !40, null, metadata !92, null, null, null} ; [ DW_TAG_structure_type ] [MyWork] [line 36, size 384, align 32, offset 0] [def] [from ]
-!92 = metadata !{metadata !93, metadata !98, metadata !101, metadata !107, metadata !123}
-!93 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !152, metadata !91, metadata !94} ; [ DW_TAG_inheritance ]
-!94 = metadata !{metadata !"0x13\00twork\0043\0032\0032\000\000\0016", metadata !157, metadata !40, null, metadata !96, null, null, null} ; [ DW_TAG_structure_type ] [twork] [line 43, size 32, align 32, offset 0] [def] [from ]
-!95 = metadata !{metadata !"0x29", metadata !157} ; [ DW_TAG_file_type ]
-!96 = metadata !{metadata !97}
-!97 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !94, null, metadata !64} ; [ DW_TAG_inheritance ]
-!98 = metadata !{metadata !"0xd\00_itemID\0038\0064\0032\0032\001", metadata !152, metadata !24, metadata !99, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!99 = metadata !{metadata !"0x16\00uint64_t\0055\000\000\000\000", metadata !153, metadata !0, metadata !100} ; [ DW_TAG_typedef ]
-!100 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
-!101 = metadata !{metadata !"0xd\00_library\0039\0032\0032\0096\001", metadata !152, metadata !24, metadata !102, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!102 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !103} ; [ DW_TAG_pointer_type ]
-!103 = metadata !{metadata !"0x13\00MyLibrary2\0022\0032\0032\000\000\0016", metadata !158, metadata !40, null, metadata !105, null, null, null} ; [ DW_TAG_structure_type ] [MyLibrary2] [line 22, size 32, align 32, offset 0] [def] [from ]
-!104 = metadata !{metadata !"0x29", metadata !158} ; [ DW_TAG_file_type ]
-!105 = metadata !{metadata !106}
-!106 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !103, null, metadata !64} ; [ DW_TAG_inheritance ]
-!107 = metadata !{metadata !"0xd\00_bounds\0040\00128\0032\00128\001", metadata !152, metadata !24, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!108 = metadata !{metadata !"0x16\00CR\0033\000\000\000\000", metadata !153, metadata !0, metadata !109} ; [ DW_TAG_typedef ]
-!109 = metadata !{metadata !"0x13\00CR\0029\00128\0032\000\000\000", metadata !156, metadata !0, null, metadata !110, null, null, null} ; [ DW_TAG_structure_type ] [CR] [line 29, size 128, align 32, offset 0] [def] [from ]
-!110 = metadata !{metadata !111, metadata !117}
-!111 = metadata !{metadata !"0xd\00origin\0030\0064\0032\000\000", metadata !156, metadata !77, metadata !112} ; [ DW_TAG_member ]
-!112 = metadata !{metadata !"0x16\00CP\0017\000\000\000\000", metadata !156, metadata !0, metadata !113} ; [ DW_TAG_typedef ]
-!113 = metadata !{metadata !"0x13\00CP\0013\0064\0032\000\000\000", metadata !156, metadata !0, null, metadata !114, null, null, null} ; [ DW_TAG_structure_type ] [CP] [line 13, size 64, align 32, offset 0] [def] [from ]
-!114 = metadata !{metadata !115, metadata !116}
-!115 = metadata !{metadata !"0xd\00x\0014\0032\0032\000\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
-!116 = metadata !{metadata !"0xd\00y\0015\0032\0032\0032\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
-!117 = metadata !{metadata !"0xd\00size\0031\0064\0032\0064\000", metadata !156, metadata !77, metadata !118} ; [ DW_TAG_member ]
-!118 = metadata !{metadata !"0x16\00Size\0025\000\000\000\000", metadata !156, metadata !0, metadata !119} ; [ DW_TAG_typedef ]
-!119 = metadata !{metadata !"0x13\00Size\0021\0064\0032\000\000\000", metadata !156, metadata !0, null, metadata !120, null, null, null} ; [ DW_TAG_structure_type ] [Size] [line 21, size 64, align 32, offset 0] [def] [from ]
-!120 = metadata !{metadata !121, metadata !122}
-!121 = metadata !{metadata !"0xd\00width\0022\0032\0032\000\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
-!122 = metadata !{metadata !"0xd\00height\0023\0032\0032\0032\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
-!123 = metadata !{metadata !"0xd\00_data\0040\00128\0032\00256\001", metadata !152, metadata !24, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!124 = metadata !{metadata !"0xd\00semi\00609\0032\0032\00224\000", metadata !152, metadata !24, metadata !125} ; [ DW_TAG_member ]
-!125 = metadata !{metadata !"0x16\00d_t\0035\000\000\000\000", metadata !152, metadata !0, metadata !126} ; [ DW_TAG_typedef ]
-!126 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !127} ; [ DW_TAG_pointer_type ]
-!127 = metadata !{metadata !"0x13\00my_struct\0049\000\000\000\004\000", metadata !159, metadata !0, null, null, null, null, null} ; [ DW_TAG_structure_type ] [my_struct] [line 49, size 0, align 0, offset 0] [decl] [from ]
-!128 = metadata !{metadata !"0x29", metadata !159} ; [ DW_TAG_file_type ]
-!129 = metadata !{i32 609, i32 144, metadata !23, null}
-!130 = metadata !{metadata !"0x101\00loadedMydata\0033555041\000", metadata !23, metadata !24, metadata !59} ; [ DW_TAG_arg_variable ]
-!131 = metadata !{i32 609, i32 155, metadata !23, null}
-!132 = metadata !{metadata !"0x101\00bounds\0050332257\000", metadata !23, metadata !24, metadata !108} ; [ DW_TAG_arg_variable ]
-!133 = metadata !{i32 609, i32 175, metadata !23, null}
-!134 = metadata !{metadata !"0x101\00data\0067109473\000", metadata !23, metadata !24, metadata !108} ; [ DW_TAG_arg_variable ]
-!135 = metadata !{i32 609, i32 190, metadata !23, null}
-!136 = metadata !{metadata !"0x100\00mydata\00604\000", metadata !23, metadata !24, metadata !50} ; [ DW_TAG_auto_variable ]
-!137 = metadata !{i32 604, i32 49, metadata !23, null}
-!138 = metadata !{metadata !"0x100\00self\00604\000", metadata !23, metadata !40, metadata !90} ; [ DW_TAG_auto_variable ]
-!139 = metadata !{metadata !"0x100\00semi\00607\000", metadata !23, metadata !24, metadata !125} ; [ DW_TAG_auto_variable ]
-!140 = metadata !{i32 607, i32 30, metadata !23, null}
-!141 = metadata !{i32 610, i32 17, metadata !142, null}
-!142 = metadata !{metadata !"0xb\00609\00200\0094", metadata !152, metadata !23} ; [ DW_TAG_lexical_block ]
-!143 = metadata !{i32 611, i32 17, metadata !142, null}
-!144 = metadata !{i32 612, i32 17, metadata !142, null}
-!145 = metadata !{i32 613, i32 17, metadata !142, null}
-!146 = metadata !{i32 615, i32 13, metadata !142, null}
-!147 = metadata !{metadata !1, metadata !1, metadata !5, metadata !5, metadata !9, metadata !14, metadata !19, metadata !19, metadata !14, metadata !14, metadata !14, metadata !19, metadata !19, metadata !19}
-!148 = metadata !{metadata !23}
-!149 = metadata !{metadata !"header3.h", metadata !"/Volumes/Sandbox/llvm"}
-!150 = metadata !{metadata !"Private.h", metadata !"/Volumes/Sandbox/llvm"}
-!151 = metadata !{metadata !"header4.h", metadata !"/Volumes/Sandbox/llvm"}
-!152 = metadata !{metadata !"MyLibrary.m", metadata !"/Volumes/Sandbox/llvm"}
-!153 = metadata !{metadata !"MyLibrary.i", metadata !"/Volumes/Sandbox/llvm"}
-!154 = metadata !{metadata !"header11.h", metadata !"/Volumes/Sandbox/llvm"}
-!155 = metadata !{metadata !"NSO.h", metadata !"/Volumes/Sandbox/llvm"}
-!156 = metadata !{metadata !"header12.h", metadata !"/Volumes/Sandbox/llvm"}
-!157 = metadata !{metadata !"header13.h", metadata !"/Volumes/Sandbox/llvm"}
-!158 = metadata !{metadata !"header14.h", metadata !"/Volumes/Sandbox/llvm"}
-!159 = metadata !{metadata !"header15.h", metadata !"/Volumes/Sandbox/llvm"}
-!160 = metadata !{metadata !"header.h", metadata !"/Volumes/Sandbox/llvm"}
-!161 = metadata !{metadata !"header2.h", metadata !"/Volumes/Sandbox/llvm"}
-!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!163 = metadata !{metadata !"0x102\0034\0020\006\0034\004\006\0034\0024"} ; [ DW_TAG_expression ] [DW_OP_plus 20 DW_OP_deref DW_OP_plus 4 DW_OP_deref DW_OP_plus 24]
-!164 = metadata !{metadata !"0x102\0034\0024"} ; [ DW_TAG_expression ] [DW_OP_plus 24]
-!165 = metadata !{metadata !"0x102\0034\0028"} ; [ DW_TAG_expression ] [DW_OP_plus 28]
+!0 = !{!"0x11\0016\00Apple clang version 2.1\000\00\002\00\001", !153, !147, !26, !148, null, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x4\00\00248\0032\0032\000\000\000", !160, !0, null, !3, null, null, null} ; [ DW_TAG_enumeration_type ] [line 248, size 32, align 32, offset 0] [def] [from ]
+!2 = !{!"0x29", !160} ; [ DW_TAG_file_type ]
+!3 = !{!4}
+!4 = !{!"0x28\00Ver1\000"} ; [ DW_TAG_enumerator ]
+!5 = !{!"0x4\00Mode\0079\0032\0032\000\000\000", !160, !0, null, !7, null, null, null} ; [ DW_TAG_enumeration_type ] [Mode] [line 79, size 32, align 32, offset 0] [def] [from ]
+!6 = !{!"0x29", !161} ; [ DW_TAG_file_type ]
+!7 = !{!8}
+!8 = !{!"0x28\00One\000"} ; [ DW_TAG_enumerator ]
+!9 = !{!"0x4\00\0015\0032\0032\000\000\000", !149, !0, null, !11, null, null, null} ; [ DW_TAG_enumeration_type ] [line 15, size 32, align 32, offset 0] [def] [from ]
+!10 = !{!"0x29", !149} ; [ DW_TAG_file_type ]
+!11 = !{!12, !13}
+!12 = !{!"0x28\00Unknown\000"} ; [ DW_TAG_enumerator ]
+!13 = !{!"0x28\00Known\001"} ; [ DW_TAG_enumerator ]
+!14 = !{!"0x4\00\0020\0032\0032\000\000\000", !150, !0, null, !16, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
+!15 = !{!"0x29", !150} ; [ DW_TAG_file_type ]
+!16 = !{!17, !18}
+!17 = !{!"0x28\00Single\000"} ; [ DW_TAG_enumerator ]
+!18 = !{!"0x28\00Double\001"} ; [ DW_TAG_enumerator ]
+!19 = !{!"0x4\00\0014\0032\0032\000\000\000", !151, !0, null, !21, null, null, null} ; [ DW_TAG_enumeration_type ] [line 14, size 32, align 32, offset 0] [def] [from ]
+!20 = !{!"0x29", !151} ; [ DW_TAG_file_type ]
+!21 = !{!22}
+!22 = !{!"0x28\00Eleven\000"} ; [ DW_TAG_enumerator ]
+!23 = !{!"0x2e\00foobar_func_block_invoke_0\00foobar_func_block_invoke_0\00\00609\001\001\000\006\00256\000\00609", !152, !24, !25, null, void (i8*, %0*, [4 x i32], [4 x i32])* @foobar_func_block_invoke_0, null, null, null} ; [ DW_TAG_subprogram ] [line 609] [local] [def] [foobar_func_block_invoke_0]
+!24 = !{!"0x29", !152} ; [ DW_TAG_file_type ]
+!25 = !{!"0x15\00\000\000\000\000\000\000", !152, !24, null, !26, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!26 = !{null}
+!27 = !{!"0x101\00.block_descriptor\0016777825\0064", !23, !24, !28} ; [ DW_TAG_arg_variable ]
+!28 = !{!"0xf\00\000\0032\000\000\000", null, !0, !29} ; [ DW_TAG_pointer_type ]
+!29 = !{!"0x13\00__block_literal_14\00609\00256\0032\000\000\000", !152, !24, null, !30, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_14] [line 609, size 256, align 32, offset 0] [def] [from ]
+!30 = !{!31, !33, !35, !36, !37, !48, !89, !124}
+!31 = !{!"0xd\00__isa\00609\0032\0032\000\000", !152, !24, !32} ; [ DW_TAG_member ]
+!32 = !{!"0xf\00\000\0032\0032\000\000", null, !0, null} ; [ DW_TAG_pointer_type ]
+!33 = !{!"0xd\00__flags\00609\0032\0032\0032\000", !152, !24, !34} ; [ DW_TAG_member ]
+!34 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!35 = !{!"0xd\00__reserved\00609\0032\0032\0064\000", !152, !24, !34} ; [ DW_TAG_member ]
+!36 = !{!"0xd\00__FuncPtr\00609\0032\0032\0096\000", !152, !24, !32} ; [ DW_TAG_member ]
+!37 = !{!"0xd\00__descriptor\00609\0032\0032\00128\000", !152, !24, !38} ; [ DW_TAG_member ]
+!38 = !{!"0xf\00\000\0032\0032\000\000", null, !0, !39} ; [ DW_TAG_pointer_type ]
+!39 = !{!"0x13\00__block_descriptor_withcopydispose\00307\00128\0032\000\000\000", !153, !0, null, !41, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 307, size 128, align 32, offset 0] [def] [from ]
+!40 = !{!"0x29", !153} ; [ DW_TAG_file_type ]
+!41 = !{!42, !44, !45, !47}
+!42 = !{!"0xd\00reserved\00307\0032\0032\000\000", !153, !40, !43} ; [ DW_TAG_member ]
+!43 = !{!"0x24\00long unsigned int\000\0032\0032\000\000\007", null, !0} ; [ DW_TAG_base_type ]
+!44 = !{!"0xd\00Size\00307\0032\0032\0032\000", !153, !40, !43} ; [ DW_TAG_member ]
+!45 = !{!"0xd\00CopyFuncPtr\00307\0032\0032\0064\000", !153, !40, !46} ; [ DW_TAG_member ]
+!46 = !{!"0xf\00\000\0032\0032\000\000", null, !0, !32} ; [ DW_TAG_pointer_type ]
+!47 = !{!"0xd\00DestroyFuncPtr\00307\0032\0032\0096\000", !153, !40, !46} ; [ DW_TAG_member ]
+!48 = !{!"0xd\00mydata\00609\0032\0032\00160\000", !152, !24, !49} ; [ DW_TAG_member ]
+!49 = !{!"0xf\00\000\0032\000\000\000", null, !0, !50} ; [ DW_TAG_pointer_type ]
+!50 = !{!"0x13\00\000\00224\000\000\0016\000", !152, !24, null, !51, null, null, null} ; [ DW_TAG_structure_type ] [line 0, size 224, align 0, offset 0] [def] [from ]
+!51 = !{!52, !53, !54, !55, !56, !57, !58}
+!52 = !{!"0xd\00__isa\000\0032\0032\000\000", !152, !24, !32} ; [ DW_TAG_member ]
+!53 = !{!"0xd\00__forwarding\000\0032\0032\0032\000", !152, !24, !32} ; [ DW_TAG_member ]
+!54 = !{!"0xd\00__flags\000\0032\0032\0064\000", !152, !24, !34} ; [ DW_TAG_member ]
+!55 = !{!"0xd\00__size\000\0032\0032\0096\000", !152, !24, !34} ; [ DW_TAG_member ]
+!56 = !{!"0xd\00__copy_helper\000\0032\0032\00128\000", !152, !24, !32} ; [ DW_TAG_member ]
+!57 = !{!"0xd\00__destroy_helper\000\0032\0032\00160\000", !152, !24, !32} ; [ DW_TAG_member ]
+!58 = !{!"0xd\00mydata\000\0032\0032\00192\000", !152, !24, !59} ; [ DW_TAG_member ]
+!59 = !{!"0xf\00\000\0032\0032\000\000", null, !0, !60} ; [ DW_TAG_pointer_type ]
+!60 = !{!"0x13\00UIMydata\0026\00128\0032\000\000\0016", !154, !24, null, !62, null, null, null} ; [ DW_TAG_structure_type ] [UIMydata] [line 26, size 128, align 32, offset 0] [def] [from ]
+!61 = !{!"0x29", !154} ; [ DW_TAG_file_type ]
+!62 = !{!63, !71, !75, !79}
+!63 = !{!"0x1c\00\000\000\000\000\000", !60, null, !64} ; [ DW_TAG_inheritance ]
+!64 = !{!"0x13\00NSO\0066\0032\0032\000\000\0016", !155, !40, null, !66, null, null, null} ; [ DW_TAG_structure_type ] [NSO] [line 66, size 32, align 32, offset 0] [def] [from ]
+!65 = !{!"0x29", !155} ; [ DW_TAG_file_type ]
+!66 = !{!67}
+!67 = !{!"0xd\00isa\0067\0032\0032\000\002", !155, !65, !68, !"", !"", !"", i32 0} ; [ DW_TAG_member ]
+!68 = !{!"0x16\00Class\00197\000\000\000\000", !153, !0, !69} ; [ DW_TAG_typedef ]
+!69 = !{!"0xf\00\000\0032\0032\000\000", null, !0, !70} ; [ DW_TAG_pointer_type ]
+!70 = !{!"0x13\00objc_class\000\000\000\000\004\000", !153, !0, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!71 = !{!"0xd\00_mydataRef\0028\0032\0032\0032\000", !154, !61, !72, !"", !"", !"", i32 0} ; [ DW_TAG_member ]
+!72 = !{!"0x16\00CFTypeRef\00313\000\000\000\000", !152, !0, !73} ; [ DW_TAG_typedef ]
+!73 = !{!"0xf\00\000\0032\0032\000\000", null, !0, !74} ; [ DW_TAG_pointer_type ]
+!74 = !{!"0x26\00\000\000\000\000\000", null, !0, null} ; [ DW_TAG_const_type ]
+!75 = !{!"0xd\00_scale\0029\0032\0032\0064\000", !154, !61, !76, !"", !"", !"", i32 0} ; [ DW_TAG_member ]
+!76 = !{!"0x16\00Float\0089\000\000\000\000", !156, !0, !78} ; [ DW_TAG_typedef ]
+!77 = !{!"0x29", !156} ; [ DW_TAG_file_type ]
+!78 = !{!"0x24\00float\000\0032\0032\000\000\004", null, !0} ; [ DW_TAG_base_type ]
+!79 = !{!"0xd\00_mydataFlags\0037\008\008\0096\000", !154, !61, !80, !"", !"", !"", i32 0} ; [ DW_TAG_member ]
+!80 = !{!"0x13\00\0030\008\008\000\000\000", !154, !0, null, !81, null, null, null} ; [ DW_TAG_structure_type ] [line 30, size 8, align 8, offset 0] [def] [from ]
+!81 = !{!82, !84, !85, !86, !87, !88}
+!82 = !{!"0xd\00named\0031\001\0032\000\000", !154, !61, !83} ; [ DW_TAG_member ]
+!83 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, !0} ; [ DW_TAG_base_type ]
+!84 = !{!"0xd\00mydataO\0032\003\0032\001\000", !154, !61, !83} ; [ DW_TAG_member ]
+!85 = !{!"0xd\00cached\0033\001\0032\004\000", !154, !61, !83} ; [ DW_TAG_member ]
+!86 = !{!"0xd\00hasBeenCached\0034\001\0032\005\000", !154, !61, !83} ; [ DW_TAG_member ]
+!87 = !{!"0xd\00hasPattern\0035\001\0032\006\000", !154, !61, !83} ; [ DW_TAG_member ]
+!88 = !{!"0xd\00isCIMydata\0036\001\0032\007\000", !154, !61, !83} ; [ DW_TAG_member ]
+!89 = !{!"0xd\00self\00609\0032\0032\00192\000", !152, !24, !90} ; [ DW_TAG_member ]
+!90 = !{!"0xf\00\000\0032\0032\000\000", null, !0, !91} ; [ DW_TAG_pointer_type ]
+!91 = !{!"0x13\00MyWork\0036\00384\0032\000\000\0016", !152, !40, null, !92, null, null, null} ; [ DW_TAG_structure_type ] [MyWork] [line 36, size 384, align 32, offset 0] [def] [from ]
+!92 = !{!93, !98, !101, !107, !123}
+!93 = !{!"0x1c\00\000\000\000\000\000", !152, !91, !94} ; [ DW_TAG_inheritance ]
+!94 = !{!"0x13\00twork\0043\0032\0032\000\000\0016", !157, !40, null, !96, null, null, null} ; [ DW_TAG_structure_type ] [twork] [line 43, size 32, align 32, offset 0] [def] [from ]
+!95 = !{!"0x29", !157} ; [ DW_TAG_file_type ]
+!96 = !{!97}
+!97 = !{!"0x1c\00\000\000\000\000\000", !94, null, !64} ; [ DW_TAG_inheritance ]
+!98 = !{!"0xd\00_itemID\0038\0064\0032\0032\001", !152, !24, !99, !"", !"", !"", i32 0} ; [ DW_TAG_member ]
+!99 = !{!"0x16\00uint64_t\0055\000\000\000\000", !153, !0, !100} ; [ DW_TAG_typedef ]
+!100 = !{!"0x24\00long long unsigned int\000\0064\0032\000\000\007", null, !0} ; [ DW_TAG_base_type ]
+!101 = !{!"0xd\00_library\0039\0032\0032\0096\001", !152, !24, !102, !"", !"", !"", i32 0} ; [ DW_TAG_member ]
+!102 = !{!"0xf\00\000\0032\0032\000\000", null, !0, !103} ; [ DW_TAG_pointer_type ]
+!103 = !{!"0x13\00MyLibrary2\0022\0032\0032\000\000\0016", !158, !40, null, !105, null, null, null} ; [ DW_TAG_structure_type ] [MyLibrary2] [line 22, size 32, align 32, offset 0] [def] [from ]
+!104 = !{!"0x29", !158} ; [ DW_TAG_file_type ]
+!105 = !{!106}
+!106 = !{!"0x1c\00\000\000\000\000\000", !103, null, !64} ; [ DW_TAG_inheritance ]
+!107 = !{!"0xd\00_bounds\0040\00128\0032\00128\001", !152, !24, !108, !"", !"", !"", i32 0} ; [ DW_TAG_member ]
+!108 = !{!"0x16\00CR\0033\000\000\000\000", !153, !0, !109} ; [ DW_TAG_typedef ]
+!109 = !{!"0x13\00CR\0029\00128\0032\000\000\000", !156, !0, null, !110, null, null, null} ; [ DW_TAG_structure_type ] [CR] [line 29, size 128, align 32, offset 0] [def] [from ]
+!110 = !{!111, !117}
+!111 = !{!"0xd\00origin\0030\0064\0032\000\000", !156, !77, !112} ; [ DW_TAG_member ]
+!112 = !{!"0x16\00CP\0017\000\000\000\000", !156, !0, !113} ; [ DW_TAG_typedef ]
+!113 = !{!"0x13\00CP\0013\0064\0032\000\000\000", !156, !0, null, !114, null, null, null} ; [ DW_TAG_structure_type ] [CP] [line 13, size 64, align 32, offset 0] [def] [from ]
+!114 = !{!115, !116}
+!115 = !{!"0xd\00x\0014\0032\0032\000\000", !156, !77, !76} ; [ DW_TAG_member ]
+!116 = !{!"0xd\00y\0015\0032\0032\0032\000", !156, !77, !76} ; [ DW_TAG_member ]
+!117 = !{!"0xd\00size\0031\0064\0032\0064\000", !156, !77, !118} ; [ DW_TAG_member ]
+!118 = !{!"0x16\00Size\0025\000\000\000\000", !156, !0, !119} ; [ DW_TAG_typedef ]
+!119 = !{!"0x13\00Size\0021\0064\0032\000\000\000", !156, !0, null, !120, null, null, null} ; [ DW_TAG_structure_type ] [Size] [line 21, size 64, align 32, offset 0] [def] [from ]
+!120 = !{!121, !122}
+!121 = !{!"0xd\00width\0022\0032\0032\000\000", !156, !77, !76} ; [ DW_TAG_member ]
+!122 = !{!"0xd\00height\0023\0032\0032\0032\000", !156, !77, !76} ; [ DW_TAG_member ]
+!123 = !{!"0xd\00_data\0040\00128\0032\00256\001", !152, !24, !108, !"", !"", !"", i32 0} ; [ DW_TAG_member ]
+!124 = !{!"0xd\00semi\00609\0032\0032\00224\000", !152, !24, !125} ; [ DW_TAG_member ]
+!125 = !{!"0x16\00d_t\0035\000\000\000\000", !152, !0, !126} ; [ DW_TAG_typedef ]
+!126 = !{!"0xf\00\000\0032\0032\000\000", null, !0, !127} ; [ DW_TAG_pointer_type ]
+!127 = !{!"0x13\00my_struct\0049\000\000\000\004\000", !159, !0, null, null, null, null, null} ; [ DW_TAG_structure_type ] [my_struct] [line 49, size 0, align 0, offset 0] [decl] [from ]
+!128 = !{!"0x29", !159} ; [ DW_TAG_file_type ]
+!129 = !MDLocation(line: 609, column: 144, scope: !23)
+!130 = !{!"0x101\00loadedMydata\0033555041\000", !23, !24, !59} ; [ DW_TAG_arg_variable ]
+!131 = !MDLocation(line: 609, column: 155, scope: !23)
+!132 = !{!"0x101\00bounds\0050332257\000", !23, !24, !108} ; [ DW_TAG_arg_variable ]
+!133 = !MDLocation(line: 609, column: 175, scope: !23)
+!134 = !{!"0x101\00data\0067109473\000", !23, !24, !108} ; [ DW_TAG_arg_variable ]
+!135 = !MDLocation(line: 609, column: 190, scope: !23)
+!136 = !{!"0x100\00mydata\00604\000", !23, !24, !50} ; [ DW_TAG_auto_variable ]
+!137 = !MDLocation(line: 604, column: 49, scope: !23)
+!138 = !{!"0x100\00self\00604\000", !23, !40, !90} ; [ DW_TAG_auto_variable ]
+!139 = !{!"0x100\00semi\00607\000", !23, !24, !125} ; [ DW_TAG_auto_variable ]
+!140 = !MDLocation(line: 607, column: 30, scope: !23)
+!141 = !MDLocation(line: 610, column: 17, scope: !142)
+!142 = !{!"0xb\00609\00200\0094", !152, !23} ; [ DW_TAG_lexical_block ]
+!143 = !MDLocation(line: 611, column: 17, scope: !142)
+!144 = !MDLocation(line: 612, column: 17, scope: !142)
+!145 = !MDLocation(line: 613, column: 17, scope: !142)
+!146 = !MDLocation(line: 615, column: 13, scope: !142)
+!147 = !{!1, !1, !5, !5, !9, !14, !19, !19, !14, !14, !14, !19, !19, !19}
+!148 = !{!23}
+!149 = !{!"header3.h", !"/Volumes/Sandbox/llvm"}
+!150 = !{!"Private.h", !"/Volumes/Sandbox/llvm"}
+!151 = !{!"header4.h", !"/Volumes/Sandbox/llvm"}
+!152 = !{!"MyLibrary.m", !"/Volumes/Sandbox/llvm"}
+!153 = !{!"MyLibrary.i", !"/Volumes/Sandbox/llvm"}
+!154 = !{!"header11.h", !"/Volumes/Sandbox/llvm"}
+!155 = !{!"NSO.h", !"/Volumes/Sandbox/llvm"}
+!156 = !{!"header12.h", !"/Volumes/Sandbox/llvm"}
+!157 = !{!"header13.h", !"/Volumes/Sandbox/llvm"}
+!158 = !{!"header14.h", !"/Volumes/Sandbox/llvm"}
+!159 = !{!"header15.h", !"/Volumes/Sandbox/llvm"}
+!160 = !{!"header.h", !"/Volumes/Sandbox/llvm"}
+!161 = !{!"header2.h", !"/Volumes/Sandbox/llvm"}
+!162 = !{i32 1, !"Debug Info Version", i32 2}
+!163 = !{!"0x102\0034\0020\006\0034\004\006\0034\0024"} ; [ DW_TAG_expression ] [DW_OP_plus 20 DW_OP_deref DW_OP_plus 4 DW_OP_deref DW_OP_plus 24]
+!164 = !{!"0x102\0034\0024"} ; [ DW_TAG_expression ] [DW_OP_plus 24]
+!165 = !{!"0x102\0034\0028"} ; [ DW_TAG_expression ] [DW_OP_plus 28]
diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
index db96b49..9475695 100644
--- a/test/CodeGen/ARM/debug-info-branch-folding.ll
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -20,9 +20,9 @@ entry:
 
 for.body9:                                        ; preds = %for.body9, %entry
   %add19 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
-  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !39
+  tail call void @llvm.dbg.value(metadata <4 x float> %add19, i64 0, metadata !27, metadata !{!"0x102"}), !dbg !39
   %add20 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
-  tail call void @llvm.dbg.value(metadata !{<4 x float> %add20}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !39
+  tail call void @llvm.dbg.value(metadata <4 x float> %add20, i64 0, metadata !28, metadata !{!"0x102"}), !dbg !39
   br i1 %cond, label %for.end54, label %for.body9, !dbg !44
 
 for.end54:                                        ; preds = %for.body9
@@ -42,60 +42,60 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.module.flags = !{!56}
 !llvm.dbg.cu = !{!2}
 
-!0 = metadata !{metadata !"0x2e\00test0001\00test0001\00\003\000\001\000\006\00256\001\000", metadata !54, null, metadata !3, i32 0, <4 x float> (float)* @test0001, null, null, metadata !51} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !54} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", metadata !54, metadata !17, metadata !17, metadata !50, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, i32 0, metadata !4, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x16\00v4f32\0014\000\000\000\000", metadata !54, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
-!6 = metadata !{metadata !"0x1\00\000\00128\00128\000\000", metadata !54, metadata !2, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
-!7 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x21\000\004"}         ; [ DW_TAG_subrange_type ]
-!10 = metadata !{metadata !"0x2e\00main\00main\00\0059\000\001\000\006\00256\001\000", metadata !54, null, metadata !11, null, i32 (i32, i8**, i1)* @main, null, null, metadata !52} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 0] [main]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!14 = metadata !{metadata !"0x2e\00printFV\00printFV\00\0041\001\001\000\006\00256\001\000", metadata !55, null, metadata !16, null, null, null, null, metadata !53} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [scope 0] [printFV]
-!15 = metadata !{metadata !"0x29", metadata !55} ; [ DW_TAG_file_type ]
-!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !55, metadata !15, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{null}
-!18 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{metadata !"0x101\00argc\0016777275\000", metadata !10, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{metadata !"0x101\00argv\0033554491\000", metadata !10, metadata !1, metadata !21} ; [ DW_TAG_arg_variable ]
-!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !22} ; [ DW_TAG_pointer_type ]
-!22 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
-!24 = metadata !{metadata !"0x100\00i\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
-!25 = metadata !{metadata !"0xb\0059\0033\0014", metadata !1, metadata !10} ; [ DW_TAG_lexical_block ]
-!26 = metadata !{metadata !"0x100\00j\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
-!27 = metadata !{metadata !"0x100\00x\0061\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!28 = metadata !{metadata !"0x100\00y\0062\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{metadata !"0x100\00z\0063\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!30 = metadata !{metadata !"0x101\00F\0016777257\000", metadata !14, metadata !15, metadata !31} ; [ DW_TAG_arg_variable ]
-!31 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !32} ; [ DW_TAG_pointer_type ]
-!32 = metadata !{metadata !"0x16\00FV\0025\000\000\000\000", metadata !55, metadata !2, metadata !33} ; [ DW_TAG_typedef ]
-!33 = metadata !{metadata !"0x17\00\0022\00128\00128\000\000\000", metadata !55, metadata !2, i32 0, metadata !34, null} ; [ DW_TAG_union_type ]
-!34 = metadata !{metadata !35, metadata !37}
-!35 = metadata !{metadata !"0xd\00V\0023\00128\00128\000\000", metadata !55, metadata !15, metadata !36} ; [ DW_TAG_member ]
-!36 = metadata !{metadata !"0x16\00v4sf\003\000\000\000\000", metadata !55, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
-!37 = metadata !{metadata !"0xd\00A\0024\00128\0032\000\000", metadata !55, metadata !15, metadata !38} ; [ DW_TAG_member ]
-!38 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, metadata !2, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
-!39 = metadata !{i32 79, i32 7, metadata !40, null}
-!40 = metadata !{metadata !"0xb\0075\0035\0018", metadata !1, metadata !41} ; [ DW_TAG_lexical_block ]
-!41 = metadata !{metadata !"0xb\0075\005\0017", metadata !1, metadata !42} ; [ DW_TAG_lexical_block ]
-!42 = metadata !{metadata !"0xb\0071\0032\0016", metadata !1, metadata !43} ; [ DW_TAG_lexical_block ]
-!43 = metadata !{metadata !"0xb\0071\003\0015", metadata !1, metadata !25} ; [ DW_TAG_lexical_block ]
-!44 = metadata !{i32 75, i32 5, metadata !42, null}
-!45 = metadata !{i32 42, i32 2, metadata !46, metadata !48}
-!46 = metadata !{metadata !"0xb\0042\002\0020", metadata !15, metadata !47} ; [ DW_TAG_lexical_block ]
-!47 = metadata !{metadata !"0xb\0041\0028\0019", metadata !15, metadata !14} ; [ DW_TAG_lexical_block ]
-!48 = metadata !{i32 95, i32 3, metadata !25, null}
-!49 = metadata !{i32 99, i32 3, metadata !25, null}
-!50 = metadata !{metadata !0, metadata !10, metadata !14}
-!51 = metadata !{metadata !18}
-!52 = metadata !{metadata !19, metadata !20, metadata !24, metadata !26, metadata !27, metadata !28, metadata !29}
-!53 = metadata !{metadata !30}
-!54 = metadata !{metadata !"build2.c", metadata !"/private/tmp"}
-!55 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp"}
-!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00test0001\00test0001\00\003\000\001\000\006\00256\001\000", !54, null, !3, i32 0, <4 x float> (float)* @test0001, null, null, !51} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !54} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", !54, !17, !17, !50, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !54, !1, i32 0, !4, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = !{!5}
+!5 = !{!"0x16\00v4f32\0014\000\000\000\000", !54, !2, !6} ; [ DW_TAG_typedef ]
+!6 = !{!"0x1\00\000\00128\00128\000\000", !54, !2, !7, !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
+!7 = !{!"0x24\00float\000\0032\0032\000\000\004", null, !2} ; [ DW_TAG_base_type ]
+!8 = !{!9}
+!9 = !{!"0x21\000\004"}         ; [ DW_TAG_subrange_type ]
+!10 = !{!"0x2e\00main\00main\00\0059\000\001\000\006\00256\001\000", !54, null, !11, null, i32 (i32, i8**, i1)* @main, null, null, !52} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 0] [main]
+!11 = !{!"0x15\00\000\000\000\000\000\000", !54, !1, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!13}
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!14 = !{!"0x2e\00printFV\00printFV\00\0041\001\001\000\006\00256\001\000", !55, null, !16, null, null, null, null, !53} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [scope 0] [printFV]
+!15 = !{!"0x29", !55} ; [ DW_TAG_file_type ]
+!16 = !{!"0x15\00\000\000\000\000\000\000", !55, !15, null, !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = !{null}
+!18 = !{!"0x101\00a\0016777219\000", !0, !1, !7} ; [ DW_TAG_arg_variable ]
+!19 = !{!"0x101\00argc\0016777275\000", !10, !1, !13} ; [ DW_TAG_arg_variable ]
+!20 = !{!"0x101\00argv\0033554491\000", !10, !1, !21} ; [ DW_TAG_arg_variable ]
+!21 = !{!"0xf\00\000\0032\0032\000\000", null, !2, !22} ; [ DW_TAG_pointer_type ]
+!22 = !{!"0xf\00\000\0032\0032\000\000", null, !2, !23} ; [ DW_TAG_pointer_type ]
+!23 = !{!"0x24\00char\000\008\008\000\000\006", null, !2} ; [ DW_TAG_base_type ]
+!24 = !{!"0x100\00i\0060\000", !25, !1, !13} ; [ DW_TAG_auto_variable ]
+!25 = !{!"0xb\0059\0033\0014", !1, !10} ; [ DW_TAG_lexical_block ]
+!26 = !{!"0x100\00j\0060\000", !25, !1, !13} ; [ DW_TAG_auto_variable ]
+!27 = !{!"0x100\00x\0061\000", !25, !1, !5} ; [ DW_TAG_auto_variable ]
+!28 = !{!"0x100\00y\0062\000", !25, !1, !5} ; [ DW_TAG_auto_variable ]
+!29 = !{!"0x100\00z\0063\000", !25, !1, !5} ; [ DW_TAG_auto_variable ]
+!30 = !{!"0x101\00F\0016777257\000", !14, !15, !31} ; [ DW_TAG_arg_variable ]
+!31 = !{!"0xf\00\000\0032\0032\000\000", null, !2, !32} ; [ DW_TAG_pointer_type ]
+!32 = !{!"0x16\00FV\0025\000\000\000\000", !55, !2, !33} ; [ DW_TAG_typedef ]
+!33 = !{!"0x17\00\0022\00128\00128\000\000\000", !55, !2, i32 0, !34, null} ; [ DW_TAG_union_type ]
+!34 = !{!35, !37}
+!35 = !{!"0xd\00V\0023\00128\00128\000\000", !55, !15, !36} ; [ DW_TAG_member ]
+!36 = !{!"0x16\00v4sf\003\000\000\000\000", !55, !2, !6} ; [ DW_TAG_typedef ]
+!37 = !{!"0xd\00A\0024\00128\0032\000\000", !55, !15, !38} ; [ DW_TAG_member ]
+!38 = !{!"0x1\00\000\00128\0032\000\000", null, !2, !7, !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!39 = !MDLocation(line: 79, column: 7, scope: !40)
+!40 = !{!"0xb\0075\0035\0018", !1, !41} ; [ DW_TAG_lexical_block ]
+!41 = !{!"0xb\0075\005\0017", !1, !42} ; [ DW_TAG_lexical_block ]
+!42 = !{!"0xb\0071\0032\0016", !1, !43} ; [ DW_TAG_lexical_block ]
+!43 = !{!"0xb\0071\003\0015", !1, !25} ; [ DW_TAG_lexical_block ]
+!44 = !MDLocation(line: 75, column: 5, scope: !42)
+!45 = !MDLocation(line: 42, column: 2, scope: !46, inlinedAt: !48)
+!46 = !{!"0xb\0042\002\0020", !15, !47} ; [ DW_TAG_lexical_block ]
+!47 = !{!"0xb\0041\0028\0019", !15, !14} ; [ DW_TAG_lexical_block ]
+!48 = !MDLocation(line: 95, column: 3, scope: !25)
+!49 = !MDLocation(line: 99, column: 3, scope: !25)
+!50 = !{!0, !10, !14}
+!51 = !{!18}
+!52 = !{!19, !20, !24, !26, !27, !28, !29}
+!53 = !{!30}
+!54 = !{!"build2.c", !"/private/tmp"}
+!55 = !{!"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", !"/private/tmp"}
+!56 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-info-d16-reg.ll b/test/CodeGen/ARM/debug-info-d16-reg.ll
index 9791987..85b510f 100644
--- a/test/CodeGen/ARM/debug-info-d16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-d16-reg.ll
@@ -12,9 +12,9 @@ target triple = "thumbv7-apple-darwin10"
 
 define i32 @inlineprinter(i8* %ptr, double %val, i8 zeroext %c) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !26
-  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !20, metadata !{metadata !"0x102"}), !dbg !26
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata i8* %ptr, i64 0, metadata !19, metadata !{!"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata double %val, i64 0, metadata !20, metadata !{!"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata i8 %c, i64 0, metadata !21, metadata !{!"0x102"}), !dbg !26
   %0 = zext i8 %c to i32, !dbg !27
   %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !27
   ret i32 0, !dbg !29
@@ -22,9 +22,9 @@ entry:
 
 define i32 @printer(i8* %ptr, double %val, i8 zeroext %c) nounwind optsize noinline {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !30
-  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i8* %ptr, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata double %val, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i8 %c, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !30
   %0 = zext i8 %c to i32, !dbg !31
   %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !31
   ret i32 0, !dbg !33
@@ -36,18 +36,18 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !34
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !34
+  tail call void @llvm.dbg.value(metadata i32 %argc, i64 0, metadata !22, metadata !{!"0x102"}), !dbg !34
+  tail call void @llvm.dbg.value(metadata i8** %argv, i64 0, metadata !23, metadata !{!"0x102"}), !dbg !34
   %0 = sitofp i32 %argc to double, !dbg !35
   %1 = fadd double %0, 5.555552e+05, !dbg !35
-  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !24, metadata !{metadata !"0x102"}), !dbg !35
+  tail call void @llvm.dbg.value(metadata double %1, i64 0, metadata !24, metadata !{!"0x102"}), !dbg !35
   %2 = tail call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0)) nounwind, !dbg !36
   %3 = getelementptr inbounds i8* bitcast (i32 (i32, i8**)* @main to i8*), i32 %argc, !dbg !37
   %4 = trunc i32 %argc to i8, !dbg !37
   %5 = add i8 %4, 97, !dbg !37
-  tail call void @llvm.dbg.value(metadata !{i8* %3}, i64 0, metadata !19, metadata !{metadata !"0x102"}) nounwind, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !20, metadata !{metadata !"0x102"}) nounwind, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{i8 %5}, i64 0, metadata !21, metadata !{metadata !"0x102"}) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata i8* %3, i64 0, metadata !19, metadata !{!"0x102"}) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata double %1, i64 0, metadata !20, metadata !{!"0x102"}) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata i8 %5, i64 0, metadata !21, metadata !{!"0x102"}) nounwind, !dbg !38
   %6 = zext i8 %5 to i32, !dbg !39
   %7 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %3, double %1, i32 %6) nounwind, !dbg !39
   %8 = tail call i32 @printer(i8* %3, double %1, i8 zeroext %5) nounwind, !dbg !40
@@ -59,52 +59,52 @@ declare i32 @puts(i8* nocapture) nounwind
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!48}
 
-!0 = metadata !{metadata !"0x2e\00printer\00printer\00printer\0012\000\001\000\006\00256\001\0012", metadata !46, metadata !1, metadata !3, null, i32 (i8*, double, i8)* @printer, null, null, metadata !43} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !46} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\001\00(LLVM build 00)\001\00\000\00\001", metadata !46, metadata !47, metadata !47, metadata !42, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !46, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5, metadata !6, metadata !7, metadata !8}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !46, metadata !1, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{metadata !"0x24\00double\000\0064\0032\000\000\004", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !"0x2e\00inlineprinter\00inlineprinter\00inlineprinter\005\000\001\000\006\00256\001\005", metadata !46, metadata !1, metadata !3, null, i32 (i8*, double, i8)* @inlineprinter, null, null, metadata !44} ; [ DW_TAG_subprogram ]
-!10 = metadata !{metadata !"0x2e\00main\00main\00main\0018\000\001\000\006\00256\001\0018", metadata !46, metadata !1, metadata !11, null, i32 (i32, i8**)* @main, null, null, metadata !45} ; [ DW_TAG_subprogram ]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !46, metadata !1, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !5, metadata !5, metadata !13}
-!13 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !46, metadata !1, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !46, metadata !1, metadata !15} ; [ DW_TAG_pointer_type ]
-!15 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
-!16 = metadata !{metadata !"0x101\00ptr\0011\000", metadata !0, metadata !1, metadata !6} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{metadata !"0x101\00val\0011\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{metadata !"0x101\00c\0011\000", metadata !0, metadata !1, metadata !8} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{metadata !"0x101\00ptr\004\000", metadata !9, metadata !1, metadata !6} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{metadata !"0x101\00val\004\000", metadata !9, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
-!21 = metadata !{metadata !"0x101\00c\004\000", metadata !9, metadata !1, metadata !8} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{metadata !"0x101\00argc\0017\000", metadata !10, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!23 = metadata !{metadata !"0x101\00argv\0017\000", metadata !10, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
-!24 = metadata !{metadata !"0x100\00dval\0019\000", metadata !25, metadata !1, metadata !7} ; [ DW_TAG_auto_variable ]
-!25 = metadata !{metadata !"0xb\0018\000\002", metadata !46, metadata !10} ; [ DW_TAG_lexical_block ]
-!26 = metadata !{i32 4, i32 0, metadata !9, null}
-!27 = metadata !{i32 6, i32 0, metadata !28, null}
-!28 = metadata !{metadata !"0xb\005\000\001", metadata !46, metadata !9} ; [ DW_TAG_lexical_block ]
-!29 = metadata !{i32 7, i32 0, metadata !28, null}
-!30 = metadata !{i32 11, i32 0, metadata !0, null}
-!31 = metadata !{i32 13, i32 0, metadata !32, null}
-!32 = metadata !{metadata !"0xb\0012\000\000", metadata !46, metadata !0} ; [ DW_TAG_lexical_block ]
-!33 = metadata !{i32 14, i32 0, metadata !32, null}
-!34 = metadata !{i32 17, i32 0, metadata !10, null}
-!35 = metadata !{i32 19, i32 0, metadata !25, null}
-!36 = metadata !{i32 20, i32 0, metadata !25, null}
-!37 = metadata !{i32 21, i32 0, metadata !25, null}
-!38 = metadata !{i32 4, i32 0, metadata !9, metadata !37}
-!39 = metadata !{i32 6, i32 0, metadata !28, metadata !37}
-!40 = metadata !{i32 22, i32 0, metadata !25, null}
-!41 = metadata !{i32 23, i32 0, metadata !25, null}
-!42 = metadata !{metadata !0, metadata !9, metadata !10}
-!43 = metadata !{metadata !16, metadata !17, metadata !18}
-!44 = metadata !{metadata !19, metadata !20, metadata !21}
-!45 = metadata !{metadata !22, metadata !23, metadata !24}
-!46 = metadata !{metadata !"a.c", metadata !"/tmp/"}
-!47 = metadata !{i32 0}
-!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00printer\00printer\00printer\0012\000\001\000\006\00256\001\0012", !46, !1, !3, null, i32 (i8*, double, i8)* @printer, null, null, !43} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !46} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\001\00(LLVM build 00)\001\00\000\00\001", !46, !47, !47, !42, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !46, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5, !6, !7, !8}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", !46, !1} ; [ DW_TAG_base_type ]
+!6 = !{!"0xf\00\000\0032\0032\000\000", !46, !1, null} ; [ DW_TAG_pointer_type ]
+!7 = !{!"0x24\00double\000\0064\0032\000\000\004", !46, !1} ; [ DW_TAG_base_type ]
+!8 = !{!"0x24\00unsigned char\000\008\008\000\000\008", !46, !1} ; [ DW_TAG_base_type ]
+!9 = !{!"0x2e\00inlineprinter\00inlineprinter\00inlineprinter\005\000\001\000\006\00256\001\005", !46, !1, !3, null, i32 (i8*, double, i8)* @inlineprinter, null, null, !44} ; [ DW_TAG_subprogram ]
+!10 = !{!"0x2e\00main\00main\00main\0018\000\001\000\006\00256\001\0018", !46, !1, !11, null, i32 (i32, i8**)* @main, null, null, !45} ; [ DW_TAG_subprogram ]
+!11 = !{!"0x15\00\000\000\000\000\000\000", !46, !1, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!5, !5, !13}
+!13 = !{!"0xf\00\000\0032\0032\000\000", !46, !1, !14} ; [ DW_TAG_pointer_type ]
+!14 = !{!"0xf\00\000\0032\0032\000\000", !46, !1, !15} ; [ DW_TAG_pointer_type ]
+!15 = !{!"0x24\00char\000\008\008\000\000\006", !46, !1} ; [ DW_TAG_base_type ]
+!16 = !{!"0x101\00ptr\0011\000", !0, !1, !6} ; [ DW_TAG_arg_variable ]
+!17 = !{!"0x101\00val\0011\000", !0, !1, !7} ; [ DW_TAG_arg_variable ]
+!18 = !{!"0x101\00c\0011\000", !0, !1, !8} ; [ DW_TAG_arg_variable ]
+!19 = !{!"0x101\00ptr\004\000", !9, !1, !6} ; [ DW_TAG_arg_variable ]
+!20 = !{!"0x101\00val\004\000", !9, !1, !7} ; [ DW_TAG_arg_variable ]
+!21 = !{!"0x101\00c\004\000", !9, !1, !8} ; [ DW_TAG_arg_variable ]
+!22 = !{!"0x101\00argc\0017\000", !10, !1, !5} ; [ DW_TAG_arg_variable ]
+!23 = !{!"0x101\00argv\0017\000", !10, !1, !13} ; [ DW_TAG_arg_variable ]
+!24 = !{!"0x100\00dval\0019\000", !25, !1, !7} ; [ DW_TAG_auto_variable ]
+!25 = !{!"0xb\0018\000\002", !46, !10} ; [ DW_TAG_lexical_block ]
+!26 = !MDLocation(line: 4, scope: !9)
+!27 = !MDLocation(line: 6, scope: !28)
+!28 = !{!"0xb\005\000\001", !46, !9} ; [ DW_TAG_lexical_block ]
+!29 = !MDLocation(line: 7, scope: !28)
+!30 = !MDLocation(line: 11, scope: !0)
+!31 = !MDLocation(line: 13, scope: !32)
+!32 = !{!"0xb\0012\000\000", !46, !0} ; [ DW_TAG_lexical_block ]
+!33 = !MDLocation(line: 14, scope: !32)
+!34 = !MDLocation(line: 17, scope: !10)
+!35 = !MDLocation(line: 19, scope: !25)
+!36 = !MDLocation(line: 20, scope: !25)
+!37 = !MDLocation(line: 21, scope: !25)
+!38 = !MDLocation(line: 4, scope: !9, inlinedAt: !37)
+!39 = !MDLocation(line: 6, scope: !28, inlinedAt: !37)
+!40 = !MDLocation(line: 22, scope: !25)
+!41 = !MDLocation(line: 23, scope: !25)
+!42 = !{!0, !9, !10}
+!43 = !{!16, !17, !18}
+!44 = !{!19, !20, !21}
+!45 = !{!22, !23, !24}
+!46 = !{!"a.c", !"/tmp/"}
+!47 = !{i32 0}
+!48 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll
index cfcefb8..c05df6a 100644
--- a/test/CodeGen/ARM/debug-info-qreg.ll
+++ b/test/CodeGen/ARM/debug-info-qreg.ll
@@ -2,13 +2,11 @@
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-macosx10.6.7"
 
-;CHECK: sub-register
-;CHECK-NEXT: DW_OP_regx
+;CHECK: sub-register DW_OP_regx
 ;CHECK-NEXT: ascii
 ;CHECK-NEXT: DW_OP_piece
 ;CHECK-NEXT: byte   8
-;CHECK-NEXT: sub-register
-;CHECK-NEXT: DW_OP_regx
+;CHECK-NEXT: sub-register DW_OP_regx
 ;CHECK-NEXT: ascii
 ;CHECK-NEXT: DW_OP_piece
 ;CHECK-NEXT: byte   8
@@ -26,7 +24,7 @@ for.body9:                                        ; preds = %for.body9, %entry
   br i1 undef, label %for.end54, label %for.body9, !dbg !44
 
 for.end54:                                        ; preds = %for.body9
-  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !39
+  tail call void @llvm.dbg.value(metadata <4 x float> %add19, i64 0, metadata !27, metadata !{!"0x102"}), !dbg !39
   %tmp115 = extractelement <4 x float> %add19, i32 1
   %conv6.i75 = fpext float %tmp115 to double, !dbg !45
   %call.i82 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i75, double undef, double undef) nounwind, !dbg !45
@@ -40,60 +38,60 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!56}
 
-!0 = metadata !{metadata !"0x2e\00test0001\00test0001\00\003\000\001\000\006\00256\001\003", metadata !54, metadata !1, metadata !3, null, <4 x float> (float)* @test0001, null, null, metadata !51} ; [ DW_TAG_subprogram ] [line 3] [def] [test0001]
-!1 = metadata !{metadata !"0x29", metadata !54} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", metadata !54, metadata !17, metadata !17, metadata !50, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x16\00v4f32\0014\000\000\000\000", metadata !54, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
-!6 = metadata !{metadata !"0x1\00\000\00128\00128\000\000", metadata !2, null, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
-!7 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x21\000\004"}         ; [ DW_TAG_subrange_type ]
-!10 = metadata !{metadata !"0x2e\00main\00main\00\0059\000\001\000\006\00256\001\0059", metadata !54, metadata !1, metadata !11, null, i32 (i32, i8**)* @main, null, null, metadata !52} ; [ DW_TAG_subprogram ] [line 59] [def] [main]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!14 = metadata !{metadata !"0x2e\00printFV\00printFV\00\0041\001\001\000\006\00256\001\0041", metadata !55, metadata !15, metadata !16, null, null, null, null, metadata !53} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [printFV]
-!15 = metadata !{metadata !"0x29", metadata !55} ; [ DW_TAG_file_type ]
-!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !55, metadata !15, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{null}
-!18 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{metadata !"0x101\00argc\0016777275\000", metadata !10, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{metadata !"0x101\00argv\0033554491\000", metadata !10, metadata !1, metadata !21} ; [ DW_TAG_arg_variable ]
-!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !22} ; [ DW_TAG_pointer_type ]
-!22 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
-!24 = metadata !{metadata !"0x100\00i\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
-!25 = metadata !{metadata !"0xb\0059\0033\0014", metadata !54, metadata !10} ; [ DW_TAG_lexical_block ]
-!26 = metadata !{metadata !"0x100\00j\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
-!27 = metadata !{metadata !"0x100\00x\0061\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!28 = metadata !{metadata !"0x100\00y\0062\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{metadata !"0x100\00z\0063\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!30 = metadata !{metadata !"0x101\00F\0016777257\000", metadata !14, metadata !15, metadata !31} ; [ DW_TAG_arg_variable ]
-!31 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !32} ; [ DW_TAG_pointer_type ]
-!32 = metadata !{metadata !"0x16\00FV\0025\000\000\000\000", metadata !55, metadata !2, metadata !33} ; [ DW_TAG_typedef ]
-!33 = metadata !{metadata !"0x17\00\0022\00128\00128\000\000\000", metadata !55, metadata !2, i32 0, metadata !34, null} ; [ DW_TAG_union_type ]
-!34 = metadata !{metadata !35, metadata !37}
-!35 = metadata !{metadata !"0xd\00V\0023\00128\00128\000\000", metadata !55, metadata !15, metadata !36} ; [ DW_TAG_member ]
-!36 = metadata !{metadata !"0x16\00v4sf\003\000\000\000\000", metadata !55, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
-!37 = metadata !{metadata !"0xd\00A\0024\00128\0032\000\000", metadata !55, metadata !15, metadata !38} ; [ DW_TAG_member ]
-!38 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, metadata !2, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
-!39 = metadata !{i32 79, i32 7, metadata !40, null}
-!40 = metadata !{metadata !"0xb\0075\0035\0018", metadata !54, metadata !41} ; [ DW_TAG_lexical_block ]
-!41 = metadata !{metadata !"0xb\0075\005\0017", metadata !54, metadata !42} ; [ DW_TAG_lexical_block ]
-!42 = metadata !{metadata !"0xb\0071\0032\0016", metadata !54, metadata !43} ; [ DW_TAG_lexical_block ]
-!43 = metadata !{metadata !"0xb\0071\003\0015", metadata !54, metadata !25} ; [ DW_TAG_lexical_block ]
-!44 = metadata !{i32 75, i32 5, metadata !42, null}
-!45 = metadata !{i32 42, i32 2, metadata !46, metadata !48}
-!46 = metadata !{metadata !"0xb\0042\002\0020", metadata !55, metadata !47} ; [ DW_TAG_lexical_block ]
-!47 = metadata !{metadata !"0xb\0041\0028\0019", metadata !55, metadata !14} ; [ DW_TAG_lexical_block ]
-!48 = metadata !{i32 95, i32 3, metadata !25, null}
-!49 = metadata !{i32 99, i32 3, metadata !25, null}
-!50 = metadata !{metadata !0, metadata !10, metadata !14}
-!51 = metadata !{metadata !18}
-!52 = metadata !{metadata !19, metadata !20, metadata !24, metadata !26, metadata !27, metadata !28, metadata !29}
-!53 = metadata !{metadata !30}
-!54 = metadata !{metadata !"build2.c", metadata !"/private/tmp"}
-!55 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp"}
-!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00test0001\00test0001\00\003\000\001\000\006\00256\001\003", !54, !1, !3, null, <4 x float> (float)* @test0001, null, null, !51} ; [ DW_TAG_subprogram ] [line 3] [def] [test0001]
+!1 = !{!"0x29", !54} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", !54, !17, !17, !50, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !54, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x16\00v4f32\0014\000\000\000\000", !54, !2, !6} ; [ DW_TAG_typedef ]
+!6 = !{!"0x1\00\000\00128\00128\000\000", !2, null, !7, !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
+!7 = !{!"0x24\00float\000\0032\0032\000\000\004", null, !2} ; [ DW_TAG_base_type ]
+!8 = !{!9}
+!9 = !{!"0x21\000\004"}         ; [ DW_TAG_subrange_type ]
+!10 = !{!"0x2e\00main\00main\00\0059\000\001\000\006\00256\001\0059", !54, !1, !11, null, i32 (i32, i8**)* @main, null, null, !52} ; [ DW_TAG_subprogram ] [line 59] [def] [main]
+!11 = !{!"0x15\00\000\000\000\000\000\000", !54, !1, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!13}
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!14 = !{!"0x2e\00printFV\00printFV\00\0041\001\001\000\006\00256\001\0041", !55, !15, !16, null, null, null, null, !53} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [printFV]
+!15 = !{!"0x29", !55} ; [ DW_TAG_file_type ]
+!16 = !{!"0x15\00\000\000\000\000\000\000", !55, !15, null, !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = !{null}
+!18 = !{!"0x101\00a\0016777219\000", !0, !1, !7} ; [ DW_TAG_arg_variable ]
+!19 = !{!"0x101\00argc\0016777275\000", !10, !1, !13} ; [ DW_TAG_arg_variable ]
+!20 = !{!"0x101\00argv\0033554491\000", !10, !1, !21} ; [ DW_TAG_arg_variable ]
+!21 = !{!"0xf\00\000\0032\0032\000\000", null, !2, !22} ; [ DW_TAG_pointer_type ]
+!22 = !{!"0xf\00\000\0032\0032\000\000", null, !2, !23} ; [ DW_TAG_pointer_type ]
+!23 = !{!"0x24\00char\000\008\008\000\000\006", null, !2} ; [ DW_TAG_base_type ]
+!24 = !{!"0x100\00i\0060\000", !25, !1, !13} ; [ DW_TAG_auto_variable ]
+!25 = !{!"0xb\0059\0033\0014", !54, !10} ; [ DW_TAG_lexical_block ]
+!26 = !{!"0x100\00j\0060\000", !25, !1, !13} ; [ DW_TAG_auto_variable ]
+!27 = !{!"0x100\00x\0061\000", !25, !1, !5} ; [ DW_TAG_auto_variable ]
+!28 = !{!"0x100\00y\0062\000", !25, !1, !5} ; [ DW_TAG_auto_variable ]
+!29 = !{!"0x100\00z\0063\000", !25, !1, !5} ; [ DW_TAG_auto_variable ]
+!30 = !{!"0x101\00F\0016777257\000", !14, !15, !31} ; [ DW_TAG_arg_variable ]
+!31 = !{!"0xf\00\000\0032\0032\000\000", null, !2, !32} ; [ DW_TAG_pointer_type ]
+!32 = !{!"0x16\00FV\0025\000\000\000\000", !55, !2, !33} ; [ DW_TAG_typedef ]
+!33 = !{!"0x17\00\0022\00128\00128\000\000\000", !55, !2, i32 0, !34, null} ; [ DW_TAG_union_type ]
+!34 = !{!35, !37}
+!35 = !{!"0xd\00V\0023\00128\00128\000\000", !55, !15, !36} ; [ DW_TAG_member ]
+!36 = !{!"0x16\00v4sf\003\000\000\000\000", !55, !2, !6} ; [ DW_TAG_typedef ]
+!37 = !{!"0xd\00A\0024\00128\0032\000\000", !55, !15, !38} ; [ DW_TAG_member ]
+!38 = !{!"0x1\00\000\00128\0032\000\000", null, !2, !7, !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!39 = !MDLocation(line: 79, column: 7, scope: !40)
+!40 = !{!"0xb\0075\0035\0018", !54, !41} ; [ DW_TAG_lexical_block ]
+!41 = !{!"0xb\0075\005\0017", !54, !42} ; [ DW_TAG_lexical_block ]
+!42 = !{!"0xb\0071\0032\0016", !54, !43} ; [ DW_TAG_lexical_block ]
+!43 = !{!"0xb\0071\003\0015", !54, !25} ; [ DW_TAG_lexical_block ]
+!44 = !MDLocation(line: 75, column: 5, scope: !42)
+!45 = !MDLocation(line: 42, column: 2, scope: !46, inlinedAt: !48)
+!46 = !{!"0xb\0042\002\0020", !55, !47} ; [ DW_TAG_lexical_block ]
+!47 = !{!"0xb\0041\0028\0019", !55, !14} ; [ DW_TAG_lexical_block ]
+!48 = !MDLocation(line: 95, column: 3, scope: !25)
+!49 = !MDLocation(line: 99, column: 3, scope: !25)
+!50 = !{!0, !10, !14}
+!51 = !{!18}
+!52 = !{!19, !20, !24, !26, !27, !28, !29}
+!53 = !{!30}
+!54 = !{!"build2.c", !"/private/tmp"}
+!55 = !{!"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", !"/private/tmp"}
+!56 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-info-s16-reg.ll b/test/CodeGen/ARM/debug-info-s16-reg.ll
index 6bd7172..9b303dd 100644
--- a/test/CodeGen/ARM/debug-info-s16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-s16-reg.ll
@@ -1,8 +1,7 @@
 ; RUN: llc < %s - | FileCheck %s
 ; Radar 9309221
 ; Test dwarf reg no for s16
-;CHECK: super-register
-;CHECK-NEXT: DW_OP_regx
+;CHECK: super-register DW_OP_regx
 ;CHECK-NEXT: ascii
 ;CHECK-NEXT: DW_OP_piece
 ;CHECK-NEXT: 4
@@ -15,9 +14,9 @@ target triple = "thumbv7-apple-macosx10.6.7"
 
 define i32 @inlineprinter(i8* %ptr, float %val, i8 zeroext %c) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !8, metadata !{metadata !"0x102"}), !dbg !24
-  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !25
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata i8* %ptr, i64 0, metadata !8, metadata !{!"0x102"}), !dbg !24
+  tail call void @llvm.dbg.value(metadata float %val, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !25
+  tail call void @llvm.dbg.value(metadata i8 %c, i64 0, metadata !12, metadata !{!"0x102"}), !dbg !26
   %conv = fpext float %val to double, !dbg !27
   %conv3 = zext i8 %c to i32, !dbg !27
   %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !27
@@ -28,9 +27,9 @@ declare i32 @printf(i8* nocapture, ...) nounwind optsize
 
 define i32 @printer(i8* %ptr, float %val, i8 zeroext %c) nounwind optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !30
-  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !31
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !32
+  tail call void @llvm.dbg.value(metadata i8* %ptr, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata float %val, i64 0, metadata !15, metadata !{!"0x102"}), !dbg !31
+  tail call void @llvm.dbg.value(metadata i8 %c, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !32
   %conv = fpext float %val to double, !dbg !33
   %conv3 = zext i8 %c to i32, !dbg !33
   %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !33
@@ -39,19 +38,19 @@ entry:
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !36
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !37
+  tail call void @llvm.dbg.value(metadata i32 %argc, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !36
+  tail call void @llvm.dbg.value(metadata i8** %argv, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !37
   %conv = sitofp i32 %argc to double, !dbg !38
   %add = fadd double %conv, 5.555552e+05, !dbg !38
   %conv1 = fptrunc double %add to float, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !38
+  tail call void @llvm.dbg.value(metadata float %conv1, i64 0, metadata !22, metadata !{!"0x102"}), !dbg !38
   %call = tail call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0)) nounwind optsize, !dbg !39
   %add.ptr = getelementptr i8* bitcast (i32 (i32, i8**)* @main to i8*), i32 %argc, !dbg !40
   %add5 = add nsw i32 %argc, 97, !dbg !40
   %conv6 = trunc i32 %add5 to i8, !dbg !40
-  tail call void @llvm.dbg.value(metadata !{i8* %add.ptr}, i64 0, metadata !8, metadata !{metadata !"0x102"}) nounwind, !dbg !41
-  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !10, metadata !{metadata !"0x102"}) nounwind, !dbg !42
-  tail call void @llvm.dbg.value(metadata !{i8 %conv6}, i64 0, metadata !12, metadata !{metadata !"0x102"}) nounwind, !dbg !43
+  tail call void @llvm.dbg.value(metadata i8* %add.ptr, i64 0, metadata !8, metadata !{!"0x102"}) nounwind, !dbg !41
+  tail call void @llvm.dbg.value(metadata float %conv1, i64 0, metadata !10, metadata !{!"0x102"}) nounwind, !dbg !42
+  tail call void @llvm.dbg.value(metadata i8 %conv6, i64 0, metadata !12, metadata !{!"0x102"}) nounwind, !dbg !43
   %conv.i = fpext float %conv1 to double, !dbg !44
   %conv3.i = and i32 %add5, 255, !dbg !44
   %call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %add.ptr, double %conv.i, i32 %conv3.i) nounwind optsize, !dbg !44
@@ -66,57 +65,57 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!53}
 
-!0 = metadata !{metadata !"0x2e\00inlineprinter\00inlineprinter\00\005\000\001\000\006\00256\001\005", metadata !51, metadata !1, metadata !3, null, i32 (i8*, float, i8)* @inlineprinter, null, null, metadata !48} ; [ DW_TAG_subprogram ] [line 5] [def] [inlineprinter]
-!1 = metadata !{metadata !"0x29", metadata !51} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", metadata !51, metadata !52, metadata !52, metadata !47, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !51, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00printer\00printer\00\0012\000\001\000\006\00256\001\0012", metadata !51, metadata !1, metadata !3, null, i32 (i8*, float, i8)* @printer, null, null, metadata !49} ; [ DW_TAG_subprogram ] [line 12] [def] [printer]
-!7 = metadata !{metadata !"0x2e\00main\00main\00\0018\000\001\000\006\00256\001\0018", metadata !51, metadata !1, metadata !3, null, i32 (i32, i8**)* @main, null, null, metadata !50} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
-!8 = metadata !{metadata !"0x101\00ptr\0016777220\000", metadata !0, metadata !1, metadata !9} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, null} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{metadata !"0x101\00val\0033554436\000", metadata !0, metadata !1, metadata !11} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
-!12 = metadata !{metadata !"0x101\00c\0050331652\000", metadata !0, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", null, metadata !2} ; [ DW_TAG_base_type ]
-!14 = metadata !{metadata !"0x101\00ptr\0016777227\000", metadata !6, metadata !1, metadata !9} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{metadata !"0x101\00val\0033554443\000", metadata !6, metadata !1, metadata !11} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{metadata !"0x101\00c\0050331659\000", metadata !6, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{metadata !"0x101\00argc\0016777233\000", metadata !7, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{metadata !"0x101\00argv\0033554449\000", metadata !7, metadata !1, metadata !19} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !20} ; [ DW_TAG_pointer_type ]
-!20 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !21} ; [ DW_TAG_pointer_type ]
-!21 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
-!22 = metadata !{metadata !"0x100\00dval\0019\000", metadata !23, metadata !1, metadata !11} ; [ DW_TAG_auto_variable ]
-!23 = metadata !{metadata !"0xb\0018\001\002", metadata !51, metadata !7} ; [ DW_TAG_lexical_block ]
-!24 = metadata !{i32 4, i32 22, metadata !0, null}
-!25 = metadata !{i32 4, i32 33, metadata !0, null}
-!26 = metadata !{i32 4, i32 52, metadata !0, null}
-!27 = metadata !{i32 6, i32 3, metadata !28, null}
-!28 = metadata !{metadata !"0xb\005\001\000", metadata !51, metadata !0} ; [ DW_TAG_lexical_block ]
-!29 = metadata !{i32 7, i32 3, metadata !28, null}
-!30 = metadata !{i32 11, i32 42, metadata !6, null}
-!31 = metadata !{i32 11, i32 53, metadata !6, null}
-!32 = metadata !{i32 11, i32 72, metadata !6, null}
-!33 = metadata !{i32 13, i32 3, metadata !34, null}
-!34 = metadata !{metadata !"0xb\0012\001\001", metadata !51, metadata !6} ; [ DW_TAG_lexical_block ]
-!35 = metadata !{i32 14, i32 3, metadata !34, null}
-!36 = metadata !{i32 17, i32 15, metadata !7, null}
-!37 = metadata !{i32 17, i32 28, metadata !7, null}
-!38 = metadata !{i32 19, i32 31, metadata !23, null}
-!39 = metadata !{i32 20, i32 3, metadata !23, null}
-!40 = metadata !{i32 21, i32 3, metadata !23, null}
-!41 = metadata !{i32 4, i32 22, metadata !0, metadata !40}
-!42 = metadata !{i32 4, i32 33, metadata !0, metadata !40}
-!43 = metadata !{i32 4, i32 52, metadata !0, metadata !40}
-!44 = metadata !{i32 6, i32 3, metadata !28, metadata !40}
-!45 = metadata !{i32 22, i32 3, metadata !23, null}
-!46 = metadata !{i32 23, i32 1, metadata !23, null}
-!47 = metadata !{metadata !0, metadata !6, metadata !7}
-!48 = metadata !{metadata !8, metadata !10, metadata !12}
-!49 = metadata !{metadata !14, metadata !15, metadata !16}
-!50 = metadata !{metadata !17, metadata !18, metadata !22}
-!51 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
-!52 = metadata !{i32 0}
-!53 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00inlineprinter\00inlineprinter\00\005\000\001\000\006\00256\001\005", !51, !1, !3, null, i32 (i8*, float, i8)* @inlineprinter, null, null, !48} ; [ DW_TAG_subprogram ] [line 5] [def] [inlineprinter]
+!1 = !{!"0x29", !51} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", !51, !52, !52, !47, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !51, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00printer\00printer\00\0012\000\001\000\006\00256\001\0012", !51, !1, !3, null, i32 (i8*, float, i8)* @printer, null, null, !49} ; [ DW_TAG_subprogram ] [line 12] [def] [printer]
+!7 = !{!"0x2e\00main\00main\00\0018\000\001\000\006\00256\001\0018", !51, !1, !3, null, i32 (i32, i8**)* @main, null, null, !50} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
+!8 = !{!"0x101\00ptr\0016777220\000", !0, !1, !9} ; [ DW_TAG_arg_variable ]
+!9 = !{!"0xf\00\000\0032\0032\000\000", null, !2, null} ; [ DW_TAG_pointer_type ]
+!10 = !{!"0x101\00val\0033554436\000", !0, !1, !11} ; [ DW_TAG_arg_variable ]
+!11 = !{!"0x24\00float\000\0032\0032\000\000\004", null, !2} ; [ DW_TAG_base_type ]
+!12 = !{!"0x101\00c\0050331652\000", !0, !1, !13} ; [ DW_TAG_arg_variable ]
+!13 = !{!"0x24\00unsigned char\000\008\008\000\000\008", null, !2} ; [ DW_TAG_base_type ]
+!14 = !{!"0x101\00ptr\0016777227\000", !6, !1, !9} ; [ DW_TAG_arg_variable ]
+!15 = !{!"0x101\00val\0033554443\000", !6, !1, !11} ; [ DW_TAG_arg_variable ]
+!16 = !{!"0x101\00c\0050331659\000", !6, !1, !13} ; [ DW_TAG_arg_variable ]
+!17 = !{!"0x101\00argc\0016777233\000", !7, !1, !5} ; [ DW_TAG_arg_variable ]
+!18 = !{!"0x101\00argv\0033554449\000", !7, !1, !19} ; [ DW_TAG_arg_variable ]
+!19 = !{!"0xf\00\000\0032\0032\000\000", null, !2, !20} ; [ DW_TAG_pointer_type ]
+!20 = !{!"0xf\00\000\0032\0032\000\000", null, !2, !21} ; [ DW_TAG_pointer_type ]
+!21 = !{!"0x24\00char\000\008\008\000\000\006", null, !2} ; [ DW_TAG_base_type ]
+!22 = !{!"0x100\00dval\0019\000", !23, !1, !11} ; [ DW_TAG_auto_variable ]
+!23 = !{!"0xb\0018\001\002", !51, !7} ; [ DW_TAG_lexical_block ]
+!24 = !MDLocation(line: 4, column: 22, scope: !0)
+!25 = !MDLocation(line: 4, column: 33, scope: !0)
+!26 = !MDLocation(line: 4, column: 52, scope: !0)
+!27 = !MDLocation(line: 6, column: 3, scope: !28)
+!28 = !{!"0xb\005\001\000", !51, !0} ; [ DW_TAG_lexical_block ]
+!29 = !MDLocation(line: 7, column: 3, scope: !28)
+!30 = !MDLocation(line: 11, column: 42, scope: !6)
+!31 = !MDLocation(line: 11, column: 53, scope: !6)
+!32 = !MDLocation(line: 11, column: 72, scope: !6)
+!33 = !MDLocation(line: 13, column: 3, scope: !34)
+!34 = !{!"0xb\0012\001\001", !51, !6} ; [ DW_TAG_lexical_block ]
+!35 = !MDLocation(line: 14, column: 3, scope: !34)
+!36 = !MDLocation(line: 17, column: 15, scope: !7)
+!37 = !MDLocation(line: 17, column: 28, scope: !7)
+!38 = !MDLocation(line: 19, column: 31, scope: !23)
+!39 = !MDLocation(line: 20, column: 3, scope: !23)
+!40 = !MDLocation(line: 21, column: 3, scope: !23)
+!41 = !MDLocation(line: 4, column: 22, scope: !0, inlinedAt: !40)
+!42 = !MDLocation(line: 4, column: 33, scope: !0, inlinedAt: !40)
+!43 = !MDLocation(line: 4, column: 52, scope: !0, inlinedAt: !40)
+!44 = !MDLocation(line: 6, column: 3, scope: !28, inlinedAt: !40)
+!45 = !MDLocation(line: 22, column: 3, scope: !23)
+!46 = !MDLocation(line: 23, column: 1, scope: !23)
+!47 = !{!0, !6, !7}
+!48 = !{!8, !10, !12}
+!49 = !{!14, !15, !16}
+!50 = !{!17, !18, !22}
+!51 = !{!"a.c", !"/private/tmp"}
+!52 = !{i32 0}
+!53 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll
index 4374b9e..977a6f2 100644
--- a/test/CodeGen/ARM/debug-info-sreg2.ll
+++ b/test/CodeGen/ARM/debug-info-sreg2.ll
@@ -15,7 +15,7 @@ target triple = "thumbv7-apple-macosx10.6.7"
 define void @_Z3foov() optsize ssp {
 entry:
   %call = tail call float @_Z3barv() optsize, !dbg !11
-  tail call void @llvm.dbg.value(metadata !{float %call}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !11
+  tail call void @llvm.dbg.value(metadata float %call, i64 0, metadata !5, metadata !{!"0x102"}), !dbg !11
   %call16 = tail call float @_Z2f2v() optsize, !dbg !12
   %cmp7 = fcmp olt float %call, %call16, !dbg !12
   br i1 %cmp7, label %for.body, label %for.end, !dbg !12
@@ -43,24 +43,24 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.0 (trunk 130845)\001\00\000\00\001", metadata !18, metadata !19, metadata !19, metadata !16, null,  null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\005\000\001\000\006\00256\001\005", metadata !18, metadata !2, metadata !3, null, void ()* @_Z3foov, null, null, metadata !17} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
-!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{metadata !"0x100\00k\006\000", metadata !6, metadata !2, metadata !7} ; [ DW_TAG_auto_variable ]
-!6 = metadata !{metadata !"0xb\005\0012\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
-!7 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !0} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"0x100\00y\008\000", metadata !9, metadata !2, metadata !7} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{metadata !"0xb\007\0025\002", metadata !18, metadata !10} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{metadata !"0xb\007\003\001", metadata !18, metadata !6} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 6, i32 18, metadata !6, null}
-!12 = metadata !{i32 7, i32 3, metadata !6, null}
-!13 = metadata !{i32 8, i32 20, metadata !9, null}
-!14 = metadata !{i32 7, i32 20, metadata !10, null}
-!15 = metadata !{i32 10, i32 1, metadata !6, null}
-!16 = metadata !{metadata !1}
-!17 = metadata !{metadata !5, metadata !8}
-!18 = metadata !{metadata !"k.cc", metadata !"/private/tmp"}
-!19 = metadata !{i32 0}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.0 (trunk 130845)\001\00\000\00\001", !18, !19, !19, !16, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00foo\00foo\00_Z3foov\005\000\001\000\006\00256\001\005", !18, !2, !3, null, void ()* @_Z3foov, null, null, !17} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!2 = !{!"0x29", !18} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !18, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!5 = !{!"0x100\00k\006\000", !6, !2, !7} ; [ DW_TAG_auto_variable ]
+!6 = !{!"0xb\005\0012\000", !18, !1} ; [ DW_TAG_lexical_block ]
+!7 = !{!"0x24\00float\000\0032\0032\000\000\004", null, !0} ; [ DW_TAG_base_type ]
+!8 = !{!"0x100\00y\008\000", !9, !2, !7} ; [ DW_TAG_auto_variable ]
+!9 = !{!"0xb\007\0025\002", !18, !10} ; [ DW_TAG_lexical_block ]
+!10 = !{!"0xb\007\003\001", !18, !6} ; [ DW_TAG_lexical_block ]
+!11 = !MDLocation(line: 6, column: 18, scope: !6)
+!12 = !MDLocation(line: 7, column: 3, scope: !6)
+!13 = !MDLocation(line: 8, column: 20, scope: !9)
+!14 = !MDLocation(line: 7, column: 20, scope: !10)
+!15 = !MDLocation(line: 10, column: 1, scope: !6)
+!16 = !{!1}
+!17 = !{!5, !8}
+!18 = !{!"k.cc", !"/private/tmp"}
+!19 = !{i32 0}
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-segmented-stacks.ll b/test/CodeGen/ARM/debug-segmented-stacks.ll
index 2123fa7..7ea5665 100644
--- a/test/CodeGen/ARM/debug-segmented-stacks.ll
+++ b/test/CodeGen/ARM/debug-segmented-stacks.ll
@@ -39,40 +39,40 @@ define void @test_basic() #0 {
 ; ARM-linux       .cfi_same_value r5
 }
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"var.c", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00test_basic\00test_basic\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, void ()* @test_basic, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.5 "}
-!12 = metadata !{metadata !"0x101\00count\0016777221\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [count] [line 5]
-!13 = metadata !{i32 5, i32 0, metadata !4, null}
-!14 = metadata !{metadata !"0x100\00vl\006\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [vl] [line 6]
-!15 = metadata !{metadata !"0x16\00va_list\0030\000\000\000\000", metadata !16, null, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
-!16 = metadata !{metadata !"/linux-x86_64-high/gcc_4.7.2/dbg/llvm/bin/../lib/clang/3.5/include/stdarg.h", metadata !"/tmp"}
-!17 = metadata !{metadata !"0x16\00__builtin_va_list\006\000\000\000\000", metadata !1, null, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
-!18 = metadata !{metadata !"0x13\00__va_list\006\0032\0032\000\000\000", metadata !1, null, null, metadata !19, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
-!19 = metadata !{metadata !20}
-!20 = metadata !{metadata !"0xd\00__ap\006\0032\0032\000\000", metadata !1, metadata !18, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
-!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
-!22 = metadata !{i32 6, i32 0, metadata !4, null}
-!23 = metadata !{i32 7, i32 0, metadata !4, null}
-!24 = metadata !{metadata !"0x100\00test_basic\008\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [sum] [line 8]
-!25 = metadata !{i32 8, i32 0, metadata !4, null}
-!26 = metadata !{metadata !"0x100\00i\009\000", metadata !27, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 9]
-!27 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
-!28 = metadata !{i32 9, i32 0, metadata !27, null}
-!29 = metadata !{i32 10, i32 0, metadata !30, null}
-!30 = metadata !{metadata !"0xb\009\000\001", metadata !1, metadata !27} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
-!31 = metadata !{i32 11, i32 0, metadata !30, null}
-!32 = metadata !{i32 12, i32 0, metadata !4, null}
-!33 = metadata !{i32 13, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
+!1 = !{!"var.c", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00test_basic\00test_basic\00\005\000\001\000\006\00256\000\005", !1, !5, !6, null, void ()* @test_basic, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.5 "}
+!12 = !{!"0x101\00count\0016777221\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [count] [line 5]
+!13 = !MDLocation(line: 5, scope: !4)
+!14 = !{!"0x100\00vl\006\000", !4, !5, !15} ; [ DW_TAG_auto_variable ] [vl] [line 6]
+!15 = !{!"0x16\00va_list\0030\000\000\000\000", !16, null, !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
+!16 = !{!"/linux-x86_64-high/gcc_4.7.2/dbg/llvm/bin/../lib/clang/3.5/include/stdarg.h", !"/tmp"}
+!17 = !{!"0x16\00__builtin_va_list\006\000\000\000\000", !1, null, !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
+!18 = !{!"0x13\00__va_list\006\0032\0032\000\000\000", !1, null, null, !19, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
+!19 = !{!20}
+!20 = !{!"0xd\00__ap\006\0032\0032\000\000", !1, !18, !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
+!21 = !{!"0xf\00\000\0032\0032\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
+!22 = !MDLocation(line: 6, scope: !4)
+!23 = !MDLocation(line: 7, scope: !4)
+!24 = !{!"0x100\00test_basic\008\000", !4, !5, !8} ; [ DW_TAG_auto_variable ] [sum] [line 8]
+!25 = !MDLocation(line: 8, scope: !4)
+!26 = !{!"0x100\00i\009\000", !27, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 9]
+!27 = !{!"0xb\009\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!28 = !MDLocation(line: 9, scope: !27)
+!29 = !MDLocation(line: 10, scope: !30)
+!30 = !{!"0xb\009\000\001", !1, !27} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!31 = !MDLocation(line: 11, scope: !30)
+!32 = !MDLocation(line: 12, scope: !4)
+!33 = !MDLocation(line: 13, scope: !4)
 
 ; Just to prevent the alloca from being optimized away
 declare void @dummy_use(i32*, i32)
diff --git a/test/CodeGen/ARM/dyn-stackalloc.ll b/test/CodeGen/ARM/dyn-stackalloc.ll
index 4ac5b8a..05c143d 100644
--- a/test/CodeGen/ARM/dyn-stackalloc.ll
+++ b/test/CodeGen/ARM/dyn-stackalloc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm-eabi %s -o /dev/null
+; RUN: llc -mcpu=generic -mtriple=arm-eabi -verify-machineinstrs < %s | FileCheck %s
 
 %struct.comment = type { i8**, i32*, i32, i8* }
 %struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* }
@@ -7,6 +7,18 @@
 @str215 = external global [2 x i8]
 
 define void @t1(%struct.state* %v) {
+
+; Make sure we generate:
+;   sub	sp, sp, r1
+; instead of:
+;   sub	r1, sp, r1
+;   mov	sp, r1
+
+; CHECK-LABEL: @t1
+; CHECK: bic [[REG1:r[0-9]+]],
+; CHECK-NOT: sub r{{[0-9]+}}, sp, [[REG1]]
+; CHECK: sub sp, sp, [[REG1]]
+
   %tmp6 = load i32* null
   %tmp8 = alloca float, i32 %tmp6
   store i32 1, i32* null
diff --git a/test/CodeGen/ARM/emit-big-cst.ll b/test/CodeGen/ARM/emit-big-cst.ll
index 9a3367d..01d789c 100644
--- a/test/CodeGen/ARM/emit-big-cst.ll
+++ b/test/CodeGen/ARM/emit-big-cst.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumbv7-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7-unknown-unknown -target-abi apcs < %s | FileCheck %s
 ; Check assembly printing of odd constants.
 
 ; CHECK: bigCst:
diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll
index 514d4a9..c5ff82e 100644
--- a/test/CodeGen/ARM/fold-stack-adjust.ll
+++ b/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -71,7 +71,7 @@ define void @check_vfp_fold() minsize {
 ; CHECK-IOS-LABEL: check_vfp_fold:
 ; CHECK-IOS: push {r0, r1, r2, r3, r4, r7, lr}
 ; CHECK-IOS: sub.w r4, sp, #16
-; CHECK-IOS: bic r4, r4, #15
+; CHECK-IOS: bfc r4, #0, #4
 ; CHECK-IOS: mov sp, r4
 ; CHECK-IOS: vst1.64 {d8, d9}, [r4:128]
 ; ...
diff --git a/test/CodeGen/ARM/frame-register.ll b/test/CodeGen/ARM/frame-register.ll
index e6a55bd..b04e376 100644
--- a/test/CodeGen/ARM/frame-register.ll
+++ b/test/CodeGen/ARM/frame-register.ll
@@ -30,9 +30,9 @@ entry:
 ; CHECK-ARM: push {r11, lr}
 ; CHECK-ARM: mov r11, sp
 
-; CHECK-THUMB: push {r4, r6, r7, lr}
-; CHECK-THUMB: add r7, sp, #8
+; CHECK-THUMB: push {r7, lr}
+; CHECK-THUMB: add r7, sp, #0
 
 ; CHECK-DARWIN-ARM: push {r7, lr}
-; CHECK-DARWIN-THUMB: push {r4, r7, lr}
+; CHECK-DARWIN-THUMB: push {r7, lr}
 
diff --git a/test/CodeGen/ARM/ghc-tcreturn-lowered.ll b/test/CodeGen/ARM/ghc-tcreturn-lowered.ll
new file mode 100644
index 0000000..623b422
--- /dev/null
+++ b/test/CodeGen/ARM/ghc-tcreturn-lowered.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=thumbv7-eabi -o - %s | FileCheck %s
+
+declare ghccc void @g()
+
+define ghccc void @test_direct_tail() {
+; CHECK-LABEL: test_direct_tail:
+; CHECK: b g
+
+  tail call ghccc void @g()
+  ret void
+}
+
+@ind_func = global void()* zeroinitializer
+
+define ghccc void @test_indirect_tail() {
+; CHECK-LABEL: test_indirect_tail:
+; CHECK: bx {{r[0-9]+}}
+  %func = load void()** @ind_func
+  tail call ghccc void()* %func()
+  ret void
+}
diff --git a/test/CodeGen/ARM/global-merge-1.ll b/test/CodeGen/ARM/global-merge-1.ll
index 341597e..e5d4def 100644
--- a/test/CodeGen/ARM/global-merge-1.ll
+++ b/test/CodeGen/ARM/global-merge-1.ll
@@ -78,8 +78,8 @@ attributes #3 = { nounwind }
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"LLVM version 3.4 "}
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"int", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!0 = !{!"LLVM version 3.4 "}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/ARM/globals.ll b/test/CodeGen/ARM/globals.ll
index 3101500..2c599bf 100644
--- a/test/CodeGen/ARM/globals.ll
+++ b/test/CodeGen/ARM/globals.ll
@@ -43,6 +43,7 @@ define i32 @test1() {
 ; DarwinPIC: LPC0_0:
 ; DarwinPIC:    ldr r0, [pc, r0]
 ; DarwinPIC:    ldr r0, [r0]
+; DarwinPIC-NOT: ldr
 ; DarwinPIC:    bx lr
 
 ; DarwinPIC: 	.align	2
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
index 5d8e477..f76fd30 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
@@ -59,5 +59,5 @@ declare %classL* @_ZN1M1JI1LS1_EcvPS1_Ev(%classM2*)
 declare void @_ZN1F10handleMoveEb(%classF*, i1 zeroext)
 declare void @_Z3fn1v()
 
-!0 = metadata !{metadata !"clang version 3.5"}
-!1 = metadata !{metadata !"branch_weights", i32 62, i32 62}
+!0 = !{!"clang version 3.5"}
+!1 = !{!"branch_weights", i32 62, i32 62}
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll
index a994d3d..2d12a89 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll
@@ -38,5 +38,5 @@ return:
   ret i8 1
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 4, i32 12}
-!1 = metadata !{metadata !"branch_weights", i32 8, i32 16}
+!0 = !{!"branch_weights", i32 4, i32 12}
+!1 = !{!"branch_weights", i32 8, i32 16}
diff --git a/test/CodeGen/ARM/inline-diagnostics.ll b/test/CodeGen/ARM/inline-diagnostics.ll
index 7b77da2..0276abf 100644
--- a/test/CodeGen/ARM/inline-diagnostics.ll
+++ b/test/CodeGen/ARM/inline-diagnostics.ll
@@ -13,4 +13,4 @@ define float @inline_func(float %f1, float %f2) #0 {
   ret float %1
 }
 
-!1 = metadata !{i32 271, i32 305}
+!1 = !{i32 271, i32 305}
diff --git a/test/CodeGen/ARM/interrupt-attr.ll b/test/CodeGen/ARM/interrupt-attr.ll
index 96d1ee2..c6da09d 100644
--- a/test/CodeGen/ARM/interrupt-attr.ll
+++ b/test/CodeGen/ARM/interrupt-attr.ll
@@ -15,7 +15,7 @@ define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
 ; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
 ; CHECK-A: add r11, sp, #20
 ; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
 ; CHECK-A: bl bar
 ; CHECK-A: sub sp, r11, #20
 ; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
@@ -25,7 +25,7 @@ define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
 ; CHECK-A-THUMB: push.w {r0, r1, r2, r3, r4, r7, r12, lr}
 ; CHECK-A-THUMB: add r7, sp, #20
 ; CHECK-A-THUMB: mov r4, sp
-; CHECK-A-THUMB: bic r4, r4, #7
+; CHECK-A-THUMB: bfc r4, #0, #3
 ; CHECK-A-THUMB: bl bar
 ; CHECK-A-THUMB: sub.w r4, r7,  #20
 ; CHECK-A-THUMB: mov sp, r4
@@ -38,7 +38,7 @@ define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
 ; CHECK-M: push.w {r4, r10, r11, lr}
 ; CHECK-M: add.w r11, sp, #8
 ; CHECK-M: mov r4, sp
-; CHECK-M: bic r4, r4, #7
+; CHECK-M: bfc r4, #0, #3
 ; CHECK-M: mov sp, r4
 ; CHECK-M: bl _bar
 ; CHECK-M: sub.w r4, r11, #8
@@ -56,7 +56,7 @@ define arm_aapcscc void @fiq_fn() alignstack(8) "interrupt"="FIQ" {
   ; 32 to get past r0, r1, ..., r7
 ; CHECK-A: add r11, sp, #32
 ; CHECK-A: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
 ; [...]
   ; 32 must match above
 ; CHECK-A: sub sp, r11, #32
@@ -75,7 +75,7 @@ define arm_aapcscc void @swi_fn() alignstack(8) "interrupt"="SWI" {
 ; CHECK-A: push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
 ; CHECK-A: add r11, sp, #44
 ; CHECK-A: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
 ; [...]
 ; CHECK-A: sub sp, r11, #44
 ; CHECK-A: pop {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
@@ -91,7 +91,7 @@ define arm_aapcscc void @undef_fn() alignstack(8) "interrupt"="UNDEF" {
 ; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
 ; CHECK-A: add r11, sp, #20
 ; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
 ; [...]
 ; CHECK-A: sub sp, r11, #20
 ; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
@@ -106,7 +106,7 @@ define arm_aapcscc void @abort_fn() alignstack(8) "interrupt"="ABORT" {
 ; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
 ; CHECK-A: add r11, sp, #20
 ; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
 ; [...]
 ; CHECK-A: sub sp, r11, #20
 ; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
diff --git a/test/CodeGen/ARM/isel-v8i32-crash.ll b/test/CodeGen/ARM/isel-v8i32-crash.ll
new file mode 100644
index 0000000..0116fe8
--- /dev/null
+++ b/test/CodeGen/ARM/isel-v8i32-crash.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=armv7-linux-gnu | FileCheck %s
+
+; Check we don't crash when trying to combine:
+;   (d1 = <float 8.000000e+00, float 8.000000e+00, ...>) (power of 2)
+;   vmul.f32        d0, d1, d0
+;   vcvt.s32.f32    d0, d0
+; into:
+;   vcvt.s32.f32    d0, d0, #3
+; when we have a vector length of 8, due to use of v8i32 types.
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK: func:
+; CHECK: vcvt.s32.f32  q[[R:[0-9]]], q[[R]], #3
+define void @func(i16* nocapture %pb, float* nocapture readonly %pf) #0 {
+entry:
+  %0 = bitcast float* %pf to <8 x float>*
+  %1 = load <8 x float>* %0, align 4
+  %2 = fmul <8 x float> %1, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+  %3 = fptosi <8 x float> %2 to <8 x i16>
+  %4 = bitcast i16* %pb to <8 x i16>*
+  store <8 x i16> %3, <8 x i16>* %4, align 2
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/ARM/krait-cpu-div-attribute.ll b/test/CodeGen/ARM/krait-cpu-div-attribute.ll
new file mode 100644
index 0000000..b7a1dcc
--- /dev/null
+++ b/test/CodeGen/ARM/krait-cpu-div-attribute.ll
@@ -0,0 +1,36 @@
+; Tests the genration of ".arch_extension" attribute for hardware
+; division on krait CPU. For now, krait is recognized as "cortex-a9" + hwdiv
+; Also, tests for the hwdiv instruction on krait CPU
+
+; check for arch_extension/cpu directive
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=krait | FileCheck %s --check-prefix=DIV_EXTENSION
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -mcpu=krait | FileCheck %s --check-prefix=DIV_EXTENSION
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 | FileCheck %s --check-prefix=NODIV_KRAIT
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -mcpu=cortex-a9 | FileCheck %s --check-prefix=NODIV_KRAIT
+; RUN: llc < %s -mcpu=krait -mattr=-hwdiv,-hwdiv-arm | FileCheck %s --check-prefix=NODIV_KRAIT
+
+; check if correct instruction is emitted by integrated assembler
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=krait -filetype=obj | llvm-objdump -mcpu=krait -triple armv7-linux-gnueabi -d - | FileCheck %s --check-prefix=HWDIV
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -mcpu=krait -filetype=obj | llvm-objdump -mcpu=krait -triple thumbv7-linux-gnueabi -d - | FileCheck %s --check-prefix=HWDIV
+
+; arch_extension attribute
+; DIV_EXTENSION:  .cpu cortex-a9
+; DIV_EXTENSION:  .arch_extension idiv
+; NODIV_KRAIT-NOT:  .arch_extension idiv
+; HWDIV: sdiv
+
+define i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  store i32 0, i32* %retval
+  store volatile i32 100, i32* %b, align 4
+  store volatile i32 32, i32* %c, align 4
+  %0 = load volatile i32* %b, align 4
+  %1 = load volatile i32* %c, align 4
+  %div = sdiv i32 %0, %1
+  store volatile i32 %div, i32* %a, align 4
+  ret i32 0
+}
diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll
index fed6ec0..3f30fd4 100644
--- a/test/CodeGen/ARM/longMAC.ll
+++ b/test/CodeGen/ARM/longMAC.ll
@@ -75,3 +75,44 @@ define i64 @MACLongTest5(i64 %c, i32 %a, i32 %b) {
   %add = add i64 %mul, %c
   ret i64 %add
 }
+
+define i64 @MACLongTest6(i32 %a, i32 %b, i32 %c, i32 %d) {
+;CHECK-LABEL: MACLongTest6:
+;CHECK: smull   r12, lr, r1, r0
+;CHECK: smlal   r12, lr, r3, r2
+  %conv = sext i32 %a to i64
+  %conv1 = sext i32 %b to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %conv2 = sext i32 %c to i64
+  %conv3 = sext i32 %d to i64
+  %mul4 = mul nsw i64 %conv3, %conv2
+  %add = add nsw i64 %mul4, %mul
+  ret i64 %add
+}
+
+define i64 @MACLongTest7(i64 %acc, i32 %lhs, i32 %rhs) {
+;CHECK-LABEL: MACLongTest7:
+;CHECK-NOT: smlal
+  %conv = sext i32 %lhs to i64
+  %conv1 = sext i32 %rhs to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %shl = shl i64 %mul, 32
+  %shr = lshr i64 %mul, 32
+  %or = or i64 %shl, %shr
+  %add = add i64 %or, %acc
+  ret i64 %add
+}
+
+define i64 @MACLongTest8(i64 %acc, i32 %lhs, i32 %rhs) {
+;CHECK-LABEL: MACLongTest8:
+;CHECK-NOT: smlal
+  %conv = zext i32 %lhs to i64
+  %conv1 = zext i32 %rhs to i64
+  %mul = mul nuw i64 %conv1, %conv
+  %and = and i64 %mul, 4294967295
+  %shl = shl i64 %mul, 32
+  %or = or i64 %and, %shl
+  %add = add i64 %or, %acc
+  ret i64 %add
+}
+
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll
index 84ce4a7..33ac4e1 100644
--- a/test/CodeGen/ARM/memcpy-inline.ll
+++ b/test/CodeGen/ARM/memcpy-inline.ll
@@ -46,10 +46,8 @@ entry:
 ; CHECK: movw [[REG2:r[0-9]+]], #16716
 ; CHECK: movt [[REG2:r[0-9]+]], #72
 ; CHECK: str [[REG2]], [r0, #32]
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
-; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
-; CHECK: adds r0, #16
-; CHECK: adds r1, #16
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
 ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false)
@@ -59,10 +57,8 @@ entry:
 define void @t3(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t3:
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
-; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
-; CHECK: adds r0, #16
-; CHECK: adds r1, #16
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
 ; CHECK: vld1.8 {d{{[0-9]+}}}, [r1]
 ; CHECK: vst1.8 {d{{[0-9]+}}}, [r0]
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false)
@@ -73,7 +69,8 @@ define void @t4(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t4:
 ; CHECK: vld1.8 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1]
-; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]
+; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]!
+; CHECK: strh [[REG5:r[0-9]+]], [r0]
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false)
   ret void
 }
diff --git a/test/CodeGen/ARM/metadata-default.ll b/test/CodeGen/ARM/metadata-default.ll
index f6a3fe2..f8e40b4 100644
--- a/test/CodeGen/ARM/metadata-default.ll
+++ b/test/CodeGen/ARM/metadata-default.ll
@@ -9,8 +9,8 @@ define i32 @f(i64 %z) {
 
 !llvm.module.flags = !{!0, !1}
 
-!0 = metadata !{i32 1, metadata !"wchar_size", i32 4}
-!1 = metadata !{i32 1, metadata !"min_enum_size", i32 4}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
 
 ; CHECK: .eabi_attribute 18, 4   @ Tag_ABI_PCS_wchar_t
 ; CHECK: .eabi_attribute 26, 2   @ Tag_ABI_enum_size
diff --git a/test/CodeGen/ARM/metadata-short-enums.ll b/test/CodeGen/ARM/metadata-short-enums.ll
index bccd332..2f1586d 100644
--- a/test/CodeGen/ARM/metadata-short-enums.ll
+++ b/test/CodeGen/ARM/metadata-short-enums.ll
@@ -9,8 +9,8 @@ define i32 @f(i64 %z) {
 
 !llvm.module.flags = !{!0, !1}
 
-!0 = metadata !{i32 1, metadata !"wchar_size", i32 4}
-!1 = metadata !{i32 1, metadata !"min_enum_size", i32 1}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 1}
 
 ; CHECK: .eabi_attribute 18, 4   @ Tag_ABI_PCS_wchar_t
 ; CHECK: .eabi_attribute 26, 1   @ Tag_ABI_enum_size
diff --git a/test/CodeGen/ARM/metadata-short-wchar.ll b/test/CodeGen/ARM/metadata-short-wchar.ll
index 6de9bf1..b7f5833 100644
--- a/test/CodeGen/ARM/metadata-short-wchar.ll
+++ b/test/CodeGen/ARM/metadata-short-wchar.ll
@@ -9,8 +9,8 @@ define i32 @f(i64 %z) {
 
 !llvm.module.flags = !{!0, !1}
 
-!0 = metadata !{i32 1, metadata !"wchar_size", i32 2}
-!1 = metadata !{i32 1, metadata !"min_enum_size", i32 4}
+!0 = !{i32 1, !"wchar_size", i32 2}
+!1 = !{i32 1, !"min_enum_size", i32 4}
 
 ; CHECK: .eabi_attribute 18, 2   @ Tag_ABI_PCS_wchar_t
 ; CHECK: .eabi_attribute 26, 2   @ Tag_ABI_enum_size
diff --git a/test/CodeGen/ARM/named-reg-alloc.ll b/test/CodeGen/ARM/named-reg-alloc.ll
index 3c27d22..380cf39 100644
--- a/test/CodeGen/ARM/named-reg-alloc.ll
+++ b/test/CodeGen/ARM/named-reg-alloc.ll
@@ -11,4 +11,4 @@ entry:
 
 declare i32 @llvm.read_register.i32(metadata) nounwind
 
-!0 = metadata !{metadata !"r5\00"}
+!0 = !{!"r5\00"}
diff --git a/test/CodeGen/ARM/named-reg-notareg.ll b/test/CodeGen/ARM/named-reg-notareg.ll
index af38b60..3ac03f4 100644
--- a/test/CodeGen/ARM/named-reg-notareg.ll
+++ b/test/CodeGen/ARM/named-reg-notareg.ll
@@ -10,4 +10,4 @@ entry:
 
 declare i32 @llvm.read_register.i32(metadata) nounwind
 
-!0 = metadata !{metadata !"notareg\00"}
+!0 = !{!"notareg\00"}
diff --git a/test/CodeGen/ARM/none-macho-v4t.ll b/test/CodeGen/ARM/none-macho-v4t.ll
index 4c6e68e..b6018de 100644
--- a/test/CodeGen/ARM/none-macho-v4t.ll
+++ b/test/CodeGen/ARM/none-macho-v4t.ll
@@ -11,11 +11,15 @@ define void @test_call() {
 ; CHECK: [[PC_LABEL:LPC[0-9]+_[0-9]+]]:
 ; CHECK-NEXT: add r[[CALLEE_STUB]], pc
 ; CHECK: ldr [[CALLEE:r[0-9]+]], [r[[CALLEE_STUB]]]
-; CHECK: mov lr, pc
-; CHECK: bx [[CALLEE]]
+; CHECK-NOT: mov lr, pc
+; CHECK: bl [[INDIRECT_PAD:Ltmp[0-9]+]]
 
 ; CHECK: [[LITPOOL]]:
 ; CHECK-NEXT: .long L_callee$non_lazy_ptr-([[PC_LABEL]]+4)
+
+; CHECK: [[INDIRECT_PAD]]:
+; CHECK: bx [[CALLEE]]
+
   call void @callee()
   ret void
 }
diff --git a/test/CodeGen/ARM/null-streamer.ll b/test/CodeGen/ARM/null-streamer.ll
index 350c45e..19ad22a 100644
--- a/test/CodeGen/ARM/null-streamer.ll
+++ b/test/CodeGen/ARM/null-streamer.ll
@@ -5,3 +5,5 @@ define i32 @main()  {
 entry:
   ret i32 0
 }
+
+module asm ".fnstart"
diff --git a/test/CodeGen/ARM/odr_comdat.ll b/test/CodeGen/ARM/odr_comdat.ll
deleted file mode 100644
index e28b578..0000000
--- a/test/CodeGen/ARM/odr_comdat.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -mtriple=arm-linux-gnueabi | FileCheck %s -check-prefix=ARMGNUEABI
-
-; Checking that a comdat group gets generated correctly for a static member 
-; of instantiated C++ templates.
-; see http://sourcery.mentor.com/public/cxx-abi/abi.html#vague-itemplate
-; section 5.2.6 Instantiated templates
-; "Any static member data object is emitted in a COMDAT identified by its mangled 
-;  name, in any object file with a reference to its name symbol."
-
-; Case 1: variable is not explicitly initialized, and ends up in a .bss section
-; ARMGNUEABI: .section        .bss._ZN1CIiE1iE,"aGw",%nobits,_ZN1CIiE1iE,comdat
-@_ZN1CIiE1iE = weak_odr global i32 0, align 4
-
-; Case 2: variable is explicitly initialized, and ends up in a .data section
-; ARMGNUEABI: .section        .data._ZN1CIiE1jE,"aGw",%progbits,_ZN1CIiE1jE,comdat
-@_ZN1CIiE1jE = weak_odr global i32 12, align 4
diff --git a/test/CodeGen/ARM/out-of-registers.ll b/test/CodeGen/ARM/out-of-registers.ll
index 790e416..a83923d 100644
--- a/test/CodeGen/ARM/out-of-registers.ll
+++ b/test/CodeGen/ARM/out-of-registers.ll
@@ -38,5 +38,5 @@ attributes #2 = { nounwind readonly }
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"Snapdragon LLVM ARM Compiler 3.4"}
-!1 = metadata !{metadata !1}
+!0 = !{!"Snapdragon LLVM ARM Compiler 3.4"}
+!1 = !{!1}
diff --git a/test/CodeGen/ARM/section-name.ll b/test/CodeGen/ARM/section-name.ll
index a0aad47..a4c6054 100644
--- a/test/CodeGen/ARM/section-name.ll
+++ b/test/CodeGen/ARM/section-name.ll
@@ -16,7 +16,7 @@ entry:
   ret void
 }
 
-; CHECK: .section .text.test3,"axG",%progbits,test3,comdat
+; CHECK: .text
 ; CHECK: .weak test3
 ; CHECK: .type test3,%function
 define linkonce_odr void @test3() {
diff --git a/test/CodeGen/ARM/setcc-type-mismatch.ll b/test/CodeGen/ARM/setcc-type-mismatch.ll
new file mode 100644
index 0000000..2cfdba1
--- /dev/null
+++ b/test/CodeGen/ARM/setcc-type-mismatch.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s
+
+define void @test_mismatched_setcc(<4 x i22> %l, <4 x i22> %r, <4 x i1>* %addr) {
+; CHECK-LABEL: test_mismatched_setcc:
+; CHECK: vceq.i32 [[CMP128:q[0-9]+]], {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK: vmovn.i32 {{d[0-9]+}}, [[CMP128]]
+
+  %tst = icmp eq <4 x i22> %l, %r
+  store <4 x i1> %tst, <4 x i1>* %addr
+  ret void
+}
diff --git a/test/CodeGen/ARM/sjlj-prepare-critical-edge.ll b/test/CodeGen/ARM/sjlj-prepare-critical-edge.ll
index d8241d0..a7bc22f 100644
--- a/test/CodeGen/ARM/sjlj-prepare-critical-edge.ll
+++ b/test/CodeGen/ARM/sjlj-prepare-critical-edge.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O1 -mtriple thumbv7-apple-ios6
+; RUN: llc < %s -O1 -mtriple thumbv7-apple-ios6 | FileCheck %s
 ; Just make sure no one tries to make the assumption that the normal edge of an
 ; invoke is never a critical edge.  Previously, this code would assert.
 
@@ -65,3 +65,129 @@ declare i32 @__gxx_personality_sj0(...)
 declare void @release(i8*)
 
 declare void @terminatev()
+
+; Make sure that the instruction DemoteRegToStack inserts to reload
+; %call.i.i.i14.i.i follows the instruction that saves the value to the stack in
+; basic block %entry.do.body.i.i.i_crit_edge.
+; Previously, DemoteRegToStack would insert a load instruction into the entry
+; block to reload %call.i.i.i14.i.i before the phi instruction (%0) in block
+; %do.body.i.i.i.
+
+; CHECK-LABEL: __Z4foo1c:
+; CHECK: blx __Znwm
+; CHECK: {{.*}}@ %entry.do.body.i.i.i_crit_edge
+; CHECK: str r0, [sp, [[OFFSET:#[0-9]+]]]
+; CHECK: ldr [[R0:r[0-9]+]], [sp, [[OFFSET]]]
+; CHECK: {{.*}}@ %do.body.i.i.i
+; CHECK: cmp [[R0]], #0
+
+%"class.std::__1::basic_string" = type { %"class.std::__1::__compressed_pair" }
+%"class.std::__1::__compressed_pair" = type { %"class.std::__1::__libcpp_compressed_pair_imp" }
+%"class.std::__1::__libcpp_compressed_pair_imp" = type { %"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__rep" }
+%"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__rep" = type { %union.anon }
+%union.anon = type { %"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__long" }
+%"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__long" = type { i32, i32, i8* }
+
+@.str = private unnamed_addr constant [12 x i8] c"some_string\00", align 1
+
+define void @_Z4foo1c(i8 signext %a) {
+entry:
+  %s1 = alloca %"class.std::__1::basic_string", align 4
+  call void @_ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE6__initEPKcm(%"class.std::__1::basic_string"* %s1, i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 11)
+  %call.i.i.i14.i.i = invoke noalias i8* @_Znwm(i32 1024)
+          to label %do.body.i.i.i unwind label %lpad.body
+
+do.body.i.i.i:                                    ; preds = %entry, %_ZNSt3__116allocator_traitsINS_9allocatorIcEEE9constructIccEEvRS2_PT_RKT0_.exit.i.i.i
+  %lsr.iv = phi i32 [ %lsr.iv.next, %_ZNSt3__116allocator_traitsINS_9allocatorIcEEE9constructIccEEvRS2_PT_RKT0_.exit.i.i.i ], [ -1024, %entry ]
+  %0 = phi i8* [ %incdec.ptr.i.i.i, %_ZNSt3__116allocator_traitsINS_9allocatorIcEEE9constructIccEEvRS2_PT_RKT0_.exit.i.i.i ], [ %call.i.i.i14.i.i, %entry ]
+  %new.isnull.i.i.i.i = icmp eq i8* %0, null
+  br i1 %new.isnull.i.i.i.i, label %_ZNSt3__116allocator_traitsINS_9allocatorIcEEE9constructIccEEvRS2_PT_RKT0_.exit.i.i.i, label %new.notnull.i.i.i.i
+
+new.notnull.i.i.i.i:                              ; preds = %do.body.i.i.i
+  store i8 %a, i8* %0, align 1
+  br label %_ZNSt3__116allocator_traitsINS_9allocatorIcEEE9constructIccEEvRS2_PT_RKT0_.exit.i.i.i
+
+_ZNSt3__116allocator_traitsINS_9allocatorIcEEE9constructIccEEvRS2_PT_RKT0_.exit.i.i.i: ; preds = %new.notnull.i.i.i.i, %do.body.i.i.i
+  %1 = phi i8* [ null, %do.body.i.i.i ], [ %0, %new.notnull.i.i.i.i ]
+  %incdec.ptr.i.i.i = getelementptr inbounds i8* %1, i32 1
+  %lsr.iv.next = add i32 %lsr.iv, 1
+  %cmp.i16.i.i = icmp eq i32 %lsr.iv.next, 0
+  br i1 %cmp.i16.i.i, label %invoke.cont, label %do.body.i.i.i
+
+invoke.cont:                                      ; preds = %_ZNSt3__116allocator_traitsINS_9allocatorIcEEE9constructIccEEvRS2_PT_RKT0_.exit.i.i.i
+  invoke void @_Z4foo2Pci(i8* %call.i.i.i14.i.i, i32 1024)
+          to label %invoke.cont5 unwind label %lpad2
+
+invoke.cont5:                                     ; preds = %invoke.cont
+  %cmp.i.i.i15 = icmp eq i8* %call.i.i.i14.i.i, null
+  br i1 %cmp.i.i.i15, label %invoke.cont6, label %_ZNSt3__113__vector_baseIcNS_9allocatorIcEEE5clearEv.exit.i.i.i19
+
+_ZNSt3__113__vector_baseIcNS_9allocatorIcEEE5clearEv.exit.i.i.i19: ; preds = %invoke.cont5
+  call void @_ZdlPv(i8* %call.i.i.i14.i.i)
+  br label %invoke.cont6
+
+invoke.cont6:                                     ; preds = %_ZNSt3__113__vector_baseIcNS_9allocatorIcEEE5clearEv.exit.i.i.i19, %invoke.cont5
+  %call10 = call %"class.std::__1::basic_string"* @_ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEED1Ev(%"class.std::__1::basic_string"* %s1)
+  ret void
+
+lpad.body:                                        ; preds = %entry
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  %3 = extractvalue { i8*, i32 } %2, 0
+  %4 = extractvalue { i8*, i32 } %2, 1
+  br label %ehcleanup
+
+lpad2:                                            ; preds = %invoke.cont
+  %5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  %6 = extractvalue { i8*, i32 } %5, 0
+  %7 = extractvalue { i8*, i32 } %5, 1
+  %cmp.i.i.i21 = icmp eq i8* %call.i.i.i14.i.i, null
+  br i1 %cmp.i.i.i21, label %ehcleanup, label %_ZNSt3__113__vector_baseIcNS_9allocatorIcEEE5clearEv.exit.i.i.i26
+
+_ZNSt3__113__vector_baseIcNS_9allocatorIcEEE5clearEv.exit.i.i.i26: ; preds = %lpad2
+  call void @_ZdlPv(i8* %call.i.i.i14.i.i)
+  br label %ehcleanup
+
+ehcleanup:                                        ; preds = %_ZNSt3__113__vector_baseIcNS_9allocatorIcEEE5clearEv.exit.i.i.i26, %lpad2, %lpad.body
+  %exn.slot.0 = phi i8* [ %3, %lpad.body ], [ %6, %lpad2 ], [ %6, %_ZNSt3__113__vector_baseIcNS_9allocatorIcEEE5clearEv.exit.i.i.i26 ]
+  %ehselector.slot.0 = phi i32 [ %4, %lpad.body ], [ %7, %lpad2 ], [ %7, %_ZNSt3__113__vector_baseIcNS_9allocatorIcEEE5clearEv.exit.i.i.i26 ]
+  %call12 = invoke %"class.std::__1::basic_string"* @_ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEED1Ev(%"class.std::__1::basic_string"* %s1)
+          to label %eh.resume unwind label %terminate.lpad
+
+eh.resume:                                        ; preds = %ehcleanup
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0
+  %lpad.val13 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1
+  resume { i8*, i32 } %lpad.val13
+
+terminate.lpad:                                   ; preds = %ehcleanup
+  %8 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          catch i8* null
+  %9 = extractvalue { i8*, i32 } %8, 0
+  call void @__clang_call_terminate(i8* %9)
+  unreachable
+}
+
+declare void @_Z4foo2Pci(i8*, i32)
+
+define linkonce_odr hidden void @__clang_call_terminate(i8*) {
+  %2 = tail call i8* @__cxa_begin_catch(i8* %0)
+  tail call void @_ZSt9terminatev()
+  unreachable
+}
+
+declare i8* @__cxa_begin_catch(i8*)
+declare void @_ZSt9terminatev()
+declare %"class.std::__1::basic_string"* @_ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEED1Ev(%"class.std::__1::basic_string"* returned)
+declare void @_ZdlPv(i8*) #3
+declare noalias i8* @_Znwm(i32)
+declare void @_ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE6__initEPKcm(%"class.std::__1::basic_string"*, i8*, i32)
+declare void @_Unwind_SjLj_Register({ i8*, i32, [4 x i32], i8*, i8*, [5 x i8*] }*)
+declare void @_Unwind_SjLj_Unregister({ i8*, i32, [4 x i32], i8*, i8*, [5 x i8*] }*)
+declare i8* @llvm.frameaddress(i32)
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)
+declare i32 @llvm.eh.sjlj.setjmp(i8*)
+declare i8* @llvm.eh.sjlj.lsda()
+declare void @llvm.eh.sjlj.callsite(i32)
+declare void @llvm.eh.sjlj.functioncontext(i8*)
diff --git a/test/CodeGen/ARM/spill-q.ll b/test/CodeGen/ARM/spill-q.ll
index 4fa97ea..425fc12 100644
--- a/test/CodeGen/ARM/spill-q.ll
+++ b/test/CodeGen/ARM/spill-q.ll
@@ -11,7 +11,7 @@ declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
 
 define void @aaa(%quuz* %this, i8* %block) {
 ; CHECK-LABEL: aaa:
-; CHECK: bic {{.*}}, #15
+; CHECK: bfc {{.*}}, #0, #4
 ; CHECK: vst1.64 {{.*}}sp:128
 ; CHECK: vld1.64 {{.*}}sp:128
 entry:
diff --git a/test/CodeGen/ARM/stack-alignment.ll b/test/CodeGen/ARM/stack-alignment.ll
new file mode 100644
index 0000000..153f92e
--- /dev/null
+++ b/test/CodeGen/ARM/stack-alignment.ll
@@ -0,0 +1,164 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=armv4t   | FileCheck %s -check-prefix=CHECK-v4A32
+; RUN: llc -verify-machineinstrs < %s -mtriple=armv7a   | FileCheck %s -check-prefix=CHECK-v7A32
+; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7a | FileCheck %s -check-prefix=CHECK-THUMB2
+; FIXME: There are no tests for Thumb1 since dynamic stack alignment is not supported for
+; Thumb1.
+
+define i32 @f_bic_can_be_used_align() nounwind {
+entry:
+; CHECK-LABEL: f_bic_can_be_used_align:
+; CHECK-v7A32: bfc        sp, #0, #8
+; CHECK-v4A32: bic        sp, sp, #255
+; CHECK-THUMB2:	mov	r4, sp
+; CHECK-THUMB2-NEXT: bfc	r4, #0, #8
+; CHECK-THUMB2-NEXT: mov	sp, r4
+  %x = alloca i32, align 256
+  store volatile i32 0, i32* %x, align 256
+  ret i32 0
+}
+
+define i32 @f_too_large_for_bic_align() nounwind {
+entry:
+; CHECK-LABEL: f_too_large_for_bic_align:
+; CHECK-v7A32: bfc sp, #0, #9
+; CHECK-v4A32: lsr sp, sp, #9
+; CHECK-v4A32: lsl sp, sp, #9
+; CHECK-THUMB2:	mov	r4, sp
+; CHECK-THUMB2-NEXT:	bfc	r4, #0, #9
+; CHECK-THUMB2-NEXT:	mov	sp, r4
+  %x = alloca i32, align 512
+  store volatile i32 0, i32* %x, align 512
+  ret i32 0
+}
+
+define i8* @f_alignedDPRCS2Spills(double* %d) #0 {
+entry:
+; CHECK-LABEL: f_too_large_for_bic_align:
+; CHECK-v7A32: bfc sp, #0, #12
+; CHECK-v4A32: lsr sp, sp, #12
+; CHECK-v4A32: lsl sp, sp, #12
+; CHECK-THUMB2:      bfc	r4, #0, #12
+; CHECK-THUMB2-NEXT: mov	sp, r4
+  %a = alloca i8, align 4096
+  %0 = load double* %d, align 4
+  %arrayidx1 = getelementptr inbounds double* %d, i32 1
+  %1 = load double* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds double* %d, i32 2
+  %2 = load double* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds double* %d, i32 3
+  %3 = load double* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds double* %d, i32 4
+  %4 = load double* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds double* %d, i32 5
+  %5 = load double* %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds double* %d, i32 6
+  %6 = load double* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds double* %d, i32 7
+  %7 = load double* %arrayidx7, align 4
+  %arrayidx8 = getelementptr inbounds double* %d, i32 8
+  %8 = load double* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds double* %d, i32 9
+  %9 = load double* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds double* %d, i32 10
+  %10 = load double* %arrayidx10, align 4
+  %arrayidx11 = getelementptr inbounds double* %d, i32 11
+  %11 = load double* %arrayidx11, align 4
+  %arrayidx12 = getelementptr inbounds double* %d, i32 12
+  %12 = load double* %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds double* %d, i32 13
+  %13 = load double* %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds double* %d, i32 14
+  %14 = load double* %arrayidx14, align 4
+  %arrayidx15 = getelementptr inbounds double* %d, i32 15
+  %15 = load double* %arrayidx15, align 4
+  %arrayidx16 = getelementptr inbounds double* %d, i32 16
+  %16 = load double* %arrayidx16, align 4
+  %arrayidx17 = getelementptr inbounds double* %d, i32 17
+  %17 = load double* %arrayidx17, align 4
+  %arrayidx18 = getelementptr inbounds double* %d, i32 18
+  %18 = load double* %arrayidx18, align 4
+  %arrayidx19 = getelementptr inbounds double* %d, i32 19
+  %19 = load double* %arrayidx19, align 4
+  %arrayidx20 = getelementptr inbounds double* %d, i32 20
+  %20 = load double* %arrayidx20, align 4
+  %arrayidx21 = getelementptr inbounds double* %d, i32 21
+  %21 = load double* %arrayidx21, align 4
+  %arrayidx22 = getelementptr inbounds double* %d, i32 22
+  %22 = load double* %arrayidx22, align 4
+  %arrayidx23 = getelementptr inbounds double* %d, i32 23
+  %23 = load double* %arrayidx23, align 4
+  %arrayidx24 = getelementptr inbounds double* %d, i32 24
+  %24 = load double* %arrayidx24, align 4
+  %arrayidx25 = getelementptr inbounds double* %d, i32 25
+  %25 = load double* %arrayidx25, align 4
+  %arrayidx26 = getelementptr inbounds double* %d, i32 26
+  %26 = load double* %arrayidx26, align 4
+  %arrayidx27 = getelementptr inbounds double* %d, i32 27
+  %27 = load double* %arrayidx27, align 4
+  %arrayidx28 = getelementptr inbounds double* %d, i32 28
+  %28 = load double* %arrayidx28, align 4
+  %arrayidx29 = getelementptr inbounds double* %d, i32 29
+  %29 = load double* %arrayidx29, align 4
+  %div = fdiv double %29, %28
+  %div30 = fdiv double %div, %27
+  %div31 = fdiv double %div30, %26
+  %div32 = fdiv double %div31, %25
+  %div33 = fdiv double %div32, %24
+  %div34 = fdiv double %div33, %23
+  %div35 = fdiv double %div34, %22
+  %div36 = fdiv double %div35, %21
+  %div37 = fdiv double %div36, %20
+  %div38 = fdiv double %div37, %19
+  %div39 = fdiv double %div38, %18
+  %div40 = fdiv double %div39, %17
+  %div41 = fdiv double %div40, %16
+  %div42 = fdiv double %div41, %15
+  %div43 = fdiv double %div42, %14
+  %div44 = fdiv double %div43, %13
+  %div45 = fdiv double %div44, %12
+  %div46 = fdiv double %div45, %11
+  %div47 = fdiv double %div46, %10
+  %div48 = fdiv double %div47, %9
+  %div49 = fdiv double %div48, %8
+  %div50 = fdiv double %div49, %7
+  %div51 = fdiv double %div50, %6
+  %div52 = fdiv double %div51, %5
+  %div53 = fdiv double %div52, %4
+  %div54 = fdiv double %div53, %3
+  %div55 = fdiv double %div54, %2
+  %div56 = fdiv double %div55, %1
+  %div57 = fdiv double %div56, %0
+  %div58 = fdiv double %0, %1
+  %div59 = fdiv double %div58, %2
+  %div60 = fdiv double %div59, %3
+  %div61 = fdiv double %div60, %4
+  %div62 = fdiv double %div61, %5
+  %div63 = fdiv double %div62, %6
+  %div64 = fdiv double %div63, %7
+  %div65 = fdiv double %div64, %8
+  %div66 = fdiv double %div65, %9
+  %div67 = fdiv double %div66, %10
+  %div68 = fdiv double %div67, %11
+  %div69 = fdiv double %div68, %12
+  %div70 = fdiv double %div69, %13
+  %div71 = fdiv double %div70, %14
+  %div72 = fdiv double %div71, %15
+  %div73 = fdiv double %div72, %16
+  %div74 = fdiv double %div73, %17
+  %div75 = fdiv double %div74, %18
+  %div76 = fdiv double %div75, %19
+  %div77 = fdiv double %div76, %20
+  %div78 = fdiv double %div77, %21
+  %div79 = fdiv double %div78, %22
+  %div80 = fdiv double %div79, %23
+  %div81 = fdiv double %div80, %24
+  %div82 = fdiv double %div81, %25
+  %div83 = fdiv double %div82, %26
+  %div84 = fdiv double %div83, %27
+  %div85 = fdiv double %div84, %28
+  %div86 = fdiv double %div85, %29
+  %mul = fmul double %div57, %div86
+  %conv = fptosi double %mul to i32
+  %add.ptr = getelementptr inbounds i8* %a, i32 %conv
+  ret i8* %add.ptr
+}
diff --git a/test/CodeGen/ARM/stack_guard_remat.ll b/test/CodeGen/ARM/stack_guard_remat.ll
index b11ea92..7c89b99 100644
--- a/test/CodeGen/ARM/stack_guard_remat.ll
+++ b/test/CodeGen/ARM/stack_guard_remat.ll
@@ -8,7 +8,7 @@
 ;PIC:   foo2
 ;PIC:   ldr [[R0:r[0-9]+]], [[LABEL0:LCPI[0-9_]+]]
 ;PIC: [[LABEL1:LPC0_1]]:
-;PIC:   ldr [[R1:r[0-9]+]], [pc, [[R0]]]
+;PIC:   add [[R1:r[0-9]+]], pc, [[R0]]
 ;PIC:   ldr [[R2:r[0-9]+]], {{\[}}[[R1]]{{\]}}
 ;PIC:   ldr {{r[0-9]+}}, {{\[}}[[R2]]{{\]}}
 
diff --git a/test/CodeGen/ARM/stackpointer.ll b/test/CodeGen/ARM/stackpointer.ll
index 420a916..320f0d9 100644
--- a/test/CodeGen/ARM/stackpointer.ll
+++ b/test/CodeGen/ARM/stackpointer.ll
@@ -22,4 +22,4 @@ declare void @llvm.write_register.i32(metadata, i32) nounwind
 
 ; register unsigned long current_stack_pointer asm("sp");
 ; CHECK-NOT: .asciz  "sp"
-!0 = metadata !{metadata !"sp\00"}
+!0 = !{!"sp\00"}
diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll
index 19727da..f7328dc 100644
--- a/test/CodeGen/ARM/sub-cmp-peephole.ll
+++ b/test/CodeGen/ARM/sub-cmp-peephole.ll
@@ -88,6 +88,19 @@ if.end11:                                         ; preds = %num2long.exit
   ret i32 23
 }
 
+; When considering the producer of cmp's src as the subsuming instruction,
+; only consider that when the comparison is to 0.
+define i32 @cmp_src_nonzero(i32 %a, i32 %b, i32 %x, i32 %y) {
+entry:
+; CHECK-LABEL: cmp_src_nonzero:
+; CHECK: sub
+; CHECK: cmp
+  %sub = sub i32 %a, %b
+  %cmp = icmp eq i32 %sub, 17
+  %ret = select i1 %cmp, i32 %x, i32 %y
+  ret i32 %ret
+}
+
 define float @float_sel(i32 %a, i32 %b, float %x, float %y) {
 entry:
 ; CHECK-LABEL: float_sel:
@@ -144,3 +157,50 @@ entry:
   store i32 %sub, i32* @t
   ret double %ret
 }
+
+declare void @abort()
+declare void @exit(i32)
+
+; If the comparison uses the V bit (signed overflow/underflow), we can't
+; omit the comparison.
+define i32 @cmp_slt0(i32 %a, i32 %b, i32 %x, i32 %y) {
+entry:
+; CHECK-LABEL: cmp_slt0
+; CHECK: sub
+; CHECK: cmp
+; CHECK: bge
+  %load = load i32* @t, align 4
+  %sub = sub i32 %load, 17
+  %cmp = icmp slt i32 %sub, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  call void @abort()
+  unreachable
+
+if.else:
+  call void @exit(i32 0)
+  unreachable
+}
+
+; Same for the C bit. (Note the ult X, 0 is trivially
+; false, so the DAG combiner may or may not optimize it).
+define i32 @cmp_ult0(i32 %a, i32 %b, i32 %x, i32 %y) {
+entry:
+; CHECK-LABEL: cmp_ult0
+; CHECK: sub
+; CHECK: cmp
+; CHECK: bhs
+  %load = load i32* @t, align 4
+  %sub = sub i32 %load, 17
+  %cmp = icmp ult i32 %sub, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  call void @abort()
+  unreachable
+
+if.else:
+  call void @exit(i32 0)
+  unreachable
+}
diff --git a/test/CodeGen/ARM/tail-call-weak.ll b/test/CodeGen/ARM/tail-call-weak.ll
new file mode 100644
index 0000000..466c33d
--- /dev/null
+++ b/test/CodeGen/ARM/tail-call-weak.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple thumbv7-windows-coff -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-COFF
+; RUN: llc -mtriple thumbv7-elf -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-ELF
+; RUN: llc -mtriple thumbv7-macho -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-MACHO
+
+declare i8* @f()
+declare extern_weak i8* @g(i8*)
+
+; weak symbol resolution occurs statically in PE/COFF, ensure that we permit
+; tail calls on weak externals when targeting a COFF environment.
+define void @test() {
+  %call = tail call i8* @f()
+  %call1 = tail call i8* @g(i8* %call)
+  ret void
+}
+
+; CHECK-COFF: b g
+; CHECK-ELF: bl g
+; CHECK-MACHO: blx _g
+
diff --git a/test/CodeGen/ARM/tail-call.ll b/test/CodeGen/ARM/tail-call.ll
index c3e7965..ca19b05 100644
--- a/test/CodeGen/ARM/tail-call.ll
+++ b/test/CodeGen/ARM/tail-call.ll
@@ -1,5 +1,6 @@
-; RUN: llc -mtriple armv7 -O0 -o - < %s | FileCheck %s -check-prefix CHECK-TAIL
-; RUN: llc -mtriple armv7 -O0 -disable-tail-calls -o - < %s \
+; RUN: llc -mtriple armv7 -target-abi apcs -O0 -o - < %s \
+; RUN:   | FileCheck %s -check-prefix CHECK-TAIL
+; RUN: llc -mtriple armv7 -target-abi apcs -O0 -disable-tail-calls -o - < %s \
 ; RUN:   | FileCheck %s -check-prefix CHECK-NO-TAIL
 
 declare i32 @callee(i32 %i)
diff --git a/test/CodeGen/ARM/tail-merge-branch-weight.ll b/test/CodeGen/ARM/tail-merge-branch-weight.ll
index 9b5d566..95b0a20 100644
--- a/test/CodeGen/ARM/tail-merge-branch-weight.ll
+++ b/test/CodeGen/ARM/tail-merge-branch-weight.ll
@@ -39,6 +39,6 @@ L3:                                           ; preds = %L0, %L1, %L2
   ret i32 %retval.0
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 200, i32 800}
-!1 = metadata !{metadata !"branch_weights", i32 600, i32 400}
-!2 = metadata !{metadata !"branch_weights", i32 300, i32 700}
+!0 = !{!"branch_weights", i32 200, i32 800}
+!1 = !{!"branch_weights", i32 600, i32 400}
+!2 = !{!"branch_weights", i32 300, i32 700}
diff --git a/test/CodeGen/ARM/taildup-branch-weight.ll b/test/CodeGen/ARM/taildup-branch-weight.ll
index 0a16071..64e0f4b 100644
--- a/test/CodeGen/ARM/taildup-branch-weight.ll
+++ b/test/CodeGen/ARM/taildup-branch-weight.ll
@@ -27,7 +27,7 @@ B4:
   ret void
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 4, i32 124}
+!0 = !{!"branch_weights", i32 4, i32 124}
 
 ; CHECK: Machine code for function test1:
 ; CHECK: Successors according to CFG: BB#1(8) BB#2(248)
@@ -51,4 +51,4 @@ B3:
   ret void
 }
 
-!1 = metadata !{metadata !"branch_weights", i32 248, i32 8}
+!1 = !{!"branch_weights", i32 248, i32 8}
diff --git a/test/CodeGen/ARM/thumb1-varalloc.ll b/test/CodeGen/ARM/thumb1-varalloc.ll
index 8d5888d..82c4ad5 100644
--- a/test/CodeGen/ARM/thumb1-varalloc.ll
+++ b/test/CodeGen/ARM/thumb1-varalloc.ll
@@ -43,26 +43,6 @@ bb3:
 declare noalias i8* @strdup(i8* nocapture) nounwind
 declare i32 @_called_func(i8*, i32*) nounwind
 
-; Variable ending up at unaligned offset from sp (i.e. not a multiple of 4)
-define void @test_local_var_addr() {
-; CHECK-LABEL: test_local_var_addr:
-
-  %addr1 = alloca i8
-  %addr2 = alloca i8
-
-; CHECK: mov r0, sp
-; CHECK: adds r0, #{{[0-9]+}}
-; CHECK: blx
-  call void @take_ptr(i8* %addr1)
-
-; CHECK: mov r0, sp
-; CHECK: adds r0, #{{[0-9]+}}
-; CHECK: blx
-  call void @take_ptr(i8* %addr2)
-
-  ret void
-}
-
 ; Simple variable ending up *at* sp.
 define void @test_simple_var() {
 ; CHECK-LABEL: test_simple_var:
@@ -126,14 +106,16 @@ define void @test_local_var_offset_1020() {
   ret void
 }
 
-; Max range addressable with tADDrSPi + tADDi8
-define void @test_local_var_offset_1275() {
-; CHECK-LABEL: test_local_var_offset_1275
+; Max range addressable with tADDrSPi + tADDi8 is 1275, however the automatic
+; 4-byte aligning of objects on the stack combined with 8-byte stack alignment
+; means that 1268 is the max offset we can use.
+define void @test_local_var_offset_1268() {
+; CHECK-LABEL: test_local_var_offset_1268
   %addr1 = alloca i8, i32 1
-  %addr2 = alloca i8, i32 1275
+  %addr2 = alloca i8, i32 1268
 
 ; CHECK: add r0, sp, #1020
-; CHECK: adds r0, #255
+; CHECK: adds r0, #248
 ; CHECK-NEXT: blx
   call void @take_ptr(i8* %addr1)
 
diff --git a/test/CodeGen/ARM/thumb1_return_sequence.ll b/test/CodeGen/ARM/thumb1_return_sequence.ll
index 318e6e4..c831260 100644
--- a/test/CodeGen/ARM/thumb1_return_sequence.ll
+++ b/test/CodeGen/ARM/thumb1_return_sequence.ll
@@ -3,7 +3,7 @@
 
 ; CHECK-V4T-LABEL: clobberframe
 ; CHECK-V5T-LABEL: clobberframe
-define <4 x i32> @clobberframe() #0 {
+define <4 x i32> @clobberframe(<6 x i32>* %p) #0 {
 entry:
 ; Prologue
 ; --------
@@ -11,9 +11,10 @@ entry:
 ; CHECK-V4T:    sub sp,
 ; CHECK-V5T:    push {[[SAVED:(r[4567](, )?)+]], lr}
 
-  %b = alloca <4 x i32>, align 16
+  %b = alloca <6 x i32>, align 16
   %a = alloca <4 x i32>, align 16
-  store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %b, align 16
+  %stuff = load <6 x i32>* %p, align 16
+  store <6 x i32> %stuff, <6 x i32>* %b, align 16
   store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a, align 16
   %0 = load <4 x i32>* %a, align 16
   ret <4 x i32> %0
@@ -70,40 +71,25 @@ entry:
 
 ; CHECK-V4T-LABEL: simpleframe
 ; CHECK-V5T-LABEL: simpleframe
-define i32 @simpleframe() #0 {
+define i32 @simpleframe(<6 x i32>* %p) #0 {
 entry:
 ; Prologue
 ; --------
 ; CHECK-V4T:    push    {[[SAVED:(r[4567](, )?)+]], lr}
 ; CHECK-V5T:    push    {[[SAVED:(r[4567](, )?)+]], lr}
 
-  %a = alloca i32, align 4
-  %b = alloca i32, align 4
-  %c = alloca i32, align 4
-  %d = alloca i32, align 4
-  store i32 1, i32* %a, align 4
-  store i32 2, i32* %b, align 4
-  store i32 3, i32* %c, align 4
-  store i32 4, i32* %d, align 4
-  %0 = load i32* %a, align 4
-  %inc = add nsw i32 %0, 1
-  store i32 %inc, i32* %a, align 4
-  %1 = load i32* %b, align 4
-  %inc1 = add nsw i32 %1, 1
-  store i32 %inc1, i32* %b, align 4
-  %2 = load i32* %c, align 4
-  %inc2 = add nsw i32 %2, 1
-  store i32 %inc2, i32* %c, align 4
-  %3 = load i32* %d, align 4
-  %inc3 = add nsw i32 %3, 1
-  store i32 %inc3, i32* %d, align 4
-  %4 = load i32* %a, align 4
-  %5 = load i32* %b, align 4
-  %add = add nsw i32 %4, %5
-  %6 = load i32* %c, align 4
-  %add4 = add nsw i32 %add, %6
-  %7 = load i32* %d, align 4
-  %add5 = add nsw i32 %add4, %7
+  %0 = load <6 x i32>* %p, align 16
+  %1 = extractelement <6 x i32> %0, i32 0
+  %2 = extractelement <6 x i32> %0, i32 1
+  %3 = extractelement <6 x i32> %0, i32 2
+  %4 = extractelement <6 x i32> %0, i32 3
+  %5 = extractelement <6 x i32> %0, i32 4
+  %6 = extractelement <6 x i32> %0, i32 5
+  %add1 = add nsw i32 %1, %2
+  %add2 = add nsw i32 %add1, %3
+  %add3 = add nsw i32 %add2, %4
+  %add4 = add nsw i32 %add3, %5
+  %add5 = add nsw i32 %add4, %6
   ret i32 %add5
 
 ; Epilogue
diff --git a/test/CodeGen/ARM/thumb_indirect_calls.ll b/test/CodeGen/ARM/thumb_indirect_calls.ll
new file mode 100644
index 0000000..16a55a8
--- /dev/null
+++ b/test/CodeGen/ARM/thumb_indirect_calls.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=thumbv4t-eabi %s -o - | FileCheck ---check-prefix=CHECK -check-prefix=CHECK-V4T %s
+; RUN: llc -mtriple=thumbv5t-eabi %s -o - | FileCheck ---check-prefix=CHECK -check-prefix=CHECK-V5T %s
+
+@f = common global void (i32)* null, align 4
+
+; CHECK-LABEL foo:
+define void @foo(i32 %x) {
+entry:
+  %0 = load void (i32)** @f, align 4
+  tail call void %0(i32 %x)
+  ret void
+
+; CHECK: ldr [[TMP:r[0-3]]], [[F:\.[A-Z0-9_]+]]
+; CHECK: ldr [[CALLEE:r[0-3]]], {{\[}}[[TMP]]{{\]}}
+
+; CHECK-V4T-NOT: blx
+; CHECK-V4T: bl [[INDIRECT_PAD:\.Ltmp[0-9]+]]
+; CHECK-V4T: [[F]]:
+; CHECK-V4T: [[INDIRECT_PAD]]:
+; CHECK-V4T-NEXT: bx [[CALLEE]]
+; CHECK-V5T: blx [[CALLEE]]
+}
+
+; CHECK-LABEL bar:
+define void @bar(void (i32)* nocapture %g, i32 %x, void (i32)* nocapture %h) {
+entry:
+  tail call void %g(i32 %x)
+  tail call void %h(i32 %x)
+  ret void
+
+; CHECK-V4T: bl [[INDIRECT_PAD1:\.Ltmp[0-9]+]]
+; CHECK-V4T: bl [[INDIRECT_PAD2:\.Ltmp[0-9]+]]
+; CHECK-V4T: [[INDIRECT_PAD1]]:
+; CHECK-V4T-NEXT: bx
+; CHECK-V4T: [[INDIRECT_PAD2]]:
+; CHECK-V4T-NEXT: bx
+; CHECK-V5T: blx
+; CHECK-V5T: blx
+}
+
diff --git a/test/CodeGen/ARM/tls1.ll b/test/CodeGen/ARM/tls1.ll
index a1ca0b7..b03f76b 100644
--- a/test/CodeGen/ARM/tls1.ll
+++ b/test/CodeGen/ARM/tls1.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN:     grep "i(TPOFF)"
-; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN:     grep "__aeabi_read_tp"
-; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi \
-; RUN:     -relocation-model=pic | grep "__tls_get_addr"
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | FileCheck %s
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi -relocation-model=pic | \
+; RUN:   FileCheck %s --check-prefix=PIC
 
 
+; CHECK: i(TPOFF)
+; CHECK: __aeabi_read_tp
+
+; PIC: __tls_get_addr
+
 @i = thread_local global i32 15		; <i32*> [#uses=2]
 
 define i32 @f() {
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index 89f355c..6f8b3dd 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -347,17 +347,17 @@ define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
 
 define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
 ;CHECK-LABEL: check_spr_splat4:
-;CHECK: vdup.32 q
+;CHECK: vld1.16
   %conv = sitofp i16 %q to float
   %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
   %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
   %sub = fsub <4 x float> %splat.splat, %p
   ret <4 x float> %sub
 }
-
+; Same codegen as above test; scalar is splatted using vld1, so shuffle index is irrelevant.
 define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
 ;CHECK-LABEL: check_spr_splat4_lane1:
-;CHECK: vdup.32 q{{.*}}, d{{.*}}[1]
+;CHECK: vld1.16
   %conv = sitofp i16 %q to float
   %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
   %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
diff --git a/test/CodeGen/ARM/vector-DAGCombine.ll b/test/CodeGen/ARM/vector-DAGCombine.ll
index 759da22..566e955 100644
--- a/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/test/CodeGen/ARM/vector-DAGCombine.ll
@@ -27,6 +27,14 @@ entry:
   ret void
 }
 
+; PR22678
+; Check CONCAT_VECTORS DAG combiner pass doesn't introduce illegal types.
+define void @test_pr22678() {
+  %1 = fptoui <16 x float> undef to <16 x i8>
+  store <16 x i8> %1, <16 x i8>* undef
+  ret void
+}
+
 ; Radar 8407927: Make sure that VMOVRRD gets optimized away when the result is
 ; converted back to be used as a vector type.
 ; CHECK-LABEL: test_vmovrrd_combine:
diff --git a/test/CodeGen/ARM/vector-load.ll b/test/CodeGen/ARM/vector-load.ll
new file mode 100644
index 0000000..c177a55
--- /dev/null
+++ b/test/CodeGen/ARM/vector-load.ll
@@ -0,0 +1,253 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:o-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7s-apple-ios8.0.0"
+
+define <8 x i8> @load_v8i8(<8 x i8>** %ptr) {
+;CHECK-LABEL: load_v8i8:
+;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <8 x i8>** %ptr
+	%lA = load <8 x i8>* %A, align 1
+	ret <8 x i8> %lA
+}
+
+define <8 x i8> @load_v8i8_update(<8 x i8>** %ptr) {
+;CHECK-LABEL: load_v8i8_update:
+;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <8 x i8>** %ptr
+	%lA = load <8 x i8>* %A, align 1
+	%inc = getelementptr <8 x i8>* %A, i38 1
+        store <8 x i8>* %inc, <8 x i8>** %ptr
+	ret <8 x i8> %lA
+}
+
+define <4 x i16> @load_v4i16(<4 x i16>** %ptr) {
+;CHECK-LABEL: load_v4i16:
+;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <4 x i16>** %ptr
+	%lA = load <4 x i16>* %A, align 1
+	ret <4 x i16> %lA
+}
+
+define <4 x i16> @load_v4i16_update(<4 x i16>** %ptr) {
+;CHECK-LABEL: load_v4i16_update:
+;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <4 x i16>** %ptr
+	%lA = load <4 x i16>* %A, align 1
+	%inc = getelementptr <4 x i16>* %A, i34 1
+        store <4 x i16>* %inc, <4 x i16>** %ptr
+	ret <4 x i16> %lA
+}
+
+define <2 x i32> @load_v2i32(<2 x i32>** %ptr) {
+;CHECK-LABEL: load_v2i32:
+;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <2 x i32>** %ptr
+	%lA = load <2 x i32>* %A, align 1
+	ret <2 x i32> %lA
+}
+
+define <2 x i32> @load_v2i32_update(<2 x i32>** %ptr) {
+;CHECK-LABEL: load_v2i32_update:
+;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x i32>** %ptr
+	%lA = load <2 x i32>* %A, align 1
+	%inc = getelementptr <2 x i32>* %A, i32 1
+        store <2 x i32>* %inc, <2 x i32>** %ptr
+	ret <2 x i32> %lA
+}
+
+define <2 x float> @load_v2f32(<2 x float>** %ptr) {
+;CHECK-LABEL: load_v2f32:
+;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <2 x float>** %ptr
+	%lA = load <2 x float>* %A, align 1
+	ret <2 x float> %lA
+}
+
+define <2 x float> @load_v2f32_update(<2 x float>** %ptr) {
+;CHECK-LABEL: load_v2f32_update:
+;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x float>** %ptr
+	%lA = load <2 x float>* %A, align 1
+	%inc = getelementptr <2 x float>* %A, i32 1
+        store <2 x float>* %inc, <2 x float>** %ptr
+	ret <2 x float> %lA
+}
+
+define <1 x i64> @load_v1i64(<1 x i64>** %ptr) {
+;CHECK-LABEL: load_v1i64:
+;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <1 x i64>** %ptr
+	%lA = load <1 x i64>* %A, align 1
+	ret <1 x i64> %lA
+}
+
+define <1 x i64> @load_v1i64_update(<1 x i64>** %ptr) {
+;CHECK-LABEL: load_v1i64_update:
+;CHECK: vld1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <1 x i64>** %ptr
+	%lA = load <1 x i64>* %A, align 1
+	%inc = getelementptr <1 x i64>* %A, i31 1
+        store <1 x i64>* %inc, <1 x i64>** %ptr
+	ret <1 x i64> %lA
+}
+
+define <16 x i8> @load_v16i8(<16 x i8>** %ptr) {
+;CHECK-LABEL: load_v16i8:
+;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <16 x i8>** %ptr
+	%lA = load <16 x i8>* %A, align 1
+	ret <16 x i8> %lA
+}
+
+define <16 x i8> @load_v16i8_update(<16 x i8>** %ptr) {
+;CHECK-LABEL: load_v16i8_update:
+;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <16 x i8>** %ptr
+	%lA = load <16 x i8>* %A, align 1
+	%inc = getelementptr <16 x i8>* %A, i316 1
+        store <16 x i8>* %inc, <16 x i8>** %ptr
+	ret <16 x i8> %lA
+}
+
+define <8 x i16> @load_v8i16(<8 x i16>** %ptr) {
+;CHECK-LABEL: load_v8i16:
+;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <8 x i16>** %ptr
+	%lA = load <8 x i16>* %A, align 1
+	ret <8 x i16> %lA
+}
+
+define <8 x i16> @load_v8i16_update(<8 x i16>** %ptr) {
+;CHECK-LABEL: load_v8i16_update:
+;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <8 x i16>** %ptr
+	%lA = load <8 x i16>* %A, align 1
+	%inc = getelementptr <8 x i16>* %A, i38 1
+        store <8 x i16>* %inc, <8 x i16>** %ptr
+	ret <8 x i16> %lA
+}
+
+define <4 x i32> @load_v4i32(<4 x i32>** %ptr) {
+;CHECK-LABEL: load_v4i32:
+;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <4 x i32>** %ptr
+	%lA = load <4 x i32>* %A, align 1
+	ret <4 x i32> %lA
+}
+
+define <4 x i32> @load_v4i32_update(<4 x i32>** %ptr) {
+;CHECK-LABEL: load_v4i32_update:
+;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <4 x i32>** %ptr
+	%lA = load <4 x i32>* %A, align 1
+	%inc = getelementptr <4 x i32>* %A, i34 1
+        store <4 x i32>* %inc, <4 x i32>** %ptr
+	ret <4 x i32> %lA
+}
+
+define <4 x float> @load_v4f32(<4 x float>** %ptr) {
+;CHECK-LABEL: load_v4f32:
+;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <4 x float>** %ptr
+	%lA = load <4 x float>* %A, align 1
+	ret <4 x float> %lA
+}
+
+define <4 x float> @load_v4f32_update(<4 x float>** %ptr) {
+;CHECK-LABEL: load_v4f32_update:
+;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <4 x float>** %ptr
+	%lA = load <4 x float>* %A, align 1
+	%inc = getelementptr <4 x float>* %A, i34 1
+        store <4 x float>* %inc, <4 x float>** %ptr
+	ret <4 x float> %lA
+}
+
+define <2 x i64> @load_v2i64(<2 x i64>** %ptr) {
+;CHECK-LABEL: load_v2i64:
+;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <2 x i64>** %ptr
+	%lA = load <2 x i64>* %A, align 1
+	ret <2 x i64> %lA
+}
+
+define <2 x i64> @load_v2i64_update(<2 x i64>** %ptr) {
+;CHECK-LABEL: load_v2i64_update:
+;CHECK: vld1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x i64>** %ptr
+	%lA = load <2 x i64>* %A, align 1
+	%inc = getelementptr <2 x i64>* %A, i32 1
+        store <2 x i64>* %inc, <2 x i64>** %ptr
+	ret <2 x i64> %lA
+}
+
+; Make sure we change the type to match alignment if necessary.
+define <2 x i64> @load_v2i64_update_aligned2(<2 x i64>** %ptr) {
+;CHECK-LABEL: load_v2i64_update_aligned2:
+;CHECK: vld1.16 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x i64>** %ptr
+	%lA = load <2 x i64>* %A, align 2
+	%inc = getelementptr <2 x i64>* %A, i32 1
+        store <2 x i64>* %inc, <2 x i64>** %ptr
+	ret <2 x i64> %lA
+}
+
+define <2 x i64> @load_v2i64_update_aligned4(<2 x i64>** %ptr) {
+;CHECK-LABEL: load_v2i64_update_aligned4:
+;CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x i64>** %ptr
+	%lA = load <2 x i64>* %A, align 4
+	%inc = getelementptr <2 x i64>* %A, i32 1
+        store <2 x i64>* %inc, <2 x i64>** %ptr
+	ret <2 x i64> %lA
+}
+
+define <2 x i64> @load_v2i64_update_aligned8(<2 x i64>** %ptr) {
+;CHECK-LABEL: load_v2i64_update_aligned8:
+;CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x i64>** %ptr
+	%lA = load <2 x i64>* %A, align 8
+	%inc = getelementptr <2 x i64>* %A, i32 1
+        store <2 x i64>* %inc, <2 x i64>** %ptr
+	ret <2 x i64> %lA
+}
+
+define <2 x i64> @load_v2i64_update_aligned16(<2 x i64>** %ptr) {
+;CHECK-LABEL: load_v2i64_update_aligned16:
+;CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}:128]!
+	%A = load <2 x i64>** %ptr
+	%lA = load <2 x i64>* %A, align 16
+	%inc = getelementptr <2 x i64>* %A, i32 1
+        store <2 x i64>* %inc, <2 x i64>** %ptr
+	ret <2 x i64> %lA
+}
+
+; Make sure we don't break smaller-than-dreg extloads.
+define <4 x i32> @zextload_v8i8tov8i32(<4 x i8>** %ptr) {
+;CHECK-LABEL: zextload_v8i8tov8i32:
+;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [{{r[0-9]+}}:32]
+;CHECK: vmovl.u8        {{q[0-9]+}}, {{d[0-9]+}}
+;CHECK: vmovl.u16       {{q[0-9]+}}, {{d[0-9]+}}
+	%A = load <4 x i8>** %ptr
+	%lA = load <4 x i8>* %A, align 4
+        %zlA = zext <4 x i8> %lA to <4 x i32>
+	ret <4 x i32> %zlA
+}
+
+define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
+;CHECK-LABEL: zextload_v8i8tov8i32_fake_update:
+;CHECK: ldr.w   r[[PTRREG:[0-9]+]], [r0]
+;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r[[PTRREG]]:32]
+;CHECK: add.w   r[[INCREG:[0-9]+]], r[[PTRREG]], #16
+;CHECK: str.w   r[[INCREG]], [r0]
+;CHECK: vmovl.u8        {{q[0-9]+}}, {{d[0-9]+}}
+;CHECK: vmovl.u16       {{q[0-9]+}}, {{d[0-9]+}}
+	%A = load <4 x i8>** %ptr
+	%lA = load <4 x i8>* %A, align 4
+	%inc = getelementptr <4 x i8>* %A, i38 4
+        store <4 x i8>* %inc, <4 x i8>** %ptr
+        %zlA = zext <4 x i8> %lA to <4 x i32>
+	ret <4 x i32> %zlA
+}
diff --git a/test/CodeGen/ARM/vector-store.ll b/test/CodeGen/ARM/vector-store.ll
new file mode 100644
index 0000000..55cb8f2
--- /dev/null
+++ b/test/CodeGen/ARM/vector-store.ll
@@ -0,0 +1,258 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:o-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7s-apple-ios8.0.0"
+
+define void @store_v8i8(<8 x i8>** %ptr, <8 x i8> %val) {
+;CHECK-LABEL: store_v8i8:
+;CHECK: str r1, [r0]
+	%A = load <8 x i8>** %ptr
+	store  <8 x i8> %val, <8 x i8>* %A, align 1
+	ret void
+}
+
+define void @store_v8i8_update(<8 x i8>** %ptr, <8 x i8> %val) {
+;CHECK-LABEL: store_v8i8_update:
+;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <8 x i8>** %ptr
+	store  <8 x i8> %val, <8 x i8>* %A, align 1
+	%inc = getelementptr <8 x i8>* %A, i38 1
+        store <8 x i8>* %inc, <8 x i8>** %ptr
+	ret void
+}
+
+define void @store_v4i16(<4 x i16>** %ptr, <4 x i16> %val) {
+;CHECK-LABEL: store_v4i16:
+;CHECK: str r1, [r0]
+	%A = load <4 x i16>** %ptr
+	store  <4 x i16> %val, <4 x i16>* %A, align 1
+	ret void
+}
+
+define void @store_v4i16_update(<4 x i16>** %ptr, <4 x i16> %val) {
+;CHECK-LABEL: store_v4i16_update:
+;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <4 x i16>** %ptr
+	store  <4 x i16> %val, <4 x i16>* %A, align 1
+	%inc = getelementptr <4 x i16>* %A, i34 1
+        store <4 x i16>* %inc, <4 x i16>** %ptr
+	ret void
+}
+
+define void @store_v2i32(<2 x i32>** %ptr, <2 x i32> %val) {
+;CHECK-LABEL: store_v2i32:
+;CHECK: str r1, [r0]
+	%A = load <2 x i32>** %ptr
+	store  <2 x i32> %val, <2 x i32>* %A, align 1
+	ret void
+}
+
+define void @store_v2i32_update(<2 x i32>** %ptr, <2 x i32> %val) {
+;CHECK-LABEL: store_v2i32_update:
+;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x i32>** %ptr
+	store  <2 x i32> %val, <2 x i32>* %A, align 1
+	%inc = getelementptr <2 x i32>* %A, i32 1
+        store <2 x i32>* %inc, <2 x i32>** %ptr
+	ret void
+}
+
+define void @store_v2f32(<2 x float>** %ptr, <2 x float> %val) {
+;CHECK-LABEL: store_v2f32:
+;CHECK: str r1, [r0]
+	%A = load <2 x float>** %ptr
+	store  <2 x float> %val, <2 x float>* %A, align 1
+	ret void
+}
+
+define void @store_v2f32_update(<2 x float>** %ptr, <2 x float> %val) {
+;CHECK-LABEL: store_v2f32_update:
+;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x float>** %ptr
+	store  <2 x float> %val, <2 x float>* %A, align 1
+	%inc = getelementptr <2 x float>* %A, i32 1
+        store <2 x float>* %inc, <2 x float>** %ptr
+	ret void
+}
+
+define void @store_v1i64(<1 x i64>** %ptr, <1 x i64> %val) {
+;CHECK-LABEL: store_v1i64:
+;CHECK: str r1, [r0]
+	%A = load <1 x i64>** %ptr
+	store  <1 x i64> %val, <1 x i64>* %A, align 1
+	ret void
+}
+
+define void @store_v1i64_update(<1 x i64>** %ptr, <1 x i64> %val) {
+;CHECK-LABEL: store_v1i64_update:
+;CHECK: vst1.8 {{{d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <1 x i64>** %ptr
+	store  <1 x i64> %val, <1 x i64>* %A, align 1
+	%inc = getelementptr <1 x i64>* %A, i31 1
+        store <1 x i64>* %inc, <1 x i64>** %ptr
+	ret void
+}
+
+define void @store_v16i8(<16 x i8>** %ptr, <16 x i8> %val) {
+;CHECK-LABEL: store_v16i8:
+;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <16 x i8>** %ptr
+	store  <16 x i8> %val, <16 x i8>* %A, align 1
+	ret void
+}
+
+define void @store_v16i8_update(<16 x i8>** %ptr, <16 x i8> %val) {
+;CHECK-LABEL: store_v16i8_update:
+;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <16 x i8>** %ptr
+	store  <16 x i8> %val, <16 x i8>* %A, align 1
+	%inc = getelementptr <16 x i8>* %A, i316 1
+        store <16 x i8>* %inc, <16 x i8>** %ptr
+	ret void
+}
+
+define void @store_v8i16(<8 x i16>** %ptr, <8 x i16> %val) {
+;CHECK-LABEL: store_v8i16:
+;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <8 x i16>** %ptr
+	store  <8 x i16> %val, <8 x i16>* %A, align 1
+	ret void
+}
+
+define void @store_v8i16_update(<8 x i16>** %ptr, <8 x i16> %val) {
+;CHECK-LABEL: store_v8i16_update:
+;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <8 x i16>** %ptr
+	store  <8 x i16> %val, <8 x i16>* %A, align 1
+	%inc = getelementptr <8 x i16>* %A, i38 1
+        store <8 x i16>* %inc, <8 x i16>** %ptr
+	ret void
+}
+
+define void @store_v4i32(<4 x i32>** %ptr, <4 x i32> %val) {
+;CHECK-LABEL: store_v4i32:
+;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <4 x i32>** %ptr
+	store  <4 x i32> %val, <4 x i32>* %A, align 1
+	ret void
+}
+
+define void @store_v4i32_update(<4 x i32>** %ptr, <4 x i32> %val) {
+;CHECK-LABEL: store_v4i32_update:
+;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <4 x i32>** %ptr
+	store  <4 x i32> %val, <4 x i32>* %A, align 1
+	%inc = getelementptr <4 x i32>* %A, i34 1
+        store <4 x i32>* %inc, <4 x i32>** %ptr
+	ret void
+}
+
+define void @store_v4f32(<4 x float>** %ptr, <4 x float> %val) {
+;CHECK-LABEL: store_v4f32:
+;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <4 x float>** %ptr
+	store  <4 x float> %val, <4 x float>* %A, align 1
+	ret void
+}
+
+define void @store_v4f32_update(<4 x float>** %ptr, <4 x float> %val) {
+;CHECK-LABEL: store_v4f32_update:
+;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <4 x float>** %ptr
+	store  <4 x float> %val, <4 x float>* %A, align 1
+	%inc = getelementptr <4 x float>* %A, i34 1
+        store <4 x float>* %inc, <4 x float>** %ptr
+	ret void
+}
+
+define void @store_v2i64(<2 x i64>** %ptr, <2 x i64> %val) {
+;CHECK-LABEL: store_v2i64:
+;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]
+	%A = load <2 x i64>** %ptr
+	store  <2 x i64> %val, <2 x i64>* %A, align 1
+	ret void
+}
+
+define void @store_v2i64_update(<2 x i64>** %ptr, <2 x i64> %val) {
+;CHECK-LABEL: store_v2i64_update:
+;CHECK: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x i64>** %ptr
+	store  <2 x i64> %val, <2 x i64>* %A, align 1
+	%inc = getelementptr <2 x i64>* %A, i32 1
+        store <2 x i64>* %inc, <2 x i64>** %ptr
+	ret void
+}
+
+define void @store_v2i64_update_aligned2(<2 x i64>** %ptr, <2 x i64> %val) {
+;CHECK-LABEL: store_v2i64_update_aligned2:
+;CHECK: vst1.16 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x i64>** %ptr
+	store  <2 x i64> %val, <2 x i64>* %A, align 2
+	%inc = getelementptr <2 x i64>* %A, i32 1
+        store <2 x i64>* %inc, <2 x i64>** %ptr
+	ret void
+}
+
+define void @store_v2i64_update_aligned4(<2 x i64>** %ptr, <2 x i64> %val) {
+;CHECK-LABEL: store_v2i64_update_aligned4:
+;CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x i64>** %ptr
+	store  <2 x i64> %val, <2 x i64>* %A, align 4
+	%inc = getelementptr <2 x i64>* %A, i32 1
+        store <2 x i64>* %inc, <2 x i64>** %ptr
+	ret void
+}
+
+define void @store_v2i64_update_aligned8(<2 x i64>** %ptr, <2 x i64> %val) {
+;CHECK-LABEL: store_v2i64_update_aligned8:
+;CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}]!
+	%A = load <2 x i64>** %ptr
+	store  <2 x i64> %val, <2 x i64>* %A, align 8
+	%inc = getelementptr <2 x i64>* %A, i32 1
+        store <2 x i64>* %inc, <2 x i64>** %ptr
+	ret void
+}
+
+define void @store_v2i64_update_aligned16(<2 x i64>** %ptr, <2 x i64> %val) {
+;CHECK-LABEL: store_v2i64_update_aligned16:
+;CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [{{r[0-9]+}}:128]!
+	%A = load <2 x i64>** %ptr
+	store  <2 x i64> %val, <2 x i64>* %A, align 16
+	%inc = getelementptr <2 x i64>* %A, i32 1
+        store <2 x i64>* %inc, <2 x i64>** %ptr
+	ret void
+}
+
+define void @truncstore_v4i32tov4i8(<4 x i8>** %ptr, <4 x i32> %val) {
+;CHECK-LABEL: truncstore_v4i32tov4i8:
+;CHECK: ldr.w   r9, [sp]
+;CHECK: vmov    {{d[0-9]+}}, r3, r9
+;CHECK: vmov    {{d[0-9]+}}, r1, r2
+;CHECK: vmovn.i32       [[VECLO:d[0-9]+]], {{q[0-9]+}}
+;CHECK: vuzp.8  [[VECLO]], {{d[0-9]+}}
+;CHECK: ldr     r[[PTRREG:[0-9]+]], [r0]
+;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32]
+	%A = load <4 x i8>** %ptr
+        %trunc = trunc <4 x i32> %val to <4 x i8>
+	store  <4 x i8> %trunc, <4 x i8>* %A, align 4
+	ret void
+}
+
+define void @truncstore_v4i32tov4i8_fake_update(<4 x i8>** %ptr, <4 x i32> %val) {
+;CHECK-LABEL: truncstore_v4i32tov4i8_fake_update:
+;CHECK: ldr.w   r9, [sp]
+;CHECK: vmov    {{d[0-9]+}}, r3, r9
+;CHECK: vmov    {{d[0-9]+}}, r1, r2
+;CHECK: movs    [[IMM16:r[0-9]+]], #16
+;CHECK: vmovn.i32       [[VECLO:d[0-9]+]], {{q[0-9]+}}
+;CHECK: vuzp.8  [[VECLO]], {{d[0-9]+}}
+;CHECK: ldr     r[[PTRREG:[0-9]+]], [r0]
+;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32], [[IMM16]]
+;CHECK: str     r[[PTRREG]], [r0]
+	%A = load <4 x i8>** %ptr
+        %trunc = trunc <4 x i32> %val to <4 x i8>
+	store  <4 x i8> %trunc, <4 x i8>* %A, align 4
+	%inc = getelementptr <4 x i8>* %A, i38 4
+        store <4 x i8>* %inc, <4 x i8>** %ptr
+	ret void
+}
diff --git a/test/CodeGen/ARM/vfp-regs-dwarf.ll b/test/CodeGen/ARM/vfp-regs-dwarf.ll
index f83adf9..b67f770 100644
--- a/test/CodeGen/ARM/vfp-regs-dwarf.ll
+++ b/test/CodeGen/ARM/vfp-regs-dwarf.ll
@@ -31,14 +31,14 @@ define void @stack_offsets() {
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/tim/llvm/build/tmp.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"tmp.c", metadata !"/Users/tim/llvm/build"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00bar\00bar\00\001\000\001\000\006\000\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @stack_offsets, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/Users/tim/llvm/build/tmp.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/Users/tim/llvm/build/tmp.c] [DW_LANG_C99]
+!1 = !{!"tmp.c", !"/Users/tim/llvm/build"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00bar\00bar\00\001\000\001\000\006\000\000\001", !1, !5, !6, null, void ()* @stack_offsets, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/Users/tim/llvm/build/tmp.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
 
diff --git a/test/CodeGen/ARM/vld1.ll b/test/CodeGen/ARM/vld1.ll
index caeeada..db640f5 100644
--- a/test/CodeGen/ARM/vld1.ll
+++ b/test/CodeGen/ARM/vld1.ll
@@ -119,6 +119,14 @@ define <2 x i64> @vld1Qi64(i64* %A) nounwind {
 	ret <2 x i64> %tmp1
 }
 
+define <2 x double> @vld1Qf64(double* %A) nounwind {
+;CHECK-LABEL: vld1Qf64:
+;CHECK: vld1.64
+	%tmp0 = bitcast double* %A to i8*
+	%tmp1 = call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %tmp0, i32 1)
+	ret <2 x double> %tmp1
+}
+
 declare <8 x i8>  @llvm.arm.neon.vld1.v8i8(i8*, i32) nounwind readonly
 declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32) nounwind readonly
 declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) nounwind readonly
@@ -130,6 +138,7 @@ declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
 declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) nounwind readonly
 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
 declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly
+declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32) nounwind readonly
 
 ; Radar 8355607
 ; Do not crash if the vld1 result is not used.
diff --git a/test/CodeGen/ARM/vst1.ll b/test/CodeGen/ARM/vst1.ll
index 14f3ff0..a6bcf7d 100644
--- a/test/CodeGen/ARM/vst1.ll
+++ b/test/CodeGen/ARM/vst1.ll
@@ -117,6 +117,15 @@ define void @vst1Qi64(i64* %A, <2 x i64>* %B) nounwind {
 	ret void
 }
 
+define void @vst1Qf64(double* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: vst1Qf64:
+;CHECK: vst1.64
+	%tmp0 = bitcast double* %A to i8*
+	%tmp1 = load <2 x double>* %B
+	call void @llvm.arm.neon.vst1.v2f64(i8* %tmp0, <2 x double> %tmp1, i32 1)
+	ret void
+}
+
 declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) nounwind
 declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32) nounwind
 declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
@@ -128,3 +137,4 @@ declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
 declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) nounwind
 declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
 declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32) nounwind
+declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32) nounwind
diff --git a/test/CodeGen/BPF/alu8.ll b/test/CodeGen/BPF/alu8.ll
new file mode 100644
index 0000000..0233225
--- /dev/null
+++ b/test/CodeGen/BPF/alu8.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=bpf -show-mc-encoding < %s | FileCheck %s
+; test little endian only for now
+
+define i8 @mov(i8 %a, i8 %b) nounwind {
+; CHECK-LABEL: mov:
+; CHECK: mov r0, r2 # encoding: [0xbf,0x20,0x00,0x00,0x00,0x00,0x00,0x00]
+; CHECK: ret # encoding: [0x95,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+  ret i8 %b
+}
+
+define i8 @add(i8 %a, i8 %b) nounwind {
+; CHECK-LABEL: add:
+; CHECK: add r1, r2 # encoding: [0x0f,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+; CHECK: mov r0, r1 # encoding: [0xbf,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+  %1 = add i8 %a, %b
+  ret i8 %1
+}
+
+define i8 @and(i8 %a, i8 %b) nounwind {
+; CHECK-LABEL: and:
+; CHECK: and r1, r2 # encoding: [0x5f,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %1 = and i8 %a, %b
+  ret i8 %1
+}
+
+define i8 @bis(i8 %a, i8 %b) nounwind {
+; CHECK-LABEL: bis:
+; CHECK: or r1, r2 # encoding: [0x4f,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %1 = or i8 %a, %b
+  ret i8 %1
+}
+
+define i8 @xorand(i8 %a, i8 %b) nounwind {
+; CHECK-LABEL: xorand:
+; CHECK: xori r2, -1 # encoding: [0xa7,0x02,0x00,0x00,0xff,0xff,0xff,0xff]
+  %1 = xor i8 %b, -1
+  %2 = and i8 %a, %1
+  ret i8 %2
+}
+
+define i8 @xor(i8 %a, i8 %b) nounwind {
+; CHECK-LABEL: xor:
+; CHECK: xor r1, r2 # encoding: [0xaf,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %1 = xor i8 %a, %b
+  ret i8 %1
+}
diff --git a/test/CodeGen/BPF/atomics.ll b/test/CodeGen/BPF/atomics.ll
new file mode 100644
index 0000000..2f9730d
--- /dev/null
+++ b/test/CodeGen/BPF/atomics.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=bpf -verify-machineinstrs -show-mc-encoding | FileCheck %s
+; test little endian only for now
+
+; CHECK-LABEL: test_load_add_32
+; CHECK: xadd32
+; CHECK: encoding: [0xc3
+define void @test_load_add_32(i32* %p, i32 zeroext %v) {
+entry:
+  atomicrmw add i32* %p, i32 %v seq_cst
+  ret void
+}
+
+; CHECK-LABEL: test_load_add_64
+; CHECK: xadd64
+; CHECK: encoding: [0xdb
+define void @test_load_add_64(i64* %p, i64 zeroext %v) {
+entry:
+  atomicrmw add i64* %p, i64 %v seq_cst
+  ret void
+}
diff --git a/test/CodeGen/BPF/basictest.ll b/test/CodeGen/BPF/basictest.ll
new file mode 100644
index 0000000..0cbfff8
--- /dev/null
+++ b/test/CodeGen/BPF/basictest.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -march=bpf | FileCheck %s
+
+define i32 @test0(i32 %X) {
+  %tmp.1 = add i32 %X, 1
+  ret i32 %tmp.1
+; CHECK-LABEL: test0:
+; CHECK: addi r1, 1
+}
+
+; CHECK-LABEL: store_imm:
+; CHECK: stw  0(r1), r0
+; CHECK: stw  4(r2), r0
+define i32 @store_imm(i32* %a, i32* %b) {
+entry:
+  store i32 0, i32* %a, align 4
+  %0 = getelementptr inbounds i32* %b, i32 1
+  store i32 0, i32* %0, align 4
+  ret i32 0
+}
+
+@G = external global i8
+define zeroext i8 @loadG() {
+  %tmp = load i8* @G
+  ret i8 %tmp
+; CHECK-LABEL: loadG:
+; CHECK: ld_64 r1
+; CHECK: ldb  r0, 0(r1)
+}
diff --git a/test/CodeGen/BPF/byval.ll b/test/CodeGen/BPF/byval.ll
new file mode 100644
index 0000000..065604b
--- /dev/null
+++ b/test/CodeGen/BPF/byval.ll
@@ -0,0 +1,27 @@
+; RUN: not llc -march=bpf < %s 2> %t1
+; RUN: FileCheck %s < %t1
+; CHECK: by value not supported
+
+%struct.S = type { [10 x i32] }
+
+; Function Attrs: nounwind uwtable
+define void @bar(i32 %a) #0 {
+entry:
+  %.compoundliteral = alloca %struct.S, align 8
+  %arrayinit.begin = getelementptr inbounds %struct.S* %.compoundliteral, i64 0, i32 0, i64 0
+  store i32 1, i32* %arrayinit.begin, align 8
+  %arrayinit.element = getelementptr inbounds %struct.S* %.compoundliteral, i64 0, i32 0, i64 1
+  store i32 2, i32* %arrayinit.element, align 4
+  %arrayinit.element2 = getelementptr inbounds %struct.S* %.compoundliteral, i64 0, i32 0, i64 2
+  store i32 3, i32* %arrayinit.element2, align 8
+  %arrayinit.start = getelementptr inbounds %struct.S* %.compoundliteral, i64 0, i32 0, i64 3
+  %scevgep4 = bitcast i32* %arrayinit.start to i8*
+  call void @llvm.memset.p0i8.i64(i8* %scevgep4, i8 0, i64 28, i32 4, i1 false)
+  call void @foo(i32 %a, %struct.S* byval align 8 %.compoundliteral) #3
+  ret void
+}
+
+declare void @foo(i32, %struct.S* byval align 8) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3
diff --git a/test/CodeGen/BPF/cc_args.ll b/test/CodeGen/BPF/cc_args.ll
new file mode 100644
index 0000000..5085fe5
--- /dev/null
+++ b/test/CodeGen/BPF/cc_args.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s -march=bpf -show-mc-encoding | FileCheck %s
+; test little endian only for now
+
+define void @test() #0 {
+entry:
+; CHECK: test:
+
+; CHECK: mov  r1, 123 # encoding: [0xb7,0x01,0x00,0x00,0x7b,0x00,0x00,0x00]
+; CHECK: call f_i16
+  call void @f_i16(i16 123)
+
+; CHECK: mov  r1, 12345678 # encoding: [0xb7,0x01,0x00,0x00,0x4e,0x61,0xbc,0x00]
+; CHECK: call f_i32
+  call void @f_i32(i32 12345678)
+
+; CHECK: ld_64 r1, 72623859790382856 # encoding: [0x18,0x01,0x00,0x00,0x08,0x07,0x06,0x05,0x00,0x00,0x00,0x00,0x04,0x03,0x02,0x01]
+; CHECK: call f_i64
+  call void @f_i64(i64 72623859790382856)
+
+; CHECK: mov  r1, 1234
+; CHECK: mov  r2, 5678
+; CHECK: call f_i32_i32
+  call void @f_i32_i32(i32 1234, i32 5678)
+
+; CHECK: mov  r1, 2
+; CHECK: mov  r2, 3
+; CHECK: mov  r3, 4
+; CHECK: call f_i16_i32_i16
+  call void @f_i16_i32_i16(i16 2, i32 3, i16 4)
+
+; CHECK: mov  r1, 5
+; CHECK: ld_64 r2, 7262385979038285
+; CHECK: mov  r3, 6
+; CHECK: call f_i16_i64_i16
+  call void @f_i16_i64_i16(i16 5, i64 7262385979038285, i16 6)
+
+  ret void
+}
+
+@g_i16 = common global i16 0, align 2
+@g_i32 = common global i32 0, align 2
+@g_i64 = common global i64 0, align 4
+
+define void @f_i16(i16 %a) #0 {
+; CHECK: f_i16:
+; CHECK: sth 0(r2), r1 # encoding: [0x6b,0x12,0x00,0x00,0x00,0x00,0x00,0x00]
+  store volatile i16 %a, i16* @g_i16, align 2
+  ret void
+}
+
+define void @f_i32(i32 %a) #0 {
+; CHECK: f_i32:
+; CHECK: sth 0(r2), r1 # encoding: [0x6b,0x12,0x00,0x00,0x00,0x00,0x00,0x00]
+; CHECK: sth 2(r2), r1 # encoding: [0x6b,0x12,0x02,0x00,0x00,0x00,0x00,0x00]
+  store volatile i32 %a, i32* @g_i32, align 2
+  ret void
+}
+
+define void @f_i64(i64 %a) #0 {
+; CHECK: f_i64:
+; CHECK: stw 0(r2), r1
+; CHECK: stw 4(r2), r1 # encoding: [0x63,0x12,0x04,0x00,0x00,0x00,0x00,0x00]
+  store volatile i64 %a, i64* @g_i64, align 2
+  ret void
+}
+
+define void @f_i32_i32(i32 %a, i32 %b) #0 {
+; CHECK: f_i32_i32:
+; CHECK: stw 0(r3), r1
+  store volatile i32 %a, i32* @g_i32, align 4
+; CHECK: stw 0(r3), r2
+  store volatile i32 %b, i32* @g_i32, align 4
+  ret void
+}
+
+define void @f_i16_i32_i16(i16 %a, i32 %b, i16 %c) #0 {
+; CHECK: f_i16_i32_i16:
+; CHECK: sth 0(r4), r1
+  store volatile i16 %a, i16* @g_i16, align 2
+; CHECK: stw 0(r1), r2
+  store volatile i32 %b, i32* @g_i32, align 4
+; CHECK: sth 0(r4), r3
+  store volatile i16 %c, i16* @g_i16, align 2
+  ret void
+}
+
+define void @f_i16_i64_i16(i16 %a, i64 %b, i16 %c) #0 {
+; CHECK: f_i16_i64_i16:
+; CHECK: sth 0(r4), r1
+  store volatile i16 %a, i16* @g_i16, align 2
+; CHECK: std 0(r1), r2 # encoding: [0x7b,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  store volatile i64 %b, i64* @g_i64, align 8
+; CHECK: sth 0(r4), r3
+  store volatile i16 %c, i16* @g_i16, align 2
+  ret void
+}
diff --git a/test/CodeGen/BPF/cc_ret.ll b/test/CodeGen/BPF/cc_ret.ll
new file mode 100644
index 0000000..e32b17b
--- /dev/null
+++ b/test/CodeGen/BPF/cc_ret.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -march=bpf | FileCheck %s
+
+define void @test() #0 {
+entry:
+; CHECK: test:
+
+; CHECK: call f_i16
+; CHECK: sth 0(r1), r0
+  %0 = call i16 @f_i16()
+  store volatile i16 %0, i16* @g_i16
+
+; CHECK: call f_i32
+; CHECK: stw 0(r1), r0
+  %1 = call i32 @f_i32()
+  store volatile i32 %1, i32* @g_i32
+
+; CHECK: call f_i64
+; CHECK: std 0(r1), r0
+  %2 = call i64 @f_i64()
+  store volatile i64 %2, i64* @g_i64
+
+  ret void
+}
+
+@g_i16 = common global i16 0, align 2
+@g_i32 = common global i32 0, align 2
+@g_i64 = common global i64 0, align 2
+
+define i16 @f_i16() #0 {
+; CHECK: f_i16:
+; CHECK: mov r0, 1
+; CHECK: ret
+  ret i16 1
+}
+
+define i32 @f_i32() #0 {
+; CHECK: f_i32:
+; CHECK: mov r0, 16909060
+; CHECK: ret
+  ret i32 16909060
+}
+
+define i64 @f_i64() #0 {
+; CHECK: f_i64:
+; CHECK: ld_64 r0, 72623859790382856
+; CHECK: ret
+  ret i64 72623859790382856
+}
diff --git a/test/CodeGen/BPF/cmp.ll b/test/CodeGen/BPF/cmp.ll
new file mode 100644
index 0000000..b353f90
--- /dev/null
+++ b/test/CodeGen/BPF/cmp.ll
@@ -0,0 +1,119 @@
+; RUN: llc < %s -march=bpf | FileCheck %s
+
+; Function Attrs: nounwind readnone uwtable
+define signext i8 @foo_cmp1(i8 signext %a, i8 signext %b) #0 {
+  %1 = icmp sgt i8 %a, %b
+  br i1 %1, label %2, label %4
+
+; <label>:2                                       ; preds = %0
+  %3 = mul i8 %b, %a
+  br label %6
+
+; <label>:4                                       ; preds = %0
+  %5 = shl i8 %b, 3
+  br label %6
+
+; <label>:6                                       ; preds = %4, %2
+  %.0 = phi i8 [ %3, %2 ], [ %5, %4 ]
+  ret i8 %.0
+; CHECK-LABEL:foo_cmp1:
+; CHECK: jsge r2, r1
+}
+
+; Function Attrs: nounwind readnone uwtable
+define signext i8 @foo_cmp2(i8 signext %a, i8 signext %b) #0 {
+  %1 = icmp slt i8 %a, %b
+  br i1 %1, label %4, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = mul i8 %b, %a
+  br label %6
+
+; <label>:4                                       ; preds = %0
+  %5 = shl i8 %b, 3
+  br label %6
+
+; <label>:6                                       ; preds = %4, %2
+  %.0 = phi i8 [ %3, %2 ], [ %5, %4 ]
+  ret i8 %.0
+; CHECK-LABEL:foo_cmp2:
+; CHECK: jsgt r2, r1
+}
+
+; Function Attrs: nounwind readnone uwtable
+define signext i8 @foo_cmp3(i8 signext %a, i8 signext %b) #0 {
+  %1 = icmp slt i8 %a, %b
+  br i1 %1, label %2, label %4
+
+; <label>:2                                       ; preds = %0
+  %3 = mul i8 %b, %a
+  br label %6
+
+; <label>:4                                       ; preds = %0
+  %5 = shl i8 %b, 3
+  br label %6
+
+; <label>:6                                       ; preds = %4, %2
+  %.0 = phi i8 [ %3, %2 ], [ %5, %4 ]
+  ret i8 %.0
+; CHECK-LABEL:foo_cmp3:
+; CHECK: jsge r1, r2
+}
+
+; Function Attrs: nounwind readnone uwtable
+define signext i8 @foo_cmp4(i8 signext %a, i8 signext %b) #0 {
+  %1 = icmp sgt i8 %a, %b
+  br i1 %1, label %4, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = mul i8 %b, %a
+  br label %6
+
+; <label>:4                                       ; preds = %0
+  %5 = shl i8 %b, 3
+  br label %6
+
+; <label>:6                                       ; preds = %4, %2
+  %.0 = phi i8 [ %3, %2 ], [ %5, %4 ]
+  ret i8 %.0
+; CHECK-LABEL:foo_cmp4:
+; CHECK: jsgt r1, r2
+}
+
+; Function Attrs: nounwind readnone uwtable
+define signext i8 @min(i8 signext %a, i8 signext %b) #0 {
+  %1 = icmp slt i8 %a, %b
+  %a.b = select i1 %1, i8 %a, i8 %b
+  ret i8 %a.b
+; CHECK-LABEL:min:
+; CHECK: jsgt r2, r1
+; CHECK: mov r1, r2
+; CHECK: mov r0, r1
+}
+
+; Function Attrs: nounwind readnone uwtable
+define zeroext i8 @minu(i8 zeroext %a, i8 zeroext %b) #0 {
+  %1 = icmp ult i8 %a, 100
+  %a.b = select i1 %1, i8 %a, i8 %b
+  ret i8 %a.b
+; CHECK-LABEL:minu:
+; CHECK: jgt r3, r1
+}
+
+; Function Attrs: nounwind readnone uwtable
+define signext i8 @max(i8 signext %a, i8 signext %b) #0 {
+  %1 = icmp sgt i8 %a, %b
+  %a.b = select i1 %1, i8 %a, i8 %b
+  ret i8 %a.b
+; CHECK-LABEL:max:
+; CHECK: jsgt r1, r2
+}
+
+; Function Attrs: nounwind readnone uwtable
+define signext i8 @meq(i8 signext %a, i8 signext %b, i8 signext %c) #0 {
+  %1 = icmp eq i8 %a, %b
+  %c.a = select i1 %1, i8 %c, i8 %a
+  ret i8 %c.a
+; CHECK-LABEL:meq:
+; CHECK: jeq r1, r2
+}
diff --git a/test/CodeGen/BPF/ex1.ll b/test/CodeGen/BPF/ex1.ll
new file mode 100644
index 0000000..5fc1200
--- /dev/null
+++ b/test/CodeGen/BPF/ex1.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -march=bpf | FileCheck %s
+
+%struct.bpf_context = type { i64, i64, i64, i64, i64, i64, i64 }
+%struct.sk_buff = type { i64, i64, i64, i64, i64, i64, i64 }
+%struct.net_device = type { i64, i64, i64, i64, i64, i64, i64 }
+
+@bpf_prog1.devname = private unnamed_addr constant [3 x i8] c"lo\00", align 1
+@bpf_prog1.fmt = private unnamed_addr constant [15 x i8] c"skb %x dev %x\0A\00", align 1
+
+; Function Attrs: nounwind uwtable
+define i32 @bpf_prog1(%struct.bpf_context* nocapture %ctx) #0 section "events/net/netif_receive_skb" {
+  %devname = alloca [3 x i8], align 1
+  %fmt = alloca [15 x i8], align 1
+  %1 = getelementptr inbounds [3 x i8]* %devname, i64 0, i64 0
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* getelementptr inbounds ([3 x i8]* @bpf_prog1.devname, i64 0, i64 0), i64 3, i32 1, i1 false)
+  %2 = getelementptr inbounds %struct.bpf_context* %ctx, i64 0, i32 0
+  %3 = load i64* %2, align 8
+  %4 = inttoptr i64 %3 to %struct.sk_buff*
+  %5 = getelementptr inbounds %struct.sk_buff* %4, i64 0, i32 2
+  %6 = bitcast i64* %5 to i8*
+  %7 = call i8* inttoptr (i64 4 to i8* (i8*)*)(i8* %6) #1
+  %8 = call i32 inttoptr (i64 9 to i32 (i8*, i8*, i32)*)(i8* %7, i8* %1, i32 2) #1
+  %9 = icmp eq i32 %8, 0
+  br i1 %9, label %10, label %13
+
+; <label>:10                                      ; preds = %0
+  %11 = getelementptr inbounds [15 x i8]* %fmt, i64 0, i64 0
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %11, i8* getelementptr inbounds ([15 x i8]* @bpf_prog1.fmt, i64 0, i64 0), i64 15, i32 1, i1 false)
+  %12 = call i32 (i8*, i32, ...)* inttoptr (i64 11 to i32 (i8*, i32, ...)*)(i8* %11, i32 15, %struct.sk_buff* %4, i8* %7) #1
+; CHECK-LABEL: bpf_prog1:
+; CHECK: call 4
+; CHECK: call 9
+; CHECK: jnei r0, 0
+; CHECK: mov r1, 622884453
+; CHECK: ld_64 r1, 7214898703899978611
+; CHECK: call 11
+; CHECK: mov r0, 0
+; CHECK: ret
+  br label %13
+
+; <label>:13                                      ; preds = %10, %0
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #1
diff --git a/test/CodeGen/BPF/intrinsics.ll b/test/CodeGen/BPF/intrinsics.ll
new file mode 100644
index 0000000..9a078fb
--- /dev/null
+++ b/test/CodeGen/BPF/intrinsics.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -march=bpf | FileCheck %s
+
+; Function Attrs: nounwind uwtable
+define i32 @ld_b(i64 %foo, i64* nocapture %bar, i8* %ctx, i8* %ctx2) #0 {
+  %1 = tail call i64 @llvm.bpf.load.byte(i8* %ctx, i64 123) #2
+  %2 = add i64 %1, %foo
+  %3 = load volatile i64* %bar, align 8
+  %4 = add i64 %2, %3
+  %5 = tail call i64 @llvm.bpf.load.byte(i8* %ctx2, i64 %foo) #2
+  %6 = add i64 %4, %5
+  %7 = load volatile i64* %bar, align 8
+  %8 = add i64 %6, %7
+  %9 = trunc i64 %8 to i32
+  ret i32 %9
+; CHECK-LABEL: ld_b:
+; CHECK: ldabs_b r0, r6.data + 123
+; CHECK: ldind_b r0, r6.data
+}
+
+declare i64 @llvm.bpf.load.byte(i8*, i64) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @ld_h(i8* %ctx, i8* %ctx2, i32 %foo) #0 {
+  %1 = tail call i64 @llvm.bpf.load.half(i8* %ctx, i64 123) #2
+  %2 = sext i32 %foo to i64
+  %3 = tail call i64 @llvm.bpf.load.half(i8* %ctx2, i64 %2) #2
+  %4 = add i64 %3, %1
+  %5 = trunc i64 %4 to i32
+  ret i32 %5
+; CHECK-LABEL: ld_h:
+; CHECK: ldind_h r0, r6.data
+; CHECK: ldabs_h r0, r6.data + 123
+}
+
+declare i64 @llvm.bpf.load.half(i8*, i64) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @ld_w(i8* %ctx, i8* %ctx2, i32 %foo) #0 {
+  %1 = tail call i64 @llvm.bpf.load.word(i8* %ctx, i64 123) #2
+  %2 = sext i32 %foo to i64
+  %3 = tail call i64 @llvm.bpf.load.word(i8* %ctx2, i64 %2) #2
+  %4 = add i64 %3, %1
+  %5 = trunc i64 %4 to i32
+  ret i32 %5
+; CHECK-LABEL: ld_w:
+; CHECK: ldind_w r0, r6.data
+; CHECK: ldabs_w r0, r6.data + 123
+}
+
+declare i64 @llvm.bpf.load.word(i8*, i64) #1
diff --git a/test/CodeGen/BPF/lit.local.cfg b/test/CodeGen/BPF/lit.local.cfg
new file mode 100644
index 0000000..a4ab262
--- /dev/null
+++ b/test/CodeGen/BPF/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'BPF' in config.root.targets:
+    config.unsupported = True
diff --git a/test/CodeGen/BPF/load.ll b/test/CodeGen/BPF/load.ll
new file mode 100644
index 0000000..b097435
--- /dev/null
+++ b/test/CodeGen/BPF/load.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=bpf | FileCheck %s
+
+define i16 @am1(i16* %a) nounwind {
+  %1 = load i16* %a
+  ret i16 %1
+}
+; CHECK-LABEL: am1:
+; CHECK: ldh r0, 0(r1)
+
+@foo = external global i16
+
+define i16 @am2() nounwind {
+  %1 = load i16* @foo
+  ret i16 %1
+}
+; CHECK-LABEL: am2:
+; CHECK: ldh r0, 0(r1)
+
+define i16 @am4() nounwind {
+  %1 = load volatile i16* inttoptr(i16 32 to i16*)
+  ret i16 %1
+}
+; CHECK-LABEL: am4:
+; CHECK: mov r1, 32
+; CHECK: ldh r0, 0(r1)
+
+define i16 @am5(i16* %a) nounwind {
+  %1 = getelementptr i16* %a, i16 2
+  %2 = load i16* %1
+  ret i16 %2
+}
+; CHECK-LABEL: am5:
+; CHECK: ldh r0, 4(r1)
+
+%S = type { i16, i16 }
+@baz = common global %S zeroinitializer, align 1
+
+define i16 @am6() nounwind {
+  %1 = load i16* getelementptr (%S* @baz, i32 0, i32 1)
+  ret i16 %1
+}
+; CHECK-LABEL: am6:
+; CHECK: ldh r0, 2(r1)
diff --git a/test/CodeGen/BPF/loops.ll b/test/CodeGen/BPF/loops.ll
new file mode 100644
index 0000000..40bf449
--- /dev/null
+++ b/test/CodeGen/BPF/loops.ll
@@ -0,0 +1,111 @@
+; RUN: llc < %s -march=bpf | FileCheck %s
+
+define zeroext i16 @add(i16* nocapture %a, i16 zeroext %n) nounwind readonly {
+entry:
+  %cmp8 = icmp eq i16 %n, 0                       ; <i1> [#uses=1]
+  br i1 %cmp8, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.010 = phi i16 [ 0, %entry ], [ %inc, %for.body ] ; <i16> [#uses=2]
+  %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
+  %arrayidx = getelementptr i16* %a, i16 %i.010   ; <i16*> [#uses=1]
+; CHECK-LABEL: add:
+; CHECK: add r{{[0-9]+}}, r{{[0-9]+}}
+  %tmp4 = load i16* %arrayidx                     ; <i16> [#uses=1]
+  %add = add i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
+  %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
+  %exitcond = icmp eq i16 %inc, %n                ; <i1> [#uses=1]
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
+  ret i16 %sum.0.lcssa
+}
+
+define zeroext i16 @sub(i16* nocapture %a, i16 zeroext %n) nounwind readonly {
+entry:
+  %cmp8 = icmp eq i16 %n, 0                       ; <i1> [#uses=1]
+  br i1 %cmp8, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.010 = phi i16 [ 0, %entry ], [ %inc, %for.body ] ; <i16> [#uses=2]
+  %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
+  %arrayidx = getelementptr i16* %a, i16 %i.010   ; <i16*> [#uses=1]
+; CHECK-LABEL: sub:
+; CHECK: sub r{{[0-9]+}}, r{{[0-9]+}}
+  %tmp4 = load i16* %arrayidx                     ; <i16> [#uses=1]
+  %add = sub i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
+  %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
+  %exitcond = icmp eq i16 %inc, %n                ; <i1> [#uses=1]
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
+  ret i16 %sum.0.lcssa
+}
+
+define zeroext i16 @or(i16* nocapture %a, i16 zeroext %n) nounwind readonly {
+entry:
+  %cmp8 = icmp eq i16 %n, 0                       ; <i1> [#uses=1]
+  br i1 %cmp8, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.010 = phi i16 [ 0, %entry ], [ %inc, %for.body ] ; <i16> [#uses=2]
+  %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
+  %arrayidx = getelementptr i16* %a, i16 %i.010   ; <i16*> [#uses=1]
+; CHECK-LABEL: or:
+; CHECK: or r{{[0-9]+}}, r{{[0-9]+}}
+  %tmp4 = load i16* %arrayidx                     ; <i16> [#uses=1]
+  %add = or i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
+  %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
+  %exitcond = icmp eq i16 %inc, %n                ; <i1> [#uses=1]
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
+  ret i16 %sum.0.lcssa
+}
+
+define zeroext i16 @xor(i16* nocapture %a, i16 zeroext %n) nounwind readonly {
+entry:
+  %cmp8 = icmp eq i16 %n, 0                       ; <i1> [#uses=1]
+  br i1 %cmp8, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.010 = phi i16 [ 0, %entry ], [ %inc, %for.body ] ; <i16> [#uses=2]
+  %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
+  %arrayidx = getelementptr i16* %a, i16 %i.010   ; <i16*> [#uses=1]
+; CHECK-LABEL: xor:
+; CHECK: xor r{{[0-9]+}}, r{{[0-9]+}}
+  %tmp4 = load i16* %arrayidx                     ; <i16> [#uses=1]
+  %add = xor i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
+  %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
+  %exitcond = icmp eq i16 %inc, %n                ; <i1> [#uses=1]
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
+  ret i16 %sum.0.lcssa
+}
+
+define zeroext i16 @and(i16* nocapture %a, i16 zeroext %n) nounwind readonly {
+entry:
+  %cmp8 = icmp eq i16 %n, 0                       ; <i1> [#uses=1]
+  br i1 %cmp8, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.010 = phi i16 [ 0, %entry ], [ %inc, %for.body ] ; <i16> [#uses=2]
+  %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
+  %arrayidx = getelementptr i16* %a, i16 %i.010   ; <i16*> [#uses=1]
+; CHECK-LABEL: and:
+; CHECK: and r{{[0-9]+}}, r{{[0-9]+}}
+  %tmp4 = load i16* %arrayidx                     ; <i16> [#uses=1]
+  %add = and i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
+  %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
+  %exitcond = icmp eq i16 %inc, %n                ; <i1> [#uses=1]
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
+  ret i16 %sum.0.lcssa
+}
diff --git a/test/CodeGen/BPF/many_args1.ll b/test/CodeGen/BPF/many_args1.ll
new file mode 100644
index 0000000..08218f4
--- /dev/null
+++ b/test/CodeGen/BPF/many_args1.ll
@@ -0,0 +1,12 @@
+; RUN: not llc -march=bpf < %s 2> %t1
+; RUN: FileCheck %s < %t1
+; CHECK: too many args
+
+; Function Attrs: nounwind uwtable
+define i32 @foo(i32 %a, i32 %b, i32 %c) #0 {
+entry:
+  %call = tail call i32 @bar(i32 %a, i32 %b, i32 %c, i32 1, i32 2, i32 3) #3
+  ret i32 %call
+}
+
+declare i32 @bar(i32, i32, i32, i32, i32, i32) #1
diff --git a/test/CodeGen/BPF/many_args2.ll b/test/CodeGen/BPF/many_args2.ll
new file mode 100644
index 0000000..a69886c
--- /dev/null
+++ b/test/CodeGen/BPF/many_args2.ll
@@ -0,0 +1,15 @@
+; RUN: not llc -march=bpf < %s 2> %t1
+; RUN: FileCheck %s < %t1
+; CHECK: too many args
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @bar(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f) #0 {
+entry:
+  ret i32 1
+}
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @foo(i32 %a, i32 %b, i32 %c) #0 {
+entry:
+  ret i32 1
+}
diff --git a/test/CodeGen/BPF/sanity.ll b/test/CodeGen/BPF/sanity.ll
new file mode 100644
index 0000000..db63c07
--- /dev/null
+++ b/test/CodeGen/BPF/sanity.ll
@@ -0,0 +1,117 @@
+; RUN: llc < %s -march=bpf | FileCheck %s
+
+@foo_printf.fmt = private unnamed_addr constant [9 x i8] c"hello  \0A\00", align 1
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @foo_int(i32 %a, i32 %b) #0 {
+  %1 = add nsw i32 %b, %a
+  ret i32 %1
+; CHECK-LABEL: foo_int:
+; CHECK: add  r2, r1
+}
+
+; Function Attrs: nounwind readnone uwtable
+define signext i8 @foo_char(i8 signext %a, i8 signext %b) #0 {
+  %1 = add i8 %b, %a
+  ret i8 %1
+; CHECK-LABEL: foo_char:
+; CHECK: add  r2, r1
+; CHECK: slli  r2, 56
+; CHECK: srai  r2, 56
+}
+
+; Function Attrs: nounwind readnone uwtable
+define i64 @foo_ll(i64 %a, i64 %b, i64 %c) #0 {
+  %1 = add nsw i64 %b, %a
+  %2 = sub i64 %1, %c
+  ret i64 %2
+; CHECK-LABEL: foo_ll:
+; CHECK: add  r2, r1
+; CHECK: sub  r2, r3
+; CHECK: mov  r0, r2
+}
+
+; Function Attrs: nounwind uwtable
+define void @foo_call2(i32 %a, i32 %b) #1 {
+  %1 = trunc i32 %b to i8
+  tail call void @foo_2arg(i8 signext %1, i32 %a) #3
+  ret void
+; CHECK-LABEL: foo_call2:
+; CHECK: slli  r2, 56
+; CHECK: srai  r2, 56
+; CHECK: mov  r1, r2
+}
+
+declare void @foo_2arg(i8 signext, i32) #2
+
+; Function Attrs: nounwind uwtable
+define i32 @foo_call5(i8 signext %a, i16 signext %b, i32 %c, i64 %d) #1 {
+  %1 = tail call i32 @bar(i8 signext %a, i16 signext %b, i32 %c, i64 %d) #3
+  ret i32 0
+; CHECK-LABEL: foo_call5:
+; CHECK: call bar
+}
+
+declare i32 @bar(i8 signext, i16 signext, i32, i64) #2
+
+; Function Attrs: nounwind readnone uwtable
+define signext i8 @foo_cmp(i8 signext %a, i8 signext %b) #0 {
+  %1 = icmp slt i8 %a, %b
+  %a.b = select i1 %1, i8 %a, i8 %b
+  ret i8 %a.b
+; CHECK-LABEL: foo_cmp:
+; CHECK: jsgt  r2, r1
+}
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @foo_muldiv(i8 signext %a, i16 signext %b, i32 %c, i64 %d) #0 {
+  %1 = icmp eq i8 %a, 0
+  br i1 %1, label %5, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = sext i16 %b to i32
+  %4 = mul nsw i32 %3, %c
+  br label %8
+
+; <label>:5                                       ; preds = %0
+  %6 = trunc i64 %d to i32
+  %7 = udiv i32 %6, %c
+  br label %8
+
+; <label>:8                                       ; preds = %5, %2
+  %.0 = phi i32 [ %4, %2 ], [ %7, %5 ]
+  ret i32 %.0
+; CHECK-LABEL: foo_muldiv:
+; CHECK: mul r2, r3
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @foo_optimized() #1 {
+  %1 = tail call i32 @manyarg(i32 1, i32 2, i32 3, i32 4, i32 5) #3
+  ret i32 %1
+; CHECK-LABEL: foo_optimized:
+; CHECK: mov r1, 1
+; CHECK: mov r2, 2
+; CHECK: mov r3, 3
+; CHECK: mov r4, 4
+; CHECK: mov r5, 5
+}
+
+declare i32 @manyarg(i32, i32, i32, i32, i32) #2
+
+; Function Attrs: nounwind uwtable
+define void @foo_printf() #1 {
+  %fmt = alloca [9 x i8], align 1
+  %1 = getelementptr inbounds [9 x i8]* %fmt, i64 0, i64 0
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* getelementptr inbounds ([9 x i8]* @foo_printf.fmt, i64 0, i64 0), i64 9, i32 1, i1 false)
+; CHECK-LABEL: foo_printf:
+; CHECK: ld_64 r1, 729618802566522216
+  %2 = call i32 (i8*, ...)* @printf(i8* %1) #3
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #3
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture, ...) #4
diff --git a/test/CodeGen/BPF/setcc.ll b/test/CodeGen/BPF/setcc.ll
new file mode 100644
index 0000000..eabb6c9
--- /dev/null
+++ b/test/CodeGen/BPF/setcc.ll
@@ -0,0 +1,99 @@
+; RUN: llc -march=bpf < %s | FileCheck %s
+
+define i16 @sccweqand(i16 %a, i16 %b) nounwind {
+  %t1 = and i16 %a, %b
+  %t2 = icmp eq i16 %t1, 0
+  %t3 = zext i1 %t2 to i16
+  ret i16 %t3
+}
+; CHECK-LABEL: sccweqand:
+; CHECK: jeq  r1, r2
+
+define i16 @sccwneand(i16 %a, i16 %b) nounwind {
+  %t1 = and i16 %a, %b
+  %t2 = icmp ne i16 %t1, 0
+  %t3 = zext i1 %t2 to i16
+  ret i16 %t3
+}
+; CHECK-LABEL: sccwneand:
+; CHECK: jne  r1, r2
+
+define i16 @sccwne(i16 %a, i16 %b) nounwind {
+  %t1 = icmp ne i16 %a, %b
+  %t2 = zext i1 %t1 to i16
+  ret i16 %t2
+}
+; CHECK-LABEL:sccwne:
+; CHECK: jne  r1, r2
+
+define i16 @sccweq(i16 %a, i16 %b) nounwind {
+  %t1 = icmp eq i16 %a, %b
+  %t2 = zext i1 %t1 to i16
+  ret i16 %t2
+}
+; CHECK-LABEL:sccweq:
+; CHECK: jeq  r1, r2
+
+define i16 @sccwugt(i16 %a, i16 %b) nounwind {
+  %t1 = icmp ugt i16 %a, %b
+  %t2 = zext i1 %t1 to i16
+  ret i16 %t2
+}
+; CHECK-LABEL:sccwugt:
+; CHECK: jgt  r1, r2
+
+define i16 @sccwuge(i16 %a, i16 %b) nounwind {
+  %t1 = icmp uge i16 %a, %b
+  %t2 = zext i1 %t1 to i16
+  ret i16 %t2
+}
+; CHECK-LABEL:sccwuge:
+; CHECK: jge  r1, r2
+
+define i16 @sccwult(i16 %a, i16 %b) nounwind {
+  %t1 = icmp ult i16 %a, %b
+  %t2 = zext i1 %t1 to i16
+  ret i16 %t2
+}
+; CHECK-LABEL:sccwult:
+; CHECK: jgt  r2, r1
+
+define i16 @sccwule(i16 %a, i16 %b) nounwind {
+  %t1 = icmp ule i16 %a, %b
+  %t2 = zext i1 %t1 to i16
+  ret i16 %t2
+}
+; CHECK-LABEL:sccwule:
+; CHECK: jge  r2, r1
+
+define i16 @sccwsgt(i16 %a, i16 %b) nounwind {
+  %t1 = icmp sgt i16 %a, %b
+  %t2 = zext i1 %t1 to i16
+  ret i16 %t2
+}
+; CHECK-LABEL:sccwsgt:
+; CHECK: jsgt  r1, r2
+
+define i16 @sccwsge(i16 %a, i16 %b) nounwind {
+  %t1 = icmp sge i16 %a, %b
+  %t2 = zext i1 %t1 to i16
+  ret i16 %t2
+}
+; CHECK-LABEL:sccwsge:
+; CHECK: jsge  r1, r2
+
+define i16 @sccwslt(i16 %a, i16 %b) nounwind {
+  %t1 = icmp slt i16 %a, %b
+  %t2 = zext i1 %t1 to i16
+  ret i16 %t2
+}
+; CHECK-LABEL:sccwslt:
+; CHECK: jsgt  r2, r1
+
+define i16 @sccwsle(i16 %a, i16 %b) nounwind {
+  %t1 = icmp sle i16 %a, %b
+  %t2 = zext i1 %t1 to i16
+  ret i16 %t2
+}
+; CHECK-LABEL:sccwsle:
+; CHECK: jsge  r2, r1
diff --git a/test/CodeGen/BPF/shifts.ll b/test/CodeGen/BPF/shifts.ll
new file mode 100644
index 0000000..898ae2d
--- /dev/null
+++ b/test/CodeGen/BPF/shifts.ll
@@ -0,0 +1,101 @@
+; RUN: llc < %s -march=bpf -show-mc-encoding | FileCheck %s
+; test little endian only for now
+
+define zeroext i8 @lshr8(i8 zeroext %a, i8 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK-LABEL: lshr8:
+; CHECK: srl r1, r2 # encoding: [0x7f,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %shr = lshr i8 %a, %cnt
+  ret i8 %shr
+}
+
+define signext i8 @ashr8(i8 signext %a, i8 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK-LABEL: ashr8:
+; CHECK: sra r1, r2 # encoding: [0xcf,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %shr = ashr i8 %a, %cnt
+  ret i8 %shr
+}
+
+define zeroext i8 @shl8(i8 zeroext %a, i8 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK: shl8
+; CHECK: sll r1, r2 # encoding: [0x6f,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %shl = shl i8 %a, %cnt
+  ret i8 %shl
+}
+
+define zeroext i16 @lshr16(i16 zeroext %a, i16 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK-LABEL: lshr16:
+; CHECK: srl r1, r2 # encoding: [0x7f,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %shr = lshr i16 %a, %cnt
+  ret i16 %shr
+}
+
+define signext i16 @ashr16(i16 signext %a, i16 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK-LABEL: ashr16:
+; CHECK: sra r1, r2 # encoding: [0xcf,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %shr = ashr i16 %a, %cnt
+  ret i16 %shr
+}
+
+define zeroext i16 @shl16(i16 zeroext %a, i16 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK-LABEL: shl16:
+; CHECK: sll r1, r2 # encoding: [0x6f,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %shl = shl i16 %a, %cnt
+  ret i16 %shl
+}
+
+define zeroext i32 @lshr32(i32 zeroext %a, i32 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK-LABEL: lshr32:
+; CHECK: srl r1, r2 # encoding: [0x7f,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+; CHECK: slli r1, 32 # encoding: [0x67,0x01,0x00,0x00,0x20,0x00,0x00,0x00]
+  %shr = lshr i32 %a, %cnt
+  ret i32 %shr
+}
+
+define signext i32 @ashr32(i32 signext %a, i32 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK-LABEL: ashr32:
+; CHECK: sra r1, r2 # encoding: [0xcf,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %shr = ashr i32 %a, %cnt
+  ret i32 %shr
+}
+
+define zeroext i32 @shl32(i32 zeroext %a, i32 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK-LABEL: shl32:
+; CHECK: sll r1, r2 # encoding: [0x6f,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %shl = shl i32 %a, %cnt
+  ret i32 %shl
+}
+
+define zeroext i64 @lshr64(i64 zeroext %a, i64 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK-LABEL: lshr64:
+; CHECK: srl r1, r2 # encoding: [0x7f,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %shr = lshr i64 %a, %cnt
+  ret i64 %shr
+}
+
+define signext i64 @ashr64(i64 signext %a, i64 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK-LABEL: ashr64:
+; CHECK: sra r1, r2 # encoding: [0xcf,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+  %shr = ashr i64 %a, %cnt
+  ret i64 %shr
+}
+
+define zeroext i64 @shl64(i64 zeroext %a, i64 zeroext %cnt) nounwind readnone {
+entry:
+; CHECK-LABEL: shl64:
+; CHECK: sll r1, r2 # encoding: [0x6f,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+; CHECK: mov r0, r1 # encoding: [0xbf,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+; CHECK: ret # encoding: [0x95,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+  %shl = shl i64 %a, %cnt
+  ret i64 %shl
+}
diff --git a/test/CodeGen/BPF/sockex2.ll b/test/CodeGen/BPF/sockex2.ll
new file mode 100644
index 0000000..6ae5e1c
--- /dev/null
+++ b/test/CodeGen/BPF/sockex2.ll
@@ -0,0 +1,326 @@
+; RUN: llc < %s -march=bpf -show-mc-encoding | FileCheck %s
+; test little endian only for now
+
+%struct.bpf_map_def = type { i32, i32, i32, i32 }
+%struct.sk_buff = type opaque
+
+@hash_map = global %struct.bpf_map_def { i32 1, i32 4, i32 8, i32 1024 }, section "maps", align 4
+
+; Function Attrs: nounwind uwtable
+define i32 @bpf_prog2(%struct.sk_buff* %skb) #0 section "socket2" {
+  %key = alloca i32, align 4
+  %val = alloca i64, align 8
+  %1 = bitcast %struct.sk_buff* %skb to i8*
+  %2 = call i64 @llvm.bpf.load.half(i8* %1, i64 12) #2
+  %3 = icmp eq i64 %2, 34984
+  br i1 %3, label %4, label %6
+
+; <label>:4                                       ; preds = %0
+  %5 = call i64 @llvm.bpf.load.half(i8* %1, i64 16) #2
+  br label %6
+
+; <label>:6                                       ; preds = %4, %0
+  %proto.0.i = phi i64 [ %5, %4 ], [ %2, %0 ]
+  %nhoff.0.i = phi i64 [ 18, %4 ], [ 14, %0 ]
+  %7 = icmp eq i64 %proto.0.i, 33024
+  br i1 %7, label %8, label %12
+
+; <label>:8                                       ; preds = %6
+  %9 = add i64 %nhoff.0.i, 2
+  %10 = call i64 @llvm.bpf.load.half(i8* %1, i64 %9) #2
+  %11 = add i64 %nhoff.0.i, 4
+  br label %12
+
+; <label>:12                                      ; preds = %8, %6
+  %proto.1.i = phi i64 [ %10, %8 ], [ %proto.0.i, %6 ]
+  %nhoff.1.i = phi i64 [ %11, %8 ], [ %nhoff.0.i, %6 ]
+  switch i64 %proto.1.i, label %flow_dissector.exit.thread [
+    i64 2048, label %13
+    i64 34525, label %39
+  ]
+
+; <label>:13                                      ; preds = %12
+  %14 = add i64 %nhoff.1.i, 6
+  %15 = call i64 @llvm.bpf.load.half(i8* %1, i64 %14) #2
+  %16 = and i64 %15, 16383
+  %17 = icmp eq i64 %16, 0
+  br i1 %17, label %18, label %.thread.i.i
+
+; <label>:18                                      ; preds = %13
+  %19 = add i64 %nhoff.1.i, 9
+  %20 = call i64 @llvm.bpf.load.byte(i8* %1, i64 %19) #2
+  %21 = icmp eq i64 %20, 47
+  br i1 %21, label %28, label %.thread.i.i
+
+.thread.i.i:                                      ; preds = %18, %13
+  %22 = phi i64 [ %20, %18 ], [ 0, %13 ]
+  %23 = add i64 %nhoff.1.i, 12
+  %24 = call i64 @llvm.bpf.load.word(i8* %1, i64 %23) #2
+  %25 = add i64 %nhoff.1.i, 16
+  %26 = call i64 @llvm.bpf.load.word(i8* %1, i64 %25) #2
+  %27 = trunc i64 %26 to i32
+  br label %28
+
+; <label>:28                                      ; preds = %.thread.i.i, %18
+  %29 = phi i32 [ %27, %.thread.i.i ], [ undef, %18 ]
+  %30 = phi i64 [ %22, %.thread.i.i ], [ 47, %18 ]
+  %31 = call i64 @llvm.bpf.load.byte(i8* %1, i64 %nhoff.1.i) #2
+  %32 = icmp eq i64 %31, 69
+  br i1 %32, label %33, label %35
+
+; <label>:33                                      ; preds = %28
+  %34 = add i64 %nhoff.1.i, 20
+  br label %parse_ip.exit.i
+
+; <label>:35                                      ; preds = %28
+  %36 = shl i64 %31, 2
+  %37 = and i64 %36, 60
+  %38 = add i64 %37, %nhoff.1.i
+  br label %parse_ip.exit.i
+
+; <label>:39                                      ; preds = %12
+  %40 = add i64 %nhoff.1.i, 6
+  %41 = call i64 @llvm.bpf.load.byte(i8* %1, i64 %40) #2
+  %42 = add i64 %nhoff.1.i, 8
+  %43 = call i64 @llvm.bpf.load.word(i8* %1, i64 %42) #2
+  %44 = add i64 %nhoff.1.i, 12
+  %45 = call i64 @llvm.bpf.load.word(i8* %1, i64 %44) #2
+  %46 = add i64 %nhoff.1.i, 16
+  %47 = call i64 @llvm.bpf.load.word(i8* %1, i64 %46) #2
+  %48 = add i64 %nhoff.1.i, 20
+  %49 = call i64 @llvm.bpf.load.word(i8* %1, i64 %48) #2
+  %50 = add i64 %nhoff.1.i, 24
+  %51 = call i64 @llvm.bpf.load.word(i8* %1, i64 %50) #2
+  %52 = add i64 %nhoff.1.i, 28
+  %53 = call i64 @llvm.bpf.load.word(i8* %1, i64 %52) #2
+  %54 = add i64 %nhoff.1.i, 32
+  %55 = call i64 @llvm.bpf.load.word(i8* %1, i64 %54) #2
+  %56 = add i64 %nhoff.1.i, 36
+  %57 = call i64 @llvm.bpf.load.word(i8* %1, i64 %56) #2
+  %58 = xor i64 %53, %51
+  %59 = xor i64 %58, %55
+  %60 = xor i64 %59, %57
+  %61 = trunc i64 %60 to i32
+  %62 = add i64 %nhoff.1.i, 40
+  br label %parse_ip.exit.i
+
+parse_ip.exit.i:                                  ; preds = %39, %35, %33
+  %63 = phi i32 [ %61, %39 ], [ %29, %33 ], [ %29, %35 ]
+  %64 = phi i64 [ %41, %39 ], [ %30, %33 ], [ %30, %35 ]
+  %nhoff.2.i = phi i64 [ %62, %39 ], [ %34, %33 ], [ %38, %35 ]
+  switch i64 %64, label %187 [
+    i64 47, label %65
+    i64 4, label %137
+    i64 41, label %163
+  ]
+
+; <label>:65                                      ; preds = %parse_ip.exit.i
+  %66 = call i64 @llvm.bpf.load.half(i8* %1, i64 %nhoff.2.i) #2
+  %67 = add i64 %nhoff.2.i, 2
+  %68 = call i64 @llvm.bpf.load.half(i8* %1, i64 %67) #2
+  %69 = and i64 %66, 1856
+  %70 = icmp eq i64 %69, 0
+  br i1 %70, label %71, label %187
+
+; <label>:71                                      ; preds = %65
+  %72 = lshr i64 %66, 5
+  %73 = and i64 %72, 4
+  %74 = add i64 %nhoff.2.i, 4
+  %..i = add i64 %74, %73
+  %75 = and i64 %66, 32
+  %76 = icmp eq i64 %75, 0
+  %77 = add i64 %..i, 4
+  %nhoff.4.i = select i1 %76, i64 %..i, i64 %77
+  %78 = and i64 %66, 16
+  %79 = icmp eq i64 %78, 0
+  %80 = add i64 %nhoff.4.i, 4
+  %nhoff.4..i = select i1 %79, i64 %nhoff.4.i, i64 %80
+  %81 = icmp eq i64 %68, 33024
+  br i1 %81, label %82, label %86
+
+; <label>:82                                      ; preds = %71
+  %83 = add i64 %nhoff.4..i, 2
+  %84 = call i64 @llvm.bpf.load.half(i8* %1, i64 %83) #2
+  %85 = add i64 %nhoff.4..i, 4
+  br label %86
+
+; <label>:86                                      ; preds = %82, %71
+  %proto.2.i = phi i64 [ %84, %82 ], [ %68, %71 ]
+  %nhoff.6.i = phi i64 [ %85, %82 ], [ %nhoff.4..i, %71 ]
+  switch i64 %proto.2.i, label %flow_dissector.exit.thread [
+    i64 2048, label %87
+    i64 34525, label %113
+  ]
+
+; <label>:87                                      ; preds = %86
+  %88 = add i64 %nhoff.6.i, 6
+  %89 = call i64 @llvm.bpf.load.half(i8* %1, i64 %88) #2
+  %90 = and i64 %89, 16383
+  %91 = icmp eq i64 %90, 0
+  br i1 %91, label %92, label %.thread.i4.i
+
+; <label>:92                                      ; preds = %87
+  %93 = add i64 %nhoff.6.i, 9
+  %94 = call i64 @llvm.bpf.load.byte(i8* %1, i64 %93) #2
+  %95 = icmp eq i64 %94, 47
+  br i1 %95, label %102, label %.thread.i4.i
+
+.thread.i4.i:                                     ; preds = %92, %87
+  %96 = phi i64 [ %94, %92 ], [ 0, %87 ]
+  %97 = add i64 %nhoff.6.i, 12
+  %98 = call i64 @llvm.bpf.load.word(i8* %1, i64 %97) #2
+  %99 = add i64 %nhoff.6.i, 16
+  %100 = call i64 @llvm.bpf.load.word(i8* %1, i64 %99) #2
+  %101 = trunc i64 %100 to i32
+  br label %102
+
+; <label>:102                                     ; preds = %.thread.i4.i, %92
+  %103 = phi i32 [ %101, %.thread.i4.i ], [ %63, %92 ]
+  %104 = phi i64 [ %96, %.thread.i4.i ], [ 47, %92 ]
+  %105 = call i64 @llvm.bpf.load.byte(i8* %1, i64 %nhoff.6.i) #2
+  %106 = icmp eq i64 %105, 69
+  br i1 %106, label %107, label %109
+
+; <label>:107                                     ; preds = %102
+  %108 = add i64 %nhoff.6.i, 20
+  br label %187
+
+; <label>:109                                     ; preds = %102
+  %110 = shl i64 %105, 2
+  %111 = and i64 %110, 60
+  %112 = add i64 %111, %nhoff.6.i
+  br label %187
+
+; <label>:113                                     ; preds = %86
+  %114 = add i64 %nhoff.6.i, 6
+  %115 = call i64 @llvm.bpf.load.byte(i8* %1, i64 %114) #2
+  %116 = add i64 %nhoff.6.i, 8
+  %117 = call i64 @llvm.bpf.load.word(i8* %1, i64 %116) #2
+  %118 = add i64 %nhoff.6.i, 12
+  %119 = call i64 @llvm.bpf.load.word(i8* %1, i64 %118) #2
+  %120 = add i64 %nhoff.6.i, 16
+  %121 = call i64 @llvm.bpf.load.word(i8* %1, i64 %120) #2
+  %122 = add i64 %nhoff.6.i, 20
+  %123 = call i64 @llvm.bpf.load.word(i8* %1, i64 %122) #2
+  %124 = add i64 %nhoff.6.i, 24
+  %125 = call i64 @llvm.bpf.load.word(i8* %1, i64 %124) #2
+  %126 = add i64 %nhoff.6.i, 28
+  %127 = call i64 @llvm.bpf.load.word(i8* %1, i64 %126) #2
+  %128 = add i64 %nhoff.6.i, 32
+  %129 = call i64 @llvm.bpf.load.word(i8* %1, i64 %128) #2
+  %130 = add i64 %nhoff.6.i, 36
+  %131 = call i64 @llvm.bpf.load.word(i8* %1, i64 %130) #2
+  %132 = xor i64 %127, %125
+  %133 = xor i64 %132, %129
+  %134 = xor i64 %133, %131
+  %135 = trunc i64 %134 to i32
+  %136 = add i64 %nhoff.6.i, 40
+  br label %187
+
+; <label>:137                                     ; preds = %parse_ip.exit.i
+  %138 = add i64 %nhoff.2.i, 6
+  %139 = call i64 @llvm.bpf.load.half(i8* %1, i64 %138) #2
+  %140 = and i64 %139, 16383
+  %141 = icmp eq i64 %140, 0
+  br i1 %141, label %142, label %.thread.i1.i
+
+; <label>:142                                     ; preds = %137
+  %143 = add i64 %nhoff.2.i, 9
+  %144 = call i64 @llvm.bpf.load.byte(i8* %1, i64 %143) #2
+  %145 = icmp eq i64 %144, 47
+  br i1 %145, label %152, label %.thread.i1.i
+
+.thread.i1.i:                                     ; preds = %142, %137
+  %146 = phi i64 [ %144, %142 ], [ 0, %137 ]
+  %147 = add i64 %nhoff.2.i, 12
+  %148 = call i64 @llvm.bpf.load.word(i8* %1, i64 %147) #2
+  %149 = add i64 %nhoff.2.i, 16
+  %150 = call i64 @llvm.bpf.load.word(i8* %1, i64 %149) #2
+  %151 = trunc i64 %150 to i32
+  br label %152
+
+; <label>:152                                     ; preds = %.thread.i1.i, %142
+  %153 = phi i32 [ %151, %.thread.i1.i ], [ %63, %142 ]
+  %154 = phi i64 [ %146, %.thread.i1.i ], [ 47, %142 ]
+  %155 = call i64 @llvm.bpf.load.byte(i8* %1, i64 %nhoff.2.i) #2
+  %156 = icmp eq i64 %155, 69
+  br i1 %156, label %157, label %159
+
+; <label>:157                                     ; preds = %152
+  %158 = add i64 %nhoff.2.i, 20
+  br label %187
+
+; <label>:159                                     ; preds = %152
+  %160 = shl i64 %155, 2
+  %161 = and i64 %160, 60
+  %162 = add i64 %161, %nhoff.2.i
+  br label %187
+
+; <label>:163                                     ; preds = %parse_ip.exit.i
+  %164 = add i64 %nhoff.2.i, 6
+  %165 = call i64 @llvm.bpf.load.byte(i8* %1, i64 %164) #2
+  %166 = add i64 %nhoff.2.i, 8
+  %167 = call i64 @llvm.bpf.load.word(i8* %1, i64 %166) #2
+  %168 = add i64 %nhoff.2.i, 12
+  %169 = call i64 @llvm.bpf.load.word(i8* %1, i64 %168) #2
+  %170 = add i64 %nhoff.2.i, 16
+  %171 = call i64 @llvm.bpf.load.word(i8* %1, i64 %170) #2
+  %172 = add i64 %nhoff.2.i, 20
+  %173 = call i64 @llvm.bpf.load.word(i8* %1, i64 %172) #2
+  %174 = add i64 %nhoff.2.i, 24
+  %175 = call i64 @llvm.bpf.load.word(i8* %1, i64 %174) #2
+  %176 = add i64 %nhoff.2.i, 28
+  %177 = call i64 @llvm.bpf.load.word(i8* %1, i64 %176) #2
+  %178 = add i64 %nhoff.2.i, 32
+  %179 = call i64 @llvm.bpf.load.word(i8* %1, i64 %178) #2
+  %180 = add i64 %nhoff.2.i, 36
+  %181 = call i64 @llvm.bpf.load.word(i8* %1, i64 %180) #2
+  %182 = xor i64 %177, %175
+  %183 = xor i64 %182, %179
+  %184 = xor i64 %183, %181
+  %185 = trunc i64 %184 to i32
+  %186 = add i64 %nhoff.2.i, 40
+  br label %187
+
+; <label>:187                                     ; preds = %163, %159, %157, %113, %109, %107, %65, %parse_ip.exit.i
+  %188 = phi i32 [ %63, %parse_ip.exit.i ], [ %185, %163 ], [ %63, %65 ], [ %135, %113 ], [ %103, %107 ], [ %103, %109 ], [ %153, %157 ], [ %153, %159 ]
+  %189 = phi i64 [ %64, %parse_ip.exit.i ], [ %165, %163 ], [ 47, %65 ], [ %115, %113 ], [ %104, %107 ], [ %104, %109 ], [ %154, %157 ], [ %154, %159 ]
+  %nhoff.7.i = phi i64 [ %nhoff.2.i, %parse_ip.exit.i ], [ %186, %163 ], [ %nhoff.2.i, %65 ], [ %136, %113 ], [ %108, %107 ], [ %112, %109 ], [ %158, %157 ], [ %162, %159 ]
+  %cond.i.i = icmp eq i64 %189, 51
+  %190 = select i1 %cond.i.i, i64 4, i64 0
+  %191 = add i64 %190, %nhoff.7.i
+  %192 = call i64 @llvm.bpf.load.word(i8* %1, i64 %191) #2
+  store i32 %188, i32* %key, align 4
+  %193 = bitcast i32* %key to i8*
+  %194 = call i8* inttoptr (i64 1 to i8* (i8*, i8*)*)(i8* bitcast (%struct.bpf_map_def* @hash_map to i8*), i8* %193) #2
+  %195 = icmp eq i8* %194, null
+  br i1 %195, label %199, label %196
+
+; <label>:196                                     ; preds = %187
+  %197 = bitcast i8* %194 to i64*
+  %198 = atomicrmw add i64* %197, i64 1 seq_cst
+  br label %flow_dissector.exit.thread
+
+; <label>:199                                     ; preds = %187
+  store i64 1, i64* %val, align 8
+  %200 = bitcast i64* %val to i8*
+  %201 = call i32 inttoptr (i64 2 to i32 (i8*, i8*, i8*, i64)*)(i8* bitcast (%struct.bpf_map_def* @hash_map to i8*), i8* %193, i8* %200, i64 0) #2
+  br label %flow_dissector.exit.thread
+
+flow_dissector.exit.thread:                       ; preds = %86, %12, %196, %199
+  ret i32 0
+; CHECK-LABEL: bpf_prog2:
+; CHECK: ldabs_h r0, r6.data + 12 # encoding: [0x28,0x00,0x00,0x00,0x0c,0x00,0x00,0x00]
+; CHECK: ldabs_h r0, r6.data + 16 # encoding: [0x28,0x00,0x00,0x00,0x10,0x00,0x00,0x00]
+; CHECK-NOT: implicit
+; CHECK: ld_64   r1
+; CHECK-NOT: ori
+; CHECK: call 1 # encoding: [0x85,0x00,0x00,0x00,0x01,0x00,0x00,0x00]
+; CHECK: call 2 # encoding: [0x85,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+}
+
+declare i64 @llvm.bpf.load.half(i8*, i64) #1
+
+declare i64 @llvm.bpf.load.word(i8*, i64) #1
+
+declare i64 @llvm.bpf.load.byte(i8*, i64) #1
diff --git a/test/CodeGen/BPF/struct_ret1.ll b/test/CodeGen/BPF/struct_ret1.ll
new file mode 100644
index 0000000..1477c56
--- /dev/null
+++ b/test/CodeGen/BPF/struct_ret1.ll
@@ -0,0 +1,17 @@
+; RUN: not llc -march=bpf < %s 2> %t1
+; RUN: FileCheck %s < %t1
+; CHECK: only integer returns
+
+%struct.S = type { i32, i32, i32 }
+
+@s = common global %struct.S zeroinitializer, align 4
+
+; Function Attrs: nounwind readonly uwtable
+define { i64, i32 } @bar(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) #0 {
+entry:
+  %retval.sroa.0.0.copyload = load i64* bitcast (%struct.S* @s to i64*), align 4
+  %retval.sroa.2.0.copyload = load i32* getelementptr inbounds (%struct.S* @s, i64 0, i32 2), align 4
+  %.fca.0.insert = insertvalue { i64, i32 } undef, i64 %retval.sroa.0.0.copyload, 0
+  %.fca.1.insert = insertvalue { i64, i32 } %.fca.0.insert, i32 %retval.sroa.2.0.copyload, 1
+  ret { i64, i32 } %.fca.1.insert
+}
diff --git a/test/CodeGen/BPF/struct_ret2.ll b/test/CodeGen/BPF/struct_ret2.ll
new file mode 100644
index 0000000..9046120
--- /dev/null
+++ b/test/CodeGen/BPF/struct_ret2.ll
@@ -0,0 +1,12 @@
+; RUN: not llc -march=bpf < %s 2> %t1
+; RUN: FileCheck %s < %t1
+; CHECK: only small returns
+
+; Function Attrs: nounwind uwtable
+define { i64, i32 } @foo(i32 %a, i32 %b, i32 %c) #0 {
+entry:
+  %call = tail call { i64, i32 } @bar(i32 %a, i32 %b, i32 %c, i32 1, i32 2) #3
+  ret { i64, i32 } %call
+}
+
+declare { i64, i32 } @bar(i32, i32, i32, i32, i32) #1
diff --git a/test/CodeGen/BPF/vararg1.ll b/test/CodeGen/BPF/vararg1.ll
new file mode 100644
index 0000000..4a22db6
--- /dev/null
+++ b/test/CodeGen/BPF/vararg1.ll
@@ -0,0 +1,9 @@
+; RUN: not llc -march=bpf < %s 2> %t1
+; RUN: FileCheck %s < %t1
+; CHECK: with VarArgs
+
+; Function Attrs: nounwind readnone uwtable
+define void @foo(i32 %a, ...) #0 {
+entry:
+  ret void
+}
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
index 0e98280..83277c9 100644
--- a/test/CodeGen/Generic/MachineBranchProb.ll
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -32,4 +32,4 @@ return:
   ret i32 %retval.0
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64}
+!0 = !{!"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64}
diff --git a/test/CodeGen/Generic/dbg_value.ll b/test/CodeGen/Generic/dbg_value.ll
index 73e41c7..ed7bdba 100644
--- a/test/CodeGen/Generic/dbg_value.ll
+++ b/test/CodeGen/Generic/dbg_value.ll
@@ -4,11 +4,11 @@
 %0 = type { i32, i32 }
 
 define void @t(%0*, i32, i32, i32, i32) nounwind {
-  tail call void @llvm.dbg.value(metadata !{%0* %0}, i64 0, metadata !0, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata %0* %0, i64 0, metadata !0, metadata !{!"0x102"})
   unreachable
 }
 
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 ; !0 should conform to the format of DIVariable.
-!0 = metadata !{metadata !"0x101\00a\000\000", null, null, null} ; [ DW_TAG_arg_variable ]
+!0 = !{!"0x101\00a\000\000", null, null, null} ; [ DW_TAG_arg_variable ]
diff --git a/test/CodeGen/Generic/empty-phi.ll b/test/CodeGen/Generic/empty-phi.ll
new file mode 100644
index 0000000..8d5f3b9
--- /dev/null
+++ b/test/CodeGen/Generic/empty-phi.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s
+
+define void @f() {
+entry:
+  br label %bb1
+
+bb1:
+  %0 = phi [0 x { i8*, i64, i64 }] [ %load, %bb2 ], [ undef, %entry ]
+  store [0 x { i8*, i64, i64 }] %0, [0 x { i8*, i64, i64 }]* undef, align 8
+  %1 = icmp eq i64 undef, 0
+  br i1 %1, label %bb2, label %bb3
+
+bb2:
+  %load = load [0 x { i8*, i64, i64 }]* undef, align 8
+  br label %bb1
+
+bb3:
+  ret void
+}
diff --git a/test/CodeGen/Generic/overloaded-intrinsic-name.ll b/test/CodeGen/Generic/overloaded-intrinsic-name.ll
new file mode 100644
index 0000000..aa6a031
--- /dev/null
+++ b/test/CodeGen/Generic/overloaded-intrinsic-name.ll
@@ -0,0 +1,57 @@
+; RUN: opt -verify -S < %s
+
+; Tests the name mangling performed by the codepath following
+; getMangledTypeStr(). Only tests that code with the various manglings
+; run fine: doesn't actually test the mangling with the type of the
+; arguments. Meant to serve as an example-document on how the user
+; should do name manglings.
+
+; Exercise the most general case, llvm_anyptr_type, using gc.relocate
+; and gc.statepoint. Note that it has nothing to do with gc.*
+; functions specifically: any function that accepts llvm_anyptr_type
+; will serve the purpose.
+
+; function and integer
+define i32* @test_iAny(i32* %v) {
+       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32* %v)
+       %v-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 4)
+       ret i32* %v-new
+}
+
+; float
+define float* @test_fAny(float* %v) {
+       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, float* %v)
+       %v-new = call float* @llvm.experimental.gc.relocate.p0f32(i32 %tok, i32 4, i32 4)
+       ret float* %v-new
+}
+
+; array of integers
+define [3 x i32]* @test_aAny([3 x i32]* %v) {
+       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, [3 x i32]* %v)
+       %v-new = call [3 x i32]* @llvm.experimental.gc.relocate.p0a3i32(i32 %tok, i32 4, i32 4)
+       ret [3 x i32]* %v-new
+}
+
+; vector of integers
+define <3 x i32>* @test_vAny(<3 x i32>* %v) {
+       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, <3 x i32>* %v)
+       %v-new = call <3 x i32>* @llvm.experimental.gc.relocate.p0v3i32(i32 %tok, i32 4, i32 4)
+       ret <3 x i32>* %v-new
+}
+
+%struct.test = type { i32, i1 }
+
+; struct
+define %struct.test* @test_struct(%struct.test* %v) {
+       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, %struct.test* %v)
+       %v-new = call %struct.test* @llvm.experimental.gc.relocate.p0struct.test(i32 %tok, i32 4, i32 4)
+       ret %struct.test* %v-new
+}
+
+declare zeroext i1 @return_i1()
+declare i32 @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()*, i32, i32, ...)
+declare i32* @llvm.experimental.gc.relocate.p0i32(i32, i32, i32)
+declare float* @llvm.experimental.gc.relocate.p0f32(i32, i32, i32)
+declare [3 x i32]* @llvm.experimental.gc.relocate.p0a3i32(i32, i32, i32)
+declare <3 x i32>* @llvm.experimental.gc.relocate.p0v3i32(i32, i32, i32)
+declare %struct.test* @llvm.experimental.gc.relocate.p0struct.test(i32, i32, i32)
diff --git a/test/CodeGen/Generic/print-machineinstrs.ll b/test/CodeGen/Generic/print-machineinstrs.ll
index 75dceb5..26bccaa 100644
--- a/test/CodeGen/Generic/print-machineinstrs.ll
+++ b/test/CodeGen/Generic/print-machineinstrs.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -O3 -debug-pass=Structure -print-machineinstrs= -o /dev/null 2>&1 | FileCheck %s
 
 define i64 @foo(i64 %a, i64 %b) nounwind {
-; CHECK: -branch-folder -print-machineinstrs
+; CHECK: -branch-folder  -machineinstr-printer
 ; CHECK: Control Flow Optimizer
 ; CHECK-NEXT: MachineFunction Printer
 ; CHECK: Machine code for function foo:
diff --git a/test/CodeGen/Hexagon/BranchPredict.ll b/test/CodeGen/Hexagon/BranchPredict.ll
index 4ab1966..5d56449 100644
--- a/test/CodeGen/Hexagon/BranchPredict.ll
+++ b/test/CodeGen/Hexagon/BranchPredict.ll
@@ -72,5 +72,5 @@ return:                                           ; preds = %if.else, %if.then
   ret i32 %retval.0
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
-!1 = metadata !{metadata !"branch_weights", i32 4, i32 64}
+!0 = !{!"branch_weights", i32 64, i32 4}
+!1 = !{!"branch_weights", i32 4, i32 64}
diff --git a/test/CodeGen/Hexagon/always-ext.ll b/test/CodeGen/Hexagon/always-ext.ll
index 9c8d708..93f4240 100644
--- a/test/CodeGen/Hexagon/always-ext.ll
+++ b/test/CodeGen/Hexagon/always-ext.ll
@@ -1,3 +1,4 @@
+; XFAIL:
 ; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
 
 ; Check that we don't generate an invalid packet with too many instructions
@@ -7,7 +8,7 @@
 ; CHECK: {
 ; CHECK-NOT: call abort
 ; CHECK: memw(##0)
-; CHECK: memw(r{{[0-9+]}}<<#2+##4)
+; CHECK: memw(r{{[0-9+]}}<<#2 + ##4)
 ; CHECK: }
 
 %struct.CuTest.1.28.31.37.40.43.52.55.67.85.111 = type { i8*, void (%struct.CuTest.1.28.31.37.40.43.52.55.67.85.111*)*, i32, i32, i8*, [23 x i32]* }
diff --git a/test/CodeGen/Hexagon/block-addr.ll b/test/CodeGen/Hexagon/block-addr.ll
index 54a12bf..dc0d6e6 100644
--- a/test/CodeGen/Hexagon/block-addr.ll
+++ b/test/CodeGen/Hexagon/block-addr.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
 
 ; CHECK: r{{[0-9]+}} = CONST32(#.LJTI{{[0-9]+_[0-9]+}})
-; CHECK: r{{[0-9]+}} = memw(r{{[0-9]+}}+r{{[0-9]+<<#[0-9]+}})
+; CHECK: r{{[0-9]+}} = memw(r{{[0-9]+}} + r{{[0-9]+<<#[0-9]+}})
 ; CHECK: jumpr r{{[0-9]+}}
 
 define void @main() #0 {
diff --git a/test/CodeGen/Hexagon/cext-check.ll b/test/CodeGen/Hexagon/cext-check.ll
index 7c4b19e..b7181d8 100644
--- a/test/CodeGen/Hexagon/cext-check.ll
+++ b/test/CodeGen/Hexagon/cext-check.ll
@@ -2,9 +2,9 @@
 ; Check that we constant extended instructions only when necessary.
 
 define i32 @cext_test1(i32* %a) nounwind {
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(r{{[0-9]+}}+##8000)
+; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(r{{[0-9]+}}{{ *}}+{{ *}}##8000)
 ; CHECK: r{{[0-9]+}}{{ *}}={{ *}}add(r{{[0-9]+}}{{ *}},{{ *}}##300000)
-; CHECK-NOT: r{{[0-9]+}}{{ *}}={{ *}}memw(r{{[0-9]+}}+##4092)
+; CHECK-NOT: r{{[0-9]+}}{{ *}}={{ *}}memw(r{{[0-9]+}}{{ *}}+{{ *}}##4092)
 ; CHECK-NOT: r{{[0-9]+}}{{ *}}={{ *}}add(r{{[0-9]+}}{{ *}},{{ *}}##300)
 entry:
   %0 = load i32* %a, align 4
@@ -29,9 +29,9 @@ return:
 }
 
 define i32 @cext_test2(i8* %a) nounwind {
-; CHECK-NOT: r{{[0-9]+}}{{ *}}={{ *}}memub(r{{[0-9]+}}+##1023)
+; CHECK-NOT: r{{[0-9]+}}{{ *}}={{ *}}memub(r{{[0-9]+}}+{{ *}}##1023)
 ; CHECK: r{{[0-9]+}}{{ *}}={{ *}}add(r{{[0-9]+}}{{ *}},{{ *}}##300000)
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memub(r{{[0-9]+}}+##1024)
+; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memub(r{{[0-9]+}}{{ *}}+{{ *}}##1024)
 ; CHECK-NOT: r{{[0-9]+}}{{ *}}={{ *}}add(r{{[0-9]+}}{{ *}},{{ *}}##6000)
 entry:
   %tobool = icmp ne i8* %a, null
diff --git a/test/CodeGen/Hexagon/cmp-not.ll b/test/CodeGen/Hexagon/cmp-not.ll
deleted file mode 100644
index abcddc38..0000000
--- a/test/CodeGen/Hexagon/cmp-not.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; Check that we generate matching compare insn.
-
-; Function Attrs: nounwind
-define i32 @neqi(i32 %argc) #0 {
-entry:
-  %p = alloca i8, align 1
-  %0 = tail call i1 @llvm.hexagon.C4.cmpneqi(i32 %argc, i32 512)
-  %conv = zext i1 %0 to i8
-  store volatile i8 %conv, i8* %p, align 1
-  %p.0.p.0. = load volatile i8* %p, align 1
-  %conv1 = zext i8 %p.0.p.0. to i32
-  ret i32 %conv1
-}
-; CHECK:	p{{[0-3]}}{{ *}} = !cmp.eq(r{{[0-9]+}}, ##512)
-
-; Function Attrs: nounwind readnone
-declare i1 @llvm.hexagon.C4.cmpneqi(i32, i32) #1
-
-; Function Attrs: nounwind
-define i32 @ngti(i32 %argc) #0 {
-entry:
-  %p = alloca i8, align 1
-  %0 = tail call i1 @llvm.hexagon.C4.cmpltei(i32 %argc, i32 4)
-  %conv = zext i1 %0 to i8
-  store volatile i8 %conv, i8* %p, align 1
-  %p.0.p.0. = load volatile i8* %p, align 1
-  %conv1 = zext i8 %p.0.p.0. to i32
-  ret i32 %conv1
-}
-; CHECK:	p{{[0-3]}}{{ *}} = !cmp.gt(r{{[0-9]+}}, #4)
-
-; Function Attrs: nounwind readnone
-declare i1 @llvm.hexagon.C4.cmpltei(i32, i32) #1
-
-; Function Attrs: nounwind
-define i32 @ngtui(i32 %argc) #0 {
-entry:
-  %p = alloca i8, align 1
-  %0 = tail call i1 @llvm.hexagon.C4.cmplteui(i32 %argc, i32 4)
-  %conv = zext i1 %0 to i8
-  store volatile i8 %conv, i8* %p, align 1
-  %p.0.p.0. = load volatile i8* %p, align 1
-  %conv1 = zext i8 %p.0.p.0. to i32
-  ret i32 %conv1
-}
-; CHECK: 	p{{[0-3]}}{{ *}} = !cmp.gtu(r{{[0-9]+}}, #4)
-
-; Function Attrs: nounwind readnone
-declare i1 @llvm.hexagon.C4.cmplteui(i32, i32) #1
diff --git a/test/CodeGen/Hexagon/cmp-to-predreg.ll b/test/CodeGen/Hexagon/cmp-to-predreg.ll
index d430b90..2b65343 100644
--- a/test/CodeGen/Hexagon/cmp-to-predreg.ll
+++ b/test/CodeGen/Hexagon/cmp-to-predreg.ll
@@ -2,7 +2,7 @@
 ; Check that we generate compare to predicate register.
 
 define i32 @compare1(i32 %a, i32 %b) nounwind {
-; CHECK: p{{[0-3]}}{{ *}}={{ *}}!cmp.eq(r{{[0-9]+}},{{ *}}r{{[0-9]+}})
+; CHECK: p{{[0-3]}}{{ *}}={{ *[!]?}}cmp.eq(r{{[0-9]+}},{{ *}}r{{[0-9]+}})
 entry:
   %cmp = icmp ne i32 %a, %b
   %add = add nsw i32 %a, %b
@@ -12,7 +12,7 @@ entry:
 }
 
 define i32 @compare2(i32 %a) nounwind {
-; CHECK: p{{[0-3]}}{{ *}}={{ *}}!cmp.eq(r{{[0-9]+}},{{ *}}#10)
+; CHECK: p{{[0-3]}}{{ *}}={{ *[!]?}}cmp.eq(r{{[0-9]+}},{{ *}}#10)
 entry:
   %cmp = icmp ne i32 %a, 10
   %add = add nsw i32 %a, 10
diff --git a/test/CodeGen/Hexagon/dadd.ll b/test/CodeGen/Hexagon/dadd.ll
index 602978a..a86a90c 100644
--- a/test/CodeGen/Hexagon/dadd.ll
+++ b/test/CodeGen/Hexagon/dadd.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5  < %s | FileCheck %s
 ; Check that we generate double precision floating point add in V5.
 
-; CHECK: r{{[0-9]+}}:{{[0-9]+}} = dfadd(r{{[0-9]+}}:{{[0-9]+}}, r{{[0-9]+}}:{{[0-9]+}})
+; CHECK: call __hexagon_adddf3
 
 
 define i32 @main() nounwind {
diff --git a/test/CodeGen/Hexagon/dmul.ll b/test/CodeGen/Hexagon/dmul.ll
index d743773..cbe0d7f 100644
--- a/test/CodeGen/Hexagon/dmul.ll
+++ b/test/CodeGen/Hexagon/dmul.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5  < %s | FileCheck %s
 ; Check that we generate double precision floating point multiply in V5.
 
-; CHECK: r{{[0-9]+}}:{{[0-9]+}} = dfmpy(r{{[0-9]+}}:{{[0-9]+}}, r{{[0-9]+}}:{{[0-9]+}})
+; CHECK: call __hexagon_muldf3
 
 define i32 @main() nounwind {
 entry:
diff --git a/test/CodeGen/Hexagon/dsub.ll b/test/CodeGen/Hexagon/dsub.ll
index 4f9d39e..f271492 100644
--- a/test/CodeGen/Hexagon/dsub.ll
+++ b/test/CodeGen/Hexagon/dsub.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5  < %s | FileCheck %s
 ; Check that we generate double precision floating point subtract in V5.
 
-; CHECK: r{{[0-9]+}}:{{[0-9]+}} = dfsub(r{{[0-9]+}}:{{[0-9]+}}, r{{[0-9]+}}:{{[0-9]+}})
+; CHECK: call __hexagon_subdf3
 
 define i32 @main() nounwind {
 entry:
diff --git a/test/CodeGen/Hexagon/dualstore.ll b/test/CodeGen/Hexagon/dualstore.ll
index f7d7e8b..33d9ce9 100644
--- a/test/CodeGen/Hexagon/dualstore.ll
+++ b/test/CodeGen/Hexagon/dualstore.ll
@@ -1,17 +1,12 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s
+; RUN: llc -march=hexagon -disable-hexagon-misched < %s | FileCheck %s
 ; Check that we generate dual stores in one packet in V4
 
-; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}={{ *}}##500000
-; CHECK-NEXT: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}={{ *}}##100000
-; CHECK-NEXT: }
+; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}=
+; CHECK-NEXT: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}=
 
-@Reg = global i32 0, align 4
-define i32 @main() nounwind {
+define i32 @main(i32 %v, i32* %p1, i32* %p2) nounwind {
 entry:
-  %number= alloca i32, align 4
-  store i32 500000, i32* %number, align 4
-  %number1= alloca i32, align 4
-  store i32 100000, i32* %number1, align 4
+  store i32 %v, i32* %p1, align 4
+  store i32 %v, i32* %p2, align 4
   ret i32 0
 }
-
diff --git a/test/CodeGen/Hexagon/hwloop-dbg.ll b/test/CodeGen/Hexagon/hwloop-dbg.ll
index f093dae..3c05884 100644
--- a/test/CodeGen/Hexagon/hwloop-dbg.ll
+++ b/test/CodeGen/Hexagon/hwloop-dbg.ll
@@ -5,9 +5,9 @@ target triple = "hexagon"
 
 define void @foo(i32* nocapture %a, i32* nocapture %b) nounwind {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !17
-  tail call void @llvm.dbg.value(metadata !{i32* %b}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !18
-  tail call void @llvm.dbg.value(metadata !30, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32* %a, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !17
+  tail call void @llvm.dbg.value(metadata i32* %b, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !{!"0x102"}), !dbg !19
   br label %for.body, !dbg !19
 
 for.body:                                         ; preds = %for.body, %entry
@@ -18,11 +18,11 @@ for.body:                                         ; preds = %for.body, %entry
   %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   %b.addr.01 = phi i32* [ %b, %entry ], [ %incdec.ptr, %for.body ]
   %incdec.ptr = getelementptr inbounds i32* %b.addr.01, i32 1, !dbg !21
-  tail call void @llvm.dbg.value(metadata !{i32* %incdec.ptr}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata i32* %incdec.ptr, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !21
   %0 = load i32* %b.addr.01, align 4, !dbg !21
   store i32 %0, i32* %arrayidx.phi, align 4, !dbg !21
   %inc = add nsw i32 %i.02, 1, !dbg !26
-  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata i32 %inc, i64 0, metadata !15, metadata !{!"0x102"}), !dbg !26
   %exitcond = icmp eq i32 %inc, 10, !dbg !19
   %arrayidx.inc = getelementptr i32* %arrayidx.phi, i32 1
   br i1 %exitcond, label %for.end, label %for.body, !dbg !19
@@ -37,28 +37,28 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{metadata !"0x11\0012\00QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)\001\00\000\00\001", metadata !28, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] [DW_LANG_C99]
-!2 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\001\001", metadata !28, null, metadata !7, null, void (i32*, i32*)* @foo, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!6 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9, metadata !9}
-!9 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
-!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!11 = metadata !{metadata !13, metadata !14, metadata !15}
-!13 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [a] [line 1]
-!14 = metadata !{metadata !"0x101\00b\0033554433\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [b] [line 1]
-!15 = metadata !{metadata !"0x100\00i\002\000", metadata !16, metadata !6, metadata !10} ; [ DW_TAG_auto_variable ] [i] [line 2]
-!16 = metadata !{metadata !"0xb\001\0026\000", metadata !28, metadata !5} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
-!17 = metadata !{i32 1, i32 15, metadata !5, null}
-!18 = metadata !{i32 1, i32 23, metadata !5, null}
-!19 = metadata !{i32 3, i32 8, metadata !20, null}
-!20 = metadata !{metadata !"0xb\003\003\001", metadata !28, metadata !16} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
-!21 = metadata !{i32 4, i32 5, metadata !22, null}
-!22 = metadata !{metadata !"0xb\003\0028\002", metadata !28, metadata !20} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
-!26 = metadata !{i32 3, i32 23, metadata !20, null}
-!27 = metadata !{i32 6, i32 1, metadata !16, null}
-!28 = metadata !{metadata !"hwloop-dbg.c", metadata !"/usr2/kparzysz/s.hex/t"}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!30 = metadata !{i32 0}
+!0 = !{!"0x11\0012\00QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)\001\00\000\00\001", !28, !2, !2, !3, !2, null} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] [DW_LANG_C99]
+!2 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\001\001", !28, null, !7, null, void (i32*, i32*)* @foo, null, null, !11} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!6 = !{!"0x29", !28} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9, !9}
+!9 = !{!"0xf\00\000\0032\0032\000\000", null, null, !10} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
+!10 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = !{!13, !14, !15}
+!13 = !{!"0x101\00a\0016777217\000", !5, !6, !9} ; [ DW_TAG_arg_variable ] [a] [line 1]
+!14 = !{!"0x101\00b\0033554433\000", !5, !6, !9} ; [ DW_TAG_arg_variable ] [b] [line 1]
+!15 = !{!"0x100\00i\002\000", !16, !6, !10} ; [ DW_TAG_auto_variable ] [i] [line 2]
+!16 = !{!"0xb\001\0026\000", !28, !5} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
+!17 = !MDLocation(line: 1, column: 15, scope: !5)
+!18 = !MDLocation(line: 1, column: 23, scope: !5)
+!19 = !MDLocation(line: 3, column: 8, scope: !20)
+!20 = !{!"0xb\003\003\001", !28, !16} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
+!21 = !MDLocation(line: 4, column: 5, scope: !22)
+!22 = !{!"0xb\003\0028\002", !28, !20} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
+!26 = !MDLocation(line: 3, column: 23, scope: !20)
+!27 = !MDLocation(line: 6, column: 1, scope: !16)
+!28 = !{!"hwloop-dbg.c", !"/usr2/kparzysz/s.hex/t"}
+!29 = !{i32 1, !"Debug Info Version", i32 2}
+!30 = !{i32 0}
diff --git a/test/CodeGen/Hexagon/idxload-with-zero-offset.ll b/test/CodeGen/Hexagon/idxload-with-zero-offset.ll
index ca6df88..fbf1a3a 100644
--- a/test/CodeGen/Hexagon/idxload-with-zero-offset.ll
+++ b/test/CodeGen/Hexagon/idxload-with-zero-offset.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; Check that we generate load instruction with (base + register offset << 0)
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Check that we generate load instruction with (base + register offset << x)
 
 ; load word
 
-define i32 @load_w(i32* nocapture %a, i32 %n) nounwind {
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
+define i32 @load_w(i32* nocapture %a, i32 %n, i32 %m) nounwind {
+; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(r{{[0-9]+}}{{ *}}+{{ *}}r{{[0-9]+}}{{ *}}<<{{ *}}#2)
 entry:
-  %tmp = shl i32 %n, 4
+  %tmp = add i32 %n, %m
   %scevgep9 = getelementptr i32* %a, i32 %tmp
   %val = load i32* %scevgep9, align 4
   ret i32 %val
@@ -14,10 +14,10 @@ entry:
 
 ; load unsigned half word
 
-define i16 @load_uh(i16* nocapture %a, i32 %n) nounwind {
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memuh(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
+define i16 @load_uh(i16* nocapture %a, i32 %n, i32 %m) nounwind {
+; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memuh(r{{[0-9]+}}{{ *}}+{{ *}}r{{[0-9]+}}{{ *}}<<#1)
 entry:
-  %tmp = shl i32 %n, 4
+  %tmp = add i32 %n, %m
   %scevgep9 = getelementptr i16* %a, i32 %tmp
   %val = load i16* %scevgep9, align 2
   ret i16 %val
@@ -25,10 +25,10 @@ entry:
 
 ; load signed half word
 
-define i32 @load_h(i16* nocapture %a, i32 %n) nounwind {
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memh(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
+define i32 @load_h(i16* nocapture %a, i32 %n, i32 %m) nounwind {
+; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memh(r{{[0-9]+}}{{ *}}+{{ *}}r{{[0-9]+}}{{ *}}<<#1)
 entry:
-  %tmp = shl i32 %n, 4
+  %tmp = add i32 %n, %m
   %scevgep9 = getelementptr i16* %a, i32 %tmp
   %val = load i16* %scevgep9, align 2
   %conv = sext i16 %val to i32
@@ -37,10 +37,10 @@ entry:
 
 ; load unsigned byte
 
-define i8 @load_ub(i8* nocapture %a, i32 %n) nounwind {
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memub(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
+define i8 @load_ub(i8* nocapture %a, i32 %n, i32 %m) nounwind {
+; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memub(r{{[0-9]+}}{{ *}}+{{ *}}r{{[0-9]+}}{{ *}}<<#0)
 entry:
-  %tmp = shl i32 %n, 4
+  %tmp = add i32 %n, %m
   %scevgep9 = getelementptr i8* %a, i32 %tmp
   %val = load i8* %scevgep9, align 1
   ret i8 %val
@@ -48,10 +48,10 @@ entry:
 
 ; load signed byte
 
-define i32 @foo_2(i8* nocapture %a, i32 %n) nounwind {
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memb(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
+define i32 @foo_2(i8* nocapture %a, i32 %n, i32 %m) nounwind {
+; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memb(r{{[0-9]+}}{{ *}}+{{ *}}r{{[0-9]+}}{{ *}}<<{{ *}}#0)
 entry:
-  %tmp = shl i32 %n, 4
+  %tmp = add i32 %n, %m
   %scevgep9 = getelementptr i8* %a, i32 %tmp
   %val = load i8* %scevgep9, align 1
   %conv = sext i8 %val to i32
@@ -60,10 +60,10 @@ entry:
 
 ; load doubleword
 
-define i64 @load_d(i64* nocapture %a, i32 %n) nounwind {
-; CHECK: r{{[0-9]+}}:{{[0-9]+}}{{ *}}={{ *}}memd(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
+define i64 @load_d(i64* nocapture %a, i32 %n, i32 %m) nounwind {
+; CHECK: r{{[0-9]+}}:{{[0-9]+}}{{ *}}={{ *}}memd(r{{[0-9]+}}{{ *}}+{{ *}}r{{[0-9]+}}{{ *}}<<{{ *}}#3)
 entry:
-  %tmp = shl i32 %n, 4
+  %tmp = add i32 %n, %m
   %scevgep9 = getelementptr i64* %a, i32 %tmp
   %val = load i64* %scevgep9, align 8
   ret i64 %val
diff --git a/test/CodeGen/Hexagon/intrinsics/alu32_alu.ll b/test/CodeGen/Hexagon/intrinsics/alu32_alu.ll
new file mode 100644
index 0000000..37f9f40
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/alu32_alu.ll
@@ -0,0 +1,202 @@
+; RUN: llc -march=hexagon -O0 < %s | FileCheck %s
+; Hexagon Programmer's Reference Manual 11.1.1 ALU32/ALU
+
+; Add
+declare i32 @llvm.hexagon.A2.addi(i32, i32)
+define i32 @A2_addi(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.addi(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0, #0)
+
+declare i32 @llvm.hexagon.A2.add(i32, i32)
+define i32 @A2_add(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.add(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0, r1)
+
+declare i32 @llvm.hexagon.A2.addsat(i32, i32)
+define i32 @A2_addsat(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addsat(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0, r1):sat
+
+; Logical operations
+declare i32 @llvm.hexagon.A2.and(i32, i32)
+define i32 @A2_and(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.and(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = and(r0, r1)
+
+declare i32 @llvm.hexagon.A2.or(i32, i32)
+define i32 @A2_or(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.or(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = or(r0, r1)
+
+declare i32 @llvm.hexagon.A2.xor(i32, i32)
+define i32 @A2_xor(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.xor(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = xor(r0, r1)
+
+declare i32 @llvm.hexagon.A4.andn(i32, i32)
+define i32 @A4_andn(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.andn(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = and(r0, ~r1)
+
+declare i32 @llvm.hexagon.A4.orn(i32, i32)
+define i32 @A4_orn(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.orn(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = or(r0, ~r1)
+
+; Nop
+declare void @llvm.hexagon.A2.nop()
+define void @A2_nop(i32 %a, i32 %b) {
+  call void @llvm.hexagon.A2.nop()
+  ret void
+}
+; CHECK: nop
+
+; Subtract
+declare i32 @llvm.hexagon.A2.sub(i32, i32)
+define i32 @A2_sub(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.sub(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0, r1)
+
+declare i32 @llvm.hexagon.A2.subsat(i32, i32)
+define i32 @A2_subsat(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subsat(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0, r1):sat
+
+; Sign extend
+declare i32 @llvm.hexagon.A2.sxtb(i32)
+define i32 @A2_sxtb(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.sxtb(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = sxtb(r0)
+
+declare i32 @llvm.hexagon.A2.sxth(i32)
+define i32 @A2_sxth(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.sxth(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = sxth(r0)
+
+; Transfer immediate
+declare i32 @llvm.hexagon.A2.tfril(i32, i32)
+define i32 @A2_tfril(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.tfril(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0.l = #0
+
+declare i32 @llvm.hexagon.A2.tfrih(i32, i32)
+define i32 @A2_tfrih(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.tfrih(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0.h = #0
+
+declare i32 @llvm.hexagon.A2.tfrsi(i32)
+define i32 @A2_tfrsi() {
+  %z = call i32 @llvm.hexagon.A2.tfrsi(i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = #0
+
+; Transfer register
+declare i32 @llvm.hexagon.A2.tfr(i32)
+define i32 @A2_tfr(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.tfr(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = r0
+
+; Vector add halfwords
+declare i32 @llvm.hexagon.A2.svaddh(i32, i32)
+define i32 @A2_svaddh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.svaddh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vaddh(r0, r1)
+
+declare i32 @llvm.hexagon.A2.svaddhs(i32, i32)
+define i32 @A2_svaddhs(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.svaddhs(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vaddh(r0, r1):sat
+
+declare i32 @llvm.hexagon.A2.svadduhs(i32, i32)
+define i32 @A2_svadduhs(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.svadduhs(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vadduh(r0, r1):sat
+
+; Vector average halfwords
+declare i32 @llvm.hexagon.A2.svavgh(i32, i32)
+define i32 @A2_svavgh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.svavgh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vavgh(r0, r1)
+
+declare i32 @llvm.hexagon.A2.svavghs(i32, i32)
+define i32 @A2_svavghs(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.svavghs(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vavgh(r0, r1):rnd
+
+declare i32 @llvm.hexagon.A2.svnavgh(i32, i32)
+define i32 @A2_svnavgh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.svnavgh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vnavgh(r0, r1)
+
+; Vector subtract halfwords
+declare i32 @llvm.hexagon.A2.svsubh(i32, i32)
+define i32 @A2_svsubh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.svsubh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vsubh(r0, r1)
+
+declare i32 @llvm.hexagon.A2.svsubhs(i32, i32)
+define i32 @A2_svsubhs(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.svsubhs(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vsubh(r0, r1):sat
+
+declare i32 @llvm.hexagon.A2.svsubuhs(i32, i32)
+define i32 @A2_svsubuhs(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.svsubuhs(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vsubuh(r0, r1):sat
+
+; Zero extend
+declare i32 @llvm.hexagon.A2.zxth(i32)
+define i32 @A2_zxth(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.zxth(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = zxth(r0)
diff --git a/test/CodeGen/Hexagon/intrinsics/alu32_perm.ll b/test/CodeGen/Hexagon/intrinsics/alu32_perm.ll
new file mode 100644
index 0000000..a9cc01c
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/alu32_perm.ll
@@ -0,0 +1,104 @@
+; RUN: llc -march=hexagon -O0 < %s | FileCheck %s
+; Hexagon Programmer's Reference Manual 11.1.2 ALU32/PERM
+
+; Combine words into doubleword
+declare i64 @llvm.hexagon.A4.combineri(i32, i32)
+define i64 @A4_combineri(i32 %a) {
+  %z = call i64 @llvm.hexagon.A4.combineri(i32 %a, i32 0)
+  ret i64 %z
+}
+; CHECK:  = combine(r0, #0)
+
+declare i64 @llvm.hexagon.A4.combineir(i32, i32)
+define i64 @A4_combineir(i32 %a) {
+  %z = call i64 @llvm.hexagon.A4.combineir(i32 0, i32 %a)
+  ret i64 %z
+}
+; CHECK:  = combine(#0, r0)
+
+declare i64 @llvm.hexagon.A2.combineii(i32, i32)
+define i64 @A2_combineii() {
+  %z = call i64 @llvm.hexagon.A2.combineii(i32 0, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = combine(#0, #0)
+
+declare i32 @llvm.hexagon.A2.combine.hh(i32, i32)
+define i32 @A2_combine_hh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.combine.hh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = combine(r0.h, r1.h)
+
+declare i32 @llvm.hexagon.A2.combine.hl(i32, i32)
+define i32 @A2_combine_hl(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.combine.hl(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = combine(r0.h, r1.l)
+
+declare i32 @llvm.hexagon.A2.combine.lh(i32, i32)
+define i32 @A2_combine_lh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.combine.lh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = combine(r0.l, r1.h)
+
+declare i32 @llvm.hexagon.A2.combine.ll(i32, i32)
+define i32 @A2_combine_ll(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.combine.ll(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = combine(r0.l, r1.l)
+
+declare i64 @llvm.hexagon.A2.combinew(i32, i32)
+define i64 @A2_combinew(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.A2.combinew(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = combine(r0, r1)
+
+; Mux
+declare i32 @llvm.hexagon.C2.muxri(i32, i32, i32)
+define i32 @C2_muxri(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.C2.muxri(i32 %a, i32 0, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mux(p0, #0, r1)
+
+declare i32 @llvm.hexagon.C2.muxir(i32, i32, i32)
+define i32 @C2_muxir(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.C2.muxir(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = mux(p0, r1, #0)
+
+declare i32 @llvm.hexagon.C2.mux(i32, i32, i32)
+define i32 @C2_mux(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.C2.mux(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 = mux(p0, r1, r2)
+
+; Shift word by 16
+declare i32 @llvm.hexagon.A2.aslh(i32)
+define i32 @A2_aslh(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.aslh(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = aslh(r0)
+
+declare i32 @llvm.hexagon.A2.asrh(i32)
+define i32 @A2_asrh(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.asrh(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = asrh(r0)
+
+; Pack high and low halfwords
+declare i64 @llvm.hexagon.S2.packhl(i32, i32)
+define i64 @S2_packhl(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S2.packhl(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = packhl(r0, r1)
diff --git a/test/CodeGen/Hexagon/intrinsics/cr.ll b/test/CodeGen/Hexagon/intrinsics/cr.ll
new file mode 100644
index 0000000..9bdcb25
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/cr.ll
@@ -0,0 +1,132 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Hexagon Programmer's Reference Manual 11.2 CR
+
+; Corner detection acceleration
+declare i32 @llvm.hexagon.C4.fastcorner9(i32, i32)
+define i32 @C4_fastcorner9(i32 %a, i32 %b) {
+  %z = call i32@llvm.hexagon.C4.fastcorner9(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = fastcorner9(p0, p1)
+
+declare i32 @llvm.hexagon.C4.fastcorner9.not(i32, i32)
+define i32 @C4_fastcorner9_not(i32 %a, i32 %b) {
+  %z = call i32@llvm.hexagon.C4.fastcorner9.not(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = !fastcorner9(p0, p1)
+
+; Logical reductions on predicates
+declare i32 @llvm.hexagon.C2.any8(i32)
+define i32 @C2_any8(i32 %a) {
+  %z = call i32@llvm.hexagon.C2.any8(i32 %a)
+  ret i32 %z
+}
+; CHECK: p0 = any8(p0)
+
+declare i32 @llvm.hexagon.C2.all8(i32)
+define i32 @C2_all8(i32 %a) {
+  %z = call i32@llvm.hexagon.C2.all8(i32 %a)
+  ret i32 %z
+}
+
+; CHECK: p0 = all8(p0)
+
+; Logical operations on predicates
+declare i32 @llvm.hexagon.C2.and(i32, i32)
+define i32 @C2_and(i32 %a, i32 %b) {
+  %z = call i32@llvm.hexagon.C2.and(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = and(p0, p1)
+
+declare i32 @llvm.hexagon.C4.and.and(i32, i32, i32)
+define i32 @C4_and_and(i32 %a, i32 %b, i32 %c) {
+  %z = call i32@llvm.hexagon.C4.and.and(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: p0 = and(p0, and(p1, p2))
+
+declare i32 @llvm.hexagon.C2.or(i32, i32)
+define i32 @C2_or(i32 %a, i32 %b) {
+  %z = call i32@llvm.hexagon.C2.or(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = or(p0, p1)
+
+declare i32 @llvm.hexagon.C4.and.or(i32, i32, i32)
+define i32 @C4_and_or(i32 %a, i32 %b, i32 %c) {
+  %z = call i32@llvm.hexagon.C4.and.or(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: p0 = and(p0, or(p1, p2))
+
+declare i32 @llvm.hexagon.C2.xor(i32, i32)
+define i32 @C2_xor(i32 %a, i32 %b) {
+  %z = call i32@llvm.hexagon.C2.xor(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = xor(p0, p1)
+
+declare i32 @llvm.hexagon.C4.or.and(i32, i32, i32)
+define i32 @C4_or_and(i32 %a, i32 %b, i32 %c) {
+  %z = call i32@llvm.hexagon.C4.or.and(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: p0 = or(p0, and(p1, p2))
+
+declare i32 @llvm.hexagon.C2.andn(i32, i32)
+define i32 @C2_andn(i32 %a, i32 %b) {
+  %z = call i32@llvm.hexagon.C2.andn(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = and(p0, !p1)
+
+declare i32 @llvm.hexagon.C4.or.or(i32, i32, i32)
+define i32 @C4_or_or(i32 %a, i32 %b, i32 %c) {
+  %z = call i32@llvm.hexagon.C4.or.or(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: p0 = or(p0, or(p1, p2))
+
+declare i32 @llvm.hexagon.C4.and.andn(i32, i32, i32)
+define i32 @C4_and_andn(i32 %a, i32 %b, i32 %c) {
+  %z = call i32@llvm.hexagon.C4.and.andn(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: p0 = and(p0, and(p1, !p2))
+
+declare i32 @llvm.hexagon.C4.and.orn(i32, i32, i32)
+define i32 @C4_and_orn(i32 %a, i32 %b, i32 %c) {
+  %z = call i32@llvm.hexagon.C4.and.orn(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: p0 = and(p0, or(p1, !p2))
+
+declare i32 @llvm.hexagon.C2.not(i32)
+define i32 @C2_not(i32 %a) {
+  %z = call i32@llvm.hexagon.C2.not(i32 %a)
+  ret i32 %z
+}
+; CHECK: p0 = not(p0)
+
+declare i32 @llvm.hexagon.C4.or.andn(i32, i32, i32)
+define i32 @C4_or_andn(i32 %a, i32 %b, i32 %c) {
+  %z = call i32@llvm.hexagon.C4.or.andn(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: p0 = or(p0, and(p1, !p2))
+
+declare i32 @llvm.hexagon.C2.orn(i32, i32)
+define i32 @C2_orn(i32 %a, i32 %b) {
+  %z = call i32@llvm.hexagon.C2.orn(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = or(p0, !p1)
+
+declare i32 @llvm.hexagon.C4.or.orn(i32, i32, i32)
+define i32 @C4_or_orn(i32 %a, i32 %b, i32 %c) {
+  %z = call i32@llvm.hexagon.C4.or.orn(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: p0 = or(p0, or(p1, !p2))
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_alu.ll b/test/CodeGen/Hexagon/intrinsics/xtype_alu.ll
new file mode 100644
index 0000000..4a11112
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_alu.ll
@@ -0,0 +1,1020 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O0 < %s | FileCheck %s
+; Hexagon Programmer's Reference Manual 11.10.1 XTYPE/ALU
+
+; Absolute value doubleword
+declare i64 @llvm.hexagon.A2.absp(i64)
+define i64 @A2_absp(i64 %a) {
+  %z = call i64 @llvm.hexagon.A2.absp(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = abs(r1:0)
+
+; Absolute value word
+declare i32 @llvm.hexagon.A2.abs(i32)
+define i32 @A2_abs(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.abs(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = abs(r0)
+
+declare i32 @llvm.hexagon.A2.abssat(i32)
+define i32 @A2_abssat(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.abssat(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = abs(r0):sat
+
+; Add and accumulate
+declare i32 @llvm.hexagon.S4.addaddi(i32, i32, i32)
+define i32 @S4_addaddi(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S4.addaddi(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0, add(r1, #0))
+
+declare i32 @llvm.hexagon.S4.subaddi(i32, i32, i32)
+define i32 @S4_subaddi(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S4.subaddi(i32 %a, i32 0, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0, sub(#0, r1))
+
+declare i32 @llvm.hexagon.M2.accii(i32, i32, i32)
+define i32 @M2_accii(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.accii(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 += add(r1, #0)
+
+declare i32 @llvm.hexagon.M2.naccii(i32, i32, i32)
+define i32 @M2_naccii(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.naccii(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 -= add(r1, #0)
+
+declare i32 @llvm.hexagon.M2.acci(i32, i32, i32)
+define i32 @M2_acci(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.acci(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += add(r1, r2)
+
+declare i32 @llvm.hexagon.M2.nacci(i32, i32, i32)
+define i32 @M2_nacci(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.nacci(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= add(r1, r2)
+
+; Add doublewords
+declare i64 @llvm.hexagon.A2.addp(i64, i64)
+define i64 @A2_addp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.addp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = add(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.addpsat(i64, i64)
+define i64 @A2_addpsat(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.addpsat(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = add(r1:0, r3:2):sat
+
+; Add halfword
+declare i32 @llvm.hexagon.A2.addh.l16.ll(i32, i32)
+define i32 @A2_addh_l16_ll(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.l16.ll(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.l, r1.l)
+
+declare i32 @llvm.hexagon.A2.addh.l16.hl(i32, i32)
+define i32 @A2_addh_l16_hl(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.l16.hl(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.l, r1.h)
+
+declare i32 @llvm.hexagon.A2.addh.l16.sat.ll(i32, i32)
+define i32 @A2_addh_l16_sat.ll(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.l16.sat.ll(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.l, r1.l):sat
+
+declare i32 @llvm.hexagon.A2.addh.l16.sat.hl(i32, i32)
+define i32 @A2_addh_l16_sat.hl(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.l16.sat.hl(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.l, r1.h):sat
+
+declare i32 @llvm.hexagon.A2.addh.h16.ll(i32, i32)
+define i32 @A2_addh_h16_ll(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.h16.ll(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.l, r1.l):<<16
+
+declare i32 @llvm.hexagon.A2.addh.h16.lh(i32, i32)
+define i32 @A2_addh_h16_lh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.h16.lh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.l, r1.h):<<16
+
+declare i32 @llvm.hexagon.A2.addh.h16.hl(i32, i32)
+define i32 @A2_addh_h16_hl(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.h16.hl(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.h, r1.l):<<16
+
+declare i32 @llvm.hexagon.A2.addh.h16.hh(i32, i32)
+define i32 @A2_addh_h16_hh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.h16.hh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.h, r1.h):<<16
+
+declare i32 @llvm.hexagon.A2.addh.h16.sat.ll(i32, i32)
+define i32 @A2_addh_h16_sat_ll(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.h16.sat.ll(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.l, r1.l):sat:<<16
+
+declare i32 @llvm.hexagon.A2.addh.h16.sat.lh(i32, i32)
+define i32 @A2_addh_h16_sat_lh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.h16.sat.lh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.l, r1.h):sat:<<16
+
+declare i32 @llvm.hexagon.A2.addh.h16.sat.hl(i32, i32)
+define i32 @A2_addh_h16_sat_hl(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.h16.sat.hl(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.h, r1.l):sat:<<16
+
+declare i32 @llvm.hexagon.A2.addh.h16.sat.hh(i32, i32)
+define i32 @A2_addh_h16_sat_hh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.addh.h16.sat.hh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0.h, r1.h):sat:<<16
+
+; Logical doublewords
+declare i64 @llvm.hexagon.A2.notp(i64)
+define i64 @A2_notp(i64 %a) {
+  %z = call i64 @llvm.hexagon.A2.notp(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = not(r1:0)
+
+declare i64 @llvm.hexagon.A2.andp(i64, i64)
+define i64 @A2_andp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.andp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = and(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A4.andnp(i64, i64)
+define i64 @A2_andnp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A4.andnp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = and(r1:0, ~r3:2)
+
+declare i64 @llvm.hexagon.A2.orp(i64, i64)
+define i64 @A2_orp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.orp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = or(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A4.ornp(i64, i64)
+define i64 @A2_ornp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A4.ornp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = or(r1:0, ~r3:2)
+
+declare i64 @llvm.hexagon.A2.xorp(i64, i64)
+define i64 @A2_xorp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.xorp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = xor(r1:0, r3:2)
+
+; Logical-logical doublewords
+declare i64 @llvm.hexagon.M4.xor.xacc(i64, i64, i64)
+define i64 @M4_xor_xacc(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M4.xor.xacc(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 ^= xor(r3:2, r5:4)
+
+; Logical-logical words
+declare i32 @llvm.hexagon.S4.or.andi(i32, i32, i32)
+define i32 @S4_or_andi(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S4.or.andi(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 |= and(r1, #0)
+
+declare i32 @llvm.hexagon.S4.or.andix(i32, i32, i32)
+define i32 @S4_or_andix(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S4.or.andix(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r1 = or(r0, and(r1, #0))
+
+declare i32 @llvm.hexagon.M4.or.andn(i32, i32, i32)
+define i32 @M4_or_andn(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.or.andn(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 |= and(r1, ~r2)
+
+declare i32 @llvm.hexagon.M4.and.andn(i32, i32, i32)
+define i32 @M4_and_andn(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.and.andn(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 &= and(r1, ~r2)
+
+declare i32 @llvm.hexagon.M4.xor.andn(i32, i32, i32)
+define i32 @M4_xor_andn(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.xor.andn(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 ^= and(r1, ~r2)
+
+declare i32 @llvm.hexagon.M4.and.and(i32, i32, i32)
+define i32 @M4_and_and(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.and.and(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 &= and(r1, r2)
+
+declare i32 @llvm.hexagon.M4.and.or(i32, i32, i32)
+define i32 @M4_and_or(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.and.or(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 &= or(r1, r2)
+
+declare i32 @llvm.hexagon.M4.and.xor(i32, i32, i32)
+define i32 @M4_and_xor(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.and.xor(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 &= xor(r1, r2)
+
+declare i32 @llvm.hexagon.M4.or.and(i32, i32, i32)
+define i32 @M4_or_and(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.or.and(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 |= and(r1, r2)
+
+declare i32 @llvm.hexagon.M4.or.or(i32, i32, i32)
+define i32 @M4_or_or(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.or.or(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 |= or(r1, r2)
+
+declare i32 @llvm.hexagon.M4.or.xor(i32, i32, i32)
+define i32 @M4_or_xor(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.or.xor(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 |= xor(r1, r2)
+
+declare i32 @llvm.hexagon.M4.xor.and(i32, i32, i32)
+define i32 @M4_xor_and(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.xor.and(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 ^= and(r1, r2)
+
+declare i32 @llvm.hexagon.M4.xor.or(i32, i32, i32)
+define i32 @M4_xor_or(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.xor.or(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 ^= or(r1, r2)
+
+; Maximum words
+declare i32 @llvm.hexagon.A2.max(i32, i32)
+define i32 @A2_max(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.max(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = max(r0, r1)
+
+declare i32 @llvm.hexagon.A2.maxu(i32, i32)
+define i32 @A2_maxu(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.maxu(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = maxu(r0, r1)
+
+; Maximum doublewords
+declare i64 @llvm.hexagon.A2.maxp(i64, i64)
+define i64 @A2_maxp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.maxp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = max(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.maxup(i64, i64)
+define i64 @A2_maxup(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.maxup(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = maxu(r1:0, r3:2)
+
+; Minimum words
+declare i32 @llvm.hexagon.A2.min(i32, i32)
+define i32 @A2_min(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.min(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = min(r0, r1)
+
+declare i32 @llvm.hexagon.A2.minu(i32, i32)
+define i32 @A2_minu(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.minu(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = minu(r0, r1)
+
+; Minimum doublewords
+declare i64 @llvm.hexagon.A2.minp(i64, i64)
+define i64 @A2_minp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.minp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = min(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.minup(i64, i64)
+define i64 @A2_minup(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.minup(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = minu(r1:0, r3:2)
+
+; Module wrap
+declare i32 @llvm.hexagon.A4.modwrapu(i32, i32)
+define i32 @A4_modwrapu(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.modwrapu(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = modwrap(r0, r1)
+
+; Negate
+declare i64 @llvm.hexagon.A2.negp(i64)
+define i64 @A2_negp(i64 %a) {
+  %z = call i64 @llvm.hexagon.A2.negp(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = neg(r1:0)
+
+declare i32 @llvm.hexagon.A2.negsat(i32)
+define i32 @A2_negsat(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.negsat(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = neg(r0):sat
+
+; Round
+declare i32 @llvm.hexagon.A2.roundsat(i64)
+define i32 @A2_roundsat(i64 %a) {
+  %z = call i32 @llvm.hexagon.A2.roundsat(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = round(r1:0):sat
+
+declare i32 @llvm.hexagon.A4.cround.ri(i32, i32)
+define i32 @A4_cround_ri(i32 %a) {
+  %z = call i32 @llvm.hexagon.A4.cround.ri(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = cround(r0, #0)
+
+declare i32 @llvm.hexagon.A4.round.ri(i32, i32)
+define i32 @A4_round_ri(i32 %a) {
+  %z = call i32 @llvm.hexagon.A4.round.ri(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = round(r0, #0)
+
+declare i32 @llvm.hexagon.A4.round.ri.sat(i32, i32)
+define i32 @A4_round_ri_sat(i32 %a) {
+  %z = call i32 @llvm.hexagon.A4.round.ri.sat(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = round(r0, #0):sat
+
+declare i32 @llvm.hexagon.A4.cround.rr(i32, i32)
+define i32 @A4_cround_rr(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.cround.rr(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = cround(r0, r1)
+
+declare i32 @llvm.hexagon.A4.round.rr(i32, i32)
+define i32 @A4_round_rr(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.round.rr(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = round(r0, r1)
+
+declare i32 @llvm.hexagon.A4.round.rr.sat(i32, i32)
+define i32 @A4_round_rr_sat(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.round.rr.sat(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = round(r0, r1):sat
+
+; Subtract doublewords
+declare i64 @llvm.hexagon.A2.subp(i64, i64)
+define i64 @A2_subp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.subp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = sub(r1:0, r3:2)
+
+; Subtract and accumulate
+declare i32 @llvm.hexagon.M2.subacc(i32, i32, i32)
+define i32 @M2_subacc(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.subacc(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += sub(r1, r2)
+
+; Subtract halfwords
+declare i32 @llvm.hexagon.A2.subh.l16.ll(i32, i32)
+define i32 @A2_subh_l16_ll(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.l16.ll(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.l, r1.l)
+
+declare i32 @llvm.hexagon.A2.subh.l16.hl(i32, i32)
+define i32 @A2_subh_l16_hl(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.l16.hl(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.l, r1.h)
+
+declare i32 @llvm.hexagon.A2.subh.l16.sat.ll(i32, i32)
+define i32 @A2_subh_l16_sat.ll(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.l16.sat.ll(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.l, r1.l):sat
+
+declare i32 @llvm.hexagon.A2.subh.l16.sat.hl(i32, i32)
+define i32 @A2_subh_l16_sat.hl(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.l16.sat.hl(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.l, r1.h):sat
+
+declare i32 @llvm.hexagon.A2.subh.h16.ll(i32, i32)
+define i32 @A2_subh_h16_ll(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.h16.ll(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.l, r1.l):<<16
+
+declare i32 @llvm.hexagon.A2.subh.h16.lh(i32, i32)
+define i32 @A2_subh_h16_lh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.h16.lh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.l, r1.h):<<16
+
+declare i32 @llvm.hexagon.A2.subh.h16.hl(i32, i32)
+define i32 @A2_subh_h16_hl(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.h16.hl(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.h, r1.l):<<16
+
+declare i32 @llvm.hexagon.A2.subh.h16.hh(i32, i32)
+define i32 @A2_subh_h16_hh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.h16.hh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.h, r1.h):<<16
+
+declare i32 @llvm.hexagon.A2.subh.h16.sat.ll(i32, i32)
+define i32 @A2_subh_h16_sat_ll(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.h16.sat.ll(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.l, r1.l):sat:<<16
+
+declare i32 @llvm.hexagon.A2.subh.h16.sat.lh(i32, i32)
+define i32 @A2_subh_h16_sat_lh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.h16.sat.lh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.l, r1.h):sat:<<16
+
+declare i32 @llvm.hexagon.A2.subh.h16.sat.hl(i32, i32)
+define i32 @A2_subh_h16_sat_hl(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.h16.sat.hl(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.h, r1.l):sat:<<16
+
+declare i32 @llvm.hexagon.A2.subh.h16.sat.hh(i32, i32)
+define i32 @A2_subh_h16_sat_hh(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A2.subh.h16.sat.hh(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = sub(r0.h, r1.h):sat:<<16
+
+; Sign extend word to doubleword
+declare i64 @llvm.hexagon.A2.sxtw(i32)
+define i64 @A2_sxtw(i32 %a) {
+  %z = call i64 @llvm.hexagon.A2.sxtw(i32 %a)
+  ret i64 %z
+}
+; CHECK:  = sxtw(r0)
+
+; Vector absolute value halfwords
+declare i64 @llvm.hexagon.A2.vabsh(i64)
+define i64 @A2_vabsh(i64 %a) {
+  %z = call i64 @llvm.hexagon.A2.vabsh(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = vabsh(r1:0)
+
+declare i64 @llvm.hexagon.A2.vabshsat(i64)
+define i64 @A2_vabshsat(i64 %a) {
+  %z = call i64 @llvm.hexagon.A2.vabshsat(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = vabsh(r1:0):sat
+
+; Vector absolute value words
+declare i64 @llvm.hexagon.A2.vabsw(i64)
+define i64 @A2_vabsw(i64 %a) {
+  %z = call i64 @llvm.hexagon.A2.vabsw(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = vabsw(r1:0)
+
+declare i64 @llvm.hexagon.A2.vabswsat(i64)
+define i64 @A2_vabswsat(i64 %a) {
+  %z = call i64 @llvm.hexagon.A2.vabswsat(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = vabsw(r1:0):sat
+
+; Vector absolute difference halfwords
+declare i64 @llvm.hexagon.M2.vabsdiffh(i64, i64)
+define i64 @M2_vabsdiffh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vabsdiffh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vabsdiffh(r1:0, r3:2)
+
+; Vector absolute difference words
+declare i64 @llvm.hexagon.M2.vabsdiffw(i64, i64)
+define i64 @M2_vabsdiffw(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vabsdiffw(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vabsdiffw(r1:0, r3:2)
+
+; Vector add halfwords
+declare i64 @llvm.hexagon.A2.vaddh(i64, i64)
+define i64 @A2_vaddh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vaddh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vaddh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vaddhs(i64, i64)
+define i64 @A2_vaddhs(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vaddhs(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vaddh(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.A2.vadduhs(i64, i64)
+define i64 @A2_vadduhs(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vadduhs(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vadduh(r1:0, r3:2):sat
+
+; Vector add halfwords with saturate and pack to unsigned bytes
+declare i32 @llvm.hexagon.A5.vaddhubs(i64, i64)
+define i32 @A5_vaddhubs(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.A5.vaddhubs(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vaddhub(r1:0, r3:2):sat
+
+; Vector reduce add unsigned bytes
+declare i64 @llvm.hexagon.A2.vraddub(i64, i64)
+define i64 @A2_vraddub(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vraddub(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vraddub(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vraddub.acc(i64, i64, i64)
+define i64 @A2_vraddub_acc(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.A2.vraddub.acc(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vraddub(r3:2, r5:4)
+
+; Vector reduce add halfwords
+declare i32 @llvm.hexagon.M2.vradduh(i64, i64)
+define i32 @M2_vradduh(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.M2.vradduh(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vradduh(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.M2.vraddh(i64, i64)
+define i32 @M2_vraddh(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.M2.vraddh(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vraddh(r1:0, r3:2)
+
+; Vector add bytes
+declare i64 @llvm.hexagon.A2.vaddub(i64, i64)
+define i64 @A2_vaddub(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vaddub(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vaddub(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vaddubs(i64, i64)
+define i64 @A2_vaddubs(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vaddubs(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vaddub(r1:0, r3:2):sat
+
+; Vector add words
+declare i64 @llvm.hexagon.A2.vaddw(i64, i64)
+define i64 @A2_vaddw(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vaddw(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vaddw(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vaddws(i64, i64)
+define i64 @A2_vaddws(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vaddws(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vaddw(r1:0, r3:2):sat
+
+; Vector average halfwords
+declare i64 @llvm.hexagon.A2.vavgh(i64, i64)
+define i64 @A2_vavgh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavgh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavgh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vavghr(i64, i64)
+define i64 @A2_vavghr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavghr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavgh(r1:0, r3:2):rnd
+
+declare i64 @llvm.hexagon.A2.vavghcr(i64, i64)
+define i64 @A2_vavghcr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavghcr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavgh(r1:0, r3:2):crnd
+
+declare i64 @llvm.hexagon.A2.vavguh(i64, i64)
+define i64 @A2_vavguh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavguh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavguh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vavguhr(i64, i64)
+define i64 @A2_vavguhr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavguhr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavguh(r1:0, r3:2):rnd
+
+declare i64 @llvm.hexagon.A2.vnavgh(i64, i64)
+define i64 @A2_vnavgh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vnavgh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vnavgh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vnavghr(i64, i64)
+define i64 @A2_vnavghr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vnavghr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vnavgh(r1:0, r3:2):rnd
+
+declare i64 @llvm.hexagon.A2.vnavghcr(i64, i64)
+define i64 @A2_vnavghcr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vnavghcr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vnavgh(r1:0, r3:2):crnd
+
+; Vector average unsigned bytes
+declare i64 @llvm.hexagon.A2.vavgub(i64, i64)
+define i64 @A2_vavgub(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavgub(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavgub(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vavgubr(i64, i64)
+define i64 @A2_vavgubr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavgubr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavgub(r1:0, r3:2):rnd
+
+; Vector average words
+declare i64 @llvm.hexagon.A2.vavgw(i64, i64)
+define i64 @A2_vavgw(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavgw(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavgw(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vavgwr(i64, i64)
+define i64 @A2_vavgwr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavgwr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavgw(r1:0, r3:2):rnd
+
+declare i64 @llvm.hexagon.A2.vavgwcr(i64, i64)
+define i64 @A2_vavgwcr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavgwcr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavgw(r1:0, r3:2):crnd
+
+declare i64 @llvm.hexagon.A2.vavguw(i64, i64)
+define i64 @A2_vavguw(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavguw(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavguw(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vavguwr(i64, i64)
+define i64 @A2_vavguwr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vavguwr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vavguw(r1:0, r3:2):rnd
+
+declare i64 @llvm.hexagon.A2.vnavgw(i64, i64)
+define i64 @A2_vnavgw(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vnavgw(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vnavgw(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vnavgwr(i64, i64)
+define i64 @A2_vnavgwr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vnavgwr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vnavgw(r1:0, r3:2):rnd
+
+declare i64 @llvm.hexagon.A2.vnavgwcr(i64, i64)
+define i64 @A2_vnavgwcr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vnavgwcr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vnavgw(r1:0, r3:2):crnd
+
+; Vector conditional negate
+declare i64 @llvm.hexagon.S2.vcnegh(i64, i32)
+define i64 @S2_vcnegh(i64 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S2.vcnegh(i64 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vcnegh(r1:0, r2)
+
+declare i64 @llvm.hexagon.S2.vrcnegh(i64, i64, i32)
+define i64 @S2_vrcnegh(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.vrcnegh(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrcnegh(r3:2, r4)
+
+; Vector maximum bytes
+declare i64 @llvm.hexagon.A2.vmaxub(i64, i64)
+define i64 @A2_vmaxub(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vmaxub(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmaxub(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vmaxb(i64, i64)
+define i64 @A2_vmaxb(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vmaxb(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmaxb(r1:0, r3:2)
+
+; Vector maximum halfwords
+declare i64 @llvm.hexagon.A2.vmaxh(i64, i64)
+define i64 @A2_vmaxh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vmaxh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmaxh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vmaxuh(i64, i64)
+define i64 @A2_vmaxuh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vmaxuh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmaxuh(r1:0, r3:2)
+
+; Vector reduce maximum halfwords
+declare i64 @llvm.hexagon.A4.vrmaxh(i64, i64, i32)
+define i64 @A4_vrmaxh(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.A4.vrmaxh(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrmaxh(r3:2, r4)
+
+declare i64 @llvm.hexagon.A4.vrmaxuh(i64, i64, i32)
+define i64 @A4_vrmaxuh(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.A4.vrmaxuh(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrmaxuh(r3:2, r4)
+
+; Vector reduce maximum words
+declare i64 @llvm.hexagon.A4.vrmaxw(i64, i64, i32)
+define i64 @A4_vrmaxw(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.A4.vrmaxw(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrmaxw(r3:2, r4)
+
+declare i64 @llvm.hexagon.A4.vrmaxuw(i64, i64, i32)
+define i64 @A4_vrmaxuw(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.A4.vrmaxuw(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrmaxuw(r3:2, r4)
+
+; Vector minimum bytes
+declare i64 @llvm.hexagon.A2.vminub(i64, i64)
+define i64 @A2_vminub(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vminub(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vminub(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vminb(i64, i64)
+define i64 @A2_vminb(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vminb(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vminb(r1:0, r3:2)
+
+; Vector minimum halfwords
+declare i64 @llvm.hexagon.A2.vminh(i64, i64)
+define i64 @A2_vminh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vminh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vminh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vminuh(i64, i64)
+define i64 @A2_vminuh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vminuh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vminuh(r1:0, r3:2)
+
+; Vector reduce minimum halfwords
+declare i64 @llvm.hexagon.A4.vrminh(i64, i64, i32)
+define i64 @A4_vrminh(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.A4.vrminh(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrminh(r3:2, r4)
+
+declare i64 @llvm.hexagon.A4.vrminuh(i64, i64, i32)
+define i64 @A4_vrminuh(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.A4.vrminuh(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrminuh(r3:2, r4)
+
+; Vector reduce minimum words
+declare i64 @llvm.hexagon.A4.vrminw(i64, i64, i32)
+define i64 @A4_vrminw(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.A4.vrminw(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrminw(r3:2, r4)
+
+declare i64 @llvm.hexagon.A4.vrminuw(i64, i64, i32)
+define i64 @A4_vrminuw(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.A4.vrminuw(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrminuw(r3:2, r4)
+
+; Vector sum of absolute differences unsigned bytes
+declare i64 @llvm.hexagon.A2.vrsadub(i64, i64)
+define i64 @A2_vrsadub(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vrsadub(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrsadub(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vrsadub.acc(i64, i64, i64)
+define i64 @A2_vrsadub_acc(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.A2.vrsadub.acc(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrsadub(r3:2, r5:4)
+
+; Vector subtract halfwords
+declare i64 @llvm.hexagon.A2.vsubh(i64, i64)
+define i64 @A2_vsubh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vsubh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vsubh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vsubhs(i64, i64)
+define i64 @A2_vsubhs(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vsubhs(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vsubh(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.A2.vsubuhs(i64, i64)
+define i64 @A2_vsubuhs(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vsubuhs(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vsubuh(r1:0, r3:2):sat
+
+; Vector subtract bytes
+declare i64 @llvm.hexagon.A2.vsubub(i64, i64)
+define i64 @A2_vsubub(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vsubub(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vsubub(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vsububs(i64, i64)
+define i64 @A2_vsububs(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vsububs(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vsubub(r1:0, r3:2):sat
+
+; Vector subtract words
+declare i64 @llvm.hexagon.A2.vsubw(i64, i64)
+define i64 @A2_vsubw(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vsubw(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vsubw(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.A2.vsubws(i64, i64)
+define i64 @A2_vsubws(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.A2.vsubws(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vsubw(r1:0, r3:2):sat
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_bit.ll b/test/CodeGen/Hexagon/intrinsics/xtype_bit.ll
new file mode 100644
index 0000000..8531b2f
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_bit.ll
@@ -0,0 +1,329 @@
+; RUN: llc -march=hexagon -O0 < %s | FileCheck %s
+; Hexagon Programmer's Reference Manual 11.10.2 XTYPE/BIT
+
+; Count leading
+declare i32 @llvm.hexagon.S2.clbp(i64)
+define i32 @S2_clbp(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.clbp(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = clb(r1:0)
+
+declare i32 @llvm.hexagon.S2.cl0p(i64)
+define i32 @S2_cl0p(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.cl0p(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = cl0(r1:0)
+
+declare i32 @llvm.hexagon.S2.cl1p(i64)
+define i32 @S2_cl1p(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.cl1p(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = cl1(r1:0)
+
+declare i32 @llvm.hexagon.S4.clbpnorm(i64)
+define i32 @S4_clbpnorm(i64 %a) {
+  %z = call i32 @llvm.hexagon.S4.clbpnorm(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = normamt(r1:0)
+
+declare i32 @llvm.hexagon.S4.clbpaddi(i64, i32)
+define i32 @S4_clbpaddi(i64 %a) {
+  %z = call i32 @llvm.hexagon.S4.clbpaddi(i64 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = add(clb(r1:0), #0)
+
+declare i32 @llvm.hexagon.S4.clbaddi(i32, i32)
+define i32 @S4_clbaddi(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.clbaddi(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = add(clb(r0), #0)
+
+declare i32 @llvm.hexagon.S2.cl0(i32)
+define i32 @S2_cl0(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.cl0(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = cl0(r0)
+
+declare i32 @llvm.hexagon.S2.cl1(i32)
+define i32 @S2_cl1(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.cl1(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = cl1(r0)
+
+declare i32 @llvm.hexagon.S2.clbnorm(i32)
+define i32 @S4_clbnorm(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.clbnorm(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = normamt(r0)
+
+; Count population
+declare i32 @llvm.hexagon.S5.popcountp(i64)
+define i32 @S5_popcountp(i64 %a) {
+  %z = call i32 @llvm.hexagon.S5.popcountp(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = popcount(r1:0)
+
+; Count trailing
+declare i32 @llvm.hexagon.S2.ct0p(i64)
+define i32 @S2_ct0p(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.ct0p(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = ct0(r1:0)
+
+declare i32 @llvm.hexagon.S2.ct1p(i64)
+define i32 @S2_ct1p(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.ct1p(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = ct1(r1:0)
+
+declare i32 @llvm.hexagon.S2.ct0(i32)
+define i32 @S2_ct0(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.ct0(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = ct0(r0)
+
+declare i32 @llvm.hexagon.S2.ct1(i32)
+define i32 @S2_ct1(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.ct1(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = ct1(r0)
+
+; Extract bitfield
+declare i64 @llvm.hexagon.S2.extractup(i64, i32, i32)
+define i64 @S2_extractup(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.extractup(i64 %a, i32 0, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = extractu(r1:0, #0, #0)
+
+declare i64 @llvm.hexagon.S4.extractp(i64, i32, i32)
+define i64 @S2_extractp(i64 %a) {
+  %z = call i64 @llvm.hexagon.S4.extractp(i64 %a, i32 0, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = extract(r1:0, #0, #0)
+
+declare i32 @llvm.hexagon.S2.extractu(i32, i32, i32)
+define i32 @S2_extractu(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.extractu(i32 %a, i32 0, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = extractu(r0, #0, #0)
+
+declare i32 @llvm.hexagon.S4.extract(i32, i32, i32)
+define i32 @S2_extract(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.extract(i32 %a, i32 0, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = extract(r0, #0, #0)
+
+declare i64 @llvm.hexagon.S2.extractup.rp(i64, i64)
+define i64 @S2_extractup_rp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.extractup.rp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = extractu(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.S4.extractp.rp(i64, i64)
+define i64 @S4_extractp_rp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S4.extractp.rp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = extract(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.S2.extractu.rp(i32, i64)
+define i32 @S2_extractu_rp(i32 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.S2.extractu.rp(i32 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: r0 = extractu(r0, r3:2)
+
+declare i32 @llvm.hexagon.S4.extract.rp(i32, i64)
+define i32 @S4_extract_rp(i32 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.S4.extract.rp(i32 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: r0 = extract(r0, r3:2)
+
+; Insert bitfield
+declare i64 @llvm.hexagon.S2.insertp(i64, i64, i32, i32)
+define i64 @S2_insertp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.insertp(i64 %a, i64 %b, i32 0, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = insert(r3:2, #0, #0)
+
+declare i32 @llvm.hexagon.S2.insert(i32, i32, i32, i32)
+define i32 @S2_insert(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.insert(i32 %a, i32 %b, i32 0, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = insert(r1, #0, #0)
+
+declare i32 @llvm.hexagon.S2.insert.rp(i32, i32, i64)
+define i32 @S2_insert_rp(i32 %a, i32 %b, i64 %c) {
+  %z = call i32 @llvm.hexagon.S2.insert.rp(i32 %a, i32 %b, i64 %c)
+  ret i32 %z
+}
+; CHECK: r0 = insert(r1, r3:2)
+
+declare i64 @llvm.hexagon.S2.insertp.rp(i64, i64, i64)
+define i64 @S2_insertp_rp(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.S2.insertp.rp(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 = insert(r3:2, r5:4)
+
+; Interleave/deinterleave
+declare i64 @llvm.hexagon.S2.deinterleave(i64)
+define i64 @S2_deinterleave(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.deinterleave(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = deinterleave(r1:0)
+
+declare i64 @llvm.hexagon.S2.interleave(i64)
+define i64 @S2_interleave(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.interleave(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = interleave(r1:0)
+
+; Linear feedback-shift operation
+declare i64 @llvm.hexagon.S2.lfsp(i64, i64)
+define i64 @S2_lfsp(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.lfsp(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = lfs(r1:0, r3:2)
+
+; Masked parity
+declare i32 @llvm.hexagon.S2.parityp(i64, i64)
+define i32 @S2_parityp(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.S2.parityp(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: r0 = parity(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.S4.parity(i32, i32)
+define i32 @S4_parity(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S4.parity(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = parity(r0, r1)
+
+; Bit reverse
+declare i64 @llvm.hexagon.S2.brevp(i64)
+define i64 @S2_brevp(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.brevp(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = brev(r1:0)
+
+declare i32 @llvm.hexagon.S2.brev(i32)
+define i32 @S2_brev(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.brev(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = brev(r0)
+
+; Set/clear/toggle bit
+declare i32 @llvm.hexagon.S2.setbit.i(i32, i32)
+define i32 @S2_setbit_i(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.setbit.i(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = setbit(r0, #0)
+
+declare i32 @llvm.hexagon.S2.clrbit.i(i32, i32)
+define i32 @S2_clrbit_i(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.clrbit.i(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = clrbit(r0, #0)
+
+declare i32 @llvm.hexagon.S2.togglebit.i(i32, i32)
+define i32 @S2_togglebit_i(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.togglebit.i(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = togglebit(r0, #0)
+
+declare i32 @llvm.hexagon.S2.setbit.r(i32, i32)
+define i32 @S2_setbit_r(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.setbit.r(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = setbit(r0, r1)
+
+declare i32 @llvm.hexagon.S2.clrbit.r(i32, i32)
+define i32 @S2_clrbit_r(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.clrbit.r(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = clrbit(r0, r1)
+
+declare i32 @llvm.hexagon.S2.togglebit.r(i32, i32)
+define i32 @S2_togglebit_r(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.togglebit.r(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = togglebit(r0, r1)
+
+; Split bitfield
+declare i64 @llvm.hexagon.A4.bitspliti(i32, i32)
+define i64 @A4_bitspliti(i32 %a) {
+  %z = call i64 @llvm.hexagon.A4.bitspliti(i32 %a, i32 0)
+  ret i64 %z
+}
+; CHECK:  = bitsplit(r0, #0)
+
+declare i64 @llvm.hexagon.A4.bitsplit(i32, i32)
+define i64 @A4_bitsplit(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.A4.bitsplit(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = bitsplit(r0, r1)
+
+; Table index
+declare i32 @llvm.hexagon.S2.tableidxb.goodsyntax(i32, i32, i32, i32)
+define i32 @S2_tableidxb_goodsyntax(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.tableidxb.goodsyntax(i32 %a, i32 %b, i32 0, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = tableidxb(r1, #0, #0)
+
+declare i32 @llvm.hexagon.S2.tableidxh.goodsyntax(i32, i32, i32, i32)
+define i32 @S2_tableidxh_goodsyntax(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.tableidxh.goodsyntax(i32 %a, i32 %b, i32 0, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = tableidxh(r1, #0, #-1)
+
+declare i32 @llvm.hexagon.S2.tableidxw.goodsyntax(i32, i32, i32, i32)
+define i32 @S2_tableidxw_goodsyntax(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.tableidxw.goodsyntax(i32 %a, i32 %b, i32 0, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = tableidxw(r1, #0, #-2)
+
+declare i32 @llvm.hexagon.S2.tableidxd.goodsyntax(i32, i32, i32, i32)
+define i32 @S2_tableidxd_goodsyntax(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.tableidxd.goodsyntax(i32 %a, i32 %b, i32 0, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = tableidxd(r1, #0, #-3)
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_complex.ll b/test/CodeGen/Hexagon/intrinsics/xtype_complex.ll
new file mode 100644
index 0000000..57b0c5b
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_complex.ll
@@ -0,0 +1,349 @@
+; RUN: llc -march=hexagon -O0 < %s | FileCheck %s
+; Hexagon Programmer's Reference Manual 11.10.3 XTYPE/COMPLEX
+
+; Complex add/sub halfwords
+declare i64 @llvm.hexagon.S4.vxaddsubh(i64, i64)
+define i64 @S4_vxaddsubh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S4.vxaddsubh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vxaddsubh(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.S4.vxsubaddh(i64, i64)
+define i64 @S4_vxsubaddh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S4.vxsubaddh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vxsubaddh(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.S4.vxaddsubhr(i64, i64)
+define i64 @S4_vxaddsubhr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S4.vxaddsubhr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vxaddsubh(r1:0, r3:2):rnd:>>1:sat
+
+declare i64 @llvm.hexagon.S4.vxsubaddhr(i64, i64)
+define i64 @S4_vxsubaddhr(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S4.vxsubaddhr(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vxsubaddh(r1:0, r3:2):rnd:>>1:sat
+
+; Complex add/sub words
+declare i64 @llvm.hexagon.S4.vxaddsubw(i64, i64)
+define i64 @S4_vxaddsubw(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S4.vxaddsubw(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vxaddsubw(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.S4.vxsubaddw(i64, i64)
+define i64 @S4_vxsubaddw(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S4.vxsubaddw(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vxsubaddw(r1:0, r3:2):sat
+
+; Complex multiply
+declare i64 @llvm.hexagon.M2.cmpys.s0(i32, i32)
+define i64 @M2_cmpys_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.cmpys.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = cmpy(r0, r1):sat
+
+declare i64 @llvm.hexagon.M2.cmpys.s1(i32, i32)
+define i64 @M2_cmpys_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.cmpys.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = cmpy(r0, r1):<<1:sat
+
+declare i64 @llvm.hexagon.M2.cmpysc.s0(i32, i32)
+define i64 @M2_cmpysc_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.cmpysc.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = cmpy(r0, r1*):sat
+
+declare i64 @llvm.hexagon.M2.cmpysc.s1(i32, i32)
+define i64 @M2_cmpysc_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.cmpysc.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = cmpy(r0, r1*):<<1:sat
+
+declare i64 @llvm.hexagon.M2.cmacs.s0(i64, i32, i32)
+define i64 @M2_cmacs_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.cmacs.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += cmpy(r2, r3):sat
+
+declare i64 @llvm.hexagon.M2.cmacs.s1(i64, i32, i32)
+define i64 @M2_cmacs_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.cmacs.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += cmpy(r2, r3):<<1:sat
+
+declare i64 @llvm.hexagon.M2.cnacs.s0(i64, i32, i32)
+define i64 @M2_cnacs_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.cnacs.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= cmpy(r2, r3):sat
+
+declare i64 @llvm.hexagon.M2.cnacs.s1(i64, i32, i32)
+define i64 @M2_cnacs_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.cnacs.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= cmpy(r2, r3):<<1:sat
+
+declare i64 @llvm.hexagon.M2.cmacsc.s0(i64, i32, i32)
+define i64 @M2_cmacsc_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.cmacsc.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += cmpy(r2, r3*):sat
+
+declare i64 @llvm.hexagon.M2.cmacsc.s1(i64, i32, i32)
+define i64 @M2_cmacsc_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.cmacsc.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += cmpy(r2, r3*):<<1:sat
+
+declare i64 @llvm.hexagon.M2.cnacsc.s0(i64, i32, i32)
+define i64 @M2_cnacsc_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.cnacsc.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= cmpy(r2, r3*):sat
+
+declare i64 @llvm.hexagon.M2.cnacsc.s1(i64, i32, i32)
+define i64 @M2_cnacsc_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.cnacsc.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= cmpy(r2, r3*):<<1:sat
+
+; Complex multiply real or imaginary
+declare i64 @llvm.hexagon.M2.cmpyi.s0(i32, i32)
+define i64 @M2_cmpyi_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.cmpyi.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = cmpyi(r0, r1)
+
+declare i64 @llvm.hexagon.M2.cmpyr.s0(i32, i32)
+define i64 @M2_cmpyr_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.cmpyr.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = cmpyr(r0, r1)
+
+declare i64 @llvm.hexagon.M2.cmaci.s0(i64, i32, i32)
+define i64 @M2_cmaci_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.cmaci.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += cmpyi(r2, r3)
+
+declare i64 @llvm.hexagon.M2.cmacr.s0(i64, i32, i32)
+define i64 @M2_cmacr_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.cmacr.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += cmpyr(r2, r3)
+
+; Complex multiply with round and pack
+declare i32 @llvm.hexagon.M2.cmpyrs.s0(i32, i32)
+define i32 @M2_cmpyrs_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.cmpyrs.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = cmpy(r0, r1):rnd:sat
+
+declare i32 @llvm.hexagon.M2.cmpyrs.s1(i32, i32)
+define i32 @M2_cmpyrs_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.cmpyrs.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = cmpy(r0, r1):<<1:rnd:sat
+
+declare i32 @llvm.hexagon.M2.cmpyrsc.s0(i32, i32)
+define i32 @M2_cmpyrsc_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.cmpyrsc.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = cmpy(r0, r1*):rnd:sat
+
+declare i32 @llvm.hexagon.M2.cmpyrsc.s1(i32, i32)
+define i32 @M2_cmpyrsc_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.cmpyrsc.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = cmpy(r0, r1*):<<1:rnd:sat
+
+; Complex multiply 32x16
+declare i32 @llvm.hexagon.M4.cmpyi.wh(i64, i32)
+define i32 @M4_cmpyi_wh(i64 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M4.cmpyi.wh(i64 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = cmpyiwh(r1:0, r2):<<1:rnd:sat
+
+declare i32 @llvm.hexagon.M4.cmpyi.whc(i64, i32)
+define i32 @M4_cmpyi_whc(i64 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M4.cmpyi.whc(i64 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = cmpyiwh(r1:0, r2*):<<1:rnd:sat
+
+declare i32 @llvm.hexagon.M4.cmpyr.wh(i64, i32)
+define i32 @M4_cmpyr_wh(i64 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M4.cmpyr.wh(i64 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = cmpyrwh(r1:0, r2):<<1:rnd:sat
+
+declare i32 @llvm.hexagon.M4.cmpyr.whc(i64, i32)
+define i32 @M4_cmpyr_whc(i64 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M4.cmpyr.whc(i64 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = cmpyrwh(r1:0, r2*):<<1:rnd:sat
+
+; Vector complex multiply real or imaginary
+declare i64 @llvm.hexagon.M2.vcmpy.s0.sat.r(i64, i64)
+define i64 @M2_vcmpy_s0_sat_r(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vcmpy.s0.sat.r(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vcmpyr(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.M2.vcmpy.s1.sat.r(i64, i64)
+define i64 @M2_vcmpy_s1_sat_r(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vcmpy.s1.sat.r(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vcmpyr(r1:0, r3:2):<<1:sat
+
+declare i64 @llvm.hexagon.M2.vcmpy.s0.sat.i(i64, i64)
+define i64 @M2_vcmpy_s0_sat_i(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vcmpy.s0.sat.i(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vcmpyi(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.M2.vcmpy.s1.sat.i(i64, i64)
+define i64 @M2_vcmpy_s1_sat_i(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vcmpy.s1.sat.i(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vcmpyi(r1:0, r3:2):<<1:sat
+
+declare i64 @llvm.hexagon.M2.vcmac.s0.sat.r(i64, i64, i64)
+define i64 @M2_vcmac_s0_sat_r(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M2.vcmac.s0.sat.r(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vcmpyr(r3:2, r5:4):sat
+
+declare i64 @llvm.hexagon.M2.vcmac.s0.sat.i(i64, i64, i64)
+define i64 @M2_vcmac_s0_sat_i(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M2.vcmac.s0.sat.i(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vcmpyi(r3:2, r5:4):sat
+
+; Vector complex conjugate
+declare i64 @llvm.hexagon.A2.vconj(i64)
+define i64 @A2_vconj(i64 %a) {
+  %z = call i64 @llvm.hexagon.A2.vconj(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = vconj(r1:0):sat
+
+; Vector complex rotate
+declare i64 @llvm.hexagon.S2.vcrotate(i64, i32)
+define i64 @S2_vcrotate(i64 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S2.vcrotate(i64 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vcrotate(r1:0, r2)
+
+; Vector reduce complex multiply real or imaginary
+declare i64 @llvm.hexagon.M2.vrcmpyi.s0(i64, i64)
+define i64 @M2_vrcmpyi_s0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vrcmpyi.s0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrcmpyi(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.M2.vrcmpyr.s0(i64, i64)
+define i64 @M2_vrcmpyr_s0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vrcmpyr.s0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrcmpyr(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.M2.vrcmpyi.s0c(i64, i64)
+define i64 @M2_vrcmpyi_s0c(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vrcmpyi.s0c(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrcmpyi(r1:0, r3:2*)
+
+declare i64 @llvm.hexagon.M2.vrcmpyr.s0c(i64, i64)
+define i64 @M2_vrcmpyr_s0c(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vrcmpyr.s0c(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrcmpyr(r1:0, r3:2*)
+
+declare i64 @llvm.hexagon.M2.vrcmaci.s0(i64, i64, i64)
+define i64 @M2_vrcmaci_s0(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M2.vrcmaci.s0(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrcmpyi(r3:2, r5:4)
+
+declare i64 @llvm.hexagon.M2.vrcmacr.s0(i64, i64, i64)
+define i64 @M2_vrcmacr_s0(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M2.vrcmacr.s0(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrcmpyr(r3:2, r5:4)
+
+declare i64 @llvm.hexagon.M2.vrcmaci.s0c(i64, i64, i64)
+define i64 @M2_vrcmaci_s0c(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M2.vrcmaci.s0c(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrcmpyi(r3:2, r5:4*)
+
+declare i64 @llvm.hexagon.M2.vrcmacr.s0c(i64, i64, i64)
+define i64 @M2_vrcmacr_s0c(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M2.vrcmacr.s0c(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrcmpyr(r3:2, r5:4*)
+
+; Vector reduce complex rotate
+declare i64 @llvm.hexagon.S4.vrcrotate(i64, i32, i32)
+define i64 @S4_vrcrotate(i64 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S4.vrcrotate(i64 %a, i32 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrcrotate(r1:0, r2, #0)
+
+declare i64 @llvm.hexagon.S4.vrcrotate.acc(i64, i64, i32, i32)
+define i64 @S4_vrcrotate_acc(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S4.vrcrotate.acc(i64 %a, i64 %b, i32 %c, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrcrotate(r3:2, r4, #0)
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_fp.ll b/test/CodeGen/Hexagon/intrinsics/xtype_fp.ll
new file mode 100644
index 0000000..aef8127
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_fp.ll
@@ -0,0 +1,388 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O0 < %s | FileCheck %s
+; Hexagon Programmer's Reference Manual 11.10.4 XTYPE/FP
+
+; Floating point addition
+declare float @llvm.hexagon.F2.sfadd(float, float)
+define float @F2_sfadd(float %a, float %b) {
+  %z = call float @llvm.hexagon.F2.sfadd(float %a, float %b)
+  ret float %z
+}
+; CHECK: r0 = sfadd(r0, r1)
+
+; Classify floating-point value
+declare i32 @llvm.hexagon.F2.sfclass(float, i32)
+define i32 @F2_sfclass(float %a) {
+  %z = call i32 @llvm.hexagon.F2.sfclass(float %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = sfclass(r0, #0)
+
+declare i32 @llvm.hexagon.F2.dfclass(double, i32)
+define i32 @F2_dfclass(double %a) {
+  %z = call i32 @llvm.hexagon.F2.dfclass(double %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = dfclass(r1:0, #0)
+
+; Compare floating-point value
+declare i32 @llvm.hexagon.F2.sfcmpge(float, float)
+define i32 @F2_sfcmpge(float %a, float %b) {
+  %z = call i32 @llvm.hexagon.F2.sfcmpge(float %a, float %b)
+  ret i32 %z
+}
+; CHECK: p0 = sfcmp.ge(r0, r1)
+
+declare i32 @llvm.hexagon.F2.sfcmpuo(float, float)
+define i32 @F2_sfcmpuo(float %a, float %b) {
+  %z = call i32 @llvm.hexagon.F2.sfcmpuo(float %a, float %b)
+  ret i32 %z
+}
+; CHECK: p0 = sfcmp.uo(r0, r1)
+
+declare i32 @llvm.hexagon.F2.sfcmpeq(float, float)
+define i32 @F2_sfcmpeq(float %a, float %b) {
+  %z = call i32 @llvm.hexagon.F2.sfcmpeq(float %a, float %b)
+  ret i32 %z
+}
+; CHECK: p0 = sfcmp.eq(r0, r1)
+
+declare i32 @llvm.hexagon.F2.sfcmpgt(float, float)
+define i32 @F2_sfcmpgt(float %a, float %b) {
+  %z = call i32 @llvm.hexagon.F2.sfcmpgt(float %a, float %b)
+  ret i32 %z
+}
+; CHECK: p0 = sfcmp.gt(r0, r1)
+
+declare i32 @llvm.hexagon.F2.dfcmpge(double, double)
+define i32 @F2_dfcmpge(double %a, double %b) {
+  %z = call i32 @llvm.hexagon.F2.dfcmpge(double %a, double %b)
+  ret i32 %z
+}
+; CHECK: p0 = dfcmp.ge(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.F2.dfcmpuo(double, double)
+define i32 @F2_dfcmpuo(double %a, double %b) {
+  %z = call i32 @llvm.hexagon.F2.dfcmpuo(double %a, double %b)
+  ret i32 %z
+}
+; CHECK: p0 = dfcmp.uo(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.F2.dfcmpeq(double, double)
+define i32 @F2_dfcmpeq(double %a, double %b) {
+  %z = call i32 @llvm.hexagon.F2.dfcmpeq(double %a, double %b)
+  ret i32 %z
+}
+; CHECK: p0 = dfcmp.eq(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.F2.dfcmpgt(double, double)
+define i32 @F2_dfcmpgt(double %a, double %b) {
+  %z = call i32 @llvm.hexagon.F2.dfcmpgt(double %a, double %b)
+  ret i32 %z
+}
+; CHECK: p0 = dfcmp.gt(r1:0, r3:2)
+
+; Convert floating-point value to other format
+declare double @llvm.hexagon.F2.conv.sf2df(float)
+define double @F2_conv_sf2df(float %a) {
+  %z = call double @llvm.hexagon.F2.conv.sf2df(float %a)
+  ret double %z
+}
+; CHECK:  = convert_sf2df(r0)
+
+declare float @llvm.hexagon.F2.conv.df2sf(double)
+define float @F2_conv_df2sf(double %a) {
+  %z = call float @llvm.hexagon.F2.conv.df2sf(double %a)
+  ret float %z
+}
+; CHECK: r0 = convert_df2sf(r1:0)
+
+; Convert integer to floating-point value
+declare double @llvm.hexagon.F2.conv.ud2df(i64)
+define double @F2_conv_ud2df(i64 %a) {
+  %z = call double @llvm.hexagon.F2.conv.ud2df(i64 %a)
+  ret double %z
+}
+; CHECK: r1:0 = convert_ud2df(r1:0)
+
+declare double @llvm.hexagon.F2.conv.d2df(i64)
+define double @F2_conv_d2df(i64 %a) {
+  %z = call double @llvm.hexagon.F2.conv.d2df(i64 %a)
+  ret double %z
+}
+; CHECK: r1:0 = convert_d2df(r1:0)
+
+declare double @llvm.hexagon.F2.conv.uw2df(i32)
+define double @F2_conv_uw2df(i32 %a) {
+  %z = call double @llvm.hexagon.F2.conv.uw2df(i32 %a)
+  ret double %z
+}
+; CHECK:  = convert_uw2df(r0)
+
+declare double @llvm.hexagon.F2.conv.w2df(i32)
+define double @F2_conv_w2df(i32 %a) {
+  %z = call double @llvm.hexagon.F2.conv.w2df(i32 %a)
+  ret double %z
+}
+; CHECK:  = convert_w2df(r0)
+
+declare float @llvm.hexagon.F2.conv.ud2sf(i64)
+define float @F2_conv_ud2sf(i64 %a) {
+  %z = call float @llvm.hexagon.F2.conv.ud2sf(i64 %a)
+  ret float %z
+}
+; CHECK: r0 = convert_ud2sf(r1:0)
+
+declare float @llvm.hexagon.F2.conv.d2sf(i64)
+define float @F2_conv_d2sf(i64 %a) {
+  %z = call float @llvm.hexagon.F2.conv.d2sf(i64 %a)
+  ret float %z
+}
+; CHECK: r0 = convert_d2sf(r1:0)
+
+declare float @llvm.hexagon.F2.conv.uw2sf(i32)
+define float @F2_conv_uw2sf(i32 %a) {
+  %z = call float @llvm.hexagon.F2.conv.uw2sf(i32 %a)
+  ret float %z
+}
+; CHECK: r0 = convert_uw2sf(r0)
+
+declare float @llvm.hexagon.F2.conv.w2sf(i32)
+define float @F2_conv_w2sf(i32 %a) {
+  %z = call float @llvm.hexagon.F2.conv.w2sf(i32 %a)
+  ret float %z
+}
+; CHECK: r0 = convert_w2sf(r0)
+
+; Convert floating-point value to integer
+declare i64 @llvm.hexagon.F2.conv.df2d(double)
+define i64 @F2_conv_df2d(double %a) {
+  %z = call i64 @llvm.hexagon.F2.conv.df2d(double %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = convert_df2d(r1:0)
+
+declare i64 @llvm.hexagon.F2.conv.df2ud(double)
+define i64 @F2_conv_df2ud(double %a) {
+  %z = call i64 @llvm.hexagon.F2.conv.df2ud(double %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = convert_df2ud(r1:0)
+
+declare i64 @llvm.hexagon.F2.conv.df2d.chop(double)
+define i64 @F2_conv_df2d_chop(double %a) {
+  %z = call i64 @llvm.hexagon.F2.conv.df2d.chop(double %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = convert_df2d(r1:0):chop
+
+declare i64 @llvm.hexagon.F2.conv.df2ud.chop(double)
+define i64 @F2_conv_df2ud_chop(double %a) {
+  %z = call i64 @llvm.hexagon.F2.conv.df2ud.chop(double %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = convert_df2ud(r1:0):chop
+
+declare i64 @llvm.hexagon.F2.conv.sf2ud(float)
+define i64 @F2_conv_sf2ud(float %a) {
+  %z = call i64 @llvm.hexagon.F2.conv.sf2ud(float %a)
+  ret i64 %z
+}
+; CHECK:  = convert_sf2ud(r0)
+
+declare i64 @llvm.hexagon.F2.conv.sf2d(float)
+define i64 @F2_conv_sf2d(float %a) {
+  %z = call i64 @llvm.hexagon.F2.conv.sf2d(float %a)
+  ret i64 %z
+}
+; CHECK:  = convert_sf2d(r0)
+
+declare i64 @llvm.hexagon.F2.conv.sf2d.chop(float)
+define i64 @F2_conv_sf2d_chop(float %a) {
+  %z = call i64 @llvm.hexagon.F2.conv.sf2d.chop(float %a)
+  ret i64 %z
+}
+; CHECK:  = convert_sf2d(r0):chop
+
+declare i64 @llvm.hexagon.F2.conv.sf2ud.chop(float)
+define i64 @F2_conv_sf2ud_chop(float %a) {
+  %z = call i64 @llvm.hexagon.F2.conv.sf2ud.chop(float %a)
+  ret i64 %z
+}
+; CHECK:  = convert_sf2ud(r0):chop
+
+declare i32 @llvm.hexagon.F2.conv.df2uw(double)
+define i32 @F2_conv_df2uw(double %a) {
+  %z = call i32 @llvm.hexagon.F2.conv.df2uw(double %a)
+  ret i32 %z
+}
+; CHECK: r0 = convert_df2uw(r1:0)
+
+declare i32 @llvm.hexagon.F2.conv.df2w(double)
+define i32 @F2_conv_df2w(double %a) {
+  %z = call i32 @llvm.hexagon.F2.conv.df2w(double %a)
+  ret i32 %z
+}
+; CHECK: r0 = convert_df2w(r1:0)
+
+declare i32 @llvm.hexagon.F2.conv.df2w.chop(double)
+define i32 @F2_conv_df2w_chop(double %a) {
+  %z = call i32 @llvm.hexagon.F2.conv.df2w.chop(double %a)
+  ret i32 %z
+}
+; CHECK: r0 = convert_df2w(r1:0):chop
+
+declare i32 @llvm.hexagon.F2.conv.df2uw.chop(double)
+define i32 @F2_conv_df2uw_chop(double %a) {
+  %z = call i32 @llvm.hexagon.F2.conv.df2uw.chop(double %a)
+  ret i32 %z
+}
+; CHECK: r0 = convert_df2uw(r1:0):chop
+
+declare i32 @llvm.hexagon.F2.conv.sf2uw(float)
+define i32 @F2_conv_sf2uw(float %a) {
+  %z = call i32 @llvm.hexagon.F2.conv.sf2uw(float %a)
+  ret i32 %z
+}
+; CHECK: r0 = convert_sf2uw(r0)
+
+declare i32 @llvm.hexagon.F2.conv.sf2uw.chop(float)
+define i32 @F2_conv_sf2uw_chop(float %a) {
+  %z = call i32 @llvm.hexagon.F2.conv.sf2uw.chop(float %a)
+  ret i32 %z
+}
+; CHECK: r0 = convert_sf2uw(r0):chop
+
+declare i32 @llvm.hexagon.F2.conv.sf2w(float)
+define i32 @F2_conv_sf2w(float %a) {
+  %z = call i32 @llvm.hexagon.F2.conv.sf2w(float %a)
+  ret i32 %z
+}
+; CHECK: r0 = convert_sf2w(r0)
+
+declare i32 @llvm.hexagon.F2.conv.sf2w.chop(float)
+define i32 @F2_conv_sf2w_chop(float %a) {
+  %z = call i32 @llvm.hexagon.F2.conv.sf2w.chop(float %a)
+  ret i32 %z
+}
+; CHECK: r0 = convert_sf2w(r0):chop
+
+; Floating point extreme value assistance
+declare float @llvm.hexagon.F2.sffixupr(float)
+define float @F2_sffixupr(float %a) {
+  %z = call float @llvm.hexagon.F2.sffixupr(float %a)
+  ret float %z
+}
+; CHECK: r0 = sffixupr(r0)
+
+declare float @llvm.hexagon.F2.sffixupn(float, float)
+define float @F2_sffixupn(float %a, float %b) {
+  %z = call float @llvm.hexagon.F2.sffixupn(float %a, float %b)
+  ret float %z
+}
+; CHECK: r0 = sffixupn(r0, r1)
+
+declare float @llvm.hexagon.F2.sffixupd(float, float)
+define float @F2_sffixupd(float %a, float %b) {
+  %z = call float @llvm.hexagon.F2.sffixupd(float %a, float %b)
+  ret float %z
+}
+; CHECK: r0 = sffixupd(r0, r1)
+
+; Floating point fused multiply-add
+declare float @llvm.hexagon.F2.sffma(float, float, float)
+define float @F2_sffma(float %a, float %b, float %c) {
+  %z = call float @llvm.hexagon.F2.sffma(float %a, float %b, float %c)
+  ret float %z
+}
+; CHECK: r0 += sfmpy(r1, r2)
+
+declare float @llvm.hexagon.F2.sffms(float, float, float)
+define float @F2_sffms(float %a, float %b, float %c) {
+  %z = call float @llvm.hexagon.F2.sffms(float %a, float %b, float %c)
+  ret float %z
+}
+; CHECK: r0 -= sfmpy(r1, r2)
+
+; Floating point fused multiply-add with scaling
+declare float @llvm.hexagon.F2.sffma.sc(float, float, float, i32)
+define float @F2_sffma_sc(float %a, float %b, float %c, i32 %d) {
+  %z = call float @llvm.hexagon.F2.sffma.sc(float %a, float %b, float %c, i32 %d)
+  ret float %z
+}
+; CHECK: r0 += sfmpy(r1, r2, p0):scale
+
+; Floating point fused multiply-add for library routines
+declare float @llvm.hexagon.F2.sffma.lib(float, float, float)
+define float @F2_sffma_lib(float %a, float %b, float %c) {
+  %z = call float @llvm.hexagon.F2.sffma.lib(float %a, float %b, float %c)
+  ret float %z
+}
+; CHECK: r0 += sfmpy(r1, r2):lib
+
+declare float @llvm.hexagon.F2.sffms.lib(float, float, float)
+define float @F2_sffms_lib(float %a, float %b, float %c) {
+  %z = call float @llvm.hexagon.F2.sffms.lib(float %a, float %b, float %c)
+  ret float %z
+}
+; CHECK: r0 -= sfmpy(r1, r2):lib
+
+; Create floating-point constant
+declare float @llvm.hexagon.F2.sfimm.p(i32)
+define float @F2_sfimm_p() {
+  %z = call float @llvm.hexagon.F2.sfimm.p(i32 0)
+  ret float %z
+}
+; CHECK: r0 = sfmake(#0):pos
+
+declare float @llvm.hexagon.F2.sfimm.n(i32)
+define float @F2_sfimm_n() {
+  %z = call float @llvm.hexagon.F2.sfimm.n(i32 0)
+  ret float %z
+} 
+; CHECK: r0 = sfmake(#0):neg
+
+declare double @llvm.hexagon.F2.dfimm.p(i32)
+define double @F2_dfimm_p() {
+  %z = call double @llvm.hexagon.F2.dfimm.p(i32 0)
+  ret double %z
+}
+; CHECK: r1:0 = dfmake(#0):pos
+
+declare double @llvm.hexagon.F2.dfimm.n(i32)
+define double @F2_dfimm_n() {
+  %z = call double @llvm.hexagon.F2.dfimm.n(i32 0)
+  ret double %z
+}
+; CHECK: r1:0 = dfmake(#0):neg
+
+; Floating point maximum
+declare float @llvm.hexagon.F2.sfmax(float, float)
+define float @F2_sfmax(float %a, float %b) {
+  %z = call float @llvm.hexagon.F2.sfmax(float %a, float %b)
+  ret float %z
+}
+; CHECK: r0 = sfmax(r0, r1)
+
+; Floating point minimum
+declare float @llvm.hexagon.F2.sfmin(float, float)
+define float @F2_sfmin(float %a, float %b) {
+  %z = call float @llvm.hexagon.F2.sfmin(float %a, float %b)
+  ret float %z
+}
+; CHECK: r0 = sfmin(r0, r1)
+
+; Floating point multiply
+declare float @llvm.hexagon.F2.sfmpy(float, float)
+define float @F2_sfmpy(float %a, float %b) {
+  %z = call float @llvm.hexagon.F2.sfmpy(float %a, float %b)
+  ret float %z
+}
+; CHECK: r0 = sfmpy(r0, r1)
+
+; Floating point subtraction
+declare float @llvm.hexagon.F2.sfsub(float, float)
+define float @F2_sfsub(float %a, float %b) {
+  %z = call float @llvm.hexagon.F2.sfsub(float %a, float %b)
+  ret float %z
+}
+; CHECK: r0 = sfsub(r0, r1)
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_mpy.ll b/test/CodeGen/Hexagon/intrinsics/xtype_mpy.ll
new file mode 100644
index 0000000..6409e4e
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_mpy.ll
@@ -0,0 +1,1525 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O0 < %s | FileCheck %s
+; Hexagon Programmer's Reference Manual 11.10.5 XTYPE/MPY
+
+; Multiply and use lower result
+declare i32 @llvm.hexagon.M4.mpyrr.addi(i32, i32, i32)
+define i32 @M4_mpyrr_addi(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M4.mpyrr.addi(i32 0, i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(#0, mpyi(r0, r1))
+
+declare i32 @llvm.hexagon.M4.mpyri.addi(i32, i32, i32)
+define i32 @M4_mpyri_addi(i32 %a) {
+  %z = call i32 @llvm.hexagon.M4.mpyri.addi(i32 0, i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = add(#0, mpyi(r0, #0))
+
+declare i32 @llvm.hexagon.M4.mpyri.addr.u2(i32, i32, i32)
+define i32 @M4_mpyri_addr_u2(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M4.mpyri.addr.u2(i32 %a, i32 0, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0, mpyi(#0, r1))
+
+declare i32 @llvm.hexagon.M4.mpyri.addr(i32, i32, i32)
+define i32 @M4_mpyri_addr(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M4.mpyri.addr(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = add(r0, mpyi(r1, #0))
+
+declare i32 @llvm.hexagon.M4.mpyrr.addr(i32, i32, i32)
+define i32 @M4_mpyrr_addr(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.mpyrr.addr(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r1 = add(r0, mpyi(r1, r2))
+
+; Vector multiply word by signed half (32x16)
+declare i64 @llvm.hexagon.M2.mmpyl.s0(i64, i64)
+define i64 @M2_mmpyl_s0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyl.s0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyweh(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.M2.mmpyl.s1(i64, i64)
+define i64 @M2_mmpyl_s1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyl.s1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyweh(r1:0, r3:2):<<1:sat
+
+declare i64 @llvm.hexagon.M2.mmpyh.s0(i64, i64)
+define i64 @M2_mmpyh_s0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyh.s0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpywoh(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.M2.mmpyh.s1(i64, i64)
+define i64 @M2_mmpyh_s1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyh.s1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpywoh(r1:0, r3:2):<<1:sat
+
+declare i64 @llvm.hexagon.M2.mmpyl.rs0(i64, i64)
+define i64 @M2_mmpyl_rs0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyl.rs0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyweh(r1:0, r3:2):rnd:sat
+
+declare i64 @llvm.hexagon.M2.mmpyl.rs1(i64, i64)
+define i64 @M2_mmpyl_rs1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyl.rs1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyweh(r1:0, r3:2):<<1:rnd:sat
+
+declare i64 @llvm.hexagon.M2.mmpyh.rs0(i64, i64)
+define i64 @M2_mmpyh_rs0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyh.rs0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpywoh(r1:0, r3:2):rnd:sat
+
+declare i64 @llvm.hexagon.M2.mmpyh.rs1(i64, i64)
+define i64 @M2_mmpyh_rs1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyh.rs1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpywoh(r1:0, r3:2):<<1:rnd:sat
+
+; Vector multiply word by unsigned half (32x16)
+declare i64 @llvm.hexagon.M2.mmpyul.s0(i64, i64)
+define i64 @M2_mmpyul_s0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyul.s0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyweuh(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.M2.mmpyul.s1(i64, i64)
+define i64 @M2_mmpyul_s1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyul.s1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyweuh(r1:0, r3:2):<<1:sat
+
+declare i64 @llvm.hexagon.M2.mmpyuh.s0(i64, i64)
+define i64 @M2_mmpyuh_s0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyuh.s0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpywouh(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.M2.mmpyuh.s1(i64, i64)
+define i64 @M2_mmpyuh_s1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyuh.s1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpywouh(r1:0, r3:2):<<1:sat
+
+declare i64 @llvm.hexagon.M2.mmpyul.rs0(i64, i64)
+define i64 @M2_mmpyul_rs0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyul.rs0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyweuh(r1:0, r3:2):rnd:sat
+
+declare i64 @llvm.hexagon.M2.mmpyul.rs1(i64, i64)
+define i64 @M2_mmpyul_rs1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyul.rs1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyweuh(r1:0, r3:2):<<1:rnd:sat
+
+declare i64 @llvm.hexagon.M2.mmpyuh.rs0(i64, i64)
+define i64 @M2_mmpyuh_rs0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyuh.rs0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpywouh(r1:0, r3:2):rnd:sat
+
+declare i64 @llvm.hexagon.M2.mmpyuh.rs1(i64, i64)
+define i64 @M2_mmpyuh_rs1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.mmpyuh.rs1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpywouh(r1:0, r3:2):<<1:rnd:sat
+
+; Multiply signed halfwords
+declare i64 @llvm.hexagon.M2.mpyd.ll.s0(i32, i32)
+define i64 @M2_mpyd_ll_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.ll.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.l, r1.l)
+
+declare i64 @llvm.hexagon.M2.mpyd.ll.s1(i32, i32)
+define i64 @M2_mpyd_ll_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.ll.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.l, r1.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyd.lh.s0(i32, i32)
+define i64 @M2_mpyd_lh_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.lh.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.l, r1.h)
+
+declare i64 @llvm.hexagon.M2.mpyd.lh.s1(i32, i32)
+define i64 @M2_mpyd_lh_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.lh.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.l, r1.h):<<1
+
+declare i64 @llvm.hexagon.M2.mpyd.hl.s0(i32, i32)
+define i64 @M2_mpyd_hl_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.hl.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.h, r1.l)
+
+declare i64 @llvm.hexagon.M2.mpyd.hl.s1(i32, i32)
+define i64 @M2_mpyd_hl_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.hl.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.h, r1.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyd.hh.s0(i32, i32)
+define i64 @M2_mpyd_hh_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.hh.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.h, r1.h)
+
+declare i64 @llvm.hexagon.M2.mpyd.hh.s1(i32, i32)
+define i64 @M2_mpyd_hh_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.hh.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.h, r1.h):<<1
+
+declare i64 @llvm.hexagon.M2.mpyd.rnd.ll.s0(i32, i32)
+define i64 @M2_mpyd_rnd_ll_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.rnd.ll.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.l, r1.l):rnd
+
+declare i64 @llvm.hexagon.M2.mpyd.rnd.ll.s1(i32, i32)
+define i64 @M2_mpyd_rnd_ll_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.rnd.ll.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.l, r1.l):<<1:rnd
+
+declare i64 @llvm.hexagon.M2.mpyd.rnd.lh.s0(i32, i32)
+define i64 @M2_mpyd_rnd_lh_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.rnd.lh.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.l, r1.h):rnd
+
+declare i64 @llvm.hexagon.M2.mpyd.rnd.lh.s1(i32, i32)
+define i64 @M2_mpyd_rnd_lh_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.rnd.lh.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.l, r1.h):<<1:rnd
+
+declare i64 @llvm.hexagon.M2.mpyd.rnd.hl.s0(i32, i32)
+define i64 @M2_mpyd_rnd_hl_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.rnd.hl.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.h, r1.l):rnd
+
+declare i64 @llvm.hexagon.M2.mpyd.rnd.hl.s1(i32, i32)
+define i64 @M2_mpyd_rnd_hl_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.rnd.hl.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.h, r1.l):<<1:rnd
+
+declare i64 @llvm.hexagon.M2.mpyd.rnd.hh.s0(i32, i32)
+define i64 @M2_mpyd_rnd_hh_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.rnd.hh.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.h, r1.h):rnd
+
+declare i64 @llvm.hexagon.M2.mpyd.rnd.hh.s1(i32, i32)
+define i64 @M2_mpyd_rnd_hh_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.rnd.hh.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0.h, r1.h):<<1:rnd
+
+declare i64 @llvm.hexagon.M2.mpyd.acc.ll.s0(i64, i32, i32)
+define i64 @M2_mpyd_acc_ll_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.acc.ll.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpy(r2.l, r3.l)
+
+declare i64 @llvm.hexagon.M2.mpyd.acc.ll.s1(i64, i32, i32)
+define i64 @M2_mpyd_acc_ll_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.acc.ll.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpy(r2.l, r3.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyd.acc.lh.s0(i64, i32, i32)
+define i64 @M2_mpyd_acc_lh_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.acc.lh.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpy(r2.l, r3.h)
+
+declare i64 @llvm.hexagon.M2.mpyd.acc.lh.s1(i64, i32, i32)
+define i64 @M2_mpyd_acc_lh_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.acc.lh.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpy(r2.l, r3.h):<<1
+
+declare i64 @llvm.hexagon.M2.mpyd.acc.hl.s0(i64, i32, i32)
+define i64 @M2_mpyd_acc_hl_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.acc.hl.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpy(r2.h, r3.l)
+
+declare i64 @llvm.hexagon.M2.mpyd.acc.hl.s1(i64, i32, i32)
+define i64 @M2_mpyd_acc_hl_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.acc.hl.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpy(r2.h, r3.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyd.acc.hh.s0(i64, i32, i32)
+define i64 @M2_mpyd_acc_hh_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.acc.hh.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpy(r2.h, r3.h)
+
+declare i64 @llvm.hexagon.M2.mpyd.acc.hh.s1(i64, i32, i32)
+define i64 @M2_mpyd_acc_hh_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.acc.hh.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpy(r2.h, r3.h):<<1
+
+declare i64 @llvm.hexagon.M2.mpyd.nac.ll.s0(i64, i32, i32)
+define i64 @M2_mpyd_nac_ll_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.nac.ll.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpy(r2.l, r3.l)
+
+declare i64 @llvm.hexagon.M2.mpyd.nac.ll.s1(i64, i32, i32)
+define i64 @M2_mpyd_nac_ll_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.nac.ll.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpy(r2.l, r3.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyd.nac.lh.s0(i64, i32, i32)
+define i64 @M2_mpyd_nac_lh_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.nac.lh.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpy(r2.l, r3.h)
+
+declare i64 @llvm.hexagon.M2.mpyd.nac.lh.s1(i64, i32, i32)
+define i64 @M2_mpyd_nac_lh_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.nac.lh.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpy(r2.l, r3.h):<<1
+
+declare i64 @llvm.hexagon.M2.mpyd.nac.hl.s0(i64, i32, i32)
+define i64 @M2_mpyd_nac_hl_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.nac.hl.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpy(r2.h, r3.l)
+
+declare i64 @llvm.hexagon.M2.mpyd.nac.hl.s1(i64, i32, i32)
+define i64 @M2_mpyd_nac_hl_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.nac.hl.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpy(r2.h, r3.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyd.nac.hh.s0(i64, i32, i32)
+define i64 @M2_mpyd_nac_hh_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.nac.hh.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpy(r2.h, r3.h)
+
+declare i64 @llvm.hexagon.M2.mpyd.nac.hh.s1(i64, i32, i32)
+define i64 @M2_mpyd_nac_hh_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyd.nac.hh.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpy(r2.h, r3.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.ll.s0(i32, i32)
+define i32 @M2_mpy_ll_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.ll.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.l)
+
+declare i32 @llvm.hexagon.M2.mpy.ll.s1(i32, i32)
+define i32 @M2_mpy_ll_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.ll.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.lh.s0(i32, i32)
+define i32 @M2_mpy_lh_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.lh.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.h)
+
+declare i32 @llvm.hexagon.M2.mpy.lh.s1(i32, i32)
+define i32 @M2_mpy_lh_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.lh.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.hl.s0(i32, i32)
+define i32 @M2_mpy_hl_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.hl.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.l)
+
+declare i32 @llvm.hexagon.M2.mpy.hl.s1(i32, i32)
+define i32 @M2_mpy_hl_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.hl.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.hh.s0(i32, i32)
+define i32 @M2_mpy_hh_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.hh.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.h)
+
+declare i32 @llvm.hexagon.M2.mpy.hh.s1(i32, i32)
+define i32 @M2_mpy_hh_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.hh.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.sat.ll.s0(i32, i32)
+define i32 @M2_mpy_sat_ll_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.ll.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.l):sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.ll.s1(i32, i32)
+define i32 @M2_mpy_sat_ll_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.ll.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.l):<<1:sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.lh.s0(i32, i32)
+define i32 @M2_mpy_sat_lh_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.lh.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.h):sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.lh.s1(i32, i32)
+define i32 @M2_mpy_sat_lh_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.lh.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.h):<<1:sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.hl.s0(i32, i32)
+define i32 @M2_mpy_sat_hl_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.hl.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.l):sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.hl.s1(i32, i32)
+define i32 @M2_mpy_sat_hl_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.hl.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.l):<<1:sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.hh.s0(i32, i32)
+define i32 @M2_mpy_sat_hh_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.hh.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.h):sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.hh.s1(i32, i32)
+define i32 @M2_mpy_sat_hh_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.hh.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.h):<<1:sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.rnd.ll.s0(i32, i32)
+define i32 @M2_mpy_sat_rnd_ll_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.ll.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.l):rnd:sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.rnd.ll.s1(i32, i32)
+define i32 @M2_mpy_sat_rnd_ll_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.ll.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.l):<<1:rnd:sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.rnd.lh.s0(i32, i32)
+define i32 @M2_mpy_sat_rnd_lh_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.lh.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.h):rnd:sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.rnd.lh.s1(i32, i32)
+define i32 @M2_mpy_sat_rnd_lh_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.lh.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.l, r1.h):<<1:rnd:sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.rnd.hl.s0(i32, i32)
+define i32 @M2_mpy_sat_rnd_hl_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.hl.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.l):rnd:sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.rnd.hl.s1(i32, i32)
+define i32 @M2_mpy_sat_rnd_hl_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.hl.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.l):<<1:rnd:sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.rnd.hh.s0(i32, i32)
+define i32 @M2_mpy_sat_rnd_hh_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.hh.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.h):rnd:sat
+
+declare i32 @llvm.hexagon.M2.mpy.sat.rnd.hh.s1(i32, i32)
+define i32 @M2_mpy_sat_rnd_hh_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.hh.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0.h, r1.h):<<1:rnd:sat
+
+declare i32 @llvm.hexagon.M2.mpy.acc.ll.s0(i32, i32, i32)
+define i32 @M2_mpy_acc_ll_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.ll.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.l, r2.l)
+
+declare i32 @llvm.hexagon.M2.mpy.acc.ll.s1(i32, i32, i32)
+define i32 @M2_mpy_acc_ll_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.ll.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.l, r2.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.acc.lh.s0(i32, i32, i32)
+define i32 @M2_mpy_acc_lh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.lh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.l, r2.h)
+
+declare i32 @llvm.hexagon.M2.mpy.acc.lh.s1(i32, i32, i32)
+define i32 @M2_mpy_acc_lh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.lh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.l, r2.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.acc.hl.s0(i32, i32, i32)
+define i32 @M2_mpy_acc_hl_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.hl.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.h, r2.l)
+
+declare i32 @llvm.hexagon.M2.mpy.acc.hl.s1(i32, i32, i32)
+define i32 @M2_mpy_acc_hl_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.hl.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.h, r2.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.acc.hh.s0(i32, i32, i32)
+define i32 @M2_mpy_acc_hh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.hh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.h, r2.h)
+
+declare i32 @llvm.hexagon.M2.mpy.acc.hh.s1(i32, i32, i32)
+define i32 @M2_mpy_acc_hh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.hh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.h, r2.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32, i32, i32)
+define i32 @M2_mpy_acc_sat_ll_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.l, r2.l):sat
+
+declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s1(i32, i32, i32)
+define i32 @M2_mpy_acc_sat_ll_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.l, r2.l):<<1:sat
+
+declare i32 @llvm.hexagon.M2.mpy.acc.sat.lh.s0(i32, i32, i32)
+define i32 @M2_mpy_acc_sat_lh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.lh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.l, r2.h):sat
+
+declare i32 @llvm.hexagon.M2.mpy.acc.sat.lh.s1(i32, i32, i32)
+define i32 @M2_mpy_acc_sat_lh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.lh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.l, r2.h):<<1:sat
+
+declare i32 @llvm.hexagon.M2.mpy.acc.sat.hl.s0(i32, i32, i32)
+define i32 @M2_mpy_acc_sat_hl_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.hl.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.h, r2.l):sat
+
+declare i32 @llvm.hexagon.M2.mpy.acc.sat.hl.s1(i32, i32, i32)
+define i32 @M2_mpy_acc_sat_hl_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.hl.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.h, r2.l):<<1:sat
+
+declare i32 @llvm.hexagon.M2.mpy.acc.sat.hh.s0(i32, i32, i32)
+define i32 @M2_mpy_acc_sat_hh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.hh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.h, r2.h):sat
+
+declare i32 @llvm.hexagon.M2.mpy.acc.sat.hh.s1(i32, i32, i32)
+define i32 @M2_mpy_acc_sat_hh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.hh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1.h, r2.h):<<1:sat
+
+declare i32 @llvm.hexagon.M2.mpy.nac.ll.s0(i32, i32, i32)
+define i32 @M2_mpy_nac_ll_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.ll.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.l, r2.l)
+
+declare i32 @llvm.hexagon.M2.mpy.nac.ll.s1(i32, i32, i32)
+define i32 @M2_mpy_nac_ll_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.ll.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.l, r2.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.nac.lh.s0(i32, i32, i32)
+define i32 @M2_mpy_nac_lh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.lh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.l, r2.h)
+
+declare i32 @llvm.hexagon.M2.mpy.nac.lh.s1(i32, i32, i32)
+define i32 @M2_mpy_nac_lh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.lh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.l, r2.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.nac.hl.s0(i32, i32, i32)
+define i32 @M2_mpy_nac_hl_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.hl.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.h, r2.l)
+
+declare i32 @llvm.hexagon.M2.mpy.nac.hl.s1(i32, i32, i32)
+define i32 @M2_mpy_nac_hl_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.hl.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.h, r2.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.nac.hh.s0(i32, i32, i32)
+define i32 @M2_mpy_nac_hh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.hh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.h, r2.h)
+
+declare i32 @llvm.hexagon.M2.mpy.nac.hh.s1(i32, i32, i32)
+define i32 @M2_mpy_nac_hh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.hh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.h, r2.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpy.nac.sat.ll.s0(i32, i32, i32)
+define i32 @M2_mpy_nac_sat_ll_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.ll.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.l, r2.l):sat
+
+declare i32 @llvm.hexagon.M2.mpy.nac.sat.ll.s1(i32, i32, i32)
+define i32 @M2_mpy_nac_sat_ll_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.ll.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.l, r2.l):<<1:sat
+
+declare i32 @llvm.hexagon.M2.mpy.nac.sat.lh.s0(i32, i32, i32)
+define i32 @M2_mpy_nac_sat_lh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.lh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.l, r2.h):sat
+
+declare i32 @llvm.hexagon.M2.mpy.nac.sat.lh.s1(i32, i32, i32)
+define i32 @M2_mpy_nac_sat_lh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.lh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.l, r2.h):<<1:sat
+
+declare i32 @llvm.hexagon.M2.mpy.nac.sat.hl.s0(i32, i32, i32)
+define i32 @M2_mpy_nac_sat_hl_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.hl.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.h, r2.l):sat
+
+declare i32 @llvm.hexagon.M2.mpy.nac.sat.hl.s1(i32, i32, i32)
+define i32 @M2_mpy_nac_sat_hl_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.hl.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.h, r2.l):<<1:sat
+
+declare i32 @llvm.hexagon.M2.mpy.nac.sat.hh.s0(i32, i32, i32)
+define i32 @M2_mpy_nac_sat_hh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.hh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.h, r2.h):sat
+
+declare i32 @llvm.hexagon.M2.mpy.nac.sat.hh.s1(i32, i32, i32)
+define i32 @M2_mpy_nac_sat_hh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.hh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1.h, r2.h):<<1:sat
+
+; Multiply unsigned halfwords
+declare i64 @llvm.hexagon.M2.mpyud.ll.s0(i32, i32)
+define i64 @M2_mpyud_ll_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.ll.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpyu(r0.l, r1.l)
+
+declare i64 @llvm.hexagon.M2.mpyud.ll.s1(i32, i32)
+define i64 @M2_mpyud_ll_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.ll.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpyu(r0.l, r1.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyud.lh.s0(i32, i32)
+define i64 @M2_mpyud_lh_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.lh.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpyu(r0.l, r1.h)
+
+declare i64 @llvm.hexagon.M2.mpyud.lh.s1(i32, i32)
+define i64 @M2_mpyud_lh_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.lh.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpyu(r0.l, r1.h):<<1
+
+declare i64 @llvm.hexagon.M2.mpyud.hl.s0(i32, i32)
+define i64 @M2_mpyud_hl_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.hl.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpyu(r0.h, r1.l)
+
+declare i64 @llvm.hexagon.M2.mpyud.hl.s1(i32, i32)
+define i64 @M2_mpyud_hl_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.hl.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpyu(r0.h, r1.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyud.hh.s0(i32, i32)
+define i64 @M2_mpyud_hh_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.hh.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpyu(r0.h, r1.h)
+
+declare i64 @llvm.hexagon.M2.mpyud.hh.s1(i32, i32)
+define i64 @M2_mpyud_hh_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.hh.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpyu(r0.h, r1.h):<<1
+
+declare i64 @llvm.hexagon.M2.mpyud.acc.ll.s0(i64, i32, i32)
+define i64 @M2_mpyud_acc_ll_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.acc.ll.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpyu(r2.l, r3.l)
+
+declare i64 @llvm.hexagon.M2.mpyud.acc.ll.s1(i64, i32, i32)
+define i64 @M2_mpyud_acc_ll_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.acc.ll.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpyu(r2.l, r3.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyud.acc.lh.s0(i64, i32, i32)
+define i64 @M2_mpyud_acc_lh_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.acc.lh.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpyu(r2.l, r3.h)
+
+declare i64 @llvm.hexagon.M2.mpyud.acc.lh.s1(i64, i32, i32)
+define i64 @M2_mpyud_acc_lh_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.acc.lh.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpyu(r2.l, r3.h):<<1
+
+declare i64 @llvm.hexagon.M2.mpyud.acc.hl.s0(i64, i32, i32)
+define i64 @M2_mpyud_acc_hl_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.acc.hl.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpyu(r2.h, r3.l)
+
+declare i64 @llvm.hexagon.M2.mpyud.acc.hl.s1(i64, i32, i32)
+define i64 @M2_mpyud_acc_hl_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.acc.hl.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpyu(r2.h, r3.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyud.acc.hh.s0(i64, i32, i32)
+define i64 @M2_mpyud_acc_hh_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.acc.hh.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpyu(r2.h, r3.h)
+
+declare i64 @llvm.hexagon.M2.mpyud.acc.hh.s1(i64, i32, i32)
+define i64 @M2_mpyud_acc_hh_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.acc.hh.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpyu(r2.h, r3.h):<<1
+
+declare i64 @llvm.hexagon.M2.mpyud.nac.ll.s0(i64, i32, i32)
+define i64 @M2_mpyud_nac_ll_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.nac.ll.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpyu(r2.l, r3.l)
+
+declare i64 @llvm.hexagon.M2.mpyud.nac.ll.s1(i64, i32, i32)
+define i64 @M2_mpyud_nac_ll_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.nac.ll.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpyu(r2.l, r3.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyud.nac.lh.s0(i64, i32, i32)
+define i64 @M2_mpyud_nac_lh_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.nac.lh.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpyu(r2.l, r3.h)
+
+declare i64 @llvm.hexagon.M2.mpyud.nac.lh.s1(i64, i32, i32)
+define i64 @M2_mpyud_nac_lh_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.nac.lh.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpyu(r2.l, r3.h):<<1
+
+declare i64 @llvm.hexagon.M2.mpyud.nac.hl.s0(i64, i32, i32)
+define i64 @M2_mpyud_nac_hl_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.nac.hl.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpyu(r2.h, r3.l)
+
+declare i64 @llvm.hexagon.M2.mpyud.nac.hl.s1(i64, i32, i32)
+define i64 @M2_mpyud_nac_hl_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.nac.hl.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpyu(r2.h, r3.l):<<1
+
+declare i64 @llvm.hexagon.M2.mpyud.nac.hh.s0(i64, i32, i32)
+define i64 @M2_mpyud_nac_hh_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.nac.hh.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpyu(r2.h, r3.h)
+
+declare i64 @llvm.hexagon.M2.mpyud.nac.hh.s1(i64, i32, i32)
+define i64 @M2_mpyud_nac_hh_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.mpyud.nac.hh.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpyu(r2.h, r3.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.ll.s0(i32, i32)
+define i32 @M2_mpyu_ll_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.ll.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpyu(r0.l, r1.l)
+
+declare i32 @llvm.hexagon.M2.mpyu.ll.s1(i32, i32)
+define i32 @M2_mpyu_ll_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.ll.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpyu(r0.l, r1.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.lh.s0(i32, i32)
+define i32 @M2_mpyu_lh_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.lh.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpyu(r0.l, r1.h)
+
+declare i32 @llvm.hexagon.M2.mpyu.lh.s1(i32, i32)
+define i32 @M2_mpyu_lh_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.lh.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpyu(r0.l, r1.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.hl.s0(i32, i32)
+define i32 @M2_mpyu_hl_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.hl.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpyu(r0.h, r1.l)
+
+declare i32 @llvm.hexagon.M2.mpyu.hl.s1(i32, i32)
+define i32 @M2_mpyu_hl_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.hl.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpyu(r0.h, r1.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.hh.s0(i32, i32)
+define i32 @M2_mpyu_hh_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.hh.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpyu(r0.h, r1.h)
+
+declare i32 @llvm.hexagon.M2.mpyu.hh.s1(i32, i32)
+define i32 @M2_mpyu_hh_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.hh.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpyu(r0.h, r1.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.acc.ll.s0(i32, i32, i32)
+define i32 @M2_mpyu_acc_ll_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.acc.ll.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpyu(r1.l, r2.l)
+
+declare i32 @llvm.hexagon.M2.mpyu.acc.ll.s1(i32, i32, i32)
+define i32 @M2_mpyu_acc_ll_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.acc.ll.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpyu(r1.l, r2.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.acc.lh.s0(i32, i32, i32)
+define i32 @M2_mpyu_acc_lh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.acc.lh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpyu(r1.l, r2.h)
+
+declare i32 @llvm.hexagon.M2.mpyu.acc.lh.s1(i32, i32, i32)
+define i32 @M2_mpyu_acc_lh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.acc.lh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpyu(r1.l, r2.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.acc.hl.s0(i32, i32, i32)
+define i32 @M2_mpyu_acc_hl_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.acc.hl.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpyu(r1.h, r2.l)
+
+declare i32 @llvm.hexagon.M2.mpyu.acc.hl.s1(i32, i32, i32)
+define i32 @M2_mpyu_acc_hl_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.acc.hl.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpyu(r1.h, r2.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.acc.hh.s0(i32, i32, i32)
+define i32 @M2_mpyu_acc_hh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.acc.hh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpyu(r1.h, r2.h)
+
+declare i32 @llvm.hexagon.M2.mpyu.acc.hh.s1(i32, i32, i32)
+define i32 @M2_mpyu_acc_hh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.acc.hh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpyu(r1.h, r2.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.nac.ll.s0(i32, i32, i32)
+define i32 @M2_mpyu_nac_ll_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.nac.ll.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpyu(r1.l, r2.l)
+
+declare i32 @llvm.hexagon.M2.mpyu.nac.ll.s1(i32, i32, i32)
+define i32 @M2_mpyu_nac_ll_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.nac.ll.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpyu(r1.l, r2.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.nac.lh.s0(i32, i32, i32)
+define i32 @M2_mpyu_nac_lh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.nac.lh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpyu(r1.l, r2.h)
+
+declare i32 @llvm.hexagon.M2.mpyu.nac.lh.s1(i32, i32, i32)
+define i32 @M2_mpyu_nac_lh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.nac.lh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpyu(r1.l, r2.h):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.nac.hl.s0(i32, i32, i32)
+define i32 @M2_mpyu_nac_hl_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.nac.hl.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpyu(r1.h, r2.l)
+
+declare i32 @llvm.hexagon.M2.mpyu.nac.hl.s1(i32, i32, i32)
+define i32 @M2_mpyu_nac_hl_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.nac.hl.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpyu(r1.h, r2.l):<<1
+
+declare i32 @llvm.hexagon.M2.mpyu.nac.hh.s0(i32, i32, i32)
+define i32 @M2_mpyu_nac_hh_s0(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.nac.hh.s0(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpyu(r1.h, r2.h)
+
+declare i32 @llvm.hexagon.M2.mpyu.nac.hh.s1(i32, i32, i32)
+define i32 @M2_mpyu_nac_hh_s1(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.nac.hh.s1(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpyu(r1.h, r2.h):<<1
+
+; Polynomial multiply words
+declare i64 @llvm.hexagon.M4.pmpyw(i32, i32)
+define i64 @M4_pmpyw(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M4.pmpyw(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = pmpyw(r0, r1)
+
+declare i64 @llvm.hexagon.M4.pmpyw.acc(i64, i32, i32)
+define i64 @M4_pmpyw_acc(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M4.pmpyw.acc(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 ^= pmpyw(r2, r3)
+
+; Vector reduce multiply word by signed half
+declare i64 @llvm.hexagon.M4.vrmpyoh.s0(i64, i64)
+define i64 @M4_vrmpyoh_s0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M4.vrmpyoh.s0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrmpywoh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.M4.vrmpyoh.s1(i64, i64)
+define i64 @M4_vrmpyoh_s1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M4.vrmpyoh.s1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrmpywoh(r1:0, r3:2):<<1
+
+declare i64 @llvm.hexagon.M4.vrmpyeh.s0(i64, i64)
+define i64 @M4_vrmpyeh_s0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M4.vrmpyeh.s0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrmpyweh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.M4.vrmpyeh.s1(i64, i64)
+define i64 @M4_vrmpyeh_s1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M4.vrmpyeh.s1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrmpyweh(r1:0, r3:2):<<1
+
+declare i64 @llvm.hexagon.M4.vrmpyoh.acc.s0(i64, i64, i64)
+define i64 @M4_vrmpyoh_acc_s0(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M4.vrmpyoh.acc.s0(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrmpywoh(r3:2, r5:4)
+
+declare i64 @llvm.hexagon.M4.vrmpyoh.acc.s1(i64, i64, i64)
+define i64 @M4_vrmpyoh_acc_s1(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M4.vrmpyoh.acc.s1(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrmpywoh(r3:2, r5:4):<<1
+
+declare i64 @llvm.hexagon.M4.vrmpyeh.acc.s0(i64, i64, i64)
+define i64 @M4_vrmpyeh_acc_s0(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M4.vrmpyeh.acc.s0(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrmpyweh(r3:2, r5:4)
+
+declare i64 @llvm.hexagon.M4.vrmpyeh.acc.s1(i64, i64, i64)
+define i64 @M4_vrmpyeh_acc_s1(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M4.vrmpyeh.acc.s1(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrmpyweh(r3:2, r5:4):<<1
+
+; Multiply and use upper result
+declare i32 @llvm.hexagon.M2.dpmpyss.rnd.s0(i32, i32)
+define i32 @M2_dpmpyss_rnd_s0(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.dpmpyss.rnd.s0(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0, r1):rnd
+
+declare i32 @llvm.hexagon.M2.mpyu.up(i32, i32)
+define i32 @M2_mpyu_up(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpyu.up(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpyu(r0, r1)
+
+declare i32 @llvm.hexagon.M2.mpysu.up(i32, i32)
+define i32 @M2_mpysu_up(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpysu.up(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpysu(r0, r1)
+
+declare i32 @llvm.hexagon.M2.hmmpyh.s1(i32, i32)
+define i32 @M2_hmmpyh_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.hmmpyh.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0, r1.h):<<1:sat
+
+declare i32 @llvm.hexagon.M2.hmmpyl.s1(i32, i32)
+define i32 @M2_hmmpyl_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.hmmpyl.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0, r1.l):<<1:sat
+
+declare i32 @llvm.hexagon.M2.hmmpyh.rs1(i32, i32)
+define i32 @M2_hmmpyh_rs1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.hmmpyh.rs1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0, r1.h):<<1:rnd:sat
+
+declare i32 @llvm.hexagon.M2.mpy.up.s1.sat(i32, i32)
+define i32 @M2_mpy_up_s1_sat(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.up.s1.sat(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0, r1):<<1:sat
+
+declare i32 @llvm.hexagon.M2.hmmpyl.rs1(i32, i32)
+define i32 @M2_hmmpyl_rs1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.hmmpyl.rs1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0, r1.l):<<1:rnd:sat
+
+declare i32 @llvm.hexagon.M2.mpy.up(i32, i32)
+define i32 @M2_mpy_up(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.up(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0, r1)
+
+declare i32 @llvm.hexagon.M2.mpy.up.s1(i32, i32)
+define i32 @M2_mpy_up_s1(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.M2.mpy.up.s1(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = mpy(r0, r1):<<1
+
+declare i32 @llvm.hexagon.M4.mac.up.s1.sat(i32, i32, i32)
+define i32 @M4_mac_up_s1_sat(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.mac.up.s1.sat(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += mpy(r1, r2):<<1:sat
+
+declare i32 @llvm.hexagon.M4.nac.up.s1.sat(i32, i32, i32)
+define i32 @M4_nac_up_s1_sat(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.M4.nac.up.s1.sat(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= mpy(r1, r2):<<1:sat
+
+; Multiply and use full result
+declare i64 @llvm.hexagon.M2.dpmpyss.s0(i32, i32)
+define i64 @M2_dpmpyss_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.dpmpyss.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpy(r0, r1)
+
+declare i64 @llvm.hexagon.M2.dpmpyuu.s0(i32, i32)
+define i64 @M2_dpmpyuu_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.dpmpyuu.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = mpyu(r0, r1)
+
+declare i64 @llvm.hexagon.M2.dpmpyss.acc.s0(i64, i32, i32)
+define i64 @M2_dpmpyss_acc_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.dpmpyss.acc.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpy(r2, r3)
+
+declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32)
+define i64 @M2_dpmpyss_nac_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpy(r2, r3)
+
+declare i64 @llvm.hexagon.M2.dpmpyuu.acc.s0(i64, i32, i32)
+define i64 @M2_dpmpyuu_acc_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.dpmpyuu.acc.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += mpyu(r2, r3)
+
+declare i64 @llvm.hexagon.M2.dpmpyuu.nac.s0(i64, i32, i32)
+define i64 @M2_dpmpyuu_nac_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.dpmpyuu.nac.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= mpyu(r2, r3)
+
+; Vector dual multiply
+declare i64 @llvm.hexagon.M2.vdmpys.s0(i64, i64)
+define i64 @M2_vdmpys_s0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vdmpys.s0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vdmpy(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.M2.vdmpys.s1(i64, i64)
+define i64 @M2_vdmpys_s1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vdmpys.s1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vdmpy(r1:0, r3:2):<<1:sat
+
+; Vector reduce multiply bytes
+declare i64 @llvm.hexagon.M5.vrmpybuu(i64, i64)
+define i64 @M5_vrmpybuu(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M5.vrmpybuu(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrmpybu(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.M5.vrmpybsu(i64, i64)
+define i64 @M5_vrmpybsu(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M5.vrmpybsu(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrmpybsu(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.M5.vrmacbuu(i64, i64, i64)
+define i64 @M5_vrmacbuu(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M5.vrmacbuu(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrmpybu(r3:2, r5:4)
+
+declare i64 @llvm.hexagon.M5.vrmacbsu(i64, i64, i64)
+define i64 @M5_vrmacbsu(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M5.vrmacbsu(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrmpybsu(r3:2, r5:4)
+
+; Vector dual multiply signed by unsigned bytes
+declare i64 @llvm.hexagon.M5.vdmpybsu(i64, i64)
+define i64 @M5_vdmpybsu(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M5.vdmpybsu(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vdmpybsu(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.M5.vdmacbsu(i64, i64, i64)
+define i64 @M5_vdmacbsu(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M5.vdmacbsu(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vdmpybsu(r3:2, r5:4):sat
+
+; Vector multiply even halfwords
+declare i64 @llvm.hexagon.M2.vmpy2es.s0(i64, i64)
+define i64 @M2_vmpy2es_s0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vmpy2es.s0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyeh(r1:0, r3:2):sat
+
+declare i64 @llvm.hexagon.M2.vmpy2es.s1(i64, i64)
+define i64 @M2_vmpy2es_s1(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vmpy2es.s1(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyeh(r1:0, r3:2):<<1:sat
+
+declare i64 @llvm.hexagon.M2.vmac2es(i64, i64, i64)
+define i64 @M2_vmac2es(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M2.vmac2es(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vmpyeh(r3:2, r5:4)
+
+declare i64 @llvm.hexagon.M2.vmac2es.s0(i64, i64, i64)
+define i64 @M2_vmac2es_s0(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M2.vmac2es.s0(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vmpyeh(r3:2, r5:4):sat
+
+declare i64 @llvm.hexagon.M2.vmac2es.s1(i64, i64, i64)
+define i64 @M2_vmac2es_s1(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M2.vmac2es.s1(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vmpyeh(r3:2, r5:4):<<1:sat
+
+; Vector multiply halfwords
+declare i64 @llvm.hexagon.M2.vmpy2s.s0(i32, i32)
+define i64 @M2_vmpy2s_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.vmpy2s.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyh(r0, r1):sat
+
+declare i64 @llvm.hexagon.M2.vmpy2s.s1(i32, i32)
+define i64 @M2_vmpy2s_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.vmpy2s.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyh(r0, r1):<<1:sat
+
+declare i64 @llvm.hexagon.M2.vmac2(i64, i32, i32)
+define i64 @M2_vmac2(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.vmac2(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vmpyh(r2, r3)
+
+declare i64 @llvm.hexagon.M2.vmac2s.s0(i64, i32, i32)
+define i64 @M2_vmac2s_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.vmac2s.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vmpyh(r2, r3):sat
+
+declare i64 @llvm.hexagon.M2.vmac2s.s1(i64, i32, i32)
+define i64 @M2_vmac2s_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.vmac2s.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vmpyh(r2, r3):<<1:sat
+
+; Vector multiply halfwords signed by unsigned
+declare i64 @llvm.hexagon.M2.vmpy2su.s0(i32, i32)
+define i64 @M2_vmpy2su_s0(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.vmpy2su.s0(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyhsu(r0, r1):sat
+
+declare i64 @llvm.hexagon.M2.vmpy2su.s1(i32, i32)
+define i64 @M2_vmpy2su_s1(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M2.vmpy2su.s1(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpyhsu(r0, r1):<<1:sat
+
+declare i64 @llvm.hexagon.M2.vmac2su.s0(i64, i32, i32)
+define i64 @M2_vmac2su_s0(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.vmac2su.s0(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vmpyhsu(r2, r3):sat
+
+declare i64 @llvm.hexagon.M2.vmac2su.s1(i64, i32, i32)
+define i64 @M2_vmac2su_s1(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M2.vmac2su.s1(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vmpyhsu(r2, r3):<<1:sat
+
+; Vector reduce multiply halfwords
+declare i64 @llvm.hexagon.M2.vrmpy.s0(i64, i64)
+define i64 @M2_vrmpy_s0(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.M2.vrmpy.s0(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vrmpyh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.M2.vrmac.s0(i64, i64, i64)
+define i64 @M2_vrmac_s0(i64 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.M2.vrmac.s0(i64 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vrmpyh(r3:2, r5:4)
+
+; Vector multiply bytes
+declare i64 @llvm.hexagon.M5.vmpybsu(i32, i32)
+define i64 @M2_vmpybsu(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M5.vmpybsu(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpybsu(r0, r1)
+
+declare i64 @llvm.hexagon.M5.vmpybuu(i32, i32)
+define i64 @M2_vmpybuu(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M5.vmpybuu(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vmpybu(r0, r1)
+
+declare i64 @llvm.hexagon.M5.vmacbuu(i64, i32, i32)
+define i64 @M2_vmacbuu(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M5.vmacbuu(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vmpybu(r2, r3)
+
+declare i64 @llvm.hexagon.M5.vmacbsu(i64, i32, i32)
+define i64 @M2_vmacbsu(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M5.vmacbsu(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += vmpybsu(r2, r3)
+
+; Vector polynomial multiply halfwords
+declare i64 @llvm.hexagon.M4.vpmpyh(i32, i32)
+define i64 @M4_vpmpyh(i32 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.M4.vpmpyh(i32 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vpmpyh(r0, r1)
+
+declare i64 @llvm.hexagon.M4.vpmpyh.acc(i64, i32, i32)
+define i64 @M4_vpmpyh_acc(i64 %a, i32 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.M4.vpmpyh.acc(i64 %a, i32 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 ^= vpmpyh(r2, r3)
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_perm.ll b/test/CodeGen/Hexagon/intrinsics/xtype_perm.ll
new file mode 100644
index 0000000..0b76132
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_perm.ll
@@ -0,0 +1,252 @@
+; RUN: llc -march=hexagon -O0 < %s | FileCheck %s
+; Hexagon Programmer's Reference Manual 11.10.6 XTYPE/PERM
+
+; Saturate
+declare i32 @llvm.hexagon.A2.sat(i64)
+define i32 @A2_sat(i64 %a) {
+  %z = call i32 @llvm.hexagon.A2.sat(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = sat(r1:0)
+
+declare i32 @llvm.hexagon.A2.sath(i32)
+define i32 @A2_sath(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.sath(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = sath(r0)
+
+declare i32 @llvm.hexagon.A2.satuh(i32)
+define i32 @A2_satuh(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.satuh(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = satuh(r0)
+
+declare i32 @llvm.hexagon.A2.satub(i32)
+define i32 @A2_satub(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.satub(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = satub(r0)
+
+declare i32 @llvm.hexagon.A2.satb(i32)
+define i32 @A2_satb(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.satb(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = satb(r0)
+
+; Swizzle bytes
+declare i32 @llvm.hexagon.A2.swiz(i32)
+define i32 @A2_swiz(i32 %a) {
+  %z = call i32 @llvm.hexagon.A2.swiz(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = swiz(r0)
+
+; Vector round and pack
+declare i32 @llvm.hexagon.S2.vrndpackwh(i64)
+define i32 @S2_vrndpackwh(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.vrndpackwh(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = vrndwh(r1:0)
+
+declare i32 @llvm.hexagon.S2.vrndpackwhs(i64)
+define i32 @S2_vrndpackwhs(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.vrndpackwhs(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = vrndwh(r1:0):sat
+
+; Vector saturate and pack
+declare i32 @llvm.hexagon.S2.vsathub(i64)
+define i32 @S2_vsathub(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.vsathub(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = vsathub(r1:0)
+
+declare i32 @llvm.hexagon.S2.vsatwh(i64)
+define i32 @S2_vsatwh(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.vsatwh(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = vsatwh(r1:0)
+
+declare i32 @llvm.hexagon.S2.vsatwuh(i64)
+define i32 @S2_vsatwuh(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.vsatwuh(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = vsatwuh(r1:0)
+
+declare i32 @llvm.hexagon.S2.vsathb(i64)
+define i32 @S2_vsathb(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.vsathb(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = vsathb(r1:0)
+
+declare i32 @llvm.hexagon.S2.svsathb(i32)
+define i32 @S2_svsathb(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.svsathb(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = vsathb(r0)
+
+declare i32 @llvm.hexagon.S2.svsathub(i32)
+define i32 @S2_svsathub(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.svsathub(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = vsathub(r0)
+
+; Vector saturate without pack
+declare i64 @llvm.hexagon.S2.vsathub.nopack(i64)
+define i64 @S2_vsathub_nopack(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.vsathub.nopack(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = vsathub(r1:0)
+
+declare i64 @llvm.hexagon.S2.vsatwuh.nopack(i64)
+define i64 @S2_vsatwuh_nopack(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.vsatwuh.nopack(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = vsatwuh(r1:0)
+
+declare i64 @llvm.hexagon.S2.vsatwh.nopack(i64)
+define i64 @S2_vsatwh_nopack(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.vsatwh.nopack(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = vsatwh(r1:0)
+
+declare i64 @llvm.hexagon.S2.vsathb.nopack(i64)
+define i64 @S2_vsathb_nopack(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.vsathb.nopack(i64 %a)
+  ret i64 %z
+}
+; CHECK: r1:0 = vsathb(r1:0)
+
+; Vector shuffle
+declare i64 @llvm.hexagon.S2.shuffeb(i64, i64)
+define i64 @S2_shuffeb(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.shuffeb(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = shuffeb(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.S2.shuffob(i64, i64)
+define i64 @S2_shuffob(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.shuffob(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = shuffob(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.S2.shuffeh(i64, i64)
+define i64 @S2_shuffeh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.shuffeh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = shuffeh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.S2.shuffoh(i64, i64)
+define i64 @S2_shuffoh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.shuffoh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = shuffoh(r1:0, r3:2)
+
+; Vector splat bytes
+declare i32 @llvm.hexagon.S2.vsplatrb(i32)
+define i32 @S2_vsplatrb(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.vsplatrb(i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = vsplatb(r0)
+
+; Vector splat halfwords
+declare i64 @llvm.hexagon.S2.vsplatrh(i32)
+define i64 @S2_vsplatrh(i32 %a) {
+  %z = call i64 @llvm.hexagon.S2.vsplatrh(i32 %a)
+  ret i64 %z
+}
+; CHECK:  = vsplath(r0)
+
+; Vector splice
+declare i64 @llvm.hexagon.S2.vspliceib(i64, i64, i32)
+define i64 @S2_vspliceib(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.vspliceib(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = vspliceb(r1:0, r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.vsplicerb(i64, i64, i32)
+define i64 @S2_vsplicerb(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.vsplicerb(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 = vspliceb(r1:0, r3:2, p0)
+
+; Vector sign extend
+declare i64 @llvm.hexagon.S2.vsxtbh(i32)
+define i64 @S2_vsxtbh(i32 %a) {
+  %z = call i64 @llvm.hexagon.S2.vsxtbh(i32 %a)
+  ret i64 %z
+}
+; CHECK:  = vsxtbh(r0)
+
+declare i64 @llvm.hexagon.S2.vsxthw(i32)
+define i64 @S2_vsxthw(i32 %a) {
+  %z = call i64 @llvm.hexagon.S2.vsxthw(i32 %a)
+  ret i64 %z
+}
+; CHECK:  = vsxthw(r0)
+
+; Vector truncate
+declare i32 @llvm.hexagon.S2.vtrunohb(i64)
+define i32 @S2_vtrunohb(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.vtrunohb(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = vtrunohb(r1:0)
+
+declare i32 @llvm.hexagon.S2.vtrunehb(i64)
+define i32 @S2_vtrunehb(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.vtrunehb(i64 %a)
+  ret i32 %z
+}
+; CHECK: r0 = vtrunehb(r1:0)
+
+declare i64 @llvm.hexagon.S2.vtrunowh(i64, i64)
+define i64 @S2_vtrunowh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.vtrunowh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vtrunowh(r1:0, r3:2)
+
+declare i64 @llvm.hexagon.S2.vtrunewh(i64, i64)
+define i64 @S2_vtrunewh(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.vtrunewh(i64 %a, i64 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vtrunewh(r1:0, r3:2)
+
+; Vector zero extend
+declare i64 @llvm.hexagon.S2.vzxtbh(i32)
+define i64 @S2_vzxtbh(i32 %a) {
+  %z = call i64 @llvm.hexagon.S2.vzxtbh(i32 %a)
+  ret i64 %z
+}
+; CHECK:  = vzxtbh(r0)
+
+declare i64 @llvm.hexagon.S2.vzxthw(i32)
+define i64 @S2_vzxthw(i32 %a) {
+  %z = call i64 @llvm.hexagon.S2.vzxthw(i32 %a)
+  ret i64 %z
+}
+; CHECK:  = vzxthw(r0)
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_pred.ll b/test/CodeGen/Hexagon/intrinsics/xtype_pred.ll
new file mode 100644
index 0000000..96e63d8
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_pred.ll
@@ -0,0 +1,351 @@
+; RUN: llc -march=hexagon -O0 < %s | FileCheck %s
+; Hexagon Programmer's Reference Manual 11.10.7 XTYPE/PRED
+
+; Compare byte
+declare i32 @llvm.hexagon.A4.cmpbgt(i32, i32)
+define i32 @A4_cmpbgt(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.cmpbgt(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = cmpb.gt(r0, r1)
+
+declare i32 @llvm.hexagon.A4.cmpbeq(i32, i32)
+define i32 @A4_cmpbeq(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.cmpbeq(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = cmpb.eq(r0, r1)
+
+declare i32 @llvm.hexagon.A4.cmpbgtu(i32, i32)
+define i32 @A4_cmpbgtu(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.cmpbgtu(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = cmpb.gtu(r0, r1)
+
+declare i32 @llvm.hexagon.A4.cmpbgti(i32, i32)
+define i32 @A4_cmpbgti(i32 %a) {
+  %z = call i32 @llvm.hexagon.A4.cmpbgti(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = cmpb.gt(r0, #0)
+
+declare i32 @llvm.hexagon.A4.cmpbeqi(i32, i32)
+define i32 @A4_cmpbeqi(i32 %a) {
+  %z = call i32 @llvm.hexagon.A4.cmpbeqi(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = cmpb.eq(r0, #0)
+
+declare i32 @llvm.hexagon.A4.cmpbgtui(i32, i32)
+define i32 @A4_cmpbgtui(i32 %a) {
+  %z = call i32 @llvm.hexagon.A4.cmpbgtui(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = cmpb.gtu(r0, #0)
+
+; Compare half
+declare i32 @llvm.hexagon.A4.cmphgt(i32, i32)
+define i32 @A4_cmphgt(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.cmphgt(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = cmph.gt(r0, r1)
+
+declare i32 @llvm.hexagon.A4.cmpheq(i32, i32)
+define i32 @A4_cmpheq(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.cmpheq(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = cmph.eq(r0, r1)
+
+declare i32 @llvm.hexagon.A4.cmphgtu(i32, i32)
+define i32 @A4_cmphgtu(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.cmphgtu(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = cmph.gtu(r0, r1)
+
+declare i32 @llvm.hexagon.A4.cmphgti(i32, i32)
+define i32 @A4_cmphgti(i32 %a) {
+  %z = call i32 @llvm.hexagon.A4.cmphgti(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = cmph.gt(r0, #0)
+
+declare i32 @llvm.hexagon.A4.cmpheqi(i32, i32)
+define i32 @A4_cmpheqi(i32 %a) {
+  %z = call i32 @llvm.hexagon.A4.cmpheqi(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = cmph.eq(r0, #0)
+
+declare i32 @llvm.hexagon.A4.cmphgtui(i32, i32)
+define i32 @A4_cmphgtui(i32 %a) {
+  %z = call i32 @llvm.hexagon.A4.cmphgtui(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = cmph.gtu(r0, #0)
+
+; Compare doublewords
+declare i32 @llvm.hexagon.C2.cmpgtp(i64, i64)
+define i32 @C2_cmpgtp(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.C2.cmpgtp(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = cmp.gt(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.C2.cmpeqp(i64, i64)
+define i32 @C2_cmpeqp(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.C2.cmpeqp(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = cmp.eq(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.C2.cmpgtup(i64, i64)
+define i32 @C2_cmpgtup(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.C2.cmpgtup(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = cmp.gtu(r1:0, r3:2)
+
+; Compare bitmask
+declare i32 @llvm.hexagon.C2.bitsclri(i32, i32)
+define i32 @C2_bitsclri(i32 %a) {
+  %z = call i32 @llvm.hexagon.C2.bitsclri(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = bitsclr(r0, #0)
+
+declare i32 @llvm.hexagon.C4.nbitsclri(i32, i32)
+define i32 @C4_nbitsclri(i32 %a) {
+  %z = call i32 @llvm.hexagon.C4.nbitsclri(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = !bitsclr(r0, #0)
+
+declare i32 @llvm.hexagon.C2.bitsset(i32, i32)
+define i32 @C2_bitsset(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.C2.bitsset(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = bitsset(r0, r1)
+
+declare i32 @llvm.hexagon.C4.nbitsset(i32, i32)
+define i32 @C4_nbitsset(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.C4.nbitsset(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = !bitsset(r0, r1)
+
+declare i32 @llvm.hexagon.C2.bitsclr(i32, i32)
+define i32 @C2_bitsclr(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.C2.bitsclr(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = bitsclr(r0, r1)
+
+declare i32 @llvm.hexagon.C4.nbitsclr(i32, i32)
+define i32 @C4_nbitsclr(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.C4.nbitsclr(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = !bitsclr(r0, r1)
+
+; Mask generate from predicate
+declare i64 @llvm.hexagon.C2.mask(i32)
+define i64 @C2_mask(i32 %a) {
+  %z = call i64 @llvm.hexagon.C2.mask(i32 %a)
+  ret i64 %z
+}
+; CHECK:  = mask(p0)
+
+; Check for TLB match
+declare i32 @llvm.hexagon.A4.tlbmatch(i64, i32)
+define i32 @A4_tlbmatch(i64 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.A4.tlbmatch(i64 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = tlbmatch(r1:0, r2)
+
+; Test bit
+declare i32 @llvm.hexagon.S2.tstbit.i(i32, i32)
+define i32 @S2_tstbit_i(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.tstbit.i(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = tstbit(r0, #0)
+
+declare i32 @llvm.hexagon.S4.ntstbit.i(i32, i32)
+define i32 @S4_ntstbit_i(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.ntstbit.i(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = !tstbit(r0, #0)
+
+declare i32 @llvm.hexagon.S2.tstbit.r(i32, i32)
+define i32 @S2_tstbit_r(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.tstbit.r(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = tstbit(r0, r1)
+
+declare i32 @llvm.hexagon.S4.ntstbit.r(i32, i32)
+define i32 @S4_ntstbit_r(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S4.ntstbit.r(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: p0 = !tstbit(r0, r1)
+
+; Vector compare halfwords
+declare i32 @llvm.hexagon.A2.vcmpheq(i64, i64)
+define i32 @A2_vcmpheq(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.A2.vcmpheq(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = vcmph.eq(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.A2.vcmphgt(i64, i64)
+define i32 @A2_vcmphgt(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.A2.vcmphgt(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = vcmph.gt(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.A2.vcmphgtu(i64, i64)
+define i32 @A2_vcmphgtu(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.A2.vcmphgtu(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = vcmph.gtu(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.A4.vcmpheqi(i64, i32)
+define i32 @A4_vcmpheqi(i64 %a) {
+  %z = call i32 @llvm.hexagon.A4.vcmpheqi(i64 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = vcmph.eq(r1:0, #0)
+
+declare i32 @llvm.hexagon.A4.vcmphgti(i64, i32)
+define i32 @A4_vcmphgti(i64 %a) {
+  %z = call i32 @llvm.hexagon.A4.vcmphgti(i64 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = vcmph.gt(r1:0, #0)
+
+declare i32 @llvm.hexagon.A4.vcmphgtui(i64, i32)
+define i32 @A4_vcmphgtui(i64 %a) {
+  %z = call i32 @llvm.hexagon.A4.vcmphgtui(i64 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = vcmph.gtu(r1:0, #0)
+
+; Vector compare bytes for any match
+declare i32 @llvm.hexagon.A4.vcmpbeq.any(i64, i64)
+define i32 @A4_vcmpbeq_any(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.A4.vcmpbeq.any(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = any8(vcmpb.eq(r1:0, r3:2))
+
+; Vector compare bytes
+declare i32 @llvm.hexagon.A2.vcmpbeq(i64, i64)
+define i32 @A2_vcmpbeq(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.A2.vcmpbeq(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpb.eq(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.A2.vcmpbgtu(i64, i64)
+define i32 @A2_vcmpbgtu(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.A2.vcmpbgtu(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpb.gtu(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.A4.vcmpbgt(i64, i64)
+define i32 @A4_vcmpbgt(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.A4.vcmpbgt(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpb.gt(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.A4.vcmpbeqi(i64, i32)
+define i32 @A4_vcmpbeqi(i64 %a) {
+  %z = call i32 @llvm.hexagon.A4.vcmpbeqi(i64 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpb.eq(r1:0, #0)
+
+declare i32 @llvm.hexagon.A4.vcmpbgti(i64, i32)
+define i32 @A4_vcmpbgti(i64 %a) {
+  %z = call i32 @llvm.hexagon.A4.vcmpbgti(i64 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpb.gt(r1:0, #0)
+
+declare i32 @llvm.hexagon.A4.vcmpbgtui(i64, i32)
+define i32 @A4_vcmpbgtui(i64 %a) {
+  %z = call i32 @llvm.hexagon.A4.vcmpbgtui(i64 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpb.gtu(r1:0, #0)
+
+; Vector compare words
+declare i32 @llvm.hexagon.A2.vcmpweq(i64, i64)
+define i32 @A2_vcmpweq(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.A2.vcmpweq(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpw.eq(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.A2.vcmpwgt(i64, i64)
+define i32 @A2_vcmpwgt(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.A2.vcmpwgt(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpw.gt(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.A2.vcmpwgtu(i64, i64)
+define i32 @A2_vcmpwgtu(i64 %a, i64 %b) {
+  %z = call i32 @llvm.hexagon.A2.vcmpwgtu(i64 %a, i64 %b)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpw.gtu(r1:0, r3:2)
+
+declare i32 @llvm.hexagon.A4.vcmpweqi(i64, i32)
+define i32 @A4_vcmpweqi(i64 %a) {
+  %z = call i32 @llvm.hexagon.A4.vcmpweqi(i64 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpw.eq(r1:0, #0)
+
+declare i32 @llvm.hexagon.A4.vcmpwgti(i64, i32)
+define i32 @A4_vcmpwgti(i64 %a) {
+  %z = call i32 @llvm.hexagon.A4.vcmpwgti(i64 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpw.gt(r1:0, #0)
+
+declare i32 @llvm.hexagon.A4.vcmpwgtui(i64, i32)
+define i32 @A4_vcmpwgtui(i64 %a) {
+  %z = call i32 @llvm.hexagon.A4.vcmpwgtui(i64 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: p0 = vcmpw.gtu(r1:0, #0)
+
+; Viterbi pack even and odd predicate bitsclr
+declare i32 @llvm.hexagon.C2.vitpack(i32, i32)
+define i32 @C2_vitpack(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.C2.vitpack(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vitpack(p1, p0)
+
+; Vector mux
+declare i64 @llvm.hexagon.C2.vmux(i32, i64, i64)
+define i64 @C2_vmux(i32 %a, i64 %b, i64 %c) {
+  %z = call i64 @llvm.hexagon.C2.vmux(i32 %a, i64 %b, i64 %c)
+  ret i64 %z
+}
+; CHECK:  = vmux(p0, r3:2, r5:4)
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_shift.ll b/test/CodeGen/Hexagon/intrinsics/xtype_shift.ll
new file mode 100644
index 0000000..c84999b
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_shift.ll
@@ -0,0 +1,723 @@
+; RUN: llc -march=hexagon -O0 < %s | FileCheck %s
+; Hexagon Programmer's Reference Manual 11.10.8 XTYPE/SHIFT
+
+; Shift by immediate
+declare i64 @llvm.hexagon.S2.asr.i.p(i64, i32)
+define i64 @S2_asr_i_p(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.asr.i.p(i64 %a, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = asr(r1:0, #0)
+
+declare i64 @llvm.hexagon.S2.lsr.i.p(i64, i32)
+define i64 @S2_lsr_i_p(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.lsr.i.p(i64 %a, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = lsr(r1:0, #0)
+
+declare i64 @llvm.hexagon.S2.asl.i.p(i64, i32)
+define i64 @S2_asl_i_p(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.asl.i.p(i64 %a, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = asl(r1:0, #0)
+
+declare i32 @llvm.hexagon.S2.asr.i.r(i32, i32)
+define i32 @S2_asr_i_r(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.asr.i.r(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = asr(r0, #0)
+
+declare i32 @llvm.hexagon.S2.lsr.i.r(i32, i32)
+define i32 @S2_lsr_i_r(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.lsr.i.r(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = lsr(r0, #0)
+
+declare i32 @llvm.hexagon.S2.asl.i.r(i32, i32)
+define i32 @S2_asl_i_r(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.asl.i.r(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = asl(r0, #0)
+
+; Shift by immediate and accumulate
+declare i64 @llvm.hexagon.S2.asr.i.p.nac(i64, i64, i32)
+define i64 @S2_asr_i_p_nac(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.asr.i.p.nac(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 -= asr(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.lsr.i.p.nac(i64, i64, i32)
+define i64 @S2_lsr_i_p_nac(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.lsr.i.p.nac(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 -= lsr(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.asl.i.p.nac(i64, i64, i32)
+define i64 @S2_asl_i_p_nac(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.asl.i.p.nac(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 -= asl(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.asr.i.p.acc(i64, i64, i32)
+define i64 @S2_asr_i_p_acc(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.asr.i.p.acc(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 += asr(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.lsr.i.p.acc(i64, i64, i32)
+define i64 @S2_lsr_i_p_acc(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.lsr.i.p.acc(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 += lsr(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.asl.i.p.acc(i64, i64, i32)
+define i64 @S2_asl_i_p_acc(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.asl.i.p.acc(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 += asl(r3:2, #0)
+
+declare i32 @llvm.hexagon.S2.asr.i.r.nac(i32, i32, i32)
+define i32 @S2_asr_i_r_nac(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asr.i.r.nac(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 -= asr(r1, #0)
+
+declare i32 @llvm.hexagon.S2.lsr.i.r.nac(i32, i32, i32)
+define i32 @S2_lsr_i_r_nac(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.lsr.i.r.nac(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 -= lsr(r1, #0)
+
+declare i32 @llvm.hexagon.S2.asl.i.r.nac(i32, i32, i32)
+define i32 @S2_asl_i_r_nac(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asl.i.r.nac(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 -= asl(r1, #0)
+
+declare i32 @llvm.hexagon.S2.asr.i.r.acc(i32, i32, i32)
+define i32 @S2_asr_i_r_acc(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asr.i.r.acc(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 += asr(r1, #0)
+
+declare i32 @llvm.hexagon.S2.lsr.i.r.acc(i32, i32, i32)
+define i32 @S2_lsr_i_r_acc(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.lsr.i.r.acc(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 += lsr(r1, #0)
+
+declare i32 @llvm.hexagon.S2.asl.i.r.acc(i32, i32, i32)
+define i32 @S2_asl_i_r_acc(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asl.i.r.acc(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 += asl(r1, #0)
+
+; Shift by immediate and add
+declare i32 @llvm.hexagon.S4.addi.asl.ri(i32, i32, i32)
+define i32 @S4_addi_asl_ri(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.addi.asl.ri(i32 0, i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = add(#0, asl(r0, #0))
+
+declare i32 @llvm.hexagon.S4.subi.asl.ri(i32, i32, i32)
+define i32 @S4_subi_asl_ri(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.subi.asl.ri(i32 0, i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = sub(#0, asl(r0, #0))
+
+declare i32 @llvm.hexagon.S4.addi.lsr.ri(i32, i32, i32)
+define i32 @S4_addi_lsr_ri(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.addi.lsr.ri(i32 0, i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = add(#0, lsr(r0, #0))
+
+declare i32 @llvm.hexagon.S4.subi.lsr.ri(i32, i32, i32)
+define i32 @S4_subi_lsr_ri(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.subi.lsr.ri(i32 0, i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = sub(#0, lsr(r0, #0))
+
+declare i32 @llvm.hexagon.S2.addasl.rrri(i32, i32, i32)
+define i32 @S2_addasl_rrri(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.addasl.rrri(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = addasl(r0, r1, #0)
+
+; Shift by immediate and logical
+declare i64 @llvm.hexagon.S2.asr.i.p.and(i64, i64, i32)
+define i64 @S2_asr_i_p_and(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.asr.i.p.and(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 &= asr(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.lsr.i.p.and(i64, i64, i32)
+define i64 @S2_lsr_i_p_and(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.lsr.i.p.and(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 &= lsr(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.asl.i.p.and(i64, i64, i32)
+define i64 @S2_asl_i_p_and(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.asl.i.p.and(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 &= asl(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.asr.i.p.or(i64, i64, i32)
+define i64 @S2_asr_i_p_or(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.asr.i.p.or(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 |= asr(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.lsr.i.p.or(i64, i64, i32)
+define i64 @S2_lsr_i_p_or(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.lsr.i.p.or(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 |= lsr(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.asl.i.p.or(i64, i64, i32)
+define i64 @S2_asl_i_p_or(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.asl.i.p.or(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 |= asl(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.lsr.i.p.xacc(i64, i64, i32)
+define i64 @S2_lsr_i_p_xacc(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.lsr.i.p.xacc(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 ^= lsr(r3:2, #0)
+
+declare i64 @llvm.hexagon.S2.asl.i.p.xacc(i64, i64, i32)
+define i64 @S2_asl_i_p_xacc(i64 %a, i64 %b) {
+  %z = call i64 @llvm.hexagon.S2.asl.i.p.xacc(i64 %a, i64 %b, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 ^= asl(r3:2, #0)
+
+declare i32 @llvm.hexagon.S2.asr.i.r.and(i32, i32, i32)
+define i32 @S2_asr_i_r_and(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asr.i.r.and(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 &= asr(r1, #0)
+
+declare i32 @llvm.hexagon.S2.lsr.i.r.and(i32, i32, i32)
+define i32 @S2_lsr_i_r_and(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.lsr.i.r.and(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 &= lsr(r1, #0)
+
+declare i32 @llvm.hexagon.S2.asl.i.r.and(i32, i32, i32)
+define i32 @S2_asl_i_r_and(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asl.i.r.and(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 &= asl(r1, #0)
+
+declare i32 @llvm.hexagon.S2.asr.i.r.or(i32, i32, i32)
+define i32 @S2_asr_i_r_or(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asr.i.r.or(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 |= asr(r1, #0)
+
+declare i32 @llvm.hexagon.S2.lsr.i.r.or(i32, i32, i32)
+define i32 @S2_lsr_i_r_or(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.lsr.i.r.or(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 |= lsr(r1, #0)
+
+declare i32 @llvm.hexagon.S2.asl.i.r.or(i32, i32, i32)
+define i32 @S2_asl_i_r_or(i32%a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asl.i.r.or(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 |= asl(r1, #0)
+
+declare i32 @llvm.hexagon.S2.lsr.i.r.xacc(i32, i32, i32)
+define i32 @S2_lsr_i_r_xacc(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.lsr.i.r.xacc(i32%a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 ^= lsr(r1, #0)
+
+declare i32 @llvm.hexagon.S2.asl.i.r.xacc(i32, i32, i32)
+define i32 @S2_asl_i_r_xacc(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asl.i.r.xacc(i32 %a, i32 %b, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 ^= asl(r1, #0)
+
+declare i32 @llvm.hexagon.S4.andi.asl.ri(i32, i32, i32)
+define i32 @S4_andi_asl_ri(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.andi.asl.ri(i32 0, i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = and(#0, asl(r0, #0))
+
+declare i32 @llvm.hexagon.S4.ori.asl.ri(i32, i32, i32)
+define i32 @S4_ori_asl_ri(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.ori.asl.ri(i32 0, i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = or(#0, asl(r0, #0))
+
+declare i32 @llvm.hexagon.S4.andi.lsr.ri(i32, i32, i32)
+define i32 @S4_andi_lsr_ri(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.andi.lsr.ri(i32 0, i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = and(#0, lsr(r0, #0))
+
+declare i32 @llvm.hexagon.S4.ori.lsr.ri(i32, i32, i32)
+define i32 @S4_ori_lsr_ri(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.ori.lsr.ri(i32 0, i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = or(#0, lsr(r0, #0))
+
+; Shift right by immediate with rounding
+declare i64 @llvm.hexagon.S2.asr.i.p.rnd(i64, i32)
+define i64 @S2_asr_i_p_rnd(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.asr.i.p.rnd(i64 %a, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = asr(r1:0, #0):rnd
+
+declare i32 @llvm.hexagon.S2.asr.i.r.rnd(i32, i32)
+define i32 @S2_asr_i_r_rnd(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.asr.i.r.rnd(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = asr(r0, #0):rnd
+
+; Shift left by immediate with saturation
+declare i32 @llvm.hexagon.S2.asl.i.r.sat(i32, i32)
+define i32 @S2_asl_i_r_sat(i32 %a) {
+  %z = call i32 @llvm.hexagon.S2.asl.i.r.sat(i32 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = asl(r0, #0):sat
+
+; Shift by register
+declare i64 @llvm.hexagon.S2.asr.r.p(i64, i32)
+define i64 @S2_asr_r_p(i64 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S2.asr.r.p(i64 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = asr(r1:0, r2)
+
+declare i64 @llvm.hexagon.S2.lsr.r.p(i64, i32)
+define i64 @S2_lsr_r_p(i64 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S2.lsr.r.p(i64 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = lsr(r1:0, r2)
+
+declare i64 @llvm.hexagon.S2.asl.r.p(i64, i32)
+define i64 @S2_asl_r_p(i64 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S2.asl.r.p(i64 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = asl(r1:0, r2)
+
+declare i64 @llvm.hexagon.S2.lsl.r.p(i64, i32)
+define i64 @S2_lsl_r_p(i64 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S2.lsl.r.p(i64 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = lsl(r1:0, r2)
+
+declare i32 @llvm.hexagon.S2.asr.r.r(i32, i32)
+define i32 @S2_asr_r_r(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asr.r.r(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = asr(r0, r1)
+
+declare i32 @llvm.hexagon.S2.lsr.r.r(i32, i32)
+define i32 @S2_lsr_r_r(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.lsr.r.r(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = lsr(r0, r1)
+
+declare i32 @llvm.hexagon.S2.asl.r.r(i32, i32)
+define i32 @S2_asl_r_r(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asl.r.r(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = asl(r0, r1)
+
+declare i32 @llvm.hexagon.S2.lsl.r.r(i32, i32)
+define i32 @S2_lsl_r_r(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.lsl.r.r(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = lsl(r0, r1)
+
+declare i32 @llvm.hexagon.S4.lsli(i32, i32)
+define i32 @S4_lsli(i32 %a) {
+  %z = call i32 @llvm.hexagon.S4.lsli(i32 0, i32 %a)
+  ret i32 %z
+}
+; CHECK: r0 = lsl(#0, r0)
+
+; Shift by register and accumulate
+declare i64 @llvm.hexagon.S2.asr.r.p.nac(i64, i64, i32)
+define i64 @S2_asr_r_p_nac(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.asr.r.p.nac(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= asr(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.lsr.r.p.nac(i64, i64, i32)
+define i64 @S2_lsr_r_p_nac(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.lsr.r.p.nac(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= lsr(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.asl.r.p.nac(i64, i64, i32)
+define i64 @S2_asl_r_p_nac(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.asl.r.p.nac(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= asl(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.lsl.r.p.nac(i64, i64, i32)
+define i64 @S2_lsl_r_p_nac(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.lsl.r.p.nac(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 -= lsl(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.asr.r.p.acc(i64, i64, i32)
+define i64 @S2_asr_r_p_acc(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.asr.r.p.acc(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += asr(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.lsr.r.p.acc(i64, i64, i32)
+define i64 @S2_lsr_r_p_acc(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.lsr.r.p.acc(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += lsr(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.asl.r.p.acc(i64, i64, i32)
+define i64 @S2_asl_r_p_acc(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.asl.r.p.acc(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += asl(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.lsl.r.p.acc(i64, i64, i32)
+define i64 @S2_lsl_r_p_acc(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.lsl.r.p.acc(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 += lsl(r3:2, r4)
+
+declare i32 @llvm.hexagon.S2.asr.r.r.nac(i32, i32, i32)
+define i32 @S2_asr_r_r_nac(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.asr.r.r.nac(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= asr(r1, r2)
+
+declare i32 @llvm.hexagon.S2.lsr.r.r.nac(i32, i32, i32)
+define i32 @S2_lsr_r_r_nac(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.lsr.r.r.nac(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= lsr(r1, r2)
+
+declare i32 @llvm.hexagon.S2.asl.r.r.nac(i32, i32, i32)
+define i32 @S2_asl_r_r_nac(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.asl.r.r.nac(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= asl(r1, r2)
+
+declare i32 @llvm.hexagon.S2.lsl.r.r.nac(i32, i32, i32)
+define i32 @S2_lsl_r_r_nac(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.lsl.r.r.nac(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 -= lsl(r1, r2)
+
+declare i32 @llvm.hexagon.S2.asr.r.r.acc(i32, i32, i32)
+define i32 @S2_asr_r_r_acc(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.asr.r.r.acc(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += asr(r1, r2)
+
+declare i32 @llvm.hexagon.S2.lsr.r.r.acc(i32, i32, i32)
+define i32 @S2_lsr_r_r_acc(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.lsr.r.r.acc(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += lsr(r1, r2)
+
+declare i32 @llvm.hexagon.S2.asl.r.r.acc(i32, i32, i32)
+define i32 @S2_asl_r_r_acc(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.asl.r.r.acc(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += asl(r1, r2)
+
+declare i32 @llvm.hexagon.S2.lsl.r.r.acc(i32, i32, i32)
+define i32 @S2_lsl_r_r_acc(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.lsl.r.r.acc(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 += lsl(r1, r2)
+
+; Shift by register and logical
+declare i64 @llvm.hexagon.S2.asr.r.p.or(i64, i64, i32)
+define i64 @S2_asr_r_p_or(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.asr.r.p.or(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 |= asr(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.lsr.r.p.or(i64, i64, i32)
+define i64 @S2_lsr_r_p_or(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.lsr.r.p.or(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 |= lsr(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.asl.r.p.or(i64, i64, i32)
+define i64 @S2_asl_r_p_or(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.asl.r.p.or(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 |= asl(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.lsl.r.p.or(i64, i64, i32)
+define i64 @S2_lsl_r_p_or(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.lsl.r.p.or(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 |= lsl(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.asr.r.p.and(i64, i64, i32)
+define i64 @S2_asr_r_p_and(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.asr.r.p.and(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 &= asr(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.lsr.r.p.and(i64, i64, i32)
+define i64 @S2_lsr_r_p_and(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.lsr.r.p.and(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 &= lsr(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.asl.r.p.and(i64, i64, i32)
+define i64 @S2_asl_r_p_and(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.asl.r.p.and(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 &= asl(r3:2, r4)
+
+declare i64 @llvm.hexagon.S2.lsl.r.p.and(i64, i64, i32)
+define i64 @S2_lsl_r_p_and(i64 %a, i64 %b, i32 %c) {
+  %z = call i64 @llvm.hexagon.S2.lsl.r.p.and(i64 %a, i64 %b, i32 %c)
+  ret i64 %z
+}
+; CHECK: r1:0 &= lsl(r3:2, r4)
+
+declare i32 @llvm.hexagon.S2.asr.r.r.or(i32, i32, i32)
+define i32 @S2_asr_r_r_or(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.asr.r.r.or(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 |= asr(r1, r2)
+
+declare i32 @llvm.hexagon.S2.lsr.r.r.or(i32, i32, i32)
+define i32 @S2_lsr_r_r_or(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.lsr.r.r.or(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 |= lsr(r1, r2)
+
+declare i32 @llvm.hexagon.S2.asl.r.r.or(i32, i32, i32)
+define i32 @S2_asl_r_r_or(i32%a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.asl.r.r.or(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 |= asl(r1, r2)
+
+declare i32 @llvm.hexagon.S2.lsl.r.r.or(i32, i32, i32)
+define i32 @S2_lsl_r_r_or(i32%a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.lsl.r.r.or(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 |= lsl(r1, r2)
+
+declare i32 @llvm.hexagon.S2.asr.r.r.and(i32, i32, i32)
+define i32 @S2_asr_r_r_and(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.asr.r.r.and(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 &= asr(r1, r2)
+
+declare i32 @llvm.hexagon.S2.lsr.r.r.and(i32, i32, i32)
+define i32 @S2_lsr_r_r_and(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.lsr.r.r.and(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 &= lsr(r1, r2)
+
+declare i32 @llvm.hexagon.S2.asl.r.r.and(i32, i32, i32)
+define i32 @S2_asl_r_r_and(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.asl.r.r.and(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 &= asl(r1, r2)
+
+declare i32 @llvm.hexagon.S2.lsl.r.r.and(i32, i32, i32)
+define i32 @S2_lsl_r_r_and(i32 %a, i32 %b, i32 %c) {
+  %z = call i32 @llvm.hexagon.S2.lsl.r.r.and(i32 %a, i32 %b, i32 %c)
+  ret i32 %z
+}
+; CHECK: r0 &= lsl(r1, r2)
+
+; Shift by register with saturation
+declare i32 @llvm.hexagon.S2.asr.r.r.sat(i32, i32)
+define i32 @S2_asr_r_r_sat(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asr.r.r.sat(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = asr(r0, r1):sat
+
+declare i32 @llvm.hexagon.S2.asl.r.r.sat(i32, i32)
+define i32 @S2_asl_r_r_sat(i32 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asl.r.r.sat(i32 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = asl(r0, r1):sat
+
+; Vector shift halfwords by immediate
+declare i64 @llvm.hexagon.S2.asr.i.vh(i64, i32)
+define i64 @S2_asr_i_vh(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.asr.i.vh(i64 %a, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = vasrh(r1:0, #0)
+
+declare i64 @llvm.hexagon.S2.lsr.i.vh(i64, i32)
+define i64 @S2_lsr_i_vh(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.lsr.i.vh(i64 %a, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = vlsrh(r1:0, #0)
+
+declare i64 @llvm.hexagon.S2.asl.i.vh(i64, i32)
+define i64 @S2_asl_i_vh(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.asl.i.vh(i64 %a, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = vaslh(r1:0, #0)
+
+; Vector shift halfwords by register
+declare i64 @llvm.hexagon.S2.asr.r.vh(i64, i32)
+define i64 @S2_asr_r_vh(i64 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S2.asr.r.vh(i64 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vasrh(r1:0, r2)
+
+declare i64 @llvm.hexagon.S2.lsr.r.vh(i64, i32)
+define i64 @S2_lsr_r_vh(i64 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S2.lsr.r.vh(i64 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vlsrh(r1:0, r2)
+
+declare i64 @llvm.hexagon.S2.asl.r.vh(i64, i32)
+define i64 @S2_asl_r_vh(i64 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S2.asl.r.vh(i64 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vaslh(r1:0, r2)
+
+declare i64 @llvm.hexagon.S2.lsl.r.vh(i64, i32)
+define i64 @S2_lsl_r_vh(i64 %a, i32 %b) {
+  %z = call i64 @llvm.hexagon.S2.lsl.r.vh(i64 %a, i32 %b)
+  ret i64 %z
+}
+; CHECK: r1:0 = vlslh(r1:0, r2)
+
+; Vector shift words by immediate
+declare i64 @llvm.hexagon.S2.asr.i.vw(i64, i32)
+define i64 @S2_asr_i_vw(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.asr.i.vw(i64 %a, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = vasrw(r1:0, #0)
+
+declare i64 @llvm.hexagon.S2.lsr.i.vw(i64, i32)
+define i64 @S2_lsr_i_vw(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.lsr.i.vw(i64 %a, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = vlsrw(r1:0, #0)
+
+declare i64 @llvm.hexagon.S2.asl.i.vw(i64, i32)
+define i64 @S2_asl_i_vw(i64 %a) {
+  %z = call i64 @llvm.hexagon.S2.asl.i.vw(i64 %a, i32 0)
+  ret i64 %z
+}
+; CHECK: r1:0 = vaslw(r1:0, #0)
+
+; Vector shift words by with truncate and pack
+declare i32 @llvm.hexagon.S2.asr.i.svw.trun(i64, i32)
+define i32 @S2_asr_i_svw_trun(i64 %a) {
+  %z = call i32 @llvm.hexagon.S2.asr.i.svw.trun(i64 %a, i32 0)
+  ret i32 %z
+}
+; CHECK: r0 = vasrw(r1:0, #0)
+
+declare i32 @llvm.hexagon.S2.asr.r.svw.trun(i64, i32)
+define i32 @S2_asr_r_svw_trun(i64 %a, i32 %b) {
+  %z = call i32 @llvm.hexagon.S2.asr.r.svw.trun(i64 %a, i32 %b)
+  ret i32 %z
+}
+; CHECK: r0 = vasrw(r1:0, r2)
diff --git a/test/CodeGen/Hexagon/newvaluestore.ll b/test/CodeGen/Hexagon/newvaluestore.ll
index 186e393..93cf347 100644
--- a/test/CodeGen/Hexagon/newvaluestore.ll
+++ b/test/CodeGen/Hexagon/newvaluestore.ll
@@ -7,7 +7,7 @@
 
 define i32 @main() nounwind {
 entry:
-; CHECK: memw(r{{[0-9]+}} + #{{[0-9]+}}) = r{{[0-9]+}}.new
+; CHECK: memw(r{{[0-9]+}}+#{{[0-9]+}}) = r{{[0-9]+}}.new
   %number1 = alloca i32, align 4
   %number2 = alloca i32, align 4
   %number3 = alloca i32, align 4
diff --git a/test/CodeGen/Hexagon/pred-absolute-store.ll b/test/CodeGen/Hexagon/pred-absolute-store.ll
index b1b09f4..64635b1 100644
--- a/test/CodeGen/Hexagon/pred-absolute-store.ll
+++ b/test/CodeGen/Hexagon/pred-absolute-store.ll
@@ -2,7 +2,7 @@
 ; Check that we are able to predicate instructions with abosolute
 ; addressing mode.
 
-; CHECK: if{{ *}}(p{{[0-3]+}}){{ *}}memw(##gvar){{ *}}={{ *}}r{{[0-9]+}}
+; CHECK: if{{ *}}(p{{[0-3]+}}.new){{ *}}memw(##gvar){{ *}}={{ *}}r{{[0-9]+}}
 
 @gvar = external global i32
 define i32 @test2(i32 %a, i32 %b) nounwind {
diff --git a/test/CodeGen/Hexagon/struct_args_large.ll b/test/CodeGen/Hexagon/struct_args_large.ll
index f09fd10..db87d9e 100644
--- a/test/CodeGen/Hexagon/struct_args_large.ll
+++ b/test/CodeGen/Hexagon/struct_args_large.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
 ; CHECK: r[[T0:[0-9]+]] = CONST32(#s2)
-; CHECK: memw(r29 + #0) = r{{.}}
+; CHECK: memw(r29+#0) = r{{.}}
 ; CHECK: memw(r29+#8) = r{{.}}
 
 %struct.large = type { i64, i64 }
diff --git a/test/CodeGen/Inputs/DbgValueOtherTargets.ll b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
index 2d05b45..d21a4ee 100644
--- a/test/CodeGen/Inputs/DbgValueOtherTargets.ll
+++ b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
@@ -3,7 +3,7 @@
 define i32 @main() nounwind ssp {
 entry:
 ; CHECK: DEBUG_VALUE
-  call void @llvm.dbg.value(metadata !6, i64 0, metadata !7, metadata !{metadata !"0x102"}), !dbg !9
+  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !7, metadata !{!"0x102"}), !dbg !9
   ret i32 0, !dbg !10
 }
 
@@ -14,17 +14,17 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{metadata !"0x2e\00main\00main\00\002\000\001\000\006\000\000\000", metadata !12, metadata !1, metadata !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 120996)\000\00\000\00\000", metadata !12, metadata !6, metadata !6, metadata !11, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 0}
-!7 = metadata !{metadata !"0x100\00i\003\000", metadata !8, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!8 = metadata !{metadata !"0xb\002\0012\000", metadata !12, metadata !0} ; [ DW_TAG_lexical_block ]
-!9 = metadata !{i32 3, i32 11, metadata !8, null}
-!10 = metadata !{i32 4, i32 2, metadata !8, null}
-!11 = metadata !{metadata !0}
-!12 = metadata !{metadata !"/tmp/x.c", metadata !"/Users/manav"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00main\00main\00\002\000\001\000\006\000\000\000", !12, !1, !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !12} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.9 (trunk 120996)\000\00\000\00\000", !12, !6, !6, !11, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !12, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", !12, !2} ; [ DW_TAG_base_type ]
+!6 = !{i32 0}
+!7 = !{!"0x100\00i\003\000", !8, !1, !5} ; [ DW_TAG_auto_variable ]
+!8 = !{!"0xb\002\0012\000", !12, !0} ; [ DW_TAG_lexical_block ]
+!9 = !MDLocation(line: 3, column: 11, scope: !8)
+!10 = !MDLocation(line: 4, column: 2, scope: !8)
+!11 = !{!0}
+!12 = !{!"/tmp/x.c", !"/Users/manav"}
+!13 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/Mips/2008-08-01-AsmInline.ll b/test/CodeGen/Mips/2008-08-01-AsmInline.ll
index 3c1bb39..ae06ffe 100644
--- a/test/CodeGen/Mips/2008-08-01-AsmInline.ll
+++ b/test/CodeGen/Mips/2008-08-01-AsmInline.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 < %s | FileCheck %s
 
 %struct.DWstruct = type { i32, i32 }
 
diff --git a/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll b/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll
index c3791df..f736ddd 100644
--- a/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll
+++ b/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-O32
 ; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-O32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n32 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
 
 define float @h() nounwind readnone {
 entry:
diff --git a/test/CodeGen/Mips/Fast-ISel/callabi.ll b/test/CodeGen/Mips/Fast-ISel/callabi.ll
index 44b94bb..e76d7a7 100644
--- a/test/CodeGen/Mips/Fast-ISel/callabi.ll
+++ b/test/CodeGen/Mips/Fast-ISel/callabi.ll
@@ -474,4 +474,4 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.6.0 (gitosis@dmz-portal.mips.com:clang 43992fe7b17de5553ac06d323cb80cc6723a9ae3) (gitosis@dmz-portal.mips.com:llvm.git 0834e6839eb170197c81bb02e916258d1527e312)"}
+!0 = !{!"clang version 3.6.0 (gitosis@dmz-portal.mips.com:clang 43992fe7b17de5553ac06d323cb80cc6723a9ae3) (gitosis@dmz-portal.mips.com:llvm.git 0834e6839eb170197c81bb02e916258d1527e312)"}
diff --git a/test/CodeGen/Mips/Fast-ISel/overflt.ll b/test/CodeGen/Mips/Fast-ISel/overflt.ll
new file mode 100644
index 0000000..94abd2d
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/overflt.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@x = common global [128000 x float] zeroinitializer, align 4
+@y = global float* getelementptr inbounds ([128000 x float]* @x, i32 0, i32 0), align 4
+@result = common global float 0.000000e+00, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%f \0A\00", align 1
+
+; Function Attrs: nounwind
+define void @foo() {
+entry:
+; CHECK-LABEL:   .ent  foo
+  %0 = load float** @y, align 4
+  %arrayidx = getelementptr inbounds float* %0, i32 64000
+  store float 5.500000e+00, float* %arrayidx, align 4
+; CHECK:        lui     $[[REG_FPCONST_INT:[0-9]+]], 16560
+; CHECK:        mtc1    $[[REG_FPCONST_INT]], $f[[REG_FPCONST:[0-9]+]]
+; CHECK:        lw      $[[REG_Y_GOT:[0-9]+]], %got(y)(${{[0-9]+}})
+; CHECK:        lw      $[[REG_Y:[0-9]+]], 0($[[REG_Y_GOT]])
+; CHECK:        lui     $[[REG_IDX_UPPER:[0-9]+]], 3
+; CHECK:        ori     $[[REG_IDX:[0-9]+]], $[[REG_IDX_UPPER]], 59392
+; CHECK:        addu    $[[REG_Y_IDX:[0-9]+]], $[[REG_IDX]], $[[REG_Y]]
+; CHECK:        swc1    $f[[REG_FPCONST]], 0($[[REG_Y_IDX]])
+  ret void
+; CHECK-LABEL:   .end  foo
+}
+
+; Function Attrs: nounwind
+define void @goo() {
+entry:
+; CHECK-LABEL:   .ent  goo
+  %0 = load float** @y, align 4
+  %arrayidx = getelementptr inbounds float* %0, i32 64000
+  %1 = load float* %arrayidx, align 4
+  store float %1, float* @result, align 4
+; CHECK-DAG:    lw      $[[REG_RESULT:[0-9]+]], %got(result)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_Y_GOT:[0-9]+]], %got(y)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_Y:[0-9]+]], 0($[[REG_Y_GOT]])
+; CHECK-DAG:    lui     $[[REG_IDX_UPPER:[0-9]+]], 3
+; CHECK-DAG:    ori     $[[REG_IDX:[0-9]+]], $[[REG_IDX_UPPER]], 59392
+; CHECK-DAG:    addu    $[[REG_Y_IDX:[0-9]+]], $[[REG_IDX]], $[[REG_Y]]
+; CHECK-DAG:    lwc1    $f[[Y_IDX:[0-9]+]], 0($[[REG_Y_IDX]])
+; CHECK-DAG:    swc1    $f[[Y_IDX]], 0($[[REG_RESULT]])
+; CHECK-LABEL:   .end  goo
+  ret void
+}
+
+; 
+; Original C code for test.
+;
+;float x[128000];
+;float *y = x;
+;float result;
+
+
+;void foo() {
+;  y[64000] = 5.5;
+;}
+
+;void goo() {
+;  result = y[64000];
+;}
diff --git a/test/CodeGen/Mips/Fast-ISel/retabi.ll b/test/CodeGen/Mips/Fast-ISel/retabi.ll
new file mode 100644
index 0000000..d271aef
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/retabi.ll
@@ -0,0 +1,80 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+
+@i = global i32 75, align 4
+@s = global i16 -345, align 2
+@c = global i8 118, align 1
+@f = global float 0x40BE623360000000, align 4
+@d = global double 1.298330e+03, align 8
+
+; Function Attrs: nounwind
+define i32 @reti() {
+entry:
+; CHECK-LABEL: reti:
+  %0 = load i32* @i, align 4
+  ret i32 %0
+; CHECK:        lui     $[[REG_GPa:[0-9]+]], %hi(_gp_disp)
+; CHECK:        addiu   $[[REG_GPb:[0-9]+]], $[[REG_GPa]], %lo(_gp_disp)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], $[[REG_GPb]], $25
+; CHECK:        lw      $[[REG_I_ADDR:[0-9]+]], %got(i)($[[REG_GP]])
+; CHECK:        lw      $2, 0($[[REG_I_ADDR]])
+; CHECK:        jr      $ra
+}
+
+; Function Attrs: nounwind
+define signext i16 @rets() {
+entry:
+; CHECK-LABEL: rets:
+  %0 = load i16* @s, align 2
+  ret i16 %0
+; CHECK:        lui     $[[REG_GPa:[0-9]+]], %hi(_gp_disp)
+; CHECK:        addiu   $[[REG_GPb:[0-9]+]], $[[REG_GPa]], %lo(_gp_disp)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], $[[REG_GPb]], $25
+; CHECK:        lw      $[[REG_S_ADDR:[0-9]+]], %got(s)($[[REG_GP]])
+; CHECK:        lhu     $[[REG_S:[0-9]+]], 0($[[REG_S_ADDR]])
+; CHECK:        seh     $2, $[[REG_S]]
+; CHECK:        jr      $ra
+}
+
+; Function Attrs: nounwind
+define signext i8 @retc() {
+entry:
+; CHECK-LABEL: retc:
+  %0 = load i8* @c, align 1
+  ret i8 %0
+; CHECK:        lui     $[[REG_GPa:[0-9]+]], %hi(_gp_disp)
+; CHECK:        addiu   $[[REG_GPb:[0-9]+]], $[[REG_GPa]], %lo(_gp_disp)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], $[[REG_GPb]], $25
+; CHECK:        lw      $[[REG_C_ADDR:[0-9]+]], %got(c)($[[REG_GP]])
+; CHECK:        lbu     $[[REG_C:[0-9]+]], 0($[[REG_C_ADDR]])
+; CHECK:        seb     $2, $[[REG_C]]
+; CHECK:        jr      $ra
+}
+
+; Function Attrs: nounwind
+define float @retf() {
+entry:
+; CHECK-LABEL: retf:
+  %0 = load float* @f, align 4
+  ret float %0
+; CHECK:        lui     $[[REG_GPa:[0-9]+]], %hi(_gp_disp)
+; CHECK:        addiu   $[[REG_GPb:[0-9]+]], $[[REG_GPa]], %lo(_gp_disp)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], $[[REG_GPb]], $25
+; CHECK:        lw      $[[REG_F_ADDR:[0-9]+]], %got(f)($[[REG_GP]])
+; CHECK:        lwc1    $f0, 0($[[REG_F_ADDR]])
+; CHECK:        jr      $ra
+}
+
+; Function Attrs: nounwind
+define double @retd() {
+entry:
+; CHECK-LABEL: retd:
+  %0 = load double* @d, align 8
+  ret double %0
+; CHECK:        lui     $[[REG_GPa:[0-9]+]], %hi(_gp_disp)
+; CHECK:        addiu   $[[REG_GPb:[0-9]+]], $[[REG_GPa]], %lo(_gp_disp)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], $[[REG_GPb]], $25
+; CHECK:        lw      $[[REG_D_ADDR:[0-9]+]], %got(d)($[[REG_GP]])
+; CHECK:        ldc1    $f0, 0($[[REG_D_ADDR]])
+; CHECK:        jr      $ra
+}
diff --git a/test/CodeGen/Mips/abiflags32.ll b/test/CodeGen/Mips/abiflags32.ll
index e32d4a5..39e2a90 100644
--- a/test/CodeGen/Mips/abiflags32.ll
+++ b/test/CodeGen/Mips/abiflags32.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck %s
 ; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -mattr=fp64 %s -o - | FileCheck  -check-prefix=CHECK-64 %s
-; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips64 -mattr=-n64,n32 %s -o - | FileCheck  -check-prefix=CHECK-64n %s
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips64 -target-abi n32 %s -o - | FileCheck  -check-prefix=CHECK-64n %s
 
 ; CHECK: .nan    legacy
 ; We don't emit '.module fp=32' for compatibility with binutils 2.24 which
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index 78fd829..ccfeb00 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -1,14 +1,15 @@
-; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32   < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=NO-SEB-SEH  -check-prefix=CHECK-EL
-; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL
-; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL
-; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips4    < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=NO-SEB-SEH  -check-prefix=CHECK-EL
-; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64   < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=NO-SEB-SEH  -check-prefix=CHECK-EL
-; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL
-; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL
+; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32   < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=NO-SEB-SEH  -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
+; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
+; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips4    < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=NO-SEB-SEH  -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64   < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=NO-SEB-SEH  -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
+; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r2 -mattr=micromips < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL -check-prefix=MICROMIPS
 
 ; Keep one big-endian check so that we don't reduce testing, but don't add more
 ; since endianness doesn't affect the body of the atomic operations.
-; RUN: llc -march=mips   --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EB
+; RUN: llc -march=mips   --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EB -check-prefix=NOT-MICROMIPS
 
 @x = common global i32 0, align 4
 
@@ -26,7 +27,8 @@ entry:
 ; ALL:           ll      $[[R1:[0-9]+]], 0($[[R0]])
 ; ALL:           addu    $[[R2:[0-9]+]], $[[R1]], $4
 ; ALL:           sc      $[[R2]], 0($[[R0]])
-; ALL:           beqz    $[[R2]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R2]], $[[BB0]]
+; MICROMIPS:     beqzc   $[[R2]], $[[BB0]]
 }
 
 define i32 @AtomicLoadNand32(i32 signext %incr) nounwind {
@@ -44,7 +46,8 @@ entry:
 ; ALL:           and     $[[R3:[0-9]+]], $[[R1]], $4
 ; ALL:           nor     $[[R2:[0-9]+]], $zero, $[[R3]]
 ; ALL:           sc      $[[R2]], 0($[[R0]])
-; ALL:           beqz    $[[R2]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R2]], $[[BB0]]
+; MICROMIPS:     beqzc   $[[R2]], $[[BB0]]
 }
 
 define i32 @AtomicSwap32(i32 signext %newval) nounwind {
@@ -63,7 +66,8 @@ entry:
 ; ALL:       $[[BB0:[A-Z_0-9]+]]:
 ; ALL:           ll      ${{[0-9]+}}, 0($[[R0]])
 ; ALL:           sc      $[[R2:[0-9]+]], 0($[[R0]])
-; ALL:           beqz    $[[R2]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R2]], $[[BB0]]
+; MICROMIPS:     beqzc   $[[R2]], $[[BB0]]
 }
 
 define i32 @AtomicCmpSwap32(i32 signext %oldval, i32 signext %newval) nounwind {
@@ -84,7 +88,8 @@ entry:
 ; ALL:           ll      $2, 0($[[R0]])
 ; ALL:           bne     $2, $4, $[[BB1:[A-Z_0-9]+]]
 ; ALL:           sc      $[[R2:[0-9]+]], 0($[[R0]])
-; ALL:           beqz    $[[R2]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R2]], $[[BB0]]
+; MICROMIPS:     beqzc   $[[R2]], $[[BB0]]
 ; ALL:       $[[BB1]]:
 }
 
@@ -120,7 +125,8 @@ entry:
 ; ALL:           and     $[[R13:[0-9]+]], $[[R10]], $[[R8]]
 ; ALL:           or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
 ; ALL:           sc      $[[R14]], 0($[[R2]])
-; ALL:           beqz    $[[R14]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R14]], $[[BB0]]
+; MICROMIPS:     beqzc   $[[R14]], $[[BB0]]
 
 ; ALL:           and     $[[R15:[0-9]+]], $[[R10]], $[[R7]]
 ; ALL:           srlv    $[[R16:[0-9]+]], $[[R15]], $[[R5]]
@@ -159,7 +165,8 @@ entry:
 ; ALL:        and     $[[R13:[0-9]+]], $[[R10]], $[[R8]]
 ; ALL:        or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
 ; ALL:        sc      $[[R14]], 0($[[R2]])
-; ALL:        beqz    $[[R14]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R14]], $[[BB0]]
+; MICROMIPS:  beqzc   $[[R14]], $[[BB0]]
 
 ; ALL:        and     $[[R15:[0-9]+]], $[[R10]], $[[R7]]
 ; ALL:        srlv    $[[R16:[0-9]+]], $[[R15]], $[[R5]]
@@ -199,7 +206,8 @@ entry:
 ; ALL:           and     $[[R13:[0-9]+]], $[[R10]], $[[R8]]
 ; ALL:           or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
 ; ALL:           sc      $[[R14]], 0($[[R2]])
-; ALL:           beqz    $[[R14]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R14]], $[[BB0]]
+; MICROMIPS:     beqzc   $[[R14]], $[[BB0]]
 
 ; ALL:           and     $[[R15:[0-9]+]], $[[R10]], $[[R7]]
 ; ALL:           srlv    $[[R16:[0-9]+]], $[[R15]], $[[R5]]
@@ -237,7 +245,8 @@ entry:
 ; ALL:           and     $[[R13:[0-9]+]], $[[R10]], $[[R8]]
 ; ALL:           or      $[[R14:[0-9]+]], $[[R13]], $[[R18]]
 ; ALL:           sc      $[[R14]], 0($[[R2]])
-; ALL:           beqz    $[[R14]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R14]], $[[BB0]]
+; MICROMIPS:     beqzc   $[[R14]], $[[BB0]]
 
 ; ALL:           and     $[[R15:[0-9]+]], $[[R10]], $[[R7]]
 ; ALL:           srlv    $[[R16:[0-9]+]], $[[R15]], $[[R5]]
@@ -282,7 +291,8 @@ entry:
 ; ALL:           and     $[[R15:[0-9]+]], $[[R13]], $[[R8]]
 ; ALL:           or      $[[R16:[0-9]+]], $[[R15]], $[[R12]]
 ; ALL:           sc      $[[R16]], 0($[[R2]])
-; ALL:           beqz    $[[R16]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R16]], $[[BB0]]
+; MICROMIPS:     beqzc   $[[R16]], $[[BB0]]
 
 ; ALL:       $[[BB1]]:
 ; ALL:           srlv    $[[R17:[0-9]+]], $[[R14]], $[[R5]]
@@ -322,7 +332,8 @@ entry:
 ; ALL:           and     $[[R15:[0-9]+]], $[[R13]], $[[R8]]
 ; ALL:           or      $[[R16:[0-9]+]], $[[R15]], $[[R12]]
 ; ALL:           sc      $[[R16]], 0($[[R2]])
-; ALL:           beqz    $[[R16]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R16]], $[[BB0]]
+; MICROMIPS:     beqzc   $[[R16]], $[[BB0]]
 
 ; ALL:       $[[BB1]]:
 ; ALL:           srlv    $[[R17:[0-9]+]], $[[R14]], $[[R5]]
@@ -367,7 +378,8 @@ entry:
 ; ALL:           and     $[[R13:[0-9]+]], $[[R10]], $[[R8]]
 ; ALL:           or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
 ; ALL:           sc      $[[R14]], 0($[[R2]])
-; ALL:           beqz    $[[R14]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R14]], $[[BB0]]
+; MICROMIPS:     beqzc   $[[R14]], $[[BB0]]
 
 ; ALL:           and     $[[R15:[0-9]+]], $[[R10]], $[[R7]]
 ; ALL:           srlv    $[[R16:[0-9]+]], $[[R15]], $[[R5]]
@@ -430,5 +442,6 @@ entry:
 ; ALL:           ll      $[[R1:[0-9]+]], 0($[[PTR]])
 ; ALL:           addu    $[[R2:[0-9]+]], $[[R1]], $4
 ; ALL:           sc      $[[R2]], 0($[[PTR]])
-; ALL:           beqz    $[[R2]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R2]], $[[BB0]]
+; MICROMIPS:     beqzc   $[[R2]], $[[BB0]]
 }
diff --git a/test/CodeGen/Mips/blockaddr.ll b/test/CodeGen/Mips/blockaddr.ll
index d6dc7e7..f743637 100644
--- a/test/CodeGen/Mips/blockaddr.ll
+++ b/test/CodeGen/Mips/blockaddr.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-O32
 ; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-O32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n32 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
 ; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-MIPS16-1
 ; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-MIPS16-2
 
diff --git a/test/CodeGen/Mips/brsize3.ll b/test/CodeGen/Mips/brsize3.ll
index 7b1f440..3620868 100644
--- a/test/CodeGen/Mips/brsize3.ll
+++ b/test/CodeGen/Mips/brsize3.ll
@@ -30,4 +30,4 @@ x:                                                ; preds = %x, %entry
 attributes #0 = { noreturn nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
 attributes #1 = { nounwind }
 
-!1 = metadata !{i32 45}
+!1 = !{i32 45}
diff --git a/test/CodeGen/Mips/brsize3a.ll b/test/CodeGen/Mips/brsize3a.ll
index 6382fa2..f05e211 100644
--- a/test/CodeGen/Mips/brsize3a.ll
+++ b/test/CodeGen/Mips/brsize3a.ll
@@ -23,4 +23,4 @@ x:                                                ; preds = %x, %entry
 attributes #0 = { noreturn nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
 attributes #1 = { nounwind }
 
-!1 = metadata !{i32 45}
+!1 = !{i32 45}
diff --git a/test/CodeGen/Mips/cconv/arguments-float.ll b/test/CodeGen/Mips/cconv/arguments-float.ll
index 14a3baa..ee40d7f 100644
--- a/test/CodeGen/Mips/cconv/arguments-float.ll
+++ b/test/CodeGen/Mips/cconv/arguments-float.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=mips -relocation-model=static -soft-float < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
 ; RUN: llc -march=mipsel -relocation-model=static -soft-float < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
 
-; RUN-TODO: llc -march=mips64 -relocation-model=static -soft-float -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -relocation-model=static -soft-float -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -relocation-model=static -soft-float -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -soft-float -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 
-; RUN: llc -march=mips64 -relocation-model=static -soft-float -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -soft-float -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -soft-float -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -soft-float -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
 
-; RUN: llc -march=mips64 -relocation-model=static -soft-float -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -soft-float -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -soft-float -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -soft-float -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
 
 ; Test the floating point arguments for all ABI's and byte orders as specified
 ; by section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/arguments-fp128.ll b/test/CodeGen/Mips/cconv/arguments-fp128.ll
index c8cd8fd..1666974 100644
--- a/test/CodeGen/Mips/cconv/arguments-fp128.ll
+++ b/test/CodeGen/Mips/cconv/arguments-fp128.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=mips64 -relocation-model=static -soft-float -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
-; RUN: llc -march=mips64el -relocation-model=static -soft-float -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
+; RUN: llc -march=mips64 -relocation-model=static -soft-float -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
+; RUN: llc -march=mips64el -relocation-model=static -soft-float -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
 
-; RUN: llc -march=mips64 -relocation-model=static -soft-float -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
-; RUN: llc -march=mips64el -relocation-model=static -soft-float -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
+; RUN: llc -march=mips64 -relocation-model=static -soft-float -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
+; RUN: llc -march=mips64el -relocation-model=static -soft-float -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
 
 ; Test the fp128 arguments for all ABI's and byte orders as specified
 ; by section 2 of the MIPSpro N32 Handbook.
diff --git a/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll b/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
index 70ccf14..380bd5c 100644
--- a/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
+++ b/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
 ; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
 
-; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW --check-prefix=NEWBE %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW --check-prefix=NEWLE %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW --check-prefix=NEWBE %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW --check-prefix=NEWLE %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW --check-prefix=NEWBE %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW --check-prefix=NEWLE %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW --check-prefix=NEWBE %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW --check-prefix=NEWLE %s
 
 ; Test the effect of varargs on floating point types in the non-variable part
 ; of the argument list as specified by section 2 of the MIPSpro N32 Handbook.
diff --git a/test/CodeGen/Mips/cconv/arguments-hard-float.ll b/test/CodeGen/Mips/cconv/arguments-hard-float.ll
index 9837f7e..3221e23 100644
--- a/test/CodeGen/Mips/cconv/arguments-hard-float.ll
+++ b/test/CodeGen/Mips/cconv/arguments-hard-float.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
 ; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
 
-; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
 
 ; Test the floating point arguments for all ABI's and byte orders as specified
 ; by section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/arguments-hard-fp128.ll b/test/CodeGen/Mips/cconv/arguments-hard-fp128.ll
index 5e3f403..583759a 100644
--- a/test/CodeGen/Mips/cconv/arguments-hard-fp128.ll
+++ b/test/CodeGen/Mips/cconv/arguments-hard-fp128.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
 
 ; Test the fp128 arguments for all ABI's and byte orders as specified
 ; by section 2 of the MIPSpro N32 Handbook.
diff --git a/test/CodeGen/Mips/cconv/arguments-struct.ll b/test/CodeGen/Mips/cconv/arguments-struct.ll
new file mode 100644
index 0000000..7ff894f
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments-struct.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=mips-unknown-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32-BE %s
+; RUN: llc -mtriple=mipsel-unknown-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32-LE %s
+
+; RUN-TODO: llc -mtriple=mips64-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32-BE %s
+; RUN-TODO: llc -mtriple=mips64el-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32-LE %s
+
+; RUN: llc -mtriple=mips64-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW-BE %s
+; RUN: llc -mtriple=mips64el-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW-LE %s
+
+; RUN: llc -mtriple=mips64-unknown-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW-BE %s
+; RUN: llc -mtriple=mips64el-unknown-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW-LE %s
+
+; Test small structures for all ABI's and byte orders.
+;
+; N32/N64 are identical in this area so their checks have been combined into
+; the 'NEW' prefix (the N stands for New).
+
+@bytes = global [2 x i8] zeroinitializer
+
+define void @s_i8(i8 inreg %a) nounwind {
+entry:
+	store i8 %a, i8* getelementptr inbounds ([2 x i8]* @bytes, i32 0, i32 1)
+        ret void
+}
+
+; ALL-LABEL: s_i8:
+
+; SYM32-DAG:   lui   [[PTR_HI:\$[0-9]+]], %hi(bytes)
+; SYM32-DAG:   addiu [[PTR:\$[0-9]+]], [[PTR_HI]], %lo(bytes)
+
+; SYM64-DAG:   ld    [[PTR:\$[0-9]+]], %got_disp(bytes)(
+
+; O32-BE-DAG:  srl [[ARG:\$[0-9]+]], $4, 24
+; O32-BE-DAG:  sb  [[ARG]], 1([[PTR]])
+
+; O32-LE-DAG:  sb  $4, 1([[PTR]])
+
+; NEW-BE-DAG:  dsrl [[ARG:\$[0-9]+]], $4, 56
+; NEW-BE-DAG:  sb   [[ARG]], 1([[PTR]])
+
+; NEW-LE-DAG:  sb   $4, 1([[PTR]])
diff --git a/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-byte.ll b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-byte.ll
new file mode 100644
index 0000000..458b124
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-byte.ll
@@ -0,0 +1,282 @@
+; RUN: llc --march=mips64 -mcpu=mips64r2 < %s | FileCheck %s
+
+; Generated from the C program:
+; 
+; #include <stdio.h>
+; #include <string.h>
+; 
+; struct SmallStruct_1b {
+;  char x1;
+; };
+; 
+; struct SmallStruct_2b {
+;  char x1;
+;  char x2;
+; };
+; 
+; struct SmallStruct_3b {
+;  char x1;
+;  char x2;
+;  char x3;
+; };
+; 
+; struct SmallStruct_4b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+; };
+; 
+; struct SmallStruct_5b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+;  char x5;
+; };
+; 
+; struct SmallStruct_6b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+;  char x5;
+;  char x6;
+; };
+; 
+; struct SmallStruct_7b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+;  char x5;
+;  char x6;
+;  char x7;
+; };
+; 
+; struct SmallStruct_8b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+;  char x5;
+;  char x6;
+;  char x7;
+;  char x8;
+; };
+; 
+; struct SmallStruct_9b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+;  char x5;
+;  char x6;
+;  char x7;
+;  char x8;
+;  char x9;
+; };
+; 
+; void varArgF_SmallStruct(char* c, ...);
+; 
+; void smallStruct_1b(struct SmallStruct_1b* ss) {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_2b(struct SmallStruct_2b* ss) {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_3b(struct SmallStruct_3b* ss)
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_4b(struct SmallStruct_4b* ss)
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_5b(struct SmallStruct_5b* ss) 
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_6b(struct SmallStruct_6b* ss) 
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_7b(struct SmallStruct_7b* ss) 
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_8b(struct SmallStruct_8b* ss) 
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_9b(struct SmallStruct_9b* ss) 
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+
+%struct.SmallStruct_1b = type { i8 }
+%struct.SmallStruct_2b = type { i8, i8 }
+%struct.SmallStruct_3b = type { i8, i8, i8 }
+%struct.SmallStruct_4b = type { i8, i8, i8, i8 }
+%struct.SmallStruct_5b = type { i8, i8, i8, i8, i8 }
+%struct.SmallStruct_6b = type { i8, i8, i8, i8, i8, i8 }
+%struct.SmallStruct_7b = type { i8, i8, i8, i8, i8, i8, i8 }
+%struct.SmallStruct_8b = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+%struct.SmallStruct_9b = type { i8, i8, i8, i8, i8, i8, i8, i8, i8 }
+
+@.str = private unnamed_addr constant [3 x i8] c"01\00", align 1
+
+declare void @varArgF_SmallStruct(i8* %c, ...) 
+
+define void @smallStruct_1b(%struct.SmallStruct_1b* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_1b*, align 8
+  store %struct.SmallStruct_1b* %ss, %struct.SmallStruct_1b** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_1b** %ss.addr, align 8
+  %1 = bitcast %struct.SmallStruct_1b* %0 to { i8 }*
+  %2 = getelementptr { i8 }* %1, i32 0, i32 0
+  %3 = load i8* %2, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i8 inreg %3)
+  ret void
+ ; CHECK-LABEL: smallStruct_1b: 
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
+}
+
+define void @smallStruct_2b(%struct.SmallStruct_2b* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_2b*, align 8
+  store %struct.SmallStruct_2b* %ss, %struct.SmallStruct_2b** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_2b** %ss.addr, align 8
+  %1 = bitcast %struct.SmallStruct_2b* %0 to { i16 }*
+  %2 = getelementptr { i16 }* %1, i32 0, i32 0
+  %3 = load i16* %2, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i16 inreg %3)
+  ret void
+ ; CHECK-LABEL: smallStruct_2b:
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 48
+}
+
+define void @smallStruct_3b(%struct.SmallStruct_3b* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_3b*, align 8
+  %.coerce = alloca { i24 }
+  store %struct.SmallStruct_3b* %ss, %struct.SmallStruct_3b** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_3b** %ss.addr, align 8
+  %1 = bitcast { i24 }* %.coerce to i8*
+  %2 = bitcast %struct.SmallStruct_3b* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 3, i32 0, i1 false)
+  %3 = getelementptr { i24 }* %.coerce, i32 0, i32 0
+  %4 = load i24* %3, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i24 inreg %4)
+  ret void
+ ; CHECK-LABEL: smallStruct_3b:
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 40
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+
+define void @smallStruct_4b(%struct.SmallStruct_4b* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_4b*, align 8
+  store %struct.SmallStruct_4b* %ss, %struct.SmallStruct_4b** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_4b** %ss.addr, align 8
+  %1 = bitcast %struct.SmallStruct_4b* %0 to { i32 }*
+  %2 = getelementptr { i32 }* %1, i32 0, i32 0
+  %3 = load i32* %2, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 inreg %3)
+  ret void
+ ; CHECK-LABEL: smallStruct_4b:
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 32
+}
+
+define void @smallStruct_5b(%struct.SmallStruct_5b* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_5b*, align 8
+  %.coerce = alloca { i40 }
+  store %struct.SmallStruct_5b* %ss, %struct.SmallStruct_5b** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_5b** %ss.addr, align 8
+  %1 = bitcast { i40 }* %.coerce to i8*
+  %2 = bitcast %struct.SmallStruct_5b* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 5, i32 0, i1 false)
+  %3 = getelementptr { i40 }* %.coerce, i32 0, i32 0
+  %4 = load i40* %3, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i40 inreg %4)
+  ret void
+ ; CHECK-LABEL: smallStruct_5b:
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 24
+}
+
+define void @smallStruct_6b(%struct.SmallStruct_6b* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_6b*, align 8
+  %.coerce = alloca { i48 }
+  store %struct.SmallStruct_6b* %ss, %struct.SmallStruct_6b** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_6b** %ss.addr, align 8
+  %1 = bitcast { i48 }* %.coerce to i8*
+  %2 = bitcast %struct.SmallStruct_6b* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 6, i32 0, i1 false)
+  %3 = getelementptr { i48 }* %.coerce, i32 0, i32 0
+  %4 = load i48* %3, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i48 inreg %4)
+  ret void
+ ; CHECK-LABEL: smallStruct_6b:
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 16
+}
+
+define void @smallStruct_7b(%struct.SmallStruct_7b* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_7b*, align 8
+  %.coerce = alloca { i56 }
+  store %struct.SmallStruct_7b* %ss, %struct.SmallStruct_7b** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_7b** %ss.addr, align 8
+  %1 = bitcast { i56 }* %.coerce to i8*
+  %2 = bitcast %struct.SmallStruct_7b* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 7, i32 0, i1 false)
+  %3 = getelementptr { i56 }* %.coerce, i32 0, i32 0
+  %4 = load i56* %3, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i56 inreg %4)
+  ret void
+ ; CHECK-LABEL: smallStruct_7b:
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 8
+}
+
+define void @smallStruct_8b(%struct.SmallStruct_8b* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_8b*, align 8
+  store %struct.SmallStruct_8b* %ss, %struct.SmallStruct_8b** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_8b** %ss.addr, align 8
+  %1 = bitcast %struct.SmallStruct_8b* %0 to { i64 }*
+  %2 = getelementptr { i64 }* %1, i32 0, i32 0
+  %3 = load i64* %2, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i64 inreg %3)
+  ret void
+ ; CHECK-LABEL: smallStruct_8b:
+ ; CHECK-NOT: dsll
+}
+
+define void @smallStruct_9b(%struct.SmallStruct_9b* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_9b*, align 8
+  %.coerce = alloca { i64, i8 }
+  store %struct.SmallStruct_9b* %ss, %struct.SmallStruct_9b** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_9b** %ss.addr, align 8
+  %1 = bitcast { i64, i8 }* %.coerce to i8*
+  %2 = bitcast %struct.SmallStruct_9b* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 9, i32 0, i1 false)
+  %3 = getelementptr { i64, i8 }* %.coerce, i32 0, i32 0
+  %4 = load i64* %3, align 1
+  %5 = getelementptr { i64, i8 }* %.coerce, i32 0, i32 1
+  %6 = load i8* %5, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i64 inreg %4, i8 inreg %6)
+  ret void
+ ; CHECK-LABEL: smallStruct_9b:
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
+}
diff --git a/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-combinations.ll b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-combinations.ll
new file mode 100644
index 0000000..899a3e8
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-combinations.ll
@@ -0,0 +1,149 @@
+; RUN: llc --march=mips64 -mcpu=mips64r2 < %s | FileCheck %s
+
+; Generated from the C program:
+;
+; #include <stdio.h>
+; #include <string.h>
+; 
+; struct SmallStruct_1b1s {
+;  char x1;
+;  short x2;
+; };
+; 
+; struct SmallStruct_1b1i {
+;  char x1;
+;  int x2;
+; };
+; 
+; struct SmallStruct_1b1s1b {
+;  char x1;
+;  short x2;
+;  char x3;
+; };
+; 
+; struct SmallStruct_1s1i {
+;  short x1;
+;  int x2;
+; };
+; 
+; struct SmallStruct_3b1s {
+;  char x1;
+;  char x2;
+;  char x3;
+;  short x4;
+; };
+; 
+; void varArgF_SmallStruct(char* c, ...);
+; 
+; void smallStruct_1b1s(struct SmallStruct_1b1s* ss)
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_1b1i(struct SmallStruct_1b1i* ss)
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_1b1s1b(struct SmallStruct_1b1s1b* ss)
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_1s1i(struct SmallStruct_1s1i* ss)
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+; 
+; void smallStruct_3b1s(struct SmallStruct_3b1s* ss)
+; {
+;  varArgF_SmallStruct("", *ss);
+; }
+
+%struct.SmallStruct_1b1s = type { i8, i16 }
+%struct.SmallStruct_1b1i = type { i8, i32 }
+%struct.SmallStruct_1b1s1b = type { i8, i16, i8 }
+%struct.SmallStruct_1s1i = type { i16, i32 }
+%struct.SmallStruct_3b1s = type { i8, i8, i8, i16 }
+
+@.str = private unnamed_addr constant [3 x i8] c"01\00", align 1
+
+declare void @varArgF_SmallStruct(i8* %c, ...) 
+
+define void @smallStruct_1b1s(%struct.SmallStruct_1b1s* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_1b1s*, align 8
+  store %struct.SmallStruct_1b1s* %ss, %struct.SmallStruct_1b1s** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_1b1s** %ss.addr, align 8
+  %1 = bitcast %struct.SmallStruct_1b1s* %0 to { i32 }*
+  %2 = getelementptr { i32 }* %1, i32 0, i32 0
+  %3 = load i32* %2, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 inreg %3)
+  ret void
+ ; CHECK-LABEL: smallStruct_1b1s:
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 32
+}
+
+define void @smallStruct_1b1i(%struct.SmallStruct_1b1i* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_1b1i*, align 8
+  store %struct.SmallStruct_1b1i* %ss, %struct.SmallStruct_1b1i** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_1b1i** %ss.addr, align 8
+  %1 = bitcast %struct.SmallStruct_1b1i* %0 to { i64 }*
+  %2 = getelementptr { i64 }* %1, i32 0, i32 0
+  %3 = load i64* %2, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i64 inreg %3)
+  ret void
+ ; CHECK-LABEL: smallStruct_1b1i:
+ ; CHECK-NOT: dsll
+}
+
+define void @smallStruct_1b1s1b(%struct.SmallStruct_1b1s1b* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_1b1s1b*, align 8
+  %.coerce = alloca { i48 }
+  store %struct.SmallStruct_1b1s1b* %ss, %struct.SmallStruct_1b1s1b** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_1b1s1b** %ss.addr, align 8
+  %1 = bitcast { i48 }* %.coerce to i8*
+  %2 = bitcast %struct.SmallStruct_1b1s1b* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 6, i32 0, i1 false)
+  %3 = getelementptr { i48 }* %.coerce, i32 0, i32 0
+  %4 = load i48* %3, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i48 inreg %4)
+  ret void
+ ; CHECK-LABEL: smallStruct_1b1s1b:
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 16
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+
+define void @smallStruct_1s1i(%struct.SmallStruct_1s1i* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_1s1i*, align 8
+  store %struct.SmallStruct_1s1i* %ss, %struct.SmallStruct_1s1i** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_1s1i** %ss.addr, align 8
+  %1 = bitcast %struct.SmallStruct_1s1i* %0 to { i64 }*
+  %2 = getelementptr { i64 }* %1, i32 0, i32 0
+  %3 = load i64* %2, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i64 inreg %3)
+  ret void
+ ; CHECK-LABEL: smallStruct_1s1i:
+ ; CHECK-NOT: dsll
+}
+
+define void @smallStruct_3b1s(%struct.SmallStruct_3b1s* %ss) #0 {
+entry:
+  %ss.addr = alloca %struct.SmallStruct_3b1s*, align 8
+  %.coerce = alloca { i48 }
+  store %struct.SmallStruct_3b1s* %ss, %struct.SmallStruct_3b1s** %ss.addr, align 8
+  %0 = load %struct.SmallStruct_3b1s** %ss.addr, align 8
+  %1 = bitcast { i48 }* %.coerce to i8*
+  %2 = bitcast %struct.SmallStruct_3b1s* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 6, i32 0, i1 false)
+  %3 = getelementptr { i48 }* %.coerce, i32 0, i32 0
+  %4 = load i48* %3, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i48 inreg %4)
+  ret void
+ ; CHECK-LABEL: smallStruct_3b1s:
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 16
+}
diff --git a/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-multiple-args.ll b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-multiple-args.ll
new file mode 100644
index 0000000..1f73625
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments-varargs-small-structs-multiple-args.ll
@@ -0,0 +1,161 @@
+; RUN: llc --march=mips64 -mcpu=mips64r2 < %s | FileCheck %s
+
+; Generated from the C program:
+;  
+; #include <stdio.h>
+; #include <string.h>
+; 
+; struct SmallStruct_1b {
+;  char x1;
+; };
+; 
+; struct SmallStruct_2b {
+;  char x1;
+;  char x2;
+; };
+; 
+; struct SmallStruct_3b {
+;  char x1;
+;  char x2;
+;  char x3;
+; };
+; 
+; struct SmallStruct_4b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+; };
+; 
+; struct SmallStruct_5b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+;  char x5;
+; };
+; 
+; struct SmallStruct_6b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+;  char x5;
+;  char x6;
+; };
+; 
+; struct SmallStruct_7b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+;  char x5;
+;  char x6;
+;  char x7;
+; };
+; 
+; struct SmallStruct_8b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+;  char x5;
+;  char x6;
+;  char x7;
+;  char x8;
+; };
+; 
+; struct SmallStruct_9b {
+;  char x1;
+;  char x2;
+;  char x3;
+;  char x4;
+;  char x5;
+;  char x6;
+;  char x7;
+;  char x8;
+;  char x9;
+; };
+; 
+; void varArgF_SmallStruct(char* c, ...);
+; 
+; void smallStruct_1b_x9(struct SmallStruct_1b* ss1,  struct SmallStruct_1b* ss2, struct SmallStruct_1b* ss3, struct SmallStruct_1b* ss4, struct SmallStruct_1b* ss5, struct SmallStruct_1b* ss6, struct SmallStruct_1b* ss7, struct SmallStruct_1b* ss8, struct SmallStruct_1b* ss9)
+; {
+;  varArgF_SmallStruct("", *ss1, *ss2, *ss3, *ss4, *ss5, *ss6, *ss7, *ss8, *ss9);
+; }
+
+%struct.SmallStruct_1b = type { i8 }
+
+@.str = private unnamed_addr constant [3 x i8] c"01\00", align 1
+
+declare void @varArgF_SmallStruct(i8* %c, ...) 
+
+define void @smallStruct_1b_x9(%struct.SmallStruct_1b* %ss1, %struct.SmallStruct_1b* %ss2, %struct.SmallStruct_1b* %ss3, %struct.SmallStruct_1b* %ss4, %struct.SmallStruct_1b* %ss5, %struct.SmallStruct_1b* %ss6, %struct.SmallStruct_1b* %ss7, %struct.SmallStruct_1b* %ss8, %struct.SmallStruct_1b* %ss9) #0 {
+entry:
+  %ss1.addr = alloca %struct.SmallStruct_1b*, align 8
+  %ss2.addr = alloca %struct.SmallStruct_1b*, align 8
+  %ss3.addr = alloca %struct.SmallStruct_1b*, align 8
+  %ss4.addr = alloca %struct.SmallStruct_1b*, align 8
+  %ss5.addr = alloca %struct.SmallStruct_1b*, align 8
+  %ss6.addr = alloca %struct.SmallStruct_1b*, align 8
+  %ss7.addr = alloca %struct.SmallStruct_1b*, align 8
+  %ss8.addr = alloca %struct.SmallStruct_1b*, align 8
+  %ss9.addr = alloca %struct.SmallStruct_1b*, align 8
+  store %struct.SmallStruct_1b* %ss1, %struct.SmallStruct_1b** %ss1.addr, align 8
+  store %struct.SmallStruct_1b* %ss2, %struct.SmallStruct_1b** %ss2.addr, align 8
+  store %struct.SmallStruct_1b* %ss3, %struct.SmallStruct_1b** %ss3.addr, align 8
+  store %struct.SmallStruct_1b* %ss4, %struct.SmallStruct_1b** %ss4.addr, align 8
+  store %struct.SmallStruct_1b* %ss5, %struct.SmallStruct_1b** %ss5.addr, align 8
+  store %struct.SmallStruct_1b* %ss6, %struct.SmallStruct_1b** %ss6.addr, align 8
+  store %struct.SmallStruct_1b* %ss7, %struct.SmallStruct_1b** %ss7.addr, align 8
+  store %struct.SmallStruct_1b* %ss8, %struct.SmallStruct_1b** %ss8.addr, align 8
+  store %struct.SmallStruct_1b* %ss9, %struct.SmallStruct_1b** %ss9.addr, align 8
+  %0 = load %struct.SmallStruct_1b** %ss1.addr, align 8
+  %1 = load %struct.SmallStruct_1b** %ss2.addr, align 8
+  %2 = load %struct.SmallStruct_1b** %ss3.addr, align 8
+  %3 = load %struct.SmallStruct_1b** %ss4.addr, align 8
+  %4 = load %struct.SmallStruct_1b** %ss5.addr, align 8
+  %5 = load %struct.SmallStruct_1b** %ss6.addr, align 8
+  %6 = load %struct.SmallStruct_1b** %ss7.addr, align 8
+  %7 = load %struct.SmallStruct_1b** %ss8.addr, align 8
+  %8 = load %struct.SmallStruct_1b** %ss9.addr, align 8
+  %9 = bitcast %struct.SmallStruct_1b* %0 to { i8 }*
+  %10 = getelementptr { i8 }* %9, i32 0, i32 0
+  %11 = load i8* %10, align 1
+  %12 = bitcast %struct.SmallStruct_1b* %1 to { i8 }*
+  %13 = getelementptr { i8 }* %12, i32 0, i32 0
+  %14 = load i8* %13, align 1
+  %15 = bitcast %struct.SmallStruct_1b* %2 to { i8 }*
+  %16 = getelementptr { i8 }* %15, i32 0, i32 0
+  %17 = load i8* %16, align 1
+  %18 = bitcast %struct.SmallStruct_1b* %3 to { i8 }*
+  %19 = getelementptr { i8 }* %18, i32 0, i32 0
+  %20 = load i8* %19, align 1
+  %21 = bitcast %struct.SmallStruct_1b* %4 to { i8 }*
+  %22 = getelementptr { i8 }* %21, i32 0, i32 0
+  %23 = load i8* %22, align 1
+  %24 = bitcast %struct.SmallStruct_1b* %5 to { i8 }*
+  %25 = getelementptr { i8 }* %24, i32 0, i32 0
+  %26 = load i8* %25, align 1
+  %27 = bitcast %struct.SmallStruct_1b* %6 to { i8 }*
+  %28 = getelementptr { i8 }* %27, i32 0, i32 0
+  %29 = load i8* %28, align 1
+  %30 = bitcast %struct.SmallStruct_1b* %7 to { i8 }*
+  %31 = getelementptr { i8 }* %30, i32 0, i32 0
+  %32 = load i8* %31, align 1
+  %33 = bitcast %struct.SmallStruct_1b* %8 to { i8 }*
+  %34 = getelementptr { i8 }* %33, i32 0, i32 0
+  %35 = load i8* %34, align 1
+  call void (i8*, ...)* @varArgF_SmallStruct(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i8 inreg %11, i8 inreg %14, i8 inreg %17, i8 inreg %20, i8 inreg %23, i8 inreg %26, i8 inreg %29, i8 inreg %32, i8 inreg %35)
+  ret void
+ ; CHECK-LABEL: smallStruct_1b_x9:
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
+ ; CHECK: dsll $[[R1:[0-9]+]], $[[R2:[0-9]+]], 56
+}
diff --git a/test/CodeGen/Mips/cconv/arguments-varargs.ll b/test/CodeGen/Mips/cconv/arguments-varargs.ll
index adacda5..6e6f48b 100644
--- a/test/CodeGen/Mips/cconv/arguments-varargs.ll
+++ b/test/CodeGen/Mips/cconv/arguments-varargs.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -mtriple=mips-linux -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-BE %s
 ; RUN: llc -mtriple=mipsel-linux -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-LE %s
 
-; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN: llc -mtriple=mips64-linux -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N32 --check-prefix=NEW-BE %s
-; RUN: llc -mtriple=mips64el-linux -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N32 --check-prefix=NEW-LE %s
+; RUN: llc -mtriple=mips64-linux -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N32 --check-prefix=NEW-BE %s
+; RUN: llc -mtriple=mips64el-linux -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N32 --check-prefix=NEW-LE %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N64 --check-prefix=NEW-BE %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N64 --check-prefix=NEW-LE %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N64 --check-prefix=NEW-BE %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N64 --check-prefix=NEW-LE %s
 
 @hwords = global [3 x i16] zeroinitializer, align 1
 @words  = global [3 x i32] zeroinitializer, align 1
diff --git a/test/CodeGen/Mips/cconv/arguments.ll b/test/CodeGen/Mips/cconv/arguments.ll
index 43da604..98671aa 100644
--- a/test/CodeGen/Mips/cconv/arguments.ll
+++ b/test/CodeGen/Mips/cconv/arguments.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 ; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 
-; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
 
 ; Test the integer arguments for all ABI's and byte orders as specified by
 ; section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/callee-saved-float.ll b/test/CodeGen/Mips/cconv/callee-saved-float.ll
index de4d917..c84f0f4 100644
--- a/test/CodeGen/Mips/cconv/callee-saved-float.ll
+++ b/test/CodeGen/Mips/cconv/callee-saved-float.ll
@@ -3,20 +3,20 @@
 ; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
 ; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
 
-; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=O32-INV %s
-; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=O32-INV %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=O32-INV %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=O32-INV %s
 
-; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N32-INV %s
-; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N32-INV %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N32-INV %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N32-INV %s
 
-; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N64-INV %s
-; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N64-INV %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N64-INV %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N64-INV %s
 
 ; Test the the callee-saved registers are callee-saved as specified by section
 ; 2 of the MIPSpro N32 Handbook and section 3 of the SYSV ABI spec.
diff --git a/test/CodeGen/Mips/cconv/callee-saved.ll b/test/CodeGen/Mips/cconv/callee-saved.ll
index 293e99f..d0b1e64 100644
--- a/test/CodeGen/Mips/cconv/callee-saved.ll
+++ b/test/CodeGen/Mips/cconv/callee-saved.ll
@@ -3,20 +3,20 @@
 ; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
 ; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
 
-; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
-; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
 
-; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32-INV %s
-; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32-INV %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32-INV %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32-INV %s
 
-; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64-INV %s
-; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64-INV %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64-INV %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64-INV %s
 
 ; Test the the callee-saved registers are callee-saved as specified by section
 ; 2 of the MIPSpro N32 Handbook and section 3 of the SYSV ABI spec.
diff --git a/test/CodeGen/Mips/cconv/memory-layout.ll b/test/CodeGen/Mips/cconv/memory-layout.ll
index 0c3cc9e..33a68da 100644
--- a/test/CodeGen/Mips/cconv/memory-layout.ll
+++ b/test/CodeGen/Mips/cconv/memory-layout.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 ; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
 
-; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 
 ; Test the memory layout for all ABI's and byte orders as specified by section
 ; 4 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/reserved-space.ll b/test/CodeGen/Mips/cconv/reserved-space.ll
index b36f89e..23190c2 100644
--- a/test/CodeGen/Mips/cconv/reserved-space.ll
+++ b/test/CodeGen/Mips/cconv/reserved-space.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 ; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
 
-; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 
 ; Test that O32 correctly reserved space for the four arguments, even when
 ; there aren't any as per section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/return-float.ll b/test/CodeGen/Mips/cconv/return-float.ll
index d1a5e4f..8c4c31c 100644
--- a/test/CodeGen/Mips/cconv/return-float.ll
+++ b/test/CodeGen/Mips/cconv/return-float.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -mtriple=mips-linux-gnu -soft-float -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 ; RUN: llc -mtriple=mipsel-linux-gnu -soft-float -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN-TODO: llc -mtriple=mips64-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -mtriple=mips64el-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -soft-float -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -soft-float -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN: llc -mtriple=mips64-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64-linux-gnu -soft-float -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -soft-float -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
 
-; RUN: llc -mtriple=mips64-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64-linux-gnu -soft-float -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -soft-float -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 
 ; Test the float returns for all ABI's and byte orders as specified by
 ; section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/return-hard-float.ll b/test/CodeGen/Mips/cconv/return-hard-float.ll
index 123b499..f0aeb12 100644
--- a/test/CodeGen/Mips/cconv/return-hard-float.ll
+++ b/test/CodeGen/Mips/cconv/return-hard-float.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 ; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
 
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 
 ; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=ALL --check-prefix=032FP64 %s
 ; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=ALL --check-prefix=032FP64 %s
diff --git a/test/CodeGen/Mips/cconv/return-hard-fp128.ll b/test/CodeGen/Mips/cconv/return-hard-fp128.ll
index 0da59ef..05dacfe 100644
--- a/test/CodeGen/Mips/cconv/return-hard-fp128.ll
+++ b/test/CodeGen/Mips/cconv/return-hard-fp128.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 
 ; Test the fp128 returns for N32/N64 and all byte orders as specified by
 ; section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll b/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll
index 2e84477..4ce26b1 100644
--- a/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll
+++ b/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
 
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 
 ; Test return of {fp128} agrees with de-facto N32/N64 ABI.
 
diff --git a/test/CodeGen/Mips/cconv/return-struct.ll b/test/CodeGen/Mips/cconv/return-struct.ll
index 11a8cf0..3d591df 100644
--- a/test/CodeGen/Mips/cconv/return-struct.ll
+++ b/test/CodeGen/Mips/cconv/return-struct.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-BE %s
 ; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-LE %s
 
-; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 --check-prefix=N32-BE %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 --check-prefix=N32-LE %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 --check-prefix=N32-BE %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 --check-prefix=N32-LE %s
 
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 --check-prefix=N64-BE %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 --check-prefix=N64-LE %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 --check-prefix=N64-BE %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 --check-prefix=N64-LE %s
 
 ; Test struct returns for all ABI's and byte orders.
 
diff --git a/test/CodeGen/Mips/cconv/return.ll b/test/CodeGen/Mips/cconv/return.ll
index 63f9b5f..516026d 100644
--- a/test/CodeGen/Mips/cconv/return.ll
+++ b/test/CodeGen/Mips/cconv/return.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 ; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
 
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 
 ; Test the integer returns for all ABI's and byte orders as specified by
 ; section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/stack-alignment.ll b/test/CodeGen/Mips/cconv/stack-alignment.ll
index 834033b..f21bc30 100644
--- a/test/CodeGen/Mips/cconv/stack-alignment.ll
+++ b/test/CodeGen/Mips/cconv/stack-alignment.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 ; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
 
-; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
 
-; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 
 ; Test the stack alignment for all ABI's and byte orders as specified by
 ; section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/ci2.ll b/test/CodeGen/Mips/ci2.ll
index 7187f0c..e2068fd 100644
--- a/test/CodeGen/Mips/ci2.ll
+++ b/test/CodeGen/Mips/ci2.ll
@@ -36,4 +36,4 @@ if.end:                                           ; preds = %if.else, %if.then
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind }
 
-!1 = metadata !{i32 103}
+!1 = !{i32 103}
diff --git a/test/CodeGen/Mips/const1.ll b/test/CodeGen/Mips/const1.ll
index cb2baca..f32ce24 100644
--- a/test/CodeGen/Mips/const1.ll
+++ b/test/CodeGen/Mips/const1.ll
@@ -32,4 +32,4 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b754974ec32ab712ea7d8b52cd8037b24e7d6ed3) (gitosis@dmz-portal.mips.com:llvm.git 8e211187b501bc73edb938fde0019c9a20bcffd5)"}
+!0 = !{!"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b754974ec32ab712ea7d8b52cd8037b24e7d6ed3) (gitosis@dmz-portal.mips.com:llvm.git 8e211187b501bc73edb938fde0019c9a20bcffd5)"}
diff --git a/test/CodeGen/Mips/const4a.ll b/test/CodeGen/Mips/const4a.ll
index b4c509f..ac6795b 100644
--- a/test/CodeGen/Mips/const4a.ll
+++ b/test/CodeGen/Mips/const4a.ll
@@ -177,4 +177,4 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b310439121c875937d78cc49cc969bc1197fc025) (gitosis@dmz-portal.mips.com:llvm.git 7fc0ca9656ebec8dad61f72f5a5ddfb232c070fd)"}
+!0 = !{!"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b310439121c875937d78cc49cc969bc1197fc025) (gitosis@dmz-portal.mips.com:llvm.git 7fc0ca9656ebec8dad61f72f5a5ddfb232c070fd)"}
diff --git a/test/CodeGen/Mips/const6.ll b/test/CodeGen/Mips/const6.ll
index 3f02ab9..c26e02f 100644
--- a/test/CodeGen/Mips/const6.ll
+++ b/test/CodeGen/Mips/const6.ll
@@ -159,6 +159,6 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b310439121c875937d78cc49cc969bc1197fc025) (gitosis@dmz-portal.mips.com:llvm.git 7fc0ca9656ebec8dad61f72f5a5ddfb232c070fd)"}
+!0 = !{!"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b310439121c875937d78cc49cc969bc1197fc025) (gitosis@dmz-portal.mips.com:llvm.git 7fc0ca9656ebec8dad61f72f5a5ddfb232c070fd)"}
 
 
diff --git a/test/CodeGen/Mips/const6a.ll b/test/CodeGen/Mips/const6a.ll
index d342390..aff1357 100644
--- a/test/CodeGen/Mips/const6a.ll
+++ b/test/CodeGen/Mips/const6a.ll
@@ -26,4 +26,4 @@ entry:
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
 attributes #1 = { nounwind }
 
-!1 = metadata !{i32 121}
+!1 = !{i32 121}
diff --git a/test/CodeGen/Mips/fcmp.ll b/test/CodeGen/Mips/fcmp.ll
index b775983..8e83b00 100644
--- a/test/CodeGen/Mips/fcmp.ll
+++ b/test/CodeGen/Mips/fcmp.ll
@@ -781,3 +781,93 @@ define i32 @true_f64(double %a, double %b) nounwind {
   %2 = zext i1 %1 to i32
   ret i32 %2
 }
+
+; The optimizers sometimes produce setlt instead of setolt/setult.
+define float @bug1_f32(float %angle, float %at) #0 {
+entry:
+; ALL-LABEL: bug1_f32:
+
+; 32-C-DAG:      add.s    $[[T0:f[0-9]+]], $f14, $f12
+; 32-C-DAG:      lwc1     $[[T1:f[0-9]+]], %lo($CPI32_0)(
+; 32-C-DAG:      c.ole.s  $[[T0]], $[[T1]]
+; 32-C-DAG:      bc1t
+
+; 32-CMP-DAG:    add.s    $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG:    lwc1     $[[T1:f[0-9]+]], %lo($CPI32_0)(
+; 32-CMP-DAG:    cmp.le.s $[[T2:f[0-9]+]], $[[T0]], $[[T1]]
+; 32-CMP-DAG:    mfc1     $[[T3:[0-9]+]], $[[T2]]
+; FIXME: This instruction is redundant.
+; 32-CMP-DAG:    andi     $[[T4:[0-9]+]], $[[T3]], 1
+; 32-CMP-DAG:    bnez     $[[T4]],
+
+; 64-C-DAG:      add.s    $[[T0:f[0-9]+]], $f13, $f12
+; 64-C-DAG:      lwc1     $[[T1:f[0-9]+]], %got_ofst($CPI32_0)(
+; 64-C-DAG:      c.ole.s  $[[T0]], $[[T1]]
+; 64-C-DAG:      bc1t
+
+; 64-CMP-DAG:    add.s    $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG:    lwc1     $[[T1:f[0-9]+]], %got_ofst($CPI32_0)(
+; 64-CMP-DAG:    cmp.le.s $[[T2:f[0-9]+]], $[[T0]], $[[T1]]
+; 64-CMP-DAG:    mfc1     $[[T3:[0-9]+]], $[[T2]]
+; FIXME: This instruction is redundant.
+; 64-CMP-DAG:    andi     $[[T4:[0-9]+]], $[[T3]], 1
+; 64-CMP-DAG:    bnez     $[[T4]],
+
+  %add = fadd fast float %at, %angle
+  %cmp = fcmp ogt float %add, 1.000000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %sub = fadd fast float %add, -1.000000e+00
+  br label %if.end
+
+if.end:
+  %theta.0 = phi float [ %sub, %if.then ], [ %add, %entry ]
+  ret float %theta.0
+}
+
+; The optimizers sometimes produce setlt instead of setolt/setult.
+define double @bug1_f64(double %angle, double %at) #0 {
+entry:
+; ALL-LABEL: bug1_f64:
+
+; 32-C-DAG:      add.d    $[[T0:f[0-9]+]], $f14, $f12
+; 32-C-DAG:      ldc1     $[[T1:f[0-9]+]], %lo($CPI33_0)(
+; 32-C-DAG:      c.ole.d  $[[T0]], $[[T1]]
+; 32-C-DAG:      bc1t
+
+; 32-CMP-DAG:    add.d    $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG:    ldc1     $[[T1:f[0-9]+]], %lo($CPI33_0)(
+; 32-CMP-DAG:    cmp.le.d $[[T2:f[0-9]+]], $[[T0]], $[[T1]]
+; 32-CMP-DAG:    mfc1     $[[T3:[0-9]+]], $[[T2]]
+; FIXME: This instruction is redundant.
+; 32-CMP-DAG:    andi     $[[T4:[0-9]+]], $[[T3]], 1
+; 32-CMP-DAG:    bnez     $[[T4]],
+
+; 64-C-DAG:      add.d    $[[T0:f[0-9]+]], $f13, $f12
+; 64-C-DAG:      ldc1     $[[T1:f[0-9]+]], %got_ofst($CPI33_0)(
+; 64-C-DAG:      c.ole.d  $[[T0]], $[[T1]]
+; 64-C-DAG:      bc1t
+
+; 64-CMP-DAG:    add.d    $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG:    ldc1     $[[T1:f[0-9]+]], %got_ofst($CPI33_0)(
+; 64-CMP-DAG:    cmp.le.d $[[T2:f[0-9]+]], $[[T0]], $[[T1]]
+; 64-CMP-DAG:    mfc1     $[[T3:[0-9]+]], $[[T2]]
+; FIXME: This instruction is redundant.
+; 64-CMP-DAG:    andi     $[[T4:[0-9]+]], $[[T3]], 1
+; 64-CMP-DAG:    bnez     $[[T4]],
+
+  %add = fadd fast double %at, %angle
+  %cmp = fcmp ogt double %add, 1.000000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %sub = fadd fast double %add, -1.000000e+00
+  br label %if.end
+
+if.end:
+  %theta.0 = phi double [ %sub, %if.then ], [ %add, %entry ]
+  ret double %theta.0
+}
+
+attributes #0 = { nounwind readnone "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/Mips/fcopysign-f32-f64.ll b/test/CodeGen/Mips/fcopysign-f32-f64.ll
index 148a780..860bc79 100644
--- a/test/CodeGen/Mips/fcopysign-f32-f64.ll
+++ b/test/CodeGen/Mips/fcopysign-f32-f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc  < %s -march=mips64el -mcpu=mips4 -mattr=n64 | FileCheck %s -check-prefix=64
-; RUN: llc  < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s -check-prefix=64
-; RUN: llc  < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=64R2
+; RUN: llc  < %s -march=mips64el -mcpu=mips4 -target-abi=n64 | FileCheck %s -check-prefix=64
+; RUN: llc  < %s -march=mips64el -mcpu=mips64 -target-abi=n64 | FileCheck %s -check-prefix=64
+; RUN: llc  < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 | FileCheck %s -check-prefix=64R2
 
 declare double @copysign(double, double) nounwind readnone
 
diff --git a/test/CodeGen/Mips/fcopysign.ll b/test/CodeGen/Mips/fcopysign.ll
index 3a9d9c7..6928f2f 100644
--- a/test/CodeGen/Mips/fcopysign.ll
+++ b/test/CodeGen/Mips/fcopysign.ll
@@ -1,8 +1,8 @@
 ; RUN: llc  < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=32
 ; RUN: llc  < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2
-; RUN: llc  < %s -march=mips64el -mcpu=mips4 -mattr=n64 | FileCheck %s -check-prefix=64
-; RUN: llc  < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s -check-prefix=64
-; RUN: llc  < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=64R2
+; RUN: llc  < %s -march=mips64el -mcpu=mips4 -target-abi=n64 | FileCheck %s -check-prefix=64
+; RUN: llc  < %s -march=mips64el -mcpu=mips64 -target-abi=n64 | FileCheck %s -check-prefix=64
+; RUN: llc  < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 | FileCheck %s -check-prefix=64R2
 
 define double @func0(double %d0, double %d1) nounwind readnone {
 entry:
diff --git a/test/CodeGen/Mips/fmadd1.ll b/test/CodeGen/Mips/fmadd1.ll
index 271631e..99d99fa 100644
--- a/test/CodeGen/Mips/fmadd1.ll
+++ b/test/CodeGen/Mips/fmadd1.ll
@@ -8,15 +8,15 @@
 ; RUN: llc < %s -march=mipsel   -mcpu=mips32              -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32   -check-prefix=32-NONAN
 ; RUN: llc < %s -march=mipsel   -mcpu=mips32r2            -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32R2 -check-prefix=32R2-NONAN
 ; RUN: llc < %s -march=mipsel   -mcpu=mips32r6            -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32R6 -check-prefix=32R6-NONAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64   -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64   -check-prefix=64-NONAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64R2 -check-prefix=64R2-NONAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64R6 -check-prefix=64R6-NONAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64   -target-abi=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64   -check-prefix=64-NONAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64R2 -check-prefix=64R2-NONAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -target-abi=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64R6 -check-prefix=64R6-NONAN
 ; RUN: llc < %s -march=mipsel   -mcpu=mips32              | FileCheck %s -check-prefix=ALL -check-prefix=32 -check-prefix=32-NAN
 ; RUN: llc < %s -march=mipsel   -mcpu=mips32r2            | FileCheck %s -check-prefix=ALL -check-prefix=32R2 -check-prefix=32R2-NAN
 ; RUN: llc < %s -march=mipsel   -mcpu=mips32r6            | FileCheck %s -check-prefix=ALL -check-prefix=32R6 -check-prefix=32R6-NAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64   -mattr=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64   -check-prefix=64-NAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64R2 -check-prefix=64R2-NAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -mattr=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64R6 -check-prefix=64R6-NAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64   -target-abi=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64   -check-prefix=64-NAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64R2 -check-prefix=64R2-NAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -target-abi=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64R6 -check-prefix=64R6-NAN
 
 define float @FOO0float(float %a, float %b, float %c) nounwind readnone {
 entry:
@@ -39,10 +39,9 @@ entry:
 ; 32R6-DAG:      mtc1 $zero, $[[T2:f[0-9]+]]
 ; 32R6-DAG:      add.s $f0, $[[T1]], $[[T2]]
 
-; 64-DAG:        mul.s $[[T1:f[0-9]+]], $f12, $f13
-; 64-DAG:        add.s $[[T2:f[0-9]+]], $[[T1]], $f14
-; 64-DAG:        mtc1 $zero, $[[T2:f[0-9]+]]
-; 64-DAG:        add.s $f0, $[[T1]], $[[T2]]
+; 64-DAG:        madd.s $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64-DAG:        mtc1 $zero, $[[T1:f[0-9]+]]
+; 64-DAG:        add.s $f0, $[[T0]], $[[T1]]
 
 ; 64R2:          madd.s $[[T0:f[0-9]+]], $f14, $f12, $f13
 ; 64R2:          mtc1 $zero, $[[T1:f[0-9]+]]
@@ -80,10 +79,9 @@ entry:
 ; 32R6-DAG:      mtc1 $zero, $[[T2:f[0-9]+]]
 ; 32R6-DAG:      add.s $f0, $[[T1]], $[[T2]]
 
-; 64-DAG:        mul.s $[[T1:f[0-9]+]], $f12, $f13
-; 64-DAG:        sub.s $[[T2:f[0-9]+]], $[[T1]], $f14
-; 64-DAG:        mtc1 $zero, $[[T2:f[0-9]+]]
-; 64-DAG:        add.s $f0, $[[T1]], $[[T2]]
+; 64-DAG:        msub.s $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64-DAG:        mtc1 $zero, $[[T1:f[0-9]+]]
+; 64-DAG:        add.s $f0, $[[T0]], $[[T1]]
 
 ; 64R2:          msub.s $[[T0:f[0-9]+]], $f14, $f12, $f13
 ; 64R2:          mtc1 $zero, $[[T1:f[0-9]+]]
@@ -124,10 +122,11 @@ entry:
 ; 32R6-DAG:      mtc1 $zero, $[[T2:f[0-9]+]]
 ; 32R6-DAG:      sub.s $f0, $[[T2]], $[[T1]]
 
-; 64-DAG:        mul.s $[[T1:f[0-9]+]], $f12, $f13
-; 64-DAG:        add.s $[[T2:f[0-9]+]], $[[T1]], $f14
-; 64-DAG:        mtc1 $zero, $[[T2:f[0-9]+]]
-; 64-DAG:        sub.s $f0, $[[T2]], $[[T1]]
+; 64-NONAN:      nmadd.s $f0, $f14, $f12, $f13
+
+; 64-NAN:        madd.s $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64-NAN:        mtc1 $zero, $[[T1:f[0-9]+]]
+; 64-NAN:        sub.s  $f0, $[[T1]], $[[T0]]
 
 ; 64R2-NONAN:    nmadd.s $f0, $f14, $f12, $f13
 
@@ -164,10 +163,11 @@ entry:
 ; 32R2-NAN:      mtc1 $zero, $[[T2:f[0-9]+]]
 ; 32R2-NAN:      sub.s  $f0, $[[T2]], $[[T1]]
 
-; 64-DAG:        mul.s $[[T1:f[0-9]+]], $f12, $f13
-; 64-DAG:        sub.s $[[T2:f[0-9]+]], $[[T1]], $f14
-; 64-DAG:        mtc1 $zero, $[[T2:f[0-9]+]]
-; 64-DAG:        sub.s $f0, $[[T2]], $[[T1]]
+; 64-NAN:        msub.s $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64-NAN:        mtc1 $zero, $[[T1:f[0-9]+]]
+; 64-NAN:        sub.s  $f0, $[[T1]], $[[T0]]
+
+; 64-NONAN:      nmsub.s $f0, $f14, $f12, $f13
 
 ; 64R2-NAN:      msub.s $[[T0:f[0-9]+]], $f14, $f12, $f13
 ; 64R2-NAN:      mtc1 $zero, $[[T1:f[0-9]+]]
@@ -206,10 +206,9 @@ entry:
 ; 32R6-DAG:      mtc1 $zero, $[[T2:f[0-9]+]]
 ; 32R6-DAG:      add.d $f0, $[[T1]], $[[T2]]
 
-; 64-DAG:        mul.d $[[T1:f[0-9]+]], $f12, $f13
-; 64-DAG:        add.d $[[T2:f[0-9]+]], $[[T1]], $f14
-; 64-DAG:        dmtc1 $zero, $[[T2:f[0-9]+]]
-; 64-DAG:        add.d $f0, $[[T1]], $[[T2]]
+; 64-DAG:        madd.d $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64-DAG:        mtc1 $zero, $[[T1:f[0-9]+]]
+; 64-DAG:        add.d $f0, $[[T0]], $[[T1]]
 
 ; 64R2:          madd.d $[[T0:f[0-9]+]], $f14, $f12, $f13
 ; 64R2:          mtc1 $zero, $[[T1:f[0-9]+]]
@@ -248,10 +247,9 @@ entry:
 ; 32R6-DAG:      mtc1 $zero, $[[T2:f[0-9]+]]
 ; 32R6-DAG:      add.d $f0, $[[T1]], $[[T2]]
 
-; 64-DAG:        mul.d $[[T1:f[0-9]+]], $f12, $f13
-; 64-DAG:        sub.d $[[T2:f[0-9]+]], $[[T1]], $f14
-; 64-DAG:        dmtc1 $zero, $[[T2:f[0-9]+]]
-; 64-DAG:        add.d $f0, $[[T1]], $[[T2]]
+; 64-DAG:        msub.d $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64-DAG:        mtc1 $zero, $[[T1:f[0-9]+]]
+; 64-DAG:        add.d $f0, $[[T0]], $[[T1]]
 
 ; 64R2:          msub.d $[[T0:f[0-9]+]], $f14, $f12, $f13
 ; 64R2:          mtc1 $zero, $[[T1:f[0-9]+]]
@@ -293,10 +291,11 @@ entry:
 ; 32R6-DAG:      mtc1 $zero, $[[T2:f[0-9]+]]
 ; 32R6-DAG:      sub.d $f0, $[[T2]], $[[T1]]
 
-; 64-DAG:        mul.d $[[T1:f[0-9]+]], $f12, $f13
-; 64-DAG:        add.d $[[T2:f[0-9]+]], $[[T1]], $f14
-; 64-DAG:        dmtc1 $zero, $[[T2:f[0-9]+]]
-; 64-DAG:        sub.d $f0, $[[T2]], $[[T1]]
+; 64-NONAN:      nmadd.d $f0, $f14, $f12, $f13
+
+; 64-NAN:        madd.d $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64-NAN:        mtc1 $zero, $[[T1:f[0-9]+]]
+; 64-NAN:        sub.d $f0, $[[T1]], $[[T0]]
 
 ; 64R2-NONAN:    nmadd.d $f0, $f14, $f12, $f13
 
@@ -340,10 +339,11 @@ entry:
 ; 32R6-DAG:      mtc1 $zero, $[[T2:f[0-9]+]]
 ; 32R6-DAG:      sub.d $f0, $[[T2]], $[[T1]]
 
-; 64-DAG:        mul.d $[[T1:f[0-9]+]], $f12, $f13
-; 64-DAG:        sub.d $[[T2:f[0-9]+]], $[[T1]], $f14
-; 64-DAG:        dmtc1 $zero, $[[T2:f[0-9]+]]
-; 64-DAG:        sub.d $f0, $[[T2]], $[[T1]]
+; 64-NONAN:      nmsub.d $f0, $f14, $f12, $f13
+
+; 64-NAN:        msub.d $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64-NAN:        mtc1 $zero, $[[T1:f[0-9]+]]
+; 64-NAN:        sub.d $f0, $[[T1]], $[[T0]]
 
 ; 64R2-NONAN:    nmsub.d $f0, $f14, $f12, $f13
 
diff --git a/test/CodeGen/Mips/fp-indexed-ls.ll b/test/CodeGen/Mips/fp-indexed-ls.ll
index 787e131..ea337de 100644
--- a/test/CodeGen/Mips/fp-indexed-ls.ll
+++ b/test/CodeGen/Mips/fp-indexed-ls.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=mipsel   -mcpu=mips32   < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32R1
 ; RUN: llc -march=mipsel   -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32R2
 ; RUN: llc -march=mipsel   -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32R6
-; RUN: llc -march=mips64el -mcpu=mips4    -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
-; RUN: llc -march=mips64el -mcpu=mips64   -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
-; RUN: llc -march=mips64el -mcpu=mips64r6 -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64R6
+; RUN: llc -march=mips64el -mcpu=mips4    -target-abi=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
+; RUN: llc -march=mips64el -mcpu=mips64   -target-abi=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
+; RUN: llc -march=mips64el -mcpu=mips64r6 -target-abi=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64R6
 
 ; Check that [ls][dwu]xc1 are not emitted for nacl.
 ; RUN: llc -mtriple=mipsel-none-nacl-gnu -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=CHECK-NACL
diff --git a/test/CodeGen/Mips/fptr2.ll b/test/CodeGen/Mips/fptr2.ll
deleted file mode 100644
index c8b5e0d..0000000
--- a/test/CodeGen/Mips/fptr2.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static  < %s | FileCheck %s -check-prefix=static16
-
-; Function Attrs: nounwind
-define double @my_mul(double %a, double %b) #0 {
-entry:
-  %a.addr = alloca double, align 8
-  %b.addr = alloca double, align 8
-  store double %a, double* %a.addr, align 8
-  store double %b, double* %b.addr, align 8
-  %0 = load double* %a.addr, align 8
-  %1 = load double* %b.addr, align 8
-  %mul = fmul double %0, %1
-  ret double %mul
-}
-
-; static16: 	        .ent	__fn_stub_my_mul
-; static16:     	.set reorder
-; static16-NEXT:	#NO_APP
-; static16: 	        .end __fn_stub_my_mul
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
diff --git a/test/CodeGen/Mips/fpxx.ll b/test/CodeGen/Mips/fpxx.ll
index 7e2ed22..5b42ece 100644
--- a/test/CodeGen/Mips/fpxx.ll
+++ b/test/CodeGen/Mips/fpxx.ll
@@ -10,11 +10,11 @@
 ; RUN: llc -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NOFPXX
 ; RUN: not llc -march=mips64 -mcpu=mips64 -mattr=fpxx < %s 2>&1 | FileCheck %s -check-prefix=64-FPXX
 
-; RUN-TODO: llc -march=mips64 -mcpu=mips4 -mattr=-n64,+o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-NOFPXX
-; RUN-TODO: llc -march=mips64 -mcpu=mips4 -mattr=-n64,+o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-FPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips4 -target-abi o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-NOFPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips4 -target-abi o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-FPXX
 
-; RUN-TODO: llc -march=mips64 -mcpu=mips64 -mattr=-n64,+o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-O32-NOFPXX
-; RUN-TODO: llc -march=mips64 -mcpu=mips64 -mattr=-n64,+o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-O32-FPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips64 -target-abi o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-O32-NOFPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips64 -target-abi o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-O32-FPXX
 
 declare double @dbl();
 
diff --git a/test/CodeGen/Mips/global-address.ll b/test/CodeGen/Mips/global-address.ll
index 0785cfc..ae6afeb 100644
--- a/test/CodeGen/Mips/global-address.ll
+++ b/test/CodeGen/Mips/global-address.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-O32
 ; RUN: llc -march=mipsel -relocation-model=static -mtriple=mipsel-linux-gnu < %s | FileCheck %s -check-prefix=STATIC-O32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=static  -mtriple=mipsel-linux-gnu < %s | FileCheck %s -check-prefix=STATIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n32 -relocation-model=static  -mtriple=mipsel-linux-gnu < %s | FileCheck %s -check-prefix=STATIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
 
 @s1 = internal unnamed_addr global i32 8, align 4
 @g1 = external global i32
diff --git a/test/CodeGen/Mips/inlineasm-assembler-directives.ll b/test/CodeGen/Mips/inlineasm-assembler-directives.ll
new file mode 100644
index 0000000..e4a6d1e
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-assembler-directives.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=mips < %s | FileCheck %s
+
+; Check for the emission of appropriate assembler directives before and
+; after the inline assembly code.
+define void @f() nounwind {
+entry:
+; CHECK:      #APP
+; CHECK-NEXT: .set  push
+; CHECK-NEXT: .set  at
+; CHECK-NEXT: .set  macro
+; CHECK-NEXT: .set  reorder
+; CHECK:      addi $9, ${{[2-9][0-9]?}}, 8
+; CHECK:      subi ${{[2-9][0-9]?}}, $9, 6
+; CHECK:      .set  pop
+; CHECK-NEXT: #NO_APP
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  store i32 20, i32* %a, align 4
+  %0 = load i32* %a, align 4
+  %1 = call i32 asm sideeffect "addi $$9, $1, 8\0A\09subi $0, $$9, 6", "=r,r,~{$1}"(i32 %0)
+  store i32 %1, i32* %b, align 4
+  ret void
+}
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll
index a67ddce..41991d0 100644
--- a/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll
@@ -32,10 +32,10 @@ entry:
 
 ; Now l with 1024: make sure register lo is picked. We do this by checking the instruction
 ; after the inline expression for a mflo to pull the value out of lo.
-; CHECK: #APP
-; CHECK-NEXT:  mtlo ${{[0-9]+}} 
+; CHECK:       #APP
+; CHECK:       mtlo ${{[0-9]+}}
 ; CHECK-NEXT:  madd ${{[0-9]+}},${{[0-9]+}}
-; CHECK-NEXT: #NO_APP	
+; CHECK:       #NO_APP
 ; CHECK-NEXT:  mflo	${{[0-9]+}}
   %bosco = alloca i32, align 4
   call i32 asm sideeffect "\09mtlo $3 \0A\09\09madd $1,$2 ", "=l,r,r,r"(i32 7, i32 6, i32 44) nounwind
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll
index a7ba762..acce632 100644
--- a/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll
@@ -3,7 +3,7 @@
 ; The target is 64 bit.
 ;
 ;
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 < %s | FileCheck %s
 
 
 define i32 @main() nounwind {
diff --git a/test/CodeGen/Mips/inlineasm64.ll b/test/CodeGen/Mips/inlineasm64.ll
index dbce3c3..a8e949b 100644
--- a/test/CodeGen/Mips/inlineasm64.ll
+++ b/test/CodeGen/Mips/inlineasm64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 < %s | FileCheck %s
 
 @gl2 = external global i64
 @gl1 = external global i64
diff --git a/test/CodeGen/Mips/inlineasmmemop.ll b/test/CodeGen/Mips/inlineasmmemop.ll
index a08a024..5518520 100644
--- a/test/CodeGen/Mips/inlineasmmemop.ll
+++ b/test/CodeGen/Mips/inlineasmmemop.ll
@@ -5,6 +5,7 @@
 
 define i32 @f1(i32 %x) nounwind {
 entry:
+; CHECK-LABEL: f1:
 ; CHECK: addiu $[[T0:[0-9]+]], $sp
 ; CHECK: #APP
 ; CHECK: sw $4, 0($[[T0]])
@@ -22,42 +23,26 @@ entry:
   ret i32 %0
 }
 
-; "D": Second word of double word. This works for any memory element
+; CHECK-LABEL: main:
+; "D": Second word of a double word. This works for any memory element
 ; double or single.
 ; CHECK: #APP
-; CHECK-NEXT: lw ${{[0-9]+}},4(${{[0-9]+}});
-; CHECK-NEXT: #NO_APP
+; CHECK: lw ${{[0-9]+}},4(${{[0-9]+}});
+; CHECK: #NO_APP
 
-; No "D": First word of double word. This works for any memory element 
+; No "D": First word of a double word. This works for any memory element
 ; double or single.
 ; CHECK: #APP
-; CHECK-NEXT: lw ${{[0-9]+}},0(${{[0-9]+}});
-; CHECK-NEXT: #NO_APP
-
-;int b[8] = {0,1,2,3,4,5,6,7};
-;int main()
-;{
-;  int i;
-; 
-;  // The first word. Notice, no 'D'
-;  { asm (
-;    "lw    %0,%1;\n"
-;    : "=r" (i) : "m" (*(b+4)));}
-; 
-;  // The second word
-;  { asm (
-;    "lw    %0,%D1;\n"
-;    : "=r" (i) "m" (*(b+4)));}
-;}
+; CHECK: lw ${{[0-9]+}},0(${{[0-9]+}});
+; CHECK: #NO_APP
 
 @b = common global [20 x i32] zeroinitializer, align 4
 
 define void @main() {
 entry:
+; Second word:
   tail call void asm sideeffect "    lw    $0,${1:D};", "r,*m,~{$11}"(i32 undef, i32* getelementptr inbounds ([20 x i32]* @b, i32 0, i32 3))
+; First word. Notice, no 'D':
   tail call void asm sideeffect "    lw    $0,${1};", "r,*m,~{$11}"(i32 undef, i32* getelementptr inbounds ([20 x i32]* @b, i32 0, i32 3))
   ret void
 }
-
-attributes #0 = { nounwind }
-
diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll
index 0e9c91f..918dfee 100644
--- a/test/CodeGen/Mips/largeimmprinting.ll
+++ b/test/CodeGen/Mips/largeimmprinting.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
-; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | \
+; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 < %s | \
 ; RUN:     FileCheck %s -check-prefix=64
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | \
+; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 < %s | \
 ; RUN:     FileCheck %s -check-prefix=64
 
 %struct.S1 = type { [65536 x i8] }
diff --git a/test/CodeGen/Mips/lcb2.ll b/test/CodeGen/Mips/lcb2.ll
index 715584b..59b96e6 100644
--- a/test/CodeGen/Mips/lcb2.ll
+++ b/test/CodeGen/Mips/lcb2.ll
@@ -120,14 +120,14 @@ attributes #1 = { nounwind }
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.5 (gitosis@dmz-portal.mips.com:clang.git ed197d08c90d82e1119774e10920e6f7a841c8ec) (gitosis@dmz-portal.mips.com:llvm.git b9235a363fa2dddb26ac01cbaed58efbc9eff392)"}
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"int", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
-!5 = metadata !{i32 59}
-!6 = metadata !{i32 156}
-!7 = metadata !{i32 210}
-!8 = metadata !{i32 299}
-!9 = metadata !{i32 340}
-!10 = metadata !{i32 412}
+!0 = !{!"clang version 3.5 (gitosis@dmz-portal.mips.com:clang.git ed197d08c90d82e1119774e10920e6f7a841c8ec) (gitosis@dmz-portal.mips.com:llvm.git b9235a363fa2dddb26ac01cbaed58efbc9eff392)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{i32 59}
+!6 = !{i32 156}
+!7 = !{i32 210}
+!8 = !{i32 299}
+!9 = !{i32 340}
+!10 = !{i32 412}
diff --git a/test/CodeGen/Mips/lcb3c.ll b/test/CodeGen/Mips/lcb3c.ll
index 72a0b8c..eb83291 100644
--- a/test/CodeGen/Mips/lcb3c.ll
+++ b/test/CodeGen/Mips/lcb3c.ll
@@ -55,5 +55,5 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 attributes #1 = { nounwind }
 
 
-!1 = metadata !{i32 65}
-!2 = metadata !{i32 167}
+!1 = !{i32 65}
+!2 = !{i32 167}
diff --git a/test/CodeGen/Mips/lcb4a.ll b/test/CodeGen/Mips/lcb4a.ll
index e37feca..fbcadd2 100644
--- a/test/CodeGen/Mips/lcb4a.ll
+++ b/test/CodeGen/Mips/lcb4a.ll
@@ -59,11 +59,11 @@ attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointe
 attributes #1 = { nounwind }
 
 
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"int", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
-!5 = metadata !{i32 58}
-!6 = metadata !{i32 108}
-!7 = metadata !{i32 190}
-!8 = metadata !{i32 243}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{i32 58}
+!6 = !{i32 108}
+!7 = !{i32 190}
+!8 = !{i32 243}
diff --git a/test/CodeGen/Mips/lcb5.ll b/test/CodeGen/Mips/lcb5.ll
index 0a89c80..b2a8d1d 100644
--- a/test/CodeGen/Mips/lcb5.ll
+++ b/test/CodeGen/Mips/lcb5.ll
@@ -220,21 +220,21 @@ attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointe
 attributes #1 = { nounwind }
 
 
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"int", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
-!5 = metadata !{i32 57}
-!6 = metadata !{i32 107}
-!7 = metadata !{i32 188}
-!8 = metadata !{i32 241}
-!9 = metadata !{i32 338}
-!10 = metadata !{i32 391}
-!11 = metadata !{i32 477}
-!12 = metadata !{i32 533}
-!13 = metadata !{i32 621}
-!14 = metadata !{i32 663}
-!15 = metadata !{i32 747}
-!16 = metadata !{i32 792}
-!17 = metadata !{i32 867}
-!18 = metadata !{i32 953}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{i32 57}
+!6 = !{i32 107}
+!7 = !{i32 188}
+!8 = !{i32 241}
+!9 = !{i32 338}
+!10 = !{i32 391}
+!11 = !{i32 477}
+!12 = !{i32 533}
+!13 = !{i32 621}
+!14 = !{i32 663}
+!15 = !{i32 747}
+!16 = !{i32 792}
+!17 = !{i32 867}
+!18 = !{i32 953}
diff --git a/test/CodeGen/Mips/llvm-ir/add.ll b/test/CodeGen/Mips/llvm-ir/add.ll
new file mode 100644
index 0000000..6cccc7d
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/add.ll
@@ -0,0 +1,123 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+
+define signext i1 @add_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: add_i1:
+
+  ; ALL:        addu    $[[T0:[0-9]+]], $4, $5
+  ; ALL:        sll     $[[T0]], $[[T0]], 31
+  ; ALL:        sra     $2, $[[T0]], 31
+
+  %r = add i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @add_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: add_i8:
+
+  ; NOT-R2-R6:  addu    $[[T0:[0-9]+]], $4, $5
+  ; NOT-R2-R6:  sll     $[[T0]], $[[T0]], 24
+  ; NOT-R2-R6:  sra     $2, $[[T0]], 24
+
+  ; R2-R6:         addu    $[[T0:[0-9]+]], $4, $5
+  ; R2-R6:         seb     $2, $[[T0:[0-9]+]]
+
+  %r = add i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @add_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: add_i16:
+
+  ; NOT-R2-R6:  addu    $[[T0:[0-9]+]], $4, $5
+  ; NOT-R2-R6:  sll     $[[T0]], $[[T0]], 16
+  ; NOT-R2-R6:  sra     $2, $[[T0]], 16
+
+  ; R2-R6:         addu    $[[T0:[0-9]+]], $4, $5
+  ; R2-R6:         seh     $2, $[[T0:[0-9]+]]
+
+  %r = add i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @add_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: add_i32:
+
+  ; ALL:        addu    $2, $4, $5
+
+  %r = add i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @add_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: add_i64:
+
+  ; GP32:       addu    $3, $5, $7
+  ; GP32:       sltu    $[[T0:[0-9]+]], $3, $7
+  ; GP32:       addu    $[[T1:[0-9]+]], $[[T0]], $6
+  ; GP32:       addu    $2, $4, $[[T1]]
+
+  ; GP64:       daddu   $2, $4, $5
+
+  %r = add i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @add_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: add_i128:
+
+  ; GP32:       lw        $[[T0:[0-9]+]], 28($sp)
+  ; GP32:       addu      $[[T1:[0-9]+]], $7, $[[T0]]
+  ; GP32:       sltu      $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+  ; GP32:       lw        $[[T3:[0-9]+]], 24($sp)
+  ; GP32:       addu      $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+  ; GP32:       addu      $[[T5:[0-9]+]], $6, $[[T4]]
+  ; GP32:       sltu      $[[T6:[0-9]+]], $[[T5]], $[[T3]]
+  ; GP32:       lw        $[[T7:[0-9]+]], 20($sp)
+  ; GP32:       addu      $[[T8:[0-9]+]], $[[T6]], $[[T7]]
+  ; GP32:       lw        $[[T9:[0-9]+]], 16($sp)
+  ; GP32:       addu      $3, $5, $[[T8]]
+  ; GP32:       sltu      $[[T10:[0-9]+]], $3, $[[T7]]
+  ; GP32:       addu      $[[T11:[0-9]+]], $[[T10]], $[[T9]]
+  ; GP32:       addu      $2, $4, $[[T11]]
+  ; GP32:       move      $4, $[[T5]]
+  ; GP32:       move      $5, $[[T1]]
+
+  ; GP64:       daddu     $3, $5, $7
+  ; GP64:       sltu      $[[T0:[0-9]+]], $3, $7
+  ; GP64:       daddu     $[[T1:[0-9]+]], $[[T0]], $6
+  ; GP64:       daddu     $2, $4, $[[T1]]
+
+  %r = add i128 %a, %b
+  ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/and.ll b/test/CodeGen/Mips/llvm-ir/and.ll
new file mode 100644
index 0000000..8ebcfe4
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/and.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+
+define signext i1 @and_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: and_i1:
+
+  ; ALL:          and     $2, $4, $5
+
+  %r = and i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @and_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: and_i8:
+
+  ; ALL:          and     $2, $4, $5
+
+  %r = and i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @and_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: and_i16:
+
+  ; ALL:          and     $2, $4, $5
+
+  %r = and i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @and_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: and_i32:
+
+  ; ALL:          and     $2, $4, $5
+
+  %r = and i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @and_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: and_i64:
+
+  ; GP32:         and     $2, $4, $6
+  ; GP32:         and     $3, $5, $7
+
+  ; GP64:         and     $2, $4, $5
+
+  %r = and i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @and_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: and_i128:
+
+  ; GP32:         lw      $[[T0:[0-9]+]], 24($sp)
+  ; GP32:         lw      $[[T1:[0-9]+]], 20($sp)
+  ; GP32:         lw      $[[T2:[0-9]+]], 16($sp)
+  ; GP32:         and     $2, $4, $[[T2]]
+  ; GP32:         and     $3, $5, $[[T1]]
+  ; GP32:         and     $4, $6, $[[T0]]
+  ; GP32:         lw      $[[T3:[0-9]+]], 28($sp)
+  ; GP32:         and     $5, $7, $[[T3]]
+
+  ; GP64:         and     $2, $4, $6
+  ; GP64:         and     $3, $5, $7
+
+  %r = and i128 %a, %b
+  ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/ashr.ll b/test/CodeGen/Mips/llvm-ir/ashr.ll
new file mode 100644
index 0000000..7e1587c
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -0,0 +1,200 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=M2
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R6
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=M3
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=64R6
+
+define signext i1 @ashr_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: ashr_i1:
+
+  ; ALL:        move    $2, $4
+
+  %r = ashr i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @ashr_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: ashr_i8:
+
+  ; FIXME: The andi instruction is redundant.
+  ; ALL:        andi    $[[T0:[0-9]+]], $5, 255
+  ; ALL:        srav    $2, $4, $[[T0]]
+
+  %r = ashr i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @ashr_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: ashr_i16:
+
+  ; FIXME: The andi instruction is redundant.
+  ; ALL:        andi    $[[T0:[0-9]+]], $5, 65535
+  ; ALL:        srav    $2, $4, $[[T0]]
+
+  %r = ashr i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @ashr_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: ashr_i32:
+
+  ; ALL:        srav    $2, $4, $5
+
+  %r = ashr i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @ashr_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: ashr_i64:
+
+  ; M2:         srav      $[[T0:[0-9]+]], $4, $7
+  ; M2:         andi      $[[T1:[0-9]+]], $7, 32
+  ; M2:         bnez      $[[T1]], $[[BB0:BB[0-9_]+]]
+  ; M2:         move      $3, $[[T0]]
+  ; M2:         srlv      $[[T2:[0-9]+]], $5, $7
+  ; M2:         not       $[[T3:[0-9]+]], $7
+  ; M2:         sll       $[[T4:[0-9]+]], $4, 1
+  ; M2:         sllv      $[[T5:[0-9]+]], $[[T4]], $[[T3]]
+  ; M2:         or        $3, $[[T3]], $[[T2]]
+  ; M2:         $[[BB0]]:
+  ; M2:         beqz      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         nop
+  ; M2:         sra       $2, $4, 31
+  ; M2:         $[[BB1]]:
+  ; M2:         jr        $ra
+  ; M2:         nop
+
+  ; 32R1-R5:    srlv      $[[T0:[0-9]+]], $5, $7
+  ; 32R1-R5:    not       $[[T1:[0-9]+]], $7
+  ; 32R1-R5:    sll       $[[T2:[0-9]+]], $4, 1
+  ; 32R1-R5:    sllv      $[[T3:[0-9]+]], $[[T2]], $[[T1]]
+  ; 32R1-R5:    or        $3, $[[T3]], $[[T0]]
+  ; 32R1-R5:    srav      $[[T4:[0-9]+]], $4, $7
+  ; 32R1-R5:    andi      $[[T5:[0-9]+]], $7, 32
+  ; 32R1-R5:    movn      $3, $[[T4]], $[[T5]]
+  ; 32R1-R5:    sra       $4, $4, 31
+  ; 32R1-R5:    jr        $ra
+  ; 32R1-R5:    movn      $2, $4, $[[T5]]
+
+  ; 32R6:       srav      $[[T0:[0-9]+]], $4, $7
+  ; 32R6:       andi      $[[T1:[0-9]+]], $7, 32
+  ; 32R6:       seleqz    $[[T2:[0-9]+]], $[[T0]], $[[T1]]
+  ; 32R6:       sra       $[[T3:[0-9]+]], $4, 31
+  ; 32R6:       selnez    $[[T4:[0-9]+]], $[[T3]], $[[T1]]
+  ; 32R6:       or        $[[T5:[0-9]+]], $[[T4]], $[[T2]]
+  ; 32R6:       srlv      $[[T6:[0-9]+]], $5, $7
+  ; 32R6:       not       $[[T7:[0-9]+]], $7
+  ; 32R6:       sll       $[[T8:[0-9]+]], $4, 1
+  ; 32R6:       sllv      $[[T9:[0-9]+]], $[[T8]], $[[T7]]
+  ; 32R6:       or        $[[T10:[0-9]+]], $[[T9]], $[[T6]]
+  ; 32R6:       seleqz    $[[T11:[0-9]+]], $[[T10]], $[[T1]]
+  ; 32R6:       selnez    $[[T12:[0-9]+]], $[[T0]], $[[T1]]
+  ; 32R6:       jr        $ra
+  ; 32R6:       or        $3, $[[T0]], $[[T11]]
+
+  ; FIXME: The sll instruction below is redundant.
+  ; GP64:       sll       $[[T0:[0-9]+]], $5, 0
+  ; GP64:       dsrav     $2, $4, $[[T0]]
+
+  %r = ashr i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: ashr_i128:
+
+  ; GP32:           lw        $25, %call16(__ashrti3)($gp)
+
+  ; M3:             sll       $[[T0:[0-9]+]], $7, 0
+  ; M3:             dsrav     $[[T1:[0-9]+]], $4, $[[T0]]
+  ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 32
+  ; M3:             bnez      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
+  ; M3:             move      $3, $[[T1]]
+  ; M3:             dsrlv     $[[T4:[0-9]+]], $5, $[[T0]]
+  ; M3:             dsll      $[[T5:[0-9]+]], $4, 1
+  ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
+  ; M3:             dsllv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
+  ; M3:             or        $3, $[[T7]], $[[T4]]
+  ; M3:             $[[BB0]]:
+  ; M3:             beqz      $[[T3]], $[[BB1:BB[0-9_]+]]
+  ; M3:             nop
+  ; M3:             dsra      $2, $4, 31
+  ; M3:             $[[BB1]]:
+  ; M3:             jr        $ra
+  ; M3:             nop
+
+  ; GP64-NOT-R6:    sll       $[[T0:[0-9]+]], $7, 0
+  ; GP64-NOT-R6:    dsrlv     $[[T1:[0-9]+]], $5, $[[T0]]
+  ; GP64-NOT-R6:    dsll      $[[T2:[0-9]+]], $4, 1
+  ; GP64-NOT-R6:    not       $[[T3:[0-9]+]], $[[T0]]
+  ; GP64-NOT-R6:    dsllv     $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+  ; GP64-NOT-R6:    or        $3, $[[T4]], $[[T1]]
+  ; GP64-NOT-R6:    dsrav     $2, $4, $[[T0]]
+  ; GP64-NOT-R6:    andi      $[[T5:[0-9]+]], $[[T0]], 32
+
+  ; GP64-NOT-R6:    movn      $3, $2, $[[T5]]
+  ; GP64-NOT-R6:    dsra      $[[T6:[0-9]+]], $4, 31
+  ; GP64-NOT-R6:    jr        $ra
+  ; GP64-NOT-R6:    movn      $2, $[[T6]], $[[T5]]
+
+  ; 64R6:           sll       $[[T0:[0-9]+]], $7, 0
+  ; 64R6:           dsrav     $[[T1:[0-9]+]], $4, $[[T0]]
+  ; 64R6:           andi      $[[T2:[0-9]+]], $[[T0]], 32
+  ; 64R6:           sll       $[[T3:[0-9]+]], $[[T2]], 0
+  ; 64R6:           seleqz    $[[T4:[0-9]+]], $[[T1]], $[[T3]]
+  ; 64R6:           dsra      $[[T5:[0-9]+]], $4, 31
+  ; 64R6:           selnez    $[[T6:[0-9]+]], $[[T5]], $[[T3]]
+  ; 64R6:           or        $2, $[[T6]], $[[T4]]
+  ; 64R6:           dsrlv     $[[T7:[0-9]+]], $5, $[[T0]]
+  ; 64R6:           dsll      $[[T8:[0-9]+]], $4, 1
+  ; 64R6:           not       $[[T9:[0-9]+]], $[[T0]]
+  ; 64R6:           dsllv     $[[T10:[0-9]+]], $[[T8]], $[[T9]]
+  ; 64R6:           or        $[[T11:[0-9]+]], $[[T10]], $[[T7]]
+  ; 64R6:           seleqz    $[[T12:[0-9]+]], $[[T11]], $[[T3]]
+  ; 64R6:           selnez    $[[T13:[0-9]+]], $[[T1]], $[[T3]]
+  ; 64R6:           jr        $ra
+  ; 64R6:           or        $3, $[[T13]], $[[T12]]
+
+  %r = ashr i128 %a, %b
+  ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/call.ll b/test/CodeGen/Mips/llvm-ir/call.ll
index 4cbf43c..112ab8e 100644
--- a/test/CodeGen/Mips/llvm-ir/call.ll
+++ b/test/CodeGen/Mips/llvm-ir/call.ll
@@ -3,10 +3,14 @@
 ; FIXME: We should remove the need for -enable-mips-tail-calls
 ; RUN: llc -march=mips   -mcpu=mips32   -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
 ; RUN: llc -march=mips   -mcpu=mips32r2 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
+; RUN: llc -march=mips   -mcpu=mips32r3 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
+; RUN: llc -march=mips   -mcpu=mips32r5 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
 ; RUN: llc -march=mips   -mcpu=mips32r6 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
 ; RUN: llc -march=mips64 -mcpu=mips4    -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
 ; RUN: llc -march=mips64 -mcpu=mips64   -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
 ; RUN: llc -march=mips64 -mcpu=mips64r2 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
+; RUN: llc -march=mips64 -mcpu=mips64r3 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
+; RUN: llc -march=mips64 -mcpu=mips64r5 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
 ; RUN: llc -march=mips64 -mcpu=mips64r6 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
 
 declare void @extern_void_void()
diff --git a/test/CodeGen/Mips/llvm-ir/indirectbr.ll b/test/CodeGen/Mips/llvm-ir/indirectbr.ll
index d8fd787..debfeb3 100644
--- a/test/CodeGen/Mips/llvm-ir/indirectbr.ll
+++ b/test/CodeGen/Mips/llvm-ir/indirectbr.ll
@@ -2,10 +2,14 @@
 
 ; RUN: llc -march=mips   -mcpu=mips32   -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
 ; RUN: llc -march=mips   -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
+; RUN: llc -march=mips   -mcpu=mips32r3 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
+; RUN: llc -march=mips   -mcpu=mips32r5 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
 ; RUN: llc -march=mips   -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=R6
 ; RUN: llc -march=mips64 -mcpu=mips4    -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
 ; RUN: llc -march=mips64 -mcpu=mips64   -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
 ; RUN: llc -march=mips64 -mcpu=mips64r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r3 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r5 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
 ; RUN: llc -march=mips64 -mcpu=mips64r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=R6
 
 define i32 @br(i8 *%addr) {
diff --git a/test/CodeGen/Mips/llvm-ir/lshr.ll b/test/CodeGen/Mips/llvm-ir/lshr.ll
new file mode 100644
index 0000000..7344d95
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -0,0 +1,188 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=M2
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R6
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=M3
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=64R6
+
+define signext i1 @lshr_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: lshr_i1:
+
+  ; ALL:        move    $2, $4
+
+  %r = lshr i1 %a, %b
+  ret i1 %r
+}
+
+define zeroext i8 @lshr_i8(i8 zeroext %a, i8 zeroext %b) {
+entry:
+; ALL-LABEL: lshr_i8:
+
+  ; ALL:        srlv    $[[T0:[0-9]+]], $4, $5
+  ; ALL:        andi    $2, $[[T0]], 255
+
+  %r = lshr i8 %a, %b
+  ret i8 %r
+}
+
+define zeroext i16 @lshr_i16(i16 zeroext %a, i16 zeroext %b) {
+entry:
+; ALL-LABEL: lshr_i16:
+
+  ; ALL:        srlv    $[[T0:[0-9]+]], $4, $5
+  ; ALL:        andi    $2, $[[T0]], 65535
+
+  %r = lshr i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @lshr_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: lshr_i32:
+
+  ; ALL:          srlv    $2, $4, $5
+
+  %r = lshr i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @lshr_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: lshr_i64:
+
+  ; M2:         srlv      $[[T0:[0-9]+]], $4, $7
+  ; M2:         andi      $[[T1:[0-9]+]], $7, 32
+  ; M2:         bnez      $[[T1]], $[[BB0:BB[0-9_]+]]
+  ; M2:         move      $3, $[[T0]]
+  ; M2:         srlv      $[[T2:[0-9]+]], $5, $7
+  ; M2:         not       $[[T3:[0-9]+]], $7
+  ; M2:         sll       $[[T4:[0-9]+]], $4, 1
+  ; M2:         sllv      $[[T5:[0-9]+]], $[[T4]], $[[T3]]
+  ; M2:         or        $3, $[[T3]], $[[T2]]
+  ; M2:         $[[BB0]]:
+  ; M2:         bnez      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         addiu     $2, $zero, 0
+  ; M2:         move      $2, $[[T0]]
+  ; M2:         $[[BB1]]:
+  ; M2:         jr        $ra
+  ; M2:         nop
+
+  ; 32R1-R5:    srlv      $[[T0:[0-9]+]], $5, $7
+  ; 32R1-R5:    not       $[[T1:[0-9]+]], $7
+  ; 32R1-R5:    sll       $[[T2:[0-9]+]], $4, 1
+  ; 32R1-R5:    sllv      $[[T3:[0-9]+]], $[[T2]], $[[T1]]
+  ; 32R1-R5:    or        $3, $[[T3]], $[[T0]]
+  ; 32R1-R5:    srlv      $[[T4:[0-9]+]], $4, $7
+  ; 32R1-R5:    andi      $[[T5:[0-9]+]], $7, 32
+  ; 32R1-R5:    movn      $3, $[[T4]], $[[T5]]
+  ; 32R1-R5:    jr        $ra
+  ; 32R1-R5:    movn      $2, $zero, $[[T5]]
+
+  ; 32R6:       srlv      $[[T0:[0-9]+]], $5, $7
+  ; 32R6:       not       $[[T1:[0-9]+]], $7
+  ; 32R6:       sll       $[[T2:[0-9]+]], $4, 1
+  ; 32R6:       sllv      $[[T3:[0-9]+]], $[[T2]], $[[T1]]
+  ; 32R6:       or        $[[T4:[0-9]+]], $[[T3]], $[[T0]]
+  ; 32R6:       andi      $[[T5:[0-9]+]], $7, 32
+  ; 32R6:       seleqz    $[[T6:[0-9]+]], $[[T4]], $[[T3]]
+  ; 32R6:       srlv      $[[T7:[0-9]+]], $4, $7
+  ; 32R6:       selnez    $[[T8:[0-9]+]], $[[T7]], $[[T5]]
+  ; 32R6:       or        $3, $[[T8]], $[[T6]]
+  ; 32R6:       jr        $ra
+  ; 32R6:       seleqz    $2, $[[T7]], $[[T5]]
+
+  ; GP64:         sll     $[[T0:[0-9]+]], $5, 0
+  ; GP64:         dsrlv   $2, $4, $[[T0]]
+
+  %r = lshr i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: lshr_i128:
+
+  ; GP32:         lw      $25, %call16(__lshrti3)($gp)
+
+  ; M3:             sll       $[[T0:[0-9]+]], $7, 0
+  ; M3:             dsrlv     $[[T1:[0-9]+]], $4, $[[T0]]
+  ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 32
+  ; M3:             bnez      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
+  ; M3:             move      $3, $[[T1]]
+  ; M3:             dsrlv     $[[T4:[0-9]+]], $5, $[[T0]]
+  ; M3:             dsll      $[[T5:[0-9]+]], $4, 1
+  ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
+  ; M3:             dsllv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
+  ; M3:             or        $3, $[[T7]], $[[T4]]
+  ; M3:             $[[BB0]]:
+  ; M3:             bnez      $[[T3]], $[[BB1:BB[0-9_]+]]
+  ; M3:             daddiu    $2, $zero, 0
+  ; M3:             move      $2, $[[T1]]
+  ; M3:             $[[BB1]]:
+  ; M3:             jr        $ra
+  ; M3:             nop
+
+  ; GP64-NOT-R6:    sll       $[[T0:[0-9]+]], $7, 0
+  ; GP64-NOT-R6:    dsrlv     $[[T1:[0-9]+]], $5, $[[T0]]
+  ; GP64-NOT-R6:    dsll      $[[T2:[0-9]+]], $4, 1
+  ; GP64-NOT-R6:    not       $[[T3:[0-9]+]], $[[T0]]
+  ; GP64-NOT-R6:    dsllv     $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+  ; GP64-NOT-R6:    or        $3, $[[T4]], $[[T1]]
+  ; GP64-NOT-R6:    dsrlv     $2, $4, $[[T0]]
+  ; GP64-NOT-R6:    andi      $[[T5:[0-9]+]], $[[T0]], 32
+  ; GP64-NOT-R6:    movn      $3, $2, $[[T5]]
+  ; GP64-NOT-R6:    jr        $ra
+  ; GP64-NOT-R6:    movn      $2, $zero, $1
+
+  ; 64R6:           sll       $[[T0:[0-9]+]], $7, 0
+  ; 64R6:           dsrlv     $[[T1:[0-9]+]], $5, $[[T0]]
+  ; 64R6:           dsll      $[[T2:[0-9]+]], $4, 1
+  ; 64R6:           not       $[[T3:[0-9]+]], $[[T0]]
+  ; 64R6:           dsllv     $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+  ; 64R6:           or        $[[T5:[0-9]+]], $[[T4]], $[[T1]]
+  ; 64R6:           andi      $[[T6:[0-9]+]], $[[T0]], 32
+  ; 64R6:           sll       $[[T7:[0-9]+]], $[[T6]], 0
+  ; 64R6:           seleqz    $[[T8:[0-9]+]], $[[T5]], $[[T7]]
+  ; 64R6:           dsrlv     $[[T9:[0-9]+]], $4, $[[T0]]
+  ; 64R6:           selnez    $[[T10:[0-9]+]], $[[T9]], $[[T7]]
+  ; 64R6:           or        $3, $[[T10]], $[[T8]]
+  ; 64R6:           jr        $ra
+  ; 64R6:           seleqz    $2, $[[T0]], $[[T7]]
+
+  %r = lshr i128 %a, %b
+  ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/mul.ll b/test/CodeGen/Mips/llvm-ir/mul.ll
index 1674124..a758280 100644
--- a/test/CodeGen/Mips/llvm-ir/mul.ll
+++ b/test/CodeGen/Mips/llvm-ir/mul.ll
@@ -1,19 +1,27 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN:    -check-prefix=ALL -check-prefix=M2
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN:    -check-prefix=ALL -check-prefix=32R1-R2 -check-prefix=32R1
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN:    -check-prefix=ALL -check-prefix=32R1-R2 -check-prefix=32R2
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN:    -check-prefix=ALL -check-prefix=32R6
-; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN:    -check-prefix=ALL -check-prefix=M4
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN:    -check-prefix=ALL -check-prefix=64R1-R2
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN:    -check-prefix=ALL -check-prefix=64R1-R2
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN:     -check-prefix=ALL -check-prefix=64R6
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=M2 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=32R1-R5 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=32R1-R5 -check-prefix=32R2-R5 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=32R1-R5 -check-prefix=32R2-R5 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=32R1-R5 -check-prefix=32R2-R5 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=32R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=M4 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=64R1-R5 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=64R1-R5 -check-prefix=GP64 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=64R1-R5 -check-prefix=GP64 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=64R1-R5 -check-prefix=GP64 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s -check-prefix=ALL \
+; RUN:    -check-prefix=64R6
 
 define signext i1 @mul_i1(i1 signext %a, i1 signext %b) {
 entry:
@@ -24,9 +32,9 @@ entry:
   ; M2:         sll     $[[T0]], $[[T0]], 31
   ; M2:         sra     $2, $[[T0]], 31
 
-  ; 32R1-R2:    mul     $[[T0:[0-9]+]], $4, $5
-  ; 32R1-R2:    sll     $[[T0]], $[[T0]], 31
-  ; 32R1-R2:    sra     $2, $[[T0]], 31
+  ; 32R1-R5:    mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R1-R5:    sll     $[[T0]], $[[T0]], 31
+  ; 32R1-R5:    sra     $2, $[[T0]], 31
 
   ; 32R6:       mul     $[[T0:[0-9]+]], $4, $5
   ; 32R6:       sll     $[[T0]], $[[T0]], 31
@@ -37,9 +45,9 @@ entry:
   ; M4:         sll     $[[T0]], $[[T0]], 31
   ; M4:         sra     $2, $[[T0]], 31
 
-  ; 64R1-R2:    mul     $[[T0:[0-9]+]], $4, $5
-  ; 64R1-R2:    sll     $[[T0]], $[[T0]], 31
-  ; 64R1-R2:    sra     $2, $[[T0]], 31
+  ; 64R1-R5:    mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R1-R5:    sll     $[[T0]], $[[T0]], 31
+  ; 64R1-R5:    sra     $2, $[[T0]], 31
 
   ; 64R6:       mul     $[[T0:[0-9]+]], $4, $5
   ; 64R6:       sll     $[[T0]], $[[T0]], 31
@@ -62,8 +70,8 @@ entry:
   ; 32R1:       sll     $[[T0]], $[[T0]], 24
   ; 32R1:       sra     $2, $[[T0]], 24
 
-  ; 32R2:       mul     $[[T0:[0-9]+]], $4, $5
-  ; 32R2:       seb     $2, $[[T0]]
+  ; 32R2-R5:    mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R2-R5:    seb     $2, $[[T0]]
 
   ; 32R6:       mul     $[[T0:[0-9]+]], $4, $5
   ; 32R6:       seb     $2, $[[T0]]
@@ -99,8 +107,8 @@ entry:
   ; 32R1:       sll     $[[T0]], $[[T0]], 16
   ; 32R1:       sra     $2, $[[T0]], 16
 
-  ; 32R2:       mul     $[[T0:[0-9]+]], $4, $5
-  ; 32R2:       seh     $2, $[[T0]]
+  ; 32R2-R5:    mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R2-R5:    seh     $2, $[[T0]]
 
   ; 32R6:       mul     $[[T0:[0-9]+]], $4, $5
   ; 32R6:       seh     $2, $[[T0]]
@@ -130,10 +138,10 @@ entry:
   ; M2:         mult    $4, $5
   ; M2:         mflo    $2
 
-  ; 32R1-R2:    mul     $2, $4, $5
+  ; 32R1-R5:    mul     $2, $4, $5
   ; 32R6:       mul     $2, $4, $5
 
-  ; 64R1-R2:    mul     $2, $4, $5
+  ; 64R1-R5:    mul     $2, $4, $5
   ; 64R6:       mul     $2, $4, $5
   %r = mul i32 %a, %b
   ret i32 %r
@@ -153,13 +161,13 @@ entry:
   ; M2:         addu    $[[T2:[0-9]+]], $4, $[[T1]]
   ; M2:         addu    $2, $[[T2]], $[[T0]]
 
-  ; 32R1-R2:    multu   $5, $7
-  ; 32R1-R2:    mflo    $3
-  ; 32R1-R2:    mfhi    $[[T0:[0-9]+]]
-  ; 32R1-R2:    mul     $[[T1:[0-9]+]], $4, $7
-  ; 32R1-R2:    mul     $[[T2:[0-9]+]], $5, $6
-  ; 32R1-R2:    addu    $[[T0]], $[[T0]], $[[T2:[0-9]+]]
-  ; 32R1-R2:    addu    $2, $[[T0]], $[[T1]]
+  ; 32R1-R5:    multu   $5, $7
+  ; 32R1-R5:    mflo    $3
+  ; 32R1-R5:    mfhi    $[[T0:[0-9]+]]
+  ; 32R1-R5:    mul     $[[T1:[0-9]+]], $4, $7
+  ; 32R1-R5:    mul     $[[T2:[0-9]+]], $5, $6
+  ; 32R1-R5:    addu    $[[T0]], $[[T0]], $[[T2:[0-9]+]]
+  ; 32R1-R5:    addu    $2, $[[T0]], $[[T1]]
 
   ; 32R6:       mul     $[[T0:[0-9]+]], $5, $6
   ; 32R6:       muhu    $[[T1:[0-9]+]], $5, $7
@@ -171,11 +179,38 @@ entry:
   ; M4:         dmult   $4, $5
   ; M4:         mflo    $2
 
-  ; 64R1-R2:    dmult   $4, $5
-  ; 64R1-R2:    mflo    $2
+  ; 64R1-R5:    dmult   $4, $5
+  ; 64R1-R5:    mflo    $2
 
   ; 64R6:       dmul    $2, $4, $5
 
   %r = mul i64 %a, %b
   ret i64 %r
 }
+
+define signext i128 @mul_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: mul_i128:
+
+  ; GP32:           lw      $25, %call16(__multi3)($gp)
+
+  ; GP64-NOT-R6:    dmult   $4, $7
+  ; GP64-NOT-R6:    mflo    $[[T0:[0-9]+]]
+  ; GP64-NOT-R6:    dmult   $5, $6
+  ; GP64-NOT-R6:    mflo    $[[T1:[0-9]+]]
+  ; GP64-NOT-R6:    dmultu  $5, $7
+  ; GP64-NOT-R6:    mflo    $3
+  ; GP64-NOT-R6:    mfhi    $[[T2:[0-9]+]]
+  ; GP64-NOT-R6:    daddu   $[[T3:[0-9]+]], $[[T2]], $[[T1]]
+  ; GP64-NOT-R6:    daddu   $2, $[[T3:[0-9]+]], $[[T0]]
+
+  ; 64R6:           dmul    $[[T0:[0-9]+]], $5, $6
+  ; 64R6:           dmuhu   $[[T1:[0-9]+]], $5, $7
+  ; 64R6:           daddu   $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+  ; 64R6:           dmul    $[[T3:[0-9]+]], $4, $7
+  ; 64R6:           daddu   $2, $[[T2]], $[[T3]]
+  ; 64R6:           dmul    $3, $5, $7
+
+  %r = mul i128 %a, %b
+  ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/or.ll b/test/CodeGen/Mips/llvm-ir/or.ll
new file mode 100644
index 0000000..6215e40
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/or.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+
+define signext i1 @or_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: or_i1:
+
+  ; ALL:          or     $2, $4, $5
+
+  %r = or i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @or_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: or_i8:
+
+  ; ALL:          or     $2, $4, $5
+
+  %r = or i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @or_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: or_i16:
+
+  ; ALL:          or     $2, $4, $5
+
+  %r = or i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @or_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: or_i32:
+
+  ; ALL:          or     $2, $4, $5
+
+  %r = or i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @or_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: or_i64:
+
+  ; GP32:         or     $2, $4, $6
+  ; GP32:         or     $3, $5, $7
+
+  ; GP64:         or     $2, $4, $5
+
+  %r = or i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @or_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: or_i128:
+
+  ; GP32:         lw     $[[T0:[0-9]+]], 24($sp)
+  ; GP32:         lw     $[[T1:[0-9]+]], 20($sp)
+  ; GP32:         lw     $[[T2:[0-9]+]], 16($sp)
+  ; GP32:         or     $2, $4, $[[T2]]
+  ; GP32:         or     $3, $5, $[[T1]]
+  ; GP32:         or     $4, $6, $[[T0]]
+  ; GP32:         lw     $[[T3:[0-9]+]], 28($sp)
+  ; GP32:         or     $5, $7, $[[T3]]
+
+  ; GP64:         or     $2, $4, $6
+  ; GP64:         or     $3, $5, $7
+
+  %r = or i128 %a, %b
+  ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/ret.ll b/test/CodeGen/Mips/llvm-ir/ret.ll
index 8f5b115..0561c24 100644
--- a/test/CodeGen/Mips/llvm-ir/ret.ll
+++ b/test/CodeGen/Mips/llvm-ir/ret.ll
@@ -9,10 +9,14 @@
 
 ; RUN: llc -march=mips   -mcpu=mips32   -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=NO-MTHC1 -check-prefix=NOT-R6
 ; RUN: llc -march=mips   -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=MTHC1 -check-prefix=NOT-R6
+; RUN: llc -march=mips   -mcpu=mips32r3 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=MTHC1 -check-prefix=NOT-R6
+; RUN: llc -march=mips   -mcpu=mips32r5 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=MTHC1 -check-prefix=NOT-R6
 ; RUN: llc -march=mips   -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=MTHC1 -check-prefix=R6
 ; RUN: llc -march=mips64 -mcpu=mips4    -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
 ; RUN: llc -march=mips64 -mcpu=mips64   -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
 ; RUN: llc -march=mips64 -mcpu=mips64r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r3 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r5 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
 ; RUN: llc -march=mips64 -mcpu=mips64r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=R6
 
 define void @ret_void() {
diff --git a/test/CodeGen/Mips/llvm-ir/sdiv.ll b/test/CodeGen/Mips/llvm-ir/sdiv.ll
new file mode 100644
index 0000000..929ee88
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/sdiv.ll
@@ -0,0 +1,144 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefix=R6 -check-prefix=64R6
+
+define signext i1 @sdiv_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: sdiv_i1:
+
+  ; NOT-R6:       div     $zero, $4, $5
+  ; NOT-R6:       teq     $5, $zero, 7
+  ; NOT-R6:       mflo    $[[T0:[0-9]+]]
+  ; FIXME: The sll/sra instructions are redundant since div is signed.
+  ; NOT-R6:       sll     $[[T1:[0-9]+]], $[[T0]], 31
+  ; NOT-R6:       sra     $2, $[[T1]], 31
+
+  ; R6:           div     $[[T0:[0-9]+]], $4, $5
+  ; R6:           teq     $5, $zero, 7
+  ; FIXME: The sll/sra instructions are redundant since div is signed.
+  ; R6:           sll     $[[T1:[0-9]+]], $[[T0]], 31
+  ; R6:           sra     $2, $[[T1]], 31
+
+  %r = sdiv i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @sdiv_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: sdiv_i8:
+
+  ; NOT-R2-R6:    div     $zero, $4, $5
+  ; NOT-R2-R6:    teq     $5, $zero, 7
+  ; NOT-R2-R6:    mflo    $[[T0:[0-9]+]]
+  ; FIXME: The sll/sra instructions are redundant since div is signed.
+  ; NOT-R2-R6:    sll     $[[T1:[0-9]+]], $[[T0]], 24
+  ; NOT-R2-R6:    sra     $2, $[[T1]], 24
+
+  ; R2-R5:        div     $zero, $4, $5
+  ; R2-R5:        teq     $5, $zero, 7
+  ; R2-R5:        mflo    $[[T0:[0-9]+]]
+  ; FIXME: This instruction is redundant.
+  ; R2-R5:        seb     $2, $[[T0]]
+
+  ; R6:           div     $[[T0:[0-9]+]], $4, $5
+  ; R6:           teq     $5, $zero, 7
+  ; FIXME: This instruction is redundant.
+  ; R6:           seb     $2, $[[T0]]
+
+  %r = sdiv i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @sdiv_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: sdiv_i16:
+
+  ; NOT-R2-R6:    div     $zero, $4, $5
+  ; NOT-R2-R6:    teq     $5, $zero, 7
+  ; NOT-R2-R6:    mflo    $[[T0:[0-9]+]]
+  ; FIXME: The sll/sra instructions are redundant since div is signed.
+  ; NOT-R2-R6:    sll     $[[T1:[0-9]+]], $[[T0]], 16
+  ; NOT-R2-R6:    sra     $2, $[[T1]], 16
+
+  ; R2-R5:        div     $zero, $4, $5
+  ; R2-R5:        teq     $5, $zero, 7
+  ; R2-R5:        mflo    $[[T0:[0-9]+]]
+  ; FIXME: This is instruction is redundant since div is signed.
+  ; R2-R5:        seh     $2, $[[T0]]
+
+  ; R6:           div     $[[T0:[0-9]+]], $4, $5
+  ; R6:           teq     $5, $zero, 7
+  ; FIXME: This is instruction is redundant since div is signed.
+  ; R6:           seh     $2, $[[T0]]
+
+  %r = sdiv i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @sdiv_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: sdiv_i32:
+
+  ; NOT-R6:       div     $zero, $4, $5
+  ; NOT-R6:       teq     $5, $zero, 7
+  ; NOT-R6:       mflo    $2
+
+  ; R6:           div     $2, $4, $5
+  ; R6:           teq     $5, $zero, 7
+
+  %r = sdiv i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: sdiv_i64:
+
+  ; GP32:         lw      $25, %call16(__divdi3)($gp)
+
+  ; GP64-NOT-R6:  ddiv    $zero, $4, $5
+  ; GP64-NOT-R6:  teq     $5, $zero, 7
+  ; GP64-NOT-R6:  mflo    $2
+
+  ; 64R6:         ddiv    $2, $4, $5
+  ; 64R6:         teq     $5, $zero, 7
+
+  %r = sdiv i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) {
+entry:
+  ; ALL-LABEL: sdiv_i128:
+
+    ; GP32:         lw      $25, %call16(__divti3)($gp)
+
+    ; GP64-NOT-R6:  ld      $25, %call16(__divti3)($gp)
+    ; 64R6:         ld      $25, %call16(__divti3)($gp)
+
+    %r = sdiv i128 %a, %b
+    ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/select.ll b/test/CodeGen/Mips/llvm-ir/select.ll
new file mode 100644
index 0000000..f17670a
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/select.ll
@@ -0,0 +1,712 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=M2 -check-prefix=M2-M3
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=CMOV \
+; RUN:    -check-prefix=CMOV-32 -check-prefix=CMOV-32R1
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=CMOV \
+; RUN:    -check-prefix=CMOV-32 -check-prefix=CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=CMOV \
+; RUN:    -check-prefix=CMOV-32 -check-prefix=CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=CMOV \
+; RUN:    -check-prefix=CMOV-32 -check-prefix=CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=SEL -check-prefix=SEL-32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=M3 -check-prefix=M2-M3
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=CMOV -check-prefix=CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=CMOV -check-prefix=CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=CMOV -check-prefix=CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=CMOV -check-prefix=CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=CMOV -check-prefix=CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=SEL -check-prefix=SEL-64
+
+define signext i1 @tst_select_i1_i1(i1 signext %s,
+                                    i1 signext %x, i1 signext %y) {
+entry:
+  ; ALL-LABEL: tst_select_i1_i1:
+
+  ; M2-M3:  andi    $[[T0:[0-9]+]], $4, 1
+  ; M2-M3:  bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2-M3:  nop
+  ; M2-M3:  move    $5, $6
+  ; M2-M3:  $[[BB0]]:
+  ; M2-M3:  jr      $ra
+  ; M2-M3:  move    $2, $5
+
+  ; CMOV:   andi    $[[T0:[0-9]+]], $4, 1
+  ; CMOV:   movn    $6, $5, $[[T0]]
+  ; CMOV:   move    $2, $6
+
+  ; SEL:    andi    $[[T0:[0-9]+]], $4, 1
+  ; SEL:    seleqz  $[[T1:[0-9]+]], $6, $[[T0]]
+  ; SEL:    selnez  $[[T2:[0-9]+]], $5, $[[T0]]
+  ; SEL:    or      $2, $[[T2]], $[[T1]]
+  %r = select i1 %s, i1 %x, i1 %y
+  ret i1 %r
+}
+
+define signext i8 @tst_select_i1_i8(i1 signext %s,
+                                    i8 signext %x, i8 signext %y) {
+entry:
+  ; ALL-LABEL: tst_select_i1_i8:
+
+  ; M2-M3:  andi    $[[T0:[0-9]+]], $4, 1
+  ; M2-M3:  bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2-M3:  nop
+  ; M2-M3:  move    $5, $6
+  ; M2-M3:  $[[BB0]]:
+  ; M2-M3:  jr      $ra
+  ; M2-M3:  move    $2, $5
+
+  ; CMOV:   andi    $[[T0:[0-9]+]], $4, 1
+  ; CMOV:   movn    $6, $5, $[[T0]]
+  ; CMOV:   move    $2, $6
+
+  ; SEL:    andi    $[[T0:[0-9]+]], $4, 1
+  ; SEL:    seleqz  $[[T1:[0-9]+]], $6, $[[T0]]
+  ; SEL:    selnez  $[[T2:[0-9]+]], $5, $[[T0]]
+  ; SEL:    or      $2, $[[T2]], $[[T1]]
+  %r = select i1 %s, i8 %x, i8 %y
+  ret i8 %r
+}
+
+define signext i32 @tst_select_i1_i32(i1 signext %s,
+                                      i32 signext %x, i32 signext %y) {
+entry:
+  ; ALL-LABEL: tst_select_i1_i32:
+
+  ; M2-M3:  andi    $[[T0:[0-9]+]], $4, 1
+  ; M2-M3:  bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2-M3:  nop
+  ; M2-M3:  move    $5, $6
+  ; M2-M3:  $[[BB0]]:
+  ; M2-M3:  jr      $ra
+  ; M2-M3:  move    $2, $5
+
+  ; CMOV:   andi    $[[T0:[0-9]+]], $4, 1
+  ; CMOV:   movn    $6, $5, $[[T0]]
+  ; CMOV:   move    $2, $6
+
+  ; SEL:    andi    $[[T0:[0-9]+]], $4, 1
+  ; SEL:    seleqz  $[[T1:[0-9]+]], $6, $[[T0]]
+  ; SEL:    selnez  $[[T2:[0-9]+]], $5, $[[T0]]
+  ; SEL:    or      $2, $[[T2]], $[[T1]]
+  %r = select i1 %s, i32 %x, i32 %y
+  ret i32 %r
+}
+
+define signext i64 @tst_select_i1_i64(i1 signext %s,
+                                      i64 signext %x, i64 signext %y) {
+entry:
+  ; ALL-LABEL: tst_select_i1_i64:
+
+  ; M2:     andi    $[[T0:[0-9]+]], $4, 1
+  ; M2:     bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2:     nop
+  ; M2:     lw      $[[T1:[0-9]+]], 16($sp)
+  ; M2:     $[[BB0]]:
+  ; FIXME: This branch is redundant
+  ; M2:     bnez    $[[T0]], $[[BB1:BB[0-9_]+]]
+  ; M2:     nop
+  ; M2:     lw      $[[T2:[0-9]+]], 20($sp)
+  ; M2:     $[[BB1]]:
+  ; M2:     move    $2, $[[T1]]
+  ; M2:     jr      $ra
+  ; M2:     move    $3, $[[T2]]
+
+  ; CMOV-32:    andi    $[[T0:[0-9]+]], $4, 1
+  ; CMOV-32:    lw      $2, 16($sp)
+  ; CMOV-32:    movn    $2, $6, $[[T0]]
+  ; CMOV-32:    lw      $3, 20($sp)
+  ; CMOV-32:    movn    $3, $7, $[[T0]]
+
+  ; SEL-32:     andi    $[[T0:[0-9]+]], $4, 1
+  ; SEL-32:     selnez  $[[T1:[0-9]+]], $6, $[[T0]]
+  ; SEL-32:     lw      $[[T2:[0-9]+]], 16($sp)
+  ; SEL-32:     seleqz  $[[T3:[0-9]+]], $[[T2]], $[[T0]]
+  ; SEL-32:     or      $2, $[[T1]], $[[T3]]
+  ; SEL-32:     selnez  $[[T4:[0-9]+]], $7, $[[T0]]
+  ; SEL-32:     lw      $[[T5:[0-9]+]], 20($sp)
+  ; SEL-32:     seleqz  $[[T6:[0-9]+]], $[[T5]], $[[T0]]
+  ; SEL-32:     or      $3, $[[T4]], $[[T6]]
+
+  ; M3:         andi    $[[T0:[0-9]+]], $4, 1
+  ; M3:         bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M3:         nop
+  ; M3:         move    $5, $6
+  ; M3:         $[[BB0]]:
+  ; M3:         jr      $ra
+  ; M3:         move    $2, $5
+
+  ; CMOV-64:    andi    $[[T0:[0-9]+]], $4, 1
+  ; CMOV-64:    movn    $6, $5, $[[T0]]
+  ; CMOV-64:    move    $2, $6
+
+  ; SEL-64:     andi    $[[T0:[0-9]+]], $4, 1
+  ; FIXME: This shift is redundant
+  ; SEL-64:     sll     $[[T0]], $[[T0]], 0
+  ; SEL-64:     seleqz  $[[T1:[0-9]+]], $6, $[[T0]]
+  ; SEL-64:     selnez  $[[T0]], $5, $[[T0]]
+  ; SEL-64:     or      $2, $[[T0]], $[[T1]]
+  %r = select i1 %s, i64 %x, i64 %y
+  ret i64 %r
+}
+
+define float @tst_select_i1_float(i1 signext %s, float %x, float %y) {
+entry:
+  ; ALL-LABEL: tst_select_i1_float:
+
+  ; M2-M3:      andi    $[[T0:[0-9]+]], $4, 1
+  ; M2-M3:      bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         jr      $ra
+  ; M2:         mtc1    $6, $f0
+  ; M3:         mov.s   $f13, $f14
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr      $ra
+  ; M2:         mtc1    $5, $f0
+  ; M3:         mov.s   $f0, $f13
+
+  ; CMOV-32:    mtc1    $6, $f0
+  ; CMOV-32:    mtc1    $5, $f1
+  ; CMOV-32:    andi    $[[T0:[0-9]+]], $4, 1
+  ; CMOV-32:    movn.s  $f0, $f1, $[[T0]]
+
+  ; SEL-32:     mtc1    $5, $[[F0:f[0-9]+]]
+  ; SEL-32:     mtc1    $6, $[[F1:f[0-9]+]]
+  ; SEL-32:     mtc1    $4, $f0
+  ; SEL-32:     sel.s   $f0, $[[F1]], $[[F0]]
+
+  ; CMOV-64:    andi    $[[T0:[0-9]+]], $4, 1
+  ; CMOV-64:    movn.s  $f14, $f13, $[[T0]]
+  ; CMOV-64:    mov.s   $f0, $f14
+
+  ; SEL-64:     mtc1    $4, $f0
+  ; SEL-64:     sel.s   $f0, $f14, $f13
+  %r = select i1 %s, float %x, float %y
+  ret float %r
+}
+
+define float @tst_select_i1_float_reordered(float %x, float %y,
+                                            i1 signext %s) {
+entry:
+  ; ALL-LABEL: tst_select_i1_float_reordered:
+
+  ; M2-M3:      andi    $[[T0:[0-9]+]], $6, 1
+  ; M2-M3:      bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.s   $f12, $f14
+  ; M3:         mov.s   $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr      $ra
+  ; M2-M3:      mov.s   $f0, $f12
+
+  ; CMOV-32:    andi    $[[T0:[0-9]+]], $6, 1
+  ; CMOV-32:    movn.s  $f14, $f12, $[[T0]]
+  ; CMOV-32:    mov.s   $f0, $f14
+
+  ; SEL-32:     mtc1    $6, $f0
+  ; SEL-32:     sel.s   $f0, $f14, $f12
+
+  ; CMOV-64:    andi    $[[T0:[0-9]+]], $6, 1
+  ; CMOV-64:    movn.s  $f13, $f12, $[[T0]]
+  ; CMOV-64:    mov.s   $f0, $f13
+
+  ; SEL-64:     mtc1    $6, $f0
+  ; SEL-64:     sel.s   $f0, $f13, $f12
+  %r = select i1 %s, float %x, float %y
+  ret float %r
+}
+
+define double @tst_select_i1_double(i1 signext %s, double %x, double %y) {
+entry:
+  ; ALL-LABEL: tst_select_i1_double:
+
+  ; M2:         andi    $[[T0:[0-9]+]], $4, 1
+  ; M2:         bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2:         nop
+  ; M2:         ldc1    $f0, 16($sp)
+  ; M2:         jr      $ra
+  ; M2:         nop
+  ; M2:         $[[BB0]]:
+  ; M2:         mtc1    $7, $f0
+  ; M2:         jr      $ra
+  ; M2:         mtc1    $6, $f1
+
+  ; CMOV-32:      mtc1    $7, $[[F0:f[0-9]+]]
+  ; CMOV-32R1:    mtc1    $6, $f{{[0-9]+}}
+  ; CMOV-32R2-R5: mthc1   $6, $[[F0]]
+  ; CMOV-32:      andi    $[[T0:[0-9]+]], $4, 1
+  ; CMOV-32:      ldc1    $f0, 16($sp)
+  ; CMOV-32:      movn.d  $f0, $[[F0]], $[[T0]]
+
+  ; SEL-32:     mtc1    $7, $[[F0:f[0-9]+]]
+  ; SEL-32:     mthc1   $6, $[[F0]]
+  ; SEL-32:     ldc1    $[[F1:f[0-9]+]], 16($sp)
+  ; SEL-32:     mtc1    $4, $f0
+  ; SEL-32:     sel.d   $f0, $[[F1]], $[[F0]]
+
+  ; M3:         andi    $[[T0:[0-9]+]], $4, 1
+  ; M3:         bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M3:         nop
+  ; M3:         mov.d   $f13, $f14
+  ; M3:         $[[BB0]]:
+  ; M3:         jr      $ra
+  ; M3:         mov.d   $f0, $f13
+
+  ; CMOV-64:    andi    $[[T0:[0-9]+]], $4, 1
+  ; CMOV-64:    movn.d  $f14, $f13, $[[T0]]
+  ; CMOV-64:    mov.d   $f0, $f14
+
+  ; SEL-64:     mtc1    $4, $f0
+  ; SEL-64:     sel.d   $f0, $f14, $f13
+  %r = select i1 %s, double %x, double %y
+  ret double %r
+}
+
+define double @tst_select_i1_double_reordered(double %x, double %y,
+                                              i1 signext %s) {
+entry:
+  ; ALL-LABEL: tst_select_i1_double_reordered:
+
+  ; M2:         lw      $[[T0:[0-9]+]], 16($sp)
+  ; M2:         andi    $[[T1:[0-9]+]], $[[T0]], 1
+  ; M2:         bnez    $[[T1]], $[[BB0:BB[0-9_]+]]
+  ; M2:         nop
+  ; M2:         mov.d   $f12, $f14
+  ; M2:         $[[BB0]]:
+  ; M2:         jr      $ra
+  ; M2:         mov.d   $f0, $f12
+
+  ; CMOV-32:    lw      $[[T0:[0-9]+]], 16($sp)
+  ; CMOV-32:    andi    $[[T1:[0-9]+]], $[[T0]], 1
+  ; CMOV-32:    movn.d  $f14, $f12, $[[T1]]
+  ; CMOV-32:    mov.d   $f0, $f14
+
+  ; SEL-32:     lw      $[[T0:[0-9]+]], 16($sp)
+  ; SEL-32:     mtc1    $[[T0]], $f0
+  ; SEL-32:     sel.d   $f0, $f14, $f12
+
+  ; M3:         andi    $[[T0:[0-9]+]], $6, 1
+  ; M3:         bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M3:         nop
+  ; M3:         mov.d   $f12, $f13
+  ; M3:         $[[BB0]]:
+  ; M3:         jr      $ra
+  ; M3:         mov.d   $f0, $f12
+
+  ; CMOV-64:    andi    $[[T0:[0-9]+]], $6, 1
+  ; CMOV-64:    movn.d  $f13, $f12, $[[T0]]
+  ; CMOV-64:    mov.d   $f0, $f13
+
+  ; SEL-64:     mtc1    $6, $f0
+  ; SEL-64:     sel.d   $f0, $f13, $f12
+  %r = select i1 %s, double %x, double %y
+  ret double %r
+}
+
+define float @tst_select_fcmp_olt_float(float %x, float %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_olt_float:
+
+  ; M2:         c.olt.s   $f12, $f14
+  ; M3:         c.olt.s   $f12, $f13
+  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.s     $f12, $f14
+  ; M3:         mov.s     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.s     $f0, $f12
+
+  ; CMOV-32:    c.olt.s   $f12, $f14
+  ; CMOV-32:    movt.s    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.s     $f0, $f14
+
+  ; SEL-32:     cmp.lt.s  $f0, $f12, $f14
+  ; SEL-32:     sel.s     $f0, $f14, $f12
+
+  ; CMOV-64:    c.olt.s   $f12, $f13
+  ; CMOV-64:    movt.s    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.s     $f0, $f13
+
+  ; SEL-64:     cmp.lt.s  $f0, $f12, $f13
+  ; SEL-64:     sel.s     $f0, $f13, $f12
+  %s = fcmp olt float %x, %y
+  %r = select i1 %s, float %x, float %y
+  ret float %r
+}
+
+define float @tst_select_fcmp_ole_float(float %x, float %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_ole_float:
+
+  ; M2:         c.ole.s   $f12, $f14
+  ; M3:         c.ole.s   $f12, $f13
+  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.s     $f12, $f14
+  ; M3:         mov.s     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.s     $f0, $f12
+
+  ; CMOV-32:    c.ole.s   $f12, $f14
+  ; CMOV-32:    movt.s    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.s     $f0, $f14
+
+  ; SEL-32:     cmp.le.s  $f0, $f12, $f14
+  ; SEL-32:     sel.s     $f0, $f14, $f12
+
+  ; CMOV-64:    c.ole.s   $f12, $f13
+  ; CMOV-64:    movt.s    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.s     $f0, $f13
+
+  ; SEL-64:     cmp.le.s  $f0, $f12, $f13
+  ; SEL-64:     sel.s     $f0, $f13, $f12
+  %s = fcmp ole float %x, %y
+  %r = select i1 %s, float %x, float %y
+  ret float %r
+}
+
+define float @tst_select_fcmp_ogt_float(float %x, float %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_ogt_float:
+
+  ; M2:         c.ule.s   $f12, $f14
+  ; M3:         c.ule.s   $f12, $f13
+  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.s     $f12, $f14
+  ; M3:         mov.s     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.s     $f0, $f12
+
+  ; CMOV-32:    c.ule.s   $f12, $f14
+  ; CMOV-32:    movf.s    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.s     $f0, $f14
+
+  ; SEL-32:     cmp.lt.s  $f0, $f14, $f12
+  ; SEL-32:     sel.s     $f0, $f14, $f12
+
+  ; CMOV-64:    c.ule.s   $f12, $f13
+  ; CMOV-64:    movf.s    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.s     $f0, $f13
+
+  ; SEL-64:     cmp.lt.s  $f0, $f13, $f12
+  ; SEL-64:     sel.s     $f0, $f13, $f12
+  %s = fcmp ogt float %x, %y
+  %r = select i1 %s, float %x, float %y
+  ret float %r
+}
+
+define float @tst_select_fcmp_oge_float(float %x, float %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_oge_float:
+
+  ; M2:         c.ult.s   $f12, $f14
+  ; M3:         c.ult.s   $f12, $f13
+  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.s     $f12, $f14
+  ; M3:         mov.s     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.s     $f0, $f12
+
+  ; CMOV-32:    c.ult.s   $f12, $f14
+  ; CMOV-32:    movf.s    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.s     $f0, $f14
+
+  ; SEL-32:     cmp.le.s  $f0, $f14, $f12
+  ; SEL-32:     sel.s     $f0, $f14, $f12
+
+  ; CMOV-64:    c.ult.s   $f12, $f13
+  ; CMOV-64:    movf.s    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.s     $f0, $f13
+
+  ; SEL-64:     cmp.le.s  $f0, $f13, $f12
+  ; SEL-64:     sel.s     $f0, $f13, $f12
+  %s = fcmp oge float %x, %y
+  %r = select i1 %s, float %x, float %y
+  ret float %r
+}
+
+define float @tst_select_fcmp_oeq_float(float %x, float %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_oeq_float:
+
+  ; M2:         c.eq.s    $f12, $f14
+  ; M3:         c.eq.s    $f12, $f13
+  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.s     $f12, $f14
+  ; M3:         mov.s     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.s     $f0, $f12
+
+  ; CMOV-32:    c.eq.s    $f12, $f14
+  ; CMOV-32:    movt.s    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.s     $f0, $f14
+
+  ; SEL-32:     cmp.eq.s  $f0, $f12, $f14
+  ; SEL-32:     sel.s     $f0, $f14, $f12
+
+  ; CMOV-64:    c.eq.s    $f12, $f13
+  ; CMOV-64:    movt.s    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.s     $f0, $f13
+
+  ; SEL-64:     cmp.eq.s  $f0, $f12, $f13
+  ; SEL-64:     sel.s     $f0, $f13, $f12
+  %s = fcmp oeq float %x, %y
+  %r = select i1 %s, float %x, float %y
+  ret float %r
+}
+
+define float @tst_select_fcmp_one_float(float %x, float %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_one_float:
+
+  ; M2:         c.ueq.s   $f12, $f14
+  ; M3:         c.ueq.s   $f12, $f13
+  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.s     $f12, $f14
+  ; M3:         mov.s     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.s     $f0, $f12
+
+  ; CMOV-32:    c.ueq.s   $f12, $f14
+  ; CMOV-32:    movf.s    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.s     $f0, $f14
+
+  ; SEL-32:     cmp.ueq.s $f0, $f12, $f14
+  ; SEL-32:     mfc1      $[[T0:[0-9]+]], $f0
+  ; SEL-32:     not       $[[T0]], $[[T0]]
+  ; SEL-32:     mtc1      $[[T0:[0-9]+]], $f0
+  ; SEL-32:     sel.s     $f0, $f14, $f12
+
+  ; CMOV-64:    c.ueq.s   $f12, $f13
+  ; CMOV-64:    movf.s    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.s     $f0, $f13
+
+  ; SEL-64:     cmp.ueq.s $f0, $f12, $f13
+  ; SEL-64:     mfc1      $[[T0:[0-9]+]], $f0
+  ; SEL-64:     not       $[[T0]], $[[T0]]
+  ; SEL-64:     mtc1      $[[T0:[0-9]+]], $f0
+  ; SEL-64:     sel.s     $f0, $f13, $f12
+
+  %s = fcmp one float %x, %y
+  %r = select i1 %s, float %x, float %y
+  ret float %r
+}
+
+define double @tst_select_fcmp_olt_double(double %x, double %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_olt_double:
+
+  ; M2:         c.olt.d   $f12, $f14
+  ; M3:         c.olt.d   $f12, $f13
+  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.d     $f12, $f14
+  ; M3:         mov.d     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.d     $f0, $f12
+
+  ; CMOV-32:    c.olt.d   $f12, $f14
+  ; CMOV-32:    movt.d    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.d     $f0, $f14
+
+  ; SEL-32:     cmp.lt.d  $f0, $f12, $f14
+  ; SEL-32:     sel.d     $f0, $f14, $f12
+
+  ; CMOV-64:    c.olt.d   $f12, $f13
+  ; CMOV-64:    movt.d    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.d     $f0, $f13
+
+  ; SEL-64:     cmp.lt.d  $f0, $f12, $f13
+  ; SEL-64:     sel.d     $f0, $f13, $f12
+  %s = fcmp olt double %x, %y
+  %r = select i1 %s, double %x, double %y
+  ret double %r
+}
+
+define double @tst_select_fcmp_ole_double(double %x, double %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_ole_double:
+
+  ; M2:         c.ole.d   $f12, $f14
+  ; M3:         c.ole.d   $f12, $f13
+  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.d     $f12, $f14
+  ; M3:         mov.d     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.d     $f0, $f12
+
+  ; CMOV-32:    c.ole.d   $f12, $f14
+  ; CMOV-32:    movt.d    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.d     $f0, $f14
+
+  ; SEL-32:     cmp.le.d  $f0, $f12, $f14
+  ; SEL-32:     sel.d     $f0, $f14, $f12
+
+  ; CMOV-64:    c.ole.d   $f12, $f13
+  ; CMOV-64:    movt.d    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.d     $f0, $f13
+
+  ; SEL-64:     cmp.le.d  $f0, $f12, $f13
+  ; SEL-64:     sel.d     $f0, $f13, $f12
+  %s = fcmp ole double %x, %y
+  %r = select i1 %s, double %x, double %y
+  ret double %r
+}
+
+define double @tst_select_fcmp_ogt_double(double %x, double %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_ogt_double:
+
+  ; M2:         c.ule.d   $f12, $f14
+  ; M3:         c.ule.d   $f12, $f13
+  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.d     $f12, $f14
+  ; M3:         mov.d     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.d     $f0, $f12
+
+  ; CMOV-32:    c.ule.d   $f12, $f14
+  ; CMOV-32:    movf.d    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.d     $f0, $f14
+
+  ; SEL-32:     cmp.lt.d  $f0, $f14, $f12
+  ; SEL-32:     sel.d     $f0, $f14, $f12
+
+  ; CMOV-64:    c.ule.d   $f12, $f13
+  ; CMOV-64:    movf.d    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.d     $f0, $f13
+
+  ; SEL-64:     cmp.lt.d  $f0, $f13, $f12
+  ; SEL-64:     sel.d     $f0, $f13, $f12
+  %s = fcmp ogt double %x, %y
+  %r = select i1 %s, double %x, double %y
+  ret double %r
+}
+
+define double @tst_select_fcmp_oge_double(double %x, double %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_oge_double:
+
+  ; M2:         c.ult.d   $f12, $f14
+  ; M3:         c.ult.d   $f12, $f13
+  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.d     $f12, $f14
+  ; M3:         mov.d     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.d     $f0, $f12
+
+  ; CMOV-32:    c.ult.d   $f12, $f14
+  ; CMOV-32:    movf.d    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.d     $f0, $f14
+
+  ; SEL-32:     cmp.le.d  $f0, $f14, $f12
+  ; SEL-32:     sel.d     $f0, $f14, $f12
+
+  ; CMOV-64:    c.ult.d   $f12, $f13
+  ; CMOV-64:    movf.d    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.d     $f0, $f13
+
+  ; SEL-64:     cmp.le.d  $f0, $f13, $f12
+  ; SEL-64:     sel.d     $f0, $f13, $f12
+  %s = fcmp oge double %x, %y
+  %r = select i1 %s, double %x, double %y
+  ret double %r
+}
+
+define double @tst_select_fcmp_oeq_double(double %x, double %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_oeq_double:
+
+  ; M2:         c.eq.d    $f12, $f14
+  ; M3:         c.eq.d    $f12, $f13
+  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.d     $f12, $f14
+  ; M3:         mov.d     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.d     $f0, $f12
+
+  ; CMOV-32:    c.eq.d    $f12, $f14
+  ; CMOV-32:    movt.d    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.d     $f0, $f14
+
+  ; SEL-32:     cmp.eq.d  $f0, $f12, $f14
+  ; SEL-32:     sel.d     $f0, $f14, $f12
+
+  ; CMOV-64:    c.eq.d    $f12, $f13
+  ; CMOV-64:    movt.d    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.d     $f0, $f13
+
+  ; SEL-64:     cmp.eq.d  $f0, $f12, $f13
+  ; SEL-64:     sel.d     $f0, $f13, $f12
+  %s = fcmp oeq double %x, %y
+  %r = select i1 %s, double %x, double %y
+  ret double %r
+}
+
+define double @tst_select_fcmp_one_double(double %x, double %y) {
+entry:
+  ; ALL-LABEL: tst_select_fcmp_one_double:
+
+  ; M2:         c.ueq.d   $f12, $f14
+  ; M3:         c.ueq.d   $f12, $f13
+  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2-M3:      nop
+  ; M2:         mov.d     $f12, $f14
+  ; M3:         mov.d     $f12, $f13
+  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      jr        $ra
+  ; M2-M3:      mov.d     $f0, $f12
+
+  ; CMOV-32:    c.ueq.d   $f12, $f14
+  ; CMOV-32:    movf.d    $f14, $f12, $fcc0
+  ; CMOV-32:    mov.d     $f0, $f14
+
+  ; SEL-32:     cmp.ueq.d $f0, $f12, $f14
+  ; SEL-32:     mfc1      $[[T0:[0-9]+]], $f0
+  ; SEL-32:     not       $[[T0]], $[[T0]]
+  ; SEL-32:     mtc1      $[[T0:[0-9]+]], $f0
+  ; SEL-32:     sel.d     $f0, $f14, $f12
+
+  ; CMOV-64:    c.ueq.d   $f12, $f13
+  ; CMOV-64:    movf.d    $f13, $f12, $fcc0
+  ; CMOV-64:    mov.d     $f0, $f13
+
+  ; SEL-64:     cmp.ueq.d $f0, $f12, $f13
+  ; SEL-64:     mfc1      $[[T0:[0-9]+]], $f0
+  ; SEL-64:     not       $[[T0]], $[[T0]]
+  ; SEL-64:     mtc1      $[[T0:[0-9]+]], $f0
+  ; SEL-64:     sel.d     $f0, $f13, $f12
+  %s = fcmp one double %x, %y
+  %r = select i1 %s, double %x, double %y
+  ret double %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/shl.ll b/test/CodeGen/Mips/llvm-ir/shl.ll
new file mode 100644
index 0000000..6640320
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -0,0 +1,200 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=M2 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 -check-prefix=NOT-R2-R6 \
+; RUN:    -check-prefix=32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R1-R5 -check-prefix=R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R1-R5 -check-prefix=R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R1-R5 -check-prefix=R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32 \
+; RUN:    -check-prefix=32R6 -check-prefix=R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=M3 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6 -check-prefix R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6 -check-prefix R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=GP64-NOT-R6 -check-prefix R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64 \
+; RUN:    -check-prefix=64R6 -check-prefix=R2-R6
+
+define signext i1 @shl_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: shl_i1:
+
+  ; ALL:        move    $2, $4
+
+  %r = shl i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @shl_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: shl_i8:
+
+  ; NOT-R2-R6:  andi    $[[T0:[0-9]+]], $5, 255
+  ; NOT-R2-R6:  sllv    $[[T1:[0-9]+]], $4, $[[T0]]
+  ; NOT-R2-R6:  sll     $[[T2:[0-9]+]], $[[T1]], 24
+  ; NOT-R2-R6:  sra     $2, $[[T2]], 24
+
+  ; R2-R6:      andi    $[[T0:[0-9]+]], $5, 255
+  ; R2-R6:      sllv    $[[T1:[0-9]+]], $4, $[[T0]]
+  ; R2-R6:      seb     $2, $[[T1]]
+
+  %r = shl i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @shl_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: shl_i16:
+
+  ; NOT-R2-R6:  andi    $[[T0:[0-9]+]], $5, 65535
+  ; NOT-R2-R6:  sllv    $[[T1:[0-9]+]], $4, $[[T0]]
+  ; NOT-R2-R6:  sll     $[[T2:[0-9]+]], $[[T1]], 16
+  ; NOT-R2-R6:  sra     $2, $[[T2]], 16
+
+  ; R2-R6:      andi    $[[T0:[0-9]+]], $5, 65535
+  ; R2-R6:      sllv    $[[T1:[0-9]+]], $4, $[[T0]]
+  ; R2-R6:      seh     $2, $[[T1]]
+
+  %r = shl i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @shl_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: shl_i32:
+
+  ; ALL:        sllv    $2, $4, $5
+
+  %r = shl i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @shl_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: shl_i64:
+
+  ; M2:         sllv      $[[T0:[0-9]+]], $5, $7
+  ; M2:         andi      $[[T1:[0-9]+]], $7, 32
+  ; M2:         bnez      $[[T1]], $[[BB0:BB[0-9_]+]]
+  ; M2:         move      $2, $[[T0]]
+  ; M2:         sllv      $[[T2:[0-9]+]], $4, $7
+  ; M2:         not       $[[T3:[0-9]+]], $7
+  ; M2:         srl       $[[T4:[0-9]+]], $5, 1
+  ; M2:         srlv      $[[T5:[0-9]+]], $[[T4]], $[[T3]]
+  ; M2:         or        $2, $[[T2]], $[[T3]]
+  ; M2:         $[[BB0]]:
+  ; M2:         bnez      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         addiu     $3, $zero, 0
+  ; M2:         move      $3, $[[T0]]
+  ; M2:         $[[BB1]]:
+  ; M2:         jr        $ra
+  ; M2:         nop
+
+  ; 32R1-R5:    sllv      $[[T0:[0-9]+]], $4, $7
+  ; 32R1-R5:    not       $[[T1:[0-9]+]], $7
+  ; 32R1-R5:    srl       $[[T2:[0-9]+]], $5, 1
+  ; 32R1-R5:    srlv      $[[T3:[0-9]+]], $[[T2]], $[[T1]]
+  ; 32R1-R5:    or        $2, $[[T0]], $[[T3]]
+  ; 32R1-R5:    sllv      $[[T4:[0-9]+]], $5, $7
+  ; 32R1-R5:    andi      $[[T5:[0-9]+]], $7, 32
+  ; 32R1-R5:    movn      $2, $[[T4]], $[[T5]]
+  ; 32R1-R5:    jr        $ra
+  ; 32R1-R5:    movn      $3, $zero, $[[T5]]
+
+  ; 32R6:       sllv      $[[T0:[0-9]+]], $4, $7
+  ; 32R6:       not       $[[T1:[0-9]+]], $7
+  ; 32R6:       srl       $[[T2:[0-9]+]], $5, 1
+  ; 32R6:       srlv      $[[T3:[0-9]+]], $[[T2]], $[[T1]]
+  ; 32R6:       or        $[[T4:[0-9]+]], $[[T0]], $[[T3]]
+  ; 32R6:       andi      $[[T5:[0-9]+]], $7, 32
+  ; 32R6:       seleqz    $[[T6:[0-9]+]], $[[T4]], $[[T2]]
+  ; 32R6:       sllv      $[[T7:[0-9]+]], $5, $7
+  ; 32R6:       selnez    $[[T8:[0-9]+]], $[[T7]], $[[T5]]
+  ; 32R6:       or        $2, $[[T8]], $[[T6]]
+  ; 32R6:       jr        $ra
+  ; 32R6:       seleqz    $3, $[[T7]], $[[T5]]
+
+  ; GP64:       sll       $[[T0:[0-9]+]], $5, 0
+  ; GP64:       dsllv     $2, $4, $1
+
+  %r = shl i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: shl_i128:
+
+  ; GP32:           lw        $25, %call16(__ashlti3)($gp)
+
+  ; M3:             sll       $[[T0:[0-9]+]], $7, 0
+  ; M3:             dsllv     $[[T1:[0-9]+]], $5, $[[T0]]
+  ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 32
+  ; M3:             bnez      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
+  ; M3:             move      $2, $[[T1]]
+  ; M3:             dsllv     $[[T4:[0-9]+]], $4, $[[T0]]
+  ; M3:             dsrl      $[[T5:[0-9]+]], $5, 1
+  ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
+  ; M3:             dsrlv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
+  ; M3:             or        $2, $[[T4]], $[[T7]]
+  ; M3:             $[[BB0]]:
+  ; M3:             bnez      $[[T3]], $[[BB1:BB[0-9_]+]]
+  ; M3:             daddiu    $3, $zero, 0
+  ; M3:             move      $3, $[[T1]]
+  ; M3:             $[[BB1]]:
+  ; M3:             jr        $ra
+  ; M3:             nop
+
+  ; GP64-NOT-R6:    sll       $[[T0:[0-9]+]], $7, 0
+  ; GP64-NOT-R6:    dsllv     $[[T1:[0-9]+]], $4, $[[T0]]
+  ; GP64-NOT-R6:    dsrl      $[[T2:[0-9]+]], $5, 1
+  ; GP64-NOT-R6:    not       $[[T3:[0-9]+]], $[[T0]]
+  ; GP64-NOT-R6:    dsrlv     $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+  ; GP64-NOT-R6:    or        $2, $[[T1]], $[[T4]]
+  ; GP64-NOT-R6:    dsllv     $3, $5, $[[T0]]
+  ; GP64-NOT-R6:    andi      $[[T5:[0-9]+]], $[[T0]], 32
+  ; GP64-NOT-R6:    movn      $2, $3, $[[T5]]
+  ; GP64-NOT-R6:    jr        $ra
+  ; GP64-NOT-R6:    movn      $3, $zero, $1
+
+  ; 64R6:           sll       $[[T0:[0-9]+]], $7, 0
+  ; 64R6:           dsllv     $[[T1:[0-9]+]], $4, $[[T0]]
+  ; 64R6:           dsrl      $[[T2:[0-9]+]], $5, 1
+  ; 64R6:           not       $[[T3:[0-9]+]], $[[T0]]
+  ; 64R6:           dsrlv     $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+  ; 64R6:           or        $[[T5:[0-9]+]], $[[T1]], $[[T4]]
+  ; 64R6:           andi      $[[T6:[0-9]+]], $[[T0]], 32
+  ; 64R6:           sll       $[[T7:[0-9]+]], $[[T6]], 0
+  ; 64R6:           seleqz    $[[T8:[0-9]+]], $[[T5]], $[[T7]]
+  ; 64R6:           dsllv     $[[T9:[0-9]+]], $5, $[[T0]]
+  ; 64R6:           selnez    $[[T10:[0-9]+]], $[[T9]], $[[T7]]
+  ; 64R6:           or        $2, $[[T10]], $[[T8]]
+  ; 64R6:           jr        $ra
+  ; 64R6:           seleqz    $3, $[[T0]], $[[T7]]
+
+  %r = shl i128 %a, %b
+  ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/srem.ll b/test/CodeGen/Mips/llvm-ir/srem.ll
new file mode 100644
index 0000000..ceb53ee
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/srem.ll
@@ -0,0 +1,139 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:  -check-prefix=GP32 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:  -check-prefix=GP32 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=GP32 \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s -check-prefix=GP32 \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s -check-prefix=GP32 \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:   -check-prefix=GP32 -check-prefix=R6 -check-prefix=R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:  -check-prefix=64R6 -check-prefix=R6 -check-prefix=R2-R6
+
+define signext i1 @srem_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: srem_i1:
+
+  ; NOT-R6:       div     $zero, $4, $5
+  ; NOT-R6:       teq     $5, $zero, 7
+  ; NOT-R6:       mfhi    $[[T0:[0-9]+]]
+  ; NOT-R6:       sll     $[[T1:[0-9]+]], $[[T0]], 31
+  ; NOT-R6:       sra     $2, $[[T1]], 31
+
+  ; R6:           mod     $[[T0:[0-9]+]], $4, $5
+  ; R6:           teq     $5, $zero, 7
+  ; R6:           sll     $[[T3:[0-9]+]], $[[T0]], 31
+  ; R6:           sra     $2, $[[T3]], 31
+
+  %r = srem i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @srem_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: srem_i8:
+
+  ; NOT-R2-R6:    div     $zero, $4, $5
+  ; NOT-R2-R6:    teq     $5, $zero, 7
+  ; NOT-R2-R6:    mfhi    $[[T0:[0-9]+]]
+  ; NOT-R2-R6:    sll     $[[T1:[0-9]+]], $[[T0]], 24
+  ; NOT-R2-R6:    sra     $2, $[[T1]], 24
+
+  ; R2-R5:        div     $zero, $4, $5
+  ; R2-R5:        teq     $5, $zero, 7
+  ; R2-R5:        mfhi    $[[T0:[0-9]+]]
+  ; R2-R5:        seb     $2, $[[T0]]
+
+  ; R6:           mod     $[[T0:[0-9]+]], $4, $5
+  ; R6:           teq     $5, $zero, 7
+  ; R6:           seb     $2, $[[T0]]
+
+  %r = srem i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @srem_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: srem_i16:
+
+  ; NOT-R2-R6:    div     $zero, $4, $5
+  ; NOT-R2-R6:    teq     $5, $zero, 7
+  ; NOT-R2-R6:    mfhi    $[[T0:[0-9]+]]
+  ; NOT-R2-R6:    sll     $[[T1:[0-9]+]], $[[T0]], 16
+  ; NOT-R2-R6:    sra     $2, $[[T1]], 16
+
+  ; R2-R5:        div     $zero, $4, $5
+  ; R2-R5:        teq     $5, $zero, 7
+  ; R2-R5:        mfhi    $[[T0:[0-9]+]]
+  ; R2-R5:        seh     $2, $[[T1]]
+
+  ; R6:           mod     $[[T0:[0-9]+]], $4, $5
+  ; R6:           teq     $5, $zero, 7
+  ; R6:           seh     $2, $[[T0]]
+
+  %r = srem i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @srem_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: srem_i32:
+
+  ; NOT-R6:       div     $zero, $4, $5
+  ; NOT-R6:       teq     $5, $zero, 7
+  ; NOT-R6:       mfhi    $2
+
+  ; R6:           mod     $2, $4, $5
+  ; R6:           teq     $5, $zero, 7
+
+  %r = srem i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @srem_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: srem_i64:
+
+  ; GP32:         lw      $25, %call16(__moddi3)($gp)
+
+  ; GP64-NOT-R6:  ddiv    $zero, $4, $5
+  ; GP64-NOT-R6:  teq     $5, $zero, 7
+  ; GP64-NOT-R6:  mfhi    $2
+
+  ; 64R6:         dmod    $2, $4, $5
+  ; 64R6:         teq     $5, $zero, 7
+
+  %r = srem i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @srem_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: srem_i128:
+
+  ; GP32:         lw      $25, %call16(__modti3)($gp)
+
+  ; GP64-NOT-R6:  ld      $25, %call16(__modti3)($gp)
+  ; 64-R6:        ld      $25, %call16(__modti3)($gp)
+
+  %r = srem i128 %a, %b
+  ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/sub.ll b/test/CodeGen/Mips/llvm-ir/sub.ll
new file mode 100644
index 0000000..1649758
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/sub.ll
@@ -0,0 +1,122 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+
+define signext i1 @sub_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: sub_i1:
+
+  ; ALL:            subu    $[[T0:[0-9]+]], $4, $5
+  ; ALL:            sll     $[[T0]], $[[T0]], 31
+  ; ALL:            sra     $2, $[[T0]], 31
+
+  %r = sub i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @sub_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: sub_i8:
+
+  ; NOT-R2-R6:      subu    $[[T0:[0-9]+]], $4, $5
+  ; NOT-R2-R6:      sll     $[[T0]], $[[T0]], 24
+  ; NOT-R2-R6:      sra     $2, $[[T0]], 24
+
+  ; R2-R6:          subu    $[[T0:[0-9]+]], $4, $5
+  ; R2-R6:          seb     $2, $[[T0:[0-9]+]]
+
+  %r = sub i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @sub_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: sub_i16:
+
+  ; NOT-R2-R6:      subu    $[[T0:[0-9]+]], $4, $5
+  ; NOT-R2-R6:      sll     $[[T0]], $[[T0]], 16
+  ; NOT-R2-R6:      sra     $2, $[[T0]], 16
+
+  ; R2-R6:          subu    $[[T0:[0-9]+]], $4, $5
+  ; R2-R6:          seh     $2, $[[T0:[0-9]+]]
+
+  %r = sub i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @sub_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: sub_i32:
+
+  ; ALL:            subu    $2, $4, $5
+
+  %r = sub i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @sub_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: sub_i64:
+
+  ; GP32:           subu    $3, $5, $7
+  ; GP32:           sltu    $[[T0:[0-9]+]], $5, $7
+  ; GP32:           addu    $[[T1:[0-9]+]], $[[T0]], $6
+  ; GP32:           subu    $2, $4, $[[T1]]
+
+  ; GP64:           dsubu   $2, $4, $5
+
+  %r = sub i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @sub_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: sub_i128:
+
+  ; GP32:       lw        $[[T0:[0-9]+]], 20($sp)
+  ; GP32:       sltu      $[[T1:[0-9]+]], $5, $[[T0]]
+  ; GP32:       lw        $[[T2:[0-9]+]], 16($sp)
+  ; GP32:       addu      $[[T3:[0-9]+]], $[[T1]], $[[T2]]
+  ; GP32:       lw        $[[T4:[0-9]+]], 24($sp)
+  ; GP32:       lw        $[[T5:[0-9]+]], 28($sp)
+  ; GP32:       subu      $[[T6:[0-9]+]], $7, $[[T5]]
+  ; GP32:       subu      $2, $4, $[[T3]]
+  ; GP32:       sltu      $[[T8:[0-9]+]], $6, $[[T4]]
+  ; GP32:       addu      $[[T9:[0-9]+]], $[[T8]], $[[T0]]
+  ; GP32:       subu      $3, $5, $[[T9]]
+  ; GP32:       sltu      $[[T10:[0-9]+]], $7, $[[T5]]
+  ; GP32:       addu      $[[T11:[0-9]+]], $[[T10]], $[[T4]]
+  ; GP32:       subu      $4, $6, $[[T11]]
+  ; GP32:       move      $5, $[[T6]]
+
+  ; GP64:       dsubu     $3, $5, $7
+  ; GP64:       sltu      $[[T0:[0-9]+]], $5, $7
+  ; GP64:       daddu     $[[T1:[0-9]+]], $[[T0]], $6
+  ; GP64:       dsubu     $2, $4, $[[T1]]
+
+  %r = sub i128 %a, %b
+  ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/udiv.ll b/test/CodeGen/Mips/llvm-ir/udiv.ll
new file mode 100644
index 0000000..a7cafe5
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/udiv.ll
@@ -0,0 +1,116 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=R6 -check-prefix=GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefix=R6 -check-prefix=64R6
+
+define zeroext i1 @udiv_i1(i1 zeroext %a, i1 zeroext %b) {
+entry:
+; ALL-LABEL: udiv_i1:
+
+  ; NOT-R6:       divu    $zero, $4, $5
+  ; NOT-R6:       teq     $5, $zero, 7
+  ; NOT-R6:       mflo    $2
+
+  ; R6:           divu    $2, $4, $5
+  ; R6:           teq     $5, $zero, 7
+
+  %r = udiv i1 %a, %b
+  ret i1 %r
+}
+
+define zeroext i8 @udiv_i8(i8 zeroext %a, i8 zeroext %b) {
+entry:
+; ALL-LABEL: udiv_i8:
+
+  ; NOT-R6:       divu    $zero, $4, $5
+  ; NOT-R6:       teq     $5, $zero, 7
+  ; NOT-R6:       mflo    $2
+
+  ; R6:           divu    $2, $4, $5
+  ; R6:           teq     $5, $zero, 7
+
+  %r = udiv i8 %a, %b
+  ret i8 %r
+}
+
+define zeroext i16 @udiv_i16(i16 zeroext %a, i16 zeroext %b) {
+entry:
+; ALL-LABEL: udiv_i16:
+
+  ; NOT-R6:       divu    $zero, $4, $5
+  ; NOT-R6:       teq     $5, $zero, 7
+  ; NOT-R6:       mflo    $2
+
+  ; R6:           divu    $2, $4, $5
+  ; R6:           teq     $5, $zero, 7
+
+  %r = udiv i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @udiv_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: udiv_i32:
+
+  ; NOT-R6:       divu    $zero, $4, $5
+  ; NOT-R6:       teq     $5, $zero, 7
+  ; NOT-R6:       mflo    $2
+
+  ; R6:           divu    $2, $4, $5
+  ; R6:           teq     $5, $zero, 7
+
+  %r = udiv i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: udiv_i64:
+
+  ; GP32:         lw      $25, %call16(__udivdi3)($gp)
+
+  ; GP64-NOT-R6:  ddivu   $zero, $4, $5
+  ; GP64-NOT-R6:  teq     $5, $zero, 7
+  ; GP64-NOT-R6:  mflo    $2
+
+  ; 64R6:         ddivu   $2, $4, $5
+  ; 64R6:         teq     $5, $zero, 7
+
+  %r = udiv i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @udiv_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: udiv_i128:
+
+  ; GP32:         lw      $25, %call16(__udivti3)($gp)
+
+  ; GP64-NOT-R6:  ld      $25, %call16(__udivti3)($gp)
+  ; 64-R6:        ld      $25, %call16(__udivti3)($gp)
+
+  %r = udiv i128 %a, %b
+  ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/urem.ll b/test/CodeGen/Mips/llvm-ir/urem.ll
new file mode 100644
index 0000000..d5a231c
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/urem.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:  -check-prefix=GP32 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:  -check-prefix=GP32 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=GP32 \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s -check-prefix=GP32 \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s -check-prefix=GP32 \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:   -check-prefix=GP32 -check-prefix=R6 -check-prefix=R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:  -check-prefix=R2-R5 -check-prefix=R2-R6 \
+; RUN:  -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:  -check-prefix=64R6 -check-prefix=R6 -check-prefix=R2-R6
+
+define signext i1 @urem_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: urem_i1:
+
+  ; NOT-R6:       andi    $[[T0:[0-9]+]], $5, 1
+  ; NOT-R6:       andi    $[[T1:[0-9]+]], $4, 1
+  ; NOT-R6:       divu    $zero, $[[T1]], $[[T0]]
+  ; NOT-R6:       teq     $[[T0]], $zero, 7
+  ; NOT-R6:       mfhi    $[[T2:[0-9]+]]
+  ; NOT-R6:       sll     $[[T3:[0-9]+]], $[[T2]], 31
+  ; NOT-R6:       sra     $2, $[[T3]], 31
+
+  ; R6:           andi    $[[T0:[0-9]+]], $5, 1
+  ; R6:           andi    $[[T1:[0-9]+]], $4, 1
+  ; R6:           modu    $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+  ; R6:           teq     $[[T0]], $zero, 7
+  ; R6:           sll     $[[T3:[0-9]+]], $[[T2]], 31
+  ; R6:           sra     $2, $[[T3]], 31
+
+  %r = urem i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @urem_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: urem_i8:
+
+  ; NOT-R2-R6:    andi    $[[T0:[0-9]+]], $5, 255
+  ; NOT-R2-R6:    andi    $[[T1:[0-9]+]], $4, 255
+  ; NOT-R2-R6:    divu    $zero, $[[T1]], $[[T0]]
+  ; NOT-R2-R6:    teq     $[[T0]], $zero, 7
+  ; NOT-R2-R6:    mfhi    $[[T2:[0-9]+]]
+  ; NOT-R2-R6:    sll     $[[T3:[0-9]+]], $[[T2]], 24
+  ; NOT-R2-R6:    sra     $2, $[[T3]], 24
+
+  ; R2-R5:        andi    $[[T0:[0-9]+]], $5, 255
+  ; R2-R5:        andi    $[[T1:[0-9]+]], $4, 255
+  ; R2-R5:        divu    $zero, $[[T1]], $[[T0]]
+  ; R2-R5:        teq     $[[T0]], $zero, 7
+  ; R2-R5:        mfhi    $[[T2:[0-9]+]]
+  ; R2-R5:        seb     $2, $[[T2]]
+
+  ; R6:           andi    $[[T0:[0-9]+]], $5, 255
+  ; R6:           andi    $[[T1:[0-9]+]], $4, 255
+  ; R6:           modu    $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+  ; R6:           teq     $[[T0]], $zero, 7
+  ; R6:           seb     $2, $[[T2]]
+
+  %r = urem i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @urem_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: urem_i16:
+
+  ; NOT-R2-R6:    andi    $[[T0:[0-9]+]], $5, 65535
+  ; NOT-R2-R6:    andi    $[[T1:[0-9]+]], $4, 65535
+  ; NOT-R2-R6:    divu    $zero, $[[T1]], $[[T0]]
+  ; NOT-R2-R6:    teq     $[[T0]], $zero, 7
+  ; NOT-R2-R6:    mfhi    $[[T2:[0-9]+]]
+  ; NOT-R2-R6:    sll     $[[T3:[0-9]+]], $[[T2]], 16
+  ; NOT-R2-R6:    sra     $2, $[[T3]], 16
+
+  ; R2-R5:        andi    $[[T0:[0-9]+]], $5, 65535
+  ; R2-R5:        andi    $[[T1:[0-9]+]], $4, 65535
+  ; R2-R5:        divu    $zero, $[[T1]], $[[T0]]
+  ; R2-R5:        teq     $[[T0]], $zero, 7
+  ; R2-R5:        mfhi    $[[T3:[0-9]+]]
+  ; R2-R5:        seh     $2, $[[T2]]
+
+  ; R6:           andi    $[[T0:[0-9]+]], $5, 65535
+  ; R6:           andi    $[[T1:[0-9]+]], $4, 65535
+  ; R6:           modu    $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+  ; R6:           teq     $[[T0]], $zero, 7
+  ; R6:           seh     $2, $[[T2]]
+
+  %r = urem i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @urem_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: urem_i32:
+
+  ; NOT-R6:       divu    $zero, $4, $5
+  ; NOT-R6:       teq     $5, $zero, 7
+  ; NOT-R6:       mfhi    $2
+
+  ; R6:           modu    $2, $4, $5
+  ; R6:           teq     $5, $zero, 7
+
+  %r = urem i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @urem_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: urem_i64:
+
+  ; GP32:         lw      $25, %call16(__umoddi3)($gp)
+
+  ; GP64-NOT-R6:  ddivu   $zero, $4, $5
+  ; GP64-NOT-R6:  teq     $5, $zero, 7
+  ; GP64-NOT-R6:  mfhi    $2
+
+  ; 64R6:         dmodu   $2, $4, $5
+  ; 64R6:         teq     $5, $zero, 7
+
+  %r = urem i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @urem_i128(i128 signext %a, i128 signext %b) {
+entry:
+  ; ALL-LABEL: urem_i128:
+
+    ; GP32:         lw      $25, %call16(__umodti3)($gp)
+
+    ; GP64-NOT-R6:  ld      $25, %call16(__umodti3)($gp)
+    ; 64-R6:        ld      $25, %call16(__umodti3)($gp)
+
+    %r = urem i128 %a, %b
+    ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/xor.ll b/test/CodeGen/Mips/llvm-ir/xor.ll
new file mode 100644
index 0000000..89af9998
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/xor.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=GP64
+
+define signext i1 @xor_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: xor_i1:
+
+  ; ALL:          xor     $2, $4, $5
+
+  %r = xor i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @xor_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: xor_i8:
+
+  ; ALL:          xor     $2, $4, $5
+
+  %r = xor i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @xor_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: xor_i16:
+
+  ; ALL:          xor     $2, $4, $5
+
+  %r = xor i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @xor_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: xor_i32:
+
+  ; ALL:          xor     $2, $4, $5
+
+  %r = xor i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @xor_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: xor_i64:
+
+  ; GP32:         xor     $2, $4, $6
+  ; GP32:         xor     $3, $5, $7
+
+  ; GP64:         xor     $2, $4, $5
+
+  %r = xor i64 %a, %b
+  ret i64 %r
+}
+
+define signext i128 @xor_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: xor_i128:
+
+  ; GP32:         lw      $[[T0:[0-9]+]], 24($sp)
+  ; GP32:         lw      $[[T1:[0-9]+]], 20($sp)
+  ; GP32:         lw      $[[T2:[0-9]+]], 16($sp)
+  ; GP32:         xor     $2, $4, $[[T2]]
+  ; GP32:         xor     $3, $5, $[[T1]]
+  ; GP32:         xor     $4, $6, $[[T0]]
+  ; GP32:         lw      $[[T3:[0-9]+]], 28($sp)
+  ; GP32:         xor     $5, $7, $[[T3]]
+
+  ; GP64:         xor     $2, $4, $6
+  ; GP64:         xor     $3, $5, $7
+
+  %r = xor i128 %a, %b
+  ret i128 %r
+}
diff --git a/test/CodeGen/Mips/load-store-left-right.ll b/test/CodeGen/Mips/load-store-left-right.ll
index f6d0e8d..b8e6e83 100644
--- a/test/CodeGen/Mips/load-store-left-right.ll
+++ b/test/CodeGen/Mips/load-store-left-right.ll
@@ -4,14 +4,14 @@
 ; RUN: llc -march=mips     -mcpu=mips32r2            < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EB %s
 ; RUN: llc -march=mipsel   -mcpu=mips32r6            < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32R6 -check-prefix=MIPS32R6-EL %s
 ; RUN: llc -march=mips     -mcpu=mips32r6            < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32R6 -check-prefix=MIPS32R6-EB %s
-; RUN: llc -march=mips64el -mcpu=mips4    -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
-; RUN: llc -march=mips64   -mcpu=mips4    -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
-; RUN: llc -march=mips64el -mcpu=mips64   -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
-; RUN: llc -march=mips64   -mcpu=mips64   -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
-; RUN: llc -march=mips64   -mcpu=mips64r2 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
-; RUN: llc -march=mips64el -mcpu=mips64r6 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64R6 -check-prefix=MIPS64R6-EL %s
-; RUN: llc -march=mips64   -mcpu=mips64r6 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64R6 -check-prefix=MIPS64R6-EB %s
+; RUN: llc -march=mips64el -mcpu=mips4    -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
+; RUN: llc -march=mips64   -mcpu=mips4    -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
+; RUN: llc -march=mips64el -mcpu=mips64   -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
+; RUN: llc -march=mips64   -mcpu=mips64   -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
+; RUN: llc -march=mips64   -mcpu=mips64r2 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
+; RUN: llc -march=mips64el -mcpu=mips64r6 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64R6 -check-prefix=MIPS64R6-EL %s
+; RUN: llc -march=mips64   -mcpu=mips64r6 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64R6 -check-prefix=MIPS64R6-EB %s
 
 %struct.SLL = type { i64 }
 %struct.SI = type { i32 }
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index b9b52be..9f5b741 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=mipsel < %s | FileCheck %s
 ; RUN: llc -march=mipsel -force-mips-long-branch -O3 < %s \
 ; RUN:   | FileCheck %s -check-prefix=O32
-; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 -force-mips-long-branch -O3 \
+; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 -force-mips-long-branch -O3 \
 ; RUN:   < %s | FileCheck %s -check-prefix=N64
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch -O3 \
+; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 -force-mips-long-branch -O3 \
 ; RUN:   < %s | FileCheck %s -check-prefix=N64
 ; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=micromips \
 ; RUN:   -force-mips-long-branch -O3 < %s | FileCheck %s -check-prefix=MICROMIPS
@@ -123,11 +123,10 @@ end:
 
 ; MICROMIPS:   $[[BB0]]:
 ; MICROMIPS:        lw      $[[R1:[0-9]+]], %got(x)($[[GP]])
-; MICROMIPS:        addiu   $[[R2:[0-9]+]], $zero, 1
-; MICROMIPS:        sw      $[[R2]], 0($[[R1]])
+; MICROMIPS:        li16    $[[R2:[0-9]+]], 1
+; MICROMIPS:        sw16    $[[R2]], 0($[[R1]])
 ; MICROMIPS:   $[[BB2]]:
-; MICROMIPS:        jr      $ra
-; MICROMIPS:        nop
+; MICROMIPS:        jrc      $ra
 
 
 ; Check the NaCl version.  Check that sp change is not in the branch delay slot
diff --git a/test/CodeGen/Mips/mbrsize4a.ll b/test/CodeGen/Mips/mbrsize4a.ll
index c802991..15e1f47 100644
--- a/test/CodeGen/Mips/mbrsize4a.ll
+++ b/test/CodeGen/Mips/mbrsize4a.ll
@@ -34,4 +34,4 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind }
 
-!1 = metadata !{i32 68}
+!1 = !{i32 68}
diff --git a/test/CodeGen/Mips/micromips-and16.ll b/test/CodeGen/Mips/micromips-and16.ll
new file mode 100644
index 0000000..4eacf18
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-and16.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  store i32 0, i32* %retval
+  %0 = load i32* %b, align 4
+  %1 = load i32* %c, align 4
+  %and = and i32 %0, %1
+  store i32 %and, i32* %a, align 4
+  ret i32 0
+}
+
+; CHECK: and16
diff --git a/test/CodeGen/Mips/micromips-atomic.ll b/test/CodeGen/Mips/micromips-atomic.ll
index a50e0b7..82eee4b 100644
--- a/test/CodeGen/Mips/micromips-atomic.ll
+++ b/test/CodeGen/Mips/micromips-atomic.ll
@@ -14,5 +14,5 @@ entry:
 ; CHECK:   ll      $[[R1:[0-9]+]], 0($[[R0]])
 ; CHECK:   addu    $[[R2:[0-9]+]], $[[R1]], $4
 ; CHECK:   sc      $[[R2]], 0($[[R0]])
-; CHECK:   beqz    $[[R2]], $[[BB0]]
+; CHECK:   beqzc   $[[R2]], $[[BB0]]
 }
diff --git a/test/CodeGen/Mips/micromips-atomic1.ll b/test/CodeGen/Mips/micromips-atomic1.ll
new file mode 100644
index 0000000..37c3d76
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-atomic1.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=mipsel -filetype=obj --disable-machine-licm -mattr=micromips < %s -o - \
+; RUN:   | llvm-objdump -no-show-raw-insn -arch mipsel -mcpu=mips32r2 -mattr=micromips -d - \
+; RUN:   | FileCheck %s -check-prefix=MICROMIPS
+
+; Use llvm-objdump to check wheter the encodings of microMIPS atomic instructions are correct.
+; While emitting assembly files directly when in microMIPS mode, it is possible to emit a mips32r2
+; instruction instead of microMIPS instruction, and since many mips32r2 and microMIPS
+; instructions have identical assembly formats, invalid instruction cannot be detected.
+
+@y = common global i8 0, align 1
+
+define signext i8 @AtomicLoadAdd8(i8 signext %incr) nounwind {
+entry:
+  %0 = atomicrmw add i8* @y, i8 %incr monotonic
+  ret i8 %0
+
+; MICROMIPS:     ll      ${{[0-9]+}}, 0(${{[0-9]+}})
+; MICROMIPS:     sc      ${{[0-9]+}}, 0(${{[0-9]+}})
+}
+
+define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwind {
+entry:
+  %pair0 = cmpxchg i8* @y, i8 %oldval, i8 %newval monotonic monotonic
+  %0 = extractvalue { i8, i1 } %pair0, 0
+  ret i8 %0
+
+; MICROMIPS:     ll      ${{[0-9]+}}, 0(${{[0-9]+}})
+; MICROMIPS:     sc      ${{[0-9]+}}, 0(${{[0-9]+}})
+}
diff --git a/test/CodeGen/Mips/micromips-compact-branches.ll b/test/CodeGen/Mips/micromips-compact-branches.ll
new file mode 100644
index 0000000..670f9a0
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-compact-branches.ll
@@ -0,0 +1,19 @@
+; RUN: llc %s -march=mipsel -mattr=micromips -filetype=asm -O3 \
+; RUN: -disable-mips-delay-filler -relocation-model=pic -o - | FileCheck %s
+
+define void @main() nounwind uwtable {
+entry:
+  %x = alloca i32, align 4
+  %0 = load i32* %x, align 4
+  %cmp = icmp eq i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 10, i32* %x, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK: bnezc
diff --git a/test/CodeGen/Mips/micromips-compact-jump.ll b/test/CodeGen/Mips/micromips-compact-jump.ll
new file mode 100644
index 0000000..70cff84
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-compact-jump.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -disable-mips-delay-filler -O3 < %s | FileCheck %s
+
+define i32 @foo(i32 signext %a) #0 {
+entry:
+  ret i32 0
+}
+
+declare i32 @bar(i32 signext) #1
+
+; CHECK:      jrc
diff --git a/test/CodeGen/Mips/micromips-delay-slot-jr.ll b/test/CodeGen/Mips/micromips-delay-slot-jr.ll
new file mode 100644
index 0000000..09a98c2
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-delay-slot-jr.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=static -O2 < %s | FileCheck %s
+
+@main.L = internal unnamed_addr constant [3 x i8*] [i8* blockaddress(@main, %L1), i8* blockaddress(@main, %L2), i8* null], align 4
+@str = private unnamed_addr constant [2 x i8] c"A\00"
+@str2 = private unnamed_addr constant [2 x i8] c"B\00"
+
+define i32 @main() #0 {
+entry:
+  br label %L1
+
+L1:                                               ; preds = %entry, %L1
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %L1 ]
+  %puts = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str, i32 0, i32 0))
+  %inc = add i32 %i.0, 1
+  %arrayidx = getelementptr inbounds [3 x i8*]* @main.L, i32 0, i32 %i.0
+  %0 = load i8** %arrayidx, align 4, !tbaa !1
+  indirectbr i8* %0, [label %L1, label %L2]
+
+L2:                                               ; preds = %L1
+  %puts2 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str2, i32 0, i32 0))
+  ret i32 0
+}
+
+declare i32 @puts(i8* nocapture readonly) #1
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"any pointer", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+
+; CHECK:      jrc
+
+%struct.foostruct = type { [3 x float] }
+%struct.barstruct = type { %struct.foostruct, float }
+@bar_ary = common global [4 x %struct.barstruct] zeroinitializer, align 4
+define float* @spooky(i32 signext %i) #0 {
+
+  %safe = getelementptr inbounds [4 x %struct.barstruct]* @bar_ary, i32 0, i32 %i, i32 1
+  store float 1.420000e+02, float* %safe, align 4, !tbaa !1
+  ret float* %safe
+}
+
+; CHECK:      spooky:
+; CHECK:      jrc $ra
+
diff --git a/test/CodeGen/Mips/micromips-delay-slot.ll b/test/CodeGen/Mips/micromips-delay-slot.ll
index 4bab97a..b5f6c56 100644
--- a/test/CodeGen/Mips/micromips-delay-slot.ll
+++ b/test/CodeGen/Mips/micromips-delay-slot.ll
@@ -1,18 +1,18 @@
 ; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
-; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+; RUN:   -relocation-model=static -O2 < %s | FileCheck %s
 
-; Function Attrs: nounwind uwtable
-define i32 @foo(i32 %a) #0 {
+; Function Attrs: nounwind
+define i32 @foo(i32 signext %a) #0 {
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
   %0 = load i32* %a.addr, align 4
   %shl = shl i32 %0, 2
-  %call = call i32 @bar(i32 %shl)
+  %call = call i32 @bar(i32 signext %shl)
   ret i32 %call
 }
 
-declare i32 @bar(i32) #1
-
-; CHECK: nop
+declare i32 @bar(i32 signext) #1
 
+; CHECK:      jals
+; CHECK-NEXT: sll16
diff --git a/test/CodeGen/Mips/micromips-li.ll b/test/CodeGen/Mips/micromips-li.ll
new file mode 100644
index 0000000..ac315f9
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-li.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+@x = external global i32
+@y = external global i32
+@z = external global i32
+
+define i32 @main() nounwind {
+entry:
+  store i32 1, i32* @x, align 4
+  store i32 2148, i32* @y, align 4
+  store i32 33332, i32* @z, align 4
+  ret i32 0
+}
+
+; CHECK: li16   ${{[2-7]|16|17}}, 1
+; CHECK: addiu  ${{[0-9]+}}, $zero, 2148
+; CHECK: ori ${{[0-9]+}}, $zero, 33332
diff --git a/test/CodeGen/Mips/micromips-or16.ll b/test/CodeGen/Mips/micromips-or16.ll
new file mode 100644
index 0000000..ab7e79a
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-or16.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  store i32 0, i32* %retval
+  %0 = load i32* %b, align 4
+  %1 = load i32* %c, align 4
+  %or = or i32 %0, %1
+  store i32 %or, i32* %a, align 4
+  ret i32 0
+}
+
+; CHECK: or16
diff --git a/test/CodeGen/Mips/micromips-sw-lw-16.ll b/test/CodeGen/Mips/micromips-sw-lw-16.ll
new file mode 100644
index 0000000..bc09554
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-sw-lw-16.ll
@@ -0,0 +1,27 @@
+; RUN: llc %s -march=mipsel -mattr=micromips -filetype=asm \
+; RUN: -relocation-model=pic -O3 -o - | FileCheck %s
+
+; Function Attrs: noinline nounwind
+define void @bar(i32* %p) #0 {
+entry:
+  %p.addr = alloca i32*, align 4
+  store i32* %p, i32** %p.addr, align 4
+  %0 = load i32** %p.addr, align 4
+  %1 = load i32* %0, align 4
+  %add = add nsw i32 7, %1
+  %2 = load i32** %p.addr, align 4
+  store i32 %add, i32* %2, align 4
+  %3 = load i32** %p.addr, align 4
+  %add.ptr = getelementptr inbounds i32* %3, i32 1
+  %4 = load i32* %add.ptr, align 4
+  %add1 = add nsw i32 7, %4
+  %5 = load i32** %p.addr, align 4
+  %add.ptr2 = getelementptr inbounds i32* %5, i32 1
+  store i32 %add1, i32* %add.ptr2, align 4
+  ret void
+}
+
+; CHECK: lw16 ${{[0-9]+}}, 0($4)
+; CHECK: sw16 ${{[0-9]+}}, 0($4)
+; CHECK: lw16 ${{[0-9]+}}, 4(${{[0-9]+}})
+; CHECK: sw16 ${{[0-9]+}}, 4(${{[0-9]+}})
diff --git a/test/CodeGen/Mips/micromips-xor16.ll b/test/CodeGen/Mips/micromips-xor16.ll
new file mode 100644
index 0000000..9915112
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-xor16.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  store i32 0, i32* %retval
+  %0 = load i32* %b, align 4
+  %1 = load i32* %c, align 4
+  %xor = xor i32 %0, %1
+  store i32 %xor, i32* %a, align 4
+  ret i32 0
+}
+
+; CHECK: xor16
diff --git a/test/CodeGen/Mips/mips64-sret.ll b/test/CodeGen/Mips/mips64-sret.ll
index ed494e9..0559747 100644
--- a/test/CodeGen/Mips/mips64-sret.ll
+++ b/test/CodeGen/Mips/mips64-sret.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 < %s | FileCheck %s
 
 define void @foo(i32* noalias sret %agg.result) nounwind {
 entry:
diff --git a/test/CodeGen/Mips/mips64directive.ll b/test/CodeGen/Mips/mips64directive.ll
index 3d95f51..c4ba534 100644
--- a/test/CodeGen/Mips/mips64directive.ll
+++ b/test/CodeGen/Mips/mips64directive.ll
@@ -1,5 +1,5 @@
-; RUN: llc  < %s -march=mips64el -mcpu=mips4 -mattr=n64 | FileCheck %s
-; RUN: llc  < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s
+; RUN: llc  < %s -march=mips64el -mcpu=mips4 -target-abi=n64 | FileCheck %s
+; RUN: llc  < %s -march=mips64el -mcpu=mips64 -target-abi=n64 | FileCheck %s
 
 @gl = global i64 1250999896321, align 8
 
diff --git a/test/CodeGen/Mips/mips64ext.ll b/test/CodeGen/Mips/mips64ext.ll
index 22ea0eb..9c1243b 100644
--- a/test/CodeGen/Mips/mips64ext.ll
+++ b/test/CodeGen/Mips/mips64ext.ll
@@ -1,5 +1,5 @@
-; RUN: llc  < %s -march=mips64el -mcpu=mips4 -mattr=n64 | FileCheck %s
-; RUN: llc  < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s
+; RUN: llc  < %s -march=mips64el -mcpu=mips4 -target-abi=n64 | FileCheck %s
+; RUN: llc  < %s -march=mips64el -mcpu=mips64 -target-abi=n64 | FileCheck %s
 
 define i64 @zext64_32(i32 %a) nounwind readnone {
 entry:
diff --git a/test/CodeGen/Mips/mips64extins.ll b/test/CodeGen/Mips/mips64extins.ll
index 14f92ca..211cd5f 100644
--- a/test/CodeGen/Mips/mips64extins.ll
+++ b/test/CodeGen/Mips/mips64extins.ll
@@ -1,4 +1,4 @@
-; RUN: llc  < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s 
+; RUN: llc  < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 | FileCheck %s 
 
 define i64 @dext(i64 %i) nounwind readnone {
 entry:
diff --git a/test/CodeGen/Mips/mips64fpimm0.ll b/test/CodeGen/Mips/mips64fpimm0.ll
index 19e076d..0296cb5 100644
--- a/test/CodeGen/Mips/mips64fpimm0.ll
+++ b/test/CodeGen/Mips/mips64fpimm0.ll
@@ -1,5 +1,5 @@
-; RUN: llc  < %s -march=mips64el -mcpu=mips4 -mattr=n64 | FileCheck %s
-; RUN: llc  < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s
+; RUN: llc  < %s -march=mips64el -mcpu=mips4 -target-abi=n64 | FileCheck %s
+; RUN: llc  < %s -march=mips64el -mcpu=mips64 -target-abi=n64 | FileCheck %s
 
 define double @foo1() nounwind readnone {
 entry:
diff --git a/test/CodeGen/Mips/mips64fpldst.ll b/test/CodeGen/Mips/mips64fpldst.ll
index 2f42270..5d62156 100644
--- a/test/CodeGen/Mips/mips64fpldst.ll
+++ b/test/CodeGen/Mips/mips64fpldst.ll
@@ -1,7 +1,7 @@
-; RUN: llc  < %s -march=mips64el -mcpu=mips4 -mattr=-n64,n64 | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc  < %s -march=mips64el -mcpu=mips4 -mattr=-n64,n32 | FileCheck %s -check-prefix=CHECK-N32
-; RUN: llc  < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n64 | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc  < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n32 | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc  < %s -march=mips64el -mcpu=mips4 -target-abi n64 | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc  < %s -march=mips64el -mcpu=mips4 -target-abi n32 | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc  < %s -march=mips64el -mcpu=mips64 -target-abi n64 | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc  < %s -march=mips64el -mcpu=mips64 -target-abi n32 | FileCheck %s -check-prefix=CHECK-N32
 
 @f0 = common global float 0.000000e+00, align 4
 @d0 = common global double 0.000000e+00, align 8
diff --git a/test/CodeGen/Mips/mips64intldst.ll b/test/CodeGen/Mips/mips64intldst.ll
index c3607ba..1ceafc1 100644
--- a/test/CodeGen/Mips/mips64intldst.ll
+++ b/test/CodeGen/Mips/mips64intldst.ll
@@ -1,7 +1,7 @@
-; RUN: llc  < %s -march=mips64el -mcpu=mips4 -mattr=-n64,n64 | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc  < %s -march=mips64el -mcpu=mips4 -mattr=-n64,n32 | FileCheck %s -check-prefix=CHECK-N32
-; RUN: llc  < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n64 | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc  < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n32 | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc  < %s -march=mips64el -mcpu=mips4 -target-abi n64 | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc  < %s -march=mips64el -mcpu=mips4 -target-abi n32 | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc  < %s -march=mips64el -mcpu=mips64 -target-abi n64 | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc  < %s -march=mips64el -mcpu=mips64 -target-abi n32 | FileCheck %s -check-prefix=CHECK-N32
 
 @c = common global i8 0, align 4
 @s = common global i16 0, align 4
diff --git a/test/CodeGen/Mips/mips64sinttofpsf.ll b/test/CodeGen/Mips/mips64sinttofpsf.ll
new file mode 100644
index 0000000..d3d4603
--- /dev/null
+++ b/test/CodeGen/Mips/mips64sinttofpsf.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=mips64 -mcpu=mips64r2 -soft-float -O0 < %s | FileCheck %s
+
+
+define double @foo() #0 {
+entry:
+  %x = alloca i32, align 4
+  store volatile i32 -32, i32* %x, align 4
+  %0 = load volatile i32* %x, align 4
+  %conv = sitofp i32 %0 to double
+  ret double %conv
+
+; CHECK-NOT:        dsll
+; CHECK-NOT:        dsrl
+
+}
diff --git a/test/CodeGen/Mips/named-register-n32.ll b/test/CodeGen/Mips/named-register-n32.ll
new file mode 100644
index 0000000..b15e928
--- /dev/null
+++ b/test/CodeGen/Mips/named-register-n32.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mips64 -relocation-model=static -mattr=+noabicalls -target-abi n32 < %s | FileCheck %s
+
+define i32* @get_gp() {
+entry:
+  %0 = call i64 @llvm.read_register.i64(metadata !0)
+  %1 = trunc i64 %0 to i32
+  %2 = inttoptr i32 %1 to i32*
+  ret i32* %2
+}
+
+; CHECK-LABEL: get_gp:
+; CHECK:           sll $2, $gp, 0
+
+declare i64 @llvm.read_register.i64(metadata)
+
+!llvm.named.register.$28 = !{!0}
+
+!0 = !{!"$28"}
diff --git a/test/CodeGen/Mips/named-register-n64.ll b/test/CodeGen/Mips/named-register-n64.ll
new file mode 100644
index 0000000..3198772
--- /dev/null
+++ b/test/CodeGen/Mips/named-register-n64.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=mips64 -relocation-model=static -mattr=+noabicalls < %s | FileCheck %s
+
+define i32* @get_gp() {
+entry:
+  %0 = call i64 @llvm.read_register.i64(metadata !0)
+  %1 = inttoptr i64 %0 to i32*
+  ret i32* %1
+}
+
+; CHECK-LABEL: get_gp:
+; CHECK:           move $2, $gp
+
+declare i64 @llvm.read_register.i64(metadata)
+
+!llvm.named.register.$28 = !{!0}
+
+!0 = !{!"$28"}
diff --git a/test/CodeGen/Mips/named-register-o32.ll b/test/CodeGen/Mips/named-register-o32.ll
new file mode 100644
index 0000000..0890c66
--- /dev/null
+++ b/test/CodeGen/Mips/named-register-o32.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=mips -relocation-model=static -mattr=+noabicalls < %s | FileCheck %s
+
+define i32* @get_gp() {
+entry:
+  %0 = call i32 @llvm.read_register.i32(metadata !0)
+  %1 = inttoptr i32 %0 to i32*
+  ret i32* %1
+}
+
+; CHECK-LABEL: get_gp:
+; CHECK:           move $2, $gp
+
+declare i32 @llvm.read_register.i32(metadata)
+
+!llvm.named.register.$28 = !{!0}
+
+!0 = !{!"$28"}
diff --git a/test/CodeGen/Mips/no-odd-spreg-msa.ll b/test/CodeGen/Mips/no-odd-spreg-msa.ll
new file mode 100644
index 0000000..30dd1ff
--- /dev/null
+++ b/test/CodeGen/Mips/no-odd-spreg-msa.ll
@@ -0,0 +1,131 @@
+; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+fp64,+msa,-nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+fp64,+msa,+nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOODDSPREG
+
+@v4f32 = global <4 x float> zeroinitializer
+
+define void @msa_insert_0(float %a) {
+entry:
+  ; Force the float into an odd-numbered register using named registers and
+  ; load the vector.
+  %b = call float asm sideeffect "mov.s $0, $1", "={$f13},{$f12}" (float %a)
+  %0 = load volatile <4 x float>* @v4f32
+
+  ; Clobber all except $f12/$w12 and $f13
+  ;
+  ; The intention is that if odd single precision registers are permitted, the
+  ; allocator will choose $f12/$w12 for the vector and $f13 for the float to
+  ; avoid the spill/reload.
+  ;
+  ; On the other hand, if odd single precision registers are not permitted, it
+  ; must copy $f13 to an even-numbered register before inserting into the
+  ; vector.
+  call void asm sideeffect "# Clobber", "~{$f0},~{$f1},~{$f2},~{$f3},~{$f4},~{$f5},~{$f6},~{$f7},~{$f8},~{$f9},~{$f10},~{$f11},~{$f14},~{$f15},~{$f16},~{$f17},~{$f18},~{$f19},~{$f20},~{$f21},~{$f22},~{$f23},~{$f24},~{$f25},~{$f26},~{$f27},~{$f28},~{$f29},~{$f30},~{$f31}"()
+  %1 = insertelement <4 x float> %0, float %b, i32 0
+  store <4 x float> %1, <4 x float>* @v4f32
+  ret void
+}
+
+; ALL-LABEL:  msa_insert_0:
+; ALL:            mov.s $f13, $f12
+; ALL:            lw $[[R0:[0-9]+]], %got(v4f32)(
+; ALL:            ld.w $w[[W0:[0-9]+]], 0($[[R0]])
+; NOODDSPREG:     mov.s $f[[F0:[0-9]+]], $f13
+; NOODDSPREG:     insve.w $w[[W0]][0], $w[[F0]][0]
+; ODDSPREG:       insve.w $w[[W0]][0], $w13[0]
+; ALL:            # Clobber
+; ALL-NOT: sdc1
+; ALL-NOT: ldc1
+; ALL:            st.w $w[[W0]], 0($[[R0]])
+
+define void @msa_insert_1(float %a) {
+entry:
+  ; Force the float into an odd-numbered register using named registers and
+  ; load the vector.
+  %b = call float asm sideeffect "mov.s $0, $1", "={$f13},{$f12}" (float %a)
+  %0 = load volatile <4 x float>* @v4f32
+
+  ; Clobber all except $f12/$w12 and $f13
+  ;
+  ; The intention is that if odd single precision registers are permitted, the
+  ; allocator will choose $f12/$w12 for the vector and $f13 for the float to
+  ; avoid the spill/reload.
+  ;
+  ; On the other hand, if odd single precision registers are not permitted, it
+  ; must copy $f13 to an even-numbered register before inserting into the
+  ; vector.
+  call void asm sideeffect "# Clobber", "~{$f0},~{$f1},~{$f2},~{$f3},~{$f4},~{$f5},~{$f6},~{$f7},~{$f8},~{$f9},~{$f10},~{$f11},~{$f14},~{$f15},~{$f16},~{$f17},~{$f18},~{$f19},~{$f20},~{$f21},~{$f22},~{$f23},~{$f24},~{$f25},~{$f26},~{$f27},~{$f28},~{$f29},~{$f30},~{$f31}"()
+  %1 = insertelement <4 x float> %0, float %b, i32 1
+  store <4 x float> %1, <4 x float>* @v4f32
+  ret void
+}
+
+; ALL-LABEL:  msa_insert_1:
+; ALL:            mov.s $f13, $f12
+; ALL:            lw $[[R0:[0-9]+]], %got(v4f32)(
+; ALL:            ld.w $w[[W0:[0-9]+]], 0($[[R0]])
+; NOODDSPREG:     mov.s $f[[F0:[0-9]+]], $f13
+; NOODDSPREG:     insve.w $w[[W0]][1], $w[[F0]][0]
+; ODDSPREG:       insve.w $w[[W0]][1], $w13[0]
+; ALL:            # Clobber
+; ALL-NOT: sdc1
+; ALL-NOT: ldc1
+; ALL:            st.w $w[[W0]], 0($[[R0]])
+
+define float @msa_extract_0() {
+entry:
+  %0 = load volatile <4 x float>* @v4f32
+  %1 = call <4 x float> asm sideeffect "move.v $0, $1", "={$w13},{$w12}" (<4 x float> %0)
+
+  ; Clobber all except $f12, and $f13
+  ;
+  ; The intention is that if odd single precision registers are permitted, the
+  ; allocator will choose $f13/$w13 for the vector since that saves on moves.
+  ;
+  ; On the other hand, if odd single precision registers are not permitted, it
+  ; must move it to $f12/$w12.
+  call void asm sideeffect "# Clobber", "~{$f0},~{$f1},~{$f2},~{$f3},~{$f4},~{$f5},~{$f6},~{$f7},~{$f8},~{$f9},~{$f10},~{$f11},~{$f14},~{$f15},~{$f16},~{$f17},~{$f18},~{$f19},~{$f20},~{$f21},~{$f22},~{$f23},~{$f24},~{$f25},~{$f26},~{$f27},~{$f28},~{$f29},~{$f30},~{$f31}"()
+
+  %2 = extractelement <4 x float> %1, i32 0
+  ret float %2
+}
+
+; ALL-LABEL:  msa_extract_0:
+; ALL:            lw $[[R0:[0-9]+]], %got(v4f32)(
+; ALL:            ld.w $w12, 0($[[R0]])
+; ALL:            move.v $w[[W0:13]], $w12
+; NOODDSPREG:     move.v $w[[W0:12]], $w13
+; ALL:            # Clobber
+; ALL-NOT: st.w
+; ALL-NOT: ld.w
+; ALL:            mov.s $f0, $f[[W0]]
+
+define float @msa_extract_1() {
+entry:
+  %0 = load volatile <4 x float>* @v4f32
+  %1 = call <4 x float> asm sideeffect "move.v $0, $1", "={$w13},{$w12}" (<4 x float> %0)
+
+  ; Clobber all except $f13
+  ;
+  ; The intention is that if odd single precision registers are permitted, the
+  ; allocator will choose $f13/$w13 for the vector since that saves on moves.
+  ;
+  ; On the other hand, if odd single precision registers are not permitted, it
+  ; must be spilled.
+  call void asm sideeffect "# Clobber", "~{$f0},~{$f1},~{$f2},~{$f3},~{$f4},~{$f5},~{$f6},~{$f7},~{$f8},~{$f9},~{$f10},~{$f11},~{$f12},~{$f14},~{$f15},~{$f16},~{$f17},~{$f18},~{$f19},~{$f20},~{$f21},~{$f22},~{$f23},~{$f24},~{$f25},~{$f26},~{$f27},~{$f28},~{$f29},~{$f30},~{$f31}"()
+
+  %2 = extractelement <4 x float> %1, i32 1
+  ret float %2
+}
+
+; ALL-LABEL:  msa_extract_1:
+; ALL:            lw $[[R0:[0-9]+]], %got(v4f32)(
+; ALL:            ld.w $w12, 0($[[R0]])
+; ALL:            splati.w $w[[W0:[0-9]+]], $w13[1]
+; NOODDSPREG:     st.w $w[[W0]], 0($sp)
+; ODDSPREG-NOT: st.w
+; ODDSPREG-NOT: ld.w
+; ALL:            # Clobber
+; ODDSPREG-NOT: st.w
+; ODDSPREG-NOT: ld.w
+; NOODDSPREG:     ld.w $w0, 0($sp)
+; ODDSPREG:       mov.s $f0, $f[[W0]]
diff --git a/test/CodeGen/Mips/octeon.ll b/test/CodeGen/Mips/octeon.ll
index d5ff9bd..97e12e7 100644
--- a/test/CodeGen/Mips/octeon.ll
+++ b/test/CodeGen/Mips/octeon.ll
@@ -1,15 +1,14 @@
-; RUN: llc -O1 < %s -march=mips64 -mcpu=octeon | FileCheck %s -check-prefix=OCTEON
-; RUN: llc -O1 < %s -march=mips64 -mcpu=mips64 | FileCheck %s -check-prefix=MIPS64
+; RUN: llc -O1 < %s -march=mips64 -mcpu=octeon | FileCheck %s -check-prefix=ALL -check-prefix=OCTEON
+; RUN: llc -O1 < %s -march=mips64 -mcpu=mips64 | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64
 
 define i64 @addi64(i64 %a, i64 %b) nounwind {
 entry:
-; OCTEON-LABEL: addi64:
+; ALL-LABEL: addi64:
 ; OCTEON: jr      $ra
 ; OCTEON: baddu   $2, $4, $5
-; MIPS64-LABEL: addi64:
-; MIPS64: daddu
-; MIPS64: jr
-; MIPS64: andi
+; MIPS64: daddu   $[[T0:[0-9]+]], $4, $5
+; MIPS64: jr      $ra
+; MIPS64: andi    $2, $[[T0]], 255
   %add = add i64 %a, %b
   %and = and i64 %add, 255
   ret i64 %and
@@ -17,13 +16,142 @@ entry:
 
 define i64 @mul(i64 %a, i64 %b) nounwind {
 entry:
-; OCTEON-LABEL: mul:
+; ALL-LABEL: mul:
 ; OCTEON: jr    $ra
 ; OCTEON: dmul  $2, $4, $5
-; MIPS64-LABEL: mul:
-; MIPS64: dmult
-; MIPS64: jr
-; MIPS64: mflo
+; MIPS64: dmult $4, $5
+; MIPS64: jr    $ra
+; MIPS64: mflo  $2
   %res = mul i64 %a, %b
   ret i64 %res
 }
+
+define i64 @cmpeq(i64 %a, i64 %b) nounwind {
+entry:
+; ALL-LABEL: cmpeq:
+; OCTEON: jr     $ra
+; OCTEON: seq    $2, $4, $5
+; MIPS64: xor    $[[T0:[0-9]+]], $4, $5
+; MIPS64: sltiu  $[[T1:[0-9]+]], $[[T0]], 1
+; MIPS64: dsll   $[[T2:[0-9]+]], $[[T1]], 32
+; MIPS64: jr     $ra
+; MIPS64: dsrl   $2, $[[T2]], 32
+  %res = icmp eq i64 %a, %b
+  %res2 = zext i1 %res to i64
+  ret i64 %res2
+}
+
+define i64 @cmpeqi(i64 %a) nounwind {
+entry:
+; ALL-LABEL: cmpeqi:
+; OCTEON: jr     $ra
+; OCTEON: seqi   $2, $4, 42
+; MIPS64: daddiu $[[T0:[0-9]+]], $zero, 42
+; MIPS64: xor    $[[T1:[0-9]+]], $4, $[[T0]]
+; MIPS64: sltiu  $[[T2:[0-9]+]], $[[T1]], 1
+; MIPS64: dsll   $[[T3:[0-9]+]], $[[T2]], 32
+; MIPS64: jr     $ra
+; MIPS64: dsrl   $2, $[[T3]], 32
+  %res = icmp eq i64 %a, 42
+  %res2 = zext i1 %res to i64
+  ret i64 %res2
+}
+
+define i64 @cmpne(i64 %a, i64 %b) nounwind {
+entry:
+; ALL-LABEL: cmpne:
+; OCTEON: jr     $ra
+; OCTEON: sne    $2, $4, $5
+; MIPS64: xor    $[[T0:[0-9]+]], $4, $5
+; MIPS64: sltu   $[[T1:[0-9]+]], $zero, $[[T0]]
+; MIPS64: dsll   $[[T2:[0-9]+]], $[[T1]], 32
+; MIPS64: jr     $ra
+; MIPS64: dsrl   $2, $[[T2]], 32
+  %res = icmp ne i64 %a, %b
+  %res2 = zext i1 %res to i64
+  ret i64 %res2
+}
+
+define i64 @cmpnei(i64 %a) nounwind {
+entry:
+; ALL-LABEL: cmpnei:
+; OCTEON: jr     $ra
+; OCTEON: snei   $2, $4, 42
+; MIPS64: daddiu $[[T0:[0-9]+]], $zero, 42
+; MIPS64: xor    $[[T1:[0-9]+]], $4, $[[T0]]
+; MIPS64: sltu   $[[T2:[0-9]+]], $zero, $[[T1]]
+; MIPS64: dsll   $[[T3:[0-9]+]], $[[T2]], 32
+; MIPS64: jr     $ra
+; MIPS64: dsrl   $2, $[[T3]], 32
+  %res = icmp ne i64 %a, 42
+  %res2 = zext i1 %res to i64
+  ret i64 %res2
+}
+
+define i64 @bbit0(i64 %a) nounwind {
+entry:
+; ALL-LABEL: bbit0:
+; OCTEON: bbit0   $4, 3, $[[BB0:BB[0-9_]+]]
+; MIPS64: andi  $[[T0:[0-9]+]], $4, 8
+; MIPS64: beqz  $[[T0]], $[[BB0:BB[0-9_]+]]
+  %bit = and i64 %a, 8
+  %res = icmp eq i64 %bit, 0
+  br i1 %res, label %endif, label %if
+if:
+  ret i64 48
+
+endif:
+  ret i64 12
+}
+
+define i64 @bbit032(i64 %a) nounwind {
+entry:
+; ALL-LABEL: bbit032:
+; OCTEON: bbit032 $4, 3, $[[BB0:BB[0-9_]+]]
+; MIPS64: daddiu  $[[T0:[0-9]+]], $zero, 1
+; MIPS64: dsll    $[[T1:[0-9]+]], $[[T0]], 35
+; MIPS64: and     $[[T2:[0-9]+]], $4, $[[T1]]
+; MIPS64: beqz    $[[T2]], $[[BB0:BB[0-9_]+]]
+  %bit = and i64 %a, 34359738368
+  %res = icmp eq i64 %bit, 0
+  br i1 %res, label %endif, label %if
+if:
+  ret i64 48
+
+endif:
+  ret i64 12
+}
+
+define i64 @bbit1(i64 %a) nounwind {
+entry:
+; ALL-LABEL: bbit1:
+; OCTEON: bbit1 $4, 3, $[[BB0:BB[0-9_]+]]
+; MIPS64: andi  $[[T0:[0-9]+]], $4, 8
+; MIPS64: beqz  $[[T0]], $[[BB0:BB[0-9_]+]]
+  %bit = and i64 %a, 8
+  %res = icmp ne i64 %bit, 0
+  br i1 %res, label %endif, label %if
+if:
+  ret i64 48
+
+endif:
+  ret i64 12
+}
+
+define i64 @bbit132(i64 %a) nounwind {
+entry:
+; ALL-LABEL: bbit132:
+; OCTEON: bbit132 $4, 3, $[[BB0:BB[0-9_]+]]
+; MIPS64: daddiu  $[[T0:[0-9]+]], $zero, 1
+; MIPS64: dsll    $[[T1:[0-9]+]], $[[T0]], 35
+; MIPS64: and     $[[T2:[0-9]+]], $4, $[[T1]]
+; MIPS64: beqz    $[[T2]], $[[BB0:BB[0-9_]+]]
+  %bit = and i64 %a, 34359738368
+  %res = icmp ne i64 %bit, 0
+  br i1 %res, label %endif, label %if
+if:
+  ret i64 48
+
+endif:
+  ret i64 12
+}
diff --git a/test/CodeGen/Mips/powif64_16.ll b/test/CodeGen/Mips/powif64_16.ll
index 4875727..33ec8c4 100644
--- a/test/CodeGen/Mips/powif64_16.ll
+++ b/test/CodeGen/Mips/powif64_16.ll
@@ -20,7 +20,7 @@ define double @foo_pow_f64(double %y, i32 %p)  {
 attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
 attributes #1 = { nounwind readonly }
 
-!0 = metadata !{metadata !"double", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"int", metadata !1}
+!0 = !{!"double", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"int", !1}
diff --git a/test/CodeGen/Mips/remat-immed-load.ll b/test/CodeGen/Mips/remat-immed-load.ll
index b53b156..3d37b43 100644
--- a/test/CodeGen/Mips/remat-immed-load.ll
+++ b/test/CodeGen/Mips/remat-immed-load.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
-; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | FileCheck %s -check-prefix=64
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck %s -check-prefix=64
+; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 < %s | FileCheck %s -check-prefix=64
+; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 < %s | FileCheck %s -check-prefix=64
 
 define void @f0() nounwind {
 entry:
diff --git a/test/CodeGen/Mips/start-asm-file.ll b/test/CodeGen/Mips/start-asm-file.ll
index 9dc501c..60c047a 100644
--- a/test/CodeGen/Mips/start-asm-file.ll
+++ b/test/CodeGen/Mips/start-asm-file.ll
@@ -19,36 +19,36 @@
 
 ; ### N32 ABI ###
 ; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
-; RUN: -relocation-model=static -mattr=-n64,+n32 %s -o - | \
+; RUN: -relocation-model=static -target-abi n32 %s -o - | \
 ; RUN:   FileCheck -check-prefix=CHECK-STATIC-N32 -check-prefix=CHECK-STATIC-N32-NLEGACY %s
 
 ; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
-; RUN: -relocation-model=pic -mattr=-n64,+n32 %s -o - | \
+; RUN: -relocation-model=pic -target-abi n32 %s -o - | \
 ; RUN:   FileCheck -check-prefix=CHECK-PIC-N32 -check-prefix=CHECK-PIC-N32-NLEGACY %s
 
 ; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
-; RUN: -relocation-model=static -mattr=-n64,+n32,+nan2008 %s -o - | \
+; RUN: -relocation-model=static -target-abi n32 -mattr=+nan2008 %s -o - | \
 ; RUN:   FileCheck -check-prefix=CHECK-STATIC-N32 -check-prefix=CHECK-STATIC-N32-N2008 %s
 
 ; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
-; RUN: -relocation-model=pic -mattr=-n64,+n32,+nan2008 %s -o - | \
+; RUN: -relocation-model=pic -target-abi n32 -mattr=+nan2008 %s -o - | \
 ; RUN:   FileCheck -check-prefix=CHECK-PIC-N32 -check-prefix=CHECK-PIC-N32-N2008 %s
 
 ; ### N64 ABI ###
 ; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
-; RUN: -relocation-model=static -mattr=+n64 %s -o - | \
+; RUN: -relocation-model=static -target-abi n64 %s -o - | \
 ; RUN:   FileCheck -check-prefix=CHECK-STATIC-N64 -check-prefix=CHECK-STATIC-N64-NLEGACY %s
 
 ; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
-; RUN: -relocation-model=pic -mattr=+n64 %s -o - | \
+; RUN: -relocation-model=pic -target-abi n64 %s -o - | \
 ; RUN:   FileCheck -check-prefix=CHECK-PIC-N64 -check-prefix=CHECK-PIC-N64-NLEGACY %s
 
 ; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
-; RUN: -relocation-model=static -mattr=+n64,+nan2008 %s -o - | \
+; RUN: -relocation-model=static -target-abi n64 -mattr=+nan2008 %s -o - | \
 ; RUN:   FileCheck -check-prefix=CHECK-STATIC-N64 -check-prefix=CHECK-STATIC-N64-N2008 %s
 
 ; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
-; RUN: -relocation-model=pic -mattr=+n64,+nan2008 %s -o - | \
+; RUN: -relocation-model=pic -target-abi n64 -mattr=+nan2008 %s -o - | \
 ; RUN:   FileCheck -check-prefix=CHECK-PIC-N64 -check-prefix=CHECK-PIC-N64-N2008 %s
 
 ; CHECK-STATIC-O32: .abicalls
diff --git a/test/CodeGen/NVPTX/annotations.ll b/test/CodeGen/NVPTX/annotations.ll
index 39d52d3..2341377 100644
--- a/test/CodeGen/NVPTX/annotations.ll
+++ b/test/CodeGen/NVPTX/annotations.ll
@@ -33,21 +33,14 @@ define void @kernel_func_minctasm(float* %a) {
 
 !nvvm.annotations = !{!1, !2, !3, !4, !5, !6, !7, !8}
 
-!1 = metadata !{void (float*)* @kernel_func_maxntid, metadata !"kernel", i32 1}
-!2 = metadata !{void (float*)* @kernel_func_maxntid,
-                metadata !"maxntidx", i32 10,
-                metadata !"maxntidy", i32 20,
-                metadata !"maxntidz", i32 30}
-
-!3 = metadata !{void (float*)* @kernel_func_reqntid, metadata !"kernel", i32 1}
-!4 = metadata !{void (float*)* @kernel_func_reqntid,
-                metadata !"reqntidx", i32 11,
-                metadata !"reqntidy", i32 22,
-                metadata !"reqntidz", i32 33}
-
-!5 = metadata !{void (float*)* @kernel_func_minctasm, metadata !"kernel", i32 1}
-!6 = metadata !{void (float*)* @kernel_func_minctasm,
-                metadata !"minctasm", i32 42}
-
-!7 = metadata !{i64 addrspace(1)* @texture, metadata !"texture", i32 1}
-!8 = metadata !{i64 addrspace(1)* @surface, metadata !"surface", i32 1}
+!1 = !{void (float*)* @kernel_func_maxntid, !"kernel", i32 1}
+!2 = !{void (float*)* @kernel_func_maxntid, !"maxntidx", i32 10, !"maxntidy", i32 20, !"maxntidz", i32 30}
+
+!3 = !{void (float*)* @kernel_func_reqntid, !"kernel", i32 1}
+!4 = !{void (float*)* @kernel_func_reqntid, !"reqntidx", i32 11, !"reqntidy", i32 22, !"reqntidz", i32 33}
+
+!5 = !{void (float*)* @kernel_func_minctasm, !"kernel", i32 1}
+!6 = !{void (float*)* @kernel_func_minctasm, !"minctasm", i32 42}
+
+!7 = !{i64 addrspace(1)* @texture, !"texture", i32 1}
+!8 = !{i64 addrspace(1)* @surface, !"surface", i32 1}
diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll
index 157b28c..cacffce 100644
--- a/test/CodeGen/NVPTX/bug21465.ll
+++ b/test/CodeGen/NVPTX/bug21465.ll
@@ -21,4 +21,4 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 
 !nvvm.annotations = !{!0}
 
-!0 = metadata !{void (%struct.S*, i32*)* @_Z11TakesStruct1SPi, metadata !"kernel", i32 1}
+!0 = !{void (%struct.S*, i32*)* @_Z11TakesStruct1SPi, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/bug22246.ll b/test/CodeGen/NVPTX/bug22246.ll
new file mode 100644
index 0000000..70e7e12
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug22246.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: _Z3foobbbPb
+define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, i8* nocapture %output) {
+entry:
+; CHECK: selp.b32       %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}}
+  %.sink.v = select i1 %p1, i1 %p2, i1 %p3
+  %frombool5 = zext i1 %.sink.v to i8
+  store i8 %frombool5, i8* %output, align 1
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/bug22322.ll b/test/CodeGen/NVPTX/bug22322.ll
new file mode 100644
index 0000000..19ee694
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug22322.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%class.float3 = type { float, float, float }
+
+; Function Attrs: nounwind
+; CHECK-LABEL: some_kernel
+define void @some_kernel(%class.float3* nocapture %dst) #0 {
+_ZL11compute_vecRK6float3jb.exit:
+  %ret_vec.sroa.8.i = alloca float, align 4
+  %0 = tail call i32 @llvm.ptx.read.ctaid.x()
+  %1 = tail call i32 @llvm.ptx.read.ntid.x()
+  %2 = mul nsw i32 %1, %0
+  %3 = tail call i32 @llvm.ptx.read.tid.x()
+  %4 = add nsw i32 %2, %3
+  %5 = zext i32 %4 to i64
+  %6 = bitcast float* %ret_vec.sroa.8.i to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %6)
+  %7 = and i32 %4, 15
+  %8 = icmp eq i32 %7, 0
+  %9 = select i1 %8, float 0.000000e+00, float -1.000000e+00
+  store float %9, float* %ret_vec.sroa.8.i, align 4
+; CHECK: setp.lt.f32     %p{{[0-9]+}}, %f{{[0-9]+}}, 0f00000000
+  %10 = fcmp olt float %9, 0.000000e+00
+  %ret_vec.sroa.8.i.val = load float* %ret_vec.sroa.8.i, align 4
+  %11 = select i1 %10, float 0.000000e+00, float %ret_vec.sroa.8.i.val
+  call void @llvm.lifetime.end(i64 4, i8* %6)
+  %12 = getelementptr inbounds %class.float3* %dst, i64 %5, i32 0
+  store float 0.000000e+00, float* %12, align 4
+  %13 = getelementptr inbounds %class.float3* %dst, i64 %5, i32 1
+  store float %11, float* %13, align 4
+  %14 = getelementptr inbounds %class.float3* %dst, i64 %5, i32 2
+  store float 0.000000e+00, float* %14, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ptx.read.ctaid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ptx.read.ntid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ptx.read.tid.x() #1
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #2
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
+!nvvm.annotations = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{void (%class.float3*)* @some_kernel, !"kernel", i32 1}
+!1 = !{!"clang version 3.5.1 (tags/RELEASE_351/final)"}
diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 83d4916..8483112 100644
--- a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -63,4 +63,4 @@ declare void @callee(float*, i8*)
 
 !nvvm.annotations = !{!0}
 
-!0 = metadata !{void (float*)* @kernel_func, metadata !"kernel", i32 1}
+!0 = !{void (float*)* @kernel_func, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/calling-conv.ll b/test/CodeGen/NVPTX/calling-conv.ll
index 190a146..3b03442 100644
--- a/test/CodeGen/NVPTX/calling-conv.ll
+++ b/test/CodeGen/NVPTX/calling-conv.ll
@@ -27,4 +27,4 @@ define void @metadata_kernel(float* %a) {
 
 !nvvm.annotations = !{!1}
 
-!1 = metadata !{void (float*)* @metadata_kernel, metadata !"kernel", i32 1}
+!1 = !{void (float*)* @metadata_kernel, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/fma-assoc.ll b/test/CodeGen/NVPTX/fma-assoc.ll
new file mode 100644
index 0000000..fc04c61
--- /dev/null
+++ b/test/CodeGen/NVPTX/fma-assoc.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+
+define ptx_device float @t1_f32(float %x, float %y, float %z,
+                                float %u, float %v) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul float %x, %y
+  %b = fmul float %u, %v
+  %c = fadd float %a, %b
+  %d = fadd float %c, %z
+  ret float %d
+}
+
+define ptx_device double @t1_f64(double %x, double %y, double %z,
+                                 double %u, double %v) {
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul double %x, %y
+  %b = fmul double %u, %v
+  %c = fadd double %a, %b
+  %d = fadd double %c, %z
+  ret double %d
+}
diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll
index 14b5c45..6785a01 100644
--- a/test/CodeGen/NVPTX/fma.ll
+++ b/test/CodeGen/NVPTX/fma.ll
@@ -1,5 +1,8 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
 
+declare float @dummy_f32(float, float) #0
+declare double @dummy_f64(double, double) #0
+
 define ptx_device float @t1_f32(float %x, float %y, float %z) {
 ; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
 ; CHECK: ret;
@@ -8,6 +11,17 @@ define ptx_device float @t1_f32(float %x, float %y, float %z) {
   ret float %b
 }
 
+define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul float %x, %y
+  %b = fadd float %a, %z
+  %c = fadd float %a, %w
+  %d = call float @dummy_f32(float %b, float %c)
+  ret float %d
+}
+
 define ptx_device double @t1_f64(double %x, double %y, double %z) {
 ; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
@@ -15,3 +29,14 @@ define ptx_device double @t1_f64(double %x, double %y, double %z) {
   %b = fadd double %a, %z
   ret double %b
 }
+
+define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul double %x, %y
+  %b = fadd double %a, %z
+  %c = fadd double %a, %w
+  %d = call double @dummy_f64(double %b, double %c)
+  ret double %d
+}
diff --git a/test/CodeGen/NVPTX/generic-to-nvvm.ll b/test/CodeGen/NVPTX/generic-to-nvvm.ll
index 2a52798..fb63d6e 100644
--- a/test/CodeGen/NVPTX/generic-to-nvvm.ll
+++ b/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -23,4 +23,4 @@ define void @foo(i32* %a, i32* %b) {
 
 
 !nvvm.annotations = !{!0}
-!0 = metadata !{void (i32*, i32*)* @foo, metadata !"kernel", i32 1}
+!0 = !{void (i32*, i32*)* @foo, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/i1-global.ll b/test/CodeGen/NVPTX/i1-global.ll
index 1dd8ae4..e3fe08e 100644
--- a/test/CodeGen/NVPTX/i1-global.ll
+++ b/test/CodeGen/NVPTX/i1-global.ll
@@ -16,4 +16,4 @@ define void @foo(i1 %p, i32* %out) {
 
 
 !nvvm.annotations = !{!0}
-!0 = metadata !{void (i1, i32*)* @foo, metadata !"kernel", i32 1}
+!0 = !{void (i1, i32*)* @foo, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/i1-param.ll b/test/CodeGen/NVPTX/i1-param.ll
index f4df874..aac7196 100644
--- a/test/CodeGen/NVPTX/i1-param.ll
+++ b/test/CodeGen/NVPTX/i1-param.ll
@@ -16,4 +16,4 @@ define void @foo(i1 %p, i32* %out) {
 
 
 !nvvm.annotations = !{!0}
-!0 = metadata !{void (i1, i32*)* @foo, metadata !"kernel", i32 1}
+!0 = !{void (i1, i32*)* @foo, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/managed.ll b/test/CodeGen/NVPTX/managed.ll
index 4d7e781..d3f1604 100644
--- a/test/CodeGen/NVPTX/managed.ll
+++ b/test/CodeGen/NVPTX/managed.ll
@@ -8,4 +8,4 @@
 
 
 !nvvm.annotations = !{!0}
-!0 = metadata !{i32 addrspace(1)* @managed_g, metadata !"managed", i32 1}
+!0 = !{i32 addrspace(1)* @managed_g, !"managed", i32 1}
diff --git a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
index 64745fc..841bbc3 100644
--- a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
+++ b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -70,5 +70,5 @@ if.end17:                                         ; preds = %if.else13, %if.then
 ; Function Attrs: noduplicate nounwind
 declare void @llvm.cuda.syncthreads() #2
 
-!0 = metadata !{void (float*)* @foo, metadata !"kernel", i32 1}
-!1 = metadata !{null, metadata !"align", i32 8}
+!0 = !{void (float*)* @foo, !"kernel", i32 1}
+!1 = !{null, !"align", i32 8}
diff --git a/test/CodeGen/NVPTX/nounroll.ll b/test/CodeGen/NVPTX/nounroll.ll
new file mode 100644
index 0000000..db96d2a
--- /dev/null
+++ b/test/CodeGen/NVPTX/nounroll.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; Compiled from the following CUDA code:
+;
+;   #pragma nounroll
+;   for (int i = 0; i < 2; ++i)
+;     output[i] = input[i];
+define void @nounroll(float* %input, float* %output) {
+; CHECK-LABEL: .visible .func nounroll(
+entry:
+  br label %for.body
+
+for.body:
+; CHECK: .pragma "nounroll"
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %idxprom = sext i32 %i.06 to i64
+  %arrayidx = getelementptr inbounds float* %input, i64 %idxprom
+  %0 = load float* %arrayidx, align 4
+; CHECK: ld.f32
+  %arrayidx2 = getelementptr inbounds float* %output, i64 %idxprom
+  store float %0, float* %arrayidx2, align 4
+; CHECK: st.f32
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 2
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+; CHECK-NOT: ld.f32
+; CHECK-NOT: st.f32
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.unroll.disable"}
diff --git a/test/CodeGen/NVPTX/nvcl-param-align.ll b/test/CodeGen/NVPTX/nvcl-param-align.ll
new file mode 100644
index 0000000..c1a489f
--- /dev/null
+++ b/test/CodeGen/NVPTX/nvcl-param-align.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-unknown-nvcl"
+
+; CHECK-LABEL: .entry foo(
+define void @foo(i64 %img, i64 %sampler, <5 x float>* %v) {
+; The parameter alignment should be the next power of 2 of 5xsizeof(float),
+; which is 32.
+; CHECK: .param .u32 .ptr .align 32 foo_param_2
+  ret void
+}
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = !{void (i64, i64, <5 x float>*)* @foo, !"kernel", i32 1}
+!2 = !{void (i64, i64, <5 x float>*)* @foo, !"rdoimage", i32 0}
+!3 = !{void (i64, i64, <5 x float>*)* @foo, !"sampler", i32 1}
diff --git a/test/CodeGen/NVPTX/refl1.ll b/test/CodeGen/NVPTX/refl1.ll
index 4aeff09..e8782ea 100644
--- a/test/CodeGen/NVPTX/refl1.ll
+++ b/test/CodeGen/NVPTX/refl1.ll
@@ -36,4 +36,4 @@ attributes #2 = { alwaysinline inlinehint nounwind readnone }
 
 !nvvm.annotations = !{!0}
 
-!0 = metadata !{void (float*)* @foo, metadata !"kernel", i32 1}
+!0 = !{void (float*)* @foo, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/simple-call.ll b/test/CodeGen/NVPTX/simple-call.ll
index ab6f423..1b41361 100644
--- a/test/CodeGen/NVPTX/simple-call.ll
+++ b/test/CodeGen/NVPTX/simple-call.ll
@@ -23,4 +23,4 @@ define void @kernel_func(float* %a) {
 
 !nvvm.annotations = !{!1}
 
-!1 = metadata !{void (float*)* @kernel_func, metadata !"kernel", i32 1}
+!1 = !{void (float*)* @kernel_func, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/surf-read-cuda.ll b/test/CodeGen/NVPTX/surf-read-cuda.ll
index 10a1ecc..ed02134 100644
--- a/test/CodeGen/NVPTX/surf-read-cuda.ll
+++ b/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -47,7 +47,7 @@ define void @bar(float* %red, i32 %idx) {
 
 
 !nvvm.annotations = !{!1, !2, !3}
-!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1}
-!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+!1 = !{void (i64, float*, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (float*, i32)* @bar, !"kernel", i32 1}
+!3 = !{i64 addrspace(1)* @surf0, !"surface", i32 1}
 
diff --git a/test/CodeGen/NVPTX/surf-read.ll b/test/CodeGen/NVPTX/surf-read.ll
index a69d03e..7383722 100644
--- a/test/CodeGen/NVPTX/surf-read.ll
+++ b/test/CodeGen/NVPTX/surf-read.ll
@@ -16,5 +16,5 @@ define void @foo(i64 %img, float* %red, i32 %idx) {
 }
 
 !nvvm.annotations = !{!1, !2}
-!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (i64, float*, i32)* @foo, metadata !"rdwrimage", i32 0}
+!1 = !{void (i64, float*, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (i64, float*, i32)* @foo, !"rdwrimage", i32 0}
diff --git a/test/CodeGen/NVPTX/surf-write-cuda.ll b/test/CodeGen/NVPTX/surf-write-cuda.ll
index 654c47f..da55a24 100644
--- a/test/CodeGen/NVPTX/surf-write-cuda.ll
+++ b/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -36,7 +36,7 @@ define void @bar(i32 %val, i32 %idx) {
 
 
 !nvvm.annotations = !{!1, !2, !3}
-!1 = metadata !{void (i64, i32, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (i32, i32)* @bar, metadata !"kernel", i32 1}
-!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+!1 = !{void (i64, i32, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (i32, i32)* @bar, !"kernel", i32 1}
+!3 = !{i64 addrspace(1)* @surf0, !"surface", i32 1}
 
diff --git a/test/CodeGen/NVPTX/surf-write.ll b/test/CodeGen/NVPTX/surf-write.ll
index 880231f..5098d2a 100644
--- a/test/CodeGen/NVPTX/surf-write.ll
+++ b/test/CodeGen/NVPTX/surf-write.ll
@@ -12,5 +12,5 @@ define void @foo(i64 %img, i32 %val, i32 %idx) {
 }
 
 !nvvm.annotations = !{!1, !2}
-!1 = metadata !{void (i64, i32, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (i64, i32, i32)* @foo, metadata !"wroimage", i32 0}
+!1 = !{void (i64, i32, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (i64, i32, i32)* @foo, !"wroimage", i32 0}
diff --git a/test/CodeGen/NVPTX/tex-read-cuda.ll b/test/CodeGen/NVPTX/tex-read-cuda.ll
index ee0cefa..c5b5600 100644
--- a/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -41,6 +41,6 @@ define void @bar(float* %red, i32 %idx) {
 }
 
 !nvvm.annotations = !{!1, !2, !3}
-!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1}
-!3 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1}
+!1 = !{void (i64, float*, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (float*, i32)* @bar, !"kernel", i32 1}
+!3 = !{i64 addrspace(1)* @tex0, !"texture", i32 1}
diff --git a/test/CodeGen/NVPTX/tex-read.ll b/test/CodeGen/NVPTX/tex-read.ll
index 55e4bfc..6e0fda6 100644
--- a/test/CodeGen/NVPTX/tex-read.ll
+++ b/test/CodeGen/NVPTX/tex-read.ll
@@ -15,6 +15,6 @@ define void @foo(i64 %img, i64 %sampler, float* %red, i32 %idx) {
 }
 
 !nvvm.annotations = !{!1, !2, !3}
-!1 = metadata !{void (i64, i64, float*, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (i64, i64, float*, i32)* @foo, metadata !"rdoimage", i32 0}
-!3 = metadata !{void (i64, i64, float*, i32)* @foo, metadata !"sampler", i32 1}
+!1 = !{void (i64, i64, float*, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (i64, i64, float*, i32)* @foo, !"rdoimage", i32 0}
+!3 = !{void (i64, i64, float*, i32)* @foo, !"sampler", i32 1}
diff --git a/test/CodeGen/NVPTX/texsurf-queries.ll b/test/CodeGen/NVPTX/texsurf-queries.ll
index c7637cc..e56eb5d 100644
--- a/test/CodeGen/NVPTX/texsurf-queries.ll
+++ b/test/CodeGen/NVPTX/texsurf-queries.ll
@@ -99,5 +99,5 @@ define i32 @s3() {
 
 
 !nvvm.annotations = !{!1, !2}
-!1 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1}
-!2 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+!1 = !{i64 addrspace(1)* @tex0, !"texture", i32 1}
+!2 = !{i64 addrspace(1)* @surf0, !"surface", i32 1}
diff --git a/test/CodeGen/NVPTX/vector-global.ll b/test/CodeGen/NVPTX/vector-global.ll
new file mode 100644
index 0000000..a463bee
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-global.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+@g1 = external global <4 x i32> ; external global variable
+; CHECK: .extern .global .align 16 .b8 g1[16];
+@g2 = global <4 x i32> zeroinitializer ; module-level global variable
+; CHECK: .visible .global .align 16 .b8 g2[16];
diff --git a/test/CodeGen/NVPTX/weak-linkage.ll b/test/CodeGen/NVPTX/weak-linkage.ll
index 7a13357..5df57b2 100644
--- a/test/CodeGen/NVPTX/weak-linkage.ll
+++ b/test/CodeGen/NVPTX/weak-linkage.ll
@@ -1,11 +1,17 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 
-
+; CHECK: // .weak foo
 ; CHECK: .weak .func foo
 define weak void @foo() {
   ret void
 }
 
+; CHECK: // .weak baz
+; CHECK: .weak .func baz
+define weak_odr void @baz() {
+  ret void
+}
+
 ; CHECK: .visible .func bar
 define void @bar() {
   ret void
diff --git a/test/CodeGen/PowerPC/2007-03-24-cntlzd.ll b/test/CodeGen/PowerPC/2007-03-24-cntlzd.ll
index 3620b0e..3624b51 100644
--- a/test/CodeGen/PowerPC/2007-03-24-cntlzd.ll
+++ b/test/CodeGen/PowerPC/2007-03-24-cntlzd.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -march=ppc64 -mcpu=g5 | grep cntlzd
+; RUN: llc -mcpu=g5 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
 
 define i32 @_ZNK4llvm5APInt17countLeadingZerosEv(i64 *%t) nounwind {
         %tmp19 = load i64* %t
@@ -7,6 +9,12 @@ define i32 @_ZNK4llvm5APInt17countLeadingZerosEv(i64 *%t) nounwind {
         %tmp89 = add i32 %tmp23, -64          ; <i32> [#uses=1]
         %tmp90 = add i32 %tmp89, 0            ; <i32> [#uses=1]
         ret i32 %tmp90
+
+; CHECK-LABEL: @_ZNK4llvm5APInt17countLeadingZerosEv
+; CHECK: ld [[REG1:[0-9]+]], 0(3)
+; CHECK-NEXT: cntlzd [[REG2:[0-9]+]], [[REG1]]
+; CHECK-NEXT: addi 3, [[REG2]], -64
+; CHECK-NEXT: blr
 }
 
 declare i64 @llvm.ctlz.i64(i64, i1)
diff --git a/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll b/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll
index 3acd01d..e7bc5bf 100644
--- a/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll
+++ b/test/CodeGen/PowerPC/2011-12-05-NoSpillDupCR.ll
@@ -183,4 +183,4 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32,
 
 declare i32 @puts(i8* nocapture) nounwind
 
-!3 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!3 = !{!"branch_weights", i32 64, i32 4}
diff --git a/test/CodeGen/PowerPC/2011-12-06-SpillAndRestoreCR.ll b/test/CodeGen/PowerPC/2011-12-06-SpillAndRestoreCR.ll
index 4a1a512..a6223d4 100644
--- a/test/CodeGen/PowerPC/2011-12-06-SpillAndRestoreCR.ll
+++ b/test/CodeGen/PowerPC/2011-12-06-SpillAndRestoreCR.ll
@@ -217,4 +217,4 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32,
 
 declare i32 @puts(i8* nocapture) nounwind
 
-!3 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!3 = !{!"branch_weights", i32 64, i32 4}
diff --git a/test/CodeGen/PowerPC/Frames-large.ll b/test/CodeGen/PowerPC/Frames-large.ll
index 0ccea42..5b8aef4 100644
--- a/test/CodeGen/PowerPC/Frames-large.ll
+++ b/test/CodeGen/PowerPC/Frames-large.ll
@@ -1,9 +1,8 @@
-; RUN: llvm-as < %s > %t.bc
-; RUN: llc < %t.bc -march=ppc32 | FileCheck %s -check-prefix=PPC32-NOFP
-; RUN: llc < %t.bc -march=ppc32 -disable-fp-elim | FileCheck %s -check-prefix=PPC32-FP
+; RUN: llc < %s -march=ppc32 | FileCheck %s -check-prefix=PPC32-NOFP
+; RUN: llc < %s -march=ppc32 -disable-fp-elim | FileCheck %s -check-prefix=PPC32-FP
 
-; RUN: llc < %t.bc -march=ppc64 | FileCheck %s -check-prefix=PPC64-NOFP
-; RUN: llc < %t.bc -march=ppc64 -disable-fp-elim | FileCheck %s -check-prefix=PPC64-FP
+; RUN: llc < %s -march=ppc64 | FileCheck %s -check-prefix=PPC64-NOFP
+; RUN: llc < %s -march=ppc64 -disable-fp-elim | FileCheck %s -check-prefix=PPC64-FP
 
 
 target triple = "powerpc-apple-darwin8"
diff --git a/test/CodeGen/PowerPC/aa-tbaa.ll b/test/CodeGen/PowerPC/aa-tbaa.ll
index 1939841..0e7ff3d 100644
--- a/test/CodeGen/PowerPC/aa-tbaa.ll
+++ b/test/CodeGen/PowerPC/aa-tbaa.ll
@@ -35,7 +35,7 @@ next:
 ; CHECK: blr
 }
 
-!0 = metadata !{ metadata !"root" }
-!1 = metadata !{ metadata !"set1", metadata !0 }
-!2 = metadata !{ metadata !"set2", metadata !0 }
+!0 = !{ !"root" }
+!1 = !{ !"set1", !0 }
+!2 = !{ !"set2", !0 }
 
diff --git a/test/CodeGen/PowerPC/add-fi.ll b/test/CodeGen/PowerPC/add-fi.ll
new file mode 100644
index 0000000..18892c8
--- /dev/null
+++ b/test/CodeGen/PowerPC/add-fi.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i32* @test1() {
+        %X = alloca { i32, i32 }
+        %Y = getelementptr {i32,i32}* %X, i32 0, i32 1
+        ret i32* %Y
+
+; CHECK-LABEL: @test1
+; CHECK: addi 3, 1, -4
+; CHECK: blr
+}
+
+define i32* @test2() {
+        %X = alloca { i32, i32, i32, i32 }
+        %Y = getelementptr {i32,i32,i32,i32}* %X, i32 0, i32 3
+        ret i32* %Y
+
+; CHECK-LABEL: @test2
+; CHECK: addi 3, 1, -4
+; CHECK: blr
+}
+
diff --git a/test/CodeGen/PowerPC/addi-licm.ll b/test/CodeGen/PowerPC/addi-licm.ll
new file mode 100644
index 0000000..070d86f
--- /dev/null
+++ b/test/CodeGen/PowerPC/addi-licm.ll
@@ -0,0 +1,64 @@
+; RUN: llc -mcpu=pwr7 -disable-ppc-preinc-prep < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PIP
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define double @foo() #1 {
+entry:
+  %x = alloca [2048 x float], align 4
+  %y = alloca [2048 x float], align 4
+  %0 = bitcast [2048 x float]* %x to i8*
+  call void @llvm.lifetime.start(i64 8192, i8* %0) #2
+  %1 = bitcast [2048 x float]* %y to i8*
+  call void @llvm.lifetime.start(i64 8192, i8* %1) #2
+  br label %for.body.i
+
+; CHECK-LABEL: @foo
+; CHECK: addi [[REG1:[0-9]+]], 1,
+; CHECK: addi [[REG2:[0-9]+]], 1,
+; CHECK: %for.body.i
+; CHECK-DAG: lfsx {{[0-9]+}}, [[REG1]],
+; CHECK-DAG: lfsx {{[0-9]+}}, [[REG2]],
+; CHECK: blr
+
+; PIP-LABEL: @foo
+; PIP: addi [[REG1:[0-9]+]], 1,
+; PIP: addi [[REG2:[0-9]+]], 1,
+; PIP: %for.body.i
+; PIP-DAG: lfsu {{[0-9]+}}, 4([[REG1]])
+; PIP-DAG: lfsu {{[0-9]+}}, 4([[REG2]])
+; PIP: blr
+
+for.body.i:                                       ; preds = %for.body.i.preheader, %for.body.i
+  %accumulator.09.i = phi double [ %add.i, %for.body.i ], [ 0.000000e+00, %entry ]
+  %i.08.i = phi i64 [ %inc.i, %for.body.i ], [ 0, %entry ]
+  %arrayidx.i = getelementptr inbounds [2048 x float]* %x, i64 0, i64 %i.08.i
+  %v14 = load float* %arrayidx.i, align 4
+  %conv.i = fpext float %v14 to double
+  %arrayidx1.i = getelementptr inbounds [2048 x float]* %y, i64 0, i64 %i.08.i
+  %v15 = load float* %arrayidx1.i, align 4
+  %conv2.i = fpext float %v15 to double
+  %mul.i = fmul double %conv.i, %conv2.i
+  %add.i = fadd double %accumulator.09.i, %mul.i
+  %inc.i = add nuw nsw i64 %i.08.i, 1
+  %exitcond.i = icmp eq i64 %i.08.i, 2047
+  br i1 %exitcond.i, label %loop.exit, label %for.body.i
+
+loop.exit:                                        ; preds = %for.body.i
+  ret double %accumulator.09.i
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #2
+
+declare void @bar(float*, float*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #2
+
+attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind }
+
+
diff --git a/test/CodeGen/PowerPC/arr-fp-arg-no-copy.ll b/test/CodeGen/PowerPC/arr-fp-arg-no-copy.ll
new file mode 100644
index 0000000..fd430a6
--- /dev/null
+++ b/test/CodeGen/PowerPC/arr-fp-arg-no-copy.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @bar() #0 {
+entry:
+  tail call void @xxx([2 x i64] [i64 4607182418800017408, i64 4611686018427387904]) #0
+  ret void
+
+; CHECK-LABEL: @bar
+; CHECK-DAG: li [[REG1:[0-9]+]], 1023
+; CHECK-DAG: li [[REG2:[0-9]+]], {{1$}}
+; CHECK-DAG: sldi 3, [[REG1]], 52
+; CHECK-DAG: sldi 4, [[REG2]], 62
+; CHECK: bl xxx
+; CHECK: blr
+}
+
+declare void @xxx([2 x i64])
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/asm-Zy.ll b/test/CodeGen/PowerPC/asm-Zy.ll
index 691165f..6d1ab0e 100644
--- a/test/CodeGen/PowerPC/asm-Zy.ll
+++ b/test/CodeGen/PowerPC/asm-Zy.ll
@@ -10,5 +10,5 @@ entry:
 ; CHECK: lwbrx 3, 0,
 }
 
-!0 = metadata !{i32 101688}
+!0 = !{i32 101688}
 
diff --git a/test/CodeGen/PowerPC/asm-constraints.ll b/test/CodeGen/PowerPC/asm-constraints.ll
index 998b618..9bf8b75 100644
--- a/test/CodeGen/PowerPC/asm-constraints.ll
+++ b/test/CodeGen/PowerPC/asm-constraints.ll
@@ -30,15 +30,16 @@ entry:
 }
 
 ; CHECK-LABEL: @foo
-; CHECK: ld [[REG:[0-9]+]],0(4)
-; CHECK-NEXT: cmpw [[REG]],[[REG]]
-; CHECK-NEXT: bne- 1f
-; CHECK-NEXT: 1: isync
+; CHECK: ld [[REG:[0-9]+]], 0(4)
+; CHECK: cmpw 0, [[REG]], [[REG]]
+; CHECK: bne- 0, .Ltmp[[TMP:[0-9]+]]
+; CHECK: .Ltmp[[TMP]]:
+; CHECK: isync
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind }
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.6.0 (trunk 217557)"}
-!1 = metadata !{i32 67, i32 91, i32 110, i32 126}
+!0 = !{!"clang version 3.6.0 (trunk 217557)"}
+!1 = !{i32 67, i32 91, i32 110, i32 126}
diff --git a/test/CodeGen/PowerPC/bperm.ll b/test/CodeGen/PowerPC/bperm.ll
new file mode 100644
index 0000000..c489c1f
--- /dev/null
+++ b/test/CodeGen/PowerPC/bperm.ll
@@ -0,0 +1,279 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define zeroext i32 @bs4(i32 zeroext %a) #0 {
+entry:
+  %0 = tail call i32 @llvm.bswap.i32(i32 %a)
+  ret i32 %0
+
+; CHECK-LABEL: @bs4
+; CHECK: rlwinm [[REG1:[0-9]+]], 3, 8, 0, 31
+; CHECK: rlwimi [[REG1]], 3, 24, 16, 23
+; CHECK: rlwimi [[REG1]], 3, 24, 0, 7
+; CHECK: mr 3, [[REG1]]
+; CHECK: blr
+}
+
+define i64 @bs8(i64 %x) #0 {
+entry:
+  %0 = tail call i64 @llvm.bswap.i64(i64 %x)
+  ret i64 %0
+
+; CHECK-LABEL: @bs8
+; CHECK-DAG: rldicl [[REG1:[0-9]+]], 3, 16, 0
+; CHECK-DAG: rldicl [[REG2:[0-9]+]], 3, 8, 0
+; CHECK-DAG: rldicl [[REG3:[0-9]+]], 3, 24, 0
+; CHECK-DAG: rldimi [[REG2]], [[REG1]], 8, 48
+; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 32, 0
+; CHECK-DAG: rldimi [[REG2]], [[REG3]], 16, 40
+; CHECK-DAG: rldicl [[REG5:[0-9]+]], 3, 48, 0
+; CHECK-DAG: rldimi [[REG2]], [[REG4]], 24, 32
+; CHECK-DAG: rldicl [[REG6:[0-9]+]], 3, 56, 0
+; CHECK-DAG: rldimi [[REG2]], [[REG5]], 40, 16
+; CHECK-DAG: rldimi [[REG2]], [[REG6]], 48, 8
+; CHECK-DAG: rldimi [[REG2]], 3, 56, 0
+; CHECK: mr 3, [[REG2]]
+; CHECK: blr
+}
+
+define i64 @test1(i64 %i0, i64 %i1) #0 {
+entry:
+  %0 = lshr i64 %i1, 8
+  %and = and i64 %0, 5963776000
+  ret i64 %and
+
+; CHECK-LABEL: @test1
+; CHECK-DAG: li [[REG1:[0-9]+]], 11375
+; CHECK-DAG: rldicl [[REG3:[0-9]+]], 4, 56, 0
+; CHECK-DAG: sldi [[REG2:[0-9]+]], [[REG1]], 19
+; CHECK: and 3, [[REG3]], [[REG2]]
+; CHECK: blr
+}
+
+define i64 @test2(i64 %i0, i64 %i1) #0 {
+entry:
+  %0 = lshr i64 %i1, 6
+  %and = and i64 %0, 133434808670355456
+  ret i64 %and
+
+; CHECK-LABEL: @test2
+; CHECK-DAG: lis [[REG1:[0-9]+]], 474
+; CHECK-DAG: rldicl [[REG5:[0-9]+]], 4, 58, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 3648
+; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 32
+; CHECK-DAG: oris [[REG4:[0-9]+]], [[REG3]], 25464
+; CHECK: and 3, [[REG5]], [[REG4]]
+; CHECK: blr
+}
+
+define i64 @test3(i64 %i0, i64 %i1) #0 {
+entry:
+  %0 = shl i64 %i0, 34
+  %and = and i64 %0, 191795733152661504
+  ret i64 %and
+
+; CHECK-LABEL: @test3
+; CHECK-DAG: lis [[REG1:[0-9]+]], 170
+; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 34, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 22861
+; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 34
+; CHECK: and 3, [[REG4]], [[REG3]]
+; CHECK: blr
+}
+
+define i64 @test4(i64 %i0, i64 %i1) #0 {
+entry:
+  %0 = lshr i64 %i1, 15
+  %and = and i64 %0, 58195968
+  ret i64 %and
+
+; CHECK-LABEL: @test4
+; CHECK: rldicl [[REG1:[0-9]+]], 4, 49, 0
+; CHECK: andis. 3, [[REG1]], 888
+; CHECK: blr
+}
+
+define i64 @test5(i64 %i0, i64 %i1) #0 {
+entry:
+  %0 = shl i64 %i1, 12
+  %and = and i64 %0, 127252959854592
+  ret i64 %and
+
+; CHECK-LABEL: @test5
+; CHECK-DAG: lis [[REG1:[0-9]+]], 3703
+; CHECK-DAG: rldicl [[REG4:[0-9]+]], 4, 12, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 35951
+; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 19
+; CHECK: and 3, [[REG4]], [[REG3]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define zeroext i32 @test6(i32 zeroext %x) #0 {
+entry:
+  %and = lshr i32 %x, 16
+  %shr = and i32 %and, 255
+  %and1 = shl i32 %x, 16
+  %shl = and i32 %and1, 16711680
+  %or = or i32 %shr, %shl
+  ret i32 %or
+
+; CHECK-LABEL: @test6
+; CHECK: rlwinm [[REG1:[0-9]+]], 3, 16, 24, 31
+; CHECK: rlwimi [[REG1]], 3, 16, 8, 15
+; CHECK: mr 3, [[REG1]]
+; CHECK: blr
+}
+
+define i64 @test7(i64 %i0, i64 %i1) #0 {
+entry:
+  %0 = lshr i64 %i0, 5
+  %and = and i64 %0, 58195968
+  ret i64 %and
+
+; CHECK-LABEL: @test7
+; CHECK: rlwinm [[REG1:[0-9]+]], 3, 27, 9, 12
+; CHECK: rlwimi [[REG1]], 3, 27, 6, 7
+; CHECK: mr 3, [[REG1]]
+; CHECK: blr
+}
+
+define i64 @test8(i64 %i0, i64 %i1) #0 {
+entry:
+  %0 = lshr i64 %i0, 1
+  %and = and i64 %0, 169172533248
+  ret i64 %and
+
+; CHECK-LABEL: @test8
+; CHECK-DAG: lis [[REG1:[0-9]+]], 4
+; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 63, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 60527
+; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 19
+; CHECK: and 3, [[REG4]], [[REG3]]
+; CHECK: blr
+}
+
+define i64 @test9(i64 %i0, i64 %i1) #0 {
+entry:
+  %0 = lshr i64 %i1, 14
+  %and = and i64 %0, 18848677888
+  %1 = shl i64 %i1, 51
+  %and3 = and i64 %1, 405323966463344640
+  %or4 = or i64 %and, %and3
+  ret i64 %or4
+
+; CHECK-LABEL: @test9
+; CHECK-DAG: lis [[REG1:[0-9]+]], 1440
+; CHECK-DAG: rldicl [[REG5:[0-9]+]], 4, 62, 0
+; CHECK-DAG: rldicl [[REG6:[0-9]+]], 4, 50, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 4
+; CHECK-DAG: rldimi [[REG6]], [[REG5]], 53, 0
+; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 32
+; CHECK-DAG: oris [[REG4:[0-9]+]], [[REG3]], 25464
+; CHECK: and 3, [[REG6]], [[REG4]]
+; CHECK: blr
+}
+
+define i64 @test10(i64 %i0, i64 %i1) #0 {
+entry:
+  %0 = shl i64 %i0, 37
+  %and = and i64 %0, 15881483390550016
+  %1 = shl i64 %i0, 25
+  %and3 = and i64 %1, 2473599172608
+  %or4 = or i64 %and, %and3
+  ret i64 %or4
+
+; CHECK-LABEL: @test10
+; CHECK-DAG: lis [[REG1:[0-9]+]], 1
+; CHECK-DAG: rldicl [[REG6:[0-9]+]], 3, 25, 0
+; CHECK-DAG: rldicl [[REG7:[0-9]+]], 3, 37, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 8183
+; CHECK-DAG: ori [[REG3:[0-9]+]], [[REG1]], 50017
+; CHECK-DAG: sldi [[REG4:[0-9]+]], [[REG2]], 25
+; CHECK-DAG: sldi [[REG5:[0-9]+]], [[REG3]], 37
+; CHECK-DAG: and [[REG8:[0-9]+]], [[REG6]], [[REG4]]
+; CHECK-DAG: and [[REG9:[0-9]+]], [[REG7]], [[REG5]]
+; CHECK: or 3, [[REG9]], [[REG8]]
+; CHECK: blr
+}
+
+define i64 @test11(i64 %x) #0 {
+entry:
+  %and = and i64 %x, 4294967295
+  %shl = shl i64 %x, 32
+  %or = or i64 %and, %shl
+  ret i64 %or
+
+; CHECK-LABEL: @test11
+; CHECK: rlwinm 3, 3, 0, 1, 0
+; CHECK: blr
+}
+
+define i64 @test12(i64 %x) #0 {
+entry:
+  %and = and i64 %x, 4294905855
+  %shl = shl i64 %x, 32
+  %or = or i64 %and, %shl
+  ret i64 %or
+
+; CHECK-LABEL: @test12
+; CHECK: rlwinm 3, 3, 0, 20, 15
+; CHECK: blr
+}
+
+define i64 @test13(i64 %x) #0 {
+entry:
+  %shl = shl i64 %x, 4
+  %and = and i64 %shl, 240
+  %shr = lshr i64 %x, 28
+  %and1 = and i64 %shr, 15
+  %or = or i64 %and, %and1
+  ret i64 %or
+
+; CHECK-LABEL: @test13
+; CHECK: rlwinm 3, 3, 4, 24, 31
+; CHECK: blr
+}
+
+define i64 @test14(i64 %x) #0 {
+entry:
+  %shl = shl i64 %x, 4
+  %and = and i64 %shl, 240
+  %shr = lshr i64 %x, 28
+  %and1 = and i64 %shr, 15
+  %and2 = and i64 %x, -4294967296
+  %or = or i64 %and1, %and2
+  %or3 = or i64 %or, %and
+  ret i64 %or3
+
+; CHECK-LABEL: @test14
+; CHECK: rldicr [[REG1:[0-9]+]], 3, 0, 31
+; CHECK: rlwimi [[REG1]], 3, 4, 24, 31
+; CHECK: mr 3, [[REG1]]
+; CHECK: blr
+}
+
+define i64 @test15(i64 %x) #0 {
+entry:
+  %shl = shl i64 %x, 4
+  %and = and i64 %shl, 240
+  %shr = lshr i64 %x, 28
+  %and1 = and i64 %shr, 15
+  %and2 = and i64 %x, -256
+  %or = or i64 %and1, %and2
+  %or3 = or i64 %or, %and
+  ret i64 %or3
+
+; CHECK-LABEL: @test15
+; CHECK: rlwimi 3, 3, 4, 24, 31
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.bswap.i32(i32) #0
+declare i64 @llvm.bswap.i64(i64) #0
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/cc.ll b/test/CodeGen/PowerPC/cc.ll
index f92121b..c23ee7c 100644
--- a/test/CodeGen/PowerPC/cc.ll
+++ b/test/CodeGen/PowerPC/cc.ll
@@ -41,7 +41,7 @@ entry:
   br label %foo
 
 foo:
-  call { i64, i64 } asm sideeffect "sc", "={r0},={r3},{r0},~{cc}" (i64 %a)
+  call { i64, i64 } asm sideeffect "sc", "={r0},={r3},{r0},~{cc},~{cr1},~{cr2},~{cr3},~{cr4},~{cr5},~{cr6},~{cr7}" (i64 %a)
   br i1 %c, label %bar, label %end
 
 bar:
diff --git a/test/CodeGen/PowerPC/cmpb-ppc32.ll b/test/CodeGen/PowerPC/cmpb-ppc32.ll
new file mode 100644
index 0000000..639ed88
--- /dev/null
+++ b/test/CodeGen/PowerPC/cmpb-ppc32.ll
@@ -0,0 +1,50 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define zeroext i16 @test16(i16 zeroext %x, i16 zeroext %y) #0 {
+entry:
+  %0 = xor i16 %y, %x
+  %1 = and i16 %0, 255
+  %cmp = icmp eq i16 %1, 0
+  %cmp20 = icmp ult i16 %0, 256
+  %conv25 = select i1 %cmp, i32 255, i32 0
+  %conv27 = select i1 %cmp20, i32 65280, i32 0
+  %or = or i32 %conv25, %conv27
+  %conv29 = trunc i32 %or to i16
+  ret i16 %conv29
+
+; CHECK-LABEL: @test16
+; CHECK: cmpb [[REG1:[0-9]+]], 4, 3
+; CHECK: rlwinm 3, [[REG1]], 0, 16, 31
+; CHECK: blr
+}
+
+define i32 @test32(i32 %x, i32 %y) #0 {
+entry:
+  %0 = xor i32 %y, %x
+  %1 = and i32 %0, 255
+  %cmp = icmp eq i32 %1, 0
+  %2 = and i32 %0, 65280
+  %cmp28 = icmp eq i32 %2, 0
+  %3 = and i32 %0, 16711680
+  %cmp34 = icmp eq i32 %3, 0
+  %cmp40 = icmp ult i32 %0, 16777216
+  %conv44 = select i1 %cmp, i32 255, i32 0
+  %conv45 = select i1 %cmp28, i32 65280, i32 0
+  %conv47 = select i1 %cmp34, i32 16711680, i32 0
+  %conv50 = select i1 %cmp40, i32 -16777216, i32 0
+  %or = or i32 %conv45, %conv50
+  %or49 = or i32 %or, %conv44
+  %or52 = or i32 %or49, %conv47
+  ret i32 %or52
+
+; CHECK-LABEL: @test32
+; CHECK: cmpb 3, 4, 3
+; CHECK-NOT: rlwinm
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/cmpb.ll b/test/CodeGen/PowerPC/cmpb.ll
new file mode 100644
index 0000000..7d0c0ab
--- /dev/null
+++ b/test/CodeGen/PowerPC/cmpb.ll
@@ -0,0 +1,204 @@
+; RUN: llc -mcpu pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define zeroext i16 @test16(i16 zeroext %x, i16 zeroext %y) #0 {
+entry:
+  %0 = xor i16 %y, %x
+  %1 = and i16 %0, 255
+  %cmp = icmp eq i16 %1, 0
+  %cmp20 = icmp ult i16 %0, 256
+  %conv25 = select i1 %cmp, i32 255, i32 0
+  %conv27 = select i1 %cmp20, i32 65280, i32 0
+  %or = or i32 %conv25, %conv27
+  %conv29 = trunc i32 %or to i16
+  ret i16 %conv29
+
+; CHECK-LABEL: @test16
+; CHECK: cmpb [[REG1:[0-9]+]], 4, 3
+; CHECK: rldicl 3, [[REG1]], 0, 48
+; CHECK: blr
+}
+
+define zeroext i16 @test16p1(i16 zeroext %x, i16 zeroext %y) #0 {
+entry:
+  %0 = xor i16 %y, %x
+  %1 = and i16 %0, 255
+  %cmp = icmp eq i16 %1, 0
+  %cmp20 = icmp ult i16 %0, 256
+  %conv28 = select i1 %cmp, i32 5, i32 0
+  %conv30 = select i1 %cmp20, i32 65280, i32 0
+  %or = or i32 %conv28, %conv30
+  %conv32 = trunc i32 %or to i16
+  ret i16 %conv32
+
+; CHECK-LABEL: @test16p1
+; CHECK: cmpb [[REG1:[0-9]+]], 4, 3
+; CHECK: andi. 3, [[REG1]], 65285
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define zeroext i16 @test16p2(i16 zeroext %x, i16 zeroext %y) #0 {
+entry:
+  %0 = xor i16 %y, %x
+  %1 = and i16 %0, 255
+  %cmp = icmp eq i16 %1, 0
+  %cmp20 = icmp ult i16 %0, 256
+  %conv28 = select i1 %cmp, i32 255, i32 0
+  %conv30 = select i1 %cmp20, i32 1280, i32 0
+  %or = or i32 %conv28, %conv30
+  %conv32 = trunc i32 %or to i16
+  ret i16 %conv32
+
+; CHECK-LABEL: @test16p2
+; CHECK: cmpb [[REG1:[0-9]+]], 4, 3
+; CHECK: andi. 3, [[REG1]], 1535
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define zeroext i16 @test16p3(i16 zeroext %x, i16 zeroext %y) #0 {
+entry:
+  %0 = xor i16 %y, %x
+  %1 = and i16 %0, 255
+  %cmp = icmp eq i16 %1, 0
+  %cmp20 = icmp ult i16 %0, 256
+  %conv27 = select i1 %cmp, i32 255, i32 0
+  %conv29 = select i1 %cmp20, i32 1024, i32 1280
+  %or = or i32 %conv27, %conv29
+  %conv31 = trunc i32 %or to i16
+  ret i16 %conv31
+
+; CHECK-LABEL: @test16p3
+; CHECK: cmpb [[REG1:[0-9]+]], 4, 3
+; CHECK: rldicl [[REG2:[0-9]+]], [[REG1]], 0, 55
+; CHECK: xori 3, [[REG2]], 1280
+; CHECK: blr
+}
+
+define zeroext i32 @test32(i32 zeroext %x, i32 zeroext %y) #0 {
+entry:
+  %0 = xor i32 %y, %x
+  %1 = and i32 %0, 255
+  %cmp = icmp eq i32 %1, 0
+  %2 = and i32 %0, 65280
+  %cmp28 = icmp eq i32 %2, 0
+  %3 = and i32 %0, 16711680
+  %cmp34 = icmp eq i32 %3, 0
+  %cmp40 = icmp ult i32 %0, 16777216
+  %conv44 = select i1 %cmp, i32 255, i32 0
+  %conv45 = select i1 %cmp28, i32 65280, i32 0
+  %conv47 = select i1 %cmp34, i32 16711680, i32 0
+  %conv50 = select i1 %cmp40, i32 -16777216, i32 0
+  %or = or i32 %conv45, %conv50
+  %or49 = or i32 %or, %conv44
+  %or52 = or i32 %or49, %conv47
+  ret i32 %or52
+
+; CHECK-LABEL: @test32
+; CHECK: cmpb [[REG1:[0-9]+]], 4, 3
+; CHECK: rldicl 3, [[REG1]], 0, 32
+; CHECK: blr
+}
+
+define zeroext i32 @test32p1(i32 zeroext %x, i32 zeroext %y) #0 {
+entry:
+  %0 = xor i32 %y, %x
+  %1 = and i32 %0, 255
+  %cmp = icmp eq i32 %1, 0
+  %2 = and i32 %0, 65280
+  %cmp28 = icmp eq i32 %2, 0
+  %3 = and i32 %0, 16711680
+  %cmp34 = icmp eq i32 %3, 0
+  %cmp40 = icmp ult i32 %0, 16777216
+  %conv47 = select i1 %cmp, i32 255, i32 0
+  %conv48 = select i1 %cmp28, i32 65280, i32 0
+  %conv50 = select i1 %cmp34, i32 458752, i32 0
+  %conv53 = select i1 %cmp40, i32 -16777216, i32 0
+  %or = or i32 %conv48, %conv53
+  %or52 = or i32 %or, %conv47
+  %or55 = or i32 %or52, %conv50
+  ret i32 %or55
+
+; CHECK-LABEL: @test32p1
+; CHECK: li [[REG1:[0-9]+]], 0
+; CHECK: cmpb [[REG4:[0-9]+]], 4, 3
+; CHECK: oris [[REG2:[0-9]+]], [[REG1]], 65287
+; CHECK: ori [[REG3:[0-9]+]], [[REG2]], 65535
+; CHECK: and 3, [[REG4]], [[REG3]]
+; CHECK: blr
+}
+
+define zeroext i32 @test32p2(i32 zeroext %x, i32 zeroext %y) #0 {
+entry:
+  %0 = xor i32 %y, %x
+  %1 = and i32 %0, 255
+  %cmp = icmp eq i32 %1, 0
+  %2 = and i32 %0, 65280
+  %cmp22 = icmp eq i32 %2, 0
+  %cmp28 = icmp ult i32 %0, 16777216
+  %conv32 = select i1 %cmp, i32 255, i32 0
+  %conv33 = select i1 %cmp22, i32 65280, i32 0
+  %conv35 = select i1 %cmp28, i32 -16777216, i32 0
+  %or = or i32 %conv33, %conv35
+  %or37 = or i32 %or, %conv32
+  ret i32 %or37
+
+; CHECK-LABEL: @test32p2
+; CHECK: li [[REG1:[0-9]+]], 0
+; CHECK: cmpb [[REG4:[0-9]+]], 4, 3
+; CHECK: oris [[REG2:[0-9]+]], [[REG1]], 65280
+; CHECK: ori [[REG3:[0-9]+]], [[REG2]], 65535
+; CHECK: and 3, [[REG4]], [[REG3]]
+; CHECK: blr
+}
+
+define i64 @test64(i64 %x, i64 %y) #0 {
+entry:
+  %shr19 = lshr i64 %x, 56
+  %conv21 = trunc i64 %shr19 to i32
+  %shr43 = lshr i64 %y, 56
+  %conv45 = trunc i64 %shr43 to i32
+  %0 = xor i64 %y, %x
+  %1 = and i64 %0, 255
+  %cmp = icmp eq i64 %1, 0
+  %2 = and i64 %0, 65280
+  %cmp52 = icmp eq i64 %2, 0
+  %3 = and i64 %0, 16711680
+  %cmp58 = icmp eq i64 %3, 0
+  %4 = and i64 %0, 4278190080
+  %cmp64 = icmp eq i64 %4, 0
+  %5 = and i64 %0, 1095216660480
+  %cmp70 = icmp eq i64 %5, 0
+  %6 = and i64 %0, 280375465082880
+  %cmp76 = icmp eq i64 %6, 0
+  %7 = and i64 %0, 71776119061217280
+  %cmp82 = icmp eq i64 %7, 0
+  %cmp88 = icmp eq i32 %conv21, %conv45
+  %conv92 = select i1 %cmp, i64 255, i64 0
+  %conv93 = select i1 %cmp52, i64 65280, i64 0
+  %or = or i64 %conv92, %conv93
+  %conv95 = select i1 %cmp58, i64 16711680, i64 0
+  %or97 = or i64 %or, %conv95
+  %conv98 = select i1 %cmp64, i64 4278190080, i64 0
+  %or100 = or i64 %or97, %conv98
+  %conv101 = select i1 %cmp70, i64 1095216660480, i64 0
+  %or103 = or i64 %or100, %conv101
+  %conv104 = select i1 %cmp76, i64 280375465082880, i64 0
+  %or106 = or i64 %or103, %conv104
+  %conv107 = select i1 %cmp82, i64 71776119061217280, i64 0
+  %or109 = or i64 %or106, %conv107
+  %conv110 = select i1 %cmp88, i64 -72057594037927936, i64 0
+  %or112 = or i64 %or109, %conv110
+  ret i64 %or112
+
+; CHECK-LABEL: @test64
+; CHECK: cmpb 3, 3, 4
+; CHECK-NOT: rldicl
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/code-align.ll b/test/CodeGen/PowerPC/code-align.ll
new file mode 100644
index 0000000..c6ec37f
--- /dev/null
+++ b/test/CodeGen/PowerPC/code-align.ll
@@ -0,0 +1,110 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s -check-prefix=GENERIC
+; RUN: llc -mcpu=970 < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=a2 < %s | FileCheck %s -check-prefix=BASIC
+; RUN: llc -mcpu=e500mc < %s | FileCheck %s -check-prefix=BASIC
+; RUN: llc -mcpu=e5500 < %s | FileCheck %s -check-prefix=BASIC
+; RUN: llc -mcpu=pwr4 < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr5 < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr5x < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr6 < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr6x < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PWR
+; RUN: llc -mcpu=pwr8 < %s | FileCheck %s -check-prefix=PWR
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define signext i32 @foo(i32 signext %x) #0 {
+entry:
+  %mul = shl nsw i32 %x, 1
+  ret i32 %mul
+
+; GENERIC-LABEL: .globl  foo
+; BASIC-LABEL: .globl  foo
+; PWR-LABEL: .globl  foo
+; GENERIC: .align  2
+; BASIC: .align  4
+; PWR: .align  4
+; GENERIC: @foo
+; BASIC: @foo
+; PWR: @foo
+}
+
+; Function Attrs: nounwind
+define void @loop(i32 signext %x, i32* nocapture %a) #1 {
+entry:
+  br label %vector.body
+
+; GENERIC-LABEL: @loop
+; BASIC-LABEL: @loop
+; PWR-LABEL: @loop
+; GENERIC: mtctr
+; BASIC: mtctr
+; PWR: mtctr
+; GENERIC-NOT: .align
+; BASIC: .align  4
+; PWR: .align  4
+; GENERIC: lwzu
+; BASIC: lwzu
+; PWR: lwzu
+; GENERIC: bdnz
+; BASIC: bdnz
+; PWR: bdnz
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %induction45 = or i64 %index, 1
+  %0 = getelementptr inbounds i32* %a, i64 %index
+  %1 = getelementptr inbounds i32* %a, i64 %induction45
+  %2 = load i32* %0, align 4
+  %3 = load i32* %1, align 4
+  %4 = add nsw i32 %2, 4
+  %5 = add nsw i32 %3, 4
+  %6 = mul nsw i32 %4, 3
+  %7 = mul nsw i32 %5, 3
+  store i32 %6, i32* %0, align 4
+  store i32 %7, i32* %1, align 4
+  %index.next = add i64 %index, 2
+  %8 = icmp eq i64 %index.next, 2048
+  br i1 %8, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @sloop(i32 signext %x, i32* nocapture %a) #1 {
+entry:
+  br label %for.body
+
+; GENERIC-LABEL: @sloop
+; BASIC-LABEL: @sloop
+; PWR-LABEL: @sloop
+; GENERIC: mtctr
+; BASIC: mtctr
+; PWR: mtctr
+; GENERIC-NOT: .align
+; BASIC: .align  4
+; PWR: .align  5
+; GENERIC: bdnz
+; BASIC: bdnz
+; PWR: bdnz
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 4
+  %mul = mul nsw i32 %add, 3
+  store i32 %mul, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 2048
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/constants-i64.ll b/test/CodeGen/PowerPC/constants-i64.ll
new file mode 100644
index 0000000..5f2815e
--- /dev/null
+++ b/test/CodeGen/PowerPC/constants-i64.ll
@@ -0,0 +1,84 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define i64 @cn1() #0 {
+entry:
+  ret i64 281474976710655
+
+; CHECK-LABEL: @cn1
+; CHECK: lis [[REG1:[0-9]+]], -1
+; CHECK: rldicr 3, [[REG1]], 48, 63
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define i64 @cnb() #0 {
+entry:
+  ret i64 281474976710575
+
+; CHECK-LABEL: @cnb
+; CHECK: lis [[REG1:[0-9]+]], -81
+; CHECK: rldicr 3, [[REG1]], 48, 63
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define i64 @f2(i64 %x) #0 {
+entry:
+  ret i64 -68719476736
+
+; CHECK-LABEL: @f2
+; CHECK: li [[REG1:[0-9]+]], -1
+; CHECK: sldi 3, [[REG1]], 36
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define i64 @f2a(i64 %x) #0 {
+entry:
+  ret i64 -361850994688
+
+; CHECK-LABEL: @f2a
+; CHECK: li [[REG1:[0-9]+]], -337
+; CHECK: sldi 3, [[REG1]], 30
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define i64 @f2n(i64 %x) #0 {
+entry:
+  ret i64 68719476735
+
+; CHECK-LABEL: @f2n
+; CHECK: lis [[REG1:[0-9]+]], -4096
+; CHECK: rldicr 3, [[REG1]], 36, 63
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define i64 @f3(i64 %x) #0 {
+entry:
+  ret i64 8589934591
+
+; CHECK-LABEL: @f3
+; CHECK: lis [[REG1:[0-9]+]], -32768
+; CHECK: rldicr 3, [[REG1]], 33, 63
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define i64 @cn2n() #0 {
+entry:
+  ret i64 -1407374887747585
+
+; CHECK-LABEL: @cn2n
+; CHECK: lis [[REG1:[0-9]+]], -5121
+; CHECK: ori [[REG2:[0-9]+]], [[REG1]], 65534
+; CHECK: rldicr 3, [[REG2]], 22, 63
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/crsave.ll b/test/CodeGen/PowerPC/crsave.ll
index a9b4b36..602ba94 100644
--- a/test/CodeGen/PowerPC/crsave.ll
+++ b/test/CodeGen/PowerPC/crsave.ll
@@ -6,7 +6,7 @@ declare void @foo()
 define i32 @test_cr2() nounwind uwtable {
 entry:
   %ret = alloca i32, align 4
-  %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmp 2,$2,$1\0A\09mfcr $0", "=r,r,r,r,r,~{cr2}"(i32 1, i32 2, i32 3, i32 0) nounwind
+  %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmpw 2,$2,$1\0A\09mfcr $0", "=r,r,r,r,r,~{cr2}"(i32 1, i32 2, i32 3, i32 0) nounwind
   store i32 %0, i32* %ret, align 4
   call void @foo()
   %1 = load i32* %ret, align 4
@@ -35,7 +35,7 @@ entry:
 define i32 @test_cr234() nounwind {
 entry:
   %ret = alloca i32, align 4
-  %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmp 2,$2,$1\0A\09cmp 3,$2,$2\0A\09cmp 4,$2,$3\0A\09mfcr $0", "=r,r,r,r,r,~{cr2},~{cr3},~{cr4}"(i32 1, i32 2, i32 3, i32 0) nounwind
+  %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmpw 2,$2,$1\0A\09cmpw 3,$2,$2\0A\09cmpw 4,$2,$3\0A\09mfcr $0", "=r,r,r,r,r,~{cr2},~{cr3},~{cr4}"(i32 1, i32 2, i32 3, i32 0) nounwind
   store i32 %0, i32* %ret, align 4
   call void @foo()
   %1 = load i32* %ret, align 4
diff --git a/test/CodeGen/PowerPC/ctrloops.ll b/test/CodeGen/PowerPC/ctrloops.ll
index ca00f68..ccab7cb 100644
--- a/test/CodeGen/PowerPC/ctrloops.ll
+++ b/test/CodeGen/PowerPC/ctrloops.ll
@@ -1,6 +1,6 @@
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-freebsd10.0"
-; RUN: llc < %s -march=ppc64 | FileCheck %s
+; RUN: llc < %s -march=ppc64 -relocation-model=pic | FileCheck %s
 
 @a = common global i32 0, align 4
 
@@ -73,3 +73,26 @@ for.end:                                          ; preds = %for.body, %entry
 ; CHECK-NOT: cmplwi
 ; CHECK: bdnz
 }
+
+@tls_var = external thread_local global i8
+
+define i32 @test4() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %phi = phi i32 [ %dec, %for.body ], [ undef, %entry ]
+  %load = ptrtoint i8* @tls_var to i32
+  %dec = add i32 %phi, -1
+  %cmp = icmp sgt i32 %phi, 1
+  br i1 %cmp, label %for.body, label %return
+
+return:                                           ; preds = %for.body
+  ret i32 %load
+; CHECK-LABEL: @test4
+; CHECK-NOT: mtctr
+; CHECK: addi {{[0-9]+}}
+; CHECK: cmpwi
+; CHECK-NOT: bdnz
+; CHECK: bgt
+}
diff --git a/test/CodeGen/PowerPC/dbg.ll b/test/CodeGen/PowerPC/dbg.ll
index 04338a6..bd15367 100644
--- a/test/CodeGen/PowerPC/dbg.ll
+++ b/test/CodeGen/PowerPC/dbg.ll
@@ -6,8 +6,8 @@ target triple = "powerpc64-unknown-linux-gnu"
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readnone {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !17
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 %argc, i64 0, metadata !15, metadata !{!"0x102"}), !dbg !17
+  tail call void @llvm.dbg.value(metadata i8** %argv, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !18
   %add = add nsw i32 %argc, 1, !dbg !19
   ret i32 %add, !dbg !19
 }
@@ -17,23 +17,23 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.1\001\00\000\00\000", metadata !21, metadata !1, metadata !1, metadata !3, metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\00256\001\000", metadata !21, null, metadata !7, null, i32 (i32, i8**)* @main, null, null, metadata !13} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !21} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !9, metadata !10}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{metadata !"0x24\00char\000\008\008\000\000\008", null, null} ; [ DW_TAG_base_type ]
-!13 = metadata !{metadata !15, metadata !16}
-!15 = metadata !{metadata !"0x101\00argc\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{metadata !"0x101\00argv\0033554433\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 1, i32 14, metadata !5, null}
-!18 = metadata !{i32 1, i32 26, metadata !5, null}
-!19 = metadata !{i32 2, i32 3, metadata !20, null}
-!20 = metadata !{metadata !"0xb\001\0034\000", metadata !21, metadata !5} ; [ DW_TAG_lexical_block ]
-!21 = metadata !{metadata !"dbg.c", metadata !"/src"}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.1\001\00\000\00\000", !21, !1, !1, !3, !1, !""} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00main\00main\00\001\000\001\000\006\00256\001\000", !21, null, !7, null, i32 (i32, i8**)* @main, null, null, !13} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !21} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !9, !10}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ]
+!11 = !{!"0xf\00\000\0064\0064\000\000", null, null, !12} ; [ DW_TAG_pointer_type ]
+!12 = !{!"0x24\00char\000\008\008\000\000\008", null, null} ; [ DW_TAG_base_type ]
+!13 = !{!15, !16}
+!15 = !{!"0x101\00argc\0016777217\000", !5, !6, !9} ; [ DW_TAG_arg_variable ]
+!16 = !{!"0x101\00argv\0033554433\000", !5, !6, !10} ; [ DW_TAG_arg_variable ]
+!17 = !MDLocation(line: 1, column: 14, scope: !5)
+!18 = !MDLocation(line: 1, column: 26, scope: !5)
+!19 = !MDLocation(line: 2, column: 3, scope: !20)
+!20 = !{!"0xb\001\0034\000", !21, !5} ; [ DW_TAG_lexical_block ]
+!21 = !{!"dbg.c", !"/src"}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/PowerPC/early-ret2.ll b/test/CodeGen/PowerPC/early-ret2.ll
index 1784777..f9758d3 100644
--- a/test/CodeGen/PowerPC/early-ret2.ll
+++ b/test/CodeGen/PowerPC/early-ret2.ll
@@ -25,5 +25,5 @@ while.end:                                        ; preds = %while.body, %while.
 
 attributes #0 = { noinline nounwind }
 
-!0 = metadata !{}
+!0 = !{}
 
diff --git a/test/CodeGen/PowerPC/fast-isel-const.ll b/test/CodeGen/PowerPC/fast-isel-const.ll
new file mode 100644
index 0000000..1057d0a
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-const.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s --check-prefix=ELF64
+
+define zeroext i1 @testi1(i8 %in) nounwind uwtable ssp {
+entry:
+  %c = icmp eq i8 %in, 5
+  br i1 %c, label %true, label %false
+
+; ELF64-LABEL: @testi1
+
+true:
+  br label %end
+
+; ELF64-NOT: li {{[0-9]+}}, -1
+; ELF64: li {{[0-9]+}}, 1
+
+false:
+  br label %end
+
+; ELF64: li {{[0-9]+}}, 0
+
+end:
+  %r = phi i1 [ 0, %false], [ 1, %true ]
+  ret i1 %r
+
+; ELF64: blr
+}
+
diff --git a/test/CodeGen/PowerPC/fdiv-combine.ll b/test/CodeGen/PowerPC/fdiv-combine.ll
new file mode 100644
index 0000000..d3dc3fe
--- /dev/null
+++ b/test/CodeGen/PowerPC/fdiv-combine.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Following test case checks:
+;   a / D; b / D; c / D;
+;                =>
+;   recip = 1.0 / D; a * recip; b * recip; c * recip;
+
+define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
+; CHECK-LABEL: three_fdiv_double:
+; CHECK: fdiv
+; CHECK-NEXT-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv double %a, %D
+  %div1 = fdiv double %b, %D
+  %div2 = fdiv double %c, %D
+  tail call void @foo_3d(double %div, double %div1, double %div2)
+  ret void
+}
+
+define void @two_fdiv_double(double %D, double %a, double %b) #0 {
+; CHECK-LABEL: two_fdiv_double:
+; CHECK: fdiv
+; CHECK: fdiv
+; CHECK-NEXT-NOT: fmul
+  %div = fdiv double %a, %D
+  %div1 = fdiv double %b, %D
+  tail call void @foo_2d(double %div, double %div1)
+  ret void
+}
+
+declare void @foo_3d(double, double, double)
+declare void @foo_3_2xd(<2 x double>, <2 x double>, <2 x double>)
+declare void @foo_2d(double, double)
+
+attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/PowerPC/flt-preinc.ll b/test/CodeGen/PowerPC/flt-preinc.ll
new file mode 100644
index 0000000..dd17031
--- /dev/null
+++ b/test/CodeGen/PowerPC/flt-preinc.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readonly
+define float @tf(float* nocapture readonly %i, i32 signext %o) #0 {
+entry:
+  %idx.ext = sext i32 %o to i64
+  %add.ptr = getelementptr inbounds float* %i, i64 %idx.ext
+  %0 = load float* %add.ptr, align 4
+  %add.ptr.sum = add nsw i64 %idx.ext, 1
+  %add.ptr3 = getelementptr inbounds float* %i, i64 %add.ptr.sum
+  %1 = load float* %add.ptr3, align 4
+  %add = fadd float %0, %1
+  ret float %add
+
+; CHECK-LABEL: @tf
+; CHECK: lfsux
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readonly
+define double @td(double* nocapture readonly %i, i32 signext %o) #0 {
+entry:
+  %idx.ext = sext i32 %o to i64
+  %add.ptr = getelementptr inbounds double* %i, i64 %idx.ext
+  %0 = load double* %add.ptr, align 8
+  %add.ptr.sum = add nsw i64 %idx.ext, 1
+  %add.ptr3 = getelementptr inbounds double* %i, i64 %add.ptr.sum
+  %1 = load double* %add.ptr3, align 8
+  %add = fadd double %0, %1
+  ret double %add
+
+; CHECK-LABEL: @td
+; CHECK: lfdux
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readonly }
+
diff --git a/test/CodeGen/PowerPC/fma-assoc.ll b/test/CodeGen/PowerPC/fma-assoc.ll
new file mode 100644
index 0000000..dc1316e
--- /dev/null
+++ b/test/CodeGen/PowerPC/fma-assoc.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 | FileCheck -check-prefix=CHECK-VSX %s
+
+define double @test_FMADD_ASSOC1(double %A, double %B, double %C,
+                                 double %D, double %E) {
+	%F = fmul double %A, %B         ; <double> [#uses=1]
+	%G = fmul double %C, %D         ; <double> [#uses=1]
+	%H = fadd double %F, %G         ; <double> [#uses=1]
+	%I = fadd double %H, %E         ; <double> [#uses=1]
+	ret double %I
+; CHECK-LABEL: test_FMADD_ASSOC1:
+; CHECK: fmadd
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FMADD_ASSOC1:
+; CHECK-VSX: xsmaddmdp
+; CHECK-VSX-NEXT: xsmaddadp
+; CHECK-VSX-NEXT: fmr
+; CHECK-VSX-NEXT: blr
+}
+
+define double @test_FMADD_ASSOC2(double %A, double %B, double %C,
+                                 double %D, double %E) {
+	%F = fmul double %A, %B         ; <double> [#uses=1]
+	%G = fmul double %C, %D         ; <double> [#uses=1]
+	%H = fadd double %F, %G         ; <double> [#uses=1]
+	%I = fadd double %E, %H         ; <double> [#uses=1]
+	ret double %I
+; CHECK-LABEL: test_FMADD_ASSOC2:
+; CHECK: fmadd
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FMADD_ASSOC2:
+; CHECK-VSX: xsmaddmdp
+; CHECK-VSX-NEXT: xsmaddadp
+; CHECK-VSX-NEXT: fmr
+; CHECK-VSX-NEXT: blr
+}
+
+define double @test_FMSUB_ASSOC1(double %A, double %B, double %C,
+                                 double %D, double %E) {
+	%F = fmul double %A, %B         ; <double> [#uses=1]
+	%G = fmul double %C, %D         ; <double> [#uses=1]
+	%H = fadd double %F, %G         ; <double> [#uses=1]
+	%I = fsub double %H, %E         ; <double> [#uses=1]
+	ret double %I
+; CHECK-LABEL: test_FMSUB_ASSOC1:
+; CHECK: fmsub
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FMSUB_ASSOC1:
+; CHECK-VSX: xsmsubmdp
+; CHECK-VSX-NEXT: xsmaddadp
+; CHECK-VSX-NEXT: fmr
+; CHECK-VSX-NEXT: blr
+}
+
+define double @test_FMSUB_ASSOC2(double %A, double %B, double %C,
+                                 double %D, double %E) {
+	%F = fmul double %A, %B         ; <double> [#uses=1]
+	%G = fmul double %C, %D         ; <double> [#uses=1]
+	%H = fadd double %F, %G         ; <double> [#uses=1]
+	%I = fsub double %E, %H         ; <double> [#uses=1]
+	ret double %I
+; CHECK-LABEL: test_FMSUB_ASSOC2:
+; CHECK: fnmsub
+; CHECK-NEXT: fnmsub
+; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FMSUB_ASSOC2:
+; CHECK-VSX: xsnmsubmdp
+; CHECK-VSX-NEXT: xsnmsubadp
+; CHECK-VSX-NEXT: fmr
+; CHECK-VSX-NEXT: blr
+}
+
diff --git a/test/CodeGen/PowerPC/fma-ext.ll b/test/CodeGen/PowerPC/fma-ext.ll
new file mode 100644
index 0000000..56825ce
--- /dev/null
+++ b/test/CodeGen/PowerPC/fma-ext.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 | FileCheck -check-prefix=CHECK-VSX %s
+
+define double @test_FMADD_EXT1(float %A, float %B, double %C) {
+    %D = fmul float %A, %B          ; <float> [#uses=1]
+    %E = fpext float %D to double   ; <double> [#uses=1]
+    %F = fadd double %E, %C         ; <double> [#uses=1]
+    ret double %F
+; CHECK-LABEL: test_FMADD_EXT1:
+; CHECK: fmadd
+; CHECK-NEXT: blr
+                                
+; CHECK-VSX-LABEL: test_FMADD_EXT1:
+; CHECK-VSX: xsmaddmdp
+; CHECK-VSX-NEXT: blr
+}
+
+define double @test_FMADD_EXT2(float %A, float %B, double %C) {
+    %D = fmul float %A, %B          ; <float> [#uses=1]
+    %E = fpext float %D to double   ; <double> [#uses=1]
+    %F = fadd double %C, %E         ; <double> [#uses=1]
+    ret double %F
+; CHECK-LABEL: test_FMADD_EXT2:
+; CHECK: fmadd
+; CHECK-NEXT: blr
+                                
+; CHECK-VSX-LABEL: test_FMADD_EXT2:
+; CHECK-VSX: xsmaddmdp
+; CHECK-VSX-NEXT: blr
+}
+
+define double @test_FMSUB_EXT1(float %A, float %B, double %C) {
+    %D = fmul float %A, %B          ; <float> [#uses=1]
+    %E = fpext float %D to double   ; <double> [#uses=1]
+    %F = fsub double %E, %C         ; <double> [#uses=1]
+    ret double %F
+; CHECK-LABEL: test_FMSUB_EXT1:
+; CHECK: fmsub
+; CHECK-NEXT: blr
+                                
+; CHECK-VSX-LABEL: test_FMSUB_EXT1:
+; CHECK-VSX: xsmsubmdp
+; CHECK-VSX-NEXT: blr
+}
+
+define double @test_FMSUB_EXT2(float %A, float %B, double %C) {
+    %D = fmul float %A, %B          ; <float> [#uses=1]
+    %E = fpext float %D to double   ; <double> [#uses=1]
+    %F = fsub double %C, %E         ; <double> [#uses=1]
+    ret double %F
+; CHECK-LABEL: test_FMSUB_EXT2:
+; CHECK: fnmsub
+; CHECK-NEXT: blr
+                                
+; CHECK-VSX-LABEL: test_FMSUB_EXT2:
+; CHECK-VSX: xsnmsubmdp
+; CHECK-VSX-NEXT: fmr
+; CHECK-VSX-NEXT: blr
+}
+
+define double @test_FMSUB_EXT3(float %A, float %B, double %C) {
+    %D = fmul float %A, %B          ; <float> [#uses=1]
+    %E = fsub float -0.000000e+00, %D ;		<float> [#uses=1]
+    %F = fpext float %E to double   ; <double> [#uses=1]
+    %G = fsub double %F, %C         ; <double> [#uses=1]
+    ret double %G
+; CHECK-LABEL: test_FMSUB_EXT3:
+; CHECK: fneg
+; CHECK-NEXT: fmsub
+; CHECK-NEXT: blr
+                                
+; CHECK-VSX-LABEL: test_FMSUB_EXT3:
+; CHECK-VSX: xsnegdp
+; CHECK-VSX-NEXT: xsmsubmdp
+; CHECK-VSX-NEXT: blr
+}
+    
+define double @test_FMSUB_EXT4(float %A, float %B, double %C) {
+    %D = fmul float %A, %B          ; <float> [#uses=1]
+    %E = fpext float %D to double   ; <double> [#uses=1]
+    %F = fsub double -0.000000e+00, %E ;		<double> [#uses=1]
+    %G = fsub double %F, %C         ; <double> [#uses=1]
+    ret double %G
+; CHECK-LABEL: test_FMSUB_EXT4:
+; CHECK: fneg
+; CHECK-NEXT: fmsub
+; CHECK-NEXT: blr
+                                
+; CHECK-VSX-LABEL: test_FMSUB_EXT4:
+; CHECK-VSX: xsnegdp
+; CHECK-VSX-NEXT: xsmsubmdp
+; CHECK-VSX-NEXT: blr
+}  
+\ No newline at end of file
diff --git a/test/CodeGen/PowerPC/fp-to-int-ext.ll b/test/CodeGen/PowerPC/fp-to-int-ext.ll
new file mode 100644
index 0000000..bfacd89
--- /dev/null
+++ b/test/CodeGen/PowerPC/fp-to-int-ext.ll
@@ -0,0 +1,69 @@
+; RUN: llc -mcpu=a2 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define double @foo1(i32* %x) #0 {
+entry:
+  %0 = load i32* %x, align 4
+  %conv = sext i32 %0 to i64
+  %conv1 = sitofp i64 %conv to double
+  ret double %conv1
+
+; CHECK-LABEL: @foo1
+; CHECK: lfiwax [[REG1:[0-9]+]], 0, 3
+; CHECK: fcfid 1, [[REG1]]
+; CHECK: blr
+}
+
+define double @foo2(i32* %x) #0 {
+entry:
+  %0 = load i32* %x, align 4
+  %conv = zext i32 %0 to i64
+  %conv1 = sitofp i64 %conv to double
+  ret double %conv1
+
+; CHECK-LABEL: @foo2
+; CHECK: lfiwzx [[REG1:[0-9]+]], 0, 3
+; CHECK: fcfid 1, [[REG1]]
+; CHECK: blr
+}
+
+define double @foo3(i32* %x) #0 {
+entry:
+  %0 = load i32* %x, align 4
+  %1 = add i32 %0, 8
+  %conv = zext i32 %1 to i64
+  %conv1 = sitofp i64 %conv to double
+  ret double %conv1
+
+; CHECK-LABEL: @foo3
+; CHECK-DAG: lwz [[REG1:[0-9]+]], 0(3)
+; CHECK-DAG: addi [[REG3:[0-9]+]], 1,
+; CHECK-DAG: addi [[REG2:[0-9]+]], [[REG1]], 8
+; CHECK-DAG: stw [[REG2]],
+; CHECK: lfiwzx [[REG4:[0-9]+]], 0, [[REG3]]
+; CHECK: fcfid 1, [[REG4]]
+; CHECK: blr
+}
+
+define double @foo4(i32* %x) #0 {
+entry:
+  %0 = load i32* %x, align 4
+  %1 = add i32 %0, 8
+  %conv = sext i32 %1 to i64
+  %conv1 = sitofp i64 %conv to double
+  ret double %conv1
+
+; CHECK-LABEL: @foo4
+; CHECK-DAG: lwz [[REG1:[0-9]+]], 0(3)
+; CHECK-DAG: addi [[REG3:[0-9]+]], 1,
+; CHECK-DAG: addi [[REG2:[0-9]+]], [[REG1]], 8
+; CHECK-DAG: stw [[REG2]],
+; CHECK: lfiwax [[REG4:[0-9]+]], 0, [[REG3]]
+; CHECK: fcfid 1, [[REG4]]
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/fp-to-int-to-fp.ll b/test/CodeGen/PowerPC/fp-to-int-to-fp.ll
new file mode 100644
index 0000000..f56b9b3
--- /dev/null
+++ b/test/CodeGen/PowerPC/fp-to-int-to-fp.ll
@@ -0,0 +1,70 @@
+; RUN: llc -mcpu=a2 < %s | FileCheck %s -check-prefix=FPCVT
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s -check-prefix=PPC64
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define float @fool(float %X) #0 {
+entry:
+  %conv = fptosi float %X to i64
+  %conv1 = sitofp i64 %conv to float
+  ret float %conv1
+
+; FPCVT-LABEL: @fool
+; FPCVT: fctidz [[REG1:[0-9]+]], 1
+; FPCVT: fcfids 1, [[REG1]]
+; FPCVT: blr
+
+; PPC64-LABEL: @fool
+; PPC64: fctidz [[REG1:[0-9]+]], 1
+; PPC64: fcfid [[REG2:[0-9]+]], [[REG1]]
+; PPC64: frsp 1, [[REG2]]
+; PPC64: blr
+}
+
+; Function Attrs: nounwind readnone
+define double @foodl(double %X) #0 {
+entry:
+  %conv = fptosi double %X to i64
+  %conv1 = sitofp i64 %conv to double
+  ret double %conv1
+
+; FPCVT-LABEL: @foodl
+; FPCVT: fctidz [[REG1:[0-9]+]], 1
+; FPCVT: fcfid 1, [[REG1]]
+; FPCVT: blr
+
+; PPC64-LABEL: @foodl
+; PPC64: fctidz [[REG1:[0-9]+]], 1
+; PPC64: fcfid 1, [[REG1]]
+; PPC64: blr
+}
+
+; Function Attrs: nounwind readnone
+define float @fooul(float %X) #0 {
+entry:
+  %conv = fptoui float %X to i64
+  %conv1 = uitofp i64 %conv to float
+  ret float %conv1
+
+; FPCVT-LABEL: @fooul
+; FPCVT: fctiduz [[REG1:[0-9]+]], 1
+; FPCVT: fcfidus 1, [[REG1]]
+; FPCVT: blr
+}
+
+; Function Attrs: nounwind readnone
+define double @fooudl(double %X) #0 {
+entry:
+  %conv = fptoui double %X to i64
+  %conv1 = uitofp i64 %conv to double
+  ret double %conv1
+
+; FPCVT-LABEL: @fooudl
+; FPCVT: fctiduz [[REG1:[0-9]+]], 1
+; FPCVT: fcfidu 1, [[REG1]]
+; FPCVT: blr
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/glob-comp-aa-crash.ll b/test/CodeGen/PowerPC/glob-comp-aa-crash.ll
index f97d0ff..2ea036f 100644
--- a/test/CodeGen/PowerPC/glob-comp-aa-crash.ll
+++ b/test/CodeGen/PowerPC/glob-comp-aa-crash.ll
@@ -130,10 +130,10 @@ attributes #4 = { optsize }
 attributes #5 = { nounwind optsize }
 attributes #6 = { noreturn optsize }
 
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"bool", metadata !1}
-!4 = metadata !{i8 0, i8 2}
-!5 = metadata !{metadata !0, metadata !0, i64 0}
-!6 = metadata !{metadata !3, metadata !3, i64 0}
+!0 = !{!"any pointer", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"bool", !1}
+!4 = !{i8 0, i8 2}
+!5 = !{!0, !0, i64 0}
+!6 = !{!3, !3, i64 0}
diff --git a/test/CodeGen/PowerPC/i1-ext-fold.ll b/test/CodeGen/PowerPC/i1-ext-fold.ll
new file mode 100644
index 0000000..19bd8ff
--- /dev/null
+++ b/test/CodeGen/PowerPC/i1-ext-fold.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define signext i32 @foo(i32 signext %a, i32 signext %b) #0 {
+entry:
+  %cmp = icmp slt i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  %shl = shl nuw nsw i32 %conv, 4
+  ret i32 %shl
+
+; CHECK-LABEL: @foo
+; CHECK-DAG: cmpw
+; CHECK-DAG: li [[REG1:[0-9]+]], 0
+; CHECK-DAG: li [[REG2:[0-9]+]], 16
+; CHECK: isel 3, [[REG2]], [[REG1]],
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @foo2(i32 signext %a, i32 signext %b) #0 {
+entry:
+  %cmp = icmp slt i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  %shl = shl nuw nsw i32 %conv, 4
+  %add1 = or i32 %shl, 5
+  ret i32 %add1
+
+; CHECK-LABEL: @foo2
+; CHECK-DAG: cmpw
+; CHECK-DAG: li [[REG1:[0-9]+]], 5
+; CHECK-DAG: li [[REG2:[0-9]+]], 21
+; CHECK: isel 3, [[REG2]], [[REG1]],
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @foo3(i32 signext %a, i32 signext %b) #0 {
+entry:
+  %cmp = icmp sle i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  %shl = shl nuw nsw i32 %conv, 4
+  ret i32 %shl
+
+; CHECK-LABEL: @foo3
+; CHECK-DAG: cmpw
+; CHECK-DAG: li [[REG1:[0-9]+]], 16
+; CHECK: isel 3, 0, [[REG1]],
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/ia-mem-r0.ll b/test/CodeGen/PowerPC/ia-mem-r0.ll
new file mode 100644
index 0000000..4ab17ed
--- /dev/null
+++ b/test/CodeGen/PowerPC/ia-mem-r0.ll
@@ -0,0 +1,94 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Make sure that we don't generate a std r, 0(0) -- the memory address cannot
+; be stored in r0.
+; CHECK-LABEL: @test1
+; CHECK-NOT: std {{[0-9]+}}, 0(0) 
+; CHECK: blr
+
+define void @test1({ i8*, void (i8*, i8*)* } %fn_arg) {
+  %fn = alloca { i8*, void (i8*, i8*)* }
+  %sp = alloca i8*, align 8
+  %regs = alloca [18 x i64], align 8
+  store { i8*, void (i8*, i8*)* } %fn_arg, { i8*, void (i8*, i8*)* }* %fn
+  %1 = bitcast [18 x i64]* %regs to i64*
+  call void asm sideeffect "std  14, $0", "=*m"(i64* %1)
+  %2 = bitcast [18 x i64]* %regs to i8*
+  %3 = getelementptr i8* %2, i32 8
+  %4 = bitcast i8* %3 to i64*
+  call void asm sideeffect "std  15, $0", "=*m"(i64* %4)
+  %5 = bitcast [18 x i64]* %regs to i8*
+  %6 = getelementptr i8* %5, i32 16
+  %7 = bitcast i8* %6 to i64*
+  call void asm sideeffect "std  16, $0", "=*m"(i64* %7)
+  %8 = bitcast [18 x i64]* %regs to i8*
+  %9 = getelementptr i8* %8, i32 24
+  %10 = bitcast i8* %9 to i64*
+  call void asm sideeffect "std  17, $0", "=*m"(i64* %10)
+  %11 = bitcast [18 x i64]* %regs to i8*
+  %12 = getelementptr i8* %11, i32 32
+  %13 = bitcast i8* %12 to i64*
+  call void asm sideeffect "std  18, $0", "=*m"(i64* %13)
+  %14 = bitcast [18 x i64]* %regs to i8*
+  %15 = getelementptr i8* %14, i32 40
+  %16 = bitcast i8* %15 to i64*
+  call void asm sideeffect "std  19, $0", "=*m"(i64* %16)
+  %17 = bitcast [18 x i64]* %regs to i8*
+  %18 = getelementptr i8* %17, i32 48
+  %19 = bitcast i8* %18 to i64*
+  call void asm sideeffect "std  20, $0", "=*m"(i64* %19)
+  %20 = bitcast [18 x i64]* %regs to i8*
+  %21 = getelementptr i8* %20, i32 56
+  %22 = bitcast i8* %21 to i64*
+  call void asm sideeffect "std  21, $0", "=*m"(i64* %22)
+  %23 = bitcast [18 x i64]* %regs to i8*
+  %24 = getelementptr i8* %23, i32 64
+  %25 = bitcast i8* %24 to i64*
+  call void asm sideeffect "std  22, $0", "=*m"(i64* %25)
+  %26 = bitcast [18 x i64]* %regs to i8*
+  %27 = getelementptr i8* %26, i32 72
+  %28 = bitcast i8* %27 to i64*
+  call void asm sideeffect "std  23, $0", "=*m"(i64* %28)
+  %29 = bitcast [18 x i64]* %regs to i8*
+  %30 = getelementptr i8* %29, i32 80
+  %31 = bitcast i8* %30 to i64*
+  call void asm sideeffect "std  24, $0", "=*m"(i64* %31)
+  %32 = bitcast [18 x i64]* %regs to i8*
+  %33 = getelementptr i8* %32, i32 88
+  %34 = bitcast i8* %33 to i64*
+  call void asm sideeffect "std  25, $0", "=*m"(i64* %34)
+  %35 = bitcast [18 x i64]* %regs to i8*
+  %36 = getelementptr i8* %35, i32 96
+  %37 = bitcast i8* %36 to i64*
+  call void asm sideeffect "std  26, $0", "=*m"(i64* %37)
+  %38 = bitcast [18 x i64]* %regs to i8*
+  %39 = getelementptr i8* %38, i32 104
+  %40 = bitcast i8* %39 to i64*
+  call void asm sideeffect "std  27, $0", "=*m"(i64* %40)
+  %41 = bitcast [18 x i64]* %regs to i8*
+  %42 = getelementptr i8* %41, i32 112
+  %43 = bitcast i8* %42 to i64*
+  call void asm sideeffect "std  28, $0", "=*m"(i64* %43)
+  %44 = bitcast [18 x i64]* %regs to i8*
+  %45 = getelementptr i8* %44, i32 120
+  %46 = bitcast i8* %45 to i64*
+  call void asm sideeffect "std  29, $0", "=*m"(i64* %46)
+  %47 = bitcast [18 x i64]* %regs to i8*
+  %48 = getelementptr i8* %47, i32 128
+  %49 = bitcast i8* %48 to i64*
+  call void asm sideeffect "std  30, $0", "=*m"(i64* %49)
+  %50 = bitcast [18 x i64]* %regs to i8*
+  %51 = getelementptr i8* %50, i32 136
+  %52 = bitcast i8* %51 to i64*
+  call void asm sideeffect "std  31, $0", "=*m"(i64* %52)
+  %53 = getelementptr { i8*, void (i8*, i8*)* }* %fn, i32 0, i32 1
+  %.funcptr = load void (i8*, i8*)** %53
+  %54 = getelementptr { i8*, void (i8*, i8*)* }* %fn, i32 0, i32 0
+  %.ptr = load i8** %54
+  %55 = load i8** %sp
+  call void %.funcptr(i8* %.ptr, i8* %55)
+  ret void
+}
+
diff --git a/test/CodeGen/PowerPC/ia-neg-const.ll b/test/CodeGen/PowerPC/ia-neg-const.ll
new file mode 100644
index 0000000..556ab80
--- /dev/null
+++ b/test/CodeGen/PowerPC/ia-neg-const.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [5 x i8] c"%ld\0A\00", align 1
+
+; Function Attrs: nounwind
+define i64 @main() #0 {
+entry:
+  %x = alloca i64, align 8
+  store i64 0, i64* %x, align 8
+  %0 = call i64 asm sideeffect "ld       $0,$1\0A\09add${2:I}   $0,$0,$2", "=&r,*m,Ir"(i64* %x, i64 -1) #0
+  ret i64 %0
+}
+
+; CHECK: ld
+; CHECK-NOT: addi   3, 3, 4294967295
+; CHECK: addi   3, 3, -1
+; CHECK: blr
+
+; Function Attrs: nounwind
+declare signext i32 @printf(i8* nocapture readonly, ...) #0
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/in-asm-f64-reg.ll b/test/CodeGen/PowerPC/in-asm-f64-reg.ll
index 1321dfc..08b1a2c 100644
--- a/test/CodeGen/PowerPC/in-asm-f64-reg.ll
+++ b/test/CodeGen/PowerPC/in-asm-f64-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -no-integrated-as | FileCheck %s
 
 define void @f() {
 ; CHECK: @f
diff --git a/test/CodeGen/PowerPC/inlineasm-i64-reg.ll b/test/CodeGen/PowerPC/inlineasm-i64-reg.ll
index 5e31cd5..4d8e704 100644
--- a/test/CodeGen/PowerPC/inlineasm-i64-reg.ll
+++ b/test/CodeGen/PowerPC/inlineasm-i64-reg.ll
@@ -105,4 +105,4 @@ if.end40:
 attributes #0 = { alwaysinline inlinehint nounwind }
 attributes #1 = { nounwind }
 
-!0 = metadata !{i32 -2146895770}
+!0 = !{i32 -2146895770}
diff --git a/test/CodeGen/PowerPC/lbz-from-ld-shift.ll b/test/CodeGen/PowerPC/lbz-from-ld-shift.ll
new file mode 100644
index 0000000..3eacd6a
--- /dev/null
+++ b/test/CodeGen/PowerPC/lbz-from-ld-shift.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readonly
+define signext i32 @test(i32* nocapture readonly %P) #0 {
+entry:
+  %0 = load i32* %P, align 4
+  %shr = lshr i32 %0, 24
+  ret i32 %shr
+
+; CHECK-LABEL: @test
+; CHECK: lbz 3, 0(3)
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readonly }
+
diff --git a/test/CodeGen/PowerPC/ld-st-upd.ll b/test/CodeGen/PowerPC/ld-st-upd.ll
new file mode 100644
index 0000000..24f31ac
--- /dev/null
+++ b/test/CodeGen/PowerPC/ld-st-upd.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define i32* @test4(i32* readonly %X, i32* nocapture %dest) #0 {
+  %Y = getelementptr i32* %X, i64 4
+  %A = load i32* %Y, align 4
+  store i32 %A, i32* %dest, align 4
+  ret i32* %Y
+
+; CHECK-LABEL: @test4
+; CHECK: lwzu [[REG1:[0-9]+]], 16(3)
+; CHECK: stw [[REG1]], 0(4)
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/ldtoc-inv.ll b/test/CodeGen/PowerPC/ldtoc-inv.ll
new file mode 100644
index 0000000..550747c
--- /dev/null
+++ b/test/CodeGen/PowerPC/ldtoc-inv.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@phasor = external constant [4096 x i32]
+
+; Function Attrs: nounwind
+define void @test(i32* nocapture %out, i32 zeroext %step_size) #0 {
+entry:
+  %shl = shl i32 %step_size, 2
+  %idxprom = zext i32 %shl to i64
+  br label %for.body
+
+; Make sure that the TOC load has been hoisted out of the loop.
+; CHECK-LABEL: @test
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc@l
+; CHECK: %for.body
+; CHECK: blr
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = trunc i64 %indvars.iv to i32
+  %shl1 = shl i32 %0, %step_size
+  %idxprom2 = sext i32 %shl1 to i64
+  %arrayidx.sum = add nsw i64 %idxprom2, %idxprom
+  %arrayidx3 = getelementptr inbounds [4096 x i32]* @phasor, i64 0, i64 %arrayidx.sum
+  %1 = load i32* %arrayidx3, align 4
+  %arrayidx5 = getelementptr inbounds i32* %out, i64 %indvars.iv
+  store i32 %1, i32* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %cmp = icmp slt i64 %indvars.iv.next, 1020
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/loop-data-prefetch.ll b/test/CodeGen/PowerPC/loop-data-prefetch.ll
new file mode 100644
index 0000000..8871481
--- /dev/null
+++ b/test/CodeGen/PowerPC/loop-data-prefetch.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mcpu=a2 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; Function Attrs: nounwind
+define void @foo(double* nocapture %a, double* nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double* %b, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+
+; CHECK-LABEL: @foo
+; CHECK: dcbt
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/mult-alt-generic-powerpc.ll b/test/CodeGen/PowerPC/mult-alt-generic-powerpc.ll
index 659cdf7..743cc62 100644
--- a/test/CodeGen/PowerPC/mult-alt-generic-powerpc.ll
+++ b/test/CodeGen/PowerPC/mult-alt-generic-powerpc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32
+; RUN: llc < %s -march=ppc32 -no-integrated-as
 ; ModuleID = 'mult-alt-generic.c'
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
 target triple = "powerpc"
diff --git a/test/CodeGen/PowerPC/mult-alt-generic-powerpc64.ll b/test/CodeGen/PowerPC/mult-alt-generic-powerpc64.ll
index 3da06f6..29a5786 100644
--- a/test/CodeGen/PowerPC/mult-alt-generic-powerpc64.ll
+++ b/test/CodeGen/PowerPC/mult-alt-generic-powerpc64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc64
+; RUN: llc < %s -march=ppc64 -no-integrated-as
 ; ModuleID = 'mult-alt-generic.c'
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
 target triple = "powerpc64"
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r0.ll b/test/CodeGen/PowerPC/named-reg-alloc-r0.ll
index e683f99..b669c35 100644
--- a/test/CodeGen/PowerPC/named-reg-alloc-r0.ll
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r0.ll
@@ -12,4 +12,4 @@ entry:
 
 declare i32 @llvm.read_register.i32(metadata) nounwind
 
-!0 = metadata !{metadata !"r0\00"}
+!0 = !{!"r0\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r1-64.ll b/test/CodeGen/PowerPC/named-reg-alloc-r1-64.ll
index b047f9f..419e12c 100644
--- a/test/CodeGen/PowerPC/named-reg-alloc-r1-64.ll
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r1-64.ll
@@ -15,4 +15,4 @@ entry:
 
 declare i64 @llvm.read_register.i64(metadata) nounwind
 
-!0 = metadata !{metadata !"r1\00"}
+!0 = !{!"r1\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r1.ll b/test/CodeGen/PowerPC/named-reg-alloc-r1.ll
index 9d0eb34..3ccab8c 100644
--- a/test/CodeGen/PowerPC/named-reg-alloc-r1.ll
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r1.ll
@@ -17,4 +17,4 @@ entry:
 
 declare i32 @llvm.read_register.i32(metadata) nounwind
 
-!0 = metadata !{metadata !"r1\00"}
+!0 = !{!"r1\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r13-64.ll b/test/CodeGen/PowerPC/named-reg-alloc-r13-64.ll
index df5085b..74e31fdd 100644
--- a/test/CodeGen/PowerPC/named-reg-alloc-r13-64.ll
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r13-64.ll
@@ -15,4 +15,4 @@ entry:
 
 declare i64 @llvm.read_register.i64(metadata) nounwind
 
-!0 = metadata !{metadata !"r13\00"}
+!0 = !{!"r13\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r13.ll b/test/CodeGen/PowerPC/named-reg-alloc-r13.ll
index 900ebb2..314f5d5 100644
--- a/test/CodeGen/PowerPC/named-reg-alloc-r13.ll
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r13.ll
@@ -15,4 +15,4 @@ entry:
 
 declare i32 @llvm.read_register.i32(metadata) nounwind
 
-!0 = metadata !{metadata !"r13\00"}
+!0 = !{!"r13\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r2-64.ll b/test/CodeGen/PowerPC/named-reg-alloc-r2-64.ll
index 0da33fa..834df8b 100644
--- a/test/CodeGen/PowerPC/named-reg-alloc-r2-64.ll
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r2-64.ll
@@ -1,17 +1,14 @@
-; RUN: not llc < %s -mtriple=powerpc64-apple-darwin 2>&1 | FileCheck %s --check-prefix=CHECK-DARWIN
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=powerpc64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s
 
 define i64 @get_reg() nounwind {
 entry:
 ; FIXME: Include an allocatable-specific error message
-; CHECK-DARWIN: Invalid register name global variable
+; CHECK: Invalid register name global variable
         %reg = call i64 @llvm.read_register.i64(metadata !0)
   ret i64 %reg
-
-; CHECK-LABEL: @get_reg
-; CHECK: mr 3, 2
 }
 
 declare i64 @llvm.read_register.i64(metadata) nounwind
 
-!0 = metadata !{metadata !"r2\00"}
+!0 = !{!"r2\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r2.ll b/test/CodeGen/PowerPC/named-reg-alloc-r2.ll
index 51e7e3e..45d9816 100644
--- a/test/CodeGen/PowerPC/named-reg-alloc-r2.ll
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r2.ll
@@ -1,11 +1,11 @@
-; RUN: not llc < %s -mtriple=powerpc-apple-darwin 2>&1 | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: not llc < %s -mtriple=powerpc-apple-darwin 2>&1 | FileCheck %s --check-prefix=CHECK-NOTPPC32
 ; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s --check-prefix=CHECK-NOTPPC32
 
 define i32 @get_reg() nounwind {
 entry:
 ; FIXME: Include an allocatable-specific error message
-; CHECK-DARWIN: Invalid register name global variable
+; CHECK-NOTPPC32: Invalid register name global variable
         %reg = call i32 @llvm.read_register.i32(metadata !0)
   ret i32 %reg
 
@@ -15,4 +15,4 @@ entry:
 
 declare i32 @llvm.read_register.i32(metadata) nounwind
 
-!0 = metadata !{metadata !"r2\00"}
+!0 = !{!"r2\00"}
diff --git a/test/CodeGen/PowerPC/no-extra-fp-conv-ldst.ll b/test/CodeGen/PowerPC/no-extra-fp-conv-ldst.ll
new file mode 100644
index 0000000..6beee25
--- /dev/null
+++ b/test/CodeGen/PowerPC/no-extra-fp-conv-ldst.ll
@@ -0,0 +1,96 @@
+; RUN: llc -mcpu=a2 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readonly
+define double @test1(i64* nocapture readonly %x) #0 {
+entry:
+  %0 = load i64* %x, align 8
+  %conv = sitofp i64 %0 to double
+  ret double %conv
+
+; CHECK-LABEL: @test1
+; CHECK: lfd [[REG1:[0-9]+]], 0(3)
+; CHECK: fcfid 1, [[REG1]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readonly
+define double @test2(i32* nocapture readonly %x) #0 {
+entry:
+  %0 = load i32* %x, align 4
+  %conv = sitofp i32 %0 to double
+  ret double %conv
+
+; CHECK-LABEL: @test2
+; CHECK: lfiwax [[REG1:[0-9]+]], 0, 3
+; CHECK: fcfid 1, [[REG1]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define float @foo(float %X) #0 {
+entry:
+  %conv = fptosi float %X to i32
+  %conv1 = sitofp i32 %conv to float
+  ret float %conv1
+
+; CHECK-LABEL: @foo
+; CHECK-DAG: fctiwz [[REG2:[0-9]+]], 1
+; CHECK-DAG: addi [[REG1:[0-9]+]], 1,
+; CHECK: stfiwx [[REG2]], 0, [[REG1]]
+; CHECK: lfiwax [[REG3:[0-9]+]], 0, [[REG1]]
+; CHECK: fcfids 1, [[REG3]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define double @food(double %X) #0 {
+entry:
+  %conv = fptosi double %X to i32
+  %conv1 = sitofp i32 %conv to double
+  ret double %conv1
+
+; CHECK-LABEL: @food
+; CHECK-DAG: fctiwz [[REG2:[0-9]+]], 1
+; CHECK-DAG: addi [[REG1:[0-9]+]], 1,
+; CHECK: stfiwx [[REG2]], 0, [[REG1]]
+; CHECK: lfiwax [[REG3:[0-9]+]], 0, [[REG1]]
+; CHECK: fcfid 1, [[REG3]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define float @foou(float %X) #0 {
+entry:
+  %conv = fptoui float %X to i32
+  %conv1 = uitofp i32 %conv to float
+  ret float %conv1
+
+; CHECK-LABEL: @foou
+; CHECK-DAG: fctiwuz [[REG2:[0-9]+]], 1
+; CHECK-DAG: addi [[REG1:[0-9]+]], 1,
+; CHECK: stfiwx [[REG2]], 0, [[REG1]]
+; CHECK: lfiwzx [[REG3:[0-9]+]], 0, [[REG1]]
+; CHECK: fcfidus 1, [[REG3]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define double @fooud(double %X) #0 {
+entry:
+  %conv = fptoui double %X to i32
+  %conv1 = uitofp i32 %conv to double
+  ret double %conv1
+
+; CHECK-LABEL: @fooud
+; CHECK-DAG: fctiwuz [[REG2:[0-9]+]], 1
+; CHECK-DAG: addi [[REG1:[0-9]+]], 1,
+; CHECK: stfiwx [[REG2]], 0, [[REG1]]
+; CHECK: lfiwzx [[REG3:[0-9]+]], 0, [[REG1]]
+; CHECK: fcfidu 1, [[REG3]]
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readonly }
+
diff --git a/test/CodeGen/PowerPC/no-pref-jumps.ll b/test/CodeGen/PowerPC/no-pref-jumps.ll
new file mode 100644
index 0000000..d9490f1
--- /dev/null
+++ b/test/CodeGen/PowerPC/no-pref-jumps.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(i32 signext %a, i32 signext %b) #0 {
+entry:
+  %cmp = icmp sgt i32 %a, 5
+  %cmp1 = icmp slt i32 %b, 3
+  %or.cond = or i1 %cmp, %cmp1
+  br i1 %or.cond, label %if.then, label %if.else
+
+; CHECK-LABEL: @foo
+; CHECK: cmpwi
+; CHECK: cmpwi
+; CHECK: cror
+; CHECK: blr
+
+if.then:                                          ; preds = %entry
+  tail call void bitcast (void (...)* @bar to void ()*)() #0
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  tail call void bitcast (void (...)* @car to void ()*)() #0
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare void @bar(...)
+
+declare void @car(...)
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/p8-isel-sched.ll b/test/CodeGen/PowerPC/p8-isel-sched.ll
new file mode 100644
index 0000000..034fe3c
--- /dev/null
+++ b/test/CodeGen/PowerPC/p8-isel-sched.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mcpu=pwr8 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(i32* nocapture %r1, i32* nocapture %r2, i32* nocapture %r3, i32* nocapture %r4, i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d) #0 {
+entry:
+  %tobool = icmp ne i32 %a, 0
+  %cond = select i1 %tobool, i32 %b, i32 %c
+  store i32 %cond, i32* %r1, align 4
+  %cond5 = select i1 %tobool, i32 %b, i32 %d
+  store i32 %cond5, i32* %r2, align 4
+  %add = add nsw i32 %b, 1
+  %sub = add nsw i32 %d, -2
+  %cond10 = select i1 %tobool, i32 %add, i32 %sub
+  store i32 %cond10, i32* %r3, align 4
+  %add13 = add nsw i32 %b, 3
+  %sub15 = add nsw i32 %d, -5
+  %cond17 = select i1 %tobool, i32 %add13, i32 %sub15
+  store i32 %cond17, i32* %r4, align 4
+  ret void
+}
+
+; Make sure that we don't schedule all of the isels together, they should be
+; intermixed with the adds because each isel starts a new dispatch group.
+; CHECK-LABEL: @foo
+; CHECK: isel
+; CHECK: addi
+; CHECK: isel
+; CHECK: blr
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/post-ra-ec.ll b/test/CodeGen/PowerPC/post-ra-ec.ll
new file mode 100644
index 0000000..9c61677
--- /dev/null
+++ b/test/CodeGen/PowerPC/post-ra-ec.ll
@@ -0,0 +1,47 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.inode.0.12.120 = type { i8* }
+%struct.kstat2.1.13.121 = type { i32 }
+%struct.task_struct.4.16.124 = type { i8*, %struct.atomic_t.2.14.122, %struct.signal_struct.3.15.123* }
+%struct.atomic_t.2.14.122 = type { i32 }
+%struct.signal_struct.3.15.123 = type { i64 }
+%struct.pid.5.17.125 = type { i8* }
+
+; Function Attrs: nounwind
+define signext i32 @proc_task_getattr(%struct.inode.0.12.120* nocapture readonly %inode, %struct.kstat2.1.13.121* nocapture %stat) #0 {
+entry:
+  %call1.i = tail call %struct.task_struct.4.16.124* @get_pid_task(%struct.pid.5.17.125* undef, i32 zeroext 0) #0
+  br i1 undef, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %0 = load i64* undef, align 8
+  %conv.i = trunc i64 %0 to i32
+  %1 = load i32* null, align 4
+  %add = add i32 %1, %conv.i
+  store i32 %add, i32* null, align 4
+  %counter.i.i = getelementptr inbounds %struct.task_struct.4.16.124* %call1.i, i64 0, i32 1, i32 0
+  %2 = tail call i32 asm sideeffect "\09lwsync\0A1:\09lwarx\09$0,0,$1\09\09# atomic_dec_return\0A\09addic\09$0,$0,-1\0A\09stwcx.\09$0,0,$1\0A\09bne-\091b\0A\09sync\0A", "=&r,r,~{cr0},~{xer},~{memory}"(i32* %counter.i.i) #0
+  %cmp.i = icmp eq i32 %2, 0
+  br i1 %cmp.i, label %if.then.i, label %if.end
+
+; CHECK-LABEL: @proc_task_getattr
+; CHECK-NOT: stwcx. [[REG:[0-9]+]],0,[[REG]]
+; CHECK: blr
+
+if.then.i:                                        ; preds = %if.then
+  %3 = bitcast %struct.task_struct.4.16.124* %call1.i to i8*
+  tail call void @foo(i8* %3) #0
+  unreachable
+
+if.end:                                           ; preds = %if.then, %entry
+  ret i32 0
+}
+
+declare void @foo(i8*)
+
+declare %struct.task_struct.4.16.124* @get_pid_task(%struct.pid.5.17.125*, i32 zeroext)
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/ppc32-cyclecounter.ll b/test/CodeGen/PowerPC/ppc32-cyclecounter.ll
new file mode 100644
index 0000000..9e2cd0b
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-cyclecounter.ll
@@ -0,0 +1,20 @@
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc"
+; RUN: llc -mcpu=ppc < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+
+define i64 @test1() nounwind {
+entry:
+  %r = call i64 @llvm.readcyclecounter()
+  ret i64 %r
+}
+
+; CHECK: @test1
+; CHECK: mfspr 3, 269
+; CHECK: mfspr 4, 268
+; CHECK: mfspr [[REG:[0-9]+]], 269
+; CHECK: cmpw [[CR:[0-9]+]], 3, [[REG]]
+; CHECK: bne [[CR]], .LBB
+
+declare i64 @llvm.readcyclecounter()
+
diff --git a/test/CodeGen/PowerPC/ppc32-lshrti3.ll b/test/CodeGen/PowerPC/ppc32-lshrti3.ll
index 6e76fea..f773cce 100644
--- a/test/CodeGen/PowerPC/ppc32-lshrti3.ll
+++ b/test/CodeGen/PowerPC/ppc32-lshrti3.ll
@@ -36,4 +36,4 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.5.0 (213754)"}
+!0 = !{!"clang version 3.5.0 (213754)"}
diff --git a/test/CodeGen/PowerPC/ppc32-pic-large.ll b/test/CodeGen/PowerPC/ppc32-pic-large.ll
index ecc4f10..bb906ec 100644
--- a/test/CodeGen/PowerPC/ppc32-pic-large.ll
+++ b/test/CodeGen/PowerPC/ppc32-pic-large.ll
@@ -1,23 +1,29 @@
 ; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -relocation-model=pic | FileCheck -check-prefix=LARGE-BSS %s
 @bar = common global i32 0, align 4
 
+declare i32 @call_foo(i32, ...)
+
 define i32 @foo() {
 entry:
   %0 = load i32* @bar, align 4
+  %call = call i32 (i32, ...)* @call_foo(i32 %0, i32 0, i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64)
   ret i32 %0
 }
 
 !llvm.module.flags = !{!0}
-!0 = metadata !{i32 1, metadata !"PIC Level", i32 2}
+!0 = !{i32 1, !"PIC Level", i32 2}
 ; LARGE-BSS:       [[POFF:\.L[0-9]+\$poff]]:
 ; LARGE-BSS-NEXT:    .long .LTOC-[[PB:\.L[0-9]+\$pb]]
 ; LARGE-BSS-NEXT:  foo:
+; LARGE-BSS:         stw 30, -8(1)
 ; LARGE-BSS:         bl [[PB]]
 ; LARGE-BSS-NEXT:  [[PB]]:
 ; LARGE-BSS:         mflr 30
 ; LARGE-BSS:         lwz [[REG:[0-9]+]], [[POFF]]-[[PB]](30)
 ; LARGE-BSS-NEXT:    add 30, [[REG]], 30
-; LARGE-BSS:         lwz [[VREG:[0-9]+]], [[VREF:\.LC[0-9]+]]-.LTOC(30)
-; LARGE-BSS:         lwz {{[0-9]+}}, 0([[VREG]])
+; LARGE-BSS-DAG:     lwz [[VREG:[0-9]+]], [[VREF:\.LC[0-9]+]]-.LTOC(30)
+; LARGE-BSS-DAG:     lwz {{[0-9]+}}, 0([[VREG]])
+; LARGE-BSS-DAG:     stw {{[0-9]+}}, 8(1)
+; LARGE-BSS:         lwz 30, -8(1)
 ; LARGE-BSS:       [[VREF]]:
 ; LARGE-BSS-NEXT:    .long bar
diff --git a/test/CodeGen/PowerPC/ppc32-pic.ll b/test/CodeGen/PowerPC/ppc32-pic.ll
index f9c3467..abc1367 100644
--- a/test/CodeGen/PowerPC/ppc32-pic.ll
+++ b/test/CodeGen/PowerPC/ppc32-pic.ll
@@ -1,16 +1,24 @@
 ; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -relocation-model=pic | FileCheck -check-prefix=SMALL-BSS %s
 @bar = common global i32 0, align 4
 
+declare i32 @call_foo(i32, ...)
+
 define i32 @foo() {
 entry:
   %0 = load i32* @bar, align 4
-  ret i32 %0
+  %call = call i32 (i32, ...)* @call_foo(i32 %0, i32 0, i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64)
+  ret i32 0
 }
 
 !llvm.module.flags = !{!0}
-!0 = metadata !{i32 1, metadata !"PIC Level", i32 1}
+!0 = !{i32 1, !"PIC Level", i32 1}
 ; SMALL-BSS-LABEL:foo:
+; SMALL-BSS:         stw 30, -8(1)
+; SMALL-BSS:         stwu 1, -32(1)
 ; SMALL-BSS:         bl _GLOBAL_OFFSET_TABLE_@local-4
 ; SMALL-BSS:         mflr 30
-; SMALL-BSS:         lwz [[VREG:[0-9]+]], bar@GOT(30)
-; SMALL-BSS:         lwz {{[0-9]+}}, 0([[VREG]])
+; SMALL-BSS-DAG:     stw {{[0-9]+}}, 8(1)
+; SMALL-BSS-DAG:     lwz [[VREG:[0-9]+]], bar@GOT(30)
+; SMALL-BSS-DAG:     lwz {{[0-9]+}}, 0([[VREG]])
+; SMALL-BSS:         bl call_foo@PLT
+; SMALL-BSS:         lwz 30, -8(1)
diff --git a/test/CodeGen/PowerPC/ppc64-anyregcc-crash.ll b/test/CodeGen/PowerPC/ppc64-anyregcc-crash.ll
new file mode 100644
index 0000000..479c7a7
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-anyregcc-crash.ll
@@ -0,0 +1,19 @@
+; RUN: not llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s
+;
+; Check that misuse of anyregcc results in a compile time error.
+
+; CHECK: LLVM ERROR: ran out of registers during register allocation
+define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
+                        i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
+                        i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
+                        i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) {
+entry:
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
+                i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
+                i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
+                i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
+                i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32)
+  ret i64 %result
+}
+
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/PowerPC/ppc64-anyregcc.ll b/test/CodeGen/PowerPC/ppc64-anyregcc.ll
new file mode 100644
index 0000000..8b4cec5
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-anyregcc.ll
@@ -0,0 +1,367 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Stackmap Header: no constants - 6 callsites
+; CHECK-LABEL: .section	.llvm_stackmaps
+; CHECK-NEXT:  __LLVM_StackMaps:
+; Header
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .short 0
+; Num Functions
+; CHECK-NEXT:   .long 8
+; Num LargeConstants
+; CHECK-NEXT:   .long 0
+; Num Callsites
+; CHECK-NEXT:   .long 8
+
+; Functions and stack size
+; CHECK-NEXT:   .quad test
+; CHECK-NEXT:   .quad 128
+; CHECK-NEXT:   .quad property_access1
+; CHECK-NEXT:   .quad 128
+; CHECK-NEXT:   .quad property_access2
+; CHECK-NEXT:   .quad 128
+; CHECK-NEXT:   .quad property_access3
+; CHECK-NEXT:   .quad 128
+; CHECK-NEXT:   .quad anyreg_test1
+; CHECK-NEXT:   .quad 144
+; CHECK-NEXT:   .quad anyreg_test2
+; CHECK-NEXT:   .quad 144
+; CHECK-NEXT:   .quad patchpoint_spilldef
+; CHECK-NEXT:   .quad 256
+; CHECK-NEXT:   .quad patchpoint_spillargs
+; CHECK-NEXT:   .quad 288
+
+
+; test
+; CHECK-LABEL:  .long   .L{{.*}}-.L.test
+; CHECK-NEXT:   .short  0
+; 3 locations
+; CHECK-NEXT:   .short  3
+; Loc 0: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 4
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 4
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 2: Constant 3
+; CHECK-NEXT:   .byte 4
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long 3
+define i64 @test() nounwind ssp uwtable {
+entry:
+  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 24, i8* null, i32 2, i32 1, i32 2, i64 3)
+  ret i64 0
+}
+
+; property access 1 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-LABEL:  .long   .L{{.*}}-.L.property_access1
+; CHECK-NEXT:   .short  0
+; 2 locations
+; CHECK-NEXT:   .short  2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
+entry:
+  %f = inttoptr i64 281474417671919 to i8*
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 24, i8* %f, i32 1, i8* %obj)
+  ret i64 %ret
+}
+
+; property access 2 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-LABEL:  .long   .L{{.*}}-.L.property_access2
+; CHECK-NEXT:   .short  0
+; 2 locations
+; CHECK-NEXT:   .short  2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @property_access2() nounwind ssp uwtable {
+entry:
+  %obj = alloca i64, align 8
+  %f = inttoptr i64 281474417671919 to i8*
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 24, i8* %f, i32 1, i64* %obj)
+  ret i64 %ret
+}
+
+; property access 3 - %obj is a frame index
+; CHECK-LABEL:  .long   .L{{.*}}-.L.property_access3
+; CHECK-NEXT:   .short  0
+; 2 locations
+; CHECK-NEXT:   .short  2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Direct FP - 8
+; CHECK-NEXT:   .byte 2
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short 31
+; CHECK-NEXT:   .long 112
+define i64 @property_access3() nounwind ssp uwtable {
+entry:
+  %obj = alloca i64, align 8
+  %f = inttoptr i64 281474417671919 to i8*
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 24, i8* %f, i32 0, i64* %obj)
+  ret i64 %ret
+}
+
+; anyreg_test1
+; CHECK-LABEL:  .long   .L{{.*}}-.L.anyreg_test1
+; CHECK-NEXT:   .short  0
+; 14 locations
+; CHECK-NEXT:   .short  14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 2: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 3: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 4: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 5: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 6: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 7: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 8: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 9: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 10: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 11: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 12: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 13: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+  %f = inttoptr i64 281474417671919 to i8*
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 24, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  ret i64 %ret
+}
+
+; anyreg_test2
+; CHECK-LABEL:  .long   .L{{.*}}-.L.anyreg_test2
+; CHECK-NEXT:   .short  0
+; 14 locations
+; CHECK-NEXT:   .short  14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 2: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 3: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 4: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 5: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 6: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 7: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 8: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 9: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 10: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 11: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 12: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 13: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+  %f = inttoptr i64 281474417671919 to i8*
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 24, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  ret i64 %ret
+}
+
+; Test spilling the return value of an anyregcc call.
+;
+; <rdar://problem/15432754> [JS] Assertion: "Folded a def to a non-store!"
+;
+; CHECK-LABEL: .long .L{{.*}}-.L.patchpoint_spilldef
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 3
+; Loc 0: Register (some register that will be spilled to the stack)
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 1: Register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 1: Register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 24, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+  tail call void asm sideeffect "nop", "~{r0},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r14},~{r15},~{r16},~{r17
+},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"() nounwind
+  ret i64 %result
+}
+
+; Test spilling the arguments of an anyregcc call.
+;
+; <rdar://problem/15487687> [JS] AnyRegCC argument ends up being spilled
+;
+; CHECK-LABEL: .long .L{{.*}}-.L.patchpoint_spillargs
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 5
+; Loc 0: Return a register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 1: Arg0 in a Register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 2: Arg1 in a Register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 3: Arg2 spilled to FP -96
+; CHECK-NEXT: .byte  3
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 31
+; CHECK-NEXT: .long 128
+; Loc 4: Arg3 spilled to FP - 88
+; CHECK-NEXT: .byte  3
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 31
+; CHECK-NEXT: .long 136
+define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+  tail call void asm sideeffect "nop", "~{r0},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r14},~{r15},~{r16},~{r17
+},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"() nounwind
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 24, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  ret i64 %result
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/PowerPC/ppc64-calls.ll b/test/CodeGen/PowerPC/ppc64-calls.ll
index 31794be..707ba95 100644
--- a/test/CodeGen/PowerPC/ppc64-calls.ll
+++ b/test/CodeGen/PowerPC/ppc64-calls.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc64 | FileCheck %s
+; RUN: llc < %s -march=ppc64 -mcpu=pwr7 | FileCheck %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -67,3 +67,20 @@ define double @test_external(double %x) nounwind {
 ; CHECK-NEXT: nop
   ret double %call
 }
+
+; The 'ld 2, 40(1)' really must always come directly after the bctrl to make
+; the unwinding code in libgcc happy.
+@g = external global void ()*
+declare void @h(i64)
+define void @test_indir_toc_reload(i64 %x) {
+  %1 = load void ()** @g
+  call void %1()
+  call void @h(i64 %x)
+  ret void
+
+; CHECK-LABEL: @test_indir_toc_reload
+; CHECK: bctrl
+; CHECK-NEXT: ld 2, 40(1)
+; CHECK: blr
+}
+
diff --git a/test/CodeGen/PowerPC/ppc64-elf-abi.ll b/test/CodeGen/PowerPC/ppc64-elf-abi.ll
index d82122d..5344337 100644
--- a/test/CodeGen/PowerPC/ppc64-elf-abi.ll
+++ b/test/CodeGen/PowerPC/ppc64-elf-abi.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=CHECK-ELFv1
-; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mattr=+elfv1 < %s | FileCheck %s -check-prefix=CHECK-ELFv1
-; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mattr=+elfv2 < %s | FileCheck %s -check-prefix=CHECK-ELFv2
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -target-abi elfv1 < %s | FileCheck %s -check-prefix=CHECK-ELFv1
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -target-abi elfv2 < %s | FileCheck %s -check-prefix=CHECK-ELFv2
 ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefix=CHECK-ELFv2
-; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mattr=+elfv1 < %s | FileCheck %s -check-prefix=CHECK-ELFv1
-; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mattr=+elfv2 < %s | FileCheck %s -check-prefix=CHECK-ELFv2
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -target-abi elfv1 < %s | FileCheck %s -check-prefix=CHECK-ELFv1
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -target-abi elfv2 < %s | FileCheck %s -check-prefix=CHECK-ELFv2
 
 ; CHECK-ELFv2: .abiversion 2
 ; CHECK-ELFv1-NOT: .abiversion 2
diff --git a/test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll b/test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll
new file mode 100644
index 0000000..941513f
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mcpu=pwr7 -mattr=-vsx -fast-isel -fast-isel-abort < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define fastcc i64 @g1(i64 %g1, double %f1, i64 %g2, double %f2, i64 %g3, double %f3, i64 %g4, double %f4) #0 {
+  ret i64 %g1
+
+; CHECK-LABEL: @g1
+; CHECK-NOT: mr 3,
+; CHECK: blr
+}
+
+define fastcc i64 @g2(i64 %g1, double %f1, i64 %g2, double %f2, i64 %g3, double %f3, i64 %g4, double %f4) #0 {
+  ret i64 %g2
+
+; CHECK-LABEL: @g2
+; CHECK: mr 3, 4
+; CHECK-NEXT: blr
+}
+
+define fastcc i64 @g3(i64 %g1, double %f1, i64 %g2, double %f2, i64 %g3, double %f3, i64 %g4, double %f4) #0 {
+  ret i64 %g3
+
+; CHECK-LABEL: @g3
+; CHECK: mr 3, 5
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f2(i64 %g1, double %f1, i64 %g2, double %f2, i64 %g3, double %f3, i64 %g4, double %f4) #0 {
+  ret double %f2
+
+; CHECK-LABEL: @f2
+; CHECK: fmr 1, 2
+; CHECK-NEXT: blr
+}
+
+define void @cg2(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, i64 %v, double 0.0, i64 0, double 0.0, i64 0, double 0.0)
+  ret void
+
+; CHECK-LABEL: @cg2
+; CHECK: mr 4, 3
+; CHECK: blr
+}
+
+define void @cf2(double %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, i64 0, double %v, i64 0, double 0.0, i64 0, double 0.0)
+  ret void
+
+; CHECK-LABEL: @cf2
+; CHECK: mr 2, 1
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/ppc64-fastcc.ll b/test/CodeGen/PowerPC/ppc64-fastcc.ll
new file mode 100644
index 0000000..bb1365a
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-fastcc.ll
@@ -0,0 +1,540 @@
+; RUN: llc -mcpu=pwr7 -mattr=-vsx < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define fastcc i64 @g1(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret i64 %g1
+
+; CHECK-LABEL: @g1
+; CHECK-NOT: mr 3,
+; CHECK: blr
+}
+
+define fastcc i64 @g2(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret i64 %g2
+
+; CHECK-LABEL: @g2
+; CHECK: mr 3, 4
+; CHECK-NEXT: blr
+}
+
+define fastcc i64 @g3(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret i64 %g3
+
+; CHECK-LABEL: @g3
+; CHECK: mr 3, 5
+; CHECK-NEXT: blr
+}
+
+define fastcc i64 @g4(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret i64 %g4
+
+; CHECK-LABEL: @g4
+; CHECK: mr 3, 6
+; CHECK-NEXT: blr
+}
+
+define fastcc i64 @g5(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret i64 %g5
+
+; CHECK-LABEL: @g5
+; CHECK: mr 3, 7
+; CHECK-NEXT: blr
+}
+
+define fastcc i64 @g6(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret i64 %g6
+
+; CHECK-LABEL: @g6
+; CHECK: mr 3, 8
+; CHECK-NEXT: blr
+}
+
+define fastcc i64 @g7(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret i64 %g7
+
+; CHECK-LABEL: @g7
+; CHECK: mr 3, 9
+; CHECK-NEXT: blr
+}
+
+define fastcc i64 @g8(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret i64 %g8
+
+; CHECK-LABEL: @g8
+; CHECK: mr 3, 10
+; CHECK-NEXT: blr
+}
+
+define fastcc i64 @g9(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret i64 %g9
+
+; CHECK-LABEL: @g9
+; CHECK: ld 3, 48(1)
+; CHECK-NEXT: blr
+}
+
+define fastcc i64 @g10(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret i64 %g10
+
+; CHECK-LABEL: @g10
+; CHECK: ld 3, 56(1)
+; CHECK-NEXT: blr
+}
+
+define fastcc i64 @g11(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret i64 %g11
+
+; CHECK-LABEL: @g11
+; CHECK: ld 3, 64(1)
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f1(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f1
+
+; CHECK-LABEL: @f1
+; CHECK-NOT: fmr 1,
+; CHECK: blr
+}
+
+define fastcc double @f2(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f2
+
+; CHECK-LABEL: @f2
+; CHECK: fmr 1, 2
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f3(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f3
+
+; CHECK-LABEL: @f3
+; CHECK: fmr 1, 3
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f4(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f4
+
+; CHECK-LABEL: @f4
+; CHECK: fmr 1, 4
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f5(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f5
+
+; CHECK-LABEL: @f5
+; CHECK: fmr 1, 5
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f6(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f6
+
+; CHECK-LABEL: @f6
+; CHECK: fmr 1, 6
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f7(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f7
+
+; CHECK-LABEL: @f7
+; CHECK: fmr 1, 7
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f8(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f8
+
+; CHECK-LABEL: @f8
+; CHECK: fmr 1, 8
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f9(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f9
+
+; CHECK-LABEL: @f9
+; CHECK: fmr 1, 9
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f10(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f10
+
+; CHECK-LABEL: @f10
+; CHECK: fmr 1, 10
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f11(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f11
+
+; CHECK-LABEL: @f11
+; CHECK: fmr 1, 11
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f12(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f12
+
+; CHECK-LABEL: @f12
+; CHECK: fmr 1, 12
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f13(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f13
+
+; CHECK-LABEL: @f13
+; CHECK: fmr 1, 13
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f14(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f14
+
+; CHECK-LABEL: @f14
+; CHECK: lfd 1, 120(1)
+; CHECK-NEXT: blr
+}
+
+define fastcc double @f15(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret double %f15
+
+; CHECK-LABEL: @f15
+; CHECK: lfd 1, 152(1)
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v1(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v1
+
+; CHECK-LABEL: @v1
+; CHECK-NOT: vor 2,
+; CHECK: blr
+}
+
+define fastcc <4 x i32> @v2(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v2
+
+; CHECK-LABEL: @v2
+; CHECK: vor 2, 3, 3
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v3(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v3
+
+; CHECK-LABEL: @v3
+; CHECK: vor 2, 4, 4
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v4(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v4
+
+; CHECK-LABEL: @v4
+; CHECK: vor 2, 5, 5
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v5(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v5
+
+; CHECK-LABEL: @v5
+; CHECK: vor 2, 6, 6
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v6(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v6
+
+; CHECK-LABEL: @v6
+; CHECK: vor 2, 7, 7
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v7(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v7
+
+; CHECK-LABEL: @v7
+; CHECK: vor 2, 8, 8
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v8(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v8
+
+; CHECK-LABEL: @v8
+; CHECK: vor 2, 9, 9
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v9(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v9
+
+; CHECK-LABEL: @v9
+; CHECK: vor 2, 10, 10
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v10(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v10
+
+; CHECK-LABEL: @v10
+; CHECK: vor 2, 11, 11
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v11(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v11
+
+; CHECK-LABEL: @v11
+; CHECK: vor 2, 12, 12
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v12(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v12
+
+; CHECK-LABEL: @v12
+; CHECK: vor 2, 13, 13
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v13(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v13
+
+; CHECK-LABEL: @v13
+; CHECK: addi [[REG1:[0-9]+]], 1, 96
+; CHECK-NEXT: lvx 2, 0, [[REG1]]
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v14(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v14
+
+; CHECK-LABEL: @v14
+; CHECK: addi [[REG1:[0-9]+]], 1, 128
+; CHECK-NEXT: lvx 2, 0, [[REG1]]
+; CHECK-NEXT: blr
+}
+
+define fastcc <4 x i32> @v15(i64 %g1, double %f1, <4 x i32> %v1, i64 %g2, double %f2, <4 x i32> %v2, i64 %g3, double %f3, <4 x i32> %v3, i64 %g4, double %f4, <4 x i32> %v4, i64 %g5, double %f5, <4 x i32> %v5, i64 %g6, double %f6, <4 x i32> %v6, i64 %g7, double %f7, <4 x i32> %v7, i64 %g8, double %f8, <4 x i32> %v8, i64 %g9, double %f9, <4 x i32> %v9, i64 %g10, double %f10, <4 x i32> %v10, i64 %g11, double %f11, <4 x i32> %v11, i64 %g12, double %f12, <4 x i32> %v12, i64 %g13, double %f13, <4 x i32> %v13, i64 %g14, double %f14, <4 x i32> %v14, i64 %g15, double %f15, <4 x i32> %v15, i64 %g16, double %f16, <4 x i32> %v16) #0 {
+  ret <4 x i32> %v15
+
+; CHECK-LABEL: @v15
+; CHECK: addi [[REG1:[0-9]+]], 1, 160
+; CHECK-NEXT: lvx 2, 0, [[REG1]]
+; CHECK-NEXT: blr
+}
+
+define void @cg1(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 %v, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cg1
+; CHECK-NOT: {{^[ \t]*}}mr 3,
+; CHECK: blr
+}
+
+define void @cg2(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 %v, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cg2
+; CHECK: mr 4, 3
+; CHECK: blr
+}
+
+define void @cg3(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 %v, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cg3
+; CHECK: mr 5, 3
+; CHECK: blr
+}
+
+define void @cg4(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 %v, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cg4
+; CHECK: mr 6, 3
+; CHECK: blr
+}
+
+define void @cg5(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 %v, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cg5
+; CHECK: mr 7, 3
+; CHECK: blr
+}
+
+define void @cg6(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 %v, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cg6
+; CHECK: mr 8, 3
+; CHECK: blr
+}
+
+define void @cg7(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 %v, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cg7
+; CHECK: mr 9, 3
+; CHECK: blr
+}
+
+define void @cg8(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 %v, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cg8
+; CHECK: mr 10, 3
+; CHECK: blr
+}
+
+define void @cg9(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 %v, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cg9
+; CHECK: mr [[REG1:[0-9]+]], 3
+; CHECK: std [[REG1]], 48(1)
+; CHECK: blr
+}
+
+define void @cg10(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 %v, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cg10
+; CHECK: mr [[REG1:[0-9]+]], 3
+; CHECK: std [[REG1]], 56(1)
+; CHECK: blr
+}
+
+define void @cg11(i64 %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 %v, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cg11
+; CHECK: mr [[REG1:[0-9]+]], 3
+; CHECK: std [[REG1]], 64(1)
+; CHECK: blr
+}
+
+define void @cf1(double %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double %v, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cf1
+; CHECK-NOT: fmr 1,
+; CHECK: blr
+}
+
+define void @cf2(double %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double %v, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cf2
+; CHECK: fmr 2, 1
+; CHECK: blr
+}
+
+define void @cf3(double %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double %v, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cf3
+; CHECK: fmr 3, 1
+; CHECK: blr
+}
+
+define void @cf4(double %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double %v, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cf4
+; CHECK: fmr 4, 1
+; CHECK: blr
+}
+
+define void @cf5(double %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double %v, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cf5
+; CHECK: fmr 5, 1
+; CHECK: blr
+}
+
+define void @cf14(double %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double %v, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cf14
+; CHECK: stfd 1, 120(1)
+; CHECK: blr
+}
+
+define void @cf15(double %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double %v, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cf15
+; CHECK: stfd 1, 152(1)
+; CHECK: blr
+}
+
+define void @cv2(<4 x i32> %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> %v, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cv2
+; CHECK: vor 3, 2, 2
+; CHECK: blr
+}
+
+define void @cv3(<4 x i32> %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> %v, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cv3
+; CHECK: vor 4, 2, 2
+; CHECK: blr
+}
+
+define void @cv13(<4 x i32> %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> %v, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cv13
+; CHECK: li [[REG1:[0-9]+]], 96
+; CHECK: stvx 2, 1, [[REG1]]
+; CHECK: blr
+}
+
+define void @cv14(<4 x i32> %v) #0 {
+  tail call fastcc i64 @g1(i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> %v, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0, double 0.0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+
+; CHECK-LABEL: @cv14
+; CHECK: li [[REG1:[0-9]+]], 128
+; CHECK: stvx 2, 1, [[REG1]]
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/ppc64-func-desc-hoist.ll b/test/CodeGen/PowerPC/ppc64-func-desc-hoist.ll
new file mode 100644
index 0000000..57577f9
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-func-desc-hoist.ll
@@ -0,0 +1,47 @@
+; RUN: llc -mcpu=a2 < %s | FileCheck %s -check-prefix=INVFUNCDESC
+; RUN: llc -mcpu=a2 -mattr=-invariant-function-descriptors < %s | FileCheck %s -check-prefix=NONINVFUNCDESC
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @bar(void (...)* nocapture %x) #0 {
+entry:
+  %callee.knr.cast = bitcast void (...)* %x to void ()*
+  br label %for.body
+
+; INVFUNCDESC-LABEL: @bar
+; INVFUNCDESC-DAG: ld [[REG1:[0-9]+]], 8(3)
+; INVFUNCDESC-DAG: ld [[REG2:[0-9]+]], 16(3)
+; INVFUNCDESC-DAG: ld [[REG3:[0-9]+]], 0(3)
+
+; INVFUNCDESC: %for.body
+; INVFUNCDESC: std 2, 40(1)
+; INVFUNCDESC-DAG: mtctr [[REG3]]
+; INVFUNCDESC-DAG: mr 11, [[REG2]]
+; INVFUNCDESC-DAG: mr 2, [[REG1]]
+; INVFUNCDESC: bctrl
+; INVFUNCDESC-NEXT: ld 2, 40(1)
+
+; NONINVFUNCDESC-LABEL: @bar
+; NONINVFUNCDESC: %for.body
+; NONINVFUNCDESC: std 2, 40(1)
+; NONINVFUNCDESC-DAG: ld 3, 0(30)
+; NONINVFUNCDESC-DAG: ld 11, 16(30)
+; NONINVFUNCDESC-DAG: ld 2, 8(30)
+; NONINVFUNCDESC: mtctr 3
+; NONINVFUNCDESC: bctrl
+; NONINVFUNCDESC-NEXT: ld 2, 40(1)
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  tail call void %callee.knr.cast() #0
+  %inc = add nuw nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %inc, 1600000000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/ppc64-gep-opt.ll b/test/CodeGen/PowerPC/ppc64-gep-opt.ll
new file mode 100644
index 0000000..14cf9a7
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-gep-opt.ll
@@ -0,0 +1,157 @@
+; RUN: llc -O3 -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -O3 -print-after=codegenprepare -mcpu=ppc64 < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-NoAA <%t %s
+; RUN: llc -O3 -print-after=codegenprepare -mcpu=pwr7  < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-UseAA <%t %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Following test cases test enabling SeparateConstOffsetFromGEP pass in the PPC
+; backend. If useAA() returns true, it will lower a GEP with multiple indices
+; into GEPs with a single index, otherwise it will lower it into a
+; "ptrtoint+arithmetics+inttoptr" form.
+
+%struct = type { i32, i32, i32, i32, [20 x i32] }
+
+; Check that when two complex GEPs are used in two basic blocks, LLVM can
+; elimilate the common subexpression for the second use.
+define void @test_GEP_CSE([240 x %struct]* %string, i32* %adj, i32 %lib, i64 %idxprom) {
+  %liberties = getelementptr [240 x %struct]* %string, i64 1, i64 %idxprom, i32 3
+  %1 = load i32* %liberties, align 4
+  %cmp = icmp eq i32 %1, %lib
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %origin = getelementptr [240 x %struct]* %string, i64 1, i64 %idxprom, i32 2
+  %2 = load i32* %origin, align 4
+  store i32 %2, i32* %adj, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; CHECK-NoAA-LABEL: @test_GEP_CSE(
+; CHECK-NoAA: [[PTR0:%[a-zA-Z0-9]+]] = ptrtoint [240 x %struct]* %string to i64
+; CHECK-NoAA: [[PTR1:%[a-zA-Z0-9]+]] = mul i64 %idxprom, 96
+; CHECK-NoAA: [[PTR2:%[a-zA-Z0-9]+]] = add i64 [[PTR0]], [[PTR1]]
+; CHECK-NoAA: add i64 [[PTR2]], 23052
+; CHECK-NoAA: inttoptr
+; CHECK-NoAA: if.then:
+; CHECK-NoAA-NOT: ptrtoint
+; CHECK-NoAA-NOT: mul
+; CHECK-NoAA: add i64 [[PTR2]], 23048
+; CHECK-NoAA: inttoptr
+
+; CHECK-UseAA-LABEL: @test_GEP_CSE(
+; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = bitcast [240 x %struct]* %string to i8*
+; CHECK-UseAA: [[IDX:%[a-zA-Z0-9]+]] = mul i64 %idxprom, 96
+; CHECK-UseAA: [[PTR1:%[a-zA-Z0-9]+]] = getelementptr i8* [[PTR0]], i64 [[IDX]]
+; CHECK-UseAA: getelementptr i8* [[PTR1]], i64 23052
+; CHECK-UseAA: bitcast
+; CHECK-UseAA: if.then:
+; CHECK-UseAA: getelementptr i8* [[PTR1]], i64 23048
+; CHECK-UseAA: bitcast
+
+%class.my = type { i32, [128 x i32], i32, [256 x %struct.pt]}
+%struct.pt = type { %struct.point*, i32, i32 }
+%struct.point = type { i32, i32 }
+
+; Check when a GEP is used across two basic block, LLVM can sink the address
+; calculation and code gen can generate a better addressing mode for the second
+; use.
+define void @test_GEP_across_BB(%class.my* %this, i64 %idx) {
+  %1 = getelementptr %class.my* %this, i64 0, i32 3, i64 %idx, i32 1
+  %2 = load i32* %1, align 4
+  %3 = getelementptr %class.my* %this, i64 0, i32 3, i64 %idx, i32 2
+  %4 = load i32* %3, align 4
+  %5 = icmp eq i32 %2, %4
+  br i1 %5, label %if.true, label %exit
+
+if.true:
+  %6 = shl i32 %4, 1
+  store i32 %6, i32* %3, align 4
+  br label %exit
+
+exit:
+  %7 = add nsw i32 %4, 1
+  store i32 %7, i32* %1, align 4
+  ret void
+}
+; CHECK-LABEL: test_GEP_across_BB:
+; CHECK-NOT: lwzu
+; CHECK: blr
+
+; CHECK-NoAA-LABEL: test_GEP_across_BB(
+; CHECK-NoAA: add i64 [[TMP:%[a-zA-Z0-9]+]], 528
+; CHECK-NoAA: add i64 [[TMP]], 532
+; CHECK-NoAA: if.true:
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 532
+; CHECK-NoAA: exit:
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 528
+
+; CHECK-UseAA-LABEL: test_GEP_across_BB(
+; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = getelementptr
+; CHECK-UseAA: getelementptr i8* [[PTR0]], i64 528
+; CHECK-UseAA: getelementptr i8* [[PTR0]], i64 532
+; CHECK-UseAA: if.true:
+; CHECK-UseAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8* [[PTR0]], i64 532
+; CHECK-UseAA: exit:
+; CHECK-UseAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8* [[PTR0]], i64 528
+
+%struct.S = type { float, double }
+@struct_array = global [1024 x %struct.S] zeroinitializer, align 16
+
+; The following two test cases check we can extract constant from indices of
+; struct type.
+; The constant offsets are from indices "i64 %idxprom" and "i32 1". As the
+; alloca size of %struct.S is 16, and "i32 1" is the 2rd element whose field
+; offset is 8, the total constant offset is (5 * 16 + 8) = 88.
+define double* @test-struct_1(i32 %i) {
+entry:
+  %add = add nsw i32 %i, 5
+  %idxprom = sext i32 %add to i64
+  %p = getelementptr [1024 x %struct.S]* @struct_array, i64 0, i64 %idxprom, i32 1
+  ret double* %p
+}
+; CHECK-NoAA-LABEL: @test-struct_1(
+; CHECK-NoAA-NOT: getelementptr
+; CHECK-NoAA: add i64 %{{[a-zA-Z0-9]+}}, 88
+
+; CHECK-UseAA-LABEL: @test-struct_1(
+; CHECK-UseAA: getelementptr i8* %{{[a-zA-Z0-9]+}}, i64 88
+
+%struct3 = type { i64, i32 }
+%struct2 = type { %struct3, i32 }
+%struct1 = type { i64, %struct2 }
+%struct0 = type { i32, i32, i64*, [100 x %struct1] }
+
+; The constant offsets are from indices "i32 3", "i64 %arrayidx" and "i32 1".
+; "i32 3" is the 4th element whose field offset is 16. The alloca size of
+; %struct1 is 32. "i32 1" is the 2rd element whose field offset is 8. So the
+; total constant offset is 16 + (-2 * 32) + 8 = -40
+define %struct2* @test-struct_2(%struct0* %ptr, i64 %idx) {
+entry:
+  %arrayidx = add nsw i64 %idx, -2
+  %ptr2 = getelementptr %struct0* %ptr, i64 0, i32 3, i64 %arrayidx, i32 1
+  ret %struct2* %ptr2
+}
+; CHECK-NoAA-LABEL: @test-struct_2(
+; CHECK-NoAA-NOT: = getelementptr
+; CHECK-NoAA: add i64 %{{[a-zA-Z0-9]+}}, -40
+
+; CHECK-UseAA-LABEL: @test-struct_2(
+; CHECK-UseAA: getelementptr i8* %{{[a-zA-Z0-9]+}}, i64 -40
+
+; Test that when a index is added from two constant, SeparateConstOffsetFromGEP
+; pass does not generate incorrect result.
+define void @test_const_add([3 x i32]* %in) {
+  %inc = add nsw i32 2, 1
+  %idxprom = sext i32 %inc to i64
+  %arrayidx = getelementptr [3 x i32]* %in, i64 %idxprom, i64 2
+  store i32 0, i32* %arrayidx, align 4
+  ret void
+}
+; CHECK-LABEL: test_const_add:
+; CHECK: li [[REG:[0-9]+]], 0
+; CHECK: stw [[REG]], 44(3)
+; CHECK: blr
+
diff --git a/test/CodeGen/PowerPC/ppc64-icbt-pwr7.ll b/test/CodeGen/PowerPC/ppc64-icbt-pwr7.ll
new file mode 100644
index 0000000..e8617cc
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-icbt-pwr7.ll
@@ -0,0 +1,19 @@
+; Test the ICBT instruction is not emitted on POWER7
+; Based on the ppc64-prefetch.ll test
+; RUN: not llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s 2>&1 | FileCheck %s
+ 
+declare void @llvm.prefetch(i8*, i32, i32, i32)
+
+define void @test(i8* %a, ...) nounwind {
+entry:
+  call void @llvm.prefetch(i8* %a, i32 0, i32 3, i32 0)
+  ret void
+
+; FIXME: Crashing is not really the correct behavior here, we really should just emit nothing
+; CHECK: Cannot select: 0x{{[0-9,a-f]+}}: ch = Prefetch 
+; CHECK: 0x{{[0-9,a-f]+}}: i32 = Constant<0> 
+; CHECK-NEXT: 0x{{[0-9,a-f]+}}: i32 = Constant<3>
+; CHECK-NEXT: 0x{{[0-9,a-f]+}}: i32 = Constant<0>
+
+}
+
diff --git a/test/CodeGen/PowerPC/ppc64-icbt-pwr8.ll b/test/CodeGen/PowerPC/ppc64-icbt-pwr8.ll
new file mode 100644
index 0000000..a0f084a
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-icbt-pwr8.ll
@@ -0,0 +1,16 @@
+; Test the ICBT instruction on POWER8
+; Copied from the ppc64-prefetch.ll test
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+
+declare void @llvm.prefetch(i8*, i32, i32, i32)
+
+define void @test(i8* %a, ...) nounwind {
+entry:
+  call void @llvm.prefetch(i8* %a, i32 0, i32 3, i32 0)
+  ret void
+
+; CHECK-LABEL: @test
+; CHECK: icbt
+}
+
+
diff --git a/test/CodeGen/PowerPC/ppc64-nonfunc-calls.ll b/test/CodeGen/PowerPC/ppc64-nonfunc-calls.ll
new file mode 100644
index 0000000..b1d3f39
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-nonfunc-calls.ll
@@ -0,0 +1,69 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.cd = type { i64, i64, i64 }
+
+@something = global [33 x i8] c"this is not really code, but...\0A\00", align 1
+@tls_something = thread_local global %struct.cd zeroinitializer, align 8
+@extern_something = external global %struct.cd
+
+; Function Attrs: nounwind
+define void @foo() #0 {
+entry:
+  tail call void bitcast ([33 x i8]* @something to void ()*)() #0
+  ret void
+
+; CHECK-LABEL: @foo
+; CHECK-DAG: addis [[REG1:[0-9]+]], 2, something@toc@ha
+; CHECK-DAG: std 2, 40(1)
+; CHECK-DAG: addi [[REG3:[0-9]+]], [[REG1]], something@toc@l
+; CHECK-DAG: ld [[REG2:[0-9]+]], 0([[REG3]])
+; CHECK-DAG: ld 11, 16([[REG3]])
+; CHECK-DAG: ld 2, 8([[REG3]])
+; CHECK-DAG: mtctr [[REG2]]
+; CHECK: bctrl
+; CHECK: ld 2, 40(1)
+; CHECK: blr
+}
+
+; Function Attrs: nounwind
+define void @bar() #0 {
+entry:
+  tail call void bitcast (%struct.cd* @tls_something to void ()*)() #0
+  ret void
+
+; CHECK-LABEL: @bar
+; CHECK-DAG: addis [[REG1:[0-9]+]], 13, tls_something@tprel@ha
+; CHECK-DAG: std 2, 40(1)
+; CHECK-DAG: addi [[REG3:[0-9]+]], [[REG1]], tls_something@tprel@l
+; CHECK-DAG: ld [[REG2:[0-9]+]], 0([[REG3]])
+; CHECK-DAG: ld 11, 16([[REG3]])
+; CHECK-DAG: ld 2, 8([[REG3]])
+; CHECK-DAG: mtctr [[REG2]]
+; CHECK: bctrl
+; CHECK: ld 2, 40(1)
+; CHECK: blr
+}
+
+; Function Attrs: nounwind
+define void @ext() #0 {
+entry:
+  tail call void bitcast (%struct.cd* @extern_something to void ()*)() #0
+  ret void
+
+; CHECK-LABEL: @ext
+; CHECK-DAG: addis [[REG1:[0-9]+]], 2, [[NAME:[._A-Za-z0-9]+]]@toc@ha
+; CHECK-DAG: std 2, 40(1)
+; CHECK-DAG: ld [[REG3:[0-9]+]], [[NAME]]@toc@l(3)
+; CHECK-DAG: ld [[REG2:[0-9]+]], 0([[REG3]])
+; CHECK-DAG: ld 11, 16([[REG3]])
+; CHECK-DAG: ld 2, 8([[REG3]])
+; CHECK-DAG: mtctr [[REG2]]
+; CHECK: bctrl
+; CHECK: ld 2, 40(1)
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/ppc64-patchpoint.ll b/test/CodeGen/PowerPC/ppc64-patchpoint.ll
new file mode 100644
index 0000000..6580eff
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-patchpoint.ll
@@ -0,0 +1,97 @@
+; RUN: llc                             < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
+; RUN: llc -fast-isel -fast-isel-abort < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu                             < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -fast-isel -fast-isel-abort < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
+
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Trivial patchpoint codegen
+;
+define i64 @trivial_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: trivial_patchpoint_codegen:
+
+; CHECK: li 12, -8531
+; CHECK-NEXT: rldic 12, 12, 32, 16
+; CHECK-NEXT: oris 12, 12, 48879
+; CHECK-NEXT: ori 12, 12, 51966
+; CHECK-NEXT: mtctr 12
+; CHECK-NEXT: bctrl
+
+; CHECK: li 12, -8531
+; CHECK-NEXT: rldic 12, 12, 32, 16
+; CHECK-NEXT: oris 12, 12, 48879
+; CHECK-NEXT: ori 12, 12, 51967
+; CHECK-NEXT: mtctr 12
+; CHECK-NEXT: bctrl
+
+; CHECK: blr
+
+  %resolveCall2 = inttoptr i64 244837814094590 to i8*
+  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 24, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  %resolveCall3 = inttoptr i64 244837814094591 to i8*
+  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 24, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+  ret i64 %result
+}
+
+; Caller frame metadata with stackmaps. This should not be optimized
+; as a leaf function.
+;
+; CHECK-LABEL: caller_meta_leaf
+; CHECK-BE: stdu 1, -80(1)
+; CHECK-LE: stdu 1, -64(1)
+; CHECK: Ltmp
+; CHECK-BE: addi 1, 1, 80
+; CHECK-LE: addi 1, 1, 64
+; CHECK: blr
+
+define void @caller_meta_leaf() {
+entry:
+  %metadata = alloca i64, i32 3, align 8
+  store i64 11, i64* %metadata
+  store i64 12, i64* %metadata
+  store i64 13, i64* %metadata
+  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
+  ret void
+}
+
+; Test patchpoints reusing the same TargetConstant.
+; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
+; There is no way to verify this, since it depends on memory allocation.
+; But I think it's useful to include as a working example.
+define i64 @testLowerConstant(i64 %arg, i64 %tmp2, i64 %tmp10, i64* %tmp33, i64 %tmp79) {
+entry:
+  %tmp80 = add i64 %tmp79, -16
+  %tmp81 = inttoptr i64 %tmp80 to i64*
+  %tmp82 = load i64* %tmp81, align 8
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+  %tmp83 = load i64* %tmp33, align 8
+  %tmp84 = add i64 %tmp83, -24
+  %tmp85 = inttoptr i64 %tmp84 to i64*
+  %tmp86 = load i64* %tmp85, align 8
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
+  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+  ret i64 10
+}
+
+; Test small patchpoints that don't emit calls.
+define void @small_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: small_patchpoint_codegen:
+; CHECK:      Ltmp
+; CHECK:      nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NOT:  nop
+; CHECK: blr
+  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
+  ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+
diff --git a/test/CodeGen/PowerPC/ppc64-r2-alloc.ll b/test/CodeGen/PowerPC/ppc64-r2-alloc.ll
new file mode 100644
index 0000000..87292d8
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-r2-alloc.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define signext i32 @foo(i32 signext %a, i32 signext %d) #0 {
+entry:
+  %div = sdiv i32 %a, %d
+  %div1 = sdiv i32 %div, %d
+  %div2 = sdiv i32 %div1, %d
+  %div3 = sdiv i32 %div2, %d
+  %div4 = sdiv i32 %div3, %d
+  %div5 = sdiv i32 %div4, %d
+  %div6 = sdiv i32 %div5, %d
+  %div7 = sdiv i32 %div6, %d
+  %div8 = sdiv i32 %div7, %d
+  %div9 = sdiv i32 %div8, %d
+  %div10 = sdiv i32 %div9, %d
+  %div11 = sdiv i32 %div10, %d
+  %div12 = sdiv i32 %div11, %d
+  %div13 = sdiv i32 %div12, %d
+  %div14 = sdiv i32 %div13, %d
+  %div15 = sdiv i32 %div14, %d
+  %div16 = sdiv i32 %div15, %d
+  %div17 = sdiv i32 %div16, %d
+  %div18 = sdiv i32 %div17, %d
+  %div19 = sdiv i32 %div18, %d
+  %div20 = sdiv i32 %div19, %d
+  %div21 = sdiv i32 %div20, %d
+  %div22 = sdiv i32 %div21, %d
+  %div23 = sdiv i32 %div22, %d
+  %div24 = sdiv i32 %div23, %d
+  %div25 = sdiv i32 %div24, %d
+  %div26 = sdiv i32 %div25, %d
+  %div27 = sdiv i32 %div26, %d
+  %div28 = sdiv i32 %div27, %d
+  %div29 = sdiv i32 %div28, %d
+  %div30 = sdiv i32 %div29, %d
+  %div31 = sdiv i32 %div30, %d
+  %div32 = sdiv i32 %div31, %d
+  %div33 = sdiv i32 %div32, %div31
+  %div34 = sdiv i32 %div33, %div30
+  %div35 = sdiv i32 %div34, %div29
+  %div36 = sdiv i32 %div35, %div28
+  %div37 = sdiv i32 %div36, %div27
+  %div38 = sdiv i32 %div37, %div26
+  %div39 = sdiv i32 %div38, %div25
+  %div40 = sdiv i32 %div39, %div24
+  %div41 = sdiv i32 %div40, %div23
+  %div42 = sdiv i32 %div41, %div22
+  %div43 = sdiv i32 %div42, %div21
+  %div44 = sdiv i32 %div43, %div20
+  %div45 = sdiv i32 %div44, %div19
+  %div46 = sdiv i32 %div45, %div18
+  %div47 = sdiv i32 %div46, %div17
+  %div48 = sdiv i32 %div47, %div16
+  %div49 = sdiv i32 %div48, %div15
+  %div50 = sdiv i32 %div49, %div14
+  %div51 = sdiv i32 %div50, %div13
+  %div52 = sdiv i32 %div51, %div12
+  %div53 = sdiv i32 %div52, %div11
+  %div54 = sdiv i32 %div53, %div10
+  %div55 = sdiv i32 %div54, %div9
+  %div56 = sdiv i32 %div55, %div8
+  %div57 = sdiv i32 %div56, %div7
+  %div58 = sdiv i32 %div57, %div6
+  %div59 = sdiv i32 %div58, %div5
+  %div60 = sdiv i32 %div59, %div4
+  %div61 = sdiv i32 %div60, %div3
+  %div62 = sdiv i32 %div61, %div2
+  %div63 = sdiv i32 %div62, %div1
+  %div64 = sdiv i32 %div63, %div
+  ret i32 %div64
+}
+
+; This function will need to use all non-reserved GPRs (and then some), make
+; sure that r2 is among them.
+; CHECK-LABEL: @foo
+; CHECK: std 2,
+; CHECK: ld 2,
+; CHECK: blr
+
diff --git a/test/CodeGen/PowerPC/ppc64-stackmap-nops.ll b/test/CodeGen/PowerPC/ppc64-stackmap-nops.ll
new file mode 100644
index 0000000..368ddc5
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-stackmap-nops.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-gnu-linux | FileCheck %s
+
+define void @test_shadow_optimization() {
+entry:
+; Expect 12 bytes worth of nops here rather than 32: With the shadow optimization
+; in place, 20 bytes will be consumed by the frame teardown and return instr.
+; CHECK-LABEL: test_shadow_optimization:
+
+; CHECK:      nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NOT:  nop
+; CHECK: addi 1, 1, 64
+; CHECK: ld [[REG1:[0-9]+]], 16(1)
+; CHECK: ld 31, -8(1)
+; CHECK: mtlr [[REG1]]
+; CHECK: blr
+
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  0, i32  32)
+  ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+
diff --git a/test/CodeGen/PowerPC/ppc64-stackmap.ll b/test/CodeGen/PowerPC/ppc64-stackmap.ll
new file mode 100644
index 0000000..714d363
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-stackmap.ll
@@ -0,0 +1,289 @@
+; RUN: llc                             < %s | FileCheck %s
+;
+; Note: Print verbose stackmaps using -debug-only=stackmaps.
+
+; We are not getting the correct stack alignment when cross compiling for arm64.
+; So specify a datalayout here.
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; CHECK-LABEL:  .section  .llvm_stackmaps
+; CHECK-NEXT:  __LLVM_StackMaps:
+; Header
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .short 0
+; Num Functions
+; CHECK-NEXT:   .long 11
+; Num LargeConstants
+; CHECK-NEXT:   .long 2
+; Num Callsites
+; CHECK-NEXT:   .long 11
+
+; Functions and stack size
+; CHECK-NEXT:   .quad constantargs
+; CHECK-NEXT:   .quad 128
+; CHECK-NEXT:   .quad osrinline
+; CHECK-NEXT:   .quad 144
+; CHECK-NEXT:   .quad osrcold
+; CHECK-NEXT:   .quad 128
+; CHECK-NEXT:   .quad propertyRead
+; CHECK-NEXT:   .quad 128
+; CHECK-NEXT:   .quad propertyWrite
+; CHECK-NEXT:   .quad 128
+; CHECK-NEXT:   .quad jsVoidCall
+; CHECK-NEXT:   .quad 128
+; CHECK-NEXT:   .quad jsIntCall
+; CHECK-NEXT:   .quad 128
+; CHECK-NEXT:   .quad spilledValue
+; CHECK-NEXT:   .quad 304
+; CHECK-NEXT:   .quad spilledStackMapValue
+; CHECK-NEXT:   .quad 224
+; CHECK-NEXT:   .quad liveConstant
+; CHECK-NEXT:   .quad 64
+; CHECK-NEXT:   .quad clobberLR
+; CHECK-NEXT:   .quad 208
+
+; Num LargeConstants
+; CHECK-NEXT:   .quad   4294967295
+; CHECK-NEXT:   .quad   4294967296
+
+; Constant arguments
+;
+; CHECK-NEXT:   .quad   1
+; CHECK-NEXT:   .long   .L{{.*}}-.L.constantargs
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  4
+; SmallConstant
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   65535
+; SmallConstant
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   65536
+; SmallConstant
+; CHECK-NEXT:   .byte   5
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   0
+; LargeConstant at index 0
+; CHECK-NEXT:   .byte   5
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   1
+
+define void @constantargs() {
+entry:
+  %0 = inttoptr i64 244837814094590 to i8*
+  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 24, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+  ret void
+}
+
+; Inline OSR Exit
+;
+; CHECK-LABEL:  .long   .L{{.*}}-.L.osrinline
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long  0
+define void @osrinline(i64 %a, i64 %b) {
+entry:
+  ; Runtime void->void call.
+  call void inttoptr (i64 244837814094590 to void ()*)()
+  ; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
+  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
+  ret void
+}
+
+; Cold OSR Exit
+;
+; 2 live variables in register.
+;
+; CHECK-LABEL:  .long   .L{{.*}}-.L.osrcold
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long  0
+define void @osrcold(i64 %a, i64 %b) {
+entry:
+  %test = icmp slt i64 %a, %b
+  br i1 %test, label %ret, label %cold
+cold:
+  ; OSR patchpoint with 12-byte nop-slide and 2 live vars.
+  %thunk = inttoptr i64 244837814094590 to i8*
+  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 24, i8* %thunk, i32 0, i64 %a, i64 %b)
+  unreachable
+ret:
+  ret void
+}
+
+; Property Read
+; CHECK-LABEL:  .long   .L{{.*}}-.L.propertyRead
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  0
+;
+; FIXME: There are currently no stackmap entries. After moving to
+; AnyRegCC, we will have entries for the object and return value.
+define i64 @propertyRead(i64* %obj) {
+entry:
+  %resolveRead = inttoptr i64 244837814094590 to i8*
+  %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 24, i8* %resolveRead, i32 1, i64* %obj)
+  %add = add i64 %result, 3
+  ret i64 %add
+}
+
+; Property Write
+; CHECK-LABEL:  .long   .L{{.*}}-.L.propertyWrite
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
+entry:
+  %resolveWrite = inttoptr i64 244837814094590 to i8*
+  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 24, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+  ret void
+}
+
+; Void JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK-LABEL:  .long   .L{{.*}}-.L.jsVoidCall
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+  %resolveCall = inttoptr i64 244837814094590 to i8*
+  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 24, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  ret void
+}
+
+; i64 JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK-LABEL:  .long   .L{{.*}}-.L.jsIntCall
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+  %resolveCall = inttoptr i64 244837814094590 to i8*
+  %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 24, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  %add = add i64 %result, 3
+  ret i64 %add
+}
+
+; Spilled stack map values.
+;
+; Verify 28 stack map entries.
+;
+; CHECK-LABEL:  .long .L{{.*}}-.L.spilledValue
+; CHECK-NEXT:   .short 0
+; CHECK-NEXT:   .short 28
+;
+; Check that at least one is a spilled entry from r31.
+; Location: Indirect FP + ...
+; CHECK:        .byte 3
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short 31
+define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) {
+entry:
+  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 24, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
+  ret void
+}
+
+; Spilled stack map values.
+;
+; Verify 30 stack map entries.
+;
+; CHECK-LABEL:  .long .L{{.*}}-.L.spilledStackMapValue
+; CHECK-NEXT:   .short 0
+; CHECK-NEXT:   .short 30
+;
+; Check that at least one is a spilled entry from r31.
+; Location: Indirect FP + ...
+; CHECK:        .byte 3
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short 31
+define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29) {
+entry:
+  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
+  ret void
+}
+
+
+; Map a constant value.
+;
+; CHECK-LABEL:  .long .L{{.*}}-.L.liveConstant
+; CHECK-NEXT:   .short 0
+; 1 location
+; CHECK-NEXT:   .short 1
+; Loc 0: SmallConstant
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   33
+
+define void @liveConstant() {
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
+  ret void
+}
+
+; Map a value when LR is the only free register.
+;
+; CHECK-LABEL:  .long .L{{.*}}-.L.clobberLR
+; CHECK-NEXT:   .short 0
+; 1 location
+; CHECK-NEXT:   .short 1
+; Loc 0: Indirect FP (r31) - offset
+; CHECK-NEXT:   .byte   3
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .short  31
+; CHECK-NEXT:   .long   {{[0-9]+}}
+define void @clobberLR(i32 %a) {
+  tail call void asm sideeffect "nop", "~{r0},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r14},~{r15},~{r16},~{r17},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"() nounwind
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
+  ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/PowerPC/ppc64-vaarg-int.ll b/test/CodeGen/PowerPC/ppc64-vaarg-int.ll
index 5a63b01..c9a4f91 100644
--- a/test/CodeGen/PowerPC/ppc64-vaarg-int.ll
+++ b/test/CodeGen/PowerPC/ppc64-vaarg-int.ll
@@ -16,5 +16,5 @@ declare void @llvm.va_start(i8*) nounwind
 
 ; CHECK: @intvaarg
 ; Make sure that the va pointer is incremented by 8 (not 4).
-; CHECK: addi{{.*}}, 8
+; CHECK: addi{{.*}}, 1, 64 
 
diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
index 9eed623..3fce36e 100644
--- a/test/CodeGen/PowerPC/ppc64le-aggregates.ll
+++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
@@ -1,4 +1,11 @@
-; RUN: llc < %s -march=ppc64le -mcpu=pwr8 -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -march=ppc64le -mcpu=pwr8 -mattr=+altivec -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -march=ppc64le -mattr=+altivec -mattr=-vsx | FileCheck %s
+
+; Currently VSX support is disabled for this test because we generate lxsdx
+; instead of lfd, and stxsdx instead of stfd.  That is a poor choice when we
+; have reg+imm addressing, and is on the list of things to be fixed.
+; The second run step is to ensure that -march=ppc64le is adequate to select
+; the same feature set as with -mcpu=pwr8 since that is the baseline for ppc64le.
 
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"
@@ -257,26 +264,26 @@ entry:
   ret void
 }
 ; CHECK-LABEL: @caller2
-; CHECK: ld [[REG:[0-9]+]], .LC
-; CHECK-DAG: lfs 1, 0([[REG]])
-; CHECK-DAG: lfs 2, 4([[REG]])
-; CHECK-DAG: lfs 3, 8([[REG]])
-; CHECK-DAG: lfs 4, 12([[REG]])
-; CHECK-DAG: lfs 5, 16([[REG]])
-; CHECK-DAG: lfs 6, 20([[REG]])
-; CHECK-DAG: lfs 7, 24([[REG]])
-; CHECK-DAG: lfs 8, 28([[REG]])
-; CHECK: ld [[REG:[0-9]+]], .LC
-; CHECK-DAG: lfs 9, 0([[REG]])
-; CHECK-DAG: lfs 10, 4([[REG]])
-; CHECK-DAG: lfs 11, 8([[REG]])
-; CHECK-DAG: lfs 12, 12([[REG]])
-; CHECK-DAG: lfs 13, 16([[REG]])
-; CHECK: ld [[REG:[0-9]+]], .LC
-; CHECK-DAG: lwz [[REG0:[0-9]+]], 0([[REG]])
-; CHECK-DAG: lwz [[REG1:[0-9]+]], 4([[REG]])
-; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
-; CHECK-DAG: or 10, [[REG0]], [[REG1]]
+; CHECK: ld {{[0-9]+}}, .LC
+; CHECK-DAG: lfs 1, 0({{[0-9]+}})
+; CHECK-DAG: lfs 2, 4({{[0-9]+}})
+; CHECK-DAG: lfs 3, 8({{[0-9]+}})
+; CHECK-DAG: lfs 4, 12({{[0-9]+}})
+; CHECK-DAG: lfs 5, 16({{[0-9]+}})
+; CHECK-DAG: lfs 6, 20({{[0-9]+}})
+; CHECK-DAG: lfs 7, 24({{[0-9]+}})
+; CHECK-DAG: lfs 8, 28({{[0-9]+}})
+
+; CHECK-DAG: lfs 9, 0({{[0-9]+}})
+; CHECK-DAG: lfs 10, 4({{[0-9]+}})
+; CHECK-DAG: lfs 11, 8({{[0-9]+}})
+; CHECK-DAG: lfs 12, 12({{[0-9]+}})
+; CHECK-DAG: lfs 13, 16({{[0-9]+}})
+
+; CHECK-DAG: lwz [[REG0:[0-9]+]], 0({{[0-9]+}})
+; CHECK-DAG: lwz [[REG1:[0-9]+]], 4({{[0-9]+}})
+; CHECK-DAG: sldi [[REG2:[0-9]+]], [[REG1]], 32
+; CHECK-DAG: or 10, [[REG0]], [[REG2]]
 ; CHECK: bl test2
 
 declare void @test2([8 x float], [5 x float], [2 x float])
diff --git a/test/CodeGen/PowerPC/ppc64le-calls.ll b/test/CodeGen/PowerPC/ppc64le-calls.ll
index 0d667dd..b65b954 100644
--- a/test/CodeGen/PowerPC/ppc64le-calls.ll
+++ b/test/CodeGen/PowerPC/ppc64le-calls.ll
@@ -1,4 +1,8 @@
 ; RUN: llc -march=ppc64le -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -march=ppc64le < %s | FileCheck %s
+
+; The second run of the test case is to ensure the behaviour is the same
+; without specifying -mcpu=pwr8 as that is now the baseline for ppc64le.
 
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/ppc64le-localentry.ll b/test/CodeGen/PowerPC/ppc64le-localentry.ll
index 4676ce8..d9995de 100644
--- a/test/CodeGen/PowerPC/ppc64le-localentry.ll
+++ b/test/CodeGen/PowerPC/ppc64le-localentry.ll
@@ -1,5 +1,10 @@
 ; RUN: llc -march=ppc64le -mcpu=pwr8 < %s | FileCheck %s
 ; RUN: llc -march=ppc64le -mcpu=pwr8 -O0 < %s | FileCheck %s
+; RUN: llc -march=ppc64le < %s | FileCheck %s
+; RUN: llc -march=ppc64le -O0 < %s | FileCheck %s
+
+; The second run of the test case is to ensure the behaviour is the same
+; without specifying -mcpu=pwr8 as that is now the baseline for ppc64le.
 
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/ppcf128-endian.ll b/test/CodeGen/PowerPC/ppcf128-endian.ll
index 2a5f13a..180fedf 100644
--- a/test/CodeGen/PowerPC/ppcf128-endian.ll
+++ b/test/CodeGen/PowerPC/ppcf128-endian.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=pwr7 -mattr=+altivec < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mattr=+altivec -mattr=-vsx < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/pr17168.ll b/test/CodeGen/PowerPC/pr17168.ll
index c3f0162..62a9ede 100644
--- a/test/CodeGen/PowerPC/pr17168.ll
+++ b/test/CodeGen/PowerPC/pr17168.ll
@@ -25,7 +25,7 @@ for.cond968.preheader:                            ; preds = %for.cond968.prehead
 for.end1042:                                      ; preds = %for.cond968.preheader, %for.cond964.preheader, %entry
   %0 = phi i32 [ undef, %for.cond964.preheader ], [ undef, %for.cond968.preheader ], [ undef, %entry ]
   %1 = load i32* getelementptr inbounds ([3 x i32]* @grid_points, i64 0, i64 0), align 4, !dbg !443, !tbaa !444
-  tail call void @llvm.dbg.value(metadata !447, i64 0, metadata !119, metadata !{metadata !"0x102"}), !dbg !448
+  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !119, metadata !{!"0x102"}), !dbg !448
   %sub10454270 = add nsw i32 %0, -1, !dbg !448
   %cmp10464271 = icmp sgt i32 %sub10454270, 1, !dbg !448
   %sub11134263 = add nsw i32 %1, -1, !dbg !450
@@ -54,468 +54,468 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!438, !464}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 190311)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !298, metadata !2} ; [ DW_TAG_compile_unit ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"bt.c", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !82, metadata !102, metadata !114, metadata !132, metadata !145, metadata !154, metadata !155, metadata !162, metadata !183, metadata !200, metadata !201, metadata !207, metadata !208, metadata !215, metadata !221, metadata !230, metadata !238, metadata !246, metadata !255, metadata !260, metadata !261, metadata !268, metadata !274, metadata !279, metadata !280, metadata !287, metadata !293}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\0074\000\001\000\006\00256\001\0074", metadata !1, metadata !5, metadata !6, null, null, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 74] [def] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8, metadata !9}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!11 = metadata !{metadata !"0x24\00char\000\008\008\000\000\008", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_unsigned_char]
-!12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !16, metadata !17, metadata !18, metadata !19, metadata !21, metadata !22, metadata !23, metadata !25, metadata !26}
-!13 = metadata !{metadata !"0x101\00argc\0016777290\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [argc] [line 74]
-!14 = metadata !{metadata !"0x101\00argv\0033554506\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [argv] [line 74]
-!15 = metadata !{metadata !"0x100\00niter\0076\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [niter] [line 76]
-!16 = metadata !{metadata !"0x100\00step\0076\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [step] [line 76]
-!17 = metadata !{metadata !"0x100\00n3\0076\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n3] [line 76]
-!18 = metadata !{metadata !"0x100\00nthreads\0077\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [nthreads] [line 77]
-!19 = metadata !{metadata !"0x100\00navg\0078\000", metadata !4, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [navg] [line 78]
-!20 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!21 = metadata !{metadata !"0x100\00mflops\0078\000", metadata !4, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [mflops] [line 78]
-!22 = metadata !{metadata !"0x100\00tmax\0080\000", metadata !4, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [tmax] [line 80]
-!23 = metadata !{metadata !"0x100\00verified\0081\000", metadata !4, metadata !5, metadata !24} ; [ DW_TAG_auto_variable ] [verified] [line 81]
-!24 = metadata !{metadata !"0x16\00boolean\0012\000\000\000\000", metadata !1, null, metadata !8} ; [ DW_TAG_typedef ] [boolean] [line 12, size 0, align 0, offset 0] [from int]
-!25 = metadata !{metadata !"0x100\00class\0082\000", metadata !4, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [class] [line 82]
-!26 = metadata !{metadata !"0x100\00fp\0083\000", metadata !4, metadata !5, metadata !27} ; [ DW_TAG_auto_variable ] [fp] [line 83]
-!27 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from FILE]
-!28 = metadata !{metadata !"0x16\00FILE\0049\000\000\000\000", metadata !1, null, metadata !29} ; [ DW_TAG_typedef ] [FILE] [line 49, size 0, align 0, offset 0] [from _IO_FILE]
-!29 = metadata !{metadata !"0x13\00_IO_FILE\00271\001728\0064\000\000\000", metadata !30, null, null, metadata !31, null, null, null} ; [ DW_TAG_structure_type ] [_IO_FILE] [line 271, size 1728, align 64, offset 0] [def] [from ]
-!30 = metadata !{metadata !"/usr/include/libio.h", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
-!31 = metadata !{metadata !32, metadata !33, metadata !34, metadata !35, metadata !36, metadata !37, metadata !38, metadata !39, metadata !40, metadata !41, metadata !42, metadata !43, metadata !44, metadata !52, metadata !53, metadata !54, metadata !55, metadata !58, metadata !60, metadata !62, metadata !66, metadata !68, metadata !70, metadata !71, metadata !72, metadata !73, metadata !74, metadata !77, metadata !78}
-!32 = metadata !{metadata !"0xd\00_flags\00272\0032\0032\000\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_flags] [line 272, size 32, align 32, offset 0] [from int]
-!33 = metadata !{metadata !"0xd\00_IO_read_ptr\00277\0064\0064\0064\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_read_ptr] [line 277, size 64, align 64, offset 64] [from ]
-!34 = metadata !{metadata !"0xd\00_IO_read_end\00278\0064\0064\00128\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_read_end] [line 278, size 64, align 64, offset 128] [from ]
-!35 = metadata !{metadata !"0xd\00_IO_read_base\00279\0064\0064\00192\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_read_base] [line 279, size 64, align 64, offset 192] [from ]
-!36 = metadata !{metadata !"0xd\00_IO_write_base\00280\0064\0064\00256\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_write_base] [line 280, size 64, align 64, offset 256] [from ]
-!37 = metadata !{metadata !"0xd\00_IO_write_ptr\00281\0064\0064\00320\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_write_ptr] [line 281, size 64, align 64, offset 320] [from ]
-!38 = metadata !{metadata !"0xd\00_IO_write_end\00282\0064\0064\00384\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_write_end] [line 282, size 64, align 64, offset 384] [from ]
-!39 = metadata !{metadata !"0xd\00_IO_buf_base\00283\0064\0064\00448\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_buf_base] [line 283, size 64, align 64, offset 448] [from ]
-!40 = metadata !{metadata !"0xd\00_IO_buf_end\00284\0064\0064\00512\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_buf_end] [line 284, size 64, align 64, offset 512] [from ]
-!41 = metadata !{metadata !"0xd\00_IO_save_base\00286\0064\0064\00576\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_save_base] [line 286, size 64, align 64, offset 576] [from ]
-!42 = metadata !{metadata !"0xd\00_IO_backup_base\00287\0064\0064\00640\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_backup_base] [line 287, size 64, align 64, offset 640] [from ]
-!43 = metadata !{metadata !"0xd\00_IO_save_end\00288\0064\0064\00704\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_save_end] [line 288, size 64, align 64, offset 704] [from ]
-!44 = metadata !{metadata !"0xd\00_markers\00290\0064\0064\00768\000", metadata !30, metadata !29, metadata !45} ; [ DW_TAG_member ] [_markers] [line 290, size 64, align 64, offset 768] [from ]
-!45 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !46} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_marker]
-!46 = metadata !{metadata !"0x13\00_IO_marker\00186\00192\0064\000\000\000", metadata !30, null, null, metadata !47, null, null, null} ; [ DW_TAG_structure_type ] [_IO_marker] [line 186, size 192, align 64, offset 0] [def] [from ]
-!47 = metadata !{metadata !48, metadata !49, metadata !51}
-!48 = metadata !{metadata !"0xd\00_next\00187\0064\0064\000\000", metadata !30, metadata !46, metadata !45} ; [ DW_TAG_member ] [_next] [line 187, size 64, align 64, offset 0] [from ]
-!49 = metadata !{metadata !"0xd\00_sbuf\00188\0064\0064\0064\000", metadata !30, metadata !46, metadata !50} ; [ DW_TAG_member ] [_sbuf] [line 188, size 64, align 64, offset 64] [from ]
-!50 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_FILE]
-!51 = metadata !{metadata !"0xd\00_pos\00192\0032\0032\00128\000", metadata !30, metadata !46, metadata !8} ; [ DW_TAG_member ] [_pos] [line 192, size 32, align 32, offset 128] [from int]
-!52 = metadata !{metadata !"0xd\00_chain\00292\0064\0064\00832\000", metadata !30, metadata !29, metadata !50} ; [ DW_TAG_member ] [_chain] [line 292, size 64, align 64, offset 832] [from ]
-!53 = metadata !{metadata !"0xd\00_fileno\00294\0032\0032\00896\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_fileno] [line 294, size 32, align 32, offset 896] [from int]
-!54 = metadata !{metadata !"0xd\00_flags2\00298\0032\0032\00928\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_flags2] [line 298, size 32, align 32, offset 928] [from int]
-!55 = metadata !{metadata !"0xd\00_old_offset\00300\0064\0064\00960\000", metadata !30, metadata !29, metadata !56} ; [ DW_TAG_member ] [_old_offset] [line 300, size 64, align 64, offset 960] [from __off_t]
-!56 = metadata !{metadata !"0x16\00__off_t\00141\000\000\000\000", metadata !30, null, metadata !57} ; [ DW_TAG_typedef ] [__off_t] [line 141, size 0, align 0, offset 0] [from long int]
-!57 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
-!58 = metadata !{metadata !"0xd\00_cur_column\00304\0016\0016\001024\000", metadata !30, metadata !29, metadata !59} ; [ DW_TAG_member ] [_cur_column] [line 304, size 16, align 16, offset 1024] [from unsigned short]
-!59 = metadata !{metadata !"0x24\00unsigned short\000\0016\0016\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
-!60 = metadata !{metadata !"0xd\00_vtable_offset\00305\008\008\001040\000", metadata !30, metadata !29, metadata !61} ; [ DW_TAG_member ] [_vtable_offset] [line 305, size 8, align 8, offset 1040] [from signed char]
-!61 = metadata !{metadata !"0x24\00signed char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!62 = metadata !{metadata !"0xd\00_shortbuf\00306\008\008\001048\000", metadata !30, metadata !29, metadata !63} ; [ DW_TAG_member ] [_shortbuf] [line 306, size 8, align 8, offset 1048] [from ]
-!63 = metadata !{metadata !"0x1\00\000\008\008\000\000", null, null, metadata !11, metadata !64, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
-!64 = metadata !{metadata !65}
-!65 = metadata !{metadata !"0x21\000\001"}        ; [ DW_TAG_subrange_type ] [0, 0]
-!66 = metadata !{metadata !"0xd\00_lock\00310\0064\0064\001088\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [_lock] [line 310, size 64, align 64, offset 1088] [from ]
-!67 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!68 = metadata !{metadata !"0xd\00_offset\00319\0064\0064\001152\000", metadata !30, metadata !29, metadata !69} ; [ DW_TAG_member ] [_offset] [line 319, size 64, align 64, offset 1152] [from __off64_t]
-!69 = metadata !{metadata !"0x16\00__off64_t\00142\000\000\000\000", metadata !30, null, metadata !57} ; [ DW_TAG_typedef ] [__off64_t] [line 142, size 0, align 0, offset 0] [from long int]
-!70 = metadata !{metadata !"0xd\00__pad1\00328\0064\0064\001216\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad1] [line 328, size 64, align 64, offset 1216] [from ]
-!71 = metadata !{metadata !"0xd\00__pad2\00329\0064\0064\001280\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad2] [line 329, size 64, align 64, offset 1280] [from ]
-!72 = metadata !{metadata !"0xd\00__pad3\00330\0064\0064\001344\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad3] [line 330, size 64, align 64, offset 1344] [from ]
-!73 = metadata !{metadata !"0xd\00__pad4\00331\0064\0064\001408\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad4] [line 331, size 64, align 64, offset 1408] [from ]
-!74 = metadata !{metadata !"0xd\00__pad5\00332\0064\0064\001472\000", metadata !30, metadata !29, metadata !75} ; [ DW_TAG_member ] [__pad5] [line 332, size 64, align 64, offset 1472] [from size_t]
-!75 = metadata !{metadata !"0x16\00size_t\0042\000\000\000\000", metadata !30, null, metadata !76} ; [ DW_TAG_typedef ] [size_t] [line 42, size 0, align 0, offset 0] [from long unsigned int]
-!76 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!77 = metadata !{metadata !"0xd\00_mode\00334\0032\0032\001536\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_mode] [line 334, size 32, align 32, offset 1536] [from int]
-!78 = metadata !{metadata !"0xd\00_unused2\00336\00160\008\001568\000", metadata !30, metadata !29, metadata !79} ; [ DW_TAG_member ] [_unused2] [line 336, size 160, align 8, offset 1568] [from ]
-!79 = metadata !{metadata !"0x1\00\000\00160\008\000\000", null, null, metadata !11, metadata !80, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
-!80 = metadata !{metadata !81}
-!81 = metadata !{metadata !"0x21\000\0020"}       ; [ DW_TAG_subrange_type ] [0, 19]
-!82 = metadata !{metadata !"0x2e\00verify\00verify\00\002388\001\001\000\006\00256\001\002388", metadata !1, metadata !5, metadata !83, null, null, null, null, metadata !86} ; [ DW_TAG_subprogram ] [line 2388] [local] [def] [verify]
-!83 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !84, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!84 = metadata !{null, metadata !8, metadata !10, metadata !85}
-!85 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from boolean]
-!86 = metadata !{metadata !87, metadata !88, metadata !89, metadata !90, metadata !94, metadata !95, metadata !96, metadata !97, metadata !98, metadata !99, metadata !100, metadata !101}
-!87 = metadata !{metadata !"0x101\00no_time_steps\0016779604\000", metadata !82, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [no_time_steps] [line 2388]
-!88 = metadata !{metadata !"0x101\00class\0033556820\000", metadata !82, metadata !5, metadata !10} ; [ DW_TAG_arg_variable ] [class] [line 2388]
-!89 = metadata !{metadata !"0x101\00verified\0050334036\000", metadata !82, metadata !5, metadata !85} ; [ DW_TAG_arg_variable ] [verified] [line 2388]
-!90 = metadata !{metadata !"0x100\00xcrref\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcrref] [line 2397]
-!91 = metadata !{metadata !"0x1\00\000\00320\0064\000\000", null, null, metadata !20, metadata !92, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 64, offset 0] [from double]
-!92 = metadata !{metadata !93}
-!93 = metadata !{metadata !"0x21\000\005"}        ; [ DW_TAG_subrange_type ] [0, 4]
-!94 = metadata !{metadata !"0x100\00xceref\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xceref] [line 2397]
-!95 = metadata !{metadata !"0x100\00xcrdif\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcrdif] [line 2397]
-!96 = metadata !{metadata !"0x100\00xcedif\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcedif] [line 2397]
-!97 = metadata !{metadata !"0x100\00epsilon\002398\000", metadata !82, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [epsilon] [line 2398]
-!98 = metadata !{metadata !"0x100\00xce\002398\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xce] [line 2398]
-!99 = metadata !{metadata !"0x100\00xcr\002398\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcr] [line 2398]
-!100 = metadata !{metadata !"0x100\00dtref\002398\000", metadata !82, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [dtref] [line 2398]
-!101 = metadata !{metadata !"0x100\00m\002399\000", metadata !82, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 2399]
-!102 = metadata !{metadata !"0x2e\00rhs_norm\00rhs_norm\00\00266\001\001\000\006\00256\001\00266", metadata !1, metadata !5, metadata !103, null, null, null, null, metadata !106} ; [ DW_TAG_subprogram ] [line 266] [local] [def] [rhs_norm]
-!103 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !104, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!104 = metadata !{null, metadata !105}
-!105 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
-!106 = metadata !{metadata !107, metadata !108, metadata !109, metadata !110, metadata !111, metadata !112, metadata !113}
-!107 = metadata !{metadata !"0x101\00rms\0016777482\000", metadata !102, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [rms] [line 266]
-!108 = metadata !{metadata !"0x100\00i\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 271]
-!109 = metadata !{metadata !"0x100\00j\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 271]
-!110 = metadata !{metadata !"0x100\00k\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 271]
-!111 = metadata !{metadata !"0x100\00d\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [d] [line 271]
-!112 = metadata !{metadata !"0x100\00m\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 271]
-!113 = metadata !{metadata !"0x100\00add\00272\000", metadata !102, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [add] [line 272]
-!114 = metadata !{metadata !"0x2e\00compute_rhs\00compute_rhs\00\001767\001\001\000\006\00256\001\001767", metadata !1, metadata !5, metadata !115, null, void ()* @compute_rhs, null, null, metadata !117} ; [ DW_TAG_subprogram ] [line 1767] [local] [def] [compute_rhs]
-!115 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !116, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!116 = metadata !{null}
-!117 = metadata !{metadata !118, metadata !119, metadata !120, metadata !121, metadata !122, metadata !123, metadata !124, metadata !125, metadata !126, metadata !127, metadata !128, metadata !129, metadata !130, metadata !131}
-!118 = metadata !{metadata !"0x100\00i\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 1769]
-!119 = metadata !{metadata !"0x100\00j\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 1769]
-!120 = metadata !{metadata !"0x100\00k\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 1769]
-!121 = metadata !{metadata !"0x100\00m\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 1769]
-!122 = metadata !{metadata !"0x100\00rho_inv\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [rho_inv] [line 1770]
-!123 = metadata !{metadata !"0x100\00uijk\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [uijk] [line 1770]
-!124 = metadata !{metadata !"0x100\00up1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [up1] [line 1770]
-!125 = metadata !{metadata !"0x100\00um1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [um1] [line 1770]
-!126 = metadata !{metadata !"0x100\00vijk\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [vijk] [line 1770]
-!127 = metadata !{metadata !"0x100\00vp1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [vp1] [line 1770]
-!128 = metadata !{metadata !"0x100\00vm1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [vm1] [line 1770]
-!129 = metadata !{metadata !"0x100\00wijk\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [wijk] [line 1770]
-!130 = metadata !{metadata !"0x100\00wp1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [wp1] [line 1770]
-!131 = metadata !{metadata !"0x100\00wm1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [wm1] [line 1770]
-!132 = metadata !{metadata !"0x2e\00error_norm\00error_norm\00\00225\001\001\000\006\00256\001\00225", metadata !1, metadata !5, metadata !103, null, null, null, null, metadata !133} ; [ DW_TAG_subprogram ] [line 225] [local] [def] [error_norm]
-!133 = metadata !{metadata !134, metadata !135, metadata !136, metadata !137, metadata !138, metadata !139, metadata !140, metadata !141, metadata !142, metadata !143, metadata !144}
-!134 = metadata !{metadata !"0x101\00rms\0016777441\000", metadata !132, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [rms] [line 225]
-!135 = metadata !{metadata !"0x100\00i\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 232]
-!136 = metadata !{metadata !"0x100\00j\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 232]
-!137 = metadata !{metadata !"0x100\00k\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 232]
-!138 = metadata !{metadata !"0x100\00m\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 232]
-!139 = metadata !{metadata !"0x100\00d\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [d] [line 232]
-!140 = metadata !{metadata !"0x100\00xi\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [xi] [line 233]
-!141 = metadata !{metadata !"0x100\00eta\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [eta] [line 233]
-!142 = metadata !{metadata !"0x100\00zeta\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [zeta] [line 233]
-!143 = metadata !{metadata !"0x100\00u_exact\00233\000", metadata !132, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [u_exact] [line 233]
-!144 = metadata !{metadata !"0x100\00add\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [add] [line 233]
-!145 = metadata !{metadata !"0x2e\00exact_solution\00exact_solution\00\00643\001\001\000\006\00256\001\00644", metadata !1, metadata !5, metadata !146, null, null, null, null, metadata !148} ; [ DW_TAG_subprogram ] [line 643] [local] [def] [scope 644] [exact_solution]
-!146 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !147, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!147 = metadata !{null, metadata !20, metadata !20, metadata !20, metadata !105}
-!148 = metadata !{metadata !149, metadata !150, metadata !151, metadata !152, metadata !153}
-!149 = metadata !{metadata !"0x101\00xi\0016777859\000", metadata !145, metadata !5, metadata !20} ; [ DW_TAG_arg_variable ] [xi] [line 643]
-!150 = metadata !{metadata !"0x101\00eta\0033555075\000", metadata !145, metadata !5, metadata !20} ; [ DW_TAG_arg_variable ] [eta] [line 643]
-!151 = metadata !{metadata !"0x101\00zeta\0050332291\000", metadata !145, metadata !5, metadata !20} ; [ DW_TAG_arg_variable ] [zeta] [line 643]
-!152 = metadata !{metadata !"0x101\00dtemp\0067109508\000", metadata !145, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [dtemp] [line 644]
-!153 = metadata !{metadata !"0x100\00m\00653\000", metadata !145, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 653]
-!154 = metadata !{metadata !"0x2e\00set_constants\00set_constants\00\002191\001\001\000\006\00256\001\002191", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2191] [local] [def] [set_constants]
-!155 = metadata !{metadata !"0x2e\00lhsinit\00lhsinit\00\00855\001\001\000\006\00256\001\00855", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !156} ; [ DW_TAG_subprogram ] [line 855] [local] [def] [lhsinit]
-!156 = metadata !{metadata !157, metadata !158, metadata !159, metadata !160, metadata !161}
-!157 = metadata !{metadata !"0x100\00i\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 857]
-!158 = metadata !{metadata !"0x100\00j\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 857]
-!159 = metadata !{metadata !"0x100\00k\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 857]
-!160 = metadata !{metadata !"0x100\00m\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 857]
-!161 = metadata !{metadata !"0x100\00n\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 857]
-!162 = metadata !{metadata !"0x2e\00initialize\00initialize\00\00669\001\001\000\006\00256\001\00669", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !163} ; [ DW_TAG_subprogram ] [line 669] [local] [def] [initialize]
-!163 = metadata !{metadata !164, metadata !165, metadata !166, metadata !167, metadata !168, metadata !169, metadata !170, metadata !171, metadata !172, metadata !173, metadata !174, metadata !179, metadata !180, metadata !181, metadata !182}
-!164 = metadata !{metadata !"0x100\00i\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 679]
-!165 = metadata !{metadata !"0x100\00j\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 679]
-!166 = metadata !{metadata !"0x100\00k\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 679]
-!167 = metadata !{metadata !"0x100\00m\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 679]
-!168 = metadata !{metadata !"0x100\00ix\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [ix] [line 679]
-!169 = metadata !{metadata !"0x100\00iy\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [iy] [line 679]
-!170 = metadata !{metadata !"0x100\00iz\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [iz] [line 679]
-!171 = metadata !{metadata !"0x100\00xi\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [xi] [line 680]
-!172 = metadata !{metadata !"0x100\00eta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [eta] [line 680]
-!173 = metadata !{metadata !"0x100\00zeta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [zeta] [line 680]
-!174 = metadata !{metadata !"0x100\00Pface\00680\000", metadata !162, metadata !5, metadata !175} ; [ DW_TAG_auto_variable ] [Pface] [line 680]
-!175 = metadata !{metadata !"0x1\00\000\001920\0064\000\000", null, null, metadata !20, metadata !176, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1920, align 64, offset 0] [from double]
-!176 = metadata !{metadata !177, metadata !178, metadata !93}
-!177 = metadata !{metadata !"0x21\000\002"}       ; [ DW_TAG_subrange_type ] [0, 1]
-!178 = metadata !{metadata !"0x21\000\003"}       ; [ DW_TAG_subrange_type ] [0, 2]
-!179 = metadata !{metadata !"0x100\00Pxi\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [Pxi] [line 680]
-!180 = metadata !{metadata !"0x100\00Peta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [Peta] [line 680]
-!181 = metadata !{metadata !"0x100\00Pzeta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [Pzeta] [line 680]
-!182 = metadata !{metadata !"0x100\00temp\00680\000", metadata !162, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [temp] [line 680]
-!183 = metadata !{metadata !"0x2e\00exact_rhs\00exact_rhs\00\00301\001\001\000\006\00256\001\00301", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !184} ; [ DW_TAG_subprogram ] [line 301] [local] [def] [exact_rhs]
-!184 = metadata !{metadata !185, metadata !186, metadata !187, metadata !188, metadata !189, metadata !190, metadata !191, metadata !192, metadata !193, metadata !194, metadata !195, metadata !196, metadata !197, metadata !198, metadata !199}
-!185 = metadata !{metadata !"0x100\00dtemp\00310\000", metadata !183, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [dtemp] [line 310]
-!186 = metadata !{metadata !"0x100\00xi\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [xi] [line 310]
-!187 = metadata !{metadata !"0x100\00eta\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [eta] [line 310]
-!188 = metadata !{metadata !"0x100\00zeta\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [zeta] [line 310]
-!189 = metadata !{metadata !"0x100\00dtpp\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [dtpp] [line 310]
-!190 = metadata !{metadata !"0x100\00m\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 311]
-!191 = metadata !{metadata !"0x100\00i\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 311]
-!192 = metadata !{metadata !"0x100\00j\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 311]
-!193 = metadata !{metadata !"0x100\00k\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 311]
-!194 = metadata !{metadata !"0x100\00ip1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [ip1] [line 311]
-!195 = metadata !{metadata !"0x100\00im1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [im1] [line 311]
-!196 = metadata !{metadata !"0x100\00jp1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [jp1] [line 311]
-!197 = metadata !{metadata !"0x100\00jm1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [jm1] [line 311]
-!198 = metadata !{metadata !"0x100\00km1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [km1] [line 311]
-!199 = metadata !{metadata !"0x100\00kp1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [kp1] [line 311]
-!200 = metadata !{metadata !"0x2e\00adi\00adi\00\00210\001\001\000\006\00256\001\00210", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 210] [local] [def] [adi]
-!201 = metadata !{metadata !"0x2e\00add\00add\00\00187\001\001\000\006\00256\001\00187", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !202} ; [ DW_TAG_subprogram ] [line 187] [local] [def] [add]
-!202 = metadata !{metadata !203, metadata !204, metadata !205, metadata !206}
-!203 = metadata !{metadata !"0x100\00i\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 193]
-!204 = metadata !{metadata !"0x100\00j\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 193]
-!205 = metadata !{metadata !"0x100\00k\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 193]
-!206 = metadata !{metadata !"0x100\00m\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 193]
-!207 = metadata !{metadata !"0x2e\00z_solve\00z_solve\00\003457\001\001\000\006\00256\001\003457", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3457] [local] [def] [z_solve]
-!208 = metadata !{metadata !"0x2e\00z_backsubstitute\00z_backsubstitute\00\003480\001\001\000\006\00256\001\003480", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !209} ; [ DW_TAG_subprogram ] [line 3480] [local] [def] [z_backsubstitute]
-!209 = metadata !{metadata !210, metadata !211, metadata !212, metadata !213, metadata !214}
-!210 = metadata !{metadata !"0x100\00i\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3492]
-!211 = metadata !{metadata !"0x100\00j\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3492]
-!212 = metadata !{metadata !"0x100\00k\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3492]
-!213 = metadata !{metadata !"0x100\00m\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 3492]
-!214 = metadata !{metadata !"0x100\00n\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 3492]
-!215 = metadata !{metadata !"0x2e\00z_solve_cell\00z_solve_cell\00\003512\001\001\000\006\00256\001\003512", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !216} ; [ DW_TAG_subprogram ] [line 3512] [local] [def] [z_solve_cell]
-!216 = metadata !{metadata !217, metadata !218, metadata !219, metadata !220}
-!217 = metadata !{metadata !"0x100\00i\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3527]
-!218 = metadata !{metadata !"0x100\00j\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3527]
-!219 = metadata !{metadata !"0x100\00k\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3527]
-!220 = metadata !{metadata !"0x100\00ksize\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [ksize] [line 3527]
-!221 = metadata !{metadata !"0x2e\00binvrhs\00binvrhs\00\003154\001\001\000\006\00256\001\003154", metadata !1, metadata !5, metadata !222, null, null, null, null, metadata !225} ; [ DW_TAG_subprogram ] [line 3154] [local] [def] [binvrhs]
-!222 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !223, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!223 = metadata !{null, metadata !224, metadata !105}
-!224 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !91} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!225 = metadata !{metadata !226, metadata !227, metadata !228, metadata !229}
-!226 = metadata !{metadata !"0x101\00lhs\0016780370\000", metadata !221, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [lhs] [line 3154]
-!227 = metadata !{metadata !"0x101\00r\0033557586\000", metadata !221, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [r] [line 3154]
-!228 = metadata !{metadata !"0x100\00pivot\003159\000", metadata !221, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [pivot] [line 3159]
-!229 = metadata !{metadata !"0x100\00coeff\003159\000", metadata !221, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [coeff] [line 3159]
-!230 = metadata !{metadata !"0x2e\00matmul_sub\00matmul_sub\00\002841\001\001\000\006\00256\001\002842", metadata !1, metadata !5, metadata !231, null, null, null, null, metadata !233} ; [ DW_TAG_subprogram ] [line 2841] [local] [def] [scope 2842] [matmul_sub]
-!231 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !232, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!232 = metadata !{null, metadata !224, metadata !224, metadata !224}
-!233 = metadata !{metadata !234, metadata !235, metadata !236, metadata !237}
-!234 = metadata !{metadata !"0x101\00ablock\0016780057\000", metadata !230, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [ablock] [line 2841]
-!235 = metadata !{metadata !"0x101\00bblock\0033557273\000", metadata !230, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [bblock] [line 2841]
-!236 = metadata !{metadata !"0x101\00cblock\0050334490\000", metadata !230, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [cblock] [line 2842]
-!237 = metadata !{metadata !"0x100\00j\002851\000", metadata !230, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 2851]
-!238 = metadata !{metadata !"0x2e\00matvec_sub\00matvec_sub\00\002814\001\001\000\006\00256\001\002814", metadata !1, metadata !5, metadata !239, null, null, null, null, metadata !241} ; [ DW_TAG_subprogram ] [line 2814] [local] [def] [matvec_sub]
-!239 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !240, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!240 = metadata !{null, metadata !224, metadata !105, metadata !105}
-!241 = metadata !{metadata !242, metadata !243, metadata !244, metadata !245}
-!242 = metadata !{metadata !"0x101\00ablock\0016780030\000", metadata !238, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [ablock] [line 2814]
-!243 = metadata !{metadata !"0x101\00avec\0033557246\000", metadata !238, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [avec] [line 2814]
-!244 = metadata !{metadata !"0x101\00bvec\0050334462\000", metadata !238, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [bvec] [line 2814]
-!245 = metadata !{metadata !"0x100\00i\002823\000", metadata !238, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 2823]
-!246 = metadata !{metadata !"0x2e\00binvcrhs\00binvcrhs\00\002885\001\001\000\006\00256\001\002885", metadata !1, metadata !5, metadata !247, null, null, null, null, metadata !249} ; [ DW_TAG_subprogram ] [line 2885] [local] [def] [binvcrhs]
-!247 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !248, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!248 = metadata !{null, metadata !224, metadata !224, metadata !105}
-!249 = metadata !{metadata !250, metadata !251, metadata !252, metadata !253, metadata !254}
-!250 = metadata !{metadata !"0x101\00lhs\0016780101\000", metadata !246, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [lhs] [line 2885]
-!251 = metadata !{metadata !"0x101\00c\0033557317\000", metadata !246, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [c] [line 2885]
-!252 = metadata !{metadata !"0x101\00r\0050334533\000", metadata !246, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [r] [line 2885]
-!253 = metadata !{metadata !"0x100\00pivot\002890\000", metadata !246, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [pivot] [line 2890]
-!254 = metadata !{metadata !"0x100\00coeff\002890\000", metadata !246, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [coeff] [line 2890]
-!255 = metadata !{metadata !"0x2e\00lhsz\00lhsz\00\001475\001\001\000\006\00256\001\001475", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !256} ; [ DW_TAG_subprogram ] [line 1475] [local] [def] [lhsz]
-!256 = metadata !{metadata !257, metadata !258, metadata !259}
-!257 = metadata !{metadata !"0x100\00i\001484\000", metadata !255, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 1484]
-!258 = metadata !{metadata !"0x100\00j\001484\000", metadata !255, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 1484]
-!259 = metadata !{metadata !"0x100\00k\001484\000", metadata !255, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 1484]
-!260 = metadata !{metadata !"0x2e\00y_solve\00y_solve\00\003299\001\001\000\006\00256\001\003299", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3299] [local] [def] [y_solve]
-!261 = metadata !{metadata !"0x2e\00y_backsubstitute\00y_backsubstitute\00\003323\001\001\000\006\00256\001\003323", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !262} ; [ DW_TAG_subprogram ] [line 3323] [local] [def] [y_backsubstitute]
-!262 = metadata !{metadata !263, metadata !264, metadata !265, metadata !266, metadata !267}
-!263 = metadata !{metadata !"0x100\00i\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3335]
-!264 = metadata !{metadata !"0x100\00j\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3335]
-!265 = metadata !{metadata !"0x100\00k\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3335]
-!266 = metadata !{metadata !"0x100\00m\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 3335]
-!267 = metadata !{metadata !"0x100\00n\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 3335]
-!268 = metadata !{metadata !"0x2e\00y_solve_cell\00y_solve_cell\00\003355\001\001\000\006\00256\001\003355", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !269} ; [ DW_TAG_subprogram ] [line 3355] [local] [def] [y_solve_cell]
-!269 = metadata !{metadata !270, metadata !271, metadata !272, metadata !273}
-!270 = metadata !{metadata !"0x100\00i\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3370]
-!271 = metadata !{metadata !"0x100\00j\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3370]
-!272 = metadata !{metadata !"0x100\00k\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3370]
-!273 = metadata !{metadata !"0x100\00jsize\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [jsize] [line 3370]
-!274 = metadata !{metadata !"0x2e\00lhsy\00lhsy\00\001181\001\001\000\006\00256\001\001181", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !275} ; [ DW_TAG_subprogram ] [line 1181] [local] [def] [lhsy]
-!275 = metadata !{metadata !276, metadata !277, metadata !278}
-!276 = metadata !{metadata !"0x100\00i\001190\000", metadata !274, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 1190]
-!277 = metadata !{metadata !"0x100\00j\001190\000", metadata !274, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 1190]
-!278 = metadata !{metadata !"0x100\00k\001190\000", metadata !274, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 1190]
-!279 = metadata !{metadata !"0x2e\00x_solve\00x_solve\00\002658\001\001\000\006\00256\001\002658", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2658] [local] [def] [x_solve]
-!280 = metadata !{metadata !"0x2e\00x_backsubstitute\00x_backsubstitute\00\002684\001\001\000\006\00256\001\002684", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !281} ; [ DW_TAG_subprogram ] [line 2684] [local] [def] [x_backsubstitute]
-!281 = metadata !{metadata !282, metadata !283, metadata !284, metadata !285, metadata !286}
-!282 = metadata !{metadata !"0x100\00i\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 2696]
-!283 = metadata !{metadata !"0x100\00j\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 2696]
-!284 = metadata !{metadata !"0x100\00k\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 2696]
-!285 = metadata !{metadata !"0x100\00m\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 2696]
-!286 = metadata !{metadata !"0x100\00n\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 2696]
-!287 = metadata !{metadata !"0x2e\00x_solve_cell\00x_solve_cell\00\002716\001\001\000\006\00256\001\002716", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !288} ; [ DW_TAG_subprogram ] [line 2716] [local] [def] [x_solve_cell]
-!288 = metadata !{metadata !289, metadata !290, metadata !291, metadata !292}
-!289 = metadata !{metadata !"0x100\00i\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 2728]
-!290 = metadata !{metadata !"0x100\00j\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 2728]
-!291 = metadata !{metadata !"0x100\00k\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 2728]
-!292 = metadata !{metadata !"0x100\00isize\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [isize] [line 2728]
-!293 = metadata !{metadata !"0x2e\00lhsx\00lhsx\00\00898\001\001\000\006\00256\001\00898", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !294} ; [ DW_TAG_subprogram ] [line 898] [local] [def] [lhsx]
-!294 = metadata !{metadata !295, metadata !296, metadata !297}
-!295 = metadata !{metadata !"0x100\00i\00907\000", metadata !293, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 907]
-!296 = metadata !{metadata !"0x100\00j\00907\000", metadata !293, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 907]
-!297 = metadata !{metadata !"0x100\00k\00907\000", metadata !293, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 907]
-!298 = metadata !{metadata !299, metadata !304, metadata !305, metadata !309, metadata !310, metadata !311, metadata !312, metadata !313, metadata !314, metadata !315, metadata !316, metadata !317, metadata !318, metadata !319, metadata !320, metadata !321, metadata !322, metadata !323, metadata !324, metadata !325, metadata !326, metadata !327, metadata !328, metadata !329, metadata !330, metadata !331, metadata !332, metadata !333, metadata !334, metadata !335, metadata !336, metadata !337, metadata !338, metadata !339, metadata !340, metadata !341, metadata !342, metadata !343, metadata !347, metadata !350, metadata !351, metadata !352, metadata !353, metadata !354, metadata !355, metadata !356, metadata !360, metadata !361, metadata !362, metadata !363, metadata !364, metadata !365, metadata !366, metadata !367, metadata !368, metadata !369, metadata !370, metadata !371, metadata !372, metadata !373, metadata !374, metadata !375, metadata !376, metadata !377, metadata !378, metadata !379, metadata !380, metadata !381, metadata !382, metadata !383, metadata !384, metadata !385, metadata !386, metadata !387, metadata !388, metadata !389, metadata !390, metadata !391, metadata !392, metadata !393, metadata !394, metadata !395, metadata !396, metadata !397, metadata !398, metadata !399, metadata !400, metadata !401, metadata !402, metadata !403, metadata !404, metadata !405, metadata !406, metadata !407, metadata !408, metadata !409, metadata !410, metadata !411, metadata !412, metadata !413, metadata !414, metadata !415, metadata !416, metadata !417, metadata !418, metadata !419, metadata !422, metadata !426, metadata !427, metadata !430, metadata !431, metadata !434, metadata !435, metadata !436, metadata !437}
-!299 = metadata !{metadata !"0x34\00grid_points\00grid_points\00\0028\001\001", null, metadata !300, metadata !302, [3 x i32]* @grid_points, null} ; [ DW_TAG_variable ] [grid_points] [line 28] [local] [def]
-!300 = metadata !{metadata !"0x29", metadata !301}      ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/./header.h]
-!301 = metadata !{metadata !"./header.h", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
-!302 = metadata !{metadata !"0x1\00\000\0096\0032\000\000", null, null, metadata !8, metadata !303, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 96, align 32, offset 0] [from int]
-!303 = metadata !{metadata !178}
-!304 = metadata !{metadata !"0x34\00dt\00dt\00\0035\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dt] [line 35] [local] [def]
-!305 = metadata !{metadata !"0x34\00rhs\00rhs\00\0068\001\001", null, metadata !300, metadata !306, null, null} ; [ DW_TAG_variable ] [rhs] [line 68] [local] [def]
-!306 = metadata !{metadata !"0x1\00\000\001385839040\0064\000\000", null, null, metadata !20, metadata !307, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1385839040, align 64, offset 0] [from double]
-!307 = metadata !{metadata !308, metadata !308, metadata !308, metadata !93}
-!308 = metadata !{metadata !"0x21\000\00163"}     ; [ DW_TAG_subrange_type ] [0, 162]
-!309 = metadata !{metadata !"0x34\00zzcon5\00zzcon5\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon5] [line 42] [local] [def]
-!310 = metadata !{metadata !"0x34\00zzcon4\00zzcon4\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon4] [line 42] [local] [def]
-!311 = metadata !{metadata !"0x34\00zzcon3\00zzcon3\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon3] [line 42] [local] [def]
-!312 = metadata !{metadata !"0x34\00dz5tz1\00dz5tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz5tz1] [line 43] [local] [def]
-!313 = metadata !{metadata !"0x34\00dz4tz1\00dz4tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz4tz1] [line 43] [local] [def]
-!314 = metadata !{metadata !"0x34\00dz3tz1\00dz3tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz3tz1] [line 43] [local] [def]
-!315 = metadata !{metadata !"0x34\00zzcon2\00zzcon2\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon2] [line 42] [local] [def]
-!316 = metadata !{metadata !"0x34\00dz2tz1\00dz2tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz2tz1] [line 43] [local] [def]
-!317 = metadata !{metadata !"0x34\00tz2\00tz2\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tz2] [line 31] [local] [def]
-!318 = metadata !{metadata !"0x34\00dz1tz1\00dz1tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz1tz1] [line 43] [local] [def]
-!319 = metadata !{metadata !"0x34\00yycon5\00yycon5\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon5] [line 40] [local] [def]
-!320 = metadata !{metadata !"0x34\00yycon4\00yycon4\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon4] [line 40] [local] [def]
-!321 = metadata !{metadata !"0x34\00yycon3\00yycon3\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon3] [line 40] [local] [def]
-!322 = metadata !{metadata !"0x34\00dy5ty1\00dy5ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy5ty1] [line 41] [local] [def]
-!323 = metadata !{metadata !"0x34\00dy4ty1\00dy4ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy4ty1] [line 41] [local] [def]
-!324 = metadata !{metadata !"0x34\00dy3ty1\00dy3ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy3ty1] [line 41] [local] [def]
-!325 = metadata !{metadata !"0x34\00yycon2\00yycon2\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon2] [line 40] [local] [def]
-!326 = metadata !{metadata !"0x34\00dy2ty1\00dy2ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy2ty1] [line 41] [local] [def]
-!327 = metadata !{metadata !"0x34\00ty2\00ty2\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [ty2] [line 31] [local] [def]
-!328 = metadata !{metadata !"0x34\00dy1ty1\00dy1ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy1ty1] [line 41] [local] [def]
-!329 = metadata !{metadata !"0x34\00dssp\00dssp\00\0035\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dssp] [line 35] [local] [def]
-!330 = metadata !{metadata !"0x34\00c1\00c1\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1] [line 45] [local] [def]
-!331 = metadata !{metadata !"0x34\00xxcon5\00xxcon5\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon5] [line 38] [local] [def]
-!332 = metadata !{metadata !"0x34\00xxcon4\00xxcon4\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon4] [line 38] [local] [def]
-!333 = metadata !{metadata !"0x34\00xxcon3\00xxcon3\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon3] [line 38] [local] [def]
-!334 = metadata !{metadata !"0x34\00dx5tx1\00dx5tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx5tx1] [line 39] [local] [def]
-!335 = metadata !{metadata !"0x34\00dx4tx1\00dx4tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx4tx1] [line 39] [local] [def]
-!336 = metadata !{metadata !"0x34\00dx3tx1\00dx3tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx3tx1] [line 39] [local] [def]
-!337 = metadata !{metadata !"0x34\00c2\00c2\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2] [line 45] [local] [def]
-!338 = metadata !{metadata !"0x34\00con43\00con43\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [con43] [line 48] [local] [def]
-!339 = metadata !{metadata !"0x34\00xxcon2\00xxcon2\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon2] [line 38] [local] [def]
-!340 = metadata !{metadata !"0x34\00dx2tx1\00dx2tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx2tx1] [line 39] [local] [def]
-!341 = metadata !{metadata !"0x34\00tx2\00tx2\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tx2] [line 31] [local] [def]
-!342 = metadata !{metadata !"0x34\00dx1tx1\00dx1tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx1tx1] [line 39] [local] [def]
-!343 = metadata !{metadata !"0x34\00forcing\00forcing\00\0066\001\001", null, metadata !300, metadata !344, null, null} ; [ DW_TAG_variable ] [forcing] [line 66] [local] [def]
-!344 = metadata !{metadata !"0x1\00\000\001663006848\0064\000\000", null, null, metadata !20, metadata !345, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1663006848, align 64, offset 0] [from double]
-!345 = metadata !{metadata !308, metadata !308, metadata !308, metadata !346}
-!346 = metadata !{metadata !"0x21\000\006"}       ; [ DW_TAG_subrange_type ] [0, 5]
-!347 = metadata !{metadata !"0x34\00qs\00qs\00\0063\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [qs] [line 63] [local] [def]
-!348 = metadata !{metadata !"0x1\00\000\00277167808\0064\000\000", null, null, metadata !20, metadata !349, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 277167808, align 64, offset 0] [from double]
-!349 = metadata !{metadata !308, metadata !308, metadata !308}
-!350 = metadata !{metadata !"0x34\00square\00square\00\0065\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [square] [line 65] [local] [def]
-!351 = metadata !{metadata !"0x34\00ws\00ws\00\0062\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [ws] [line 62] [local] [def]
-!352 = metadata !{metadata !"0x34\00vs\00vs\00\0061\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [vs] [line 61] [local] [def]
-!353 = metadata !{metadata !"0x34\00us\00us\00\0060\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [us] [line 60] [local] [def]
-!354 = metadata !{metadata !"0x34\00rho_i\00rho_i\00\0064\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [rho_i] [line 64] [local] [def]
-!355 = metadata !{metadata !"0x34\00u\00u\00\0067\001\001", null, metadata !300, metadata !306, null, null} ; [ DW_TAG_variable ] [u] [line 67] [local] [def]
-!356 = metadata !{metadata !"0x34\00ce\00ce\00\0036\001\001", null, metadata !300, metadata !357, null, null} ; [ DW_TAG_variable ] [ce] [line 36] [local] [def]
-!357 = metadata !{metadata !"0x1\00\000\004160\0064\000\000", null, null, metadata !20, metadata !358, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 4160, align 64, offset 0] [from double]
-!358 = metadata !{metadata !93, metadata !359}
-!359 = metadata !{metadata !"0x21\000\0013"}      ; [ DW_TAG_subrange_type ] [0, 12]
-!360 = metadata !{metadata !"0x34\00dnzm1\00dnzm1\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dnzm1] [line 44] [local] [def]
-!361 = metadata !{metadata !"0x34\00dnym1\00dnym1\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dnym1] [line 44] [local] [def]
-!362 = metadata !{metadata !"0x34\00dnxm1\00dnxm1\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dnxm1] [line 44] [local] [def]
-!363 = metadata !{metadata !"0x34\00zzcon1\00zzcon1\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon1] [line 42] [local] [def]
-!364 = metadata !{metadata !"0x34\00yycon1\00yycon1\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon1] [line 40] [local] [def]
-!365 = metadata !{metadata !"0x34\00xxcon1\00xxcon1\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon1] [line 38] [local] [def]
-!366 = metadata !{metadata !"0x34\00con16\00con16\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [con16] [line 48] [local] [def]
-!367 = metadata !{metadata !"0x34\00c2iv\00c2iv\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2iv] [line 48] [local] [def]
-!368 = metadata !{metadata !"0x34\00c3c4tz3\00c3c4tz3\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4tz3] [line 48] [local] [def]
-!369 = metadata !{metadata !"0x34\00c3c4ty3\00c3c4ty3\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4ty3] [line 48] [local] [def]
-!370 = metadata !{metadata !"0x34\00c3c4tx3\00c3c4tx3\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4tx3] [line 48] [local] [def]
-!371 = metadata !{metadata !"0x34\00comz6\00comz6\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz6] [line 47] [local] [def]
-!372 = metadata !{metadata !"0x34\00comz5\00comz5\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz5] [line 47] [local] [def]
-!373 = metadata !{metadata !"0x34\00comz4\00comz4\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz4] [line 47] [local] [def]
-!374 = metadata !{metadata !"0x34\00comz1\00comz1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz1] [line 47] [local] [def]
-!375 = metadata !{metadata !"0x34\00dtdssp\00dtdssp\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dtdssp] [line 45] [local] [def]
-!376 = metadata !{metadata !"0x34\00c2dttz1\00c2dttz1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2dttz1] [line 47] [local] [def]
-!377 = metadata !{metadata !"0x34\00c2dtty1\00c2dtty1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2dtty1] [line 47] [local] [def]
-!378 = metadata !{metadata !"0x34\00c2dttx1\00c2dttx1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2dttx1] [line 47] [local] [def]
-!379 = metadata !{metadata !"0x34\00dttz2\00dttz2\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttz2] [line 46] [local] [def]
-!380 = metadata !{metadata !"0x34\00dttz1\00dttz1\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttz1] [line 46] [local] [def]
-!381 = metadata !{metadata !"0x34\00dtty2\00dtty2\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dtty2] [line 46] [local] [def]
-!382 = metadata !{metadata !"0x34\00dtty1\00dtty1\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dtty1] [line 46] [local] [def]
-!383 = metadata !{metadata !"0x34\00dttx2\00dttx2\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttx2] [line 46] [local] [def]
-!384 = metadata !{metadata !"0x34\00dttx1\00dttx1\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttx1] [line 46] [local] [def]
-!385 = metadata !{metadata !"0x34\00c5dssp\00c5dssp\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c5dssp] [line 45] [local] [def]
-!386 = metadata !{metadata !"0x34\00c4dssp\00c4dssp\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c4dssp] [line 45] [local] [def]
-!387 = metadata !{metadata !"0x34\00dzmax\00dzmax\00\0037\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dzmax] [line 37] [local] [def]
-!388 = metadata !{metadata !"0x34\00dymax\00dymax\00\0037\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dymax] [line 37] [local] [def]
-!389 = metadata !{metadata !"0x34\00dxmax\00dxmax\00\0037\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dxmax] [line 37] [local] [def]
-!390 = metadata !{metadata !"0x34\00dz5\00dz5\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz5] [line 34] [local] [def]
-!391 = metadata !{metadata !"0x34\00dz4\00dz4\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz4] [line 34] [local] [def]
-!392 = metadata !{metadata !"0x34\00dz3\00dz3\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz3] [line 34] [local] [def]
-!393 = metadata !{metadata !"0x34\00dz2\00dz2\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz2] [line 34] [local] [def]
-!394 = metadata !{metadata !"0x34\00dz1\00dz1\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz1] [line 34] [local] [def]
-!395 = metadata !{metadata !"0x34\00dy5\00dy5\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy5] [line 33] [local] [def]
-!396 = metadata !{metadata !"0x34\00dy4\00dy4\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy4] [line 33] [local] [def]
-!397 = metadata !{metadata !"0x34\00dy3\00dy3\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy3] [line 33] [local] [def]
-!398 = metadata !{metadata !"0x34\00dy2\00dy2\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy2] [line 33] [local] [def]
-!399 = metadata !{metadata !"0x34\00dy1\00dy1\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy1] [line 33] [local] [def]
-!400 = metadata !{metadata !"0x34\00dx5\00dx5\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx5] [line 32] [local] [def]
-!401 = metadata !{metadata !"0x34\00dx4\00dx4\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx4] [line 32] [local] [def]
-!402 = metadata !{metadata !"0x34\00dx3\00dx3\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx3] [line 32] [local] [def]
-!403 = metadata !{metadata !"0x34\00dx2\00dx2\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx2] [line 32] [local] [def]
-!404 = metadata !{metadata !"0x34\00dx1\00dx1\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx1] [line 32] [local] [def]
-!405 = metadata !{metadata !"0x34\00tz3\00tz3\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tz3] [line 31] [local] [def]
-!406 = metadata !{metadata !"0x34\00tz1\00tz1\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tz1] [line 31] [local] [def]
-!407 = metadata !{metadata !"0x34\00ty3\00ty3\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [ty3] [line 31] [local] [def]
-!408 = metadata !{metadata !"0x34\00ty1\00ty1\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [ty1] [line 31] [local] [def]
-!409 = metadata !{metadata !"0x34\00tx3\00tx3\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tx3] [line 31] [local] [def]
-!410 = metadata !{metadata !"0x34\00tx1\00tx1\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tx1] [line 31] [local] [def]
-!411 = metadata !{metadata !"0x34\00conz1\00conz1\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [conz1] [line 45] [local] [def]
-!412 = metadata !{metadata !"0x34\00c1345\00c1345\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1345] [line 44] [local] [def]
-!413 = metadata !{metadata !"0x34\00c3c4\00c3c4\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4] [line 44] [local] [def]
-!414 = metadata !{metadata !"0x34\00c1c5\00c1c5\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1c5] [line 44] [local] [def]
-!415 = metadata !{metadata !"0x34\00c1c2\00c1c2\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1c2] [line 44] [local] [def]
-!416 = metadata !{metadata !"0x34\00c5\00c5\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c5] [line 45] [local] [def]
-!417 = metadata !{metadata !"0x34\00c4\00c4\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c4] [line 45] [local] [def]
-!418 = metadata !{metadata !"0x34\00c3\00c3\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3] [line 45] [local] [def]
-!419 = metadata !{metadata !"0x34\00lhs\00lhs\00\0069\001\001", null, metadata !300, metadata !420, null, null} ; [ DW_TAG_variable ] [lhs] [line 69] [local] [def]
-!420 = metadata !{metadata !"0x1\00\000\0020787585600\0064\000\000", null, null, metadata !20, metadata !421, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 20787585600, align 64, offset 0] [from double]
-!421 = metadata !{metadata !308, metadata !308, metadata !308, metadata !178, metadata !93, metadata !93}
-!422 = metadata !{metadata !"0x34\00q\00q\00\0073\001\001", null, metadata !300, metadata !423, null, null} ; [ DW_TAG_variable ] [q] [line 73] [local] [def]
-!423 = metadata !{metadata !"0x1\00\000\0010368\0064\000\000", null, null, metadata !20, metadata !424, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 10368, align 64, offset 0] [from double]
-!424 = metadata !{metadata !425}
-!425 = metadata !{metadata !"0x21\000\00162"}     ; [ DW_TAG_subrange_type ] [0, 161]
-!426 = metadata !{metadata !"0x34\00cuf\00cuf\00\0072\001\001", null, metadata !300, metadata !423, null, null} ; [ DW_TAG_variable ] [cuf] [line 72] [local] [def]
-!427 = metadata !{metadata !"0x34\00buf\00buf\00\0075\001\001", null, metadata !300, metadata !428, null, null} ; [ DW_TAG_variable ] [buf] [line 75] [local] [def]
-!428 = metadata !{metadata !"0x1\00\000\0051840\0064\000\000", null, null, metadata !20, metadata !429, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 51840, align 64, offset 0] [from double]
-!429 = metadata !{metadata !425, metadata !93}
-!430 = metadata !{metadata !"0x34\00ue\00ue\00\0074\001\001", null, metadata !300, metadata !428, null, null} ; [ DW_TAG_variable ] [ue] [line 74] [local] [def]
-!431 = metadata !{metadata !"0x34\00njac\00njac\00\0086\001\001", null, metadata !300, metadata !432, null, null} ; [ DW_TAG_variable ] [njac] [line 86] [local] [def]
-!432 = metadata !{metadata !"0x1\00\000\006886684800\0064\000\000", null, null, metadata !20, metadata !433, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 6886684800, align 64, offset 0] [from double]
-!433 = metadata !{metadata !308, metadata !308, metadata !425, metadata !93, metadata !93}
-!434 = metadata !{metadata !"0x34\00fjac\00fjac\00\0084\001\001", null, metadata !300, metadata !432, null, null} ; [ DW_TAG_variable ] [fjac] [line 84] [local] [def]
-!435 = metadata !{metadata !"0x34\00tmp3\00tmp3\00\0088\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tmp3] [line 88] [local] [def]
-!436 = metadata !{metadata !"0x34\00tmp2\00tmp2\00\0088\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tmp2] [line 88] [local] [def]
-!437 = metadata !{metadata !"0x34\00tmp1\00tmp1\00\0088\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tmp1] [line 88] [local] [def]
-!438 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!439 = metadata !{i32 1898, i32 0, metadata !440, null}
-!440 = metadata !{metadata !"0xb\001898\000\00107", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!441 = metadata !{i32 1913, i32 0, metadata !442, null}
-!442 = metadata !{metadata !"0xb\001913\000\00115", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!443 = metadata !{i32 1923, i32 0, metadata !114, null}
-!444 = metadata !{metadata !"int", metadata !445}
-!445 = metadata !{metadata !"omnipotent char", metadata !446}
-!446 = metadata !{metadata !"Simple C/C++ TBAA"}
-!447 = metadata !{i32 1}
-!448 = metadata !{i32 1925, i32 0, metadata !449, null}
-!449 = metadata !{metadata !"0xb\001925\000\00121", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!450 = metadata !{i32 1939, i32 0, metadata !451, null}
-!451 = metadata !{metadata !"0xb\001939\000\00127", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!452 = metadata !{i32 1940, i32 0, metadata !453, null}
-!453 = metadata !{metadata !"0xb\001940\000\00129", metadata !1, metadata !454} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!454 = metadata !{metadata !"0xb\001939\000\00128", metadata !1, metadata !451} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!455 = metadata !{i32 1941, i32 0, metadata !456, null}
-!456 = metadata !{metadata !"0xb\001941\000\00131", metadata !1, metadata !457} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!457 = metadata !{metadata !"0xb\001940\000\00130", metadata !1, metadata !453} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!458 = metadata !{i32 2020, i32 0, metadata !459, null}
-!459 = metadata !{metadata !"0xb\002020\000\00149", metadata !1, metadata !460} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!460 = metadata !{metadata !"0xb\002019\000\00148", metadata !1, metadata !461} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!461 = metadata !{metadata !"0xb\002019\000\00147", metadata !1, metadata !462} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!462 = metadata !{metadata !"0xb\002018\000\00146", metadata !1, metadata !463} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!463 = metadata !{metadata !"0xb\002018\000\00145", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!464 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 (trunk 190311)\001\00\000\00\000", !1, !2, !2, !3, !298, !2} ; [ DW_TAG_compile_unit ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c] [DW_LANG_C99]
+!1 = !{!"bt.c", !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
+!2 = !{}
+!3 = !{!4, !82, !102, !114, !132, !145, !154, !155, !162, !183, !200, !201, !207, !208, !215, !221, !230, !238, !246, !255, !260, !261, !268, !274, !279, !280, !287, !293}
+!4 = !{!"0x2e\00main\00main\00\0074\000\001\000\006\00256\001\0074", !1, !5, !6, null, null, null, null, !12} ; [ DW_TAG_subprogram ] [line 74] [def] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8, !9}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0xf\00\000\0064\0064\000\000", null, null, !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!11 = !{!"0x24\00char\000\008\008\000\000\008", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_unsigned_char]
+!12 = !{!13, !14, !15, !16, !17, !18, !19, !21, !22, !23, !25, !26}
+!13 = !{!"0x101\00argc\0016777290\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [argc] [line 74]
+!14 = !{!"0x101\00argv\0033554506\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [argv] [line 74]
+!15 = !{!"0x100\00niter\0076\000", !4, !5, !8} ; [ DW_TAG_auto_variable ] [niter] [line 76]
+!16 = !{!"0x100\00step\0076\000", !4, !5, !8} ; [ DW_TAG_auto_variable ] [step] [line 76]
+!17 = !{!"0x100\00n3\0076\000", !4, !5, !8} ; [ DW_TAG_auto_variable ] [n3] [line 76]
+!18 = !{!"0x100\00nthreads\0077\000", !4, !5, !8} ; [ DW_TAG_auto_variable ] [nthreads] [line 77]
+!19 = !{!"0x100\00navg\0078\000", !4, !5, !20} ; [ DW_TAG_auto_variable ] [navg] [line 78]
+!20 = !{!"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!21 = !{!"0x100\00mflops\0078\000", !4, !5, !20} ; [ DW_TAG_auto_variable ] [mflops] [line 78]
+!22 = !{!"0x100\00tmax\0080\000", !4, !5, !20} ; [ DW_TAG_auto_variable ] [tmax] [line 80]
+!23 = !{!"0x100\00verified\0081\000", !4, !5, !24} ; [ DW_TAG_auto_variable ] [verified] [line 81]
+!24 = !{!"0x16\00boolean\0012\000\000\000\000", !1, null, !8} ; [ DW_TAG_typedef ] [boolean] [line 12, size 0, align 0, offset 0] [from int]
+!25 = !{!"0x100\00class\0082\000", !4, !5, !11} ; [ DW_TAG_auto_variable ] [class] [line 82]
+!26 = !{!"0x100\00fp\0083\000", !4, !5, !27} ; [ DW_TAG_auto_variable ] [fp] [line 83]
+!27 = !{!"0xf\00\000\0064\0064\000\000", null, null, !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from FILE]
+!28 = !{!"0x16\00FILE\0049\000\000\000\000", !1, null, !29} ; [ DW_TAG_typedef ] [FILE] [line 49, size 0, align 0, offset 0] [from _IO_FILE]
+!29 = !{!"0x13\00_IO_FILE\00271\001728\0064\000\000\000", !30, null, null, !31, null, null, null} ; [ DW_TAG_structure_type ] [_IO_FILE] [line 271, size 1728, align 64, offset 0] [def] [from ]
+!30 = !{!"/usr/include/libio.h", !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
+!31 = !{!32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !52, !53, !54, !55, !58, !60, !62, !66, !68, !70, !71, !72, !73, !74, !77, !78}
+!32 = !{!"0xd\00_flags\00272\0032\0032\000\000", !30, !29, !8} ; [ DW_TAG_member ] [_flags] [line 272, size 32, align 32, offset 0] [from int]
+!33 = !{!"0xd\00_IO_read_ptr\00277\0064\0064\0064\000", !30, !29, !10} ; [ DW_TAG_member ] [_IO_read_ptr] [line 277, size 64, align 64, offset 64] [from ]
+!34 = !{!"0xd\00_IO_read_end\00278\0064\0064\00128\000", !30, !29, !10} ; [ DW_TAG_member ] [_IO_read_end] [line 278, size 64, align 64, offset 128] [from ]
+!35 = !{!"0xd\00_IO_read_base\00279\0064\0064\00192\000", !30, !29, !10} ; [ DW_TAG_member ] [_IO_read_base] [line 279, size 64, align 64, offset 192] [from ]
+!36 = !{!"0xd\00_IO_write_base\00280\0064\0064\00256\000", !30, !29, !10} ; [ DW_TAG_member ] [_IO_write_base] [line 280, size 64, align 64, offset 256] [from ]
+!37 = !{!"0xd\00_IO_write_ptr\00281\0064\0064\00320\000", !30, !29, !10} ; [ DW_TAG_member ] [_IO_write_ptr] [line 281, size 64, align 64, offset 320] [from ]
+!38 = !{!"0xd\00_IO_write_end\00282\0064\0064\00384\000", !30, !29, !10} ; [ DW_TAG_member ] [_IO_write_end] [line 282, size 64, align 64, offset 384] [from ]
+!39 = !{!"0xd\00_IO_buf_base\00283\0064\0064\00448\000", !30, !29, !10} ; [ DW_TAG_member ] [_IO_buf_base] [line 283, size 64, align 64, offset 448] [from ]
+!40 = !{!"0xd\00_IO_buf_end\00284\0064\0064\00512\000", !30, !29, !10} ; [ DW_TAG_member ] [_IO_buf_end] [line 284, size 64, align 64, offset 512] [from ]
+!41 = !{!"0xd\00_IO_save_base\00286\0064\0064\00576\000", !30, !29, !10} ; [ DW_TAG_member ] [_IO_save_base] [line 286, size 64, align 64, offset 576] [from ]
+!42 = !{!"0xd\00_IO_backup_base\00287\0064\0064\00640\000", !30, !29, !10} ; [ DW_TAG_member ] [_IO_backup_base] [line 287, size 64, align 64, offset 640] [from ]
+!43 = !{!"0xd\00_IO_save_end\00288\0064\0064\00704\000", !30, !29, !10} ; [ DW_TAG_member ] [_IO_save_end] [line 288, size 64, align 64, offset 704] [from ]
+!44 = !{!"0xd\00_markers\00290\0064\0064\00768\000", !30, !29, !45} ; [ DW_TAG_member ] [_markers] [line 290, size 64, align 64, offset 768] [from ]
+!45 = !{!"0xf\00\000\0064\0064\000\000", null, null, !46} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_marker]
+!46 = !{!"0x13\00_IO_marker\00186\00192\0064\000\000\000", !30, null, null, !47, null, null, null} ; [ DW_TAG_structure_type ] [_IO_marker] [line 186, size 192, align 64, offset 0] [def] [from ]
+!47 = !{!48, !49, !51}
+!48 = !{!"0xd\00_next\00187\0064\0064\000\000", !30, !46, !45} ; [ DW_TAG_member ] [_next] [line 187, size 64, align 64, offset 0] [from ]
+!49 = !{!"0xd\00_sbuf\00188\0064\0064\0064\000", !30, !46, !50} ; [ DW_TAG_member ] [_sbuf] [line 188, size 64, align 64, offset 64] [from ]
+!50 = !{!"0xf\00\000\0064\0064\000\000", null, null, !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_FILE]
+!51 = !{!"0xd\00_pos\00192\0032\0032\00128\000", !30, !46, !8} ; [ DW_TAG_member ] [_pos] [line 192, size 32, align 32, offset 128] [from int]
+!52 = !{!"0xd\00_chain\00292\0064\0064\00832\000", !30, !29, !50} ; [ DW_TAG_member ] [_chain] [line 292, size 64, align 64, offset 832] [from ]
+!53 = !{!"0xd\00_fileno\00294\0032\0032\00896\000", !30, !29, !8} ; [ DW_TAG_member ] [_fileno] [line 294, size 32, align 32, offset 896] [from int]
+!54 = !{!"0xd\00_flags2\00298\0032\0032\00928\000", !30, !29, !8} ; [ DW_TAG_member ] [_flags2] [line 298, size 32, align 32, offset 928] [from int]
+!55 = !{!"0xd\00_old_offset\00300\0064\0064\00960\000", !30, !29, !56} ; [ DW_TAG_member ] [_old_offset] [line 300, size 64, align 64, offset 960] [from __off_t]
+!56 = !{!"0x16\00__off_t\00141\000\000\000\000", !30, null, !57} ; [ DW_TAG_typedef ] [__off_t] [line 141, size 0, align 0, offset 0] [from long int]
+!57 = !{!"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!58 = !{!"0xd\00_cur_column\00304\0016\0016\001024\000", !30, !29, !59} ; [ DW_TAG_member ] [_cur_column] [line 304, size 16, align 16, offset 1024] [from unsigned short]
+!59 = !{!"0x24\00unsigned short\000\0016\0016\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
+!60 = !{!"0xd\00_vtable_offset\00305\008\008\001040\000", !30, !29, !61} ; [ DW_TAG_member ] [_vtable_offset] [line 305, size 8, align 8, offset 1040] [from signed char]
+!61 = !{!"0x24\00signed char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!62 = !{!"0xd\00_shortbuf\00306\008\008\001048\000", !30, !29, !63} ; [ DW_TAG_member ] [_shortbuf] [line 306, size 8, align 8, offset 1048] [from ]
+!63 = !{!"0x1\00\000\008\008\000\000", null, null, !11, !64, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!64 = !{!65}
+!65 = !{!"0x21\000\001"}        ; [ DW_TAG_subrange_type ] [0, 0]
+!66 = !{!"0xd\00_lock\00310\0064\0064\001088\000", !30, !29, !67} ; [ DW_TAG_member ] [_lock] [line 310, size 64, align 64, offset 1088] [from ]
+!67 = !{!"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!68 = !{!"0xd\00_offset\00319\0064\0064\001152\000", !30, !29, !69} ; [ DW_TAG_member ] [_offset] [line 319, size 64, align 64, offset 1152] [from __off64_t]
+!69 = !{!"0x16\00__off64_t\00142\000\000\000\000", !30, null, !57} ; [ DW_TAG_typedef ] [__off64_t] [line 142, size 0, align 0, offset 0] [from long int]
+!70 = !{!"0xd\00__pad1\00328\0064\0064\001216\000", !30, !29, !67} ; [ DW_TAG_member ] [__pad1] [line 328, size 64, align 64, offset 1216] [from ]
+!71 = !{!"0xd\00__pad2\00329\0064\0064\001280\000", !30, !29, !67} ; [ DW_TAG_member ] [__pad2] [line 329, size 64, align 64, offset 1280] [from ]
+!72 = !{!"0xd\00__pad3\00330\0064\0064\001344\000", !30, !29, !67} ; [ DW_TAG_member ] [__pad3] [line 330, size 64, align 64, offset 1344] [from ]
+!73 = !{!"0xd\00__pad4\00331\0064\0064\001408\000", !30, !29, !67} ; [ DW_TAG_member ] [__pad4] [line 331, size 64, align 64, offset 1408] [from ]
+!74 = !{!"0xd\00__pad5\00332\0064\0064\001472\000", !30, !29, !75} ; [ DW_TAG_member ] [__pad5] [line 332, size 64, align 64, offset 1472] [from size_t]
+!75 = !{!"0x16\00size_t\0042\000\000\000\000", !30, null, !76} ; [ DW_TAG_typedef ] [size_t] [line 42, size 0, align 0, offset 0] [from long unsigned int]
+!76 = !{!"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!77 = !{!"0xd\00_mode\00334\0032\0032\001536\000", !30, !29, !8} ; [ DW_TAG_member ] [_mode] [line 334, size 32, align 32, offset 1536] [from int]
+!78 = !{!"0xd\00_unused2\00336\00160\008\001568\000", !30, !29, !79} ; [ DW_TAG_member ] [_unused2] [line 336, size 160, align 8, offset 1568] [from ]
+!79 = !{!"0x1\00\000\00160\008\000\000", null, null, !11, !80, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!80 = !{!81}
+!81 = !{!"0x21\000\0020"}       ; [ DW_TAG_subrange_type ] [0, 19]
+!82 = !{!"0x2e\00verify\00verify\00\002388\001\001\000\006\00256\001\002388", !1, !5, !83, null, null, null, null, !86} ; [ DW_TAG_subprogram ] [line 2388] [local] [def] [verify]
+!83 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !84, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!84 = !{null, !8, !10, !85}
+!85 = !{!"0xf\00\000\0064\0064\000\000", null, null, !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from boolean]
+!86 = !{!87, !88, !89, !90, !94, !95, !96, !97, !98, !99, !100, !101}
+!87 = !{!"0x101\00no_time_steps\0016779604\000", !82, !5, !8} ; [ DW_TAG_arg_variable ] [no_time_steps] [line 2388]
+!88 = !{!"0x101\00class\0033556820\000", !82, !5, !10} ; [ DW_TAG_arg_variable ] [class] [line 2388]
+!89 = !{!"0x101\00verified\0050334036\000", !82, !5, !85} ; [ DW_TAG_arg_variable ] [verified] [line 2388]
+!90 = !{!"0x100\00xcrref\002397\000", !82, !5, !91} ; [ DW_TAG_auto_variable ] [xcrref] [line 2397]
+!91 = !{!"0x1\00\000\00320\0064\000\000", null, null, !20, !92, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 64, offset 0] [from double]
+!92 = !{!93}
+!93 = !{!"0x21\000\005"}        ; [ DW_TAG_subrange_type ] [0, 4]
+!94 = !{!"0x100\00xceref\002397\000", !82, !5, !91} ; [ DW_TAG_auto_variable ] [xceref] [line 2397]
+!95 = !{!"0x100\00xcrdif\002397\000", !82, !5, !91} ; [ DW_TAG_auto_variable ] [xcrdif] [line 2397]
+!96 = !{!"0x100\00xcedif\002397\000", !82, !5, !91} ; [ DW_TAG_auto_variable ] [xcedif] [line 2397]
+!97 = !{!"0x100\00epsilon\002398\000", !82, !5, !20} ; [ DW_TAG_auto_variable ] [epsilon] [line 2398]
+!98 = !{!"0x100\00xce\002398\000", !82, !5, !91} ; [ DW_TAG_auto_variable ] [xce] [line 2398]
+!99 = !{!"0x100\00xcr\002398\000", !82, !5, !91} ; [ DW_TAG_auto_variable ] [xcr] [line 2398]
+!100 = !{!"0x100\00dtref\002398\000", !82, !5, !20} ; [ DW_TAG_auto_variable ] [dtref] [line 2398]
+!101 = !{!"0x100\00m\002399\000", !82, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 2399]
+!102 = !{!"0x2e\00rhs_norm\00rhs_norm\00\00266\001\001\000\006\00256\001\00266", !1, !5, !103, null, null, null, null, !106} ; [ DW_TAG_subprogram ] [line 266] [local] [def] [rhs_norm]
+!103 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !104, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!104 = !{null, !105}
+!105 = !{!"0xf\00\000\0064\0064\000\000", null, null, !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
+!106 = !{!107, !108, !109, !110, !111, !112, !113}
+!107 = !{!"0x101\00rms\0016777482\000", !102, !5, !105} ; [ DW_TAG_arg_variable ] [rms] [line 266]
+!108 = !{!"0x100\00i\00271\000", !102, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 271]
+!109 = !{!"0x100\00j\00271\000", !102, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 271]
+!110 = !{!"0x100\00k\00271\000", !102, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 271]
+!111 = !{!"0x100\00d\00271\000", !102, !5, !8} ; [ DW_TAG_auto_variable ] [d] [line 271]
+!112 = !{!"0x100\00m\00271\000", !102, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 271]
+!113 = !{!"0x100\00add\00272\000", !102, !5, !20} ; [ DW_TAG_auto_variable ] [add] [line 272]
+!114 = !{!"0x2e\00compute_rhs\00compute_rhs\00\001767\001\001\000\006\00256\001\001767", !1, !5, !115, null, void ()* @compute_rhs, null, null, !117} ; [ DW_TAG_subprogram ] [line 1767] [local] [def] [compute_rhs]
+!115 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !116, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!116 = !{null}
+!117 = !{!118, !119, !120, !121, !122, !123, !124, !125, !126, !127, !128, !129, !130, !131}
+!118 = !{!"0x100\00i\001769\000", !114, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 1769]
+!119 = !{!"0x100\00j\001769\000", !114, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 1769]
+!120 = !{!"0x100\00k\001769\000", !114, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 1769]
+!121 = !{!"0x100\00m\001769\000", !114, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 1769]
+!122 = !{!"0x100\00rho_inv\001770\000", !114, !5, !20} ; [ DW_TAG_auto_variable ] [rho_inv] [line 1770]
+!123 = !{!"0x100\00uijk\001770\000", !114, !5, !20} ; [ DW_TAG_auto_variable ] [uijk] [line 1770]
+!124 = !{!"0x100\00up1\001770\000", !114, !5, !20} ; [ DW_TAG_auto_variable ] [up1] [line 1770]
+!125 = !{!"0x100\00um1\001770\000", !114, !5, !20} ; [ DW_TAG_auto_variable ] [um1] [line 1770]
+!126 = !{!"0x100\00vijk\001770\000", !114, !5, !20} ; [ DW_TAG_auto_variable ] [vijk] [line 1770]
+!127 = !{!"0x100\00vp1\001770\000", !114, !5, !20} ; [ DW_TAG_auto_variable ] [vp1] [line 1770]
+!128 = !{!"0x100\00vm1\001770\000", !114, !5, !20} ; [ DW_TAG_auto_variable ] [vm1] [line 1770]
+!129 = !{!"0x100\00wijk\001770\000", !114, !5, !20} ; [ DW_TAG_auto_variable ] [wijk] [line 1770]
+!130 = !{!"0x100\00wp1\001770\000", !114, !5, !20} ; [ DW_TAG_auto_variable ] [wp1] [line 1770]
+!131 = !{!"0x100\00wm1\001770\000", !114, !5, !20} ; [ DW_TAG_auto_variable ] [wm1] [line 1770]
+!132 = !{!"0x2e\00error_norm\00error_norm\00\00225\001\001\000\006\00256\001\00225", !1, !5, !103, null, null, null, null, !133} ; [ DW_TAG_subprogram ] [line 225] [local] [def] [error_norm]
+!133 = !{!134, !135, !136, !137, !138, !139, !140, !141, !142, !143, !144}
+!134 = !{!"0x101\00rms\0016777441\000", !132, !5, !105} ; [ DW_TAG_arg_variable ] [rms] [line 225]
+!135 = !{!"0x100\00i\00232\000", !132, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 232]
+!136 = !{!"0x100\00j\00232\000", !132, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 232]
+!137 = !{!"0x100\00k\00232\000", !132, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 232]
+!138 = !{!"0x100\00m\00232\000", !132, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 232]
+!139 = !{!"0x100\00d\00232\000", !132, !5, !8} ; [ DW_TAG_auto_variable ] [d] [line 232]
+!140 = !{!"0x100\00xi\00233\000", !132, !5, !20} ; [ DW_TAG_auto_variable ] [xi] [line 233]
+!141 = !{!"0x100\00eta\00233\000", !132, !5, !20} ; [ DW_TAG_auto_variable ] [eta] [line 233]
+!142 = !{!"0x100\00zeta\00233\000", !132, !5, !20} ; [ DW_TAG_auto_variable ] [zeta] [line 233]
+!143 = !{!"0x100\00u_exact\00233\000", !132, !5, !91} ; [ DW_TAG_auto_variable ] [u_exact] [line 233]
+!144 = !{!"0x100\00add\00233\000", !132, !5, !20} ; [ DW_TAG_auto_variable ] [add] [line 233]
+!145 = !{!"0x2e\00exact_solution\00exact_solution\00\00643\001\001\000\006\00256\001\00644", !1, !5, !146, null, null, null, null, !148} ; [ DW_TAG_subprogram ] [line 643] [local] [def] [scope 644] [exact_solution]
+!146 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !147, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!147 = !{null, !20, !20, !20, !105}
+!148 = !{!149, !150, !151, !152, !153}
+!149 = !{!"0x101\00xi\0016777859\000", !145, !5, !20} ; [ DW_TAG_arg_variable ] [xi] [line 643]
+!150 = !{!"0x101\00eta\0033555075\000", !145, !5, !20} ; [ DW_TAG_arg_variable ] [eta] [line 643]
+!151 = !{!"0x101\00zeta\0050332291\000", !145, !5, !20} ; [ DW_TAG_arg_variable ] [zeta] [line 643]
+!152 = !{!"0x101\00dtemp\0067109508\000", !145, !5, !105} ; [ DW_TAG_arg_variable ] [dtemp] [line 644]
+!153 = !{!"0x100\00m\00653\000", !145, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 653]
+!154 = !{!"0x2e\00set_constants\00set_constants\00\002191\001\001\000\006\00256\001\002191", !1, !5, !115, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 2191] [local] [def] [set_constants]
+!155 = !{!"0x2e\00lhsinit\00lhsinit\00\00855\001\001\000\006\00256\001\00855", !1, !5, !115, null, null, null, null, !156} ; [ DW_TAG_subprogram ] [line 855] [local] [def] [lhsinit]
+!156 = !{!157, !158, !159, !160, !161}
+!157 = !{!"0x100\00i\00857\000", !155, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 857]
+!158 = !{!"0x100\00j\00857\000", !155, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 857]
+!159 = !{!"0x100\00k\00857\000", !155, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 857]
+!160 = !{!"0x100\00m\00857\000", !155, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 857]
+!161 = !{!"0x100\00n\00857\000", !155, !5, !8} ; [ DW_TAG_auto_variable ] [n] [line 857]
+!162 = !{!"0x2e\00initialize\00initialize\00\00669\001\001\000\006\00256\001\00669", !1, !5, !115, null, null, null, null, !163} ; [ DW_TAG_subprogram ] [line 669] [local] [def] [initialize]
+!163 = !{!164, !165, !166, !167, !168, !169, !170, !171, !172, !173, !174, !179, !180, !181, !182}
+!164 = !{!"0x100\00i\00679\000", !162, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 679]
+!165 = !{!"0x100\00j\00679\000", !162, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 679]
+!166 = !{!"0x100\00k\00679\000", !162, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 679]
+!167 = !{!"0x100\00m\00679\000", !162, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 679]
+!168 = !{!"0x100\00ix\00679\000", !162, !5, !8} ; [ DW_TAG_auto_variable ] [ix] [line 679]
+!169 = !{!"0x100\00iy\00679\000", !162, !5, !8} ; [ DW_TAG_auto_variable ] [iy] [line 679]
+!170 = !{!"0x100\00iz\00679\000", !162, !5, !8} ; [ DW_TAG_auto_variable ] [iz] [line 679]
+!171 = !{!"0x100\00xi\00680\000", !162, !5, !20} ; [ DW_TAG_auto_variable ] [xi] [line 680]
+!172 = !{!"0x100\00eta\00680\000", !162, !5, !20} ; [ DW_TAG_auto_variable ] [eta] [line 680]
+!173 = !{!"0x100\00zeta\00680\000", !162, !5, !20} ; [ DW_TAG_auto_variable ] [zeta] [line 680]
+!174 = !{!"0x100\00Pface\00680\000", !162, !5, !175} ; [ DW_TAG_auto_variable ] [Pface] [line 680]
+!175 = !{!"0x1\00\000\001920\0064\000\000", null, null, !20, !176, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1920, align 64, offset 0] [from double]
+!176 = !{!177, !178, !93}
+!177 = !{!"0x21\000\002"}       ; [ DW_TAG_subrange_type ] [0, 1]
+!178 = !{!"0x21\000\003"}       ; [ DW_TAG_subrange_type ] [0, 2]
+!179 = !{!"0x100\00Pxi\00680\000", !162, !5, !20} ; [ DW_TAG_auto_variable ] [Pxi] [line 680]
+!180 = !{!"0x100\00Peta\00680\000", !162, !5, !20} ; [ DW_TAG_auto_variable ] [Peta] [line 680]
+!181 = !{!"0x100\00Pzeta\00680\000", !162, !5, !20} ; [ DW_TAG_auto_variable ] [Pzeta] [line 680]
+!182 = !{!"0x100\00temp\00680\000", !162, !5, !91} ; [ DW_TAG_auto_variable ] [temp] [line 680]
+!183 = !{!"0x2e\00exact_rhs\00exact_rhs\00\00301\001\001\000\006\00256\001\00301", !1, !5, !115, null, null, null, null, !184} ; [ DW_TAG_subprogram ] [line 301] [local] [def] [exact_rhs]
+!184 = !{!185, !186, !187, !188, !189, !190, !191, !192, !193, !194, !195, !196, !197, !198, !199}
+!185 = !{!"0x100\00dtemp\00310\000", !183, !5, !91} ; [ DW_TAG_auto_variable ] [dtemp] [line 310]
+!186 = !{!"0x100\00xi\00310\000", !183, !5, !20} ; [ DW_TAG_auto_variable ] [xi] [line 310]
+!187 = !{!"0x100\00eta\00310\000", !183, !5, !20} ; [ DW_TAG_auto_variable ] [eta] [line 310]
+!188 = !{!"0x100\00zeta\00310\000", !183, !5, !20} ; [ DW_TAG_auto_variable ] [zeta] [line 310]
+!189 = !{!"0x100\00dtpp\00310\000", !183, !5, !20} ; [ DW_TAG_auto_variable ] [dtpp] [line 310]
+!190 = !{!"0x100\00m\00311\000", !183, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 311]
+!191 = !{!"0x100\00i\00311\000", !183, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 311]
+!192 = !{!"0x100\00j\00311\000", !183, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 311]
+!193 = !{!"0x100\00k\00311\000", !183, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 311]
+!194 = !{!"0x100\00ip1\00311\000", !183, !5, !8} ; [ DW_TAG_auto_variable ] [ip1] [line 311]
+!195 = !{!"0x100\00im1\00311\000", !183, !5, !8} ; [ DW_TAG_auto_variable ] [im1] [line 311]
+!196 = !{!"0x100\00jp1\00311\000", !183, !5, !8} ; [ DW_TAG_auto_variable ] [jp1] [line 311]
+!197 = !{!"0x100\00jm1\00311\000", !183, !5, !8} ; [ DW_TAG_auto_variable ] [jm1] [line 311]
+!198 = !{!"0x100\00km1\00311\000", !183, !5, !8} ; [ DW_TAG_auto_variable ] [km1] [line 311]
+!199 = !{!"0x100\00kp1\00311\000", !183, !5, !8} ; [ DW_TAG_auto_variable ] [kp1] [line 311]
+!200 = !{!"0x2e\00adi\00adi\00\00210\001\001\000\006\00256\001\00210", !1, !5, !115, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 210] [local] [def] [adi]
+!201 = !{!"0x2e\00add\00add\00\00187\001\001\000\006\00256\001\00187", !1, !5, !115, null, null, null, null, !202} ; [ DW_TAG_subprogram ] [line 187] [local] [def] [add]
+!202 = !{!203, !204, !205, !206}
+!203 = !{!"0x100\00i\00193\000", !201, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 193]
+!204 = !{!"0x100\00j\00193\000", !201, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 193]
+!205 = !{!"0x100\00k\00193\000", !201, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 193]
+!206 = !{!"0x100\00m\00193\000", !201, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 193]
+!207 = !{!"0x2e\00z_solve\00z_solve\00\003457\001\001\000\006\00256\001\003457", !1, !5, !115, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 3457] [local] [def] [z_solve]
+!208 = !{!"0x2e\00z_backsubstitute\00z_backsubstitute\00\003480\001\001\000\006\00256\001\003480", !1, !5, !115, null, null, null, null, !209} ; [ DW_TAG_subprogram ] [line 3480] [local] [def] [z_backsubstitute]
+!209 = !{!210, !211, !212, !213, !214}
+!210 = !{!"0x100\00i\003492\000", !208, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 3492]
+!211 = !{!"0x100\00j\003492\000", !208, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 3492]
+!212 = !{!"0x100\00k\003492\000", !208, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 3492]
+!213 = !{!"0x100\00m\003492\000", !208, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 3492]
+!214 = !{!"0x100\00n\003492\000", !208, !5, !8} ; [ DW_TAG_auto_variable ] [n] [line 3492]
+!215 = !{!"0x2e\00z_solve_cell\00z_solve_cell\00\003512\001\001\000\006\00256\001\003512", !1, !5, !115, null, null, null, null, !216} ; [ DW_TAG_subprogram ] [line 3512] [local] [def] [z_solve_cell]
+!216 = !{!217, !218, !219, !220}
+!217 = !{!"0x100\00i\003527\000", !215, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 3527]
+!218 = !{!"0x100\00j\003527\000", !215, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 3527]
+!219 = !{!"0x100\00k\003527\000", !215, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 3527]
+!220 = !{!"0x100\00ksize\003527\000", !215, !5, !8} ; [ DW_TAG_auto_variable ] [ksize] [line 3527]
+!221 = !{!"0x2e\00binvrhs\00binvrhs\00\003154\001\001\000\006\00256\001\003154", !1, !5, !222, null, null, null, null, !225} ; [ DW_TAG_subprogram ] [line 3154] [local] [def] [binvrhs]
+!222 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !223, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!223 = !{null, !224, !105}
+!224 = !{!"0xf\00\000\0064\0064\000\000", null, null, !91} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!225 = !{!226, !227, !228, !229}
+!226 = !{!"0x101\00lhs\0016780370\000", !221, !5, !224} ; [ DW_TAG_arg_variable ] [lhs] [line 3154]
+!227 = !{!"0x101\00r\0033557586\000", !221, !5, !105} ; [ DW_TAG_arg_variable ] [r] [line 3154]
+!228 = !{!"0x100\00pivot\003159\000", !221, !5, !20} ; [ DW_TAG_auto_variable ] [pivot] [line 3159]
+!229 = !{!"0x100\00coeff\003159\000", !221, !5, !20} ; [ DW_TAG_auto_variable ] [coeff] [line 3159]
+!230 = !{!"0x2e\00matmul_sub\00matmul_sub\00\002841\001\001\000\006\00256\001\002842", !1, !5, !231, null, null, null, null, !233} ; [ DW_TAG_subprogram ] [line 2841] [local] [def] [scope 2842] [matmul_sub]
+!231 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !232, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!232 = !{null, !224, !224, !224}
+!233 = !{!234, !235, !236, !237}
+!234 = !{!"0x101\00ablock\0016780057\000", !230, !5, !224} ; [ DW_TAG_arg_variable ] [ablock] [line 2841]
+!235 = !{!"0x101\00bblock\0033557273\000", !230, !5, !224} ; [ DW_TAG_arg_variable ] [bblock] [line 2841]
+!236 = !{!"0x101\00cblock\0050334490\000", !230, !5, !224} ; [ DW_TAG_arg_variable ] [cblock] [line 2842]
+!237 = !{!"0x100\00j\002851\000", !230, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 2851]
+!238 = !{!"0x2e\00matvec_sub\00matvec_sub\00\002814\001\001\000\006\00256\001\002814", !1, !5, !239, null, null, null, null, !241} ; [ DW_TAG_subprogram ] [line 2814] [local] [def] [matvec_sub]
+!239 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !240, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!240 = !{null, !224, !105, !105}
+!241 = !{!242, !243, !244, !245}
+!242 = !{!"0x101\00ablock\0016780030\000", !238, !5, !224} ; [ DW_TAG_arg_variable ] [ablock] [line 2814]
+!243 = !{!"0x101\00avec\0033557246\000", !238, !5, !105} ; [ DW_TAG_arg_variable ] [avec] [line 2814]
+!244 = !{!"0x101\00bvec\0050334462\000", !238, !5, !105} ; [ DW_TAG_arg_variable ] [bvec] [line 2814]
+!245 = !{!"0x100\00i\002823\000", !238, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 2823]
+!246 = !{!"0x2e\00binvcrhs\00binvcrhs\00\002885\001\001\000\006\00256\001\002885", !1, !5, !247, null, null, null, null, !249} ; [ DW_TAG_subprogram ] [line 2885] [local] [def] [binvcrhs]
+!247 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !248, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!248 = !{null, !224, !224, !105}
+!249 = !{!250, !251, !252, !253, !254}
+!250 = !{!"0x101\00lhs\0016780101\000", !246, !5, !224} ; [ DW_TAG_arg_variable ] [lhs] [line 2885]
+!251 = !{!"0x101\00c\0033557317\000", !246, !5, !224} ; [ DW_TAG_arg_variable ] [c] [line 2885]
+!252 = !{!"0x101\00r\0050334533\000", !246, !5, !105} ; [ DW_TAG_arg_variable ] [r] [line 2885]
+!253 = !{!"0x100\00pivot\002890\000", !246, !5, !20} ; [ DW_TAG_auto_variable ] [pivot] [line 2890]
+!254 = !{!"0x100\00coeff\002890\000", !246, !5, !20} ; [ DW_TAG_auto_variable ] [coeff] [line 2890]
+!255 = !{!"0x2e\00lhsz\00lhsz\00\001475\001\001\000\006\00256\001\001475", !1, !5, !115, null, null, null, null, !256} ; [ DW_TAG_subprogram ] [line 1475] [local] [def] [lhsz]
+!256 = !{!257, !258, !259}
+!257 = !{!"0x100\00i\001484\000", !255, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 1484]
+!258 = !{!"0x100\00j\001484\000", !255, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 1484]
+!259 = !{!"0x100\00k\001484\000", !255, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 1484]
+!260 = !{!"0x2e\00y_solve\00y_solve\00\003299\001\001\000\006\00256\001\003299", !1, !5, !115, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 3299] [local] [def] [y_solve]
+!261 = !{!"0x2e\00y_backsubstitute\00y_backsubstitute\00\003323\001\001\000\006\00256\001\003323", !1, !5, !115, null, null, null, null, !262} ; [ DW_TAG_subprogram ] [line 3323] [local] [def] [y_backsubstitute]
+!262 = !{!263, !264, !265, !266, !267}
+!263 = !{!"0x100\00i\003335\000", !261, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 3335]
+!264 = !{!"0x100\00j\003335\000", !261, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 3335]
+!265 = !{!"0x100\00k\003335\000", !261, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 3335]
+!266 = !{!"0x100\00m\003335\000", !261, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 3335]
+!267 = !{!"0x100\00n\003335\000", !261, !5, !8} ; [ DW_TAG_auto_variable ] [n] [line 3335]
+!268 = !{!"0x2e\00y_solve_cell\00y_solve_cell\00\003355\001\001\000\006\00256\001\003355", !1, !5, !115, null, null, null, null, !269} ; [ DW_TAG_subprogram ] [line 3355] [local] [def] [y_solve_cell]
+!269 = !{!270, !271, !272, !273}
+!270 = !{!"0x100\00i\003370\000", !268, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 3370]
+!271 = !{!"0x100\00j\003370\000", !268, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 3370]
+!272 = !{!"0x100\00k\003370\000", !268, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 3370]
+!273 = !{!"0x100\00jsize\003370\000", !268, !5, !8} ; [ DW_TAG_auto_variable ] [jsize] [line 3370]
+!274 = !{!"0x2e\00lhsy\00lhsy\00\001181\001\001\000\006\00256\001\001181", !1, !5, !115, null, null, null, null, !275} ; [ DW_TAG_subprogram ] [line 1181] [local] [def] [lhsy]
+!275 = !{!276, !277, !278}
+!276 = !{!"0x100\00i\001190\000", !274, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 1190]
+!277 = !{!"0x100\00j\001190\000", !274, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 1190]
+!278 = !{!"0x100\00k\001190\000", !274, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 1190]
+!279 = !{!"0x2e\00x_solve\00x_solve\00\002658\001\001\000\006\00256\001\002658", !1, !5, !115, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 2658] [local] [def] [x_solve]
+!280 = !{!"0x2e\00x_backsubstitute\00x_backsubstitute\00\002684\001\001\000\006\00256\001\002684", !1, !5, !115, null, null, null, null, !281} ; [ DW_TAG_subprogram ] [line 2684] [local] [def] [x_backsubstitute]
+!281 = !{!282, !283, !284, !285, !286}
+!282 = !{!"0x100\00i\002696\000", !280, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 2696]
+!283 = !{!"0x100\00j\002696\000", !280, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 2696]
+!284 = !{!"0x100\00k\002696\000", !280, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 2696]
+!285 = !{!"0x100\00m\002696\000", !280, !5, !8} ; [ DW_TAG_auto_variable ] [m] [line 2696]
+!286 = !{!"0x100\00n\002696\000", !280, !5, !8} ; [ DW_TAG_auto_variable ] [n] [line 2696]
+!287 = !{!"0x2e\00x_solve_cell\00x_solve_cell\00\002716\001\001\000\006\00256\001\002716", !1, !5, !115, null, null, null, null, !288} ; [ DW_TAG_subprogram ] [line 2716] [local] [def] [x_solve_cell]
+!288 = !{!289, !290, !291, !292}
+!289 = !{!"0x100\00i\002728\000", !287, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 2728]
+!290 = !{!"0x100\00j\002728\000", !287, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 2728]
+!291 = !{!"0x100\00k\002728\000", !287, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 2728]
+!292 = !{!"0x100\00isize\002728\000", !287, !5, !8} ; [ DW_TAG_auto_variable ] [isize] [line 2728]
+!293 = !{!"0x2e\00lhsx\00lhsx\00\00898\001\001\000\006\00256\001\00898", !1, !5, !115, null, null, null, null, !294} ; [ DW_TAG_subprogram ] [line 898] [local] [def] [lhsx]
+!294 = !{!295, !296, !297}
+!295 = !{!"0x100\00i\00907\000", !293, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 907]
+!296 = !{!"0x100\00j\00907\000", !293, !5, !8} ; [ DW_TAG_auto_variable ] [j] [line 907]
+!297 = !{!"0x100\00k\00907\000", !293, !5, !8} ; [ DW_TAG_auto_variable ] [k] [line 907]
+!298 = !{!299, !304, !305, !309, !310, !311, !312, !313, !314, !315, !316, !317, !318, !319, !320, !321, !322, !323, !324, !325, !326, !327, !328, !329, !330, !331, !332, !333, !334, !335, !336, !337, !338, !339, !340, !341, !342, !343, !347, !350, !351, !352, !353, !354, !355, !356, !360, !361, !362, !363, !364, !365, !366, !367, !368, !369, !370, !371, !372, !373, !374, !375, !376, !377, !378, !379, !380, !381, !382, !383, !384, !385, !386, !387, !388, !389, !390, !391, !392, !393, !394, !395, !396, !397, !398, !399, !400, !401, !402, !403, !404, !405, !406, !407, !408, !409, !410, !411, !412, !413, !414, !415, !416, !417, !418, !419, !422, !426, !427, !430, !431, !434, !435, !436, !437}
+!299 = !{!"0x34\00grid_points\00grid_points\00\0028\001\001", null, !300, !302, [3 x i32]* @grid_points, null} ; [ DW_TAG_variable ] [grid_points] [line 28] [local] [def]
+!300 = !{!"0x29", !301}      ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/./header.h]
+!301 = !{!"./header.h", !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
+!302 = !{!"0x1\00\000\0096\0032\000\000", null, null, !8, !303, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 96, align 32, offset 0] [from int]
+!303 = !{!178}
+!304 = !{!"0x34\00dt\00dt\00\0035\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dt] [line 35] [local] [def]
+!305 = !{!"0x34\00rhs\00rhs\00\0068\001\001", null, !300, !306, null, null} ; [ DW_TAG_variable ] [rhs] [line 68] [local] [def]
+!306 = !{!"0x1\00\000\001385839040\0064\000\000", null, null, !20, !307, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1385839040, align 64, offset 0] [from double]
+!307 = !{!308, !308, !308, !93}
+!308 = !{!"0x21\000\00163"}     ; [ DW_TAG_subrange_type ] [0, 162]
+!309 = !{!"0x34\00zzcon5\00zzcon5\00\0042\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [zzcon5] [line 42] [local] [def]
+!310 = !{!"0x34\00zzcon4\00zzcon4\00\0042\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [zzcon4] [line 42] [local] [def]
+!311 = !{!"0x34\00zzcon3\00zzcon3\00\0042\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [zzcon3] [line 42] [local] [def]
+!312 = !{!"0x34\00dz5tz1\00dz5tz1\00\0043\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dz5tz1] [line 43] [local] [def]
+!313 = !{!"0x34\00dz4tz1\00dz4tz1\00\0043\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dz4tz1] [line 43] [local] [def]
+!314 = !{!"0x34\00dz3tz1\00dz3tz1\00\0043\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dz3tz1] [line 43] [local] [def]
+!315 = !{!"0x34\00zzcon2\00zzcon2\00\0042\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [zzcon2] [line 42] [local] [def]
+!316 = !{!"0x34\00dz2tz1\00dz2tz1\00\0043\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dz2tz1] [line 43] [local] [def]
+!317 = !{!"0x34\00tz2\00tz2\00\0031\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [tz2] [line 31] [local] [def]
+!318 = !{!"0x34\00dz1tz1\00dz1tz1\00\0043\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dz1tz1] [line 43] [local] [def]
+!319 = !{!"0x34\00yycon5\00yycon5\00\0040\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [yycon5] [line 40] [local] [def]
+!320 = !{!"0x34\00yycon4\00yycon4\00\0040\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [yycon4] [line 40] [local] [def]
+!321 = !{!"0x34\00yycon3\00yycon3\00\0040\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [yycon3] [line 40] [local] [def]
+!322 = !{!"0x34\00dy5ty1\00dy5ty1\00\0041\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dy5ty1] [line 41] [local] [def]
+!323 = !{!"0x34\00dy4ty1\00dy4ty1\00\0041\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dy4ty1] [line 41] [local] [def]
+!324 = !{!"0x34\00dy3ty1\00dy3ty1\00\0041\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dy3ty1] [line 41] [local] [def]
+!325 = !{!"0x34\00yycon2\00yycon2\00\0040\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [yycon2] [line 40] [local] [def]
+!326 = !{!"0x34\00dy2ty1\00dy2ty1\00\0041\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dy2ty1] [line 41] [local] [def]
+!327 = !{!"0x34\00ty2\00ty2\00\0031\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [ty2] [line 31] [local] [def]
+!328 = !{!"0x34\00dy1ty1\00dy1ty1\00\0041\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dy1ty1] [line 41] [local] [def]
+!329 = !{!"0x34\00dssp\00dssp\00\0035\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dssp] [line 35] [local] [def]
+!330 = !{!"0x34\00c1\00c1\00\0045\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c1] [line 45] [local] [def]
+!331 = !{!"0x34\00xxcon5\00xxcon5\00\0038\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [xxcon5] [line 38] [local] [def]
+!332 = !{!"0x34\00xxcon4\00xxcon4\00\0038\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [xxcon4] [line 38] [local] [def]
+!333 = !{!"0x34\00xxcon3\00xxcon3\00\0038\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [xxcon3] [line 38] [local] [def]
+!334 = !{!"0x34\00dx5tx1\00dx5tx1\00\0039\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dx5tx1] [line 39] [local] [def]
+!335 = !{!"0x34\00dx4tx1\00dx4tx1\00\0039\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dx4tx1] [line 39] [local] [def]
+!336 = !{!"0x34\00dx3tx1\00dx3tx1\00\0039\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dx3tx1] [line 39] [local] [def]
+!337 = !{!"0x34\00c2\00c2\00\0045\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c2] [line 45] [local] [def]
+!338 = !{!"0x34\00con43\00con43\00\0048\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [con43] [line 48] [local] [def]
+!339 = !{!"0x34\00xxcon2\00xxcon2\00\0038\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [xxcon2] [line 38] [local] [def]
+!340 = !{!"0x34\00dx2tx1\00dx2tx1\00\0039\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dx2tx1] [line 39] [local] [def]
+!341 = !{!"0x34\00tx2\00tx2\00\0031\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [tx2] [line 31] [local] [def]
+!342 = !{!"0x34\00dx1tx1\00dx1tx1\00\0039\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dx1tx1] [line 39] [local] [def]
+!343 = !{!"0x34\00forcing\00forcing\00\0066\001\001", null, !300, !344, null, null} ; [ DW_TAG_variable ] [forcing] [line 66] [local] [def]
+!344 = !{!"0x1\00\000\001663006848\0064\000\000", null, null, !20, !345, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1663006848, align 64, offset 0] [from double]
+!345 = !{!308, !308, !308, !346}
+!346 = !{!"0x21\000\006"}       ; [ DW_TAG_subrange_type ] [0, 5]
+!347 = !{!"0x34\00qs\00qs\00\0063\001\001", null, !300, !348, null, null} ; [ DW_TAG_variable ] [qs] [line 63] [local] [def]
+!348 = !{!"0x1\00\000\00277167808\0064\000\000", null, null, !20, !349, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 277167808, align 64, offset 0] [from double]
+!349 = !{!308, !308, !308}
+!350 = !{!"0x34\00square\00square\00\0065\001\001", null, !300, !348, null, null} ; [ DW_TAG_variable ] [square] [line 65] [local] [def]
+!351 = !{!"0x34\00ws\00ws\00\0062\001\001", null, !300, !348, null, null} ; [ DW_TAG_variable ] [ws] [line 62] [local] [def]
+!352 = !{!"0x34\00vs\00vs\00\0061\001\001", null, !300, !348, null, null} ; [ DW_TAG_variable ] [vs] [line 61] [local] [def]
+!353 = !{!"0x34\00us\00us\00\0060\001\001", null, !300, !348, null, null} ; [ DW_TAG_variable ] [us] [line 60] [local] [def]
+!354 = !{!"0x34\00rho_i\00rho_i\00\0064\001\001", null, !300, !348, null, null} ; [ DW_TAG_variable ] [rho_i] [line 64] [local] [def]
+!355 = !{!"0x34\00u\00u\00\0067\001\001", null, !300, !306, null, null} ; [ DW_TAG_variable ] [u] [line 67] [local] [def]
+!356 = !{!"0x34\00ce\00ce\00\0036\001\001", null, !300, !357, null, null} ; [ DW_TAG_variable ] [ce] [line 36] [local] [def]
+!357 = !{!"0x1\00\000\004160\0064\000\000", null, null, !20, !358, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 4160, align 64, offset 0] [from double]
+!358 = !{!93, !359}
+!359 = !{!"0x21\000\0013"}      ; [ DW_TAG_subrange_type ] [0, 12]
+!360 = !{!"0x34\00dnzm1\00dnzm1\00\0044\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dnzm1] [line 44] [local] [def]
+!361 = !{!"0x34\00dnym1\00dnym1\00\0044\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dnym1] [line 44] [local] [def]
+!362 = !{!"0x34\00dnxm1\00dnxm1\00\0044\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dnxm1] [line 44] [local] [def]
+!363 = !{!"0x34\00zzcon1\00zzcon1\00\0042\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [zzcon1] [line 42] [local] [def]
+!364 = !{!"0x34\00yycon1\00yycon1\00\0040\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [yycon1] [line 40] [local] [def]
+!365 = !{!"0x34\00xxcon1\00xxcon1\00\0038\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [xxcon1] [line 38] [local] [def]
+!366 = !{!"0x34\00con16\00con16\00\0048\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [con16] [line 48] [local] [def]
+!367 = !{!"0x34\00c2iv\00c2iv\00\0048\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c2iv] [line 48] [local] [def]
+!368 = !{!"0x34\00c3c4tz3\00c3c4tz3\00\0048\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c3c4tz3] [line 48] [local] [def]
+!369 = !{!"0x34\00c3c4ty3\00c3c4ty3\00\0048\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c3c4ty3] [line 48] [local] [def]
+!370 = !{!"0x34\00c3c4tx3\00c3c4tx3\00\0048\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c3c4tx3] [line 48] [local] [def]
+!371 = !{!"0x34\00comz6\00comz6\00\0047\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [comz6] [line 47] [local] [def]
+!372 = !{!"0x34\00comz5\00comz5\00\0047\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [comz5] [line 47] [local] [def]
+!373 = !{!"0x34\00comz4\00comz4\00\0047\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [comz4] [line 47] [local] [def]
+!374 = !{!"0x34\00comz1\00comz1\00\0047\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [comz1] [line 47] [local] [def]
+!375 = !{!"0x34\00dtdssp\00dtdssp\00\0045\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dtdssp] [line 45] [local] [def]
+!376 = !{!"0x34\00c2dttz1\00c2dttz1\00\0047\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c2dttz1] [line 47] [local] [def]
+!377 = !{!"0x34\00c2dtty1\00c2dtty1\00\0047\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c2dtty1] [line 47] [local] [def]
+!378 = !{!"0x34\00c2dttx1\00c2dttx1\00\0047\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c2dttx1] [line 47] [local] [def]
+!379 = !{!"0x34\00dttz2\00dttz2\00\0046\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dttz2] [line 46] [local] [def]
+!380 = !{!"0x34\00dttz1\00dttz1\00\0046\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dttz1] [line 46] [local] [def]
+!381 = !{!"0x34\00dtty2\00dtty2\00\0046\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dtty2] [line 46] [local] [def]
+!382 = !{!"0x34\00dtty1\00dtty1\00\0046\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dtty1] [line 46] [local] [def]
+!383 = !{!"0x34\00dttx2\00dttx2\00\0046\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dttx2] [line 46] [local] [def]
+!384 = !{!"0x34\00dttx1\00dttx1\00\0046\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dttx1] [line 46] [local] [def]
+!385 = !{!"0x34\00c5dssp\00c5dssp\00\0045\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c5dssp] [line 45] [local] [def]
+!386 = !{!"0x34\00c4dssp\00c4dssp\00\0045\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c4dssp] [line 45] [local] [def]
+!387 = !{!"0x34\00dzmax\00dzmax\00\0037\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dzmax] [line 37] [local] [def]
+!388 = !{!"0x34\00dymax\00dymax\00\0037\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dymax] [line 37] [local] [def]
+!389 = !{!"0x34\00dxmax\00dxmax\00\0037\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dxmax] [line 37] [local] [def]
+!390 = !{!"0x34\00dz5\00dz5\00\0034\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dz5] [line 34] [local] [def]
+!391 = !{!"0x34\00dz4\00dz4\00\0034\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dz4] [line 34] [local] [def]
+!392 = !{!"0x34\00dz3\00dz3\00\0034\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dz3] [line 34] [local] [def]
+!393 = !{!"0x34\00dz2\00dz2\00\0034\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dz2] [line 34] [local] [def]
+!394 = !{!"0x34\00dz1\00dz1\00\0034\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dz1] [line 34] [local] [def]
+!395 = !{!"0x34\00dy5\00dy5\00\0033\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dy5] [line 33] [local] [def]
+!396 = !{!"0x34\00dy4\00dy4\00\0033\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dy4] [line 33] [local] [def]
+!397 = !{!"0x34\00dy3\00dy3\00\0033\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dy3] [line 33] [local] [def]
+!398 = !{!"0x34\00dy2\00dy2\00\0033\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dy2] [line 33] [local] [def]
+!399 = !{!"0x34\00dy1\00dy1\00\0033\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dy1] [line 33] [local] [def]
+!400 = !{!"0x34\00dx5\00dx5\00\0032\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dx5] [line 32] [local] [def]
+!401 = !{!"0x34\00dx4\00dx4\00\0032\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dx4] [line 32] [local] [def]
+!402 = !{!"0x34\00dx3\00dx3\00\0032\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dx3] [line 32] [local] [def]
+!403 = !{!"0x34\00dx2\00dx2\00\0032\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dx2] [line 32] [local] [def]
+!404 = !{!"0x34\00dx1\00dx1\00\0032\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [dx1] [line 32] [local] [def]
+!405 = !{!"0x34\00tz3\00tz3\00\0031\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [tz3] [line 31] [local] [def]
+!406 = !{!"0x34\00tz1\00tz1\00\0031\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [tz1] [line 31] [local] [def]
+!407 = !{!"0x34\00ty3\00ty3\00\0031\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [ty3] [line 31] [local] [def]
+!408 = !{!"0x34\00ty1\00ty1\00\0031\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [ty1] [line 31] [local] [def]
+!409 = !{!"0x34\00tx3\00tx3\00\0031\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [tx3] [line 31] [local] [def]
+!410 = !{!"0x34\00tx1\00tx1\00\0031\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [tx1] [line 31] [local] [def]
+!411 = !{!"0x34\00conz1\00conz1\00\0045\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [conz1] [line 45] [local] [def]
+!412 = !{!"0x34\00c1345\00c1345\00\0044\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c1345] [line 44] [local] [def]
+!413 = !{!"0x34\00c3c4\00c3c4\00\0044\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c3c4] [line 44] [local] [def]
+!414 = !{!"0x34\00c1c5\00c1c5\00\0044\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c1c5] [line 44] [local] [def]
+!415 = !{!"0x34\00c1c2\00c1c2\00\0044\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c1c2] [line 44] [local] [def]
+!416 = !{!"0x34\00c5\00c5\00\0045\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c5] [line 45] [local] [def]
+!417 = !{!"0x34\00c4\00c4\00\0045\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c4] [line 45] [local] [def]
+!418 = !{!"0x34\00c3\00c3\00\0045\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [c3] [line 45] [local] [def]
+!419 = !{!"0x34\00lhs\00lhs\00\0069\001\001", null, !300, !420, null, null} ; [ DW_TAG_variable ] [lhs] [line 69] [local] [def]
+!420 = !{!"0x1\00\000\0020787585600\0064\000\000", null, null, !20, !421, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 20787585600, align 64, offset 0] [from double]
+!421 = !{!308, !308, !308, !178, !93, !93}
+!422 = !{!"0x34\00q\00q\00\0073\001\001", null, !300, !423, null, null} ; [ DW_TAG_variable ] [q] [line 73] [local] [def]
+!423 = !{!"0x1\00\000\0010368\0064\000\000", null, null, !20, !424, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 10368, align 64, offset 0] [from double]
+!424 = !{!425}
+!425 = !{!"0x21\000\00162"}     ; [ DW_TAG_subrange_type ] [0, 161]
+!426 = !{!"0x34\00cuf\00cuf\00\0072\001\001", null, !300, !423, null, null} ; [ DW_TAG_variable ] [cuf] [line 72] [local] [def]
+!427 = !{!"0x34\00buf\00buf\00\0075\001\001", null, !300, !428, null, null} ; [ DW_TAG_variable ] [buf] [line 75] [local] [def]
+!428 = !{!"0x1\00\000\0051840\0064\000\000", null, null, !20, !429, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 51840, align 64, offset 0] [from double]
+!429 = !{!425, !93}
+!430 = !{!"0x34\00ue\00ue\00\0074\001\001", null, !300, !428, null, null} ; [ DW_TAG_variable ] [ue] [line 74] [local] [def]
+!431 = !{!"0x34\00njac\00njac\00\0086\001\001", null, !300, !432, null, null} ; [ DW_TAG_variable ] [njac] [line 86] [local] [def]
+!432 = !{!"0x1\00\000\006886684800\0064\000\000", null, null, !20, !433, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 6886684800, align 64, offset 0] [from double]
+!433 = !{!308, !308, !425, !93, !93}
+!434 = !{!"0x34\00fjac\00fjac\00\0084\001\001", null, !300, !432, null, null} ; [ DW_TAG_variable ] [fjac] [line 84] [local] [def]
+!435 = !{!"0x34\00tmp3\00tmp3\00\0088\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [tmp3] [line 88] [local] [def]
+!436 = !{!"0x34\00tmp2\00tmp2\00\0088\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [tmp2] [line 88] [local] [def]
+!437 = !{!"0x34\00tmp1\00tmp1\00\0088\001\001", null, !300, !20, null, null} ; [ DW_TAG_variable ] [tmp1] [line 88] [local] [def]
+!438 = !{i32 2, !"Dwarf Version", i32 4}
+!439 = !MDLocation(line: 1898, scope: !440)
+!440 = !{!"0xb\001898\000\00107", !1, !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!441 = !MDLocation(line: 1913, scope: !442)
+!442 = !{!"0xb\001913\000\00115", !1, !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!443 = !MDLocation(line: 1923, scope: !114)
+!444 = !{!"int", !445}
+!445 = !{!"omnipotent char", !446}
+!446 = !{!"Simple C/C++ TBAA"}
+!447 = !{i32 1}
+!448 = !MDLocation(line: 1925, scope: !449)
+!449 = !{!"0xb\001925\000\00121", !1, !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!450 = !MDLocation(line: 1939, scope: !451)
+!451 = !{!"0xb\001939\000\00127", !1, !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!452 = !MDLocation(line: 1940, scope: !453)
+!453 = !{!"0xb\001940\000\00129", !1, !454} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!454 = !{!"0xb\001939\000\00128", !1, !451} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!455 = !MDLocation(line: 1941, scope: !456)
+!456 = !{!"0xb\001941\000\00131", !1, !457} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!457 = !{!"0xb\001940\000\00130", !1, !453} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!458 = !MDLocation(line: 2020, scope: !459)
+!459 = !{!"0xb\002020\000\00149", !1, !460} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!460 = !{!"0xb\002019\000\00148", !1, !461} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!461 = !{!"0xb\002019\000\00147", !1, !462} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!462 = !{!"0xb\002018\000\00146", !1, !463} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!463 = !{!"0xb\002018\000\00145", !1, !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!464 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/PowerPC/preincprep-invoke.ll b/test/CodeGen/PowerPC/preincprep-invoke.ll
new file mode 100644
index 0000000..473b7d0
--- /dev/null
+++ b/test/CodeGen/PowerPC/preincprep-invoke.ll
@@ -0,0 +1,50 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@.str1 = external unnamed_addr constant [1 x i8], align 1
+@.str2 = external unnamed_addr constant [39 x i8], align 1
+
+declare void @_ZN13CStdOutStreamlsEPKc()
+
+declare void @_ZN13CStdOutStream5FlushEv()
+
+declare i32 @__gxx_personality_v0(...)
+
+define void @_Z11GetPasswordP13CStdOutStreamb() {
+entry:
+  br label %for.cond.i.i
+
+for.cond.i.i:                                     ; preds = %for.cond.i.i, %entry
+  br i1 undef, label %_ZN11CStringBaseIcEC2EPKc.exit.critedge, label %for.cond.i.i
+
+_ZN11CStringBaseIcEC2EPKc.exit.critedge:          ; preds = %for.cond.i.i
+  invoke void @_ZN13CStdOutStreamlsEPKc()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %_ZN11CStringBaseIcEC2EPKc.exit.critedge
+  invoke void @_ZN13CStdOutStream5FlushEv()
+          to label %invoke.cont4 unwind label %lpad
+
+invoke.cont4:                                     ; preds = %invoke.cont
+  %call7 = invoke i8* @getpass()
+          to label %for.cond.i.i30 unwind label %lpad
+
+; CHECK-LABEL: @_Z11GetPasswordP13CStdOutStreamb
+; CHECK: addi {{[0-9]+}}, 3, -1
+
+for.cond.i.i30:                                   ; preds = %for.cond.i.i30, %invoke.cont4
+  %indvars.iv.i.i26 = phi i64 [ %indvars.iv.next.i.i29, %for.cond.i.i30 ], [ 0, %invoke.cont4 ]
+  %arrayidx.i.i27 = getelementptr inbounds i8* %call7, i64 %indvars.iv.i.i26
+  %0 = load i8* %arrayidx.i.i27, align 1
+  %indvars.iv.next.i.i29 = add nuw nsw i64 %indvars.iv.i.i26, 1
+  br label %for.cond.i.i30
+
+lpad:                                             ; preds = %invoke.cont4, %invoke.cont, %_ZN11CStringBaseIcEC2EPKc.exit.critedge
+  %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  resume { i8*, i32 } undef
+}
+
+declare i8* @getpass()
+
diff --git a/test/CodeGen/PowerPC/qpx-bv-sint.ll b/test/CodeGen/PowerPC/qpx-bv-sint.ll
new file mode 100644
index 0000000..0bc14ed
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-bv-sint.ll
@@ -0,0 +1,33 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s
+
+define void @s452() nounwind {
+entry:
+  br label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %entry
+  %conv.4 = sitofp i32 undef to double
+  %conv.5 = sitofp i32 undef to double
+  %mul.4.v.i0.1 = insertelement <2 x double> undef, double %conv.4, i32 0
+  %mul.4.v.i0.2 = insertelement <2 x double> %mul.4.v.i0.1, double %conv.5, i32 1
+  %mul.4 = fmul <2 x double> %mul.4.v.i0.2, undef
+  %add7.4 = fadd <2 x double> undef, %mul.4
+  store <2 x double> %add7.4, <2 x double>* undef, align 16
+  br i1 undef, label %for.end, label %for.body4
+
+for.end:                                          ; preds = %for.body4
+  unreachable
+; CHECK-LABEL: @s452
+; CHECK: lfiwax [[REG1:[0-9]+]],
+; CHECK: fcfid [[REG2:[0-9]+]], [[REG1]]
+; FIXME: We could 'promote' this to a vector earlier and remove this splat.
+; CHECK: qvesplati {{[0-9]+}}, [[REG2]], 0
+; CHECK: qvfmul
+; CHECK: qvfadd
+; CHECK: qvesplati {{[0-9]+}},
+; FIXME: We can use qvstfcdx here instead of two stores.
+; CHECK: stfd
+; CHECK: stfd
+}
+
diff --git a/test/CodeGen/PowerPC/qpx-bv.ll b/test/CodeGen/PowerPC/qpx-bv.ll
new file mode 100644
index 0000000..ae181de
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-bv.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mcpu=a2q | FileCheck %s
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+define <4 x double> @foo(double %f1, double %f2, double %f3, double %f4) {
+  %v1 = insertelement <4 x double> undef, double %f1, i32 0
+  %v2 = insertelement <4 x double> %v1,   double %f2, i32 1
+  %v3 = insertelement <4 x double> %v2,   double %f3, i32 2
+  %v4 = insertelement <4 x double> %v3,   double %f4, i32 3
+  ret <4 x double> %v4
+
+; CHECK-LABEL: @foo
+; CHECK: qvgpci [[REG1:[0-9]+]], 275
+; CHECK-DAG: qvgpci [[REG2:[0-9]+]], 101
+; CHECK-DAG: qvfperm [[REG3:[0-9]+]], 3, 4, [[REG1]]
+; CHECK-DAG: qvfperm [[REG4:[0-9]+]], 1, 2, [[REG1]]
+; CHECK-DAG: qvfperm 1, [[REG4]], [[REG3]], [[REG2]]
+; CHECK: blr
+}
+
+define <4 x float> @goo(float %f1, float %f2, float %f3, float %f4) {
+  %v1 = insertelement <4 x float> undef, float %f1, i32 0
+  %v2 = insertelement <4 x float> %v1,   float %f2, i32 1
+  %v3 = insertelement <4 x float> %v2,   float %f3, i32 2
+  %v4 = insertelement <4 x float> %v3,   float %f4, i32 3
+  ret <4 x float> %v4
+
+; CHECK-LABEL: @goo
+; CHECK: qvgpci [[REG1:[0-9]+]], 275
+; CHECK-DAG: qvgpci [[REG2:[0-9]+]], 101
+; CHECK-DAG: qvfperm [[REG3:[0-9]+]], 3, 4, [[REG1]]
+; CHECK-DAG: qvfperm [[REG4:[0-9]+]], 1, 2, [[REG1]]
+; CHECK-DAG: qvfperm 1, [[REG4]], [[REG3]], [[REG2]]
+; CHECK: blr
+}
+
diff --git a/test/CodeGen/PowerPC/qpx-func-clobber.ll b/test/CodeGen/PowerPC/qpx-func-clobber.ll
new file mode 100644
index 0000000..511fa38
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-func-clobber.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s
+target triple = "powerpc64-bgq-linux"
+
+declare <4 x double> @foo(<4 x double> %p)
+
+define <4 x double> @bar(<4 x double> %p, <4 x double> %q) {
+entry:
+  %v = call <4 x double> @foo(<4 x double> %p)
+  %w = call <4 x double> @foo(<4 x double> %q)
+  %x = fadd <4 x double> %v, %w
+  ret <4 x double> %x
+
+; CHECK-LABEL: @bar
+; CHECK: qvstfdx 2,
+; CHECK: bl foo
+; CHECK: qvstfdx 1,
+; CHECK: qvlfdx 1,
+; CHECK: bl foo
+; CHECK: qvlfdx [[REG:[0-9]+]],
+; CHECK: qvfadd 1, [[REG]], 1
+}
+
diff --git a/test/CodeGen/PowerPC/qpx-load.ll b/test/CodeGen/PowerPC/qpx-load.ll
new file mode 100644
index 0000000..bea3477
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-load.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s
+target triple = "powerpc64-bgq-linux"
+
+define <4 x double> @foo(<4 x double>* %p) {
+entry:
+  %v = load <4 x double>* %p, align 8
+  ret <4 x double> %v
+}
+
+; CHECK: @foo
+; CHECK-DAG: li [[REG1:[0-9]+]], 31
+; CHECK-DAG: qvlfdx [[REG4:[0-9]+]], 0, 3
+; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], 3, [[REG1]]
+; CHECK-DAG: qvlpcldx [[REG3:[0-9]+]], 0, 3
+; CHECK-DAG: qvfperm 1, [[REG4]], [[REG2]], [[REG3]]
+; CHECK: blr
+
+define <4 x double> @bar(<4 x double>* %p) {
+entry:
+  %v = load <4 x double>* %p, align 32
+  ret <4 x double> %v
+}
+
+; CHECK: @bar
+; CHECK: qvlfdx
+
diff --git a/test/CodeGen/PowerPC/qpx-recipest.ll b/test/CodeGen/PowerPC/qpx-recipest.ll
new file mode 100644
index 0000000..0e01358
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-recipest.ll
@@ -0,0 +1,194 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q | FileCheck -check-prefix=CHECK-SAFE %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+
+define <4 x double> @foo(<4 x double> %a, <4 x double> %b) nounwind {
+entry:
+  %x = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %b)
+  %r = fdiv <4 x double> %a, %x
+  ret <4 x double> %r
+
+; CHECK-LABEL: @foo
+; CHECK: qvfrsqrte
+; CHECK: qvfmul
+; FIXME: We're currently loading two constants here (1.5 and -1.5), and using
+;        an qvfmadd instead of a qvfnmsub
+; CHECK: qvfmadd
+; CHECK: qvfmadd
+; CHECK: qvfmul
+; CHECK: qvfmul
+; CHECK: qvfmadd
+; CHECK: qvfmul
+; CHECK: qvfmul
+; CHECK: blr
+
+; CHECK-SAFE-LABEL: @foo
+; CHECK-SAFE: fsqrt
+; CHECK-SAFE: fdiv
+; CHECK-SAFE: blr
+}
+
+define <4 x double> @foof(<4 x double> %a, <4 x float> %b) nounwind {
+entry:
+  %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b)
+  %y = fpext <4 x float> %x to <4 x double>
+  %r = fdiv <4 x double> %a, %y
+  ret <4 x double> %r
+
+; CHECK-LABEL: @foof
+; CHECK: qvfrsqrtes
+; CHECK: qvfmuls
+; FIXME: We're currently loading two constants here (1.5 and -1.5), and using
+;        an qvfmadd instead of a qvfnmsubs
+; CHECK: qvfmadds
+; CHECK: qvfmadds
+; CHECK: qvfmuls
+; CHECK: qvfmul
+; CHECK: blr
+
+; CHECK-SAFE-LABEL: @foof
+; CHECK-SAFE: fsqrts
+; CHECK-SAFE: fdiv
+; CHECK-SAFE: blr
+}
+
+define <4 x float> @food(<4 x float> %a, <4 x double> %b) nounwind {
+entry:
+  %x = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %b)
+  %y = fptrunc <4 x double> %x to <4 x float>
+  %r = fdiv <4 x float> %a, %y
+  ret <4 x float> %r
+
+; CHECK-LABEL: @food
+; CHECK: qvfrsqrte
+; CHECK: qvfmul
+; FIXME: We're currently loading two constants here (1.5 and -1.5), and using
+;        an qvfmadd instead of a qvfnmsub
+; CHECK: qvfmadd
+; CHECK: qvfmadd
+; CHECK: qvfmul
+; CHECK: qvfmul
+; CHECK: qvfmadd
+; CHECK: qvfmul
+; CHECK: qvfrsp
+; CHECK: qvfmuls
+; CHECK: blr
+
+; CHECK-SAFE-LABEL: @food
+; CHECK-SAFE: fsqrt
+; CHECK-SAFE: fdivs
+; CHECK-SAFE: blr
+}
+
+define <4 x float> @goo(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+  %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b)
+  %r = fdiv <4 x float> %a, %x
+  ret <4 x float> %r
+
+; CHECK-LABEL: @goo
+; CHECK: qvfrsqrtes
+; CHECK: qvfmuls
+; FIXME: We're currently loading two constants here (1.5 and -1.5), and using
+;        an qvfmadd instead of a qvfnmsubs
+; CHECK: qvfmadds
+; CHECK: qvfmadds
+; CHECK: qvfmuls
+; CHECK: qvfmuls
+; CHECK: blr
+
+; CHECK-SAFE-LABEL: @goo
+; CHECK-SAFE: fsqrts
+; CHECK-SAFE: fdivs
+; CHECK-SAFE: blr
+}
+
+define <4 x double> @foo2(<4 x double> %a, <4 x double> %b) nounwind {
+entry:
+  %r = fdiv <4 x double> %a, %b
+  ret <4 x double> %r
+
+; CHECK-LABEL: @foo2
+; CHECK: qvfre
+; CHECK: qvfnmsub
+; CHECK: qvfmadd
+; CHECK: qvfnmsub
+; CHECK: qvfmadd
+; CHECK: qvfmul
+; CHECK: blr
+
+; CHECK-SAFE-LABEL: @foo2
+; CHECK-SAFE: fdiv
+; CHECK-SAFE: blr
+}
+
+define <4 x float> @goo2(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+  %r = fdiv <4 x float> %a, %b
+  ret <4 x float> %r
+
+; CHECK-LABEL: @goo2
+; CHECK: qvfres
+; CHECK: qvfnmsubs
+; CHECK: qvfmadds
+; CHECK: qvfmuls
+; CHECK: blr
+
+; CHECK-SAFE-LABEL: @goo2
+; CHECK-SAFE: fdivs
+; CHECK-SAFE: blr
+}
+
+define <4 x double> @foo3(<4 x double> %a) nounwind {
+entry:
+  %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a)
+  ret <4 x double> %r
+
+; CHECK-LABEL: @foo3
+; CHECK: qvfrsqrte
+; CHECK: qvfmul
+; FIXME: We're currently loading two constants here (1.5 and -1.5), and using
+;        an qvfmadd instead of a qvfnmsub
+; CHECK-DAG: qvfmadd
+; CHECK-DAG: qvfcmpeq
+; CHECK-DAG: qvfmadd
+; CHECK-DAG: qvfmul
+; CHECK-DAG: qvfmul
+; CHECK-DAG: qvfmadd
+; CHECK-DAG: qvfmul
+; CHECK-DAG: qvfmul
+; CHECK: qvfsel
+; CHECK: blr
+
+; CHECK-SAFE-LABEL: @foo3
+; CHECK-SAFE: fsqrt
+; CHECK-SAFE: blr
+}
+
+define <4 x float> @goo3(<4 x float> %a) nounwind {
+entry:
+  %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
+  ret <4 x float> %r
+
+; CHECK-LABEL: @goo3
+; CHECK: qvfrsqrtes
+; CHECK: qvfmuls
+; FIXME: We're currently loading two constants here (1.5 and -1.5), and using
+;        an qvfmadds instead of a qvfnmsubs
+; CHECK-DAG: qvfmadds
+; CHECK-DAG: qvfcmpeq
+; CHECK-DAG: qvfmadds
+; CHECK-DAG: qvfmuls
+; CHECK-DAG: qvfmuls
+; CHECK: qvfsel
+; CHECK: blr
+
+; CHECK-SAFE-LABEL: @goo3
+; CHECK-SAFE: fsqrts
+; CHECK-SAFE: blr
+}
+
diff --git a/test/CodeGen/PowerPC/qpx-rounding-ops.ll b/test/CodeGen/PowerPC/qpx-rounding-ops.ll
new file mode 100644
index 0000000..6fdd8e6
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-rounding-ops.ll
@@ -0,0 +1,109 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q -enable-unsafe-fp-math | FileCheck -check-prefix=CHECK-FM %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define <4 x float> @test1(<4 x float> %x) nounwind  {
+  %call = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %x) nounwind readnone
+  ret <4 x float> %call
+
+; CHECK: test1:
+; CHECK: qvfrim 1, 1
+
+; CHECK-FM: test1:
+; CHECK-FM: qvfrim 1, 1
+}
+
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readnone
+
+define <4 x double> @test2(<4 x double> %x) nounwind  {
+  %call = tail call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
+  ret <4 x double> %call
+
+; CHECK: test2:
+; CHECK: qvfrim 1, 1
+
+; CHECK-FM: test2:
+; CHECK-FM: qvfrim 1, 1
+}
+
+declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone
+
+define <4 x float> @test3(<4 x float> %x) nounwind  {
+  %call = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x) nounwind readnone
+  ret <4 x float> %call
+
+; CHECK: test3:
+; CHECK-NOT: qvfrin
+
+; CHECK-FM: test3:
+; CHECK-FM-NOT: qvfrin
+}
+
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone
+
+define <4 x double> @test4(<4 x double> %x) nounwind  {
+  %call = tail call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x) nounwind readnone
+  ret <4 x double> %call
+
+; CHECK: test4:
+; CHECK-NOT: qvfrin
+
+; CHECK-FM: test4:
+; CHECK-FM-NOT: qvfrin
+}
+
+declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) nounwind readnone
+
+define <4 x float> @test5(<4 x float> %x) nounwind  {
+  %call = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone
+  ret <4 x float> %call
+
+; CHECK: test5:
+; CHECK: qvfrip 1, 1
+
+; CHECK-FM: test5:
+; CHECK-FM: qvfrip 1, 1
+}
+
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone
+
+define <4 x double> @test6(<4 x double> %x) nounwind  {
+  %call = tail call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
+  ret <4 x double> %call
+
+; CHECK: test6:
+; CHECK: qvfrip 1, 1
+
+; CHECK-FM: test6:
+; CHECK-FM: qvfrip 1, 1
+}
+
+declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone
+
+define <4 x float> @test9(<4 x float> %x) nounwind  {
+  %call = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone
+  ret <4 x float> %call
+
+; CHECK: test9:
+; CHECK: qvfriz 1, 1
+
+; CHECK-FM: test9:
+; CHECK-FM: qvfriz 1, 1
+}
+
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone
+
+define <4 x double> @test10(<4 x double> %x) nounwind  {
+  %call = tail call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone
+  ret <4 x double> %call
+
+; CHECK: test10:
+; CHECK: qvfriz 1, 1
+
+; CHECK-FM: test10:
+; CHECK-FM: qvfriz 1, 1
+}
+
+declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone
+
diff --git a/test/CodeGen/PowerPC/qpx-s-load.ll b/test/CodeGen/PowerPC/qpx-s-load.ll
new file mode 100644
index 0000000..1ca0ae6
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-s-load.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s
+target triple = "powerpc64-bgq-linux"
+
+define <4 x float> @foo(<4 x float>* %p) {
+entry:
+  %v = load <4 x float>* %p, align 4
+  ret <4 x float> %v
+}
+
+; CHECK: @foo
+; CHECK-DAG: li [[REG1:[0-9]+]], 15
+; CHECK-DAG: qvlfsx [[REG4:[0-9]+]], 0, 3
+; CHECK-DAG: qvlfsx [[REG2:[0-9]+]], 3, [[REG1]]
+; CHECK-DAG: qvlpclsx [[REG3:[0-9]+]], 0, 3
+; CHECK-DAG: qvfperm 1, [[REG4]], [[REG2]], [[REG3]]
+; CHECK: blr
+
+define <4 x float> @bar(<4 x float>* %p) {
+entry:
+  %v = load <4 x float>* %p, align 16
+  ret <4 x float> %v
+}
+
+; CHECK: @bar
+; CHECK: qvlfsx
+
diff --git a/test/CodeGen/PowerPC/qpx-s-sel.ll b/test/CodeGen/PowerPC/qpx-s-sel.ll
new file mode 100644
index 0000000..e3a2dd9
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-s-sel.ll
@@ -0,0 +1,144 @@
+; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s
+target triple = "powerpc64-bgq-linux"
+
+@Q = constant <4 x i1> <i1 0, i1 undef, i1 1, i1 1>, align 16
+@R = global <4 x i1> <i1 0, i1 0, i1 0, i1 0>, align 16
+
+define <4 x float> @test1(<4 x float> %a, <4 x float> %b, <4 x i1> %c) nounwind readnone {
+entry:
+  %r = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %r
+
+; CHECK-LABEL: @test1
+; CHECK: qvfsel 1, 3, 1, 2
+; CHECK: blr
+}
+
+define <4 x float> @test2(<4 x float> %a, <4 x float> %b, i1 %c1, i1 %c2, i1 %c3, i1 %c4) nounwind readnone {
+entry:
+  %v = insertelement <4 x i1> undef, i1 %c1, i32 0
+  %v2 = insertelement <4 x i1> %v, i1 %c2, i32 1
+  %v3 = insertelement <4 x i1> %v2, i1 %c3, i32 2
+  %v4 = insertelement <4 x i1> %v3, i1 %c4, i32 3
+  %r = select <4 x i1> %v4, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %r
+
+; CHECK-LABEL: @test2
+; CHECK: stw
+; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]],
+; CHECK-DAG: qvlfdx [[REG2:[0-9]+]],
+; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]]
+; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]]
+; CHECK: qvfsel 1, [[REG4]], 1, 2
+; CHECK: blr
+}
+
+define <4 x i1> @test3(<4 x i1> %a) nounwind readnone {
+entry:
+  %v = and <4 x i1> %a, <i1 0, i1 undef, i1 1, i1 1>
+  ret <4 x i1> %v
+
+; CHECK-LABEL: @test3
+; CHECK: qvlfsx [[REG:[0-9]+]],
+; qvflogical 1, 1, [[REG]], 1
+; blr
+}
+
+define <4 x i1> @test4(<4 x i1> %a) nounwind {
+entry:
+  %q = load <4 x i1>* @Q, align 16
+  %v = and <4 x i1> %a, %q
+  ret <4 x i1> %v
+
+; CHECK-LABEL: @test4
+; CHECK-DAG: lbz
+; CHECK-DAG: qvlfdx [[REG1:[0-9]+]],
+; CHECK-DAG: stw
+; CHECK-DAG: qvlfiwzx [[REG2:[0-9]+]],
+; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG2]]
+; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG1]]
+; CHECK: qvflogical 1, 1, [[REG4]], 1
+; CHECK: blr
+}
+
+define void @test5(<4 x i1> %a) nounwind {
+entry:
+  store <4 x i1> %a, <4 x i1>* @R
+  ret void
+
+; CHECK-LABEL: @test5
+; CHECK: qvlfdx [[REG1:[0-9]+]],
+; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]]
+; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]]
+; CHECK: qvstfiwx [[REG3]],
+; CHECK: lwz
+; CHECK: stb
+; CHECK: blr
+}
+
+define i1 @test6(<4 x i1> %a) nounwind {
+entry:
+  %r = extractelement <4 x i1> %a, i32 2
+  ret i1 %r
+
+; CHECK-LABEL: @test6
+; CHECK: qvlfdx [[REG1:[0-9]+]],
+; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]]
+; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]]
+; CHECK: qvstfiwx [[REG3]],
+; CHECK: lwz
+; CHECK: blr
+}
+
+define i1 @test7(<4 x i1> %a) nounwind {
+entry:
+  %r = extractelement <4 x i1> %a, i32 2
+  %s = extractelement <4 x i1> %a, i32 3
+  %q = and i1 %r, %s
+  ret i1 %q
+
+; CHECK-LABEL: @test7
+; CHECK: qvlfdx [[REG1:[0-9]+]],
+; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]]
+; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]]
+; CHECK: qvstfiwx [[REG3]],
+; CHECK-DAG: lwz [[REG4:[0-9]+]],
+; FIXME: We're storing the vector twice, and that's silly.
+; CHECK-DAG: qvstfiwx [[REG3]],
+; CHECK: lwz [[REG5:[0-9]+]],
+; CHECK: and 3,
+; CHECK: blr
+}
+
+define i1 @test8(<3 x i1> %a) nounwind {
+entry:
+  %r = extractelement <3 x i1> %a, i32 2
+  ret i1 %r
+
+; CHECK-LABEL: @test8
+; CHECK: qvlfdx [[REG1:[0-9]+]],
+; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]]
+; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]]
+; CHECK: qvstfiwx [[REG3]],
+; CHECK: lwz
+; CHECK: blr
+}
+
+define <3 x float> @test9(<3 x float> %a, <3 x float> %b, i1 %c1, i1 %c2, i1 %c3) nounwind readnone {
+entry:
+  %v = insertelement <3 x i1> undef, i1 %c1, i32 0
+  %v2 = insertelement <3 x i1> %v, i1 %c2, i32 1
+  %v3 = insertelement <3 x i1> %v2, i1 %c3, i32 2
+  %r = select <3 x i1> %v3, <3 x float> %a, <3 x float> %b
+  ret <3 x float> %r
+
+; CHECK-LABEL: @test9
+; CHECK: stw
+; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]],
+; CHECK-DAG: qvlfdx [[REG2:[0-9]+]],
+; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]]
+; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]]
+; CHECK: qvfsel 1, [[REG4]], 1, 2
+; CHECK: blr
+}
+
diff --git a/test/CodeGen/PowerPC/qpx-s-store.ll b/test/CodeGen/PowerPC/qpx-s-store.ll
new file mode 100644
index 0000000..0bd6201
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-s-store.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s
+target triple = "powerpc64-bgq-linux"
+
+define void @foo(<4 x float> %v, <4 x float>* %p) {
+entry:
+  store <4 x float> %v, <4 x float>* %p, align 4
+  ret void
+}
+
+; CHECK: @foo
+; CHECK: stfs
+; CHECK: stfs
+; CHECK: stfs
+; CHECK: stfs
+; CHECK: blr
+
+define void @bar(<4 x float> %v, <4 x float>* %p) {
+entry:
+  store <4 x float> %v, <4 x float>* %p, align 16
+  ret void
+}
+
+; CHECK: @bar
+; CHECK: qvstfsx
+
diff --git a/test/CodeGen/PowerPC/qpx-sel.ll b/test/CodeGen/PowerPC/qpx-sel.ll
new file mode 100644
index 0000000..6822735
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-sel.ll
@@ -0,0 +1,152 @@
+; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s
+target triple = "powerpc64-bgq-linux"
+
+@Q = constant <4 x i1> <i1 0, i1 undef, i1 1, i1 1>, align 16
+@R = global <4 x i1> <i1 0, i1 0, i1 0, i1 0>, align 16
+
+define <4 x double> @test1(<4 x double> %a, <4 x double> %b, <4 x i1> %c) nounwind readnone {
+entry:
+  %r = select <4 x i1> %c, <4 x double> %a, <4 x double> %b
+  ret <4 x double> %r
+
+; CHECK-LABEL: @test1
+; CHECK: qvfsel 1, 3, 1, 2
+; CHECK: blr
+}
+
+define <4 x double> @test2(<4 x double> %a, <4 x double> %b, i1 %c1, i1 %c2, i1 %c3, i1 %c4) nounwind readnone {
+entry:
+  %v = insertelement <4 x i1> undef, i1 %c1, i32 0
+  %v2 = insertelement <4 x i1> %v, i1 %c2, i32 1
+  %v3 = insertelement <4 x i1> %v2, i1 %c3, i32 2
+  %v4 = insertelement <4 x i1> %v3, i1 %c4, i32 3
+  %r = select <4 x i1> %v4, <4 x double> %a, <4 x double> %b
+  ret <4 x double> %r
+
+; CHECK-LABEL: @test2
+
+; FIXME: This load/store sequence is unnecessary.
+; CHECK-DAG: lbz
+; CHECK-DAG: stw
+
+; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]],
+; CHECK-DAG: qvlfdx [[REG2:[0-9]+]],
+; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]]
+; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]]
+; CHECK: qvfsel 1, [[REG4]], 1, 2
+; CHECK: blr
+}
+
+define <4 x i1> @test3(<4 x i1> %a) nounwind readnone {
+entry:
+  %v = and <4 x i1> %a, <i1 0, i1 undef, i1 1, i1 1>
+  ret <4 x i1> %v
+
+; CHECK-LABEL: @test3
+; CHECK: qvlfsx [[REG:[0-9]+]],
+; qvflogical 1, 1, [[REG]], 1
+; blr
+}
+
+define <4 x i1> @test4(<4 x i1> %a) nounwind {
+entry:
+  %q = load <4 x i1>* @Q, align 16
+  %v = and <4 x i1> %a, %q
+  ret <4 x i1> %v
+
+; CHECK-LABEL: @test4
+; CHECK-DAG: lbz
+; CHECK-DAG: qvlfdx [[REG1:[0-9]+]],
+; CHECK-DAG: stw
+; CHECK-DAG: qvlfiwzx [[REG2:[0-9]+]],
+; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG2]]
+; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG1]]
+; CHECK: qvflogical 1, 1, [[REG4]], 1
+; CHECK: blr
+}
+
+define void @test5(<4 x i1> %a) nounwind {
+entry:
+  store <4 x i1> %a, <4 x i1>* @R
+  ret void
+
+; CHECK-LABEL: @test5
+; CHECK: qvlfdx [[REG1:[0-9]+]],
+; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]]
+; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]]
+; CHECK: qvstfiwx [[REG3]],
+; CHECK: lwz
+; CHECK: stb
+; CHECK: blr
+}
+
+define i1 @test6(<4 x i1> %a) nounwind {
+entry:
+  %r = extractelement <4 x i1> %a, i32 2
+  ret i1 %r
+
+; CHECK-LABEL: @test6
+; CHECK: qvlfdx [[REG1:[0-9]+]],
+; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]]
+; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]]
+; CHECK: qvstfiwx [[REG3]],
+; CHECK: lwz
+; CHECK: blr
+}
+
+define i1 @test7(<4 x i1> %a) nounwind {
+entry:
+  %r = extractelement <4 x i1> %a, i32 2
+  %s = extractelement <4 x i1> %a, i32 3
+  %q = and i1 %r, %s
+  ret i1 %q
+
+; CHECK-LABEL: @test7
+; CHECK: qvlfdx [[REG1:[0-9]+]],
+; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]]
+; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]]
+; CHECK: qvstfiwx [[REG3]],
+; CHECK-DAG: lwz [[REG4:[0-9]+]],
+; FIXME: We're storing the vector twice, and that's silly.
+; CHECK-DAG: qvstfiwx [[REG3]],
+; CHECK-DAG: lwz [[REG5:[0-9]+]],
+; CHECK: and 3,
+; CHECK: blr
+}
+
+define i1 @test8(<3 x i1> %a) nounwind {
+entry:
+  %r = extractelement <3 x i1> %a, i32 2
+  ret i1 %r
+
+; CHECK-LABEL: @test8
+; CHECK: qvlfdx [[REG1:[0-9]+]],
+; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]]
+; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]]
+; CHECK: qvstfiwx [[REG3]],
+; CHECK: lwz
+; CHECK: blr
+}
+
+define <3 x double> @test9(<3 x double> %a, <3 x double> %b, i1 %c1, i1 %c2, i1 %c3) nounwind readnone {
+entry:
+  %v = insertelement <3 x i1> undef, i1 %c1, i32 0
+  %v2 = insertelement <3 x i1> %v, i1 %c2, i32 1
+  %v3 = insertelement <3 x i1> %v2, i1 %c3, i32 2
+  %r = select <3 x i1> %v3, <3 x double> %a, <3 x double> %b
+  ret <3 x double> %r
+
+; CHECK-LABEL: @test9
+
+; FIXME: This load/store sequence is unnecessary.
+; CHECK-DAG: lbz
+; CHECK-DAG: stw
+
+; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]],
+; CHECK-DAG: qvlfdx [[REG2:[0-9]+]],
+; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]]
+; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]]
+; CHECK: qvfsel 1, [[REG4]], 1, 2
+; CHECK: blr
+}
+
diff --git a/test/CodeGen/PowerPC/qpx-store.ll b/test/CodeGen/PowerPC/qpx-store.ll
new file mode 100644
index 0000000..2579d2c
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-store.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s
+target triple = "powerpc64-bgq-linux"
+
+define void @foo(<4 x double> %v, <4 x double>* %p) {
+entry:
+  store <4 x double> %v, <4 x double>* %p, align 8
+  ret void
+}
+
+; CHECK: @foo
+; CHECK: stfd
+; CHECK: stfd
+; CHECK: stfd
+; CHECK: stfd
+; CHECK: blr
+
+define void @bar(<4 x double> %v, <4 x double>* %p) {
+entry:
+  store <4 x double> %v, <4 x double>* %p, align 32
+  ret void
+}
+
+; CHECK: @bar
+; CHECK: qvstfdx
+
diff --git a/test/CodeGen/PowerPC/qpx-unalperm.ll b/test/CodeGen/PowerPC/qpx-unalperm.ll
new file mode 100644
index 0000000..e765b46
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-unalperm.ll
@@ -0,0 +1,64 @@
+; RUN: llc < %s -mcpu=a2q | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+define <4 x double> @foo(<4 x double>* %a) {
+entry:
+  %r = load <4 x double>* %a, align 32
+  ret <4 x double> %r
+; CHECK: qvlfdx
+; CHECK: blr
+}
+
+define <4 x double> @bar(<4 x double>* %a) {
+entry:
+  %r = load <4 x double>* %a, align 8
+  %b = getelementptr <4 x double>* %a, i32 16
+  %s = load <4 x double>* %b, align 32
+  %t = fadd <4 x double> %r, %s
+  ret <4 x double> %t
+; CHECK: qvlpcldx
+; CHECK: qvlfdx
+; CHECK: qvfperm
+; CHECK: blr
+}
+
+define <4 x double> @bar1(<4 x double>* %a) {
+entry:
+  %r = load <4 x double>* %a, align 8
+  %b = getelementptr <4 x double>* %a, i32 16
+  %s = load <4 x double>* %b, align 8
+  %t = fadd <4 x double> %r, %s
+  ret <4 x double> %t
+}
+
+define <4 x double> @bar2(<4 x double>* %a) {
+entry:
+  %r = load <4 x double>* %a, align 8
+  %b = getelementptr <4 x double>* %a, i32 1
+  %s = load <4 x double>* %b, align 32
+  %t = fadd <4 x double> %r, %s
+  ret <4 x double> %t
+}
+
+define <4 x double> @bar3(<4 x double>* %a) {
+entry:
+  %r = load <4 x double>* %a, align 8
+  %b = getelementptr <4 x double>* %a, i32 1
+  %s = load <4 x double>* %b, align 8
+  %t = fadd <4 x double> %r, %s
+  ret <4 x double> %t
+}
+
+define <4 x double> @bar4(<4 x double>* %a) {
+entry:
+  %r = load <4 x double>* %a, align 8
+  %b = getelementptr <4 x double>* %a, i32 1
+  %s = load <4 x double>* %b, align 8
+  %c = getelementptr <4 x double>* %b, i32 1
+  %t = load <4 x double>* %c, align 8
+  %u = fadd <4 x double> %r, %s
+  %v = fadd <4 x double> %u, %t
+  ret <4 x double> %v
+}
+
diff --git a/test/CodeGen/PowerPC/retaddr2.ll b/test/CodeGen/PowerPC/retaddr2.ll
new file mode 100644
index 0000000..8581f6c
--- /dev/null
+++ b/test/CodeGen/PowerPC/retaddr2.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define i8* @test1() #0 {
+entry:
+  %0 = tail call i8* @llvm.returnaddress(i32 0)
+  ret i8* %0
+}
+
+; CHECK-LABEL: @test1
+; CHECK: mflr 0
+; CHECK: std 0, 16(1)
+; CHECK-DAG: ld 3, 64(1)
+; CHECK-DAG: ld 0, 16(1)
+; CHECK: mtlr 0
+; CHECK: blr
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.returnaddress(i32) #0
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/rlwimi-and.ll b/test/CodeGen/PowerPC/rlwimi-and.ll
index 213363e..9433f8e 100644
--- a/test/CodeGen/PowerPC/rlwimi-and.ll
+++ b/test/CodeGen/PowerPC/rlwimi-and.ll
@@ -28,11 +28,9 @@ codeRepl17:                                       ; preds = %codeRepl4
   store i16 %rvml38.sroa.0.0.insert.insert, i16* undef, align 2
   unreachable
 
-; FIXME: the SLWI could be folded into the RLWIMI to give a rotate of 8.
 ; CHECK: @test
-; CHECK-DAG: slwi [[R1:[0-9]+]], {{[0-9]+}}, 31
-; CHECK-DAG: rlwinm [[R2:[0-9]+]], {{[0-9]+}}, 0, 31, 31
-; CHECK: rlwimi [[R2]], [[R1]], 9, 23, 23
+; CHECK: rlwinm [[R1:[0-9]+]], {{[0-9]+}}, 0, 31, 31
+; CHECK: rlwimi [[R1]], {{[0-9]+}}, 8, 23, 23
 
 codeRepl29:                                       ; preds = %codeRepl1
   unreachable
diff --git a/test/CodeGen/PowerPC/rlwimi2.ll b/test/CodeGen/PowerPC/rlwimi2.ll
index 1bee4e0..7978718 100644
--- a/test/CodeGen/PowerPC/rlwimi2.ll
+++ b/test/CodeGen/PowerPC/rlwimi2.ll
@@ -1,7 +1,7 @@
 ; All of these ands and shifts should be folded into rlwimi's
 ; RUN: llc < %s -march=ppc32 -o %t
-; RUN: grep rlwimi %t | count 3
-; RUN: grep srwi   %t | count 1
+; RUN: grep rlwimi %t | count 4
+; RUN: not grep srwi %t
 ; RUN: not grep slwi %t
 
 define i16 @test1(i32 %srcA, i32 %srcB, i32 %alpha) nounwind {
diff --git a/test/CodeGen/PowerPC/rm-zext.ll b/test/CodeGen/PowerPC/rm-zext.ll
new file mode 100644
index 0000000..33995e1
--- /dev/null
+++ b/test/CodeGen/PowerPC/rm-zext.ll
@@ -0,0 +1,89 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define signext i32 @foo(i32 signext %a) #0 {
+entry:
+  %mul = mul nsw i32 %a, %a
+  %shr2 = lshr i32 %mul, 5
+  ret i32 %shr2
+
+; CHECK-LABEL @foo
+; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
+; CHECK: blr
+}
+
+define zeroext i32 @test6(i32 zeroext %x) #0 {
+entry:
+  %and = lshr i32 %x, 16
+  %shr = and i32 %and, 255
+  %and1 = shl i32 %x, 16
+  %shl = and i32 %and1, 16711680
+  %or = or i32 %shr, %shl
+  ret i32 %or
+
+; CHECK-LABEL @test6
+; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
+; CHECK: blr
+}
+
+define zeroext i32 @min(i32 zeroext %a, i32 zeroext %b) #0 {
+entry:
+  %cmp = icmp ule i32 %a, %b
+  %cond = select i1 %cmp, i32 %a, i32 %b
+  ret i32 %cond
+
+; CHECK-LABEL @min
+; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.bswap.i32(i32) #0
+
+; Function Attrs: nounwind readonly
+define zeroext i32 @bs32(i32* nocapture readonly %x) #1 {
+entry:
+  %0 = load i32* %x, align 4
+  %1 = tail call i32 @llvm.bswap.i32(i32 %0)
+  ret i32 %1
+
+; CHECK-LABEL: @bs32
+; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readonly
+define zeroext i16 @bs16(i16* nocapture readonly %x) #1 {
+entry:
+  %0 = load i16* %x, align 2
+  %1 = tail call i16 @llvm.bswap.i16(i16 %0)
+  ret i16 %1
+
+; CHECK-LABEL: @bs16
+; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+declare i16 @llvm.bswap.i16(i16) #0
+
+; Function Attrs: nounwind readnone
+define zeroext i32 @ctlz32(i32 zeroext %x) #0 {
+entry:
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+  ret i32 %0
+
+; CHECK-LABEL: @ctlz32
+; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ctlz.i32(i32, i1) #0
+
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind readonly }
+
diff --git a/test/CodeGen/PowerPC/sdiv-pow2.ll b/test/CodeGen/PowerPC/sdiv-pow2.ll
new file mode 100644
index 0000000..5ec019d
--- /dev/null
+++ b/test/CodeGen/PowerPC/sdiv-pow2.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc < %s | FileCheck -check-prefix=CHECK-32 %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define signext i32 @foo4(i32 signext %a) #0 {
+entry:
+  %div = sdiv i32 %a, 8
+  ret i32 %div
+
+; CHECK-LABEL @foo4
+; CHECK: srawi [[REG1:[0-9]+]], 3, 3
+; CHECK: addze [[REG2:[0-9]+]], [[REG1]]
+; CHECK: extsw 3, [[REG2]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define i64 @foo8(i64 %a) #0 {
+entry:
+  %div = sdiv i64 %a, 8
+  ret i64 %div
+
+; CHECK-LABEL @foo8
+; CHECK: sradi [[REG1:[0-9]+]], 3, 3
+; CHECK: addze 3, [[REG1]]
+; CHECK: blr
+
+; CHECK-32-LABEL @foo8
+; CHECK-32-NOT: sradi
+; CHECK-32: blr
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @foo4n(i32 signext %a) #0 {
+entry:
+  %div = sdiv i32 %a, -8
+  ret i32 %div
+
+; CHECK-LABEL: @foo4n
+; CHECK: srawi [[REG1:[0-9]+]], 3, 3
+; CHECK: addze [[REG2:[0-9]+]], [[REG1]]
+; CHECK: neg [[REG3:[0-9]+]], [[REG2]]
+; CHECK: extsw 3, [[REG3]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define i64 @foo8n(i64 %a) #0 {
+entry:
+  %div = sdiv i64 %a, -8
+  ret i64 %div
+
+; CHECK-LABEL: @foo8n
+; CHECK: sradi [[REG1:[0-9]+]], 3, 3
+; CHECK: addze [[REG2:[0-9]+]], [[REG1]]
+; CHECK: neg 3, [[REG2]]
+; CHECK: blr
+
+; CHECK-32-LABEL @foo8n
+; CHECK-32-NOT: sradi
+; CHECK-32: blr
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/stack-realign.ll b/test/CodeGen/PowerPC/stack-realign.ll
index a59fceb..762f50a 100644
--- a/test/CodeGen/PowerPC/stack-realign.ll
+++ b/test/CodeGen/PowerPC/stack-realign.ll
@@ -37,6 +37,7 @@ entry:
 ; CHECK-DAG: subfic 0, [[REG]], -160
 ; CHECK: stdux 1, 1, 0
 
+; CHECK: .cfi_def_cfa_register r30
 ; CHECK: .cfi_offset r30, -16
 ; CHECK: .cfi_offset lr, 16
 
@@ -59,6 +60,7 @@ entry:
 ; CHECK-FP-DAG: subfic 0, [[REG]], -160
 ; CHECK-FP: stdux 1, 1, 0
 
+; CHECK-FP: .cfi_def_cfa_register r30
 ; CHECK-FP: .cfi_offset r31, -8
 ; CHECK-FP: .cfi_offset r30, -16
 ; CHECK-FP: .cfi_offset lr, 16
@@ -120,6 +122,8 @@ entry:
 ; CHECK-DAG: subfc 0, [[REG3]], [[REG2]]
 ; CHECK: stdux 1, 1, 0
 
+; CHECK: .cfi_def_cfa_register r30
+
 ; CHECK: blr
 
 ; CHECK-32-LABEL: @hoo
@@ -178,6 +182,8 @@ entry:
 ; CHECK-DAG: subfic 0, [[REG]], -192
 ; CHECK: stdux 1, 1, 0
 
+; CHECK: .cfi_def_cfa_register r30
+
 ; CHECK: stfd 30, -16(30)
 
 ; CHECK: blr
@@ -193,6 +199,8 @@ entry:
 ; CHECK-FP-DAG: subfic 0, [[REG]], -192
 ; CHECK-FP: stdux 1, 1, 0
 
+; CHECK-FP: .cfi_def_cfa_register r30
+
 ; CHECK-FP: stfd 30, -16(30)
 
 ; CHECK-FP: blr
diff --git a/test/CodeGen/PowerPC/subreg-postra-2.ll b/test/CodeGen/PowerPC/subreg-postra-2.ll
new file mode 100644
index 0000000..2faaa61
--- /dev/null
+++ b/test/CodeGen/PowerPC/subreg-postra-2.ll
@@ -0,0 +1,175 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @jbd2_journal_commit_transaction() #0 {
+entry:
+  br i1 undef, label %do.body, label %if.then5
+
+if.then5:                                         ; preds = %entry
+  unreachable
+
+do.body:                                          ; preds = %entry
+  br i1 undef, label %do.body.i, label %trace_jbd2_start_commit.exit
+
+do.body.i:                                        ; preds = %do.body
+  unreachable
+
+trace_jbd2_start_commit.exit:                     ; preds = %do.body
+  br i1 undef, label %do.body.i1116, label %trace_jbd2_commit_locking.exit
+
+do.body.i1116:                                    ; preds = %trace_jbd2_start_commit.exit
+  unreachable
+
+trace_jbd2_commit_locking.exit:                   ; preds = %trace_jbd2_start_commit.exit
+  br i1 undef, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %trace_jbd2_commit_locking.exit
+  unreachable
+
+while.end:                                        ; preds = %trace_jbd2_commit_locking.exit
+  br i1 undef, label %spin_unlock.exit1146, label %if.then.i.i.i.i1144
+
+if.then.i.i.i.i1144:                              ; preds = %while.end
+  unreachable
+
+spin_unlock.exit1146:                             ; preds = %while.end
+  br i1 undef, label %spin_unlock.exit1154, label %if.then.i.i.i.i1152
+
+if.then.i.i.i.i1152:                              ; preds = %spin_unlock.exit1146
+  unreachable
+
+spin_unlock.exit1154:                             ; preds = %spin_unlock.exit1146
+  br i1 undef, label %do.body.i1159, label %trace_jbd2_commit_flushing.exit
+
+do.body.i1159:                                    ; preds = %spin_unlock.exit1154
+  br i1 undef, label %if.end.i1166, label %do.body5.i1165
+
+do.body5.i1165:                                   ; preds = %do.body.i1159
+  unreachable
+
+if.end.i1166:                                     ; preds = %do.body.i1159
+  unreachable
+
+trace_jbd2_commit_flushing.exit:                  ; preds = %spin_unlock.exit1154
+  br i1 undef, label %for.end.i, label %for.body.lr.ph.i
+
+for.body.lr.ph.i:                                 ; preds = %trace_jbd2_commit_flushing.exit
+  unreachable
+
+for.end.i:                                        ; preds = %trace_jbd2_commit_flushing.exit
+  br i1 undef, label %journal_submit_data_buffers.exit, label %if.then.i.i.i.i31.i
+
+if.then.i.i.i.i31.i:                              ; preds = %for.end.i
+  br label %journal_submit_data_buffers.exit
+
+journal_submit_data_buffers.exit:                 ; preds = %if.then.i.i.i.i31.i, %for.end.i
+  br i1 undef, label %if.end103, label %if.then102
+
+if.then102:                                       ; preds = %journal_submit_data_buffers.exit
+  unreachable
+
+if.end103:                                        ; preds = %journal_submit_data_buffers.exit
+  br i1 undef, label %do.body.i1182, label %trace_jbd2_commit_logging.exit
+
+do.body.i1182:                                    ; preds = %if.end103
+  br i1 undef, label %if.end.i1189, label %do.body5.i1188
+
+do.body5.i1188:                                   ; preds = %do.body5.i1188, %do.body.i1182
+  br i1 undef, label %if.end.i1189, label %do.body5.i1188
+
+if.end.i1189:                                     ; preds = %do.body5.i1188, %do.body.i1182
+  unreachable
+
+trace_jbd2_commit_logging.exit:                   ; preds = %if.end103
+  br label %while.cond129.outer1451
+
+while.cond129.outer1451:                          ; preds = %start_journal_io, %trace_jbd2_commit_logging.exit
+  br label %while.cond129
+
+while.cond129:                                    ; preds = %if.then135, %while.cond129.outer1451
+  br i1 undef, label %while.end246, label %if.then135
+
+if.then135:                                       ; preds = %while.cond129
+  br i1 undef, label %start_journal_io, label %while.cond129
+
+start_journal_io:                                 ; preds = %if.then135
+  br label %while.cond129.outer1451
+
+while.end246:                                     ; preds = %while.cond129
+  br i1 undef, label %for.end.i1287, label %for.body.i1277
+
+for.body.i1277:                                   ; preds = %while.end246
+  unreachable
+
+for.end.i1287:                                    ; preds = %while.end246
+  br i1 undef, label %journal_finish_inode_data_buffers.exit, label %if.then.i.i.i.i84.i
+
+if.then.i.i.i.i84.i:                              ; preds = %for.end.i1287
+  unreachable
+
+journal_finish_inode_data_buffers.exit:           ; preds = %for.end.i1287
+  br i1 undef, label %if.end256, label %if.then249
+
+if.then249:                                       ; preds = %journal_finish_inode_data_buffers.exit
+  unreachable
+
+if.end256:                                        ; preds = %journal_finish_inode_data_buffers.exit
+  br label %while.body318
+
+while.body318:                                    ; preds = %wait_on_buffer.exit, %if.end256
+  br i1 undef, label %wait_on_buffer.exit, label %if.then.i1296
+
+if.then.i1296:                                    ; preds = %while.body318
+  br label %wait_on_buffer.exit
+
+wait_on_buffer.exit:                              ; preds = %if.then.i1296, %while.body318
+  br i1 undef, label %do.body378, label %while.body318
+
+do.body378:                                       ; preds = %wait_on_buffer.exit
+  br i1 undef, label %while.end418, label %while.body392.lr.ph
+
+while.body392.lr.ph:                              ; preds = %do.body378
+  br label %while.body392
+
+while.body392:                                    ; preds = %wait_on_buffer.exit1319, %while.body392.lr.ph
+  %0 = load i8** undef, align 8
+  %add.ptr399 = getelementptr inbounds i8* %0, i64 -72
+  %b_state.i.i1314 = bitcast i8* %add.ptr399 to i64*
+  %tobool.i1316 = icmp eq i64 undef, 0
+  br i1 %tobool.i1316, label %wait_on_buffer.exit1319, label %if.then.i1317
+
+if.then.i1317:                                    ; preds = %while.body392
+  unreachable
+
+wait_on_buffer.exit1319:                          ; preds = %while.body392
+  %1 = load volatile i64* %b_state.i.i1314, align 8
+  %conv.i.i1322 = and i64 %1, 1
+  %lnot404 = icmp eq i64 %conv.i.i1322, 0
+  %.err.4 = select i1 %lnot404, i32 -5, i32 undef
+  %2 = call i64 asm sideeffect "1:.long 0x7c0000a8 $| ((($0) & 0x1f) << 21) $| (((0) & 0x1f) << 16) $| ((($3) & 0x1f) << 11) $| (((0) & 0x1) << 0) \0Aandc $0,$0,$2\0Astdcx. $0,0,$3\0Abne- 1b\0A", "=&r,=*m,r,r,*m,~{cc},~{memory}"(i64* %b_state.i.i1314, i64 262144, i64* %b_state.i.i1314, i64* %b_state.i.i1314) #0
+  store i8* %0, i8** undef, align 8
+  %cmp.i1312 = icmp eq i32* undef, undef
+  br i1 %cmp.i1312, label %while.end418, label %while.body392
+
+while.end418:                                     ; preds = %wait_on_buffer.exit1319, %do.body378
+  %err.4.lcssa = phi i32 [ undef, %do.body378 ], [ %.err.4, %wait_on_buffer.exit1319 ]
+  %tobool419 = icmp eq i32 %err.4.lcssa, 0
+  br i1 %tobool419, label %if.end421, label %if.then420
+
+; CHECK-LABEL: @jbd2_journal_commit_transaction
+; CHECK: andi.
+; CHECK: cror [[REG:[0-9]+]], 1, 1
+; CHECK: stdcx.
+; CHECK: isel {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, [[REG]]
+
+if.then420:                                       ; preds = %while.end418
+  unreachable
+
+if.end421:                                        ; preds = %while.end418
+  unreachable
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/subreg-postra.ll b/test/CodeGen/PowerPC/subreg-postra.ll
new file mode 100644
index 0000000..b10fa66
--- /dev/null
+++ b/test/CodeGen/PowerPC/subreg-postra.ll
@@ -0,0 +1,168 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @jbd2_journal_commit_transaction(i32* %journal) #0 {
+entry:
+  br i1 undef, label %do.body, label %if.then5
+
+if.then5:                                         ; preds = %entry
+  unreachable
+
+do.body:                                          ; preds = %entry
+  br i1 undef, label %do.body.i, label %trace_jbd2_start_commit.exit
+
+do.body.i:                                        ; preds = %do.body
+  unreachable
+
+trace_jbd2_start_commit.exit:                     ; preds = %do.body
+  br i1 undef, label %do.body.i1116, label %trace_jbd2_commit_locking.exit
+
+do.body.i1116:                                    ; preds = %trace_jbd2_start_commit.exit
+  br i1 undef, label %if.end.i1123, label %do.body5.i1122
+
+do.body5.i1122:                                   ; preds = %do.body.i1116
+  unreachable
+
+if.end.i1123:                                     ; preds = %do.body.i1116
+  br label %trace_jbd2_commit_locking.exit
+
+trace_jbd2_commit_locking.exit:                   ; preds = %if.end.i1123, %trace_jbd2_start_commit.exit
+  br i1 undef, label %spin_unlock.exit1146, label %if.then.i.i.i.i1144
+
+if.then.i.i.i.i1144:                              ; preds = %trace_jbd2_commit_locking.exit
+  unreachable
+
+spin_unlock.exit1146:                             ; preds = %trace_jbd2_commit_locking.exit
+  br i1 undef, label %spin_unlock.exit1154, label %if.then.i.i.i.i1152
+
+if.then.i.i.i.i1152:                              ; preds = %spin_unlock.exit1146
+  br label %spin_unlock.exit1154
+
+spin_unlock.exit1154:                             ; preds = %if.then.i.i.i.i1152, %spin_unlock.exit1146
+  br i1 undef, label %do.body.i1159, label %trace_jbd2_commit_flushing.exit
+
+do.body.i1159:                                    ; preds = %spin_unlock.exit1154
+  unreachable
+
+trace_jbd2_commit_flushing.exit:                  ; preds = %spin_unlock.exit1154
+  br i1 undef, label %for.end.i, label %for.body.lr.ph.i
+
+for.body.lr.ph.i:                                 ; preds = %trace_jbd2_commit_flushing.exit
+  br i1 undef, label %spin_unlock.exit.i, label %if.then.i.i.i.i.i
+
+if.then.i.i.i.i.i:                                ; preds = %for.body.lr.ph.i
+  unreachable
+
+spin_unlock.exit.i:                               ; preds = %for.body.lr.ph.i
+  unreachable
+
+for.end.i:                                        ; preds = %trace_jbd2_commit_flushing.exit
+  br i1 undef, label %journal_submit_data_buffers.exit, label %if.then.i.i.i.i31.i
+
+if.then.i.i.i.i31.i:                              ; preds = %for.end.i
+  unreachable
+
+journal_submit_data_buffers.exit:                 ; preds = %for.end.i
+  br i1 undef, label %if.end103, label %if.then102
+
+if.then102:                                       ; preds = %journal_submit_data_buffers.exit
+  unreachable
+
+if.end103:                                        ; preds = %journal_submit_data_buffers.exit
+  br i1 undef, label %do.body.i1182, label %trace_jbd2_commit_logging.exit
+
+do.body.i1182:                                    ; preds = %if.end103
+  unreachable
+
+trace_jbd2_commit_logging.exit:                   ; preds = %if.end103
+  br i1 undef, label %for.end.i1287, label %for.body.i1277
+
+for.body.i1277:                                   ; preds = %trace_jbd2_commit_logging.exit
+  unreachable
+
+for.end.i1287:                                    ; preds = %trace_jbd2_commit_logging.exit
+  br i1 undef, label %journal_finish_inode_data_buffers.exit, label %if.then.i.i.i.i84.i
+
+if.then.i.i.i.i84.i:                              ; preds = %for.end.i1287
+  unreachable
+
+journal_finish_inode_data_buffers.exit:           ; preds = %for.end.i1287
+  br i1 undef, label %if.end256, label %if.then249
+
+if.then249:                                       ; preds = %journal_finish_inode_data_buffers.exit
+  unreachable
+
+if.end256:                                        ; preds = %journal_finish_inode_data_buffers.exit
+  br i1 undef, label %do.body277, label %if.then260
+
+if.then260:                                       ; preds = %if.end256
+  br label %do.body277
+
+do.body277:                                       ; preds = %if.then260, %if.end256
+  br label %while.body318
+
+while.body318:                                    ; preds = %wait_on_buffer.exit, %do.body277
+  %tobool.i1295 = icmp eq i64 undef, 0
+  br i1 %tobool.i1295, label %wait_on_buffer.exit, label %if.then.i1296
+
+if.then.i1296:                                    ; preds = %while.body318
+  unreachable
+
+wait_on_buffer.exit:                              ; preds = %while.body318
+  br i1 undef, label %do.body378, label %while.body318
+
+do.body378:                                       ; preds = %wait_on_buffer.exit
+  br i1 undef, label %while.end418, label %while.body392.lr.ph
+
+while.body392.lr.ph:                              ; preds = %do.body378
+  br label %while.body392
+
+while.body392:                                    ; preds = %wait_on_buffer.exit1319, %while.body392.lr.ph
+  %0 = load i8** undef, align 8
+  %add.ptr399 = getelementptr inbounds i8* %0, i64 -72
+  %b_state.i.i1314 = bitcast i8* %add.ptr399 to i64*
+  %tobool.i1316 = icmp eq i64 undef, 0
+  br i1 %tobool.i1316, label %wait_on_buffer.exit1319, label %if.then.i1317
+
+if.then.i1317:                                    ; preds = %while.body392
+  unreachable
+
+wait_on_buffer.exit1319:                          ; preds = %while.body392
+  %1 = load volatile i64* %b_state.i.i1314, align 8
+  %conv.i.i1322 = and i64 %1, 1
+  %lnot404 = icmp eq i64 %conv.i.i1322, 0
+  %.err.4 = select i1 %lnot404, i32 -5, i32 undef
+  %2 = call i64 asm sideeffect "1:.long 0x7c0000a8 $| ((($0) & 0x1f) << 21) $| (((0) & 0x1f) << 16) $| ((($3) & 0x1f) << 11) $| (((0) & 0x1) << 0) \0Aandc $0,$0,$2\0Astdcx. $0,0,$3\0Abne- 1b\0A", "=&r,=*m,r,r,*m,~{cc},~{memory}"(i64* %b_state.i.i1314, i64 262144, i64* %b_state.i.i1314, i64* %b_state.i.i1314) #1
+  %prev.i.i.i1325 = getelementptr inbounds i8* %0, i64 8
+  %3 = load i32** null, align 8
+  store i32* %3, i32** undef, align 8
+  call void @__brelse(i32* undef) #1
+  br i1 undef, label %while.end418, label %while.body392
+
+; CHECK-LABEL: @jbd2_journal_commit_transaction
+; CHECK: andi.
+; CHECK: cror [[REG:[0-9]+]], 1, 1
+; CHECK: stdcx.
+; CHECK: isel {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, [[REG]]
+
+while.end418:                                     ; preds = %wait_on_buffer.exit1319, %do.body378
+  %err.4.lcssa = phi i32 [ undef, %do.body378 ], [ %.err.4, %wait_on_buffer.exit1319 ]
+  br i1 undef, label %if.end421, label %if.then420
+
+if.then420:                                       ; preds = %while.end418
+  call void @jbd2_journal_abort(i32* %journal, i32 signext %err.4.lcssa) #1
+  br label %if.end421
+
+if.end421:                                        ; preds = %if.then420, %while.end418
+  unreachable
+}
+
+declare void @jbd2_journal_abort(i32*, i32 signext)
+
+declare void @__brelse(i32*)
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/tls-cse.ll b/test/CodeGen/PowerPC/tls-cse.ll
new file mode 100644
index 0000000..2aa75f9
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls-cse.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=ppc64 -mcpu=pwr7 -O2 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=ppc64 -mcpu=pwr7 -O2 -relocation-model=pic < %s | grep "__tls_get_addr" | count 1
+
+; This test was derived from LLVM's own
+; PrettyStackTraceEntry::~PrettyStackTraceEntry().  It demonstrates an
+; opportunity for CSE of calls to __tls_get_addr().
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+%"class.llvm::PrettyStackTraceEntry" = type { i32 (...)**, %"class.llvm::PrettyStackTraceEntry"* }
+
+@_ZTVN4llvm21PrettyStackTraceEntryE = unnamed_addr constant [5 x i8*] [i8* null, i8* null, i8* bitcast (void (%"class.llvm::PrettyStackTraceEntry"*)* @_ZN4llvm21PrettyStackTraceEntryD2Ev to i8*), i8* bitcast (void (%"class.llvm::PrettyStackTraceEntry"*)* @_ZN4llvm21PrettyStackTraceEntryD0Ev to i8*), i8* bitcast (void ()* @__cxa_pure_virtual to i8*)], align 8
+@_ZL20PrettyStackTraceHead = internal thread_local unnamed_addr global %"class.llvm::PrettyStackTraceEntry"* null, align 8
+@.str = private unnamed_addr constant [87 x i8] c"PrettyStackTraceHead == this && \22Pretty stack trace entry destruction is out of order\22\00", align 1
+@.str1 = private unnamed_addr constant [64 x i8] c"/home/wschmidt/llvm/llvm-test2/lib/Support/PrettyStackTrace.cpp\00", align 1
+@__PRETTY_FUNCTION__._ZN4llvm21PrettyStackTraceEntryD2Ev = private unnamed_addr constant [62 x i8] c"virtual llvm::PrettyStackTraceEntry::~PrettyStackTraceEntry()\00", align 1
+
+declare void @_ZN4llvm21PrettyStackTraceEntryD2Ev(%"class.llvm::PrettyStackTraceEntry"* %this) unnamed_addr
+declare void @__cxa_pure_virtual()
+declare void @__assert_fail(i8*, i8*, i32 zeroext, i8*)
+declare void @_ZdlPv(i8*)
+
+define void @_ZN4llvm21PrettyStackTraceEntryD0Ev(%"class.llvm::PrettyStackTraceEntry"* %this) unnamed_addr align 2 {
+entry:
+  %0 = getelementptr inbounds %"class.llvm::PrettyStackTraceEntry"* %this, i64 0, i32 0
+  store i32 (...)** bitcast (i8** getelementptr inbounds ([5 x i8*]* @_ZTVN4llvm21PrettyStackTraceEntryE, i64 0, i64 2) to i32 (...)**), i32 (...)*** %0, align 8
+  %1 = load %"class.llvm::PrettyStackTraceEntry"** @_ZL20PrettyStackTraceHead, align 8
+  %cmp.i = icmp eq %"class.llvm::PrettyStackTraceEntry"* %1, %this
+  br i1 %cmp.i, label %_ZN4llvm21PrettyStackTraceEntryD2Ev.exit, label %cond.false.i
+
+cond.false.i:                                     ; preds = %entry
+  tail call void @__assert_fail(i8* getelementptr inbounds ([87 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([64 x i8]* @.str1, i64 0, i64 0), i32 zeroext 119, i8* getelementptr inbounds ([62 x i8]* @__PRETTY_FUNCTION__._ZN4llvm21PrettyStackTraceEntryD2Ev, i64 0, i64 0))
+  unreachable
+
+_ZN4llvm21PrettyStackTraceEntryD2Ev.exit:         ; preds = %entry
+  %NextEntry.i.i = getelementptr inbounds %"class.llvm::PrettyStackTraceEntry"* %this, i64 0, i32 1
+  %2 = bitcast %"class.llvm::PrettyStackTraceEntry"** %NextEntry.i.i to i64*
+  %3 = load i64* %2, align 8
+  store i64 %3, i64* bitcast (%"class.llvm::PrettyStackTraceEntry"** @_ZL20PrettyStackTraceHead to i64*), align 8
+  %4 = bitcast %"class.llvm::PrettyStackTraceEntry"* %this to i8*
+  tail call void @_ZdlPv(i8* %4)
+  ret void
+}
+
+; CHECK-LABEL: _ZN4llvm21PrettyStackTraceEntryD0Ev:
+; CHECK: addis [[REG1:[0-9]+]], 2, _ZL20PrettyStackTraceHead@got@tlsld@ha
+; CHECK: addi 3, [[REG1]], _ZL20PrettyStackTraceHead@got@tlsld@l
+; CHECK: bl __tls_get_addr(_ZL20PrettyStackTraceHead@tlsld)
+; CHECK: addis 3, 3, _ZL20PrettyStackTraceHead@dtprel@ha
+; CHECK: ld {{[0-9]+}}, _ZL20PrettyStackTraceHead@dtprel@l(3)
+; CHECK: std {{[0-9]+}}, _ZL20PrettyStackTraceHead@dtprel@l(3)
diff --git a/test/CodeGen/PowerPC/tls-pic.ll b/test/CodeGen/PowerPC/tls-pic.ll
index 9ba3725..6c671b0 100644
--- a/test/CodeGen/PowerPC/tls-pic.ll
+++ b/test/CodeGen/PowerPC/tls-pic.ll
@@ -19,32 +19,32 @@ entry:
 
 ; OPT0-LABEL: main:
 ; OPT0:      addis [[REG:[0-9]+]], 2, a@got@tlsld@ha
-; OPT0-NEXT: addi 3, [[REG]], a@got@tlsld@l
+; OPT0:      addi 3, [[REG]], a@got@tlsld@l
 ; OPT0:      bl __tls_get_addr(a@tlsld)
 ; OPT0-NEXT: nop
 ; OPT0:      addis [[REG2:[0-9]+]], 3, a@dtprel@ha
-; OPT0-NEXT: addi {{[0-9]+}}, [[REG2]], a@dtprel@l
+; OPT0:      addi {{[0-9]+}}, [[REG2]], a@dtprel@l
 ; OPT0-32-LABEL: main
 ; OPT0-32:        addi {{[0-9]+}}, {{[0-9]+}}, a@got@tlsld
 ; OPT0-32:        bl __tls_get_addr(a@tlsld)@PLT
 ; OPT0-32:        addis [[REG:[0-9]+]], 3, a@dtprel@ha
-; OPT0-32-NEXT:   addi  {{[0-9]+}}, [[REG]], a@dtprel@l
+; OPT0-32:        addi  {{[0-9]+}}, [[REG]], a@dtprel@l
 ; OPT1-32-LABEL: main
 ; OPT1-32:        addi 3, {{[0-9]+}}, a@got@tlsld
 ; OPT1-32:        bl __tls_get_addr(a@tlsld)@PLT
 ; OPT1-32:        addis [[REG:[0-9]+]], 3, a@dtprel@ha
-; OPT1-32-NEXT:   addi  {{[0-9]+}}, [[REG]], a@dtprel@l
+; OPT1-32:        addi  {{[0-9]+}}, [[REG]], a@dtprel@l
 
 ; Test peephole optimization for thread-local storage using the
 ; local dynamic model.
 
 ; OPT1-LABEL: main:
 ; OPT1:      addis [[REG:[0-9]+]], 2, a@got@tlsld@ha
-; OPT1-NEXT: addi 3, [[REG]], a@got@tlsld@l
+; OPT1:      addi 3, [[REG]], a@got@tlsld@l
 ; OPT1:      bl __tls_get_addr(a@tlsld)
 ; OPT1-NEXT: nop
 ; OPT1:      addis [[REG2:[0-9]+]], 3, a@dtprel@ha
-; OPT1-NEXT: lwa {{[0-9]+}}, a@dtprel@l([[REG2]])
+; OPT1:      lwa {{[0-9]+}}, a@dtprel@l([[REG2]])
 
 ; Test correct assembly code generation for thread-local storage using
 ; the general dynamic model.
@@ -60,8 +60,8 @@ entry:
 }
 
 ; OPT1-LABEL: main2
-; OPT1: addis [[REG:[0-9]+]], 2, a2@got@tlsgd@ha
-; OPT1-NEXT: addi 3, [[REG]], a2@got@tlsgd@l
+; OPT1:      addis [[REG:[0-9]+]], 2, a2@got@tlsgd@ha
+; OPT1:      addi 3, [[REG]], a2@got@tlsgd@l
 ; OPT1:      bl __tls_get_addr(a2@tlsgd)
 ; OPT1-NEXT: nop
 ; OPT1-32-LABEL: main2
diff --git a/test/CodeGen/PowerPC/tls-store2.ll b/test/CodeGen/PowerPC/tls-store2.ll
index f884dd8..e9aa17e 100644
--- a/test/CodeGen/PowerPC/tls-store2.ll
+++ b/test/CodeGen/PowerPC/tls-store2.ll
@@ -19,13 +19,14 @@ entry:
 }
 
 ; CHECK-LABEL: call_once:
-; CHECK: addis 3, 2, __once_callable@got@tlsgd@ha
-; CHECK: addi 3, 3, __once_callable@got@tlsgd@l
+; CHECK: addi 3, {{[0-9]+}}, __once_callable@got@tlsgd@l
 ; CHECK: bl __tls_get_addr(__once_callable@tlsgd)
 ; CHECK-NEXT: nop
-; CHECK: std {{[0-9]+}}, 0(3)
-; CHECK: addis 3, 2, __once_call@got@tlsgd@ha
-; CHECK: addi 3, 3, __once_call@got@tlsgd@l
+; FIXME: We could check here for 'std {{[0-9]+}}, 0(3)', but that no longer
+; works because, with new scheduling freedom, we create a copy of R3 based on the
+; initial scheduling, but don't coalesce it again after we move the instructions
+; so that the copy is no longer necessary.
+; CHECK: addi 3, {{[0-9]+}}, __once_call@got@tlsgd@l
 ; CHECK: bl __tls_get_addr(__once_call@tlsgd)
 ; CHECK-NEXT: nop
 ; CHECK: std {{[0-9]+}}, 0(3)
diff --git a/test/CodeGen/PowerPC/toc-load-sched-bug.ll b/test/CodeGen/PowerPC/toc-load-sched-bug.ll
index d437915..e92c4f4 100644
--- a/test/CodeGen/PowerPC/toc-load-sched-bug.ll
+++ b/test/CodeGen/PowerPC/toc-load-sched-bug.ll
@@ -484,51 +484,51 @@ attributes #7 = { noreturn nounwind }
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.6.0 (trunk 215115) (llvm/trunk 215117)"}
-!1 = metadata !{metadata !2, metadata !4, i64 0}
-!2 = metadata !{metadata !"_ZTSSs", metadata !3, i64 0}
-!3 = metadata !{metadata !"_ZTSNSs12_Alloc_hiderE", metadata !4, i64 0}
-!4 = metadata !{metadata !"any pointer", metadata !5, i64 0}
-!5 = metadata !{metadata !"omnipotent char", metadata !6, i64 0}
-!6 = metadata !{metadata !"Simple C/C++ TBAA"}
-!7 = metadata !{metadata !8, metadata !9, i64 0}
-!8 = metadata !{metadata !"_ZTSNSs9_Rep_baseE", metadata !9, i64 0, metadata !9, i64 8, metadata !10, i64 16}
-!9 = metadata !{metadata !"long", metadata !5, i64 0}
-!10 = metadata !{metadata !"int", metadata !5, i64 0}
-!11 = metadata !{metadata !12, metadata !12, i64 0}
-!12 = metadata !{metadata !"vtable pointer", metadata !6, i64 0}
-!13 = metadata !{metadata !3, metadata !4, i64 0}
-!14 = metadata !{metadata !15, metadata !10, i64 24}
-!15 = metadata !{metadata !"_ZTSN4llvm12SMDiagnosticE", metadata !4, i64 0, metadata !16, i64 8, metadata !2, i64 16, metadata !10, i64 24, metadata !10, i64 28, metadata !17, i64 32, metadata !2, i64 40, metadata !2, i64 48, metadata !18, i64 56, metadata !19, i64 80}
-!16 = metadata !{metadata !"_ZTSN4llvm5SMLocE", metadata !4, i64 0}
-!17 = metadata !{metadata !"_ZTSN4llvm9SourceMgr8DiagKindE", metadata !5, i64 0}
-!18 = metadata !{metadata !"_ZTSSt6vectorISt4pairIjjESaIS1_EE"}
-!19 = metadata !{metadata !"_ZTSN4llvm11SmallVectorINS_7SMFixItELj4EEE", metadata !20, i64 48}
-!20 = metadata !{metadata !"_ZTSN4llvm18SmallVectorStorageINS_7SMFixItELj4EEE", metadata !5, i64 0}
-!21 = metadata !{metadata !15, metadata !10, i64 28}
-!22 = metadata !{metadata !15, metadata !17, i64 32}
-!23 = metadata !{metadata !24, metadata !4, i64 0}
-!24 = metadata !{metadata !"_ZTSN4llvm15SmallVectorBaseE", metadata !4, i64 0, metadata !4, i64 8, metadata !4, i64 16}
-!25 = metadata !{metadata !24, metadata !4, i64 8}
-!26 = metadata !{metadata !24, metadata !4, i64 16}
-!27 = metadata !{metadata !4, metadata !4, i64 0}
-!28 = metadata !{metadata !"branch_weights", i32 64, i32 4}
-!29 = metadata !{metadata !10, metadata !10, i64 0}
-!30 = metadata !{metadata !31, metadata !4, i64 8}
-!31 = metadata !{metadata !"_ZTSN4llvm12MemoryBufferE", metadata !4, i64 8, metadata !4, i64 16}
-!32 = metadata !{metadata !31, metadata !4, i64 16}
-!33 = metadata !{metadata !5, metadata !5, i64 0}
-!34 = metadata !{metadata !35, metadata !4, i64 0}
-!35 = metadata !{metadata !"_ZTSSt12_Vector_baseISt4pairIjjESaIS1_EE", metadata !36, i64 0}
-!36 = metadata !{metadata !"_ZTSNSt12_Vector_baseISt4pairIjjESaIS1_EE12_Vector_implE", metadata !4, i64 0, metadata !4, i64 8, metadata !4, i64 16}
-!37 = metadata !{metadata !38, metadata !38, i64 0}
-!38 = metadata !{metadata !"bool", metadata !5, i64 0}
-!39 = metadata !{i8 0, i8 2}
-!40 = metadata !{metadata !41, metadata !4, i64 0}
-!41 = metadata !{metadata !"_ZTSN4llvm10TimeRegionE", metadata !4, i64 0}
-!42 = metadata !{metadata !43, metadata !44, i64 32}
-!43 = metadata !{metadata !"_ZTSN4llvm11raw_ostreamE", metadata !4, i64 8, metadata !4, i64 16, metadata !4, i64 24, metadata !44, i64 32}
-!44 = metadata !{metadata !"_ZTSN4llvm11raw_ostream10BufferKindE", metadata !5, i64 0}
-!45 = metadata !{metadata !43, metadata !4, i64 24}
-!46 = metadata !{metadata !43, metadata !4, i64 8}
-!47 = metadata !{i64 0, i64 8, metadata !27, i64 8, i64 8, metadata !27}
+!0 = !{!"clang version 3.6.0 (trunk 215115) (llvm/trunk 215117)"}
+!1 = !{!2, !4, i64 0}
+!2 = !{!"_ZTSSs", !3, i64 0}
+!3 = !{!"_ZTSNSs12_Alloc_hiderE", !4, i64 0}
+!4 = !{!"any pointer", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !9, i64 0}
+!8 = !{!"_ZTSNSs9_Rep_baseE", !9, i64 0, !9, i64 8, !10, i64 16}
+!9 = !{!"long", !5, i64 0}
+!10 = !{!"int", !5, i64 0}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"vtable pointer", !6, i64 0}
+!13 = !{!3, !4, i64 0}
+!14 = !{!15, !10, i64 24}
+!15 = !{!"_ZTSN4llvm12SMDiagnosticE", !4, i64 0, !16, i64 8, !2, i64 16, !10, i64 24, !10, i64 28, !17, i64 32, !2, i64 40, !2, i64 48, !18, i64 56, !19, i64 80}
+!16 = !{!"_ZTSN4llvm5SMLocE", !4, i64 0}
+!17 = !{!"_ZTSN4llvm9SourceMgr8DiagKindE", !5, i64 0}
+!18 = !{!"_ZTSSt6vectorISt4pairIjjESaIS1_EE"}
+!19 = !{!"_ZTSN4llvm11SmallVectorINS_7SMFixItELj4EEE", !20, i64 48}
+!20 = !{!"_ZTSN4llvm18SmallVectorStorageINS_7SMFixItELj4EEE", !5, i64 0}
+!21 = !{!15, !10, i64 28}
+!22 = !{!15, !17, i64 32}
+!23 = !{!24, !4, i64 0}
+!24 = !{!"_ZTSN4llvm15SmallVectorBaseE", !4, i64 0, !4, i64 8, !4, i64 16}
+!25 = !{!24, !4, i64 8}
+!26 = !{!24, !4, i64 16}
+!27 = !{!4, !4, i64 0}
+!28 = !{!"branch_weights", i32 64, i32 4}
+!29 = !{!10, !10, i64 0}
+!30 = !{!31, !4, i64 8}
+!31 = !{!"_ZTSN4llvm12MemoryBufferE", !4, i64 8, !4, i64 16}
+!32 = !{!31, !4, i64 16}
+!33 = !{!5, !5, i64 0}
+!34 = !{!35, !4, i64 0}
+!35 = !{!"_ZTSSt12_Vector_baseISt4pairIjjESaIS1_EE", !36, i64 0}
+!36 = !{!"_ZTSNSt12_Vector_baseISt4pairIjjESaIS1_EE12_Vector_implE", !4, i64 0, !4, i64 8, !4, i64 16}
+!37 = !{!38, !38, i64 0}
+!38 = !{!"bool", !5, i64 0}
+!39 = !{i8 0, i8 2}
+!40 = !{!41, !4, i64 0}
+!41 = !{!"_ZTSN4llvm10TimeRegionE", !4, i64 0}
+!42 = !{!43, !44, i64 32}
+!43 = !{!"_ZTSN4llvm11raw_ostreamE", !4, i64 8, !4, i64 16, !4, i64 24, !44, i64 32}
+!44 = !{!"_ZTSN4llvm11raw_ostream10BufferKindE", !5, i64 0}
+!45 = !{!43, !4, i64 24}
+!46 = !{!43, !4, i64 8}
+!47 = !{i64 0, i64 8, !27, i64 8, i64 8, !27}
diff --git a/test/CodeGen/PowerPC/unwind-dw2-g.ll b/test/CodeGen/PowerPC/unwind-dw2-g.ll
index 54d3189..4ae6ff2 100644
--- a/test/CodeGen/PowerPC/unwind-dw2-g.ll
+++ b/test/CodeGen/PowerPC/unwind-dw2-g.ll
@@ -21,15 +21,15 @@ attributes #0 = { nounwind }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/unwind-dw2.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"/tmp/unwind-dw2.c", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\000\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/unwind-dw2.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!9 = metadata !{i32 2, i32 0, metadata !4, null}
-!10 = metadata !{i32 3, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4\000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/unwind-dw2.c] [DW_LANG_C99]
+!1 = !{!"/tmp/unwind-dw2.c", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\000\000\001", !1, !5, !6, null, void ()* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/unwind-dw2.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{i32 2, !"Dwarf Version", i32 3}
+!9 = !MDLocation(line: 2, scope: !4)
+!10 = !MDLocation(line: 3, scope: !4)
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/PowerPC/vec-abi-align.ll b/test/CodeGen/PowerPC/vec-abi-align.ll
index 5075ff2..2ec57af 100644
--- a/test/CodeGen/PowerPC/vec-abi-align.ll
+++ b/test/CodeGen/PowerPC/vec-abi-align.ll
@@ -35,17 +35,17 @@ entry:
   ret void
 
 ; CHECK-LABEL: @test2
-; CHECK: ld {{[0-9]+}}, 112(1)
-; CHECK: li [[REG16:[0-9]+]], 16
-; CHECK: addi [[REGB:[0-9]+]], 1, 112
-; CHECK: lvx 2, [[REGB]], [[REG16]]
+; CHECK-DAG: ld {{[0-9]+}}, 112(1)
+; CHECK-DAG: li [[REG16:[0-9]+]], 16
+; CHECK-DAG: addi [[REGB:[0-9]+]], 1, 112
+; CHECK-DAG: lvx 2, [[REGB]], [[REG16]]
 ; CHECK: blr
 
 ; CHECK-VSX-LABEL: @test2
-; CHECK-VSX: ld {{[0-9]+}}, 112(1)
-; CHECK-VSX: li [[REG16:[0-9]+]], 16
-; CHECK-VSX: addi [[REGB:[0-9]+]], 1, 112
-; CHECK-VSX: lxvw4x {{[0-9]+}}, [[REGB]], [[REG16]]
+; CHECK-VSX-DAG: ld {{[0-9]+}}, 112(1)
+; CHECK-VSX-DAG: li [[REG16:[0-9]+]], 16
+; CHECK-VSX-DAG: addi [[REGB:[0-9]+]], 1, 112
+; CHECK-VSX-DAG: lxvw4x {{[0-9]+}}, [[REGB]], [[REG16]]
 ; CHECK-VSX: blr
 }
 
@@ -61,17 +61,17 @@ entry:
   ret void
 
 ; CHECK-LABEL: @test3
-; CHECK: ld {{[0-9]+}}, 128(1)
-; CHECK: li [[REG16:[0-9]+]], 16
-; CHECK: addi [[REGB:[0-9]+]], 1, 128
-; CHECK: lvx 2, [[REGB]], [[REG16]]
+; CHECK-DAG: ld {{[0-9]+}}, 128(1)
+; CHECK-DAG: li [[REG16:[0-9]+]], 16
+; CHECK-DAG: addi [[REGB:[0-9]+]], 1, 128
+; CHECK-DAG: lvx 2, [[REGB]], [[REG16]]
 ; CHECK: blr
 
 ; CHECK-VSX-LABEL: @test3
-; CHECK-VSX: ld {{[0-9]+}}, 128(1)
-; CHECK-VSX: li [[REG16:[0-9]+]], 16
-; CHECK-VSX: addi [[REGB:[0-9]+]], 1, 128
-; CHECK-VSX: lxvw4x {{[0-9]+}}, [[REGB]], [[REG16]]
+; CHECK-VSX-DAG: ld {{[0-9]+}}, 128(1)
+; CHECK-VSX-DAG: li [[REG16:[0-9]+]], 16
+; CHECK-VSX-DAG: addi [[REGB:[0-9]+]], 1, 128
+; CHECK-VSX-DAG: lxvw4x {{[0-9]+}}, [[REGB]], [[REG16]]
 ; CHECK-VSX: blr
 }
 
diff --git a/test/CodeGen/PowerPC/vec_clz.ll b/test/CodeGen/PowerPC/vec_clz.ll
new file mode 100644
index 0000000..01cdecd
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_clz.ll
@@ -0,0 +1,40 @@
+; Check the vctlz* instructions that were added in P8
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s
+
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>) nounwind readnone
+
+define <16 x i8> @test_v16i8(<16 x i8> %x) nounwind readnone {
+       %vcnt = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %x)
+       ret <16 x i8> %vcnt
+; CHECK: @test_v16i8
+; CHECK: vclzb 2, 2
+; CHECK: blr
+}
+
+define <8 x i16> @test_v8i16(<8 x i16> %x) nounwind readnone {
+       %vcnt = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %x)
+       ret <8 x i16> %vcnt
+; CHECK: @test_v8i16
+; CHECK: vclzh 2, 2
+; CHECK: blr
+}
+
+define <4 x i32> @test_v4i32(<4 x i32> %x) nounwind readnone {
+       %vcnt = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x)
+       ret <4 x i32> %vcnt
+; CHECK: @test_v4i32
+; CHECK: vclzw 2, 2
+; CHECK: blr
+}
+
+define <2 x i64> @test_v2i64(<2 x i64> %x) nounwind readnone {
+       %vcnt = tail call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %x)
+       ret <2 x i64> %vcnt
+; CHECK: @test_v2i64
+; CHECK: vclzd 2, 2
+; CHECK: blr
+}
diff --git a/test/CodeGen/PowerPC/vec_misaligned.ll b/test/CodeGen/PowerPC/vec_misaligned.ll
index 73a4a4d..49f11e4 100644
--- a/test/CodeGen/PowerPC/vec_misaligned.ll
+++ b/test/CodeGen/PowerPC/vec_misaligned.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s
 ; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -mattr=-power8-vector | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -mattr=-power8-vector | FileCheck %s -check-prefix=CHECK-LE
 
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
 target triple = "powerpc-apple-darwin8"
diff --git a/test/CodeGen/PowerPC/vec_popcnt.ll b/test/CodeGen/PowerPC/vec_popcnt.ll
new file mode 100644
index 0000000..0ce9dfa
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_popcnt.ll
@@ -0,0 +1,72 @@
+; Check the vecpopcnt* instructions that were added in P8
+; In addition, check the conversions to/from the v2i64 VMX register that was also added in P8.
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
+
+define <16 x i8> @test_v16i8_v2i64(<2 x i64> %x) nounwind readnone {
+       %tmp  = bitcast <2 x i64> %x to <16 x i8>;
+       %vcnt = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp)
+       ret <16 x i8> %vcnt
+; CHECK: @test_v16i8_v2i64
+; CHECK: vpopcntb 2, 2
+; CHECK: blr
+}
+
+define <8 x i16> @test_v8i16_v2i64(<2 x i64> %x) nounwind readnone {
+       %tmp = bitcast <2 x i64> %x to <8 x i16>
+       %vcnt = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %tmp)
+       ret <8 x i16> %vcnt
+; CHECK: @test_v8i16_v2i64
+; CHECK: vpopcnth 2, 2
+; CHECK: blr
+}
+
+define <4 x i32> @test_v4i32_v2i64(<2 x i64> %x) nounwind readnone {
+       %tmp = bitcast <2 x i64> %x to <4 x i32>
+       %vcnt = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %tmp)
+       ret <4 x i32> %vcnt
+; CHECK: @test_v4i32_v2i64
+; CHECK: vpopcntw 2, 2
+; CHECK: blr
+}
+
+define <2 x i64> @test_v2i64_v2i64(<2 x i64> %x) nounwind readnone {
+       %vcnt = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)
+       ret <2 x i64> %vcnt
+; CHECK: @test_v2i64_v2i64
+; CHECK: vpopcntd 2, 2
+; CHECK: blr
+}
+
+define <2 x i64> @test_v2i64_v4i32(<4 x i32> %x) nounwind readnone {
+       %tmp = bitcast <4 x i32> %x to <2 x i64>
+       %vcnt = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %tmp)
+       ret <2 x i64> %vcnt
+; CHECK: @test_v2i64_v4i32
+; CHECK: vpopcntd 2, 2
+; CHECK: blr
+}
+
+
+define <2 x i64> @test_v2i64_v8i16(<8 x i16> %x) nounwind readnone {
+       %tmp = bitcast <8 x i16> %x to <2 x i64>
+       %vcnt = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %tmp)
+       ret <2 x i64> %vcnt
+; CHECK: @test_v2i64_v8i16
+; CHECK: vpopcntd 2, 2
+; CHECK: blr
+}
+
+define <2 x i64> @test_v2i64_v16i8(<16 x i8> %x) nounwind readnone {
+       %tmp = bitcast <16 x i8> %x to <2 x i64>
+       %vcnt = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %tmp)
+       ret <2 x i64> %vcnt
+; CHECK: @test_v2i64_v16i8
+; CHECK: vpopcntd 2, 2
+; CHECK: blr
+}
diff --git a/test/CodeGen/PowerPC/vec_shuffle_le.ll b/test/CodeGen/PowerPC/vec_shuffle_le.ll
index a4b2119..c7fc1c6 100644
--- a/test/CodeGen/PowerPC/vec_shuffle_le.ll
+++ b/test/CodeGen/PowerPC/vec_shuffle_le.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -mcpu=pwr7 | FileCheck %s
 
 define void @VPKUHUM_xy(<16 x i8>* %A, <16 x i8>* %B) {
 entry:
diff --git a/test/CodeGen/PowerPC/vec_veqv_vnand_vorc.ll b/test/CodeGen/PowerPC/vec_veqv_vnand_vorc.ll
new file mode 100644
index 0000000..f7d5a51
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_veqv_vnand_vorc.ll
@@ -0,0 +1,29 @@
+; Check the miscellaneous logical vector operations added in P8
+; 
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s
+; Test x eqv y
+define <4 x i32> @test_veqv(<4 x i32> %x, <4 x i32> %y) nounwind {
+       %tmp = xor <4 x i32> %x, %y
+       %ret_val = xor <4 x i32> %tmp, < i32 -1, i32 -1, i32 -1, i32 -1>
+       ret <4 x i32> %ret_val
+; CHECK: veqv 2, 2, 3
+}
+
+; Test x vnand y
+define <4 x i32> @test_vnand(<4 x i32> %x, <4 x i32> %y) nounwind {
+       %tmp = and <4 x i32> %x, %y
+       %ret_val = xor <4 x i32> %tmp, <i32 -1, i32 -1, i32 -1, i32 -1>
+       ret <4 x i32> %ret_val
+; CHECK: vnand 2, 2, 3
+}
+
+; Test x vorc y and variants
+define <4 x i32> @test_vorc(<4 x i32> %x, <4 x i32> %y) nounwind {
+       %tmp1 = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+       %tmp2 = or <4 x i32> %x, %tmp1
+; CHECK: vorc 3, 2, 3      
+       %tmp3 = xor <4 x i32> %tmp2, <i32 -1, i32 -1, i32 -1, i32 -1>
+       %tmp4 = or <4 x i32> %tmp3, %x
+; CHECK: vorc 2, 2, 3
+       ret <4 x i32> %tmp4
+}
diff --git a/test/CodeGen/PowerPC/vsel-prom.ll b/test/CodeGen/PowerPC/vsel-prom.ll
new file mode 100644
index 0000000..dd219ec
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsel-prom.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @Compute_Lateral() #0 {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  unreachable
+
+if.end:                                           ; preds = %entry
+  %0 = select i1 undef, <2 x double> undef, <2 x double> zeroinitializer
+  %1 = extractelement <2 x double> %0, i32 1
+  store double %1, double* undef, align 8
+  ret void
+
+; CHECK-LABEL: @Compute_Lateral
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/vsx-args.ll b/test/CodeGen/PowerPC/vsx-args.ll
index 520aeb5..2b53c0a 100644
--- a/test/CodeGen/PowerPC/vsx-args.ll
+++ b/test/CodeGen/PowerPC/vsx-args.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx | FileCheck %s
+; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx -fast-isel -O0 | FileCheck %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/PowerPC/vsx-fma-m.ll b/test/CodeGen/PowerPC/vsx-fma-m.ll
index 9dff9a7..ab36072 100644
--- a/test/CodeGen/PowerPC/vsx-fma-m.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-m.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx | FileCheck %s
+; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx -fast-isel -O0 | FileCheck -check-prefix=CHECK-FISL %s
 
 ; Also run with -schedule-ppc-vsx-fma-mutation-early as a stress test for the
 ; live-interval-updating logic.
@@ -22,6 +23,15 @@ entry:
 ; CHECK-DAG: stxsdx 3, 0, 7
 ; CHECK-DAG: stxsdx 1, 7, [[C1]]
 ; CHECK: blr
+
+; CHECK-FISL-LABEL: @test1
+; CHECK-FISL-DAG: fmr 0, 1
+; CHECK-FISL-DAG: xsmaddadp 0, 2, 3
+; CHECK-FISL-DAG: stxsdx 0, 0, 7
+; CHECK-FISL-DAG: xsmaddadp 1, 2, 4
+; CHECK-FISL-DAG: li [[C1:[0-9]+]], 8
+; CHECK-FISL-DAG: stxsdx 1, 7, [[C1]]
+; CHECK-FISL: blr
 }
 
 define void @test2(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
@@ -46,6 +56,19 @@ entry:
 ; CHECK-DAG: stxsdx 4, 8, [[C1]]
 ; CHECK-DAG: stxsdx 1, 8, [[C2]]
 ; CHECK: blr
+
+; CHECK-FISL-LABEL: @test2
+; CHECK-FISL-DAG: fmr 0, 1
+; CHECK-FISL-DAG: xsmaddadp 0, 2, 3
+; CHECK-FISL-DAG: stxsdx 0, 0, 8
+; CHECK-FISL-DAG: fmr 0, 1
+; CHECK-FISL-DAG: xsmaddadp 0, 2, 4
+; CHECK-FISL-DAG: li [[C1:[0-9]+]], 8
+; CHECK-FISL-DAG: stxsdx 0, 8, [[C1]]
+; CHECK-FISL-DAG: xsmaddadp 1, 2, 5
+; CHECK-FISL-DAG: li [[C2:[0-9]+]], 16
+; CHECK-FISL-DAG: stxsdx 1, 8, [[C2]]
+; CHECK-FISL: blr
 }
 
 define void @test3(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
@@ -81,6 +104,20 @@ entry:
 ; CHECK-DAG: stxsdx 1, 8, [[C2]]
 ; CHECK-DAG: stxsdx 4, 8, [[C3]]
 ; CHECK: blr
+
+; CHECK-FISL-LABEL: @test3
+; CHECK-FISL-DAG: fmr [[F1:[0-9]+]], 1
+; CHECK-FISL-DAG: xsmaddadp [[F1]], 2, 4
+; CHECK-FISL-DAG: fmr 4, [[F1]]
+; CHECK-FISL-DAG: xsmaddadp 4, 2, 3
+; CHECK-FISL-DAG: li [[C1:[0-9]+]], 24
+; CHECK-FISL-DAG: stxsdx 4, 8, [[C1]]
+; CHECK-FISL-DAG: xsmaddadp 1, 2, 5
+; CHECK-FISL-DAG: li [[C2:[0-9]+]], 16
+; CHECK-FISL-DAG: stxsdx 1, 8, [[C2]]
+; CHECK-FISL-DAG: li [[C3:[0-9]+]], 8
+; CHECK-FISL-DAG: stxsdx 0, 8, [[C3]]
+; CHECK-FISL: blr
 }
 
 define void @test4(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
@@ -116,6 +153,22 @@ entry:
 ; CHECK-DAG: stxsdx 4, 8, [[C3]]
 ; CHECK-DAG: stxsdx 1, 8, [[C2]]
 ; CHECK: blr
+
+; CHECK-FISL-LABEL: @test4
+; CHECK-FISL-DAG: fmr [[F1:[0-9]+]], 1
+; CHECK-FISL-DAG: xsmaddadp [[F1]], 2, 3
+; CHECK-FISL-DAG: stxsdx 0, 0, 8
+; CHECK-FISL-DAG: fmr [[F1]], 1
+; CHECK-FISL-DAG: xsmaddadp [[F1]], 2, 4
+; CHECK-FISL-DAG: li [[C3:[0-9]+]], 8
+; CHECK-FISL-DAG: stxsdx 0, 8, [[C3]]
+; CHECK-FISL-DAG: xsmaddadp 0, 2, 3
+; CHECK-FISL-DAG: li [[C1:[0-9]+]], 24
+; CHECK-FISL-DAG: stxsdx 0, 8, [[C1]]
+; CHECK-FISL-DAG: xsmaddadp 1, 2, 5
+; CHECK-FISL-DAG: li [[C2:[0-9]+]], 16
+; CHECK-FISL-DAG: stxsdx 1, 8, [[C2]]
+; CHECK-FISL: blr
 }
 
 declare double @llvm.fma.f64(double, double, double) #0
@@ -136,6 +189,15 @@ entry:
 ; CHECK-DAG: stxvd2x 36, 0, 3
 ; CHECK-DAG: stxvd2x 34, 3, [[C1:[0-9]+]]
 ; CHECK: blr
+
+; CHECK-FISL-LABEL: @testv1
+; CHECK-FISL-DAG: xxlor 0, 34, 34
+; CHECK-FISL-DAG: xvmaddadp 0, 35, 36
+; CHECK-FISL-DAG: stxvd2x 0, 0, 3
+; CHECK-FISL-DAG: xvmaddadp 34, 35, 37
+; CHECK-FISL-DAG: li [[C1:[0-9]+]], 16
+; CHECK-FISL-DAG: stxvd2x 34, 3, [[C1:[0-9]+]]
+; CHECK-FISL: blr
 }
 
 define void @testv2(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %e, <2 x double> %f, <2 x double>* nocapture %d) #0 {
@@ -160,6 +222,19 @@ entry:
 ; CHECK-DAG: stxvd2x 37, 3, [[C1:[0-9]+]]
 ; CHECK-DAG: stxvd2x 34, 3, [[C2:[0-9]+]]
 ; CHECK: blr
+
+; CHECK-FISL-LABEL: @testv2
+; CHECK-FISL-DAG: xxlor 0, 34, 34
+; CHECK-FISL-DAG: xvmaddadp 0, 35, 36
+; CHECK-FISL-DAG: stxvd2x 0, 0, 3
+; CHECK-FISL-DAG: xxlor 0, 34, 34
+; CHECK-FISL-DAG: xvmaddadp 0, 35, 37
+; CHECK-FISL-DAG: li [[C1:[0-9]+]], 16
+; CHECK-FISL-DAG: stxvd2x 0, 3, [[C1:[0-9]+]]
+; CHECK-FISL-DAG: xvmaddadp 34, 35, 38
+; CHECK-FISL-DAG: li [[C2:[0-9]+]], 32
+; CHECK-FISL-DAG: stxvd2x 34, 3, [[C2:[0-9]+]]
+; CHECK-FISL: blr
 }
 
 define void @testv3(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %e, <2 x double> %f, <2 x double>* nocapture %d) #0 {
@@ -194,13 +269,30 @@ entry:
 ; re-ordering the instructions.
 ; CHECK-DAG: xvmaddadp [[V1]], 35, 36
 
-; CHECK-DAG: xvmaddmdp 36, 35, 37
+; CHECK-DAG: xvmaddmdp 35, 36, 37
 ; CHECK-DAG: xvmaddadp 34, 35, 38
 ; CHECK-DAG: stxvd2x 32, 0, 3
-; CHECK-DAG: stxvd2x 36, 3, [[C1]]
+; CHECK-DAG: stxvd2x 35, 3, [[C1]]
 ; CHECK-DAG: stxvd2x 34, 3, [[C2]]
 ; CHECK-DAG: stxvd2x 37, 3, [[C3]]
 ; CHECK: blr
+
+; CHECK-FISL-LABEL: @testv3
+; CHECK-FISL-DAG: xxlor [[V1:[0-9]+]], 34, 34
+; CHECK-FISL-DAG: xvmaddadp [[V1]], 35, 36
+; CHECK-FISL-DAG: stxvd2x [[V1]], 0, 3
+; CHECK-FISL-DAG: xxlor [[V2:[0-9]+]], 34, 34
+; CHECK-FISL-DAG: xvmaddadp [[V2]], 35, 37
+; CHECK-FISL-DAG: xxlor [[V3:[0-9]+]], 0, 0
+; CHECK-FISL-DAG: xvmaddadp [[V3]], 35, 36
+; CHECK-FISL-DAG: li [[C1:[0-9]+]], 48
+; CHECK-FISL-DAG: stxvd2x [[V3]], 3, [[C1]]
+; CHECK-FISL-DAG: xvmaddadp 34, 35, 38
+; CHECK-FISL-DAG: li [[C2:[0-9]+]], 32
+; CHECK-FISL-DAG: stxvd2x 34, 3, [[C2]]
+; CHECK-FISL-DAG: li [[C3:[0-9]+]], 16
+; CHECK-FISL-DAG: stxvd2x 0, 3, [[C3]]
+; CHECK-FISL: blr
 }
 
 define void @testv4(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %e, <2 x double> %f, <2 x double>* nocapture %d) #0 {
@@ -236,6 +328,22 @@ entry:
 ; CHECK-DAG: stxvd2x 37, 3, [[C3]]
 ; CHECK-DAG: stxvd2x 34, 3, [[C2]]
 ; CHECK: blr
+
+; CHECK-FISL-LABEL: @testv4
+; CHECK-FISL-DAG: xxlor [[V1:[0-9]+]], 34, 34
+; CHECK-FISL-DAG: xvmaddadp [[V1]], 35, 36
+; CHECK-FISL-DAG: stxvd2x 0, 0, 3
+; CHECK-FISL-DAG: xxlor [[V2:[0-9]+]], 34, 34
+; CHECK-FISL-DAG: xvmaddadp [[V2]], 35, 37
+; CHECK-FISL-DAG: li [[C1:[0-9]+]], 16
+; CHECK-FISL-DAG: stxvd2x 0, 3, [[C1]]
+; CHECK-FISL-DAG: xvmaddadp 0, 35, 37
+; CHECK-FISL-DAG: li [[C3:[0-9]+]], 48
+; CHECK-FISL-DAG: stxvd2x 0, 3, [[C3]]
+; CHECK-FISL-DAG: xvmaddadp 0, 35, 36
+; CHECK-FISL-DAG: li [[C2:[0-9]+]], 32
+; CHECK-FISL-DAG: stxvd2x 34, 3, [[C2]]
+; CHECK-FISL: blr
 }
 
 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #0
diff --git a/test/CodeGen/PowerPC/vsx-infl-copy1.ll b/test/CodeGen/PowerPC/vsx-infl-copy1.ll
new file mode 100644
index 0000000..cff7f8f
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-infl-copy1.ll
@@ -0,0 +1,133 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@ub = external global [1024 x i32], align 4
+@uc = external global [1024 x i32], align 4
+
+; Function Attrs: noinline nounwind
+define void @_Z8example9Pj() #0 {
+entry:
+  br label %vector.body
+
+; CHECK-LABEL: @_Z8example9Pj
+; CHECK: xxlor
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %43, %vector.body ]
+  %vec.phi20 = phi <4 x i32> [ zeroinitializer, %entry ], [ %44, %vector.body ]
+  %vec.phi21 = phi <4 x i32> [ zeroinitializer, %entry ], [ %45, %vector.body ]
+  %vec.phi23 = phi <4 x i32> [ zeroinitializer, %entry ], [ %46, %vector.body ]
+  %vec.phi24 = phi <4 x i32> [ zeroinitializer, %entry ], [ %47, %vector.body ]
+  %vec.phi25 = phi <4 x i32> [ zeroinitializer, %entry ], [ %48, %vector.body ]
+  %vec.phi26 = phi <4 x i32> [ zeroinitializer, %entry ], [ %49, %vector.body ]
+  %vec.phi27 = phi <4 x i32> [ zeroinitializer, %entry ], [ %50, %vector.body ]
+  %vec.phi28 = phi <4 x i32> [ zeroinitializer, %entry ], [ %51, %vector.body ]
+  %vec.phi29 = phi <4 x i32> [ zeroinitializer, %entry ], [ %52, %vector.body ]
+  %vec.phi30 = phi <4 x i32> [ zeroinitializer, %entry ], [ %53, %vector.body ]
+  %wide.load32 = load <4 x i32>* null, align 4
+  %.sum82 = add i64 %index, 24
+  %0 = getelementptr [1024 x i32]* @ub, i64 0, i64 %.sum82
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load36 = load <4 x i32>* %1, align 4
+  %wide.load37 = load <4 x i32>* undef, align 4
+  %.sum84 = add i64 %index, 32
+  %2 = getelementptr [1024 x i32]* @ub, i64 0, i64 %.sum84
+  %3 = bitcast i32* %2 to <4 x i32>*
+  %wide.load38 = load <4 x i32>* %3, align 4
+  %.sum85 = add i64 %index, 36
+  %4 = getelementptr [1024 x i32]* @ub, i64 0, i64 %.sum85
+  %5 = bitcast i32* %4 to <4 x i32>*
+  %wide.load39 = load <4 x i32>* %5, align 4
+  %6 = getelementptr [1024 x i32]* @ub, i64 0, i64 undef
+  %7 = bitcast i32* %6 to <4 x i32>*
+  %wide.load40 = load <4 x i32>* %7, align 4
+  %.sum87 = add i64 %index, 44
+  %8 = getelementptr [1024 x i32]* @ub, i64 0, i64 %.sum87
+  %9 = bitcast i32* %8 to <4 x i32>*
+  %wide.load41 = load <4 x i32>* %9, align 4
+  %10 = getelementptr inbounds [1024 x i32]* @uc, i64 0, i64 %index
+  %11 = bitcast i32* %10 to <4 x i32>*
+  %wide.load42 = load <4 x i32>* %11, align 4
+  %.sum8889 = or i64 %index, 4
+  %12 = getelementptr [1024 x i32]* @uc, i64 0, i64 %.sum8889
+  %13 = bitcast i32* %12 to <4 x i32>*
+  %wide.load43 = load <4 x i32>* %13, align 4
+  %.sum9091 = or i64 %index, 8
+  %14 = getelementptr [1024 x i32]* @uc, i64 0, i64 %.sum9091
+  %15 = bitcast i32* %14 to <4 x i32>*
+  %wide.load44 = load <4 x i32>* %15, align 4
+  %.sum94 = add i64 %index, 16
+  %16 = getelementptr [1024 x i32]* @uc, i64 0, i64 %.sum94
+  %17 = bitcast i32* %16 to <4 x i32>*
+  %wide.load46 = load <4 x i32>* %17, align 4
+  %.sum95 = add i64 %index, 20
+  %18 = getelementptr [1024 x i32]* @uc, i64 0, i64 %.sum95
+  %19 = bitcast i32* %18 to <4 x i32>*
+  %wide.load47 = load <4 x i32>* %19, align 4
+  %20 = getelementptr [1024 x i32]* @uc, i64 0, i64 undef
+  %21 = bitcast i32* %20 to <4 x i32>*
+  %wide.load48 = load <4 x i32>* %21, align 4
+  %.sum97 = add i64 %index, 28
+  %22 = getelementptr [1024 x i32]* @uc, i64 0, i64 %.sum97
+  %23 = bitcast i32* %22 to <4 x i32>*
+  %wide.load49 = load <4 x i32>* %23, align 4
+  %.sum98 = add i64 %index, 32
+  %24 = getelementptr [1024 x i32]* @uc, i64 0, i64 %.sum98
+  %25 = bitcast i32* %24 to <4 x i32>*
+  %wide.load50 = load <4 x i32>* %25, align 4
+  %.sum99 = add i64 %index, 36
+  %26 = getelementptr [1024 x i32]* @uc, i64 0, i64 %.sum99
+  %27 = bitcast i32* %26 to <4 x i32>*
+  %wide.load51 = load <4 x i32>* %27, align 4
+  %.sum100 = add i64 %index, 40
+  %28 = getelementptr [1024 x i32]* @uc, i64 0, i64 %.sum100
+  %29 = bitcast i32* %28 to <4 x i32>*
+  %wide.load52 = load <4 x i32>* %29, align 4
+  %.sum101 = add i64 %index, 44
+  %30 = getelementptr [1024 x i32]* @uc, i64 0, i64 %.sum101
+  %31 = bitcast i32* %30 to <4 x i32>*
+  %wide.load53 = load <4 x i32>* %31, align 4
+  %32 = add <4 x i32> zeroinitializer, %vec.phi
+  %33 = add <4 x i32> zeroinitializer, %vec.phi20
+  %34 = add <4 x i32> %wide.load32, %vec.phi21
+  %35 = add <4 x i32> zeroinitializer, %vec.phi23
+  %36 = add <4 x i32> zeroinitializer, %vec.phi24
+  %37 = add <4 x i32> %wide.load36, %vec.phi25
+  %38 = add <4 x i32> %wide.load37, %vec.phi26
+  %39 = add <4 x i32> %wide.load38, %vec.phi27
+  %40 = add <4 x i32> %wide.load39, %vec.phi28
+  %41 = add <4 x i32> %wide.load40, %vec.phi29
+  %42 = add <4 x i32> %wide.load41, %vec.phi30
+  %43 = sub <4 x i32> %32, %wide.load42
+  %44 = sub <4 x i32> %33, %wide.load43
+  %45 = sub <4 x i32> %34, %wide.load44
+  %46 = sub <4 x i32> %35, %wide.load46
+  %47 = sub <4 x i32> %36, %wide.load47
+  %48 = sub <4 x i32> %37, %wide.load48
+  %49 = sub <4 x i32> %38, %wide.load49
+  %50 = sub <4 x i32> %39, %wide.load50
+  %51 = sub <4 x i32> %40, %wide.load51
+  %52 = sub <4 x i32> %41, %wide.load52
+  %53 = sub <4 x i32> %42, %wide.load53
+  %index.next = add i64 %index, 48
+  br i1 false, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %.lcssa112 = phi <4 x i32> [ %53, %vector.body ]
+  %.lcssa111 = phi <4 x i32> [ %52, %vector.body ]
+  %.lcssa110 = phi <4 x i32> [ %51, %vector.body ]
+  %.lcssa109 = phi <4 x i32> [ %50, %vector.body ]
+  %.lcssa108 = phi <4 x i32> [ %49, %vector.body ]
+  %.lcssa107 = phi <4 x i32> [ %48, %vector.body ]
+  %.lcssa106 = phi <4 x i32> [ %47, %vector.body ]
+  %.lcssa105 = phi <4 x i32> [ %46, %vector.body ]
+  %.lcssa103 = phi <4 x i32> [ %45, %vector.body ]
+  %.lcssa102 = phi <4 x i32> [ %44, %vector.body ]
+  %.lcssa = phi <4 x i32> [ %43, %vector.body ]
+  ret void
+}
+
+attributes #0 = { noinline nounwind }
+
diff --git a/test/CodeGen/PowerPC/vsx-infl-copy2.ll b/test/CodeGen/PowerPC/vsx-infl-copy2.ll
new file mode 100644
index 0000000..0f27906
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-infl-copy2.ll
@@ -0,0 +1,114 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @_Z28test_goto_loop_unroll_factorILi22EiEvPKT0_iPKc(i32* nocapture readonly %first) #0 {
+entry:
+  br i1 false, label %loop2_start, label %if.end5
+
+; CHECK-LABEL: @_Z28test_goto_loop_unroll_factorILi22EiEvPKT0_iPKc
+
+loop2_start:                                      ; preds = %loop2_start, %entry
+  br i1 undef, label %loop2_start, label %if.then.i31
+
+if.end5:                                          ; preds = %entry
+  br i1 undef, label %loop_start.preheader, label %if.then.i31
+
+loop_start.preheader:                             ; preds = %if.end5
+  br i1 false, label %middle.block, label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %loop_start.preheader
+  %vec.phi61 = phi <4 x i32> [ %34, %vector.body ], [ zeroinitializer, %loop_start.preheader ]
+  %vec.phi62 = phi <4 x i32> [ %35, %vector.body ], [ zeroinitializer, %loop_start.preheader ]
+  %vec.phi63 = phi <4 x i32> [ %36, %vector.body ], [ zeroinitializer, %loop_start.preheader ]
+  %vec.phi65 = phi <4 x i32> [ %37, %vector.body ], [ zeroinitializer, %loop_start.preheader ]
+  %vec.phi67 = phi <4 x i32> [ %38, %vector.body ], [ zeroinitializer, %loop_start.preheader ]
+  %vec.phi68 = phi <4 x i32> [ %39, %vector.body ], [ zeroinitializer, %loop_start.preheader ]
+  %vec.phi69 = phi <4 x i32> [ %40, %vector.body ], [ zeroinitializer, %loop_start.preheader ]
+  %vec.phi70 = phi <4 x i32> [ %41, %vector.body ], [ zeroinitializer, %loop_start.preheader ]
+  %vec.phi71 = phi <4 x i32> [ %42, %vector.body ], [ zeroinitializer, %loop_start.preheader ]
+  %.sum = add i64 0, 4
+  %wide.load72 = load <4 x i32>* null, align 4
+  %.sum109 = add i64 0, 8
+  %0 = getelementptr i32* %first, i64 %.sum109
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load73 = load <4 x i32>* %1, align 4
+  %.sum110 = add i64 0, 12
+  %2 = getelementptr i32* %first, i64 %.sum110
+  %3 = bitcast i32* %2 to <4 x i32>*
+  %wide.load74 = load <4 x i32>* %3, align 4
+  %.sum112 = add i64 0, 20
+  %4 = getelementptr i32* %first, i64 %.sum112
+  %5 = bitcast i32* %4 to <4 x i32>*
+  %wide.load76 = load <4 x i32>* %5, align 4
+  %.sum114 = add i64 0, 28
+  %6 = getelementptr i32* %first, i64 %.sum114
+  %7 = bitcast i32* %6 to <4 x i32>*
+  %wide.load78 = load <4 x i32>* %7, align 4
+  %.sum115 = add i64 0, 32
+  %8 = getelementptr i32* %first, i64 %.sum115
+  %9 = bitcast i32* %8 to <4 x i32>*
+  %wide.load79 = load <4 x i32>* %9, align 4
+  %.sum116 = add i64 0, 36
+  %10 = getelementptr i32* %first, i64 %.sum116
+  %11 = bitcast i32* %10 to <4 x i32>*
+  %wide.load80 = load <4 x i32>* %11, align 4
+  %.sum117 = add i64 0, 40
+  %12 = getelementptr i32* %first, i64 %.sum117
+  %13 = bitcast i32* %12 to <4 x i32>*
+  %wide.load81 = load <4 x i32>* %13, align 4
+  %.sum118 = add i64 0, 44
+  %14 = getelementptr i32* %first, i64 %.sum118
+  %15 = bitcast i32* %14 to <4 x i32>*
+  %wide.load82 = load <4 x i32>* %15, align 4
+  %16 = mul <4 x i32> %wide.load72, <i32 269850533, i32 269850533, i32 269850533, i32 269850533>
+  %17 = mul <4 x i32> %wide.load73, <i32 269850533, i32 269850533, i32 269850533, i32 269850533>
+  %18 = mul <4 x i32> %wide.load74, <i32 269850533, i32 269850533, i32 269850533, i32 269850533>
+  %19 = mul <4 x i32> %wide.load76, <i32 269850533, i32 269850533, i32 269850533, i32 269850533>
+  %20 = mul <4 x i32> %wide.load78, <i32 269850533, i32 269850533, i32 269850533, i32 269850533>
+  %21 = mul <4 x i32> %wide.load79, <i32 269850533, i32 269850533, i32 269850533, i32 269850533>
+  %22 = mul <4 x i32> %wide.load80, <i32 269850533, i32 269850533, i32 269850533, i32 269850533>
+  %23 = mul <4 x i32> %wide.load81, <i32 269850533, i32 269850533, i32 269850533, i32 269850533>
+  %24 = mul <4 x i32> %wide.load82, <i32 269850533, i32 269850533, i32 269850533, i32 269850533>
+  %25 = add <4 x i32> %16, <i32 -1138325064, i32 -1138325064, i32 -1138325064, i32 -1138325064>
+  %26 = add <4 x i32> %17, <i32 -1138325064, i32 -1138325064, i32 -1138325064, i32 -1138325064>
+  %27 = add <4 x i32> %18, <i32 -1138325064, i32 -1138325064, i32 -1138325064, i32 -1138325064>
+  %28 = add <4 x i32> %19, <i32 -1138325064, i32 -1138325064, i32 -1138325064, i32 -1138325064>
+  %29 = add <4 x i32> %20, <i32 -1138325064, i32 -1138325064, i32 -1138325064, i32 -1138325064>
+  %30 = add <4 x i32> %21, <i32 -1138325064, i32 -1138325064, i32 -1138325064, i32 -1138325064>
+  %31 = add <4 x i32> %22, <i32 -1138325064, i32 -1138325064, i32 -1138325064, i32 -1138325064>
+  %32 = add <4 x i32> %23, <i32 -1138325064, i32 -1138325064, i32 -1138325064, i32 -1138325064>
+  %33 = add <4 x i32> %24, <i32 -1138325064, i32 -1138325064, i32 -1138325064, i32 -1138325064>
+  %34 = add nsw <4 x i32> %25, %vec.phi61
+  %35 = add nsw <4 x i32> %26, %vec.phi62
+  %36 = add nsw <4 x i32> %27, %vec.phi63
+  %37 = add nsw <4 x i32> %28, %vec.phi65
+  %38 = add nsw <4 x i32> %29, %vec.phi67
+  %39 = add nsw <4 x i32> %30, %vec.phi68
+  %40 = add nsw <4 x i32> %31, %vec.phi69
+  %41 = add nsw <4 x i32> %32, %vec.phi70
+  %42 = add nsw <4 x i32> %33, %vec.phi71
+  br i1 false, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body, %loop_start.preheader
+  %rdx.vec.exit.phi85 = phi <4 x i32> [ zeroinitializer, %loop_start.preheader ], [ %34, %vector.body ]
+  %rdx.vec.exit.phi86 = phi <4 x i32> [ zeroinitializer, %loop_start.preheader ], [ %35, %vector.body ]
+  %rdx.vec.exit.phi87 = phi <4 x i32> [ zeroinitializer, %loop_start.preheader ], [ %36, %vector.body ]
+  %rdx.vec.exit.phi89 = phi <4 x i32> [ zeroinitializer, %loop_start.preheader ], [ %37, %vector.body ]
+  %rdx.vec.exit.phi91 = phi <4 x i32> [ zeroinitializer, %loop_start.preheader ], [ %38, %vector.body ]
+  %rdx.vec.exit.phi92 = phi <4 x i32> [ zeroinitializer, %loop_start.preheader ], [ %39, %vector.body ]
+  %rdx.vec.exit.phi93 = phi <4 x i32> [ zeroinitializer, %loop_start.preheader ], [ %40, %vector.body ]
+  %rdx.vec.exit.phi94 = phi <4 x i32> [ zeroinitializer, %loop_start.preheader ], [ %41, %vector.body ]
+  %rdx.vec.exit.phi95 = phi <4 x i32> [ zeroinitializer, %loop_start.preheader ], [ %42, %vector.body ]
+  br i1 false, label %if.then.i31, label %loop_start.prol
+
+loop_start.prol:                                  ; preds = %loop_start.prol, %middle.block
+  br label %loop_start.prol
+
+if.then.i31:                                      ; preds = %middle.block, %if.end5, %loop2_start
+  unreachable
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll b/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll
new file mode 100644
index 0000000..7367672
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll
@@ -0,0 +1,172 @@
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64le-unknown-linux-gnu < %s > %t
+; RUN: grep lxvd2x < %t | count 18
+; RUN: grep stxvd2x < %t | count 18
+; RUN: grep xxpermdi < %t | count 36
+
+@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
+@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
+@vsi = global <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, align 16
+@vui = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@vsll = global <2 x i64> <i64 255, i64 -937>, align 16
+@vull = global <2 x i64> <i64 1447, i64 2894>, align 16
+@res_vsi = common global <4 x i32> zeroinitializer, align 16
+@res_vui = common global <4 x i32> zeroinitializer, align 16
+@res_vf = common global <4 x float> zeroinitializer, align 16
+@res_vsll = common global <2 x i64> zeroinitializer, align 16
+@res_vull = common global <2 x i64> zeroinitializer, align 16
+@res_vd = common global <2 x double> zeroinitializer, align 16
+
+define void @test1() {
+entry:
+; CHECK-LABEL: test1
+  %__a.addr.i31 = alloca i32, align 4
+  %__b.addr.i32 = alloca <4 x i32>*, align 8
+  %__a.addr.i29 = alloca i32, align 4
+  %__b.addr.i30 = alloca <4 x float>*, align 8
+  %__a.addr.i27 = alloca i32, align 4
+  %__b.addr.i28 = alloca <2 x i64>*, align 8
+  %__a.addr.i25 = alloca i32, align 4
+  %__b.addr.i26 = alloca <2 x i64>*, align 8
+  %__a.addr.i23 = alloca i32, align 4
+  %__b.addr.i24 = alloca <2 x double>*, align 8
+  %__a.addr.i20 = alloca <4 x i32>, align 16
+  %__b.addr.i21 = alloca i32, align 4
+  %__c.addr.i22 = alloca <4 x i32>*, align 8
+  %__a.addr.i17 = alloca <4 x i32>, align 16
+  %__b.addr.i18 = alloca i32, align 4
+  %__c.addr.i19 = alloca <4 x i32>*, align 8
+  %__a.addr.i14 = alloca <4 x float>, align 16
+  %__b.addr.i15 = alloca i32, align 4
+  %__c.addr.i16 = alloca <4 x float>*, align 8
+  %__a.addr.i11 = alloca <2 x i64>, align 16
+  %__b.addr.i12 = alloca i32, align 4
+  %__c.addr.i13 = alloca <2 x i64>*, align 8
+  %__a.addr.i8 = alloca <2 x i64>, align 16
+  %__b.addr.i9 = alloca i32, align 4
+  %__c.addr.i10 = alloca <2 x i64>*, align 8
+  %__a.addr.i6 = alloca <2 x double>, align 16
+  %__b.addr.i7 = alloca i32, align 4
+  %__c.addr.i = alloca <2 x double>*, align 8
+  %__a.addr.i = alloca i32, align 4
+  %__b.addr.i = alloca <4 x i32>*, align 8
+  store i32 0, i32* %__a.addr.i, align 4
+  store <4 x i32>* @vsi, <4 x i32>** %__b.addr.i, align 8
+  %0 = load i32* %__a.addr.i, align 4
+  %1 = load <4 x i32>** %__b.addr.i, align 8
+  %2 = bitcast <4 x i32>* %1 to i8*
+  %3 = getelementptr i8* %2, i32 %0
+  %4 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %3)
+  store <4 x i32> %4, <4 x i32>* @res_vsi, align 16
+  store i32 0, i32* %__a.addr.i31, align 4
+  store <4 x i32>* @vui, <4 x i32>** %__b.addr.i32, align 8
+  %5 = load i32* %__a.addr.i31, align 4
+  %6 = load <4 x i32>** %__b.addr.i32, align 8
+  %7 = bitcast <4 x i32>* %6 to i8*
+  %8 = getelementptr i8* %7, i32 %5
+  %9 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %8)
+  store <4 x i32> %9, <4 x i32>* @res_vui, align 16
+  store i32 0, i32* %__a.addr.i29, align 4
+  store <4 x float>* @vf, <4 x float>** %__b.addr.i30, align 8
+  %10 = load i32* %__a.addr.i29, align 4
+  %11 = load <4 x float>** %__b.addr.i30, align 8
+  %12 = bitcast <4 x float>* %11 to i8*
+  %13 = getelementptr i8* %12, i32 %10
+  %14 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %13)
+  %15 = bitcast <4 x i32> %14 to <4 x float>
+  store <4 x float> %15, <4 x float>* @res_vf, align 16
+  store i32 0, i32* %__a.addr.i27, align 4
+  store <2 x i64>* @vsll, <2 x i64>** %__b.addr.i28, align 8
+  %16 = load i32* %__a.addr.i27, align 4
+  %17 = load <2 x i64>** %__b.addr.i28, align 8
+  %18 = bitcast <2 x i64>* %17 to i8*
+  %19 = getelementptr i8* %18, i32 %16
+  %20 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %19)
+  %21 = bitcast <2 x double> %20 to <2 x i64>
+  store <2 x i64> %21, <2 x i64>* @res_vsll, align 16
+  store i32 0, i32* %__a.addr.i25, align 4
+  store <2 x i64>* @vull, <2 x i64>** %__b.addr.i26, align 8
+  %22 = load i32* %__a.addr.i25, align 4
+  %23 = load <2 x i64>** %__b.addr.i26, align 8
+  %24 = bitcast <2 x i64>* %23 to i8*
+  %25 = getelementptr i8* %24, i32 %22
+  %26 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %25)
+  %27 = bitcast <2 x double> %26 to <2 x i64>
+  store <2 x i64> %27, <2 x i64>* @res_vull, align 16
+  store i32 0, i32* %__a.addr.i23, align 4
+  store <2 x double>* @vd, <2 x double>** %__b.addr.i24, align 8
+  %28 = load i32* %__a.addr.i23, align 4
+  %29 = load <2 x double>** %__b.addr.i24, align 8
+  %30 = bitcast <2 x double>* %29 to i8*
+  %31 = getelementptr i8* %30, i32 %28
+  %32 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %31)
+  store <2 x double> %32, <2 x double>* @res_vd, align 16
+  %33 = load <4 x i32>* @vsi, align 16
+  store <4 x i32> %33, <4 x i32>* %__a.addr.i20, align 16
+  store i32 0, i32* %__b.addr.i21, align 4
+  store <4 x i32>* @res_vsi, <4 x i32>** %__c.addr.i22, align 8
+  %34 = load <4 x i32>* %__a.addr.i20, align 16
+  %35 = load i32* %__b.addr.i21, align 4
+  %36 = load <4 x i32>** %__c.addr.i22, align 8
+  %37 = bitcast <4 x i32>* %36 to i8*
+  %38 = getelementptr i8* %37, i32 %35
+  call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %34, i8* %38)
+  %39 = load <4 x i32>* @vui, align 16
+  store <4 x i32> %39, <4 x i32>* %__a.addr.i17, align 16
+  store i32 0, i32* %__b.addr.i18, align 4
+  store <4 x i32>* @res_vui, <4 x i32>** %__c.addr.i19, align 8
+  %40 = load <4 x i32>* %__a.addr.i17, align 16
+  %41 = load i32* %__b.addr.i18, align 4
+  %42 = load <4 x i32>** %__c.addr.i19, align 8
+  %43 = bitcast <4 x i32>* %42 to i8*
+  %44 = getelementptr i8* %43, i32 %41
+  call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %40, i8* %44)
+  %45 = load <4 x float>* @vf, align 16
+  store <4 x float> %45, <4 x float>* %__a.addr.i14, align 16
+  store i32 0, i32* %__b.addr.i15, align 4
+  store <4 x float>* @res_vf, <4 x float>** %__c.addr.i16, align 8
+  %46 = load <4 x float>* %__a.addr.i14, align 16
+  %47 = bitcast <4 x float> %46 to <4 x i32>
+  %48 = load i32* %__b.addr.i15, align 4
+  %49 = load <4 x float>** %__c.addr.i16, align 8
+  %50 = bitcast <4 x float>* %49 to i8*
+  %51 = getelementptr i8* %50, i32 %48
+  call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %47, i8* %51) #1
+  %52 = load <2 x i64>* @vsll, align 16
+  store <2 x i64> %52, <2 x i64>* %__a.addr.i11, align 16
+  store i32 0, i32* %__b.addr.i12, align 4
+  store <2 x i64>* @res_vsll, <2 x i64>** %__c.addr.i13, align 8
+  %53 = load <2 x i64>* %__a.addr.i11, align 16
+  %54 = bitcast <2 x i64> %53 to <2 x double>
+  %55 = load i32* %__b.addr.i12, align 4
+  %56 = load <2 x i64>** %__c.addr.i13, align 8
+  %57 = bitcast <2 x i64>* %56 to i8*
+  %58 = getelementptr i8* %57, i32 %55
+  call void @llvm.ppc.vsx.stxvd2x(<2 x double> %54, i8* %58)
+  %59 = load <2 x i64>* @vull, align 16
+  store <2 x i64> %59, <2 x i64>* %__a.addr.i8, align 16
+  store i32 0, i32* %__b.addr.i9, align 4
+  store <2 x i64>* @res_vull, <2 x i64>** %__c.addr.i10, align 8
+  %60 = load <2 x i64>* %__a.addr.i8, align 16
+  %61 = bitcast <2 x i64> %60 to <2 x double>
+  %62 = load i32* %__b.addr.i9, align 4
+  %63 = load <2 x i64>** %__c.addr.i10, align 8
+  %64 = bitcast <2 x i64>* %63 to i8*
+  %65 = getelementptr i8* %64, i32 %62
+  call void @llvm.ppc.vsx.stxvd2x(<2 x double> %61, i8* %65)
+  %66 = load <2 x double>* @vd, align 16
+  store <2 x double> %66, <2 x double>* %__a.addr.i6, align 16
+  store i32 0, i32* %__b.addr.i7, align 4
+  store <2 x double>* @res_vd, <2 x double>** %__c.addr.i, align 8
+  %67 = load <2 x double>* %__a.addr.i6, align 16
+  %68 = load i32* %__b.addr.i7, align 4
+  %69 = load <2 x double>** %__c.addr.i, align 8
+  %70 = bitcast <2 x double>* %69 to i8*
+  %71 = getelementptr i8* %70, i32 %68
+  call void @llvm.ppc.vsx.stxvd2x(<2 x double> %67, i8* %71)
+  ret void
+}
+
+declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)
+declare void @llvm.ppc.vsx.stxvw4x(<4 x i32>, i8*)
+declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
+declare <4 x i32> @llvm.ppc.vsx.lxvw4x(i8*)
diff --git a/test/CodeGen/PowerPC/vsx-ldst.ll b/test/CodeGen/PowerPC/vsx-ldst.ll
index 0c9ebef..688187d 100644
--- a/test/CodeGen/PowerPC/vsx-ldst.ll
+++ b/test/CodeGen/PowerPC/vsx-ldst.ll
@@ -3,6 +3,16 @@
 ; RUN: grep lxvd2x < %t | count 3
 ; RUN: grep stxvw4x < %t | count 3
 ; RUN: grep stxvd2x < %t | count 3
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -O0 -fast-isel=1 -mtriple=powerpc64-unknown-linux-gnu < %s > %t
+; RUN: grep lxvw4x < %t | count 3
+; RUN: grep lxvd2x < %t | count 3
+; RUN: grep stxvw4x < %t | count 3
+; RUN: grep stxvd2x < %t | count 3
+
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64le-unknown-linux-gnu < %s > %t
+; RUN: grep lxvd2x < %t | count 6
+; RUN: grep stxvd2x < %t | count 6
+; RUN: grep xxpermdi < %t | count 12
 
 @vsi = global <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, align 16
 @vui = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
diff --git a/test/CodeGen/PowerPC/vsx-p8.ll b/test/CodeGen/PowerPC/vsx-p8.ll
index 81406b6..d5a1905 100644
--- a/test/CodeGen/PowerPC/vsx-p8.ll
+++ b/test/CodeGen/PowerPC/vsx-p8.ll
@@ -1,4 +1,7 @@
 ; RUN: llc -mcpu=pwr8 -mattr=+power8-vector < %s | FileCheck %s
+; RUN: llc -mcpu=pwr8 -mattr=+power8-vector < %s | FileCheck -check-prefix=CHECK-REG %s
+; RUN: llc -mcpu=pwr8 -mattr=+power8-vector -fast-isel -O0 < %s | FileCheck %s
+; RUN: llc -mcpu=pwr8 -mattr=+power8-vector -fast-isel -O0 < %s | FileCheck -check-prefix=CHECK-FISL %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -26,17 +29,27 @@ define <4 x float> @test32u(<4 x float>* %a) {
   %v = load <4 x float>* %a, align 8
   ret <4 x float> %v
 
-; CHECK-LABEL: @test32u
-; CHECK: lxvw4x 34, 0, 3
-; CHECK: blr
+; CHECK-REG-LABEL: @test32u
+; CHECK-REG: lxvw4x 34, 0, 3
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test32u
+; CHECK-FISL: lxvw4x 0, 0, 3
+; CHECK-FISL: xxlor 34, 0, 0
+; CHECK-FISL: blr
 }
 
 define void @test33u(<4 x float>* %a, <4 x float> %b) {
   store <4 x float> %b, <4 x float>* %a, align 8
   ret void
 
-; CHECK-LABEL: @test33u
-; CHECK: stxvw4x 34, 0, 3
-; CHECK: blr
+; CHECK-REG-LABEL: @test33u
+; CHECK-REG: stxvw4x 34, 0, 3
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test33u
+; CHECK-FISL: vor 3, 2, 2
+; CHECK-FISL: stxvw4x 35, 0, 3
+; CHECK-FISL: blr
 }
 
diff --git a/test/CodeGen/PowerPC/vsx-self-copy.ll b/test/CodeGen/PowerPC/vsx-self-copy.ll
index 23615ca..787ac4b 100644
--- a/test/CodeGen/PowerPC/vsx-self-copy.ll
+++ b/test/CodeGen/PowerPC/vsx-self-copy.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mcpu=pwr7 -mattr=+vsx < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mattr=+vsx -fast-isel -O0 < %s | FileCheck %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/PowerPC/vsx-spill-norwstore.ll b/test/CodeGen/PowerPC/vsx-spill-norwstore.ll
new file mode 100644
index 0000000..a3c4aa5
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-spill-norwstore.ll
@@ -0,0 +1,63 @@
+; RUN: llc -mcpu=pwr7 -verify-machineinstrs < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@.str1 = external unnamed_addr constant [5 x i8], align 1
+@.str10 = external unnamed_addr constant [9 x i8], align 1
+
+; Function Attrs: nounwind
+define void @main() #0 {
+; CHECK-LABEL: @main
+; Make sure that the stxvd2x passes -verify-machineinstrs
+; CHECK: stxvd2x
+
+entry:
+  %0 = tail call <8 x i16> @llvm.ppc.altivec.vupkhsb(<16 x i8> <i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1>) #0
+  %1 = tail call <8 x i16> @llvm.ppc.altivec.vupklsb(<16 x i8> <i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1>) #0
+  br i1 false, label %if.then.i68.i, label %check.exit69.i
+
+if.then.i68.i:                                    ; preds = %entry
+  unreachable
+
+check.exit69.i:                                   ; preds = %entry
+  br i1 undef, label %if.then.i63.i, label %check.exit64.i
+
+if.then.i63.i:                                    ; preds = %check.exit69.i
+  tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str10, i64 0, i64 0), i8* getelementptr inbounds ([5 x i8]* @.str1, i64 0, i64 0)) #0
+  br label %check.exit64.i
+
+check.exit64.i:                                   ; preds = %if.then.i63.i, %check.exit69.i
+  %2 = tail call i32 @llvm.ppc.altivec.vcmpequh.p(i32 2, <8 x i16> %0, <8 x i16> <i16 0, i16 -1, i16 -1, i16 0, i16 0, i16 0, i16 -1, i16 0>) #0
+  %tobool.i55.i = icmp eq i32 %2, 0
+  br i1 %tobool.i55.i, label %if.then.i58.i, label %check.exit59.i
+
+if.then.i58.i:                                    ; preds = %check.exit64.i
+  unreachable
+
+check.exit59.i:                                   ; preds = %check.exit64.i
+  %3 = tail call i32 @llvm.ppc.altivec.vcmpequh.p(i32 2, <8 x i16> %1, <8 x i16> <i16 -1, i16 0, i16 0, i16 -1, i16 -1, i16 -1, i16 0, i16 -1>) #0
+  %tobool.i50.i = icmp eq i32 %3, 0
+  br i1 %tobool.i50.i, label %if.then.i53.i, label %check.exit54.i
+
+if.then.i53.i:                                    ; preds = %check.exit59.i
+  unreachable
+
+check.exit54.i:                                   ; preds = %check.exit59.i
+  unreachable
+}
+
+; Function Attrs: nounwind readnone
+declare <8 x i16> @llvm.ppc.altivec.vupkhsb(<16 x i8>) #1
+
+; Function Attrs: nounwind readnone
+declare <8 x i16> @llvm.ppc.altivec.vupklsb(<16 x i8>) #1
+
+; Function Attrs: nounwind
+declare void @printf(i8* nocapture readonly, ...) #0
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ppc.altivec.vcmpequh.p(i32, <8 x i16>, <8 x i16>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/vsx-spill.ll b/test/CodeGen/PowerPC/vsx-spill.ll
index 29bc6fc..032bcf6 100644
--- a/test/CodeGen/PowerPC/vsx-spill.ll
+++ b/test/CodeGen/PowerPC/vsx-spill.ll
@@ -1,4 +1,7 @@
 ; RUN: llc -mcpu=pwr7 -mattr=+vsx < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-REG %s
+; RUN: llc -mcpu=pwr7 -mattr=+vsx -fast-isel -O0 < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mattr=+vsx -fast-isel -O0 < %s | FileCheck -check-prefix=CHECK-FISL %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -7,10 +10,16 @@ entry:
   call void asm sideeffect "", "~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"() nounwind
   br label %return
 
-; CHECK: @foo1
-; CHECK: xxlor [[R1:[0-9]+]], 1, 1
-; CHECK: xxlor 1, [[R1]], [[R1]]
-; CHECK: blr
+; CHECK-REG: @foo1
+; CHECK-REG: xxlor [[R1:[0-9]+]], 1, 1
+; CHECK-REG: xxlor 1, [[R1]], [[R1]]
+; CHECK-REG: blr
+
+; CHECK-FISL: @foo1
+; CHECK-FISL: lis 0, -1
+; CHECK-FISL: ori 0, 0, 65384
+; CHECK-FISL: stxsdx 1, 1, 0
+; CHECK-FISL: blr
 
 return:                                           ; preds = %entry
   ret double %a
@@ -22,10 +31,16 @@ entry:
   call void asm sideeffect "", "~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"() nounwind
   br label %return
 
-; CHECK: @foo2
-; CHECK: {{xxlor|xsadddp}} [[R1:[0-9]+]], 1, 1
-; CHECK: {{xxlor|xsadddp}} 1, [[R1]], [[R1]]
-; CHECK: blr
+; CHECK-REG: @foo2
+; CHECK-REG: {{xxlor|xsadddp}} [[R1:[0-9]+]], 1, 1
+; CHECK-REG: {{xxlor|xsadddp}} 1, [[R1]], [[R1]]
+; CHECK-REG: blr
+
+; CHECK-FISL: @foo2
+; CHECK-FISL: xsadddp [[R1:[0-9]+]], 1, 1
+; CHECK-FISL: stxsdx [[R1]], [[R1]], 0
+; CHECK-FISL: lxsdx [[R1]], [[R1]], 0
+; CHECK-FISL: blr
 
 return:                                           ; preds = %entry
   ret double %b
diff --git a/test/CodeGen/PowerPC/vsx.ll b/test/CodeGen/PowerPC/vsx.ll
index 333b75a..f91ffdb 100644
--- a/test/CodeGen/PowerPC/vsx.ll
+++ b/test/CodeGen/PowerPC/vsx.ll
@@ -1,4 +1,7 @@
 ; RUN: llc -mcpu=pwr7 -mattr=+vsx < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-REG %s
+; RUN: llc -mcpu=pwr7 -mattr=+vsx -fast-isel -O0 < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mattr=+vsx -fast-isel -O0 < %s | FileCheck -check-prefix=CHECK-FISL %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -47,9 +50,16 @@ entry:
   %v = xor <4 x i32> %a, %b
   ret <4 x i32> %v
 
-; CHECK-LABEL: @test5
-; CHECK: xxlxor 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test5
+; CHECK-REG: xxlxor 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test5
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlxor 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: blr
 }
 
 define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
@@ -57,9 +67,16 @@ entry:
   %v = xor <8 x i16> %a, %b
   ret <8 x i16> %v
 
-; CHECK-LABEL: @test6
-; CHECK: xxlxor 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test6
+; CHECK-REG: xxlxor 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test6
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlxor 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: blr
 }
 
 define <16 x i8> @test7(<16 x i8> %a, <16 x i8> %b) {
@@ -67,9 +84,16 @@ entry:
   %v = xor <16 x i8> %a, %b
   ret <16 x i8> %v
 
-; CHECK-LABEL: @test7
-; CHECK: xxlxor 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test7
+; CHECK-REG: xxlxor 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test7
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlxor 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: blr
 }
 
 define <4 x i32> @test8(<4 x i32> %a, <4 x i32> %b) {
@@ -77,9 +101,16 @@ entry:
   %v = or <4 x i32> %a, %b
   ret <4 x i32> %v
 
-; CHECK-LABEL: @test8
-; CHECK: xxlor 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test8
+; CHECK-REG: xxlor 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test8
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlor 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: blr
 }
 
 define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
@@ -87,9 +118,16 @@ entry:
   %v = or <8 x i16> %a, %b
   ret <8 x i16> %v
 
-; CHECK-LABEL: @test9
-; CHECK: xxlor 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test9
+; CHECK-REG: xxlor 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test9
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlor 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: blr
 }
 
 define <16 x i8> @test10(<16 x i8> %a, <16 x i8> %b) {
@@ -97,9 +135,16 @@ entry:
   %v = or <16 x i8> %a, %b
   ret <16 x i8> %v
 
-; CHECK-LABEL: @test10
-; CHECK: xxlor 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test10
+; CHECK-REG: xxlor 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test10
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlor 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: blr
 }
 
 define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
@@ -107,9 +152,16 @@ entry:
   %v = and <4 x i32> %a, %b
   ret <4 x i32> %v
 
-; CHECK-LABEL: @test11
-; CHECK: xxland 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test11
+; CHECK-REG: xxland 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test11
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxland 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: blr
 }
 
 define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) {
@@ -117,9 +169,16 @@ entry:
   %v = and <8 x i16> %a, %b
   ret <8 x i16> %v
 
-; CHECK-LABEL: @test12
-; CHECK: xxland 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test12
+; CHECK-REG: xxland 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test12
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxland 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: blr
 }
 
 define <16 x i8> @test13(<16 x i8> %a, <16 x i8> %b) {
@@ -127,9 +186,16 @@ entry:
   %v = and <16 x i8> %a, %b
   ret <16 x i8> %v
 
-; CHECK-LABEL: @test13
-; CHECK: xxland 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test13
+; CHECK-REG: xxland 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test13
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxland 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: blr
 }
 
 define <4 x i32> @test14(<4 x i32> %a, <4 x i32> %b) {
@@ -138,9 +204,23 @@ entry:
   %w = xor <4 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1>
   ret <4 x i32> %w
 
-; CHECK-LABEL: @test14
-; CHECK: xxlnor 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test14
+; CHECK-REG: xxlnor 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test14
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlor 36, 36, 37
+; CHECK-FISL: vor 0, 4, 4
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlnor 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: lis 0, -1
+; CHECK-FISL: ori 0, 0, 65520
+; CHECK-FISL: stvx 0, 1, 0
+; CHECK-FISL: blr
 }
 
 define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
@@ -149,9 +229,23 @@ entry:
   %w = xor <8 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
   ret <8 x i16> %w
 
-; CHECK-LABEL: @test15
-; CHECK: xxlnor 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test15
+; CHECK-REG: xxlnor 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test15
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlor 36, 36, 37
+; CHECK-FISL: vor 0, 4, 4
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlnor 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: lis 0, -1
+; CHECK-FISL: ori 0, 0, 65520
+; CHECK-FISL: stvx 0, 1, 0
+; CHECK-FISL: blr
 }
 
 define <16 x i8> @test16(<16 x i8> %a, <16 x i8> %b) {
@@ -160,9 +254,23 @@ entry:
   %w = xor <16 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
   ret <16 x i8> %w
 
-; CHECK-LABEL: @test16
-; CHECK: xxlnor 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test16
+; CHECK-REG: xxlnor 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test16
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlor 36, 36, 37
+; CHECK-FISL: vor 0, 4, 4
+; CHECK-FISL: vor 4, 2, 2
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: xxlnor 36, 36, 37
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: lis 0, -1
+; CHECK-FISL: ori 0, 0, 65520
+; CHECK-FISL: stvx 0, 1, 0
+; CHECK-FISL: blr
 }
 
 define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
@@ -171,9 +279,21 @@ entry:
   %v = and <4 x i32> %a, %w
   ret <4 x i32> %v
 
-; CHECK-LABEL: @test17
-; CHECK: xxlandc 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test17
+; CHECK-REG: xxlandc 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test17
+; CHECK-FISL: vspltisb 4, -1
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: vor 0, 4, 4
+; CHECK-FISL: xxlxor 37, 37, 32
+; CHECK-FISL: vor 3, 5, 5
+; CHECK-FISL: vor 5, 2, 2
+; CHECK-FISL: vor 0, 3, 3
+; CHECK-FISL: xxland 37, 37, 32
+; CHECK-FISL: vor 2, 5, 5
+; CHECK-FISL: blr
 }
 
 define <8 x i16> @test18(<8 x i16> %a, <8 x i16> %b) {
@@ -182,9 +302,24 @@ entry:
   %v = and <8 x i16> %a, %w
   ret <8 x i16> %v
 
-; CHECK-LABEL: @test18
-; CHECK: xxlandc 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test18
+; CHECK-REG: xxlandc 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test18
+; CHECK-FISL: vspltisb 4, -1
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: vor 0, 4, 4
+; CHECK-FISL: xxlxor 37, 37, 32
+; CHECK-FISL: vor 4, 5, 5
+; CHECK-FISL: vor 5, 2, 2
+; CHECK-FISL: vor 0, 3, 3
+; CHECK-FISL: xxlandc 37, 37, 32
+; CHECK-FISL: vor 2, 5, 5
+; CHECK-FISL: lis 0, -1
+; CHECK-FISL: ori 0, 0, 65520
+; CHECK-FISL: stvx 4, 1, 0
+; CHECK-FISL: blr
 }
 
 define <16 x i8> @test19(<16 x i8> %a, <16 x i8> %b) {
@@ -193,9 +328,24 @@ entry:
   %v = and <16 x i8> %a, %w
   ret <16 x i8> %v
 
-; CHECK-LABEL: @test19
-; CHECK: xxlandc 34, 34, 35
-; CHECK: blr
+; CHECK-REG-LABEL: @test19
+; CHECK-REG: xxlandc 34, 34, 35
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test19
+; CHECK-FISL: vspltisb 4, -1
+; CHECK-FISL: vor 5, 3, 3
+; CHECK-FISL: vor 0, 4, 4
+; CHECK-FISL: xxlxor 37, 37, 32
+; CHECK-FISL: vor 4, 5, 5
+; CHECK-FISL: vor 5, 2, 2
+; CHECK-FISL: vor 0, 3, 3
+; CHECK-FISL: xxlandc 37, 37, 32
+; CHECK-FISL: vor 2, 5, 5
+; CHECK-FISL: lis 0, -1
+; CHECK-FISL: ori 0, 0, 65520
+; CHECK-FISL: stvx 4, 1, 0
+; CHECK-FISL: blr
 }
 
 define <4 x i32> @test20(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
@@ -204,10 +354,19 @@ entry:
   %v = select <4 x i1> %m, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %v
 
-; CHECK-LABEL: @test20
-; CHECK: vcmpequw {{[0-9]+}}, 4, 5
-; CHECK: xxsel 34, 35, 34, {{[0-9]+}}
-; CHECK: blr
+; CHECK-REG-LABEL: @test20
+; CHECK-REG: vcmpequw {{[0-9]+}}, 4, 5
+; CHECK-REG: xxsel 34, 35, 34, {{[0-9]+}}
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test20
+; CHECK-FISL: vcmpequw 4, 4, 5
+; CHECK-FISL: vor 0, 3, 3
+; CHECK-FISL: vor 1, 2, 2
+; CHECK-FISL: vor 6, 4, 4
+; CHECK-FISL: xxsel 32, 32, 33, 38
+; CHECK-FISL: vor 2, 0, 0
+; CHECK-FISL: blr
 }
 
 define <4 x float> @test21(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
@@ -216,10 +375,20 @@ entry:
   %v = select <4 x i1> %m, <4 x float> %a, <4 x float> %b
   ret <4 x float> %v
 
-; CHECK-LABEL: @test21
-; CHECK: xvcmpeqsp [[V1:[0-9]+]], 36, 37
-; CHECK: xxsel 34, 35, 34, [[V1]]
-; CHECK: blr
+; CHECK-REG-LABEL: @test21
+; CHECK-REG: xvcmpeqsp [[V1:[0-9]+]], 36, 37
+; CHECK-REG: xxsel 34, 35, 34, [[V1]]
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test21
+; CHECK-FISL: vor 0, 5, 5
+; CHECK-FISL: vor 1, 4, 4
+; CHECK-FISL: vor 6, 3, 3
+; CHECK-FISL: vor 7, 2, 2
+; CHECK-FISL: xvcmpeqsp 32, 33, 32
+; CHECK-FISL: xxsel 32, 38, 39, 32
+; CHECK-FISL: vor 2, 0, 0
+; CHECK-FISL: blr
 }
 
 define <4 x float> @test22(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
@@ -228,16 +397,27 @@ entry:
   %v = select <4 x i1> %m, <4 x float> %a, <4 x float> %b
   ret <4 x float> %v
 
-; CHECK-LABEL: @test22
-; CHECK-DAG: xvcmpeqsp {{[0-9]+}}, 37, 37
-; CHECK-DAG: xvcmpeqsp {{[0-9]+}}, 36, 36
-; CHECK-DAG: xvcmpeqsp {{[0-9]+}}, 36, 37
-; CHECK-DAG: xxlnor
-; CHECK-DAG: xxlnor
-; CHECK-DAG: xxlor
-; CHECK-DAG: xxlor
-; CHECK: xxsel 34, 35, 34, {{[0-9]+}}
-; CHECK: blr
+; CHECK-REG-LABEL: @test22
+; CHECK-REG-DAG: xvcmpeqsp {{[0-9]+}}, 37, 37
+; CHECK-REG-DAG: xvcmpeqsp {{[0-9]+}}, 36, 36
+; CHECK-REG-DAG: xvcmpeqsp {{[0-9]+}}, 36, 37
+; CHECK-REG-DAG: xxlnor
+; CHECK-REG-DAG: xxlnor
+; CHECK-REG-DAG: xxlor
+; CHECK-REG-DAG: xxlor
+; CHECK-REG: xxsel 34, 35, 34, {{[0-9]+}}
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test22
+; CHECK-FISL-DAG: xvcmpeqsp {{[0-9]+}}, 33, 32
+; CHECK-FISL-DAG: xvcmpeqsp {{[0-9]+}}, 32, 32
+; CHECK-FISL-DAG: xvcmpeqsp {{[0-9]+}}, 33, 33
+; CHECK-FISL-DAG: xxlnor
+; CHECK-FISL-DAG: xxlnor
+; CHECK-FISL-DAG: xxlor
+; CHECK-FISL-DAG: xxlor
+; CHECK-FISL: xxsel 0, 38, 39, {{[0-9]+}}
+; CHECK-FISL: blr
 }
 
 define <8 x i16> @test23(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
@@ -246,10 +426,19 @@ entry:
   %v = select <8 x i1> %m, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %v
 
-; CHECK-LABEL: @test23
-; CHECK: vcmpequh {{[0-9]+}}, 4, 5
-; CHECK: xxsel 34, 35, 34, {{[0-9]+}}
-; CHECK: blr
+; CHECK-REG-LABEL: @test23
+; CHECK-REG: vcmpequh {{[0-9]+}}, 4, 5
+; CHECK-REG: xxsel 34, 35, 34, {{[0-9]+}}
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test23
+; CHECK-FISL: vcmpequh 4, 4, 5
+; CHECK-FISL: vor 0, 3, 3
+; CHECK-FISL: vor 1, 2, 2
+; CHECK-FISL: vor 6, 4, 4
+; CHECK-FISL: xxsel 32, 32, 33, 38
+; CHECK-FISL: vor 2, 0, 
+; CHECK-FISL: blr
 }
 
 define <16 x i8> @test24(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
@@ -258,10 +447,19 @@ entry:
   %v = select <16 x i1> %m, <16 x i8> %a, <16 x i8> %b
   ret <16 x i8> %v
 
-; CHECK-LABEL: @test24
-; CHECK: vcmpequb {{[0-9]+}}, 4, 5
-; CHECK: xxsel 34, 35, 34, {{[0-9]+}}
-; CHECK: blr
+; CHECK-REG-LABEL: @test24
+; CHECK-REG: vcmpequb {{[0-9]+}}, 4, 5
+; CHECK-REG: xxsel 34, 35, 34, {{[0-9]+}}
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test24
+; CHECK-FISL: vcmpequb 4, 4, 5
+; CHECK-FISL: vor 0, 3, 3
+; CHECK-FISL: vor 1, 2, 2
+; CHECK-FISL: vor 6, 4, 4
+; CHECK-FISL: xxsel 32, 32, 33, 38
+; CHECK-FISL: vor 2, 0, 0
+; CHECK-FISL: blr
 }
 
 define <2 x double> @test25(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
@@ -342,9 +540,16 @@ define <2 x i64> @test30(<2 x i64>* %a) {
   %v = load <2 x i64>* %a, align 16
   ret <2 x i64> %v
 
-; CHECK-LABEL: @test30
-; CHECK: lxvd2x 34, 0, 3
-; CHECK: blr
+; CHECK-REG-LABEL: @test30
+; CHECK-REG: lxvd2x 34, 0, 3
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test30
+; CHECK-FISL: lxvd2x 0, 0, 3
+; CHECK-FISL: xxlor 34, 0, 0
+; CHECK-FISL: vor 3, 2, 2
+; CHECK-FISL: vor 2, 3, 3
+; CHECK-FISL: blr
 }
 
 define void @test31(<2 x i64>* %a, <2 x i64> %b) {
@@ -360,18 +565,28 @@ define <4 x float> @test32(<4 x float>* %a) {
   %v = load <4 x float>* %a, align 16
   ret <4 x float> %v
 
-; CHECK-LABEL: @test32
-; CHECK: lxvw4x 34, 0, 3
-; CHECK: blr
+; CHECK-REG-LABEL: @test32
+; CHECK-REG: lxvw4x 34, 0, 3
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test32
+; CHECK-FISL: lxvw4x 0, 0, 3
+; CHECK-FISL: xxlor 34, 0, 0
+; CHECK-FISL: blr
 }
 
 define void @test33(<4 x float>* %a, <4 x float> %b) {
   store <4 x float> %b, <4 x float>* %a, align 16
   ret void
 
-; CHECK-LABEL: @test33
-; CHECK: stxvw4x 34, 0, 3
-; CHECK: blr
+; CHECK-REG-LABEL: @test33
+; CHECK-REG: stxvw4x 34, 0, 3
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test33
+; CHECK-FISL: vor 3, 2, 2
+; CHECK-FISL: stxvw4x 35, 0, 3
+; CHECK-FISL: blr
 }
 
 define <4 x float> @test32u(<4 x float>* %a) {
@@ -390,27 +605,44 @@ define void @test33u(<4 x float>* %a, <4 x float> %b) {
   store <4 x float> %b, <4 x float>* %a, align 8
   ret void
 
-; CHECK-LABEL: @test33u
-; CHECK: stxvw4x 34, 0, 3
-; CHECK: blr
+; CHECK-REG-LABEL: @test33u
+; CHECK-REG: stxvw4x 34, 0, 3
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test33u
+; CHECK-FISL: vor 3, 2, 2
+; CHECK-FISL: stxvw4x 35, 0, 3
+; CHECK-FISL: blr
 }
 
 define <4 x i32> @test34(<4 x i32>* %a) {
   %v = load <4 x i32>* %a, align 16
   ret <4 x i32> %v
 
-; CHECK-LABEL: @test34
-; CHECK: lxvw4x 34, 0, 3
-; CHECK: blr
+; CHECK-REG-LABEL: @test34
+; CHECK-REG: lxvw4x 34, 0, 3
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test34
+; CHECK-FISL: lxvw4x 0, 0, 3
+; CHECK-FISL: xxlor 34, 0, 0
+; CHECK-FISL: vor 3, 2, 2
+; CHECK-FISL: vor 2, 3, 3
+; CHECK-FISL: blr
 }
 
 define void @test35(<4 x i32>* %a, <4 x i32> %b) {
   store <4 x i32> %b, <4 x i32>* %a, align 16
   ret void
 
-; CHECK-LABEL: @test35
-; CHECK: stxvw4x 34, 0, 3
-; CHECK: blr
+; CHECK-REG-LABEL: @test35
+; CHECK-REG: stxvw4x 34, 0, 3
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test35
+; CHECK-FISL: vor 3, 2, 2
+; CHECK-FISL: stxvw4x 35, 0, 3
+; CHECK-FISL: blr
 }
 
 define <2 x double> @test40(<2 x i64> %a) {
@@ -596,37 +828,60 @@ define double @test63(<2 x double> %a) {
   %v = extractelement <2 x double> %a, i32 0
   ret double %v
 
-; CHECK-LABEL: @test63
-; CHECK: xxlor 1, 34, 34
-; CHECK: blr
+; CHECK-REG-LABEL: @test63
+; CHECK-REG: xxlor 1, 34, 34
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test63
+; CHECK-FISL: xxlor 0, 34, 34
+; CHECK-FISL: fmr 1, 0
+; CHECK-FISL: blr
 }
 
 define double @test64(<2 x double> %a) {
   %v = extractelement <2 x double> %a, i32 1
   ret double %v
 
-; CHECK-LABEL: @test64
-; CHECK: xxpermdi 1, 34, 34, 2
-; CHECK: blr
+; CHECK-REG-LABEL: @test64
+; CHECK-REG: xxpermdi 1, 34, 34, 2
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test64
+; CHECK-FISL: xxpermdi 34, 34, 34, 2
+; CHECK-FISL: xxlor 0, 34, 34
+; CHECK-FISL: fmr 1, 0
+; CHECK-FISL: blr
 }
 
 define <2 x i1> @test65(<2 x i64> %a, <2 x i64> %b) {
   %w = icmp eq <2 x i64> %a, %b
   ret <2 x i1> %w
 
-; CHECK-LABEL: @test65
-; CHECK: vcmpequw 2, 2, 3
-; CHECK: blr
+; CHECK-REG-LABEL: @test65
+; CHECK-REG: vcmpequw 2, 2, 3
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test65
+; CHECK-FISL: vor 4, 3, 3
+; CHECK-FISL: vor 5, 2, 2
+; CHECK-FISL: vcmpequw 4, 5, 4
+; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: blr
 }
 
 define <2 x i1> @test66(<2 x i64> %a, <2 x i64> %b) {
   %w = icmp ne <2 x i64> %a, %b
   ret <2 x i1> %w
 
-; CHECK-LABEL: @test66
-; CHECK: vcmpequw {{[0-9]+}}, 2, 3
-; CHECK: xxlnor 34, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: blr
+; CHECK-REG-LABEL: @test66
+; CHECK-REG: vcmpequw {{[0-9]+}}, 2, 3
+; CHECK-REG: xxlnor 34, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test66
+; CHECK-FISL: vcmpequw {{[0-9]+}}, 5, 4
+; CHECK-FISL: xxlnor 34, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-FISL: blr
 }
 
 define <2 x i1> @test67(<2 x i64> %a, <2 x i64> %b) {
@@ -660,7 +915,7 @@ define <2 x double> @test69(<2 x i16> %a) {
 ; CHECK-LABEL: @test69
 ; CHECK: vspltisw [[V1:[0-9]+]], 8
 ; CHECK: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
-; CHECK: vslw [[V3:[0-9]+]], 2, [[V2]]
+; CHECK: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]]
 ; CHECK: vsraw {{[0-9]+}}, [[V3]], [[V2]]
 ; CHECK: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
 ; CHECK: xvcvsxwdp 34, [[V4]]
@@ -674,7 +929,7 @@ define <2 x double> @test70(<2 x i8> %a) {
 ; CHECK-LABEL: @test70
 ; CHECK: vspltisw [[V1:[0-9]+]], 12
 ; CHECK: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
-; CHECK: vslw [[V3:[0-9]+]], 2, [[V2]]
+; CHECK: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]]
 ; CHECK: vsraw {{[0-9]+}}, [[V3]], [[V2]]
 ; CHECK: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
 ; CHECK: xvcvsxwdp 34, [[V4]]
@@ -687,15 +942,24 @@ define <2 x i32> @test80(i32 %v) {
   %i = add <2 x i32> %b2, <i32 2, i32 3>
   ret <2 x i32> %i
 
-; CHECK-LABEL: @test80
-; CHECK-DAG: addi [[R1:[0-9]+]], 3, 3
-; CHECK-DAG: addi [[R2:[0-9]+]], 1, -16
-; CHECK-DAG: addi [[R3:[0-9]+]], 3, 2
-; CHECK: std [[R1]], -8(1)
-; CHECK: std [[R3]], -16(1)
-; CHECK: lxvd2x 34, 0, [[R2]]
-; CHECK-NOT: stxvd2x
-; CHECK: blr
+; CHECK-REG-LABEL: @test80
+; CHECK-REG-DAG: addi [[R1:[0-9]+]], 3, 3
+; CHECK-REG-DAG: addi [[R2:[0-9]+]], 1, -16
+; CHECK-REG-DAG: addi [[R3:[0-9]+]], 3, 2
+; CHECK-REG: std [[R1]], -8(1)
+; CHECK-REG: std [[R3]], -16(1)
+; CHECK-REG: lxvd2x 34, 0, [[R2]]
+; CHECK-REG-NOT: stxvd2x
+; CHECK-REG: blr
+
+; CHECK-FISL-LABEL: @test80
+; CHECK-FISL-DAG: addi [[R1:[0-9]+]], 3, 3
+; CHECK-FISL-DAG: addi [[R2:[0-9]+]], 1, -16
+; CHECK-FISL-DAG: addi [[R3:[0-9]+]], 3, 2
+; CHECK-FISL-DAG: std [[R1]], -8(1)
+; CHECK-FISL-DAG: std [[R3]], -16(1)
+; CHECK-FISL-DAG: lxvd2x 0, 0, [[R2]]
+; CHECK-FISL: blr
 }
 
 define <2 x double> @test81(<4 x float> %b) {
@@ -712,8 +976,11 @@ entry:
   %v = select i1 %m, double %a, double %b
   ret double %v
 
-; CHECK-LABEL: @test82
-; CHECK: xscmpudp [[REG:[0-9]+]], 3, 4
-; CHECK: beqlr [[REG]]
-}
+; CHECK-REG-LABEL: @test82
+; CHECK-REG: xscmpudp [[REG:[0-9]+]], 3, 4
+; CHECK-REG: beqlr [[REG]]
 
+; CHECK-FISL-LABEL: @test82
+; CHECK-FISL: xscmpudp [[REG:[0-9]+]], 3, 4
+; CHECK-FISL: beq [[REG]], {{.*}}
+}
diff --git a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll
new file mode 100644
index 0000000..0a9df37
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+define <2 x double> @testi0(<2 x double>* %p1, double* %p2) {
+  %v = load <2 x double>* %p1
+  %s = load double* %p2
+  %r = insertelement <2 x double> %v, double %s, i32 0
+  ret <2 x double> %r
+
+; CHECK-LABEL: testi0
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: lxsdx 34, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 1, 34, 34, 0
+; CHECK: xxpermdi 34, 0, 1, 1
+}
+
+define <2 x double> @testi1(<2 x double>* %p1, double* %p2) {
+  %v = load <2 x double>* %p1
+  %s = load double* %p2
+  %r = insertelement <2 x double> %v, double %s, i32 1
+  ret <2 x double> %r
+
+; CHECK-LABEL: testi1
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: lxsdx 34, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 1, 34, 34, 0
+; CHECK: xxpermdi 34, 1, 0, 3
+}
+
+define double @teste0(<2 x double>* %p1) {
+  %v = load <2 x double>* %p1
+  %r = extractelement <2 x double> %v, i32 0
+  ret double %r
+
+; FIXME: Swap optimization will collapse this into lxvd2x 1, 0, 3.
+
+; CHECK-LABEL: teste0
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 1, 0, 0, 2
+}
+
+define double @teste1(<2 x double>* %p1) {
+  %v = load <2 x double>* %p1
+  %r = extractelement <2 x double> %v, i32 1
+  ret double %r
+
+; CHECK-LABEL: teste1
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: xxpermdi 1, 0, 0, 2
+}
diff --git a/test/CodeGen/PowerPC/vsx_shuffle_le.ll b/test/CodeGen/PowerPC/vsx_shuffle_le.ll
new file mode 100644
index 0000000..588cfda
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx_shuffle_le.ll
@@ -0,0 +1,207 @@
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+define <2 x double> @test00(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 0, i32 0>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: test00
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 34, 0, 0, 3
+}
+
+define <2 x double> @test01(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 0, i32 1>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: test01
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: xxpermdi 34, 0, 0, 2
+}
+
+define <2 x double> @test02(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 0, i32 2>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test02
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: lxvd2x 1, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 1, 1, 1, 2
+; CHECK: xxpermdi 34, 1, 0, 3
+}
+
+define <2 x double> @test03(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 0, i32 3>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test03
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: lxvd2x 1, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 1, 1, 1, 2
+; CHECK: xxpermdi 34, 1, 0, 1
+}
+
+define <2 x double> @test10(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 1, i32 0>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test10
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 34, 0, 0, 2
+}
+
+define <2 x double> @test11(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 1, i32 1>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test11
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 34, 0, 0, 0
+}
+
+define <2 x double> @test12(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 1, i32 2>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test12
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: lxvd2x 1, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 1, 1, 1, 2
+; CHECK: xxpermdi 34, 1, 0, 2
+}
+
+define <2 x double> @test13(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 1, i32 3>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test13
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: lxvd2x 1, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 1, 1, 1, 2
+; CHECK: xxpermdi 34, 1, 0, 0
+}
+
+define <2 x double> @test20(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 2, i32 0>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test20
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: lxvd2x 1, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 1, 1, 1, 2
+; CHECK: xxpermdi 34, 0, 1, 3
+}
+
+define <2 x double> @test21(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 2, i32 1>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test21
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: lxvd2x 1, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 1, 1, 1, 2
+; CHECK: xxpermdi 34, 0, 1, 1
+}
+
+define <2 x double> @test22(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 2, i32 2>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test22
+; CHECK: lxvd2x 0, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 34, 0, 0, 3
+}
+
+define <2 x double> @test23(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 2, i32 3>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test23
+; CHECK: lxvd2x 0, 0, 4
+; CHECK: xxpermdi 34, 0, 0, 2
+}
+
+define <2 x double> @test30(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 3, i32 0>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test30
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: lxvd2x 1, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 1, 1, 1, 2
+; CHECK: xxpermdi 34, 0, 1, 2
+}
+
+define <2 x double> @test31(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 3, i32 1>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test31
+; CHECK: lxvd2x 0, 0, 3
+; CHECK: lxvd2x 1, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 1, 1, 1, 2
+; CHECK: xxpermdi 34, 0, 1, 0
+}
+
+define <2 x double> @test32(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 3, i32 2>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test32
+; CHECK: lxvd2x 0, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 34, 0, 0, 2
+}
+
+define <2 x double> @test33(<2 x double>* %p1, <2 x double>* %p2) {
+  %v1 = load <2 x double>* %p1
+  %v2 = load <2 x double>* %p2
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <2 x i32> < i32 3, i32 3>
+  ret <2 x double> %v3
+
+; CHECK-LABEL: @test33
+; CHECK: lxvd2x 0, 0, 4
+; CHECK: xxpermdi 0, 0, 0, 2
+; CHECK: xxpermdi 34, 0, 0, 0
+}
diff --git a/test/CodeGen/PowerPC/xxleqv_xxlnand_xxlorc.ll b/test/CodeGen/PowerPC/xxleqv_xxlnand_xxlorc.ll
new file mode 100644
index 0000000..4d929c6
--- /dev/null
+++ b/test/CodeGen/PowerPC/xxleqv_xxlnand_xxlorc.ll
@@ -0,0 +1,52 @@
+; Check the miscellaneous logical vector operations added in P8
+; 
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; Test x eqv y
+define <4 x i32> @test_xxleqv(<4 x i32> %x, <4 x i32> %y) nounwind {
+       %tmp = xor <4 x i32> %x, %y
+       %ret_val = xor <4 x i32> %tmp, < i32 -1, i32 -1, i32 -1, i32 -1>
+       ret <4 x i32> %ret_val
+; CHECK: xxleqv 34, 34, 35
+}
+
+; Test x xxlnand y
+define <4 x i32> @test_xxlnand(<4 x i32> %x, <4 x i32> %y) nounwind {
+       %tmp = and <4 x i32> %x, %y
+       %ret_val = xor <4 x i32> %tmp, <i32 -1, i32 -1, i32 -1, i32 -1>
+       ret <4 x i32> %ret_val
+; CHECK: xxlnand 34, 34, 35
+}
+
+; Test x xxlorc y
+define <4 x i32> @test_xxlorc(<4 x i32> %x, <4 x i32> %y) nounwind {
+       %tmp = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+       %ret_val = or <4 x i32> %x, %tmp
+       ret <4 x i32> %ret_val
+; CHECK: xxlorc 34, 34, 35
+}
+
+; Test x eqv y
+define <8 x i16> @test_xxleqvv8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
+       %tmp = xor <8 x i16> %x, %y
+       %ret_val = xor <8 x i16> %tmp, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+       ret <8 x i16> %ret_val
+; CHECK: xxleqv 34, 34, 35
+}
+
+; Test x xxlnand y
+define <8 x i16> @test_xxlnandv8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
+       %tmp = and <8 x i16> %x, %y
+       %ret_val = xor <8 x i16> %tmp, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+       ret <8 x i16> %ret_val
+; CHECK: xxlnand 34, 34, 35
+}
+
+; Test x xxlorc y
+define <8 x i16> @test_xxlorcv8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
+       %tmp = xor <8 x i16> %y, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+       %ret_val = or <8 x i16> %x, %tmp
+       ret <8 x i16> %ret_val
+; CHECK: xxlorc 34, 34, 35
+}
+
diff --git a/test/CodeGen/PowerPC/zext-free.ll b/test/CodeGen/PowerPC/zext-free.ll
new file mode 100644
index 0000000..080dbaa
--- /dev/null
+++ b/test/CodeGen/PowerPC/zext-free.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: noreturn nounwind
+define signext i32 @_Z1fRPc(i8** nocapture dereferenceable(8) %p) #0 {
+entry:
+  %.pre = load i8** %p, align 8
+  br label %loop
+
+loop:                                             ; preds = %loop.backedge, %entry
+  %0 = phi i8* [ %.pre, %entry ], [ %.be, %loop.backedge ]
+  %1 = load i8* %0, align 1
+  %tobool = icmp eq i8 %1, 0
+  %incdec.ptr = getelementptr inbounds i8* %0, i64 1
+  store i8* %incdec.ptr, i8** %p, align 8
+  %2 = load i8* %incdec.ptr, align 1
+  %tobool2 = icmp ne i8 %2, 0
+  %or.cond = and i1 %tobool, %tobool2
+  br i1 %or.cond, label %if.then3, label %loop.backedge
+
+if.then3:                                         ; preds = %loop
+  %incdec.ptr4 = getelementptr inbounds i8* %0, i64 2
+  store i8* %incdec.ptr4, i8** %p, align 8
+  br label %loop.backedge
+
+loop.backedge:                                    ; preds = %if.then3, %loop
+  %.be = phi i8* [ %incdec.ptr4, %if.then3 ], [ %incdec.ptr, %loop ]
+  br label %loop
+
+; CHECK-LABEL: @_Z1fRPc
+; CHECK-NOT: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 31
+; CHECK-NOT: clrlwi {{[0-9]+}}, {{[0-9]+}}, 24
+}
+
+attributes #0 = { noreturn nounwind }
+
diff --git a/test/CodeGen/R600/128bit-kernel-args.ll b/test/CodeGen/R600/128bit-kernel-args.ll
index d9b0ff2..557d86a 100644
--- a/test/CodeGen/R600/128bit-kernel-args.ll
+++ b/test/CodeGen/R600/128bit-kernel-args.ll
@@ -1,26 +1,27 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
 
-; R600-CHECK: {{^}}v4i32_kernel_arg:
-; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
-; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
-; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
-; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
-; SI-CHECK: {{^}}v4i32_kernel_arg:
-; SI-CHECK: buffer_store_dwordx4
+; R600: {{^}}v4i32_kernel_arg:
+; R600-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
+; R600-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
+; R600-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
+; R600-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
+; SI: {{^}}v4i32_kernel_arg:
+; SI: buffer_store_dwordx4
 define void @v4i32_kernel_arg(<4 x i32> addrspace(1)* %out, <4 x i32>  %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK: {{^}}v4f32_kernel_arg:
-; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
-; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
-; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
-; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
-; SI-CHECK: {{^}}v4f32_kernel_arg:
-; SI-CHECK: buffer_store_dwordx4
+; R600: {{^}}v4f32_kernel_arg:
+; R600-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
+; R600-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
+; R600-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
+; R600-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
+; SI: {{^}}v4f32_kernel_arg:
+; SI: buffer_store_dwordx4
 define void @v4f32_kernel_arg(<4 x float> addrspace(1)* %out, <4 x float>  %in) {
 entry:
   store <4 x float> %in, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/R600/32-bit-local-address-space.ll b/test/CodeGen/R600/32-bit-local-address-space.ll
index 4ff2762..6aca826 100644
--- a/test/CodeGen/R600/32-bit-local-address-space.ll
+++ b/test/CodeGen/R600/32-bit-local-address-space.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and
 ; the global address space(1) uses 64-bit pointers.  These tests check to make sure
@@ -130,7 +131,7 @@ define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %v
 ; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store:
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: ds_write_b32 [[VPTR]], v{{[0-9]+}} [M0]{{$}}
+; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
 define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
   %gep = getelementptr i32 addrspace(3)* %out, i32 16385
   store i32 %val, i32 addrspace(3)* %gep, align 4
diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
index cf4e055..2e08901 100644
--- a/test/CodeGen/R600/64bit-kernel-args.ll
+++ b/test/CodeGen/R600/64bit-kernel-args.ll
@@ -1,9 +1,12 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=VI
 
-; SI-CHECK: {{^}}f64_kernel_arg:
-; SI-CHECK-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
-; SI-CHECK-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
-; SI-CHECK: buffer_store_dwordx2
+; GCN: {{^}}f64_kernel_arg:
+; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
+; GCN: buffer_store_dwordx2
 define void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
 entry:
   store double %in, double addrspace(1)* %out
diff --git a/test/CodeGen/R600/add-debug.ll b/test/CodeGen/R600/add-debug.ll
index 166e0f6..a83c689 100644
--- a/test/CodeGen/R600/add-debug.ll
+++ b/test/CodeGen/R600/add-debug.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti -debug
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -debug
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -debug
 ; REQUIRES: asserts
 
 ; Check that SelectionDAGDumper does not crash on int_SI_if.
diff --git a/test/CodeGen/R600/add.ll b/test/CodeGen/R600/add.ll
index 767a642..3a8b97c 100644
--- a/test/CodeGen/R600/add.ll
+++ b/test/CodeGen/R600/add.ll
@@ -1,12 +1,13 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 ;FUNC-LABEL: {{^}}test1:
-;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}}
-;SI-CHECK-NOT: [[REG]]
-;SI-CHECK: buffer_store_dword [[REG]],
+;SI: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}}
+;SI-NOT: [[REG]]
+;SI: buffer_store_dword [[REG]],
 define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
   %a = load i32 addrspace(1)* %in
@@ -17,11 +18,11 @@ define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 }
 
 ;FUNC-LABEL: {{^}}test2:
-;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -33,15 +34,15 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
 }
 
 ;FUNC-LABEL: {{^}}test4:
-;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -53,22 +54,22 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 }
 
 ; FUNC-LABEL: {{^}}test8:
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
 define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
 entry:
   %0 = add <8 x i32> %a, %b
@@ -77,38 +78,38 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}test16:
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; EG-CHECK: ADD_INT
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
-; SI-CHECK: s_add_i32
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
 define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
 entry:
   %0 = add <16 x i32> %a, %b
@@ -117,8 +118,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}add64:
-; SI-CHECK: s_add_u32
-; SI-CHECK: s_addc_u32
+; SI: s_add_u32
+; SI: s_addc_u32
 define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = add i64 %a, %b
@@ -132,7 +133,7 @@ entry:
 ; to a VGPR before doing the add.
 
 ; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
-; SI-CHECK-NOT: v_addc_u32_e32 s
+; SI-NOT: v_addc_u32_e32 s
 define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64 addrspace(1)* %in
@@ -143,8 +144,8 @@ entry:
 
 ; Test i64 add inside a branch.
 ; FUNC-LABEL: {{^}}add64_in_branch:
-; SI-CHECK: s_add_u32
-; SI-CHECK: s_addc_u32
+; SI: s_add_u32
+; SI: s_addc_u32
 define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll
index 47ecf6d..1769409 100644
--- a/test/CodeGen/R600/add_i64.ll
+++ b/test/CodeGen/R600/add_i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
 
 declare i32 @llvm.r600.read.tidig.x() readnone
diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll
index d04afe6..74ea9f0 100644
--- a/test/CodeGen/R600/address-space.ll
+++ b/test/CodeGen/R600/address-space.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
 
 ; Test that codegenprepare understands address space sizes
 
@@ -9,9 +10,10 @@
 
 ; CHECK-LABEL: {{^}}do_as_ptr_calcs:
 ; CHECK: s_load_dword [[SREG1:s[0-9]+]],
+; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]]
 ; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
 ; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12
-; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:20
+; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20
 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
 entry:
   %x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
diff --git a/test/CodeGen/R600/and.ll b/test/CodeGen/R600/and.ll
index 9a76fce..bb7cba3 100644
--- a/test/CodeGen/R600/and.ll
+++ b/test/CodeGen/R600/and.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}test2:
 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -63,8 +64,8 @@ define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addr
   ret void
 }
 
-; FUNC-LABEL: {{^}}v_and_constant_i32:
-; SI: v_and_b32
+; FUNC-LABEL: {{^}}v_and_constant_i32
+; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
 define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, 1234567
@@ -72,7 +73,25 @@ define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr)
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_and_i64:
+; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
+; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
+define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+  %a = load i32 addrspace(1)* %aptr, align 4
+  %and = and i32 %a, 64
+  store i32 %and, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
+; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
+define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+  %a = load i32 addrspace(1)* %aptr, align 4
+  %and = and i32 %a, -16
+  store i32 %and, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_i64
 ; SI: s_and_b64
 define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %and = and i64 %a, %b
@@ -89,8 +108,8 @@ define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_and_constant_i64:
-; SI: s_and_b64
+; FUNC-LABEL: {{^}}s_and_constant_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %and = and i64 %a, 281474976710655
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -149,10 +168,129 @@ define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %apt
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_and_inline_imm_i64:
+; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64
 ; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64
-define void @s_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 64
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1
+define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 1
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
+define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 4607182418800017408
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
+define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 13830554455654793216
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
+define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 4602678819172646912
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
+define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 13826050856027422720
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0
+define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 4611686018427387904
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -2.0
+define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 13835058055282163712
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
+define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 4616189618054758400
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
+define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 13839561654909534208
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+
+; Test with the 64-bit integer bitpattern for a 32-bit float in the
+; low 32-bits, which is not a valid 64-bit inline immmediate.
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 1082130432
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FIXME: Copy of -1 register
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
+; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}
+define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, -1065353216
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; Shift into upper 32-bits
+; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 4647714815446351872
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 13871086852301127680
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/anyext.ll b/test/CodeGen/R600/anyext.ll
index 23fdcbb..48d8f31 100644
--- a/test/CodeGen/R600/anyext.ll
+++ b/test/CodeGen/R600/anyext.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; CHECK-LABEL: {{^}}anyext_i1_i32:
 ; CHECK: v_cndmask_b32_e64
diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll
index 84d3540..33a8aee 100644
--- a/test/CodeGen/R600/array-ptr-calc-i32.ll
+++ b/test/CodeGen/R600/array-ptr-calc-i32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
 
 declare i32 @llvm.SI.tid() nounwind readnone
 declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
diff --git a/test/CodeGen/R600/array-ptr-calc-i64.ll b/test/CodeGen/R600/array-ptr-calc-i64.ll
index 75f6394..32e657d 100644
--- a/test/CodeGen/R600/array-ptr-calc-i64.ll
+++ b/test/CodeGen/R600/array-ptr-calc-i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.SI.tid() readnone
 
diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll
index 223f4d3..6c76ad7 100644
--- a/test/CodeGen/R600/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/R600/atomic_cmp_swap_local.ll
@@ -1,14 +1,17 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC  %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; SI: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 [M0]
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
+; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
@@ -18,17 +21,18 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI: s_mov_b64  s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
-; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
-; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
-; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; SI: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 [M0]
-; SI: buffer_store_dwordx2 [[RESULT]],
-; SI: s_endpgm
+; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
+; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
+; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
+; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
@@ -39,8 +43,8 @@ define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrs
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
 ; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 [M0]
-; SI: s_endpgm
+; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
@@ -52,13 +56,15 @@ define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i3
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
-; SI-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; SI: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 [M0]
-; SI: s_endpgm
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
+; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
+; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
@@ -67,16 +73,17 @@ define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %sw
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_mov_b64  s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
-; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
-; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
-; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; SI: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 [M0]
-; SI: s_endpgm
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
+; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
+; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
+; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll
index f0eff21..5fe05f2 100644
--- a/test/CodeGen/R600/atomic_load_add.ll
+++ b/test/CodeGen/R600/atomic_load_add.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}atomic_add_local:
diff --git a/test/CodeGen/R600/atomic_load_sub.ll b/test/CodeGen/R600/atomic_load_sub.ll
index 61ff296..4072283 100644
--- a/test/CodeGen/R600/atomic_load_sub.ll
+++ b/test/CodeGen/R600/atomic_load_sub.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}atomic_sub_local:
 ; R600: LDS_SUB *
diff --git a/test/CodeGen/R600/basic-branch.ll b/test/CodeGen/R600/basic-branch.ll
index 073ab79..abdc4af 100644
--- a/test/CodeGen/R600/basic-branch.ll
+++ b/test/CodeGen/R600/basic-branch.ll
@@ -1,5 +1,6 @@
 ; XFAIL: *
-; RUN: llc -O0 -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}test_branch(
 define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
diff --git a/test/CodeGen/R600/basic-loop.ll b/test/CodeGen/R600/basic-loop.ll
index 3cd609135..f0263ca 100644
--- a/test/CodeGen/R600/basic-loop.ll
+++ b/test/CodeGen/R600/basic-loop.ll
@@ -1,5 +1,5 @@
-; XFAIL: *
-; RUN: llc -O0 -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s
+; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s
+; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}test_loop:
 define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
index 2a0bb37..0334934 100644
--- a/test/CodeGen/R600/bfi_int.ll
+++ b/test/CodeGen/R600/bfi_int.ll
@@ -1,13 +1,14 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
 
 ; BFI_INT Definition pattern from ISA docs
 ; (y & x) | (z & ~x)
 ;
-; R600-CHECK: {{^}}bfi_def:
-; R600-CHECK: BFI_INT
-; SI-CHECK:   @bfi_def
-; SI-CHECK:   v_bfi_b32
+; R600: {{^}}bfi_def:
+; R600: BFI_INT
+; SI:   @bfi_def
+; SI:   v_bfi_b32
 define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %x, -1
@@ -20,10 +21,10 @@ entry:
 
 ; SHA-256 Ch function
 ; z ^ (x & (y ^ z))
-; R600-CHECK: {{^}}bfi_sha256_ch:
-; R600-CHECK: BFI_INT
-; SI-CHECK:   @bfi_sha256_ch
-; SI-CHECK:   v_bfi_b32
+; R600: {{^}}bfi_sha256_ch:
+; R600: BFI_INT
+; SI:   @bfi_sha256_ch
+; SI:   v_bfi_b32
 define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %y, %z
@@ -35,11 +36,11 @@ entry:
 
 ; SHA-256 Ma function
 ; ((x & z) | (y & (x | z)))
-; R600-CHECK: {{^}}bfi_sha256_ma:
-; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
-; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
-; SI-CHECK: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
-; SI-CHECK: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
+; R600: {{^}}bfi_sha256_ma:
+; R600: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
+; R600: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
+; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
+; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
 
 define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
diff --git a/test/CodeGen/R600/bitcast.ll b/test/CodeGen/R600/bitcast.ll
index 725d5ba..1ba64af 100644
--- a/test/CodeGen/R600/bitcast.ll
+++ b/test/CodeGen/R600/bitcast.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; This test just checks that the compiler doesn't crash.
 
diff --git a/test/CodeGen/R600/bswap.ll b/test/CodeGen/R600/bswap.ll
index 1c5a0c6..e93543d 100644
--- a/test/CodeGen/R600/bswap.ll
+++ b/test/CodeGen/R600/bswap.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.bswap.i32(i32) nounwind readnone
 declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone
diff --git a/test/CodeGen/R600/build_vector.ll b/test/CodeGen/R600/build_vector.ll
index 9137eee..65eacf5 100644
--- a/test/CodeGen/R600/build_vector.ll
+++ b/test/CodeGen/R600/build_vector.ll
@@ -1,32 +1,33 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
 
-; R600-CHECK: {{^}}build_vector2:
-; R600-CHECK: MOV
-; R600-CHECK: MOV
-; R600-CHECK-NOT: MOV
-; SI-CHECK: {{^}}build_vector2:
-; SI-CHECK-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
-; SI-CHECK-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
-; SI-CHECK: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}}
+; R600: {{^}}build_vector2:
+; R600: MOV
+; R600: MOV
+; R600-NOT: MOV
+; SI: {{^}}build_vector2:
+; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
+; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
+; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}}
 define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
 entry:
   store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK: {{^}}build_vector4:
-; R600-CHECK: MOV
-; R600-CHECK: MOV
-; R600-CHECK: MOV
-; R600-CHECK: MOV
-; R600-CHECK-NOT: MOV
-; SI-CHECK: {{^}}build_vector4:
-; SI-CHECK-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
-; SI-CHECK-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
-; SI-CHECK-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
-; SI-CHECK-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
-; SI-CHECK: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}}
+; R600: {{^}}build_vector4:
+; R600: MOV
+; R600: MOV
+; R600: MOV
+; R600: MOV
+; R600-NOT: MOV
+; SI: {{^}}build_vector4:
+; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
+; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
+; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
+; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
+; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}}
 define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
 entry:
   store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
diff --git a/test/CodeGen/R600/call.ll b/test/CodeGen/R600/call.ll
index 1448f04..6de51f1 100644
--- a/test/CodeGen/R600/call.ll
+++ b/test/CodeGen/R600/call.ll
@@ -1,4 +1,5 @@
-; RUN: not llc -march=r600 -mcpu=SI -verify-machineinstrs< %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s 2>&1 | FileCheck %s
 ; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s
 
 ; CHECK: error: unsupported call to function external_function in test_call_external
diff --git a/test/CodeGen/R600/call_fs.ll b/test/CodeGen/R600/call_fs.ll
index 7df2240..db2cb6e 100644
--- a/test/CodeGen/R600/call_fs.ll
+++ b/test/CodeGen/R600/call_fs.ll
@@ -1,13 +1,13 @@
 
-; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600 %s
 
-; EG-CHECK: {{^}}call_fs:
-; EG-CHECK: .long 257
-; EG-CHECK: CALL_FS  ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84]
-; R600-CHECK: {{^}}call_fs:
-; R600-CHECK: .long 257
-; R600-CHECK:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89]
+; EG: {{^}}call_fs:
+; EG: .long 257
+; EG: CALL_FS  ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84]
+; R600: {{^}}call_fs:
+; R600: .long 257
+; R600:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89]
 
 
 define void @call_fs() #0 {
diff --git a/test/CodeGen/R600/cf_end.ll b/test/CodeGen/R600/cf_end.ll
index 138004d..c74ee22 100644
--- a/test/CodeGen/R600/cf_end.ll
+++ b/test/CodeGen/R600/cf_end.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood --show-mc-encoding | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=caicos --show-mc-encoding | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=cayman --show-mc-encoding | FileCheck --check-prefix=CM-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood --show-mc-encoding | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=r600 -mcpu=caicos --show-mc-encoding | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=r600 -mcpu=cayman --show-mc-encoding | FileCheck --check-prefix=CM %s
 
-; EG-CHECK: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80]
-; CM-CHECK: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88]
+; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80]
+; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88]
 define void @eop() {
   ret void
 }
diff --git a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
index b42b904..e16a397 100644
--- a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -codegenprepare -S -o - %s | FileCheck --check-prefix=OPT %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-LLC %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-LLC %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 target triple = "r600--"
diff --git a/test/CodeGen/R600/commute_modifiers.ll b/test/CodeGen/R600/commute_modifiers.ll
index 30c8067..6fddb6d 100644
--- a/test/CodeGen/R600/commute_modifiers.ll
+++ b/test/CodeGen/R600/commute_modifiers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #1
 declare float @llvm.fabs.f32(float) #1
@@ -65,7 +65,7 @@ define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(
 
 ; FUNC-LABEL: @commute_add_fabs_f32
 ; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -82,7 +82,7 @@ define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)*
 
 ; FUNC-LABEL: @commute_mul_fneg_f32
 ; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -99,7 +99,7 @@ define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
 
 ; FUNC-LABEL: @commute_mul_fabs_fneg_f32
 ; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -118,7 +118,7 @@ define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace
 ; There's no reason to commute this.
 ; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32
 ; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -136,7 +136,7 @@ define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrs
 
 ; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32
 ; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -158,7 +158,7 @@ define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float
 
 ; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32
 ; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]|
 ; SI: buffer_store_dword [[RESULT]]
 define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
diff --git a/test/CodeGen/R600/concat_vectors.ll b/test/CodeGen/R600/concat_vectors.ll
index 19992eb..6b3fae3 100644
--- a/test/CodeGen/R600/concat_vectors.ll
+++ b/test/CodeGen/R600/concat_vectors.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}test_concat_v1i32:
 ; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF
diff --git a/test/CodeGen/R600/copy-illegal-type.ll b/test/CodeGen/R600/copy-illegal-type.ll
index 66ea88e..56c43d2 100644
--- a/test/CodeGen/R600/copy-illegal-type.ll
+++ b/test/CodeGen/R600/copy-illegal-type.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}test_copy_v4i8:
 ; SI: buffer_load_dword [[REG:v[0-9]+]]
diff --git a/test/CodeGen/R600/copy-to-reg.ll b/test/CodeGen/R600/copy-to-reg.ll
index f90ee78..9c1de73 100644
--- a/test/CodeGen/R600/copy-to-reg.ll
+++ b/test/CodeGen/R600/copy-to-reg.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s
 
 ; Test that CopyToReg instructions don't have non-register operands prior
 ; to being emitted.
diff --git a/test/CodeGen/R600/ctlz_zero_undef.ll b/test/CodeGen/R600/ctlz_zero_undef.ll
index f699127..1a4317b 100644
--- a/test/CodeGen/R600/ctlz_zero_undef.ll
+++ b/test/CodeGen/R600/ctlz_zero_undef.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll
index 5cfdaef..6f7d92b 100644
--- a/test/CodeGen/R600/ctpop.ll
+++ b/test/CodeGen/R600/ctpop.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
@@ -8,11 +9,11 @@ declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
 declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
 
 ; FUNC-LABEL: {{^}}s_ctpop_i32:
-; SI: s_load_dword [[SVAL:s[0-9]+]],
-; SI: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
-; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: buffer_store_dword [[VRESULT]],
-; SI: s_endpgm
+; GCN: s_load_dword [[SVAL:s[0-9]+]],
+; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[VRESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
@@ -23,11 +24,10 @@ define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
 
 ; XXX - Why 0 in register?
 ; FUNC-LABEL: {{^}}v_ctpop_i32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
-; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VZERO]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
@@ -38,13 +38,13 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noali
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
-; SI: buffer_load_dword [[VAL0:v[0-9]+]],
-; SI: buffer_load_dword [[VAL1:v[0-9]+]],
-; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
-; SI: v_bcnt_u32_b32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
-; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: buffer_load_dword [[VAL1:v[0-9]+]],
+; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -59,11 +59,11 @@ define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
-; SI: buffer_load_dword [[VAL0:v[0-9]+]],
-; SI-NEXT: s_waitcnt
-; SI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
-; SI-NEXT: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
+; GCN-NEXT: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
   %val0 = load i32 addrspace(1)* %in0, align 4
   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
@@ -73,9 +73,9 @@ define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v2i32:
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -87,11 +87,11 @@ define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrs
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v4i32:
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -105,15 +105,15 @@ define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrs
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v8i32:
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -131,23 +131,23 @@ define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrs
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v16i32:
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: v_bcnt_u32_b32_e32
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -173,10 +173,10 @@ define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> ad
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
@@ -188,10 +188,10 @@ define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
@@ -203,11 +203,12 @@ define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out,
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
@@ -217,11 +218,11 @@ define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspa
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
-; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
-; SI-DAG: s_load_dword [[VAR:s[0-9]+]],
-; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
@@ -233,11 +234,11 @@ define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
-; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
-; SI-DAG: s_load_dword [[VAR:s[0-9]+]],
-; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
@@ -249,11 +250,12 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
-; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
-; SI-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:0x10
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
+; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
@@ -271,10 +273,11 @@ define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrsp
 
 ; FUNC-LABEL: {{^}}ctpop_i32_in_br:
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
-; SI: s_bcnt1_i32_b32  [[SRESULT:s[0-9]+]], [[VAL]]
-; SI: v_mov_b32_e32 [[RESULT]], [[SRESULT]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
+; GCN: s_bcnt1_i32_b32  [[SRESULT:s[0-9]+]], [[VAL]]
+; GCN: v_mov_b32_e32 [[RESULT]], [[SRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 ; EG: BCNT_INT
 define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
 entry:
diff --git a/test/CodeGen/R600/ctpop64.ll b/test/CodeGen/R600/ctpop64.ll
index 2efac8f..8bcd818 100644
--- a/test/CodeGen/R600/ctpop64.ll
+++ b/test/CodeGen/R600/ctpop64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
 declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
@@ -8,10 +9,11 @@ declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
 
 ; FUNC-LABEL: {{^}}s_ctpop_i64:
 ; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
-; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: buffer_store_dword [[VRESULT]],
-; SI: s_endpgm
+; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[VRESULT]],
+; GCN: s_endpgm
 define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
@@ -20,12 +22,12 @@ define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i64:
-; SI: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
-; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
-; SI: v_bcnt_u32_b32_e32 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], [[VZERO]]
+; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %val = load i64 addrspace(1)* %in, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
@@ -35,9 +37,9 @@ define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noali
 }
 
 ; FUNC-LABEL: {{^}}s_ctpop_v2i64:
-; SI: s_bcnt1_i32_b64
-; SI: s_bcnt1_i32_b64
-; SI: s_endpgm
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_endpgm
 define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@@ -46,11 +48,11 @@ define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val)
 }
 
 ; FUNC-LABEL: {{^}}s_ctpop_v4i64:
-; SI: s_bcnt1_i32_b64
-; SI: s_bcnt1_i32_b64
-; SI: s_bcnt1_i32_b64
-; SI: s_bcnt1_i32_b64
-; SI: s_endpgm
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_endpgm
 define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@@ -59,11 +61,11 @@ define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val)
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v2i64:
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: s_endpgm
 define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <2 x i64> addrspace(1)* %in, align 16
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
@@ -73,15 +75,15 @@ define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrs
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v4i64:
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: s_endpgm
 define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <4 x i64> addrspace(1)* %in, align 32
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
@@ -95,11 +97,12 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs
 
 ; FUNC-LABEL: {{^}}ctpop_i64_in_br:
 ; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
-; SI: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
-; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
-; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
-; SI: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
-; SI: s_endpgm
+; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
+; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
+; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
+; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
+; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
+; GCN: s_endpgm
 define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
 entry:
   %tmp0 = icmp eq i32 %cond, 0
diff --git a/test/CodeGen/R600/cttz_zero_undef.ll b/test/CodeGen/R600/cttz_zero_undef.ll
index c4b1463..d9d284c 100644
--- a/test/CodeGen/R600/cttz_zero_undef.ll
+++ b/test/CodeGen/R600/cttz_zero_undef.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
diff --git a/test/CodeGen/R600/cvt_f32_ubyte.ll b/test/CodeGen/R600/cvt_f32_ubyte.ll
index 0d1db19..4d4bf93 100644
--- a/test/CodeGen/R600/cvt_f32_ubyte.ll
+++ b/test/CodeGen/R600/cvt_f32_ubyte.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}load_i8_to_f32:
 ; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]],
@@ -22,7 +23,7 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n
 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <2 x i8> addrspace(1)* %in, align 1
+  %load = load <2 x i8> addrspace(1)* %in, align 2
   %cvt = uitofp <2 x i8> %load to <2 x float>
   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
   ret void
@@ -36,18 +37,14 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
 ; SI-DAG: v_cvt_f32_ubyte0_e32
 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <3 x i8> addrspace(1)* %in, align 1
+  %load = load <3 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <3 x i8> %load to <3 x float>
   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; SI-LABEL: {{^}}load_v4i8_to_v4f32:
-; We can't use buffer_load_dword here, because the load is byte aligned, and
-; buffer_load_dword requires dword alignment.
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: v_or_b32_e32 [[LOADREG:v[0-9]+]]
+; SI: buffer_load_dword [[LOADREG:v[0-9]+]]
 ; SI-NOT: bfe
 ; SI-NOT: lshr
 ; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
@@ -56,6 +53,30 @@ define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8>
 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
 ; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+  %load = load <4 x i8> addrspace(1)* %in, align 4
+  %cvt = uitofp <4 x i8> %load to <4 x float>
+  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; This should not be adding instructions to shift into the correct
+; position in the word for the component.
+
+; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
+; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
+; SI-NOT: v_lshlrev_b32
+; SI-NOT: v_or_b32
+
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
+
+; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <4 x i8> %load to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -125,7 +146,7 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <8 x i8> addrspace(1)* %in, align 1
+  %load = load <8 x i8> addrspace(1)* %in, align 8
   %cvt = uitofp <8 x i8> %load to <8 x float>
   store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
   ret void
diff --git a/test/CodeGen/R600/cvt_flr_i32_f32.ll b/test/CodeGen/R600/cvt_flr_i32_f32.ll
new file mode 100644
index 0000000..2dd3a9f
--- /dev/null
+++ b/test/CodeGen/R600/cvt_flr_i32_f32.ll
@@ -0,0 +1,86 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.floor.f32(float) #1
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_0:
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NOT: add
+; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+  %floor = call float @llvm.floor.f32(float %x) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_1:
+; SI: v_add_f32_e64 [[TMP:v[0-9]+]], 1.0, s{{[0-9]+}}
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]]
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
+  %fadd = fadd float %x, 1.0
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs:
+; SI-NOT: add
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %floor = call float @llvm.floor.f32(float %x.fabs) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fneg:
+; SI-NOT: add
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fneg = fsub float -0.000000e+00, %x
+  %floor = call float @llvm.floor.f32(float %x.fneg) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs_fneg:
+; SI-NOT: add
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
+  %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}no_cvt_flr_i32_f32_0:
+; SI-NOT: v_cvt_flr_i32_f32
+; SI: v_floor_f32
+; SI: v_cvt_u32_f32_e32
+; SI: s_endpgm
+define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+  %floor = call float @llvm.floor.f32(float %x) #1
+  %cvt = fptoui float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/cvt_rpi_i32_f32.ll b/test/CodeGen/R600/cvt_rpi_i32_f32.ll
new file mode 100644
index 0000000..864ac40
--- /dev/null
+++ b/test/CodeGen/R600/cvt_rpi_i32_f32.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.floor.f32(float) #1
+
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32:
+; SI-SAFE-NOT: v_cvt_rpi_i32_f32
+; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
+  %fadd = fadd float %x, 0.5
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs:
+; SI-SAFE-NOT: v_cvt_rpi_i32_f32
+; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %fadd = fadd float %x.fabs, 0.5
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: This doesn't work because it forms fsub 0.5, x
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fneg:
+; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
+; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, s{{[0-9]+}}
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fneg = fsub float -0.000000e+00, %x
+  %fadd = fadd float %x.fneg, 0.5
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: This doesn't work for same reason as above
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs_fneg:
+; SI-SAFE-NOT: v_cvt_rpi_i32_f32
+; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
+
+; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, |s{{[0-9]+}}|
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
+  %fadd = fadd float %x.fabs.fneg, 0.5
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}no_cvt_rpi_i32_f32_0:
+; SI-NOT: v_cvt_rpi_i32_f32
+; SI: v_add_f32
+; SI: v_floor_f32
+; SI: v_cvt_u32_f32
+; SI: s_endpgm
+define void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+  %fadd = fadd float %x, 0.5
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptoui float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/default-fp-mode.ll b/test/CodeGen/R600/default-fp-mode.ll
index 935bf97..da8e914 100644
--- a/test/CodeGen/R600/default-fp-mode.ll
+++ b/test/CodeGen/R600/default-fp-mode.ll
@@ -1,10 +1,17 @@
-; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}test_kernel:
 
diff --git a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
index f334062..41afd50 100644
--- a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
+++ b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
 
 declare i32 @llvm.r600.read.tidig.x() #0
 declare void @llvm.AMDGPU.barrier.local() #1
diff --git a/test/CodeGen/R600/ds_read2.ll b/test/CodeGen/R600/ds_read2.ll
index 6e0c8be..c06b0b1 100644
--- a/test/CodeGen/R600/ds_read2.ll
+++ b/test/CodeGen/R600/ds_read2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
 
 ; FIXME: We don't get cases where the address was an SGPR because we
 ; get a copy to the address register for each one.
diff --git a/test/CodeGen/R600/ds_read2_offset_order.ll b/test/CodeGen/R600/ds_read2_offset_order.ll
new file mode 100644
index 0000000..44306bc
--- /dev/null
+++ b/test/CodeGen/R600/ds_read2_offset_order.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+
+; XFAIL: *
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+
+; SI-LABEL: {{^}}offset_order:
+
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
+; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:4
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:1
+
+define void @offset_order(float addrspace(1)* %out) {
+entry:
+  %ptr0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 0
+  %val0 = load float addrspace(3)* %ptr0
+
+  %ptr1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 256
+  %val1 = load float addrspace(3)* %ptr1
+  %add1 = fadd float %val0, %val1
+
+  %ptr2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 3
+  %val2 = load float addrspace(3)* %ptr2
+  %add2 = fadd float %add1, %val2
+
+  %ptr3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 2
+  %val3 = load float addrspace(3)* %ptr3
+  %add3 = fadd float %add2, %val3
+
+  %ptr4 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 12
+  %val4 = load float addrspace(3)* %ptr4
+  %add4 = fadd float %add3, %val4
+
+  %ptr5 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 14
+  %val5 = load float addrspace(3)* %ptr5
+  %add5 = fadd float %add4, %val5
+
+  %ptr6 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 11
+  %val6 = load float addrspace(3)* %ptr6
+  %add6 = fadd float %add5, %val6
+  store float %add6, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/ds_read2st64.ll b/test/CodeGen/R600/ds_read2st64.ll
index 3e98e59..efd875e 100644
--- a/test/CodeGen/R600/ds_read2st64.ll
+++ b/test/CodeGen/R600/ds_read2st64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
 
 @lds = addrspace(3) global [512 x float] undef, align 4
 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
@@ -65,8 +65,8 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
 
 ; SI-LABEL: @simple_read2st64_f32_over_max_offset
 ; SI-NOT: ds_read2st64_b32
-; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
 ; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
 ; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
 ; SI: s_endpgm
 define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
@@ -197,8 +197,8 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a
 
 ; SI-LABEL: @simple_read2st64_f64_over_max_offset
 ; SI-NOT: ds_read2st64_b64
-; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
 ; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
 ; SI: s_endpgm
 define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
diff --git a/test/CodeGen/R600/ds_write2.ll b/test/CodeGen/R600/ds_write2.ll
index 1807fb5..e2db81a 100644
--- a/test/CodeGen/R600/ds_write2.ll
+++ b/test/CodeGen/R600/ds_write2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
 
 @lds = addrspace(3) global [512 x float] undef, align 4
 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
@@ -7,7 +7,7 @@
 ; SI-LABEL: @simple_write2_one_val_f32
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8
 ; SI: s_endpgm
 define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -23,9 +23,9 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1
 
 ; SI-LABEL: @simple_write2_two_val_f32
 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8 
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -84,7 +84,7 @@ define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float
 ; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
 ; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -105,7 +105,7 @@ define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2
 ; SI-LABEL: @simple_write2_two_val_subreg2_f32
 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -124,7 +124,7 @@ define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x floa
 ; SI-LABEL: @simple_write2_two_val_subreg4_f32
 ; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -142,9 +142,9 @@ define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x floa
 
 ; SI-LABEL: @simple_write2_two_val_max_offset_f32
 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255 [M0]
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255
 ; SI: s_endpgm
 define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -268,7 +268,7 @@ define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float add
 ; SI-LABEL: @simple_write2_one_val_f64
 ; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8
 ; SI: s_endpgm
 define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -285,8 +285,8 @@ define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace
 ; SI-LABEL: @misaligned_simple_write2_one_val_f64
 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:1 [M0]
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 [M0]
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:1
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
 ; SI: s_endpgm
 define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -302,9 +302,9 @@ define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, doubl
 
 ; SI-LABEL: @simple_write2_two_val_f64
 ; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/R600/ds_write2st64.ll b/test/CodeGen/R600/ds_write2st64.ll
index 4cafb7c..0f1c662 100644
--- a/test/CodeGen/R600/ds_write2st64.ll
+++ b/test/CodeGen/R600/ds_write2st64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
 
 
 @lds = addrspace(3) global [512 x float] undef, align 4
@@ -7,7 +7,7 @@
 ; SI-LABEL: @simple_write2st64_one_val_f32_0_1
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:1 [M0]
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:1
 ; SI: s_endpgm
 define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -23,9 +23,9 @@ define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float add
 
 ; SI-LABEL: @simple_write2st64_two_val_f32_2_5
 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 [M0]
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
 ; SI: s_endpgm
 define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -44,9 +44,9 @@ define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float add
 
 ; SI-LABEL: @simple_write2st64_two_val_max_offset_f32
 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255 [M0]
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255
 ; SI: s_endpgm
 define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -64,9 +64,9 @@ define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, fl
 
 ; SI-LABEL: @simple_write2st64_two_val_max_offset_f64
 ; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
 ; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
-; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 [M0]
+; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
 ; SI: s_endpgm
 define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/R600/elf.ll b/test/CodeGen/R600/elf.ll
index 6c521d0..aca3109 100644
--- a/test/CodeGen/R600/elf.ll
+++ b/test/CodeGen/R600/elf.ll
@@ -1,15 +1,21 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG-CHECK %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG %s
 
-; ELF-CHECK: Format: ELF32
-; ELF-CHECK: Name: .AMDGPU.config
-; ELF-CHECK: Type: SHT_PROGBITS
+; ELF: Format: ELF32
+; ELF: Name: .AMDGPU.config
+; ELF: Type: SHT_PROGBITS
 
-; CONFIG-CHECK: .align 256
-; CONFIG-CHECK: test:
-; CONFIG-CHECK: .section .AMDGPU.config
-; CONFIG-CHECK-NEXT: .long   45096
-; CONFIG-CHECK-NEXT: .long   0
+; ELF: Symbol {
+; ELF: Name: test
+; ELF: Binding: Global
+
+; CONFIG: .align 256
+; CONFIG: test:
+; CONFIG: .section .AMDGPU.config
+; CONFIG-NEXT: .long   45096
+; CONFIG-NEXT: .long   0
 define void @test(i32 %p) #0 {
    %i = add i32 %p, 2
    %r = bitcast i32 %i to float
diff --git a/test/CodeGen/R600/elf.r600.ll b/test/CodeGen/R600/elf.r600.ll
index 4436c07..51cd085 100644
--- a/test/CodeGen/R600/elf.r600.ll
+++ b/test/CodeGen/R600/elf.r600.ll
@@ -1,14 +1,14 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=redwood -o - | FileCheck --check-prefix=CONFIG-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -march=r600 -mcpu=redwood -o - | FileCheck --check-prefix=CONFIG %s
 
-; ELF-CHECK: Format: ELF32
-; ELF-CHECK: Name: .AMDGPU.config
+; ELF: Format: ELF32
+; ELF: Name: .AMDGPU.config
 
-; CONFIG-CHECK: .section .AMDGPU.config
-; CONFIG-CHECK-NEXT: .long   166100
-; CONFIG-CHECK-NEXT: .long   2
-; CONFIG-CHECK-NEXT: .long   165900
-; CONFIG-CHECK-NEXT: .long   0
+; CONFIG: .section .AMDGPU.config
+; CONFIG-NEXT: .long   166100
+; CONFIG-NEXT: .long   2
+; CONFIG-NEXT: .long   165900
+; CONFIG-NEXT: .long   0
 define void @test(float addrspace(1)* %out, i32 %p) {
    %i = add i32 %p, 2
    %r = bitcast i32 %i to float
diff --git a/test/CodeGen/R600/empty-function.ll b/test/CodeGen/R600/empty-function.ll
index d4ff803..b5593eb 100644
--- a/test/CodeGen/R600/empty-function.ll
+++ b/test/CodeGen/R600/empty-function.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; Make sure we don't assert on empty functions
 
diff --git a/test/CodeGen/R600/endcf-loop-header.ll b/test/CodeGen/R600/endcf-loop-header.ll
new file mode 100644
index 0000000..e3c5b3c
--- /dev/null
+++ b/test/CodeGen/R600/endcf-loop-header.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; This tests that the llvm.SI.end.cf intrinsic is not inserted into the
+; loop block.  This intrinsic will be lowered to s_or_b64 by the code
+; generator.
+
+; CHECK-LABEL: {{^}}test:
+
+; This is was lowered from the llvm.SI.end.cf intrinsic:
+; CHECK: s_or_b64 exec, exec
+
+; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}}
+; CHECK-NOT: s_or_b64 exec, exec
+; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
+define void @test(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %if, label %loop
+
+if:
+  store i32 0, i32 addrspace(1)* %out
+  br label %loop
+
+loop:
+  %tmp1 = phi i32 [0, %entry], [0, %if], [%inc, %loop]
+  %inc = add i32 %tmp1, %cond
+  %tmp2 = icmp ugt i32 %inc, 10
+  br i1 %tmp2, label %done, label %loop
+
+done:
+  %tmp3 = getelementptr i32 addrspace(1)* %out, i64 1
+  store i32 %inc, i32 addrspace(1)* %tmp3
+  ret void
+}
diff --git a/test/CodeGen/R600/extload-private.ll b/test/CodeGen/R600/extload-private.ll
new file mode 100644
index 0000000..fec8682
--- /dev/null
+++ b/test/CodeGen/R600/extload-private.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}load_i8_sext_private:
+; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+define void @load_i8_sext_private(i32 addrspace(1)* %out) {
+entry:
+  %tmp0 = alloca i8
+  %tmp1 = load i8* %tmp0
+  %tmp2 = sext i8 %tmp1 to i32
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}load_i8_zext_private:
+; SI: buffer_load_ubyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+define void @load_i8_zext_private(i32 addrspace(1)* %out) {
+entry:
+  %tmp0 = alloca i8
+  %tmp1 = load i8* %tmp0
+  %tmp2 = zext i8 %tmp1 to i32
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}load_i16_sext_private:
+; SI: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+define void @load_i16_sext_private(i32 addrspace(1)* %out) {
+entry:
+  %tmp0 = alloca i16
+  %tmp1 = load i16* %tmp0
+  %tmp2 = sext i16 %tmp1 to i32
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}load_i16_zext_private:
+; SI: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+define void @load_i16_zext_private(i32 addrspace(1)* %out) {
+entry:
+  %tmp0 = alloca i16
+  %tmp1 = load i16* %tmp0
+  %tmp2 = zext i16 %tmp1 to i32
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
index 5bda8f8..77e5dc3 100644
--- a/test/CodeGen/R600/extload.ll
+++ b/test/CodeGen/R600/extload.ll
@@ -1,9 +1,11 @@
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}anyext_load_i8:
-; EG: AND_INT
-; EG: 255
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
+; EG: VTX_READ_32 [[VAL]]
+
 define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
   %load = load i32 addrspace(1)* %cast, align 1
@@ -14,10 +16,9 @@ define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspac
 }
 
 ; FUNC-LABEL: {{^}}anyext_load_i16:
-; EG: AND_INT
-; EG: AND_INT
-; EG-DAG: 65535
-; EG-DAG: -65536
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
+; EG: VTX_READ_32 [[VAL]]
+
 define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
   %load = load i32 addrspace(1)* %cast, align 1
@@ -28,8 +29,8 @@ define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrs
 }
 
 ; FUNC-LABEL: {{^}}anyext_load_lds_i8:
-; EG: AND_INT
-; EG: 255
+; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
+; EG: LDS_WRITE * [[VAL]]
 define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
   %load = load i32 addrspace(3)* %cast, align 1
@@ -40,10 +41,8 @@ define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addr
 }
 
 ; FUNC-LABEL: {{^}}anyext_load_lds_i16:
-; EG: AND_INT
-; EG: AND_INT
-; EG-DAG: 65535
-; EG-DAG: -65536
+; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
+; EG: LDS_WRITE * [[VAL]]
 define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
   %load = load i32 addrspace(3)* %cast, align 1
@@ -52,72 +51,3 @@ define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 a
   store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1
   ret void
 }
-
-; FUNC-LABEL: {{^}}sextload_global_i8_to_i64:
-; SI: buffer_load_sbyte [[LOAD:v[0-9]+]],
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: buffer_store_dwordx2
-define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
-  %a = load i8 addrspace(1)* %in, align 8
-  %ext = sext i8 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out, align 8
-  ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i16_to_i64:
-; SI: buffer_load_sshort [[LOAD:v[0-9]+]],
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: buffer_store_dwordx2
-define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
-  %a = load i16 addrspace(1)* %in, align 8
-  %ext = sext i16 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out, align 8
-  ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i32_to_i64:
-; SI: buffer_load_dword [[LOAD:v[0-9]+]],
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: buffer_store_dwordx2
-define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %a = load i32 addrspace(1)* %in, align 8
-  %ext = sext i32 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out, align 8
-  ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
-; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
-; SI-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],
-; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: buffer_store_dwordx2
-define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
-  %a = load i8 addrspace(1)* %in, align 8
-  %ext = zext i8 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out, align 8
-  ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
-; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
-; SI-DAG: buffer_load_ushort [[LOAD:v[0-9]+]],
-; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: buffer_store_dwordx2
-define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
-  %a = load i16 addrspace(1)* %in, align 8
-  %ext = zext i16 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out, align 8
-  ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
-; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
-; SI-DAG: buffer_load_dword [[LOAD:v[0-9]+]],
-; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: buffer_store_dwordx2
-define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %a = load i32 addrspace(1)* %in, align 8
-  %ext = zext i32 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out, align 8
-  ret void
-}
diff --git a/test/CodeGen/R600/extract_vector_elt_i16.ll b/test/CodeGen/R600/extract_vector_elt_i16.ll
index efdc1c8..0774a9a 100644
--- a/test/CodeGen/R600/extract_vector_elt_i16.ll
+++ b/test/CodeGen/R600/extract_vector_elt_i16.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}extract_vector_elt_v2i16:
 ; SI: buffer_load_ushort
diff --git a/test/CodeGen/R600/fabs.f64.ll b/test/CodeGen/R600/fabs.f64.ll
index d2ba320..d87c082 100644
--- a/test/CodeGen/R600/fabs.f64.ll
+++ b/test/CodeGen/R600/fabs.f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
index 06cc97f..419a73d 100644
--- a/test/CodeGen/R600/fabs.ll
+++ b/test/CodeGen/R600/fabs.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 
@@ -10,7 +11,7 @@
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; SI: v_and_b32
+; GCN: v_and_b32
 
 define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
@@ -23,7 +24,7 @@ define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; SI: v_and_b32
+; GCN: v_and_b32
 
 define void @fabs_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
@@ -35,7 +36,7 @@ define void @fabs_free(float addrspace(1)* %out, i32 %in) {
 ; FUNC-LABEL: {{^}}fabs_f32:
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; SI: v_and_b32
+; GCN: v_and_b32
 define void @fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   store float %fabs, float addrspace(1)* %out
@@ -46,8 +47,8 @@ define void @fabs_f32(float addrspace(1)* %out, float %in) {
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; SI: v_and_b32
-; SI: v_and_b32
+; GCN: v_and_b32
+; GCN: v_and_b32
 define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   store <2 x float> %fabs, <2 x float> addrspace(1)* %out
@@ -60,20 +61,21 @@ define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; SI: v_and_b32
-; SI: v_and_b32
-; SI: v_and_b32
-; SI: v_and_b32
+; GCN: v_and_b32
+; GCN: v_and_b32
+; GCN: v_and_b32
+; GCN: v_and_b32
 define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   store <4 x float> %fabs, <4 x float> addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: {{^}}fabs_fn_fold:
+; GCN-LABEL: {{^}}fabs_fn_fold:
 ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
-; SI-NOT: and
-; SI: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
+; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
+; GCN-NOT: and
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
 define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @fabs(float %in0)
   %fmul = fmul float %fabs, %in1
@@ -81,10 +83,11 @@ define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
   ret void
 }
 
-; SI-LABEL: {{^}}fabs_fold:
+; GCN-LABEL: {{^}}fabs_fold:
 ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
-; SI-NOT: and
-; SI: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
+; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
+; GCN-NOT: and
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
 define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @llvm.fabs.f32(float %in0)
   %fmul = fmul float %fabs, %in1
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index 774dd0b..365af9b 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
 
 ; FUNC-LABEL: {{^}}fadd_f32:
 ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
diff --git a/test/CodeGen/R600/fadd64.ll b/test/CodeGen/R600/fadd64.ll
index 3ca8500..f1f6fef 100644
--- a/test/CodeGen/R600/fadd64.ll
+++ b/test/CodeGen/R600/fadd64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; CHECK: {{^}}fadd_f64:
 ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
diff --git a/test/CodeGen/R600/fceil.ll b/test/CodeGen/R600/fceil.ll
index 56dc796..f23e891 100644
--- a/test/CodeGen/R600/fceil.ll
+++ b/test/CodeGen/R600/fceil.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.ceil.f32(float) nounwind readnone
diff --git a/test/CodeGen/R600/fceil64.ll b/test/CodeGen/R600/fceil64.ll
index 029f41d..e3244fa 100644
--- a/test/CodeGen/R600/fceil64.ll
+++ b/test/CodeGen/R600/fceil64.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
 declare double @llvm.ceil.f64(double) nounwind readnone
 declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
@@ -11,23 +12,24 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 ; FUNC-LABEL: {{^}}fceil_f64:
 ; CI: v_ceil_f64_e32
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 ; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
 ; SI: s_lshr_b64
 ; SI: s_not_b64
 ; SI: s_and_b64
-; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: cmp_lt_i32
+; SI: cmp_lt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
 ; SI: cmp_gt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
-; SI: cmp_gt_f64
-; SI: cndmask_b32
-; SI: cmp_ne_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
+; SI-DAG: v_cmp_gt_f64
+; SI-DAG: v_cmp_lg_f64
+; SI: s_and_b64
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
 ; SI: v_add_f64
+; SI: s_endpgm
 define void @fceil_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.ceil.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
diff --git a/test/CodeGen/R600/fcmp64.ll b/test/CodeGen/R600/fcmp64.ll
index dc24443..9dc8b50 100644
--- a/test/CodeGen/R600/fcmp64.ll
+++ b/test/CodeGen/R600/fcmp64.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; CHECK-LABEL: {{^}}flt_f64:
-; CHECK: v_cmp_lt_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
@@ -13,7 +14,7 @@ define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
 }
 
 ; CHECK-LABEL: {{^}}fle_f64:
-; CHECK: v_cmp_le_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
@@ -25,7 +26,7 @@ define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
 }
 
 ; CHECK-LABEL: {{^}}fgt_f64:
-; CHECK: v_cmp_gt_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
@@ -37,7 +38,7 @@ define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
 }
 
 ; CHECK-LABEL: {{^}}fge_f64:
-; CHECK: v_cmp_ge_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
@@ -61,7 +62,7 @@ define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 }
 
 ; CHECK-LABEL: {{^}}feq_f64:
-; CHECK: v_cmp_eq_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
diff --git a/test/CodeGen/R600/fconst64.ll b/test/CodeGen/R600/fconst64.ll
index 097c89f..28e0c90 100644
--- a/test/CodeGen/R600/fconst64.ll
+++ b/test/CodeGen/R600/fconst64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; CHECK: {{^}}fconst_f64:
 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000
diff --git a/test/CodeGen/R600/fcopysign.f32.ll b/test/CodeGen/R600/fcopysign.f32.ll
index 897830e..b719d5a 100644
--- a/test/CodeGen/R600/fcopysign.f32.ll
+++ b/test/CodeGen/R600/fcopysign.f32.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
@@ -10,12 +11,14 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read
 ; FUNC-LABEL: {{^}}test_copysign_f32:
 ; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb
 ; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc
-; SI-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
-; SI-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
-; SI-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
-; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c
+; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30
+; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
+; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
+; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
+; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BFI_INT
 define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
@@ -25,7 +28,7 @@ define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign
 }
 
 ; FUNC-LABEL: {{^}}test_copysign_v2f32:
-; SI: s_endpgm
+; GCN: s_endpgm
 
 ; EG: BFI_INT
 ; EG: BFI_INT
@@ -36,7 +39,7 @@ define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %ma
 }
 
 ; FUNC-LABEL: {{^}}test_copysign_v4f32:
-; SI: s_endpgm
+; GCN: s_endpgm
 
 ; EG: BFI_INT
 ; EG: BFI_INT
diff --git a/test/CodeGen/R600/fcopysign.f64.ll b/test/CodeGen/R600/fcopysign.f64.ll
index 90f0ce3..3d8c559 100644
--- a/test/CodeGen/R600/fcopysign.f64.ll
+++ b/test/CodeGen/R600/fcopysign.f64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 
 declare double @llvm.copysign.f64(double, double) nounwind readnone
 declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone
@@ -7,13 +8,15 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r
 ; FUNC-LABEL: {{^}}test_copysign_f64:
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
-; SI-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
-; SI-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
-; SI: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
-; SI: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
-; SI: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
-; SI: s_endpgm
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
+; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
+; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
+; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
+; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
+; GCN: s_endpgm
 define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   store double %result, double addrspace(1)* %out, align 8
@@ -21,7 +24,7 @@ define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %s
 }
 
 ; FUNC-LABEL: {{^}}test_copysign_v2f64:
-; SI: s_endpgm
+; GCN: s_endpgm
 define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
   %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
   store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8
@@ -29,7 +32,7 @@ define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %
 }
 
 ; FUNC-LABEL: {{^}}test_copysign_v4f64:
-; SI: s_endpgm
+; GCN: s_endpgm
 define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
   %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
   store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/fdiv.f64.ll b/test/CodeGen/R600/fdiv.f64.ll
new file mode 100644
index 0000000..6367f32
--- /dev/null
+++ b/test/CodeGen/R600/fdiv.f64.ll
@@ -0,0 +1,96 @@
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s
+
+
+; COMMON-LABEL: {{^}}fdiv_f64:
+; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0
+; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]]
+; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]]
+
+; Check for div_scale bug workaround on SI
+; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]]
+; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]]
+
+; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]]
+
+; SI-DAG: v_cmp_eq_i32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}}
+; SI-DAG: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI-DAG: s_xor_b64 vcc, [[CMP0]], vcc
+
+; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0
+; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]]
+; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0
+; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]]
+; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]]
+; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]]
+; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]]
+; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]]
+; COMMON: buffer_store_dwordx2 [[RESULT]]
+; COMMON: s_endpgm
+define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind {
+  %gep.1 = getelementptr double addrspace(1)* %in, i32 1
+  %num = load double addrspace(1)* %in
+  %den = load double addrspace(1)* %gep.1
+  %result = fdiv double %num, %den
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; COMMON-LABEL: {{^}}fdiv_f64_s_v:
+define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind {
+  %den = load double addrspace(1)* %in
+  %result = fdiv double %num, %den
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; COMMON-LABEL: {{^}}fdiv_f64_v_s:
+define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind {
+  %num = load double addrspace(1)* %in
+  %result = fdiv double %num, %den
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; COMMON-LABEL: {{^}}fdiv_f64_s_s:
+define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind {
+  %result = fdiv double %num, %den
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; COMMON-LABEL: {{^}}v_fdiv_v2f64:
+define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind {
+  %gep.1 = getelementptr <2 x double> addrspace(1)* %in, i32 1
+  %num = load <2 x double> addrspace(1)* %in
+  %den = load <2 x double> addrspace(1)* %gep.1
+  %result = fdiv <2 x double> %num, %den
+  store <2 x double> %result, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; COMMON-LABEL: {{^}}s_fdiv_v2f64:
+define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) {
+  %result = fdiv <2 x double> %num, %den
+  store <2 x double> %result, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; COMMON-LABEL: {{^}}v_fdiv_v4f64:
+define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind {
+  %gep.1 = getelementptr <4 x double> addrspace(1)* %in, i32 1
+  %num = load <4 x double> addrspace(1)* %in
+  %den = load <4 x double> addrspace(1)* %gep.1
+  %result = fdiv <4 x double> %num, %den
+  store <4 x double> %result, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; COMMON-LABEL: {{^}}s_fdiv_v4f64:
+define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) {
+  %result = fdiv <4 x double> %num, %den
+  store <4 x double> %result, <4 x double> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 5321fdb..603287f 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; These tests check that fdiv is expanded correctly and also test that the
 ; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
diff --git a/test/CodeGen/R600/fdiv64.ll b/test/CodeGen/R600/fdiv64.ll
deleted file mode 100644
index d424898..0000000
--- a/test/CodeGen/R600/fdiv64.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
-
-; CHECK: {{^}}fdiv_f64:
-; CHECK: v_rcp_f64_e32 {{v\[[0-9]+:[0-9]+\]}}
-; CHECK: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
-
-define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
-                      double addrspace(1)* %in2) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
-   %r2 = fdiv double %r0, %r1
-   store double %r2, double addrspace(1)* %out
-   ret void
-}
diff --git a/test/CodeGen/R600/ffloor.f64.ll b/test/CodeGen/R600/ffloor.f64.ll
new file mode 100644
index 0000000..745ad3b
--- /dev/null
+++ b/test/CodeGen/R600/ffloor.f64.ll
@@ -0,0 +1,106 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+declare double @llvm.floor.f64(double) nounwind readnone
+declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
+declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone
+declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone
+declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
+
+; FUNC-LABEL: {{^}}ffloor_f64:
+; CI: v_floor_f64_e32
+
+; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
+; SI: s_lshr_b64
+; SI: s_not_b64
+; SI: s_and_b64
+; SI: cmp_lt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_gt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI-DAG: v_cmp_lt_f64
+; SI-DAG: v_cmp_lg_f64
+; SI-DAG: s_and_b64
+; SI-DAG: v_cndmask_b32
+; SI-DAG: v_cndmask_b32
+; SI: v_add_f64
+; SI: s_endpgm
+define void @ffloor_f64(double addrspace(1)* %out, double %x) {
+  %y = call double @llvm.floor.f64(double %x) nounwind readnone
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_v2f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+  %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
+  store <2 x double> %y, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64:
+; FIXME-CI: v_floor_f64_e32
+; FIXME-CI: v_floor_f64_e32
+; FIXME-CI: v_floor_f64_e32
+; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+;   %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
+;   store <3 x double> %y, <3 x double> addrspace(1)* %out
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}ffloor_v4f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+  %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
+  store <4 x double> %y, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_v8f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+  %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
+  store <8 x double> %y, <8 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_v16f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+  %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
+  store <16 x double> %y, <16 x double> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/ffloor.ll b/test/CodeGen/R600/ffloor.ll
index 166f705..61c46ac 100644
--- a/test/CodeGen/R600/ffloor.ll
+++ b/test/CodeGen/R600/ffloor.ll
@@ -1,104 +1,49 @@
-; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare double @llvm.floor.f64(double) nounwind readnone
-declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
-declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone
-declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone
-declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone
-declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}floor_f32:
+; SI: v_floor_f32_e32
+; R600: FLOOR
+define void @floor_f32(float addrspace(1)* %out, float %in) {
+  %tmp = call float @llvm.floor.f32(float %in) #0
+  store float %tmp, float addrspace(1)* %out
+  ret void
+}
 
-; FUNC-LABEL: {{^}}ffloor_f64:
-; CI: v_floor_f64_e32
+; FUNC-LABEL: {{^}}floor_v2f32:
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
 
-; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
-; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
-; SI: s_lshr_b64
-; SI: s_not_b64
-; SI: s_and_b64
-; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: cmp_lt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
-; SI: cmp_gt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
-; SI: cmp_lt_f64
-; SI: cndmask_b32
-; SI: cmp_ne_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
-; SI: v_add_f64
-define void @ffloor_f64(double addrspace(1)* %out, double %x) {
-  %y = call double @llvm.floor.f64(double %x) nounwind readnone
-  store double %y, double addrspace(1)* %out
+define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+  %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0
+  store <2 x float> %tmp, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}ffloor_v2f64:
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
-  %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
-  store <2 x double> %y, <2 x double> addrspace(1)* %out
+; FUNC-LABEL: {{^}}floor_v4f32:
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
+
+; R600: FLOOR
+; R600: FLOOR
+; R600: FLOOR
+; R600: FLOOR
+define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+  %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0
+  store <4 x float> %tmp, <4 x float> addrspace(1)* %out
   ret void
 }
 
-; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64:
-; FIXME-CI: v_floor_f64_e32
-; FIXME-CI: v_floor_f64_e32
-; FIXME-CI: v_floor_f64_e32
-; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
-;   %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
-;   store <3 x double> %y, <3 x double> addrspace(1)* %out
-;   ret void
-; }
+; Function Attrs: nounwind readonly
+declare float @llvm.floor.f32(float) #0
 
-; FUNC-LABEL: {{^}}ffloor_v4f64:
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
-  %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
-  store <4 x double> %y, <4 x double> addrspace(1)* %out
-  ret void
-}
+; Function Attrs: nounwind readonly
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0
 
-; FUNC-LABEL: {{^}}ffloor_v8f64:
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
-  %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
-  store <8 x double> %y, <8 x double> addrspace(1)* %out
-  ret void
-}
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0
 
-; FUNC-LABEL: {{^}}ffloor_v16f64:
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
-  %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
-  store <16 x double> %y, <16 x double> addrspace(1)* %out
-  ret void
-}
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/R600/flat-address-space.ll b/test/CodeGen/R600/flat-address-space.ll
index fc5af7c..2e98bf5 100644
--- a/test/CodeGen/R600/flat-address-space.ll
+++ b/test/CodeGen/R600/flat-address-space.ll
@@ -1,5 +1,7 @@
-; RUN: llc -O0 -march=r600 -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; RUN: llc -O0 -march=r600 -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
 
 ; Disable optimizations in case there are optimizations added that
 ; specialize away generic pointer accesses.
diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll
index 67e86c4..c6bfb85 100644
--- a/test/CodeGen/R600/floor.ll
+++ b/test/CodeGen/R600/floor.ll
@@ -1,7 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
 
+; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 define void @test(<4 x float> inreg %reg0) #0 {
    %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = call float @floor(float %r0)
@@ -13,4 +12,4 @@ define void @test(<4 x float> inreg %reg0) #0 {
 declare float @floor(float) readonly
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
-attributes #0 = { "ShaderType"="0" }
-\ No newline at end of file
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/fma-combine.ll b/test/CodeGen/R600/fma-combine.ll
new file mode 100644
index 0000000..9aac90c
--- /dev/null
+++ b/test/CodeGen/R600/fma-combine.ll
@@ -0,0 +1,368 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare double @llvm.fabs.f64(double) #0
+declare double @llvm.fma.f64(double, double, double) #0
+declare float @llvm.fma.f32(float, float, float) #0
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fadd double %mul, %c
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+  %d = load double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %fma0 = fadd double %mul, %c
+  %fma1 = fadd double %mul, %d
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fadd x, (fmul y, z)) -> (fma y, z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fadd double %c, %mul
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fsub double %mul, %c
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+  %d = load double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %fma0 = fsub double %mul, %c
+  %fma1 = fsub double %mul, %d
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fsub double %c, %mul
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+  %d = load double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %fma0 = fsub double %c, %mul
+  %fma1 = fsub double %d, %mul
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma = fsub double %mul.neg, %c
+
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+  %d = load double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma0 = fsub double %mul.neg, %c
+  %fma1 = fsub double %mul.neg, %d
+
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0
+  %b = load double addrspace(1)* %gep.1
+  %c = load double addrspace(1)* %gep.2
+  %d = load double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma0 = fsub double %mul.neg, %c
+  %fma1 = fsub double %mul, %d
+
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
+; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
+; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr double addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %x = load double addrspace(1)* %gep.0
+  %y = load double addrspace(1)* %gep.1
+  %z = load double addrspace(1)* %gep.2
+  %u = load double addrspace(1)* %gep.3
+  %v = load double addrspace(1)* %gep.4
+
+  %tmp0 = fmul double %u, %v
+  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
+  %tmp2 = fsub double %tmp1, %z
+
+  store double %tmp2, double addrspace(1)* %gep.out
+  ret void
+}
+
+; fold (fsub x, (fma y, z, (fmul u, v)))
+;   -> (fma (fneg y), z, (fma (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
+; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
+; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr double addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr double addrspace(1)* %out, i32 %tid
+
+  %x = load double addrspace(1)* %gep.0
+  %y = load double addrspace(1)* %gep.1
+  %z = load double addrspace(1)* %gep.2
+  %u = load double addrspace(1)* %gep.3
+  %v = load double addrspace(1)* %gep.4
+
+  %tmp0 = fmul double %u, %v
+  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
+  %tmp2 = fsub double %x, %tmp1
+
+  store double %tmp2, double addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/R600/fma.f64.ll b/test/CodeGen/R600/fma.f64.ll
index 4b0ab76..bca312b 100644
--- a/test/CodeGen/R600/fma.f64.ll
+++ b/test/CodeGen/R600/fma.f64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare double @llvm.fma.f64(double, double, double) nounwind readnone
 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/R600/fma.ll b/test/CodeGen/R600/fma.ll
index 637e799..f3861ff 100644
--- a/test/CodeGen/R600/fma.ll
+++ b/test/CodeGen/R600/fma.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.fma.f32(float, float, float) nounwind readnone
diff --git a/test/CodeGen/R600/fmax3.ll b/test/CodeGen/R600/fmax3.ll
index cf371b3..629c032 100644
--- a/test/CodeGen/R600/fmax3.ll
+++ b/test/CodeGen/R600/fmax3.ll
@@ -1,11 +1,12 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare float @llvm.maxnum.f32(float, float) nounwind readnone
 
 ; SI-LABEL: {{^}}test_fmax3_olt_0:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
@@ -21,8 +22,8 @@ define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
 
 ; Commute operand of second fmax
 ; SI-LABEL: {{^}}test_fmax3_olt_1:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
diff --git a/test/CodeGen/R600/fmax_legacy.f64.ll b/test/CodeGen/R600/fmax_legacy.f64.ll
new file mode 100644
index 0000000..762853d
--- /dev/null
+++ b/test/CodeGen/R600/fmax_legacy.f64.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; Make sure we don't try to form FMAX_LEGACY nodes with f64
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FUNC-LABEL: @test_fmax_legacy_uge_f64
+define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp uge double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_oge_f64
+define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp oge double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_ugt_f64
+define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp ugt double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_ogt_f64
+define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp ogt double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fmax_legacy.ll b/test/CodeGen/R600/fmax_legacy.ll
index e9d837b..46f0e98 100644
--- a/test/CodeGen/R600/fmax_legacy.ll
+++ b/test/CodeGen/R600/fmax_legacy.ll
@@ -1,12 +1,17 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
+; FIXME: Should replace unsafe-fp-math with no signed zeros.
+
 declare i32 @llvm.r600.read.tidig.x() #1
 
 ; FUNC-LABEL: @test_fmax_legacy_uge_f32
 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
 ; EG: MAX
 define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -24,8 +29,9 @@ define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(
 
 ; FUNC-LABEL: @test_fmax_legacy_oge_f32
 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
 define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -43,8 +49,9 @@ define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(
 
 ; FUNC-LABEL: @test_fmax_legacy_ugt_f32
 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
 define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -62,8 +69,9 @@ define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(
 
 ; FUNC-LABEL: @test_fmax_legacy_ogt_f32
 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
 define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -79,5 +87,30 @@ define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(
   ret void
 }
 
+
+; FUNC-LABEL: @test_fmax_legacy_ogt_f32_multi_use
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-NOT: v_max_
+; SI: v_cmp_gt_f32
+; SI-NEXT: v_cndmask_b32
+; SI-NOT: v_max_
+
+; EG: MAX
+define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ogt float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out0, align 4
+  store i1 %cmp, i1addrspace(1)* %out1
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fmaxnum.f64.ll b/test/CodeGen/R600/fmaxnum.f64.ll
index 51cbf4d..de563ce 100644
--- a/test/CodeGen/R600/fmaxnum.f64.ll
+++ b/test/CodeGen/R600/fmaxnum.f64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare double @llvm.maxnum.f64(double, double) #0
 declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0
diff --git a/test/CodeGen/R600/fmaxnum.ll b/test/CodeGen/R600/fmaxnum.ll
index 01d30b0..c105598 100644
--- a/test/CodeGen/R600/fmaxnum.ll
+++ b/test/CodeGen/R600/fmaxnum.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare float @llvm.maxnum.f32(float, float) #0
 declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
diff --git a/test/CodeGen/R600/fmin3.ll b/test/CodeGen/R600/fmin3.ll
index 7420368..e3acb31 100644
--- a/test/CodeGen/R600/fmin3.ll
+++ b/test/CodeGen/R600/fmin3.ll
@@ -1,11 +1,13 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare float @llvm.minnum.f32(float, float) nounwind readnone
 
 ; SI-LABEL: {{^}}test_fmin3_olt_0:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
@@ -21,8 +23,8 @@ define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
 
 ; Commute operand of second fmin
 ; SI-LABEL: {{^}}test_fmin3_olt_1:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
diff --git a/test/CodeGen/R600/fmin_legacy.f64.ll b/test/CodeGen/R600/fmin_legacy.f64.ll
new file mode 100644
index 0000000..83043cd
--- /dev/null
+++ b/test/CodeGen/R600/fmin_legacy.f64.ll
@@ -0,0 +1,77 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FUNC-LABEL: @test_fmin_legacy_f64
+define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
+   %r0 = extractelement <4 x double> %reg0, i32 0
+   %r1 = extractelement <4 x double> %reg0, i32 1
+   %r2 = fcmp uge double %r0, %r1
+   %r3 = select i1 %r2, double %r1, double %r0
+   %vec = insertelement <4 x double> undef, double %r3, i32 0
+   store <4 x double> %vec, <4 x double> addrspace(1)* %out, align 16
+   ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ule_f64
+define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp ule double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ole_f64
+define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp ole double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_olt_f64
+define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp olt double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ult_f64
+define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp ult double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fmin_legacy.ll b/test/CodeGen/R600/fmin_legacy.ll
index 2fbdb6b..5014f6c 100644
--- a/test/CodeGen/R600/fmin_legacy.ll
+++ b/test/CodeGen/R600/fmin_legacy.ll
@@ -1,11 +1,15 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math  -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
+; FIXME: Should replace unsafe-fp-math with no signed zeros.
+
 declare i32 @llvm.r600.read.tidig.x() #1
 
 ; FUNC-LABEL: @test_fmin_legacy_f32
 ; EG: MIN *
-; SI: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-NONAN: v_min_f32_e32
 define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
    %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = extractelement <4 x float> %reg0, i32 1
@@ -18,8 +22,9 @@ define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> in
 
 ; FUNC-LABEL: @test_fmin_legacy_ule_f32
 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
@@ -36,8 +41,9 @@ define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(
 
 ; FUNC-LABEL: @test_fmin_legacy_ole_f32
 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
@@ -54,8 +60,9 @@ define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(
 
 ; FUNC-LABEL: @test_fmin_legacy_olt_f32
 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
@@ -72,8 +79,9 @@ define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(
 
 ; FUNC-LABEL: @test_fmin_legacy_ult_f32
 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
@@ -88,5 +96,28 @@ define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(
   ret void
 }
 
+; FUNC-LABEL: @test_fmin_legacy_ole_f32_multi_use
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-NOT: v_min
+; SI: v_cmp_le_f32
+; SI-NEXT: v_cndmask_b32
+; SI-NOT: v_min
+; SI: s_endpgm
+define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ole float %a, %b
+  %val0 = select i1 %cmp, float %a, float %b
+  store float %val0, float addrspace(1)* %out0, align 4
+  store i1 %cmp, i1 addrspace(1)* %out1
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fminnum.f64.ll b/test/CodeGen/R600/fminnum.f64.ll
index 11b0c20..0f929d6 100644
--- a/test/CodeGen/R600/fminnum.f64.ll
+++ b/test/CodeGen/R600/fminnum.f64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare double @llvm.minnum.f64(double, double) #0
 declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0
diff --git a/test/CodeGen/R600/fminnum.ll b/test/CodeGen/R600/fminnum.ll
index 65adab6..6b93b83 100644
--- a/test/CodeGen/R600/fminnum.ll
+++ b/test/CodeGen/R600/fminnum.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare float @llvm.minnum.f32(float, float) #0
 declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
index eabb271..6c09aa2 100644
--- a/test/CodeGen/R600/fmul.ll
+++ b/test/CodeGen/R600/fmul.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/R600/fmul64.ll b/test/CodeGen/R600/fmul64.ll
index 0a5f707..9d7787c 100644
--- a/test/CodeGen/R600/fmul64.ll
+++ b/test/CodeGen/R600/fmul64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
 
 ; FUNC-LABEL: {{^}}fmul_f64:
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/R600/fmuladd.ll b/test/CodeGen/R600/fmuladd.ll
index 16003a5..2b70863 100644
--- a/test/CodeGen/R600/fmuladd.ll
+++ b/test/CodeGen/R600/fmuladd.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
 
 declare float @llvm.fmuladd.f32(float, float, float)
 declare double @llvm.fmuladd.f64(double, double, double)
@@ -33,7 +33,7 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 
 ; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32
 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
@@ -52,7 +52,7 @@ define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %
 
 ; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32
 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
@@ -71,7 +71,7 @@ define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %
 
 ; CHECK-LABEL: {{^}}fadd_a_a_b_f32:
 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fadd_a_a_b_f32(float addrspace(1)* %out,
@@ -93,7 +93,7 @@ define void @fadd_a_a_b_f32(float addrspace(1)* %out,
 
 ; CHECK-LABEL: {{^}}fadd_b_a_a_f32:
 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fadd_b_a_a_f32(float addrspace(1)* %out,
@@ -115,7 +115,7 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out,
 
 ; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
@@ -135,7 +135,7 @@ define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1
 
 ; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32
 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
@@ -157,7 +157,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa
 
 ; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
@@ -179,7 +179,7 @@ define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1
 
 ; CHECK-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32
 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
diff --git a/test/CodeGen/R600/fnearbyint.ll b/test/CodeGen/R600/fnearbyint.ll
index 1c1d731..4fa9ada 100644
--- a/test/CodeGen/R600/fnearbyint.ll
+++ b/test/CodeGen/R600/fnearbyint.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s
 
 ; This should have the exactly the same output as the test for rint,
 ; so no need to check anything.
diff --git a/test/CodeGen/R600/fneg-fabs.f64.ll b/test/CodeGen/R600/fneg-fabs.f64.ll
index 555f4cc..7e6ede6 100644
--- a/test/CodeGen/R600/fneg-fabs.f64.ll
+++ b/test/CodeGen/R600/fneg-fabs.f64.ll
@@ -1,12 +1,11 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FIXME: Check something here. Currently it seems fabs + fneg aren't
 ; into 2 modifiers, although theoretically that should work.
 
 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x7fffffff
-; SI: v_and_b32_e32 v[[FABS:[0-9]+]], {{s[0-9]+}}, [[IMMREG]]
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+}}:[[FABS]]{{\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
 define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
   %fabs = call double @llvm.fabs.f64(double %x)
   %fsub = fsub double -0.000000e+00, %fabs
@@ -56,8 +55,8 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
 }
 
 ; FUNC-LABEL: {{^}}fneg_fabs_f64:
-; SI: s_load_dwordx2
 ; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
+; SI: s_load_dwordx2
 ; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
 ; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
 ; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
diff --git a/test/CodeGen/R600/fneg-fabs.ll b/test/CodeGen/R600/fneg-fabs.ll
index 3cc832f..4fde048 100644
--- a/test/CodeGen/R600/fneg-fabs.ll
+++ b/test/CodeGen/R600/fneg-fabs.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
diff --git a/test/CodeGen/R600/fneg.f64.ll b/test/CodeGen/R600/fneg.f64.ll
index 7aa08a9..aa6df20 100644
--- a/test/CodeGen/R600/fneg.f64.ll
+++ b/test/CodeGen/R600/fneg.f64.ll
@@ -1,7 +1,8 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fneg_f64:
-; SI: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_f64(double addrspace(1)* %out, double %in) {
   %fneg = fsub double -0.000000e+00, %in
   store double %fneg, double addrspace(1)* %out
@@ -9,8 +10,8 @@ define void @fneg_f64(double addrspace(1)* %out, double %in) {
 }
 
 ; FUNC-LABEL: {{^}}fneg_v2f64:
-; SI: v_xor_b32
-; SI: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
   %fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in
   store <2 x double> %fneg, <2 x double> addrspace(1)* %out
@@ -23,10 +24,10 @@ define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double>
 ; R600: -PV
 ; R600: -PV
 
-; SI: v_xor_b32
-; SI: v_xor_b32
-; SI: v_xor_b32
-; SI: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) {
   %fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in
   store <4 x double> %fneg, <4 x double> addrspace(1)* %out
@@ -38,8 +39,7 @@ define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double>
 ; unless the target returns true for isNegFree()
 
 ; FUNC-LABEL: {{^}}fneg_free_f64:
-; FIXME: Unnecessary copy to VGPRs
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]$}}
+; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}}
 define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc = bitcast i64 %in to double
   %fsub = fsub double 0.0, %bc
@@ -47,10 +47,11 @@ define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
   ret void
 }
 
-; SI-LABEL: {{^}}fneg_fold_f64:
+; GCN-LABEL: {{^}}fneg_fold_f64:
 ; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-NOT: xor
-; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
+; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-NOT: xor
+; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
 define void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
   %fsub = fsub double -0.0, %in
   %fmul = fmul double %fsub, %in
diff --git a/test/CodeGen/R600/fneg.ll b/test/CodeGen/R600/fneg.ll
index c20cf24..a0fd539 100644
--- a/test/CodeGen/R600/fneg.ll
+++ b/test/CodeGen/R600/fneg.ll
@@ -1,10 +1,11 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fneg_f32:
 ; R600: -PV
 
-; SI: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_f32(float addrspace(1)* %out, float %in) {
   %fneg = fsub float -0.000000e+00, %in
   store float %fneg, float addrspace(1)* %out
@@ -15,8 +16,8 @@ define void @fneg_f32(float addrspace(1)* %out, float %in) {
 ; R600: -PV
 ; R600: -PV
 
-; SI: v_xor_b32
-; SI: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
   %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
   store <2 x float> %fneg, <2 x float> addrspace(1)* %out
@@ -29,10 +30,10 @@ define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %i
 ; R600: -PV
 ; R600: -PV
 
-; SI: v_xor_b32
-; SI: v_xor_b32
-; SI: v_xor_b32
-; SI: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
   %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
   store <4 x float> %fneg, <4 x float> addrspace(1)* %out
@@ -48,7 +49,7 @@ define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %i
 ; R600: -KC0[2].Z
 
 ; XXX: We could use v_add_f32_e64 with the negate bit here instead.
-; SI: v_sub_f32_e64 v{{[0-9]}}, 0.0, s{{[0-9]+$}}
+; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}}
 define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fsub = fsub float 0.0, %bc
@@ -58,8 +59,9 @@ define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
 
 ; FUNC-LABEL: {{^}}fneg_fold_f32:
 ; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
-; SI-NOT: xor
-; SI: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
+; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
+; GCN-NOT: xor
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
 define void @fneg_fold_f32(float addrspace(1)* %out, float %in) {
   %fsub = fsub float -0.0, %in
   %fmul = fmul float %fsub, %in
diff --git a/test/CodeGen/R600/fp-classify.ll b/test/CodeGen/R600/fp-classify.ll
new file mode 100644
index 0000000..4fac517
--- /dev/null
+++ b/test/CodeGen/R600/fp-classify.ll
@@ -0,0 +1,131 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i1 @llvm.AMDGPU.class.f32(float, i32) #1
+declare i1 @llvm.AMDGPU.class.f64(double, i32) #1
+declare i32 @llvm.r600.read.tidig.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare double @llvm.fabs.f64(double) #1
+
+; SI-LABEL: {{^}}test_isinf_pattern:
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x204{{$}}
+; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]]
+; SI-NOT: v_cmp
+; SI: s_endpgm
+define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 {
+  %fabs = tail call float @llvm.fabs.f32(float %x) #1
+  %cmp = fcmp oeq float %fabs, 0x7FF0000000000000
+  %ext = zext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_not_isinf_pattern_0:
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+  %fabs = tail call float @llvm.fabs.f32(float %x) #1
+  %cmp = fcmp ueq float %fabs, 0x7FF0000000000000
+  %ext = zext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_not_isinf_pattern_1:
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
+  %fabs = tail call float @llvm.fabs.f32(float %x) #1
+  %cmp = fcmp oeq float %fabs, 0xFFF0000000000000
+  %ext = zext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_isfinite_pattern_0:
+; SI-NOT: v_cmp
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}}
+; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]]
+; SI-NOT: v_cmp
+; SI: s_endpgm
+define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+  %ord = fcmp ord float %x, 0.000000e+00
+  %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
+  %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
+  %and = and i1 %ord, %ninf
+  %ext = zext i1 %and to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; Use negative infinity
+; SI-LABEL: {{^}}test_isfinite_not_pattern_0:
+; SI-NOT: v_cmp_class_f32
+; SI: s_endpgm
+define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+  %ord = fcmp ord float %x, 0.000000e+00
+  %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
+  %ninf = fcmp une float %x.fabs, 0xFFF0000000000000
+  %and = and i1 %ord, %ninf
+  %ext = zext i1 %and to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; No fabs
+; SI-LABEL: {{^}}test_isfinite_not_pattern_1:
+; SI-NOT: v_cmp_class_f32
+; SI: s_endpgm
+define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
+  %ord = fcmp ord float %x, 0.000000e+00
+  %ninf = fcmp une float %x, 0x7FF0000000000000
+  %and = and i1 %ord, %ninf
+  %ext = zext i1 %and to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; fabs of different value
+; SI-LABEL: {{^}}test_isfinite_not_pattern_2:
+; SI-NOT: v_cmp_class_f32
+; SI: s_endpgm
+define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 {
+  %ord = fcmp ord float %x, 0.000000e+00
+  %x.fabs = tail call float @llvm.fabs.f32(float %y) #1
+  %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
+  %and = and i1 %ord, %ninf
+  %ext = zext i1 %and to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; Wrong ordered compare type
+; SI-LABEL: {{^}}test_isfinite_not_pattern_3:
+; SI-NOT: v_cmp_class_f32
+; SI: s_endpgm
+define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 {
+  %ord = fcmp uno float %x, 0.000000e+00
+  %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
+  %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
+  %and = and i1 %ord, %ninf
+  %ext = zext i1 %and to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; Wrong unordered compare
+; SI-LABEL: {{^}}test_isfinite_not_pattern_4:
+; SI-NOT: v_cmp_class_f32
+; SI: s_endpgm
+define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 {
+  %ord = fcmp ord float %x, 0.000000e+00
+  %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
+  %ninf = fcmp one float %x.fabs, 0x7FF0000000000000
+  %and = and i1 %ord, %ninf
+  %ext = zext i1 %and to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fp16_to_fp.ll b/test/CodeGen/R600/fp16_to_fp.ll
index ec3e051..da78f61 100644
--- a/test/CodeGen/R600/fp16_to_fp.ll
+++ b/test/CodeGen/R600/fp16_to_fp.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
 declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
diff --git a/test/CodeGen/R600/fp32_to_fp16.ll b/test/CodeGen/R600/fp32_to_fp16.ll
index e86ee62..c3c65ae 100644
--- a/test/CodeGen/R600/fp32_to_fp16.ll
+++ b/test/CodeGen/R600/fp32_to_fp16.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
 
diff --git a/test/CodeGen/R600/fp_to_sint.f64.ll b/test/CodeGen/R600/fp_to_sint.f64.ll
index 09edb40..e641847 100644
--- a/test/CodeGen/R600/fp_to_sint.f64.ll
+++ b/test/CodeGen/R600/fp_to_sint.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index c583ec3..16549c3 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,16 +1,28 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+
+declare float @llvm.fabs.f32(float) #0
 
 ; FUNC-LABEL: {{^}}fp_to_sint_i32:
 ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; SI: v_cvt_i32_f32_e32
 ; SI: s_endpgm
-define void @fp_to_sint_i32 (i32 addrspace(1)* %out, float %in) {
+define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
   %conv = fptosi float %in to i32
   store i32 %conv, i32 addrspace(1)* %out
   ret void
 }
 
+; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs:
+; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
+define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
+  %in.fabs = call float @llvm.fabs.f32(float %in) #0
+  %conv = fptosi float %in.fabs to i32
+  store i32 %conv, i32 addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}fp_to_sint_v2i32:
 ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
@@ -214,3 +226,5 @@ define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
   store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
   ret void
 }
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fp_to_uint.f64.ll b/test/CodeGen/R600/fp_to_uint.f64.ll
index 25859bb..1ffe2fa 100644
--- a/test/CodeGen/R600/fp_to_uint.f64.ll
+++ b/test/CodeGen/R600/fp_to_uint.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index 91bf4b7..804d90f 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -1,29 +1,31 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
 
-; FUNC-LABEL: {{^}}fp_to_uint_i32:
+; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i32:
 ; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+
 ; SI: v_cvt_u32_f32_e32
 ; SI: s_endpgm
-define void @fp_to_uint_i32 (i32 addrspace(1)* %out, float %in) {
+define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) {
   %conv = fptoui float %in to i32
   store i32 %conv, i32 addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}fp_to_uint_v2i32:
+; FUNC-LABEL: {{^}}fp_to_uint_v2f32_to_v2i32:
 ; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
 ; SI: v_cvt_u32_f32_e32
 ; SI: v_cvt_u32_f32_e32
-
-define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   %result = fptoui <2 x float> %in to <2 x i32>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}fp_to_uint_v4i32:
+; FUNC-LABEL: {{^}}fp_to_uint_v4f32_to_v4i32:
 ; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
@@ -33,14 +35,14 @@ define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
 ; SI: v_cvt_u32_f32_e32
 ; SI: v_cvt_u32_f32_e32
 
-define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float> addrspace(1) * %in
   %result = fptoui <4 x float> %value to <4 x i32>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
 
-; FUNC: {{^}}fp_to_uint_i64:
+; FUNC: {{^}}fp_to_uint_f32_to_i64:
 ; EG-DAG: AND_INT
 ; EG-DAG: LSHR
 ; EG-DAG: SUB_INT
@@ -64,13 +66,13 @@ define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspac
 ; EG-DAG: CNDE_INT
 
 ; SI: s_endpgm
-define void @fp_to_uint_i64(i64 addrspace(1)* %out, float %x) {
+define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) {
   %conv = fptoui float %x to i64
   store i64 %conv, i64 addrspace(1)* %out
   ret void
 }
 
-; FUNC: {{^}}fp_to_uint_v2i64:
+; FUNC: {{^}}fp_to_uint_v2f32_to_v2i64:
 ; EG-DAG: AND_INT
 ; EG-DAG: LSHR
 ; EG-DAG: SUB_INT
@@ -115,13 +117,13 @@ define void @fp_to_uint_i64(i64 addrspace(1)* %out, float %x) {
 ; EG-DAG: CNDE_INT
 
 ; SI: s_endpgm
-define void @fp_to_uint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
   %conv = fptoui <2 x float> %x to <2 x i64>
   store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
   ret void
 }
 
-; FUNC: {{^}}fp_to_uint_v4i64:
+; FUNC: {{^}}fp_to_uint_v4f32_to_v4i64:
 ; EG-DAG: AND_INT
 ; EG-DAG: LSHR
 ; EG-DAG: SUB_INT
@@ -208,7 +210,7 @@ define void @fp_to_uint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
 ; EG-DAG: CNDE_INT
 
 ; SI: s_endpgm
-define void @fp_to_uint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
   %conv = fptoui <4 x float> %x to <4 x i64>
   store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/fpext.ll b/test/CodeGen/R600/fpext.ll
index 418395f..734a43b 100644
--- a/test/CodeGen/R600/fpext.ll
+++ b/test/CodeGen/R600/fpext.ll
@@ -1,9 +1,45 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; CHECK: {{^}}fpext:
-; CHECK: v_cvt_f64_f32_e32
-define void @fpext(double addrspace(1)* %out, float %in) {
+; FUNC-LABEL: {{^}}fpext_f32_to_f64:
+; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) {
   %result = fpext float %in to double
   store double %result, double addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64:
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) {
+  %result = fpext <2 x float> %in to <2 x double>
+  store <2 x double> %result, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fpext_v4f32_to_v4f64:
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) {
+  %result = fpext <4 x float> %in to <4 x double>
+  store <4 x double> %result, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fpext_v8f32_to_v8f64:
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) {
+  %result = fpext <8 x float> %in to <8 x double>
+  store <8 x double> %result, <8 x double> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fptrunc.ll b/test/CodeGen/R600/fptrunc.ll
index 8ac8d3b..385e10e 100644
--- a/test/CodeGen/R600/fptrunc.ll
+++ b/test/CodeGen/R600/fptrunc.ll
@@ -1,9 +1,45 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; CHECK: {{^}}fptrunc:
-; CHECK: v_cvt_f32_f64_e32
-define void @fptrunc(float addrspace(1)* %out, double %in) {
+; FUNC-LABEL: {{^}}fptrunc_f64_to_f32:
+; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
   %result = fptrunc double %in to float
   store float %result, float addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32:
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
+  %result = fptrunc <2 x double> %in to <2 x float>
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32:
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
+  %result = fptrunc <4 x double> %in to <4 x float>
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32:
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
+  %result = fptrunc <8 x double> %in to <8 x float>
+  store <8 x float> %result, <8 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/frem.ll b/test/CodeGen/R600/frem.ll
index c846a77..02a0070 100644
--- a/test/CodeGen/R600/frem.ll
+++ b/test/CodeGen/R600/frem.ll
@@ -1,16 +1,18 @@
-; RUN: llc -march=r600 -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}frem_f32:
-; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:0x10
-; SI-DAG: v_cmp
-; SI-DAG: v_mul_f32
-; SI: v_rcp_f32_e32
-; SI: v_mul_f32_e32
-; SI: v_mul_f32_e32
-; SI: v_trunc_f32_e32
-; SI: v_mad_f32
-; SI: s_endpgm
+; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
+; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
+; GCN-DAG: v_cmp
+; GCN-DAG: v_mul_f32
+; GCN: v_rcp_f32_e32
+; GCN: v_mul_f32_e32
+; GCN: v_mul_f32_e32
+; GCN: v_trunc_f32_e32
+; GCN: v_mad_f32
+; GCN: s_endpgm
 define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                       float addrspace(1)* %in2) #0 {
    %gep2 = getelementptr float addrspace(1)* %in2, i32 4
@@ -22,14 +24,14 @@ define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
 }
 
 ; FUNC-LABEL: {{^}}unsafe_frem_f32:
-; SI: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:0x10
-; SI: buffer_load_dword [[X:v[0-9]+]], {{.*}}
-; SI: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
-; SI: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
-; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
-; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
+; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
+; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}}
+; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
+; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
+; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
+; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: s_endpgm
 define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                              float addrspace(1)* %in2) #1 {
    %gep2 = getelementptr float addrspace(1)* %in2, i32 4
@@ -40,11 +42,17 @@ define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
    ret void
 }
 
-; TODO: This should check something when f64 fdiv is implemented
-; correctly
-
 ; FUNC-LABEL: {{^}}frem_f64:
-; SI: s_endpgm
+; GCN: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0
+; GCN: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0
+; GCN-DAG: v_div_fmas_f64
+; GCN-DAG: v_div_scale_f64
+; GCN-DAG: v_mul_f64
+; CI: v_trunc_f64_e32
+; CI: v_mul_f64
+; GCN: v_add_f64
+; GCN: buffer_store_dwordx2
+; GCN: s_endpgm
 define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) #0 {
    %r0 = load double addrspace(1)* %in1, align 8
@@ -55,11 +63,12 @@ define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 }
 
 ; FUNC-LABEL: {{^}}unsafe_frem_f64:
-; SI: v_rcp_f64_e32
-; SI: v_mul_f64
+; GCN: v_rcp_f64_e32
+; GCN: v_mul_f64
 ; SI: v_bfe_u32
-; SI: v_fma_f64
-; SI: s_endpgm
+; CI: v_trunc_f64_e32
+; GCN: v_fma_f64
+; GCN: s_endpgm
 define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                              double addrspace(1)* %in2) #1 {
    %r0 = load double addrspace(1)* %in1, align 8
diff --git a/test/CodeGen/R600/fsqrt.ll b/test/CodeGen/R600/fsqrt.ll
index 1f91faf..1fdf3e4 100644
--- a/test/CodeGen/R600/fsqrt.ll
+++ b/test/CodeGen/R600/fsqrt.ll
@@ -1,4 +1,9 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s
+
+; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
 
 ; CHECK: {{^}}fsqrt_f32:
 ; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
index 6e5ccf1..ef90fea 100644
--- a/test/CodeGen/R600/fsub.ll
+++ b/test/CodeGen/R600/fsub.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 
 ; FUNC-LABEL: {{^}}v_fsub_f32:
diff --git a/test/CodeGen/R600/fsub64.ll b/test/CodeGen/R600/fsub64.ll
index eca1b62..2d85cc5 100644
--- a/test/CodeGen/R600/fsub64.ll
+++ b/test/CodeGen/R600/fsub64.ll
@@ -1,12 +1,107 @@
-; RUN: llc -march=r600 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare double @llvm.fabs.f64(double) #0
 
 ; SI-LABEL: {{^}}fsub_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
-   %r2 = fsub double %r0, %r1
-   store double %r2, double addrspace(1)* %out
-   ret void
+  %r0 = load double addrspace(1)* %in1
+  %r1 = load double addrspace(1)* %in2
+  %r2 = fsub double %r0, %r1
+  store double %r2, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fsub_fabs_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
+define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                           double addrspace(1)* %in2) {
+  %r0 = load double addrspace(1)* %in1
+  %r1 = load double addrspace(1)* %in2
+  %r1.fabs = call double @llvm.fabs.f64(double %r1) #0
+  %r2 = fsub double %r0, %r1.fabs
+  store double %r2, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fsub_fabs_inv_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, -v\[[0-9]+:[0-9]+\]}}
+define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                               double addrspace(1)* %in2) {
+  %r0 = load double addrspace(1)* %in1
+  %r1 = load double addrspace(1)* %in2
+  %r0.fabs = call double @llvm.fabs.f64(double %r0) #0
+  %r2 = fsub double %r0.fabs, %r1
+  store double %r2, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
+  %sub = fsub double %a, %b
+  store double %sub, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_imm_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
+  %sub = fsub double 4.0, %a
+  store double %sub, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_imm_inv_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
+  %sub = fsub double %a, 4.0
+  store double %sub, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_self_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) {
+  %sub = fsub double %a, %a
+  store double %sub, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fsub_v2f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) {
+  %sub = fsub <2 x double> %a, %b
+  store <2 x double> %sub, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fsub_v4f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x double> addrspace(1)* %in, i32 1
+  %a = load <4 x double> addrspace(1)* %in
+  %b = load <4 x double> addrspace(1)* %b_ptr
+  %result = fsub <4 x double> %a, %b
+  store <4 x double> %result, <4 x double> addrspace(1)* %out
+  ret void
 }
+
+; SI-LABEL: {{^}}s_fsub_v4f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) {
+  %result = fsub <4 x double> %a, %b
+  store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/R600/ftrunc.f64.ll b/test/CodeGen/R600/ftrunc.f64.ll
index fba6154..21399a8 100644
--- a/test/CodeGen/R600/ftrunc.f64.ll
+++ b/test/CodeGen/R600/ftrunc.f64.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
 declare double @llvm.trunc.f64(double) nounwind readnone
 declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
@@ -23,12 +24,12 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
 ; CI: v_trunc_f64_e32
 
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 ; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
 ; SI: s_lshr_b64
+; SI: cmp_lt_i32
 ; SI: s_not_b64
 ; SI: s_and_b64
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: cmp_lt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
 ; SI: cmp_gt_i32
diff --git a/test/CodeGen/R600/ftrunc.ll b/test/CodeGen/R600/ftrunc.ll
index 0eb1d7d..edc0860 100644
--- a/test/CodeGen/R600/ftrunc.ll
+++ b/test/CodeGen/R600/ftrunc.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
 
 declare float @llvm.trunc.f32(float) nounwind readnone
 declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone
diff --git a/test/CodeGen/R600/gep-address-space.ll b/test/CodeGen/R600/gep-address-space.ll
index 036daaf..5c6920d 100644
--- a/test/CodeGen/R600/gep-address-space.ll
+++ b/test/CodeGen/R600/gep-address-space.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 
 define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
 ; CHECK-LABEL: {{^}}use_gep_address_space:
diff --git a/test/CodeGen/R600/global-directive.ll b/test/CodeGen/R600/global-directive.ll
index d1244b8..3ba12c2 100644
--- a/test/CodeGen/R600/global-directive.ll
+++ b/test/CodeGen/R600/global-directive.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; Make sure the GlobalDirective isn't merged with the function name
 
diff --git a/test/CodeGen/R600/global-extload-i1.ll b/test/CodeGen/R600/global-extload-i1.ll
new file mode 100644
index 0000000..67d36ce
--- /dev/null
+++ b/test/CodeGen/R600/global-extload-i1.ll
@@ -0,0 +1,302 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; FIXME: Evergreen broken
+
+; FUNC-LABEL: {{^}}zextload_global_i1_to_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %a = load i1 addrspace(1)* %in
+  %ext = zext i1 %a to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i1_to_i32:
+; SI: buffer_load_ubyte
+; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %a = load i1 addrspace(1)* %in
+  %ext = sext i1 %a to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i32:
+; SI: s_endpgm
+define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i1> addrspace(1)* %in
+  %ext = zext <1 x i1> %load to <1 x i32>
+  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i32:
+; SI: s_endpgm
+define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i1> addrspace(1)* %in
+  %ext = sext <1 x i1> %load to <1 x i32>
+  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i32:
+; SI: s_endpgm
+define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i1> addrspace(1)* %in
+  %ext = zext <2 x i1> %load to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i32:
+; SI: s_endpgm
+define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i1> addrspace(1)* %in
+  %ext = sext <2 x i1> %load to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i32:
+; SI: s_endpgm
+define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i1> addrspace(1)* %in
+  %ext = zext <4 x i1> %load to <4 x i32>
+  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i32:
+; SI: s_endpgm
+define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i1> addrspace(1)* %in
+  %ext = sext <4 x i1> %load to <4 x i32>
+  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i32:
+; SI: s_endpgm
+define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i1> addrspace(1)* %in
+  %ext = zext <8 x i1> %load to <8 x i32>
+  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i32:
+; SI: s_endpgm
+define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i1> addrspace(1)* %in
+  %ext = sext <8 x i1> %load to <8 x i32>
+  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i32:
+; SI: s_endpgm
+define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i1> addrspace(1)* %in
+  %ext = zext <16 x i1> %load to <16 x i32>
+  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i32:
+; SI: s_endpgm
+define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i1> addrspace(1)* %in
+  %ext = sext <16 x i1> %load to <16 x i32>
+  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i32:
+; XSI: s_endpgm
+; define void @zextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <32 x i1> addrspace(1)* %in
+;   %ext = zext <32 x i1> %load to <32 x i32>
+;   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i32:
+; XSI: s_endpgm
+; define void @sextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <32 x i1> addrspace(1)* %in
+;   %ext = sext <32 x i1> %load to <32 x i32>
+;   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i32:
+; XSI: s_endpgm
+; define void @zextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <64 x i1> addrspace(1)* %in
+;   %ext = zext <64 x i1> %load to <64 x i32>
+;   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i32:
+; XSI: s_endpgm
+; define void @sextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <64 x i1> addrspace(1)* %in
+;   %ext = sext <64 x i1> %load to <64 x i32>
+;   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}zextload_global_i1_to_i64:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
+; SI: buffer_store_dwordx2
+define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %a = load i1 addrspace(1)* %in
+  %ext = zext i1 %a to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i1_to_i64:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
+; SI: buffer_store_dwordx2
+define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %a = load i1 addrspace(1)* %in
+  %ext = sext i1 %a to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i64:
+; SI: s_endpgm
+define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i1> addrspace(1)* %in
+  %ext = zext <1 x i1> %load to <1 x i64>
+  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i64:
+; SI: s_endpgm
+define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i1> addrspace(1)* %in
+  %ext = sext <1 x i1> %load to <1 x i64>
+  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i64:
+; SI: s_endpgm
+define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i1> addrspace(1)* %in
+  %ext = zext <2 x i1> %load to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i64:
+; SI: s_endpgm
+define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i1> addrspace(1)* %in
+  %ext = sext <2 x i1> %load to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i64:
+; SI: s_endpgm
+define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i1> addrspace(1)* %in
+  %ext = zext <4 x i1> %load to <4 x i64>
+  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i64:
+; SI: s_endpgm
+define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i1> addrspace(1)* %in
+  %ext = sext <4 x i1> %load to <4 x i64>
+  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i64:
+; SI: s_endpgm
+define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i1> addrspace(1)* %in
+  %ext = zext <8 x i1> %load to <8 x i64>
+  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i64:
+; SI: s_endpgm
+define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i1> addrspace(1)* %in
+  %ext = sext <8 x i1> %load to <8 x i64>
+  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i64:
+; SI: s_endpgm
+define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i1> addrspace(1)* %in
+  %ext = zext <16 x i1> %load to <16 x i64>
+  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i64:
+; SI: s_endpgm
+define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i1> addrspace(1)* %in
+  %ext = sext <16 x i1> %load to <16 x i64>
+  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  ret void
+}
+
+; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i64:
+; XSI: s_endpgm
+; define void @zextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <32 x i1> addrspace(1)* %in
+;   %ext = zext <32 x i1> %load to <32 x i64>
+;   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i64:
+; XSI: s_endpgm
+; define void @sextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <32 x i1> addrspace(1)* %in
+;   %ext = sext <32 x i1> %load to <32 x i64>
+;   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i64:
+; XSI: s_endpgm
+; define void @zextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <64 x i1> addrspace(1)* %in
+;   %ext = zext <64 x i1> %load to <64 x i64>
+;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i64:
+; XSI: s_endpgm
+; define void @sextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <64 x i1> addrspace(1)* %in
+;   %ext = sext <64 x i1> %load to <64 x i64>
+;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   ret void
+; }
diff --git a/test/CodeGen/R600/global-extload-i16.ll b/test/CodeGen/R600/global-extload-i16.ll
new file mode 100644
index 0000000..f3e3312
--- /dev/null
+++ b/test/CodeGen/R600/global-extload-i16.ll
@@ -0,0 +1,302 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
+
+; FUNC-LABEL: {{^}}zextload_global_i16_to_i32:
+; SI: buffer_load_ushort
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+  %a = load i16 addrspace(1)* %in
+  %ext = zext i16 %a to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i16_to_i32:
+; SI: buffer_load_sshort
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+  %a = load i16 addrspace(1)* %in
+  %ext = sext i16 %a to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32:
+; SI: buffer_load_ushort
+; SI: s_endpgm
+define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i16> addrspace(1)* %in
+  %ext = zext <1 x i16> %load to <1 x i32>
+  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32:
+; SI: buffer_load_sshort
+; SI: s_endpgm
+define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i16> addrspace(1)* %in
+  %ext = sext <1 x i16> %load to <1 x i32>
+  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32:
+; SI: s_endpgm
+define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i16> addrspace(1)* %in
+  %ext = zext <2 x i16> %load to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32:
+; SI: s_endpgm
+define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i16> addrspace(1)* %in
+  %ext = sext <2 x i16> %load to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32:
+; SI: s_endpgm
+define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i16> addrspace(1)* %in
+  %ext = zext <4 x i16> %load to <4 x i32>
+  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32:
+; SI: s_endpgm
+define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i16> addrspace(1)* %in
+  %ext = sext <4 x i16> %load to <4 x i32>
+  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32:
+; SI: s_endpgm
+define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i16> addrspace(1)* %in
+  %ext = zext <8 x i16> %load to <8 x i32>
+  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32:
+; SI: s_endpgm
+define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i16> addrspace(1)* %in
+  %ext = sext <8 x i16> %load to <8 x i32>
+  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32:
+; SI: s_endpgm
+define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i16> addrspace(1)* %in
+  %ext = zext <16 x i16> %load to <16 x i32>
+  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32:
+; SI: s_endpgm
+define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i16> addrspace(1)* %in
+  %ext = sext <16 x i16> %load to <16 x i32>
+  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32:
+; SI: s_endpgm
+define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <32 x i16> addrspace(1)* %in
+  %ext = zext <32 x i16> %load to <32 x i32>
+  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32:
+; SI: s_endpgm
+define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <32 x i16> addrspace(1)* %in
+  %ext = sext <32 x i16> %load to <32 x i32>
+  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32:
+; SI: s_endpgm
+define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <64 x i16> addrspace(1)* %in
+  %ext = zext <64 x i16> %load to <64 x i32>
+  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32:
+; SI: s_endpgm
+define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <64 x i16> addrspace(1)* %in
+  %ext = sext <64 x i16> %load to <64 x i32>
+  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
+; SI: buffer_load_ushort v[[LO:[0-9]+]],
+; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+  %a = load i16 addrspace(1)* %in
+  %ext = zext i16 %a to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i16_to_i64:
+; SI: buffer_load_sshort [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
+define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+  %a = load i16 addrspace(1)* %in
+  %ext = sext i16 %a to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64:
+; SI: s_endpgm
+define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i16> addrspace(1)* %in
+  %ext = zext <1 x i16> %load to <1 x i64>
+  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64:
+; SI: s_endpgm
+define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i16> addrspace(1)* %in
+  %ext = sext <1 x i16> %load to <1 x i64>
+  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64:
+; SI: s_endpgm
+define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i16> addrspace(1)* %in
+  %ext = zext <2 x i16> %load to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64:
+; SI: s_endpgm
+define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i16> addrspace(1)* %in
+  %ext = sext <2 x i16> %load to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64:
+; SI: s_endpgm
+define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i16> addrspace(1)* %in
+  %ext = zext <4 x i16> %load to <4 x i64>
+  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64:
+; SI: s_endpgm
+define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i16> addrspace(1)* %in
+  %ext = sext <4 x i16> %load to <4 x i64>
+  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64:
+; SI: s_endpgm
+define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i16> addrspace(1)* %in
+  %ext = zext <8 x i16> %load to <8 x i64>
+  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64:
+; SI: s_endpgm
+define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i16> addrspace(1)* %in
+  %ext = sext <8 x i16> %load to <8 x i64>
+  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64:
+; SI: s_endpgm
+define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i16> addrspace(1)* %in
+  %ext = zext <16 x i16> %load to <16 x i64>
+  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64:
+; SI: s_endpgm
+define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i16> addrspace(1)* %in
+  %ext = sext <16 x i16> %load to <16 x i64>
+  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64:
+; SI: s_endpgm
+define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <32 x i16> addrspace(1)* %in
+  %ext = zext <32 x i16> %load to <32 x i64>
+  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64:
+; SI: s_endpgm
+define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <32 x i16> addrspace(1)* %in
+  %ext = sext <32 x i16> %load to <32 x i64>
+  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64:
+; SI: s_endpgm
+define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <64 x i16> addrspace(1)* %in
+  %ext = zext <64 x i16> %load to <64 x i64>
+  store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64:
+; SI: s_endpgm
+define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+  %load = load <64 x i16> addrspace(1)* %in
+  %ext = sext <64 x i16> %load to <64 x i64>
+  store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/global-extload-i32.ll b/test/CodeGen/R600/global-extload-i32.ll
new file mode 100644
index 0000000..b3d5438
--- /dev/null
+++ b/test/CodeGen/R600/global-extload-i32.ll
@@ -0,0 +1,457 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
+; SI: buffer_load_dword v[[LO:[0-9]+]],
+; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %a = load i32 addrspace(1)* %in
+  %ext = zext i32 %a to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i32_to_i64:
+; SI: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
+define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %a = load i32 addrspace(1)* %in
+  %ext = sext i32 %a to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i32_to_v1i64:
+; SI: buffer_load_dword
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i32> addrspace(1)* %in
+  %ext = zext <1 x i32> %load to <1 x i64>
+  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i32_to_v1i64:
+; SI: buffer_load_dword
+; SI: v_ashrrev_i32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i32> addrspace(1)* %in
+  %ext = sext <1 x i32> %load to <1 x i64>
+  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64:
+; SI: buffer_load_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i32> addrspace(1)* %in
+  %ext = zext <2 x i32> %load to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i32_to_v2i64:
+; SI: buffer_load_dwordx2
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI: s_endpgm
+define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i32> addrspace(1)* %in
+  %ext = sext <2 x i32> %load to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64:
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i32> addrspace(1)* %in
+  %ext = zext <4 x i32> %load to <4 x i64>
+  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i32_to_v4i64:
+; SI: buffer_load_dwordx4
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI: s_endpgm
+define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i32> addrspace(1)* %in
+  %ext = sext <4 x i32> %load to <4 x i64>
+  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI: s_endpgm
+define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i32> addrspace(1)* %in
+  %ext = zext <8 x i32> %load to <8 x i64>
+  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI: s_endpgm
+define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i32> addrspace(1)* %in
+  %ext = sext <8 x i32> %load to <8 x i64>
+  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI: s_endpgm
+define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i32> addrspace(1)* %in
+  %ext = sext <16 x i32> %load to <16 x i64>
+  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+
+; SI: s_endpgm
+define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i32> addrspace(1)* %in
+  %ext = zext <16 x i32> %load to <16 x i64>
+  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI: s_endpgm
+define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <32 x i32> addrspace(1)* %in
+  %ext = sext <32 x i32> %load to <32 x i64>
+  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI: s_endpgm
+define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
+  %load = load <32 x i32> addrspace(1)* %in
+  %ext = zext <32 x i32> %load to <32 x i64>
+  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/global-extload-i8.ll b/test/CodeGen/R600/global-extload-i8.ll
new file mode 100644
index 0000000..4c37f3f
--- /dev/null
+++ b/test/CodeGen/R600/global-extload-i8.ll
@@ -0,0 +1,299 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}zextload_global_i8_to_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+  %a = load i8 addrspace(1)* %in
+  %ext = zext i8 %a to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i8_to_i32:
+; SI: buffer_load_sbyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+  %a = load i8 addrspace(1)* %in
+  %ext = sext i8 %a to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i32:
+; SI: s_endpgm
+define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i8> addrspace(1)* %in
+  %ext = zext <1 x i8> %load to <1 x i32>
+  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i32:
+; SI: s_endpgm
+define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i8> addrspace(1)* %in
+  %ext = sext <1 x i8> %load to <1 x i32>
+  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i32:
+; SI: s_endpgm
+define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i8> addrspace(1)* %in
+  %ext = zext <2 x i8> %load to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i32:
+; SI: s_endpgm
+define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i8> addrspace(1)* %in
+  %ext = sext <2 x i8> %load to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i32:
+; SI: s_endpgm
+define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i8> addrspace(1)* %in
+  %ext = zext <4 x i8> %load to <4 x i32>
+  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i32:
+; SI: s_endpgm
+define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i8> addrspace(1)* %in
+  %ext = sext <4 x i8> %load to <4 x i32>
+  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i32:
+; SI: s_endpgm
+define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i8> addrspace(1)* %in
+  %ext = zext <8 x i8> %load to <8 x i32>
+  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i32:
+; SI: s_endpgm
+define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i8> addrspace(1)* %in
+  %ext = sext <8 x i8> %load to <8 x i32>
+  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i32:
+; SI: s_endpgm
+define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i8> addrspace(1)* %in
+  %ext = zext <16 x i8> %load to <16 x i32>
+  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i32:
+; SI: s_endpgm
+define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i8> addrspace(1)* %in
+  %ext = sext <16 x i8> %load to <16 x i32>
+  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i32:
+; XSI: s_endpgm
+; define void @zextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <32 x i8> addrspace(1)* %in
+;   %ext = zext <32 x i8> %load to <32 x i32>
+;   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i32:
+; XSI: s_endpgm
+; define void @sextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <32 x i8> addrspace(1)* %in
+;   %ext = sext <32 x i8> %load to <32 x i32>
+;   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i32:
+; XSI: s_endpgm
+; define void @zextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <64 x i8> addrspace(1)* %in
+;   %ext = zext <64 x i8> %load to <64 x i32>
+;   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i32:
+; XSI: s_endpgm
+; define void @sextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <64 x i8> addrspace(1)* %in
+;   %ext = sext <64 x i8> %load to <64 x i32>
+;   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
+; SI: buffer_load_ubyte v[[LO:[0-9]+]],
+; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+  %a = load i8 addrspace(1)* %in
+  %ext = zext i8 %a to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i8_to_i64:
+; SI: buffer_load_sbyte [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
+define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+  %a = load i8 addrspace(1)* %in
+  %ext = sext i8 %a to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i64:
+; SI: s_endpgm
+define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i8> addrspace(1)* %in
+  %ext = zext <1 x i8> %load to <1 x i64>
+  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i64:
+; SI: s_endpgm
+define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <1 x i8> addrspace(1)* %in
+  %ext = sext <1 x i8> %load to <1 x i64>
+  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i64:
+; SI: s_endpgm
+define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i8> addrspace(1)* %in
+  %ext = zext <2 x i8> %load to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i64:
+; SI: s_endpgm
+define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <2 x i8> addrspace(1)* %in
+  %ext = sext <2 x i8> %load to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i64:
+; SI: s_endpgm
+define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i8> addrspace(1)* %in
+  %ext = zext <4 x i8> %load to <4 x i64>
+  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i64:
+; SI: s_endpgm
+define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <4 x i8> addrspace(1)* %in
+  %ext = sext <4 x i8> %load to <4 x i64>
+  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i64:
+; SI: s_endpgm
+define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i8> addrspace(1)* %in
+  %ext = zext <8 x i8> %load to <8 x i64>
+  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i64:
+; SI: s_endpgm
+define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <8 x i8> addrspace(1)* %in
+  %ext = sext <8 x i8> %load to <8 x i64>
+  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i64:
+; SI: s_endpgm
+define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i8> addrspace(1)* %in
+  %ext = zext <16 x i8> %load to <16 x i64>
+  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i64:
+; SI: s_endpgm
+define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i8> addrspace(1)* %in
+  %ext = sext <16 x i8> %load to <16 x i64>
+  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  ret void
+}
+
+; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i64:
+; XSI: s_endpgm
+; define void @zextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <32 x i8> addrspace(1)* %in
+;   %ext = zext <32 x i8> %load to <32 x i64>
+;   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i64:
+; XSI: s_endpgm
+; define void @sextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <32 x i8> addrspace(1)* %in
+;   %ext = sext <32 x i8> %load to <32 x i64>
+;   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i64:
+; XSI: s_endpgm
+; define void @zextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <64 x i8> addrspace(1)* %in
+;   %ext = zext <64 x i8> %load to <64 x i64>
+;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i64:
+; XSI: s_endpgm
+; define void @sextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
+;   %load = load <64 x i8> addrspace(1)* %in
+;   %ext = sext <64 x i8> %load to <64 x i64>
+;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   ret void
+; }
diff --git a/test/CodeGen/R600/global-zero-initializer.ll b/test/CodeGen/R600/global-zero-initializer.ll
index b69b061..6909c58 100644
--- a/test/CodeGen/R600/global-zero-initializer.ll
+++ b/test/CodeGen/R600/global-zero-initializer.ll
@@ -1,4 +1,5 @@
-; RUN: not llc -march=r600 -mcpu=SI < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
 
 ; CHECK: error: unsupported initializer for address space in load_init_global_global
 
diff --git a/test/CodeGen/R600/global_atomics.ll b/test/CodeGen/R600/global_atomics.ll
index 533a964..5a07a02 100644
--- a/test/CodeGen/R600/global_atomics.ll
+++ b/test/CodeGen/R600/global_atomics.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_offset:
-; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32 addrspace(1)* %out, i32 4
@@ -10,7 +10,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset:
-; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
@@ -21,7 +21,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
-; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
@@ -31,7 +31,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
-; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
@@ -81,7 +81,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_offset:
-; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32 addrspace(1)* %out, i32 4
@@ -90,7 +90,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset:
-; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
@@ -101,7 +101,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
-; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
@@ -111,7 +111,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
-; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
@@ -161,7 +161,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
-; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32 addrspace(1)* %out, i32 4
@@ -170,7 +170,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset:
-; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
@@ -181,7 +181,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
-; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
@@ -191,7 +191,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
-; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
@@ -241,7 +241,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_offset:
-; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32 addrspace(1)* %out, i32 4
@@ -250,7 +250,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset:
-; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
@@ -261,7 +261,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
-; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
@@ -271,7 +271,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
-; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
@@ -321,7 +321,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
-; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32 addrspace(1)* %out, i32 4
@@ -330,7 +330,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset:
-; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
@@ -341,7 +341,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
-; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
@@ -351,7 +351,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
-; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
@@ -401,7 +401,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_offset:
-; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32 addrspace(1)* %out, i32 4
@@ -410,7 +410,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset:
-; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
@@ -421,7 +421,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
-; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
@@ -431,7 +431,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
-; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
@@ -481,7 +481,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
-; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32 addrspace(1)* %out, i32 4
@@ -490,7 +490,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset:
-; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
@@ -501,7 +501,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
-; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
@@ -511,7 +511,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
-; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
@@ -561,7 +561,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_offset:
-; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32 addrspace(1)* %out, i32 4
@@ -570,7 +570,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset:
-; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
@@ -581,7 +581,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
-; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
@@ -591,7 +591,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
-; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
@@ -641,7 +641,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
-; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32 addrspace(1)* %out, i32 4
@@ -650,7 +650,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset:
-; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
@@ -661,7 +661,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
-; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
@@ -671,7 +671,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
-; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
@@ -721,7 +721,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
-; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32 addrspace(1)* %out, i32 4
@@ -730,7 +730,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset:
-; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
@@ -741,7 +741,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
-; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
@@ -751,7 +751,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
-; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
diff --git a/test/CodeGen/R600/gv-const-addrspace-fail.ll b/test/CodeGen/R600/gv-const-addrspace-fail.ll
index 905948f..af0df41 100644
--- a/test/CodeGen/R600/gv-const-addrspace-fail.ll
+++ b/test/CodeGen/R600/gv-const-addrspace-fail.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll
index 6aa20b8..45af71d 100644
--- a/test/CodeGen/R600/gv-const-addrspace.ll
+++ b/test/CodeGen/R600/gv-const-addrspace.ll
@@ -1,5 +1,6 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 
 @b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
@@ -9,6 +10,7 @@
 ; FUNC-LABEL: {{^}}float:
 ; FIXME: We should be using s_load_dword here.
 ; SI: buffer_load_dword
+; VI: s_load_dword
 
 ; EG-DAG: MOV {{\** *}}T2.X
 ; EG-DAG: MOV {{\** *}}T3.X
@@ -31,6 +33,7 @@ entry:
 
 ; FIXME: We should be using s_load_dword here.
 ; SI: buffer_load_dword
+; VI: s_load_dword
 
 ; EG-DAG: MOV {{\** *}}T2.X
 ; EG-DAG: MOV {{\** *}}T3.X
@@ -53,7 +56,7 @@ entry:
 @struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
 
 ; FUNC-LABEL: {{^}}struct_foo_gv_load:
-; SI: s_load_dword
+; GCN: s_load_dword
 
 define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
@@ -70,6 +73,7 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
 ; FUNC-LABEL: {{^}}array_v1_gv_load:
 ; FIXME: We should be using s_load_dword here.
 ; SI: buffer_load_dword
+; VI: s_load_dword
 define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
   %load = load <1 x i32> addrspace(2)* %gep, align 4
diff --git a/test/CodeGen/R600/half.ll b/test/CodeGen/R600/half.ll
index 6ad9b2f..35a41c5 100644
--- a/test/CodeGen/R600/half.ll
+++ b/test/CodeGen/R600/half.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s
 
 define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) {
 ; CHECK-LABEL: {{^}}test_load_store:
diff --git a/test/CodeGen/R600/hsa.ll b/test/CodeGen/R600/hsa.ll
new file mode 100644
index 0000000..ff75b90
--- /dev/null
+++ b/test/CodeGen/R600/hsa.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
+
+; HSA: {{^}}simple:
+; HSA: .section        .hsa.version
+; HSA-NEXT: .ascii  "HSA Code Unit:0.0:AMD:0.1:GFX8.1:0"
+; Make sure we are setting the ATC bit:
+; HSA: s_mov_b32 s[[HI:[0-9]]], 0x100f000
+; HSA: buffer_store_dword v{{[0-9]+}}, s[0:[[HI]]], 0
+
+define void @simple(i32 addrspace(1)* %out) {
+entry:
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/i1-copy-implicit-def.ll b/test/CodeGen/R600/i1-copy-implicit-def.ll
index 7c5bc04..b11a211 100644
--- a/test/CodeGen/R600/i1-copy-implicit-def.ll
+++ b/test/CodeGen/R600/i1-copy-implicit-def.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SILowerI1Copies was not handling IMPLICIT_DEF
 ; SI-LABEL: {{^}}br_implicit_def:
diff --git a/test/CodeGen/R600/i1-copy-phi.ll b/test/CodeGen/R600/i1-copy-phi.ll
index bfa8672..430466e 100644
--- a/test/CodeGen/R600/i1-copy-phi.ll
+++ b/test/CodeGen/R600/i1-copy-phi.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}br_i1_phi:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
diff --git a/test/CodeGen/R600/icmp64.ll b/test/CodeGen/R600/icmp64.ll
index 870bf7f..0eaa33e 100644
--- a/test/CodeGen/R600/icmp64.ll
+++ b/test/CodeGen/R600/icmp64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}test_i64_eq:
 ; SI: v_cmp_eq_i64
diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll
index 1fcaf29..9b95fd6 100644
--- a/test/CodeGen/R600/imm.ll
+++ b/test/CodeGen/R600/imm.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CHECK %s
 
 ; Use a 64-bit value with lo bits that can be represented as an inline constant
 ; CHECK-LABEL: {{^}}i64_imm_inline_lo:
@@ -22,73 +23,100 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32
-; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64:
+; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
+; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
+  store i64 -9223372036854775808, i64 addrspace(1) *%out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
 ; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
+  store i32 -2147483648, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
   store float 0.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32
+; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; CHECK: buffer_store_dword [[REG]]
+define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
+  store float -0.0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
   store float 0.5, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32
+; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
   store float -0.5, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
   store float 1.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
   store float -1.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
   store float 2.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
   store float -2.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
   store float 4.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
   store float -4.0, float addrspace(1)* %out
   ret void
@@ -96,106 +124,106 @@ define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
 
 ; CHECK-LABEL: {{^}}store_literal_imm_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_literal_imm_f32(float addrspace(1)* %out) {
   store float 4096.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
-; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32
+; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0.5
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32
+; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -0.5
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 1.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -1.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 2.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -2.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 4.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -4.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: @commute_add_inline_imm_0.5_f32
+; CHECK-LABEL: {{^}}commute_add_inline_imm_0.5_f32:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %x = load float addrspace(1)* %in
   %y = fadd float %x, 0.5
@@ -203,13 +231,387 @@ define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addr
   ret void
 }
 
-; CHECK-LABEL: @commute_add_literal_f32
+; CHECK-LABEL: {{^}}commute_add_literal_f32:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]]
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %x = load float addrspace(1)* %in
   %y = fadd float %x, 1024.0
   store float %y, float addrspace(1)* %out
   ret void
 }
+
+; CHECK-LABEL: {{^}}add_inline_imm_1_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0x36a0000000000000
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_2_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0x36b0000000000000
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_16_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 16, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0x36e0000000000000
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0xffffffffe0000000
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0xffffffffc0000000
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -16, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0xfffffffe00000000
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_63_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 63, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0x36ff800000000000
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_64_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 64, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0x3700000000000000
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 0.0
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 0.5
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, -0.5
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 1.0
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, -1.0
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 2.0
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, -2.0
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 4.0
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, -4.0
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}add_inline_imm_1_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 0x0000000000000001
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_2_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 0x0000000000000002
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_16_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 0x0000000000000010
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 0xffffffffffffffff
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 0xfffffffffffffffe
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 0xfffffffffffffff0
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_63_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 0x000000000000003F
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_64_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
+  %y = fadd double %x, 0x0000000000000040
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64:
+; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0
+; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
+  store double 0.0, double addrspace(1)* %out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64:
+; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
+; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
+  store double -0.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.5_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
+  store double 0.5, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
+  store double -0.5, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_1.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
+  store double 1.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
+  store double -1.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_2.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
+  store double 2.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
+  store double -2.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_4.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
+  store double 4.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
+  store double -4.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_literal_imm_f64:
+; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000
+; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_literal_imm_f64(double addrspace(1)* %out) {
+  store double 4096.0, double addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/indirect-addressing-si.ll b/test/CodeGen/R600/indirect-addressing-si.ll
index 0ba1614..9cd2d84 100644
--- a/test/CodeGen/R600/indirect-addressing-si.ll
+++ b/test/CodeGen/R600/indirect-addressing-si.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; Tests for indirect addressing on SI, which is implemented using dynamic
 ; indexing of vectors.
diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll
index e0a6ce1..cb06d60 100644
--- a/test/CodeGen/R600/indirect-private-64.ll
+++ b/test/CodeGen/R600/indirect-private-64.ll
@@ -1,5 +1,7 @@
-; RUN: llc -march=r600 -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
-; RUN: llc -march=r600 -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
 
 
 declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
diff --git a/test/CodeGen/R600/infinite-loop.ll b/test/CodeGen/R600/infinite-loop.ll
index 48edab0..7233aa5 100644
--- a/test/CodeGen/R600/infinite-loop.ll
+++ b/test/CodeGen/R600/infinite-loop.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}infinite_loop:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
diff --git a/test/CodeGen/R600/inline-asm.ll b/test/CodeGen/R600/inline-asm.ll
new file mode 100644
index 0000000..efc2292
--- /dev/null
+++ b/test/CodeGen/R600/inline-asm.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; CHECK: {{^}}inline_asm:
+; CHECK: s_endpgm
+; CHECK: s_endpgm
+define void @inline_asm(i32 addrspace(1)* %out) {
+entry:
+  store i32 5, i32 addrspace(1)* %out
+  call void asm sideeffect "s_endpgm", ""()
+  ret void
+}
diff --git a/test/CodeGen/R600/inline-calls.ll b/test/CodeGen/R600/inline-calls.ll
index 3bceeca..33a4c83 100644
--- a/test/CodeGen/R600/inline-calls.ll
+++ b/test/CodeGen/R600/inline-calls.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck  %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck  %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck  %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-NOT: {{^}}func:
diff --git a/test/CodeGen/R600/input-mods.ll b/test/CodeGen/R600/input-mods.ll
index e3e9499..1c4d285 100644
--- a/test/CodeGen/R600/input-mods.ll
+++ b/test/CodeGen/R600/input-mods.ll
@@ -1,13 +1,13 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
-;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM
 
-;EG-CHECK-LABEL: {{^}}test:
-;EG-CHECK: EXP_IEEE *
-;CM-CHECK-LABEL: {{^}}test:
-;CM-CHECK: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X|
-;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X|
-;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X|
-;CM-CHECK: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X|
+;EG-LABEL: {{^}}test:
+;EG: EXP_IEEE *
+;CM-LABEL: {{^}}test:
+;CM: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X|
+;CM: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X|
+;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X|
+;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X|
 
 define void @test(<4 x float> inreg %reg0) #0 {
    %r0 = extractelement <4 x float> %reg0, i32 0
diff --git a/test/CodeGen/R600/insert_subreg.ll b/test/CodeGen/R600/insert_subreg.ll
index e311e19..4a5e886 100644
--- a/test/CodeGen/R600/insert_subreg.ll
+++ b/test/CodeGen/R600/insert_subreg.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s
 
 ; Test that INSERT_SUBREG instructions don't have non-register operands after
 ; instruction selection.
diff --git a/test/CodeGen/R600/insert_vector_elt.ll b/test/CodeGen/R600/insert_vector_elt.ll
index 857c414..64afddc 100644
--- a/test/CodeGen/R600/insert_vector_elt.ll
+++ b/test/CodeGen/R600/insert_vector_elt.ll
@@ -1,4 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
 
 ; FIXME: Broken on evergreen
 ; FIXME: For some reason the 8 and 16 vectors are being stored as
diff --git a/test/CodeGen/R600/kernel-args.ll b/test/CodeGen/R600/kernel-args.ll
index 9a7da90..5db45ce 100644
--- a/test/CodeGen/R600/kernel-args.ll
+++ b/test/CodeGen/R600/kernel-args.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 
-; EG-CHECK-LABEL: {{^}}i8_arg:
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: {{^}}i8_arg:
-; SI-CHECK: buffer_load_ubyte
+; FUNC-LABEL: {{^}}i8_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; GCN: buffer_load_ubyte
 
 define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
 entry:
@@ -14,10 +14,10 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}i8_zext_arg:
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: {{^}}i8_zext_arg:
-; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; FUNC-LABEL: {{^}}i8_zext_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
 define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
 entry:
@@ -26,10 +26,10 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}i8_sext_arg:
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: {{^}}i8_sext_arg:
-; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; FUNC-LABEL: {{^}}i8_sext_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
 define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
 entry:
@@ -38,10 +38,9 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}i16_arg:
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: {{^}}i16_arg:
-; SI-CHECK: buffer_load_ushort
+; FUNC-LABEL: {{^}}i16_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; GCN: buffer_load_ushort
 
 define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
 entry:
@@ -50,10 +49,10 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}i16_zext_arg:
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: {{^}}i16_zext_arg:
-; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; FUNC-LABEL: {{^}}i16_zext_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
 define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
 entry:
@@ -62,10 +61,10 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}i16_sext_arg:
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: {{^}}i16_sext_arg:
-; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; FUNC-LABEL: {{^}}i16_sext_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
 define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
 entry:
@@ -74,380 +73,369 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}i32_arg:
-; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: {{^}}i32_arg:
-; s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; FUNC-LABEL: {{^}}i32_arg:
+; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
 entry:
   store i32 %in, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}f32_arg:
-; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: {{^}}f32_arg:
-; s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; FUNC-LABEL: {{^}}f32_arg:
+; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
 entry:
   store float %in, float addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v2i8_arg:
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: {{^}}v2i8_arg:
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
+; FUNC-LABEL: {{^}}v2i8_arg:
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
 define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
 entry:
   store <2 x i8> %in, <2 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v2i16_arg:
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: {{^}}v2i16_arg:
-; SI-CHECK-DAG: buffer_load_ushort
-; SI-CHECK-DAG: buffer_load_ushort
+; FUNC-LABEL: {{^}}v2i16_arg:
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; GCN-DAG: buffer_load_ushort
+; GCN-DAG: buffer_load_ushort
 define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v2i32_arg:
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
-; SI-CHECK-LABEL: {{^}}v2i32_arg:
-; SI-CHECK: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; FUNC-LABEL: {{^}}v2i32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
+; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
 define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v2f32_arg:
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
-; SI-CHECK-LABEL: {{^}}v2f32_arg:
-; SI-CHECK: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; FUNC-LABEL: {{^}}v2f32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
+; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
 define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
 entry:
   store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v3i8_arg:
+; FUNC-LABEL: {{^}}v3i8_arg:
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
-; SI-CHECK-LABEL: {{^}}v3i8_arg:
 define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
 entry:
   store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v3i16_arg:
+; FUNC-LABEL: {{^}}v3i16_arg:
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
-; SI-CHECK-LABEL: {{^}}v3i16_arg:
 define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
 entry:
   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
   ret void
 }
-; EG-CHECK-LABEL: {{^}}v3i32_arg:
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
-; SI-CHECK-LABEL: {{^}}v3i32_arg:
-; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; FUNC-LABEL: {{^}}v3i32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
 define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
 entry:
   store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v3f32_arg:
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
-; SI-CHECK-LABEL: {{^}}v3f32_arg:
-; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; FUNC-LABEL: {{^}}v3f32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
 define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
 entry:
   store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v4i8_arg:
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: {{^}}v4i8_arg:
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
+; FUNC-LABEL: {{^}}v4i8_arg:
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
 define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v4i16_arg:
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: {{^}}v4i16_arg:
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
+; FUNC-LABEL: {{^}}v4i16_arg:
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
 define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v4i32_arg:
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
-; SI-CHECK-LABEL: {{^}}v4i32_arg:
-; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; FUNC-LABEL: {{^}}v4i32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
+; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
 define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v4f32_arg:
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
-; SI-CHECK-LABEL: {{^}}v4f32_arg:
-; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; FUNC-LABEL: {{^}}v4f32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
+; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
 define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
 entry:
   store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v8i8_arg:
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: {{^}}v8i8_arg:
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
+; FUNC-LABEL: {{^}}v8i8_arg:
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
 define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 entry:
   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v8i16_arg:
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: {{^}}v8i16_arg:
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
+; FUNC-LABEL: {{^}}v8i16_arg:
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
 define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
 entry:
   store <8 x i16> %in, <8 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v8i32_arg:
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI-CHECK-LABEL: {{^}}v8i32_arg:
-; SI-CHECK: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; FUNC-LABEL: {{^}}v8i32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
+; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
 define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
 entry:
   store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v8f32_arg:
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI-CHECK-LABEL: {{^}}v8f32_arg:
-; SI-CHECK: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; FUNC-LABEL: {{^}}v8f32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
+; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
 define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
 entry:
   store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v16i8_arg:
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: {{^}}v16i8_arg:
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
+; FUNC-LABEL: {{^}}v16i8_arg:
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
 define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
 entry:
   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v16i16_arg:
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: {{^}}v16i16_arg:
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
+; FUNC-LABEL: {{^}}v16i16_arg:
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
 define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
 entry:
   store <16 x i16> %in, <16 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v16i32_arg:
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI-CHECK-LABEL: {{^}}v16i32_arg:
-; SI-CHECK: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; FUNC-LABEL: {{^}}v16i32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
+; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
 define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
 entry:
   store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}v16f32_arg:
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
-; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI-CHECK-LABEL: {{^}}v16f32_arg:
-; SI-CHECK: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; FUNC-LABEL: {{^}}v16f32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
+; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
 define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
 entry:
   store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
@@ -455,18 +443,18 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}kernel_arg_i64:
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
-; SI: buffer_store_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: buffer_store_dwordx2
 define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
   store i64 %a, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
-; XSI: s_load_dwordx2
-; XSI: s_load_dwordx2
-; XSI: buffer_store_dwordx2
+; XGCN: s_load_dwordx2
+; XGCN: s_load_dwordx2
+; XGCN: buffer_store_dwordx2
 ; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
 ;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
 ;   ret void
diff --git a/test/CodeGen/R600/large-alloca.ll b/test/CodeGen/R600/large-alloca.ll
index d8be6d4..788816c 100644
--- a/test/CodeGen/R600/large-alloca.ll
+++ b/test/CodeGen/R600/large-alloca.ll
@@ -1,6 +1,7 @@
 ; XFAIL: *
 ; REQUIRES: asserts
-; RUN: llc -march=r600 -mcpu=SI < %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s
 
 define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind {
   %large = alloca [8192 x i32], align 4
diff --git a/test/CodeGen/R600/large-constant-initializer.ll b/test/CodeGen/R600/large-constant-initializer.ll
index 5612dd3..c8671ef 100644
--- a/test/CodeGen/R600/large-constant-initializer.ll
+++ b/test/CodeGen/R600/large-constant-initializer.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s
 ; CHECK: s_endpgm
 
 @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4
diff --git a/test/CodeGen/R600/lds-initializer.ll b/test/CodeGen/R600/lds-initializer.ll
index 91d5d12..7344eff 100644
--- a/test/CodeGen/R600/lds-initializer.ll
+++ b/test/CodeGen/R600/lds-initializer.ll
@@ -1,4 +1,5 @@
-; RUN: not llc -march=r600 -mcpu=SI < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
 
 ; CHECK: error: unsupported initializer for address space in load_init_lds_global
 
diff --git a/test/CodeGen/R600/lds-zero-initializer.ll b/test/CodeGen/R600/lds-zero-initializer.ll
index 23912a9..1fb6f52 100644
--- a/test/CodeGen/R600/lds-zero-initializer.ll
+++ b/test/CodeGen/R600/lds-zero-initializer.ll
@@ -1,4 +1,5 @@
-; RUN: not llc -march=r600 -mcpu=SI < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
 
 ; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global
 
diff --git a/test/CodeGen/R600/llvm.AMDGPU.abs.ll b/test/CodeGen/R600/llvm.AMDGPU.abs.ll
index b4aede8..8bc2583 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.abs.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.abs.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll
index 98f6695..a11d9ae 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}test_barrier_global:
 ; EG: GROUP_BARRIER
+; SI: buffer_store_dword
+; SI: s_waitcnt
 ; SI: s_barrier
 
 define void @test_barrier_global(i32 addrspace(1)* %out) {
diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
index 92fe9f2..76c2453 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
@@ -1,8 +1,11 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}test_barrier_local:
 ; EG: GROUP_BARRIER
+
+; SI: buffer_store_dword
+; SI: s_waitcnt
 ; SI: s_barrier
 
 define void @test_barrier_local(i32 addrspace(1)* %out) {
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
index 0b60d0d..2ec2546 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
index 0794ac4..6cd0108 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfi.ll b/test/CodeGen/R600/llvm.AMDGPU.bfi.ll
index df61b0b..517a55a 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfi.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfi.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
index 0ba4af5..2346f40 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.brev.ll b/test/CodeGen/R600/llvm.AMDGPU.brev.ll
index 647df34..3973f53 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.brev.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.brev.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone
 
diff --git a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll
index c6efdb9..11ec963 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.fabs.f32(float) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.class.ll b/test/CodeGen/R600/llvm.AMDGPU.class.ll
new file mode 100644
index 0000000..f111eb9
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.class.ll
@@ -0,0 +1,497 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i1 @llvm.AMDGPU.class.f32(float, i32) #1
+declare i1 @llvm.AMDGPU.class.f64(double, i32) #1
+declare i32 @llvm.r600.read.tidig.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare double @llvm.fabs.f64(double) #1
+
+; SI-LABEL: {{^}}test_class_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_fabs_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+  %a.fabs = call float @llvm.fabs.f32(float %a) #1
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a.fabs, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_fneg_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+  %a.fneg = fsub float -0.0, %a
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_fneg_fabs_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+  %a.fabs = call float @llvm.fabs.f32(float %a) #1
+  %a.fneg.fabs = fsub float -0.0, %a.fabs
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg.fabs, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_1_f32:
+; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}}
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_64_f32:
+; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}}
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; Set all 10 bits of mask
+; SI-LABEL: {{^}}test_class_full_mask_f32:
+; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
+; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1023) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_9bit_mask_f32:
+; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
+; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}v_test_class_full_mask_f32:
+; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
+; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
+; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32:
+; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
+; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %b = load i32 addrspace(1)* %gep.in
+
+  %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+  ret void
+}
+
+; FIXME: Why isn't this using a literal constant operand?
+; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32:
+; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
+; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
+; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %b = load i32 addrspace(1)* %gep.in
+
+  %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_f64:
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_fabs_f64:
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+  %a.fabs = call double @llvm.fabs.f64(double %a) #1
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a.fabs, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_fneg_f64:
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+  %a.fneg = fsub double -0.0, %a
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_fneg_fabs_f64:
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+  %a.fabs = call double @llvm.fabs.f64(double %a) #1
+  %a.fneg.fabs = fsub double -0.0, %a.fabs
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg.fabs, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_1_f64:
+; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
+; SI: s_endpgm
+define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 1) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_64_f64:
+; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
+; SI: s_endpgm
+define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 64) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; Set all 9 bits of mask
+; SI-LABEL: {{^}}test_class_full_mask_f64:
+; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
+; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}v_test_class_full_mask_f64:
+; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
+; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load double addrspace(1)* %in
+
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64:
+; XSI: v_cmp_class_f64_e32 vcc, 1.0,
+; SI: v_cmp_class_f64_e32 vcc,
+; SI: s_endpgm
+define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %b = load i32 addrspace(1)* %gep.in
+
+  %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64:
+; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+; SI: s_endpgm
+define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %b = load i32 addrspace(1)* %gep.in
+
+  %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1
+  %or = or i1 %class0, %class1
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or3_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
+  %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %or.0 = or i1 %class0, %class1
+  %or.1 = or i1 %or.0, %class2
+
+  %sext = sext i1 %or.1 to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
+; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
+  %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+  %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1
+  %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1
+  %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
+  %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1
+  %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1
+  %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1
+  %or.0 = or i1 %class0, %class1
+  %or.1 = or i1 %or.0, %class2
+  %or.2 = or i1 %or.1, %class3
+  %or.3 = or i1 %or.2, %class4
+  %or.4 = or i1 %or.3, %class5
+  %or.5 = or i1 %or.4, %class6
+  %or.6 = or i1 %or.5, %class7
+  %or.7 = or i1 %or.6, %class8
+  %or.8 = or i1 %or.7, %class9
+  %sext = sext i1 %or.8 to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_class_f32_1:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+  %or = or i1 %class0, %class1
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_class_f32_2:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+  %or = or i1 %class0, %class1
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_no_fold_or_class_f32_0:
+; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}}
+; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
+; SI: s_or_b64
+; SI: s_endpgm
+define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1
+  %or = or i1 %class0, %class1
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_0_f32:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_0_f64:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
index 7aacbb9..799817e 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
 
 declare float @llvm.AMDGPU.cvt.f32.ubyte0(i32) nounwind readnone
 declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
index 009fd73..55ca9c7 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
@@ -1,25 +1,29 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone
 declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone
 
-; SI-LABEL: {{^}}test_div_fixup_f32:
+; GCN-LABEL: {{^}}test_div_fixup_f32:
 ; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; SI: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_div_fixup_f64:
-; SI: v_div_fixup_f64
+; GCN-LABEL: {{^}}test_div_fixup_f64:
+; GCN: v_div_fixup_f64
 define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
   %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
index dcca9e9..239fd53 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
@@ -1,27 +1,179 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
+; FIXME: Enable for VI.
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
 declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone
 declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone
 
-; SI-LABEL: {{^}}test_div_fmas_f32:
+; GCN-LABEL: {{^}}test_div_fmas_f32:
 ; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
+; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0:
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
 ; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
 ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
-  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
+define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_div_fmas_f64:
-; SI: v_div_fmas_f64
+; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f64:
+; GCN: v_div_fmas_f64
 define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
   %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
   ret void
 }
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
+; SI: v_cmp_eq_i32_e64 vcc, s{{[0-9]+}}, 0
+; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
+  %cmp = icmp eq i32 %i, 0
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
+; SI: s_mov_b64 vcc, 0
+; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
+; SI: s_mov_b64 vcc, -1
+; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-DAG: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
+; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0
+; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
+; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
+; SI: s_endpgm
+define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.a = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.b = getelementptr float addrspace(1)* %gep.a, i32 1
+  %gep.c = getelementptr float addrspace(1)* %gep.a, i32 2
+  %gep.out = getelementptr float addrspace(1)* %out, i32 2
+
+  %a = load float addrspace(1)* %gep.a
+  %b = load float addrspace(1)* %gep.b
+  %c = load float addrspace(1)* %gep.c
+
+  %cmp0 = icmp eq i32 %tid, 0
+  %cmp1 = icmp ne i32 %d, 0
+  %and = and i1 %cmp0, %cmp1
+
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone
+  store float %result, float addrspace(1)* %gep.out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
+; SI: v_cmp_eq_i32_e64 [[CMPTID:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
+; SI: s_and_saveexec_b64 [[CMPTID]], [[CMPTID]]
+; SI: s_xor_b64 [[CMPTID]], exec, [[CMPTID]]
+
+; SI: buffer_load_dword [[LOAD:v[0-9]+]]
+; SI: v_cmp_ne_i32_e64 [[CMPLOAD:s\[[0-9]+:[0-9]+\]]], [[LOAD]], 0
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, [[CMPLOAD]]
+
+
+; SI: BB9_2:
+; SI: s_or_b64 exec, exec, [[CMPTID]]
+; SI: v_cmp_ne_i32_e32 vcc, 0, v0
+; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
+entry:
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.out = getelementptr float addrspace(1)* %out, i32 2
+  %gep.a = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.b = getelementptr float addrspace(1)* %gep.a, i32 1
+  %gep.c = getelementptr float addrspace(1)* %gep.a, i32 2
+
+  %a = load float addrspace(1)* %gep.a
+  %b = load float addrspace(1)* %gep.b
+  %c = load float addrspace(1)* %gep.c
+
+  %cmp0 = icmp eq i32 %tid, 0
+  br i1 %cmp0, label %bb, label %exit
+
+bb:
+  %val = load i32 addrspace(1)* %dummy
+  %cmp1 = icmp ne i32 %val, 0
+  br label %exit
+
+exit:
+  %cond = phi i1 [false, %entry], [%cmp1, %bb]
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
+  store float %result, float addrspace(1)* %gep.out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
index 641c8ca..5773da0 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
@@ -1,12 +1,13 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
 declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
 
 ; SI-LABEL @test_div_scale_f32_1:
 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
-; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
@@ -26,7 +27,7 @@ define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)*
 
 ; SI-LABEL @test_div_scale_f32_2:
 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
-; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
@@ -46,7 +47,7 @@ define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)*
 
 ; SI-LABEL @test_div_scale_f64_1:
 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
@@ -66,7 +67,7 @@ define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)
 
 ; SI-LABEL @test_div_scale_f64_1:
 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
@@ -285,3 +286,79 @@ define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %
   store double %result0, double addrspace(1)* %out, align 8
   ret void
 }
+
+; SI-LABEL @test_div_scale_f32_inline_imm_num:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %a = load float addrspace(1)* %gep.0, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f32_inline_imm_den:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %a = load float addrspace(1)* %gep.0, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f32_fabs_num:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]|
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f32_fabs_den:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
index 235068c..7d15300 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.fract.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imad24.ll b/test/CodeGen/R600/llvm.AMDGPU.imad24.ll
index 8998840..42102e3 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imad24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imad24.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 ; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imax.ll b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
index dac21a4..ce7fca0 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}vector_imax:
 ; SI: v_max_i32_e32
@@ -29,4 +30,4 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 
-!0 = metadata !{metadata !"const", null, i32 1}
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imin.ll b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
index 462c497..15cd38b 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}vector_imin:
 ; SI: v_min_i32_e32
@@ -29,4 +30,4 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 
-!0 = metadata !{metadata !"const", null, i32 1}
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imul24.ll b/test/CodeGen/R600/llvm.AMDGPU.imul24.ll
index db563dd..fdc1172 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imul24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imul24.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
diff --git a/test/CodeGen/R600/llvm.AMDGPU.kill.ll b/test/CodeGen/R600/llvm.AMDGPU.kill.ll
index 988b43c..d1ff3b1 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.kill.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.kill.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}kill_gs_const:
 ; SI-NOT: v_cmpx_le_f32
@@ -19,4 +20,4 @@ declare void @llvm.AMDGPU.kill(float)
 
 attributes #0 = { "ShaderType"="2" }
 
-!0 = metadata !{metadata !"const", null, i32 1}
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll b/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll
index 72719fe..a59c0ce 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone
 declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
index 6e3fa25..4cafd56 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll
index c4b04c5..d2a655b 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
 declare double @llvm.sqrt.f64(double) nounwind readnone
@@ -22,6 +23,8 @@ define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
 ; FUNC-LABEL: {{^}}rsq_rcp_pat_f64:
 ; SI-UNSAFE: v_rsq_f64_e32
 ; SI-SAFE-NOT: v_rsq_f64_e32
+; SI-SAFE: v_sqrt_f64
+; SI-SAFE: v_rcp_f64
 define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
   %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone
   %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll
index 3ee3e6b..edd6e9a 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll
@@ -1,6 +1,9 @@
-; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s
 
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
index 18854be..67f1d22 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
@@ -1,9 +1,21 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
 
 declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
 
 ; FUNC-LABEL: {{^}}rsq_clamped_f64:
 ; SI: v_rsq_clamp_f64_e32
+
+; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3]
+; TODO: this constant should be folded:
+; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
+; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
+; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
+; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
+; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
+; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
+; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
+
 define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
   %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
   store double %rsq_clamped, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
index 6bf9f0c..eeff253 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
@@ -6,7 +7,15 @@ declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone
 
 ; FUNC-LABEL: {{^}}rsq_clamped_f32:
 ; SI: v_rsq_clamp_f32_e32
+
+; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}}
+; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
+; TODO: this constant should be folded:
+; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff
+; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]]
+
 ; EG: RECIPSQRT_CLAMPED
+
 define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
   %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone
   store float %rsq_clamped, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll
index d6299b8..36b72f1 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
index 2e6bd5c..5829f73 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
 
diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
index fdd531d..74792e5 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
@@ -1,10 +1,11 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
 
-; R600-CHECK: {{^}}amdgpu_trunc:
-; R600-CHECK: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: {{^}}amdgpu_trunc:
-; SI-CHECK: v_trunc_f32
+; R600: {{^}}amdgpu_trunc:
+; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: {{^}}amdgpu_trunc:
+; SI: v_trunc_f32
 
 define void @amdgpu_trunc(float addrspace(1)* %out, float %x) {
 entry:
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
index 59d6248..88613db 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
@@ -20,7 +20,7 @@ define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2
 
 ; FUNC-LABEL: {{^}}commute_umad24:
 ; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]]
 ; SI: buffer_store_dword [[RESULT]]
 define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umax.ll b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
index ee854ec..4320dfe 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}vector_umax:
 ; SI: v_max_u32_e32
@@ -44,4 +45,4 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 
-!0 = metadata !{metadata !"const", null, i32 1}
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umin.ll b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
index 2eaa372..e4cac33 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}vector_umin:
 ; SI: v_min_u32_e32
@@ -44,4 +45,4 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 
-!0 = metadata !{metadata !"const", null, i32 1}
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umul24.ll b/test/CodeGen/R600/llvm.AMDGPU.umul24.ll
index 567ac31..76624a0 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umul24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umul24.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
deleted file mode 100644
index d26bc32..0000000
--- a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-
-;CHECK: s_mov_b32
-;CHECK-NEXT: v_interp_mov_f32
-
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
-main_body:
-  %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
-  %5 = call i32 @llvm.SI.packf16(float %4, float %4)
-  %6 = bitcast i32 %5 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6)
-  ret void
-}
-
-declare void @llvm.AMDGPU.shader.type(i32)
-
-declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
-
-declare i32 @llvm.SI.packf16(float, float) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.ll b/test/CodeGen/R600/llvm.SI.fs.interp.ll
new file mode 100644
index 0000000..6b36140
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.fs.interp.ll
@@ -0,0 +1,30 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-NOT: s_wqm
+;CHECK: s_mov_b32
+;CHECK-NEXT: v_interp_mov_f32
+;CHECK: v_interp_p1_f32
+;CHECK: v_interp_p2_f32
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 {
+main_body:
+  %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+  %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4)
+  %7 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %4)
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %6, float %7, float %7)
+  ret void
+}
+
+declare void @llvm.AMDGPU.shader.type(i32)
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.constant(i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/llvm.SI.gather4.ll b/test/CodeGen/R600/llvm.SI.gather4.ll
index 91a2012..275cb58 100644
--- a/test/CodeGen/R600/llvm.SI.gather4.ll
+++ b/test/CodeGen/R600/llvm.SI.gather4.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-LABEL: {{^}}gather4_v2:
 ;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/R600/llvm.SI.getlod.ll b/test/CodeGen/R600/llvm.SI.getlod.ll
index ec26fe5..06ee98e 100644
--- a/test/CodeGen/R600/llvm.SI.getlod.ll
+++ b/test/CodeGen/R600/llvm.SI.getlod.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-LABEL: {{^}}getlod:
 ;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/R600/llvm.SI.image.ll b/test/CodeGen/R600/llvm.SI.image.ll
index 4eec543..0fac8d7 100644
--- a/test/CodeGen/R600/llvm.SI.image.ll
+++ b/test/CodeGen/R600/llvm.SI.image.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-LABEL: {{^}}image_load:
 ;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/R600/llvm.SI.image.sample.ll b/test/CodeGen/R600/llvm.SI.image.sample.ll
index ebff391..4bc638a 100644
--- a/test/CodeGen/R600/llvm.SI.image.sample.ll
+++ b/test/CodeGen/R600/llvm.SI.image.sample.ll
@@ -1,6 +1,8 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-LABEL: {{^}}sample:
+;CHECK: s_wqm
 ;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample() #0 {
 main_body:
@@ -14,6 +16,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_cl:
+;CHECK: s_wqm
 ;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_cl() #0 {
 main_body:
@@ -27,6 +30,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_d:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_d() #0 {
 main_body:
@@ -40,6 +44,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_d_cl:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_d_cl() #0 {
 main_body:
@@ -53,6 +58,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_l:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_l() #0 {
 main_body:
@@ -66,6 +72,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_b:
+;CHECK: s_wqm
 ;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_b() #0 {
 main_body:
@@ -79,6 +86,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_b_cl:
+;CHECK: s_wqm
 ;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_b_cl() #0 {
 main_body:
@@ -92,6 +100,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_lz:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_lz() #0 {
 main_body:
@@ -105,6 +114,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_cd:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_cd() #0 {
 main_body:
@@ -118,6 +128,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_cd_cl:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_cd_cl() #0 {
 main_body:
@@ -131,6 +142,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c:
+;CHECK: s_wqm
 ;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c() #0 {
 main_body:
@@ -144,6 +156,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_cl:
+;CHECK: s_wqm
 ;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_cl() #0 {
 main_body:
@@ -157,6 +170,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_d:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_d() #0 {
 main_body:
@@ -170,6 +184,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_d_cl:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_d_cl() #0 {
 main_body:
@@ -183,6 +198,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_l:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_l() #0 {
 main_body:
@@ -196,6 +212,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_b:
+;CHECK: s_wqm
 ;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_b() #0 {
 main_body:
@@ -209,6 +226,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_b_cl:
+;CHECK: s_wqm
 ;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_b_cl() #0 {
 main_body:
@@ -222,6 +240,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_lz:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_lz() #0 {
 main_body:
@@ -235,6 +254,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_cd:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_cd() #0 {
 main_body:
@@ -248,6 +268,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_cd_cl:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_cd_cl() #0 {
 main_body:
diff --git a/test/CodeGen/R600/llvm.SI.image.sample.o.ll b/test/CodeGen/R600/llvm.SI.image.sample.o.ll
index dbc1b2b..9d89354 100644
--- a/test/CodeGen/R600/llvm.SI.image.sample.o.ll
+++ b/test/CodeGen/R600/llvm.SI.image.sample.o.ll
@@ -1,6 +1,8 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-LABEL: {{^}}sample:
+;CHECK: s_wqm
 ;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample() #0 {
 main_body:
@@ -14,6 +16,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_cl:
+;CHECK: s_wqm
 ;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_cl() #0 {
 main_body:
@@ -27,6 +30,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_d:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_d() #0 {
 main_body:
@@ -40,6 +44,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_d_cl:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_d_cl() #0 {
 main_body:
@@ -53,6 +58,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_l:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_l() #0 {
 main_body:
@@ -66,6 +72,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_b:
+;CHECK: s_wqm
 ;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_b() #0 {
 main_body:
@@ -79,6 +86,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_b_cl:
+;CHECK: s_wqm
 ;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_b_cl() #0 {
 main_body:
@@ -92,6 +100,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_lz:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_lz() #0 {
 main_body:
@@ -105,6 +114,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_cd:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_cd() #0 {
 main_body:
@@ -118,6 +128,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_cd_cl:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_cd_cl() #0 {
 main_body:
@@ -131,6 +142,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c:
+;CHECK: s_wqm
 ;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c() #0 {
 main_body:
@@ -144,6 +156,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_cl:
+;CHECK: s_wqm
 ;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_cl() #0 {
 main_body:
@@ -157,6 +170,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_d:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_d() #0 {
 main_body:
@@ -170,6 +184,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_d_cl:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_d_cl() #0 {
 main_body:
@@ -183,6 +198,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_l:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_l() #0 {
 main_body:
@@ -196,6 +212,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_b:
+;CHECK: s_wqm
 ;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_b() #0 {
 main_body:
@@ -209,6 +226,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_b_cl:
+;CHECK: s_wqm
 ;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_b_cl() #0 {
 main_body:
@@ -222,6 +240,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_lz:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_lz() #0 {
 main_body:
@@ -235,6 +254,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_cd:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_cd() #0 {
 main_body:
@@ -248,6 +268,7 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}sample_c_cd_cl:
+;CHECK-NOT: s_wqm
 ;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @sample_c_cd_cl() #0 {
 main_body:
diff --git a/test/CodeGen/R600/llvm.SI.imageload.ll b/test/CodeGen/R600/llvm.SI.imageload.ll
index 673d92d..35e4591 100644
--- a/test/CodeGen/R600/llvm.SI.imageload.ll
+++ b/test/CodeGen/R600/llvm.SI.imageload.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
 ;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
@@ -126,6 +127,6 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
 attributes #0 = { "ShaderType"="0" }
 attributes #1 = { nounwind readnone }
 
-!0 = metadata !{metadata !"const", null}
-!1 = metadata !{}
-!2 = metadata !{metadata !0, metadata !0, i64 0, i32 1}
+!0 = !{!"const", null}
+!1 = !{}
+!2 = !{!0, !0, i64 0, i32 1}
diff --git a/test/CodeGen/R600/llvm.SI.load.dword.ll b/test/CodeGen/R600/llvm.SI.load.dword.ll
index e5c6201..d2e6a8e 100644
--- a/test/CodeGen/R600/llvm.SI.load.dword.ll
+++ b/test/CodeGen/R600/llvm.SI.load.dword.ll
@@ -1,28 +1,41 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
 
 ; Example of a simple geometry shader loading vertex attributes from the
 ; ESGS ring buffer
 
-; CHECK-LABEL: {{^}}main:
-; CHECK: buffer_load_dword
-; CHECK: buffer_load_dword
-; CHECK: buffer_load_dword
-; CHECK: buffer_load_dword
+; FIXME: Out of bounds immediate offset crashes
 
-define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, [2 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* inreg, [17 x <16 x i8>] addrspace(2)* inreg, i32, i32, i32, i32) #0 {
+; CHECK-LABEL: {{^}}main:
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc
+; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc
+
+define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 {
 main_body:
-  %10 = getelementptr [2 x <16 x i8>] addrspace(2)* %3, i64 0, i32 1
-  %11 = load <16 x i8> addrspace(2)* %10, !tbaa !0
-  %12 = shl i32 %6, 2
-  %13 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0)
-  %14 = bitcast i32 %13 to float
-  %15 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %11, i32 %12, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0)
-  %16 = bitcast i32 %15 to float
-  %17 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %11, i32 %12, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0)
-  %18 = bitcast i32 %17 to float
-  %19 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %11, <2 x i32> <i32 0, i32 0>, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0)
-  %20 = bitcast i32 %19 to float
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %14, float %16, float %18, float %20)
+  %tmp = getelementptr [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1
+  %tmp10 = load <16 x i8> addrspace(2)* %tmp, !tbaa !0
+  %tmp11 = shl i32 %arg6, 2
+  %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0)
+  %tmp13 = bitcast i32 %tmp12 to float
+  %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %tmp15 = bitcast i32 %tmp14 to float
+  %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0)
+  %tmp17 = bitcast i32 %tmp16 to float
+  %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0)
+  %tmp19 = bitcast i32 %tmp18 to float
+
+  %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0)
+  %tmp21 = bitcast i32 %tmp20 to float
+
+  %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0)
+  %tmp23 = bitcast i32 %tmp22 to float
+
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23)
   ret void
 }
 
@@ -37,4 +50,4 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
 attributes #0 = { "ShaderType"="1" }
 attributes #1 = { nounwind readonly }
 
-!0 = metadata !{metadata !"const", null, i32 1}
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.SI.resinfo.ll b/test/CodeGen/R600/llvm.SI.resinfo.ll
index d8f3722..ac95fd0 100644
--- a/test/CodeGen/R600/llvm.SI.resinfo.ll
+++ b/test/CodeGen/R600/llvm.SI.resinfo.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
 ; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
diff --git a/test/CodeGen/R600/llvm.SI.sample-masked.ll b/test/CodeGen/R600/llvm.SI.sample-masked.ll
index 9e86bec..ce9558c 100644
--- a/test/CodeGen/R600/llvm.SI.sample-masked.ll
+++ b/test/CodeGen/R600/llvm.SI.sample-masked.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s
 
 ; CHECK-LABEL: {{^}}v1:
 ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13
diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll
index a1d2c02..509c45f 100644
--- a/test/CodeGen/R600/llvm.SI.sample.ll
+++ b/test/CodeGen/R600/llvm.SI.sample.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15
 ;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3
diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll
index 91b71f3..f2badff 100644
--- a/test/CodeGen/R600/llvm.SI.sampled.ll
+++ b/test/CodeGen/R600/llvm.SI.sampled.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15
 ;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3
diff --git a/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll b/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll
new file mode 100644
index 0000000..2198590
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll
@@ -0,0 +1,20 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=BOTH %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=BOTH %s
+
+; BOTH-LABEL: {{^}}main:
+; BOTH: s_mov_b32 m0, s0
+; VI-NEXT: s_nop 0
+; BOTH-NEXT: s_sendmsg Gs_done(nop)
+; BOTH-NEXT: s_endpgm
+
+define void @main(i32 inreg %a) #0 {
+main_body:
+  call void @llvm.SI.sendmsg(i32 3, i32 %a)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.SI.sendmsg(i32, i32) #1
+
+attributes #0 = { "ShaderType"="2" "unsafe-fp-math"="true" }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/R600/llvm.SI.sendmsg.ll b/test/CodeGen/R600/llvm.SI.sendmsg.ll
index 042fc5b..ce38002 100644
--- a/test/CodeGen/R600/llvm.SI.sendmsg.ll
+++ b/test/CodeGen/R600/llvm.SI.sendmsg.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: s_sendmsg Gs(emit stream 0)
diff --git a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
index 702daea..71f5154 100644
--- a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
+++ b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-LABEL: {{^}}test1:
 ;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
diff --git a/test/CodeGen/R600/llvm.SI.tid.ll b/test/CodeGen/R600/llvm.SI.tid.ll
index ee96124..f6e6d70 100644
--- a/test/CodeGen/R600/llvm.SI.tid.ll
+++ b/test/CodeGen/R600/llvm.SI.tid.ll
@@ -1,7 +1,9 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s
 
-;CHECK: v_mbcnt_lo_u32_b32_e64
-;CHECK: v_mbcnt_hi_u32_b32_e32
+;GCN: v_mbcnt_lo_u32_b32_e64
+;SI: v_mbcnt_hi_u32_b32_e32
+;VI: v_mbcnt_hi_u32_b32_e64
 
 define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
 main_body:
diff --git a/test/CodeGen/R600/llvm.amdgpu.kilp.ll b/test/CodeGen/R600/llvm.amdgpu.kilp.ll
index 08bee38..42df6db 100644
--- a/test/CodeGen/R600/llvm.amdgpu.kilp.ll
+++ b/test/CodeGen/R600/llvm.amdgpu.kilp.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}kilp_gs_const:
 ; SI: s_mov_b64 exec, 0
@@ -17,4 +18,4 @@ declare void @llvm.AMDGPU.kilp(float)
 
 attributes #0 = { "ShaderType"="2" }
 
-!0 = metadata !{metadata !"const", null, i32 1}
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.amdgpu.lrp.ll b/test/CodeGen/R600/llvm.amdgpu.lrp.ll
index ee922fe..4e4c2ec 100644
--- a/test/CodeGen/R600/llvm.amdgpu.lrp.ll
+++ b/test/CodeGen/R600/llvm.amdgpu.lrp.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone
 
diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll
index 837340f..c65df8b 100644
--- a/test/CodeGen/R600/llvm.cos.ll
+++ b/test/CodeGen/R600/llvm.cos.ll
@@ -1,5 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC
-;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC
 
 ;FUNC-LABEL: test
 ;EG: MULADD_IEEE *
diff --git a/test/CodeGen/R600/llvm.exp2.ll b/test/CodeGen/R600/llvm.exp2.ll
index 52dc67d..4269892 100644
--- a/test/CodeGen/R600/llvm.exp2.ll
+++ b/test/CodeGen/R600/llvm.exp2.ll
@@ -1,14 +1,15 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK --check-prefix=FUNC
-;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK --check-prefix=FUNC
-;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
 ;FUNC-LABEL: {{^}}test:
-;EG-CHECK: EXP_IEEE
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: v_exp_f32
+;EG: EXP_IEEE
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_exp_f32
 
 define void @test(float addrspace(1)* %out, float %in) {
 entry:
@@ -18,20 +19,20 @@ entry:
 }
 
 ;FUNC-LABEL: {{^}}testv2:
-;EG-CHECK: EXP_IEEE
-;EG-CHECK: EXP_IEEE
+;EG: EXP_IEEE
+;EG: EXP_IEEE
 ; FIXME: We should be able to merge these packets together on Cayman so we
 ; have a maximum of 4 instructions.
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: v_exp_f32
-;SI-CHECK: v_exp_f32
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_exp_f32
+;SI: v_exp_f32
 
 define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
@@ -41,32 +42,32 @@ entry:
 }
 
 ;FUNC-LABEL: {{^}}testv4:
-;EG-CHECK: EXP_IEEE
-;EG-CHECK: EXP_IEEE
-;EG-CHECK: EXP_IEEE
-;EG-CHECK: EXP_IEEE
+;EG: EXP_IEEE
+;EG: EXP_IEEE
+;EG: EXP_IEEE
+;EG: EXP_IEEE
 ; FIXME: We should be able to merge these packets together on Cayman so we
 ; have a maximum of 4 instructions.
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: v_exp_f32
-;SI-CHECK: v_exp_f32
-;SI-CHECK: v_exp_f32
-;SI-CHECK: v_exp_f32
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_exp_f32
+;SI: v_exp_f32
+;SI: v_exp_f32
+;SI: v_exp_f32
 define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/R600/llvm.floor.ll b/test/CodeGen/R600/llvm.floor.ll
deleted file mode 100644
index 0c7a15b..0000000
--- a/test/CodeGen/R600/llvm.floor.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-
-; R600-CHECK: {{^}}f32:
-; R600-CHECK: FLOOR
-; SI-CHECK: {{^}}f32:
-; SI-CHECK: v_floor_f32_e32
-define void @f32(float addrspace(1)* %out, float %in) {
-entry:
-  %0 = call float @llvm.floor.f32(float %in)
-  store float %0, float addrspace(1)* %out
-  ret void
-}
-
-; R600-CHECK: {{^}}v2f32:
-; R600-CHECK: FLOOR
-; R600-CHECK: FLOOR
-; SI-CHECK: {{^}}v2f32:
-; SI-CHECK: v_floor_f32_e32
-; SI-CHECK: v_floor_f32_e32
-define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
-  %0 = call <2 x float> @llvm.floor.v2f32(<2 x float> %in)
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; R600-CHECK: {{^}}v4f32:
-; R600-CHECK: FLOOR
-; R600-CHECK: FLOOR
-; R600-CHECK: FLOOR
-; R600-CHECK: FLOOR
-; SI-CHECK: {{^}}v4f32:
-; SI-CHECK: v_floor_f32_e32
-; SI-CHECK: v_floor_f32_e32
-; SI-CHECK: v_floor_f32_e32
-; SI-CHECK: v_floor_f32_e32
-define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
-entry:
-  %0 = call <4 x float> @llvm.floor.v4f32(<4 x float> %in)
-  store <4 x float> %0, <4 x float> addrspace(1)* %out
-  ret void
-}
-
-; Function Attrs: nounwind readonly
-declare float @llvm.floor.f32(float) #0
-
-; Function Attrs: nounwind readonly
-declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0
-
-; Function Attrs: nounwind readonly
-declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0
-
-attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/R600/llvm.log2.ll b/test/CodeGen/R600/llvm.log2.ll
index 0b54a46..c75e785 100644
--- a/test/CodeGen/R600/llvm.log2.ll
+++ b/test/CodeGen/R600/llvm.log2.ll
@@ -1,14 +1,15 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK --check-prefix=FUNC
-;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK --check-prefix=FUNC
-;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
 ;FUNC-LABEL: {{^}}test:
-;EG-CHECK: LOG_IEEE
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: v_log_f32
+;EG: LOG_IEEE
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_log_f32
 
 define void @test(float addrspace(1)* %out, float %in) {
 entry:
@@ -18,20 +19,20 @@ entry:
 }
 
 ;FUNC-LABEL: {{^}}testv2:
-;EG-CHECK: LOG_IEEE
-;EG-CHECK: LOG_IEEE
+;EG: LOG_IEEE
+;EG: LOG_IEEE
 ; FIXME: We should be able to merge these packets together on Cayman so we
 ; have a maximum of 4 instructions.
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: v_log_f32
-;SI-CHECK: v_log_f32
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_log_f32
+;SI: v_log_f32
 
 define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
@@ -41,32 +42,32 @@ entry:
 }
 
 ;FUNC-LABEL: {{^}}testv4:
-;EG-CHECK: LOG_IEEE
-;EG-CHECK: LOG_IEEE
-;EG-CHECK: LOG_IEEE
-;EG-CHECK: LOG_IEEE
+;EG: LOG_IEEE
+;EG: LOG_IEEE
+;EG: LOG_IEEE
+;EG: LOG_IEEE
 ; FIXME: We should be able to merge these packets together on Cayman so we
 ; have a maximum of 4 instructions.
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: v_log_f32
-;SI-CHECK: v_log_f32
-;SI-CHECK: v_log_f32
-;SI-CHECK: v_log_f32
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_log_f32
+;SI: v_log_f32
+;SI: v_log_f32
+;SI: v_log_f32
 define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/R600/llvm.memcpy.ll b/test/CodeGen/R600/llvm.memcpy.ll
index 5f2710a..e491732 100644
--- a/test/CodeGen/R600/llvm.memcpy.ll
+++ b/test/CodeGen/R600/llvm.memcpy.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind
 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
@@ -6,39 +7,23 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
 
 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
-
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
+
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
-
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
 ; SI: ds_read_u8
 
-
 ; SI: ds_read_u8
 ; SI: ds_read_u8
 ; SI: ds_read_u8
@@ -65,6 +50,14 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
 ; SI: ds_write_b8
 
 ; SI: ds_write_b8
@@ -75,6 +68,14 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
 ; SI: ds_write_b8
 
 ; SI: s_endpgm
diff --git a/test/CodeGen/R600/llvm.rint.f64.ll b/test/CodeGen/R600/llvm.rint.f64.ll
index 72b546e..c63fb17 100644
--- a/test/CodeGen/R600/llvm.rint.f64.ll
+++ b/test/CodeGen/R600/llvm.rint.f64.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}rint_f64:
 ; CI: v_rndne_f64_e32
diff --git a/test/CodeGen/R600/llvm.rint.ll b/test/CodeGen/R600/llvm.rint.ll
index 2e05964..661db51 100644
--- a/test/CodeGen/R600/llvm.rint.ll
+++ b/test/CodeGen/R600/llvm.rint.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}rint_f32:
 ; R600: RNDNE
diff --git a/test/CodeGen/R600/llvm.round.f64.ll b/test/CodeGen/R600/llvm.round.f64.ll
new file mode 100644
index 0000000..920dbb3
--- /dev/null
+++ b/test/CodeGen/R600/llvm.round.f64.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}round_f64:
+; SI: s_endpgm
+define void @round_f64(double addrspace(1)* %out, double %x) #0 {
+  %result = call double @llvm.round.f64(double %x) #1
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; This is a pretty large function, so just test a few of the
+; instructions that are necessary.
+
+; FUNC-LABEL: {{^}}v_round_f64:
+; SI: buffer_load_dwordx2
+; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11
+
+; SI-DAG: v_not_b32_e32
+; SI-DAG: v_not_b32_e32
+
+; SI-DAG: v_cmp_eq_i32
+
+; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
+; SI-DAG: v_cmp_lt_i32_e64
+; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
+
+; SI-DAG: v_cmp_gt_i32_e64
+
+
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr double addrspace(1)* %out, i32 %tid
+  %x = load double addrspace(1)* %gep
+  %result = call double @llvm.round.f64(double %x) #1
+  store double %result, double addrspace(1)* %out.gep
+  ret void
+}
+
+; FUNC-LABEL: {{^}}round_v2f64:
+; SI: s_endpgm
+define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
+  %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
+  store <2 x double> %result, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}round_v4f64:
+; SI: s_endpgm
+define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
+  %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
+  store <4 x double> %result, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}round_v8f64:
+; SI: s_endpgm
+define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
+  %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
+  store <8 x double> %result, <8 x double> addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+declare double @llvm.round.f64(double) #1
+declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
+declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
+declare <8 x double> @llvm.round.v8f64(<8 x double>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/llvm.round.ll b/test/CodeGen/R600/llvm.round.ll
index bedf4ba..8d1cfb6 100644
--- a/test/CodeGen/R600/llvm.round.ll
+++ b/test/CodeGen/R600/llvm.round.ll
@@ -1,17 +1,28 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
-
-; FUNC-LABEL: {{^}}f32:
-; R600: FRACT {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
-; R600-DAG: ADD  {{.*}}, -0.5
-; R600-DAG: CEIL {{.*}} [[ARG]]
-; R600-DAG: FLOOR {{.*}} [[ARG]]
-; R600-DAG: CNDGE
-; R600-DAG: CNDGT
-; R600: CNDGE {{[^,]+}}, [[ARG]]
-define void @f32(float addrspace(1)* %out, float %in) {
-entry:
-  %0 = call float @llvm.round.f32(float %in)
-  store float %0, float addrspace(1)* %out
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}round_f32:
+; SI-DAG: s_load_dword [[SX:s[0-9]+]]
+; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff
+; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
+; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
+; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
+; SI: v_cmp_ge_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SUB]]|, 0.5
+; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; SI: buffer_store_dword [[RESULT]]
+
+; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
+; R600-DAG: ADD  {{.*}},
+; R600-DAG: BFI_INT
+; R600-DAG: SETGE
+; R600-DAG: CNDE
+; R600-DAG: ADD
+define void @round_f32(float addrspace(1)* %out, float %x) #0 {
+  %result = call float @llvm.round.f32(float %x) #1
+  store float %result, float addrspace(1)* %out
   ret void
 }
 
@@ -20,24 +31,37 @@ entry:
 ; a test for the scalar case, so the vector tests just check that the
 ; compiler doesn't crash.
 
-; FUNC-LABEL: v2f32
+; FUNC-LABEL: {{^}}round_v2f32:
+; SI: s_endpgm
 ; R600: CF_END
-define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
-  %0 = call <2 x float> @llvm.round.v2f32(<2 x float> %in)
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
+  %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: v4f32
+; FUNC-LABEL: {{^}}round_v4f32:
+; SI: s_endpgm
 ; R600: CF_END
-define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
-entry:
-  %0 = call <4 x float> @llvm.round.v4f32(<4 x float> %in)
-  store <4 x float> %0, <4 x float> addrspace(1)* %out
+define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
+  %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
 }
 
-declare float @llvm.round.f32(float)
-declare <2 x float> @llvm.round.v2f32(<2 x float>)
-declare <4 x float> @llvm.round.v4f32(<4 x float>)
+; FUNC-LABEL: {{^}}round_v8f32:
+; SI: s_endpgm
+; R600: CF_END
+define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
+  %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
+  store <8 x float> %result, <8 x float> addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.round.f32(float) #1
+declare <2 x float> @llvm.round.v2f32(<2 x float>) #1
+declare <4 x float> @llvm.round.v4f32(<4 x float>) #1
+declare <8 x float> @llvm.round.v8f32(<8 x float>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
index 7e45710..3bb245c 100644
--- a/test/CodeGen/R600/llvm.sin.ll
+++ b/test/CodeGen/R600/llvm.sin.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
 
 ; FUNC-LABEL: sin_f32
 ; EG: MULADD_IEEE *
diff --git a/test/CodeGen/R600/llvm.sqrt.ll b/test/CodeGen/R600/llvm.sqrt.ll
index c039225..cc4717a 100644
--- a/test/CodeGen/R600/llvm.sqrt.ll
+++ b/test/CodeGen/R600/llvm.sqrt.ll
@@ -1,11 +1,12 @@
-; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600
+; RUN: llc < %s -march=amdgcn --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI
 
-; R600-CHECK-LABEL: {{^}}sqrt_f32:
-; R600-CHECK: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; R600-CHECK: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
-; SI-CHECK-LABEL: {{^}}sqrt_f32:
-; SI-CHECK: v_sqrt_f32_e32
+; R600-LABEL: {{^}}sqrt_f32:
+; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
+; SI-LABEL: {{^}}sqrt_f32:
+; SI: v_sqrt_f32_e32
 define void @sqrt_f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.sqrt.f32(float %in)
@@ -13,14 +14,14 @@ entry:
   ret void
 }
 
-; R600-CHECK-LABEL: {{^}}sqrt_v2f32:
-; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
-; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
-; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
-; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
-; SI-CHECK-LABEL: {{^}}sqrt_v2f32:
-; SI-CHECK: v_sqrt_f32_e32
-; SI-CHECK: v_sqrt_f32_e32
+; R600-LABEL: {{^}}sqrt_v2f32:
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
+; SI-LABEL: {{^}}sqrt_v2f32:
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
 define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
@@ -28,20 +29,20 @@ entry:
   ret void
 }
 
-; R600-CHECK-LABEL: {{^}}sqrt_v4f32:
-; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
-; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
-; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
-; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
-; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
-; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
-; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
-; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
-; SI-CHECK-LABEL: {{^}}sqrt_v4f32:
-; SI-CHECK: v_sqrt_f32_e32
-; SI-CHECK: v_sqrt_f32_e32
-; SI-CHECK: v_sqrt_f32_e32
-; SI-CHECK: v_sqrt_f32_e32
+; R600-LABEL: {{^}}sqrt_v4f32:
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
+; SI-LABEL: {{^}}sqrt_v4f32:
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
 define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/R600/llvm.trunc.ll b/test/CodeGen/R600/llvm.trunc.ll
deleted file mode 100644
index 5585477..0000000
--- a/test/CodeGen/R600/llvm.trunc.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; CHECK-LABEL: {{^}}trunc_f32:
-; CHECK: TRUNC
-
-define void @trunc_f32(float addrspace(1)* %out, float %in) {
-entry:
-  %0 = call float @llvm.trunc.f32(float %in)
-  store float %0, float  addrspace(1)* %out
-  ret void
-}
-
-declare float @llvm.trunc.f32(float)
diff --git a/test/CodeGen/R600/load-i1.ll b/test/CodeGen/R600/load-i1.ll
index d85e16f..315c0a3 100644
--- a/test/CodeGen/R600/load-i1.ll
+++ b/test/CodeGen/R600/load-i1.ll
@@ -1,21 +1,58 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-
-; SI-LABEL: {{^}}global_copy_i1_to_i1:
+; FUNC-LABEL: {{^}}global_copy_i1_to_i1:
 ; SI: buffer_load_ubyte
 ; SI: v_and_b32_e32 v{{[0-9]+}}, 1
 ; SI: buffer_store_byte
 ; SI: s_endpgm
+
+; EG: VTX_READ_8
+; EG: AND_INT
 define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   store i1 %load, i1 addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: {{^}}global_sextload_i1_to_i32:
-; XSI: BUFFER_LOAD_BYTE
+; FUNC-LABEL: {{^}}local_copy_i1_to_i1:
+; SI: ds_read_u8
+; SI: v_and_b32_e32 v{{[0-9]+}}, 1
+; SI: ds_write_b8
+; SI: s_endpgm
+
+; EG: LDS_UBYTE_READ_RET
+; EG: AND_INT
+; EG: LDS_BYTE_WRITE
+define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) nounwind {
+  %load = load i1 addrspace(3)* %in
+  store i1 %load, i1 addrspace(3)* %out, align 1
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_copy_i1_to_i1:
+; SI: buffer_load_ubyte
+; SI: v_and_b32_e32 v{{[0-9]+}}, 1
+; SI: buffer_store_byte
+; SI: s_endpgm
+
+; EG: VTX_READ_8
+; EG: AND_INT
+define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) nounwind {
+  %load = load i1 addrspace(2)* %in
+  store i1 %load, i1 addrspace(1)* %out, align 1
+  ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i1_to_i32:
+; SI: buffer_load_ubyte
+; SI: v_bfe_i32
 ; SI: buffer_store_dword
 ; SI: s_endpgm
+
+; EG: VTX_READ_8
+; EG: BFE_INT
 define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
@@ -23,10 +60,11 @@ define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
   ret void
 }
 
-; SI-LABEL: {{^}}global_zextload_i1_to_i32:
+; FUNC-LABEL: {{^}}global_zextload_i1_to_i32:
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dword
 ; SI: s_endpgm
+
 define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
@@ -34,8 +72,9 @@ define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
   ret void
 }
 
-; SI-LABEL: {{^}}global_sextload_i1_to_i64:
-; XSI: BUFFER_LOAD_BYTE
+; FUNC-LABEL: {{^}}global_sextload_i1_to_i64:
+; SI: buffer_load_ubyte
+; SI: v_bfe_i32
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
@@ -45,8 +84,9 @@ define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
   ret void
 }
 
-; SI-LABEL: {{^}}global_zextload_i1_to_i64:
+; FUNC-LABEL: {{^}}global_zextload_i1_to_i64:
 ; SI: buffer_load_ubyte
+; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
@@ -56,7 +96,7 @@ define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
   ret void
 }
 
-; SI-LABEL: {{^}}i1_arg:
+; FUNC-LABEL: {{^}}i1_arg:
 ; SI: buffer_load_ubyte
 ; SI: v_and_b32_e32
 ; SI: buffer_store_byte
@@ -66,7 +106,7 @@ define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
   ret void
 }
 
-; SI-LABEL: {{^}}i1_arg_zext_i32:
+; FUNC-LABEL: {{^}}i1_arg_zext_i32:
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dword
 ; SI: s_endpgm
@@ -76,7 +116,7 @@ define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   ret void
 }
 
-; SI-LABEL: {{^}}i1_arg_zext_i64:
+; FUNC-LABEL: {{^}}i1_arg_zext_i64:
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
@@ -86,8 +126,8 @@ define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   ret void
 }
 
-; SI-LABEL: {{^}}i1_arg_sext_i32:
-; XSI: BUFFER_LOAD_BYTE
+; FUNC-LABEL: {{^}}i1_arg_sext_i32:
+; SI: buffer_load_ubyte
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
@@ -96,8 +136,10 @@ define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   ret void
 }
 
-; SI-LABEL: {{^}}i1_arg_sext_i64:
-; XSI: BUFFER_LOAD_BYTE
+; FUNC-LABEL: {{^}}i1_arg_sext_i64:
+; SI: buffer_load_ubyte
+; SI: v_bfe_i32
+; SI: v_ashrrev_i32
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index 62d3063..b71b7cb 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -1,6 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 ;===------------------------------------------------------------------------===;
 ; GLOBAL ADDRESS SPACE
@@ -8,9 +9,9 @@
 
 ; Load an i8 value from the global address space.
 ; FUNC-LABEL: {{^}}load_i8:
-; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
 
-; SI-CHECK: buffer_load_ubyte v{{[0-9]+}},
+; SI: buffer_load_ubyte v{{[0-9]+}},
 define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %1 = load i8 addrspace(1)* %in
   %2 = zext i8 %1 to i32
@@ -19,12 +20,12 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 }
 
 ; FUNC-LABEL: {{^}}load_i8_sext:
-; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
-; R600-CHECK: 24
-; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
-; R600-CHECK: 24
-; SI-CHECK: buffer_load_sbyte
+; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
+; R600: 24
+; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
+; R600: 24
+; SI: buffer_load_sbyte
 define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = load i8 addrspace(1)* %in
@@ -34,10 +35,10 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v2i8:
-; R600-CHECK: VTX_READ_8
-; R600-CHECK: VTX_READ_8
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
+; R600: VTX_READ_8
+; R600: VTX_READ_8
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
 define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(1)* %in
@@ -47,18 +48,18 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v2i8_sext:
-; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
-; R600-CHECK-DAG: 24
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
-; R600-CHECK-DAG: 24
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
-; R600-CHECK-DAG: 24
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
-; R600-CHECK-DAG: 24
-; SI-CHECK: buffer_load_sbyte
-; SI-CHECK: buffer_load_sbyte
+; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-DAG: 24
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-DAG: 24
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-DAG: 24
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-DAG: 24
+; SI: buffer_load_sbyte
+; SI: buffer_load_sbyte
 define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(1)* %in
@@ -68,14 +69,14 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v4i8:
-; R600-CHECK: VTX_READ_8
-; R600-CHECK: VTX_READ_8
-; R600-CHECK: VTX_READ_8
-; R600-CHECK: VTX_READ_8
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
-; SI-CHECK: buffer_load_ubyte
+; R600: VTX_READ_8
+; R600: VTX_READ_8
+; R600: VTX_READ_8
+; R600: VTX_READ_8
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
 define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(1)* %in
@@ -85,30 +86,30 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v4i8_sext:
-; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-CHECK-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
-; R600-CHECK-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
-; R600-CHECK-DAG: 24
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
-; R600-CHECK-DAG: 24
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
-; R600-CHECK-DAG: 24
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
-; R600-CHECK-DAG: 24
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Z_CHAN:[XYZW]]], [[DST_Z]]
-; R600-CHECK-DAG: 24
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Z_CHAN]]
-; R600-CHECK-DAG: 24
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_W_CHAN:[XYZW]]], [[DST_W]]
-; R600-CHECK-DAG: 24
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
-; R600-CHECK-DAG: 24
-; SI-CHECK: buffer_load_sbyte
-; SI-CHECK: buffer_load_sbyte
-; SI-CHECK: buffer_load_sbyte
-; SI-CHECK: buffer_load_sbyte
+; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; R600-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-DAG: 24
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-DAG: 24
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-DAG: 24
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-DAG: 24
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Z_CHAN:[XYZW]]], [[DST_Z]]
+; R600-DAG: 24
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Z_CHAN]]
+; R600-DAG: 24
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_W_CHAN:[XYZW]]], [[DST_W]]
+; R600-DAG: 24
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
+; R600-DAG: 24
+; SI: buffer_load_sbyte
+; SI: buffer_load_sbyte
+; SI: buffer_load_sbyte
+; SI: buffer_load_sbyte
 define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(1)* %in
@@ -119,8 +120,8 @@ entry:
 
 ; Load an i16 value from the global address space.
 ; FUNC-LABEL: {{^}}load_i16:
-; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: buffer_load_ushort
+; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI: buffer_load_ushort
 define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
   %0 = load i16	 addrspace(1)* %in
@@ -130,12 +131,12 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_i16_sext:
-; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
-; R600-CHECK: 16
-; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
-; R600-CHECK: 16
-; SI-CHECK: buffer_load_sshort
+; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
+; R600: 16
+; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
+; R600: 16
+; SI: buffer_load_sshort
 define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
   %0 = load i16 addrspace(1)* %in
@@ -145,10 +146,10 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v2i16:
-; R600-CHECK: VTX_READ_16
-; R600-CHECK: VTX_READ_16
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
+; R600: VTX_READ_16
+; R600: VTX_READ_16
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
 define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(1)* %in
@@ -158,18 +159,18 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v2i16_sext:
-; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
-; R600-CHECK-DAG: 16
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
-; R600-CHECK-DAG: 16
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
-; R600-CHECK-DAG: 16
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
-; R600-CHECK-DAG: 16
-; SI-CHECK: buffer_load_sshort
-; SI-CHECK: buffer_load_sshort
+; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-DAG: 16
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-DAG: 16
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-DAG: 16
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-DAG: 16
+; SI: buffer_load_sshort
+; SI: buffer_load_sshort
 define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(1)* %in
@@ -179,14 +180,14 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v4i16:
-; R600-CHECK: VTX_READ_16
-; R600-CHECK: VTX_READ_16
-; R600-CHECK: VTX_READ_16
-; R600-CHECK: VTX_READ_16
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
-; SI-CHECK: buffer_load_ushort
+; R600: VTX_READ_16
+; R600: VTX_READ_16
+; R600: VTX_READ_16
+; R600: VTX_READ_16
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
 define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(1)* %in
@@ -196,30 +197,30 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v4i16_sext:
-; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-CHECK-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
-; R600-CHECK-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
-; R600-CHECK-DAG: 16
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
-; R600-CHECK-DAG: 16
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
-; R600-CHECK-DAG: 16
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
-; R600-CHECK-DAG: 16
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Z_CHAN:[XYZW]]], [[DST_Z]]
-; R600-CHECK-DAG: 16
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Z_CHAN]]
-; R600-CHECK-DAG: 16
-; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_W_CHAN:[XYZW]]], [[DST_W]]
-; R600-CHECK-DAG: 16
-; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
-; R600-CHECK-DAG: 16
-; SI-CHECK: buffer_load_sshort
-; SI-CHECK: buffer_load_sshort
-; SI-CHECK: buffer_load_sshort
-; SI-CHECK: buffer_load_sshort
+; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; R600-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-DAG: 16
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-DAG: 16
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-DAG: 16
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-DAG: 16
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Z_CHAN:[XYZW]]], [[DST_Z]]
+; R600-DAG: 16
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Z_CHAN]]
+; R600-DAG: 16
+; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_W_CHAN:[XYZW]]], [[DST_W]]
+; R600-DAG: 16
+; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
+; R600-DAG: 16
+; SI: buffer_load_sshort
+; SI: buffer_load_sshort
+; SI: buffer_load_sshort
+; SI: buffer_load_sshort
 define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(1)* %in
@@ -230,9 +231,9 @@ entry:
 
 ; load an i32 value from the global address space.
 ; FUNC-LABEL: {{^}}load_i32:
-; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: buffer_load_dword v{{[0-9]+}}
+; SI: buffer_load_dword v{{[0-9]+}}
 define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = load i32 addrspace(1)* %in
@@ -242,9 +243,9 @@ entry:
 
 ; load a f32 value from the global address space.
 ; FUNC-LABEL: {{^}}load_f32:
-; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: buffer_load_dword v{{[0-9]+}}
+; SI: buffer_load_dword v{{[0-9]+}}
 define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float addrspace(1)* %in
@@ -254,9 +255,9 @@ entry:
 
 ; load a v2f32 value from the global address space
 ; FUNC-LABEL: {{^}}load_v2f32:
-; R600-CHECK: MEM_RAT
-; R600-CHECK: VTX_READ_64
-; SI-CHECK: buffer_load_dwordx2
+; R600: MEM_RAT
+; R600: VTX_READ_64
+; SI: buffer_load_dwordx2
 define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 entry:
   %0 = load <2 x float> addrspace(1)* %in
@@ -265,8 +266,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_i64:
-; R600-CHECK: VTX_READ_64
-; SI-CHECK: buffer_load_dwordx2
+; R600: VTX_READ_64
+; SI: buffer_load_dwordx2
 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64 addrspace(1)* %in
@@ -275,11 +276,11 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_i64_sext:
-; R600-CHECK: MEM_RAT
-; R600-CHECK: MEM_RAT
-; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.x
-; R600-CHECK: 31
-; SI-CHECK: buffer_load_dword
+; R600: MEM_RAT
+; R600: MEM_RAT
+; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.x
+; R600: 31
+; SI: buffer_load_dword
 
 define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
@@ -290,8 +291,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_i64_zext:
-; R600-CHECK: MEM_RAT
-; R600-CHECK: MEM_RAT
+; R600: MEM_RAT
+; R600: MEM_RAT
 define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = load i32 addrspace(1)* %in
@@ -301,17 +302,17 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v8i32:
-; R600-CHECK: VTX_READ_128
-; R600-CHECK: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
 ; XXX: We should be using DWORDX4 instructions on SI.
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
 define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
 entry:
   %0 = load <8 x i32> addrspace(1)* %in
@@ -320,27 +321,27 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v16i32:
-; R600-CHECK: VTX_READ_128
-; R600-CHECK: VTX_READ_128
-; R600-CHECK: VTX_READ_128
-; R600-CHECK: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
 ; XXX: We should be using DWORDX4 instructions on SI.
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
-; SI-CHECK: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
 define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
 entry:
   %0 = load <16 x i32> addrspace(1)* %in
@@ -354,12 +355,12 @@ entry:
 
 ; Load a sign-extended i8 value
 ; FUNC-LABEL: {{^}}load_const_i8_sext:
-; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
-; R600-CHECK: 24
-; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
-; R600-CHECK: 24
-; SI-CHECK: buffer_load_sbyte v{{[0-9]+}},
+; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
+; R600: 24
+; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
+; R600: 24
+; SI: buffer_load_sbyte v{{[0-9]+}},
 define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = load i8 addrspace(2)* %in
@@ -370,8 +371,8 @@ entry:
 
 ; Load an aligned i8 value
 ; FUNC-LABEL: {{^}}load_const_i8_aligned:
-; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: buffer_load_ubyte v{{[0-9]+}},
+; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI: buffer_load_ubyte v{{[0-9]+}},
 define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = load i8 addrspace(2)* %in
@@ -382,8 +383,8 @@ entry:
 
 ; Load an un-aligned i8 value
 ; FUNC-LABEL: {{^}}load_const_i8_unaligned:
-; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: buffer_load_ubyte v{{[0-9]+}},
+; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI: buffer_load_ubyte v{{[0-9]+}},
 define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = getelementptr i8 addrspace(2)* %in, i32 1
@@ -395,12 +396,12 @@ entry:
 
 ; Load a sign-extended i16 value
 ; FUNC-LABEL: {{^}}load_const_i16_sext:
-; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
-; R600-CHECK: 16
-; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
-; R600-CHECK: 16
-; SI-CHECK: buffer_load_sshort
+; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
+; R600: 16
+; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
+; R600: 16
+; SI: buffer_load_sshort
 define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
   %0 = load i16 addrspace(2)* %in
@@ -411,8 +412,8 @@ entry:
 
 ; Load an aligned i16 value
 ; FUNC-LABEL: {{^}}load_const_i16_aligned:
-; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: buffer_load_ushort
+; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI: buffer_load_ushort
 define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
   %0 = load i16 addrspace(2)* %in
@@ -423,8 +424,8 @@ entry:
 
 ; Load an un-aligned i16 value
 ; FUNC-LABEL: {{^}}load_const_i16_unaligned:
-; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: buffer_load_ushort
+; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI: buffer_load_ushort
 define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
   %0 = getelementptr i16 addrspace(2)* %in, i32 1
@@ -436,9 +437,9 @@ entry:
 
 ; Load an i32 value from the constant address space.
 ; FUNC-LABEL: {{^}}load_const_addrspace_i32:
-; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: s_load_dword s{{[0-9]+}}
+; SI: s_load_dword s{{[0-9]+}}
 define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %0 = load i32 addrspace(2)* %in
@@ -448,9 +449,9 @@ entry:
 
 ; Load a f32 value from the constant address space.
 ; FUNC-LABEL: {{^}}load_const_addrspace_f32:
-; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: s_load_dword s{{[0-9]+}}
+; SI: s_load_dword s{{[0-9]+}}
 define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) {
   %1 = load float addrspace(2)* %in
   store float %1, float addrspace(1)* %out
@@ -463,10 +464,10 @@ define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(
 
 ; Load an i8 value from the local address space.
 ; FUNC-LABEL: {{^}}load_i8_local:
-; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_u8
+; R600: LDS_UBYTE_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u8
 define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
   %1 = load i8 addrspace(3)* %in
   %2 = zext i8 %1 to i32
@@ -475,11 +476,11 @@ define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
 }
 
 ; FUNC-LABEL: {{^}}load_i8_sext_local:
-; R600-CHECK: LDS_UBYTE_READ_RET
-; R600-CHECK: ASHR
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_i8
+; R600: LDS_UBYTE_READ_RET
+; R600: ASHR
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i8
 define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
 entry:
   %0 = load i8 addrspace(3)* %in
@@ -489,12 +490,12 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v2i8_local:
-; R600-CHECK: LDS_UBYTE_READ_RET
-; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_u8
-; SI-CHECK: ds_read_u8
+; R600: LDS_UBYTE_READ_RET
+; R600: LDS_UBYTE_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u8
+; SI: ds_read_u8
 define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(3)* %in
@@ -504,14 +505,14 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v2i8_sext_local:
-; R600-CHECK-DAG: LDS_UBYTE_READ_RET
-; R600-CHECK-DAG: LDS_UBYTE_READ_RET
-; R600-CHECK-DAG: ASHR
-; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_i8
-; SI-CHECK: ds_read_i8
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: ASHR
+; R600-DAG: ASHR
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i8
+; SI: ds_read_i8
 define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(3)* %in
@@ -521,16 +522,16 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v4i8_local:
-; R600-CHECK: LDS_UBYTE_READ_RET
-; R600-CHECK: LDS_UBYTE_READ_RET
-; R600-CHECK: LDS_UBYTE_READ_RET
-; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_u8
-; SI-CHECK: ds_read_u8
-; SI-CHECK: ds_read_u8
-; SI-CHECK: ds_read_u8
+; R600: LDS_UBYTE_READ_RET
+; R600: LDS_UBYTE_READ_RET
+; R600: LDS_UBYTE_READ_RET
+; R600: LDS_UBYTE_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
 define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(3)* %in
@@ -540,20 +541,20 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v4i8_sext_local:
-; R600-CHECK-DAG: LDS_UBYTE_READ_RET
-; R600-CHECK-DAG: LDS_UBYTE_READ_RET
-; R600-CHECK-DAG: LDS_UBYTE_READ_RET
-; R600-CHECK-DAG: LDS_UBYTE_READ_RET
-; R600-CHECK-DAG: ASHR
-; R600-CHECK-DAG: ASHR
-; R600-CHECK-DAG: ASHR
-; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_i8
-; SI-CHECK: ds_read_i8
-; SI-CHECK: ds_read_i8
-; SI-CHECK: ds_read_i8
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: ASHR
+; R600-DAG: ASHR
+; R600-DAG: ASHR
+; R600-DAG: ASHR
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i8
+; SI: ds_read_i8
+; SI: ds_read_i8
+; SI: ds_read_i8
 define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(3)* %in
@@ -564,10 +565,10 @@ entry:
 
 ; Load an i16 value from the local address space.
 ; FUNC-LABEL: {{^}}load_i16_local:
-; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_u16
+; R600: LDS_USHORT_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u16
 define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
 entry:
   %0 = load i16	 addrspace(3)* %in
@@ -577,11 +578,11 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_i16_sext_local:
-; R600-CHECK: LDS_USHORT_READ_RET
-; R600-CHECK: ASHR
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_i16
+; R600: LDS_USHORT_READ_RET
+; R600: ASHR
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i16
 define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
 entry:
   %0 = load i16 addrspace(3)* %in
@@ -591,12 +592,12 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v2i16_local:
-; R600-CHECK: LDS_USHORT_READ_RET
-; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_u16
-; SI-CHECK: ds_read_u16
+; R600: LDS_USHORT_READ_RET
+; R600: LDS_USHORT_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u16
+; SI: ds_read_u16
 define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(3)* %in
@@ -606,14 +607,14 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v2i16_sext_local:
-; R600-CHECK-DAG: LDS_USHORT_READ_RET
-; R600-CHECK-DAG: LDS_USHORT_READ_RET
-; R600-CHECK-DAG: ASHR
-; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_i16
-; SI-CHECK: ds_read_i16
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: ASHR
+; R600-DAG: ASHR
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i16
+; SI: ds_read_i16
 define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(3)* %in
@@ -623,16 +624,16 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v4i16_local:
-; R600-CHECK: LDS_USHORT_READ_RET
-; R600-CHECK: LDS_USHORT_READ_RET
-; R600-CHECK: LDS_USHORT_READ_RET
-; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_u16
-; SI-CHECK: ds_read_u16
-; SI-CHECK: ds_read_u16
-; SI-CHECK: ds_read_u16
+; R600: LDS_USHORT_READ_RET
+; R600: LDS_USHORT_READ_RET
+; R600: LDS_USHORT_READ_RET
+; R600: LDS_USHORT_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
 define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(3)* %in
@@ -642,20 +643,20 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_v4i16_sext_local:
-; R600-CHECK-DAG: LDS_USHORT_READ_RET
-; R600-CHECK-DAG: LDS_USHORT_READ_RET
-; R600-CHECK-DAG: LDS_USHORT_READ_RET
-; R600-CHECK-DAG: LDS_USHORT_READ_RET
-; R600-CHECK-DAG: ASHR
-; R600-CHECK-DAG: ASHR
-; R600-CHECK-DAG: ASHR
-; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_i16
-; SI-CHECK: ds_read_i16
-; SI-CHECK: ds_read_i16
-; SI-CHECK: ds_read_i16
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: ASHR
+; R600-DAG: ASHR
+; R600-DAG: ASHR
+; R600-DAG: ASHR
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i16
+; SI: ds_read_i16
+; SI: ds_read_i16
+; SI: ds_read_i16
 define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(3)* %in
@@ -666,10 +667,10 @@ entry:
 
 ; load an i32 value from the local address space.
 ; FUNC-LABEL: {{^}}load_i32_local:
-; R600-CHECK: LDS_READ_RET
-; SI-CHECK-NOT: s_wqm_b64
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_b32
+; R600: LDS_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_b32
 define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = load i32 addrspace(3)* %in
@@ -679,9 +680,9 @@ entry:
 
 ; load a f32 value from the local address space.
 ; FUNC-LABEL: {{^}}load_f32_local:
-; R600-CHECK: LDS_READ_RET
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_b32
+; R600: LDS_READ_RET
+; SI: s_mov_b32 m0
+; SI: ds_read_b32
 define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) {
 entry:
   %0 = load float addrspace(3)* %in
@@ -691,10 +692,10 @@ entry:
 
 ; load a v2f32 value from the local address space
 ; FUNC-LABEL: {{^}}load_v2f32_local:
-; R600-CHECK: LDS_READ_RET
-; R600-CHECK: LDS_READ_RET
-; SI-CHECK: s_mov_b32 m0
-; SI-CHECK: ds_read_b64
+; R600: LDS_READ_RET
+; R600: LDS_READ_RET
+; SI: s_mov_b32 m0
+; SI: ds_read_b64
 define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) {
 entry:
   %0 = load <2 x float> addrspace(3)* %in
@@ -704,11 +705,11 @@ entry:
 
 ; Test loading a i32 and v2i32 value from the same base pointer.
 ; FUNC-LABEL: {{^}}load_i32_v2i32_local:
-; R600-CHECK: LDS_READ_RET
-; R600-CHECK: LDS_READ_RET
-; R600-CHECK: LDS_READ_RET
-; SI-CHECK-DAG: ds_read_b32
-; SI-CHECK-DAG: ds_read2_b32
+; R600: LDS_READ_RET
+; R600: LDS_READ_RET
+; R600: LDS_READ_RET
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_read2_b32
 define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) {
   %scalar = load i32 addrspace(3)* %in
   %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
@@ -726,9 +727,9 @@ define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)
 ; On SI we need to make sure that the base offset is a register and not
 ; an immediate.
 ; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
-; SI-CHECK: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
-; SI-CHECK: ds_read_b32 v0, v[[ZERO]] offset:4
-; R600-CHECK: LDS_READ_RET
+; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
+; SI: ds_read_b32 v0, v[[ZERO]] offset:4
+; R600: LDS_READ_RET
 define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %tmp0 = getelementptr [512 x i32] addrspace(3)* @lds, i32 0, i32 1
diff --git a/test/CodeGen/R600/load.vec.ll b/test/CodeGen/R600/load.vec.ll
index 0d6e213..346d8dc 100644
--- a/test/CodeGen/R600/load.vec.ll
+++ b/test/CodeGen/R600/load.vec.ll
@@ -1,11 +1,12 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK  %s
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK  %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG  %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI  %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI  %s
 
 ; load a v2i32 value from the global address space.
-; EG-CHECK: {{^}}load_v2i32:
-; EG-CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
-; SI-CHECK: {{^}}load_v2i32:
-; SI-CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
+; EG: {{^}}load_v2i32:
+; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
+; SI: {{^}}load_v2i32:
+; SI: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %a = load <2 x i32> addrspace(1) * %in
   store <2 x i32> %a, <2 x i32> addrspace(1)* %out
@@ -13,10 +14,10 @@ define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
 }
 
 ; load a v4i32 value from the global address space.
-; EG-CHECK: {{^}}load_v4i32:
-; EG-CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0
-; SI-CHECK: {{^}}load_v4i32:
-; SI-CHECK: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}]
+; EG: {{^}}load_v4i32:
+; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0
+; SI: {{^}}load_v4i32:
+; SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}]
 define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %a = load <4 x i32> addrspace(1) * %in
   store <4 x i32> %a, <4 x i32> addrspace(1)* %out
diff --git a/test/CodeGen/R600/load64.ll b/test/CodeGen/R600/load64.ll
index a60c4eb..cb3d654 100644
--- a/test/CodeGen/R600/load64.ll
+++ b/test/CodeGen/R600/load64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; load a f64 value from the global address space.
 ; CHECK-LABEL: {{^}}load_f64:
diff --git a/test/CodeGen/R600/local-64.ll b/test/CodeGen/R600/local-64.ll
index eb14b5f..4b45169 100644
--- a/test/CodeGen/R600/local-64.ll
+++ b/test/CodeGen/R600/local-64.ll
@@ -1,8 +1,9 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
 
 ; BOTH-LABEL: {{^}}local_i32_load
-; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 [M0]
+; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
 ; BOTH: buffer_store_dword [[REG]],
 define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
   %gep = getelementptr i32 addrspace(3)* %in, i32 7
@@ -12,7 +13,7 @@ define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounw
 }
 
 ; BOTH-LABEL: {{^}}local_i32_load_0_offset
-; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} [M0]
+; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
 ; BOTH: buffer_store_dword [[REG]],
 define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
   %val = load i32 addrspace(3)* %in, align 4
@@ -22,7 +23,7 @@ define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %
 
 ; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset:
 ; BOTH-NOT: ADD
-; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 [M0]
+; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
 ; BOTH: buffer_store_byte [[REG]],
 define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
   %gep = getelementptr i8 addrspace(3)* %in, i32 65535
@@ -37,7 +38,7 @@ define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
 ; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
 ; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
 ; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
-; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] [M0]
+; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
 ; BOTH: buffer_store_byte [[REG]],
 define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
   %gep = getelementptr i8 addrspace(3)* %in, i32 65536
@@ -48,7 +49,7 @@ define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspa
 
 ; BOTH-LABEL: {{^}}local_i64_load:
 ; BOTH-NOT: ADD
-; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 [M0]
+; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
 ; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
   %gep = getelementptr i64 addrspace(3)* %in, i32 7
@@ -58,7 +59,7 @@ define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounw
 }
 
 ; BOTH-LABEL: {{^}}local_i64_load_0_offset
-; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} [M0]
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 ; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
   %val = load i64 addrspace(3)* %in, align 8
@@ -68,7 +69,7 @@ define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %
 
 ; BOTH-LABEL: {{^}}local_f64_load:
 ; BOTH-NOT: ADD
-; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 [M0]
+; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
 ; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
   %gep = getelementptr double addrspace(3)* %in, i32 7
@@ -78,7 +79,7 @@ define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in)
 }
 
 ; BOTH-LABEL: {{^}}local_f64_load_0_offset
-; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} [M0]
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 ; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
   %val = load double addrspace(3)* %in, align 8
@@ -88,7 +89,7 @@ define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace
 
 ; BOTH-LABEL: {{^}}local_i64_store:
 ; BOTH-NOT: ADD
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 [M0]
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
 define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
   %gep = getelementptr i64 addrspace(3)* %out, i32 7
   store i64 5678, i64 addrspace(3)* %gep, align 8
@@ -97,7 +98,7 @@ define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_i64_store_0_offset:
 ; BOTH-NOT: ADD
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
 define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
   store i64 1234, i64 addrspace(3)* %out, align 8
   ret void
@@ -105,7 +106,7 @@ define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_f64_store:
 ; BOTH-NOT: ADD
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 [M0]
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
 define void @local_f64_store(double addrspace(3)* %out) nounwind {
   %gep = getelementptr double addrspace(3)* %out, i32 7
   store double 16.0, double addrspace(3)* %gep, align 8
@@ -113,7 +114,7 @@ define void @local_f64_store(double addrspace(3)* %out) nounwind {
 }
 
 ; BOTH-LABEL: {{^}}local_f64_store_0_offset
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
 define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
   store double 20.0, double addrspace(3)* %out, align 8
   ret void
@@ -121,8 +122,8 @@ define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_v2i64_store:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120
 ; BOTH: s_endpgm
 define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <2 x i64> addrspace(3)* %out, i32 7
@@ -132,8 +133,8 @@ define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
 ; BOTH: s_endpgm
 define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
   store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
@@ -142,10 +143,10 @@ define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_v4i64_store:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248
 ; BOTH: s_endpgm
 define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <4 x i64> addrspace(3)* %out, i32 7
@@ -155,10 +156,10 @@ define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24
 ; BOTH: s_endpgm
 define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
   store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
diff --git a/test/CodeGen/R600/local-atomics.ll b/test/CodeGen/R600/local-atomics.ll
index 2ac811f..29921b6 100644
--- a/test/CodeGen/R600/local-atomics.ll
+++ b/test/CodeGen/R600/local-atomics.ll
@@ -1,15 +1,16 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
 ; EG: LDS_WRXCHG_RET *
-; SI: s_load_dword [[SPTR:s[0-9]+]],
-; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -18,8 +19,8 @@ define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset:
 ; EG: LDS_WRXCHG_RET *
-; SI: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -30,12 +31,12 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 ; XXX - Is it really necessary to load 4 into VGPR?
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
 ; EG: LDS_ADD_RET *
-; SI: s_load_dword [[SPTR:s[0-9]+]],
-; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -44,8 +45,8 @@ define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset:
 ; EG: LDS_ADD_RET *
-; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -55,9 +56,9 @@ define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset:
 ; EG: LDS_ADD_RET *
-; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} [M0]
-; CI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
@@ -69,10 +70,9 @@ define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32:
 ; EG: LDS_ADD_RET *
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
 define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -81,10 +81,9 @@ define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
 ; EG: LDS_ADD_RET *
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
@@ -94,9 +93,9 @@ define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset:
 ; EG: LDS_ADD_RET *
-; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} [M0]
-; CI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
@@ -108,8 +107,8 @@ define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32:
 ; EG: LDS_SUB_RET *
-; SI: ds_sub_rtn_u32
-; SI: s_endpgm
+; GCN: ds_sub_rtn_u32
+; GCN: s_endpgm
 define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -118,8 +117,8 @@ define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32_offset:
 ; EG: LDS_SUB_RET *
-; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -129,10 +128,9 @@ define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32:
 ; EG: LDS_SUB_RET *
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: ds_dec_rtn_u32  v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_rtn_u32  v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
 define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -141,10 +139,9 @@ define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
 ; EG: LDS_SUB_RET *
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
@@ -154,8 +151,8 @@ define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32:
 ; EG: LDS_AND_RET *
-; SI: ds_and_rtn_b32
-; SI: s_endpgm
+; GCN: ds_and_rtn_b32
+; GCN: s_endpgm
 define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -164,8 +161,8 @@ define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset:
 ; EG: LDS_AND_RET *
-; SI: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -175,8 +172,8 @@ define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32:
 ; EG: LDS_OR_RET *
-; SI: ds_or_rtn_b32
-; SI: s_endpgm
+; GCN: ds_or_rtn_b32
+; GCN: s_endpgm
 define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -185,8 +182,8 @@ define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %pt
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset:
 ; EG: LDS_OR_RET *
-; SI: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -196,8 +193,8 @@ define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32:
 ; EG: LDS_XOR_RET *
-; SI: ds_xor_rtn_b32
-; SI: s_endpgm
+; GCN: ds_xor_rtn_b32
+; GCN: s_endpgm
 define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -206,8 +203,8 @@ define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset:
 ; EG: LDS_XOR_RET *
-; SI: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -225,8 +222,8 @@ define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32:
 ; EG: LDS_MIN_INT_RET *
-; SI: ds_min_rtn_i32
-; SI: s_endpgm
+; GCN: ds_min_rtn_i32
+; GCN: s_endpgm
 define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -235,8 +232,8 @@ define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset:
 ; EG: LDS_MIN_INT_RET *
-; SI: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -246,8 +243,8 @@ define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32:
 ; EG: LDS_MAX_INT_RET *
-; SI: ds_max_rtn_i32
-; SI: s_endpgm
+; GCN: ds_max_rtn_i32
+; GCN: s_endpgm
 define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -256,8 +253,8 @@ define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset:
 ; EG: LDS_MAX_INT_RET *
-; SI: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -267,8 +264,8 @@ define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32:
 ; EG: LDS_MIN_UINT_RET *
-; SI: ds_min_rtn_u32
-; SI: s_endpgm
+; GCN: ds_min_rtn_u32
+; GCN: s_endpgm
 define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -277,8 +274,8 @@ define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset:
 ; EG: LDS_MIN_UINT_RET *
-; SI: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -288,8 +285,8 @@ define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32:
 ; EG: LDS_MAX_UINT_RET *
-; SI: ds_max_rtn_u32
-; SI: s_endpgm
+; GCN: ds_max_rtn_u32
+; GCN: s_endpgm
 define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -298,8 +295,8 @@ define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset:
 ; EG: LDS_MAX_UINT_RET *
-; SI: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -308,19 +305,19 @@ define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32:
-; SI: s_load_dword [[SPTR:s[0-9]+]],
-; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
-; SI: s_endpgm
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: s_endpgm
 define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset:
-; SI: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -329,19 +326,19 @@ define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 
 ; XXX - Is it really necessary to load 4 into VGPR?
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32:
-; SI: s_load_dword [[SPTR:s[0-9]+]],
-; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: ds_add_u32 [[VPTR]], [[DATA]] [M0]
-; SI: s_endpgm
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_add_u32 [[VPTR]], [[DATA]]
+; GCN: s_endpgm
 define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset:
-; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -349,9 +346,9 @@ define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset
-; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} [M0]
-; CI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 [M0]
-; SI: s_endpgm
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
@@ -361,20 +358,18 @@ define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32:
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] [M0]
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
 define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
@@ -383,8 +378,8 @@ define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset:
 ; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}}
-; CI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; CIVI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
@@ -394,16 +389,16 @@ define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32:
-; SI: ds_sub_u32
-; SI: s_endpgm
+; GCN: ds_sub_u32
+; GCN: s_endpgm
 define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
-; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -411,20 +406,18 @@ define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: ds_dec_u32  v{{[0-9]+}}, [[NEGONE]]
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32  v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
 define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
@@ -432,16 +425,16 @@ define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
-; SI: ds_and_b32
-; SI: s_endpgm
+; GCN: ds_and_b32
+; GCN: s_endpgm
 define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
-; SI: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -449,16 +442,16 @@ define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
-; SI: ds_or_b32
-; SI: s_endpgm
+; GCN: ds_or_b32
+; GCN: s_endpgm
 define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
-; SI: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -466,16 +459,16 @@ define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
-; SI: ds_xor_b32
-; SI: s_endpgm
+; GCN: ds_xor_b32
+; GCN: s_endpgm
 define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
-; SI: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -490,16 +483,16 @@ define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32:
-; SI: ds_min_i32
-; SI: s_endpgm
+; GCN: ds_min_i32
+; GCN: s_endpgm
 define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset:
-; SI: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -507,16 +500,16 @@ define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32:
-; SI: ds_max_i32
-; SI: s_endpgm
+; GCN: ds_max_i32
+; GCN: s_endpgm
 define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset:
-; SI: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -524,16 +517,16 @@ define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32:
-; SI: ds_min_u32
-; SI: s_endpgm
+; GCN: ds_min_u32
+; GCN: s_endpgm
 define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset:
-; SI: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -541,16 +534,16 @@ define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32:
-; SI: ds_max_u32
-; SI: s_endpgm
+; GCN: ds_max_u32
+; GCN: s_endpgm
 define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset:
-; SI: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
diff --git a/test/CodeGen/R600/local-atomics64.ll b/test/CodeGen/R600/local-atomics64.ll
index ce0cf59..50d039f 100644
--- a/test/CodeGen/R600/local-atomics64.ll
+++ b/test/CodeGen/R600/local-atomics64.ll
@@ -1,8 +1,9 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64:
-; SI: ds_wrxchg_rtn_b64
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b64
+; GCN: s_endpgm
 define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -10,8 +11,8 @@ define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
-; SI: ds_wrxchg_rtn_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -20,8 +21,8 @@ define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64:
-; SI: ds_add_rtn_u64
-; SI: s_endpgm
+; GCN: ds_add_rtn_u64
+; GCN: s_endpgm
 define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -29,14 +30,14 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
 ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
-; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
-; SI: buffer_store_dwordx2 [[RESULT]],
-; SI: s_endpgm
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i64 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
@@ -45,12 +46,11 @@ define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64:
-; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
-; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
-; SI: buffer_store_dwordx2 [[RESULT]],
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -58,8 +58,8 @@ define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset:
-; SI: ds_inc_rtn_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_inc_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
@@ -68,8 +68,8 @@ define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64:
-; SI: ds_sub_rtn_u64
-; SI: s_endpgm
+; GCN: ds_sub_rtn_u64
+; GCN: s_endpgm
 define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -77,8 +77,8 @@ define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
-; SI: ds_sub_rtn_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_sub_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -87,12 +87,11 @@ define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64:
-; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
-; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
-; SI: buffer_store_dwordx2 [[RESULT]],
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -100,8 +99,8 @@ define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
-; SI: ds_dec_rtn_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_dec_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
@@ -110,8 +109,8 @@ define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64:
-; SI: ds_and_rtn_b64
-; SI: s_endpgm
+; GCN: ds_and_rtn_b64
+; GCN: s_endpgm
 define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -119,8 +118,8 @@ define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
-; SI: ds_and_rtn_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_and_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -129,8 +128,8 @@ define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64:
-; SI: ds_or_rtn_b64
-; SI: s_endpgm
+; GCN: ds_or_rtn_b64
+; GCN: s_endpgm
 define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -138,8 +137,8 @@ define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %pt
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
-; SI: ds_or_rtn_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_or_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -148,8 +147,8 @@ define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64:
-; SI: ds_xor_rtn_b64
-; SI: s_endpgm
+; GCN: ds_xor_rtn_b64
+; GCN: s_endpgm
 define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -157,8 +156,8 @@ define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
-; SI: ds_xor_rtn_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_xor_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -175,8 +174,8 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64:
-; SI: ds_min_rtn_i64
-; SI: s_endpgm
+; GCN: ds_min_rtn_i64
+; GCN: s_endpgm
 define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -184,8 +183,8 @@ define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
-; SI: ds_min_rtn_i64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_min_rtn_i64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -194,8 +193,8 @@ define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64:
-; SI: ds_max_rtn_i64
-; SI: s_endpgm
+; GCN: ds_max_rtn_i64
+; GCN: s_endpgm
 define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -203,8 +202,8 @@ define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
-; SI: ds_max_rtn_i64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_max_rtn_i64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -213,8 +212,8 @@ define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64:
-; SI: ds_min_rtn_u64
-; SI: s_endpgm
+; GCN: ds_min_rtn_u64
+; GCN: s_endpgm
 define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -222,8 +221,8 @@ define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
-; SI: ds_min_rtn_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_min_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -232,8 +231,8 @@ define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64:
-; SI: ds_max_rtn_u64
-; SI: s_endpgm
+; GCN: ds_max_rtn_u64
+; GCN: s_endpgm
 define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -241,8 +240,8 @@ define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
-; SI: ds_max_rtn_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_max_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -251,16 +250,16 @@ define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64:
-; SI: ds_wrxchg_rtn_b64
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b64
+; GCN: s_endpgm
 define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
-; SI: ds_wrxchg_rtn_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -268,8 +267,8 @@ define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64:
-; SI: ds_add_u64
-; SI: s_endpgm
+; GCN: ds_add_u64
+; GCN: s_endpgm
 define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
@@ -277,12 +276,12 @@ define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
 ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
-; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
-; SI: s_endpgm
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i64 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
@@ -290,19 +289,18 @@ define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64:
-; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
-; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: s_endpgm
 define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset:
-; SI: ds_inc_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_inc_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
@@ -310,16 +308,16 @@ define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64:
-; SI: ds_sub_u64
-; SI: s_endpgm
+; GCN: ds_sub_u64
+; GCN: s_endpgm
 define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
-; SI: ds_sub_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_sub_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -327,19 +325,18 @@ define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64:
-; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
-; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: s_endpgm
 define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
-; SI: ds_dec_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_dec_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
@@ -347,16 +344,16 @@ define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64:
-; SI: ds_and_b64
-; SI: s_endpgm
+; GCN: ds_and_b64
+; GCN: s_endpgm
 define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
-; SI: ds_and_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_and_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -364,16 +361,16 @@ define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64:
-; SI: ds_or_b64
-; SI: s_endpgm
+; GCN: ds_or_b64
+; GCN: s_endpgm
 define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
-; SI: ds_or_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_or_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -381,16 +378,16 @@ define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64:
-; SI: ds_xor_b64
-; SI: s_endpgm
+; GCN: ds_xor_b64
+; GCN: s_endpgm
 define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
-; SI: ds_xor_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_xor_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -405,16 +402,16 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64:
-; SI: ds_min_i64
-; SI: s_endpgm
+; GCN: ds_min_i64
+; GCN: s_endpgm
 define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
-; SI: ds_min_i64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_min_i64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -422,16 +419,16 @@ define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64:
-; SI: ds_max_i64
-; SI: s_endpgm
+; GCN: ds_max_i64
+; GCN: s_endpgm
 define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
-; SI: ds_max_i64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_max_i64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -439,16 +436,16 @@ define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64:
-; SI: ds_min_u64
-; SI: s_endpgm
+; GCN: ds_min_u64
+; GCN: s_endpgm
 define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
-; SI: ds_min_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_min_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -456,16 +453,16 @@ define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64:
-; SI: ds_max_u64
-; SI: s_endpgm
+; GCN: ds_max_u64
+; GCN: s_endpgm
 define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
-; SI: ds_max_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_max_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index 88ef05d..3d90ab1 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -1,38 +1,38 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=SI %s
-; RUN: llc < %s -march=r600 -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=CI %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=CI %s
 
 @local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 
-; EG-CHECK: {{^}}local_memory_two_objects:
+; EG: {{^}}local_memory_two_objects:
 
 ; Check that the LDS size emitted correctly
-; EG-CHECK: .long 166120
-; EG-CHECK-NEXT: .long 8
-; SI-CHECK: .long 47180
-; SI-CHECK-NEXT: .long 32768
+; EG: .long 166120
+; EG-NEXT: .long 8
+; GCN: .long 47180
+; GCN-NEXT: .long 38792
 
 ; We would like to check the the lds writes are using different
 ; addresses, but due to variations in the scheduler, we can't do
 ; this consistently on evergreen GPUs.
-; EG-CHECK: LDS_WRITE
-; EG-CHECK: LDS_WRITE
-; SI-CHECK: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
-; SI-CHECK-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]]
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+; GCN: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
+; GCN-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]]
 
 ; GROUP_BARRIER must be the last instruction in a clause
-; EG-CHECK: GROUP_BARRIER
-; EG-CHECK-NEXT: ALU clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
 
 ; Make sure the lds reads are using different addresses, at different
 ; constant offsets.
-; EG-CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
-; EG-CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
+; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
 ; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
-; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] [M0]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 [M0]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] [M0]
+; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] offset:16
 
 define void @local_memory_two_objects(i32 addrspace(1)* %out) {
 entry:
diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll
index 9b13cb2..68e72c5 100644
--- a/test/CodeGen/R600/local-memory.ll
+++ b/test/CodeGen/R600/local-memory.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
 @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
 
@@ -10,9 +10,9 @@
 ; EG: .long 166120
 ; EG-NEXT: .long 128
 ; SI: .long 47180
-; SI-NEXT: .long 65536
+; SI-NEXT: .long 71560
 ; CI: .long 47180
-; CI-NEXT: .long 32768
+; CI-NEXT: .long 38792
 
 ; EG: LDS_WRITE
 ; SI-NOT: s_wqm_b64
diff --git a/test/CodeGen/R600/loop-address.ll b/test/CodeGen/R600/loop-address.ll
index b46d8e9..03e0f01 100644
--- a/test/CodeGen/R600/loop-address.ll
+++ b/test/CodeGen/R600/loop-address.ll
@@ -31,7 +31,7 @@ attributes #0 = { nounwind "fp-contract-model"="standard" "relocation-model"="pi
 
 !opencl.kernels = !{!0, !1, !2, !3}
 
-!0 = metadata !{void (i32 addrspace(1)*, i32)* @loop_ge}
-!1 = metadata !{null}
-!2 = metadata !{null}
-!3 = metadata !{null}
+!0 = !{void (i32 addrspace(1)*, i32)* @loop_ge}
+!1 = !{null}
+!2 = !{null}
+!3 = !{null}
diff --git a/test/CodeGen/R600/loop-idiom.ll b/test/CodeGen/R600/loop-idiom.ll
index 0478bdb..a0b00ab 100644
--- a/test/CodeGen/R600/loop-idiom.ll
+++ b/test/CodeGen/R600/loop-idiom.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 target triple = "r600--"
diff --git a/test/CodeGen/R600/lshl.ll b/test/CodeGen/R600/lshl.ll
index 9785866..9ac988d 100644
--- a/test/CodeGen/R600/lshl.ll
+++ b/test/CodeGen/R600/lshl.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
 
diff --git a/test/CodeGen/R600/lshr.ll b/test/CodeGen/R600/lshr.ll
index acfc1fd..50e444a 100644
--- a/test/CodeGen/R600/lshr.ll
+++ b/test/CodeGen/R600/lshr.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
 
diff --git a/test/CodeGen/R600/m0-spill.ll b/test/CodeGen/R600/m0-spill.ll
index a8b0e0d..4dade82 100644
--- a/test/CodeGen/R600/m0-spill.ll
+++ b/test/CodeGen/R600/m0-spill.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 @lds = external addrspace(3) global [64 x float]
 
diff --git a/test/CodeGen/R600/mad-combine.ll b/test/CodeGen/R600/mad-combine.ll
new file mode 100644
index 0000000..8c4e09b
--- /dev/null
+++ b/test/CodeGen/R600/mad-combine.ll
@@ -0,0 +1,567 @@
+; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.
+
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+
+; Make sure we don't form mad with denormals
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.fma.f32(float, float, float) #0
+declare float @llvm.fmuladd.f32(float, float, float) #0
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF-NOT: v_fma
+; SI-DENORM-SLOWFMAF-NOT: v_mad
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %gep.0
+  %b = load float addrspace(1)* %gep.1
+  %c = load float addrspace(1)* %gep.2
+
+  %mul = fmul float %a, %b
+  %fma = fadd float %mul, %c
+  store float %fma, float addrspace(1)* %gep.out
+  ret void
+}
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0
+  %b = load float addrspace(1)* %gep.1
+  %c = load float addrspace(1)* %gep.2
+  %d = load float addrspace(1)* %gep.3
+
+  %mul = fmul float %a, %b
+  %fma0 = fadd float %mul, %c
+  %fma1 = fadd float %mul, %d
+
+  store float %fma0, float addrspace(1)* %gep.out.0
+  store float %fma1, float addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fadd x, (fmul y, z)) -> (fma y, z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %gep.0
+  %b = load float addrspace(1)* %gep.1
+  %c = load float addrspace(1)* %gep.2
+
+  %mul = fmul float %a, %b
+  %fma = fadd float %c, %mul
+  store float %fma, float addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %gep.0
+  %b = load float addrspace(1)* %gep.1
+  %c = load float addrspace(1)* %gep.2
+
+  %mul = fmul float %a, %b
+  %fma = fsub float %mul, %c
+  store float %fma, float addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0
+  %b = load float addrspace(1)* %gep.1
+  %c = load float addrspace(1)* %gep.2
+  %d = load float addrspace(1)* %gep.3
+
+  %mul = fmul float %a, %b
+  %fma0 = fsub float %mul, %c
+  %fma1 = fsub float %mul, %d
+  store float %fma0, float addrspace(1)* %gep.out.0
+  store float %fma1, float addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %gep.0
+  %b = load float addrspace(1)* %gep.1
+  %c = load float addrspace(1)* %gep.2
+
+  %mul = fmul float %a, %b
+  %fma = fsub float %c, %mul
+  store float %fma, float addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0
+  %b = load float addrspace(1)* %gep.1
+  %c = load float addrspace(1)* %gep.2
+  %d = load float addrspace(1)* %gep.3
+
+  %mul = fmul float %a, %b
+  %fma0 = fsub float %c, %mul
+  %fma1 = fsub float %d, %mul
+  store float %fma0, float addrspace(1)* %gep.out.0
+  store float %fma1, float addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %gep.0
+  %b = load float addrspace(1)* %gep.1
+  %c = load float addrspace(1)* %gep.2
+
+  %mul = fmul float %a, %b
+  %mul.neg = fsub float -0.0, %mul
+  %fma = fsub float %mul.neg, %c
+
+  store float %fma, float addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0
+  %b = load float addrspace(1)* %gep.1
+  %c = load float addrspace(1)* %gep.2
+  %d = load float addrspace(1)* %gep.3
+
+  %mul = fmul float %a, %b
+  %mul.neg = fsub float -0.0, %mul
+  %fma0 = fsub float %mul.neg, %c
+  %fma1 = fsub float %mul.neg, %d
+
+  store float %fma0, float addrspace(1)* %gep.out.0
+  store float %fma1, float addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0
+  %b = load float addrspace(1)* %gep.1
+  %c = load float addrspace(1)* %gep.2
+  %d = load float addrspace(1)* %gep.3
+
+  %mul = fmul float %a, %b
+  %mul.neg = fsub float -0.0, %mul
+  %fma0 = fsub float %mul.neg, %c
+  %fma1 = fsub float %mul, %d
+
+  store float %fma0, float addrspace(1)* %gep.out.0
+  store float %fma1, float addrspace(1)* %gep.out.1
+  ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+
+; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %z = load float addrspace(1)* %gep.2
+  %u = load float addrspace(1)* %gep.3
+  %v = load float addrspace(1)* %gep.4
+
+  %tmp0 = fmul float %u, %v
+  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
+  %tmp2 = fsub float %tmp1, %z
+
+  store float %tmp2, float addrspace(1)* %gep.out
+  ret void
+}
+
+; fold (fsub x, (fma y, z, (fmul u, v)))
+;   -> (fma (fneg y), z, (fma (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+
+; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %z = load float addrspace(1)* %gep.2
+  %u = load float addrspace(1)* %gep.3
+  %v = load float addrspace(1)* %gep.4
+
+  %tmp0 = fmul float %u, %v
+  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
+  %tmp2 = fsub float %x, %tmp1
+
+  store float %tmp2, float addrspace(1)* %gep.out
+  ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+
+; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %z = load float addrspace(1)* %gep.2
+  %u = load float addrspace(1)* %gep.3
+  %v = load float addrspace(1)* %gep.4
+
+  %tmp0 = fmul float %u, %v
+  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
+  %tmp2 = fsub float %tmp1, %z
+
+  store float %tmp2, float addrspace(1)* %gep.out
+  ret void
+}
+
+; fold (fsub x, (fmuladd y, z, (fmul u, v)))
+;   -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
+
+; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %z = load float addrspace(1)* %gep.2
+  %u = load float addrspace(1)* %gep.3
+  %v = load float addrspace(1)* %gep.4
+
+  %tmp0 = fmul float %u, %v
+  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
+  %tmp2 = fsub float %x, %tmp1
+
+  store float %tmp2, float addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/R600/mad-sub.ll b/test/CodeGen/R600/mad-sub.ll
index 240abd0..7b4020d 100644
--- a/test/CodeGen/R600/mad-sub.ll
+++ b/test/CodeGen/R600/mad-sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #0
 declare float @llvm.fabs.f32(float) #0
@@ -171,7 +171,7 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float
 
 ; FUNC-LABEL: {{^}}fsub_c_fadd_a_a:
 ; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
 ; SI: buffer_store_dword [[RESULT]]
 define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) {
@@ -192,7 +192,7 @@ define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in)
 
 ; FUNC-LABEL: {{^}}fsub_fadd_a_a_c:
 ; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
 ; SI: buffer_store_dword [[RESULT]]
 define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) {
diff --git a/test/CodeGen/R600/mad_int24.ll b/test/CodeGen/R600/mad_int24.ll
index c8dd377..86d75a6 100644
--- a/test/CodeGen/R600/mad_int24.ll
+++ b/test/CodeGen/R600/mad_int24.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
 declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
 
diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll
index b7b32fe..95fe341 100644
--- a/test/CodeGen/R600/mad_uint24.ll
+++ b/test/CodeGen/R600/mad_uint24.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
 ; FUNC-LABEL: {{^}}u32_mad24:
 ; EG: MULADD_UINT24
diff --git a/test/CodeGen/R600/madak.ll b/test/CodeGen/R600/madak.ll
new file mode 100644
index 0000000..505a49b
--- /dev/null
+++ b/test/CodeGen/R600/madak.ll
@@ -0,0 +1,193 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+; FIXME: Enable VI
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; GCN-LABEL: {{^}}madak_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_madak_f32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000
+define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %b = load float addrspace(1)* %in.b.gep, align 4
+
+  %mul = fmul float %a, %b
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Make sure this is only folded with one use. This is a code size
+; optimization and if we fold the immediate multiple times, we'll undo
+; it.
+
+; GCN-LABEL: {{^}}madak_2_use_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]]
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]]
+; GCN: s_endpgm
+define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+  %in.gep.2 = getelementptr float addrspace(1)* %in.gep.0, i32 2
+
+  %out.gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %out.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+
+  %a = load float addrspace(1)* %in.gep.0, align 4
+  %b = load float addrspace(1)* %in.gep.1, align 4
+  %c = load float addrspace(1)* %in.gep.2, align 4
+
+  %mul0 = fmul float %a, %b
+  %mul1 = fmul float %a, %c
+  %madak0 = fadd float %mul0, 10.0
+  %madak1 = fadd float %mul1, 10.0
+
+  store float %madak0, float addrspace(1)* %out.gep.0, align 4
+  store float %madak1, float addrspace(1)* %out.gep.1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
+define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+
+  %mul = fmul float 4.0, %a
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Make sure nothing weird happens with a value that is also allowed as
+; an inline immediate.
+
+; GCN-LABEL: {{^}}madak_inline_imm_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
+define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %b = load float addrspace(1)* %in.b.gep, align 4
+
+  %mul = fmul float %a, %b
+  %madak = fadd float %mul, 4.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; We can't use an SGPR when forming madak
+; GCN-LABEL: {{^}}s_v_madak_f32:
+; GCN: s_load_dword [[SB:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
+; GCN-NOT: v_madak_f32
+; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]]
+define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+
+  %mul = fmul float %a, %b
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: @v_s_madak_f32
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
+; GCN-NOT: v_madak_f32
+; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]]
+define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %b = load float addrspace(1)* %in.b.gep, align 4
+
+  %mul = fmul float %a, %b
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_s_madak_f32:
+; GCN-NOT: v_madak_f32
+; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+  %mul = fmul float %a, %b
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
+; GCN: s_endpgm
+define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %b = load float addrspace(1)* %in.b.gep, align 4
+
+  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
+
+  %mul = fmul float %a.fabs, %b
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
+; GCN: s_endpgm
+define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %b = load float addrspace(1)* %in.b.gep, align 4
+
+  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
+
+  %mul = fmul float %a, %b.fabs
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/madmk.ll b/test/CodeGen/R600/madmk.ll
new file mode 100644
index 0000000..249e48e
--- /dev/null
+++ b/test/CodeGen/R600/madmk.ll
@@ -0,0 +1,181 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; GCN-LABEL: {{^}}madmk_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_madmk_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
+define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}madmk_2_use_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]]
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]]
+; GCN: s_endpgm
+define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+  %in.gep.2 = getelementptr float addrspace(1)* %in.gep.0, i32 2
+
+  %out.gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %out.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+
+  %a = load float addrspace(1)* %in.gep.0, align 4
+  %b = load float addrspace(1)* %in.gep.1, align 4
+  %c = load float addrspace(1)* %in.gep.2, align 4
+
+  %mul0 = fmul float %a, 10.0
+  %mul1 = fmul float %a, 10.0
+  %madmk0 = fadd float %mul0, %b
+  %madmk1 = fadd float %mul1, %c
+
+  store float %madmk0, float addrspace(1)* %out.gep.0, align 4
+  store float %madmk1, float addrspace(1)* %out.gep.1, align 4
+  ret void
+}
+
+; We don't get any benefit if the constant is an inline immediate.
+; GCN-LABEL: {{^}}madmk_inline_imm_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]]
+define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %mul = fmul float %a, 4.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_s_madmk_f32:
+; GCN-NOT: v_madmk_f32
+; GCN: v_mad_f32
+; GCN: s_endpgm
+define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_s_madmk_f32:
+; GCN-NOT: v_madmk_f32
+; GCN: v_mad_f32
+; GCN: s_endpgm
+define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.0, align 4
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_vector_madmk_f32:
+; GCN-NOT: v_madmk_f32
+; GCN: v_mad_f32
+; GCN: s_endpgm
+define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+  %b = load float addrspace(1)* %gep.0, align 4
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
+define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
+
+  %mul = fmul float %a.fabs, 10.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}|
+define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, %b.fabs
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}madmk_add_inline_imm_f32:
+; GCN: buffer_load_dword [[A:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0
+define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %gep.0, align 4
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, 2.0
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/max.ll b/test/CodeGen/R600/max.ll
index d67ef47..20af993 100644
--- a/test/CodeGen/R600/max.ll
+++ b/test/CodeGen/R600/max.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
diff --git a/test/CodeGen/R600/max3.ll b/test/CodeGen/R600/max3.ll
index 74b08f6..f905e17 100644
--- a/test/CodeGen/R600/max3.ll
+++ b/test/CodeGen/R600/max3.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
diff --git a/test/CodeGen/R600/min.ll b/test/CodeGen/R600/min.ll
index 88c0dff..00ba5c6 100644
--- a/test/CodeGen/R600/min.ll
+++ b/test/CodeGen/R600/min.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
@@ -97,3 +97,24 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: @v_test_umin_ult_i32_multi_use
+; SI-NOT: v_min
+; SI: v_cmp_lt_u32
+; SI-NEXT: v_cndmask_b32
+; SI-NOT: v_min
+; SI: s_endpgm
+define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %outgep0 = getelementptr i32 addrspace(1)* %out0, i32 %tid
+  %outgep1 = getelementptr i1 addrspace(1)* %out1, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp ult i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %outgep0, align 4
+  store i1 %cmp, i1 addrspace(1)* %outgep1
+  ret void
+}
diff --git a/test/CodeGen/R600/min3.ll b/test/CodeGen/R600/min3.ll
index f852cff..6c11a65 100644
--- a/test/CodeGen/R600/min3.ll
+++ b/test/CodeGen/R600/min3.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
diff --git a/test/CodeGen/R600/missing-store.ll b/test/CodeGen/R600/missing-store.ll
index 5346046..8ddef35 100644
--- a/test/CodeGen/R600/missing-store.ll
+++ b/test/CodeGen/R600/missing-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
 
 @ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8
 
diff --git a/test/CodeGen/R600/mubuf.ll b/test/CodeGen/R600/mubuf.ll
index c2efda4..988e5c1 100644
--- a/test/CodeGen/R600/mubuf.ll
+++ b/test/CodeGen/R600/mubuf.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
 
 declare i32 @llvm.r600.read.tidig.x() readnone
 
@@ -8,7 +8,7 @@ declare i32 @llvm.r600.read.tidig.x() readnone
 
 ; MUBUF load with an immediate byte offset that fits into 12-bits
 ; CHECK-LABEL: {{^}}mubuf_load0:
-; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0x4 ; encoding: [0x04,0x00,0x30,0xe0
+; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
 define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %in, i64 1
@@ -19,7 +19,7 @@ entry:
 
 ; MUBUF load with the largest possible immediate offset
 ; CHECK-LABEL: {{^}}mubuf_load1:
-; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0xfff ; encoding: [0xff,0x0f,0x20,0xe0
+; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
 define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i8 addrspace(1)* %in, i64 4095
@@ -30,7 +30,8 @@ entry:
 
 ; MUBUF load with an immediate byte offset that doesn't fit into 12-bits
 ; CHECK-LABEL: {{^}}mubuf_load2:
-; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 ; encoding: [0x00,0x80,0x30,0xe0
+; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
+; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
 define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %in, i64 1024
@@ -42,7 +43,7 @@ entry:
 ; MUBUF load with a 12-bit immediate offset and a register offset
 ; CHECK-LABEL: {{^}}mubuf_load3:
 ; CHECK-NOT: ADD
-; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x30,0xe0
+; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0
 define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %in, i64 %offset
@@ -52,13 +53,46 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: {{^}}soffset_max_imm:
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc
+define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
+main_body:
+  %tmp0 = getelementptr [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
+  %tmp1 = load <16 x i8> addrspace(2)* %tmp0
+  %tmp2 = shl i32 %6, 2
+  %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %tmp4 = add i32 %6, 16
+  %tmp5 = bitcast float 0.0 to i32
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  ret void
+}
+
+; Make sure immediates that aren't inline constants don't get folded into
+; the soffset operand.
+; FIXME: for this test we should be smart enough to shift the immediate into
+; the offset field.
+; CHECK-LABEL: {{^}}soffset_no_fold:
+; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41
+; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc
+define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
+main_body:
+  %tmp0 = getelementptr [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
+  %tmp1 = load <16 x i8> addrspace(2)* %tmp0
+  %tmp2 = shl i32 %6, 2
+  %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %tmp4 = add i32 %6, 16
+  %tmp5 = bitcast float 0.0 to i32
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  ret void
+}
+
 ;;;==========================================================================;;;
 ;;; MUBUF STORE TESTS
 ;;;==========================================================================;;;
 
 ; MUBUF store with an immediate byte offset that fits into 12-bits
 ; CHECK-LABEL: {{^}}mubuf_store0:
-; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0x4 ; encoding: [0x04,0x00,0x70,0xe0
+; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
 define void @mubuf_store0(i32 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %out, i64 1
@@ -68,7 +102,7 @@ entry:
 
 ; MUBUF store with the largest possible immediate offset
 ; CHECK-LABEL: {{^}}mubuf_store1:
-; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0xfff ; encoding: [0xff,0x0f,0x60,0xe0
+; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0
 
 define void @mubuf_store1(i8 addrspace(1)* %out) {
 entry:
@@ -79,7 +113,8 @@ entry:
 
 ; MUBUF store with an immediate byte offset that doesn't fit into 12-bits
 ; CHECK-LABEL: {{^}}mubuf_store2:
-; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0 addr64 ; encoding: [0x00,0x80,0x70,0xe0
+; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
+; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
 define void @mubuf_store2(i32 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %out, i64 1024
@@ -90,7 +125,7 @@ entry:
 ; MUBUF store with a 12-bit immediate offset and a register offset
 ; CHECK-LABEL: {{^}}mubuf_store3:
 ; CHECK-NOT: ADD
-; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x70,0xe0
+; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0
 define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %out, i64 %offset
@@ -107,7 +142,7 @@ define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
 }
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
-; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:0x28
+; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
 define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
   %out.gep = getelementptr i32 addrspace(1)* %out, i32 10
   store i32 99, i32 addrspace(1)* %out.gep, align 4
@@ -115,13 +150,23 @@ define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
 }
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
-; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
+; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
 define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
   %out.gep = getelementptr i32 addrspace(1)* %out, i32 32768
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
 }
 
+; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
+; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
+; CHECK: buffer_atomic_add v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
+define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr i32 addrspace(1)* %out, i32 32768
+  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst
+  ret void
+}
+
 ; CHECK-LABEL: {{^}}store_vgpr_ptr:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
 define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
@@ -130,3 +175,9 @@ define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
 }
+
+declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" }
+attributes #3 = { nounwind readonly }
diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll
index be5d6a0..6f15e70 100644
--- a/test/CodeGen/R600/mul.ll
+++ b/test/CodeGen/R600/mul.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
-; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; mul24 and mad24 are affected
 
diff --git a/test/CodeGen/R600/mul_int24.ll b/test/CodeGen/R600/mul_int24.ll
index be58f7e..7609dcc 100644
--- a/test/CodeGen/R600/mul_int24.ll
+++ b/test/CodeGen/R600/mul_int24.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
 ; FUNC-LABEL: {{^}}i32_mul24:
 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll
index 8d1cda8..e640a7c 100644
--- a/test/CodeGen/R600/mul_uint24.ll
+++ b/test/CodeGen/R600/mul_uint24.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
 ; FUNC-LABEL: {{^}}u32_mul24:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
diff --git a/test/CodeGen/R600/mulhu.ll b/test/CodeGen/R600/mulhu.ll
index 82a0783..29b0944 100644
--- a/test/CodeGen/R600/mulhu.ll
+++ b/test/CodeGen/R600/mulhu.ll
@@ -1,7 +1,8 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
-;CHECK: v_mul_hi_u32 v0, {{[sv][0-9]+}}, {{v[0-9]+}}
+;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
 ;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
 
 define void @test(i32 %p) {
diff --git a/test/CodeGen/R600/no-initializer-constant-addrspace.ll b/test/CodeGen/R600/no-initializer-constant-addrspace.ll
index cd2dca3..532edf0 100644
--- a/test/CodeGen/R600/no-initializer-constant-addrspace.ll
+++ b/test/CodeGen/R600/no-initializer-constant-addrspace.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -o /dev/null %s
+; RUN: llc -march=amdgcn -mcpu=SI -o /dev/null %s
+; RUN: llc -march=amdgcn -mcpu=tonga -o /dev/null %s
 ; RUN: llc -march=r600 -mcpu=cypress -o /dev/null %s
 
 @extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
diff --git a/test/CodeGen/R600/no-shrink-extloads.ll b/test/CodeGen/R600/no-shrink-extloads.ll
new file mode 100644
index 0000000..3079492
--- /dev/null
+++ b/test/CodeGen/R600/no-shrink-extloads.ll
@@ -0,0 +1,191 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; Make sure we don't turn the 32-bit argument load into a 16-bit
+; load. There aren't extending scalar lods, so that would require
+; using a buffer_load instruction.
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
+; SI: s_load_dword s
+; SI: buffer_store_short v
+define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
+  %trunc = trunc i32 %arg to i16
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; It should be OK (and probably performance neutral) to reduce this,
+; but we don't know if the load is uniform yet.
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
+; SI: buffer_load_dword v
+; SI: buffer_store_short v
+define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i16 addrspace(1)* %out, i32 %tid
+  %load = load i32 addrspace(1)* %gep.in
+  %trunc = trunc i32 %load to i16
+  store i16 %trunc, i16 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
+  %trunc = trunc i32 %arg to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+  %load = load i32 addrspace(1)* %gep.in
+  %trunc = trunc i32 %load to i8
+  store i8 %trunc, i8 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
+  %trunc = trunc i32 %arg to i1
+  store i1 %trunc, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i1 addrspace(1)* %out, i32 %tid
+  %load = load i32 addrspace(1)* %gep.in
+  %trunc = trunc i32 %load to i1
+  store i1 %trunc, i1 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
+; SI: s_load_dword s
+; SI: buffer_store_dword v
+define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+  %trunc = trunc i64 %arg to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %load = load i64 addrspace(1)* %gep.in
+  %trunc = trunc i64 %load to i32
+  store i32 %trunc, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
+; SI: s_load_dword s
+; SI: buffer_store_dword v
+define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+  %srl = lshr i64 %arg, 32
+  %trunc = trunc i64 %srl to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %load = load i64 addrspace(1)* %gep.in
+  %srl = lshr i64 %load, 32
+  %trunc = trunc i64 %srl to i32
+  store i32 %trunc, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; Might as well reduce to 8-bit loads.
+; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
+  %trunc = trunc i16 %arg to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
+; SI: buffer_load_ubyte v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i16 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+  %load = load i16 addrspace(1)* %gep.in
+  %trunc = trunc i16 %load to i8
+  store i8 %trunc, i8 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+  %srl = lshr i64 %arg, 32
+  %trunc = trunc i64 %srl to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+  %load = load i64 addrspace(1)* %gep.in
+  %srl = lshr i64 %load, 32
+  %trunc = trunc i64 %srl to i8
+  store i8 %trunc, i8 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+  %trunc = trunc i64 %arg to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+  %load = load i64 addrspace(1)* %gep.in
+  %trunc = trunc i64 %load to i8
+  store i8 %trunc, i8 addrspace(1)* %gep.out
+  ret void
+}
diff --git a/test/CodeGen/R600/operand-folding.ll b/test/CodeGen/R600/operand-folding.ll
new file mode 100644
index 0000000..88a8145
--- /dev/null
+++ b/test/CodeGen/R600/operand-folding.ll
@@ -0,0 +1,113 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: {{^}}fold_sgpr:
+; CHECK: v_add_i32_e32 v{{[0-9]+}}, s
+define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {
+entry:
+  %tmp0 = icmp ne i32 %fold, 0
+  br i1 %tmp0, label %if, label %endif
+
+if:
+  %id = call i32 @llvm.r600.read.tidig.x()
+  %offset = add i32 %fold, %id
+  %tmp1 = getelementptr i32 addrspace(1)* %out, i32 %offset
+  store i32 0, i32 addrspace(1)* %tmp1
+  br label %endif
+
+endif:
+  ret void
+}
+
+; CHECK-LABEL: {{^}}fold_imm:
+; CHECK v_or_i32_e32 v{{[0-9]+}}, 5
+define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
+entry:
+  %fold = add i32 3, 2
+  %tmp0 = icmp ne i32 %cmp, 0
+  br i1 %tmp0, label %if, label %endif
+
+if:
+  %id = call i32 @llvm.r600.read.tidig.x()
+  %val = or i32 %id, %fold
+  store i32 %val, i32 addrspace(1)* %out
+  br label %endif
+
+endif:
+  ret void
+}
+
+; CHECK-LABEL: {{^}}fold_64bit_constant_add:
+; CHECK-NOT: s_mov_b64
+; FIXME: It would be better if we could use v_add here and drop the extra
+; v_mov_b32 instructions.
+; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
+; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
+; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
+
+define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
+entry:
+  %tmp0 = add i64 %val, 1
+  store i64 %tmp0, i64 addrspace(1)* %out
+  ret void
+}
+
+; Inline constants should always be folded.
+
+; CHECK-LABEL: {{^}}vector_inline:
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
+
+define void @vector_inline(<4 x i32> addrspace(1)* %out) {
+entry:
+  %tmp0 = call i32 @llvm.r600.read.tidig.x()
+  %tmp1 = add i32 %tmp0, 1
+  %tmp2 = add i32 %tmp0, 2
+  %tmp3 = add i32 %tmp0, 3
+  %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
+  %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1
+  %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
+  %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
+  %tmp4 = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %vec3
+  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; Immediates with one use should be folded
+; CHECK-LABEL: {{^}}imm_one_use:
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}
+
+define void @imm_one_use(i32 addrspace(1)* %out) {
+entry:
+  %tmp0 = call i32 @llvm.r600.read.tidig.x()
+  %tmp1 = xor i32 %tmp0, 100
+  store i32 %tmp1, i32 addrspace(1)* %out
+  ret void
+}
+; CHECK-LABEL: {{^}}vector_imm:
+; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
+
+define void @vector_imm(<4 x i32> addrspace(1)* %out) {
+entry:
+  %tmp0 = call i32 @llvm.r600.read.tidig.x()
+  %tmp1 = add i32 %tmp0, 1
+  %tmp2 = add i32 %tmp0, 2
+  %tmp3 = add i32 %tmp0, 3
+  %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
+  %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1
+  %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
+  %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
+  %tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3
+  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/operand-spacing.ll b/test/CodeGen/R600/operand-spacing.ll
index f0d228d..20420a8 100644
--- a/test/CodeGen/R600/operand-spacing.ll
+++ b/test/CodeGen/R600/operand-spacing.ll
@@ -1,13 +1,16 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
 
 ; Make sure there isn't an extra space between the instruction name and first operands.
 
-; SI-LABEL: {{^}}add_f32:
+; GCN-LABEL: {{^}}add_f32:
 ; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
-; SI: buffer_store_dword [[RESULT]],
+; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
+; GCN: buffer_store_dword [[RESULT]],
 define void @add_f32(float addrspace(1)* %out, float %a, float %b) {
   %result = fadd float %a, %b
   store float %result, float addrspace(1)* %out
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index b7493d3..78879a8 100644
--- a/test/CodeGen/R600/or.ll
+++ b/test/CodeGen/R600/or.ll
@@ -1,14 +1,14 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-; EG-LABEL: {{^}}or_v2i32:
+
+; FUNC-LABEL: {{^}}or_v2i32:
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI-LABEL: {{^}}or_v2i32:
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-
 define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32> addrspace(1) * %in
@@ -18,18 +18,16 @@ define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in)
   ret void
 }
 
-; EG-LABEL: {{^}}or_v4i32:
+; FUNC-LABEL: {{^}}or_v4i32:
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI-LABEL: {{^}}or_v4i32:
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-
 define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
@@ -39,7 +37,7 @@ define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in)
   ret void
 }
 
-; SI-LABEL: {{^}}scalar_or_i32:
+; FUNC-LABEL: {{^}}scalar_or_i32:
 ; SI: s_or_b32
 define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %or = or i32 %a, %b
@@ -47,7 +45,7 @@ define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   ret void
 }
 
-; SI-LABEL: {{^}}vector_or_i32:
+; FUNC-LABEL: {{^}}vector_or_i32:
 ; SI: v_or_b32_e32 v{{[0-9]}}
 define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
   %loada = load i32 addrspace(1)* %a
@@ -56,7 +54,7 @@ define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b)
   ret void
 }
 
-; SI-LABEL: {{^}}scalar_or_literal_i32:
+; FUNC-LABEL: {{^}}scalar_or_literal_i32:
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f
 define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
   %or = or i32 %a, 99999
@@ -64,7 +62,7 @@ define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
   ret void
 }
 
-; SI-LABEL: {{^}}vector_or_literal_i32:
+; FUNC-LABEL: {{^}}vector_or_literal_i32:
 ; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
 define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
   %loada = load i32 addrspace(1)* %a, align 4
@@ -73,7 +71,7 @@ define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a,
   ret void
 }
 
-; SI-LABEL: {{^}}vector_or_inline_immediate_i32:
+; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32:
 ; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
 define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
   %loada = load i32 addrspace(1)* %a, align 4
@@ -82,10 +80,10 @@ define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspac
   ret void
 }
 
-; EG-LABEL: {{^}}scalar_or_i64:
+; FUNC-LABEL: {{^}}scalar_or_i64:
 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; SI-LABEL: {{^}}scalar_or_i64:
+
 ; SI: s_or_b64
 define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = or i64 %a, %b
@@ -93,7 +91,7 @@ define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   ret void
 }
 
-; SI-LABEL: {{^}}vector_or_i64:
+; FUNC-LABEL: {{^}}vector_or_i64:
 ; SI: v_or_b32_e32 v{{[0-9]}}
 ; SI: v_or_b32_e32 v{{[0-9]}}
 define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
@@ -104,7 +102,7 @@ define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
   ret void
 }
 
-; SI-LABEL: {{^}}scalar_vector_or_i64:
+; FUNC-LABEL: {{^}}scalar_vector_or_i64:
 ; SI: v_or_b32_e32 v{{[0-9]}}
 ; SI: v_or_b32_e32 v{{[0-9]}}
 define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
@@ -114,7 +112,7 @@ define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
   ret void
 }
 
-; SI-LABEL: {{^}}vector_or_i64_loadimm:
+; FUNC-LABEL: {{^}}vector_or_i64_loadimm:
 ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
 ; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f
 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
@@ -129,7 +127,7 @@ define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
 }
 
 ; FIXME: The or 0 should really be removed.
-; SI-LABEL: {{^}}vector_or_i64_imm:
+; FUNC-LABEL: {{^}}vector_or_i64_imm:
 ; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
 ; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]]
 ; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}}
@@ -141,7 +139,7 @@ define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64
   ret void
 }
 
-; SI-LABEL: {{^}}trunc_i64_or_to_i32:
+; FUNC-LABEL: {{^}}trunc_i64_or_to_i32:
 ; SI: s_load_dword s[[SREG0:[0-9]+]]
 ; SI: s_load_dword s[[SREG1:[0-9]+]]
 ; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
@@ -154,14 +152,13 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   ret void
 }
 
-; EG-CHECK: {{^}}or_i1:
-; EG-CHECK: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
+; FUNC-LABEL: {{^}}or_i1:
+; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
 
-; SI-CHECK: {{^}}or_i1:
-; SI-CHECK: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
+; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
 define void @or_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
-  %a = load float addrspace(1) * %in0
-  %b = load float addrspace(1) * %in1
+  %a = load float addrspace(1)* %in0
+  %b = load float addrspace(1)* %in1
   %acmp = fcmp oge float %a, 0.000000e+00
   %bcmp = fcmp oge float %b, 0.000000e+00
   %or = or i1 %acmp, %bcmp
@@ -169,3 +166,13 @@ define void @or_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float add
   store float %result, float addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}s_or_i1:
+; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
+define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+  %cmp0 = icmp eq i32 %a, %b
+  %cmp1 = icmp eq i32 %c, %d
+  %or = or i1 %cmp0, %cmp1
+  store i1 %or, i1 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/private-memory-atomics.ll b/test/CodeGen/R600/private-memory-atomics.ll
index def4f9d..3ceb0c0 100644
--- a/test/CodeGen/R600/private-memory-atomics.ll
+++ b/test/CodeGen/R600/private-memory-atomics.ll
@@ -1,4 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s
 
 ; This works because promote allocas pass replaces these with LDS atomics.
 
diff --git a/test/CodeGen/R600/private-memory-broken.ll b/test/CodeGen/R600/private-memory-broken.ll
index 4086085..10590a9 100644
--- a/test/CodeGen/R600/private-memory-broken.ll
+++ b/test/CodeGen/R600/private-memory-broken.ll
@@ -1,4 +1,5 @@
-; RUN: not llc -verify-machineinstrs -march=r600 -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=tonga %s -o /dev/null 2>&1 | FileCheck %s
 
 ; Make sure promote alloca pass doesn't crash
 
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index bfb4a6a..b03029c 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
@@ -117,7 +119,7 @@ for.end:
 ; R600: MOVA_INT
 
 ; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
-; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x2 ; encoding: [0x02,0x10,0x68,0xe0
+; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0
 ; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 define void @short_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
@@ -138,7 +140,7 @@ entry:
 ; R600: MOVA_INT
 
 ; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
-; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x1 ; encoding: [0x01,0x10,0x60,0xe0
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0
 define void @char_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = alloca [2 x i8]
@@ -296,7 +298,7 @@ entry:
 ; FUNC-LABEL: ptrtoint:
 ; SI-NOT: ds_write
 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
-; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x5
+; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5
 define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %alloca = alloca [16 x i32]
   %tmp0 = getelementptr [16 x i32]* %alloca, i32 0, i32 %a
diff --git a/test/CodeGen/R600/r600-encoding.ll b/test/CodeGen/R600/r600-encoding.ll
index 112cdac..3a82ee3 100644
--- a/test/CodeGen/R600/r600-encoding.ll
+++ b/test/CodeGen/R600/r600-encoding.ll
@@ -1,14 +1,14 @@
-; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rs880 | FileCheck --check-prefix=R600-CHECK %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rs880 | FileCheck --check-prefix=R600 %s
 
 ; The earliest R600 GPUs have a slightly different encoding than the rest of
 ; the VLIW4/5 GPUs.
 
-; EG-CHECK: {{^}}test:
-; EG-CHECK: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}]
+; EG: {{^}}test:
+; EG: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}]
 
-; R600-CHECK: {{^}}test:
-; R600-CHECK: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}]
+; R600: {{^}}test:
+; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}]
 
 define void @test(<4 x float> inreg %reg0) #0 {
 entry:
diff --git a/test/CodeGen/R600/register-count-comments.ll b/test/CodeGen/R600/register-count-comments.ll
index 61d1b5e..2b49f97 100644
--- a/test/CodeGen/R600/register-count-comments.ll
+++ b/test/CodeGen/R600/register-count-comments.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.SI.tid() nounwind readnone
 
diff --git a/test/CodeGen/R600/reorder-stores.ll b/test/CodeGen/R600/reorder-stores.ll
index 30c0171..ea50d5e 100644
--- a/test/CodeGen/R600/reorder-stores.ll
+++ b/test/CodeGen/R600/reorder-stores.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
 ; SI: buffer_load_dwordx2
diff --git a/test/CodeGen/R600/rotl.i64.ll b/test/CodeGen/R600/rotl.i64.ll
index 84a35b6..6da17a4 100644
--- a/test/CodeGen/R600/rotl.i64.ll
+++ b/test/CodeGen/R600/rotl.i64.ll
@@ -1,11 +1,12 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
 
-; FUNC-LABEL: {{^}}s_rotl_i64:
-; SI-DAG: s_lshl_b64
-; SI-DAG: s_sub_i32
-; SI-DAG: s_lshr_b64
-; SI: s_or_b64
-; SI: s_endpgm
+; BOTH-LABEL: {{^}}s_rotl_i64:
+; BOTH-DAG: s_lshl_b64
+; BOTH-DAG: s_sub_i32
+; BOTH-DAG: s_lshr_b64
+; BOTH: s_or_b64
+; BOTH: s_endpgm
 define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
 entry:
   %0 = shl i64 %x, %y
@@ -16,13 +17,15 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}v_rotl_i64:
+; BOTH-LABEL: {{^}}v_rotl_i64:
 ; SI-DAG: v_lshl_b64
-; SI-DAG: v_sub_i32
+; VI-DAG: v_lshlrev_b64
+; BOTH-DAG: v_sub_i32
 ; SI: v_lshr_b64
-; SI: v_or_b32
-; SI: v_or_b32
-; SI: s_endpgm
+; VI: v_lshrrev_b64
+; BOTH: v_or_b32
+; BOTH: v_or_b32
+; BOTH: s_endpgm
 define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
   %x = load i64 addrspace(1)* %xptr, align 8
diff --git a/test/CodeGen/R600/rotl.ll b/test/CodeGen/R600/rotl.ll
index 6c8e503..6c144cd 100644
--- a/test/CodeGen/R600/rotl.ll
+++ b/test/CodeGen/R600/rotl.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}rotl_i32:
 ; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
diff --git a/test/CodeGen/R600/rotr.i64.ll b/test/CodeGen/R600/rotr.i64.ll
index 9e14570..f1d1d26 100644
--- a/test/CodeGen/R600/rotr.i64.ll
+++ b/test/CodeGen/R600/rotr.i64.ll
@@ -1,10 +1,11 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
 
-; FUNC-LABEL: {{^}}s_rotr_i64:
-; SI-DAG: s_sub_i32
-; SI-DAG: s_lshr_b64
-; SI-DAG: s_lshl_b64
-; SI: s_or_b64
+; BOTH-LABEL: {{^}}s_rotr_i64:
+; BOTH-DAG: s_sub_i32
+; BOTH-DAG: s_lshr_b64
+; BOTH-DAG: s_lshl_b64
+; BOTH: s_or_b64
 define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
 entry:
   %tmp0 = sub i64 64, %y
@@ -15,12 +16,14 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}v_rotr_i64:
-; SI-DAG: v_sub_i32
+; BOTH-LABEL: {{^}}v_rotr_i64:
+; BOTH-DAG: v_sub_i32
 ; SI-DAG: v_lshr_b64
 ; SI-DAG: v_lshl_b64
-; SI: v_or_b32
-; SI: v_or_b32
+; VI-DAG: v_lshrrev_b64
+; VI-DAG: v_lshlrev_b64
+; BOTH: v_or_b32
+; BOTH: v_or_b32
 define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
   %x = load i64 addrspace(1)* %xptr, align 8
@@ -33,7 +36,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_rotr_v2i64:
+; BOTH-LABEL: {{^}}s_rotr_v2i64:
 define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
 entry:
   %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
@@ -44,7 +47,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}v_rotr_v2i64:
+; BOTH-LABEL: {{^}}v_rotr_v2i64:
 define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
 entry:
   %x = load <2 x i64> addrspace(1)* %xptr, align 8
diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll
index a1add11..044f9ff 100644
--- a/test/CodeGen/R600/rotr.ll
+++ b/test/CodeGen/R600/rotr.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}rotr_i32:
 ; R600: BIT_ALIGN_INT
diff --git a/test/CodeGen/R600/rsq.ll b/test/CodeGen/R600/rsq.ll
index d792c9f..b8a23df 100644
--- a/test/CodeGen/R600/rsq.ll
+++ b/test/CodeGen/R600/rsq.ll
@@ -1,6 +1,7 @@
-; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
-; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
 
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 declare float @llvm.sqrt.f32(float) nounwind readnone
 declare double @llvm.sqrt.f64(double) nounwind readnone
 
@@ -36,3 +37,38 @@ define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind
   store float %div, float addrspace(1)* %out, align 4
   ret void
 }
+
+; Recognize that this is rsqrt(a) * rcp(b) * c,
+; not 1 / ( 1 / sqrt(a)) * rcp(b) * c.
+
+; SI-LABEL: @rsqrt_fmul
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+
+; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]]
+; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]]
+; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]]
+; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+; SI-UNSAFE: buffer_store_dword [[RESULT]]
+
+; SI-SAFE-NOT: v_rsq_f32
+
+; SI: s_endpgm
+define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+
+  %a = load float addrspace(1)* %gep.0
+  %b = load float addrspace(1)* %gep.1
+  %c = load float addrspace(1)* %gep.2
+
+  %x = call float @llvm.sqrt.f32(float %a)
+  %y = fmul float %x, %b
+  %z = fdiv float %c, %y
+  store float %z, float addrspace(1)* %out.gep
+  ret void
+}
diff --git a/test/CodeGen/R600/s_movk_i32.ll b/test/CodeGen/R600/s_movk_i32.ll
index 71f9a41..8be2d1d 100644
--- a/test/CodeGen/R600/s_movk_i32.ll
+++ b/test/CodeGen/R600/s_movk_i32.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}s_movk_i32_k0:
 ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
diff --git a/test/CodeGen/R600/saddo.ll b/test/CodeGen/R600/saddo.ll
index 654967c..8e625c1 100644
--- a/test/CodeGen/R600/saddo.ll
+++ b/test/CodeGen/R600/saddo.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
 
 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/R600/salu-to-valu.ll b/test/CodeGen/R600/salu-to-valu.ll
index 23af3e4..dfb181d 100644
--- a/test/CodeGen/R600/salu-to-valu.ll
+++ b/test/CodeGen/R600/salu-to-valu.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
 
 ; In this test both the pointer and the offset operands to the
 ; BUFFER_LOAD instructions end up being stored in vgprs.  This
diff --git a/test/CodeGen/R600/scalar_to_vector.ll b/test/CodeGen/R600/scalar_to_vector.ll
index dc9ebe0..b82e552 100644
--- a/test/CodeGen/R600/scalar_to_vector.ll
+++ b/test/CodeGen/R600/scalar_to_vector.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 
 ; FUNC-LABEL: {{^}}scalar_to_vector_v2i32:
diff --git a/test/CodeGen/R600/schedule-global-loads.ll b/test/CodeGen/R600/schedule-global-loads.ll
index 5422ca7..b6437d2 100644
--- a/test/CodeGen/R600/schedule-global-loads.ll
+++ b/test/CodeGen/R600/schedule-global-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
 
 
 declare i32 @llvm.r600.read.tidig.x() #1
@@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
 
 ; FUNC-LABEL: {{^}}cluster_global_arg_loads:
 ; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:0x4
+; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
 ; SI: buffer_store_dword [[REG0]]
 ; SI: buffer_store_dword [[REG1]]
 define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
diff --git a/test/CodeGen/R600/schedule-kernel-arg-loads.ll b/test/CodeGen/R600/schedule-kernel-arg-loads.ll
index e774157..f9641fa 100644
--- a/test/CodeGen/R600/schedule-kernel-arg-loads.ll
+++ b/test/CodeGen/R600/schedule-kernel-arg-loads.ll
@@ -1,10 +1,18 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI %s
 
 ; FUNC-LABEL: {{^}}cluster_arg_loads:
 ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
 ; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
+; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI-NEXT: s_nop 0
+; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-NEXT: s_nop 0
+; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; VI-NEXT: s_nop 0
+; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
 define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
   store i32 %x, i32 addrspace(1)* %out0, align 4
   store i32 %y, i32 addrspace(1)* %out1, align 4
diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
index baac5b5..76b655d 100644
--- a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
+++ b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
@@ -1,6 +1,7 @@
 ; XFAIL: *
 ; REQUIRES: asserts
-; RUN: llc -O0 -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
+; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
 
 declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
 
diff --git a/test/CodeGen/R600/scratch-buffer.ll b/test/CodeGen/R600/scratch-buffer.ll
new file mode 100644
index 0000000..8c5a990
--- /dev/null
+++ b/test/CodeGen/R600/scratch-buffer.ll
@@ -0,0 +1,87 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+
+; When a frame index offset is more than 12-bits, make sure we don't store
+; it in mubuf's offset field.
+
+; Also, make sure we use the same register for storing the scratch buffer addresss
+; for both stores. This register is allocated by the register scavenger, so we
+; should be able to reuse the same regiser for each scratch buffer access.
+
+; CHECK-LABEL: {{^}}legal_offset_fi:
+; CHECK: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0{{$}}
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen
+; CHECK: v_mov_b32_e32 [[OFFSET]], 0x8000
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+
+define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
+entry:
+  %scratch0 = alloca [8192 x i32]
+  %scratch1 = alloca [8192 x i32]
+
+  %scratchptr0 = getelementptr [8192 x i32]* %scratch0, i32 0, i32 0
+  store i32 1, i32* %scratchptr0
+
+  %scratchptr1 = getelementptr [8192 x i32]* %scratch1, i32 0, i32 0
+  store i32 2, i32* %scratchptr1
+
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %if, label %else
+
+if:
+  %if_ptr = getelementptr [8192 x i32]* %scratch0, i32 0, i32 %if_offset
+  %if_value = load i32* %if_ptr
+  br label %done
+
+else:
+  %else_ptr = getelementptr [8192 x i32]* %scratch1, i32 0, i32 %else_offset
+  %else_value = load i32* %else_ptr
+  br label %done
+
+done:
+  %value = phi i32 [%if_value, %if], [%else_value, %else]
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+
+  ret void
+
+}
+
+; CHECK-LABEL: {{^}}legal_offset_fi_offset
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen
+; CHECK: v_add_i32_e32 [[OFFSET:v[0-9]+]], 0x8000
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+
+define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
+entry:
+  %scratch0 = alloca [8192 x i32]
+  %scratch1 = alloca [8192 x i32]
+
+  %offset0 = load i32 addrspace(1)* %offsets
+  %scratchptr0 = getelementptr [8192 x i32]* %scratch0, i32 0, i32 %offset0
+  store i32 %offset0, i32* %scratchptr0
+
+  %offsetptr1 = getelementptr i32 addrspace(1)* %offsets, i32 1
+  %offset1 = load i32 addrspace(1)* %offsetptr1
+  %scratchptr1 = getelementptr [8192 x i32]* %scratch1, i32 0, i32 %offset1
+  store i32 %offset1, i32* %scratchptr1
+
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %if, label %else
+
+if:
+  %if_ptr = getelementptr [8192 x i32]* %scratch0, i32 0, i32 %if_offset
+  %if_value = load i32* %if_ptr
+  br label %done
+
+else:
+  %else_ptr = getelementptr [8192 x i32]* %scratch1, i32 0, i32 %else_offset
+  %else_value = load i32* %else_ptr
+  br label %done
+
+done:
+  %value = phi i32 [%if_value, %if], [%else_value, %else]
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll
index 16853e0..07bb417 100644
--- a/test/CodeGen/R600/sdiv.ll
+++ b/test/CodeGen/R600/sdiv.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; The code generated by sdiv is long and complex and may frequently change.
@@ -35,7 +36,7 @@ define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
 ; SI: buffer_load_dword [[VAL:v[0-9]+]],
 ; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
-; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]]
+; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
 ; SI: v_add_i32
 ; SI: v_lshrrev_b32
 ; SI: v_ashrrev_i32
diff --git a/test/CodeGen/R600/sdivrem24.ll b/test/CodeGen/R600/sdivrem24.ll
index 228cf76..e8c5c25 100644
--- a/test/CodeGen/R600/sdivrem24.ll
+++ b/test/CodeGen/R600/sdivrem24.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}sdiv24_i8:
diff --git a/test/CodeGen/R600/sdivrem64.ll b/test/CodeGen/R600/sdivrem64.ll
new file mode 100644
index 0000000..a9b2b7f
--- /dev/null
+++ b/test/CodeGen/R600/sdivrem64.ll
@@ -0,0 +1,225 @@
+;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+
+;FUNC-LABEL: {{^}}test_sdiv:
+;EG: RECIP_UINT
+;EG: LSHL {{.*}}, 1,
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %result = sdiv i64 %x, %y
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_srem:
+;EG: RECIP_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: AND_INT {{.*}}, 1,
+
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %result = urem i64 %x, %y
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_sdiv3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = ashr i64 %x, 33
+  %2 = ashr i64 %y, 33
+  %result = sdiv i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_srem3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = ashr i64 %x, 33
+  %2 = ashr i64 %y, 33
+  %result = srem i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_sdiv2464:
+;EG: INT_TO_FLT
+;EG: INT_TO_FLT
+;EG: FLT_TO_INT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = ashr i64 %x, 40
+  %2 = ashr i64 %y, 40
+  %result = sdiv i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_srem2464:
+;EG: INT_TO_FLT
+;EG: INT_TO_FLT
+;EG: FLT_TO_INT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = ashr i64 %x, 40
+  %2 = ashr i64 %y, 40
+  %result = srem i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/select-i1.ll b/test/CodeGen/R600/select-i1.ll
index 2e2d0e4..6735394 100644
--- a/test/CodeGen/R600/select-i1.ll
+++ b/test/CodeGen/R600/select-i1.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI
 
diff --git a/test/CodeGen/R600/select-vectors.ll b/test/CodeGen/R600/select-vectors.ll
index 7d8df2e..59082c6 100644
--- a/test/CodeGen/R600/select-vectors.ll
+++ b/test/CodeGen/R600/select-vectors.ll
@@ -1,4 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; Test expansion of scalar selects on vectors.
 ; Evergreen not enabled since it seems to be having problems with doubles.
diff --git a/test/CodeGen/R600/select64.ll b/test/CodeGen/R600/select64.ll
index 8de34d5..0245dae 100644
--- a/test/CodeGen/R600/select64.ll
+++ b/test/CodeGen/R600/select64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; CHECK-LABEL: {{^}}select0:
 ; i64 select should be split into two i32 selects, and we shouldn't need
@@ -48,3 +49,20 @@ define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspa
   store i32 %trunc, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; CHECK-LABEL: {{^}}v_select_i64_split_imm:
+; CHECK: s_mov_b32 [[SHI:s[0-9]+]], 63
+; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0
+; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]]
+; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]]
+; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}}
+; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}}
+; CHECK: s_endpgm
+define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %b = load i64 addrspace(1)* %bptr, align 8
+  %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32
+  store i64 %sel, i64 addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll
index 82577bb..7780371 100644
--- a/test/CodeGen/R600/selectcc-opt.ll
+++ b/test/CodeGen/R600/selectcc-opt.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/R600/selectcc.ll b/test/CodeGen/R600/selectcc.ll
index 5a09b5c..f378e15 100644
--- a/test/CodeGen/R600/selectcc.ll
+++ b/test/CodeGen/R600/selectcc.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}selectcc_i64:
 ; EG: XOR_INT
diff --git a/test/CodeGen/R600/setcc-opt.ll b/test/CodeGen/R600/setcc-opt.ll
index af48df8..93860f5 100644
--- a/test/CodeGen/R600/setcc-opt.ll
+++ b/test/CodeGen/R600/setcc-opt.ll
@@ -1,15 +1,236 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-; SI-LABEL: {{^}}sext_bool_icmp_ne:
-; SI: v_cmp_ne_i32
-; SI-NEXT: v_cndmask_b32
-; SI-NOT: v_cmp_ne_i32
-; SI-NOT: v_cndmask_b32
-; SI: s_endpgm
-define void @sext_bool_icmp_ne(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT:buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+
+; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
+; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
+define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp eq i32 %a, %b
+  %ext = sext i1 %icmp0 to i32
+  %icmp1 = icmp eq i32 %ext, 0
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+
+; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
+; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
+define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 0
   store i1 %icmp1, i1 addrspace(1)* %out
   ret void
 }
+
+; This really folds away to false
+; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1:
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_i32_e64 {{s\[[0-9]+:[0-9]+\]}}, [[TMP]], 1{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
+; GCN-NEXT: buffer_store_byte [[TMP]]
+; GCN-NEXT: s_endpgm
+define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp eq i32 %a, %b
+  %ext = sext i1 %icmp0 to i32
+  %icmp1 = icmp eq i32 %ext, 1
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}
+
+; This really folds away to true
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1:
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
+; GCN-NEXT: v_cmp_ne_i32_e64 {{s\[[0-9]+:[0-9]+\]}}, [[TMP]], 1{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
+; GCN-NEXT: buffer_store_byte [[TMP]]
+; GCN-NEXT: s_endpgm
+define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp ne i32 %a, %b
+  %ext = sext i1 %icmp0 to i32
+  %icmp1 = icmp ne i32 %ext, 1
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp eq i32 %a, %b
+  %ext = zext i1 %icmp0 to i32
+  %icmp1 = icmp eq i32 %ext, 0
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp ne i32 %a, %b
+  %ext = zext i1 %icmp0 to i32
+  %icmp1 = icmp ne i32 %ext, 0
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp eq i32 %a, %b
+  %ext = zext i1 %icmp0 to i32
+  %icmp1 = icmp eq i32 %ext, 1
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp ne i32 %a, %b
+  %ext = zext i1 %icmp0 to i32
+  %icmp1 = icmp ne i32 %ext, 1
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[VB]], 2{{$}}
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
+; GCN: buffer_store_byte
+; GCN: s_endpgm
+define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp ne i32 %a, %b
+  %ext = sext i1 %icmp0 to i32
+  %icmp1 = icmp ne i32 %ext, 2
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cmp_zext_k_i8max:
+; GCN: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}}
+; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[K255]]
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
+define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
+  %b.ext = zext i8 %b to i32
+  %icmp0 = icmp ne i32 %b.ext, 255
+  store i1 %icmp0, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cmp_sext_k_neg1:
+; GCN: buffer_load_sbyte [[B:v[0-9]+]]
+; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[B]], -1{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
+define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
+  %b = load i8 addrspace(1)* %b.ptr
+  %b.ext = sext i8 %b to i32
+  %icmp0 = icmp ne i32 %b.ext, -1
+  store i1 %icmp0, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg:
+; GCN: s_load_dword [[B:s[0-9]+]]
+; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[B]], -1{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
+define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind {
+  %b.ext = sext i8 %b to i32
+  %icmp0 = icmp ne i32 %b.ext, -1
+  store i1 %icmp0, i1 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: This ends up doing a buffer_load_ubyte, and and compare to
+; 255. Seems to be because of ordering problems when not allowing load widths to be reduced.
+; Should do a buffer_load_sbyte and compare with -1
+
+; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg:
+; GCN-DAG: buffer_load_ubyte [[B:v[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}}
+; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[K]]{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
+define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
+  %b.ext = sext i8 %b to i32
+  %icmp0 = icmp ne i32 %b.ext, -1
+  store i1 %icmp0, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cmp_zext_k_neg1:
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
+define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
+  %b.ext = zext i8 %b to i32
+  %icmp0 = icmp ne i32 %b.ext, -1
+  store i1 %icmp0, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_ne_k:
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp ne i32 %a, %b
+  %ext = zext i1 %icmp0 to i32
+  %icmp1 = icmp ne i32 %ext, 2
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_eq_k:
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp ne i32 %a, %b
+  %ext = zext i1 %icmp0 to i32
+  %icmp1 = icmp eq i32 %ext, 2
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index 8dd2ce4..f9c7e4f 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -1,5 +1,7 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-;RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
 ; FUNC-LABEL: {{^}}setcc_v2i32:
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
@@ -94,11 +96,9 @@ entry:
 ; R600-DAG: SETNE_DX10
 ; R600-DAG: AND_INT
 ; R600-DAG: SETNE_INT
-; SI: v_cmp_o_f32
-; SI: v_cmp_neq_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_and_b32_e32
+
+; SI: v_cmp_lg_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp one float %a, %b
@@ -128,11 +128,9 @@ entry:
 ; R600-DAG: SETE_DX10
 ; R600-DAG: OR_INT
 ; R600-DAG: SETNE_INT
-; SI: v_cmp_u_f32
-; SI: v_cmp_eq_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+
+; SI: v_cmp_nlg_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ueq float %a, %b
@@ -144,11 +142,8 @@ entry:
 ; FUNC-LABEL: {{^}}f32_ugt:
 ; R600: SETGE
 ; R600: SETE_DX10
-; SI: v_cmp_u_f32
-; SI: v_cmp_gt_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: v_cmp_nle_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ugt float %a, %b
@@ -160,11 +155,9 @@ entry:
 ; FUNC-LABEL: {{^}}f32_uge:
 ; R600: SETGT
 ; R600: SETE_DX10
-; SI: v_cmp_u_f32
-; SI: v_cmp_ge_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+
+; SI: v_cmp_nlt_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp uge float %a, %b
@@ -176,11 +169,9 @@ entry:
 ; FUNC-LABEL: {{^}}f32_ult:
 ; R600: SETGE
 ; R600: SETE_DX10
-; SI: v_cmp_u_f32
-; SI: v_cmp_lt_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+
+; SI: v_cmp_nge_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ult float %a, %b
@@ -192,11 +183,9 @@ entry:
 ; FUNC-LABEL: {{^}}f32_ule:
 ; R600: SETGT
 ; R600: SETE_DX10
-; SI: v_cmp_u_f32
-; SI: v_cmp_le_f32
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+
+; SI: v_cmp_ngt_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ule float %a, %b
@@ -343,3 +332,46 @@ entry:
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
+
+; FIXME: This does 4 compares
+; FUNC-LABEL: {{^}}v3i32_eq:
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI: s_endpgm
+define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.a = getelementptr <3 x i32> addrspace(1)* %ptra, i32 %tid
+  %gep.b = getelementptr <3 x i32> addrspace(1)* %ptrb, i32 %tid
+  %gep.out = getelementptr <3 x i32> addrspace(1)* %out, i32 %tid
+  %a = load <3 x i32> addrspace(1)* %gep.a
+  %b = load <3 x i32> addrspace(1)* %gep.b
+  %cmp = icmp eq <3 x i32> %a, %b
+  %ext = sext <3 x i1> %cmp to <3 x i32>
+  store <3 x i32> %ext, <3 x i32> addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v3i8_eq:
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI: s_endpgm
+define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.a = getelementptr <3 x i8> addrspace(1)* %ptra, i32 %tid
+  %gep.b = getelementptr <3 x i8> addrspace(1)* %ptrb, i32 %tid
+  %gep.out = getelementptr <3 x i8> addrspace(1)* %out, i32 %tid
+  %a = load <3 x i8> addrspace(1)* %gep.a
+  %b = load <3 x i8> addrspace(1)* %gep.b
+  %cmp = icmp eq <3 x i8> %a, %b
+  %ext = sext <3 x i1> %cmp to <3 x i8>
+  store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out
+  ret void
+}
diff --git a/test/CodeGen/R600/setcc64.ll b/test/CodeGen/R600/setcc64.ll
index 6e43172..231be7a 100644
--- a/test/CodeGen/R600/setcc64.ll
+++ b/test/CodeGen/R600/setcc64.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 ; XXX: Merge this into setcc, once R600 supports 64-bit operations
 
@@ -57,11 +58,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}f64_one:
-; SI: v_cmp_o_f64
-; SI: v_cmp_neq_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_and_b32_e32
+; SI: v_cmp_lg_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp one double %a, %b
@@ -81,11 +79,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}f64_ueq:
-; SI: v_cmp_u_f64
-; SI: v_cmp_eq_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: v_cmp_nlg_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ueq double %a, %b
@@ -95,11 +90,9 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}f64_ugt:
-; SI: v_cmp_u_f64
-; SI: v_cmp_gt_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+
+; SI: v_cmp_nle_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ugt double %a, %b
@@ -109,11 +102,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}f64_uge:
-; SI: v_cmp_u_f64
-; SI: v_cmp_ge_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: v_cmp_nlt_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp uge double %a, %b
@@ -123,11 +113,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}f64_ult:
-; SI: v_cmp_u_f64
-; SI: v_cmp_lt_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: v_cmp_nge_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ult double %a, %b
@@ -137,11 +124,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}f64_ule:
-; SI: v_cmp_u_f64
-; SI: v_cmp_le_f64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_or_b32_e32
+; SI: v_cmp_ngt_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ule double %a, %b
diff --git a/test/CodeGen/R600/seto.ll b/test/CodeGen/R600/seto.ll
index 5fe6ff6..9b5d6b5 100644
--- a/test/CodeGen/R600/seto.ll
+++ b/test/CodeGen/R600/seto.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
diff --git a/test/CodeGen/R600/setuo.ll b/test/CodeGen/R600/setuo.ll
index a391177..76346c4 100644
--- a/test/CodeGen/R600/setuo.ll
+++ b/test/CodeGen/R600/setuo.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
index d364e6b..3260179 100644
--- a/test/CodeGen/R600/sext-in-reg.ll
+++ b/test/CodeGen/R600/sext-in-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
diff --git a/test/CodeGen/R600/sgpr-control-flow.ll b/test/CodeGen/R600/sgpr-control-flow.ll
index d8b8dff..f0236ac 100644
--- a/test/CodeGen/R600/sgpr-control-flow.ll
+++ b/test/CodeGen/R600/sgpr-control-flow.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 ;
 ;
 ; Most SALU instructions ignore control flow, so we need to make sure
@@ -59,6 +59,47 @@ endif:
   ret void
 }
 
+; FIXME: Should write to different SGPR pairs instead of copying to
+; VALU for i1 phi.
+
+; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_lt_i32_e64 [[CMP_IF:s\[[0-9]+:[0-9]+\]]], [[AVAL]], 0
+; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
+
+; SI: BB2_1:
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_eq_i32_e64 [[CMP_ELSE:s\[[0-9]+:[0-9]+\]]], [[AVAL]], 0
+; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
+
+; SI: v_cmp_ne_i32_e64 [[CMP_CMP:s\[[0-9]+:[0-9]+\]]], [[V_CMP]], 0
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
+; SI: buffer_store_dword [[RESULT]]
+define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+entry:
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %tmp1 = icmp eq i32 %tid, 0
+  br i1 %tmp1, label %if, label %else
+
+if:
+  %gep.if = getelementptr i32 addrspace(1)* %a, i32 %tid
+  %a.val = load i32 addrspace(1)* %gep.if
+  %cmp.if = icmp eq i32 %a.val, 0
+  br label %endif
+
+else:
+  %gep.else = getelementptr i32 addrspace(1)* %b, i32 %tid
+  %b.val = load i32 addrspace(1)* %gep.else
+  %cmp.else = icmp slt i32 %b.val, 0
+  br label %endif
+
+endif:
+  %tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else]
+  %ext = sext i1 %tmp4 to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.r600.read.tidig.x() #0
 
 attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
index aa97fbf..893f5a3 100644
--- a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
 ; Copy VGPR -> SGPR used twice as an instruction operand, which is then
 ; used in an REG_SEQUENCE that also needs to be handled.
diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll
index 8daf753..57cbadd 100644
--- a/test/CodeGen/R600/sgpr-copy.ll
+++ b/test/CodeGen/R600/sgpr-copy.ll
@@ -1,9 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; This test checks that no VGPR to SGPR copies are created by the register
 ; allocator.
 ; CHECK-LABEL: {{^}}phi1:
-; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0
+; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]]
 
 define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
@@ -202,8 +203,8 @@ attributes #2 = { readonly }
 attributes #3 = { readnone }
 attributes #4 = { nounwind readonly }
 
-!0 = metadata !{metadata !"const", null}
-!1 = metadata !{metadata !0, metadata !0, i64 0, i32 1}
+!0 = !{!"const", null}
+!1 = !{!0, !0, i64 0, i32 1}
 
 ; Function Attrs: nounwind readnone
 declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
@@ -267,7 +268,7 @@ endif:
   ret void
 }
 
-!2 = metadata !{metadata !"const", null, i32 1}
+!2 = !{!"const", null, i32 1}
 
 ; CHECK-LABEL: {{^}}copy1:
 ; CHECK: buffer_load_dword
diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll
index 71c9fc4..f89353b 100644
--- a/test/CodeGen/R600/shl.ll
+++ b/test/CodeGen/R600/shl.ll
@@ -1,13 +1,18 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s
 
-;EG-CHECK: {{^}}shl_v2i32:
-;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: {{^}}shl_v2i32:
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: {{^}}shl_v2i32:
-;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: {{^}}shl_v2i32:
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+;VI: {{^}}shl_v2i32:
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +23,23 @@ define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
   ret void
 }
 
-;EG-CHECK: {{^}}shl_v4i32:
-;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: {{^}}shl_v4i32:
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI: {{^}}shl_v4i32:
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-;SI-CHECK: {{^}}shl_v4i32:
-;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: {{^}}shl_v4i32:
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,20 +50,23 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
   ret void
 }
 
-;EG-CHECK: {{^}}shl_i64:
-;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
-;EG-CHECK: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
-;EG-CHECK: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+;EG: {{^}}shl_i64:
+;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
+;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
 ;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-CHECK-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]]
-;EG-CHECK-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
-;EG-CHECK-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}}
-;EG-CHECK-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
-;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
+;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]]
+;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}}
+;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
 
-;SI-CHECK: {{^}}shl_i64:
-;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: {{^}}shl_i64:
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI: {{^}}shl_i64:
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
 define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
@@ -63,31 +77,35 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   ret void
 }
 
-;EG-CHECK: {{^}}shl_v2i64:
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHA]]
-;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHB]]
-;EG-CHECK-DAG: LSHR {{.*}}, 1
-;EG-CHECK-DAG: LSHR {{.*}}, 1
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHA]]
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHB]]
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHA]]
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHB]]
-;EG-CHECK-DAG: LSHL
-;EG-CHECK-DAG: LSHL
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-
-;SI-CHECK: {{^}}shl_v2i64:
-;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;EG: {{^}}shl_v2i64:
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHA]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHB]]
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: LSHL {{.*}}, [[SHA]]
+;EG-DAG: LSHL {{.*}}, [[SHB]]
+;EG-DAG: LSHL {{.*}}, [[SHA]]
+;EG-DAG: LSHL {{.*}}, [[SHB]]
+;EG-DAG: LSHL
+;EG-DAG: LSHL
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+
+;SI: {{^}}shl_v2i64:
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI: {{^}}shl_v2i64:
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
 define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
@@ -98,53 +116,59 @@ define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in
   ret void
 }
 
-;EG-CHECK: {{^}}shl_v4i64:
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHA]]
-;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHB]]
-;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHC]]
-;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHD]]
-;EG-CHECK-DAG: LSHR {{.*}}, 1
-;EG-CHECK-DAG: LSHR {{.*}}, 1
-;EG-CHECK-DAG: LSHR {{.*}}, 1
-;EG-CHECK-DAG: LSHR {{.*}}, 1
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHA]]
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHB]]
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHC]]
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHD]]
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHA]]
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHB]]
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHC]]
-;EG-CHECK-DAG: LSHL {{.*}}, [[SHD]]
-;EG-CHECK-DAG: LSHL
-;EG-CHECK-DAG: LSHL
-;EG-CHECK-DAG: LSHL
-;EG-CHECK-DAG: LSHL
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-
-;SI-CHECK: {{^}}shl_v4i64:
-;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;EG: {{^}}shl_v4i64:
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHA]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHB]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHC]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHD]]
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: LSHL {{.*}}, [[SHA]]
+;EG-DAG: LSHL {{.*}}, [[SHB]]
+;EG-DAG: LSHL {{.*}}, [[SHC]]
+;EG-DAG: LSHL {{.*}}, [[SHD]]
+;EG-DAG: LSHL {{.*}}, [[SHA]]
+;EG-DAG: LSHL {{.*}}, [[SHB]]
+;EG-DAG: LSHL {{.*}}, [[SHC]]
+;EG-DAG: LSHL {{.*}}, [[SHD]]
+;EG-DAG: LSHL
+;EG-DAG: LSHL
+;EG-DAG: LSHL
+;EG-DAG: LSHL
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+
+;SI: {{^}}shl_v4i64:
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI: {{^}}shl_v4i64:
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
 define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
diff --git a/test/CodeGen/R600/shl_add_constant.ll b/test/CodeGen/R600/shl_add_constant.ll
index 801f77d..6915495 100644
--- a/test/CodeGen/R600/shl_add_constant.ll
+++ b/test/CodeGen/R600/shl_add_constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.r600.read.tidig.x() #1
 
diff --git a/test/CodeGen/R600/shl_add_ptr.ll b/test/CodeGen/R600/shl_add_ptr.ll
index 047cf25..d423153 100644
--- a/test/CodeGen/R600/shl_add_ptr.ll
+++ b/test/CodeGen/R600/shl_add_ptr.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
 
 ; Test that doing a shift of a pointer with a constant add will be
 ; folded into the constant offset addressing mode even if the add has
@@ -16,7 +17,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
 
 ; SI-LABEL: {{^}}load_shl_base_lds_0:
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 [M0]
+; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
 ; SI: s_endpgm
 define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -33,7 +34,7 @@ define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %ad
 
 ; SI-LABEL: {{^}}load_shl_base_lds_1:
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 [M0]
+; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
 ; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}}
 ; SI-DAG: buffer_store_dword [[RESULT]]
 ; SI-DAG: buffer_store_dword [[ADDUSE]]
@@ -68,8 +69,9 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
 ; pointer can be used with an offset into the second one.
 
 ; SI-LABEL: {{^}}load_shl_base_lds_2:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 [M0]
+; SI: s_mov_b32 m0, -1
+; SI-NEXT: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
 ; SI: s_endpgm
 define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -85,7 +87,7 @@ define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
 
 ; SI-LABEL: {{^}}store_shl_base_lds_0:
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 [M0]
+; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
 define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/R600/si-annotate-cf-assertion.ll b/test/CodeGen/R600/si-annotate-cf-assertion.ll
index 6d60b0a..69d7193 100644
--- a/test/CodeGen/R600/si-annotate-cf-assertion.ll
+++ b/test/CodeGen/R600/si-annotate-cf-assertion.ll
@@ -1,6 +1,7 @@
 ; REQUIRES: asserts
 ; XFAIL: *
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s
 
 
 define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
diff --git a/test/CodeGen/R600/si-lod-bias.ll b/test/CodeGen/R600/si-lod-bias.ll
index 60277d6..d6cbd0f 100644
--- a/test/CodeGen/R600/si-lod-bias.ll
+++ b/test/CodeGen/R600/si-lod-bias.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; This shader has the potential to generated illegal VGPR to SGPR copies if
 ; the wrong register class is used for the REG_SEQUENCE instructions.
@@ -47,5 +48,5 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
 attributes #0 = { "ShaderType"="0" }
 attributes #1 = { nounwind readnone }
 
-!0 = metadata !{metadata !"const", null}
-!1 = metadata !{metadata !0, metadata !0, i64 0, i32 1}
+!0 = !{!"const", null}
+!1 = !{!0, !0, i64 0, i32 1}
diff --git a/test/CodeGen/R600/si-sgpr-spill.ll b/test/CodeGen/R600/si-sgpr-spill.ll
index 439d8e2..18fda20 100644
--- a/test/CodeGen/R600/si-sgpr-spill.ll
+++ b/test/CodeGen/R600/si-sgpr-spill.ll
@@ -1,9 +1,11 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
 
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
 
 ; CHECK-LABEL: {{^}}main:
+; CHECK: s_wqm
 ; Writing to M0 from an SMRD instruction will hang the GPU.
 ; CHECK-NOT: s_buffer_load_dword m0
 ; CHECK: s_endpgm
@@ -686,7 +688,7 @@ attributes #2 = { readnone }
 attributes #3 = { readonly }
 attributes #4 = { nounwind readonly }
 
-!0 = metadata !{metadata !"const", null, i32 1}
+!0 = !{!"const", null, i32 1}
 
 ; CHECK-LABEL: {{^}}main1:
 ; CHECK: s_endpgm
diff --git a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
index 2c146eb..a4475c0 100644
--- a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
 
 declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
 declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
@@ -51,8 +51,8 @@ define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspac
 
 ; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI: buffer_store_dword
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: buffer_store_dword
 define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
@@ -94,12 +94,10 @@ define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1
   ret void
 }
 
-; XXX: Should be able to reorder this, but the laods count as ordered
-
 ; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
 ; CI: buffer_load_dword
-; CI: ds_write_b32
 ; CI: buffer_load_dword
+; CI: ds_write_b32
 ; CI: buffer_store_dword
 define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
   %ptr0 = load i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
@@ -183,11 +181,11 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
 }
 
 ; FUNC-LABEL: @reorder_global_offsets
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0xc
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x190
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x194
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x190
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x194
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
 ; CI: buffer_store_dword
 ; CI: s_endpgm
 define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
diff --git a/test/CodeGen/R600/si-vector-hang.ll b/test/CodeGen/R600/si-vector-hang.ll
index 6f91c71..61812c6 100644
--- a/test/CodeGen/R600/si-vector-hang.ll
+++ b/test/CodeGen/R600/si-vector-hang.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; CHECK: {{^}}test_8_min_char:
 ; CHECK: buffer_store_byte
@@ -96,12 +97,12 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 
 !opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
 
-!0 = metadata !{null}
-!1 = metadata !{null}
-!2 = metadata !{null}
-!3 = metadata !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char}
-!4 = metadata !{null}
-!5 = metadata !{null}
-!6 = metadata !{null}
-!7 = metadata !{null}
-!8 = metadata !{null}
+!0 = !{null}
+!1 = !{null}
+!2 = !{null}
+!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char}
+!4 = !{null}
+!5 = !{null}
+!6 = !{null}
+!7 = !{null}
+!8 = !{null}
diff --git a/test/CodeGen/R600/sign_extend.ll b/test/CodeGen/R600/sign_extend.ll
index 94f4c46..f194759 100644
--- a/test/CodeGen/R600/sign_extend.ll
+++ b/test/CodeGen/R600/sign_extend.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}s_sext_i1_to_i32:
 ; SI: v_cndmask_b32_e64
@@ -23,8 +24,9 @@ entry:
 }
 
 ; SI-LABEL: {{^}}s_sext_i1_to_i64:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc
+; SI: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
 ; SI: s_endpgm
 define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
diff --git a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
index 8d9ee42..28a413c 100644
--- a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
+++ b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
@@ -1,5 +1,6 @@
 ; XFAIL: *
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s
 
 ; 64-bit select was originally lowered with a build_pair, and this
 ; could be simplified to 1 cndmask instead of 2, but that broken when
diff --git a/test/CodeGen/R600/sint_to_fp.f64.ll b/test/CodeGen/R600/sint_to_fp.f64.ll
index 6e4f87c..893cfb3 100644
--- a/test/CodeGen/R600/sint_to_fp.f64.ll
+++ b/test/CodeGen/R600/sint_to_fp.f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
@@ -10,12 +10,13 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
   ret void
 }
 
+; FIXME: select on 0, 0
 ; SI-LABEL: {{^}}sint_to_fp_i1_f64:
 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
@@ -45,9 +46,9 @@ define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
 
 ; SI-LABEL: @v_sint_to_fp_i64_to_f64
 ; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; SI-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]]
-; SI-DAG: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]]
+; SI: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]]
 ; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
+; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]]
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
 define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index 7b6ce43..6a291cf 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll
index 1c7df16..bad1668 100644
--- a/test/CodeGen/R600/smrd.ll
+++ b/test/CodeGen/R600/smrd.ll
@@ -1,8 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s
 
 ; SMRD load with an immediate offset.
-; CHECK-LABEL: {{^}}smrd0:
-; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
+; GCN-LABEL: {{^}}smrd0:
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
 define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 1
@@ -12,8 +14,9 @@ entry:
 }
 
 ; SMRD load with the largest possible immediate offset.
-; CHECK-LABEL: {{^}}smrd1:
-; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; GCN-LABEL: {{^}}smrd1:
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
 define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 255
@@ -23,10 +26,11 @@ entry:
 }
 
 ; SMRD load with an offset greater than the largest possible immediate.
-; CHECK-LABEL: {{^}}smrd2:
-; CHECK: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
-; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
-; CHECK: s_endpgm
+; GCN-LABEL: {{^}}smrd2:
+; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+; GCN: s_endpgm
 define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 256
@@ -36,15 +40,18 @@ entry:
 }
 
 ; SMRD load with a 64-bit offset
-; CHECK-LABEL: {{^}}smrd3:
-; CHECK-DAG: s_mov_b32 s[[SHI:[0-9]+]], 4
-; CHECK-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0 ;
-; FIXME: We don't need to copy these values to VGPRs
-; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
-; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; GCN-LABEL: {{^}}smrd3:
+; FIXME: There are too many copies here because we don't fold immediates
+;        through REG_SEQUENCE
+; SI: s_mov_b32 s[[SLO:[0-9]+]], 0 ;
+; SI: s_mov_b32 s[[SHI:[0-9]+]], 4
+; SI: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; FIXME: We should be able to use s_load_dword here
-; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64
-; CHECK: s_endpgm
+; SI: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64
+; TODO: Add VI checks
+; GCN: s_endpgm
 define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
@@ -54,8 +61,9 @@ entry:
 }
 
 ; SMRD load using the load.const intrinsic with an immediate offset
-; CHECK-LABEL: {{^}}smrd_load_const0:
-; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
+; GCN-LABEL: {{^}}smrd_load_const0:
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
 define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
@@ -67,8 +75,9 @@ main_body:
 
 ; SMRD load using the load.const intrinsic with the largest possible immediate
 ; offset.
-; CHECK-LABEL: {{^}}smrd_load_const1:
-; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; GCN-LABEL: {{^}}smrd_load_const1:
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
 define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
@@ -80,9 +89,10 @@ main_body:
 ; SMRD load using the load.const intrinsic with an offset greater than the
 ; largets possible immediate.
 ; immediate offset.
-; CHECK-LABEL: {{^}}smrd_load_const2:
-; CHECK: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
-; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; GCN-LABEL: {{^}}smrd_load_const2:
+; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
diff --git a/test/CodeGen/R600/split-scalar-i64-add.ll b/test/CodeGen/R600/split-scalar-i64-add.ll
index e3448dc..ec50fd9 100644
--- a/test/CodeGen/R600/split-scalar-i64-add.ll
+++ b/test/CodeGen/R600/split-scalar-i64-add.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() readnone
 
diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll
index 8ba9daa..d6c6ccd 100644
--- a/test/CodeGen/R600/sra.ll
+++ b/test/CodeGen/R600/sra.ll
@@ -1,13 +1,18 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s
 
-;EG-CHECK-LABEL: {{^}}ashr_v2i32:
-;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-LABEL: {{^}}ashr_v2i32:
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK-LABEL: {{^}}ashr_v2i32:
-;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-LABEL: {{^}}ashr_v2i32:
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+;VI-LABEL: {{^}}ashr_v2i32:
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +23,23 @@ define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
   ret void
 }
 
-;EG-CHECK-LABEL: {{^}}ashr_v4i32:
-;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-LABEL: {{^}}ashr_v4i32:
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-LABEL: {{^}}ashr_v4i32:
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-;SI-CHECK-LABEL: {{^}}ashr_v4i32:
-;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI-LABEL: {{^}}ashr_v4i32:
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,11 +50,15 @@ define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
   ret void
 }
 
-;EG-CHECK-LABEL: {{^}}ashr_i64:
-;EG-CHECK: ASHR
+;EG-LABEL: {{^}}ashr_i64:
+;EG: ASHR
+
+;SI-LABEL: {{^}}ashr_i64:
+;SI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
+
+;VI-LABEL: {{^}}ashr_i64:
+;VI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
 
-;SI-CHECK-LABEL: {{^}}ashr_i64:
-;SI-CHECK: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
 define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = sext i32 %in to i64
@@ -52,22 +67,26 @@ entry:
   ret void
 }
 
-;EG-CHECK-LABEL: {{^}}ashr_i64_2:
-;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
-;EG-CHECK: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
-;EG-CHECK: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+;EG-LABEL: {{^}}ashr_i64_2:
+;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
+;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
 ;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-CHECK-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
-;EG-CHECK-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
-;EG-CHECK-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
-;EG-CHECK-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
-;EG-CHECK-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
-;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
-
-;SI-CHECK-LABEL: {{^}}ashr_i64_2:
-;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
+;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
+;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
+;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+
+;SI-LABEL: {{^}}ashr_i64_2:
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI-LABEL: {{^}}ashr_i64_2:
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
 define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
   %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
@@ -78,35 +97,39 @@ entry:
   ret void
 }
 
-;EG-CHECK-LABEL: {{^}}ashr_v2i64:
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHB]]
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: ASHR {{.*}}, [[SHA]]
-;EG-CHECK-DAG: ASHR {{.*}}, [[SHB]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ASHR
-;EG-CHECK-DAG: ASHR
-;EG-CHECK-DAG: ASHR {{.*}}, literal
-;EG-CHECK-DAG: ASHR {{.*}}, literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-
-;SI-CHECK-LABEL: {{^}}ashr_v2i64:
-;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;EG-LABEL: {{^}}ashr_v2i64:
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: ASHR {{.*}}, [[SHA]]
+;EG-DAG: ASHR {{.*}}, [[SHB]]
+;EG-DAG: LSHR {{.*}}, [[SHA]]
+;EG-DAG: LSHR {{.*}}, [[SHB]]
+;EG-DAG: OR_INT
+;EG-DAG: OR_INT
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ASHR
+;EG-DAG: ASHR
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+
+;SI-LABEL: {{^}}ashr_v2i64:
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI-LABEL: {{^}}ashr_v2i64:
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
 define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
@@ -117,61 +140,67 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
   ret void
 }
 
-;EG-CHECK-LABEL: {{^}}ashr_v4i64:
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHB]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHC]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHD]]
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: ASHR {{.*}}, [[SHA]]
-;EG-CHECK-DAG: ASHR {{.*}}, [[SHB]]
-;EG-CHECK-DAG: ASHR {{.*}}, [[SHC]]
-;EG-CHECK-DAG: ASHR {{.*}}, [[SHD]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ASHR
-;EG-CHECK-DAG: ASHR
-;EG-CHECK-DAG: ASHR
-;EG-CHECK-DAG: ASHR
-;EG-CHECK-DAG: ASHR {{.*}}, literal
-;EG-CHECK-DAG: ASHR {{.*}}, literal
-;EG-CHECK-DAG: ASHR {{.*}}, literal
-;EG-CHECK-DAG: ASHR {{.*}}, literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-
-;SI-CHECK-LABEL: {{^}}ashr_v4i64:
-;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;EG-LABEL: {{^}}ashr_v4i64:
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHC]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHD]]
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: ASHR {{.*}}, [[SHA]]
+;EG-DAG: ASHR {{.*}}, [[SHB]]
+;EG-DAG: ASHR {{.*}}, [[SHC]]
+;EG-DAG: ASHR {{.*}}, [[SHD]]
+;EG-DAG: LSHR {{.*}}, [[SHA]]
+;EG-DAG: LSHR {{.*}}, [[SHB]]
+;EG-DAG: LSHR {{.*}}, [[SHA]]
+;EG-DAG: LSHR {{.*}}, [[SHB]]
+;EG-DAG: OR_INT
+;EG-DAG: OR_INT
+;EG-DAG: OR_INT
+;EG-DAG: OR_INT
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT  {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ASHR
+;EG-DAG: ASHR
+;EG-DAG: ASHR
+;EG-DAG: ASHR
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+
+;SI-LABEL: {{^}}ashr_v4i64:
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI-LABEL: {{^}}ashr_v4i64:
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
 define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
diff --git a/test/CodeGen/R600/srem.ll b/test/CodeGen/R600/srem.ll
index 65e3395..510db0e 100644
--- a/test/CodeGen/R600/srem.ll
+++ b/test/CodeGen/R600/srem.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s
 
 define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -17,6 +18,19 @@ define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   ret void
 }
 
+; FUNC-LABEL: {{^}}srem_i32_7:
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493
+; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]],
+; SI: v_mul_lo_i32
+; SI: v_sub_i32
+; SI: s_endpgm
+define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %num = load i32 addrspace(1) * %in
+  %result = srem i32 %num, 7
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
 define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %den_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
   %num = load <2 x i32> addrspace(1) * %in
@@ -48,3 +62,51 @@ define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)*
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %den_ptr = getelementptr i64 addrspace(1)* %in, i64 1
+  %num = load i64 addrspace(1) * %in
+  %den = load i64 addrspace(1) * %den_ptr
+  %result = srem i64 %num, %den
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %num = load i64 addrspace(1) * %in
+  %result = srem i64 %num, 4
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+  %den_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
+  %num = load <2 x i64> addrspace(1) * %in
+  %den = load <2 x i64> addrspace(1) * %den_ptr
+  %result = srem <2 x i64> %num, %den
+  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+  %num = load <2 x i64> addrspace(1) * %in
+  %result = srem <2 x i64> %num, <i64 4, i64 4>
+  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+  %den_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
+  %num = load <4 x i64> addrspace(1) * %in
+  %den = load <4 x i64> addrspace(1) * %den_ptr
+  %result = srem <4 x i64> %num, %den
+  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+  ret void
+}
+
+define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+  %num = load <4 x i64> addrspace(1) * %in
+  %result = srem <4 x i64> %num, <i64 4, i64 4, i64 4, i64 4>
+  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll
index 8c5daf6..1f9b620 100644
--- a/test/CodeGen/R600/srl.ll
+++ b/test/CodeGen/R600/srl.ll
@@ -1,166 +1,185 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}lshr_i32:
+; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %a = load i32 addrspace(1)* %in
+  %b = load i32 addrspace(1)* %b_ptr
+  %result = lshr i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
 
-;EG-CHECK: {{^}}lshr_v2i32:
-;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: {{^}}lshr_v2i32:
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-;SI-CHECK: {{^}}lshr_v2i32:
-;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %a = load <2 x i32> addrspace(1)* %in
+  %b = load <2 x i32> addrspace(1)* %b_ptr
   %result = lshr <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-
-;EG-CHECK: {{^}}lshr_v4i32:
-;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-;SI-CHECK: {{^}}lshr_v4i32:
-;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-
+; FUNC-LABEL: {{^}}lshr_v4i32:
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %a = load <4 x i32> addrspace(1)* %in
+  %b = load <4 x i32> addrspace(1)* %b_ptr
   %result = lshr <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
 
-;EG-CHECK: {{^}}lshr_i64:
-;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
-;EG-CHECK: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
-;EG-CHECK: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
-;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-CHECK-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
-;EG-CHECK-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
-;EG-CHECK-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
-;EG-CHECK-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
-;EG-CHECK-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
-;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
-
-;SI-CHECK: {{^}}lshr_i64:
-;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-
+; FUNC-LABEL: {{^}}lshr_i64:
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
+; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
+; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
-  %a = load i64 addrspace(1) * %in
-  %b = load i64 addrspace(1) * %b_ptr
+  %a = load i64 addrspace(1)* %in
+  %b = load i64 addrspace(1)* %b_ptr
   %result = lshr i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-;EG-CHECK: {{^}}lshr_v2i64:
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHB]]
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: LSHR
-;EG-CHECK-DAG: LSHR
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-
-;SI-CHECK: {{^}}lshr_v2i64:
-;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-
+; FUNC-LABEL: {{^}}lshr_v2i64:
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
 define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
-  %a = load <2 x i64> addrspace(1) * %in
-  %b = load <2 x i64> addrspace(1) * %b_ptr
+  %a = load <2 x i64> addrspace(1)* %in
+  %b = load <2 x i64> addrspace(1)* %b_ptr
   %result = lshr <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
 }
 
-
-;EG-CHECK: {{^}}lshr_v4i64:
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHB]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHC]]
-;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHD]]
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: LSHL {{.*}}, 1
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHC]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHD]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHC]]
-;EG-CHECK-DAG: LSHR {{.*}}, [[SHD]]
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: OR_INT
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: ADD_INT  {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-CHECK-DAG: LSHR
-;EG-CHECK-DAG: LSHR
-;EG-CHECK-DAG: LSHR
-;EG-CHECK-DAG: LSHR
-;EG-CHECK-DAG: LSHR
-;EG-CHECK-DAG: LSHR
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
-;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-;EG-CHECK-DAG: CNDE_INT
-
-;SI-CHECK: {{^}}lshr_v4i64:
-;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-
+; FUNC-LABEL: {{^}}lshr_v4i64:
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHC]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHD]]
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: LSHR {{.*}}, [[SHC]]
+; EG-DAG: LSHR {{.*}}, [[SHD]]
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: LSHR {{.*}}, [[SHC]]
+; EG-DAG: LSHR {{.*}}, [[SHD]]
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: ADD_INT  {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT  {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT  {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT  {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
 define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
-  %a = load <4 x i64> addrspace(1) * %in
-  %b = load <4 x i64> addrspace(1) * %b_ptr
+  %a = load <4 x i64> addrspace(1)* %in
+  %b = load <4 x i64> addrspace(1)* %b_ptr
   %result = lshr <4 x i64> %a, %b
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/ssubo.ll b/test/CodeGen/R600/ssubo.ll
index 8031c6f..09d3959 100644
--- a/test/CodeGen/R600/ssubo.ll
+++ b/test/CodeGen/R600/ssubo.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
 
 declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/R600/store-barrier.ll b/test/CodeGen/R600/store-barrier.ll
index 350b006..ea65bb0 100644
--- a/test/CodeGen/R600/store-barrier.ll
+++ b/test/CodeGen/R600/store-barrier.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck  --check-prefix=CHECK %s
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck  --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck  --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck  --check-prefix=CHECK %s
 
 ; This test is for a bug in the machine scheduler where stores without
 ; an underlying object would be moved across the barrier.  In this
diff --git a/test/CodeGen/R600/store-v3i32.ll b/test/CodeGen/R600/store-v3i32.ll
index 0f28f33..33617b5 100644
--- a/test/CodeGen/R600/store-v3i32.ll
+++ b/test/CodeGen/R600/store-v3i32.ll
@@ -1,5 +1,6 @@
 ; XFAIL: *
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
 
 ; 3 vectors have the same size and alignment as 4 vectors, so this
 ; should be done in a single store.
diff --git a/test/CodeGen/R600/store-v3i64.ll b/test/CodeGen/R600/store-v3i64.ll
index 247a561..e0c554a 100644
--- a/test/CodeGen/R600/store-v3i64.ll
+++ b/test/CodeGen/R600/store-v3i64.ll
@@ -1,5 +1,6 @@
 ; XFAIL: *
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}global_store_v3i64:
 ; SI: buffer_store_dwordx4
diff --git a/test/CodeGen/R600/store-vector-ptrs.ll b/test/CodeGen/R600/store-vector-ptrs.ll
index aee639b..ba4d94f 100644
--- a/test/CodeGen/R600/store-vector-ptrs.ll
+++ b/test/CodeGen/R600/store-vector-ptrs.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s
 
 ; This tests for a bug that caused a crash in
 ; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index 713ecd6..e4cb313 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -1,13 +1,14 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=CM-CHECK --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
 
 ;===------------------------------------------------------------------------===;
 ; Global Address Space
 ;===------------------------------------------------------------------------===;
 ; FUNC-LABEL: {{^}}store_i1:
-; EG-CHECK: MEM_RAT MSKOR
-; SI-CHECK: buffer_store_byte
+; EG: MEM_RAT MSKOR
+; SI: buffer_store_byte
 define void @store_i1(i1 addrspace(1)* %out) {
 entry:
   store i1 true, i1 addrspace(1)* %out
@@ -15,27 +16,29 @@ entry:
 }
 
 ; i8 store
-; EG-CHECK-LABEL: {{^}}store_i8:
-; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
-; EG-CHECK: VTX_READ_8 [[VAL:T[0-9]\.X]], [[VAL]]
+; EG-LABEL: {{^}}store_i8:
+; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+
 ; IG 0: Get the byte index and truncate the value
-; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
-; EG-CHECK-NEXT: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
-; EG-CHECK-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
+; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+
+
 ; IG 1: Truncate the calculated the shift amount for the mask
-; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
-; EG-CHECK-NEXT: 3
+
 ; IG 2: Shift the value and the mask
-; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
-; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
-; EG-CHECK-NEXT: 255
+; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
+; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
+; EG-NEXT: 255
 ; IG 3: Initialize the Y and Z channels to zero
 ;       XXX: An optimal scheduler should merge this into one of the prevous IGs.
-; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
-; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0
+; EG: MOV T[[RW_GPR]].Y, 0.0
+; EG: MOV * T[[RW_GPR]].Z, 0.0
 
-; SI-CHECK-LABEL: {{^}}store_i8:
-; SI-CHECK: buffer_store_byte
+; SI-LABEL: {{^}}store_i8:
+; SI: buffer_store_byte
 
 define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
 entry:
@@ -44,39 +47,44 @@ entry:
 }
 
 ; i16 store
-; EG-CHECK-LABEL: {{^}}store_i16:
-; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
-; EG-CHECK: VTX_READ_16 [[VAL:T[0-9]\.X]], [[VAL]]
+; EG-LABEL: {{^}}store_i16:
+; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+
 ; IG 0: Get the byte index and truncate the value
-; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
-; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
-; EG-CHECK-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
+
+
+; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG-NEXT: 3(4.203895e-45),
+
+; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
+
+; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
 ; IG 1: Truncate the calculated the shift amount for the mask
-; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
-; EG-CHECK: 3
+
 ; IG 2: Shift the value and the mask
-; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
-; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
-; EG-CHECK-NEXT: 65535
+; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
+; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
+; EG-NEXT: 65535
 ; IG 3: Initialize the Y and Z channels to zero
 ;       XXX: An optimal scheduler should merge this into one of the prevous IGs.
-; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
-; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0
+; EG: MOV T[[RW_GPR]].Y, 0.0
+; EG: MOV * T[[RW_GPR]].Z, 0.0
 
-; SI-CHECK-LABEL: {{^}}store_i16:
-; SI-CHECK: buffer_store_short
+; SI-LABEL: {{^}}store_i16:
+; SI: buffer_store_short
 define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
 entry:
   store i16 %in, i16 addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}store_v2i8:
-; EG-CHECK: MEM_RAT MSKOR
-; EG-CHECK-NOT: MEM_RAT MSKOR
-; SI-CHECK-LABEL: {{^}}store_v2i8:
-; SI-CHECK: buffer_store_byte
-; SI-CHECK: buffer_store_byte
+; EG-LABEL: {{^}}store_v2i8:
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+; SI-LABEL: {{^}}store_v2i8:
+; SI: buffer_store_byte
+; SI: buffer_store_byte
 define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
@@ -85,13 +93,13 @@ entry:
 }
 
 
-; EG-CHECK-LABEL: {{^}}store_v2i16:
-; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: {{^}}store_v2i16:
-; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: {{^}}store_v2i16:
-; SI-CHECK: buffer_store_short
-; SI-CHECK: buffer_store_short
+; EG-LABEL: {{^}}store_v2i16:
+; EG: MEM_RAT_CACHELESS STORE_RAW
+; CM-LABEL: {{^}}store_v2i16:
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+; SI-LABEL: {{^}}store_v2i16:
+; SI: buffer_store_short
+; SI: buffer_store_short
 define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
@@ -99,15 +107,15 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}store_v4i8:
-; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: {{^}}store_v4i8:
-; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: {{^}}store_v4i8:
-; SI-CHECK: buffer_store_byte
-; SI-CHECK: buffer_store_byte
-; SI-CHECK: buffer_store_byte
-; SI-CHECK: buffer_store_byte
+; EG-LABEL: {{^}}store_v4i8:
+; EG: MEM_RAT_CACHELESS STORE_RAW
+; CM-LABEL: {{^}}store_v4i8:
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+; SI-LABEL: {{^}}store_v4i8:
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
 define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
@@ -116,30 +124,30 @@ entry:
 }
 
 ; floating-point store
-; EG-CHECK-LABEL: {{^}}store_f32:
-; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
-; CM-CHECK-LABEL: {{^}}store_f32:
-; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK-LABEL: {{^}}store_f32:
-; SI-CHECK: buffer_store_dword
+; EG-LABEL: {{^}}store_f32:
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
+; CM-LABEL: {{^}}store_f32:
+; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI-LABEL: {{^}}store_f32:
+; SI: buffer_store_dword
 
 define void @store_f32(float addrspace(1)* %out, float %in) {
   store float %in, float addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}store_v4i16:
-; EG-CHECK: MEM_RAT MSKOR
-; EG-CHECK: MEM_RAT MSKOR
-; EG-CHECK: MEM_RAT MSKOR
-; EG-CHECK: MEM_RAT MSKOR
-; EG-CHECK-NOT: MEM_RAT MSKOR
-; SI-CHECK-LABEL: {{^}}store_v4i16:
-; SI-CHECK: buffer_store_short
-; SI-CHECK: buffer_store_short
-; SI-CHECK: buffer_store_short
-; SI-CHECK: buffer_store_short
-; SI-CHECK-NOT: buffer_store_byte
+; EG-LABEL: {{^}}store_v4i16:
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+; SI-LABEL: {{^}}store_v4i16:
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI-NOT: buffer_store_byte
 define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i16>
@@ -148,12 +156,12 @@ entry:
 }
 
 ; vec2 floating-point stores
-; EG-CHECK-LABEL: {{^}}store_v2f32:
-; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: {{^}}store_v2f32:
-; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: {{^}}store_v2f32:
-; SI-CHECK: buffer_store_dwordx2
+; EG-LABEL: {{^}}store_v2f32:
+; EG: MEM_RAT_CACHELESS STORE_RAW
+; CM-LABEL: {{^}}store_v2f32:
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+; SI-LABEL: {{^}}store_v2f32:
+; SI: buffer_store_dwordx2
 
 define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
 entry:
@@ -163,14 +171,14 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}store_v4i32:
-; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; EG-CHECK-NOT: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: {{^}}store_v4i32:
-; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; CM-CHECK-NOT: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: {{^}}store_v4i32:
-; SI-CHECK: buffer_store_dwordx4
+; EG-LABEL: {{^}}store_v4i32:
+; EG: MEM_RAT_CACHELESS STORE_RAW
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
+; CM-LABEL: {{^}}store_v4i32:
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
+; SI-LABEL: {{^}}store_v4i32:
+; SI: buffer_store_dwordx4
 define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out
@@ -178,8 +186,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}store_i64_i8:
-; EG-CHECK: MEM_RAT MSKOR
-; SI-CHECK: buffer_store_byte
+; EG: MEM_RAT MSKOR
+; SI: buffer_store_byte
 define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
@@ -188,8 +196,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}store_i64_i16:
-; EG-CHECK: MEM_RAT MSKOR
-; SI-CHECK: buffer_store_short
+; EG: MEM_RAT MSKOR
+; SI: buffer_store_short
 define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
@@ -202,89 +210,89 @@ entry:
 ;===------------------------------------------------------------------------===;
 
 ; FUNC-LABEL: {{^}}store_local_i1:
-; EG-CHECK: LDS_BYTE_WRITE
-; SI-CHECK: ds_write_b8
+; EG: LDS_BYTE_WRITE
+; SI: ds_write_b8
 define void @store_local_i1(i1 addrspace(3)* %out) {
 entry:
   store i1 true, i1 addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}store_local_i8:
-; EG-CHECK: LDS_BYTE_WRITE
-; SI-CHECK-LABEL: {{^}}store_local_i8:
-; SI-CHECK: ds_write_b8
+; EG-LABEL: {{^}}store_local_i8:
+; EG: LDS_BYTE_WRITE
+; SI-LABEL: {{^}}store_local_i8:
+; SI: ds_write_b8
 define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
   store i8 %in, i8 addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}store_local_i16:
-; EG-CHECK: LDS_SHORT_WRITE
-; SI-CHECK-LABEL: {{^}}store_local_i16:
-; SI-CHECK: ds_write_b16
+; EG-LABEL: {{^}}store_local_i16:
+; EG: LDS_SHORT_WRITE
+; SI-LABEL: {{^}}store_local_i16:
+; SI: ds_write_b16
 define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
   store i16 %in, i16 addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}store_local_v2i16:
-; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: {{^}}store_local_v2i16:
-; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: {{^}}store_local_v2i16:
-; SI-CHECK: ds_write_b16
-; SI-CHECK: ds_write_b16
+; EG-LABEL: {{^}}store_local_v2i16:
+; EG: LDS_WRITE
+; CM-LABEL: {{^}}store_local_v2i16:
+; CM: LDS_WRITE
+; SI-LABEL: {{^}}store_local_v2i16:
+; SI: ds_write_b16
+; SI: ds_write_b16
 define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}store_local_v4i8:
-; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: {{^}}store_local_v4i8:
-; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: {{^}}store_local_v4i8:
-; SI-CHECK: ds_write_b8
-; SI-CHECK: ds_write_b8
-; SI-CHECK: ds_write_b8
-; SI-CHECK: ds_write_b8
+; EG-LABEL: {{^}}store_local_v4i8:
+; EG: LDS_WRITE
+; CM-LABEL: {{^}}store_local_v4i8:
+; CM: LDS_WRITE
+; SI-LABEL: {{^}}store_local_v4i8:
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
 define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}store_local_v2i32:
-; EG-CHECK: LDS_WRITE
-; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: {{^}}store_local_v2i32:
-; CM-CHECK: LDS_WRITE
-; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: {{^}}store_local_v2i32:
-; SI-CHECK: ds_write_b64
+; EG-LABEL: {{^}}store_local_v2i32:
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+; CM-LABEL: {{^}}store_local_v2i32:
+; CM: LDS_WRITE
+; CM: LDS_WRITE
+; SI-LABEL: {{^}}store_local_v2i32:
+; SI: ds_write_b64
 define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: {{^}}store_local_v4i32:
-; EG-CHECK: LDS_WRITE
-; EG-CHECK: LDS_WRITE
-; EG-CHECK: LDS_WRITE
-; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: {{^}}store_local_v4i32:
-; CM-CHECK: LDS_WRITE
-; CM-CHECK: LDS_WRITE
-; CM-CHECK: LDS_WRITE
-; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: {{^}}store_local_v4i32:
-; SI-CHECK: ds_write_b32
-; SI-CHECK: ds_write_b32
-; SI-CHECK: ds_write_b32
-; SI-CHECK: ds_write_b32
+; EG-LABEL: {{^}}store_local_v4i32:
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+; CM-LABEL: {{^}}store_local_v4i32:
+; CM: LDS_WRITE
+; CM: LDS_WRITE
+; CM: LDS_WRITE
+; CM: LDS_WRITE
+; SI-LABEL: {{^}}store_local_v4i32:
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: ds_write_b32
 define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(3)* %out
@@ -292,8 +300,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}store_local_i64_i8:
-; EG-CHECK: LDS_BYTE_WRITE
-; SI-CHECK: ds_write_b8
+; EG: LDS_BYTE_WRITE
+; SI: ds_write_b8
 define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
@@ -302,8 +310,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}store_local_i64_i16:
-; EG-CHECK: LDS_SHORT_WRITE
-; SI-CHECK: ds_write_b16
+; EG: LDS_SHORT_WRITE
+; SI: ds_write_b16
 define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
@@ -318,12 +326,12 @@ entry:
 ; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
 ; be two 32-bit stores.
 
-; EG-CHECK-LABEL: {{^}}vecload2:
-; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: {{^}}vecload2:
-; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: {{^}}vecload2:
-; SI-CHECK: buffer_store_dwordx2
+; EG-LABEL: {{^}}vecload2:
+; EG: MEM_RAT_CACHELESS STORE_RAW
+; CM-LABEL: {{^}}vecload2:
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+; SI-LABEL: {{^}}vecload2:
+; SI: buffer_store_dwordx2
 define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
 entry:
   %0 = load i32 addrspace(2)* %mem, align 4
@@ -341,14 +349,14 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 
 ; FUNC-LABEL: {{^}}"i128-const-store":
 ; FIXME: We should be able to to this with one store instruction
-; EG-CHECK: STORE_RAW
-; EG-CHECK: STORE_RAW
-; EG-CHECK: STORE_RAW
-; EG-CHECK: STORE_RAW
-; CM-CHECK: STORE_DWORD
-; CM-CHECK: STORE_DWORD
-; CM-CHECK: STORE_DWORD
-; CM-CHECK: STORE_DWORD
+; EG: STORE_RAW
+; EG: STORE_RAW
+; EG: STORE_RAW
+; EG: STORE_RAW
+; CM: STORE_DWORD
+; CM: STORE_DWORD
+; CM: STORE_DWORD
+; CM: STORE_DWORD
 ; SI: buffer_store_dwordx2
 ; SI: buffer_store_dwordx2
 define void @i128-const-store(i32 addrspace(1)* %out) {
diff --git a/test/CodeGen/R600/store.r600.ll b/test/CodeGen/R600/store.r600.ll
index 3df30d4..2197260 100644
--- a/test/CodeGen/R600/store.r600.ll
+++ b/test/CodeGen/R600/store.r600.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
 
 ; XXX: Merge this test into store.ll once it is supported on SI
 
 ; v4i32 store
-; EG-CHECK: {{^}}store_v4i32:
-; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
+; EG: {{^}}store_v4i32:
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
 
 define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %1 = load <4 x i32> addrspace(1) * %in
@@ -13,8 +13,8 @@ define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
 }
 
 ; v4f32 store
-; EG-CHECK: {{^}}store_v4f32:
-; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
+; EG: {{^}}store_v4f32:
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
 define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %1 = load <4 x float> addrspace(1) * %in
   store <4 x float> %1, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll
index 2bbc0cf..be48e18 100644
--- a/test/CodeGen/R600/sub.ll
+++ b/test/CodeGen/R600/sub.ll
@@ -1,16 +1,31 @@
-;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-;RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
 
 declare i32 @llvm.r600.read.tidig.x() readnone
 
-;FUNC-LABEL: {{^}}test2:
-;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: {{^}}test_sub_i32:
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_subrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %a = load i32 addrspace(1)* %in
+  %b = load i32 addrspace(1)* %b_ptr
+  %result = sub i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
 
-;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+; FUNC-LABEL: {{^}}test_sub_v2i32:
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32> addrspace(1) * %in
   %b = load <2 x i32> addrspace(1) * %b_ptr
@@ -19,18 +34,18 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   ret void
 }
 
-;FUNC-LABEL: {{^}}test4:
-;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: {{^}}test_sub_v4i32:
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
@@ -73,3 +88,39 @@ define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FUNC-LABEL: {{^}}v_test_sub_v2i64:
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %a_ptr = getelementptr <2 x i64> addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr <2 x i64> addrspace(1)* %inB, i32 %tid
+  %a = load <2 x i64> addrspace(1)* %a_ptr
+  %b = load <2 x i64> addrspace(1)* %b_ptr
+  %result = sub <2 x i64> %a, %b
+  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_sub_v4i64:
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
+  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %a_ptr = getelementptr <4 x i64> addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr <4 x i64> addrspace(1)* %inB, i32 %tid
+  %a = load <4 x i64> addrspace(1)* %a_ptr
+  %b = load <4 x i64> addrspace(1)* %b_ptr
+  %result = sub <4 x i64> %a, %b
+  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/subreg-coalescer-crash.ll b/test/CodeGen/R600/subreg-coalescer-crash.ll
new file mode 100644
index 0000000..c4dae47
--- /dev/null
+++ b/test/CodeGen/R600/subreg-coalescer-crash.ll
@@ -0,0 +1,109 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s
+
+; SI-LABEL:{{^}}row_filter_C1_D0:
+; SI: s_endpgm
+; Function Attrs: nounwind
+define void @row_filter_C1_D0() {
+entry:
+  br i1 undef, label %for.inc.1, label %do.body.preheader
+
+do.body.preheader:                                ; preds = %entry
+  %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1
+  br i1 undef, label %do.body56.1, label %do.body90
+
+do.body90:                                        ; preds = %do.body56.2, %do.body56.1, %do.body.preheader
+  %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ]
+  %2 = insertelement <4 x i32> %1, i32 undef, i32 2
+  %3 = insertelement <4 x i32> %2, i32 undef, i32 3
+  br i1 undef, label %do.body124.1, label %do.body.1562.preheader
+
+do.body.1562.preheader:                           ; preds = %do.body124.1, %do.body90
+  %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ]
+  %4 = insertelement <4 x i32> undef, i32 undef, i32 1
+  br label %for.inc.1
+
+do.body56.1:                                      ; preds = %do.body.preheader
+  %5 = insertelement <4 x i32> %0, i32 undef, i32 1
+  %or.cond472.1 = or i1 undef, undef
+  br i1 %or.cond472.1, label %do.body56.2, label %do.body90
+
+do.body56.2:                                      ; preds = %do.body56.1
+  %6 = insertelement <4 x i32> %5, i32 undef, i32 1
+  br label %do.body90
+
+do.body124.1:                                     ; preds = %do.body90
+  %7 = insertelement <4 x i32> %3, i32 undef, i32 3
+  br label %do.body.1562.preheader
+
+for.inc.1:                                        ; preds = %do.body.1562.preheader, %entry
+  %storemerge591 = phi <4 x i32> [ zeroinitializer, %entry ], [ %storemerge, %do.body.1562.preheader ]
+  %add.i495 = add <4 x i32> %storemerge591, undef
+  unreachable
+}
+
+; SI-LABEL: {{^}}foo:
+; SI: s_endpgm
+define void @foo() #0 {
+bb:
+  br i1 undef, label %bb2, label %bb1
+
+bb1:                                              ; preds = %bb
+  br i1 undef, label %bb4, label %bb6
+
+bb2:                                              ; preds = %bb4, %bb
+  %tmp = phi float [ %tmp5, %bb4 ], [ 0.000000e+00, %bb ]
+  br i1 undef, label %bb9, label %bb13
+
+bb4:                                              ; preds = %bb7, %bb6, %bb1
+  %tmp5 = phi float [ undef, %bb1 ], [ undef, %bb6 ], [ %tmp8, %bb7 ]
+  br label %bb2
+
+bb6:                                              ; preds = %bb1
+  br i1 undef, label %bb7, label %bb4
+
+bb7:                                              ; preds = %bb6
+  %tmp8 = fmul float undef, undef
+  br label %bb4
+
+bb9:                                              ; preds = %bb2
+  %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2)
+  %tmp11 = extractelement <4 x float> %tmp10, i32 1
+  %tmp12 = extractelement <4 x float> %tmp10, i32 3
+  br label %bb14
+
+bb13:                                             ; preds = %bb2
+  br i1 undef, label %bb23, label %bb24
+
+bb14:                                             ; preds = %bb27, %bb24, %bb9
+  %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ]
+  %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ]
+  %tmp17 = fmul float 10.5, %tmp16
+  %tmp18 = fmul float 11.5, %tmp15
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17)
+  ret void
+
+bb23:                                             ; preds = %bb13
+  br i1 undef, label %bb24, label %bb26
+
+bb24:                                             ; preds = %bb26, %bb23, %bb13
+  %tmp25 = phi float [ %tmp, %bb13 ], [ %tmp, %bb26 ], [ 0.000000e+00, %bb23 ]
+  br i1 undef, label %bb27, label %bb14
+
+bb26:                                             ; preds = %bb23
+  br label %bb24
+
+bb27:                                             ; preds = %bb24
+  br label %bb14
+}
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/swizzle-export.ll b/test/CodeGen/R600/swizzle-export.ll
index 3e6f7a7..5eaca76 100644
--- a/test/CodeGen/R600/swizzle-export.ll
+++ b/test/CodeGen/R600/swizzle-export.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
 
-;EG-CHECK: {{^}}main:
-;EG-CHECK: EXPORT T{{[0-9]+}}.XYXX
-;EG-CHECK: EXPORT T{{[0-9]+}}.ZXXX
-;EG-CHECK: EXPORT T{{[0-9]+}}.XXWX
-;EG-CHECK: EXPORT T{{[0-9]+}}.XXXW
+;EG: {{^}}main:
+;EG: EXPORT T{{[0-9]+}}.XYXX
+;EG: EXPORT T{{[0-9]+}}.ZXXX
+;EG: EXPORT T{{[0-9]+}}.XXWX
+;EG: EXPORT T{{[0-9]+}}.XXXW
 
 define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
@@ -92,9 +92,9 @@ main_body:
   ret void
 }
 
-; EG-CHECK: {{^}}main2:
-; EG-CHECK: T{{[0-9]+}}.XY__
-; EG-CHECK: T{{[0-9]+}}.ZXY0
+; EG: {{^}}main2:
+; EG: T{{[0-9]+}}.XY__
+; EG: T{{[0-9]+}}.ZXY0
 
 define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
diff --git a/test/CodeGen/R600/trunc-cmp-constant.ll b/test/CodeGen/R600/trunc-cmp-constant.ll
new file mode 100644
index 0000000..a097ab0
--- /dev/null
+++ b/test/CodeGen/R600/trunc-cmp-constant.ll
@@ -0,0 +1,170 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_eq_0:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[TMP]], 1{{$}}
+; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1{{$}}
+; SI: v_cndmask_b32_e64
+; SI: buffer_store_byte
+define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = sext i1 %load to i32
+  %cmp = icmp eq i32 %ext, 0
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: The negate should be inverting the compare.
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_0:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; SI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 1{{$}}
+; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], [[CMP0]], -1
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = zext i1 %load to i32
+  %cmp = icmp eq i32 %ext, 0
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1:
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI: buffer_store_byte [[RESULT]]
+define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = sext i1 %load to i32
+  %cmp = icmp eq i32 %ext, 1
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = zext i1 %load to i32
+  %cmp = icmp eq i32 %ext, 1
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = sext i1 %load to i32
+  %cmp = icmp eq i32 %ext, -1
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1:
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = zext i1 %load to i32
+  %cmp = icmp eq i32 %ext, -1
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+
+; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_ne_0:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = sext i1 %load to i32
+  %cmp = icmp ne i32 %ext, 0
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = zext i1 %load to i32
+  %cmp = icmp ne i32 %ext, 0
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1:
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
+; SI: buffer_store_byte [[RESULT]]
+define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = sext i1 %load to i32
+  %cmp = icmp ne i32 %ext, 1
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_1:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; SI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 1{{$}}
+; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], [[CMP0]], -1
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = zext i1 %load to i32
+  %cmp = icmp ne i32 %ext, 1
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: This should be one compare.
+; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_neg1:
+; XSI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; XSI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; XSI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}}
+; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]]
+; XSI-NEXT: buffer_store_byte [[RESULT]]
+define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = sext i1 %load to i32
+  %cmp = icmp ne i32 %ext, -1
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1:
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
+; SI: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+  %load = load i1 addrspace(1)* %in
+  %ext = zext i1 %load to i32
+  %cmp = icmp ne i32 %ext, -1
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}masked_load_i1_to_i32_trunc_cmp_ne_neg1:
+; SI: buffer_load_sbyte [[LOAD:v[0-9]+]]
+; SI: v_cmp_ne_i32_e64 {{s\[[0-9]+:[0-9]+\]}}, [[LOAD]], -1{{$}}
+; SI-NEXT: v_cndmask_b32_e64
+; SI-NEXT: buffer_store_byte
+define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+  %load = load i8 addrspace(1)* %in
+  %masked = and i8 %load, 255
+  %ext = sext i8 %masked to i32
+  %cmp = icmp ne i32 %ext, -1
+  store i1 %cmp, i1 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/trunc-store-i1.ll b/test/CodeGen/R600/trunc-store-i1.ll
index 3c1b19f..b71a838 100644
--- a/test/CodeGen/R600/trunc-store-i1.ll
+++ b/test/CodeGen/R600/trunc-store-i1.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
 
 ; SI-LABEL: {{^}}global_truncstore_i32_to_i1:
diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll
index 7519d10..fa44264 100644
--- a/test/CodeGen/R600/trunc.ll
+++ b/test/CodeGen/R600/trunc.ll
@@ -1,6 +1,8 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
 
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
 define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
 ; SI-LABEL: {{^}}trunc_i64_to_i32_store:
 ; SI: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], 0xb
@@ -34,6 +36,8 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
 ; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
 ; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
 ; SI: s_addc_u32
+; SI: v_mov_b32_e32
+; SI: v_mov_b32_e32
 ; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
 ; SI: buffer_store_dword v[[LO_VREG]],
 define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
@@ -65,3 +69,32 @@ define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; SI-LABEL: {{^}}s_trunc_i64_to_i1:
+; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_and_b32_e64 [[MASKED:v[0-9]+]], 1, s[[SLO]]
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[MASKED]], 1
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, [[CMP]]
+define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
+  %trunc = trunc i64 %x to i1
+  %sel = select i1 %trunc, i32 63, i32 -12
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}v_trunc_i64_to_i1:
+; SI: buffer_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}}
+; SI: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[MASKED]], 1
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, [[CMP]]
+define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %x = load i64 addrspace(1)* %gep
+
+  %trunc = trunc i64 %x to i1
+  %sel = select i1 %trunc, i32 63, i32 -12
+  store i32 %sel, i32 addrspace(1)* %out.gep
+  ret void
+}
diff --git a/test/CodeGen/R600/tti-unroll-prefs.ll b/test/CodeGen/R600/tti-unroll-prefs.ll
new file mode 100644
index 0000000..0009c42
--- /dev/null
+++ b/test/CodeGen/R600/tti-unroll-prefs.ll
@@ -0,0 +1,58 @@
+; RUN: opt -loop-unroll -S -mtriple=amdgcn-- -mcpu=SI %s | FileCheck %s
+
+; This IR comes from this OpenCL C code:
+;
+; if (b + 4 > a) {
+;   for (int i = 0; i < 4; i++, b++) {
+;     if (b + 1 <= a)
+;       *(dst + c + b) = 0;
+;     else
+;       break;
+;   }
+; }
+;
+; This test is meant to check that this loop isn't unrolled into more than
+; four iterations.  The loop unrolling preferences we currently use cause this
+; loop to not be unrolled at all, but that may change in the future.
+
+; CHECK-LABEL: @test
+; CHECK: store i8 0, i8 addrspace(1)*
+; CHECK-NOT: store i8 0, i8 addrspace(1)*
+; CHECK: ret void
+define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) {
+entry:
+  %add = add nsw i32 %b, 4
+  %cmp = icmp sgt i32 %add, %a
+  br i1 %cmp, label %for.cond.preheader, label %if.end7
+
+for.cond.preheader:                               ; preds = %entry
+  %cmp313 = icmp slt i32 %b, %a
+  br i1 %cmp313, label %if.then4.lr.ph, label %if.end7.loopexit
+
+if.then4.lr.ph:                                   ; preds = %for.cond.preheader
+  %0 = sext i32 %c to i64
+  br label %if.then4
+
+if.then4:                                         ; preds = %if.then4.lr.ph, %if.then4
+  %i.015 = phi i32 [ 0, %if.then4.lr.ph ], [ %inc, %if.then4 ]
+  %b.addr.014 = phi i32 [ %b, %if.then4.lr.ph ], [ %add2, %if.then4 ]
+  %add2 = add nsw i32 %b.addr.014, 1
+  %1 = sext i32 %b.addr.014 to i64
+  %add.ptr.sum = add nsw i64 %1, %0
+  %add.ptr5 = getelementptr inbounds i8 addrspace(1)* %dst, i64 %add.ptr.sum
+  store i8 0, i8 addrspace(1)* %add.ptr5, align 1
+  %inc = add nsw i32 %i.015, 1
+  %cmp1 = icmp slt i32 %inc, 4
+  %cmp3 = icmp slt i32 %add2, %a
+  %or.cond = and i1 %cmp3, %cmp1
+  br i1 %or.cond, label %if.then4, label %for.cond.if.end7.loopexit_crit_edge
+
+for.cond.if.end7.loopexit_crit_edge:              ; preds = %if.then4
+  br label %if.end7.loopexit
+
+if.end7.loopexit:                                 ; preds = %for.cond.if.end7.loopexit_crit_edge, %for.cond.preheader
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.end7.loopexit, %entry
+  ret void
+}
diff --git a/test/CodeGen/R600/uaddo.ll b/test/CodeGen/R600/uaddo.ll
index eb242c1..57d7835 100644
--- a/test/CodeGen/R600/uaddo.ll
+++ b/test/CodeGen/R600/uaddo.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
 
 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
index 59e91f8..0c2c65b 100644
--- a/test/CodeGen/R600/udiv.ll
+++ b/test/CodeGen/R600/udiv.ll
@@ -1,9 +1,10 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
 
-;EG-CHECK-LABEL: {{^}}test:
-;EG-CHECK-NOT: SETGE_INT
-;EG-CHECK: CF_END
+;EG-LABEL: {{^}}test:
+;EG-NOT: SETGE_INT
+;EG: CF_END
 
 define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
@@ -18,10 +19,10 @@ define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ;The goal of this test is to make sure the ISel doesn't fail when it gets
 ;a v4i32 udiv
 
-;EG-CHECK-LABEL: {{^}}test2:
-;EG-CHECK: CF_END
-;SI-CHECK-LABEL: {{^}}test2:
-;SI-CHECK: s_endpgm
+;EG-LABEL: {{^}}test2:
+;EG: CF_END
+;SI-LABEL: {{^}}test2:
+;SI: s_endpgm
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -32,10 +33,10 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   ret void
 }
 
-;EG-CHECK-LABEL: {{^}}test4:
-;EG-CHECK: CF_END
-;SI-CHECK-LABEL: {{^}}test4:
-;SI-CHECK: s_endpgm
+;EG-LABEL: {{^}}test4:
+;EG: CF_END
+;SI-LABEL: {{^}}test4:
+;SI: s_endpgm
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/udivrem.ll b/test/CodeGen/R600/udivrem.ll
index f20705b..b3837f2 100644
--- a/test/CodeGen/R600/udivrem.ll
+++ b/test/CodeGen/R600/udivrem.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}test_udivrem:
@@ -32,8 +33,8 @@
 ; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]]
 ; SI: v_cndmask_b32_e64
 ; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]]
-; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[RCP]], [[E]]
-; SI-DAG: v_sub_i32_e32 [[RCP_S_E:v[0-9]+]], [[RCP]], [[E]]
+; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]]
+; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]]
 ; SI: v_cndmask_b32_e64
 ; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]]
 ; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]]
@@ -112,12 +113,12 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
-; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
-; SI-DAG: v_sub_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
+; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
 ; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}}
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
@@ -135,12 +136,12 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
-; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
-; SI-DAG: v_sub_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
+; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
 ; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}}
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
@@ -262,12 +263,12 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
 ; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
-; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
-; SI-DAG: v_sub_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
+; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
 ; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[l0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}}
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
@@ -285,12 +286,12 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
 ; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
-; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
-; SI-DAG: v_sub_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
+; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
 ; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}}
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
@@ -308,12 +309,12 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
 ; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]]
-; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_RCP]], [[THIRD_E]]
-; SI-DAG: v_sub_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_RCP]], [[THIRD_E]]
+; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]]
 ; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[THIRD_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[THIRD_Num_S_Remainder]]
+; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder:v[0-9]+]], [[THIRD_Num_S_Remainder]], {{v[0-9]+}}
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]]
@@ -331,22 +332,8 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
 ; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]]
-; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_RCP]], [[FOURTH_E]]
-; SI-DAG: v_sub_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_RCP]], [[FOURTH_E]]
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[FOURTH_Quotient:v[0-9]+]]
-; SI-DAG: v_mul_lo_i32 [[FOURTH_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[FOURTH_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FOURTH_Num_S_Remainder]]
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_and_b32_e32 [[FOURTH_Tmp1:v[0-9]+]]
-; SI-DAG: v_add_i32_e32 [[FOURTH_Quotient_A_One:v[0-9]+]], {{.*}}, [[FOURTH_Quotient]]
-; SI-DAG: v_subrev_i32_e32 [[FOURTH_Quotient_S_One:v[0-9]+]],
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32 [[FOURTH_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: v_subrev_i32_e32 [[FOURTH_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
 define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
diff --git a/test/CodeGen/R600/udivrem24.ll b/test/CodeGen/R600/udivrem24.ll
index defb3c0..4b98ac6 100644
--- a/test/CodeGen/R600/udivrem24.ll
+++ b/test/CodeGen/R600/udivrem24.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}udiv24_i8:
diff --git a/test/CodeGen/R600/udivrem64.ll b/test/CodeGen/R600/udivrem64.ll
index 8864c83..9f3069b 100644
--- a/test/CodeGen/R600/udivrem64.ll
+++ b/test/CodeGen/R600/udivrem64.ll
@@ -1,5 +1,6 @@
-;XUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
 ;FUNC-LABEL: {{^}}test_udiv:
 ;EG: RECIP_UINT
@@ -34,7 +35,41 @@
 ;EG: BFE_UINT
 ;EG: BFE_UINT
 ;EG: BFE_UINT
-;SI: s_endpgm
+
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
 define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = udiv i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
@@ -74,9 +109,115 @@ define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;EG: BFE_UINT
 ;EG: BFE_UINT
 ;EG: AND_INT {{.*}}, 1,
-;SI: s_endpgm
+
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
 define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
+
+;FUNC-LABEL: {{^}}test_udiv3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = lshr i64 %x, 33
+  %2 = lshr i64 %y, 33
+  %result = udiv i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_urem3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = lshr i64 %x, 33
+  %2 = lshr i64 %y, 33
+  %result = urem i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_udiv2464:
+;EG: UINT_TO_FLT
+;EG: UINT_TO_FLT
+;EG: FLT_TO_UINT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: v_mad_f32
+;GCN: s_endpgm
+define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = lshr i64 %x, 40
+  %2 = lshr i64 %y, 40
+  %result = udiv i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_urem2464:
+;EG: UINT_TO_FLT
+;EG: UINT_TO_FLT
+;EG: FLT_TO_UINT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: v_mad_f32
+;GCN: s_endpgm
+define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = lshr i64 %x, 40
+  %2 = lshr i64 %y, 40
+  %result = urem i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll
index bddf700..f715243 100644
--- a/test/CodeGen/R600/uint_to_fp.f64.ll
+++ b/test/CodeGen/R600/uint_to_fp.f64.ll
@@ -1,47 +1,12 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-; SI-LABEL: {{^}}uint_to_fp_f64_i32
-; SI: v_cvt_f64_u32_e32
-; SI: s_endpgm
-define void @uint_to_fp_f64_i32(double addrspace(1)* %out, i32 %in) {
-  %cast = uitofp i32 %in to double
-  store double %cast, double addrspace(1)* %out, align 8
-  ret void
-}
-
-; SI-LABEL: {{^}}uint_to_fp_i1_f64:
-; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @uint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
-  %cmp = icmp eq i32 %in, 0
-  %fp = uitofp i1 %cmp to double
-  store double %fp, double addrspace(1)* %out, align 4
-  ret void
-}
-
-; SI-LABEL: {{^}}uint_to_fp_i1_f64_load:
-; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, 1
-; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
-; SI: buffer_store_dwordx2 [[RESULT]]
-; SI: s_endpgm
-define void @uint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
-  %fp = uitofp i1 %in to double
-  store double %fp, double addrspace(1)* %out, align 8
-  ret void
-}
-
 ; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64
 ; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; SI-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]]
-; SI-DAG: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]]
+; SI: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]]
 ; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
+; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]]
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
 define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
@@ -53,23 +18,81 @@ define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)
   ret void
 }
 
-; SI-LABEL: {{^}}s_uint_to_fp_f64_i64
-define void @s_uint_to_fp_f64_i64(double addrspace(1)* %out, i64 %in) {
+; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64
+define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
   %cast = uitofp i64 %in to double
   store double %cast, double addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: {{^}}s_uint_to_fp_v2f64_v2i64
-define void @s_uint_to_fp_v2f64_v2i64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
+; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64
+define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
   %cast = uitofp <2 x i64> %in to <2 x double>
   store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: {{^}}s_uint_to_fp_v4f64_v4i64
-define void @s_uint_to_fp_v4f64_v4i64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
+; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64
+define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
   %cast = uitofp <4 x i64> %in to <4 x double>
   store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
   ret void
 }
+
+; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64
+; SI: v_cvt_f64_u32_e32
+; SI: s_endpgm
+define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+  %cast = uitofp i32 %in to double
+  store double %cast, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_v2i32_to_v2f64
+; SI: v_cvt_f64_u32_e32
+; SI: v_cvt_f64_u32_e32
+; SI: s_endpgm
+define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) {
+  %cast = uitofp <2 x i32> %in to <2 x double>
+  store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_v4i32_to_v4f64
+; SI: v_cvt_f64_u32_e32
+; SI: v_cvt_f64_u32_e32
+; SI: v_cvt_f64_u32_e32
+; SI: v_cvt_f64_u32_e32
+; SI: s_endpgm
+define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) {
+  %cast = uitofp <4 x i32> %in to <4 x double>
+  store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FIXME: select on 0, 0
+; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
+  %cmp = icmp eq i32 %in, 0
+  %fp = uitofp i1 %cmp to double
+  store double %fp, double addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}uint_to_fp_i1_to_f64_load:
+; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, 1
+; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) {
+  %fp = uitofp i1 %in to double
+  store double %fp, double addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index f58f10b..1c8a175 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll
@@ -1,20 +1,32 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; FUNC-LABEL: {{^}}uint_to_fp_v2i32:
+; FUNC-LABEL: {{^}}uint_to_fp_i32_to_f32:
+; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+
+; SI: v_cvt_f32_u32_e32
+; SI: s_endpgm
+define void @uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) {
+  %result = uitofp i32 %in to float
+  store float %result, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}uint_to_fp_v2i32_to_v2f32:
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
 
 ; SI: v_cvt_f32_u32_e32
 ; SI: v_cvt_f32_u32_e32
 ; SI: s_endpgm
-define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+define void @uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
   %result = uitofp <2 x i32> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}uint_to_fp_v4i32:
+; FUNC-LABEL: {{^}}uint_to_fp_v4i32_to_v4f32:
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -25,45 +37,45 @@ define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
 ; SI: v_cvt_f32_u32_e32
 ; SI: v_cvt_f32_u32_e32
 ; SI: s_endpgm
-define void @uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define void @uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %value = load <4 x i32> addrspace(1) * %in
   %result = uitofp <4 x i32> %value to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}uint_to_fp_i64_f32:
+; FUNC-LABEL: {{^}}uint_to_fp_i64_to_f32:
 ; R600: UINT_TO_FLT
 ; R600: UINT_TO_FLT
 ; R600: MULADD_IEEE
 ; SI: v_cvt_f32_u32_e32
 ; SI: v_cvt_f32_u32_e32
-; SI: v_mad_f32
+; SI: v_madmk_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000
 ; SI: s_endpgm
-define void @uint_to_fp_i64_f32(float addrspace(1)* %out, i64 %in) {
+define void @uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) {
 entry:
   %0 = uitofp i64 %in to float
   store float %0, float addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}uint_to_fp_i1_f32:
+; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32:
 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @uint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) {
+define void @uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to float
   store float %fp, float addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: {{^}}uint_to_fp_i1_f32_load:
+; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32_load:
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) {
+define void @uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) {
   %fp = uitofp i1 %in to float
   store float %fp, float addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll
index f8737e6..665dc37 100644
--- a/test/CodeGen/R600/unaligned-load-store.ll
+++ b/test/CodeGen/R600/unaligned-load-store.ll
@@ -1,37 +1,179 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
-; FIXME: This is probably wrong. This probably needs to expand to 8-bit reads and writes.
-; SI-LABEL: {{^}}unaligned_load_store_i32:
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_write_b32
+; SI-LABEL: {{^}}unaligned_load_store_i16_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_write_b8
 ; SI: s_endpgm
-define void @unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
+define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind {
+  %v = load i16 addrspace(3)* %p, align 1
+  store i16 %v, i16 addrspace(3)* %r, align 1
+  ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i16_global:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: s_endpgm
+define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind {
+  %v = load i16 addrspace(1)* %p, align 1
+  store i16 %v, i16 addrspace(1)* %r, align 1
+  ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i32_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: s_endpgm
+define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
   %v = load i32 addrspace(3)* %p, align 1
   store i32 %v, i32 addrspace(3)* %r, align 1
   ret void
 }
 
-; SI-LABEL: {{^}}unaligned_load_store_v4i32:
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_write_b32
-; SI: ds_write_b32
-; SI: ds_write_b32
-; SI: ds_write_b32
+; SI-LABEL: {{^}}unaligned_load_store_i32_global:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind {
+  %v = load i32 addrspace(1)* %p, align 1
+  store i32 %v, i32 addrspace(1)* %r, align 1
+  ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i64_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
 ; SI: s_endpgm
-define void @unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
+define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
+  %v = load i64 addrspace(3)* %p, align 1
+  store i64 %v, i64 addrspace(3)* %r, align 1
+  ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i64_global:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
+  %v = load i64 addrspace(1)* %p, align 1
+  store i64 %v, i64 addrspace(1)* %r, align 1
+  ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_v4i32_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: s_endpgm
+define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
   %v = load <4 x i32> addrspace(3)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
   ret void
 }
 
+; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded.
+; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
+  %v = load <4 x i32> addrspace(1)* %p, align 1
+  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
+  ret void
+}
+
 ; SI-LABEL: {{^}}load_lds_i64_align_4:
 ; SI: ds_read2_b32
 ; SI: s_endpgm
@@ -64,12 +206,23 @@ define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture
   ret void
 }
 
-; FIXME: Need to fix this case.
-; define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
-;   %val = load i64 addrspace(3)* %in, align 1
-;   store i64 %val, i64 addrspace(1)* %out, align 8
-;   ret void
-; }
+; SI-LABEL: {{^}}load_lds_i64_align_1:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+
+define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+  %val = load i64 addrspace(3)* %in, align 1
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
 
 ; SI-LABEL: {{^}}store_lds_i64_align_4:
 ; SI: ds_write2_b32
diff --git a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
index ff01a1e..c615f0b 100644
--- a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
+++ b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
@@ -1,6 +1,7 @@
 ; REQUIRES: asserts
 ; XFAIL: *
-; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
 ; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
 
 ; SI hits an assertion at -O0, evergreen hits a not implemented unreachable.
diff --git a/test/CodeGen/R600/urecip.ll b/test/CodeGen/R600/urecip.ll
index 4d953b5..daacc77 100644
--- a/test/CodeGen/R600/urecip.ll
+++ b/test/CodeGen/R600/urecip.ll
@@ -1,4 +1,5 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK: v_rcp_iflag_f32_e32
 
diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll
index 914f5d0..aa2a3eb 100644
--- a/test/CodeGen/R600/urem.ll
+++ b/test/CodeGen/R600/urem.ll
@@ -1,34 +1,94 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-;The code generated by urem is long and complex and may frequently change.
-;The goal of this test is to make sure the ISel doesn't fail when it gets
-;a v2i32/v4i32 urem
+; The code generated by urem is long and complex and may frequently
+; change.  The goal of this test is to make sure the ISel doesn't fail
+; when it gets a v2i32/v4i32 urem
 
-;EG-CHECK: {{^}}test2:
-;EG-CHECK: CF_END
-;SI-CHECK: {{^}}test2:
-;SI-CHECK: s_endpgm
+; FUNC-LABEL: {{^}}test_urem_i32:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %a = load i32 addrspace(1)* %in
+  %b = load i32 addrspace(1)* %b_ptr
+  %result = urem i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_urem_i32_7:
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925
+; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]]
+; SI: v_subrev_i32
+; SI: v_mul_lo_i32
+; SI: v_sub_i32
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %num = load i32 addrspace(1) * %in
+  %result = urem i32 %num, 7
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
 
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+; FUNC-LABEL: {{^}}test_urem_v2i32:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %a = load <2 x i32> addrspace(1)* %in
+  %b = load <2 x i32> addrspace(1)* %b_ptr
   %result = urem <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-;EG-CHECK: {{^}}test4:
-;EG-CHECK: CF_END
-;SI-CHECK: {{^}}test4:
-;SI-CHECK: s_endpgm
-
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; FUNC-LABEL: {{^}}test_urem_v4i32:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %a = load <4 x i32> addrspace(1)* %in
+  %b = load <4 x i32> addrspace(1)* %b_ptr
   %result = urem <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}test_urem_i64:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
+  %a = load i64 addrspace(1)* %in
+  %b = load i64 addrspace(1)* %b_ptr
+  %result = urem i64 %a, %b
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_urem_v2i64:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
+  %a = load <2 x i64> addrspace(1)* %in
+  %b = load <2 x i64> addrspace(1)* %b_ptr
+  %result = urem <2 x i64> %a, %b
+  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_urem_v4i64:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
+  %a = load <4 x i64> addrspace(1)* %in
+  %b = load <4 x i64> addrspace(1)* %b_ptr
+  %result = urem <4 x i64> %a, %b
+  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/use-sgpr-multiple-times.ll b/test/CodeGen/R600/use-sgpr-multiple-times.ll
index aa94a0e..f26f300 100644
--- a/test/CodeGen/R600/use-sgpr-multiple-times.ll
+++ b/test/CodeGen/R600/use-sgpr-multiple-times.ll
@@ -1,80 +1,87 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 declare float @llvm.fma.f32(float, float, float) #1
 declare float @llvm.fmuladd.f32(float, float, float) #1
 declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1
 
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_binop:
-; SI: s_load_dword [[SGPR:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN-LABEL: {{^}}test_sgpr_use_twice_binop:
+; GCN: s_load_dword [[SGPR:s[0-9]+]],
+; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
   %dbl = fadd float %a, %a
   store float %dbl, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_three_ternary_op:
-; SI: s_load_dword [[SGPR:s[0-9]+]],
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN-LABEL: {{^}}test_sgpr_use_three_ternary_op:
+; GCN: s_load_dword [[SGPR:s[0-9]+]],
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
 ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
-; SI: buffer_store_dword [[RESULT]]
+; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
 ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
-; SI: buffer_store_dword [[RESULT]]
+; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
 ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
-; SI: buffer_store_dword [[RESULT]]
+; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm:
-; SI: s_load_dword [[SGPR:s[0-9]+]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
-; SI: buffer_store_dword [[RESULT]]
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm:
+; GCN: s_load_dword [[SGPR:s[0-9]+]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
-; SI: s_load_dword [[SGPR:s[0-9]+]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
+; GCN: s_load_dword [[SGPR:s[0-9]+]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
@@ -82,10 +89,10 @@ define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, fl
 }
 
 ; Don't use fma since fma c, x, y is canonicalized to fma x, c, y
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a:
-; SI: s_load_dword [[SGPR:s[0-9]+]]
-; SI: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a:
+; GCN: s_load_dword [[SGPR:s[0-9]+]]
+; GCN: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 {
   %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1
   store i32 %fma, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/usubo.ll b/test/CodeGen/R600/usubo.ll
index abc5bd2..be1e666 100644
--- a/test/CodeGen/R600/usubo.ll
+++ b/test/CodeGen/R600/usubo.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
 
 declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
@@ -27,7 +28,7 @@ define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 }
 
 ; FUNC-LABEL: {{^}}v_usubo_i32:
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
 define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32 addrspace(1)* %aptr, align 4
   %b = load i32 addrspace(1)* %bptr, align 4
diff --git a/test/CodeGen/R600/v_cndmask.ll b/test/CodeGen/R600/v_cndmask.ll
index a24dcc7..85936ec 100644
--- a/test/CodeGen/R600/v_cndmask.ll
+++ b/test/CodeGen/R600/v_cndmask.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.r600.read.tidig.x() #1
 
diff --git a/test/CodeGen/R600/valu-i1.ll b/test/CodeGen/R600/valu-i1.ll
index 2c209fc..5a3c2ec 100644
--- a/test/CodeGen/R600/valu-i1.ll
+++ b/test/CodeGen/R600/valu-i1.ll
@@ -1,10 +1,13 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s
 
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: @test_if
 ; Make sure the i1 values created by the cfg structurizer pass are
 ; moved using VALU instructions
 ; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
 ; SI: v_mov_b32_e32 v{{[0-9]}}, -1
-define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) {
+define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
 entry:
   switch i32 %a, label %default [
     i32 0, label %case0
@@ -37,3 +40,149 @@ else:
 end:
   ret void
 }
+
+; SI-LABEL: @simple_test_v_if
+; SI: v_cmp_ne_i32_e64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
+; SI: s_and_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+
+; SI: ; BB#1
+; SI: buffer_store_dword
+; SI: s_endpgm
+
+; SI: BB1_2:
+; SI: s_or_b64 exec, exec, [[BR_SREG]]
+; SI: s_endpgm
+define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %is.0 = icmp ne i32 %tid, 0
+  br i1 %is.0, label %store, label %exit
+
+store:
+  %gep = getelementptr i32 addrspace(1)* %dst, i32 %tid
+  store i32 999, i32 addrspace(1)* %gep
+  ret void
+
+exit:
+  ret void
+}
+
+; SI-LABEL: @simple_test_v_loop
+; SI: v_cmp_ne_i32_e64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
+; SI: s_and_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: s_cbranch_execz BB2_2
+
+; SI: ; BB#1:
+; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
+
+; SI: BB2_3:
+; SI: buffer_load_dword
+; SI: buffer_store_dword
+; SI: v_cmp_eq_i32_e32 vcc,
+; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]]
+; SI: s_andn2_b64 exec, exec, [[OR_SREG]]
+; SI: s_cbranch_execnz BB2_3
+
+define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+entry:
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %is.0 = icmp ne i32 %tid, 0
+  %limit = add i32 %tid, 64
+  br i1 %is.0, label %loop, label %exit
+
+loop:
+  %i = phi i32 [%tid, %entry], [%i.inc, %loop]
+  %gep.src = getelementptr i32 addrspace(1)* %src, i32 %i
+  %gep.dst = getelementptr i32 addrspace(1)* %dst, i32 %i
+  %load = load i32 addrspace(1)* %src
+  store i32 %load, i32 addrspace(1)* %gep.dst
+  %i.inc = add nsw i32 %i, 1
+  %cmp = icmp eq i32 %limit, %i.inc
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; SI-LABEL: @multi_vcond_loop
+
+; Load loop limit from buffer
+; Branch to exit if uniformly not taken
+; SI: ; BB#0:
+; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
+; SI: v_cmp_gt_i32_e64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]]
+; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG]], [[OUTER_CMP_SREG]]
+; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
+; SI: s_cbranch_execz BB3_2
+
+; Initialize inner condition to false
+; SI: ; BB#1:
+; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
+
+; Clear exec bits for workitems that load -1s
+; SI: BB3_3:
+; SI: buffer_load_dword [[B:v[0-9]+]]
+; SI: buffer_load_dword [[A:v[0-9]+]]
+; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], [[A]], -1
+; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_1:s\[[0-9]+:[0-9]+\]]], [[B]], -1
+; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
+; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]]
+; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]]
+; SI: s_cbranch_execz BB3_5
+
+; SI: BB#4:
+; SI: buffer_store_dword
+; SI: v_cmp_ge_i64_e32 vcc
+; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
+
+; SI: BB3_5:
+; SI: s_or_b64 exec, exec, [[ORNEG1]]
+; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]]
+; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI: s_cbranch_execnz BB3_3
+
+; SI: BB#6
+; SI: s_or_b64 exec, exec, [[COND_STATE]]
+
+; SI: BB3_2:
+; SI-NOT: [[COND_STATE]]
+; SI: s_endpgm
+
+define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
+bb:
+  %tmp = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tmp4 = sext i32 %tmp to i64
+  %tmp5 = getelementptr inbounds i32 addrspace(1)* %arg3, i64 %tmp4
+  %tmp6 = load i32 addrspace(1)* %tmp5, align 4
+  %tmp7 = icmp sgt i32 %tmp6, 0
+  %tmp8 = sext i32 %tmp6 to i64
+  br i1 %tmp7, label %bb10, label %bb26
+
+bb10:                                             ; preds = %bb, %bb20
+  %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
+  %tmp12 = add nsw i64 %tmp11, %tmp4
+  %tmp13 = getelementptr inbounds i32 addrspace(1)* %arg1, i64 %tmp12
+  %tmp14 = load i32 addrspace(1)* %tmp13, align 4
+  %tmp15 = getelementptr inbounds i32 addrspace(1)* %arg2, i64 %tmp12
+  %tmp16 = load i32 addrspace(1)* %tmp15, align 4
+  %tmp17 = icmp ne i32 %tmp14, -1
+  %tmp18 = icmp ne i32 %tmp16, -1
+  %tmp19 = and i1 %tmp17, %tmp18
+  br i1 %tmp19, label %bb20, label %bb26
+
+bb20:                                             ; preds = %bb10
+  %tmp21 = add nsw i32 %tmp16, %tmp14
+  %tmp22 = getelementptr inbounds i32 addrspace(1)* %arg, i64 %tmp12
+  store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
+  %tmp23 = add nuw nsw i64 %tmp11, 1
+  %tmp24 = icmp slt i64 %tmp23, %tmp8
+  br i1 %tmp24, label %bb10, label %bb26
+
+bb26:                                             ; preds = %bb10, %bb20, %bb
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/R600/vector-alloca.ll b/test/CodeGen/R600/vector-alloca.ll
index 0b457a8..228868a 100644
--- a/test/CodeGen/R600/vector-alloca.ll
+++ b/test/CodeGen/R600/vector-alloca.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}vector_read:
 ; EG: MOV
diff --git a/test/CodeGen/R600/vertex-fetch-encoding.ll b/test/CodeGen/R600/vertex-fetch-encoding.ll
index e24744e..e4d117f 100644
--- a/test/CodeGen/R600/vertex-fetch-encoding.ll
+++ b/test/CodeGen/R600/vertex-fetch-encoding.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI-CHECK %s
-; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM %s
 
-; NI-CHECK: {{^}}vtx_fetch32:
-; NI-CHECK: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
-; CM-CHECK: {{^}}vtx_fetch32:
-; CM-CHECK: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
+; NI: {{^}}vtx_fetch32:
+; NI: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
+; CM: {{^}}vtx_fetch32:
+; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
 
 define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
@@ -13,8 +13,8 @@ entry:
   ret void
 }
 
-; NI-CHECK: {{^}}vtx_fetch128:
-; NI-CHECK: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00
+; NI: {{^}}vtx_fetch128:
+; NI: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00
 ; XXX: Add a case for Cayman when v4i32 stores are supported.
 
 define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
diff --git a/test/CodeGen/R600/vop-shrink.ll b/test/CodeGen/R600/vop-shrink.ll
index e7f0288..d5a46e3 100644
--- a/test/CodeGen/R600/vop-shrink.ll
+++ b/test/CodeGen/R600/vop-shrink.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; Test that we correctly commute a sub instruction
 ; FUNC-LABEL: {{^}}sub_rev:
diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll
index e84b8f7..a6152f7 100644
--- a/test/CodeGen/R600/vselect.ll
+++ b/test/CodeGen/R600/vselect.ll
@@ -1,13 +1,14 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
 
-;EG-CHECK: {{^}}test_select_v2i32:
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: {{^}}test_select_v2i32:
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: {{^}}test_select_v2i32:
-;SI-CHECK: v_cndmask_b32_e64
-;SI-CHECK: v_cndmask_b32_e64
+;SI: {{^}}test_select_v2i32:
+;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e64
 
 define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
 entry:
@@ -19,13 +20,13 @@ entry:
   ret void
 }
 
-;EG-CHECK: {{^}}test_select_v2f32:
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: {{^}}test_select_v2f32:
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: {{^}}test_select_v2f32:
-;SI-CHECK: v_cndmask_b32_e64
-;SI-CHECK: v_cndmask_b32_e64
+;SI: {{^}}test_select_v2f32:
+;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e64
 
 define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
 entry:
@@ -37,17 +38,17 @@ entry:
   ret void
 }
 
-;EG-CHECK: {{^}}test_select_v4i32:
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: {{^}}test_select_v4i32:
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: {{^}}test_select_v4i32:
-;SI-CHECK: v_cndmask_b32_e64
-;SI-CHECK: v_cndmask_b32_e64
-;SI-CHECK: v_cndmask_b32_e64
-;SI-CHECK: v_cndmask_b32_e64
+;SI: {{^}}test_select_v4i32:
+;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e64
 
 define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
 entry:
@@ -59,11 +60,11 @@ entry:
   ret void
 }
 
-;EG-CHECK: {{^}}test_select_v4f32:
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: {{^}}test_select_v4f32:
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
 entry:
diff --git a/test/CodeGen/R600/wait.ll b/test/CodeGen/R600/wait.ll
index 735eabd..43561aa 100644
--- a/test/CodeGen/R600/wait.ll
+++ b/test/CodeGen/R600/wait.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s
 
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: s_load_dwordx4
 ; CHECK: s_load_dwordx4
-; CHECK: s_waitcnt lgkmcnt(0){{$}}
-; CHECK: s_waitcnt vmcnt(0){{$}}
-; CHECK: s_waitcnt expcnt(0) lgkmcnt(0){{$}}
+; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; CHECK: s_endpgm
 define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
 main_body:
   %tmp = getelementptr <16 x i8> addrspace(2)* %arg3, i32 0
@@ -41,5 +41,5 @@ attributes #0 = { "ShaderType"="1" }
 attributes #1 = { noduplicate nounwind }
 attributes #2 = { nounwind readnone }
 
-!0 = metadata !{metadata !1, metadata !1, i64 0, i32 1}
-!1 = metadata !{metadata !"const", null}
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
index 47f65f5..4328e96 100644
--- a/test/CodeGen/R600/work-item-intrinsics.ll
+++ b/test/CodeGen/R600/work-item-intrinsics.ll
@@ -1,14 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 
 ; FUNC-LABEL: {{^}}ngroups_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].X
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @ngroups_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.x() #0
@@ -21,8 +22,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[0].Y
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @ngroups_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.y() #0
@@ -35,8 +37,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[0].Z
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @ngroups_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.z() #0
@@ -49,8 +52,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[0].W
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @global_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.x() #0
@@ -63,8 +67,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[1].X
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @global_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.y() #0
@@ -77,8 +82,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[1].Y
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @global_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.z() #0
@@ -91,8 +97,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[1].Z
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @local_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.x() #0
@@ -105,8 +112,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[1].W
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @local_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.y() #0
@@ -119,8 +127,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[2].X
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @local_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.z() #0
@@ -133,8 +142,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[2].Z
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @get_work_dim (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.AMDGPU.read.workdim() #0
@@ -147,8 +157,8 @@ entry:
 ; kernel arguments, but this may change in the future.
 
 ; FUNC-LABEL: {{^}}tgid_x:
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
-; SI: buffer_store_dword [[VVAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
+; GCN: buffer_store_dword [[VVAL]]
 define void @tgid_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
@@ -157,8 +167,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tgid_y:
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5
-; SI: buffer_store_dword [[VVAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5
+; GCN: buffer_store_dword [[VVAL]]
 define void @tgid_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.y() #0
@@ -167,8 +177,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tgid_z:
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6
-; SI: buffer_store_dword [[VVAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6
+; GCN: buffer_store_dword [[VVAL]]
 define void @tgid_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.z() #0
@@ -177,7 +187,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tidig_x:
-; SI: buffer_store_dword v0
+; GCN: buffer_store_dword v0
 define void @tidig_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
@@ -186,7 +196,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tidig_y:
-; SI: buffer_store_dword v1
+; GCN: buffer_store_dword v1
 define void @tidig_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
@@ -195,7 +205,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tidig_z:
-; SI: buffer_store_dword v2
+; GCN: buffer_store_dword v2
 define void @tidig_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0
diff --git a/test/CodeGen/R600/wrong-transalu-pos-fix.ll b/test/CodeGen/R600/wrong-transalu-pos-fix.ll
index d652d2d..4e77c07 100644
--- a/test/CodeGen/R600/wrong-transalu-pos-fix.ll
+++ b/test/CodeGen/R600/wrong-transalu-pos-fix.ll
@@ -81,6 +81,6 @@ attributes #1 = { nounwind readnone }
 
 !opencl.kernels = !{!0, !1, !2}
 
-!0 = metadata !{null}
-!1 = metadata !{null}
-!2 = metadata !{void (i32 addrspace(1)*)* @fill3d}
+!0 = !{null}
+!1 = !{null}
+!2 = !{void (i32 addrspace(1)*)* @fill3d}
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index fa54e38..1526e28 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll
@@ -1,14 +1,14 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-;EG-CHECK: {{^}}xor_v2i32:
-;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: {{^}}xor_v2i32:
-;SI-CHECK: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; FUNC-LABEL: {{^}}xor_v2i32:
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
+; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
   %a = load <2 x i32> addrspace(1) * %in0
@@ -18,17 +18,16 @@ define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
   ret void
 }
 
-;EG-CHECK: {{^}}xor_v4i32:
-;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: {{^}}xor_v4i32:
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: {{^}}xor_v4i32:
-;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
   %a = load <4 x i32> addrspace(1) * %in0
@@ -38,25 +37,42 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
   ret void
 }
 
-;EG-CHECK: {{^}}xor_i1:
-;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
-
-;SI-CHECK: {{^}}xor_i1:
-;SI-CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; FUNC-LABEL: {{^}}xor_i1:
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
 
+; SI-DAG: v_cmp_ge_f32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, 0
+; SI-DAG: v_cmp_ge_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, 1.0
+; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
 define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
   %a = load float addrspace(1) * %in0
   %b = load float addrspace(1) * %in1
   %acmp = fcmp oge float %a, 0.000000e+00
-  %bcmp = fcmp oge float %b, 0.000000e+00
+  %bcmp = fcmp oge float %b, 1.000000e+00
   %xor = xor i1 %acmp, %bcmp
   %result = select i1 %xor, float %a, float %b
   store float %result, float addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: {{^}}vector_xor_i32:
-; SI-CHECK: v_xor_b32_e32
+; FUNC-LABEL: {{^}}v_xor_i1:
+; SI: buffer_load_ubyte [[B:v[0-9]+]]
+; SI: buffer_load_ubyte [[A:v[0-9]+]]
+; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]]
+; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
+; SI: buffer_store_byte [[RESULT]]
+define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
+  %a = load i1 addrspace(1)* %in0
+  %b = load i1 addrspace(1)* %in1
+  %xor = xor i1 %a, %b
+  store i1 %xor, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}vector_xor_i32:
+; SI: v_xor_b32_e32
 define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32 addrspace(1)* %in0
   %b = load i32 addrspace(1)* %in1
@@ -65,24 +81,24 @@ define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
   ret void
 }
 
-; SI-CHECK-LABEL: {{^}}scalar_xor_i32:
-; SI-CHECK: s_xor_b32
+; FUNC-LABEL: {{^}}scalar_xor_i32:
+; SI: s_xor_b32
 define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %result = xor i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: {{^}}scalar_not_i32:
-; SI-CHECK: s_not_b32
+; FUNC-LABEL: {{^}}scalar_not_i32:
+; SI: s_not_b32
 define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
   %result = xor i32 %a, -1
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: {{^}}vector_not_i32:
-; SI-CHECK: v_not_b32
+; FUNC-LABEL: {{^}}vector_not_i32:
+; SI: v_not_b32
 define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32 addrspace(1)* %in0
   %b = load i32 addrspace(1)* %in1
@@ -91,10 +107,10 @@ define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
   ret void
 }
 
-; SI-CHECK-LABEL: {{^}}vector_xor_i64:
-; SI-CHECK: v_xor_b32_e32
-; SI-CHECK: v_xor_b32_e32
-; SI-CHECK: s_endpgm
+; FUNC-LABEL: {{^}}vector_xor_i64:
+; SI: v_xor_b32_e32
+; SI: v_xor_b32_e32
+; SI: s_endpgm
 define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64 addrspace(1)* %in0
   %b = load i64 addrspace(1)* %in1
@@ -103,26 +119,26 @@ define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64
   ret void
 }
 
-; SI-CHECK-LABEL: {{^}}scalar_xor_i64:
-; SI-CHECK: s_xor_b64
-; SI-CHECK: s_endpgm
+; FUNC-LABEL: {{^}}scalar_xor_i64:
+; SI: s_xor_b64
+; SI: s_endpgm
 define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = xor i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: {{^}}scalar_not_i64:
-; SI-CHECK: s_not_b64
+; FUNC-LABEL: {{^}}scalar_not_i64:
+; SI: s_not_b64
 define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
   %result = xor i64 %a, -1
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: {{^}}vector_not_i64:
-; SI-CHECK: v_not_b32
-; SI-CHECK: v_not_b32
+; FUNC-LABEL: {{^}}vector_not_i64:
+; SI: v_not_b32
+; SI: v_not_b32
 define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64 addrspace(1)* %in0
   %b = load i64 addrspace(1)* %in1
@@ -135,8 +151,8 @@ define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64
 ; Note that in the future the backend may be smart enough to
 ; use an SALU instruction for this.
 
-; SI-CHECK-LABEL: {{^}}xor_cf:
-; SI-CHECK: s_xor_b64
+; FUNC-LABEL: {{^}}xor_cf:
+; SI: s_xor_b64
 define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
 entry:
   %0 = icmp eq i64 %a, 0
diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
index 0fe1f15..033055d 100644
--- a/test/CodeGen/R600/zero_extend.ll
+++ b/test/CodeGen/R600/zero_extend.ll
@@ -1,14 +1,15 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
 
-; R600-CHECK: {{^}}test:
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; R600: {{^}}test:
+; R600: MEM_RAT_CACHELESS STORE_RAW
+; R600: MEM_RAT_CACHELESS STORE_RAW
 
-; SI-CHECK: {{^}}test:
-; SI-CHECK: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}}
-; SI-CHECK: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
-; SI-CHECK: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
+; SI: {{^}}test:
+; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}}
+; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
+; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
 define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = mul i32 %a, %b
@@ -18,8 +19,8 @@ entry:
   ret void
 }
 
-; SI-CHECK-LABEL: {{^}}testi1toi32:
-; SI-CHECK: v_cndmask_b32
+; SI-LABEL: {{^}}testi1toi32:
+; SI: v_cndmask_b32
 define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp eq i32 %a, %b
@@ -28,10 +29,10 @@ entry:
   ret void
 }
 
-; SI-CHECK-LABEL: {{^}}zext_i1_to_i64:
-; SI-CHECK: v_cmp_eq_i32
-; SI-CHECK: v_cndmask_b32
-; SI-CHECK: s_mov_b32 s{{[0-9]+}}, 0
+; SI-LABEL: {{^}}zext_i1_to_i64:
+; SI: s_mov_b32 s{{[0-9]+}}, 0
+; SI: v_cmp_eq_i32
+; SI: v_cndmask_b32
 define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %ext = zext i1 %cmp to i64
diff --git a/test/CodeGen/SPARC/2008-10-10-InlineAsmMemoryOperand.ll b/test/CodeGen/SPARC/2008-10-10-InlineAsmMemoryOperand.ll
index e8315f1..373a196 100644
--- a/test/CodeGen/SPARC/2008-10-10-InlineAsmMemoryOperand.ll
+++ b/test/CodeGen/SPARC/2008-10-10-InlineAsmMemoryOperand.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=sparc
+; RUN: llc < %s -march=sparc -no-integrated-as
 ; PR 1557
 
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f128:128:128"
diff --git a/test/CodeGen/SPARC/float.ll b/test/CodeGen/SPARC/float.ll
index 6636704..d7a79cb 100644
--- a/test/CodeGen/SPARC/float.ll
+++ b/test/CodeGen/SPARC/float.ll
@@ -154,11 +154,11 @@ entry:
 ; SPARC64:          fitod
 ; SPARC64:          fdtoi
 
-define void @test_itod_dtoi(i32 %a, i32* %ptr0, double* %ptr1) {
+define void @test_itod_dtoi(i32 %a, double %b, i32* %ptr0, double* %ptr1) {
 entry:
   %0 = sitofp i32 %a to double
   store double %0, double* %ptr1, align 8
-  %1 = fptosi double %0 to i32
+  %1 = fptosi double %b to i32
   store i32 %1, i32* %ptr0, align 8
   ret void
 }
diff --git a/test/CodeGen/SPARC/fp128.ll b/test/CodeGen/SPARC/fp128.ll
index abd89bf..a06112a 100644
--- a/test/CodeGen/SPARC/fp128.ll
+++ b/test/CodeGen/SPARC/fp128.ll
@@ -182,26 +182,28 @@ entry:
 }
 
 ; HARD-LABEL:    test_itoq_qtoi
-; HARD:          call _Q_lltoq
-; HARD:          call _Q_qtoll
-; HARD:          fitoq
-; HARD:          fqtoi
+; HARD-DAG:      call _Q_lltoq
+; HARD-DAG:      call _Q_qtoll
+; HARD-DAG:      fitoq
+; HARD-DAG:      fqtoi
 
 ; SOFT-LABEL:    test_itoq_qtoi
-; SOFT:          call _Q_lltoq
-; SOFT:          call _Q_qtoll
-; SOFT:          call _Q_itoq
-; SOFT:          call _Q_qtoi
+; SOFT-DAG:      call _Q_lltoq
+; SOFT-DAG:      call _Q_qtoll
+; SOFT-DAG:      call _Q_itoq
+; SOFT-DAG:      call _Q_qtoi
 
-define void @test_itoq_qtoi(i64 %a, i32 %b, i64* %ptr0, fp128* %ptr1) {
+define void @test_itoq_qtoi(i64 %a, i32 %b, fp128* %c, fp128* %d, i64* %ptr0, fp128* %ptr1) {
 entry:
   %0 = sitofp i64 %a to fp128
   store  fp128 %0, fp128* %ptr1, align 8
-  %1 = fptosi fp128 %0 to i64
+  %cval = load fp128* %c, align 8
+  %1 = fptosi fp128 %cval to i64
   store  i64 %1, i64* %ptr0, align 8
   %2 = sitofp i32 %b to fp128
   store  fp128 %2, fp128* %ptr1, align 8
-  %3 = fptosi fp128 %2 to i32
+  %dval = load fp128* %d, align 8
+  %3 = fptosi fp128 %dval to i32
   %4 = bitcast i64* %ptr0 to i32*
   store  i32 %3, i32* %4, align 8
   ret void
@@ -219,15 +221,17 @@ entry:
 ; SOFT-DAG:      call _Q_utoq
 ; SOFT-DAG:      call _Q_qtou
 
-define void @test_utoq_qtou(i64 %a, i32 %b, i64* %ptr0, fp128* %ptr1) {
+define void @test_utoq_qtou(i64 %a, i32 %b, fp128* %c, fp128* %d, i64* %ptr0, fp128* %ptr1) {
 entry:
   %0 = uitofp i64 %a to fp128
   store  fp128 %0, fp128* %ptr1, align 8
-  %1 = fptoui fp128 %0 to i64
+  %cval = load fp128* %c, align 8
+  %1 = fptoui fp128 %cval to i64
   store  i64 %1, i64* %ptr0, align 8
   %2 = uitofp i32 %b to fp128
   store  fp128 %2, fp128* %ptr1, align 8
-  %3 = fptoui fp128 %2 to i32
+  %dval = load fp128* %d, align 8
+  %3 = fptoui fp128 %dval to i32
   %4 = bitcast i64* %ptr0 to i32*
   store  i32 %3, i32* %4, align 8
   ret void
diff --git a/test/CodeGen/SPARC/inlineasm.ll b/test/CodeGen/SPARC/inlineasm.ll
index 2650533..526cde8 100644
--- a/test/CodeGen/SPARC/inlineasm.ll
+++ b/test/CodeGen/SPARC/inlineasm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=sparc <%s | FileCheck %s
+; RUN: llc -march=sparc -no-integrated-as <%s | FileCheck %s
 
 ; CHECK-LABEL: test_constraint_r
 ; CHECK:       add %o1, %o0, %o0
diff --git a/test/CodeGen/SPARC/mult-alt-generic-sparc.ll b/test/CodeGen/SPARC/mult-alt-generic-sparc.ll
index 6013b17..6a67616 100644
--- a/test/CodeGen/SPARC/mult-alt-generic-sparc.ll
+++ b/test/CodeGen/SPARC/mult-alt-generic-sparc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=sparc
+; RUN: llc < %s -march=sparc -no-integrated-as
 ; ModuleID = 'mult-alt-generic.c'
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32"
 target triple = "sparc"
diff --git a/test/CodeGen/SPARC/setjmp.ll b/test/CodeGen/SPARC/setjmp.ll
index a31cd70..17afb36 100644
--- a/test/CodeGen/SPARC/setjmp.ll
+++ b/test/CodeGen/SPARC/setjmp.ll
@@ -65,8 +65,8 @@ attributes #0 = { nounwind }
 attributes #1 = { noreturn nounwind }
 attributes #2 = { nounwind returns_twice }
 
-!0 = metadata !{metadata !"alias set 6: struct.jmpbuf_env*", metadata !1}
-!1 = metadata !{metadata !1}
-!2 = metadata !{metadata !"alias set 3: int", metadata !1}
-!3 = metadata !{metadata !0, metadata !0, i64 0}
-!4 = metadata !{metadata !2, metadata !2, i64 0}
+!0 = !{!"alias set 6: struct.jmpbuf_env*", !1}
+!1 = !{!1}
+!2 = !{!"alias set 3: int", !1}
+!3 = !{!0, !0, i64 0}
+!4 = !{!2, !2, i64 0}
diff --git a/test/CodeGen/SystemZ/alias-01.ll b/test/CodeGen/SystemZ/alias-01.ll
index 8839aad..89a7318 100644
--- a/test/CodeGen/SystemZ/alias-01.ll
+++ b/test/CodeGen/SystemZ/alias-01.ll
@@ -14,6 +14,6 @@ define void @f1(<16 x i32> *%src1, <16 x float> *%dest) {
   ret void
 }
 
-!0 = metadata !{ metadata !"root" }
-!1 = metadata !{ metadata !"set1", metadata !0 }
-!2 = metadata !{ metadata !"set2", metadata !0 }
+!0 = !{ !"root" }
+!1 = !{ !"set1", !0 }
+!2 = !{ !"set2", !0 }
diff --git a/test/CodeGen/SystemZ/and-08.ll b/test/CodeGen/SystemZ/and-08.ll
index 7ded115..a328c4e 100644
--- a/test/CodeGen/SystemZ/and-08.ll
+++ b/test/CodeGen/SystemZ/and-08.ll
@@ -371,8 +371,8 @@ define void @f26(i64 *%ptr1, i64 *%ptr2) {
   ret void
 }
 
-!0 = metadata !{ metadata !"root" }
-!1 = metadata !{ metadata !"set1", metadata !0 }
-!2 = metadata !{ metadata !"set2", metadata !0 }
-!3 = metadata !{ metadata !1, metadata !1, i64 0}
-!4 = metadata !{ metadata !2, metadata !2, i64 0}
+!0 = !{ !"root" }
+!1 = !{ !"set1", !0 }
+!2 = !{ !"set2", !0 }
+!3 = !{ !1, !1, i64 0}
+!4 = !{ !2, !2, i64 0}
diff --git a/test/CodeGen/SystemZ/asm-01.ll b/test/CodeGen/SystemZ/asm-01.ll
index 801378c..3dbc8ac 100644
--- a/test/CodeGen/SystemZ/asm-01.ll
+++ b/test/CodeGen/SystemZ/asm-01.ll
@@ -1,7 +1,7 @@
 ; Test the "Q" asm constraint, which accepts addresses that have a base
 ; and a 12-bit displacement.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 ; Check the lowest range.
 define void @f1(i64 %base) {
diff --git a/test/CodeGen/SystemZ/asm-02.ll b/test/CodeGen/SystemZ/asm-02.ll
index ad1e35b..458bfeb 100644
--- a/test/CodeGen/SystemZ/asm-02.ll
+++ b/test/CodeGen/SystemZ/asm-02.ll
@@ -1,7 +1,7 @@
 ; Test the "R" asm constraint, which accepts addresses that have a base,
 ; an index and a 12-bit displacement.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 ; Check the lowest range.
 define void @f1(i64 %base) {
diff --git a/test/CodeGen/SystemZ/asm-03.ll b/test/CodeGen/SystemZ/asm-03.ll
index fa3e1a7..2e60ad6 100644
--- a/test/CodeGen/SystemZ/asm-03.ll
+++ b/test/CodeGen/SystemZ/asm-03.ll
@@ -1,7 +1,7 @@
 ; Test the "S" asm constraint, which accepts addresses that have a base
 ; and a 20-bit displacement.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 define void @f1(i64 %base) {
 ; CHECK-LABEL: f1:
diff --git a/test/CodeGen/SystemZ/asm-04.ll b/test/CodeGen/SystemZ/asm-04.ll
index af7ea9f..b212253 100644
--- a/test/CodeGen/SystemZ/asm-04.ll
+++ b/test/CodeGen/SystemZ/asm-04.ll
@@ -1,7 +1,7 @@
 ; Test the "T" asm constraint, which accepts addresses that have a base,
 ; an index and a 20-bit displacement.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 define void @f1(i64 %base) {
 ; CHECK-LABEL: f1:
diff --git a/test/CodeGen/SystemZ/asm-05.ll b/test/CodeGen/SystemZ/asm-05.ll
index e18cb75..db99b10 100644
--- a/test/CodeGen/SystemZ/asm-05.ll
+++ b/test/CodeGen/SystemZ/asm-05.ll
@@ -1,6 +1,6 @@
 ; Test the "m" asm constraint, which is equivalent to "T".
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 define void @f1(i64 %base) {
 ; CHECK-LABEL: f1:
diff --git a/test/CodeGen/SystemZ/asm-06.ll b/test/CodeGen/SystemZ/asm-06.ll
index f9848a2..73c938f 100644
--- a/test/CodeGen/SystemZ/asm-06.ll
+++ b/test/CodeGen/SystemZ/asm-06.ll
@@ -1,6 +1,6 @@
 ; Test the GPR constraint "a", which forbids %r0.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 define i64 @f1() {
 ; CHECK-LABEL: f1:
diff --git a/test/CodeGen/SystemZ/asm-07.ll b/test/CodeGen/SystemZ/asm-07.ll
index bf63150..42b89e6 100644
--- a/test/CodeGen/SystemZ/asm-07.ll
+++ b/test/CodeGen/SystemZ/asm-07.ll
@@ -1,6 +1,6 @@
 ; Test the GPR constraint "r".
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 define i64 @f1() {
 ; CHECK-LABEL: f1:
diff --git a/test/CodeGen/SystemZ/asm-08.ll b/test/CodeGen/SystemZ/asm-08.ll
index 1662337..4185108 100644
--- a/test/CodeGen/SystemZ/asm-08.ll
+++ b/test/CodeGen/SystemZ/asm-08.ll
@@ -1,6 +1,6 @@
 ; Test the GPR constraint "d", which is equivalent to "r".
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 define i64 @f1() {
 ; CHECK-LABEL: f1:
diff --git a/test/CodeGen/SystemZ/asm-09.ll b/test/CodeGen/SystemZ/asm-09.ll
index 5cd7efb..b9d86cf 100644
--- a/test/CodeGen/SystemZ/asm-09.ll
+++ b/test/CodeGen/SystemZ/asm-09.ll
@@ -1,6 +1,6 @@
 ; Test matching operands with the GPR constraint "r".
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 define void @f1(i32 *%dst) {
 ; CHECK-LABEL: f1:
diff --git a/test/CodeGen/SystemZ/asm-10.ll b/test/CodeGen/SystemZ/asm-10.ll
index 0eccc19..b71db83 100644
--- a/test/CodeGen/SystemZ/asm-10.ll
+++ b/test/CodeGen/SystemZ/asm-10.ll
@@ -1,6 +1,6 @@
 ; Test the FPR constraint "f".
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 define float @f1() {
 ; CHECK-LABEL: f1:
diff --git a/test/CodeGen/SystemZ/asm-11.ll b/test/CodeGen/SystemZ/asm-11.ll
index 8aeb784..8a4cdbb 100644
--- a/test/CodeGen/SystemZ/asm-11.ll
+++ b/test/CodeGen/SystemZ/asm-11.ll
@@ -1,6 +1,6 @@
 ; Test the "I" constraint (8-bit unsigned constants).
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 ; Test 1 below the first valid value.
 define i32 @f1() {
diff --git a/test/CodeGen/SystemZ/asm-12.ll b/test/CodeGen/SystemZ/asm-12.ll
index feecbac..115092c 100644
--- a/test/CodeGen/SystemZ/asm-12.ll
+++ b/test/CodeGen/SystemZ/asm-12.ll
@@ -1,6 +1,6 @@
 ; Test the "J" constraint (12-bit unsigned constants).
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 ; Test 1 below the first valid value.
 define i32 @f1() {
diff --git a/test/CodeGen/SystemZ/asm-13.ll b/test/CodeGen/SystemZ/asm-13.ll
index b881700..83454ea 100644
--- a/test/CodeGen/SystemZ/asm-13.ll
+++ b/test/CodeGen/SystemZ/asm-13.ll
@@ -1,6 +1,6 @@
 ; Test the "K" constraint (16-bit signed constants).
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 ; Test 1 below the first valid value.
 define i32 @f1() {
diff --git a/test/CodeGen/SystemZ/asm-14.ll b/test/CodeGen/SystemZ/asm-14.ll
index bcd8b1e..41b8f40 100644
--- a/test/CodeGen/SystemZ/asm-14.ll
+++ b/test/CodeGen/SystemZ/asm-14.ll
@@ -1,6 +1,6 @@
 ; Test the "L" constraint (20-bit signed constants).
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 ; Test 1 below the first valid value.
 define i32 @f1() {
diff --git a/test/CodeGen/SystemZ/asm-15.ll b/test/CodeGen/SystemZ/asm-15.ll
index 886ee0e..8361b68 100644
--- a/test/CodeGen/SystemZ/asm-15.ll
+++ b/test/CodeGen/SystemZ/asm-15.ll
@@ -1,6 +1,6 @@
 ; Test the "M" constraint (0x7fffffff)
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 ; Test 1 below the valid value.
 define i32 @f1() {
diff --git a/test/CodeGen/SystemZ/asm-16.ll b/test/CodeGen/SystemZ/asm-16.ll
index 886ee0e..8361b68 100644
--- a/test/CodeGen/SystemZ/asm-16.ll
+++ b/test/CodeGen/SystemZ/asm-16.ll
@@ -1,6 +1,6 @@
 ; Test the "M" constraint (0x7fffffff)
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 ; Test 1 below the valid value.
 define i32 @f1() {
diff --git a/test/CodeGen/SystemZ/asm-17.ll b/test/CodeGen/SystemZ/asm-17.ll
index 7bc9da3..533b5e9 100644
--- a/test/CodeGen/SystemZ/asm-17.ll
+++ b/test/CodeGen/SystemZ/asm-17.ll
@@ -1,6 +1,6 @@
 ; Test explicit register names.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 ; Test i32 GPRs.
 define i32 @f1() {
diff --git a/test/CodeGen/SystemZ/asm-18.ll b/test/CodeGen/SystemZ/asm-18.ll
index d60654b..71e145a 100644
--- a/test/CodeGen/SystemZ/asm-18.ll
+++ b/test/CodeGen/SystemZ/asm-18.ll
@@ -1,7 +1,7 @@
 ; Test high-word operations, using "h" constraints to force a high
 ; register and "r" constraints to force a low register.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 -no-integrated-as | FileCheck %s
 
 ; Test loads and stores involving mixtures of high and low registers.
 define void @f1(i32 *%ptr1, i32 *%ptr2) {
diff --git a/test/CodeGen/SystemZ/fp-cmp-04.ll b/test/CodeGen/SystemZ/fp-cmp-04.ll
index 781a3be..1637ccb 100644
--- a/test/CodeGen/SystemZ/fp-cmp-04.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-04.ll
@@ -1,7 +1,7 @@
 ; Test that floating-point compares are omitted if CC already has the
 ; right value.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -no-integrated-as | FileCheck %s
 
 declare float @llvm.fabs.f32(float %f)
 
diff --git a/test/CodeGen/SystemZ/int-cmp-44.ll b/test/CodeGen/SystemZ/int-cmp-44.ll
index f065e64..30c1c4f 100644
--- a/test/CodeGen/SystemZ/int-cmp-44.ll
+++ b/test/CodeGen/SystemZ/int-cmp-44.ll
@@ -1,7 +1,7 @@
 ; Test that compares are omitted if CC already has the right value
 ; (z10 version).
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -no-integrated-as | FileCheck %s
 
 declare void @foo()
 
diff --git a/test/CodeGen/SystemZ/int-cmp-45.ll b/test/CodeGen/SystemZ/int-cmp-45.ll
index 9c9c49c..c9affa6 100644
--- a/test/CodeGen/SystemZ/int-cmp-45.ll
+++ b/test/CodeGen/SystemZ/int-cmp-45.ll
@@ -1,7 +1,7 @@
 ; Test that compares are omitted if CC already has the right value
 ; (z196 version).
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 -no-integrated-as | FileCheck %s
 
 ; Addition provides enough for equality comparisons with zero.  First teest
 ; the EQ case with LOC.
diff --git a/test/CodeGen/SystemZ/memchr-02.ll b/test/CodeGen/SystemZ/memchr-02.ll
index 982b396..8986627 100644
--- a/test/CodeGen/SystemZ/memchr-02.ll
+++ b/test/CodeGen/SystemZ/memchr-02.ll
@@ -1,6 +1,6 @@
 ; Test memchr using SRST, with the correct prototype.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
 declare i8 *@memchr(i8 *%src, i32 %char, i64 %len)
 
diff --git a/test/CodeGen/SystemZ/memcpy-02.ll b/test/CodeGen/SystemZ/memcpy-02.ll
index 2b01091..776cfee 100644
--- a/test/CodeGen/SystemZ/memcpy-02.ll
+++ b/test/CodeGen/SystemZ/memcpy-02.ll
@@ -385,8 +385,8 @@ define void @f32(i64 *%ptr1, i64 *%ptr2) {
   ret void
 }
 
-!0 = metadata !{ metadata !"root" }
-!1 = metadata !{ metadata !3, metadata !3, i64 0 }
-!2 = metadata !{ metadata !4, metadata !4, i64 0 }
-!3 = metadata !{ metadata !"set1", metadata !0 }
-!4 = metadata !{ metadata !"set2", metadata !0 }
+!0 = !{ !"root" }
+!1 = !{ !3, !3, i64 0 }
+!2 = !{ !4, !4, i64 0 }
+!3 = !{ !"set1", !0 }
+!4 = !{ !"set2", !0 }
diff --git a/test/CodeGen/SystemZ/tls-01.ll b/test/CodeGen/SystemZ/tls-01.ll
index 16bc8f6..da7176c 100644
--- a/test/CodeGen/SystemZ/tls-01.ll
+++ b/test/CodeGen/SystemZ/tls-01.ll
@@ -1,7 +1,7 @@
-; Test initial-exec TLS accesses.
+; Test local-exec TLS accesses.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-MAIN
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-CP
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-MAIN
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-CP
 
 @x = thread_local global i32 0
 
diff --git a/test/CodeGen/SystemZ/tls-02.ll b/test/CodeGen/SystemZ/tls-02.ll
new file mode 100644
index 0000000..15918d0
--- /dev/null
+++ b/test/CodeGen/SystemZ/tls-02.ll
@@ -0,0 +1,18 @@
+; Test initial-exec TLS accesses.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-MAIN
+
+@x = thread_local(initialexec) global i32 0
+
+; The offset must be loaded from the GOT.  This TLS access model does
+; not use literal pool constants.
+define i32 *@foo() {
+; CHECK-MAIN-LABEL: foo:
+; CHECK-MAIN: ear [[HIGH:%r[0-5]]], %a0
+; CHECK-MAIN: sllg %r2, [[HIGH]], 32
+; CHECK-MAIN: ear %r2, %a1
+; CHECK-MAIN: larl %r1, x@INDNTPOFF
+; CHECK-MAIN: ag %r2, 0(%r1)
+; CHECK-MAIN: br %r14
+  ret i32 *@x
+}
diff --git a/test/CodeGen/SystemZ/tls-03.ll b/test/CodeGen/SystemZ/tls-03.ll
new file mode 100644
index 0000000..c9f7bd6
--- /dev/null
+++ b/test/CodeGen/SystemZ/tls-03.ll
@@ -0,0 +1,23 @@
+; Test general-dynamic TLS accesses.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-MAIN
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-CP
+
+@x = thread_local global i32 0
+
+; Call __tls_get_offset to retrieve the symbol's TLS offset.
+define i32 *@foo() {
+; CHECK-CP: .LCP{{.*}}:
+; CHECK-CP: .quad x@TLSGD
+;
+; CHECK-MAIN-LABEL: foo:
+; CHECK-MAIN-DAG: larl %r12, _GLOBAL_OFFSET_TABLE_
+; CHECK-MAIN-DAG: lgrl %r2, .LCP{{.*}}
+; CHECK-MAIN: brasl %r14, __tls_get_offset@PLT:tls_gdcall:x
+; CHECK-MAIN: ear [[HIGH:%r[0-5]]], %a0
+; CHECK-MAIN: sllg [[TP:%r[0-5]]], [[HIGH]], 32
+; CHECK-MAIN: ear [[TP]], %a1
+; CHECK-MAIN: agr %r2, [[TP]]
+; CHECK-MAIN: br %r14
+  ret i32 *@x
+}
diff --git a/test/CodeGen/SystemZ/tls-04.ll b/test/CodeGen/SystemZ/tls-04.ll
new file mode 100644
index 0000000..dcb210a
--- /dev/null
+++ b/test/CodeGen/SystemZ/tls-04.ll
@@ -0,0 +1,28 @@
+; Test local-dynamic TLS accesses.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-MAIN
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-CP
+
+@x = thread_local(localdynamic) global i32 0
+
+; Call __tls_get_offset to retrieve the module's TLS base offset.
+; Add the per-symbol offset and the thread pointer.
+define i32 *@foo() {
+; CHECK-CP: .LCP{{.*}}_0:
+; CHECK-CP: .quad x@TLSLDM
+; CHECK-CP: .LCP{{.*}}_1:
+; CHECK-CP: .quad x@DTPOFF
+;
+; CHECK-MAIN-LABEL: foo:
+; CHECK-MAIN-DAG: larl %r12, _GLOBAL_OFFSET_TABLE_
+; CHECK-MAIN-DAG: lgrl %r2, .LCP{{.*}}_0
+; CHECK-MAIN: brasl %r14, __tls_get_offset@PLT:tls_ldcall:x
+; CHECK-MAIN: larl %r1, .LCP{{.*}}_1
+; CHECK-MAIN: ag %r2, 0(%r1)
+; CHECK-MAIN: ear [[HIGH:%r[0-5]]], %a0
+; CHECK-MAIN: sllg [[TP:%r[0-5]]], [[HIGH]], 32
+; CHECK-MAIN: ear [[TP]], %a1
+; CHECK-MAIN: agr %r2, [[TP]]
+; CHECK-MAIN: br %r14
+  ret i32 *@x
+}
diff --git a/test/CodeGen/SystemZ/tls-05.ll b/test/CodeGen/SystemZ/tls-05.ll
new file mode 100644
index 0000000..385208d
--- /dev/null
+++ b/test/CodeGen/SystemZ/tls-05.ll
@@ -0,0 +1,15 @@
+; Test general-dynamic TLS access optimizations.
+;
+; If we access the same TLS variable twice, there should only be
+; a single call to __tls_get_offset.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "__tls_get_offset" | count 1
+
+@x = thread_local global i32 0
+
+define i32 @foo() {
+  %val = load i32* @x
+  %inc = add nsw i32 %val, 1
+  store i32 %inc, i32* @x
+  ret i32 %val
+}
diff --git a/test/CodeGen/SystemZ/tls-06.ll b/test/CodeGen/SystemZ/tls-06.ll
new file mode 100644
index 0000000..fcd8614
--- /dev/null
+++ b/test/CodeGen/SystemZ/tls-06.ll
@@ -0,0 +1,17 @@
+; Test general-dynamic TLS access optimizations.
+;
+; If we access two different TLS variables, we need two calls to
+; __tls_get_offset, but should load _GLOBAL_OFFSET_TABLE only once.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "__tls_get_offset" | count 2
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "_GLOBAL_OFFSET_TABLE_" | count 1
+
+@x = thread_local global i32 0
+@y = thread_local global i32 0
+
+define i32 @foo() {
+  %valx = load i32* @x
+  %valy = load i32* @y
+  %add = add nsw i32 %valx, %valy
+  ret i32 %add
+}
diff --git a/test/CodeGen/SystemZ/tls-07.ll b/test/CodeGen/SystemZ/tls-07.ll
new file mode 100644
index 0000000..6547515
--- /dev/null
+++ b/test/CodeGen/SystemZ/tls-07.ll
@@ -0,0 +1,16 @@
+; Test local-dynamic TLS access optimizations.
+;
+; If we access two different local-dynamic TLS variables, we only
+; need a single call to __tls_get_offset.
+;
+; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "__tls_get_offset" | count 1
+
+@x = thread_local(localdynamic) global i32 0
+@y = thread_local(localdynamic) global i32 0
+
+define i32 @foo() {
+  %valx = load i32* @x
+  %valy = load i32* @y
+  %add = add nsw i32 %valx, %valy
+  ret i32 %add
+}
diff --git a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
index d31a84b..622f55d 100644
--- a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
+++ b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
@@ -25,7 +25,7 @@ define void @_Z19getClosestDiagonal3ii(%0* noalias sret, i32, i32) nounwind {
   %storemerge = phi double [ -1.000000e+00, %4 ], [ 1.000000e+00, %3 ], [ 1.000000e+00, %3 ] ; <double> [#uses=1]
   %v_6 = icmp slt i32 %1, 2                         ; <i1> [#uses=1]
   %storemerge1 = select i1 %v_6, double 1.000000e+00, double -1.000000e+00 ; <double> [#uses=3]
-  call void @llvm.dbg.value(metadata !{double %storemerge}, i64 0, metadata !91, metadata !{metadata !"0x102"}), !dbg !0
+  call void @llvm.dbg.value(metadata double %storemerge, i64 0, metadata !91, metadata !{!"0x102"}), !dbg !0
   %v_7 = icmp eq i32 %2, 1, !dbg !92                ; <i1> [#uses=1]
   %storemerge2 = select i1 %v_7, double 1.000000e+00, double -1.000000e+00 ; <double> [#uses=3]
   %v_8 = getelementptr inbounds %0* %0, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
@@ -48,108 +48,108 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 !llvm.dbg.cu = !{!5}
 !llvm.module.flags = !{!104}
-!0 = metadata !{i32 46, i32 0, metadata !1, null}
-!1 = metadata !{metadata !"0xb\0044\000\000", metadata !101, metadata !2} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{metadata !"0xb\0044\000\000", metadata !101, metadata !3} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{metadata !"0x2e\00getClosestDiagonal3\00getClosestDiagonal3\00_Z19getClosestDiagonal3ii\0044\000\001\000\006\000\000\000", metadata !101, null, metadata !6, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!4 = metadata !{metadata !"0x29", metadata !101} ; [ DW_TAG_file_type ]
-!5 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)\001\00\000\00\000", metadata !101, metadata !102, metadata !102, metadata !103, null, null} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !22, metadata !22}
-!8 = metadata !{metadata !"0x13\00ggVector3\0066\00192\0032\000\000\000", metadata !99, null, null, metadata !10, null, null, null} ; [ DW_TAG_structure_type ] [ggVector3] [line 66, size 192, align 32, offset 0] [def] [from ]
-!9 = metadata !{metadata !"0x29", metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src", metadata !5} ; [ DW_TAG_file_type ]
-!99 = metadata !{metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
-!10 = metadata !{metadata !11, metadata !16, metadata !23, metadata !26, metadata !29, metadata !30, metadata !35, metadata !36, metadata !37, metadata !41, metadata !42, metadata !43, metadata !46, metadata !47, metadata !48, metadata !52, metadata !53, metadata !54, metadata !57, metadata !60, metadata !63, metadata !66, metadata !70, metadata !71, metadata !74, metadata !75, metadata !76, metadata !77, metadata !78, metadata !81, metadata !82, metadata !83, metadata !84, metadata !85, metadata !88, metadata !89, metadata !90}
-!11 = metadata !{metadata !"0xd\00e\00160\00192\0032\000\000", metadata !99, metadata !8, metadata !12} ; [ DW_TAG_member ]
-!12 = metadata !{metadata !"0x1\00\000\00192\0032\000\000", metadata !101, metadata !4, metadata !13, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 192, align 32, offset 0] [from double]
-!13 = metadata !{metadata !"0x24\00double\000\0064\0032\000\000\004", metadata !101, metadata !4} ; [ DW_TAG_base_type ]
-!14 = metadata !{metadata !15}
-!15 = metadata !{metadata !"0x21\000\003"}        ; [ DW_TAG_subrange_type ]
-!16 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0072\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !17, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{null, metadata !19, metadata !20}
-!19 = metadata !{metadata !"0xf\00\000\0032\0032\000\0064", metadata !101, metadata !4, metadata !8} ; [ DW_TAG_pointer_type ]
-!20 = metadata !{metadata !"0x16\00ggBoolean\00478\000\000\000\000", metadata !100, null, metadata !22} ; [ DW_TAG_typedef ]
-!21 = metadata !{metadata !"0x29", metadata !"math.h", metadata !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm", metadata !5} ; [ DW_TAG_file_type ]
-!100 = metadata !{metadata !"math.h", metadata !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm"}
-!22 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !101, metadata !4} ; [ DW_TAG_base_type ]
-!23 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0073\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !24, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!25 = metadata !{null, metadata !19}
-!26 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0074\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !27, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!27 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!28 = metadata !{null, metadata !19, metadata !13, metadata !13, metadata !13}
-!29 = metadata !{metadata !"0x2e\00Set\00Set\00_ZN9ggVector33SetEddd\0081\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !27, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!30 = metadata !{metadata !"0x2e\00x\00x\00_ZNK9ggVector31xEv\0082\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!31 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !32, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!32 = metadata !{metadata !13, metadata !33}
-!33 = metadata !{metadata !"0xf\00\000\0032\0032\000\0064", metadata !101, metadata !4, metadata !34} ; [ DW_TAG_pointer_type ]
-!34 = metadata !{metadata !"0x26\00\000\00192\0032\000\000", metadata !101, metadata !4, metadata !8} ; [ DW_TAG_const_type ]
-!35 = metadata !{metadata !"0x2e\00y\00y\00_ZNK9ggVector31yEv\0083\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!36 = metadata !{metadata !"0x2e\00z\00z\00_ZNK9ggVector31zEv\0084\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!37 = metadata !{metadata !"0x2e\00x\00x\00_ZN9ggVector31xEv\0085\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!38 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !39, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!39 = metadata !{metadata !40, metadata !19}
-!40 = metadata !{metadata !"0x10\00double\000\0032\0032\000\000", metadata !101, metadata !4, metadata !13} ; [ DW_TAG_reference_type ]
-!41 = metadata !{metadata !"0x2e\00y\00y\00_ZN9ggVector31yEv\0086\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!42 = metadata !{metadata !"0x2e\00z\00z\00_ZN9ggVector31zEv\0087\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!43 = metadata !{metadata !"0x2e\00SetX\00SetX\00_ZN9ggVector34SetXEd\0088\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!44 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !45, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!45 = metadata !{null, metadata !19, metadata !13}
-!46 = metadata !{metadata !"0x2e\00SetY\00SetY\00_ZN9ggVector34SetYEd\0089\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!47 = metadata !{metadata !"0x2e\00SetZ\00SetZ\00_ZN9ggVector34SetZEd\0090\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!48 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0092\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !49, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!49 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !50, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!50 = metadata !{null, metadata !19, metadata !51}
-!51 = metadata !{metadata !"0x10\00\000\0032\0032\000\000", metadata !101, metadata !4, metadata !34} ; [ DW_TAG_reference_type ]
-!52 = metadata !{metadata !"0x2e\00tolerance\00tolerance\00_ZNK9ggVector39toleranceEv\00100\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!53 = metadata !{metadata !"0x2e\00tolerance\00tolerance\00_ZN9ggVector39toleranceEv\00101\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!54 = metadata !{metadata !"0x2e\00operator+\00operator+\00_ZNK9ggVector3psEv\00107\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !55, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!55 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !56, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!56 = metadata !{metadata !51, metadata !33}
-!57 = metadata !{metadata !"0x2e\00operator-\00operator-\00_ZNK9ggVector3ngEv\00108\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !58, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!58 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !59, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!59 = metadata !{metadata !8, metadata !33}
-!60 = metadata !{metadata !"0x2e\00operator[]\00operator[]\00_ZNK9ggVector3ixEi\00290\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !61, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!61 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !62, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!62 = metadata !{metadata !13, metadata !33, metadata !22}
-!63 = metadata !{metadata !"0x2e\00operator[]\00operator[]\00_ZN9ggVector3ixEi\00278\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !64, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!64 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !65, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!65 = metadata !{metadata !40, metadata !19, metadata !22}
-!66 = metadata !{metadata !"0x2e\00operator+=\00operator+=\00_ZN9ggVector3pLERKS_\00303\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !67, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!67 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !68, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!68 = metadata !{metadata !69, metadata !19, metadata !51}
-!69 = metadata !{metadata !"0x10\00ggVector3\000\0032\0032\000\000", metadata !101, metadata !4, metadata !8} ; [ DW_TAG_reference_type ]
-!70 = metadata !{metadata !"0x2e\00operator-=\00operator-=\00_ZN9ggVector3mIERKS_\00310\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !67, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!71 = metadata !{metadata !"0x2e\00operator*=\00operator*=\00_ZN9ggVector3mLEd\00317\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !72, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!72 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !73, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!73 = metadata !{metadata !69, metadata !19, metadata !13}
-!74 = metadata !{metadata !"0x2e\00operator/=\00operator/=\00_ZN9ggVector3dVEd\00324\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !72, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!75 = metadata !{metadata !"0x2e\00length\00length\00_ZNK9ggVector36lengthEv\00121\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!76 = metadata !{metadata !"0x2e\00squaredLength\00squaredLength\00_ZNK9ggVector313squaredLengthEv\00122\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!77 = metadata !{metadata !"0x2e\00MakeUnitVector\00MakeUnitVector\00_ZN9ggVector314MakeUnitVectorEv\00217\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !24, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!78 = metadata !{metadata !"0x2e\00Perturb\00Perturb\00_ZNK9ggVector37PerturbEdd\00126\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !79, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!79 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !80, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!80 = metadata !{metadata !8, metadata !33, metadata !13, metadata !13}
-!81 = metadata !{metadata !"0x2e\00maxComponent\00maxComponent\00_ZNK9ggVector312maxComponentEv\00128\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!82 = metadata !{metadata !"0x2e\00minComponent\00minComponent\00_ZNK9ggVector312minComponentEv\00129\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!83 = metadata !{metadata !"0x2e\00maxAbsComponent\00maxAbsComponent\00_ZNK9ggVector315maxAbsComponentEv\00131\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!84 = metadata !{metadata !"0x2e\00minAbsComponent\00minAbsComponent\00_ZNK9ggVector315minAbsComponentEv\00132\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!85 = metadata !{metadata !"0x2e\00indexOfMinComponent\00indexOfMinComponent\00_ZNK9ggVector319indexOfMinComponentEv\00133\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!86 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !87, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!87 = metadata !{metadata !22, metadata !33}
-!88 = metadata !{metadata !"0x2e\00indexOfMinAbsComponent\00indexOfMinAbsComponent\00_ZNK9ggVector322indexOfMinAbsComponentEv\00137\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!89 = metadata !{metadata !"0x2e\00indexOfMaxComponent\00indexOfMaxComponent\00_ZNK9ggVector319indexOfMaxComponentEv\00146\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!90 = metadata !{metadata !"0x2e\00indexOfMaxAbsComponent\00indexOfMaxAbsComponent\00_ZNK9ggVector322indexOfMaxAbsComponentEv\00150\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!91 = metadata !{metadata !"0x100\00vx\0046\000", metadata !1, metadata !4, metadata !13} ; [ DW_TAG_auto_variable ]
-!92 = metadata !{i32 48, i32 0, metadata !1, null}
-!93 = metadata !{i32 218, i32 0, metadata !94, metadata !96}
-!94 = metadata !{metadata !"0xb\00217\000\000", metadata !101, metadata !95} ; [ DW_TAG_lexical_block ]
-!95 = metadata !{metadata !"0xb\00217\000\000", metadata !101, metadata !77} ; [ DW_TAG_lexical_block ]
-!96 = metadata !{i32 51, i32 0, metadata !1, null}
-!97 = metadata !{i32 227, i32 0, metadata !94, metadata !96}
-!98 = metadata !{i32 52, i32 0, metadata !1, null}
-!101 = metadata !{metadata !"ggEdgeDiscrepancy.cc", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
-!102 = metadata !{i32 0}
-!103 = metadata !{metadata !3, metadata !77}
-!104 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !MDLocation(line: 46, scope: !1)
+!1 = !{!"0xb\0044\000\000", !101, !2} ; [ DW_TAG_lexical_block ]
+!2 = !{!"0xb\0044\000\000", !101, !3} ; [ DW_TAG_lexical_block ]
+!3 = !{!"0x2e\00getClosestDiagonal3\00getClosestDiagonal3\00_Z19getClosestDiagonal3ii\0044\000\001\000\006\000\000\000", !101, null, !6, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!4 = !{!"0x29", !101} ; [ DW_TAG_file_type ]
+!5 = !{!"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)\001\00\000\00\000", !101, !102, !102, !103, null, null} ; [ DW_TAG_compile_unit ]
+!6 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !22, !22}
+!8 = !{!"0x13\00ggVector3\0066\00192\0032\000\000\000", !99, null, null, !10, null, null, null} ; [ DW_TAG_structure_type ] [ggVector3] [line 66, size 192, align 32, offset 0] [def] [from ]
+!9 = !{!"0x29", !"ggVector3.h", !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src", !5} ; [ DW_TAG_file_type ]
+!99 = !{!"ggVector3.h", !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
+!10 = !{!11, !16, !23, !26, !29, !30, !35, !36, !37, !41, !42, !43, !46, !47, !48, !52, !53, !54, !57, !60, !63, !66, !70, !71, !74, !75, !76, !77, !78, !81, !82, !83, !84, !85, !88, !89, !90}
+!11 = !{!"0xd\00e\00160\00192\0032\000\000", !99, !8, !12} ; [ DW_TAG_member ]
+!12 = !{!"0x1\00\000\00192\0032\000\000", !101, !4, !13, !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 192, align 32, offset 0] [from double]
+!13 = !{!"0x24\00double\000\0064\0032\000\000\004", !101, !4} ; [ DW_TAG_base_type ]
+!14 = !{!15}
+!15 = !{!"0x21\000\003"}        ; [ DW_TAG_subrange_type ]
+!16 = !{!"0x2e\00ggVector3\00ggVector3\00\0072\000\000\000\006\000\000\000", !9, !8, !17, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{null, !19, !20}
+!19 = !{!"0xf\00\000\0032\0032\000\0064", !101, !4, !8} ; [ DW_TAG_pointer_type ]
+!20 = !{!"0x16\00ggBoolean\00478\000\000\000\000", !100, null, !22} ; [ DW_TAG_typedef ]
+!21 = !{!"0x29", !"math.h", !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm", !5} ; [ DW_TAG_file_type ]
+!100 = !{!"math.h", !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm"}
+!22 = !{!"0x24\00int\000\0032\0032\000\000\005", !101, !4} ; [ DW_TAG_base_type ]
+!23 = !{!"0x2e\00ggVector3\00ggVector3\00\0073\000\000\000\006\000\000\000", !9, !8, !24, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!24 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!25 = !{null, !19}
+!26 = !{!"0x2e\00ggVector3\00ggVector3\00\0074\000\000\000\006\000\000\000", !9, !8, !27, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!27 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = !{null, !19, !13, !13, !13}
+!29 = !{!"0x2e\00Set\00Set\00_ZN9ggVector33SetEddd\0081\000\000\000\006\000\000\000", !9, !8, !27, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!30 = !{!"0x2e\00x\00x\00_ZNK9ggVector31xEv\0082\000\000\000\006\000\000\000", !9, !8, !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!31 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !32, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!32 = !{!13, !33}
+!33 = !{!"0xf\00\000\0032\0032\000\0064", !101, !4, !34} ; [ DW_TAG_pointer_type ]
+!34 = !{!"0x26\00\000\00192\0032\000\000", !101, !4, !8} ; [ DW_TAG_const_type ]
+!35 = !{!"0x2e\00y\00y\00_ZNK9ggVector31yEv\0083\000\000\000\006\000\000\000", !9, !8, !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!36 = !{!"0x2e\00z\00z\00_ZNK9ggVector31zEv\0084\000\000\000\006\000\000\000", !9, !8, !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!37 = !{!"0x2e\00x\00x\00_ZN9ggVector31xEv\0085\000\001\000\006\000\000\000", !9, !8, !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!38 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !39, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!39 = !{!40, !19}
+!40 = !{!"0x10\00double\000\0032\0032\000\000", !101, !4, !13} ; [ DW_TAG_reference_type ]
+!41 = !{!"0x2e\00y\00y\00_ZN9ggVector31yEv\0086\000\001\000\006\000\000\000", !9, !8, !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!42 = !{!"0x2e\00z\00z\00_ZN9ggVector31zEv\0087\000\001\000\006\000\000\000", !9, !8, !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!43 = !{!"0x2e\00SetX\00SetX\00_ZN9ggVector34SetXEd\0088\000\000\000\006\000\000\000", !9, !8, !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!44 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !45, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!45 = !{null, !19, !13}
+!46 = !{!"0x2e\00SetY\00SetY\00_ZN9ggVector34SetYEd\0089\000\000\000\006\000\000\000", !9, !8, !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!47 = !{!"0x2e\00SetZ\00SetZ\00_ZN9ggVector34SetZEd\0090\000\000\000\006\000\000\000", !9, !8, !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!48 = !{!"0x2e\00ggVector3\00ggVector3\00\0092\000\000\000\006\000\000\000", !9, !8, !49, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!49 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !50, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!50 = !{null, !19, !51}
+!51 = !{!"0x10\00\000\0032\0032\000\000", !101, !4, !34} ; [ DW_TAG_reference_type ]
+!52 = !{!"0x2e\00tolerance\00tolerance\00_ZNK9ggVector39toleranceEv\00100\000\000\000\006\000\000\000", !9, !8, !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!53 = !{!"0x2e\00tolerance\00tolerance\00_ZN9ggVector39toleranceEv\00101\000\000\000\006\000\000\000", !9, !8, !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!54 = !{!"0x2e\00operator+\00operator+\00_ZNK9ggVector3psEv\00107\000\000\000\006\000\000\000", !9, !8, !55, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!55 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !56, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!56 = !{!51, !33}
+!57 = !{!"0x2e\00operator-\00operator-\00_ZNK9ggVector3ngEv\00108\000\000\000\006\000\000\000", !9, !8, !58, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!58 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !59, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!59 = !{!8, !33}
+!60 = !{!"0x2e\00operator[]\00operator[]\00_ZNK9ggVector3ixEi\00290\000\000\000\006\000\000\000", !9, !8, !61, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!61 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !62, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!62 = !{!13, !33, !22}
+!63 = !{!"0x2e\00operator[]\00operator[]\00_ZN9ggVector3ixEi\00278\000\000\000\006\000\000\000", !9, !8, !64, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!64 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !65, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!65 = !{!40, !19, !22}
+!66 = !{!"0x2e\00operator+=\00operator+=\00_ZN9ggVector3pLERKS_\00303\000\000\000\006\000\000\000", !9, !8, !67, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!67 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !68, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!68 = !{!69, !19, !51}
+!69 = !{!"0x10\00ggVector3\000\0032\0032\000\000", !101, !4, !8} ; [ DW_TAG_reference_type ]
+!70 = !{!"0x2e\00operator-=\00operator-=\00_ZN9ggVector3mIERKS_\00310\000\000\000\006\000\000\000", !9, !8, !67, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!71 = !{!"0x2e\00operator*=\00operator*=\00_ZN9ggVector3mLEd\00317\000\000\000\006\000\000\000", !9, !8, !72, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!72 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !73, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!73 = !{!69, !19, !13}
+!74 = !{!"0x2e\00operator/=\00operator/=\00_ZN9ggVector3dVEd\00324\000\000\000\006\000\000\000", !9, !8, !72, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!75 = !{!"0x2e\00length\00length\00_ZNK9ggVector36lengthEv\00121\000\000\000\006\000\000\000", !9, !8, !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!76 = !{!"0x2e\00squaredLength\00squaredLength\00_ZNK9ggVector313squaredLengthEv\00122\000\000\000\006\000\000\000", !9, !8, !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!77 = !{!"0x2e\00MakeUnitVector\00MakeUnitVector\00_ZN9ggVector314MakeUnitVectorEv\00217\000\001\000\006\000\000\000", !9, !8, !24, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!78 = !{!"0x2e\00Perturb\00Perturb\00_ZNK9ggVector37PerturbEdd\00126\000\000\000\006\000\000\000", !9, !8, !79, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!79 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !80, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!80 = !{!8, !33, !13, !13}
+!81 = !{!"0x2e\00maxComponent\00maxComponent\00_ZNK9ggVector312maxComponentEv\00128\000\000\000\006\000\000\000", !9, !8, !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!82 = !{!"0x2e\00minComponent\00minComponent\00_ZNK9ggVector312minComponentEv\00129\000\000\000\006\000\000\000", !9, !8, !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!83 = !{!"0x2e\00maxAbsComponent\00maxAbsComponent\00_ZNK9ggVector315maxAbsComponentEv\00131\000\000\000\006\000\000\000", !9, !8, !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!84 = !{!"0x2e\00minAbsComponent\00minAbsComponent\00_ZNK9ggVector315minAbsComponentEv\00132\000\000\000\006\000\000\000", !9, !8, !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!85 = !{!"0x2e\00indexOfMinComponent\00indexOfMinComponent\00_ZNK9ggVector319indexOfMinComponentEv\00133\000\000\000\006\000\000\000", !9, !8, !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!86 = !{!"0x15\00\000\000\000\000\000\000", !101, !4, null, !87, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!87 = !{!22, !33}
+!88 = !{!"0x2e\00indexOfMinAbsComponent\00indexOfMinAbsComponent\00_ZNK9ggVector322indexOfMinAbsComponentEv\00137\000\000\000\006\000\000\000", !9, !8, !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!89 = !{!"0x2e\00indexOfMaxComponent\00indexOfMaxComponent\00_ZNK9ggVector319indexOfMaxComponentEv\00146\000\000\000\006\000\000\000", !9, !8, !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!90 = !{!"0x2e\00indexOfMaxAbsComponent\00indexOfMaxAbsComponent\00_ZNK9ggVector322indexOfMaxAbsComponentEv\00150\000\000\000\006\000\000\000", !9, !8, !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!91 = !{!"0x100\00vx\0046\000", !1, !4, !13} ; [ DW_TAG_auto_variable ]
+!92 = !MDLocation(line: 48, scope: !1)
+!93 = !MDLocation(line: 218, scope: !94, inlinedAt: !96)
+!94 = !{!"0xb\00217\000\000", !101, !95} ; [ DW_TAG_lexical_block ]
+!95 = !{!"0xb\00217\000\000", !101, !77} ; [ DW_TAG_lexical_block ]
+!96 = !MDLocation(line: 51, scope: !1)
+!97 = !MDLocation(line: 227, scope: !94, inlinedAt: !96)
+!98 = !MDLocation(line: 52, scope: !1)
+!101 = !{!"ggEdgeDiscrepancy.cc", !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
+!102 = !{i32 0}
+!103 = !{!3, !77}
+!104 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/Thumb/fastcc.ll b/test/CodeGen/Thumb/fastcc.ll
index 98ff684..1a01246 100644
--- a/test/CodeGen/Thumb/fastcc.ll
+++ b/test/CodeGen/Thumb/fastcc.ll
@@ -33,4 +33,4 @@ attributes #0 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.5.0 "}
+!0 = !{!"clang version 3.5.0 "}
diff --git a/test/CodeGen/Thumb/iabs.ll b/test/CodeGen/Thumb/iabs.ll
index 76224bc..ecd4a6b 100644
--- a/test/CodeGen/Thumb/iabs.ll
+++ b/test/CodeGen/Thumb/iabs.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=thumb-unknown-unknown -filetype=obj -o %t.o
-; RUN: llvm-objdump -disassemble -arch=thumb %t.o | FileCheck %s
+; RUN: llvm-objdump -disassemble -arch-name=thumb %t.o | FileCheck %s
 
 define i32 @test(i32 %a) {
         %tmp1neg = sub i32 0, %a
diff --git a/test/CodeGen/Thumb/stack-access.ll b/test/CodeGen/Thumb/stack-access.ll
new file mode 100644
index 0000000..bcffda2
--- /dev/null
+++ b/test/CodeGen/Thumb/stack-access.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=thumb-eabi < %s -o - | FileCheck %s
+
+; Check that stack addresses are generated using a single ADD
+define void @test1(i8** %p) {
+  %x = alloca i8, align 1
+  %y = alloca i8, align 1
+  %z = alloca i8, align 1
+; CHECK: add r1, sp, #8
+; CHECK: str r1, [r0]
+  store i8* %x, i8** %p, align 4
+; CHECK: add r1, sp, #4
+; CHECK: str r1, [r0]
+  store i8* %y, i8** %p, align 4
+; CHECK: mov r1, sp
+; CHECK: str r1, [r0]
+  store i8* %z, i8** %p, align 4
+  ret void
+}
+
+; Stack offsets larger than 1020 still need two ADDs
+define void @test2([1024 x i8]** %p) {
+  %arr1 = alloca [1024 x i8], align 1
+  %arr2 = alloca [1024 x i8], align 1
+; CHECK: add r1, sp, #1020
+; CHECK: adds r1, #4
+; CHECK: str r1, [r0]
+  store [1024 x i8]* %arr1, [1024 x i8]** %p, align 4
+; CHECK: mov r1, sp
+; CHECK: str r1, [r0]
+  store [1024 x i8]* %arr2, [1024 x i8]** %p, align 4
+  ret void
+}
+
+; If possible stack-based lrdb/ldrh are widened to use SP-based addressing
+define i32 @test3() #0 {
+  %x = alloca i8, align 1
+  %y = alloca i8, align 1
+; CHECK: ldr r0, [sp]
+  %1 = load i8* %x, align 1
+; CHECK: ldr r1, [sp, #4]
+  %2 = load i8* %y, align 1
+  %3 = add nsw i8 %1, %2
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @test4() #0 {
+  %x = alloca i16, align 2
+  %y = alloca i16, align 2
+; CHECK: ldr r0, [sp]
+  %1 = load i16* %x, align 2
+; CHECK: ldr r1, [sp, #4]
+  %2 = load i16* %y, align 2
+  %3 = add nsw i16 %1, %2
+  %4 = zext i16 %3 to i32
+  ret i32 %4
+}
+
+; Don't widen if the value needs to be zero-extended
+define zeroext i8 @test5() {
+  %x = alloca i8, align 1
+; CHECK: mov r0, sp
+; CHECK: ldrb r0, [r0]
+  %1 = load i8* %x, align 1
+  ret i8 %1
+}
+
+define zeroext i16 @test6() {
+  %x = alloca i16, align 2
+; CHECK: mov r0, sp
+; CHECK: ldrh r0, [r0]
+  %1 = load i16* %x, align 2
+  ret i16 %1
+}
diff --git a/test/CodeGen/Thumb/stm-merge.ll b/test/CodeGen/Thumb/stm-merge.ll
index 76e71f4..d4b4cd2 100644
--- a/test/CodeGen/Thumb/stm-merge.ll
+++ b/test/CodeGen/Thumb/stm-merge.ll
@@ -7,16 +7,17 @@ target triple = "thumbv6m--linux-gnueabi"
 @e = internal unnamed_addr global i32* null, align 4
 
 ; Function Attrs: nounwind optsize
-define void @fn1() #0 {
+define void @fn1(i32 %x, i32 %y, i32 %z) #0 {
 entry:
 ; CHECK-LABEL: fn1:
 ; CHECK: stm r[[BASE:[0-9]]]!, {{.*}}
 ; CHECK-NOT: {{.*}} r[[BASE]]
-; CHECK: ldr r[[BASE]], {{.*}}
   %g = alloca i32, align 4
   %h = alloca i32, align 4
-  store i32 1, i32* %g, align 4
-  store i32 0, i32* %h, align 4
+  %i = alloca i32, align 4
+  store i32 %x, i32* %i, align 4
+  store i32 %y, i32* %h, align 4
+  store i32 %z, i32* %g, align 4
   %.pr = load i32* @d, align 4
   %cmp11 = icmp slt i32 %.pr, 1
   br i1 %cmp11, label %for.inc.lr.ph, label %for.body5
diff --git a/test/CodeGen/Thumb/vargs.ll b/test/CodeGen/Thumb/vargs.ll
index 4078b01..71e8afa 100644
--- a/test/CodeGen/Thumb/vargs.ll
+++ b/test/CodeGen/Thumb/vargs.ll
@@ -6,6 +6,10 @@
 
 define void @f(i32 %a, ...) {
 entry:
+; Check that space is reserved above the pushed lr for variadic argument
+; registers to be stored in.
+; CHECK: sub sp, #[[IMM:[0-9]+]]
+; CHECK: push
         %va = alloca i8*, align 4               ; <i8**> [#uses=4]
         %va.upgrd.1 = bitcast i8** %va to i8*           ; <i8*> [#uses=1]
         call void @llvm.va_start( i8* %va.upgrd.1 )
@@ -27,6 +31,13 @@ bb7:            ; preds = %bb
         %va.upgrd.4 = bitcast i8** %va to i8*           ; <i8*> [#uses=1]
         call void @llvm.va_end( i8* %va.upgrd.4 )
         ret void
+
+; The return sequence should pop the lr to r3, recover the stack space used to
+; store variadic argument registers, then return via r3. Possibly there is a pop
+; before this, but only if the function happened to use callee-saved registers.
+; CHECK: pop {r3}
+; CHECK: add sp, #[[IMM]]
+; CHECK: bx r3
 }
 
 declare void @llvm.va_start(i8*)
@@ -34,8 +45,3 @@ declare void @llvm.va_start(i8*)
 declare i32 @printf(i8*, ...)
 
 declare void @llvm.va_end(i8*)
-
-; CHECK: pop
-; CHECK: pop
-; CHECK-NOT: pop
-
diff --git a/test/CodeGen/Thumb2/aligned-spill.ll b/test/CodeGen/Thumb2/aligned-spill.ll
index 3a2803f..4ef294b 100644
--- a/test/CodeGen/Thumb2/aligned-spill.ll
+++ b/test/CodeGen/Thumb2/aligned-spill.ll
@@ -9,7 +9,7 @@ target triple = "thumbv7-apple-ios"
 ;
 ; The caller-saved r4 is used as a scratch register for stack realignment.
 ; CHECK: push {r4, r7, lr}
-; CHECK: bic r4, r4, #7
+; CHECK: bfc r4, #0, #3
 ; CHECK: mov sp, r4
 define void @f(double* nocapture %p) nounwind ssp {
 entry:
@@ -23,7 +23,7 @@ entry:
 ; NEON: f
 ; NEON: push {r4, r7, lr}
 ; NEON: sub.w r4, sp, #64
-; NEON: bic r4, r4, #15
+; NEON: bfc r4, #0, #4
 ; Stack pointer must be updated before the spills.
 ; NEON: mov sp, r4
 ; NEON: vst1.64 {d8, d9, d10, d11}, [r4:128]!
@@ -54,7 +54,7 @@ entry:
 ; NEON: f7
 ; NEON: push {r4, r7, lr}
 ; NEON: sub.w r4, sp, #56
-; NEON: bic r4, r4, #15
+; NEON: bfc r4, #0, #4
 ; Stack pointer must be updated before the spills.
 ; NEON: mov sp, r4
 ; NEON: vst1.64 {d8, d9, d10, d11}, [r4:128]!
@@ -81,7 +81,7 @@ entry:
 ; NEON: push {r4, r7, lr}
 ; NEON: vpush {d12, d13, d14, d15}
 ; NEON: sub.w r4, sp, #24
-; NEON: bic r4, r4, #15
+; NEON: bfc r4, #0, #4
 ; Stack pointer must be updated before the spills.
 ; NEON: mov sp, r4
 ; NEON: vst1.64 {d8, d9}, [r4:128]
diff --git a/test/CodeGen/Thumb2/constant-islands-jump-table.ll b/test/CodeGen/Thumb2/constant-islands-jump-table.ll
new file mode 100644
index 0000000..0dd7092
--- /dev/null
+++ b/test/CodeGen/Thumb2/constant-islands-jump-table.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabihf -O1 %s -o - | FileCheck %s
+
+; CHECK-LABEL: test_jump_table:
+; CHECK: b .LBB
+; CHECK-NOT: tbh
+
+define i32 @test_jump_table(i32 %x, float %in) {
+
+h1:
+
+ %b0 = fadd float %in, 1234.5
+ %b1 = fptoui float %b0 to i32
+  
+  switch i32 %x, label %h2 [
+    i32 0, label %h3
+    i32 2, label %h4
+    i32 4, label %h5
+    i32 6, label %h6
+  ]
+
+h2:
+  %a0 = add i32 %x, 5
+  br label %h3
+
+h3:
+  %d2 = phi i32 [%b1, %h1], [%a0, %h2]
+  %d3 = add i32 %d2, 3
+  br label %h4
+
+h4:
+  %c2 = phi i32 [%b1, %h1], [%d3, %h3]
+  %c3 = add i32 %c2, 5
+  br label %h5
+
+h5:
+  %a2 = phi i32 [%b1, %h1], [%c3, %h4]
+  %a3 = add i32 %a2, 6
+  br label %h6
+
+h6:
+  %y = phi i32 [0, %h1], [%a3, %h5]
+  call i32 @llvm.arm.space(i32 2000, i32 undef)
+  ret i32 %y
+  
+}
+
+declare i32 @llvm.arm.space(i32, i32)
diff --git a/test/CodeGen/Thumb2/constant-islands-new-island-padding.ll b/test/CodeGen/Thumb2/constant-islands-new-island-padding.ll
new file mode 100644
index 0000000..991b043
--- /dev/null
+++ b/test/CodeGen/Thumb2/constant-islands-new-island-padding.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios %s -o - | FileCheck %s
+
+@g0 = common global i32 0, align 4
+@d0 = common global double 0.000000e+00, align 8
+@f0 = common global float 0.000000e+00, align 4
+@g1 = common global i32 0, align 4
+
+declare i32 @llvm.arm.space(i32, i32)
+
+; Check that the constant island pass moves the float constant pool entry inside
+; the function.
+
+; CHECK: .long 1067320814 @ float 1.23455596
+; CHECK: {{.*}} %do.end
+
+define i32 @testpadding(i32 %a) {
+entry:
+  %0 = load i32* @g0, align 4
+  %add = add nsw i32 %0, 12
+  store i32 %add, i32* @g0, align 4
+  %1 = load double* @d0, align 8
+  %add1 = fadd double %1, 0x3FF3C0B8ED46EACB
+  store double %add1, double* @d0, align 8
+  %tmpcall11 = call i32 @llvm.arm.space(i32 28, i32 undef)
+  call void @foo20(i32 191)
+  %2 = load float* @f0, align 4
+  %add2 = fadd float %2, 0x3FF3C0BDC0000000
+  store float %add2, float* @f0, align 4
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  tail call void @foo20(i32 19)
+  %3 = load i32* @g1, align 4
+  %tobool = icmp eq i32 %3, 0
+  br i1 %tobool, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body
+  %tmpcall111 = call i32 @llvm.arm.space(i32 954, i32 undef)
+  ret i32 10
+}
+
+declare void @foo20(i32)
diff --git a/test/CodeGen/Thumb2/ifcvt-neon.ll b/test/CodeGen/Thumb2/ifcvt-neon.ll
index 501b0b6..00f3399 100644
--- a/test/CodeGen/Thumb2/ifcvt-neon.ll
+++ b/test/CodeGen/Thumb2/ifcvt-neon.ll
@@ -12,9 +12,9 @@ entry:
   br i1 %0, label %bb, label %bb1
 
 bb:                                               ; preds = %entry
-; CHECK:      ite lt
-; CHECK:      vsublt.f32
-; CHECK-NEXT: vaddge.f32
+; CHECK:      vsub.f32
+; CHECK-NEXT: vadd.f32
+; CHECK:      it gt
   %3 = fadd float %1, %2                          ; <float> [#uses=1]
   br label %bb2
 
diff --git a/test/CodeGen/Thumb2/thumb2-cmn.ll b/test/CodeGen/Thumb2/thumb2-cmn.ll
index efa1505..0f361d7 100644
--- a/test/CodeGen/Thumb2/thumb2-cmn.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmn.ll
@@ -79,7 +79,7 @@ define void @f9(i32 %a, i32 %b) nounwind optsize {
   ret void
 }
 
-!0 = metadata !{i32 81}
+!0 = !{i32 81}
 
 ; CHECK-LABEL: f9:
 ; CHECK: 	cmn.w	r0, r1
diff --git a/test/CodeGen/Thumb2/thumb2-spill-q.ll b/test/CodeGen/Thumb2/thumb2-spill-q.ll
index 94f4725..d1deb46 100644
--- a/test/CodeGen/Thumb2/thumb2-spill-q.ll
+++ b/test/CodeGen/Thumb2/thumb2-spill-q.ll
@@ -11,7 +11,7 @@ declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
 
 define void @aaa(%quuz* %this, i8* %block) {
 ; CHECK-LABEL: aaa:
-; CHECK: bic r4, r4, #15
+; CHECK: bfc r4, #0, #4
 ; CHECK: vst1.64 {{.*}}[{{.*}}:128]
 ; CHECK: vld1.64 {{.*}}[{{.*}}:128]
 entry:
diff --git a/test/CodeGen/X86/2006-05-22-FPSetEQ.ll b/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
index 6c5a4fb..3be77f5 100644
--- a/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
+++ b/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
@@ -1,7 +1,10 @@
-; RUN: llc < %s -march=x86 -mattr=-sse | grep setnp
-; RUN: llc < %s -march=x86 -mattr=-sse -enable-unsafe-fp-math -enable-no-nans-fp-math | \
-; RUN:   not grep setnp
+; RUN: llc < %s -march=x86 -mattr=-sse | FileCheck %s -check-prefix=WITHNANS
+; RUN: llc < %s -march=x86 -mattr=-sse -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s -check-prefix=NONANS
 
+; WITHNANS-LABEL: test:
+; WITHNANS: setnp
+; NONANS-LABEL: test:
+; NONANS-NOT: setnp
 define i32 @test(float %f) {
 	%tmp = fcmp oeq float %f, 0.000000e+00		; <i1> [#uses=1]
 	%tmp.upgrd.1 = zext i1 %tmp to i32		; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2006-10-07-ScalarSSEMiscompile.ll b/test/CodeGen/X86/2006-10-07-ScalarSSEMiscompile.ll
deleted file mode 100644
index d09d061..0000000
--- a/test/CodeGen/X86/2006-10-07-ScalarSSEMiscompile.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=sse | grep movaps
-; Test that the load is NOT folded into the intrinsic, which would zero the top
-; elts of the loaded vector.
-
-target datalayout = "e-p:32:32"
-target triple = "i686-apple-darwin8.7.2"
-
-define <4 x float> @test(<4 x float> %A, <4 x float>* %B) nounwind {
-        %BV = load <4 x float>* %B              ; <<4 x float>> [#uses=1]
-        %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %A, <4 x float> %BV )       ; <<4 x float>> [#uses=1]
-        ret <4 x float> %tmp28
-}
-
-declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
-
diff --git a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll b/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll
deleted file mode 100644
index 11c0bf9..0000000
--- a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll
+++ /dev/null
@@ -1,64 +0,0 @@
-; RUN: llc < %s -o - -march=x86 -mattr=+mmx | FileCheck %s
-; There are no MMX instructions here.  We use add+adcl for the adds.
-
-define <1 x i64> @unsigned_add3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind {
-entry:
-	%tmp2942 = icmp eq i32 %count, 0		; <i1> [#uses=1]
-	br i1 %tmp2942, label %bb31, label %bb26
-
-bb26:		; preds = %bb26, %entry
-
-; CHECK:  addl
-; CHECK:  adcl
-
-	%i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ]		; <i32> [#uses=3]
-	%sum.035.0 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ]		; <<1 x i64>> [#uses=1]
-	%tmp13 = getelementptr <1 x i64>* %b, i32 %i.037.0		; <<1 x i64>*> [#uses=1]
-	%tmp14 = load <1 x i64>* %tmp13		; <<1 x i64>> [#uses=1]
-	%tmp18 = getelementptr <1 x i64>* %a, i32 %i.037.0		; <<1 x i64>*> [#uses=1]
-	%tmp19 = load <1 x i64>* %tmp18		; <<1 x i64>> [#uses=1]
-	%tmp21 = add <1 x i64> %tmp19, %tmp14		; <<1 x i64>> [#uses=1]
-	%tmp22 = add <1 x i64> %tmp21, %sum.035.0		; <<1 x i64>> [#uses=2]
-	%tmp25 = add i32 %i.037.0, 1		; <i32> [#uses=2]
-	%tmp29 = icmp ult i32 %tmp25, %count		; <i1> [#uses=1]
-	br i1 %tmp29, label %bb26, label %bb31
-
-bb31:		; preds = %bb26, %entry
-	%sum.035.1 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ]		; <<1 x i64>> [#uses=1]
-	ret <1 x i64> %sum.035.1
-}
-
-
-; This is the original test converted to use MMX intrinsics.
-
-define <1 x i64> @unsigned_add3a(x86_mmx* %a, x86_mmx* %b, i32 %count) nounwind {
-entry:
-        %tmp2943 = bitcast <1 x i64><i64 0> to x86_mmx
-	%tmp2942 = icmp eq i32 %count, 0		; <i1> [#uses=1]
-	br i1 %tmp2942, label %bb31, label %bb26
-
-bb26:		; preds = %bb26, %entry
-
-; CHECK:  movq	({{.*}},8), %mm
-; CHECK:  paddq	({{.*}},8), %mm
-; CHECK:  paddq	%mm{{[0-7]}}, %mm
-
-	%i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ]		; <i32> [#uses=3]
-	%sum.035.0 = phi x86_mmx [ %tmp2943, %entry ], [ %tmp22, %bb26 ]		; <x86_mmx> [#uses=1]
-	%tmp13 = getelementptr x86_mmx* %b, i32 %i.037.0		; <x86_mmx*> [#uses=1]
-	%tmp14 = load x86_mmx* %tmp13		; <x86_mmx> [#uses=1]
-	%tmp18 = getelementptr x86_mmx* %a, i32 %i.037.0		; <x86_mmx*> [#uses=1]
-	%tmp19 = load x86_mmx* %tmp18		; <x86_mmx> [#uses=1]
-	%tmp21 = call x86_mmx @llvm.x86.mmx.padd.q (x86_mmx %tmp19, x86_mmx %tmp14)		; <x86_mmx> [#uses=1]
-	%tmp22 = call x86_mmx @llvm.x86.mmx.padd.q (x86_mmx %tmp21, x86_mmx %sum.035.0)		; <x86_mmx> [#uses=2]
-	%tmp25 = add i32 %i.037.0, 1		; <i32> [#uses=2]
-	%tmp29 = icmp ult i32 %tmp25, %count		; <i1> [#uses=1]
-	br i1 %tmp29, label %bb26, label %bb31
-
-bb31:		; preds = %bb26, %entry
-	%sum.035.1 = phi x86_mmx [ %tmp2943, %entry ], [ %tmp22, %bb26 ]		; <x86_mmx> [#uses=1]
-        %t = bitcast x86_mmx %sum.035.1 to <1 x i64>
-	ret <1 x i64> %t
-}
-
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
diff --git a/test/CodeGen/X86/2007-06-15-IntToMMX.ll b/test/CodeGen/X86/2007-06-15-IntToMMX.ll
deleted file mode 100644
index 5612d9e..0000000
--- a/test/CodeGen/X86/2007-06-15-IntToMMX.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mattr=+mmx | FileCheck %s
-
-; CHECK: paddusw
-
-@R = external global x86_mmx          ; <x86_mmx*> [#uses=1]
-
-define void @foo(<1 x i64> %A, <1 x i64> %B) {
-entry:
-        %tmp2 = bitcast <1 x i64> %A to x86_mmx
-        %tmp3 = bitcast <1 x i64> %B to x86_mmx
-        %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp2, x86_mmx %tmp3 )   ; <x86_mmx> [#uses=1]
-        store x86_mmx %tmp7, x86_mmx* @R
-        tail call void @llvm.x86.mmx.emms( )
-        ret void
-}
-
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
-
-declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/2008-10-06-MMXISelBug.ll b/test/CodeGen/X86/2008-10-06-MMXISelBug.ll
deleted file mode 100644
index 7f7b1a4..0000000
--- a/test/CodeGen/X86/2008-10-06-MMXISelBug.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2
-; PR2850
-
-@tmp_V2i = common global <2 x i32> zeroinitializer		; <<2 x i32>*> [#uses=2]
-
-define void @f0() nounwind {
-entry:
-	%0 = load <2 x i32>* @tmp_V2i, align 8		; <<2 x i32>> [#uses=1]
-	%1 = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer		; <<2 x i32>> [#uses=1]
-	store <2 x i32> %1, <2 x i32>* @tmp_V2i, align 8
-	ret void
-}
diff --git a/test/CodeGen/X86/2009-01-25-NoSSE.ll b/test/CodeGen/X86/2009-01-25-NoSSE.ll
index 8406c4a..c655f2c 100644
--- a/test/CodeGen/X86/2009-01-25-NoSSE.ll
+++ b/test/CodeGen/X86/2009-01-25-NoSSE.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=-sse,-sse2 | not grep xmm
+; RUN: llc < %s -march=x86-64 -mattr=-sse,-sse2 | FileCheck %s
 ; PR3402
 target datalayout =
 "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
@@ -6,6 +6,8 @@ target triple = "x86_64-unknown-linux-gnu"
 
 %struct.ktermios = type { i32, i32, i32, i32, i8, [19 x i8], i32, i32 }
 
+; CHECK-NOT: xmm
+; CHECK-NOT: ymm
 define void @foo() nounwind {
 entry:
   %termios = alloca %struct.ktermios, align 8
diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index 207d122..e6202f9 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -1,9 +1,19 @@
 ; RUN: llc < %s
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s
 ; PR3538
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin9"
 define signext i8 @foo(i8* %s1) nounwind ssp {
+
+; Make sure we generate:
+;  movq	-40(%rbp), %rsp
+; Instead of:
+;  movq	-40(%rbp), %rax
+;  movq	%rax, %rsp
+
+; CHECK-LABEL: @foo
+; CHECK: movq	-40(%rbp), %rsp
+
 entry:
   %s1_addr = alloca i8*                           ; <i8**> [#uses=2]
   %retval = alloca i32                            ; <i32*> [#uses=2]
@@ -14,9 +24,9 @@ entry:
   %2 = alloca i64                                 ; <i64*> [#uses=1]
   %3 = alloca i64                                 ; <i64*> [#uses=6]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0, metadata !{metadata !"0x102"}), !dbg !7
+  call void @llvm.dbg.declare(metadata i8** %s1_addr, metadata !0, metadata !{!"0x102"}), !dbg !7
   store i8* %s1, i8** %s1_addr
-  call void @llvm.dbg.declare(metadata !{[0 x i8]** %str.0}, metadata !8, metadata !{metadata !"0x102"}), !dbg !7
+  call void @llvm.dbg.declare(metadata [0 x i8]** %str.0, metadata !8, metadata !{!"0x102"}), !dbg !7
   %4 = call i8* @llvm.stacksave(), !dbg !7        ; <i8*> [#uses=1]
   store i8* %4, i8** %saved_stack.1, align 8, !dbg !7
   %5 = load i8** %s1_addr, align 8, !dbg !13      ; <i8*> [#uses=1]
@@ -66,22 +76,22 @@ declare i64 @strlen(i8*) nounwind readonly
 
 declare void @llvm.stackrestore(i8*) nounwind
 
-!0 = metadata !{metadata !"0x101\00s1\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\000", i32 0, metadata !2, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !17, metadata !18, metadata !18, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5, metadata !6}
-!5 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !5} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 2, i32 0, metadata !1, null}
-!8 = metadata !{metadata !"0x100\00str.0\003\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", null, metadata !2, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{metadata !"0x1\00\000\008\008\000\000", null, metadata !2, metadata !5, metadata !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0x21\000\001"}        ; [ DW_TAG_subrange_type ]
-!13 = metadata !{i32 3, i32 0, metadata !14, null}
-!14 = metadata !{metadata !"0xb\000\000\000", metadata !17, metadata !1} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{i32 4, i32 0, metadata !14, null}
-!16 = metadata !{i32 5, i32 0, metadata !14, null}
-!17 = metadata !{metadata !"vla.c", metadata !"/tmp/"}
-!18 = metadata !{i32 0}
+!0 = !{!"0x101\00s1\002\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\000", i32 0, !2, !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !17, !18, !18, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", null, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5, !6}
+!5 = !{!"0x24\00char\000\008\008\000\000\006", null, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0xf\00\000\0064\0064\000\000", null, !2, !5} ; [ DW_TAG_pointer_type ]
+!7 = !MDLocation(line: 2, scope: !1)
+!8 = !{!"0x100\00str.0\003\000", !1, !2, !9} ; [ DW_TAG_auto_variable ]
+!9 = !{!"0xf\00\000\0064\0064\000\0064", null, !2, !10} ; [ DW_TAG_pointer_type ]
+!10 = !{!"0x1\00\000\008\008\000\000", null, !2, !5, !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!11 = !{!12}
+!12 = !{!"0x21\000\001"}        ; [ DW_TAG_subrange_type ]
+!13 = !MDLocation(line: 3, scope: !14)
+!14 = !{!"0xb\000\000\000", !17, !1} ; [ DW_TAG_lexical_block ]
+!15 = !MDLocation(line: 4, scope: !14)
+!16 = !MDLocation(line: 5, scope: !14)
+!17 = !{!"vla.c", !"/tmp/"}
+!18 = !{i32 0}
diff --git a/test/CodeGen/X86/2009-06-05-ScalarToVectorByteMMX.ll b/test/CodeGen/X86/2009-06-05-ScalarToVectorByteMMX.ll
deleted file mode 100644
index 3061dc2..0000000
--- a/test/CodeGen/X86/2009-06-05-ScalarToVectorByteMMX.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+mmx,+sse2 | FileCheck %s
-
-; CHECK-NOT: movl
-
-define <8 x i8> @a(i8 zeroext %x) nounwind {
-  %r = insertelement <8 x i8> undef, i8 %x, i32 0
-  ret <8 x i8> %r
-}
-
diff --git a/test/CodeGen/X86/2009-06-07-ExpandMMXBitcast.ll b/test/CodeGen/X86/2009-06-07-ExpandMMXBitcast.ll
deleted file mode 100644
index 66caedf..0000000
--- a/test/CodeGen/X86/2009-06-07-ExpandMMXBitcast.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 -mattr=+mmx | grep movd | count 2
-
-define i64 @a(i32 %a, i32 %b) nounwind readnone {
-entry:
-	%0 = insertelement <2 x i32> undef, i32 %a, i32 0		; <<2 x i32>> [#uses=1]
-	%1 = insertelement <2 x i32> %0, i32 %b, i32 1		; <<2 x i32>> [#uses=1]
-	%conv = bitcast <2 x i32> %1 to i64		; <i64> [#uses=1]
-	ret i64 %conv
-}
-
diff --git a/test/CodeGen/X86/2009-06-18-movlp-shuffle-register.ll b/test/CodeGen/X86/2009-06-18-movlp-shuffle-register.ll
index 8ea70b4..4c4552d 100644
--- a/test/CodeGen/X86/2009-06-18-movlp-shuffle-register.ll
+++ b/test/CodeGen/X86/2009-06-18-movlp-shuffle-register.ll
@@ -3,7 +3,7 @@
 
 define <4 x float> @f4523(<4 x float> %a,<4 x float> %b) nounwind {
 entry:
-; CHECK: shufps $-28, %xmm
+; CHECK: shufps $228, %xmm
 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4,i32
 5,i32 2,i32 3>
 ret <4 x float> %shuffle
diff --git a/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll b/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll
deleted file mode 100644
index 288eef4..0000000
--- a/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -march=x86-64
-; PR4669
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
-
-define <1 x i64> @test(i64 %t) {
-entry:
-	%t1 = insertelement <1 x i64> undef, i64 %t, i32 0
-        %t0 = bitcast <1 x i64> %t1 to x86_mmx
-	%t2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %t0, i32 48)
-        %t3 = bitcast x86_mmx %t2 to <1 x i64>
-	ret <1 x i64> %t3
-}
diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll
index 6fe2ee4..e75d594 100644
--- a/test/CodeGen/X86/2009-10-16-Scope.ll
+++ b/test/CodeGen/X86/2009-10-16-Scope.ll
@@ -9,7 +9,7 @@ entry:
   br label %do.body, !dbg !0
 
 do.body:                                          ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.declare(metadata i32* %count_, metadata !4, metadata !{!"0x102"})
   %conv = ptrtoint i32* %count_ to i32, !dbg !0   ; <i32> [#uses=1]
   %call = call i32 @foo(i32 %conv) ssp, !dbg !0   ; <i32> [#uses=0]
   br label %do.end, !dbg !0
@@ -22,13 +22,13 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i32 @foo(i32) ssp
 
-!0 = metadata !{i32 5, i32 2, metadata !1, null}
-!1 = metadata !{metadata !"0xb\001\001\000", null, metadata !2}; [DW_TAG_lexical_block ]
-!2 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", i32 0, metadata !3, null, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!3 = metadata !{metadata !"0x11\0012\00clang 1.1\001\00\000\00\000", metadata !8, null, metadata !9, null, null, null}; [DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x100\00count_\005\000", metadata !5, metadata !3, metadata !6}; [ DW_TAG_auto_variable ]
-!5 = metadata !{metadata !"0xb\001\001\000", null, metadata !1}; [DW_TAG_lexical_block ]
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3}; [DW_TAG_base_type ]
-!7 = metadata !{i32 6, i32 1, metadata !2, null}
-!8 = metadata !{metadata !"genmodes.i", metadata !"/Users/yash/Downloads"}
-!9 = metadata !{i32 0}
+!0 = !MDLocation(line: 5, column: 2, scope: !1)
+!1 = !{!"0xb\001\001\000", null, !2}; [DW_TAG_lexical_block ]
+!2 = !{!"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", i32 0, !3, null, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = !{!"0x11\0012\00clang 1.1\001\00\000\00\000", !8, null, !9, null, null, null}; [DW_TAG_compile_unit ]
+!4 = !{!"0x100\00count_\005\000", !5, !3, !6}; [ DW_TAG_auto_variable ]
+!5 = !{!"0xb\001\001\000", null, !1}; [DW_TAG_lexical_block ]
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !3}; [DW_TAG_base_type ]
+!7 = !MDLocation(line: 6, column: 1, scope: !2)
+!8 = !{!"genmodes.i", !"/Users/yash/Downloads"}
+!9 = !{i32 0}
diff --git a/test/CodeGen/X86/2010-01-07-UAMemFeature.ll b/test/CodeGen/X86/2010-01-07-UAMemFeature.ll
deleted file mode 100644
index bb24adb..0000000
--- a/test/CodeGen/X86/2010-01-07-UAMemFeature.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc -mcpu=yonah -mattr=vector-unaligned-mem -march=x86 < %s | FileCheck %s
-; CHECK: addps (
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define <4 x float> @foo(<4 x float>* %P, <4 x float> %In) nounwind {
-	%A = load <4 x float>* %P, align 4
-	%B = fadd <4 x float> %A, %In
-	ret <4 x float> %B
-}
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index 0e2ed9d..b21846d 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -12,7 +12,7 @@ entry:
   %retval = alloca double                         ; <double*> [#uses=2]
   %0 = alloca double                              ; <double*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0, metadata !{metadata !"0x102"}), !dbg !15
+  call void @llvm.dbg.declare(metadata %struct.Rect* %my_r0, metadata !0, metadata !{!"0x102"}), !dbg !15
   %1 = getelementptr inbounds %struct.Rect* %my_r0, i32 0, i32 0, !dbg !16 ; <%struct.Pt*> [#uses=1]
   %2 = getelementptr inbounds %struct.Pt* %1, i32 0, i32 0, !dbg !16 ; <double*> [#uses=1]
   %3 = load double* %2, align 8, !dbg !16         ; <double> [#uses=1]
@@ -31,25 +31,25 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{metadata !"0x101\00my_r0\0011\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0011\000\001\000\006\000\000\0011", metadata !19, metadata !2, metadata !4, null, double (%struct.Rect*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !19, metadata !20, metadata !20, metadata !18, null, null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !19, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6, metadata !7}
-!6 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !19, metadata !2} ; [ DW_TAG_base_type ]
-!7 = metadata !{metadata !"0x13\00Rect\006\00256\0064\000\000\000", metadata !19, metadata !2, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
-!8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{metadata !"0xd\00P1\007\00128\0064\000\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{metadata !"0x13\00Pt\001\00128\0064\000\000\000", metadata !19, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
-!11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{metadata !"0xd\00x\002\0064\0064\000\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
-!13 = metadata !{metadata !"0xd\00y\003\0064\0064\0064\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
-!14 = metadata !{metadata !"0xd\00P2\008\00128\0064\00128\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
-!15 = metadata !{i32 11, i32 0, metadata !1, null}
-!16 = metadata !{i32 12, i32 0, metadata !17, null}
-!17 = metadata !{metadata !"0xb\0011\000\000", metadata !19, metadata !1} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{metadata !1}
-!19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
-!20 = metadata !{i32 0}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00my_r0\0011\000", !1, !2, !7} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00foo\00foo\00foo\0011\000\001\000\006\000\000\0011", !19, !2, !4, null, double (%struct.Rect*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !19} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", !19, !20, !20, !18, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !19, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6, !7}
+!6 = !{!"0x24\00double\000\0064\0064\000\000\004", !19, !2} ; [ DW_TAG_base_type ]
+!7 = !{!"0x13\00Rect\006\00256\0064\000\000\000", !19, !2, null, !8, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
+!8 = !{!9, !14}
+!9 = !{!"0xd\00P1\007\00128\0064\000\000", !19, !7, !10} ; [ DW_TAG_member ]
+!10 = !{!"0x13\00Pt\001\00128\0064\000\000\000", !19, !2, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
+!11 = !{!12, !13}
+!12 = !{!"0xd\00x\002\0064\0064\000\000", !19, !10, !6} ; [ DW_TAG_member ]
+!13 = !{!"0xd\00y\003\0064\0064\0064\000", !19, !10, !6} ; [ DW_TAG_member ]
+!14 = !{!"0xd\00P2\008\00128\0064\00128\000", !19, !7, !10} ; [ DW_TAG_member ]
+!15 = !MDLocation(line: 11, scope: !1)
+!16 = !MDLocation(line: 12, scope: !17)
+!17 = !{!"0xb\0011\000\000", !19, !1} ; [ DW_TAG_lexical_block ]
+!18 = !{!1}
+!19 = !{!"b2.c", !"/tmp/"}
+!20 = !{i32 0}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
index a35efdc..b85f1af 100644
--- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
+++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
@@ -8,7 +8,7 @@
 
 define i32 @"main(tart.core.String[])->int32"(i32 %args) {
 entry:
-  tail call void @llvm.dbg.value(metadata !14, i64 0, metadata !8, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata %tart.reflect.ComplexType* @.type.SwitchStmtTest, i64 0, metadata !8, metadata !{!"0x102"})
   tail call void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType* @.type.SwitchStmtTest) ; <%tart.core.Object*> [#uses=2]
   ret i32 3
 }
@@ -16,20 +16,20 @@ entry:
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType*) nounwind readnone
 
-!0 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !15, metadata !16, metadata !16, null, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x26\00\000\00192\0064\000\000", metadata !15, metadata !0, metadata !2} ; [ DW_TAG_const_type ]
-!2 = metadata !{metadata !"0x13\00C\001\00192\0064\000\000\000", metadata !15, metadata !0, null, metadata !3, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
-!3 = metadata !{metadata !4, metadata !6, metadata !7}
-!4 = metadata !{metadata !"0xd\00x\001\0064\0064\000\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
-!5 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !15, metadata !0} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0xd\00y\001\0064\0064\0064\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
-!7 = metadata !{metadata !"0xd\00z\001\0064\0064\00128\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
-!8 = metadata !{metadata !"0x100\00t\005\000", metadata !9, metadata !0, metadata !2} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{metadata !"0xb\000\000\000", null, metadata !10}        ; [ DW_TAG_lexical_block ]
-!10 = metadata !{metadata !"0x2e\00foo\00foo\00foo\004\000\001\000\006\000\000\000", i32 0, metadata !0, metadata !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !15, metadata !0, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !15, metadata !0} ; [ DW_TAG_base_type ]
-!14 = metadata !{%tart.reflect.ComplexType* @.type.SwitchStmtTest}
-!15 = metadata !{metadata !"sm.c", metadata !""}
-!16 = metadata !{i32 0}
+!0 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !15, !16, !16, null, null, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x26\00\000\00192\0064\000\000", !15, !0, !2} ; [ DW_TAG_const_type ]
+!2 = !{!"0x13\00C\001\00192\0064\000\000\000", !15, !0, null, !3, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
+!3 = !{!4, !6, !7}
+!4 = !{!"0xd\00x\001\0064\0064\000\000", !15, !2, !5} ; [ DW_TAG_member ]
+!5 = !{!"0x24\00double\000\0064\0064\000\000\004", !15, !0} ; [ DW_TAG_base_type ]
+!6 = !{!"0xd\00y\001\0064\0064\0064\000", !15, !2, !5} ; [ DW_TAG_member ]
+!7 = !{!"0xd\00z\001\0064\0064\00128\000", !15, !2, !5} ; [ DW_TAG_member ]
+!8 = !{!"0x100\00t\005\000", !9, !0, !2} ; [ DW_TAG_auto_variable ]
+!9 = !{!"0xb\000\000\000", null, !10}        ; [ DW_TAG_lexical_block ]
+!10 = !{!"0x2e\00foo\00foo\00foo\004\000\001\000\006\000\000\000", i32 0, !0, !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = !{!"0x15\00\000\000\000\000\000\000", !15, !0, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!13}
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", !15, !0} ; [ DW_TAG_base_type ]
+!14 = !{%tart.reflect.ComplexType* @.type.SwitchStmtTest}
+!15 = !{!"sm.c", !""}
+!16 = !{i32 0}
diff --git a/test/CodeGen/X86/2010-02-11-NonTemporal.ll b/test/CodeGen/X86/2010-02-11-NonTemporal.ll
index 5789a0b..f9cca8c 100644
--- a/test/CodeGen/X86/2010-02-11-NonTemporal.ll
+++ b/test/CodeGen/X86/2010-02-11-NonTemporal.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
 
-!0 = metadata !{ i32 1 }
+!0 = !{ i32 1 }
 
 define void @sub_(i32* noalias %n) {
 "file movnt.f90, line 2, bb1":
diff --git a/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll b/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll
index 060c535..2c6d113 100644
--- a/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll
+++ b/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll
@@ -15,30 +15,30 @@
 ; Move return address from temporary register (%ebp) to new stack location (60(%esp))
 ; CHECK: movl [[REGISTER]], 60(%esp)
 
-%tupl_p = type [9 x i32]*
+%tupl = type [9 x i32]
 
 declare fastcc void @l297(i32 %r10, i32 %r9, i32 %r8, i32 %r7, i32 %r6, i32 %r5, i32 %r3, i32 %r2) noreturn nounwind
 declare fastcc void @l298(i32 %r10, i32 %r9, i32 %r4) noreturn nounwind
 
-define fastcc void @l186(%tupl_p %r1) noreturn nounwind {
+define fastcc void @l186(%tupl* %r1) noreturn nounwind {
 entry:
-  %ptr1 = getelementptr %tupl_p %r1, i32 0, i32 0
+  %ptr1 = getelementptr %tupl* %r1, i32 0, i32 0
   %r2 = load i32* %ptr1
-  %ptr3 = getelementptr %tupl_p %r1, i32 0, i32 1
+  %ptr3 = getelementptr %tupl* %r1, i32 0, i32 1
   %r3 = load i32* %ptr3
-  %ptr5 = getelementptr %tupl_p %r1, i32 0, i32 2
+  %ptr5 = getelementptr %tupl* %r1, i32 0, i32 2
   %r4 = load i32* %ptr5
-  %ptr7 = getelementptr %tupl_p %r1, i32 0, i32 3
+  %ptr7 = getelementptr %tupl* %r1, i32 0, i32 3
   %r5 = load i32* %ptr7
-  %ptr9 = getelementptr %tupl_p %r1, i32 0, i32 4
+  %ptr9 = getelementptr %tupl* %r1, i32 0, i32 4
   %r6 = load i32* %ptr9
-  %ptr11 = getelementptr %tupl_p %r1, i32 0, i32 5
+  %ptr11 = getelementptr %tupl* %r1, i32 0, i32 5
   %r7 = load i32* %ptr11
-  %ptr13 = getelementptr %tupl_p %r1, i32 0, i32 6
+  %ptr13 = getelementptr %tupl* %r1, i32 0, i32 6
   %r8 = load i32* %ptr13
-  %ptr15 = getelementptr %tupl_p %r1, i32 0, i32 7
+  %ptr15 = getelementptr %tupl* %r1, i32 0, i32 7
   %r9 = load i32* %ptr15
-  %ptr17 = getelementptr %tupl_p %r1, i32 0, i32 8
+  %ptr17 = getelementptr %tupl* %r1, i32 0, i32 8
   %r10 = load i32* %ptr17
   %cond = icmp eq i32 %r10, 3
   br i1 %cond, label %true, label %false
diff --git a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
deleted file mode 100644
index 60025bf..0000000
--- a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
+++ /dev/null
@@ -1,100 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | FileCheck %s
-; There are no MMX operations here, so we use XMM or i64.
-
-; CHECK: ti8
-define void @ti8(double %a, double %b) nounwind {
-entry:
-        %tmp1 = bitcast double %a to <8 x i8>
-        %tmp2 = bitcast double %b to <8 x i8>
-        %tmp3 = add <8 x i8> %tmp1, %tmp2
-; CHECK:  paddb
-        store <8 x i8> %tmp3, <8 x i8>* null
-        ret void
-}
-
-; CHECK: ti16
-define void @ti16(double %a, double %b) nounwind {
-entry:
-        %tmp1 = bitcast double %a to <4 x i16>
-        %tmp2 = bitcast double %b to <4 x i16>
-        %tmp3 = add <4 x i16> %tmp1, %tmp2
-; CHECK:  paddw
-        store <4 x i16> %tmp3, <4 x i16>* null
-        ret void
-}
-
-; CHECK: ti32
-define void @ti32(double %a, double %b) nounwind {
-entry:
-        %tmp1 = bitcast double %a to <2 x i32>
-        %tmp2 = bitcast double %b to <2 x i32>
-        %tmp3 = add <2 x i32> %tmp1, %tmp2
-; CHECK:  paddd
-        store <2 x i32> %tmp3, <2 x i32>* null
-        ret void
-}
-
-; CHECK: ti64
-define void @ti64(double %a, double %b) nounwind {
-entry:
-        %tmp1 = bitcast double %a to <1 x i64>
-        %tmp2 = bitcast double %b to <1 x i64>
-        %tmp3 = add <1 x i64> %tmp1, %tmp2
-; CHECK:  addq
-        store <1 x i64> %tmp3, <1 x i64>* null
-        ret void
-}
-
-; MMX intrinsics calls get us MMX instructions.
-; CHECK: ti8a
-define void @ti8a(double %a, double %b) nounwind {
-entry:
-        %tmp1 = bitcast double %a to x86_mmx
-; CHECK: movdq2q
-        %tmp2 = bitcast double %b to x86_mmx
-; CHECK: movdq2q
-        %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2)
-        store x86_mmx %tmp3, x86_mmx* null
-        ret void
-}
-
-; CHECK: ti16a
-define void @ti16a(double %a, double %b) nounwind {
-entry:
-        %tmp1 = bitcast double %a to x86_mmx
-; CHECK: movdq2q
-        %tmp2 = bitcast double %b to x86_mmx
-; CHECK: movdq2q
-        %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2)
-        store x86_mmx %tmp3, x86_mmx* null
-        ret void
-}
-
-; CHECK: ti32a
-define void @ti32a(double %a, double %b) nounwind {
-entry:
-        %tmp1 = bitcast double %a to x86_mmx
-; CHECK: movdq2q
-        %tmp2 = bitcast double %b to x86_mmx
-; CHECK: movdq2q
-        %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2)
-        store x86_mmx %tmp3, x86_mmx* null
-        ret void
-}
-
-; CHECK: ti64a
-define void @ti64a(double %a, double %b) nounwind {
-entry:
-        %tmp1 = bitcast double %a to x86_mmx
-; CHECK: movdq2q
-        %tmp2 = bitcast double %b to x86_mmx
-; CHECK: movdq2q
-        %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2)
-        store x86_mmx %tmp3, x86_mmx* null
-        ret void
-}
- 
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
diff --git a/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll b/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
index fc8c895..86be390 100644
--- a/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
+++ b/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
@@ -29,4 +29,4 @@ entry:
   ret i8* %1
 }
 
-!0 = metadata !{i32 79}
+!0 = !{i32 79}
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index 1998011..0d30a3f 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -10,10 +10,10 @@
 
 define hidden %0 @__divsc3(float %a, float %b, float %c, float %d) nounwind readnone {
 entry:
-  tail call void @llvm.dbg.value(metadata !{float %a}, i64 0, metadata !0, metadata !{metadata !"0x102"})
-  tail call void @llvm.dbg.value(metadata !{float %b}, i64 0, metadata !11, metadata !{metadata !"0x102"})
-  tail call void @llvm.dbg.value(metadata !{float %c}, i64 0, metadata !12, metadata !{metadata !"0x102"})
-  tail call void @llvm.dbg.value(metadata !{float %d}, i64 0, metadata !13, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata float %a, i64 0, metadata !0, metadata !{!"0x102"})
+  tail call void @llvm.dbg.value(metadata float %b, i64 0, metadata !11, metadata !{!"0x102"})
+  tail call void @llvm.dbg.value(metadata float %c, i64 0, metadata !12, metadata !{!"0x102"})
+  tail call void @llvm.dbg.value(metadata float %d, i64 0, metadata !13, metadata !{!"0x102"})
   %0 = tail call float @fabsf(float %c) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %1 = tail call float @fabsf(float %d) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %2 = fcmp olt float %0, %1, !dbg !19            ; <i1> [#uses=1]
@@ -21,34 +21,34 @@ entry:
 
 bb:                                               ; preds = %entry
   %3 = fdiv float %c, %d, !dbg !20                ; <float> [#uses=3]
-  tail call void @llvm.dbg.value(metadata !{float %3}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !20
+  tail call void @llvm.dbg.value(metadata float %3, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !20
   %4 = fmul float %3, %c, !dbg !21                ; <float> [#uses=1]
   %5 = fadd float %4, %d, !dbg !21                ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %5}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata float %5, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !21
   %6 = fmul float %3, %a, !dbg !22                ; <float> [#uses=1]
   %7 = fadd float %6, %b, !dbg !22                ; <float> [#uses=1]
   %8 = fdiv float %7, %5, !dbg !22                ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %8}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !22
+  tail call void @llvm.dbg.value(metadata float %8, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !22
   %9 = fmul float %3, %b, !dbg !23                ; <float> [#uses=1]
   %10 = fsub float %9, %a, !dbg !23               ; <float> [#uses=1]
   %11 = fdiv float %10, %5, !dbg !23              ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %11}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !23
+  tail call void @llvm.dbg.value(metadata float %11, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !23
   br label %bb2, !dbg !23
 
 bb1:                                              ; preds = %entry
   %12 = fdiv float %d, %c, !dbg !24               ; <float> [#uses=3]
-  tail call void @llvm.dbg.value(metadata !{float %12}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !24
+  tail call void @llvm.dbg.value(metadata float %12, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !24
   %13 = fmul float %12, %d, !dbg !25              ; <float> [#uses=1]
   %14 = fadd float %13, %c, !dbg !25              ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %14}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !25
+  tail call void @llvm.dbg.value(metadata float %14, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !25
   %15 = fmul float %12, %b, !dbg !26              ; <float> [#uses=1]
   %16 = fadd float %15, %a, !dbg !26              ; <float> [#uses=1]
   %17 = fdiv float %16, %14, !dbg !26             ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %17}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata float %17, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !26
   %18 = fmul float %12, %a, !dbg !27              ; <float> [#uses=1]
   %19 = fsub float %b, %18, !dbg !27              ; <float> [#uses=1]
   %20 = fdiv float %19, %14, !dbg !27             ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %20}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !27
+  tail call void @llvm.dbg.value(metadata float %20, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !27
   br label %bb2, !dbg !27
 
 bb2:                                              ; preds = %bb1, %bb
@@ -74,9 +74,9 @@ bb6:                                              ; preds = %bb4
 bb8:                                              ; preds = %bb6
   %27 = tail call float @copysignf(float 0x7FF0000000000000, float %c) nounwind readnone, !dbg !30 ; <float> [#uses=2]
   %28 = fmul float %27, %a, !dbg !30              ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %28}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata float %28, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !30
   %29 = fmul float %27, %b, !dbg !31              ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %29}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !31
+  tail call void @llvm.dbg.value(metadata float %29, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !31
   br label %bb46, !dbg !31
 
 bb9:                                              ; preds = %bb6, %bb4
@@ -106,24 +106,24 @@ bb15:                                             ; preds = %bb14
 bb16:                                             ; preds = %bb15
   %iftmp.0.0 = select i1 %33, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %42 = tail call float @copysignf(float %iftmp.0.0, float %a) nounwind readnone, !dbg !33 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %42}, i64 0, metadata !0, metadata !{metadata !"0x102"}), !dbg !33
+  tail call void @llvm.dbg.value(metadata float %42, i64 0, metadata !0, metadata !{!"0x102"}), !dbg !33
   %43 = fcmp ord float %b, 0.000000e+00           ; <i1> [#uses=1]
   %44 = fsub float %b, %b, !dbg !34               ; <float> [#uses=1]
   %45 = fcmp uno float %44, 0.000000e+00          ; <i1> [#uses=1]
   %46 = and i1 %43, %45, !dbg !34                 ; <i1> [#uses=1]
   %iftmp.1.0 = select i1 %46, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %47 = tail call float @copysignf(float %iftmp.1.0, float %b) nounwind readnone, !dbg !34 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %47}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !34
+  tail call void @llvm.dbg.value(metadata float %47, i64 0, metadata !11, metadata !{!"0x102"}), !dbg !34
   %48 = fmul float %42, %c, !dbg !35              ; <float> [#uses=1]
   %49 = fmul float %47, %d, !dbg !35              ; <float> [#uses=1]
   %50 = fadd float %48, %49, !dbg !35             ; <float> [#uses=1]
   %51 = fmul float %50, 0x7FF0000000000000, !dbg !35 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %51}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !35
+  tail call void @llvm.dbg.value(metadata float %51, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !35
   %52 = fmul float %47, %c, !dbg !36              ; <float> [#uses=1]
   %53 = fmul float %42, %d, !dbg !36              ; <float> [#uses=1]
   %54 = fsub float %52, %53, !dbg !36             ; <float> [#uses=1]
   %55 = fmul float %54, 0x7FF0000000000000, !dbg !36 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %55}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !36
+  tail call void @llvm.dbg.value(metadata float %55, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !36
   br label %bb46, !dbg !36
 
 bb27:                                             ; preds = %bb15, %bb14, %bb11
@@ -154,24 +154,24 @@ bb34:                                             ; preds = %bb33, %bb30
 bb35:                                             ; preds = %bb34
   %iftmp.2.0 = select i1 %59, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %67 = tail call float @copysignf(float %iftmp.2.0, float %c) nounwind readnone, !dbg !38 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %67}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !38
+  tail call void @llvm.dbg.value(metadata float %67, i64 0, metadata !12, metadata !{!"0x102"}), !dbg !38
   %68 = fcmp ord float %d, 0.000000e+00           ; <i1> [#uses=1]
   %69 = fsub float %d, %d, !dbg !39               ; <float> [#uses=1]
   %70 = fcmp uno float %69, 0.000000e+00          ; <i1> [#uses=1]
   %71 = and i1 %68, %70, !dbg !39                 ; <i1> [#uses=1]
   %iftmp.3.0 = select i1 %71, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %72 = tail call float @copysignf(float %iftmp.3.0, float %d) nounwind readnone, !dbg !39 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %72}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !39
+  tail call void @llvm.dbg.value(metadata float %72, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !39
   %73 = fmul float %67, %a, !dbg !40              ; <float> [#uses=1]
   %74 = fmul float %72, %b, !dbg !40              ; <float> [#uses=1]
   %75 = fadd float %73, %74, !dbg !40             ; <float> [#uses=1]
   %76 = fmul float %75, 0.000000e+00, !dbg !40    ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %76}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !40
+  tail call void @llvm.dbg.value(metadata float %76, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !40
   %77 = fmul float %67, %b, !dbg !41              ; <float> [#uses=1]
   %78 = fmul float %72, %a, !dbg !41              ; <float> [#uses=1]
   %79 = fsub float %77, %78, !dbg !41             ; <float> [#uses=1]
   %80 = fmul float %79, 0.000000e+00, !dbg !41    ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %80}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !41
+  tail call void @llvm.dbg.value(metadata float %80, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !41
   br label %bb46, !dbg !41
 
 bb46:                                             ; preds = %bb35, %bb34, %bb33, %bb30, %bb16, %bb8, %bb2
@@ -200,52 +200,52 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!48}
 
-!0 = metadata !{metadata !"0x101\00a\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00__divsc3\00__divsc3\00__divsc3\001922\000\001\000\006\000\001\001922", metadata !45, metadata !2, metadata !4, null, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !45} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !45, metadata !47, metadata !47, metadata !44, null,  null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !45, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6, metadata !9, metadata !9, metadata !9, metadata !9}
-!6 = metadata !{metadata !"0x16\00SCtype\00170\000\000\000\000", metadata !46, metadata !7, metadata !8} ; [ DW_TAG_typedef ]
-!7 = metadata !{metadata !"0x29", metadata !46} ; [ DW_TAG_file_type ]
-!8 = metadata !{metadata !"0x24\00complex float\000\0064\0032\000\000\003", metadata !45, metadata !2} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !"0x16\00SFtype\00167\000\000\000\000", metadata !46, metadata !7, metadata !10} ; [ DW_TAG_typedef ]
-!10 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", metadata !45, metadata !2} ; [ DW_TAG_base_type ]
-!11 = metadata !{metadata !"0x101\00b\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{metadata !"0x101\00c\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{metadata !"0x101\00d\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!14 = metadata !{metadata !"0x100\00denom\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{metadata !"0xb\001922\000\000", metadata !45, metadata !1} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{metadata !"0x100\00ratio\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
-!17 = metadata !{metadata !"0x100\00x\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{metadata !"0x100\00y\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
-!19 = metadata !{i32 1929, i32 0, metadata !15, null}
-!20 = metadata !{i32 1931, i32 0, metadata !15, null}
-!21 = metadata !{i32 1932, i32 0, metadata !15, null}
-!22 = metadata !{i32 1933, i32 0, metadata !15, null}
-!23 = metadata !{i32 1934, i32 0, metadata !15, null}
-!24 = metadata !{i32 1938, i32 0, metadata !15, null}
-!25 = metadata !{i32 1939, i32 0, metadata !15, null}
-!26 = metadata !{i32 1940, i32 0, metadata !15, null}
-!27 = metadata !{i32 1941, i32 0, metadata !15, null}
-!28 = metadata !{i32 1946, i32 0, metadata !15, null}
-!29 = metadata !{i32 1948, i32 0, metadata !15, null}
-!30 = metadata !{i32 1950, i32 0, metadata !15, null}
-!31 = metadata !{i32 1951, i32 0, metadata !15, null}
-!32 = metadata !{i32 1953, i32 0, metadata !15, null}
-!33 = metadata !{i32 1955, i32 0, metadata !15, null}
-!34 = metadata !{i32 1956, i32 0, metadata !15, null}
-!35 = metadata !{i32 1957, i32 0, metadata !15, null}
-!36 = metadata !{i32 1958, i32 0, metadata !15, null}
-!37 = metadata !{i32 1960, i32 0, metadata !15, null}
-!38 = metadata !{i32 1962, i32 0, metadata !15, null}
-!39 = metadata !{i32 1963, i32 0, metadata !15, null}
-!40 = metadata !{i32 1964, i32 0, metadata !15, null}
-!41 = metadata !{i32 1965, i32 0, metadata !15, null}
-!42 = metadata !{i32 1969, i32 0, metadata !15, null}
-!43 = metadata !{metadata !0, metadata !11, metadata !12, metadata !13, metadata !14, metadata !16, metadata !17, metadata !18}
-!44 = metadata !{metadata !1}
-!45 = metadata !{metadata !"libgcc2.c", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
-!46 = metadata !{metadata !"libgcc2.h", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
-!47 = metadata !{i32 0}
-!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00a\001921\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00__divsc3\00__divsc3\00__divsc3\001922\000\001\000\006\000\001\001922", !45, !2, !4, null, %0 (float, float, float, float)* @__divsc3, null, null, !43} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !45} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", !45, !47, !47, !44, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !45, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6, !9, !9, !9, !9}
+!6 = !{!"0x16\00SCtype\00170\000\000\000\000", !46, !7, !8} ; [ DW_TAG_typedef ]
+!7 = !{!"0x29", !46} ; [ DW_TAG_file_type ]
+!8 = !{!"0x24\00complex float\000\0064\0032\000\000\003", !45, !2} ; [ DW_TAG_base_type ]
+!9 = !{!"0x16\00SFtype\00167\000\000\000\000", !46, !7, !10} ; [ DW_TAG_typedef ]
+!10 = !{!"0x24\00float\000\0032\0032\000\000\004", !45, !2} ; [ DW_TAG_base_type ]
+!11 = !{!"0x101\00b\001921\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!12 = !{!"0x101\00c\001921\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!13 = !{!"0x101\00d\001921\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!14 = !{!"0x100\00denom\001923\000", !15, !2, !9} ; [ DW_TAG_auto_variable ]
+!15 = !{!"0xb\001922\000\000", !45, !1} ; [ DW_TAG_lexical_block ]
+!16 = !{!"0x100\00ratio\001923\000", !15, !2, !9} ; [ DW_TAG_auto_variable ]
+!17 = !{!"0x100\00x\001923\000", !15, !2, !9} ; [ DW_TAG_auto_variable ]
+!18 = !{!"0x100\00y\001923\000", !15, !2, !9} ; [ DW_TAG_auto_variable ]
+!19 = !MDLocation(line: 1929, scope: !15)
+!20 = !MDLocation(line: 1931, scope: !15)
+!21 = !MDLocation(line: 1932, scope: !15)
+!22 = !MDLocation(line: 1933, scope: !15)
+!23 = !MDLocation(line: 1934, scope: !15)
+!24 = !MDLocation(line: 1938, scope: !15)
+!25 = !MDLocation(line: 1939, scope: !15)
+!26 = !MDLocation(line: 1940, scope: !15)
+!27 = !MDLocation(line: 1941, scope: !15)
+!28 = !MDLocation(line: 1946, scope: !15)
+!29 = !MDLocation(line: 1948, scope: !15)
+!30 = !MDLocation(line: 1950, scope: !15)
+!31 = !MDLocation(line: 1951, scope: !15)
+!32 = !MDLocation(line: 1953, scope: !15)
+!33 = !MDLocation(line: 1955, scope: !15)
+!34 = !MDLocation(line: 1956, scope: !15)
+!35 = !MDLocation(line: 1957, scope: !15)
+!36 = !MDLocation(line: 1958, scope: !15)
+!37 = !MDLocation(line: 1960, scope: !15)
+!38 = !MDLocation(line: 1962, scope: !15)
+!39 = !MDLocation(line: 1963, scope: !15)
+!40 = !MDLocation(line: 1964, scope: !15)
+!41 = !MDLocation(line: 1965, scope: !15)
+!42 = !MDLocation(line: 1969, scope: !15)
+!43 = !{!0, !11, !12, !13, !14, !16, !17, !18}
+!44 = !{!1}
+!45 = !{!"libgcc2.c", !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
+!46 = !{!"libgcc2.h", !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
+!47 = !{i32 0}
+!48 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index 09120a1..9915a70 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-darwin10"
 
 define i8* @bar(%struct.a* %myvar) nounwind optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.a* %myvar}, i64 0, metadata !8, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata %struct.a* %myvar, i64 0, metadata !8, metadata !{!"0x102"})
   %0 = getelementptr inbounds %struct.a* %myvar, i64 0, i32 0, !dbg !28 ; <i32*> [#uses=1]
   %1 = load i32* %0, align 8, !dbg !28            ; <i32> [#uses=1]
   tail call void @foo(i32 %1) nounwind optsize noinline ssp, !dbg !28
@@ -24,44 +24,44 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!38}
 
-!0 = metadata !{metadata !"0x34\00ret\00ret\00\007\000\001", metadata !1, metadata !1, metadata !3, null, null} ; [ DW_TAG_variable ]
-!1 = metadata !{metadata !"0x29", metadata !36} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !36, metadata !37, metadata !37, metadata !32, metadata !31,  metadata !37} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !36, metadata !1} ; [ DW_TAG_base_type ]
-!4 = metadata !{metadata !"0x101\00x\0012\000", metadata !5, metadata !1, metadata !3} ; [ DW_TAG_arg_variable ]
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0013\000\001\000\006\000\001\0013", metadata !36, metadata !1, metadata !6, null, void (i32)* @foo, null, null, metadata !33} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !3}
-!8 = metadata !{metadata !"0x101\00myvar\0017\000", metadata !9, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{metadata !"0x2e\00bar\00bar\00bar\0017\000\001\000\006\000\001\0017", metadata !36, metadata !1, metadata !10, null, i8* (%struct.a*)* @bar, null, null, metadata !34} ; [ DW_TAG_subprogram ]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, null} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{metadata !"0x13\00a\002\00128\0064\000\000\000", metadata !36, metadata !1, null, metadata !15, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
-!15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{metadata !"0xd\00c\003\0032\0032\000\000", metadata !36, metadata !14, metadata !3} ; [ DW_TAG_member ]
-!17 = metadata !{metadata !"0xd\00d\004\0064\0064\0064\000", metadata !36, metadata !14, metadata !13} ; [ DW_TAG_member ]
-!18 = metadata !{metadata !"0x101\00argc\0022\000", metadata !19, metadata !1, metadata !3} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{metadata !"0x2e\00main\00main\00main\0022\000\001\000\006\000\001\0022", metadata !36, metadata !1, metadata !20, null, null, null, null, metadata !35} ; [ DW_TAG_subprogram ]
-!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!21 = metadata !{metadata !3, metadata !3, metadata !22}
-!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !24} ; [ DW_TAG_pointer_type ]
-!24 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !36, metadata !1} ; [ DW_TAG_base_type ]
-!25 = metadata !{metadata !"0x101\00argv\0022\000", metadata !19, metadata !1, metadata !22} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{metadata !"0x100\00e\0023\000", metadata !27, metadata !1, metadata !14} ; [ DW_TAG_auto_variable ]
-!27 = metadata !{metadata !"0xb\0022\000\000", metadata !36, metadata !19} ; [ DW_TAG_lexical_block ]
-!28 = metadata !{i32 18, i32 0, metadata !29, null}
-!29 = metadata !{metadata !"0xb\0017\000\001", metadata !36, metadata !9} ; [ DW_TAG_lexical_block ]
-!30 = metadata !{i32 19, i32 0, metadata !29, null}
-!31 = metadata !{metadata !0}
-!32 = metadata !{metadata !5, metadata !9, metadata !19}
-!33 = metadata !{metadata !4}
-!34 = metadata !{metadata !8}
-!35 = metadata !{metadata !18, metadata !25, metadata !26}
-!36 = metadata !{metadata !"foo.c", metadata !"/tmp/"}
-!37 = metadata !{}
+!0 = !{!"0x34\00ret\00ret\00\007\000\001", !1, !1, !3, null, null} ; [ DW_TAG_variable ]
+!1 = !{!"0x29", !36} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", !36, !37, !37, !32, !31,  !37} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x24\00int\000\0032\0032\000\000\005", !36, !1} ; [ DW_TAG_base_type ]
+!4 = !{!"0x101\00x\0012\000", !5, !1, !3} ; [ DW_TAG_arg_variable ]
+!5 = !{!"0x2e\00foo\00foo\00foo\0013\000\001\000\006\000\001\0013", !36, !1, !6, null, void (i32)* @foo, null, null, !33} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x15\00\000\000\000\000\000\000", !36, !1, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !3}
+!8 = !{!"0x101\00myvar\0017\000", !9, !1, !13} ; [ DW_TAG_arg_variable ]
+!9 = !{!"0x2e\00bar\00bar\00bar\0017\000\001\000\006\000\001\0017", !36, !1, !10, null, i8* (%struct.a*)* @bar, null, null, !34} ; [ DW_TAG_subprogram ]
+!10 = !{!"0x15\00\000\000\000\000\000\000", !36, !1, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{!12, !13}
+!12 = !{!"0xf\00\000\0064\0064\000\000", !36, !1, null} ; [ DW_TAG_pointer_type ]
+!13 = !{!"0xf\00\000\0064\0064\000\000", !36, !1, !14} ; [ DW_TAG_pointer_type ]
+!14 = !{!"0x13\00a\002\00128\0064\000\000\000", !36, !1, null, !15, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
+!15 = !{!16, !17}
+!16 = !{!"0xd\00c\003\0032\0032\000\000", !36, !14, !3} ; [ DW_TAG_member ]
+!17 = !{!"0xd\00d\004\0064\0064\0064\000", !36, !14, !13} ; [ DW_TAG_member ]
+!18 = !{!"0x101\00argc\0022\000", !19, !1, !3} ; [ DW_TAG_arg_variable ]
+!19 = !{!"0x2e\00main\00main\00main\0022\000\001\000\006\000\001\0022", !36, !1, !20, null, null, null, null, !35} ; [ DW_TAG_subprogram ]
+!20 = !{!"0x15\00\000\000\000\000\000\000", !36, !1, null, !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = !{!3, !3, !22}
+!22 = !{!"0xf\00\000\0064\0064\000\000", !36, !1, !23} ; [ DW_TAG_pointer_type ]
+!23 = !{!"0xf\00\000\0064\0064\000\000", !36, !1, !24} ; [ DW_TAG_pointer_type ]
+!24 = !{!"0x24\00char\000\008\008\000\000\006", !36, !1} ; [ DW_TAG_base_type ]
+!25 = !{!"0x101\00argv\0022\000", !19, !1, !22} ; [ DW_TAG_arg_variable ]
+!26 = !{!"0x100\00e\0023\000", !27, !1, !14} ; [ DW_TAG_auto_variable ]
+!27 = !{!"0xb\0022\000\000", !36, !19} ; [ DW_TAG_lexical_block ]
+!28 = !MDLocation(line: 18, scope: !29)
+!29 = !{!"0xb\0017\000\001", !36, !9} ; [ DW_TAG_lexical_block ]
+!30 = !MDLocation(line: 19, scope: !29)
+!31 = !{!0}
+!32 = !{!5, !9, !19}
+!33 = !{!4}
+!34 = !{!8}
+!35 = !{!18, !25, !26}
+!36 = !{!"foo.c", !"/tmp/"}
+!37 = !{}
 
 ; The variable bar:myvar changes registers after the first movq.
 ; It is cobbered by popq %rbx
@@ -91,4 +91,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 ; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte   83
 ; CHECK-NEXT: Ltmp{{.*}}:
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!38 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index b0a4e8d..7adacf5 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll
@@ -4,7 +4,7 @@
 
 define i32 @foo(i32 %y) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !0, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !0, metadata !{!"0x102"})
   %0 = tail call i32 (...)* @zoo(i32 %y) nounwind, !dbg !9 ; <i32> [#uses=1]
   ret i32 %0, !dbg !9
 }
@@ -15,8 +15,8 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 define i32 @bar(i32 %x) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !7, metadata !{metadata !"0x102"})
-  tail call void @llvm.dbg.value(metadata !11, i64 0, metadata !0, metadata !{metadata !"0x102"}) nounwind
+  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !7, metadata !{!"0x102"})
+  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !0, metadata !{!"0x102"}) nounwind
   %0 = tail call i32 (...)* @zoo(i32 1) nounwind, !dbg !12 ; <i32> [#uses=1]
   %1 = add nsw i32 %0, %x, !dbg !13               ; <i32> [#uses=1]
   ret i32 %1, !dbg !13
@@ -25,28 +25,28 @@ entry:
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{metadata !"0x101\00y\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\001\002", metadata !18, metadata !2, metadata !4, null, i32 (i32)* @foo, null, null, metadata !15} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !18, metadata !19, metadata !19, metadata !17, null,  null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6, metadata !6}
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !18, metadata !2} ; [ DW_TAG_base_type ]
-!7 = metadata !{metadata !"0x101\00x\006\000", metadata !8, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{metadata !"0x2e\00bar\00bar\00bar\006\000\001\000\006\000\001\006", metadata !18, metadata !2, metadata !4, null, i32 (i32)* @bar, null, null, metadata !16} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 3, i32 0, metadata !10, null}
-!10 = metadata !{metadata !"0xb\002\000\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 1}
-!12 = metadata !{i32 3, i32 0, metadata !10, metadata !13}
-!13 = metadata !{i32 7, i32 0, metadata !14, null}
-!14 = metadata !{metadata !"0xb\006\000\000", metadata !18, metadata !8} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{metadata !0}
-!16 = metadata !{metadata !7}
-!17 = metadata !{metadata !1, metadata !8}
-!18 = metadata !{metadata !"f.c", metadata !"/tmp"}
-!19 = metadata !{i32 0}
+!0 = !{!"0x101\00y\002\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\001\002", !18, !2, !4, null, i32 (i32)* @foo, null, null, !15} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !18} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", !18, !19, !19, !17, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !18, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6, !6}
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", !18, !2} ; [ DW_TAG_base_type ]
+!7 = !{!"0x101\00x\006\000", !8, !2, !6} ; [ DW_TAG_arg_variable ]
+!8 = !{!"0x2e\00bar\00bar\00bar\006\000\001\000\006\000\001\006", !18, !2, !4, null, i32 (i32)* @bar, null, null, !16} ; [ DW_TAG_subprogram ]
+!9 = !MDLocation(line: 3, scope: !10)
+!10 = !{!"0xb\002\000\000", !18, !1} ; [ DW_TAG_lexical_block ]
+!11 = !{i32 1}
+!12 = !MDLocation(line: 3, scope: !10, inlinedAt: !13)
+!13 = !MDLocation(line: 7, scope: !14)
+!14 = !{!"0xb\006\000\000", !18, !8} ; [ DW_TAG_lexical_block ]
+!15 = !{!0}
+!16 = !{!7}
+!17 = !{!1, !8}
+!18 = !{!"f.c", !"/tmp"}
+!19 = !{i32 0}
 
 ;CHECK: DEBUG_VALUE: bar:x <- E
 ;CHECK: Ltmp
 ;CHECK:	DEBUG_VALUE: foo:y <- 1{{$}}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index dea9162..3687b82 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -10,8 +10,8 @@ target triple = "x86_64-apple-darwin10.2"
 define i32 @_ZN3foo3bazEi(%struct.foo* nocapture %this, i32 %x) nounwind readnone optsize noinline ssp align 2 {
 ;CHECK: DEBUG_VALUE: baz:this <- RDI{{$}}
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.foo* %this}, i64 0, metadata !15, metadata !{metadata !"0x102"})
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !16, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata %struct.foo* %this, i64 0, metadata !15, metadata !{!"0x102"})
+  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !16, metadata !{!"0x102"})
   %0 = mul nsw i32 %x, 7, !dbg !29                ; <i32> [#uses=1]
   %1 = add nsw i32 %0, 1, !dbg !29                ; <i32> [#uses=1]
   ret i32 %1, !dbg !29
@@ -23,38 +23,38 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.module.flags = !{!34}
 !llvm.dbg.lv = !{!0, !14, !15, !16, !17, !24, !25, !28}
 
-!0 = metadata !{metadata !"0x101\00this\0011\000", metadata !1, metadata !3, metadata !12} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEi\0011\000\001\000\006\000\001\0011", metadata !31, metadata !2, metadata !9, null, i32 (%struct.foo*, i32)* null, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x13\00foo\003\0032\0032\000\000\000", metadata !31, metadata !3, null, metadata !5, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
-!3 = metadata !{metadata !"0x29", metadata !31} ; [ DW_TAG_file_type ]
-!4 = metadata !{metadata !"0x11\004\004.2.1 LLVM build\001\00\000\00\000", metadata !31, metadata !32, metadata !32, metadata !33, null, null} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{metadata !6, metadata !1, metadata !8}
-!6 = metadata !{metadata !"0xd\00y\008\0032\0032\000\000", metadata !31, metadata !2, metadata !7} ; [ DW_TAG_member ]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !31, metadata !3} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"0x2e\00baz\00baz\00_ZN3foo3bazEi\0015\000\001\000\006\000\001\0015", metadata !31, metadata !2, metadata !9, null, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null} ; [ DW_TAG_subprogram ]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !3, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{metadata !7, metadata !11, metadata !7}
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !31, metadata !3, metadata !2} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !31, metadata !3, metadata !13} ; [ DW_TAG_const_type ]
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !2} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{metadata !"0x101\00x\0011\000", metadata !1, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{metadata !"0x101\00this\0015\000", metadata !8, metadata !3, metadata !12} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{metadata !"0x101\00x\0015\000", metadata !8, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{metadata !"0x101\00argc\0019\000", metadata !18, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{metadata !"0x2e\00main\00main\00main\0019\000\001\000\006\000\001\0019", metadata !31, metadata !3, metadata !19, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !3, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!20 = metadata !{metadata !7, metadata !7, metadata !21}
-!21 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !22} ; [ DW_TAG_pointer_type ]
-!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !31, metadata !3} ; [ DW_TAG_base_type ]
-!24 = metadata !{metadata !"0x101\00argv\0019\000", metadata !18, metadata !3, metadata !21} ; [ DW_TAG_arg_variable ]
-!25 = metadata !{metadata !"0x100\00a\0020\000", metadata !26, metadata !3, metadata !2} ; [ DW_TAG_auto_variable ]
-!26 = metadata !{metadata !"0xb\0019\000\000", metadata !31, metadata !27} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{metadata !"0xb\0019\000\000", metadata !31, metadata !18} ; [ DW_TAG_lexical_block ]
-!28 = metadata !{metadata !"0x100\00b\0021\000", metadata !26, metadata !3, metadata !7} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 16, i32 0, metadata !30, null}
-!30 = metadata !{metadata !"0xb\0015\000\000", metadata !31, metadata !8} ; [ DW_TAG_lexical_block ]
-!31 = metadata !{metadata !"foo.cp", metadata !"/tmp/"}
-!32 = metadata !{i32 0}
-!33 = metadata !{metadata !1, metadata !8, metadata !18}
-!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00this\0011\000", !1, !3, !12} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00bar\00bar\00_ZN3foo3barEi\0011\000\001\000\006\000\001\0011", !31, !2, !9, null, i32 (%struct.foo*, i32)* null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x13\00foo\003\0032\0032\000\000\000", !31, !3, null, !5, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
+!3 = !{!"0x29", !31} ; [ DW_TAG_file_type ]
+!4 = !{!"0x11\004\004.2.1 LLVM build\001\00\000\00\000", !31, !32, !32, !33, null, null} ; [ DW_TAG_compile_unit ]
+!5 = !{!6, !1, !8}
+!6 = !{!"0xd\00y\008\0032\0032\000\000", !31, !2, !7} ; [ DW_TAG_member ]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", !31, !3} ; [ DW_TAG_base_type ]
+!8 = !{!"0x2e\00baz\00baz\00_ZN3foo3bazEi\0015\000\001\000\006\000\001\0015", !31, !2, !9, null, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null} ; [ DW_TAG_subprogram ]
+!9 = !{!"0x15\00\000\000\000\000\000\000", !31, !3, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{!7, !11, !7}
+!11 = !{!"0xf\00\000\0064\0064\000\0064", !31, !3, !2} ; [ DW_TAG_pointer_type ]
+!12 = !{!"0x26\00\000\0064\0064\000\0064", !31, !3, !13} ; [ DW_TAG_const_type ]
+!13 = !{!"0xf\00\000\0064\0064\000\000", !31, !3, !2} ; [ DW_TAG_pointer_type ]
+!14 = !{!"0x101\00x\0011\000", !1, !3, !7} ; [ DW_TAG_arg_variable ]
+!15 = !{!"0x101\00this\0015\000", !8, !3, !12} ; [ DW_TAG_arg_variable ]
+!16 = !{!"0x101\00x\0015\000", !8, !3, !7} ; [ DW_TAG_arg_variable ]
+!17 = !{!"0x101\00argc\0019\000", !18, !3, !7} ; [ DW_TAG_arg_variable ]
+!18 = !{!"0x2e\00main\00main\00main\0019\000\001\000\006\000\001\0019", !31, !3, !19, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!19 = !{!"0x15\00\000\000\000\000\000\000", !31, !3, null, !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = !{!7, !7, !21}
+!21 = !{!"0xf\00\000\0064\0064\000\000", !31, !3, !22} ; [ DW_TAG_pointer_type ]
+!22 = !{!"0xf\00\000\0064\0064\000\000", !31, !3, !23} ; [ DW_TAG_pointer_type ]
+!23 = !{!"0x24\00char\000\008\008\000\000\006", !31, !3} ; [ DW_TAG_base_type ]
+!24 = !{!"0x101\00argv\0019\000", !18, !3, !21} ; [ DW_TAG_arg_variable ]
+!25 = !{!"0x100\00a\0020\000", !26, !3, !2} ; [ DW_TAG_auto_variable ]
+!26 = !{!"0xb\0019\000\000", !31, !27} ; [ DW_TAG_lexical_block ]
+!27 = !{!"0xb\0019\000\000", !31, !18} ; [ DW_TAG_lexical_block ]
+!28 = !{!"0x100\00b\0021\000", !26, !3, !7} ; [ DW_TAG_auto_variable ]
+!29 = !MDLocation(line: 16, scope: !30)
+!30 = !{!"0xb\0015\000\000", !31, !8} ; [ DW_TAG_lexical_block ]
+!31 = !{!"foo.cp", !"/tmp/"}
+!32 = !{i32 0}
+!33 = !{!1, !8, !18}
+!34 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
index 0f8855d..74a7610 100644
--- a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
+++ b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
@@ -26,4 +26,4 @@ entry:
 
 declare i32 @printf(i8*, ...)
 
-!0 = metadata !{i32 191}
+!0 = !{i32 191}
diff --git a/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll b/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
index 0df9dc1..3470a06 100644
--- a/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
+++ b/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
@@ -16,4 +16,4 @@ entry:
 
 declare x86_stdcallcc void @RtlUnwind(...)
 
-!0 = metadata !{i32 215}
+!0 = !{i32 215}
diff --git a/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll b/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
index d7bc21f..7cffdc5 100644
--- a/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
+++ b/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
@@ -19,4 +19,4 @@ entry:
   ret i32 %asmresult
 }
 
-!0 = metadata !{i32 108}
+!0 = !{i32 108}
diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
index 9d65dc1..457c498 100644
--- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll
+++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
@@ -3,27 +3,27 @@
 @.str = private constant [4 x i8] c"one\00", align 1 ; <[4 x i8]*> [#uses=1]
 @.str1 = private constant [4 x i8] c"two\00", align 1 ; <[5 x i8]*> [#uses=1]
 @C.9.2167 = internal constant [2 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0)]
-!38 = metadata !{metadata !"0x29", metadata !109} ; [ DW_TAG_file_type ]
-!39 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", metadata !109, metadata !108, metadata !108, null, null, null} ; [ DW_TAG_compile_unit ]
-!46 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !109, null, metadata !47} ; [ DW_TAG_pointer_type ]
-!47 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !109, null} ; [ DW_TAG_base_type ]
-!97 = metadata !{metadata !"0x2e\00main\00main\00main\0073\000\001\000\006\000\000\000", i32 0, metadata !39, metadata !98, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!98 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !109, null, null, metadata !99, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!99 = metadata !{metadata !100}
-!100 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !109, null} ; [ DW_TAG_base_type ]
-!101 = metadata !{[2 x i8*]* @C.9.2167}
-!102 = metadata !{metadata !"0x100\00find_strings\0075\000", metadata !103, metadata !38, metadata !104} ; [ DW_TAG_auto_variable ]
-!103 = metadata !{metadata !"0xb\0073\000\000", null, metadata !97} ; [ DW_TAG_lexical_block ]
-!104 = metadata !{metadata !"0x1\00\000\0085312\0064\000\000", metadata !109, null, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
-!105 = metadata !{metadata !106}
-!106 = metadata !{metadata !"0x21\000\001333"}    ; [ DW_TAG_subrange_type ]
-!107 = metadata !{i32 73, i32 0, metadata !103, null}
-!108 = metadata !{i32 0}
-!109 = metadata !{metadata !"pbmsrch.c", metadata !"/Users/grawp/LLVM/test-suite/MultiSource/Benchmarks/MiBench/office-stringsearch"}
+!38 = !{!"0x29", !109} ; [ DW_TAG_file_type ]
+!39 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", !109, !108, !108, null, null, null} ; [ DW_TAG_compile_unit ]
+!46 = !{!"0xf\00\000\0064\0064\000\000", !109, null, !47} ; [ DW_TAG_pointer_type ]
+!47 = !{!"0x24\00char\000\008\008\000\000\006", !109, null} ; [ DW_TAG_base_type ]
+!97 = !{!"0x2e\00main\00main\00main\0073\000\001\000\006\000\000\000", i32 0, !39, !98, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!98 = !{!"0x15\00\000\000\000\000\000\000", !109, null, null, !99, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!99 = !{!100}
+!100 = !{!"0x24\00int\000\0032\0032\000\000\005", !109, null} ; [ DW_TAG_base_type ]
+!101 = !{[2 x i8*]* @C.9.2167}
+!102 = !{!"0x100\00find_strings\0075\000", !103, !38, !104} ; [ DW_TAG_auto_variable ]
+!103 = !{!"0xb\0073\000\000", null, !97} ; [ DW_TAG_lexical_block ]
+!104 = !{!"0x1\00\000\0085312\0064\000\000", !109, null, !46, !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
+!105 = !{!106}
+!106 = !{!"0x21\000\001333"}    ; [ DW_TAG_subrange_type ]
+!107 = !MDLocation(line: 73, scope: !103)
+!108 = !{i32 0}
+!109 = !{!"pbmsrch.c", !"/Users/grawp/LLVM/test-suite/MultiSource/Benchmarks/MiBench/office-stringsearch"}
 
 define i32 @main() nounwind ssp {
 bb.nph:
-  tail call void @llvm.dbg.declare(metadata !101, metadata !102, metadata !{metadata !"0x102"}), !dbg !107
+  tail call void @llvm.dbg.declare(metadata [2 x i8*]* @C.9.2167, metadata !102, metadata !{!"0x102"}), !dbg !107
   ret i32 0, !dbg !107
 }
 
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index a613939..e3decf0 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -6,8 +6,8 @@
 define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !23, metadata !{!"0x102"}), !dbg !24
+  call void @llvm.dbg.value(metadata %struct.SVal* %location, i64 0, metadata !25, metadata !{!"0x102"}), !dbg !24
   %0 = icmp ne i32 %i, 0, !dbg !27                ; <i1> [#uses=1]
   br i1 %0, label %bb, label %bb1, !dbg !27
 
@@ -34,7 +34,7 @@ return:                                           ; preds = %bb2
 define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2 {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !34
+  call void @llvm.dbg.value(metadata %struct.SVal* %this, i64 0, metadata !31, metadata !{!"0x102"}), !dbg !34
   %0 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 0, !dbg !34 ; <i8**> [#uses=1]
   store i8* null, i8** %0, align 8, !dbg !34
   %1 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 1, !dbg !34 ; <i32*> [#uses=1]
@@ -52,7 +52,7 @@ entry:
   %0 = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=3]
   %v = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=4]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38, metadata !{metadata !"0x102"}), !dbg !41
+  call void @llvm.dbg.declare(metadata %struct.SVal* %v, metadata !38, metadata !{!"0x102"}), !dbg !41
   call void @_ZN4SValC1Ev(%struct.SVal* %v) nounwind, !dbg !41
   %1 = getelementptr inbounds %struct.SVal* %v, i32 0, i32 1, !dbg !42 ; <i32*> [#uses=1]
   store i32 1, i32* %1, align 8, !dbg !42
@@ -65,7 +65,7 @@ entry:
   %7 = load i32* %6, align 8, !dbg !43            ; <i32> [#uses=1]
   store i32 %7, i32* %5, align 8, !dbg !43
   %8 = call i32 @_Z3fooi4SVal(i32 2, %struct.SVal* noalias %0) nounwind, !dbg !43 ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44, metadata !{metadata !"0x102"}), !dbg !43
+  call void @llvm.dbg.value(metadata i32 %8, i64 0, metadata !44, metadata !{!"0x102"}), !dbg !43
   br label %return, !dbg !45
 
 return:                                           ; preds = %entry
@@ -76,54 +76,54 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!49}
-!46 = metadata !{metadata !16, metadata !17, metadata !20}
+!46 = !{!16, !17, !20}
 
-!0 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0011\000\000\000\006\000\000\0011", metadata !47, metadata !1, metadata !14, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x13\00SVal\001\00128\0064\000\000\000", metadata !47, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
-!2 = metadata !{metadata !"0x29", metadata !47} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\001", metadata !47, metadata !48, metadata !48, metadata !46, null,  null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9}
-!5 = metadata !{metadata !"0xd\00Data\007\0064\0064\000\000", metadata !47, metadata !1, metadata !6} ; [ DW_TAG_member ]
-!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !47, metadata !2, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{metadata !"0xd\00Kind\008\0032\0032\0064\000", metadata !47, metadata !1, metadata !8} ; [ DW_TAG_member ]
-!8 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !47, metadata !2} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00\0012\000\000\000\006\000\000\0012", metadata !47, metadata !1, metadata !10, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{null, metadata !12, metadata !13}
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !47, metadata !2} ; [ DW_TAG_base_type ]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{null, metadata !12}
-!16 = metadata !{metadata !"0x2e\00SVal\00SVal\00_ZN4SValC1Ev\0011\000\001\000\006\000\000\0011", metadata !47, metadata !1, metadata !14, null, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null} ; [ DW_TAG_subprogram ]
-!17 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi4SVal\0016\000\001\000\006\000\000\0016", metadata !47, metadata !2, metadata !18, null, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null} ; [ DW_TAG_subprogram ]
-!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!19 = metadata !{metadata !13, metadata !13, metadata !1}
-!20 = metadata !{metadata !"0x2e\00main\00main\00main\0023\000\001\000\006\000\000\0023", metadata !47, metadata !2, metadata !21, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
-!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!22 = metadata !{metadata !13}
-!23 = metadata !{metadata !"0x101\00i\0016\000", metadata !17, metadata !2, metadata !13} ; [ DW_TAG_arg_variable ]
-!24 = metadata !{i32 16, i32 0, metadata !17, null}
-!25 = metadata !{metadata !"0x101\00location\0016\000", metadata !17, metadata !2, metadata !26} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{metadata !"0x10\00SVal\000\0064\0064\000\000", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_reference_type ]
-!27 = metadata !{i32 17, i32 0, metadata !28, null}
-!28 = metadata !{metadata !"0xb\0016\000\002", metadata !47, metadata !17} ; [ DW_TAG_lexical_block ]
-!29 = metadata !{i32 18, i32 0, metadata !28, null}
-!30 = metadata !{i32 20, i32 0, metadata !28, null}
-!31 = metadata !{metadata !"0x101\00this\0011\000", metadata !16, metadata !2, metadata !32} ; [ DW_TAG_arg_variable ]
-!32 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !47, metadata !2, metadata !33} ; [ DW_TAG_const_type ]
-!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ]
-!34 = metadata !{i32 11, i32 0, metadata !16, null}
-!35 = metadata !{i32 11, i32 0, metadata !36, null}
-!36 = metadata !{metadata !"0xb\0011\000\001", metadata !47, metadata !37} ; [ DW_TAG_lexical_block ]
-!37 = metadata !{metadata !"0xb\0011\000\000", metadata !47, metadata !16} ; [ DW_TAG_lexical_block ]
-!38 = metadata !{metadata !"0x100\00v\0024\000", metadata !39, metadata !2, metadata !1} ; [ DW_TAG_auto_variable ]
-!39 = metadata !{metadata !"0xb\0023\000\004", metadata !47, metadata !40} ; [ DW_TAG_lexical_block ]
-!40 = metadata !{metadata !"0xb\0023\000\003", metadata !47, metadata !20} ; [ DW_TAG_lexical_block ]
-!41 = metadata !{i32 24, i32 0, metadata !39, null}
-!42 = metadata !{i32 25, i32 0, metadata !39, null}
-!43 = metadata !{i32 26, i32 0, metadata !39, null}
-!44 = metadata !{metadata !"0x100\00k\0026\000", metadata !39, metadata !2, metadata !13} ; [ DW_TAG_auto_variable ]
-!45 = metadata !{i32 27, i32 0, metadata !39, null}
-!47 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
-!48 = metadata !{i32 0}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00SVal\00SVal\00\0011\000\000\000\006\000\000\0011", !47, !1, !14, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x13\00SVal\001\00128\0064\000\000\000", !47, !2, null, !4, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
+!2 = !{!"0x29", !47} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\001", !47, !48, !48, !46, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = !{!5, !7, !0, !9}
+!5 = !{!"0xd\00Data\007\0064\0064\000\000", !47, !1, !6} ; [ DW_TAG_member ]
+!6 = !{!"0xf\00\000\0064\0064\000\000", !47, !2, null} ; [ DW_TAG_pointer_type ]
+!7 = !{!"0xd\00Kind\008\0032\0032\0064\000", !47, !1, !8} ; [ DW_TAG_member ]
+!8 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", !47, !2} ; [ DW_TAG_base_type ]
+!9 = !{!"0x2e\00~SVal\00~SVal\00\0012\000\000\000\006\000\000\0012", !47, !1, !10, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = !{!"0x15\00\000\000\000\000\000\000", !47, !2, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{null, !12, !13}
+!12 = !{!"0xf\00\000\0064\0064\000\0064", !47, !2, !1} ; [ DW_TAG_pointer_type ]
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", !47, !2} ; [ DW_TAG_base_type ]
+!14 = !{!"0x15\00\000\000\000\000\000\000", !47, !2, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{null, !12}
+!16 = !{!"0x2e\00SVal\00SVal\00_ZN4SValC1Ev\0011\000\001\000\006\000\000\0011", !47, !1, !14, null, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = !{!"0x2e\00foo\00foo\00_Z3fooi4SVal\0016\000\001\000\006\000\000\0016", !47, !2, !18, null, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null} ; [ DW_TAG_subprogram ]
+!18 = !{!"0x15\00\000\000\000\000\000\000", !47, !2, null, !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = !{!13, !13, !1}
+!20 = !{!"0x2e\00main\00main\00main\0023\000\001\000\006\000\000\0023", !47, !2, !21, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!21 = !{!"0x15\00\000\000\000\000\000\000", !47, !2, null, !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = !{!13}
+!23 = !{!"0x101\00i\0016\000", !17, !2, !13} ; [ DW_TAG_arg_variable ]
+!24 = !MDLocation(line: 16, scope: !17)
+!25 = !{!"0x101\00location\0016\000", !17, !2, !26} ; [ DW_TAG_arg_variable ]
+!26 = !{!"0x10\00SVal\000\0064\0064\000\000", !47, !2, !1} ; [ DW_TAG_reference_type ]
+!27 = !MDLocation(line: 17, scope: !28)
+!28 = !{!"0xb\0016\000\002", !47, !17} ; [ DW_TAG_lexical_block ]
+!29 = !MDLocation(line: 18, scope: !28)
+!30 = !MDLocation(line: 20, scope: !28)
+!31 = !{!"0x101\00this\0011\000", !16, !2, !32} ; [ DW_TAG_arg_variable ]
+!32 = !{!"0x26\00\000\0064\0064\000\0064", !47, !2, !33} ; [ DW_TAG_const_type ]
+!33 = !{!"0xf\00\000\0064\0064\000\000", !47, !2, !1} ; [ DW_TAG_pointer_type ]
+!34 = !MDLocation(line: 11, scope: !16)
+!35 = !MDLocation(line: 11, scope: !36)
+!36 = !{!"0xb\0011\000\001", !47, !37} ; [ DW_TAG_lexical_block ]
+!37 = !{!"0xb\0011\000\000", !47, !16} ; [ DW_TAG_lexical_block ]
+!38 = !{!"0x100\00v\0024\000", !39, !2, !1} ; [ DW_TAG_auto_variable ]
+!39 = !{!"0xb\0023\000\004", !47, !40} ; [ DW_TAG_lexical_block ]
+!40 = !{!"0xb\0023\000\003", !47, !20} ; [ DW_TAG_lexical_block ]
+!41 = !MDLocation(line: 24, scope: !39)
+!42 = !MDLocation(line: 25, scope: !39)
+!43 = !MDLocation(line: 26, scope: !39)
+!44 = !{!"0x100\00k\0026\000", !39, !2, !13} ; [ DW_TAG_auto_variable ]
+!45 = !MDLocation(line: 27, scope: !39)
+!47 = !{!"small.cc", !"/Users/manav/R8248330"}
+!48 = !{i32 0}
+!49 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index f52e922..cf9897a 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
@@ -15,21 +15,21 @@ entry:
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!17}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0053\000\001\000\006\000\000\000", metadata !14, metadata !1, metadata !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !14} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 114084)\000\00\000\00\000", metadata !15, metadata !16, metadata !16, metadata !13, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !14, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !14, metadata !1} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", metadata !15, metadata !7, metadata !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
-!7 = metadata !{metadata !"0x29", metadata !15} ; [ DW_TAG_file_type ]
-!8 = metadata !{i32 53, i32 13, metadata !9, null}
-!9 = metadata !{metadata !"0xb\0053\0011\000", metadata !14, metadata !0} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{i32 4, i32 13, metadata !11, null}
-!11 = metadata !{metadata !"0xb\004\0013\002", metadata !15, metadata !12} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{metadata !"0xb\004\0011\001", metadata !15, metadata !6} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{metadata !0, metadata !6}
-!14 = metadata !{metadata !"", metadata !"/private/tmp"}
-!15 = metadata !{metadata !"bug.c", metadata !"/private/tmp"}
-!16 = metadata !{i32 0}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00foo\00foo\00foo\0053\000\001\000\006\000\000\000", !14, !1, !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !14} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.9 (trunk 114084)\000\00\000\00\000", !15, !16, !16, !13, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !14, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", !14, !1} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", !15, !7, !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = !{!"0x29", !15} ; [ DW_TAG_file_type ]
+!8 = !MDLocation(line: 53, column: 13, scope: !9)
+!9 = !{!"0xb\0053\0011\000", !14, !0} ; [ DW_TAG_lexical_block ]
+!10 = !MDLocation(line: 4, column: 13, scope: !11)
+!11 = !{!"0xb\004\0013\002", !15, !12} ; [ DW_TAG_lexical_block ]
+!12 = !{!"0xb\004\0011\001", !15, !6} ; [ DW_TAG_lexical_block ]
+!13 = !{!0, !6}
+!14 = !{!"", !"/private/tmp"}
+!15 = !{!"bug.c", !"/private/tmp"}
+!16 = !{i32 0}
+!17 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-09-16-asmcrash.ll b/test/CodeGen/X86/2010-09-16-asmcrash.ll
index 9bbd691..7aa9f32 100644
--- a/test/CodeGen/X86/2010-09-16-asmcrash.ll
+++ b/test/CodeGen/X86/2010-09-16-asmcrash.ll
@@ -53,4 +53,4 @@ return:                                           ; preds = %while.end, %while.b
   ret void
 }
 
-!0 = metadata !{i32 158484}
+!0 = !{i32 158484}
diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
index 53fb0af..df3aa1f 100644
--- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll
+++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
@@ -9,7 +9,7 @@ target triple = "i386-apple-darwin11.0.0"
 define i32 @foo(%struct.bar* nocapture %i) nounwind readnone optsize noinline ssp {
 ; CHECK: TAG_formal_parameter
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.bar* %i}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12
+  tail call void @llvm.dbg.value(metadata %struct.bar* %i, i64 0, metadata !6, metadata !{!"0x102"}), !dbg !12
   ret i32 1, !dbg !13
 }
 
@@ -18,23 +18,23 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!19}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\00256\001\003", metadata !17, metadata !1, metadata !3, null, i32 (%struct.bar*)* @foo, null, null, metadata !16} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 117922)\001\00\000\00\000", metadata !17, metadata !18, metadata !18, metadata !15, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !17, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !17, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x101\00i\003\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !17, metadata !1, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{metadata !"0x13\00bar\002\0064\0032\000\000\000", metadata !17, metadata !1, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ]
-!9 = metadata !{metadata !10, metadata !11}
-!10 = metadata !{metadata !"0xd\00x\002\0032\0032\000\000", metadata !17,  metadata !1, metadata !5} ; [ DW_TAG_member ]
-!11 = metadata !{metadata !"0xd\00y\002\0032\0032\0032\000", metadata !17, metadata !1, metadata !5} ; [ DW_TAG_member ]
-!12 = metadata !{i32 3, i32 47, metadata !0, null}
-!13 = metadata !{i32 4, i32 2, metadata !14, null}
-!14 = metadata !{metadata !"0xb\003\0050\000", metadata !17, metadata !0} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{metadata !0}
-!16 = metadata !{metadata !6}
-!17 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
-!18 = metadata !{i32 0}
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00foo\00foo\00\003\000\001\000\006\00256\001\003", !17, !1, !3, null, i32 (%struct.bar*)* @foo, null, null, !16} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !17} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.9 (trunk 117922)\001\00\000\00\000", !17, !18, !18, !15, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !17, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", !17, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0x101\00i\003\000", !0, !1, !7} ; [ DW_TAG_arg_variable ]
+!7 = !{!"0xf\00\000\0032\0032\000\000", !17, !1, !8} ; [ DW_TAG_pointer_type ]
+!8 = !{!"0x13\00bar\002\0064\0032\000\000\000", !17, !1, null, !9, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ]
+!9 = !{!10, !11}
+!10 = !{!"0xd\00x\002\0032\0032\000\000", !17,  !1, !5} ; [ DW_TAG_member ]
+!11 = !{!"0xd\00y\002\0032\0032\0032\000", !17, !1, !5} ; [ DW_TAG_member ]
+!12 = !MDLocation(line: 3, column: 47, scope: !0)
+!13 = !MDLocation(line: 4, column: 2, scope: !14)
+!14 = !{!"0xb\003\0050\000", !17, !0} ; [ DW_TAG_lexical_block ]
+!15 = !{!0}
+!16 = !{!6}
+!17 = !{!"one.c", !"/private/tmp"}
+!18 = !{i32 0}
+!19 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index ac7fbf2..8404020 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -22,8 +22,8 @@ target triple = "x86_64-apple-darwin10.0.0"
 
 define i64 @gcd(i64 %a, i64 %b) nounwind readnone optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !18
-  tail call void @llvm.dbg.value(metadata !{i64 %b}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !19
+  tail call void @llvm.dbg.value(metadata i64 %a, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata i64 %b, i64 0, metadata !11, metadata !{!"0x102"}), !dbg !19
   br label %while.body, !dbg !20
 
 while.body:                                       ; preds = %while.body, %entry
@@ -34,14 +34,14 @@ while.body:                                       ; preds = %while.body, %entry
   br i1 %cmp, label %if.then, label %while.body, !dbg !23
 
 if.then:                                          ; preds = %while.body
-  tail call void @llvm.dbg.value(metadata !{i64 %rem}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata i64 %rem, i64 0, metadata !12, metadata !{!"0x102"}), !dbg !21
   ret i64 %b.addr.0, !dbg !23
 }
 
 define i32 @main() nounwind optsize ssp {
 entry:
   %call = tail call i32 @rand() nounwind optsize, !dbg !24
-  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !24
+  tail call void @llvm.dbg.value(metadata i32 %call, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !24
   %cmp = icmp ugt i32 %call, 21, !dbg !25
   br i1 %cmp, label %cond.true, label %cond.end, !dbg !25
 
@@ -51,7 +51,7 @@ cond.true:                                        ; preds = %entry
 
 cond.end:                                         ; preds = %entry, %cond.true
   %cond = phi i32 [ %call1, %cond.true ], [ %call, %entry ], !dbg !25
-  tail call void @llvm.dbg.value(metadata !{i32 %cond}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !25
+  tail call void @llvm.dbg.value(metadata i32 %cond, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !25
   %conv = sext i32 %cond to i64, !dbg !26
   %conv5 = zext i32 %call to i64, !dbg !26
   %call6 = tail call i64 @gcd(i64 %conv, i64 %conv5) optsize, !dbg !26
@@ -78,37 +78,37 @@ declare i32 @puts(i8* nocapture) nounwind
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{metadata !"0x2e\00gcd\00gcd\00\005\000\001\000\006\00256\001\000", metadata !31, metadata !1, metadata !3, null, i64 (i64, i64)* @gcd, null, null, metadata !29} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd]
-!1 = metadata !{metadata !"0x29", metadata !31} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 124117)\001\00\000\00\001", metadata !31, metadata !32, metadata !32, metadata !28, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00main\00main\00\0025\000\001\000\006\000\001\000", metadata !31, metadata !1, metadata !7, null, i32 ()* @main, null, null, metadata !30} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0x101\00a\005\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{metadata !"0x101\00b\005\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{metadata !"0x100\00c\006\000", metadata !13, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!13 = metadata !{metadata !"0xb\005\0052\000", metadata !31, metadata !0} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{metadata !"0x100\00m\0026\000", metadata !15, metadata !1, metadata !16} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{metadata !"0xb\0025\0012\002", metadata !31, metadata !6} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !2} ; [ DW_TAG_base_type ]
-!17 = metadata !{metadata !"0x100\00z_s\0027\000", metadata !15, metadata !1, metadata !9} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{i32 5, i32 41, metadata !0, null}
-!19 = metadata !{i32 5, i32 49, metadata !0, null}
-!20 = metadata !{i32 7, i32 5, metadata !13, null}
-!21 = metadata !{i32 8, i32 9, metadata !22, null}
-!22 = metadata !{metadata !"0xb\007\0014\001", metadata !31, metadata !13} ; [ DW_TAG_lexical_block ]
-!23 = metadata !{i32 9, i32 9, metadata !22, null}
-!24 = metadata !{i32 26, i32 38, metadata !15, null}
-!25 = metadata !{i32 27, i32 38, metadata !15, null}
-!26 = metadata !{i32 28, i32 9, metadata !15, null}
-!27 = metadata !{i32 30, i32 1, metadata !15, null}
-!28 = metadata !{metadata !0, metadata !6}
-!29 = metadata !{metadata !10, metadata !11, metadata !12}
-!30 = metadata !{metadata !14, metadata !17}
-!31 = metadata !{metadata !"rem_small.c", metadata !"/private/tmp"}
-!32 = metadata !{i32 0}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00gcd\00gcd\00\005\000\001\000\006\00256\001\000", !31, !1, !3, null, i64 (i64, i64)* @gcd, null, null, !29} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd]
+!1 = !{!"0x29", !31} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.9 (trunk 124117)\001\00\000\00\001", !31, !32, !32, !28, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !31, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00long int\000\0064\0064\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00main\00main\00\0025\000\001\000\006\000\001\000", !31, !1, !7, null, i32 ()* @main, null, null, !30} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main]
+!7 = !{!"0x15\00\000\000\000\000\000\000", !31, !1, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!10 = !{!"0x101\00a\005\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!11 = !{!"0x101\00b\005\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!12 = !{!"0x100\00c\006\000", !13, !1, !5} ; [ DW_TAG_auto_variable ]
+!13 = !{!"0xb\005\0052\000", !31, !0} ; [ DW_TAG_lexical_block ]
+!14 = !{!"0x100\00m\0026\000", !15, !1, !16} ; [ DW_TAG_auto_variable ]
+!15 = !{!"0xb\0025\0012\002", !31, !6} ; [ DW_TAG_lexical_block ]
+!16 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, !2} ; [ DW_TAG_base_type ]
+!17 = !{!"0x100\00z_s\0027\000", !15, !1, !9} ; [ DW_TAG_auto_variable ]
+!18 = !MDLocation(line: 5, column: 41, scope: !0)
+!19 = !MDLocation(line: 5, column: 49, scope: !0)
+!20 = !MDLocation(line: 7, column: 5, scope: !13)
+!21 = !MDLocation(line: 8, column: 9, scope: !22)
+!22 = !{!"0xb\007\0014\001", !31, !13} ; [ DW_TAG_lexical_block ]
+!23 = !MDLocation(line: 9, column: 9, scope: !22)
+!24 = !MDLocation(line: 26, column: 38, scope: !15)
+!25 = !MDLocation(line: 27, column: 38, scope: !15)
+!26 = !MDLocation(line: 28, column: 9, scope: !15)
+!27 = !MDLocation(line: 30, column: 1, scope: !15)
+!28 = !{!0, !6}
+!29 = !{!10, !11, !12}
+!30 = !{!14, !17}
+!31 = !{!"rem_small.c", !"/private/tmp"}
+!32 = !{i32 0}
+!33 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll b/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
index 445fc01..b764da1 100644
--- a/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
+++ b/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
@@ -41,5 +41,5 @@ entry:
 
 declare void @llvm.x86.mmx.emms() nounwind
 
-!0 = metadata !{i32 888, i32 917, i32 945, i32 973, i32 1001, i32 1029, i32 1057}
-!1 = metadata !{i32 1390, i32 1430, i32 1469, i32 1508, i32 1547, i32 1586, i32 1625, i32 1664}
+!0 = !{i32 888, i32 917, i32 945, i32 973, i32 1001, i32 1029, i32 1057}
+!1 = !{i32 1390, i32 1430, i32 1469, i32 1508, i32 1547, i32 1586, i32 1625, i32 1664}
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index 222068d..7eaa5bb 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -26,7 +26,7 @@ entry:
 }
 
 ; CHECK-LABEL: zero_test
-; CHECK: xorps	%xmm0, %xmm0
+; CHECK: pxor %xmm0, %xmm0
 ; CHECK: ret
 
 define void @zero_test() {
diff --git a/test/CodeGen/X86/2011-11-30-or.ll b/test/CodeGen/X86/2011-11-30-or.ll
index 8ac4632..4260e81 100644
--- a/test/CodeGen/X86/2011-11-30-or.ll
+++ b/test/CodeGen/X86/2011-11-30-or.ll
@@ -2,13 +2,13 @@
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
 target triple = "x86_64-apple-macosx10.6.6"
-
-; Test that the order of operands is correct
-; CHECK: select_func
-; CHECK: pblendvb        %xmm1, %xmm2
-; CHECK: ret
-
-define void @select_func(<8 x i16> %in) {
+
+; Test that the order of operands is correct
+; CHECK: select_func
+; CHECK: pblendvb        {{LCPI0_[0-9]*}}(%rip), %xmm1
+; CHECK: ret
+
+define void @select_func(<8 x i16> %in) {
 entry:
   %c.lobit.i.i.i = ashr <8 x i16> %in, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   %and.i56.i.i.i = and <8 x i16> %c.lobit.i.i.i, <i16 25, i16 8, i16 65, i16 25, i16 8, i16 95, i16 15, i16 45>
diff --git a/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll b/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll
index cd8a16f..b78c13f 100644
--- a/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll
+++ b/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-linux -mattr=-sse | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-linux -mattr=-sse | FileCheck %s
 ; PR11768
 
 @ptr = external global i8*
diff --git a/test/CodeGen/X86/2012-05-19-avx2-store.ll b/test/CodeGen/X86/2012-05-19-avx2-store.ll
deleted file mode 100644
index 1c1e8e2..0000000
--- a/test/CodeGen/X86/2012-05-19-avx2-store.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
-
-define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp {
-entry:
-  ; CHECK: vmovaps
-  ; CHECK: vinsertf128 $1, ([[A0:%rdi|%rsi]]),
-  ; CHECK: vmovups
-  %A = load <4 x i32>* %Ap
-  %B = load <4 x i32>* %Bp
-  %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i32> %Z, <8 x i32>* %P, align 16
-  ret void
-}
diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
index 519c7ca..1c39c74 100644
--- a/test/CodeGen/X86/2012-07-15-broadcastfold.ll
+++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s
 
 declare x86_fastcallcc i64 @barrier()
 
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index 1a5efda..c33b48d 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -16,7 +16,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp {
 entry:
-  call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.declare(metadata %struct.hgstruct.2.29* %hg, metadata !4, metadata !{!"0x102"})
   %type = getelementptr inbounds %struct.node.0.27* %p, i64 0, i32 0
   %0 = load i16* %type, align 2
   %cmp = icmp eq i16 %0, 1
@@ -38,15 +38,15 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", metadata !11, metadata !2, metadata !2, metadata !13, metadata !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
-!2 = metadata !{}
-!4 = metadata !{metadata !"0x101\00hg\0067109589\000", null, metadata !5, metadata !6} ; [ DW_TAG_arg_variable ] [hg] [line 725]
-!5 = metadata !{metadata !"0x29", metadata !11} ; [ DW_TAG_file_type ]
-!6 = metadata !{metadata !"0x16\00hgstruct\00492\000\000\000\000", metadata !11, null, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !"0x13\00\00487\00512\0064\000\000\000", metadata !11, null, null, null, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ]
-!11 = metadata !{metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh"}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!13 = metadata !{metadata !14}
-!14 = metadata !{metadata !"0x2e\00subdivp\00subdivp\00\000\000\001\000\006\00256\001\001", metadata !11, metadata !5, metadata !15, null, i16 (%struct.node.0.27*, double, double, %struct.hgstruct.2.29* )* @subdivp, null, null, null} ; [ DW_TAG_subprogram ] [def] [subdivp]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null}
+!0 = !{!"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", !11, !2, !2, !13, !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
+!2 = !{}
+!4 = !{!"0x101\00hg\0067109589\000", null, !5, !6} ; [ DW_TAG_arg_variable ] [hg] [line 725]
+!5 = !{!"0x29", !11} ; [ DW_TAG_file_type ]
+!6 = !{!"0x16\00hgstruct\00492\000\000\000\000", !11, null, !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
+!7 = !{!"0x13\00\00487\00512\0064\000\000\000", !11, null, null, null, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ]
+!11 = !{!"MultiSource/Benchmarks/Olden/bh/newbh.c", !"MultiSource/Benchmarks/Olden/bh"}
+!12 = !{i32 1, !"Debug Info Version", i32 2}
+!13 = !{!14}
+!14 = !{!"0x2e\00subdivp\00subdivp\00\000\000\001\000\006\00256\001\001", !11, !5, !15, null, i16 (%struct.node.0.27*, double, double, %struct.hgstruct.2.29* )* @subdivp, null, null, null} ; [ DW_TAG_subprogram ] [def] [subdivp]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null}
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index 083aacd..28ceb2f 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -43,7 +43,7 @@ if.then3344:
   br label %if.then4073
 
 if.then4073:                                      ; preds = %if.then3344
-  call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.declare(metadata [20 x i8]* %num14075, metadata !4, metadata !{!"0x102"})
   %arraydecay4078 = getelementptr inbounds [20 x i8]* %num14075, i64 0, i64 0
   %0 = load i32* undef, align 4
   %add4093 = add nsw i32 %0, 0
@@ -65,30 +65,30 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!35}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", metadata !19, metadata !2, metadata !2, metadata !20, metadata !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
-!1 = metadata !{metadata !2}
-!2 = metadata !{}
-!4 = metadata !{metadata !"0x100\00num1\00815\000", metadata !5, metadata !14, metadata !15} ; [ DW_TAG_auto_variable ] [num1] [line 815]
-!5 = metadata !{metadata !"0xb\00815\000\00177", metadata !14, metadata !6} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!6 = metadata !{metadata !"0xb\00812\000\00176", metadata !14, metadata !7} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!7 = metadata !{metadata !"0xb\00807\000\00175", metadata !14, metadata !8} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!8 = metadata !{metadata !"0xb\00440\000\0094", metadata !14, metadata !9} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!9 = metadata !{metadata !"0xb\00435\000\0091", metadata !14, metadata !10} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!10 = metadata !{metadata !"0xb\00434\000\0090", metadata !14, metadata !11} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!11 = metadata !{metadata !"0xb\00250\000\0024", metadata !14, metadata !12} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!12 = metadata !{metadata !"0xb\00249\000\0023", metadata !14, metadata !13} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!13 = metadata !{metadata !"0xb\00221\000\0019", metadata !14, metadata !2} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!14 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
-!15 = metadata !{metadata !"0x1\00\000\00160\008\000\000", null, null, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
-!16 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!17 = metadata !{metadata !18}
-!18 = metadata !{metadata !"0x21\000\0020"}       ; [ DW_TAG_subrange_type ] [0, 19]
-!19 = metadata !{metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset"}
-
-!20 = metadata !{metadata !21}
-!21 = metadata !{metadata !"0x2e\00AttachGalley\00AttachGalley\00\000\000\001\000\006\00256\001\001", metadata !19, metadata !14, metadata !22, null, i32 (%union.rec**)* @AttachGalley, null, null, null} ; [ DW_TAG_subprogram ] [def] [AttachGalley]
-!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!23 = metadata !{null}
+!0 = !{!"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", !19, !2, !2, !20, !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
+!1 = !{!2}
+!2 = !{}
+!4 = !{!"0x100\00num1\00815\000", !5, !14, !15} ; [ DW_TAG_auto_variable ] [num1] [line 815]
+!5 = !{!"0xb\00815\000\00177", !14, !6} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!6 = !{!"0xb\00812\000\00176", !14, !7} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!7 = !{!"0xb\00807\000\00175", !14, !8} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!8 = !{!"0xb\00440\000\0094", !14, !9} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!9 = !{!"0xb\00435\000\0091", !14, !10} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!10 = !{!"0xb\00434\000\0090", !14, !11} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!11 = !{!"0xb\00250\000\0024", !14, !12} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!12 = !{!"0xb\00249\000\0023", !14, !13} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!13 = !{!"0xb\00221\000\0019", !14, !2} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!14 = !{!"0x29", !19} ; [ DW_TAG_file_type ]
+!15 = !{!"0x1\00\000\00160\008\000\000", null, null, !16, !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!16 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!17 = !{!18}
+!18 = !{!"0x21\000\0020"}       ; [ DW_TAG_subrange_type ] [0, 19]
+!19 = !{!"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", !"MultiSource/Benchmarks/MiBench/consumer-typeset"}
+
+!20 = !{!21}
+!21 = !{!"0x2e\00AttachGalley\00AttachGalley\00\000\000\001\000\006\00256\001\001", !19, !14, !22, null, i32 (%union.rec**)* @AttachGalley, null, null, null} ; [ DW_TAG_subprogram ] [def] [AttachGalley]
+!22 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = !{null}
 
 ; Test DebugValue uses visited by RegisterPressureTracker findUseBetween().
 ;
@@ -108,7 +108,7 @@ cond.true:                                        ; preds = %entry
   unreachable
 
 cond.end:                                         ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !31, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.declare(metadata %"class.__gnu_cxx::hash_map"* %X, metadata !31, metadata !{!"0x102"})
   %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5
   invoke void @_Znwm()
           to label %exit.i unwind label %lpad2.i.i.i.i
@@ -134,11 +134,11 @@ declare void @_Znwm()
 
 !llvm.dbg.cu = !{!30}
 
-!30 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 169129) (llvm/trunk 169135)\001\00\000\00\000", metadata !34, metadata !2, metadata !2, metadata !36, null, null} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus]
-!31 = metadata !{metadata !"0x100\00X\0029\000", null, null, metadata !32} ; [ DW_TAG_auto_variable ] [X] [line 29]
-!32 = metadata !{metadata !"0x16\00HM\0028\000\000\000\000", metadata !34, null, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
-!33 = metadata !{metadata !"0x29", metadata !34} ; [ DW_TAG_file_type ]
-!34 = metadata !{metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++"}
-!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!36 = metadata !{metadata !37}
-!37 = metadata !{metadata !"0x2e\00main\00main\00\000\000\001\000\006\00256\001\001", metadata !19, metadata !14, metadata !22, null, void ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [def] [main]
+!30 = !{!"0x11\004\00clang version 3.3 (trunk 169129) (llvm/trunk 169135)\001\00\000\00\000", !34, !2, !2, !36, null, null} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus]
+!31 = !{!"0x100\00X\0029\000", null, null, !32} ; [ DW_TAG_auto_variable ] [X] [line 29]
+!32 = !{!"0x16\00HM\0028\000\000\000\000", !34, null, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
+!33 = !{!"0x29", !34} ; [ DW_TAG_file_type ]
+!34 = !{!"SingleSource/Benchmarks/Shootout-C++/hash.cpp", !"SingleSource/Benchmarks/Shootout-C++"}
+!35 = !{i32 1, !"Debug Info Version", i32 2}
+!36 = !{!37}
+!37 = !{!"0x2e\00main\00main\00\000\000\001\000\006\00256\001\001", !19, !14, !22, null, void ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [def] [main]
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
index 458ce4f..04b3174 100644
--- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
@@ -20,7 +20,7 @@ if.then:                                          ; preds = %entry
   unreachable
 
 if.end:                                           ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.declare(metadata %struct.btCompoundLeafCallback* %callback, metadata !3, metadata !{!"0x102"})
   %m = getelementptr inbounds %struct.btCompoundLeafCallback* %callback, i64 0, i32 1
   store i32 0, i32* undef, align 8
   %cmp12447 = icmp sgt i32 undef, 0
@@ -36,13 +36,13 @@ invoke.cont44:                                    ; preds = %if.end
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 168984) (llvm/trunk 168983)\001\00\000\00\000", metadata !6, null, null, metadata !1, null, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !2}
-!2 = metadata !{metadata !"0x2e\00test\00test\00\000\000\001\000\006\00256\001\001", metadata !6, metadata !5, metadata !7, null, void ()* @test, null, null, null} ; [ DW_TAG_subprogram ] [def] [test]
-!3 = metadata !{metadata !"0x100\00callback\00214\000", null, null, metadata !4} ; [ DW_TAG_auto_variable ] [callback] [line 214]
-!4 = metadata !{metadata !"0x13\00btCompoundLeafCallback\0090\00512\0064\000\000\000", metadata !6, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ]
-!5 = metadata !{metadata !"0x29", metadata !6} ; [ DW_TAG_file_type ]
-!6 = metadata !{metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet"}
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{null}
+!0 = !{!"0x11\004\00clang version 3.3 (trunk 168984) (llvm/trunk 168983)\001\00\000\00\000", !6, null, null, !1, null, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!2}
+!2 = !{!"0x2e\00test\00test\00\000\000\001\000\006\00256\001\001", !6, !5, !7, null, void ()* @test, null, null, null} ; [ DW_TAG_subprogram ] [def] [test]
+!3 = !{!"0x100\00callback\00214\000", null, null, !4} ; [ DW_TAG_auto_variable ] [callback] [line 214]
+!4 = !{!"0x13\00btCompoundLeafCallback\0090\00512\0064\000\000\000", !6, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ]
+!5 = !{!"0x29", !6} ; [ DW_TAG_file_type ]
+!6 = !{!"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", !"MultiSource/Benchmarks/Bullet"}
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{i32 1, !"Debug Info Version", i32 2}
+!9 = !{null}
diff --git a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
index 10dc927..9cd150a 100644
--- a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
+++ b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
@@ -41,7 +41,7 @@ entry:
     i1 false, label %label_end
   ]
 default:
-  unreachable
+  br label %label_end
 
 label_true:
   br label %label_end
@@ -80,7 +80,7 @@ entry:
     i1 false, label %label_end
   ]
 default:
-  unreachable
+  br label %label_end
 
 label_true:
   br label %label_end
@@ -119,7 +119,7 @@ entry:
     i1 false, label %label_end
   ]
 default:
-  unreachable
+  br label %label_end
 
 label_true:
   br label %label_end
diff --git a/test/CodeGen/X86/MachineBranchProb.ll b/test/CodeGen/X86/MachineBranchProb.ll
index a893152..cf41ef2 100644
--- a/test/CodeGen/X86/MachineBranchProb.ll
+++ b/test/CodeGen/X86/MachineBranchProb.ll
@@ -31,4 +31,4 @@ for.inc20:                                        ; preds = %for.cond2
   ret void
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 112017436, i32 -735157296}
+!0 = !{!"branch_weights", i32 112017436, i32 -735157296}
diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index 54d8f65..3a2c58f 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll
@@ -4,10 +4,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-macosx10.7.0"
 
 define i32 @foo(i32 %i, i32* nocapture %c) nounwind uwtable readonly ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12
+  tail call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !6, metadata !{!"0x102"}), !dbg !12
   %ab = load i32* %c, align 1, !dbg !14
-  tail call void @llvm.dbg.value(metadata !{i32* %c}, i64 0, metadata !7, metadata !{metadata !"0x102"}), !dbg !13
-  tail call void @llvm.dbg.value(metadata !{i32 %ab}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !14
+  tail call void @llvm.dbg.value(metadata i32* %c, i64 0, metadata !7, metadata !{!"0x102"}), !dbg !13
+  tail call void @llvm.dbg.value(metadata i32 %ab, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !14
   %cd = icmp eq i32 %i, 42, !dbg !15
   br i1 %cd, label %bb1, label %bb2, !dbg !15
 
@@ -28,26 +28,26 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)\001\00\000\00\001", metadata !20, metadata !21, metadata !21, metadata !18, null,  null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\00256\001\000", metadata !20, metadata !2, metadata !3, null, i32 (i32, i32*)* @foo, null, null, metadata !19} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x101\00i\0016777218\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{metadata !"0x101\00c\0033554434\000", metadata !1, metadata !2, metadata !8} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !0, metadata !9} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !0} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0x100\00a\003\000", metadata !11, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{metadata !"0xb\002\0025\000", metadata !20, metadata !1} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 2, i32 13, metadata !1, null}
-!13 = metadata !{i32 2, i32 22, metadata !1, null}
-!14 = metadata !{i32 3, i32 14, metadata !11, null}
-!15 = metadata !{i32 4, i32 3, metadata !11, null}
-!16 = metadata !{i32 5, i32 5, metadata !11, null}
-!17 = metadata !{i32 7, i32 1, metadata !11, null}
-!18 = metadata !{metadata !1}
-!19 = metadata !{metadata !6, metadata !7, metadata !10}
-!20 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
-!21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)\001\00\000\00\001", !20, !21, !21, !18, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00foo\00foo\00\002\000\001\000\006\00256\001\000", !20, !2, !3, null, i32 (i32, i32*)* @foo, null, null, !19} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!2 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !20, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!6 = !{!"0x101\00i\0016777218\000", !1, !2, !5} ; [ DW_TAG_arg_variable ]
+!7 = !{!"0x101\00c\0033554434\000", !1, !2, !8} ; [ DW_TAG_arg_variable ]
+!8 = !{!"0xf\00\000\0064\0064\000\000", null, !0, !9} ; [ DW_TAG_pointer_type ]
+!9 = !{!"0x24\00char\000\008\008\000\000\006", null, !0} ; [ DW_TAG_base_type ]
+!10 = !{!"0x100\00a\003\000", !11, !2, !9} ; [ DW_TAG_auto_variable ]
+!11 = !{!"0xb\002\0025\000", !20, !1} ; [ DW_TAG_lexical_block ]
+!12 = !MDLocation(line: 2, column: 13, scope: !1)
+!13 = !MDLocation(line: 2, column: 22, scope: !1)
+!14 = !MDLocation(line: 3, column: 14, scope: !11)
+!15 = !MDLocation(line: 4, column: 3, scope: !11)
+!16 = !MDLocation(line: 5, column: 5, scope: !11)
+!17 = !MDLocation(line: 7, column: 1, scope: !11)
+!18 = !{!1}
+!19 = !{!6, !7, !10}
+!20 = !{!"a.c", !"/private/tmp"}
+!21 = !{i32 0}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
index f6d6852..f396e88 100644
--- a/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -148,12 +148,12 @@ define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) n
 }
 
 
-;CHECK-LABEL: merge_loads_i16:
-; load:
-;CHECK: movw
-; store:
-;CHECK: movw
-;CHECK: ret
+; CHECK-LABEL: merge_loads_i16:
+;  load:
+; CHECK: movw
+;  store:
+; CHECK: movw
+; CHECK: ret
 define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
   %1 = icmp sgt i32 %count, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
@@ -181,13 +181,13 @@ define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struc
   ret void
 }
 
-; The loads and the stores are interleved. Can't merge them.
-;CHECK-LABEL: no_merge_loads:
-;CHECK: movb
-;CHECK: movb
-;CHECK: movb
-;CHECK: movb
-;CHECK: ret
+; The loads and the stores are interleaved. Can't merge them.
+; CHECK-LABEL: no_merge_loads:
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: ret
 define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
   %1 = icmp sgt i32 %count, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
@@ -216,12 +216,12 @@ a4:                                       ; preds = %4, %.lr.ph
 }
 
 
-;CHECK-LABEL: merge_loads_integer:
-; load:
-;CHECK: movq
-; store:
-;CHECK: movq
-;CHECK: ret
+; CHECK-LABEL: merge_loads_integer:
+;  load:
+; CHECK: movq
+;  store:
+; CHECK: movq
+; CHECK: ret
 define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
   %1 = icmp sgt i32 %count, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
@@ -250,12 +250,12 @@ define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %s
 }
 
 
-;CHECK-LABEL: merge_loads_vector:
-; load:
-;CHECK: movups
-; store:
-;CHECK: movups
-;CHECK: ret
+; CHECK-LABEL: merge_loads_vector:
+;  load:
+; CHECK: movups
+;  store:
+; CHECK: movups
+; CHECK: ret
 define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
   %a1 = icmp sgt i32 %count, 0
   br i1 %a1, label %.lr.ph, label %._crit_edge
@@ -291,18 +291,18 @@ block4:                                       ; preds = %4, %.lr.ph
   ret void
 }
 
-;CHECK-LABEL: merge_loads_no_align:
-; load:
-;CHECK: movl
-;CHECK: movl
-;CHECK: movl
-;CHECK: movl
-; store:
-;CHECK: movl
-;CHECK: movl
-;CHECK: movl
-;CHECK: movl
-;CHECK: ret
+; CHECK-LABEL: merge_loads_no_align:
+;  load:
+; CHECK: movl
+; CHECK: movl
+; CHECK: movl
+; CHECK: movl
+;  store:
+; CHECK: movl
+; CHECK: movl
+; CHECK: movl
+; CHECK: movl
+; CHECK: ret
 define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
   %a1 = icmp sgt i32 %count, 0
   br i1 %a1, label %.lr.ph, label %._crit_edge
@@ -434,3 +434,62 @@ define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
 ; <label>:14
   ret void
 }
+
+; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
+define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
+  %vecext0 = extractelement <8 x float> %v, i32 0
+  %vecext1 = extractelement <8 x float> %v, i32 1
+  %vecext2 = extractelement <8 x float> %v, i32 2
+  %vecext3 = extractelement <8 x float> %v, i32 3
+  %vecext4 = extractelement <8 x float> %v, i32 4
+  %vecext5 = extractelement <8 x float> %v, i32 5
+  %vecext6 = extractelement <8 x float> %v, i32 6
+  %vecext7 = extractelement <8 x float> %v, i32 7
+  %arrayidx1 = getelementptr inbounds float* %ptr, i64 1
+  %arrayidx2 = getelementptr inbounds float* %ptr, i64 2
+  %arrayidx3 = getelementptr inbounds float* %ptr, i64 3
+  %arrayidx4 = getelementptr inbounds float* %ptr, i64 4
+  %arrayidx5 = getelementptr inbounds float* %ptr, i64 5
+  %arrayidx6 = getelementptr inbounds float* %ptr, i64 6
+  %arrayidx7 = getelementptr inbounds float* %ptr, i64 7
+  store float %vecext0, float* %ptr, align 4
+  store float %vecext1, float* %arrayidx1, align 4
+  store float %vecext2, float* %arrayidx2, align 4
+  store float %vecext3, float* %arrayidx3, align 4
+  store float %vecext4, float* %arrayidx4, align 4
+  store float %vecext5, float* %arrayidx5, align 4
+  store float %vecext6, float* %arrayidx6, align 4
+  store float %vecext7, float* %arrayidx7, align 4
+  ret void
+
+; CHECK-LABEL: merge_vec_element_store
+; CHECK: vmovups
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; This is a minimized test based on real code that was failing.
+; We could merge stores (and loads) like this...
+
+define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
+  %idx0 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 0
+  %idx1 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 1
+  %idx4 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 4
+  %idx5 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 5
+
+  %a0 = load i64* %idx0, align 8
+  store i64 %a0, i64* %idx4, align 8
+
+  %b = bitcast i64* %idx1 to <2 x i64>*
+  %v = load <2 x i64>* %b, align 8
+  %a1 = extractelement <2 x i64> %v, i32 0
+  store i64 %a1, i64* %idx5, align 8
+  ret void
+
+; CHECK-LABEL: merge_vec_element_and_scalar_load
+; CHECK:      movq	(%rdi), %rax
+; CHECK-NEXT: movq	%rax, 32(%rdi)
+; CHECK-NEXT: movq	8(%rdi), %rax
+; CHECK-NEXT: movq	%rax, 40(%rdi)
+; CHECK-NEXT: retq
+}
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
index 6865873..498ad7e 100644
--- a/test/CodeGen/X86/StackColoring-dbg.ll
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -17,7 +17,7 @@ entry:
 for.body:
   call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
   call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind
-  call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22, metadata !{metadata !"0x102"}) nounwind
+  call void @llvm.dbg.declare(metadata i8* %x.i, metadata !22, metadata !{!"0x102"}) nounwind
   br label %for.body
 }
 
@@ -27,9 +27,9 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!23}
-!0 = metadata !{metadata !"0x11\001\00clang\001\00\000\00\000", metadata !1, metadata !2, metadata !2, null, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"t.c", metadata !""}
-!16 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
-!2 = metadata !{i32 0}
-!22 = metadata !{metadata !"0x100\00x\0016\000", null, metadata !2, metadata !16} ; [ DW_TAG_auto_variable ]
-!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\001\00clang\001\00\000\00\000", !1, !2, !2, null, null, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"t.c", !""}
+!16 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
+!2 = !{i32 0}
+!22 = !{!"0x100\00x\0016\000", null, !2, !16} ; [ DW_TAG_auto_variable ]
+!23 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/SwizzleShuff.ll b/test/CodeGen/X86/SwizzleShuff.ll
index a435272..d387850 100644
--- a/test/CodeGen/X86/SwizzleShuff.ll
+++ b/test/CodeGen/X86/SwizzleShuff.ll
@@ -14,11 +14,12 @@ define void @pull_bitcast (<4 x i8>* %pA, <4 x i8>* %pB) {
 }
 
 ; CHECK: multi_use_swizzle
-; CHECK: mov
-; CHECK-NEXT: shuf
-; CHECK-NEXT: shuf
-; CHECK-NEXT: shuf
-; CHECK-NEXT: xor
+; CHECK: pshufd
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: pblendw
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: pxor
 ; CHECK-NEXT: ret
 define <4 x i32> @multi_use_swizzle (<4 x i32>* %pA, <4 x i32>* %pB) {
   %A = load <4 x i32>* %pA
@@ -45,7 +46,7 @@ define <4 x i8> @pull_bitcast2 (<4 x i8>* %pA, <4 x i8>* %pB, <4 x i8>* %pC) {
 
 
 ; CHECK: reverse_1
-; CHECK-NOT: shuf
+; CHECK-NOT: pshufd
 ; CHECK: ret
 define <4 x i32> @reverse_1 (<4 x i32>* %pA, <4 x i32>* %pB) {
   %A = load <4 x i32>* %pA
@@ -57,7 +58,7 @@ define <4 x i32> @reverse_1 (<4 x i32>* %pA, <4 x i32>* %pB) {
 
 
 ; CHECK: no_reverse_shuff
-; CHECK: shuf
+; CHECK: pshufd
 ; CHECK: ret
 define <4 x i32> @no_reverse_shuff (<4 x i32>* %pA, <4 x i32>* %pB) {
   %A = load <4 x i32>* %pA
diff --git a/test/CodeGen/X86/asm-label.ll b/test/CodeGen/X86/asm-label.ll
index 1fc6e2e..1da66e7 100644
--- a/test/CodeGen/X86/asm-label.ll
+++ b/test/CodeGen/X86/asm-label.ll
@@ -24,7 +24,7 @@ if.end:                                           ; preds = %if.then
   br label %cleanup
 
 cleanup:                                          ; preds = %if.end, %if.then9
-  switch i32 undef, label %unreachable [
+  switch i32 undef, label %default [
     i32 0, label %cleanup.cont
     i32 1, label %if.end11
   ]
@@ -35,6 +35,6 @@ cleanup.cont:                                     ; preds = %cleanup
 if.end11:                                         ; preds = %cleanup.cont, %cleanup, %land.lhs.true, %entry
   ret void
 
-unreachable:                                      ; preds = %cleanup
-  unreachable
+default:                                          ; preds = %cleanup
+  br label %if.end11
 }
diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll
index faaa4c4..f6892de 100644
--- a/test/CodeGen/X86/atomic16.ll
+++ b/test/CodeGen/X86/atomic16.ll
@@ -15,17 +15,17 @@ entry:
 ; X32:       incw
   %t2 = atomicrmw add  i16* @sc16, i16 3 acquire
 ; X64:       lock
-; X64:       addw $3, {{.*}} # encoding: [0xf0,0x66
+; X64:       addw $3, {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       addw $3
   %t3 = atomicrmw add  i16* @sc16, i16 5 acquire
 ; X64:       lock
-; X64:       xaddw {{.*}} # encoding: [0xf0,0x66
+; X64:       xaddw {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       xaddw
   %t4 = atomicrmw add  i16* @sc16, i16 %t3 acquire
 ; X64:       lock
-; X64:       addw {{.*}} # encoding: [0xf0,0x66
+; X64:       addw {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       addw
   ret void
@@ -43,17 +43,17 @@ define void @atomic_fetch_sub16() nounwind {
 ; X32:       decw
   %t2 = atomicrmw sub  i16* @sc16, i16 3 acquire
 ; X64:       lock
-; X64:       subw $3, {{.*}} # encoding: [0xf0,0x66
+; X64:       subw $3, {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       subw $3
   %t3 = atomicrmw sub  i16* @sc16, i16 5 acquire
 ; X64:       lock
-; X64:       xaddw {{.*}} # encoding: [0xf0,0x66
+; X64:       xaddw {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       xaddw
   %t4 = atomicrmw sub  i16* @sc16, i16 %t3 acquire
 ; X64:       lock
-; X64:       subw {{.*}} # encoding: [0xf0,0x66
+; X64:       subw {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       subw
   ret void
@@ -66,7 +66,7 @@ define void @atomic_fetch_and16() nounwind {
 ; X32-LABEL:   atomic_fetch_and16
   %t1 = atomicrmw and  i16* @sc16, i16 3 acquire
 ; X64:       lock
-; X64:       andw $3, {{.*}} # encoding: [0xf0,0x66
+; X64:       andw $3, {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       andw $3
   %t2 = atomicrmw and  i16* @sc16, i16 5 acquire
@@ -78,7 +78,7 @@ define void @atomic_fetch_and16() nounwind {
 ; X32:       cmpxchgw
   %t3 = atomicrmw and  i16* @sc16, i16 %t2 acquire
 ; X64:       lock
-; X64:       andw {{.*}} # encoding: [0xf0,0x66
+; X64:       andw {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       andw
   ret void
@@ -91,7 +91,7 @@ define void @atomic_fetch_or16() nounwind {
 ; X32-LABEL:   atomic_fetch_or16
   %t1 = atomicrmw or   i16* @sc16, i16 3 acquire
 ; X64:       lock
-; X64:       orw $3, {{.*}} # encoding: [0xf0,0x66
+; X64:       orw $3, {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       orw $3
   %t2 = atomicrmw or   i16* @sc16, i16 5 acquire
@@ -103,7 +103,7 @@ define void @atomic_fetch_or16() nounwind {
 ; X32:       cmpxchgw
   %t3 = atomicrmw or   i16* @sc16, i16 %t2 acquire
 ; X64:       lock
-; X64:       orw {{.*}} # encoding: [0xf0,0x66
+; X64:       orw {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       orw
   ret void
@@ -116,7 +116,7 @@ define void @atomic_fetch_xor16() nounwind {
 ; X32-LABEL:   atomic_fetch_xor16
   %t1 = atomicrmw xor  i16* @sc16, i16 3 acquire
 ; X64:       lock
-; X64:       xorw $3, {{.*}} # encoding: [0xf0,0x66
+; X64:       xorw $3, {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       xorw $3
   %t2 = atomicrmw xor  i16* @sc16, i16 5 acquire
@@ -128,7 +128,7 @@ define void @atomic_fetch_xor16() nounwind {
 ; X32:       cmpxchgw
   %t3 = atomicrmw xor  i16* @sc16, i16 %t2 acquire
 ; X64:       lock
-; X64:       xorw {{.*}} # encoding: [0xf0,0x66
+; X64:       xorw {{.*}} # encoding: [0x66,0xf0
 ; X32:       lock
 ; X32:       xorw
   ret void
diff --git a/test/CodeGen/X86/avx-cvt.ll b/test/CodeGen/X86/avx-cvt.ll
index 22fad7c..10ab971 100644
--- a/test/CodeGen/X86/avx-cvt.ll
+++ b/test/CodeGen/X86/avx-cvt.ll
@@ -87,3 +87,20 @@ entry:
   ret void
 }
 
+define double @nearbyint_f64(double %a) {
+; CHECK-LABEL: nearbyint_f64
+; CHECK: vroundsd $12
+  %res = call double @llvm.nearbyint.f64(double %a)
+  ret double %res
+}
+declare double @llvm.nearbyint.f64(double %p)
+
+define float @floor_f32(float %a) {
+; CHECK-LABEL: floor_f32
+; CHECK: vroundss $1
+  %res = call float @llvm.floor.f32(float %a)
+  ret float %res
+}
+declare float @llvm.floor.f32(float %p)
+
+
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index d2b44cd..c65b021 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -24,3 +24,17 @@ define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
 declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
 
 
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index ef3e83f..3ecf709 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -455,22 +455,6 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
-  ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
-  ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
 
 
 define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
@@ -551,22 +535,6 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
-  ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
-  ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
 
 
 define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 98c1645..c7e8b3b 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -18,7 +18,7 @@ entry:
 }
 
 ; CHECK: vmovq
-; CHECK-NEXT: vunpcklpd %xmm
+; CHECK-NEXT: vmovddup %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
 entry:
@@ -29,7 +29,7 @@ entry:
   ret <4 x i64> %vecinit6.i
 }
 
-; CHECK: vunpcklpd %xmm
+; CHECK: vmovddup %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
 entry:
@@ -42,7 +42,7 @@ entry:
 
 ; Test this turns into a broadcast:
 ;   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
-;   
+;
 ; CHECK: vbroadcastss
 define <8 x float> @funcE() nounwind {
 allocas:
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index bf8d9a7..27be9fd 100644
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -2,9 +2,9 @@
 
 define <4 x i32> @trunc_64_32(<4 x i64> %A) nounwind uwtable readnone ssp{
 ; CHECK-LABEL: trunc_64_32
-; CHECK: shufps
-; CHECK-NOT: pshufd
-; CHECK-NOT: movlhps 
+; CHECK: pshufd
+; CHECK: pshufd
+; CHECK: pblendw
   %B = trunc <4 x i64> %A to <4 x i32>
   ret <4 x i32>%B
 }
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index a103405..43303ca 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -182,20 +182,11 @@ entry:
 ;;;; Cases we must not select vperm2f128
 
 define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; AVX1-LABEL: G:
-; AVX1:       ## BB#0: ## %entry
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: G:
-; AVX2:       ## BB#0: ## %entry
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
-; AVX2-NEXT:    retq
+; ALL-LABEL: G:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; ALL-NEXT:    retq
 entry:
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
   ret <8 x float> %shuffle
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
index cba6d98..6069c14 100644
--- a/test/CodeGen/X86/avx.ll
+++ b/test/CodeGen/X86/avx.ll
@@ -60,7 +60,7 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 ; X32: movl    8(%esp), %ecx
 ; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: vinsertps    $-64, 12(%{{...}},%{{...}}), %
+; CHECK: vinsertps    $192, 12(%{{...}},%{{...}}), %
 ; CHECK-NEXT: ret
   %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
   %2 = load <4 x float>* %1, align 16
diff --git a/test/CodeGen/X86/avx1-stack-reload-folding.ll b/test/CodeGen/X86/avx1-stack-reload-folding.ll
deleted file mode 100644
index 2e669b0..0000000
--- a/test/CodeGen/X86/avx1-stack-reload-folding.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-unknown"
-
-; Stack reload folding tests - we use the 'big vectors' pattern to guarantee spilling to stack.
-;
-; Many of these tests are primarily to check memory folding with specific instructions. Using a basic
-; load/cvt/store pattern to test for this would mean that it wouldn't be the memory folding code thats
-; being tested - the load-execute version of the instruction from the tables would be matched instead.
-
-define void @stack_fold_vmulpd(<64 x double>* %a, <64 x double>* %b, <64 x double>* %c) {
-  ;CHECK-LABEL: stack_fold_vmulpd
-  ;CHECK:       vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-
-  %1 = load <64 x double>* %a
-  %2 = load <64 x double>* %b
-  %3 = fadd <64 x double> %1, %2
-  %4 = fsub <64 x double> %1, %2
-  %5 = fmul <64 x double> %3, %4
-  store <64 x double> %5, <64 x double>* %c
-  ret void
-}
-
-define void @stack_fold_cvtdq2ps(<128 x i32>* %a, <128 x i32>* %b, <128 x float>* %c) {
-  ;CHECK-LABEL: stack_fold_cvtdq2ps
-  ;CHECK:   vcvtdq2ps {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-
-  %1 = load <128 x i32>* %a
-  %2 = load <128 x i32>* %b
-  %3 = and <128 x i32> %1, %2
-  %4 = xor <128 x i32> %1, %2
-  %5 = sitofp <128 x i32> %3 to <128 x float>
-  %6 = sitofp <128 x i32> %4 to <128 x float>
-  %7 = fadd <128 x float> %5, %6
-  store <128 x float> %7, <128 x float>* %c
-  ret void
-}
-
-define void @stack_fold_cvttpd2dq(<64 x double>* %a, <64 x double>* %b, <64 x i32>* %c) #0 {
-  ;CHECK-LABEL: stack_fold_cvttpd2dq
-  ;CHECK:  vcvttpd2dqy {{[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-
-  %1 = load <64 x double>* %a
-  %2 = load <64 x double>* %b
-  %3 = fadd <64 x double> %1, %2
-  %4 = fsub <64 x double> %1, %2
-  %5 = fptosi <64 x double> %3 to <64 x i32>
-  %6 = fptosi <64 x double> %4 to <64 x i32>
-  %7 = or <64 x i32> %5, %6
-  store <64 x i32> %7, <64 x i32>* %c
-  ret void
-}
-
-define void @stack_fold_cvttps2dq(<128 x float>* %a, <128 x float>* %b, <128 x i32>* %c) #0 {
-  ;CHECK-LABEL: stack_fold_cvttps2dq
-  ;CHECK:   vcvttps2dq {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-
-  %1 = load <128 x float>* %a
-  %2 = load <128 x float>* %b
-  %3 = fadd <128 x float> %1, %2
-  %4 = fsub <128 x float> %1, %2
-  %5 = fptosi <128 x float> %3 to <128 x i32>
-  %6 = fptosi <128 x float> %4 to <128 x i32>
-  %7 = or <128 x i32> %5, %6
-  store <128 x i32> %7, <128 x i32>* %c
-  ret void
-}
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
index f49718e..5f17f1b 100644
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -84,7 +84,7 @@ define <16 x i16> @sext_16i8_16i16(<16 x i8> %z) {
 ; CHECK-LABEL: trunc_16i16_16i8:
 ; CHECK: vpshufb
 ; CHECK: vpshufb
-; CHECK: vpor
+; CHECK: vpunpcklqdq
 ; CHECK: ret
 define <16 x i8> @trunc_16i16_16i8(<16 x i16> %z) {
   %t = trunc <16 x i16> %z to <16 x i8>
diff --git a/test/CodeGen/X86/avx2-gather.ll b/test/CodeGen/X86/avx2-gather.ll
index a9ac025..91fa20b 100644
--- a/test/CodeGen/X86/avx2-gather.ll
+++ b/test/CodeGen/X86/avx2-gather.ll
@@ -32,3 +32,30 @@ define <2 x double> @test_x86_avx2_gather_d_pd(i8* %a1,
 ; CHECK: vgatherdpd
 ; CHECK: vmovapd
 ; CHECK: ret
+
+declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
+                      <8 x i32>, <8 x float>, i8) nounwind readonly
+
+define <8 x float> @test_x86_avx2_gather_d_ps_256(i8* %a1,
+                     <8 x i32> %idx, <8 x float> %mask) {
+  %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef,
+                            i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 4) ;
+  ret <8 x float> %res
+}
+; CHECK-LABEL: @test_x86_avx2_gather_d_ps_256
+; CHECK: vgatherdps %ymm
+; CHECK: ret
+
+declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
+                      <4 x i32>, <4 x double>, i8) nounwind readonly
+
+define <4 x double> @test_x86_avx2_gather_d_pd_256(i8* %a1,
+                     <4 x i32> %idx, <4 x double> %mask) {
+  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef,
+                            i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 8) ;
+  ret <4 x double> %res
+}
+
+; CHECK-LABEL: test_x86_avx2_gather_d_pd_256
+; CHECK: vgatherdpd %ymm
+; CHECK: ret
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index ac2c73b..acc3098 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -31,3 +31,34 @@ define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
 }
 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone
 
+
+define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+  %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+  %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
+  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 84b22b7..da0f17a 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -158,22 +158,6 @@ define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) {
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
-  ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-  %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
-  ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
-  %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) {
@@ -254,22 +238,6 @@ define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) {
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
-  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
-  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
-  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) {
diff --git a/test/CodeGen/X86/avx2-nontemporal.ll b/test/CodeGen/X86/avx2-nontemporal.ll
index 0768aae..4d28a97 100644
--- a/test/CodeGen/X86/avx2-nontemporal.ll
+++ b/test/CodeGen/X86/avx2-nontemporal.ll
@@ -19,4 +19,4 @@ define void @f(<8 x float> %A, i8* %B, <4 x double> %C, i32 %D, <4 x i64> %E) {
   ret void
 }
 
-!0 = metadata !{i32 1}
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll b/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll
new file mode 100644
index 0000000..7301b7c
--- /dev/null
+++ b/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll
@@ -0,0 +1,110 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 | FileCheck %s
+
+define <16 x i16> @test_lvm_x86_avx2_pmovsxbw(<16 x i8>* %a) {
+; CHECK-LABEL: test_lvm_x86_avx2_pmovsxbw
+; CHECK: vpmovsxbw (%rdi), %ymm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %1)
+  ret <16 x i16> %2
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovsxbd(<16 x i8>* %a) {
+; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbd
+; CHECK: vpmovsxbd (%rdi), %ymm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %1)
+  ret <8 x i32> %2
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovsxbq(<16 x i8>* %a) {
+; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbq
+; CHECK: vpmovsxbq (%rdi), %ymm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %1)
+  ret <4 x i64> %2
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovsxwd(<8 x i16>* %a) {
+; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwd
+; CHECK: vpmovsxwd (%rdi), %ymm0
+  %1 = load <8 x i16>* %a, align 1
+  %2 = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %1)
+  ret <8 x i32> %2
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovsxwq(<8 x i16>* %a) {
+; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwq
+; CHECK: vpmovsxwq (%rdi), %ymm0
+  %1 = load <8 x i16>* %a, align 1
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %1)
+  ret <4 x i64> %2
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovsxdq(<4 x i32>* %a) {
+; CHECK-LABEL: test_llvm_x86_avx2_pmovsxdq
+; CHECK: vpmovsxdq (%rdi), %ymm0
+  %1 = load <4 x i32>* %a, align 1
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %1)
+  ret <4 x i64> %2
+}
+
+define <16 x i16> @test_lvm_x86_avx2_pmovzxbw(<16 x i8>* %a) {
+; CHECK-LABEL: test_lvm_x86_avx2_pmovzxbw
+; CHECK: vpmovzxbw (%rdi), %ymm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %1)
+  ret <16 x i16> %2
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovzxbd(<16 x i8>* %a) {
+; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbd
+; CHECK: vpmovzxbd (%rdi), %ymm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %1)
+  ret <8 x i32> %2
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovzxbq(<16 x i8>* %a) {
+; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbq
+; CHECK: vpmovzxbq (%rdi), %ymm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %1)
+  ret <4 x i64> %2
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovzxwd(<8 x i16>* %a) {
+; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwd
+; CHECK: vpmovzxwd (%rdi), %ymm0
+  %1 = load <8 x i16>* %a, align 1
+  %2 = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %1)
+  ret <8 x i32> %2
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovzxwq(<8 x i16>* %a) {
+; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwq
+; CHECK: vpmovzxwq (%rdi), %ymm0
+  %1 = load <8 x i16>* %a, align 1
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %1)
+  ret <4 x i64> %2
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovzxdq(<4 x i32>* %a) {
+; CHECK-LABEL: test_llvm_x86_avx2_pmovzxdq
+; CHECK: vpmovzxdq (%rdi), %ymm0
+  %1 = load <4 x i32>* %a, align 1
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %1)
+  ret <4 x i64> %2
+}
+
+declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>)
+declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>)
+declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>)
+declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>)
+declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>)
+declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>)
+declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>)
+declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>)
+declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>)
+declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>)
+declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>)
+declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>)
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 924c06e..83100a8 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -317,7 +317,7 @@ define   <4 x double> @_inreg4xdouble(<4 x double> %a) {
 }
 
 ;CHECK-LABEL: _inreg2xdouble:
-;CHECK: vunpcklpd
+;CHECK: vmovddup
 ;CHECK: ret
 define   <2 x double> @_inreg2xdouble(<2 x double> %a) {
   %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index c43da9c..94b0821 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -462,3 +462,193 @@ entry:
   %d = and <8 x i64> %p1, %c
   ret <8 x i64>%d
 }
+
+; CHECK-LABEL: test_mask_vaddps
+; CHECK: vaddps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = fadd <16 x float> %i, %j
+  %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+  ret <16 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vmulps
+; CHECK: vmulps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i,
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = fmul <16 x float> %i, %j
+  %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+  ret <16 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vminps
+; CHECK: vminps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i,
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %cmp_res = fcmp olt <16 x float> %i, %j
+  %min = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j
+  %r = select <16 x i1> %mask, <16 x float> %min, <16 x float> %dst
+  ret <16 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vminpd
+; CHECK: vminpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
+                                     <8 x double> %j, <8 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %cmp_res = fcmp olt <8 x double> %i, %j
+  %min = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j
+  %r = select <8 x i1> %mask, <8 x double> %min, <8 x double> %dst
+  ret <8 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vmaxps
+; CHECK: vmaxps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %cmp_res = fcmp ogt <16 x float> %i, %j
+  %max = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j
+  %r = select <16 x i1> %mask, <16 x float> %max, <16 x float> %dst
+  ret <16 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vmaxpd
+; CHECK: vmaxpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
+                                     <8 x double> %j, <8 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %cmp_res = fcmp ogt <8 x double> %i, %j
+  %max = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j
+  %r = select <8 x i1> %mask, <8 x double> %max, <8 x double> %dst
+  ret <8 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vsubps
+; CHECK: vsubps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i,
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = fsub <16 x float> %i, %j
+  %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+  ret <16 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vdivps
+; CHECK: vdivps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i,
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = fdiv <16 x float> %i, %j
+  %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+  ret <16 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vaddpd
+; CHECK: vaddpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i,
+                                     <8 x double> %j, <8 x i64> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %x = fadd <8 x double> %i, %j
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst
+  ret <8 x double> %r
+}
+
+; CHECK-LABEL: test_maskz_vaddpd
+; CHECK: vaddpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z}}}
+; CHECK: ret
+define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j,
+                                      <8 x i64> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %x = fadd <8 x double> %i, %j
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
+
+; CHECK-LABEL: test_mask_fold_vaddpd
+; CHECK: vaddpd (%rdi), {{.*%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}.*}}
+; CHECK: ret
+define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i,
+                                     <8 x double>* %j,  <8 x i64> %mask1)
+                                     nounwind {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %tmp = load <8 x double>* %j, align 8
+  %x = fadd <8 x double> %i, %tmp
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst
+  ret <8 x double> %r
+}
+
+; CHECK-LABEL: test_maskz_fold_vaddpd
+; CHECK: vaddpd (%rdi), {{.*%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z}.*}}
+; CHECK: ret
+define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j,
+                                      <8 x i64> %mask1) nounwind {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %tmp = load <8 x double>* %j, align 8
+  %x = fadd <8 x double> %i, %tmp
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
+
+; CHECK-LABEL: test_broadcast_vaddpd
+; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*}}
+; CHECK: ret
+define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind {
+  %tmp = load double* %j
+  %b = insertelement <8 x double> undef, double %tmp, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef,
+                     <8 x i32> zeroinitializer
+  %x = fadd <8 x double> %c, %i
+  ret <8 x double> %x
+}
+
+; CHECK-LABEL: test_mask_broadcast_vaddpd
+; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*{%k[1-7]}.*}}
+; CHECK: ret
+define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i,
+                                      double* %j, <8 x i64> %mask1) nounwind {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %tmp = load double* %j
+  %b = insertelement <8 x double> undef, double %tmp, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef,
+                     <8 x i32> zeroinitializer
+  %x = fadd <8 x double> %c, %i
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %i
+  ret <8 x double> %r
+}
+
+; CHECK-LABEL: test_maskz_broadcast_vaddpd
+; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*{%k[1-7]} {z}.*}}
+; CHECK: ret
+define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
+                                       <8 x i64> %mask1) nounwind {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %tmp = load double* %j
+  %b = insertelement <8 x double> undef, double %tmp, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef,
+                     <8 x i32> zeroinitializer
+  %x = fadd <8 x double> %c, %i
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
index 366d324..9b82c88 100644
--- a/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -8,6 +8,13 @@ define <16 x float> @test_x86_vfmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <1
 }
 declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
+define <16 x float> @test_mask_vfmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd_ps
+  ; CHECK: vfmadd213ps %zmm
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  ret <16 x float> %res
+}
+
 define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmadd_pd_z
   ; CHECK: vfmadd213pd %zmm
@@ -32,6 +39,13 @@ define <16 x float> @test_x86_vfmsubps_z(<16 x float> %a0, <16 x float> %a1, <16
 }
 declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
+define <16 x float> @test_mask_vfmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsub_ps
+  ; CHECK: vfmsub213ps %zmm
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  ret <16 x float> %res
+}
+
 define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubpd_z
   ; CHECK: vfmsub213pd %zmm
@@ -40,6 +54,13 @@ define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8
 }
 declare <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
+define <8 x double> @test_mask_vfmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsub_pd
+  ; CHECK: vfmsub213pd %zmm
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
 define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfnmadd_ps_z
   ; CHECK: vfnmadd213ps %zmm
@@ -48,6 +69,13 @@ define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <
 }
 declare <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
+define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmadd_ps
+  ; CHECK: vfnmadd213ps %zmm
+  %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  ret <16 x float> %res
+}
+
 define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfnmadd_pd_z
   ; CHECK: vfnmadd213pd %zmm
@@ -56,6 +84,13 @@ define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <
 }
 declare <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
+define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmadd_pd
+  ; CHECK: vfnmadd213pd %zmm
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
 define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfnmsubps_z
   ; CHECK: vfnmsub213ps %zmm
@@ -64,6 +99,13 @@ define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <1
 }
 declare <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
+define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmsub_ps
+  ; CHECK: vfnmsub213ps %zmm
+  %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  ret <16 x float> %res
+}
+
 define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfnmsubpd_z
   ; CHECK: vfnmsub213pd %zmm
@@ -72,6 +114,13 @@ define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8
 }
 declare <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
+define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmsub_pd
+  ; CHECK: vfnmsub213pd %zmm
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
 define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfmaddsubps_z
   ; CHECK: vfmaddsub213ps %zmm
@@ -96,6 +145,13 @@ define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1,
 }
 declare <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
+define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmaddsub_pd
+  ; CHECK: vfmaddsub213pd %zmm
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
 define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubaddps_z
   ; CHECK: vfmsubadd213ps %zmm
@@ -104,6 +160,13 @@ define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1,
 }
 declare <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
+define <16 x float> @test_mask_vfmsubadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsubadd_ps
+  ; CHECK: vfmsubadd213ps %zmm
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  ret <16 x float> %res
+}
+
 define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubaddpd_z
   ; CHECK: vfmsubadd213pd %zmm
@@ -111,3 +174,291 @@ define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1,
   ret <8 x double> %res
 }
 declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <8 x double> @test_mask_vfmsubadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsubadd_pd
+  ; CHECK: vfmsubadd213pd %zmm
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne
+  ; CHECK: vfmadd213ps  {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn
+  ; CHECK: vfmadd213ps  {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xa8,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp
+  ; CHECK: vfmadd213ps  {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xa8,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz
+  ; CHECK: vfmadd213ps  {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa8,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current
+  ; CHECK: vfmadd213ps  %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa8,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne
+  ; CHECK: vfmadd213ps  {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn
+  ; CHECK: vfmadd213ps  {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xa8,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp
+  ; CHECK: vfmadd213ps  {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xa8,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz
+  ; CHECK: vfmadd213ps  {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xa8,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current
+  ; CHECK: vfmadd213ps  %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rne
+  ; CHECK: vfmsub213ps  {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xaa,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtn
+  ; CHECK: vfmsub213ps  {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xaa,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtp
+  ; CHECK: vfmsub213ps  {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xaa,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtz
+  ; CHECK: vfmsub213ps  {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xaa,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmsub512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_current
+  ; CHECK: vfmsub213ps  %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xaa,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rne
+  ; CHECK: vfmsub213ps  {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xaa,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtn
+  ; CHECK: vfmsub213ps  {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xaa,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtp
+  ; CHECK: vfmsub213ps  {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xaa,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtz
+  ; CHECK: vfmsub213ps  {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xaa,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_current
+  ; CHECK: vfmsub213ps  %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xaa,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+  ret <16 x float> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne
+  ; CHECK: vfmadd213pd  {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn
+  ; CHECK: vfmadd213pd  {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp
+  ; CHECK: vfmadd213pd  {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz
+  ; CHECK: vfmadd213pd  {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current
+  ; CHECK: vfmadd213pd  %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne
+  ; CHECK: vfmadd213pd  {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn
+  ; CHECK: vfmadd213pd  {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp
+  ; CHECK: vfmadd213pd  {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz
+  ; CHECK: vfmadd213pd  {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current
+  ; CHECK: vfmadd213pd  %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne
+  ; CHECK: vfnmsub213pd  {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xae,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn
+  ; CHECK: vfnmsub213pd  {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xae,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp
+  ; CHECK: vfnmsub213pd  {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xae,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz
+  ; CHECK: vfnmsub213pd  {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xae,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current
+  ; CHECK: vfnmsub213pd  %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xae,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne
+  ; CHECK: vfnmsub213pd  {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xae,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn
+  ; CHECK: vfnmsub213pd  {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xae,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp
+  ; CHECK: vfnmsub213pd  {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xae,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz
+  ; CHECK: vfnmsub213pd  {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xae,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current
+  ; CHECK: vfnmsub213pd  %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xae,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  ret <8 x double> %res
+}
diff --git a/test/CodeGen/X86/avx512-i1test.ll b/test/CodeGen/X86/avx512-i1test.ll
new file mode 100755
index 0000000..a237738
--- /dev/null
+++ b/test/CodeGen/X86/avx512-i1test.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: func
+; CHECK: testb
+; CHECK: testb
+define void @func() {
+bb1:
+  br i1 undef, label %L_10, label %L_10
+
+L_10:                                             ; preds = %bb1, %bb1
+  br i1 undef, label %L_30, label %bb56
+
+bb56:                                             ; preds = %L_10
+  br label %bb33
+
+bb33:                                             ; preds = %bb51, %bb56
+  %r111 = load i64* undef, align 8
+  br i1 undef, label %bb51, label %bb35
+
+bb35:                                             ; preds = %bb33
+  br i1 undef, label %L_19, label %bb37
+
+bb37:                                             ; preds = %bb35
+  %r128 = and i64 %r111, 576460752303423488
+  %phitmp = icmp eq i64 %r128, 0
+  br label %L_19
+
+L_19:                                             ; preds = %bb37, %bb35
+  %"$V_S25.0" = phi i1 [ %phitmp, %bb37 ], [ true, %bb35 ]
+  br i1 undef, label %bb51, label %bb42
+
+bb42:                                             ; preds = %L_19
+  %r136 = select i1 %"$V_S25.0", i32* undef, i32* undef
+  br label %bb51
+
+bb51:                                             ; preds = %bb42, %L_19, %bb33
+  br i1 false, label %L_30, label %bb33
+
+L_30:                                             ; preds = %bb51, %L_10
+  ret void
+}
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index eba895e..d6b887e 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -106,7 +106,7 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
 ;CHECK: vpcmpltud
 ;CHECK: kshiftlw $11
 ;CHECK: kshiftrw $15
-;CHECK: kortestw
+;CHECK: testb
 ;CHECK: je
 ;CHECK: ret
 ;CHECK: ret
@@ -125,7 +125,7 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
 ;CHECK: vpcmpgtq
 ;CHECK: kshiftlw $15
 ;CHECK: kshiftrw $15
-;CHECK: kortestw
+;CHECK: testb
 ;CHECK: ret
 
 define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
@@ -150,9 +150,12 @@ define i16 @test13(i32 %a, i32 %b) {
 
 ;CHECK-LABEL: test14
 ;CHECK: vpcmpgtq
-;CHECK: kshiftlw $11
-;CHECK: kshiftrw $15
-;CHECK: kortestw
+;KNL: kshiftlw $11
+;KNL: kshiftrw $15
+;KNL: testb
+;SKX: kshiftlb $3
+;SKX: kshiftrb $7
+;SKX: testb
 ;CHECK: ret
 
 define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
@@ -188,9 +191,11 @@ define i16 @test16(i1 *%addr, i16 %a) {
 }
 
 ;CHECK-LABEL: test17
-;CHECK: kshiftlw
-;CHECK: kshiftrw
+;KNL: kshiftlw
+;KNL: kshiftrw
 ;KNL: korw
+;SKX: kshiftlb
+;SKX: kshiftrb
 ;SKX: korb
 ;CHECK: ret
 define i8 @test17(i1 *%addr, i8 %a) {
diff --git a/test/CodeGen/X86/avx512-intel-ocl.ll b/test/CodeGen/X86/avx512-intel-ocl.ll
new file mode 100644
index 0000000..3f2691b
--- /dev/null
+++ b/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck -check-prefix=X64 %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+declare i32 @func_int(i32, i32)
+
+; WIN64-LABEL: testf16_inp
+; WIN64: vaddps  {{.*}}, {{%zmm[0-1]}}
+; WIN64: leaq    {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; X32-LABEL: testf16_inp
+; X32: vaddps  {{.*}}, {{%zmm[0-1]}}
+; X32: movl    %eax, (%esp)
+; X32: call
+; X32: ret
+
+; X64-LABEL: testf16_inp
+; X64: vaddps  {{.*}}, {{%zmm[0-1]}}
+; X64: leaq    {{.*}}(%rsp), %rdi
+; X64: call
+; X64: ret
+
+;test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %2, %1
+  ret <16 x float> %3
+}
+
+;test calling conventions - preserved registers
+
+; preserved zmm16-
+; WIN64-LABEL: testf16_regs
+; WIN64: call
+; WIN64: vaddps  %zmm16, %zmm0, %zmm0
+; WIN64: ret
+
+; preserved zmm16-
+; X64-LABEL: testf16_regs
+; X64: call
+; X64: vaddps  %zmm16, %zmm0, %zmm0
+; X64: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %1, %b
+  %4 = fadd <16 x float> %2, %3
+  ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; WIN64-LABEL: test_prolog_epilog
+; WIN64: vmovups %zmm21, {{.*(%rbp).*}}     # 64-byte Spill
+; WIN64: vmovups %zmm6, {{.*(%rbp).*}}     # 64-byte Spill
+; WIN64: call
+; WIN64: vmovups {{.*(%rbp).*}}, %zmm6      # 64-byte Reload
+; WIN64: vmovups {{.*(%rbp).*}}, %zmm21     # 64-byte Reload
+
+; X64-LABEL: test_prolog_epilog
+; X64:  kmovw   %k7, {{.*}}(%rsp)         ## 8-byte Folded Spill
+; X64:  kmovw   %k6, {{.*}}(%rsp)         ## 8-byte Folded Spill
+; X64:  kmovw   %k5, {{.*}}(%rsp)         ## 8-byte Folded Spill
+; X64:  kmovw   %k4, {{.*}}(%rsp)         ## 8-byte Folded Spill
+; X64: vmovups %zmm31, {{.*}}(%rsp)  ## 64-byte Spill
+; X64: vmovups %zmm16, {{.*}}(%rsp)  ## 64-byte Spill
+; X64: call
+; X64: vmovups {{.*}}(%rsp), %zmm16 ## 64-byte Reload
+; X64: vmovups {{.*}}(%rsp), %zmm31 ## 64-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+   %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+   ret <16 x float> %c
+}
+
+
+declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)
+
+; X64-LABEL: testf16_inp_mask
+; X64: kmovw   %edi, %k1
+; X64: call
+define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask)  {
+  %imask = bitcast i16 %mask to <16 x i1>
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
+  ret <16 x float> %1
+}
+
+; X64-LABEL: test_prolog_epilog_with_mask
+; X64: kxorw   %k{{.*}}, %k{{.*}}, %k1
+; X64: call
+define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32>%x2, <16 x i1> %mask) nounwind {
+   %cmp_res = icmp eq <16 x i32>%x1, %x2
+   %mask1 = xor <16 x i1> %cmp_res, %mask
+   %c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
+   ret <16 x float> %c
+}
+\ No newline at end of file
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 691d1fb..b6375c1 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone
 ; CHECK: kortestw
 ; CHECK: sete
 define i32 @test_kortestz(i16 %a0, i16 %a1) {
-  %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1) 
+  %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1)
   ret i32 %res
 }
 
@@ -14,7 +14,7 @@ declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone
 ; CHECK: kortestw
 ; CHECK: sbbl
 define i32 @test_kortestc(i16 %a0, i16 %a1) {
-  %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1) 
+  %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1)
   ret i32 %res
 }
 
@@ -277,7 +277,7 @@ define <8 x i64> @test_conflict_q(<8 x i64> %a) {
 declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
 
 define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
-  ; CHECK: vpconflictd 
+  ; CHECK: vpconflictd
   %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 %mask)
   ret <16 x i32> %res
 }
@@ -340,7 +340,7 @@ define <8 x i64> @test_ctlz_q(<8 x i64> %a) {
 declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1) nounwind readonly
 
 define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK: vblendmps
+  ; CHECK: vblendmps %zmm1, %zmm0
   %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1]
   ret <16 x float> %res
 }
@@ -348,7 +348,7 @@ define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x
 declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16) nounwind readonly
 
 define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK: vblendmpd
+  ; CHECK: vblendmpd %zmm1, %zmm0
   %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1]
   ret <8 x double> %res
 }
@@ -382,7 +382,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) no
   ret <8 x i32>%res
  }
  declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
- 
+
  define <16 x i32> @test_cvtps2udq(<16 x float> %a) {
  ;CHECK: vcvtps2udq {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x79,0xc0]
   %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %a, <16 x i32>zeroinitializer, i16 -1, i32 1)
@@ -392,17 +392,17 @@ declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) no
 
  define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
  ;CHECK: vcmpleps {sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x02]
-   %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
+   %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i8 2, i16 -1, i32 8)
    ret i16 %res
  }
- declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
+ declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i8, i16, i32)
 
  define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
  ;CHECK: vcmpneqpd %zmm{{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc1,0x04]
-   %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
+   %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i8 4, i8 -1, i32 4)
    ret i8 %res
  }
- declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
+ declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i8, i8, i32)
 
  ; cvt intrinsics
  define <16 x float> @test_cvtdq2ps(<16 x i32> %a) {
@@ -551,7 +551,73 @@ define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
   ret void
 }
 
-declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 )
+declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
+
+define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_mask_store_aligned_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
+
+define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_store_aligned_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmovapd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
+
+define <16 x float> @test_maskz_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_maskz_load_aligned_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmovaps (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
+  ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
+
+define <8 x double> @test_maskz_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_maskz_load_aligned_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmovapd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
+  ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
+
+define <16 x float> @test_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_load_aligned_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovaps (%rdi), %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
+  ret <16 x float> %res
+}
+
+define <8 x double> @test_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_load_aligned_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovapd (%rdi), %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
+  ret <8 x double> %res
+}
 
 define <16 x float> @test_vpermt2ps(<16 x float>%x, <16 x float>%y, <16 x i32>%perm) {
 ; CHECK: vpermt2ps {{.*}}encoding: [0x62,0xf2,0x6d,0x48,0x7f,0xc1]
@@ -678,28 +744,28 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
 define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK_LABEL: test_cmp_d_512
 ; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ##
-  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltd %zmm1, %zmm0, %k0 ##
-  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 1, i16 -1)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmpled %zmm1, %zmm0, %k0 ##
-  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 2, i16 -1)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 ##
-  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 3, i16 -1)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 ##
-  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 4, i16 -1)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 ##
-  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 5, i16 -1)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnled %zmm1, %zmm0, %k0 ##
-  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 6, i16 -1)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmpordd %zmm1, %zmm0, %k0 ##
-  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 7, i16 -1)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
@@ -707,59 +773,59 @@ define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
 define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
 ; CHECK_LABEL: test_mask_cmp_d_512
 ; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ##
-  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltd %zmm1, %zmm0, %k0 {%k1} ##
-  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 1, i16 %mask)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmpled %zmm1, %zmm0, %k0 {%k1} ##
-  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 2, i16 %mask)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 {%k1} ##
-  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 3, i16 %mask)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ##
-  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 4, i16 %mask)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ##
-  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 5, i16 %mask)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnled %zmm1, %zmm0, %k0 {%k1} ##
-  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 6, i16 %mask)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmpordd %zmm1, %zmm0, %k0 {%k1} ##
-  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 7, i16 %mask)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
 
-declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
+declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i8, i16) nounwind readnone
 
 define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK_LABEL: test_ucmp_d_512
 ; CHECK: vpcmpequd %zmm1, %zmm0, %k0 ##
-  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltud %zmm1, %zmm0, %k0 ##
-  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 1, i16 -1)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmpleud %zmm1, %zmm0, %k0 ##
-  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 2, i16 -1)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 ##
-  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 3, i16 -1)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 ##
-  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 4, i16 -1)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 ##
-  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 5, i16 -1)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 ##
-  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 6, i16 -1)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmpordud %zmm1, %zmm0, %k0 ##
-  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 7, i16 -1)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
@@ -767,59 +833,59 @@ define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
 define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
 ; CHECK_LABEL: test_mask_ucmp_d_512
 ; CHECK: vpcmpequd %zmm1, %zmm0, %k0 {%k1} ##
-  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ##
-  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 1, i16 %mask)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ##
-  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 2, i16 %mask)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 {%k1} ##
-  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 3, i16 %mask)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 {%k1} ##
-  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 4, i16 %mask)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ##
-  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 5, i16 %mask)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ##
-  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 6, i16 %mask)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmpordud %zmm1, %zmm0, %k0 {%k1} ##
-  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i8 7, i16 %mask)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
 
-declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
+declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i8, i16) nounwind readnone
 
 define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
 ; CHECK_LABEL: test_cmp_q_512
 ; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltq %zmm1, %zmm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleq %zmm1, %zmm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordq %zmm1, %zmm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
@@ -827,59 +893,59 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
 define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK_LABEL: test_mask_cmp_q_512
 ; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltq %zmm1, %zmm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordq %zmm1, %zmm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i8, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
 ; CHECK_LABEL: test_ucmp_q_512
 ; CHECK: vpcmpequq %zmm1, %zmm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmporduq %zmm1, %zmm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
@@ -887,33 +953,33 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
 define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK_LABEL: test_mask_ucmp_q_512
 ; CHECK: vpcmpequq %zmm1, %zmm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmporduq %zmm1, %zmm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i8, i8) nounwind readnone
 
 define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
 ; CHECK-LABEL: test_mask_vextractf32x4:
@@ -959,8 +1025,8 @@ define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
 }
 
 define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-  ; CHECK-LABEL: test_x86_avx512_mask_pslli_d 
-  ; CHECK: vpslld $7, %zmm0, %zmm1 {%k1}  
+  ; CHECK-LABEL: test_x86_avx512_mask_pslli_d
+  ; CHECK: vpslld $7, %zmm0, %zmm1 {%k1}
   %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
   ret <16 x i32> %res
 }
@@ -983,14 +1049,14 @@ define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
 
 define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
   ; CHECK-LABEL: test_x86_avx512_mask_pslli_q
-  ; CHECK: vpsllq $7, %zmm0, %zmm1 {%k1}   
+  ; CHECK: vpsllq $7, %zmm0, %zmm1 {%k1}
   %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
   ; CHECK-LABEL: test_x86_avx512_maskz_pslli_q
-  ; CHECK: vpsllq $7, %zmm0, %zmm0 {%k1} {z} 
+  ; CHECK: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
   %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
   ret <8 x i64> %res
 }
@@ -1006,7 +1072,7 @@ define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
 
 define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
   ; CHECK-LABEL: test_x86_avx512_mask_psrli_d
-  ; CHECK: vpsrld $7, %zmm0, %zmm1 {%k1}  
+  ; CHECK: vpsrld $7, %zmm0, %zmm1 {%k1}
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
   ret <16 x i32> %res
 }
@@ -1029,7 +1095,7 @@ define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
 
 define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
   ; CHECK-LABEL: test_x86_avx512_mask_psrli_q
-  ; CHECK: vpsrlq $7, %zmm0, %zmm1 {%k1}  
+  ; CHECK: vpsrlq $7, %zmm0, %zmm1 {%k1}
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
   ret <8 x i64> %res
 }
@@ -1052,7 +1118,7 @@ define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
 
 define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
   ; CHECK-LABEL: test_x86_avx512_mask_psrai_d
-  ; CHECK: vpsrad $7, %zmm0, %zmm1 {%k1}  
+  ; CHECK: vpsrad $7, %zmm0, %zmm1 {%k1}
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
   ret <16 x i32> %res
 }
@@ -1075,7 +1141,7 @@ define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
 
 define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
   ; CHECK-LABEL: test_x86_avx512_mask_psrai_q
-  ; CHECK: vpsraq $7, %zmm0, %zmm1 {%k1}   
+  ; CHECK: vpsraq $7, %zmm0, %zmm1 {%k1}
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
   ret <8 x i64> %res
 }
@@ -1088,3 +1154,455 @@ define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
 }
 
 declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psll_d
+  ; CHECK: vpslld
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psll_d
+  ; CHECK: vpslld %xmm1, %zmm0, %zmm2 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psll_d
+  ; CHECK: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psll_q
+  ; CHECK: vpsllq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psll_q
+  ; CHECK: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psll_q
+  ; CHECK: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psrl_d
+  ; CHECK: vpsrld
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrl_d
+  ; CHECK: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrl_d
+  ; CHECK: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psrl_q
+  ; CHECK: vpsrlq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrl_q
+  ; CHECK: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrl_q
+  ; CHECK: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psra_d
+  ; CHECK: vpsrad
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psra_d
+  ; CHECK: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psra_d
+  ; CHECK: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psra_q
+  ; CHECK: vpsraq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psra_q
+  ; CHECK: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psra_q
+  ; CHECK: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psllv_d
+  ; CHECK: vpsllvd
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psllv_d
+  ; CHECK: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psllv_d
+  ; CHECK: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psllv_q
+  ; CHECK: vpsllvq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psllv_q
+  ; CHECK: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psllv_q
+  ; CHECK: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
+
+
+define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psrav_d
+  ; CHECK: vpsravd
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrav_d
+  ; CHECK: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrav_d
+  ; CHECK: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psrav_q
+  ; CHECK: vpsravq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrav_q
+  ; CHECK: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrav_q
+  ; CHECK: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psrlv_d
+  ; CHECK: vpsrlvd
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrlv_d
+  ; CHECK: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d
+  ; CHECK: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psrlv_q
+  ; CHECK: vpsrlvq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrlv_q
+  ; CHECK: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q
+  ; CHECK: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, <8 x i64>* %ptr) {
+  ; CHECK-LABEL: test_x86_avx512_psrlv_q_memop
+  ; CHECK: vpsrlvq (%
+  %b = load <8 x i64>* %ptr
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
+  ; CHECK-LABEL: test_vsubps_rn
+  ; CHECK: vsubps {rn-sae}{{.*}} ## encoding: [0x62,0xf1,0x7c,0x18,0x5c,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 0)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
+  ; CHECK-LABEL: test_vsubps_rd
+  ; CHECK: vsubps {rd-sae}{{.*}} ## encoding: [0x62,0xf1,0x7c,0x38,0x5c,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 1)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
+  ; CHECK-LABEL: test_vsubps_ru
+  ; CHECK: vsubps {ru-sae}{{.*}} ## encoding: [0x62,0xf1,0x7c,0x58,0x5c,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
+  ; CHECK-LABEL: test_vsubps_rz
+  ; CHECK: vsubps {rz-sae}{{.*}} ## encoding: [0x62,0xf1,0x7c,0x78,0x5c,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 3)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
+  ; CHECK-LABEL: test_vmulps_rn
+  ; CHECK: vmulps {rn-sae}{{.*}} ## encoding: [0x62,0xf1,0x7c,0x18,0x59,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 0)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
+  ; CHECK-LABEL: test_vmulps_rd
+  ; CHECK: vmulps {rd-sae}{{.*}} ## encoding: [0x62,0xf1,0x7c,0x38,0x59,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 1)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
+  ; CHECK-LABEL: test_vmulps_ru
+  ; CHECK: vmulps {ru-sae}{{.*}} ## encoding: [0x62,0xf1,0x7c,0x58,0x59,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
+  ; CHECK-LABEL: test_vmulps_rz
+  ; CHECK: vmulps {rz-sae}{{.*}} ## encoding: [0x62,0xf1,0x7c,0x78,0x59,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 3)
+  ret <16 x float> %res
+}
+
+;; mask float
+define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_vmulps_mask_rn
+  ; CHECK: vmulps {rn-sae}{{.*}}{%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x59,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 %mask, i32 0)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_vmulps_mask_rd
+  ; CHECK: vmulps {rd-sae}{{.*}}{%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x59,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 %mask, i32 1)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_vmulps_mask_ru
+  ; CHECK: vmulps {ru-sae}{{.*}}{%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x59,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 %mask, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_vmulps_mask_rz
+  ; CHECK: vmulps {rz-sae}{{.*}}{%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xf9,0x59,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 %mask, i32 3)
+  ret <16 x float> %res
+}
+
+;; With Passthru value
+define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
+  ; CHECK-LABEL: test_vmulps_mask_passthru_rn
+  ; CHECK: vmulps {rn-sae}{{.*}}{%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x59,0xd1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> %passthru, i16 %mask, i32 0)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
+  ; CHECK-LABEL: test_vmulps_mask_passthru_rd
+  ; CHECK: vmulps {rd-sae}{{.*}}{%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x59,0xd1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> %passthru, i16 %mask, i32 1)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
+  ; CHECK-LABEL: test_vmulps_mask_passthru_ru
+  ; CHECK: vmulps {ru-sae}{{.*}}{%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x59,0xd1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> %passthru, i16 %mask, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
+  ; CHECK-LABEL: test_vmulps_mask_passthru_rz
+  ; CHECK: vmulps {rz-sae}{{.*}}{%k1} ## encoding: [0x62,0xf1,0x7c,0x79,0x59,0xd1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> %passthru, i16 %mask, i32 3)
+  ret <16 x float> %res
+}
+
+;; mask double
+define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_vmulpd_mask_rn
+  ; CHECK: vmulpd {rn-sae}{{.*}}{%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0x59,0xc1]
+  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
+                    <8 x double> zeroinitializer, i8 %mask, i32 0)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_vmulpd_mask_rd
+  ; CHECK: vmulpd {rd-sae}{{.*}}{%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0x59,0xc1]
+  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
+                    <8 x double> zeroinitializer, i8 %mask, i32 1)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_vmulpd_mask_ru
+  ; CHECK: vmulpd {ru-sae}{{.*}}{%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0x59,0xc1]
+  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
+                    <8 x double> zeroinitializer, i8 %mask, i32 2)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_vmulpd_mask_rz
+  ; CHECK: vmulpd {rz-sae}{{.*}}{%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xf9,0x59,0xc1]
+  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
+                    <8 x double> zeroinitializer, i8 %mask, i32 3)
+  ret <8 x double> %res
+}
diff --git a/test/CodeGen/X86/avx512-logic.ll b/test/CodeGen/X86/avx512-logic.ll
new file mode 100644
index 0000000..bee4f52
--- /dev/null
+++ b/test/CodeGen/X86/avx512-logic.ll
@@ -0,0 +1,101 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: vpandd
+; CHECK: vpandd %zmm
+; CHECK: ret
+define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+                            i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %x = and <16 x i32> %a2, %b
+  ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpord
+; CHECK: vpord %zmm
+; CHECK: ret
+define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+                            i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %x = or <16 x i32> %a2, %b
+  ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpxord
+; CHECK: vpxord %zmm
+; CHECK: ret
+define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+                            i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %x = xor <16 x i32> %a2, %b
+  ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpandq
+; CHECK: vpandq %zmm
+; CHECK: ret
+define <8 x i64> @vpandq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %x = and <8 x i64> %a2, %b
+  ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vporq
+; CHECK: vporq %zmm
+; CHECK: ret
+define <8 x i64> @vporq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %x = or <8 x i64> %a2, %b
+  ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vpxorq
+; CHECK: vpxorq %zmm
+; CHECK: ret
+define <8 x i64> @vpxorq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %x = xor <8 x i64> %a2, %b
+  ret <8 x i64> %x
+}
+
+
+; CHECK-LABEL: orq_broadcast
+; CHECK: vporq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK: ret
+define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
+  %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  ret <8 x i64> %b
+}
+
+; CHECK-LABEL: andd512fold
+; CHECK: vpandd (%
+; CHECK: ret
+define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
+entry:
+  %a = load <16 x i32>* %x, align 4
+  %b = and <16 x i32> %y, %a
+  ret <16 x i32> %b
+}
+
+; CHECK-LABEL: andqbrst
+; CHECK: vpandq  (%rdi){1to8}, %zmm
+; CHECK: ret
+define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
+entry:
+  %a = load i64* %ap, align 8
+  %b = insertelement <8 x i64> undef, i64 %a, i32 0
+  %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
+  %d = and <8 x i64> %p1, %c
+  ret <8 x i64>%d
+}
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 35d3348..264d915 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -1,28 +1,37 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL --check-prefix=CHECK
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX --check-prefix=CHECK
 
+; CHECK-LABEL: mask16
+; CHECK: kmovw
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw
 define i16 @mask16(i16 %x) {
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %ret = bitcast <16 x i1> %m1 to i16
   ret i16 %ret
-; CHECK-LABEL: mask16
-; CHECK: kmovw
-; CHECK-NEXT: knotw
-; CHECK-NEXT: kmovw
-; CHECK: ret
 }
 
+; CHECK-LABEL: mask8
+; KNL: kmovw
+; KNL-NEXT: knotw
+; KNL-NEXT: kmovw
+; SKX: kmovb
+; SKX-NEXT: knotb
+; SKX-NEXT: kmovb
+
 define i8 @mask8(i8 %x) {
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %ret = bitcast <8 x i1> %m1 to i8
   ret i8 %ret
-; CHECK-LABEL: mask8
-; CHECK: kmovw
+}
+
+; CHECK-LABEL: mask16_mem
+; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
 ; CHECK-NEXT: knotw
-; CHECK-NEXT: kmovw
+; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
 ; CHECK: ret
-}
 
 define void @mask16_mem(i16* %ptr) {
   %x = load i16* %ptr, align 4
@@ -31,13 +40,16 @@ define void @mask16_mem(i16* %ptr) {
   %ret = bitcast <16 x i1> %m1 to i16
   store i16 %ret, i16* %ptr, align 4
   ret void
-; CHECK-LABEL: mask16_mem
-; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
-; CHECK-NEXT: knotw
-; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
-; CHECK: ret
 }
 
+; CHECK-LABEL: mask8_mem
+; KNL: kmovw ([[ARG1]]), %k{{[0-7]}}
+; KNL-NEXT: knotw
+; KNL-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
+; SKX: kmovb ([[ARG1]]), %k{{[0-7]}}
+; SKX-NEXT: knotb
+; SKX-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]])
+
 define void @mask8_mem(i8* %ptr) {
   %x = load i8* %ptr, align 4
   %m0 = bitcast i8 %x to <8 x i1>
@@ -45,13 +57,12 @@ define void @mask8_mem(i8* %ptr) {
   %ret = bitcast <8 x i1> %m1 to i8
   store i8 %ret, i8* %ptr, align 4
   ret void
-; CHECK-LABEL: mask8_mem
-; CHECK: kmovw ([[ARG1]]), %k{{[0-7]}}
-; CHECK-NEXT: knotw
-; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
-; CHECK: ret
 }
 
+; CHECK-LABEL: mand16
+; CHECK: kandw
+; CHECK: kxorw
+; CHECK: korw
 define i16 @mand16(i16 %x, i16 %y) {
   %ma = bitcast i16 %x to <16 x i1>
   %mb = bitcast i16 %y to <16 x i1>
@@ -59,15 +70,11 @@ define i16 @mand16(i16 %x, i16 %y) {
   %md = xor <16 x i1> %ma, %mb
   %me = or <16 x i1> %mc, %md
   %ret = bitcast <16 x i1> %me to i16
-; CHECK: kandw
-; CHECK: kxorw
-; CHECK: korw
   ret i16 %ret
 }
 
-; CHECK: shuf_test1
+; CHECK-LABEL: shuf_test1
 ; CHECK: kshiftrw        $8
-; CHECK:ret
 define i8 @shuf_test1(i16 %v) nounwind {
    %v1 = bitcast i16 %v to <16 x i1>
    %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -75,11 +82,11 @@ define i8 @shuf_test1(i16 %v) nounwind {
    ret i8 %mask1
 }
 
-; CHECK: zext_test1
+; CHECK-LABEL: zext_test1
 ; CHECK: kshiftlw
 ; CHECK: kshiftrw
 ; CHECK: kmovw
-; CHECK:ret
+
 define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
@@ -87,11 +94,11 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
   ret i32 %res
 }
 
-; CHECK: zext_test2
+; CHECK-LABEL: zext_test2
 ; CHECK: kshiftlw
 ; CHECK: kshiftrw
 ; CHECK: kmovw
-; CHECK:ret
+
 define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
@@ -99,14 +106,29 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
   ret i16 %res
 }
 
-; CHECK: zext_test3
+; CHECK-LABEL: zext_test3
 ; CHECK: kshiftlw
 ; CHECK: kshiftrw
 ; CHECK: kmovw
-; CHECK:ret
+
 define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
   %res = zext i1 %cmp_res.i1 to i8
   ret i8 %res
 }
+
+; CHECK-LABEL: conv1
+; KNL: kmovw   %k0, %eax
+; KNL: movb    %al, (%rdi)
+; SKX: kmovb   %k0, (%rdi)
+define i8 @conv1(<8 x i1>* %R) {
+entry:
+  store <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %R
+
+  %maskPtr = alloca <8 x i1>
+  store <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %maskPtr
+  %mask = load <8 x i1>* %maskPtr
+  %mask_convert = bitcast <8 x i1> %mask to i8
+  ret i8 %mask_convert
+}
+\ No newline at end of file
diff --git a/test/CodeGen/X86/avx512-nontemporal.ll b/test/CodeGen/X86/avx512-nontemporal.ll
index ef50cdb..bf57d02 100644
--- a/test/CodeGen/X86/avx512-nontemporal.ll
+++ b/test/CodeGen/X86/avx512-nontemporal.ll
@@ -16,4 +16,4 @@ define void @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x
   ret void
 }
 
-!0 = metadata !{i32 1}
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx512-round.ll b/test/CodeGen/X86/avx512-round.ll
new file mode 100644
index 0000000..ffeb2a8
--- /dev/null
+++ b/test/CodeGen/X86/avx512-round.ll
@@ -0,0 +1,106 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+
+define <16 x float> @floor_v16f32(<16 x float> %a) {
+; CHECK-LABEL: floor_v16f32
+; CHECK: vrndscaleps $1, {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x01]
+  %res = call <16 x float> @llvm.floor.v16f32(<16 x float> %a)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.floor.v16f32(<16 x float> %p)
+
+define <8 x double> @floor_v8f64(<8 x double> %a) {
+; CHECK-LABEL: floor_v8f64
+; CHECK: vrndscalepd $1, {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x01]
+  %res = call <8 x double> @llvm.floor.v8f64(<8 x double> %a)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.floor.v8f64(<8 x double> %p)
+
+define <16 x float> @ceil_v16f32(<16 x float> %a) {
+; CHECK-LABEL: ceil_v16f32
+; CHECK: vrndscaleps $2, {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x02]
+  %res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %a)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.ceil.v16f32(<16 x float> %p)
+
+define <8 x double> @ceil_v8f64(<8 x double> %a) {
+; CHECK-LABEL: ceil_v8f64
+; CHECK: vrndscalepd $2, {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x02]
+  %res = call <8 x double> @llvm.ceil.v8f64(<8 x double> %a)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.ceil.v8f64(<8 x double> %p)
+
+define <16 x float> @trunc_v16f32(<16 x float> %a) {
+; CHECK-LABEL: trunc_v16f32
+; CHECK: vrndscaleps $3, {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x03]
+  %res = call <16 x float> @llvm.trunc.v16f32(<16 x float> %a)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
+
+define <8 x double> @trunc_v8f64(<8 x double> %a) {
+; CHECK-LABEL: trunc_v8f64
+; CHECK: vrndscalepd $3, {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x03]
+  %res = call <8 x double> @llvm.trunc.v8f64(<8 x double> %a)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.trunc.v8f64(<8 x double> %p)
+
+define <16 x float> @rint_v16f32(<16 x float> %a) {
+; CHECK-LABEL: rint_v16f32
+; CHECK: vrndscaleps $4, {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x04]
+  %res = call <16 x float> @llvm.rint.v16f32(<16 x float> %a)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.rint.v16f32(<16 x float> %p)
+
+define <8 x double> @rint_v8f64(<8 x double> %a) {
+; CHECK-LABEL: rint_v8f64
+; CHECK: vrndscalepd $4, {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x04]
+  %res = call <8 x double> @llvm.rint.v8f64(<8 x double> %a)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.rint.v8f64(<8 x double> %p)
+
+define <16 x float> @nearbyint_v16f32(<16 x float> %a) {
+; CHECK-LABEL: nearbyint_v16f32
+; CHECK: vrndscaleps $12, {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x0c]
+  %res = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %a)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
+
+define <8 x double> @nearbyint_v8f64(<8 x double> %a) {
+; CHECK-LABEL: nearbyint_v8f64
+; CHECK: vrndscalepd $12, {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x0c]
+  %res = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %a)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)
+
+define double @nearbyint_f64(double %a) {
+; CHECK-LABEL: nearbyint_f64
+; CHECK: vrndscalesd $12, {{.*}}encoding: [0x62,0xf3,0xfd,0x08,0x0b,0xc0,0x0c]
+  %res = call double @llvm.nearbyint.f64(double %a)
+  ret double %res
+}
+declare double @llvm.nearbyint.f64(double %p)
+
+define float @floor_f32(float %a) {
+; CHECK-LABEL: floor_f32
+; CHECK: vrndscaless $1, {{.*}}encoding: [0x62,0xf3,0x7d,0x08,0x0a,0xc0,0x01]
+  %res = call float @llvm.floor.f32(float %a)
+  ret float %res
+}
+declare float @llvm.floor.f32(float %p)
+
+define float @floor_f32m(float* %aptr) {
+; CHECK-LABEL: floor_f32m
+; CHECK: vrndscaless $1, (%rdi), {{.*}}encoding: [0x62,0xf3,0x7d,0x08,0x0a,0x07,0x01]
+  %a = load float* %aptr, align 4
+  %res = call float @llvm.floor.f32(float %a)
+  ret float %res
+}
+
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 0b0e0fc..5bb8233 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -20,6 +20,14 @@ define   <8 x i64> @_inreg8xi64(i64 %a) {
   ret <8 x i64> %c
 }
 
+;CHECK-LABEL: _ss16xfloat_v4
+;CHECK: vbroadcastss %xmm0, %zmm0
+;CHECK: ret
+define   <16 x float> @_ss16xfloat_v4(<4 x float> %a) {
+  %b = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float> %b
+}
+
 define   <16 x float> @_inreg16xfloat(float %a) {
 ; CHECK-LABEL: _inreg16xfloat:
 ; CHECK:       ## BB#0:
@@ -30,6 +38,62 @@ define   <16 x float> @_inreg16xfloat(float %a) {
   ret <16 x float> %c
 }
 
+;CHECK-LABEL: _ss16xfloat_mask:
+;CHECK: vbroadcastss %xmm0, %zmm1 {%k1}
+;CHECK: ret
+define   <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i
+  ret <16 x float> %r
+}
+
+;CHECK-LABEL: _ss16xfloat_maskz:
+;CHECK: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+;CHECK: ret
+define   <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer
+  ret <16 x float> %r
+}
+
+;CHECK-LABEL: _ss16xfloat_load:
+;CHECK: vbroadcastss (%{{.*}}, %zmm
+;CHECK: ret
+define   <16 x float> @_ss16xfloat_load(float* %a.ptr) {
+  %a = load float* %a.ptr
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float> %c
+}
+
+;CHECK-LABEL: _ss16xfloat_mask_load:
+;CHECK: vbroadcastss (%rdi), %zmm0 {%k1}
+;CHECK: ret
+define   <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) {
+  %a = load float* %a.ptr
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i
+  ret <16 x float> %r
+}
+
+;CHECK-LABEL: _ss16xfloat_maskz_load:
+;CHECK: vbroadcastss (%rdi), %zmm0 {%k1} {z}
+;CHECK: ret
+define   <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) {
+  %a = load float* %a.ptr
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer
+  ret <16 x float> %r
+}
+
 define   <8 x double> @_inreg8xdouble(double %a) {
 ; CHECK-LABEL: _inreg8xdouble:
 ; CHECK:       ## BB#0:
@@ -40,6 +104,62 @@ define   <8 x double> @_inreg8xdouble(double %a) {
   ret <8 x double> %c
 }
 
+;CHECK-LABEL: _sd8xdouble_mask:
+;CHECK: vbroadcastsd %xmm0, %zmm1 {%k1}
+;CHECK: ret
+define   <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i
+  ret <8 x double> %r
+}
+
+;CHECK-LABEL: _sd8xdouble_maskz:
+;CHECK: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+;CHECK: ret
+define   <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
+
+;CHECK-LABEL: _sd8xdouble_load:
+;CHECK: vbroadcastsd (%rdi), %zmm
+;CHECK: ret
+define   <8 x double> @_sd8xdouble_load(double* %a.ptr) {
+  %a = load double* %a.ptr
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  ret <8 x double> %c
+}
+
+;CHECK-LABEL: _sd8xdouble_mask_load:
+;CHECK: vbroadcastsd (%rdi), %zmm0 {%k1}
+;CHECK: ret
+define   <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) {
+  %a = load double* %a.ptr
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i
+  ret <8 x double> %r
+}
+
+define   <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) {
+; CHECK-LABEL: _sd8xdouble_maskz_load:
+; CHECK:    vbroadcastsd (%rdi), %zmm0 {%k1} {z}
+; CHECK:    ret
+  %a = load double* %a.ptr
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
+
 define   <16 x i32> @_xmm16xi32(<16 x i32> %a) {
 ; CHECK-LABEL: _xmm16xi32:
 ; CHECK:       ## BB#0:
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index c71e60e..b16f5c9 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -37,15 +37,15 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin
   ret <16 x i32> %max
 }
 
-define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
+define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
 ; CHECK-LABEL: test4_unsigned:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k1
-; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %mask = icmp uge <16 x i32> %x, %y
-  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+  %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
   ret <16 x i32> %max
 }
 
@@ -61,15 +61,15 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
   ret <8 x i64> %max
 }
 
-define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y) nounwind {
+define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) nounwind {
 ; CHECK-LABEL: test6_unsigned:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %mask = icmp ugt <8 x i64> %x, %y
-  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+  %max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
   ret <8 x i64> %max
 }
 
@@ -196,15 +196,15 @@ define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
   ret <8 x i64>%res
 }
 
-define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y) nounwind {
+define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
 ; CHECK-LABEL: test16:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpled %zmm0, %zmm1, %k1
-; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %mask = icmp sge <16 x i32> %x, %y
-  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+  %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
   ret <16 x i32> %max
 }
 
diff --git a/test/CodeGen/X86/avx512bw-arith.ll b/test/CodeGen/X86/avx512bw-arith.ll
new file mode 100644
index 0000000..94f68a2
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-arith.ll
@@ -0,0 +1,102 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw| FileCheck %s
+
+; CHECK-LABEL: vpaddb512_test
+; CHECK: vpaddb %zmm{{.*}}
+; CHECK: ret
+define <64 x i8> @vpaddb512_test(<64 x i8> %i, <64 x i8> %j) nounwind readnone {
+  %x = add <64 x i8> %i, %j
+  ret <64 x i8> %x
+}
+
+; CHECK-LABEL: vpaddb512_fold_test
+; CHECK: vpaddb (%rdi), %zmm{{.*}}
+; CHECK: ret
+define <64 x i8> @vpaddb512_fold_test(<64 x i8> %i, <64 x i8>* %j) nounwind {
+  %tmp = load <64 x i8>* %j, align 4
+  %x = add <64 x i8> %i, %tmp
+  ret <64 x i8> %x
+}
+
+; CHECK-LABEL: vpaddw512_test
+; CHECK: vpaddw %zmm{{.*}}
+; CHECK: ret
+define <32 x i16> @vpaddw512_test(<32 x i16> %i, <32 x i16> %j) nounwind readnone {
+  %x = add <32 x i16> %i, %j
+  ret <32 x i16> %x
+}
+
+; CHECK-LABEL: vpaddw512_fold_test
+; CHECK: vpaddw (%rdi), %zmm{{.*}}
+; CHECK: ret
+define <32 x i16> @vpaddw512_fold_test(<32 x i16> %i, <32 x i16>* %j) nounwind {
+  %tmp = load <32 x i16>* %j, align 4
+  %x = add <32 x i16> %i, %tmp
+  ret <32 x i16> %x
+}
+
+; CHECK-LABEL: vpaddw512_mask_test
+; CHECK: vpaddw %zmm{{.*%k[1-7].*}}
+; CHECK: ret
+define <32 x i16> @vpaddw512_mask_test(<32 x i16> %i, <32 x i16> %j, <32 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
+  %x = add <32 x i16> %i, %j
+  %r = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %i
+  ret <32 x i16> %r
+}
+
+; CHECK-LABEL: vpaddw512_maskz_test
+; CHECK: vpaddw %zmm{{.*{%k[1-7]} {z}.*}}
+; CHECK: ret
+define <32 x i16> @vpaddw512_maskz_test(<32 x i16> %i, <32 x i16> %j, <32 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
+  %x = add <32 x i16> %i, %j
+  %r = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+  ret <32 x i16> %r
+}
+
+; CHECK-LABEL: vpaddw512_mask_fold_test
+; CHECK: vpaddw (%rdi), %zmm{{.*%k[1-7]}}
+; CHECK: ret
+define <32 x i16> @vpaddw512_mask_fold_test(<32 x i16> %i, <32 x i16>* %j.ptr, <32 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
+  %j = load <32 x i16>* %j.ptr
+  %x = add <32 x i16> %i, %j
+  %r = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %i
+  ret <32 x i16> %r
+}
+
+; CHECK-LABEL: vpaddw512_maskz_fold_test
+; CHECK: vpaddw (%rdi), %zmm{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <32 x i16> @vpaddw512_maskz_fold_test(<32 x i16> %i, <32 x i16>* %j.ptr, <32 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
+  %j = load <32 x i16>* %j.ptr
+  %x = add <32 x i16> %i, %j
+  %r = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+  ret <32 x i16> %r
+}
+
+; CHECK-LABEL: vpsubb512_test
+; CHECK: vpsubb %zmm{{.*}}
+; CHECK: ret
+define <64 x i8> @vpsubb512_test(<64 x i8> %i, <64 x i8> %j) nounwind readnone {
+  %x = sub <64 x i8> %i, %j
+  ret <64 x i8> %x
+}
+
+; CHECK-LABEL: vpsubw512_test
+; CHECK: vpsubw %zmm{{.*}}
+; CHECK: ret
+define <32 x i16> @vpsubw512_test(<32 x i16> %i, <32 x i16> %j) nounwind readnone {
+  %x = sub <32 x i16> %i, %j
+  ret <32 x i16> %x
+}
+
+; CHECK-LABEL: vpmullw512_test
+; CHECK: vpmullw %zmm{{.*}}
+; CHECK: ret
+define <32 x i16> @vpmullw512_test(<32 x i16> %i, <32 x i16> %j) {
+  %x = mul <32 x i16> %i, %j
+  ret <32 x i16> %x
+}
+
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index bbc418c..308de16 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding| FileCheck %s
 
 define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
 ; CHECK-LABEL: test_pcmpeq_b
@@ -67,28 +67,28 @@ declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
 define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
 ; CHECK_LABEL: test_cmp_b_512
 ; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
-  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
+  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 0, i64 -1)
   %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
 ; CHECK: vpcmpltb %zmm1, %zmm0, %k0 ##
-  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
+  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 1, i64 -1)
   %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
 ; CHECK: vpcmpleb %zmm1, %zmm0, %k0 ##
-  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
+  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 2, i64 -1)
   %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
 ; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 ##
-  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
+  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 3, i64 -1)
   %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
 ; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 ##
-  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
+  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 4, i64 -1)
   %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
 ; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 ##
-  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
+  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 5, i64 -1)
   %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
 ; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 ##
-  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
+  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 6, i64 -1)
   %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
 ; CHECK: vpcmpordb %zmm1, %zmm0, %k0 ##
-  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
+  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 7, i64 -1)
   %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
   ret <8 x i64> %vec7
 }
@@ -96,59 +96,59 @@ define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
 define <8 x i64> @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
 ; CHECK_LABEL: test_mask_cmp_b_512
 ; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
-  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
+  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 0, i64 %mask)
   %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
 ; CHECK: vpcmpltb %zmm1, %zmm0, %k0 {%k1} ##
-  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
+  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 1, i64 %mask)
   %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
 ; CHECK: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ##
-  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
+  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 2, i64 %mask)
   %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
 ; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} ##
-  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
+  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 3, i64 %mask)
   %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
 ; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ##
-  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
+  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 4, i64 %mask)
   %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
 ; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} ##
-  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
+  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 5, i64 %mask)
   %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
 ; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} ##
-  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
+  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 6, i64 %mask)
   %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
 ; CHECK: vpcmpordb %zmm1, %zmm0, %k0 {%k1} ##
-  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
+  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 7, i64 %mask)
   %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
   ret <8 x i64> %vec7
 }
 
-declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
+declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i8, i64) nounwind readnone
 
 define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
 ; CHECK_LABEL: test_ucmp_b_512
 ; CHECK: vpcmpequb %zmm1, %zmm0, %k0 ##
-  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
+  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 0, i64 -1)
   %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
 ; CHECK: vpcmpltub %zmm1, %zmm0, %k0 ##
-  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
+  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 1, i64 -1)
   %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
 ; CHECK: vpcmpleub %zmm1, %zmm0, %k0 ##
-  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
+  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 2, i64 -1)
   %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
 ; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 ##
-  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
+  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 3, i64 -1)
   %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
 ; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 ##
-  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
+  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 4, i64 -1)
   %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
 ; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 ##
-  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
+  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 5, i64 -1)
   %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
 ; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 ##
-  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
+  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 6, i64 -1)
   %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
 ; CHECK: vpcmpordub %zmm1, %zmm0, %k0 ##
-  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
+  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 7, i64 -1)
   %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
   ret <8 x i64> %vec7
 }
@@ -156,59 +156,59 @@ define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
 define <8 x i64> @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
 ; CHECK_LABEL: test_mask_ucmp_b_512
 ; CHECK: vpcmpequb %zmm1, %zmm0, %k0 {%k1} ##
-  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
+  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 0, i64 %mask)
   %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
 ; CHECK: vpcmpltub %zmm1, %zmm0, %k0 {%k1} ##
-  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
+  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 1, i64 %mask)
   %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
 ; CHECK: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ##
-  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
+  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 2, i64 %mask)
   %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
 ; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} ##
-  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
+  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 3, i64 %mask)
   %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
 ; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} ##
-  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
+  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 4, i64 %mask)
   %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
 ; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ##
-  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
+  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 5, i64 %mask)
   %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
 ; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ##
-  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
+  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 6, i64 %mask)
   %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
 ; CHECK: vpcmpordub %zmm1, %zmm0, %k0 {%k1} ##
-  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
+  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 7, i64 %mask)
   %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
   ret <8 x i64> %vec7
 }
 
-declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
+declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i8, i64) nounwind readnone
 
 define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
 ; CHECK_LABEL: test_cmp_w_512
 ; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
-  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 0, i32 -1)
   %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
 ; CHECK: vpcmpltw %zmm1, %zmm0, %k0 ##
-  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 1, i32 -1)
   %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
 ; CHECK: vpcmplew %zmm1, %zmm0, %k0 ##
-  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 2, i32 -1)
   %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
 ; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 ##
-  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 3, i32 -1)
   %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
 ; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 ##
-  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 4, i32 -1)
   %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
 ; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 ##
-  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 5, i32 -1)
   %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
 ; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 ##
-  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 6, i32 -1)
   %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
 ; CHECK: vpcmpordw %zmm1, %zmm0, %k0 ##
-  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 7, i32 -1)
   %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
   ret <8 x i32> %vec7
 }
@@ -216,59 +216,59 @@ define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
 define <8 x i32> @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
 ; CHECK_LABEL: test_mask_cmp_w_512
 ; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
-  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 0, i32 %mask)
   %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
 ; CHECK: vpcmpltw %zmm1, %zmm0, %k0 {%k1} ##
-  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 1, i32 %mask)
   %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
 ; CHECK: vpcmplew %zmm1, %zmm0, %k0 {%k1} ##
-  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 2, i32 %mask)
   %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
 ; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} ##
-  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 3, i32 %mask)
   %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
 ; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} ##
-  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 4, i32 %mask)
   %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
 ; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} ##
-  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 5, i32 %mask)
   %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
 ; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} ##
-  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 6, i32 %mask)
   %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
 ; CHECK: vpcmpordw %zmm1, %zmm0, %k0 {%k1} ##
-  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 7, i32 %mask)
   %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
   ret <8 x i32> %vec7
 }
 
-declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
+declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i8, i32) nounwind readnone
 
 define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
 ; CHECK_LABEL: test_ucmp_w_512
 ; CHECK: vpcmpequw %zmm1, %zmm0, %k0 ##
-  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 0, i32 -1)
   %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
 ; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 ##
-  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 1, i32 -1)
   %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
 ; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 ##
-  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 2, i32 -1)
   %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
 ; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 ##
-  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 3, i32 -1)
   %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
 ; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 ##
-  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 4, i32 -1)
   %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
 ; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 ##
-  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 5, i32 -1)
   %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
 ; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 ##
-  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 6, i32 -1)
   %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
 ; CHECK: vpcmporduw %zmm1, %zmm0, %k0 ##
-  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 7, i32 -1)
   %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
   ret <8 x i32> %vec7
 }
@@ -276,30 +276,78 @@ define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
 define <8 x i32> @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
 ; CHECK_LABEL: test_mask_ucmp_w_512
 ; CHECK: vpcmpequw %zmm1, %zmm0, %k0 {%k1} ##
-  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 0, i32 %mask)
   %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
 ; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} ##
-  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 1, i32 %mask)
   %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
 ; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ##
-  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 2, i32 %mask)
   %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
 ; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} ##
-  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 3, i32 %mask)
   %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
 ; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} ##
-  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 4, i32 %mask)
   %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
 ; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ##
-  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 5, i32 %mask)
   %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
 ; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ##
-  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 6, i32 %mask)
   %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
 ; CHECK: vpcmporduw %zmm1, %zmm0, %k0 {%k1} ##
-  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 7, i32 %mask)
   %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
   ret <8 x i32> %vec7
 }
 
-declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
+declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i8, i32) nounwind readnone
+
+; CHECK-LABEL: test_x86_mask_blend_b_256
+; CHECK: vpblendmb
+define <32 x i8> @test_x86_mask_blend_b_256(i32 %a0, <32 x i8> %a1, <32 x i8> %a2) {
+  %res = call <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8> %a1, <32 x i8> %a2, i32 %a0) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8>, <32 x i8>, i32) nounwind readonly
+
+; CHECK-LABEL: test_x86_mask_blend_w_256
+define <16 x i16> @test_x86_mask_blend_w_256(i16 %mask, <16 x i16> %a1, <16 x i16> %a2) {
+  ; CHECK: vpblendmw
+  %res = call <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16> %a1, <16 x i16> %a2, i16 %mask) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16>, <16 x i16>, i16) nounwind readonly
+
+; CHECK-LABEL: test_x86_mask_blend_b_512
+; CHECK: vpblendmb
+define <64 x i8> @test_x86_mask_blend_b_512(i64 %a0, <64 x i8> %a1, <64 x i8> %a2) {
+  %res = call <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8> %a1, <64 x i8> %a2, i64 %a0) ; <<64 x i8>> [#uses=1]
+  ret <64 x i8> %res
+}
+declare <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8>, <64 x i8>, i64) nounwind readonly
+
+; CHECK-LABEL: test_x86_mask_blend_w_512
+define <32 x i16> @test_x86_mask_blend_w_512(i32 %mask, <32 x i16> %a1, <32 x i16> %a2) {
+  ; CHECK: vpblendmw
+  %res = call <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16> %a1, <32 x i16> %a2, i32 %mask) ; <<32 x i16>> [#uses=1]
+  ret <32 x i16> %res
+}
+declare <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16>, <32 x i16>, i32) nounwind readonly
+
+; CHECK-LABEL: test_x86_mask_blend_b_128
+; CHECK: vpblendmb
+define <16 x i8> @test_x86_mask_blend_b_128(i16 %a0, <16 x i8> %a1, <16 x i8> %a2) {
+  %res = call <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8> %a1, <16 x i8> %a2, i16 %a0) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8>, <16 x i8>, i16) nounwind readonly
+
+; CHECK-LABEL: test_x86_mask_blend_w_128
+define <8 x i16> @test_x86_mask_blend_w_128(i8 %mask, <8 x i16> %a1, <8 x i16> %a2) {
+  ; CHECK: vpblendmw
+  %res = call <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16> %a1, <8 x i16> %a2, i8 %mask) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16>, <8 x i16>, i8) nounwind readonly
diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll
index d2b1724..6ba4db6 100644
--- a/test/CodeGen/X86/avx512bw-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll
@@ -14,9 +14,9 @@ define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; CHECK: vpcmpgtb {{.*%k[0-7]}}
 ; CHECK: vmovdqu8 {{.*}}%k1
 ; CHECK: ret
-define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y) nounwind {
+define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
   %mask = icmp sgt <64 x i8> %x, %y
-  %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
+  %max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
   ret <64 x i8> %max
 }
 
@@ -34,9 +34,9 @@ define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind
 ; CHECK: vpcmpnleub {{.*%k[0-7]}}
 ; CHECK: vmovdqu8 {{.*}}%k1
 ; CHECK: ret
-define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y) nounwind {
+define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
   %mask = icmp ugt <64 x i8> %x, %y
-  %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
+  %max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
   ret <64 x i8> %max
 }
 
diff --git a/test/CodeGen/X86/avx512bwvl-arith.ll b/test/CodeGen/X86/avx512bwvl-arith.ll
new file mode 100644
index 0000000..96f0140
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-arith.ll
@@ -0,0 +1,206 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl| FileCheck %s
+
+; 256-bit
+
+; CHECK-LABEL: vpaddb256_test
+; CHECK: vpaddb %ymm{{.*}}
+; CHECK: ret
+define <32 x i8> @vpaddb256_test(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+  %x = add <32 x i8> %i, %j
+  ret <32 x i8> %x
+}
+
+; CHECK-LABEL: vpaddb256_fold_test
+; CHECK: vpaddb (%rdi), %ymm{{.*}}
+; CHECK: ret
+define <32 x i8> @vpaddb256_fold_test(<32 x i8> %i, <32 x i8>* %j) nounwind {
+  %tmp = load <32 x i8>* %j, align 4
+  %x = add <32 x i8> %i, %tmp
+  ret <32 x i8> %x
+}
+
+; CHECK-LABEL: vpaddw256_test
+; CHECK: vpaddw %ymm{{.*}}
+; CHECK: ret
+define <16 x i16> @vpaddw256_test(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+  %x = add <16 x i16> %i, %j
+  ret <16 x i16> %x
+}
+
+; CHECK-LABEL: vpaddw256_fold_test
+; CHECK: vpaddw (%rdi), %ymm{{.*}}
+; CHECK: ret
+define <16 x i16> @vpaddw256_fold_test(<16 x i16> %i, <16 x i16>* %j) nounwind {
+  %tmp = load <16 x i16>* %j, align 4
+  %x = add <16 x i16> %i, %tmp
+  ret <16 x i16> %x
+}
+
+; CHECK-LABEL: vpaddw256_mask_test
+; CHECK: vpaddw %ymm{{.*%k[1-7].*}}
+; CHECK: ret
+define <16 x i16> @vpaddw256_mask_test(<16 x i16> %i, <16 x i16> %j, <16 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+  %x = add <16 x i16> %i, %j
+  %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %i
+  ret <16 x i16> %r
+}
+
+; CHECK-LABEL: vpaddw256_maskz_test
+; CHECK: vpaddw %ymm{{.*{%k[1-7]} {z}.*}}
+; CHECK: ret
+define <16 x i16> @vpaddw256_maskz_test(<16 x i16> %i, <16 x i16> %j, <16 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+  %x = add <16 x i16> %i, %j
+  %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+  ret <16 x i16> %r
+}
+
+; CHECK-LABEL: vpaddw256_mask_fold_test
+; CHECK: vpaddw (%rdi), %ymm{{.*%k[1-7]}}
+; CHECK: ret
+define <16 x i16> @vpaddw256_mask_fold_test(<16 x i16> %i, <16 x i16>* %j.ptr, <16 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+  %j = load <16 x i16>* %j.ptr
+  %x = add <16 x i16> %i, %j
+  %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %i
+  ret <16 x i16> %r
+}
+
+; CHECK-LABEL: vpaddw256_maskz_fold_test
+; CHECK: vpaddw (%rdi), %ymm{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <16 x i16> @vpaddw256_maskz_fold_test(<16 x i16> %i, <16 x i16>* %j.ptr, <16 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+  %j = load <16 x i16>* %j.ptr
+  %x = add <16 x i16> %i, %j
+  %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+  ret <16 x i16> %r
+}
+
+; CHECK-LABEL: vpsubb256_test
+; CHECK: vpsubb %ymm{{.*}}
+; CHECK: ret
+define <32 x i8> @vpsubb256_test(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+  %x = sub <32 x i8> %i, %j
+  ret <32 x i8> %x
+}
+
+; CHECK-LABEL: vpsubw256_test
+; CHECK: vpsubw %ymm{{.*}}
+; CHECK: ret
+define <16 x i16> @vpsubw256_test(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+  %x = sub <16 x i16> %i, %j
+  ret <16 x i16> %x
+}
+
+; CHECK-LABEL: vpmullw256_test
+; CHECK: vpmullw %ymm{{.*}}
+; CHECK: ret
+define <16 x i16> @vpmullw256_test(<16 x i16> %i, <16 x i16> %j) {
+  %x = mul <16 x i16> %i, %j
+  ret <16 x i16> %x
+}
+
+; 128-bit
+
+; CHECK-LABEL: vpaddb128_test
+; CHECK: vpaddb %xmm{{.*}}
+; CHECK: ret
+define <16 x i8> @vpaddb128_test(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
+  %x = add <16 x i8> %i, %j
+  ret <16 x i8> %x
+}
+
+; CHECK-LABEL: vpaddb128_fold_test
+; CHECK: vpaddb (%rdi), %xmm{{.*}}
+; CHECK: ret
+define <16 x i8> @vpaddb128_fold_test(<16 x i8> %i, <16 x i8>* %j) nounwind {
+  %tmp = load <16 x i8>* %j, align 4
+  %x = add <16 x i8> %i, %tmp
+  ret <16 x i8> %x
+}
+
+; CHECK-LABEL: vpaddw128_test
+; CHECK: vpaddw %xmm{{.*}}
+; CHECK: ret
+define <8 x i16> @vpaddw128_test(<8 x i16> %i, <8 x i16> %j) nounwind readnone {
+  %x = add <8 x i16> %i, %j
+  ret <8 x i16> %x
+}
+
+; CHECK-LABEL: vpaddw128_fold_test
+; CHECK: vpaddw (%rdi), %xmm{{.*}}
+; CHECK: ret
+define <8 x i16> @vpaddw128_fold_test(<8 x i16> %i, <8 x i16>* %j) nounwind {
+  %tmp = load <8 x i16>* %j, align 4
+  %x = add <8 x i16> %i, %tmp
+  ret <8 x i16> %x
+}
+
+; CHECK-LABEL: vpaddw128_mask_test
+; CHECK: vpaddw %xmm{{.*%k[1-7].*}}
+; CHECK: ret
+define <8 x i16> @vpaddw128_mask_test(<8 x i16> %i, <8 x i16> %j, <8 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+  %x = add <8 x i16> %i, %j
+  %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %i
+  ret <8 x i16> %r
+}
+
+; CHECK-LABEL: vpaddw128_maskz_test
+; CHECK: vpaddw %xmm{{.*{%k[1-7]} {z}.*}}
+; CHECK: ret
+define <8 x i16> @vpaddw128_maskz_test(<8 x i16> %i, <8 x i16> %j, <8 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+  %x = add <8 x i16> %i, %j
+  %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
+  ret <8 x i16> %r
+}
+
+; CHECK-LABEL: vpaddw128_mask_fold_test
+; CHECK: vpaddw (%rdi), %xmm{{.*%k[1-7]}}
+; CHECK: ret
+define <8 x i16> @vpaddw128_mask_fold_test(<8 x i16> %i, <8 x i16>* %j.ptr, <8 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+  %j = load <8 x i16>* %j.ptr
+  %x = add <8 x i16> %i, %j
+  %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %i
+  ret <8 x i16> %r
+}
+
+; CHECK-LABEL: vpaddw128_maskz_fold_test
+; CHECK: vpaddw (%rdi), %xmm{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <8 x i16> @vpaddw128_maskz_fold_test(<8 x i16> %i, <8 x i16>* %j.ptr, <8 x i16> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+  %j = load <8 x i16>* %j.ptr
+  %x = add <8 x i16> %i, %j
+  %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
+  ret <8 x i16> %r
+}
+
+; CHECK-LABEL: vpsubb128_test
+; CHECK: vpsubb %xmm{{.*}}
+; CHECK: ret
+define <16 x i8> @vpsubb128_test(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
+  %x = sub <16 x i8> %i, %j
+  ret <16 x i8> %x
+}
+
+; CHECK-LABEL: vpsubw128_test
+; CHECK: vpsubw %xmm{{.*}}
+; CHECK: ret
+define <8 x i16> @vpsubw128_test(<8 x i16> %i, <8 x i16> %j) nounwind readnone {
+  %x = sub <8 x i16> %i, %j
+  ret <8 x i16> %x
+}
+
+; CHECK-LABEL: vpmullw128_test
+; CHECK: vpmullw %xmm{{.*}}
+; CHECK: ret
+define <8 x i16> @vpmullw128_test(<8 x i16> %i, <8 x i16> %j) {
+  %x = mul <8 x i16> %i, %j
+  ret <8 x i16> %x
+}
+
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 45f8d6d..dbb9117 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -69,28 +69,28 @@ declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
 define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 ; CHECK_LABEL: test_cmp_b_256
 ; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
-  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 0, i32 -1)
   %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
 ; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
-  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 1, i32 -1)
   %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
 ; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
-  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 2, i32 -1)
   %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
 ; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
-  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 3, i32 -1)
   %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
 ; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
-  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 4, i32 -1)
   %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
 ; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
-  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 5, i32 -1)
   %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
 ; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
-  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 6, i32 -1)
   %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
 ; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
-  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 7, i32 -1)
   %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
   ret <8 x i32> %vec7
 }
@@ -98,59 +98,59 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
 ; CHECK_LABEL: test_mask_cmp_b_256
 ; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
-  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 0, i32 %mask)
   %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
 ; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
-  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 1, i32 %mask)
   %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
 ; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
-  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 2, i32 %mask)
   %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
 ; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
-  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 3, i32 %mask)
   %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
 ; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
-  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 4, i32 %mask)
   %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
 ; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
-  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 5, i32 %mask)
   %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
 ; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
-  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 6, i32 %mask)
   %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
 ; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
-  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 7, i32 %mask)
   %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
   ret <8 x i32> %vec7
 }
 
-declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
+declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i8, i32) nounwind readnone
 
 define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 ; CHECK_LABEL: test_ucmp_b_256
 ; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
-  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 0, i32 -1)
   %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
 ; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
-  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 1, i32 -1)
   %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
 ; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
-  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 2, i32 -1)
   %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
 ; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
-  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 3, i32 -1)
   %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
 ; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
-  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 4, i32 -1)
   %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
 ; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
-  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 5, i32 -1)
   %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
 ; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
-  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 6, i32 -1)
   %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
 ; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
-  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 7, i32 -1)
   %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
   ret <8 x i32> %vec7
 }
@@ -158,59 +158,59 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
 ; CHECK_LABEL: test_mask_ucmp_b_256
 ; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
-  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 0, i32 %mask)
   %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
 ; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
-  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 1, i32 %mask)
   %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
 ; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
-  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 2, i32 %mask)
   %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
 ; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
-  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 3, i32 %mask)
   %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
 ; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
-  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 4, i32 %mask)
   %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
 ; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
-  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 5, i32 %mask)
   %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
 ; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
-  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 6, i32 %mask)
   %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
 ; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
-  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 7, i32 %mask)
   %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
   ret <8 x i32> %vec7
 }
 
-declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
+declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i8, i32) nounwind readnone
 
 define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK_LABEL: test_cmp_w_256
 ; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
-  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
-  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 1, i16 -1)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
-  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 2, i16 -1)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
-  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 3, i16 -1)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
-  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 4, i16 -1)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
-  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 5, i16 -1)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
-  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 6, i16 -1)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
-  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 7, i16 -1)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
@@ -218,59 +218,59 @@ define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
 define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
 ; CHECK_LABEL: test_mask_cmp_w_256
 ; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
-  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
-  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 1, i16 %mask)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
-  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 2, i16 %mask)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
-  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 3, i16 %mask)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
-  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 4, i16 %mask)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
-  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 5, i16 %mask)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
-  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 6, i16 %mask)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
-  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 7, i16 %mask)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
 
-declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
+declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i8, i16) nounwind readnone
 
 define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK_LABEL: test_ucmp_w_256
 ; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
-  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
-  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 1, i16 -1)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
-  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 2, i16 -1)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
-  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 3, i16 -1)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
-  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 4, i16 -1)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
-  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 5, i16 -1)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
-  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 6, i16 -1)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
-  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 7, i16 -1)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
@@ -278,33 +278,33 @@ define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
 define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
 ; CHECK_LABEL: test_mask_ucmp_w_256
 ; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
-  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
-  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 1, i16 %mask)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
-  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 2, i16 %mask)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
-  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 3, i16 %mask)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
-  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 4, i16 %mask)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
-  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 5, i16 %mask)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
-  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 6, i16 %mask)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
-  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 7, i16 %mask)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
 
-declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
+declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i8, i16) nounwind readnone
 
 ; 128-bit
 
@@ -375,28 +375,28 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
 define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK_LABEL: test_cmp_b_128
 ; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
-  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
-  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 1, i16 -1)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
-  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 2, i16 -1)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
-  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 3, i16 -1)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
-  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 4, i16 -1)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
-  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 5, i16 -1)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
-  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 6, i16 -1)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
-  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 7, i16 -1)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
@@ -404,59 +404,59 @@ define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
 define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
 ; CHECK_LABEL: test_mask_cmp_b_128
 ; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
-  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
-  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 1, i16 %mask)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
-  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 2, i16 %mask)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
-  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 3, i16 %mask)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
-  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 4, i16 %mask)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
-  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 5, i16 %mask)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
-  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 6, i16 %mask)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
-  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 7, i16 %mask)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
 
-declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
+declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i8, i16) nounwind readnone
 
 define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK_LABEL: test_ucmp_b_128
 ; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
-  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
-  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 1, i16 -1)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
-  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 2, i16 -1)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
-  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 3, i16 -1)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
-  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 4, i16 -1)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
-  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 5, i16 -1)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
-  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 6, i16 -1)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
-  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 7, i16 -1)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
@@ -464,59 +464,59 @@ define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
 define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
 ; CHECK_LABEL: test_mask_ucmp_b_128
 ; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
-  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
 ; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
-  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 1, i16 %mask)
   %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
 ; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
-  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 2, i16 %mask)
   %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
 ; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
-  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 3, i16 %mask)
   %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
 ; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
-  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 4, i16 %mask)
   %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
 ; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
-  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 5, i16 %mask)
   %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
 ; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
-  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 6, i16 %mask)
   %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
 ; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
-  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 7, i16 %mask)
   %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
   ret <8 x i16> %vec7
 }
 
-declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
+declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i8, i16) nounwind readnone
 
 define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK_LABEL: test_cmp_w_128
 ; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
@@ -524,59 +524,59 @@ define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
 define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
 ; CHECK_LABEL: test_mask_cmp_w_128
 ; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i8, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK_LABEL: test_ucmp_w_128
 ; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
@@ -584,30 +584,415 @@ define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
 define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
 ; CHECK_LABEL: test_mask_ucmp_w_128
 ; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i8, i8) nounwind readnone
+
+declare <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd256_ps
+  ; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
+  %res = call <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+  ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd128_ps
+  ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmadd256_pd:
+; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+  %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
+  ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmadd128_pd:
+; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+  %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
+  ret <2 x double> %res
+}
+
+declare <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsub256_ps
+  ; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xaa,0xc2]
+  %res = call <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+  ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsub128_ps
+  ; CHECK: vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaa,0xc2]
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsub256_pd
+  ; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xaa,0xc2]
+  %res = call <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mask_vfmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsub128_pd
+  ; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaa,0xc2]
+  %res = call <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %res
+}
+
+declare <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmadd256_ps
+  ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
+  %res = call <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+  ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmadd128_ps
+  ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
+  %res = call <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmadd256_pd
+  ; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
+  %res = call <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmadd128_pd
+  ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
+  %res = call <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %res
+}
+
+declare <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmsub256_ps
+  ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
+  %res = call <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+  ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmsub128_ps
+  ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
+  %res = call <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmsub256_pd
+  ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
+  %res = call <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfnmsub128_pd
+  ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
+  %res = call <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %res
+}
+
+declare <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmaddsub256_ps:
+; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2]
+  %res = call <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
+  ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmaddsub128_ps:
+; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2]
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
+  ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmaddsub256_pd
+  ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2]
+  %res = call <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmaddsub128_pd
+  ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2]
+  %res = call <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %res
+}
+
+declare <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfmsubadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsubadd256_ps
+  ; CHECK: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa7,0xc2]
+  %res = call <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+  ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfmsubadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsubadd128_ps
+  ; CHECK: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa7,0xc2]
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfmsubadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsubadd256_pd
+  ; CHECK: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa7,0xc2]
+  %res = call <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mask_vfmsubadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsubadd128_pd
+  ; CHECK: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0xc2]
+  %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_vfmsubadd128rm_pd(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsubadd128rm_pd
+  ; CHECK: vfmsubadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0x07]
+  %a2 = load <2 x double>* %ptr_a2
+  %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+define <8 x double> @test_mask_vfmsubaddrm_pd(<8 x double> %a0, <8 x double> %a1, <8 x double>* %ptr_a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmsubaddrm_pd
+  ; CHECK: vfmsubadd213pd  (%rdi), %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa7,0x07]
+  %a2 = load <8 x double>* %ptr_a2, align 8
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  ret <8 x double> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd128_ps_r
+  ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ; CHECK-LABEL: test_mask_vfmadd128_ps_rz
+  ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk
+  ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+  %a2 = load <4 x float>* %ptr_a2
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka
+  ; CHECK: vfmadd213ps     (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+  %a2 = load <4 x float>* %ptr_a2, align 8
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
+  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz
+  ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
+  %a2 = load <4 x float>* %ptr_a2
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
+  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza
+  ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
+  %a2 = load <4 x float>* %ptr_a2, align 4
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb
+  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+  %q = load float* %ptr_a2
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba
+  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+  %q = load float* %ptr_a2, align 4
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
+  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz
+  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0  ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+  %q = load float* %ptr_a2
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
+  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza
+  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0  ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+  %q = load float* %ptr_a2, align 4
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
+  ret <4 x float> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd128_pd_r
+  ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+  %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ; CHECK-LABEL: test_mask_vfmadd128_pd_rz
+  ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+  %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk
+  ; CHECK: vfmadd213pd	(%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
+  %a2 = load <2 x double>* %ptr_a2
+  %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) {
+  ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz
+  ; CHECK: vfmadd213pd	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
+  %a2 = load <2 x double>* %ptr_a2
+  %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
+  ret <2 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd256_pd_r
+  ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+  %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+  ; CHECK-LABEL: test_mask_vfmadd256_pd_rz
+  ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+  %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) {
+  ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk
+  ; CHECK: vfmadd213pd	(%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
+  %a2 = load <4 x double>* %ptr_a2
+  %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) {
+  ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz
+  ; CHECK: vfmadd213pd	(%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
+  %a2 = load <4 x double>* %ptr_a2
+  %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
+  ret <4 x double> %res
+}
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
index 0000ece..fa4352e 100644
--- a/test/CodeGen/X86/avx512er-intrinsics.ll
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -64,16 +64,53 @@ define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
 declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
-  ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
+  ; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
   %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
 
 define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
-  ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
+  ; CHECK: vrcp28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
   %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
 
+define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) {
+  ; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ; 
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) {
+  ; CHECK: vrsqrt28ss %xmm1, %xmm0, %xmm2 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ;
+  ret <4 x float> %res
+}
+
+define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) {
+  ; CHECK: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ; 
+  ret <2 x double> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr ) {
+  ; CHECK: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
+  %mem = load double * %ptr, align 8
+  %mem_v = insertelement <2 x double> undef, double %mem, i32 0
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ; 
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr ) {
+  ; CHECK: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
+  %ptr1 = getelementptr double* %ptr, i32 18
+  %mem = load double * %ptr1, align 8
+  %mem_v = insertelement <2 x double> undef, double %mem, i32 0
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ;
+  ret <2 x double> %res
+}
+
diff --git a/test/CodeGen/X86/avx512vl-arith.ll b/test/CodeGen/X86/avx512vl-arith.ll
new file mode 100644
index 0000000..1f7da78
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-arith.ll
@@ -0,0 +1,794 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl| FileCheck %s
+
+; 256-bit
+
+; CHECK-LABEL: vpaddq256_test
+; CHECK: vpaddq %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @vpaddq256_test(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+  %x = add <4 x i64> %i, %j
+  ret <4 x i64> %x
+}
+
+; CHECK-LABEL: vpaddq256_fold_test
+; CHECK: vpaddq (%rdi), %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @vpaddq256_fold_test(<4 x i64> %i, <4 x i64>* %j) nounwind {
+  %tmp = load <4 x i64>* %j, align 4
+  %x = add <4 x i64> %i, %tmp
+  ret <4 x i64> %x
+}
+
+; CHECK-LABEL: vpaddq256_broadcast_test
+; CHECK: vpaddq LCP{{.*}}(%rip){1to4}, %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @vpaddq256_broadcast_test(<4 x i64> %i) nounwind {
+  %x = add <4 x i64> %i, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %x
+}
+
+; CHECK-LABEL: vpaddq256_broadcast2_test
+; CHECK: vpaddq (%rdi){1to4}, %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @vpaddq256_broadcast2_test(<4 x i64> %i, i64* %j.ptr) nounwind {
+  %j = load i64* %j.ptr
+  %j.0 = insertelement <4 x i64> undef, i64 %j, i32 0
+  %j.v = shufflevector <4 x i64> %j.0, <4 x i64> undef, <4 x i32> zeroinitializer
+  %x = add <4 x i64> %i, %j.v
+  ret <4 x i64> %x
+}
+
+; CHECK-LABEL: vpaddd256_test
+; CHECK: vpaddd %ymm{{.*}}
+; CHECK: ret
+define <8 x i32> @vpaddd256_test(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+  %x = add <8 x i32> %i, %j
+  ret <8 x i32> %x
+}
+
+; CHECK-LABEL: vpaddd256_fold_test
+; CHECK: vpaddd (%rdi), %ymm{{.*}}
+; CHECK: ret
+define <8 x i32> @vpaddd256_fold_test(<8 x i32> %i, <8 x i32>* %j) nounwind {
+  %tmp = load <8 x i32>* %j, align 4
+  %x = add <8 x i32> %i, %tmp
+  ret <8 x i32> %x
+}
+
+; CHECK-LABEL: vpaddd256_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to8}, %ymm{{.*}}
+; CHECK: ret
+define <8 x i32> @vpaddd256_broadcast_test(<8 x i32> %i) nounwind {
+  %x = add <8 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %x
+}
+
+; CHECK-LABEL: vpaddd256_mask_test
+; CHECK: vpaddd %ymm{{.*%k[1-7].*}}
+; CHECK: ret
+define <8 x i32> @vpaddd256_mask_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %x = add <8 x i32> %i, %j
+  %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %i
+  ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd256_maskz_test
+; CHECK: vpaddd %ymm{{.*{%k[1-7]} {z}.*}}
+; CHECK: ret
+define <8 x i32> @vpaddd256_maskz_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %x = add <8 x i32> %i, %j
+  %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
+  ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd256_mask_fold_test
+; CHECK: vpaddd (%rdi), %ymm{{.*%k[1-7]}}
+; CHECK: ret
+define <8 x i32> @vpaddd256_mask_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %j = load <8 x i32>* %j.ptr
+  %x = add <8 x i32> %i, %j
+  %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %i
+  ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd256_mask_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to8}, %ymm{{.*{%k[1-7]}}}
+; CHECK: ret
+define <8 x i32> @vpaddd256_mask_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %x = add <8 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %i
+  ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd256_maskz_fold_test
+; CHECK: vpaddd (%rdi), %ymm{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <8 x i32> @vpaddd256_maskz_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %j = load <8 x i32>* %j.ptr
+  %x = add <8 x i32> %i, %j
+  %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
+  ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd256_maskz_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to8}, %ymm{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <8 x i32> @vpaddd256_maskz_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %x = add <8 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
+  ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vpsubq256_test
+; CHECK: vpsubq %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @vpsubq256_test(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+  %x = sub <4 x i64> %i, %j
+  ret <4 x i64> %x
+}
+
+; CHECK-LABEL: vpsubd256_test
+; CHECK: vpsubd %ymm{{.*}}
+; CHECK: ret
+define <8 x i32> @vpsubd256_test(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+  %x = sub <8 x i32> %i, %j
+  ret <8 x i32> %x
+}
+
+; CHECK-LABEL: vpmulld256_test
+; CHECK: vpmulld %ymm{{.*}}
+; CHECK: ret
+define <8 x i32> @vpmulld256_test(<8 x i32> %i, <8 x i32> %j) {
+  %x = mul <8 x i32> %i, %j
+  ret <8 x i32> %x
+}
+
+; CHECK-LABEL: test_vaddpd_256
+; CHECK: vaddpd{{.*}}
+; CHECK: ret
+define <4 x double> @test_vaddpd_256(<4 x double> %y, <4 x double> %x) {
+entry:
+  %add.i = fadd <4 x double> %x, %y
+  ret <4 x double> %add.i
+}
+
+; CHECK-LABEL: test_fold_vaddpd_256
+; CHECK: vaddpd LCP{{.*}}(%rip){{.*}}
+; CHECK: ret
+define <4 x double> @test_fold_vaddpd_256(<4 x double> %y) {
+entry:
+  %add.i = fadd <4 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 4.500000e+00, double 5.600000e+00>
+  ret <4 x double> %add.i
+}
+
+; CHECK-LABEL: test_broadcast_vaddpd_256
+; CHECK: LCP{{.*}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK: ret
+define <8 x float> @test_broadcast_vaddpd_256(<8 x float> %a) nounwind {
+  %b = fadd <8 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+  ret <8 x float> %b
+}
+
+; CHECK-LABEL: test_mask_vaddps_256
+; CHECK: vaddps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <8 x float> @test_mask_vaddps_256(<8 x float> %dst, <8 x float> %i,
+                                        <8 x float> %j, <8 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %x = fadd <8 x float> %i, %j
+  %r = select <8 x i1> %mask, <8 x float> %x, <8 x float> %dst
+  ret <8 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vmulps_256
+; CHECK: vmulps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <8 x float> @test_mask_vmulps_256(<8 x float> %dst, <8 x float> %i,
+                                        <8 x float> %j, <8 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %x = fmul <8 x float> %i, %j
+  %r = select <8 x i1> %mask, <8 x float> %x, <8 x float> %dst
+  ret <8 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vminps_256
+; CHECK: vminps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <8 x float> @test_mask_vminps_256(<8 x float> %dst, <8 x float> %i,
+                                        <8 x float> %j, <8 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %cmp_res = fcmp olt <8 x float> %i, %j
+  %min = select <8 x i1> %cmp_res, <8 x float> %i, <8 x float> %j
+  %r = select <8 x i1> %mask, <8 x float> %min, <8 x float> %dst
+  ret <8 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vmaxps_256
+; CHECK: vmaxps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <8 x float> @test_mask_vmaxps_256(<8 x float> %dst, <8 x float> %i,
+                                        <8 x float> %j, <8 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %cmp_res = fcmp ogt <8 x float> %i, %j
+  %max = select <8 x i1> %cmp_res, <8 x float> %i, <8 x float> %j
+  %r = select <8 x i1> %mask, <8 x float> %max, <8 x float> %dst
+  ret <8 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vsubps_256
+; CHECK: vsubps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <8 x float> @test_mask_vsubps_256(<8 x float> %dst, <8 x float> %i,
+                                        <8 x float> %j, <8 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %x = fsub <8 x float> %i, %j
+  %r = select <8 x i1> %mask, <8 x float> %x, <8 x float> %dst
+  ret <8 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vdivps_256
+; CHECK: vdivps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <8 x float> @test_mask_vdivps_256(<8 x float> %dst, <8 x float> %i,
+                                        <8 x float> %j, <8 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %x = fdiv <8 x float> %i, %j
+  %r = select <8 x i1> %mask, <8 x float> %x, <8 x float> %dst
+  ret <8 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vmulpd_256
+; CHECK: vmulpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x double> @test_mask_vmulpd_256(<4 x double> %dst, <4 x double> %i,
+                                        <4 x double> %j, <4 x i64> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %x = fmul <4 x double> %i, %j
+  %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %dst
+  ret <4 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vminpd_256
+; CHECK: vminpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x double> @test_mask_vminpd_256(<4 x double> %dst, <4 x double> %i,
+                                        <4 x double> %j, <4 x i64> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %cmp_res = fcmp olt <4 x double> %i, %j
+  %min = select <4 x i1> %cmp_res, <4 x double> %i, <4 x double> %j
+  %r = select <4 x i1> %mask, <4 x double> %min, <4 x double> %dst
+  ret <4 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vmaxpd_256
+; CHECK: vmaxpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x double> @test_mask_vmaxpd_256(<4 x double> %dst, <4 x double> %i,
+                                        <4 x double> %j, <4 x i64> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %cmp_res = fcmp ogt <4 x double> %i, %j
+  %max = select <4 x i1> %cmp_res, <4 x double> %i, <4 x double> %j
+  %r = select <4 x i1> %mask, <4 x double> %max, <4 x double> %dst
+  ret <4 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vsubpd_256
+; CHECK: vsubpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x double> @test_mask_vsubpd_256(<4 x double> %dst, <4 x double> %i,
+                                        <4 x double> %j, <4 x i64> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %x = fsub <4 x double> %i, %j
+  %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %dst
+  ret <4 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vdivpd_256
+; CHECK: vdivpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x double> @test_mask_vdivpd_256(<4 x double> %dst, <4 x double> %i,
+                                        <4 x double> %j, <4 x i64> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %x = fdiv <4 x double> %i, %j
+  %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %dst
+  ret <4 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vaddpd_256
+; CHECK: vaddpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x double> @test_mask_vaddpd_256(<4 x double> %dst, <4 x double> %i,
+                                         <4 x double> %j, <4 x i64> %mask1)
+                                         nounwind readnone {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %x = fadd <4 x double> %i, %j
+  %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %dst
+  ret <4 x double> %r
+}
+
+; CHECK-LABEL: test_maskz_vaddpd_256
+; CHECK: vaddpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]} {z}}}
+; CHECK: ret
+define <4 x double> @test_maskz_vaddpd_256(<4 x double> %i, <4 x double> %j,
+                                          <4 x i64> %mask1) nounwind readnone {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %x = fadd <4 x double> %i, %j
+  %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> zeroinitializer
+  ret <4 x double> %r
+}
+
+; CHECK-LABEL: test_mask_fold_vaddpd_256
+; CHECK: vaddpd (%rdi), {{.*%ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}.*}}
+; CHECK: ret
+define <4 x double> @test_mask_fold_vaddpd_256(<4 x double> %dst, <4 x double> %i,
+                                         <4 x double>* %j,  <4 x i64> %mask1)
+                                         nounwind {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %tmp = load <4 x double>* %j
+  %x = fadd <4 x double> %i, %tmp
+  %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %dst
+  ret <4 x double> %r
+}
+
+; CHECK-LABEL: test_maskz_fold_vaddpd_256
+; CHECK: vaddpd (%rdi), {{.*%ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]} {z}.*}}
+; CHECK: ret
+define <4 x double> @test_maskz_fold_vaddpd_256(<4 x double> %i, <4 x double>* %j,
+                                          <4 x i64> %mask1) nounwind {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %tmp = load <4 x double>* %j
+  %x = fadd <4 x double> %i, %tmp
+  %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> zeroinitializer
+  ret <4 x double> %r
+}
+
+; CHECK-LABEL: test_broadcast2_vaddpd_256
+; CHECK: vaddpd (%rdi){1to4}, %ymm{{.*}}
+; CHECK: ret
+define <4 x double> @test_broadcast2_vaddpd_256(<4 x double> %i, double* %j) nounwind {
+  %tmp = load double* %j
+  %b = insertelement <4 x double> undef, double %tmp, i32 0
+  %c = shufflevector <4 x double> %b, <4 x double> undef,
+                     <4 x i32> zeroinitializer
+  %x = fadd <4 x double> %c, %i
+  ret <4 x double> %x
+}
+
+; CHECK-LABEL: test_mask_broadcast_vaddpd_256
+; CHECK: vaddpd (%rdi){1to4}, %ymm{{.*{%k[1-7]}.*}}
+; CHECK: ret
+define <4 x double> @test_mask_broadcast_vaddpd_256(<4 x double> %dst, <4 x double> %i,
+                                          double* %j, <4 x i64> %mask1) nounwind {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %tmp = load double* %j
+  %b = insertelement <4 x double> undef, double %tmp, i32 0
+  %c = shufflevector <4 x double> %b, <4 x double> undef,
+                     <4 x i32> zeroinitializer
+  %x = fadd <4 x double> %c, %i
+  %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %i
+  ret <4 x double> %r
+}
+
+; CHECK-LABEL: test_maskz_broadcast_vaddpd_256
+; CHECK: vaddpd (%rdi){1to4}, %ymm{{.*{%k[1-7]} {z}.*}}
+; CHECK: ret
+define <4 x double> @test_maskz_broadcast_vaddpd_256(<4 x double> %i, double* %j,
+                                           <4 x i64> %mask1) nounwind {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %tmp = load double* %j
+  %b = insertelement <4 x double> undef, double %tmp, i32 0
+  %c = shufflevector <4 x double> %b, <4 x double> undef,
+                     <4 x i32> zeroinitializer
+  %x = fadd <4 x double> %c, %i
+  %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> zeroinitializer
+  ret <4 x double> %r
+}
+
+; 128-bit
+
+; CHECK-LABEL: vpaddq128_test
+; CHECK: vpaddq %xmm{{.*}}
+; CHECK: ret
+define <2 x i64> @vpaddq128_test(<2 x i64> %i, <2 x i64> %j) nounwind readnone {
+  %x = add <2 x i64> %i, %j
+  ret <2 x i64> %x
+}
+
+; CHECK-LABEL: vpaddq128_fold_test
+; CHECK: vpaddq (%rdi), %xmm{{.*}}
+; CHECK: ret
+define <2 x i64> @vpaddq128_fold_test(<2 x i64> %i, <2 x i64>* %j) nounwind {
+  %tmp = load <2 x i64>* %j, align 4
+  %x = add <2 x i64> %i, %tmp
+  ret <2 x i64> %x
+}
+
+; CHECK-LABEL: vpaddq128_broadcast2_test
+; CHECK: vpaddq (%rdi){1to2}, %xmm{{.*}}
+; CHECK: ret
+define <2 x i64> @vpaddq128_broadcast2_test(<2 x i64> %i, i64* %j) nounwind {
+  %tmp = load i64* %j
+  %j.0 = insertelement <2 x i64> undef, i64 %tmp, i32 0
+  %j.1 = insertelement <2 x i64> %j.0, i64 %tmp, i32 1
+  %x = add <2 x i64> %i, %j.1
+  ret <2 x i64> %x
+}
+
+; CHECK-LABEL: vpaddd128_test
+; CHECK: vpaddd %xmm{{.*}}
+; CHECK: ret
+define <4 x i32> @vpaddd128_test(<4 x i32> %i, <4 x i32> %j) nounwind readnone {
+  %x = add <4 x i32> %i, %j
+  ret <4 x i32> %x
+}
+
+; CHECK-LABEL: vpaddd128_fold_test
+; CHECK: vpaddd (%rdi), %xmm{{.*}}
+; CHECK: ret
+define <4 x i32> @vpaddd128_fold_test(<4 x i32> %i, <4 x i32>* %j) nounwind {
+  %tmp = load <4 x i32>* %j, align 4
+  %x = add <4 x i32> %i, %tmp
+  ret <4 x i32> %x
+}
+
+; CHECK-LABEL: vpaddd128_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to4}, %xmm{{.*}}
+; CHECK: ret
+define <4 x i32> @vpaddd128_broadcast_test(<4 x i32> %i) nounwind {
+  %x = add <4 x i32> %i, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %x
+}
+
+; CHECK-LABEL: vpaddd128_mask_test
+; CHECK: vpaddd %xmm{{.*%k[1-7].*}}
+; CHECK: ret
+define <4 x i32> @vpaddd128_mask_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %x = add <4 x i32> %i, %j
+  %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %i
+  ret <4 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd128_maskz_test
+; CHECK: vpaddd %xmm{{.*{%k[1-7]} {z}.*}}
+; CHECK: ret
+define <4 x i32> @vpaddd128_maskz_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %x = add <4 x i32> %i, %j
+  %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
+  ret <4 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd128_mask_fold_test
+; CHECK: vpaddd (%rdi), %xmm{{.*%k[1-7]}}
+; CHECK: ret
+define <4 x i32> @vpaddd128_mask_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %j = load <4 x i32>* %j.ptr
+  %x = add <4 x i32> %i, %j
+  %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %i
+  ret <4 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd128_mask_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to4}, %xmm{{.*{%k[1-7]}}}
+; CHECK: ret
+define <4 x i32> @vpaddd128_mask_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %x = add <4 x i32> %i, <i32 1, i32 1, i32 1, i32 1>
+  %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %i
+  ret <4 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd128_maskz_fold_test
+; CHECK: vpaddd (%rdi), %xmm{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <4 x i32> @vpaddd128_maskz_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %j = load <4 x i32>* %j.ptr
+  %x = add <4 x i32> %i, %j
+  %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
+  ret <4 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd128_maskz_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to4}, %xmm{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <4 x i32> @vpaddd128_maskz_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %x = add <4 x i32> %i, <i32 1, i32 1, i32 1, i32 1>
+  %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
+  ret <4 x i32> %r
+}
+
+; CHECK-LABEL: vpsubq128_test
+; CHECK: vpsubq %xmm{{.*}}
+; CHECK: ret
+define <2 x i64> @vpsubq128_test(<2 x i64> %i, <2 x i64> %j) nounwind readnone {
+  %x = sub <2 x i64> %i, %j
+  ret <2 x i64> %x
+}
+
+; CHECK-LABEL: vpsubd128_test
+; CHECK: vpsubd %xmm{{.*}}
+; CHECK: ret
+define <4 x i32> @vpsubd128_test(<4 x i32> %i, <4 x i32> %j) nounwind readnone {
+  %x = sub <4 x i32> %i, %j
+  ret <4 x i32> %x
+}
+
+; CHECK-LABEL: vpmulld128_test
+; CHECK: vpmulld %xmm{{.*}}
+; CHECK: ret
+define <4 x i32> @vpmulld128_test(<4 x i32> %i, <4 x i32> %j) {
+  %x = mul <4 x i32> %i, %j
+  ret <4 x i32> %x
+}
+
+; CHECK-LABEL: test_vaddpd_128
+; CHECK: vaddpd{{.*}}
+; CHECK: ret
+define <2 x double> @test_vaddpd_128(<2 x double> %y, <2 x double> %x) {
+entry:
+  %add.i = fadd <2 x double> %x, %y
+  ret <2 x double> %add.i
+}
+
+; CHECK-LABEL: test_fold_vaddpd_128
+; CHECK: vaddpd LCP{{.*}}(%rip){{.*}}
+; CHECK: ret
+define <2 x double> @test_fold_vaddpd_128(<2 x double> %y) {
+entry:
+  %add.i = fadd <2 x double> %y, <double 4.500000e+00, double 3.400000e+00>
+  ret <2 x double> %add.i
+}
+
+; CHECK-LABEL: test_broadcast_vaddpd_128
+; CHECK: LCP{{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK: ret
+define <4 x float> @test_broadcast_vaddpd_128(<4 x float> %a) nounwind {
+  %b = fadd <4 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+  ret <4 x float> %b
+}
+
+; CHECK-LABEL: test_mask_vaddps_128
+; CHECK: vaddps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x float> @test_mask_vaddps_128(<4 x float> %dst, <4 x float> %i,
+                                        <4 x float> %j, <4 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %x = fadd <4 x float> %i, %j
+  %r = select <4 x i1> %mask, <4 x float> %x, <4 x float> %dst
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vmulps_128
+; CHECK: vmulps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x float> @test_mask_vmulps_128(<4 x float> %dst, <4 x float> %i,
+                                        <4 x float> %j, <4 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %x = fmul <4 x float> %i, %j
+  %r = select <4 x i1> %mask, <4 x float> %x, <4 x float> %dst
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vminps_128
+; CHECK: vminps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x float> @test_mask_vminps_128(<4 x float> %dst, <4 x float> %i,
+                                        <4 x float> %j, <4 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %cmp_res = fcmp olt <4 x float> %i, %j
+  %min = select <4 x i1> %cmp_res, <4 x float> %i, <4 x float> %j
+  %r = select <4 x i1> %mask, <4 x float> %min, <4 x float> %dst
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vmaxps_128
+; CHECK: vmaxps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x float> @test_mask_vmaxps_128(<4 x float> %dst, <4 x float> %i,
+                                        <4 x float> %j, <4 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %cmp_res = fcmp ogt <4 x float> %i, %j
+  %max = select <4 x i1> %cmp_res, <4 x float> %i, <4 x float> %j
+  %r = select <4 x i1> %mask, <4 x float> %max, <4 x float> %dst
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vsubps_128
+; CHECK: vsubps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x float> @test_mask_vsubps_128(<4 x float> %dst, <4 x float> %i,
+                                        <4 x float> %j, <4 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %x = fsub <4 x float> %i, %j
+  %r = select <4 x i1> %mask, <4 x float> %x, <4 x float> %dst
+  ret <4 x float> %r
+}
+
+
+; CHECK-LABEL: test_mask_vdivps_128
+; CHECK: vdivps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <4 x float> @test_mask_vdivps_128(<4 x float> %dst, <4 x float> %i,
+                                        <4 x float> %j, <4 x i32> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %x = fdiv <4 x float> %i, %j
+  %r = select <4 x i1> %mask, <4 x float> %x, <4 x float> %dst
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: test_mask_vmulpd_128
+; CHECK: vmulpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <2 x double> @test_mask_vmulpd_128(<2 x double> %dst, <2 x double> %i,
+                                        <2 x double> %j, <2 x i64> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %x = fmul <2 x double> %i, %j
+  %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> %dst
+  ret <2 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vminpd_128
+; CHECK: vminpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <2 x double> @test_mask_vminpd_128(<2 x double> %dst, <2 x double> %i,
+                                        <2 x double> %j, <2 x i64> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %cmp_res = fcmp olt <2 x double> %i, %j
+  %min = select <2 x i1> %cmp_res, <2 x double> %i, <2 x double> %j
+  %r = select <2 x i1> %mask, <2 x double> %min, <2 x double> %dst
+  ret <2 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vmaxpd_128
+; CHECK: vmaxpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <2 x double> @test_mask_vmaxpd_128(<2 x double> %dst, <2 x double> %i,
+                                        <2 x double> %j, <2 x i64> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %cmp_res = fcmp ogt <2 x double> %i, %j
+  %max = select <2 x i1> %cmp_res, <2 x double> %i, <2 x double> %j
+  %r = select <2 x i1> %mask, <2 x double> %max, <2 x double> %dst
+  ret <2 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vsubpd_128
+; CHECK: vsubpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <2 x double> @test_mask_vsubpd_128(<2 x double> %dst, <2 x double> %i,
+                                        <2 x double> %j, <2 x i64> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %x = fsub <2 x double> %i, %j
+  %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> %dst
+  ret <2 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vdivpd_128
+; CHECK: vdivpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <2 x double> @test_mask_vdivpd_128(<2 x double> %dst, <2 x double> %i,
+                                        <2 x double> %j, <2 x i64> %mask1)
+                                        nounwind readnone {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %x = fdiv <2 x double> %i, %j
+  %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> %dst
+  ret <2 x double> %r
+}
+
+; CHECK-LABEL: test_mask_vaddpd_128
+; CHECK: vaddpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}}
+; CHECK: ret
+define <2 x double> @test_mask_vaddpd_128(<2 x double> %dst, <2 x double> %i,
+                                         <2 x double> %j, <2 x i64> %mask1)
+                                         nounwind readnone {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %x = fadd <2 x double> %i, %j
+  %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> %dst
+  ret <2 x double> %r
+}
+
+; CHECK-LABEL: test_maskz_vaddpd_128
+; CHECK: vaddpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]} {z}}}
+; CHECK: ret
+define <2 x double> @test_maskz_vaddpd_128(<2 x double> %i, <2 x double> %j,
+                                          <2 x i64> %mask1) nounwind readnone {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %x = fadd <2 x double> %i, %j
+  %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> zeroinitializer
+  ret <2 x double> %r
+}
+
+; CHECK-LABEL: test_mask_fold_vaddpd_128
+; CHECK: vaddpd (%rdi), {{.*%xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}.*}}
+; CHECK: ret
+define <2 x double> @test_mask_fold_vaddpd_128(<2 x double> %dst, <2 x double> %i,
+                                         <2 x double>* %j,  <2 x i64> %mask1)
+                                         nounwind {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %tmp = load <2 x double>* %j
+  %x = fadd <2 x double> %i, %tmp
+  %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> %dst
+  ret <2 x double> %r
+}
+
+; CHECK-LABEL: test_maskz_fold_vaddpd_128
+; CHECK: vaddpd (%rdi), {{.*%xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]} {z}.*}}
+; CHECK: ret
+define <2 x double> @test_maskz_fold_vaddpd_128(<2 x double> %i, <2 x double>* %j,
+                                          <2 x i64> %mask1) nounwind {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %tmp = load <2 x double>* %j
+  %x = fadd <2 x double> %i, %tmp
+  %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> zeroinitializer
+  ret <2 x double> %r
+}
+
+; CHECK-LABEL: test_broadcast2_vaddpd_128
+; CHECK: vaddpd (%rdi){1to2}, %xmm{{.*}}
+; CHECK: ret
+define <2 x double> @test_broadcast2_vaddpd_128(<2 x double> %i, double* %j) nounwind {
+  %tmp = load double* %j
+  %j.0 = insertelement <2 x double> undef, double %tmp, i64 0
+  %j.1 = insertelement <2 x double> %j.0, double %tmp, i64 1
+  %x = fadd <2 x double> %j.1, %i
+  ret <2 x double> %x
+}
+
+; CHECK-LABEL: test_mask_broadcast_vaddpd_128
+; CHECK: vaddpd (%rdi){1to2}, %xmm{{.*{%k[1-7]}.*}}
+; CHECK: ret
+define <2 x double> @test_mask_broadcast_vaddpd_128(<2 x double> %dst, <2 x double> %i,
+                                          double* %j, <2 x i64> %mask1)
+                                          nounwind {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %tmp = load double* %j
+  %j.0 = insertelement <2 x double> undef, double %tmp, i64 0
+  %j.1 = insertelement <2 x double> %j.0, double %tmp, i64 1
+  %x = fadd <2 x double> %j.1, %i
+  %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> %i
+  ret <2 x double> %r
+}
+
+; CHECK-LABEL: test_maskz_broadcast_vaddpd_128
+; CHECK: vaddpd (%rdi){1to2}, %xmm{{.*{%k[1-7]} {z}.*}}
+; CHECK: ret
+define <2 x double> @test_maskz_broadcast_vaddpd_128(<2 x double> %i, double* %j,
+                                           <2 x i64> %mask1) nounwind {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %tmp = load double* %j
+  %j.0 = insertelement <2 x double> undef, double %tmp, i64 0
+  %j.1 = insertelement <2 x double> %j.0, double %tmp, i64 1
+  %x = fadd <2 x double> %j.1, %i
+  %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> zeroinitializer
+  ret <2 x double> %r
+}
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index fa19084..fe347bd 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -67,244 +67,244 @@ define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
 
 define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK_LABEL: test_cmp_d_256
+; CHECK-LABEL: test_cmp_d_256
 ; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpled %ymm1, %ymm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
 define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_cmp_d_256
+; CHECK-LABEL: test_mask_cmp_d_256
 ; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i8, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK_LABEL: test_ucmp_d_256
+; CHECK-LABEL: test_ucmp_d_256
 ; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
 define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_ucmp_d_256
+; CHECK-LABEL: test_mask_ucmp_d_256
 ; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i8, i8) nounwind readnone
 
 define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK_LABEL: test_cmp_q_256
+; CHECK-LABEL: test_cmp_q_256
 ; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
 define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_cmp_q_256
+; CHECK-LABEL: test_mask_cmp_q_256
 ; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i8, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK_LABEL: test_ucmp_q_256
+; CHECK-LABEL: test_ucmp_q_256
 ; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
 define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_ucmp_q_256
+; CHECK-LABEL: test_mask_ucmp_q_256
 ; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i8, i8) nounwind readnone
 
 ; 128-bit
 
@@ -373,241 +373,492 @@ define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)
 
 define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK_LABEL: test_cmp_d_128
+; CHECK-LABEL: test_cmp_d_128
 ; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpled %xmm1, %xmm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
 define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_cmp_d_128
+; CHECK-LABEL: test_mask_cmp_d_128
 ; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i8, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK_LABEL: test_ucmp_d_128
+; CHECK-LABEL: test_ucmp_d_128
 ; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
 define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_ucmp_d_128
+; CHECK-LABEL: test_mask_ucmp_d_128
 ; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i8, i8) nounwind readnone
 
 define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK_LABEL: test_cmp_q_128
+; CHECK-LABEL: test_cmp_q_128
 ; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
 define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_cmp_q_128
+; CHECK-LABEL: test_mask_cmp_q_128
 ; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i8, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK_LABEL: test_ucmp_q_128
+; CHECK-LABEL: test_ucmp_q_128
 ; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 -1)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 -1)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 -1)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 -1)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 -1)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 -1)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 -1)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
 define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_ucmp_q_128
+; CHECK-LABEL: test_mask_ucmp_q_128
 ; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
-  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
 ; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ##
-  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 %mask)
   %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
 ; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ##
-  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 %mask)
   %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
 ; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ##
-  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 %mask)
   %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
 ; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ##
-  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 %mask)
   %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
 ; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ##
-  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 %mask)
   %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
 ; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ##
-  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 %mask)
   %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
 ; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ##
-  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 %mask)
   %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
   ret <8 x i8> %vec7
 }
 
-declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
+declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i8, i8) nounwind readnone
+
+; CHECK-LABEL: compr1
+; CHECK: vcompresspd %zmm0
+define void @compr1(i8* %addr, <8 x double> %data, i8 %mask) {
+  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+
+; CHECK-LABEL: compr2
+; CHECK: vcompresspd %ymm0
+define void @compr2(i8* %addr, <4 x double> %data, i8 %mask) {
+  call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
+
+; CHECK-LABEL: compr3
+; CHECK: vcompressps %xmm0
+define void @compr3(i8* %addr, <4 x float> %data, i8 %mask) {
+  call void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
+
+; CHECK-LABEL: compr4
+; CHECK: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0]
+define <8 x double> @compr4(i8* %addr, <8 x double> %data, i8 %mask) {
+  %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
+  ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
+
+; CHECK-LABEL: compr5
+; CHECK: vcompresspd %ymm0, %ymm1 {%k1}  ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
+define <4 x double> @compr5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
+  %res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
+  ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
+
+; CHECK-LABEL: compr6
+; CHECK: vcompressps %xmm0
+define <4 x float> @compr6(<4 x float> %data, i8 %mask) {
+  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
+  ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)
+
+; CHECK-LABEL: compr7
+; CHECK-NOT: vcompress
+; CHECK: vmovapd
+define void @compr7(i8* %addr, <8 x double> %data) {
+  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1)
+  ret void
+}
+
+; CHECK-LABEL: compr8
+; CHECK-NOT: vcompressps %xmm0
+define <4 x float> @compr8(<4 x float> %data) {
+  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: compr9
+; CHECK: vpcompressq %zmm0, (%rdi) {%k1}  ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07]
+define void @compr9(i8* %addr, <8 x i64> %data, i8 %mask) {
+  call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+
+; CHECK-LABEL: compr10
+; CHECK: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
+define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) {
+  %res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)
+
+; Expand
+
+; CHECK-LABEL: expand1
+; CHECK: vexpandpd (%rdi), %zmm0 {%k1}  ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07]
+define <8 x double> @expand1(i8* %addr, <8 x double> %data, i8 %mask) {
+  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+  ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+
+; CHECK-LABEL: expand2
+; CHECK: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
+define <4 x double> @expand2(i8* %addr, <4 x double> %data, i8 %mask) {
+  %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
+  ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
+
+; CHECK-LABEL: expand3
+; CHECK: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
+define <4 x float> @expand3(i8* %addr, <4 x float> %data, i8 %mask) {
+  %res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
+  ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
+
+; CHECK-LABEL: expand4
+; CHECK: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0]
+define <8 x double> @expand4(i8* %addr, <8 x double> %data, i8 %mask) {
+  %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
+  ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
+
+; CHECK-LABEL: expand5
+; CHECK: vexpandpd %ymm0, %ymm1 {%k1}  ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8]
+define <4 x double> @expand5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
+  %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
+  ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
+
+; CHECK-LABEL: expand6
+; CHECK: vexpandps %xmm0
+define <4 x float> @expand6(<4 x float> %data, i8 %mask) {
+  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
+  ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)
+
+; CHECK-LABEL: expand7
+; CHECK-NOT: vexpand
+; CHECK: vmovapd
+define <8 x double> @expand7(i8* %addr, <8 x double> %data) {
+  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1)
+  ret <8 x double> %res
+}
+
+; CHECK-LABEL: expand8
+; CHECK-NOT: vexpandps %xmm0
+define <4 x float> @expand8(<4 x float> %data) {
+  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: expand9
+; CHECK: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07]
+define <8 x i64> @expand9(i8* %addr, <8 x i64> %data, i8 %mask) {
+  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+
+; CHECK-LABEL: expand10
+; CHECK: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
+define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) {
+  %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)
+
+define <8 x float> @test_x86_mask_blend_ps_256(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
+  ; CHECK: vblendmps %ymm1, %ymm0
+  %res = call <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float> %a1, <8 x float> %a2, i8 %a0) ; <<8 x float>> [#uses=1]
+  ret <8 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readonly
+
+define <4 x double> @test_x86_mask_blend_pd_256(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
+  ; CHECK: vblendmpd %ymm1, %ymm0
+  %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a1, <4 x double> %a2, i8 %a0) ; <<4 x double>> [#uses=1]
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_mask_blend_pd_256_memop(<4 x double> %a, <4 x double>* %ptr, i8 %mask) {
+  ; CHECK-LABEL: test_x86_mask_blend_pd_256_memop
+  ; CHECK: vblendmpd (%
+  %b = load <4 x double>* %ptr
+  %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a, <4 x double> %b, i8 %mask) ; <<4 x double>> [#uses=1]
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readonly
+
+; CHECK-LABEL: test_x86_mask_blend_d_256
+; CHECK: vpblendmd
+define <8 x i32> @test_x86_mask_blend_d_256(i8 %a0, <8 x i32> %a1, <8 x i32> %a2) {
+  %res = call <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32> %a1, <8 x i32> %a2, i8 %a0) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32>, <8 x i32>, i8) nounwind readonly
+
+define <4 x i64> @test_x86_mask_blend_q_256(i8 %a0, <4 x i64> %a1, <4 x i64> %a2) {
+  ; CHECK: vpblendmq
+  %res = call <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64> %a1, <4 x i64> %a2, i8 %a0) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <4 x float> @test_x86_mask_blend_ps_128(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
+  ; CHECK: vblendmps %xmm1, %xmm0
+  %res = call <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float> %a1, <4 x float> %a2, i8 %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly
+
+define <2 x double> @test_x86_mask_blend_pd_128(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
+  ; CHECK: vblendmpd %xmm1, %xmm0
+  %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a1, <2 x double> %a2, i8 %a0) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_mask_blend_pd_128_memop(<2 x double> %a, <2 x double>* %ptr, i8 %mask) {
+  ; CHECK-LABEL: test_x86_mask_blend_pd_128_memop
+  ; CHECK: vblendmpd (%
+  %b = load <2 x double>* %ptr
+  %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a, <2 x double> %b, i8 %mask) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double>, <2 x double>, i8) nounwind readonly
+
+define <4 x i32> @test_x86_mask_blend_d_128(i8 %a0, <4 x i32> %a1, <4 x i32> %a2) {
+  ; CHECK: vpblendmd
+  %res = call <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32> %a1, <4 x i32> %a2, i8 %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32>, <4 x i32>, i8) nounwind readonly
+
+define <2 x i64> @test_x86_mask_blend_q_128(i8 %a0, <2 x i64> %a1, <2 x i64> %a2) {
+  ; CHECK: vpblendmq
+  %res = call <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64> %a1, <2 x i64> %a2, i8 %a0) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64>, <2 x i64>, i8) nounwind readonly
diff --git a/test/CodeGen/X86/avx512vl-logic.ll b/test/CodeGen/X86/avx512vl-logic.ll
new file mode 100644
index 0000000..02cb8f9
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-logic.ll
@@ -0,0 +1,137 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl | FileCheck %s
+
+; 256-bit
+
+; CHECK-LABEL: vpandd256
+; CHECK: vpandd %ymm
+; CHECK: ret
+define <8 x i32> @vpandd256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %x = and <8 x i32> %a2, %b
+  ret <8 x i32> %x
+}
+
+; CHECK-LABEL: vpord256
+; CHECK: vpord %ymm
+; CHECK: ret
+define <8 x i32> @vpord256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %x = or <8 x i32> %a2, %b
+  ret <8 x i32> %x
+}
+
+; CHECK-LABEL: vpxord256
+; CHECK: vpxord %ymm
+; CHECK: ret
+define <8 x i32> @vpxord256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %x = xor <8 x i32> %a2, %b
+  ret <8 x i32> %x
+}
+
+; CHECK-LABEL: vpandq256
+; CHECK: vpandq %ymm
+; CHECK: ret
+define <4 x i64> @vpandq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
+  %x = and <4 x i64> %a2, %b
+  ret <4 x i64> %x
+}
+
+; CHECK-LABEL: vporq256
+; CHECK: vporq %ymm
+; CHECK: ret
+define <4 x i64> @vporq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
+  %x = or <4 x i64> %a2, %b
+  ret <4 x i64> %x
+}
+
+; CHECK-LABEL: vpxorq256
+; CHECK: vpxorq %ymm
+; CHECK: ret
+define <4 x i64> @vpxorq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
+  %x = xor <4 x i64> %a2, %b
+  ret <4 x i64> %x
+}
+
+; 128-bit
+
+; CHECK-LABEL: vpandd128
+; CHECK: vpandd %xmm
+; CHECK: ret
+define <4 x i32> @vpandd128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1>
+  %x = and <4 x i32> %a2, %b
+  ret <4 x i32> %x
+}
+
+; CHECK-LABEL: vpord128
+; CHECK: vpord %xmm
+; CHECK: ret
+define <4 x i32> @vpord128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1>
+  %x = or <4 x i32> %a2, %b
+  ret <4 x i32> %x
+}
+
+; CHECK-LABEL: vpxord128
+; CHECK: vpxord %xmm
+; CHECK: ret
+define <4 x i32> @vpxord128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1>
+  %x = xor <4 x i32> %a2, %b
+  ret <4 x i32> %x
+}
+
+; CHECK-LABEL: vpandq128
+; CHECK: vpandq %xmm
+; CHECK: ret
+define <2 x i64> @vpandq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <2 x i64> %a, <i64 1, i64 1>
+  %x = and <2 x i64> %a2, %b
+  ret <2 x i64> %x
+}
+
+; CHECK-LABEL: vporq128
+; CHECK: vporq %xmm
+; CHECK: ret
+define <2 x i64> @vporq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <2 x i64> %a, <i64 1, i64 1>
+  %x = or <2 x i64> %a2, %b
+  ret <2 x i64> %x
+}
+
+; CHECK-LABEL: vpxorq128
+; CHECK: vpxorq %xmm
+; CHECK: ret
+define <2 x i64> @vpxorq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <2 x i64> %a, <i64 1, i64 1>
+  %x = xor <2 x i64> %a2, %b
+  ret <2 x i64> %x
+}
diff --git a/test/CodeGen/X86/avx512vl-nontemporal.ll b/test/CodeGen/X86/avx512vl-nontemporal.ll
index 2ad9768..fdafb35 100644
--- a/test/CodeGen/X86/avx512vl-nontemporal.ll
+++ b/test/CodeGen/X86/avx512vl-nontemporal.ll
@@ -31,4 +31,4 @@ define void @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x
   store <2 x double> %C2, <2 x double>* %cast2, align 64, !nontemporal !0
   ret void
 }
-!0 = metadata !{i32 1}
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll
index 9c64c03..b6b5085 100644
--- a/test/CodeGen/X86/avx512vl-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -14,9 +14,9 @@ define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK: vpcmpgtq {{.*%k[0-7]}}
 ; CHECK: vmovdqa64 {{.*}}%k1
 ; CHECK: ret
-define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y) nounwind {
+define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
   %mask = icmp sgt <4 x i64> %x, %y
-  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
+  %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
   ret <4 x i64> %max
 }
 
@@ -34,9 +34,9 @@ define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind
 ; CHECK: vpcmpnleuq {{.*%k[0-7]}}
 ; CHECK: vmovdqa64 {{.*}}%k1
 ; CHECK: ret
-define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y) nounwind {
+define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
   %mask = icmp ugt <4 x i64> %x, %y
-  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
+  %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
   ret <4 x i64> %max
 }
 
@@ -204,9 +204,9 @@ define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK: vpcmpgtq {{.*%k[0-7]}}
 ; CHECK: vmovdqa64 {{.*}}%k1
 ; CHECK: ret
-define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y) nounwind {
+define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
   %mask = icmp sgt <2 x i64> %x, %y
-  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
+  %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
   ret <2 x i64> %max
 }
 
@@ -224,9 +224,9 @@ define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind
 ; CHECK: vpcmpnleuq {{.*%k[0-7]}}
 ; CHECK: vmovdqa64 {{.*}}%k1
 ; CHECK: ret
-define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y) nounwind {
+define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
   %mask = icmp ugt <2 x i64> %x, %y
-  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
+  %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
   ret <2 x i64> %max
 }
 
diff --git a/test/CodeGen/X86/barrier.ll b/test/CodeGen/X86/barrier.ll
index 4769b39..1f60131 100644
--- a/test/CodeGen/X86/barrier.ll
+++ b/test/CodeGen/X86/barrier.ll
@@ -1,6 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=-sse2 | grep lock
+; RUN: llc < %s -march=x86 -mattr=-sse2 | FileCheck %s
 
 define void @test() {
+; CHECK: lock
 	fence seq_cst
 	ret void
 }
diff --git a/test/CodeGen/X86/bitcast-mmx.ll b/test/CodeGen/X86/bitcast-mmx.ll
new file mode 100644
index 0000000..de1cb5a
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-mmx.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s
+
+define i32 @t0(i64 %x) {
+; CHECK-LABEL: t0:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    movd %[[REG1:[a-z]+]], %mm0
+; CHECK-NEXT:    pshufw $238, %mm0, %mm0
+; CHECK-NEXT:    movd %mm0, %eax
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast i64 %x to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 -18)
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  %6 = bitcast i64 %5 to <2 x i32>
+  %7 = extractelement <2 x i32> %6, i32 0
+  ret i32 %7
+}
+
+define i64 @t1(i64 %x, i32 %n) {
+; CHECK-LABEL: t1:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    movd %[[REG2:[a-z]+]], %mm0
+; CHECK-NEXT:    movd %[[REG1]], %mm1
+; CHECK-NEXT:    psllq %mm0, %mm1
+; CHECK-NEXT:    movd %mm1, %rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast i64 %x to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %n)
+  %2 = bitcast x86_mmx %1 to i64
+  ret i64 %2
+}
+
+define i64 @t2(i64 %x, i32 %n, i32 %w) {
+; CHECK-LABEL: t2:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:  movd %[[REG4:[a-z]+]], %mm0
+; CHECK-NEXT:  movd %[[REG6:[a-z0-9]+]], %mm1
+; CHECK-NEXT:  psllq %mm0, %mm1
+; CHECK-NEXT:  movd %[[REG1]], %mm0
+; CHECK-NEXT:  por %mm1, %mm0
+; CHECK-NEXT:  movd %mm0, %rax
+; CHECK-NEXT:  retq
+entry:
+  %0 = insertelement <2 x i32> undef, i32 %w, i32 0
+  %1 = insertelement <2 x i32> %0, i32 0, i32 1
+  %2 = bitcast <2 x i32> %1 to x86_mmx
+  %3 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %2, i32 %n)
+  %4 = bitcast i64 %x to x86_mmx
+  %5 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %4, x86_mmx %3)
+  %6 = bitcast x86_mmx %5 to i64
+  ret i64 %6
+}
+
+define i64 @t3(<1 x i64>* %y, i32* %n) {
+; CHECK-LABEL: t3:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    movq (%[[REG1]]), %mm0
+; CHECK-NEXT:    psllq (%[[REG3:[a-z]+]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast <1 x i64>* %y to x86_mmx*
+  %1 = load x86_mmx* %0, align 8
+  %2 = load i32* %n, align 4
+  %3 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %1, i32 %2)
+  %4 = bitcast x86_mmx %3 to i64
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
+declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
+declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx)
+
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index cc40bcf..e35be6a 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -124,7 +124,7 @@ exit:
   ret i32 %sum
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 4, i32 64}
+!0 = !{!"branch_weights", i32 4, i32 64}
 
 define i32 @test_loop_early_exits(i32 %i, i32* %a) {
 ; Check that we sink early exit blocks out of loop bodies.
@@ -506,7 +506,7 @@ if.end:
   ret void
 }
 
-!1 = metadata !{metadata !"branch_weights", i32 1000, i32 1}
+!1 = !{!"branch_weights", i32 1000, i32 1}
 
 declare i32 @f()
 declare i32 @g()
@@ -542,7 +542,7 @@ exit:
   ret i32 %result
 }
 
-!2 = metadata !{metadata !"branch_weights", i32 3, i32 1}
+!2 = !{!"branch_weights", i32 3, i32 1}
 
 declare i32 @__gxx_personality_v0(...)
 
diff --git a/test/CodeGen/X86/break-avx-dep.ll b/test/CodeGen/X86/break-avx-dep.ll
deleted file mode 100644
index 210bda1..0000000
--- a/test/CodeGen/X86/break-avx-dep.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
-;
-; rdar:15221834 False AVX register dependencies cause 5x slowdown on
-; flops-6. Make sure the unused register read by vcvtsi2sdq is zeroed
-; to avoid cyclic dependence on a write to the same register in a
-; previous iteration.
-
-; CHECK-LABEL: t1:
-; CHECK-LABEL: %loop
-; CHECK: vxorps %[[REG:xmm.]], %{{xmm.}}, %{{xmm.}}
-; CHECK: vcvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]], %{{xmm.}}
-define i64 @t1(i64* nocapture %x, double* nocapture %y) nounwind {
-entry:
-  %vx = load i64* %x
-  br label %loop
-loop:
-  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
-  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
-  %fi = sitofp i64 %i to double
-  %vy = load double* %y
-  %fipy = fadd double %fi, %vy
-  %iipy = fptosi double %fipy to i64
-  %s2 = add i64 %s1, %iipy
-  %inc = add nsw i64 %i, 1
-  %exitcond = icmp eq i64 %inc, 156250000
-  br i1 %exitcond, label %ret, label %loop
-ret:
-  ret i64 %s2
-}
diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll
new file mode 100644
index 0000000..7034fae
--- /dev/null
+++ b/test/CodeGen/X86/break-false-dep.ll
@@ -0,0 +1,201 @@
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX
+
+define double @t1(float* nocapture %x) nounwind readonly ssp {
+entry:
+; SSE-LABEL: t1:
+; SSE: movss ([[A0:%rdi|%rcx]]), %xmm0
+; SSE: cvtss2sd %xmm0, %xmm0
+
+  %0 = load float* %x, align 4
+  %1 = fpext float %0 to double
+  ret double %1
+}
+
+define float @t2(double* nocapture %x) nounwind readonly ssp optsize {
+entry:
+; SSE-LABEL: t2:
+; SSE: cvtsd2ss ([[A0]]), %xmm0
+  %0 = load double* %x, align 8
+  %1 = fptrunc double %0 to float
+  ret float %1
+}
+
+define float @squirtf(float* %x) nounwind {
+entry:
+; SSE-LABEL: squirtf:
+; SSE: movss ([[A0]]), %xmm0
+; SSE: sqrtss %xmm0, %xmm0
+  %z = load float* %x
+  %t = call float @llvm.sqrt.f32(float %z)
+  ret float %t
+}
+
+define double @squirt(double* %x) nounwind {
+entry:
+; SSE-LABEL: squirt:
+; SSE: movsd ([[A0]]), %xmm0
+; SSE: sqrtsd %xmm0, %xmm0
+  %z = load double* %x
+  %t = call double @llvm.sqrt.f64(double %z)
+  ret double %t
+}
+
+define float @squirtf_size(float* %x) nounwind optsize {
+entry:
+; SSE-LABEL: squirtf_size:
+; SSE: sqrtss ([[A0]]), %xmm0
+  %z = load float* %x
+  %t = call float @llvm.sqrt.f32(float %z)
+  ret float %t
+}
+
+define double @squirt_size(double* %x) nounwind optsize {
+entry:
+; SSE-LABEL: squirt_size:
+; SSE: sqrtsd ([[A0]]), %xmm0
+  %z = load double* %x
+  %t = call double @llvm.sqrt.f64(double %z)
+  ret double %t
+}
+
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.sqrt.f64(double)
+
+; SSE-LABEL: loopdep1
+; SSE: for.body
+;
+; This loop contains two cvtsi2ss instructions that update the same xmm
+; register.  Verify that the execution dependency fix pass breaks those
+; dependencies by inserting xorps instructions.
+;
+; If the register allocator chooses different registers for the two cvtsi2ss
+; instructions, they are still dependent on themselves.
+; SSE: xorps [[XMM1:%xmm[0-9]+]]
+; SSE: , [[XMM1]]
+; SSE: cvtsi2ssl %{{.*}}, [[XMM1]]
+; SSE: xorps [[XMM2:%xmm[0-9]+]]
+; SSE: , [[XMM2]]
+; SSE: cvtsi2ssl %{{.*}}, [[XMM2]]
+;
+define float @loopdep1(i32 %m) nounwind uwtable readnone ssp {
+entry:
+  %tobool3 = icmp eq i32 %m, 0
+  br i1 %tobool3, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ]
+  %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
+  %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ]
+  %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ]
+  %conv = sitofp i32 %n.04 to float
+  %add = fadd float %s1.06, %conv
+  %conv1 = sitofp i32 %m.addr.07 to float
+  %add2 = fadd float %s2.05, %conv1
+  %inc = add nsw i32 %n.04, 1
+  %dec = add nsw i32 %m.addr.07, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ]
+  %sub = fsub float %s1.0.lcssa, %s2.0.lcssa
+  ret float %sub
+}
+
+; rdar:15221834 False AVX register dependencies cause 5x slowdown on
+; flops-6. Make sure the unused register read by vcvtsi2sdq is zeroed
+; to avoid cyclic dependence on a write to the same register in a
+; previous iteration.
+
+; AVX-LABEL: loopdep2:
+; AVX-LABEL: %loop
+; AVX: vxorps %[[REG:xmm.]], %{{xmm.}}, %{{xmm.}}
+; AVX: vcvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]], %{{xmm.}}
+; SSE-LABEL: loopdep2:
+; SSE-LABEL: %loop
+; SSE: xorps %[[REG:xmm.]], %[[REG]]
+; SSE: cvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]]
+define i64 @loopdep2(i64* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i64* %x
+  br label %loop
+loop:
+  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
+  %fi = sitofp i64 %i to double
+  %vy = load double* %y
+  %fipy = fadd double %fi, %vy
+  %iipy = fptosi double %fipy to i64
+  %s2 = add i64 %s1, %iipy
+  %inc = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i64 %s2
+}
+
+; This loop contains a cvtsi2sd instruction that has a loop-carried
+; false dependency on an xmm that is modified by other scalar instructions
+; that follow it in the loop. Additionally, the source of convert is a 
+; memory operand. Verify the execution dependency fix pass breaks this
+; dependency by inserting a xor before the convert.
+@x = common global [1024 x double] zeroinitializer, align 16
+@y = common global [1024 x double] zeroinitializer, align 16
+@z = common global [1024 x double] zeroinitializer, align 16
+@w = common global [1024 x double] zeroinitializer, align 16
+@v = common global [1024 x i32] zeroinitializer, align 16
+
+define void @loopdep3() {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc14, %entry
+  %i.025 = phi i32 [ 0, %entry ], [ %inc15, %for.inc14 ]
+  br label %for.body3
+
+for.body3:
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @v, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %conv = sitofp i32 %0 to double
+  %arrayidx5 = getelementptr inbounds [1024 x double]* @x, i64 0, i64 %indvars.iv
+  %1 = load double* %arrayidx5, align 8
+  %mul = fmul double %conv, %1
+  %arrayidx7 = getelementptr inbounds [1024 x double]* @y, i64 0, i64 %indvars.iv
+  %2 = load double* %arrayidx7, align 8
+  %mul8 = fmul double %mul, %2
+  %arrayidx10 = getelementptr inbounds [1024 x double]* @z, i64 0, i64 %indvars.iv
+  %3 = load double* %arrayidx10, align 8
+  %mul11 = fmul double %mul8, %3
+  %arrayidx13 = getelementptr inbounds [1024 x double]* @w, i64 0, i64 %indvars.iv
+  store double %mul11, double* %arrayidx13, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.inc14, label %for.body3
+
+for.inc14:                                        ; preds = %for.body3
+  %inc15 = add nsw i32 %i.025, 1
+  %exitcond26 = icmp eq i32 %inc15, 100000
+  br i1 %exitcond26, label %for.end16, label %for.cond1.preheader
+
+for.end16:                                        ; preds = %for.inc14
+  ret void
+
+;SSE-LABEL:@loopdep3
+;SSE: xorps [[XMM0:%xmm[0-9]+]], [[XMM0]]
+;SSE-NEXT: cvtsi2sdl {{.*}}, [[XMM0]]
+;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
+;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
+;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
+;SSE-NEXT: movsd [[XMM0]],
+;AVX-LABEL:@loopdep3
+;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]]
+;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], [[XMM0]]
+;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
+;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
+;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
+;AVX-NEXT: vmovsd [[XMM0]],
+}
diff --git a/test/CodeGen/X86/break-sse-dep.ll b/test/CodeGen/X86/break-sse-dep.ll
deleted file mode 100644
index 8124d6f..0000000
--- a/test/CodeGen/X86/break-sse-dep.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s
-
-define double @t1(float* nocapture %x) nounwind readonly ssp {
-entry:
-; CHECK-LABEL: t1:
-; CHECK: movss ([[A0:%rdi|%rcx]]), %xmm0
-; CHECK: cvtss2sd %xmm0, %xmm0
-
-  %0 = load float* %x, align 4
-  %1 = fpext float %0 to double
-  ret double %1
-}
-
-define float @t2(double* nocapture %x) nounwind readonly ssp optsize {
-entry:
-; CHECK-LABEL: t2:
-; CHECK: cvtsd2ss ([[A0]]), %xmm0
-  %0 = load double* %x, align 8
-  %1 = fptrunc double %0 to float
-  ret float %1
-}
-
-define float @squirtf(float* %x) nounwind {
-entry:
-; CHECK-LABEL: squirtf:
-; CHECK: movss ([[A0]]), %xmm0
-; CHECK: sqrtss %xmm0, %xmm0
-  %z = load float* %x
-  %t = call float @llvm.sqrt.f32(float %z)
-  ret float %t
-}
-
-define double @squirt(double* %x) nounwind {
-entry:
-; CHECK-LABEL: squirt:
-; CHECK: sqrtsd ([[A0]]), %xmm0
-  %z = load double* %x
-  %t = call double @llvm.sqrt.f64(double %z)
-  ret double %t
-}
-
-define float @squirtf_size(float* %x) nounwind optsize {
-entry:
-; CHECK-LABEL: squirtf_size:
-; CHECK: sqrtss ([[A0]]), %xmm0
-  %z = load float* %x
-  %t = call float @llvm.sqrt.f32(float %z)
-  ret float %t
-}
-
-define double @squirt_size(double* %x) nounwind optsize {
-entry:
-; CHECK-LABEL: squirt_size:
-; CHECK: sqrtsd ([[A0]]), %xmm0
-  %z = load double* %x
-  %t = call double @llvm.sqrt.f64(double %z)
-  ret double %t
-}
-
-declare float @llvm.sqrt.f32(float)
-declare double @llvm.sqrt.f64(double)
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll
index 9dc960d..7d5f380 100644
--- a/test/CodeGen/X86/bswap-vector.ll
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -mcpu=x86-64 | FileCheck %s -check-prefix=CHECK-NOSSSE3
-; RUN: llc < %s -mcpu=core2 | FileCheck %s -check-prefix=CHECK-SSSE3
-; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2
-; RUN: llc < %s -mcpu=core-avx2 -x86-experimental-vector-widening-legalization | FileCheck %s -check-prefix=CHECK-WIDE-AVX2
+; RUN: llc < %s -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK-NOSSSE3
+; RUN: llc < %s -mcpu=core2 | FileCheck %s --check-prefix=CHECK-SSSE3
+; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK-AVX2
+; RUN: llc < %s -mcpu=core-avx2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE-AVX2
+
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -9,165 +10,278 @@ declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
 declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
 declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
 
-define <8 x i16> @test1(<8 x i16> %v) #0 {
+define <8 x i16> @test1(<8 x i16> %v) {
+; CHECK-NOSSSE3-LABEL: test1:
+; CHECK-NOSSSE3:       # BB#0: # %entry
+; CHECK-NOSSSE3-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm2, %xmm0
+; CHECK-NOSSSE3-NEXT:    retq
+;
+; CHECK-SSSE3-LABEL: test1:
+; CHECK-SSSE3:       # BB#0: # %entry
+; CHECK-SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-SSSE3-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: test1:
+; CHECK-AVX2:       # BB#0: # %entry
+; CHECK-AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-WIDE-AVX2-LABEL: test1:
+; CHECK-WIDE-AVX2:       # BB#0: # %entry
+; CHECK-WIDE-AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-WIDE-AVX2-NEXT:    retq
 entry:
   %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
   ret <8 x i16> %r
-
-; CHECK-NOSSSE3-LABEL: @test1
-; CHECK-NOSSSE3: rolw
-; CHECK-NOSSSE3: rolw
-; CHECK-NOSSSE3: rolw
-; CHECK-NOSSSE3: rolw
-; CHECK-NOSSSE3: rolw
-; CHECK-NOSSSE3: rolw
-; CHECK-NOSSSE3: rolw
-; CHECK-NOSSSE3: rolw
-; CHECK-NOSSSE3: retq
-
-; CHECK-SSSE3-LABEL: @test1
-; CHECK-SSSE3: pshufb
-; CHECK-SSSE3-NEXT: retq
-
-; CHECK-AVX2-LABEL: @test1
-; CHECK-AVX2: vpshufb
-; CHECK-AVX2-NEXT: retq
-
-; CHECK-WIDE-AVX2-LABEL: @test1
-; CHECK-WIDE-AVX2: vpshufb
-; CHECK-WIDE-AVX2-NEXT: retq
 }
 
-define <4 x i32> @test2(<4 x i32> %v) #0 {
+define <4 x i32> @test2(<4 x i32> %v) {
+; CHECK-NOSSSE3-LABEL: test2:
+; CHECK-NOSSSE3:       # BB#0: # %entry
+; CHECK-NOSSSE3-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm2, %xmm0
+; CHECK-NOSSSE3-NEXT:    retq
+;
+; CHECK-SSSE3-LABEL: test2:
+; CHECK-SSSE3:       # BB#0: # %entry
+; CHECK-SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-SSSE3-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: test2:
+; CHECK-AVX2:       # BB#0: # %entry
+; CHECK-AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-WIDE-AVX2-LABEL: test2:
+; CHECK-WIDE-AVX2:       # BB#0: # %entry
+; CHECK-WIDE-AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-WIDE-AVX2-NEXT:    retq
 entry:
   %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
   ret <4 x i32> %r
-
-; CHECK-NOSSSE3-LABEL: @test2
-; CHECK-NOSSSE3: bswapl
-; CHECK-NOSSSE3: bswapl
-; CHECK-NOSSSE3: bswapl
-; CHECK-NOSSSE3: bswapl
-; CHECK-NOSSSE3: retq
-
-; CHECK-SSSE3-LABEL: @test2
-; CHECK-SSSE3: pshufb
-; CHECK-SSSE3-NEXT: retq
-
-; CHECK-AVX2-LABEL: @test2
-; CHECK-AVX2: vpshufb
-; CHECK-AVX2-NEXT: retq
-
-; CHECK-WIDE-AVX2-LABEL: @test2
-; CHECK-WIDE-AVX2: vpshufb
-; CHECK-WIDE-AVX2-NEXT: retq
 }
 
-define <2 x i64> @test3(<2 x i64> %v) #0 {
+define <2 x i64> @test3(<2 x i64> %v) {
+; CHECK-NOSSSE3-LABEL: test3:
+; CHECK-NOSSSE3:       # BB#0: # %entry
+; CHECK-NOSSSE3-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; CHECK-NOSSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NOSSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm2, %xmm0
+; CHECK-NOSSSE3-NEXT:    retq
+;
+; CHECK-SSSE3-LABEL: test3:
+; CHECK-SSSE3:       # BB#0: # %entry
+; CHECK-SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; CHECK-SSSE3-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: test3:
+; CHECK-AVX2:       # BB#0: # %entry
+; CHECK-AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-WIDE-AVX2-LABEL: test3:
+; CHECK-WIDE-AVX2:       # BB#0: # %entry
+; CHECK-WIDE-AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; CHECK-WIDE-AVX2-NEXT:    retq
 entry:
   %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
   ret <2 x i64> %r
-
-; CHECK-NOSSSE3-LABEL: @test3
-; CHECK-NOSSSE3: bswapq
-; CHECK-NOSSSE3: bswapq
-; CHECK-NOSSSE3: retq
-
-; CHECK-SSSE3-LABEL: @test3
-; CHECK-SSSE3: pshufb
-; CHECK-SSSE3-NEXT: retq
-
-; CHECK-AVX2-LABEL: @test3
-; CHECK-AVX2: vpshufb
-; CHECK-AVX2-NEXT: retq
-
-; CHECK-WIDE-AVX2-LABEL: @test3
-; CHECK-WIDE-AVX2: vpshufb
-; CHECK-WIDE-AVX2-NEXT: retq
 }
 
 declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)
 declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
 declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
 
-define <16 x i16> @test4(<16 x i16> %v) #0 {
+define <16 x i16> @test4(<16 x i16> %v) {
+; CHECK-NOSSSE3-LABEL: test4:
+; CHECK-NOSSSE3:       # BB#0: # %entry
+; CHECK-NOSSSE3-NEXT:    pxor %xmm2, %xmm2
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm3
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm3, %xmm0
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm1, %xmm3
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm3, %xmm1
+; CHECK-NOSSSE3-NEXT:    retq
+;
+; CHECK-SSSE3-LABEL: test4:
+; CHECK-SSSE3:       # BB#0: # %entry
+; CHECK-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; CHECK-SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; CHECK-SSSE3-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: test4:
+; CHECK-AVX2:       # BB#0: # %entry
+; CHECK-AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-WIDE-AVX2-LABEL: test4:
+; CHECK-WIDE-AVX2:       # BB#0: # %entry
+; CHECK-WIDE-AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; CHECK-WIDE-AVX2-NEXT:    retq
 entry:
   %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v)
   ret <16 x i16> %r
-
-; CHECK-SSSE3-LABEL: @test4
-; CHECK-SSSE3: pshufb
-; CHECK-SSSE3: pshufb
-; CHECK-SSSE3-NEXT: retq
-
-; CHECK-AVX2-LABEL: @test4
-; CHECK-AVX2: vpshufb
-; CHECK-AVX2-NEXT: retq
-
-; CHECK-WIDE-AVX2-LABEL: @test4
-; CHECK-WIDE-AVX2: vpshufb
-; CHECK-WIDE-AVX2-NEXT: retq
 }
 
-define <8 x i32> @test5(<8 x i32> %v) #0 {
+define <8 x i32> @test5(<8 x i32> %v) {
+; CHECK-NOSSSE3-LABEL: test5:
+; CHECK-NOSSSE3:       # BB#0: # %entry
+; CHECK-NOSSSE3-NEXT:    pxor %xmm2, %xmm2
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm3
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm3, %xmm0
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm1, %xmm3
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm3, %xmm1
+; CHECK-NOSSSE3-NEXT:    retq
+;
+; CHECK-SSSE3-LABEL: test5:
+; CHECK-SSSE3:       # BB#0: # %entry
+; CHECK-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; CHECK-SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; CHECK-SSSE3-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: test5:
+; CHECK-AVX2:       # BB#0: # %entry
+; CHECK-AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-WIDE-AVX2-LABEL: test5:
+; CHECK-WIDE-AVX2:       # BB#0: # %entry
+; CHECK-WIDE-AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; CHECK-WIDE-AVX2-NEXT:    retq
 entry:
   %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v)
   ret <8 x i32> %r
-
-; CHECK-SSSE3-LABEL: @test5
-; CHECK-SSSE3: pshufb
-; CHECK-SSSE3: pshufb
-; CHECK-SSSE3-NEXT: retq
-
-; CHECK-AVX2-LABEL: @test5
-; CHECK-AVX2: vpshufb
-; CHECK-AVX2-NEXT: retq
-
-; CHECK-WIDE-AVX2-LABEL: @test5
-; CHECK-WIDE-AVX2: vpshufb
-; CHECK-WIDE-AVX2-NEXT: retq
 }
 
-define <4 x i64> @test6(<4 x i64> %v) #0 {
+define <4 x i64> @test6(<4 x i64> %v) {
+; CHECK-NOSSSE3-LABEL: test6:
+; CHECK-NOSSSE3:       # BB#0: # %entry
+; CHECK-NOSSSE3-NEXT:    pxor %xmm2, %xmm2
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm3
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; CHECK-NOSSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NOSSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm3, %xmm0
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm1, %xmm3
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; CHECK-NOSSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NOSSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm3, %xmm1
+; CHECK-NOSSSE3-NEXT:    retq
+;
+; CHECK-SSSE3-LABEL: test6:
+; CHECK-SSSE3:       # BB#0: # %entry
+; CHECK-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; CHECK-SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; CHECK-SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; CHECK-SSSE3-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: test6:
+; CHECK-AVX2:       # BB#0: # %entry
+; CHECK-AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-WIDE-AVX2-LABEL: test6:
+; CHECK-WIDE-AVX2:       # BB#0: # %entry
+; CHECK-WIDE-AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; CHECK-WIDE-AVX2-NEXT:    retq
 entry:
   %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v)
   ret <4 x i64> %r
-
-; CHECK-SSSE3-LABEL: @test6
-; CHECK-SSSE3: pshufb
-; CHECK-SSSE3: pshufb
-; CHECK-SSSE3-NEXT: retq
-
-; CHECK-AVX2-LABEL: @test6
-; CHECK-AVX2: vpshufb
-; CHECK-AVX2-NEXT: retq
-
-; CHECK-WIDE-AVX2-LABEL: @test6
-; CHECK-WIDE-AVX2: vpshufb
-; CHECK-WIDE-AVX2-NEXT: retq
 }
 
 declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
 
-define <4 x i16> @test7(<4 x i16> %v) #0 {
+define <4 x i16> @test7(<4 x i16> %v) {
+; CHECK-NOSSSE3-LABEL: test7:
+; CHECK-NOSSSE3:       # BB#0: # %entry
+; CHECK-NOSSSE3-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm2, %xmm0
+; CHECK-NOSSSE3-NEXT:    psrld $16, %xmm0
+; CHECK-NOSSSE3-NEXT:    retq
+;
+; CHECK-SSSE3-LABEL: test7:
+; CHECK-SSSE3:       # BB#0: # %entry
+; CHECK-SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-SSSE3-NEXT:    psrld $16, %xmm0
+; CHECK-SSSE3-NEXT:    retq
+;
+; CHECK-AVX2-LABEL: test7:
+; CHECK-AVX2:       # BB#0: # %entry
+; CHECK-AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-WIDE-AVX2-LABEL: test7:
+; CHECK-WIDE-AVX2:       # BB#0: # %entry
+; CHECK-WIDE-AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-WIDE-AVX2-NEXT:    retq
 entry:
   %r = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v)
   ret <4 x i16> %r
-
-; CHECK-SSSE3-LABEL: @test7
-; CHECK-SSSE3: pshufb
-; CHECK-SSSE3: psrld $16
-; CHECK-SSSE3-NEXT: retq
-
-; CHECK-AVX2-LABEL: @test7
-; CHECK-AVX2: vpshufb
-; CHECK-AVX2: vpsrld $16
-; CHECK-AVX2-NEXT: retq
-
-; CHECK-WIDE-AVX2-LABEL: @test7
-; CHECK-WIDE-AVX2: vpshufb
-; CHECK-WIDE-AVX2-NEXT: retq
 }
-
-attributes #0 = { nounwind uwtable }
-
diff --git a/test/CodeGen/X86/chain_order.ll b/test/CodeGen/X86/chain_order.ll
index c88726e..72e6f78 100644
--- a/test/CodeGen/X86/chain_order.ll
+++ b/test/CodeGen/X86/chain_order.ll
@@ -1,13 +1,13 @@
 ; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-linux | FileCheck %s
 
-;CHECK-LABEL: cftx020:
-;CHECK: vmovsd  (%rdi), %xmm{{.*}}
-;CHECK: vmovsd  16(%rdi), %xmm{{.*}}
-;CHECK: vmovsd  24(%rdi), %xmm{{.*}}
-;CHECK: vmovhpd  8(%rdi), %xmm{{.*}}
-;CHECK: vmovupd %xmm{{.*}}, (%rdi)
-;CHECK: vmovupd %xmm{{.*}}, 16(%rdi)
-;CHECK: ret
+; CHECK-LABEL: cftx020:
+; CHECK: vmovsd  (%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovsd  16(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd  24(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd  8(%rdi), %xmm{{.*}}
+; CHECK: vmovupd %xmm{{.*}}, (%rdi)
+; CHECK-NEXT: vmovupd %xmm{{.*}}, 16(%rdi)
+; CHECK: ret
 
 ; A test from pifft (after SLP-vectorization) that fails when we drop the chain on newly merged loads.
 define void @cftx020(double* nocapture %a) {
diff --git a/test/CodeGen/X86/clobber-fi0.ll b/test/CodeGen/X86/clobber-fi0.ll
index 38a42db..4876c35 100644
--- a/test/CodeGen/X86/clobber-fi0.ll
+++ b/test/CodeGen/X86/clobber-fi0.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.7.0"
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index d38d2b4..355c6b4 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -disable-cgp-select2branch | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-apple-darwin10 -disable-cgp-select2branch | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
index 3cb8b97..b7995db 100644
--- a/test/CodeGen/X86/cmpxchg-clobber-flags.ll
+++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
@@ -1,19 +1,21 @@
-; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
-; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s
 
 declare i32 @bar()
 
 define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
 ; CHECK-LABEL: test_intervening_call:
 ; CHECK: cmpxchg
-; CHECK: pushfq
-; CHECK: popq [[FLAGS:%.*]]
+; CHECK: pushf[[LQ:[lq]]]
+; CHECK-NEXT: pop[[LQ]] [[FLAGS:%.*]]
 
-; CHECK: callq bar
+; CHECK-NEXT: call[[LQ]] bar
 
-; CHECK: pushq [[FLAGS]]
-; CHECK: popfq
-; CHECK: jne
+; CHECK-NEXT: push[[LQ]] [[FLAGS]]
+; CHECK-NEXT: popf[[LQ]]
+; CHECK-NEXT: jne
   %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
   %p = extractvalue { i64, i1 } %cx, 1
   call i32 @bar()
@@ -68,14 +70,13 @@ define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) {
 ; CHECK-LABEL: test_feed_cmov:
 
 ; CHECK: cmpxchg
-; CHECK: pushfq
-; CHECK: popq [[FLAGS:%.*]]
-
-; CHECK: callq bar
+; CHECK: pushf[[LQ:[lq]]]
+; CHECK-NEXT: pop[[LQ]] [[FLAGS:%.*]]
 
-; CHECK: pushq [[FLAGS]]
-; CHECK: popfq
+; CHECK-NEXT: call[[LQ]] bar
 
+; CHECK-NEXT: push[[LQ]] [[FLAGS]]
+; CHECK-NEXT: popf[[LQ]]
   %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
   %success = extractvalue { i32, i1 } %res, 1
 
diff --git a/test/CodeGen/X86/coalesce_commute_subreg.ll b/test/CodeGen/X86/coalesce_commute_subreg.ll
new file mode 100644
index 0000000..8d0a20c
--- /dev/null
+++ b/test/CodeGen/X86/coalesce_commute_subreg.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple="x86_64-apple-darwin" -o - -verify-machineinstrs %s
+
+define void @make_wanted() #0 {
+entry:
+  br i1 undef, label %for.end20, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:
+  br label %for.body3
+
+for.body3:
+  %cmp20.i = icmp eq i32 undef, 0
+  %.col.057 = select i1 %cmp20.i, i32 0, i32 undef
+  br i1 undef, label %while.cond.i, label %for.body5.lr.ph.i
+
+for.body5.lr.ph.i:
+  %0 = sext i32 %.col.057 to i64
+  %1 = sub i32 0, %.col.057
+  %2 = zext i32 %1 to i64
+  %3 = add nuw nsw i64 %2, 1
+  %n.vec110 = and i64 %3, 8589934588
+  %end.idx.rnd.down111 = add nsw i64 %n.vec110, %0
+  br i1 undef, label %middle.block105, label %vector.ph103
+
+vector.ph103:
+  br i1 undef, label %middle.block105, label %vector.body104
+
+vector.body104:
+  %4 = icmp eq i64 undef, %end.idx.rnd.down111
+  br i1 %4, label %middle.block105, label %vector.body104
+
+middle.block105:
+  %resume.val114 = phi i64 [ %0, %for.body5.lr.ph.i ], [ %end.idx.rnd.down111, %vector.body104 ], [ %end.idx.rnd.down111, %vector.ph103 ]
+  %cmp.n116 = icmp eq i64 undef, %resume.val114
+  br i1 %cmp.n116, label %while.cond.i, label %for.body5.i.preheader
+
+for.body5.i.preheader:
+  %lcmp.or182 = or i1 undef, undef
+  br i1 %lcmp.or182, label %for.body5.i.prol, label %while.cond.i
+
+for.body5.i.prol:
+  br i1 undef, label %for.body5.i.prol, label %while.cond.i
+
+while.cond.i:
+  br i1 undef, label %while.cond.i, label %if.then
+
+if.then:
+  br label %for.body3
+
+for.end20:
+  ret void
+}
diff --git a/test/CodeGen/X86/coalescer-dce.ll b/test/CodeGen/X86/coalescer-dce.ll
index 7f72e3d..208d706 100644
--- a/test/CodeGen/X86/coalescer-dce.ll
+++ b/test/CodeGen/X86/coalescer-dce.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -disable-fp-elim -disable-machine-dce -verify-coalescing
+; RUN: llc < %s -verify-machineinstrs -disable-fp-elim -disable-machine-dce -verify-coalescing
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-macosx10.7.0"
 
diff --git a/test/CodeGen/X86/codegen-prepare-extload.ll b/test/CodeGen/X86/codegen-prepare-extload.ll
index 9320706..9b27c33 100644
--- a/test/CodeGen/X86/codegen-prepare-extload.ll
+++ b/test/CodeGen/X86/codegen-prepare-extload.ll
@@ -1,12 +1,21 @@
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-win64 | FileCheck %s
-; rdar://7304838
+; RUN: opt -codegenprepare < %s -mtriple=x86_64-apple-macosx -S | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=NONSTRESS
+; RUN: opt -codegenprepare < %s -mtriple=x86_64-apple-macosx -S -stress-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=STRESS
+; RUN: opt -codegenprepare < %s -mtriple=x86_64-apple-macosx -S -disable-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=DISABLE
 
+; rdar://7304838
 ; CodeGenPrepare should move the zext into the block with the load
 ; so that SelectionDAG can select it with the load.
-
+;
+; CHECK-LABEL: foo:
 ; CHECK: movsbl ({{%rdi|%rcx}}), %eax
-
+;
+; OPTALL-LABEL: @foo
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p
+; OPTALL-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPTALL: store i32 [[ZEXT]], i32* %q
+; OPTALL: ret
 define void @foo(i8* %p, i32* %q) {
 entry:
   %t = load i8* %p
@@ -19,3 +28,336 @@ true:
 false:
   ret void
 }
+
+; Check that we manage to form a zextload is an operation with only one
+; argument to explicitly extend is in the the way.
+; OPTALL-LABEL: @promoteOneArg
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p
+; OPT-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT]], 2
+; Make sure the operation is not promoted when the promotion pass is disabled.
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], 2
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteOneArg(i8* %p, i32* %q) {
+entry:
+  %t = load i8* %p
+  %add = add nuw i8 %t, 2
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = zext i8 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to form a sextload is an operation with only one
+; argument to explicitly extend is in the the way.
+; Version with sext.
+; OPTALL-LABEL: @promoteOneArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p
+; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXT]], 2
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], 2
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteOneArgSExt(i8* %p, i32* %q) {
+entry:
+  %t = load i8* %p
+  %add = add nsw i8 %t, 2
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = sext i8 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to form a zextload is an operation with two
+; arguments to explicitly extend is in the the way.
+; Extending %add will create two extensions:
+; 1. One for %b.
+; 2. One for %t.
+; #1 will not be removed as we do not know anything about %b.
+; #2 may not be merged with the load because %t is used in a comparison.
+; Since two extensions may be emitted in the end instead of one before the
+; transformation, the regular heuristic does not apply the optimization. 
+; 
+; OPTALL-LABEL: @promoteTwoArgZext
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteTwoArgZext(i8* %p, i32* %q, i8 %b) {
+entry:
+  %t = load i8* %p
+  %add = add nuw i8 %t, %b
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = zext i8 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to form a sextload is an operation with two
+; arguments to explicitly extend is in the the way.
+; Version with sext.
+; OPTALL-LABEL: @promoteTwoArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p
+;
+; STRESS-NEXT: [[SEXTLD:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i8 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXTLD]], [[SEXTB]]
+;
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteTwoArgSExt(i8* %p, i32* %q, i8 %b) {
+entry:
+  %t = load i8* %p
+  %add = add nsw i8 %t, %b
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = sext i8 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we do not a zextload if we need to introduce more than
+; one additional extension.
+; OPTALL-LABEL: @promoteThreeArgZext
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
+; STRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+; STRESS-NEXT: [[ZEXTC:%[a-zA-Z_0-9-]+]] = zext i8 %c to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[TMP]], [[ZEXTC]]
+;
+; NONSTRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; NONSTRESS-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[TMP]], %c
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; DISABLE: add nuw i8
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteThreeArgZext(i8* %p, i32* %q, i8 %b, i8 %c) {
+entry:
+  %t = load i8* %p
+  %tmp = add nuw i8 %t, %b
+  %add = add nuw i8 %tmp, %c
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = zext i8 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to form a zextload after promoting and merging
+; two extensions.
+; OPTALL-LABEL: @promoteMergeExtArgZExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i16 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
+;
+; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteMergeExtArgZExt(i8* %p, i32* %q, i16 %b) {
+entry:
+  %t = load i8* %p
+  %ext = zext i8 %t to i16
+  %add = add nuw i16 %ext, %b
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = zext i16 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to form a sextload after promoting and merging
+; two extensions.
+; Version with sext.
+; OPTALL-LABEL: @promoteMergeExtArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = sext i16 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+;
+; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteMergeExtArgSExt(i8* %p, i32* %q, i16 %b) {
+entry:
+  %t = load i8* %p
+  %ext = zext i8 %t to i16
+  %add = add nsw i16 %ext, %b
+  %a = icmp slt i8 %t, 20
+  br i1 %a, label %true, label %false
+true:
+  %s = sext i16 %add to i32
+  store i32 %s, i32* %q
+  ret void
+false:
+  ret void
+}
+
+; Check that we manage to catch all the extload opportunities that are exposed
+; by the different iterations of codegen prepare.
+; Moreover, check that we do not promote more than we need to.
+; Here is what is happening in this test (not necessarly in this order):
+; 1. We try to promote the operand of %sextadd.
+;    a. This creates one sext of %ld2 and one of %zextld
+;    b. The sext of %ld2 can be combine with %ld2, so we remove one sext but
+;       introduced one. This is fine with the current heuristic: neutral.
+;    => We have one zext of %zextld left and we created one sext of %ld2.
+; 2. We try to promote the operand of %sextaddza.
+;    a. This creates one sext of %zexta and one of %zextld
+;    b. The sext of %zexta does not lead to any load, it stays here, even if it
+;       could have been combine with the zext of %a.
+;    c. The sext of %zextld leads to %ld and can be combined with it. This is
+;       done by promoting %zextld. This is fine with the current heuristic:
+;       neutral.
+;    => We have created a new zext of %ld and we created one sext of %zexta.
+; 3. We try to promote the operand of %sextaddb.
+;    a. This creates one sext of %b and one of %zextld
+;    b. The sext of %b is a dead-end, nothing to be done.
+;    c. Same thing as 2.c. happens.
+;    => We have created a new zext of %ld and we created one sext of %b.
+; 4. We try to promote the operand of the zext of %zextld introduced in #1.
+;    a. Same thing as 2.c. happens.
+;    b. %zextld does not have any other uses. It is dead coded.
+;    => We have created a new zext of %ld and we removed a zext of %zextld and
+;       a zext of %ld.
+; Currently we do not try to reuse existing extensions, so in the end we have
+; 3 identical zext of %ld. The extensions will be CSE'ed by SDag.
+;
+; OPTALL-LABEL: @severalPromotions
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %addr1
+; OPT-NEXT: [[ZEXTLD1_1:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTLD1_2:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[LD2:%[a-zA-Z_0-9-]+]] = load i32* %addr2
+; OPT-NEXT: [[SEXTLD2:%[a-zA-Z_0-9-]+]] = sext i32 [[LD2]] to i64
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_1]]
+; We do not combine this one: see 2.b.
+; OPT-NEXT: [[ZEXTA:%[a-zA-Z_0-9-]+]] = zext i8 %a to i32
+; OPT-NEXT: [[SEXTZEXTA:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXTA]] to i64
+; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTZEXTA]], [[ZEXTLD1_3]]
+; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_2]]
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]]  = sext i32 [[ADD]] to i64
+; DISABLE: [[ADDZA:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RESZA:%[a-zA-Z_0-9-]+]]  = sext i32 [[ADDZA]] to i64
+; DISABLE: [[ADDB:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RESB:%[a-zA-Z_0-9-]+]]  = sext i32 [[ADDB]] to i64
+;
+; OPTALL: call void @dummy(i64 [[RES]], i64 [[RESZA]], i64 [[RESB]])
+; OPTALL: ret
+define void @severalPromotions(i8* %addr1, i32* %addr2, i8 %a, i32 %b) {
+  %ld = load i8* %addr1
+  %zextld = zext i8 %ld to i32
+  %ld2 = load i32* %addr2
+  %add = add nsw i32 %ld2, %zextld
+  %sextadd = sext i32 %add to i64
+  %zexta = zext i8 %a to i32
+  %addza = add nsw i32 %zexta, %zextld
+  %sextaddza = sext i32 %addza to i64
+  %addb = add nsw i32 %b, %zextld
+  %sextaddb = sext i32 %addb to i64
+  call void @dummy(i64 %sextadd, i64 %sextaddza, i64 %sextaddb)
+  ret void
+}
+
+declare void @dummy(i64, i64, i64)
+
+; Make sure we do not try to promote vector types since the type promotion
+; helper does not support them for now.
+; OPTALL-LABEL: @vectorPromotion
+; OPTALL: [[SHL:%[a-zA-Z_0-9-]+]] = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
+; OPTALL: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext <2 x i32> [[SHL]] to <2 x i64>
+; OPTALL: ret
+define void @vectorPromotion() {
+entry:
+  %a = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
+  %b = zext <2 x i32> %a to <2 x i64>
+  ret void
+}
+
+@a = common global i32 0, align 4
+@c = common global [2 x i32] zeroinitializer, align 4
+
+; PR21978.
+; Make sure we support promotion of operands that produces a Value as opposed
+; to an instruction.
+; This used to cause a crash.
+; OPTALL-LABEL: @promotionOfArgEndsUpInValue
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i16* %addr
+
+; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i16 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw nsw i32 [[SEXT]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32]* @c, i64 0, i64 1), i32* @a) to i32)
+;
+; DISABLE-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw nsw i16 [[LD]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32]* @c, i64 0, i64 1), i32* @a) to i16)
+; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+;
+; OPTALL-NEXT: ret i32 [[RES]]
+define i32 @promotionOfArgEndsUpInValue(i16* %addr) {
+entry:
+  %val = load i16* %addr
+  %add = add nuw nsw i16 %val, zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32]* @c, i64 0, i64 1), i32* @a) to i16)
+  %conv3 = sext i16 %add to i32
+  ret i32 %conv3
+}
diff --git a/test/CodeGen/X86/coff-comdat.ll b/test/CodeGen/X86/coff-comdat.ll
index ac4546d..44e1cb2 100644
--- a/test/CodeGen/X86/coff-comdat.ll
+++ b/test/CodeGen/X86/coff-comdat.ll
@@ -1,58 +1,58 @@
 ; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck %s
 
 $f1 = comdat any
-@v1 = global i32 0, comdat $f1
-define void @f1() comdat $f1 {
+@v1 = global i32 0, comdat($f1)
+define void @f1() comdat($f1) {
   ret void
 }
 
 $f2 = comdat exactmatch
-@v2 = global i32 0, comdat $f2
-define void @f2() comdat $f2 {
+@v2 = global i32 0, comdat($f2)
+define void @f2() comdat($f2) {
   ret void
 }
 
 $f3 = comdat largest
-@v3 = global i32 0, comdat $f3
-define void @f3() comdat $f3 {
+@v3 = global i32 0, comdat($f3)
+define void @f3() comdat($f3) {
   ret void
 }
 
 $f4 = comdat noduplicates
-@v4 = global i32 0, comdat $f4
-define void @f4() comdat $f4 {
+@v4 = global i32 0, comdat($f4)
+define void @f4() comdat($f4) {
   ret void
 }
 
 $f5 = comdat samesize
-@v5 = global i32 0, comdat $f5
-define void @f5() comdat $f5 {
+@v5 = global i32 0, comdat($f5)
+define void @f5() comdat($f5) {
   ret void
 }
 
 $f6 = comdat samesize
-@v6 = global i32 0, comdat $f6
-@f6 = global i32 0, comdat $f6
+@v6 = global i32 0, comdat($f6)
+@f6 = global i32 0, comdat($f6)
 
 $"\01@f7@0" = comdat any
-define x86_fastcallcc void @"\01@v7@0"() comdat $"\01@f7@0" {
+define x86_fastcallcc void @"\01@v7@0"() comdat($"\01@f7@0") {
   ret void
 }
-define x86_fastcallcc void @"\01@f7@0"() comdat $"\01@f7@0" {
+define x86_fastcallcc void @"\01@f7@0"() comdat($"\01@f7@0") {
   ret void
 }
 
 $f8 = comdat any
-define x86_fastcallcc void @v8() comdat $f8 {
+define x86_fastcallcc void @v8() comdat($f8) {
   ret void
 }
-define x86_fastcallcc void @f8() comdat $f8 {
+define x86_fastcallcc void @f8() comdat($f8) {
   ret void
 }
 
 $vftable = comdat largest
 
-@some_name = private unnamed_addr constant [2 x i8*] zeroinitializer, comdat $vftable
+@some_name = private unnamed_addr constant [2 x i8*] zeroinitializer, comdat($vftable)
 @vftable = alias getelementptr([2 x i8*]* @some_name, i32 0, i32 1)
 
 ; CHECK: .section        .text,"xr",discard,_f1
@@ -73,20 +73,20 @@ $vftable = comdat largest
 ; CHECK: .globl  @v8@0
 ; CHECK: .section        .text,"xr",discard,@f8@0
 ; CHECK: .globl  @f8@0
-; CHECK: .section        .bss,"wb",associative,_f1
+; CHECK: .section        .bss,"bw",associative,_f1
 ; CHECK: .globl  _v1
-; CHECK: .section        .bss,"wb",associative,_f2
+; CHECK: .section        .bss,"bw",associative,_f2
 ; CHECK: .globl  _v2
-; CHECK: .section        .bss,"wb",associative,_f3
+; CHECK: .section        .bss,"bw",associative,_f3
 ; CHECK: .globl  _v3
-; CHECK: .section        .bss,"wb",associative,_f4
+; CHECK: .section        .bss,"bw",associative,_f4
 ; CHECK: .globl  _v4
-; CHECK: .section        .bss,"wb",associative,_f5
+; CHECK: .section        .bss,"bw",associative,_f5
 ; CHECK: .globl  _v5
-; CHECK: .section        .bss,"wb",associative,_f6
+; CHECK: .section        .bss,"bw",associative,_f6
 ; CHECK: .globl  _v6
-; CHECK: .section        .bss,"wb",same_size,_f6
+; CHECK: .section        .bss,"bw",same_size,_f6
 ; CHECK: .globl  _f6
-; CHECK: .section        .rdata,"rd",largest,_vftable
+; CHECK: .section        .rdata,"dr",largest,_vftable
 ; CHECK: .globl  _vftable
 ; CHECK: _vftable = L_some_name+4
diff --git a/test/CodeGen/X86/coff-comdat2.ll b/test/CodeGen/X86/coff-comdat2.ll
index 58bc04e..a417d09 100644
--- a/test/CodeGen/X86/coff-comdat2.ll
+++ b/test/CodeGen/X86/coff-comdat2.ll
@@ -5,5 +5,5 @@ target triple = "i686-pc-windows-msvc"
 
 $foo = comdat largest
 @foo = global i32 0
-@bar = global i32 0, comdat $foo
+@bar = global i32 0, comdat($foo)
 ; CHECK: Associative COMDAT symbol 'foo' is not a key for its COMDAT.
diff --git a/test/CodeGen/X86/coff-comdat3.ll b/test/CodeGen/X86/coff-comdat3.ll
index 76e464b..01651ce 100644
--- a/test/CodeGen/X86/coff-comdat3.ll
+++ b/test/CodeGen/X86/coff-comdat3.ll
@@ -4,5 +4,5 @@ target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 target triple = "i686-pc-windows-msvc"
 
 $foo = comdat largest
-@bar = global i32 0, comdat $foo
+@bar = global i32 0, comdat($foo)
 ; CHECK: Associative COMDAT symbol 'foo' does not exist.
diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll
index 59a7a19..bb46ac5 100644
--- a/test/CodeGen/X86/combine-and.ll
+++ b/test/CodeGen/X86/combine-and.ll
@@ -6,159 +6,173 @@
 
 
 define <4 x i32> @test1(<4 x i32> %A) {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 0>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test1
-; CHECK: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test2(<4 x i32> %A) {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 0>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test2
-; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test3(<4 x i32> %A) {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 0>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test3
-; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test4(<4 x i32> %A) {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 0, i32 0, i32 0, i32 -1>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test4
-; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test5(<4 x i32> %A) {
+; CHECK-LABEL: test5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test5
-; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test6(<4 x i32> %A) {
+; CHECK-LABEL: test6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test6
-; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test7(<4 x i32> %A) {
+; CHECK-LABEL: test7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 -1>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test7
-; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test8(<4 x i32> %A) {
+; CHECK-LABEL: test8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 -1>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test8
-; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test9(<4 x i32> %A) {
+; CHECK-LABEL: test9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 0>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test9
-; CHECK: movq %xmm0, %xmm0
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test10(<4 x i32> %A) {
+; CHECK-LABEL: test10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 0>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test10
-; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test11(<4 x i32> %A) {
+; CHECK-LABEL: test11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 -1>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test11
-; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test12(<4 x i32> %A) {
+; CHECK-LABEL: test12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 -1, i32 0>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test12
-; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test13(<4 x i32> %A) {
+; CHECK-LABEL: test13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 -1>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test13
-; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test14(<4 x i32> %A) {
+; CHECK-LABEL: test14:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1>
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test14
-; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: test15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1>
   %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 0>
   %3 = or <4 x i32> %1, %2
   ret <4 x i32> %3
 }
-; CHECK-LABEL: test15
-; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: test16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0>
   %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 -1>
   %3 = or <4 x i32> %1, %2
   ret <4 x i32> %3
 }
-; CHECK-LABEL: test16
-; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-NEXT: retq
-
 
 define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: test17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1>
   %2 = and <4 x i32> %B, <i32 -1, i32 0, i32 -1, i32 0>
   %3 = or <4 x i32> %1, %2
   ret <4 x i32> %3
 }
-; CHECK-LABEL: test17
-; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index 9539eae..8a0ffc1 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@@ -153,7 +153,8 @@ define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test13:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
@@ -177,8 +178,9 @@ define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
 define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test15:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
-; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,1,2,3]
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
@@ -206,12 +208,9 @@ define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
 define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test17:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    xorps %xmm2, %xmm2
-; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0]
-; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2]
-; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; CHECK-NEXT:    orps %xmm1, %xmm2
-; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    psllq $32, %xmm0
+; CHECK-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
+; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
@@ -223,10 +222,10 @@ define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test18:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    xorps %xmm2, %xmm2
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; CHECK-NEXT:    pxor %xmm2, %xmm2
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; CHECK-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; CHECK-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
 ; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
@@ -239,14 +238,12 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test19:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    xorps %xmm2, %xmm2
-; CHECK-NEXT:    xorps %xmm3, %xmm3
-; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,3]
-; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
-; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
-; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,2]
-; CHECK-NEXT:    orps %xmm3, %xmm2
-; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,3]
+; CHECK-NEXT:    pxor %xmm3, %xmm3
+; CHECK-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    por %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
@@ -258,8 +255,8 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
 define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test20:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    orps %xmm1, %xmm0
-; CHECK-NEXT:    movq %xmm0, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
@@ -271,9 +268,8 @@ define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
 define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test21:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    orps %xmm1, %xmm0
-; CHECK-NEXT:    movq %xmm0, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
diff --git a/test/CodeGen/X86/commute-clmul.ll b/test/CodeGen/X86/commute-clmul.ll
new file mode 100644
index 0000000..fe3e556
--- /dev/null
+++ b/test/CodeGen/X86/commute-clmul.ll
@@ -0,0 +1,60 @@
+; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+sse2,+pclmul < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+avx2,+pclmul < %s | FileCheck %s --check-prefix=AVX
+
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <2 x i64> @commute_lq_lq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
+  ;SSE-LABEL: commute_lq_lq
+  ;SSE:       pclmulqdq $0, (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_lq_lq
+  ;AVX:       vpclmulqdq $0, (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <2 x i64>* %a0
+  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %a1, i8 0)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @commute_lq_hq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
+  ;SSE-LABEL: commute_lq_hq
+  ;SSE:       pclmulqdq $1, (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_lq_hq
+  ;AVX:       vpclmulqdq $1, (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <2 x i64>* %a0
+  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %a1, i8 16)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @commute_hq_lq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
+  ;SSE-LABEL: commute_hq_lq
+  ;SSE:       pclmulqdq $16, (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_hq_lq
+  ;AVX:       vpclmulqdq $16, (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <2 x i64>* %a0
+  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %a1, i8 1)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @commute_hq_hq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
+  ;SSE-LABEL: commute_hq_hq
+  ;SSE:       pclmulqdq $17, (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_hq_hq
+  ;AVX:       vpclmulqdq $17, (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <2 x i64>* %a0
+  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %a1, i8 17)
+  ret <2 x i64> %2
+}
diff --git a/test/CodeGen/X86/commute-fcmp.ll b/test/CodeGen/X86/commute-fcmp.ll
new file mode 100644
index 0000000..0d7f2af
--- /dev/null
+++ b/test/CodeGen/X86/commute-fcmp.ll
@@ -0,0 +1,340 @@
+; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX
+
+;
+; Float Comparisons
+; Only equal/not-equal/ordered/unordered can be safely commuted
+;
+
+define <4 x i32> @commute_cmpps_eq(<4 x float>* %a0, <4 x float> %a1) #0 {
+  ;SSE-LABEL: commute_cmpps_eq
+  ;SSE:       cmpeqps (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmpps_eq
+  ;AVX:       vcmpeqps (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x float>* %a0
+  %2 = fcmp oeq <4 x float> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @commute_cmpps_ne(<4 x float>* %a0, <4 x float> %a1) #0 {
+  ;SSE-LABEL: commute_cmpps_ne
+  ;SSE:       cmpneqps (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmpps_ne
+  ;AVX:       vcmpneqps (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x float>* %a0
+  %2 = fcmp une <4 x float> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @commute_cmpps_ord(<4 x float>* %a0, <4 x float> %a1) #0 {
+  ;SSE-LABEL: commute_cmpps_ord
+  ;SSE:       cmpordps (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmpps_ord
+  ;AVX:       vcmpordps (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x float>* %a0
+  %2 = fcmp ord <4 x float> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @commute_cmpps_uno(<4 x float>* %a0, <4 x float> %a1) #0 {
+  ;SSE-LABEL: commute_cmpps_uno
+  ;SSE:       cmpunordps (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmpps_uno
+  ;AVX:       vcmpunordps (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x float>* %a0
+  %2 = fcmp uno <4 x float> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @commute_cmpps_lt(<4 x float>* %a0, <4 x float> %a1) #0 {
+  ;SSE-LABEL: commute_cmpps_lt
+  ;SSE:       movaps (%rdi), %xmm1
+  ;SSE-NEXT:  cmpltps %xmm0, %xmm1
+  ;SSE-NEXT:  movaps %xmm1, %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmpps_lt
+  ;AVX:       vmovaps (%rdi), %xmm1
+  ;AVX-NEXT:  vcmpltps %xmm0, %xmm1, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x float>* %a0
+  %2 = fcmp olt <4 x float> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @commute_cmpps_le(<4 x float>* %a0, <4 x float> %a1) #0 {
+  ;SSE-LABEL: commute_cmpps_le
+  ;SSE:       movaps (%rdi), %xmm1
+  ;SSE-NEXT:  cmpleps %xmm0, %xmm1
+  ;SSE-NEXT:  movaps %xmm1, %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmpps_le
+  ;AVX:       vmovaps (%rdi), %xmm1
+  ;AVX-NEXT:  vcmpleps %xmm0, %xmm1, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x float>* %a0
+  %2 = fcmp ole <4 x float> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <8 x i32> @commute_cmpps_eq_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
+  ;AVX-LABEL: commute_cmpps_eq_ymm
+  ;AVX:       vcmpeqps (%rdi), %ymm0, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <8 x float>* %a0
+  %2 = fcmp oeq <8 x float> %1, %a1
+  %3 = sext <8 x i1> %2 to <8 x i32>
+  ret <8 x i32> %3
+}
+
+define <8 x i32> @commute_cmpps_ne_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
+  ;AVX-LABEL: commute_cmpps_ne_ymm
+  ;AVX:       vcmpneqps (%rdi), %ymm0, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <8 x float>* %a0
+  %2 = fcmp une <8 x float> %1, %a1
+  %3 = sext <8 x i1> %2 to <8 x i32>
+  ret <8 x i32> %3
+}
+
+define <8 x i32> @commute_cmpps_ord_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
+  ;AVX-LABEL: commute_cmpps_ord_ymm
+  ;AVX:       vcmpordps (%rdi), %ymm0, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <8 x float>* %a0
+  %2 = fcmp ord <8 x float> %1, %a1
+  %3 = sext <8 x i1> %2 to <8 x i32>
+  ret <8 x i32> %3
+}
+
+define <8 x i32> @commute_cmpps_uno_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
+  ;AVX-LABEL: commute_cmpps_uno_ymm
+  ;AVX:       vcmpunordps (%rdi), %ymm0, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <8 x float>* %a0
+  %2 = fcmp uno <8 x float> %1, %a1
+  %3 = sext <8 x i1> %2 to <8 x i32>
+  ret <8 x i32> %3
+}
+
+define <8 x i32> @commute_cmpps_lt_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
+  ;AVX-LABEL: commute_cmpps_lt_ymm
+  ;AVX:       vmovaps (%rdi), %ymm1
+  ;AVX-NEXT:  vcmpltps %ymm0, %ymm1, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <8 x float>* %a0
+  %2 = fcmp olt <8 x float> %1, %a1
+  %3 = sext <8 x i1> %2 to <8 x i32>
+  ret <8 x i32> %3
+}
+
+define <8 x i32> @commute_cmpps_le_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
+  ;AVX-LABEL: commute_cmpps_le_ymm
+  ;AVX:       vmovaps (%rdi), %ymm1
+  ;AVX-NEXT:  vcmpleps %ymm0, %ymm1, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <8 x float>* %a0
+  %2 = fcmp ole <8 x float> %1, %a1
+  %3 = sext <8 x i1> %2 to <8 x i32>
+  ret <8 x i32> %3
+}
+
+;
+; Double Comparisons
+; Only equal/not-equal/ordered/unordered can be safely commuted
+;
+
+define <2 x i64> @commute_cmppd_eq(<2 x double>* %a0, <2 x double> %a1) #0 {
+  ;SSE-LABEL: commute_cmppd_eq
+  ;SSE:       cmpeqpd (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmppd_eq
+  ;AVX:       vcmpeqpd (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <2 x double>* %a0
+  %2 = fcmp oeq <2 x double> %1, %a1
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @commute_cmppd_ne(<2 x double>* %a0, <2 x double> %a1) #0 {
+  ;SSE-LABEL: commute_cmppd_ne
+  ;SSE:       cmpneqpd (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmppd_ne
+  ;AVX:       vcmpneqpd (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <2 x double>* %a0
+  %2 = fcmp une <2 x double> %1, %a1
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @commute_cmppd_ord(<2 x double>* %a0, <2 x double> %a1) #0 {
+  ;SSE-LABEL: commute_cmppd_ord
+  ;SSE:       cmpordpd (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmppd_ord
+  ;AVX:       vcmpordpd (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <2 x double>* %a0
+  %2 = fcmp ord <2 x double> %1, %a1
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @commute_cmppd_uno(<2 x double>* %a0, <2 x double> %a1) #0 {
+  ;SSE-LABEL: commute_cmppd_uno
+  ;SSE:       cmpunordpd (%rdi), %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmppd_uno
+  ;AVX:       vcmpunordpd (%rdi), %xmm0, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <2 x double>* %a0
+  %2 = fcmp uno <2 x double> %1, %a1
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @commute_cmppd_lt(<2 x double>* %a0, <2 x double> %a1) #0 {
+  ;SSE-LABEL: commute_cmppd_lt
+  ;SSE:       movapd (%rdi), %xmm1
+  ;SSE-NEXT:  cmpltpd %xmm0, %xmm1
+  ;SSE-NEXT:  movapd %xmm1, %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmppd_lt
+  ;AVX:       vmovapd (%rdi), %xmm1
+  ;AVX-NEXT:  vcmpltpd %xmm0, %xmm1, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <2 x double>* %a0
+  %2 = fcmp olt <2 x double> %1, %a1
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @commute_cmppd_le(<2 x double>* %a0, <2 x double> %a1) #0 {
+  ;SSE-LABEL: commute_cmppd_le
+  ;SSE:       movapd (%rdi), %xmm1
+  ;SSE-NEXT:  cmplepd %xmm0, %xmm1
+  ;SSE-NEXT:  movapd %xmm1, %xmm0
+  ;SSE-NEXT:  retq
+
+  ;AVX-LABEL: commute_cmppd_le
+  ;AVX:       vmovapd (%rdi), %xmm1
+  ;AVX-NEXT:  vcmplepd %xmm0, %xmm1, %xmm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <2 x double>* %a0
+  %2 = fcmp ole <2 x double> %1, %a1
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <4 x i64> @commute_cmppd_eq_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
+  ;AVX-LABEL: commute_cmppd_eq
+  ;AVX:       vcmpeqpd (%rdi), %ymm0, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x double>* %a0
+  %2 = fcmp oeq <4 x double> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i64>
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_cmppd_ne_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
+  ;AVX-LABEL: commute_cmppd_ne
+  ;AVX:       vcmpneqpd (%rdi), %ymm0, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x double>* %a0
+  %2 = fcmp une <4 x double> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i64>
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_cmppd_ord_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
+  ;AVX-LABEL: commute_cmppd_ord
+  ;AVX:       vcmpordpd (%rdi), %ymm0, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x double>* %a0
+  %2 = fcmp ord <4 x double> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i64>
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_cmppd_uno_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
+  ;AVX-LABEL: commute_cmppd_uno
+  ;AVX:       vcmpunordpd (%rdi), %ymm0, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x double>* %a0
+  %2 = fcmp uno <4 x double> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i64>
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_cmppd_lt_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
+  ;AVX-LABEL: commute_cmppd_lt
+  ;AVX:       vmovapd (%rdi), %ymm1
+  ;AVX-NEXT:  vcmpltpd %ymm0, %ymm1, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x double>* %a0
+  %2 = fcmp olt <4 x double> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i64>
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_cmppd_le_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
+  ;AVX-LABEL: commute_cmppd_le
+  ;AVX:       vmovapd (%rdi), %ymm1
+  ;AVX-NEXT:  vcmplepd %ymm0, %ymm1, %ymm0
+  ;AVX-NEXT:  retq
+
+  %1 = load <4 x double>* %a0
+  %2 = fcmp ole <4 x double> %1, %a1
+  %3 = sext <4 x i1> %2 to <4 x i64>
+  ret <4 x i64> %3
+}
diff --git a/test/CodeGen/X86/commute-xop.ll b/test/CodeGen/X86/commute-xop.ll
new file mode 100644
index 0000000..a3e14fe
--- /dev/null
+++ b/test/CodeGen/X86/commute-xop.ll
@@ -0,0 +1,184 @@
+; RUN: llc -O3 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+xop < %s | FileCheck %s
+
+define <16 x i8> @commute_fold_vpcomb(<16 x i8>* %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: commute_fold_vpcomb
+  ;CHECK:       vpcomgtb (%rdi), %xmm0, %xmm0
+  %1 = load <16 x i8>* %a0
+  %2 = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %1, <16 x i8> %a1, i8 0) ; vpcomltb
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <4 x i32> @commute_fold_vpcomd(<4 x i32>* %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: commute_fold_vpcomd
+  ;CHECK:       vpcomged (%rdi), %xmm0, %xmm0
+  %1 = load <4 x i32>* %a0
+  %2 = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %1, <4 x i32> %a1, i8 1) ; vpcomled
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @commute_fold_vpcomq(<2 x i64>* %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: commute_fold_vpcomq
+  ;CHECK:       vpcomltq (%rdi), %xmm0, %xmm0
+  %1 = load <2 x i64>* %a0
+  %2 = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %1, <2 x i64> %a1, i8 2) ; vpcomgtq
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <16 x i8> @commute_fold_vpcomub(<16 x i8>* %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: commute_fold_vpcomub
+  ;CHECK:       vpcomleub (%rdi), %xmm0, %xmm0
+  %1 = load <16 x i8>* %a0
+  %2 = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %1, <16 x i8> %a1, i8 3) ; vpcomgeub
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <4 x i32> @commute_fold_vpcomud(<4 x i32>* %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: commute_fold_vpcomud
+  ;CHECK:       vpcomequd (%rdi), %xmm0, %xmm0
+  %1 = load <4 x i32>* %a0
+  %2 = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %1, <4 x i32> %a1, i8 4) ; vpcomequd
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @commute_fold_vpcomuq(<2 x i64>* %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: commute_fold_vpcomuq
+  ;CHECK:       vpcomnequq (%rdi), %xmm0, %xmm0
+  %1 = load <2 x i64>* %a0
+  %2 = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %1, <2 x i64> %a1, i8 5) ; vpcomnequq
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <8 x i16> @commute_fold_vpcomuw(<8 x i16>* %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: commute_fold_vpcomuw
+  ;CHECK:       vpcomfalseuw (%rdi), %xmm0, %xmm0
+  %1 = load <8 x i16>* %a0
+  %2 = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %1, <8 x i16> %a1, i8 6) ; vpcomfalseuw
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <8 x i16> @commute_fold_vpcomw(<8 x i16>* %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: commute_fold_vpcomw
+  ;CHECK:       vpcomtruew (%rdi), %xmm0, %xmm0
+  %1 = load <8 x i16>* %a0
+  %2 = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %1, <8 x i16> %a1, i8 7) ; vpcomtruew
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <4 x i32> @commute_fold_vpmacsdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmacsdd
+  ;CHECK:       vpmacsdd %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <4 x i32>* %a0
+  %2 = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @commute_fold_vpmacsdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmacsdqh
+  ;CHECK:       vpmacsdqh %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <4 x i32>* %a0
+  %2 = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @commute_fold_vpmacsdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmacsdql
+  ;CHECK:       vpmacsdql %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <4 x i32>* %a0
+  %2 = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <4 x i32> @commute_fold_vpmacssdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmacssdd
+  ;CHECK:       vpmacssdd %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <4 x i32>* %a0
+  %2 = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @commute_fold_vpmacssdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmacssdqh
+  ;CHECK:       vpmacssdqh %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <4 x i32>* %a0
+  %2 = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @commute_fold_vpmacssdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmacssdql
+  ;CHECK:       vpmacssdql %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <4 x i32>* %a0
+  %2 = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <4 x i32> @commute_fold_vpmacsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmacsswd
+  ;CHECK:       vpmacsswd %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <8 x i16>* %a0
+  %2 = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @commute_fold_vpmacssww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmacssww
+  ;CHECK:       vpmacssww %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <8 x i16>* %a0
+  %2 = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @commute_fold_vpmacswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmacswd
+  ;CHECK:       vpmacswd %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <8 x i16>* %a0
+  %2 = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @commute_fold_vpmacsww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmacsww
+  ;CHECK:       vpmacsww %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <8 x i16>* %a0
+  %2 = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @commute_fold_vpmadcsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmadcsswd
+  ;CHECK:       vpmadcsswd %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <8 x i16>* %a0
+  %2 = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @commute_fold_vpmadcswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: commute_fold_vpmadcswd
+  ;CHECK:       vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0
+  %1 = load <8 x i16>* %a0
+  %2 = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+
+
diff --git a/test/CodeGen/X86/compact-unwind.ll b/test/CodeGen/X86/compact-unwind.ll
index 9d3a125..d3b89a5 100644
--- a/test/CodeGen/X86/compact-unwind.ll
+++ b/test/CodeGen/X86/compact-unwind.ll
@@ -1,12 +1,20 @@
 ; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 | FileCheck -check-prefix=ASM %s
 ; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 -filetype=obj -o - \
-; RUN:  | llvm-objdump -triple x86_64-apple-darwin11 -s - \
+; RUN:  | llvm-objdump -triple x86_64-apple-darwin11 -unwind-info - \
 ; RUN:  | FileCheck -check-prefix=CU %s
 ; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 \
 ; RUN:  | llvm-mc -triple x86_64-apple-darwin11 -filetype=obj -o - \
-; RUN:  | llvm-objdump -triple x86_64-apple-darwin11 -s - \
+; RUN:  | llvm-objdump -triple x86_64-apple-darwin11 -unwind-info - \
 ; RUN:  | FileCheck -check-prefix=FROM-ASM %s
 
+; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -mcpu corei7 -filetype=obj -o - \
+; RUN:  | llvm-objdump -triple x86_64-apple-macosx10.8.0 -unwind-info - \
+; RUN:  | FileCheck -check-prefix=NOFP-CU %s
+; RUN: llc < %s -mtriple x86_64-apple-darwin11 -mcpu corei7 \
+; RUN:  | llvm-mc -triple x86_64-apple-darwin11 -filetype=obj -o - \
+; RUN:  | llvm-objdump -triple x86_64-apple-darwin11 -unwind-info - \
+; RUN:  | FileCheck -check-prefix=NOFP-FROM-ASM %s
+
 %ty = type { i8* }
 
 @gv = external global i32
@@ -17,15 +25,19 @@
 ; Even though we can't encode %rax into the compact unwind, We still want to be
 ; able to generate a compact unwind encoding in this particular case.
 
-; CU:      Contents of section __compact_unwind:
-; CU-NEXT: 0020 00000000 00000000 1e000000 01000101
-; CU-NEXT: 0030 00000000 00000000 00000000 00000000
+; CU:    Contents of __compact_unwind section:
+; CU-NEXT:      Entry at offset 0x0:
+; CU-NEXT:        start:                0x0 _test0
+; CU-NEXT:        length:               0x1e
+; CU-NEXT:        compact encoding:     0x01010001
 
-; FROM-ASM:      Contents of section __compact_unwind:
-; FROM-ASM-NEXT: 0020 00000000 00000000 1e000000 01000101
-; FROM-ASM-NEXT: 0030 00000000 00000000 00000000 00000000
+; FROM-ASM:    Contents of __compact_unwind section:
+; FROM-ASM-NEXT:      Entry at offset 0x0:
+; FROM-ASM-NEXT:        start:                0x0 _test0
+; FROM-ASM-NEXT:        length:               0x1e
+; FROM-ASM-NEXT:        compact encoding:     0x01010001
 
-define i8* @foo(i64 %size) {
+define i8* @test0(i64 %size) {
   %addr = alloca i64, align 8
   %tmp20 = load i32* @gv, align 4
   %tmp21 = call i32 @bar()
@@ -39,3 +51,61 @@ define i8* @foo(i64 %size) {
 }
 
 declare i32 @bar()
+
+%"struct.dyld::MappedRanges" = type { [400 x %struct.anon], %"struct.dyld::MappedRanges"* }
+%struct.anon = type { %class.ImageLoader*, i64, i64 }
+%class.ImageLoader = type { i32 (...)**, i8*, i8*, i32, i64, i64, i32, i32, %"struct.ImageLoader::recursive_lock"*, i16, i16, [4 x i8] }
+%"struct.ImageLoader::recursive_lock" = type { i32, i32 }
+
+@G1 = external hidden global %"struct.dyld::MappedRanges", align 8
+
+declare void @OSMemoryBarrier() optsize
+
+; Test the code below uses UNWIND_X86_64_MODE_STACK_IMMD compact unwind
+; encoding.
+
+; NOFP-CU:      Entry at offset 0x20:
+; NOFP-CU-NEXT:        start:                0x1d _test1
+; NOFP-CU-NEXT:        length:               0x42
+; NOFP-CU-NEXT:        compact encoding:     0x02040c0a
+
+; NOFP-FROM-ASM:      Entry at offset 0x20:
+; NOFP-FROM-ASM-NEXT:        start:                0x1d _test1
+; NOFP-FROM-ASM-NEXT:        length:               0x42
+; NOFP-FROM-ASM-NEXT:        compact encoding:     0x02040c0a
+
+define void @test1(%class.ImageLoader* %image) optsize ssp uwtable {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc10, %entry
+  %p.019 = phi %"struct.dyld::MappedRanges"* [ @G1, %entry ], [ %1, %for.inc10 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.inc, %for.cond1.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ]
+  %image4 = getelementptr inbounds %"struct.dyld::MappedRanges"* %p.019, i64 0, i32 0, i64 %indvars.iv, i32 0
+  %0 = load %class.ImageLoader** %image4, align 8
+  %cmp5 = icmp eq %class.ImageLoader* %0, %image
+  br i1 %cmp5, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body3
+  tail call void @OSMemoryBarrier() optsize
+  store %class.ImageLoader* null, %class.ImageLoader** %image4, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body3
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 400
+  br i1 %exitcond, label %for.inc10, label %for.body3
+
+for.inc10:                                        ; preds = %for.inc
+  %next = getelementptr inbounds %"struct.dyld::MappedRanges"* %p.019, i64 0, i32 1
+  %1 = load %"struct.dyld::MappedRanges"** %next, align 8
+  %cmp = icmp eq %"struct.dyld::MappedRanges"* %1, null
+  br i1 %cmp, label %for.end11, label %for.cond1.preheader
+
+for.end11:                                        ; preds = %for.inc10
+  ret void
+}
diff --git a/test/CodeGen/X86/constant-combines.ll b/test/CodeGen/X86/constant-combines.ll
new file mode 100644
index 0000000..d2a6ef4
--- /dev/null
+++ b/test/CodeGen/X86/constant-combines.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define void @PR22524({ float, float }* %arg) {
+; Check that we can materialize the zero constants we store in two places here,
+; and at least form a legal store of the floating point value at the end.
+; The DAG combiner at one point contained bugs that given enough permutations
+; would incorrectly form an illegal operation for the last of these stores when
+; it folded it to a zero too late to legalize the zero store operation. If this
+; ever starts forming a zero store instead of movss, the test case has stopped
+; being useful.
+; 
+; CHECK-LABEL: PR22524:
+entry:
+  %0 = getelementptr inbounds { float, float }* %arg,  i32 0, i32 1
+  store float 0.000000e+00, float* %0, align 4
+; CHECK: movl $0, 4(%rdi)
+
+  %1 = getelementptr inbounds { float, float }* %arg, i64 0,  i32 0
+  %2 = bitcast float* %1 to i64*
+  %3 = load i64* %2, align 8
+  %4 = trunc i64 %3 to i32
+  %5 = lshr i64 %3, 32
+  %6 = trunc i64 %5 to i32
+  %7 = bitcast i32 %6 to float
+  %8 = fmul float %7, 0.000000e+00
+  %9 = bitcast float* %1 to i32*
+  store i32 %6, i32* %9, align 4
+; CHECK: movl $0, (%rdi)
+  store float %8, float* %0, align 4
+; CHECK: movss %{{.*}}, 4(%rdi)
+  ret void
+}
diff --git a/test/CodeGen/X86/constant-hoisting-optnone.ll b/test/CodeGen/X86/constant-hoisting-optnone.ll
new file mode 100644
index 0000000..f61fe3f
--- /dev/null
+++ b/test/CodeGen/X86/constant-hoisting-optnone.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=generic | FileCheck %s
+;
+; Verify that pass 'Constant Hoisting' is not run on optnone functions.
+; Without optnone, Pass 'Constant Hoisting' would firstly hoist
+; constant 0xBEEBEEBEC, and then rebase the other constant
+; (i.e. constant 0xBEEBEEBF4) with respect to the previous one.
+; With optnone, we check that constants are not coalesced.
+
+define i64 @constant_hoisting_optnone() #0 {
+; CHECK-LABEL: @constant_hoisting_optnone
+; CHECK-DAG: movabsq {{.*#+}} imm = 0xBEEBEEBF4
+; CHECK-DAG: movabsq {{.*#+}} imm = 0xBEEBEEBEC
+; CHECK: ret
+entry:
+  %0 = load i64* inttoptr (i64 51250129900 to i64*)
+  %1 = load i64* inttoptr (i64 51250129908 to i64*)
+  %2 = add i64 %0, %1
+  ret i64 %2
+}
+
+attributes #0 = { optnone noinline }
diff --git a/test/CodeGen/X86/copysign-constant-magnitude.ll b/test/CodeGen/X86/copysign-constant-magnitude.ll
new file mode 100644
index 0000000..537d629
--- /dev/null
+++ b/test/CodeGen/X86/copysign-constant-magnitude.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define void @test_copysign_const_magnitude_d(double %X) {
+; CHECK: [[SIGNMASK:L.+]]:
+; CHECK-NEXT:   .quad -9223372036854775808    ## double -0.000000e+00
+; CHECK-NEXT:   .quad 0                       ## double 0.000000e+00
+; CHECK: [[ZERO:L.+]]:
+; CHECK-NEXT:   .space 16
+; CHECK: [[ONE:L.+]]:
+; CHECK-NEXT:   .quad 4607182418800017408     ## double 1.000000e+00
+; CHECK-NEXT:   .quad 0                       ## double 0.000000e+00
+; CHECK-LABEL: test_copysign_const_magnitude_d:
+
+; CHECK: id
+  %iX = call double @id_d(double %X)
+
+; CHECK-NEXT: andpd [[SIGNMASK]](%rip), %xmm0
+  %d0 = call double @copysign(double 0.000000e+00, double %iX)
+
+; CHECK-NEXT: id
+  %id0 = call double @id_d(double %d0)
+
+; CHECK-NEXT: andpd [[SIGNMASK]](%rip), %xmm0
+; CHECK-NEXT: orpd [[ZERO]](%rip), %xmm0
+  %dn0 = call double @copysign(double -0.000000e+00, double %id0)
+
+; CHECK-NEXT: id
+  %idn0 = call double @id_d(double %dn0)
+
+; CHECK-NEXT: andpd [[SIGNMASK]](%rip), %xmm0
+; CHECK-NEXT: orpd [[ONE]](%rip), %xmm0
+  %d1 = call double @copysign(double 1.000000e+00, double %idn0)
+
+; CHECK-NEXT: id
+  %id1 = call double @id_d(double %d1)
+
+; CHECK-NEXT: andpd [[SIGNMASK]](%rip), %xmm0
+; CHECK-NEXT: orpd [[ONE]](%rip), %xmm0
+  %dn1 = call double @copysign(double -1.000000e+00, double %id1)
+
+; CHECK-NEXT: id
+  %idn1 = call double @id_d(double %dn1)
+
+; CHECK: retq
+  ret void
+}
+
+define void @test_copysign_const_magnitude_f(float %X) {
+; CHECK: [[SIGNMASK:L.+]]:
+; CHECK-NEXT:   .long	2147483648              ## float -0.000000e+00
+; CHECK-NEXT:   .long	0                       ## float 0.000000e+00
+; CHECK-NEXT:   .long	0                       ## float 0.000000e+00
+; CHECK-NEXT:   .long	0                       ## float 0.000000e+00
+; CHECK: [[ZERO:L.+]]:
+; CHECK-NEXT:   .space 16
+; CHECK: [[ONE:L.+]]:
+; CHECK-NEXT:   .long	1065353216              ## float 1.000000e+00
+; CHECK-NEXT:   .long	0                       ## float 0.000000e+00
+; CHECK-NEXT:   .long	0                       ## float 0.000000e+00
+; CHECK-NEXT:   .long	0                       ## float 0.000000e+00
+; CHECK-LABEL: test_copysign_const_magnitude_f:
+
+; CHECK: id
+  %iX = call float @id_f(float %X)
+
+; CHECK-NEXT: andps [[SIGNMASK]](%rip), %xmm0
+  %d0 = call float @copysignf(float 0.000000e+00, float %iX)
+
+; CHECK-NEXT: id
+  %id0 = call float @id_f(float %d0)
+
+; CHECK-NEXT: andps [[SIGNMASK]](%rip), %xmm0
+; CHECK-NEXT: orps [[ZERO]](%rip), %xmm0
+  %dn0 = call float @copysignf(float -0.000000e+00, float %id0)
+
+; CHECK-NEXT: id
+  %idn0 = call float @id_f(float %dn0)
+
+; CHECK-NEXT: andps [[SIGNMASK]](%rip), %xmm0
+; CHECK-NEXT: orps [[ONE]](%rip), %xmm0
+  %d1 = call float @copysignf(float 1.000000e+00, float %idn0)
+
+; CHECK-NEXT: id
+  %id1 = call float @id_f(float %d1)
+
+; CHECK-NEXT: andps [[SIGNMASK]](%rip), %xmm0
+; CHECK-NEXT: orps [[ONE]](%rip), %xmm0
+  %dn1 = call float @copysignf(float -1.000000e+00, float %id1)
+
+; CHECK-NEXT: id
+  %idn1 = call float @id_f(float %dn1)
+
+; CHECK: retq
+  ret void
+}
+
+declare double @copysign(double, double) nounwind readnone
+declare float @copysignf(float, float) nounwind readnone
+
+; Dummy identity functions, so we always have xmm0, and prevent optimizations.
+declare double @id_d(double)
+declare float @id_f(float)
diff --git a/test/CodeGen/X86/copysign-zero.ll b/test/CodeGen/X86/copysign-zero.ll
deleted file mode 100644
index 47522d8..0000000
--- a/test/CodeGen/X86/copysign-zero.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s | not grep orpd
-; RUN: llc < %s | grep andpd | count 1
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-target triple = "x86_64-apple-darwin8"
-
-define double @test(double %X) nounwind  {
-entry:
-	%tmp2 = tail call double @copysign( double 0.000000e+00, double %X ) nounwind readnone 		; <double> [#uses=1]
-	ret double %tmp2
-}
-
-declare double @copysign(double, double) nounwind readnone 
-
diff --git a/test/CodeGen/X86/cppeh-catch-all.ll b/test/CodeGen/X86/cppeh-catch-all.ll
new file mode 100644
index 0000000..7a12b24
--- /dev/null
+++ b/test/CodeGen/X86/cppeh-catch-all.ll
@@ -0,0 +1,83 @@
+; RUN: opt -mtriple=x86_64-pc-windows-msvc -winehprepare -S -o - < %s | FileCheck %s
+
+; This test is based on the following code:
+;
+; void test()
+; {
+;   try {
+;     may_throw();
+;   } catch (...) {
+;     handle_exception();
+;   }
+; }
+;
+; Parts of the IR have been hand-edited to simplify the test case.
+; The full IR will be restored when Windows C++ EH support is complete.
+
+; ModuleID = 'catch-all.cpp'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+; Function Attrs: uwtable
+define void @_Z4testv() #0 {
+entry:
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  invoke void @_Z9may_throwv()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  br label %try.cont
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* null
+  %1 = extractvalue { i8*, i32 } %0, 0
+  store i8* %1, i8** %exn.slot
+  %2 = extractvalue { i8*, i32 } %0, 1
+  store i32 %2, i32* %ehselector.slot
+  br label %catch
+
+catch:                                            ; preds = %lpad
+  %exn = load i8** %exn.slot
+  %3 = call i8* @llvm.eh.begincatch(i8* %exn) #3
+  call void @_Z16handle_exceptionv()
+  br label %invoke.cont2
+
+invoke.cont2:                                     ; preds = %catch
+  call void @llvm.eh.endcatch()
+  br label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont2, %invoke.cont
+  ret void
+}
+
+; CHECK: define i8* @_Z4testv.catch(i8*, i8*) {
+; CHECK: catch.entry:
+; CHECK:   %eh.alloc = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1)
+; CHECK:   %eh.data = bitcast i8* %eh.alloc to %struct._Z4testv.ehdata*
+; CHECK:   %eh.obj.ptr = getelementptr inbounds %struct._Z4testv.ehdata* %eh.data, i32 0, i32 1
+; CHECK:   %eh.obj = load i8** %eh.obj.ptr
+; CHECK:   call void @_Z16handle_exceptionv()
+; CHECK:   ret i8* blockaddress(@_Z4testv, %try.cont)
+; CHECK: }
+
+declare void @_Z9may_throwv() #1
+
+declare i32 @__CxxFrameHandler3(...)
+
+declare i8* @llvm.eh.begincatch(i8*)
+
+declare void @_Z16handle_exceptionv() #1
+
+declare void @llvm.eh.endcatch()
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noinline noreturn nounwind }
+attributes #3 = { nounwind }
+attributes #4 = { noreturn nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.7.0 (trunk 226027)"}
diff --git a/test/CodeGen/X86/cppeh-catch-scalar.ll b/test/CodeGen/X86/cppeh-catch-scalar.ll
new file mode 100644
index 0000000..fd5df6c
--- /dev/null
+++ b/test/CodeGen/X86/cppeh-catch-scalar.ll
@@ -0,0 +1,123 @@
+; RUN: opt -mtriple=x86_64-pc-windows-msvc -winehprepare -S -o - < %s | FileCheck %s
+
+; This test is based on the following code:
+;
+; void test()
+; {
+;   try {
+;     may_throw();
+;   } catch (int i) {
+;     handle_int(i);
+;   }
+; }
+;
+; Parts of the IR have been hand-edited to simplify the test case.
+; The full IR will be restored when Windows C++ EH support is complete.
+
+;ModuleID = 'cppeh-catch-scalar.cpp'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+; This is the structure that will get created for the frame allocation.
+; CHECK: %struct._Z4testv.ehdata = type { i32, i8*, i32 }
+
+@_ZTIi = external constant i8*
+
+; The function entry will be rewritten like this.
+; CHECK: define void @_Z4testv() #0 {
+; CHECK: entry:
+; CHECK:   %frame.alloc = call i8* @llvm.frameallocate(i32 24)
+; CHECK:   %eh.data = bitcast i8* %frame.alloc to %struct._Z4testv.ehdata*
+; CHECK:   %exn.slot = alloca i8*
+; CHECK:   %ehselector.slot = alloca i32
+; CHECK-NOT:  %i = alloca i32, align 4
+; CHECK:  %i = getelementptr inbounds %struct._Z4testv.ehdata* %eh.data, i32 0, i32 2
+
+; Function Attrs: uwtable
+define void @_Z4testv() #0 {
+entry:
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  %i = alloca i32, align 4
+  invoke void @_Z9may_throwv()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  br label %try.cont
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %1 = extractvalue { i8*, i32 } %0, 0
+  store i8* %1, i8** %exn.slot
+  %2 = extractvalue { i8*, i32 } %0, 1
+  store i32 %2, i32* %ehselector.slot
+  br label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %lpad
+  %sel = load i32* %ehselector.slot
+  %3 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) #3
+  %matches = icmp eq i32 %sel, %3
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %catch.dispatch
+  %exn11 = load i8** %exn.slot
+  %4 = call i8* @llvm.eh.begincatch(i8* %exn11) #3
+  %5 = bitcast i8* %4 to i32*
+  %6 = load i32* %5, align 4
+  store i32 %6, i32* %i, align 4
+  %7 = load i32* %i, align 4
+  call void @_Z10handle_inti(i32 %7)
+  br label %invoke.cont2
+
+invoke.cont2:                                     ; preds = %catch
+  call void @llvm.eh.endcatch() #3
+  br label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont2, %invoke.cont
+  ret void
+
+eh.resume:                                        ; preds = %catch.dispatch
+  %exn3 = load i8** %exn.slot
+  %sel4 = load i32* %ehselector.slot
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn3, 0
+  %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %sel4, 1
+  resume { i8*, i32 } %lpad.val5
+}
+
+; CHECK: define i8* @_Z4testv.catch(i8*, i8*) {
+; CHECK: catch.entry:
+; CHECK:   %eh.alloc = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1)
+; CHECK:   %eh.data = bitcast i8* %eh.alloc to %struct._Z4testv.ehdata*
+; CHECK:   %eh.obj.ptr = getelementptr inbounds %struct._Z4testv.ehdata* %eh.data, i32 0, i32 1
+; CHECK:   %eh.obj = load i8** %eh.obj.ptr
+; CHECK:   %i = getelementptr inbounds %struct._Z4testv.ehdata* %eh.data, i32 0, i32 2
+; CHECK:   %2 = bitcast i8* %eh.obj to i32*
+; CHECK:   %3 = load i32* %2, align 4
+; CHECK:   store i32 %3, i32* %i, align 4
+; CHECK:   %4 = load i32* %i, align 4
+; CHECK:   call void @_Z10handle_inti(i32 %4)
+; CHECK:   ret i8* blockaddress(@_Z4testv, %try.cont)
+; CHECK: }
+
+declare void @_Z9may_throwv() #1
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*) #2
+
+declare i8* @llvm.eh.begincatch(i8*)
+
+declare void @llvm.eh.endcatch()
+
+declare void @_Z10handle_inti(i32) #1
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.7.0 (trunk 227474) (llvm/trunk 227508)"}
diff --git a/test/CodeGen/X86/cppeh-frame-vars.ll b/test/CodeGen/X86/cppeh-frame-vars.ll
new file mode 100644
index 0000000..667f133
--- /dev/null
+++ b/test/CodeGen/X86/cppeh-frame-vars.ll
@@ -0,0 +1,261 @@
+; RUN: opt -mtriple=x86_64-pc-windows-msvc -winehprepare -S -o - < %s | FileCheck %s
+
+; This test is based on the following code:
+;
+; struct SomeData {
+;   int a;
+;   int b;
+; };
+; 
+; void may_throw();
+; void does_not_throw(int i);
+; void dump(int *, int, SomeData&);
+; 
+; void test() {
+;   int NumExceptions = 0;
+;   int ExceptionVal[10];
+;   SomeData Data = { 0, 0 };
+; 
+;   for (int i = 0; i < 10; ++i) {
+;     try {
+;       may_throw();
+;       Data.a += i;
+;     }
+;     catch (int e) {
+;       ExceptionVal[NumExceptions] = e;
+;       ++NumExceptions;
+;       if (e == i)
+;         Data.b += e;
+;       else
+;         Data.a += e;
+;     }
+;     does_not_throw(NumExceptions);
+;   }
+;   dump(ExceptionVal, NumExceptions, Data);
+; }
+
+; ModuleID = 'cppeh-frame-vars.cpp'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+%struct.SomeData = type { i32, i32 }
+
+; This structure should be declared for the frame allocation block.
+; CHECK: %"struct.\01?test@@YAXXZ.ehdata" = type { i32, i8*, i32, i32, [10 x i32], i32, %struct.SomeData }
+
+$"\01??_R0H@8" = comdat any
+
+@"\01??_7type_info@@6B@" = external constant i8*
+@"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+
+; The function entry should be rewritten like this.
+; CHECK: define void @"\01?test@@YAXXZ"() #0 {
+; CHECK: entry:
+; CHECK:   %frame.alloc = call i8* @llvm.frameallocate(i32 80)
+; CHECK:   %eh.data = bitcast i8* %frame.alloc to %"struct.\01?test@@YAXXZ.ehdata"*
+; CHECK-NOT:  %NumExceptions = alloca i32, align 4
+; CHECK:   %NumExceptions = getelementptr inbounds %"struct.\01?test@@YAXXZ.ehdata"* %eh.data, i32 0, i32 3
+; CHECK-NOT:  %ExceptionVal = alloca [10 x i32], align 16
+; CHECK:   %ExceptionVal = getelementptr inbounds %"struct.\01?test@@YAXXZ.ehdata"* %eh.data, i32 0, i32 4
+; CHECK-NOT:  %Data = alloca %struct.SomeData, align 4
+; CHECK:   %Data = getelementptr inbounds %"struct.\01?test@@YAXXZ.ehdata"* %eh.data, i32 0, i32 6
+; CHECK:   %i = getelementptr inbounds %"struct.\01?test@@YAXXZ.ehdata"* %eh.data, i32 0, i32 5
+; CHECK:   %exn.slot = alloca i8*
+; CHECK:   %ehselector.slot = alloca i32
+; CHECK-NOT:  %e = alloca i32, align 4
+; CHECK:   %e = getelementptr inbounds %"struct.\01?test@@YAXXZ.ehdata"* %eh.data, i32 0, i32 2
+
+; Function Attrs: uwtable
+define void @"\01?test@@YAXXZ"() #0 {
+entry:
+  %NumExceptions = alloca i32, align 4
+  %ExceptionVal = alloca [10 x i32], align 16
+  %Data = alloca %struct.SomeData, align 4
+  %i = alloca i32, align 4
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  %e = alloca i32, align 4
+  store i32 0, i32* %NumExceptions, align 4
+  %0 = bitcast %struct.SomeData* %Data to i8*
+  call void @llvm.memset(i8* %0, i8 0, i64 8, i32 4, i1 false)
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %1 = load i32* %i, align 4
+  %cmp = icmp slt i32 %1, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  invoke void @"\01?may_throw@@YAXXZ"()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %for.body
+  %2 = load i32* %i, align 4
+  %a = getelementptr inbounds %struct.SomeData* %Data, i32 0, i32 0
+  %3 = load i32* %a, align 4
+  %add = add nsw i32 %3, %2
+  store i32 %add, i32* %a, align 4
+  br label %try.cont
+
+lpad:                                             ; preds = %for.body
+  %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+          catch i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)
+  %5 = extractvalue { i8*, i32 } %4, 0
+  store i8* %5, i8** %exn.slot
+  %6 = extractvalue { i8*, i32 } %4, 1
+  store i32 %6, i32* %ehselector.slot
+  br label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %lpad
+  %sel = load i32* %ehselector.slot
+  %7 = call i32 @llvm.eh.typeid.for(i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*)) #1
+  %matches = icmp eq i32 %sel, %7
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %catch.dispatch
+  %exn = load i8** %exn.slot
+  %8 = call i8* @llvm.eh.begincatch(i8* %exn) #1
+  %9 = bitcast i8* %8 to i32*
+  %10 = load i32* %9, align 4
+  store i32 %10, i32* %e, align 4
+  %11 = load i32* %e, align 4
+  %12 = load i32* %NumExceptions, align 4
+  %idxprom = sext i32 %12 to i64
+  %arrayidx = getelementptr inbounds [10 x i32]* %ExceptionVal, i32 0, i64 %idxprom
+  store i32 %11, i32* %arrayidx, align 4
+  %13 = load i32* %NumExceptions, align 4
+  %inc = add nsw i32 %13, 1
+  store i32 %inc, i32* %NumExceptions, align 4
+  %14 = load i32* %e, align 4
+  %15 = load i32* %i, align 4
+  %cmp1 = icmp eq i32 %14, %15
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                          ; preds = %catch
+  %16 = load i32* %e, align 4
+  %b = getelementptr inbounds %struct.SomeData* %Data, i32 0, i32 1
+  %17 = load i32* %b, align 4
+  %add2 = add nsw i32 %17, %16
+  store i32 %add2, i32* %b, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %catch
+  %18 = load i32* %e, align 4
+  %a3 = getelementptr inbounds %struct.SomeData* %Data, i32 0, i32 0
+  %19 = load i32* %a3, align 4
+  %add4 = add nsw i32 %19, %18
+  store i32 %add4, i32* %a3, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  call void @llvm.eh.endcatch() #1
+  br label %try.cont
+
+try.cont:                                         ; preds = %if.end, %invoke.cont
+  %20 = load i32* %NumExceptions, align 4
+  call void @"\01?does_not_throw@@YAXH@Z"(i32 %20)
+  br label %for.inc
+
+for.inc:                                          ; preds = %try.cont
+  %21 = load i32* %i, align 4
+  %inc5 = add nsw i32 %21, 1
+  store i32 %inc5, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %22 = load i32* %NumExceptions, align 4
+  %arraydecay = getelementptr inbounds [10 x i32]* %ExceptionVal, i32 0, i32 0
+  call void @"\01?dump@@YAXPEAHHAEAUSomeData@@@Z"(i32* %arraydecay, i32 %22, %struct.SomeData* dereferenceable(8) %Data)
+  ret void
+
+eh.resume:                                        ; preds = %catch.dispatch
+  %exn6 = load i8** %exn.slot
+  %sel7 = load i32* %ehselector.slot
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn6, 0
+  %lpad.val8 = insertvalue { i8*, i32 } %lpad.val, i32 %sel7, 1
+  resume { i8*, i32 } %lpad.val8
+}
+
+; The following catch handler should be outlined.
+; CHECK: define i8* @"\01?test@@YAXXZ.catch"(i8*, i8*) {
+; CHECK: catch.entry:
+; CHECK:   %eh.alloc = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1)
+; CHECK:   %eh.data = bitcast i8* %eh.alloc to %"struct.\01?test@@YAXXZ.ehdata"*
+; CHECK:   %eh.obj.ptr = getelementptr inbounds %"struct.\01?test@@YAXXZ.ehdata"* %eh.data, i32 0, i32 1
+; CHECK:   %eh.obj = load i8** %eh.obj.ptr
+; CHECK:   %e = getelementptr inbounds %"struct.\01?test@@YAXXZ.ehdata"* %eh.data, i32 0, i32 2
+; CHECK:   %NumExceptions = getelementptr inbounds %"struct.\01?test@@YAXXZ.ehdata"* %eh.data, i32 0, i32 3
+; CHECK:   %ExceptionVal = getelementptr inbounds %"struct.\01?test@@YAXXZ.ehdata"* %eh.data, i32 0, i32 4
+; CHECK:   %i = getelementptr inbounds %"struct.\01?test@@YAXXZ.ehdata"* %eh.data, i32 0, i32 5
+; CHECK:   %Data = getelementptr inbounds %"struct.\01?test@@YAXXZ.ehdata"* %eh.data, i32 0, i32 6
+; CHECK:   %2 = bitcast i8* %eh.obj to i32*
+; CHECK:   %3 = load i32* %2, align 4
+; CHECK:   store i32 %3, i32* %e, align 4
+; CHECK:   %4 = load i32* %e, align 4
+; CHECK:   %5 = load i32* %NumExceptions, align 4
+; CHECK:   %idxprom = sext i32 %5 to i64
+; CHECK:   %arrayidx = getelementptr inbounds [10 x i32]* %ExceptionVal, i32 0, i64 %idxprom
+; CHECK:   store i32 %4, i32* %arrayidx, align 4
+; CHECK:   %6 = load i32* %NumExceptions, align 4
+; CHECK:   %inc = add nsw i32 %6, 1
+; CHECK:   store i32 %inc, i32* %NumExceptions, align 4
+; CHECK:   %7 = load i32* %e, align 4
+; CHECK:   %8 = load i32* %i, align 4
+; CHECK:   %cmp1 = icmp eq i32 %7, %8
+; CHECK:   br i1 %cmp1, label %if.then, label %if.else
+;
+; CHECK: if.then:                                          ; preds = %catch.entry
+; CHECK:   %9 = load i32* %e, align 4
+; CHECK:   %b = getelementptr inbounds %struct.SomeData* %Data, i32 0, i32 1
+; CHECK:   %10 = load i32* %b, align 4
+; CHECK:   %add2 = add nsw i32 %10, %9
+; CHECK:   store i32 %add2, i32* %b, align 4
+; CHECK:   br label %if.end
+;
+; CHECK: if.else:                                          ; preds = %catch.entry
+; CHECK:   %11 = load i32* %e, align 4
+; CHECK:   %a3 = getelementptr inbounds %struct.SomeData* %Data, i32 0, i32 0
+; CHECK:   %12 = load i32* %a3, align 4
+; CHECK:   %add4 = add nsw i32 %12, %11
+; CHECK:   store i32 %add4, i32* %a3, align 4
+; CHECK:   br label %if.end
+;
+; CHECK: if.end:                                           ; preds = %if.else, %if.then
+; CHECK:   ret i8* blockaddress(@"\01?test@@YAXXZ", %try.cont)
+; CHECK: }
+
+
+
+
+
+
+; Function Attrs: nounwind
+declare void @llvm.memset(i8* nocapture, i8, i64, i32, i1) #1
+
+declare void @"\01?may_throw@@YAXXZ"() #2
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*) #3
+
+declare i8* @llvm.eh.begincatch(i8*)
+
+declare void @llvm.eh.endcatch()
+
+declare void @"\01?does_not_throw@@YAXH@Z"(i32) #2
+
+declare void @"\01?dump@@YAXPEAHHAEAUSomeData@@@Z"(i32*, i32, %struct.SomeData* dereferenceable(8)) #2
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 3.7.0 (trunk 228868)"}
diff --git a/test/CodeGen/X86/cpus.ll b/test/CodeGen/X86/cpus.ll
new file mode 100644
index 0000000..ee1f7bb
--- /dev/null
+++ b/test/CodeGen/X86/cpus.ll
@@ -0,0 +1,35 @@
+; Test that the CPU names work.
+;
+; First ensure the error message matches what we expect.
+; CHECK-ERROR: not a recognized processor for this target
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=foobar 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+;
+; Now ensure the error message doesn't occur for valid CPUs.
+; CHECK-NO-ERROR-NOT: not a recognized processor for this target
+;
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=nocona 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=core2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=penryn 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=nehalem 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=westmere 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=sandybridge 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=ivybridge 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=haswell 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=broadwell 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bonnell 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=silvermont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k8 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=opteron 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon64 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-fx 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k8-sse3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=opteron-sse3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon64-sse3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=amdfam10 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=barcelona 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bdver1 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bdver2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bdver3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bdver4 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=btver1 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=btver2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
diff --git a/test/CodeGen/X86/crash-O0.ll b/test/CodeGen/X86/crash-O0.ll
index 956d43b..df8eaaf 100644
--- a/test/CodeGen/X86/crash-O0.ll
+++ b/test/CodeGen/X86/crash-O0.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -relocation-model=pic -disable-fp-elim < %s
+; RUN: llc -O0 -relocation-model=pic -disable-fp-elim < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10"
 
@@ -29,3 +29,23 @@ entry:
 "41":                                             ; preds = %"39"
   unreachable
 }
+
+; When using fast isel, sdiv is lowered into a sequence of CQO + DIV64.
+; CQO defines implicitly AX and DIV64 uses it implicitly too.
+; When an instruction gets between those two, RegAllocFast was reusing
+; AX for the vreg defined in between and the compiler crashed.
+;
+; An instruction gets between CQO and DIV64 because the load is folded
+; into the division but it requires a sign extension.
+; PR21700
+; CHECK-LABEL: addressModeWith32bitIndex:
+; CHECK: cqto
+; CHECK-NEXT: movslq
+; CHECK-NEXT: idivq
+; CHECK: retq
+define i64 @addressModeWith32bitIndex(i32 %V) {
+  %gep = getelementptr i64* null, i32 %V
+  %load = load i64* %gep
+  %sdiv = sdiv i64 0, %load
+  ret i64 %sdiv
+}
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index ee73377..6b3dd36 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -108,8 +108,8 @@ do.body92:                                        ; preds = %if.then66
   ret void
 }
 
-!0 = metadata !{i32 633550}
-!1 = metadata !{i32 634261}
+!0 = !{i32 633550}
+!1 = !{i32 634261}
 
 
 ; Crash during XOR optimization.
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index d0791dc..16d8f97 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -52,48 +52,48 @@ define void @_Z3barii(i32 %param1, i32 %param2) #0 {
 entry:
   %var1 = alloca %struct.AAA3, align 1
   %var2 = alloca %struct.AAA3, align 1
-  tail call void @llvm.dbg.value(metadata !{i32 %param1}, i64 0, metadata !30, metadata !{metadata !"0x102"}), !dbg !47
-  tail call void @llvm.dbg.value(metadata !{i32 %param2}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !47
-  tail call void @llvm.dbg.value(metadata !48, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !49
+  tail call void @llvm.dbg.value(metadata i32 %param1, i64 0, metadata !30, metadata !{!"0x102"}), !dbg !47
+  tail call void @llvm.dbg.value(metadata i32 %param2, i64 0, metadata !31, metadata !{!"0x102"}), !dbg !47
+  tail call void @llvm.dbg.value(metadata i8* null, i64 0, metadata !32, metadata !{!"0x102"}), !dbg !49
   %tobool = icmp eq i32 %param2, 0, !dbg !50
   br i1 %tobool, label %if.end, label %if.then, !dbg !50
 
 if.then:                                          ; preds = %entry
   %call = tail call i8* @_Z5i2stri(i32 %param2), !dbg !52
-  tail call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !49
+  tail call void @llvm.dbg.value(metadata i8* %call, i64 0, metadata !32, metadata !{!"0x102"}), !dbg !49
   br label %if.end, !dbg !54
 
 if.end:                                           ; preds = %entry, %if.then
-  tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33, metadata !{metadata !"0x102"}), !dbg !55
-  tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !56, metadata !{metadata !"0x102"}), !dbg !57
-  tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59, metadata !{metadata !"0x102"}), !dbg !60
+  tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !33, metadata !{!"0x102"}), !dbg !55
+  tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !56, metadata !{!"0x102"}), !dbg !57
+  tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59, metadata !{!"0x102"}), !dbg !60
   %arraydecay.i = getelementptr inbounds %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !61
   call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !61
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !63
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !64, metadata !{metadata !"0x102"}), !dbg !65
-  call void @llvm.dbg.value(metadata !58, i64 0, metadata !66, metadata !{metadata !"0x102"}), !dbg !67
+  call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !34, metadata !{!"0x102"}), !dbg !63
+  call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !64, metadata !{!"0x102"}), !dbg !65
+  call void @llvm.dbg.value(metadata !58, i64 0, metadata !66, metadata !{!"0x102"}), !dbg !67
   %arraydecay.i5 = getelementptr inbounds %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !68
   call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !68
   %tobool1 = icmp eq i32 %param1, 0, !dbg !69
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !63
+  call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !34, metadata !{!"0x102"}), !dbg !63
   br i1 %tobool1, label %if.else, label %if.then2, !dbg !69
 
 if.then2:                                         ; preds = %if.end
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !71, metadata !{metadata !"0x102"}), !dbg !73
-  call void @llvm.dbg.value(metadata !74, i64 0, metadata !75, metadata !{metadata !"0x102"}), !dbg !76
+  call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !71, metadata !{!"0x102"}), !dbg !73
+  call void @llvm.dbg.value(metadata !74, i64 0, metadata !75, metadata !{!"0x102"}), !dbg !76
   call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)), !dbg !76
   br label %if.end3, !dbg !72
 
 if.else:                                          ; preds = %if.end
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !77, metadata !{metadata !"0x102"}), !dbg !79
-  call void @llvm.dbg.value(metadata !80, i64 0, metadata !81, metadata !{metadata !"0x102"}), !dbg !82
+  call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !77, metadata !{!"0x102"}), !dbg !79
+  call void @llvm.dbg.value(metadata !80, i64 0, metadata !81, metadata !{!"0x102"}), !dbg !82
   call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)), !dbg !82
   br label %if.end3
 
 if.end3:                                          ; preds = %if.else, %if.then2
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33, metadata !{metadata !"0x102"}), !dbg !55
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !83, metadata !{metadata !"0x102"}), !dbg !85
-  call void @llvm.dbg.value(metadata !58, i64 0, metadata !86, metadata !{metadata !"0x102"}), !dbg !87
+  call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !33, metadata !{!"0x102"}), !dbg !55
+  call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !83, metadata !{!"0x102"}), !dbg !85
+  call void @llvm.dbg.value(metadata !58, i64 0, metadata !86, metadata !{!"0x102"}), !dbg !87
   call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !87
   ret void, !dbg !88
 }
@@ -113,92 +113,92 @@ attributes #2 = { nounwind readnone }
 !llvm.module.flags = !{!44, !45}
 !llvm.ident = !{!46}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !23, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"dbg-changes-codegen-branch-folding.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00AAA3\004\0032\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !11, metadata !17, metadata !18}
-!6 = metadata !{metadata !"0xd\00text\008\0032\008\000\000", metadata !1, metadata !"_ZTS4AAA3", metadata !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ]
-!7 = metadata !{metadata !"0x1\00\000\0032\008\000\000", null, null, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
-!8 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
-!11 = metadata !{metadata !"0x2e\00AAA3\00AAA3\00\005\000\000\000\006\00256\001\005", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [AAA3]
-!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!13 = metadata !{null, metadata !14, metadata !15}
-!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3]
-!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!16 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
-!17 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\000\000\006\00256\001\006", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 6] [operator=]
-!18 = metadata !{metadata !"0x2e\00operator const char *\00operator const char *\00_ZNK4AAA3cvPKcEv\007\000\000\000\006\00256\001\007", metadata !1, metadata !"_ZTS4AAA3", metadata !19, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [operator const char *]
-!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!20 = metadata !{metadata !15, metadata !21}
-!21 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
-!22 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3]
-!23 = metadata !{metadata !24, metadata !35, metadata !40}
-!24 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barii\0011\000\001\000\006\00256\001\0011", metadata !1, metadata !25, metadata !26, null, void (i32, i32)* @_Z3barii, null, null, metadata !29} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
-!25 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
-!26 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!27 = metadata !{null, metadata !28, metadata !28}
-!28 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!29 = metadata !{metadata !30, metadata !31, metadata !32, metadata !33, metadata !34}
-!30 = metadata !{metadata !"0x101\00param1\0016777227\000", metadata !24, metadata !25, metadata !28} ; [ DW_TAG_arg_variable ] [param1] [line 11]
-!31 = metadata !{metadata !"0x101\00param2\0033554443\000", metadata !24, metadata !25, metadata !28} ; [ DW_TAG_arg_variable ] [param2] [line 11]
-!32 = metadata !{metadata !"0x100\00temp\0012\000", metadata !24, metadata !25, metadata !15} ; [ DW_TAG_auto_variable ] [temp] [line 12]
-!33 = metadata !{metadata !"0x100\00var1\0017\000", metadata !24, metadata !25, metadata !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var1] [line 17]
-!34 = metadata !{metadata !"0x100\00var2\0018\000", metadata !24, metadata !25, metadata !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var2] [line 18]
-!35 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\001\000\006\00256\001\006", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, metadata !17, metadata !36} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=]
-!36 = metadata !{metadata !37, metadata !39}
-!37 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!38 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3]
-!39 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15} ; [ DW_TAG_arg_variable ] [value] [line 6]
-!40 = metadata !{metadata !"0x2e\00AAA3\00AAA3\00_ZN4AAA3C2EPKc\005\000\001\000\006\00256\001\005", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, metadata !11, metadata !41} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3]
-!41 = metadata !{metadata !42, metadata !43}
-!42 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!43 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15} ; [ DW_TAG_arg_variable ] [value] [line 5]
-!44 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!45 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!46 = metadata !{metadata !"clang version 3.5.0 "}
-!47 = metadata !{i32 11, i32 0, metadata !24, null}
-!48 = metadata !{i8* null}
-!49 = metadata !{i32 12, i32 0, metadata !24, null}
-!50 = metadata !{i32 14, i32 0, metadata !51, null}
-!51 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
-!52 = metadata !{i32 15, i32 0, metadata !53, null}
-!53 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !51} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
-!54 = metadata !{i32 16, i32 0, metadata !53, null}
-!55 = metadata !{i32 17, i32 0, metadata !24, null}
-!56 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38, metadata !55} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!57 = metadata !{i32 0, i32 0, metadata !40, metadata !55}
-!58 = metadata !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)}
-!59 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15, metadata !55} ; [ DW_TAG_arg_variable ] [value] [line 5]
-!60 = metadata !{i32 5, i32 0, metadata !40, metadata !55}
-!61 = metadata !{i32 5, i32 0, metadata !62, metadata !55}
-!62 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !40} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
-!63 = metadata !{i32 18, i32 0, metadata !24, null}
-!64 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38, metadata !63} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!65 = metadata !{i32 0, i32 0, metadata !40, metadata !63}
-!66 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15, metadata !63} ; [ DW_TAG_arg_variable ] [value] [line 5]
-!67 = metadata !{i32 5, i32 0, metadata !40, metadata !63}
-!68 = metadata !{i32 5, i32 0, metadata !62, metadata !63}
-!69 = metadata !{i32 20, i32 0, metadata !70, null}
-!70 = metadata !{metadata !"0xb\0020\000\000", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
-!71 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !72} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!72 = metadata !{i32 21, i32 0, metadata !70, null}
-!73 = metadata !{i32 0, i32 0, metadata !35, metadata !72}
-!74 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)}
-!75 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !72} ; [ DW_TAG_arg_variable ] [value] [line 6]
-!76 = metadata !{i32 6, i32 0, metadata !35, metadata !72}
-!77 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !78} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!78 = metadata !{i32 23, i32 0, metadata !70, null}
-!79 = metadata !{i32 0, i32 0, metadata !35, metadata !78}
-!80 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)}
-!81 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !78} ; [ DW_TAG_arg_variable ] [value] [line 6]
-!82 = metadata !{i32 6, i32 0, metadata !35, metadata !78}
-!83 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !84} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!84 = metadata !{i32 24, i32 0, metadata !24, null}
-!85 = metadata !{i32 0, i32 0, metadata !35, metadata !84}
-!86 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !84} ; [ DW_TAG_arg_variable ] [value] [line 6]
-!87 = metadata !{i32 6, i32 0, metadata !35, metadata !84}
-!88 = metadata !{i32 25, i32 0, metadata !24, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \001\00\000\00\001", !1, !2, !3, !23, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"dbg-changes-codegen-branch-folding.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00AAA3\004\0032\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ]
+!5 = !{!6, !11, !17, !18}
+!6 = !{!"0xd\00text\008\0032\008\000\000", !1, !"_ZTS4AAA3", !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ]
+!7 = !{!"0x1\00\000\0032\008\000\000", null, null, !8, !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
+!8 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!9 = !{!10}
+!10 = !{!"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
+!11 = !{!"0x2e\00AAA3\00AAA3\00\005\000\000\000\006\00256\001\005", !1, !"_ZTS4AAA3", !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [AAA3]
+!12 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = !{null, !14, !15}
+!14 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3]
+!15 = !{!"0xf\00\000\0064\0064\000\000", null, null, !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!16 = !{!"0x26\00\000\000\000\000\000", null, null, !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
+!17 = !{!"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\000\000\006\00256\001\006", !1, !"_ZTS4AAA3", !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 6] [operator=]
+!18 = !{!"0x2e\00operator const char *\00operator const char *\00_ZNK4AAA3cvPKcEv\007\000\000\000\006\00256\001\007", !1, !"_ZTS4AAA3", !19, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [operator const char *]
+!19 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = !{!15, !21}
+!21 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
+!22 = !{!"0x26\00\000\000\000\000\000", null, null, !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3]
+!23 = !{!24, !35, !40}
+!24 = !{!"0x2e\00bar\00bar\00_Z3barii\0011\000\001\000\006\00256\001\0011", !1, !25, !26, null, void (i32, i32)* @_Z3barii, null, null, !29} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
+!25 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!26 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = !{null, !28, !28}
+!28 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!29 = !{!30, !31, !32, !33, !34}
+!30 = !{!"0x101\00param1\0016777227\000", !24, !25, !28} ; [ DW_TAG_arg_variable ] [param1] [line 11]
+!31 = !{!"0x101\00param2\0033554443\000", !24, !25, !28} ; [ DW_TAG_arg_variable ] [param2] [line 11]
+!32 = !{!"0x100\00temp\0012\000", !24, !25, !15} ; [ DW_TAG_auto_variable ] [temp] [line 12]
+!33 = !{!"0x100\00var1\0017\000", !24, !25, !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var1] [line 17]
+!34 = !{!"0x100\00var2\0018\000", !24, !25, !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var2] [line 18]
+!35 = !{!"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\001\000\006\00256\001\006", !1, !"_ZTS4AAA3", !12, null, null, null, !17, !36} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=]
+!36 = !{!37, !39}
+!37 = !{!"0x101\00this\0016777216\001088", !35, null, !38} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!38 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3]
+!39 = !{!"0x101\00value\0033554438\000", !35, !25, !15} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!40 = !{!"0x2e\00AAA3\00AAA3\00_ZN4AAA3C2EPKc\005\000\001\000\006\00256\001\005", !1, !"_ZTS4AAA3", !12, null, null, null, !11, !41} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3]
+!41 = !{!42, !43}
+!42 = !{!"0x101\00this\0016777216\001088", !40, null, !38} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!43 = !{!"0x101\00value\0033554437\000", !40, !25, !15} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!44 = !{i32 2, !"Dwarf Version", i32 4}
+!45 = !{i32 2, !"Debug Info Version", i32 2}
+!46 = !{!"clang version 3.5.0 "}
+!47 = !MDLocation(line: 11, scope: !24)
+!48 = !{i8* null}
+!49 = !MDLocation(line: 12, scope: !24)
+!50 = !MDLocation(line: 14, scope: !51)
+!51 = !{!"0xb\0014\000\000", !1, !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!52 = !MDLocation(line: 15, scope: !53)
+!53 = !{!"0xb\0014\000\000", !1, !51} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!54 = !MDLocation(line: 16, scope: !53)
+!55 = !MDLocation(line: 17, scope: !24)
+!56 = !{!"0x101\00this\0016777216\001088", !40, null, !38, !55} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!57 = !MDLocation(line: 0, scope: !40, inlinedAt: !55)
+!58 = !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)}
+!59 = !{!"0x101\00value\0033554437\000", !40, !25, !15, !55} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!60 = !MDLocation(line: 5, scope: !40, inlinedAt: !55)
+!61 = !MDLocation(line: 5, scope: !62, inlinedAt: !55)
+!62 = !{!"0xb\005\000\000", !1, !40} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!63 = !MDLocation(line: 18, scope: !24)
+!64 = !{!"0x101\00this\0016777216\001088", !40, null, !38, !63} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!65 = !MDLocation(line: 0, scope: !40, inlinedAt: !63)
+!66 = !{!"0x101\00value\0033554437\000", !40, !25, !15, !63} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!67 = !MDLocation(line: 5, scope: !40, inlinedAt: !63)
+!68 = !MDLocation(line: 5, scope: !62, inlinedAt: !63)
+!69 = !MDLocation(line: 20, scope: !70)
+!70 = !{!"0xb\0020\000\000", !1, !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!71 = !{!"0x101\00this\0016777216\001088", !35, null, !38, !72} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!72 = !MDLocation(line: 21, scope: !70)
+!73 = !MDLocation(line: 0, scope: !35, inlinedAt: !72)
+!74 = !{i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)}
+!75 = !{!"0x101\00value\0033554438\000", !35, !25, !15, !72} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!76 = !MDLocation(line: 6, scope: !35, inlinedAt: !72)
+!77 = !{!"0x101\00this\0016777216\001088", !35, null, !38, !78} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!78 = !MDLocation(line: 23, scope: !70)
+!79 = !MDLocation(line: 0, scope: !35, inlinedAt: !78)
+!80 = !{i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)}
+!81 = !{!"0x101\00value\0033554438\000", !35, !25, !15, !78} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!82 = !MDLocation(line: 6, scope: !35, inlinedAt: !78)
+!83 = !{!"0x101\00this\0016777216\001088", !35, null, !38, !84} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!84 = !MDLocation(line: 24, scope: !24)
+!85 = !MDLocation(line: 0, scope: !35, inlinedAt: !84)
+!86 = !{!"0x101\00value\0033554438\000", !35, !25, !15, !84} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!87 = !MDLocation(line: 6, scope: !35, inlinedAt: !84)
+!88 = !MDLocation(line: 25, scope: !24)
diff --git a/test/CodeGen/X86/dbg-changes-codegen.ll b/test/CodeGen/X86/dbg-changes-codegen.ll
index aae95e8..2179667 100644
--- a/test/CodeGen/X86/dbg-changes-codegen.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen.ll
@@ -44,7 +44,7 @@
 define zeroext i1 @_ZN3Foo3batEv(%struct.Foo* %this) #0 align 2 {
 entry:
   %0 = load %struct.Foo** @pfoo, align 8
-  tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata %struct.Foo* %0, i64 0, metadata !62, metadata !{!"0x102"})
   %cmp.i = icmp eq %struct.Foo* %0, %this
   ret i1 %cmp.i
 }
@@ -53,7 +53,7 @@ entry:
 define void @_Z3bazv() #1 {
 entry:
   %0 = load %struct.Wibble** @wibble1, align 8
-  tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata %struct.Flibble* undef, i64 0, metadata !65, metadata !{!"0x102"})
   %1 = load %struct.Wibble** @wibble2, align 8
   %cmp.i = icmp ugt %struct.Wibble* %1, %0
   br i1 %cmp.i, label %if.then.i, label %_ZN7Flibble3barEP6Wibble.exit
@@ -76,8 +76,8 @@ attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 attributes #2 = { nounwind readnone }
 
 
-!17 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo]
-!45 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble]
-!62 = metadata !{metadata !"0x101\00arg\0033554436\000", null, null, metadata !17} ; [ DW_TAG_arg_variable ] [arg] [line 4]
-!64 = metadata !{%struct.Flibble* undef}
-!65 = metadata !{metadata !"0x101\00this\0016777229\001088", null, null, metadata !45} ; [ DW_TAG_arg_variable ] [this] [line 13]
+!17 = !{!"0x10\00\000\000\000\000\000", null, null, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo]
+!45 = !{!"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble]
+!62 = !{!"0x101\00arg\0033554436\000", null, null, !17} ; [ DW_TAG_arg_variable ] [arg] [line 4]
+!64 = !{%struct.Flibble* undef}
+!65 = !{!"0x101\00this\0016777229\001088", null, null, !45} ; [ DW_TAG_arg_variable ] [this] [line 13]
diff --git a/test/CodeGen/X86/dbg-combine.ll b/test/CodeGen/X86/dbg-combine.ll
new file mode 100644
index 0000000..f6b9565
--- /dev/null
+++ b/test/CodeGen/X86/dbg-combine.ll
@@ -0,0 +1,113 @@
+; RUN: llc -mtriple x86_64-pc-linux -O0 < %s | FileCheck %s
+
+; Make sure that the sequence of debug locations for function foo is correctly
+; generated. More specifically, .loc entries for lines 4,5,6,7 must appear in
+; the correct sequence.
+
+; $ clang -emit-llvm -S -g dbg-combine.c
+; 1.  int foo()
+; 2.  {
+; 3.     int elems = 3;
+; 4.     int array1[elems];
+; 5.     array1[0]=0;
+; 6.     array1[1]=1;
+; 7.     array1[2]=2;
+; 8.     int array2[elems];
+; 9.     array2[0]=1;
+; 10.    return array2[0];
+; 11. }
+
+; CHECK: .loc    1 4
+; CHECK: .loc    1 5
+; CHECK: .loc    1 6
+; CHECK: .loc    1 7
+
+; ModuleID = 'dbg-combine.c'
+; Function Attrs: nounwind uwtable
+define i32 @foo() #0 {
+entry:
+  %elems = alloca i32, align 4
+  %saved_stack = alloca i8*
+  %cleanup.dest.slot = alloca i32
+  call void @llvm.dbg.declare(metadata i32* %elems, metadata !12, metadata !13), !dbg !14
+  store i32 3, i32* %elems, align 4, !dbg !14
+  %0 = load i32* %elems, align 4, !dbg !15
+  %1 = zext i32 %0 to i64, !dbg !16
+  %2 = call i8* @llvm.stacksave(), !dbg !16
+  store i8* %2, i8** %saved_stack, !dbg !16
+  %vla = alloca i32, i64 %1, align 16, !dbg !16
+  call void @llvm.dbg.declare(metadata i32* %vla, metadata !17, metadata !21), !dbg !22
+  %arrayidx = getelementptr inbounds i32* %vla, i64 0, !dbg !23
+  store i32 0, i32* %arrayidx, align 4, !dbg !24
+  %arrayidx1 = getelementptr inbounds i32* %vla, i64 1, !dbg !25
+  store i32 1, i32* %arrayidx1, align 4, !dbg !26
+  %arrayidx2 = getelementptr inbounds i32* %vla, i64 2, !dbg !27
+  store i32 2, i32* %arrayidx2, align 4, !dbg !28
+  %3 = load i32* %elems, align 4, !dbg !29
+  %4 = zext i32 %3 to i64, !dbg !30
+  %vla3 = alloca i32, i64 %4, align 16, !dbg !30
+  call void @llvm.dbg.declare(metadata i32* %vla3, metadata !31, metadata !21), !dbg !32
+  %arrayidx4 = getelementptr inbounds i32* %vla3, i64 0, !dbg !33
+  store i32 1, i32* %arrayidx4, align 4, !dbg !34
+  %arrayidx5 = getelementptr inbounds i32* %vla3, i64 0, !dbg !35
+  %5 = load i32* %arrayidx5, align 4, !dbg !35
+  store i32 1, i32* %cleanup.dest.slot
+  %6 = load i8** %saved_stack, !dbg !36
+  call void @llvm.stackrestore(i8* %6), !dbg !36
+  ret i32 %5, !dbg !36
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind
+declare i8* @llvm.stacksave() #2
+
+; Function Attrs: nounwind
+declare void @llvm.stackrestore(i8*) #2
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = !{!"0x11\0012\00clang version 3.7.0 (trunk 227074)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/home/probinson/projects/scratch/dbg-combine.c] [DW_LANG_C99]
+!1 = !{!"dbg-combine.c", !"/home/probinson/projects/scratch"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\000\000\000\002", !1, !5, !6, null, i32 ()* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [foo]
+!5 = !{!"0x29", !1}                               ; [ DW_TAG_file_type ] [/home/probinson/projects/scratch/dbg-combine.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.7.0 (trunk 227074)"}
+!12 = !{!"0x100\00elems\003\000", !4, !5, !8}     ; [ DW_TAG_auto_variable ] [elems] [line 3]
+!13 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!14 = !MDLocation(line: 3, column: 8, scope: !4)
+!15 = !MDLocation(line: 4, column: 15, scope: !4)
+!16 = !MDLocation(line: 4, column: 4, scope: !4)
+!17 = !{!"0x100\00array1\004\000", !4, !5, !18}   ; [ DW_TAG_auto_variable ] [array1] [line 4]
+!18 = !{!"0x1\00\000\000\0032\000\000\000", null, null, !8, !19, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!19 = !{!20}
+!20 = !{!"0x21\000\00-1"}                         ; [ DW_TAG_subrange_type ] [unbounded]
+!21 = !{!"0x102\006"}                             ; [ DW_TAG_expression ] [DW_OP_deref]
+!22 = !MDLocation(line: 4, column: 8, scope: !4)
+!23 = !MDLocation(line: 5, column: 4, scope: !4)
+!24 = !MDLocation(line: 5, column: 13, scope: !4)
+!25 = !MDLocation(line: 6, column: 4, scope: !4)
+!26 = !MDLocation(line: 6, column: 13, scope: !4)
+!27 = !MDLocation(line: 7, column: 4, scope: !4)
+!28 = !MDLocation(line: 7, column: 13, scope: !4)
+!29 = !MDLocation(line: 8, column: 15, scope: !4)
+!30 = !MDLocation(line: 8, column: 4, scope: !4)
+!31 = !{!"0x100\00array2\008\000", !4, !5, !18}   ; [ DW_TAG_auto_variable ] [array2] [line 8]
+!32 = !MDLocation(line: 8, column: 8, scope: !4)
+!33 = !MDLocation(line: 9, column: 4, scope: !4)
+!34 = !MDLocation(line: 9, column: 13, scope: !4)
+!35 = !MDLocation(line: 10, column: 11, scope: !4)
+!36 = !MDLocation(line: 11, column: 1, scope: !4)
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
index c673f5d..629a557 100644
--- a/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -17,19 +17,16 @@ define dllexport void @f2() unnamed_addr {
 	ret void
 }
 
-; CHECK: .section .text,"xr",discard,lnk1
 ; CHECK: .globl lnk1
 define linkonce_odr dllexport void @lnk1() {
 	ret void
 }
 
-; CHECK: .section .text,"xr",discard,lnk2
 ; CHECK: .globl lnk2
 define linkonce_odr dllexport void @lnk2() alwaysinline {
 	ret void
 }
 
-; CHECK: .section .text,"xr",discard,weak1
 ; CHECK: .globl weak1
 define weak_odr dllexport void @weak1() {
 	ret void
@@ -40,18 +37,16 @@ define weak_odr dllexport void @weak1() {
 ; CHECK: .globl Var1
 @Var1 = dllexport global i32 1, align 4
 
-; CHECK: .rdata,"rd"
+; CHECK: .rdata,"dr"
 ; CHECK: .globl Var2
 @Var2 = dllexport unnamed_addr constant i32 1
 
 ; CHECK: .comm Var3
 @Var3 = common dllexport global i32 0, align 4
 
-; CHECK: .section .data,"wd",discard,WeakVar1
 ; CHECK: .globl WeakVar1
 @WeakVar1 = weak_odr dllexport global i32 1, align 4
 
-; CHECK: .section .rdata,"rd",discard,WeakVar2
 ; CHECK: .globl WeakVar2
 @WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
 
diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index 5035aa1..02a83ae 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll
@@ -21,6 +21,8 @@ define dllexport void @f2() unnamed_addr {
 	ret void
 }
 
+declare dllexport void @not_defined()
+
 ; CHECK: .globl _stdfun@0
 define dllexport x86_stdcallcc void @stdfun() nounwind {
 	ret void
@@ -36,19 +38,16 @@ define dllexport x86_thiscallcc void @thisfun() nounwind {
 	ret void
 }
 
-; CHECK: .section .text,"xr",discard,_lnk1
 ; CHECK: .globl _lnk1
 define linkonce_odr dllexport void @lnk1() {
 	ret void
 }
 
-; CHECK: .section .text,"xr",discard,_lnk2
 ; CHECK: .globl _lnk2
 define linkonce_odr dllexport void @lnk2() alwaysinline {
 	ret void
 }
 
-; CHECK: .section .text,"xr",discard,_weak1
 ; CHECK: .globl _weak1
 define weak_odr dllexport void @weak1() {
 	ret void
@@ -59,18 +58,16 @@ define weak_odr dllexport void @weak1() {
 ; CHECK: .globl _Var1
 @Var1 = dllexport global i32 1, align 4
 
-; CHECK: .rdata,"rd"
+; CHECK: .rdata,"dr"
 ; CHECK: .globl _Var2
 @Var2 = dllexport unnamed_addr constant i32 1
 
 ; CHECK: .comm _Var3
 @Var3 = common dllexport global i32 0, align 4
 
-; CHECK: .section .data,"wd",discard,_WeakVar1
 ; CHECK: .globl _WeakVar1
 @WeakVar1 = weak_odr dllexport global i32 1, align 4
 
-; CHECK: .section .rdata,"rd",discard,_WeakVar2
 ; CHECK: .globl _WeakVar2
 @WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
 
@@ -91,7 +88,6 @@ define weak_odr dllexport void @weak1() {
 ; CHECK: _weak_alias = _f1
 @weak_alias = weak_odr dllexport alias void()* @f1
 
-
 ; CHECK: .section .drectve
 ; CHECK-CL: " /EXPORT:_Var1,DATA"
 ; CHECK-CL: " /EXPORT:_Var2,DATA"
@@ -100,6 +96,7 @@ define weak_odr dllexport void @weak1() {
 ; CHECK-CL: " /EXPORT:_WeakVar2,DATA"
 ; CHECK-CL: " /EXPORT:_f1"
 ; CHECK-CL: " /EXPORT:_f2"
+; CHECK-CL-NOT: not_exported
 ; CHECK-CL: " /EXPORT:_stdfun@0"
 ; CHECK-CL: " /EXPORT:@fastfun@0"
 ; CHECK-CL: " /EXPORT:_thisfun"
@@ -117,6 +114,7 @@ define weak_odr dllexport void @weak1() {
 ; CHECK-GCC: " -export:WeakVar2,data"
 ; CHECK-GCC: " -export:f1"
 ; CHECK-GCC: " -export:f2"
+; CHECK-CL-NOT: not_exported
 ; CHECK-GCC: " -export:stdfun@0"
 ; CHECK-GCC: " -export:@fastfun@0"
 ; CHECK-GCC: " -export:thisfun"
diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll
index 872f7fa..77eba63 100644
--- a/test/CodeGen/X86/dwarf-comp-dir.ll
+++ b/test/CodeGen/X86/dwarf-comp-dir.ll
@@ -7,15 +7,15 @@ target triple = "x86_64-unknown-linux-gnu"
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!5}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 143523)\001\00\000\00\000", metadata !4, metadata !2, metadata !7, metadata !2, metadata !2, null} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{}
-!3 = metadata !{metadata !"0x29", metadata !4} ; [ DW_TAG_file_type ]
-!4 = metadata !{metadata !"empty.c", metadata !"/home/nlewycky"}
-!6 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !4, null, null, metadata !2, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
-!7 = metadata !{metadata !6}
+!0 = !{!"0x11\0012\00clang version 3.1 (trunk 143523)\001\00\000\00\000", !4, !2, !7, !2, !2, null} ; [ DW_TAG_compile_unit ]
+!2 = !{}
+!3 = !{!"0x29", !4} ; [ DW_TAG_file_type ]
+!4 = !{!"empty.c", !"/home/nlewycky"}
+!6 = !{!"0x13\00foo\001\008\008\000\000\000", !4, null, null, !2, null, null, !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!7 = !{!6}
 
 ; The important part of the following check is that dir = #0.
 ;                        Dir  Mod Time   File Len   File Name
 ;                        ---- ---------- ---------- ---------------------------
 ; CHECK: file_names[  1]    0 0x00000000 0x00000000 empty.c
-!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!5 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/dwarf-eh-prepare.ll b/test/CodeGen/X86/dwarf-eh-prepare.ll
new file mode 100644
index 0000000..a3a70da
--- /dev/null
+++ b/test/CodeGen/X86/dwarf-eh-prepare.ll
@@ -0,0 +1,51 @@
+; RUN: opt -mtriple=x86_64-linux-gnu -dwarfehprepare < %s -S | FileCheck %s
+
+; Check basic functionality of IR-to-IR DWARF EH preparation. This should
+; eliminate resumes. This pass requires a TargetMachine, so we put it under X86
+; and provide an x86 triple.
+
+@int_typeinfo = global i8 0
+
+declare void @might_throw()
+
+define i32 @simple_catch() {
+  invoke void @might_throw()
+          to label %cont unwind label %lpad
+
+; CHECK: define i32 @simple_catch()
+; CHECK: invoke void @might_throw()
+
+cont:
+  ret i32 0
+
+; CHECK: ret i32 0
+
+lpad:
+  %ehvals = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
+      catch i8* @int_typeinfo
+  %ehptr = extractvalue { i8*, i32 } %ehvals, 0
+  %ehsel = extractvalue { i8*, i32 } %ehvals, 1
+  %int_sel = call i32 @llvm.eh.typeid.for(i8* @int_typeinfo)
+  %int_match = icmp eq i32 %ehsel, %int_sel
+  br i1 %int_match, label %catch_int, label %eh.resume
+
+; CHECK: lpad:
+; CHECK: landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
+; CHECK: call i32 @llvm.eh.typeid.for
+; CHECK: br i1
+
+catch_int:
+  ret i32 1
+
+; CHECK: catch_int:
+; CHECK: ret i32 1
+
+eh.resume:
+  resume { i8*, i32 } %ehvals
+
+; CHECK: eh.resume:
+; CHECK: call void @_Unwind_Resume(i8* %{{.*}})
+}
+
+declare i32 @__gxx_personality_v0(...)
+declare i32 @llvm.eh.typeid.for(i8*)
diff --git a/test/CodeGen/X86/elf-comdat.ll b/test/CodeGen/X86/elf-comdat.ll
index c7e6df7..35d8d6f 100644
--- a/test/CodeGen/X86/elf-comdat.ll
+++ b/test/CodeGen/X86/elf-comdat.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
 
 $f = comdat any
-@v = global i32 0, comdat $f
-define void @f() comdat $f {
+@v = global i32 0, comdat($f)
+define void @f() comdat($f) {
   ret void
 }
 ; CHECK: .section        .text.f,"axG",@progbits,f,comdat
diff --git a/test/CodeGen/X86/elf-comdat2.ll b/test/CodeGen/X86/elf-comdat2.ll
index 209da39..786cec7 100644
--- a/test/CodeGen/X86/elf-comdat2.ll
+++ b/test/CodeGen/X86/elf-comdat2.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
 
 $foo = comdat any
-@bar = global i32 42, comdat $foo
+@bar = global i32 42, comdat($foo)
 @foo = global i32 42
 
 ; CHECK:      .type   bar,@object
diff --git a/test/CodeGen/X86/equiv_with_fndef.ll b/test/CodeGen/X86/equiv_with_fndef.ll
new file mode 100644
index 0000000..efbb8ab
--- /dev/null
+++ b/test/CodeGen/X86/equiv_with_fndef.ll
@@ -0,0 +1,10 @@
+; RUN: not llc < %s 2>&1 | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+module asm ".equiv pselect, __pselect"
+
+define void @pselect() {
+  ret void
+}
+; CHECK: 'pselect' is a protected alias
diff --git a/test/CodeGen/X86/equiv_with_vardef.ll b/test/CodeGen/X86/equiv_with_vardef.ll
new file mode 100644
index 0000000..29c19a1
--- /dev/null
+++ b/test/CodeGen/X86/equiv_with_vardef.ll
@@ -0,0 +1,8 @@
+; RUN: not llc < %s 2>&1 | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+module asm ".equiv var, __var"
+
+@var = global i32 0
+; CHECK: symbol 'var' is already defined
diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll
index 8647599..732f698 100644
--- a/test/CodeGen/X86/extractelement-load.ll
+++ b/test/CodeGen/X86/extractelement-load.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=yonah | FileCheck %s
 ; RUN: llc < %s -march=x86-64 -mattr=+sse2 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+avx -mcpu=btver2 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -29,16 +30,15 @@ undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
 ; This case could easily end up inf-looping in the DAG combiner due to an
 ; low alignment load of the vector which prevents us from reliably forming a
 ; narrow load.
-; FIXME: It would be nice to detect whether the target has fast and legal
-; unaligned loads and use them here.
+
+; The expected codegen is identical for the AVX case except
+; load/store instructions will have a leading 'v', so we don't
+; need to special-case the checks.
+
 define void @t3() {
 ; CHECK-LABEL: t3:
-;
-; This movs the entire vector, shuffling the high double down. If we fixed the
-; FIXME above it would just move the high double directly.
 ; CHECK: movupd
-; CHECK: shufpd
-; CHECK: movlpd
+; CHECK: movhpd
 
 bb:
   %tmp13 = load <2 x double>* undef, align 1
diff --git a/test/CodeGen/X86/f16c-intrinsics.ll b/test/CodeGen/X86/f16c-intrinsics.ll
index 514d929..802f917 100644
--- a/test/CodeGen/X86/f16c-intrinsics.ll
+++ b/test/CodeGen/X86/f16c-intrinsics.ll
@@ -2,6 +2,8 @@
 ; RUN: llc < %s -march=x86-64 -mattr=+avx,+f16c | FileCheck %s
 
 define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
+  ; CHECK-LABEL: test_x86_vcvtph2ps_128
+  ; CHECK-NOT: vmov
   ; CHECK: vcvtph2ps
   %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
@@ -10,14 +12,27 @@ declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
 
 
 define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) {
+  ; CHECK-LABEL: test_x86_vcvtph2ps_256
+  ; CHECK-NOT: vmov
   ; CHECK: vcvtph2ps
   %res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) ; <<8 x float>> [#uses=1]
   ret <8 x float> %res
 }
 declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
 
+define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind {
+entry:
+  ; CHECK-LABEL: test_x86_vcvtph2ps_256_m:
+  ; CHECK-NOT: vmov
+  ; CHECK: vcvtph2ps  (%
+  %tmp1 = load <8 x i16>* %a, align 16
+  %0 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %tmp1)
+  ret <8 x float> %0
+}
 
 define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) {
+  ; CHECK-LABEL: test_x86_vcvtps2ph_128
+  ; CHECK-NOT: vmov
   ; CHECK: vcvtps2ph
   %res = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1]
   ret <8 x i16> %res
@@ -26,6 +41,8 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
 
 
 define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) {
+  ; CHECK-LABEL: test_x86_vcvtps2ph_256
+  ; CHECK-NOT: vmov
   ; CHECK: vcvtps2ph
   %res = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1]
   ret <8 x i16> %res
diff --git a/test/CodeGen/X86/fast-isel-branch_weights.ll b/test/CodeGen/X86/fast-isel-branch_weights.ll
index bc41395..d2b02aa 100644
--- a/test/CodeGen/X86/fast-isel-branch_weights.ll
+++ b/test/CodeGen/X86/fast-isel-branch_weights.ll
@@ -16,4 +16,4 @@ success:
   ret i64 0
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
+!0 = !{!"branch_weights", i32 0, i32 2147483647}
diff --git a/test/CodeGen/X86/fast-isel-call-bool.ll b/test/CodeGen/X86/fast-isel-call-bool.ll
new file mode 100644
index 0000000..5cdb2c9
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-call-bool.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -fast-isel -mcpu=core2 -mtriple=x86_64-unknown-unknown -O1 | FileCheck %s
+; See PR21557
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @bar(i1)
+
+define i64 @foo(i8* %arg) {
+; CHECK-LABEL: foo:
+top:
+  %0 = load i8* %arg
+; CHECK: movb
+  %1 = trunc i8 %0 to i1
+; CHECK: andb $1,
+  %2 = call i64 @bar(i1 %1)
+; CHECK: callq
+  ret i64 %2
+}
diff --git a/test/CodeGen/X86/fast-isel-cmp-branch.ll b/test/CodeGen/X86/fast-isel-cmp-branch.ll
index 6e408f8..684647c 100644
--- a/test/CodeGen/X86/fast-isel-cmp-branch.ll
+++ b/test/CodeGen/X86/fast-isel-cmp-branch.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -mtriple=x86_64-linux -asm-verbose=false < %s | FileCheck %s
-; RUN: llc -O0 -mtriple=x86_64-win32 -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=x86_64-windows-itanium -asm-verbose=false < %s | FileCheck %s
 ; rdar://8337108
 
 ; Fast-isel shouldn't try to look through the compare because it's in a
diff --git a/test/CodeGen/X86/fast-isel-double-half-convertion.ll b/test/CodeGen/X86/fast-isel-double-half-convertion.ll
new file mode 100644
index 0000000..ade867b
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-double-half-convertion.ll
@@ -0,0 +1,23 @@
+; RUN: llc -fast-isel -fast-isel-abort -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s
+
+; XFAIL: *
+
+; In the future, we might want to teach fast-isel how to expand a double-to-half
+; conversion into a double-to-float conversion immediately followed by a
+; float-to-half conversion. For now, fast-isel is expected to fail.
+
+define double @test_fp16_to_fp64(i32 %a) {
+entry:
+  %0 = trunc i32 %a to i16
+  %1 = call double @llvm.convert.from.fp16.f64(i16 %0)
+  ret float %0
+}
+
+define i16 @test_fp64_to_fp16(double %a) {
+entry:
+  %0 = call i16 @llvm.convert.to.fp16.f64(double %a)
+  ret i16 %0
+}
+
+declare i16 @llvm.convert.to.fp16.f64(double)
+declare double @llvm.convert.from.fp16.f64(i16)
diff --git a/test/CodeGen/X86/fast-isel-float-half-convertion.ll b/test/CodeGen/X86/fast-isel-float-half-convertion.ll
new file mode 100644
index 0000000..ee89bcd
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-float-half-convertion.ll
@@ -0,0 +1,28 @@
+; RUN: llc -fast-isel -fast-isel-abort -asm-verbose=false -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s
+
+; Verify that fast-isel correctly expands float-half conversions.
+
+define i16 @test_fp32_to_fp16(float %a) {
+; CHECK-LABEL: test_fp32_to_fp16:
+; CHECK: vcvtps2ph $0, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: retq
+entry:
+  %0 = call i16 @llvm.convert.to.fp16.f32(float %a)
+  ret i16 %0
+}
+
+define float @test_fp16_to_fp32(i32 %a) {
+; CHECK-LABEL: test_fp16_to_fp32:
+; CHECK: movswl %di, %eax
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-NEXT: retq
+entry:
+  %0 = trunc i32 %a to i16
+  %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
+  ret float %1
+}
+
+declare i16 @llvm.convert.to.fp16.f32(float)
+declare float @llvm.convert.from.fp16.f32(i16)
diff --git a/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll b/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll
new file mode 100644
index 0000000..308a4c3
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=ALL --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+;
+; Verify that fast-isel doesn't select legacy SSE instructions on targets that
+; feature AVX.
+;
+; Test cases are obtained from the following code snippet:
+; ///
+; double single_to_double_rr(float x) {
+;   return (double)x;
+; }
+; float double_to_single_rr(double x) {
+;   return (float)x;
+; }
+; double single_to_double_rm(float *x) {
+;   return (double)*x;
+; }
+; float double_to_single_rm(double *x) {
+;   return (float)*x;
+; }
+; ///
+
+define double @single_to_double_rr(float %x) {
+; ALL-LABEL: single_to_double_rr:
+; SSE-NOT: vcvtss2sd
+; AVX: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL: ret
+entry:
+  %conv = fpext float %x to double
+  ret double %conv
+}
+
+define float @double_to_single_rr(double %x) {
+; ALL-LABEL: double_to_single_rr:
+; SSE-NOT: vcvtsd2ss
+; AVX: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; ALL: ret
+entry:
+  %conv = fptrunc double %x to float
+  ret float %conv
+}
+
+define double @single_to_double_rm(float* %x) {
+; ALL-LABEL: single_to_double_rm:
+; SSE: cvtss2sd (%rdi), %xmm0
+; AVX: vmovss (%rdi), %xmm0
+; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: ret
+entry:
+  %0 = load float* %x, align 4
+  %conv = fpext float %0 to double
+  ret double %conv
+}
+
+define float @double_to_single_rm(double* %x) {
+; ALL-LABEL: double_to_single_rm:
+; SSE: cvtsd2ss (%rdi), %xmm0
+; AVX: vmovsd (%rdi), %xmm0
+; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; ALL-NEXT: ret
+entry:
+  %0 = load double* %x, align 8
+  %conv = fptrunc double %0 to float
+  ret float %conv
+}
diff --git a/test/CodeGen/X86/fast-isel-gep.ll b/test/CodeGen/X86/fast-isel-gep.ll
index 4e47c74..a65e070 100644
--- a/test/CodeGen/X86/fast-isel-gep.ll
+++ b/test/CodeGen/X86/fast-isel-gep.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-linux -O0 | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-win32 -O0 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-windows-itanium -O0 | FileCheck %s --check-prefix=X64
 ; RUN: llc < %s -march=x86 -O0 | FileCheck %s --check-prefix=X32
 
 ; GEP indices are interpreted as signed integers, so they
diff --git a/test/CodeGen/X86/fast-isel-int-float-conversion.ll b/test/CodeGen/X86/fast-isel-int-float-conversion.ll
new file mode 100644
index 0000000..3869722
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-int-float-conversion.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+sse2 -O0 --fast-isel-abort < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+avx -O0 --fast-isel-abort < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+
+
+define double @int_to_double_rr(i32 %a) {
+; ALL-LABEL: int_to_double_rr:
+; SSE2: cvtsi2sdl %edi, %xmm0
+; AVX: vcvtsi2sdl %edi, %xmm0, %xmm0
+; ALL-NEXT: ret
+entry:
+  %0 = sitofp i32 %a to double
+  ret double %0
+}
+
+define double @int_to_double_rm(i32* %a) {
+; ALL-LABEL: int_to_double_rm:
+; SSE2: cvtsi2sdl (%rdi), %xmm0
+; AVX: vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; ALL-NEXT: ret
+entry:
+  %0 = load i32* %a
+  %1 = sitofp i32 %0 to double
+  ret double %1
+}
+
+define float @int_to_float_rr(i32 %a) {
+; ALL-LABEL: int_to_float_rr:
+; SSE2: cvtsi2ssl %edi, %xmm0
+; AVX: vcvtsi2ssl %edi, %xmm0, %xmm0
+; ALL-NEXT: ret
+entry:
+  %0 = sitofp i32 %a to float
+  ret float %0
+}
+
+define float @int_to_float_rm(i32* %a) {
+; ALL-LABEL: int_to_float_rm:
+; SSE2: cvtsi2ssl (%rdi), %xmm0
+; AVX: vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; ALL-NEXT: ret
+entry:
+  %0 = load i32* %a
+  %1 = sitofp i32 %0 to float
+  ret float %1
+}
diff --git a/test/CodeGen/X86/fastmath-float-half-conversion.ll b/test/CodeGen/X86/fastmath-float-half-conversion.ll
new file mode 100644
index 0000000..2930873
--- /dev/null
+++ b/test/CodeGen/X86/fastmath-float-half-conversion.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s --check-prefix=ALL --check-prefix=F16C
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+
+define zeroext i16 @test1_fast(double %d) #0 {
+; ALL-LABEL: test1_fast:
+; F16C-NOT: callq {{_+}}truncdfhf2
+; F16C: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; F16C-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; AVX: callq {{_+}}truncdfhf2
+; ALL: ret
+entry:
+  %0 = tail call i16 @llvm.convert.to.fp16.f64(double %d)
+  ret i16 %0
+}
+
+define zeroext i16 @test2_fast(x86_fp80 %d) #0 {
+; ALL-LABEL: test2_fast:
+; F16C-NOT: callq {{_+}}truncxfhf2
+; F16C: fldt
+; F16C-NEXT: fstps
+; F16C-NEXT: vmovss
+; F16C-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; AVX: callq {{_+}}truncxfhf2
+; ALL: ret
+entry:
+  %0 = tail call i16 @llvm.convert.to.fp16.f80(x86_fp80 %d)
+  ret i16 %0
+}
+
+define zeroext i16 @test1(double %d) #1 {
+; ALL-LABEL: test1:
+; ALL: callq  {{_+}}truncdfhf2
+; ALL: ret
+entry:
+  %0 = tail call i16 @llvm.convert.to.fp16.f64(double %d)
+  ret i16 %0
+}
+
+define zeroext i16 @test2(x86_fp80 %d) #1 {
+; ALL-LABEL: test2:
+; ALL: callq  {{_+}}truncxfhf2
+; ALL: ret
+entry:
+  %0 = tail call i16 @llvm.convert.to.fp16.f80(x86_fp80 %d)
+  ret i16 %0
+}
+
+declare i16 @llvm.convert.to.fp16.f64(double)
+declare i16 @llvm.convert.to.fp16.f80(x86_fp80)
+
+attributes #0 = { nounwind readnone uwtable "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone uwtable "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/float-conv-elim.ll b/test/CodeGen/X86/float-conv-elim.ll
new file mode 100644
index 0000000..3feff85
--- /dev/null
+++ b/test/CodeGen/X86/float-conv-elim.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=x86-64 -mcpu=x86-64 < %s | FileCheck %s
+
+; Make sure the float conversion is folded away as it should be.
+; CHECK-LABEL: foo
+; CHECK-NOT: cvt
+; CHECK: movzbl
+define i32 @foo(i8 %a) #0 {
+  %conv = uitofp i8 %a to float
+  %conv1 = fptosi float %conv to i32
+  ret i32 %conv1
+}
+
+; CHECK-LABEL: foo2
+; CHECK-NOT: cvt
+; CHECK: movsbl
+define i32 @foo2(i8 %a) #0 {
+  %conv = sitofp i8 %a to float
+  %conv1 = fptosi float %conv to i32
+  ret i32 %conv1
+}
+
+; CHECK-LABEL: bar
+; CHECK-NOT: cvt
+; CHECK: movl
+define zeroext i8 @bar(i8 zeroext %a) #0 {
+  %conv = uitofp i8 %a to float
+  %conv1 = fptoui float %conv to i8
+  ret i8 %conv1
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
diff --git a/test/CodeGen/X86/fold-load-unops.ll b/test/CodeGen/X86/fold-load-unops.ll
new file mode 100644
index 0000000..0b2e6c7
--- /dev/null
+++ b/test/CodeGen/X86/fold-load-unops.ll
@@ -0,0 +1,57 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s
+
+; Verify that we're folding the load into the math instruction.
+
+; FIXME: The folding should also happen without the avx attribute; 
+; ie, when generating SSE (non-VEX-prefixed) instructions.
+
+define float @rcpss(float* %a) {
+; CHECK-LABEL: rcpss:
+; CHECK:       vrcpss (%rdi), %xmm0, %xmm0
+
+    %ld = load float* %a
+    %ins = insertelement <4 x float> undef, float %ld, i32 0
+    %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
+    %ext = extractelement <4 x float> %res, i32 0
+    ret float %ext
+}
+
+define float @rsqrtss(float* %a) {
+; CHECK-LABEL: rsqrtss:
+; CHECK:       vrsqrtss (%rdi), %xmm0, %xmm0
+
+    %ld = load float* %a
+    %ins = insertelement <4 x float> undef, float %ld, i32 0
+    %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
+    %ext = extractelement <4 x float> %res, i32 0
+    ret float %ext
+}
+
+define float @sqrtss(float* %a) {
+; CHECK-LABEL: sqrtss:
+; CHECK:       vsqrtss (%rdi), %xmm0, %xmm0
+
+    %ld = load float* %a
+    %ins = insertelement <4 x float> undef, float %ld, i32 0
+    %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
+    %ext = extractelement <4 x float> %res, i32 0
+    ret float %ext
+}
+
+define double @sqrtsd(double* %a) {
+; CHECK-LABEL: sqrtsd:
+; CHECK:       vsqrtsd (%rdi), %xmm0, %xmm0
+
+    %ld = load double* %a
+    %ins = insertelement <2 x double> undef, double %ld, i32 0
+    %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
+    %ext = extractelement <2 x double> %res, i32 0
+    ret double %ext
+}
+
+
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll
index a643d86..5bf5dbd 100644
--- a/test/CodeGen/X86/fold-tied-op.ll
+++ b/test/CodeGen/X86/fold-tied-op.ll
@@ -1,84 +1,84 @@
-; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s
-; Regression test for http://reviews.llvm.org/D5701
-
-; ModuleID = 'xxhash.i'
-target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-target triple = "i386--netbsd"
-
-; CHECK-LABEL: fn1
-; CHECK:       shldl {{.*#+}} 4-byte Folded Spill
-; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
-; CHECK:       shldl {{.*#+}} 4-byte Folded Spill
-; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
-; CHECK:       addl  {{.*#+}} 4-byte Folded Reload
-; CHECK:       imull {{.*#+}} 4-byte Folded Reload
-; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
-; CHECK:       retl
-
-%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
-
-@a = common global i32 0, align 4
-@b = common global i64 0, align 8
-
-; Function Attrs: nounwind uwtable
-define i64 @fn1() #0 {
-entry:
-  %0 = load i32* @a, align 4, !tbaa !1
-  %1 = inttoptr i32 %0 to %struct.XXH_state64_t*
-  %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0
-  %2 = load i32* %total_len, align 4, !tbaa !5
-  %tobool = icmp eq i32 %2, 0
-  br i1 %tobool, label %if.else, label %if.then
-
-if.then:                                          ; preds = %entry
-  %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3
-  %3 = load i64* %v3, align 4, !tbaa !8
-  %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4
-  %4 = load i64* %v4, align 4, !tbaa !9
-  %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2
-  %5 = load i64* %v2, align 4, !tbaa !10
-  %shl = shl i64 %5, 1
-  %or = or i64 %shl, %5
-  %shl2 = shl i64 %3, 2
-  %shr = lshr i64 %3, 1
-  %or3 = or i64 %shl2, %shr
-  %add = add i64 %or, %or3
-  %mul = mul i64 %4, -4417276706812531889
-  %shl4 = mul i64 %4, -8834553413625063778
-  %shr5 = ashr i64 %mul, 3
-  %or6 = or i64 %shr5, %shl4
-  %mul7 = mul nsw i64 %or6, 1400714785074694791
-  %xor = xor i64 %add, %mul7
-  store i64 %xor, i64* @b, align 8, !tbaa !11
-  %mul8 = mul nsw i64 %xor, 1400714785074694791
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %6 = load i64* @b, align 8, !tbaa !11
-  %xor10 = xor i64 %6, -4417276706812531889
-  %mul11 = mul nsw i64 %xor10, 400714785074694791
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ]
-  %storemerge = add i64 %storemerge.in, -8796714831421723037
-  store i64 %storemerge, i64* @b, align 8, !tbaa !11
-  ret i64 undef
-}
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.ident = !{!0}
-
-!0 = metadata !{metadata !"clang version 3.6 (trunk 219587)"}
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"int", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
-!5 = metadata !{metadata !6, metadata !2, i64 0}
-!6 = metadata !{metadata !"XXH_state64_t", metadata !2, i64 0, metadata !2, i64 4, metadata !7, i64 8, metadata !7, i64 16, metadata !7, i64 24}
-!7 = metadata !{metadata !"long long", metadata !3, i64 0}
-!8 = metadata !{metadata !6, metadata !7, i64 16}
-!9 = metadata !{metadata !6, metadata !7, i64 24}
-!10 = metadata !{metadata !6, metadata !7, i64 8}
-!11 = metadata !{metadata !7, metadata !7, i64 0}
+; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s
+; Regression test for http://reviews.llvm.org/D5701
+
+; ModuleID = 'xxhash.i'
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386--netbsd"
+
+; CHECK-LABEL: fn1
+; CHECK:       shldl {{.*#+}} 4-byte Folded Spill
+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
+; CHECK:       shldl {{.*#+}} 4-byte Folded Spill
+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
+; CHECK:       addl  {{.*#+}} 4-byte Folded Reload
+; CHECK:       imull {{.*#+}} 4-byte Folded Reload
+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
+; CHECK:       retl
+
+%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
+
+@a = common global i32 0, align 4
+@b = common global i64 0, align 8
+
+; Function Attrs: nounwind uwtable
+define i64 @fn1() #0 {
+entry:
+  %0 = load i32* @a, align 4, !tbaa !1
+  %1 = inttoptr i32 %0 to %struct.XXH_state64_t*
+  %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0
+  %2 = load i32* %total_len, align 4, !tbaa !5
+  %tobool = icmp eq i32 %2, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3
+  %3 = load i64* %v3, align 4, !tbaa !8
+  %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4
+  %4 = load i64* %v4, align 4, !tbaa !9
+  %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2
+  %5 = load i64* %v2, align 4, !tbaa !10
+  %shl = shl i64 %5, 1
+  %or = or i64 %shl, %5
+  %shl2 = shl i64 %3, 2
+  %shr = lshr i64 %3, 1
+  %or3 = or i64 %shl2, %shr
+  %add = add i64 %or, %or3
+  %mul = mul i64 %4, -4417276706812531889
+  %shl4 = mul i64 %4, -8834553413625063778
+  %shr5 = ashr i64 %mul, 3
+  %or6 = or i64 %shr5, %shl4
+  %mul7 = mul nsw i64 %or6, 1400714785074694791
+  %xor = xor i64 %add, %mul7
+  store i64 %xor, i64* @b, align 8, !tbaa !11
+  %mul8 = mul nsw i64 %xor, 1400714785074694791
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %6 = load i64* @b, align 8, !tbaa !11
+  %xor10 = xor i64 %6, -4417276706812531889
+  %mul11 = mul nsw i64 %xor10, 400714785074694791
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ]
+  %storemerge = add i64 %storemerge.in, -8796714831421723037
+  store i64 %storemerge, i64* @b, align 8, !tbaa !11
+  ret i64 undef
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.6 (trunk 219587)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !2, i64 0}
+!6 = !{!"XXH_state64_t", !2, i64 0, !2, i64 4, !7, i64 8, !7, i64 16, !7, i64 24}
+!7 = !{!"long long", !3, i64 0}
+!8 = !{!6, !7, i64 16}
+!9 = !{!6, !7, i64 24}
+!10 = !{!6, !7, i64 8}
+!11 = !{!7, !7, i64 0}
diff --git a/test/CodeGen/X86/fold-vex.ll b/test/CodeGen/X86/fold-vex.ll
index 2bb5b44..5a8b1d8 100644
--- a/test/CodeGen/X86/fold-vex.ll
+++ b/test/CodeGen/X86/fold-vex.ll
@@ -1,16 +1,31 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s
+; Use CPU parameters to ensure that a CPU-specific attribute is not overriding the AVX definition.
 
-;CHECK: @test
-; No need to load from memory. The operand will be loaded as part of th AND instr.
-;CHECK-NOT: vmovaps
-;CHECK: vandps
-;CHECK: ret
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown                  -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx             | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2                 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown                  -mattr=-avx | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=-avx | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2     -mattr=-avx | FileCheck %s --check-prefix=SSE
 
-define void @test1(<8 x i32>* %p0, <8 x i32> %in1) nounwind {
-entry:
-  %in0 = load <8 x i32>* %p0, align 2
-  %a = and <8 x i32> %in0, %in1
-  store <8 x i32> %a, <8 x i32>* undef
-  ret void
+; No need to load unaligned operand from memory using an explicit instruction with AVX.
+; The operand should be folded into the AND instr.
+
+; With SSE, folding memory operands into math/logic ops requires 16-byte alignment
+; unless specially configured on some CPUs such as AMD Family 10H.
+
+define <4 x i32> @test1(<4 x i32>* %p0, <4 x i32> %in1) nounwind {
+  %in0 = load <4 x i32>* %p0, align 2
+  %a = and <4 x i32> %in0, %in1
+  ret <4 x i32> %a
+
+; CHECK-LABEL: @test1
+; CHECK-NOT:   vmovups
+; CHECK:       vandps (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:  ret
+
+; SSE-LABEL: @test1
+; SSE:       movups (%rdi), %xmm1
+; SSE-NEXT:  andps %xmm1, %xmm0
+; SSE-NEXT:  ret
 }
 
diff --git a/test/CodeGen/X86/force-align-stack-alloca.ll b/test/CodeGen/X86/force-align-stack-alloca.ll
index 95defc8..bd98069 100644
--- a/test/CodeGen/X86/force-align-stack-alloca.ll
+++ b/test/CodeGen/X86/force-align-stack-alloca.ll
@@ -33,14 +33,14 @@ define i64 @g(i32 %i) nounwind {
 ; CHECK-NOT:         {{[^ ,]*}}, %esp
 ;
 ; Next we set up the memset call, and then undo it.
-; CHECK:      subl   $32, %esp
+; CHECK:      subl   $20, %esp
 ; CHECK-NOT:         {{[^ ,]*}}, %esp
 ; CHECK:      calll  memset
 ; CHECK-NEXT: addl   $32, %esp
 ; CHECK-NOT:         {{[^ ,]*}}, %esp
 ;
 ; Next we set up the call to 'f'.
-; CHECK:      subl   $32, %esp
+; CHECK:      subl   $28, %esp
 ; CHECK-NOT:         {{[^ ,]*}}, %esp
 ; CHECK:      calll  f
 ; CHECK-NEXT: addl   $32, %esp
diff --git a/test/CodeGen/X86/fp-double-rounding.ll b/test/CodeGen/X86/fp-double-rounding.ll
new file mode 100644
index 0000000..030cb9a
--- /dev/null
+++ b/test/CodeGen/X86/fp-double-rounding.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE
+; RUN: llc < %s -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64--"
+
+; CHECK-LABEL: double_rounding:
+; SAFE: callq __trunctfdf2
+; SAFE-NEXT: cvtsd2ss %xmm0
+; UNSAFE: callq __trunctfsf2
+; UNSAFE-NOT: cvt
+define void @double_rounding(fp128* %x, float* %f) {
+entry:
+  %0 = load fp128* %x, align 16
+  %1 = fptrunc fp128 %0 to double
+  %2 = fptrunc double %1 to float
+  store float %2, float* %f, align 4
+  ret void
+}
+
+; CHECK-LABEL: double_rounding_precise_first:
+; CHECK: fstps (%
+; CHECK-NOT: fstpl
+define void @double_rounding_precise_first(float* %f) {
+entry:
+  ; Hack, to generate a precise FP_ROUND to double
+  %precise = call double asm sideeffect "fld %st(0)", "={st(0)}"()
+  %0 = fptrunc double %precise to float
+  store float %0, float* %f, align 4
+  ret void
+}
diff --git a/test/CodeGen/X86/fpstack-debuginstr-kill.ll b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
index dfc59a3..e3180f4 100644
--- a/test/CodeGen/X86/fpstack-debuginstr-kill.ll
+++ b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
@@ -32,7 +32,7 @@ sw.bb735:                                         ; preds = %if.end511
   unreachable
 
 if.end41.i2210:                                   ; preds = %if.end511
-  call void @llvm.dbg.value(metadata !{x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280}, i64 0, metadata !20, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.value(metadata x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280, i64 0, metadata !20, metadata !{!"0x102"})
   unreachable
 
 sw.bb992:                                         ; preds = %if.end511
@@ -43,29 +43,29 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!24, !25}
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 886f0101a7d176543b831f5efb74c03427244a55)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !21, metadata !2} ; [ DW_TAG_compile_unit ] [x87stackifier/fpu_ieee.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"fpu_ieee.cpp", metadata !"x87stackifier"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00fpuop_arithmetic\00fpuop_arithmetic\00_Z16fpuop_arithmeticjj\0011\000\001\000\006\00256\001\0013", metadata !5, metadata !6, metadata !7, null, void (i32, i32)* @_Z16fpuop_arithmeticjj, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 11] [def] [scope 13] [fpuop_arithmetic]
-!5 = metadata !{metadata !"f1.cpp", metadata !"x87stackifier"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [x87stackifier/f1.cpp]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9, metadata !9}
-!9 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
-!10 = metadata !{metadata !11, metadata !12, metadata !13, metadata !18, metadata !20}
-!11 = metadata !{metadata !"0x101\00\0016777227\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 11]
-!12 = metadata !{metadata !"0x101\00\0033554443\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 11]
-!13 = metadata !{metadata !"0x100\00x\0014\000", metadata !4, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [x] [line 14]
-!14 = metadata !{metadata !"0x16\00fpu_extended\003\000\000\000\000", metadata !5, null, metadata !15} ; [ DW_TAG_typedef ] [fpu_extended] [line 3, size 0, align 0, offset 0] [from fpu_register]
-!15 = metadata !{metadata !"0x16\00fpu_register\002\000\000\000\000", metadata !5, null, metadata !16} ; [ DW_TAG_typedef ] [fpu_register] [line 2, size 0, align 0, offset 0] [from uae_f64]
-!16 = metadata !{metadata !"0x16\00uae_f64\001\000\000\000\000", metadata !5, null, metadata !17} ; [ DW_TAG_typedef ] [uae_f64] [line 1, size 0, align 0, offset 0] [from double]
-!17 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!18 = metadata !{metadata !"0x100\00a\0015\000", metadata !4, metadata !6, metadata !19} ; [ DW_TAG_auto_variable ] [a] [line 15]
-!19 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!20 = metadata !{metadata !"0x100\00value\0016\000", metadata !4, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [value] [line 16]
-!21 = metadata !{metadata !22, metadata !23}
-!22 = metadata !{metadata !"0x34\00g1\00g1\00\005\000\001", null, metadata !6, metadata !14, double* @g1, null} ; [ DW_TAG_variable ] [g1] [line 5] [def]
-!23 = metadata !{metadata !"0x34\00g2\00g2\00\006\000\001", null, metadata !6, metadata !19, i32* @g2, null} ; [ DW_TAG_variable ] [g2] [line 6] [def]
-!24 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!25 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 886f0101a7d176543b831f5efb74c03427244a55)\001\00\000\00\001", !1, !2, !2, !3, !21, !2} ; [ DW_TAG_compile_unit ] [x87stackifier/fpu_ieee.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"fpu_ieee.cpp", !"x87stackifier"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00fpuop_arithmetic\00fpuop_arithmetic\00_Z16fpuop_arithmeticjj\0011\000\001\000\006\00256\001\0013", !5, !6, !7, null, void (i32, i32)* @_Z16fpuop_arithmeticjj, null, null, !10} ; [ DW_TAG_subprogram ] [line 11] [def] [scope 13] [fpuop_arithmetic]
+!5 = !{!"f1.cpp", !"x87stackifier"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [x87stackifier/f1.cpp]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9, !9}
+!9 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!10 = !{!11, !12, !13, !18, !20}
+!11 = !{!"0x101\00\0016777227\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [line 11]
+!12 = !{!"0x101\00\0033554443\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [line 11]
+!13 = !{!"0x100\00x\0014\000", !4, !6, !14} ; [ DW_TAG_auto_variable ] [x] [line 14]
+!14 = !{!"0x16\00fpu_extended\003\000\000\000\000", !5, null, !15} ; [ DW_TAG_typedef ] [fpu_extended] [line 3, size 0, align 0, offset 0] [from fpu_register]
+!15 = !{!"0x16\00fpu_register\002\000\000\000\000", !5, null, !16} ; [ DW_TAG_typedef ] [fpu_register] [line 2, size 0, align 0, offset 0] [from uae_f64]
+!16 = !{!"0x16\00uae_f64\001\000\000\000\000", !5, null, !17} ; [ DW_TAG_typedef ] [uae_f64] [line 1, size 0, align 0, offset 0] [from double]
+!17 = !{!"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!18 = !{!"0x100\00a\0015\000", !4, !6, !19} ; [ DW_TAG_auto_variable ] [a] [line 15]
+!19 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!20 = !{!"0x100\00value\0016\000", !4, !6, !14} ; [ DW_TAG_auto_variable ] [value] [line 16]
+!21 = !{!22, !23}
+!22 = !{!"0x34\00g1\00g1\00\005\000\001", null, !6, !14, double* @g1, null} ; [ DW_TAG_variable ] [g1] [line 5] [def]
+!23 = !{!"0x34\00g2\00g2\00\006\000\001", null, !6, !19, i32* @g2, null} ; [ DW_TAG_variable ] [g2] [line 6] [def]
+!24 = !{i32 2, !"Dwarf Version", i32 2}
+!25 = !{i32 2, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/frameaddr.ll b/test/CodeGen/X86/frameaddr.ll
index 452c8e5..5646196 100644
--- a/test/CodeGen/X86/frameaddr.ll
+++ b/test/CodeGen/X86/frameaddr.ll
@@ -1,9 +1,12 @@
 ; RUN: llc < %s -march=x86                                | FileCheck %s --check-prefix=CHECK-32
 ; RUN: llc < %s -march=x86    -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-32
-; RUN: llc < %s -march=x86-64                             | FileCheck %s --check-prefix=CHECK-64
-; RUN: llc < %s -march=x86-64 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -fast-isel | FileCheck %s --check-prefix=CHECK-W64
+; RUN: llc < %s -mtriple=x86_64-unknown                             | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc < %s -mtriple=x86_64-unknown -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-64
 ; RUN: llc < %s -mtriple=x86_64-gnux32                    | FileCheck %s --check-prefix=CHECK-X32ABI
 ; RUN: llc < %s -mtriple=x86_64-gnux32 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-X32ABI
+; RUN: llc < %s -mtriple=x86_64-nacl                    | FileCheck %s --check-prefix=CHECK-NACL64
+; RUN: llc < %s -mtriple=x86_64-nacl -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-NACL64
 
 define i8* @test1() nounwind {
 entry:
@@ -13,6 +16,12 @@ entry:
 ; CHECK-32-NEXT:  movl %ebp, %eax
 ; CHECK-32-NEXT:  pop
 ; CHECK-32-NEXT:  ret
+; CHECK-W64-LABEL: test1
+; CHECK-W64:       push
+; CHECK-W64-NEXT:  movq %rsp, %rbp
+; CHECK-W64-NEXT:  leaq (%rbp), %rax
+; CHECK-W64-NEXT:  pop
+; CHECK-W64-NEXT:  ret
 ; CHECK-64-LABEL: test1
 ; CHECK-64:       push
 ; CHECK-64-NEXT:  movq %rsp, %rbp
@@ -25,6 +34,10 @@ entry:
 ; CHECK-X32ABI-NEXT:  movl %ebp, %eax
 ; CHECK-X32ABI-NEXT:  popq %rbp
 ; CHECK-X32ABI-NEXT:  ret
+; CHECK-NACL64-LABEL: test1
+; CHECK-NACL64:       pushq %rbp
+; CHECK-NACL64-NEXT:  movq %rsp, %rbp
+; CHECK-NACL64-NEXT:  movl %ebp, %eax
   %0 = tail call i8* @llvm.frameaddress(i32 0)
   ret i8* %0
 }
@@ -38,6 +51,12 @@ entry:
 ; CHECK-32-NEXT:  movl (%eax), %eax
 ; CHECK-32-NEXT:  pop
 ; CHECK-32-NEXT:  ret
+; CHECK-W64-LABEL: test2
+; CHECK-W64:       push
+; CHECK-W64-NEXT:  movq %rsp, %rbp
+; CHECK-W64-NEXT:  leaq (%rbp), %rax
+; CHECK-W64-NEXT:  pop
+; CHECK-W64-NEXT:  ret
 ; CHECK-64-LABEL: test2
 ; CHECK-64:       push
 ; CHECK-64-NEXT:  movq %rsp, %rbp
@@ -52,6 +71,11 @@ entry:
 ; CHECK-X32ABI-NEXT:  movl (%eax), %eax
 ; CHECK-X32ABI-NEXT:  popq %rbp
 ; CHECK-X32ABI-NEXT:  ret
+; CHECK-NACL64-LABEL: test2
+; CHECK-NACL64:       pushq %rbp
+; CHECK-NACL64-NEXT:  movq %rsp, %rbp
+; CHECK-NACL64-NEXT:  movl (%ebp), %eax
+; CHECK-NACL64-NEXT:  movl (%eax), %eax
   %0 = tail call i8* @llvm.frameaddress(i32 2)
   ret i8* %0
 }
diff --git a/test/CodeGen/X86/frameallocate.ll b/test/CodeGen/X86/frameallocate.ll
new file mode 100644
index 0000000..7a2f9e3
--- /dev/null
+++ b/test/CodeGen/X86/frameallocate.ll
@@ -0,0 +1,43 @@
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s
+
+declare i8* @llvm.frameallocate(i32)
+declare i8* @llvm.frameaddress(i32)
+declare i8* @llvm.framerecover(i8*, i8*)
+declare i32 @printf(i8*, ...)
+
+@str = internal constant [10 x i8] c"asdf: %d\0A\00"
+
+define void @print_framealloc_from_fp(i8* %fp) {
+  %alloc = call i8* @llvm.framerecover(i8* bitcast (void(i32*, i32*)* @alloc_func to i8*), i8* %fp)
+  %alloc_i32 = bitcast i8* %alloc to i32*
+  %r = load i32* %alloc_i32
+  call i32 (i8*, ...)* @printf(i8* getelementptr ([10 x i8]* @str, i32 0, i32 0), i32 %r)
+  ret void
+}
+
+; CHECK-LABEL: print_framealloc_from_fp:
+; CHECK: movabsq $.Lframeallocation_alloc_func, %[[offs:[a-z]+]]
+; CHECK: movl (%rcx,%[[offs]]), %edx
+; CHECK: leaq {{.*}}(%rip), %rcx
+; CHECK: callq printf
+; CHECK: retq
+
+define void @alloc_func(i32* %s, i32* %d) {
+  %alloc = call i8* @llvm.frameallocate(i32 16)
+  %alloc_i32 = bitcast i8* %alloc to i32*
+  store i32 42, i32* %alloc_i32
+  %fp = call i8* @llvm.frameaddress(i32 0)
+  call void @print_framealloc_from_fp(i8* %fp)
+  ret void
+}
+
+; CHECK-LABEL: alloc_func:
+; CHECK: subq    $48, %rsp
+; CHECK: .seh_stackalloc 48
+; CHECK: leaq    48(%rsp), %rbp
+; CHECK: .seh_setframe 5, 48
+; CHECK: .Lframeallocation_alloc_func = -[[offs:[0-9]+]]
+; CHECK: movl $42, -[[offs]](%rbp)
+; CHECK: leaq    -48(%rbp), %rcx
+; CHECK: callq print_framealloc_from_fp
+; CHECK: retq
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index 5f48b1e..6d397b2 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -1,35 +1,38 @@
 ; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN
 ; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=WIN
+; RUN: llc -mtriple=i686-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN32
 ; rdar://7398554
 
 ; When doing vector gather-scatter index calculation with 32-bit indices,
-; bounce the vector off of cache rather than shuffling each individual
+; use an efficient mov/shift sequence rather than shuffling each individual
 ; element out of the index vector.
 
-; CHECK: foo:
-; LIN: movaps	(%rsi), %xmm0
-; LIN: andps	(%rdx), %xmm0
-; LIN: movaps	%xmm0, -24(%rsp)
-; LIN: movslq	-24(%rsp), %[[REG1:r.+]]
-; LIN: movslq	-20(%rsp), %[[REG2:r.+]]
-; LIN: movslq	-16(%rsp), %[[REG3:r.+]]
-; LIN: movslq	-12(%rsp), %[[REG4:r.+]]
-; LIN: movsd	(%rdi,%[[REG1]],8), %xmm0
-; LIN: movhpd	(%rdi,%[[REG2]],8), %xmm0
-; LIN: movsd	(%rdi,%[[REG3]],8), %xmm1
-; LIN: movhpd	(%rdi,%[[REG4]],8), %xmm1
+; CHECK-LABEL: foo:
+; LIN: movdqa	(%rsi), %xmm0
+; LIN: pand 	(%rdx), %xmm0
+; LIN: pextrq	$1, %xmm0, %r[[REG4:.+]]
+; LIN: movd 	%xmm0, %r[[REG2:.+]]
+; LIN: movslq	%e[[REG2]], %r[[REG1:.+]]
+; LIN: sarq    $32, %r[[REG2]]
+; LIN: movslq	%e[[REG4]], %r[[REG3:.+]]
+; LIN: sarq    $32, %r[[REG4]]
+; LIN: movsd	(%rdi,%r[[REG1]],8), %xmm0
+; LIN: movhpd	(%rdi,%r[[REG2]],8), %xmm0
+; LIN: movsd	(%rdi,%r[[REG3]],8), %xmm1
+; LIN: movhpd	(%rdi,%r[[REG4]],8), %xmm1
 
-; WIN: movaps	(%rdx), %xmm0
-; WIN: andps	(%r8), %xmm0
-; WIN: movaps	%xmm0, (%rsp)
-; WIN: movslq	(%rsp), %[[REG1:r.+]]
-; WIN: movslq	4(%rsp), %[[REG2:r.+]]
-; WIN: movslq	8(%rsp), %[[REG3:r.+]]
-; WIN: movslq	12(%rsp), %[[REG4:r.+]]
-; WIN: movsd	(%rcx,%[[REG1]],8), %xmm0
-; WIN: movhpd	(%rcx,%[[REG2]],8), %xmm0
-; WIN: movsd	(%rcx,%[[REG3]],8), %xmm1
-; WIN: movhpd	(%rcx,%[[REG4]],8), %xmm1
+; WIN: movdqa	(%rdx), %xmm0
+; WIN: pand 	(%r8), %xmm0
+; WIN: pextrq	$1, %xmm0, %r[[REG4:.+]]
+; WIN: movd 	%xmm0, %r[[REG2:.+]]
+; WIN: movslq	%e[[REG2]], %r[[REG1:.+]]
+; WIN: sarq    $32, %r[[REG2]]
+; WIN: movslq	%e[[REG4]], %r[[REG3:.+]]
+; WIN: sarq    $32, %r[[REG4]]
+; WIN: movsd	(%rcx,%r[[REG1]],8), %xmm0
+; WIN: movhpd	(%rcx,%r[[REG2]],8), %xmm0
+; WIN: movsd	(%rcx,%r[[REG3]],8), %xmm1
+; WIN: movhpd	(%rcx,%r[[REG4]],8), %xmm1
 
 define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
   %a = load <4 x i32>* %i
@@ -53,3 +56,35 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
   %v3 = insertelement <4 x double> %v2, double %r3, i32 3
   ret <4 x double> %v3
 }
+
+; Check that the sequence previously used above, which bounces the vector off the
+; cache works for x86-32. Note that in this case it will not be used for index
+; calculation, since indexes are 32-bit, not 64.
+; CHECK-LABEL: old:
+; LIN32: movaps	%xmm0, (%esp)
+; LIN32-DAG: {{(mov|and)}}l	(%esp),
+; LIN32-DAG: {{(mov|and)}}l	4(%esp),
+; LIN32-DAG: {{(mov|and)}}l	8(%esp),
+; LIN32-DAG: {{(mov|and)}}l	12(%esp),
+define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind {
+  %a = load <4 x i32>* %i
+  %b = load <4 x i32>* %h
+  %j = and <4 x i32> %a, %b
+  %d0 = extractelement <4 x i32> %j, i32 0
+  %d1 = extractelement <4 x i32> %j, i32 1
+  %d2 = extractelement <4 x i32> %j, i32 2
+  %d3 = extractelement <4 x i32> %j, i32 3
+  %q0 = zext i32 %d0 to i64
+  %q1 = zext i32 %d1 to i64
+  %q2 = zext i32 %d2 to i64
+  %q3 = zext i32 %d3 to i64  
+  %r0 = and i64 %q0, %f
+  %r1 = and i64 %q1, %f
+  %r2 = and i64 %q2, %f
+  %r3 = and i64 %q3, %f
+  %v0 = insertelement <4 x i64> undef, i64 %r0, i32 0
+  %v1 = insertelement <4 x i64> %v0, i64 %r1, i32 1
+  %v2 = insertelement <4 x i64> %v1, i64 %r2, i32 2
+  %v3 = insertelement <4 x i64> %v2, i64 %r3, i32 3
+  ret <4 x i64> %v3
+}
diff --git a/test/CodeGen/X86/gcc_except_table.ll b/test/CodeGen/X86/gcc_except_table.ll
index a732eb1..abce130 100644
--- a/test/CodeGen/X86/gcc_except_table.ll
+++ b/test/CodeGen/X86/gcc_except_table.ll
@@ -15,7 +15,7 @@ define i32 @main() uwtable optsize ssp {
 
 ; MINGW64: .seh_proc
 ; MINGW64: .seh_handler __gxx_personality_v0
-; MINGW64: .seh_setframe 5, 0
+; MINGW64: .seh_setframe 5, 32
 ; MINGW64: callq _Unwind_Resume
 ; MINGW64: .seh_handlerdata
 ; MINGW64: GCC_except_table0:
diff --git a/test/CodeGen/X86/ghc-cc.ll b/test/CodeGen/X86/ghc-cc.ll
index 4dba2c0..3ada8c8 100644
--- a/test/CodeGen/X86/ghc-cc.ll
+++ b/test/CodeGen/X86/ghc-cc.ll
@@ -12,13 +12,13 @@ entry:
   ; CHECK: movl {{[0-9]*}}(%esp), %ebx
   ; CHECK-NEXT: movl {{[0-9]*}}(%esp), %ebp
   ; CHECK-NEXT: calll addtwo
-  %0 = call cc 10 i32 @addtwo(i32 %a, i32 %b)
+  %0 = call ghccc i32 @addtwo(i32 %a, i32 %b)
   ; CHECK: calll foo
   call void @foo() nounwind
   ret void
 }
 
-define cc 10 i32 @addtwo(i32 %x, i32 %y) nounwind {
+define ghccc i32 @addtwo(i32 %x, i32 %y) nounwind {
 entry:
   ; CHECK: leal (%ebx,%ebp), %eax
   %0 = add i32 %x, %y
@@ -26,7 +26,7 @@ entry:
   ret i32 %0
 }
 
-define cc 10 void @foo() nounwind {
+define ghccc void @foo() nounwind {
 entry:
   ; CHECK:      movl r1, %esi
   ; CHECK-NEXT: movl hp, %edi
@@ -37,8 +37,8 @@ entry:
   %2 = load i32* @sp
   %3 = load i32* @base
   ; CHECK: jmp bar
-  tail call cc 10 void @bar( i32 %3, i32 %2, i32 %1, i32 %0 ) nounwind
+  tail call ghccc void @bar( i32 %3, i32 %2, i32 %1, i32 %0 ) nounwind
   ret void
 }
 
-declare cc 10 void @bar(i32, i32, i32, i32)
+declare ghccc void @bar(i32, i32, i32, i32)
diff --git a/test/CodeGen/X86/ghc-cc64.ll b/test/CodeGen/X86/ghc-cc64.ll
index 403391e..7251dd6 100644
--- a/test/CodeGen/X86/ghc-cc64.ll
+++ b/test/CodeGen/X86/ghc-cc64.ll
@@ -25,13 +25,13 @@ entry:
   ; CHECK:      movq %rdi, %r13
   ; CHECK-NEXT: movq %rsi, %rbp
   ; CHECK-NEXT: callq addtwo
-  %0 = call cc 10 i64 @addtwo(i64 %a, i64 %b)
+  %0 = call ghccc i64 @addtwo(i64 %a, i64 %b)
   ; CHECK:      callq foo
   call void @foo() nounwind
   ret void
 }
 
-define cc 10 i64 @addtwo(i64 %x, i64 %y) nounwind {
+define ghccc i64 @addtwo(i64 %x, i64 %y) nounwind {
 entry:
   ; CHECK:      leaq (%r13,%rbp), %rax
   %0 = add i64 %x, %y
@@ -39,7 +39,7 @@ entry:
   ret i64 %0
 }
 
-define cc 10 void @foo() nounwind {
+define ghccc void @foo() nounwind {
 entry:
   ; CHECK:      movsd d2(%rip), %xmm6
   ; CHECK-NEXT: movsd d1(%rip), %xmm5
@@ -74,12 +74,12 @@ entry:
   %14 = load i64* @sp
   %15 = load i64* @base
   ; CHECK: jmp bar
-  tail call cc 10 void @bar( i64 %15, i64 %14, i64 %13, i64 %12, i64 %11,
+  tail call ghccc void @bar( i64 %15, i64 %14, i64 %13, i64 %12, i64 %11,
                              i64 %10, i64 %9, i64 %8, i64 %7, i64 %6,
                              float %5, float %4, float %3, float %2, double %1,
                              double %0 ) nounwind
   ret void
 }
 
-declare cc 10 void @bar(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64,
+declare ghccc void @bar(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64,
                         float, float, float, float, double, double)
diff --git a/test/CodeGen/X86/global-sections-comdat.ll b/test/CodeGen/X86/global-sections-comdat.ll
new file mode 100644
index 0000000..730050d
--- /dev/null
+++ b/test/CodeGen/X86/global-sections-comdat.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=i386-unknown-linux | FileCheck %s -check-prefix=LINUX
+; RUN: llc < %s -mtriple=i386-unknown-linux -data-sections -function-sections | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=i386-unknown-linux -data-sections -function-sections -unique-section-names=false | FileCheck %s -check-prefix=LINUX-SECTIONS-SHORT
+
+$F1 = comdat any
+define void @F1(i32 %y) comdat {
+bb0:
+switch i32 %y, label %bb5 [
+    i32 1, label %bb1
+    i32 2, label %bb2
+    i32 3, label %bb3
+    i32 4, label %bb4
+  ]
+bb1:
+  ret void
+bb2:
+  ret void
+bb3:
+  ret void
+bb4:
+  ret void
+bb5:
+  ret void
+}
+
+; LINUX: .section        .text.F1,"axG",@progbits,F1,comdat
+; LINUX: .size   F1,
+; LINUX-NEXT: .cfi_endproc
+; LINUX-NEXT: .section        .rodata.F1,"aG",@progbits,F1,comdat
+
+; LINUX-SECTIONS: .section        .text.F1,"axG",@progbits,F1,comdat
+; LINUX-SECTIONS: .size   F1,
+; LINUX-SECTIONS-NEXT: .cfi_endproc
+; LINUX-SECTIONS-NEXT: .section        .rodata.F1,"aG",@progbits,F1,comdat
+
+; LINUX-SECTIONS-SHORT: .section        .text,"axG",@progbits,F1,comdat
+; LINUX-SECTIONS-SHORT: .size   F1,
+; LINUX-SECTIONS-SHORT-NEXT: .cfi_endproc
+; LINUX-SECTIONS-SHORT-NEXT: .section        .rodata,"aG",@progbits,F1,comdat
+
+$G16 = comdat any
+@G16 = unnamed_addr constant i32 42, comdat
+
+; LINUX: .section	.rodata.cst4.G16,"aGM",@progbits,4,G16,comdat
+; LINUX-SECTIONS: .section	.rodata.cst4.G16,"aGM",@progbits,4,G16,comdat
+; LINUX-SECTIONS-SHORT: .section	.rodata.cst4,"aGM",@progbits,4,G16,comdat
diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index fa1169d..c2f4b65 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll
@@ -2,7 +2,8 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin9.7 | FileCheck %s -check-prefix=DARWIN
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -relocation-model=static | FileCheck %s -check-prefix=DARWIN-STATIC
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=DARWIN64
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -data-sections | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -data-sections -function-sections | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=x86_64-pc-linux -data-sections -function-sections -relocation-model=pic | FileCheck %s -check-prefix=LINUX-SECTIONS-PIC
 ; RUN: llc < %s -mtriple=i686-pc-win32 -data-sections -function-sections | FileCheck %s -check-prefix=WIN32-SECTIONS
 
 define void @F1() {
@@ -12,6 +13,79 @@ define void @F1() {
 ; WIN32-SECTIONS: .section        .text,"xr",one_only,_F1
 ; WIN32-SECTIONS: .globl _F1
 
+define void @F2(i32 %y) {
+bb0:
+switch i32 %y, label %bb5 [
+    i32 1, label %bb1
+    i32 2, label %bb2
+    i32 3, label %bb3
+    i32 4, label %bb4
+  ]
+bb1:
+  ret void
+bb2:
+  ret void
+bb3:
+  ret void
+bb4:
+  ret void
+bb5:
+  ret void
+}
+
+; LINUX:     .size   F2,
+; LINUX-NEX: .cfi_endproc
+; LINUX-NEX: .section        .rodata,"a",@progbits
+
+; LINUX-SECTIONS: .section        .text.F2,"ax",@progbits
+; LINUX-SECTIONS: .size   F2,
+; LINUX-SECTIONS-NEXT: .cfi_endproc
+; LINUX-SECTIONS-NEXT: .section        .rodata.F2,"a",@progbits
+
+; LINUX-SECTIONS-PIC: .section        .text.F2,"ax",@progbits
+; LINUX-SECTIONS-PIC: .size   F2,
+; LINUX-SECTIONS-PIC-NEXT: .cfi_endproc
+; LINUX-SECTIONS-PIC-NEXT: .section        .rodata.F2,"a",@progbits
+
+declare void @G()
+
+define void @F3(i32 %y) {
+bb0:
+  invoke void @G()
+          to label %bb2 unwind label %bb1
+bb1:
+  landingpad { i8*, i32 } personality i8* bitcast (void ()* @G to i8*)
+          catch i8* null
+  br label %bb2
+bb2:
+
+switch i32 %y, label %bb7 [
+    i32 1, label %bb3
+    i32 2, label %bb4
+    i32 3, label %bb5
+    i32 4, label %bb6
+  ]
+bb3:
+  ret void
+bb4:
+  ret void
+bb5:
+  ret void
+bb6:
+  ret void
+bb7:
+  ret void
+}
+
+; DARWIN64: _F3:
+; DARWIN64: .cfi_endproc
+; DARWIN64-NEXT: Leh_func_end
+; DARWIN64-NEXT: .section        __TEXT,__gcc_except_tab
+; DARWIN64-NOT: .section
+; DARWIN64: .section        __TEXT,__text,regular,pure_instructions
+; DARWIN64-NOT: .section
+; DARWIN64: LJTI{{.*}}:
+
 ; int G1;
 @G1 = common global i32 0
 
@@ -48,7 +122,7 @@ define void @F1() {
 ; LINUX-SECTIONS: .section        .rodata.G3,"a",@progbits
 ; LINUX-SECTIONS: .globl  G3
 
-; WIN32-SECTIONS: .section        .rdata,"rd",one_only,_G3
+; WIN32-SECTIONS: .section        .rdata,"dr",one_only,_G3
 ; WIN32-SECTIONS: .globl  _G3
 
 
@@ -85,7 +159,6 @@ define void @F1() {
 @"foo bar" = linkonce global i32 42
 
 ; LINUX: .type  "foo bar",@object
-; LINUX: .section ".data.foo bar","aGw",@progbits,"foo bar",comdat
 ; LINUX: .weak  "foo bar"
 ; LINUX: "foo bar":
 
@@ -98,7 +171,6 @@ define void @F1() {
 @G6 = weak_odr unnamed_addr constant [1 x i8] c"\01"
 
 ; LINUX:   .type        G6,@object
-; LINUX:   .section     .rodata.G6,"aG",@progbits,G6,comdat
 ; LINUX:   .weak        G6
 ; LINUX: G6:
 ; LINUX:   .byte        1
@@ -123,10 +195,10 @@ define void @F1() {
 ; LINUX: G7:
 ; LINUX:        .asciz  "abcdefghi"
 
-; LINUX-SECTIONS: .section        .rodata.G7,"aMS",@progbits,1
+; LINUX-SECTIONS: .section        .rodata.str1.1,"aMS",@progbits,1
 ; LINUX-SECTIONS:       .globl G7
 
-; WIN32-SECTIONS: .section        .rdata,"rd",one_only,_G7
+; WIN32-SECTIONS: .section        .rdata,"dr",one_only,_G7
 ; WIN32-SECTIONS:       .globl _G7
 
 
@@ -184,12 +256,12 @@ define void @F1() {
 @G14 = private unnamed_addr constant [4 x i8] c"foo\00", align 1
 
 ; LINUX-SECTIONS:        .type   .LG14,@object           # @G14
-; LINUX-SECTIONS:        .section        .rodata..LG14,"aMS",@progbits,1
+; LINUX-SECTIONS:        .section        .rodata.str1.1,"aMS",@progbits,1
 ; LINUX-SECTIONS: .LG14:
 ; LINUX-SECTIONS:        .asciz  "foo"
 ; LINUX-SECTIONS:        .size   .LG14, 4
 
-; WIN32-SECTIONS:        .section        .rdata,"rd"
+; WIN32-SECTIONS:        .section        .rdata,"dr"
 ; WIN32-SECTIONS: L_G14:
 ; WIN32-SECTIONS:        .asciz  "foo"
 
@@ -208,8 +280,8 @@ define void @F1() {
 ; DARWIN64: .section       __TEXT,__const
 ; DARWIN64: _G15:
 
-; LINUX-SECTIONS: .section      .rodata.G15,"aM",@progbits,8
+; LINUX-SECTIONS: .section      .rodata.cst8,"aM",@progbits,8
 ; LINUX-SECTIONS: G15:
 
-; WIN32-SECTIONS: .section      .rdata,"rd",one_only,_G15
+; WIN32-SECTIONS: .section      .rdata,"dr",one_only,_G15
 ; WIN32-SECTIONS: _G15:
diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll
index 34191e3..c9e5290 100644
--- a/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/test/CodeGen/X86/hoist-invariant-load.ll
@@ -27,4 +27,4 @@ for.end:                                          ; preds = %for.body
 
 declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
 
-!0 = metadata !{}
+!0 = !{}
diff --git a/test/CodeGen/X86/huge-stack-offset.ll b/test/CodeGen/X86/huge-stack-offset.ll
new file mode 100644
index 0000000..6195161
--- /dev/null
+++ b/test/CodeGen/X86/huge-stack-offset.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -mtriple=x86_64-linux-unknown | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc < %s -mtriple=i386-linux-unknown | FileCheck %s --check-prefix=CHECK-32
+
+; Test that a large stack offset uses a single add/sub instruction to
+; adjust the stack pointer.
+
+define void @foo() nounwind {
+; CHECK-64-LABEL: foo:
+; CHECK-64:      movabsq $50000000{{..}}, %rax
+; CHECK-64-NEXT: subq    %rax, %rsp
+; CHECK-64-NOT:  subq    $2147483647, %rsp
+; CHECK-64:      movabsq $50000000{{..}}, [[RAX:%r..]]
+; CHECK-64-NEXT: addq    [[RAX]], %rsp
+
+; CHECK-32-LABEL: foo:
+; CHECK-32:      movl    $50000000{{..}}, %eax
+; CHECK-32-NEXT: subl    %eax, %esp
+; CHECK-32-NOT:  subl    $2147483647, %esp
+; CHECK-32:      movl    $50000000{{..}}, [[EAX:%e..]]
+; CHECK-32-NEXT: addl    [[EAX]], %esp
+  %1 = alloca [5000000000 x i8], align 16
+  %2 = getelementptr inbounds [5000000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret void
+}
+
+; Verify that we do not clobber the return value.
+
+define i32 @foo2() nounwind {
+; CHECK-64-LABEL: foo2:
+; CHECK-64:     movl    $10, %eax
+; CHECK-64-NOT: movabsq ${{.*}}, %rax
+
+; CHECK-32-LABEL: foo2:
+; CHECK-32:     movl    $10, %eax
+; CHECK-32-NOT: movl    ${{.*}}, %eax
+  %1 = alloca [5000000000 x i8], align 16
+  %2 = getelementptr inbounds [5000000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret i32 10
+}
+
+; Verify that we do not clobber EAX when using inreg attribute
+
+define i32 @foo3(i32 inreg %x) nounwind {
+; CHECK-64-LABEL: foo3:
+; CHECK-64:      movabsq $50000000{{..}}, %rax
+; CHECK-64-NEXT: subq    %rax, %rsp
+
+; CHECK-32-LABEL: foo3:
+; CHECK-32:      subl $2147483647, %esp
+; CHECK-32-NOT:  movl ${{.*}}, %eax
+  %1 = alloca [5000000000 x i8], align 16
+  %2 = getelementptr inbounds [5000000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret i32 %x
+}
+
+declare void @bar(i8*)
diff --git a/test/CodeGen/X86/i1narrowfail.ll b/test/CodeGen/X86/i1narrowfail.ll
new file mode 100644
index 0000000..e280f3c
--- /dev/null
+++ b/test/CodeGen/X86/i1narrowfail.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: @foo
+; CHECK: orb     $16
+define void @foo(i64* %ptr) {
+  %r11 = load i64* %ptr, align 8
+  %r12 = or i64 16, %r11
+  store i64 %r12, i64* %ptr, align 8
+  ret void
+}
diff --git a/test/CodeGen/X86/ident-metadata.ll b/test/CodeGen/X86/ident-metadata.ll
index a568673..e08738f 100644
--- a/test/CodeGen/X86/ident-metadata.ll
+++ b/test/CodeGen/X86/ident-metadata.ll
@@ -5,5 +5,5 @@
 ; CHECK: .ident  "clang version x.x"
 ; CHECK-NEXT: .ident  "something else"
 !llvm.ident = !{!0, !1}
-!0 = metadata !{metadata !"clang version x.x"}
-!1 = metadata !{metadata !"something else"}
+!0 = !{!"clang version x.x"}
+!1 = !{!"something else"}
diff --git a/test/CodeGen/X86/imul.ll b/test/CodeGen/X86/imul.ll
new file mode 100644
index 0000000..c64b4e3
--- /dev/null
+++ b/test/CodeGen/X86/imul.ll
@@ -0,0 +1,110 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-pc-linux | FileCheck %s --check-prefix=X86
+
+define i32 @mul4_32(i32 %A) {
+; X64-LABEL: mul4_32:
+; X64: leal
+; X86-LABEL: mul4_32:
+; X86: shll
+    %mul = mul i32 %A, 4
+    ret i32 %mul
+}
+
+define i64 @mul4_64(i64 %A) {
+; X64-LABEL: mul4_64:
+; X64: leaq
+; X86-LABEL: mul4_64:
+; X86: shldl
+; X86: shll
+    %mul = mul i64 %A, 4
+    ret i64 %mul
+}
+
+define i32 @mul4096_32(i32 %A) {
+; X64-LABEL: mul4096_32:
+; X64: shll
+; X86-LABEL: mul4096_32:
+; X86: shll
+    %mul = mul i32 %A, 4096
+    ret i32 %mul
+}
+
+define i64 @mul4096_64(i64 %A) {
+; X64-LABEL: mul4096_64:
+; X64: shlq
+; X86-LABEL: mul4096_64:
+; X86: shldl
+; X86: shll
+    %mul = mul i64 %A, 4096
+    ret i64 %mul
+}
+
+define i32 @mulmin4096_32(i32 %A) {
+; X64-LABEL: mulmin4096_32:
+; X64: shll
+; X64-NEXT: negl
+; X86-LABEL: mulmin4096_32:
+; X86: shll
+; X86-NEXT: negl
+    %mul = mul i32 %A, -4096
+    ret i32 %mul
+}
+
+define i64 @mulmin4096_64(i64 %A) {
+; X64-LABEL: mulmin4096_64:
+; X64: shlq
+; X64-NEXT: negq
+; X86-LABEL: mulmin4096_64:
+; X86: shldl
+; X86-NEXT: shll
+; X86-NEXT: xorl
+; X86-NEXT: negl
+; X86-NEXT: sbbl
+    %mul = mul i64 %A, -4096
+    ret i64 %mul
+}
+
+define i32 @mul3_32(i32 %A) {
+; X64-LABEL: mul3_32:
+; X64: leal
+; X86-LABEL: mul3_32:
+; But why?!
+; X86: imull
+    %mul = mul i32 %A, 3
+    ret i32 %mul
+}
+
+define i64 @mul3_64(i64 %A) {
+; X64-LABEL: mul3_64:
+; X64: leaq
+; X86-LABEL: mul3_64:
+; X86: mull
+; X86-NEXT: imull
+    %mul = mul i64 %A, 3
+    ret i64 %mul
+}
+
+define i32 @mul40_32(i32 %A) {
+; X64-LABEL: mul40_32:
+; X64: shll
+; X64-NEXT: leal
+; X86-LABEL: mul40_32:
+; X86: shll
+; X86-NEXT: leal
+    %mul = mul i32 %A, 40
+    ret i32 %mul
+}
+
+define i64 @mul40_64(i64 %A) {
+; X64-LABEL: mul40_64:
+; X64: shlq
+; X64-NEXT: leaq
+; X86-LABEL: mul40_64:
+; X86: leal
+; X86-NEXT: movl
+; X86-NEXT: mull
+; X86-NEXT: leal
+    %mul = mul i64 %A, 40
+    ret i64 %mul
+}
diff --git a/test/CodeGen/X86/imul64-lea.ll b/test/CodeGen/X86/imul64-lea.ll
deleted file mode 100644
index 047c129..0000000
--- a/test/CodeGen/X86/imul64-lea.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 | FileCheck %s
-
-; Test that 64-bit LEAs are generated for both LP64 and ILP32 in 64-bit mode.
-declare i64 @foo64()
-
-define i64 @test64() {
-  %tmp.0 = tail call i64 @foo64( )
-  %tmp.1 = mul i64 %tmp.0, 9
-; CHECK-NOT: mul
-; CHECK: leaq
-  ret i64 %tmp.1
-}
-
-; Test that 32-bit LEAs are generated for both LP64 and ILP32 in 64-bit mode.
-declare i32 @foo32()
-
-define i32 @test32() {
-  %tmp.0 = tail call i32 @foo32( )
-  %tmp.1 = mul i32 %tmp.0, 9
-; CHECK-NOT: mul
-; CHECK: leal
-  ret i32 %tmp.1
-}
-
diff --git a/test/CodeGen/X86/inalloca-ctor.ll b/test/CodeGen/X86/inalloca-ctor.ll
index 7cfa929..b1781d3 100644
--- a/test/CodeGen/X86/inalloca-ctor.ll
+++ b/test/CodeGen/X86/inalloca-ctor.ll
@@ -17,16 +17,16 @@ entry:
 ; CHECK: movl %esp,
   call void @Foo_ctor(%Foo* %c)
 ; CHECK: leal 12(%{{.*}}),
-; CHECK: subl $4, %esp
-; CHECK: calll _Foo_ctor
+; CHECK-NEXT: pushl
+; CHECK-NEXT: calll _Foo_ctor
 ; CHECK: addl $4, %esp
   %b = getelementptr %frame* %args, i32 0, i32 1
   store i32 42, i32* %b
 ; CHECK: movl $42,
   %a = getelementptr %frame* %args, i32 0, i32 0
   call void @Foo_ctor(%Foo* %a)
-; CHECK: subl $4, %esp
-; CHECK: calll _Foo_ctor
+; CHECK-NEXT: pushl
+; CHECK-NEXT: calll _Foo_ctor
 ; CHECK: addl $4, %esp
   call void @f(%frame* inalloca %args)
 ; CHECK: calll   _f
diff --git a/test/CodeGen/X86/inalloca-invoke.ll b/test/CodeGen/X86/inalloca-invoke.ll
index 6cff9ac..cc11ab3 100644
--- a/test/CodeGen/X86/inalloca-invoke.ll
+++ b/test/CodeGen/X86/inalloca-invoke.ll
@@ -31,13 +31,13 @@ blah:
           to label %invoke.cont unwind label %lpad
 
 ;  Uses end as sret param.
-; CHECK:  movl %[[end]], (%esp)
+; CHECK:  pushl %[[end]]
 ; CHECK:  calll _plus
 
 invoke.cont:
   call void @begin(%Iter* sret %beg)
 
-; CHECK:  movl %[[beg]],
+; CHECK:  pushl %[[beg]]
 ; CHECK:  calll _begin
 
   invoke void @reverse(%frame.reverse* inalloca align 4 %rev_args)
diff --git a/test/CodeGen/X86/inalloca-stdcall.ll b/test/CodeGen/X86/inalloca-stdcall.ll
index 54f97d9..65a0f77 100644
--- a/test/CodeGen/X86/inalloca-stdcall.ll
+++ b/test/CodeGen/X86/inalloca-stdcall.ll
@@ -6,6 +6,7 @@ declare x86_stdcallcc void @f(%Foo* inalloca %a)
 declare x86_stdcallcc void @i(i32 %a)
 
 define void @g() {
+; CHECK-LABEL: _g:
   %b = alloca inalloca %Foo
 ; CHECK: movl    $8, %eax
 ; CHECK: calll   __chkstk
@@ -19,7 +20,7 @@ define void @g() {
   call x86_stdcallcc void @f(%Foo* inalloca %b)
 ; CHECK: calll   _f@8
 ; CHECK-NOT: %esp
-; CHECK: subl $4, %esp
+; CHECK: pushl
 ; CHECK: calll   _i@4
   call x86_stdcallcc void @i(i32 0)
   ret void
diff --git a/test/CodeGen/X86/init-priority.ll b/test/CodeGen/X86/init-priority.ll
new file mode 100644
index 0000000..a0cff23
--- /dev/null
+++ b/test/CodeGen/X86/init-priority.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-freebsd9 | FileCheck %s
+
+; Check that our compiler never emits global constructors
+; inside the .init_array section when building for a non-Linux ELF target.
+; Because of this, the test depends on UseInitArray behavior under FreeBSD
+; as found in Generic_ELF::addClangTargetOptions().
+
+; This is to workaround a Visual Studio bug which causes field
+; UseInitArray to be left uninitialized instead of being 
+; zero-initialized (as specified in [dcl.init]p7).
+; This workaround consists in providing a user default constructor
+; that explicitly initializes field UseInitArray.
+
+%class.C = type { i8 }
+%class.D = type { i8 }
+
+@c1 = global %class.C zeroinitializer, align 1
+@d1 = global %class.D zeroinitializer, align 1
+@llvm.global_ctors = appending global [2 x { i32, void ()* }] [{ i32, void ()* } { i32 101, void ()* @_GLOBAL__I_000101 }, { i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
+
+define linkonce_odr void @_ZN1CC1Ev(%class.C* nocapture %this) {
+entry:
+  ret void
+}
+
+define linkonce_odr void @_ZN1DC1Ev(%class.D* nocapture %this) {
+entry:
+  ret void
+}
+
+define linkonce_odr void @_ZN1DC2Ev(%class.D* nocapture %this) {
+entry:
+  ret void
+}
+
+define linkonce_odr void @_ZN1CC2Ev(%class.C* nocapture %this) {
+entry:
+  ret void
+}
+
+define internal void @_GLOBAL__I_000101() nounwind readnone {
+entry:
+  ret void
+}
+
+define internal void @_GLOBAL__I_a() nounwind readnone {
+entry:
+  ret void
+}
+
+; CHECK-NOT: .init_array
diff --git a/test/CodeGen/X86/inline-asm-flag-clobber.ll b/test/CodeGen/X86/inline-asm-flag-clobber.ll
index bb7c33e..0874b51 100644
--- a/test/CodeGen/X86/inline-asm-flag-clobber.ll
+++ b/test/CodeGen/X86/inline-asm-flag-clobber.ll
@@ -29,4 +29,4 @@ entry:
   ret i32 %1
 }
 
-!0 = metadata !{i64 935930}
+!0 = !{i64 935930}
diff --git a/test/CodeGen/X86/insertps-O0-bug.ll b/test/CodeGen/X86/insertps-O0-bug.ll
new file mode 100644
index 0000000..e89ac26
--- /dev/null
+++ b/test/CodeGen/X86/insertps-O0-bug.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -O0 < %s | FileCheck %s
+
+; Check that at -O0, the backend doesn't attempt to canonicalize a vector load
+; used by an INSERTPS into a scalar load plus scalar_to_vector.
+;
+; In order to fold a load into the memory operand of an INSERTPSrm, the backend
+; tries to canonicalize a vector load in input to an INSERTPS node into a
+; scalar load plus scalar_to_vector. This would allow ISel to match the
+; INSERTPSrm variant rather than a load plus INSERTPSrr.
+;
+; However, ISel can only select an INSERTPSrm if folding a load into the operand
+; of an insertps is considered to be profitable.
+;
+; In the example below:
+;
+; __m128 test(__m128 a, __m128 *b) {
+;   __m128 c = _mm_insert_ps(a, *b, 1 << 6);
+;   return c;
+; }
+;
+; At -O0, the backend would attempt to canonicalize the load to 'b' into
+; a scalar load in the hope of matching an INSERTPSrm.
+; However, ISel would fail to recognize an INSERTPSrm since load folding is
+; always considered unprofitable at -O0. This would leave the insertps mask
+; in an invalid state.
+;
+; The problem with the canonicalization rule performed by the backend is that
+; it assumes ISel to always be able to match an INSERTPSrm. This assumption is
+; not always correct at -O0. In this example, FastISel fails to lower the
+; arguments needed by the entry block. This is enough to enable the DAGCombiner
+; and eventually trigger the canonicalization on the INSERTPS node.
+;
+; This test checks that the vector load in input to the insertps is not
+; canonicalized into a scalar load plus scalar_to_vector (a movss).
+
+define <4 x float> @test(<4 x float> %a, <4 x float>* %b) {
+; CHECK-LABEL: test:
+; CHECK: movaps (%rdi), [[REG:%[a-z0-9]+]]
+; CHECK-NOT: movss
+; CHECK: insertps $64, [[REG]],
+; CHECK: ret
+entry:
+  %0 = load <4 x float>* %b, align 16
+  %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %0, i32 64)
+  %2 = alloca <4 x float>, align 16
+  store <4 x float> %1, <4 x float>* %2, align 16
+  %3 = load <4 x float>* %2, align 16
+  ret <4 x float> %3
+}
+
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32)
diff --git a/test/CodeGen/X86/large-code-model-isel.ll b/test/CodeGen/X86/large-code-model-isel.ll
new file mode 100644
index 0000000..3c283d9
--- /dev/null
+++ b/test/CodeGen/X86/large-code-model-isel.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -code-model=large -mcpu=core2 -march=x86-64 -O0 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@.str10 = external unnamed_addr constant [2 x i8], align 1
+
+define void @foo() {
+; CHECK-LABEL: foo:
+entry:
+; CHECK: callq
+  %call = call i64* undef(i64* undef, i8* getelementptr inbounds ([2 x i8]* @.str10, i32 0, i32 0))
+  ret void
+}
diff --git a/test/CodeGen/X86/lea-2.ll b/test/CodeGen/X86/lea-2.ll
index 6fb3879..98c57c7 100644
--- a/test/CodeGen/X86/lea-2.ll
+++ b/test/CodeGen/X86/lea-2.ll
@@ -10,7 +10,7 @@ define i32 @test1(i32 %A, i32 %B) {
 ; The above computation of %tmp4 should match a single lea, without using
 ; actual add instructions.
 ; CHECK-NOT: add
-; CHECK: lea {{[a-z]+}}, dword ptr [{{[a-z]+}} + 4*{{[a-z]+}} - 5]
+; CHECK: lea {{[a-z]+}}, [{{[a-z]+}} + 4*{{[a-z]+}} - 5]
 
   ret i32 %tmp4
 }
diff --git a/test/CodeGen/X86/logical-load-fold.ll b/test/CodeGen/X86/logical-load-fold.ll
new file mode 100644
index 0000000..5aac2d7
--- /dev/null
+++ b/test/CodeGen/X86/logical-load-fold.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2,sse-unaligned-mem | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx                    | FileCheck %s --check-prefix=AVX
+
+; Although we have the ability to fold an unaligned load with AVX 
+; and under special conditions with some SSE implementations, we
+; can not fold the load under any circumstances in these test
+; cases because they are not 16-byte loads. The load must be
+; executed as a scalar ('movs*') with a zero extension to
+; 128-bits and then used in the packed logical ('andp*') op. 
+; PR22371 - http://llvm.org/bugs/show_bug.cgi?id=22371
+
+define double @load_double_no_fold(double %x, double %y) {
+; SSE2-LABEL: load_double_no_fold:
+; SSE2:       BB#0:
+; SSE2-NEXT:    cmplesd %xmm0, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    andpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: load_double_no_fold:
+; AVX:       BB#0:
+; AVX-NEXT:    vcmplesd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %cmp = fcmp oge double %x, %y
+  %zext = zext i1 %cmp to i32
+  %conv = sitofp i32 %zext to double
+  ret double %conv
+}
+
+define float @load_float_no_fold(float %x, float %y) {
+; SSE2-LABEL: load_float_no_fold:
+; SSE2:       BB#0:
+; SSE2-NEXT:    cmpless %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: load_float_no_fold:
+; AVX:       BB#0:
+; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %cmp = fcmp oge float %x, %y
+  %zext = zext i1 %cmp to i32
+  %conv = sitofp i32 %zext to float
+  ret float %conv
+}
+
diff --git a/test/CodeGen/X86/lower-vec-shift-2.ll b/test/CodeGen/X86/lower-vec-shift-2.ll
new file mode 100644
index 0000000..fb8fbba
--- /dev/null
+++ b/test/CodeGen/X86/lower-vec-shift-2.ll
@@ -0,0 +1,157 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE2
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+
+define <8 x i16> @test1(<8 x i16> %A, <8 x i16> %B) {
+; SSE2-LABEL: test1:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    psllw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test1:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
+  %shl = shl <8 x i16> %A, %vecinit14
+  ret <8 x i16> %shl
+}
+
+define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
+; SSE2-LABEL: test2:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT:    pslld %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test2:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT:    vpslld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
+  %shl = shl <4 x i32> %A, %vecinit6
+  ret <4 x i32> %shl
+}
+
+define <2 x i64> @test3(<2 x i64> %A, <2 x i64> %B) {
+; SSE2-LABEL: test3:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    psllq %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test3:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vecinit2 = shufflevector <2 x i64> %B, <2 x i64> undef, <2 x i32> zeroinitializer
+  %shl = shl <2 x i64> %A, %vecinit2
+  ret <2 x i64> %shl
+}
+
+define <8 x i16> @test4(<8 x i16> %A, <8 x i16> %B) {
+; SSE2-LABEL: test4:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    psrlw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test4:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
+  %shr = lshr <8 x i16> %A, %vecinit14
+  ret <8 x i16> %shr
+}
+
+define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) {
+; SSE2-LABEL: test5:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT:    psrld %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test5:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
+  %shr = lshr <4 x i32> %A, %vecinit6
+  ret <4 x i32> %shr
+}
+
+define <2 x i64> @test6(<2 x i64> %A, <2 x i64> %B) {
+; SSE2-LABEL: test6:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    psrlq %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test6:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vecinit2 = shufflevector <2 x i64> %B, <2 x i64> undef, <2 x i32> zeroinitializer
+  %shr = lshr <2 x i64> %A, %vecinit2
+  ret <2 x i64> %shr
+}
+
+define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) {
+; SSE2-LABEL: test7:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    psraw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test7:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
+  %shr = ashr <8 x i16> %A, %vecinit14
+  ret <8 x i16> %shr
+}
+
+define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) {
+; SSE2-LABEL: test8:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT:    psrad %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test8:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
+  %shr = ashr <4 x i32> %A, %vecinit6
+  ret <4 x i32> %shr
+}
diff --git a/test/CodeGen/X86/lzcnt-tzcnt.ll b/test/CodeGen/X86/lzcnt-tzcnt.ll
index 07e4b9d..e98764a 100644
--- a/test/CodeGen/X86/lzcnt-tzcnt.ll
+++ b/test/CodeGen/X86/lzcnt-tzcnt.ll
@@ -437,6 +437,137 @@ define i64 @test18_cttz(i64* %ptr) {
 ; CHECK: tzcnt
 ; CHECK-NEXT: ret
 
+define i16 @test1b_ctlz(i16 %v) {
+  %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+  %tobool = icmp ne i16 %v, 0
+  %cond = select i1 %tobool, i16 16, i16 %cnt
+  ret i16 %cond
+}
+; CHECK-LABEL: test1b_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test2b_ctlz(i32 %v) {
+  %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+  %tobool = icmp ne i32 %v, 0
+  %cond = select i1 %tobool, i32 32, i32 %cnt
+  ret i32 %cond
+}
+; CHECK-LABEL: test2b_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test3b_ctlz(i64 %v) {
+  %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+  %tobool = icmp ne i64 %v, 0
+  %cond = select i1 %tobool, i64 64, i64 %cnt
+  ret i64 %cond
+}
+; CHECK-LABEL: test3b_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test4b_ctlz(i16 %v) {
+  %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+  %tobool = icmp ne i16 %v, 0
+  %cond = select i1 %tobool, i16 %cnt, i16 16
+  ret i16 %cond
+}
+; CHECK-LABEL: test4b_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test5b_ctlz(i32 %v) {
+  %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+  %tobool = icmp ne i32 %v, 0
+  %cond = select i1 %tobool, i32 %cnt, i32 32
+  ret i32 %cond
+}
+; CHECK-LABEL: test5b_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test6b_ctlz(i64 %v) {
+  %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+  %tobool = icmp ne i64 %v, 0
+  %cond = select i1 %tobool, i64 %cnt, i64 64
+  ret i64 %cond
+}
+; CHECK-LABEL: test6b_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test1b_cttz(i16 %v) {
+  %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+  %tobool = icmp ne i16 %v, 0
+  %cond = select i1 %tobool, i16 16, i16 %cnt
+  ret i16 %cond
+}
+; CHECK-LABEL: test1b_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test2b_cttz(i32 %v) {
+  %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+  %tobool = icmp ne i32 %v, 0
+  %cond = select i1 %tobool, i32 32, i32 %cnt
+  ret i32 %cond
+}
+; CHECK-LABEL: test2b_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test3b_cttz(i64 %v) {
+  %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+  %tobool = icmp ne i64 %v, 0
+  %cond = select i1 %tobool, i64 64, i64 %cnt
+  ret i64 %cond
+}
+; CHECK-LABEL: test3b_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test4b_cttz(i16 %v) {
+  %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+  %tobool = icmp ne i16 %v, 0
+  %cond = select i1 %tobool, i16 %cnt, i16 16
+  ret i16 %cond
+}
+; CHECK-LABEL: test4b_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test5b_cttz(i32 %v) {
+  %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+  %tobool = icmp ne i32 %v, 0
+  %cond = select i1 %tobool, i32 %cnt, i32 32
+  ret i32 %cond
+}
+; CHECK-LABEL: test5b_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test6b_cttz(i64 %v) {
+  %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+  %tobool = icmp ne i64 %v, 0
+  %cond = select i1 %tobool, i64 %cnt, i64 64
+  ret i64 %cond
+}
+; CHECK-LABEL: test6b_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
 
 declare i64 @llvm.cttz.i64(i64, i1)
 declare i32 @llvm.cttz.i32(i32, i1)
diff --git a/test/CodeGen/X86/macho-comdat.ll b/test/CodeGen/X86/macho-comdat.ll
index 3c2d997..6056047 100644
--- a/test/CodeGen/X86/macho-comdat.ll
+++ b/test/CodeGen/X86/macho-comdat.ll
@@ -2,5 +2,5 @@
 ; RUN: FileCheck < %t %s
 
 $f = comdat any
-@v = global i32 0, comdat $f
+@v = global i32 0, comdat($f)
 ; CHECK: LLVM ERROR: MachO doesn't support COMDATs, 'f' cannot be lowered.
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
new file mode 100644
index 0000000..f268c57
--- /dev/null
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -0,0 +1,219 @@
+; RUN: llc -mtriple=x86_64-apple-darwin  -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512
+; RUN: llc -mtriple=x86_64-apple-darwin  -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
+; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=AVX_SCALAR
+
+; AVX512-LABEL: test1
+; AVX512: vmovdqu32       (%rdi), %zmm0 {%k1} {z}
+
+; AVX2-LABEL: test1
+; AVX2: vpmaskmovd      32(%rdi)
+; AVX2: vpmaskmovd      (%rdi)
+; AVX2-NOT: blend
+
+; AVX_SCALAR-LABEL: test1
+; AVX_SCALAR-NOT: masked
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: insertelement
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: insertelement
+define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
+  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
+  %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
+  ret <16 x i32> %res
+}
+
+; AVX512-LABEL: test2
+; AVX512: vmovdqu32       (%rdi), %zmm0 {%k1} {z}
+
+; AVX2-LABEL: test2
+; AVX2: vpmaskmovd      {{.*}}(%rdi)
+; AVX2: vpmaskmovd      {{.*}}(%rdi)
+; AVX2-NOT: blend
+define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
+  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
+  %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer)
+  ret <16 x i32> %res
+}
+
+; AVX512-LABEL: test3
+; AVX512: vmovdqu32       %zmm1, (%rdi) {%k1}
+
+; AVX_SCALAR-LABEL: test3
+; AVX_SCALAR-NOT: masked
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
+define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
+  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
+  ret void
+}
+
+; AVX512-LABEL: test4
+; AVX512: vmovups       (%rdi), %zmm{{.*{%k[1-7]}}}
+
+; AVX2-LABEL: test4
+; AVX2: vmaskmovps      {{.*}}(%rdi)
+; AVX2: vmaskmovps      {{.*}}(%rdi)
+; AVX2: blend
+define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
+  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
+  %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
+  ret <16 x float> %res
+}
+
+; AVX512-LABEL: test5
+; AVX512: vmovupd (%rdi), %zmm1 {%k1}
+
+; AVX2-LABEL: test5
+; AVX2: vmaskmovpd
+; AVX2: vblendvpd
+; AVX2: vmaskmovpd
+; AVX2: vblendvpd
+define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst)
+  ret <8 x double> %res
+}
+
+; AVX2-LABEL: test6
+; AVX2: vmaskmovpd
+; AVX2: vblendvpd
+define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
+  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+  %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
+  ret <2 x double> %res
+}
+
+; AVX2-LABEL: test7
+; AVX2: vmaskmovps      {{.*}}(%rdi)
+; AVX2: blend
+define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
+  ret <4 x float> %res
+}
+
+; AVX2-LABEL: test8
+; AVX2: vpmaskmovd      {{.*}}(%rdi)
+; AVX2: blend
+define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
+  ret <4 x i32> %res
+}
+
+; AVX2-LABEL: test9
+; AVX2: vpmaskmovd %xmm
+define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test10
+; AVX2: vmaskmovpd    (%rdi), %ymm
+; AVX2: blend
+define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1>%mask, <4 x double>%dst)
+  ret <4 x double> %res
+}
+
+; AVX2-LABEL: test11
+; AVX2: vmaskmovps
+; AVX2: vblendvps
+define <8 x float> @test11(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
+  ret <8 x float> %res
+}
+
+; AVX2-LABEL: test12
+; AVX2: vpmaskmovd %ymm
+define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
+  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
+  ret void
+}
+
+; AVX512-LABEL: test13
+; AVX512: vmovups       %zmm1, (%rdi) {%k1}
+
+define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
+  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test14
+; AVX2: vpshufd
+; AVX2: vmovq
+; AVX2: vmaskmovps
+define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test15
+; AVX2: vpmaskmovd
+define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
+  ret void
+}
+
+; AVX2-LABEL: test16
+; AVX2: vmaskmovps
+; AVX2: vblendvps
+define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
+  ret <2 x float> %res
+}
+
+; AVX2-LABEL: test17
+; AVX2: vpmaskmovd
+; AVX2: vblendvps
+; AVX2: vpmovsxdq
+define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
+  ret <2 x i32> %res
+}
+
+; AVX2-LABEL: test18
+; AVX2: vmaskmovps
+; AVX2-NOT: blend
+define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
+  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
+  ret <2 x float> %res
+}
+
+
+declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
+declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
+declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
+
diff --git a/test/CodeGen/X86/mem-intrin-base-reg.ll b/test/CodeGen/X86/mem-intrin-base-reg.ll
index dd7f396..9a6de3d 100644
--- a/test/CodeGen/X86/mem-intrin-base-reg.ll
+++ b/test/CodeGen/X86/mem-intrin-base-reg.ll
@@ -63,7 +63,7 @@ spill_vectors:
 ; CHECK-LABEL: _memcpy_vla_vector:
 ; CHECK: andl $-16, %esp
 ; CHECK: movl %esp, %esi
-; CHECK: movl $128, {{.*}}(%esp)
+; CHECK: pushl $128
 ; CHECK: calll _memcpy
 ; CHECK: calll __chkstk
 
diff --git a/test/CodeGen/X86/misched-code-difference-with-debug.ll b/test/CodeGen/X86/misched-code-difference-with-debug.ll
new file mode 100644
index 0000000..fb2a986
--- /dev/null
+++ b/test/CodeGen/X86/misched-code-difference-with-debug.ll
@@ -0,0 +1,90 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-unknown -mcpu=generic | FileCheck %s
+; Both functions should produce the same code. The presence of debug values
+; should not affect the scheduling strategy.
+; Generated from:
+; char argc;
+; class C {
+; public:
+;   int test(char ,char ,char ,...);
+; };
+; void foo() {
+;   C c;
+;   char lc = argc;
+;   c.test(0,argc,0,lc);
+;   c.test(0,argc,0,lc);
+; }
+;
+; with
+; clang -O2 -c test.cpp -emit-llvm -S
+; clang -O2 -c test.cpp -emit-llvm -S -g
+;
+
+
+%class.C = type { i8 }
+
+@argc = global i8 0, align 1
+
+declare i32 @test_function(%class.C*, i8 signext, i8 signext, i8 signext, ...)
+
+; CHECK-LABEL: test_without_debug
+; CHECK: movl [[A:%[a-z]+]], [[B:%[a-z]+]]
+; CHECK-NEXT: movl [[A]], [[C:%[a-z]+]]
+define void @test_without_debug() {
+entry:
+  %c = alloca %class.C, align 1
+  %0 = load i8* @argc, align 1
+  %conv = sext i8 %0 to i32
+  %call = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
+  %1 = load i8* @argc, align 1
+  %call2 = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
+  ret void
+}
+
+; CHECK-LABEL: test_with_debug
+; CHECK: movl [[A]], [[B]]
+; CHECK-NEXT: movl [[A]], [[C]]
+define void @test_with_debug() {
+entry:
+  %c = alloca %class.C, align 1
+  %0 = load i8* @argc, align 1
+  tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !19, metadata !29)
+  %conv = sext i8 %0 to i32
+  tail call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29)
+  %call = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
+  %1 = load i8* @argc, align 1
+  call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29)
+  %call2 = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22, !23}
+
+!0 = !{!"", !1, !2, !3, !12, !20, !2} ; [ DW_TAG_compile_unit ] [test.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"test.cpp", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2\00C\002\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS1C"} ; [ DW_TAG_class_type ] [C] [line 2, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"", !1, !"_ZTS1C", !7, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [public] [test]
+!7 = !{!"", null, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !10, !11, !11, !11, null}
+!9 = !{!"", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!11 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!12 = !{!13}
+!13 = !{!"0x2e\00test_with_debug\00test_with_debug\00test_with_debug\006\000\001\000\000\00256\001\006", !1, !14, !15, null, void ()* @test_with_debug, null, null, !17} ; [ DW_TAG_subprogram ] [line 6] [def] [test_with_debug]
+!14 = !{!"0x29", !1}
+!15 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null}
+!17 = !{!18, !19}
+!18 = !{!"0x100\00c\007\000", !13, !14, !"_ZTS1C"} ; [ DW_TAG_auto_variable ] [c] [line 7]
+!19 = !{!"0x100\00lc\008\000", !13, !14, !11} ; [ DW_TAG_auto_variable ] [lc] [line 8]
+!20 = !{!21}
+!21 = !{!"0x34\00argc\00argc\00\001\000\001", null, !14, !11, i8* @argc, null} ; [ DW_TAG_variable ] [argc] [line 1] [def]
+!22 = !{i32 2, !"Dwarf Version", i32 4}
+!23 = !{i32 2, !"Debug Info Version", i32 2}
+!25 = !MDLocation(line: 8, column: 3, scope: !13)
+!29 = !{!"0x102"}               ; [ DW_TAG_expression ]
diff --git a/test/CodeGen/X86/misched-copy.ll b/test/CodeGen/X86/misched-copy.ll
index 4485b8a..3e37292 100644
--- a/test/CodeGen/X86/misched-copy.ll
+++ b/test/CodeGen/X86/misched-copy.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
 ;
 ; Test scheduling of copy instructions.
 ;
@@ -44,6 +44,6 @@ end:
 
 attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
-!0 = metadata !{metadata !"float", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!0 = !{!"float", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/misched-crash.ll b/test/CodeGen/X86/misched-crash.ll
index 7644ee0..21c3fa3 100644
--- a/test/CodeGen/X86/misched-crash.ll
+++ b/test/CodeGen/X86/misched-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -enable-misched -verify-misched
+; RUN: llc < %s -verify-machineinstrs -enable-misched -verify-misched
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10"
 
diff --git a/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
new file mode 100644
index 0000000..c536a39
--- /dev/null
+++ b/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86-64
+;
+; On Darwin x86-64, v8i8, v4i16, v2i32 values are passed in XMM[0-7].
+; On Darwin x86-64, v1i64 values are passed in 64-bit GPRs.
+
+@g_v8qi = external global <8 x i8>
+
+define void @t3() nounwind  {
+; X86-64-LABEL: t3:
+; X86-64:       ## BB#0:
+; X86-64-NEXT:    movq _g_v8qi@{{.*}}(%rip), %rax
+; X86-64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-64-NEXT:    movb $1, %al
+; X86-64-NEXT:    jmp _pass_v8qi ## TAILCALL
+  %tmp3 = load <8 x i8>* @g_v8qi, align 8
+  %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
+  %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
+  ret void
+}
+
+define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind  {
+; X86-64-LABEL: t4:
+; X86-64:       ## BB#0:
+; X86-64-NEXT:    movdq2q %xmm1, %mm0
+; X86-64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
+; X86-64-NEXT:    movdq2q %xmm0, %mm0
+; X86-64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
+; X86-64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-64-NEXT:    paddb %xmm0, %xmm1
+; X86-64-NEXT:    movd %xmm1, %rax
+; X86-64-NEXT:    movd %rax, %xmm0
+; X86-64-NEXT:    movb $1, %al
+; X86-64-NEXT:    jmp _pass_v8qi ## TAILCALL
+  %v1a = bitcast x86_mmx %v1 to <8 x i8>
+  %v2b = bitcast x86_mmx %v2 to <8 x i8>
+  %tmp3 = add <8 x i8> %v1a, %v2b
+  %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
+  %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
+  ret void
+}
+
+define void @t5() nounwind  {
+; X86-64-LABEL: t5:
+; X86-64:       ## BB#0:
+; X86-64-NEXT:    pushq %rax
+; X86-64-NEXT:    xorl %edi, %edi
+; X86-64-NEXT:    callq _pass_v1di
+; X86-64-NEXT:    popq %rax
+; X86-64-NEXT:    retq
+  call void @pass_v1di( <1 x i64> zeroinitializer )
+  ret void
+}
+
+declare i32 @pass_v8qi(...)
+declare void @pass_v1di(<1 x i64>)
diff --git a/test/CodeGen/X86/mmx-arg-passing.ll b/test/CodeGen/X86/mmx-arg-passing.ll
index 3a0fb95..4e00310 100644
--- a/test/CodeGen/X86/mmx-arg-passing.ll
+++ b/test/CodeGen/X86/mmx-arg-passing.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | FileCheck %s -check-prefix=X86-32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-64
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | FileCheck %s --check-prefix=X86-32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86-64
 ;
 ; On Darwin x86-32, v8i8, v4i16, v2i32 values are passed in MM[0-2].
 ; On Darwin x86-32, v1i64 values are passed in memory.  In this example, they
@@ -10,29 +10,40 @@
 @u1 = external global x86_mmx
 
 define void @t1(x86_mmx %v1) nounwind  {
-	store x86_mmx %v1, x86_mmx* @u1, align 8
-	ret void
-
 ; X86-32-LABEL: t1:
-; X86-32: movq %mm0
-
+; X86-32:       ## BB#0:
+; X86-32-NEXT:    movl L_u1$non_lazy_ptr, %eax
+; X86-32-NEXT:    movq %mm0, (%eax)
+; X86-32-NEXT:    retl
+;
 ; X86-64-LABEL: t1:
-; X86-64: movdq2q %xmm0
-; X86-64: movq %mm0
+; X86-64:       ## BB#0:
+; X86-64-NEXT:    movdq2q %xmm0, %mm0
+; X86-64-NEXT:    movq _u1@{{.*}}(%rip), %rax
+; X86-64-NEXT:    movq %mm0, (%rax)
+; X86-64-NEXT:    retq
+	store x86_mmx %v1, x86_mmx* @u1, align 8
+	ret void
 }
 
 @u2 = external global x86_mmx
 
 define void @t2(<1 x i64> %v1) nounwind  {
+; X86-32-LABEL: t2:
+; X86-32:       ## BB#0:
+; X86-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT:    movl L_u2$non_lazy_ptr, %edx
+; X86-32-NEXT:    movl %ecx, 4(%edx)
+; X86-32-NEXT:    movl %eax, (%edx)
+; X86-32-NEXT:    retl
+;
+; X86-64-LABEL: t2:
+; X86-64:       ## BB#0:
+; X86-64-NEXT:    movq _u2@{{.*}}(%rip), %rax
+; X86-64-NEXT:    movq %rdi, (%rax)
+; X86-64-NEXT:    retq
         %tmp = bitcast <1 x i64> %v1 to x86_mmx
 	store x86_mmx %tmp, x86_mmx* @u2, align 8
 	ret void
-
-; X86-32-LABEL: t2:
-; X86-32: movl 4(%esp)
-; X86-32: movl 8(%esp)
-
-; X86-64-LABEL: t2:
-; X86-64: movq %rdi
 }
-
diff --git a/test/CodeGen/X86/mmx-arg-passing2.ll b/test/CodeGen/X86/mmx-arg-passing2.ll
deleted file mode 100644
index c132d31..0000000
--- a/test/CodeGen/X86/mmx-arg-passing2.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep movdq2q | count 2
-; Since the add is not an MMX add, we don't have a movq2dq any more.
-
-@g_v8qi = external global <8 x i8>
-
-define void @t1() nounwind  {
-	%tmp3 = load <8 x i8>* @g_v8qi, align 8
-        %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
-	%tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
-	ret void
-}
-
-define void @t2(x86_mmx %v1, x86_mmx %v2) nounwind  {
-       %v1a = bitcast x86_mmx %v1 to <8 x i8>
-       %v2b = bitcast x86_mmx %v2 to <8 x i8>
-       %tmp3 = add <8 x i8> %v1a, %v2b
-       %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
-       %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
-       ret void
-}
-
-define void @t3() nounwind  {
-	call void @pass_v1di( <1 x i64> zeroinitializer )
-        ret void
-}
-
-declare i32 @pass_v8qi(...)
-declare void @pass_v1di(<1 x i64>)
diff --git a/test/CodeGen/X86/mmx-arith.ll b/test/CodeGen/X86/mmx-arith.ll
index 6817487..d9d1fbf 100644
--- a/test/CodeGen/X86/mmx-arith.ll
+++ b/test/CodeGen/X86/mmx-arith.ll
@@ -1,309 +1,308 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck -check-prefix=X64 %s
 
 ;; A basic sanity check to make sure that MMX arithmetic actually compiles.
 ;; First is a straight translation of the original with bitcasts as needed.
 
-define void @foo(x86_mmx* %A, x86_mmx* %B) {
+; X32-LABEL: test0
+; X64-LABEL: test0
+define void @test0(x86_mmx* %A, x86_mmx* %B) {
 entry:
-	%tmp1 = load x86_mmx* %A		; <x86_mmx> [#uses=1]
-	%tmp3 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8>
-        %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8>
-	%tmp4 = add <8 x i8> %tmp1a, %tmp3a		; <<8 x i8>> [#uses=2]
-        %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx
-	store x86_mmx %tmp4a, x86_mmx* %A
-	%tmp7 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b( x86_mmx %tmp4a, x86_mmx %tmp7 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp12, x86_mmx* %A
-	%tmp16 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b( x86_mmx %tmp12, x86_mmx %tmp16 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp21, x86_mmx* %A
-	%tmp27 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8>
-        %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8>
-	%tmp28 = sub <8 x i8> %tmp21a, %tmp27a		; <<8 x i8>> [#uses=2]
-        %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx
-	store x86_mmx %tmp28a, x86_mmx* %A
-	%tmp31 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b( x86_mmx %tmp28a, x86_mmx %tmp31 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp36, x86_mmx* %A
-	%tmp40 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b( x86_mmx %tmp36, x86_mmx %tmp40 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp45, x86_mmx* %A
-	%tmp51 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8>
-        %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8>
-	%tmp52 = mul <8 x i8> %tmp45a, %tmp51a		; <<8 x i8>> [#uses=2]
-        %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx
-	store x86_mmx %tmp52a, x86_mmx* %A
-	%tmp57 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8>
-	%tmp58 = and <8 x i8> %tmp52, %tmp57a		; <<8 x i8>> [#uses=2]
-        %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx
-	store x86_mmx %tmp58a, x86_mmx* %A
-	%tmp63 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8>
-	%tmp64 = or <8 x i8> %tmp58, %tmp63a		; <<8 x i8>> [#uses=2]
-        %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx
-	store x86_mmx %tmp64a, x86_mmx* %A
-	%tmp69 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8>
-        %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8>
-	%tmp70 = xor <8 x i8> %tmp64b, %tmp69a		; <<8 x i8>> [#uses=1]
-        %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx
-	store x86_mmx %tmp70a, x86_mmx* %A
-	tail call void @llvm.x86.mmx.emms( )
-	ret void
+  %tmp1 = load x86_mmx* %A
+  %tmp3 = load x86_mmx* %B
+  %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8>
+  %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8>
+  %tmp4 = add <8 x i8> %tmp1a, %tmp3a
+  %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx
+  store x86_mmx %tmp4a, x86_mmx* %A
+  %tmp7 = load x86_mmx* %B
+  %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %tmp4a, x86_mmx %tmp7)
+  store x86_mmx %tmp12, x86_mmx* %A
+  %tmp16 = load x86_mmx* %B
+  %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %tmp12, x86_mmx %tmp16)
+  store x86_mmx %tmp21, x86_mmx* %A
+  %tmp27 = load x86_mmx* %B
+  %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8>
+  %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8>
+  %tmp28 = sub <8 x i8> %tmp21a, %tmp27a
+  %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx
+  store x86_mmx %tmp28a, x86_mmx* %A
+  %tmp31 = load x86_mmx* %B
+  %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %tmp28a, x86_mmx %tmp31)
+  store x86_mmx %tmp36, x86_mmx* %A
+  %tmp40 = load x86_mmx* %B
+  %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %tmp36, x86_mmx %tmp40)
+  store x86_mmx %tmp45, x86_mmx* %A
+  %tmp51 = load x86_mmx* %B
+  %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8>
+  %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8>
+  %tmp52 = mul <8 x i8> %tmp45a, %tmp51a
+  %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx
+  store x86_mmx %tmp52a, x86_mmx* %A
+  %tmp57 = load x86_mmx* %B
+  %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8>
+  %tmp58 = and <8 x i8> %tmp52, %tmp57a
+  %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx
+  store x86_mmx %tmp58a, x86_mmx* %A
+  %tmp63 = load x86_mmx* %B
+  %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8>
+  %tmp64 = or <8 x i8> %tmp58, %tmp63a
+  %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx
+  store x86_mmx %tmp64a, x86_mmx* %A
+  %tmp69 = load x86_mmx* %B
+  %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8>
+  %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8>
+  %tmp70 = xor <8 x i8> %tmp64b, %tmp69a
+  %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx
+  store x86_mmx %tmp70a, x86_mmx* %A
+  tail call void @llvm.x86.mmx.emms()
+  ret void
 }
 
-define void @baz(x86_mmx* %A, x86_mmx* %B) {
+; X32-LABEL: test1
+; X64-LABEL: test1
+define void @test1(x86_mmx* %A, x86_mmx* %B) {
 entry:
-	%tmp1 = load x86_mmx* %A		; <x86_mmx> [#uses=1]
-	%tmp3 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32>
-        %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32>
-	%tmp4 = add <2 x i32> %tmp1a, %tmp3a		; <<2 x i32>> [#uses=2]
-        %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx
-	store x86_mmx %tmp4a, x86_mmx* %A
-	%tmp9 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32>
-	%tmp10 = sub <2 x i32> %tmp4, %tmp9a		; <<2 x i32>> [#uses=2]
-        %tmp10a = bitcast <2 x i32> %tmp4 to x86_mmx
-	store x86_mmx %tmp10a, x86_mmx* %A
-	%tmp15 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32>
-        %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
-	%tmp16 = mul <2 x i32> %tmp10b, %tmp15a		; <<2 x i32>> [#uses=2]
-        %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
-	store x86_mmx %tmp16a, x86_mmx* %A
-	%tmp21 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32>
-        %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32>
-	%tmp22 = and <2 x i32> %tmp16b, %tmp21a		; <<2 x i32>> [#uses=2]
-        %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx
-	store x86_mmx %tmp22a, x86_mmx* %A
-	%tmp27 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32>
-        %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32>
-	%tmp28 = or <2 x i32> %tmp22b, %tmp27a		; <<2 x i32>> [#uses=2]
-        %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx
-	store x86_mmx %tmp28a, x86_mmx* %A
-	%tmp33 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32>
-        %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32>
-	%tmp34 = xor <2 x i32> %tmp28b, %tmp33a		; <<2 x i32>> [#uses=1]
-        %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx
-	store x86_mmx %tmp34a, x86_mmx* %A
-	tail call void @llvm.x86.mmx.emms( )
-	ret void
+  %tmp1 = load x86_mmx* %A
+  %tmp3 = load x86_mmx* %B
+  %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32>
+  %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32>
+  %tmp4 = add <2 x i32> %tmp1a, %tmp3a
+  %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx
+  store x86_mmx %tmp4a, x86_mmx* %A
+  %tmp9 = load x86_mmx* %B
+  %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32>
+  %tmp10 = sub <2 x i32> %tmp4, %tmp9a
+  %tmp10a = bitcast <2 x i32> %tmp4 to x86_mmx
+  store x86_mmx %tmp10a, x86_mmx* %A
+  %tmp15 = load x86_mmx* %B
+  %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32>
+  %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
+  %tmp16 = mul <2 x i32> %tmp10b, %tmp15a
+  %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
+  store x86_mmx %tmp16a, x86_mmx* %A
+  %tmp21 = load x86_mmx* %B
+  %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32>
+  %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32>
+  %tmp22 = and <2 x i32> %tmp16b, %tmp21a
+  %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx
+  store x86_mmx %tmp22a, x86_mmx* %A
+  %tmp27 = load x86_mmx* %B
+  %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32>
+  %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32>
+  %tmp28 = or <2 x i32> %tmp22b, %tmp27a
+  %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx
+  store x86_mmx %tmp28a, x86_mmx* %A
+  %tmp33 = load x86_mmx* %B
+  %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32>
+  %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32>
+  %tmp34 = xor <2 x i32> %tmp28b, %tmp33a
+  %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx
+  store x86_mmx %tmp34a, x86_mmx* %A
+  tail call void @llvm.x86.mmx.emms( )
+  ret void
 }
 
-define void @bar(x86_mmx* %A, x86_mmx* %B) {
+; X32-LABEL: test2
+; X64-LABEL: test2
+define void @test2(x86_mmx* %A, x86_mmx* %B) {
 entry:
-	%tmp1 = load x86_mmx* %A		; <x86_mmx> [#uses=1]
-	%tmp3 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16>
-        %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16>
-	%tmp4 = add <4 x i16> %tmp1a, %tmp3a		; <<4 x i16>> [#uses=2]
-        %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx
-	store x86_mmx %tmp4a, x86_mmx* %A
-	%tmp7 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w( x86_mmx %tmp4a, x86_mmx %tmp7 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp12, x86_mmx* %A
-	%tmp16 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp12, x86_mmx %tmp16 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp21, x86_mmx* %A
-	%tmp27 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16>
-        %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16>
-	%tmp28 = sub <4 x i16> %tmp21a, %tmp27a		; <<4 x i16>> [#uses=2]
-        %tmp28a = bitcast <4 x i16> %tmp28 to x86_mmx
-	store x86_mmx %tmp28a, x86_mmx* %A
-	%tmp31 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w( x86_mmx %tmp28a, x86_mmx %tmp31 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp36, x86_mmx* %A
-	%tmp40 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w( x86_mmx %tmp36, x86_mmx %tmp40 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp45, x86_mmx* %A
-	%tmp51 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16>
-        %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16>
-	%tmp52 = mul <4 x i16> %tmp45a, %tmp51a		; <<4 x i16>> [#uses=2]
-        %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx
-	store x86_mmx %tmp52a, x86_mmx* %A
-	%tmp55 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w( x86_mmx %tmp52a, x86_mmx %tmp55 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp60, x86_mmx* %A
-	%tmp64 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd( x86_mmx %tmp60, x86_mmx %tmp64 )		; <x86_mmx> [#uses=1]
-	%tmp70 = bitcast x86_mmx %tmp69 to x86_mmx		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp70, x86_mmx* %A
-	%tmp75 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16>
-        %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16>
-	%tmp76 = and <4 x i16> %tmp70a, %tmp75a		; <<4 x i16>> [#uses=2]
-        %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx
-	store x86_mmx %tmp76a, x86_mmx* %A
-	%tmp81 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16>
-        %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16>
-	%tmp82 = or <4 x i16> %tmp76b, %tmp81a		; <<4 x i16>> [#uses=2]
-        %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx
-	store x86_mmx %tmp82a, x86_mmx* %A
-	%tmp87 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16>
-        %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16>
-	%tmp88 = xor <4 x i16> %tmp82b, %tmp87a		; <<4 x i16>> [#uses=1]
-        %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx
-	store x86_mmx %tmp88a, x86_mmx* %A
-	tail call void @llvm.x86.mmx.emms( )
-	ret void
+  %tmp1 = load x86_mmx* %A
+  %tmp3 = load x86_mmx* %B
+  %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16>
+  %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16>
+  %tmp4 = add <4 x i16> %tmp1a, %tmp3a
+  %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx
+  store x86_mmx %tmp4a, x86_mmx* %A
+  %tmp7 = load x86_mmx* %B
+  %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %tmp4a, x86_mmx %tmp7)
+  store x86_mmx %tmp12, x86_mmx* %A
+  %tmp16 = load x86_mmx* %B
+  %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp12, x86_mmx %tmp16)
+  store x86_mmx %tmp21, x86_mmx* %A
+  %tmp27 = load x86_mmx* %B
+  %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16>
+  %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16>
+  %tmp28 = sub <4 x i16> %tmp21a, %tmp27a
+  %tmp28a = bitcast <4 x i16> %tmp28 to x86_mmx
+  store x86_mmx %tmp28a, x86_mmx* %A
+  %tmp31 = load x86_mmx* %B
+  %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %tmp28a, x86_mmx %tmp31)
+  store x86_mmx %tmp36, x86_mmx* %A
+  %tmp40 = load x86_mmx* %B
+  %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %tmp36, x86_mmx %tmp40)
+  store x86_mmx %tmp45, x86_mmx* %A
+  %tmp51 = load x86_mmx* %B
+  %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16>
+  %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16>
+  %tmp52 = mul <4 x i16> %tmp45a, %tmp51a
+  %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx
+  store x86_mmx %tmp52a, x86_mmx* %A
+  %tmp55 = load x86_mmx* %B
+  %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %tmp52a, x86_mmx %tmp55)
+  store x86_mmx %tmp60, x86_mmx* %A
+  %tmp64 = load x86_mmx* %B
+  %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %tmp60, x86_mmx %tmp64)
+  %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx
+  store x86_mmx %tmp70, x86_mmx* %A
+  %tmp75 = load x86_mmx* %B
+  %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16>
+  %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16>
+  %tmp76 = and <4 x i16> %tmp70a, %tmp75a
+  %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx
+  store x86_mmx %tmp76a, x86_mmx* %A
+  %tmp81 = load x86_mmx* %B
+  %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16>
+  %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16>
+  %tmp82 = or <4 x i16> %tmp76b, %tmp81a
+  %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx
+  store x86_mmx %tmp82a, x86_mmx* %A
+  %tmp87 = load x86_mmx* %B
+  %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16>
+  %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16>
+  %tmp88 = xor <4 x i16> %tmp82b, %tmp87a
+  %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx
+  store x86_mmx %tmp88a, x86_mmx* %A
+  tail call void @llvm.x86.mmx.emms( )
+  ret void
 }
 
-;; The following is modified to use MMX intrinsics everywhere they work.
+; X32-LABEL: test3
+define <1 x i64> @test3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind {
+entry:
+  %tmp2942 = icmp eq i32 %count, 0
+  br i1 %tmp2942, label %bb31, label %bb26
+
+bb26:
+; X32:  addl
+; X32:  adcl
+  %i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ]
+  %sum.035.0 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ]
+  %tmp13 = getelementptr <1 x i64>* %b, i32 %i.037.0
+  %tmp14 = load <1 x i64>* %tmp13
+  %tmp18 = getelementptr <1 x i64>* %a, i32 %i.037.0
+  %tmp19 = load <1 x i64>* %tmp18
+  %tmp21 = add <1 x i64> %tmp19, %tmp14
+  %tmp22 = add <1 x i64> %tmp21, %sum.035.0
+  %tmp25 = add i32 %i.037.0, 1
+  %tmp29 = icmp ult i32 %tmp25, %count
+  br i1 %tmp29, label %bb26, label %bb31
+
+bb31:
+  %sum.035.1 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ]
+  ret <1 x i64> %sum.035.1
+}
 
-define void @fooa(x86_mmx* %A, x86_mmx* %B) {
+; There are no MMX operations here, so we use XMM or i64.
+; X64-LABEL: ti8
+define void @ti8(double %a, double %b) nounwind {
 entry:
-	%tmp1 = load x86_mmx* %A		; <x86_mmx> [#uses=1]
-	%tmp3 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.b( x86_mmx %tmp1, x86_mmx %tmp3 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp4, x86_mmx* %A
-	%tmp7 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b( x86_mmx %tmp4, x86_mmx %tmp7 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp12, x86_mmx* %A
-	%tmp16 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b( x86_mmx %tmp12, x86_mmx %tmp16 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp21, x86_mmx* %A
-	%tmp27 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp28 = tail call x86_mmx @llvm.x86.mmx.psub.b( x86_mmx %tmp21, x86_mmx %tmp27 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp28, x86_mmx* %A
-	%tmp31 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b( x86_mmx %tmp28, x86_mmx %tmp31 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp36, x86_mmx* %A
-	%tmp40 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b( x86_mmx %tmp36, x86_mmx %tmp40 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp45, x86_mmx* %A
-	%tmp51 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp51a = bitcast x86_mmx %tmp51 to i64
-        %tmp51aa = bitcast i64 %tmp51a to <8 x i8>
-        %tmp51b = bitcast x86_mmx %tmp45 to <8 x i8>
-	%tmp52 = mul <8 x i8> %tmp51b, %tmp51aa		; <x86_mmx> [#uses=2]
-        %tmp52a = bitcast <8 x i8> %tmp52 to i64
-        %tmp52aa = bitcast i64 %tmp52a to x86_mmx
-	store x86_mmx %tmp52aa, x86_mmx* %A
-	%tmp57 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp58 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp51, x86_mmx %tmp57 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp58, x86_mmx* %A
-	%tmp63 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp64 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp58, x86_mmx %tmp63 )		; <x86_mmx> [#uses=2]	
-	store x86_mmx %tmp64, x86_mmx* %A
-	%tmp69 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp70 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp64, x86_mmx %tmp69 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp70, x86_mmx* %A
-	tail call void @llvm.x86.mmx.emms( )
-	ret void
+  %tmp1 = bitcast double %a to <8 x i8>
+  %tmp2 = bitcast double %b to <8 x i8>
+  %tmp3 = add <8 x i8> %tmp1, %tmp2
+; X64:  paddb
+  store <8 x i8> %tmp3, <8 x i8>* null
+  ret void
 }
 
-define void @baza(x86_mmx* %A, x86_mmx* %B) {
+; X64-LABEL: ti16
+define void @ti16(double %a, double %b) nounwind {
 entry:
-	%tmp1 = load x86_mmx* %A		; <x86_mmx> [#uses=1]
-	%tmp3 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.d( x86_mmx %tmp1, x86_mmx %tmp3 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp4, x86_mmx* %A
-	%tmp9 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp10 = tail call x86_mmx @llvm.x86.mmx.psub.d( x86_mmx %tmp4, x86_mmx %tmp9 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp10, x86_mmx* %A
-	%tmp15 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-        %tmp10a = bitcast x86_mmx %tmp10 to <2 x i32>
-        %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
-	%tmp16 = mul <2 x i32> %tmp10a, %tmp15a		; <x86_mmx> [#uses=2]
-        %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
-	store x86_mmx %tmp16a, x86_mmx* %A
-	%tmp21 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp22 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp16a, x86_mmx %tmp21 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp22, x86_mmx* %A
-	%tmp27 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp28 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp22, x86_mmx %tmp27 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp28, x86_mmx* %A
-	%tmp33 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp34 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp28, x86_mmx %tmp33 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp34, x86_mmx* %A
-	tail call void @llvm.x86.mmx.emms( )
-	ret void
+  %tmp1 = bitcast double %a to <4 x i16>
+  %tmp2 = bitcast double %b to <4 x i16>
+  %tmp3 = add <4 x i16> %tmp1, %tmp2
+; X64:  paddw
+  store <4 x i16> %tmp3, <4 x i16>* null
+  ret void
 }
 
-define void @bara(x86_mmx* %A, x86_mmx* %B) {
+; X64-LABEL: ti32
+define void @ti32(double %a, double %b) nounwind {
 entry:
-	%tmp1 = load x86_mmx* %A		; <x86_mmx> [#uses=1]
-	%tmp3 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.w( x86_mmx %tmp1, x86_mmx %tmp3 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp4, x86_mmx* %A
-	%tmp7 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w( x86_mmx %tmp4, x86_mmx %tmp7 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp12, x86_mmx* %A
-	%tmp16 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp12, x86_mmx %tmp16 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp21, x86_mmx* %A
-	%tmp27 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp28 = tail call x86_mmx @llvm.x86.mmx.psub.w( x86_mmx %tmp21, x86_mmx %tmp27 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp28, x86_mmx* %A
-	%tmp31 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w( x86_mmx %tmp28, x86_mmx %tmp31 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp36, x86_mmx* %A
-	%tmp40 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w( x86_mmx %tmp36, x86_mmx %tmp40 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp45, x86_mmx* %A
-	%tmp51 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp52 = tail call x86_mmx @llvm.x86.mmx.pmull.w( x86_mmx %tmp45, x86_mmx %tmp51 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp52, x86_mmx* %A
-	%tmp55 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w( x86_mmx %tmp52, x86_mmx %tmp55 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp60, x86_mmx* %A
-	%tmp64 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd( x86_mmx %tmp60, x86_mmx %tmp64 )		; <x86_mmx> [#uses=1]
-	%tmp70 = bitcast x86_mmx %tmp69 to x86_mmx		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp70, x86_mmx* %A
-	%tmp75 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp76 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp70, x86_mmx %tmp75 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp76, x86_mmx* %A
-	%tmp81 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp82 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp76, x86_mmx %tmp81 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp82, x86_mmx* %A
-	%tmp87 = load x86_mmx* %B		; <x86_mmx> [#uses=1]
-	%tmp88 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp82, x86_mmx %tmp87 )		; <x86_mmx> [#uses=2]
-	store x86_mmx %tmp88, x86_mmx* %A
-	tail call void @llvm.x86.mmx.emms( )
-	ret void
+  %tmp1 = bitcast double %a to <2 x i32>
+  %tmp2 = bitcast double %b to <2 x i32>
+  %tmp3 = add <2 x i32> %tmp1, %tmp2
+; X64:  paddd
+  store <2 x i32> %tmp3, <2 x i32>* null
+  ret void
 }
 
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
+; X64-LABEL: ti64
+define void @ti64(double %a, double %b) nounwind {
+entry:
+  %tmp1 = bitcast double %a to <1 x i64>
+  %tmp2 = bitcast double %b to <1 x i64>
+  %tmp3 = add <1 x i64> %tmp1, %tmp2
+; X64:  addq
+  store <1 x i64> %tmp3, <1 x i64>* null
+  ret void
+}
 
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx)
+; MMX intrinsics calls get us MMX instructions.
+; X64-LABEL: ti8a
+define void @ti8a(double %a, double %b) nounwind {
+entry:
+  %tmp1 = bitcast double %a to x86_mmx
+; X64: movdq2q
+  %tmp2 = bitcast double %b to x86_mmx
+; X64: movdq2q
+  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2)
+  store x86_mmx %tmp3, x86_mmx* null
+  ret void
+}
 
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
+; X64-LABEL: ti16a
+define void @ti16a(double %a, double %b) nounwind {
+entry:
+  %tmp1 = bitcast double %a to x86_mmx
+; X64: movdq2q
+  %tmp2 = bitcast double %b to x86_mmx
+; X64: movdq2q
+  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2)
+  store x86_mmx %tmp3, x86_mmx* null
+  ret void
+}
 
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx)
+; X64-LABEL: ti32a
+define void @ti32a(double %a, double %b) nounwind {
+entry:
+  %tmp1 = bitcast double %a to x86_mmx
+; X64: movdq2q
+  %tmp2 = bitcast double %b to x86_mmx
+; X64: movdq2q
+  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2)
+  store x86_mmx %tmp3, x86_mmx* null
+  ret void
+}
 
-declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx)
+; X64-LABEL: ti64a
+define void @ti64a(double %a, double %b) nounwind {
+entry:
+  %tmp1 = bitcast double %a to x86_mmx
+; X64: movdq2q
+  %tmp2 = bitcast double %b to x86_mmx
+; X64: movdq2q
+  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2)
+  store x86_mmx %tmp3, x86_mmx* null
+  ret void
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
 
+declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx)
 declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx)
 
 declare void @llvm.x86.mmx.emms()
 
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
 declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
 declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padds.d(x86_mmx, x86_mmx)
 declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx)
 declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psubs.d(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx)
 
diff --git a/test/CodeGen/X86/mmx-bitcast-to-i64.ll b/test/CodeGen/X86/mmx-bitcast-to-i64.ll
deleted file mode 100644
index 8b1840a..0000000
--- a/test/CodeGen/X86/mmx-bitcast-to-i64.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc < %s -march=x86-64 | grep movd | count 4
-
-define i64 @foo(x86_mmx* %p) {
-  %t = load x86_mmx* %p
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %t)
-  %s = bitcast x86_mmx %u to i64
-  ret i64 %s
-}
-define i64 @goo(x86_mmx* %p) {
-  %t = load x86_mmx* %p
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %t)
-  %s = bitcast x86_mmx %u to i64
-  ret i64 %s
-}
-define i64 @hoo(x86_mmx* %p) {
-  %t = load x86_mmx* %p
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %t)
-  %s = bitcast x86_mmx %u to i64
-  ret i64 %s
-}
-define i64 @ioo(x86_mmx* %p) {
-  %t = load x86_mmx* %p
-  %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %t)
-  %s = bitcast x86_mmx %u to i64
-  ret i64 %s
-}
-
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
diff --git a/test/CodeGen/X86/mmx-bitcast.ll b/test/CodeGen/X86/mmx-bitcast.ll
new file mode 100644
index 0000000..a2eb96a
--- /dev/null
+++ b/test/CodeGen/X86/mmx-bitcast.ll
@@ -0,0 +1,109 @@
+; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse2 | FileCheck %s
+
+define i64 @t0(x86_mmx* %p) {
+; CHECK-LABEL: t0:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movq
+; CHECK-NEXT:    paddq %mm0, %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+  %t = load x86_mmx* %p
+  %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %t)
+  %s = bitcast x86_mmx %u to i64
+  ret i64 %s
+}
+
+define i64 @t1(x86_mmx* %p) {
+; CHECK-LABEL: t1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movq
+; CHECK-NEXT:    paddd %mm0, %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+  %t = load x86_mmx* %p
+  %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %t)
+  %s = bitcast x86_mmx %u to i64
+  ret i64 %s
+}
+
+define i64 @t2(x86_mmx* %p) {
+; CHECK-LABEL: t2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movq
+; CHECK-NEXT:    paddw %mm0, %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+  %t = load x86_mmx* %p
+  %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %t)
+  %s = bitcast x86_mmx %u to i64
+  ret i64 %s
+}
+
+define i64 @t3(x86_mmx* %p) {
+; CHECK-LABEL: t3:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movq
+; CHECK-NEXT:    paddb %mm0, %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+  %t = load x86_mmx* %p
+  %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %t)
+  %s = bitcast x86_mmx %u to i64
+  ret i64 %s
+}
+
+@R = external global x86_mmx
+
+define void @t4(<1 x i64> %A, <1 x i64> %B) {
+; CHECK-LABEL: t4:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    movd
+; CHECK-NEXT:    movd
+; CHECK:    retq
+entry:
+  %tmp2 = bitcast <1 x i64> %A to x86_mmx
+  %tmp3 = bitcast <1 x i64> %B to x86_mmx
+  %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp2, x86_mmx %tmp3)
+  store x86_mmx %tmp7, x86_mmx* @R
+  tail call void @llvm.x86.mmx.emms()
+  ret void
+}
+
+define i64 @t5(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: t5:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movd
+; CHECK-NEXT:    movd
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; CHECK-NEXT:    movd %xmm0, %rax
+; CHECK-NEXT:    retq
+  %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
+  %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
+  %conv = bitcast <2 x i32> %v1 to i64
+  ret i64 %conv
+}
+
+declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
+
+define <1 x i64> @t6(i64 %t) {
+; CHECK-LABEL: t6:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movd
+; CHECK-NEXT:    psllq $48, %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+  %t1 = insertelement <1 x i64> undef, i64 %t, i32 0
+  %t0 = bitcast <1 x i64> %t1 to x86_mmx
+  %t2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %t0, i32 48)
+  %t3 = bitcast x86_mmx %t2 to <1 x i64>
+  ret <1 x i64> %t3
+}
+
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
+declare void @llvm.x86.mmx.emms()
+
diff --git a/test/CodeGen/X86/mmx-builtins.ll b/test/CodeGen/X86/mmx-builtins.ll
deleted file mode 100644
index aabdd53..0000000
--- a/test/CodeGen/X86/mmx-builtins.ll
+++ /dev/null
@@ -1,1349 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+ssse3,-avx | FileCheck %s
-; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+ssse3,-avx | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
-
-declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: phaddw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
-  %6 = bitcast <4 x i16> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pcmpgtd
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pcmpgtw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pcmpgtb
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pcmpeqd
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pcmpeqw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pcmpeqb
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: punpckldq
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test81(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: punpcklwd
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: punpcklbw
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: punpckhdq
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: punpckhwd
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: punpckhbw
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: packuswb
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: packssdw
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: packsswb
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone
-
-define i64 @test73(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: psrad
-entry:
-  %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <2 x i32>
-  %3 = bitcast <2 x i32> %2 to <1 x i64>
-  %4 = extractelement <1 x i64> %3, i32 0
-  ret i64 %4
-}
-
-declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone
-
-define i64 @test72(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: psraw
-entry:
-  %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
-  %3 = bitcast <4 x i16> %2 to <1 x i64>
-  %4 = extractelement <1 x i64> %3, i32 0
-  ret i64 %4
-}
-
-declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
-
-define i64 @test71(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: psrlq
-entry:
-  %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to i64
-  ret i64 %2
-}
-
-declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone
-
-define i64 @test70(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: psrld
-entry:
-  %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <2 x i32>
-  %3 = bitcast <2 x i32> %2 to <1 x i64>
-  %4 = extractelement <1 x i64> %3, i32 0
-  ret i64 %4
-}
-
-declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone
-
-define i64 @test69(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: psrlw
-entry:
-  %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
-  %3 = bitcast <4 x i16> %2 to <1 x i64>
-  %4 = extractelement <1 x i64> %3, i32 0
-  ret i64 %4
-}
-
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone
-
-define i64 @test68(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: psllq
-entry:
-  %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to i64
-  ret i64 %2
-}
-
-declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone
-
-define i64 @test67(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: pslld
-entry:
-  %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <2 x i32>
-  %3 = bitcast <2 x i32> %2 to <1 x i64>
-  %4 = extractelement <1 x i64> %3, i32 0
-  ret i64 %4
-}
-
-declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone
-
-define i64 @test66(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: psllw
-entry:
-  %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 3) nounwind
-  %2 = bitcast x86_mmx %1 to <4 x i16>
-  %3 = bitcast <4 x i16> %2 to <1 x i64>
-  %4 = extractelement <1 x i64> %3, i32 0
-  ret i64 %4
-}
-
-declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test65(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psrad
-entry:
-  %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test64(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psraw
-entry:
-  %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test63(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psrlq
-entry:
-  %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
-  %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
-  ret i64 %3
-}
-
-declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test62(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psrld
-entry:
-  %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test61(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psrlw
-entry:
-  %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test60(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psllq
-entry:
-  %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
-  %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
-  ret i64 %3
-}
-
-declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test59(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pslld
-entry:
-  %0 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
-  %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test58(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psllw
-entry:
-  %0 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
-  %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1.i = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pxor
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: por
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pandn
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pand
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test52(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pmullw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-define i64 @test51(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pmullw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pmulhw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pmaddwd
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psubusw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psubusb
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psubsw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psubsb
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-define i64 @test44(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psubq
-entry:
-  %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var = bitcast i64 %0 to x86_mmx
-  %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1 = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
-  %3 = bitcast x86_mmx %2 to i64
-  ret i64 %3
-}
-
-declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
-
-declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test43(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psubd
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psubw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psubb
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: paddusw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: paddusb
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: paddsw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: paddsb
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test36(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: paddq
-entry:
-  %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var = bitcast i64 %0 to x86_mmx
-  %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1 = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
-  %3 = bitcast x86_mmx %2 to i64
-  ret i64 %3
-}
-
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: paddd
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: paddw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: paddb
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psadbw
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
-  ret i64 %3
-}
-
-declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pminsw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pminub
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pmaxsw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pmaxub
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pavgw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pavgb
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare void @llvm.x86.mmx.movnt.dq(x86_mmx*, x86_mmx) nounwind
-
-define void @test25(<1 x i64>* %p, <1 x i64> %a) nounwind optsize ssp {
-; CHECK: movntq
-entry:
-  %mmx_ptr_var.i = bitcast <1 x i64>* %p to x86_mmx*
-  %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var.i = bitcast i64 %0 to x86_mmx
-  tail call void @llvm.x86.mmx.movnt.dq(x86_mmx* %mmx_ptr_var.i, x86_mmx %mmx_var.i) nounwind
-  ret void
-}
-
-declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone
-
-define i32 @test24(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: pmovmskb
-entry:
-  %0 = bitcast <1 x i64> %a to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %0 to x86_mmx
-  %1 = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %mmx_var.i) nounwind
-  ret i32 %1
-}
-
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*) nounwind
-
-define void @test23(<1 x i64> %d, <1 x i64> %n, i8* %p) nounwind optsize ssp {
-; CHECK: maskmovq
-entry:
-  %0 = bitcast <1 x i64> %n to <8 x i8>
-  %1 = bitcast <1 x i64> %d to <8 x i8>
-  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
-  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
-  tail call void @llvm.x86.mmx.maskmovq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i, i8* %p) nounwind
-  ret void
-}
-
-declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pmulhuw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
-  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
-
-define i64 @test21(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: pshufw
-entry:
-  %0 = bitcast <1 x i64> %a to <4 x i16>
-  %1 = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-define i32 @test21_2(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: test21_2
-; CHECK: pshufw
-; CHECK: movd
-entry:
-  %0 = bitcast <1 x i64> %a to <4 x i16>
-  %1 = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <2 x i32>
-  %5 = extractelement <2 x i32> %4, i32 0
-  ret i32 %5
-}
-
-declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pmuludq
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
-  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
-  %3 = bitcast x86_mmx %2 to i64
-  ret i64 %3
-}
-
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
-
-define <2 x double> @test19(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: cvtpi2pd
-entry:
-  %0 = bitcast <1 x i64> %a to <2 x i32>
-  %1 = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %1) nounwind readnone
-  ret <2 x double> %2
-}
-
-declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
-
-define i64 @test18(<2 x double> %a) nounwind readnone optsize ssp {
-; CHECK: cvttpd2pi
-entry:
-  %0 = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone
-  %1 = bitcast x86_mmx %0 to <2 x i32>
-  %2 = bitcast <2 x i32> %1 to <1 x i64>
-  %3 = extractelement <1 x i64> %2, i32 0
-  ret i64 %3
-}
-
-declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
-
-define i64 @test17(<2 x double> %a) nounwind readnone optsize ssp {
-; CHECK: cvtpd2pi
-entry:
-  %0 = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone
-  %1 = bitcast x86_mmx %0 to <2 x i32>
-  %2 = bitcast <2 x i32> %1 to <1 x i64>
-  %3 = extractelement <1 x i64> %2, i32 0
-  ret i64 %3
-}
-
-declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
-
-define i64 @test16(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: palignr
-entry:
-  %0 = extractelement <1 x i64> %a, i32 0
-  %mmx_var = bitcast i64 %0 to x86_mmx
-  %1 = extractelement <1 x i64> %b, i32 0
-  %mmx_var1 = bitcast i64 %1 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %mmx_var, x86_mmx %mmx_var1, i8 16)
-  %3 = bitcast x86_mmx %2 to i64
-  ret i64 %3
-}
-
-declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
-
-define i64 @test15(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: pabsd
-entry:
-  %0 = bitcast <1 x i64> %a to <2 x i32>
-  %1 = bitcast <2 x i32> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  %4 = bitcast <2 x i32> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
-
-define i64 @test14(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: pabsw
-entry:
-  %0 = bitcast <1 x i64> %a to <4 x i16>
-  %1 = bitcast <4 x i16> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <4 x i16>
-  %4 = bitcast <4 x i16> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
-
-define i64 @test13(<1 x i64> %a) nounwind readnone optsize ssp {
-; CHECK: pabsb
-entry:
-  %0 = bitcast <1 x i64> %a to <8 x i8>
-  %1 = bitcast <8 x i8> %0 to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1) nounwind readnone
-  %3 = bitcast x86_mmx %2 to <8 x i8>
-  %4 = bitcast <8 x i8> %3 to <1 x i64>
-  %5 = extractelement <1 x i64> %4, i32 0
-  ret i64 %5
-}
-
-declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psignd
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %2 = bitcast <2 x i32> %1 to x86_mmx
-  %3 = bitcast <2 x i32> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <2 x i32>
-  %6 = bitcast <2 x i32> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psignw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
-  %6 = bitcast <4 x i16> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: psignb
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %2 = bitcast <8 x i8> %1 to x86_mmx
-  %3 = bitcast <8 x i8> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <8 x i8>
-  %6 = bitcast <8 x i8> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pshufb
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %2 = bitcast <8 x i8> %1 to x86_mmx
-  %3 = bitcast <8 x i8> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <8 x i8>
-  %6 = bitcast <8 x i8> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pmulhrsw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
-  %6 = bitcast <4 x i16> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: pmaddubsw
-entry:
-  %0 = bitcast <1 x i64> %b to <8 x i8>
-  %1 = bitcast <1 x i64> %a to <8 x i8>
-  %2 = bitcast <8 x i8> %1 to x86_mmx
-  %3 = bitcast <8 x i8> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <8 x i8>
-  %6 = bitcast <8 x i8> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: phsubsw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
-  %6 = bitcast <4 x i16> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: phsubd
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %2 = bitcast <2 x i32> %1 to x86_mmx
-  %3 = bitcast <2 x i32> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <2 x i32>
-  %6 = bitcast <2 x i32> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: phsubw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
-  %6 = bitcast <4 x i16> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: phaddsw
-entry:
-  %0 = bitcast <1 x i64> %b to <4 x i16>
-  %1 = bitcast <1 x i64> %a to <4 x i16>
-  %2 = bitcast <4 x i16> %1 to x86_mmx
-  %3 = bitcast <4 x i16> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <4 x i16>
-  %6 = bitcast <4 x i16> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
-
-define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
-; CHECK: phaddd
-entry:
-  %0 = bitcast <1 x i64> %b to <2 x i32>
-  %1 = bitcast <1 x i64> %a to <2 x i32>
-  %2 = bitcast <2 x i32> %1 to x86_mmx
-  %3 = bitcast <2 x i32> %0 to x86_mmx
-  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %2, x86_mmx %3) nounwind readnone
-  %5 = bitcast x86_mmx %4 to <2 x i32>
-  %6 = bitcast <2 x i32> %5 to <1 x i64>
-  %7 = extractelement <1 x i64> %6, i32 0
-  ret i64 %7
-}
-
-define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind {
-; CHECK: cvtpi2ps
-  %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b)
-  ret <4 x float> %c
-}
-
-declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
diff --git a/test/CodeGen/X86/mmx-emms.ll b/test/CodeGen/X86/mmx-emms.ll
deleted file mode 100644
index 5ff2588..0000000
--- a/test/CodeGen/X86/mmx-emms.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx | grep emms
-define void @foo() {
-entry:
-	call void @llvm.x86.mmx.emms( )
-	br label %return
-
-return:		; preds = %entry
-	ret void
-}
-
-declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/mmx-fold-load.ll b/test/CodeGen/X86/mmx-fold-load.ll
new file mode 100644
index 0000000..d49edac
--- /dev/null
+++ b/test/CodeGen/X86/mmx-fold-load.ll
@@ -0,0 +1,282 @@
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s
+
+define i64 @t0(<1 x i64>* %a, i32* %b) {
+; CHECK-LABEL: t0:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:         movq (%[[REG1:[a-z]+]]), %mm0
+; CHECK-NEXT:    psllq (%[[REG2:[a-z]+]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast <1 x i64>* %a to x86_mmx*
+  %1 = load x86_mmx* %0, align 8
+  %2 = load i32* %b, align 4
+  %3 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %1, i32 %2)
+  %4 = bitcast x86_mmx %3 to i64
+  ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
+
+define i64 @t1(<1 x i64>* %a, i32* %b) {
+; CHECK-LABEL: t1:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:         movq (%[[REG1]]), %mm0
+; CHECK-NEXT:    psrlq (%[[REG2]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast <1 x i64>* %a to x86_mmx*
+  %1 = load x86_mmx* %0, align 8
+  %2 = load i32* %b, align 4
+  %3 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %1, i32 %2)
+  %4 = bitcast x86_mmx %3 to i64
+  ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32)
+
+define i64 @t2(<1 x i64>* %a, i32* %b) {
+; CHECK-LABEL: t2:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:         movq (%[[REG1]]), %mm0
+; CHECK-NEXT:    psllw (%[[REG2]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast <1 x i64>* %a to x86_mmx*
+  %1 = load x86_mmx* %0, align 8
+  %2 = load i32* %b, align 4
+  %3 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %1, i32 %2)
+  %4 = bitcast x86_mmx %3 to i64
+  ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32)
+
+define i64 @t3(<1 x i64>* %a, i32* %b) {
+; CHECK-LABEL: t3:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:         movq (%[[REG1]]), %mm0
+; CHECK-NEXT:    psrlw (%[[REG2]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast <1 x i64>* %a to x86_mmx*
+  %1 = load x86_mmx* %0, align 8
+  %2 = load i32* %b, align 4
+  %3 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %1, i32 %2)
+  %4 = bitcast x86_mmx %3 to i64
+  ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32)
+
+define i64 @t4(<1 x i64>* %a, i32* %b) {
+; CHECK-LABEL: t4:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:         movq (%[[REG1]]), %mm0
+; CHECK-NEXT:    pslld (%[[REG2]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast <1 x i64>* %a to x86_mmx*
+  %1 = load x86_mmx* %0, align 8
+  %2 = load i32* %b, align 4
+  %3 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %1, i32 %2)
+  %4 = bitcast x86_mmx %3 to i64
+  ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32)
+
+define i64 @t5(<1 x i64>* %a, i32* %b) {
+; CHECK-LABEL: t5:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:         movq (%[[REG1]]), %mm0
+; CHECK-NEXT:    psrld (%[[REG2]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast <1 x i64>* %a to x86_mmx*
+  %1 = load x86_mmx* %0, align 8
+  %2 = load i32* %b, align 4
+  %3 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %1, i32 %2)
+  %4 = bitcast x86_mmx %3 to i64
+  ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32)
+
+define i64 @t6(<1 x i64>* %a, i32* %b) {
+; CHECK-LABEL: t6:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:         movq (%[[REG1]]), %mm0
+; CHECK-NEXT:    psraw (%[[REG2]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast <1 x i64>* %a to x86_mmx*
+  %1 = load x86_mmx* %0, align 8
+  %2 = load i32* %b, align 4
+  %3 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %1, i32 %2)
+  %4 = bitcast x86_mmx %3 to i64
+  ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32)
+
+define i64 @t7(<1 x i64>* %a, i32* %b) {
+; CHECK-LABEL: t7:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:         movq (%[[REG1]]), %mm0
+; CHECK-NEXT:    psrad (%[[REG2]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast <1 x i64>* %a to x86_mmx*
+  %1 = load x86_mmx* %0, align 8
+  %2 = load i32* %b, align 4
+  %3 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %1, i32 %2)
+  %4 = bitcast x86_mmx %3 to i64
+  ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32)
+
+define i64 @tt0(x86_mmx %t, x86_mmx* %q) {
+; CHECK-LABEL: tt0:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    paddb (%[[REG3:[a-z]+]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    emms
+; CHECK-NEXT:    retq
+entry:
+  %v = load x86_mmx* %q
+  %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %v)
+  %s = bitcast x86_mmx %u to i64
+  call void @llvm.x86.mmx.emms()
+  ret i64 %s
+}
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
+declare void @llvm.x86.mmx.emms()
+
+define i64 @tt1(x86_mmx %t, x86_mmx* %q) {
+; CHECK-LABEL: tt1:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    paddw (%[[REG3]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    emms
+; CHECK-NEXT:    retq
+entry:
+  %v = load x86_mmx* %q
+  %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %v)
+  %s = bitcast x86_mmx %u to i64
+  call void @llvm.x86.mmx.emms()
+  ret i64 %s
+}
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
+
+define i64 @tt2(x86_mmx %t, x86_mmx* %q) {
+; CHECK-LABEL: tt2:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    paddd (%[[REG3]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    emms
+; CHECK-NEXT:    retq
+entry:
+  %v = load x86_mmx* %q
+  %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %v)
+  %s = bitcast x86_mmx %u to i64
+  call void @llvm.x86.mmx.emms()
+  ret i64 %s
+}
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+
+define i64 @tt3(x86_mmx %t, x86_mmx* %q) {
+; CHECK-LABEL: tt3:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    paddq (%[[REG3]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    emms
+; CHECK-NEXT:    retq
+entry:
+  %v = load x86_mmx* %q
+  %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %v)
+  %s = bitcast x86_mmx %u to i64
+  call void @llvm.x86.mmx.emms()
+  ret i64 %s
+}
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
+
+define i64 @tt4(x86_mmx %t, x86_mmx* %q) {
+; CHECK-LABEL: tt4:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    paddusb (%[[REG3]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    emms
+; CHECK-NEXT:    retq
+entry:
+  %v = load x86_mmx* %q
+  %u = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %t, x86_mmx %v)
+  %s = bitcast x86_mmx %u to i64
+  call void @llvm.x86.mmx.emms()
+  ret i64 %s
+}
+declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
+
+define i64 @tt5(x86_mmx %t, x86_mmx* %q) {
+; CHECK-LABEL: tt5:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    paddusw (%[[REG3]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    emms
+; CHECK-NEXT:    retq
+entry:
+  %v = load x86_mmx* %q
+  %u = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %t, x86_mmx %v)
+  %s = bitcast x86_mmx %u to i64
+  call void @llvm.x86.mmx.emms()
+  ret i64 %s
+}
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
+
+define i64 @tt6(x86_mmx %t, x86_mmx* %q) {
+; CHECK-LABEL: tt6:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    psrlw (%[[REG3]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    emms
+; CHECK-NEXT:    retq
+entry:
+  %v = load x86_mmx* %q
+  %u = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %t, x86_mmx %v)
+  %s = bitcast x86_mmx %u to i64
+  call void @llvm.x86.mmx.emms()
+  ret i64 %s
+}
+declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx)
+
+define i64 @tt7(x86_mmx %t, x86_mmx* %q) {
+; CHECK-LABEL: tt7:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    psrld (%[[REG3]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    emms
+; CHECK-NEXT:    retq
+entry:
+  %v = load x86_mmx* %q
+  %u = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %t, x86_mmx %v)
+  %s = bitcast x86_mmx %u to i64
+  call void @llvm.x86.mmx.emms()
+  ret i64 %s
+}
+declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx)
+
+define i64 @tt8(x86_mmx %t, x86_mmx* %q) {
+; CHECK-LABEL: tt8:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    psrlq (%[[REG3]]), %mm0
+; CHECK-NEXT:    movd %mm0, %rax
+; CHECK-NEXT:    emms
+; CHECK-NEXT:    retq
+entry:
+  %v = load x86_mmx* %q
+  %u = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %t, x86_mmx %v)
+  %s = bitcast x86_mmx %u to i64
+  call void @llvm.x86.mmx.emms()
+  ret i64 %s
+}
+declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx)
diff --git a/test/CodeGen/X86/mmx-insert-element.ll b/test/CodeGen/X86/mmx-insert-element.ll
deleted file mode 100644
index 348dac8..0000000
--- a/test/CodeGen/X86/mmx-insert-element.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | grep movq
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | grep pshufd
-; This is not an MMX operation; promoted to XMM.
-
-define x86_mmx @qux(i32 %A) nounwind {
-	%tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1		; <<2 x i32>> [#uses=1]
-        %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
-	ret x86_mmx %tmp4
-}
diff --git a/test/CodeGen/X86/mmx-intrinsics.ll b/test/CodeGen/X86/mmx-intrinsics.ll
new file mode 100644
index 0000000..39d481b
--- /dev/null
+++ b/test/CodeGen/X86/mmx-intrinsics.ll
@@ -0,0 +1,1358 @@
+; RUN: llc < %s -march=x86 -mattr=+mmx,+ssse3,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+ssse3,-avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
+
+declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: phaddw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pcmpgtd
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pcmpgtw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pcmpgtb
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pcmpeqd
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pcmpeqw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pcmpeqb
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: punpckldq
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test81(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: punpcklwd
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: punpcklbw
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: punpckhdq
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: punpckhwd
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: punpckhbw
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: packuswb
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: packssdw
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: packsswb
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone
+
+define i64 @test73(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: psrad
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone
+
+define i64 @test72(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: psraw
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %3 = bitcast <4 x i16> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
+
+define i64 @test71(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: psrlq
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to i64
+  ret i64 %2
+}
+
+declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone
+
+define i64 @test70(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: psrld
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone
+
+define i64 @test69(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: psrlw
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %3 = bitcast <4 x i16> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone
+
+define i64 @test68(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: psllq
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to i64
+  ret i64 %2
+}
+
+declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone
+
+define i64 @test67(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: pslld
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone
+
+define i64 @test66(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: psllw
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %3 = bitcast <4 x i16> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test65(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psrad
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test64(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psraw
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test63(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psrlq
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test62(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psrld
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test61(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psrlw
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test60(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psllq
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test59(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pslld
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test58(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psllw
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pxor
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: por
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pandn
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pand
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test52(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pmullw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+define i64 @test51(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pmullw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pmulhw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pmaddwd
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psubusw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psubusb
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psubsw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psubsb
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+define i64 @test44(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psubq
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var = bitcast i64 %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1 = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
+
+declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test43(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psubd
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psubw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psubb
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: paddusw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: paddusb
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: paddsw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: paddsb
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test36(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: paddq
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var = bitcast i64 %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1 = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: paddd
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: paddw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: paddb
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psadbw
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pminsw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pminub
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pmaxsw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pmaxub
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pavgw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pavgb
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare void @llvm.x86.mmx.movnt.dq(x86_mmx*, x86_mmx) nounwind
+
+define void @test25(<1 x i64>* %p, <1 x i64> %a) nounwind optsize ssp {
+; CHECK: movntq
+entry:
+  %mmx_ptr_var.i = bitcast <1 x i64>* %p to x86_mmx*
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  tail call void @llvm.x86.mmx.movnt.dq(x86_mmx* %mmx_ptr_var.i, x86_mmx %mmx_var.i) nounwind
+  ret void
+}
+
+declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone
+
+define i32 @test24(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: pmovmskb
+entry:
+  %0 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %0 to x86_mmx
+  %1 = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %mmx_var.i) nounwind
+  ret i32 %1
+}
+
+declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*) nounwind
+
+define void @test23(<1 x i64> %d, <1 x i64> %n, i8* %p) nounwind optsize ssp {
+; CHECK: maskmovq
+entry:
+  %0 = bitcast <1 x i64> %n to <8 x i8>
+  %1 = bitcast <1 x i64> %d to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  tail call void @llvm.x86.mmx.maskmovq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i, i8* %p) nounwind
+  ret void
+}
+
+declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pmulhuw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
+
+define i64 @test21(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: pshufw
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+define i32 @test21_2(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: test21_2
+; CHECK: pshufw
+; CHECK: movd
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <2 x i32>
+  %5 = extractelement <2 x i32> %4, i32 0
+  ret i32 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pmuludq
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
+
+define <2 x double> @test19(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: cvtpi2pd
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %1) nounwind readnone
+  ret <2 x double> %2
+}
+
+declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
+
+define i64 @test18(<2 x double> %a) nounwind readnone optsize ssp {
+; CHECK: cvttpd2pi
+entry:
+  %0 = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone
+  %1 = bitcast x86_mmx %0 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to <1 x i64>
+  %3 = extractelement <1 x i64> %2, i32 0
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
+
+define i64 @test17(<2 x double> %a) nounwind readnone optsize ssp {
+; CHECK: cvtpd2pi
+entry:
+  %0 = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone
+  %1 = bitcast x86_mmx %0 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to <1 x i64>
+  %3 = extractelement <1 x i64> %2, i32 0
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
+
+define i64 @test16(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: palignr
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var = bitcast i64 %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1 = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %mmx_var, x86_mmx %mmx_var1, i8 16)
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
+
+define i64 @test15(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: pabsd
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
+
+define i64 @test14(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: pabsw
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
+
+define i64 @test13(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: pabsb
+entry:
+  %0 = bitcast <1 x i64> %a to <8 x i8>
+  %1 = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psignd
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to x86_mmx
+  %3 = bitcast <2 x i32> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %6 = bitcast <2 x i32> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psignw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: psignb
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to x86_mmx
+  %3 = bitcast <8 x i8> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %6 = bitcast <8 x i8> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pshufb
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to x86_mmx
+  %3 = bitcast <8 x i8> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %6 = bitcast <8 x i8> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pmulhrsw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: pmaddubsw
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to x86_mmx
+  %3 = bitcast <8 x i8> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %6 = bitcast <8 x i8> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: phsubsw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: phsubd
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to x86_mmx
+  %3 = bitcast <2 x i32> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %6 = bitcast <2 x i32> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: phsubw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: phaddsw
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
+; CHECK: phaddd
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to x86_mmx
+  %3 = bitcast <2 x i32> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %6 = bitcast <2 x i32> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind {
+; CHECK: cvtpi2ps
+  %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b)
+  ret <4 x float> %c
+}
+
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
+
+; CHECK-LABEL: test90
+define void @test90() {
+; CHECK: emms
+  call void @llvm.x86.mmx.emms()
+  ret void
+}
+
+declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/mmx-pinsrw.ll b/test/CodeGen/X86/mmx-pinsrw.ll
deleted file mode 100644
index 33dd2eb..0000000
--- a/test/CodeGen/X86/mmx-pinsrw.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc < %s  -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
-; PR2562
-
-; CHECK: pinsr
-
-external global i16		; <i16*>:0 [#uses=1]
-external global <4 x i16>		; <<4 x i16>*>:1 [#uses=2]
-
-declare void @abort()
-
-define void @""() {
-	load i16* @0		; <i16>:1 [#uses=1]
-	load <4 x i16>* @1		; <<4 x i16>>:2 [#uses=1]
-	insertelement <4 x i16> %2, i16 %1, i32 0		; <<4 x i16>>:3 [#uses=1]
-	store <4 x i16> %3, <4 x i16>* @1
-	ret void
-}
diff --git a/test/CodeGen/X86/mmx-punpckhdq.ll b/test/CodeGen/X86/mmx-punpckhdq.ll
deleted file mode 100644
index 9e8f5bf..0000000
--- a/test/CodeGen/X86/mmx-punpckhdq.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse4.2 -mtriple=x86_64-apple-darwin10 | FileCheck %s
-; There are no MMX operations in bork; promoted to XMM.
-
-define void @bork(<1 x i64>* %x) {
-; CHECK: bork
-; CHECK: movlpd
-entry:
-	%tmp2 = load <1 x i64>* %x		; <<1 x i64>> [#uses=1]
-	%tmp6 = bitcast <1 x i64> %tmp2 to <2 x i32>		; <<2 x i32>> [#uses=1]
-	%tmp9 = shufflevector <2 x i32> %tmp6, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >		; <<2 x i32>> [#uses=1]
-	%tmp10 = bitcast <2 x i32> %tmp9 to <1 x i64>		; <<1 x i64>> [#uses=1]
-	store <1 x i64> %tmp10, <1 x i64>* %x
-	tail call void @llvm.x86.mmx.emms( )
-	ret void
-}
-
-; pork uses MMX.
-
-define void @pork(x86_mmx* %x) {
-; CHECK: pork
-; CHECK: punpckhdq
-entry:
-	%tmp2 = load x86_mmx* %x		; <x86_mmx> [#uses=1]
-        %tmp9 = tail call x86_mmx @llvm.x86.mmx.punpckhdq (x86_mmx %tmp2, x86_mmx %tmp2)
-	store x86_mmx %tmp9, x86_mmx* %x
-	tail call void @llvm.x86.mmx.emms( )
-	ret void
-}
-
-declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx)
-declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/mmx-s2v.ll b/test/CodeGen/X86/mmx-s2v.ll
deleted file mode 100644
index c98023c..0000000
--- a/test/CodeGen/X86/mmx-s2v.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx
-; PR2574
-
-define void @entry(i32 %m_task_id, i32 %start_x, i32 %end_x) {; <label>:0
-        br i1 true, label %bb.nph, label %._crit_edge
-
-bb.nph:         ; preds = %bb.nph, %0
-        %t2206f2.0 = phi <2 x float> [ %2, %bb.nph ], [ undef, %0 ]             ; <<2 x float>> [#uses=1]
-        insertelement <2 x float> %t2206f2.0, float 0.000000e+00, i32 0         ; <<2 x float>>:1 [#uses=1]
-        insertelement <2 x float> %1, float 0.000000e+00, i32 1         ; <<2 x float>>:2 [#uses=1]
-        br label %bb.nph
-
-._crit_edge:            ; preds = %0
-        ret void
-}
diff --git a/test/CodeGen/X86/mmx-shift.ll b/test/CodeGen/X86/mmx-shift.ll
deleted file mode 100644
index c7c6e75..0000000
--- a/test/CodeGen/X86/mmx-shift.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=+mmx | FileCheck %s
-
-define i64 @t1(<1 x i64> %mm1) nounwind  {
-entry:
-        %tmp = bitcast <1 x i64> %mm1 to x86_mmx
-	%tmp6 = tail call x86_mmx @llvm.x86.mmx.pslli.q( x86_mmx %tmp, i32 32 )		; <x86_mmx> [#uses=1]
-        %retval1112 = bitcast x86_mmx %tmp6 to i64
-	ret i64 %retval1112
-
-; CHECK-LABEL: t1:
-; CHECK: psllq $32
-}
-
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone 
-
-define i64 @t2(x86_mmx %mm1, x86_mmx %mm2) nounwind  {
-entry:
-	%tmp7 = tail call x86_mmx @llvm.x86.mmx.psra.d( x86_mmx %mm1, x86_mmx %mm2 ) nounwind readnone 		; <x86_mmx> [#uses=1]
-        %retval1112 = bitcast x86_mmx %tmp7 to i64
-	ret i64 %retval1112
-
-; CHECK-LABEL: t2:
-; CHECK: psrad
-}
-
-declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone 
-
-define i64 @t3(x86_mmx %mm1, i32 %bits) nounwind  {
-entry:
-	%tmp8 = tail call x86_mmx @llvm.x86.mmx.psrli.w( x86_mmx %mm1, i32 %bits ) nounwind readnone 		; <x86_mmx> [#uses=1]
-        %retval1314 = bitcast x86_mmx %tmp8 to i64
-	ret i64 %retval1314
-
-; CHECK-LABEL: t3:
-; CHECK: psrlw
-}
-
-declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone 
diff --git a/test/CodeGen/X86/mmx-shuffle.ll b/test/CodeGen/X86/mmx-shuffle.ll
deleted file mode 100644
index 869f32b..0000000
--- a/test/CodeGen/X86/mmx-shuffle.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc < %s -mcpu=yonah
-; PR1427
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "i686-pc-linux-gnu"
-	%struct.DrawHelper = type { void (i32, %struct.QT_FT_Span*, i8*)*, void (i32, %struct.QT_FT_Span*, i8*)*, void (%struct.QRasterBuffer*, i32, i32, i32, i8*, i32, i32, i32)*, void (%struct.QRasterBuffer*, i32, i32, i32, i8*, i32, i32, i32)*, void (%struct.QRasterBuffer*, i32, i32, i32, i32, i32)* }
-	%struct.QBasicAtomic = type { i32 }
-	%struct.QClipData = type { i32, %"struct.QClipData::ClipLine"*, i32, i32, %struct.QT_FT_Span*, i32, i32, i32, i32 }
-	%"struct.QClipData::ClipLine" = type { i32, %struct.QT_FT_Span* }
-	%struct.QRasterBuffer = type { %struct.QRect, %struct.QRegion, %struct.QClipData*, %struct.QClipData*, i8, i32, i32, %struct.DrawHelper*, i32, i32, i32, i8* }
-	%struct.QRect = type { i32, i32, i32, i32 }
-	%struct.QRegion = type { %"struct.QRegion::QRegionData"* }
-	%"struct.QRegion::QRegionData" = type { %struct.QBasicAtomic, %struct._XRegion*, i8*, %struct.QRegionPrivate* }
-	%struct.QRegionPrivate = type opaque
-	%struct.QT_FT_Span = type { i16, i16, i16, i8 }
-	%struct._XRegion = type opaque
-
-define void @_Z19qt_bitmapblit16_sseP13QRasterBufferiijPKhiii(%struct.QRasterBuffer* %rasterBuffer, i32 %x, i32 %y, i32 %color, i8* %src, i32 %width, i32 %height, i32 %stride) {
-entry:
-	%tmp528 = bitcast <8 x i8> zeroinitializer to <2 x i32>		; <<2 x i32>> [#uses=1]
-	%tmp529 = and <2 x i32> %tmp528, bitcast (<4 x i16> < i16 -32640, i16 16448, i16 8224, i16 4112 > to <2 x i32>)		; <<2 x i32>> [#uses=1]
-	%tmp542 = bitcast <2 x i32> %tmp529 to <4 x i16>		; <<4 x i16>> [#uses=1]
-	%tmp543 = add <4 x i16> %tmp542, < i16 0, i16 16448, i16 24672, i16 28784 >		; <<4 x i16>> [#uses=1]
-	%tmp555 = bitcast <4 x i16> %tmp543 to <8 x i8>		; <<8 x i8>> [#uses=1]
-        %tmp556 = bitcast <8 x i8> %tmp555 to x86_mmx
-        %tmp557 = bitcast <8 x i8> zeroinitializer to x86_mmx
-	tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp557, x86_mmx %tmp556, i8* null )
-	ret void
-}
-
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*)
diff --git a/test/CodeGen/X86/movntdq-no-avx.ll b/test/CodeGen/X86/movntdq-no-avx.ll
index 8b7e6ef..cc35e20 100644
--- a/test/CodeGen/X86/movntdq-no-avx.ll
+++ b/test/CodeGen/X86/movntdq-no-avx.ll
@@ -9,4 +9,4 @@ entry:
   ret void
 }
 
-!0 = metadata !{i32 1}
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll
new file mode 100644
index 0000000..4a5d903
--- /dev/null
+++ b/test/CodeGen/X86/movtopush.ll
@@ -0,0 +1,346 @@
+; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED 
+
+declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
+declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
+declare void @oneparam(i32 %a)
+declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h)
+
+
+; Here, we should have a reserved frame, so we don't expect pushes
+; NORMAL-LABEL: test1:
+; NORMAL: subl    $16, %esp
+; NORMAL-NEXT: movl    $4, 12(%esp)
+; NORMAL-NEXT: movl    $3, 8(%esp)
+; NORMAL-NEXT: movl    $2, 4(%esp)
+; NORMAL-NEXT: movl    $1, (%esp)
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test1() {
+entry:
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; We're optimizing for code size, so we should get pushes for x86,
+; even though there is a reserved call frame.
+; Make sure we don't touch x86-64
+; NORMAL-LABEL: test1b:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+; X64-LABEL: test1b:
+; X64: movl    $1, %ecx
+; X64-NEXT: movl    $2, %edx
+; X64-NEXT: movl    $3, %r8d
+; X64-NEXT: movl    $4, %r9d
+; X64-NEXT: callq   good
+define void @test1b() optsize {
+entry:
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Same as above, but for minsize
+; NORMAL-LABEL: test1c:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test1c() minsize {
+entry:
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; If we have a reserved frame, we should have pushes
+; NORMAL-LABEL: test2:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+define void @test2(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Again, we expect a sequence of 4 immediate pushes
+; Checks that we generate the right pushes for >8bit immediates
+; NORMAL-LABEL: test2b:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4096
+; NORMAL-NEXT: pushl   $3072
+; NORMAL-NEXT: pushl   $2048
+; NORMAL-NEXT: pushl   $1024
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test2b() optsize {
+entry:
+  call void @good(i32 1024, i32 2048, i32 3072, i32 4096)
+  ret void
+}
+
+; The first push should push a register
+; NORMAL-LABEL: test3:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   %e{{..}}
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test3(i32 %k) optsize {
+entry:
+  %f = add i32 %k, 1
+  call void @good(i32 %f, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; We don't support weird calling conventions
+; NORMAL-LABEL: test4:
+; NORMAL: subl    $12, %esp
+; NORMAL-NEXT: movl    $4, 8(%esp)
+; NORMAL-NEXT: movl    $3, 4(%esp)
+; NORMAL-NEXT: movl    $1, (%esp)
+; NORMAL-NEXT: movl    $2, %eax
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $12, %esp
+define void @test4() optsize {
+entry:
+  call void @inreg(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; When there is no reserved call frame, check that additional alignment
+; is added when the pushes don't add up to the required alignment.
+; ALIGNED-LABEL: test5:
+; ALIGNED: subl    $16, %esp
+; ALIGNED-NEXT: pushl   $4
+; ALIGNED-NEXT: pushl   $3
+; ALIGNED-NEXT: pushl   $2
+; ALIGNED-NEXT: pushl   $1
+; ALIGNED-NEXT: call
+define void @test5(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; When the alignment adds up, do the transformation
+; ALIGNED-LABEL: test5b:
+; ALIGNED: pushl   $8
+; ALIGNED-NEXT: pushl   $7
+; ALIGNED-NEXT: pushl   $6
+; ALIGNED-NEXT: pushl   $5
+; ALIGNED-NEXT: pushl   $4
+; ALIGNED-NEXT: pushl   $3
+; ALIGNED-NEXT: pushl   $2
+; ALIGNED-NEXT: pushl   $1
+; ALIGNED-NEXT: call
+define void @test5b() optsize {
+entry:
+  call void @eightparams(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8)
+  ret void
+}
+
+; When having to compensate for the alignment isn't worth it,
+; don't use pushes.
+; ALIGNED-LABEL: test5c:
+; ALIGNED: movl $1, (%esp)
+; ALIGNED-NEXT: call
+define void @test5c() optsize {
+entry:
+  call void @oneparam(i32 1)
+  ret void
+}
+
+; Check that pushing the addresses of globals (Or generally, things that 
+; aren't exactly immediates) isn't broken.
+; Fixes PR21878.
+; NORMAL-LABEL: test6:
+; NORMAL: pushl    $_ext
+; NORMAL-NEXT: call
+declare void @f(i8*)
+@ext = external constant i8
+
+define void @test6() {
+  call void @f(i8* @ext)
+  br label %bb
+bb:
+  alloca i32
+  ret void
+}
+
+; Check that we fold simple cases into the push
+; NORMAL-LABEL: test7:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: movl 4(%esp), [[EAX:%e..]]
+; NORMAL-NEXT: pushl   $4
+; NORMAL-NEXT: pushl   ([[EAX]])
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test7(i32* %ptr) optsize {
+entry:
+  %val = load i32* %ptr
+  call void @good(i32 1, i32 2, i32 %val, i32 4)
+  ret void
+}
+
+; Fold stack-relative loads into the push, with correct offset
+; In particular, at the second push, %b was at 12(%esp) and
+; %a wast at 8(%esp), but the second push bumped %esp, so %a
+; is now it at 12(%esp)
+; NORMAL-LABEL: test8:
+; NORMAL: pushl   $4
+; NORMAL-NEXT: pushl   12(%esp)
+; NORMAL-NEXT: pushl   12(%esp)
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test8(i32 %a, i32 %b) optsize {
+entry:
+  call void @good(i32 1, i32 %a, i32 %b, i32 4)
+  ret void
+}
+
+; If one function is using push instructions, and the other isn't
+; (because it has frame-index references), then we must resolve
+; these references correctly.
+; NORMAL-LABEL: test9:
+; NORMAL-NOT: leal (%esp), 
+; NORMAL: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: subl $16, %esp
+; NORMAL-NEXT: leal 16(%esp), [[EAX:%e..]]
+; NORMAL-NEXT: movl    [[EAX]], 12(%esp)
+; NORMAL-NEXT: movl    $7, 8(%esp)
+; NORMAL-NEXT: movl    $6, 4(%esp)
+; NORMAL-NEXT: movl    $5, (%esp)
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test9() optsize {
+entry:
+  %p = alloca i32, align 4
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  %0 = ptrtoint i32* %p to i32
+  call void @good(i32 5, i32 6, i32 7, i32 %0)
+  ret void
+}
+
+; We can end up with an indirect call which gets reloaded on the spot.
+; Make sure we reference the correct stack slot - we spill into (%esp)
+; and reload from 16(%esp) due to the pushes.
+; NORMAL-LABEL: test10:
+; NORMAL: movl $_good, [[ALLOC:.*]]
+; NORMAL-NEXT: movl [[ALLOC]], [[EAX:%e..]]
+; NORMAL-NEXT: movl [[EAX]], (%esp) # 4-byte Spill
+; NORMAL: nop
+; NORMAL: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: calll *16(%esp)
+; NORMAL-NEXT: addl $16, %esp
+define void @test10() optsize {
+  %stack_fptr = alloca void (i32, i32, i32, i32)*
+  store void (i32, i32, i32, i32)* @good, void (i32, i32, i32, i32)** %stack_fptr
+  %good_ptr = load volatile void (i32, i32, i32, i32)** %stack_fptr
+  call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di}"()
+  call void (i32, i32, i32, i32)* %good_ptr(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; We can't fold the load from the global into the push because of 
+; interference from the store
+; NORMAL-LABEL: test11:
+; NORMAL: movl    _the_global, [[EAX:%e..]]
+; NORMAL-NEXT: movl    $42, _the_global
+; NORMAL-NEXT: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl [[EAX]]
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+@the_global = external global i32
+define void @test11() optsize {
+  %myload = load i32* @the_global
+  store i32 42, i32* @the_global
+  call void @good(i32 %myload, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Converting one mov into a push isn't worth it when 
+; doing so forces too much overhead for other calls.
+; NORMAL-LABEL: test12:
+; NORMAL: subl    $16, %esp
+; NORMAL-NEXT: movl    $4, 8(%esp)
+; NORMAL-NEXT: movl    $3, 4(%esp)
+; NORMAL-NEXT: movl    $1, (%esp)
+; NORMAL-NEXT: movl    $2, %eax
+; NORMAL-NEXT: calll _inreg
+; NORMAL-NEXT: movl    $8, 12(%esp)
+; NORMAL-NEXT: movl    $7, 8(%esp)
+; NORMAL-NEXT: movl    $6, 4(%esp)
+; NORMAL-NEXT: movl    $5, (%esp)
+; NORMAL-NEXT: calll _good
+; NORMAL-NEXT: movl    $12, 8(%esp)
+; NORMAL-NEXT: movl    $11, 4(%esp)
+; NORMAL-NEXT: movl    $9, (%esp)
+; NORMAL-NEXT: movl    $10, %eax
+; NORMAL-NEXT: calll _inreg
+; NORMAL-NEXT: addl $16, %esp
+define void @test12() optsize {
+entry:
+  call void @inreg(i32 1, i32 2, i32 3, i32 4)
+  call void @good(i32 5, i32 6, i32 7, i32 8)
+  call void @inreg(i32 9, i32 10, i32 11, i32 12)
+  ret void
+}
+
+; But if the gains outweigh the overhead, we should do it
+; NORMAL-LABEL: test12b:
+; NORMAL: pushl    $4
+; NORMAL-NEXT: pushl    $3
+; NORMAL-NEXT: pushl    $2
+; NORMAL-NEXT: pushl    $1
+; NORMAL-NEXT: calll _good
+; NORMAL-NEXT: addl    $16, %esp
+; NORMAL-NEXT: subl    $12, %esp
+; NORMAL-NEXT: movl    $8, 8(%esp)
+; NORMAL-NEXT: movl    $7, 4(%esp)
+; NORMAL-NEXT: movl    $5, (%esp)
+; NORMAL-NEXT: movl    $6, %eax
+; NORMAL-NEXT: calll _inreg
+; NORMAL-NEXT: addl    $12, %esp
+; NORMAL-NEXT: pushl    $12
+; NORMAL-NEXT: pushl    $11
+; NORMAL-NEXT: pushl    $10
+; NORMAL-NEXT: pushl    $9
+; NORMAL-NEXT: calll _good
+; NORMAL-NEXT: addl $16, %esp
+define void @test12b() optsize {
+entry:
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  call void @inreg(i32 5, i32 6, i32 7, i32 8)
+  call void @good(i32 9, i32 10, i32 11, i32 12)
+  ret void
+}
diff --git a/test/CodeGen/X86/musttail-fastcall.ll b/test/CodeGen/X86/musttail-fastcall.ll
new file mode 100644
index 0000000..c7e5ffc
--- /dev/null
+++ b/test/CodeGen/X86/musttail-fastcall.ll
@@ -0,0 +1,109 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2
+; RUN: llc < %s -mtriple=i686-pc-win32 -mattr=+sse2,+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: llc < %s -mtriple=i686-pc-win32 -mattr=+sse2,+avx,+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+
+; While we don't support varargs with fastcall, we do support forwarding.
+
+@asdf = internal constant [4 x i8] c"asdf"
+
+declare void @puts(i8*)
+
+define i32 @call_fast_thunk() {
+  %r = call x86_fastcallcc i32 (...)* @fast_thunk(i32 inreg 1, i32 inreg 2, i32 3)
+  ret i32 %r
+}
+
+define x86_fastcallcc i32 @fast_thunk(...) {
+  call void @puts(i8* getelementptr ([4 x i8]* @asdf, i32 0, i32 0))
+  %r = musttail call x86_fastcallcc i32 (...)* bitcast (i32 (i32, i32, i32)* @fast_target to i32 (...)*) (...)
+  ret i32 %r
+}
+
+; Check that we spill and fill around the call to puts.
+
+; CHECK-LABEL: @fast_thunk@0:
+; CHECK-DAG: movl %ecx, {{.*}}
+; CHECK-DAG: movl %edx, {{.*}}
+; CHECK: calll _puts
+; CHECK-DAG: movl {{.*}}, %ecx
+; CHECK-DAG: movl {{.*}}, %edx
+; CHECK: jmp @fast_target@12
+
+define x86_fastcallcc i32 @fast_target(i32 inreg %a, i32 inreg %b, i32 %c) {
+  %a0 = add i32 %a, %b
+  %a1 = add i32 %a0, %c
+  ret i32 %a1
+}
+
+; Repeat the test for vectorcall, which has XMM registers.
+
+define i32 @call_vector_thunk() {
+  %r = call x86_vectorcallcc i32 (...)* @vector_thunk(i32 inreg 1, i32 inreg 2, i32 3)
+  ret i32 %r
+}
+
+define x86_vectorcallcc i32 @vector_thunk(...) {
+  call void @puts(i8* getelementptr ([4 x i8]* @asdf, i32 0, i32 0))
+  %r = musttail call x86_vectorcallcc i32 (...)* bitcast (i32 (i32, i32, i32)* @vector_target to i32 (...)*) (...)
+  ret i32 %r
+}
+
+; Check that we spill and fill SSE registers around the call to puts.
+
+; CHECK-LABEL: vector_thunk@@0:
+; CHECK-DAG: movl %ecx, {{.*}}
+; CHECK-DAG: movl %edx, {{.*}}
+
+; SSE2-DAG: movups %xmm0, {{.*}}
+; SSE2-DAG: movups %xmm1, {{.*}}
+; SSE2-DAG: movups %xmm2, {{.*}}
+; SSE2-DAG: movups %xmm3, {{.*}}
+; SSE2-DAG: movups %xmm4, {{.*}}
+; SSE2-DAG: movups %xmm5, {{.*}}
+
+; AVX-DAG: vmovups %ymm0, {{.*}}
+; AVX-DAG: vmovups %ymm1, {{.*}}
+; AVX-DAG: vmovups %ymm2, {{.*}}
+; AVX-DAG: vmovups %ymm3, {{.*}}
+; AVX-DAG: vmovups %ymm4, {{.*}}
+; AVX-DAG: vmovups %ymm5, {{.*}}
+
+; AVX512-DAG: vmovups %zmm0, {{.*}}
+; AVX512-DAG: vmovups %zmm1, {{.*}}
+; AVX512-DAG: vmovups %zmm2, {{.*}}
+; AVX512-DAG: vmovups %zmm3, {{.*}}
+; AVX512-DAG: vmovups %zmm4, {{.*}}
+; AVX512-DAG: vmovups %zmm5, {{.*}}
+
+; CHECK: calll _puts
+
+; SSE2-DAG: movups {{.*}}, %xmm0
+; SSE2-DAG: movups {{.*}}, %xmm1
+; SSE2-DAG: movups {{.*}}, %xmm2
+; SSE2-DAG: movups {{.*}}, %xmm3
+; SSE2-DAG: movups {{.*}}, %xmm4
+; SSE2-DAG: movups {{.*}}, %xmm5
+
+; AVX-DAG: vmovups {{.*}}, %ymm0
+; AVX-DAG: vmovups {{.*}}, %ymm1
+; AVX-DAG: vmovups {{.*}}, %ymm2
+; AVX-DAG: vmovups {{.*}}, %ymm3
+; AVX-DAG: vmovups {{.*}}, %ymm4
+; AVX-DAG: vmovups {{.*}}, %ymm5
+
+; AVX512-DAG: vmovups {{.*}}, %zmm0
+; AVX512-DAG: vmovups {{.*}}, %zmm1
+; AVX512-DAG: vmovups {{.*}}, %zmm2
+; AVX512-DAG: vmovups {{.*}}, %zmm3
+; AVX512-DAG: vmovups {{.*}}, %zmm4
+; AVX512-DAG: vmovups {{.*}}, %zmm5
+
+; CHECK-DAG: movl {{.*}}, %ecx
+; CHECK-DAG: movl {{.*}}, %edx
+; CHECK: jmp vector_target@@12
+
+define x86_vectorcallcc i32 @vector_target(i32 inreg %a, i32 inreg %b, i32 %c) {
+  %a0 = add i32 %a, %b
+  %a1 = add i32 %a0, %c
+  ret i32 %a1
+}
diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll
index 1e99c14..7f105a1 100644
--- a/test/CodeGen/X86/musttail-varargs.ll
+++ b/test/CodeGen/X86/musttail-varargs.ll
@@ -1,13 +1,21 @@
 ; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX
 ; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS
+; RUN: llc < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86
 
 ; Test that we actually spill and reload all arguments in the variadic argument
 ; pack. Doing a normal call will clobber all argument registers, and we will
 ; spill around it. A simple adjustment should not require any XMM spills.
 
+declare void @llvm.va_start(i8*) nounwind
+
 declare void(i8*, ...)* @get_f(i8* %this)
 
 define void @f_thunk(i8* %this, ...) {
+  ; Use va_start so that we exercise the combination.
+  %ap = alloca [4 x i8*], align 16
+  %ap_i8 = bitcast [4 x i8*]* %ap to i8*
+  call void @llvm.va_start(i8* %ap_i8)
+
   %fptr = call void(i8*, ...)*(i8*)* @get_f(i8* %this)
   musttail call void (i8*, ...)* %fptr(i8* %this, ...)
   ret void
@@ -65,6 +73,12 @@ define void @f_thunk(i8* %this, ...) {
 ; WINDOWS-NOT: mov{{.}}ps
 ; WINDOWS: jmpq *{{.*}} # TAILCALL
 
+; No regparms on normal x86 conventions.
+
+; X86-LABEL: _f_thunk:
+; X86: calll _get_f
+; X86: jmpl *{{.*}} # TAILCALL
+
 ; This thunk shouldn't require any spills and reloads, assuming the register
 ; allocator knows what it's doing.
 
@@ -82,6 +96,9 @@ define void @g_thunk(i8* %fptr_i8, ...) {
 ; WINDOWS-NOT: movq
 ; WINDOWS: jmpq *%rcx # TAILCALL
 
+; X86-LABEL: _g_thunk:
+; X86: jmpl *%eax # TAILCALL
+
 ; Do a simple multi-exit multi-bb test.
 
 %struct.Foo = type { i1, i8*, i8* }
@@ -117,3 +134,7 @@ else:
 ; WINDOWS: jne
 ; WINDOWS: jmpq *{{.*}} # TAILCALL
 ; WINDOWS: jmpq *{{.*}} # TAILCALL
+; X86-LABEL: _h_thunk:
+; X86: jne
+; X86: jmpl *{{.*}} # TAILCALL
+; X86: jmpl *{{.*}} # TAILCALL
diff --git a/test/CodeGen/X86/named-reg-alloc.ll b/test/CodeGen/X86/named-reg-alloc.ll
index 9463ea3..c33b4eb 100644
--- a/test/CodeGen/X86/named-reg-alloc.ll
+++ b/test/CodeGen/X86/named-reg-alloc.ll
@@ -11,4 +11,4 @@ entry:
 
 declare i32 @llvm.read_register.i32(metadata) nounwind
 
-!0 = metadata !{metadata !"eax\00"}
+!0 = !{!"eax\00"}
diff --git a/test/CodeGen/X86/named-reg-notareg.ll b/test/CodeGen/X86/named-reg-notareg.ll
index d85dddd..18c517d 100644
--- a/test/CodeGen/X86/named-reg-notareg.ll
+++ b/test/CodeGen/X86/named-reg-notareg.ll
@@ -10,4 +10,4 @@ entry:
 
 declare i32 @llvm.read_register.i32(metadata) nounwind
 
-!0 = metadata !{metadata !"notareg\00"}
+!0 = !{!"notareg\00"}
diff --git a/test/CodeGen/X86/no-compact-unwind.ll b/test/CodeGen/X86/no-compact-unwind.ll
deleted file mode 100644
index 991cd4e..0000000
--- a/test/CodeGen/X86/no-compact-unwind.ll
+++ /dev/null
@@ -1,64 +0,0 @@
-; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -mcpu corei7 -filetype=obj -o - \
-; RUN:  | llvm-objdump -triple x86_64-apple-macosx10.8.0 -s - \
-; RUN:  | FileCheck -check-prefix=CU %s
-; RUN: llc < %s -mtriple x86_64-apple-darwin11 -mcpu corei7 \
-; RUN:  | llvm-mc -triple x86_64-apple-darwin11 -filetype=obj -o - \
-; RUN:  | llvm-objdump -triple x86_64-apple-darwin11 -s - \
-; RUN:  | FileCheck -check-prefix=FROM-ASM %s
-
-%"struct.dyld::MappedRanges" = type { [400 x %struct.anon], %"struct.dyld::MappedRanges"* }
-%struct.anon = type { %class.ImageLoader*, i64, i64 }
-%class.ImageLoader = type { i32 (...)**, i8*, i8*, i32, i64, i64, i32, i32, %"struct.ImageLoader::recursive_lock"*, i16, i16, [4 x i8] }
-%"struct.ImageLoader::recursive_lock" = type { i32, i32 }
-
-@G1 = external hidden global %"struct.dyld::MappedRanges", align 8
-
-declare void @OSMemoryBarrier() optsize
-
-; This compact unwind encoding indicates that we could not generate correct
-; compact unwind encodings for this function. This then defaults to using the
-; DWARF EH frame.
-
-; CU:      Contents of section __compact_unwind:
-; CU-NEXT: 0048 00000000 00000000 42000000 00000004
-; CU-NEXT: 0058 00000000 00000000 00000000 00000000
-
-; FROM-ASM:      Contents of section __compact_unwind:
-; FROM-ASM-NEXT: 0048 00000000 00000000 42000000 00000004
-; FROM-ASM-NEXT: 0058 00000000 00000000 00000000 00000000
-
-define void @func(%class.ImageLoader* %image) optsize ssp uwtable {
-entry:
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %for.inc10, %entry
-  %p.019 = phi %"struct.dyld::MappedRanges"* [ @G1, %entry ], [ %1, %for.inc10 ]
-  br label %for.body3
-
-for.body3:                                        ; preds = %for.inc, %for.cond1.preheader
-  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ]
-  %image4 = getelementptr inbounds %"struct.dyld::MappedRanges"* %p.019, i64 0, i32 0, i64 %indvars.iv, i32 0
-  %0 = load %class.ImageLoader** %image4, align 8
-  %cmp5 = icmp eq %class.ImageLoader* %0, %image
-  br i1 %cmp5, label %if.then, label %for.inc
-
-if.then:                                          ; preds = %for.body3
-  tail call void @OSMemoryBarrier() optsize
-  store %class.ImageLoader* null, %class.ImageLoader** %image4, align 8
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.then, %for.body3
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 400
-  br i1 %exitcond, label %for.inc10, label %for.body3
-
-for.inc10:                                        ; preds = %for.inc
-  %next = getelementptr inbounds %"struct.dyld::MappedRanges"* %p.019, i64 0, i32 1
-  %1 = load %"struct.dyld::MappedRanges"** %next, align 8
-  %cmp = icmp eq %"struct.dyld::MappedRanges"* %1, null
-  br i1 %cmp, label %for.end11, label %for.cond1.preheader
-
-for.end11:                                        ; preds = %for.inc10
-  ret void
-}
diff --git a/test/CodeGen/X86/non-unique-sections.ll b/test/CodeGen/X86/non-unique-sections.ll
new file mode 100644
index 0000000..e588b9d
--- /dev/null
+++ b/test/CodeGen/X86/non-unique-sections.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -unique-section-names=false | FileCheck %s
+
+; CHECK:   .section                      .text,"ax",@progbits,unique
+; CHECK-NOT: section
+; CHECK: f:
+define void @f() {
+  ret void
+}
+
+; CHECK:   .section                      .text,"ax",@progbits,unique
+; CHECK-NOT: section
+; CHECK: g:
+define void @g() {
+  ret void
+}
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
index 9d0cb9a..f62f372 100644
--- a/test/CodeGen/X86/nontemporal-2.ll
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -28,4 +28,4 @@ define void @test3(<2 x double>* %dst) {
   ret void
 }
 
-!1 = metadata !{i32 1}
+!1 = !{i32 1}
diff --git a/test/CodeGen/X86/nontemporal.ll b/test/CodeGen/X86/nontemporal.ll
index ae04435..f9385df 100644
--- a/test/CodeGen/X86/nontemporal.ll
+++ b/test/CodeGen/X86/nontemporal.ll
@@ -19,4 +19,4 @@ define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E) {
   ret void
 }
 
-!0 = metadata !{i32 1}
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/norex-subreg.ll b/test/CodeGen/X86/norex-subreg.ll
index 2c529fd..fb41ded 100644
--- a/test/CodeGen/X86/norex-subreg.ll
+++ b/test/CodeGen/X86/norex-subreg.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 < %s
-; RUN: llc < %s
+; RUN: llc -O0 < %s -verify-machineinstrs
+; RUN: llc < %s -verify-machineinstrs
 target triple = "x86_64-apple-macosx10.7"
 
 ; This test case extracts a sub_8bit_hi sub-register:
diff --git a/test/CodeGen/X86/nosse-varargs.ll b/test/CodeGen/X86/nosse-varargs.ll
index e6da0ab..8070c47 100644
--- a/test/CodeGen/X86/nosse-varargs.ll
+++ b/test/CodeGen/X86/nosse-varargs.ll
@@ -1,11 +1,12 @@
-; RUN: llvm-as < %s > %t
-; RUN: llc -march=x86-64 -mattr=-sse < %t | not grep xmm
-; RUN: llc -march=x86-64 < %t | grep xmm
+; RUN: llc < %s -march=x86-64 -mattr=-sse | FileCheck %s -check-prefix=NOSSE
+; RUN: llc < %s -march=x86-64 | FileCheck %s -check-prefix=YESSSE
 ; PR3403
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
 	%struct.__va_list_tag = type { i32, i32, i8*, i8* }
 
+; NOSSE-NOT: xmm
+; YESSSE: xmm
 define i32 @foo(float %a, i8* nocapture %fmt, ...) nounwind {
 entry:
 	%ap = alloca [1 x %struct.__va_list_tag], align 8		; <[1 x %struct.__va_list_tag]*> [#uses=4]
diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll
index b559729..f6eb0e1 100644
--- a/test/CodeGen/X86/null-streamer.ll
+++ b/test/CodeGen/X86/null-streamer.ll
@@ -14,16 +14,16 @@ define void @f1() {
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!11, !13}
 
-!0 = metadata !{metadata !"0x11\004\00 \001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00\00\00\002\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, i32 ()* null, null, null, metadata !2} ; [ DW_TAG_subprogram ]
-!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x34\00i\00i\00_ZL1i\001\001\001", null, metadata !5, metadata !8, null, null} ; [ DW_TAG_variable ]
-!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00 \001\00\000\00\000", !1, !2, !2, !3, !9, !2} ; [ DW_TAG_compile_unit ]
+!1 = !{!"", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00\00\00\002\000\001\000\006\00256\001\002", !1, !5, !6, null, i32 ()* null, null, null, !2} ; [ DW_TAG_subprogram ]
+!5 = !{!"0x29", !1} ; [ DW_TAG_file_type ]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ]
+!7 = !{!8}
+!8 = !{!"0x24\00\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!9 = !{!10}
+!10 = !{!"0x34\00i\00i\00_ZL1i\001\001\001", null, !5, !8, null, null} ; [ DW_TAG_variable ]
+!11 = !{i32 2, !"Dwarf Version", i32 3}
+!13 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/objc-gc-module-flags.ll b/test/CodeGen/X86/objc-gc-module-flags.ll
index 8cb2c03..f197510 100644
--- a/test/CodeGen/X86/objc-gc-module-flags.ll
+++ b/test/CodeGen/X86/objc-gc-module-flags.ll
@@ -7,7 +7,7 @@
 
 !llvm.module.flags = !{!0, !1, !2, !3}
 
-!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!3 = metadata !{i32 1, metadata !"Objective-C Garbage Collection", i32 2}
+!0 = !{i32 1, !"Objective-C Version", i32 2}
+!1 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = !{i32 1, !"Objective-C Garbage Collection", i32 2}
diff --git a/test/CodeGen/X86/odr_comdat.ll b/test/CodeGen/X86/odr_comdat.ll
deleted file mode 100644
index 547334c..0000000
--- a/test/CodeGen/X86/odr_comdat.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=X86LINUX
-
-; Checking that a comdat group gets generated correctly for a static member 
-; of instantiated C++ templates.
-; see http://sourcery.mentor.com/public/cxx-abi/abi.html#vague-itemplate
-; section 5.2.6 Instantiated templates
-; "Any static member data object is emitted in a COMDAT identified by its mangled 
-;  name, in any object file with a reference to its name symbol."
-
-; Case 1: variable is not explicitly initialized, and ends up in a .bss section
-; X86LINUX:   .section        .bss._ZN1CIiE1iE,"aGw",@nobits,_ZN1CIiE1iE,comdat
-@_ZN1CIiE1iE = weak_odr global i32 0, align 4
-
-; Case 2: variable is explicitly initialized, and ends up in a .data section
-; X86LINUX:   .section        .data._ZN1CIiE1jE,"aGw",@progbits,_ZN1CIiE1jE,comdat
-@_ZN1CIiE1jE = weak_odr global i32 12, align 4
diff --git a/test/CodeGen/X86/palignr.ll b/test/CodeGen/X86/palignr.ll
index 3efcc2e..dfa2ced 100644
--- a/test/CodeGen/X86/palignr.ll
+++ b/test/CodeGen/X86/palignr.ll
@@ -40,7 +40,9 @@ define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind {
 ;
 ; CHECK-YONAH-LABEL: test3:
 ; CHECK-YONAH:       # BB#0:
-; CHECK-YONAH-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; CHECK-YONAH-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; CHECK-YONAH-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; CHECK-YONAH-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 undef, i32 4 >
 	ret <4 x i32> %C
diff --git a/test/CodeGen/X86/peep-test-2.ll b/test/CodeGen/X86/peep-test-2.ll
index e4bafbb..e43b8ef 100644
--- a/test/CodeGen/X86/peep-test-2.ll
+++ b/test/CodeGen/X86/peep-test-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -march=x86 | FileCheck %s
 
 ; CHECK: testl
 
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 6eb97c3..12a3adf 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
 ; rdar://5571034
 
 ; This requires physreg joining, %vreg13 is live everywhere:
diff --git a/test/CodeGen/X86/pic_jumptable.ll b/test/CodeGen/X86/pic_jumptable.ll
index bdd8859..d66ff0c 100644
--- a/test/CodeGen/X86/pic_jumptable.ll
+++ b/test/CodeGen/X86/pic_jumptable.ll
@@ -10,7 +10,7 @@
 
 declare void @_Z3bari(i32)
 
-; CHECK-LINUX: .text._Z3fooILi1EEvi,"axG",@progbits,_Z3fooILi1EEvi,comdat
+; CHECK-LINUX: _Z3fooILi1EEvi:
 define linkonce void @_Z3fooILi1EEvi(i32 %Y) nounwind {
 entry:
 ; CHECK:       L0$pb
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 8937d6a..6bfa656 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -3,16 +3,19 @@
 
 define <4 x i32> @a(<4 x i32> %i) nounwind  {
 ; SSE2-LABEL: a:
-; SSE2:         movdqa {{.*}}, %[[X1:xmm[0-9]+]]
-; SSE2-NEXT:    pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %[[X1]], %xmm0
-; SSE2-NEXT:    pmuludq %[[X1]], %[[X2]]
-; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2]
-; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117,117,117]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: a:
-; SSE41:         pmulld
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 entry:
   %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
@@ -21,9 +24,19 @@ entry:
 
 define <2 x i64> @b(<2 x i64> %i) nounwind  {
 ; ALL-LABEL: b:
-; ALL:         pmuludq
-; ALL:         pmuludq
-; ALL:         pmuludq
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    movdqa {{.*#+}} xmm1 = [117,117]
+; ALL-NEXT:    movdqa %xmm0, %xmm2
+; ALL-NEXT:    pmuludq %xmm1, %xmm2
+; ALL-NEXT:    pxor %xmm3, %xmm3
+; ALL-NEXT:    pmuludq %xmm0, %xmm3
+; ALL-NEXT:    psllq $32, %xmm3
+; ALL-NEXT:    paddq %xmm3, %xmm2
+; ALL-NEXT:    psrlq $32, %xmm0
+; ALL-NEXT:    pmuludq %xmm1, %xmm0
+; ALL-NEXT:    psllq $32, %xmm0
+; ALL-NEXT:    paddq %xmm2, %xmm0
+; ALL-NEXT:    retq
 entry:
   %A = mul <2 x i64> %i, < i64 117, i64 117 >
   ret <2 x i64> %A
@@ -31,16 +44,19 @@ entry:
 
 define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind  {
 ; SSE2-LABEL: c:
-; SSE2:         pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3]
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    pmuludq %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pmuludq %[[X2]], %xmm1
-; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: c:
-; SSE41:         pmulld
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 entry:
   %A = mul <4 x i32> %i, %j
@@ -49,9 +65,19 @@ entry:
 
 define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind  {
 ; ALL-LABEL: d:
-; ALL:         pmuludq
-; ALL:         pmuludq
-; ALL:         pmuludq
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    movdqa %xmm0, %xmm2
+; ALL-NEXT:    pmuludq %xmm1, %xmm2
+; ALL-NEXT:    movdqa %xmm1, %xmm3
+; ALL-NEXT:    psrlq $32, %xmm3
+; ALL-NEXT:    pmuludq %xmm0, %xmm3
+; ALL-NEXT:    psllq $32, %xmm3
+; ALL-NEXT:    paddq %xmm3, %xmm2
+; ALL-NEXT:    psrlq $32, %xmm0
+; ALL-NEXT:    pmuludq %xmm1, %xmm0
+; ALL-NEXT:    psllq $32, %xmm0
+; ALL-NEXT:    paddq %xmm2, %xmm0
+; ALL-NEXT:    retq
 entry:
   %A = mul <2 x i64> %i, %j
   ret <2 x i64> %A
@@ -61,20 +87,32 @@ declare void @foo()
 
 define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind  {
 ; SSE2-LABEL: e:
-; SSE2:         movdqa {{[0-9]*}}(%rsp), %xmm0
-; SSE2-NEXT:    pshufd {{.*}} # [[X1:xmm[0-9]+]] = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa {{[0-9]*}}(%rsp), %[[X2:xmm[0-9]+]]
-; SSE2-NEXT:    pmuludq %[[X2]], %xmm0
-; SSE2-NEXT:    pshufd {{.*}} # [[X2]] = [[X2]][1,1,3,3]
-; SSE2-NEXT:    pmuludq %[[X1]], %[[X2]]
-; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2]
-; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
-; SSE2-NEXT:    addq ${{[0-9]+}}, %rsp
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    subq $40, %rsp
+; SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    callq foo
+; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    addq $40, %rsp
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: e:
-; SSE41:         pmulld {{[0-9]+}}(%rsp), %xmm
-; SSE41-NEXT:    addq ${{[0-9]+}}, %rsp
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    subq $40, %rsp
+; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE41-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE41-NEXT:    callq foo
+; SSE41-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE41-NEXT:    pmulld {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; SSE41-NEXT:    addq $40, %rsp
 ; SSE41-NEXT:    retq
 entry:
   ; Use a call to force spills.
@@ -85,9 +123,26 @@ entry:
 
 define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind  {
 ; ALL-LABEL: f:
-; ALL:         pmuludq
-; ALL:         pmuludq
-; ALL:         pmuludq
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    subq $40, %rsp
+; ALL-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; ALL-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT:    callq foo
+; ALL-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; ALL-NEXT:    movdqa %xmm0, %xmm2
+; ALL-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; ALL-NEXT:    pmuludq %xmm3, %xmm2
+; ALL-NEXT:    movdqa %xmm3, %xmm1
+; ALL-NEXT:    psrlq $32, %xmm1
+; ALL-NEXT:    pmuludq %xmm0, %xmm1
+; ALL-NEXT:    psllq $32, %xmm1
+; ALL-NEXT:    paddq %xmm1, %xmm2
+; ALL-NEXT:    psrlq $32, %xmm0
+; ALL-NEXT:    pmuludq %xmm3, %xmm0
+; ALL-NEXT:    psllq $32, %xmm0
+; ALL-NEXT:    paddq %xmm2, %xmm0
+; ALL-NEXT:    addq $40, %rsp
+; ALL-NEXT:    retq
 entry:
   ; Use a call to force spills.
   call void @foo()
diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll
index 0ee9987..5e0c2da 100644
--- a/test/CodeGen/X86/pointer-vector.ll
+++ b/test/CodeGen/X86/pointer-vector.ll
@@ -81,8 +81,7 @@ define <4 x i32*> @INT2PTR1(<4 x i8>* %p) nounwind {
 entry:
   %G = load <4 x i8>* %p
 ;CHECK: movl
-;CHECK: pmovzxbd
-;CHECK: pand
+;CHECK: pmovzxbd (%
   %K = inttoptr <4 x i8> %G to <4 x i32*>
 ;CHECK: ret
   ret <4 x i32*> %K
diff --git a/test/CodeGen/X86/pr11468.ll b/test/CodeGen/X86/pr11468.ll
index f7e9adb..f721df1 100644
--- a/test/CodeGen/X86/pr11468.ll
+++ b/test/CodeGen/X86/pr11468.ll
@@ -29,5 +29,5 @@ entry:
 ; CHECK: popq %rbp
 }
 
-!0 = metadata !{i32 125}
+!0 = !{i32 125}
 
diff --git a/test/CodeGen/X86/pr12360.ll b/test/CodeGen/X86/pr12360.ll
index 8b30596..6734036 100644
--- a/test/CodeGen/X86/pr12360.ll
+++ b/test/CodeGen/X86/pr12360.ll
@@ -22,7 +22,7 @@ entry:
   ret i1 %tobool
 }
 
-!0 = metadata !{i8 0, i8 2}
+!0 = !{i8 0, i8 2}
 
 
 ; check that we don't build a "trunc" from i1 to i1, which would assert.
diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll
index b4dc5fd..90df990 100644
--- a/test/CodeGen/X86/pr15267.ll
+++ b/test/CodeGen/X86/pr15267.ll
@@ -4,8 +4,7 @@ define <4 x i3> @test1(<4 x i3>* %in) nounwind {
   %ret = load <4 x i3>* %in, align 1
   ret <4 x i3> %ret
 }
-
-; CHECK: test1
+; CHECK-LABEL: test1
 ; CHECK: movzwl
 ; CHECK: shrl $3
 ; CHECK: andl $7
@@ -25,7 +24,7 @@ define <4 x i1> @test2(<4 x i1>* %in) nounwind {
   ret <4 x i1> %ret
 }
 
-; CHECK: test2
+; CHECK-LABEL: test2
 ; CHECK: movzbl
 ; CHECK: shrl
 ; CHECK: andl $1
@@ -46,7 +45,7 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind {
   ret <4 x i64> %sext
 }
 
-; CHECK: test3
+; CHECK-LABEL: test3
 ; CHECK: movzbl
 ; CHECK: movq
 ; CHECK: shlq
@@ -67,3 +66,71 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind {
 ; CHECK: vpunpcklqdq
 ; CHECK: vinsertf128
 ; CHECK: ret
+
+define <16 x i4> @test4(<16 x i4>* %in) nounwind {
+  %ret = load <16 x i4>* %in, align 1
+  ret <16 x i4> %ret
+}
+
+; CHECK-LABEL: test4
+; CHECK: movl
+; CHECK-NEXT: shrl
+; CHECK-NEXT: andl
+; CHECK-NEXT: movl
+; CHECK-NEXT: andl
+; CHECK-NEXT: vmovd
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movl
+; CHECK-NEXT: shrl
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movl
+; CHECK-NEXT: shrl
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movl
+; CHECK-NEXT: shrl
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movl
+; CHECK-NEXT: shrl
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movl
+; CHECK-NEXT: shrl
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movl
+; CHECK-NEXT: shrl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movq
+; CHECK-NEXT: shrq
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movq
+; CHECK-NEXT: shrq
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movq
+; CHECK-NEXT: shrq
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movq
+; CHECK-NEXT: shrq
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movq
+; CHECK-NEXT: shrq
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movq
+; CHECK-NEXT: shrq
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: movq
+; CHECK-NEXT: shrq
+; CHECK-NEXT: andl
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: shrq
+; CHECK-NEXT: vpinsrb
+; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/pr18846.ll b/test/CodeGen/X86/pr18846.ll
index 27801be..c65bc79 100644
--- a/test/CodeGen/X86/pr18846.ll
+++ b/test/CodeGen/X86/pr18846.ll
@@ -131,9 +131,9 @@ attributes #1 = { nounwind }
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.5 "}
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"float", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
-!5 = metadata !{metadata !3, metadata !3, i64 0}
+!0 = !{!"clang version 3.5 "}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"float", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!3, !3, i64 0}
diff --git a/test/CodeGen/X86/pr21792.ll b/test/CodeGen/X86/pr21792.ll
new file mode 100644
index 0000000..4138afc
--- /dev/null
+++ b/test/CodeGen/X86/pr21792.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=x86_64-linux -mcpu=corei7 < %s | FileCheck %s
+; This fixes a missing cases in the MI scheduler's constrainLocalCopy exposed by
+; PR21792
+
+@stuff = external constant [256 x double], align 16
+
+define void @func(<4 x float> %vx) {
+entry:
+  %tmp2 = bitcast <4 x float> %vx to <2 x i64>
+  %and.i = and <2 x i64> %tmp2, <i64 8727373547504, i64 8727373547504>
+  %tmp3 = bitcast <2 x i64> %and.i to <4 x i32>
+  %index.sroa.0.0.vec.extract = extractelement <4 x i32> %tmp3, i32 0
+  %idx.ext = sext i32 %index.sroa.0.0.vec.extract to i64
+  %add.ptr = getelementptr inbounds i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext
+  %tmp4 = bitcast i8* %add.ptr to double*
+  %index.sroa.0.4.vec.extract = extractelement <4 x i32> %tmp3, i32 1
+  %idx.ext5 = sext i32 %index.sroa.0.4.vec.extract to i64
+  %add.ptr6 = getelementptr inbounds i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext5
+  %tmp5 = bitcast i8* %add.ptr6 to double*
+  %index.sroa.0.8.vec.extract = extractelement <4 x i32> %tmp3, i32 2
+  %idx.ext14 = sext i32 %index.sroa.0.8.vec.extract to i64
+  %add.ptr15 = getelementptr inbounds i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext14
+  %tmp6 = bitcast i8* %add.ptr15 to double*
+  %index.sroa.0.12.vec.extract = extractelement <4 x i32> %tmp3, i32 3
+  %idx.ext19 = sext i32 %index.sroa.0.12.vec.extract to i64
+  %add.ptr20 = getelementptr inbounds i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext19
+  %tmp7 = bitcast i8* %add.ptr20 to double*
+  %add.ptr46 = getelementptr inbounds i8* bitcast (double* getelementptr inbounds ([256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext
+  %tmp16 = bitcast i8* %add.ptr46 to double*
+  %add.ptr51 = getelementptr inbounds i8* bitcast (double* getelementptr inbounds ([256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext5
+  %tmp17 = bitcast i8* %add.ptr51 to double*
+  call void @toto(double* %tmp4, double* %tmp5, double* %tmp6, double* %tmp7, double* %tmp16, double* %tmp17)
+  ret void
+; CHECK-LABEL: func:
+; CHECK: pextrq  $1, %xmm0,
+; CHECK-NEXT: movd    %xmm0, %r[[AX:..]]
+; CHECK-NEXT: movslq  %e[[AX]],
+; CHECK-NEXT: sarq    $32, %r[[AX]]
+}
+
+declare void @toto(double*, double*, double*, double*, double*, double*)
diff --git a/test/CodeGen/X86/pr22019.ll b/test/CodeGen/X86/pr22019.ll
new file mode 100644
index 0000000..4cee5d7
--- /dev/null
+++ b/test/CodeGen/X86/pr22019.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+module asm "pselect = __pselect"
+module asm "var = __var"
+module asm "alias = __alias"
+; CHECK: pselect = __pselect
+; CHECK: var = __var
+; CHECK: alias = __alias
+
+; CHECK: pselect:
+; CHECK: retq
+define void @pselect() {
+  ret void
+}
+
+; CHECK: var:
+; CHECK: .long 0
+@var = global i32 0
+
+; CHECK: alias = var
+@alias = alias i32* @var
diff --git a/test/CodeGen/X86/pr22103.ll b/test/CodeGen/X86/pr22103.ll
new file mode 100644
index 0000000..77c0751
--- /dev/null
+++ b/test/CodeGen/X86/pr22103.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s | FileCheck %s
+; Don't try to emit a direct call through a TLS global.
+; This fixes PR22103
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external thread_local global i64
+
+; Function Attrs: nounwind
+define void @_Z1fv() {
+; CHECK-NOT: callq *$a
+; CHECK: movq %fs:0, [[RAX:%r..]]
+; CHECK-NEXT: addq    a@GOTTPOFF(%rip), [[RAX]]
+; CHECK-NEXT: callq *[[RAX]]
+entry:
+  call void bitcast (i64* @a to void ()*)()
+  ret void
+}
diff --git a/test/CodeGen/X86/pre-ra-sched.ll b/test/CodeGen/X86/pre-ra-sched.ll
index 70135d4..bb4c126 100644
--- a/test/CodeGen/X86/pre-ra-sched.ll
+++ b/test/CodeGen/X86/pre-ra-sched.ll
@@ -1,4 +1,4 @@
-; RUN-disabled: llc < %s -mtriple=x86_64-apple-macosx -pre-RA-sched=ilp -debug-only=pre-RA-sched \
+; RUN-disabled: llc < %s -verify-machineinstrs -mtriple=x86_64-apple-macosx -pre-RA-sched=ilp -debug-only=pre-RA-sched \
 ; RUN-disabled:     2>&1 | FileCheck %s
 ; RUN: true
 ; REQUIRES: asserts
diff --git a/test/CodeGen/X86/prefixdata.ll b/test/CodeGen/X86/prefixdata.ll
index 2ec1892..9bb54a2 100644
--- a/test/CodeGen/X86/prefixdata.ll
+++ b/test/CodeGen/X86/prefixdata.ll
@@ -2,16 +2,17 @@
 
 @i = linkonce_odr global i32 1
 
-; CHECK: f:
-; CHECK-NEXT: .cfi_startproc
+; CHECK: .type f,@function
 ; CHECK-NEXT: .long	1
+; CHECK-NEXT: # 0x1
+; CHECK-NEXT: f:
 define void @f() prefix i32 1 {
   ret void
 }
 
-; CHECK: g:
-; CHECK-NEXT: .cfi_startproc
+; CHECK: .type g,@function
 ; CHECK-NEXT: .quad	i
+; CHECK-NEXT: g:
 define void @g() prefix i32* @i {
   ret void
 }
diff --git a/test/CodeGen/X86/prologuedata.ll b/test/CodeGen/X86/prologuedata.ll
new file mode 100644
index 0000000..6a50ddb
--- /dev/null
+++ b/test/CodeGen/X86/prologuedata.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+@i = linkonce_odr global i32 1
+
+; CHECK: f:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .long	1
+define void @f() prologue i32 1 {
+  ret void
+}
+
+; CHECK: g:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .quad	i
+define void @g() prologue i32* @i {
+  ret void
+}
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
index 7fc9890..ca5a02c 100644
--- a/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -27,4 +27,26 @@ define <16 x i8> @test3(<16 x i8> %V) {
   ret <16 x i8> %1
 }
 
+; Test that we won't crash when the constant was reused for another instruction.
+
+define <16 x i8> @test4(<2 x i64>* %V) {
+; CHECK-LABEL: test4
+; CHECK: pshufb {{.*}}
+  store <2 x i64> <i64 1084818905618843912, i64 506097522914230528>, <2 x i64>* %V, align 16
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> undef, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @test5() {
+; CHECK-LABEL: test5
+; CHECK: pshufb {{.*}}
+  store <2 x i64> <i64 1, i64 0>, <2 x i64>* undef, align 16
+  %l = load <2 x i64>* undef, align 16
+  %shuffle = shufflevector <2 x i64> %l, <2 x i64> undef, <2 x i32> zeroinitializer
+  store <2 x i64> %shuffle, <2 x i64>* undef, align 16
+  %1 = load <16 x i8>* undef, align 16
+  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> undef, <16 x i8> %1)
+  ret <16 x i8> %2
+}
+
 declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index aff4afb..5e1343e 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE2
+; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSSE3
 ; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
 ; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
 
@@ -7,334 +7,344 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 define void @test1(i16* nocapture %head) nounwind {
 vector.ph:
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %0 = getelementptr inbounds i16* %head, i64 %index
+  %0 = getelementptr inbounds i16* %head, i64 0
   %1 = bitcast i16* %0 to <8 x i16>*
   %2 = load <8 x i16>* %1, align 2
   %3 = icmp slt <8 x i16> %2, zeroinitializer
   %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
   %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
   store <8 x i16> %5, <8 x i16>* %1, align 2
-  %index.next = add i64 %index, 8
-  %6 = icmp eq i64 %index.next, 16384
-  br i1 %6, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
-; SSE2: @test1
-; SSE2: psubusw LCPI0_0(%rip), %xmm0
+; SSSE3: @test1
+; SSSE3:      # BB#0:
+; SSSE3-NEXT: movdqu (%rdi), %xmm0
+; SSSE3-NEXT: psubusw LCPI0_0(%rip), %xmm0
+; SSSE3-NEXT: movdqu %xmm0, (%rdi)
+; SSSE3-NEXT: retq
 
 ; AVX1: @test1
-; AVX1: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
+; AVX1:      # BB#0:
+; AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; AVX1-NEXT: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: retq
 
 ; AVX2: @test1
-; AVX2: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: retq
 }
 
 define void @test2(i16* nocapture %head) nounwind {
 vector.ph:
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %0 = getelementptr inbounds i16* %head, i64 %index
+  %0 = getelementptr inbounds i16* %head, i64 0
   %1 = bitcast i16* %0 to <8 x i16>*
   %2 = load <8 x i16>* %1, align 2
   %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
   %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
   %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
   store <8 x i16> %5, <8 x i16>* %1, align 2
-  %index.next = add i64 %index, 8
-  %6 = icmp eq i64 %index.next, 16384
-  br i1 %6, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
-; SSE2: @test2
-; SSE2: psubusw LCPI1_0(%rip), %xmm0
+; SSSE3: @test2
+; SSSE3:      # BB#0:
+; SSSE3-NEXT: movdqu (%rdi), %xmm0
+; SSSE3-NEXT: psubusw LCPI1_0(%rip), %xmm0
+; SSSE3-NEXT: movdqu %xmm0, (%rdi)
+; SSSE3-NEXT: retq
 
 ; AVX1: @test2
-; AVX1: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
+; AVX1:      # BB#0:
+; AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; AVX1-NEXT: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: retq
 
 ; AVX2: @test2
-; AVX2: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: retq
 }
 
 define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
 vector.ph:
   %0 = insertelement <8 x i16> undef, i16 %w, i32 0
   %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %1 = getelementptr inbounds i16* %head, i64 %index
+  %1 = getelementptr inbounds i16* %head, i64 0
   %2 = bitcast i16* %1 to <8 x i16>*
   %3 = load <8 x i16>* %2, align 2
   %4 = icmp ult <8 x i16> %3, %broadcast15
   %5 = sub <8 x i16> %3, %broadcast15
   %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
   store <8 x i16> %6, <8 x i16>* %2, align 2
-  %index.next = add i64 %index, 8
-  %7 = icmp eq i64 %index.next, 16384
-  br i1 %7, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
-; SSE2: @test3
-; SSE2: psubusw %xmm0, %xmm1
+; SSSE3: @test3
+; SSSE3:      # BB#0:
+; SSSE3-NEXT: movd %esi, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT: movdqu (%rdi), %xmm1
+; SSSE3-NEXT: psubusw %xmm0, %xmm1
+; SSSE3-NEXT: movdqu %xmm1, (%rdi)
+; SSSE3-NEXT: retq
 
 ; AVX1: @test3
-; AVX1: vpsubusw %xmm0, %xmm1, %xmm1
+; AVX1:      # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vmovdqu (%rdi), %xmm1
+; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: retq
 
 ; AVX2: @test3
-; AVX2: vpsubusw %xmm0, %xmm1, %xmm1
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu (%rdi), %xmm1
+; AVX2-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: retq
 }
 
 define void @test4(i8* nocapture %head) nounwind {
 vector.ph:
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %0 = getelementptr inbounds i8* %head, i64 %index
+  %0 = getelementptr inbounds i8* %head, i64 0
   %1 = bitcast i8* %0 to <16 x i8>*
   %2 = load <16 x i8>* %1, align 1
   %3 = icmp slt <16 x i8> %2, zeroinitializer
   %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
   %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
   store <16 x i8> %5, <16 x i8>* %1, align 1
-  %index.next = add i64 %index, 16
-  %6 = icmp eq i64 %index.next, 16384
-  br i1 %6, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
-; SSE2: @test4
-; SSE2: psubusb LCPI3_0(%rip), %xmm0
+; SSSE3: @test4
+; SSSE3:      # BB#0:
+; SSSE3-NEXT: movdqu (%rdi), %xmm0
+; SSSE3-NEXT: psubusb LCPI3_0(%rip), %xmm0
+; SSSE3-NEXT: movdqu %xmm0, (%rdi)
+; SSSE3-NEXT: retq
 
 ; AVX1: @test4
-; AVX1: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
+; AVX1:      # BB#0:
+; AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; AVX1-NEXT: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: retq
 
 ; AVX2: @test4
-; AVX2: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: retq
 }
 
 define void @test5(i8* nocapture %head) nounwind {
 vector.ph:
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %0 = getelementptr inbounds i8* %head, i64 %index
+  %0 = getelementptr inbounds i8* %head, i64 0
   %1 = bitcast i8* %0 to <16 x i8>*
   %2 = load <16 x i8>* %1, align 1
   %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
   %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
   %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
   store <16 x i8> %5, <16 x i8>* %1, align 1
-  %index.next = add i64 %index, 16
-  %6 = icmp eq i64 %index.next, 16384
-  br i1 %6, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
-; SSE2: @test5
-; SSE2: psubusb LCPI4_0(%rip), %xmm0
+; SSSE3: @test5
+; SSSE3:      # BB#0:
+; SSSE3-NEXT: movdqu (%rdi), %xmm0
+; SSSE3-NEXT: psubusb LCPI4_0(%rip), %xmm0
+; SSSE3-NEXT: movdqu %xmm0, (%rdi)
+; SSSE3-NEXT: retq
 
 ; AVX1: @test5
-; AVX1: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
+; AVX1:      # BB#0:
+; AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; AVX1-NEXT: vpsubusb LCPI4_0(%rip), %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: retq
 
 ; AVX2: @test5
-; AVX2: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: vpsubusb LCPI4_0(%rip), %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: retq
 }
 
 define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
 vector.ph:
   %0 = insertelement <16 x i8> undef, i8 %w, i32 0
   %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %1 = getelementptr inbounds i8* %head, i64 %index
+  %1 = getelementptr inbounds i8* %head, i64 0
   %2 = bitcast i8* %1 to <16 x i8>*
   %3 = load <16 x i8>* %2, align 1
   %4 = icmp ult <16 x i8> %3, %broadcast15
   %5 = sub <16 x i8> %3, %broadcast15
   %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
   store <16 x i8> %6, <16 x i8>* %2, align 1
-  %index.next = add i64 %index, 16
-  %7 = icmp eq i64 %index.next, 16384
-  br i1 %7, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
-; SSE2: @test6
-; SSE2: psubusb %xmm0, %xmm1
+; SSSE3: @test6
+; SSSE3:      # BB#0:
+; SSSE3-NEXT: movd %esi, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: movdqu (%rdi), %xmm1
+; SSSE3-NEXT: psubusb %xmm0, %xmm1
+; SSSE3-NEXT: movdqu %xmm1, (%rdi)
+; SSSE3-NEXT: retq
 
 ; AVX1: @test6
-; AVX1: vpsubusb %xmm0, %xmm1, %xmm1
+; AVX1:      # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0
+; AVX1-NEXT: vmovdqu (%rdi), %xmm1
+; AVX1-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: retq
 
 ; AVX2: @test6
-; AVX2: vpsubusb %xmm0, %xmm1, %xmm1
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu (%rdi), %xmm1
+; AVX2-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: retq
 }
 
 define void @test7(i16* nocapture %head) nounwind {
 vector.ph:
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %0 = getelementptr inbounds i16* %head, i64 %index
+  %0 = getelementptr inbounds i16* %head, i64 0
   %1 = bitcast i16* %0 to <16 x i16>*
   %2 = load <16 x i16>* %1, align 2
   %3 = icmp slt <16 x i16> %2, zeroinitializer
   %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
   %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
   store <16 x i16> %5, <16 x i16>* %1, align 2
-  %index.next = add i64 %index, 8
-  %6 = icmp eq i64 %index.next, 16384
-  br i1 %6, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
 ; AVX2: @test7
-; AVX2: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 }
 
 define void @test8(i16* nocapture %head) nounwind {
 vector.ph:
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %0 = getelementptr inbounds i16* %head, i64 %index
+  %0 = getelementptr inbounds i16* %head, i64 0
   %1 = bitcast i16* %0 to <16 x i16>*
   %2 = load <16 x i16>* %1, align 2
   %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
   %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
   %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
   store <16 x i16> %5, <16 x i16>* %1, align 2
-  %index.next = add i64 %index, 8
-  %6 = icmp eq i64 %index.next, 16384
-  br i1 %6, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
 ; AVX2: @test8
-; AVX2: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 }
 
 define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
 vector.ph:
   %0 = insertelement <16 x i16> undef, i16 %w, i32 0
   %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %1 = getelementptr inbounds i16* %head, i64 %index
+  %1 = getelementptr inbounds i16* %head, i64 0
   %2 = bitcast i16* %1 to <16 x i16>*
   %3 = load <16 x i16>* %2, align 2
   %4 = icmp ult <16 x i16> %3, %broadcast15
   %5 = sub <16 x i16> %3, %broadcast15
   %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
   store <16 x i16> %6, <16 x i16>* %2, align 2
-  %index.next = add i64 %index, 8
-  %7 = icmp eq i64 %index.next, 16384
-  br i1 %7, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
-
 ; AVX2: @test9
-; AVX2: vpsubusw %ymm0, %ymm1, %ymm1
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqu (%rdi), %ymm1
+; AVX2-NEXT: vpsubusw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 }
 
 define void @test10(i8* nocapture %head) nounwind {
 vector.ph:
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %0 = getelementptr inbounds i8* %head, i64 %index
+  %0 = getelementptr inbounds i8* %head, i64 0
   %1 = bitcast i8* %0 to <32 x i8>*
   %2 = load <32 x i8>* %1, align 1
   %3 = icmp slt <32 x i8> %2, zeroinitializer
   %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
   %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
   store <32 x i8> %5, <32 x i8>* %1, align 1
-  %index.next = add i64 %index, 16
-  %6 = icmp eq i64 %index.next, 16384
-  br i1 %6, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
-
 ; AVX2: @test10
-; AVX2: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 }
 
 define void @test11(i8* nocapture %head) nounwind {
 vector.ph:
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %0 = getelementptr inbounds i8* %head, i64 %index
+  %0 = getelementptr inbounds i8* %head, i64 0
   %1 = bitcast i8* %0 to <32 x i8>*
   %2 = load <32 x i8>* %1, align 1
   %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
   %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
   %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
   store <32 x i8> %5, <32 x i8>* %1, align 1
-  %index.next = add i64 %index, 16
-  %6 = icmp eq i64 %index.next, 16384
-  br i1 %6, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
 ; AVX2: @test11
-; AVX2: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 }
 
 define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
 vector.ph:
   %0 = insertelement <32 x i8> undef, i8 %w, i32 0
   %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %1 = getelementptr inbounds i8* %head, i64 %index
+  %1 = getelementptr inbounds i8* %head, i64 0
   %2 = bitcast i8* %1 to <32 x i8>*
   %3 = load <32 x i8>* %2, align 1
   %4 = icmp ult <32 x i8> %3, %broadcast15
   %5 = sub <32 x i8> %3, %broadcast15
   %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
   store <32 x i8> %6, <32 x i8>* %2, align 1
-  %index.next = add i64 %index, 16
-  %7 = icmp eq i64 %index.next, 16384
-  br i1 %7, label %for.end, label %vector.body
-
-for.end:                                          ; preds = %vector.body
   ret void
 
 ; AVX2: @test12
-; AVX2: vpsubusb %ymm0, %ymm1, %ymm1
+; AVX2:      # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqu (%rdi), %ymm1
+; AVX2-NEXT: vpsubusb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
 }
diff --git a/test/CodeGen/X86/ragreedy-bug.ll b/test/CodeGen/X86/ragreedy-bug.ll
index df9b41d..83ac274 100644
--- a/test/CodeGen/X86/ragreedy-bug.ll
+++ b/test/CodeGen/X86/ragreedy-bug.ll
@@ -266,27 +266,27 @@ return:
   %retval.0 = phi i32 [ 0, %entry ], [ 1, %land.lhs.true52 ], [ 1, %land.lhs.true43 ], [ 0, %if.else123 ], [ 1, %while.cond59.preheader ], [ 1, %while.cond95.preheader ], [ 1, %while.cond130.preheader ], [ 1, %land.lhs.true28 ], [ 1, %if.then83 ], [ 0, %lor.lhs.false74 ], [ 1, %land.rhs ], [ 1, %if.then117 ], [ 0, %while.body104 ], [ 1, %land.rhs99 ], [ 1, %if.then152 ], [ 0, %while.body139 ], [ 1, %land.rhs134 ], [ 0, %while.body ]
   ret i32 %retval.0
 }
-!181 = metadata !{metadata !"branch_weights", i32 662038, i32 1}
-!988 = metadata !{metadata !"branch_weights", i32 12091450, i32 1916}
-!989 = metadata !{metadata !"branch_weights", i32 7564670, i32 4526781}
-!990 = metadata !{metadata !"branch_weights", i32 7484958, i32 13283499}
-!991 = metadata !{metadata !"branch_weights", i32 8677007, i32 4606493}
-!992 = metadata !{metadata !"branch_weights", i32 -1172426948, i32 145094705}
-!993 = metadata !{metadata !"branch_weights", i32 1468914, i32 5683688}
-!994 = metadata !{metadata !"branch_weights", i32 114025221, i32 -1217548794, i32 -1199521551, i32 87712616}
-!995 = metadata !{metadata !"branch_weights", i32 1853716452, i32 -444717951, i32 932776759}
-!996 = metadata !{metadata !"branch_weights", i32 1004870, i32 20259}
-!997 = metadata !{metadata !"branch_weights", i32 20071, i32 189}
-!998 = metadata !{metadata !"branch_weights", i32 -1020255939, i32 572177766}
-!999 = metadata !{metadata !"branch_weights", i32 2666513, i32 3466431}
-!1000 = metadata !{metadata !"branch_weights", i32 5117635, i32 1859780}
-!1001 = metadata !{metadata !"branch_weights", i32 354902465, i32 -1444604407}
-!1002 = metadata !{metadata !"branch_weights", i32 -1762419279, i32 1592770684}
-!1003 = metadata !{metadata !"branch_weights", i32 1435905930, i32 -1951930624}
-!1004 = metadata !{metadata !"branch_weights", i32 1, i32 504888}
-!1005 = metadata !{metadata !"branch_weights", i32 94662, i32 504888}
-!1006 = metadata !{metadata !"branch_weights", i32 -1897793104, i32 160196332}
-!1007 = metadata !{metadata !"branch_weights", i32 2074643678, i32 -29579071}
-!1008 = metadata !{metadata !"branch_weights", i32 1, i32 226163}
-!1009 = metadata !{metadata !"branch_weights", i32 58357, i32 226163}
-!1010 = metadata !{metadata !"branch_weights", i32 -2072848646, i32 92907517}
+!181 = !{!"branch_weights", i32 662038, i32 1}
+!988 = !{!"branch_weights", i32 12091450, i32 1916}
+!989 = !{!"branch_weights", i32 7564670, i32 4526781}
+!990 = !{!"branch_weights", i32 7484958, i32 13283499}
+!991 = !{!"branch_weights", i32 8677007, i32 4606493}
+!992 = !{!"branch_weights", i32 -1172426948, i32 145094705}
+!993 = !{!"branch_weights", i32 1468914, i32 5683688}
+!994 = !{!"branch_weights", i32 114025221, i32 -1217548794, i32 -1199521551, i32 87712616}
+!995 = !{!"branch_weights", i32 1853716452, i32 -444717951, i32 932776759}
+!996 = !{!"branch_weights", i32 1004870, i32 20259}
+!997 = !{!"branch_weights", i32 20071, i32 189}
+!998 = !{!"branch_weights", i32 -1020255939, i32 572177766}
+!999 = !{!"branch_weights", i32 2666513, i32 3466431}
+!1000 = !{!"branch_weights", i32 5117635, i32 1859780}
+!1001 = !{!"branch_weights", i32 354902465, i32 -1444604407}
+!1002 = !{!"branch_weights", i32 -1762419279, i32 1592770684}
+!1003 = !{!"branch_weights", i32 1435905930, i32 -1951930624}
+!1004 = !{!"branch_weights", i32 1, i32 504888}
+!1005 = !{!"branch_weights", i32 94662, i32 504888}
+!1006 = !{!"branch_weights", i32 -1897793104, i32 160196332}
+!1007 = !{!"branch_weights", i32 2074643678, i32 -29579071}
+!1008 = !{!"branch_weights", i32 1, i32 226163}
+!1009 = !{!"branch_weights", i32 58357, i32 226163}
+!1010 = !{!"branch_weights", i32 -2072848646, i32 92907517}
diff --git a/test/CodeGen/X86/ragreedy-hoist-spill.ll b/test/CodeGen/X86/ragreedy-hoist-spill.ll
index c6b28f7..57afb41 100644
--- a/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -202,7 +202,6 @@ lor.rhs500:
   ; CHECK: lor.rhs500
   ; Make sure that we don't hoist the spill to outer loops.
   ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
-  ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
   ; CHECK: callq {{.*}}maskrune
   %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256)
   br i1 undef, label %land.lhs.true504, label %do.body479.backedge
@@ -378,12 +377,12 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.5.0 (trunk 204257)"}
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"int", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
-!5 = metadata !{metadata !3, metadata !3, i64 0}
-!6 = metadata !{metadata !7, metadata !8, i64 8}
-!7 = metadata !{metadata !"", metadata !8, i64 0, metadata !8, i64 8, metadata !3, i64 16}
-!8 = metadata !{metadata !"any pointer", metadata !3, i64 0}
+!0 = !{!"clang version 3.5.0 (trunk 204257)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!3, !3, i64 0}
+!6 = !{!7, !8, i64 8}
+!7 = !{!"", !8, i64 0, !8, i64 8, !3, i64 16}
+!8 = !{!"any pointer", !3, i64 0}
diff --git a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
new file mode 100644
index 0000000..0067942
--- /dev/null
+++ b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
@@ -0,0 +1,145 @@
+; RUN: llc < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
+; Test case for the recoloring of broken hints.
+; This is tricky to have something reasonably small to kick this optimization since
+; it requires that spliting and spilling occur.
+; The bottom line is that this test case is fragile.
+; This was reduced from the make_list function from the llvm-testsuite:
+; SingleSource/Benchmarks/McGill/chomp.c
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+%struct._list = type { i32*, %struct._list* }
+
+@ncol = external global i32, align 4
+@nrow = external global i32, align 4
+
+declare noalias i32* @copy_data()
+
+declare noalias i8* @malloc(i64)
+
+declare i32 @get_value()
+
+declare i32 @in_wanted(i32* nocapture readonly)
+
+declare noalias i32* @make_data()
+
+; CHECK-LABEL: make_list:
+; Function prologue.
+; CHECK: pushq
+; CHECK: subq ${{[0-9]+}}, %rsp
+; Move the first argument (%data) into a temporary register.
+; It will not survive the call to malloc otherwise.
+; CHECK: movq %rdi, [[ARG1:%r[0-9a-z]+]]
+; CHECK: callq _malloc
+; Compute %data - 1 as used for load in land.rhs.i (via the variable  %indvars.iv.next.i).
+; CHECK: addq $-4, [[ARG1]]
+; We use to produce a useless copy here and move %data in another temporary register. 
+; CHECK-NOT: movq [[ARG1]]
+; End of the first basic block.
+; CHECK: .align
+; Now check that %data is used in an address computation.
+; CHECK: leaq ([[ARG1]]
+define %struct._list* @make_list(i32* nocapture readonly %data, i32* nocapture %value, i32* nocapture %all) {
+entry:
+  %call = tail call i8* @malloc(i64 16)
+  %next = getelementptr inbounds i8* %call, i64 8
+  %tmp = bitcast i8* %next to %struct._list**
+  %tmp2 = bitcast i8* %call to %struct._list*
+  %.pre78 = load i32* @ncol, align 4
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc32, %entry
+  %tmp4 = phi i32 [ %.pre78, %entry ], [ 0, %for.inc32 ]
+  %current.077 = phi %struct._list* [ %tmp2, %entry ], [ %current.1.lcssa, %for.inc32 ]
+  %cmp270 = icmp eq i32 %tmp4, 0
+  br i1 %cmp270, label %for.inc32, label %for.body3
+
+for.body3:                                        ; preds = %if.end31, %for.cond1.preheader
+  %current.173 = phi %struct._list* [ %current.2, %if.end31 ], [ %current.077, %for.cond1.preheader ]
+  %row.172 = phi i32 [ %row.3, %if.end31 ], [ 0, %for.cond1.preheader ]
+  %col.071 = phi i32 [ %inc, %if.end31 ], [ 0, %for.cond1.preheader ]
+  %call4 = tail call i32* @make_data()
+  %tmp5 = load i32* @ncol, align 4
+  %tobool14.i = icmp eq i32 %tmp5, 0
+  br i1 %tobool14.i, label %while.cond.i, label %while.body.lr.ph.i
+
+while.body.lr.ph.i:                               ; preds = %for.body3
+  %tmp6 = sext i32 %tmp5 to i64
+  br label %while.body.i
+
+while.body.i:                                     ; preds = %while.body.i, %while.body.lr.ph.i
+  %indvars.iv.i = phi i64 [ %tmp6, %while.body.lr.ph.i ], [ %indvars.iv.next.i, %while.body.i ]
+  %indvars.iv.next.i = add nsw i64 %indvars.iv.i, -1
+  %tmp9 = trunc i64 %indvars.iv.next.i to i32
+  %tobool.i = icmp eq i32 %tmp9, 0
+  br i1 %tobool.i, label %while.cond.i, label %while.body.i
+
+while.cond.i:                                     ; preds = %land.rhs.i, %while.body.i, %for.body3
+  %indvars.iv.i64 = phi i64 [ %indvars.iv.next.i65, %land.rhs.i ], [ 0, %for.body3 ], [ %tmp6, %while.body.i ]
+  %indvars.iv.next.i65 = add nsw i64 %indvars.iv.i64, -1
+  %tmp10 = trunc i64 %indvars.iv.i64 to i32
+  %tobool.i66 = icmp eq i32 %tmp10, 0
+  br i1 %tobool.i66, label %if.else, label %land.rhs.i
+
+land.rhs.i:                                       ; preds = %while.cond.i
+  %arrayidx.i67 = getelementptr inbounds i32* %call4, i64 %indvars.iv.next.i65
+  %tmp11 = load i32* %arrayidx.i67, align 4
+  %arrayidx2.i68 = getelementptr inbounds i32* %data, i64 %indvars.iv.next.i65
+  %tmp12 = load i32* %arrayidx2.i68, align 4
+  %cmp.i69 = icmp eq i32 %tmp11, %tmp12
+  br i1 %cmp.i69, label %while.cond.i, label %equal_data.exit
+
+equal_data.exit:                                  ; preds = %land.rhs.i
+  %cmp3.i = icmp slt i32 %tmp10, 1
+  br i1 %cmp3.i, label %if.else, label %if.then
+
+if.then:                                          ; preds = %equal_data.exit
+  %next7 = getelementptr inbounds %struct._list* %current.173, i64 0, i32 1
+  %tmp14 = load %struct._list** %next7, align 8
+  %next12 = getelementptr inbounds %struct._list* %tmp14, i64 0, i32 1
+  store %struct._list* null, %struct._list** %next12, align 8
+  %tmp15 = load %struct._list** %next7, align 8
+  %tmp16 = load i32* %value, align 4
+  %cmp14 = icmp eq i32 %tmp16, 1
+  %.tmp16 = select i1 %cmp14, i32 0, i32 %tmp16
+  %tmp18 = load i32* %all, align 4
+  %tmp19 = or i32 %tmp18, %.tmp16
+  %tmp20 = icmp eq i32 %tmp19, 0
+  br i1 %tmp20, label %if.then19, label %if.end31
+
+if.then19:                                        ; preds = %if.then
+  %call21 = tail call i32 @in_wanted(i32* %call4)
+  br label %if.end31
+
+if.else:                                          ; preds = %equal_data.exit, %while.cond.i
+  %cmp26 = icmp eq i32 %col.071, 0
+  %.row.172 = select i1 %cmp26, i32 0, i32 %row.172
+  %sub30 = add nsw i32 %tmp5, -1
+  br label %if.end31
+
+if.end31:                                         ; preds = %if.else, %if.then19, %if.then
+  %col.1 = phi i32 [ %sub30, %if.else ], [ 0, %if.then ], [ 0, %if.then19 ]
+  %row.3 = phi i32 [ %.row.172, %if.else ], [ %row.172, %if.then ], [ 0, %if.then19 ]
+  %current.2 = phi %struct._list* [ %current.173, %if.else ], [ %tmp15, %if.then ], [ %tmp15, %if.then19 ]
+  %inc = add nsw i32 %col.1, 1
+  %tmp25 = load i32* @ncol, align 4
+  %cmp2 = icmp eq i32 %inc, %tmp25
+  br i1 %cmp2, label %for.cond1.for.inc32_crit_edge, label %for.body3
+
+for.cond1.for.inc32_crit_edge:                    ; preds = %if.end31
+  %.pre79 = load i32* @nrow, align 4
+  br label %for.inc32
+
+for.inc32:                                        ; preds = %for.cond1.for.inc32_crit_edge, %for.cond1.preheader
+  %tmp26 = phi i32 [ %.pre79, %for.cond1.for.inc32_crit_edge ], [ 0, %for.cond1.preheader ]
+  %current.1.lcssa = phi %struct._list* [ %current.2, %for.cond1.for.inc32_crit_edge ], [ %current.077, %for.cond1.preheader ]
+  %row.1.lcssa = phi i32 [ %row.3, %for.cond1.for.inc32_crit_edge ], [ 0, %for.cond1.preheader ]
+  %inc33 = add nsw i32 %row.1.lcssa, 1
+  %cmp = icmp eq i32 %inc33, %tmp26
+  br i1 %cmp, label %for.end34, label %for.cond1.preheader
+
+for.end34:                                        ; preds = %for.inc32
+  %.pre = load %struct._list** %tmp, align 8
+  ret %struct._list* %.pre
+}
diff --git a/test/CodeGen/X86/remat-phys-dead.ll b/test/CodeGen/X86/remat-phys-dead.ll
index 4d7ee62..6cdcd28 100644
--- a/test/CodeGen/X86/remat-phys-dead.ll
+++ b/test/CodeGen/X86/remat-phys-dead.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc -mtriple=x86_64-apple-darwin -debug -o /dev/null < %s 2>&1 | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-apple-darwin -debug -o /dev/null < %s 2>&1 | FileCheck %s
 
 ; We need to make sure that rematerialization into a physical register marks the
 ; super- or sub-register as dead after this rematerialization since only the
diff --git a/test/CodeGen/X86/scalar_sse_minmax.ll b/test/CodeGen/X86/scalar_sse_minmax.ll
index bc4ab5d..5ca3f85 100644
--- a/test/CodeGen/X86/scalar_sse_minmax.ll
+++ b/test/CodeGen/X86/scalar_sse_minmax.ll
@@ -1,44 +1,53 @@
-; RUN: llc < %s -march=x86 -mattr=+sse,+sse2 | \
-; RUN:   grep mins | count 3
-; RUN: llc < %s -march=x86 -mattr=+sse,+sse2 | \
-; RUN:   grep maxs | count 2
-
-declare i1 @llvm.isunordered.f64(double, double)
-
-declare i1 @llvm.isunordered.f32(float, float)
+; RUN: llc < %s -march=x86 -mattr=+sse,+sse2 | FileCheck %s
 
 define float @min1(float %x, float %y) {
-	%tmp = fcmp olt float %x, %y		; <i1> [#uses=1]
-	%retval = select i1 %tmp, float %x, float %y		; <float> [#uses=1]
+; CHECK-LABEL: min1
+; CHECK: mins
+	%tmp = fcmp olt float %x, %y
+	%retval = select i1 %tmp, float %x, float %y
 	ret float %retval
 }
 
 define double @min2(double %x, double %y) {
-	%tmp = fcmp olt double %x, %y		; <i1> [#uses=1]
-	%retval = select i1 %tmp, double %x, double %y		; <double> [#uses=1]
+; CHECK-LABEL: min2
+; CHECK: mins
+	%tmp = fcmp olt double %x, %y
+	%retval = select i1 %tmp, double %x, double %y
 	ret double %retval
 }
 
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
+define <4 x float> @min3(float %x, float %y) {
+; CHECK-LABEL: min3
+; CHECK: mins
+	%vec0 = insertelement <4 x float> undef, float %x, i32 0
+	%vec1 = insertelement <4 x float> undef, float %y, i32 0
+	%retval = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %vec0, <4 x float> %vec1)
+	ret <4 x float> %retval
+}
+
 define float @max1(float %x, float %y) {
-	%tmp = fcmp oge float %x, %y		; <i1> [#uses=1]
-	%tmp2 = fcmp uno float %x, %y		; <i1> [#uses=1]
-	%tmp3 = or i1 %tmp2, %tmp		; <i1> [#uses=1]
-	%retval = select i1 %tmp3, float %x, float %y		; <float> [#uses=1]
+; CHECK-LABEL: max1
+; CHECK: maxs
+	%tmp = fcmp uge float %x, %y
+	%retval = select i1 %tmp, float %x, float %y
 	ret float %retval
 }
 
 define double @max2(double %x, double %y) {
-	%tmp = fcmp oge double %x, %y		; <i1> [#uses=1]
-	%tmp2 = fcmp uno double %x, %y		; <i1> [#uses=1]
-	%tmp3 = or i1 %tmp2, %tmp		; <i1> [#uses=1]
-	%retval = select i1 %tmp3, double %x, double %y		; <double> [#uses=1]
+; CHECK-LABEL: max2
+; CHECK: maxs
+	%tmp = fcmp uge double %x, %y
+	%retval = select i1 %tmp, double %x, double %y
 	ret double %retval
 }
 
-define <4 x float> @min3(float %tmp37) {
-	%tmp375 = insertelement <4 x float> undef, float %tmp37, i32 0		; <<4 x float>> [#uses=1]
-	%tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > )		; <<4 x float>> [#uses=1]
-	ret <4 x float> %tmp48
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
+define <4 x float> @max3(float %x, float %y) {
+; CHECK-LABEL: max3
+; CHECK: maxs
+	%vec0 = insertelement <4 x float> undef, float %x, i32 0
+	%vec1 = insertelement <4 x float> undef, float %y, i32 0
+	%retval = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %vec0, <4 x float> %vec1)
+	ret <4 x float> %retval
 }
-
-declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
diff --git a/test/CodeGen/X86/scev-interchange.ll b/test/CodeGen/X86/scev-interchange.ll
index 71a4d21..0e7047b 100644
--- a/test/CodeGen/X86/scev-interchange.ll
+++ b/test/CodeGen/X86/scev-interchange.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64-linux
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 	%"struct.DataOutBase::GmvFlags" = type { i32 }
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 2db7c11..3e47121 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -1,10 +1,13 @@
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux  -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -code-model=large -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux-Large
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=X32ABI
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X32-Darwin
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -verify-machineinstrs | FileCheck %s -check-prefix=X64-FreeBSD
+; RUN: llc < %s -mcpu=generic -mtriple=i686-dragonfly -verify-machineinstrs | FileCheck %s -check-prefix=X32-DFlyBSD
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-dragonfly -verify-machineinstrs | FileCheck %s -check-prefix=X64-DFlyBSD
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X64-MinGW
 
 ; We used to crash with filetype=obj
@@ -15,6 +18,8 @@
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=i686-dragonfly -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-dragonfly -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -filetype=obj
 
 ; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-solaris 2> %t.log
@@ -53,6 +58,16 @@ define void @test_basic() #0 {
 ; X64-Linux-NEXT:  callq __morestack
 ; X64-Linux-NEXT:  ret
 
+; X64-Linux-Large-LABEL:       test_basic:
+
+; X64-Linux-Large:       cmpq %fs:112, %rsp
+; X64-Linux-Large-NEXT:  ja      .LBB0_2
+
+; X64-Linux-Large:       movabsq $40, %r10
+; X64-Linux-Large-NEXT:  movabsq $0, %r11
+; X64-Linux-Large-NEXT:  callq *__morestack_addr(%rip)
+; X64-Linux-Large-NEXT:  ret
+
 ; X32ABI-LABEL:       test_basic:
 
 ; X32ABI:       cmpl %fs:64, %esp
@@ -114,6 +129,26 @@ define void @test_basic() #0 {
 ; X64-FreeBSD-NEXT:  callq __morestack
 ; X64-FreeBSD-NEXT:  ret
 
+; X32-DFlyBSD-LABEL:       test_basic:
+
+; X32-DFlyBSD:       cmpl %fs:16, %esp
+; X32-DFlyBSD-NEXT:  ja      .LBB0_2
+
+; X32-DFlyBSD:       pushl $0
+; X32-DFlyBSD-NEXT:  pushl $48
+; X32-DFlyBSD-NEXT:  calll __morestack
+; X32-DFlyBSD-NEXT:  ret
+
+; X64-DFlyBSD-LABEL:       test_basic:
+
+; X64-DFlyBSD:       cmpq %fs:32, %rsp
+; X64-DFlyBSD-NEXT:  ja      .LBB0_2
+
+; X64-DFlyBSD:       movabsq $40, %r10
+; X64-DFlyBSD-NEXT:  movabsq $0, %r11
+; X64-DFlyBSD-NEXT:  callq __morestack
+; X64-DFlyBSD-NEXT:  ret
+
 }
 
 define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
@@ -199,6 +234,24 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X64-FreeBSD-NEXT:  ret
 ; X64-FreeBSD-NEXT:  movq %rax, %r10
 
+; X32-DFlyBSD:       cmpl %fs:16, %esp
+; X32-DFlyBSD-NEXT:  ja      .LBB1_2
+
+; X32-DFlyBSD:       pushl $4
+; X32-DFlyBSD-NEXT:  pushl $52
+; X32-DFlyBSD-NEXT:  calll __morestack
+; X32-DFlyBSD-NEXT:  ret
+
+; X64-DFlyBSD:       cmpq %fs:32, %rsp
+; X64-DFlyBSD-NEXT:  ja      .LBB1_2
+
+; X64-DFlyBSD:       movq %r10, %rax
+; X64-DFlyBSD-NEXT:  movabsq $56, %r10
+; X64-DFlyBSD-NEXT:  movabsq $0, %r11
+; X64-DFlyBSD-NEXT:  callq __morestack
+; X64-DFlyBSD-NEXT:  ret
+; X64-DFlyBSD-NEXT:  movq %rax, %r10
+
 }
 
 define void @test_large() #0 {
@@ -280,6 +333,24 @@ define void @test_large() #0 {
 ; X64-FreeBSD-NEXT:  callq __morestack
 ; X64-FreeBSD-NEXT:  ret
 
+; X32-DFlyBSD:       leal -40008(%esp), %ecx
+; X32-DFlyBSD-NEXT:  cmpl %fs:16, %ecx
+; X32-DFlyBSD-NEXT:  ja      .LBB2_2
+
+; X32-DFlyBSD:       pushl $0
+; X32-DFlyBSD-NEXT:  pushl $40008
+; X32-DFlyBSD-NEXT:  calll __morestack
+; X32-DFlyBSD-NEXT:  ret
+
+; X64-DFlyBSD:       leaq -40008(%rsp), %r11
+; X64-DFlyBSD-NEXT:  cmpq %fs:32, %r11
+; X64-DFlyBSD-NEXT:  ja      .LBB2_2
+
+; X64-DFlyBSD:       movabsq $40008, %r10
+; X64-DFlyBSD-NEXT:  movabsq $0, %r11
+; X64-DFlyBSD-NEXT:  callq __morestack
+; X64-DFlyBSD-NEXT:  ret
+
 }
 
 define fastcc void @test_fastcc() #0 {
@@ -368,6 +439,26 @@ define fastcc void @test_fastcc() #0 {
 ; X64-FreeBSD-NEXT:  callq __morestack
 ; X64-FreeBSD-NEXT:  ret
 
+; X32-DFlyBSD-LABEL:       test_fastcc:
+
+; X32-DFlyBSD:       cmpl %fs:16, %esp
+; X32-DFlyBSD-NEXT:  ja      .LBB3_2
+
+; X32-DFlyBSD:       pushl $0
+; X32-DFlyBSD-NEXT:  pushl $48
+; X32-DFlyBSD-NEXT:  calll __morestack
+; X32-DFlyBSD-NEXT:  ret
+
+; X64-DFlyBSD-LABEL:       test_fastcc:
+
+; X64-DFlyBSD:       cmpq %fs:32, %rsp
+; X64-DFlyBSD-NEXT:  ja      .LBB3_2
+
+; X64-DFlyBSD:       movabsq $40, %r10
+; X64-DFlyBSD-NEXT:  movabsq $0, %r11
+; X64-DFlyBSD-NEXT:  callq __morestack
+; X64-DFlyBSD-NEXT:  ret
+
 }
 
 define fastcc void @test_fastcc_large() #0 {
@@ -464,6 +555,28 @@ define fastcc void @test_fastcc_large() #0 {
 ; X64-FreeBSD-NEXT:  callq __morestack
 ; X64-FreeBSD-NEXT:  ret
 
+; X32-DFlyBSD-LABEL:       test_fastcc_large:
+
+; X32-DFlyBSD:       leal -40008(%esp), %eax
+; X32-DFlyBSD-NEXT:  cmpl %fs:16, %eax
+; X32-DFlyBSD-NEXT:  ja      .LBB4_2
+
+; X32-DFlyBSD:       pushl $0
+; X32-DFlyBSD-NEXT:  pushl $40008
+; X32-DFlyBSD-NEXT:  calll __morestack
+; X32-DFlyBSD-NEXT:  ret
+
+; X64-DFlyBSD-LABEL:       test_fastcc_large:
+
+; X64-DFlyBSD:       leaq -40008(%rsp), %r11
+; X64-DFlyBSD-NEXT:  cmpq %fs:32, %r11
+; X64-DFlyBSD-NEXT:  ja      .LBB4_2
+
+; X64-DFlyBSD:       movabsq $40008, %r10
+; X64-DFlyBSD-NEXT:  movabsq $0, %r11
+; X64-DFlyBSD-NEXT:  callq __morestack
+; X64-DFlyBSD-NEXT:  ret
+
 }
 
 define fastcc void @test_fastcc_large_with_ecx_arg(i32 %a) #0 {
@@ -515,6 +628,16 @@ define void @test_nostack() #0 {
 
 ; X64-FreeBSD-LABEL: test_nostack:
 ; X64-FreeBSD-NOT:   callq __morestack
+
+; X32-DFlyBSD-LABEL: test_nostack:
+; X32-DFlyBSD-NOT:   calll __morestack
+
+; X64-DFlyBSD-LABEL: test_nostack:
+; X64-DFlyBSD-NOT:   callq __morestack
 }
 
 attributes #0 = { "split-stack" }
+
+; X64-Linux-Large: .rodata
+; X64-Linux-Large-NEXT: __morestack_addr:
+; X64-Linux-Large-NEXT: .quad	__morestack
diff --git a/test/CodeGen/X86/seh-basic.ll b/test/CodeGen/X86/seh-basic.ll
new file mode 100644
index 0000000..69d70d7
--- /dev/null
+++ b/test/CodeGen/X86/seh-basic.ll
@@ -0,0 +1,175 @@
+; RUN: llc -mtriple x86_64-pc-windows-msvc < %s | FileCheck %s
+
+define void @two_invoke_merged() {
+entry:
+  invoke void @try_body()
+          to label %again unwind label %lpad
+
+again:
+  invoke void @try_body()
+          to label %done unwind label %lpad
+
+done:
+  ret void
+
+lpad:
+  %vals = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          catch i8* bitcast (i32 (i8*, i8*)* @filt0 to i8*)
+          catch i8* bitcast (i32 (i8*, i8*)* @filt1 to i8*)
+  %sel = extractvalue { i8*, i32 } %vals, 1
+  call void @use_selector(i32 %sel)
+  ret void
+}
+
+; Normal path code
+
+; CHECK-LABEL: {{^}}two_invoke_merged:
+; CHECK: .seh_proc two_invoke_merged
+; CHECK: .seh_handler __C_specific_handler, @unwind, @except
+; CHECK: .Ltmp0:
+; CHECK: callq try_body
+; CHECK-NEXT: .Ltmp1:
+; CHECK: .Ltmp2:
+; CHECK: callq try_body
+; CHECK-NEXT: .Ltmp3:
+; CHECK: retq
+
+; Landing pad code
+
+; CHECK: .Ltmp5:
+; CHECK: movl $1, %ecx
+; CHECK: jmp
+; CHECK: .Ltmp6:
+; CHECK: movl $2, %ecx
+; CHECK: callq use_selector
+
+; CHECK: .seh_handlerdata
+; CHECK-NEXT: .long 2
+; CHECK-NEXT: .long .Ltmp0@IMGREL
+; CHECK-NEXT: .long .Ltmp3@IMGREL+1
+; CHECK-NEXT: .long filt0@IMGREL
+; CHECK-NEXT: .long .Ltmp5@IMGREL
+; CHECK-NEXT: .long .Ltmp0@IMGREL
+; CHECK-NEXT: .long .Ltmp3@IMGREL+1
+; CHECK-NEXT: .long filt1@IMGREL
+; CHECK-NEXT: .long .Ltmp6@IMGREL
+; CHECK: .text
+; CHECK: .seh_endproc
+
+define void @two_invoke_gap() {
+entry:
+  invoke void @try_body()
+          to label %again unwind label %lpad
+
+again:
+  call void @do_nothing_on_unwind()
+  invoke void @try_body()
+          to label %done unwind label %lpad
+
+done:
+  ret void
+
+lpad:
+  %vals = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          catch i8* bitcast (i32 (i8*, i8*)* @filt0 to i8*)
+  %sel = extractvalue { i8*, i32 } %vals, 1
+  call void @use_selector(i32 %sel)
+  ret void
+}
+
+; Normal path code
+
+; CHECK-LABEL: {{^}}two_invoke_gap:
+; CHECK: .seh_proc two_invoke_gap
+; CHECK: .seh_handler __C_specific_handler, @unwind, @except
+; CHECK: .Ltmp11:
+; CHECK: callq try_body
+; CHECK-NEXT: .Ltmp12:
+; CHECK: callq do_nothing_on_unwind
+; CHECK: .Ltmp13:
+; CHECK: callq try_body
+; CHECK-NEXT: .Ltmp14:
+; CHECK: retq
+
+; Landing pad code
+
+; CHECK: .Ltmp16:
+; CHECK: movl $1, %ecx
+; CHECK: callq use_selector
+
+; CHECK: .seh_handlerdata
+; CHECK-NEXT: .long 2
+; CHECK-NEXT: .long .Ltmp11@IMGREL
+; CHECK-NEXT: .long .Ltmp12@IMGREL+1
+; CHECK-NEXT: .long filt0@IMGREL
+; CHECK-NEXT: .long .Ltmp16@IMGREL
+; CHECK-NEXT: .long .Ltmp13@IMGREL
+; CHECK-NEXT: .long .Ltmp14@IMGREL+1
+; CHECK-NEXT: .long filt0@IMGREL
+; CHECK-NEXT: .long .Ltmp16@IMGREL
+; CHECK: .text
+; CHECK: .seh_endproc
+
+define void @two_invoke_nounwind_gap() {
+entry:
+  invoke void @try_body()
+          to label %again unwind label %lpad
+
+again:
+  call void @cannot_unwind()
+  invoke void @try_body()
+          to label %done unwind label %lpad
+
+done:
+  ret void
+
+lpad:
+  %vals = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          catch i8* bitcast (i32 (i8*, i8*)* @filt0 to i8*)
+  %sel = extractvalue { i8*, i32 } %vals, 1
+  call void @use_selector(i32 %sel)
+  ret void
+}
+
+; Normal path code
+
+; CHECK-LABEL: {{^}}two_invoke_nounwind_gap:
+; CHECK: .seh_proc two_invoke_nounwind_gap
+; CHECK: .seh_handler __C_specific_handler, @unwind, @except
+; CHECK: .Ltmp21:
+; CHECK: callq try_body
+; CHECK-NEXT: .Ltmp22:
+; CHECK: callq cannot_unwind
+; CHECK: .Ltmp23:
+; CHECK: callq try_body
+; CHECK-NEXT: .Ltmp24:
+; CHECK: retq
+
+; Landing pad code
+
+; CHECK: .Ltmp26:
+; CHECK: movl $1, %ecx
+; CHECK: callq use_selector
+
+; CHECK: .seh_handlerdata
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long .Ltmp21@IMGREL
+; CHECK-NEXT: .long .Ltmp24@IMGREL+1
+; CHECK-NEXT: .long filt0@IMGREL
+; CHECK-NEXT: .long .Ltmp26@IMGREL
+; CHECK: .text
+; CHECK: .seh_endproc
+
+declare void @try_body()
+declare void @do_nothing_on_unwind()
+declare void @cannot_unwind() nounwind
+declare void @use_selector(i32)
+
+declare i32 @filt0(i8* %eh_info, i8* %rsp)
+declare i32 @filt1(i8* %eh_info, i8* %rsp)
+
+declare void @handler0()
+declare void @handler1()
+
+declare i32 @__C_specific_handler(...)
+declare i32 @llvm.eh.typeid.for(i8*) readnone nounwind
diff --git a/test/CodeGen/X86/seh-catch-all.ll b/test/CodeGen/X86/seh-catch-all.ll
new file mode 100644
index 0000000..8e1eb55
--- /dev/null
+++ b/test/CodeGen/X86/seh-catch-all.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s
+
+@str = internal unnamed_addr constant [10 x i8] c"recovered\00", align 1
+
+declare i32 @__C_specific_handler(...)
+declare void @crash()
+declare i32 @puts(i8*)
+
+define i32 @main() {
+entry:
+  invoke void @crash()
+          to label %__try.cont unwind label %lpad
+
+lpad:
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          catch i8* null
+  call i32 @puts(i8* getelementptr inbounds ([10 x i8]* @str, i64 0, i64 0))
+  br label %__try.cont
+
+__try.cont:
+  ret i32 0
+
+eh.resume:
+  resume { i8*, i32 } %0
+}
+
+; CHECK-LABEL: main:
+; CHECK: .seh_handlerdata
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .Ltmp{{[0-9]+}}@IMGREL
+; CHECK-NEXT: .Ltmp{{[0-9]+}}@IMGREL+1
+; CHECK-NEXT: 1
+; CHECK-NEXT: .Ltmp{{[0-9]+}}@IMGREL
diff --git a/test/CodeGen/X86/seh-filter.ll b/test/CodeGen/X86/seh-filter.ll
new file mode 100644
index 0000000..6a3a23e
--- /dev/null
+++ b/test/CodeGen/X86/seh-filter.ll
@@ -0,0 +1,21 @@
+; RUN: llc -O0 -mtriple=x86_64-windows-msvc < %s | FileCheck %s
+
+declare void @g()
+define void @f() {
+  invoke void @g() to label %return unwind label %lpad
+
+return:
+  ret void
+
+lpad:
+  %ehptrs = landingpad {i8*, i32} personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+    filter [0 x i8*] zeroinitializer
+  call void @__cxa_call_unexpected(i8* null)
+  unreachable
+}
+declare i32 @__C_specific_handler(...)
+declare void @__cxa_call_unexpected(i8*)
+
+; We don't emit entries for filters.
+; CHECK: .seh_handlerdata
+; CHECK: .long 0
diff --git a/test/CodeGen/X86/seh-finally.ll b/test/CodeGen/X86/seh-finally.ll
new file mode 100755
index 0000000..d883663
--- /dev/null
+++ b/test/CodeGen/X86/seh-finally.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s
+
+@str_recovered = internal unnamed_addr constant [10 x i8] c"recovered\00", align 1
+
+declare void @crash()
+
+define i32 @main() {
+entry:
+  invoke void @crash()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  %call = call i32 @puts(i8* getelementptr inbounds ([10 x i8]* @str_recovered, i64 0, i64 0))
+  call void @abort()
+  ret i32 0
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          cleanup
+  %1 = extractvalue { i8*, i32 } %0, 0
+  %2 = extractvalue { i8*, i32 } %0, 1
+  %call2 = invoke i32 @puts(i8* getelementptr inbounds ([10 x i8]* @str_recovered, i64 0, i64 0))
+          to label %invoke.cont1 unwind label %terminate.lpad
+
+invoke.cont1:                                     ; preds = %lpad
+  resume { i8*, i32 } %0
+
+terminate.lpad:                                   ; preds = %lpad
+  %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          catch i8* null
+  call void @abort()
+  unreachable
+}
+
+; CHECK: main:
+
+; FIXME: No handlers yet!
+; CHECK: .seh_handlerdata
+; CHECK-NEXT: .long 0
+
+declare i32 @__C_specific_handler(...)
+
+declare i32 @puts(i8*)
+
+declare void @abort()
diff --git a/test/CodeGen/X86/seh-safe-div.ll b/test/CodeGen/X86/seh-safe-div.ll
new file mode 100644
index 0000000..e294f24
--- /dev/null
+++ b/test/CodeGen/X86/seh-safe-div.ll
@@ -0,0 +1,197 @@
+; RUN: llc -mtriple x86_64-pc-windows-msvc < %s | FileCheck %s
+
+; This test case is also intended to be run manually as a complete functional
+; test. It should link, print something, and exit zero rather than crashing.
+; It is the hypothetical lowering of a C source program that looks like:
+;
+;   int safe_div(int *n, int *d) {
+;     int r;
+;     __try {
+;       __try {
+;         r = *n / *d;
+;       } __except(GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION) {
+;         puts("EXCEPTION_ACCESS_VIOLATION");
+;         r = -1;
+;       }
+;     } __except(GetExceptionCode() == EXCEPTION_INT_DIVIDE_BY_ZERO) {
+;       puts("EXCEPTION_INT_DIVIDE_BY_ZERO");
+;       r = -2;
+;     }
+;     return r;
+;   }
+
+@str1 = internal constant [27 x i8] c"EXCEPTION_ACCESS_VIOLATION\00"
+@str2 = internal constant [29 x i8] c"EXCEPTION_INT_DIVIDE_BY_ZERO\00"
+
+define i32 @safe_div(i32* %n, i32* %d) {
+entry:
+  %r = alloca i32, align 4
+  invoke void @try_body(i32* %r, i32* %n, i32* %d)
+          to label %__try.cont unwind label %lpad
+
+lpad:
+  %vals = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          catch i8* bitcast (i32 (i8*, i8*)* @safe_div_filt0 to i8*)
+          catch i8* bitcast (i32 (i8*, i8*)* @safe_div_filt1 to i8*)
+  %ehptr = extractvalue { i8*, i32 } %vals, 0
+  %sel = extractvalue { i8*, i32 } %vals, 1
+  %filt0_val = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 (i8*, i8*)* @safe_div_filt0 to i8*))
+  %is_filt0 = icmp eq i32 %sel, %filt0_val
+  br i1 %is_filt0, label %handler0, label %eh.dispatch1
+
+eh.dispatch1:
+  %filt1_val = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 (i8*, i8*)* @safe_div_filt1 to i8*))
+  %is_filt1 = icmp eq i32 %sel, %filt1_val
+  br i1 %is_filt1, label %handler1, label %eh.resume
+
+handler0:
+  call void @puts(i8* getelementptr ([27 x i8]* @str1, i32 0, i32 0))
+  store i32 -1, i32* %r, align 4
+  br label %__try.cont
+
+handler1:
+  call void @puts(i8* getelementptr ([29 x i8]* @str2, i32 0, i32 0))
+  store i32 -2, i32* %r, align 4
+  br label %__try.cont
+
+eh.resume:
+  resume { i8*, i32 } %vals
+
+__try.cont:
+  %safe_ret = load i32* %r, align 4
+  ret i32 %safe_ret
+}
+
+; Normal path code
+
+; CHECK: {{^}}safe_div:
+; CHECK: .seh_proc safe_div
+; CHECK: .seh_handler __C_specific_handler, @unwind, @except
+; CHECK: .Ltmp0:
+; CHECK: leaq [[rloc:.*\(%rsp\)]], %rcx
+; CHECK: callq try_body
+; CHECK-NEXT: .Ltmp1
+; CHECK: .LBB0_7:
+; CHECK: movl [[rloc]], %eax
+; CHECK: retq
+
+; Landing pad code
+
+; CHECK: .Ltmp3:
+; CHECK: movl $1, %[[sel:[a-z]+]]
+; CHECK: .Ltmp4
+; CHECK: movl $2, %[[sel]]
+; CHECK: .L{{.*}}:
+; CHECK: cmpl $1, %[[sel]]
+
+; CHECK: # %handler0
+; CHECK: callq puts
+; CHECK: movl $-1, [[rloc]]
+; CHECK: jmp .LBB0_7
+
+; CHECK: cmpl $2, %[[sel]]
+
+; CHECK: # %handler1
+; CHECK: callq puts
+; CHECK: movl $-2, [[rloc]]
+; CHECK: jmp .LBB0_7
+
+; FIXME: EH preparation should eliminate the 'resume' instr and we should not do
+; the previous 'cmp;jeq'.
+; CHECK-NOT: _Unwind_Resume
+; CHECK: ud2
+
+; CHECK: .seh_handlerdata
+; CHECK: .long 2
+; CHECK: .long .Ltmp0@IMGREL
+; CHECK: .long .Ltmp1@IMGREL+1
+; CHECK: .long safe_div_filt0@IMGREL
+; CHECK: .long .Ltmp3@IMGREL
+; CHECK: .long .Ltmp0@IMGREL
+; CHECK: .long .Ltmp1@IMGREL+1
+; CHECK: .long safe_div_filt1@IMGREL
+; CHECK: .long .Ltmp4@IMGREL
+; CHECK: .text
+; CHECK: .seh_endproc
+
+
+define void @try_body(i32* %r, i32* %n, i32* %d) {
+entry:
+  %0 = load i32* %n, align 4
+  %1 = load i32* %d, align 4
+  %div = sdiv i32 %0, %1
+  store i32 %div, i32* %r, align 4
+  ret void
+}
+
+; The prototype of these filter functions is:
+; int filter(EXCEPTION_POINTERS *eh_ptrs, void *rbp);
+
+; The definition of EXCEPTION_POINTERS is:
+;   typedef struct _EXCEPTION_POINTERS {
+;     EXCEPTION_RECORD *ExceptionRecord;
+;     CONTEXT          *ContextRecord;
+;   } EXCEPTION_POINTERS;
+
+; The definition of EXCEPTION_RECORD is:
+;   typedef struct _EXCEPTION_RECORD {
+;     DWORD ExceptionCode;
+;     ...
+;   } EXCEPTION_RECORD;
+
+; The exception code can be retreived with two loads, one for the record
+; pointer and one for the code.  The values of local variables can be
+; accessed via rbp, but that would require additional not yet implemented LLVM
+; support.
+
+define i32 @safe_div_filt0(i8* %eh_ptrs, i8* %rbp) {
+  %eh_ptrs_c = bitcast i8* %eh_ptrs to i32**
+  %eh_rec = load i32** %eh_ptrs_c
+  %eh_code = load i32* %eh_rec
+  ; EXCEPTION_ACCESS_VIOLATION = 0xC0000005
+  %cmp = icmp eq i32 %eh_code, 3221225477
+  %filt.res = zext i1 %cmp to i32
+  ret i32 %filt.res
+}
+
+define i32 @safe_div_filt1(i8* %eh_ptrs, i8* %rbp) {
+  %eh_ptrs_c = bitcast i8* %eh_ptrs to i32**
+  %eh_rec = load i32** %eh_ptrs_c
+  %eh_code = load i32* %eh_rec
+  ; EXCEPTION_INT_DIVIDE_BY_ZERO = 0xC0000094
+  %cmp = icmp eq i32 %eh_code, 3221225620
+  %filt.res = zext i1 %cmp to i32
+  ret i32 %filt.res
+}
+
+@str_result = internal constant [21 x i8] c"safe_div result: %d\0A\00"
+
+define i32 @main() {
+  %d.addr = alloca i32, align 4
+  %n.addr = alloca i32, align 4
+
+  store i32 10, i32* %n.addr, align 4
+  store i32 2, i32* %d.addr, align 4
+  %r1 = call i32 @safe_div(i32* %n.addr, i32* %d.addr)
+  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8]* @str_result, i32 0, i32 0), i32 %r1)
+
+  store i32 10, i32* %n.addr, align 4
+  store i32 0, i32* %d.addr, align 4
+  %r2 = call i32 @safe_div(i32* %n.addr, i32* %d.addr)
+  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8]* @str_result, i32 0, i32 0), i32 %r2)
+
+  %r3 = call i32 @safe_div(i32* %n.addr, i32* null)
+  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8]* @str_result, i32 0, i32 0), i32 %r3)
+  ret i32 0
+}
+
+define void @_Unwind_Resume() {
+  call void @abort()
+  unreachable
+}
+
+declare i32 @__C_specific_handler(...)
+declare i32 @llvm.eh.typeid.for(i8*) readnone nounwind
+declare void @puts(i8*)
+declare void @printf(i8*, ...)
+declare void @abort()
diff --git a/test/CodeGen/X86/selectiondag-crash.ll b/test/CodeGen/X86/selectiondag-crash.ll
new file mode 100644
index 0000000..9978902
--- /dev/null
+++ b/test/CodeGen/X86/selectiondag-crash.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s
+
+; Check that llc doesn't crash in the attempt to fold a shuffle with
+; a splat mask into a constant build_vector.
+
+define <8 x i8> @autogen_SD26299(i8) {
+BB:
+  %Shuff = shufflevector <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer, <8 x i32> <i32 2, i32 undef, i32 6, i32 8, i32 undef, i32 12, i32 14, i32 0>
+  %Shuff14 = shufflevector <8 x i32> %Shuff, <8 x i32> %Shuff, <8 x i32> <i32 7, i32 9, i32 11, i32 undef, i32 undef, i32 1, i32 3, i32 5>
+  %Shuff35 = shufflevector <8 x i32> %Shuff14, <8 x i32> %Shuff, <8 x i32> <i32 undef, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
+  %I42 = insertelement <8 x i32> %Shuff35, i32 88608, i32 0
+  %Shuff48 = shufflevector <8 x i32> %Shuff35, <8 x i32> %I42, <8 x i32> <i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 0, i32 2>
+  %Tr59 = trunc <8 x i32> %Shuff48 to <8 x i8>
+  ret <8 x i8> %Tr59
+}
diff --git a/test/CodeGen/X86/shrink-compare.ll b/test/CodeGen/X86/shrink-compare.ll
index fc7ee06..4ddef4c 100644
--- a/test/CodeGen/X86/shrink-compare.ll
+++ b/test/CodeGen/X86/shrink-compare.ll
@@ -89,3 +89,151 @@ if.end:
 ; CHECK-NOT: cmpl $1,{{.*}}x+4
 ; CHECK: ret
 }
+
+; CHECK-LABEL: test2_1:
+; CHECK: movzbl
+; CHECK: cmpl $256
+; CHECK: jne
+define void @test2_1(i32 %X) nounwind minsize {
+entry:
+  %and = and i32 %X, 255
+  %cmp = icmp eq i32 %and, 256
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_1:
+; CHECK: cmpb $1, %{{dil|cl}}
+define void @test_sext_i8_icmp_1(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_47:
+; CHECK: cmpb $47, %{{dil|cl}}
+define void @test_sext_i8_icmp_47(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 47
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_127:
+; CHECK: cmpb $127, %{{dil|cl}}
+define void @test_sext_i8_icmp_127(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 127
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_neg1:
+; CHECK: cmpb $-1, %{{dil|cl}}
+define void @test_sext_i8_icmp_neg1(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_neg2:
+; CHECK: cmpb $-2, %{{dil|cl}}
+define void @test_sext_i8_icmp_neg2(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_neg127:
+; CHECK: cmpb $-127, %{{dil|cl}}
+define void @test_sext_i8_icmp_neg127(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -127
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_neg128:
+; CHECK: cmpb $-128, %{{dil|cl}}
+define void @test_sext_i8_icmp_neg128(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -128
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_255:
+; CHECK: movb $1,
+; CHECK: testb
+; CHECK: jne
+define void @test_sext_i8_icmp_255(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 255
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
diff --git a/test/CodeGen/X86/sibcall-4.ll b/test/CodeGen/X86/sibcall-4.ll
index 980b0f7..2c7f51d 100644
--- a/test/CodeGen/X86/sibcall-4.ll
+++ b/test/CodeGen/X86/sibcall-4.ll
@@ -1,13 +1,13 @@
 ; RUN: llc < %s -mtriple=i386-pc-linux-gnu | FileCheck %s
 ; pr7610
 
-define cc10 void @t(i32* %Base_Arg, i32* %Sp_Arg, i32* %Hp_Arg, i32 %R1_Arg) nounwind {
+define ghccc void @t(i32* %Base_Arg, i32* %Sp_Arg, i32* %Hp_Arg, i32 %R1_Arg) nounwind {
 cm1:
 ; CHECK-LABEL: t:
 ; CHECK: jmpl *%eax
   %nm3 = getelementptr i32* %Sp_Arg, i32 1
   %nm9 = load i32* %Sp_Arg
   %nma = inttoptr i32 %nm9 to void (i32*, i32*, i32*, i32)*
-  tail call cc10 void %nma(i32* %Base_Arg, i32* %nm3, i32* %Hp_Arg, i32 %R1_Arg) nounwind
+  tail call ghccc void %nma(i32* %Base_Arg, i32* %nm3, i32* %Hp_Arg, i32 %R1_Arg) nounwind
   ret void
 }
diff --git a/test/CodeGen/X86/sibcall-5.ll b/test/CodeGen/X86/sibcall-5.ll
index c04af23..b065cce 100644
--- a/test/CodeGen/X86/sibcall-5.ll
+++ b/test/CodeGen/X86/sibcall-5.ll
@@ -62,4 +62,4 @@ declare i8* @objc_msgSend(i8*, i8*, ...)
 
 declare double @floor(double) optsize
 
-!0 = metadata !{}
+!0 = !{}
diff --git a/test/CodeGen/X86/sibcall-win64.ll b/test/CodeGen/X86/sibcall-win64.ll
new file mode 100644
index 0000000..f703872
--- /dev/null
+++ b/test/CodeGen/X86/sibcall-win64.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s
+
+declare x86_64_win64cc void @win64_callee(i32)
+declare void @sysv_callee(i32)
+
+define void @sysv_caller(i32 %p1) {
+entry:
+  tail call x86_64_win64cc void @win64_callee(i32 %p1)
+  ret void
+}
+
+; CHECK-LABEL: sysv_caller:
+; CHECK: subq $40, %rsp
+; CHECK: callq win64_callee
+; CHECK: addq $40, %rsp
+; CHECK: retq
+
+define x86_64_win64cc void @win64_caller(i32 %p1) {
+entry:
+  tail call void @sysv_callee(i32 %p1)
+  ret void
+}
+
+; CHECK-LABEL: win64_caller:
+; CHECK: callq sysv_callee
+; CHECK: retq
+
+define void @sysv_matched(i32 %p1) {
+  tail call void @sysv_callee(i32 %p1)
+  ret void
+}
+
+; CHECK-LABEL: sysv_matched:
+; CHECK: jmp sysv_callee # TAILCALL
+
+define x86_64_win64cc void @win64_matched(i32 %p1) {
+  tail call x86_64_win64cc void @win64_callee(i32 %p1)
+  ret void
+}
+
+; CHECK-LABEL: win64_matched:
+; CHECK: jmp win64_callee # TAILCALL
diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll
index 28fc626..4256f9e 100644
--- a/test/CodeGen/X86/sibcall.ll
+++ b/test/CodeGen/X86/sibcall.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=i686-linux   -mcpu=core2 -mattr=+sse2 -asm-verbose=false | FileCheck %s -check-prefix=32
 ; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 -mattr=+sse2 -asm-verbose=false | FileCheck %s -check-prefix=64
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -mcpu=core2 -mattr=+sse2 -asm-verbose=false | FileCheck %s -check-prefix=X32ABI
 
 define void @t1(i32 %x) nounwind ssp {
 entry:
@@ -8,6 +9,9 @@ entry:
 
 ; 64-LABEL: t1:
 ; 64: jmp {{_?}}foo
+
+; X32ABI-LABEL: t1:
+; X32ABI: jmp {{_?}}foo
   tail call void @foo() nounwind
   ret void
 }
@@ -21,6 +25,9 @@ entry:
 
 ; 64-LABEL: t2:
 ; 64: jmp {{_?}}foo2
+
+; X32ABI-LABEL: t2:
+; X32ABI: jmp {{_?}}foo2
   %0 = tail call i32 @foo2() nounwind
   ret void
 }
@@ -34,6 +41,9 @@ entry:
 
 ; 64-LABEL: t3:
 ; 64: jmp {{_?}}foo3
+
+; X32ABI-LABEL: t3:
+; X32ABI: jmp {{_?}}foo3
   %0 = tail call i32 @foo3() nounwind
   ret void
 }
@@ -49,6 +59,10 @@ entry:
 ; 64-LABEL: t4:
 ; 64-NOT: call
 ; 64: jmpq *
+
+; X32ABI-LABEL: t4:
+; X32ABI-NOT: call
+; X32ABI: jmpq *
   tail call void %x(i32 0) nounwind
   ret void
 }
@@ -62,6 +76,13 @@ entry:
 ; 64-LABEL: t5:
 ; 64-NOT: call
 ; 64: jmpq *%rdi
+
+; X32ABI-LABEL: t5:
+; X32ABI-NOT: call
+; FIXME: This isn't needed since x32 psABI specifies that callers must
+;        zero-extend pointers passed in registers.
+; X32ABI: movl %edi, %eax
+; X32ABI: jmpq *%rax
   tail call void %x() nounwind
   ret void
 }
@@ -75,6 +96,10 @@ entry:
 ; 64-LABEL: t6:
 ; 64: jmp {{_?}}t6
 ; 64: jmp {{_?}}bar
+
+; X32ABI-LABEL: t6:
+; X32ABI: jmp {{_?}}t6
+; X32ABI: jmp {{_?}}bar
   %0 = icmp slt i32 %x, 10
   br i1 %0, label %bb, label %bb1
 
@@ -97,6 +122,9 @@ entry:
 
 ; 64-LABEL: t7:
 ; 64: jmp {{_?}}bar2
+
+; X32ABI-LABEL: t7:
+; X32ABI: jmp {{_?}}bar2
   %0 = tail call i32 @bar2(i32 %a, i32 %b, i32 %c) nounwind
   ret i32 %0
 }
@@ -110,6 +138,9 @@ entry:
 
 ; 64-LABEL: t8:
 ; 64: jmp {{_?}}bar3
+
+; X32ABI-LABEL: t8:
+; X32ABI: jmp {{_?}}bar3
   %0 = tail call signext i16 @bar3() nounwind      ; <i16> [#uses=1]
   ret i16 %0
 }
@@ -123,6 +154,9 @@ entry:
 
 ; 64-LABEL: t9:
 ; 64: jmpq *
+
+; X32ABI-LABEL: t9:
+; X32ABI: jmpq *
   %0 = bitcast i32 (i32)* %x to i16 (i32)*
   %1 = tail call signext i16 %0(i32 0) nounwind
   ret i16 %1
@@ -135,6 +169,9 @@ entry:
 
 ; 64-LABEL: t10:
 ; 64: callq
+
+; X32ABI-LABEL: t10:
+; X32ABI: callq
   %0 = tail call i32 @foo4() noreturn nounwind
   unreachable
 }
@@ -153,9 +190,14 @@ define i32 @t11(i32 %x, i32 %y, i32 %z.0, i32 %z.1, i32 %z.2) nounwind ssp {
 ; 32: jmp {{_?}}foo5
 
 ; 64-LABEL: t11:
-; 64-NOT: subq ${{[0-9]+}}, %esp
-; 64-NOT: addq ${{[0-9]+}}, %esp
+; 64-NOT: subq ${{[0-9]+}}, %rsp
+; 64-NOT: addq ${{[0-9]+}}, %rsp
 ; 64: jmp {{_?}}foo5
+
+; X32ABI-LABEL: t11:
+; X32ABI-NOT: subl ${{[0-9]+}}, %esp
+; X32ABI-NOT: addl ${{[0-9]+}}, %esp
+; X32ABI: jmp {{_?}}foo5
 entry:
   %0 = icmp eq i32 %x, 0
   br i1 %0, label %bb6, label %bb
@@ -179,9 +221,14 @@ define i32 @t12(i32 %x, i32 %y, %struct.t* byval align 4 %z) nounwind ssp {
 ; 32: jmp {{_?}}foo6
 
 ; 64-LABEL: t12:
-; 64-NOT: subq ${{[0-9]+}}, %esp
-; 64-NOT: addq ${{[0-9]+}}, %esp
+; 64-NOT: subq ${{[0-9]+}}, %rsp
+; 64-NOT: addq ${{[0-9]+}}, %rsp
 ; 64: jmp {{_?}}foo6
+
+; X32ABI-LABEL: t12:
+; X32ABI-NOT: subl ${{[0-9]+}}, %esp
+; X32ABI-NOT: addl ${{[0-9]+}}, %esp
+; X32ABI: jmp {{_?}}foo6
 entry:
   %0 = icmp eq i32 %x, 0
   br i1 %0, label %bb2, label %bb
@@ -210,6 +257,11 @@ define %struct.ns* @t13(%struct.cp* %yy) nounwind ssp {
 ; 64-NOT: jmp
 ; 64: callq
 ; 64: ret
+
+; X32ABI-LABEL: t13:
+; X32ABI-NOT: jmp
+; X32ABI: callq
+; X32ABI: ret
 entry:
   %0 = tail call fastcc %struct.ns* @foo7(%struct.cp* byval align 4 %yy, i8 signext 0) nounwind
   ret %struct.ns* %0
@@ -230,6 +282,11 @@ entry:
 ; 64: movq 32(%rdi)
 ; 64-NOT: movq 16(%rdi)
 ; 64: jmpq *16({{%rdi|%rax}})
+
+; X32ABI-LABEL: t14:
+; X32ABI: movl 20(%edi), %edi
+; X32ABI-NEXT: movl 12(%edi), %eax
+; X32ABI-NEXT: jmpq *%rax
   %0 = getelementptr inbounds %struct.__block_literal_2* %.block_descriptor, i64 0, i32 5 ; <void ()**> [#uses=1]
   %1 = load void ()** %0, align 8                 ; <void ()*> [#uses=2]
   %2 = bitcast void ()* %1 to %struct.__block_literal_1* ; <%struct.__block_literal_1*> [#uses=1]
@@ -252,6 +309,10 @@ define void @t15(%struct.foo* noalias sret %agg.result) nounwind  {
 ; 64-LABEL: t15:
 ; 64: callq {{_?}}f
 ; 64: retq
+
+; X32ABI-LABEL: t15:
+; X32ABI: callq {{_?}}f
+; X32ABI: retq
   tail call fastcc void @f(%struct.foo* noalias sret %agg.result) nounwind
   ret void
 }
@@ -266,6 +327,9 @@ entry:
 
 ; 64-LABEL: t16:
 ; 64: jmp {{_?}}bar4
+
+; X32ABI-LABEL: t16:
+; X32ABI: jmp {{_?}}bar4
   %0 = tail call double @bar4() nounwind
   ret void
 }
@@ -281,6 +345,10 @@ entry:
 ; 64-LABEL: t17:
 ; 64: xorl %eax, %eax
 ; 64: jmp {{_?}}bar5
+
+; X32ABI-LABEL: t17:
+; X32ABI: xorl %eax, %eax
+; X32ABI: jmp {{_?}}bar5
   tail call void (...)* @bar5() nounwind
   ret void
 }
@@ -297,6 +365,10 @@ entry:
 ; 64-LABEL: t18:
 ; 64: xorl %eax, %eax
 ; 64: jmp {{_?}}bar6
+
+; X32ABI-LABEL: t18:
+; X32ABI: xorl %eax, %eax
+; X32ABI: jmp {{_?}}bar6
   %0 = tail call double (...)* @bar6() nounwind
   ret void
 }
@@ -308,6 +380,10 @@ entry:
 ; CHECK-LABEL: t19:
 ; CHECK: andl $-32
 ; CHECK: calll {{_?}}foo
+
+; X32ABI-LABEL: t19:
+; X32ABI: andl $-32
+; X32ABI: callq {{_?}}foo
   tail call void @foo() nounwind
   ret void
 }
@@ -324,6 +400,9 @@ entry:
 
 ; 64-LABEL: t20:
 ; 64: jmp {{_?}}foo20
+
+; X32ABI-LABEL: t20:
+; X32ABI: jmp {{_?}}foo20
   %0 = tail call fastcc double @foo20(double %x) nounwind
   ret double %0
 }
diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll
index 1e34a2b..9d02bcd 100644
--- a/test/CodeGen/X86/sincos-opt.ll
+++ b/test/CodeGen/X86/sincos-opt.ll
@@ -15,9 +15,8 @@ entry:
 
 ; OSX_SINCOS-LABEL: test1:
 ; OSX_SINCOS: callq ___sincosf_stret
-; OSX_SINCOS: movaps %xmm0, %xmm1
-; OSX_SINCOS: shufps {{.*}} ## xmm1 = xmm1[1,1,2,3]
-; OSX_SINCOS: addss %xmm0, %xmm1
+; OSX_SINCOS: movshdup {{.*}} xmm1 = xmm0[1,1,3,3]
+; OSX_SINCOS: addss %xmm1, %xmm0
 
 ; OSX_NOOPT: test1
 ; OSX_NOOPT: callq _sinf
diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll
index 6e3a003..c2f0411 100644
--- a/test/CodeGen/X86/sink-blockfreq.ll
+++ b/test/CodeGen/X86/sink-blockfreq.ll
@@ -40,6 +40,6 @@ exit:
   ret i32 0
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 4, i32 1}
-!1 = metadata !{metadata !"branch_weights", i32 128, i32 1}
-!2 = metadata !{metadata !"branch_weights", i32 1, i32 1}
+!0 = !{!"branch_weights", i32 4, i32 1}
+!1 = !{!"branch_weights", i32 128, i32 1}
+!2 = !{!"branch_weights", i32 1, i32 1}
diff --git a/test/CodeGen/X86/sink-hoist.ll b/test/CodeGen/X86/sink-hoist.ll
index 64f5311..455cf24 100644
--- a/test/CodeGen/X86/sink-hoist.ll
+++ b/test/CodeGen/X86/sink-hoist.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -mcpu=nehalem -post-RA-scheduler=true -schedmodel=false | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -march=x86-64 -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -mcpu=nehalem -post-RA-scheduler=true -schedmodel=false | FileCheck %s
 
 ; Currently, floating-point selects are lowered to CFG triangles.
 ; This means that one side of the select is always unconditionally
diff --git a/test/CodeGen/X86/sjlj-baseptr.ll b/test/CodeGen/X86/sjlj-baseptr.ll
new file mode 100644
index 0000000..e439ff4
--- /dev/null
+++ b/test/CodeGen/X86/sjlj-baseptr.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X86 %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X64 %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%Foo = type { [125 x i8] }
+
+declare i32 @llvm.eh.sjlj.setjmp(i8*) nounwind
+
+declare void @whatever(i64, %Foo*, i8**, i8*, i8*, i32)  #0
+
+attributes #0 = { nounwind uwtable "no-frame-pointer-elim"="true" }
+
+define i32 @test1(i64 %n, %Foo* byval nocapture readnone align 8 %f) #0 {
+entry:
+  %buf = alloca [5 x i8*], align 16
+  %p = alloca i8*, align 8
+  %q = alloca i8, align 64
+  %r = bitcast [5 x i8*]* %buf to i8*
+  %s = alloca i8, i64 %n, align 1
+  store i8* %s, i8** %p, align 8
+  %t = call i32 @llvm.eh.sjlj.setjmp(i8* %s)
+  call void @whatever(i64 %n, %Foo* %f, i8** %p, i8* %q, i8* %s, i32 %t) #1
+  ret i32 0
+; X86: movl    %esp, %esi
+; X86: movl    %esp, -16(%ebp)
+; X86: {{.LBB.*:}}
+; X86: movl    -16(%ebp), %esi
+; X86: {{.LBB.*:}}
+; X64: movq    %rsp, %rbx
+; X64: movq    %rsp, -48(%rbp)
+; X64: {{.LBB.*:}}
+; X64: movq    -48(%rbp), %rbx
+; X64: {{.LBB.*:}}
+}
+
+
diff --git a/test/CodeGen/X86/slow-div.ll b/test/CodeGen/X86/slow-div.ll
new file mode 100644
index 0000000..5222382
--- /dev/null
+++ b/test/CodeGen/X86/slow-div.ll
@@ -0,0 +1,28 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivl-to-divb < %s | FileCheck -check-prefix=DIV32 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivq-to-divw < %s | FileCheck -check-prefix=DIV64 %s
+
+define i32 @div32(i32 %a, i32 %b) {
+entry:
+; DIV32-LABEL: div32:
+; DIV32: orl %{{.*}}, [[REG:%[a-z]+]]
+; DIV32: testl $-256, [[REG]]
+; DIV32: divb
+; DIV64-LABEL: div32:
+; DIV64-NOT: divb
+  %div = sdiv i32 %a, %b
+  ret i32 %div
+}
+
+define i64 @div64(i64 %a, i64 %b) {
+entry:
+; DIV32-LABEL: div64:
+; DIV32-NOT: divw
+; DIV64-LABEL: div64:
+; DIV64: orq %{{.*}}, [[REG:%[a-z]+]]
+; DIV64: testq   $-65536, [[REG]]
+; DIV64: divw
+  %div = sdiv i64 %a, %b
+  ret i64 %div
+}
+
+
diff --git a/test/CodeGen/X86/slow-incdec.ll b/test/CodeGen/X86/slow-incdec.ll
index 541d992..323e3ae 100644
--- a/test/CodeGen/X86/slow-incdec.ll
+++ b/test/CodeGen/X86/slow-incdec.ll
@@ -74,7 +74,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
   ret i32 %i.0.lcssa
 }
 
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"int", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll
index 1b596b5..3c03750 100644
--- a/test/CodeGen/X86/small-byval-memcpy.ll
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -1,20 +1,25 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2   | grep movsd  | count 8
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s --check-prefix=CORE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck %s --check-prefix=NEHALEM
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
 
-define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval align 4  %z) nounwind  {
-entry:
-	%iz = alloca { x86_fp80, x86_fp80 }		; <{ x86_fp80, x86_fp80 }*> [#uses=3]
-	%tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1		; <x86_fp80*> [#uses=1]
-	%tmp2 = load x86_fp80* %tmp1, align 16		; <x86_fp80> [#uses=1]
-	%tmp3 = fsub x86_fp80 0xK80000000000000000000, %tmp2		; <x86_fp80> [#uses=1]
-	%tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1		; <x86_fp80*> [#uses=1]
-	%real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0		; <x86_fp80*> [#uses=1]
-	%tmp6 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0		; <x86_fp80*> [#uses=1]
-	%tmp7 = load x86_fp80* %tmp6, align 16		; <x86_fp80> [#uses=1]
-	store x86_fp80 %tmp3, x86_fp80* %real, align 16
-	store x86_fp80 %tmp7, x86_fp80* %tmp4, align 16
-	call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval align 4  %iz ) nounwind 
-	ret void
-}
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+
+define void @copy16bytes(i8* nocapture %a, i8* nocapture readonly %b) {
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i32 1, i1 false)
+  ret void
+
+  ; CHECK-LABEL: copy16bytes
+  ; CORE2: movq
+  ; CORE2-NEXT: movq
+  ; CORE2-NEXT: movq
+  ; CORE2-NEXT: movq
+  ; CORE2-NEXT: retq
 
-declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval align 4 ) nounwind 
+  ; NEHALEM: movups
+  ; NEHALEM-NEXT: movups
+  ; NEHALEM-NEXT: retq
+
+  ; BTVER2: movups
+  ; BTVER2-NEXT: movups
+  ; BTVER2-NEXT: retq
+}
diff --git a/test/CodeGen/X86/splat-const.ll b/test/CodeGen/X86/splat-const.ll
new file mode 100644
index 0000000..19997b0
--- /dev/null
+++ b/test/CodeGen/X86/splat-const.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -mcpu=penryn | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mcpu=sandybridge | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mcpu=haswell | FileCheck %s --check-prefix=AVX2
+; This checks that lowering for creation of constant vectors is sane and
+; doesn't use redundant shuffles. (fixes PR22276)
+target triple = "x86_64-unknown-unknown"
+
+define <4 x i32> @zero_vector() {
+; SSE-LABEL: zero_vector:
+; SSE: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+; AVX-LABEL: zero_vector:
+; AVX: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+; AVX2-LABEL: zero_vector:
+; AVX2: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: retq
+  %zero = insertelement <4 x i32> undef, i32 0, i32 0
+  %splat = shufflevector <4 x i32> %zero, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat
+}
+
+; Note that for the "const_vector" versions, lowering that uses a shuffle
+; instead of a load would be legitimate, if it's a single broadcast shuffle.
+; (as opposed to the previous mess)
+; However, this is not the current preferred lowering.
+define <4 x i32> @const_vector() {
+; SSE-LABEL: const_vector:
+; SSE: movaps {{.*}}, %xmm0 # xmm0 = [42,42,42,42]
+; SSE-NEXT: retq
+; AVX-LABEL: const_vector:
+; AVX: vmovaps {{.*}}, %xmm0 # xmm0 = [42,42,42,42]
+; AVX-NEXT: retq
+; AVX2-LABEL: const_vector:
+; AVX2: vbroadcastss {{[^%].*}}, %xmm0
+; AVX2-NEXT: retq
+  %const = insertelement <4 x i32> undef, i32 42, i32 0
+  %splat = shufflevector <4 x i32> %const, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat
+}
diff --git a/test/CodeGen/X86/sret-implicit.ll b/test/CodeGen/X86/sret-implicit.ll
new file mode 100644
index 0000000..3fade1d
--- /dev/null
+++ b/test/CodeGen/X86/sret-implicit.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=x86_64-apple-darwin8 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+
+; CHECK-LABEL: return32
+; CHECK-DAG: movq	$0, (%rdi)
+; CHECK-DAG: movq	%rdi, %rax
+; CHECK: retq
+define i256 @return32() {
+  ret i256 0
+}
diff --git a/test/CodeGen/X86/sse-domains.ll b/test/CodeGen/X86/sse-domains.ll
index 168959a..8cf522d 100644
--- a/test/CodeGen/X86/sse-domains.ll
+++ b/test/CodeGen/X86/sse-domains.ll
@@ -43,45 +43,3 @@ while.body:
 while.end:
   ret void
 }
-
-; CHECK: f2
-; CHECK: for.body
-;
-; This loop contains two cvtsi2ss instructions that update the same xmm
-; register.  Verify that the execution dependency fix pass breaks those
-; dependencies by inserting xorps instructions.
-;
-; If the register allocator chooses different registers for the two cvtsi2ss
-; instructions, they are still dependent on themselves.
-; CHECK: xorps [[XMM1:%xmm[0-9]+]]
-; CHECK: , [[XMM1]]
-; CHECK: cvtsi2ssl %{{.*}}, [[XMM1]]
-; CHECK: xorps [[XMM2:%xmm[0-9]+]]
-; CHECK: , [[XMM2]]
-; CHECK: cvtsi2ssl %{{.*}}, [[XMM2]]
-;
-define float @f2(i32 %m) nounwind uwtable readnone ssp {
-entry:
-  %tobool3 = icmp eq i32 %m, 0
-  br i1 %tobool3, label %for.end, label %for.body
-
-for.body:                                         ; preds = %entry, %for.body
-  %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ]
-  %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
-  %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ]
-  %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ]
-  %conv = sitofp i32 %n.04 to float
-  %add = fadd float %s1.06, %conv
-  %conv1 = sitofp i32 %m.addr.07 to float
-  %add2 = fadd float %s2.05, %conv1
-  %inc = add nsw i32 %n.04, 1
-  %dec = add nsw i32 %m.addr.07, -1
-  %tobool = icmp eq i32 %dec, 0
-  br i1 %tobool, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
-  %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ]
-  %sub = fsub float %s1.0.lcssa, %s2.0.lcssa
-  ret float %sub
-}
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index da36a42..4dcb54c 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -803,11 +803,18 @@ define double @ule_inverse_y(double %x) nounwind {
 ; Test a few more misc. cases.
 
 ; CHECK-LABEL: clampTo3k_a:
-; CHECK: minsd
+; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
+; CHECK-NEXT: minsd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
 ; UNSAFE-LABEL: clampTo3k_a:
-; UNSAFE: minsd
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
+; UNSAFE-NEXT: ret
 ; FINITE-LABEL: clampTo3k_a:
-; FINITE: minsd
+; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
+; FINITE-NEXT: minsd %xmm0, %xmm1
+; FINITE-NEXT: movaps %xmm1, %xmm0
+; FINITE-NEXT: ret
 define double @clampTo3k_a(double %x) nounwind readnone {
 entry:
   %0 = fcmp ogt double %x, 3.000000e+03           ; <i1> [#uses=1]
@@ -816,11 +823,16 @@ entry:
 }
 
 ; CHECK-LABEL: clampTo3k_b:
-; CHECK: minsd
+; CHECK-NEXT: minsd {{[^,]*}}, %xmm0
+; CHECK-NEXT: ret
 ; UNSAFE-LABEL: clampTo3k_b:
-; UNSAFE: minsd
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
+; UNSAFE-NEXT: ret
 ; FINITE-LABEL: clampTo3k_b:
-; FINITE: minsd
+; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
+; FINITE-NEXT: minsd %xmm0, %xmm1
+; FINITE-NEXT: movaps %xmm1, %xmm0
+; FINITE-NEXT: ret
 define double @clampTo3k_b(double %x) nounwind readnone {
 entry:
   %0 = fcmp uge double %x, 3.000000e+03           ; <i1> [#uses=1]
@@ -829,11 +841,18 @@ entry:
 }
 
 ; CHECK-LABEL: clampTo3k_c:
-; CHECK: maxsd
+; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
+; CHECK-NEXT: maxsd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
 ; UNSAFE-LABEL: clampTo3k_c:
-; UNSAFE: maxsd
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
+; UNSAFE-NEXT: ret
 ; FINITE-LABEL: clampTo3k_c:
-; FINITE: maxsd
+; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
+; FINITE-NEXT: maxsd %xmm0, %xmm1
+; FINITE-NEXT: movaps %xmm1, %xmm0
+; FINITE-NEXT: ret
 define double @clampTo3k_c(double %x) nounwind readnone {
 entry:
   %0 = fcmp olt double %x, 3.000000e+03           ; <i1> [#uses=1]
@@ -842,11 +861,16 @@ entry:
 }
 
 ; CHECK-LABEL: clampTo3k_d:
-; CHECK: maxsd
+; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0
+; CHECK-NEXT: ret
 ; UNSAFE-LABEL: clampTo3k_d:
-; UNSAFE: maxsd
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
+; UNSAFE-NEXT: ret
 ; FINITE-LABEL: clampTo3k_d:
-; FINITE: maxsd
+; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
+; FINITE-NEXT: maxsd %xmm0, %xmm1
+; FINITE-NEXT: movaps %xmm1, %xmm0
+; FINITE-NEXT: ret
 define double @clampTo3k_d(double %x) nounwind readnone {
 entry:
   %0 = fcmp ule double %x, 3.000000e+03           ; <i1> [#uses=1]
@@ -855,11 +879,18 @@ entry:
 }
 
 ; CHECK-LABEL: clampTo3k_e:
-; CHECK: maxsd
+; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
+; CHECK-NEXT: maxsd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
 ; UNSAFE-LABEL: clampTo3k_e:
-; UNSAFE: maxsd
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
+; UNSAFE-NEXT: ret
 ; FINITE-LABEL: clampTo3k_e:
-; FINITE: maxsd
+; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
+; FINITE-NEXT: maxsd %xmm0, %xmm1
+; FINITE-NEXT: movaps %xmm1, %xmm0
+; FINITE-NEXT: ret
 define double @clampTo3k_e(double %x) nounwind readnone {
 entry:
   %0 = fcmp olt double %x, 3.000000e+03           ; <i1> [#uses=1]
@@ -868,11 +899,16 @@ entry:
 }
 
 ; CHECK-LABEL: clampTo3k_f:
-; CHECK: maxsd
+; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0
+; CHECK-NEXT: ret
 ; UNSAFE-LABEL: clampTo3k_f:
-; UNSAFE: maxsd
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
+; UNSAFE-NEXT: ret
 ; FINITE-LABEL: clampTo3k_f:
-; FINITE: maxsd
+; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
+; FINITE-NEXT: maxsd %xmm0, %xmm1
+; FINITE-NEXT: movaps %xmm1, %xmm0
+; FINITE-NEXT: ret
 define double @clampTo3k_f(double %x) nounwind readnone {
 entry:
   %0 = fcmp ule double %x, 3.000000e+03           ; <i1> [#uses=1]
@@ -881,11 +917,18 @@ entry:
 }
 
 ; CHECK-LABEL: clampTo3k_g:
-; CHECK: minsd
+; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
+; CHECK-NEXT: minsd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
 ; UNSAFE-LABEL: clampTo3k_g:
-; UNSAFE: minsd
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
+; UNSAFE-NEXT: ret
 ; FINITE-LABEL: clampTo3k_g:
-; FINITE: minsd
+; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
+; FINITE-NEXT: minsd %xmm0, %xmm1
+; FINITE-NEXT: movaps %xmm1, %xmm0
+; FINITE-NEXT: ret
 define double @clampTo3k_g(double %x) nounwind readnone {
 entry:
   %0 = fcmp ogt double %x, 3.000000e+03           ; <i1> [#uses=1]
@@ -894,11 +937,16 @@ entry:
 }
 
 ; CHECK-LABEL: clampTo3k_h:
-; CHECK: minsd
+; CHECK-NEXT: minsd {{[^,]*}}, %xmm0
+; CHECK-NEXT: ret
 ; UNSAFE-LABEL: clampTo3k_h:
-; UNSAFE: minsd
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
+; UNSAFE-NEXT: ret
 ; FINITE-LABEL: clampTo3k_h:
-; FINITE: minsd
+; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
+; FINITE-NEXT: minsd %xmm0, %xmm1
+; FINITE-NEXT: movaps %xmm1, %xmm0
+; FINITE-NEXT: ret
 define double @clampTo3k_h(double %x) nounwind readnone {
 entry:
   %0 = fcmp uge double %x, 3.000000e+03           ; <i1> [#uses=1]
@@ -907,33 +955,73 @@ entry:
 }
 
 ; UNSAFE-LABEL: test_maxpd:
-; UNSAFE: maxpd
-define <2 x double> @test_maxpd(<2 x double> %x, <2 x double> %y) {
+; UNSAFE-NEXT: maxpd %xmm1, %xmm0
+; UNSAFE-NEXT: ret
+define <2 x double> @test_maxpd(<2 x double> %x, <2 x double> %y) nounwind {
   %max_is_x = fcmp oge <2 x double> %x, %y
   %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y
   ret <2 x double> %max
 }
 
 ; UNSAFE-LABEL: test_minpd:
-; UNSAFE: minpd
-define <2 x double> @test_minpd(<2 x double> %x, <2 x double> %y) {
+; UNSAFE-NEXT: minpd %xmm1, %xmm0
+; UNSAFE-NEXT: ret
+define <2 x double> @test_minpd(<2 x double> %x, <2 x double> %y) nounwind {
   %min_is_x = fcmp ole <2 x double> %x, %y
   %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
   ret <2 x double> %min
 }
 
 ; UNSAFE-LABEL: test_maxps:
-; UNSAFE: maxps
-define <4 x float> @test_maxps(<4 x float> %x, <4 x float> %y) {
+; UNSAFE-NEXT: maxps %xmm1, %xmm0
+; UNSAFE-NEXT: ret
+define <4 x float> @test_maxps(<4 x float> %x, <4 x float> %y) nounwind {
   %max_is_x = fcmp oge <4 x float> %x, %y
   %max = select <4 x i1> %max_is_x, <4 x float> %x, <4 x float> %y
   ret <4 x float> %max
 }
 
 ; UNSAFE-LABEL: test_minps:
-; UNSAFE: minps
-define <4 x float> @test_minps(<4 x float> %x, <4 x float> %y) {
+; UNSAFE-NEXT: minps %xmm1, %xmm0
+; UNSAFE-NEXT: ret
+define <4 x float> @test_minps(<4 x float> %x, <4 x float> %y) nounwind {
   %min_is_x = fcmp ole <4 x float> %x, %y
   %min = select <4 x i1> %min_is_x, <4 x float> %x, <4 x float> %y
   ret <4 x float> %min
 }
+
+; UNSAFE-LABEL: test_maxps_illegal_v2f32:
+; UNSAFE-NEXT: maxps %xmm1, %xmm0
+; UNSAFE-NEXT: ret
+define <2 x float> @test_maxps_illegal_v2f32(<2 x float> %x, <2 x float> %y) nounwind {
+  %max_is_x = fcmp oge <2 x float> %x, %y
+  %max = select <2 x i1> %max_is_x, <2 x float> %x, <2 x float> %y
+  ret <2 x float> %max
+}
+
+; UNSAFE-LABEL: test_minps_illegal_v2f32:
+; UNSAFE-NEXT: minps %xmm1, %xmm0
+; UNSAFE-NEXT: ret
+define <2 x float> @test_minps_illegal_v2f32(<2 x float> %x, <2 x float> %y) nounwind {
+  %min_is_x = fcmp ole <2 x float> %x, %y
+  %min = select <2 x i1> %min_is_x, <2 x float> %x, <2 x float> %y
+  ret <2 x float> %min
+}
+
+; UNSAFE-LABEL: test_maxps_illegal_v3f32:
+; UNSAFE-NEXT: maxps %xmm1, %xmm0
+; UNSAFE-NEXT: ret
+define <3 x float> @test_maxps_illegal_v3f32(<3 x float> %x, <3 x float> %y) nounwind {
+  %max_is_x = fcmp oge <3 x float> %x, %y
+  %max = select <3 x i1> %max_is_x, <3 x float> %x, <3 x float> %y
+  ret <3 x float> %max
+}
+
+; UNSAFE-LABEL: test_minps_illegal_v3f32:
+; UNSAFE-NEXT: minps %xmm1, %xmm0
+; UNSAFE-NEXT: ret
+define <3 x float> @test_minps_illegal_v3f32(<3 x float> %x, <3 x float> %y) nounwind {
+  %min_is_x = fcmp ole <3 x float> %x, %y
+  %min = select <3 x i1> %min_is_x, <3 x float> %x, <3 x float> %y
+  ret <3 x float> %min
+}
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll
index b122ef6..8b1c6d0 100644
--- a/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -370,8 +370,155 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
+; With SSE4.1 or greater, the shuffles in the following tests may
+; be lowered to X86Blendi nodes. 
+
+define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <4 x float> %a, i32 0
+  %op = fadd float %b, %ext
+  %ins = insertelement <4 x float> undef, float %op, i32 0
+  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuf
+}
+
+define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <4 x float> %a, i32 0
+  %op = fsub float %ext, %b
+  %ins = insertelement <4 x float> undef, float %op, i32 0
+  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuf
+}
+
+define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <4 x float> %a, i32 0
+  %op = fmul float %b, %ext
+  %ins = insertelement <4 x float> undef, float %op, i32 0
+  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuf
+}
+
+define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <4 x float> %a, i32 0
+  %op = fdiv float %ext, %b
+  %ins = insertelement <4 x float> undef, float %op, i32 0
+  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuf
+}
+
+define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <2 x double> %a, i32 0
+  %op = fadd double %b, %ext
+  %ins = insertelement <2 x double> undef, double %op, i32 0
+  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuf
+}
+
+define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <2 x double> %a, i32 0
+  %op = fsub double %ext, %b
+  %ins = insertelement <2 x double> undef, double %op, i32 0
+  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuf
+}
+
+define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <2 x double> %a, i32 0
+  %op = fmul double %b, %ext
+  %ins = insertelement <2 x double> undef, double %op, i32 0
+  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuf
+}
+
+define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: blend_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %ext = extractelement <2 x double> %a, i32 0
+  %op = fdiv double %ext, %b
+  %ins = insertelement <2 x double> undef, double %op, i32 0
+  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuf
+}
+
 ; Ensure that the backend selects SSE/AVX scalar fp instructions
-; from a packed fp instrution plus a vector insert.
+; from a packed fp instruction plus a vector insert.
 
 define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
 ; SSE-LABEL: insert_test_add_ss:
diff --git a/test/CodeGen/X86/sse-unaligned-mem-feature.ll b/test/CodeGen/X86/sse-unaligned-mem-feature.ll
new file mode 100644
index 0000000..bb55829
--- /dev/null
+++ b/test/CodeGen/X86/sse-unaligned-mem-feature.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mcpu=yonah -mattr=sse-unaligned-mem -march=x86 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define <4 x float> @foo(<4 x float>* %P, <4 x float> %In) nounwind {
+	%A = load <4 x float>* %P, align 4
+	%B = fadd <4 x float> %A, %In
+	ret <4 x float> %B
+
+; CHECK-LABEL: @foo
+; CHECK:       addps (%eax), %xmm0
+}
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
new file mode 100644
index 0000000..b0412b9
--- /dev/null
+++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=pentium4 -mattr=sse2 | FileCheck %s
+
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+  ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
+
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+  ; CHECK: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index c4d9e6d..cab62a3 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -408,22 +408,6 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
-  ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
-  ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
 
 
 define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
@@ -504,22 +488,6 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
-  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
-  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
 
 
 define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index b7db6cb..0b69ae8 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -75,7 +75,7 @@ define <4 x i32> @test5(i8** %ptr) nounwind {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl (%eax), %eax
-; CHECK-NEXT:    movss (%eax), %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -179,8 +179,8 @@ define void @test12() nounwind {
 ; CHECK-LABEL: test12:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movapd 0, %xmm0
-; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; CHECK-NEXT:    xorpd %xmm2, %xmm2
 ; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
 ; CHECK-NEXT:    addps %xmm1, %xmm0
@@ -293,7 +293,7 @@ entry:
 define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
 ; CHECK-LABEL: test_insert_64_zext:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movq %xmm0, %xmm0
+; CHECK-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; CHECK-NEXT:    retl
   %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %1
@@ -302,8 +302,7 @@ define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
 define <4 x i32> @PR19721(<4 x i32> %i) {
 ; CHECK-LABEL: PR19721:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    movss %xmm1, %xmm0
+; CHECK-NEXT:    andps LCPI19_0, %xmm0
 ; CHECK-NEXT:    retl
   %bc = bitcast <4 x i32> %i to i128
   %insert = and i128 %bc, -4294967296
@@ -316,10 +315,11 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retl
   %m = mul <4 x i32> %x, %y
   ret <4 x i32> %m
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 0a5b0ca..6c0b701 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -25,14 +25,11 @@ entry:
 define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ; X64-LABEL: t1:
 ; X64:       ## BB#0:
-; X64-NEXT:    movdqa (%rdi), %xmm0
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535]
+; X64-NEXT:    movaps %xmm0, %xmm1
+; X64-NEXT:    andnps (%rsi), %xmm1
+; X64-NEXT:    andps (%rdi), %xmm0
+; X64-NEXT:    orps %xmm1, %xmm0
 ; X64-NEXT:    retq
 	%tmp1 = load <8 x i16>* %A
 	%tmp2 = load <8 x i16>* %B
@@ -44,11 +41,11 @@ define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
 ; X64-LABEL: t2:
 ; X64:       ## BB#0:
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,3,4,5,6,7]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
+; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,1,4,5,6,7]
+; X64-NEXT:    pandn %xmm1, %xmm2
+; X64-NEXT:    por %xmm2, %xmm0
 ; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
 	ret <8 x i16> %tmp
@@ -92,7 +89,7 @@ define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {
 define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {
 ; X64-LABEL: t6:
 ; X64:       ## BB#0:
-; X64-NEXT:    movss %xmm1, %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
 	ret <8 x i16> %tmp
@@ -195,8 +192,8 @@ define void @t10() nounwind {
 define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 ; X64-LABEL: t11:
 ; X64:       ## BB#0: ## %entry
+; X64-NEXT:    psrld $16, %xmm0
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; X64-NEXT:    retq
 entry:
 	%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
@@ -232,8 +229,9 @@ entry:
 define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 ; X64-LABEL: t14:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; X64-NEXT:    psrlq $16, %xmm0
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT:    movdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
 entry:
 	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef >
@@ -245,11 +243,8 @@ define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 ; X64-LABEL: t15:
 ; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X64-NEXT:    retq
 entry:
   %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
@@ -262,15 +257,7 @@ define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
 ; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT:    packuswb %xmm0, %xmm0
+; X64-NEXT:    movdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
 entry:
   %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0,  i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
@@ -282,7 +269,7 @@ entry:
 define <4 x i32> @t17() nounwind {
 ; X64-LABEL: t17:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    movddup (%rax), %xmm0
+; X64-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
 ; X64-NEXT:    andpd {{.*}}(%rip), %xmm0
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll b/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll
new file mode 100644
index 0000000..55faf4d
--- /dev/null
+++ b/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll
@@ -0,0 +1,123 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+
+define <8 x i16> @test_llvm_x86_sse41_pmovsxbw(<16 x i8>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovsxbw
+; SSE41: pmovsxbw (%rdi), %xmm0
+; AVX:  vpmovsxbw (%rdi), %xmm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %1)
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @test_llvm_x86_sse41_pmovsxbd(<16 x i8>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovsxbd
+; SSE41: pmovsxbd (%rdi), %xmm0
+; AVX:  vpmovsxbd (%rdi), %xmm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %1)
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @test_llvm_x86_sse41_pmovsxbq(<16 x i8>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovsxbq
+; SSE41: pmovsxbq (%rdi), %xmm0
+; AVX:  vpmovsxbq (%rdi), %xmm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %1)
+  ret <2 x i64> %2
+}
+
+define <4 x i32> @test_llvm_x86_sse41_pmovsxwd(<8 x i16>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovsxwd
+; SSE41: pmovsxwd (%rdi), %xmm0
+; AVX:  vpmovsxwd (%rdi), %xmm0
+  %1 = load <8 x i16>* %a, align 1
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1)
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @test_llvm_x86_sse41_pmovsxwq(<8 x i16>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovsxwq
+; SSE41: pmovsxwq (%rdi), %xmm0
+; AVX:  vpmovsxwq (%rdi), %xmm0
+  %1 = load <8 x i16>* %a, align 1
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %1)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_llvm_x86_sse41_pmovsxdq(<4 x i32>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovsxdq
+; SSE41: pmovsxdq (%rdi), %xmm0
+; AVX:  vpmovsxdq (%rdi), %xmm0
+  %1 = load <4 x i32>* %a, align 1
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %1)
+  ret <2 x i64> %2
+}
+
+define <8 x i16> @test_llvm_x86_sse41_pmovzxbw(<16 x i8>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovzxbw
+; SSE41: pmovzxbw (%rdi), %xmm0
+; AVX:  vpmovzxbw (%rdi), %xmm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %1)
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @test_llvm_x86_sse41_pmovzxbd(<16 x i8>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovzxbd
+; SSE41: pmovzxbd (%rdi), %xmm0
+; AVX:  vpmovzxbd (%rdi), %xmm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %1)
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @test_llvm_x86_sse41_pmovzxbq(<16 x i8>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovzxbq
+; SSE41: pmovzxbq (%rdi), %xmm0
+; AVX:  vpmovzxbq (%rdi), %xmm0
+  %1 = load <16 x i8>* %a, align 1
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %1)
+  ret <2 x i64> %2
+}
+
+define <4 x i32> @test_llvm_x86_sse41_pmovzxwd(<8 x i16>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovzxwd
+; SSE41: pmovzxwd (%rdi), %xmm0
+; AVX:  vpmovzxwd (%rdi), %xmm0
+  %1 = load <8 x i16>* %a, align 1
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %1)
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @test_llvm_x86_sse41_pmovzxwq(<8 x i16>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovzxwq
+; SSE41: pmovzxwq (%rdi), %xmm0
+; AVX:  vpmovzxwq (%rdi), %xmm0
+  %1 = load <8 x i16>* %a, align 1
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %1)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_llvm_x86_sse41_pmovzxdq(<4 x i32>* %a) {
+; CHECK-LABEL: test_llvm_x86_sse41_pmovzxdq
+; SSE41: pmovzxdq (%rdi), %xmm0
+; AVX:  vpmovzxdq (%rdi), %xmm0
+  %1 = load <4 x i32>* %a, align 1
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %1)
+  ret <2 x i64> %2
+}
+
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>)
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>)
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>)
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>)
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>)
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>)
+declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>)
+declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>)
+declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>)
+declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>)
+declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>)
+declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>)
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index d5c6f74..a5b07e7 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -78,13 +78,13 @@ define <2 x i64> @pmovzxbq_1() nounwind {
 ; X32-LABEL: pmovzxbq_1:
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl L_g16$non_lazy_ptr, %eax
-; X32-NEXT:    pmovzxbq (%eax), %xmm0
+; X32-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: pmovzxbq_1:
 ; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    movq _g16@{{.*}}(%rip), %rax
-; X64-NEXT:    pmovzxbq (%rax), %xmm0
+; X64-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
 ; X64-NEXT:    retq
 entry:
 	%0 = load i16* @g16, align 2		; <i16> [#uses=1]
@@ -202,7 +202,7 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) noun
 define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
 ; X32-LABEL: insertps_2:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps $0, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_2:
@@ -291,22 +291,20 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
 ; X32-LABEL: buildvector:
 ; X32:       ## BB#0: ## %entry
-; X32-NEXT:    movaps %xmm0, %xmm2
-; X32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X32-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
 ; X32-NEXT:    addss %xmm1, %xmm0
-; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X32-NEXT:    addss %xmm2, %xmm1
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X32-NEXT:    addss %xmm2, %xmm3
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: buildvector:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    movaps %xmm0, %xmm2
-; X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X64-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
 ; X64-NEXT:    addss %xmm1, %xmm0
-; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X64-NEXT:    addss %xmm2, %xmm1
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X64-NEXT:    addss %xmm2, %xmm3
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
 ; X64-NEXT:    retq
 entry:
   %tmp7 = extractelement <2 x float> %A, i32 0
@@ -324,12 +322,12 @@ define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* n
 ; X32-LABEL: insertps_from_shufflevector_1:
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps $48, (%eax), %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_shufflevector_1:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    insertps $48, (%rdi), %xmm0
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
 entry:
   %0 = load <4 x float>* %pb, align 16
@@ -358,12 +356,14 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocaptu
 ; X32-LABEL: pinsrd_from_shufflevector_i32:
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps $48, (%eax), %xmm0
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: pinsrd_from_shufflevector_i32:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    insertps $48, (%rdi), %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; X64-NEXT:    retq
 entry:
   %0 = load <4 x i32>* %pb, align 16
@@ -374,12 +374,14 @@ entry:
 define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
 ; X32-LABEL: insertps_from_shufflevector_i32_2:
 ; X32:       ## BB#0: ## %entry
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[3],xmm0[2,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_shufflevector_i32_2:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[3],xmm0[2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; X64-NEXT:    retq
 entry:
   %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
@@ -390,12 +392,12 @@ define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b)
 ; X32-LABEL: insertps_from_load_ins_elt_undef:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps $16, (%eax), %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_load_ins_elt_undef:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps $16, (%rdi), %xmm0
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; X64-NEXT:    retq
   %1 = load float* %b, align 4
   %2 = insertelement <4 x float> undef, float %1, i32 0
@@ -408,14 +410,16 @@ define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
 ; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movd (%eax), %xmm1
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
 ; X64:       ## BB#0:
-; X64-NEXT:    movd (%rdi), %xmm1
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X64-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
 ; X64-NEXT:    retq
   %1 = load i32* %b, align 4
   %2 = insertelement <4 x i32> undef, i32 %1, i32 0
@@ -449,12 +453,12 @@ define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
 define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_XY00:
 ; X32:       ## BB#0:
-; X32-NEXT:    movq %xmm0, %xmm0
+; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: shuf_XY00:
 ; X64:       ## BB#0:
-; X64-NEXT:    movq %xmm0, %xmm0
+; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -527,14 +531,14 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
 ; X32:       ## BB#0:
 ; X32-NEXT:    xorps %xmm2, %xmm2
 ; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: shuf_X00A:
 ; X64:       ## BB#0:
 ; X64-NEXT:    xorps %xmm2, %xmm2
 ; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -547,18 +551,12 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
 define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_X00X:
 ; X32:       ## BB#0:
-; X32-NEXT:    xorps %xmm1, %xmm1
-; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0]
-; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: shuf_X00X:
 ; X64:       ## BB#0:
-; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0]
-; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -571,20 +569,14 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
 define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_X0YC:
 ; X32:       ## BB#0:
-; X32-NEXT:    xorps %xmm2, %xmm2
-; X32-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero
-; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
-; X32-NEXT:    movaps %xmm2, %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: shuf_X0YC:
 ; X64:       ## BB#0:
-; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero
-; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
-; X64-NEXT:    movaps %xmm2, %xmm0
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -619,12 +611,12 @@ define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
 define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_XY00:
 ; X32:       ## BB#0:
-; X32-NEXT:    movq %xmm0, %xmm0
+; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: i32_shuf_XY00:
 ; X64:       ## BB#0:
-; X64-NEXT:    movq %xmm0, %xmm0
+; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -638,12 +630,16 @@ define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
 define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_XYY0:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; X32-NEXT:    pxor %xmm0, %xmm0
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: i32_shuf_XYY0:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; X64-NEXT:    pxor %xmm0, %xmm0
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -657,12 +653,16 @@ define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
 define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_XYW0:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
+; X32-NEXT:    pxor %xmm0, %xmm0
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: i32_shuf_XYW0:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
+; X64-NEXT:    pxor %xmm0, %xmm0
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -677,12 +677,16 @@ define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
 define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_W00W:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; X32-NEXT:    pxor %xmm0, %xmm0
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: i32_shuf_W00W:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; X64-NEXT:    pxor %xmm0, %xmm0
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 3
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -695,16 +699,18 @@ define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
 define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_X00A:
 ; X32:       ## BB#0:
-; X32-NEXT:    xorps %xmm2, %xmm2
-; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT:    pxor %xmm2, %xmm2
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: i32_shuf_X00A:
 ; X64:       ## BB#0:
-; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT:    pxor %xmm2, %xmm2
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -717,18 +723,16 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_X00X:
 ; X32:       ## BB#0:
-; X32-NEXT:    xorps %xmm1, %xmm1
-; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
-; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
+; X32-NEXT:    pxor %xmm0, %xmm0
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: i32_shuf_X00X:
 ; X64:       ## BB#0:
-; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
-; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
+; X64-NEXT:    pxor %xmm0, %xmm0
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -741,20 +745,16 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
 define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_X0YC:
 ; X32:       ## BB#0:
-; X32-NEXT:    xorps %xmm2, %xmm2
-; X32-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero
-; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
-; X32-NEXT:    movaps %xmm2, %xmm0
+; X32-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: i32_shuf_X0YC:
 ; X64:       ## BB#0:
-; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero
-; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
-; X64-NEXT:    movaps %xmm2, %xmm0
+; X64-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -816,12 +816,12 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap
 ; X32-LABEL: insertps_from_vector_load:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps $48, (%eax), %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps $48, (%rdi), %xmm0
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = load <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -834,12 +834,12 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>
 ; X32-LABEL: insertps_from_vector_load_offset:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps $96, 4(%eax), %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps $96, 4(%rdi), %xmm0
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
 ; X64-NEXT:    retq
   %1 = load <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -853,13 +853,13 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    shll $4, %ecx
-; X32-NEXT:    insertps $-64, 12(%eax,%ecx), %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
 ; X64:       ## BB#0:
 ; X64-NEXT:    shlq $4, %rsi
-; X64-NEXT:    insertps $-64, 12(%rdi,%rsi), %xmm0
+; X64-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
   %2 = load <4 x float>* %1, align 16
@@ -872,14 +872,14 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movss (%ecx,%eax,4), %xmm1
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_broadcast_loadf32:
 ; X64:       ## BB#0:
-; X64-NEXT:    movss (%rdi,%rsi,4), %xmm1
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; X64-NEXT:    retq
@@ -924,7 +924,7 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movss (%ecx,%eax,4), %xmm4
+; X32-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; X32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
 ; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
@@ -937,7 +937,7 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ;
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64:       ## BB#0:
-; X64-NEXT:    movss (%rdi,%rsi,4), %xmm4
+; X64-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; X64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
 ; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
@@ -967,16 +967,16 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
 ; X32-LABEL: insertps_with_undefs:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movss (%eax), %xmm1
-; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3]
-; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X32-NEXT:    movapd %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_with_undefs:
 ; X64:       ## BB#0:
-; X64-NEXT:    movss (%rdi), %xmm1
-; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3]
-; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT:    movapd %xmm1, %xmm0
 ; X64-NEXT:    retq
   %1 = load float* %b, align 4
   %2 = insertelement <4 x float> undef, float %1, i32 0
@@ -990,12 +990,12 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
 ; X32-LABEL: pr20087:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps $-78, 8(%eax), %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: pr20087:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps $-78, 8(%rdi), %xmm0
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
 ; X64-NEXT:    retq
   %load = load <4 x float> *%ptr
   %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
@@ -1007,16 +1007,18 @@ define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
 ; X32-LABEL: insertps_pr20411:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
-; X32-NEXT:    insertps $-36, LCPI49_1+12, %xmm0
-; X32-NEXT:    movups %xmm0, (%eax)
+; X32-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
+; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; X32-NEXT:    movdqu %xmm1, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_pr20411:
 ; X64:       ## BB#0:
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
-; X64-NEXT:    insertps $-36, LCPI49_1+{{.*}}(%rip), %xmm0
-; X64-NEXT:    movups %xmm0, (%rdi)
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
+; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; X64-NEXT:    movdqu %xmm1, (%rdi)
 ; X64-NEXT:    retq
   %gather_load = shufflevector <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %shuffle109 = shufflevector <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; 4 5 6 7
@@ -1029,12 +1031,12 @@ define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
 
 define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_4:
-; X32:       ## BB#0:
+; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_4:
-; X64:       ## BB#0:
+; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
 ; X64-NEXT:    retq
 entry:
@@ -1049,12 +1051,12 @@ entry:
 
 define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_5:
-; X32:       ## BB#0:
+; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_5:
-; X64:       ## BB#0:
+; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
 ; X64-NEXT:    retq
 entry:
@@ -1069,12 +1071,12 @@ entry:
 
 define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_6:
-; X32:       ## BB#0:
+; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_6:
-; X64:       ## BB#0:
+; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
 ; X64-NEXT:    retq
 entry:
@@ -1088,12 +1090,12 @@ entry:
 
 define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_7:
-; X32:       ## BB#0:
+; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_7:
-; X64:       ## BB#0:
+; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
 ; X64-NEXT:    retq
 entry:
@@ -1108,12 +1110,12 @@ entry:
 
 define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_8:
-; X32:       ## BB#0:
+; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_8:
-; X64:       ## BB#0:
+; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; X64-NEXT:    retq
 entry:
@@ -1128,13 +1130,13 @@ entry:
 
 define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_9:
-; X32:       ## BB#0:
+; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
 ; X32-NEXT:    movaps %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_9:
-; X64:       ## BB#0:
+; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
 ; X64-NEXT:    movaps %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1146,3 +1148,59 @@ entry:
   %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
   ret <4 x float> %vecinit3
 }
+
+define <4 x float> @insertps_10(<4 x float> %A)
+; X32-LABEL: insertps_10:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_10:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
+; X64-NEXT:    retq
+{
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
+  %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
+  ret <4 x float> %vecbuild2
+}
+
+define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
+; X32-LABEL: build_vector_to_shuffle_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: build_vector_to_shuffle_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 1
+  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
+  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
+  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %vecinit3
+}
+
+define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
+; X32-LABEL: build_vector_to_shuffle_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: build_vector_to_shuffle_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 1
+  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
+  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
+  ret <4 x float> %vecinit1
+}
diff --git a/test/CodeGen/X86/sse4a.ll b/test/CodeGen/X86/sse4a.ll
index 165d476..f8fa125 100644
--- a/test/CodeGen/X86/sse4a.ll
+++ b/test/CodeGen/X86/sse4a.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4a | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=sse4a | FileCheck %s
 
 define void @test1(i8* %p, <4 x float> %a) nounwind optsize ssp {
 ; CHECK-LABEL: test1:
diff --git a/test/CodeGen/X86/sse_partial_update.ll b/test/CodeGen/X86/sse_partial_update.ll
index 2c16a55..377c3b7 100644
--- a/test/CodeGen/X86/sse_partial_update.ll
+++ b/test/CodeGen/X86/sse_partial_update.ll
@@ -5,11 +5,18 @@
 ; There is a mismatch between the intrinsic and the actual instruction.
 ; The actual instruction has a partial update of dest, while the intrinsic
 ; passes through the upper FP values. Here, we make sure the source and
-; destination of rsqrtss are the same.
-define void @t1(<4 x float> %a) nounwind uwtable ssp {
+; destination of each scalar unary op are the same.
+
+define void @rsqrtss(<4 x float> %a) nounwind uwtable ssp {
 entry:
-; CHECK-LABEL: t1:
+; CHECK-LABEL: rsqrtss:
 ; CHECK: rsqrtss %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm0
+; CHECK-NEXT: movshdup
+; CHECK-NEXT: cvtss2sd %xmm0
+; CHECK-NEXT: movap
+; CHECK-NEXT: jmp
+
   %0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
   %a.addr.0.extract = extractelement <4 x float> %0, i32 0
   %conv = fpext float %a.addr.0.extract to double
@@ -21,10 +28,16 @@ entry:
 declare void @callee(double, double)
 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
 
-define void @t2(<4 x float> %a) nounwind uwtable ssp {
+define void @rcpss(<4 x float> %a) nounwind uwtable ssp {
 entry:
-; CHECK-LABEL: t2:
+; CHECK-LABEL: rcpss:
 ; CHECK: rcpss %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm0
+; CHECK-NEXT: movshdup
+; CHECK-NEXT: cvtss2sd %xmm0
+; CHECK-NEXT: movap
+; CHECK-NEXT: jmp
+
   %0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
   %a.addr.0.extract = extractelement <4 x float> %0, i32 0
   %conv = fpext float %a.addr.0.extract to double
@@ -34,3 +47,46 @@ entry:
   ret void
 }
 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define void @sqrtss(<4 x float> %a) nounwind uwtable ssp {
+entry:
+; CHECK-LABEL: sqrtss:
+; CHECK: sqrtss %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm0
+; CHECK-NEXT: movshdup
+; CHECK-NEXT: cvtss2sd %xmm0
+; CHECK-NEXT: movap
+; CHECK-NEXT: jmp
+
+  %0 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a) nounwind
+  %a.addr.0.extract = extractelement <4 x float> %0, i32 0
+  %conv = fpext float %a.addr.0.extract to double
+  %a.addr.4.extract = extractelement <4 x float> %0, i32 1
+  %conv3 = fpext float %a.addr.4.extract to double
+  tail call void @callee(double %conv, double %conv3) nounwind
+  ret void
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {
+entry:
+; CHECK-LABEL: sqrtsd:
+; CHECK: sqrtsd %xmm0, %xmm0
+; CHECK-NEXT: cvtsd2ss %xmm0
+; CHECK-NEXT: shufpd
+; CHECK-NEXT: cvtsd2ss %xmm0
+; CHECK-NEXT: movap
+; CHECK-NEXT: jmp
+
+ %0 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a) nounwind
+ %a0 = extractelement <2 x double> %0, i32 0
+ %conv = fptrunc double %a0 to float
+ %a1 = extractelement <2 x double> %0, i32 1
+ %conv3 = fptrunc double %a1 to float
+ tail call void @callee2(float %conv, float %conv3) nounwind
+ ret void
+}
+
+declare void @callee2(float, float)
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
diff --git a/test/CodeGen/X86/stack-align.ll b/test/CodeGen/X86/stack-align.ll
index eafb7c2..74f4c78 100644
--- a/test/CodeGen/X86/stack-align.ll
+++ b/test/CodeGen/X86/stack-align.ll
@@ -1,7 +1,10 @@
 ; RUN: llc < %s -relocation-model=static -mcpu=yonah | FileCheck %s
 
-; The double argument is at 4(esp) which is 16-byte aligned, allowing us to
-; fold the load into the andpd.
+; The double argument is at 4(esp) which is 16-byte aligned, but we
+; are required to read in extra bytes of memory in order to fold the
+; load. Bad Things may happen when reading/processing undefined bytes,
+; so don't fold the load.
+; PR22371 / http://reviews.llvm.org/D7474
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin8"
@@ -15,22 +18,31 @@ entry:
 	%tmp = getelementptr { double, double }* %z, i32 0, i32 0		; <double*> [#uses=1]
 	%tmp1 = load volatile double* %tmp, align 8		; <double> [#uses=1]
 	%tmp2 = tail call double @fabs( double %tmp1 ) readnone	; <double> [#uses=1]
-    ; CHECK: andpd{{.*}}4(%esp), %xmm
 	%tmp6 = fadd double %tmp4, %tmp2		; <double> [#uses=1]
 	store volatile double %tmp6, double* %P, align 8
 	ret void
+
+; CHECK-LABEL: test:
+; CHECK:       movsd	{{.*}}G, %xmm{{.*}}
+; CHECK:       andpd	%xmm{{.*}}, %xmm{{.*}}
+; CHECK:       movsd	4(%esp), %xmm{{.*}}
+; CHECK:       andpd	%xmm{{.*}}, %xmm{{.*}}
+
+
 }
 
 define void @test2() alignstack(16) nounwind {
 entry:
-    ; CHECK: andl{{.*}}$-16, %esp
+; CHECK-LABEL: test2:
+; CHECK: andl{{.*}}$-16, %esp
     ret void
 }
 
 ; Use a call to force a spill.
 define <2 x double> @test3(<2 x double> %x, <2 x double> %y) alignstack(32) nounwind {
 entry:
-    ; CHECK: andl{{.*}}$-32, %esp
+; CHECK-LABEL: test3:
+; CHECK: andl{{.*}}$-32, %esp
     call void @test2()
     %A = fmul <2 x double> %x, %y
     ret <2 x double> %A
diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll
new file mode 100644
index 0000000..18cd417
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -0,0 +1,1811 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+f16c < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_addpd
+  ;CHECK:       vaddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fadd <2 x double> %a0, %a1
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_addpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_addpd_ymm
+  ;CHECK:       vaddpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fadd <4 x double> %a0, %a1
+  ret <4 x double> %2
+}
+
+define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_addps
+  ;CHECK:       vaddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fadd <4 x float> %a0, %a1
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_addps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_addps_ymm
+  ;CHECK:       vaddps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fadd <8 x float> %a0, %a1
+  ret <8 x float> %2
+}
+
+define double @stack_fold_addsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_addsd
+  ;CHECK:       vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fadd double %a0, %a1
+  ret double %2
+}
+
+define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_addsd_int
+  ;CHECK:       vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_addss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_addss
+  ;CHECK:       vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fadd float %a0, %a1
+  ret float %2
+}
+
+define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_addss_int
+  ;CHECK:       vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_addsubpd
+  ;CHECK:       vaddsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x double> @stack_fold_addsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_addsubpd_ymm
+  ;CHECK:       vaddsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_addsubps
+  ;CHECK:       vaddsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @stack_fold_addsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_addsubps_ymm
+  ;CHECK:       vaddsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_andnpd
+  ;CHECK:       vandnpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <2 x double> %a0 to <2 x i64>
+  %3 = bitcast <2 x double> %a1 to <2 x i64>
+  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
+  %5 = and <2 x i64> %4, %3
+  %6 = bitcast <2 x i64> %5 to <2 x double>
+  ; fadd forces execution domain
+  %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
+  ret <2 x double> %7
+}
+
+define <4 x double> @stack_fold_andnpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_andnpd_ymm
+  ;CHECK:       vandnpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x double> %a0 to <4 x i64>
+  %3 = bitcast <4 x double> %a1 to <4 x i64>
+  %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %5 = and <4 x i64> %4, %3
+  %6 = bitcast <4 x i64> %5 to <4 x double>
+  ; fadd forces execution domain
+  %7 = fadd <4 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %7
+}
+
+define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_andnps
+  ;CHECK:       vandnps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x float> %a0 to <2 x i64>
+  %3 = bitcast <4 x float> %a1 to <2 x i64>
+  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
+  %5 = and <2 x i64> %4, %3
+  %6 = bitcast <2 x i64> %5 to <4 x float>
+  ; fadd forces execution domain
+  %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <4 x float> %7
+}
+
+define <8 x float> @stack_fold_andnps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_andnps_ymm
+  ;CHECK:       vandnps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <8 x float> %a0 to <4 x i64>
+  %3 = bitcast <8 x float> %a1 to <4 x i64>
+  %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %5 = and <4 x i64> %4, %3
+  %6 = bitcast <4 x i64> %5 to <8 x float>
+  ; fadd forces execution domain
+  %7 = fadd <8 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <8 x float> %7
+}
+
+define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_andpd
+  ;CHECK:       vandpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <2 x double> %a0 to <2 x i64>
+  %3 = bitcast <2 x double> %a1 to <2 x i64>
+  %4 = and <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <2 x double>
+  ; fadd forces execution domain
+  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
+  ret <2 x double> %6
+}
+
+define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_andpd_ymm
+  ;CHECK:       vandpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x double> %a0 to <4 x i64>
+  %3 = bitcast <4 x double> %a1 to <4 x i64>
+  %4 = and <4 x i64> %2, %3
+  %5 = bitcast <4 x i64> %4 to <4 x double>
+  ; fadd forces execution domain
+  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %6
+}
+
+define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_andps
+  ;CHECK:       vandps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x float> %a0 to <2 x i64>
+  %3 = bitcast <4 x float> %a1 to <2 x i64>
+  %4 = and <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <4 x float>
+  ; fadd forces execution domain
+  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <4 x float> %6
+}
+
+define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_andps_ymm
+  ;CHECK:       vandps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <8 x float> %a0 to <4 x i64>
+  %3 = bitcast <8 x float> %a1 to <4 x i64>
+  %4 = and <4 x i64> %2, %3
+  %5 = bitcast <4 x i64> %4 to <8 x float>
+  ; fadd forces execution domain
+  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <8 x float> %6
+}
+
+define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_blendpd
+  ;CHECK:       vblendpd $2, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_blendpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_blendpd_ymm
+  ;CHECK:       vblendpd $6, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %a0, <4 x double> %a1
+  ret <4 x double> %2
+}
+
+define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_blendps
+  ;CHECK:       vblendps $6, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_blendps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_blendps_ymm
+  ;CHECK:       vblendps $102, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %a0, <8 x float> %a1
+  ret <8 x float> %2
+}
+
+define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
+  ;CHECK-LABEL: stack_fold_blendvpd
+  ;CHECK:       vblendvpd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <4 x double> @stack_fold_blendvpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %c) {
+  ;CHECK-LABEL: stack_fold_blendvpd_ymm
+  ;CHECK:       vblendvpd {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a1, <4 x double> %c, <4 x double> %a0)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
+  ;CHECK-LABEL: stack_fold_blendvps
+  ;CHECK:       vblendvps {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @stack_fold_blendvps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %c) {
+  ;CHECK-LABEL: stack_fold_blendvps_ymm
+  ;CHECK:       vblendvps {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a1, <8 x float> %c, <8 x float> %a0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_cmppd
+  ;CHECK:       vcmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_cmppd_ymm
+  ;CHECK:       vcmpeqpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_cmpps
+  ;CHECK:       vcmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_cmpps_ymm
+  ;CHECK:       vcmpeqps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define i32 @stack_fold_cmpsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_cmpsd
+  ;CHECK:       vcmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp oeq double %a0, %a1
+  %3 = zext i1 %2 to i32
+  ret i32 %3
+}
+
+define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_cmpsd_int
+  ;CHECK:       vcmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define i32 @stack_fold_cmpss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_cmpss
+  ;CHECK:       vcmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp oeq float %a0, %a1
+  %3 = zext i1 %2 to i32
+  ret i32 %3
+}
+
+define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_cmpss_int
+  ;CHECK:       vcmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
+
+; TODO stack_fold_comisd
+
+define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_comisd_int
+  ;CHECK:       vcomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+; TODO stack_fold_comiss
+
+define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_comiss_int
+  ;CHECK:       vcomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtdq2pd
+  ;CHECK:   vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
+
+define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm
+  ;CHECK:   vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
+
+define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtdq2ps
+  ;CHECK:   vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sitofp <4 x i32> %a0 to <4 x float>
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_cvtdq2ps_ymm(<8 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtdq2ps_ymm
+  ;CHECK:   vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sitofp <8 x i32> %a0 to <8 x float>
+  ret <8 x float> %2
+}
+
+define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtpd2dq
+  ;CHECK:   vcvtpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
+
+define <4 x i32> @stack_fold_cvtpd2dq_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtpd2dq_ymm
+  ;CHECK:   vcvtpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
+
+define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtpd2ps
+  ;CHECK:   vcvtpd2psx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptrunc <2 x double> %a0 to <2 x float>
+  ret <2 x float> %2
+}
+
+define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtpd2ps_ymm
+  ;CHECK:   vcvtpd2psy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptrunc <4 x double> %a0 to <4 x float>
+  ret <4 x float> %2
+}
+
+define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtph2ps
+  ;CHECK:   vcvtph2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
+
+define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtph2ps_ymm
+  ;CHECK:   vcvtph2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
+
+define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2dq
+  ;CHECK:  vcvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
+
+define <8 x i32> @stack_fold_cvtps2dq_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2dq_ymm
+  ;CHECK:  vcvtps2dq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2pd
+  ;CHECK:   vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
+
+define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2pd_ymm
+  ;CHECK:   vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
+
+define <8 x i16> @stack_fold_cvtps2ph(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2ph
+  ;CHECK:   vcvtps2ph $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  ret <8 x i16> %1
+}
+declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
+
+define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2ph_ymm
+  ;CHECK:   vcvtps2ph $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  ret <8 x i16> %1
+}
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
+
+; TODO stack_fold_cvtsd2si
+
+define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsd2si_int
+  ;CHECK:  cvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+
+; TODO stack_fold_cvtsd2si64
+
+define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsd2si64_int
+  ;CHECK:  cvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
+  ret i64 %2
+}
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
+
+; TODO stack_fold_cvtsd2ss
+
+define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsd2ss_int
+  ;CHECK:  cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
+
+define double @stack_fold_cvtsi2sd(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi2sd
+  ;CHECK:  cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sitofp i32 %a0 to double
+  ret double %2
+}
+
+define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi2sd_int
+  ;CHECK:  cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 0x0, double 0x0>, i32 %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
+
+define double @stack_fold_cvtsi642sd(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi642sd
+  ;CHECK:  cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sitofp i64 %a0 to double
+  ret double %2
+}
+
+define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi642sd_int
+  ;CHECK:  cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> <double 0x0, double 0x0>, i64 %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
+
+define float @stack_fold_cvtsi2ss(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi2ss
+  ;CHECK:  cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sitofp i32 %a0 to float
+  ret float %2
+}
+
+define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi2ss_int
+  ;CHECK:  cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i32 %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
+
+define float @stack_fold_cvtsi642ss(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi642ss
+  ;CHECK:  cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sitofp i64 %a0 to float
+  ret float %2
+}
+
+define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi642ss_int
+  ;CHECK:  cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i64 %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
+
+; TODO stack_fold_cvtss2sd
+
+define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtss2sd_int
+  ;CHECK:  cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
+
+; TODO stack_fold_cvtss2si
+
+define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtss2si_int
+  ;CHECK:  vcvtss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
+
+; TODO stack_fold_cvtss2si64
+
+define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtss2si64_int
+  ;CHECK:  vcvtss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
+  ret i64 %2
+}
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+
+define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttpd2dq
+  ;CHECK:  vcvttpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
+
+define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttpd2dq_ymm
+  ;CHECK:  vcvttpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi <4 x double> %a0 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttps2dq
+  ;CHECK:  vcvttps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi <4 x float> %a0 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_cvttps2dq_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttps2dq_ymm
+  ;CHECK:  vcvttps2dq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi <8 x float> %a0 to <8 x i32>
+  ret <8 x i32> %2
+}
+
+define i32 @stack_fold_cvttsd2si(double %a0) {
+  ;CHECK-LABEL: stack_fold_cvttsd2si
+  ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi double %a0 to i32
+  ret i32 %2
+}
+
+define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttsd2si_int
+  ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
+
+define i64 @stack_fold_cvttsd2si64(double %a0) {
+  ;CHECK-LABEL: stack_fold_cvttsd2si64
+  ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi double %a0 to i64
+  ret i64 %2
+}
+
+define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttsd2si64_int
+  ;CHECK:  vcvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
+  ret i64 %2
+}
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
+
+define i32 @stack_fold_cvttss2si(float %a0) {
+  ;CHECK-LABEL: stack_fold_cvttss2si
+  ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi float %a0 to i32
+  ret i32 %2
+}
+
+define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttss2si_int
+  ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
+
+define i64 @stack_fold_cvttss2si64(float %a0) {
+  ;CHECK-LABEL: stack_fold_cvttss2si64
+  ;CHECK:  vcvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi float %a0 to i64
+  ret i64 %2
+}
+
+define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttss2si64_int
+  ;CHECK:  cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
+  ret i64 %2
+}
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_divpd
+  ;CHECK:       vdivpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fdiv <2 x double> %a0, %a1
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_divpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_divpd_ymm
+  ;CHECK:       vdivpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fdiv <4 x double> %a0, %a1
+  ret <4 x double> %2
+}
+
+define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_divps
+  ;CHECK:       vdivps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fdiv <4 x float> %a0, %a1
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_divps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_divps_ymm
+  ;CHECK:       vdivps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fdiv <8 x float> %a0, %a1
+  ret <8 x float> %2
+}
+
+define double @stack_fold_divsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_divsd
+  ;CHECK:       vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fdiv double %a0, %a1
+  ret double %2
+}
+
+define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_divsd_int
+  ;CHECK:       vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_divss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_divss
+  ;CHECK:       vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fdiv float %a0, %a1
+  ret float %2
+}
+
+define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_divss_int
+  ;CHECK:       vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_dppd
+  ;CHECK:       vdppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_dpps
+  ;CHECK:       vdpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_dpps_ymm
+  ;CHECK:       vdpps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_extractf128
+  ;CHECK:       vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  ret <4 x float> %1
+}
+
+define i32 @stack_fold_extractps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_extractps
+  ;CHECK:       vextractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+  ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+  %1 = extractelement <4 x float> %a0, i32 1
+  %2 = bitcast float %1 to i32
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %2
+}
+
+define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_haddpd
+  ;CHECK:       vhaddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x double> @stack_fold_haddpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_haddpd_ymm
+  ;CHECK:       vhaddpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_haddps
+  ;CHECK:       vhaddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @stack_fold_haddps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_haddps_ymm
+  ;CHECK:       vhaddps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_hsubpd
+  ;CHECK:       vhsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x double> @stack_fold_hsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_hsubpd_ymm
+  ;CHECK:       vhsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_hsubps
+  ;CHECK:       vhsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @stack_fold_hsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_hsubps_ymm
+  ;CHECK:       vhsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <8 x float> @stack_fold_insertf128(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_insertf128
+  ;CHECK:       vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %2
+}
+
+; TODO stack_fold_insertps
+
+define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_maxpd
+  ;CHECK:       vmaxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_maxpd_ymm
+  ;CHECK:       vmaxpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_maxps
+  ;CHECK:       vmaxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_maxps_ymm
+  ;CHECK:       vmaxps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define double @stack_fold_maxsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_maxsd
+  ;CHECK:       vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp ogt double %a0, %a1
+  %3 = select i1 %2, double %a0, double %a1
+  ret double %3
+}
+
+define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_maxsd_int
+  ;CHECK:       vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_maxss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_maxss
+  ;CHECK:       vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp ogt float %a0, %a1
+  %3 = select i1 %2, float %a0, float %a1
+  ret float %3
+}
+
+define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_maxss_int
+  ;CHECK:       vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_minpd
+  ;CHECK:       vminpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_minpd_ymm
+  ;CHECK:       vminpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_minps
+  ;CHECK:       vminps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_minps_ymm
+  ;CHECK:       vminps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define double @stack_fold_minsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_minsd
+  ;CHECK:       vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp olt double %a0, %a1
+  %3 = select i1 %2, double %a0, double %a1
+  ret double %3
+}
+
+define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_minsd_int
+  ;CHECK:       vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_minss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_minss
+  ;CHECK:       vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp olt float %a0, %a1
+  %3 = select i1 %2, float %a0, float %a1
+  ret float %3
+}
+
+define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_minss_int
+  ;CHECK:       vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_movddup
+  ;CHECK:   vmovddup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_movddup_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_movddup_ymm
+  ;CHECK:   vmovddup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %2
+}
+
+; TODO stack_fold_movhpd (load / store)
+; TODO stack_fold_movhps (load / store)
+
+; TODO stack_fold_movlpd (load / store)
+; TODO stack_fold_movlps (load / store)
+
+define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_movshdup
+  ;CHECK:   vmovshdup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_movshdup_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_movshdup_ymm
+  ;CHECK:   vmovshdup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret <8 x float> %2
+}
+
+define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_movsldup
+  ;CHECK:   vmovsldup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_movsldup_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_movsldup_ymm
+  ;CHECK:   vmovsldup {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ret <8 x float> %2
+}
+
+define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_mulpd
+  ;CHECK:       vmulpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fmul <2 x double> %a0, %a1
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_mulpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_mulpd_ymm
+  ;CHECK:       vmulpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fmul <4 x double> %a0, %a1
+  ret <4 x double> %2
+}
+
+define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_mulps
+  ;CHECK:       vmulps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fmul <4 x float> %a0, %a1
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_mulps_ymm
+  ;CHECK:       vmulps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fmul <8 x float> %a0, %a1
+  ret <8 x float> %2
+}
+
+define double @stack_fold_mulsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_mulsd
+  ;CHECK:       vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fmul double %a0, %a1
+  ret double %2
+}
+
+define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_mulsd_int
+  ;CHECK:       vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_mulss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_mulss
+  ;CHECK:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fmul float %a0, %a1
+  ret float %2
+}
+
+define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_mulss_int
+  ;CHECK:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_orpd
+  ;CHECK:       vorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <2 x double> %a0 to <2 x i64>
+  %3 = bitcast <2 x double> %a1 to <2 x i64>
+  %4 = or <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <2 x double>
+  ; fadd forces execution domain
+  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
+  ret <2 x double> %6
+}
+
+define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_orpd_ymm
+  ;CHECK:       vorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x double> %a0 to <4 x i64>
+  %3 = bitcast <4 x double> %a1 to <4 x i64>
+  %4 = or <4 x i64> %2, %3
+  %5 = bitcast <4 x i64> %4 to <4 x double>
+  ; fadd forces execution domain
+  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %6
+}
+
+define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_orps
+  ;CHECK:       vorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x float> %a0 to <2 x i64>
+  %3 = bitcast <4 x float> %a1 to <2 x i64>
+  %4 = or <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <4 x float>
+  ; fadd forces execution domain
+  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <4 x float> %6
+}
+
+define <8 x float> @stack_fold_orps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_orps_ymm
+  ;CHECK:       vorps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <8 x float> %a0 to <4 x i64>
+  %3 = bitcast <8 x float> %a1 to <4 x i64>
+  %4 = or <4 x i64> %2, %3
+  %5 = bitcast <4 x i64> %4 to <8 x float>
+  ; fadd forces execution domain
+  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <8 x float> %6
+}
+
+define <8 x float> @stack_fold_perm2f128(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_perm2f128
+  ;CHECK:   vperm2f128 $33, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  ret <8 x float> %2
+}
+
+define <2 x double> @stack_fold_permilpd(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permilpd
+  ;CHECK:   vpermilpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permilpd_ymm
+  ;CHECK:   vpermilpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x double> %2
+}
+
+define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpdvar
+  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
+
+define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpdvar_ymm
+  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
+
+define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_permilps
+  ;CHECK:   vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_permilps_ymm
+  ;CHECK:   vpermilps $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %2
+}
+
+define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpsvar
+  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
+
+define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpsvar_ymm
+  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
+
+; TODO stack_fold_rcpps
+
+define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_rcpps_int
+  ;CHECK:       vrcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+; TODO stack_fold_rcpps_ymm
+
+define <8 x float> @stack_fold_rcpps_ymm_int(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_rcpps_ymm_int
+  ;CHECK:       vrcpps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+
+; TODO stack_fold_rcpss
+
+define <4 x float> @stack_fold_rcpss_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_rcpss_int
+  ;CHECK:       vrcpss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_roundpd
+  ;CHECK:  vroundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+
+define <4 x double> @stack_fold_roundpd_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_roundpd_ymm
+  ;CHECK:  vroundpd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_roundps
+  ;CHECK:  vroundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_roundps_ymm
+  ;CHECK:  vroundps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+
+; TODO stack_fold_roundsd
+
+; TODO stack_fold_roundsd_int
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+; TODO stack_fold_roundss
+
+; TODO stack_fold_roundss_int
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+
+; TODO stack_fold_rsqrtps
+
+define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_rsqrtps_int
+  ;CHECK:       vrsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+; TODO stack_fold_rsqrtps_ymm
+
+define <8 x float> @stack_fold_rsqrtps_ymm_int(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_rsqrtps_ymm_int
+  ;CHECK:       vrsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
+; TODO stack_fold_rsqrtss
+
+define <4 x float> @stack_fold_rsqrtss_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_rsqrtss_int
+  ;CHECK:       vrsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_shufpd
+  ;CHECK:       vshufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_shufpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_shufpd_ymm
+  ;CHECK:       vshufpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
+  ret <4 x double> %2
+}
+
+define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_shufps
+  ;CHECK:       vshufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_shufps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_shufps_ymm
+  ;CHECK:       vshufps $148, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 9, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %2
+}
+
+define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_sqrtpd
+  ;CHECK:       vsqrtpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+define <4 x double> @stack_fold_sqrtpd_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_sqrtpd_ymm
+  ;CHECK:       vsqrtpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_sqrtps
+  ;CHECK:       vsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+define <8 x float> @stack_fold_sqrtps_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_sqrtps_ymm
+  ;CHECK:       vsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+define double @stack_fold_sqrtsd(double %a0) {
+  ;CHECK-LABEL: stack_fold_sqrtsd
+  ;CHECK:       vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call double @llvm.sqrt.f64(double %a0)
+  ret double %2
+}
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+define <2 x double> @stack_fold_sqrtsd_int(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_sqrtsd_int
+  ;CHECK:       vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define float @stack_fold_sqrtss(float %a0) {
+  ;CHECK-LABEL: stack_fold_sqrtss
+  ;CHECK:       vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call float @llvm.sqrt.f32(float %a0)
+  ret float %2
+}
+declare float @llvm.sqrt.f32(float) nounwind readnone
+
+define <4 x float> @stack_fold_sqrtss_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_sqrtss_int
+  ;CHECK:       vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_subpd
+  ;CHECK:       vsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fsub <2 x double> %a0, %a1
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_subpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_subpd_ymm
+  ;CHECK:       vsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fsub <4 x double> %a0, %a1
+  ret <4 x double> %2
+}
+
+define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_subps
+  ;CHECK:       vsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fsub <4 x float> %a0, %a1
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_subps_ymm
+  ;CHECK:       vsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fsub <8 x float> %a0, %a1
+  ret <8 x float> %2
+}
+
+define double @stack_fold_subsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_subsd
+  ;CHECK:       vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fsub double %a0, %a1
+  ret double %2
+}
+
+define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_subsd_int
+  ;CHECK:       vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_subss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_subss
+  ;CHECK:       vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fsub float %a0, %a1
+  ret float %2
+}
+
+define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_subss_int
+  ;CHECK:       vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_testpd
+  ;CHECK:       vtestpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @stack_fold_testpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_testpd_ymm
+  ;CHECK:       vtestpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_testps
+  ;CHECK:       vtestps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @stack_fold_testps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_testps_ymm
+  ;CHECK:       vtestps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define i32 @stack_fold_ucomisd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_ucomisd
+  ;CHECK:       vucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp ueq double %a0, %a1
+  %3 = select i1 %2, i32 1, i32 -1
+  ret i32 %3
+}
+
+define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_ucomisd_int
+  ;CHECK:       vucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @stack_fold_ucomiss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_ucomiss
+  ;CHECK:       vucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp ueq float %a0, %a1
+  %3 = select i1 %2, i32 1, i32 -1
+  ret i32 %3
+}
+
+define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_ucomiss_int
+  ;CHECK:       vucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_unpckhpd
+  ;CHECK:       vunpckhpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_unpckhpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_unpckhpd_ymm
+  ;CHECK:       vunpckhpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ret <4 x double> %2
+}
+
+define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_unpckhps
+  ;CHECK:       vunpckhps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_unpckhps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_unpckhps_ymm
+  ;CHECK:       vunpckhps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x float> %2
+}
+
+define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_unpcklpd
+  ;CHECK:       vunpcklpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_unpcklpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_unpcklpd_ymm
+  ;CHECK:       vunpcklpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x double> %2
+}
+
+define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_unpcklps
+  ;CHECK:       vunpcklps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_unpcklps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_unpcklps_ymm
+  ;CHECK:       vunpcklps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x float> %2
+}
+
+define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_xorpd
+  ;CHECK:       vxorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <2 x double> %a0 to <2 x i64>
+  %3 = bitcast <2 x double> %a1 to <2 x i64>
+  %4 = xor <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <2 x double>
+  ; fadd forces execution domain
+  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
+  ret <2 x double> %6
+}
+
+define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_xorpd_ymm
+  ;CHECK:       vxorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x double> %a0 to <4 x i64>
+  %3 = bitcast <4 x double> %a1 to <4 x i64>
+  %4 = xor <4 x i64> %2, %3
+  %5 = bitcast <4 x i64> %4 to <4 x double>
+  ; fadd forces execution domain
+  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %6
+}
+
+define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_xorps
+  ;CHECK:       vxorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x float> %a0 to <2 x i64>
+  %3 = bitcast <4 x float> %a1 to <2 x i64>
+  %4 = xor <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <4 x float>
+  ; fadd forces execution domain
+  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <4 x float> %6
+}
+
+define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_xorps_ymm
+  ;CHECK:       vxorps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <8 x float> %a0 to <4 x i64>
+  %3 = bitcast <8 x float> %a1 to <4 x i64>
+  %4 = xor <4 x i64> %2, %3
+  %5 = bitcast <4 x i64> %4 to <8 x float>
+  ; fadd forces execution domain
+  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <8 x float> %6
+}
diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll
new file mode 100644
index 0000000..c26cc9d
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -0,0 +1,1089 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.2 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_addpd
+  ;CHECK:       addpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fadd <2 x double> %a0, %a1
+  ret <2 x double> %2
+}
+
+define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_addps
+  ;CHECK:       addps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fadd <4 x float> %a0, %a1
+  ret <4 x float> %2
+}
+
+define double @stack_fold_addsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_addsd
+  ;CHECK:       addsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fadd double %a0, %a1
+  ret double %2
+}
+
+define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_addsd_int
+  ;CHECK:       addsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_addss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_addss
+  ;CHECK:       addss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fadd float %a0, %a1
+  ret float %2
+}
+
+define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_addss_int
+  ;CHECK:       addss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_addsubpd
+  ;CHECK:       addsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_addsubps
+  ;CHECK:       addsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_andnpd
+  ;CHECK:       andnpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <2 x double> %a0 to <2 x i64>
+  %3 = bitcast <2 x double> %a1 to <2 x i64>
+  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
+  %5 = and <2 x i64> %4, %3
+  %6 = bitcast <2 x i64> %5 to <2 x double>
+  ; fadd forces execution domain
+  %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
+  ret <2 x double> %7
+}
+
+define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_andnps
+  ;CHECK:       andnps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x float> %a0 to <2 x i64>
+  %3 = bitcast <4 x float> %a1 to <2 x i64>
+  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
+  %5 = and <2 x i64> %4, %3
+  %6 = bitcast <2 x i64> %5 to <4 x float>
+  ; fadd forces execution domain
+  %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <4 x float> %7
+}
+
+define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_andpd
+  ;CHECK:       andpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <2 x double> %a0 to <2 x i64>
+  %3 = bitcast <2 x double> %a1 to <2 x i64>
+  %4 = and <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <2 x double>
+  ; fadd forces execution domain
+  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
+  ret <2 x double> %6
+}
+
+define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_andps
+  ;CHECK:       andps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x float> %a0 to <2 x i64>
+  %3 = bitcast <4 x float> %a1 to <2 x i64>
+  %4 = and <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <4 x float>
+  ; fadd forces execution domain
+  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <4 x float> %6
+}
+
+define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_blendpd
+  ;CHECK:       blendpd $2, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
+  ret <2 x double> %2
+}
+
+define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_blendps
+  ;CHECK:       blendps $6, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
+  ret <4 x float> %2
+}
+
+define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
+  ;CHECK-LABEL: stack_fold_blendvpd
+  ;CHECK:       blendvpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
+  ;CHECK-LABEL: stack_fold_blendvps
+  ;CHECK:       blendvps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_cmppd
+  ;CHECK:       cmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_cmpps
+  ;CHECK:       cmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define i32 @stack_fold_cmpsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_cmpsd
+  ;CHECK:       cmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp oeq double %a0, %a1
+  %3 = zext i1 %2 to i32
+  ret i32 %3
+}
+
+define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_cmpsd_int
+  ;CHECK:       cmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define i32 @stack_fold_cmpss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_cmpss
+  ;CHECK:       cmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp oeq float %a0, %a1
+  %3 = zext i1 %2 to i32
+  ret i32 %3
+}
+
+define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_cmpss_int
+  ;CHECK:       cmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
+
+; TODO stack_fold_comisd
+
+define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_comisd_int
+  ;CHECK:       comisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+; TODO stack_fold_comiss
+
+define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_comiss_int
+  ;CHECK:       comiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtdq2pd
+  ;CHECK:       cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
+
+define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtdq2ps
+  ;CHECK:       cvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sitofp <4 x i32> %a0 to <4 x float>
+  ret <4 x float> %2
+}
+
+define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtpd2dq
+  ;CHECK:       cvtpd2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
+
+define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtpd2ps
+  ;CHECK:       cvtpd2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptrunc <2 x double> %a0 to <2 x float>
+  ret <2 x float> %2
+}
+
+define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2dq
+  ;CHECK:       cvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtps2pd
+  ;CHECK:       cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
+
+; TODO stack_fold_cvtsd2si
+
+define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsd2si_int
+  ;CHECK:       cvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+
+; TODO stack_fold_cvtsd2si64
+
+define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsd2si64_int
+  ;CHECK:       cvtsd2siq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
+  ret i64 %2
+}
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
+
+; TODO stack_fold_cvtsd2ss
+
+define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) optsize {
+  ;CHECK-LABEL: stack_fold_cvtsd2ss_int
+  ;CHECK:       cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
+
+define double @stack_fold_cvtsi2sd(i32 %a0) optsize {
+  ;CHECK-LABEL: stack_fold_cvtsi2sd
+  ;CHECK:       cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sitofp i32 %a0 to double
+  ret double %2
+}
+
+define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi2sd_int
+  ;CHECK:       cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 0x0, double 0x0>, i32 %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
+
+define double @stack_fold_cvtsi642sd(i64 %a0) optsize {
+  ;CHECK-LABEL: stack_fold_cvtsi642sd
+  ;CHECK:       cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sitofp i64 %a0 to double
+  ret double %2
+}
+
+define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi642sd_int
+  ;CHECK:       cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> <double 0x0, double 0x0>, i64 %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
+
+define float @stack_fold_cvtsi2ss(i32 %a0) optsize {
+  ;CHECK-LABEL: stack_fold_cvtsi2ss
+  ;CHECK:       cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sitofp i32 %a0 to float
+  ret float %2
+}
+
+define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi2ss_int
+  ;CHECK:  cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i32 %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
+
+define float @stack_fold_cvtsi642ss(i64 %a0) optsize {
+  ;CHECK-LABEL: stack_fold_cvtsi642ss
+  ;CHECK:       cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sitofp i64 %a0 to float
+  ret float %2
+}
+
+define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_cvtsi642ss_int
+  ;CHECK:  cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i64 %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
+
+define double @stack_fold_cvtss2sd(float %a0) optsize {
+  ;CHECK-LABEL: stack_fold_cvtss2sd
+  ;CHECK:       cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fpext float %a0 to double
+  ret double %2
+}
+
+define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) optsize {
+  ;CHECK-LABEL: stack_fold_cvtss2sd_int
+  ;CHECK:       cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
+
+; TODO stack_fold_cvtss2si
+
+define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtss2si_int
+  ;CHECK:       cvtss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
+
+; TODO stack_fold_cvtss2si64
+
+define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvtss2si64_int
+  ;CHECK:       cvtss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
+  ret i64 %2
+}
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+
+define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttpd2dq
+  ;CHECK:       cvttpd2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
+
+define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttps2dq
+  ;CHECK:       cvttps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi <4 x float> %a0 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define i32 @stack_fold_cvttsd2si(double %a0) {
+  ;CHECK-LABEL: stack_fold_cvttsd2si
+  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi double %a0 to i32
+  ret i32 %2
+}
+
+define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttsd2si_int
+  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
+
+define i64 @stack_fold_cvttsd2si64(double %a0) {
+  ;CHECK-LABEL: stack_fold_cvttsd2si64
+  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi double %a0 to i64
+  ret i64 %2
+}
+
+define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttsd2si64_int
+  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
+  ret i64 %2
+}
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
+
+define i32 @stack_fold_cvttss2si(float %a0) {
+  ;CHECK-LABEL: stack_fold_cvttss2si
+  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi float %a0 to i32
+  ret i32 %2
+}
+
+define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttss2si_int
+  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
+
+define i64 @stack_fold_cvttss2si64(float %a0) {
+  ;CHECK-LABEL: stack_fold_cvttss2si64
+  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fptosi float %a0 to i64
+  ret i64 %2
+}
+
+define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_cvttss2si64_int
+  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
+  ret i64 %2
+}
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_divpd
+  ;CHECK:       divpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fdiv <2 x double> %a0, %a1
+  ret <2 x double> %2
+}
+
+define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_divps
+  ;CHECK:       divps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fdiv <4 x float> %a0, %a1
+  ret <4 x float> %2
+}
+
+define double @stack_fold_divsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_divsd
+  ;CHECK:       divsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fdiv double %a0, %a1
+  ret double %2
+}
+
+define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_divsd_int
+  ;CHECK:       divsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_divss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_divss
+  ;CHECK:       divss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fdiv float %a0, %a1
+  ret float %2
+}
+
+define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_divss_int
+  ;CHECK:       divss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_dppd
+  ;CHECK:       dppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_dpps
+  ;CHECK:       dpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define i32 @stack_fold_extractps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_extractps
+  ;CHECK:       extractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+  ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+  %1 = extractelement <4 x float> %a0, i32 1
+  %2 = bitcast float %1 to i32
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %2
+}
+
+define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_haddpd
+  ;CHECK:       haddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_haddps
+  ;CHECK:       haddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_hsubpd
+  ;CHECK:       hsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_hsubps
+  ;CHECK:       hsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
+
+; TODO stack_fold_insertps
+
+define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_maxpd
+  ;CHECK:       maxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_maxps
+  ;CHECK:       maxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define double @stack_fold_maxsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_maxsd
+  ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp ogt double %a0, %a1
+  %3 = select i1 %2, double %a0, double %a1
+  ret double %3
+}
+
+define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_maxsd_int
+  ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_maxss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_maxss
+  ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp ogt float %a0, %a1
+  %3 = select i1 %2, float %a0, float %a1
+  ret float %3
+}
+
+define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_maxss_int
+  ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_minpd
+  ;CHECK:       minpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_minps
+  ;CHECK:       minps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define double @stack_fold_minsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_minsd
+  ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp olt double %a0, %a1
+  %3 = select i1 %2, double %a0, double %a1
+  ret double %3
+}
+
+define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_minsd_int
+  ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_minss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_minss
+  ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp olt float %a0, %a1
+  %3 = select i1 %2, float %a0, float %a1
+  ret float %3
+}
+
+define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_minss_int
+  ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_movddup
+  ;CHECK:   movddup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %2
+}
+; TODO stack_fold_movhpd (load / store)
+; TODO stack_fold_movhps (load / store)
+
+; TODO stack_fold_movlpd (load / store)
+; TODO stack_fold_movlps (load / store)
+
+define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_movshdup
+  ;CHECK:       movshdup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_movsldup
+  ;CHECK:       movsldup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x float> %2
+}
+
+define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_mulpd
+  ;CHECK:       mulpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fmul <2 x double> %a0, %a1
+  ret <2 x double> %2
+}
+
+define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_mulps
+  ;CHECK:       mulps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fmul <4 x float> %a0, %a1
+  ret <4 x float> %2
+}
+
+define double @stack_fold_mulsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_mulsd
+  ;CHECK:       mulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fmul double %a0, %a1
+  ret double %2
+}
+
+define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_mulsd_int
+  ;CHECK:       mulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_mulss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_mulss
+  ;CHECK:       mulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fmul float %a0, %a1
+  ret float %2
+}
+
+define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_mulss_int
+  ;CHECK:       mulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_orpd
+  ;CHECK:       orpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <2 x double> %a0 to <2 x i64>
+  %3 = bitcast <2 x double> %a1 to <2 x i64>
+  %4 = or <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <2 x double>
+  ; fadd forces execution domain
+  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
+  ret <2 x double> %6
+}
+
+define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_orps
+  ;CHECK:       orps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x float> %a0 to <2 x i64>
+  %3 = bitcast <4 x float> %a1 to <2 x i64>
+  %4 = or <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <4 x float>
+  ; fadd forces execution domain
+  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <4 x float> %6
+}
+
+; TODO stack_fold_rcpps
+
+define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_rcpps_int
+  ;CHECK:       rcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+; TODO stack_fold_rcpss
+; TODO stack_fold_rcpss_int
+
+define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_roundpd
+  ;CHECK:       roundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+
+define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_roundps
+  ;CHECK:       roundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+; TODO stack_fold_roundsd
+; TODO stack_fold_roundsd_int
+
+; TODO stack_fold_roundss
+; TODO stack_fold_roundss_int
+
+; TODO stack_fold_rsqrtps
+
+define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_rsqrtps_int
+  ;CHECK:       rsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+; TODO stack_fold_rsqrtss
+; TODO stack_fold_rsqrtss_int
+
+define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_shufpd
+  ;CHECK:       shufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
+  ret <2 x double> %2
+}
+
+define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_shufps
+  ;CHECK:       shufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
+  ret <4 x float> %2
+}
+
+define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_sqrtpd
+  ;CHECK:       sqrtpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_sqrtps
+  ;CHECK:       sqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+; TODO stack_fold_sqrtsd
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+; TODO stack_fold_sqrtsd_int
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+; TODO stack_fold_sqrtss
+declare float @llvm.sqrt.f32(float) nounwind readnone
+
+; TODO stack_fold_sqrtss_int
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_subpd
+  ;CHECK:       subpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fsub <2 x double> %a0, %a1
+  ret <2 x double> %2
+}
+
+define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_subps
+  ;CHECK:       subps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fsub <4 x float> %a0, %a1
+  ret <4 x float> %2
+}
+
+define double @stack_fold_subsd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_subsd
+  ;CHECK:       subsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fsub double %a0, %a1
+  ret double %2
+}
+
+define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_subsd_int
+  ;CHECK:       subsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_subss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_subss
+  ;CHECK:       subss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fsub float %a0, %a1
+  ret float %2
+}
+
+define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_subss_int
+  ;CHECK:       subss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @stack_fold_ucomisd(double %a0, double %a1) {
+  ;CHECK-LABEL: stack_fold_ucomisd
+  ;CHECK:       ucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp ueq double %a0, %a1
+  %3 = select i1 %2, i32 1, i32 -1
+  ret i32 %3
+}
+
+define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_ucomisd_int
+  ;CHECK:       ucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @stack_fold_ucomiss(float %a0, float %a1) {
+  ;CHECK-LABEL: stack_fold_ucomiss
+  ;CHECK:       ucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = fcmp ueq float %a0, %a1
+  %3 = select i1 %2, i32 1, i32 -1
+  ret i32 %3
+}
+
+define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_ucomiss_int
+  ;CHECK:       ucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_unpckhpd
+  ;CHECK:       unpckhpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
+  ret <2 x double> %2
+}
+
+define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_unpckhps
+  ;CHECK:       unpckhps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x float> %2
+}
+
+define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_unpcklpd
+  ;CHECK:       unpcklpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
+  ret <2 x double> %2
+}
+
+define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_unpcklps
+  ;CHECK:       unpcklps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x float> %2
+}
+
+define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_xorpd
+  ;CHECK:       xorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <2 x double> %a0 to <2 x i64>
+  %3 = bitcast <2 x double> %a1 to <2 x i64>
+  %4 = xor <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <2 x double>
+  ; fadd forces execution domain
+  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
+  ret <2 x double> %6
+}
+
+define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_xorps
+  ;CHECK:       xorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = bitcast <4 x float> %a0 to <2 x i64>
+  %3 = bitcast <4 x float> %a1 to <2 x i64>
+  %4 = xor <2 x i64> %2, %3
+  %5 = bitcast <2 x i64> %4 to <4 x float>
+  ; fadd forces execution domain
+  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
+  ret <4 x float> %6
+}
diff --git a/test/CodeGen/X86/stack-folding-int-avx1.ll b/test/CodeGen/X86/stack-folding-int-avx1.ll
new file mode 100644
index 0000000..2387493
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-int-avx1.ll
@@ -0,0 +1,1152 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+aes,+pclmul < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define <2 x i64> @stack_fold_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_aesdec
+  ;CHECK:       vaesdec {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_aesdeclast
+  ;CHECK:       vaesdeclast {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_aesenc
+  ;CHECK:       vaesenc {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_aesenclast
+  ;CHECK:       vaesenclast {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_aesimc(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_aesimc
+  ;CHECK:       vaesimc {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_aeskeygenassist
+  ;CHECK:       vaeskeygenassist $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
+
+define <4 x i32> @stack_fold_movd_load(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_movd_load
+  ;CHECK:       movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
+  ; add forces execution domain
+  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %3
+}
+
+define i32 @stack_fold_movd_store(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_movd_store
+  ;CHECK:       movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+  ; add forces execution domain
+  %1 = add <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
+  %2 = extractelement <4 x i32> %1, i32 0
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %2
+}
+
+define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_movq_load
+  ;CHECK:       movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64> %2
+}
+
+define i64 @stack_fold_movq_store(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_movq_store
+  ;CHECK:       movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+  %1 = extractelement <2 x i64> %a0, i32 0
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i64 %1
+}
+
+define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_mpsadbw
+  ;CHECK:       vmpsadbw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsb
+  ;CHECK:       vpabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsd
+  ;CHECK:       vpabsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsw
+  ;CHECK:       vpabsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packssdw
+  ;CHECK:       vpackssdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packsswb
+  ;CHECK:       vpacksswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packusdw
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packuswb
+  ;CHECK:       vpackuswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_paddb
+  ;CHECK:       vpaddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <16 x i8> %a0, %a1
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_paddd
+  ;CHECK:       vpaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <4 x i32> %a0, %a1
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_paddq
+  ;CHECK:       vpaddq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <2 x i64> %a0, %a1
+  ret <2 x i64> %2
+}
+
+define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_paddsb
+  ;CHECK:       vpaddsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_paddsw
+  ;CHECK:       vpaddsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_paddusb
+  ;CHECK:       vpaddusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_paddusw
+  ;CHECK:       vpaddusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_paddw
+  ;CHECK:       vpaddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <8 x i16> %a0, %a1
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @stack_fold_palignr(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_palignr
+  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <16 x i8> %a1, <16 x i8> %a0, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @stack_fold_pand(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pand
+  ;CHECK:       vpand {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = and <16 x i8> %a0, %a1
+  ; add forces execution domain
+  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @stack_fold_pandn(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pandn
+  ;CHECK:       vpandn {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = xor <16 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %3 = and <16 x i8> %2, %a1
+  ; add forces execution domain
+  %4 = add <16 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %4
+}
+
+define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgb
+  ;CHECK:       vpavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgw
+  ;CHECK:       vpavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
+  ;CHECK-LABEL: stack_fold_pblendvb
+  ;CHECK:       vpblendvb {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pblendw
+  ;CHECK:       vpblendw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pclmulqdq
+  ;CHECK:       vpclmulqdq $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqb
+  ;CHECK:       vpcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <16 x i8> %a0, %a1
+  %3 = sext <16 x i1> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <4 x i32> @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqd
+  ;CHECK:       vpcmpeqd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <4 x i32> %a0, %a1
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <2 x i64> @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqq
+  ;CHECK:       vpcmpeqq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <2 x i64> %a0, %a1
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqw
+  ;CHECK:       vpcmpeqw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <8 x i16> %a0, %a1
+  %3 = sext <8 x i1> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpestri
+  ;CHECK:       vpcmpestri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
+  %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpestrm
+  ;CHECK:       vpcmpestrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtb
+  ;CHECK:       vpcmpgtb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <16 x i8> %a0, %a1
+  %3 = sext <16 x i1> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <4 x i32> @stack_fold_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtd
+  ;CHECK:       vpcmpgtd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <4 x i32> %a0, %a1
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <2 x i64> @stack_fold_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtq
+  ;CHECK:       vpcmpgtq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <2 x i64> %a0, %a1
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtw
+  ;CHECK:       vpcmpgtw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <8 x i16> %a0, %a1
+  %3 = sext <8 x i1> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpistri
+  ;CHECK:       vpcmpistri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpistrm
+  ;CHECK:       vpcmpistrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+; TODO stack_fold_pextrb
+
+define i32 @stack_fold_pextrd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pextrd
+  ;CHECK:       pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+  ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+  %1 = extractelement <4 x i32> %a0, i32 1
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %1
+}
+
+define i64 @stack_fold_pextrq(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_pextrq
+  ;CHECK:       pextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+  ;CHECK:       movq    {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
+  %1 = extractelement <2 x i64> %a0, i32 1
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i64 %1
+}
+
+; TODO stack_fold_pextrw
+
+define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_phaddd
+  ;CHECK:       vphaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_phaddsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phaddsw
+  ;CHECK:       vphaddsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phaddw
+  ;CHECK:       vphaddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_phminposuw
+  ;CHECK:       vphminposuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_phsubd
+  ;CHECK:       vphsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_phsubsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phsubsw
+  ;CHECK:       vphsubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phsubw
+  ;CHECK:       vphsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrb
+  ;CHECK:       vpinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrd
+  ;CHECK:       vpinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrq
+  ;CHECK:       vpinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
+  ret <2 x i64> %2
+}
+
+define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrw
+  ;CHECK:       vpinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw
+  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddwd
+  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsb
+  ;CHECK:       vpmaxsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsd
+  ;CHECK:       vpmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsw
+  ;CHECK:       vpmaxsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxub
+  ;CHECK:       vpmaxub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxud
+  ;CHECK:       vpmaxud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxuw
+  ;CHECK:       vpmaxuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsb
+  ;CHECK:       vpminsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsd
+  ;CHECK:       vpminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsw
+  ;CHECK:       vpminsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pminub
+  ;CHECK:       vpminub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pminud
+  ;CHECK:       vpminud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pminuw
+  ;CHECK:       vpminuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbd
+  ;CHECK:       vpmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbq
+  ;CHECK:       pmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbw
+  ;CHECK:       vpmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxdq
+  ;CHECK:       vpmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxwd
+  ;CHECK:       vpmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxwq
+  ;CHECK:       vpmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbd
+  ;CHECK:       vpmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbq
+  ;CHECK:       vpmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbw
+  ;CHECK:       vpmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxdq
+  ;CHECK:       vpmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxwd
+  ;CHECK:       vpmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxwq
+  ;CHECK:       vpmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmuldq
+  ;CHECK:       vpmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulhrsw
+  ;CHECK:       vpmulhrsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmulhuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulhuw
+  ;CHECK:       vpmulhuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmulhw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulhw
+  ;CHECK:       vpmulhw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmulld(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulld
+  ;CHECK:       vpmulld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = mul <4 x i32> %a0, %a1
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @stack_fold_pmullw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmullw
+  ;CHECK:       vpmullw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = mul <8 x i16> %a0, %a1
+  ret <8 x i16> %2
+}
+
+define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmuludq
+  ;CHECK:       vpmuludq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_por(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_por
+  ;CHECK:       vpor {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = or <16 x i8> %a0, %a1
+  ; add forces execution domain
+  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %3
+}
+
+define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psadbw
+  ;CHECK:       vpsadbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pshufb
+  ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pshufd
+  ;CHECK:       vpshufd $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pshufhw
+  ;CHECK:       vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pshuflw
+  ;CHECK:       vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @stack_fold_psignb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psignb
+  ;CHECK:       vpsignb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_psignd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psignd
+  ;CHECK:       vpsignd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_psignw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psignw
+  ;CHECK:       vpsignw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pslld
+  ;CHECK:       vpslld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psllq
+  ;CHECK:       vpsllq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psllw
+  ;CHECK:       vpsllw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrad
+  ;CHECK:       vpsrad {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psraw
+  ;CHECK:       vpsraw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrld
+  ;CHECK:       vpsrld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlq
+  ;CHECK:       vpsrlq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlw
+  ;CHECK:       vpsrlw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubb
+  ;CHECK:       vpsubb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <16 x i8> %a0, %a1
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psubd
+  ;CHECK:       vpsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <4 x i32> %a0, %a1
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psubq
+  ;CHECK:       vpsubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <2 x i64> %a0, %a1
+  ret <2 x i64> %2
+}
+
+define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsb
+  ;CHECK:       vpsubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsw
+  ;CHECK:       vpsubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusb
+  ;CHECK:       vpsubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusw
+  ;CHECK:       vpsubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubw
+  ;CHECK:       vpsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <8 x i16> %a0, %a1
+  ret <8 x i16> %2
+}
+
+define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_ptest
+  ;CHECK:       vptest {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define i32 @stack_fold_ptest_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_ptest_ymm
+  ;CHECK:       vptest {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhbw
+  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_punpckhdq(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhdq
+  ;CHECK:       vpunpckhdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ; add forces execution domain
+  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %3
+}
+
+define <2 x i64> @stack_fold_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhqdq
+  ;CHECK:       vpunpckhqdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
+  ; add forces execution domain
+  %3 = add <2 x i64> %2, <i64 1, i64 1>
+  ret <2 x i64> %3
+}
+
+define <8 x i16> @stack_fold_punpckhwd(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhwd
+  ;CHECK:       vpunpckhwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @stack_fold_punpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpcklbw
+  ;CHECK:       vpunpcklbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_punpckldq(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckldq
+  ;CHECK:       vpunpckldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ; add forces execution domain
+  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %3
+}
+
+define <2 x i64> @stack_fold_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_punpcklqdq
+  ;CHECK:       vpunpcklqdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
+  ; add forces execution domain
+  %3 = add <2 x i64> %2, <i64 1, i64 1>
+  ret <2 x i64> %3
+}
+
+define <8 x i16> @stack_fold_punpcklwd(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_punpcklwd
+  ;CHECK:       vpunpcklwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @stack_fold_pxor(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pxor
+  ;CHECK:       vpxor {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = xor <16 x i8> %a0, %a1
+  ; add forces execution domain
+  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %3
+}
diff --git a/test/CodeGen/X86/stack-folding-int-avx2.ll b/test/CodeGen/X86/stack-folding-int-avx2.ll
new file mode 100644
index 0000000..39169e6
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-int-avx2.ll
@@ -0,0 +1,1200 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_broadcastsd_ymm
+  ;CHECK:       vbroadcastsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind readonly
+
+define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_broadcastss
+  ;CHECK:       vbroadcastss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readonly
+
+define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_broadcastss_ymm
+  ;CHECK:       vbroadcastss {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind readonly
+
+define <4 x i32> @stack_fold_extracti128(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_extracti128
+  ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  ; add forces execution domain
+  %1 = add <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_inserti128(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_inserti128
+  ;CHECK:       vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ; add forces execution domain
+  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %3
+}
+
+define <16 x i16> @stack_fold_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_mpsadbw
+  ;CHECK:       vmpsadbw $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
+
+define <32 x i8> @stack_fold_pabsb(<32 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsb
+  ;CHECK:       vpabsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
+
+define <8 x i32> @stack_fold_pabsd(<8 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsd
+  ;CHECK:       vpabsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_pabsw(<16 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsw
+  ;CHECK:       vpabsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packssdw
+  ;CHECK:       vpackssdw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @stack_fold_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packsswb
+  ;CHECK:       vpacksswb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packusdw
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @stack_fold_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packuswb
+  ;CHECK:       vpackuswb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_paddb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_paddb
+  ;CHECK:       vpaddb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <32 x i8> %a0, %a1
+  ret <32 x i8> %2
+}
+
+define <8 x i32> @stack_fold_paddd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_paddd
+  ;CHECK:       vpaddd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <8 x i32> %a0, %a1
+  ret <8 x i32> %2
+}
+
+define <4 x i64> @stack_fold_paddq(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_paddq
+  ;CHECK:       vpaddq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <4 x i64> %a0, %a1
+  ret <4 x i64> %2
+}
+
+define <32 x i8> @stack_fold_paddsb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_paddsb
+  ;CHECK:       vpaddsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @stack_fold_paddsw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_paddsw
+  ;CHECK:       vpaddsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_paddusb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_paddusb
+  ;CHECK:       vpaddusb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @stack_fold_paddusw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_paddusw
+  ;CHECK:       vpaddusw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_paddw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_paddw
+  ;CHECK:       vpaddw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <16 x i16> %a0, %a1
+  ret <16 x i16> %2
+}
+
+; TODO stack_fold_palignr
+; define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1)
+
+define <32 x i8> @stack_fold_pand(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pand
+  ;CHECK:       vpand {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = and <32 x i8> %a0, %a1
+  ; add forces execution domain
+  %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @stack_fold_pandn(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pandn
+  ;CHECK:       vpandn {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = xor <32 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %3 = and <32 x i8> %2, %a1
+  ; add forces execution domain
+  %4 = add <32 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <32 x i8> %4
+}
+
+define <32 x i8> @stack_fold_pavgb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgb
+  ;CHECK:       vpavgb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @stack_fold_pavgw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgw
+  ;CHECK:       vpavgw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pblendd
+  ;CHECK:       vpblendd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+  ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pblendd_ymm
+  ;CHECK:       vpblendd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %2
+}
+
+define <32 x i8> @stack_fold_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %c) {
+  ;CHECK-LABEL: stack_fold_pblendvb
+  ;CHECK:       vpblendvb {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a1, <32 x i8> %c, <32 x i8> %a0)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @stack_fold_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pblendw
+  ;CHECK:       vpblendw $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pbroadcastb(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pbroadcastb
+  ;CHECK:       vpbroadcastb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly
+
+define <32 x i8> @stack_fold_pbroadcastb_ymm(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pbroadcastb_ymm
+  ;CHECK:       vpbroadcastb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly
+
+define <4 x i32> @stack_fold_pbroadcastd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pbroadcastd
+  ;CHECK:       vpbroadcastd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0)
+  ; add forces execution domain
+  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
+
+define <8 x i32> @stack_fold_pbroadcastd_ymm(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pbroadcastd_ymm
+  ;CHECK:       vpbroadcastd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0)
+  ; add forces execution domain
+  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %3
+}
+declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly
+
+define <2 x i64> @stack_fold_pbroadcastq(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_pbroadcastq
+  ;CHECK:       vpbroadcastq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0)
+  ; add forces execution domain
+  %3 = add <2 x i64> %2, <i64 1, i64 1>
+  ret <2 x i64> %3
+}
+declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
+
+define <4 x i64> @stack_fold_pbroadcastq_ymm(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_pbroadcastq_ymm
+  ;CHECK:       vpbroadcastq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0)
+  ; add forces execution domain
+  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %3
+}
+declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
+
+define <8 x i16> @stack_fold_pbroadcastw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pbroadcastw
+  ;CHECK:       vpbroadcastw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly
+
+define <16 x i16> @stack_fold_pbroadcastw_ymm(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pbroadcastw_ymm
+  ;CHECK:       vpbroadcastw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
+
+define <32 x i8> @stack_fold_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqb
+  ;CHECK:       vpcmpeqb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <32 x i8> %a0, %a1
+  %3 = sext <32 x i1> %2 to <32 x i8>
+  ret <32 x i8> %3
+}
+
+define <8 x i32> @stack_fold_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqd
+  ;CHECK:       vpcmpeqd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <8 x i32> %a0, %a1
+  %3 = sext <8 x i1> %2 to <8 x i32>
+  ret <8 x i32> %3
+}
+
+define <4 x i64> @stack_fold_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqq
+  ;CHECK:       vpcmpeqq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <4 x i64> %a0, %a1
+  %3 = sext <4 x i1> %2 to <4 x i64>
+  ret <4 x i64> %3
+}
+
+define <16 x i16> @stack_fold_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqw
+  ;CHECK:       vpcmpeqw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <16 x i16> %a0, %a1
+  %3 = sext <16 x i1> %2 to <16 x i16>
+  ret <16 x i16> %3
+}
+
+define <32 x i8> @stack_fold_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtb
+  ;CHECK:       vpcmpgtb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <32 x i8> %a0, %a1
+  %3 = sext <32 x i1> %2 to <32 x i8>
+  ret <32 x i8> %3
+}
+
+define <8 x i32> @stack_fold_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtd
+  ;CHECK:       vpcmpgtd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <8 x i32> %a0, %a1
+  %3 = sext <8 x i1> %2 to <8 x i32>
+  ret <8 x i32> %3
+}
+
+define <4 x i64> @stack_fold_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtq
+  ;CHECK:       vpcmpgtq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <4 x i64> %a0, %a1
+  %3 = sext <4 x i1> %2 to <4 x i64>
+  ret <4 x i64> %3
+}
+
+define <16 x i16> @stack_fold_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtw
+  ;CHECK:       vpcmpgtw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <16 x i16> %a0, %a1
+  %3 = sext <16 x i1> %2 to <16 x i16>
+  ret <16 x i16> %3
+}
+
+define <8 x i32> @stack_fold_perm2i128(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_perm2i128
+  ;CHECK:   vperm2i128 $33, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  ; add forces execution domain
+  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %3
+}
+
+define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permd
+  ;CHECK:   vpermd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
+
+define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permpd
+  ;CHECK:   vpermpd $255, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ; fadd forces execution domain
+  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %3
+}
+
+define <8 x float> @stack_fold_permps(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_permps
+  ;CHECK:       vpermps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x float>) nounwind readonly
+
+define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_permq
+  ;CHECK:   vpermq $255, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ; add forces execution domain
+  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %3
+}
+
+define <8 x i32> @stack_fold_phaddd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_phaddd
+  ;CHECK:       vphaddd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_phaddsw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phaddsw
+  ;CHECK:       vphaddsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_phaddw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phaddw
+  ;CHECK:       vphaddw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @stack_fold_phsubd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_phsubd
+  ;CHECK:       vphsubd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_phsubsw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phsubsw
+  ;CHECK:       vphsubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_phsubw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phsubw
+  ;CHECK:       vphsubw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_pmaddubsw(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw
+  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @stack_fold_pmaddwd(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddwd
+  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsb
+  ;CHECK:       vpmaxsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @stack_fold_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsd
+  ;CHECK:       vpmaxsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_pmaxsw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsw
+  ;CHECK:       vpmaxsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_pmaxub(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxub
+  ;CHECK:       vpmaxub {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @stack_fold_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxud
+  ;CHECK:       vpmaxud {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxuw
+  ;CHECK:       vpmaxuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsb
+  ;CHECK:       vpminsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @stack_fold_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsd
+  ;CHECK:       vpminsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_pminsw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsw
+  ;CHECK:       vpminsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_pminub(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pminub
+  ;CHECK:       vpminub {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @stack_fold_pminud(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pminud
+  ;CHECK:       vpminud {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pminuw
+  ;CHECK:       vpminuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbd
+  ;CHECK:       vpmovsxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
+
+define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbq
+  ;CHECK:       pmovsxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
+
+define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbw
+  ;CHECK:       vpmovsxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
+
+define <4 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxdq
+  ;CHECK:       vpmovsxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxwd
+  ;CHECK:       vpmovsxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
+
+define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxwq
+  ;CHECK:       vpmovsxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
+
+define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbd
+  ;CHECK:       vpmovzxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
+
+define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbq
+  ;CHECK:       vpmovzxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
+
+define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbw
+  ;CHECK:       vpmovzxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
+
+define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxdq
+  ;CHECK:       vpmovzxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxwd
+  ;CHECK:       vpmovzxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
+
+define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxwq
+  ;CHECK:       vpmovzxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
+
+define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmuldq
+  ;CHECK:       vpmuldq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_pmulhrsw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulhrsw
+  ;CHECK:       vpmulhrsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_pmulhuw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulhuw
+  ;CHECK:       vpmulhuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_pmulhw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulhw
+  ;CHECK:       vpmulhw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @stack_fold_pmulld(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulld
+  ;CHECK:       vpmulld {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = mul <8 x i32> %a0, %a1
+  ret <8 x i32> %2
+}
+
+define <16 x i16> @stack_fold_pmullw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmullw
+  ;CHECK:       vpmullw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = mul <16 x i16> %a0, %a1
+  ret <16 x i16> %2
+}
+
+define <4 x i64> @stack_fold_pmuludq(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmuludq
+  ;CHECK:       vpmuludq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @stack_fold_por(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_por
+  ;CHECK:       vpor {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = or <32 x i8> %a0, %a1
+  ; add forces execution domain
+  %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <32 x i8> %3
+}
+
+define <4 x i64> @stack_fold_psadbw(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psadbw
+  ;CHECK:       vpsadbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <32 x i8> @stack_fold_pshufb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pshufb
+  ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @stack_fold_pshufd(<8 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pshufd
+  ;CHECK:       vpshufd $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %2
+}
+
+; TODO stack_fold_pshufhw
+
+; TODO stack_fold_pshuflw
+
+define <32 x i8> @stack_fold_psignb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psignb
+  ;CHECK:       vpsignb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @stack_fold_psignd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psignd
+  ;CHECK:       vpsignd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_psignw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psignw
+  ;CHECK:       vpsignw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @stack_fold_pslld(<8 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pslld
+  ;CHECK:       vpslld {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @stack_fold_psllq(<4 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psllq
+  ;CHECK:       vpsllq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psllvd
+  ;CHECK:       vpsllvd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psllvd_ymm
+  ;CHECK:       vpsllvd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psllvq
+  ;CHECK:       vpsllvq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psllvq_ymm
+  ;CHECK:       vpsllvq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <16 x i16> @stack_fold_psllw(<16 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psllw
+  ;CHECK:       vpsllw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i32> @stack_fold_psrad(<8 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrad
+  ;CHECK:       vpsrad {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psravd
+  ;CHECK:       vpsravd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psravd_ymm
+  ;CHECK:       vpsravd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_psraw(<16 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psraw
+  ;CHECK:       vpsraw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i32> @stack_fold_psrld(<8 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrld
+  ;CHECK:       vpsrld {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @stack_fold_psrlq(<4 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlq
+  ;CHECK:       vpsrlq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvd
+  ;CHECK:       vpsrlvd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvd_ymm
+  ;CHECK:       vpsrlvd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvq
+  ;CHECK:       vpsrlvq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvq_ymm
+  ;CHECK:       vpsrlvq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <16 x i16> @stack_fold_psrlw(<16 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlw
+  ;CHECK:       vpsrlw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_psubb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubb
+  ;CHECK:       vpsubb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <32 x i8> %a0, %a1
+  ret <32 x i8> %2
+}
+
+define <8 x i32> @stack_fold_psubd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psubd
+  ;CHECK:       vpsubd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <8 x i32> %a0, %a1
+  ret <8 x i32> %2
+}
+
+define <4 x i64> @stack_fold_psubq(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psubq
+  ;CHECK:       vpsubq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <4 x i64> %a0, %a1
+  ret <4 x i64> %2
+}
+
+define <32 x i8> @stack_fold_psubsb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsb
+  ;CHECK:       vpsubsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @stack_fold_psubsw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsw
+  ;CHECK:       vpsubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_psubusb(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusb
+  ;CHECK:       vpsubusb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @stack_fold_psubusw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusw
+  ;CHECK:       vpsubusw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_psubw(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubw
+  ;CHECK:       vpsubw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <16 x i16> %a0, %a1
+  ret <16 x i16> %2
+}
+
+define <32 x i8> @stack_fold_punpckhbw(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhbw
+  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  ret <32 x i8> %2
+}
+
+define <8 x i32> @stack_fold_punpckhdq(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhdq
+  ;CHECK:       vpunpckhdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ; add forces execution domain
+  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %3
+}
+
+define <4 x i64> @stack_fold_punpckhqdq(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhqdq
+  ;CHECK:       vpunpckhqdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ; add forces execution domain
+  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %3
+}
+
+define <16 x i16> @stack_fold_punpckhwd(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhwd
+  ;CHECK:       vpunpckhwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i16> %2
+}
+
+define <32 x i8> @stack_fold_punpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpcklbw
+  ;CHECK:       vpunpcklbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  ret <32 x i8> %2
+}
+
+define <8 x i32> @stack_fold_punpckldq(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckldq
+  ;CHECK:       vpunpckldq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ; add forces execution domain
+  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %3
+}
+
+define <4 x i64> @stack_fold_punpcklqdq(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_punpcklqdq
+  ;CHECK:       vpunpcklqdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ; add forces execution domain
+  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %3
+}
+
+define <16 x i16> @stack_fold_punpcklwd(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_punpcklwd
+  ;CHECK:       vpunpcklwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+  ret <16 x i16> %2
+}
+
+define <32 x i8> @stack_fold_pxor(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pxor
+  ;CHECK:       vpxor {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = xor <32 x i8> %a0, %a1
+  ; add forces execution domain
+  %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <32 x i8> %3
+}
diff --git a/test/CodeGen/X86/stack-folding-int-sse42.ll b/test/CodeGen/X86/stack-folding-int-sse42.ll
new file mode 100644
index 0000000..099a5db
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-int-sse42.ll
@@ -0,0 +1,1143 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.2,+aes,+pclmul < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define <2 x i64> @stack_fold_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_aesdec
+  ;CHECK:       aesdec {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_aesdeclast
+  ;CHECK:       aesdeclast {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_aesenc
+  ;CHECK:       aesenc {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_aesenclast
+  ;CHECK:       aesenclast {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_aesimc(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_aesimc
+  ;CHECK:       aesimc {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_aeskeygenassist
+  ;CHECK:       aeskeygenassist $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
+
+define <4 x i32> @stack_fold_movd_load(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_movd_load
+  ;CHECK:       movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
+  ; add forces execution domain
+  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %3
+}
+
+define i32 @stack_fold_movd_store(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_movd_store
+  ;CHECK:       movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+  ; add forces execution domain
+  %1 = add <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
+  %2 = extractelement <4 x i32> %1, i32 0
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %2
+}
+
+define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_movq_load
+  ;CHECK:       movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64> %2
+}
+
+define i64 @stack_fold_movq_store(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_movq_store
+  ;CHECK:       movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+  %1 = extractelement <2 x i64> %a0, i32 0
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i64 %1
+}
+
+define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_mpsadbw
+  ;CHECK:       mpsadbw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsb
+  ;CHECK:       pabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsd
+  ;CHECK:       pabsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsw
+  ;CHECK:       pabsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packssdw
+  ;CHECK:       packssdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packsswb
+  ;CHECK:       packsswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packusdw
+  ;CHECK:       packusdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packuswb
+  ;CHECK:       packuswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_paddb
+  ;CHECK:       paddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <16 x i8> %a0, %a1
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_paddd
+  ;CHECK:       paddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <4 x i32> %a0, %a1
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_paddq
+  ;CHECK:       paddq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <2 x i64> %a0, %a1
+  ret <2 x i64> %2
+}
+
+define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_paddsb
+  ;CHECK:       paddsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_paddsw
+  ;CHECK:       paddsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_paddusb
+  ;CHECK:       paddusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_paddusw
+  ;CHECK:       paddusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_paddw
+  ;CHECK:       paddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = add <8 x i16> %a0, %a1
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @stack_fold_palignr(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_palignr
+  ;CHECK:       palignr $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <16 x i8> %a1, <16 x i8> %a0, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @stack_fold_pand(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pand
+  ;CHECK:       pand {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = and <16 x i8> %a0, %a1
+  ; add forces execution domain
+  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @stack_fold_pandn(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pandn
+  ;CHECK:       pandn {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = xor <16 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %3 = and <16 x i8> %2, %a1
+  ; add forces execution domain
+  %4 = add <16 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %4
+}
+
+define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgb
+  ;CHECK:       pavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgw
+  ;CHECK:       pavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
+  ;CHECK-LABEL: stack_fold_pblendvb
+  ;CHECK:       pblendvb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pblendw
+  ;CHECK:       pblendw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pclmulqdq
+  ;CHECK:       pclmulqdq $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqb
+  ;CHECK:       pcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <16 x i8> %a0, %a1
+  %3 = sext <16 x i1> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <4 x i32> @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqd
+  ;CHECK:       pcmpeqd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <4 x i32> %a0, %a1
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <2 x i64> @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqq
+  ;CHECK:       pcmpeqq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <2 x i64> %a0, %a1
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpeqw
+  ;CHECK:       pcmpeqw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp eq <8 x i16> %a0, %a1
+  %3 = sext <8 x i1> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpestri
+  ;CHECK:       pcmpestri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
+  %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpestrm
+  ;CHECK:       pcmpestrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtb
+  ;CHECK:       pcmpgtb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <16 x i8> %a0, %a1
+  %3 = sext <16 x i1> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <4 x i32> @stack_fold_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtd
+  ;CHECK:       pcmpgtd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <4 x i32> %a0, %a1
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <2 x i64> @stack_fold_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtq
+  ;CHECK:       pcmpgtq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <2 x i64> %a0, %a1
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpgtw
+  ;CHECK:       pcmpgtw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = icmp sgt <8 x i16> %a0, %a1
+  %3 = sext <8 x i1> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpistri
+  ;CHECK:       pcmpistri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pcmpistrm
+  ;CHECK:       pcmpistrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+; TODO stack_fold_pextrb
+
+define i32 @stack_fold_pextrd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pextrd
+  ;CHECK:       pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+  ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+  %1 = extractelement <4 x i32> %a0, i32 1
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %1
+}
+
+define i64 @stack_fold_pextrq(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_pextrq
+  ;CHECK:       pextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+  ;CHECK:       movq    {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
+  %1 = extractelement <2 x i64> %a0, i32 1
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i64 %1
+}
+
+; TODO stack_fold_pextrw
+
+define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_phaddd
+  ;CHECK:       phaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_phaddsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phaddsw
+  ;CHECK:       phaddsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phaddw
+  ;CHECK:       phaddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_phminposuw
+  ;CHECK:       phminposuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_phsubd
+  ;CHECK:       phsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_phsubsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phsubsw
+  ;CHECK:       phsubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_phsubw
+  ;CHECK:       phsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrb
+  ;CHECK:       pinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrd
+  ;CHECK:       pinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrq
+  ;CHECK:       pinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
+  ret <2 x i64> %2
+}
+
+define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrw
+  ;CHECK:       pinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw
+  ;CHECK:       pmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddwd
+  ;CHECK:       pmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsb
+  ;CHECK:       pmaxsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsd
+  ;CHECK:       pmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsw
+  ;CHECK:       pmaxsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxub
+  ;CHECK:       pmaxub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxud
+  ;CHECK:       pmaxud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxuw
+  ;CHECK:       pmaxuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsb
+  ;CHECK:       pminsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsd
+  ;CHECK:       pminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsw
+  ;CHECK:       pminsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pminub
+  ;CHECK:       pminub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pminud
+  ;CHECK:       pminud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pminuw
+  ;CHECK:       pminuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbd
+  ;CHECK:       pmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbq
+  ;CHECK:       pmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbw
+  ;CHECK:       pmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxdq
+  ;CHECK:       pmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxwd
+  ;CHECK:       pmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxwq
+  ;CHECK:       pmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbd
+  ;CHECK:       pmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbq
+  ;CHECK:       pmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbw
+  ;CHECK:       pmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxdq
+  ;CHECK:       pmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxwd
+  ;CHECK:       pmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxwq
+  ;CHECK:       pmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmuldq
+  ;CHECK:       pmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulhrsw
+  ;CHECK:       pmulhrsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmulhuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulhuw
+  ;CHECK:       pmulhuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmulhw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulhw
+  ;CHECK:       pmulhw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmulld(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmulld
+  ;CHECK:       pmulld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = mul <4 x i32> %a0, %a1
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @stack_fold_pmullw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmullw
+  ;CHECK:       pmullw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = mul <8 x i16> %a0, %a1
+  ret <8 x i16> %2
+}
+
+define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmuludq
+  ;CHECK:       pmuludq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_por(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_por
+  ;CHECK:       por {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = or <16 x i8> %a0, %a1
+  ; add forces execution domain
+  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %3
+}
+
+define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psadbw
+  ;CHECK:       psadbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pshufb
+  ;CHECK:       pshufb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pshufd
+  ;CHECK:       pshufd $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pshufhw
+  ;CHECK:       pshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pshuflw
+  ;CHECK:       pshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @stack_fold_psignb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psignb
+  ;CHECK:       psignb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_psignd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psignd
+  ;CHECK:       psignd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_psignw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psignw
+  ;CHECK:       psignw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pslld
+  ;CHECK:       pslld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psllq
+  ;CHECK:       psllq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psllw
+  ;CHECK:       psllw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrad
+  ;CHECK:       psrad {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psraw
+  ;CHECK:       psraw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrld
+  ;CHECK:       psrld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlq
+  ;CHECK:       psrlq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlw
+  ;CHECK:       psrlw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubb
+  ;CHECK:       psubb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <16 x i8> %a0, %a1
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psubd
+  ;CHECK:       psubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <4 x i32> %a0, %a1
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psubq
+  ;CHECK:       psubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <2 x i64> %a0, %a1
+  ret <2 x i64> %2
+}
+
+define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsb
+  ;CHECK:       psubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsw
+  ;CHECK:       psubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusb
+  ;CHECK:       psubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusw
+  ;CHECK:       psubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubw
+  ;CHECK:       psubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = sub <8 x i16> %a0, %a1
+  ret <8 x i16> %2
+}
+
+define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_ptest
+  ;CHECK:       ptest {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhbw
+  ;CHECK:       punpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_punpckhdq(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhdq
+  ;CHECK:       punpckhdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ; add forces execution domain
+  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %3
+}
+
+define <2 x i64> @stack_fold_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhqdq
+  ;CHECK:       punpckhqdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
+  ; add forces execution domain
+  %3 = add <2 x i64> %2, <i64 1, i64 1>
+  ret <2 x i64> %3
+}
+
+define <8 x i16> @stack_fold_punpckhwd(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhwd
+  ;CHECK:       punpckhwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @stack_fold_punpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpcklbw
+  ;CHECK:       punpcklbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_punpckldq(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckldq
+  ;CHECK:       punpckldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ; add forces execution domain
+  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %3
+}
+
+define <2 x i64> @stack_fold_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_punpcklqdq
+  ;CHECK:       punpcklqdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
+  ; add forces execution domain
+  %3 = add <2 x i64> %2, <i64 1, i64 1>
+  ret <2 x i64> %3
+}
+
+define <8 x i16> @stack_fold_punpcklwd(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_punpcklwd
+  ;CHECK:       punpcklwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i16> %2
+}
+
+define <16 x i8> @stack_fold_pxor(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pxor
+  ;CHECK:       pxor {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = xor <16 x i8> %a0, %a1
+  ; add forces execution domain
+  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %3
+}
diff --git a/test/CodeGen/X86/stack-folding-xop.ll b/test/CodeGen/X86/stack-folding-xop.ll
new file mode 100644
index 0000000..44a0d1d
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-xop.ll
@@ -0,0 +1,718 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+xop < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define <2 x double> @stack_fold_vfrczpd(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_vfrczpd
+  ;CHECK:       vfrczpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double>) nounwind readnone
+
+define <4 x double> @stack_fold_vfrczpd_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_vfrczpd_ymm
+  ;CHECK:       vfrczpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %a0)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_vfrczps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_vfrczps
+  ;CHECK:       vfrczps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float>) nounwind readnone
+
+define <8 x float> @stack_fold_vfrczps_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_vfrczps_ymm
+  ;CHECK:       vfrczps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %a0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_vfrczsd(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_vfrczsd
+  ;CHECK:       vfrczsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
+
+define <4 x float> @stack_fold_vfrczss(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_vfrczss
+  ;CHECK:       vfrczss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
+
+define <2 x i64> @stack_fold_vpcmov_rm(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+  ;CHECK-LABEL: stack_fold_vpcmov_rm
+  ;CHECK:       vpcmov {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2)
+  ret <2 x i64> %2
+}
+define <2 x i64> @stack_fold_vpcmov_mr(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+  ;CHECK-LABEL: stack_fold_vpcmov_mr
+  ;CHECK:       vpcmov {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a2, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @stack_fold_vpcmov_rm_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+  ;CHECK-LABEL: stack_fold_vpcmov_rm_ymm
+  ;CHECK:       vpcmov {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2)
+  ret <4 x i64> %2
+}
+define <4 x i64> @stack_fold_vpcmov_mr_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+  ;CHECK-LABEL: stack_fold_vpcmov_mr_ymm
+  ;CHECK:       vpcmov {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a2, <4 x i64> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
+
+define <16 x i8> @stack_fold_vpcomb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_vpcomb
+  ;CHECK:       vpcomltb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %a0, <16 x i8> %a1, i8 0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <4 x i32> @stack_fold_vpcomd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_vpcomd
+  ;CHECK:       vpcomltd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %a0, <4 x i32> %a1, i8 0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @stack_fold_vpcomq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_vpcomq
+  ;CHECK:       vpcomltq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_vpcomub(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_vpcomub
+  ;CHECK:       vpcomltub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %a0, <16 x i8> %a1, i8 0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <4 x i32> @stack_fold_vpcomud(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_vpcomud
+  ;CHECK:       vpcomltud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %a0, <4 x i32> %a1, i8 0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @stack_fold_vpcomuq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_vpcomuq
+  ;CHECK:       vpcomltuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <8 x i16> @stack_fold_vpcomuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_vpcomuw
+  ;CHECK:       vpcomltuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %a0, <8 x i16> %a1, i8 0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <8 x i16> @stack_fold_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_vpcomw
+  ;CHECK:       vpcomltw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %a0, <8 x i16> %a1, i8 0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ;CHECK-LABEL: stack_fold_vpermil2pd_rm
+  ;CHECK:       vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 0)
+  ret <2 x double> %2
+}
+define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ;CHECK-LABEL: stack_fold_vpermil2pd_mr
+  ;CHECK:       vpermil2pd $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x double> %a1, i8 0)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+  ;CHECK-LABEL: stack_fold_vpermil2pd_rm
+  ;CHECK:       vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 0)
+  ret <4 x double> %2
+}
+define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+  ;CHECK-LABEL: stack_fold_vpermil2pd_mr
+  ;CHECK:       vpermil2pd $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x double> %a1, i8 0)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ;CHECK-LABEL: stack_fold_vpermil2ps_rm
+  ;CHECK:       vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 0)
+  ret <4 x float> %2
+}
+define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ;CHECK-LABEL: stack_fold_vpermil2ps_mr
+  ;CHECK:       vpermil2ps $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x float> %a1, i8 0)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+  ;CHECK-LABEL: stack_fold_vpermil2ps_rm
+  ;CHECK:       vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 0)
+  ret <8 x float> %2
+}
+define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+  ;CHECK-LABEL: stack_fold_vpermil2ps_mr
+  ;CHECK:       vpermil2ps $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x float> %a1, i8 0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <4 x i32> @stack_fold_vphaddbd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_vphaddbd
+  ;CHECK:       vphaddbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_vphaddbq(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_vphaddbq
+  ;CHECK:       vphaddbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_vphaddbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_vphaddbw
+  ;CHECK:       vphaddbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_vphadddq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vphadddq
+  ;CHECK:       vphadddq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_vphaddubd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_vphaddubd
+  ;CHECK:       vphaddubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_vphaddubq(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_vphaddubq
+  ;CHECK:       vphaddubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_vphaddubw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_vphaddubw
+  ;CHECK:       vphaddubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_vphaddudq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vphaddudq
+  ;CHECK:       vphaddudq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_vphadduwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vphadduwd
+  ;CHECK:       vphadduwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_vphadduwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vphadduwq
+  ;CHECK:       vphadduwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_vphaddwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vphaddwd
+  ;CHECK:       vphaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_vphaddwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vphaddwq
+  ;CHECK:       vphaddwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_vphsubbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_vphsubbw
+  ;CHECK:       vphsubbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_vphsubdq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vphsubdq
+  ;CHECK:       vphsubdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_vphsubwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vphsubwd
+  ;CHECK:       vphsubwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmacsdd
+  ;CHECK:       vpmacsdd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmacsdqh
+  ;CHECK:       vpmacsdqh {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmacsdql
+  ;CHECK:       vpmacsdql {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <4 x i32> @stack_fold_vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmacssdd
+  ;CHECK:       vpmacssdd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmacssdqh
+  ;CHECK:       vpmacssdqh {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @stack_fold_vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmacssdql
+  ;CHECK:       vpmacssdql {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <4 x i32> @stack_fold_vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmacsswd
+  ;CHECK:       vpmacsswd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmacssww
+  ;CHECK:       vpmacssww {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmacswd
+  ;CHECK:       vpmacswd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @stack_fold_vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmacsww
+  ;CHECK:       vpmacsww {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmadcsswd
+  ;CHECK:       vpmadcsswd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: stack_fold_vpmadcswd
+  ;CHECK:       vpmadcswd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_vpperm_rm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+  ;CHECK-LABEL: stack_fold_vpperm_rm
+  ;CHECK:       vpperm {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2)
+  ret <16 x i8> %2
+}
+define <16 x i8> @stack_fold_vpperm_mr(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+  ;CHECK-LABEL: stack_fold_vpperm_mr
+  ;CHECK:       vpperm {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a2, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <16 x i8> @stack_fold_vprotb(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_vprotb
+  ;CHECK:       vprotb $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8> %a0, i8 7)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_vprotb_rm(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_vprotb_rm
+  ;CHECK:       vprotb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+define <16 x i8> @stack_fold_vprotb_mr(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_vprotb_mr
+  ;CHECK:       vprotb {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %a1, <16 x i8> %a0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.xop.vprotb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_vprotd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vprotd
+  ;CHECK:       vprotd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32> %a0, i8 7)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32>, i8) nounwind readnone
+
+define <4 x i32> @stack_fold_vprotd_rm(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_vprotd_rm
+  ;CHECK:       vprotd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+define <4 x i32> @stack_fold_vprotd_mr(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_vprotd_mr
+  ;CHECK:       vprotd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %a1, <4 x i32> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vprotd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_vprotq(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_vprotq
+  ;CHECK:       vprotq $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %a0, i8 7)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64>, i8) nounwind readnone
+
+define <2 x i64> @stack_fold_vprotq_rm(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_vprotq_rm
+  ;CHECK:       vprotq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+define <2 x i64> @stack_fold_vprotq_mr(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_vprotq_mr
+  ;CHECK:       vprotq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a1, <2 x i64> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vprotq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @stack_fold_vprotw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vprotw
+  ;CHECK:       vprotw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16> %a0, i8 7)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16>, i8) nounwind readnone
+
+define <8 x i16> @stack_fold_vprotw_rm(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_vprotw_rm
+  ;CHECK:       vprotw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+define <8 x i16> @stack_fold_vprotw_mr(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_vprotw_mr
+  ;CHECK:       vprotw {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %a1, <8 x i16> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vprotw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_vpshab_rm(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshab_rm
+  ;CHECK:       vpshab {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+define <16 x i8> @stack_fold_vpshab_mr(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshab_mr
+  ;CHECK:       vpshab {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %a1, <16 x i8> %a0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.xop.vpshab(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_vpshad_rm(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshad_rm
+  ;CHECK:       vpshad {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+define <4 x i32> @stack_fold_vpshad_mr(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshad_mr
+  ;CHECK:       vpshad {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %a1, <4 x i32> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpshad(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_vpshaq_rm(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshaq_rm
+  ;CHECK:       vpshaq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+define <2 x i64> @stack_fold_vpshaq_mr(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshaq_mr
+  ;CHECK:       vpshaq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a1, <2 x i64> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @stack_fold_vpshaw_rm(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshaw_rm
+  ;CHECK:       vpshaw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+define <8 x i16> @stack_fold_vpshaw_mr(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshaw_mr
+  ;CHECK:       vpshaw {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %a1, <8 x i16> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_vpshlb_rm(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshlb_rm
+  ;CHECK:       vpshlb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+define <16 x i8> @stack_fold_vpshlb_mr(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshlb_mr
+  ;CHECK:       vpshlb {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %a1, <16 x i8> %a0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_vpshld_rm(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshld_rm
+  ;CHECK:       vpshld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+define <4 x i32> @stack_fold_vpshld_mr(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshld_mr
+  ;CHECK:       vpshld {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %a1, <4 x i32> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.xop.vpshld(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_vpshlq_rm(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshlq_rm
+  ;CHECK:       vpshlq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+define <2 x i64> @stack_fold_vpshlq_mr(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshlq_mr
+  ;CHECK:       vpshlq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a1, <2 x i64> %a0)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @stack_fold_vpshlw_rm(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshlw_rm
+  ;CHECK:       vpshlw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+define <8 x i16> @stack_fold_vpshlw_mr(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_vpshlw_mr
+  ;CHECK:       vpshlw {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %a1, <8 x i16> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/stack-probe-size.ll b/test/CodeGen/X86/stack-probe-size.ll
new file mode 100644
index 0000000..21482c3
--- /dev/null
+++ b/test/CodeGen/X86/stack-probe-size.ll
@@ -0,0 +1,78 @@
+; This test is attempting to detect that the compiler correctly generates stack
+; probe calls when the size of the local variables exceeds the specified stack
+; probe size.
+;
+; Testing the default value of 4096 bytes makes sense, because the default
+; stack probe size equals the page size (4096 bytes for all x86 targets), and
+; this is unlikely to change in the future.
+;
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+define i32 @test1() "stack-probe-size"="0" {
+  %buffer = alloca [4095 x i8]
+
+  ret i32 0
+
+; CHECK-LABEL: _test1:
+; CHECK-NOT: subl $4095, %esp
+; CHECK: movl $4095, %eax
+; CHECK: calll __chkstk
+}
+
+define i32 @test2() {
+  %buffer = alloca [4095 x i8]
+
+  ret i32 0
+
+; CHECK-LABEL: _test2:
+; CHECK-NOT: movl $4095, %eax
+; CHECK: subl $4095, %esp
+; CHECK-NOT: calll __chkstk
+}
+
+define i32 @test3() "stack-probe-size"="8192" {
+  %buffer = alloca [4095 x i8]
+
+  ret i32 0
+
+; CHECK-LABEL: _test3:
+; CHECK-NOT: movl $4095, %eax
+; CHECK: subl $4095, %esp
+; CHECK-NOT: calll __chkstk
+}
+
+define i32 @test4() "stack-probe-size"="0" {
+  %buffer = alloca [4096 x i8]
+
+  ret i32 0
+
+; CHECK-LABEL: _test4:
+; CHECK-NOT: subl $4096, %esp
+; CHECK: movl $4096, %eax
+; CHECK: calll __chkstk
+}
+
+define i32 @test5() {
+  %buffer = alloca [4096 x i8]
+
+  ret i32 0
+
+; CHECK-LABEL: _test5:
+; CHECK-NOT: subl $4096, %esp
+; CHECK: movl $4096, %eax
+; CHECK: calll __chkstk
+}
+
+define i32 @test6() "stack-probe-size"="8192" {
+  %buffer = alloca [4096 x i8]
+
+  ret i32 0
+
+; CGECK-LABEL: _test6:
+; CGECK-NOT: movl $4096, %eax
+; CGECK: subl $4096, %esp
+; CGECK-NOT: calll __chkstk
+}
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index cf0f999..a84b77e 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -10,9 +10,9 @@
 ; Function Attrs: nounwind sspreq
 define i32 @_Z18read_response_sizev() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !39
+  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23, metadata !{!"0x102"}), !dbg !39
   %0 = load i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0), align 8, !dbg !40
-  tail call void @llvm.dbg.value(metadata !63, i64 0, metadata !64, metadata !{metadata !"0x102"}), !dbg !71
+  tail call void @llvm.dbg.value(metadata i32 undef, i64 0, metadata !64, metadata !{!"0x102"}), !dbg !71
   %1 = trunc i64 %0 to i32
   ret i32 %1
 }
@@ -25,73 +25,73 @@ attributes #0 = { sspreq }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21, !72}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \001\00\000\00\001", metadata !1, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<unknown>", metadata !"/Users/matt/ryan_bug"}
-!2 = metadata !{metadata !3}
-!3 = metadata !{metadata !"0x4\00\0020\0032\0032\000\000\000", metadata !1, metadata !4, null, metadata !6, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
-!4 = metadata !{metadata !"0x13\00C\0019\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{}
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0x28\00max_frame_size\000"} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
-!8 = metadata !{metadata !9, metadata !24, metadata !41, metadata !65}
-!9 = metadata !{metadata !"0x2e\00read_response_size\00read_response_size\00_Z18read_response_sizev\0027\000\001\000\006\00256\001\0027", metadata !1, metadata !10, metadata !11, null, i32 ()* @_Z18read_response_sizev, null, null, metadata !14} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
-!10 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{metadata !15, metadata !19}
-!15 = metadata !{metadata !"0x100\00b\0028\000", metadata !9, metadata !10, metadata !16} ; [ DW_TAG_auto_variable ] [b] [line 28]
-!16 = metadata !{metadata !"0x13\00B\0016\0032\0032\000\000\000", metadata !1, null, null, metadata !17, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ]
-!17 = metadata !{metadata !18}
-!18 = metadata !{metadata !"0xd\00end_of_file\0017\0032\0032\000\000", metadata !1, metadata !16, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int]
-!19 = metadata !{metadata !"0x100\00c\0029\000", metadata !9, metadata !10, metadata !13} ; [ DW_TAG_auto_variable ] [c] [line 29]
-!20 = metadata !{}
-!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!22 = metadata !{i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0)}
-!23 = metadata !{metadata !"0x101\00p2\0033554444\000", metadata !24, metadata !10, metadata !32, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12]
-!24 = metadata !{metadata !"0x2e\00min<unsigned long long>\00min<unsigned long long>\00_ZN3__13minIyEERKT_S3_RS1_\0012\000\001\000\006\00256\001\0012", metadata !1, metadata !25, metadata !27, null, null, metadata !33, null, metadata !35} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>]
-!25 = metadata !{metadata !"0x39\00__1\001", metadata !26, null} ; [ DW_TAG_namespace ] [__1] [line 1]
-!26 = metadata !{metadata !"main.cpp", metadata !"/Users/matt/ryan_bug"}
-!27 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!28 = metadata !{metadata !29, metadata !29, metadata !32}
-!29 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!30 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
-!31 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!32 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
-!33 = metadata !{metadata !34}
-!34 = metadata !{metadata !"0x2f\00_Tp\000\000", null, metadata !31, null} ; [ DW_TAG_template_type_parameter ]
-!35 = metadata !{metadata !36, metadata !37}
-!36 = metadata !{metadata !"0x101\00p1\0016777228\000", metadata !24, metadata !10, metadata !29} ; [ DW_TAG_arg_variable ] [p1] [line 12]
-!37 = metadata !{metadata !"0x101\00p2\0033554444\000", metadata !24, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ] [p2] [line 12]
-!38 = metadata !{i32 33, i32 0, metadata !9, null}
-!39 = metadata !{i32 12, i32 0, metadata !24, metadata !38}
-!40 = metadata !{i32 9, i32 0, metadata !41, metadata !59}
-!41 = metadata !{metadata !"0x2e\00min<unsigned long long, __1::A>\00min<unsigned long long, __1::A>\00_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_\007\000\001\000\006\00256\001\008", metadata !1, metadata !25, metadata !42, null, null, metadata !53, null, metadata !55} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>]
-!42 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !43, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!43 = metadata !{metadata !29, metadata !29, metadata !32, metadata !44}
-!44 = metadata !{metadata !"0x13\00A\000\008\008\000\000\000", metadata !1, metadata !25, null, metadata !45, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ]
-!45 = metadata !{metadata !46}
-!46 = metadata !{metadata !"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\000\000\006\00256\001\001", metadata !1, metadata !44, metadata !47, null, null, null, i32 0, metadata !52} ; [ DW_TAG_subprogram ] [line 1] [operator()]
-!47 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !48, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!48 = metadata !{metadata !13, metadata !49, metadata !50, metadata !50}
-!49 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
-!50 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!51 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
-!52 = metadata !{i32 786468}
-!53 = metadata !{metadata !34, metadata !54}
-!54 = metadata !{metadata !"0x2f\00_Compare\000\000", null, metadata !44, null} ; [ DW_TAG_template_type_parameter ]
-!55 = metadata !{metadata !56, metadata !57, metadata !58}
-!56 = metadata !{metadata !"0x101\00p1\0016777223\000", metadata !41, metadata !10, metadata !29} ; [ DW_TAG_arg_variable ] [p1] [line 7]
-!57 = metadata !{metadata !"0x101\00p2\0033554439\000", metadata !41, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ] [p2] [line 7]
-!58 = metadata !{metadata !"0x101\00p3\0050331656\000", metadata !41, metadata !10, metadata !44} ; [ DW_TAG_arg_variable ] [p3] [line 8]
-!59 = metadata !{i32 13, i32 0, metadata !24, metadata !38}
-!63 = metadata !{i32 undef}
-!64 = metadata !{metadata !"0x101\00p1\0033554433\000", metadata !65, metadata !10, metadata !50, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1]
-!65 = metadata !{metadata !"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\001\000\006\00256\001\002", metadata !1, metadata !25, metadata !47, null, null, null, metadata !46, metadata !66} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()]
-!66 = metadata !{metadata !67, metadata !69, metadata !70}
-!67 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !65, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!68 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!69 = metadata !{metadata !"0x101\00p1\0033554433\000", metadata !65, metadata !10, metadata !50} ; [ DW_TAG_arg_variable ] [p1] [line 1]
-!70 = metadata !{metadata !"0x101\00\0050331650\000", metadata !65, metadata !10, metadata !50} ; [ DW_TAG_arg_variable ] [line 2]
-!71 = metadata !{i32 1, i32 0, metadata !65, metadata !40}
-!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 \001\00\000\00\001", !1, !2, !5, !8, !20, !5} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus]
+!1 = !{!"<unknown>", !"/Users/matt/ryan_bug"}
+!2 = !{!3}
+!3 = !{!"0x4\00\0020\0032\0032\000\000\000", !1, !4, null, !6, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
+!4 = !{!"0x13\00C\0019\008\008\000\000\000", !1, null, null, !5, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
+!5 = !{}
+!6 = !{!7}
+!7 = !{!"0x28\00max_frame_size\000"} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
+!8 = !{!9, !24, !41, !65}
+!9 = !{!"0x2e\00read_response_size\00read_response_size\00_Z18read_response_sizev\0027\000\001\000\006\00256\001\0027", !1, !10, !11, null, i32 ()* @_Z18read_response_sizev, null, null, !14} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
+!10 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!13}
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = !{!15, !19}
+!15 = !{!"0x100\00b\0028\000", !9, !10, !16} ; [ DW_TAG_auto_variable ] [b] [line 28]
+!16 = !{!"0x13\00B\0016\0032\0032\000\000\000", !1, null, null, !17, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ]
+!17 = !{!18}
+!18 = !{!"0xd\00end_of_file\0017\0032\0032\000\000", !1, !16, !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int]
+!19 = !{!"0x100\00c\0029\000", !9, !10, !13} ; [ DW_TAG_auto_variable ] [c] [line 29]
+!20 = !{}
+!21 = !{i32 2, !"Dwarf Version", i32 2}
+!22 = !{i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0)}
+!23 = !{!"0x101\00p2\0033554444\000", !24, !10, !32, !38} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!24 = !{!"0x2e\00min<unsigned long long>\00min<unsigned long long>\00_ZN3__13minIyEERKT_S3_RS1_\0012\000\001\000\006\00256\001\0012", !1, !25, !27, null, null, !33, null, !35} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>]
+!25 = !{!"0x39\00__1\001", !26, null} ; [ DW_TAG_namespace ] [__1] [line 1]
+!26 = !{!"main.cpp", !"/Users/matt/ryan_bug"}
+!27 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = !{!29, !29, !32}
+!29 = !{!"0x10\00\000\000\000\000\000", null, null, !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = !{!"0x26\00\000\000\000\000\000", null, null, !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!31 = !{!"0x24\00long long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!32 = !{!"0x10\00\000\000\000\000\000", null, null, !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!33 = !{!34}
+!34 = !{!"0x2f\00_Tp\000\000", null, !31, null} ; [ DW_TAG_template_type_parameter ]
+!35 = !{!36, !37}
+!36 = !{!"0x101\00p1\0016777228\000", !24, !10, !29} ; [ DW_TAG_arg_variable ] [p1] [line 12]
+!37 = !{!"0x101\00p2\0033554444\000", !24, !10, !32} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!38 = !MDLocation(line: 33, scope: !9)
+!39 = !MDLocation(line: 12, scope: !24, inlinedAt: !38)
+!40 = !MDLocation(line: 9, scope: !41, inlinedAt: !59)
+!41 = !{!"0x2e\00min<unsigned long long, __1::A>\00min<unsigned long long, __1::A>\00_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_\007\000\001\000\006\00256\001\008", !1, !25, !42, null, null, !53, null, !55} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>]
+!42 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !43, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!43 = !{!29, !29, !32, !44}
+!44 = !{!"0x13\00A\000\008\008\000\000\000", !1, !25, null, !45, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ]
+!45 = !{!46}
+!46 = !{!"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\000\000\006\00256\001\001", !1, !44, !47, null, null, null, i32 0, !52} ; [ DW_TAG_subprogram ] [line 1] [operator()]
+!47 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !48, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!48 = !{!13, !49, !50, !50}
+!49 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!50 = !{!"0x10\00\000\000\000\000\000", null, null, !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!51 = !{!"0x26\00\000\000\000\000\000", null, null, !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
+!52 = !{i32 786468}
+!53 = !{!34, !54}
+!54 = !{!"0x2f\00_Compare\000\000", null, !44, null} ; [ DW_TAG_template_type_parameter ]
+!55 = !{!56, !57, !58}
+!56 = !{!"0x101\00p1\0016777223\000", !41, !10, !29} ; [ DW_TAG_arg_variable ] [p1] [line 7]
+!57 = !{!"0x101\00p2\0033554439\000", !41, !10, !32} ; [ DW_TAG_arg_variable ] [p2] [line 7]
+!58 = !{!"0x101\00p3\0050331656\000", !41, !10, !44} ; [ DW_TAG_arg_variable ] [p3] [line 8]
+!59 = !MDLocation(line: 13, scope: !24, inlinedAt: !38)
+!63 = !{i32 undef}
+!64 = !{!"0x101\00p1\0033554433\000", !65, !10, !50, !40} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!65 = !{!"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\001\000\006\00256\001\002", !1, !25, !47, null, null, null, !46, !66} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()]
+!66 = !{!67, !69, !70}
+!67 = !{!"0x101\00this\0016777216\001088", !65, null, !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!68 = !{!"0xf\00\000\0064\0064\000\000", null, null, !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!69 = !{!"0x101\00p1\0033554433\000", !65, !10, !50} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!70 = !{!"0x101\00\0050331650\000", !65, !10, !50} ; [ DW_TAG_arg_variable ] [line 2]
+!71 = !MDLocation(line: 1, scope: !65, inlinedAt: !40)
+!72 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll
new file mode 100644
index 0000000..c5bf491
--- /dev/null
+++ b/test/CodeGen/X86/stack-protector-weight.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=true %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=SELDAG
+; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR
+
+; SELDAG: # Machine code for function test_branch_weights:
+; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](1048575) BB#[[FAILURE:[0-9]+]](1)
+; SELDAG: BB#[[FAILURE]]:
+; SELDAG: CALL64pcrel32 <es:__stack_chk_fail>
+; SELDAG: BB#[[SUCCESS]]:
+
+; IR: # Machine code for function test_branch_weights:
+; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](1048575) BB#[[FAILURE:[0-9]+]](1)
+; IR: BB#[[SUCCESS]]:
+; IR: BB#[[FAILURE]]:
+; IR: CALL64pcrel32 <ga:@__stack_chk_fail>
+
+define i32 @test_branch_weights(i32 %n) #0 {
+entry:
+  %a = alloca [128 x i32], align 16
+  %0 = bitcast [128 x i32]* %a to i8*
+  call void @llvm.lifetime.start(i64 512, i8* %0)
+  %arraydecay = getelementptr inbounds [128 x i32]* %a, i64 0, i64 0
+  call void @foo2(i32* %arraydecay)
+  %idxprom = sext i32 %n to i64
+  %arrayidx = getelementptr inbounds [128 x i32]* %a, i64 0, i64 %idxprom
+  %1 = load i32* %arrayidx, align 4
+  call void @llvm.lifetime.end(i64 512, i8* %0)
+  ret i32 %1
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo2(i32*)
+
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { ssp "stack-protector-buffer-size"="8" }
diff --git a/test/CodeGen/X86/stackpointer.ll b/test/CodeGen/X86/stackpointer.ll
index 80bcfbf..094856b 100644
--- a/test/CodeGen/X86/stackpointer.ll
+++ b/test/CodeGen/X86/stackpointer.ll
@@ -25,4 +25,4 @@ declare void @llvm.write_register.i64(metadata, i64) nounwind
 
 ; register unsigned long current_stack_pointer asm("rsp");
 ; CHECK-NOT: .asciz  "rsp"
-!0 = metadata !{metadata !"rsp\00"}
+!0 = !{!"rsp\00"}
diff --git a/test/CodeGen/X86/statepoint-call-lowering.ll b/test/CodeGen/X86/statepoint-call-lowering.ll
new file mode 100644
index 0000000..e1a1369
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-call-lowering.ll
@@ -0,0 +1,104 @@
+; RUN: llc < %s | FileCheck %s
+; This file contains a collection of basic tests to ensure we didn't
+; screw up normal call lowering when there are no deopt or gc arguments.
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare zeroext i1 @return_i1()
+declare zeroext i32 @return_i32()
+declare i32* @return_i32ptr()
+declare float @return_float()
+declare void @varargf(i32, ...)
+
+define i1 @test_i1_return() gc "statepoint-example" {
+; CHECK-LABEL: test_i1_return
+; This is just checking that a i1 gets lowered normally when there's no extra
+; state arguments to the statepoint
+; CHECK: pushq %rax
+; CHECK: callq return_i1
+; CHECK: popq %rdx
+; CHECK: retq
+entry:
+  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0)
+  %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(i32 %safepoint_token)
+  ret i1 %call1
+}
+
+define i32 @test_i32_return() gc "statepoint-example" {
+; CHECK-LABEL: test_i32_return
+; CHECK: pushq %rax
+; CHECK: callq return_i32
+; CHECK: popq %rdx
+; CHECK: retq
+entry:
+  %safepoint_token = tail call i32 (i32 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i32f(i32 ()* @return_i32, i32 0, i32 0, i32 0)
+  %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(i32 %safepoint_token)
+  ret i32 %call1
+}
+
+define i32* @test_i32ptr_return() gc "statepoint-example" {
+; CHECK-LABEL: test_i32ptr_return
+; CHECK: pushq %rax
+; CHECK: callq return_i32ptr
+; CHECK: popq %rdx
+; CHECK: retq
+entry:
+  %safepoint_token = tail call i32 (i32* ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_p0i32f(i32* ()* @return_i32ptr, i32 0, i32 0, i32 0)
+  %call1 = call i32* @llvm.experimental.gc.result.p0i32(i32 %safepoint_token)
+  ret i32* %call1
+}
+
+define float @test_float_return() gc "statepoint-example" {
+; CHECK-LABEL: test_float_return
+; CHECK: pushq %rax
+; CHECK: callq return_float
+; CHECK: popq %rax
+; CHECK: retq
+entry:
+  %safepoint_token = tail call i32 (float ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_f32f(float ()* @return_float, i32 0, i32 0, i32 0)
+  %call1 = call float @llvm.experimental.gc.result.f32(i32 %safepoint_token)
+  ret float %call1
+}
+
+define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" {
+; CHECK-LABEL: test_relocate
+; Check that an ununsed relocate has no code-generation impact
+; CHECK: pushq %rax
+; CHECK: callq return_i1
+; CHECK-NEXT: .Ltmp13:
+; CHECK-NEXT: popq %rdx
+; CHECK-NEXT: retq
+entry:
+  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a)
+  %call1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 4, i32 4)
+  %call2 = call zeroext i1 @llvm.experimental.gc.result.i1(i32 %safepoint_token)
+  ret i1 %call2
+}
+
+define void @test_void_vararg() gc "statepoint-example" {
+; CHECK-LABEL: test_void_vararg
+; Check a statepoint wrapping a *void* returning vararg function works
+; CHECK: callq varargf
+entry:
+  %safepoint_token = tail call i32 (void (i32, ...)*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidi32varargf(void (i32, ...)* @varargf, i32 2, i32 0, i32 42, i32 43, i32 0)
+  ;; if we try to use the result from a statepoint wrapping a
+  ;; non-void-returning varargf, we will experience a crash.
+  ret void
+}
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()*, i32, i32, ...)
+declare i1 @llvm.experimental.gc.result.i1(i32)
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_i32f(i32 ()*, i32, i32, ...)
+declare i32 @llvm.experimental.gc.result.i32(i32)
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_p0i32f(i32* ()*, i32, i32, ...)
+declare i32* @llvm.experimental.gc.result.p0i32(i32)
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_f32f(float ()*, i32, i32, ...)
+declare float @llvm.experimental.gc.result.f32(i32)
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidi32varargf(void (i32, ...)*, i32, i32, ...)
+
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32, i32, i32)
diff --git a/test/CodeGen/X86/statepoint-forward.ll b/test/CodeGen/X86/statepoint-forward.ll
new file mode 100644
index 0000000..12a6ac2
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-forward.ll
@@ -0,0 +1,107 @@
+; RUN: opt -O3 -S < %s | FileCheck --check-prefix=CHECK-OPT %s
+; RUN: llc < %s | FileCheck --check-prefix=CHECK-LLC %s
+; These tests are targetted at making sure we don't retain information
+; about memory which contains potential gc references across a statepoint.
+; They're carefully written to only outlaw forwarding of references. 
+; Depending on the collector, forwarding non-reference fields or
+; constant null references may be perfectly legal. (If unimplemented.)
+; The general structure of these tests is:
+; - learn a fact about memory (via an assume)
+; - cross a statepoint
+; - check the same fact about memory (which we no longer know)
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; If not at a statepoint, we could forward known memory values
+; across this call.
+declare void @func() readonly
+
+;; Forwarding the value of a pointer load is invalid since it may have
+;; changed at the safepoint.  Forwarding a non-gc pointer value would 
+;; be valid, but is not currently implemented.
+define i1 @test_load_forward(i32 addrspace(1)* addrspace(1)* %p) gc "statepoint-example" {
+entry:
+  %before = load i32 addrspace(1)* addrspace(1)* %p
+  %cmp1 = call i1 @f(i32 addrspace(1)* %before)
+  call void @llvm.assume(i1 %cmp1)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
+  %pnew = call i32 addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1i32(i32 %safepoint_token, i32 4, i32 4)
+  %after = load i32 addrspace(1)* addrspace(1)* %pnew
+  %cmp2 = call i1 @f(i32 addrspace(1)* %after)
+  ret i1 %cmp2
+
+; CHECK-OPT-LABEL: test_load_forward
+; CHECK-OPT: ret i1 %cmp2
+; CHECK-LLC-LABEL: test_load_forward
+; CHECK-LLC: callq f
+}
+
+;; Same as above, but forwarding from a store
+define i1 @test_store_forward(i32 addrspace(1)* addrspace(1)* %p,
+                              i32 addrspace(1)* %v) gc "statepoint-example" {
+entry:
+  %cmp1 = call i1 @f(i32 addrspace(1)* %v)
+  call void @llvm.assume(i1 %cmp1)
+  store i32 addrspace(1)* %v, i32 addrspace(1)* addrspace(1)* %p
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
+  %pnew = call i32 addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1i32(i32 %safepoint_token, i32 4, i32 4)
+  %after = load i32 addrspace(1)* addrspace(1)* %pnew
+  %cmp2 = call i1 @f(i32 addrspace(1)* %after)
+  ret i1 %cmp2
+
+; CHECK-OPT-LABEL: test_store_forward
+; CHECK-OPT: ret i1 %cmp2
+; CHECK-LLC-LABEL: test_store_forward
+; CHECK-LLC: callq f
+}
+
+; A predicate on the pointer which is not simply null, but whose value
+; would be known unchanged if the pointer value could be forwarded.
+; The implementation of such a function could inspect the integral value
+; of the pointer and is thus not safe to reuse after a statepoint.
+declare i1 @f(i32 addrspace(1)* %v) readnone
+
+; This is a variant of the test_load_forward test which is intended to 
+; highlight the fact that a gc pointer can be stored in part of the heap
+; that is not itself GC managed.  The GC may have an external mechanism
+; to know about and update that value at a safepoint.  Note that the 
+; statepoint does not provide the collector with this root.
+define i1 @test_load_forward_nongc_heap(i32 addrspace(1)** %p) gc "statepoint-example" {
+entry:
+  %before = load i32 addrspace(1)** %p
+  %cmp1 = call i1 @f(i32 addrspace(1)* %before)
+  call void @llvm.assume(i1 %cmp1)
+  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
+  %after = load i32 addrspace(1)** %p
+  %cmp2 = call i1 @f(i32 addrspace(1)* %after)
+  ret i1 %cmp2
+
+; CHECK-OPT-LABEL: test_load_forward_nongc_heap
+; CHECK-OPT: ret i1 %cmp2
+; CHECK-LLC-LABEL: test_load_forward_nongc_heap
+; CHECK-LLC: callq f
+}
+
+;; Same as above, but forwarding from a store
+define i1 @test_store_forward_nongc_heap(i32 addrspace(1)** %p,
+                                         i32 addrspace(1)* %v) gc "statepoint-example" {
+entry:
+  %cmp1 = call i1 @f(i32 addrspace(1)* %v)
+  call void @llvm.assume(i1 %cmp1)
+  store i32 addrspace(1)* %v, i32 addrspace(1)** %p
+  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
+  %after = load i32 addrspace(1)** %p
+  %cmp2 = call i1 @f(i32 addrspace(1)* %after)
+  ret i1 %cmp2
+
+; CHECK-OPT-LABEL: test_store_forward_nongc_heap
+; CHECK-OPT: ret i1 %cmp2
+; CHECK-LLC-LABEL: test_store_forward_nongc_heap
+; CHECK-LLC: callq f
+}
+
+declare void @llvm.assume(i1)
+declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()*, i32, i32, ...)
+declare i32 addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1i32(i32, i32, i32) #3
+
diff --git a/test/CodeGen/X86/statepoint-stack-usage.ll b/test/CodeGen/X86/statepoint-stack-usage.ll
new file mode 100644
index 0000000..3ecef33
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-stack-usage.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; This test is checking to make sure that we reuse the same stack slots
+; for GC values spilled over two different call sites.  Since the order
+; of GC arguments differ, niave lowering code would insert loads and 
+; stores to rearrange items on the stack.  We need to make sure (for
+; performance) that this doesn't happen.
+define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 {
+; CHECK-LABEL: back_to_back_calls
+; The exact stores don't matter, but there need to be three stack slots created
+; CHECK: movq	%rdx, 16(%rsp)
+; CHECK: movq	%rdi, 8(%rsp)
+; CHECK: movq	%rsi, (%rsp)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
+  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 9)
+  %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 10)
+  %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 11)
+; CHECK: callq
+; This is the key check.  There should NOT be any memory moves here
+; CHECK-NOT: movq
+  %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
+  %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 11)
+  %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 10)
+  %c2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 9)
+; CHECK: callq
+  ret i32 1
+}
+
+; This test simply checks that minor changes in vm state don't prevent slots
+; being reused for gc values.  
+define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 {
+; CHECK-LABEL: reserve_first
+; The exact stores don't matter, but there need to be three stack slots created
+; CHECK: movq	%rdx, 16(%rsp)
+; CHECK: movq	%rdi, 8(%rsp)
+; CHECK: movq	%rsi, (%rsp)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
+  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 9)
+  %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 10)
+  %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 11)
+; CHECK: callq
+; This is the key check.  There should NOT be any memory moves here
+; CHECK-NOT: movq
+  %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 addrspace(1)* %a1, i32 0, i32 addrspace(1)* %c1, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
+  %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 11)
+  %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 10)
+  %c2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 9)
+; CHECK: callq
+  ret i32 1
+}
+
+; Function Attrs: nounwind
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32, i32, i32) #3
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()*, i32, i32, ...)
+
+attributes #1 = { uwtable }
diff --git a/test/CodeGen/X86/statepoint-stackmap-format.ll b/test/CodeGen/X86/statepoint-stackmap-format.ll
new file mode 100644
index 0000000..e452a63
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-stackmap-format.ll
@@ -0,0 +1,109 @@
+; RUN: llc < %s | FileCheck %s
+; This test is a sanity check to ensure statepoints are generating StackMap
+; sections correctly.  This is not intended to be a rigorous test of the 
+; StackMap format (see the stackmap tests for that).
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare zeroext i1 @return_i1()
+
+define i1 @test(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test
+; Do we see one spill for the local value and the store to the
+; alloca?
+; CHECK: subq	$24, %rsp
+; CHECK: movq	$0, 8(%rsp)
+; CHECK: movq	%rdi, (%rsp)
+; CHECK: callq return_i1
+; CHECK: addq	$24, %rsp
+; CHECK: retq
+entry:
+  %metadata1 = alloca i32 addrspace(1)*, i32 2, align 8
+  store i32 addrspace(1)* null, i32 addrspace(1)** %metadata1
+  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 addrspace(1)* null, i32 addrspace(1)* %ptr, i32 addrspace(1)* null)
+  %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(i32 %safepoint_token)
+  %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 6, i32 6)
+  %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 7, i32 7)
+; 
+  ret i1 %call1
+}
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()*, i32, i32, ...)
+declare i1 @llvm.experimental.gc.result.i1(i32)
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32, i32, i32) #3
+
+
+; CHECK-LABEL: .section .llvm_stackmaps
+; CHECK-NEXT:  __LLVM_StackMaps:
+; Header
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .short 0
+; Num Functions
+; CHECK-NEXT:   .long 1
+; Num LargeConstants
+; CHECK-NEXT:   .long 0
+; Num Callsites
+; CHECK-NEXT:   .long 1
+
+; Functions and stack size
+; CHECK-NEXT:   .quad test
+; CHECK-NEXT:   .quad 24
+
+; Large Constants
+; Statepoint ID only
+; CHECK: .quad	2882400000
+
+; Callsites
+; Constant arguments
+; CHECK: .long	.Ltmp1-test
+; CHECK: .short	0
+; CHECK: .short	8
+; SmallConstant (0)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	0
+; SmallConstant (2)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	2
+; Direct Spill Slot [RSP+0]
+; CHECK: .byte	2
+; CHECK: .byte	8
+; CHECK: .short	7
+; CHECK: .long	0
+; SmallConstant  (0)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	0
+; SmallConstant  (0)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	0
+; SmallConstant  (0)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	0
+; Direct Spill Slot [RSP+0]
+; CHECK: .byte	2
+; CHECK: .byte	8
+; CHECK: .short	7
+; CHECK: .long	0
+; Direct Spill Slot [RSP+0]
+; CHECK: .byte	2
+; CHECK: .byte	8
+; CHECK: .short	7
+; CHECK: .long	0
+
+; No Padding or LiveOuts
+; CHECK: .short	0
+; CHECK: .short	0
+; CHECK: .align	8
+
+
diff --git a/test/CodeGen/X86/switch-bt.ll b/test/CodeGen/X86/switch-bt.ll
index a80002b..065d8cd 100644
--- a/test/CodeGen/X86/switch-bt.ll
+++ b/test/CodeGen/X86/switch-bt.ll
@@ -99,3 +99,61 @@ if.then:
 if.end:
   ret void
 }
+
+; Ensure that optimizing for jump tables doesn't needlessly deteriorate the
+; created binary tree search. See PR22262.
+define void @test4(i32 %x, i32* %y) {
+; CHECK-LABEL: test4:
+
+entry:
+  switch i32 %x, label %sw.default [
+    i32 10, label %sw.bb
+    i32 20, label %sw.bb1
+    i32 30, label %sw.bb2
+    i32 40, label %sw.bb3
+    i32 50, label %sw.bb4
+    i32 60, label %sw.bb5
+  ]
+sw.bb:
+  store i32 1, i32* %y
+  br label %sw.epilog
+sw.bb1:
+  store i32 2, i32* %y
+  br label %sw.epilog
+sw.bb2:
+  store i32 3, i32* %y
+  br label %sw.epilog
+sw.bb3:
+  store i32 4, i32* %y
+  br label %sw.epilog
+sw.bb4:
+  store i32 5, i32* %y
+  br label %sw.epilog
+sw.bb5:
+  store i32 6, i32* %y
+  br label %sw.epilog
+sw.default:
+  store i32 7, i32* %y
+  br label %sw.epilog
+sw.epilog:
+  ret void
+
+; The balanced binary switch here would start with a comparison against 39, but
+; it is currently starting with 29 because of the density-sum heuristic.
+; CHECK: cmpl $29
+; CHECK: jg
+; CHECK: cmpl $10
+; CHECK: jne
+; CHECK: cmpl $49
+; CHECK: jg
+; CHECK: cmpl $30
+; CHECK: jne
+; CHECK: cmpl $20
+; CHECK: jne
+; CHECK: cmpl $50
+; CHECK: jne
+; CHECK: cmpl $40
+; CHECK: jne
+; CHECK: cmpl $60
+; CHECK: jne
+}
diff --git a/test/CodeGen/X86/switch-default-only.ll b/test/CodeGen/X86/switch-default-only.ll
new file mode 100644
index 0000000..360ace5
--- /dev/null
+++ b/test/CodeGen/X86/switch-default-only.ll
@@ -0,0 +1,14 @@
+; RUN: llc -O0 -fast-isel=false -march=x86 < %s | FileCheck %s
+
+; No need for branching when the default and only destination follows
+; immediately after the switch.
+; CHECK-LABEL: no_branch:
+; CHECK-NOT: jmp
+; CHECK: ret
+
+define void @no_branch(i32 %x) {
+entry:
+  switch i32 %x, label %exit [ ]
+exit:
+  ret void
+}
diff --git a/test/CodeGen/X86/switch-jump-table.ll b/test/CodeGen/X86/switch-jump-table.ll
new file mode 100644
index 0000000..a84fb4a
--- /dev/null
+++ b/test/CodeGen/X86/switch-jump-table.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s
+
+
+; An unreachable default destination is replaced with the most popular case label.
+
+define void @sum2(i32 %x, i32* %to) {
+; CHECK-LABEL: sum2:
+; CHECK: movl 4(%esp), [[REG:%e[a-z]{2}]]
+; CHECK: cmpl $3, [[REG]]
+; CHECK: jbe .LBB0_1
+; CHECK: movl $4
+; CHECK: retl
+; CHECK-LABEL: .LBB0_1:
+; CHECK-NEXT: jmpl *.LJTI0_0(,[[REG]],4)
+
+entry:
+  switch i32 %x, label %default [
+    i32 0, label %bb0
+    i32 1, label %bb1
+    i32 2, label %bb2
+    i32 3, label %bb3
+    i32 4, label %bb4
+    i32 5, label %bb4
+  ]
+bb0:
+  store i32 0, i32* %to
+  br label %exit
+bb1:
+  store i32 1, i32* %to
+  br label %exit
+bb2:
+  store i32 2, i32* %to
+  br label %exit
+bb3:
+  store i32 3, i32* %to
+  br label %exit
+bb4:
+  store i32 4, i32* %to
+  br label %exit
+exit:
+  ret void
+default:
+  unreachable
+
+; The jump table has four entries.
+; CHECK-LABEL: .LJTI0_0:
+; CHECK-NEXT: .long  .LBB0_2
+; CHECK-NEXT: .long  .LBB0_3
+; CHECK-NEXT: .long  .LBB0_4
+; CHECK-NEXT: .long  .LBB0_5
+; CHECK-NOT: .long
+}
diff --git a/test/CodeGen/X86/tail-call-win64.ll b/test/CodeGen/X86/tail-call-win64.ll
new file mode 100644
index 0000000..23e9280
--- /dev/null
+++ b/test/CodeGen/X86/tail-call-win64.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=x86_64-windows -show-mc-encoding < %s | FileCheck %s
+
+; The Win64 ABI wants tail jmps to use a REX_W prefix so it can distinguish
+; in-function jumps from function exiting jumps.
+
+define void @tail_jmp_reg(i32, i32, void ()* %fptr) {
+  tail call void ()* %fptr()
+  ret void
+}
+
+; Check that we merge the REX prefixes into 0x49 instead of 0x48, 0x41.
+
+; CHECK-LABEL: tail_jmp_reg:
+; CHECK: rex64 jmpq *%r8
+; CHECK: 	encoding: [0x49,0xff,0xe0]
+
+declare void @tail_tgt()
+
+define void @tail_jmp_imm() {
+  tail call void @tail_tgt()
+  ret void
+}
+
+; CHECK-LABEL: tail_jmp_imm:
+; CHECK: rex64 jmp tail_tgt
+
+@g_fptr = global void ()* @tail_tgt
+
+define void @tail_jmp_mem() {
+  %fptr = load void ()** @g_fptr
+  tail call void ()* %fptr()
+  ret void
+}
+
+; CHECK-LABEL: tail_jmp_mem:
+; CHECK: rex64 jmpq *g_fptr(%rip)
diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll
index deab1dc..25d3802 100644
--- a/test/CodeGen/X86/tailcall-64.ll
+++ b/test/CodeGen/X86/tailcall-64.ll
@@ -182,7 +182,7 @@ define { i64, i64 } @crash(i8* %this) {
 ; Check that we can fold an indexed load into a tail call instruction.
 ; CHECK: fold_indexed_load
 ; CHECK: leaq (%rsi,%rsi,4), %[[RAX:r..]]
-; CHECK: jmpq *16(%{{r..}},%[[RAX]],8)  # TAILCALL
+; CHECK: jmpq *16(%{{r..}},%[[RAX]],8)  ## TAILCALL
 %struct.funcs = type { i32 (i8*, i32*, i32)*, i32 (i8*)*, i32 (i8*)*, i32 (i8*, i32)*, i32 }
 @func_table = external global [0 x %struct.funcs]
 define void @fold_indexed_load(i8* %mbstr, i64 %idxprom) nounwind uwtable ssp {
@@ -207,7 +207,7 @@ entry:
 ; }
 ;
 ; CHECK-LABEL: rdar12282281
-; CHECK: jmpq *%r11 # TAILCALL
+; CHECK: jmpq *%r11 ## TAILCALL
 @funcs = external constant [0 x i32 (i8*, ...)*]
 
 define i32 @rdar12282281(i32 %n) nounwind uwtable ssp {
diff --git a/test/CodeGen/X86/tailcall-returndup-void.ll b/test/CodeGen/X86/tailcall-returndup-void.ll
index c1d6312..2c39cb4 100644
--- a/test/CodeGen/X86/tailcall-returndup-void.ll
+++ b/test/CodeGen/X86/tailcall-returndup-void.ll
@@ -3,9 +3,9 @@
 ; CHECK-NOT: ret
 
 @sES_closure = external global [0 x i64]
-declare cc10 void @sEH_info(i64* noalias nocapture, i64* noalias nocapture, i64* noalias nocapture, i64, i64, i64) align 8
+declare ghccc void @sEH_info(i64* noalias nocapture, i64* noalias nocapture, i64* noalias nocapture, i64, i64, i64) align 8
 
-define cc10 void @rBM_info(i64* noalias nocapture %Base_Arg, i64* noalias nocapture %Sp_Arg, i64* noalias nocapture %Hp_Arg, i64 %R1_Arg, i64 %R2_Arg, i64 %R3_Arg) nounwind align 8 {
+define ghccc void @rBM_info(i64* noalias nocapture %Base_Arg, i64* noalias nocapture %Sp_Arg, i64* noalias nocapture %Hp_Arg, i64 %R1_Arg, i64 %R2_Arg, i64 %R3_Arg) nounwind align 8 {
 c263:
   %ln265 = getelementptr inbounds i64* %Sp_Arg, i64 -2
   %ln266 = ptrtoint i64* %ln265 to i64
@@ -18,11 +18,11 @@ n26p:                                             ; preds = %c263
 n1ZQ.i:                                           ; preds = %n26p
   %ln1ZT.i = load i64* getelementptr inbounds ([0 x i64]* @sES_closure, i64 0, i64 0), align 8
   %ln1ZU.i = inttoptr i64 %ln1ZT.i to void (i64*, i64*, i64*, i64, i64, i64)*
-  tail call cc10 void %ln1ZU.i(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 %R3_Arg) nounwind
+  tail call ghccc void %ln1ZU.i(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 %R3_Arg) nounwind
   br label %rBL_info.exit
 
 c1ZP.i:                                           ; preds = %n26p
-  tail call cc10 void @sEH_info(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 %R3_Arg) nounwind
+  tail call ghccc void @sEH_info(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 %R3_Arg) nounwind
   br label %rBL_info.exit
 
 rBL_info.exit:                                    ; preds = %c1ZP.i, %n1ZQ.i
@@ -32,6 +32,6 @@ c26a:                                             ; preds = %c263
   %ln27h = getelementptr inbounds i64* %Base_Arg, i64 -2
   %ln27j = load i64* %ln27h, align 8
   %ln27k = inttoptr i64 %ln27j to void (i64*, i64*, i64*, i64, i64, i64)*
-  tail call cc10 void %ln27k(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 %R1_Arg, i64 %R2_Arg, i64 %R3_Arg) nounwind
+  tail call ghccc void %ln27k(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 %R1_Arg, i64 %R2_Arg, i64 %R3_Arg) nounwind
   ret void
 }
diff --git a/test/CodeGen/X86/tls-models.ll b/test/CodeGen/X86/tls-models.ll
index 8e3e958..0fd7853 100644
--- a/test/CodeGen/X86/tls-models.ll
+++ b/test/CodeGen/X86/tls-models.ll
@@ -128,6 +128,14 @@ entry:
   ; DARWIN:  _internal_ie@TLVP
 }
 
+define i32 @PR22083() {
+entry:
+  ret i32 ptrtoint (i32* @external_ie to i32)
+  ; X64-LABEL:     PR22083:
+  ; X64:     movq    external_ie@GOTTPOFF(%rip), %rax
+  ; X64_PIC-LABEL: PR22083:
+  ; X64_PIC: movq    external_ie@GOTTPOFF(%rip), %rax
+}
 
 ; ----- localexec specified -----
 
diff --git a/test/CodeGen/X86/trap.ll b/test/CodeGen/X86/trap.ll
index 149c667..ca33f9e 100644
--- a/test/CodeGen/X86/trap.ll
+++ b/test/CodeGen/X86/trap.ll
@@ -1,15 +1,25 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin8 -mcpu=yonah | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=i686-unknown-linux -mcpu=yonah | FileCheck %s -check-prefix=LINUX
+; RUN: llc < %s -mtriple=x86_64-scei-ps4 | FileCheck %s -check-prefix=PS4
 
-; CHECK-LABEL: test0:
-; CHECK: ud2
+; DARWIN-LABEL: test0:
+; DARWIN: ud2
+; LINUX-LABEL: test0:
+; LINUX: ud2
+; PS4-LABEL: test0:
+; PS4: ud2
 define i32 @test0() noreturn nounwind  {
 entry:
 	tail call void @llvm.trap( )
 	unreachable
 }
 
-; CHECK-LABEL: test1:
-; CHECK: int3
+; DARWIN-LABEL: test1:
+; DARWIN: int3
+; LINUX-LABEL: test1:
+; LINUX: int3
+; PS4-LABEL: test1:
+; PS4: int     $65
 define i32 @test1() noreturn nounwind  {
 entry:
 	tail call void @llvm.debugtrap( )
diff --git a/test/CodeGen/X86/uint_to_fp-2.ll b/test/CodeGen/X86/uint_to_fp-2.ll
index e47f154..4b594f7 100644
--- a/test/CodeGen/X86/uint_to_fp-2.ll
+++ b/test/CodeGen/X86/uint_to_fp-2.ll
@@ -7,7 +7,7 @@ define float @test1(i32 %x) nounwind readnone {
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    movsd .LCPI0_0, %xmm0
 ; CHECK-NEXT:    movd {{[0-9]+}}(%esp), %xmm1
-; CHECK-NEXT:    orps %xmm0, %xmm1
+; CHECK-NEXT:    orpd %xmm0, %xmm1
 ; CHECK-NEXT:    subsd %xmm0, %xmm1
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    cvtsd2ss %xmm1, %xmm0
diff --git a/test/CodeGen/X86/unaligned-32-byte-memops.ll b/test/CodeGen/X86/unaligned-32-byte-memops.ll
new file mode 100644
index 0000000..9cec17d
--- /dev/null
+++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -0,0 +1,288 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
+; because that is slower than two 16-byte loads. 
+; Other AVX-capable chips don't have that problem.
+
+define <8 x float> @load32bytes(<8 x float>* %Ap) {
+  ; CHECK-LABEL: load32bytes
+
+  ; SANDYB: vmovaps
+  ; SANDYB: vinsertf128
+  ; SANDYB: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL: retq
+
+  %A = load <8 x float>* %Ap, align 16
+  ret <8 x float> %A
+}
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
+; because that is slowerthan two 16-byte stores. 
+; Other AVX-capable chips don't have that problem.
+
+define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
+  ; CHECK-LABEL: store32bytes
+
+  ; SANDYB: vextractf128
+  ; SANDYB: vmovaps
+  ; SANDYB: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL: retq
+
+  store <8 x float> %A, <8 x float>* %P, align 16
+  ret void
+}
+
+; Merge two consecutive 16-byte subvector loads into a single 32-byte load
+; if it's faster.
+
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
+
+; Use the vinsertf128 intrinsic to model source code 
+; that explicitly uses AVX intrinsics.
+define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %v1 = load <4 x float>* %ptr1, align 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
+  ret <8 x float> %v3
+}
+
+; Swap the operands of the shufflevector and vinsertf128 to ensure that the
+; pattern still matches.
+define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_swap
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %v1 = load <4 x float>* %ptr1, align 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
+  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
+  ret <8 x float> %v3
+}
+
+; Replace the vinsertf128 intrinsic with a shufflevector as might be
+; expected from auto-vectorized code.
+define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %v1 = load <4 x float>* %ptr1, align 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %v3
+}
+
+; Swap the order of the shufflevector operands to ensure that the
+; pattern still matches.
+define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 5
+  %v1 = load <4 x float>* %ptr1, align 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x float> %v3
+}
+
+; Check each element type other than float to make sure it is handled correctly.
+; Use the loaded values with an 'add' to make sure we're using the correct load type.
+; Even though BtVer2 has fast 32-byte loads, we should not generate those for
+; 256-bit integer vectors because BtVer2 doesn't have AVX2.
+
+define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
+  ; CHECK-LABEL: combine_16_byte_loads_i64
+
+  ; SANDYB: vextractf128
+  ; SANDYB-NEXT: vpaddq
+  ; SANDYB-NEXT: vpaddq
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vextractf128
+  ; BTVER2-NEXT: vpaddq
+  ; BTVER2-NEXT: vpaddq
+  ; BTVER2-NEXT: vinsertf128
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddq
+  ; HASWELL-NEXT: retq
+
+  %ptr1 = getelementptr inbounds <2 x i64>* %ptr, i64 5
+  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 6
+  %v1 = load <2 x i64>* %ptr1, align 1
+  %v2 = load <2 x i64>* %ptr2, align 1
+  %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v4 = add <4 x i64> %v3, %x
+  ret <4 x i64> %v4
+}
+
+define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
+  ; CHECK-LABEL: combine_16_byte_loads_i32
+
+  ; SANDYB: vextractf128
+  ; SANDYB-NEXT: vpaddd
+  ; SANDYB-NEXT: vpaddd
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vextractf128
+  ; BTVER2-NEXT: vpaddd
+  ; BTVER2-NEXT: vpaddd
+  ; BTVER2-NEXT: vinsertf128
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddd
+  ; HASWELL-NEXT: retq
+
+  %ptr1 = getelementptr inbounds <4 x i32>* %ptr, i64 6
+  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 7
+  %v1 = load <4 x i32>* %ptr1, align 1
+  %v2 = load <4 x i32>* %ptr2, align 1
+  %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v4 = add <8 x i32> %v3, %x
+  ret <8 x i32> %v4
+}
+
+define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
+  ; CHECK-LABEL: combine_16_byte_loads_i16
+
+  ; SANDYB: vextractf128
+  ; SANDYB-NEXT: vpaddw
+  ; SANDYB-NEXT: vpaddw
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vextractf128
+  ; BTVER2-NEXT: vpaddw
+  ; BTVER2-NEXT: vpaddw
+  ; BTVER2-NEXT: vinsertf128
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddw
+  ; HASWELL-NEXT: retq
+
+  %ptr1 = getelementptr inbounds <8 x i16>* %ptr, i64 7
+  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 8
+  %v1 = load <8 x i16>* %ptr1, align 1
+  %v2 = load <8 x i16>* %ptr2, align 1
+  %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v4 = add <16 x i16> %v3, %x
+  ret <16 x i16> %v4
+}
+
+define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
+  ; CHECK-LABEL: combine_16_byte_loads_i8
+
+  ; SANDYB: vextractf128
+  ; SANDYB-NEXT: vpaddb
+  ; SANDYB-NEXT: vpaddb
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vextractf128
+  ; BTVER2-NEXT: vpaddb
+  ; BTVER2-NEXT: vpaddb
+  ; BTVER2-NEXT: vinsertf128
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddb
+  ; HASWELL-NEXT: retq
+
+  %ptr1 = getelementptr inbounds <16 x i8>* %ptr, i64 8
+  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 9
+  %v1 = load <16 x i8>* %ptr1, align 1
+  %v2 = load <16 x i8>* %ptr2, align 1
+  %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %v4 = add <32 x i8> %v3, %x
+  ret <32 x i8> %v4
+}
+
+define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
+  ; CHECK-LABEL: combine_16_byte_loads_double
+
+  ; SANDYB: vmovupd
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: vaddpd
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2-NOT: vinsertf128
+  ; BTVER2: vaddpd
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL-NOT: vinsertf128
+  ; HASWELL: vaddpd
+  ; HASWELL-NEXT: retq
+
+  %ptr1 = getelementptr inbounds <2 x double>* %ptr, i64 9
+  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 10
+  %v1 = load <2 x double>* %ptr1, align 1
+  %v2 = load <2 x double>* %ptr2, align 1
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v4 = fadd <4 x double> %v3, %x
+  ret <4 x double> %v4
+}
+
diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll
index ca9ea4a..140121b 100644
--- a/test/CodeGen/X86/unknown-location.ll
+++ b/test/CodeGen/X86/unknown-location.ll
@@ -21,16 +21,16 @@ entry:
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{metadata !"0x101\00x\001\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\001\000\001\000\006\000\000\001", metadata !10, metadata !2, metadata !4, null, i32 (i32, i32, i32, i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\0012\00producer\000\00\000\00\000", metadata !10, metadata !11, metadata !11, metadata !9, null, null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !10, metadata !2} ; [ DW_TAG_base_type ]
-!7 = metadata !{metadata !"0xb\001\0030\000", metadata !2, metadata !1} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{i32 4, i32 3, metadata !7, null}
-!9 = metadata !{metadata !1}
-!10 = metadata !{metadata !"test.c", metadata !"/dir"}
-!11 = metadata !{i32 0}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00x\001\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00foo\00foo\00foo\001\000\001\000\006\000\000\001", !10, !2, !4, null, i32 (i32, i32, i32, i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !10} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\0012\00producer\000\00\000\00\000", !10, !11, !11, !9, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !10, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6}
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", !10, !2} ; [ DW_TAG_base_type ]
+!7 = !{!"0xb\001\0030\000", !2, !1} ; [ DW_TAG_lexical_block ]
+!8 = !MDLocation(line: 4, column: 3, scope: !7)
+!9 = !{!1}
+!10 = !{!"test.c", !"/dir"}
+!11 = !{i32 0}
+!12 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/utf16-cfstrings.ll b/test/CodeGen/X86/utf16-cfstrings.ll
index af76a33..c7ec3eb 100644
--- a/test/CodeGen/X86/utf16-cfstrings.ll
+++ b/test/CodeGen/X86/utf16-cfstrings.ll
@@ -29,7 +29,7 @@ declare void @NSLog(%0*, ...)
 
 !llvm.module.flags = !{!0, !1, !2, !3}
 
-!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
+!0 = !{i32 1, !"Objective-C Version", i32 2}
+!1 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
diff --git a/test/CodeGen/X86/v2f32.ll b/test/CodeGen/X86/v2f32.ll
index b9bd80f9..7beed52 100644
--- a/test/CodeGen/X86/v2f32.ll
+++ b/test/CodeGen/X86/v2f32.ll
@@ -5,8 +5,7 @@
 define void @test1(<2 x float> %Q, float *%P2) nounwind {
 ; X64-LABEL: test1:
 ; X64:       # BB#0:
-; X64-NEXT:    movaps %xmm0, %xmm1
-; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X64-NEXT:    addss %xmm0, %xmm1
 ; X64-NEXT:    movss %xmm1, (%rdi)
 ; X64-NEXT:    retq
@@ -14,8 +13,7 @@ define void @test1(<2 x float> %Q, float *%P2) nounwind {
 ; X32-LABEL: test1:
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movaps %xmm0, %xmm1
-; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X32-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X32-NEXT:    addss %xmm0, %xmm1
 ; X32-NEXT:    movss %xmm1, (%eax)
 ; X32-NEXT:    retl
diff --git a/test/CodeGen/X86/vaargs.ll b/test/CodeGen/X86/vaargs.ll
index ddeb7a3..43c895e 100644
--- a/test/CodeGen/X86/vaargs.ll
+++ b/test/CodeGen/X86/vaargs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7-avx %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=NO-FLAGS
+; RUN: llc -verify-machineinstrs -mcpu=corei7-avx %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=NO-FLAGS
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
 
diff --git a/test/CodeGen/X86/vec-loadsingles-alignment.ll b/test/CodeGen/X86/vec-loadsingles-alignment.ll
new file mode 100644
index 0000000..6aa2adb
--- /dev/null
+++ b/test/CodeGen/X86/vec-loadsingles-alignment.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s
+
+@e = global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8], align 16
+@d = global [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16
+
+; The global 'e' has 16 byte alignment, so make sure we don't generate an
+; aligned 32-byte load instruction when we combine the load+insert sequence.
+
+define i32 @subb() nounwind ssp {
+; CHECK-LABEL: subb:
+; CHECK:  vmovups e(%rip), %ymm
+entry:
+  %0 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 7), align 4
+  %1 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 6), align 8
+  %2 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 5), align 4
+  %3 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 4), align 16
+  %4 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 3), align 4
+  %5 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 2), align 8
+  %6 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 1), align 4
+  %7 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 0), align 16
+  %vecinit.i = insertelement <8 x i32> undef, i32 %7, i32 0
+  %vecinit1.i = insertelement <8 x i32> %vecinit.i, i32 %6, i32 1
+  %vecinit2.i = insertelement <8 x i32> %vecinit1.i, i32 %5, i32 2
+  %vecinit3.i = insertelement <8 x i32> %vecinit2.i, i32 %4, i32 3
+  %vecinit4.i = insertelement <8 x i32> %vecinit3.i, i32 %3, i32 4
+  %vecinit5.i = insertelement <8 x i32> %vecinit4.i, i32 %2, i32 5
+  %vecinit6.i = insertelement <8 x i32> %vecinit5.i, i32 %1, i32 6
+  %vecinit7.i = insertelement <8 x i32> %vecinit6.i, i32 %0, i32 7
+  %8 = bitcast <8 x i32> %vecinit7.i to <32 x i8>
+  tail call void @llvm.x86.avx.storeu.dq.256(i8* bitcast ([8 x i32]* @d to i8*), <32 x i8> %8)
+  ret i32 0
+}
+
+declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index 8600c48..07cd195 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -5,7 +5,7 @@ define <8 x float> @foo1_8(<8 x i8> %src) {
 ; CHECK-LABEL: foo1_8:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
-; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm0
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; CHECK-NEXT:    vpslld $24, %xmm0, %xmm0
 ; CHECK-NEXT:    vpsrad $24, %xmm0, %xmm0
 ; CHECK-NEXT:    vpslld $24, %xmm1, %xmm1
@@ -16,7 +16,7 @@ define <8 x float> @foo1_8(<8 x i8> %src) {
 ;
 ; CHECK-WIDE-LABEL: foo1_8:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vpmovzxbd %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; CHECK-WIDE-NEXT:    vpslld $24, %xmm1, %xmm1
 ; CHECK-WIDE-NEXT:    vpsrad $24, %xmm1, %xmm1
 ; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -40,7 +40,7 @@ define <4 x float> @foo1_4(<4 x i8> %src) {
 ;
 ; CHECK-WIDE-LABEL: foo1_4:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vpmovzxbd %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; CHECK-WIDE-NEXT:    vpslld $24, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT:    vpsrad $24, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
@@ -52,7 +52,7 @@ define <4 x float> @foo1_4(<4 x i8> %src) {
 define <8 x float> @foo2_8(<8 x i8> %src) {
 ; CHECK-LABEL: foo2_8:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm1
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-NEXT:    vandps LCPI2_0, %ymm0, %ymm0
@@ -61,20 +61,9 @@ define <8 x float> @foo2_8(<8 x i8> %src) {
 ;
 ; CHECK-WIDE-LABEL: foo2_8:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-WIDE-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; CHECK-WIDE-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
-; CHECK-WIDE-NEXT:    vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; CHECK-WIDE-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
-; CHECK-WIDE-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3]
-; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; CHECK-WIDE-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
-; CHECK-WIDE-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-WIDE-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-WIDE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-WIDE-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-WIDE-NEXT:    retl
   %res = uitofp <8 x i8> %src to <8 x float>
@@ -90,7 +79,7 @@ define <4 x float> @foo2_4(<4 x i8> %src) {
 ;
 ; CHECK-WIDE-LABEL: foo2_4:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vpmovzxbd %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-WIDE-NEXT:    retl
   %res = uitofp <4 x i8> %src to <4 x float>
@@ -118,7 +107,7 @@ define <8 x i8> @foo3_8(<8 x float> %src) {
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
 ; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
 ; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
 ; CHECK-WIDE-NEXT:    shll $8, %eax
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
@@ -127,7 +116,7 @@ define <8 x i8> @foo3_8(<8 x float> %src) {
 ; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
 ; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
 ; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
 ; CHECK-WIDE-NEXT:    shll $8, %eax
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
@@ -163,7 +152,7 @@ define <4 x i8> @foo3_4(<4 x float> %src) {
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
 ; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
 ; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
 ; CHECK-WIDE-NEXT:    shll $8, %eax
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
diff --git a/test/CodeGen/X86/vec_clear.ll b/test/CodeGen/X86/vec_clear.ll
deleted file mode 100644
index 166d436..0000000
--- a/test/CodeGen/X86/vec_clear.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -o %t
-; RUN: not grep and %t
-; RUN: not grep psrldq %t
-; RUN: grep xorps %t
-
-define <4 x float> @test(<4 x float>* %v1) nounwind {
-        %tmp = load <4 x float>* %v1            ; <<4 x float>> [#uses=1]
-        %tmp15 = bitcast <4 x float> %tmp to <2 x i64>          ; <<2 x i64>> [#uses=1]
-        %tmp24 = and <2 x i64> %tmp15, bitcast (<4 x i32> < i32 0, i32 0, i32 -1, i32 -1 > to <2 x i64>)              ; <<2 x i64>> [#uses=1]
-        %tmp31 = bitcast <2 x i64> %tmp24 to <4 x float>                ; <<4 x float>> [#uses=1]
-        ret <4 x float> %tmp31
-}
-
diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll
index 365fe92..df3eae3 100644
--- a/test/CodeGen/X86/vec_compare.ll
+++ b/test/CodeGen/X86/vec_compare.ll
@@ -45,7 +45,7 @@ define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind {
 define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK-LABEL: test5:
 ; CHECK: pcmpeqd
-; CHECK: pshufd $-79
+; CHECK: pshufd $177
 ; CHECK: pand
 ; CHECK: ret
 	%C = icmp eq <2 x i64> %A, %B
@@ -56,7 +56,7 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) nounwind {
 define <2 x i64> @test6(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK-LABEL: test6:
 ; CHECK: pcmpeqd
-; CHECK: pshufd $-79
+; CHECK: pshufd $177
 ; CHECK: pand
 ; CHECK: pcmpeqd
 ; CHECK: pxor
@@ -77,11 +77,11 @@ define <2 x i64> @test7(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK: pxor [[CONSTREG]]
 ; CHECK: pxor [[CONSTREG]]
 ; CHECK: pcmpgtd %xmm1
-; CHECK: pshufd $-96
+; CHECK: pshufd $160
 ; CHECK: pcmpeqd
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: pand
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: por
 ; CHECK: ret
 	%C = icmp sgt <2 x i64> %A, %B
@@ -94,11 +94,11 @@ define <2 x i64> @test8(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK: pxor
 ; CHECK: pxor
 ; CHECK: pcmpgtd %xmm0
-; CHECK: pshufd $-96
+; CHECK: pshufd $160
 ; CHECK: pcmpeqd
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: pand
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: por
 ; CHECK: ret
 	%C = icmp slt <2 x i64> %A, %B
@@ -111,11 +111,11 @@ define <2 x i64> @test9(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK: pxor
 ; CHECK: pxor
 ; CHECK: pcmpgtd %xmm0
-; CHECK: pshufd $-96
+; CHECK: pshufd $160
 ; CHECK: pcmpeqd
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: pand
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: por
 ; CHECK: pcmpeqd
 ; CHECK: pxor
@@ -130,11 +130,11 @@ define <2 x i64> @test10(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK: pxor
 ; CHECK: pxor
 ; CHECK: pcmpgtd %xmm1
-; CHECK: pshufd $-96
+; CHECK: pshufd $160
 ; CHECK: pcmpeqd
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: pand
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: por
 ; CHECK: pcmpeqd
 ; CHECK: pxor
@@ -155,11 +155,11 @@ define <2 x i64> @test11(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK: pxor [[CONSTREG]]
 ; CHECK: pxor [[CONSTREG]]
 ; CHECK: pcmpgtd %xmm1
-; CHECK: pshufd $-96
+; CHECK: pshufd $160
 ; CHECK: pcmpeqd
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: pand
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: por
 ; CHECK: ret
 	%C = icmp ugt <2 x i64> %A, %B
@@ -172,11 +172,11 @@ define <2 x i64> @test12(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK: pxor
 ; CHECK: pxor
 ; CHECK: pcmpgtd %xmm0
-; CHECK: pshufd $-96
+; CHECK: pshufd $160
 ; CHECK: pcmpeqd
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: pand
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: por
 ; CHECK: ret
 	%C = icmp ult <2 x i64> %A, %B
@@ -189,11 +189,11 @@ define <2 x i64> @test13(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK: pxor
 ; CHECK: pxor
 ; CHECK: pcmpgtd %xmm0
-; CHECK: pshufd $-96
+; CHECK: pshufd $160
 ; CHECK: pcmpeqd
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: pand
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: por
 ; CHECK: pcmpeqd
 ; CHECK: pxor
@@ -208,11 +208,11 @@ define <2 x i64> @test14(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK: pxor
 ; CHECK: pxor
 ; CHECK: pcmpgtd %xmm1
-; CHECK: pshufd $-96
+; CHECK: pshufd $160
 ; CHECK: pcmpeqd
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: pand
-; CHECK: pshufd $-11
+; CHECK: pshufd $245
 ; CHECK: por
 ; CHECK: pcmpeqd
 ; CHECK: pxor
diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll
new file mode 100644
index 0000000..fbb8417
--- /dev/null
+++ b/test/CodeGen/X86/vec_extract-avx.ll
@@ -0,0 +1,82 @@
+target triple = "x86_64-unknown-unknown"
+
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
+
+; When extracting multiple consecutive elements from a larger
+; vector into a smaller one, do it efficiently. We should use
+; an EXTRACT_SUBVECTOR node internally rather than a bunch of
+; single element extractions. 
+
+; Extracting the low elements only requires using the right kind of store.
+define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
+  %ext0 = extractelement <8 x float> %v, i32 0
+  %ext1 = extractelement <8 x float> %v, i32 1
+  %ext2 = extractelement <8 x float> %v, i32 2
+  %ext3 = extractelement <8 x float> %v, i32 3
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+  store <4 x float> %ins3, <4 x float>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: low_v8f32_to_v4f32
+; CHECK: vmovaps
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Extracting the high elements requires just one AVX instruction. 
+define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
+  %ext0 = extractelement <8 x float> %v, i32 4
+  %ext1 = extractelement <8 x float> %v, i32 5
+  %ext2 = extractelement <8 x float> %v, i32 6
+  %ext3 = extractelement <8 x float> %v, i32 7
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+  store <4 x float> %ins3, <4 x float>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v8f32_to_v4f32
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Make sure element type doesn't alter the codegen. Note that
+; if we were actually using the vector in this function and
+; have AVX2, we should generate vextracti128 (the int version).
+define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
+  %ext0 = extractelement <8 x i32> %v, i32 4
+  %ext1 = extractelement <8 x i32> %v, i32 5
+  %ext2 = extractelement <8 x i32> %v, i32 6
+  %ext3 = extractelement <8 x i32> %v, i32 7
+  %ins0 = insertelement <4 x i32> undef, i32 %ext0, i32 0
+  %ins1 = insertelement <4 x i32> %ins0, i32 %ext1, i32 1
+  %ins2 = insertelement <4 x i32> %ins1, i32 %ext2, i32 2
+  %ins3 = insertelement <4 x i32> %ins2, i32 %ext3, i32 3
+  store <4 x i32> %ins3, <4 x i32>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v8i32_to_v4i32
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Make sure that element size doesn't alter the codegen.
+define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
+  %ext0 = extractelement <4 x double> %v, i32 2
+  %ext1 = extractelement <4 x double> %v, i32 3
+  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+  store <2 x double> %ins1, <2 x double>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v4f64_to_v2f64
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
diff --git a/test/CodeGen/X86/vec_extract-mmx.ll b/test/CodeGen/X86/vec_extract-mmx.ll
new file mode 100644
index 0000000..c6c93a1
--- /dev/null
+++ b/test/CodeGen/X86/vec_extract-mmx.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s
+
+define i32 @test0(<1 x i64>* %v4) {
+; CHECK-LABEL: test0:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    pshufw $238, (%[[REG:[a-z]+]]), %mm0
+; CHECK-NEXT:    movd %mm0, %eax
+; CHECK-NEXT:    addl $32, %eax
+; CHECK-NEXT:    retq
+entry:
+  %v5 = load <1 x i64>* %v4, align 8
+  %v12 = bitcast <1 x i64> %v5 to <4 x i16>
+  %v13 = bitcast <4 x i16> %v12 to x86_mmx
+  %v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18)
+  %v15 = bitcast x86_mmx %v14 to <4 x i16>
+  %v16 = bitcast <4 x i16> %v15 to <1 x i64>
+  %v17 = extractelement <1 x i64> %v16, i32 0
+  %v18 = bitcast i64 %v17 to <2 x i32>
+  %v19 = extractelement <2 x i32> %v18, i32 0
+  %v20 = add i32 %v19, 32
+  ret i32 %v20
+}
+
+define i32 @test1(i32* nocapture readonly %ptr) {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    movd (%[[REG]]), %mm0
+; CHECK-NEXT:    pshufw $232, %mm0, %mm0
+; CHECK-NEXT:    movd %mm0, %eax
+; CHECK-NEXT:    emms
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i32* %ptr, align 4
+  %1 = insertelement <2 x i32> undef, i32 %0, i32 0
+  %2 = insertelement <2 x i32> %1, i32 0, i32 1
+  %3 = bitcast <2 x i32> %2 to x86_mmx
+  %4 = bitcast x86_mmx %3 to i64
+  %5 = bitcast i64 %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to x86_mmx
+  %7 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %6, i8 -24)
+  %8 = bitcast x86_mmx %7 to <4 x i16>
+  %9 = bitcast <4 x i16> %8 to <1 x i64>
+  %10 = extractelement <1 x i64> %9, i32 0
+  %11 = bitcast i64 %10 to <2 x i32>
+  %12 = extractelement <2 x i32> %11, i32 0
+  tail call void @llvm.x86.mmx.emms()
+  ret i32 %12
+}
+
+define i32 @test2(i32* nocapture readonly %ptr) {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:{{.*}} %entry
+; CHECK:    pshufw $232, (%[[REG]]), %mm0
+; CHECK-NEXT:    movd %mm0, %eax
+; CHECK-NEXT:    emms
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast i32* %ptr to x86_mmx*
+  %1 = load x86_mmx* %0, align 8
+  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 -24)
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  %6 = bitcast i64 %5 to <2 x i32>
+  %7 = extractelement <2 x i32> %6, i32 0
+  tail call void @llvm.x86.mmx.emms()
+  ret i32 %7
+}
+
+declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
+declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
index ac02acf..bfefbcf 100644
--- a/test/CodeGen/X86/vec_fabs.ll
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
 
 
 define <2 x double> @fabs_v2f64(<2 x double> %p)
diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll
index 9743f71..a85ae98 100644
--- a/test/CodeGen/X86/vec_fneg.ll
+++ b/test/CodeGen/X86/vec_fneg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse | FileCheck %s
 
 ; FNEG is defined as subtraction from -0.0.
 
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index b72044a..b77a1b5 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -25,8 +25,8 @@ define <4 x float> @t2(<4 x float>* %P) nounwind {
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movaps (%eax), %xmm1
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
 ; CHECK-NEXT:    retl
   %tmp1 = load <4 x float>* %P
   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
@@ -37,9 +37,9 @@ define <4 x float> @t3(<4 x float>* %P) nounwind {
 ; CHECK-LABEL: t3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movaps (%eax), %xmm0
-; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,0]
+; CHECK-NEXT:    movapd (%eax), %xmm0
+; CHECK-NEXT:    xorpd %xmm1, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retl
   %tmp1 = load <4 x float>* %P
   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
@@ -52,8 +52,8 @@ define <4 x float> @t4(<4 x float>* %P) nounwind {
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movaps (%eax), %xmm0
 ; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 ; CHECK-NEXT:    retl
   %tmp1 = load <4 x float>* %P
   %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
@@ -63,7 +63,7 @@ define <4 x float> @t4(<4 x float>* %P) nounwind {
 define <16 x i8> @t5(<16 x i8> %x) nounwind {
 ; CHECK-LABEL: t5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    psrlw $8, %xmm0
 ; CHECK-NEXT:    retl
   %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
   ret <16 x i8> %s
@@ -72,7 +72,7 @@ define <16 x i8> @t5(<16 x i8> %x) nounwind {
 define <16 x i8> @t6(<16 x i8> %x) nounwind {
 ; CHECK-LABEL: t6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    psrlw $8, %xmm0
 ; CHECK-NEXT:    retl
   %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <16 x i8> %s
@@ -86,3 +86,21 @@ define <16 x i8> @t7(<16 x i8> %x) nounwind {
   %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
   ret <16 x i8> %s
 }
+
+define <16 x i8> @t8(<16 x i8> %x) nounwind {
+; CHECK-LABEL: t8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @t9(<16 x i8> %x) nounwind {
+; CHECK-LABEL: t9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 undef, i32 undef>
+  ret <16 x i8> %s
+}
diff --git a/test/CodeGen/X86/vec_insert-mmx.ll b/test/CodeGen/X86/vec_insert-mmx.ll
new file mode 100644
index 0000000..d397d80
--- /dev/null
+++ b/test/CodeGen/X86/vec_insert-mmx.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -mtriple=i686-darwin -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s -check-prefix=X86-64
+
+; This is not an MMX operation; promoted to XMM.
+define x86_mmx @t0(i32 %A) nounwind {
+; X86-32-LABEL: t0:
+; X86-32:       ## BB#0:
+; X86-32:    movd {{[0-9]+}}(%esp), %xmm0
+; X86-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
+; X86-32-NEXT:    movlpd %xmm0, (%esp)
+; X86-32-NEXT:    movq (%esp), %mm0
+; X86-32-NEXT:    addl $12, %esp
+; X86-32-NEXT:    retl
+  %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
+  %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
+  ret x86_mmx %tmp4
+}
+
+define <8 x i8> @t1(i8 zeroext %x) nounwind {
+; X86-32-LABEL: t1:
+; X86-32:       ## BB#0:
+; X86-32-NOT:  movl
+; X86-32-NEXT:    movd {{[0-9]+}}(%esp), %xmm0
+; X86-32-NEXT:    retl
+  %r = insertelement <8 x i8> undef, i8 %x, i32 0
+  ret <8 x i8> %r
+}
+
+; PR2574
+define <2 x float> @t2(<2 x float> %a0) {
+; X86-32-LABEL: t2:
+; X86-32:       ## BB#0:
+; X86-32-NEXT:    xorps %xmm0, %xmm0
+; X86-32-NEXT:    retl
+  %v1 = insertelement <2 x float> %a0, float 0.000000e+00, i32 0
+  %v2 = insertelement <2 x float> %v1, float 0.000000e+00, i32 1
+  ret <2 x float> %v2
+}
+
+@g0 = external global i16
+@g1 = external global <4 x i16>
+
+; PR2562
+define void @t3() {
+; X86-64-LABEL: t3:
+; X86-64:       ## BB#0:
+; X86-64:    pmovzxwd (%rcx)
+; X86-64-NEXT:    movzwl
+; X86-64-NEXT:    pinsrd $0
+; X86-64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; X86-64-NEXT:    movq %xmm0
+; X86-64-NEXT:    retq
+  load i16* @g0
+  load <4 x i16>* @g1
+  insertelement <4 x i16> %2, i16 %1, i32 0
+  store <4 x i16> %3, <4 x i16>* @g1
+  ret void
+}
diff --git a/test/CodeGen/X86/vec_loadsingles.ll b/test/CodeGen/X86/vec_loadsingles.ll
index 8812c4f..fd132a5 100644
--- a/test/CodeGen/X86/vec_loadsingles.ll
+++ b/test/CodeGen/X86/vec_loadsingles.ll
@@ -1,12 +1,145 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movq
-
-define <4 x float> @a(<4 x float> %a, float* nocapture %p) nounwind readonly {
-entry:
-	%tmp1 = load float* %p
-	%vecins = insertelement <4 x float> undef, float %tmp1, i32 0
-	%add.ptr = getelementptr float* %p, i32 1
-	%tmp5 = load float* %add.ptr
-	%vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1
-	ret <4 x float> %vecins7
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=FAST32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=SLOW32
+
+define <4 x float> @merge_2_floats(float* nocapture %p) nounwind readonly {
+  %tmp1 = load float* %p
+  %vecins = insertelement <4 x float> undef, float %tmp1, i32 0
+  %add.ptr = getelementptr float* %p, i32 1
+  %tmp5 = load float* %add.ptr
+  %vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1
+  ret <4 x float> %vecins7
+
+; ALL-LABEL: merge_2_floats
+; ALL: vmovq
+; ALL-NEXT: retq
+}
+
+; Test-case generated due to a crash when trying to treat loading the first
+; two i64s of a <4 x i64> as a load of two i32s.
+define <4 x i64> @merge_2_floats_into_4() {
+  %1 = load i64** undef, align 8
+  %2 = getelementptr inbounds i64* %1, i64 0
+  %3 = load i64* %2
+  %4 = insertelement <4 x i64> undef, i64 %3, i32 0
+  %5 = load i64** undef, align 8
+  %6 = getelementptr inbounds i64* %5, i64 1
+  %7 = load i64* %6
+  %8 = insertelement <4 x i64> %4, i64 %7, i32 1
+  %9 = shufflevector <4 x i64> %8, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i64> %9
+  
+; ALL-LABEL: merge_2_floats_into_4
+; ALL: vmovups
+; ALL-NEXT: retq
+}
+
+define <4 x float> @merge_4_floats(float* %ptr) {
+  %a = load float* %ptr, align 8
+  %vec = insertelement <4 x float> undef, float %a, i32 0
+  %idx1 = getelementptr inbounds float* %ptr, i64 1
+  %b = load float* %idx1, align 8
+  %vec2 = insertelement <4 x float> %vec, float %b, i32 1
+  %idx3 = getelementptr inbounds float* %ptr, i64 2
+  %c = load float* %idx3, align 8
+  %vec4 = insertelement <4 x float> %vec2, float %c, i32 2
+  %idx5 = getelementptr inbounds float* %ptr, i64 3
+  %d = load float* %idx5, align 8
+  %vec6 = insertelement <4 x float> %vec4, float %d, i32 3
+  ret <4 x float> %vec6
+
+; ALL-LABEL: merge_4_floats
+; ALL: vmovups
+; ALL-NEXT: retq
+}
+
+; PR21710 ( http://llvm.org/bugs/show_bug.cgi?id=21710 ) 
+; Make sure that 32-byte vectors are handled efficiently.
+; If the target has slow 32-byte accesses, we should still generate
+; 16-byte loads.
+
+define <8 x float> @merge_8_floats(float* %ptr) {
+  %a = load float* %ptr, align 4
+  %vec = insertelement <8 x float> undef, float %a, i32 0
+  %idx1 = getelementptr inbounds float* %ptr, i64 1
+  %b = load float* %idx1, align 4
+  %vec2 = insertelement <8 x float> %vec, float %b, i32 1
+  %idx3 = getelementptr inbounds float* %ptr, i64 2
+  %c = load float* %idx3, align 4
+  %vec4 = insertelement <8 x float> %vec2, float %c, i32 2
+  %idx5 = getelementptr inbounds float* %ptr, i64 3
+  %d = load float* %idx5, align 4
+  %vec6 = insertelement <8 x float> %vec4, float %d, i32 3
+  %idx7 = getelementptr inbounds float* %ptr, i64 4
+  %e = load float* %idx7, align 4
+  %vec8 = insertelement <8 x float> %vec6, float %e, i32 4
+  %idx9 = getelementptr inbounds float* %ptr, i64 5
+  %f = load float* %idx9, align 4
+  %vec10 = insertelement <8 x float> %vec8, float %f, i32 5
+  %idx11 = getelementptr inbounds float* %ptr, i64 6
+  %g = load float* %idx11, align 4
+  %vec12 = insertelement <8 x float> %vec10, float %g, i32 6
+  %idx13 = getelementptr inbounds float* %ptr, i64 7
+  %h = load float* %idx13, align 4
+  %vec14 = insertelement <8 x float> %vec12, float %h, i32 7
+  ret <8 x float> %vec14
+
+; ALL-LABEL: merge_8_floats
+
+; FAST32: vmovups
+; FAST32-NEXT: retq
+
+; SLOW32: vmovups
+; SLOW32-NEXT: vinsertf128
+; SLOW32-NEXT: retq
+}
+
+define <4 x double> @merge_4_doubles(double* %ptr) {
+  %a = load double* %ptr, align 8
+  %vec = insertelement <4 x double> undef, double %a, i32 0
+  %idx1 = getelementptr inbounds double* %ptr, i64 1
+  %b = load double* %idx1, align 8
+  %vec2 = insertelement <4 x double> %vec, double %b, i32 1
+  %idx3 = getelementptr inbounds double* %ptr, i64 2
+  %c = load double* %idx3, align 8
+  %vec4 = insertelement <4 x double> %vec2, double %c, i32 2
+  %idx5 = getelementptr inbounds double* %ptr, i64 3
+  %d = load double* %idx5, align 8
+  %vec6 = insertelement <4 x double> %vec4, double %d, i32 3
+  ret <4 x double> %vec6
+
+; ALL-LABEL: merge_4_doubles
+; FAST32: vmovups
+; FAST32-NEXT: retq
+
+; SLOW32: vmovups
+; SLOW32-NEXT: vinsertf128
+; SLOW32-NEXT: retq
+}
+
+; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 ) 
+; Recognize and combine consecutive loads even when the
+; first of the combined loads is offset from the base address.
+define <4 x double> @merge_4_doubles_offset(double* %ptr) {
+  %arrayidx4 = getelementptr inbounds double* %ptr, i64 4
+  %arrayidx5 = getelementptr inbounds double* %ptr, i64 5
+  %arrayidx6 = getelementptr inbounds double* %ptr, i64 6
+  %arrayidx7 = getelementptr inbounds double* %ptr, i64 7
+  %e = load double* %arrayidx4, align 8
+  %f = load double* %arrayidx5, align 8
+  %g = load double* %arrayidx6, align 8
+  %h = load double* %arrayidx7, align 8
+  %vecinit4 = insertelement <4 x double> undef, double %e, i32 0
+  %vecinit5 = insertelement <4 x double> %vecinit4, double %f, i32 1
+  %vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2
+  %vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3
+  ret <4 x double> %vecinit7
+
+; ALL-LABEL: merge_4_doubles_offset
+; FAST32: vmovups
+; FAST32-NEXT: retq
+
+; SLOW32: vmovups
+; SLOW32-NEXT: vinsertf128
+; SLOW32-NEXT: retq
 }
 
diff --git a/test/CodeGen/X86/vec_split.ll b/test/CodeGen/X86/vec_split.ll
index bc2c663..1df4cf2 100644
--- a/test/CodeGen/X86/vec_split.ll
+++ b/test/CodeGen/X86/vec_split.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s -check-prefix=SSE4
-; RUN: llc -march=x86-64 -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
-; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
+; RUN: llc -march=x86-64 -mattr=sse4.1 < %s | FileCheck %s -check-prefix=SSE4
+; RUN: llc -march=x86-64 -mattr=avx < %s | FileCheck %s -check-prefix=AVX1
+; RUN: llc -march=x86-64 -mattr=avx2 < %s | FileCheck %s -check-prefix=AVX2
 
 define <16 x i16> @split16(<16 x i16> %a, <16 x i16> %b, <16 x i8> %__mask) {
 ; SSE4-LABEL: split16:
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index 0a3ed7e..e15daaa 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
 
@@ -9,16 +9,14 @@
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
 ; SSE2-LABEL: vsel_float:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: vsel_float:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: vsel_float:
@@ -36,15 +34,26 @@ entry:
 }
 
 define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) {
-; SSE-LABEL: vsel_float2:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movss %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_float2:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_float2:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_float2:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_float2:
 ; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 entry:
   %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -54,16 +63,14 @@ entry:
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
 ; SSE2-LABEL: vsel_4xi8:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: vsel_4xi8:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: vsel_4xi8:
@@ -88,16 +95,16 @@ entry:
 define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
 ; SSE2-LABEL: vsel_4xi16:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: vsel_4xi16:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: vsel_4xi16:
@@ -122,16 +129,16 @@ entry:
 define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
 ; SSE2-LABEL: vsel_i32:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: vsel_i32:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: vsel_i32:
@@ -154,15 +161,26 @@ entry:
 }
 
 define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
-; SSE-LABEL: vsel_double:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movsd %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_double:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_double:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_double:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_double:
 ; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 entry:
   %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
@@ -170,16 +188,32 @@ entry:
 }
 
 define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
-; SSE-LABEL: vsel_i64:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movsd %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: vsel_i64:
-; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-LABEL: vsel_i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: vsel_i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT:    retq
 entry:
   %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
   ret <2 x i64> %vsel
@@ -188,16 +222,20 @@ entry:
 define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
 ; SSE2-LABEL: vsel_8xi16:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
+; SSE2-NEXT:    andps %xmm2, %xmm1
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    orps %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: vsel_8xi16:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    movaps {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
+; SSSE3-NEXT:    andps %xmm2, %xmm1
+; SSSE3-NEXT:    andnps %xmm0, %xmm2
+; SSSE3-NEXT:    orps %xmm1, %xmm2
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: vsel_8xi16:
@@ -217,29 +255,30 @@ entry:
 define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
 ; SSE2-LABEL: vsel_i8:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: vsel_i8:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3],zero,xmm1[5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[8,9,10,11,12,13,14,15]
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: vsel_i8:
 ; SSE41:       # BB#0: # %entry
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pblendvb %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_i8:
 ; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255]
 ; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 entry:
@@ -251,13 +290,27 @@ entry:
 ; AVX256 tests:
 
 define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
-; SSE-LABEL: vsel_float8:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movss %xmm0, %xmm2
-; SSE-NEXT:    movss %xmm1, %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_float8:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_float8:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSSE3-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_float8:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_float8:
 ; AVX:       # BB#0: # %entry
@@ -269,13 +322,27 @@ entry:
 }
 
 define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
-; SSE-LABEL: vsel_i328:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movss %xmm0, %xmm2
-; SSE-NEXT:    movss %xmm1, %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_i328:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_i328:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSSE3-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_i328:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: vsel_i328:
 ; AVX1:       # BB#0: # %entry
@@ -294,21 +361,21 @@ entry:
 define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
 ; SSE2-LABEL: vsel_double8:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movsd %xmm0, %xmm4
-; SSE2-NEXT:    movsd %xmm2, %xmm6
-; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
+; SSE2-NEXT:    movapd %xmm4, %xmm0
 ; SSE2-NEXT:    movaps %xmm5, %xmm1
-; SSE2-NEXT:    movaps %xmm6, %xmm2
+; SSE2-NEXT:    movapd %xmm6, %xmm2
 ; SSE2-NEXT:    movaps %xmm7, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: vsel_double8:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movsd %xmm0, %xmm4
-; SSSE3-NEXT:    movsd %xmm2, %xmm6
-; SSSE3-NEXT:    movaps %xmm4, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
+; SSSE3-NEXT:    movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
+; SSSE3-NEXT:    movapd %xmm4, %xmm0
 ; SSSE3-NEXT:    movaps %xmm5, %xmm1
-; SSSE3-NEXT:    movaps %xmm6, %xmm2
+; SSSE3-NEXT:    movapd %xmm6, %xmm2
 ; SSSE3-NEXT:    movaps %xmm7, %xmm3
 ; SSSE3-NEXT:    retq
 ;
@@ -333,21 +400,21 @@ entry:
 define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
 ; SSE2-LABEL: vsel_i648:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movsd %xmm0, %xmm4
-; SSE2-NEXT:    movsd %xmm2, %xmm6
-; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
+; SSE2-NEXT:    movapd %xmm4, %xmm0
 ; SSE2-NEXT:    movaps %xmm5, %xmm1
-; SSE2-NEXT:    movaps %xmm6, %xmm2
+; SSE2-NEXT:    movapd %xmm6, %xmm2
 ; SSE2-NEXT:    movaps %xmm7, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: vsel_i648:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movsd %xmm0, %xmm4
-; SSSE3-NEXT:    movsd %xmm2, %xmm6
-; SSSE3-NEXT:    movaps %xmm4, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
+; SSSE3-NEXT:    movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
+; SSSE3-NEXT:    movapd %xmm4, %xmm0
 ; SSSE3-NEXT:    movaps %xmm5, %xmm1
-; SSSE3-NEXT:    movaps %xmm6, %xmm2
+; SSSE3-NEXT:    movapd %xmm6, %xmm2
 ; SSSE3-NEXT:    movaps %xmm7, %xmm3
 ; SSSE3-NEXT:    retq
 ;
@@ -376,13 +443,27 @@ entry:
 }
 
 define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
-; SSE-LABEL: vsel_double4:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movsd %xmm0, %xmm2
-; SSE-NEXT:    movsd %xmm1, %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_double4:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    movapd %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_double4:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSSE3-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSSE3-NEXT:    movapd %xmm2, %xmm0
+; SSSE3-NEXT:    movapd %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_double4:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_double4:
 ; AVX:       # BB#0: # %entry
@@ -474,12 +555,25 @@ entry:
 ; If we can figure out a blend has a constant mask, we should emit the
 ; blend instruction with an immediate mask
 define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
-; SSE-LABEL: constant_blendvpd_avx:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movsd %xmm1, %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: constant_blendvpd_avx:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movapd %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: constant_blendvpd_avx:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movapd %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: constant_blendvpd_avx:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1]
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: constant_blendvpd_avx:
 ; AVX:       # BB#0: # %entry
@@ -493,26 +587,22 @@ entry:
 define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
 ; SSE2-LABEL: constant_blendvps_avx:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0]
-; SSE2-NEXT:    andps %xmm4, %xmm2
-; SSE2-NEXT:    movaps {{.*#+}} xmm5 = [0,0,0,4294967295]
-; SSE2-NEXT:    andps %xmm5, %xmm0
-; SSE2-NEXT:    orps %xmm2, %xmm0
-; SSE2-NEXT:    andps %xmm4, %xmm3
-; SSE2-NEXT:    andps %xmm5, %xmm1
-; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: constant_blendvps_avx:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0]
-; SSSE3-NEXT:    andps %xmm4, %xmm2
-; SSSE3-NEXT:    movaps {{.*#+}} xmm5 = [0,0,0,4294967295]
-; SSSE3-NEXT:    andps %xmm5, %xmm0
-; SSSE3-NEXT:    orps %xmm2, %xmm0
-; SSSE3-NEXT:    andps %xmm4, %xmm3
-; SSSE3-NEXT:    andps %xmm5, %xmm1
-; SSSE3-NEXT:    orps %xmm3, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_blendvps_avx:
@@ -533,32 +623,32 @@ entry:
 define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
 ; SSE2-LABEL: constant_pblendvb_avx2:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; SSE2-NEXT:    andps %xmm4, %xmm2
-; SSE2-NEXT:    movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
-; SSE2-NEXT:    andps %xmm5, %xmm0
-; SSE2-NEXT:    orps %xmm2, %xmm0
-; SSE2-NEXT:    andps %xmm4, %xmm3
-; SSE2-NEXT:    andps %xmm5, %xmm1
-; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    movaps {{.*#+}} xmm4 = [0,0,255,0,255,255,255,0,255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    movaps %xmm4, %xmm5
+; SSE2-NEXT:    andnps %xmm2, %xmm5
+; SSE2-NEXT:    andps %xmm4, %xmm0
+; SSE2-NEXT:    orps %xmm5, %xmm0
+; SSE2-NEXT:    andps %xmm4, %xmm1
+; SSE2-NEXT:    andnps %xmm3, %xmm4
+; SSE2-NEXT:    orps %xmm4, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: constant_pblendvb_avx2:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; SSSE3-NEXT:    andps %xmm4, %xmm2
-; SSSE3-NEXT:    movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
-; SSSE3-NEXT:    andps %xmm5, %xmm0
-; SSSE3-NEXT:    orps %xmm2, %xmm0
-; SSSE3-NEXT:    andps %xmm4, %xmm3
-; SSSE3-NEXT:    andps %xmm5, %xmm1
-; SSSE3-NEXT:    orps %xmm3, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,128,3,128,128,128,7,128,128,128,128,128,128,128,128]
+; SSSE3-NEXT:    pshufb %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [128,128,2,128,4,5,6,128,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm5, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
+; SSSE3-NEXT:    pshufb %xmm4, %xmm3
+; SSSE3-NEXT:    pshufb %xmm5, %xmm1
+; SSSE3-NEXT:    por %xmm3, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_pblendvb_avx2:
 ; SSE41:       # BB#0: # %entry
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pblendvb %xmm4, %xmm2
 ; SSE41-NEXT:    pblendvb %xmm1, %xmm3
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -567,14 +657,15 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
 ;
 ; AVX1-LABEL: constant_pblendvb_avx2:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: constant_pblendvb_avx2:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 entry:
@@ -616,7 +707,7 @@ entry:
 define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) {
 ; SSE2-LABEL: blend_shufflevector_8xfloat:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movss %xmm0, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
@@ -625,7 +716,7 @@ define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b)
 ;
 ; SSSE3-LABEL: blend_shufflevector_8xfloat:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movss %xmm0, %xmm2
+; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
 ; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
 ; SSSE3-NEXT:    movaps %xmm2, %xmm0
@@ -650,14 +741,14 @@ entry:
 define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) {
 ; SSE2-LABEL: blend_shufflevector_4xdouble:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movsd %xmm0, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT:    movapd %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: blend_shufflevector_4xdouble:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movsd %xmm0, %xmm2
-; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSSE3-NEXT:    movapd %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: blend_shufflevector_4xdouble:
@@ -677,13 +768,13 @@ entry:
 define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE2-LABEL: blend_shufflevector_4xi64:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movsd %xmm2, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE2-NEXT:    movaps %xmm3, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: blend_shufflevector_4xi64:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movsd %xmm2, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSSE3-NEXT:    movaps %xmm3, %xmm1
 ; SSSE3-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-ctpop.ll b/test/CodeGen/X86/vector-ctpop.ll
new file mode 100644
index 0000000..59d6792
--- /dev/null
+++ b/test/CodeGen/X86/vector-ctpop.ll
@@ -0,0 +1,159 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2                | FileCheck -check-prefix=AVX2 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx  -mattr=-popcnt | FileCheck -check-prefix=AVX1-NOPOPCNT %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -mattr=-popcnt | FileCheck -check-prefix=AVX2-NOPOPCNT %s
+
+; Vector version of:
+; v = v - ((v >> 1) & 0x55555555)
+; v = (v & 0x33333333) + ((v >> 2) & 0x33333333)
+; v = (v + (v >> 4) & 0xF0F0F0F)
+; v = v + (v >> 8)
+; v = v + (v >> 16)
+; v = v + (v >> 32) ; i64 only
+
+define <8 x i32> @test0(<8 x i32> %x) {
+; AVX2-LABEL: @test0
+entry:
+; AVX2:  vpsrld  $1, %ymm
+; AVX2-NEXT:  vpbroadcastd
+; AVX2-NEXT:  vpand
+; AVX2-NEXT:  vpsubd
+; AVX2-NEXT:  vpbroadcastd
+; AVX2-NEXT:  vpand
+; AVX2-NEXT:  vpsrld  $2
+; AVX2-NEXT:  vpand
+; AVX2-NEXT:  vpaddd
+; AVX2-NEXT:  vpsrld  $4
+; AVX2-NEXT:  vpaddd
+; AVX2-NEXT:  vpbroadcastd
+; AVX2-NEXT:	vpand
+; AVX2-NEXT:	vpsrld	$8
+; AVX2-NEXT:	vpaddd
+; AVX2-NEXT:	vpsrld	$16
+; AVX2-NEXT:	vpaddd
+; AVX2-NEXT:	vpbroadcastd
+; AVX2-NEXT:	vpand
+  %y = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %x)
+  ret <8 x i32> %y
+}
+
+define <4 x i64> @test1(<4 x i64> %x) {
+; AVX2-NOPOPCNT-LABEL: @test1
+entry:
+;	AVX2-NOPOPCNT: vpsrlq	$1, %ymm
+;	AVX2-NOPOPCNT-NEXT: vpbroadcastq
+;	AVX2-NOPOPCNT-NEXT: vpand
+;	AVX2-NOPOPCNT-NEXT: vpsubq
+;	AVX2-NOPOPCNT-NEXT: vpbroadcastq
+;	AVX2-NOPOPCNT-NEXT: vpand
+;	AVX2-NOPOPCNT-NEXT: vpsrlq	$2
+;	AVX2-NOPOPCNT-NEXT: vpand
+;	AVX2-NOPOPCNT-NEXT: vpaddq
+;	AVX2-NOPOPCNT-NEXT: vpsrlq	$4
+;	AVX2-NOPOPCNT-NEXT: vpaddq
+;	AVX2-NOPOPCNT-NEXT: vpbroadcastq
+;	AVX2-NOPOPCNT-NEXT: vpand
+;	AVX2-NOPOPCNT-NEXT: vpsrlq	$8
+;	AVX2-NOPOPCNT-NEXT: vpaddq
+;	AVX2-NOPOPCNT-NEXT: vpsrlq	$16
+;	AVX2-NOPOPCNT-NEXT: vpaddq
+;	AVX2-NOPOPCNT-NEXT: vpsrlq	$32
+;	AVX2-NOPOPCNT-NEXT: vpaddq
+;	AVX2-NOPOPCNT-NEXT: vpbroadcastq
+;	AVX2-NOPOPCNT-NEXT: vpand
+  %y = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %x)
+  ret <4 x i64> %y
+}
+
+define <4 x i32> @test2(<4 x i32> %x) {
+; AVX2-NOPOPCNT-LABEL: @test2
+; AVX1-NOPOPCNT-LABEL: @test2
+entry:
+; AVX2-NOPOPCNT:	vpsrld	$1, %xmm
+; AVX2-NOPOPCNT-NEXT:	vpbroadcastd
+; AVX2-NOPOPCNT-NEXT:	vpand
+; AVX2-NOPOPCNT-NEXT:	vpsubd
+; AVX2-NOPOPCNT-NEXT:	vpbroadcastd
+; AVX2-NOPOPCNT-NEXT:	vpand
+; AVX2-NOPOPCNT-NEXT:	vpsrld	$2
+; AVX2-NOPOPCNT-NEXT:	vpand
+; AVX2-NOPOPCNT-NEXT:	vpaddd
+; AVX2-NOPOPCNT-NEXT:	vpsrld	$4
+; AVX2-NOPOPCNT-NEXT:	vpaddd
+; AVX2-NOPOPCNT-NEXT:	vpbroadcastd
+; AVX2-NOPOPCNT-NEXT:	vpand
+; AVX2-NOPOPCNT-NEXT:	vpsrld	$8
+; AVX2-NOPOPCNT-NEXT:	vpaddd
+; AVX2-NOPOPCNT-NEXT:	vpsrld	$16
+; AVX2-NOPOPCNT-NEXT:	vpaddd
+; AVX2-NOPOPCNT-NEXT:	vpbroadcastd
+; AVX2-NOPOPCNT-NEXT:	vpand
+; AVX1-NOPOPCNT:	vpsrld	$1, %xmm
+; AVX1-NOPOPCNT-NEXT:	vpand
+; AVX1-NOPOPCNT-NEXT:	vpsubd
+; AVX1-NOPOPCNT-NEXT:	vmovdqa
+; AVX1-NOPOPCNT-NEXT:	vpand
+; AVX1-NOPOPCNT-NEXT:	vpsrld	$2
+; AVX1-NOPOPCNT-NEXT:	vpand
+; AVX1-NOPOPCNT-NEXT:	vpaddd
+; AVX1-NOPOPCNT-NEXT:	vpsrld	$4
+; AVX1-NOPOPCNT-NEXT:	vpaddd
+; AVX1-NOPOPCNT-NEXT:	vpand
+; AVX1-NOPOPCNT-NEXT:	vpsrld	$8
+; AVX1-NOPOPCNT-NEXT:	vpaddd
+; AVX1-NOPOPCNT-NEXT:	vpsrld	$16
+; AVX1-NOPOPCNT-NEXT:	vpaddd
+; AVX1-NOPOPCNT-NEXT:	vpand
+  %y = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
+  ret <4 x i32> %y
+}
+
+define <2 x i64> @test3(<2 x i64> %x) {
+; AVX2-NOPOPCNT-LABEL: @test3
+; AVX1-NOPOPCNT-LABEL: @test3
+entry:
+; AVX2-NOPOPCNT:	vpsrlq	$1, %xmm
+; AVX2-NOPOPCNT-NEXT:	vpand
+; AVX2-NOPOPCNT-NEXT:	vpsubq
+; AVX2-NOPOPCNT-NEXT:	vmovdqa
+; AVX2-NOPOPCNT-NEXT:	vpand
+; AVX2-NOPOPCNT-NEXT:	vpsrlq	$2
+; AVX2-NOPOPCNT-NEXT:	vpand
+; AVX2-NOPOPCNT-NEXT:	vpaddq
+; AVX2-NOPOPCNT-NEXT:	vpsrlq	$4
+; AVX2-NOPOPCNT-NEXT:	vpaddq
+; AVX2-NOPOPCNT-NEXT:	vpand
+; AVX2-NOPOPCNT-NEXT:	vpsrlq	$8
+; AVX2-NOPOPCNT-NEXT:	vpaddq
+; AVX2-NOPOPCNT-NEXT:	vpsrlq	$16
+; AVX2-NOPOPCNT-NEXT:	vpaddq
+; AVX2-NOPOPCNT-NEXT:	vpsrlq	$32
+; AVX2-NOPOPCNT-NEXT:	vpaddq
+; AVX2-NOPOPCNT-NEXT:	vpand
+; AVX1-NOPOPCNT:	vpsrlq	$1, %xmm
+; AVX1-NOPOPCNT-NEXT:	vpand
+; AVX1-NOPOPCNT-NEXT:	vpsubq
+; AVX1-NOPOPCNT-NEXT:	vmovdqa
+; AVX1-NOPOPCNT-NEXT:	vpand
+; AVX1-NOPOPCNT-NEXT:	vpsrlq	$2
+; AVX1-NOPOPCNT-NEXT:	vpand
+; AVX1-NOPOPCNT-NEXT:	vpaddq
+; AVX1-NOPOPCNT-NEXT:	vpsrlq	$4
+; AVX1-NOPOPCNT-NEXT:	vpaddq
+; AVX1-NOPOPCNT-NEXT:	vpand
+; AVX1-NOPOPCNT-NEXT:	vpsrlq	$8
+; AVX1-NOPOPCNT-NEXT:	vpaddq
+; AVX1-NOPOPCNT-NEXT:	vpsrlq	$16
+; AVX1-NOPOPCNT-NEXT:	vpaddq
+; AVX1-NOPOPCNT-NEXT:	vpsrlq	$32
+; AVX1-NOPOPCNT-NEXT:	vpaddq
+; AVX1-NOPOPCNT-NEXT:	vpand
+  %y = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)
+  ret <2 x i64> %y
+}
+
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
+
+declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
+declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)
+
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index 4b269dc..06ce543 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -8,16 +8,15 @@ define <4 x i32> @test1(<4 x i32> %a) {
 ; SSE41-LABEL: test1:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    pmuludq %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE41-NEXT:    psubd %xmm2, %xmm0
+; SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; SSE41-NEXT:    pmuludq %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT:    psubd %xmm1, %xmm0
 ; SSE41-NEXT:    psrld $1, %xmm0
-; SSE41-NEXT:    paddd %xmm2, %xmm0
+; SSE41-NEXT:    paddd %xmm1, %xmm0
 ; SSE41-NEXT:    psrld $2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -26,11 +25,12 @@ define <4 x i32> @test1(<4 x i32> %a) {
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; SSE-NEXT:    pmuludq %xmm1, %xmm3
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE-NEXT:    psubd %xmm2, %xmm0
 ; SSE-NEXT:    psrld $1, %xmm0
 ; SSE-NEXT:    paddd %xmm2, %xmm0
@@ -40,12 +40,12 @@ define <4 x i32> @test1(<4 x i32> %a) {
 ; AVX-LABEL: test1:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; AVX-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
 ; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrld $1, %xmm0, %xmm0
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -59,22 +59,22 @@ define <8 x i32> @test2(<8 x i32> %a) {
 ; SSE41-LABEL: test2:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pmuludq %xmm4, %xmm5
-; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
-; SSE41-NEXT:    psubd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    pmuludq %xmm2, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; SSE41-NEXT:    psubd %xmm5, %xmm0
 ; SSE41-NEXT:    psrld $1, %xmm0
-; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    paddd %xmm5, %xmm0
 ; SSE41-NEXT:    psrld $2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm3, %xmm4
 ; SSE41-NEXT:    pmuludq %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pmuludq %xmm4, %xmm3
-; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
 ; SSE41-NEXT:    psubd %xmm2, %xmm1
 ; SSE41-NEXT:    psrld $1, %xmm1
 ; SSE41-NEXT:    paddd %xmm2, %xmm1
@@ -86,20 +86,22 @@ define <8 x i32> @test2(<8 x i32> %a) {
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
 ; SSE-NEXT:    movdqa %xmm0, %xmm3
 ; SSE-NEXT:    pmuludq %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
 ; SSE-NEXT:    pmuludq %xmm4, %xmm5
-; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
 ; SSE-NEXT:    psubd %xmm3, %xmm0
 ; SSE-NEXT:    psrld $1, %xmm0
 ; SSE-NEXT:    paddd %xmm3, %xmm0
 ; SSE-NEXT:    psrld $2, %xmm0
 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
 ; SSE-NEXT:    pmuludq %xmm4, %xmm3
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSE-NEXT:    psubd %xmm2, %xmm1
 ; SSE-NEXT:    psrld $1, %xmm1
 ; SSE-NEXT:    paddd %xmm2, %xmm1
@@ -822,14 +824,13 @@ define <16 x i8> @test7(<16 x i8> %a) {
 define <4 x i32> @test8(<4 x i32> %a) {
 ; SSE41-LABEL: test8:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pmuldq %xmm2, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE41-NEXT:    pmuldq %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
 ; SSE41-NEXT:    paddd %xmm0, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    psrld $31, %xmm0
@@ -840,22 +841,22 @@ define <4 x i32> @test8(<4 x i32> %a) {
 ;
 ; SSE-LABEL: test8:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    psrad $31, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pmuludq %xmm1, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; SSE-NEXT:    psrad $31, %xmm1
 ; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrad $31, %xmm3
-; SSE-NEXT:    pand %xmm2, %xmm3
-; SSE-NEXT:    paddd %xmm1, %xmm3
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pmuludq %xmm2, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE-NEXT:    pmuludq %xmm2, %xmm4
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE-NEXT:    psubd %xmm3, %xmm1
+; SSE-NEXT:    paddd %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT:    psubd %xmm2, %xmm1
 ; SSE-NEXT:    paddd %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    psrld $31, %xmm0
@@ -867,12 +868,12 @@ define <4 x i32> @test8(<4 x i32> %a) {
 ; AVX-LABEL: test8:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX-NEXT:    vpmuldq %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; AVX-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpsrld $31, %xmm0, %xmm1
 ; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
@@ -885,75 +886,77 @@ define <4 x i32> @test8(<4 x i32> %a) {
 define <8 x i32> @test9(<8 x i32> %a) {
 ; SSE41-LABEL: test9:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT:   # kill: XMM0<def> XMM3<kill>
-; SSE41-NEXT:    pmuldq %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    pmuldq %xmm4, %xmm5
-; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm5[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE41-NEXT:    paddd %xmm3, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    psrld $31, %xmm3
-; SSE41-NEXT:    psrad $2, %xmm0
-; SSE41-NEXT:    paddd %xmm3, %xmm0
-; SSE41-NEXT:    pmuldq %xmm2, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pmuldq %xmm4, %xmm3
-; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE41-NEXT:    paddd %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrld $31, %xmm2
-; SSE41-NEXT:    psrad $2, %xmm1
-; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pmuldq %xmm3, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
+; SSE41-NEXT:    paddd %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psrld $31, %xmm0
+; SSE41-NEXT:    psrad $2, %xmm2
+; SSE41-NEXT:    paddd %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm4, %xmm0
+; SSE41-NEXT:    pmuldq %xmm1, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7]
+; SSE41-NEXT:    paddd %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    psrld $31, %xmm0
+; SSE41-NEXT:    psrad $2, %xmm3
+; SSE41-NEXT:    paddd %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; SSE-LABEL: test9:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT:    movdqa %xmm3, %xmm4
 ; SSE-NEXT:    psrad $31, %xmm4
 ; SSE-NEXT:    movdqa %xmm4, %xmm0
-; SSE-NEXT:    pand %xmm3, %xmm0
-; SSE-NEXT:    movdqa %xmm3, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm2, %xmm5
 ; SSE-NEXT:    psrad $31, %xmm5
-; SSE-NEXT:    pand %xmm1, %xmm5
+; SSE-NEXT:    pand %xmm3, %xmm5
 ; SSE-NEXT:    paddd %xmm0, %xmm5
-; SSE-NEXT:    movdqa %xmm3, %xmm0
-; SSE-NEXT:    pmuludq %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    pmuludq %xmm3, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
 ; SSE-NEXT:    pmuludq %xmm6, %xmm7
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm7[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
 ; SSE-NEXT:    psubd %xmm5, %xmm0
-; SSE-NEXT:    paddd %xmm3, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrld $31, %xmm3
-; SSE-NEXT:    psrad $2, %xmm0
-; SSE-NEXT:    paddd %xmm3, %xmm0
-; SSE-NEXT:    pand %xmm2, %xmm4
-; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    psrad $31, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    paddd %xmm4, %xmm3
-; SSE-NEXT:    pmuludq %xmm2, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE-NEXT:    pmuludq %xmm6, %xmm4
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE-NEXT:    psubd %xmm3, %xmm1
-; SSE-NEXT:    paddd %xmm2, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm2
 ; SSE-NEXT:    psrld $31, %xmm2
-; SSE-NEXT:    psrad $2, %xmm1
-; SSE-NEXT:    paddd %xmm2, %xmm1
+; SSE-NEXT:    psrad $2, %xmm0
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm4
+; SSE-NEXT:    movdqa %xmm1, %xmm5
+; SSE-NEXT:    psrad $31, %xmm5
+; SSE-NEXT:    pand %xmm3, %xmm5
+; SSE-NEXT:    paddd %xmm4, %xmm5
+; SSE-NEXT:    pmuludq %xmm1, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm6, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT:    psubd %xmm5, %xmm2
+; SSE-NEXT:    paddd %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    psrld $31, %xmm1
+; SSE-NEXT:    psrad $2, %xmm2
+; SSE-NEXT:    paddd %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test9:
@@ -978,72 +981,76 @@ define <8 x i32> @test10(<8 x i32> %a) {
 ; SSE41-LABEL: test10:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pmuludq %xmm4, %xmm5
-; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm3, %xmm4
 ; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    psubd %xmm3, %xmm5
-; SSE41-NEXT:    psrld $1, %xmm5
-; SSE41-NEXT:    paddd %xmm3, %xmm5
-; SSE41-NEXT:    psrld $2, %xmm5
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7]
-; SSE41-NEXT:    pmulld %xmm3, %xmm5
-; SSE41-NEXT:    psubd %xmm5, %xmm0
-; SSE41-NEXT:    pmuludq %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pmuludq %xmm4, %xmm5
-; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psubd %xmm2, %xmm4
+; SSE41-NEXT:    pmuludq %xmm2, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psubd %xmm5, %xmm4
 ; SSE41-NEXT:    psrld $1, %xmm4
-; SSE41-NEXT:    paddd %xmm2, %xmm4
+; SSE41-NEXT:    paddd %xmm5, %xmm4
 ; SSE41-NEXT:    psrld $2, %xmm4
-; SSE41-NEXT:    pmulld %xmm3, %xmm4
-; SSE41-NEXT:    psubd %xmm4, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [7,7,7,7]
+; SSE41-NEXT:    pmulld %xmm5, %xmm4
+; SSE41-NEXT:    psubd %xmm4, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm3, %xmm4
+; SSE41-NEXT:    pmuludq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    psubd %xmm2, %xmm3
+; SSE41-NEXT:    psrld $1, %xmm3
+; SSE41-NEXT:    paddd %xmm2, %xmm3
+; SSE41-NEXT:    psrld $2, %xmm3
+; SSE41-NEXT:    pmulld %xmm5, %xmm3
+; SSE41-NEXT:    psubd %xmm3, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; SSE-LABEL: test10:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    pmuludq %xmm2, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pmuludq %xmm3, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
 ; SSE-NEXT:    pmuludq %xmm4, %xmm5
-; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
 ; SSE-NEXT:    movdqa %xmm0, %xmm5
-; SSE-NEXT:    psubd %xmm3, %xmm5
+; SSE-NEXT:    psubd %xmm2, %xmm5
 ; SSE-NEXT:    psrld $1, %xmm5
-; SSE-NEXT:    paddd %xmm3, %xmm5
+; SSE-NEXT:    paddd %xmm2, %xmm5
 ; SSE-NEXT:    psrld $2, %xmm5
-; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7]
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE-NEXT:    pmuludq %xmm3, %xmm5
-; SSE-NEXT:    pmuludq %xmm3, %xmm6
-; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2,1,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm5
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm6
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; SSE-NEXT:    psubd %xmm5, %xmm0
-; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pmuludq %xmm1, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
 ; SSE-NEXT:    pmuludq %xmm4, %xmm5
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
-; SSE-NEXT:    psubd %xmm2, %xmm4
+; SSE-NEXT:    psubd %xmm3, %xmm4
 ; SSE-NEXT:    psrld $1, %xmm4
-; SSE-NEXT:    paddd %xmm2, %xmm4
+; SSE-NEXT:    paddd %xmm3, %xmm4
 ; SSE-NEXT:    psrld $2, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE-NEXT:    pmuludq %xmm3, %xmm4
-; SSE-NEXT:    pmuludq %xmm3, %xmm2
-; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
 ; SSE-NEXT:    psubd %xmm4, %xmm1
 ; SSE-NEXT:    retq
 ;
@@ -1072,32 +1079,32 @@ define <8 x i32> @test11(<8 x i32> %a) {
 ; SSE41-LABEL: test11:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pmuldq %xmm4, %xmm5
-; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
-; SSE41-NEXT:    paddd %xmm0, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm5
-; SSE41-NEXT:    psrld $31, %xmm5
-; SSE41-NEXT:    psrad $2, %xmm3
-; SSE41-NEXT:    paddd %xmm5, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [7,7,7,7]
-; SSE41-NEXT:    pmulld %xmm5, %xmm3
-; SSE41-NEXT:    psubd %xmm3, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    pmuldq %xmm2, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; SSE41-NEXT:    paddd %xmm0, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm4
+; SSE41-NEXT:    psrld $31, %xmm4
+; SSE41-NEXT:    psrad $2, %xmm5
+; SSE41-NEXT:    paddd %xmm4, %xmm5
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7]
+; SSE41-NEXT:    pmulld %xmm4, %xmm5
+; SSE41-NEXT:    psubd %xmm5, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm3, %xmm5
 ; SSE41-NEXT:    pmuldq %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pmuldq %xmm4, %xmm3
-; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
 ; SSE41-NEXT:    paddd %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
 ; SSE41-NEXT:    psrld $31, %xmm3
 ; SSE41-NEXT:    psrad $2, %xmm2
 ; SSE41-NEXT:    paddd %xmm3, %xmm2
-; SSE41-NEXT:    pmulld %xmm5, %xmm2
+; SSE41-NEXT:    pmulld %xmm4, %xmm2
 ; SSE41-NEXT:    psubd %xmm2, %xmm1
 ; SSE41-NEXT:    retq
 ;
@@ -1112,13 +1119,14 @@ define <8 x i32> @test11(<8 x i32> %a) {
 ; SSE-NEXT:    psrad $31, %xmm6
 ; SSE-NEXT:    pand %xmm2, %xmm6
 ; SSE-NEXT:    paddd %xmm4, %xmm6
-; SSE-NEXT:    movdqa %xmm0, %xmm7
-; SSE-NEXT:    pmuludq %xmm2, %xmm7
+; SSE-NEXT:    movdqa %xmm0, %xmm4
+; SSE-NEXT:    pmuludq %xmm2, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[1,3,2,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSE-NEXT:    pmuludq %xmm5, %xmm4
-; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,3],xmm4[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
 ; SSE-NEXT:    psubd %xmm6, %xmm7
 ; SSE-NEXT:    paddd %xmm0, %xmm7
 ; SSE-NEXT:    movdqa %xmm7, %xmm4
@@ -1128,9 +1136,10 @@ define <8 x i32> @test11(<8 x i32> %a) {
 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
 ; SSE-NEXT:    pmuludq %xmm4, %xmm7
+; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
 ; SSE-NEXT:    pmuludq %xmm4, %xmm6
-; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
 ; SSE-NEXT:    psubd %xmm7, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm3
 ; SSE-NEXT:    movdqa %xmm1, %xmm6
@@ -1138,10 +1147,11 @@ define <8 x i32> @test11(<8 x i32> %a) {
 ; SSE-NEXT:    pand %xmm2, %xmm6
 ; SSE-NEXT:    paddd %xmm3, %xmm6
 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
 ; SSE-NEXT:    pmuludq %xmm5, %xmm3
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSE-NEXT:    psubd %xmm6, %xmm2
 ; SSE-NEXT:    paddd %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm2, %xmm3
@@ -1150,9 +1160,10 @@ define <8 x i32> @test11(<8 x i32> %a) {
 ; SSE-NEXT:    paddd %xmm3, %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
 ; SSE-NEXT:    pmuludq %xmm4, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSE-NEXT:    pmuludq %xmm4, %xmm3
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSE-NEXT:    psubd %xmm2, %xmm1
 ; SSE-NEXT:    retq
 ;
@@ -1202,16 +1213,15 @@ define <4 x i32> @PR20355(<4 x i32> %a) {
 ; SSE41-LABEL: PR20355:
 ; SSE41:       # BB#0: # %entry
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm2, %xmm3
 ; SSE41-NEXT:    pmuldq %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pmuldq %xmm2, %xmm1
-; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE41-NEXT:    movaps %xmm0, %xmm1
-; SSE41-NEXT:    psrld $31, %xmm1
-; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $31, %xmm0
+; SSE41-NEXT:    paddd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE-LABEL: PR20355:
@@ -1226,26 +1236,26 @@ define <4 x i32> @PR20355(<4 x i32> %a) {
 ; SSE-NEXT:    paddd %xmm2, %xmm3
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT:    pmuludq %xmm2, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT:    psubd %xmm3, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $31, %xmm1
-; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT:    psubd %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm4, %xmm0
+; SSE-NEXT:    psrld $31, %xmm0
+; SSE-NEXT:    paddd %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: PR20355:
 ; AVX:       # BB#0: # %entry
 ; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
 ; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[1,3],xmm0[1,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
 ; AVX-NEXT:    vpsrld $31, %xmm0, %xmm1
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index 7a329d7..962d038 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -523,64 +523,47 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
 ; SSE2-LABEL: sext_16i8_to_16i16:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movq (%rdi), %xmm0
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psllw $8, %xmm0
 ; SSE2-NEXT:    psraw $8, %xmm0
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    psllw $8, %xmm1
+; SSE2-NEXT:    movq 8(%rdi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_16i16:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movdqa (%rdi), %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    movq (%rdi), %xmm0
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    psllw $8, %xmm0
 ; SSSE3-NEXT:    psraw $8, %xmm0
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT:    psllw $8, %xmm1
+; SSSE3-NEXT:    movq 8(%rdi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    psraw $8, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_16i16:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    movdqa (%rdi), %xmm1
-; SSE41-NEXT:    pmovzxbw %xmm1, %xmm0
-; SSE41-NEXT:    psllw $8, %xmm0
-; SSE41-NEXT:    psraw $8, %xmm0
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    psllw $8, %xmm1
-; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
+; SSE41-NEXT:    pmovsxbw 8(%rdi), %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: sext_16i8_to_16i16:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxbw 8(%rdi), %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: sext_16i8_to_16i16:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxbw (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; X32-SSE41-LABEL: sext_16i8_to_16i16:
 ; X32-SSE41:       # BB#0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT:    movdqa (%eax), %xmm1
-; X32-SSE41-NEXT:    pmovzxbw %xmm1, %xmm0
-; X32-SSE41-NEXT:    psllw $8, %xmm0
-; X32-SSE41-NEXT:    psraw $8, %xmm0
-; X32-SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-SSE41-NEXT:    psllw $8, %xmm1
-; X32-SSE41-NEXT:    psraw $8, %xmm1
+; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovsxbw 8(%eax), %xmm1
 ; X32-SSE41-NEXT:    retl
 entry:
  %X = load <16 x i8>* %ptr
@@ -706,73 +689,36 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
 define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
 ; SSE2-LABEL: load_sext_4i8_to_4i64:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movd (%rdi), %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE2-NEXT:    movd %xmm2, %rax
-; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    movsbq 1(%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    movsbq (%rdi), %rax
 ; SSE2-NEXT:    movd %rax, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT:    movd %xmm2, %rax
-; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movsbq 3(%rdi), %rax
 ; SSE2-NEXT:    movd %rax, %xmm2
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSE2-NEXT:    movd %xmm2, %rax
-; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    movsbq 2(%rdi), %rax
 ; SSE2-NEXT:    movd %rax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT:    movd %xmm2, %rax
-; SSE2-NEXT:    movsbq %al, %rax
-; SSE2-NEXT:    movd %rax, %xmm2
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_sext_4i8_to_4i64:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movd (%rdi), %xmm1
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSSE3-NEXT:    movd %xmm2, %rax
-; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    movsbq 1(%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    movsbq (%rdi), %rax
 ; SSSE3-NEXT:    movd %rax, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT:    movd %xmm2, %rax
-; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movsbq 3(%rdi), %rax
 ; SSSE3-NEXT:    movd %rax, %xmm2
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSSE3-NEXT:    movd %xmm2, %rax
-; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    movsbq 2(%rdi), %rax
 ; SSSE3-NEXT:    movd %rax, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT:    movd %xmm2, %rax
-; SSSE3-NEXT:    movsbq %al, %rax
-; SSSE3-NEXT:    movd %rax, %xmm2
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_sext_4i8_to_4i64:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
-; SSE41-NEXT:    pmovzxdq %xmm1, %xmm0
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    movsbq %al, %rax
-; SSE41-NEXT:    movd %rax, %xmm2
-; SSE41-NEXT:    movd %xmm0, %rax
-; SSE41-NEXT:    movsbq %al, %rax
-; SSE41-NEXT:    movd %rax, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
-; SSE41-NEXT:    movsbq %al, %rax
-; SSE41-NEXT:    movd %rax, %xmm2
-; SSE41-NEXT:    movd %xmm1, %rax
-; SSE41-NEXT:    movsbq %al, %rax
-; SSE41-NEXT:    movd %rax, %xmm1
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: load_sext_4i8_to_4i64:
@@ -792,30 +738,8 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
 ; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
 ; X32-SSE41:       # BB#0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT:    movd (%eax), %xmm0
-; X32-SSE41-NEXT:    pmovzxbd %xmm0, %xmm1
-; X32-SSE41-NEXT:    pmovzxbq %xmm0, %xmm2
-; X32-SSE41-NEXT:    movd %xmm2, %eax
-; X32-SSE41-NEXT:    movsbl %al, %eax
-; X32-SSE41-NEXT:    movd %eax, %xmm0
-; X32-SSE41-NEXT:    sarl $31, %eax
-; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm0
-; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT:    movsbl %al, %eax
-; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT:    sarl $31, %eax
-; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; X32-SSE41-NEXT:    movd %xmm2, %eax
-; X32-SSE41-NEXT:    movsbl %al, %eax
-; X32-SSE41-NEXT:    movd %eax, %xmm1
-; X32-SSE41-NEXT:    sarl $31, %eax
-; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT:    movsbl %al, %eax
-; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm1
-; X32-SSE41-NEXT:    sarl $31, %eax
-; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
 ; X32-SSE41-NEXT:    retl
 entry:
  %X = load <4 x i8>* %ptr
@@ -826,72 +750,36 @@ entry:
 define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
 ; SSE2-LABEL: load_sext_4i16_to_4i64:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movq (%rdi), %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE2-NEXT:    movd %xmm2, %rax
-; SSE2-NEXT:    movswq %ax, %rax
+; SSE2-NEXT:    movswq 2(%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    movswq (%rdi), %rax
 ; SSE2-NEXT:    movd %rax, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT:    movd %xmm2, %rax
-; SSE2-NEXT:    movswq %ax, %rax
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movswq 6(%rdi), %rax
 ; SSE2-NEXT:    movd %rax, %xmm2
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSE2-NEXT:    movd %xmm2, %rax
-; SSE2-NEXT:    movswq %ax, %rax
+; SSE2-NEXT:    movswq 4(%rdi), %rax
 ; SSE2-NEXT:    movd %rax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT:    movd %xmm2, %rax
-; SSE2-NEXT:    movswq %ax, %rax
-; SSE2-NEXT:    movd %rax, %xmm2
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_sext_4i16_to_4i64:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movq (%rdi), %xmm1
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSSE3-NEXT:    movd %xmm2, %rax
-; SSSE3-NEXT:    movswq %ax, %rax
+; SSSE3-NEXT:    movswq 2(%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    movswq (%rdi), %rax
 ; SSSE3-NEXT:    movd %rax, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT:    movd %xmm2, %rax
-; SSSE3-NEXT:    movswq %ax, %rax
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movswq 6(%rdi), %rax
 ; SSSE3-NEXT:    movd %rax, %xmm2
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSSE3-NEXT:    movd %xmm2, %rax
-; SSSE3-NEXT:    movswq %ax, %rax
+; SSSE3-NEXT:    movswq 4(%rdi), %rax
 ; SSSE3-NEXT:    movd %rax, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT:    movd %xmm2, %rax
-; SSSE3-NEXT:    movswq %ax, %rax
-; SSSE3-NEXT:    movd %rax, %xmm2
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_sext_4i16_to_4i64:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    movq (%rdi), %xmm0
-; SSE41-NEXT:    pmovzxwd %xmm0, %xmm1
-; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    movswq %ax, %rax
-; SSE41-NEXT:    movd %rax, %xmm2
-; SSE41-NEXT:    movd %xmm0, %rax
-; SSE41-NEXT:    movswq %ax, %rax
-; SSE41-NEXT:    movd %rax, %xmm0
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
-; SSE41-NEXT:    movswq %ax, %rax
-; SSE41-NEXT:    movd %rax, %xmm2
-; SSE41-NEXT:    movd %xmm1, %rax
-; SSE41-NEXT:    movswq %ax, %rax
-; SSE41-NEXT:    movd %rax, %xmm1
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
+; SSE41-NEXT:    pmovsxwq 4(%rdi), %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: load_sext_4i16_to_4i64:
@@ -911,30 +799,8 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
 ; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
 ; X32-SSE41:       # BB#0: # %entry
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT:    movsd (%eax), %xmm0
-; X32-SSE41-NEXT:    pmovzxwd %xmm0, %xmm1
-; X32-SSE41-NEXT:    pmovzxwq %xmm0, %xmm2
-; X32-SSE41-NEXT:    movd %xmm2, %eax
-; X32-SSE41-NEXT:    cwtl
-; X32-SSE41-NEXT:    movd %eax, %xmm0
-; X32-SSE41-NEXT:    sarl $31, %eax
-; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm0
-; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT:    cwtl
-; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT:    sarl $31, %eax
-; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; X32-SSE41-NEXT:    movd %xmm2, %eax
-; X32-SSE41-NEXT:    cwtl
-; X32-SSE41-NEXT:    movd %eax, %xmm1
-; X32-SSE41-NEXT:    sarl $31, %eax
-; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT:    cwtl
-; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm1
-; X32-SSE41-NEXT:    sarl $31, %eax
-; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovsxwq 4(%eax), %xmm1
 ; X32-SSE41-NEXT:    retl
 entry:
  %X = load <4 x i16>* %ptr
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 30ad366..c271622 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
@@ -247,13 +247,34 @@ define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(
 }
 
 define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
-; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; SSE:       # BB#0:
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
 ; AVX1:       # BB#0:
@@ -318,23 +339,20 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(
 ;
 ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
   ret <16 x i8> %shuffle
@@ -343,47 +361,181 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(
 define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
 ; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT:    movsd %xmm4, %xmm3
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; SSE2-NEXT:    movsd %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    packuswb %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
   ret <16 x i8> %shuffle
 }
 
+define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
+; SSE2-NEXT:    andps %xmm2, %xmm1
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    orps %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
+; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
+; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 13, i32 30, i32 15>
+  ret <16 x i8> %shuffle
+}
+
 define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
 ; SSE2-LABEL: trunc_v4i32_shuffle:
 ; SSE2:       # BB#0:
@@ -429,12 +581,12 @@ entry:
   ret <16 x i8> %s.16.0
 }
 
-define <16 x i8> @stress_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
+define <16 x i8> @undef_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
 ; There is nothing interesting to check about these instructions other than
 ; that they survive codegen. However, we actually do better and delete all of
 ; them because the result is 'undef'.
 ;
-; ALL-LABEL: stress_test1:
+; ALL-LABEL: undef_test1:
 ; ALL:       # BB#0: # %entry
 ; ALL-NEXT:    retq
 entry:
@@ -460,36 +612,22 @@ define <16 x i8> @PR20540(<8 x i8> %a) {
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: PR20540:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: PR20540:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: PR20540:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i8> %shuffle
@@ -505,28 +643,19 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 ; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 0
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -544,28 +673,19 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 0
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -573,23 +693,11 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 }
 
 define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
-; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movd %edi, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd %edi, %xmm0
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
 ; AVX:       # BB#0:
@@ -612,31 +720,22 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT:    pslld $24, %xmm0
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE41-NEXT:    pslld $24, %xmm0
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 3
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -644,44 +743,24 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 }
 
 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) {
-; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; SSE:       # BB#0:
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
 ; AVX-NEXT:    retq
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef>
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
-; SSE2-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
@@ -868,12 +947,12 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(
 ;
 ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxbq %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <16 x i8> %shuffle
@@ -895,12 +974,12 @@ define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(
 ;
 ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxbq %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <16 x i8> %shuffle
@@ -921,12 +1000,12 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(
 ;
 ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxbd %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
   ret <16 x i8> %shuffle
@@ -949,12 +1028,12 @@ define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(
 ;
 ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxbd %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
   ret <16 x i8> %shuffle
@@ -973,12 +1052,12 @@ define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(
 ;
 ; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxbw %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
   ret <16 x i8> %shuffle
@@ -999,12 +1078,12 @@ define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(
 ;
 ; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxbw %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
   ret <16 x i8> %shuffle
@@ -1016,69 +1095,53 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,1,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,1,4,5,6,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; SSE2-NEXT:    packuswb %xmm0, %xmm4
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7]
-; SSE2-NEXT:    packuswb %xmm0, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pandn %xmm4, %xmm5
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    psrlq $16, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    packuswb %xmm5, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,7]
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
-; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
 ; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
@@ -1098,13 +1161,178 @@ entry:
   ret <16 x i8> %s.2.0
 }
 
-define void @constant_gets_selected() {
-; ALL-LABEL: constant_gets_selected:
-; ALL-NOT movd $0, {{%xmm[0-9]+}}
+define void @constant_gets_selected(<4 x i32>* %ptr1, <4 x i32>* %ptr2) {
+; SSE-LABEL: constant_gets_selected:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movaps %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, (%rsi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: constant_gets_selected:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovaps %xmm0, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX-NEXT:    retq
+entry:
   %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8>
   %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
   %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32>
-  store <4 x i32> %weirder_zero, <4 x i32>* undef, align 16
-  store <4 x i32> zeroinitializer, <4 x i32>* undef, align 16
+  store <4 x i32> %weirder_zero, <4 x i32>* %ptr1, align 16
+  store <4 x i32> zeroinitializer, <4 x i32>* %ptr2, align 16
   ret void
 }
+
+;
+; Shuffle to logical bit shifts
+;
+
+define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllw $8, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; SSE:       # BB#0:
+; SSE-NEXT:    pslld $24, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $56, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $8, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $8, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 undef, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 16, i32 8, i32 9, i32 undef, i32 11, i32 12, i32 13, i32 14>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 undef, i32 16, i32 undef, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 15, i32 16, i32 16>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlq $56, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlq $56, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
+; SSE2-LABEL: PR12412:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR12412:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR12412:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE41-NEXT:    pshufb %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: PR12412:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+entry:
+  %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  ret <16 x i8> %0
+}
+
+define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(<16 x i8> %a) {
+; SSE-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrld $8, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrld $8, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16>
+  ret <16 x i8> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 9affee9..7214803 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
@@ -105,22 +105,22 @@ define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
 ;
 ; SSE3-LABEL: shuffle_v2f64_00:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2f64_00:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2f64_00:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_00:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
   ret <2 x double> %shuffle
@@ -160,25 +160,22 @@ define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
 ;
 ; SSE3-LABEL: shuffle_v2f64_22:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2f64_22:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2f64_22:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_22:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
   ret <2 x double> %shuffle
@@ -214,20 +211,20 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
 ; SSE2-LABEL: shuffle_v2f64_03:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2f64_03:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    movsd %xmm0, %xmm1
-; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE3-NEXT:    movapd %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2f64_03:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd %xmm0, %xmm1
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2f64_03:
@@ -245,17 +242,17 @@ define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
 ; SSE2-LABEL: shuffle_v2f64_21:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2f64_21:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2f64_21:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2f64_21:
@@ -302,20 +299,20 @@ define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
 define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_03:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_03:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    movsd %xmm0, %xmm1
-; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE3-NEXT:    movapd %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_03:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd %xmm0, %xmm1
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_03:
@@ -338,20 +335,20 @@ define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
 define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_03_copy:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd %xmm1, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE2-NEXT:    movapd %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_03_copy:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    movsd %xmm1, %xmm2
-; SSE3-NEXT:    movaps %xmm2, %xmm0
+; SSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE3-NEXT:    movapd %xmm2, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_03_copy:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd %xmm1, %xmm2
-; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSSE3-NEXT:    movapd %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_03_copy:
@@ -492,17 +489,17 @@ define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
 define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_21:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_21:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_21:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_21:
@@ -525,20 +522,20 @@ define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
 define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_21_copy:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd %xmm2, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_21_copy:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    movsd %xmm2, %xmm1
-; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE3-NEXT:    movapd %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_21_copy:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd %xmm2, %xmm1
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_21_copy:
@@ -653,12 +650,12 @@ define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
 define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) {
 ; SSE-LABEL: shuffle_v2i64_0z:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq %xmm0, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_0z:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq %xmm0, %xmm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %shuffle
@@ -667,14 +664,12 @@ define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) {
 define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) {
 ; SSE-LABEL: shuffle_v2i64_1z:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_1z:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %shuffle
@@ -683,14 +678,12 @@ define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) {
 define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) {
 ; SSE-LABEL: shuffle_v2i64_z0:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq %xmm0, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_z0:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 0>
   ret <2 x i64> %shuffle
@@ -699,20 +692,20 @@ define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) {
 define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
 ; SSE2-LABEL: shuffle_v2i64_z1:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_z1:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    xorpd %xmm1, %xmm1
+; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_z1:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    xorpd %xmm1, %xmm1
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_z1:
@@ -739,12 +732,12 @@ define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
 define <2 x double> @shuffle_v2f64_0z(<2 x double> %a) {
 ; SSE-LABEL: shuffle_v2f64_0z:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq %xmm0, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_0z:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq %xmm0, %xmm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
   ret <2 x double> %shuffle
@@ -786,20 +779,20 @@ define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) {
 define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
 ; SSE2-LABEL: shuffle_v2f64_z1:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2f64_z1:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    xorpd %xmm1, %xmm1
+; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2f64_z1:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    xorpd %xmm1, %xmm1
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2f64_z1:
@@ -835,12 +828,12 @@ define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) {
 define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) {
 ; SSE-LABEL: insert_mem_and_zero_v2i64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq (%rdi), %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_and_zero_v2i64:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq (%rdi), %xmm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
   %a = load i64* %ptr
   %v = insertelement <2 x i64> undef, i64 %a, i32 0
@@ -851,12 +844,12 @@ define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) {
 define <2 x double> @insert_reg_and_zero_v2f64(double %a) {
 ; SSE-LABEL: insert_reg_and_zero_v2f64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq %xmm0, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_reg_and_zero_v2f64:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq %xmm0, %xmm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX-NEXT:    retq
   %v = insertelement <2 x double> undef, double %a, i32 0
   %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
@@ -866,12 +859,12 @@ define <2 x double> @insert_reg_and_zero_v2f64(double %a) {
 define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) {
 ; SSE-LABEL: insert_mem_and_zero_v2f64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movsd (%rdi), %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_and_zero_v2f64:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovsd (%rdi), %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
   %a = load double* %ptr
   %v = insertelement <2 x double> undef, double %a, i32 0
@@ -883,19 +876,19 @@ define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) {
 ; SSE2-LABEL: insert_reg_lo_v2i64:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    movd %rdi, %xmm1
-; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_reg_lo_v2i64:
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    movd %rdi, %xmm1
-; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_reg_lo_v2i64:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    movd %rdi, %xmm1
-; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: insert_reg_lo_v2i64:
@@ -938,19 +931,19 @@ define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
 ;
 ; SSE41-LABEL: insert_mem_lo_v2i64:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movq (%rdi), %xmm1
+; SSE41-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: insert_mem_lo_v2i64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovq (%rdi), %xmm1
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_mem_lo_v2i64:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovq (%rdi), %xmm1
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX2-NEXT:    retq
   %a = load i64* %ptr
@@ -979,13 +972,13 @@ define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) {
 define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
 ; SSE-LABEL: insert_mem_hi_v2i64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq (%rdi), %xmm1
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_hi_v2i64:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq (%rdi), %xmm1
+; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
   %a = load i64* %ptr
@@ -997,13 +990,13 @@ define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
 define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
 ; SSE-LABEL: insert_reg_lo_v2f64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movsd %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_reg_lo_v2f64:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %v = insertelement <2 x double> undef, double %a, i32 0
   %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
@@ -1068,22 +1061,22 @@ define <2 x double> @insert_dup_reg_v2f64(double %a) {
 ;
 ; SSE3-LABEL: insert_dup_reg_v2f64:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_dup_reg_v2f64:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: insert_dup_reg_v2f64:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_dup_reg_v2f64:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %v = insertelement <2 x double> undef, double %a, i32 0
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
@@ -1092,28 +1085,28 @@ define <2 x double> @insert_dup_reg_v2f64(double %a) {
 define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
 ; SSE2-LABEL: insert_dup_mem_v2f64:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd (%rdi), %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_dup_mem_v2f64:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    movddup (%rdi), %xmm0
+; SSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_dup_mem_v2f64:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movddup (%rdi), %xmm0
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: insert_dup_mem_v2f64:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movddup (%rdi), %xmm0
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_dup_mem_v2f64:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovddup (%rdi), %xmm0
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX-NEXT:    retq
   %a = load double* %ptr
   %v = insertelement <2 x double> undef, double %a, i32 0
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 833b822..a684e5e 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
@@ -322,60 +322,150 @@ define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; SSE41-LABEL: shuffle_v4i32_0124:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v4i32_0124:
-; AVX:       # BB#0:
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v4i32_0124:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i32_0124:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
-; SSE-LABEL: shuffle_v4i32_0142:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v4i32_0142:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v4i32_0142:
-; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; AVX-NEXT:    retq
+; SSE3-LABEL: shuffle_v4i32_0142:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0142:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_0142:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v4i32_0142:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i32_0142:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
-; SSE-LABEL: shuffle_v4i32_0412:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v4i32_0412:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v4i32_0412:
-; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[1,2]
-; AVX-NEXT:    retq
+; SSE3-LABEL: shuffle_v4i32_0412:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0412:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_0412:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v4i32_0412:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i32_0412:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
-; SSE-LABEL: shuffle_v4i32_4012:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v4i32_4012:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v4i32_4012:
-; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,2]
-; AVX-NEXT:    retq
+; SSE3-LABEL: shuffle_v4i32_4012:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_4012:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_4012:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v4i32_4012:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i32_4012:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
   ret <4 x i32> %shuffle
 }
@@ -393,17 +483,44 @@ define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
-; SSE-LABEL: shuffle_v4i32_0451:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v4i32_0451:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v4i32_0451:
-; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
-; AVX-NEXT:    retq
+; SSE3-LABEL: shuffle_v4i32_0451:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0451:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_0451:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v4i32_0451:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i32_0451:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
   ret <4 x i32> %shuffle
 }
@@ -422,17 +539,44 @@ define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
-; SSE-LABEL: shuffle_v4i32_4015:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v4i32_4015:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v4i32_4015:
-; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; AVX-NEXT:    retq
+; SSE3-LABEL: shuffle_v4i32_4015:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_4015:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_4015:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v4i32_4015:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i32_4015:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
   ret <4 x i32> %shuffle
 }
@@ -441,21 +585,21 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
 ; SSE2-LABEL: shuffle_v4f32_4zzz:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4f32_4zzz:
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4f32_4zzz:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -478,22 +622,22 @@ define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
 ; SSE2-LABEL: shuffle_v4f32_z4zz:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4f32_z4zz:
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
-; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4f32_z4zz:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v4f32_z4zz:
@@ -513,24 +657,24 @@ define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
 ; SSE2-LABEL: shuffle_v4f32_zz4z:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4f32_zz4z:
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
 ; SSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4f32_zz4z:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -657,38 +801,204 @@ define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
   ret <4 x float> %shuffle
 }
 
+define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_0z23:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_0z23:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_0z23:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_0z23:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0z23:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_01z3:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_01z3:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_01z3:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_01z3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_01z3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_012z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_012z:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_012z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_012z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_012z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_0zz3:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_0zz3:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_0zz3:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_0zz3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0zz3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: shuffle_v4f32_u051:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_u051:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 5, i32 1>
+  ret <4 x float> %shuffle
+}
+
 define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
 ; SSE2-LABEL: shuffle_v4i32_4zzz:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4i32_4zzz:
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4i32_4zzz:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v4i32_4zzz:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_4zzz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x i32> %shuffle
@@ -698,35 +1008,35 @@ define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
 ; SSE2-LABEL: shuffle_v4i32_z4zz:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4i32_z4zz:
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4i32_z4zz:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v4i32_z4zz:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_z4zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
@@ -737,35 +1047,35 @@ define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
 ; SSE2-LABEL: shuffle_v4i32_zz4z:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4i32_zz4z:
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4i32_zz4z:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v4i32_zz4z:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_zz4z:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
@@ -773,39 +1083,14 @@ define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
 }
 
 define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
-; SSE2-LABEL: shuffle_v4i32_zuu4:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movss %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_v4i32_zuu4:
-; SSE3:       # BB#0:
-; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movss %xmm0, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v4i32_zuu4:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movss %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v4i32_zuu4:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_zuu4:
+; SSE:       # BB#0:
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_zuu4:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,0]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
   ret <4 x i32> %shuffle
@@ -835,13 +1120,24 @@ define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
 ;
 ; SSE41-LABEL: shuffle_v4i32_z6zz:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v4i32_z6zz:
-; AVX:       # BB#0:
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v4i32_z6zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i32_z6zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
   ret <4 x i32> %shuffle
 }
@@ -1007,6 +1303,21 @@ define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
   ret <4 x i32> %shuffle
 }
 
+define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_40u1:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_40u1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1>
+  ret <4 x i32> %shuffle
+}
+
 define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: shuffle_v4i32_3456:
 ; SSE2:       # BB#0:
@@ -1058,12 +1369,12 @@ define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; SSE41-LABEL: shuffle_v4i32_0u1u:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxdq %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_0u1u:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxdq %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
   ret <4 x i32> %shuffle
@@ -1090,17 +1401,179 @@ define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
 ;
 ; SSE41-LABEL: shuffle_v4i32_0z1z:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxdq %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_0z1z:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxdq %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   ret <4 x i32> %shuffle
 }
 
+define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
+; SSE-LABEL: shuffle_v4i32_01zu:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_01zu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 7, i32 undef>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_0z23:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_0z23:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0z23:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_0z23:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v4i32_0z23:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i32_0z23:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_01z3:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_01z3:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_01z3:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_01z3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v4i32_01z3:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i32_01z3:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_012z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_012z:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_012z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_012z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v4i32_012z:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i32_012z:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_0zz3:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_0zz3:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0zz3:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_0zz3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v4i32_0zz3:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i32_0zz3:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
+  ret <4 x i32> %shuffle
+}
+
 define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
 ; SSE-LABEL: insert_reg_and_zero_v4i32:
 ; SSE:       # BB#0:
@@ -1119,12 +1592,12 @@ define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
 define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
 ; SSE-LABEL: insert_mem_and_zero_v4i32:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movd (%rdi), %xmm0
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_and_zero_v4i32:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd (%rdi), %xmm0
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %a = load i32* %ptr
   %v = insertelement <4 x i32> undef, i32 %a, i32 0
@@ -1136,21 +1609,21 @@ define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
 ; SSE2-LABEL: insert_reg_and_zero_v4f32:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_reg_and_zero_v4f32:
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_reg_and_zero_v4f32:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -1163,7 +1636,7 @@ define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
 ; AVX-LABEL: insert_reg_and_zero_v4f32:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %v = insertelement <4 x float> undef, float %a, i32 0
   %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1173,12 +1646,12 @@ define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
 define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
 ; SSE-LABEL: insert_mem_and_zero_v4f32:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movss (%rdi), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_and_zero_v4f32:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovss (%rdi), %xmm0
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %a = load float* %ptr
   %v = insertelement <4 x float> undef, float %a, i32 0
@@ -1190,19 +1663,19 @@ define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
 ; SSE2-LABEL: insert_reg_lo_v4i32:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    movd %rdi, %xmm1
-; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_reg_lo_v4i32:
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    movd %rdi, %xmm1
-; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_reg_lo_v4i32:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    movd %rdi, %xmm1
-; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: insert_reg_lo_v4i32:
@@ -1246,19 +1719,19 @@ define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
 ;
 ; SSE41-LABEL: insert_mem_lo_v4i32:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movq (%rdi), %xmm1
+; SSE41-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: insert_mem_lo_v4i32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovq (%rdi), %xmm1
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_mem_lo_v4i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovq (%rdi), %xmm1
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX2-NEXT:    retq
   %a = load <2 x i32>* %ptr
@@ -1288,13 +1761,13 @@ define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
 define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
 ; SSE-LABEL: insert_mem_hi_v4i32:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq (%rdi), %xmm1
+; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_hi_v4i32:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq (%rdi), %xmm1
+; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
   %a = load <2 x i32>* %ptr
@@ -1306,13 +1779,13 @@ define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
 define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
 ; SSE-LABEL: insert_reg_lo_v4f32:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movsd %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_reg_lo_v4f32:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %a.cast = bitcast double %a to <2 x float>
   %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -1384,3 +1857,35 @@ define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
   %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x float> %shuffle
 }
+
+;
+; Shuffle to logical bit shifts
+;
+
+define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
+; SSE-LABEL: shuffle_v4i32_z0zX:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $32, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_z0zX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) {
+; SSE-LABEL: shuffle_v4i32_1z3z:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlq $32, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_1z3z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
+  ret <4 x i32> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 59af434..eb77c38 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
@@ -952,20 +952,15 @@ define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-LABEL: shuffle_v8i16_109832ba:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_109832ba:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
   ret <8 x i16> %shuffle
@@ -1023,36 +1018,33 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_443aXXXX:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,65535,65535]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,1,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v8i16_443aXXXX:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[4,5,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_443aXXXX:
 ; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_443aXXXX:
 ; AVX:       # BB#0:
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
@@ -1061,34 +1053,37 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_032dXXXX:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v8i16_032dXXXX:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_032dXXXX:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v8i16_032dXXXX:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i16_032dXXXX:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i16_032dXXXX:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
@@ -1109,33 +1104,30 @@ define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_012dXXXX:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v8i16_012dXXXX:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_012dXXXX:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_012dXXXX:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
@@ -1144,41 +1136,37 @@ define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_XXXXcde3:
 ; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
+; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v8i16_XXXXcde3:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm0[6,7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13],zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_XXXXcde3:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: shuffle_v8i16_XXXXcde3:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
   ret <8 x i16> %shuffle
@@ -1187,42 +1175,32 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_cde3XXXX:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v8i16_cde3XXXX:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[6,7,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_cde3XXXX:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: shuffle_v8i16_cde3XXXX:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8i16_cde3XXXX:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
@@ -1230,100 +1208,117 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_012dcde3:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,0,2,4,5,6,7]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v8i16_012dcde3:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,8,9,10,11,12,13],zero,zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[6,7]
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_012dcde3:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
-; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: shuffle_v8i16_012dcde3:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i16_012dcde3:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm2
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
   ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_0923cde7(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_0923cde7:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,65535,0,0,0,65535]
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0923cde7:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,65535,0,0,0,65535]
+; SSSE3-NEXT:    andps %xmm2, %xmm0
+; SSSE3-NEXT:    andnps %xmm1, %xmm2
+; SSSE3-NEXT:    orps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0923cde7:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6],xmm0[7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0923cde7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6],xmm0[7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 12, i32 13, i32 14, i32 7>
+  ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_XXX1X579:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,2,0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v8i16_XXX1X579:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,xmm1[u,u],zero,zero,zero,zero,xmm1[2,3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,u,u,10,11,14,15],zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_XXX1X579:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
-; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; SSE41-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v8i16_XXX1X579:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i16_XXX1X579:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i16_XXX1X579:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
   ret <8 x i16> %shuffle
 }
@@ -1331,42 +1326,40 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,u,u],zero,zero,zero,zero,zero,zero,xmm0[u,u]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[u,u,0,1,4,5,8,9,u,u]
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v8i16_XX4X8acX:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i16_XX4X8acX:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i16_XX4X8acX:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
   ret <8 x i16> %shuffle
 }
@@ -1429,15 +1422,13 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
 define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
 ; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movzwl %di, %eax
-; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movd %edi, %xmm0
 ; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
 ; AVX:       # BB#0:
-; AVX-NEXT:    movzwl %di, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vmovd %edi, %xmm0
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
 ; AVX-NEXT:    retq
   %a = insertelement <8 x i16> undef, i16 %i, i32 0
@@ -1571,20 +1562,10 @@ define <8 x i16> @shuffle_v8i16_u6uu123u(<8 x i16> %a, <8 x i16> %b) {
 }
 
 define <8 x i16> @shuffle_v8i16_uuuu123u(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-LABEL: shuffle_v8i16_uuuu123u:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v8i16_uuuu123u:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v8i16_uuuu123u:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_uuuu123u:
+; SSE:       # BB#0:
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_uuuu123u:
 ; AVX:       # BB#0:
@@ -1701,20 +1682,10 @@ define <8 x i16> @shuffle_v8i16_u456uu1u(<8 x i16> %a, <8 x i16> %b) {
 }
 
 define <8 x i16> @shuffle_v8i16_u456uuuu(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-LABEL: shuffle_v8i16_u456uuuu:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v8i16_u456uuuu:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v8i16_u456uuuu:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_u456uuuu:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_u456uuuu:
 ; AVX:       # BB#0:
@@ -1851,12 +1822,12 @@ define <8 x i16> @shuffle_v8i16_0uuu1uuu(<8 x i16> %a) {
 ;
 ; SSE41-LABEL: shuffle_v8i16_0uuu1uuu:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_0uuu1uuu:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxwq %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
@@ -1879,12 +1850,12 @@ define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) {
 ;
 ; SSE41-LABEL: shuffle_v8i16_0zzz1zzz:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_0zzz1zzz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxwq %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
   ret <8 x i16> %shuffle
@@ -1903,12 +1874,12 @@ define <8 x i16> @shuffle_v8i16_0u1u2u3u(<8 x i16> %a) {
 ;
 ; SSE41-LABEL: shuffle_v8i16_0u1u2u3u:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_0u1u2u3u:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxwd %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
   ret <8 x i16> %shuffle
@@ -1929,13 +1900,254 @@ define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) {
 ;
 ; SSE41-LABEL: shuffle_v8i16_0z1z2z3z:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_0z1z2z3z:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxwd %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
   ret <8 x i16> %shuffle
 }
+
+;
+; Shuffle to logical bit shifts
+;
+define <8 x i16> @shuffle_v8i16_z0z2z4z6(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_z0z2z4z6:
+; SSE:       # BB#0:
+; SSE-NEXT:    pslld $16, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_z0z2z4z6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzz0zzz4(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_zzz0zzz4:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $48, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_zzz0zzz4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 8, i32 8, i32 0, i32 8, i32 8, i32 8, i32 4>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zz01zX4X(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_zz01zX4X:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $32, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_zz01zX4X:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 8, i32 0, i32 1, i32 8, i32 undef, i32 4, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_z0X2z456(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_z0X2z456:
+; SSE:       # BB#0:
+; SSE-NEXT:    psllq $16, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_z0X2z456:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsllq $16, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 undef, i32 2, i32 8, i32 4, i32 5, i32 6>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_1z3zXz7z(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_1z3zXz7z:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_1z3zXz7z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 8, i32 3, i32 8, i32 undef, i32 8, i32 7, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_1X3z567z(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_1X3z567z:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlq $16, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_1X3z567z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlq $16, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 undef, i32 3, i32 8, i32 5, i32 6, i32 7, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_23zz67zz(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_23zz67zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlq $32, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_23zz67zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 2, i32 3, i32 8, i32 8, i32 6, i32 7, i32 8, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_3zXXXzzz(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_3zXXXzzz:
+; SSE:       # BB#0:
+; SSE-NEXT:    psrlq $48, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_3zXXXzzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 3, i32 8, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_01u3zzuz(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_01u3zzuz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_01u3zzuz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 undef, i32 3, i32 8, i32 8, i32 undef, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0z234567(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_0z234567:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0z234567:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0z234567:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0z234567:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0zzzz5z7(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_0zzzz5z7:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0zzzz5z7:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0zzzz5z7:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0zzzz5z7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 5, i32 8, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0123456z(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_0123456z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0123456z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0123456z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0123456z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_fu3ucc5u(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_fu3ucc5u:
+; SSE:       # BB#0:
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
+; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_fu3ucc5u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 15, i32 undef, i32 3, i32 undef, i32 12, i32 12, i32 5, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_8012345u(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_8012345u:
+; SSE:       # BB#0:
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_8012345u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 8, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef>
+
+  ret <8 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 4db0280..d00596d 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
 
 target triple = "x86_64-unknown-unknown"
 
@@ -151,9 +151,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
@@ -175,9 +173,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,6,7,0,1]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
@@ -185,10 +181,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
 ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i16> %shuffle
@@ -199,10 +194,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
@@ -210,10 +202,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
 ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,4,5,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i16> %shuffle
@@ -224,10 +214,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
@@ -235,10 +222,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,6,7,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i16> %shuffle
@@ -248,11 +233,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX1-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
@@ -260,10 +242,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i16> %shuffle
@@ -273,11 +253,8 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX1-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
@@ -285,10 +262,8 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,10,11,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i16> %shuffle
@@ -298,12 +273,8 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX1-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
@@ -311,10 +282,8 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i16> %shuffle
@@ -324,12 +293,8 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX1-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
@@ -337,10 +302,8 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 ; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[14,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i16> %shuffle
@@ -724,18 +687,16 @@ define <16 x i16> @shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_3
 define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16>
   ret <16 x i16> %shuffle
@@ -744,15 +705,13 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_1
 define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
@@ -806,9 +765,8 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1
 ;
 ; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,14,15,12,13,10,11,8,9,u,u,u,u,u,u,u,u,30,31,28,29,26,27,24,25]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15,12,13,10,11,8,9,22,23,20,21,18,19,16,17,30,31,28,29,26,27,24,25]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 27, i32 26, i32 25, i32 24, i32 15, i32 14, i32 13, i32 12>
   ret <16 x i16> %shuffle
@@ -818,13 +776,12 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0
 ; AVX1-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,8,9,4,5,0,1,14,15,10,11,6,7,2,3]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1265,3 +1222,347 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_1
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 12, i32 12>
   ret <16 x i16> %shuffle
 }
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+;
+; Shuffle to logical bit shifts
+;
+
+define <16 x i16> @shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslld $16, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsllq $48, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 16, i32 16, i32 10, i32 11, i32 16, i32 16, i32 14, i32 15, i32 16, i32 16>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 16, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 16, i32 0, i32 17, i32 0, i32 18, i32 0, i32 19, i32 0, i32 20, i32 0, i32 21, i32 0, i32 22, i32 0, i32 23, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm1[30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 00, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm0[30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,31,16,17,18,19,20,21,22,23,24,25,26,27,28,29]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+  ret <16 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 79c906b..ed3c666 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target triple = "x86_64-unknown-unknown"
 
@@ -314,9 +314,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -339,19 +338,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -363,19 +360,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2],zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,2,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -387,19 +382,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,3,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -411,19 +404,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,4,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -435,19 +425,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,5,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -459,19 +446,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[6],zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,6,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -483,19 +467,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[7],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,7,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -516,10 +497,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,8,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -540,10 +519,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,9,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -564,10 +541,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,10,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -588,10 +563,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,11,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -612,10 +585,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,12,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -636,10 +607,8 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,13,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -660,10 +629,8 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -685,15 +652,13 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    movl $15, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm2
-; AVX2-NEXT:    vpxor %ymm3, %ymm3, %ymm3
-; AVX2-NEXT:    vinserti128 $0, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $0, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <32 x i8> %shuffle
@@ -947,16 +912,11 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -972,16 +932,11 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_
 define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -997,20 +952,17 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_
 define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32>
   ret <32 x i8> %shuffle
@@ -1020,17 +972,12 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
 ; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1050,15 +997,15 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_
 ; AVX1-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,8,9,10,11,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,128,128,128,128,128,128,128,128]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1076,23 +1023,22 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_
 ; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,15,14,13,12,11,10,9,8]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u,31,30,29,28,27,26,25,24]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24>
   ret <32 x i8> %shuffle
@@ -1102,15 +1048,12 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_
 ; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,7,6,5,4,3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1520,27 +1463,24 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_
 define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[4,3,u,3,u,u,u,u,u,u,u],zero,xmm3[u,u,u,u]
-; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1],zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
-; AVX1-NEXT:    vpor %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2],zero,xmm5[4,5,6,7,8,9,10],zero,xmm5[12,13,14,15]
-; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[u,u,u,u,1,6,13,u,u],zero,xmm3[u,u]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
-; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,3],zero,zero,zero,zero,xmm0[8,9,10],zero,zero,xmm0[13],zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,u,1,6],zero,zero,xmm2[0],zero,xmm2[11,u],zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
+; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[8,6,u,6,u,u,u,u,u,u,u,15,u,u,u,u]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
-; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[4,5,6,7],zero,zero,zero,xmm1[11,12],zero,xmm1[14,15]
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u],zero,zero,xmm4[u,u,u,u,1,6,13,u,u],zero,xmm4[u,u]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
@@ -1560,3 +1500,461 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39>
   ret <32 x i8> %shuffle
 }
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+;
+; Shuffle to logical bit shifts
+;
+
+define <32 x i8> @shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 0, i32 32, i32 2, i32 32, i32 4, i32 32, i32 6, i32 32, i32 8, i32 32, i32 10, i32 32, i32 12, i32 32, i32 14, i32 32, i32 16, i32 32, i32 18, i32 32, i32 20, i32 32, i32 22, i32 32, i32 24, i32 32, i32 26, i32 32, i32 28, i32 32, i32 30>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslld $16, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 32, i32 32, i32 8, i32 9, i32 32, i32 32, i32 12, i32 13, i32 32, i32 32, i32 16, i32 17, i32 32, i32 32, i32 20, i32 21, i32 32, i32 32, i32 24, i32 25, i32 32, i32 32, i32 28, i32 29>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsllq $48, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 3, i32 32, i32 5, i32 32, i32 7, i32 32, i32 9, i32 32, i32 11, i32 32, i32 13, i32 32, i32 15, i32 32, i32 17, i32 32, i32 19, i32 32, i32 21, i32 32, i32 23, i32 32, i32 25, i32 32, i32 27, i32 32, i32 29, i32 32, i32 31, i32 32>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 2, i32 3, i32 32, i32 32, i32 6, i32 7, i32 32, i32 32, i32 10, i32 11, i32 32, i32 32, i32 14, i32 15, i32 32, i32 32, i32 18, i32 19, i32 32, i32 32, i32 22, i32 23, i32 32, i32 32, i32 26, i32 27, i32 32, i32 32, i32 30, i32 31, i32 32, i32 32>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrlq $56, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $56, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrlq $56, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 23, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 33, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 34, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 35, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 32, i32 0, i32 0, i32 0, i32 33, i32 0, i32 0, i32 0, i32 34, i32 0, i32 0, i32 0, i32 35, i32 0, i32 0, i32 0, i32 36, i32 0, i32 0, i32 0, i32 37, i32 0, i32 0, i32 0, i32 38, i32 0, i32 0, i32 0, i32 39, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 32, i32 0, i32 33, i32 0, i32 34, i32 0, i32 35, i32 0, i32 36, i32 0, i32 37, i32 0, i32 38, i32 0, i32 39, i32 0, i32 40, i32 0, i32 41, i32 0, i32 42, i32 0, i32 43, i32 0, i32 44, i32 0, i32 45, i32 0, i32 46, i32 0, i32 47, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 47, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 63, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 63, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 47, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 undef, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 63, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 00, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm0[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,31,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+  ret <32 x i8> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 0bd1bd9..3d6ada6 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target triple = "x86_64-unknown-unknown"
 
 define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_0000:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -21,7 +21,7 @@ define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_0001:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -38,7 +38,7 @@ define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -70,7 +70,7 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_1000:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -86,7 +86,7 @@ define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_2200:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_2200:
@@ -101,9 +101,8 @@ define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_3330:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_3330:
@@ -141,7 +140,7 @@ define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_0022:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
   ret <4 x double> %shuffle
@@ -186,7 +185,7 @@ define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_0423:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
 ; AVX1-NEXT:    retq
 ;
@@ -202,8 +201,8 @@ define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_0462:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
@@ -300,10 +299,77 @@ define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
   ret <4 x double> %shuffle
 }
 
+define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1054:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_3254:
+; ALL:       # BB#0:
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_3276:
+; ALL:       # BB#0:
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1076:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0415(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0415:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_0415:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_u062(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_u062:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 2>
+  ret <4 x double> %shuffle
+}
+
 define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0000:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -318,7 +384,7 @@ define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0001:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -335,7 +401,7 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -383,7 +449,7 @@ define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_1000:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -399,7 +465,7 @@ define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_2200:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_2200:
@@ -414,9 +480,8 @@ define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_3330:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_3330:
@@ -445,7 +510,7 @@ define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0124:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX1-NEXT:    retq
@@ -483,7 +548,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
 ; AVX1-NEXT:    retq
 ;
@@ -502,7 +567,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
 ; AVX1-NEXT:    retq
@@ -580,9 +645,8 @@ define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX2-LABEL: shuffle_v4i64_2u35:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,1]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5>
   ret <4 x i64> %shuffle
@@ -608,22 +672,135 @@ define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
   ret <4 x i64> %shuffle
 }
 
-define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: stress_test1:
+define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1054:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm0[1,0,3,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_1054:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3254:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_3254:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3276:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_3276:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1076:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_1076:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0415:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0415:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_z4z6(<4 x i64> %a) {
+; AVX1-LABEL: shuffle_v4i64_z4z6:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_z4z6:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 0, i32 4, i32 0, i32 6>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_5zuz(<4 x i64> %a) {
+; AVX1-LABEL: shuffle_v4i64_5zuz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
 ; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: stress_test1:
+; AVX2-LABEL: shuffle_v4i64_5zuz:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[3,1,1,0]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,1,3]
-; AVX2-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 5, i32 0, i32 undef, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_40u2(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_40u2:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_40u2:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: stress_test1:
+; ALL:         retq
   %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
   %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> <i32 3, i32 undef, i32 2, i32 undef>
   %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 undef>
@@ -654,14 +831,14 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
 define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 ; AVX1-LABEL: insert_mem_and_zero_v4i64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovq (%rdi), %xmm0
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_mem_and_zero_v4i64:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovq (%rdi), %xmm0
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    retq
@@ -674,8 +851,8 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; ALL-NEXT:    retq
   %v = insertelement <4 x double> undef, double %a, i32 0
   %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -685,7 +862,7 @@ define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
 ; ALL-LABEL: insert_mem_and_zero_v4f64:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovsd (%rdi), %xmm0
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT:    retq
   %a = load double* %ptr
   %v = insertelement <4 x double> undef, double %a, i32 0
@@ -707,8 +884,7 @@ define <4 x double> @splat_mem_v4f64(double* %ptr) {
 define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
 ; AVX1-LABEL: splat_mem_v4i64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovddup (%rdi), %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splat_mem_v4i64:
@@ -735,7 +911,7 @@ define <4 x double> @splat_mem_v4f64_2(double* %p) {
 define <4 x double> @splat_v4f64(<2 x double> %r) {
 ; AVX1-LABEL: splat_v4f64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index ded8232..f4e9a3b 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
 
 target triple = "x86_64-unknown-unknown"
 
@@ -91,9 +91,8 @@ define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_00500000:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_00500000:
@@ -109,9 +108,8 @@ define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_06000000:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_06000000:
@@ -127,9 +125,8 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_70000000:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_70000000:
@@ -148,7 +145,7 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: shuffle_v8f32_01014545:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
   ret <8 x float> %shuffle
@@ -202,7 +199,7 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX1-NEXT:    retq
@@ -295,11 +292,11 @@ define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_08991abb:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
+; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_08991abb:
@@ -336,7 +333,7 @@ define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_09ab1def:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 ; AVX1-NEXT:    retq
@@ -426,7 +423,7 @@ define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: shuffle_v8f32_00224466:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; ALL-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   ret <8 x float> %shuffle
@@ -444,7 +441,7 @@ define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: shuffle_v8f32_11335577:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; ALL-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
   ret <8 x float> %shuffle
@@ -736,123 +733,106 @@ define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_3210ba98:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_3210ba98:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
-; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_3210ba98:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
   ret <8 x float> %shuffle
 }
 
 define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_3210fedc:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_3210fedc:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_3210fedc:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
   ret <8 x float> %shuffle
 }
 
 define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_7654fedc:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_7654fedc:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
-; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_7654fedc:
+; ALL:       # BB#0:
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
   ret <8 x float> %shuffle
 }
 
 define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_fedc7654:
+; ALL-LABEL: shuffle_v8f32_fedc7654:
+; ALL:       # BB#0:
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
+; AVX1-LABEL: PR21138:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: shuffle_v8f32_fedc7654:
+; AVX2-LABEL: PR21138:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,u,u,1,3,5,7>
 ; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <1,3,5,7,u,u,u,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX2-NEXT:    retq
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
+  %shuffle = shufflevector <8 x float> %truc, <8 x float> %tchose, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   ret <8 x float> %shuffle
 }
 
 define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_ba987654:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_ba987654:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_ba987654:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
   ret <8 x float> %shuffle
 }
 
 define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_ba983210:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_ba983210:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_ba983210:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
   ret <8 x float> %shuffle
 }
 
+define <8 x float> @shuffle_v8f32_80u1c4u5(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_80u1c4u5:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 0, i32 undef, i32 1, i32 12, i32 4, i32 undef, i32 5>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_a2u3e6f7(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_a2u3e6f7:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 10, i32 2, i32 undef, i32 3, i32 14, i32 6, i32 15, i32 7>
+  ret <8 x float> %shuffle
+}
+
 define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_00000000:
 ; AVX1:       # BB#0:
@@ -941,9 +921,8 @@ define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_00500000:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_00500000:
@@ -959,9 +938,8 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_06000000:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_06000000:
@@ -977,9 +955,8 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_70000000:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4]
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_70000000:
@@ -998,7 +975,7 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_01014545:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_01014545:
@@ -1012,8 +989,8 @@ define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_00112233:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1062,7 +1039,7 @@ define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX1-NEXT:    retq
@@ -1117,9 +1094,8 @@ define <8 x i32> @shuffle_v8i32_9832dc76(<8 x i32> %a, <8 x i32> %b) {
 ;
 ; AVX2-LABEL: shuffle_v8i32_9832dc76:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
   ret <8 x i32> %shuffle
@@ -1181,8 +1157,7 @@ define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
 ; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
-; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1192,11 +1167,11 @@ define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_08991abb:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
+; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_08991abb:
@@ -1222,8 +1197,7 @@ define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
 ;
 ; AVX2-LABEL: shuffle_v8i32_091b2d3f:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
-; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
@@ -1233,7 +1207,7 @@ define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_09ab1def:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 ; AVX1-NEXT:    retq
@@ -1363,7 +1337,7 @@ define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_00224466(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_00224466:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX1-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_00224466:
@@ -1391,7 +1365,7 @@ define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_11335577(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_11335577:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX1-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_11335577:
@@ -1789,17 +1763,14 @@ define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_3210ba98:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_3210ba98:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
-; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
   ret <8 x i32> %shuffle
@@ -1808,17 +1779,14 @@ define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_3210fedc:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_3210fedc:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
   ret <8 x i32> %shuffle
@@ -1827,19 +1795,14 @@ define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_7654fedc:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_7654fedc:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
-; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
   ret <8 x i32> %shuffle
@@ -1848,19 +1811,14 @@ define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_fedc7654:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_fedc7654:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
-; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
   ret <8 x i32> %shuffle
@@ -1869,17 +1827,14 @@ define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_ba987654:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_ba987654:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
   ret <8 x i32> %shuffle
@@ -1888,22 +1843,64 @@ define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_ba983210:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_ba983210:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
   ret <8 x i32> %shuffle
 }
 
+define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_zuu8zuuc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_zuu8zuuc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 8, i32 0, i32 undef, i32 undef, i32 12>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_9ubzdefz(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_9ubzdefz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_9ubzdefz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,ymm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 9, i32 undef, i32 11, i32 0, i32 13, i32 14, i32 15, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_80u1b4uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_80u1b4uu:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_80u1b4uu:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 0, i32 undef, i32 1, i32 12, i32 4, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
 define <8 x float> @splat_mem_v8f32_2(float* %p) {
 ; ALL-LABEL: splat_mem_v8f32_2:
 ; ALL:       # BB#0:
@@ -1929,3 +1926,169 @@ define <8 x float> @splat_v8f32(<4 x float> %r) {
   %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
   ret <8 x float> %1
 }
+
+;
+; Shuffle to logical bit shifts
+;
+
+define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_z0U2zUz6:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_z0U2zUz6:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 8, i32 0, i32 undef, i32 2, i32 8, i32 undef, i32 8, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_1U3z5zUU:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_1U3z5zUU:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 undef, i32 3, i32 8, i32 5, i32 8, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_B012F456(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_B012F456:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[1,2],ymm1[4,6],ymm0[5,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_B012F456:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_1238567C(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_1238567C:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_1238567C:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3],ymm0[20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_9AB0DEF4(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_9AB0DEF4:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[3,0],ymm0[4,4],ymm1[7,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm0[2,0],ymm1[5,6],ymm0[6,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_9AB0DEF4:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3],ymm1[20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_389A7CDE(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_389A7CDE:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[1,2],ymm0[4,6],ymm1[5,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_389A7CDE:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 8, i32 9, i32 10, i32 7, i32 12, i32 13, i32 14>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_30127456(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_30127456:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_30127456:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_12305674(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_12305674:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_12305674:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 5, i32 6, i32 7, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
+; ALL-LABEL: concat_v2f32_1:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT:    retq
+entry:
+  %tmp74 = load <2 x float>* %tmp65, align 8
+  %tmp72 = load <2 x float>* %tmp64, align 8
+  %tmp73 = shufflevector <2 x float> %tmp72, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %tmp75 = shufflevector <2 x float> %tmp74, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %tmp76 = shufflevector <8 x float> %tmp73, <8 x float> %tmp75, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %tmp76
+}
+
+define <8x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) {
+; ALL-LABEL: concat_v2f32_2:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT:    retq
+entry:
+  %tmp74 = load <2 x float>* %tmp65, align 8
+  %tmp72 = load <2 x float>* %tmp64, align 8
+  %tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %tmp76
+}
+
+define <8x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) {
+; ALL-LABEL: concat_v2f32_3:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT:    retq
+entry:
+  %tmp74 = load <2 x float>* %tmp65, align 8
+  %tmp72 = load <2 x float>* %tmp64, align 8
+  %tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res = shufflevector <4 x float> %tmp76, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %res
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
new file mode 100644
index 0000000..406d524
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+
+target triple = "x86_64-unknown-unknown"
+
+define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  ret <16 x float> %shuffle
+}
+
+define <16 x i32> @shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  ret <16 x i32> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x float> %shuffle
+}
+
+define <16 x i32> @shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i32> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 8f87c7c..5ddec49 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
 
 target triple = "x86_64-unknown-unknown"
 
@@ -62,9 +62,9 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00500000:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,0]
 ; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
-; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3]
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -75,9 +75,9 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_06000000:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,0]
 ; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
-; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3]
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -88,9 +88,9 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_70000000:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,0,0,0]
 ; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
-; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3]
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -191,15 +191,13 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_9832dc76:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
-; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
 ; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
   ret <8 x double> %shuffle
@@ -208,15 +206,13 @@ define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_9810dc54:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; ALL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,0]
-; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
-; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0]
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm2
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
   ret <8 x double> %shuffle
@@ -274,12 +270,11 @@ define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_08991abb:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm0[1,0,2,2]
-; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[0,2,3,3]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
-; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm1[0,0,1,1]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
   ret <8 x double> %shuffle
@@ -411,9 +406,9 @@ define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00224466:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -566,7 +561,7 @@ define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00226644:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
@@ -622,7 +617,7 @@ define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_002u6u44:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0]
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
@@ -680,7 +675,7 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_uuu3uu66:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
@@ -708,18 +703,17 @@ define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_f511235a:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
-; ALL-NEXT:    vpermpd {{.*#+}} ymm4 = ymm3[0,1,1,3]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3]
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm1[0,0,2,2]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vblendpd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[2,3,1,3]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm4 = ymm1[0,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3]
 ; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3]
 ; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
 ; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
 ; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm3, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
   ret <8 x double> %shuffle
@@ -784,9 +778,9 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_00500000:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0]
 ; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
-; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -797,9 +791,9 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_06000000:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,0,0]
 ; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
-; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -810,9 +804,9 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_70000000:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,0,0,0]
 ; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
-; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -913,15 +907,13 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_9832dc76:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; ALL-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
-; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; ALL-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
-; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
-; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
   ret <8 x i64> %shuffle
@@ -930,15 +922,13 @@ define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_9810dc54:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; ALL-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,0]
-; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; ALL-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
-; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,0]
-; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
-; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; ALL-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
   ret <8 x i64> %shuffle
@@ -996,12 +986,11 @@ define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_08991abb:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7]
-; ALL-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,2,3,3]
-; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
-; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[0,0,1,1]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,3,3]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
   ret <8 x i64> %shuffle
@@ -1418,12 +1407,47 @@ define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
 ; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
 ; ALL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
-; ALL-NEXT:    vpbroadcastq %xmm3, %ymm3
-; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
 ; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
   ret <8 x i64> %shuffle
 }
+
+define <8 x double> @shuffle_v8f64_082a4c6e(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_082a4c6e:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32><i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x double> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_082a4c6e(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_082a4c6e:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32><i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_193b5d7f(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_193b5d7f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <8 x double> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_193b5d7f(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_193b5d7f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <8 x i64> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 22a6749..b99946f 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -275,16 +275,18 @@ define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32
 define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; SSE2-LABEL: combine_bitwise_ops_test1b:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    andps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test1b:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    andps %xmm1, %xmm0
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test1b:
@@ -313,16 +315,18 @@ define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i3
 define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; SSE2-LABEL: combine_bitwise_ops_test2b:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    orps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test2b:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    orps %xmm1, %xmm0
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test2b:
@@ -352,17 +356,13 @@ define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i3
 ; SSE2-LABEL: combine_bitwise_ops_test3b:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm0
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test3b:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test3b:
@@ -394,18 +394,18 @@ define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i3
 define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; SSE2-LABEL: combine_bitwise_ops_test4b:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    andps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test4b:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    andps %xmm1, %xmm0
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test4b:
@@ -434,18 +434,18 @@ define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i3
 define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; SSE2-LABEL: combine_bitwise_ops_test5b:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    orps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test5b:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    orps %xmm1, %xmm0
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test5b:
@@ -475,19 +475,13 @@ define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i3
 ; SSE2-LABEL: combine_bitwise_ops_test6b:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm0
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test6b:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test6b:
@@ -517,17 +511,42 @@ define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i3
 }
 
 define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-; SSE-LABEL: combine_bitwise_ops_test1c:
-; SSE:       # BB#0:
-; SSE-NEXT:    andps %xmm1, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_bitwise_ops_test1c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: combine_bitwise_ops_test1c:
-; AVX:       # BB#0:
-; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; AVX-NEXT:    retq
+; SSSE3-LABEL: combine_bitwise_ops_test1c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test1c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test1c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test1c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %and = and <4 x i32> %shuf1, %shuf2
@@ -535,17 +554,42 @@ define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i3
 }
 
 define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-; SSE-LABEL: combine_bitwise_ops_test2c:
-; SSE:       # BB#0:
-; SSE-NEXT:    orps %xmm1, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_bitwise_ops_test2c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: combine_bitwise_ops_test2c:
-; AVX:       # BB#0:
-; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
-; AVX-NEXT:    retq
+; SSSE3-LABEL: combine_bitwise_ops_test2c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test2c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test2c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test2c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %or = or <4 x i32> %shuf1, %shuf2
@@ -553,18 +597,34 @@ define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i3
 }
 
 define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-; SSE-LABEL: combine_bitwise_ops_test3c:
-; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_bitwise_ops_test3c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test3c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test3c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_bitwise_ops_test3c:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
@@ -573,18 +633,42 @@ define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i3
 }
 
 define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-; SSE-LABEL: combine_bitwise_ops_test4c:
-; SSE:       # BB#0:
-; SSE-NEXT:    andps %xmm1, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_bitwise_ops_test4c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: combine_bitwise_ops_test4c:
-; AVX:       # BB#0:
-; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
-; AVX-NEXT:    retq
+; SSSE3-LABEL: combine_bitwise_ops_test4c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test4c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test4c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test4c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %and = and <4 x i32> %shuf1, %shuf2
@@ -592,18 +676,42 @@ define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i3
 }
 
 define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-; SSE-LABEL: combine_bitwise_ops_test5c:
-; SSE:       # BB#0:
-; SSE-NEXT:    orps %xmm1, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_bitwise_ops_test5c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: combine_bitwise_ops_test5c:
-; AVX:       # BB#0:
-; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
-; AVX-NEXT:    retq
+; SSSE3-LABEL: combine_bitwise_ops_test5c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test5c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test5c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test5c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %or = or <4 x i32> %shuf1, %shuf2
@@ -611,20 +719,45 @@ define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i3
 }
 
 define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-; SSE-LABEL: combine_bitwise_ops_test6c:
-; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_bitwise_ops_test6c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: combine_bitwise_ops_test6c:
-; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3]
-; AVX-NEXT:    retq
+; SSSE3-LABEL: combine_bitwise_ops_test6c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test6c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test6c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test6c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
   %xor = xor <4 x i32> %shuf1, %shuf2
@@ -855,19 +988,40 @@ define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
 ; it.
 
 define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test15:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_nested_undef_test15:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: combine_nested_undef_test15:
-; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[3,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; AVX-NEXT:    retq
+; SSSE3-LABEL: combine_nested_undef_test15:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test15:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   ret <4 x i32> %2
@@ -876,34 +1030,34 @@ define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
 define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
 ; SSE2-LABEL: combine_nested_undef_test16:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_nested_undef_test16:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_nested_undef_test16:
 ; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_nested_undef_test16:
 ; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_nested_undef_test16:
 ; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
 ; AVX2-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
@@ -911,19 +1065,35 @@ define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
 }
 
 define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test17:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_nested_undef_test17:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: combine_nested_undef_test17:
-; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; AVX-NEXT:    retq
+; SSSE3-LABEL: combine_nested_undef_test17:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test17:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test17:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test17:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX2-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   ret <4 x i32> %2
@@ -945,55 +1115,107 @@ define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
 }
 
 define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test19:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_nested_undef_test19:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: combine_nested_undef_test19:
-; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
-; AVX-NEXT:    retq
+; SSSE3-LABEL: combine_nested_undef_test19:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test19:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test19:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test19:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX2-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
   ret <4 x i32> %2
 }
 
 define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test20:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_nested_undef_test20:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: combine_nested_undef_test20:
-; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; AVX-NEXT:    retq
+; SSSE3-LABEL: combine_nested_undef_test20:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test20:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test20:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test20:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
+; AVX2-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   ret <4 x i32> %2
 }
 
 define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test21:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_nested_undef_test21:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: combine_nested_undef_test21:
-; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX-NEXT:    retq
+; SSSE3-LABEL: combine_nested_undef_test21:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test21:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test21:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test21:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
   ret <4 x i32> %2
@@ -1119,20 +1341,10 @@ define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
 }
 
 define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: combine_test1:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: combine_test1:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: combine_test1:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: combine_test1:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test1:
 ; AVX:       # BB#0:
@@ -1146,13 +1358,13 @@ define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_test2:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test2:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -1204,22 +1416,14 @@ define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_test5:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm1, %xmm2
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test5:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movaps %xmm1, %xmm2
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_test5:
@@ -1237,20 +1441,10 @@ define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: combine_test6:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: combine_test6:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: combine_test6:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: combine_test6:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test6:
 ; AVX:       # BB#0:
@@ -1264,13 +1458,13 @@ define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: combine_test7:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test7:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -1327,22 +1521,14 @@ define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: combine_test10:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm1, %xmm2
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test10:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movaps %xmm1, %xmm2
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_test10:
@@ -1376,13 +1562,13 @@ define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_test12:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test12:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -1433,20 +1619,14 @@ define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_test15:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test15:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movaps %xmm0, %xmm2
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_test15:
@@ -1475,13 +1655,13 @@ define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: combine_test17:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test17:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -1537,20 +1717,14 @@ define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: combine_test20:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test20:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movaps %xmm0, %xmm2
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_test20:
@@ -1572,28 +1746,66 @@ define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
   ret <4 x i32> %2
 }
 
+define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
+; SSE-LABEL: combine_test21:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    movdqa %xmm2, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_test21:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test21:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  store <4 x i32> %1, <4 x i32>* %ptr, align 16
+  ret <4 x i32> %2
+}
+
+define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
+; SSE-LABEL: combine_test22:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movhpd (%rsi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test22:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovhpd (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+; Current AVX2 lowering of this is still awful, not adding a test case.
+  %1 = load <2 x float>* %a, align 8
+  %2 = load <2 x float>* %b, align 8
+  %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %3
+}
 
 ; Check some negative cases.
 ; FIXME: Do any of these really make sense? Are they redundant with the above tests?
 
 define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: combine_test1b:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: combine_test1b:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: combine_test1b:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: combine_test1b:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test1b:
 ; AVX:       # BB#0:
@@ -1613,19 +1825,17 @@ define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
 ;
 ; SSSE3-LABEL: combine_test2b:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_test2b:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test2b:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
@@ -1633,21 +1843,28 @@ define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: combine_test3b:
-; SSE:       # BB#0:
-; SSE-NEXT:    movaps %xmm1, %xmm2
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_test3b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test3b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test3b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test3b:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm0[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
@@ -1655,23 +1872,11 @@ define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: combine_test4b:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: combine_test4b:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: combine_test4b:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: combine_test4b:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test4b:
 ; AVX:       # BB#0:
@@ -1688,44 +1893,44 @@ define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
 define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
 ; SSE2-LABEL: combine_test1c:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movd (%rdi), %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movd (%rsi), %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movss %xmm1, %xmm0
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test1c:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd (%rdi), %xmm1
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    movd (%rsi), %xmm0
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    movss %xmm1, %xmm0
+; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_test1c:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
-; SSE41-NEXT:    pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_test1c:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpmovzxbd (%rdi), %xmm0
-; AVX1-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_test1c:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpmovzxbd (%rdi), %xmm0
-; AVX2-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    retq
   %A = load <4 x i8>* %a
@@ -1738,10 +1943,10 @@ define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
 define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
 ; SSE2-LABEL: combine_test2c:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movd (%rdi), %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movd (%rsi), %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1749,10 +1954,10 @@ define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
 ;
 ; SSSE3-LABEL: combine_test2c:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd (%rdi), %xmm0
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    movd (%rsi), %xmm1
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1760,15 +1965,15 @@ define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
 ;
 ; SSE41-LABEL: combine_test2c:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxbd (%rdi), %xmm0
-; SSE41-NEXT:    pmovzxbd (%rsi), %xmm1
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test2c:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxbd (%rdi), %xmm0
-; AVX-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
   %A = load <4 x i8>* %a
@@ -1781,10 +1986,10 @@ define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
 define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
 ; SSE2-LABEL: combine_test3c:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movd (%rdi), %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movd (%rsi), %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
@@ -1792,10 +1997,10 @@ define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
 ;
 ; SSSE3-LABEL: combine_test3c:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd (%rdi), %xmm1
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    movd (%rsi), %xmm0
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
@@ -1803,15 +2008,15 @@ define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
 ;
 ; SSE41-LABEL: combine_test3c:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
-; SSE41-NEXT:    pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; SSE41-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test3c:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovzxbd (%rdi), %xmm0
-; AVX-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; AVX-NEXT:    retq
   %A = load <4 x i8>* %a
@@ -1824,52 +2029,46 @@ define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
 define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
 ; SSE2-LABEL: combine_test4c:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movd (%rdi), %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movd (%rsi), %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test4c:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd (%rdi), %xmm1
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    movd (%rsi), %xmm2
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_test4c:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
-; SSE41-NEXT:    pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_test4c:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpmovzxbd (%rdi), %xmm0
-; AVX1-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_test4c:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpmovzxbd (%rdi), %xmm0
-; AVX2-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX2-NEXT:    retq
   %A = load <4 x i8>* %a
@@ -1912,12 +2111,12 @@ define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
 define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_blend_01:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_blend_01:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_blend_01:
@@ -1937,16 +2136,16 @@ define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_blend_02:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movss %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_blend_02:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movss %xmm1, %xmm0
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_blend_02:
@@ -1966,13 +2165,13 @@ define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_blend_123:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_blend_123:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -2046,12 +2245,12 @@ define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
 define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_undef_input_test1:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_undef_input_test1:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test1:
@@ -2117,14 +2316,14 @@ define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_undef_input_test5:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_undef_input_test5:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd %xmm0, %xmm1
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test5:
@@ -2162,17 +2361,17 @@ define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
 ;
 ; SSSE3-LABEL: combine_undef_input_test7:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test7:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_undef_input_test7:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
@@ -2187,17 +2386,17 @@ define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
 ;
 ; SSSE3-LABEL: combine_undef_input_test8:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test8:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_undef_input_test8:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
@@ -2231,12 +2430,12 @@ define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
 define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_undef_input_test11:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_undef_input_test11:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test11:
@@ -2302,14 +2501,14 @@ define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_undef_input_test15:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_undef_input_test15:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd %xmm0, %xmm1
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test15:
@@ -2353,17 +2552,17 @@ define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
 ;
 ; SSSE3-LABEL: combine_undef_input_test17:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test17:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_undef_input_test17:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
@@ -2378,17 +2577,17 @@ define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
 ;
 ; SSSE3-LABEL: combine_undef_input_test18:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test18:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_undef_input_test18:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
@@ -2463,19 +2662,16 @@ define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_unneeded_subvector2:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
-; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    retq
   %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
@@ -2483,6 +2679,20 @@ define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
 }
 
 define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_insertps1:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_insertps1:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
 ; SSE41-LABEL: combine_insertps1:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
@@ -2499,6 +2709,20 @@ define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_insertps2:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_insertps2:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
 ; SSE41-LABEL: combine_insertps2:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
@@ -2515,6 +2739,18 @@ define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_insertps3:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_insertps3:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT:    retq
+;
 ; SSE41-LABEL: combine_insertps3:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
@@ -2531,6 +2767,18 @@ define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_insertps4:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_insertps4:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT:    retq
+;
 ; SSE41-LABEL: combine_insertps4:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
@@ -2545,3 +2793,115 @@ define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
   ret <4 x float> %d
 }
+
+define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: PR22377:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; SSE-NEXT:    addps %xmm0, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR22377:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+entry:
+  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
+  %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+  %r2 = fadd <4 x float> %s1, %s2
+  %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x float> %s3
+}
+
+define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: PR22390:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT:    addps %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR22390:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
+; SSSE3-NEXT:    movaps %xmm0, %xmm2
+; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSSE3-NEXT:    addps %xmm0, %xmm2
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR22390:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: PR22390:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+  %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r2 = fadd <4 x float> %s1, %s2
+  ret <4 x float> %r2
+}
+
+define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: PR22412:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR22412:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSSE3-NEXT:    movapd %xmm2, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR22412:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    movaps %xmm3, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: PR22412:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: PR22412:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
+  ret <8 x float> %s2
+}
diff --git a/test/CodeGen/X86/vector-shuffle-mmx.ll b/test/CodeGen/X86/vector-shuffle-mmx.ll
new file mode 100644
index 0000000..19608bd
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -0,0 +1,106 @@
+; RUN: llc < %s -mtriple=i686-darwin -mattr=+mmx,+sse2 | FileCheck --check-prefix=X32 %s
+; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse2 | FileCheck --check-prefix=X64 %s
+
+; If there is no explicit MMX type usage, always promote to XMM.
+
+define void @test0(<1 x i64>* %x) {
+; X32-LABEL: test0:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; X32-NEXT:    movlpd %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test0:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; X64-NEXT:    movq %xmm0, (%rdi)
+; X64-NEXT:    retq
+entry:
+  %tmp2 = load <1 x i64>* %x
+  %tmp6 = bitcast <1 x i64> %tmp2 to <2 x i32>
+  %tmp9 = shufflevector <2 x i32> %tmp6, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
+  %tmp10 = bitcast <2 x i32> %tmp9 to <1 x i64>
+  store <1 x i64> %tmp10, <1 x i64>* %x
+  ret void
+}
+
+define void @test1() {
+; X32-LABEL: test1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    pushl %edi
+; X32-NEXT:  Ltmp0:
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    subl $16, %esp
+; X32-NEXT:  Ltmp1:
+; X32-NEXT:    .cfi_def_cfa_offset 24
+; X32-NEXT:  Ltmp2:
+; X32-NEXT:    .cfi_offset %edi, -8
+; X32-NEXT:    xorpd %xmm0, %xmm0
+; X32-NEXT:    movlpd %xmm0, (%esp)
+; X32-NEXT:    movq (%esp), %mm0
+; X32-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
+; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-NEXT:    movlpd %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm1
+; X32-NEXT:    xorl %edi, %edi
+; X32-NEXT:    maskmovq %mm1, %mm0
+; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    popl %edi
+; X32-NEXT:    retl
+;
+; X64-LABEL: test1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    pxor %xmm0, %xmm0
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm1
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    maskmovq %mm1, %mm0
+; X64-NEXT:    retq
+entry:
+  %tmp528 = bitcast <8 x i8> zeroinitializer to <2 x i32>
+  %tmp529 = and <2 x i32> %tmp528, bitcast (<4 x i16> < i16 -32640, i16 16448, i16 8224, i16 4112 > to <2 x i32>)
+  %tmp542 = bitcast <2 x i32> %tmp529 to <4 x i16>
+  %tmp543 = add <4 x i16> %tmp542, < i16 0, i16 16448, i16 24672, i16 28784 >
+  %tmp555 = bitcast <4 x i16> %tmp543 to <8 x i8>
+  %tmp556 = bitcast <8 x i8> %tmp555 to x86_mmx
+  %tmp557 = bitcast <8 x i8> zeroinitializer to x86_mmx
+  tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp557, x86_mmx %tmp556, i8* null)
+  ret void
+}
+
+@tmp_V2i = common global <2 x i32> zeroinitializer
+
+define void @test2() nounwind {
+; X32-LABEL: test2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl L_tmp_V2i$non_lazy_ptr, %eax
+; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-NEXT:    movlpd %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movq _tmp_V2i@{{.*}}(%rip), %rax
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; X64-NEXT:    movq %xmm0, (%rax)
+; X64-NEXT:    retq
+entry:
+  %0 = load <2 x i32>* @tmp_V2i, align 8
+  %1 = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
+  store <2 x i32> %1, <2 x i32>* @tmp_V2i, align 8
+  ret void
+}
+
+declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*)
diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll
index 226deb0..b4cb0ec 100644
--- a/test/CodeGen/X86/vector-shuffle-sse1.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=x86-64 -mattr=-sse2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=SSE1
+; RUN: llc < %s -mcpu=x86-64 -mattr=-sse2 | FileCheck %s --check-prefix=SSE1
 
 target triple = "x86_64-unknown-unknown"
 
@@ -95,7 +95,7 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
 ; SSE1-LABEL: shuffle_v4f32_4zzz:
 ; SSE1:       # BB#0:
 ; SSE1-NEXT:    xorps %xmm1, %xmm1
-; SSE1-NEXT:    movss %xmm0, %xmm1
+; SSE1-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE1-NEXT:    movaps %xmm1, %xmm0
 ; SSE1-NEXT:    retq
   %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
@@ -106,8 +106,8 @@ define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
 ; SSE1-LABEL: shuffle_v4f32_z4zz:
 ; SSE1:       # BB#0:
 ; SSE1-NEXT:    xorps %xmm1, %xmm1
-; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
-; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; SSE1-NEXT:    retq
   %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
   ret <4 x float> %shuffle
@@ -117,8 +117,8 @@ define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
 ; SSE1-LABEL: shuffle_v4f32_zz4z:
 ; SSE1:       # BB#0:
 ; SSE1-NEXT:    xorps %xmm1, %xmm1
-; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
 ; SSE1-NEXT:    movaps %xmm1, %xmm0
 ; SSE1-NEXT:    retq
   %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
@@ -163,7 +163,7 @@ define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
 ; SSE1-LABEL: insert_reg_and_zero_v4f32:
 ; SSE1:       # BB#0:
 ; SSE1-NEXT:    xorps %xmm1, %xmm1
-; SSE1-NEXT:    movss %xmm0, %xmm1
+; SSE1-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE1-NEXT:    movaps %xmm1, %xmm0
 ; SSE1-NEXT:    retq
   %v = insertelement <4 x float> undef, float %a, i32 0
@@ -174,7 +174,7 @@ define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
 define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
 ; SSE1-LABEL: insert_mem_and_zero_v4f32:
 ; SSE1:       # BB#0:
-; SSE1-NEXT:    movss (%rdi), %xmm0
+; SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE1-NEXT:    retq
   %a = load float* %ptr
   %v = insertelement <4 x float> undef, float %a, i32 0
@@ -186,14 +186,14 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
 ; SSE1-LABEL: insert_mem_lo_v4f32:
 ; SSE1:       # BB#0:
 ; SSE1-NEXT:    movq (%rdi), %rax
-; SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%rsp)
+; SSE1-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
 ; SSE1-NEXT:    shrq $32, %rax
 ; SSE1-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
-; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm1
-; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm2
+; SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE1-NEXT:    xorps %xmm2, %xmm2
-; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1]
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE1-NEXT:    movaps %xmm1, %xmm0
 ; SSE1-NEXT:    retq
@@ -207,14 +207,14 @@ define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
 ; SSE1-LABEL: insert_mem_hi_v4f32:
 ; SSE1:       # BB#0:
 ; SSE1-NEXT:    movq (%rdi), %rax
-; SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%rsp)
+; SSE1-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
 ; SSE1-NEXT:    shrq $32, %rax
-; SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%rsp)
-; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm1
-; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm2
+; SSE1-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE1-NEXT:    xorps %xmm2, %xmm2
-; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1]
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
 ; SSE1-NEXT:    retq
   %a = load <2 x float>* %ptr
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
new file mode 100644
index 0000000..a336015
--- /dev/null
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -0,0 +1,223 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+
+define <4 x i32> @trunc2x2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: trunc2x2i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc2x2i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc2x2i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: trunc2x2i64:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX-NEXT:    retq
+
+
+entry:
+  %0 = trunc <2 x i64> %a to <2 x i32>
+  %1 = trunc <2 x i64> %b to <2 x i32>
+  %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %2
+}
+
+define i64 @trunc2i64(<2 x i64> %inval) {
+; SSE-LABEL: trunc2i64:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    movd %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: trunc2i64:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+
+
+entry:
+  %0 = trunc <2 x i64> %inval to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to i64
+  ret i64 %1
+}
+
+define <8 x i16> @trunc2x4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: trunc2x4i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc2x4i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc2x4i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    pshufb %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: trunc2x4i32:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+
+
+
+
+entry:
+  %0 = trunc <4 x i32> %a to <4 x i16>
+  %1 = trunc <4 x i32> %b to <4 x i16>
+  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %2
+}
+
+; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
+define i64 @trunc4i32(<4 x i32> %inval) {
+; SSE2-LABEL: trunc4i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc4i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc4i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: trunc4i32:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+
+
+
+
+entry:
+  %0 = trunc <4 x i32> %inval to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to i64
+  ret i64 %1
+}
+
+define <16 x i8> @trunc2x8i16(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: trunc2x8i16:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc2x8i16:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc2x8i16:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE41-NEXT:    pshufb %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: trunc2x8i16:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+
+
+
+
+entry:
+  %0 = trunc <8 x i16> %a to <8 x i8>
+  %1 = trunc <8 x i16> %b to <8 x i8>
+  %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %2
+}
+
+; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
+define i64 @trunc8i16(<8 x i16> %inval) {
+; SSE2-LABEL: trunc8i16:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc8i16:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc8i16:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: trunc8i16:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    retq
+
+
+
+
+entry:
+  %0 = trunc <8 x i16> %inval to <8 x i8>
+  %1 = bitcast <8 x i8> %0 to i64
+  ret i64 %1
+}
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index afd7a24..568687d 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -7,47 +7,43 @@
 define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: zext_8i16_to_8i32:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    # kill
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pand .LCPI0_0(%rip), %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: zext_8i16_to_8i32:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    # kill
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pand .LCPI0_0(%rip), %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: zext_8i16_to_8i32:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pmovzxwd %xmm0, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pand .LCPI0_0(%rip), %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_8i16_to_8i32:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT:    vpmovzxwd %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_8i16_to_8i32:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpmovzxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    retq
 entry:
   %B = zext <8 x i16> %A to <8 x i32>
@@ -77,7 +73,7 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ;
 ; SSE41-LABEL: zext_4i32_to_4i64:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pmovzxdq %xmm0, %xmm2
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
 ; SSE41-NEXT:    pand %xmm3, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
@@ -89,13 +85,13 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpmovzxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_4i32_to_4i64:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpmovzxdq %xmm0, %ymm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT:    retq
 entry:
   %B = zext <4 x i32> %A to <4 x i64>
@@ -127,7 +123,7 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
 ;
 ; SSE41-LABEL: zext_8i8_to_8i32:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pmovzxwd %xmm0, %xmm2
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
 ; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
@@ -137,7 +133,7 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
 ;
 ; AVX1-LABEL: zext_8i8_to_8i32:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpmovzxwd %xmm0, %xmm1
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
@@ -145,7 +141,7 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
 ;
 ; AVX2-LABEL: zext_8i8_to_8i32:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpmovzxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
@@ -158,49 +154,324 @@ entry:
 define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
 ; SSE2-LABEL: zext_16i8_to_16i16:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    # kill
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    pand .LCPI3_0(%rip), %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: zext_16i8_to_16i16:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    # kill
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    pand .LCPI3_0(%rip), %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: zext_16i8_to_16i16:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pmovzxbw %xmm0, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxbw %xmm1, %xmm0 {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pand .LCPI3_0(%rip), %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_16i8_to_16i16:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT:    vpmovzxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_16i8_to_16i16:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpmovzxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-NEXT:    retq
 entry:
   %t = zext <16 x i8> %z to <16 x i16>
   ret <16 x i16> %t
 }
+
+define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_16i8_to_16i16:
+; SSE2:        # BB#0: # %entry
+; SSE2-NEXT:   movdqa (%rdi), %xmm1
+; SSE2-NEXT:   pxor %xmm2, %xmm2
+; SSE2-NEXT:   movdqa %xmm1, %xmm0
+; SSE2-NEXT:   punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:   punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:   pand .LCPI4_0(%rip), %xmm1
+; SSE2-NEXT:   retq
+;
+; SSSE3-LABEL: load_zext_16i8_to_16i16:
+; SSSE3:        # BB#0: # %entry
+; SSSE3-NEXT:   movdqa (%rdi), %xmm1
+; SSSE3-NEXT:   pxor %xmm2, %xmm2
+; SSSE3-NEXT:   movdqa %xmm1, %xmm0
+; SSSE3-NEXT:   punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT:   punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:   pand .LCPI4_0(%rip), %xmm1
+; SSSE3-NEXT:   retq
+;
+; SSE41-LABEL: load_zext_16i8_to_16i16:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_zext_16i8_to_16i16:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zext_16i8_to_16i16:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX2-NEXT:    retq
+entry:
+ %X = load <16 x i8>* %ptr
+ %Y = zext <16 x i8> %X to <16 x i16>
+ ret <16 x i16> %Y
+}
+
+define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
+; SSE2-LABEL: load_zext_8i16_to_8i32:
+; SSE2:          # BB#0: # %entry
+; SSE2-NEXT:   movdqa (%rdi), %xmm1
+; SSE2-NEXT:   pxor %xmm2, %xmm2
+; SSE2-NEXT:   movdqa %xmm1, %xmm0
+; SSE2-NEXT:   punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:   punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:   pand .LCPI5_0(%rip), %xmm1
+; SSE2-NEXT:   retq
+;
+; SSSE3-LABEL: load_zext_8i16_to_8i32:
+; SSSE3:        # BB#0: # %entry
+; SSSE3-NEXT:   movdqa (%rdi), %xmm1
+; SSSE3-NEXT:   pxor %xmm2, %xmm2
+; SSSE3-NEXT:   movdqa %xmm1, %xmm0
+; SSSE3-NEXT:   punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:   punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:   pand .LCPI5_0(%rip), %xmm1
+; SSSE3-NEXT:   retq
+;
+; SSE41-LABEL: load_zext_8i16_to_8i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_zext_8i16_to_8i32:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zext_8i16_to_8i32:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT:    retq
+entry:
+ %X = load <8 x i16>* %ptr
+ %Y = zext <8 x i16> %X to <8 x i32>
+ ret <8 x i32>%Y
+}
+
+define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) {
+; SSE2-LABEL: load_zext_4i32_to_4i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_zext_4i32_to_4i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_zext_4i32_to_4i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_zext_4i32_to_4i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_zext_4i32_to_4i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-NEXT:    retq
+entry:
+ %X = load <4 x i32>* %ptr
+ %Y = zext <4 x i32> %X to <4 x i64>
+ ret <4 x i64>%Y
+}
+
+define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_8i16_to_8i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    # kill
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_8i16_to_8i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    # kill
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_8i16_to_8i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_8i16_to_8i32:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuf_zext_8i16_to_8i32:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    # kill
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    retq
+entry:
+  %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8>
+  %Z = bitcast <16 x i16> %B to <8 x i32>
+  ret <8 x i32> %Z
+}
+
+define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: shuf_zext_4i32_to_4i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    # kill
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_4i32_to_4i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    # kill
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_4i32_to_4i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_4i32_to_4i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuf_zext_4i32_to_4i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    # kill
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    retq
+entry:
+  %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
+  %Z = bitcast <8 x i32> %B to <4 x i64>
+  ret <4 x i64> %Z
+}
+
+define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
+; SSE2-LABEL: shuf_zext_8i8_to_8i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pand .LCPI9_0(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuf_zext_8i8_to_8i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuf_zext_8i8_to_8i32:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuf_zext_8i8_to_8i32:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    retq
+entry:
+  %B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
+  %Z = bitcast <32 x i8> %B to <8 x i32>
+  ret <8 x i32> %Z
+}
diff --git a/test/CodeGen/X86/vector-zmov.ll b/test/CodeGen/X86/vector-zmov.ll
new file mode 100644
index 0000000..4de2543
--- /dev/null
+++ b/test/CodeGen/X86/vector-zmov.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+define <4 x i32> @load_zmov_4i32_to_0zzz(<4 x i32> *%ptr) {
+; SSE-LABEL:  load_zmov_4i32_to_0zzz:
+; SSE:        # BB#0: # %entry
+; SSE-NEXT:   movd (%rdi), %xmm0
+; SSE-NEXT:   retq
+
+; AVX-LABEL:  load_zmov_4i32_to_0zzz:
+; AVX:        # BB#0: # %entry
+; AVX-NEXT:   vmovd (%rdi), %xmm0
+; AVX-NEXT:   retq
+entry:
+  %X = load <4 x i32>* %ptr
+  %Y = shufflevector <4 x i32> %X, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
+  ret <4 x i32>%Y
+}
+
+define <2 x i64> @load_zmov_2i64_to_0z(<2 x i64> *%ptr) {
+; SSE-LABEL:  load_zmov_2i64_to_0z:
+; SSE:        # BB#0: # %entry
+; SSE-NEXT:   movq (%rdi), %xmm0
+; SSE-NEXT:   retq
+
+; AVX-LABEL:  load_zmov_2i64_to_0z:
+; AVX:        # BB#0: # %entry
+; AVX-NEXT:   vmovq (%rdi), %xmm0
+; AVX-NEXT:   retq
+entry:
+  %X = load <2 x i64>* %ptr
+  %Y = shufflevector <2 x i64> %X, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64>%Y
+}
diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll
index d9f2cb0..c009235 100644
--- a/test/CodeGen/X86/viabs.ll
+++ b/test/CodeGen/X86/viabs.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86-64 -mcpu=x86-64 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s -check-prefix=SSSE3
-; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s -check-prefix=AVX2
-; RUN: llc < %s -march=x86-64 -mcpu=knl | FileCheck %s -check-prefix=AVX512
+; RUN: llc < %s -march=x86-64 -mattr=sse2    | FileCheck %s -check-prefix=SSE2
+; RUN: llc < %s -march=x86-64 -mattr=ssse3   | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -march=x86-64 -mattr=avx2    | FileCheck %s -check-prefix=AVX2
+; RUN: llc < %s -march=x86-64 -mattr=avx512f | FileCheck %s -check-prefix=AVX512
 
 define <4 x i32> @test1(<4 x i32> %a) nounwind {
 ; SSE2-LABEL: test1:
diff --git a/test/CodeGen/X86/vselect-2.ll b/test/CodeGen/X86/vselect-2.ll
index 50da32c..fe4cfba 100644
--- a/test/CodeGen/X86/vselect-2.ll
+++ b/test/CodeGen/X86/vselect-2.ll
@@ -1,33 +1,60 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
 
 define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
+; SSE2-LABEL: test1:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
   %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B
   ret <4 x i32> %select
 }
-; CHECK-LABEL: test1
-; CHECK: movsd
-; CHECK: ret
 
 define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
+; SSE2-LABEL: test2:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test2:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
   %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x i32> %A, <4 x i32> %B
   ret <4 x i32> %select
 }
-; CHECK-LABEL: test2
-; CHECK: movsd
-; CHECK-NEXT: ret
 
 define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
+; SSE2-LABEL: test3:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    retq
   %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
   ret <4 x float> %select
 }
-; CHECK-LABEL: test3
-; CHECK: movsd
-; CHECK: ret
 
 define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
+; SSE2-LABEL: test4:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test4:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
   %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
   ret <4 x float> %select
 }
-; CHECK-LABEL: test4
-; CHECK: movsd
-; CHECK-NEXT: ret
diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll
index 0c0f4bb..02a9ef4 100644
--- a/test/CodeGen/X86/vselect-avx.ll
+++ b/test/CodeGen/X86/vselect-avx.ll
@@ -59,19 +59,15 @@ bb:
 ; 
 ; <rdar://problem/18819506>
 
-; Note: For now, hard code ORIG_MASK and SHRUNK_MASK registers, because we
-; cannot express that ORIG_MASK must not be equal to ORIG_MASK. Otherwise,
-; even a faulty pattern would pass!
-;  
 ; CHECK-LABEL: test3:
-; Compute the original mask.
-;	CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[ORIG_MASK:%xmm0]]
-; Shrink the bit of the mask.
-; CHECK-NEXT: vpslld	$31, [[ORIG_MASK]], [[SHRUNK_MASK:%xmm3]]
-; Use the shrunk mask in the blend.
-; CHECK-NEXT:	vblendvps	[[SHRUNK_MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
-; Use the original mask in the and.
-; CHECK-NEXT: vpand LCPI2_2(%rip), [[ORIG_MASK]], {{%xmm[0-9]+}} 
+; Compute the mask.
+;	CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[MASK:%xmm[0-9]+]]
+; Do not shrink the bit of the mask.
+; CHECK-NOT: vpslld	$31, [[MASK]], {{%xmm[0-9]+}}
+; Use the mask in the blend.
+; CHECK-NEXT:	vblendvps	[[MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; Use the mask in the and.
+; CHECK-NEXT: vpand LCPI2_2(%rip), [[MASK]], {{%xmm[0-9]+}} 
 ; CHECK: retq
 define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,  <4 x i16> %tmp3, <4 x i16> %tmp12) {
   %tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
@@ -83,3 +79,14 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,
   store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8
  ret void
 }
+
+; We shouldn't try to lower this directly using VSELECT because we don't have
+; vpblendvb in AVX1, only in AVX2. Instead, it should be expanded.
+;
+; CHECK-LABEL: PR22706:
+; CHECK: vpcmpgtb
+; CHECK: vpcmpgtb
+define <32 x i8> @PR22706(<32 x i1> %x) {
+  %tmp = select <32 x i1> %x, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+  ret <32 x i8> %tmp
+}
diff --git a/test/CodeGen/X86/vselect-minmax.ll b/test/CodeGen/X86/vselect-minmax.ll
index 25189f2..3efe568 100644
--- a/test/CodeGen/X86/vselect-minmax.ll
+++ b/test/CodeGen/X86/vselect-minmax.ll
@@ -2,6 +2,8 @@
 ; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s -check-prefix=SSE4
 ; RUN: llc -march=x86-64 -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
 ; RUN: llc -march=x86-64 -mcpu=core-avx2 -mattr=+avx2 < %s | FileCheck %s -check-prefix=AVX2
+; RUN: llc -march=x86-64 -mcpu=knl < %s | FileCheck %s  -check-prefix=AVX2 -check-prefix=AVX512F
+; RUN: llc -march=x86-64 -mcpu=skx < %s | FileCheck %s  -check-prefix=AVX512BW -check-prefix=AVX512VL -check-prefix=AVX512F
 
 define void @test1(i8* nocapture %a, i8* nocapture %b) nounwind {
 vector.ph:
@@ -33,6 +35,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test1:
 ; AVX2: vpminsb
+
+; AVX512VL-LABEL: test1:
+; AVX512VL: vpminsb
 }
 
 define void @test2(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -65,6 +70,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test2:
 ; AVX2: vpminsb
+
+; AVX512VL-LABEL: test2:
+; AVX512VL: vpminsb
 }
 
 define void @test3(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -97,6 +105,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test3:
 ; AVX2: vpmaxsb
+
+; AVX512VL-LABEL: test3:
+; AVX512VL: vpmaxsb
 }
 
 define void @test4(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -129,6 +140,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test4:
 ; AVX2: vpmaxsb
+
+; AVX512VL-LABEL: test4:
+; AVX512VL: vpmaxsb
 }
 
 define void @test5(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -161,6 +175,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test5:
 ; AVX2: vpminub
+
+; AVX512VL-LABEL: test5:
+; AVX512VL: vpminub 
 }
 
 define void @test6(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -193,6 +210,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test6:
 ; AVX2: vpminub
+
+; AVX512VL-LABEL: test6:
+; AVX512VL: vpminub
 }
 
 define void @test7(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -225,6 +245,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test7:
 ; AVX2: vpmaxub
+
+; AVX512VL-LABEL: test7:
+; AVX512VL: vpmaxub
 }
 
 define void @test8(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -257,6 +280,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test8:
 ; AVX2: vpmaxub
+
+; AVX512VL-LABEL: test8:
+; AVX512VL: vpmaxub
 }
 
 define void @test9(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -289,6 +315,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test9:
 ; AVX2: vpminsw
+
+; AVX512VL-LABEL: test9:
+; AVX512VL: vpminsw 
 }
 
 define void @test10(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -321,6 +350,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test10:
 ; AVX2: vpminsw
+
+; AVX512VL-LABEL: test10:
+; AVX512VL: vpminsw
 }
 
 define void @test11(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -353,6 +385,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test11:
 ; AVX2: vpmaxsw
+
+; AVX512VL-LABEL: test11:
+; AVX512VL: vpmaxsw
 }
 
 define void @test12(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -385,6 +420,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test12:
 ; AVX2: vpmaxsw
+
+; AVX512VL-LABEL: test12:
+; AVX512VL: vpmaxsw
 }
 
 define void @test13(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -417,6 +455,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test13:
 ; AVX2: vpminuw
+
+; AVX512VL-LABEL: test13:
+; AVX512VL: vpminuw
 }
 
 define void @test14(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -449,6 +490,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test14:
 ; AVX2: vpminuw
+
+; AVX512VL-LABEL: test14:
+; AVX512VL: vpminuw 
 }
 
 define void @test15(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -481,6 +525,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test15:
 ; AVX2: vpmaxuw
+
+; AVX512VL-LABEL: test15:
+; AVX512VL: vpmaxuw
 }
 
 define void @test16(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -513,6 +560,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test16:
 ; AVX2: vpmaxuw
+
+; AVX512VL-LABEL: test16:
+; AVX512VL: vpmaxuw
 }
 
 define void @test17(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -545,6 +595,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test17:
 ; AVX2: vpminsd
+
+; AVX512VL-LABEL: test17:
+; AVX512VL: vpminsd
 }
 
 define void @test18(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -577,6 +630,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test18:
 ; AVX2: vpminsd
+
+; AVX512VL-LABEL: test18:
+; AVX512VL: vpminsd
 }
 
 define void @test19(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -609,6 +665,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test19:
 ; AVX2: vpmaxsd
+
+; AVX512VL-LABEL: test19:
+; AVX512VL: vpmaxsd
 }
 
 define void @test20(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -641,6 +700,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test20:
 ; AVX2: vpmaxsd
+
+; AVX512VL-LABEL: test20:
+; AVX512VL: vpmaxsd
 }
 
 define void @test21(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -673,6 +735,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test21:
 ; AVX2: vpminud
+
+; AVX512VL-LABEL: test21:
+; AVX512VL: vpminud
 }
 
 define void @test22(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -705,6 +770,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test22:
 ; AVX2: vpminud
+
+; AVX512VL-LABEL: test22:
+; AVX512VL: vpminud
 }
 
 define void @test23(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -737,6 +805,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test23:
 ; AVX2: vpmaxud
+
+; AVX512VL-LABEL: test23:
+; AVX512VL: vpmaxud
 }
 
 define void @test24(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -769,6 +840,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test24:
 ; AVX2: vpmaxud
+
+; AVX512VL-LABEL: test24:
+; AVX512VL: vpmaxud
 }
 
 define void @test25(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -795,6 +869,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test25:
 ; AVX2: vpminsb
+
+; AVX512VL-LABEL: test25:
+; AVX512VL: vpminsb
 }
 
 define void @test26(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -821,6 +898,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test26:
 ; AVX2: vpminsb
+
+; AVX512VL-LABEL: test26:
+; AVX512VL: vpminsb
 }
 
 define void @test27(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -847,6 +927,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test27:
 ; AVX2: vpmaxsb
+
+; AVX512VL-LABEL: test27:
+; AVX512VL: vpmaxsb
 }
 
 define void @test28(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -873,6 +956,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test28:
 ; AVX2: vpmaxsb
+
+; AVX512VL-LABEL: test28:
+; AVX512VL: vpmaxsb
 }
 
 define void @test29(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -899,6 +985,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test29:
 ; AVX2: vpminub
+
+; AVX512VL-LABEL: test29:
+; AVX512VL: vpminub
 }
 
 define void @test30(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -925,6 +1014,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test30:
 ; AVX2: vpminub
+
+; AVX512VL-LABEL: test30:
+; AVX512VL: vpminub
 }
 
 define void @test31(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -951,6 +1043,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test31:
 ; AVX2: vpmaxub
+
+; AVX512VL-LABEL: test31:
+; AVX512VL: vpmaxub
 }
 
 define void @test32(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -977,6 +1072,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test32:
 ; AVX2: vpmaxub
+
+; AVX512VL-LABEL: test32:
+; AVX512VL: vpmaxub
 }
 
 define void @test33(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1003,6 +1101,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test33:
 ; AVX2: vpminsw
+
+; AVX512VL-LABEL: test33:
+; AVX512VL: vpminsw 
 }
 
 define void @test34(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1029,6 +1130,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test34:
 ; AVX2: vpminsw
+
+; AVX512VL-LABEL: test34:
+; AVX512VL: vpminsw
 }
 
 define void @test35(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1055,6 +1159,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test35:
 ; AVX2: vpmaxsw
+
+; AVX512VL-LABEL: test35:
+; AVX512VL: vpmaxsw
 }
 
 define void @test36(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1081,6 +1188,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test36:
 ; AVX2: vpmaxsw
+
+; AVX512VL-LABEL: test36:
+; AVX512VL: vpmaxsw
 }
 
 define void @test37(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1107,6 +1217,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test37:
 ; AVX2: vpminuw
+
+; AVX512VL-LABEL: test37:
+; AVX512VL: vpminuw
 }
 
 define void @test38(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1133,6 +1246,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test38:
 ; AVX2: vpminuw
+
+; AVX512VL-LABEL: test38:
+; AVX512VL: vpminuw
 }
 
 define void @test39(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1159,6 +1275,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test39:
 ; AVX2: vpmaxuw
+
+; AVX512VL-LABEL: test39:
+; AVX512VL: vpmaxuw
 }
 
 define void @test40(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1185,6 +1304,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test40:
 ; AVX2: vpmaxuw
+
+; AVX512VL-LABEL: test40:
+; AVX512VL: vpmaxuw
 }
 
 define void @test41(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -1211,6 +1333,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test41:
 ; AVX2: vpminsd
+
+; AVX512VL-LABEL: test41:
+; AVX512VL: vpminsd
 }
 
 define void @test42(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -1237,6 +1362,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test42:
 ; AVX2: vpminsd
+
+; AVX512VL-LABEL: test42:
+; AVX512VL: vpminsd
 }
 
 define void @test43(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -1263,6 +1391,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test43:
 ; AVX2: vpmaxsd
+
+; AVX512VL-LABEL: test43:
+; AVX512VL: vpmaxsd
 }
 
 define void @test44(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -1289,6 +1420,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test44:
 ; AVX2: vpmaxsd
+
+; AVX512VL-LABEL: test44:
+; AVX512VL: vpmaxsd
 }
 
 define void @test45(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -1315,6 +1449,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test45:
 ; AVX2: vpminud
+
+; AVX512VL-LABEL: test45:
+; AVX512VL: vpminud
 }
 
 define void @test46(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -1341,6 +1478,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test46:
 ; AVX2: vpminud
+
+; AVX512VL-LABEL: test46:
+; AVX512VL: vpminud
 }
 
 define void @test47(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -1367,6 +1507,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test47:
 ; AVX2: vpmaxud
+
+; AVX512VL-LABEL: test47:
+; AVX512VL: vpmaxud
 }
 
 define void @test48(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -1393,6 +1536,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test48:
 ; AVX2: vpmaxud
+
+; AVX512VL-LABEL: test48:
+; AVX512VL: vpmaxud
 }
 
 define void @test49(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -1425,6 +1571,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test49:
 ; AVX2: vpmaxsb
+
+; AVX512VL-LABEL: test49:
+; AVX512VL: vpmaxsb
 }
 
 define void @test50(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -1457,6 +1606,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test50:
 ; AVX2: vpmaxsb
+
+; AVX512VL-LABEL: test50:
+; AVX512VL: vpmaxsb
 }
 
 define void @test51(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -1489,6 +1641,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test51:
 ; AVX2: vpminsb
+
+; AVX512VL-LABEL: test51:
+; AVX512VL: vpminsb
 }
 
 define void @test52(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -1521,6 +1676,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test52:
 ; AVX2: vpminsb
+
+; AVX512VL-LABEL: test52:
+; AVX512VL: vpminsb
 }
 
 define void @test53(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -1553,6 +1711,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test53:
 ; AVX2: vpmaxub
+
+; AVX512VL-LABEL: test53:
+; AVX512VL: vpmaxub
 }
 
 define void @test54(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -1585,6 +1746,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test54:
 ; AVX2: vpmaxub
+
+; AVX512VL-LABEL: test54:
+; AVX512VL: vpmaxub
 }
 
 define void @test55(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -1617,6 +1781,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test55:
 ; AVX2: vpminub
+
+; AVX512VL-LABEL: test55:
+; AVX512VL: vpminub
 }
 
 define void @test56(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -1649,6 +1816,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test56:
 ; AVX2: vpminub
+
+; AVX512VL-LABEL: test56:
+; AVX512VL: vpminub
 }
 
 define void @test57(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1681,6 +1851,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test57:
 ; AVX2: vpmaxsw
+
+; AVX512VL-LABEL: test57:
+; AVX512VL: vpmaxsw
 }
 
 define void @test58(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1713,6 +1886,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test58:
 ; AVX2: vpmaxsw
+
+; AVX512VL-LABEL: test58:
+; AVX512VL: vpmaxsw
 }
 
 define void @test59(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1745,6 +1921,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test59:
 ; AVX2: vpminsw
+
+; AVX512VL-LABEL: test59:
+; AVX512VL: vpminsw
 }
 
 define void @test60(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1777,6 +1956,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test60:
 ; AVX2: vpminsw
+
+; AVX512VL-LABEL: test60:
+; AVX512VL: vpminsw
 }
 
 define void @test61(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1809,6 +1991,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test61:
 ; AVX2: vpmaxuw
+
+; AVX512VL-LABEL: test61:
+; AVX512VL: vpmaxuw
 }
 
 define void @test62(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1841,6 +2026,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test62:
 ; AVX2: vpmaxuw
+
+; AVX512VL-LABEL: test62:
+; AVX512VL: vpmaxuw
 }
 
 define void @test63(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1873,6 +2061,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test63:
 ; AVX2: vpminuw
+
+; AVX512VL-LABEL: test63:
+; AVX512VL: vpminuw
 }
 
 define void @test64(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -1905,6 +2096,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test64:
 ; AVX2: vpminuw
+
+; AVX512VL-LABEL: test64:
+; AVX512VL: vpminuw
 }
 
 define void @test65(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -1937,6 +2131,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test65:
 ; AVX2: vpmaxsd
+
+; AVX512VL-LABEL: test65:
+; AVX512VL: vpmaxsd
 }
 
 define void @test66(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -1969,6 +2166,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test66:
 ; AVX2: vpmaxsd
+
+; AVX512VL-LABEL: test66:
+; AVX512VL: vpmaxsd
 }
 
 define void @test67(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2001,6 +2201,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test67:
 ; AVX2: vpminsd
+
+; AVX512VL-LABEL: test67:
+; AVX512VL: vpminsd
 }
 
 define void @test68(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2033,6 +2236,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test68:
 ; AVX2: vpminsd
+
+; AVX512VL-LABEL: test68:
+; AVX512VL: vpminsd
 }
 
 define void @test69(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2065,6 +2271,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test69:
 ; AVX2: vpmaxud
+
+; AVX512VL-LABEL: test69:
+; AVX512VL: vpmaxud
 }
 
 define void @test70(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2097,6 +2306,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test70:
 ; AVX2: vpmaxud
+
+; AVX512VL-LABEL: test70:
+; AVX512VL: vpmaxud
 }
 
 define void @test71(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2129,6 +2341,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test71:
 ; AVX2: vpminud
+
+; AVX512VL-LABEL: test71:
+; AVX512VL: vpminud
 }
 
 define void @test72(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2161,6 +2376,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test72:
 ; AVX2: vpminud
+
+; AVX512VL-LABEL: test72:
+; AVX512VL: vpminud
 }
 
 define void @test73(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -2187,6 +2405,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test73:
 ; AVX2: vpmaxsb
+
+; AVX512VL-LABEL: test73:
+; AVX512VL: vpmaxsb
 }
 
 define void @test74(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -2213,6 +2434,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test74:
 ; AVX2: vpmaxsb
+
+; AVX512VL-LABEL: test74:
+; AVX512VL: vpmaxsb 
 }
 
 define void @test75(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -2239,6 +2463,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test75:
 ; AVX2: vpminsb
+
+; AVX512VL-LABEL: test75:
+; AVX512VL: vpminsb
 }
 
 define void @test76(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -2265,6 +2492,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test76:
 ; AVX2: vpminsb
+
+; AVX512VL-LABEL: test76:
+; AVX512VL: vpminsb
 }
 
 define void @test77(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -2291,6 +2521,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test77:
 ; AVX2: vpmaxub
+
+; AVX512VL-LABEL: test77:
+; AVX512VL: vpmaxub
 }
 
 define void @test78(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -2317,6 +2550,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test78:
 ; AVX2: vpmaxub
+
+; AVX512VL-LABEL: test78:
+; AVX512VL: vpmaxub
 }
 
 define void @test79(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -2343,6 +2579,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test79:
 ; AVX2: vpminub
+
+; AVX512VL-LABEL: test79:
+; AVX512VL: vpminub
 }
 
 define void @test80(i8* nocapture %a, i8* nocapture %b) nounwind {
@@ -2369,6 +2608,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test80:
 ; AVX2: vpminub
+
+; AVX512VL-LABEL: test80:
+; AVX512VL: vpminub
 }
 
 define void @test81(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -2395,6 +2637,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test81:
 ; AVX2: vpmaxsw
+
+; AVX512VL-LABEL: test81:
+; AVX512VL: vpmaxsw
 }
 
 define void @test82(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -2421,6 +2666,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test82:
 ; AVX2: vpmaxsw
+
+; AVX512VL-LABEL: test82:
+; AVX512VL: vpmaxsw
 }
 
 define void @test83(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -2447,6 +2695,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test83:
 ; AVX2: vpminsw
+
+; AVX512VL-LABEL: test83:
+; AVX512VL: vpminsw
 }
 
 define void @test84(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -2473,6 +2724,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test84:
 ; AVX2: vpminsw
+
+; AVX512VL-LABEL: test84:
+; AVX512VL: vpminsw
 }
 
 define void @test85(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -2499,6 +2753,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test85:
 ; AVX2: vpmaxuw
+
+; AVX512VL-LABEL: test85:
+; AVX512VL: vpmaxuw
 }
 
 define void @test86(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -2525,6 +2782,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test86:
 ; AVX2: vpmaxuw
+
+; AVX512VL-LABEL: test86:
+; AVX512VL: vpmaxuw
 }
 
 define void @test87(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -2551,6 +2811,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test87:
 ; AVX2: vpminuw
+
+; AVX512VL-LABEL: test87:
+; AVX512VL: vpminuw
 }
 
 define void @test88(i16* nocapture %a, i16* nocapture %b) nounwind {
@@ -2577,6 +2840,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test88:
 ; AVX2: vpminuw
+
+; AVX512VL-LABEL: test88:
+; AVX512VL: vpminuw
 }
 
 define void @test89(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2603,6 +2869,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test89:
 ; AVX2: vpmaxsd
+
+; AVX512VL-LABEL: test89:
+; AVX512VL: vpmaxsd
 }
 
 define void @test90(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2629,6 +2898,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test90:
 ; AVX2: vpmaxsd
+
+; AVX512VL-LABEL: test90:
+; AVX512VL: vpmaxsd
 }
 
 define void @test91(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2655,6 +2927,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test91:
 ; AVX2: vpminsd
+
+; AVX512VL-LABEL: test91:
+; AVX512VL: vpminsd
 }
 
 define void @test92(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2681,6 +2956,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test92:
 ; AVX2: vpminsd
+
+; AVX512VL-LABEL: test92:
+; AVX512VL: vpminsd
 }
 
 define void @test93(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2707,6 +2985,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test93:
 ; AVX2: vpmaxud
+
+; AVX512VL-LABEL: test93:
+; AVX512VL: vpmaxud
 }
 
 define void @test94(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2733,6 +3014,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test94:
 ; AVX2: vpmaxud
+
+; AVX512VL-LABEL: test94:
+; AVX512VL: vpmaxud
 }
 
 define void @test95(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2759,6 +3043,9 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test95:
 ; AVX2: vpminud
+
+; AVX512VL-LABEL: test95:
+; AVX512VL: vpminud
 }
 
 define void @test96(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2785,4 +3072,2507 @@ for.end:                                          ; preds = %vector.body
 
 ; AVX2-LABEL: test96:
 ; AVX2: vpminud
+
+; AVX512VL-LABEL: test96:
+; AVX512VL: vpminud
+}
+
+; ----------------------------
+
+define void @test97(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp slt <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test97:
+; AVX512BW: vpminsb {{.*}}
+}
+
+define void @test98(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp sle <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test98:
+; AVX512BW: vpminsb {{.*}}
+}
+
+define void @test99(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp sgt <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test99:
+; AVX512BW: vpmaxsb {{.*}}
+}
+
+define void @test100(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp sge <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test100:
+; AVX512BW: vpmaxsb {{.*}}
+}
+
+define void @test101(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp ult <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test101:
+; AVX512BW: vpminub {{.*}}
+}
+
+define void @test102(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp ule <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test102:
+; AVX512BW: vpminub {{.*}}
+}
+
+define void @test103(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp ugt <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test103:
+; AVX512BW: vpmaxub {{.*}}
+}
+
+define void @test104(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp uge <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test104:
+; AVX512BW: vpmaxub {{.*}}
+}
+
+define void @test105(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp slt <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test105:
+; AVX512BW: vpminsw {{.*}}
+}
+
+define void @test106(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp sle <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test106:
+; AVX512BW: vpminsw {{.*}}
+}
+
+define void @test107(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp sgt <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test107:
+; AVX512BW: vpmaxsw {{.*}}
+}
+
+define void @test108(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp sge <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test108:
+; AVX512BW: vpmaxsw {{.*}}
+}
+
+define void @test109(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp ult <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test109:
+; AVX512BW: vpminuw {{.*}}
+}
+
+define void @test110(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp ule <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test110:
+; AVX512BW: vpminuw {{.*}}
+}
+
+define void @test111(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp ugt <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test111:
+; AVX512BW: vpmaxuw {{.*}}
+}
+
+define void @test112(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp uge <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test112:
+; AVX512BW: vpmaxuw {{.*}}
+}
+
+define void @test113(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp slt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test113:
+; AVX512F: vpminsd {{.*}}
+}
+
+define void @test114(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sle <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test114:
+; AVX512F: vpminsd {{.*}}
+}
+
+define void @test115(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sgt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test115:
+; AVX512F: vpmaxsd {{.*}}
+}
+
+define void @test116(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sge <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test116:
+; AVX512F: vpmaxsd {{.*}}
+}
+
+define void @test117(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ult <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test117:
+; AVX512F: vpminud {{.*}}
+}
+
+define void @test118(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ule <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test118:
+; AVX512F: vpminud {{.*}}
+}
+
+define void @test119(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ugt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test119:
+; AVX512F: vpmaxud {{.*}}
+}
+
+define void @test120(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp uge <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test120:
+; AVX512F: vpmaxud {{.*}}
+}
+
+define void @test121(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test121:
+; AVX512F: vpminsq {{.*}}
+}
+
+define void @test122(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test122:
+; AVX512F: vpminsq {{.*}}
+}
+
+define void @test123(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test123:
+; AVX512F: vpmaxsq {{.*}}
+}
+
+define void @test124(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test124:
+; AVX512F: vpmaxsq {{.*}}
+}
+
+define void @test125(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test125:
+; AVX512F: vpminuq {{.*}}
+}
+
+define void @test126(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test126:
+; AVX512F: vpminuq {{.*}}
+}
+
+define void @test127(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test127:
+; AVX512F: vpmaxuq {{.*}}
+}
+
+define void @test128(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test128:
+; AVX512F: vpmaxuq {{.*}}
+}
+
+define void @test129(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp slt <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test129:
+; AVX512BW: vpmaxsb
+}
+
+define void @test130(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp sle <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test130:
+; AVX512BW: vpmaxsb
+}
+
+define void @test131(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp sgt <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test131:
+; AVX512BW: vpminsb
+}
+
+define void @test132(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp sge <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test132:
+; AVX512BW: vpminsb
+}
+
+define void @test133(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp ult <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test133:
+; AVX512BW: vpmaxub
+}
+
+define void @test134(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp ule <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test134:
+; AVX512BW: vpmaxub
+}
+
+define void @test135(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp ugt <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test135:
+; AVX512BW: vpminub
+}
+
+define void @test136(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp uge <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test136:
+; AVX512BW: vpminub
+}
+
+define void @test137(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp slt <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test137:
+; AVX512BW: vpmaxsw
+}
+
+define void @test138(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp sle <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test138:
+; AVX512BW: vpmaxsw
+}
+
+define void @test139(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp sgt <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test139:
+; AVX512BW: vpminsw
+}
+
+define void @test140(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp sge <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test140:
+; AVX512BW: vpminsw
+}
+
+define void @test141(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp ult <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test141:
+; AVX512BW: vpmaxuw
+}
+
+define void @test142(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp ule <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test142:
+; AVX512BW: vpmaxuw
+}
+
+define void @test143(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp ugt <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test143:
+; AVX512BW: vpminuw
+}
+
+define void @test144(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp uge <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test144:
+; AVX512BW: vpminuw
+}
+
+define void @test145(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp slt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test145:
+; AVX512F: vpmaxsd
+}
+
+define void @test146(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sle <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test146:
+; AVX512F: vpmaxsd
+}
+
+define void @test147(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sgt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test147:
+; AVX512F: vpminsd
+}
+
+define void @test148(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sge <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test148:
+; AVX512F: vpminsd
+}
+
+define void @test149(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ult <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test149:
+; AVX512F: vpmaxud
+}
+
+define void @test150(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ule <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test150:
+; AVX512F: vpmaxud
+}
+
+define void @test151(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ugt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test151:
+; AVX512F: vpminud
+}
+
+define void @test152(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp uge <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test152:
+; AVX512F: vpminud
+}
+
+; -----------------------
+
+define void @test153(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test153:
+; AVX512F: vpmaxsq
+}
+
+define void @test154(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test154:
+; AVX512F: vpmaxsq
+}
+
+define void @test155(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test155:
+; AVX512F: vpminsq
+}
+
+define void @test156(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test156:
+; AVX512F: vpminsq
+}
+
+define void @test157(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test157:
+; AVX512F: vpmaxuq
+}
+
+define void @test158(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test158:
+; AVX512F: vpmaxuq
+}
+
+define void @test159(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test159:
+; AVX512F: vpminuq
+}
+
+define void @test160(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test160:
+; AVX512F: vpminuq
+}
+
+define void @test161(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test161:
+; AVX512VL: vpminsq
+}
+
+define void @test162(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test162:
+; AVX512VL: vpminsq
+}
+
+define void @test163(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test163:
+; AVX512VL: vpmaxsq 
+}
+
+define void @test164(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test164:
+; AVX512VL: vpmaxsq
+}
+
+define void @test165(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test165:
+; AVX512VL: vpminuq 
+}
+
+define void @test166(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test166:
+; AVX512VL: vpminuq
+}
+
+define void @test167(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test167:
+; AVX512VL: vpmaxuq
+}
+
+define void @test168(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test168:
+; AVX512VL: vpmaxuq
+}
+
+define void @test169(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test169:
+; AVX512VL: vpmaxsq
+}
+
+define void @test170(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test170:
+; AVX512VL: vpmaxsq
+}
+
+define void @test171(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test171:
+; AVX512VL: vpminsq
+}
+
+define void @test172(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test172:
+; AVX512VL: vpminsq
+}
+
+define void @test173(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test173:
+; AVX512VL: vpmaxuq
+}
+
+define void @test174(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test174:
+; AVX512VL: vpmaxuq
+}
+
+define void @test175(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test175:
+; AVX512VL: vpminuq
+}
+
+define void @test176(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test176:
+; AVX512VL: vpminuq
+}
+
+define void @test177(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test177:
+; AVX512VL: vpminsq
+}
+
+define void @test178(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test178:
+; AVX512VL: vpminsq
+}
+
+define void @test179(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test179:
+; AVX512VL: vpmaxsq
+}
+
+define void @test180(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test180:
+; AVX512VL: vpmaxsq
+}
+
+define void @test181(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test181:
+; AVX512VL: vpminuq
+}
+
+define void @test182(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test182:
+; AVX512VL: vpminuq
+}
+
+define void @test183(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test183:
+; AVX512VL: vpmaxuq
+}
+
+define void @test184(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test184:
+; AVX512VL: vpmaxuq
+}
+
+define void @test185(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test185:
+; AVX512VL: vpmaxsq
+}
+
+define void @test186(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test186:
+; AVX512VL: vpmaxsq
+}
+
+define void @test187(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test187:
+; AVX512VL: vpminsq
+}
+
+define void @test188(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test188:
+; AVX512VL: vpminsq
+}
+
+define void @test189(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test189:
+; AVX512VL: vpmaxuq
+}
+
+define void @test190(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test190:
+; AVX512VL: vpmaxuq
+}
+
+define void @test191(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test191:
+; AVX512VL: vpminuq
+}
+
+define void @test192(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test192:
+; AVX512VL: vpminuq
 }
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
index 3bd1dc4..71620af 100644
--- a/test/CodeGen/X86/vselect.ll
+++ b/test/CodeGen/X86/vselect.ll
@@ -6,9 +6,8 @@
 define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
@@ -17,8 +16,8 @@ define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movsd %xmm0, %xmm1
-; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; CHECK-NEXT:    movapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
@@ -27,7 +26,7 @@ define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
@@ -53,10 +52,6 @@ define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
 define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [0,65535,0,65535,0,65535,0,65535]
-; CHECK-NEXT:    andps %xmm0, %xmm1
-; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %a
   ret <8 x i16> %1
@@ -65,9 +60,8 @@ define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; CHECK-NEXT:    movapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
@@ -76,9 +70,7 @@ define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test8:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
@@ -104,7 +96,7 @@ define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test11:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm2 = <0,65535,65535,0,u,65535,65535,u>
+; CHECK-NEXT:    movaps {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
 ; CHECK-NEXT:    andps %xmm2, %xmm0
 ; CHECK-NEXT:    andnps %xmm1, %xmm2
 ; CHECK-NEXT:    orps %xmm2, %xmm0
@@ -170,7 +162,7 @@ define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
 define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test18:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movss %xmm1, %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
@@ -179,7 +171,7 @@ define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
 define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test19:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movss %xmm1, %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %1
@@ -188,7 +180,7 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
 define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: test20:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
   ret <2 x double> %1
@@ -197,7 +189,7 @@ define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
 define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test21:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 false, i1 true>, <2 x i64> %a, <2 x i64> %b
   ret <2 x i64> %1
@@ -206,7 +198,7 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
 define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test22:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    movaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
@@ -216,7 +208,7 @@ define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
 define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test23:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    movaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
@@ -226,8 +218,8 @@ define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
 define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: test24:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movsd %xmm0, %xmm1
-; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; CHECK-NEXT:    movapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
   ret <2 x double> %1
@@ -236,8 +228,8 @@ define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
 define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test25:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movsd %xmm0, %xmm1
-; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; CHECK-NEXT:    movapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b
   ret <2 x i64> %1
@@ -276,6 +268,7 @@ define <16 x double> @select_illegal(<16 x double> %a, <16 x double> %b) {
 ; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
 ; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
 ; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    retq
   %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
   ret <16 x double> %sel
diff --git a/test/CodeGen/X86/vshift-4.ll b/test/CodeGen/X86/vshift-4.ll
index a060cf8..cda9bc8 100644
--- a/test/CodeGen/X86/vshift-4.ll
+++ b/test/CodeGen/X86/vshift-4.ll
@@ -57,7 +57,7 @@ entry:
 define void @shift3a(<8 x i16> %val, <8 x i16>* %dst, <8 x i16> %amt) nounwind {
 entry:
 ; CHECK-LABEL: shift3a:
-; CHECK: movzwl
+; CHECK: pextrw $6
 ; CHECK: psllw
   %shamt = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
   %shl = shl <8 x i16> %val, %shamt
diff --git a/test/CodeGen/X86/vshift-6.ll b/test/CodeGen/X86/vshift-6.ll
index f50d9a6..175b649 100644
--- a/test/CodeGen/X86/vshift-6.ll
+++ b/test/CodeGen/X86/vshift-6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86-64 -mattr=+sse2  | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse2  | FileCheck %s
 
 ; This test makes sure that the compiler does not crash with an
 ; assertion failure when trying to fold a vector shift left
diff --git a/test/CodeGen/X86/widen_conversions.ll b/test/CodeGen/X86/widen_conversions.ll
index 8e5174f..fa85400 100644
--- a/test/CodeGen/X86/widen_conversions.ll
+++ b/test/CodeGen/X86/widen_conversions.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=x86-64 -x86-experimental-vector-widening-legalization -x86-experimental-vector-shuffle-lowering | FileCheck %s
+; RUN: llc < %s -mcpu=x86-64 -x86-experimental-vector-widening-legalization | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
diff --git a/test/CodeGen/X86/widen_load-0.ll b/test/CodeGen/X86/widen_load-0.ll
index d543728..768a1be 100644
--- a/test/CodeGen/X86/widen_load-0.ll
+++ b/test/CodeGen/X86/widen_load-0.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o - -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -o - -mtriple=x86_64-linux | FileCheck %s
 ; PR4891
 
 ; Both loads should happen before either store.
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index c59cc58..6137424 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -9,8 +9,8 @@
 ; SSE: movaps  %xmm0, (%rsp)
 ; SSE: callq   killcommon
 
-; AVX: vmovaps    compl+128(%rip), %xmm0
-; AVX: vmovaps  %xmm0, (%rsp)
+; AVX: vmovdqa    compl+128(%rip), %xmm0
+; AVX: vmovdqa  %xmm0, (%rsp)
 ; AVX: callq   killcommon
 
 @compl = linkonce global [20 x i64] zeroinitializer, align 64 ; <[20 x i64]*> [#uses=1]
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 0ec3574..c6bd964 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -76,10 +76,9 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
 ; CHECK:         pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
 ; CHECK-NEXT:    pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
 ; CHECK-NEXT:    paddd    %[[R0]], %[[R1]]
-; CHECK-NEXT:    movdqa   %[[R1]], %[[R0]]
-; CHECK-NEXT:    pshufb   {{.*}}, %[[R0]]
-; CHECK-NEXT:    pmovzxdq %[[R0]], %[[R0]]
 ; CHECK-NEXT:    pextrw   $4, %[[R1]], 4(%{{.*}})
+; CHECK-NEXT:    pshufb   {{.*}}, %[[R1]]
+; CHECK-NEXT:    pmovzxdq %[[R1]], %[[R0]]
 ; CHECK-NEXT:    movd     %[[R0]], (%{{.*}})
 	%a = load %i16vec3* %ap, align 16
 	%b = load %i16vec3* %bp, align 16
@@ -144,10 +143,9 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no
 ; CHECK:         pmovzxbd (%{{.*}}), %[[R0:xmm[0-9]+]]
 ; CHECK-NEXT:    pmovzxbd (%{{.*}}), %[[R1:xmm[0-9]+]]
 ; CHECK-NEXT:    paddd    %[[R0]], %[[R1]]
-; CHECK-NEXT:    movdqa   %[[R1]], %[[R0]]
-; CHECK-NEXT:    pshufb   {{.*}}, %[[R0]]
-; CHECK-NEXT:    pmovzxwq %[[R0]], %[[R0]]
 ; CHECK-NEXT:    pextrb   $8, %[[R1]], 2(%{{.*}})
+; CHECK-NEXT:    pshufb   {{.*}}, %[[R1]]
+; CHECK-NEXT:    pmovzxwq %[[R1]], %[[R0]]
 ; CHECK-NEXT:    movd     %[[R0]], %e[[R2:[abcd]]]x
 ; CHECK-NEXT:    movw     %[[R2]]x, (%{{.*}})
 	%a = load %i8vec3* %ap, align 16
@@ -193,8 +191,9 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa
 ; CHECK-NEXT:    movd    %[[CONSTANT1]], %e[[R1:[abcd]]]x
 ; CHECK-NEXT:    movw    %[[R1]]x, (%[[PTR1:.*]])
 ; CHECK-NEXT:    movb    $1, 2(%[[PTR1]])
-; CHECK-NEXT:    pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
-; CHECK-NEXT:    pand    {{.*}}, %[[X0]]
+; CHECK-NEXT:    movl    (%[[PTR0]]), [[TMP1:%e[abcd]+x]]
+; CHECK-NEXT:    movl    [[TMP1]], [[TMP2:.*]]
+; CHECK-NEXT:    pmovzxbd [[TMP2]], %[[X0:xmm[0-9]+]]
 ; CHECK-NEXT:    pextrd  $1, %[[X0]], %e[[R0:[abcd]]]x
 ; CHECK-NEXT:    shrl    %e[[R0]]x
 ; CHECK-NEXT:    movd    %[[X0]], %e[[R1:[abcd]]]x
@@ -206,10 +205,9 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa
 ; CHECK-NEXT:    pinsrd  $2, %e[[R0]]x, %[[X1]]
 ; CHECK-NEXT:    pextrd  $3, %[[X0]], %e[[R0:[abcd]]]x
 ; CHECK-NEXT:    pinsrd  $3, %e[[R0]]x, %[[X1]]
-; CHECK-NEXT:    movdqa  %[[X1]], %[[X2:xmm[0-9]+]]
-; CHECK-NEXT:    pshufb  %[[SHUFFLE_MASK]], %[[X2]]
-; CHECK-NEXT:    pmovzxwq %[[X2]], %[[X3:xmm[0-9]+]]
 ; CHECK-NEXT:    pextrb  $8, %[[X1]], 2(%{{.*}})
+; CHECK-NEXT:    pshufb  %[[SHUFFLE_MASK]], %[[X1]]
+; CHECK-NEXT:    pmovzxwq %[[X1]], %[[X3:xmm[0-9]+]]
 ; CHECK-NEXT:    movd    %[[X3]], %e[[R0:[abcd]]]x
 ; CHECK-NEXT:    movw    %[[R0]]x, (%{{.*}})
 
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index 70fdbb7..2aa870f 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -82,8 +82,8 @@ define void @shuf5(<8 x i8>* %p) nounwind {
 ; CHECK-LABEL: shuf5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = <4,33,u,u,u,u,u,u>
-; CHECK-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
+; CHECK-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; CHECK-NEXT:    movlpd %xmm0, (%eax)
 ; CHECK-NEXT:    retl
   %v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/win64_alloca_dynalloca.ll b/test/CodeGen/X86/win64_alloca_dynalloca.ll
index a6b6536..abda227 100644
--- a/test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ b/test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
 ; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
+; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32 -code-model=large | FileCheck %s -check-prefix=L64
 ; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
 ; PR8777
 ; PR8778
@@ -13,19 +14,24 @@ entry:
   %buf0 = alloca i8, i64 4096, align 1
 
 ; ___chkstk_ms does not adjust %rsp.
-; M64: movq  %rsp, %rbp
-; M64:       $4096, %rax
+; M64:       $4096, %eax
 ; M64: callq ___chkstk_ms
 ; M64: subq  %rax, %rsp
+; M64: leaq 128(%rsp), %rbp
 
 ; __chkstk does not adjust %rsp.
-; W64: movq  %rsp, %rbp
-; W64:       $4096, %rax
+; W64:       $4096, %eax
 ; W64: callq __chkstk
 ; W64: subq  %rax, %rsp
+; W64: leaq 128(%rsp), %rbp
+
+; Use %r11 for the large model.
+; L64:       $4096, %eax
+; L64: movabsq $__chkstk, %r11
+; L64: callq *%r11
+; L64: subq  %rax, %rsp
 
 ; Freestanding
-; EFI: movq  %rsp, %rbp
 ; EFI:       $[[B0OFS:4096|4104]], %rsp
 ; EFI-NOT:   call
 
@@ -33,8 +39,8 @@ entry:
 
 ; M64: leaq  15(%{{.*}}), %rax
 ; M64: andq  $-16, %rax
-; M64: callq ___chkstk
-; M64-NOT:   %rsp
+; M64: callq ___chkstk_ms
+; M64: subq  %rax, %rsp
 ; M64: movq  %rsp, %rax
 
 ; W64: leaq  15(%{{.*}}), %rax
@@ -43,6 +49,13 @@ entry:
 ; W64: subq  %rax, %rsp
 ; W64: movq  %rsp, %rax
 
+; L64: leaq  15(%{{.*}}), %rax
+; L64: andq  $-16, %rax
+; L64: movabsq $__chkstk, %r11
+; L64: callq *%r11
+; L64: subq  %rax, %rsp
+; L64: movq  %rsp, %rax
+
 ; EFI: leaq  15(%{{.*}}), [[R1:%r.*]]
 ; EFI: andq  $-16, [[R1]]
 ; EFI: movq  %rsp, [[R64:%r.*]]
@@ -53,12 +66,12 @@ entry:
 
 ; M64: subq  $48, %rsp
 ; M64: movq  %rax, 32(%rsp)
-; M64: leaq  -4096(%rbp), %r9
+; M64: leaq  -128(%rbp), %r9
 ; M64: callq bar
 
 ; W64: subq  $48, %rsp
 ; W64: movq  %rax, 32(%rsp)
-; W64: leaq  -4096(%rbp), %r9
+; W64: leaq  -128(%rbp), %r9
 ; W64: callq bar
 
 ; EFI: subq  $48, %rsp
@@ -68,9 +81,9 @@ entry:
 
   ret i64 %r
 
-; M64: movq    %rbp, %rsp
+; M64: leaq    3968(%rbp), %rsp
 
-; W64: movq    %rbp, %rsp
+; W64: leaq    3968(%rbp), %rsp
 
 }
 
@@ -84,7 +97,8 @@ entry:
 
 ; M64: leaq  15(%{{.*}}), %rax
 ; M64: andq  $-16, %rax
-; M64: callq ___chkstk
+; M64: callq ___chkstk_ms
+; M64: subq  %rax, %rsp
 ; M64: movq  %rsp, [[R2:%r.*]]
 ; M64: andq  $-128, [[R2]]
 ; M64: movq  [[R2]], %rsp
diff --git a/test/CodeGen/X86/win64_call_epi.ll b/test/CodeGen/X86/win64_call_epi.ll
index bc73ad4..71c44b0 100644
--- a/test/CodeGen/X86/win64_call_epi.ll
+++ b/test/CodeGen/X86/win64_call_epi.ll
@@ -44,7 +44,7 @@ b:
 done:
     ret void
 }
-!0 = metadata !{metadata !"branch_weights", i32 100, i32 0}
+!0 = !{!"branch_weights", i32 100, i32 0}
 ; WIN64-LABEL: foo2:
 ; WIN64: callq bar
 ; WIN64: nop
diff --git a/test/CodeGen/X86/win64_eh.ll b/test/CodeGen/X86/win64_eh.ll
index f1f874e..b67ad58 100644
--- a/test/CodeGen/X86/win64_eh.ll
+++ b/test/CodeGen/X86/win64_eh.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -O0 -mcpu=corei7 -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN64
-; RUN: llc < %s -O0 -mcpu=corei7 -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -O0 -mattr=sse2 -mtriple=x86_64-pc-windows-itanium | FileCheck %s -check-prefix=WIN64 -check-prefix=NORM
+; RUN: llc < %s -O0 -mattr=sse2 -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=WIN64 -check-prefix=NORM
+; RUN: llc < %s -O0 -mattr=sse2 -mtriple=x86_64-pc-mingw32 -mcpu=atom | FileCheck %s -check-prefix=WIN64 -check-prefix=ATOM
 
 ; Check function without prolog
 define void @foo0() uwtable {
@@ -20,7 +21,8 @@ entry:
 }
 ; WIN64-LABEL: foo1:
 ; WIN64: .seh_proc foo1
-; WIN64: subq $4000, %rsp
+; NORM:  subq $4000, %rsp
+; ATOM:  leaq -4000(%rsp), %rsp
 ; WIN64: .seh_stackalloc 4000
 ; WIN64: .seh_endprologue
 ; WIN64: addq $4000, %rsp
@@ -35,7 +37,7 @@ entry:
 }
 ; WIN64-LABEL: foo2:
 ; WIN64: .seh_proc foo2
-; WIN64: movabsq $8000, %rax
+; WIN64: movl $8000, %eax
 ; WIN64: callq {{__chkstk|___chkstk_ms}}
 ; WIN64: subq %rax, %rsp
 ; WIN64: .seh_stackalloc 8000
@@ -83,7 +85,8 @@ entry:
 ; WIN64: .seh_proc foo3
 ; WIN64: pushq %rsi
 ; WIN64: .seh_pushreg 6
-; WIN64: subq $24, %rsp
+; NORM:  subq $24, %rsp
+; ATOM:  leaq -24(%rsp), %rsp
 ; WIN64: .seh_stackalloc 24
 ; WIN64: .seh_endprologue
 ; WIN64: addq $24, %rsp
@@ -126,7 +129,8 @@ endtryfinally:
 ; WIN64-LABEL: foo4:
 ; WIN64: .seh_proc foo4
 ; WIN64: .seh_handler _d_eh_personality, @unwind, @except
-; WIN64: subq $56, %rsp
+; NORM:  subq $56, %rsp
+; ATOM:  leaq -56(%rsp), %rsp
 ; WIN64: .seh_stackalloc 56
 ; WIN64: .seh_endprologue
 ; WIN64: addq $56, %rsp
@@ -146,23 +150,24 @@ entry:
 ; WIN64: .seh_proc foo5
 ; WIN64: pushq %rbp
 ; WIN64: .seh_pushreg 5
-; WIN64: movq  %rsp, %rbp
 ; WIN64: pushq %rdi
 ; WIN64: .seh_pushreg 7
 ; WIN64: pushq %rbx
 ; WIN64: .seh_pushreg 3
-; WIN64: andq  $-64, %rsp
-; WIN64: subq  $128, %rsp
-; WIN64: .seh_stackalloc 48
-; WIN64: .seh_setframe 5, 64
-; WIN64: movaps  %xmm7, -32(%rbp)        # 16-byte Spill
-; WIN64: movaps  %xmm6, -48(%rbp)        # 16-byte Spill
-; WIN64: .seh_savexmm 6, 16
-; WIN64: .seh_savexmm 7, 32
+; NORM:  subq  $96, %rsp
+; ATOM:  leaq -96(%rsp), %rsp
+; WIN64: .seh_stackalloc 96
+; WIN64: leaq  96(%rsp), %rbp
+; WIN64: .seh_setframe 5, 96
+; WIN64: movaps  %xmm7, -16(%rbp)        # 16-byte Spill
+; WIN64: .seh_savexmm 7, 80
+; WIN64: movaps  %xmm6, -32(%rbp)        # 16-byte Spill
+; WIN64: .seh_savexmm 6, 64
 ; WIN64: .seh_endprologue
-; WIN64: movaps  -48(%rbp), %xmm6        # 16-byte Reload
-; WIN64: movaps  -32(%rbp), %xmm7        # 16-byte Reload
-; WIN64: leaq  -16(%rbp), %rsp
+; WIN64: andq  $-64, %rsp
+; WIN64: movaps  -32(%rbp), %xmm6        # 16-byte Reload
+; WIN64: movaps  -16(%rbp), %xmm7        # 16-byte Reload
+; WIN64: movq  %rbp, %rsp
 ; WIN64: popq  %rbx
 ; WIN64: popq  %rdi
 ; WIN64: popq  %rbp
diff --git a/test/CodeGen/X86/win64_frame.ll b/test/CodeGen/X86/win64_frame.ll
new file mode 100644
index 0000000..ddba716
--- /dev/null
+++ b/test/CodeGen/X86/win64_frame.ll
@@ -0,0 +1,122 @@
+; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s
+
+define i32 @f1(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5) "no-frame-pointer-elim"="true" {
+  ; CHECK-LABEL: f1:
+  ; CHECK:       movl    48(%rbp), %eax
+  ret i32 %p5
+}
+
+define void @f2(i32 %p, ...) "no-frame-pointer-elim"="true" {
+  ; CHECK-LABEL: f2:
+  ; CHECK:      .seh_stackalloc 8
+  ; CHECK:      movq    %rsp, %rbp
+  ; CHECK:      .seh_setframe 5, 0
+  ; CHECK:      movq    %rdx, 32(%rbp)
+  ; CHECK:      leaq    32(%rbp), %rax
+  %ap = alloca i8, align 8
+  call void @llvm.va_start(i8* %ap)
+  ret void
+}
+
+define i8* @f3() "no-frame-pointer-elim"="true" {
+  ; CHECK-LABEL: f3:
+  ; CHECK:      movq    %rsp, %rbp
+  ; CHECK:      .seh_setframe 5, 0
+  ; CHECK:      movq    8(%rbp), %rax
+  %ra = call i8* @llvm.returnaddress(i32 0)
+  ret i8* %ra
+}
+
+define i8* @f4() "no-frame-pointer-elim"="true" {
+  ; CHECK-LABEL: f4:
+  ; CHECK:      pushq   %rbp
+  ; CHECK:      .seh_pushreg 5
+  ; CHECK:      subq    $304, %rsp
+  ; CHECK:      .seh_stackalloc 304
+  ; CHECK:      leaq    128(%rsp), %rbp
+  ; CHECK:      .seh_setframe 5, 128
+  ; CHECK:      .seh_endprologue
+  ; CHECK:      movq    184(%rbp), %rax
+  alloca [300 x i8]
+  %ra = call i8* @llvm.returnaddress(i32 0)
+  ret i8* %ra
+}
+
+declare void @external(i8*)
+
+define void @f5() "no-frame-pointer-elim"="true" {
+  ; CHECK-LABEL: f5:
+  ; CHECK:      subq    $336, %rsp
+  ; CHECK:      .seh_stackalloc 336
+  ; CHECK:      leaq    128(%rsp), %rbp
+  ; CHECK:      .seh_setframe 5, 128
+  ; CHECK:      leaq    -92(%rbp), %rcx
+  ; CHECK:      callq   external
+  %a = alloca [300 x i8]
+  %gep = getelementptr [300 x i8]* %a, i32 0, i32 0
+  call void @external(i8* %gep)
+  ret void
+}
+
+define void @f6(i32 %p, ...) "no-frame-pointer-elim"="true" {
+  ; CHECK-LABEL: f6:
+  ; CHECK:      subq    $336, %rsp
+  ; CHECK:      .seh_stackalloc 336
+  ; CHECK:      leaq    128(%rsp), %rbp
+  ; CHECK:      .seh_setframe 5, 128
+  ; CHECK:      leaq    -92(%rbp), %rcx
+  ; CHECK:      callq   external
+  %a = alloca [300 x i8]
+  %gep = getelementptr [300 x i8]* %a, i32 0, i32 0
+  call void @external(i8* %gep)
+  ret void
+}
+
+define i32 @f7(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) "no-frame-pointer-elim"="true" {
+  ; CHECK-LABEL: f7:
+  ; CHECK:      pushq   %rbp
+  ; CHECK:      .seh_pushreg 5
+  ; CHECK:      subq    $304, %rsp
+  ; CHECK:      .seh_stackalloc 304
+  ; CHECK:      leaq    128(%rsp), %rbp
+  ; CHECK:      .seh_setframe 5, 128
+  ; CHECK:      andq    $-64, %rsp
+  ; CHECK:      movl    224(%rbp), %eax
+  ; CHECK:      leaq    176(%rbp), %rsp
+  alloca [300 x i8], align 64
+  ret i32 %e
+}
+
+define i32 @f8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) "no-frame-pointer-elim"="true" {
+  ; CHECK-LABEL: f8:
+  ; CHECK:        subq    $352, %rsp
+  ; CHECK:        .seh_stackalloc 352
+  ; CHECK:        leaq    128(%rsp), %rbp
+  ; CHECK:        .seh_setframe 5, 128
+
+  %alloca = alloca [300 x i8], align 64
+  ; CHECK:        andq    $-64, %rsp
+  ; CHECK:        movq    %rsp, %rbx
+
+  alloca i32, i32 %a
+  ; CHECK:        movl    %ecx, %eax
+  ; CHECK:        leaq    15(,%rax,4), %rax
+  ; CHECK:        andq    $-16, %rax
+  ; CHECK:        callq   __chkstk
+  ; CHECK:        subq    %rax, %rsp
+
+  %gep = getelementptr [300 x i8]* %alloca, i32 0, i32 0
+  call void @external(i8* %gep)
+  ; CHECK:        subq    $32, %rsp
+  ; CHECK:        leaq    (%rbx), %rcx
+  ; CHECK:        callq   external
+  ; CHECK:        addq    $32, %rsp
+
+  ret i32 %e
+  ; CHECK:        movl    %esi, %eax
+  ; CHECK:        leaq    224(%rbp), %rsp
+}
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
+
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/X86/win_chkstk.ll b/test/CodeGen/X86/win_chkstk.ll
index 0c02c1a..4edc89f 100644
--- a/test/CodeGen/X86/win_chkstk.ll
+++ b/test/CodeGen/X86/win_chkstk.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN_X32
 ; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN_X64
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -code-model=large | FileCheck %s -check-prefix=WIN64_LARGE
 ; RUN: llc < %s -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X32
 ; RUN: llc < %s -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X64
 ; RUN: llc < %s -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
@@ -16,6 +17,8 @@ define i32 @main4k() nounwind {
 entry:
 ; WIN_X32:    calll __chkstk
 ; WIN_X64:    callq __chkstk
+; WIN64_LARGE: movabsq $__chkstk, %r11
+; WIN64_LARGE: callq *%r11
 ; MINGW_X32:  calll __alloca
 ; MINGW_X64:  callq ___chkstk_ms
 ; LINUX-NOT:  call __chkstk
@@ -52,6 +55,8 @@ define x86_64_win64cc i32 @main4k_win64() nounwind {
 entry:
 ; WIN_X32:    calll __chkstk
 ; WIN_X64:    callq __chkstk
+; WIN64_LARGE: movabsq $__chkstk, %r11
+; WIN64_LARGE: callq *%r11
 ; MINGW_X32:  calll __alloca
 ; MINGW_X64:  callq ___chkstk_ms
 ; LINUX-NOT:  call __chkstk
diff --git a/test/CodeGen/X86/win_cst_pool.ll b/test/CodeGen/X86/win_cst_pool.ll
index e8b853a..199557d 100644
--- a/test/CodeGen/X86/win_cst_pool.ll
+++ b/test/CodeGen/X86/win_cst_pool.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=sse2 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-windows-msvc"
 
@@ -6,7 +6,7 @@ define double @double() {
   ret double 0x0000000000800000
 }
 ; CHECK:              .globl  __real@0000000000800000
-; CHECK-NEXT:         .section        .rdata,"rd",discard,__real@0000000000800000
+; CHECK-NEXT:         .section        .rdata,"dr",discard,__real@0000000000800000
 ; CHECK-NEXT:         .align  8
 ; CHECK-NEXT: __real@0000000000800000:
 ; CHECK-NEXT:         .quad   8388608
@@ -18,7 +18,7 @@ define <4 x i32> @vec1() {
   ret <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 }
 ; CHECK:              .globl  __xmm@00000000000000010000000200000003
-; CHECK-NEXT:         .section        .rdata,"rd",discard,__xmm@00000000000000010000000200000003
+; CHECK-NEXT:         .section        .rdata,"dr",discard,__xmm@00000000000000010000000200000003
 ; CHECK-NEXT:         .align  16
 ; CHECK-NEXT: __xmm@00000000000000010000000200000003:
 ; CHECK-NEXT:         .long   3
@@ -33,7 +33,7 @@ define <8 x i16> @vec2() {
   ret <8 x i16> <i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>
 }
 ; CHECK:             .globl  __xmm@00000001000200030004000500060007
-; CHECK-NEXT:        .section        .rdata,"rd",discard,__xmm@00000001000200030004000500060007
+; CHECK-NEXT:        .section        .rdata,"dr",discard,__xmm@00000001000200030004000500060007
 ; CHECK-NEXT:        .align  16
 ; CHECK-NEXT: __xmm@00000001000200030004000500060007:
 ; CHECK-NEXT:        .short  7
@@ -53,7 +53,7 @@ define <4 x float> @undef1() {
   ret <4 x float> <float 1.0, float 1.0, float undef, float undef>
 
 ; CHECK:             .globl  __xmm@00000000000000003f8000003f800000
-; CHECK-NEXT:        .section        .rdata,"rd",discard,__xmm@00000000000000003f8000003f800000
+; CHECK-NEXT:        .section        .rdata,"dr",discard,__xmm@00000000000000003f8000003f800000
 ; CHECK-NEXT:        .align  16
 ; CHECK-NEXT: __xmm@00000000000000003f8000003f800000:
 ; CHECK-NEXT:        .long   1065353216              # float 1
diff --git a/test/CodeGen/X86/win_eh_prepare.ll b/test/CodeGen/X86/win_eh_prepare.ll
new file mode 100644
index 0000000..f96fed5
--- /dev/null
+++ b/test/CodeGen/X86/win_eh_prepare.ll
@@ -0,0 +1,80 @@
+; RUN: opt -S -winehprepare -mtriple x86_64-pc-windows-msvc < %s | FileCheck %s
+
+; FIXME: Add and test outlining here.
+
+declare void @maybe_throw()
+
+@_ZTIi = external constant i8*
+@g = external global i32
+
+declare i32 @__C_specific_handler(...)
+declare i32 @__gxx_personality_seh0(...)
+declare i32 @llvm.eh.typeid.for(i8*) readnone nounwind
+
+define i32 @use_seh() {
+entry:
+  invoke void @maybe_throw()
+      to label %cont unwind label %lpad
+
+cont:
+  ret i32 0
+
+lpad:
+  %ehvals = landingpad { i8*, i32 } personality i32 (...)* @__C_specific_handler
+      cleanup
+      catch i8* bitcast (i32 (i8*, i8*)* @filt_g to i8*)
+  %ehsel = extractvalue { i8*, i32 } %ehvals, 1
+  %filt_g_sel = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 (i8*, i8*)* @filt_g to i8*))
+  %matches = icmp eq i32 %ehsel, %filt_g_sel
+  br i1 %matches, label %ret1, label %eh.resume
+
+ret1:
+  ret i32 1
+
+eh.resume:
+  resume { i8*, i32 } %ehvals
+}
+
+define internal i32 @filt_g(i8*, i8*) {
+  %g = load i32* @g
+  ret i32 %g
+}
+
+; CHECK-LABEL: define i32 @use_seh()
+; CHECK: invoke void @maybe_throw()
+; CHECK-NEXT: to label %cont unwind label %lpad
+; CHECK: eh.resume:
+; CHECK-NEXT: unreachable
+
+
+; A MinGW64-ish EH style. It could happen if a binary uses both MSVC CRT and
+; mingw CRT and is linked with LTO.
+define i32 @use_gcc() {
+entry:
+  invoke void @maybe_throw()
+      to label %cont unwind label %lpad
+
+cont:
+  ret i32 0
+
+lpad:
+  %ehvals = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_seh0
+      cleanup
+      catch i8* bitcast (i8** @_ZTIi to i8*)
+  %ehsel = extractvalue { i8*, i32 } %ehvals, 1
+  %filt_g_sel = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 (i8*, i8*)* @filt_g to i8*))
+  %matches = icmp eq i32 %ehsel, %filt_g_sel
+  br i1 %matches, label %ret1, label %eh.resume
+
+ret1:
+  ret i32 1
+
+eh.resume:
+  resume { i8*, i32 } %ehvals
+}
+
+; CHECK-LABEL: define i32 @use_gcc()
+; CHECK: invoke void @maybe_throw()
+; CHECK-NEXT: to label %cont unwind label %lpad
+; CHECK: eh.resume:
+; CHECK: call void @_Unwind_Resume(i8* %exn.obj)
diff --git a/test/CodeGen/X86/x32-lea-1.ll b/test/CodeGen/X86/x32-lea-1.ll
new file mode 100644
index 0000000..7ccb34d
--- /dev/null
+++ b/test/CodeGen/X86/x32-lea-1.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -O0 | FileCheck %s
+; CHECK: leal {{[-0-9]*}}(%r{{s|b}}p),
+; CHECK-NOT: leal {{[-0-9]*}}(%e{{s|b}}p),
+
+define void @foo(i32** %p) {
+  %a = alloca i32, i32 10
+  %addr = getelementptr i32* %a, i32 4
+  store i32* %addr, i32** %p
+  ret void
+}
diff --git a/test/CodeGen/X86/x86-64-and-mask.ll b/test/CodeGen/X86/x86-64-and-mask.ll
index bc6c612..c8a832a 100644
--- a/test/CodeGen/X86/x86-64-and-mask.ll
+++ b/test/CodeGen/X86/x86-64-and-mask.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-apple-darwin8"
diff --git a/test/CodeGen/X86/x86-64-baseptr.ll b/test/CodeGen/X86/x86-64-baseptr.ll
new file mode 100644
index 0000000..7fd94fa
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-baseptr.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=x86_64-pc-linux -force-align-stack -stack-alignment=32 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnux32 -force-align-stack -stack-alignment=32 < %s | FileCheck -check-prefix=X32ABI %s
+; This should run with NaCl as well ( -mtriple=x86_64-pc-nacl ) but currently doesn't due to PR22655
+
+; Make sure the correct register gets set up as the base pointer
+; This should be rbx for x64 and 64-bit NaCl and ebx for x32
+; CHECK-LABEL: base
+; CHECK: subq $32, %rsp
+; CHECK: movq %rsp, %rbx
+; X32ABI-LABEL: base
+; X32ABI: subl $32, %esp
+; X32ABI: movl %esp, %ebx
+; NACL-LABEL: base
+; NACL: subq $32, %rsp
+; NACL: movq %rsp, %rbx
+
+declare i32 @helper() nounwind
+define void @base() #0 {
+entry:
+  %k = call i32 @helper()
+  %a = alloca i32, i32 %k, align 4
+  store i32 0, i32* %a, align 4
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"}
diff --git a/test/CodeGen/X86/x86-64-psub.ll b/test/CodeGen/X86/x86-64-psub.ll
index 183ddf4..2e39c14 100644
--- a/test/CodeGen/X86/x86-64-psub.ll
+++ b/test/CodeGen/X86/x86-64-psub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux -mattr=mmx < %s | FileCheck %s
 
 ; MMX packed sub opcodes were wrongly marked as commutative.
 ; This test checks that the operands of packed sub instructions are
diff --git a/test/CodeGen/X86/x86-inline-asm-validation.ll b/test/CodeGen/X86/x86-inline-asm-validation.ll
new file mode 100644
index 0000000..56bdc48
--- /dev/null
+++ b/test/CodeGen/X86/x86-inline-asm-validation.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple i686-gnu -filetype asm -o - %s 2>&1 | FileCheck %s
+
+define void @test_L_ff() {
+entry:
+  call void asm "", "L,~{dirflag},~{fpsr},~{flags}"(i32 255)
+  ret void
+}
+
+; CHECK-NOT: error: invalid operand for inline asm constraint 'L'
+
+define void @test_L_ffff() {
+entry:
+  call void asm "", "L,~{dirflag},~{fpsr},~{flags}"(i32 65535)
+  ret void
+}
+
+; CHECK-NOT: error: invalid operand for inline asm constraint 'L'
+
+define void @test_M_1() {
+entry:
+  call void asm "", "M,~{dirflag},~{fpsr},~{flags}"(i32 1)
+  ret void
+}
+
+; CHECK-NOT: error: invalid operand for inline asm constraint 'M'
+
+define void @test_O_64() {
+entry:
+  call void asm "", "O,~{dirflag},~{fpsr},~{flags}"(i32 64)
+  ret void
+}
+
+; CHECK-NOT: error: invalid operand for inline asm constraint 'O'
+
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
index ec47933..a10134e 100644
--- a/test/CodeGen/X86/x86-shifts.ll
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=sse2 | FileCheck %s
 
 ; Splat patterns below
 
diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll
index 54a4d6aa..668628c 100644
--- a/test/CodeGen/X86/xaluo.ll
+++ b/test/CodeGen/X86/xaluo.ll
@@ -755,4 +755,4 @@ declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) nounwind readnone
 declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
 declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
 
-!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
+!0 = !{!"branch_weights", i32 0, i32 2147483647}
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
index 8af782c..e154e4a 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -92,13 +92,13 @@ define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <
 declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK:vpcomb
+  ; CHECK:vpcomeqb
   %res = call <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 define <16 x i8> @test_int_x86_xop_vpcomeqb_mem(<16 x i8> %a0, <16 x i8>* %a1) {
   ; CHECK-NOT: vmovaps
-  ; CHECK:vpcomb
+  ; CHECK:vpcomeqb
   %vec = load <16 x i8>* %a1
   %res = call <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8> %a0, <16 x i8> %vec) ;
   ret <16 x i8> %res
@@ -106,441 +106,441 @@ define <16 x i8> @test_int_x86_xop_vpcomeqb_mem(<16 x i8> %a0, <16 x i8>* %a1) {
 declare <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomw
+  ; CHECK: vpcomeqw
   %res = call <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomd
+  ; CHECK: vpcomeqd
   %res = call <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomq
+  ; CHECK: vpcomeqq
   %res = call <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomequb(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomub
+  ; CHECK: vpcomequb
   %res = call <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomequd(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomud
+  ; CHECK: vpcomequd
   %res = call <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomequq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomuq
+  ; CHECK: vpcomequq
   %res = call <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomequw(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomuw
+  ; CHECK: vpcomequw
   %res = call <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomb
+  ; CHECK: vpcomfalseb
   %res = call <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomd
+  ; CHECK: vpcomfalsed
   %res = call <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomq
+  ; CHECK: vpcomfalseq
   %res = call <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomub
+  ; CHECK: vpcomfalseub
   %res = call <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomud
+  ; CHECK: vpcomfalseud
   %res = call <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomuq
+  ; CHECK: vpcomfalseuq
   %res = call <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomuw
+  ; CHECK: vpcomfalseuw
   %res = call <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomw
+  ; CHECK: vpcomfalsew
   %res = call <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomb
+  ; CHECK: vpcomgeb
   %res = call <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomged(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomd
+  ; CHECK: vpcomged
   %res = call <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomq
+  ; CHECK: vpcomgeq
   %res = call <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomub
+  ; CHECK: vpcomgeub
   %res = call <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomud
+  ; CHECK: vpcomgeud
   %res = call <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomuq
+  ; CHECK: vpcomgeuq
   %res = call <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomuw
+  ; CHECK: vpcomgeuw
   %res = call <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomgew(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomw
+  ; CHECK: vpcomgew
   %res = call <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomb
+  ; CHECK: vpcomgtb
   %res = call <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomd
+  ; CHECK: vpcomgtd
   %res = call <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomq
+  ; CHECK: vpcomgtq
   %res = call <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomub
+  ; CHECK: vpcomgtub
   %res = call <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomud
+  ; CHECK: vpcomgtud
   %res = call <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomuq
+  ; CHECK: vpcomgtuq
   %res = call <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomuw
+  ; CHECK: vpcomgtuw
   %res = call <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomw
+  ; CHECK: vpcomgtw
   %res = call <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomleb(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomb
+  ; CHECK: vpcomleb
   %res = call <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomled(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomd
+  ; CHECK: vpcomled
   %res = call <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomleq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomq
+  ; CHECK: vpcomleq
   %res = call <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomleub(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomub
+  ; CHECK: vpcomleub
   %res = call <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomleud(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomud
+  ; CHECK: vpcomleud
   %res = call <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomuq
+  ; CHECK: vpcomleuq
   %res = call <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomuw
+  ; CHECK: vpcomleuw
   %res = call <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomlew(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomw
+  ; CHECK: vpcomlew
   %res = call <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomltb(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomb
+  ; CHECK: vpcomltb
   %res = call <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomltd(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomd
+  ; CHECK: vpcomltd
   %res = call <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomltq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomq
+  ; CHECK: vpcomltq
   %res = call <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomltub(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomub
+  ; CHECK: vpcomltub
   %res = call <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomltud(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomud
+  ; CHECK: vpcomltud
   %res = call <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomuq
+  ; CHECK: vpcomltuq
   %res = call <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomuw
+  ; CHECK: vpcomltuw
   %res = call <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomltw(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomw
+  ; CHECK: vpcomltw
   %res = call <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomneb(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomb
+  ; CHECK: vpcomneqb
   %res = call <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomned(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomd
+  ; CHECK: vpcomneqd
   %res = call <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomneq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomq
+  ; CHECK: vpcomneqq
   %res = call <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomneub(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomub
+  ; CHECK: vpcomnequb
   %res = call <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomneud(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomud
+  ; CHECK: vpcomnequd
   %res = call <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomuq
+  ; CHECK: vpcomnequq
   %res = call <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomuw
+  ; CHECK: vpcomnequw
   %res = call <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomnew(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomw
+  ; CHECK: vpcomneqw
   %res = call <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomb
+  ; CHECK: vpcomtrueb
   %res = call <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomd
+  ; CHECK: vpcomtrued
   %res = call <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomq
+  ; CHECK: vpcomtrueq
   %res = call <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <16 x i8> @test_int_x86_xop_vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcomub
+  ; CHECK: vpcomtrueub
   %res = call <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) ;
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vpcomud
+  ; CHECK: vpcomtrueud
   %res = call <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: vpcomuq
+  ; CHECK: vpcomtrueuq
   %res = call <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomuw
+  ; CHECK: vpcomtrueuw
   %res = call <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <8 x i16> @test_int_x86_xop_vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) {
-  ; CHECK: vpcomw
+  ; CHECK: vpcomtruew
   %res = call <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) ;
   ret <8 x i16> %res
 }
diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll
index fd8e1b4..ea84a3b 100644
--- a/test/CodeGen/X86/xor.ll
+++ b/test/CodeGen/X86/xor.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2  | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -march=x86            -mattr=+sse2 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 | FileCheck %s -check-prefix=X64
 
 ; Though it is undefined, we want xor undef,undef to produce zero.
 define <4 x i32> @test1() nounwind {
diff --git a/test/CodeGen/XCore/dwarf_debug.ll b/test/CodeGen/XCore/dwarf_debug.ll
index 47db82d..8c9c47d 100644
--- a/test/CodeGen/XCore/dwarf_debug.ll
+++ b/test/CodeGen/XCore/dwarf_debug.ll
@@ -13,7 +13,7 @@ define i32 @f(i32 %a) {
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !11, metadata !{metadata !"0x102"}), !dbg !12
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !11, metadata !{!"0x102"}), !dbg !12
   %0 = load i32* %a.addr, align 4, !dbg !12
   %add = add nsw i32 %0, 1, !dbg !12
   ret i32 %add, !dbg !12
@@ -23,17 +23,17 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !10}
-!0 = metadata !{metadata !"0x11\0012\00\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00f\00f\00\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ]
-!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"0x101\00a\0016777218\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 2, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ]
+!1 = !{!"", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00f\00f\00\002\000\001\000\006\00256\000\002", !1, !5, !6, null, i32 (i32)* @f, null, null, !2} ; [ DW_TAG_subprogram ]
+!5 = !{!"0x29", !1} ; [ DW_TAG_file_type ]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 2}
+!11 = !{!"0x101\00a\0016777218\000", !4, !5, !8} ; [ DW_TAG_arg_variable ]
+!12 = !MDLocation(line: 2, scope: !4)
 
diff --git a/test/DebugInfo/2009-10-16-Phi.ll b/test/DebugInfo/2009-10-16-Phi.ll
index 0f799e3..e14653f 100644
--- a/test/DebugInfo/2009-10-16-Phi.ll
+++ b/test/DebugInfo/2009-10-16-Phi.ll
@@ -10,4 +10,4 @@ B2:
    ret i32 %0
 }
 
-!0 = metadata !{i32 42}
+!0 = !{i32 42}
diff --git a/test/DebugInfo/2009-11-03-InsertExtractValue.ll b/test/DebugInfo/2009-11-03-InsertExtractValue.ll
index 838ba05..0b9f1b5 100644
--- a/test/DebugInfo/2009-11-03-InsertExtractValue.ll
+++ b/test/DebugInfo/2009-11-03-InsertExtractValue.ll
@@ -4,12 +4,12 @@
 !llvm.dbg.cu = !{!5}
 !llvm.module.flags = !{!6}
 
-!0 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEv\003\000\000\000\006\00258\000\003", metadata !4, metadata !1, metadata !2, null, null, null, i32 0, metadata !1} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !4} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !4, metadata !1, null, metadata !3, null} ; [ DW_TAG_subroutine_type ]
-!3 = metadata !{null}
-!4 = metadata !{metadata !"/foo", metadata !"bar.cpp"}
-!5 = metadata !{metadata !"0x11\0012\00\001\00\000\00\000", metadata !4, metadata !3, metadata !3, null, null, null}; [DW_TAG_compile_unit ]
+!0 = !{!"0x2e\00bar\00bar\00_ZN3foo3barEv\003\000\000\000\006\00258\000\003", !4, !1, !2, null, null, null, i32 0, !1} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !4} ; [ DW_TAG_file_type ]
+!2 = !{!"0x15\00\000\000\000\000\000\000", !4, !1, null, !3, null} ; [ DW_TAG_subroutine_type ]
+!3 = !{null}
+!4 = !{!"/foo", !"bar.cpp"}
+!5 = !{!"0x11\0012\00\001\00\000\00\000", !4, !3, !3, null, null, null}; [DW_TAG_compile_unit ]
 
 define <{i32, i32}> @f1() {
 ; CHECK: !dbgx ![[NUMBER:[0-9]+]]
@@ -20,4 +20,4 @@ define <{i32, i32}> @f1() {
 }
 
 ; CHECK: [protected]
-!6 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!6 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll b/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll
index 9c714d7..c73b945 100644
--- a/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll
+++ b/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll
@@ -10,17 +10,17 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 139632)\001\00\000\00\000", metadata !17, metadata !1, metadata !1, metadata !3, metadata !12, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\000\001\000", metadata !17, metadata !6, metadata !7, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
-!6 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!12 = metadata !{metadata !14}
-!14 = metadata !{metadata !"0x34\00bar\00bar\00\002\001\001", metadata !5, metadata !6, metadata !9, null, null} ; [ DW_TAG_variable ]
-!15 = metadata !{i32 3, i32 3, metadata !16, null}
-!16 = metadata !{metadata !"0xb\001\0011\000", metadata !17, metadata !5} ; [ DW_TAG_lexical_block ]
-!17 = metadata !{metadata !"fb.c", metadata !"/private/tmp"}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.0 (trunk 139632)\001\00\000\00\000", !17, !1, !1, !3, !12, null} ; [ DW_TAG_compile_unit ]
+!1 = !{i32 0}
+!3 = !{!5}
+!5 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\000\001\000", !17, !6, !7, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
+!6 = !{!"0x29", !17} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!12 = !{!14}
+!14 = !{!"0x34\00bar\00bar\00\002\001\001", !5, !6, !9, null, null} ; [ DW_TAG_variable ]
+!15 = !MDLocation(line: 3, column: 3, scope: !16)
+!16 = !{!"0xb\001\0011\000", !17, !5} ; [ DW_TAG_lexical_block ]
+!17 = !{!"fb.c", !"/private/tmp"}
+!18 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll b/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll
index 4524b27..1b7c80f 100644
--- a/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll
+++ b/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll
@@ -4,11 +4,11 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 139632)\001\00\000\00\000", metadata !8, metadata !2, metadata !2, metadata !2, metadata !3, null} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x34\00a\00a\00\002\000\001", null, metadata !6, metadata !7, i32* @0, null} ; [ DW_TAG_variable ]
-!6 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"g.c", metadata !"/private/tmp"}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.0 (trunk 139632)\001\00\000\00\000", !8, !2, !2, !2, !3, null} ; [ DW_TAG_compile_unit ]
+!2 = !{}
+!3 = !{!5}
+!5 = !{!"0x34\00a\00a\00\002\000\001", null, !6, !7, i32* @0, null} ; [ DW_TAG_variable ]
+!6 = !{!"0x29", !8} ; [ DW_TAG_file_type ]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!8 = !{!"g.c", !"/private/tmp"}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2009-11-10-CurrentFn.ll b/test/DebugInfo/2009-11-10-CurrentFn.ll
index 76b1eda..bf237ee 100644
--- a/test/DebugInfo/2009-11-10-CurrentFn.ll
+++ b/test/DebugInfo/2009-11-10-CurrentFn.ll
@@ -13,19 +13,19 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 139632)\001\00\000\00\000", metadata !17, metadata !1, metadata !1, metadata !3, metadata !1, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00bar\00bar\00\003\000\001\000\006\00256\001\000", metadata !17, metadata !6, metadata !7, null, void (i32)* @bar, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [bar]
-!6 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!9 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0x101\00i\0016777219\000", metadata !17, metadata !5, metadata !12} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!13 = metadata !{i32 3, i32 14, metadata !5, null}
-!14 = metadata !{i32 4, i32 3, metadata !15, null}
-!15 = metadata !{metadata !"0xb\003\0017\000", metadata !17, metadata !5} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 5, i32 1, metadata !15, null}
-!17 = metadata !{metadata !"cf.c", metadata !"/private/tmp"}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.0 (trunk 139632)\001\00\000\00\000", !17, !1, !1, !3, !1, null} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00bar\00bar\00\003\000\001\000\006\00256\001\000", !17, !6, !7, null, void (i32)* @bar, null, null, !9} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [bar]
+!6 = !{!"0x29", !17} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{!11}
+!11 = !{!"0x101\00i\0016777219\000", !17, !5, !12} ; [ DW_TAG_arg_variable ]
+!12 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!13 = !MDLocation(line: 3, column: 14, scope: !5)
+!14 = !MDLocation(line: 4, column: 3, scope: !15)
+!15 = !{!"0xb\003\0017\000", !17, !5} ; [ DW_TAG_lexical_block ]
+!16 = !MDLocation(line: 5, column: 1, scope: !15)
+!17 = !{!"cf.c", !"/private/tmp"}
+!18 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2010-01-05-DbgScope.ll b/test/DebugInfo/2010-01-05-DbgScope.ll
index e85a9ec..d559720 100644
--- a/test/DebugInfo/2010-01-05-DbgScope.ll
+++ b/test/DebugInfo/2010-01-05-DbgScope.ll
@@ -11,15 +11,15 @@ entry:
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!14}
 
-!0 = metadata !{i32 571, i32 3, metadata !1, null}
-!1 = metadata !{metadata !"0xb\001\001\000", metadata !11, metadata !2}; [DW_TAG_lexical_block ]
-!2 = metadata !{metadata !"0x2e\00foo\00foo\00foo\00561\000\001\000\006\000\000\000", i32 0, metadata !3, metadata !4, null, null, null, null, null}; [DW_TAG_subprogram ]
-!3 = metadata !{metadata !"0x11\0012\00clang 1.1\001\00\000\00\000", metadata !11, metadata !12, metadata !12, metadata !13, null, null}; [DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, metadata !3, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !3} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 588, i32 1, metadata !2, null}
-!11 = metadata !{metadata !"hashtab.c", metadata !"/usr/src/gnu/usr.bin/cc/cc_tools/../../../../contrib/gcclibs/libiberty"}
-!12 = metadata !{i32 0}
-!13 = metadata !{metadata !2}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !MDLocation(line: 571, column: 3, scope: !1)
+!1 = !{!"0xb\001\001\000", !11, !2}; [DW_TAG_lexical_block ]
+!2 = !{!"0x2e\00foo\00foo\00foo\00561\000\001\000\006\000\000\000", i32 0, !3, !4, null, null, null, null, null}; [DW_TAG_subprogram ]
+!3 = !{!"0x11\0012\00clang 1.1\001\00\000\00\000", !11, !12, !12, !13, null, null}; [DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", null, !3, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6}
+!6 = !{!"0x24\00char\000\008\008\000\000\006", null, !3} ; [ DW_TAG_base_type ]
+!10 = !MDLocation(line: 588, column: 1, scope: !2)
+!11 = !{!"hashtab.c", !"/usr/src/gnu/usr.bin/cc/cc_tools/../../../../contrib/gcclibs/libiberty"}
+!12 = !{i32 0}
+!13 = !{!2}
+!14 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2010-03-12-llc-crash.ll b/test/DebugInfo/2010-03-12-llc-crash.ll
index 0075f4e..1b0d794 100644
--- a/test/DebugInfo/2010-03-12-llc-crash.ll
+++ b/test/DebugInfo/2010-03-12-llc-crash.ll
@@ -5,18 +5,18 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define void @foo() {
 entry:
-  call void @llvm.dbg.declare(metadata !{i32* undef}, metadata !0, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.declare(metadata i32* undef, metadata !0, metadata !{!"0x102"})
   ret void
 }
 
-!0 = metadata !{metadata !"0x101\00sy\00890\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\00892\000\001\000\006\000\000\000", metadata !8, metadata !3, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\004\00clang 1.1\001\00\000\00\000", metadata !9, metadata !10, metadata !10, null, null, null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !9, metadata !5, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !"0x29", metadata !9} ; [ DW_TAG_file_type ]
-!6 = metadata !{null}
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !9, metadata !5} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"qpainter.h", metadata !"QtGui"}
-!9 = metadata !{metadata !"splineeditor.cpp", metadata !"src"}
-!10 = metadata !{i32 0}
+!0 = !{!"0x101\00sy\00890\000", !1, !2, !7} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00foo\00foo\00foo\00892\000\001\000\006\000\000\000", !8, !3, !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !8} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\004\00clang 1.1\001\00\000\00\000", !9, !10, !10, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !9, !5, null, !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!"0x29", !9} ; [ DW_TAG_file_type ]
+!6 = !{null}
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", !9, !5} ; [ DW_TAG_base_type ]
+!8 = !{!"qpainter.h", !"QtGui"}
+!9 = !{!"splineeditor.cpp", !"src"}
+!10 = !{i32 0}
diff --git a/test/DebugInfo/2010-03-19-DbgDeclare.ll b/test/DebugInfo/2010-03-19-DbgDeclare.ll
index 32021c5..8316801 100644
--- a/test/DebugInfo/2010-03-19-DbgDeclare.ll
+++ b/test/DebugInfo/2010-03-19-DbgDeclare.ll
@@ -4,16 +4,16 @@
 
 define void @Foo(i32 %a, i32 %b) {
 entry:
-  call void @llvm.dbg.declare(metadata !{i32* null}, metadata !1, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.declare(metadata i32* null, metadata !1, metadata !{!"0x102"})
   ret void
 }
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!5}
-!2 = metadata !{metadata !"0x11\0032769\00clang version 3.3 \000\00\000\00\001", metadata !4, metadata !3, metadata !3, metadata !3, metadata !3,  metadata !3} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp] [lang 0x8001]
-!3 = metadata !{}
-!0 = metadata !{i32 662302, i32 26, metadata !1, null}
-!1 = metadata !{i32 4, metadata !"foo"}
-!4 = metadata !{metadata !"scratch.cpp", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
+!2 = !{!"0x11\0032769\00clang version 3.3 \000\00\000\00\001", !4, !3, !3, !3, !3,  !3} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp] [lang 0x8001]
+!3 = !{}
+!0 = !MDLocation(line: 662302, column: 26, scope: !1)
+!1 = !{i32 4, !"foo"}
+!4 = !{!"scratch.cpp", !"/usr/local/google/home/blaikie/dev/scratch"}
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!5 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2010-03-24-MemberFn.ll b/test/DebugInfo/2010-03-24-MemberFn.ll
index 71f4acb..7b09109 100644
--- a/test/DebugInfo/2010-03-24-MemberFn.ll
+++ b/test/DebugInfo/2010-03-24-MemberFn.ll
@@ -8,7 +8,7 @@ entry:
   %0 = alloca i32                                 ; <i32*> [#uses=2]
   %s1 = alloca %struct.S                          ; <%struct.S*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.S* %s1}, metadata !0, metadata !{metadata !"0x102"}), !dbg !16
+  call void @llvm.dbg.declare(metadata %struct.S* %s1, metadata !0, metadata !{!"0x102"}), !dbg !16
   %1 = call i32 @_ZN1S3fooEv(%struct.S* %s1) nounwind, !dbg !17 ; <i32> [#uses=1]
   store i32 %1, i32* %0, align 4, !dbg !17
   %2 = load i32* %0, align 4, !dbg !17            ; <i32> [#uses=1]
@@ -25,7 +25,7 @@ entry:
   %this_addr = alloca %struct.S*                  ; <%struct.S**> [#uses=1]
   %retval = alloca i32                            ; <i32*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.S** %this_addr}, metadata !18, metadata !{metadata !"0x102"}), !dbg !21
+  call void @llvm.dbg.declare(metadata %struct.S** %this_addr, metadata !18, metadata !{!"0x102"}), !dbg !21
   store %struct.S* %this, %struct.S** %this_addr
   br label %return, !dbg !21
 
@@ -39,32 +39,32 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!5}
 !llvm.module.flags = !{!28}
 
-!0 = metadata !{metadata !"0x100\00s1\003\000", metadata !1, metadata !4, metadata !9} ; [ DW_TAG_auto_variable ]
-!1 = metadata !{metadata !"0xb\003\000\000", metadata !25, metadata !2} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{metadata !"0xb\003\000\000", metadata !25, metadata !3} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barv\003\000\001\000\006\000\000\003", metadata !25, metadata !4, metadata !6, null, i32 ()* @_Z3barv, null, null, null} ; [ DW_TAG_subprogram ]
-!4 = metadata !{metadata !"0x29", metadata !25} ; [ DW_TAG_file_type ]
-!5 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !25, metadata !27, metadata !27, metadata !24, null,  null} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !25, metadata !4, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !25, metadata !4} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !"0x13\00S\002\008\008\000\000\000", metadata !26, metadata !4, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [S] [line 2, size 8, align 8, offset 0] [def] [from ]
-!10 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN1S3fooEv\003\000\001\000\006\000\000\003", metadata !26, metadata !9, metadata !13, null, i32 (%struct.S*)* @_ZN1S3fooEv, null, null, null} ; [ DW_TAG_subprogram ]
-!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !25, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{metadata !8, metadata !15}
-!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !25, metadata !4, metadata !9} ; [ DW_TAG_pointer_type ]
-!16 = metadata !{i32 3, i32 0, metadata !1, null}
-!17 = metadata !{i32 3, i32 0, metadata !3, null}
-!18 = metadata !{metadata !"0x101\00this\003\000", metadata !12, metadata !10, metadata !19} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !25, metadata !4, metadata !20} ; [ DW_TAG_const_type ]
-!20 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !25, metadata !4, metadata !9} ; [ DW_TAG_pointer_type ]
-!21 = metadata !{i32 3, i32 0, metadata !12, null}
-!22 = metadata !{i32 3, i32 0, metadata !23, null}
-!23 = metadata !{metadata !"0xb\003\000\000", metadata !26, metadata !12} ; [ DW_TAG_lexical_block ]
-!24 = metadata !{metadata !3, metadata !12}
-!25 = metadata !{metadata !"one.cc", metadata !"/tmp/"}
-!26 = metadata !{metadata !"one.h", metadata !"/tmp/"}
-!27 = metadata !{i32 0}
-!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x100\00s1\003\000", !1, !4, !9} ; [ DW_TAG_auto_variable ]
+!1 = !{!"0xb\003\000\000", !25, !2} ; [ DW_TAG_lexical_block ]
+!2 = !{!"0xb\003\000\000", !25, !3} ; [ DW_TAG_lexical_block ]
+!3 = !{!"0x2e\00bar\00bar\00_Z3barv\003\000\001\000\006\000\000\003", !25, !4, !6, null, i32 ()* @_Z3barv, null, null, null} ; [ DW_TAG_subprogram ]
+!4 = !{!"0x29", !25} ; [ DW_TAG_file_type ]
+!5 = !{!"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", !25, !27, !27, !24, null,  null} ; [ DW_TAG_compile_unit ]
+!6 = !{!"0x15\00\000\000\000\000\000\000", !25, !4, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", !25, !4} ; [ DW_TAG_base_type ]
+!9 = !{!"0x13\00S\002\008\008\000\000\000", !26, !4, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [S] [line 2, size 8, align 8, offset 0] [def] [from ]
+!10 = !{!"0x29", !26} ; [ DW_TAG_file_type ]
+!11 = !{!12}
+!12 = !{!"0x2e\00foo\00foo\00_ZN1S3fooEv\003\000\001\000\006\000\000\003", !26, !9, !13, null, i32 (%struct.S*)* @_ZN1S3fooEv, null, null, null} ; [ DW_TAG_subprogram ]
+!13 = !{!"0x15\00\000\000\000\000\000\000", !25, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{!8, !15}
+!15 = !{!"0xf\00\000\0064\0064\000\0064", !25, !4, !9} ; [ DW_TAG_pointer_type ]
+!16 = !MDLocation(line: 3, scope: !1)
+!17 = !MDLocation(line: 3, scope: !3)
+!18 = !{!"0x101\00this\003\000", !12, !10, !19} ; [ DW_TAG_arg_variable ]
+!19 = !{!"0x26\00\000\0064\0064\000\0064", !25, !4, !20} ; [ DW_TAG_const_type ]
+!20 = !{!"0xf\00\000\0064\0064\000\000", !25, !4, !9} ; [ DW_TAG_pointer_type ]
+!21 = !MDLocation(line: 3, scope: !12)
+!22 = !MDLocation(line: 3, scope: !23)
+!23 = !{!"0xb\003\000\000", !26, !12} ; [ DW_TAG_lexical_block ]
+!24 = !{!3, !12}
+!25 = !{!"one.cc", !"/tmp/"}
+!26 = !{!"one.h", !"/tmp/"}
+!27 = !{i32 0}
+!28 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll b/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll
index 1f90a34..c967f73 100644
--- a/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll
+++ b/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll
@@ -2,7 +2,7 @@
 
 define void @baz(i32 %i) nounwind ssp {
 entry:
-  call void @llvm.dbg.declare(metadata !0, metadata !1, metadata !{metadata !"0x102"}), !dbg !0
+  call void @llvm.dbg.declare(metadata !0, metadata !1, metadata !{!"0x102"}), !dbg !0
   ret void, !dbg !0
 }
 
@@ -11,26 +11,26 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!5}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{{ [0 x i8] }** undef}
-!1 = metadata !{metadata !"0x100\00x\0011\000", metadata !2, metadata !4, metadata !9} ; [ DW_TAG_auto_variable ]
-!2 = metadata !{metadata !"0xb\008\000\000", metadata !20, metadata !3} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{metadata !"0x2e\00baz\00baz\00baz\008\001\001\000\006\000\000\000", metadata !20, null, metadata !6, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!4 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!5 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !20, metadata !21, metadata !21, null, null, null} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !4, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !20, metadata !4} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !20, metadata !4, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{metadata !"0x13\00\0011\008\008\000\000\000", metadata !20, metadata !3, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [line 11, size 8, align 8, offset 0] [def] [from ]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0xd\00b\0011\008\008\000\000", metadata !20, metadata !10, metadata !13} ; [ DW_TAG_member ]
-!13 = metadata !{metadata !"0x16\00A\0011\000\000\000\000", metadata !20, metadata !3, metadata !14} ; [ DW_TAG_typedef ]
-!14 = metadata !{metadata !"0x1\00\000\008\008\000\000", metadata !20, metadata !4, metadata !15, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
-!15 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !20, metadata !4} ; [ DW_TAG_base_type ]
-!16 = metadata !{metadata !17}
-!17 = metadata !{metadata !"0x21\000\001"}        ; [ DW_TAG_subrange_type ]
-!18 = metadata !{metadata !"llvm.mdnode.fwdref.19"}
-!19 = metadata !{metadata !"llvm.mdnode.fwdref.23"}
-!20 = metadata !{metadata !"2007-12-VarArrayDebug.c", metadata !"/Users/sabre/llvm/test/FrontendC/"}
-!21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{{ [0 x i8] }** undef}
+!1 = !{!"0x100\00x\0011\000", !2, !4, !9} ; [ DW_TAG_auto_variable ]
+!2 = !{!"0xb\008\000\000", !20, !3} ; [ DW_TAG_lexical_block ]
+!3 = !{!"0x2e\00baz\00baz\00baz\008\001\001\000\006\000\000\000", !20, null, !6, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!4 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!5 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !20, !21, !21, null, null, null} ; [ DW_TAG_compile_unit ]
+!6 = !{!"0x15\00\000\000\000\000\000\000", !20, !4, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", !20, !4} ; [ DW_TAG_base_type ]
+!9 = !{!"0xf\00\000\0064\0064\000\000", !20, !4, !10} ; [ DW_TAG_pointer_type ]
+!10 = !{!"0x13\00\0011\008\008\000\000\000", !20, !3, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [line 11, size 8, align 8, offset 0] [def] [from ]
+!11 = !{!12}
+!12 = !{!"0xd\00b\0011\008\008\000\000", !20, !10, !13} ; [ DW_TAG_member ]
+!13 = !{!"0x16\00A\0011\000\000\000\000", !20, !3, !14} ; [ DW_TAG_typedef ]
+!14 = !{!"0x1\00\000\008\008\000\000", !20, !4, !15, !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!15 = !{!"0x24\00char\000\008\008\000\000\006", !20, !4} ; [ DW_TAG_base_type ]
+!16 = !{!17}
+!17 = !{!"0x21\000\001"}        ; [ DW_TAG_subrange_type ]
+!18 = !{!"llvm.mdnode.fwdref.19"}
+!19 = !{!"llvm.mdnode.fwdref.23"}
+!20 = !{!"2007-12-VarArrayDebug.c", !"/Users/sabre/llvm/test/FrontendC/"}
+!21 = !{i32 0}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll b/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll
index b60e5c4..ce52d24 100644
--- a/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll
+++ b/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll
@@ -26,7 +26,7 @@ entry:
   %retval = alloca i32, align 4                   ; <i32*> [#uses=3]
   %b = alloca %class.A, align 1                   ; <%class.A*> [#uses=1]
   store i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{%class.A* %b}, metadata !0, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata %class.A* %b, metadata !0, metadata !{!"0x102"}), !dbg !14
   %call = call i32 @_ZN1B2fnEv(%class.A* %b), !dbg !15 ; <i32> [#uses=1]
   store i32 %call, i32* %retval, !dbg !15
   %0 = load i32* %retval, !dbg !16                ; <i32> [#uses=1]
@@ -42,10 +42,10 @@ entry:
   %a = alloca %class.A, align 1                   ; <%class.A*> [#uses=1]
   %i = alloca i32, align 4                        ; <i32*> [#uses=2]
   store %class.A* %this, %class.A** %this.addr
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !17, metadata !{metadata !"0x102"}), !dbg !18
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !17, metadata !{!"0x102"}), !dbg !18
   %this1 = load %class.A** %this.addr             ; <%class.A*> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !19, metadata !{metadata !"0x102"}), !dbg !27
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !28, metadata !{metadata !"0x102"}), !dbg !29
+  call void @llvm.dbg.declare(metadata %class.A* %a, metadata !19, metadata !{!"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata i32* %i, metadata !28, metadata !{!"0x102"}), !dbg !29
   %call = call i32 @_ZZN1B2fnEvEN1A3fooEv(%class.A* %a), !dbg !30 ; <i32> [#uses=1]
   store i32 %call, i32* %i, !dbg !30
   %tmp = load i32* %i, !dbg !31                   ; <i32> [#uses=1]
@@ -59,7 +59,7 @@ entry:
   %retval = alloca i32, align 4                   ; <i32*> [#uses=2]
   %this.addr = alloca %class.A*, align 8          ; <%class.A**> [#uses=2]
   store %class.A* %this, %class.A** %this.addr
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !33, metadata !{metadata !"0x102"}), !dbg !34
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !33, metadata !{!"0x102"}), !dbg !34
   %this1 = load %class.A** %this.addr             ; <%class.A*> [#uses=0]
   store i32 42, i32* %retval, !dbg !35
   %0 = load i32* %retval, !dbg !35                ; <i32> [#uses=1]
@@ -68,45 +68,45 @@ entry:
 
 !llvm.dbg.cu = !{!4}
 !llvm.module.flags = !{!40}
-!37 = metadata !{metadata !2, metadata !10, metadata !23}
+!37 = !{!2, !10, !23}
 
-!0 = metadata !{metadata !"0x100\00b\0016\000", metadata !1, metadata !3, metadata !8} ; [ DW_TAG_auto_variable ]
-!1 = metadata !{metadata !"0xb\0015\0012\000", metadata !38, metadata !2} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{metadata !"0x2e\00main\00main\00main\0015\000\001\000\006\000\000\0015", metadata !38, metadata !3, metadata !5, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
-!3 = metadata !{metadata !"0x29", metadata !38} ; [ DW_TAG_file_type ]
-!4 = metadata !{metadata !"0x11\004\00clang 1.5\000\00\000\00\000", metadata !38, metadata !39, metadata !39, metadata !37, null,  null} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !38, metadata !3, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !38, metadata !3} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"0x2\00B\002\008\008\000\000\000", metadata !38, metadata !3, null, metadata !9, null, null, null} ; [ DW_TAG_class_type ] [B] [line 2, size 8, align 8, offset 0] [def] [from ]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x2e\00fn\00fn\00_ZN1B2fnEv\004\000\001\000\006\000\000\004", metadata !38, metadata !8, metadata !11, null, i32 (%class.A*)* @_ZN1B2fnEv, null, null, null} ; [ DW_TAG_subprogram ]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !38, metadata !3, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !7, metadata !13}
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !38, metadata !3, metadata !8} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 16, i32 5, metadata !1, null}
-!15 = metadata !{i32 17, i32 3, metadata !1, null}
-!16 = metadata !{i32 18, i32 1, metadata !2, null}
-!17 = metadata !{metadata !"0x101\00this\004\000", metadata !10, metadata !3, metadata !13} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 4, i32 7, metadata !10, null}
-!19 = metadata !{metadata !"0x100\00a\009\000", metadata !20, metadata !3, metadata !21} ; [ DW_TAG_auto_variable ]
-!20 = metadata !{metadata !"0xb\004\0012\000", metadata !38, metadata !10} ; [ DW_TAG_lexical_block ]
-!21 = metadata !{metadata !"0x2\00A\005\008\008\000\000\000", metadata !38, metadata !10, null, metadata !22, null, null, null} ; [ DW_TAG_class_type ] [A] [line 5, size 8, align 8, offset 0] [def] [from ]
-!22 = metadata !{metadata !23}
-!23 = metadata !{metadata !"0x2e\00foo\00foo\00_ZZN1B2fnEvEN1A3fooEv\007\000\001\000\006\000\000\007", metadata !38, metadata !21, metadata !24, null, i32 (%class.A*)* @_ZZN1B2fnEvEN1A3fooEv, null, null, null} ; [ DW_TAG_subprogram ]
-!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !38, metadata !3, null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!25 = metadata !{metadata !7, metadata !26}
-!26 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !38, metadata !3, metadata !21} ; [ DW_TAG_pointer_type ]
-!27 = metadata !{i32 9, i32 7, metadata !20, null}
-!28 = metadata !{metadata !"0x100\00i\0010\000", metadata !20, metadata !3, metadata !7} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 10, i32 9, metadata !20, null}
-!30 = metadata !{i32 10, i32 5, metadata !20, null}
-!31 = metadata !{i32 11, i32 5, metadata !20, null}
-!32 = metadata !{i32 12, i32 3, metadata !10, null}
-!33 = metadata !{metadata !"0x101\00this\007\000", metadata !23, metadata !3, metadata !26} ; [ DW_TAG_arg_variable ]
-!34 = metadata !{i32 7, i32 11, metadata !23, null}
-!35 = metadata !{i32 7, i32 19, metadata !36, null}
-!36 = metadata !{metadata !"0xb\007\0017\000", metadata !38, metadata !23} ; [ DW_TAG_lexical_block ]
-!38 = metadata !{metadata !"one.cc", metadata !"/tmp" }
-!39 = metadata !{i32 0}
-!40 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x100\00b\0016\000", !1, !3, !8} ; [ DW_TAG_auto_variable ]
+!1 = !{!"0xb\0015\0012\000", !38, !2} ; [ DW_TAG_lexical_block ]
+!2 = !{!"0x2e\00main\00main\00main\0015\000\001\000\006\000\000\0015", !38, !3, !5, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = !{!"0x29", !38} ; [ DW_TAG_file_type ]
+!4 = !{!"0x11\004\00clang 1.5\000\00\000\00\000", !38, !39, !39, !37, null,  null} ; [ DW_TAG_compile_unit ]
+!5 = !{!"0x15\00\000\000\000\000\000\000", !38, !3, null, !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = !{!7}
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", !38, !3} ; [ DW_TAG_base_type ]
+!8 = !{!"0x2\00B\002\008\008\000\000\000", !38, !3, null, !9, null, null, null} ; [ DW_TAG_class_type ] [B] [line 2, size 8, align 8, offset 0] [def] [from ]
+!9 = !{!10}
+!10 = !{!"0x2e\00fn\00fn\00_ZN1B2fnEv\004\000\001\000\006\000\000\004", !38, !8, !11, null, i32 (%class.A*)* @_ZN1B2fnEv, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = !{!"0x15\00\000\000\000\000\000\000", !38, !3, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!7, !13}
+!13 = !{!"0xf\00\000\0064\0064\000\0064", !38, !3, !8} ; [ DW_TAG_pointer_type ]
+!14 = !MDLocation(line: 16, column: 5, scope: !1)
+!15 = !MDLocation(line: 17, column: 3, scope: !1)
+!16 = !MDLocation(line: 18, column: 1, scope: !2)
+!17 = !{!"0x101\00this\004\000", !10, !3, !13} ; [ DW_TAG_arg_variable ]
+!18 = !MDLocation(line: 4, column: 7, scope: !10)
+!19 = !{!"0x100\00a\009\000", !20, !3, !21} ; [ DW_TAG_auto_variable ]
+!20 = !{!"0xb\004\0012\000", !38, !10} ; [ DW_TAG_lexical_block ]
+!21 = !{!"0x2\00A\005\008\008\000\000\000", !38, !10, null, !22, null, null, null} ; [ DW_TAG_class_type ] [A] [line 5, size 8, align 8, offset 0] [def] [from ]
+!22 = !{!23}
+!23 = !{!"0x2e\00foo\00foo\00_ZZN1B2fnEvEN1A3fooEv\007\000\001\000\006\000\000\007", !38, !21, !24, null, i32 (%class.A*)* @_ZZN1B2fnEvEN1A3fooEv, null, null, null} ; [ DW_TAG_subprogram ]
+!24 = !{!"0x15\00\000\000\000\000\000\000", !38, !3, null, !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!25 = !{!7, !26}
+!26 = !{!"0xf\00\000\0064\0064\000\0064", !38, !3, !21} ; [ DW_TAG_pointer_type ]
+!27 = !MDLocation(line: 9, column: 7, scope: !20)
+!28 = !{!"0x100\00i\0010\000", !20, !3, !7} ; [ DW_TAG_auto_variable ]
+!29 = !MDLocation(line: 10, column: 9, scope: !20)
+!30 = !MDLocation(line: 10, column: 5, scope: !20)
+!31 = !MDLocation(line: 11, column: 5, scope: !20)
+!32 = !MDLocation(line: 12, column: 3, scope: !10)
+!33 = !{!"0x101\00this\007\000", !23, !3, !26} ; [ DW_TAG_arg_variable ]
+!34 = !MDLocation(line: 7, column: 11, scope: !23)
+!35 = !MDLocation(line: 7, column: 19, scope: !36)
+!36 = !{!"0xb\007\0017\000", !38, !23} ; [ DW_TAG_lexical_block ]
+!38 = !{!"one.cc", !"/tmp" }
+!39 = !{i32 0}
+!40 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2010-04-19-FramePtr.ll b/test/DebugInfo/2010-04-19-FramePtr.ll
index e0a9219..fe5a1f4 100644
--- a/test/DebugInfo/2010-04-19-FramePtr.ll
+++ b/test/DebugInfo/2010-04-19-FramePtr.ll
@@ -21,17 +21,17 @@ return:                                           ; preds = %entry
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!12}
-!9 = metadata !{metadata !1}
+!9 = !{!1}
 
-!0 = metadata !{i32 2, i32 0, metadata !1, null}
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\002", metadata !10, null, metadata !4, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !10, metadata !11, metadata !11, metadata !9, null,  null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !10, metadata !2} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 2, i32 0, metadata !8, null}
-!8 = metadata !{metadata !"0xb\002\000\000", metadata !10, metadata !1} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{metadata !"a.c", metadata !"/tmp"}
-!11 = metadata !{i32 0}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !MDLocation(line: 2, scope: !1)
+!1 = !{!"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\002", !10, null, !4, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !10} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", !10, !11, !11, !9, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !10, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6}
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", !10, !2} ; [ DW_TAG_base_type ]
+!7 = !MDLocation(line: 2, scope: !8)
+!8 = !{!"0xb\002\000\000", !10, !1} ; [ DW_TAG_lexical_block ]
+!10 = !{!"a.c", !"/tmp"}
+!11 = !{i32 0}
+!12 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2010-05-03-DisableFramePtr.ll b/test/DebugInfo/2010-05-03-DisableFramePtr.ll
index 87e2498..d767871 100644
--- a/test/DebugInfo/2010-05-03-DisableFramePtr.ll
+++ b/test/DebugInfo/2010-05-03-DisableFramePtr.ll
@@ -6,7 +6,7 @@ define void @DisposeDMNotificationUPP(void (%struct.AppleEvent*)* %userUPP) "no-
 entry:
   %userUPP_addr = alloca void (%struct.AppleEvent*)* ; <void (%struct.AppleEvent*)**> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{void (%struct.AppleEvent*)** %userUPP_addr}, metadata !0, metadata !{metadata !"0x102"}), !dbg !13
+  call void @llvm.dbg.declare(metadata void (%struct.AppleEvent*)** %userUPP_addr, metadata !0, metadata !{!"0x102"}), !dbg !13
   store void (%struct.AppleEvent*)* %userUPP, void (%struct.AppleEvent*)** %userUPP_addr
   br label %return, !dbg !14
 
@@ -18,23 +18,23 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!19}
-!0 = metadata !{metadata !"0x101\00userUPP\007\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00DisposeDMNotificationUPP\00DisposeDMNotificationUPP\00DisposeDMNotificationUPP\007\000\001\000\006\000\000\000", metadata !16, null, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !16} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", metadata !16, metadata !17, metadata !17, metadata !18, null, null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !16, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{null, metadata !6}
-!6 = metadata !{metadata !"0x16\00DMNotificationUPP\006\000\000\000\000", metadata !16, metadata !2, metadata !7} ; [ DW_TAG_typedef ]
-!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !16, metadata !2, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !16, metadata !2, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!9 = metadata !{null, metadata !10}
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !16, metadata !2, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !"0x16\00AppleEvent\004\000\000\000\000", metadata !16, metadata !2, metadata !12} ; [ DW_TAG_typedef ]
-!12 = metadata !{metadata !"0x13\00AEDesc\001\000\000\000\004\000", metadata !16, metadata !2, null, null, null, null, null} ; [ DW_TAG_structure_type ] [AEDesc] [line 1, size 0, align 0, offset 0] [decl] [from ]
-!13 = metadata !{i32 7, i32 0, metadata !1, null}
-!14 = metadata !{i32 8, i32 0, metadata !15, null}
-!15 = metadata !{metadata !"0xb\007\000\000", metadata !16, metadata !1} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{metadata !"t.c", metadata !"/Users/echeng/LLVM/radars/r7937664/"}
-!17 = metadata !{i32 0}
-!18 = metadata !{metadata !1}
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00userUPP\007\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00DisposeDMNotificationUPP\00DisposeDMNotificationUPP\00DisposeDMNotificationUPP\007\000\001\000\006\000\000\000", !16, null, !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !16} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", !16, !17, !17, !18, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !16, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{null, !6}
+!6 = !{!"0x16\00DMNotificationUPP\006\000\000\000\000", !16, !2, !7} ; [ DW_TAG_typedef ]
+!7 = !{!"0xf\00\000\0064\0064\000\000", !16, !2, !8} ; [ DW_TAG_pointer_type ]
+!8 = !{!"0x15\00\000\000\000\000\000\000", !16, !2, null, !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = !{null, !10}
+!10 = !{!"0xf\00\000\0064\0064\000\000", !16, !2, !11} ; [ DW_TAG_pointer_type ]
+!11 = !{!"0x16\00AppleEvent\004\000\000\000\000", !16, !2, !12} ; [ DW_TAG_typedef ]
+!12 = !{!"0x13\00AEDesc\001\000\000\000\004\000", !16, !2, null, null, null, null, null} ; [ DW_TAG_structure_type ] [AEDesc] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!13 = !MDLocation(line: 7, scope: !1)
+!14 = !MDLocation(line: 8, scope: !15)
+!15 = !{!"0xb\007\000\000", !16, !1} ; [ DW_TAG_lexical_block ]
+!16 = !{!"t.c", !"/Users/echeng/LLVM/radars/r7937664/"}
+!17 = !{i32 0}
+!18 = !{!1}
+!19 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2010-05-03-OriginDIE.ll b/test/DebugInfo/2010-05-03-OriginDIE.ll
index fd36d47..1caae64 100644
--- a/test/DebugInfo/2010-05-03-OriginDIE.ll
+++ b/test/DebugInfo/2010-05-03-OriginDIE.ll
@@ -23,12 +23,12 @@ entry:
   %a10 = call i64 @llvm.bswap.i64(i64 %a9) nounwind ; <i64> [#uses=1]
   %a11 = getelementptr inbounds %struct.gpt_t* %gpt, i32 0, i32 8, !dbg !7 ; <i64*> [#uses=1]
   %a12 = load i64* %a11, align 4, !dbg !7         ; <i64> [#uses=1]
-  call void @llvm.dbg.declare(metadata !{i64* %data_addr.i17}, metadata !8, metadata !{metadata !"0x102"}) nounwind, !dbg !14
+  call void @llvm.dbg.declare(metadata i64* %data_addr.i17, metadata !8, metadata !{!"0x102"}) nounwind, !dbg !14
   store i64 %a12, i64* %data_addr.i17, align 8
-  call void @llvm.dbg.value(metadata !6, i64 0, metadata !15, metadata !{metadata !"0x102"}) nounwind
-  call void @llvm.dbg.value(metadata !18, i64 0, metadata !19, metadata !{metadata !"0x102"}) nounwind
-  call void @llvm.dbg.declare(metadata !6, metadata !23, metadata !{metadata !"0x102"}) nounwind
-  call void @llvm.dbg.value(metadata !{i64* %data_addr.i17}, i64 0, metadata !34, metadata !{metadata !"0x102"}) nounwind
+  call void @llvm.dbg.value(metadata !6, i64 0, metadata !15, metadata !{!"0x102"}) nounwind
+  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !19, metadata !{!"0x102"}) nounwind
+  call void @llvm.dbg.declare(metadata !6, metadata !23, metadata !{!"0x102"}) nounwind
+  call void @llvm.dbg.value(metadata i64* %data_addr.i17, i64 0, metadata !34, metadata !{!"0x102"}) nounwind
   %a13 = load volatile i64* %data_addr.i17, align 8 ; <i64> [#uses=1]
   %a14 = call i64 @llvm.bswap.i64(i64 %a13) nounwind ; <i64> [#uses=2]
   %a15 = add i64 %a10, %a14, !dbg !7              ; <i64> [#uses=1]
@@ -50,45 +50,45 @@ declare void @uuid_LtoB(i8*, i8*)
 
 !llvm.dbg.cu = !{!4}
 !llvm.module.flags = !{!41}
-!0 = metadata !{i32 808, i32 0, metadata !1, null}
-!1 = metadata !{metadata !"0xb\00807\000\000", metadata !39, metadata !2} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{metadata !"0x2e\00gpt2gpm\00gpt2gpm\00gpt2gpm\00807\001\001\000\006\000\000\000", metadata !39, null, metadata !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!3 = metadata !{metadata !"0x29", metadata !39} ; [ DW_TAG_file_type ]
-!4 = metadata !{metadata !"0x11\001\00llvm-gcc\001\00\000\00\000", metadata !39, metadata !18, metadata !18, metadata !40, null, null} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !39, metadata !3, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!6 = metadata !{null}
-!7 = metadata !{i32 810, i32 0, metadata !1, null}
-!8 = metadata !{metadata !"0x101\00data\00201\000", metadata !9, metadata !10, metadata !11} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{metadata !"0x2e\00_OSSwapInt64\00_OSSwapInt64\00_OSSwapInt64\00202\001\001\000\006\000\000\000", metadata !10, null, metadata !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!10 = metadata !{metadata !"0x29", metadata !"OSByteOrder.h", metadata !"/usr/include/libkern/ppc", metadata !4} ; [ DW_TAG_file_type ]
-!11 = metadata !{metadata !"0x16\00uint64_t\0059\000\000\000\000", metadata !36, metadata !3, metadata !13} ; [ DW_TAG_typedef ]
-!12 = metadata !{metadata !"0x29", metadata !"stdint.h", metadata !"/usr/4.2.1/include", metadata !4} ; [ DW_TAG_file_type ]
-!13 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0064\000\000\007", metadata !39, metadata !3} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 202, i32 0, metadata !9, metadata !7}
-!15 = metadata !{metadata !"0x101\00base\0092\000", metadata !16, metadata !10, metadata !17} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{metadata !"0x2e\00OSReadSwapInt64\00OSReadSwapInt64\00OSReadSwapInt64\0095\001\001\000\006\000\000\000", metadata !38, null, metadata !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!17 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !39, metadata !3, null} ; [ DW_TAG_pointer_type ]
-!18 = metadata !{i32 0}
-!19 = metadata !{metadata !"0x101\00byteOffset\0094\000", metadata !16, metadata !10, metadata !20} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{metadata !"0x16\00uintptr_t\00114\000\000\000\000", metadata !37, metadata !3, metadata !22} ; [ DW_TAG_typedef ]
-!21 = metadata !{metadata !"0x29", metadata !"types.h", metadata !"/usr/include/ppc", metadata !4} ; [ DW_TAG_file_type ]
-!22 = metadata !{metadata !"0x24\00long unsigned int\000\0032\0032\000\000\007", metadata !39, metadata !3} ; [ DW_TAG_base_type ]
-!23 = metadata !{metadata !"0x100\00u\00100\000", metadata !24, metadata !10, metadata !25} ; [ DW_TAG_auto_variable ]
-!24 = metadata !{metadata !"0xb\0095\000\000", metadata !38, metadata !16} ; [ DW_TAG_lexical_block ]
-!25 = metadata !{metadata !"0x17\00\0097\0064\0064\000\000\000", metadata !38, metadata !16, null, metadata !26, null, null, null} ; [ DW_TAG_union_type ] [line 97, size 64, align 64, offset 0] [def] [from ]
-!26 = metadata !{metadata !27, metadata !28}
-!27 = metadata !{metadata !"0xd\00u64\0098\0064\0064\000\000", metadata !38, metadata !25, metadata !11} ; [ DW_TAG_member ]
-!28 = metadata !{metadata !"0xd\00u32\0099\0064\0032\000\000", metadata !38, metadata !25, metadata !29} ; [ DW_TAG_member ]
-!29 = metadata !{metadata !"0x1\00\000\0064\0032\000\000", metadata !39, metadata !3, metadata !30, metadata !32, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from uint32_t]
-!30 = metadata !{metadata !"0x16\00uint32_t\0055\000\000\000\000", metadata !36, metadata !3, metadata !31} ; [ DW_TAG_typedef ]
-!31 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !39, metadata !3} ; [ DW_TAG_base_type ]
-!32 = metadata !{metadata !33}
-!33 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ]
-!34 = metadata !{metadata !"0x100\00addr\0096\000", metadata !24, metadata !10, metadata !35} ; [ DW_TAG_auto_variable ]
-!35 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !39, metadata !3, metadata !11} ; [ DW_TAG_pointer_type ]
-!36 = metadata !{metadata !"stdint.h", metadata !"/usr/4.2.1/include"}
-!37 = metadata !{metadata !"types.h", metadata !"/usr/include/ppc"}
-!38 = metadata !{metadata !"OSByteOrder.h", metadata !"/usr/include/libkern/ppc"}
-!39 = metadata !{metadata !"G.c", metadata !"/tmp"}
-!40 = metadata !{metadata !2, metadata !9, metadata !16}
-!41 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !MDLocation(line: 808, scope: !1)
+!1 = !{!"0xb\00807\000\000", !39, !2} ; [ DW_TAG_lexical_block ]
+!2 = !{!"0x2e\00gpt2gpm\00gpt2gpm\00gpt2gpm\00807\001\001\000\006\000\000\000", !39, null, !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = !{!"0x29", !39} ; [ DW_TAG_file_type ]
+!4 = !{!"0x11\001\00llvm-gcc\001\00\000\00\000", !39, !18, !18, !40, null, null} ; [ DW_TAG_compile_unit ]
+!5 = !{!"0x15\00\000\000\000\000\000\000", !39, !3, null, !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = !{null}
+!7 = !MDLocation(line: 810, scope: !1)
+!8 = !{!"0x101\00data\00201\000", !9, !10, !11} ; [ DW_TAG_arg_variable ]
+!9 = !{!"0x2e\00_OSSwapInt64\00_OSSwapInt64\00_OSSwapInt64\00202\001\001\000\006\000\000\000", !10, null, !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = !{!"0x29", !"OSByteOrder.h", !"/usr/include/libkern/ppc", !4} ; [ DW_TAG_file_type ]
+!11 = !{!"0x16\00uint64_t\0059\000\000\000\000", !36, !3, !13} ; [ DW_TAG_typedef ]
+!12 = !{!"0x29", !"stdint.h", !"/usr/4.2.1/include", !4} ; [ DW_TAG_file_type ]
+!13 = !{!"0x24\00long long unsigned int\000\0064\0064\000\000\007", !39, !3} ; [ DW_TAG_base_type ]
+!14 = !MDLocation(line: 202, scope: !9, inlinedAt: !7)
+!15 = !{!"0x101\00base\0092\000", !16, !10, !17} ; [ DW_TAG_arg_variable ]
+!16 = !{!"0x2e\00OSReadSwapInt64\00OSReadSwapInt64\00OSReadSwapInt64\0095\001\001\000\006\000\000\000", !38, null, !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = !{!"0xf\00\000\0032\0032\000\000", !39, !3, null} ; [ DW_TAG_pointer_type ]
+!18 = !{i32 0}
+!19 = !{!"0x101\00byteOffset\0094\000", !16, !10, !20} ; [ DW_TAG_arg_variable ]
+!20 = !{!"0x16\00uintptr_t\00114\000\000\000\000", !37, !3, !22} ; [ DW_TAG_typedef ]
+!21 = !{!"0x29", !"types.h", !"/usr/include/ppc", !4} ; [ DW_TAG_file_type ]
+!22 = !{!"0x24\00long unsigned int\000\0032\0032\000\000\007", !39, !3} ; [ DW_TAG_base_type ]
+!23 = !{!"0x100\00u\00100\000", !24, !10, !25} ; [ DW_TAG_auto_variable ]
+!24 = !{!"0xb\0095\000\000", !38, !16} ; [ DW_TAG_lexical_block ]
+!25 = !{!"0x17\00\0097\0064\0064\000\000\000", !38, !16, null, !26, null, null, null} ; [ DW_TAG_union_type ] [line 97, size 64, align 64, offset 0] [def] [from ]
+!26 = !{!27, !28}
+!27 = !{!"0xd\00u64\0098\0064\0064\000\000", !38, !25, !11} ; [ DW_TAG_member ]
+!28 = !{!"0xd\00u32\0099\0064\0032\000\000", !38, !25, !29} ; [ DW_TAG_member ]
+!29 = !{!"0x1\00\000\0064\0032\000\000", !39, !3, !30, !32, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from uint32_t]
+!30 = !{!"0x16\00uint32_t\0055\000\000\000\000", !36, !3, !31} ; [ DW_TAG_typedef ]
+!31 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", !39, !3} ; [ DW_TAG_base_type ]
+!32 = !{!33}
+!33 = !{!"0x21\000\002"}        ; [ DW_TAG_subrange_type ]
+!34 = !{!"0x100\00addr\0096\000", !24, !10, !35} ; [ DW_TAG_auto_variable ]
+!35 = !{!"0xf\00\000\0032\0032\000\000", !39, !3, !11} ; [ DW_TAG_pointer_type ]
+!36 = !{!"stdint.h", !"/usr/4.2.1/include"}
+!37 = !{!"types.h", !"/usr/include/ppc"}
+!38 = !{!"OSByteOrder.h", !"/usr/include/libkern/ppc"}
+!39 = !{!"G.c", !"/tmp"}
+!40 = !{!2, !9, !16}
+!41 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2010-05-10-MultipleCU.ll b/test/DebugInfo/2010-05-10-MultipleCU.ll
index 2e18dbf..502007c 100644
--- a/test/DebugInfo/2010-05-10-MultipleCU.ll
+++ b/test/DebugInfo/2010-05-10-MultipleCU.ll
@@ -19,26 +19,26 @@ return:
 
 !llvm.dbg.cu = !{!4, !12}
 !llvm.module.flags = !{!21}
-!16 = metadata !{metadata !2}
-!17 = metadata !{metadata !10}
+!16 = !{!2}
+!17 = !{!10}
 
-!0 = metadata !{i32 3, i32 0, metadata !1, null}
-!1 = metadata !{metadata !"0xb\002\000\000", metadata !18, metadata !2} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\000", metadata !18, metadata !3, metadata !5, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!3 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
-!4 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !18, metadata !19, metadata !19, metadata !16, null, null} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !3, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !18, metadata !3} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 3, i32 0, metadata !9, null}
-!9 = metadata !{metadata !"0xb\002\000\000", metadata !20, metadata !10} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{metadata !"0x2e\00bar\00bar\00bar\002\000\001\000\006\000\000\000", metadata !20, metadata !11, metadata !13, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
-!11 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!12 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !20, metadata !19, metadata !19, metadata !17, null, null} ; [ DW_TAG_compile_unit ]
-!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !11, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{metadata !15}
-!15 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !20, metadata !11} ; [ DW_TAG_base_type ]
-!18 = metadata !{metadata !"a.c", metadata !"/tmp/"}
-!19 = metadata !{i32 0}
-!20 = metadata !{metadata !"b.c", metadata !"/tmp/"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !MDLocation(line: 3, scope: !1)
+!1 = !{!"0xb\002\000\000", !18, !2} ; [ DW_TAG_lexical_block ]
+!2 = !{!"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\000", !18, !3, !5, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = !{!"0x29", !18} ; [ DW_TAG_file_type ]
+!4 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", !18, !19, !19, !16, null, null} ; [ DW_TAG_compile_unit ]
+!5 = !{!"0x15\00\000\000\000\000\000\000", !18, !3, null, !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = !{!7}
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", !18, !3} ; [ DW_TAG_base_type ]
+!8 = !MDLocation(line: 3, scope: !9)
+!9 = !{!"0xb\002\000\000", !20, !10} ; [ DW_TAG_lexical_block ]
+!10 = !{!"0x2e\00bar\00bar\00bar\002\000\001\000\006\000\000\000", !20, !11, !13, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!12 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", !20, !19, !19, !17, null, null} ; [ DW_TAG_compile_unit ]
+!13 = !{!"0x15\00\000\000\000\000\000\000", !20, !11, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{!15}
+!15 = !{!"0x24\00int\000\0032\0032\000\000\005", !20, !11} ; [ DW_TAG_base_type ]
+!18 = !{!"a.c", !"/tmp/"}
+!19 = !{i32 0}
+!20 = !{!"b.c", !"/tmp/"}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll b/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll
index e1e42cd..9f0f7c3 100644
--- a/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll
+++ b/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll
@@ -14,8 +14,8 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 define i32 @bar() nounwind ssp {
 entry:
   %0 = load i32* @i, align 4, !dbg !17            ; <i32> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !9, metadata !{metadata !"0x102"}), !dbg !19
-  tail call void @llvm.dbg.declare(metadata !29, metadata !10, metadata !{metadata !"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !9, metadata !{!"0x102"}), !dbg !19
+  tail call void @llvm.dbg.declare(metadata !29, metadata !10, metadata !{!"0x102"}), !dbg !21
   %1 = mul nsw i32 %0, %0, !dbg !22               ; <i32> [#uses=2]
   store i32 %1, i32* @i, align 4, !dbg !17
   ret i32 %1, !dbg !23
@@ -24,33 +24,33 @@ entry:
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!28}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00\009\001\001\000\006\000\001\009", metadata !27, metadata !1, metadata !3, null, null, null, null, metadata !24} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !27} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !27, metadata !20, metadata !20, metadata !25, metadata !26,  metadata !20} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !27, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5, metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !27, metadata !1} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00bar\00bar\00bar\0014\000\001\000\006\000\001\000", metadata !27, metadata !1, metadata !7, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !27, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !5}
-!9 = metadata !{metadata !"0x101\00j\009\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{metadata !"0x100\00xyz\0010\000", metadata !11, metadata !1, metadata !12} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !0} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{metadata !"0x13\00X\0010\0064\0032\000\000\000", metadata !27, metadata !0, null, metadata !13, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 10, size 64, align 32, offset 0] [def] [from ]
-!13 = metadata !{metadata !14, metadata !15}
-!14 = metadata !{metadata !"0xd\00a\0010\0032\0032\000\000", metadata !27, metadata !12, metadata !5} ; [ DW_TAG_member ]
-!15 = metadata !{metadata !"0xd\00b\0010\0032\0032\0032\000", metadata !27, metadata !12, metadata !5} ; [ DW_TAG_member ]
-!16 = metadata !{metadata !"0x34\00i\00i\00\005\000\001", metadata !1, metadata !1, metadata !5, i32* @i, null} ; [ DW_TAG_variable ]
-!17 = metadata !{i32 15, i32 0, metadata !18, null}
-!18 = metadata !{metadata !"0xb\0014\000\001", metadata !1, metadata !6} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{i32 9, i32 0, metadata !0, metadata !17}
-!20 = metadata !{}
-!21 = metadata !{i32 9, i32 0, metadata !11, metadata !17}
-!22 = metadata !{i32 11, i32 0, metadata !11, metadata !17}
-!23 = metadata !{i32 16, i32 0, metadata !18, null}
-!24 = metadata !{metadata !9, metadata !10}
-!25 = metadata !{metadata !0, metadata !6}
-!26 = metadata !{metadata !16}
-!27 = metadata !{metadata !"bar.c", metadata !"/tmp/"}
-!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!29 = metadata !{null}
+!0 = !{!"0x2e\00foo\00foo\00\009\001\001\000\006\000\001\009", !27, !1, !3, null, null, null, null, !24} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !27} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !27, !20, !20, !25, !26,  !20} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !27, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5, !5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", !27, !1} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00bar\00bar\00bar\0014\000\001\000\006\000\001\000", !27, !1, !7, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", !27, !1, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!5}
+!9 = !{!"0x101\00j\009\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!10 = !{!"0x100\00xyz\0010\000", !11, !1, !12} ; [ DW_TAG_auto_variable ]
+!11 = !{!"0xb\009\000\000", !1, !0} ; [ DW_TAG_lexical_block ]
+!12 = !{!"0x13\00X\0010\0064\0032\000\000\000", !27, !0, null, !13, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 10, size 64, align 32, offset 0] [def] [from ]
+!13 = !{!14, !15}
+!14 = !{!"0xd\00a\0010\0032\0032\000\000", !27, !12, !5} ; [ DW_TAG_member ]
+!15 = !{!"0xd\00b\0010\0032\0032\0032\000", !27, !12, !5} ; [ DW_TAG_member ]
+!16 = !{!"0x34\00i\00i\00\005\000\001", !1, !1, !5, i32* @i, null} ; [ DW_TAG_variable ]
+!17 = !MDLocation(line: 15, scope: !18)
+!18 = !{!"0xb\0014\000\001", !1, !6} ; [ DW_TAG_lexical_block ]
+!19 = !MDLocation(line: 9, scope: !0, inlinedAt: !17)
+!20 = !{}
+!21 = !MDLocation(line: 9, scope: !11, inlinedAt: !17)
+!22 = !MDLocation(line: 11, scope: !11, inlinedAt: !17)
+!23 = !MDLocation(line: 16, scope: !18)
+!24 = !{!9, !10}
+!25 = !{!0, !6}
+!26 = !{!16}
+!27 = !{!"bar.c", !"/tmp/"}
+!28 = !{i32 1, !"Debug Info Version", i32 2}
+!29 = !{null}
diff --git a/test/DebugInfo/2010-07-19-Crash.ll b/test/DebugInfo/2010-07-19-Crash.ll
index 7330843..8bbe48c 100644
--- a/test/DebugInfo/2010-07-19-Crash.ll
+++ b/test/DebugInfo/2010-07-19-Crash.ll
@@ -12,19 +12,19 @@ entry:
 !llvm.dbg.sp = !{!0, !6, !11}
 !llvm.dbg.lv.foo = !{!7}
 
-!0 = metadata !{metadata !"0x2e\00bar\00bar\00bar\003\000\001\000\006\000\001\000", metadata !12, metadata !1, metadata !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang 2.8\001\00\000\00\000", metadata !12, metadata !14, metadata !14, metadata !13, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !1} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00foo\00foo\00foo\007\001\001\000\006\000\001\000", metadata !12, metadata !1, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!7 = metadata !{metadata !"0x100\00one\008\000", metadata !8, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!8 = metadata !{metadata !"0xb\007\0018\000", metadata !12, metadata !6} ; [ DW_TAG_lexical_block ]
-!9 = metadata !{i32 4, i32 3, metadata !10, null}
-!10 = metadata !{metadata !"0xb\003\0011\000", metadata !12, metadata !0} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{metadata !"0x2e\00foo\00foo\00foo\007\001\000\000\006\000\001\000", metadata !12, metadata !1, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!12 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
-!13 = metadata !{metadata !0}
-!14 = metadata !{i32 0}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00bar\00bar\00bar\003\000\001\000\006\000\001\000", !12, !1, !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !12} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang 2.8\001\00\000\00\000", !12, !14, !14, !13, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !12, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", !12, !1} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00foo\00foo\00foo\007\001\001\000\006\000\001\000", !12, !1, !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = !{!"0x100\00one\008\000", !8, !1, !5} ; [ DW_TAG_auto_variable ]
+!8 = !{!"0xb\007\0018\000", !12, !6} ; [ DW_TAG_lexical_block ]
+!9 = !MDLocation(line: 4, column: 3, scope: !10)
+!10 = !{!"0xb\003\0011\000", !12, !0} ; [ DW_TAG_lexical_block ]
+!11 = !{!"0x2e\00foo\00foo\00foo\007\001\000\000\006\000\001\000", !12, !1, !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!12 = !{!"one.c", !"/private/tmp"}
+!13 = !{!0}
+!14 = !{i32 0}
+!15 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/2010-10-01-crash.ll b/test/DebugInfo/2010-10-01-crash.ll
index 6c6c7f5..5c822e9 100644
--- a/test/DebugInfo/2010-10-01-crash.ll
+++ b/test/DebugInfo/2010-10-01-crash.ll
@@ -4,7 +4,7 @@
 
 define void @CGRectStandardize(i32* sret %agg.result, i32* byval %rect) nounwind ssp {
 entry:
-  call void @llvm.dbg.declare(metadata !{i32* %rect}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.declare(metadata i32* %rect, metadata !23, metadata !{!"0x102"}), !dbg !24
   ret void
 }
 
@@ -15,12 +15,12 @@ declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32,
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!27}
-!0 = metadata !{metadata !"0x2e\00CGRectStandardize\00CGRectStandardize\00CGRectStandardize\0054\000\001\000\006\000\000\000", metadata !1, null, null, null, void (i32*, i32*)* @CGRectStandardize, null, null, null} ; [ DW_TAG_subprogram ] [line 54] [def] [scope 0] [CGRectStandardize]
-!1 = metadata !{metadata !"0x29", metadata !25} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0016\00clang version 2.9 (trunk 115292)\001\00\001\00\000", metadata !25, metadata !26, metadata !26, null, null, null} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{metadata !"0x16\00CGRect\0049\000\000\000\000", metadata !25, null, null} ; [ DW_TAG_typedef ]
-!23 = metadata !{metadata !"0x101\00rect\0053\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!24 = metadata !{i32 53, i32 33, metadata !0, null}
-!25 = metadata !{metadata !"GSFusedSilica.m", metadata !"/Volumes/Data/Users/sabre/Desktop"}
-!26 = metadata !{i32 0}
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00CGRectStandardize\00CGRectStandardize\00CGRectStandardize\0054\000\001\000\006\000\000\000", !1, null, null, null, void (i32*, i32*)* @CGRectStandardize, null, null, null} ; [ DW_TAG_subprogram ] [line 54] [def] [scope 0] [CGRectStandardize]
+!1 = !{!"0x29", !25} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0016\00clang version 2.9 (trunk 115292)\001\00\001\00\000", !25, !26, !26, null, null, null} ; [ DW_TAG_compile_unit ]
+!5 = !{!"0x16\00CGRect\0049\000\000\000\000", !25, null, null} ; [ DW_TAG_typedef ]
+!23 = !{!"0x101\00rect\0053\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!24 = !MDLocation(line: 53, column: 33, scope: !0)
+!25 = !{!"GSFusedSilica.m", !"/Volumes/Data/Users/sabre/Desktop"}
+!26 = !{i32 0}
+!27 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/AArch64/big-endian-dump.ll b/test/DebugInfo/AArch64/big-endian-dump.ll
index 3af3001..4a1a9e7 100644
--- a/test/DebugInfo/AArch64/big-endian-dump.ll
+++ b/test/DebugInfo/AArch64/big-endian-dump.ll
@@ -8,9 +8,9 @@ target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"empty.c", metadata !"/a"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!5 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{i32 786449, !1, i32 12, !"clang version 3.6.0 ", i1 false, !"", i32 0, !2, !2, !2, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = !{!"empty.c", !"/a"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 1}
+!5 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/AArch64/big-endian.ll b/test/DebugInfo/AArch64/big-endian.ll
index 8391d44..79e38c4 100644
--- a/test/DebugInfo/AArch64/big-endian.ll
+++ b/test/DebugInfo/AArch64/big-endian.ll
@@ -9,14 +9,14 @@ target triple = "aarch64_be--none-eabi"
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/work/validation/-] [DW_LANG_C99]
-!1 = metadata !{metadata !"-", metadata !"/work/validation"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !5, metadata !7, i32* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
-!5 = metadata !{metadata !"0x29", metadata !6}          ; [ DW_TAG_file_type ] [/work/validation/<stdin>]
-!6 = metadata !{metadata !"<stdin>", metadata !"/work/validation"}
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{!"0x11\0012\00clang version 3.6.0 \001\00\000\00\001", !1, !2, !2, !2, !3, !2} ; [ DW_TAG_compile_unit ] [/work/validation/-] [DW_LANG_C99]
+!1 = !{!"-", !"/work/validation"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x34\00a\00a\00\001\000\001", null, !5, !7, i32* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!5 = !{!"0x29", !6}          ; [ DW_TAG_file_type ] [/work/validation/<stdin>]
+!6 = !{!"<stdin>", !"/work/validation"}
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/AArch64/cfi-eof-prologue.ll b/test/DebugInfo/AArch64/cfi-eof-prologue.ll
new file mode 100644
index 0000000..2d68af6
--- /dev/null
+++ b/test/DebugInfo/AArch64/cfi-eof-prologue.ll
@@ -0,0 +1,112 @@
+; struct A {
+;   A();
+;   virtual ~A();
+; };
+; struct B : A {
+;   B();
+;   virtual ~B();
+; };
+; B::B() {}
+; CHECK: __ZN1BC1Ev:
+; CHECK:     .loc	1 [[@LINE-2]] 0 prologue_end
+; CHECK-NOT: .loc	1 0 0 prologue_end
+
+; The location of the prologue_end marker should not be affected by the presence
+; of CFI instructions.
+
+; RUN: llc -O0 -filetype=asm < %s | FileCheck %s
+
+; ModuleID = 'test1.cpp'
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-apple-ios"
+
+%struct.B = type { %struct.A }
+%struct.A = type { i32 (...)** }
+
+@_ZTV1B = external unnamed_addr constant [4 x i8*]
+
+; Function Attrs: nounwind
+define %struct.B* @_ZN1BC2Ev(%struct.B* %this) unnamed_addr #0 align 2 {
+entry:
+  tail call void @llvm.dbg.value(metadata %struct.B* %this, i64 0, metadata !30, metadata !38), !dbg !39
+  %0 = getelementptr inbounds %struct.B* %this, i64 0, i32 0, !dbg !40
+  %call = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #3, !dbg !40
+  %1 = getelementptr inbounds %struct.B* %this, i64 0, i32 0, i32 0, !dbg !40
+  store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*]* @_ZTV1B, i64 0, i64 2) to i32 (...)**), i32 (...)*** %1, align 8, !dbg !40, !tbaa !41
+  ret %struct.B* %this, !dbg !40
+}
+
+declare %struct.A* @_ZN1AC2Ev(%struct.A*)
+
+; Function Attrs: nounwind
+define %struct.B* @_ZN1BC1Ev(%struct.B* %this) unnamed_addr #0 align 2 {
+entry:
+  tail call void @llvm.dbg.value(metadata %struct.B* %this, i64 0, metadata !34, metadata !38), !dbg !44
+  tail call void @llvm.dbg.value(metadata %struct.B* %this, i64 0, metadata !45, metadata !38) #3, !dbg !47
+  %0 = getelementptr inbounds %struct.B* %this, i64 0, i32 0, !dbg !48
+  %call.i = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #3, !dbg !48
+  %1 = getelementptr inbounds %struct.B* %this, i64 0, i32 0, i32 0, !dbg !48
+  store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*]* @_ZTV1B, i64 0, i64 2) to i32 (...)**), i32 (...)*** %1, align 8, !dbg !48, !tbaa !41
+  ret %struct.B* %this, !dbg !46
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+
+attributes #0 = { nounwind }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!35, !36}
+!llvm.ident = !{!37}
+
+!0 = !{!"0x11\004\00clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)\001\00\000\00\001", !1, !2, !3, !27, !2, !2} ; [ DW_TAG_compile_unit ] [<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !""}
+!2 = !{}
+!3 = !{!4, !13}
+!4 = !{!"0x13\00B\005\0064\0064\000\000\000", !5, null, null, !6, !"_ZTS1A", null, !"_ZTS1B"} ; [ DW_TAG_structure_type ] [B] [line 5, size 64, align 64, offset 0] [def] [from ]
+!5 = !{!"test1.cpp", !""}
+!6 = !{!7, !8, !12}
+!7 = !{!"0x1c\00\000\000\000\000\000", null, !"_ZTS1B", !"_ZTS1A"} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from _ZTS1A]
+!8 = !{!"0x2e\00B\00B\00\006\000\000\000\000\00256\001\006", !5, !"_ZTS1B", !9, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 6] [B]
+!9 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{null, !11}
+!11 = !{!"0xf\00\000\0064\0064\000\001088\00", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
+!12 = !{!"0x2e\00~B\00~B\00\007\000\000\001\000\00256\001\007", !5, !"_ZTS1B", !9, !"_ZTS1B", null, null, null, null} ; [ DW_TAG_subprogram ] [line 7] [~B]
+!13 = !{!"0x13\00A\001\0064\0064\000\000\000", !5, null, null, !14, !"_ZTS1A", null, !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 64, align 64, offset 0] [def] [from ]
+!14 = !{!15, !22, !26}
+!15 = !{!"0xd\00_vptr$A\000\0064\000\000\0064", !5, !16, !17} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
+!16 = !{!"0x29", !5}                              ; [ DW_TAG_file_type ] [test1.cpp]
+!17 = !{!"0xf\00\000\0064\000\000\000", null, null, !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
+!18 = !{!"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, !19} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
+!19 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = !{!21}
+!21 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!22 = !{!"0x2e\00A\00A\00\002\000\000\000\000\00256\001\002", !5, !"_ZTS1A", !23, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [A]
+!23 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !24, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = !{null, !25}
+!25 = !{!"0xf\00\000\0064\0064\000\001088\00", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!26 = !{!"0x2e\00~A\00~A\00\003\000\000\001\000\00256\001\003", !5, !"_ZTS1A", !23, !"_ZTS1A", null, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [~A]
+!27 = !{!28, !32}
+!28 = !{!"0x2e\00B\00B\00_ZN1BC2Ev\009\000\001\000\000\00256\001\009", !5, !"_ZTS1B", !9, null, %struct.B* (%struct.B*)* @_ZN1BC2Ev, null, !8, !29} ; [ DW_TAG_subprogram ] [line 9] [def] [B]
+!29 = !{!30}
+!30 = !{!"0x101\00this\0016777216\001088", !28, null, !31} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!31 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
+!32 = !{!"0x2e\00B\00B\00_ZN1BC1Ev\009\000\001\000\000\00256\001\009", !5, !"_ZTS1B", !9, null, %struct.B* (%struct.B*)* @_ZN1BC1Ev, null, !8, !33} ; [ DW_TAG_subprogram ] [line 9] [def] [B]
+!33 = !{!34}
+!34 = !{!"0x101\00this\0016777216\001088", !32, null, !31} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!35 = !{i32 2, !"Dwarf Version", i32 4}
+!36 = !{i32 2, !"Debug Info Version", i32 2}
+!37 = !{!"clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)"}
+!38 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!39 = !MDLocation(line: 0, scope: !28)
+!40 = !MDLocation(line: 9, scope: !28)
+!41 = !{!42, !42, i64 0}
+!42 = !{!"vtable pointer", !43, i64 0}
+!43 = !{!"Simple C/C++ TBAA"}
+!44 = !MDLocation(line: 0, scope: !32)
+!45 = !{!"0x101\00this\0016777216\001088", !28, null, !31, !46} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!46 = !MDLocation(line: 9, scope: !32)
+!47 = !MDLocation(line: 0, scope: !28, inlinedAt: !46)
+!48 = !MDLocation(line: 9, scope: !28, inlinedAt: !46)
diff --git a/test/DebugInfo/AArch64/coalescing.ll b/test/DebugInfo/AArch64/coalescing.ll
new file mode 100644
index 0000000..35bb041
--- /dev/null
+++ b/test/DebugInfo/AArch64/coalescing.ll
@@ -0,0 +1,65 @@
+; RUN: llc -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s
+;
+; Generated at -Os from:
+; void *foo(void *dst);
+; void start() {
+;   unsigned size;
+;   foo(&size);
+;   if (size != 0) { // Work around a bug to preserve the dbg.value.
+;   }
+; }
+
+; ModuleID = 'test1.cpp'
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+; Function Attrs: nounwind optsize
+define void @_Z5startv() #0 {
+entry:
+  %size = alloca i32, align 4
+  %0 = bitcast i32* %size to i8*, !dbg !15
+  %call = call i8* @_Z3fooPv(i8* %0) #3, !dbg !15
+  call void @llvm.dbg.value(metadata i32* %size, i64 0, metadata !10, metadata !16), !dbg !17
+  ; CHECK: .debug_info contents:
+  ; CHECK: DW_TAG_variable
+  ; CHECK-NEXT: DW_AT_location
+  ; CHECK-NEXT: DW_AT_name {{.*}}"size"
+  ; CHECK: .debug_loc contents:
+  ; CHECK: Location description: 70 00
+  ret void, !dbg !18
+}
+
+; Function Attrs: optsize
+declare i8* @_Z3fooPv(i8*) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+
+attributes #0 = { nounwind optsize }
+attributes #1 = { optsize }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind optsize }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12, !13}
+!llvm.ident = !{!14}
+
+!0 = !{!"0x11\004\00clang version 3.6.0 (trunk 223149) (llvm/trunk 223115)\001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00start\00start\00_Z5startv\002\000\001\000\000\00256\001\003", !5, !6, !7, null, void ()* @_Z5startv, null, null, !9} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [start]
+!5 = !{!"test1.c", !""}
+!6 = !{!"0x29", !5}    ; [ DW_TAG_file_type ] [/test1.c]
+!7 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{!10}
+!10 = !{!"0x100\00size\004\000", !4, !6, !11} ; [ DW_TAG_auto_variable ] [size] [line 4]
+!11 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!12 = !{i32 2, !"Dwarf Version", i32 2}
+!13 = !{i32 2, !"Debug Info Version", i32 2}
+!14 = !{!"clang version 3.6.0 (trunk 223149) (llvm/trunk 223115)"}
+!15 = !MDLocation(line: 5, column: 3, scope: !4)
+!16 = !{!"0x102"}               ; [ DW_TAG_expression ]
+!17 = !MDLocation(line: 4, column: 12, scope: !4)
+!18 = !MDLocation(line: 8, column: 1, scope: !4)
diff --git a/test/DebugInfo/AArch64/dwarfdump.ll b/test/DebugInfo/AArch64/dwarfdump.ll
index e9dd428..cba18b2 100644
--- a/test/DebugInfo/AArch64/dwarfdump.ll
+++ b/test/DebugInfo/AArch64/dwarfdump.ll
@@ -27,14 +27,14 @@ attributes #0 = { nounwind }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!10}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 \000\00\000\00\000", metadata !9, metadata !1, metadata !1, metadata !2, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/home/timnor01/llvm/build/tmp.c] [DW_LANG_C99]
-!1 = metadata !{}
-!2 = metadata !{metadata !3}
-!3 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\000\000\001", metadata !9, metadata !4, metadata !5, null, i32 ()* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
-!4 = metadata !{metadata !"0x29", metadata !9} ; [ DW_TAG_file_type ]
-!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{i32 2, i32 0, metadata !3, null}
-!9 = metadata !{metadata !"tmp.c", metadata !"/home/tim/llvm/build"}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.3 \000\00\000\00\000", !9, !1, !1, !2, !1,  !1} ; [ DW_TAG_compile_unit ] [/home/timnor01/llvm/build/tmp.c] [DW_LANG_C99]
+!1 = !{}
+!2 = !{!3}
+!3 = !{!"0x2e\00main\00main\00\001\000\001\000\006\000\000\001", !9, !4, !5, null, i32 ()* @main, null, null, !1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
+!4 = !{!"0x29", !9} ; [ DW_TAG_file_type ]
+!5 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = !{!7}
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = !MDLocation(line: 2, scope: !3)
+!9 = !{!"tmp.c", !"/home/tim/llvm/build"}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/AArch64/frameindices.ll b/test/DebugInfo/AArch64/frameindices.ll
new file mode 100644
index 0000000..5e00d10
--- /dev/null
+++ b/test/DebugInfo/AArch64/frameindices.ll
@@ -0,0 +1,257 @@
+; RUN: llc -O0 -filetype=obj < %s | llvm-dwarfdump - | FileCheck %s
+; Test that a variable with multiple entries in the MMI table makes it into the
+; debug info.
+;
+; CHECK: DW_TAG_inlined_subroutine
+; CHECK:    "_Z3f111A"
+; CHECK: DW_TAG_formal_parameter
+; CHECK: DW_AT_location [DW_FORM_block1]    (<0x0b> 91 51 9d 78 08 91 4a 9d 38 88 01 )
+;  -- fbreg -47, bit-piece 120 8 , fbreg -54, bit-piece 56 136 ------^
+; CHECK: DW_AT_abstract_origin {{.*}} "p1"
+;
+; long a;
+; struct A {
+;   bool x4;
+;   void *x5;
+;   bool x6;
+; };
+; int *b;
+; struct B {
+;   B(long);
+;   ~B();
+; };
+; void f9(A);
+; void f13(A p1) {
+;   b = (int *)__builtin_operator_new(a);
+;   f9(p1);
+; }
+; void f11(A p1) { f13(p1); }
+; void f16() {
+;   A c;
+;   B d(a);
+;   c.x6 = c.x4 = true;
+;   f11(c);
+; }
+; ModuleID = 'test.cpp'
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-apple-ios"
+
+%struct.A = type { i8, i8*, i8 }
+%struct.B = type { i8 }
+
+@a = global i64 0, align 8
+@b = global i32* null, align 8
+
+define void @_Z3f131A(%struct.A* nocapture readonly %p1) #0 {
+entry:
+  %agg.tmp = alloca %struct.A, align 8
+  tail call void @llvm.dbg.declare(metadata %struct.A* %p1, metadata !30, metadata !46), !dbg !47
+  %0 = load i64* @a, align 8, !dbg !48, !tbaa !49
+  %call = tail call noalias i8* @_Znwm(i64 %0) #5, !dbg !53
+  store i8* %call, i8** bitcast (i32** @b to i8**), align 8, !dbg !54, !tbaa !55
+  %1 = getelementptr inbounds %struct.A* %agg.tmp, i64 0, i32 0, !dbg !57
+  %2 = getelementptr inbounds %struct.A* %p1, i64 0, i32 0, !dbg !57
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 24, i32 8, i1 false), !dbg !57, !tbaa.struct !58
+  call void @_Z2f91A(%struct.A* %agg.tmp), !dbg !61
+  ret void, !dbg !62
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nobuiltin
+declare noalias i8* @_Znwm(i64) #2
+
+declare void @_Z2f91A(%struct.A*) #0
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #3
+
+define void @_Z3f111A(%struct.A* nocapture readonly %p1) #0 {
+entry:
+  %agg.tmp.i = alloca %struct.A, align 8
+  tail call void @llvm.dbg.declare(metadata %struct.A* %p1, metadata !33, metadata !46), !dbg !63
+  %0 = getelementptr inbounds %struct.A* %p1, i64 0, i32 0, !dbg !64
+  %1 = getelementptr inbounds %struct.A* %agg.tmp.i, i64 0, i32 0, !dbg !65
+  call void @llvm.lifetime.start(i64 24, i8* %1), !dbg !65
+  %2 = load i64* @a, align 8, !dbg !67, !tbaa !49
+  %call.i = tail call noalias i8* @_Znwm(i64 %2) #5, !dbg !68
+  store i8* %call.i, i8** bitcast (i32** @b to i8**), align 8, !dbg !69, !tbaa !55
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %0, i64 24, i32 8, i1 false), !dbg !70
+  call void @_Z2f91A(%struct.A* %agg.tmp.i), !dbg !71
+  call void @llvm.lifetime.end(i64 24, i8* %1), !dbg !72
+  ret void, !dbg !73
+}
+
+define void @_Z3f16v() #0 {
+entry:
+  %agg.tmp.i.i = alloca %struct.A, align 8
+  %d = alloca %struct.B, align 1
+  %agg.tmp.sroa.2 = alloca [15 x i8], align 1
+  %agg.tmp.sroa.4 = alloca [7 x i8], align 1
+  tail call void @llvm.dbg.declare(metadata [15 x i8]* %agg.tmp.sroa.2, metadata !74, metadata !76), !dbg !77
+  tail call void @llvm.dbg.declare(metadata [7 x i8]* %agg.tmp.sroa.4, metadata !74, metadata !78), !dbg !77
+  tail call void @llvm.dbg.declare(metadata %struct.A* undef, metadata !38, metadata !79), !dbg !80
+  %0 = load i64* @a, align 8, !dbg !81, !tbaa !49
+  tail call void @llvm.dbg.value(metadata %struct.B* %d, i64 0, metadata !39, metadata !79), !dbg !82
+  %call = call %struct.B* @_ZN1BC1El(%struct.B* %d, i64 %0), !dbg !82
+  call void @llvm.dbg.value(metadata i8 1, i64 0, metadata !38, metadata !83), !dbg !80
+  call void @llvm.dbg.value(metadata i8 1, i64 0, metadata !38, metadata !84), !dbg !80
+  call void @llvm.dbg.value(metadata i8 1, i64 0, metadata !74, metadata !83), !dbg !77
+  call void @llvm.dbg.value(metadata i8 1, i64 0, metadata !74, metadata !84), !dbg !77
+  call void @llvm.dbg.declare(metadata %struct.A* undef, metadata !74, metadata !46), !dbg !77
+  %1 = getelementptr inbounds %struct.A* %agg.tmp.i.i, i64 0, i32 0, !dbg !85
+  call void @llvm.lifetime.start(i64 24, i8* %1), !dbg !85
+  %2 = load i64* @a, align 8, !dbg !87, !tbaa !49
+  %call.i.i5 = invoke noalias i8* @_Znwm(i64 %2) #5
+          to label %call.i.i.noexc unwind label %lpad, !dbg !88
+
+call.i.i.noexc:                                   ; preds = %entry
+  %agg.tmp.sroa.4.17..sroa_idx = getelementptr inbounds [7 x i8]* %agg.tmp.sroa.4, i64 0, i64 0, !dbg !89
+  %agg.tmp.sroa.2.1..sroa_idx = getelementptr inbounds [15 x i8]* %agg.tmp.sroa.2, i64 0, i64 0, !dbg !89
+  store i8* %call.i.i5, i8** bitcast (i32** @b to i8**), align 8, !dbg !90, !tbaa !55
+  store i8 1, i8* %1, align 8, !dbg !91
+  %agg.tmp.sroa.2.0..sroa_raw_idx = getelementptr inbounds i8* %1, i64 1, !dbg !91
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %agg.tmp.sroa.2.0..sroa_raw_idx, i8* %agg.tmp.sroa.2.1..sroa_idx, i64 15, i32 1, i1 false), !dbg !91
+  %agg.tmp.sroa.3.0..sroa_idx = getelementptr inbounds %struct.A* %agg.tmp.i.i, i64 0, i32 2, !dbg !91
+  store i8 1, i8* %agg.tmp.sroa.3.0..sroa_idx, align 8, !dbg !91
+  %agg.tmp.sroa.4.0..sroa_raw_idx = getelementptr inbounds i8* %1, i64 17, !dbg !91
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %agg.tmp.sroa.4.0..sroa_raw_idx, i8* %agg.tmp.sroa.4.17..sroa_idx, i64 7, i32 1, i1 false), !dbg !91
+  invoke void @_Z2f91A(%struct.A* %agg.tmp.i.i)
+          to label %invoke.cont unwind label %lpad, !dbg !92
+
+invoke.cont:                                      ; preds = %call.i.i.noexc
+  call void @llvm.lifetime.end(i64 24, i8* %1), !dbg !93
+  call void @llvm.dbg.value(metadata %struct.B* %d, i64 0, metadata !39, metadata !79), !dbg !82
+  %call1 = call %struct.B* @_ZN1BD1Ev(%struct.B* %d) #3, !dbg !94
+  ret void, !dbg !94
+
+lpad:                                             ; preds = %call.i.i.noexc, %entry
+  %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup, !dbg !94
+  call void @llvm.dbg.value(metadata %struct.B* %d, i64 0, metadata !39, metadata !79), !dbg !82
+  %call2 = call %struct.B* @_ZN1BD1Ev(%struct.B* %d) #3, !dbg !94
+  resume { i8*, i32 } %3, !dbg !94
+}
+
+declare %struct.B* @_ZN1BC1El(%struct.B*, i64)
+
+declare i32 @__gxx_personality_v0(...)
+
+; Function Attrs: nounwind
+declare %struct.B* @_ZN1BD1Ev(%struct.B*) #4
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #3
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #3
+
+attributes #1 = { nounwind readnone }
+attributes #2 = { nobuiltin }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind  }
+attributes #5 = { builtin }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!43, !44}
+!llvm.ident = !{!45}
+
+!0 = !{!"0x11\004\00clang version 3.7.0 \001\00\000\00\001", !1, !2, !3, !24, !40, !2} ; [ DW_TAG_compile_unit ] [/<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !""}
+!2 = !{}
+!3 = !{!4, !12, !14}
+!4 = !{!"0x13\00A\002\00192\0064\000\000\000", !5, null, null, !6, null, null, !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 2, size 192, align 64, offset 0] [def] [from ]
+!5 = !{!"test.cpp", !""}
+!6 = !{!7, !9, !11}
+!7 = !{!"0xd\00x4\003\008\008\000\000", !5, !"_ZTS1A", !8} ; [ DW_TAG_member ] [x4] [line 3, size 8, align 8, offset 0] [from bool]
+!8 = !{!"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!9 = !{!"0xd\00x5\004\0064\0064\0064\000", !5, !"_ZTS1A", !10} ; [ DW_TAG_member ] [x5] [line 4, size 64, align 64, offset 64] [from ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!11 = !{!"0xd\00x6\005\008\008\00128\000", !5, !"_ZTS1A", !8} ; [ DW_TAG_member ] [x6] [line 5, size 8, align 8, offset 128] [from bool]
+!12 = !{!"0xf\00\000\0064\0064\000\000", null, null, !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = !{!"0x13\00B\008\008\008\000\000\000", !5, null, null, !15, null, null, !"_ZTS1B"} ; [ DW_TAG_structure_type ] [B] [line 8, size 8, align 8, offset 0] [def] [from ]
+!15 = !{!16, !21}
+!16 = !{!"0x2e\00B\00B\00\009\000\000\000\000\00256\001\009", !5, !"_ZTS1B", !17, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 9] [B]
+!17 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{null, !19, !20}
+!19 = !{!"0xf\00\000\0064\0064\000\001088\00", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
+!20 = !{!"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!21 = !{!"0x2e\00~B\00~B\00\0010\000\000\000\000\00256\001\0010", !5, !"_ZTS1B", !22, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 10] [~B]
+!22 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = !{null, !19}
+!24 = !{!25, !31, !34}
+!25 = !{!"0x2e\00f13\00f13\00_Z3f131A\0013\000\001\000\000\00256\001\0013", !5, !26, !27, null, void (%struct.A*)* @_Z3f131A, null, null, !29} ; [ DW_TAG_subprogram ] [line 13] [def] [f13]
+!26 = !{!"0x29", !5}                              ; [ DW_TAG_file_type ] [/test.cpp]
+!27 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = !{null, !"_ZTS1A"}
+!29 = !{!30}
+!30 = !{!"0x101\00p1\0016777229\000", !25, !26, !"_ZTS1A"} ; [ DW_TAG_arg_variable ] [p1] [line 13]
+!31 = !{!"0x2e\00f11\00f11\00_Z3f111A\0017\000\001\000\000\00256\001\0017", !5, !26, !27, null, void (%struct.A*)* @_Z3f111A, null, null, !32} ; [ DW_TAG_subprogram ] [line 17] [def] [f11]
+!32 = !{!33}
+!33 = !{!"0x101\00p1\0016777233\000", !31, !26, !"_ZTS1A"} ; [ DW_TAG_arg_variable ] [p1] [line 17]
+!34 = !{!"0x2e\00f16\00f16\00_Z3f16v\0018\000\001\000\000\00256\001\0018", !5, !26, !35, null, void ()* @_Z3f16v, null, null, !37} ; [ DW_TAG_subprogram ] [line 18] [def] [f16]
+!35 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !36, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!36 = !{null}
+!37 = !{!38, !39}
+!38 = !{!"0x100\00c\0019\000", !34, !26, !"_ZTS1A"} ; [ DW_TAG_auto_variable ] [c] [line 19]
+!39 = !{!"0x100\00d\0020\000", !34, !26, !"_ZTS1B"} ; [ DW_TAG_auto_variable ] [d] [line 20]
+!40 = !{!41, !42}
+!41 = !{!"0x34\00a\00a\00\001\000\001", null, !26, !20, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!42 = !{!"0x34\00b\00b\00\007\000\001", null, !26, !12, i32** @b, null} ; [ DW_TAG_variable ] [b] [line 7] [def]
+!43 = !{i32 2, !"Dwarf Version", i32 2}
+!44 = !{i32 2, !"Debug Info Version", i32 2}
+!45 = !{!"clang version 3.7.0 "}
+!46 = !{!"0x102\006"}                             ; [ DW_TAG_expression ] [DW_OP_deref]
+!47 = !MDLocation(line: 13, column: 12, scope: !25)
+!48 = !MDLocation(line: 14, column: 37, scope: !25)
+!49 = !{!50, !50, i64 0}
+!50 = !{!"long", !51, i64 0}
+!51 = !{!"omnipotent char", !52, i64 0}
+!52 = !{!"Simple C/C++ TBAA"}
+!53 = !MDLocation(line: 14, column: 14, scope: !25)
+!54 = !MDLocation(line: 14, column: 5, scope: !25)
+!55 = !{!56, !56, i64 0}
+!56 = !{!"any pointer", !51, i64 0}
+!57 = !MDLocation(line: 15, column: 6, scope: !25)
+!58 = !{i64 0, i64 1, !59, i64 8, i64 8, !55, i64 16, i64 1, !59}
+!59 = !{!60, !60, i64 0}
+!60 = !{!"bool", !51, i64 0}
+!61 = !MDLocation(line: 15, column: 3, scope: !25)
+!62 = !MDLocation(line: 16, column: 1, scope: !25)
+!63 = !MDLocation(line: 17, column: 12, scope: !31)
+!64 = !MDLocation(line: 17, column: 22, scope: !31)
+!65 = !MDLocation(line: 13, column: 12, scope: !25, inlinedAt: !66)
+!66 = distinct !MDLocation(line: 17, column: 18, scope: !31)
+!67 = !MDLocation(line: 14, column: 37, scope: !25, inlinedAt: !66)
+!68 = !MDLocation(line: 14, column: 14, scope: !25, inlinedAt: !66)
+!69 = !MDLocation(line: 14, column: 5, scope: !25, inlinedAt: !66)
+!70 = !MDLocation(line: 15, column: 6, scope: !25, inlinedAt: !66)
+!71 = !MDLocation(line: 15, column: 3, scope: !25, inlinedAt: !66)
+!72 = !MDLocation(line: 16, column: 1, scope: !25, inlinedAt: !66)
+!73 = !MDLocation(line: 17, column: 27, scope: !31)
+!74 = !{!"0x101\00p1\0016777233\000", !31, !26, !"_ZTS1A", !75} ; [ DW_TAG_arg_variable ] [p1] [line 17]
+!75 = distinct !MDLocation(line: 22, column: 3, scope: !34)
+!76 = !{!"0x102\00157\008\00120"}                 ; [ DW_TAG_expression ] [DW_OP_bit_piece offset=8, size=120]
+!77 = !MDLocation(line: 17, column: 12, scope: !31, inlinedAt: !75)
+!78 = !{!"0x102\00157\00136\0056"}                ; [ DW_TAG_expression ] [DW_OP_bit_piece offset=136, size=56]
+!79 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!80 = !MDLocation(line: 19, column: 5, scope: !34)
+!81 = !MDLocation(line: 20, column: 7, scope: !34)
+!82 = !MDLocation(line: 20, column: 5, scope: !34)
+!83 = !{!"0x102\00157\000\008"}                   ; [ DW_TAG_expression ] [DW_OP_bit_piece offset=0, size=8]
+!84 = !{!"0x102\00157\00128\008"}                 ; [ DW_TAG_expression ] [DW_OP_bit_piece offset=128, size=8]
+!85 = !MDLocation(line: 13, column: 12, scope: !25, inlinedAt: !86)
+!86 = distinct !MDLocation(line: 17, column: 18, scope: !31, inlinedAt: !75)
+!87 = !MDLocation(line: 14, column: 37, scope: !25, inlinedAt: !86)
+!88 = !MDLocation(line: 14, column: 14, scope: !25, inlinedAt: !86)
+!89 = !MDLocation(line: 22, column: 7, scope: !34)
+!90 = !MDLocation(line: 14, column: 5, scope: !25, inlinedAt: !86)
+!91 = !MDLocation(line: 15, column: 6, scope: !25, inlinedAt: !86)
+!92 = !MDLocation(line: 15, column: 3, scope: !25, inlinedAt: !86)
+!93 = !MDLocation(line: 16, column: 1, scope: !25, inlinedAt: !86)
+!94 = !MDLocation(line: 23, column: 1, scope: !34)
diff --git a/test/DebugInfo/AArch64/little-endian-dump.ll b/test/DebugInfo/AArch64/little-endian-dump.ll
index 5c7f336..65330c9 100644
--- a/test/DebugInfo/AArch64/little-endian-dump.ll
+++ b/test/DebugInfo/AArch64/little-endian-dump.ll
@@ -8,9 +8,9 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"empty.c", metadata !"/a"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!5 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{i32 786449, !1, i32 12, !"clang version 3.6.0 ", i1 false, !"", i32 0, !2, !2, !2, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = !{!"empty.c", !"/a"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 1}
+!5 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/AArch64/processes-relocations.ll b/test/DebugInfo/AArch64/processes-relocations.ll
index 5ce9262..1a9dfd7 100644
--- a/test/DebugInfo/AArch64/processes-relocations.ll
+++ b/test/DebugInfo/AArch64/processes-relocations.ll
@@ -7,9 +7,9 @@
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"empty.c", metadata !"/a"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!5 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{i32 786449, !1, i32 12, !"clang version 3.6.0 ", i1 false, !"", i32 0, !2, !2, !2, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = !{!"empty.c", !"/a"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 1}
+!5 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/AArch64/struct_by_value.ll b/test/DebugInfo/AArch64/struct_by_value.ll
index b9adb45..9996d27 100644
--- a/test/DebugInfo/AArch64/struct_by_value.ll
+++ b/test/DebugInfo/AArch64/struct_by_value.ll
@@ -32,7 +32,7 @@ target triple = "arm64-apple-ios3.0.0"
 ; Function Attrs: nounwind ssp
 define i32 @return_five_int(%struct.five* %f) #0 {
 entry:
-  call void @llvm.dbg.declare(metadata !{%struct.five* %f}, metadata !17, metadata !{metadata !"0x102"}), !dbg !18
+  call void @llvm.dbg.declare(metadata %struct.five* %f, metadata !17, metadata !{!"0x102\006"}), !dbg !18
   %a = getelementptr inbounds %struct.five* %f, i32 0, i32 0, !dbg !19
   %0 = load i32* %a, align 4, !dbg !19
   ret i32 %0, !dbg !19
@@ -47,24 +47,24 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!16, !20}
 
-!0 = metadata !{metadata !"0x11\0012\00LLVM version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [struct_by_value.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"struct_by_value.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00return_five_int\00return_five_int\00\0013\000\001\000\006\00256\000\0014", metadata !1, metadata !5, metadata !6, null, i32 (%struct.five*)* @return_five_int, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 14] [return_five_int]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [struct_by_value.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !9}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x13\00five\001\00160\0032\000\000\000", metadata !1, null, null, metadata !10, null, null, null} ; [ DW_TAG_structure_type ] [five] [line 1, size 160, align 32, offset 0] [def] [from ]
-!10 = metadata !{metadata !11, metadata !12, metadata !13, metadata !14, metadata !15}
-!11 = metadata !{metadata !"0xd\00a\003\0032\0032\000\000", metadata !1, metadata !9, metadata !8} ; [ DW_TAG_member ] [a] [line 3, size 32, align 32, offset 0] [from int]
-!12 = metadata !{metadata !"0xd\00b\004\0032\0032\0032\000", metadata !1, metadata !9, metadata !8} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 32] [from int]
-!13 = metadata !{metadata !"0xd\00c\005\0032\0032\0064\000", metadata !1, metadata !9, metadata !8} ; [ DW_TAG_member ] [c] [line 5, size 32, align 32, offset 64] [from int]
-!14 = metadata !{metadata !"0xd\00d\006\0032\0032\0096\000", metadata !1, metadata !9, metadata !8} ; [ DW_TAG_member ] [d] [line 6, size 32, align 32, offset 96] [from int]
-!15 = metadata !{metadata !"0xd\00e\007\0032\0032\00128\000", metadata !1, metadata !9, metadata !8} ; [ DW_TAG_member ] [e] [line 7, size 32, align 32, offset 128] [from int]
-!16 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!17 = metadata !{metadata !"0x101\00f\0016777229\008192", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [f] [line 13]
-!18 = metadata !{i32 13, i32 0, metadata !4, null}
-!19 = metadata !{i32 16, i32 0, metadata !4, null}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00LLVM version 3.4 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [struct_by_value.c] [DW_LANG_C99]
+!1 = !{!"struct_by_value.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00return_five_int\00return_five_int\00\0013\000\001\000\006\00256\000\0014", !1, !5, !6, null, i32 (%struct.five*)* @return_five_int, null, null, !2} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 14] [return_five_int]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [struct_by_value.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !9}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x13\00five\001\00160\0032\000\000\000", !1, null, null, !10, null, null, null} ; [ DW_TAG_structure_type ] [five] [line 1, size 160, align 32, offset 0] [def] [from ]
+!10 = !{!11, !12, !13, !14, !15}
+!11 = !{!"0xd\00a\003\0032\0032\000\000", !1, !9, !8} ; [ DW_TAG_member ] [a] [line 3, size 32, align 32, offset 0] [from int]
+!12 = !{!"0xd\00b\004\0032\0032\0032\000", !1, !9, !8} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 32] [from int]
+!13 = !{!"0xd\00c\005\0032\0032\0064\000", !1, !9, !8} ; [ DW_TAG_member ] [c] [line 5, size 32, align 32, offset 64] [from int]
+!14 = !{!"0xd\00d\006\0032\0032\0096\000", !1, !9, !8} ; [ DW_TAG_member ] [d] [line 6, size 32, align 32, offset 96] [from int]
+!15 = !{!"0xd\00e\007\0032\0032\00128\000", !1, !9, !8} ; [ DW_TAG_member ] [e] [line 7, size 32, align 32, offset 128] [from int]
+!16 = !{i32 2, !"Dwarf Version", i32 2}
+!17 = !{!"0x101\00f\0016777229\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [f] [line 13]
+!18 = !MDLocation(line: 13, scope: !4)
+!19 = !MDLocation(line: 16, scope: !4)
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/ARM/PR16736.ll b/test/DebugInfo/ARM/PR16736.ll
index afa0ece..7c99ae2 100644
--- a/test/DebugInfo/ARM/PR16736.ll
+++ b/test/DebugInfo/ARM/PR16736.ll
@@ -15,14 +15,14 @@ target triple = "thumbv7-apple-ios"
 ; Function Attrs: nounwind
 define arm_aapcscc void @_Z1hiiiif(i32, i32, i32, i32, float %x) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !18
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !18
-  tail call void @llvm.dbg.value(metadata !{i32 %2}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !18
-  tail call void @llvm.dbg.value(metadata !{i32 %3}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !18
-  tail call void @llvm.dbg.value(metadata !{float %x}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !12, metadata !{!"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 %2, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 %3, i64 0, metadata !15, metadata !{!"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata float %x, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !18
   %call = tail call arm_aapcscc i32 @_Z1fv() #3, !dbg !19
   %conv = sitofp i32 %call to float, !dbg !19
-  tail call void @llvm.dbg.value(metadata !{float %conv}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !19
+  tail call void @llvm.dbg.value(metadata float %conv, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !19
   tail call arm_aapcscc void @_Z1gf(float %conv) #3, !dbg !19
   ret void, !dbg !20
 }
@@ -41,25 +41,25 @@ attributes #3 = { nounwind }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!17, !21}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 190804) (llvm/trunk 190797)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [//<unknown>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"/<unknown>", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00h\00h\00_Z1hiiiif\003\000\001\000\006\00256\001\003", metadata !5, metadata !6, metadata !7, null, void (i32, i32, i32, i32, float)* @_Z1hiiiif, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 3] [def] [h]
-!5 = metadata !{metadata !"/arm.cpp", metadata !""}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [//arm.cpp]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9, metadata !9, metadata !9, metadata !9, metadata !10}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
-!11 = metadata !{metadata !12, metadata !13, metadata !14, metadata !15, metadata !16}
-!12 = metadata !{metadata !"0x101\00\0016777219\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 3]
-!13 = metadata !{metadata !"0x101\00\0033554435\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 3]
-!14 = metadata !{metadata !"0x101\00\0050331651\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 3]
-!15 = metadata !{metadata !"0x101\00\0067108867\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 3]
-!16 = metadata !{metadata !"0x101\00x\0083886083\000", metadata !4, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ] [x] [line 3]
-!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!18 = metadata !{i32 3, i32 0, metadata !4, null}
-!19 = metadata !{i32 4, i32 0, metadata !4, null}
-!20 = metadata !{i32 5, i32 0, metadata !4, null}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 (trunk 190804) (llvm/trunk 190797)\001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [//<unknown>] [DW_LANG_C_plus_plus]
+!1 = !{!"/<unknown>", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00h\00h\00_Z1hiiiif\003\000\001\000\006\00256\001\003", !5, !6, !7, null, void (i32, i32, i32, i32, float)* @_Z1hiiiif, null, null, !11} ; [ DW_TAG_subprogram ] [line 3] [def] [h]
+!5 = !{!"/arm.cpp", !""}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [//arm.cpp]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9, !9, !9, !9, !10}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!11 = !{!12, !13, !14, !15, !16}
+!12 = !{!"0x101\00\0016777219\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [line 3]
+!13 = !{!"0x101\00\0033554435\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [line 3]
+!14 = !{!"0x101\00\0050331651\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [line 3]
+!15 = !{!"0x101\00\0067108867\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [line 3]
+!16 = !{!"0x101\00x\0083886083\000", !4, !6, !10} ; [ DW_TAG_arg_variable ] [x] [line 3]
+!17 = !{i32 2, !"Dwarf Version", i32 4}
+!18 = !MDLocation(line: 3, scope: !4)
+!19 = !MDLocation(line: 4, scope: !4)
+!20 = !MDLocation(line: 5, scope: !4)
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/ARM/big-endian-dump.ll b/test/DebugInfo/ARM/big-endian-dump.ll
index e35f097..7bfe24e 100644
--- a/test/DebugInfo/ARM/big-endian-dump.ll
+++ b/test/DebugInfo/ARM/big-endian-dump.ll
@@ -8,11 +8,11 @@ target datalayout = "E-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
 !llvm.module.flags = !{!3, !4, !5, !6}
 !llvm.ident = !{!7}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"empty.c", metadata !"/a"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!5 = metadata !{i32 1, metadata !"wchar_size", i32 4}
-!6 = metadata !{i32 1, metadata !"min_enum_size", i32 4}
-!7 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{i32 786449, !1, i32 12, !"clang version 3.6.0 ", i1 false, !"", i32 0, !2, !2, !2, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = !{!"empty.c", !"/a"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 1}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{i32 1, !"min_enum_size", i32 4}
+!7 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/ARM/cfi-eof-prologue.ll b/test/DebugInfo/ARM/cfi-eof-prologue.ll
new file mode 100644
index 0000000..599806b
--- /dev/null
+++ b/test/DebugInfo/ARM/cfi-eof-prologue.ll
@@ -0,0 +1,115 @@
+; struct A {
+;   A();
+;   virtual ~A();
+; };
+; struct B : A {
+;   B();
+;   virtual ~B();
+; };
+; B::B() {}
+; CHECK: __ZN1BC1Ev:
+; CHECK:     .loc	1 [[@LINE-2]] 0 prologue_end
+; CHECK-NOT: .loc	1 0 0 prologue_end
+
+; The location of the prologue_end marker should not be affected by the presence
+; of CFI instructions.
+
+; RUN: llc -O0 -filetype=asm -mtriple=thumbv7-apple-ios < %s | FileCheck %s
+; RUN: llc -O0 -filetype=asm -mtriple=thumbv6-apple-ios < %s | FileCheck %s
+
+; ModuleID = 'test1.cpp'
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7-apple-ios"
+
+%struct.B = type { %struct.A }
+%struct.A = type { i32 (...)** }
+
+@_ZTV1B = external unnamed_addr constant [4 x i8*]
+
+; Function Attrs: nounwind
+define %struct.B* @_ZN1BC2Ev(%struct.B* %this) unnamed_addr #0 align 2 {
+entry:
+  tail call void @llvm.dbg.value(metadata %struct.B* %this, i64 0, metadata !30, metadata !40), !dbg !41
+  %0 = getelementptr inbounds %struct.B* %this, i32 0, i32 0, !dbg !42
+  %call = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #3, !dbg !42
+  %1 = getelementptr inbounds %struct.B* %this, i32 0, i32 0, i32 0, !dbg !42
+  store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 4, !dbg !42, !tbaa !43
+  ret %struct.B* %this, !dbg !42
+}
+
+declare %struct.A* @_ZN1AC2Ev(%struct.A*)
+
+; Function Attrs: nounwind
+define %struct.B* @_ZN1BC1Ev(%struct.B* %this) unnamed_addr #0 align 2 {
+entry:
+  tail call void @llvm.dbg.value(metadata %struct.B* %this, i64 0, metadata !34, metadata !40), !dbg !46
+  tail call void @llvm.dbg.value(metadata %struct.B* %this, i64 0, metadata !47, metadata !40) #3, !dbg !49
+  %0 = getelementptr inbounds %struct.B* %this, i32 0, i32 0, !dbg !50
+  %call.i = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #3, !dbg !50
+  %1 = getelementptr inbounds %struct.B* %this, i32 0, i32 0, i32 0, !dbg !50
+  store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 4, !dbg !50, !tbaa !43
+  ret %struct.B* %this, !dbg !48
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+
+attributes #0 = { nounwind }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!35, !36, !37, !38}
+!llvm.ident = !{!39}
+
+!0 = !{!"0x11\004\00clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)\001\00\000\00\001", !1, !2, !3, !27, !2, !2} ; [ DW_TAG_compile_unit ] [<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !""}
+!2 = !{}
+!3 = !{!4, !13}
+!4 = !{!"0x13\00B\005\0032\0032\000\000\000", !5, null, null, !6, !"_ZTS1A", null, !"_ZTS1B"} ; [ DW_TAG_structure_type ] [B] [line 5, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!"test1.cpp", !""}
+!6 = !{!7, !8, !12}
+!7 = !{!"0x1c\00\000\000\000\000\000", null, !"_ZTS1B", !"_ZTS1A"} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from _ZTS1A]
+!8 = !{!"0x2e\00B\00B\00\006\000\000\000\000\00256\001\006", !5, !"_ZTS1B", !9, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 6] [B]
+!9 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{null, !11}
+!11 = !{!"0xf\00\000\0032\0032\000\001088\00", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [artificial] [from _ZTS1B]
+!12 = !{!"0x2e\00~B\00~B\00\007\000\000\001\000\00256\001\007", !5, !"_ZTS1B", !9, !"_ZTS1B", null, null, null, null} ; [ DW_TAG_subprogram ] [line 7] [~B]
+!13 = !{!"0x13\00A\001\0032\0032\000\000\000", !5, null, null, !14, !"_ZTS1A", null, !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!14 = !{!15, !22, !26}
+!15 = !{!"0xd\00_vptr$A\000\0032\000\000\0064", !5, !16, !17} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 32, align 0, offset 0] [artificial] [from ]
+!16 = !{!"0x29", !5}                              ; [ DW_TAG_file_type ] [test1.cpp]
+!17 = !{!"0xf\00\000\0032\000\000\000", null, null, !18} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 0, offset 0] [from __vtbl_ptr_type]
+!18 = !{!"0xf\00__vtbl_ptr_type\000\0032\000\000\000", null, null, !19} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 32, align 0, offset 0] [from ]
+!19 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = !{!21}
+!21 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!22 = !{!"0x2e\00A\00A\00\002\000\000\000\000\00256\001\002", !5, !"_ZTS1A", !23, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [A]
+!23 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !24, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = !{null, !25}
+!25 = !{!"0xf\00\000\0032\0032\000\001088\00", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [artificial] [from _ZTS1A]
+!26 = !{!"0x2e\00~A\00~A\00\003\000\000\001\000\00256\001\003", !5, !"_ZTS1A", !23, !"_ZTS1A", null, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [~A]
+!27 = !{!28, !32}
+!28 = !{!"0x2e\00B\00B\00_ZN1BC2Ev\009\000\001\000\000\00256\001\009", !5, !"_ZTS1B", !9, null, %struct.B* (%struct.B*)* @_ZN1BC2Ev, null, !8, !29} ; [ DW_TAG_subprogram ] [line 9] [def] [B]
+!29 = !{!30}
+!30 = !{!"0x101\00this\0016777216\001088", !28, null, !31} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!31 = !{!"0xf\00\000\0032\0032\000\000", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from _ZTS1B]
+!32 = !{!"0x2e\00B\00B\00_ZN1BC1Ev\009\000\001\000\000\00256\001\009", !5, !"_ZTS1B", !9, null, %struct.B* (%struct.B*)* @_ZN1BC1Ev, null, !8, !33} ; [ DW_TAG_subprogram ] [line 9] [def] [B]
+!33 = !{!34}
+!34 = !{!"0x101\00this\0016777216\001088", !32, null, !31} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!35 = !{i32 2, !"Dwarf Version", i32 4}
+!36 = !{i32 2, !"Debug Info Version", i32 2}
+!37 = !{i32 1, !"wchar_size", i32 4}
+!38 = !{i32 1, !"min_enum_size", i32 4}
+!39 = !{!"clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)"}
+!40 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!41 = !MDLocation(line: 0, scope: !28)
+!42 = !MDLocation(line: 9, scope: !28)
+!43 = !{!44, !44, i64 0}
+!44 = !{!"vtable pointer", !45, i64 0}
+!45 = !{!"Simple C/C++ TBAA"}
+!46 = !MDLocation(line: 0, scope: !32)
+!47 = !{!"0x101\00this\0016777216\001088", !28, null, !31, !48} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!48 = !MDLocation(line: 9, scope: !32)
+!49 = !MDLocation(line: 0, scope: !28, inlinedAt: !48)
+!50 = !MDLocation(line: 9, scope: !28, inlinedAt: !48)
diff --git a/test/DebugInfo/ARM/line.test b/test/DebugInfo/ARM/line.test
new file mode 100644
index 0000000..47feb46
--- /dev/null
+++ b/test/DebugInfo/ARM/line.test
@@ -0,0 +1,7 @@
+; RUN: llc -mtriple=arm-none-linux -O0 -filetype=asm < %S/../Inputs/line.ll | FileCheck %S/../Inputs/line.ll
+
+; This is more complex than it looked. It's mixed up somewhere in SelectionDAG
+; (legalized as br_cc, losing the separation between the comparison and the
+; branch, then further lowered to CMPri + brcc but without the fidelity that
+; those two instructions are on separate lines)
+; XFAIL: *
diff --git a/test/DebugInfo/ARM/little-endian-dump.ll b/test/DebugInfo/ARM/little-endian-dump.ll
index da60657..b5f86e9 100644
--- a/test/DebugInfo/ARM/little-endian-dump.ll
+++ b/test/DebugInfo/ARM/little-endian-dump.ll
@@ -8,11 +8,11 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
 !llvm.module.flags = !{!3, !4, !5, !6}
 !llvm.ident = !{!7}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"empty.c", metadata !"/a"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!5 = metadata !{i32 1, metadata !"wchar_size", i32 4}
-!6 = metadata !{i32 1, metadata !"min_enum_size", i32 4}
-!7 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{i32 786449, !1, i32 12, !"clang version 3.6.0 ", i1 false, !"", i32 0, !2, !2, !2, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = !{!"empty.c", !"/a"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 1}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{i32 1, !"min_enum_size", i32 4}
+!7 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll b/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll
index 764c57d..8d75069 100644
--- a/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll
+++ b/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll
@@ -19,18 +19,18 @@ target triple = "thumbv7-apple-ios8.0.0"
 ; Function Attrs: nounwind optsize readnone
 define void @run(float %r) #0 {
 entry:
-  tail call void @llvm.dbg.declare(metadata !{float %r}, metadata !11, metadata !{metadata !"0x102"}), !dbg !22
+  tail call void @llvm.dbg.declare(metadata float %r, metadata !11, metadata !{!"0x102"}), !dbg !22
   %conv = fptosi float %r to i32, !dbg !23
-  tail call void @llvm.dbg.declare(metadata !{i32 %conv}, metadata !12, metadata !{metadata !"0x102"}), !dbg !23
+  tail call void @llvm.dbg.declare(metadata i32 %conv, metadata !12, metadata !{!"0x102"}), !dbg !23
   %vla = alloca float, i32 %conv, align 4, !dbg !24
-  tail call void @llvm.dbg.declare(metadata !{float* %vla}, metadata !14, metadata !{metadata !"0x102"}), !dbg !24
+  tail call void @llvm.dbg.declare(metadata float* %vla, metadata !14, metadata !{!"0x102\006"}), !dbg !24
 ; The VLA alloca should be described by a dbg.declare:
-; CHECK: call void @llvm.dbg.declare(metadata !{float* %vla}, metadata ![[VLA:.*]], metadata {{.*}})
+; CHECK: call void @llvm.dbg.declare(metadata float* %vla, metadata ![[VLA:.*]], metadata {{.*}})
 ; The VLA alloca and following store into the array should not be lowered to like this:
-; CHECK-NOT:  call void @llvm.dbg.value(metadata !{float %r}, i64 0, metadata ![[VLA]])
+; CHECK-NOT:  call void @llvm.dbg.value(metadata float %r, i64 0, metadata ![[VLA]])
 ; the backend interprets this as "vla has the location of %r".
   store float %r, float* %vla, align 4, !dbg !25, !tbaa !26
-  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !30
   %cmp8 = icmp sgt i32 %conv, 0, !dbg !30
   br i1 %cmp8, label %for.body, label %for.end, !dbg !30
 
@@ -41,7 +41,7 @@ for.body:                                         ; preds = %entry, %for.body.fo
   %div = fdiv float %0, %r, !dbg !31
   store float %div, float* %arrayidx2, align 4, !dbg !31, !tbaa !26
   %inc = add nsw i32 %i.09, 1, !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i32 %inc, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !30
   %exitcond = icmp eq i32 %inc, %conv, !dbg !30
   br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !dbg !30
 
@@ -67,37 +67,37 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!20, !33}
 !llvm.ident = !{!21}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Volumes/Data/radar/15464571/<unknown>] [DW_LANG_C99]
-!1 = metadata !{metadata !"<unknown>", metadata !"/Volumes/Data/radar/15464571"}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00run\00run\00\001\000\001\000\006\00256\001\002", metadata !5, metadata !6, metadata !7, null, void (float)* @run, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [run]
-!5 = metadata !{metadata !"test.c", metadata !"/Volumes/Data/radar/15464571"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/Volumes/Data/radar/15464571/test.c]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9}
-!9 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
-!10 = metadata !{metadata !11, metadata !12, metadata !14, metadata !18}
-!11 = metadata !{metadata !"0x101\00r\0016777217\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [r] [line 1]
-!12 = metadata !{metadata !"0x100\00count\003\000", metadata !4, metadata !6, metadata !13} ; [ DW_TAG_auto_variable ] [count] [line 3]
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{metadata !"0x100\00vla\004\008192", metadata !4, metadata !6, metadata !15} ; [ DW_TAG_auto_variable ] [vla] [line 4]
-!15 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !9, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from float]
-!16 = metadata !{metadata !17}
-!17 = metadata !{metadata !"0x21\000\00-1"}       ; [ DW_TAG_subrange_type ] [unbounded]
-!18 = metadata !{metadata !"0x100\00i\006\000", metadata !19, metadata !6, metadata !13} ; [ DW_TAG_auto_variable ] [i] [line 6]
-!19 = metadata !{metadata !"0xb\006\000\000", metadata !5, metadata !4} ; [ DW_TAG_lexical_block ] [/Volumes/Data/radar/15464571/test.c]
-!20 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!21 = metadata !{metadata !"clang version 3.4 "}
-!22 = metadata !{i32 1, i32 0, metadata !4, null}
-!23 = metadata !{i32 3, i32 0, metadata !4, null}
-!24 = metadata !{i32 4, i32 0, metadata !4, null}
-!25 = metadata !{i32 5, i32 0, metadata !4, null}
-!26 = metadata !{metadata !27, metadata !27, i64 0}
-!27 = metadata !{metadata !"float", metadata !28, i64 0}
-!28 = metadata !{metadata !"omnipotent char", metadata !29, i64 0}
-!29 = metadata !{metadata !"Simple C/C++ TBAA"}
-!30 = metadata !{i32 6, i32 0, metadata !19, null}
-!31 = metadata !{i32 7, i32 0, metadata !19, null}
-!32 = metadata !{i32 8, i32 0, metadata !4, null}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 \001\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/Volumes/Data/radar/15464571/<unknown>] [DW_LANG_C99]
+!1 = !{!"<unknown>", !"/Volumes/Data/radar/15464571"}
+!2 = !{i32 0}
+!3 = !{!4}
+!4 = !{!"0x2e\00run\00run\00\001\000\001\000\006\00256\001\002", !5, !6, !7, null, void (float)* @run, null, null, !10} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [run]
+!5 = !{!"test.c", !"/Volumes/Data/radar/15464571"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [/Volumes/Data/radar/15464571/test.c]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9}
+!9 = !{!"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!10 = !{!11, !12, !14, !18}
+!11 = !{!"0x101\00r\0016777217\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [r] [line 1]
+!12 = !{!"0x100\00count\003\000", !4, !6, !13} ; [ DW_TAG_auto_variable ] [count] [line 3]
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = !{!"0x100\00vla\004\000", !4, !6, !15} ; [ DW_TAG_auto_variable ] [vla] [line 4]
+!15 = !{!"0x1\00\000\000\0032\000\000", null, null, !9, !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from float]
+!16 = !{!17}
+!17 = !{!"0x21\000\00-1"}       ; [ DW_TAG_subrange_type ] [unbounded]
+!18 = !{!"0x100\00i\006\000", !19, !6, !13} ; [ DW_TAG_auto_variable ] [i] [line 6]
+!19 = !{!"0xb\006\000\000", !5, !4} ; [ DW_TAG_lexical_block ] [/Volumes/Data/radar/15464571/test.c]
+!20 = !{i32 2, !"Dwarf Version", i32 2}
+!21 = !{!"clang version 3.4 "}
+!22 = !MDLocation(line: 1, scope: !4)
+!23 = !MDLocation(line: 3, scope: !4)
+!24 = !MDLocation(line: 4, scope: !4)
+!25 = !MDLocation(line: 5, scope: !4)
+!26 = !{!27, !27, i64 0}
+!27 = !{!"float", !28, i64 0}
+!28 = !{!"omnipotent char", !29, i64 0}
+!29 = !{!"Simple C/C++ TBAA"}
+!30 = !MDLocation(line: 6, scope: !19)
+!31 = !MDLocation(line: 7, scope: !19)
+!32 = !MDLocation(line: 8, scope: !4)
+!33 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/ARM/processes-relocations.ll b/test/DebugInfo/ARM/processes-relocations.ll
index 8edd954..b3db457 100644
--- a/test/DebugInfo/ARM/processes-relocations.ll
+++ b/test/DebugInfo/ARM/processes-relocations.ll
@@ -7,9 +7,9 @@
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"empty.c", metadata !"/a"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!5 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{i32 786449, !1, i32 12, !"clang version 3.6.0 ", i1 false, !"", i32 0, !2, !2, !2, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = !{!"empty.c", !"/a"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 1}
+!5 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/ARM/s-super-register.ll b/test/DebugInfo/ARM/s-super-register.ll
index 0120045..62a315e 100644
--- a/test/DebugInfo/ARM/s-super-register.ll
+++ b/test/DebugInfo/ARM/s-super-register.ll
@@ -12,7 +12,7 @@ target triple = "thumbv7-apple-macosx10.6.7"
 define void @_Z3foov() optsize ssp {
 entry:
   %call = tail call float @_Z3barv() optsize, !dbg !11
-  tail call void @llvm.dbg.value(metadata !{float %call}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !11
+  tail call void @llvm.dbg.value(metadata float %call, i64 0, metadata !5, metadata !{!"0x102"}), !dbg !11
   %call16 = tail call float @_Z2f2v() optsize, !dbg !12
   %cmp7 = fcmp olt float %call, %call16, !dbg !12
   br i1 %cmp7, label %for.body, label %for.end, !dbg !12
@@ -40,24 +40,24 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.0 (trunk 130845)\001\00\000\00\001", metadata !18, metadata !19, metadata !19, metadata !16, null,  null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\005\000\001\000\006\00256\001\005", metadata !18, metadata !2, metadata !3, null, void ()* @_Z3foov, null, null, metadata !17} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
-!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{metadata !"0x100\00k\006\000", metadata !6, metadata !2, metadata !7} ; [ DW_TAG_auto_variable ]
-!6 = metadata !{metadata !"0xb\005\0012\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
-!7 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !0} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"0x100\00y\008\000", metadata !9, metadata !2, metadata !7} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{metadata !"0xb\007\0025\002", metadata !18, metadata !10} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{metadata !"0xb\007\003\001", metadata !18, metadata !6} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 6, i32 18, metadata !6, null}
-!12 = metadata !{i32 7, i32 3, metadata !6, null}
-!13 = metadata !{i32 8, i32 20, metadata !9, null}
-!14 = metadata !{i32 7, i32 20, metadata !10, null}
-!15 = metadata !{i32 10, i32 1, metadata !6, null}
-!16 = metadata !{metadata !1}
-!17 = metadata !{metadata !5, metadata !8}
-!18 = metadata !{metadata !"k.cc", metadata !"/private/tmp"}
-!19 = metadata !{i32 0}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.0 (trunk 130845)\001\00\000\00\001", !18, !19, !19, !16, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00foo\00foo\00_Z3foov\005\000\001\000\006\00256\001\005", !18, !2, !3, null, void ()* @_Z3foov, null, null, !17} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!2 = !{!"0x29", !18} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !18, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!5 = !{!"0x100\00k\006\000", !6, !2, !7} ; [ DW_TAG_auto_variable ]
+!6 = !{!"0xb\005\0012\000", !18, !1} ; [ DW_TAG_lexical_block ]
+!7 = !{!"0x24\00float\000\0032\0032\000\000\004", null, !0} ; [ DW_TAG_base_type ]
+!8 = !{!"0x100\00y\008\000", !9, !2, !7} ; [ DW_TAG_auto_variable ]
+!9 = !{!"0xb\007\0025\002", !18, !10} ; [ DW_TAG_lexical_block ]
+!10 = !{!"0xb\007\003\001", !18, !6} ; [ DW_TAG_lexical_block ]
+!11 = !MDLocation(line: 6, column: 18, scope: !6)
+!12 = !MDLocation(line: 7, column: 3, scope: !6)
+!13 = !MDLocation(line: 8, column: 20, scope: !9)
+!14 = !MDLocation(line: 7, column: 20, scope: !10)
+!15 = !MDLocation(line: 10, column: 1, scope: !6)
+!16 = !{!1}
+!17 = !{!5, !8}
+!18 = !{!"k.cc", !"/private/tmp"}
+!19 = !{i32 0}
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/ARM/sectionorder.ll b/test/DebugInfo/ARM/sectionorder.ll
index 24733d9..41677c7 100644
--- a/test/DebugInfo/ARM/sectionorder.ll
+++ b/test/DebugInfo/ARM/sectionorder.ll
@@ -11,8 +11,8 @@ target triple = "thumbv7-apple-ios"
 !llvm.module.flags = !{!3, !4}
 !llvm.dbg.cu = !{!0}
 
-!0 = metadata !{metadata !"0x11\0012\00LLVM\001\00\00\00\00", metadata !5, metadata !1, metadata !1, metadata !1, metadata !1, null} ; [ DW_TAG_compile_unit ] [/Volumes/Data/radar/15623193/test.c] [DW_LANG_C99]
-!1 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!5 = metadata !{metadata !"test.c", metadata !"/Volumes/Data/radar/15623193"}
+!0 = !{!"0x11\0012\00LLVM\001\00\00\00\00", !5, !1, !1, !1, !1, null} ; [ DW_TAG_compile_unit ] [/Volumes/Data/radar/15623193/test.c] [DW_LANG_C99]
+!1 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 2}
+!4 = !{i32 1, !"Debug Info Version", i32 2}
+!5 = !{!"test.c", !"/Volumes/Data/radar/15623193"}
diff --git a/test/DebugInfo/ARM/selectiondag-deadcode.ll b/test/DebugInfo/ARM/selectiondag-deadcode.ll
index 76e19ef..a974692 100644
--- a/test/DebugInfo/ARM/selectiondag-deadcode.ll
+++ b/test/DebugInfo/ARM/selectiondag-deadcode.ll
@@ -13,15 +13,15 @@ _ZN7Vector39NormalizeEv.exit:                     ; preds = %1, %0
   ; and SelectionDAGISel crashes.  It should definitely not
   ; crash. Drop the dbg_value instead.
   ; CHECK-NOT: "matrix"
-  tail call void @llvm.dbg.declare(metadata !{%class.Matrix3.0.6.10* %agg.result}, metadata !45, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.declare(metadata %class.Matrix3.0.6.10* %agg.result, metadata !45, metadata !{!"0x102\006"})
   %2 = getelementptr inbounds %class.Matrix3.0.6.10* %agg.result, i32 0, i32 0, i32 8
   ret void
 }
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 declare arm_aapcscc void @_ZL4Sqrtd() #2
-!4 = metadata !{metadata !"0x2\00Matrix3\0020\00288\0032\000\000\000", metadata !5, null, null, null, null, null, metadata !"_ZTS7Matrix3"} ; [ DW_TAG_class_type ] [Matrix3] [line 20, size 288, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !"test.ii", metadata !"/Volumes/Data/radar/15094721"}
-!39 = metadata !{metadata !"0x2e\00GetMatrix\00GetMatrix\00_Z9GetMatrixv\0032\000\001\000\006\00256\001\0032", metadata !5, metadata !40, metadata !41, null, void (%class.Matrix3.0.6.10*)* @_Z9GetMatrixv, null, null, null} ; [ DW_TAG_subprogram ] [line 32] [def] [GetMatrix]
-!40 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [/Volumes/Data/radar/15094721/test.ii]
-!41 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, null, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!45 = metadata !{metadata !"0x100\00matrix\0035\008192", metadata !39, metadata !40, metadata !4} ; [ DW_TAG_auto_variable ] [matrix] [line 35]
+!4 = !{!"0x2\00Matrix3\0020\00288\0032\000\000\000", !5, null, null, null, null, null, !"_ZTS7Matrix3"} ; [ DW_TAG_class_type ] [Matrix3] [line 20, size 288, align 32, offset 0] [def] [from ]
+!5 = !{!"test.ii", !"/Volumes/Data/radar/15094721"}
+!39 = !{!"0x2e\00GetMatrix\00GetMatrix\00_Z9GetMatrixv\0032\000\001\000\006\00256\001\0032", !5, !40, !41, null, void (%class.Matrix3.0.6.10*)* @_Z9GetMatrixv, null, null, null} ; [ DW_TAG_subprogram ] [line 32] [def] [GetMatrix]
+!40 = !{!"0x29", !5}         ; [ DW_TAG_file_type ] [/Volumes/Data/radar/15094721/test.ii]
+!41 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, null, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!45 = !{!"0x100\00matrix\0035\000", !39, !40, !4} ; [ DW_TAG_auto_variable ] [matrix] [line 35]
diff --git a/test/DebugInfo/ARM/tls.ll b/test/DebugInfo/ARM/tls.ll
index c4be030..39436fe 100644
--- a/test/DebugInfo/ARM/tls.ll
+++ b/test/DebugInfo/ARM/tls.ll
@@ -16,13 +16,13 @@
 ; The debug relocation of the address of the tls variable
 ; CHECK: .long x(tlsldo)
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/tls.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"tls.c", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x34\00x\00x\00\001\000\001", null, metadata !5, metadata !6, i32* @x, null} ; [ DW_TAG_variable ] [x] [line 1] [def]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.c]
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5 "}
+!0 = !{!"0x11\0012\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !2, !3, !2} ; [ DW_TAG_compile_unit ] [/tmp/tls.c] [DW_LANG_C99]
+!1 = !{!"tls.c", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x34\00x\00x\00\001\000\001", null, !5, !6, i32* @x, null} ; [ DW_TAG_variable ] [x] [line 1] [def]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/tls.c]
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5 "}
diff --git a/test/DebugInfo/COFF/asan-module-ctor.ll b/test/DebugInfo/COFF/asan-module-ctor.ll
index a62604c..e2c7aef 100644
--- a/test/DebugInfo/COFF/asan-module-ctor.ll
+++ b/test/DebugInfo/COFF/asan-module-ctor.ll
@@ -13,6 +13,9 @@
 ; X86-NEXT: calll   ___asan_init_v3
 ; X86-NEXT: retl
 
+; Make sure we don't put any DWARF debug info for ASan-instrumented modules.
+; X86-NOT: DWARF
+
 ; ModuleID = 'asan.c'
 target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 target triple = "i686-pc-win32"
@@ -78,14 +81,14 @@ attributes #0 = { nounwind sanitize_address "less-precise-fpmad"="false" "no-fra
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/asan.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"asan.c", metadata !"D:\5C"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 ()* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [D:\/asan.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5.0 "}
-!10 = metadata !{i32 2, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [D:\/asan.c] [DW_LANG_C99]
+!1 = !{!"asan.c", !"D:\5C"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 ()* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [D:\/asan.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5.0 "}
+!10 = !MDLocation(line: 2, scope: !4)
diff --git a/test/DebugInfo/COFF/asan-module-without-functions.ll b/test/DebugInfo/COFF/asan-module-without-functions.ll
index d5af109..5bd4c91 100644
--- a/test/DebugInfo/COFF/asan-module-without-functions.ll
+++ b/test/DebugInfo/COFF/asan-module-without-functions.ll
@@ -45,9 +45,9 @@ define internal void @asan.module_dtor() {
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/asan.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"asan.c", metadata !"D:\5C"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!5 = metadata !{metadata !"clang version 3.5.0 "}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\002", !1, !2, !2, !2, !2, !2} ; [ DW_TAG_compile_unit ] [D:\/asan.c] [DW_LANG_C99]
+!1 = !{!"asan.c", !"D:\5C"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 1, !"Debug Info Version", i32 2}
+!5 = !{!"clang version 3.5.0 "}
diff --git a/test/DebugInfo/COFF/asm.ll b/test/DebugInfo/COFF/asm.ll
index 9c9dad8..07696e3 100644
--- a/test/DebugInfo/COFF/asm.ll
+++ b/test/DebugInfo/COFF/asm.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -O0 < %s | FileCheck --check-prefix=X86 %s
-; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview-linetables | FileCheck --check-prefix=OBJ32 %s
+; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview -section-symbols | FileCheck --check-prefix=OBJ32 %s
 ; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -O0 < %s | FileCheck --check-prefix=X64 %s
-; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -o - -O0 < %s | llvm-mc -triple=x86_64-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview-linetables | FileCheck --check-prefix=OBJ64 %s
+; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -o - -O0 < %s | llvm-mc -triple=x86_64-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview -section-symbols | FileCheck --check-prefix=OBJ64 %s
 
 ; This LL file was generated by running clang on the following code:
 ; D:\asm.c:
@@ -22,7 +22,7 @@
 ; X86-NEXT: L{{.*}}:
 ; X86-NEXT: [[END_OF_F:^L.*]]:
 ;
-; X86-LABEL: .section        .debug$S,"rd"
+; X86-LABEL: .section        .debug$S,"dr"
 ; X86-NEXT: .long   4
 ; Symbol subsection
 ; X86-NEXT: .long   241
@@ -127,7 +127,7 @@
 ; X64-NEXT: .L{{.*}}:
 ; X64-NEXT: [[END_OF_F:.*]]:
 ;
-; X64-LABEL: .section        .debug$S,"rd"
+; X64-LABEL: .section        .debug$S,"dr"
 ; X64-NEXT: .long   4
 ; Symbol subsection
 ; X64-NEXT: .long   241
@@ -239,18 +239,18 @@ attributes #2 = { nounwind }
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
-!1 = metadata !{metadata !"<unknown>", metadata !"D:\5C"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", metadata !5, metadata !6, metadata !7, null, void ()* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!5 = metadata !{metadata !"asm.c", metadata !"D:\5C"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [D:\/asm.c]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.5 "}
-!12 = metadata !{i32 4, i32 0, metadata !4, null}
-!13 = metadata !{i32 5, i32 0, metadata !4, null}
-!14 = metadata !{i32 6, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
+!1 = !{!"<unknown>", !"D:\5C"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", !5, !6, !7, null, void ()* @f, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!5 = !{!"asm.c", !"D:\5C"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [D:\/asm.c]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.5 "}
+!12 = !MDLocation(line: 4, scope: !4)
+!13 = !MDLocation(line: 5, scope: !4)
+!14 = !MDLocation(line: 6, scope: !4)
diff --git a/test/DebugInfo/COFF/cpp-mangling.ll b/test/DebugInfo/COFF/cpp-mangling.ll
index 1ccf2f9..85bdd4b 100644
--- a/test/DebugInfo/COFF/cpp-mangling.ll
+++ b/test/DebugInfo/COFF/cpp-mangling.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview-linetables | FileCheck %s
+; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview -section-symbols | FileCheck %s
 
 ; This LL file was generated by running clang on the following code:
 ; D:\src.cpp:
@@ -29,15 +29,15 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/<stdin>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<stdin>", metadata !"D:\5C"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00bar\00bar\00\002\000\001\000\000\00256\000\002", metadata !5, metadata !6, metadata !7, null, i32 (i32)* @"\01?bar@foo@@YAHH@Z", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
-!5 = metadata !{metadata !"src.cpp", metadata !"D:\5C"}
-!6 = metadata !{metadata !"0x29", metadata !5}    ; [ DW_TAG_file_type ] [D:\/src.cpp]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.6.0 "}
-!11 = metadata !{i32 3, i32 0, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.6.0 \000\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [D:\/<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !"D:\5C"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00bar\00bar\00\002\000\001\000\000\00256\000\002", !5, !6, !7, null, i32 (i32)* @"\01?bar@foo@@YAHH@Z", null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
+!5 = !{!"src.cpp", !"D:\5C"}
+!6 = !{!"0x29", !5}    ; [ DW_TAG_file_type ] [D:\/src.cpp]
+!7 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.6.0 "}
+!11 = !MDLocation(line: 3, scope: !4)
diff --git a/test/DebugInfo/COFF/multifile.ll b/test/DebugInfo/COFF/multifile.ll
index 3bc1286..e024edd 100644
--- a/test/DebugInfo/COFF/multifile.ll
+++ b/test/DebugInfo/COFF/multifile.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -O0 < %s | FileCheck --check-prefix=X86 %s
-; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview-linetables | FileCheck --check-prefix=OBJ32 %s
+; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview -section-symbols | FileCheck --check-prefix=OBJ32 %s
 ; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -O0 < %s | FileCheck --check-prefix=X64 %s
-; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -o - -O0 < %s | llvm-mc -triple=x86_64-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview-linetables | FileCheck --check-prefix=OBJ64 %s
+; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -o - -O0 < %s | llvm-mc -triple=x86_64-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview -section-symbols | FileCheck --check-prefix=OBJ64 %s
 
 ; This LL file was generated by running clang on the following code:
 ; D:\input.c:
@@ -29,7 +29,7 @@
 ; X86-NEXT: L{{.*}}:
 ; X86-NEXT: [[END_OF_F:.*]]:
 ;
-; X86-LABEL: .section        .debug$S,"rd"
+; X86-LABEL: .section        .debug$S,"dr"
 ; X86-NEXT: .long   4
 ; Symbol subsection
 ; X86-NEXT: .long   241
@@ -159,7 +159,7 @@
 ; X64-NEXT: .L{{.*}}:
 ; X64-NEXT: [[END_OF_F:.*]]:
 ;
-; X64-LABEL: .section        .debug$S,"rd"
+; X64-LABEL: .section        .debug$S,"dr"
 ; X64-NEXT: .long   4
 ; Symbol subsection
 ; X64-NEXT: .long   241
@@ -307,23 +307,23 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
-!1 = metadata !{metadata !"<unknown>", metadata !"D:\5C"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", metadata !5, metadata !6, metadata !7, null, void ()* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!5 = metadata !{metadata !"input.c", metadata !"D:\5C"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [D:\/input.c]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.5 "}
-!12 = metadata !{i32 1, i32 0, metadata !13, null}
-!13 = metadata !{metadata !"0xb\000", metadata !14, metadata !4} ; [ DW_TAG_lexical_block ] [D:\/one.c]
-!14 = metadata !{metadata !"one.c", metadata !"D:\5C"}
-!15 = metadata !{i32 2, i32 0, metadata !16, null}
-!16 = metadata !{metadata !"0xb\000", metadata !17, metadata !4} ; [ DW_TAG_lexical_block ] [D:\/two.c]
-!17 = metadata !{metadata !"two.c", metadata !"D:\5C"}
-!18 = metadata !{i32 7, i32 0, metadata !13, null}
-!19 = metadata !{i32 8, i32 0, metadata !13, null}
+!0 = !{!"0x11\0012\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
+!1 = !{!"<unknown>", !"D:\5C"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", !5, !6, !7, null, void ()* @f, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!5 = !{!"input.c", !"D:\5C"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [D:\/input.c]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.5 "}
+!12 = !MDLocation(line: 1, scope: !13)
+!13 = !{!"0xb\000", !14, !4} ; [ DW_TAG_lexical_block ] [D:\/one.c]
+!14 = !{!"one.c", !"D:\5C"}
+!15 = !MDLocation(line: 2, scope: !16)
+!16 = !{!"0xb\000", !17, !4} ; [ DW_TAG_lexical_block ] [D:\/two.c]
+!17 = !{!"two.c", !"D:\5C"}
+!18 = !MDLocation(line: 7, scope: !13)
+!19 = !MDLocation(line: 8, scope: !13)
diff --git a/test/DebugInfo/COFF/multifunction.ll b/test/DebugInfo/COFF/multifunction.ll
index 4d4f506..ab798ab 100644
--- a/test/DebugInfo/COFF/multifunction.ll
+++ b/test/DebugInfo/COFF/multifunction.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -O0 < %s | FileCheck --check-prefix=X86 %s
-; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview-linetables | FileCheck --check-prefix=OBJ32 %s
+; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview -section-symbols | FileCheck --check-prefix=OBJ32 %s
 ; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -O0 < %s | FileCheck --check-prefix=X64 %s
-; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -o - -O0 < %s | llvm-mc -triple=x86_64-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview-linetables | FileCheck --check-prefix=OBJ64 %s
+; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -o - -O0 < %s | llvm-mc -triple=x86_64-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview -section-symbols | FileCheck --check-prefix=OBJ64 %s
 
 ; This LL file was generated by running clang on the following code:
 ; D:\source.c:
@@ -53,7 +53,7 @@
 ; X86-NEXT: L{{.*}}:
 ; X86-NEXT: [[END_OF_F:.*]]:
 ;
-; X86-LABEL: .section        .debug$S,"rd"
+; X86-LABEL: .section        .debug$S,"dr"
 ; X86-NEXT: .long   4
 ; Symbol subsection for x
 ; X86-NEXT: .long   241
@@ -317,7 +317,7 @@
 ; X64-NEXT: .L{{.*}}:
 ; X64-NEXT: [[END_OF_F:.*]]:
 ;
-; X64-LABEL: .section        .debug$S,"rd"
+; X64-LABEL: .section        .debug$S,"dr"
 ; X64-NEXT: .long   4
 ; Symbol subsection for x
 ; X64-NEXT: .long   241
@@ -582,25 +582,25 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "
 !llvm.module.flags = !{!11, !12}
 !llvm.ident = !{!13}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
-!1 = metadata !{metadata !"<unknown>", metadata !"D:\5C"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !9, metadata !10}
-!4 = metadata !{metadata !"0x2e\00x\00x\00\003\000\001\000\006\00256\000\003", metadata !5, metadata !6, metadata !7, null, void ()* @x, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [x]
-!5 = metadata !{metadata !"source.c", metadata !"D:\5C"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [D:\/source.c]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!9 = metadata !{metadata !"0x2e\00y\00y\00\007\000\001\000\006\00256\000\007", metadata !5, metadata !6, metadata !7, null, void ()* @y, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [y]
-!10 = metadata !{metadata !"0x2e\00f\00f\00\0011\000\001\000\006\00256\000\0011", metadata !5, metadata !6, metadata !7, null, void ()* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [f]
-!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!13 = metadata !{metadata !"clang version 3.5 "}
-!14 = metadata !{i32 4, i32 0, metadata !4, null}
-!15 = metadata !{i32 5, i32 0, metadata !4, null}
-!16 = metadata !{i32 8, i32 0, metadata !9, null}
-!17 = metadata !{i32 9, i32 0, metadata !9, null}
-!18 = metadata !{i32 12, i32 0, metadata !10, null}
-!19 = metadata !{i32 13, i32 0, metadata !10, null}
-!20 = metadata !{i32 14, i32 0, metadata !10, null}
-!21 = metadata !{i32 15, i32 0, metadata !10, null}
+!0 = !{!"0x11\0012\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
+!1 = !{!"<unknown>", !"D:\5C"}
+!2 = !{}
+!3 = !{!4, !9, !10}
+!4 = !{!"0x2e\00x\00x\00\003\000\001\000\006\00256\000\003", !5, !6, !7, null, void ()* @x, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [x]
+!5 = !{!"source.c", !"D:\5C"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [D:\/source.c]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{!"0x2e\00y\00y\00\007\000\001\000\006\00256\000\007", !5, !6, !7, null, void ()* @y, null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [y]
+!10 = !{!"0x2e\00f\00f\00\0011\000\001\000\006\00256\000\0011", !5, !6, !7, null, void ()* @f, null, null, !2} ; [ DW_TAG_subprogram ] [line 11] [def] [f]
+!11 = !{i32 2, !"Dwarf Version", i32 4}
+!12 = !{i32 1, !"Debug Info Version", i32 2}
+!13 = !{!"clang version 3.5 "}
+!14 = !MDLocation(line: 4, scope: !4)
+!15 = !MDLocation(line: 5, scope: !4)
+!16 = !MDLocation(line: 8, scope: !9)
+!17 = !MDLocation(line: 9, scope: !9)
+!18 = !MDLocation(line: 12, scope: !10)
+!19 = !MDLocation(line: 13, scope: !10)
+!20 = !MDLocation(line: 14, scope: !10)
+!21 = !MDLocation(line: 15, scope: !10)
diff --git a/test/DebugInfo/COFF/simple.ll b/test/DebugInfo/COFF/simple.ll
index 00f1829..a65cbcd 100644
--- a/test/DebugInfo/COFF/simple.ll
+++ b/test/DebugInfo/COFF/simple.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -O0 < %s | FileCheck --check-prefix=X86 %s
-; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview-linetables | FileCheck --check-prefix=OBJ32 %s
+; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview -section-symbols | FileCheck --check-prefix=OBJ32 %s
 ; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -O0 < %s | FileCheck --check-prefix=X64 %s
-; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -o - -O0 < %s | llvm-mc -triple=x86_64-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview-linetables | FileCheck --check-prefix=OBJ64 %s
+; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -o - -O0 < %s | llvm-mc -triple=x86_64-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview -section-symbols | FileCheck --check-prefix=OBJ64 %s
 
 ; This LL file was generated by running clang on the following code:
 ; D:\test.c:
@@ -20,7 +20,7 @@
 ; X86-NEXT: L{{.*}}:
 ; X86-NEXT: [[END_OF_F:.*]]:
 ;
-; X86-LABEL: .section        .debug$S,"rd"
+; X86-LABEL: .section        .debug$S,"dr"
 ; X86-NEXT: .long   4
 ; Symbol subsection
 ; X86-NEXT: .long   241
@@ -118,7 +118,7 @@
 ; X64-NEXT: .L{{.*}}:
 ; X64-NEXT: [[END_OF_F:.*]]:
 ;
-; X64-LABEL: .section        .debug$S,"rd"
+; X64-LABEL: .section        .debug$S,"dr"
 ; X64-NEXT: .long   4
 ; Symbol subsection
 ; X64-NEXT: .long   241
@@ -223,17 +223,17 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
-!1 = metadata !{metadata !"<unknown>", metadata !"D:\5C"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", metadata !5, metadata !6, metadata !7, null, void ()* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!5 = metadata !{metadata !"test.c", metadata !"D:\5C"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [D:\/test.c]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.5 "}
-!12 = metadata !{i32 4, i32 0, metadata !4, null}
-!13 = metadata !{i32 5, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
+!1 = !{!"<unknown>", !"D:\5C"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", !5, !6, !7, null, void ()* @f, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!5 = !{!"test.c", !"D:\5C"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [D:\/test.c]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.5 "}
+!12 = !MDLocation(line: 4, scope: !4)
+!13 = !MDLocation(line: 5, scope: !4)
diff --git a/test/DebugInfo/COFF/tail-call-without-lexical-scopes.ll b/test/DebugInfo/COFF/tail-call-without-lexical-scopes.ll
index 8db2fd0..83d976d 100644
--- a/test/DebugInfo/COFF/tail-call-without-lexical-scopes.ll
+++ b/test/DebugInfo/COFF/tail-call-without-lexical-scopes.ll
@@ -22,7 +22,7 @@
 ; X86-NEXT: [[END_OF_BAR:^L.*]]:{{$}}
 ; X86-NOT:  ret
 
-; X86-LABEL: .section        .debug$S,"rd"
+; X86-LABEL: .section        .debug$S,"dr"
 ; X86:       .secrel32 "?bar@@YAXHZZ"
 ; X86-NEXT:  .secidx   "?bar@@YAXHZZ"
 ; X86:       .long   0
@@ -61,18 +61,18 @@ attributes #2 = { nounwind }
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/test.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"test.cpp", metadata !"D:\5C"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !7}
-!4 = metadata !{metadata !"0x2e\00spam\00spam\00\007\000\001\000\006\00256\001\007", metadata !1, metadata !5, metadata !6, null, void ()* @"\01?spam@@YAXXZ", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [spam]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [D:\/test.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !"0x2e\00bar\00bar\00\003\001\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !6, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [local] [def] [bar]
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.5.0 "}
-!11 = metadata !{i32 8, i32 0, metadata !4, null}
-!12 = metadata !{i32 9, i32 0, metadata !4, null}
-!13 = metadata !{i32 4, i32 0, metadata !7, null}
-!14 = metadata !{i32 5, i32 0, metadata !7, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \001\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [D:\/test.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"test.cpp", !"D:\5C"}
+!2 = !{}
+!3 = !{!4, !7}
+!4 = !{!"0x2e\00spam\00spam\00\007\000\001\000\006\00256\001\007", !1, !5, !6, null, void ()* @"\01?spam@@YAXXZ", null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [spam]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [D:\/test.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!"0x2e\00bar\00bar\00\003\001\001\000\006\00256\001\003", !1, !5, !6, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [local] [def] [bar]
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.5.0 "}
+!11 = !MDLocation(line: 8, scope: !4)
+!12 = !MDLocation(line: 9, scope: !4)
+!13 = !MDLocation(line: 4, scope: !7)
+!14 = !MDLocation(line: 5, scope: !7)
diff --git a/test/DebugInfo/Inputs/gmlt.ll b/test/DebugInfo/Inputs/gmlt.ll
index ba8d113..e432640 100644
--- a/test/DebugInfo/Inputs/gmlt.ll
+++ b/test/DebugInfo/Inputs/gmlt.ll
@@ -131,23 +131,23 @@ attributes #2 = { nounwind }
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/gmlt.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"gmlt.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !7, metadata !8, metadata !9}
-!4 = metadata !{metadata !"0x2e\00f1\00f1\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f1v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f1]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/gmlt.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !"0x2e\00f2\00f2\00\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f2v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f2]
-!8 = metadata !{metadata !"0x2e\00f3\00f3\00\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f3v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f3]
-!9 = metadata !{metadata !"0x2e\00f4\00f4\00\004\000\001\000\006\00256\000\004", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f4v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [f4]
-!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!12 = metadata !{metadata !"clang version 3.6.0 "}
-!13 = metadata !{i32 1, i32 12, metadata !4, null}
-!14 = metadata !{i32 2, i32 53, metadata !7, null}
-!15 = metadata !{i32 3, i32 44, metadata !8, null}
-!16 = metadata !{i32 3, i32 50, metadata !8, null}
-!17 = metadata !{i32 3, i32 44, metadata !8, metadata !18}
-!18 = metadata !{i32 4, i32 13, metadata !9, null}
-!19 = metadata !{i32 4, i32 19, metadata !9, null}
+!0 = !{!"0x11\004\00clang version 3.6.0 \000\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/gmlt.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"gmlt.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !7, !8, !9}
+!4 = !{!"0x2e\00f1\00f1\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, void ()* @_Z2f1v, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f1]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/gmlt.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!"0x2e\00f2\00f2\00\002\000\001\000\006\00256\000\002", !1, !5, !6, null, void ()* @_Z2f2v, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f2]
+!8 = !{!"0x2e\00f3\00f3\00\003\000\001\000\006\00256\000\003", !1, !5, !6, null, void ()* @_Z2f3v, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f3]
+!9 = !{!"0x2e\00f4\00f4\00\004\000\001\000\006\00256\000\004", !1, !5, !6, null, void ()* @_Z2f4v, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [f4]
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 2, !"Debug Info Version", i32 2}
+!12 = !{!"clang version 3.6.0 "}
+!13 = !MDLocation(line: 1, column: 12, scope: !4)
+!14 = !MDLocation(line: 2, column: 53, scope: !7)
+!15 = !MDLocation(line: 3, column: 44, scope: !8)
+!16 = !MDLocation(line: 3, column: 50, scope: !8)
+!17 = !MDLocation(line: 3, column: 44, scope: !8, inlinedAt: !18)
+!18 = !MDLocation(line: 4, column: 13, scope: !9)
+!19 = !MDLocation(line: 4, column: 19, scope: !9)
diff --git a/test/DebugInfo/Inputs/line.ll b/test/DebugInfo/Inputs/line.ll
new file mode 100644
index 0000000..1a4a908
--- /dev/null
+++ b/test/DebugInfo/Inputs/line.ll
@@ -0,0 +1,55 @@
+; From source:
+; int f(int a, int b) {
+;   return a   //
+;          &&  //
+;          b;
+; }
+
+; Check that the comparison of 'a' is attributed to line 2, not 3.
+
+; CHECK: .loc{{ +}}1{{ +}}2
+; CHECK-NOT: .loc{{ }}
+; CHECK: cmp
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z1fii(i32 %a, i32 %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 %b, i32* %b.addr, align 4
+  %0 = load i32* %a.addr, align 4, !dbg !10
+  %tobool = icmp ne i32 %0, 0, !dbg !10
+  br i1 %tobool, label %land.rhs, label %land.end, !dbg !11
+
+land.rhs:                                         ; preds = %entry
+  %1 = load i32* %b.addr, align 4, !dbg !12
+  %tobool1 = icmp ne i32 %1, 0, !dbg !12
+  br label %land.end
+
+land.end:                                         ; preds = %land.rhs, %entry
+  %2 = phi i1 [ false, %entry ], [ %tobool1, %land.rhs ]
+  %conv = zext i1 %2 to i32, !dbg !10
+  ret i32 %conv, !dbg !13
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = !{!"0x11\004\00clang version 3.7.0 (trunk 227472) (llvm/trunk 227476)\000\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/line.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"line.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00f\00f\00\001\000\001\000\000\00256\000\001", !1, !5, !6, null, i32 (i32, i32)* @_Z1fii, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!5 = !{!"0x29", !1}                               ; [ DW_TAG_file_type ] [/tmp/dbginfo/line.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.7.0 (trunk 227472) (llvm/trunk 227476)"}
+!10 = !MDLocation(line: 2, scope: !4)
+!11 = !MDLocation(line: 3, scope: !4)
+!12 = !MDLocation(line: 4, scope: !4)
+!13 = !MDLocation(line: 2, scope: !4)
diff --git a/test/DebugInfo/Mips/delay-slot.ll b/test/DebugInfo/Mips/delay-slot.ll
index 5587bcb..d860cea 100644
--- a/test/DebugInfo/Mips/delay-slot.ll
+++ b/test/DebugInfo/Mips/delay-slot.ll
@@ -26,7 +26,7 @@ target triple = "mips--linux-gnu"
 ; Function Attrs: nounwind
 define i32 @foo(i32 %x) #0 {
 entry:
-  call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !13
+  call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !12, metadata !{!"0x102"}), !dbg !13
   %tobool = icmp ne i32 %x, 0, !dbg !14
   br i1 %tobool, label %if.then, label %if.end, !dbg !14
 
@@ -54,22 +54,22 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/test.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test.c", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/test.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.5.0"}
-!12 = metadata !{metadata !"0x101\00x\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [x] [line 1]
-!13 = metadata !{i32 1, i32 0, metadata !4, null}
-!14 = metadata !{i32 2, i32 0, metadata !15, null}
-!15 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/test.c]
-!16 = metadata !{i32 3, i32 0, metadata !15, null}
-!17 = metadata !{i32 4, i32 0, metadata !4, null}
-!18 = metadata !{i32 5, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/test.c] [DW_LANG_C99]
+!1 = !{!"test.c", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.5.0"}
+!12 = !{!"0x101\00x\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [x] [line 1]
+!13 = !MDLocation(line: 1, scope: !4)
+!14 = !MDLocation(line: 2, scope: !15)
+!15 = !{!"0xb\002\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [/tmp/test.c]
+!16 = !MDLocation(line: 3, scope: !15)
+!17 = !MDLocation(line: 4, scope: !4)
+!18 = !MDLocation(line: 5, scope: !4)
diff --git a/test/DebugInfo/Mips/fn-call-line.ll b/test/DebugInfo/Mips/fn-call-line.ll
new file mode 100644
index 0000000..14cd8c9
--- /dev/null
+++ b/test/DebugInfo/Mips/fn-call-line.ll
@@ -0,0 +1,84 @@
+; RUN: llc -mtriple=mips-linux-gnu -filetype=asm -asm-verbose=0 -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=mips-linux-gnu -filetype=obj -O0 < %s | llvm-dwarfdump -debug-dump=line - | FileCheck %s --check-prefix=INT
+
+; Mips used to generate 'jumpy' debug line info around calls. The address
+; calculation for each call to f1() would share the same line info so it would
+; emit output of the form:
+;   .loc $first_call_location
+;   .. address calculation ..
+;   .. function call ..
+;   .. address calculation ..
+;   .loc $second_call_location
+;   .. function call ..
+;   .loc $first_call_location
+;   .. address calculation ..
+;   .loc $third_call_location
+;   .. function call ..
+;   ...
+; which would cause confusing stepping behaviour for the end user.
+;
+; This test checks that we emit more user friendly debug line info of the form:
+;   .loc $first_call_location
+;   .. address calculation ..
+;   .. function call ..
+;   .loc $second_call_location
+;   .. address calculation ..
+;   .. function call ..
+;   .loc $third_call_location
+;   .. address calculation ..
+;   .. function call ..
+;   ...
+;
+; Generated with clang from fn-call-line.c:
+; void f1();
+; void f2() {
+;   f1();
+;   f1();
+; }
+
+; CHECK: .loc	1 3 3
+; CHECK-NOT: .loc
+; CHECK: %call16(f1) 
+; CHECK-NOT: .loc
+; CHECK: .loc	1 4 3
+; CHECK-NOT: .loc
+; CHECK: %call16(f1) 
+
+; INT: {{^}}Address
+; INT: -----
+; INT-NEXT: 2 0 1 0 0 is_stmt{{$}}
+; INT-NEXT: 3 3 1 0 0 is_stmt prologue_end{{$}}
+; INT-NEXT: 4 3 1 0 0 is_stmt{{$}}
+
+
+; Function Attrs: nounwind uwtable
+define void @f2() #0 {
+entry:
+  call void (...)* @f1(), !dbg !11
+  call void (...)* @f1(), !dbg !12
+  ret void, !dbg !13
+}
+
+declare void @f1(...) #1
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = !{!"0x11\0012\00clang version 3.7.0 (trunk 226641)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/fn-call-line.c] [DW_LANG_C99]
+!1 = !{!"fn-call-line.c", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00f2\00f2\00\002\000\001\000\000\000\000\002", !1, !5, !6, null, void ()* @f2, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f2]
+!5 = !{!"0x29", !1}                               ; [ DW_TAG_file_type ] [/tmp/dbginfo/fn-call-line.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.7.0 (trunk 226641)"}
+!11 = !MDLocation(line: 3, column: 3, scope: !4)
+!12 = !MDLocation(line: 4, column: 3, scope: !4)
+!13 = !MDLocation(line: 5, column: 1, scope: !4)
diff --git a/test/DebugInfo/Mips/processes-relocations.ll b/test/DebugInfo/Mips/processes-relocations.ll
index 98eba68..5f52ea1 100644
--- a/test/DebugInfo/Mips/processes-relocations.ll
+++ b/test/DebugInfo/Mips/processes-relocations.ll
@@ -9,9 +9,9 @@
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"empty.c", metadata !"/a"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!5 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{i32 786449, !1, i32 12, !"clang version 3.6.0 ", i1 false, !"", i32 0, !2, !2, !2, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = !{!"empty.c", !"/a"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 1}
+!5 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/PDB/Inputs/empty.cpp b/test/DebugInfo/PDB/Inputs/empty.cpp
new file mode 100644
index 0000000..6021aca
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/empty.cpp
@@ -0,0 +1,7 @@
+// Build with "cl.exe /Zi empty.cpp /link /debug /nodefaultlib /entry:main"
+
+void *__purecall = 0;
+
+int main() {
+  return 42;
+}
diff --git a/test/DebugInfo/PDB/Inputs/empty.pdb b/test/DebugInfo/PDB/Inputs/empty.pdb
new file mode 100644
index 0000000..ae65c3a
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/empty.pdb
diff --git a/test/DebugInfo/PDB/Inputs/symbolformat-fpo.cpp b/test/DebugInfo/PDB/Inputs/symbolformat-fpo.cpp
new file mode 100644
index 0000000..56a5b26
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/symbolformat-fpo.cpp
@@ -0,0 +1,6 @@
+// Compile with "cl /GR- /Zi /c /Ox /Oy symbolformat-fpo.cpp"
+// Refer to symbolformat.cpp for linking instructions.
+
+unsigned fpo_func(unsigned n) {
+  return n * 2;
+}
diff --git a/test/DebugInfo/PDB/Inputs/symbolformat.cpp b/test/DebugInfo/PDB/Inputs/symbolformat.cpp
new file mode 100644
index 0000000..c069a35
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/symbolformat.cpp
@@ -0,0 +1,53 @@
+// Compile with "cl /c /Zi /GR- symbolformat.cpp"
+// Compile symbolformat-fpo.cpp (see file for instructions)
+// Link with "link symbolformat.obj symbolformat-fpo.obj /debug /nodefaultlib
+//    /entry:main /out:symbolformat.exe"
+
+int __cdecl _purecall(void) { return 0; }
+
+enum TestEnum {
+  Value,
+  Value10 = 10
+};
+
+enum class TestEnumClass {
+  Value,
+  Value10 = 10
+};
+
+struct A {
+  virtual void PureFunc() = 0 {}
+  virtual void VirtualFunc() {}
+  void RegularFunc() {}
+};
+
+struct VirtualBase {
+};
+
+struct B : public A, protected virtual VirtualBase {
+  void PureFunc() override {}
+
+  enum NestedEnum {
+    FirstVal,
+    SecondVal
+  };
+
+  typedef int NestedTypedef;
+  NestedEnum EnumVar;
+  NestedTypedef TypedefVar;
+};
+
+typedef int IntType;
+typedef A ClassAType;
+
+int main(int argc, char **argv) {
+  B b;
+  auto PureAddr = &B::PureFunc;
+  auto VirtualAddr = &A::PureFunc;
+  auto RegularAddr = &A::RegularFunc;
+  TestEnum Enum = Value;
+  TestEnumClass EnumClass = TestEnumClass::Value10;
+  IntType Int = 12;
+  ClassAType *ClassA = &b;
+  return 0;
+}
diff --git a/test/DebugInfo/PDB/Inputs/symbolformat.pdb b/test/DebugInfo/PDB/Inputs/symbolformat.pdb
new file mode 100644
index 0000000..183870a
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/symbolformat.pdb
diff --git a/test/DebugInfo/PDB/lit.local.cfg b/test/DebugInfo/PDB/lit.local.cfg
new file mode 100644
index 0000000..28a895f
--- /dev/null
+++ b/test/DebugInfo/PDB/lit.local.cfg
@@ -0,0 +1 @@
+config.unsupported = not config.have_dia_sdk
diff --git a/test/DebugInfo/PDB/pdbdump-flags.test b/test/DebugInfo/PDB/pdbdump-flags.test
new file mode 100644
index 0000000..d8d38cb
--- /dev/null
+++ b/test/DebugInfo/PDB/pdbdump-flags.test
@@ -0,0 +1,32 @@
+; RUN: llvm-pdbdump %p/Inputs/empty.pdb | FileCheck %s -check-prefix=NO_ARGS
+; RUN: llvm-pdbdump -types %p/Inputs/empty.pdb | FileCheck %s -check-prefix=TYPES
+; RUN: llvm-pdbdump -compilands %p/Inputs/empty.pdb | FileCheck %s -check-prefix=COMPILANDS
+; RUN: llvm-pdbdump -types -compilands %p/Inputs/empty.pdb | FileCheck %s -check-prefix=BOTH
+
+; Check that neither symbols nor compilands are dumped when neither argument specified.
+; NO_ARGS: empty.pdb
+; NO_ARGS: Guid: {0B355641-86A0-A249-896F-9988FAE52FF0}
+; NO_ARGS: Attributes: HasPrivateSymbols
+; NO_ARGS-NOT: Dumping compilands
+; NO_ARGS-NOT: Dumping symbols
+
+; Check that only symbols are dumped when only -types is specified.
+; TYPES: empty.pdb
+; TYPES: Guid: {0B355641-86A0-A249-896F-9988FAE52FF0}
+; TYPES: Attributes: HasPrivateSymbols
+; TYPES: Dumping types
+; TYPES-NOT: Dumping compilands
+
+; Check that only compilands are dumped when only -compilands is specified.
+; COMPILANDS: empty.pdb
+; COMPILANDS: Guid: {0B355641-86A0-A249-896F-9988FAE52FF0}
+; COMPILANDS: Attributes: HasPrivateSymbols
+; COMPILANDS-NOT: Dumping types
+; COMPILANDS: Dumping compilands
+
+; Check that types and compilands are dumped when both arguments are specified.
+; BOTH: empty.pdb
+; BOTH: Guid: {0B355641-86A0-A249-896F-9988FAE52FF0}
+; BOTH: Attributes: HasPrivateSymbols
+; BOTH: Dumping types
+; BOTH: Dumping compilands
diff --git a/test/DebugInfo/PDB/pdbdump-symbol-format.test b/test/DebugInfo/PDB/pdbdump-symbol-format.test
new file mode 100644
index 0000000..1540e16
--- /dev/null
+++ b/test/DebugInfo/PDB/pdbdump-symbol-format.test
@@ -0,0 +1,49 @@
+; RUN: llvm-pdbdump -symbols %p/Inputs/symbolformat.pdb | FileCheck --check-prefix=SYM_FORMAT %s
+; RUN: llvm-pdbdump -types %p/Inputs/symbolformat.pdb | FileCheck --check-prefix=TYPES_FORMAT %s
+; RUN: llvm-pdbdump -types -class-definitions %p/Inputs/symbolformat.pdb | FileCheck --check-prefix=FULL_CLASS %s
+
+; The format is func [0x<rva_start>+<prologue_length> - 0x<rva_end>-<epilogue_length>]
+; SYM_FORMAT: symbolformat-fpo.obj
+; SYM_FORMAT-DAG: func [0x001130+0 - 0x001137-1] (FPO) uint32_t __cdecl fpo_func(uint32_t n)
+; SYM_FORMAT: symbolformat.obj
+; SYM_FORMAT-DAG: func [0x001140+3 - 0x001147-2] (EBP) int32_t __cdecl _purecall()
+; SYM_FORMAT-DAG: func [0x001150+6 - 0x0011b6-4] (EBP) int32_t __cdecl main(int32_t argc, char** argv)
+; SYM_FORMAT-DAG: func [0x0010b0+7 - 0x0010c7-4] (EBP) void A::A()
+; SYM_FORMAT-DAG: func [0x0011c0+7 - 0x0011f1-6] (EBP) void B::B()
+; SYM_FORMAT-DAG: thunk [0x000010f6 - 0x000010fa] (Pcode) B::`vcall'{0}'
+; SYM_FORMAT-DAG: func [0x001100+7 - 0x00110b-4] (EBP) virtual void B::PureFunc()
+; SYM_FORMAT-DAG: func [0x001110+7 - 0x00111b-4] (EBP) void A::RegularFunc()
+; SYM_FORMAT-DAG: func [0x001120+7 - 0x00112b-4] (EBP) virtual void A::VirtualFunc()
+
+; TYPES_FORMAT: Enums
+; TYPES_FORMAT-DAG: enum TestEnum
+; TYPES_FORMAT-DAG: enum TestEnumClass
+; TYPES_FORMAT: Function Signatures
+; TYPES_FORMAT-DAG: int32_t __cdecl ()
+; TYPES_FORMAT-DAG: int32_t __cdecl (int32_t, char**)
+; TYPES_FORMAT-DAG: void (A::)()
+; TYPES_FORMAT-DAG: void (B::)()
+; TYPES_FORMAT-DAG: void (B::)(B&)
+; TYPES_FORMAT-DAG: void (B::)()
+; TYPES_FORMAT-DAG: B& (B::)(B&)
+; TYPES_FORMAT-DAG: void (A::)(A&)
+; TYPES_FORMAT-DAG: void (A::)()
+; TYPES_FORMAT-DAG: A& (A::)(A&)
+; TYPES_FORMAT: Typedefs
+; TYPES_FORMAT-DAG: typedef int32_t IntType
+; TYPES_FORMAT-DAG: typedef class A ClassAType
+; TYPES_FORMAT: Classes
+; TYPES_FORMAT-DAG: class A
+; TYPES_FORMAT-DAG: class B
+
+; FULL_CLASS: Classes
+; FULL_CLASS-DAG: class A {
+; FULL_CLASS: public:
+; FULL_CLASS: virtual void PureFunc() = 0
+; FULL_CLASS: virtual void VirtualFunc()
+; FULL_CLASS: void RegularFunc()
+; FULL_CLASS: }
+; FULL_CLASS-DAG: class B {
+; FULL_CLASS: public:
+; FULL_CLASS: virtual void PureFunc()
+; FULL_CLASS: }
+\ No newline at end of file
diff --git a/test/DebugInfo/PR20038.ll b/test/DebugInfo/PR20038.ll
index 2cd40fb..bfee8d2 100644
--- a/test/DebugInfo/PR20038.ll
+++ b/test/DebugInfo/PR20038.ll
@@ -74,10 +74,10 @@ land.end:                                         ; preds = %land.rhs, %entry
 
 cleanup.action:                                   ; preds = %land.end
   store %struct.C* %agg.tmp.ensured, %struct.C** %this.addr.i, align 8, !dbg !22
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i}, metadata !29, metadata !{metadata !"0x102"}), !dbg !31
+  call void @llvm.dbg.declare(metadata %struct.C** %this.addr.i, metadata !29, metadata !{!"0x102"}), !dbg !31
   %this1.i = load %struct.C** %this.addr.i, !dbg !22
   store %struct.C* %this1.i, %struct.C** %this.addr.i.i, align 8, !dbg !21
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i.i}, metadata !32, metadata !{metadata !"0x102"}), !dbg !33
+  call void @llvm.dbg.declare(metadata %struct.C** %this.addr.i.i, metadata !32, metadata !{!"0x102"}), !dbg !33
   %this1.i.i = load %struct.C** %this.addr.i.i, !dbg !21
   br label %cleanup.done, !dbg !22
 
@@ -91,10 +91,10 @@ entry:
   %this.addr.i = alloca %struct.C*, align 8, !dbg !37
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !29, metadata !{metadata !"0x102"}), !dbg !38
+  call void @llvm.dbg.declare(metadata %struct.C** %this.addr, metadata !29, metadata !{!"0x102"}), !dbg !38
   %this1 = load %struct.C** %this.addr
   store %struct.C* %this1, %struct.C** %this.addr.i, align 8, !dbg !37
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i}, metadata !32, metadata !{metadata !"0x102"}), !dbg !39
+  call void @llvm.dbg.declare(metadata %struct.C** %this.addr.i, metadata !32, metadata !{!"0x102"}), !dbg !39
   %this1.i = load %struct.C** %this.addr.i, !dbg !37
   ret void, !dbg !37
 }
@@ -104,7 +104,7 @@ define void @_ZN1CD2Ev(%struct.C* %this) unnamed_addr #1 align 2 {
 entry:
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !32, metadata !{metadata !"0x102"}), !dbg !40
+  call void @llvm.dbg.declare(metadata %struct.C** %this.addr, metadata !32, metadata !{!"0x102"}), !dbg !40
   %this1 = load %struct.C** %this.addr
   ret void, !dbg !41
 }
@@ -120,45 +120,45 @@ attributes #2 = { nounwind readnone }
 !llvm.module.flags = !{!18, !19}
 !llvm.ident = !{!20}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<stdin>", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00C\001\008\008\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !"PR20038.cpp", metadata !"/tmp/dbginfo"}
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0x2e\00~C\00~C\00\002\000\000\000\006\00256\000\002", metadata !5, metadata !"_ZTS1C", metadata !8, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 2] [~C]
-!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!9 = metadata !{null, metadata !10}
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
-!11 = metadata !{metadata !12, metadata !16, metadata !17}
-!12 = metadata !{metadata !"0x2e\00fun4\00fun4\00_Z4fun4v\005\000\001\000\006\00256\000\005", metadata !5, metadata !13, metadata !14, null, void ()* @_Z4fun4v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [fun4]
-!13 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/PR20038.cpp]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{null}
-!16 = metadata !{metadata !"0x2e\00~C\00~C\00_ZN1CD2Ev\006\000\001\000\006\00256\000\006", metadata !5, metadata !"_ZTS1C", metadata !8, null, void (%struct.C*)* @_ZN1CD2Ev, null, metadata !7, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~C]
-!17 = metadata !{metadata !"0x2e\00~C\00~C\00_ZN1CD1Ev\006\000\001\000\006\00256\000\006", metadata !5, metadata !"_ZTS1C", metadata !8, null, void (%struct.C*)* @_ZN1CD1Ev, null, metadata !7, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~C]
-!18 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!19 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!20 = metadata !{metadata !"clang version 3.5.0 "}
-!21 = metadata !{i32 6, i32 0, metadata !17, metadata !22}
-!22 = metadata !{i32 5, i32 0, metadata !23, null}
-!23 = metadata !{metadata !"0xb\005\000\003", metadata !5, metadata !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
-!24 = metadata !{i32 5, i32 0, metadata !12, null}
-!25 = metadata !{i32 5, i32 0, metadata !26, null}
-!26 = metadata !{metadata !"0xb\005\000\001", metadata !5, metadata !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
-!27 = metadata !{i32 5, i32 0, metadata !28, null}
-!28 = metadata !{metadata !"0xb\005\000\002", metadata !5, metadata !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
-!29 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !17, null, metadata !30} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
-!31 = metadata !{i32 0, i32 0, metadata !17, metadata !22}
-!32 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !16, null, metadata !30} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!33 = metadata !{i32 0, i32 0, metadata !16, metadata !21}
-!34 = metadata !{i32 5, i32 0, metadata !35, null}
-!35 = metadata !{metadata !"0xb\005\000\005", metadata !5, metadata !36} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
-!36 = metadata !{metadata !"0xb\005\000\004", metadata !5, metadata !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
-!37 = metadata !{i32 6, i32 0, metadata !17, null}
-!38 = metadata !{i32 0, i32 0, metadata !17, null}
-!39 = metadata !{i32 0, i32 0, metadata !16, metadata !37}
-!40 = metadata !{i32 0, i32 0, metadata !16, null}
-!41 = metadata !{i32 6, i32 0, metadata !16, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !3, !11, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00C\001\008\008\000\000\000", !5, null, null, !6, null, null, !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!"PR20038.cpp", !"/tmp/dbginfo"}
+!6 = !{!7}
+!7 = !{!"0x2e\00~C\00~C\00\002\000\000\000\006\00256\000\002", !5, !"_ZTS1C", !8, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 2] [~C]
+!8 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = !{null, !10}
+!10 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!11 = !{!12, !16, !17}
+!12 = !{!"0x2e\00fun4\00fun4\00_Z4fun4v\005\000\001\000\006\00256\000\005", !5, !13, !14, null, void ()* @_Z4fun4v, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [fun4]
+!13 = !{!"0x29", !5}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/PR20038.cpp]
+!14 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{null}
+!16 = !{!"0x2e\00~C\00~C\00_ZN1CD2Ev\006\000\001\000\006\00256\000\006", !5, !"_ZTS1C", !8, null, void (%struct.C*)* @_ZN1CD2Ev, null, !7, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~C]
+!17 = !{!"0x2e\00~C\00~C\00_ZN1CD1Ev\006\000\001\000\006\00256\000\006", !5, !"_ZTS1C", !8, null, void (%struct.C*)* @_ZN1CD1Ev, null, !7, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~C]
+!18 = !{i32 2, !"Dwarf Version", i32 4}
+!19 = !{i32 2, !"Debug Info Version", i32 2}
+!20 = !{!"clang version 3.5.0 "}
+!21 = !MDLocation(line: 6, scope: !17, inlinedAt: !22)
+!22 = !MDLocation(line: 5, scope: !23)
+!23 = !{!"0xb\005\000\003", !5, !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!24 = !MDLocation(line: 5, scope: !12)
+!25 = !MDLocation(line: 5, scope: !26)
+!26 = !{!"0xb\005\000\001", !5, !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!27 = !MDLocation(line: 5, scope: !28)
+!28 = !{!"0xb\005\000\002", !5, !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!29 = !{!"0x101\00this\0016777216\001088", !17, null, !30} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!30 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!31 = !MDLocation(line: 0, scope: !17, inlinedAt: !22)
+!32 = !{!"0x101\00this\0016777216\001088", !16, null, !30} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!33 = !MDLocation(line: 0, scope: !16, inlinedAt: !21)
+!34 = !MDLocation(line: 5, scope: !35)
+!35 = !{!"0xb\005\000\005", !5, !36} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!36 = !{!"0xb\005\000\004", !5, !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!37 = !MDLocation(line: 6, scope: !17)
+!38 = !MDLocation(line: 0, scope: !17)
+!39 = !MDLocation(line: 0, scope: !16, inlinedAt: !37)
+!40 = !MDLocation(line: 0, scope: !16)
+!41 = !MDLocation(line: 6, scope: !16)
diff --git a/test/DebugInfo/PowerPC/line.test b/test/DebugInfo/PowerPC/line.test
new file mode 100644
index 0000000..c1970ff
--- /dev/null
+++ b/test/DebugInfo/PowerPC/line.test
@@ -0,0 +1,7 @@
+; RUN: llc -mtriple=powerpc-unknown-linux -O0 -filetype=asm < %S/../Inputs/line.ll | FileCheck %S/../Inputs/line.ll
+
+; This is more complex than it looked. It's mixed up somewhere in SelectionDAG
+; (legalized as br_cc, losing the separation between the comparison and the
+; branch, then further lowered to cmplwi + brcc but without the fidelity that
+; those two instructions are on separate lines)
+; XFAIL: *
diff --git a/test/DebugInfo/PowerPC/processes-relocations.ll b/test/DebugInfo/PowerPC/processes-relocations.ll
index 5e661f7..459055e 100644
--- a/test/DebugInfo/PowerPC/processes-relocations.ll
+++ b/test/DebugInfo/PowerPC/processes-relocations.ll
@@ -9,9 +9,9 @@
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"empty.c", metadata !"/a"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!5 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{i32 786449, !1, i32 12, !"clang version 3.6.0 ", i1 false, !"", i32 0, !2, !2, !2, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = !{!"empty.c", !"/a"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 1}
+!5 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/PowerPC/tls-fission.ll b/test/DebugInfo/PowerPC/tls-fission.ll
index fa198e1..7bb1626 100644
--- a/test/DebugInfo/PowerPC/tls-fission.ll
+++ b/test/DebugInfo/PowerPC/tls-fission.ll
@@ -21,12 +21,12 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!7, !8}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00tls.dwo\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"tls.cpp", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x34\00tls\00tls\00\001\000\001", null, metadata !5, metadata !6, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.cpp]
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 \000\00\000\00tls.dwo\000", !1, !2, !2, !2, !3, !2} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"tls.cpp", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x34\00tls\00tls\00\001\000\001", null, !5, !6, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/tls.cpp]
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!7 = !{i32 2, !"Dwarf Version", i32 3}
+!8 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/PowerPC/tls.ll b/test/DebugInfo/PowerPC/tls.ll
index 22da193..be63136 100644
--- a/test/DebugInfo/PowerPC/tls.ll
+++ b/test/DebugInfo/PowerPC/tls.ll
@@ -17,13 +17,13 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!7, !8}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"tls.cpp", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x34\00tls\00tls\00\001\000\001", null, metadata !5, metadata !6, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.cpp]
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
+!0 = !{!"0x11\004\00clang version 3.4 \000\00\000\00\000", !1, !2, !2, !2, !3, !2} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"tls.cpp", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x34\00tls\00tls\00\001\000\001", null, !5, !6, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/tls.cpp]
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!7 = !{i32 2, !"Dwarf Version", i32 3}
 
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!8 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/Sparc/gnu-window-save.ll b/test/DebugInfo/Sparc/gnu-window-save.ll
index 66066dd..5bf3f02 100644
--- a/test/DebugInfo/Sparc/gnu-window-save.ll
+++ b/test/DebugInfo/Sparc/gnu-window-save.ll
@@ -55,17 +55,17 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 (http://llvm.org/git/clang.git 6a0714fee07fb7c4e32d3972b4fe2ce2f5678cf4) (llvm/ 672e88e934757f76d5c5e5258be41e7615094844)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/home/venkatra/work/benchmarks/test/hello/hello.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"hello.c", metadata !"/home/venkatra/work/benchmarks/test/hello"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\003\000\001\000\006\00256\000\004", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/venkatra/work/benchmarks/test/hello/hello.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.5 (http://llvm.org/git/clang.git 6a0714fee07fb7c4e32d3972b4fe2ce2f5678cf4) (llvm/ 672e88e934757f76d5c5e5258be41e7615094844)"}
-!12 = metadata !{i32 5, i32 0, metadata !4, null}
-!13 = metadata !{i32 6, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5 (http://llvm.org/git/clang.git 6a0714fee07fb7c4e32d3972b4fe2ce2f5678cf4) (llvm/ 672e88e934757f76d5c5e5258be41e7615094844)\000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/home/venkatra/work/benchmarks/test/hello/hello.c] [DW_LANG_C99]
+!1 = !{!"hello.c", !"/home/venkatra/work/benchmarks/test/hello"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\003\000\001\000\006\00256\000\004", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/home/venkatra/work/benchmarks/test/hello/hello.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.5 (http://llvm.org/git/clang.git 6a0714fee07fb7c4e32d3972b4fe2ce2f5678cf4) (llvm/ 672e88e934757f76d5c5e5258be41e7615094844)"}
+!12 = !MDLocation(line: 5, scope: !4)
+!13 = !MDLocation(line: 6, scope: !4)
diff --git a/test/DebugInfo/Sparc/processes-relocations.ll b/test/DebugInfo/Sparc/processes-relocations.ll
index 89cab9e..de44cc9 100644
--- a/test/DebugInfo/Sparc/processes-relocations.ll
+++ b/test/DebugInfo/Sparc/processes-relocations.ll
@@ -9,9 +9,9 @@
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"empty.c", metadata !"/a"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!5 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{i32 786449, !1, i32 12, !"clang version 3.6.0 ", i1 false, !"", i32 0, !2, !2, !2, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = !{!"empty.c", !"/a"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 1}
+!5 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/SystemZ/processes-relocations.ll b/test/DebugInfo/SystemZ/processes-relocations.ll
index 6f276f9..fd1adf6 100644
--- a/test/DebugInfo/SystemZ/processes-relocations.ll
+++ b/test/DebugInfo/SystemZ/processes-relocations.ll
@@ -7,9 +7,9 @@
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"empty.c", metadata !"/a"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!5 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{i32 786449, !1, i32 12, !"clang version 3.6.0 ", i1 false, !"", i32 0, !2, !2, !2, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = !{!"empty.c", !"/a"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 1}
+!5 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/SystemZ/variable-loc.ll b/test/DebugInfo/SystemZ/variable-loc.ll
index 13e2e60..9e5c6a9 100644
--- a/test/DebugInfo/SystemZ/variable-loc.ll
+++ b/test/DebugInfo/SystemZ/variable-loc.ll
@@ -35,8 +35,8 @@ entry:
   %main_arr = alloca [100 x i32], align 4
   %val = alloca i32, align 4
   store volatile i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{[100 x i32]* %main_arr}, metadata !17, metadata !{metadata !"0x102"}), !dbg !22
-  call void @llvm.dbg.declare(metadata !{i32* %val}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.declare(metadata [100 x i32]* %main_arr, metadata !17, metadata !{!"0x102"}), !dbg !22
+  call void @llvm.dbg.declare(metadata i32* %val, metadata !23, metadata !{!"0x102"}), !dbg !24
   %arraydecay = getelementptr inbounds [100 x i32]* %main_arr, i32 0, i32 0, !dbg !25
   call void @populate_array(i32* %arraydecay, i32 100), !dbg !25
   %arraydecay1 = getelementptr inbounds [100 x i32]* %main_arr, i32 0, i32 0, !dbg !26
@@ -52,31 +52,31 @@ declare i32 @printf(i8*, ...)
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!30}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.2 \000\00\000\00\000", metadata !29, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/home/timnor01/a64-trunk/build/simple.c] [DW_LANG_C99]
-!1 = metadata !{}
-!3 = metadata !{metadata !5, metadata !11, metadata !14}
-!5 = metadata !{metadata !"0x2e\00populate_array\00populate_array\00\004\000\001\000\006\00256\000\004", metadata !29, metadata !6, metadata !7, null, void (i32*, i32)* @populate_array, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 4] [def] [populate_array]
-!6 = metadata !{metadata !"0x29", metadata !29} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9, metadata !10}
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!11 = metadata !{metadata !"0x2e\00sum_array\00sum_array\00\009\000\001\000\006\00256\000\009", metadata !29, metadata !6, metadata !12, null, i32 (i32*, i32)* @sum_array, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 9] [def] [sum_array]
-!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!13 = metadata !{metadata !10, metadata !9, metadata !10}
-!14 = metadata !{metadata !"0x2e\00main\00main\00\0018\000\001\000\006\00256\000\0018", metadata !29, metadata !6, metadata !15, null, i32 ()* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{metadata !10}
-!17 = metadata !{metadata !"0x100\00main_arr\0019\000", metadata !18, metadata !6, metadata !19} ; [ DW_TAG_auto_variable ] [main_arr] [line 19]
-!18 = metadata !{metadata !"0xb\0018\0016\004", metadata !29, metadata !14} ; [ DW_TAG_lexical_block ] [/home/timnor01/a64-trunk/build/simple.c]
-!19 = metadata !{metadata !"0x1\00\000\003200\0032\000\000", null, null, metadata !10, metadata !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 3200, align 32, offset 0] [from int]
-!20 = metadata !{metadata !"0x21\000\0099"}       ; [ DW_TAG_subrange_type ] [0, 99]
-!22 = metadata !{i32 19, i32 7, metadata !18, null}
-!23 = metadata !{metadata !"0x100\00val\0020\000", metadata !18, metadata !6, metadata !10} ; [ DW_TAG_auto_variable ] [val] [line 20]
-!24 = metadata !{i32 20, i32 7, metadata !18, null}
-!25 = metadata !{i32 22, i32 3, metadata !18, null}
-!26 = metadata !{i32 23, i32 9, metadata !18, null}
-!27 = metadata !{i32 24, i32 3, metadata !18, null}
-!28 = metadata !{i32 26, i32 3, metadata !18, null}
-!29 = metadata !{metadata !"simple.c", metadata !"/home/timnor01/a64-trunk/build"}
-!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.2 \000\00\000\00\000", !29, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ] [/home/timnor01/a64-trunk/build/simple.c] [DW_LANG_C99]
+!1 = !{}
+!3 = !{!5, !11, !14}
+!5 = !{!"0x2e\00populate_array\00populate_array\00\004\000\001\000\006\00256\000\004", !29, !6, !7, null, void (i32*, i32)* @populate_array, null, null, !1} ; [ DW_TAG_subprogram ] [line 4] [def] [populate_array]
+!6 = !{!"0x29", !29} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9, !10}
+!9 = !{!"0xf\00\000\0064\0064\000\000", null, null, !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!10 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = !{!"0x2e\00sum_array\00sum_array\00\009\000\001\000\006\00256\000\009", !29, !6, !12, null, i32 (i32*, i32)* @sum_array, null, null, !1} ; [ DW_TAG_subprogram ] [line 9] [def] [sum_array]
+!12 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = !{!10, !9, !10}
+!14 = !{!"0x2e\00main\00main\00\0018\000\001\000\006\00256\000\0018", !29, !6, !15, null, i32 ()* @main, null, null, !1} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{!10}
+!17 = !{!"0x100\00main_arr\0019\000", !18, !6, !19} ; [ DW_TAG_auto_variable ] [main_arr] [line 19]
+!18 = !{!"0xb\0018\0016\004", !29, !14} ; [ DW_TAG_lexical_block ] [/home/timnor01/a64-trunk/build/simple.c]
+!19 = !{!"0x1\00\000\003200\0032\000\000", null, null, !10, !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 3200, align 32, offset 0] [from int]
+!20 = !{!"0x21\000\0099"}       ; [ DW_TAG_subrange_type ] [0, 99]
+!22 = !MDLocation(line: 19, column: 7, scope: !18)
+!23 = !{!"0x100\00val\0020\000", !18, !6, !10} ; [ DW_TAG_auto_variable ] [val] [line 20]
+!24 = !MDLocation(line: 20, column: 7, scope: !18)
+!25 = !MDLocation(line: 22, column: 3, scope: !18)
+!26 = !MDLocation(line: 23, column: 9, scope: !18)
+!27 = !MDLocation(line: 24, column: 3, scope: !18)
+!28 = !MDLocation(line: 26, column: 3, scope: !18)
+!29 = !{!"simple.c", !"/home/timnor01/a64-trunk/build"}
+!30 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/2010-04-13-PubType.ll b/test/DebugInfo/X86/2010-04-13-PubType.ll
index 0996725..0aec036 100644
--- a/test/DebugInfo/X86/2010-04-13-PubType.ll
+++ b/test/DebugInfo/X86/2010-04-13-PubType.ll
@@ -12,9 +12,9 @@ entry:
   %retval = alloca i32                            ; <i32*> [#uses=2]
   %0 = alloca i32                                 ; <i32*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.X** %x_addr}, metadata !0, metadata !{metadata !"0x102"}), !dbg !13
+  call void @llvm.dbg.declare(metadata %struct.X** %x_addr, metadata !0, metadata !{!"0x102"}), !dbg !13
   store %struct.X* %x, %struct.X** %x_addr
-  call void @llvm.dbg.declare(metadata !{%struct.Y** %y_addr}, metadata !14, metadata !{metadata !"0x102"}), !dbg !13
+  call void @llvm.dbg.declare(metadata %struct.Y** %y_addr, metadata !14, metadata !{!"0x102"}), !dbg !13
   store %struct.Y* %y, %struct.Y** %y_addr
   store i32 0, i32* %0, align 4, !dbg !13
   %1 = load i32* %0, align 4, !dbg !13            ; <i32> [#uses=1]
@@ -31,24 +31,24 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{metadata !"0x101\00x\007\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\007\000\001\000\006\000\000\007", metadata !18, metadata !2, metadata !4, null, i32 (%struct.X*, %struct.Y*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !18, metadata !19, metadata !19, metadata !17, null,  null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6, metadata !7, metadata !9}
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !18, metadata !2} ; [ DW_TAG_base_type ]
-!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !18, metadata !2, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{metadata !"0x13\00X\003\000\000\000\004\000", metadata !18, metadata !2, null, null, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 3, size 0, align 0, offset 0] [decl] [from ]
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !18, metadata !2, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{metadata !"0x13\00Y\004\0032\0032\000\000\000", metadata !18, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [Y] [line 4, size 32, align 32, offset 0] [def] [from ]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0xd\00x\005\0032\0032\000\000", metadata !18, metadata !10, metadata !6} ; [ DW_TAG_member ]
-!13 = metadata !{i32 7, i32 0, metadata !1, null}
-!14 = metadata !{metadata !"0x101\00y\007\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 7, i32 0, metadata !16, null}
-!16 = metadata !{metadata !"0xb\007\000\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
-!17 = metadata !{metadata !1}
-!18 = metadata !{metadata !"a.c", metadata !"/tmp/"}
-!19 = metadata !{i32 0}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00x\007\000", !1, !2, !7} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00foo\00foo\00foo\007\000\001\000\006\000\000\007", !18, !2, !4, null, i32 (%struct.X*, %struct.Y*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !18} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", !18, !19, !19, !17, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !18, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6, !7, !9}
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", !18, !2} ; [ DW_TAG_base_type ]
+!7 = !{!"0xf\00\000\0064\0064\000\000", !18, !2, !8} ; [ DW_TAG_pointer_type ]
+!8 = !{!"0x13\00X\003\000\000\000\004\000", !18, !2, null, null, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 3, size 0, align 0, offset 0] [decl] [from ]
+!9 = !{!"0xf\00\000\0064\0064\000\000", !18, !2, !10} ; [ DW_TAG_pointer_type ]
+!10 = !{!"0x13\00Y\004\0032\0032\000\000\000", !18, !2, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [Y] [line 4, size 32, align 32, offset 0] [def] [from ]
+!11 = !{!12}
+!12 = !{!"0xd\00x\005\0032\0032\000\000", !18, !10, !6} ; [ DW_TAG_member ]
+!13 = !MDLocation(line: 7, scope: !1)
+!14 = !{!"0x101\00y\007\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!15 = !MDLocation(line: 7, scope: !16)
+!16 = !{!"0xb\007\000\000", !18, !1} ; [ DW_TAG_lexical_block ]
+!17 = !{!1}
+!18 = !{!"a.c", !"/tmp/"}
+!19 = !{i32 0}
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/2010-08-10-DbgConstant.ll b/test/DebugInfo/X86/2010-08-10-DbgConstant.ll
deleted file mode 100644
index 7a1b4fe..0000000
--- a/test/DebugInfo/X86/2010-08-10-DbgConstant.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc  -mtriple=i686-linux -O0 -filetype=obj -o %t %s
-; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
-; CHECK: DW_TAG_constant
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-f]*}}] = "ro")
-
-define void @foo() nounwind ssp {
-entry:
-  call void @bar(i32 201), !dbg !8
-  ret void, !dbg !8
-}
-
-declare void @bar(i32)
-
-!llvm.dbg.cu = !{!2}
-!llvm.module.flags = !{!13}
-
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\003\000\001\000\006\000\000\003", metadata !12, metadata !1, metadata !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang 2.8\000\00\000\00\000", metadata !12, metadata !4, metadata !4, metadata !10, metadata !11,  metadata !14} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{metadata !"0x27\00ro\00ro\00ro\001\001\001", metadata !1, metadata !1, metadata !6, i32 201, null} ; [ DW_TAG_constant ]
-!6 = metadata !{metadata !"0x26\00\000\000\000\000\000", metadata !12, metadata !1, metadata !7} ; [ DW_TAG_const_type ]
-!7 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !12, metadata !1} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 3, i32 14, metadata !9, null}
-!9 = metadata !{metadata !"0xb\003\0012\000", metadata !12, metadata !0} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{metadata !0}
-!11 = metadata !{metadata !5}
-!12 = metadata !{metadata !"/tmp/l.c", metadata !"/Volumes/Lalgate/clean/D"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!14 = metadata !{}
diff --git a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
index 56a1a2b..d1beadc 100644
--- a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
+++ b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
@@ -7,7 +7,7 @@
 
 define i32 @f() nounwind {
   %LOC = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %LOC}, metadata !15, metadata !{metadata !"0x102"}), !dbg !17
+  call void @llvm.dbg.declare(metadata i32* %LOC, metadata !15, metadata !{!"0x102"}), !dbg !17
   %1 = load i32* @GLB, align 4, !dbg !18
   store i32 %1, i32* %LOC, align 4, !dbg !18
   %2 = load i32* @GLB, align 4, !dbg !19
@@ -19,22 +19,22 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk)\000\00\000\00\000", metadata !20, metadata !1, metadata !1, metadata !3, metadata !12,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\000\000\000", metadata !6, metadata !6, metadata !7, null, i32 ()* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [f]
-!6 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!12 = metadata !{metadata !14}
-!14 = metadata !{metadata !"0x34\00GLB\00GLB\00\001\000\001", null, metadata !6, metadata !9, i32* @GLB, null} ; [ DW_TAG_variable ]
-!15 = metadata !{metadata !"0x100\00LOC\004\000", metadata !16, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
-!16 = metadata !{metadata !"0xb\003\009\000", metadata !20, metadata !5} ; [ DW_TAG_lexical_block ]
-!17 = metadata !{i32 4, i32 9, metadata !16, null}
-!18 = metadata !{i32 4, i32 23, metadata !16, null}
-!19 = metadata !{i32 5, i32 5, metadata !16, null}
-!20 = metadata !{metadata !"test.c", metadata !"/work/llvm/vanilla/test/DebugInfo"}
+!0 = !{!"0x11\0012\00clang version 3.0 (trunk)\000\00\000\00\000", !20, !1, !1, !3, !12,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00f\00f\00\003\000\001\000\006\000\000\000", !6, !6, !7, null, i32 ()* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [f]
+!6 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!12 = !{!14}
+!14 = !{!"0x34\00GLB\00GLB\00\001\000\001", null, !6, !9, i32* @GLB, null} ; [ DW_TAG_variable ]
+!15 = !{!"0x100\00LOC\004\000", !16, !6, !9} ; [ DW_TAG_auto_variable ]
+!16 = !{!"0xb\003\009\000", !20, !5} ; [ DW_TAG_lexical_block ]
+!17 = !MDLocation(line: 4, column: 9, scope: !16)
+!18 = !MDLocation(line: 4, column: 23, scope: !16)
+!19 = !MDLocation(line: 5, column: 5, scope: !16)
+!20 = !{!"test.c", !"/work/llvm/vanilla/test/DebugInfo"}
 
 ; CHECK: DW_TAG_variable
 ; CHECK-NOT: DW_TAG
@@ -52,4 +52,4 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_decl_line [DW_FORM_data1]     (4)
 
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/2011-12-16-BadStructRef.ll b/test/DebugInfo/X86/2011-12-16-BadStructRef.ll
index 5b30480..4880fa4 100644
--- a/test/DebugInfo/X86/2011-12-16-BadStructRef.ll
+++ b/test/DebugInfo/X86/2011-12-16-BadStructRef.ll
@@ -15,10 +15,10 @@ entry:
   %myBar = alloca %struct.bar, align 8
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !49, metadata !{metadata !"0x102"}), !dbg !50
+  call void @llvm.dbg.declare(metadata i32* %argc.addr, metadata !49, metadata !{!"0x102"}), !dbg !50
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !51, metadata !{metadata !"0x102"}), !dbg !52
-  call void @llvm.dbg.declare(metadata !{%struct.bar* %myBar}, metadata !53, metadata !{metadata !"0x102"}), !dbg !55
+  call void @llvm.dbg.declare(metadata i8*** %argv.addr, metadata !51, metadata !{!"0x102"}), !dbg !52
+  call void @llvm.dbg.declare(metadata %struct.bar* %myBar, metadata !53, metadata !{!"0x102"}), !dbg !55
   call void @_ZN3barC1Ei(%struct.bar* %myBar, i32 1), !dbg !56
   ret i32 0, !dbg !57
 }
@@ -30,9 +30,9 @@ entry:
   %this.addr = alloca %struct.bar*, align 8
   %x.addr = alloca i32, align 4
   store %struct.bar* %this, %struct.bar** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.bar** %this.addr}, metadata !58, metadata !{metadata !"0x102"}), !dbg !59
+  call void @llvm.dbg.declare(metadata %struct.bar** %this.addr, metadata !58, metadata !{!"0x102"}), !dbg !59
   store i32 %x, i32* %x.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !60, metadata !{metadata !"0x102"}), !dbg !61
+  call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !60, metadata !{!"0x102"}), !dbg !61
   %this1 = load %struct.bar** %this.addr
   %0 = load i32* %x.addr, align 4, !dbg !62
   call void @_ZN3barC2Ei(%struct.bar* %this1, i32 %0), !dbg !62
@@ -44,9 +44,9 @@ entry:
   %this.addr = alloca %struct.bar*, align 8
   %x.addr = alloca i32, align 4
   store %struct.bar* %this, %struct.bar** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.bar** %this.addr}, metadata !63, metadata !{metadata !"0x102"}), !dbg !64
+  call void @llvm.dbg.declare(metadata %struct.bar** %this.addr, metadata !63, metadata !{!"0x102"}), !dbg !64
   store i32 %x, i32* %x.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !65, metadata !{metadata !"0x102"}), !dbg !66
+  call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !65, metadata !{!"0x102"}), !dbg !66
   %this1 = load %struct.bar** %this.addr
   %b = getelementptr inbounds %struct.bar* %this1, i32 0, i32 0, !dbg !67
   %0 = load i32* %x.addr, align 4, !dbg !67
@@ -62,9 +62,9 @@ entry:
   %this.addr = alloca %struct.baz*, align 8
   %a.addr = alloca i32, align 4
   store %struct.baz* %this, %struct.baz** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.baz** %this.addr}, metadata !70, metadata !{metadata !"0x102"}), !dbg !71
+  call void @llvm.dbg.declare(metadata %struct.baz** %this.addr, metadata !70, metadata !{!"0x102"}), !dbg !71
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !72, metadata !{metadata !"0x102"}), !dbg !73
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !72, metadata !{!"0x102"}), !dbg !73
   %this1 = load %struct.baz** %this.addr
   %0 = load i32* %a.addr, align 4, !dbg !74
   call void @_ZN3bazC2Ei(%struct.baz* %this1, i32 %0), !dbg !74
@@ -76,9 +76,9 @@ entry:
   %this.addr = alloca %struct.baz*, align 8
   %a.addr = alloca i32, align 4
   store %struct.baz* %this, %struct.baz** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.baz** %this.addr}, metadata !75, metadata !{metadata !"0x102"}), !dbg !76
+  call void @llvm.dbg.declare(metadata %struct.baz** %this.addr, metadata !75, metadata !{!"0x102"}), !dbg !76
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !77, metadata !{metadata !"0x102"}), !dbg !78
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !77, metadata !{!"0x102"}), !dbg !78
   %this1 = load %struct.baz** %this.addr
   %h = getelementptr inbounds %struct.baz* %this1, i32 0, i32 0, !dbg !79
   %0 = load i32* %a.addr, align 4, !dbg !79
@@ -89,78 +89,78 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!83}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.1 (trunk 146596)\000\00\000\00\000", metadata !82, metadata !1, metadata !3, metadata !27, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5, metadata !9}
-!5 = metadata !{metadata !"0x2\00bar\009\00128\0064\000\000\000", metadata !82, null, null, metadata !7, null, null, null} ; [ DW_TAG_class_type ] [bar] [line 9, size 128, align 64, offset 0] [def] [from ]
-!6 = metadata !{metadata !"0x29", metadata !82} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !8, metadata !19, metadata !21}
-!8 = metadata !{metadata !"0xd\00b\0011\0032\0032\000\000", metadata !82, metadata !5, metadata !9} ; [ DW_TAG_member ]
-!9 = metadata !{metadata !"0x2\00baz\003\0032\0032\000\000\000", metadata !82, null, null, metadata !10, null, null, null} ; [ DW_TAG_class_type ] [baz] [line 3, size 32, align 32, offset 0] [def] [from ]
-!10 = metadata !{metadata !11, metadata !13}
-!11 = metadata !{metadata !"0xd\00h\005\0032\0032\000\000", metadata !82, metadata !9, metadata !12} ; [ DW_TAG_member ]
-!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!13 = metadata !{metadata !"0x2e\00baz\00baz\00\006\000\000\000\006\00256\000\000", metadata !82, metadata !9, metadata !14, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{null, metadata !16, metadata !12}
-!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !9} ; [ DW_TAG_pointer_type ]
-!19 = metadata !{metadata !"0xd\00b_ref\0012\0064\0064\0064\000", metadata !82, metadata !5, metadata !20} ; [ DW_TAG_member ]
-!20 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_reference_type ]
-!21 = metadata !{metadata !"0x2e\00bar\00bar\00\0013\000\000\000\006\00256\000\000", metadata !82, metadata !5, metadata !22, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
-!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!23 = metadata !{null, metadata !24, metadata !12}
-!24 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !5} ; [ DW_TAG_pointer_type ]
-!27 = metadata !{metadata !29, metadata !37, metadata !40, metadata !43, metadata !46}
-!29 = metadata !{metadata !"0x2e\00main\00main\00\0017\000\001\000\006\00256\000\000", metadata !82, metadata !6, metadata !30, null, i32 (i32, i8**)* @main, null, null, null} ; [ DW_TAG_subprogram ] [line 17] [def] [scope 0] [main]
-!30 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !31, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!31 = metadata !{metadata !12, metadata !12, metadata !32}
-!32 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !33} ; [ DW_TAG_pointer_type ]
-!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !34} ; [ DW_TAG_pointer_type ]
-!34 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
-!35 = metadata !{metadata !36}
-!36 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!37 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3barC1Ei\0013\000\001\000\006\00256\000\000", metadata !82, null, metadata !22, null, void (%struct.bar*, i32)* @_ZN3barC1Ei, null, metadata !21, null} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 0] [bar]
-!38 = metadata !{metadata !39}
-!39 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!40 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3barC2Ei\0013\000\001\000\006\00256\000\000", metadata !82, null, metadata !22, null, void (%struct.bar*, i32)* @_ZN3barC2Ei, null, metadata !21, null} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 0] [bar]
-!41 = metadata !{metadata !42}
-!42 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!43 = metadata !{metadata !"0x2e\00baz\00baz\00_ZN3bazC1Ei\006\000\001\000\006\00256\000\000", metadata !82, null, metadata !14, null, void (%struct.baz*, i32)* @_ZN3bazC1Ei, null, metadata !13, null} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [baz]
-!44 = metadata !{metadata !45}
-!45 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!46 = metadata !{metadata !"0x2e\00baz\00baz\00_ZN3bazC2Ei\006\000\001\000\006\00256\000\000", metadata !82, null, metadata !14, null, void (%struct.baz*, i32)* @_ZN3bazC2Ei, null, metadata !13, null} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [baz]
-!49 = metadata !{metadata !"0x101\00argc\0016777232\000", metadata !29, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ]
-!50 = metadata !{i32 16, i32 14, metadata !29, null}
-!51 = metadata !{metadata !"0x101\00argv\0033554448\000", metadata !29, metadata !6, metadata !32} ; [ DW_TAG_arg_variable ]
-!52 = metadata !{i32 16, i32 27, metadata !29, null}
-!53 = metadata !{metadata !"0x100\00myBar\0018\000", metadata !54, metadata !6, metadata !5} ; [ DW_TAG_auto_variable ]
-!54 = metadata !{metadata !"0xb\0017\001\000", metadata !82, metadata !29} ; [ DW_TAG_lexical_block ]
-!55 = metadata !{i32 18, i32 9, metadata !54, null}
-!56 = metadata !{i32 18, i32 17, metadata !54, null}
-!57 = metadata !{i32 19, i32 5, metadata !54, null}
-!58 = metadata !{metadata !"0x101\00this\0016777229\0064", metadata !37, metadata !6, metadata !24} ; [ DW_TAG_arg_variable ]
-!59 = metadata !{i32 13, i32 5, metadata !37, null}
-!60 = metadata !{metadata !"0x101\00x\0033554445\000", metadata !37, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ]
-!61 = metadata !{i32 13, i32 13, metadata !37, null}
-!62 = metadata !{i32 13, i32 34, metadata !37, null}
-!63 = metadata !{metadata !"0x101\00this\0016777229\0064", metadata !40, metadata !6, metadata !24} ; [ DW_TAG_arg_variable ]
-!64 = metadata !{i32 13, i32 5, metadata !40, null}
-!65 = metadata !{metadata !"0x101\00x\0033554445\000", metadata !40, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ]
-!66 = metadata !{i32 13, i32 13, metadata !40, null}
-!67 = metadata !{i32 13, i32 33, metadata !40, null}
-!68 = metadata !{i32 13, i32 34, metadata !69, null}
-!69 = metadata !{metadata !"0xb\0013\0033\001", metadata !82, metadata !40} ; [ DW_TAG_lexical_block ]
-!70 = metadata !{metadata !"0x101\00this\0016777222\0064", metadata !43, metadata !6, metadata !16} ; [ DW_TAG_arg_variable ]
-!71 = metadata !{i32 6, i32 5, metadata !43, null}
-!72 = metadata !{metadata !"0x101\00a\0033554438\000", metadata !43, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ]
-!73 = metadata !{i32 6, i32 13, metadata !43, null}
-!74 = metadata !{i32 6, i32 24, metadata !43, null}
-!75 = metadata !{metadata !"0x101\00this\0016777222\0064", metadata !46, metadata !6, metadata !16} ; [ DW_TAG_arg_variable ]
-!76 = metadata !{i32 6, i32 5, metadata !46, null}
-!77 = metadata !{metadata !"0x101\00a\0033554438\000", metadata !46, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ]
-!78 = metadata !{i32 6, i32 13, metadata !46, null}
-!79 = metadata !{i32 6, i32 23, metadata !46, null}
-!80 = metadata !{i32 6, i32 24, metadata !81, null}
-!81 = metadata !{metadata !"0xb\006\0023\002", metadata !82, metadata !46} ; [ DW_TAG_lexical_block ]
-!82 = metadata !{metadata !"main.cpp", metadata !"/Users/echristo/tmp/bad-struct-ref"}
-!83 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.1 (trunk 146596)\000\00\000\00\000", !82, !1, !3, !27, !1,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5, !9}
+!5 = !{!"0x2\00bar\009\00128\0064\000\000\000", !82, null, null, !7, null, null, null} ; [ DW_TAG_class_type ] [bar] [line 9, size 128, align 64, offset 0] [def] [from ]
+!6 = !{!"0x29", !82} ; [ DW_TAG_file_type ]
+!7 = !{!8, !19, !21}
+!8 = !{!"0xd\00b\0011\0032\0032\000\000", !82, !5, !9} ; [ DW_TAG_member ]
+!9 = !{!"0x2\00baz\003\0032\0032\000\000\000", !82, null, null, !10, null, null, null} ; [ DW_TAG_class_type ] [baz] [line 3, size 32, align 32, offset 0] [def] [from ]
+!10 = !{!11, !13}
+!11 = !{!"0xd\00h\005\0032\0032\000\000", !82, !9, !12} ; [ DW_TAG_member ]
+!12 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!13 = !{!"0x2e\00baz\00baz\00\006\000\000\000\006\00256\000\000", !82, !9, !14, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
+!14 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{null, !16, !12}
+!16 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !9} ; [ DW_TAG_pointer_type ]
+!19 = !{!"0xd\00b_ref\0012\0064\0064\0064\000", !82, !5, !20} ; [ DW_TAG_member ]
+!20 = !{!"0x10\00\000\000\000\000\000", null, null, !9} ; [ DW_TAG_reference_type ]
+!21 = !{!"0x2e\00bar\00bar\00\0013\000\000\000\006\00256\000\000", !82, !5, !22, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
+!22 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = !{null, !24, !12}
+!24 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !5} ; [ DW_TAG_pointer_type ]
+!27 = !{!29, !37, !40, !43, !46}
+!29 = !{!"0x2e\00main\00main\00\0017\000\001\000\006\00256\000\000", !82, !6, !30, null, i32 (i32, i8**)* @main, null, null, null} ; [ DW_TAG_subprogram ] [line 17] [def] [scope 0] [main]
+!30 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !31, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!31 = !{!12, !12, !32}
+!32 = !{!"0xf\00\000\0064\0064\000\000", null, null, !33} ; [ DW_TAG_pointer_type ]
+!33 = !{!"0xf\00\000\0064\0064\000\000", null, null, !34} ; [ DW_TAG_pointer_type ]
+!34 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
+!35 = !{!36}
+!36 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!37 = !{!"0x2e\00bar\00bar\00_ZN3barC1Ei\0013\000\001\000\006\00256\000\000", !82, null, !22, null, void (%struct.bar*, i32)* @_ZN3barC1Ei, null, !21, null} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 0] [bar]
+!38 = !{!39}
+!39 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!40 = !{!"0x2e\00bar\00bar\00_ZN3barC2Ei\0013\000\001\000\006\00256\000\000", !82, null, !22, null, void (%struct.bar*, i32)* @_ZN3barC2Ei, null, !21, null} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 0] [bar]
+!41 = !{!42}
+!42 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!43 = !{!"0x2e\00baz\00baz\00_ZN3bazC1Ei\006\000\001\000\006\00256\000\000", !82, null, !14, null, void (%struct.baz*, i32)* @_ZN3bazC1Ei, null, !13, null} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [baz]
+!44 = !{!45}
+!45 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!46 = !{!"0x2e\00baz\00baz\00_ZN3bazC2Ei\006\000\001\000\006\00256\000\000", !82, null, !14, null, void (%struct.baz*, i32)* @_ZN3bazC2Ei, null, !13, null} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [baz]
+!49 = !{!"0x101\00argc\0016777232\000", !29, !6, !12} ; [ DW_TAG_arg_variable ]
+!50 = !MDLocation(line: 16, column: 14, scope: !29)
+!51 = !{!"0x101\00argv\0033554448\000", !29, !6, !32} ; [ DW_TAG_arg_variable ]
+!52 = !MDLocation(line: 16, column: 27, scope: !29)
+!53 = !{!"0x100\00myBar\0018\000", !54, !6, !5} ; [ DW_TAG_auto_variable ]
+!54 = !{!"0xb\0017\001\000", !82, !29} ; [ DW_TAG_lexical_block ]
+!55 = !MDLocation(line: 18, column: 9, scope: !54)
+!56 = !MDLocation(line: 18, column: 17, scope: !54)
+!57 = !MDLocation(line: 19, column: 5, scope: !54)
+!58 = !{!"0x101\00this\0016777229\0064", !37, !6, !24} ; [ DW_TAG_arg_variable ]
+!59 = !MDLocation(line: 13, column: 5, scope: !37)
+!60 = !{!"0x101\00x\0033554445\000", !37, !6, !12} ; [ DW_TAG_arg_variable ]
+!61 = !MDLocation(line: 13, column: 13, scope: !37)
+!62 = !MDLocation(line: 13, column: 34, scope: !37)
+!63 = !{!"0x101\00this\0016777229\0064", !40, !6, !24} ; [ DW_TAG_arg_variable ]
+!64 = !MDLocation(line: 13, column: 5, scope: !40)
+!65 = !{!"0x101\00x\0033554445\000", !40, !6, !12} ; [ DW_TAG_arg_variable ]
+!66 = !MDLocation(line: 13, column: 13, scope: !40)
+!67 = !MDLocation(line: 13, column: 33, scope: !40)
+!68 = !MDLocation(line: 13, column: 34, scope: !69)
+!69 = !{!"0xb\0013\0033\001", !82, !40} ; [ DW_TAG_lexical_block ]
+!70 = !{!"0x101\00this\0016777222\0064", !43, !6, !16} ; [ DW_TAG_arg_variable ]
+!71 = !MDLocation(line: 6, column: 5, scope: !43)
+!72 = !{!"0x101\00a\0033554438\000", !43, !6, !12} ; [ DW_TAG_arg_variable ]
+!73 = !MDLocation(line: 6, column: 13, scope: !43)
+!74 = !MDLocation(line: 6, column: 24, scope: !43)
+!75 = !{!"0x101\00this\0016777222\0064", !46, !6, !16} ; [ DW_TAG_arg_variable ]
+!76 = !MDLocation(line: 6, column: 5, scope: !46)
+!77 = !{!"0x101\00a\0033554438\000", !46, !6, !12} ; [ DW_TAG_arg_variable ]
+!78 = !MDLocation(line: 6, column: 13, scope: !46)
+!79 = !MDLocation(line: 6, column: 23, scope: !46)
+!80 = !MDLocation(line: 6, column: 24, scope: !81)
+!81 = !{!"0xb\006\0023\002", !82, !46} ; [ DW_TAG_lexical_block ]
+!82 = !{!"main.cpp", !"/Users/echristo/tmp/bad-struct-ref"}
+!83 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/DW_AT_byte_size.ll b/test/DebugInfo/X86/DW_AT_byte_size.ll
index 2ce5ed5..8b4f561 100644
--- a/test/DebugInfo/X86/DW_AT_byte_size.ll
+++ b/test/DebugInfo/X86/DW_AT_byte_size.ll
@@ -14,7 +14,7 @@ define i32 @_Z3fooP1A(%struct.A* %a) nounwind uwtable ssp {
 entry:
   %a.addr = alloca %struct.A*, align 8
   store %struct.A* %a, %struct.A** %a.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.A** %a.addr}, metadata !16, metadata !{metadata !"0x102"}), !dbg !17
+  call void @llvm.dbg.declare(metadata %struct.A** %a.addr, metadata !16, metadata !{!"0x102"}), !dbg !17
   %0 = load %struct.A** %a.addr, align 8, !dbg !18
   %b = getelementptr inbounds %struct.A* %0, i32 0, i32 0, !dbg !18
   %1 = load i32* %b, align 4, !dbg !18
@@ -26,21 +26,21 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.1 (trunk 150996)\000\00\000\00\000", metadata !20, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooP1A\003\000\001\000\006\00256\000\003", metadata !20, metadata !6, metadata !7, null, i32 (%struct.A*)* @_Z3fooP1A, null, null, null} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !10}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !"0x2\00A\001\0032\0032\000\000\000", metadata !20, null, null, metadata !12, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0xd\00b\001\0032\0032\000\000", metadata !20, metadata !11, metadata !9} ; [ DW_TAG_member ]
-!16 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 3, i32 13, metadata !5, null}
-!18 = metadata !{i32 4, i32 3, metadata !19, null}
-!19 = metadata !{metadata !"0xb\003\0016\000", metadata !20, metadata !5} ; [ DW_TAG_lexical_block ]
-!20 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.1 (trunk 150996)\000\00\000\00\000", !20, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00foo\00foo\00_Z3fooP1A\003\000\001\000\006\00256\000\003", !20, !6, !7, null, i32 (%struct.A*)* @_Z3fooP1A, null, null, null} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !10}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ]
+!11 = !{!"0x2\00A\001\0032\0032\000\000\000", !20, null, null, !12, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!12 = !{!13}
+!13 = !{!"0xd\00b\001\0032\0032\000\000", !20, !11, !9} ; [ DW_TAG_member ]
+!16 = !{!"0x101\00a\0016777219\000", !5, !6, !10} ; [ DW_TAG_arg_variable ]
+!17 = !MDLocation(line: 3, column: 13, scope: !5)
+!18 = !MDLocation(line: 4, column: 3, scope: !19)
+!19 = !{!"0xb\003\0016\000", !20, !5} ; [ DW_TAG_lexical_block ]
+!20 = !{!"foo.cpp", !"/Users/echristo"}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/DW_AT_linkage_name.ll b/test/DebugInfo/X86/DW_AT_linkage_name.ll
index ca3b85f..e395e06 100644
--- a/test/DebugInfo/X86/DW_AT_linkage_name.ll
+++ b/test/DebugInfo/X86/DW_AT_linkage_name.ll
@@ -38,7 +38,7 @@ define void @_ZN1AD2Ev(%struct.A* %this) unnamed_addr #0 align 2 {
 entry:
   %this.addr = alloca %struct.A*, align 8
   store %struct.A* %this, %struct.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.A** %this.addr}, metadata !26, metadata !{metadata !"0x102"}), !dbg !28
+  call void @llvm.dbg.declare(metadata %struct.A** %this.addr, metadata !26, metadata !{!"0x102"}), !dbg !28
   %this1 = load %struct.A** %this.addr
   ret void, !dbg !29
 }
@@ -51,7 +51,7 @@ define void @_ZN1AD1Ev(%struct.A* %this) unnamed_addr #0 align 2 {
 entry:
   %this.addr = alloca %struct.A*, align 8
   store %struct.A* %this, %struct.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.A** %this.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
+  call void @llvm.dbg.declare(metadata %struct.A** %this.addr, metadata !30, metadata !{!"0x102"}), !dbg !31
   %this1 = load %struct.A** %this.addr
   call void @_ZN1AD2Ev(%struct.A* %this1), !dbg !32
   ret void, !dbg !33
@@ -61,7 +61,7 @@ entry:
 define void @_Z3foov() #2 {
 entry:
   %a = alloca %struct.A, align 1
-  call void @llvm.dbg.declare(metadata !{%struct.A* %a}, metadata !34, metadata !{metadata !"0x102"}), !dbg !35
+  call void @llvm.dbg.declare(metadata %struct.A* %a, metadata !34, metadata !{!"0x102"}), !dbg !35
   call void @_ZN1AC1Ei(%struct.A* %a, i32 1), !dbg !35
   call void @_ZN1AD1Ev(%struct.A* %a), !dbg !36
   ret void, !dbg !36
@@ -77,40 +77,40 @@ attributes #2 = { ssp uwtable }
 !llvm.module.flags = !{!23, !24}
 !llvm.ident = !{!25}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !16, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [linkage-name.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"linkage-name.cpp", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00A\001\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !12}
-!6 = metadata !{metadata !"0x2e\00A\00A\00\002\000\000\000\006\00256\000\002", metadata !1, metadata !"_ZTS1A", metadata !7, null, null, null, i32 0, metadata !11} ; [ DW_TAG_subprogram ] [line 2] [A]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9, metadata !10}
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!11 = metadata !{i32 786468}
-!12 = metadata !{metadata !"0x2e\00~A\00~A\00\003\000\000\000\006\00256\000\003", metadata !1, metadata !"_ZTS1A", metadata !13, null, null, null, i32 0, metadata !15} ; [ DW_TAG_subprogram ] [line 3] [~A]
-!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{null, metadata !9}
-!15 = metadata !{i32 786468}
-!16 = metadata !{metadata !17, metadata !18, metadata !19}
-!17 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD2Ev\006\000\001\000\006\00256\000\006", metadata !1, metadata !"_ZTS1A", metadata !13, null, void (%struct.A*)* @_ZN1AD2Ev, null, metadata !12, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~A]
-!18 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD1Ev\006\000\001\000\006\00256\000\006", metadata !1, metadata !"_ZTS1A", metadata !13, null, void (%struct.A*)* @_ZN1AD1Ev, null, metadata !12, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~A]
-!19 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\0010\000\001\000\006\00256\000\0010", metadata !1, metadata !20, metadata !21, null, void ()* @_Z3foov, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [foo]
-!20 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [linkage-name.cpp]
-!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!22 = metadata !{null}
-!23 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!25 = metadata !{metadata !"clang version 3.5.0 "}
-!26 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !17, null, metadata !27} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!27 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
-!28 = metadata !{i32 0, i32 0, metadata !17, null}
-!29 = metadata !{i32 8, i32 0, metadata !17, null}
-!30 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !18, null, metadata !27} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!31 = metadata !{i32 0, i32 0, metadata !18, null}
-!32 = metadata !{i32 6, i32 0, metadata !18, null}
-!33 = metadata !{i32 8, i32 0, metadata !18, null}
-!34 = metadata !{metadata !"0x100\00a\0011\000", metadata !19, metadata !20, metadata !"_ZTS1A"} ; [ DW_TAG_auto_variable ] [a] [line 11]
-!35 = metadata !{i32 11, i32 0, metadata !19, null}
-!36 = metadata !{i32 12, i32 0, metadata !19, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !3, !16, !2, !2} ; [ DW_TAG_compile_unit ] [linkage-name.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"linkage-name.cpp", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00A\001\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6, !12}
+!6 = !{!"0x2e\00A\00A\00\002\000\000\000\006\00256\000\002", !1, !"_ZTS1A", !7, null, null, null, i32 0, !11} ; [ DW_TAG_subprogram ] [line 2] [A]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9, !10}
+!9 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!10 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = !{i32 786468}
+!12 = !{!"0x2e\00~A\00~A\00\003\000\000\000\006\00256\000\003", !1, !"_ZTS1A", !13, null, null, null, i32 0, !15} ; [ DW_TAG_subprogram ] [line 3] [~A]
+!13 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{null, !9}
+!15 = !{i32 786468}
+!16 = !{!17, !18, !19}
+!17 = !{!"0x2e\00~A\00~A\00_ZN1AD2Ev\006\000\001\000\006\00256\000\006", !1, !"_ZTS1A", !13, null, void (%struct.A*)* @_ZN1AD2Ev, null, !12, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~A]
+!18 = !{!"0x2e\00~A\00~A\00_ZN1AD1Ev\006\000\001\000\006\00256\000\006", !1, !"_ZTS1A", !13, null, void (%struct.A*)* @_ZN1AD1Ev, null, !12, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~A]
+!19 = !{!"0x2e\00foo\00foo\00_Z3foov\0010\000\001\000\006\00256\000\0010", !1, !20, !21, null, void ()* @_Z3foov, null, null, !2} ; [ DW_TAG_subprogram ] [line 10] [def] [foo]
+!20 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [linkage-name.cpp]
+!21 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = !{null}
+!23 = !{i32 2, !"Dwarf Version", i32 2}
+!24 = !{i32 1, !"Debug Info Version", i32 2}
+!25 = !{!"clang version 3.5.0 "}
+!26 = !{!"0x101\00this\0016777216\001088", !17, null, !27} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!27 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!28 = !MDLocation(line: 0, scope: !17)
+!29 = !MDLocation(line: 8, scope: !17)
+!30 = !{!"0x101\00this\0016777216\001088", !18, null, !27} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!31 = !MDLocation(line: 0, scope: !18)
+!32 = !MDLocation(line: 6, scope: !18)
+!33 = !MDLocation(line: 8, scope: !18)
+!34 = !{!"0x100\00a\0011\000", !19, !20, !"_ZTS1A"} ; [ DW_TAG_auto_variable ] [a] [line 11]
+!35 = !MDLocation(line: 11, scope: !19)
+!36 = !MDLocation(line: 12, scope: !19)
diff --git a/test/DebugInfo/X86/DW_AT_location-reference.ll b/test/DebugInfo/X86/DW_AT_location-reference.ll
index 874ecd6..a5b5700 100644
--- a/test/DebugInfo/X86/DW_AT_location-reference.ll
+++ b/test/DebugInfo/X86/DW_AT_location-reference.ll
@@ -64,7 +64,7 @@ define void @f() nounwind {
 entry:
   %call = tail call i32 @g(i32 0, i32 0) nounwind, !dbg !8
   store i32 %call, i32* @a, align 4, !dbg !8
-  tail call void @llvm.dbg.value(metadata !12, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !13
+  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !5, metadata !{!"0x102"}), !dbg !13
   br label %while.body
 
 while.body:                                       ; preds = %entry, %while.body
@@ -75,10 +75,10 @@ while.body:                                       ; preds = %entry, %while.body
   br i1 %tobool, label %while.end, label %while.body, !dbg !14
 
 while.end:                                        ; preds = %while.body
-  tail call void @llvm.dbg.value(metadata !{i32 %mul}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !14
+  tail call void @llvm.dbg.value(metadata i32 %mul, i64 0, metadata !5, metadata !{!"0x102"}), !dbg !14
   %call4 = tail call i32 @g(i32 %mul, i32 0) nounwind, !dbg !15
   store i32 %call4, i32* @a, align 4, !dbg !15
-  tail call void @llvm.dbg.value(metadata !16, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !17
+  tail call void @llvm.dbg.value(metadata i32 2, i64 0, metadata !5, metadata !{!"0x102"}), !dbg !17
   br label %while.body9
 
 while.body9:                                      ; preds = %while.end, %while.body9
@@ -89,7 +89,7 @@ while.body9:                                      ; preds = %while.end, %while.b
   br i1 %tobool8, label %while.end13, label %while.body9, !dbg !18
 
 while.end13:                                      ; preds = %while.body9
-  tail call void @llvm.dbg.value(metadata !{i32 %mul12}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 %mul12, i64 0, metadata !5, metadata !{!"0x102"}), !dbg !18
   %call15 = tail call i32 @g(i32 0, i32 %mul12) nounwind, !dbg !19
   store i32 %call15, i32* @a, align 4, !dbg !19
   ret void, !dbg !20
@@ -102,25 +102,25 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!24}
 
-!0 = metadata !{metadata !"0x2e\00f\00f\00\004\000\001\000\006\00256\001\004", metadata !23, metadata !1, metadata !3, null, void ()* @f, null, null, metadata !22} ; [ DW_TAG_subprogram ] [line 4] [def] [f]
-!1 = metadata !{metadata !"0x29", metadata !23} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk)\001\00\000\00\001", metadata !23, metadata !4, metadata !4, metadata !21, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !23, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{metadata !"0x100\00x\005\000", metadata !6, metadata !1, metadata !7} ; [ DW_TAG_auto_variable ]
-!6 = metadata !{metadata !"0xb\004\0014\000", metadata !23, metadata !0} ; [ DW_TAG_lexical_block ]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 6, i32 3, metadata !6, null}
-!12 = metadata !{i32 1}
-!13 = metadata !{i32 7, i32 3, metadata !6, null}
-!14 = metadata !{i32 8, i32 3, metadata !6, null}
-!15 = metadata !{i32 9, i32 3, metadata !6, null}
-!16 = metadata !{i32 2}
-!17 = metadata !{i32 10, i32 3, metadata !6, null}
-!18 = metadata !{i32 11, i32 3, metadata !6, null}
-!19 = metadata !{i32 12, i32 3, metadata !6, null}
-!20 = metadata !{i32 13, i32 1, metadata !6, null}
-!21 = metadata !{metadata !0}
-!22 = metadata !{metadata !5}
-!23 = metadata !{metadata !"simple.c", metadata !"/home/rengol01/temp/tests/dwarf/relocation"}
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00f\00f\00\004\000\001\000\006\00256\001\004", !23, !1, !3, null, void ()* @f, null, null, !22} ; [ DW_TAG_subprogram ] [line 4] [def] [f]
+!1 = !{!"0x29", !23} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 3.0 (trunk)\001\00\000\00\001", !23, !4, !4, !21, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !23, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!5 = !{!"0x100\00x\005\000", !6, !1, !7} ; [ DW_TAG_auto_variable ]
+!6 = !{!"0xb\004\0014\000", !23, !0} ; [ DW_TAG_lexical_block ]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!8 = !MDLocation(line: 6, column: 3, scope: !6)
+!12 = !{i32 1}
+!13 = !MDLocation(line: 7, column: 3, scope: !6)
+!14 = !MDLocation(line: 8, column: 3, scope: !6)
+!15 = !MDLocation(line: 9, column: 3, scope: !6)
+!16 = !{i32 2}
+!17 = !MDLocation(line: 10, column: 3, scope: !6)
+!18 = !MDLocation(line: 11, column: 3, scope: !6)
+!19 = !MDLocation(line: 12, column: 3, scope: !6)
+!20 = !MDLocation(line: 13, column: 1, scope: !6)
+!21 = !{!0}
+!22 = !{!5}
+!23 = !{!"simple.c", !"/home/rengol01/temp/tests/dwarf/relocation"}
+!24 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/DW_AT_object_pointer.ll b/test/DebugInfo/X86/DW_AT_object_pointer.ll
index ca4beb2..8cff0b8 100644
--- a/test/DebugInfo/X86/DW_AT_object_pointer.ll
+++ b/test/DebugInfo/X86/DW_AT_object_pointer.ll
@@ -17,8 +17,8 @@ entry:
   %.addr = alloca i32, align 4
   %a = alloca %class.A, align 4
   store i32 %0, i32* %.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %.addr}, metadata !36, metadata !{metadata !"0x102"}), !dbg !35
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !21, metadata !{metadata !"0x102"}), !dbg !23
+  call void @llvm.dbg.declare(metadata i32* %.addr, metadata !36, metadata !{!"0x102"}), !dbg !35
+  call void @llvm.dbg.declare(metadata %class.A* %a, metadata !21, metadata !{!"0x102"}), !dbg !23
   call void @_ZN1AC1Ev(%class.A* %a), !dbg !24
   %m_a = getelementptr inbounds %class.A* %a, i32 0, i32 0, !dbg !25
   %1 = load i32* %m_a, align 4, !dbg !25
@@ -31,7 +31,7 @@ define linkonce_odr void @_ZN1AC1Ev(%class.A* %this) unnamed_addr nounwind uwtab
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !26, metadata !{metadata !"0x102"}), !dbg !28
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !26, metadata !{!"0x102"}), !dbg !28
   %this1 = load %class.A** %this.addr
   call void @_ZN1AC2Ev(%class.A* %this1), !dbg !29
   ret void, !dbg !29
@@ -41,7 +41,7 @@ define linkonce_odr void @_ZN1AC2Ev(%class.A* %this) unnamed_addr nounwind uwtab
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !30, metadata !{!"0x102"}), !dbg !31
   %this1 = load %class.A** %this.addr
   %m_a = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !32
   store i32 0, i32* %m_a, align 4, !dbg !32
@@ -51,40 +51,40 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!38}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 163586) (llvm/trunk 163570)\000\00\000\00\000", metadata !37, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/Users/echristo/debug-tests/bar.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{}
-!3 = metadata !{metadata !5, metadata !10, metadata !20}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi\007\000\001\000\006\00256\000\007", metadata !6, metadata !6, metadata !7, null, i32 (i32)* @_Z3fooi, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 7] [def] [foo]
-!6 = metadata !{metadata !"0x29", metadata !37} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC1Ev\003\000\001\000\006\00256\000\003", metadata !6, null, metadata !11, null, void (%class.A*)* @_ZN1AC1Ev, null, metadata !17, metadata !1} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{null, metadata !13}
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!14 = metadata !{metadata !"0x2\00A\001\0032\0032\000\000\000", metadata !37, null, null, metadata !15, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
-!15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{metadata !"0xd\00m_a\004\0032\0032\000\000", metadata !37, metadata !14, metadata !9} ; [ DW_TAG_member ] [m_a] [line 4, size 32, align 32, offset 0] [from int]
-!17 = metadata !{metadata !"0x2e\00A\00A\00\003\000\000\000\006\00256\000\003", metadata !6, metadata !14, metadata !11, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 3] [A]
-!18 = metadata !{metadata !19}
-!19 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!20 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC2Ev\003\000\001\000\006\00256\000\003", metadata !6, null, metadata !11, null, void (%class.A*)* @_ZN1AC2Ev, null, metadata !17, metadata !1} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
-!21 = metadata !{metadata !"0x100\00a\008\000", metadata !22, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [a] [line 8]
-!22 = metadata !{metadata !"0xb\007\0011\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
-!23 = metadata !{i32 8, i32 5, metadata !22, null}
-!24 = metadata !{i32 8, i32 6, metadata !22, null}
-!25 = metadata !{i32 9, i32 3, metadata !22, null}
-!26 = metadata !{metadata !"0x101\00this\0016777219\001088", metadata !10, metadata !6, metadata !27} ; [ DW_TAG_arg_variable ] [this] [line 3]
-!27 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!28 = metadata !{i32 3, i32 3, metadata !10, null}
-!29 = metadata !{i32 3, i32 18, metadata !10, null}
-!30 = metadata !{metadata !"0x101\00this\0016777219\001088", metadata !20, metadata !6, metadata !27} ; [ DW_TAG_arg_variable ] [this] [line 3]
-!31 = metadata !{i32 3, i32 3, metadata !20, null}
-!32 = metadata !{i32 3, i32 9, metadata !33, null}
-!33 = metadata !{metadata !"0xb\003\007\001", metadata !6, metadata !20} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
-!34 = metadata !{i32 3, i32 18, metadata !33, null}
-!35 = metadata !{i32 7, i32 0, metadata !5, null}
-!36 = metadata !{metadata !"0x101\00\0016777223\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 7]
-!37 = metadata !{metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests"}
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.2 (trunk 163586) (llvm/trunk 163570)\000\00\000\00\000", !37, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ] [/Users/echristo/debug-tests/bar.cpp] [DW_LANG_C_plus_plus]
+!1 = !{}
+!3 = !{!5, !10, !20}
+!5 = !{!"0x2e\00foo\00foo\00_Z3fooi\007\000\001\000\006\00256\000\007", !6, !6, !7, null, i32 (i32)* @_Z3fooi, null, null, !1} ; [ DW_TAG_subprogram ] [line 7] [def] [foo]
+!6 = !{!"0x29", !37} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x2e\00A\00A\00_ZN1AC1Ev\003\000\001\000\006\00256\000\003", !6, null, !11, null, void (%class.A*)* @_ZN1AC1Ev, null, !17, !1} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{null, !13}
+!13 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!14 = !{!"0x2\00A\001\0032\0032\000\000\000", !37, null, null, !15, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!15 = !{!16, !17}
+!16 = !{!"0xd\00m_a\004\0032\0032\000\000", !37, !14, !9} ; [ DW_TAG_member ] [m_a] [line 4, size 32, align 32, offset 0] [from int]
+!17 = !{!"0x2e\00A\00A\00\003\000\000\000\006\00256\000\003", !6, !14, !11, null, null, null, i32 0, !18} ; [ DW_TAG_subprogram ] [line 3] [A]
+!18 = !{!19}
+!19 = !{!"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = !{!"0x2e\00A\00A\00_ZN1AC2Ev\003\000\001\000\006\00256\000\003", !6, null, !11, null, void (%class.A*)* @_ZN1AC2Ev, null, !17, !1} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!21 = !{!"0x100\00a\008\000", !22, !6, !14} ; [ DW_TAG_auto_variable ] [a] [line 8]
+!22 = !{!"0xb\007\0011\000", !6, !5} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
+!23 = !MDLocation(line: 8, column: 5, scope: !22)
+!24 = !MDLocation(line: 8, column: 6, scope: !22)
+!25 = !MDLocation(line: 9, column: 3, scope: !22)
+!26 = !{!"0x101\00this\0016777219\001088", !10, !6, !27} ; [ DW_TAG_arg_variable ] [this] [line 3]
+!27 = !{!"0xf\00\000\0064\0064\000\000", null, null, !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!28 = !MDLocation(line: 3, column: 3, scope: !10)
+!29 = !MDLocation(line: 3, column: 18, scope: !10)
+!30 = !{!"0x101\00this\0016777219\001088", !20, !6, !27} ; [ DW_TAG_arg_variable ] [this] [line 3]
+!31 = !MDLocation(line: 3, column: 3, scope: !20)
+!32 = !MDLocation(line: 3, column: 9, scope: !33)
+!33 = !{!"0xb\003\007\001", !6, !20} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
+!34 = !MDLocation(line: 3, column: 18, scope: !33)
+!35 = !MDLocation(line: 7, scope: !5)
+!36 = !{!"0x101\00\0016777223\000", !5, !6, !9} ; [ DW_TAG_arg_variable ] [line 7]
+!37 = !{!"bar.cpp", !"/Users/echristo/debug-tests"}
+!38 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/DW_AT_specification.ll b/test/DebugInfo/X86/DW_AT_specification.ll
index 93aa47e..ab2075a 100644
--- a/test/DebugInfo/X86/DW_AT_specification.ll
+++ b/test/DebugInfo/X86/DW_AT_specification.ll
@@ -20,23 +20,23 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!28}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.0 ()\000\00\000\00\000", metadata !27, metadata !1, metadata !1, metadata !3, metadata !18,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEv\004\000\001\000\006\00256\000\004", metadata !6, null, metadata !7, null, void ()* @_ZN3foo3barEv, null, metadata !11, null} ; [ DW_TAG_subprogram ] [line 4] [def] [bar]
-!6 = metadata !{metadata !"0x29", metadata !27} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9}
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{metadata !"0x13\00foo\001\000\000\000\004\000", metadata !27, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 0, align 0, offset 0] [decl] [from ]
-!11 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEv\002\000\000\000\006\00256\000\002", metadata !6, metadata !12, metadata !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
-!12 = metadata !{metadata !"0x2\00foo\001\008\008\000\000\000", metadata !27, null, null, metadata !13, null, null} ; [ DW_TAG_class_type ]
-!13 = metadata !{metadata !11}
-!18 = metadata !{metadata !20}
-!20 = metadata !{metadata !"0x34\00x\00x\00\005\001\001", metadata !5, metadata !6, metadata !21, i32* @_ZZN3foo3barEvE1x, null} ; [ DW_TAG_variable ]
-!21 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !22} ; [ DW_TAG_const_type ]
-!22 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!25 = metadata !{i32 6, i32 1, metadata !26, null}
-!26 = metadata !{metadata !"0xb\004\0017\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{metadata !"nsNativeAppSupportBase.ii", metadata !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/toolkit/library"}
-!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.0 ()\000\00\000\00\000", !27, !1, !1, !3, !18,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00bar\00bar\00_ZN3foo3barEv\004\000\001\000\006\00256\000\004", !6, null, !7, null, void ()* @_ZN3foo3barEv, null, !11, null} ; [ DW_TAG_subprogram ] [line 4] [def] [bar]
+!6 = !{!"0x29", !27} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9}
+!9 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !10} ; [ DW_TAG_pointer_type ]
+!10 = !{!"0x13\00foo\001\000\000\000\004\000", !27, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!11 = !{!"0x2e\00bar\00bar\00_ZN3foo3barEv\002\000\000\000\006\00256\000\002", !6, !12, !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
+!12 = !{!"0x2\00foo\001\008\008\000\000\000", !27, null, null, !13, null, null} ; [ DW_TAG_class_type ]
+!13 = !{!11}
+!18 = !{!20}
+!20 = !{!"0x34\00x\00x\00\005\001\001", !5, !6, !21, i32* @_ZZN3foo3barEvE1x, null} ; [ DW_TAG_variable ]
+!21 = !{!"0x26\00\000\000\000\000\000", null, null, !22} ; [ DW_TAG_const_type ]
+!22 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!25 = !MDLocation(line: 6, column: 1, scope: !26)
+!26 = !{!"0xb\004\0017\000", !6, !5} ; [ DW_TAG_lexical_block ]
+!27 = !{!"nsNativeAppSupportBase.ii", !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/toolkit/library"}
+!28 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/DW_AT_stmt_list_sec_offset.ll b/test/DebugInfo/X86/DW_AT_stmt_list_sec_offset.ll
index d54774d..39c3340 100644
--- a/test/DebugInfo/X86/DW_AT_stmt_list_sec_offset.ll
+++ b/test/DebugInfo/X86/DW_AT_stmt_list_sec_offset.ll
@@ -30,15 +30,15 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test.c", metadata !"C:\5CProjects"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\000\000\002", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 3, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
+!1 = !{!"test.c", !"C:\5CProjects"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\001\000\001\000\006\000\000\002", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !MDLocation(line: 3, scope: !4)
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/DW_TAG_friend.ll b/test/DebugInfo/X86/DW_TAG_friend.ll
index 23d5c81..ffa032f 100644
--- a/test/DebugInfo/X86/DW_TAG_friend.ll
+++ b/test/DebugInfo/X86/DW_TAG_friend.ll
@@ -18,31 +18,31 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.1 (trunk 153413) (llvm/trunk 153428)\000\00\000\00\000", metadata !28, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5, metadata !17}
-!5 = metadata !{metadata !"0x34\00a\00a\00\0010\000\001", null, metadata !6, metadata !7, %class.A* @a, null} ; [ DW_TAG_variable ]
-!6 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x2\00A\001\0032\0032\000\000\000", metadata !28, null, null, metadata !8, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
-!8 = metadata !{metadata !9, metadata !11}
-!9 = metadata !{metadata !"0xd\00a\002\0032\0032\000\001", metadata !28, metadata !7, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!11 = metadata !{metadata !"0x2e\00A\00A\00\001\000\000\000\006\00320\000\001", metadata !6, metadata !7, metadata !12, null, null, null, i32 0, metadata !15} ; [ DW_TAG_subprogram ]
-!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!13 = metadata !{null, metadata !14}
-!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !7} ; [ DW_TAG_pointer_type ]
-!15 = metadata !{metadata !16}
-!16 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!17 = metadata !{metadata !"0x34\00b\00b\00\0011\000\001", null, metadata !6, metadata !18, %class.B* @b, null} ; [ DW_TAG_variable ]
-!18 = metadata !{metadata !"0x2\00B\005\0032\0032\000\000\000", metadata !28, null, null, metadata !19, null, null, null} ; [ DW_TAG_class_type ] [B] [line 5, size 32, align 32, offset 0] [def] [from ]
-!19 = metadata !{metadata !20, metadata !21, metadata !27}
-!20 = metadata !{metadata !"0xd\00b\007\0032\0032\000\001", metadata !28, metadata !18, metadata !10} ; [ DW_TAG_member ]
-!21 = metadata !{metadata !"0x2e\00B\00B\00\005\000\000\000\006\00320\000\005", metadata !6, metadata !18, metadata !22, null, null, null, i32 0, metadata !25} ; [ DW_TAG_subprogram ]
-!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!23 = metadata !{null, metadata !24}
-!24 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !18} ; [ DW_TAG_pointer_type ]
-!25 = metadata !{metadata !26}
-!26 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!27 = metadata !{metadata !"0x2a\00\000\000\000\000\000", metadata !18, null, metadata !7} ; [ DW_TAG_friend ]
-!28 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo/tmp"}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.1 (trunk 153413) (llvm/trunk 153428)\000\00\000\00\000", !28, !1, !1, !1, !3,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5, !17}
+!5 = !{!"0x34\00a\00a\00\0010\000\001", null, !6, !7, %class.A* @a, null} ; [ DW_TAG_variable ]
+!6 = !{!"0x29", !28} ; [ DW_TAG_file_type ]
+!7 = !{!"0x2\00A\001\0032\0032\000\000\000", !28, null, null, !8, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!8 = !{!9, !11}
+!9 = !{!"0xd\00a\002\0032\0032\000\001", !28, !7, !10} ; [ DW_TAG_member ]
+!10 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!11 = !{!"0x2e\00A\00A\00\001\000\000\000\006\00320\000\001", !6, !7, !12, null, null, null, i32 0, !15} ; [ DW_TAG_subprogram ]
+!12 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = !{null, !14}
+!14 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !7} ; [ DW_TAG_pointer_type ]
+!15 = !{!16}
+!16 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!17 = !{!"0x34\00b\00b\00\0011\000\001", null, !6, !18, %class.B* @b, null} ; [ DW_TAG_variable ]
+!18 = !{!"0x2\00B\005\0032\0032\000\000\000", !28, null, null, !19, null, null, null} ; [ DW_TAG_class_type ] [B] [line 5, size 32, align 32, offset 0] [def] [from ]
+!19 = !{!20, !21, !27}
+!20 = !{!"0xd\00b\007\0032\0032\000\001", !28, !18, !10} ; [ DW_TAG_member ]
+!21 = !{!"0x2e\00B\00B\00\005\000\000\000\006\00320\000\005", !6, !18, !22, null, null, null, i32 0, !25} ; [ DW_TAG_subprogram ]
+!22 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = !{null, !24}
+!24 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !18} ; [ DW_TAG_pointer_type ]
+!25 = !{!26}
+!26 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!27 = !{!"0x2a\00\000\000\000\000\000", !18, null, !7} ; [ DW_TAG_friend ]
+!28 = !{!"foo.cpp", !"/Users/echristo/tmp"}
+!29 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/aligned_stack_var.ll b/test/DebugInfo/X86/aligned_stack_var.ll
index 9dea6b7..32c6f24 100644
--- a/test/DebugInfo/X86/aligned_stack_var.ll
+++ b/test/DebugInfo/X86/aligned_stack_var.ll
@@ -18,7 +18,7 @@
 define void @_Z3runv() nounwind uwtable {
 entry:
   %x = alloca i32, align 32
-  call void @llvm.dbg.declare(metadata !{i32* %x}, metadata !9, metadata !{metadata !"0x102"}), !dbg !12
+  call void @llvm.dbg.declare(metadata i32* %x, metadata !9, metadata !{!"0x102"}), !dbg !12
   ret void, !dbg !13
 }
 
@@ -27,17 +27,17 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!15}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 155696:155697) (llvm/trunk 155696)\000\00\000\00\000", metadata !14, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00run\00run\00_Z3runv\001\000\001\000\006\00256\000\001", metadata !14, metadata !6, metadata !7, null, void ()* @_Z3runv, null, null, metadata !1} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !14} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!9 = metadata !{metadata !"0x100\00x\002\000", metadata !10, metadata !6, metadata !11} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{metadata !"0xb\001\0012\000", metadata !14, metadata !5} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!12 = metadata !{i32 2, i32 7, metadata !10, null}
-!13 = metadata !{i32 3, i32 1, metadata !10, null}
-!14 = metadata !{metadata !"test.cc", metadata !"/home/samsonov/debuginfo"}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.2 (trunk 155696:155697) (llvm/trunk 155696)\000\00\000\00\000", !14, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00run\00run\00_Z3runv\001\000\001\000\006\00256\000\001", !14, !6, !7, null, void ()* @_Z3runv, null, null, !1} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !14} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{!"0x100\00x\002\000", !10, !6, !11} ; [ DW_TAG_auto_variable ]
+!10 = !{!"0xb\001\0012\000", !14, !5} ; [ DW_TAG_lexical_block ]
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!12 = !MDLocation(line: 2, column: 7, scope: !10)
+!13 = !MDLocation(line: 3, column: 1, scope: !10)
+!14 = !{!"test.cc", !"/home/samsonov/debuginfo"}
+!15 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/arange.ll b/test/DebugInfo/X86/arange.ll
index d773e87..97ab3c5 100644
--- a/test/DebugInfo/X86/arange.ll
+++ b/test/DebugInfo/X86/arange.ll
@@ -29,18 +29,18 @@
 !llvm.module.flags = !{!12, !13}
 !llvm.ident = !{!14}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !2, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/simple.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"simple.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00foo<&i>\003\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !5, metadata !"_ZTS3fooIXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [foo<&i>] [line 3, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x30\00x\000\000", null, metadata !7, i32* @i, null} ; [ DW_TAG_template_value_parameter ]
-!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x34\00f\00f\00\006\000\001", null, metadata !11, metadata !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
-!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/simple.cpp]
-!12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!14 = metadata !{metadata !"clang version 3.5 "}
+!0 = !{!"0x11\004\00clang version 3.5 \000\00\000\00\000", !1, !2, !3, !2, !9, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/simple.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"simple.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00foo<&i>\003\008\008\000\000\000", !1, null, null, !2, null, !5, !"_ZTS3fooIXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [foo<&i>] [line 3, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0x30\00x\000\000", null, !7, i32* @i, null} ; [ DW_TAG_template_value_parameter ]
+!7 = !{!"0xf\00\000\0064\0064\000\000", null, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10}
+!10 = !{!"0x34\00f\00f\00\006\000\001", null, !11, !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
+!11 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/simple.cpp]
+!12 = !{i32 2, !"Dwarf Version", i32 4}
+!13 = !{i32 1, !"Debug Info Version", i32 2}
+!14 = !{!"clang version 3.5 "}
diff --git a/test/DebugInfo/X86/arguments.ll b/test/DebugInfo/X86/arguments.ll
index 779db48..2bc56b4 100644
--- a/test/DebugInfo/X86/arguments.ll
+++ b/test/DebugInfo/X86/arguments.ll
@@ -31,8 +31,8 @@
 ; Function Attrs: nounwind uwtable
 define void @_Z4func3fooS_(%struct.foo* %f, %struct.foo* %g) #0 {
 entry:
-  call void @llvm.dbg.declare(metadata !{%struct.foo* %f}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
-  call void @llvm.dbg.declare(metadata !{%struct.foo* %g}, metadata !21, metadata !{metadata !"0x102"}), !dbg !20
+  call void @llvm.dbg.declare(metadata %struct.foo* %f, metadata !19, metadata !{!"0x102"}), !dbg !20
+  call void @llvm.dbg.declare(metadata %struct.foo* %g, metadata !21, metadata !{!"0x102"}), !dbg !20
   %i = getelementptr inbounds %struct.foo* %f, i32 0, i32 0, !dbg !22
   %0 = load i32* %i, align 4, !dbg !22
   %inc = add nsw i32 %0, 1, !dbg !22
@@ -49,28 +49,28 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!24}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"scratch.cpp", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00func\00func\00_Z4func3fooS_\006\000\001\000\006\00256\000\006", metadata !1, metadata !5, metadata !6, null, void (%struct.foo*, %struct.foo*)* @_Z4func3fooS_, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x13\00foo\001\0032\0032\000\000\000", metadata !1, null, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 32, align 32, offset 0] [def] [from ]
-!9 = metadata !{metadata !10, metadata !12}
-!10 = metadata !{metadata !"0xd\00i\003\0032\0032\000\000", metadata !1, metadata !8, metadata !11} ; [ DW_TAG_member ] [i] [line 3, size 32, align 32, offset 0] [from int]
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\000\000\006\00256\000\002", metadata !1, metadata !8, metadata !13, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 2] [foo]
-!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{null, metadata !15, metadata !16}
-!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
-!16 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !17} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo]
-!18 = metadata !{i32 786468}
-!19 = metadata !{metadata !"0x101\00f\0016777222\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [f] [line 6]
-!20 = metadata !{i32 6, i32 0, metadata !4, null}
-!21 = metadata !{metadata !"0x101\00g\0033554438\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [g] [line 6]
-!22 = metadata !{i32 7, i32 0, metadata !4, null}
-!23 = metadata !{i32 8, i32 0, metadata !4, null}
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"scratch.cpp", !"/usr/local/google/home/blaikie/dev/scratch"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00func\00func\00_Z4func3fooS_\006\000\001\000\006\00256\000\006", !1, !5, !6, null, void (%struct.foo*, %struct.foo*)* @_Z4func3fooS_, null, null, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8, !8}
+!8 = !{!"0x13\00foo\001\0032\0032\000\000\000", !1, null, null, !9, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 32, align 32, offset 0] [def] [from ]
+!9 = !{!10, !12}
+!10 = !{!"0xd\00i\003\0032\0032\000\000", !1, !8, !11} ; [ DW_TAG_member ] [i] [line 3, size 32, align 32, offset 0] [from int]
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{!"0x2e\00foo\00foo\00\002\000\000\000\006\00256\000\002", !1, !8, !13, null, null, null, i32 0, !18} ; [ DW_TAG_subprogram ] [line 2] [foo]
+!13 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{null, !15, !16}
+!15 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
+!16 = !{!"0x10\00\000\000\000\000\000", null, null, !17} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = !{!"0x26\00\000\000\000\000\000", null, null, !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo]
+!18 = !{i32 786468}
+!19 = !{!"0x101\00f\0016777222\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [f] [line 6]
+!20 = !MDLocation(line: 6, scope: !4)
+!21 = !{!"0x101\00g\0033554438\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [g] [line 6]
+!22 = !MDLocation(line: 7, scope: !4)
+!23 = !MDLocation(line: 8, scope: !4)
+!24 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/array.ll b/test/DebugInfo/X86/array.ll
index 3fbfb1d..e39be5a 100644
--- a/test/DebugInfo/X86/array.ll
+++ b/test/DebugInfo/X86/array.ll
@@ -25,7 +25,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 
 ; Function Attrs: nounwind ssp uwtable
 define void @f(i32* nocapture %p) #0 {
-  tail call void @llvm.dbg.value(metadata !{i32* %p}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !28
+  tail call void @llvm.dbg.value(metadata i32* %p, i64 0, metadata !11, metadata !{!"0x102"}), !dbg !28
   store i32 42, i32* %p, align 4, !dbg !29, !tbaa !30
   ret void, !dbg !34
 }
@@ -33,15 +33,15 @@ define void @f(i32* nocapture %p) #0 {
 ; Function Attrs: nounwind ssp uwtable
 define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 {
   %array = alloca [4 x i32], align 16
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !35
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !20, metadata !{metadata !"0x102"}), !dbg !35
-  tail call void @llvm.dbg.value(metadata !{[4 x i32]* %array}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !36
+  tail call void @llvm.dbg.value(metadata i32 %argc, i64 0, metadata !19, metadata !{!"0x102"}), !dbg !35
+  tail call void @llvm.dbg.value(metadata i8** %argv, i64 0, metadata !20, metadata !{!"0x102"}), !dbg !35
+  tail call void @llvm.dbg.value(metadata [4 x i32]* %array, i64 0, metadata !21, metadata !{!"0x102"}), !dbg !36
   %1 = bitcast [4 x i32]* %array to i8*, !dbg !36
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ([4 x i32]* @main.array to i8*), i64 16, i32 16, i1 false), !dbg !36
-  tail call void @llvm.dbg.value(metadata !{[4 x i32]* %array}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !36
+  tail call void @llvm.dbg.value(metadata [4 x i32]* %array, i64 0, metadata !21, metadata !{!"0x102"}), !dbg !36
   %2 = getelementptr inbounds [4 x i32]* %array, i64 0, i64 0, !dbg !37
   call void @f(i32* %2), !dbg !37
-  tail call void @llvm.dbg.value(metadata !{[4 x i32]* %array}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !36
+  tail call void @llvm.dbg.value(metadata [4 x i32]* %array, i64 0, metadata !21, metadata !{!"0x102"}), !dbg !36
   %3 = load i32* %2, align 16, !dbg !38, !tbaa !30
   ret i32 %3, !dbg !38
 }
@@ -60,42 +60,42 @@ attributes #2 = { nounwind readnone }
 !llvm.module.flags = !{!25, !26}
 !llvm.ident = !{!27}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/array.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"array.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !12}
-!4 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*)* @f, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/array.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8}
-!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0x101\00p\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [p] [line 1]
-!12 = metadata !{metadata !"0x2e\00main\00main\00\005\000\001\000\006\00256\001\005", metadata !1, metadata !5, metadata !13, null, i32 (i32, i8**)* @main, null, null, metadata !18} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
-!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{metadata !9, metadata !9, metadata !15}
-!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!17 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!18 = metadata !{metadata !19, metadata !20, metadata !21}
-!19 = metadata !{metadata !"0x101\00argc\0016777221\000", metadata !12, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [argc] [line 5]
-!20 = metadata !{metadata !"0x101\00argv\0033554437\000", metadata !12, metadata !5, metadata !15} ; [ DW_TAG_arg_variable ] [argv] [line 5]
-!21 = metadata !{metadata !"0x100\00array\006\000", metadata !12, metadata !5, metadata !22} ; [ DW_TAG_auto_variable ] [array] [line 6]
-!22 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, null, metadata !9, metadata !23, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from int]
-!23 = metadata !{metadata !24}
-!24 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
-!25 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!27 = metadata !{metadata !"clang version 3.5.0 "}
-!28 = metadata !{i32 1, i32 0, metadata !4, null}
-!29 = metadata !{i32 2, i32 0, metadata !4, null}
-!30 = metadata !{metadata !31, metadata !31, i64 0}
-!31 = metadata !{metadata !"int", metadata !32, i64 0}
-!32 = metadata !{metadata !"omnipotent char", metadata !33, i64 0}
-!33 = metadata !{metadata !"Simple C/C++ TBAA"}
-!34 = metadata !{i32 3, i32 0, metadata !4, null}
-!35 = metadata !{i32 5, i32 0, metadata !12, null}
-!36 = metadata !{i32 6, i32 0, metadata !12, null}
-!37 = metadata !{i32 7, i32 0, metadata !12, null}
-!38 = metadata !{i32 8, i32 0, metadata !12, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/array.c] [DW_LANG_C99]
+!1 = !{!"array.c", !""}
+!2 = !{}
+!3 = !{!4, !12}
+!4 = !{!"0x2e\00f\00f\00\001\000\001\000\006\00256\001\001", !1, !5, !6, null, void (i32*)* @f, null, null, !10} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/array.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8}
+!8 = !{!"0xf\00\000\0064\0064\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!11}
+!11 = !{!"0x101\00p\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [p] [line 1]
+!12 = !{!"0x2e\00main\00main\00\005\000\001\000\006\00256\001\005", !1, !5, !13, null, i32 (i32, i8**)* @main, null, null, !18} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
+!13 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{!9, !9, !15}
+!15 = !{!"0xf\00\000\0064\0064\000\000", null, null, !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!16 = !{!"0xf\00\000\0064\0064\000\000", null, null, !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!17 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!18 = !{!19, !20, !21}
+!19 = !{!"0x101\00argc\0016777221\000", !12, !5, !9} ; [ DW_TAG_arg_variable ] [argc] [line 5]
+!20 = !{!"0x101\00argv\0033554437\000", !12, !5, !15} ; [ DW_TAG_arg_variable ] [argv] [line 5]
+!21 = !{!"0x100\00array\006\000", !12, !5, !22} ; [ DW_TAG_auto_variable ] [array] [line 6]
+!22 = !{!"0x1\00\000\00128\0032\000\000", null, null, !9, !23, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from int]
+!23 = !{!24}
+!24 = !{!"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
+!25 = !{i32 2, !"Dwarf Version", i32 2}
+!26 = !{i32 1, !"Debug Info Version", i32 2}
+!27 = !{!"clang version 3.5.0 "}
+!28 = !MDLocation(line: 1, scope: !4)
+!29 = !MDLocation(line: 2, scope: !4)
+!30 = !{!31, !31, i64 0}
+!31 = !{!"int", !32, i64 0}
+!32 = !{!"omnipotent char", !33, i64 0}
+!33 = !{!"Simple C/C++ TBAA"}
+!34 = !MDLocation(line: 3, scope: !4)
+!35 = !MDLocation(line: 5, scope: !12)
+!36 = !MDLocation(line: 6, scope: !12)
+!37 = !MDLocation(line: 7, scope: !12)
+!38 = !MDLocation(line: 8, scope: !12)
diff --git a/test/DebugInfo/X86/array2.ll b/test/DebugInfo/X86/array2.ll
index e2d42e8..63c9256 100644
--- a/test/DebugInfo/X86/array2.ll
+++ b/test/DebugInfo/X86/array2.ll
@@ -13,12 +13,12 @@
 ; }
 ;
 ; RUN: opt %s -O2 -S -o - | FileCheck %s
-; Test that we do not lower dbg.declares for arrays.
+; Test that we correctly lower dbg.declares for arrays.
 ;
 ; CHECK: define i32 @main
-; CHECK: call void @llvm.dbg.value
-; CHECK: call void @llvm.dbg.value
-; CHECK: call void @llvm.dbg.declare
+; CHECK: call void @llvm.dbg.value(metadata i32 42, i64 0, metadata ![[ARRAY:[0-9]+]], metadata ![[EXPR:[0-9]+]])
+; CHECK: ![[ARRAY]] = {{.*}}; [ DW_TAG_auto_variable ] [array] [line 6]
+; CHECK: ![[EXPR]] = {{.*}}; [ DW_TAG_expression ] [DW_OP_bit_piece offset=0, size=32]
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
 
@@ -29,7 +29,7 @@ define void @f(i32* %p) #0 {
 entry:
   %p.addr = alloca i32*, align 8
   store i32* %p, i32** %p.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i32** %p.addr}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
+  call void @llvm.dbg.declare(metadata i32** %p.addr, metadata !19, metadata !{!"0x102"}), !dbg !20
   %0 = load i32** %p.addr, align 8, !dbg !21
   %arrayidx = getelementptr inbounds i32* %0, i64 0, !dbg !21
   store i32 42, i32* %arrayidx, align 4, !dbg !21
@@ -48,10 +48,10 @@ entry:
   %array = alloca [4 x i32], align 16
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.declare(metadata i32* %argc.addr, metadata !23, metadata !{!"0x102"}), !dbg !24
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !25, metadata !{metadata !"0x102"}), !dbg !24
-  call void @llvm.dbg.declare(metadata !{[4 x i32]* %array}, metadata !26, metadata !{metadata !"0x102"}), !dbg !30
+  call void @llvm.dbg.declare(metadata i8*** %argv.addr, metadata !25, metadata !{!"0x102"}), !dbg !24
+  call void @llvm.dbg.declare(metadata [4 x i32]* %array, metadata !26, metadata !{!"0x102"}), !dbg !30
   %0 = bitcast [4 x i32]* %array to i8*, !dbg !30
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([4 x i32]* @main.array to i8*), i64 16, i32 16, i1 false), !dbg !30
   %arraydecay = getelementptr inbounds [4 x i32]* %array, i32 0, i32 0, !dbg !31
@@ -72,36 +72,36 @@ attributes #2 = { nounwind }
 !llvm.module.flags = !{!16, !17}
 !llvm.ident = !{!18}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [array.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"array.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !10}
-!4 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i32*)* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [array.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8}
-!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x2e\00main\00main\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !11, null, i32 (i32, i8**)* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !9, metadata !9, metadata !13}
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !15} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!15 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!16 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!18 = metadata !{metadata !"clang version 3.5.0 "}
-!19 = metadata !{metadata !"0x101\00p\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [p] [line 1]
-!20 = metadata !{i32 1, i32 0, metadata !4, null}
-!21 = metadata !{i32 2, i32 0, metadata !4, null}
-!22 = metadata !{i32 3, i32 0, metadata !4, null}
-!23 = metadata !{metadata !"0x101\00argc\0016777221\000", metadata !10, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [argc] [line 5]
-!24 = metadata !{i32 5, i32 0, metadata !10, null}
-!25 = metadata !{metadata !"0x101\00argv\0033554437\000", metadata !10, metadata !5, metadata !13} ; [ DW_TAG_arg_variable ] [argv] [line 5]
-!26 = metadata !{metadata !"0x100\00array\006\000", metadata !10, metadata !5, metadata !27} ; [ DW_TAG_auto_variable ] [array] [line 6]
-!27 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, null, metadata !9, metadata !28, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from int]
-!28 = metadata !{metadata !29}
-!29 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
-!30 = metadata !{i32 6, i32 0, metadata !10, null}
-!31 = metadata !{i32 7, i32 0, metadata !10, null}
-!32 = metadata !{i32 8, i32 0, metadata !10, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [array.c] [DW_LANG_C99]
+!1 = !{!"array.c", !""}
+!2 = !{}
+!3 = !{!4, !10}
+!4 = !{!"0x2e\00f\00f\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, void (i32*)* @f, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [array.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8}
+!8 = !{!"0xf\00\000\0064\0064\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x2e\00main\00main\00\005\000\001\000\006\00256\000\005", !1, !5, !11, null, i32 (i32, i8**)* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!9, !9, !13}
+!13 = !{!"0xf\00\000\0064\0064\000\000", null, null, !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!14 = !{!"0xf\00\000\0064\0064\000\000", null, null, !15} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!15 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!16 = !{i32 2, !"Dwarf Version", i32 2}
+!17 = !{i32 1, !"Debug Info Version", i32 2}
+!18 = !{!"clang version 3.5.0 "}
+!19 = !{!"0x101\00p\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [p] [line 1]
+!20 = !MDLocation(line: 1, scope: !4)
+!21 = !MDLocation(line: 2, scope: !4)
+!22 = !MDLocation(line: 3, scope: !4)
+!23 = !{!"0x101\00argc\0016777221\000", !10, !5, !9} ; [ DW_TAG_arg_variable ] [argc] [line 5]
+!24 = !MDLocation(line: 5, scope: !10)
+!25 = !{!"0x101\00argv\0033554437\000", !10, !5, !13} ; [ DW_TAG_arg_variable ] [argv] [line 5]
+!26 = !{!"0x100\00array\006\000", !10, !5, !27} ; [ DW_TAG_auto_variable ] [array] [line 6]
+!27 = !{!"0x1\00\000\00128\0032\000\000", null, null, !9, !28, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from int]
+!28 = !{!29}
+!29 = !{!"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
+!30 = !MDLocation(line: 6, scope: !10)
+!31 = !MDLocation(line: 7, scope: !10)
+!32 = !MDLocation(line: 8, scope: !10)
diff --git a/test/DebugInfo/X86/asm-macro-line-number.s b/test/DebugInfo/X86/asm-macro-line-number.s
new file mode 100644
index 0000000..0f51dbb
--- /dev/null
+++ b/test/DebugInfo/X86/asm-macro-line-number.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -g -triple i686-linux-gnu -filetype asm -o - %s | FileCheck %s
+
+# 1 "reduced.S"
+# 1 "<built-in>" 1
+# 1 "reduced.S" 2
+
+ .macro return arg
+  movl %eax, \arg
+  retl
+ .endm
+
+function:
+ return 0
+
+# CHECK: .file 2 "reduced.S"
+# CHECK: .loc 2 8 0
+# CHECK: movl %eax, 0
+# CHECK: .loc 2 8 0
+# CHECK: retl
+
diff --git a/test/DebugInfo/X86/block-capture.ll b/test/DebugInfo/X86/block-capture.ll
index e59aa05..51d575f 100644
--- a/test/DebugInfo/X86/block-capture.ll
+++ b/test/DebugInfo/X86/block-capture.ll
@@ -1,133 +1,131 @@
-; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
+; RUN: llc %s -o %t -filetype=obj
 ; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
-; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -dwarf-version=3
-; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s -check-prefix=DWARF3
 
 ; Checks that we emit debug info for the block variable declare.
 ; CHECK: DW_TAG_subprogram
 ; CHECK: DW_TAG_variable
-; CHECK: DW_AT_location [DW_FORM_sec_offset]
-; CHECK: DW_AT_name {{.*}} "block"
+;                                              fbreg +8, deref, +32
+; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (<0x05> 91 08 06 23 20 )
+; CHECK-NEXT: DW_AT_name {{.*}} "block"
 
-; DWARF3: DW_TAG_subprogram
-; DWARF3: DW_TAG_variable
-; DWARF3: DW_AT_location [DW_FORM_data4]
-; DWARF3: DW_AT_name {{.*}} "block"
+; Extracted from the clang output for:
+; void foo() {
+;  void (^block)() = ^{ block(); };
+; }
+
+; ModuleID = 'foo.m'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin"
 
 %struct.__block_descriptor = type { i64, i64 }
 %struct.__block_literal_generic = type { i8*, i32, i32, i8*, %struct.__block_descriptor* }
 
-declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-
-define hidden void @__foo_block_invoke_0(i8* %.block_descriptor) uwtable ssp {
-entry:
-  %exn.slot = alloca i8*
-  %ehselector.slot = alloca i32
-  call void @llvm.dbg.value(metadata !{i8* %.block_descriptor}, i64 0, metadata !39, metadata !{metadata !"0x102"}), !dbg !51
-  %block = bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void ()* }>*, !dbg !52
-  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void ()* }>* %block}, metadata !53, metadata !65), !dbg !54
-  %block.capture.addr = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void ()* }>* %block, i32 0, i32 5, !dbg !55
-  %0 = load void ()** %block.capture.addr, align 8, !dbg !55
-  %block.literal = bitcast void ()* %0 to %struct.__block_literal_generic*, !dbg !55
-  %1 = getelementptr inbounds %struct.__block_literal_generic* %block.literal, i32 0, i32 3, !dbg !55
-  %2 = bitcast %struct.__block_literal_generic* %block.literal to i8*, !dbg !55
-  %3 = load i8** %1, !dbg !55
-  %4 = bitcast i8* %3 to void (i8*)*, !dbg !55
-  invoke void %4(i8* %2)
-          to label %invoke.cont unwind label %lpad, !dbg !55
-
-invoke.cont:                                      ; preds = %entry
-  br label %eh.cont, !dbg !58
+@_NSConcreteStackBlock = external global i8*
+@.str = private unnamed_addr constant [6 x i8] c"v8@?0\00", align 1
 
-eh.cont:                                          ; preds = %catch, %invoke.cont
-  ret void, !dbg !61
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
-lpad:                                             ; preds = %entry
-  %5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*)
-          catch i8* null, !dbg !55
-  %6 = extractvalue { i8*, i32 } %5, 0, !dbg !55
-  store i8* %6, i8** %exn.slot, !dbg !55
-  %7 = extractvalue { i8*, i32 } %5, 1, !dbg !55
-  store i32 %7, i32* %ehselector.slot, !dbg !55
-  br label %catch, !dbg !55
-
-catch:                                            ; preds = %lpad
-  %exn = load i8** %exn.slot, !dbg !62
-  %exn.adjusted = call i8* @objc_begin_catch(i8* %exn) nounwind, !dbg !62
-  call void @objc_end_catch(), !dbg !58
-  br label %eh.cont, !dbg !58
+; Function Attrs: ssp uwtable
+define internal void @__foo_block_invoke(i8* %.block_descriptor) #2 {
+entry:
+  %.block_descriptor.addr = alloca i8*, align 8
+  %block.addr = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void (...)* }>*, align 8
+  store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8
+  %0 = load i8** %.block_descriptor.addr
+  call void @llvm.dbg.value(metadata i8* %0, i64 0, metadata !47, metadata !43), !dbg !66
+  call void @llvm.dbg.declare(metadata i8* %.block_descriptor, metadata !47, metadata !43), !dbg !66
+  %block = bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void (...)* }>*, !dbg !67
+  store <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void (...)* }>* %block, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void (...)* }>** %block.addr, align 8
+  call void @llvm.dbg.declare(metadata <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void (...)* }>** %block.addr, metadata !68, metadata !69), !dbg !70
+  %block.capture.addr = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void (...)* }>* %block, i32 0, i32 5, !dbg !71
+  %1 = load void (...)** %block.capture.addr, align 8, !dbg !71
+  %block.literal = bitcast void (...)* %1 to %struct.__block_literal_generic*, !dbg !71
+  %2 = getelementptr inbounds %struct.__block_literal_generic* %block.literal, i32 0, i32 3, !dbg !71
+  %3 = bitcast %struct.__block_literal_generic* %block.literal to i8*, !dbg !71
+  %4 = load i8** %2, !dbg !71
+  %5 = bitcast i8* %4 to void (i8*, ...)*, !dbg !71
+  call void (i8*, ...)* %5(i8* %3), !dbg !71
+  ret void, !dbg !73
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
-
-declare i8* @objc_begin_catch(i8*)
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
-declare void @objc_end_catch()
 
-declare i32 @__objc_personality_v0(...)
+attributes #0 = { nounwind ssp uwtable }
+attributes #1 = { nounwind readnone }
+attributes #2 = { ssp uwtable }
+attributes #3 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!35, !36, !37, !38, !64}
+!llvm.module.flags = !{!16, !17, !18, !19, !20, !21, !22}
+!llvm.ident = !{!23}
 
-!0 = metadata !{metadata !"0x11\0016\00clang version 3.1 (trunk 151227)\000\00\002\00\001", metadata !63, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5, metadata !28, metadata !31, metadata !34}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00\005\000\001\000\006\00256\000\005", metadata !6, metadata !6, metadata !7, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !63} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9}
-!9 = metadata !{metadata !"0x16\00dispatch_block_t\001\000\000\000\000", metadata !63, null, metadata !10} ; [ DW_TAG_typedef ]
-!10 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !"0x13\00__block_literal_generic\005\00256\000\000\008\000", metadata !63, metadata !6, null, metadata !12, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_generic] [line 5, size 256, align 0, offset 0] [def] [from ]
-!12 = metadata !{metadata !13, metadata !15, metadata !17, metadata !18, metadata !19}
-!13 = metadata !{metadata !"0xd\00__isa\000\0064\0064\000\000", metadata !63, metadata !6, metadata !14} ; [ DW_TAG_member ]
-!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ]
-!15 = metadata !{metadata !"0xd\00__flags\000\0032\0032\0064\000", metadata !63, metadata !6, metadata !16} ; [ DW_TAG_member ]
-!16 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!17 = metadata !{metadata !"0xd\00__reserved\000\0032\0032\0096\000", metadata !63, metadata !6, metadata !16} ; [ DW_TAG_member ]
-!18 = metadata !{metadata !"0xd\00__FuncPtr\000\0064\0064\00128\000", metadata !63, metadata !6, metadata !14} ; [ DW_TAG_member ]
-!19 = metadata !{metadata !"0xd\00__descriptor\005\0064\0064\00192\000", metadata !63, metadata !6, metadata !20} ; [ DW_TAG_member ]
-!20 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !21} ; [ DW_TAG_pointer_type ]
-!21 = metadata !{metadata !"0x13\00__block_descriptor\005\00128\000\000\008\000", metadata !63, metadata !6, null, metadata !22, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor] [line 5, size 128, align 0, offset 0] [def] [from ]
-!22 = metadata !{metadata !23, metadata !25}
-!23 = metadata !{metadata !"0xd\00reserved\000\0064\0064\000\000", metadata !63, metadata !6, metadata !24} ; [ DW_TAG_member ]
-!24 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ]
-!25 = metadata !{metadata !"0xd\00Size\000\0064\0064\0064\000", metadata !63, metadata !6, metadata !24} ; [ DW_TAG_member ]
-!28 = metadata !{metadata !"0x2e\00__foo_block_invoke_0\00__foo_block_invoke_0\00\007\001\001\000\006\00256\000\007", metadata !6, metadata !6, metadata !29, null, void (i8*)* @__foo_block_invoke_0, null, null, null} ; [ DW_TAG_subprogram ]
-!29 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !30, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!30 = metadata !{null, metadata !14}
-!31 = metadata !{metadata !"0x2e\00__copy_helper_block_\00__copy_helper_block_\00\0010\001\001\000\006\00256\000\0010", metadata !6, metadata !6, metadata !32, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!32 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!33 = metadata !{null, metadata !14, metadata !14}
-!34 = metadata !{metadata !"0x2e\00__destroy_helper_block_\00__destroy_helper_block_\00\0010\001\001\000\006\00256\000\0010", metadata !6, metadata !6, metadata !29, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!35 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!36 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!37 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!38 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
-!39 = metadata !{metadata !"0x101\00.block_descriptor\0016777223\0064", metadata !28, metadata !6, metadata !40} ; [ DW_TAG_arg_variable ]
-!40 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !41} ; [ DW_TAG_pointer_type ]
-!41 = metadata !{metadata !"0x13\00__block_literal_1\007\00320\0064\000\000\000", metadata !63, metadata !6, null, metadata !42, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_1] [line 7, size 320, align 64, offset 0] [def] [from ]
-!42 = metadata !{metadata !43, metadata !44, metadata !45, metadata !46, metadata !47, metadata !50}
-!43 = metadata !{metadata !"0xd\00__isa\007\0064\0064\000\000", metadata !63, metadata !6, metadata !14} ; [ DW_TAG_member ]
-!44 = metadata !{metadata !"0xd\00__flags\007\0032\0032\0064\000", metadata !63, metadata !6, metadata !16} ; [ DW_TAG_member ]
-!45 = metadata !{metadata !"0xd\00__reserved\007\0032\0032\0096\000", metadata !63, metadata !6, metadata !16} ; [ DW_TAG_member ]
-!46 = metadata !{metadata !"0xd\00__FuncPtr\007\0064\0064\00128\000", metadata !63, metadata !6, metadata !14} ; [ DW_TAG_member ]
-!47 = metadata !{metadata !"0xd\00__descriptor\007\0064\0064\00192\000", metadata !63, metadata !6, metadata !48} ; [ DW_TAG_member ]
-!48 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !49} ; [ DW_TAG_pointer_type ]
-!49 = metadata !{metadata !"0x13\00__block_descriptor_withcopydispose\007\000\000\000\004\000", metadata !63, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 7, size 0, align 0, offset 0] [decl] [from ]
-!50 = metadata !{metadata !"0xd\00block\007\0064\0064\00256\000", metadata !63, metadata !6, metadata !9} ; [ DW_TAG_member ]
-!51 = metadata !{i32 7, i32 18, metadata !28, null}
-!52 = metadata !{i32 7, i32 19, metadata !28, null}
-!53 = metadata !{metadata !"0x100\00block\005\000", metadata !28, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
-!54 = metadata !{i32 5, i32 27, metadata !28, null}
-!55 = metadata !{i32 8, i32 22, metadata !56, null}
-!56 = metadata !{metadata !"0xb\007\0026\002", metadata !6, metadata !57} ; [ DW_TAG_lexical_block ]
-!57 = metadata !{metadata !"0xb\007\0019\001", metadata !6, metadata !28} ; [ DW_TAG_lexical_block ]
-!58 = metadata !{i32 10, i32 20, metadata !59, null}
-!59 = metadata !{metadata !"0xb\009\0035\004", metadata !6, metadata !60} ; [ DW_TAG_lexical_block ]
-!60 = metadata !{metadata !"0xb\009\0035\003", metadata !6, metadata !57} ; [ DW_TAG_lexical_block ]
-!61 = metadata !{i32 10, i32 21, metadata !28, null}
-!62 = metadata !{i32 9, i32 20, metadata !56, null}
-!63 = metadata !{metadata !"foo.m", metadata !"/Users/echristo"}
-!64 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!65 = metadata !{metadata !"0x102\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_plus 32]
+!0 = !{!"0x11\0016\00clang version 3.6.0 (trunk 223471)\000\00\002\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/foo.m] [DW_LANG_ObjC]
+!1 = !{!"foo.m", !""}
+!2 = !{}
+!3 = !{!8}
+!5 = !{!"0x29", !1}    ; [ DW_TAG_file_type ] [/foo.m]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!"0x2e\00__foo_block_invoke\00__foo_block_invoke\00\002\001\001\000\000\00256\000\002", !1, !5, !9, null, void (i8*)* @__foo_block_invoke, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [__foo_block_invoke]
+!9 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{null, !11}
+!11 = !{!"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!13 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{null, !11, !11}
+!16 = !{i32 1, !"Objective-C Version", i32 2}
+!17 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!18 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!19 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
+!20 = !{i32 2, !"Dwarf Version", i32 2}
+!21 = !{i32 2, !"Debug Info Version", i32 2}
+!22 = !{i32 1, !"PIC Level", i32 2}
+!23 = !{!"clang version 3.6.0 (trunk 223471)"}
+!25 = !{!"0xf\00\000\0064\000\000\000", null, null, !26} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_generic]
+!26 = !{!"0x13\00__block_literal_generic\002\00256\000\000\008\000", !1, !5, null, !27, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_generic] [line 2, size 256, align 0, offset 0] [def] [from ]
+!27 = !{!28, !29, !31, !32, !36}
+!28 = !{!"0xd\00__isa\000\0064\0064\000\000", !1, !5, !11} ; [ DW_TAG_member ] [__isa] [line 0, size 64, align 64, offset 0] [from ]
+!29 = !{!"0xd\00__flags\000\0032\0032\0064\000", !1, !5, !30} ; [ DW_TAG_member ] [__flags] [line 0, size 32, align 32, offset 64] [from int]
+!30 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!31 = !{!"0xd\00__reserved\000\0032\0032\0096\000", !1, !5, !30} ; [ DW_TAG_member ] [__reserved] [line 0, size 32, align 32, offset 96] [from int]
+!32 = !{!"0xd\00__FuncPtr\000\0064\0064\00128\000", !1, !5, !33} ; [ DW_TAG_member ] [__FuncPtr] [line 0, size 64, align 64, offset 128] [from ]
+!33 = !{!"0xf\00\000\0064\0064\000\000", null, null, !34} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!34 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !35, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!35 = !{null, null}
+!36 = !{!"0xd\00__descriptor\002\0064\0064\00192\000", !1, !5, !37} ; [ DW_TAG_member ] [__descriptor] [line 2, size 64, align 64, offset 192] [from ]
+!37 = !{!"0xf\00\000\0064\000\000\000", null, null, !38} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_descriptor]
+!38 = !{!"0x13\00__block_descriptor\002\00128\000\000\008\000", !1, !5, null, !39, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor] [line 2, size 128, align 0, offset 0] [def] [from ]
+!39 = !{!40, !42}
+!40 = !{!"0xd\00reserved\000\0064\0064\000\000", !1, !5, !41} ; [ DW_TAG_member ] [reserved] [line 0, size 64, align 64, offset 0] [from long unsigned int]
+!41 = !{!"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!42 = !{!"0xd\00Size\000\0064\0064\0064\000", !1, !5, !41} ; [ DW_TAG_member ] [Size] [line 0, size 64, align 64, offset 64] [from long unsigned int]
+!43 = !{!"0x102"}               ; [ DW_TAG_expression ]
+!47 = !{!"0x101\00.block_descriptor\0016777218\0064", !8, !5, !48} ; [ DW_TAG_arg_variable ] [.block_descriptor] [line 2]
+!48 = !{!"0xf\00\000\0064\000\000\000", null, null, !49} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_1]
+!49 = !{!"0x13\00__block_literal_1\002\00320\0064\000\000\000", !1, !5, null, !50, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_1] [line 2, size 320, align 64, offset 0] [def] [from ]
+!50 = !{!51, !52, !53, !54, !56, !65}
+!51 = !{!"0xd\00__isa\002\0064\0064\000\003", !1, !5, !11} ; [ DW_TAG_member ] [__isa] [line 2, size 64, align 64, offset 0] [public] [from ]
+!52 = !{!"0xd\00__flags\002\0032\0032\0064\003", !1, !5, !30} ; [ DW_TAG_member ] [__flags] [line 2, size 32, align 32, offset 64] [public] [from int]
+!53 = !{!"0xd\00__reserved\002\0032\0032\0096\003", !1, !5, !30} ; [ DW_TAG_member ] [__reserved] [line 2, size 32, align 32, offset 96] [public] [from int]
+!54 = !{!"0xd\00__FuncPtr\002\0064\0064\00128\003", !1, !5, !55} ; [ DW_TAG_member ] [__FuncPtr] [line 2, size 64, align 64, offset 128] [public] [from ]
+!55 = !{!"0xf\00\000\0064\0064\000\000", null, null, !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!56 = !{!"0xd\00__descriptor\002\0064\0064\00192\003", !1, !5, !57} ; [ DW_TAG_member ] [__descriptor] [line 2, size 64, align 64, offset 192] [public] [from ]
+!57 = !{!"0xf\00\000\0064\0064\000\000", null, null, !58} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from __block_descriptor_withcopydispose]
+!58 = !{!"0x13\00__block_descriptor_withcopydispose\002\00256\0064\000\000\000", !1, null, null, !59, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 2, size 256, align 64, offset 0] [def] [from ]
+!59 = !{!60, !61, !62, !64}
+!60 = !{!"0xd\00reserved\002\0064\0064\000\000", !1, !58, !41} ; [ DW_TAG_member ] [reserved] [line 2, size 64, align 64, offset 0] [from long unsigned int]
+!61 = !{!"0xd\00Size\002\0064\0064\0064\000", !1, !58, !41} ; [ DW_TAG_member ] [Size] [line 2, size 64, align 64, offset 64] [from long unsigned int]
+!62 = !{!"0xd\00CopyFuncPtr\002\0064\0064\00128\000", !1, !58, !63} ; [ DW_TAG_member ] [CopyFuncPtr] [line 2, size 64, align 64, offset 128] [from ]
+!63 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!64 = !{!"0xd\00DestroyFuncPtr\002\0064\0064\00192\000", !1, !58, !63} ; [ DW_TAG_member ] [DestroyFuncPtr] [line 2, size 64, align 64, offset 192] [from ]
+!65 = !{!"0xd\00block\002\0064\0064\00256\003", !1, !5, !25} ; [ DW_TAG_member ] [block] [line 2, size 64, align 64, offset 256] [public] [from ]
+!66 = !MDLocation(line: 2, column: 20, scope: !8)
+!67 = !MDLocation(line: 2, column: 21, scope: !8)
+!68 = !{!"0x100\00block\002\000", !8, !5, !25} ; [ DW_TAG_auto_variable ] [block] [line 2]
+!69 = !{!"0x102\006\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_deref]
+!70 = !MDLocation(line: 2, column: 9, scope: !8)
+!71 = !MDLocation(line: 2, column: 23, scope: !72)
+!72 = !{!"0xb\002\0021\000", !1, !8} ; [ DW_TAG_lexical_block ] [/foo.m]
+!73 = !MDLocation(line: 2, column: 32, scope: !8)
diff --git a/test/DebugInfo/X86/byvalstruct.ll b/test/DebugInfo/X86/byvalstruct.ll
index 0570950..d89ba35 100644
--- a/test/DebugInfo/X86/byvalstruct.ll
+++ b/test/DebugInfo/X86/byvalstruct.ll
@@ -66,14 +66,14 @@ entry:
   %otherBitmap.addr = alloca %0*, align 8
   %length.addr = alloca i64, align 8
   store %0* %self, %0** %self.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%0** %self.addr}, metadata !28, metadata !{metadata !"0x102"}), !dbg !29
+  call void @llvm.dbg.declare(metadata %0** %self.addr, metadata !28, metadata !{!"0x102"}), !dbg !29
   store i8* %_cmd, i8** %_cmd.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %_cmd.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !29
+  call void @llvm.dbg.declare(metadata i8** %_cmd.addr, metadata !30, metadata !{!"0x102"}), !dbg !29
   store %0* %otherBitmap, %0** %otherBitmap.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%0** %otherBitmap.addr}, metadata !32, metadata !{metadata !"0x102"}), !dbg !29
-  call void @llvm.dbg.declare(metadata !{%struct.ImageInfo* %info}, metadata !33, metadata !{metadata !"0x102"}), !dbg !34
+  call void @llvm.dbg.declare(metadata %0** %otherBitmap.addr, metadata !32, metadata !{!"0x102"}), !dbg !29
+  call void @llvm.dbg.declare(metadata %struct.ImageInfo* %info, metadata !33, metadata !{!"0x102"}), !dbg !34
   store i64 %length, i64* %length.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64* %length.addr}, metadata !35, metadata !{metadata !"0x102"}), !dbg !36
+  call void @llvm.dbg.declare(metadata i64* %length.addr, metadata !35, metadata !{!"0x102"}), !dbg !36
   %0 = load i8** %retval, !dbg !37
   ret i8* %0, !dbg !37
 }
@@ -87,42 +87,42 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!24, !25, !26, !27, !38}
 
-!0 = metadata !{metadata !"0x11\0017\00clang version 3.4 \000\00\002\00\000", metadata !1, metadata !2, metadata !3, metadata !6, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/t.mm] [DW_LANG_ObjC_plus_plus]
-!1 = metadata !{metadata !"t.mm", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00Bitmap\008\008\008\000\00512\0017", metadata !1, metadata !5, null, metadata !2, null, null, null} ; [ DW_TAG_structure_type ] [Bitmap] [line 8, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/t.mm]
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0x2e\00-[Bitmap initWithCopy:andInfo:andLength:]\00-[Bitmap initWithCopy:andInfo:andLength:]\00\009\001\001\000\006\00256\000\009", metadata !1, metadata !5, metadata !8, null, i8* (%0*, i8*, %0*, %struct.ImageInfo*, i64)* @"\01-[Bitmap initWithCopy:andInfo:andLength:]", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 9] [local] [def] [-[Bitmap initWithCopy:andInfo:andLength:]]
-!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!9 = metadata !{metadata !4, metadata !10, metadata !11, metadata !14, metadata !15, metadata !19}
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Bitmap]
-!11 = metadata !{metadata !"0x16\00SEL\009\000\000\000\0064", metadata !1, null, metadata !12} ; [ DW_TAG_typedef ] [SEL] [line 9, size 0, align 0, offset 0] [artificial] [from ]
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
-!13 = metadata !{metadata !"0x13\00objc_selector\000\000\000\000\004\000", metadata !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Bitmap]
-!15 = metadata !{metadata !"0x16\00ImageInfo\007\000\000\000\000", metadata !1, null, metadata !16} ; [ DW_TAG_typedef ] [ImageInfo] [line 7, size 0, align 0, offset 0] [from ]
-!16 = metadata !{metadata !"0x13\00\002\00192\0064\000\000\000", metadata !1, null, null, metadata !17, null, null, null} ; [ DW_TAG_structure_type ] [line 2, size 192, align 64, offset 0] [def] [from ]
-!17 = metadata !{metadata !18, metadata !21, metadata !22}
-!18 = metadata !{metadata !"0xd\00width\004\0064\0064\000\000", metadata !1, metadata !16, metadata !19} ; [ DW_TAG_member ] [width] [line 4, size 64, align 64, offset 0] [from NSUInteger]
-!19 = metadata !{metadata !"0x16\00NSUInteger\001\000\000\000\000", metadata !1, null, metadata !20} ; [ DW_TAG_typedef ] [NSUInteger] [line 1, size 0, align 0, offset 0] [from long unsigned int]
-!20 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!21 = metadata !{metadata !"0xd\00height\005\0064\0064\0064\000", metadata !1, metadata !16, metadata !19} ; [ DW_TAG_member ] [height] [line 5, size 64, align 64, offset 64] [from NSUInteger]
-!22 = metadata !{metadata !"0xd\00pixelAspect\006\0064\0064\00128\000", metadata !1, metadata !16, metadata !23} ; [ DW_TAG_member ] [pixelAspect] [line 6, size 64, align 64, offset 128] [from double]
-!23 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!24 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!25 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!26 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!27 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
-!28 = metadata !{metadata !"0x101\00self\0016777225\001088", metadata !7, metadata !5, metadata !14} ; [ DW_TAG_arg_variable ] [self] [line 9]
-!29 = metadata !{i32 9, i32 0, metadata !7, null}
-!30 = metadata !{metadata !"0x101\00_cmd\0033554441\0064", metadata !7, metadata !5, metadata !31} ; [ DW_TAG_arg_variable ] [_cmd] [line 9]
-!31 = metadata !{metadata !"0x16\00SEL\009\000\000\000\000", metadata !1, null, metadata !12} ; [ DW_TAG_typedef ] [SEL] [line 9, size 0, align 0, offset 0] [from ]
-!32 = metadata !{metadata !"0x101\00otherBitmap\0050331657\000", metadata !7, metadata !5, metadata !14} ; [ DW_TAG_arg_variable ] [otherBitmap] [line 9]
-!33 = metadata !{metadata !"0x101\00info\0067108874\000", metadata !7, metadata !5, metadata !15} ; [ DW_TAG_arg_variable ] [info] [line 10]
-!34 = metadata !{i32 10, i32 0, metadata !7, null}
-!35 = metadata !{metadata !"0x101\00length\0083886091\000", metadata !7, metadata !5, metadata !19} ; [ DW_TAG_arg_variable ] [length] [line 11]
-!36 = metadata !{i32 11, i32 0, metadata !7, null}
-!37 = metadata !{i32 13, i32 0, metadata !7, null}
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0017\00clang version 3.4 \000\00\002\00\000", !1, !2, !3, !6, !2, !2} ; [ DW_TAG_compile_unit ] [/t.mm] [DW_LANG_ObjC_plus_plus]
+!1 = !{!"t.mm", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00Bitmap\008\008\008\000\00512\0017", !1, !5, null, !2, null, null, null} ; [ DW_TAG_structure_type ] [Bitmap] [line 8, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/t.mm]
+!6 = !{!7}
+!7 = !{!"0x2e\00-[Bitmap initWithCopy:andInfo:andLength:]\00-[Bitmap initWithCopy:andInfo:andLength:]\00\009\001\001\000\006\00256\000\009", !1, !5, !8, null, i8* (%0*, i8*, %0*, %struct.ImageInfo*, i64)* @"\01-[Bitmap initWithCopy:andInfo:andLength:]", null, null, !2} ; [ DW_TAG_subprogram ] [line 9] [local] [def] [-[Bitmap initWithCopy:andInfo:andLength:]]
+!8 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = !{!4, !10, !11, !14, !15, !19}
+!10 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Bitmap]
+!11 = !{!"0x16\00SEL\009\000\000\000\0064", !1, null, !12} ; [ DW_TAG_typedef ] [SEL] [line 9, size 0, align 0, offset 0] [artificial] [from ]
+!12 = !{!"0xf\00\000\0064\0064\000\000", null, null, !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
+!13 = !{!"0x13\00objc_selector\000\000\000\000\004\000", !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!14 = !{!"0xf\00\000\0064\0064\000\000", null, null, !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Bitmap]
+!15 = !{!"0x16\00ImageInfo\007\000\000\000\000", !1, null, !16} ; [ DW_TAG_typedef ] [ImageInfo] [line 7, size 0, align 0, offset 0] [from ]
+!16 = !{!"0x13\00\002\00192\0064\000\000\000", !1, null, null, !17, null, null, null} ; [ DW_TAG_structure_type ] [line 2, size 192, align 64, offset 0] [def] [from ]
+!17 = !{!18, !21, !22}
+!18 = !{!"0xd\00width\004\0064\0064\000\000", !1, !16, !19} ; [ DW_TAG_member ] [width] [line 4, size 64, align 64, offset 0] [from NSUInteger]
+!19 = !{!"0x16\00NSUInteger\001\000\000\000\000", !1, null, !20} ; [ DW_TAG_typedef ] [NSUInteger] [line 1, size 0, align 0, offset 0] [from long unsigned int]
+!20 = !{!"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!21 = !{!"0xd\00height\005\0064\0064\0064\000", !1, !16, !19} ; [ DW_TAG_member ] [height] [line 5, size 64, align 64, offset 64] [from NSUInteger]
+!22 = !{!"0xd\00pixelAspect\006\0064\0064\00128\000", !1, !16, !23} ; [ DW_TAG_member ] [pixelAspect] [line 6, size 64, align 64, offset 128] [from double]
+!23 = !{!"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!24 = !{i32 1, !"Objective-C Version", i32 2}
+!25 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!26 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!27 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
+!28 = !{!"0x101\00self\0016777225\001088", !7, !5, !14} ; [ DW_TAG_arg_variable ] [self] [line 9]
+!29 = !MDLocation(line: 9, scope: !7)
+!30 = !{!"0x101\00_cmd\0033554441\0064", !7, !5, !31} ; [ DW_TAG_arg_variable ] [_cmd] [line 9]
+!31 = !{!"0x16\00SEL\009\000\000\000\000", !1, null, !12} ; [ DW_TAG_typedef ] [SEL] [line 9, size 0, align 0, offset 0] [from ]
+!32 = !{!"0x101\00otherBitmap\0050331657\000", !7, !5, !14} ; [ DW_TAG_arg_variable ] [otherBitmap] [line 9]
+!33 = !{!"0x101\00info\0067108874\000", !7, !5, !15} ; [ DW_TAG_arg_variable ] [info] [line 10]
+!34 = !MDLocation(line: 10, scope: !7)
+!35 = !{!"0x101\00length\0083886091\000", !7, !5, !19} ; [ DW_TAG_arg_variable ] [length] [line 11]
+!36 = !MDLocation(line: 11, scope: !7)
+!37 = !MDLocation(line: 13, scope: !7)
+!38 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/c-type-units.ll b/test/DebugInfo/X86/c-type-units.ll
index 9326e31..b9bc36e 100644
--- a/test/DebugInfo/X86/c-type-units.ll
+++ b/test/DebugInfo/X86/c-type-units.ll
@@ -17,13 +17,13 @@
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/simple.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"simple.c", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x34\00f\00f\00\002\000\001", null, metadata !5, metadata !6, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 2] [def]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/simple.c]
-!6 = metadata !{metadata !"0x13\00foo\001\000\008\000\000\000", metadata !1, null, null, metadata !2, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 0, align 8, offset 0] [def] [from ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5 "}
+!0 = !{!"0x11\0012\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !2, !3, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/simple.c] [DW_LANG_C99]
+!1 = !{!"simple.c", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x34\00f\00f\00\002\000\001", null, !5, !6, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 2] [def]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/simple.c]
+!6 = !{!"0x13\00foo\001\000\008\000\000\000", !1, null, null, !2, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 0, align 8, offset 0] [def] [from ]
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5 "}
diff --git a/test/DebugInfo/X86/coff_debug_info_type.ll b/test/DebugInfo/X86/coff_debug_info_type.ll
index ec85944..89859d2 100644
--- a/test/DebugInfo/X86/coff_debug_info_type.ll
+++ b/test/DebugInfo/X86/coff_debug_info_type.ll
@@ -6,7 +6,7 @@
 ; CHECK:    .section  .apple_types
 
 ; RUN: llc -mtriple=i686-pc-win32 -filetype=asm -O0 < %s | FileCheck -check-prefix=WIN32 %s
-; WIN32:    .section .debug$S,"rd"
+; WIN32:    .section .debug$S,"dr"
 
 ; RUN: llc -mtriple=i686-pc-win32 -filetype=null -O0 < %s
 
@@ -29,15 +29,15 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test.c", metadata !"C:\5CProjects"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\000\000\002", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!10 = metadata !{i32 3, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
+!1 = !{!"test.c", !"C:\5CProjects"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\001\000\001\000\006\000\000\002", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 3}
+!10 = !MDLocation(line: 3, scope: !4)
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/coff_relative_names.ll b/test/DebugInfo/X86/coff_relative_names.ll
index 067992d..96e70b1 100644
--- a/test/DebugInfo/X86/coff_relative_names.ll
+++ b/test/DebugInfo/X86/coff_relative_names.ll
@@ -23,15 +23,15 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test.c", metadata !"C:\5CProjects"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\000\000\002", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!10 = metadata !{i32 3, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
+!1 = !{!"test.c", !"C:\5CProjects"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\001\000\001\000\006\000\000\002", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 3}
+!10 = !MDLocation(line: 3, scope: !4)
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/concrete_out_of_line.ll b/test/DebugInfo/X86/concrete_out_of_line.ll
index 43f881e..b5da28a 100644
--- a/test/DebugInfo/X86/concrete_out_of_line.ll
+++ b/test/DebugInfo/X86/concrete_out_of_line.ll
@@ -76,56 +76,56 @@ declare void @_Z8moz_freePv(i8*)
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!60}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.1 ()\001\00\000\00\000", metadata !59, metadata !1, metadata !1, metadata !3, metadata !47,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5, metadata !23, metadata !27, metadata !31}
-!5 = metadata !{metadata !"0x2e\00Release\00Release\00_ZN17nsAutoRefCnt7ReleaseEv\0014\000\001\000\006\00256\001\0014", metadata !6, null, metadata !7, null, i32 ()* @_ZN17nsAutoRefCnt7ReleaseEv , null, metadata !12, metadata !20} ; [ DW_TAG_subprogram ] [line 14] [def] [Release]
-!6 = metadata !{metadata !"0x29", metadata !59} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !10}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !"0x13\00nsAutoRefCnt\0010\000\000\000\004\000", metadata !59, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [nsAutoRefCnt] [line 10, size 0, align 0, offset 0] [decl] [from ]
-!12 = metadata !{metadata !"0x2e\00Release\00Release\00_ZN17nsAutoRefCnt7ReleaseEv\0011\000\000\000\006\00256\001\0011", metadata !6, metadata !13, metadata !7, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ]
-!13 = metadata !{metadata !"0x2\00nsAutoRefCnt\0010\008\008\000\000\000", metadata !59, null, null, metadata !14, null, null} ; [ DW_TAG_class_type ]
-!14 = metadata !{metadata !12, metadata !15}
-!15 = metadata !{metadata !"0x2e\00~nsAutoRefCnt\00~nsAutoRefCnt\00\0012\000\000\000\006\00256\001\0012", metadata !6, metadata !13, metadata !16, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ]
-!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{null, metadata !10}
-!18 = metadata !{}
-!20 = metadata !{metadata !22}
-!22 = metadata !{metadata !"0x101\00this\0016777230\0064", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
-!23 = metadata !{metadata !"0x2e\00~nsAutoRefCnt\00~nsAutoRefCnt\00_ZN17nsAutoRefCntD1Ev\0018\000\001\000\006\00256\001\0018", metadata !6, null, metadata !16, null, void ()* @_ZN17nsAutoRefCntD1Ev, null, metadata !15, metadata !24} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
-!24 = metadata !{metadata !26}
-!26 = metadata !{metadata !"0x101\00this\0016777234\0064", metadata !23, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
-!27 = metadata !{metadata !"0x2e\00~nsAutoRefCnt\00~nsAutoRefCnt\00_ZN17nsAutoRefCntD2Ev\0018\000\001\000\006\00256\001\0018", metadata !6, null, metadata !16, null, i32* null, null, metadata !15, metadata !28} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
-!28 = metadata !{metadata !30}
-!30 = metadata !{metadata !"0x101\00this\0016777234\0064", metadata !27, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
-!31 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN12nsAutoRefCntaSEi\004\000\001\000\006\00256\001\004", metadata !6, null, metadata !32, null, null, null, metadata !36, metadata !43} ; [ DW_TAG_subprogram ] [line 4] [def] [operator=]
-!32 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!33 = metadata !{metadata !9, metadata !34, metadata !9}
-!34 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !35} ; [ DW_TAG_pointer_type ]
-!35 = metadata !{metadata !"0x13\00nsAutoRefCnt\002\000\000\000\004\000", metadata !59, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [nsAutoRefCnt] [line 2, size 0, align 0, offset 0] [decl] [from ]
-!36 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN12nsAutoRefCntaSEi\004\000\000\000\006\00256\001\004", metadata !6, metadata !37, metadata !32, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ]
-!37 = metadata !{metadata !"0x2\00nsAutoRefCnt\002\0032\0032\000\000\000", metadata !59, null, null, metadata !38, null, null, null} ; [ DW_TAG_class_type ] [nsAutoRefCnt] [line 2, size 32, align 32, offset 0] [def] [from ]
-!38 = metadata !{metadata !39, metadata !40, metadata !36}
-!39 = metadata !{metadata !"0xd\00mValue\007\0032\0032\000\000", metadata !59, metadata !37, metadata !9} ; [ DW_TAG_member ]
-!40 = metadata !{metadata !"0x2e\00nsAutoRefCnt\00nsAutoRefCnt\00\003\000\000\000\006\00256\001\003", metadata !6, metadata !37, metadata !41, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ]
-!41 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !42, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!42 = metadata !{null, metadata !34}
-!43 = metadata !{metadata !45, metadata !46}
-!45 = metadata !{metadata !"0x101\00this\0016777220\0064", metadata !31, metadata !6, metadata !34} ; [ DW_TAG_arg_variable ]
-!46 = metadata !{metadata !"0x101\00aValue\0033554436\000", metadata !31, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
-!47 = metadata !{metadata !49}
-!49 = metadata !{metadata !"0x34\00mRefCnt\00mRefCnt\00\009\000\001", null, metadata !6, metadata !37, i32* null, null} ; [ DW_TAG_variable ]
-!50 = metadata !{i32 5, i32 5, metadata !51, metadata !52}
-!51 = metadata !{metadata !"0xb\004\0029\002", metadata !6, metadata !31} ; [ DW_TAG_lexical_block ]
-!52 = metadata !{i32 15, i32 0, metadata !53, null}
-!53 = metadata !{metadata !"0xb\0014\0034\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ]
-!54 = metadata !{i32 19, i32 3, metadata !55, metadata !56}
-!55 = metadata !{metadata !"0xb\0018\0041\001", metadata !6, metadata !27} ; [ DW_TAG_lexical_block ]
-!56 = metadata !{i32 18, i32 41, metadata !23, metadata !52}
-!57 = metadata !{i32 19, i32 3, metadata !55, metadata !58}
-!58 = metadata !{i32 18, i32 41, metadata !23, null}
-!59 = metadata !{metadata !"nsAutoRefCnt.ii", metadata !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/netwerk/base/src"}
-!60 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.1 ()\001\00\000\00\000", !59, !1, !1, !3, !47,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5, !23, !27, !31}
+!5 = !{!"0x2e\00Release\00Release\00_ZN17nsAutoRefCnt7ReleaseEv\0014\000\001\000\006\00256\001\0014", !6, null, !7, null, i32 ()* @_ZN17nsAutoRefCnt7ReleaseEv , null, !12, !20} ; [ DW_TAG_subprogram ] [line 14] [def] [Release]
+!6 = !{!"0x29", !59} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !10}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !11} ; [ DW_TAG_pointer_type ]
+!11 = !{!"0x13\00nsAutoRefCnt\0010\000\000\000\004\000", !59, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [nsAutoRefCnt] [line 10, size 0, align 0, offset 0] [decl] [from ]
+!12 = !{!"0x2e\00Release\00Release\00_ZN17nsAutoRefCnt7ReleaseEv\0011\000\000\000\006\00256\001\0011", !6, !13, !7, null, null, null, i32 0, !18} ; [ DW_TAG_subprogram ]
+!13 = !{!"0x2\00nsAutoRefCnt\0010\008\008\000\000\000", !59, null, null, !14, null, null} ; [ DW_TAG_class_type ]
+!14 = !{!12, !15}
+!15 = !{!"0x2e\00~nsAutoRefCnt\00~nsAutoRefCnt\00\0012\000\000\000\006\00256\001\0012", !6, !13, !16, null, null, null, i32 0, !18} ; [ DW_TAG_subprogram ]
+!16 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = !{null, !10}
+!18 = !{}
+!20 = !{!22}
+!22 = !{!"0x101\00this\0016777230\0064", !5, !6, !10} ; [ DW_TAG_arg_variable ]
+!23 = !{!"0x2e\00~nsAutoRefCnt\00~nsAutoRefCnt\00_ZN17nsAutoRefCntD1Ev\0018\000\001\000\006\00256\001\0018", !6, null, !16, null, void ()* @_ZN17nsAutoRefCntD1Ev, null, !15, !24} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
+!24 = !{!26}
+!26 = !{!"0x101\00this\0016777234\0064", !23, !6, !10} ; [ DW_TAG_arg_variable ]
+!27 = !{!"0x2e\00~nsAutoRefCnt\00~nsAutoRefCnt\00_ZN17nsAutoRefCntD2Ev\0018\000\001\000\006\00256\001\0018", !6, null, !16, null, i32* null, null, !15, !28} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
+!28 = !{!30}
+!30 = !{!"0x101\00this\0016777234\0064", !27, !6, !10} ; [ DW_TAG_arg_variable ]
+!31 = !{!"0x2e\00operator=\00operator=\00_ZN12nsAutoRefCntaSEi\004\000\001\000\006\00256\001\004", !6, null, !32, null, null, null, !36, !43} ; [ DW_TAG_subprogram ] [line 4] [def] [operator=]
+!32 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!33 = !{!9, !34, !9}
+!34 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !35} ; [ DW_TAG_pointer_type ]
+!35 = !{!"0x13\00nsAutoRefCnt\002\000\000\000\004\000", !59, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [nsAutoRefCnt] [line 2, size 0, align 0, offset 0] [decl] [from ]
+!36 = !{!"0x2e\00operator=\00operator=\00_ZN12nsAutoRefCntaSEi\004\000\000\000\006\00256\001\004", !6, !37, !32, null, null, null, i32 0, !18} ; [ DW_TAG_subprogram ]
+!37 = !{!"0x2\00nsAutoRefCnt\002\0032\0032\000\000\000", !59, null, null, !38, null, null, null} ; [ DW_TAG_class_type ] [nsAutoRefCnt] [line 2, size 32, align 32, offset 0] [def] [from ]
+!38 = !{!39, !40, !36}
+!39 = !{!"0xd\00mValue\007\0032\0032\000\000", !59, !37, !9} ; [ DW_TAG_member ]
+!40 = !{!"0x2e\00nsAutoRefCnt\00nsAutoRefCnt\00\003\000\000\000\006\00256\001\003", !6, !37, !41, null, null, null, i32 0, !18} ; [ DW_TAG_subprogram ]
+!41 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !42, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!42 = !{null, !34}
+!43 = !{!45, !46}
+!45 = !{!"0x101\00this\0016777220\0064", !31, !6, !34} ; [ DW_TAG_arg_variable ]
+!46 = !{!"0x101\00aValue\0033554436\000", !31, !6, !9} ; [ DW_TAG_arg_variable ]
+!47 = !{!49}
+!49 = !{!"0x34\00mRefCnt\00mRefCnt\00\009\000\001", null, !6, !37, i32* null, null} ; [ DW_TAG_variable ]
+!50 = !MDLocation(line: 5, column: 5, scope: !51, inlinedAt: !52)
+!51 = !{!"0xb\004\0029\002", !6, !31} ; [ DW_TAG_lexical_block ]
+!52 = !MDLocation(line: 15, scope: !53)
+!53 = !{!"0xb\0014\0034\000", !6, !5} ; [ DW_TAG_lexical_block ]
+!54 = !MDLocation(line: 19, column: 3, scope: !55, inlinedAt: !56)
+!55 = !{!"0xb\0018\0041\001", !6, !27} ; [ DW_TAG_lexical_block ]
+!56 = !MDLocation(line: 18, column: 41, scope: !23, inlinedAt: !52)
+!57 = !MDLocation(line: 19, column: 3, scope: !55, inlinedAt: !58)
+!58 = !MDLocation(line: 18, column: 41, scope: !23)
+!59 = !{!"nsAutoRefCnt.ii", !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/netwerk/base/src"}
+!60 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/constant-aggregate.ll b/test/DebugInfo/X86/constant-aggregate.ll
new file mode 100644
index 0000000..324c831
--- /dev/null
+++ b/test/DebugInfo/X86/constant-aggregate.ll
@@ -0,0 +1,118 @@
+; RUN: llc %s -filetype=obj -o %t.o
+; RUN: llvm-dwarfdump -debug-dump=info %t.o | FileCheck %s
+; Test emitting a constant for an aggregate type.
+;
+; clang -S -O1 -emit-llvm
+;
+; typedef struct { unsigned i; } S;
+;
+; unsigned foo(S s) {
+;   s.i = 1;
+;   return s.i;
+; }
+;
+; class C { public: unsigned i; };
+;
+; unsigned foo(C c) {
+;   c.i = 2;
+;   return c.i;
+; }
+;
+; unsigned bar() {
+;  int a[1] = { 3 };
+;   return a[0];
+; }
+;
+; CHECK:  DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_const_value [DW_FORM_udata]	(1)
+; CHECK-NEXT: DW_AT_name {{.*}} "s"
+;
+; CHECK:  DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_const_value [DW_FORM_udata]	(2)
+; CHECK-NEXT: DW_AT_name {{.*}} "c"
+;
+; CHECK:  DW_TAG_variable
+; CHECK-NEXT: DW_AT_const_value [DW_FORM_udata]	(3)
+; CHECK-NEXT: DW_AT_name {{.*}} "a"
+
+; ModuleID = 'sroasplit-4.cpp'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: nounwind readnone ssp uwtable
+define i32 @_Z3foo1S(i32 %s.coerce) #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32 %s.coerce, i64 0, metadata !18, metadata !37), !dbg !38
+  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !18, metadata !37), !dbg !38
+  ret i32 1, !dbg !39
+}
+
+; Function Attrs: nounwind readnone ssp uwtable
+define i32 @_Z3foo1C(i32 %c.coerce) #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32 %c.coerce, i64 0, metadata !23, metadata !37), !dbg !40
+  tail call void @llvm.dbg.value(metadata i32 2, i64 0, metadata !23, metadata !37), !dbg !40
+  ret i32 2, !dbg !41
+}
+
+; Function Attrs: nounwind readnone ssp uwtable
+define i32 @_Z3barv() #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !28, metadata !37), !dbg !42
+  ret i32 3, !dbg !43
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind readnone ssp uwtable }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!33, !34, !35}
+!llvm.ident = !{!36}
+
+!0 = !{!"0x11\004\00clang version 3.6.0 (trunk 225364) (llvm/trunk 225366)\001\00\000\00\001", !1, !2, !3, !11, !2, !2} ; [ DW_TAG_compile_unit ] [/sroasplit-4.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"sroasplit-4.cpp", !""}
+!2 = !{}
+!3 = !{!4, !8}
+!4 = !{!"0x13\00\001\0032\0032\000\000\000", !1, null, null, !5, null, null, !"_ZTS1S"} ; [ DW_TAG_structure_type ] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0xd\00i\001\0032\0032\000\000", !1, !"_ZTS1S", !7} ; [ DW_TAG_member ] [i] [line 1, size 32, align 32, offset 0] [from unsigned int]
+!7 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!8 = !{!"0x2\00C\008\0032\0032\000\000\000", !1, null, null, !9, null, null, !"_ZTS1C"} ; [ DW_TAG_class_type ] [C] [line 8, size 32, align 32, offset 0] [def] [from ]
+!9 = !{!10}
+!10 = !{!"0xd\00i\008\0032\0032\000\003", !1, !"_ZTS1C", !7} ; [ DW_TAG_member ] [i] [line 8, size 32, align 32, offset 0] [public] [from unsigned int]
+!11 = !{!12, !19, !24}
+!12 = !{!"0x2e\00foo\00foo\00_Z3foo1S\003\000\001\000\000\00256\001\003", !1, !13, !14, null, i32 (i32)* @_Z3foo1S, null, null, !17} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
+!13 = !{!"0x29", !1}                              ; [ DW_TAG_file_type ] [/sroasplit-4.cpp]
+!14 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{!7, !16}
+!16 = !{!"0x16\00S\001\000\000\000\000", !1, null, !"_ZTS1S"} ; [ DW_TAG_typedef ] [S] [line 1, size 0, align 0, offset 0] [from _ZTS1S]
+!17 = !{!18}
+!18 = !{!"0x101\00s\0016777219\000", !12, !13, !16} ; [ DW_TAG_arg_variable ] [s] [line 3]
+!19 = !{!"0x2e\00foo\00foo\00_Z3foo1C\0010\000\001\000\000\00256\001\0010", !1, !13, !20, null, i32 (i32)* @_Z3foo1C, null, null, !22} ; [ DW_TAG_subprogram ] [line 10] [def] [foo]
+!20 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = !{!7, !"_ZTS1C"}
+!22 = !{!23}
+!23 = !{!"0x101\00c\0016777226\000", !19, !13, !"_ZTS1C"} ; [ DW_TAG_arg_variable ] [c] [line 10]
+!24 = !{!"0x2e\00bar\00bar\00_Z3barv\0015\000\001\000\000\00256\001\0015", !1, !13, !25, null, i32 ()* @_Z3barv, null, null, !27} ; [ DW_TAG_subprogram ] [line 15] [def] [bar]
+!25 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !26, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!26 = !{!7}
+!27 = !{!28}
+!28 = !{!"0x100\00a\0016\000", !24, !13, !29}     ; [ DW_TAG_auto_variable ] [a] [line 16]
+!29 = !{!"0x1\00\000\0032\0032\000\000\000", null, null, !30, !31, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 32, offset 0] [from int]
+!30 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!31 = !{!32}
+!32 = !{!"0x21\000\001"}                          ; [ DW_TAG_subrange_type ] [0, 0]
+!33 = !{i32 2, !"Dwarf Version", i32 2}
+!34 = !{i32 2, !"Debug Info Version", i32 2}
+!35 = !{i32 1, !"PIC Level", i32 2}
+!36 = !{!"clang version 3.6.0 (trunk 225364) (llvm/trunk 225366)"}
+!37 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!38 = !MDLocation(line: 3, column: 16, scope: !12)
+!39 = !MDLocation(line: 5, column: 3, scope: !12)
+!40 = !MDLocation(line: 10, column: 16, scope: !19)
+!41 = !MDLocation(line: 12, column: 3, scope: !19)
+!42 = !MDLocation(line: 16, column: 6, scope: !24)
+!43 = !MDLocation(line: 17, column: 3, scope: !24)
diff --git a/test/DebugInfo/X86/cu-ranges-odr.ll b/test/DebugInfo/X86/cu-ranges-odr.ll
index b73d33d..c1f58d7 100644
--- a/test/DebugInfo/X86/cu-ranges-odr.ll
+++ b/test/DebugInfo/X86/cu-ranges-odr.ll
@@ -35,9 +35,9 @@ entry:
   %this.addr = alloca %class.A*, align 8
   %i.addr = alloca i32, align 4
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !27, metadata !{metadata !"0x102"}), !dbg !29
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !27, metadata !{!"0x102"}), !dbg !29
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
+  call void @llvm.dbg.declare(metadata i32* %i.addr, metadata !30, metadata !{!"0x102"}), !dbg !31
   %this1 = load %class.A** %this.addr
   %a = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !31
   %0 = load i32* %i.addr, align 4, !dbg !31
@@ -61,36 +61,36 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!23, !24}
 !llvm.ident = !{!25}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5 (trunk 199923) (llvm/trunk 199940)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !13, metadata !21, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/baz.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"baz.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2\00A\001\0032\0032\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !8}
-!6 = metadata !{metadata !"0xd\00a\005\0032\0032\000\001", metadata !1, metadata !"_ZTS1A", metadata !7} ; [ DW_TAG_member ] [a] [line 5, size 32, align 32, offset 0] [private] [from int]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{metadata !"0x2e\00A\00A\00\003\000\000\000\006\00256\000\003", metadata !1, metadata !"_ZTS1A", metadata !9, null, null, null, i32 0, metadata !12} ; [ DW_TAG_subprogram ] [line 3] [A]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{null, metadata !11, metadata !7}
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!12 = metadata !{i32 786468}
-!13 = metadata !{metadata !14, metadata !18, metadata !19}
-!14 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\008\001\001\000\006\00256\000\008", metadata !1, metadata !15, metadata !16, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [local] [def] [__cxx_global_var_init]
-!15 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/baz.cpp]
-!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{null}
-!18 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC2Ei\003\000\001\000\006\00256\000\003", metadata !1, metadata !"_ZTS1A", metadata !9, null, void (%class.A*, i32)* @_ZN1AC2Ei, null, metadata !8, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
-!19 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__I_a\003\001\001\000\006\0064\000\003", metadata !1, metadata !15, metadata !20, null, void ()* @_GLOBAL__I_a, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [local] [def]
-!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!21 = metadata !{metadata !22}
-!22 = metadata !{metadata !"0x34\00a\00a\00\008\000\001", null, metadata !15, metadata !4, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 8] [def]
-!23 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!25 = metadata !{metadata !"clang version 3.5 (trunk 199923) (llvm/trunk 199940)"}
-!26 = metadata !{i32 8, i32 0, metadata !14, null}
-!27 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !18, null, metadata !28} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!28 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
-!29 = metadata !{i32 0, i32 0, metadata !18, null}
-!30 = metadata !{metadata !"0x101\00i\0033554435\000", metadata !18, metadata !15, metadata !7} ; [ DW_TAG_arg_variable ] [i] [line 3]
-!31 = metadata !{i32 3, i32 0, metadata !18, null}
-!32 = metadata !{i32 3, i32 0, metadata !19, null}
+!0 = !{!"0x11\004\00clang version 3.5 (trunk 199923) (llvm/trunk 199940)\000\00\000\00\001", !1, !2, !3, !13, !21, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/baz.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"baz.cpp", !"/usr/local/google/home/echristo/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2\00A\001\0032\0032\000\000\000", !1, null, null, !5, null, null, !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!6, !8}
+!6 = !{!"0xd\00a\005\0032\0032\000\001", !1, !"_ZTS1A", !7} ; [ DW_TAG_member ] [a] [line 5, size 32, align 32, offset 0] [private] [from int]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = !{!"0x2e\00A\00A\00\003\000\000\000\006\00256\000\003", !1, !"_ZTS1A", !9, null, null, null, i32 0, !12} ; [ DW_TAG_subprogram ] [line 3] [A]
+!9 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{null, !11, !7}
+!11 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!12 = !{i32 786468}
+!13 = !{!14, !18, !19}
+!14 = !{!"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\008\001\001\000\006\00256\000\008", !1, !15, !16, null, void ()* @__cxx_global_var_init, null, null, !2} ; [ DW_TAG_subprogram ] [line 8] [local] [def] [__cxx_global_var_init]
+!15 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/baz.cpp]
+!16 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = !{null}
+!18 = !{!"0x2e\00A\00A\00_ZN1AC2Ei\003\000\001\000\006\00256\000\003", !1, !"_ZTS1A", !9, null, void (%class.A*, i32)* @_ZN1AC2Ei, null, !8, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!19 = !{!"0x2e\00\00\00_GLOBAL__I_a\003\001\001\000\006\0064\000\003", !1, !15, !20, null, void ()* @_GLOBAL__I_a, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [local] [def]
+!20 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = !{!22}
+!22 = !{!"0x34\00a\00a\00\008\000\001", null, !15, !4, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 8] [def]
+!23 = !{i32 2, !"Dwarf Version", i32 4}
+!24 = !{i32 1, !"Debug Info Version", i32 2}
+!25 = !{!"clang version 3.5 (trunk 199923) (llvm/trunk 199940)"}
+!26 = !MDLocation(line: 8, scope: !14)
+!27 = !{!"0x101\00this\0016777216\001088", !18, null, !28} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!28 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!29 = !MDLocation(line: 0, scope: !18)
+!30 = !{!"0x101\00i\0033554435\000", !18, !15, !7} ; [ DW_TAG_arg_variable ] [i] [line 3]
+!31 = !MDLocation(line: 3, scope: !18)
+!32 = !MDLocation(line: 3, scope: !19)
diff --git a/test/DebugInfo/X86/cu-ranges.ll b/test/DebugInfo/X86/cu-ranges.ll
index a9821b0..0d872d8 100644
--- a/test/DebugInfo/X86/cu-ranges.ll
+++ b/test/DebugInfo/X86/cu-ranges.ll
@@ -29,7 +29,7 @@ define i32 @foo(i32 %a) #0 {
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !13, metadata !{!"0x102"}), !dbg !14
   %0 = load i32* %a.addr, align 4, !dbg !14
   %add = add nsw i32 %0, 1, !dbg !14
   ret i32 %add, !dbg !14
@@ -43,7 +43,7 @@ define i32 @bar(i32 %b) #0 {
 entry:
   %b.addr = alloca i32, align 4
   store i32 %b, i32* %b.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
+  call void @llvm.dbg.declare(metadata i32* %b.addr, metadata !15, metadata !{!"0x102"}), !dbg !16
   %0 = load i32* %b.addr, align 4, !dbg !16
   %add = add nsw i32 %0, 2, !dbg !16
   ret i32 %add, !dbg !16
@@ -56,20 +56,20 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/z.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"z.c", metadata !"/usr/local/google/home/echristo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !9}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/z.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x2e\00bar\00bar\00\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @bar, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
-!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!12 = metadata !{metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
-!13 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 1]
-!14 = metadata !{i32 1, i32 0, metadata !4, null}
-!15 = metadata !{metadata !"0x101\00b\0016777218\000", metadata !9, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [b] [line 2]
-!16 = metadata !{i32 2, i32 0, metadata !9, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/z.c] [DW_LANG_C99]
+!1 = !{!"z.c", !"/usr/local/google/home/echristo"}
+!2 = !{}
+!3 = !{!4, !9}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/z.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x2e\00bar\00bar\00\002\000\001\000\006\00256\000\002", !1, !5, !6, null, i32 (i32)* @bar, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 1, !"Debug Info Version", i32 2}
+!12 = !{!"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
+!13 = !{!"0x101\00a\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [a] [line 1]
+!14 = !MDLocation(line: 1, scope: !4)
+!15 = !{!"0x101\00b\0016777218\000", !9, !5, !8} ; [ DW_TAG_arg_variable ] [b] [line 2]
+!16 = !MDLocation(line: 2, scope: !9)
diff --git a/test/DebugInfo/X86/data_member_location.ll b/test/DebugInfo/X86/data_member_location.ll
index db88bb1..4fdcc5d 100644
--- a/test/DebugInfo/X86/data_member_location.ll
+++ b/test/DebugInfo/X86/data_member_location.ll
@@ -34,20 +34,20 @@
 !llvm.module.flags = !{!13, !15}
 !llvm.ident = !{!14}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !2, metadata !10, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/data_member_location.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"data_member_location.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00foo\001\0064\0032\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 64, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !8}
-!6 = metadata !{metadata !"0xd\00c\002\008\008\000\000", metadata !1, metadata !"_ZTS3foo", metadata !7} ; [ DW_TAG_member ] [c] [line 2, size 8, align 8, offset 0] [from char]
-!7 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!8 = metadata !{metadata !"0xd\00i\003\0032\0032\0032\000", metadata !1, metadata !"_ZTS3foo", metadata !9} ; [ DW_TAG_member ] [i] [line 3, size 32, align 32, offset 32] [from int]
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0x34\00f\00f\00\006\000\001", null, metadata !12, metadata !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
-!12 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/data_member_location.cpp]
-!13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!14 = metadata !{metadata !"clang version 3.4 "}
-
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 \000\00\000\00\000", !1, !2, !3, !2, !10, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/data_member_location.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"data_member_location.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00foo\001\0064\0032\000\000\000", !1, null, null, !5, null, null, !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 64, align 32, offset 0] [def] [from ]
+!5 = !{!6, !8}
+!6 = !{!"0xd\00c\002\008\008\000\000", !1, !"_ZTS3foo", !7} ; [ DW_TAG_member ] [c] [line 2, size 8, align 8, offset 0] [from char]
+!7 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!8 = !{!"0xd\00i\003\0032\0032\0032\000", !1, !"_ZTS3foo", !9} ; [ DW_TAG_member ] [i] [line 3, size 32, align 32, offset 32] [from int]
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!11}
+!11 = !{!"0x34\00f\00f\00\006\000\001", null, !12, !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
+!12 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/data_member_location.cpp]
+!13 = !{i32 2, !"Dwarf Version", i32 4}
+!14 = !{!"clang version 3.4 "}
+
+!15 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-at-specficiation.ll b/test/DebugInfo/X86/dbg-at-specficiation.ll
index 034574b..8269699 100644
--- a/test/DebugInfo/X86/dbg-at-specficiation.ll
+++ b/test/DebugInfo/X86/dbg-at-specficiation.ll
@@ -8,14 +8,14 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 140253)\001\00\000\00\000", metadata !11, metadata !2, metadata !2, metadata !2, metadata !3, null} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !6, metadata !7, [10 x i32]* @a, null} ; [ DW_TAG_variable ]
-!6 = metadata !{metadata !"0x29", metadata !11} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x1\00\000\00320\0032\000\000", null, null, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x21\000\0010"}        ; [ DW_TAG_subrange_type ]
-!11 = metadata !{metadata !"x.c", metadata !"/private/tmp"}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.0 (trunk 140253)\001\00\000\00\000", !11, !2, !2, !2, !3, null} ; [ DW_TAG_compile_unit ]
+!2 = !{}
+!3 = !{!5}
+!5 = !{!"0x34\00a\00a\00\001\000\001", null, !6, !7, [10 x i32]* @a, null} ; [ DW_TAG_variable ]
+!6 = !{!"0x29", !11} ; [ DW_TAG_file_type ]
+!7 = !{!"0x1\00\000\00320\0032\000\000", null, null, !8, !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!9 = !{!10}
+!10 = !{!"0x21\000\0010"}        ; [ DW_TAG_subrange_type ]
+!11 = !{!"x.c", !"/private/tmp"}
+!12 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-byval-parameter.ll b/test/DebugInfo/X86/dbg-byval-parameter.ll
index 49cd6ba..713781f 100644
--- a/test/DebugInfo/X86/dbg-byval-parameter.ll
+++ b/test/DebugInfo/X86/dbg-byval-parameter.ll
@@ -9,7 +9,7 @@ entry:
   %retval = alloca double                         ; <double*> [#uses=2]
   %0 = alloca double                              ; <double*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0, metadata !{metadata !"0x102"}), !dbg !15
+  call void @llvm.dbg.declare(metadata %struct.Rect* %my_r0, metadata !0, metadata !{!"0x102"}), !dbg !15
   %1 = getelementptr inbounds %struct.Rect* %my_r0, i32 0, i32 0, !dbg !16 ; <%struct.Pt*> [#uses=1]
   %2 = getelementptr inbounds %struct.Pt* %1, i32 0, i32 0, !dbg !16 ; <double*> [#uses=1]
   %3 = load double* %2, align 8, !dbg !16         ; <double> [#uses=1]
@@ -28,25 +28,25 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{metadata !"0x101\00my_r0\0011\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0011\000\001\000\006\000\000\000", metadata !19, metadata !2, metadata !4, null, double (%struct.Rect*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !19, metadata !20, metadata !20, metadata !18, null,  null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !19, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6, metadata !7}
-!6 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !19, metadata !2} ; [ DW_TAG_base_type ]
-!7 = metadata !{metadata !"0x13\00Rect\006\00256\0064\000\000\000", metadata !19, metadata !2, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
-!8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{metadata !"0xd\00P1\007\00128\0064\000\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{metadata !"0x13\00Pt\001\00128\0064\000\000\000", metadata !19, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
-!11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{metadata !"0xd\00x\002\0064\0064\000\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
-!13 = metadata !{metadata !"0xd\00y\003\0064\0064\0064\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
-!14 = metadata !{metadata !"0xd\00P2\008\00128\0064\00128\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
-!15 = metadata !{i32 11, i32 0, metadata !1, null}
-!16 = metadata !{i32 12, i32 0, metadata !17, null}
-!17 = metadata !{metadata !"0xb\0011\000\000", metadata !19, metadata !1} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{metadata !1}
-!19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
-!20 = metadata !{i32 0}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00my_r0\0011\000", !1, !2, !7} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00foo\00foo\00foo\0011\000\001\000\006\000\000\000", !19, !2, !4, null, double (%struct.Rect*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !19} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", !19, !20, !20, !18, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !19, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6, !7}
+!6 = !{!"0x24\00double\000\0064\0064\000\000\004", !19, !2} ; [ DW_TAG_base_type ]
+!7 = !{!"0x13\00Rect\006\00256\0064\000\000\000", !19, !2, null, !8, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
+!8 = !{!9, !14}
+!9 = !{!"0xd\00P1\007\00128\0064\000\000", !19, !7, !10} ; [ DW_TAG_member ]
+!10 = !{!"0x13\00Pt\001\00128\0064\000\000\000", !19, !2, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
+!11 = !{!12, !13}
+!12 = !{!"0xd\00x\002\0064\0064\000\000", !19, !10, !6} ; [ DW_TAG_member ]
+!13 = !{!"0xd\00y\003\0064\0064\0064\000", !19, !10, !6} ; [ DW_TAG_member ]
+!14 = !{!"0xd\00P2\008\00128\0064\00128\000", !19, !7, !10} ; [ DW_TAG_member ]
+!15 = !MDLocation(line: 11, scope: !1)
+!16 = !MDLocation(line: 12, scope: !17)
+!17 = !{!"0xb\0011\000\000", !19, !1} ; [ DW_TAG_lexical_block ]
+!18 = !{!1}
+!19 = !{!"b2.c", !"/tmp/"}
+!20 = !{i32 0}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-const-int.ll b/test/DebugInfo/X86/dbg-const-int.ll
index c7e5e92..18abbdd 100644
--- a/test/DebugInfo/X86/dbg-const-int.ll
+++ b/test/DebugInfo/X86/dbg-const-int.ll
@@ -12,7 +12,7 @@ target triple = "x86_64-apple-macosx10.6.7"
 
 define i32 @foo() nounwind uwtable readnone optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !9
+  tail call void @llvm.dbg.value(metadata i32 42, i64 0, metadata !6, metadata !{!"0x102"}), !dbg !9
   ret i32 42, !dbg !10
 }
 
@@ -21,19 +21,19 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!15}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 132191)\001\00\000\00\000", metadata !13, metadata !14, metadata !14, metadata !11, null,  null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\000\001\000", metadata !13, metadata !2, metadata !3, null, i32 ()* @foo, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
-!2 = metadata !{metadata !"0x29", metadata !13} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !13, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x100\00i\002\000", metadata !7, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
-!7 = metadata !{metadata !"0xb\001\0011\000", metadata !13, metadata !1} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{i32 42}
-!9 = metadata !{i32 2, i32 12, metadata !7, null}
-!10 = metadata !{i32 3, i32 2, metadata !7, null}
-!11 = metadata !{metadata !1}
-!12 = metadata !{metadata !6}
-!13 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
-!14 = metadata !{i32 0}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.0 (trunk 132191)\001\00\000\00\000", !13, !14, !14, !11, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\000\001\000", !13, !2, !3, null, i32 ()* @foo, null, null, !12} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
+!2 = !{!"0x29", !13} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !13, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!6 = !{!"0x100\00i\002\000", !7, !2, !5} ; [ DW_TAG_auto_variable ]
+!7 = !{!"0xb\001\0011\000", !13, !1} ; [ DW_TAG_lexical_block ]
+!8 = !{i32 42}
+!9 = !MDLocation(line: 2, column: 12, scope: !7)
+!10 = !MDLocation(line: 3, column: 2, scope: !7)
+!11 = !{!1}
+!12 = !{!6}
+!13 = !{!"a.c", !"/private/tmp"}
+!14 = !{i32 0}
+!15 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-const.ll b/test/DebugInfo/X86/dbg-const.ll
index 20e8652..755565d 100644
--- a/test/DebugInfo/X86/dbg-const.ll
+++ b/test/DebugInfo/X86/dbg-const.ll
@@ -17,9 +17,9 @@ target triple = "x86_64-apple-darwin10.0.0"
 ;CHECK-NEXT:  .byte	42
 define i32 @foobar() nounwind readonly noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !9
+  tail call void @llvm.dbg.value(metadata i32 42, i64 0, metadata !6, metadata !{!"0x102"}), !dbg !9
   %call = tail call i32 @bar(), !dbg !11
-  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !11
+  tail call void @llvm.dbg.value(metadata i32 %call, i64 0, metadata !6, metadata !{!"0x102"}), !dbg !11
   %call2 = tail call i32 @bar(), !dbg !11
   %add = add nsw i32 %call2, %call, !dbg !12
   ret i32 %add, !dbg !10
@@ -31,21 +31,21 @@ declare i32 @bar() nounwind readnone
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!17}
 
-!0 = metadata !{metadata !"0x2e\00foobar\00foobar\00foobar\0012\000\001\000\006\000\001\000", metadata !15, metadata !1, metadata !3, null, i32 ()* @foobar, null, null, metadata !14} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !15} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 114183)\001\00\000\00\001", metadata !15, metadata !16, metadata !16, metadata !13, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !15, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !15, metadata !1} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x100\00j\0015\000", metadata !7, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!7 = metadata !{metadata !"0xb\0012\0052\000", metadata !15, metadata !0} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{i32 42}
-!9 = metadata !{i32 15, i32 12, metadata !7, null}
-!10 = metadata !{i32 23, i32 3, metadata !7, null}
-!11 = metadata !{i32 17, i32 3, metadata !7, null}
-!12 = metadata !{i32 18, i32 3, metadata !7, null}
-!13 = metadata !{metadata !0}
-!14 = metadata !{metadata !6}
-!15 = metadata !{metadata !"mu.c", metadata !"/private/tmp"}
-!16 = metadata !{i32 0}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00foobar\00foobar\00foobar\0012\000\001\000\006\000\001\000", !15, !1, !3, null, i32 ()* @foobar, null, null, !14} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !15} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.9 (trunk 114183)\001\00\000\00\001", !15, !16, !16, !13, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !15, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", !15, !1} ; [ DW_TAG_base_type ]
+!6 = !{!"0x100\00j\0015\000", !7, !1, !5} ; [ DW_TAG_auto_variable ]
+!7 = !{!"0xb\0012\0052\000", !15, !0} ; [ DW_TAG_lexical_block ]
+!8 = !{i32 42}
+!9 = !MDLocation(line: 15, column: 12, scope: !7)
+!10 = !MDLocation(line: 23, column: 3, scope: !7)
+!11 = !MDLocation(line: 17, column: 3, scope: !7)
+!12 = !MDLocation(line: 18, column: 3, scope: !7)
+!13 = !{!0}
+!14 = !{!6}
+!15 = !{!"mu.c", !"/private/tmp"}
+!16 = !{i32 0}
+!17 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-declare-arg.ll b/test/DebugInfo/X86/dbg-declare-arg.ll
index b589ed97..ef975dd 100644
--- a/test/DebugInfo/X86/dbg-declare-arg.ll
+++ b/test/DebugInfo/X86/dbg-declare-arg.ll
@@ -14,8 +14,8 @@ entry:
   %nrvo = alloca i1
   %cleanup.dest.slot = alloca i32
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !26, metadata !{metadata !"0x102"}), !dbg !27
-  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !28, metadata !{metadata !"0x102"}), !dbg !30
+  call void @llvm.dbg.declare(metadata i32* %i.addr, metadata !26, metadata !{!"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata i32* %j, metadata !28, metadata !{!"0x102"}), !dbg !30
   store i32 0, i32* %j, align 4, !dbg !31
   %tmp = load i32* %i.addr, align 4, !dbg !32
   %cmp = icmp eq i32 %tmp, 42, !dbg !32
@@ -29,7 +29,7 @@ if.then:                                          ; preds = %entry
 
 if.end:                                           ; preds = %if.then, %entry
   store i1 false, i1* %nrvo, !dbg !36
-  call void @llvm.dbg.declare(metadata !{%class.A* %agg.result}, metadata !37, metadata !{metadata !"0x102"}), !dbg !39
+  call void @llvm.dbg.declare(metadata %class.A* %agg.result, metadata !37, metadata !{!"0x102"}), !dbg !39
   %tmp2 = load i32* %j, align 4, !dbg !40
   %x = getelementptr inbounds %class.A* %agg.result, i32 0, i32 0, !dbg !40
   store i32 %tmp2, i32* %x, align 4, !dbg !40
@@ -52,7 +52,7 @@ define linkonce_odr void @_ZN1AD1Ev(%class.A* %this) unnamed_addr ssp align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !43, metadata !{metadata !"0x102"}), !dbg !44
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !43, metadata !{!"0x102"}), !dbg !44
   %this1 = load %class.A** %this.addr
   call void @_ZN1AD2Ev(%class.A* %this1)
   ret void, !dbg !45
@@ -62,7 +62,7 @@ define linkonce_odr void @_ZN1AD2Ev(%class.A* %this) unnamed_addr nounwind ssp a
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !46, metadata !{metadata !"0x102"}), !dbg !47
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !46, metadata !{!"0x102"}), !dbg !47
   %this1 = load %class.A** %this.addr
   %x = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !48
   store i32 1, i32* %x, align 4, !dbg !48
@@ -72,56 +72,56 @@ entry:
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!52}
 
-!0 = metadata !{metadata !"0x2e\00~A\00~A\00\002\000\000\000\006\00256\000\000", metadata !51, metadata !1, metadata !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x2\00A\002\00128\0032\000\000\000", metadata !51, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_class_type ] [A] [line 2, size 128, align 32, offset 0] [def] [from ]
-!2 = metadata !{metadata !"0x11\004\00clang version 3.0 (trunk 130127)\000\00\000\00\001", metadata !51, metadata !24, metadata !24, metadata !50, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x29", metadata !51} ; [ DW_TAG_file_type ]
-!4 = metadata !{metadata !5, metadata !7, metadata !8, metadata !9, metadata !0, metadata !10, metadata !14}
-!5 = metadata !{metadata !"0xd\00x\002\0032\0032\000\000", metadata !51, metadata !3, metadata !6} ; [ DW_TAG_member ]
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!7 = metadata !{metadata !"0xd\00y\002\0032\0032\0032\000", metadata !51, metadata !3, metadata !6} ; [ DW_TAG_member ]
-!8 = metadata !{metadata !"0xd\00z\002\0032\0032\0064\000", metadata !51, metadata !3, metadata !6} ; [ DW_TAG_member ]
-!9 = metadata !{metadata !"0xd\00o\002\0032\0032\0096\000", metadata !51, metadata !3, metadata !6} ; [ DW_TAG_member ]
-!10 = metadata !{metadata !"0x2e\00A\00A\00\002\000\000\000\006\00320\000\000", metadata !51, metadata !1, metadata !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !51, metadata !3, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{null, metadata !13}
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !2, null, metadata !1} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{metadata !"0x2e\00A\00A\00\002\000\000\000\006\00320\000\000", metadata !51, metadata !1, metadata !15, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !51, metadata !3, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null, metadata !13, metadata !17}
-!17 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, metadata !2, metadata !18} ; [ DW_TAG_reference_type ]
-!18 = metadata !{metadata !"0x26\00\000\000\000\000\000", metadata !2, null, metadata !1} ; [ DW_TAG_const_type ]
-!19 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi\004\000\001\000\006\00256\000\000", metadata !51, metadata !3, metadata !20, null, void (%class.A*, i32)* @_Z3fooi, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [foo]
-!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !51, metadata !3, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!21 = metadata !{metadata !1}
-!22 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD1Ev\002\000\001\000\006\00256\000\000", metadata !51, metadata !3, metadata !23, null, void (%class.A*)* @_ZN1AD1Ev, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [~A]
-!23 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !51, metadata !3, null, metadata !24, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!24 = metadata !{null}
-!25 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD2Ev\002\000\001\000\006\00256\000\000", metadata !51, metadata !3, metadata !23, null, void (%class.A*)* @_ZN1AD2Ev, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [~A]
-!26 = metadata !{metadata !"0x101\00i\0016777220\000", metadata !19, metadata !3, metadata !6} ; [ DW_TAG_arg_variable ]
-!27 = metadata !{i32 4, i32 11, metadata !19, null}
-!28 = metadata !{metadata !"0x100\00j\005\000", metadata !29, metadata !3, metadata !6} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{metadata !"0xb\004\0014\000", metadata !51, metadata !19} ; [ DW_TAG_lexical_block ]
-!30 = metadata !{i32 5, i32 7, metadata !29, null}
-!31 = metadata !{i32 5, i32 12, metadata !29, null}
-!32 = metadata !{i32 6, i32 3, metadata !29, null}
-!33 = metadata !{i32 7, i32 5, metadata !34, null}
-!34 = metadata !{metadata !"0xb\006\0016\001", metadata !51, metadata !29} ; [ DW_TAG_lexical_block ]
-!35 = metadata !{i32 8, i32 3, metadata !34, null}
-!36 = metadata !{i32 9, i32 9, metadata !29, null}
-!37 = metadata !{metadata !"0x100\00my_a\009\000", metadata !29, metadata !3, metadata !38} ; [ DW_TAG_auto_variable ]
-!38 = metadata !{metadata !"0x10\00\000\000\000\000\000", metadata !2, null, metadata !1} ; [ DW_TAG_reference_type ]
-!39 = metadata !{i32 9, i32 5, metadata !29, null}
-!40 = metadata !{i32 10, i32 3, metadata !29, null}
-!41 = metadata !{i32 11, i32 3, metadata !29, null}
-!42 = metadata !{i32 12, i32 1, metadata !29, null}
-!43 = metadata !{metadata !"0x101\00this\0016777218\0064", metadata !22, metadata !3, metadata !13} ; [ DW_TAG_arg_variable ]
-!44 = metadata !{i32 2, i32 47, metadata !22, null}
-!45 = metadata !{i32 2, i32 61, metadata !22, null}
-!46 = metadata !{metadata !"0x101\00this\0016777218\0064", metadata !25, metadata !3, metadata !13} ; [ DW_TAG_arg_variable ]
-!47 = metadata !{i32 2, i32 47, metadata !25, null}
-!48 = metadata !{i32 2, i32 54, metadata !49, null}
-!49 = metadata !{metadata !"0xb\002\0052\002", metadata !51, metadata !25} ; [ DW_TAG_lexical_block ]
-!50 = metadata !{metadata !19, metadata !22, metadata !25}
-!51 = metadata !{metadata !"a.cc", metadata !"/private/tmp"}
-!52 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00~A\00~A\00\002\000\000\000\006\00256\000\000", !51, !1, !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x2\00A\002\00128\0032\000\000\000", !51, !2, null, !4, null, null, null} ; [ DW_TAG_class_type ] [A] [line 2, size 128, align 32, offset 0] [def] [from ]
+!2 = !{!"0x11\004\00clang version 3.0 (trunk 130127)\000\00\000\00\001", !51, !24, !24, !50, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x29", !51} ; [ DW_TAG_file_type ]
+!4 = !{!5, !7, !8, !9, !0, !10, !14}
+!5 = !{!"0xd\00x\002\0032\0032\000\000", !51, !3, !6} ; [ DW_TAG_member ]
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!7 = !{!"0xd\00y\002\0032\0032\0032\000", !51, !3, !6} ; [ DW_TAG_member ]
+!8 = !{!"0xd\00z\002\0032\0032\0064\000", !51, !3, !6} ; [ DW_TAG_member ]
+!9 = !{!"0xd\00o\002\0032\0032\0096\000", !51, !3, !6} ; [ DW_TAG_member ]
+!10 = !{!"0x2e\00A\00A\00\002\000\000\000\006\00320\000\000", !51, !1, !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = !{!"0x15\00\000\000\000\000\000\000", !51, !3, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{null, !13}
+!13 = !{!"0xf\00\000\0064\0064\000\0064", !2, null, !1} ; [ DW_TAG_pointer_type ]
+!14 = !{!"0x2e\00A\00A\00\002\000\000\000\006\00320\000\000", !51, !1, !15, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!15 = !{!"0x15\00\000\000\000\000\000\000", !51, !3, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !13, !17}
+!17 = !{!"0x10\00\000\000\000\000\000", null, !2, !18} ; [ DW_TAG_reference_type ]
+!18 = !{!"0x26\00\000\000\000\000\000", !2, null, !1} ; [ DW_TAG_const_type ]
+!19 = !{!"0x2e\00foo\00foo\00_Z3fooi\004\000\001\000\006\00256\000\000", !51, !3, !20, null, void (%class.A*, i32)* @_Z3fooi, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [foo]
+!20 = !{!"0x15\00\000\000\000\000\000\000", !51, !3, null, !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = !{!1}
+!22 = !{!"0x2e\00~A\00~A\00_ZN1AD1Ev\002\000\001\000\006\00256\000\000", !51, !3, !23, null, void (%class.A*)* @_ZN1AD1Ev, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [~A]
+!23 = !{!"0x15\00\000\000\000\000\000\000", !51, !3, null, !24, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = !{null}
+!25 = !{!"0x2e\00~A\00~A\00_ZN1AD2Ev\002\000\001\000\006\00256\000\000", !51, !3, !23, null, void (%class.A*)* @_ZN1AD2Ev, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [~A]
+!26 = !{!"0x101\00i\0016777220\000", !19, !3, !6} ; [ DW_TAG_arg_variable ]
+!27 = !MDLocation(line: 4, column: 11, scope: !19)
+!28 = !{!"0x100\00j\005\000", !29, !3, !6} ; [ DW_TAG_auto_variable ]
+!29 = !{!"0xb\004\0014\000", !51, !19} ; [ DW_TAG_lexical_block ]
+!30 = !MDLocation(line: 5, column: 7, scope: !29)
+!31 = !MDLocation(line: 5, column: 12, scope: !29)
+!32 = !MDLocation(line: 6, column: 3, scope: !29)
+!33 = !MDLocation(line: 7, column: 5, scope: !34)
+!34 = !{!"0xb\006\0016\001", !51, !29} ; [ DW_TAG_lexical_block ]
+!35 = !MDLocation(line: 8, column: 3, scope: !34)
+!36 = !MDLocation(line: 9, column: 9, scope: !29)
+!37 = !{!"0x100\00my_a\009\000", !29, !3, !38} ; [ DW_TAG_auto_variable ]
+!38 = !{!"0x10\00\000\000\000\000\000", !2, null, !1} ; [ DW_TAG_reference_type ]
+!39 = !MDLocation(line: 9, column: 5, scope: !29)
+!40 = !MDLocation(line: 10, column: 3, scope: !29)
+!41 = !MDLocation(line: 11, column: 3, scope: !29)
+!42 = !MDLocation(line: 12, column: 1, scope: !29)
+!43 = !{!"0x101\00this\0016777218\0064", !22, !3, !13} ; [ DW_TAG_arg_variable ]
+!44 = !MDLocation(line: 2, column: 47, scope: !22)
+!45 = !MDLocation(line: 2, column: 61, scope: !22)
+!46 = !{!"0x101\00this\0016777218\0064", !25, !3, !13} ; [ DW_TAG_arg_variable ]
+!47 = !MDLocation(line: 2, column: 47, scope: !25)
+!48 = !MDLocation(line: 2, column: 54, scope: !49)
+!49 = !{!"0xb\002\0052\002", !51, !25} ; [ DW_TAG_lexical_block ]
+!50 = !{!19, !22, !25}
+!51 = !{!"a.cc", !"/private/tmp"}
+!52 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-declare.ll b/test/DebugInfo/X86/dbg-declare.ll
index fd30115..2ede97b 100644
--- a/test/DebugInfo/X86/dbg-declare.ll
+++ b/test/DebugInfo/X86/dbg-declare.ll
@@ -7,14 +7,14 @@ entry:
   %saved_stack = alloca i8*
   %cleanup.dest.slot = alloca i32
   store i32* %x, i32** %x.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i32** %x.addr}, metadata !14, metadata !{metadata !"0x102"}), !dbg !15
+  call void @llvm.dbg.declare(metadata i32** %x.addr, metadata !14, metadata !{!"0x102"}), !dbg !15
   %0 = load i32** %x.addr, align 8, !dbg !16
   %1 = load i32* %0, align 4, !dbg !16
   %2 = zext i32 %1 to i64, !dbg !16
   %3 = call i8* @llvm.stacksave(), !dbg !16
   store i8* %3, i8** %saved_stack, !dbg !16
   %vla = alloca i8, i64 %2, align 16, !dbg !16
-  call void @llvm.dbg.declare(metadata !{i8* %vla}, metadata !18, metadata !{metadata !"0x102"}), !dbg !23
+  call void @llvm.dbg.declare(metadata i8* %vla, metadata !18, metadata !{!"0x102"}), !dbg !23
   store i32 1, i32* %cleanup.dest.slot
   %4 = load i8** %saved_stack, !dbg !24
   call void @llvm.stackrestore(i8* %4), !dbg !24
@@ -30,27 +30,27 @@ declare void @llvm.stackrestore(i8*) nounwind
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!27}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 153698)\000\00\000\00\000", metadata !26, metadata !1, metadata !1, metadata !3, metadata !1, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00\006\000\001\000\006\00256\000\000", metadata !26, metadata !0, metadata !7, null, i32 (i32*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !10}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_const_type ]
-!14 = metadata !{metadata !"0x101\00x\0016777221\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 5, i32 21, metadata !5, null}
-!16 = metadata !{i32 7, i32 13, metadata !17, null}
-!17 = metadata !{metadata !"0xb\006\001\000", metadata !26, metadata !5} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{metadata !"0x100\00a\007\000", metadata !17, metadata !6, metadata !19} ; [ DW_TAG_auto_variable ]
-!19 = metadata !{metadata !"0x1\00\000\000\008\000\000", null, null, metadata !20, metadata !21, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 8, offset 0] [from char]
-!20 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
-!21 = metadata !{metadata !22}
-!22 = metadata !{metadata !"0x21\000\00-1"}        ; [ DW_TAG_subrange_type ]
-!23 = metadata !{i32 7, i32 8, metadata !17, null}
-!24 = metadata !{i32 9, i32 1, metadata !17, null}
-!25 = metadata !{i32 8, i32 3, metadata !17, null}
-!26 = metadata !{metadata !"20020104-2.c", metadata !"/Volumes/Sandbox/llvm"}
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.1 (trunk 153698)\000\00\000\00\000", !26, !1, !1, !3, !1, null} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00foo\00foo\00\006\000\001\000\006\00256\000\000", !26, !0, !7, null, i32 (i32*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !26} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !10}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ]
+!11 = !{!"0x26\00\000\000\000\000\000", null, null, !9} ; [ DW_TAG_const_type ]
+!14 = !{!"0x101\00x\0016777221\000", !5, !6, !10} ; [ DW_TAG_arg_variable ]
+!15 = !MDLocation(line: 5, column: 21, scope: !5)
+!16 = !MDLocation(line: 7, column: 13, scope: !17)
+!17 = !{!"0xb\006\001\000", !26, !5} ; [ DW_TAG_lexical_block ]
+!18 = !{!"0x100\00a\007\000", !17, !6, !19} ; [ DW_TAG_auto_variable ]
+!19 = !{!"0x1\00\000\000\008\000\000", null, null, !20, !21, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 8, offset 0] [from char]
+!20 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
+!21 = !{!22}
+!22 = !{!"0x21\000\00-1"}        ; [ DW_TAG_subrange_type ]
+!23 = !MDLocation(line: 7, column: 8, scope: !17)
+!24 = !MDLocation(line: 9, column: 1, scope: !17)
+!25 = !MDLocation(line: 8, column: 3, scope: !17)
+!26 = !{!"20020104-2.c", !"/Volumes/Sandbox/llvm"}
+!27 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-file-name.ll b/test/DebugInfo/X86/dbg-file-name.ll
index f1a9e78..94fdb37 100644
--- a/test/DebugInfo/X86/dbg-file-name.ll
+++ b/test/DebugInfo/X86/dbg-file-name.ll
@@ -12,13 +12,13 @@ define i32 @main() nounwind {
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!12}
 
-!1 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\001\00LLVM build 00\001\00\000\00\000", metadata !10, metadata !11, metadata !11, metadata !9, null, null} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !10, metadata !1} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00main\00main\00main\009\000\001\000\006\00256\000\000", metadata !10, metadata !1, metadata !7, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !5}
-!9 = metadata !{metadata !6}
-!10 = metadata !{metadata !"simple.c", metadata !"/Users/manav/one/two"}
-!11 = metadata !{i32 0}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!1 = !{!"0x29", !10} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\001\00LLVM build 00\001\00\000\00\000", !10, !11, !11, !9, null, null} ; [ DW_TAG_compile_unit ]
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", !10, !1} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00main\00main\00main\009\000\001\000\006\00256\000\000", !10, !1, !7, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", !10, !1, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!5}
+!9 = !{!6}
+!10 = !{!"simple.c", !"/Users/manav/one/two"}
+!11 = !{i32 0}
+!12 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-i128-const.ll b/test/DebugInfo/X86/dbg-i128-const.ll
index 0f5a03e..71654cb 100644
--- a/test/DebugInfo/X86/dbg-i128-const.ll
+++ b/test/DebugInfo/X86/dbg-i128-const.ll
@@ -5,7 +5,7 @@
 
 define i128 @__foo(i128 %a, i128 %b) nounwind {
 entry:
-  tail call void @llvm.dbg.value(metadata !0, i64 0, metadata !1, metadata !{metadata !"0x102"}), !dbg !11
+  tail call void @llvm.dbg.value(metadata i128 42 , i64 0, metadata !1, metadata !{!"0x102"}), !dbg !11
   %add = add i128 %a, %b, !dbg !11
   ret i128 %add, !dbg !11
 }
@@ -15,20 +15,20 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!5}
 !llvm.module.flags = !{!16}
 
-!0 = metadata !{i128 42 }
-!1 = metadata !{metadata !"0x100\00MAX\0029\000", metadata !2, metadata !4, metadata !8} ; [ DW_TAG_auto_variable ]
-!2 = metadata !{metadata !"0xb\0026\000\000", metadata !13, metadata !3} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{metadata !"0x2e\00__foo\00__foo\00__foo\0026\000\001\000\006\000\000\0026", metadata !13, metadata !4, metadata !6, null, i128 (i128, i128)* @__foo, null, null, null} ; [ DW_TAG_subprogram ]
-!4 = metadata !{metadata !"0x29", metadata !13} ; [ DW_TAG_file_type ]
-!5 = metadata !{metadata !"0x11\001\00clang\001\00\000\00\000", metadata !13, metadata !15, metadata !15, metadata !12, null,  null} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !13, metadata !4, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x16\00ti_int\0078\000\000\000\000", metadata !14, metadata !4, metadata !10} ; [ DW_TAG_typedef ]
-!9 = metadata !{metadata !"0x29", metadata !14} ; [ DW_TAG_file_type ]
-!10 = metadata !{metadata !"0x24\00\000\00128\00128\000\000\005", metadata !13, metadata !4} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 29, i32 0, metadata !2, null}
-!12 = metadata !{metadata !3}
-!13 = metadata !{metadata !"foo.c", metadata !"/tmp"}
-!14 = metadata !{metadata !"myint.h", metadata !"/tmp"}
-!15 = metadata !{i32 0}
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{i128 42 }
+!1 = !{!"0x100\00MAX\0029\000", !2, !4, !8} ; [ DW_TAG_auto_variable ]
+!2 = !{!"0xb\0026\000\000", !13, !3} ; [ DW_TAG_lexical_block ]
+!3 = !{!"0x2e\00__foo\00__foo\00__foo\0026\000\001\000\006\000\000\0026", !13, !4, !6, null, i128 (i128, i128)* @__foo, null, null, null} ; [ DW_TAG_subprogram ]
+!4 = !{!"0x29", !13} ; [ DW_TAG_file_type ]
+!5 = !{!"0x11\001\00clang\001\00\000\00\000", !13, !15, !15, !12, null,  null} ; [ DW_TAG_compile_unit ]
+!6 = !{!"0x15\00\000\000\000\000\000\000", !13, !4, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8, !8}
+!8 = !{!"0x16\00ti_int\0078\000\000\000\000", !14, !4, !10} ; [ DW_TAG_typedef ]
+!9 = !{!"0x29", !14} ; [ DW_TAG_file_type ]
+!10 = !{!"0x24\00\000\00128\00128\000\000\005", !13, !4} ; [ DW_TAG_base_type ]
+!11 = !MDLocation(line: 29, scope: !2)
+!12 = !{!3}
+!13 = !{!"foo.c", !"/tmp"}
+!14 = !{!"myint.h", !"/tmp"}
+!15 = !{i32 0}
+!16 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-merge-loc-entry.ll b/test/DebugInfo/X86/dbg-merge-loc-entry.ll
index f4f1788..0d56222 100644
--- a/test/DebugInfo/X86/dbg-merge-loc-entry.ll
+++ b/test/DebugInfo/X86/dbg-merge-loc-entry.ll
@@ -14,8 +14,8 @@ target triple = "x86_64-apple-darwin8"
 
 define hidden i128 @__divti3(i128 %u, i128 %v) nounwind readnone {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i128 %u}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !15
-  tail call void @llvm.dbg.value(metadata !16, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata i128 %u, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !15
+  tail call void @llvm.dbg.value(metadata i64 0, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !21
   br i1 undef, label %bb2, label %bb4, !dbg !22
 
 bb2:                                              ; preds = %entry
@@ -40,36 +40,36 @@ declare %0 @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!32}
 
-!0 = metadata !{metadata !"0x2e\00__udivmodti4\00__udivmodti4\00\00879\001\001\000\006\00256\001\00879", metadata !29, metadata !1, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !29} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !29, metadata !31, metadata !31, metadata !28, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !29, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5, metadata !5, metadata !5, metadata !8}
-!5 = metadata !{metadata !"0x16\00UTItype\00166\000\000\000\000", metadata !30, metadata !6, metadata !7} ; [ DW_TAG_typedef ]
-!6 = metadata !{metadata !"0x29", metadata !30} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x24\00\000\00128\00128\000\000\007", metadata !29, metadata !1} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !29, metadata !1, metadata !5} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{metadata !"0x2e\00__divti3\00__divti3\00__divti3\001094\000\001\000\006\00256\001\001094", metadata !29, metadata !1, metadata !10, null, i128 (i128, i128)* @__divti3, null, null, null} ; [ DW_TAG_subprogram ]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !29, metadata !1, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{metadata !12, metadata !12, metadata !12}
-!12 = metadata !{metadata !"0x16\00TItype\00160\000\000\000\000", metadata !30, metadata !6, metadata !13} ; [ DW_TAG_typedef ]
-!13 = metadata !{metadata !"0x24\00\000\00128\00128\000\000\005", metadata !29, metadata !1} ; [ DW_TAG_base_type ]
-!14 = metadata !{metadata !"0x101\00u\001093\000", metadata !9, metadata !1, metadata !12} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 1093, i32 0, metadata !9, null}
-!16 = metadata !{i64 0}
-!17 = metadata !{metadata !"0x100\00c\001095\000", metadata !18, metadata !1, metadata !19} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{metadata !"0xb\001094\000\0013", metadata !29, metadata !9} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{metadata !"0x16\00word_type\00424\000\000\000\000", metadata !30, metadata !6, metadata !20} ; [ DW_TAG_typedef ]
-!20 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", metadata !29, metadata !1} ; [ DW_TAG_base_type ]
-!21 = metadata !{i32 1095, i32 0, metadata !18, null}
-!22 = metadata !{i32 1103, i32 0, metadata !18, null}
-!23 = metadata !{i32 1104, i32 0, metadata !18, null}
-!24 = metadata !{i32 1003, i32 0, metadata !25, metadata !26}
-!25 = metadata !{metadata !"0xb\00879\000\000", metadata !29, metadata !0} ; [ DW_TAG_lexical_block ]
-!26 = metadata !{i32 1107, i32 0, metadata !18, null}
-!27 = metadata !{i32 1111, i32 0, metadata !18, null}
-!28 = metadata !{metadata !0, metadata !9}
-!29 = metadata !{metadata !"foobar.c", metadata !"/tmp"}
-!30 = metadata !{metadata !"foobar.h", metadata !"/tmp"}
-!31 = metadata !{i32 0}
-!32 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00__udivmodti4\00__udivmodti4\00\00879\001\001\000\006\00256\001\00879", !29, !1, !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !29} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !29, !31, !31, !28, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !29, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5, !5, !5, !8}
+!5 = !{!"0x16\00UTItype\00166\000\000\000\000", !30, !6, !7} ; [ DW_TAG_typedef ]
+!6 = !{!"0x29", !30} ; [ DW_TAG_file_type ]
+!7 = !{!"0x24\00\000\00128\00128\000\000\007", !29, !1} ; [ DW_TAG_base_type ]
+!8 = !{!"0xf\00\000\0064\0064\000\000", !29, !1, !5} ; [ DW_TAG_pointer_type ]
+!9 = !{!"0x2e\00__divti3\00__divti3\00__divti3\001094\000\001\000\006\00256\001\001094", !29, !1, !10, null, i128 (i128, i128)* @__divti3, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = !{!"0x15\00\000\000\000\000\000\000", !29, !1, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{!12, !12, !12}
+!12 = !{!"0x16\00TItype\00160\000\000\000\000", !30, !6, !13} ; [ DW_TAG_typedef ]
+!13 = !{!"0x24\00\000\00128\00128\000\000\005", !29, !1} ; [ DW_TAG_base_type ]
+!14 = !{!"0x101\00u\001093\000", !9, !1, !12} ; [ DW_TAG_arg_variable ]
+!15 = !MDLocation(line: 1093, scope: !9)
+!16 = !{i64 0}
+!17 = !{!"0x100\00c\001095\000", !18, !1, !19} ; [ DW_TAG_auto_variable ]
+!18 = !{!"0xb\001094\000\0013", !29, !9} ; [ DW_TAG_lexical_block ]
+!19 = !{!"0x16\00word_type\00424\000\000\000\000", !30, !6, !20} ; [ DW_TAG_typedef ]
+!20 = !{!"0x24\00long int\000\0064\0064\000\000\005", !29, !1} ; [ DW_TAG_base_type ]
+!21 = !MDLocation(line: 1095, scope: !18)
+!22 = !MDLocation(line: 1103, scope: !18)
+!23 = !MDLocation(line: 1104, scope: !18)
+!24 = !MDLocation(line: 1003, scope: !25, inlinedAt: !26)
+!25 = !{!"0xb\00879\000\000", !29, !0} ; [ DW_TAG_lexical_block ]
+!26 = !MDLocation(line: 1107, scope: !18)
+!27 = !MDLocation(line: 1111, scope: !18)
+!28 = !{!0, !9}
+!29 = !{!"foobar.c", !"/tmp"}
+!30 = !{!"foobar.h", !"/tmp"}
+!31 = !{i32 0}
+!32 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-prolog-end.ll b/test/DebugInfo/X86/dbg-prolog-end.ll
index f51dd70..4aaaf4a 100644
--- a/test/DebugInfo/X86/dbg-prolog-end.ll
+++ b/test/DebugInfo/X86/dbg-prolog-end.ll
@@ -8,8 +8,8 @@ entry:
   %i.addr = alloca i32, align 4
   %j = alloca i32, align 4
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !7, metadata !{metadata !"0x102"}), !dbg !8
-  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !9, metadata !{metadata !"0x102"}), !dbg !11
+  call void @llvm.dbg.declare(metadata i32* %i.addr, metadata !7, metadata !{!"0x102"}), !dbg !8
+  call void @llvm.dbg.declare(metadata i32* %j, metadata !9, metadata !{!"0x102"}), !dbg !11
   store i32 2, i32* %j, align 4, !dbg !12
   %tmp = load i32* %j, align 4, !dbg !13
   %inc = add nsw i32 %tmp, 1, !dbg !13
@@ -34,26 +34,26 @@ entry:
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21}
-!18 = metadata !{metadata !1, metadata !6}
+!18 = !{!1, !6}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 131100)\000\00\000\00\000", metadata !19, metadata !20, metadata !20, metadata !18, null,  null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !19, metadata !2, metadata !3, null, i32 (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!2 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !19, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\000\000\007", metadata !19, metadata !2, metadata !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!7 = metadata !{metadata !"0x101\00i\0016777217\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 1, i32 13, metadata !1, null}
-!9 = metadata !{metadata !"0x100\00j\002\000", metadata !10, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{metadata !"0xb\001\0016\000", metadata !19, metadata !1} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 2, i32 6, metadata !10, null}
-!12 = metadata !{i32 2, i32 11, metadata !10, null}
-!13 = metadata !{i32 3, i32 2, metadata !10, null}
-!14 = metadata !{i32 4, i32 2, metadata !10, null}
-!15 = metadata !{i32 5, i32 2, metadata !10, null}
-!16 = metadata !{i32 8, i32 2, metadata !17, null}
-!17 = metadata !{metadata !"0xb\007\0012\001", metadata !19, metadata !6} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{metadata !"/tmp/a.c", metadata !"/private/tmp"}
-!20 = metadata !{i32 0}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.0 (trunk 131100)\000\00\000\00\000", !19, !20, !20, !18, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !19, !2, !3, null, i32 (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!2 = !{!"0x29", !19} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !19, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00main\00main\00\007\000\001\000\006\000\000\007", !19, !2, !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!7 = !{!"0x101\00i\0016777217\000", !1, !2, !5} ; [ DW_TAG_arg_variable ]
+!8 = !MDLocation(line: 1, column: 13, scope: !1)
+!9 = !{!"0x100\00j\002\000", !10, !2, !5} ; [ DW_TAG_auto_variable ]
+!10 = !{!"0xb\001\0016\000", !19, !1} ; [ DW_TAG_lexical_block ]
+!11 = !MDLocation(line: 2, column: 6, scope: !10)
+!12 = !MDLocation(line: 2, column: 11, scope: !10)
+!13 = !MDLocation(line: 3, column: 2, scope: !10)
+!14 = !MDLocation(line: 4, column: 2, scope: !10)
+!15 = !MDLocation(line: 5, column: 2, scope: !10)
+!16 = !MDLocation(line: 8, column: 2, scope: !17)
+!17 = !{!"0xb\007\0012\001", !19, !6} ; [ DW_TAG_lexical_block ]
+!19 = !{!"/tmp/a.c", !"/private/tmp"}
+!20 = !{i32 0}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-subrange.ll b/test/DebugInfo/X86/dbg-subrange.ll
index 8102779..89754b9 100644
--- a/test/DebugInfo/X86/dbg-subrange.ll
+++ b/test/DebugInfo/X86/dbg-subrange.ll
@@ -15,21 +15,21 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 144833)\000\00\000\00\000", metadata !21, metadata !1, metadata !1, metadata !3, metadata !11,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00bar\00bar\00\004\000\001\000\006\00256\000\000", metadata !21, metadata !6, metadata !7, null, void ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [bar]
-!6 = metadata !{metadata !"0x29", metadata !21} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!11 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x34\00s\00s\00\002\000\001", null, metadata !6, metadata !14, [4294967296 x i8]* @s, null} ; [ DW_TAG_variable ]
-!14 = metadata !{metadata !"0x1\00\000\0034359738368\008\000\000", null, null, metadata !15, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 34359738368, align 8, offset 0] [from char]
-!15 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
-!16 = metadata !{metadata !17}
-!17 = metadata !{metadata !"0x21\000\004294967296"} ; [ DW_TAG_subrange_type ]
-!18 = metadata !{i32 5, i32 3, metadata !19, null}
-!19 = metadata !{metadata !"0xb\004\001\000", metadata !21, metadata !5} ; [ DW_TAG_lexical_block ]
-!20 = metadata !{i32 6, i32 1, metadata !19, null}
-!21 = metadata !{metadata !"small.c", metadata !"/private/tmp"}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.1 (trunk 144833)\000\00\000\00\000", !21, !1, !1, !3, !11,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00bar\00bar\00\004\000\001\000\006\00256\000\000", !21, !6, !7, null, void ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [bar]
+!6 = !{!"0x29", !21} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!11 = !{!13}
+!13 = !{!"0x34\00s\00s\00\002\000\001", null, !6, !14, [4294967296 x i8]* @s, null} ; [ DW_TAG_variable ]
+!14 = !{!"0x1\00\000\0034359738368\008\000\000", null, null, !15, !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 34359738368, align 8, offset 0] [from char]
+!15 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
+!16 = !{!17}
+!17 = !{!"0x21\000\004294967296"} ; [ DW_TAG_subrange_type ]
+!18 = !MDLocation(line: 5, column: 3, scope: !19)
+!19 = !{!"0xb\004\001\000", !21, !5} ; [ DW_TAG_lexical_block ]
+!20 = !MDLocation(line: 6, column: 1, scope: !19)
+!21 = !{!"small.c", !"/private/tmp"}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-value-const-byref.ll b/test/DebugInfo/X86/dbg-value-const-byref.ll
index 0182d65..c8ffba8 100644
--- a/test/DebugInfo/X86/dbg-value-const-byref.ll
+++ b/test/DebugInfo/X86/dbg-value-const-byref.ll
@@ -50,13 +50,13 @@ target triple = "x86_64-apple-macosx10.9.0"
 define i32 @foo() #0 {
 entry:
   %i = alloca i32, align 4
-  call void @llvm.dbg.value(metadata !14, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !15
+  call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !15
   %call = call i32 @f3(i32 3) #3, !dbg !16
-  call void @llvm.dbg.value(metadata !17, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !18
+  call void @llvm.dbg.value(metadata i32 7, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !18
   %call1 = call i32 (...)* @f1() #3, !dbg !19
-  call void @llvm.dbg.value(metadata !{i32 %call1}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !19
+  call void @llvm.dbg.value(metadata i32 %call1, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !19
   store i32 %call1, i32* %i, align 4, !dbg !19, !tbaa !20
-  call void @llvm.dbg.value(metadata !{i32* %i}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !24
   call void @f2(i32* %i) #3, !dbg !24
   ret i32 0, !dbg !25
 }
@@ -78,29 +78,29 @@ attributes #3 = { nounwind }
 !llvm.module.flags = !{!11, !12}
 !llvm.ident = !{!13}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [dbg-value-const-byref.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"dbg-value-const-byref.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\005\000\001\000\006\000\001\005", metadata !1, metadata !5, metadata !6, null, i32 ()* @foo, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [dbg-value-const-byref.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x100\00i\006\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 6]
-!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!13 = metadata !{metadata !"clang version 3.5.0 "}
-!14 = metadata !{i32 3}
-!15 = metadata !{i32 6, i32 0, metadata !4, null}
-!16 = metadata !{i32 7, i32 0, metadata !4, null}
-!17 = metadata !{i32 7}
-!18 = metadata !{i32 8, i32 0, metadata !4, null}
-!19 = metadata !{i32 9, i32 0, metadata !4, null}
-!20 = metadata !{metadata !21, metadata !21, i64 0}
-!21 = metadata !{metadata !"int", metadata !22, i64 0}
-!22 = metadata !{metadata !"omnipotent char", metadata !23, i64 0}
-!23 = metadata !{metadata !"Simple C/C++ TBAA"}
-!24 = metadata !{i32 10, i32 0, metadata !4, null}
-!25 = metadata !{i32 11, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [dbg-value-const-byref.c] [DW_LANG_C99]
+!1 = !{!"dbg-value-const-byref.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\005\000\001\000\006\000\001\005", !1, !5, !6, null, i32 ()* @foo, null, null, !9} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [dbg-value-const-byref.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10}
+!10 = !{!"0x100\00i\006\000", !4, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 6]
+!11 = !{i32 2, !"Dwarf Version", i32 2}
+!12 = !{i32 1, !"Debug Info Version", i32 2}
+!13 = !{!"clang version 3.5.0 "}
+!14 = !{i32 3}
+!15 = !MDLocation(line: 6, scope: !4)
+!16 = !MDLocation(line: 7, scope: !4)
+!17 = !{i32 7}
+!18 = !MDLocation(line: 8, scope: !4)
+!19 = !MDLocation(line: 9, scope: !4)
+!20 = !{!21, !21, i64 0}
+!21 = !{!"int", !22, i64 0}
+!22 = !{!"omnipotent char", !23, i64 0}
+!23 = !{!"Simple C/C++ TBAA"}
+!24 = !MDLocation(line: 10, scope: !4)
+!25 = !MDLocation(line: 11, scope: !4)
diff --git a/test/DebugInfo/X86/dbg-value-dag-combine.ll b/test/DebugInfo/X86/dbg-value-dag-combine.ll
index cf839b2..9392da9 100644
--- a/test/DebugInfo/X86/dbg-value-dag-combine.ll
+++ b/test/DebugInfo/X86/dbg-value-dag-combine.ll
@@ -8,15 +8,15 @@ declare <4 x i32> @__amdil_get_global_id_int()
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 define void @__OpenCL_test_kernel(i32 addrspace(1)* %ip) nounwind {
 entry:
-  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata !7, metadata !{metadata !"0x102"}), !dbg !8
+  call void @llvm.dbg.value(metadata i32 addrspace(1)* %ip, i64 0, metadata !7, metadata !{!"0x102"}), !dbg !8
   %0 = call <4 x i32> @__amdil_get_global_id_int() nounwind
   %1 = extractelement <4 x i32> %0, i32 0
-  call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !9, metadata !{metadata !"0x102"}), !dbg !11
-  call void @llvm.dbg.value(metadata !12, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !9, metadata !{!"0x102"}), !dbg !11
+  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !14
   %tmp2 = load i32 addrspace(1)* %ip, align 4, !dbg !15
   %tmp3 = add i32 0, %tmp2, !dbg !15
 ; CHECK:  ##DEBUG_VALUE: idx <- E{{..$}}
-  call void @llvm.dbg.value(metadata !{i32 %tmp3}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !15
+  call void @llvm.dbg.value(metadata i32 %tmp3, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !15
   %arrayidx = getelementptr i32 addrspace(1)* %ip, i32 %1, !dbg !16
   store i32 %tmp3, i32 addrspace(1)* %arrayidx, align 4, !dbg !16
   ret void, !dbg !17
@@ -24,24 +24,24 @@ entry:
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{metadata !"0x2e\00__OpenCL_test_kernel\00__OpenCL_test_kernel\00__OpenCL_test_kernel\002\000\001\000\006\000\000\000", metadata !19, metadata !1, metadata !3, null, void (i32 addrspace(1)*)* @__OpenCL_test_kernel, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [__OpenCL_test_kernel]
-!1 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\001\00clc\000\00\000\00\001", metadata !19, metadata !12, metadata !12, metadata !18, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !19, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null, metadata !5}
-!5 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !6} ; [ DW_TAG_pointer_type ]
-!6 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !2} ; [ DW_TAG_base_type ]
-!7 = metadata !{metadata !"0x101\00ip\001\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 1, i32 42, metadata !0, null}
-!9 = metadata !{metadata !"0x100\00gid\003\000", metadata !10, metadata !1, metadata !6} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{metadata !"0xb\002\001\000", metadata !19, metadata !0} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 3, i32 41, metadata !10, null}
-!12 = metadata !{i32 0}
-!13 = metadata !{metadata !"0x100\00idx\004\000", metadata !10, metadata !1, metadata !6} ; [ DW_TAG_auto_variable ]
-!14 = metadata !{i32 4, i32 20, metadata !10, null}
-!15 = metadata !{i32 5, i32 15, metadata !10, null}
-!16 = metadata !{i32 6, i32 18, metadata !10, null}
-!17 = metadata !{i32 7, i32 1, metadata !0, null}
-!18 = metadata !{metadata !0}
-!19 = metadata !{metadata !"OCL6368.tmp.cl", metadata !"E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp"}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00__OpenCL_test_kernel\00__OpenCL_test_kernel\00__OpenCL_test_kernel\002\000\001\000\006\000\000\000", !19, !1, !3, null, void (i32 addrspace(1)*)* @__OpenCL_test_kernel, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [__OpenCL_test_kernel]
+!1 = !{!"0x29", !19} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\001\00clc\000\00\000\00\001", !19, !12, !12, !18, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !19, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null, !5}
+!5 = !{!"0xf\00\000\0032\0032\000\000", null, !2, !6} ; [ DW_TAG_pointer_type ]
+!6 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, !2} ; [ DW_TAG_base_type ]
+!7 = !{!"0x101\00ip\001\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!8 = !MDLocation(line: 1, column: 42, scope: !0)
+!9 = !{!"0x100\00gid\003\000", !10, !1, !6} ; [ DW_TAG_auto_variable ]
+!10 = !{!"0xb\002\001\000", !19, !0} ; [ DW_TAG_lexical_block ]
+!11 = !MDLocation(line: 3, column: 41, scope: !10)
+!12 = !{i32 0}
+!13 = !{!"0x100\00idx\004\000", !10, !1, !6} ; [ DW_TAG_auto_variable ]
+!14 = !MDLocation(line: 4, column: 20, scope: !10)
+!15 = !MDLocation(line: 5, column: 15, scope: !10)
+!16 = !MDLocation(line: 6, column: 18, scope: !10)
+!17 = !MDLocation(line: 7, column: 1, scope: !0)
+!18 = !{!0}
+!19 = !{!"OCL6368.tmp.cl", !"E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp"}
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
index 2f0454e..31833a8 100644
--- a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
+++ b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
@@ -45,8 +45,8 @@
 
 define i32 @foo(%struct.S1* nocapture %sp, i32 %nums) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.S1* %sp}, i64 0, metadata !9, metadata !{metadata !"0x102"}), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{i32 %nums}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata %struct.S1* %sp, i64 0, metadata !9, metadata !{!"0x102"}), !dbg !20
+  tail call void @llvm.dbg.value(metadata i32 %nums, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !21
   %tmp2 = getelementptr inbounds %struct.S1* %sp, i64 0, i32 1, !dbg !22
   store i32 %nums, i32* %tmp2, align 4, !dbg !22
   %call = tail call float* @bar(i32 %nums) nounwind optsize, !dbg !27
@@ -61,8 +61,8 @@ declare float* @bar(i32) optsize
 
 define void @foobar() nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !30, i64 0, metadata !9, metadata !{metadata !"0x102"}) nounwind, !dbg !31
-  tail call void @llvm.dbg.value(metadata !34, i64 0, metadata !18, metadata !{metadata !"0x102"}) nounwind, !dbg !35
+  tail call void @llvm.dbg.value(metadata %struct.S1* @p, i64 0, metadata !9, metadata !{!"0x102"}) nounwind, !dbg !31
+  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !18, metadata !{!"0x102"}) nounwind, !dbg !35
   store i32 1, i32* getelementptr inbounds (%struct.S1* @p, i64 0, i32 1), align 8, !dbg !36
   %call.i = tail call float* @bar(i32 1) nounwind optsize, !dbg !37
   store float* %call.i, float** getelementptr inbounds (%struct.S1* @p, i64 0, i32 0), align 8, !dbg !37
@@ -74,44 +74,44 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!43}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00\008\000\001\000\006\00256\001\008", metadata !1, metadata !1, metadata !3, null, i32 (%struct.S1*, i32)* @foo, null, null, metadata !41} ; [ DW_TAG_subprogram ] [line 8] [def] [foo]
-!1 = metadata !{metadata !"0x29", metadata !42} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 125693)\001\00\000\00\001", metadata !42, metadata !8, metadata !8, metadata !39, metadata !40,  metadata !44} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !42, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00foobar\00foobar\00\0015\000\001\000\006\000\001\000", metadata !1, metadata !1, metadata !7, null, void ()* @foobar, null, null, null} ; [ DW_TAG_subprogram ] [line 15] [def] [scope 0] [foobar]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !42, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!9 = metadata !{metadata !"0x101\00sp\0016777223\000", metadata !0, metadata !1, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !"0x16\00S1\004\000\000\000\000", metadata !42, metadata !2, metadata !12} ; [ DW_TAG_typedef ]
-!12 = metadata !{metadata !"0x13\00S1\001\00128\0064\000\000\000", metadata !42, metadata !2, null, metadata !13, null, null, null} ; [ DW_TAG_structure_type ] [S1] [line 1, size 128, align 64, offset 0] [def] [from ]
-!13 = metadata !{metadata !14, metadata !17}
-!14 = metadata !{metadata !"0xd\00m\002\0064\0064\000\000", metadata !42, metadata !1, metadata !15} ; [ DW_TAG_member ]
-!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !16} ; [ DW_TAG_pointer_type ]
-!16 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
-!17 = metadata !{metadata !"0xd\00nums\003\0032\0032\0064\000", metadata !42, metadata !1, metadata !5} ; [ DW_TAG_member ]
-!18 = metadata !{metadata !"0x101\00nums\0033554439\000", metadata !0, metadata !1, metadata !5, metadata !32} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{metadata !"0x34\00p\00p\00\0014\000\001", metadata !2, metadata !1, metadata !11, %struct.S1* @p, null} ; [ DW_TAG_variable ]
-!20 = metadata !{i32 7, i32 13, metadata !0, null}
-!21 = metadata !{i32 7, i32 21, metadata !0, null}
-!22 = metadata !{i32 9, i32 3, metadata !23, null}
-!23 = metadata !{metadata !"0xb\008\001\000", metadata !1, metadata !0} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{i32 10, i32 3, metadata !23, null}
-!29 = metadata !{i32 11, i32 3, metadata !23, null}
-!30 = metadata !{%struct.S1* @p}
-!31 = metadata !{i32 7, i32 13, metadata !0, metadata !32}
-!32 = metadata !{i32 16, i32 3, metadata !33, null}
-!33 = metadata !{metadata !"0xb\0015\0015\001", metadata !1, metadata !6} ; [ DW_TAG_lexical_block ]
-!34 = metadata !{i32 1}
-!35 = metadata !{i32 7, i32 21, metadata !0, metadata !32}
-!36 = metadata !{i32 9, i32 3, metadata !23, metadata !32}
-!37 = metadata !{i32 10, i32 3, metadata !23, metadata !32}
-!38 = metadata !{i32 17, i32 1, metadata !33, null}
-!39 = metadata !{metadata !0, metadata !6}
-!40 = metadata !{metadata !19}
-!41 = metadata !{metadata !9, metadata !18}
-!42 = metadata !{metadata !"nm2.c", metadata !"/private/tmp"}
-!43 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!44 = metadata !{}
+!0 = !{!"0x2e\00foo\00foo\00\008\000\001\000\006\00256\001\008", !1, !1, !3, null, i32 (%struct.S1*, i32)* @foo, null, null, !41} ; [ DW_TAG_subprogram ] [line 8] [def] [foo]
+!1 = !{!"0x29", !42} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.9 (trunk 125693)\001\00\000\00\001", !42, !8, !8, !39, !40,  !44} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !42, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00foobar\00foobar\00\0015\000\001\000\006\000\001\000", !1, !1, !7, null, void ()* @foobar, null, null, null} ; [ DW_TAG_subprogram ] [line 15] [def] [scope 0] [foobar]
+!7 = !{!"0x15\00\000\000\000\000\000\000", !42, !1, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{!"0x101\00sp\0016777223\000", !0, !1, !10, !32} ; [ DW_TAG_arg_variable ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, !2, !11} ; [ DW_TAG_pointer_type ]
+!11 = !{!"0x16\00S1\004\000\000\000\000", !42, !2, !12} ; [ DW_TAG_typedef ]
+!12 = !{!"0x13\00S1\001\00128\0064\000\000\000", !42, !2, null, !13, null, null, null} ; [ DW_TAG_structure_type ] [S1] [line 1, size 128, align 64, offset 0] [def] [from ]
+!13 = !{!14, !17}
+!14 = !{!"0xd\00m\002\0064\0064\000\000", !42, !1, !15} ; [ DW_TAG_member ]
+!15 = !{!"0xf\00\000\0064\0064\000\000", null, !2, !16} ; [ DW_TAG_pointer_type ]
+!16 = !{!"0x24\00float\000\0032\0032\000\000\004", null, !2} ; [ DW_TAG_base_type ]
+!17 = !{!"0xd\00nums\003\0032\0032\0064\000", !42, !1, !5} ; [ DW_TAG_member ]
+!18 = !{!"0x101\00nums\0033554439\000", !0, !1, !5, !32} ; [ DW_TAG_arg_variable ]
+!19 = !{!"0x34\00p\00p\00\0014\000\001", !2, !1, !11, %struct.S1* @p, null} ; [ DW_TAG_variable ]
+!20 = !MDLocation(line: 7, column: 13, scope: !0)
+!21 = !MDLocation(line: 7, column: 21, scope: !0)
+!22 = !MDLocation(line: 9, column: 3, scope: !23)
+!23 = !{!"0xb\008\001\000", !1, !0} ; [ DW_TAG_lexical_block ]
+!27 = !MDLocation(line: 10, column: 3, scope: !23)
+!29 = !MDLocation(line: 11, column: 3, scope: !23)
+!30 = !{%struct.S1* @p}
+!31 = !MDLocation(line: 7, column: 13, scope: !0, inlinedAt: !32)
+!32 = !MDLocation(line: 16, column: 3, scope: !33)
+!33 = !{!"0xb\0015\0015\001", !1, !6} ; [ DW_TAG_lexical_block ]
+!34 = !{i32 1}
+!35 = !MDLocation(line: 7, column: 21, scope: !0, inlinedAt: !32)
+!36 = !MDLocation(line: 9, column: 3, scope: !23, inlinedAt: !32)
+!37 = !MDLocation(line: 10, column: 3, scope: !23, inlinedAt: !32)
+!38 = !MDLocation(line: 17, column: 1, scope: !33)
+!39 = !{!0, !6}
+!40 = !{!19}
+!41 = !{!9, !18}
+!42 = !{!"nm2.c", !"/private/tmp"}
+!43 = !{i32 1, !"Debug Info Version", i32 2}
+!44 = !{}
diff --git a/test/DebugInfo/X86/dbg-value-isel.ll b/test/DebugInfo/X86/dbg-value-isel.ll
index 6e5d81a..a908b32 100644
--- a/test/DebugInfo/X86/dbg-value-isel.ll
+++ b/test/DebugInfo/X86/dbg-value-isel.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-apple-darwin10.0.0"
 
 define void @__OpenCL_nbt02_kernel(i32 addrspace(1)* %ip) nounwind {
 entry:
-  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata !8, metadata !{metadata !"0x102"}), !dbg !9
+  call void @llvm.dbg.value(metadata i32 addrspace(1)* %ip, i64 0, metadata !8, metadata !{!"0x102"}), !dbg !9
   %0 = call <4 x i32> @__amdil_get_local_id_int() nounwind
   %1 = extractelement <4 x i32> %0, i32 0
   br label %2
@@ -28,7 +28,7 @@ entry:
 
 get_local_id.exit:                                ; preds = %4
   %6 = phi i32 [ %5, %4 ]
-  call void @llvm.dbg.value(metadata !{i32 %6}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !12
+  call void @llvm.dbg.value(metadata i32 %6, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !12
   %7 = call <4 x i32> @__amdil_get_global_id_int() nounwind, !dbg !12
   %8 = extractelement <4 x i32> %7, i32 0, !dbg !12
   br label %9
@@ -43,7 +43,7 @@ get_local_id.exit:                                ; preds = %4
 
 get_global_id.exit:                               ; preds = %11
   %13 = phi i32 [ %12, %11 ]
-  call void @llvm.dbg.value(metadata !{i32 %13}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.value(metadata i32 %13, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !14
   %14 = call <4 x i32> @__amdil_get_local_size_int() nounwind
   %15 = extractelement <4 x i32> %14, i32 0
   br label %16
@@ -58,7 +58,7 @@ get_global_id.exit:                               ; preds = %11
 
 get_local_size.exit:                              ; preds = %18
   %20 = phi i32 [ %19, %18 ]
-  call void @llvm.dbg.value(metadata !{i32 %20}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
+  call void @llvm.dbg.value(metadata i32 %20, i64 0, metadata !15, metadata !{!"0x102"}), !dbg !16
   %tmp5 = add i32 %6, %13, !dbg !17
   %tmp7 = add i32 %tmp5, %20, !dbg !17
   store i32 %tmp7, i32 addrspace(1)* %ip, align 4, !dbg !17
@@ -81,26 +81,26 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{metadata !"0x2e\00__OpenCL_nbt02_kernel\00__OpenCL_nbt02_kernel\00__OpenCL_nbt02_kernel\002\000\001\000\006\000\000\000", metadata !20, metadata !1, metadata !3, null, void (i32 addrspace(1)*)* @__OpenCL_nbt02_kernel, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [__OpenCL_nbt02_kernel]
-!1 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\001\00clc\000\00\000\00\001", metadata !20, metadata !21, metadata !21, metadata !19, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null, metadata !5}
-!5 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !6} ; [ DW_TAG_pointer_type ]
-!6 = metadata !{metadata !"0x16\00uint\000\000\000\000\000", metadata !20, metadata !2, metadata !7} ; [ DW_TAG_typedef ]
-!7 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !2} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"0x101\00ip\001\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 1, i32 32, metadata !0, null}
-!10 = metadata !{metadata !"0x100\00tid\003\000", metadata !11, metadata !1, metadata !6} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{metadata !"0xb\002\001\001", metadata !1, metadata !0} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 5, i32 24, metadata !11, null}
-!13 = metadata !{metadata !"0x100\00gid\003\000", metadata !11, metadata !1, metadata !6} ; [ DW_TAG_auto_variable ]
-!14 = metadata !{i32 6, i32 25, metadata !11, null}
-!15 = metadata !{metadata !"0x100\00lsz\003\000", metadata !11, metadata !1, metadata !6} ; [ DW_TAG_auto_variable ]
-!16 = metadata !{i32 7, i32 26, metadata !11, null}
-!17 = metadata !{i32 9, i32 24, metadata !11, null}
-!18 = metadata !{i32 10, i32 1, metadata !0, null}
-!19 = metadata !{metadata !0}
-!20 = metadata !{metadata !"OCLlLwTXZ.cl", metadata !"/tmp"}
-!21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00__OpenCL_nbt02_kernel\00__OpenCL_nbt02_kernel\00__OpenCL_nbt02_kernel\002\000\001\000\006\000\000\000", !20, !1, !3, null, void (i32 addrspace(1)*)* @__OpenCL_nbt02_kernel, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [__OpenCL_nbt02_kernel]
+!1 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\001\00clc\000\00\000\00\001", !20, !21, !21, !19, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !20, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null, !5}
+!5 = !{!"0xf\00\000\0032\0032\000\000", null, !2, !6} ; [ DW_TAG_pointer_type ]
+!6 = !{!"0x16\00uint\000\000\000\000\000", !20, !2, !7} ; [ DW_TAG_typedef ]
+!7 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, !2} ; [ DW_TAG_base_type ]
+!8 = !{!"0x101\00ip\001\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!9 = !MDLocation(line: 1, column: 32, scope: !0)
+!10 = !{!"0x100\00tid\003\000", !11, !1, !6} ; [ DW_TAG_auto_variable ]
+!11 = !{!"0xb\002\001\001", !1, !0} ; [ DW_TAG_lexical_block ]
+!12 = !MDLocation(line: 5, column: 24, scope: !11)
+!13 = !{!"0x100\00gid\003\000", !11, !1, !6} ; [ DW_TAG_auto_variable ]
+!14 = !MDLocation(line: 6, column: 25, scope: !11)
+!15 = !{!"0x100\00lsz\003\000", !11, !1, !6} ; [ DW_TAG_auto_variable ]
+!16 = !MDLocation(line: 7, column: 26, scope: !11)
+!17 = !MDLocation(line: 9, column: 24, scope: !11)
+!18 = !MDLocation(line: 10, column: 1, scope: !0)
+!19 = !{!0}
+!20 = !{!"OCLlLwTXZ.cl", !"/tmp"}
+!21 = !{i32 0}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-value-location.ll b/test/DebugInfo/X86/dbg-value-location.ll
index 1bfb28f..015ec89 100644
--- a/test/DebugInfo/X86/dbg-value-location.ll
+++ b/test/DebugInfo/X86/dbg-value-location.ll
@@ -18,7 +18,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @foo(i32 %dev, i64 %cmd, i8* %data, i32 %data2) nounwind optsize ssp {
 entry:
-  call void @llvm.dbg.value(metadata !{i32 %dev}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !13
+  call void @llvm.dbg.value(metadata i32 %dev, i64 0, metadata !12, metadata !{!"0x102"}), !dbg !13
   %tmp.i = load i32* @dfm, align 4, !dbg !14
   %cmp.i = icmp eq i32 %tmp.i, 0, !dbg !14
   br i1 %cmp.i, label %if.else, label %if.end.i, !dbg !14
@@ -50,30 +50,30 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00\0019510\000\001\000\006\00256\001\0019510", metadata !26, metadata !1, metadata !3, null, i32 (i32, i64, i8*, i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 19510] [def] [foo]
-!1 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 124753)\001\00\000\00\000", metadata !27, metadata !28, metadata !28, metadata !24, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !26, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00bar3\00bar3\00\0014827\001\001\000\006\00256\001\000", metadata !26, metadata !1, metadata !3, null, i32 (i32)* @bar3, null, null, null} ; [ DW_TAG_subprogram ] [line 14827] [local] [def] [scope 0] [bar3]
-!7 = metadata !{metadata !"0x2e\00bar2\00bar2\00\0015397\001\001\000\006\00256\001\000", metadata !26, metadata !1, metadata !3, null, i32 (i32)* @bar2, null, null, null} ; [ DW_TAG_subprogram ] [line 15397] [local] [def] [scope 0] [bar2]
-!8 = metadata !{metadata !"0x2e\00bar\00bar\00\0012382\001\001\000\006\00256\001\000", metadata !26, metadata !1, metadata !9, null, i32 (i32, i32*)* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 12382] [local] [def] [scope 0] [bar]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !26, metadata !1, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", null, metadata !2} ; [ DW_TAG_base_type ]
-!12 = metadata !{metadata !"0x101\00var\0019509\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 19509, i32 20, metadata !0, null}
-!14 = metadata !{i32 18091, i32 2, metadata !15, metadata !17}
-!15 = metadata !{metadata !"0xb\0018086\001\00748", metadata !26, metadata !16} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{metadata !"0x2e\00foo_bar\00foo_bar\00\0018086\001\001\000\006\00256\001\000", metadata !26, metadata !1, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 18086] [local] [def] [scope 0] [foo_bar]
-!17 = metadata !{i32 19514, i32 2, metadata !18, null}
-!18 = metadata !{metadata !"0xb\0019510\001\0099", metadata !26, metadata !0} ; [ DW_TAG_lexical_block ]
-!22 = metadata !{i32 18094, i32 2, metadata !15, metadata !17}
-!23 = metadata !{i32 19524, i32 1, metadata !18, null}
-!24 = metadata !{metadata !0, metadata !6, metadata !7, metadata !8, metadata !16}
-!25 = metadata !{metadata !"0x29", metadata !27} ; [ DW_TAG_file_type ]
-!26 = metadata !{metadata !"/tmp/f.c", metadata !"/tmp"}
-!27 = metadata !{metadata !"f.i", metadata !"/tmp"}
-!28 = metadata !{i32 0}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00foo\00foo\00\0019510\000\001\000\006\00256\001\0019510", !26, !1, !3, null, i32 (i32, i64, i8*, i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 19510] [def] [foo]
+!1 = !{!"0x29", !26} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.9 (trunk 124753)\001\00\000\00\000", !27, !28, !28, !24, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !26, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00bar3\00bar3\00\0014827\001\001\000\006\00256\001\000", !26, !1, !3, null, i32 (i32)* @bar3, null, null, null} ; [ DW_TAG_subprogram ] [line 14827] [local] [def] [scope 0] [bar3]
+!7 = !{!"0x2e\00bar2\00bar2\00\0015397\001\001\000\006\00256\001\000", !26, !1, !3, null, i32 (i32)* @bar2, null, null, null} ; [ DW_TAG_subprogram ] [line 15397] [local] [def] [scope 0] [bar2]
+!8 = !{!"0x2e\00bar\00bar\00\0012382\001\001\000\006\00256\001\000", !26, !1, !9, null, i32 (i32, i32*)* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 12382] [local] [def] [scope 0] [bar]
+!9 = !{!"0x15\00\000\000\000\000\000\000", !26, !1, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{!11}
+!11 = !{!"0x24\00unsigned char\000\008\008\000\000\008", null, !2} ; [ DW_TAG_base_type ]
+!12 = !{!"0x101\00var\0019509\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!13 = !MDLocation(line: 19509, column: 20, scope: !0)
+!14 = !MDLocation(line: 18091, column: 2, scope: !15, inlinedAt: !17)
+!15 = !{!"0xb\0018086\001\00748", !26, !16} ; [ DW_TAG_lexical_block ]
+!16 = !{!"0x2e\00foo_bar\00foo_bar\00\0018086\001\001\000\006\00256\001\000", !26, !1, !3, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 18086] [local] [def] [scope 0] [foo_bar]
+!17 = !MDLocation(line: 19514, column: 2, scope: !18)
+!18 = !{!"0xb\0019510\001\0099", !26, !0} ; [ DW_TAG_lexical_block ]
+!22 = !MDLocation(line: 18094, column: 2, scope: !15, inlinedAt: !17)
+!23 = !MDLocation(line: 19524, column: 1, scope: !18)
+!24 = !{!0, !6, !7, !8, !16}
+!25 = !{!"0x29", !27} ; [ DW_TAG_file_type ]
+!26 = !{!"/tmp/f.c", !"/tmp"}
+!27 = !{!"f.i", !"/tmp"}
+!28 = !{i32 0}
+!29 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-value-range.ll b/test/DebugInfo/X86/dbg-value-range.ll
index aa75369..727f906 100644
--- a/test/DebugInfo/X86/dbg-value-range.ll
+++ b/test/DebugInfo/X86/dbg-value-range.ll
@@ -4,10 +4,10 @@
 
 define i32 @bar(%struct.a* nocapture %b) nounwind ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.a* %b}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !13
+  tail call void @llvm.dbg.value(metadata %struct.a* %b, i64 0, metadata !6, metadata !{!"0x102"}), !dbg !13
   %tmp1 = getelementptr inbounds %struct.a* %b, i64 0, i32 0, !dbg !14
   %tmp2 = load i32* %tmp1, align 4, !dbg !14
-  tail call void @llvm.dbg.value(metadata !{i32 %tmp2}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !14
+  tail call void @llvm.dbg.value(metadata i32 %tmp2, i64 0, metadata !11, metadata !{!"0x102"}), !dbg !14
   %call = tail call i32 (...)* @foo(i32 %tmp2) nounwind , !dbg !18
   %add = add nsw i32 %tmp2, 1, !dbg !19
   ret i32 %add, !dbg !19
@@ -20,27 +20,27 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!24}
 
-!0 = metadata !{metadata !"0x2e\00bar\00bar\00\005\000\001\000\006\00256\001\000", metadata !22, metadata !1, metadata !3, null, i32 (%struct.a*)* @bar, null, null, metadata !21} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [bar]
-!1 = metadata !{metadata !"0x29", metadata !22} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 122997)\001\00\000\00\001", metadata !22, metadata !23, metadata !23, metadata !20, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !22, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x101\00b\005\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{metadata !"0x13\00a\001\0032\0032\000\000\000", metadata !22, metadata !2, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 1, size 32, align 32, offset 0] [def] [from ]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0xd\00c\002\0032\0032\000\000", metadata !22, metadata !1, metadata !5} ; [ DW_TAG_member ]
-!11 = metadata !{metadata !"0x100\00x\006\000", metadata !12, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!12 = metadata !{metadata !"0xb\005\0022\000", metadata !22, metadata !0} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{i32 5, i32 19, metadata !0, null}
-!14 = metadata !{i32 6, i32 14, metadata !12, null}
-!18 = metadata !{i32 7, i32 2, metadata !12, null}
-!19 = metadata !{i32 8, i32 2, metadata !12, null}
-!20 = metadata !{metadata !0}
-!21 = metadata !{metadata !6, metadata !11}
-!22 = metadata !{metadata !"bar.c", metadata !"/private/tmp"}
-!23 = metadata !{i32 0}
+!0 = !{!"0x2e\00bar\00bar\00\005\000\001\000\006\00256\001\000", !22, !1, !3, null, i32 (%struct.a*)* @bar, null, null, !21} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [bar]
+!1 = !{!"0x29", !22} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.9 (trunk 122997)\001\00\000\00\001", !22, !23, !23, !20, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !22, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0x101\00b\005\000", !0, !1, !7} ; [ DW_TAG_arg_variable ]
+!7 = !{!"0xf\00\000\0064\0064\000\000", null, !2, !8} ; [ DW_TAG_pointer_type ]
+!8 = !{!"0x13\00a\001\0032\0032\000\000\000", !22, !2, null, !9, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 1, size 32, align 32, offset 0] [def] [from ]
+!9 = !{!10}
+!10 = !{!"0xd\00c\002\0032\0032\000\000", !22, !1, !5} ; [ DW_TAG_member ]
+!11 = !{!"0x100\00x\006\000", !12, !1, !5} ; [ DW_TAG_auto_variable ]
+!12 = !{!"0xb\005\0022\000", !22, !0} ; [ DW_TAG_lexical_block ]
+!13 = !MDLocation(line: 5, column: 19, scope: !0)
+!14 = !MDLocation(line: 6, column: 14, scope: !12)
+!18 = !MDLocation(line: 7, column: 2, scope: !12)
+!19 = !MDLocation(line: 8, column: 2, scope: !12)
+!20 = !{!0}
+!21 = !{!6, !11}
+!22 = !{!"bar.c", !"/private/tmp"}
+!23 = !{i32 0}
 
 ; Check that variable bar:b value range is appropriately truncated in debug info.
 ; The variable is in %rdi which is clobbered by 'movl %ebx, %edi'
@@ -62,4 +62,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 ;CHECK-NEXT: Ltmp
 ;CHECK-NEXT:	.quad	0
 ;CHECK-NEXT:	.quad	0
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!24 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg-value-terminator.ll b/test/DebugInfo/X86/dbg-value-terminator.ll
index 763034d..88c3ba2 100644
--- a/test/DebugInfo/X86/dbg-value-terminator.ll
+++ b/test/DebugInfo/X86/dbg-value-terminator.ll
@@ -6,7 +6,7 @@
 ; terminator.
 ;
 ; CHECK-LABEL: test:
-; CHECK: ##DEBUG_VALUE: i
+; CHECK: ##DEBUG_VALUE: foo:i
 %a = type { i32, i32 }
 
 define hidden fastcc %a* @test() #1 {
@@ -87,7 +87,7 @@ VEC_edge_base_index.exit7.i:                      ; preds = %"3.i5.i"
 "44.i":                                           ; preds = %"42.i"
   %2 = load %a** undef, align 8, !dbg !12
   %3 = bitcast %a* %2 to %a*, !dbg !12
-  call void @llvm.dbg.value(metadata !{%a* %3}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12
+  call void @llvm.dbg.value(metadata %a* %3, i64 0, metadata !6, metadata !{!"0x102"}), !dbg !12
   br label %may_unswitch_on.exit, !dbg !12
 
 "45.i":                                           ; preds = %"38.i"
@@ -113,21 +113,21 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{metadata !"0x11\0012\00Apple clang version\001\00\000\00\001", metadata !20, metadata !21, metadata !21, metadata !18, null,  null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00foo\00\00\002\000\001\000\006\00256\001\000", metadata !20, metadata !2, metadata !3, null, %a* ()* @test, null, null, metadata !19} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x101\00i\0016777218\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{metadata !"0x101\00c\0033554434\000", metadata !1, metadata !2, metadata !8} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !0, metadata !9} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !0} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0x100\00a\003\000", metadata !11, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{metadata !"0xb\002\0025\000", metadata !20, metadata !1} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 2, i32 13, metadata !1, null}
-!18 = metadata !{metadata !1}
-!19 = metadata !{metadata !6, metadata !7, metadata !10}
-!20 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
-!21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00Apple clang version\001\00\000\00\001", !20, !21, !21, !18, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00foo\00foo\00\002\000\001\000\006\00256\001\000", !20, !2, !3, null, %a* ()* @test, null, null, !19} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!2 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !20, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!6 = !{!"0x101\00i\0016777218\000", !1, !2, !5} ; [ DW_TAG_arg_variable ]
+!7 = !{!"0x101\00c\0033554434\000", !1, !2, !8} ; [ DW_TAG_arg_variable ]
+!8 = !{!"0xf\00\000\0064\0064\000\000", null, !0, !9} ; [ DW_TAG_pointer_type ]
+!9 = !{!"0x24\00char\000\008\008\000\000\006", null, !0} ; [ DW_TAG_base_type ]
+!10 = !{!"0x100\00a\003\000", !11, !2, !9} ; [ DW_TAG_auto_variable ]
+!11 = !{!"0xb\002\0025\000", !20, !1} ; [ DW_TAG_lexical_block ]
+!12 = !MDLocation(line: 2, column: 13, scope: !1)
+!18 = !{!1}
+!19 = !{!6, !7, !10}
+!20 = !{!"a.c", !"/private/tmp"}
+!21 = !{i32 0}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dbg_value_direct.ll b/test/DebugInfo/X86/dbg_value_direct.ll
index edc42c0..6723ba5 100644
--- a/test/DebugInfo/X86/dbg_value_direct.ll
+++ b/test/DebugInfo/X86/dbg_value_direct.ll
@@ -53,7 +53,7 @@ entry:
   %19 = inttoptr i64 %18 to i8*
   %20 = load i8* %19
   %21 = icmp ne i8 %20, 0
-  call void @llvm.dbg.declare(metadata !{i32* %3}, metadata !23, metadata !28)
+  call void @llvm.dbg.declare(metadata i32* %3, metadata !23, metadata !28)
   br i1 %21, label %22, label %28
 
 ; <label>:22                                      ; preds = %entry
@@ -70,7 +70,7 @@ entry:
 
 ; <label>:28                                      ; preds = %22, %entry
   store i32 %0, i32* %3, align 4
-  call void @llvm.dbg.declare(metadata !{%struct.A* %agg.result}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata %struct.A* %agg.result, metadata !24, metadata !{!"0x102\006"}), !dbg !25
   call void @_ZN1AC1Ev(%struct.A* %agg.result), !dbg !25
   store i64 1172321806, i64* %4, !dbg !26
   %29 = inttoptr i64 %10 to i32*, !dbg !26
@@ -147,32 +147,32 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22, !27}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/crash.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"crash.cpp", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00func\00func\00_Z4funci\006\000\001\000\006\00256\000\006", metadata !1, metadata !5, metadata !6, null, void (%struct.A*, i32)* @_Z4funci, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/crash.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !21}
-!8 = metadata !{metadata !"0x13\00A\001\008\008\000\000\000", metadata !1, null, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
-!9 = metadata !{metadata !10, metadata !15}
-!10 = metadata !{metadata !"0x2e\00A\00A\00\002\000\000\000\006\00256\000\002", metadata !1, metadata !8, metadata !11, null, null, null, i32 0, metadata !14} ; [ DW_TAG_subprogram ] [line 2] [A]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{null, metadata !13}
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
-!14 = metadata !{i32 786468}
-!15 = metadata !{metadata !"0x2e\00A\00A\00\003\000\000\000\006\00256\000\003", metadata !1, metadata !8, metadata !16, null, null, null, i32 0, metadata !20} ; [ DW_TAG_subprogram ] [line 3] [A]
-!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{null, metadata !13, metadata !18}
-!18 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !19} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!19 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from A]
-!20 = metadata !{i32 786468}
-!21 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!22 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!23 = metadata !{metadata !"0x101\00\0016777222\000", metadata !4, metadata !5, metadata !21} ; [ DW_TAG_arg_variable ] [line 6]
-!24 = metadata !{metadata !"0x100\00a\007\008192", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [a] [line 7]
-!25 = metadata !{i32 7, i32 0, metadata !4, null}
-!26 = metadata !{i32 8, i32 0, metadata !4, null}
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!28 = metadata !{metadata !"0x102\006"} ; [ DW_TAG_expression ] [DW_OP_deref]
+!0 = !{!"0x11\004\00clang version 3.4 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/crash.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"crash.cpp", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00func\00func\00_Z4funci\006\000\001\000\006\00256\000\006", !1, !5, !6, null, void (%struct.A*, i32)* @_Z4funci, null, null, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/crash.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !21}
+!8 = !{!"0x13\00A\001\008\008\000\000\000", !1, null, null, !9, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
+!9 = !{!10, !15}
+!10 = !{!"0x2e\00A\00A\00\002\000\000\000\006\00256\000\002", !1, !8, !11, null, null, null, i32 0, !14} ; [ DW_TAG_subprogram ] [line 2] [A]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{null, !13}
+!13 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!14 = !{i32 786468}
+!15 = !{!"0x2e\00A\00A\00\003\000\000\000\006\00256\000\003", !1, !8, !16, null, null, null, i32 0, !20} ; [ DW_TAG_subprogram ] [line 3] [A]
+!16 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = !{null, !13, !18}
+!18 = !{!"0x10\00\000\000\000\000\000", null, null, !19} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = !{!"0x26\00\000\000\000\000\000", null, null, !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from A]
+!20 = !{i32 786468}
+!21 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!22 = !{i32 2, !"Dwarf Version", i32 3}
+!23 = !{!"0x101\00\0016777222\000", !4, !5, !21} ; [ DW_TAG_arg_variable ] [line 6]
+!24 = !{!"0x100\00a\007\000", !4, !5, !8} ; [ DW_TAG_auto_variable ] [a] [line 7]
+!25 = !MDLocation(line: 7, scope: !4)
+!26 = !MDLocation(line: 8, scope: !4)
+!27 = !{i32 1, !"Debug Info Version", i32 2}
+!28 = !{!"0x102\006"} ; [ DW_TAG_expression ] [DW_OP_deref]
diff --git a/test/DebugInfo/X86/debug-dead-local-var.ll b/test/DebugInfo/X86/debug-dead-local-var.ll
index 08a22a6..6733dd8 100644
--- a/test/DebugInfo/X86/debug-dead-local-var.ll
+++ b/test/DebugInfo/X86/debug-dead-local-var.ll
@@ -27,25 +27,25 @@ attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-fra
 !llvm.module.flags = !{!18, !19}
 !llvm.ident = !{!20}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (trunk 209255) (llvm/trunk 209253)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/debug-dead-local-var.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"debug-dead-local-var.c", metadata !"/usr/local/google/home/echristo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !9}
-!4 = metadata !{metadata !"0x2e\00bar\00bar\00\0011\000\001\000\006\000\001\0011", metadata !1, metadata !5, metadata !6, null, i32 ()* @bar, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/debug-dead-local-var.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x2e\00foo\00foo\00\006\001\001\000\006\000\001\006", metadata !1, metadata !5, metadata !10, null, null, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 6] [local] [def] [foo]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{null}
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x100\00xyz\008\000", metadata !9, metadata !5, metadata !14} ; [ DW_TAG_auto_variable ] [xyz] [line 8]
-!14 = metadata !{metadata !"0x13\00X\008\0064\0032\000\000\000", metadata !1, metadata !9, null, metadata !15, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 8, size 64, align 32, offset 0] [def] [from ]
-!15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{metadata !"0xd\00a\008\0032\0032\000\000", metadata !1, metadata !14, metadata !8} ; [ DW_TAG_member ] [a] [line 8, size 32, align 32, offset 0] [from int]
-!17 = metadata !{metadata !"0xd\00b\008\0032\0032\0032\000", metadata !1, metadata !14, metadata !8} ; [ DW_TAG_member ] [b] [line 8, size 32, align 32, offset 32] [from int]
-!18 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!19 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!20 = metadata !{metadata !"clang version 3.5.0 (trunk 209255) (llvm/trunk 209253)"}
-!21 = metadata !{i32 13, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 (trunk 209255) (llvm/trunk 209253)\001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/debug-dead-local-var.c] [DW_LANG_C99]
+!1 = !{!"debug-dead-local-var.c", !"/usr/local/google/home/echristo"}
+!2 = !{}
+!3 = !{!4, !9}
+!4 = !{!"0x2e\00bar\00bar\00\0011\000\001\000\006\000\001\0011", !1, !5, !6, null, i32 ()* @bar, null, null, !2} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/debug-dead-local-var.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x2e\00foo\00foo\00\006\001\001\000\006\000\001\006", !1, !5, !10, null, null, null, null, !12} ; [ DW_TAG_subprogram ] [line 6] [local] [def] [foo]
+!10 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{null}
+!12 = !{!13}
+!13 = !{!"0x100\00xyz\008\000", !9, !5, !14} ; [ DW_TAG_auto_variable ] [xyz] [line 8]
+!14 = !{!"0x13\00X\008\0064\0032\000\000\000", !1, !9, null, !15, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 8, size 64, align 32, offset 0] [def] [from ]
+!15 = !{!16, !17}
+!16 = !{!"0xd\00a\008\0032\0032\000\000", !1, !14, !8} ; [ DW_TAG_member ] [a] [line 8, size 32, align 32, offset 0] [from int]
+!17 = !{!"0xd\00b\008\0032\0032\0032\000", !1, !14, !8} ; [ DW_TAG_member ] [b] [line 8, size 32, align 32, offset 32] [from int]
+!18 = !{i32 2, !"Dwarf Version", i32 4}
+!19 = !{i32 2, !"Debug Info Version", i32 2}
+!20 = !{!"clang version 3.5.0 (trunk 209255) (llvm/trunk 209253)"}
+!21 = !MDLocation(line: 13, scope: !4)
diff --git a/test/DebugInfo/X86/debug-info-access.ll b/test/DebugInfo/X86/debug-info-access.ll
index 952330c..7727384 100644
--- a/test/DebugInfo/X86/debug-info-access.ll
+++ b/test/DebugInfo/X86/debug-info-access.ll
@@ -106,45 +106,45 @@ attributes #0 = { nounwind ssp uwtable }
 !llvm.module.flags = !{!38, !39}
 !llvm.ident = !{!40}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !29, metadata !34, metadata !2} ; [ DW_TAG_compile_unit ] [/llvm/tools/clang/test/CodeGenCXX/debug-info-access.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"/llvm/tools/clang/test/CodeGenCXX/debug-info-access.cpp", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !12, metadata !22}
-!4 = metadata !{metadata !"0x13\00A\003\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !8}
-!6 = metadata !{metadata !"0xd\00pub_default_static\007\000\000\000\004096", metadata !1, metadata !"_ZTS1A", metadata !7, null} ; [ DW_TAG_member ] [pub_default_static] [line 7, size 0, align 0, offset 0] [static] [from int]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{metadata !"0x2e\00pub_default\00pub_default\00_ZN1A11pub_defaultEv\005\000\000\000\006\00256\000\005", metadata !1, metadata !"_ZTS1A", metadata !9, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [pub_default]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{null, metadata !11}
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!12 = metadata !{metadata !"0x2\00B\0011\008\008\000\000\000", metadata !1, null, null, metadata !13, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 11, size 8, align 8, offset 0] [def] [from ]
-!13 = metadata !{metadata !14, metadata !15, metadata !16, metadata !20, metadata !21}
-!14 = metadata !{metadata !"0x1c\00\000\000\000\000\003", null, metadata !"_ZTS1B", metadata !"_ZTS1A"} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [public] [from _ZTS1A]
-!15 = metadata !{metadata !"0xd\00public_static\0016\000\000\000\004099", metadata !1, metadata !"_ZTS1B", metadata !7, null} ; [ DW_TAG_member ] [public_static] [line 16, size 0, align 0, offset 0] [public] [static] [from int]
-!16 = metadata !{metadata !"0x2e\00pub\00pub\00_ZN1B3pubEv\0014\000\000\000\006\00259\000\0014", metadata !1, metadata !"_ZTS1B", metadata !17, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 14] [public] [pub]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{null, metadata !19}
-!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
-!20 = metadata !{metadata !"0x2e\00prot\00prot\00_ZN1B4protEv\0019\000\000\000\006\00258\000\0019", metadata !1, metadata !"_ZTS1B", metadata !17, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 19] [protected] [prot]
-!21 = metadata !{metadata !"0x2e\00priv_default\00priv_default\00_ZN1B12priv_defaultEv\0022\000\000\000\006\00256\000\0022", metadata !1, metadata !"_ZTS1B", metadata !17, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 22] [priv_default]
-!22 = metadata !{metadata !"0x17\00U\0025\0032\0032\000\000\000", metadata !1, null, null, metadata !23, null, null, metadata !"_ZTS1U"} ; [ DW_TAG_union_type ] [U] [line 25, size 32, align 32, offset 0] [def] [from ]
-!23 = metadata !{metadata !24, metadata !25}
-!24 = metadata !{metadata !"0xd\00union_priv\0030\0032\0032\000\001", metadata !1, metadata !"_ZTS1U", metadata !7} ; [ DW_TAG_member ] [union_priv] [line 30, size 32, align 32, offset 0] [private] [from int]
-!25 = metadata !{metadata !"0x2e\00union_pub_default\00union_pub_default\00_ZN1U17union_pub_defaultEv\0027\000\000\000\006\00256\000\0027", metadata !1, metadata !"_ZTS1U", metadata !26, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 27] [union_pub_default]
-!26 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!27 = metadata !{null, metadata !28}
-!28 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1U"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1U]
-!29 = metadata !{metadata !30}
-!30 = metadata !{metadata !"0x2e\00free\00free\00_Z4freev\0035\000\001\000\006\00256\000\0035", metadata !1, metadata !31, metadata !32, null, void ()* @_Z4freev, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 35] [def] [free]
-!31 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/llvm/tools/clang/test/CodeGenCXX/debug-info-access.cpp]
-!32 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!33 = metadata !{null}
-!34 = metadata !{metadata !35, metadata !36, metadata !37}
-!35 = metadata !{metadata !"0x34\00a\00a\00\0037\000\001", null, metadata !31, metadata !"_ZTS1A", %struct.A* @a, null} ; [ DW_TAG_variable ] [a] [line 37] [def]
-!36 = metadata !{metadata !"0x34\00b\00b\00\0038\000\001", null, metadata !31, metadata !"_ZTS1B", %class.B* @b, null} ; [ DW_TAG_variable ] [b] [line 38] [def]
-!37 = metadata !{metadata !"0x34\00u\00u\00\0039\000\001", null, metadata !31, metadata !"_ZTS1U", %union.U* @u, null} ; [ DW_TAG_variable ] [u] [line 39] [def]
-!38 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!39 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!40 = metadata !{metadata !"clang version 3.6.0 "}
-!41 = metadata !{i32 35, i32 14, metadata !30, null}
+!0 = !{!"0x11\004\00clang version 3.6.0 \000\00\000\00\001", !1, !2, !3, !29, !34, !2} ; [ DW_TAG_compile_unit ] [/llvm/tools/clang/test/CodeGenCXX/debug-info-access.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"/llvm/tools/clang/test/CodeGenCXX/debug-info-access.cpp", !""}
+!2 = !{}
+!3 = !{!4, !12, !22}
+!4 = !{!"0x13\00A\003\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6, !8}
+!6 = !{!"0xd\00pub_default_static\007\000\000\000\004096", !1, !"_ZTS1A", !7, null} ; [ DW_TAG_member ] [pub_default_static] [line 7, size 0, align 0, offset 0] [static] [from int]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = !{!"0x2e\00pub_default\00pub_default\00_ZN1A11pub_defaultEv\005\000\000\000\006\00256\000\005", !1, !"_ZTS1A", !9, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [pub_default]
+!9 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{null, !11}
+!11 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!12 = !{!"0x2\00B\0011\008\008\000\000\000", !1, null, null, !13, null, null, !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 11, size 8, align 8, offset 0] [def] [from ]
+!13 = !{!14, !15, !16, !20, !21}
+!14 = !{!"0x1c\00\000\000\000\000\003", null, !"_ZTS1B", !"_ZTS1A"} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [public] [from _ZTS1A]
+!15 = !{!"0xd\00public_static\0016\000\000\000\004099", !1, !"_ZTS1B", !7, null} ; [ DW_TAG_member ] [public_static] [line 16, size 0, align 0, offset 0] [public] [static] [from int]
+!16 = !{!"0x2e\00pub\00pub\00_ZN1B3pubEv\0014\000\000\000\006\00259\000\0014", !1, !"_ZTS1B", !17, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 14] [public] [pub]
+!17 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{null, !19}
+!19 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
+!20 = !{!"0x2e\00prot\00prot\00_ZN1B4protEv\0019\000\000\000\006\00258\000\0019", !1, !"_ZTS1B", !17, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 19] [protected] [prot]
+!21 = !{!"0x2e\00priv_default\00priv_default\00_ZN1B12priv_defaultEv\0022\000\000\000\006\00256\000\0022", !1, !"_ZTS1B", !17, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 22] [priv_default]
+!22 = !{!"0x17\00U\0025\0032\0032\000\000\000", !1, null, null, !23, null, null, !"_ZTS1U"} ; [ DW_TAG_union_type ] [U] [line 25, size 32, align 32, offset 0] [def] [from ]
+!23 = !{!24, !25}
+!24 = !{!"0xd\00union_priv\0030\0032\0032\000\001", !1, !"_ZTS1U", !7} ; [ DW_TAG_member ] [union_priv] [line 30, size 32, align 32, offset 0] [private] [from int]
+!25 = !{!"0x2e\00union_pub_default\00union_pub_default\00_ZN1U17union_pub_defaultEv\0027\000\000\000\006\00256\000\0027", !1, !"_ZTS1U", !26, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 27] [union_pub_default]
+!26 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = !{null, !28}
+!28 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1U"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1U]
+!29 = !{!30}
+!30 = !{!"0x2e\00free\00free\00_Z4freev\0035\000\001\000\006\00256\000\0035", !1, !31, !32, null, void ()* @_Z4freev, null, null, !2} ; [ DW_TAG_subprogram ] [line 35] [def] [free]
+!31 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/llvm/tools/clang/test/CodeGenCXX/debug-info-access.cpp]
+!32 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!33 = !{null}
+!34 = !{!35, !36, !37}
+!35 = !{!"0x34\00a\00a\00\0037\000\001", null, !31, !"_ZTS1A", %struct.A* @a, null} ; [ DW_TAG_variable ] [a] [line 37] [def]
+!36 = !{!"0x34\00b\00b\00\0038\000\001", null, !31, !"_ZTS1B", %class.B* @b, null} ; [ DW_TAG_variable ] [b] [line 38] [def]
+!37 = !{!"0x34\00u\00u\00\0039\000\001", null, !31, !"_ZTS1U", %union.U* @u, null} ; [ DW_TAG_variable ] [u] [line 39] [def]
+!38 = !{i32 2, !"Dwarf Version", i32 2}
+!39 = !{i32 2, !"Debug Info Version", i32 2}
+!40 = !{!"clang version 3.6.0 "}
+!41 = !MDLocation(line: 35, column: 14, scope: !30)
diff --git a/test/DebugInfo/X86/debug-info-block-captured-self.ll b/test/DebugInfo/X86/debug-info-block-captured-self.ll
index d610aa6..b7d6dd4 100644
--- a/test/DebugInfo/X86/debug-info-block-captured-self.ll
+++ b/test/DebugInfo/X86/debug-info-block-captured-self.ll
@@ -67,46 +67,46 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 define internal void @"__24-[Main initWithContext:]_block_invoke"(i8* %.block_descriptor, i8* %obj) #0 {
   %block = bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>*, !dbg !84
   %block.captured-self = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i32 0, i32 5, !dbg !84
-  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block}, metadata !86, metadata !110), !dbg !87
+  call void @llvm.dbg.declare(metadata <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, metadata !86, metadata !110), !dbg !87
   ret void, !dbg !87
 }
 
 define internal void @"__24-[Main initWithContext:]_block_invoke_2"(i8* %.block_descriptor, i8* %object) #0 {
   %block = bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>*, !dbg !103
   %block.captured-self = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i32 0, i32 5, !dbg !103
-  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block}, metadata !105, metadata !109), !dbg !106
+  call void @llvm.dbg.declare(metadata <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, metadata !105, metadata !109), !dbg !106
   ret void, !dbg !106
 }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!108}
-!0 = metadata !{metadata !"0x11\0016\00clang version 3.3 \000\00\002\00\000", metadata !107, metadata !2, metadata !4, metadata !23, metadata !15,  metadata !15} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m] [DW_LANG_ObjC]
-!1 = metadata !{metadata !"0x29", metadata !107} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !3}
-!3 = metadata !{metadata !"0x4\00\0020\0032\0032\000\000\000", metadata !107, null, null, metadata !4, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
-!4 = metadata !{}
-!15 = metadata !{}
-!23 = metadata !{metadata !38, metadata !42}
-!27 = metadata !{metadata !"0x16\00id\0031\000\000\000\000", metadata !107, null, metadata !28} ; [ DW_TAG_typedef ] [id] [line 31, size 0, align 0, offset 0] [from ]
-!28 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
-!29 = metadata !{metadata !"0x13\00objc_object\000\000\000\000\000\000", metadata !107, null, null, metadata !30, null, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
-!30 = metadata !{metadata !31}
-!31 = metadata !{metadata !"0xd\00isa\000\0064\000\000\000", metadata !107, metadata !29, metadata !32} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
-!32 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !33} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
-!33 = metadata !{metadata !"0x13\00objc_class\000\000\000\000\004\000", metadata !107, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!34 = metadata !{metadata !"0x13\00Main\0023\000\000\000\001092\0016", metadata !107, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Main] [line 23, size 0, align 0, offset 0] [artificial] [decl] [from ]
-!38 = metadata !{metadata !"0x2e\00__24-[Main initWithContext:]_block_invoke\00__24-[Main initWithContext:]_block_invoke\00\0033\001\001\000\006\00256\000\0033", metadata !1, metadata !1, metadata !39, null, void (i8*, i8*)* @"__24-[Main initWithContext:]_block_invoke", null, null, metadata !15} ; [ DW_TAG_subprogram ] [line 33] [local] [def] [__24-[Main initWithContext:]_block_invoke]
-!39 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !40, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!40 = metadata !{null, metadata !41, metadata !27}
-!41 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!42 = metadata !{metadata !"0x2e\00__24-[Main initWithContext:]_block_invoke_2\00__24-[Main initWithContext:]_block_invoke_2\00\0035\001\001\000\006\00256\000\0035", metadata !1, metadata !1, metadata !39, null, void (i8*, i8*)* @"__24-[Main initWithContext:]_block_invoke_2", null, null, metadata !15} ; [ DW_TAG_subprogram ] [line 35] [local] [def] [__24-[Main initWithContext:]_block_invoke_2]
-!84 = metadata !{i32 33, i32 0, metadata !38, null}
-!86 = metadata !{metadata !"0x100\00self\0041\000", metadata !38, metadata !1, metadata !34} ; [ DW_TAG_auto_variable ] [self] [line 41]
-!87 = metadata !{i32 41, i32 0, metadata !38, null}
-!103 = metadata !{i32 35, i32 0, metadata !42, null}
-!105 = metadata !{metadata !"0x100\00self\0040\000", metadata !42, metadata !1, metadata !34} ; [ DW_TAG_auto_variable ] [self] [line 40]
-!106 = metadata !{i32 40, i32 0, metadata !42, null}
-!107 = metadata !{metadata !"llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m", metadata !""}
-!108 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!109 = metadata !{metadata !"0x102\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_plus 32]
-!110 = metadata !{metadata !"0x102\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_plus 32]
+!0 = !{!"0x11\0016\00clang version 3.3 \000\00\002\00\000", !107, !2, !4, !23, !15,  !15} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m] [DW_LANG_ObjC]
+!1 = !{!"0x29", !107} ; [ DW_TAG_file_type ]
+!2 = !{!3}
+!3 = !{!"0x4\00\0020\0032\0032\000\000\000", !107, null, null, !4, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
+!4 = !{}
+!15 = !{}
+!23 = !{!38, !42}
+!27 = !{!"0x16\00id\0031\000\000\000\000", !107, null, !28} ; [ DW_TAG_typedef ] [id] [line 31, size 0, align 0, offset 0] [from ]
+!28 = !{!"0xf\00\000\0064\0064\000\000", null, null, !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
+!29 = !{!"0x13\00objc_object\000\000\000\000\000\000", !107, null, null, !30, null, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
+!30 = !{!31}
+!31 = !{!"0xd\00isa\000\0064\000\000\000", !107, !29, !32} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
+!32 = !{!"0xf\00\000\0064\000\000\000", null, null, !33} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
+!33 = !{!"0x13\00objc_class\000\000\000\000\004\000", !107, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!34 = !{!"0x13\00Main\0023\000\000\000\001092\0016", !107, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Main] [line 23, size 0, align 0, offset 0] [artificial] [decl] [from ]
+!38 = !{!"0x2e\00__24-[Main initWithContext:]_block_invoke\00__24-[Main initWithContext:]_block_invoke\00\0033\001\001\000\006\00256\000\0033", !1, !1, !39, null, void (i8*, i8*)* @"__24-[Main initWithContext:]_block_invoke", null, null, !15} ; [ DW_TAG_subprogram ] [line 33] [local] [def] [__24-[Main initWithContext:]_block_invoke]
+!39 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !40, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!40 = !{null, !41, !27}
+!41 = !{!"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!42 = !{!"0x2e\00__24-[Main initWithContext:]_block_invoke_2\00__24-[Main initWithContext:]_block_invoke_2\00\0035\001\001\000\006\00256\000\0035", !1, !1, !39, null, void (i8*, i8*)* @"__24-[Main initWithContext:]_block_invoke_2", null, null, !15} ; [ DW_TAG_subprogram ] [line 35] [local] [def] [__24-[Main initWithContext:]_block_invoke_2]
+!84 = !MDLocation(line: 33, scope: !38)
+!86 = !{!"0x100\00self\0041\000", !38, !1, !34} ; [ DW_TAG_auto_variable ] [self] [line 41]
+!87 = !MDLocation(line: 41, scope: !38)
+!103 = !MDLocation(line: 35, scope: !42)
+!105 = !{!"0x100\00self\0040\000", !42, !1, !34} ; [ DW_TAG_auto_variable ] [self] [line 40]
+!106 = !MDLocation(line: 40, scope: !42)
+!107 = !{!"llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m", !""}
+!108 = !{i32 1, !"Debug Info Version", i32 2}
+!109 = !{!"0x102\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_plus 32]
+!110 = !{!"0x102\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_plus 32]
diff --git a/test/DebugInfo/X86/debug-info-blocks.ll b/test/DebugInfo/X86/debug-info-blocks.ll
index 9f6ed5c..7cba57f 100644
--- a/test/DebugInfo/X86/debug-info-blocks.ll
+++ b/test/DebugInfo/X86/debug-info-blocks.ll
@@ -101,9 +101,9 @@ define internal i8* @"\01-[A init]"(%0* %self, i8* %_cmd) #0 {
   %3 = alloca %struct._objc_super
   %4 = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>, align 8
   store %0* %self, %0** %1, align 8
-  call void @llvm.dbg.declare(metadata !{%0** %1}, metadata !60, metadata !{metadata !"0x102"}), !dbg !62
+  call void @llvm.dbg.declare(metadata %0** %1, metadata !60, metadata !{!"0x102"}), !dbg !62
   store i8* %_cmd, i8** %2, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %2}, metadata !63, metadata !{metadata !"0x102"}), !dbg !62
+  call void @llvm.dbg.declare(metadata i8** %2, metadata !63, metadata !{!"0x102"}), !dbg !62
   %5 = load %0** %1, !dbg !65
   %6 = bitcast %0* %5 to i8*, !dbg !65
   %7 = getelementptr inbounds %struct._objc_super* %3, i32 0, i32 0, !dbg !65
@@ -150,7 +150,7 @@ declare i8* @objc_msgSendSuper2(%struct._objc_super*, i8*, ...)
 define internal void @run(void ()* %block) #0 {
   %1 = alloca void ()*, align 8
   store void ()* %block, void ()** %1, align 8
-  call void @llvm.dbg.declare(metadata !{void ()** %1}, metadata !72, metadata !{metadata !"0x102"}), !dbg !73
+  call void @llvm.dbg.declare(metadata void ()** %1, metadata !72, metadata !{!"0x102"}), !dbg !73
   %2 = load void ()** %1, align 8, !dbg !74
   %3 = bitcast void ()* %2 to %struct.__block_literal_generic*, !dbg !74
   %4 = getelementptr inbounds %struct.__block_literal_generic* %3, i32 0, i32 3, !dbg !74
@@ -167,13 +167,13 @@ define internal void @"__9-[A init]_block_invoke"(i8* %.block_descriptor) #0 {
   %d = alloca %1*, align 8
   store i8* %.block_descriptor, i8** %1, align 8
   %3 = load i8** %1
-  call void @llvm.dbg.value(metadata !{i8* %3}, i64 0, metadata !76, metadata !{metadata !"0x102"}), !dbg !88
-  call void @llvm.dbg.declare(metadata !{i8* %.block_descriptor}, metadata !76, metadata !{metadata !"0x102"}), !dbg !88
+  call void @llvm.dbg.value(metadata i8* %3, i64 0, metadata !76, metadata !{!"0x102"}), !dbg !88
+  call void @llvm.dbg.declare(metadata i8* %.block_descriptor, metadata !76, metadata !{!"0x102"}), !dbg !88
   %4 = bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>*, !dbg !88
   store <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %4, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>** %2, align 8, !dbg !88
   %5 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %4, i32 0, i32 5, !dbg !88
-  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>** %2}, metadata !89, metadata !111), !dbg !90
-  call void @llvm.dbg.declare(metadata !{%1** %d}, metadata !91, metadata !{metadata !"0x102"}), !dbg !100
+  call void @llvm.dbg.declare(metadata <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>** %2, metadata !89, metadata !111), !dbg !90
+  call void @llvm.dbg.declare(metadata %1** %d, metadata !91, metadata !{!"0x102"}), !dbg !100
   %6 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_", !dbg !100
   %7 = bitcast %struct._class_t* %6 to i8*, !dbg !100
   %8 = load i8** getelementptr inbounds (%struct._message_ref_t* bitcast ({ i8* (i8*, %struct._message_ref_t*, ...)*, i8* }* @"\01l_objc_msgSend_fixup_alloc" to %struct._message_ref_t*), i32 0, i32 0), !dbg !100
@@ -210,9 +210,9 @@ define internal void @__copy_helper_block_(i8*, i8*) {
   %3 = alloca i8*, align 8
   %4 = alloca i8*, align 8
   store i8* %0, i8** %3, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %3}, metadata !102, metadata !{metadata !"0x102"}), !dbg !103
+  call void @llvm.dbg.declare(metadata i8** %3, metadata !102, metadata !{!"0x102"}), !dbg !103
   store i8* %1, i8** %4, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %4}, metadata !104, metadata !{metadata !"0x102"}), !dbg !103
+  call void @llvm.dbg.declare(metadata i8** %4, metadata !104, metadata !{!"0x102"}), !dbg !103
   %5 = load i8** %4, !dbg !103
   %6 = bitcast i8* %5 to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>*, !dbg !103
   %7 = load i8** %3, !dbg !103
@@ -231,7 +231,7 @@ declare void @_Block_object_assign(i8*, i8*, i32)
 define internal void @__destroy_helper_block_(i8*) {
   %2 = alloca i8*, align 8
   store i8* %0, i8** %2, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %2}, metadata !105, metadata !{metadata !"0x102"}), !dbg !106
+  call void @llvm.dbg.declare(metadata i8** %2, metadata !105, metadata !{!"0x102"}), !dbg !106
   %3 = load i8** %2, !dbg !106
   %4 = bitcast i8* %3 to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>*, !dbg !106
   %5 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %4, i32 0, i32 5, !dbg !106
@@ -247,7 +247,7 @@ define i32 @main() #0 {
   %1 = alloca i32, align 4
   %a = alloca %0*, align 8
   store i32 0, i32* %1
-  call void @llvm.dbg.declare(metadata !{%0** %a}, metadata !107, metadata !{metadata !"0x102"}), !dbg !108
+  call void @llvm.dbg.declare(metadata %0** %a, metadata !107, metadata !{!"0x102"}), !dbg !108
   %2 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_5", !dbg !108
   %3 = bitcast %struct._class_t* %2 to i8*, !dbg !108
   %4 = load i8** getelementptr inbounds (%struct._message_ref_t* bitcast ({ i8* (i8*, %struct._message_ref_t*, ...)*, i8* }* @"\01l_objc_msgSend_fixup_alloc" to %struct._message_ref_t*), i32 0, i32 0), !dbg !108
@@ -270,115 +270,115 @@ attributes #3 = { nounwind }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!56, !57, !58, !59, !110}
 
-!0 = metadata !{metadata !"0x11\0016\00clang version 3.3 \000\00\002\00\001", metadata !1, metadata !2, metadata !3, metadata !12, metadata !2,  metadata !2} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenObjC/<unknown>] [DW_LANG_ObjC]
-!1 = metadata !{metadata !"llvm/tools/clang/test/CodeGenObjC/<unknown>", metadata !"llvm/_build.ninja.Debug"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00A\0033\0032\0032\000\00512\0016", metadata !5, metadata !6, null, metadata !7, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 33, size 32, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !"llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m", metadata !"llvm/_build.ninja.Debug"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
-!7 = metadata !{metadata !8, metadata !10}
-!8 = metadata !{metadata !"0x1c\00\000\000\000\000\000", null, metadata !4, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSObject]
-!9 = metadata !{metadata !"0x13\00NSObject\0021\000\008\000\000\0016", metadata !5, metadata !6, null, metadata !2, null, null, null} ; [ DW_TAG_structure_type ] [NSObject] [line 21, size 0, align 8, offset 0] [def] [from ]
-!10 = metadata !{metadata !"0xd\00ivar\0035\0032\0032\000\000", metadata !5, metadata !6, metadata !11, null} ; [ DW_TAG_member ] [ivar] [line 35, size 32, align 32, offset 0] [from int]
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{metadata !13, metadata !27, metadata !31, metadata !35, metadata !36, metadata !39}
-!13 = metadata !{metadata !"0x2e\00-[A init]\00-[A init]\00\0046\001\001\000\006\00256\000\0046", metadata !5, metadata !6, metadata !14, null, i8* (%0*, i8*)* @"\01-[A init]", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 46] [local] [def] [-[A init]]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{metadata !16, metadata !23, metadata !24}
-!16 = metadata !{metadata !"0x16\00id\0046\000\000\000\000", metadata !5, null, metadata !17} ; [ DW_TAG_typedef ] [id] [line 46, size 0, align 0, offset 0] [from ]
-!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
-!18 = metadata !{metadata !"0x13\00objc_object\000\000\000\000\000\000", metadata !1, null, null, metadata !19, null, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
-!19 = metadata !{metadata !20}
-!20 = metadata !{metadata !"0xd\00isa\000\0064\000\000\000", metadata !1, metadata !18, metadata !21} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
-!21 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
-!22 = metadata !{metadata !"0x13\00objc_class\000\000\000\000\004\000", metadata !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!23 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
-!24 = metadata !{metadata !"0x16\00SEL\0046\000\000\000\0064", metadata !5, null, metadata !25} ; [ DW_TAG_typedef ] [SEL] [line 46, size 0, align 0, offset 0] [artificial] [from ]
-!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !26} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
-!26 = metadata !{metadata !"0x13\00objc_selector\000\000\000\000\004\000", metadata !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!27 = metadata !{metadata !"0x2e\00__9-[A init]_block_invoke\00__9-[A init]_block_invoke\00\0049\001\001\000\006\00256\000\0049", metadata !5, metadata !6, metadata !28, null, void (i8*)* @"__9-[A init]_block_invoke", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 49] [local] [def] [__9-[A init]_block_invoke]
-!28 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!29 = metadata !{null, metadata !30}
-!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!31 = metadata !{metadata !"0x2e\00__copy_helper_block_\00__copy_helper_block_\00\0052\001\001\000\006\000\000\0052", metadata !1, metadata !32, metadata !33, null, void (i8*, i8*)* @__copy_helper_block_, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 52] [local] [def] [__copy_helper_block_]
-!32 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenObjC/<unknown>]
-!33 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !34, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!34 = metadata !{null, metadata !30, metadata !30}
-!35 = metadata !{metadata !"0x2e\00__destroy_helper_block_\00__destroy_helper_block_\00\0052\001\001\000\006\000\000\0052", metadata !1, metadata !32, metadata !28, null, void (i8*)* @__destroy_helper_block_, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 52] [local] [def] [__destroy_helper_block_]
-!36 = metadata !{metadata !"0x2e\00main\00main\00\0059\000\001\000\006\000\000\0060", metadata !5, metadata !6, metadata !37, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 60] [main]
-!37 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !38, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!38 = metadata !{metadata !11}
-!39 = metadata !{metadata !"0x2e\00run\00run\00\0039\001\001\000\006\00256\000\0040", metadata !5, metadata !6, metadata !40, null, void (void ()*)* @run, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 39] [local] [def] [scope 40] [run]
-!40 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !41, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!41 = metadata !{null, metadata !42}
-!42 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !43} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_generic]
-!43 = metadata !{metadata !"0x13\00__block_literal_generic\0040\00256\000\000\008\000", metadata !5, metadata !6, null, metadata !44, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_generic] [line 40, size 256, align 0, offset 0] [def] [from ]
-!44 = metadata !{metadata !45, metadata !46, metadata !47, metadata !48, metadata !49}
-!45 = metadata !{metadata !"0xd\00__isa\000\0064\0064\000\000", metadata !5, metadata !6, metadata !30} ; [ DW_TAG_member ] [__isa] [line 0, size 64, align 64, offset 0] [from ]
-!46 = metadata !{metadata !"0xd\00__flags\000\0032\0032\0064\000", metadata !5, metadata !6, metadata !11} ; [ DW_TAG_member ] [__flags] [line 0, size 32, align 32, offset 64] [from int]
-!47 = metadata !{metadata !"0xd\00__reserved\000\0032\0032\0096\000", metadata !5, metadata !6, metadata !11} ; [ DW_TAG_member ] [__reserved] [line 0, size 32, align 32, offset 96] [from int]
-!48 = metadata !{metadata !"0xd\00__FuncPtr\000\0064\0064\00128\000", metadata !5, metadata !6, metadata !30} ; [ DW_TAG_member ] [__FuncPtr] [line 0, size 64, align 64, offset 128] [from ]
-!49 = metadata !{metadata !"0xd\00__descriptor\0040\0064\0064\00192\000", metadata !5, metadata !6, metadata !50} ; [ DW_TAG_member ] [__descriptor] [line 40, size 64, align 64, offset 192] [from ]
-!50 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !51} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_descriptor]
-!51 = metadata !{metadata !"0x13\00__block_descriptor\0040\00128\000\000\008\000", metadata !5, metadata !6, null, metadata !52, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor] [line 40, size 128, align 0, offset 0] [def] [from ]
-!52 = metadata !{metadata !53, metadata !55}
-!53 = metadata !{metadata !"0xd\00reserved\000\0064\0064\000\000", metadata !5, metadata !6, metadata !54} ; [ DW_TAG_member ] [reserved] [line 0, size 64, align 64, offset 0] [from long unsigned int]
-!54 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!55 = metadata !{metadata !"0xd\00Size\000\0064\0064\0064\000", metadata !5, metadata !6, metadata !54} ; [ DW_TAG_member ] [Size] [line 0, size 64, align 64, offset 64] [from long unsigned int]
-!56 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!57 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!58 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!59 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
-!60 = metadata !{metadata !"0x101\00self\0016777262\001088", metadata !13, metadata !32, metadata !61} ; [ DW_TAG_arg_variable ] [self] [line 46]
-!61 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!62 = metadata !{i32 46, i32 0, metadata !13, null}
-!63 = metadata !{metadata !"0x101\00_cmd\0033554478\0064", metadata !13, metadata !32, metadata !64} ; [ DW_TAG_arg_variable ] [_cmd] [line 46]
-!64 = metadata !{metadata !"0x16\00SEL\0046\000\000\000\000", metadata !5, null, metadata !25} ; [ DW_TAG_typedef ] [SEL] [line 46, size 0, align 0, offset 0] [from ]
-!65 = metadata !{i32 48, i32 0, metadata !66, null}
-!66 = metadata !{metadata !"0xb\0047\000\000", metadata !5, metadata !13} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
-!67 = metadata !{}
-!68 = metadata !{i32 49, i32 0, metadata !69, null}
-!69 = metadata !{metadata !"0xb\0048\000\001", metadata !5, metadata !66} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
-!70 = metadata !{i32 53, i32 0, metadata !69, null}
-!71 = metadata !{i32 54, i32 0, metadata !66, null}
-!72 = metadata !{metadata !"0x101\00block\0016777255\000", metadata !39, metadata !6, metadata !42} ; [ DW_TAG_arg_variable ] [block] [line 39]
-!73 = metadata !{i32 39, i32 0, metadata !39, null}
-!74 = metadata !{i32 41, i32 0, metadata !39, null}
-!75 = metadata !{i32 42, i32 0, metadata !39, null}
-!76 = metadata !{metadata !"0x101\00.block_descriptor\0016777265\0064", metadata !27, metadata !6, metadata !77} ; [ DW_TAG_arg_variable ] [.block_descriptor] [line 49]
-!77 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !78} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_1]
-!78 = metadata !{metadata !"0x13\00__block_literal_1\0049\00320\0064\000\000\000", metadata !5, metadata !6, null, metadata !79, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_1] [line 49, size 320, align 64, offset 0] [def] [from ]
-!79 = metadata !{metadata !80, metadata !81, metadata !82, metadata !83, metadata !84, metadata !87}
-!80 = metadata !{metadata !"0xd\00__isa\0049\0064\0064\000\000", metadata !5, metadata !6, metadata !30} ; [ DW_TAG_member ] [__isa] [line 49, size 64, align 64, offset 0] [from ]
-!81 = metadata !{metadata !"0xd\00__flags\0049\0032\0032\0064\000", metadata !5, metadata !6, metadata !11} ; [ DW_TAG_member ] [__flags] [line 49, size 32, align 32, offset 64] [from int]
-!82 = metadata !{metadata !"0xd\00__reserved\0049\0032\0032\0096\000", metadata !5, metadata !6, metadata !11} ; [ DW_TAG_member ] [__reserved] [line 49, size 32, align 32, offset 96] [from int]
-!83 = metadata !{metadata !"0xd\00__FuncPtr\0049\0064\0064\00128\000", metadata !5, metadata !6, metadata !30} ; [ DW_TAG_member ] [__FuncPtr] [line 49, size 64, align 64, offset 128] [from ]
-!84 = metadata !{metadata !"0xd\00__descriptor\0049\0064\0064\00192\000", metadata !5, metadata !6, metadata !85} ; [ DW_TAG_member ] [__descriptor] [line 49, size 64, align 64, offset 192] [from ]
-!85 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !86} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from __block_descriptor_withcopydispose]
-!86 = metadata !{metadata !"0x13\00__block_descriptor_withcopydispose\0049\000\000\000\004\000", metadata !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 49, size 0, align 0, offset 0] [decl] [from ]
-!87 = metadata !{metadata !"0xd\00self\0049\0064\0064\00256\000", metadata !5, metadata !6, metadata !61} ; [ DW_TAG_member ] [self] [line 49, size 64, align 64, offset 256] [from ]
-!88 = metadata !{i32 49, i32 0, metadata !27, null}
-!89 = metadata !{metadata !"0x100\00self\0052\000", metadata !27, metadata !32, metadata !23} ; [ DW_TAG_auto_variable ] [self] [line 52]
-!90 = metadata !{i32 52, i32 0, metadata !27, null}
-!91 = metadata !{metadata !"0x100\00d\0050\000", metadata !92, metadata !6, metadata !93} ; [ DW_TAG_auto_variable ] [d] [line 50]
-!92 = metadata !{metadata !"0xb\0049\000\002", metadata !5, metadata !27} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
-!93 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !94} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from NSMutableDictionary]
-!94 = metadata !{metadata !"0x13\00NSMutableDictionary\0030\000\008\000\000\0016", metadata !5, metadata !6, null, metadata !95, null, null, null} ; [ DW_TAG_structure_type ] [NSMutableDictionary] [line 30, size 0, align 8, offset 0] [def] [from ]
-!95 = metadata !{metadata !96}
-!96 = metadata !{metadata !"0x1c\00\000\000\000\000\000", null, metadata !94, metadata !97} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSDictionary]
-!97 = metadata !{metadata !"0x13\00NSDictionary\0026\000\008\000\000\0016", metadata !5, metadata !6, null, metadata !98, null, null, null} ; [ DW_TAG_structure_type ] [NSDictionary] [line 26, size 0, align 8, offset 0] [def] [from ]
-!98 = metadata !{metadata !99}
-!99 = metadata !{metadata !"0x1c\00\000\000\000\000\000", null, metadata !97, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSObject]
-!100 = metadata !{i32 50, i32 0, metadata !92, null}
-!101 = metadata !{i32 51, i32 0, metadata !92, null}
-!102 = metadata !{metadata !"0x101\00\0016777268\001088", metadata !31, metadata !32, metadata !30} ; [ DW_TAG_arg_variable ] [line 52]
-!103 = metadata !{i32 52, i32 0, metadata !31, null}
-!104 = metadata !{metadata !"0x101\00\0033554484\0064", metadata !31, metadata !32, metadata !30} ; [ DW_TAG_arg_variable ] [line 52]
-!105 = metadata !{metadata !"0x101\00\0016777268\001088", metadata !35, metadata !32, metadata !30} ; [ DW_TAG_arg_variable ] [line 52]
-!106 = metadata !{i32 52, i32 0, metadata !35, null}
-!107 = metadata !{metadata !"0x100\00a\0061\000", metadata !36, metadata !6, metadata !61} ; [ DW_TAG_auto_variable ] [a] [line 61]
-!108 = metadata !{i32 61, i32 0, metadata !36, null}
-!109 = metadata !{i32 62, i32 0, metadata !36, null}
-!110 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!111 = metadata !{metadata !"0x102\006\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_deref DW_OP_plus 32]
+!0 = !{!"0x11\0016\00clang version 3.3 \000\00\002\00\001", !1, !2, !3, !12, !2,  !2} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenObjC/<unknown>] [DW_LANG_ObjC]
+!1 = !{!"llvm/tools/clang/test/CodeGenObjC/<unknown>", !"llvm/_build.ninja.Debug"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00A\0033\0032\0032\000\00512\0016", !5, !6, null, !7, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 33, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!"llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m", !"llvm/_build.ninja.Debug"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
+!7 = !{!8, !10}
+!8 = !{!"0x1c\00\000\000\000\000\000", null, !4, !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSObject]
+!9 = !{!"0x13\00NSObject\0021\000\008\000\000\0016", !5, !6, null, !2, null, null, null} ; [ DW_TAG_structure_type ] [NSObject] [line 21, size 0, align 8, offset 0] [def] [from ]
+!10 = !{!"0xd\00ivar\0035\0032\0032\000\000", !5, !6, !11, null} ; [ DW_TAG_member ] [ivar] [line 35, size 32, align 32, offset 0] [from int]
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{!13, !27, !31, !35, !36, !39}
+!13 = !{!"0x2e\00-[A init]\00-[A init]\00\0046\001\001\000\006\00256\000\0046", !5, !6, !14, null, i8* (%0*, i8*)* @"\01-[A init]", null, null, !2} ; [ DW_TAG_subprogram ] [line 46] [local] [def] [-[A init]]
+!14 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{!16, !23, !24}
+!16 = !{!"0x16\00id\0046\000\000\000\000", !5, null, !17} ; [ DW_TAG_typedef ] [id] [line 46, size 0, align 0, offset 0] [from ]
+!17 = !{!"0xf\00\000\0064\0064\000\000", null, null, !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
+!18 = !{!"0x13\00objc_object\000\000\000\000\000\000", !1, null, null, !19, null, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
+!19 = !{!20}
+!20 = !{!"0xd\00isa\000\0064\000\000\000", !1, !18, !21} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
+!21 = !{!"0xf\00\000\0064\000\000\000", null, null, !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
+!22 = !{!"0x13\00objc_class\000\000\000\000\004\000", !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!23 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!24 = !{!"0x16\00SEL\0046\000\000\000\0064", !5, null, !25} ; [ DW_TAG_typedef ] [SEL] [line 46, size 0, align 0, offset 0] [artificial] [from ]
+!25 = !{!"0xf\00\000\0064\0064\000\000", null, null, !26} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
+!26 = !{!"0x13\00objc_selector\000\000\000\000\004\000", !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!27 = !{!"0x2e\00__9-[A init]_block_invoke\00__9-[A init]_block_invoke\00\0049\001\001\000\006\00256\000\0049", !5, !6, !28, null, void (i8*)* @"__9-[A init]_block_invoke", null, null, !2} ; [ DW_TAG_subprogram ] [line 49] [local] [def] [__9-[A init]_block_invoke]
+!28 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!29 = !{null, !30}
+!30 = !{!"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!31 = !{!"0x2e\00__copy_helper_block_\00__copy_helper_block_\00\0052\001\001\000\006\000\000\0052", !1, !32, !33, null, void (i8*, i8*)* @__copy_helper_block_, null, null, !2} ; [ DW_TAG_subprogram ] [line 52] [local] [def] [__copy_helper_block_]
+!32 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenObjC/<unknown>]
+!33 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !34, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!34 = !{null, !30, !30}
+!35 = !{!"0x2e\00__destroy_helper_block_\00__destroy_helper_block_\00\0052\001\001\000\006\000\000\0052", !1, !32, !28, null, void (i8*)* @__destroy_helper_block_, null, null, !2} ; [ DW_TAG_subprogram ] [line 52] [local] [def] [__destroy_helper_block_]
+!36 = !{!"0x2e\00main\00main\00\0059\000\001\000\006\000\000\0060", !5, !6, !37, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 60] [main]
+!37 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !38, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!38 = !{!11}
+!39 = !{!"0x2e\00run\00run\00\0039\001\001\000\006\00256\000\0040", !5, !6, !40, null, void (void ()*)* @run, null, null, !2} ; [ DW_TAG_subprogram ] [line 39] [local] [def] [scope 40] [run]
+!40 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !41, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!41 = !{null, !42}
+!42 = !{!"0xf\00\000\0064\000\000\000", null, null, !43} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_generic]
+!43 = !{!"0x13\00__block_literal_generic\0040\00256\000\000\008\000", !5, !6, null, !44, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_generic] [line 40, size 256, align 0, offset 0] [def] [from ]
+!44 = !{!45, !46, !47, !48, !49}
+!45 = !{!"0xd\00__isa\000\0064\0064\000\000", !5, !6, !30} ; [ DW_TAG_member ] [__isa] [line 0, size 64, align 64, offset 0] [from ]
+!46 = !{!"0xd\00__flags\000\0032\0032\0064\000", !5, !6, !11} ; [ DW_TAG_member ] [__flags] [line 0, size 32, align 32, offset 64] [from int]
+!47 = !{!"0xd\00__reserved\000\0032\0032\0096\000", !5, !6, !11} ; [ DW_TAG_member ] [__reserved] [line 0, size 32, align 32, offset 96] [from int]
+!48 = !{!"0xd\00__FuncPtr\000\0064\0064\00128\000", !5, !6, !30} ; [ DW_TAG_member ] [__FuncPtr] [line 0, size 64, align 64, offset 128] [from ]
+!49 = !{!"0xd\00__descriptor\0040\0064\0064\00192\000", !5, !6, !50} ; [ DW_TAG_member ] [__descriptor] [line 40, size 64, align 64, offset 192] [from ]
+!50 = !{!"0xf\00\000\0064\000\000\000", null, null, !51} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_descriptor]
+!51 = !{!"0x13\00__block_descriptor\0040\00128\000\000\008\000", !5, !6, null, !52, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor] [line 40, size 128, align 0, offset 0] [def] [from ]
+!52 = !{!53, !55}
+!53 = !{!"0xd\00reserved\000\0064\0064\000\000", !5, !6, !54} ; [ DW_TAG_member ] [reserved] [line 0, size 64, align 64, offset 0] [from long unsigned int]
+!54 = !{!"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!55 = !{!"0xd\00Size\000\0064\0064\0064\000", !5, !6, !54} ; [ DW_TAG_member ] [Size] [line 0, size 64, align 64, offset 64] [from long unsigned int]
+!56 = !{i32 1, !"Objective-C Version", i32 2}
+!57 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!58 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!59 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
+!60 = !{!"0x101\00self\0016777262\001088", !13, !32, !61} ; [ DW_TAG_arg_variable ] [self] [line 46]
+!61 = !{!"0xf\00\000\0064\0064\000\000", null, null, !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!62 = !MDLocation(line: 46, scope: !13)
+!63 = !{!"0x101\00_cmd\0033554478\0064", !13, !32, !64} ; [ DW_TAG_arg_variable ] [_cmd] [line 46]
+!64 = !{!"0x16\00SEL\0046\000\000\000\000", !5, null, !25} ; [ DW_TAG_typedef ] [SEL] [line 46, size 0, align 0, offset 0] [from ]
+!65 = !MDLocation(line: 48, scope: !66)
+!66 = !{!"0xb\0047\000\000", !5, !13} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
+!67 = !{}
+!68 = !MDLocation(line: 49, scope: !69)
+!69 = !{!"0xb\0048\000\001", !5, !66} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
+!70 = !MDLocation(line: 53, scope: !69)
+!71 = !MDLocation(line: 54, scope: !66)
+!72 = !{!"0x101\00block\0016777255\000", !39, !6, !42} ; [ DW_TAG_arg_variable ] [block] [line 39]
+!73 = !MDLocation(line: 39, scope: !39)
+!74 = !MDLocation(line: 41, scope: !39)
+!75 = !MDLocation(line: 42, scope: !39)
+!76 = !{!"0x101\00.block_descriptor\0016777265\0064", !27, !6, !77} ; [ DW_TAG_arg_variable ] [.block_descriptor] [line 49]
+!77 = !{!"0xf\00\000\0064\000\000\000", null, null, !78} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_1]
+!78 = !{!"0x13\00__block_literal_1\0049\00320\0064\000\000\000", !5, !6, null, !79, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_1] [line 49, size 320, align 64, offset 0] [def] [from ]
+!79 = !{!80, !81, !82, !83, !84, !87}
+!80 = !{!"0xd\00__isa\0049\0064\0064\000\000", !5, !6, !30} ; [ DW_TAG_member ] [__isa] [line 49, size 64, align 64, offset 0] [from ]
+!81 = !{!"0xd\00__flags\0049\0032\0032\0064\000", !5, !6, !11} ; [ DW_TAG_member ] [__flags] [line 49, size 32, align 32, offset 64] [from int]
+!82 = !{!"0xd\00__reserved\0049\0032\0032\0096\000", !5, !6, !11} ; [ DW_TAG_member ] [__reserved] [line 49, size 32, align 32, offset 96] [from int]
+!83 = !{!"0xd\00__FuncPtr\0049\0064\0064\00128\000", !5, !6, !30} ; [ DW_TAG_member ] [__FuncPtr] [line 49, size 64, align 64, offset 128] [from ]
+!84 = !{!"0xd\00__descriptor\0049\0064\0064\00192\000", !5, !6, !85} ; [ DW_TAG_member ] [__descriptor] [line 49, size 64, align 64, offset 192] [from ]
+!85 = !{!"0xf\00\000\0064\0064\000\000", null, null, !86} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from __block_descriptor_withcopydispose]
+!86 = !{!"0x13\00__block_descriptor_withcopydispose\0049\000\000\000\004\000", !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 49, size 0, align 0, offset 0] [decl] [from ]
+!87 = !{!"0xd\00self\0049\0064\0064\00256\000", !5, !6, !61} ; [ DW_TAG_member ] [self] [line 49, size 64, align 64, offset 256] [from ]
+!88 = !MDLocation(line: 49, scope: !27)
+!89 = !{!"0x100\00self\0052\000", !27, !32, !23} ; [ DW_TAG_auto_variable ] [self] [line 52]
+!90 = !MDLocation(line: 52, scope: !27)
+!91 = !{!"0x100\00d\0050\000", !92, !6, !93} ; [ DW_TAG_auto_variable ] [d] [line 50]
+!92 = !{!"0xb\0049\000\002", !5, !27} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
+!93 = !{!"0xf\00\000\0064\0064\000\000", null, null, !94} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from NSMutableDictionary]
+!94 = !{!"0x13\00NSMutableDictionary\0030\000\008\000\000\0016", !5, !6, null, !95, null, null, null} ; [ DW_TAG_structure_type ] [NSMutableDictionary] [line 30, size 0, align 8, offset 0] [def] [from ]
+!95 = !{!96}
+!96 = !{!"0x1c\00\000\000\000\000\000", null, !94, !97} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSDictionary]
+!97 = !{!"0x13\00NSDictionary\0026\000\008\000\000\0016", !5, !6, null, !98, null, null, null} ; [ DW_TAG_structure_type ] [NSDictionary] [line 26, size 0, align 8, offset 0] [def] [from ]
+!98 = !{!99}
+!99 = !{!"0x1c\00\000\000\000\000\000", null, !97, !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSObject]
+!100 = !MDLocation(line: 50, scope: !92)
+!101 = !MDLocation(line: 51, scope: !92)
+!102 = !{!"0x101\00\0016777268\001088", !31, !32, !30} ; [ DW_TAG_arg_variable ] [line 52]
+!103 = !MDLocation(line: 52, scope: !31)
+!104 = !{!"0x101\00\0033554484\0064", !31, !32, !30} ; [ DW_TAG_arg_variable ] [line 52]
+!105 = !{!"0x101\00\0016777268\001088", !35, !32, !30} ; [ DW_TAG_arg_variable ] [line 52]
+!106 = !MDLocation(line: 52, scope: !35)
+!107 = !{!"0x100\00a\0061\000", !36, !6, !61} ; [ DW_TAG_auto_variable ] [a] [line 61]
+!108 = !MDLocation(line: 61, scope: !36)
+!109 = !MDLocation(line: 62, scope: !36)
+!110 = !{i32 1, !"Debug Info Version", i32 2}
+!111 = !{!"0x102\006\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_deref DW_OP_plus 32]
diff --git a/test/DebugInfo/X86/debug-info-static-member.ll b/test/DebugInfo/X86/debug-info-static-member.ll
index 37fe997..8a14b6a 100644
--- a/test/DebugInfo/X86/debug-info-static-member.ll
+++ b/test/DebugInfo/X86/debug-info-static-member.ll
@@ -47,7 +47,7 @@ entry:
   %retval = alloca i32, align 4
   %instance_C = alloca %class.C, align 4
   store i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{%class.C* %instance_C}, metadata !29, metadata !{metadata !"0x102"}), !dbg !30
+  call void @llvm.dbg.declare(metadata %class.C* %instance_C, metadata !29, metadata !{!"0x102"}), !dbg !30
   %d = getelementptr inbounds %class.C* %instance_C, i32 0, i32 0, !dbg !31
   store i32 8, i32* %d, align 4, !dbg !31
   %0 = load i32* @_ZN1C1cE, align 4, !dbg !32
@@ -59,37 +59,37 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!34}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 171914)\000\00\000\00\000", metadata !33, metadata !1, metadata !1, metadata !3, metadata !10,  metadata !1} ; [ DW_TAG_compile_unit ] [/home/probinson/projects/upstream/static-member/test/debug-info-static-member.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00main\00main\00\0018\000\001\000\006\00256\000\0023", metadata !33, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 18] [def] [scope 23] [main]
-!6 = metadata !{metadata !"0x29", metadata !33} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !12, metadata !27, metadata !28}
-!12 = metadata !{metadata !"0x34\00a\00a\00_ZN1C1aE\0014\000\001", null, metadata !6, metadata !9, i32* @_ZN1C1aE, metadata !15} ; [ DW_TAG_variable ] [a] [line 14] [def]
-!13 = metadata !{metadata !"0x2\00C\001\0032\0032\000\000\000", metadata !33, null, null, metadata !14, null, null, null} ; [ DW_TAG_class_type ] [C] [line 1, size 32, align 32, offset 0] [def] [from ]
-!14 = metadata !{metadata !15, metadata !16, metadata !19, metadata !20, metadata !23, metadata !24, metadata !26}
-!15 = metadata !{metadata !"0xd\00a\003\000\000\000\004097", metadata !33, metadata !13, metadata !9, null} ; [ DW_TAG_member ] [a] [line 3, size 0, align 0, offset 0] [private] [static] [from int]
-!16 = metadata !{metadata !"0xd\00const_a\004\000\000\000\004097", metadata !33, metadata !13, metadata !17, i1 true} ; [ DW_TAG_member ] [const_a] [line 4, size 0, align 0, offset 0] [private] [static] [from ]
-!17 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !18} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from bool]
-!18 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
-!19 = metadata !{metadata !"0xd\00b\006\000\000\000\004098", metadata !33, metadata !13, metadata !9, null} ; [ DW_TAG_member ] [b] [line 6, size 0, align 0, offset 0] [protected] [static] [from int]
-!20 = metadata !{metadata !"0xd\00const_b\007\000\000\000\004098", metadata !33, metadata !13, metadata !21, float 0x40091EB860000000} ; [ DW_TAG_member ] [const_b] [line 7, size 0, align 0, offset 0] [protected] [static] [from ]
-!21 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !22} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from float]
-!22 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
-!23 = metadata !{metadata !"0xd\00c\009\000\000\000\004099", metadata !33, metadata !13, metadata !9, null} ; [ DW_TAG_member ] [c] [line 9, size 0, align 0, offset 0] [static] [from int]
-!24 = metadata !{metadata !"0xd\00const_c\0010\000\000\000\004099", metadata !33, metadata !13, metadata !25, i32 18} ; [ DW_TAG_member ] [const_c] [line 10, size 0, align 0, offset 0] [static] [from ]
-!25 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
-!26 = metadata !{metadata !"0xd\00d\0011\0032\0032\000\003", metadata !33, metadata !13, metadata !9} ; [ DW_TAG_member ] [d] [line 11, size 32, align 32, offset 0] [from int]
-!27 = metadata !{metadata !"0x34\00b\00b\00_ZN1C1bE\0015\000\001", null, metadata !6, metadata !9, i32* @_ZN1C1bE, metadata !19} ; [ DW_TAG_variable ] [b] [line 15] [def]
-!28 = metadata !{metadata !"0x34\00c\00c\00_ZN1C1cE\0016\000\001", null, metadata !6, metadata !9, i32* @_ZN1C1cE, metadata !23} ; [ DW_TAG_variable ] [c] [line 16] [def]
-!29 = metadata !{metadata !"0x100\00instance_C\0020\000", metadata !5, metadata !6, metadata !13} ; [ DW_TAG_auto_variable ] [instance_C] [line 20]
-!30 = metadata !{i32 20, i32 0, metadata !5, null}
-!31 = metadata !{i32 21, i32 0, metadata !5, null}
-!32 = metadata !{i32 22, i32 0, metadata !5, null}
-!33 = metadata !{metadata !"/usr/local/google/home/blaikie/Development/llvm/src/tools/clang/test/CodeGenCXX/debug-info-static-member.cpp", metadata !"/home/blaikie/local/Development/llvm/build/clang/x86-64/Debug/llvm"}
+!0 = !{!"0x11\004\00clang version 3.3 (trunk 171914)\000\00\000\00\000", !33, !1, !1, !3, !10,  !1} ; [ DW_TAG_compile_unit ] [/home/probinson/projects/upstream/static-member/test/debug-info-static-member.cpp] [DW_LANG_C_plus_plus]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00main\00main\00\0018\000\001\000\006\00256\000\0023", !33, !6, !7, null, i32 ()* @main, null, null, !1} ; [ DW_TAG_subprogram ] [line 18] [def] [scope 23] [main]
+!6 = !{!"0x29", !33} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!12, !27, !28}
+!12 = !{!"0x34\00a\00a\00_ZN1C1aE\0014\000\001", null, !6, !9, i32* @_ZN1C1aE, !15} ; [ DW_TAG_variable ] [a] [line 14] [def]
+!13 = !{!"0x2\00C\001\0032\0032\000\000\000", !33, null, null, !14, null, null, null} ; [ DW_TAG_class_type ] [C] [line 1, size 32, align 32, offset 0] [def] [from ]
+!14 = !{!15, !16, !19, !20, !23, !24, !26}
+!15 = !{!"0xd\00a\003\000\000\000\004097", !33, !13, !9, null} ; [ DW_TAG_member ] [a] [line 3, size 0, align 0, offset 0] [private] [static] [from int]
+!16 = !{!"0xd\00const_a\004\000\000\000\004097", !33, !13, !17, i1 true} ; [ DW_TAG_member ] [const_a] [line 4, size 0, align 0, offset 0] [private] [static] [from ]
+!17 = !{!"0x26\00\000\000\000\000\000", null, null, !18} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from bool]
+!18 = !{!"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!19 = !{!"0xd\00b\006\000\000\000\004098", !33, !13, !9, null} ; [ DW_TAG_member ] [b] [line 6, size 0, align 0, offset 0] [protected] [static] [from int]
+!20 = !{!"0xd\00const_b\007\000\000\000\004098", !33, !13, !21, float 0x40091EB860000000} ; [ DW_TAG_member ] [const_b] [line 7, size 0, align 0, offset 0] [protected] [static] [from ]
+!21 = !{!"0x26\00\000\000\000\000\000", null, null, !22} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from float]
+!22 = !{!"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!23 = !{!"0xd\00c\009\000\000\000\004099", !33, !13, !9, null} ; [ DW_TAG_member ] [c] [line 9, size 0, align 0, offset 0] [static] [from int]
+!24 = !{!"0xd\00const_c\0010\000\000\000\004099", !33, !13, !25, i32 18} ; [ DW_TAG_member ] [const_c] [line 10, size 0, align 0, offset 0] [static] [from ]
+!25 = !{!"0x26\00\000\000\000\000\000", null, null, !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
+!26 = !{!"0xd\00d\0011\0032\0032\000\003", !33, !13, !9} ; [ DW_TAG_member ] [d] [line 11, size 32, align 32, offset 0] [from int]
+!27 = !{!"0x34\00b\00b\00_ZN1C1bE\0015\000\001", null, !6, !9, i32* @_ZN1C1bE, !19} ; [ DW_TAG_variable ] [b] [line 15] [def]
+!28 = !{!"0x34\00c\00c\00_ZN1C1cE\0016\000\001", null, !6, !9, i32* @_ZN1C1cE, !23} ; [ DW_TAG_variable ] [c] [line 16] [def]
+!29 = !{!"0x100\00instance_C\0020\000", !5, !6, !13} ; [ DW_TAG_auto_variable ] [instance_C] [line 20]
+!30 = !MDLocation(line: 20, scope: !5)
+!31 = !MDLocation(line: 21, scope: !5)
+!32 = !MDLocation(line: 22, scope: !5)
+!33 = !{!"/usr/local/google/home/blaikie/Development/llvm/src/tools/clang/test/CodeGenCXX/debug-info-static-member.cpp", !"/home/blaikie/local/Development/llvm/build/clang/x86-64/Debug/llvm"}
 ; PRESENT verifies that static member declarations have these attributes:
 ; external, declaration, accessibility, and either DW_AT_MIPS_linkage_name
 ; (for variables) or DW_AT_const_value (for constants).
@@ -253,4 +253,4 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 ; DARWINA-NOT:  DW_AT_const_value
 ; DARWINA-NOT:  DW_AT_location
 ; DARWINA:      NULL
-!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!34 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/debug-loc-asan.ll b/test/DebugInfo/X86/debug-loc-asan.ll
index 869db75..13e193b 100644
--- a/test/DebugInfo/X86/debug-loc-asan.ll
+++ b/test/DebugInfo/X86/debug-loc-asan.ll
@@ -21,10 +21,10 @@
 ; CHECK: .Ldebug_loc{{[0-9]+}}:
 ; We expect two location ranges for the variable.
 
-; First, it is stored in %rdx:
+; First, its address is stored in %rdi:
 ; CHECK:      .quad .Lfunc_begin0-.Lfunc_begin0
 ; CHECK-NEXT: .quad [[START_LABEL]]-.Lfunc_begin0
-; CHECK: DW_OP_reg5
+; CHECK: DW_OP_breg5
 
 ; Then it's addressed via %rsp:
 ; CHECK:      .quad [[START_LABEL]]-.Lfunc_begin0
@@ -77,7 +77,7 @@ entry:
   %21 = inttoptr i64 %20 to i8*
   %22 = load i8* %21
   %23 = icmp ne i8 %22, 0
-  call void @llvm.dbg.declare(metadata !{i32* %8}, metadata !12, metadata !14)
+  call void @llvm.dbg.declare(metadata i32* %8, metadata !12, metadata !14)
   br i1 %23, label %24, label %30
 
 ; <label>:24                                      ; preds = %5
@@ -165,18 +165,18 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (209308)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/test.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"test.cc", metadata !"/llvm_cmake_gcc"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3bari\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @_Z3bari, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/test.cc]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.5.0 (209308)"}
-!12 = metadata !{metadata !"0x101\00y\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [y] [line 1]
-!13 = metadata !{i32 2, i32 0, metadata !4, null}
-!14 = metadata !{metadata !"0x102\006"} ; [ DW_TAG_expression ] [DW_OP_deref]
+!0 = !{!"0x11\004\00clang version 3.5.0 (209308)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/test.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"test.cc", !"/llvm_cmake_gcc"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00bar\00bar\00_Z3bari\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 (i32)* @_Z3bari, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/test.cc]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.5.0 (209308)"}
+!12 = !{!"0x101\00y\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [y] [line 1]
+!13 = !MDLocation(line: 2, scope: !4)
+!14 = !{!"0x102\006"} ; [ DW_TAG_expression ] [DW_OP_deref]
diff --git a/test/DebugInfo/X86/debug-loc-offset.ll b/test/DebugInfo/X86/debug-loc-offset.ll
index bdd3f20..03b4b40 100644
--- a/test/DebugInfo/X86/debug-loc-offset.ll
+++ b/test/DebugInfo/X86/debug-loc-offset.ll
@@ -64,7 +64,7 @@ define i32 @_Z3bari(i32 %b) #0 {
 entry:
   %b.addr = alloca i32, align 4
   store i32 %b, i32* %b.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !21, metadata !{metadata !"0x102"}), !dbg !22
+  call void @llvm.dbg.declare(metadata i32* %b.addr, metadata !21, metadata !{!"0x102"}), !dbg !22
   %0 = load i32* %b.addr, align 4, !dbg !23
   %add = add nsw i32 %0, 4, !dbg !23
   ret i32 %add, !dbg !23
@@ -76,8 +76,8 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 define void @_Z3baz1A(%struct.A* %a) #2 {
 entry:
   %z = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{%struct.A* %a}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
-  call void @llvm.dbg.declare(metadata !{i32* %z}, metadata !26, metadata !{metadata !"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata %struct.A* %a, metadata !24, metadata !{!"0x102\006"}), !dbg !25
+  call void @llvm.dbg.declare(metadata i32* %z, metadata !26, metadata !{!"0x102"}), !dbg !27
   store i32 2, i32* %z, align 4, !dbg !27
   %var = getelementptr inbounds %struct.A* %a, i32 0, i32 1, !dbg !28
   %0 = load i32* %var, align 4, !dbg !28
@@ -116,38 +116,38 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.module.flags = !{!18, !19}
 !llvm.ident = !{!20, !20}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (210479)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/debug-loc-offset1.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"debug-loc-offset1.cc", metadata !"/llvm_cmake_gcc"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3bari\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @_Z3bari, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/debug-loc-offset1.cc]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (210479)\000\00\000\00\001", metadata !10, metadata !2, metadata !11, metadata !13, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/debug-loc-offset2.cc] [DW_LANG_C_plus_plus]
-!10 = metadata !{metadata !"debug-loc-offset2.cc", metadata !"/llvm_cmake_gcc"}
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0x13\00A\001\000\000\000\004\000", metadata !10, null, null, null, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 0, align 0, offset 0] [decl] [from ]
-!13 = metadata !{metadata !14}
-!14 = metadata !{metadata !"0x2e\00baz\00baz\00_Z3baz1A\006\000\001\000\006\00256\000\006", metadata !10, metadata !15, metadata !16, null, void (%struct.A*)* @_Z3baz1A, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [baz]
-!15 = metadata !{metadata !"0x29", metadata !10}        ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
-!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{null, metadata !12}
-!18 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!19 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!20 = metadata !{metadata !"clang version 3.5.0 (210479)"}
-!21 = metadata !{metadata !"0x101\00b\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [b] [line 1]
-!22 = metadata !{i32 1, i32 0, metadata !4, null}
-!23 = metadata !{i32 2, i32 0, metadata !4, null}
-!24 = metadata !{metadata !"0x101\00a\0016777222\008192", metadata !14, metadata !15, metadata !"_ZTS1A"} ; [ DW_TAG_arg_variable ] [a] [line 6]
-!25 = metadata !{i32 6, i32 0, metadata !14, null}
-!26 = metadata !{metadata !"0x100\00z\007\000", metadata !14, metadata !15, metadata !8} ; [ DW_TAG_auto_variable ] [z] [line 7]
-!27 = metadata !{i32 7, i32 0, metadata !14, null}
-!28 = metadata !{i32 8, i32 0, metadata !29, null}
-!29 = metadata !{metadata !"0xb\008\000\000", metadata !10, metadata !14} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
-!30 = metadata !{i32 9, i32 0, metadata !29, null}
-!31 = metadata !{i32 10, i32 0, metadata !32, null}
-!32 = metadata !{metadata !"0xb\0010\000\000", metadata !10, metadata !14} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
-!33 = metadata !{i32 11, i32 0, metadata !32, null}
-!34 = metadata !{i32 12, i32 0, metadata !14, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 (210479)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/debug-loc-offset1.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"debug-loc-offset1.cc", !"/llvm_cmake_gcc"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00bar\00bar\00_Z3bari\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 (i32)* @_Z3bari, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/debug-loc-offset1.cc]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x11\004\00clang version 3.5.0 (210479)\000\00\000\00\001", !10, !2, !11, !13, !2, !2} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/debug-loc-offset2.cc] [DW_LANG_C_plus_plus]
+!10 = !{!"debug-loc-offset2.cc", !"/llvm_cmake_gcc"}
+!11 = !{!12}
+!12 = !{!"0x13\00A\001\000\000\000\004\000", !10, null, null, null, null, null, !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!13 = !{!14}
+!14 = !{!"0x2e\00baz\00baz\00_Z3baz1A\006\000\001\000\006\00256\000\006", !10, !15, !16, null, void (%struct.A*)* @_Z3baz1A, null, null, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [baz]
+!15 = !{!"0x29", !10}        ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
+!16 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = !{null, !12}
+!18 = !{i32 2, !"Dwarf Version", i32 4}
+!19 = !{i32 2, !"Debug Info Version", i32 2}
+!20 = !{!"clang version 3.5.0 (210479)"}
+!21 = !{!"0x101\00b\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [b] [line 1]
+!22 = !MDLocation(line: 1, scope: !4)
+!23 = !MDLocation(line: 2, scope: !4)
+!24 = !{!"0x101\00a\0016777222\000", !14, !15, !"_ZTS1A"} ; [ DW_TAG_arg_variable ] [a] [line 6]
+!25 = !MDLocation(line: 6, scope: !14)
+!26 = !{!"0x100\00z\007\000", !14, !15, !8} ; [ DW_TAG_auto_variable ] [z] [line 7]
+!27 = !MDLocation(line: 7, scope: !14)
+!28 = !MDLocation(line: 8, scope: !29)
+!29 = !{!"0xb\008\000\000", !10, !14} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
+!30 = !MDLocation(line: 9, scope: !29)
+!31 = !MDLocation(line: 10, scope: !32)
+!32 = !{!"0xb\0010\000\000", !10, !14} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
+!33 = !MDLocation(line: 11, scope: !32)
+!34 = !MDLocation(line: 12, scope: !14)
diff --git a/test/DebugInfo/X86/debug-ranges-offset.ll b/test/DebugInfo/X86/debug-ranges-offset.ll
index 48d1db6..fd8fe0e 100644
--- a/test/DebugInfo/X86/debug-ranges-offset.ll
+++ b/test/DebugInfo/X86/debug-ranges-offset.ll
@@ -31,11 +31,11 @@ entry:
   %call = call i8* @_Znwm(i64 4) #4, !dbg !19
   %_msret = load i64* getelementptr inbounds ([8 x i64]* @__msan_retval_tls, i64 0, i64 0), align 8, !dbg !19
   %3 = bitcast i8* %call to i32*, !dbg !19
-  tail call void @llvm.dbg.value(metadata !{i32* %3}, i64 0, metadata !9, metadata !{metadata !"0x102"}), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32* %3, i64 0, metadata !9, metadata !{!"0x102"}), !dbg !19
   %4 = inttoptr i64 %1 to i64*, !dbg !19
   store i64 %_msret, i64* %4, align 8, !dbg !19
   store volatile i32* %3, i32** %p, align 8, !dbg !19
-  tail call void @llvm.dbg.value(metadata !{i32** %p}, i64 0, metadata !9, metadata !{metadata !"0x102"}), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32** %p, i64 0, metadata !9, metadata !{!"0x102"}), !dbg !19
   %p.0.p.0. = load volatile i32** %p, align 8, !dbg !20
   %_msld = load i64* %4, align 8, !dbg !20
   %_mscmp = icmp eq i64 %_msld, 0, !dbg !20
@@ -96,11 +96,11 @@ entry:
   %call.i = call i8* @_Znwm(i64 4) #4, !dbg !30
   %_msret = load i64* getelementptr inbounds ([8 x i64]* @__msan_retval_tls, i64 0, i64 0), align 8, !dbg !30
   %3 = bitcast i8* %call.i to i32*, !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i32* %3}, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i32* %3, i64 0, metadata !32, metadata !{!"0x102"}), !dbg !30
   %4 = inttoptr i64 %1 to i64*, !dbg !30
   store i64 %_msret, i64* %4, align 8, !dbg !30
   store volatile i32* %3, i32** %p.i, align 8, !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i32** %p.i}, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i32** %p.i, i64 0, metadata !32, metadata !{!"0x102"}), !dbg !30
   %p.i.0.p.0.p.0..i = load volatile i32** %p.i, align 8, !dbg !33
   %_msld = load i64* %4, align 8, !dbg !33
   %_mscmp = icmp eq i64 %_msld, 0, !dbg !33
@@ -202,40 +202,40 @@ attributes #4 = { builtin }
 !llvm.module.flags = !{!16, !17}
 !llvm.ident = !{!18}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 207243) (llvm/trunk 207259)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !13}
-!4 = metadata !{metadata !"0x2e\00f\00f\00_Z1fv\003\000\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !6, null, void ()* @_Z1fv, null, null, metadata !8} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x100\00p\004\000", metadata !4, metadata !5, metadata !10} ; [ DW_TAG_auto_variable ] [p] [line 4]
-!10 = metadata !{metadata !"0x35\00\000\000\000\000\000", null, null, metadata !11} ; [ DW_TAG_volatile_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{metadata !"0x2e\00main\00main\00\009\000\001\000\006\00256\001\009", metadata !1, metadata !5, metadata !14, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 9] [def] [main]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{metadata !12}
-!16 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!18 = metadata !{metadata !"clang version 3.5.0 (trunk 207243) (llvm/trunk 207259)"}
-!19 = metadata !{i32 4, i32 0, metadata !4, null}
-!20 = metadata !{i32 5, i32 0, metadata !21, null}
-!21 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/foo.cpp]
-!22 = metadata !{metadata !"branch_weights", i32 1000, i32 1}
-!23 = metadata !{metadata !24, metadata !24, i64 0}
-!24 = metadata !{metadata !"int", metadata !25, i64 0}
-!25 = metadata !{metadata !"omnipotent char", metadata !26, i64 0}
-!26 = metadata !{metadata !"Simple C/C++ TBAA"}
-!27 = metadata !{metadata !"branch_weights", i32 1, i32 1000}
-!28 = metadata !{i32 6, i32 0, metadata !21, null}
-!29 = metadata !{i32 7, i32 0, metadata !4, null}
-!30 = metadata !{i32 4, i32 0, metadata !4, metadata !31}
-!31 = metadata !{i32 10, i32 0, metadata !13, null}
-!32 = metadata !{metadata !"0x100\00p\004\000", metadata !4, metadata !5, metadata !10, metadata !31} ; [ DW_TAG_auto_variable ] [p] [line 4]
-!33 = metadata !{i32 5, i32 0, metadata !21, metadata !31}
-!34 = metadata !{i32 6, i32 0, metadata !21, metadata !31}
-!35 = metadata !{i32 7, i32 0, metadata !4, metadata !31}
-!36 = metadata !{i32 11, i32 0, metadata !13, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 (trunk 207243) (llvm/trunk 207259)\001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"foo.cpp", !"/usr/local/google/home/echristo/tmp"}
+!2 = !{}
+!3 = !{!4, !13}
+!4 = !{!"0x2e\00f\00f\00_Z1fv\003\000\001\000\006\00256\001\003", !1, !5, !6, null, void ()* @_Z1fv, null, null, !8} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!9}
+!9 = !{!"0x100\00p\004\000", !4, !5, !10} ; [ DW_TAG_auto_variable ] [p] [line 4]
+!10 = !{!"0x35\00\000\000\000\000\000", null, null, !11} ; [ DW_TAG_volatile_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{!"0xf\00\000\0064\0064\000\000", null, null, !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!12 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = !{!"0x2e\00main\00main\00\009\000\001\000\006\00256\001\009", !1, !5, !14, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 9] [def] [main]
+!14 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{!12}
+!16 = !{i32 2, !"Dwarf Version", i32 4}
+!17 = !{i32 1, !"Debug Info Version", i32 2}
+!18 = !{!"clang version 3.5.0 (trunk 207243) (llvm/trunk 207259)"}
+!19 = !MDLocation(line: 4, scope: !4)
+!20 = !MDLocation(line: 5, scope: !21)
+!21 = !{!"0xb\005\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/foo.cpp]
+!22 = !{!"branch_weights", i32 1000, i32 1}
+!23 = !{!24, !24, i64 0}
+!24 = !{!"int", !25, i64 0}
+!25 = !{!"omnipotent char", !26, i64 0}
+!26 = !{!"Simple C/C++ TBAA"}
+!27 = !{!"branch_weights", i32 1, i32 1000}
+!28 = !MDLocation(line: 6, scope: !21)
+!29 = !MDLocation(line: 7, scope: !4)
+!30 = !MDLocation(line: 4, scope: !4, inlinedAt: !31)
+!31 = !MDLocation(line: 10, scope: !13)
+!32 = !{!"0x100\00p\004\000", !4, !5, !10, !31} ; [ DW_TAG_auto_variable ] [p] [line 4]
+!33 = !MDLocation(line: 5, scope: !21, inlinedAt: !31)
+!34 = !MDLocation(line: 6, scope: !21, inlinedAt: !31)
+!35 = !MDLocation(line: 7, scope: !4, inlinedAt: !31)
+!36 = !MDLocation(line: 11, scope: !13)
diff --git a/test/DebugInfo/X86/debug_frame.ll b/test/DebugInfo/X86/debug_frame.ll
index 3b3071f..56122e3 100644
--- a/test/DebugInfo/X86/debug_frame.ll
+++ b/test/DebugInfo/X86/debug_frame.ll
@@ -11,12 +11,12 @@ entry:
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!7}
-!5 = metadata !{metadata !0}
+!5 = !{!0}
 
-!0 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\001\001", metadata !6, metadata !1, metadata !3, null, void ()* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!1 = metadata !{metadata !"0x29", metadata !6} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", metadata !6, metadata !4, metadata !4, metadata !5, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !6, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!6 = metadata !{metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build"}
-!7 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00f\00f\00\001\000\001\000\006\00256\001\001", !6, !1, !3, null, void ()* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!1 = !{!"0x29", !6} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", !6, !4, !4, !5, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !6, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!6 = !{!"/home/espindola/llvm/test.c", !"/home/espindola/llvm/build"}
+!7 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/decl-derived-member.ll b/test/DebugInfo/X86/decl-derived-member.ll
index 43985b2..c4f3dd9 100644
--- a/test/DebugInfo/X86/decl-derived-member.ll
+++ b/test/DebugInfo/X86/decl-derived-member.ll
@@ -7,8 +7,9 @@
 ; struct base {
 ;  virtual ~base();
 ; };
+; typedef base base_type;
 ; struct foo {
-;  base b;
+;  base_type b;
 ; };
 ; foo f;
 
@@ -20,40 +21,47 @@
 
 %struct.foo = type { %struct.base }
 %struct.base = type { i32 (...)** }
+
+$_ZN3fooC2Ev = comdat any
+
+$_ZN3fooD2Ev = comdat any
+
+$_ZN4baseC2Ev = comdat any
+
 @f = global %struct.foo zeroinitializer, align 8
 @__dso_handle = external global i8
 @_ZTV4base = external unnamed_addr constant [4 x i8*]
-@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_decl_derived_member.cpp, i8* null }]
 
 define internal void @__cxx_global_var_init() section ".text.startup" {
 entry:
-  call void @_ZN3fooC2Ev(%struct.foo* @f) #2, !dbg !35
-  %0 = call i32 @__cxa_atexit(void (i8*)* bitcast (void (%struct.foo*)* @_ZN3fooD2Ev to void (i8*)*), i8* bitcast (%struct.foo* @f to i8*), i8* @__dso_handle) #2, !dbg !35
-  ret void, !dbg !35
+  call void @_ZN3fooC2Ev(%struct.foo* @f) #2, !dbg !33
+  %0 = call i32 @__cxa_atexit(void (i8*)* bitcast (void (%struct.foo*)* @_ZN3fooD2Ev to void (i8*)*), i8* bitcast (%struct.foo* @f to i8*), i8* @__dso_handle) #2, !dbg !33
+  ret void, !dbg !33
 }
 
 ; Function Attrs: inlinehint nounwind uwtable
-define linkonce_odr void @_ZN3fooC2Ev(%struct.foo* %this) unnamed_addr #0 align 2 {
+define linkonce_odr void @_ZN3fooC2Ev(%struct.foo* %this) unnamed_addr #0 comdat align 2 {
 entry:
   %this.addr = alloca %struct.foo*, align 8
   store %struct.foo* %this, %struct.foo** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !36, metadata !{metadata !"0x102"}), !dbg !38
+  call void @llvm.dbg.declare(metadata %struct.foo** %this.addr, metadata !34, metadata !36), !dbg !37
   %this1 = load %struct.foo** %this.addr
-  %b = getelementptr inbounds %struct.foo* %this1, i32 0, i32 0, !dbg !39
-  call void @_ZN4baseC2Ev(%struct.base* %b) #2, !dbg !39
-  ret void, !dbg !39
+  %b = getelementptr inbounds %struct.foo* %this1, i32 0, i32 0, !dbg !38
+  call void @_ZN4baseC2Ev(%struct.base* %b) #2, !dbg !38
+  ret void, !dbg !38
 }
 
 ; Function Attrs: inlinehint uwtable
-define linkonce_odr void @_ZN3fooD2Ev(%struct.foo* %this) unnamed_addr #1 align 2 {
+define linkonce_odr void @_ZN3fooD2Ev(%struct.foo* %this) unnamed_addr #1 comdat align 2 {
 entry:
   %this.addr = alloca %struct.foo*, align 8
   store %struct.foo* %this, %struct.foo** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !40, metadata !{metadata !"0x102"}), !dbg !41
+  call void @llvm.dbg.declare(metadata %struct.foo** %this.addr, metadata !39, metadata !36), !dbg !40
   %this1 = load %struct.foo** %this.addr
-  %b = getelementptr inbounds %struct.foo* %this1, i32 0, i32 0, !dbg !42
-  call void @_ZN4baseD1Ev(%struct.base* %b), !dbg !42
-  ret void, !dbg !44
+  %b = getelementptr inbounds %struct.foo* %this1, i32 0, i32 0, !dbg !41
+  call void @_ZN4baseD1Ev(%struct.base* %b), !dbg !41
+  ret void, !dbg !43
 }
 
 ; Function Attrs: nounwind
@@ -62,24 +70,24 @@ declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #2
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #3
 
-declare void @_ZN4baseD1Ev(%struct.base*) #4
-
 ; Function Attrs: inlinehint nounwind uwtable
-define linkonce_odr void @_ZN4baseC2Ev(%struct.base* %this) unnamed_addr #0 align 2 {
+define linkonce_odr void @_ZN4baseC2Ev(%struct.base* %this) unnamed_addr #0 comdat align 2 {
 entry:
   %this.addr = alloca %struct.base*, align 8
   store %struct.base* %this, %struct.base** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.base** %this.addr}, metadata !45, metadata !{metadata !"0x102"}), !dbg !47
+  call void @llvm.dbg.declare(metadata %struct.base** %this.addr, metadata !44, metadata !36), !dbg !46
   %this1 = load %struct.base** %this.addr
-  %0 = bitcast %struct.base* %this1 to i8***, !dbg !48
-  store i8** getelementptr inbounds ([4 x i8*]* @_ZTV4base, i64 0, i64 2), i8*** %0, !dbg !48
-  ret void, !dbg !48
+  %0 = bitcast %struct.base* %this1 to i32 (...)***, !dbg !47
+  store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*]* @_ZTV4base, i64 0, i64 2) to i32 (...)**), i32 (...)*** %0, !dbg !47
+  ret void, !dbg !47
 }
 
-define internal void @_GLOBAL__I_a() section ".text.startup" {
+declare void @_ZN4baseD1Ev(%struct.base*) #4
+
+define internal void @_GLOBAL__sub_I_decl_derived_member.cpp() section ".text.startup" {
 entry:
-  call void @__cxx_global_var_init(), !dbg !49
-  ret void, !dbg !49
+  call void @__cxx_global_var_init(), !dbg !48
+  ret void
 }
 
 attributes #0 = { inlinehint nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -89,56 +97,55 @@ attributes #3 = { nounwind readnone }
 attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!32, !33}
-!llvm.ident = !{!34}
-
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 203673) (llvm/trunk 203681)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !8, metadata !30, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"foo.cc", metadata !"/usr/local/google/home/echristo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !7}
-!4 = metadata !{metadata !"0x13\00foo\005\0064\0064\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 5, size 64, align 64, offset 0] [def] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0xd\00b\006\0064\0064\000\000", metadata !1, metadata !"_ZTS3foo", metadata !"_ZTS4base"} ; [ DW_TAG_member ] [b] [line 6, size 64, align 64, offset 0] [from _ZTS4base]
-!7 = metadata !{metadata !"0x13\00base\001\000\000\000\004\000", metadata !1, null, null, null, null, null, metadata !"_ZTS4base"} ; [ DW_TAG_structure_type ] [base] [line 1, size 0, align 0, offset 0] [decl] [from ]
-!8 = metadata !{metadata !9, metadata !13, metadata !19, metadata !22, metadata !28}
-!9 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\009\001\001\000\006\00256\000\009", metadata !1, metadata !10, metadata !11, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 9] [local] [def] [__cxx_global_var_init]
-!10 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/foo.cc]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{null}
-!13 = metadata !{metadata !"0x2e\00~foo\00~foo\00_ZN3fooD2Ev\005\000\001\000\006\00320\000\005", metadata !1, metadata !"_ZTS3foo", metadata !14, null, void (%struct.foo*)* @_ZN3fooD2Ev, null, metadata !17, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [~foo]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{null, metadata !16}
-!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
-!17 = metadata !{metadata !"0x2e\00~foo\00~foo\00\000\000\000\000\006\00320\000\000", null, metadata !"_ZTS3foo", metadata !14, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 0] [~foo]
-!18 = metadata !{i32 786468}
-!19 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN3fooC2Ev\005\000\001\000\006\00320\000\005", metadata !1, metadata !"_ZTS3foo", metadata !14, null, void (%struct.foo*)* @_ZN3fooC2Ev, null, metadata !20, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
-!20 = metadata !{metadata !"0x2e\00foo\00foo\00\000\000\000\000\006\00320\000\000", null, metadata !"_ZTS3foo", metadata !14, null, null, null, i32 0, metadata !21} ; [ DW_TAG_subprogram ] [line 0] [foo]
-!21 = metadata !{i32 786468}
-!22 = metadata !{metadata !"0x2e\00base\00base\00_ZN4baseC2Ev\001\000\001\000\006\00320\000\001", metadata !1, metadata !"_ZTS4base", metadata !23, null, void (%struct.base*)* @_ZN4baseC2Ev, null, metadata !26, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [base]
-!23 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !24, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!24 = metadata !{null, metadata !25}
-!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4base]
-!26 = metadata !{metadata !"0x2e\00base\00base\00\000\000\000\000\006\00320\000\000", null, metadata !"_ZTS4base", metadata !23, null, null, null, i32 0, metadata !27} ; [ DW_TAG_subprogram ] [line 0] [base]
-!27 = metadata !{i32 786468}
-!28 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__I_a\001\001\001\000\006\0064\000\001", metadata !1, metadata !10, metadata !29, null, void ()* @_GLOBAL__I_a, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [local] [def]
-!29 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!30 = metadata !{metadata !31}
-!31 = metadata !{metadata !"0x34\00f\00f\00\009\000\001", null, metadata !10, metadata !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 9] [def]
-!32 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!34 = metadata !{metadata !"clang version 3.5.0 (trunk 203673) (llvm/trunk 203681)"}
-!35 = metadata !{i32 9, i32 0, metadata !9, null}
-!36 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !19, null, metadata !37} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!37 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
-!38 = metadata !{i32 0, i32 0, metadata !19, null}
-!39 = metadata !{i32 5, i32 0, metadata !19, null}
-!40 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !13, null, metadata !37} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!41 = metadata !{i32 0, i32 0, metadata !13, null}
-!42 = metadata !{i32 5, i32 0, metadata !43, null}
-!43 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !13} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cc]
-!44 = metadata !{i32 5, i32 0, metadata !13, null}
-!45 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !22, null, metadata !46} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!46 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4base]
-!47 = metadata !{i32 0, i32 0, metadata !22, null}
-!48 = metadata !{i32 1, i32 0, metadata !22, null}
-!49 = metadata !{i32 1, i32 0, metadata !28, null}
+!llvm.module.flags = !{!30, !31}
+!llvm.ident = !{!32}
+
+!0 = !{!"0x11\004\00clang version 3.7.0 (trunk 227104) (llvm/trunk 227103)\000\00\000\00\001", !1, !2, !3, !9, !28, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/decl-derived-member.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"decl-derived-member.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !8}
+!4 = !{!"0x13\00foo\005\0064\0064\000\000\000", !1, null, null, !5, null, null, !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 5, size 64, align 64, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0xd\00b\006\0064\0064\000\000", !1, !"_ZTS3foo", !7} ; [ DW_TAG_member ] [b] [line 6, size 64, align 64, offset 0] [from base_type]
+!7 = !{!"0x16\00base_type\004\000\000\000\000", !1, null, !"_ZTS4base"} ; [ DW_TAG_typedef ] [base_type] [line 4, size 0, align 0, offset 0] [from _ZTS4base]
+!8 = !{!"0x13\00base\001\000\000\000\004\000", !1, null, null, null, null, null, !"_ZTS4base"} ; [ DW_TAG_structure_type ] [base] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!9 = !{!10, !14, !19, !24, !26}
+!10 = !{!"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\008\001\001\000\000\00256\000\008", !1, !11, !12, null, void ()* @__cxx_global_var_init, null, null, !2} ; [ DW_TAG_subprogram ] [line 8] [local] [def] [__cxx_global_var_init]
+!11 = !{!"0x29", !1}                              ; [ DW_TAG_file_type ] [/tmp/dbginfo/decl-derived-member.cpp]
+!12 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = !{null}
+!14 = !{!"0x2e\00foo\00foo\00_ZN3fooC2Ev\005\000\001\000\000\00320\000\005", !1, !"_ZTS3foo", !15, null, void (%struct.foo*)* @_ZN3fooC2Ev, null, !18, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!15 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !17}
+!17 = !{!"0xf\00\000\0064\0064\000\001088\00", null, null, !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
+!18 = !{!"0x2e\00foo\00foo\00\000\000\000\000\000\00320\000\000", null, !"_ZTS3foo", !15, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 0] [foo]
+!19 = !{!"0x2e\00base\00base\00_ZN4baseC2Ev\001\000\001\000\000\00320\000\001", !1, !"_ZTS4base", !20, null, void (%struct.base*)* @_ZN4baseC2Ev, null, !23, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [base]
+!20 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = !{null, !22}
+!22 = !{!"0xf\00\000\0064\0064\000\001088\00", null, null, !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4base]
+!23 = !{!"0x2e\00base\00base\00\000\000\000\000\000\00320\000\000", null, !"_ZTS4base", !20, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 0] [base]
+!24 = !{!"0x2e\00~foo\00~foo\00_ZN3fooD2Ev\005\000\001\000\000\00320\000\005", !1, !"_ZTS3foo", !15, null, void (%struct.foo*)* @_ZN3fooD2Ev, null, !25, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [~foo]
+!25 = !{!"0x2e\00~foo\00~foo\00\000\000\000\000\000\00320\000\000", null, !"_ZTS3foo", !15, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 0] [~foo]
+!26 = !{!"0x2e\00\00\00_GLOBAL__sub_I_decl_derived_member.cpp\000\001\001\000\000\0064\000\000", !1, !11, !27, null, void ()* @_GLOBAL__sub_I_decl_derived_member.cpp, null, null, !2} ; [ DW_TAG_subprogram ] [line 0] [local] [def]
+!27 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = !{!29}
+!29 = !{!"0x34\00f\00f\00\008\000\001", null, !11, !"_ZTS3foo", %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 8] [def]
+!30 = !{i32 2, !"Dwarf Version", i32 4}
+!31 = !{i32 2, !"Debug Info Version", i32 2}
+!32 = !{!"clang version 3.7.0 (trunk 227104) (llvm/trunk 227103)"}
+!33 = !MDLocation(line: 8, column: 5, scope: !10)
+!34 = !{!"0x101\00this\0016777216\001088", !14, null, !35} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!35 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
+!36 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!37 = !MDLocation(line: 0, scope: !14)
+!38 = !MDLocation(line: 5, column: 8, scope: !14)
+!39 = !{!"0x101\00this\0016777216\001088", !24, null, !35} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!40 = !MDLocation(line: 0, scope: !24)
+!41 = !MDLocation(line: 5, column: 8, scope: !42)
+!42 = !{!"0xb\005\008\002", !1, !24}              ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/decl-derived-member.cpp]
+!43 = !MDLocation(line: 5, column: 8, scope: !24)
+!44 = !{!"0x101\00this\0016777216\001088", !19, null, !45} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!45 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4base]
+!46 = !MDLocation(line: 0, scope: !19)
+!47 = !MDLocation(line: 1, column: 8, scope: !19)
+!48 = !MDLocation(line: 0, scope: !26)
diff --git a/test/DebugInfo/X86/discriminator.ll b/test/DebugInfo/X86/discriminator.ll
index b906e18..185f7cd 100644
--- a/test/DebugInfo/X86/discriminator.ll
+++ b/test/DebugInfo/X86/discriminator.ll
@@ -41,23 +41,23 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./discriminator.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"discriminator.c", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./discriminator.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5 "}
-!10 = metadata !{i32 2, i32 0, metadata !11, null}
-!11 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./discriminator.c]
-!12 = metadata !{i32 3, i32 0, metadata !4, null}
-!13 = metadata !{i32 4, i32 0, metadata !4, null}
-!14 = metadata !{i32 2, i32 0, metadata !15, null}
-!15 = metadata !{metadata !"0xb\0042", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./discriminator.c]
+!0 = !{!"0x11\0012\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [./discriminator.c] [DW_LANG_C99]
+!1 = !{!"discriminator.c", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [./discriminator.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5 "}
+!10 = !MDLocation(line: 2, scope: !11)
+!11 = !{!"0xb\002\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [./discriminator.c]
+!12 = !MDLocation(line: 3, scope: !4)
+!13 = !MDLocation(line: 4, scope: !4)
+!14 = !MDLocation(line: 2, scope: !15)
+!15 = !{!"0xb\0042", !1, !4} ; [ DW_TAG_lexical_block ] [./discriminator.c]
 
 ; CHECK: Address            Line   Column File   ISA Discriminator Flags
 ; CHECK: ------------------ ------ ------ ------ --- ------------- -------------
-; CHECK: 0x0000000000000011      2      0      1   0            42  is_stmt
+; CHECK: 0x0000000000000011      2      0      1   0            42 {{$}}
diff --git a/test/DebugInfo/X86/dwarf-aranges-no-dwarf-labels.ll b/test/DebugInfo/X86/dwarf-aranges-no-dwarf-labels.ll
index d5d1f72..1bda8ec 100644
--- a/test/DebugInfo/X86/dwarf-aranges-no-dwarf-labels.ll
+++ b/test/DebugInfo/X86/dwarf-aranges-no-dwarf-labels.ll
@@ -28,14 +28,14 @@ target triple = "x86_64-unknown-linux-gnu"
 ; Function Attrs: nounwind readnone uwtable
 define i32 @_Z3fooi(i32 %bar) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %bar}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !20
+  tail call void @llvm.dbg.value(metadata i32 %bar, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !20
   ret i32 %bar, !dbg !20
 }
 
 ; Function Attrs: nounwind readnone uwtable
 define i32 @_Z4foo2i(i32 %bar2) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %bar2}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata i32 %bar2, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !21
   ret i32 %bar2, !dbg !21
 }
 
@@ -60,30 +60,30 @@ attributes #2 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!19, !26}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (191881)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !17, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/debug_ranges/a.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"tmp/debug_ranges/a.cc", metadata !"/"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !11, metadata !14}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi\002\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @_Z3fooi, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/debug_ranges/a.cc]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x101\00bar\0016777218\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [bar] [line 2]
-!11 = metadata !{metadata !"0x2e\00foo2\00foo2\00_Z4foo2i\003\000\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @_Z4foo2i, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 3] [def] [foo2]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x101\00bar2\0016777219\000", metadata !11, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [bar2] [line 3]
-!14 = metadata !{metadata !"0x2e\00main\00main\00\005\000\001\000\006\00256\001\005", metadata !1, metadata !5, metadata !15, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{metadata !8}
-!17 = metadata !{metadata !18}
-!18 = metadata !{metadata !"0x34\00global\00global\00\001\000\001", null, metadata !5, metadata !8, i32* @global, null} ; [ DW_TAG_variable ] [global] [line 1] [def]
-!19 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!20 = metadata !{i32 2, i32 0, metadata !4, null}
-!21 = metadata !{i32 3, i32 0, metadata !11, null}
-!22 = metadata !{i32 6, i32 0, metadata !14, null}
-!23 = metadata !{metadata !"int", metadata !24}
-!24 = metadata !{metadata !"omnipotent char", metadata !25}
-!25 = metadata !{metadata !"Simple C/C++ TBAA"}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 (191881)\001\00\000\00\001", !1, !2, !2, !3, !17, !2} ; [ DW_TAG_compile_unit ] [/tmp/debug_ranges/a.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"tmp/debug_ranges/a.cc", !"/"}
+!2 = !{}
+!3 = !{!4, !11, !14}
+!4 = !{!"0x2e\00foo\00foo\00_Z3fooi\002\000\001\000\006\00256\001\002", !1, !5, !6, null, i32 (i32)* @_Z3fooi, null, null, !9} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/debug_ranges/a.cc]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10}
+!10 = !{!"0x101\00bar\0016777218\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [bar] [line 2]
+!11 = !{!"0x2e\00foo2\00foo2\00_Z4foo2i\003\000\001\000\006\00256\001\003", !1, !5, !6, null, i32 (i32)* @_Z4foo2i, null, null, !12} ; [ DW_TAG_subprogram ] [line 3] [def] [foo2]
+!12 = !{!13}
+!13 = !{!"0x101\00bar2\0016777219\000", !11, !5, !8} ; [ DW_TAG_arg_variable ] [bar2] [line 3]
+!14 = !{!"0x2e\00main\00main\00\005\000\001\000\006\00256\001\005", !1, !5, !15, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{!8}
+!17 = !{!18}
+!18 = !{!"0x34\00global\00global\00\001\000\001", null, !5, !8, i32* @global, null} ; [ DW_TAG_variable ] [global] [line 1] [def]
+!19 = !{i32 2, !"Dwarf Version", i32 4}
+!20 = !MDLocation(line: 2, scope: !4)
+!21 = !MDLocation(line: 3, scope: !11)
+!22 = !MDLocation(line: 6, scope: !14)
+!23 = !{!"int", !24}
+!24 = !{!"omnipotent char", !25}
+!25 = !{!"Simple C/C++ TBAA"}
+!26 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dwarf-aranges.ll b/test/DebugInfo/X86/dwarf-aranges.ll
index 237e418..6873e58 100644
--- a/test/DebugInfo/X86/dwarf-aranges.ll
+++ b/test/DebugInfo/X86/dwarf-aranges.ll
@@ -62,20 +62,20 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!13, !16}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !8, metadata !2} ; [ DW_TAG_compile_unit ] [/home/kayamon/test.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test.c", metadata !"/home/kayamon"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00some_code\00some_code\00\005\000\001\000\006\000\000\006", metadata !1, metadata !5, metadata !6, null, void ()* @some_code, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 6] [some_code]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/kayamon/test.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{metadata !9, metadata !11, metadata !12}
-!9 = metadata !{metadata !"0x34\00some_data\00some_data\00\001\000\001", null, metadata !5, metadata !10, i32* @some_data, null} ; [ DW_TAG_variable ] [some_data] [line 1] [def]
-!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!11 = metadata !{metadata !"0x34\00some_other\00some_other\00\003\000\001", null, metadata !5, metadata !10, i32* @some_other, null} ; [ DW_TAG_variable ] [some_other] [line 3] [def]
-!12 = metadata !{metadata !"0x34\00some_bss\00some_bss\00\002\000\001", null, metadata !5, metadata !10, i32* @some_bss, null} ; [ DW_TAG_variable ] [some_bss] [line 2] [def]
-!13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!14 = metadata !{i32 7, i32 0, metadata !4, null}
-!15 = metadata !{i32 8, i32 0, metadata !4, null}
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 \000\00\000\00\000", !1, !2, !2, !3, !8, !2} ; [ DW_TAG_compile_unit ] [/home/kayamon/test.c] [DW_LANG_C99]
+!1 = !{!"test.c", !"/home/kayamon"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00some_code\00some_code\00\005\000\001\000\006\000\000\006", !1, !5, !6, null, void ()* @some_code, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 6] [some_code]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/home/kayamon/test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!9, !11, !12}
+!9 = !{!"0x34\00some_data\00some_data\00\001\000\001", null, !5, !10, i32* @some_data, null} ; [ DW_TAG_variable ] [some_data] [line 1] [def]
+!10 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = !{!"0x34\00some_other\00some_other\00\003\000\001", null, !5, !10, i32* @some_other, null} ; [ DW_TAG_variable ] [some_other] [line 3] [def]
+!12 = !{!"0x34\00some_bss\00some_bss\00\002\000\001", null, !5, !10, i32* @some_bss, null} ; [ DW_TAG_variable ] [some_bss] [line 2] [def]
+!13 = !{i32 2, !"Dwarf Version", i32 4}
+!14 = !MDLocation(line: 7, scope: !4)
+!15 = !MDLocation(line: 8, scope: !4)
+!16 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dwarf-public-names.ll b/test/DebugInfo/X86/dwarf-public-names.ll
index aebc7ef..778738c 100644
--- a/test/DebugInfo/X86/dwarf-public-names.ll
+++ b/test/DebugInfo/X86/dwarf-public-names.ll
@@ -63,7 +63,7 @@ define void @_ZN1C15member_functionEv(%struct.C* %this) nounwind uwtable align 2
 entry:
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !28, metadata !{metadata !"0x102"}), !dbg !30
+  call void @llvm.dbg.declare(metadata %struct.C** %this.addr, metadata !28, metadata !{!"0x102"}), !dbg !30
   %this1 = load %struct.C** %this.addr
   store i32 0, i32* @_ZN1C22static_member_variableE, align 4, !dbg !31
   ret void, !dbg !32
@@ -94,42 +94,42 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!38}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (http://llvm.org/git/clang.git a09cd8103a6a719cb2628cdf0c91682250a17bd2) (http://llvm.org/git/llvm.git 47d03cec0afca0c01ae42b82916d1d731716cd20)\000\00\000\00\000", metadata !37, metadata !1, metadata !1, metadata !2, metadata !24,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{}
-!2 = metadata !{metadata !3, metadata !18, metadata !19, metadata !20}
-!3 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\009\000\001\000\006\00256\000\009", metadata !4, null, metadata !5, null, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !12, metadata !1} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
-!4 = metadata !{metadata !"0x29", metadata !37} ; [ DW_TAG_file_type ]
-!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!6 = metadata !{null, metadata !7}
-!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C]
-!8 = metadata !{metadata !"0x13\00C\001\008\008\000\000\000", metadata !37, null, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
-!9 = metadata !{metadata !10, metadata !12, metadata !14}
-!10 = metadata !{metadata !"0xd\00static_member_variable\004\000\000\000\004096", metadata !37, metadata !8, metadata !11, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\002\000\000\000\006\00256\000\002", metadata !4, metadata !8, metadata !5, null, null, null, i32 0, metadata !13} ; [ DW_TAG_subprogram ] [line 2] [member_function]
-!13 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!14 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\003\000\000\000\006\00256\000\003", metadata !4, metadata !8, metadata !15, null, null, null, i32 0, metadata !17} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{metadata !11}
-!17 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!18 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\0013\000\001\000\006\00256\000\0013", metadata !4, null, metadata !15, null, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !14, metadata !1} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
-!19 = metadata !{metadata !"0x2e\00global_function\00global_function\00_Z15global_functionv\0019\000\001\000\006\00256\000\0019", metadata !4, metadata !4, metadata !15, null, i32 ()* @_Z15global_functionv, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
-!20 = metadata !{metadata !"0x2e\00global_namespace_function\00global_namespace_function\00_ZN2ns25global_namespace_functionEv\0024\000\001\000\006\00256\000\0024", metadata !4, metadata !21, metadata !22, null, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
-!21 = metadata !{metadata !"0x39\00ns\0023", metadata !4, null} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
-!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!23 = metadata !{null}
-!24 = metadata !{metadata !25, metadata !26, metadata !27}
-!25 = metadata !{metadata !"0x34\00static_member_variable\00static_member_variable\00_ZN1C22static_member_variableE\007\000\001", metadata !8, metadata !4, metadata !11, i32* @_ZN1C22static_member_variableE, metadata !10} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
-!26 = metadata !{metadata !"0x34\00global_variable\00global_variable\00\0017\000\001", null, metadata !4, metadata !8, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
-!27 = metadata !{metadata !"0x34\00global_namespace_variable\00global_namespace_variable\00_ZN2ns25global_namespace_variableE\0027\000\001", metadata !21, metadata !4, metadata !11, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
-!28 = metadata !{metadata !"0x101\00this\0016777225\001088", metadata !3, metadata !4, metadata !29} ; [ DW_TAG_arg_variable ] [this] [line 9]
-!29 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from C]
-!30 = metadata !{i32 9, i32 0, metadata !3, null}
-!31 = metadata !{i32 10, i32 0, metadata !3, null}
-!32 = metadata !{i32 11, i32 0, metadata !3, null}
-!33 = metadata !{i32 14, i32 0, metadata !18, null}
-!34 = metadata !{i32 20, i32 0, metadata !19, null}
-!35 = metadata !{i32 25, i32 0, metadata !20, null}
-!36 = metadata !{i32 26, i32 0, metadata !20, null}
-!37 = metadata !{metadata !"dwarf-public-names.cpp", metadata !"/usr2/kparzysz/s.hex/t"}
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.3 (http://llvm.org/git/clang.git a09cd8103a6a719cb2628cdf0c91682250a17bd2) (http://llvm.org/git/llvm.git 47d03cec0afca0c01ae42b82916d1d731716cd20)\000\00\000\00\000", !37, !1, !1, !2, !24,  !1} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] [DW_LANG_C_plus_plus]
+!1 = !{}
+!2 = !{!3, !18, !19, !20}
+!3 = !{!"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\009\000\001\000\006\00256\000\009", !4, null, !5, null, void (%struct.C*)* @_ZN1C15member_functionEv, null, !12, !1} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
+!4 = !{!"0x29", !37} ; [ DW_TAG_file_type ]
+!5 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = !{null, !7}
+!7 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C]
+!8 = !{!"0x13\00C\001\008\008\000\000\000", !37, null, null, !9, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
+!9 = !{!10, !12, !14}
+!10 = !{!"0xd\00static_member_variable\004\000\000\000\004096", !37, !8, !11, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{!"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\002\000\000\000\006\00256\000\002", !4, !8, !5, null, null, null, i32 0, !13} ; [ DW_TAG_subprogram ] [line 2] [member_function]
+!13 = !{!"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!14 = !{!"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\003\000\000\000\006\00256\000\003", !4, !8, !15, null, null, null, i32 0, !17} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{!11}
+!17 = !{!"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!18 = !{!"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\0013\000\001\000\006\00256\000\0013", !4, null, !15, null, i32 ()* @_ZN1C22static_member_functionEv, null, !14, !1} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
+!19 = !{!"0x2e\00global_function\00global_function\00_Z15global_functionv\0019\000\001\000\006\00256\000\0019", !4, !4, !15, null, i32 ()* @_Z15global_functionv, null, null, !1} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
+!20 = !{!"0x2e\00global_namespace_function\00global_namespace_function\00_ZN2ns25global_namespace_functionEv\0024\000\001\000\006\00256\000\0024", !4, !21, !22, null, void ()* @_ZN2ns25global_namespace_functionEv, null, null, !1} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
+!21 = !{!"0x39\00ns\0023", !4, null} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
+!22 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = !{null}
+!24 = !{!25, !26, !27}
+!25 = !{!"0x34\00static_member_variable\00static_member_variable\00_ZN1C22static_member_variableE\007\000\001", !8, !4, !11, i32* @_ZN1C22static_member_variableE, !10} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
+!26 = !{!"0x34\00global_variable\00global_variable\00\0017\000\001", null, !4, !8, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
+!27 = !{!"0x34\00global_namespace_variable\00global_namespace_variable\00_ZN2ns25global_namespace_variableE\0027\000\001", !21, !4, !11, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
+!28 = !{!"0x101\00this\0016777225\001088", !3, !4, !29} ; [ DW_TAG_arg_variable ] [this] [line 9]
+!29 = !{!"0xf\00\000\0064\0064\000\000", null, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from C]
+!30 = !MDLocation(line: 9, scope: !3)
+!31 = !MDLocation(line: 10, scope: !3)
+!32 = !MDLocation(line: 11, scope: !3)
+!33 = !MDLocation(line: 14, scope: !18)
+!34 = !MDLocation(line: 20, scope: !19)
+!35 = !MDLocation(line: 25, scope: !20)
+!36 = !MDLocation(line: 26, scope: !20)
+!37 = !{!"dwarf-public-names.cpp", !"/usr2/kparzysz/s.hex/t"}
+!38 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/dwarf-pubnames-split.ll b/test/DebugInfo/X86/dwarf-pubnames-split.ll
index 87dd0ff..fc99474 100644
--- a/test/DebugInfo/X86/dwarf-pubnames-split.ll
+++ b/test/DebugInfo/X86/dwarf-pubnames-split.ll
@@ -24,15 +24,15 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 189287) (llvm/trunk 189296)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!10 = metadata !{i32 2, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 (trunk 189287) (llvm/trunk 189296)\000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!1 = !{!"foo.c", !"/usr/local/google/home/echristo/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 3}
+!10 = !MDLocation(line: 2, scope: !4)
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/earlydup-crash.ll b/test/DebugInfo/X86/earlydup-crash.ll
index 6bbd620..fb85ada 100644
--- a/test/DebugInfo/X86/earlydup-crash.ll
+++ b/test/DebugInfo/X86/earlydup-crash.ll
@@ -13,7 +13,7 @@ entry:
 bb:                                               ; preds = %entry
   %tmp = icmp eq i32 undef, 0
   %tmp1 = add i32 0, 11
-  call void @llvm.dbg.value(metadata !{i32 %tmp1}, i64 0, metadata !0, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.value(metadata i32 %tmp1, i64 0, metadata !0, metadata !{!"0x102"})
   br i1 undef, label %bb18, label %bb31.preheader
 
 bb31.preheader:                                   ; preds = %bb19, %bb
@@ -44,51 +44,51 @@ declare void @foobar(i32)
 
 !llvm.dbg.cu = !{!4}
 !llvm.module.flags = !{!47}
-!0 = metadata !{metadata !"0x100\00frname_len\00517\000", metadata !1, metadata !3, metadata !38} ; [ DW_TAG_auto_variable ]
-!1 = metadata !{metadata !"0xb\00515\000\0019", metadata !44, metadata !2} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{metadata !"0x2e\00framework_construct_pathname\00framework_construct_pathname\00\00515\001\001\000\006\00256\001\000", metadata !44, null, metadata !5, null, i8* (i8*, %struct.cpp_dir*)* @framework_construct_pathname, null, null, null} ; [ DW_TAG_subprogram ]
-!3 = metadata !{metadata !"0x29", metadata !44}  ; [ DW_TAG_file_type ]
-!4 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !44, metadata !46, metadata !46, metadata !45, null, null} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !44, metadata !3, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!6 = metadata !{metadata !7, metadata !9, metadata !11}
-!7 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{metadata !"0x26\00\000\008\008\000\000", metadata !44, metadata !3, metadata !8} ; [ DW_TAG_const_type ]
-!11 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{metadata !"0x16\00cpp_dir\0045\000\000\000\000", metadata !41, metadata !13, metadata !14} ; [ DW_TAG_typedef ]
-!13 = metadata !{metadata !"0x29", metadata !41} ; [ DW_TAG_file_type ]
-!14 = metadata !{metadata !"0x13\00cpp_dir\0043\00352\0032\000\000\000", metadata !41, metadata !3, null, metadata !15, null, null, null} ; [ DW_TAG_structure_type ] [cpp_dir] [line 43, size 352, align 32, offset 0] [def] [from ]
-!15 = metadata !{metadata !16, metadata !18, metadata !19, metadata !21, metadata !23, metadata !25, metadata !27, metadata !29, metadata !33, metadata !36}
-!16 = metadata !{metadata !"0xd\00next\00572\0032\0032\000\000", metadata !41, metadata !14, metadata !17} ; [ DW_TAG_member ]
-!17 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !14} ; [ DW_TAG_pointer_type ]
-!18 = metadata !{metadata !"0xd\00name\00575\0032\0032\0032\000", metadata !41, metadata !14, metadata !7} ; [ DW_TAG_member ]
-!19 = metadata !{metadata !"0xd\00len\00576\0032\0032\0064\000", metadata !41, metadata !14, metadata !20} ; [ DW_TAG_member ]
-!20 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
-!21 = metadata !{metadata !"0xd\00sysp\00580\008\008\0096\000", metadata !41, metadata !14, metadata !22} ; [ DW_TAG_member ]
-!22 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
-!23 = metadata !{metadata !"0xd\00name_map\00584\0032\0032\00128\000", metadata !41, metadata !14, metadata !24} ; [ DW_TAG_member ]
-!24 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !9} ; [ DW_TAG_pointer_type ]
-!25 = metadata !{metadata !"0xd\00header_map\00590\0032\0032\00160\000", metadata !41, metadata !14, metadata !26} ; [ DW_TAG_member ]
-!26 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, null} ; [ DW_TAG_pointer_type ]
-!27 = metadata !{metadata !"0xd\00construct\00597\0032\0032\00192\000", metadata !41, metadata !14, metadata !28} ; [ DW_TAG_member ]
-!28 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !5} ; [ DW_TAG_pointer_type ]
-!29 = metadata !{metadata !"0xd\00ino\00601\0064\0064\00224\000", metadata !41, metadata !14, metadata !30} ; [ DW_TAG_member ]
-!30 = metadata !{metadata !"0x16\00ino_t\00141\000\000\000\000", metadata !42, metadata !31, metadata !32} ; [ DW_TAG_typedef ]
-!31 = metadata !{metadata !"0x29", metadata !42} ; [ DW_TAG_file_type ]
-!32 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0064\000\000\007", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
-!33 = metadata !{metadata !"0xd\00dev\00602\0032\0032\00288\000", metadata !41, metadata !14, metadata !34} ; [ DW_TAG_member ]
-!34 = metadata !{metadata !"0x16\00dev_t\00107\000\000\000\000", metadata !42, metadata !31, metadata !35} ; [ DW_TAG_typedef ]
-!35 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
-!36 = metadata !{metadata !"0xd\00user_supplied_p\00605\008\008\00320\000", metadata !41, metadata !14, metadata !37} ; [ DW_TAG_member ]
-!37 = metadata !{metadata !"0x24\00_Bool\000\008\008\000\000\002", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
-!38 = metadata !{metadata !"0x16\00size_t\00326\000\000\000\000", metadata !43, metadata !39, metadata !40} ; [ DW_TAG_typedef ]
-!39 = metadata !{metadata !"0x29", metadata !43} ; [ DW_TAG_file_type ]
-!40 = metadata !{metadata !"0x24\00long unsigned int\000\0032\0032\000\000\007", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
-!41 = metadata !{metadata !"cpplib.h", metadata !"/Users/espindola/llvm/build-llvm-gcc/gcc/../../llvm-gcc-4.2/gcc/../libcpp/include"}
-!42 = metadata !{metadata !"types.h", metadata !"/usr/include/sys"}
-!43 = metadata !{metadata !"stddef.h", metadata !"/Users/espindola/llvm/build-llvm-gcc/./prev-gcc/include"}
-!44 = metadata !{metadata !"darwin-c.c", metadata !"/Users/espindola/llvm/build-llvm-gcc/gcc/../../llvm-gcc-4.2/gcc/config"}
-!45 = metadata !{metadata !2}
-!46 = metadata !{i32 0}
-!47 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x100\00frname_len\00517\000", !1, !3, !38} ; [ DW_TAG_auto_variable ]
+!1 = !{!"0xb\00515\000\0019", !44, !2} ; [ DW_TAG_lexical_block ]
+!2 = !{!"0x2e\00framework_construct_pathname\00framework_construct_pathname\00\00515\001\001\000\006\00256\001\000", !44, null, !5, null, i8* (i8*, %struct.cpp_dir*)* @framework_construct_pathname, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = !{!"0x29", !44}  ; [ DW_TAG_file_type ]
+!4 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !44, !46, !46, !45, null, null} ; [ DW_TAG_compile_unit ]
+!5 = !{!"0x15\00\000\000\000\000\000\000", !44, !3, null, !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = !{!7, !9, !11}
+!7 = !{!"0xf\00\000\0032\0032\000\000", !44, !3, !8} ; [ DW_TAG_pointer_type ]
+!8 = !{!"0x24\00char\000\008\008\000\000\006", !44, !3} ; [ DW_TAG_base_type ]
+!9 = !{!"0xf\00\000\0032\0032\000\000", !44, !3, !10} ; [ DW_TAG_pointer_type ]
+!10 = !{!"0x26\00\000\008\008\000\000", !44, !3, !8} ; [ DW_TAG_const_type ]
+!11 = !{!"0xf\00\000\0032\0032\000\000", !44, !3, !12} ; [ DW_TAG_pointer_type ]
+!12 = !{!"0x16\00cpp_dir\0045\000\000\000\000", !41, !13, !14} ; [ DW_TAG_typedef ]
+!13 = !{!"0x29", !41} ; [ DW_TAG_file_type ]
+!14 = !{!"0x13\00cpp_dir\0043\00352\0032\000\000\000", !41, !3, null, !15, null, null, null} ; [ DW_TAG_structure_type ] [cpp_dir] [line 43, size 352, align 32, offset 0] [def] [from ]
+!15 = !{!16, !18, !19, !21, !23, !25, !27, !29, !33, !36}
+!16 = !{!"0xd\00next\00572\0032\0032\000\000", !41, !14, !17} ; [ DW_TAG_member ]
+!17 = !{!"0xf\00\000\0032\0032\000\000", !44, !3, !14} ; [ DW_TAG_pointer_type ]
+!18 = !{!"0xd\00name\00575\0032\0032\0032\000", !41, !14, !7} ; [ DW_TAG_member ]
+!19 = !{!"0xd\00len\00576\0032\0032\0064\000", !41, !14, !20} ; [ DW_TAG_member ]
+!20 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", !44, !3} ; [ DW_TAG_base_type ]
+!21 = !{!"0xd\00sysp\00580\008\008\0096\000", !41, !14, !22} ; [ DW_TAG_member ]
+!22 = !{!"0x24\00unsigned char\000\008\008\000\000\008", !44, !3} ; [ DW_TAG_base_type ]
+!23 = !{!"0xd\00name_map\00584\0032\0032\00128\000", !41, !14, !24} ; [ DW_TAG_member ]
+!24 = !{!"0xf\00\000\0032\0032\000\000", !44, !3, !9} ; [ DW_TAG_pointer_type ]
+!25 = !{!"0xd\00header_map\00590\0032\0032\00160\000", !41, !14, !26} ; [ DW_TAG_member ]
+!26 = !{!"0xf\00\000\0032\0032\000\000", !44, !3, null} ; [ DW_TAG_pointer_type ]
+!27 = !{!"0xd\00construct\00597\0032\0032\00192\000", !41, !14, !28} ; [ DW_TAG_member ]
+!28 = !{!"0xf\00\000\0032\0032\000\000", !44, !3, !5} ; [ DW_TAG_pointer_type ]
+!29 = !{!"0xd\00ino\00601\0064\0064\00224\000", !41, !14, !30} ; [ DW_TAG_member ]
+!30 = !{!"0x16\00ino_t\00141\000\000\000\000", !42, !31, !32} ; [ DW_TAG_typedef ]
+!31 = !{!"0x29", !42} ; [ DW_TAG_file_type ]
+!32 = !{!"0x24\00long long unsigned int\000\0064\0064\000\000\007", !44, !3} ; [ DW_TAG_base_type ]
+!33 = !{!"0xd\00dev\00602\0032\0032\00288\000", !41, !14, !34} ; [ DW_TAG_member ]
+!34 = !{!"0x16\00dev_t\00107\000\000\000\000", !42, !31, !35} ; [ DW_TAG_typedef ]
+!35 = !{!"0x24\00int\000\0032\0032\000\000\005", !44, !3} ; [ DW_TAG_base_type ]
+!36 = !{!"0xd\00user_supplied_p\00605\008\008\00320\000", !41, !14, !37} ; [ DW_TAG_member ]
+!37 = !{!"0x24\00_Bool\000\008\008\000\000\002", !44, !3} ; [ DW_TAG_base_type ]
+!38 = !{!"0x16\00size_t\00326\000\000\000\000", !43, !39, !40} ; [ DW_TAG_typedef ]
+!39 = !{!"0x29", !43} ; [ DW_TAG_file_type ]
+!40 = !{!"0x24\00long unsigned int\000\0032\0032\000\000\007", !44, !3} ; [ DW_TAG_base_type ]
+!41 = !{!"cpplib.h", !"/Users/espindola/llvm/build-llvm-gcc/gcc/../../llvm-gcc-4.2/gcc/../libcpp/include"}
+!42 = !{!"types.h", !"/usr/include/sys"}
+!43 = !{!"stddef.h", !"/Users/espindola/llvm/build-llvm-gcc/./prev-gcc/include"}
+!44 = !{!"darwin-c.c", !"/Users/espindola/llvm/build-llvm-gcc/gcc/../../llvm-gcc-4.2/gcc/config"}
+!45 = !{!2}
+!46 = !{i32 0}
+!47 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/elf-names.ll b/test/DebugInfo/X86/elf-names.ll
index 71be903..910a674 100644
--- a/test/DebugInfo/X86/elf-names.ll
+++ b/test/DebugInfo/X86/elf-names.ll
@@ -22,7 +22,7 @@
 
 define void @_ZN1DC2Ev(%class.D* nocapture %this) unnamed_addr nounwind uwtable align 2 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !29, metadata !{metadata !"0x102"}), !dbg !36
+  tail call void @llvm.dbg.value(metadata %class.D* %this, i64 0, metadata !29, metadata !{!"0x102"}), !dbg !36
   %c1 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !37
   store i32 1, i32* %c1, align 4, !dbg !37
   %c2 = getelementptr inbounds %class.D* %this, i64 0, i32 1, !dbg !42
@@ -36,8 +36,8 @@ entry:
 
 define void @_ZN1DC2ERKS_(%class.D* nocapture %this, %class.D* nocapture %d) unnamed_addr nounwind uwtable align 2 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !46
-  tail call void @llvm.dbg.value(metadata !{%class.D* %d}, i64 0, metadata !35, metadata !{metadata !"0x102"}), !dbg !46
+  tail call void @llvm.dbg.value(metadata %class.D* %this, i64 0, metadata !34, metadata !{!"0x102"}), !dbg !46
+  tail call void @llvm.dbg.value(metadata %class.D* %d, i64 0, metadata !35, metadata !{!"0x102"}), !dbg !46
   %c1 = getelementptr inbounds %class.D* %d, i64 0, i32 0, !dbg !47
   %0 = load i32* %c1, align 4, !dbg !47
   %c12 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !47
@@ -62,51 +62,51 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!54}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 167506) (llvm/trunk 167505)\001\00\000\00\000", metadata !53, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{}
-!3 = metadata !{metadata !5, metadata !31}
-!5 = metadata !{metadata !"0x2e\00D\00D\00_ZN1DC2Ev\0012\000\001\000\006\00256\001\0012", metadata !6, null, metadata !7, null, void (%class.D*)* @_ZN1DC2Ev, null, metadata !17, metadata !27} ; [ DW_TAG_subprogram ] [line 12] [def] [D]
-!6 = metadata !{metadata !"0x29", metadata !53} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9}
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
-!10 = metadata !{metadata !"0x2\00D\001\00128\0032\000\000\000", metadata !53, null, null, metadata !11, null, null, null} ; [ DW_TAG_class_type ] [D] [line 1, size 128, align 32, offset 0] [def] [from ]
-!11 = metadata !{metadata !12, metadata !14, metadata !15, metadata !16, metadata !17, metadata !20}
-!12 = metadata !{metadata !"0xd\00c1\006\0032\0032\000\001", metadata !53, metadata !10, metadata !13} ; [ DW_TAG_member ] [c1] [line 6, size 32, align 32, offset 0] [private] [from int]
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{metadata !"0xd\00c2\007\0032\0032\0032\001", metadata !53, metadata !10, metadata !13} ; [ DW_TAG_member ] [c2] [line 7, size 32, align 32, offset 32] [private] [from int]
-!15 = metadata !{metadata !"0xd\00c3\008\0032\0032\0064\001", metadata !53, metadata !10, metadata !13} ; [ DW_TAG_member ] [c3] [line 8, size 32, align 32, offset 64] [private] [from int]
-!16 = metadata !{metadata !"0xd\00c4\009\0032\0032\0096\001", metadata !53, metadata !10, metadata !13} ; [ DW_TAG_member ] [c4] [line 9, size 32, align 32, offset 96] [private] [from int]
-!17 = metadata !{metadata !"0x2e\00D\00D\00\003\000\000\000\006\00256\001\003", metadata !6, metadata !10, metadata !7, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 3] [D]
-!18 = metadata !{metadata !19}
-!19 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!20 = metadata !{metadata !"0x2e\00D\00D\00\004\000\000\000\006\00256\001\004", metadata !6, metadata !10, metadata !21, null, null, null, i32 0, metadata !25} ; [ DW_TAG_subprogram ] [line 4] [D]
-!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!22 = metadata !{null, metadata !9, metadata !23}
-!23 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !24} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!24 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !10} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from D]
-!25 = metadata !{metadata !26}
-!26 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!27 = metadata !{metadata !29}
-!29 = metadata !{metadata !"0x101\00this\0016777228\001088", metadata !5, metadata !6, metadata !30} ; [ DW_TAG_arg_variable ] [this] [line 12]
-!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
-!31 = metadata !{metadata !"0x2e\00D\00D\00_ZN1DC2ERKS_\0019\000\001\000\006\00256\001\0019", metadata !6, null, metadata !21, null, void (%class.D*, %class.D*)* @_ZN1DC2ERKS_, null, metadata !20, metadata !32} ; [ DW_TAG_subprogram ] [line 19] [def] [D]
-!32 = metadata !{metadata !34, metadata !35}
-!34 = metadata !{metadata !"0x101\00this\0016777235\001088", metadata !31, metadata !6, metadata !30} ; [ DW_TAG_arg_variable ] [this] [line 19]
-!35 = metadata !{metadata !"0x101\00d\0033554451\000", metadata !31, metadata !6, metadata !23} ; [ DW_TAG_arg_variable ] [d] [line 19]
-!36 = metadata !{i32 12, i32 0, metadata !5, null}
-!37 = metadata !{i32 13, i32 0, metadata !38, null}
-!38 = metadata !{metadata !"0xb\0012\000\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
-!42 = metadata !{i32 14, i32 0, metadata !38, null}
-!43 = metadata !{i32 15, i32 0, metadata !38, null}
-!44 = metadata !{i32 16, i32 0, metadata !38, null}
-!45 = metadata !{i32 17, i32 0, metadata !38, null}
-!46 = metadata !{i32 19, i32 0, metadata !31, null}
-!47 = metadata !{i32 20, i32 0, metadata !48, null}
-!48 = metadata !{metadata !"0xb\0019\000\001", metadata !6, metadata !31} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
-!49 = metadata !{i32 21, i32 0, metadata !48, null}
-!50 = metadata !{i32 22, i32 0, metadata !48, null}
-!51 = metadata !{i32 23, i32 0, metadata !48, null}
-!52 = metadata !{i32 24, i32 0, metadata !48, null}
-!53 = metadata !{metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo"}
-!54 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.2 (trunk 167506) (llvm/trunk 167505)\001\00\000\00\000", !53, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cpp] [DW_LANG_C_plus_plus]
+!1 = !{}
+!3 = !{!5, !31}
+!5 = !{!"0x2e\00D\00D\00_ZN1DC2Ev\0012\000\001\000\006\00256\001\0012", !6, null, !7, null, void (%class.D*)* @_ZN1DC2Ev, null, !17, !27} ; [ DW_TAG_subprogram ] [line 12] [def] [D]
+!6 = !{!"0x29", !53} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9}
+!9 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
+!10 = !{!"0x2\00D\001\00128\0032\000\000\000", !53, null, null, !11, null, null, null} ; [ DW_TAG_class_type ] [D] [line 1, size 128, align 32, offset 0] [def] [from ]
+!11 = !{!12, !14, !15, !16, !17, !20}
+!12 = !{!"0xd\00c1\006\0032\0032\000\001", !53, !10, !13} ; [ DW_TAG_member ] [c1] [line 6, size 32, align 32, offset 0] [private] [from int]
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = !{!"0xd\00c2\007\0032\0032\0032\001", !53, !10, !13} ; [ DW_TAG_member ] [c2] [line 7, size 32, align 32, offset 32] [private] [from int]
+!15 = !{!"0xd\00c3\008\0032\0032\0064\001", !53, !10, !13} ; [ DW_TAG_member ] [c3] [line 8, size 32, align 32, offset 64] [private] [from int]
+!16 = !{!"0xd\00c4\009\0032\0032\0096\001", !53, !10, !13} ; [ DW_TAG_member ] [c4] [line 9, size 32, align 32, offset 96] [private] [from int]
+!17 = !{!"0x2e\00D\00D\00\003\000\000\000\006\00256\001\003", !6, !10, !7, null, null, null, i32 0, !18} ; [ DW_TAG_subprogram ] [line 3] [D]
+!18 = !{!19}
+!19 = !{!"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = !{!"0x2e\00D\00D\00\004\000\000\000\006\00256\001\004", !6, !10, !21, null, null, null, i32 0, !25} ; [ DW_TAG_subprogram ] [line 4] [D]
+!21 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = !{null, !9, !23}
+!23 = !{!"0x10\00\000\000\000\000\000", null, null, !24} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = !{!"0x26\00\000\000\000\000\000", null, null, !10} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from D]
+!25 = !{!26}
+!26 = !{!"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!27 = !{!29}
+!29 = !{!"0x101\00this\0016777228\001088", !5, !6, !30} ; [ DW_TAG_arg_variable ] [this] [line 12]
+!30 = !{!"0xf\00\000\0064\0064\000\000", null, null, !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
+!31 = !{!"0x2e\00D\00D\00_ZN1DC2ERKS_\0019\000\001\000\006\00256\001\0019", !6, null, !21, null, void (%class.D*, %class.D*)* @_ZN1DC2ERKS_, null, !20, !32} ; [ DW_TAG_subprogram ] [line 19] [def] [D]
+!32 = !{!34, !35}
+!34 = !{!"0x101\00this\0016777235\001088", !31, !6, !30} ; [ DW_TAG_arg_variable ] [this] [line 19]
+!35 = !{!"0x101\00d\0033554451\000", !31, !6, !23} ; [ DW_TAG_arg_variable ] [d] [line 19]
+!36 = !MDLocation(line: 12, scope: !5)
+!37 = !MDLocation(line: 13, scope: !38)
+!38 = !{!"0xb\0012\000\000", !6, !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!42 = !MDLocation(line: 14, scope: !38)
+!43 = !MDLocation(line: 15, scope: !38)
+!44 = !MDLocation(line: 16, scope: !38)
+!45 = !MDLocation(line: 17, scope: !38)
+!46 = !MDLocation(line: 19, scope: !31)
+!47 = !MDLocation(line: 20, scope: !48)
+!48 = !{!"0xb\0019\000\001", !6, !31} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!49 = !MDLocation(line: 21, scope: !48)
+!50 = !MDLocation(line: 22, scope: !48)
+!51 = !MDLocation(line: 23, scope: !48)
+!52 = !MDLocation(line: 24, scope: !48)
+!53 = !{!"foo.cpp", !"/usr/local/google/home/echristo"}
+!54 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/empty-and-one-elem-array.ll b/test/DebugInfo/X86/empty-and-one-elem-array.ll
index bbf527d..94e75e4 100644
--- a/test/DebugInfo/X86/empty-and-one-elem-array.ll
+++ b/test/DebugInfo/X86/empty-and-one-elem-array.ll
@@ -9,8 +9,8 @@ define i32 @func() nounwind uwtable ssp {
 entry:
   %my_foo = alloca %struct.foo, align 4
   %my_bar = alloca %struct.bar, align 4
-  call void @llvm.dbg.declare(metadata !{%struct.foo* %my_foo}, metadata !10, metadata !{metadata !"0x102"}), !dbg !19
-  call void @llvm.dbg.declare(metadata !{%struct.bar* %my_bar}, metadata !20, metadata !{metadata !"0x102"}), !dbg !28
+  call void @llvm.dbg.declare(metadata %struct.foo* %my_foo, metadata !10, metadata !{!"0x102"}), !dbg !19
+  call void @llvm.dbg.declare(metadata %struct.bar* %my_bar, metadata !20, metadata !{!"0x102"}), !dbg !28
   %a = getelementptr inbounds %struct.foo* %my_foo, i32 0, i32 0, !dbg !29
   store i32 3, i32* %a, align 4, !dbg !29
   %a1 = getelementptr inbounds %struct.bar* %my_bar, i32 0, i32 0, !dbg !30
@@ -63,35 +63,35 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 169136)\000\00\000\00\000", metadata !32, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/test.c] [DW_LANG_C99]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00func\00func\00\0011\000\001\000\006\000\000\0011", metadata !6, metadata !6, metadata !7, null, i32 ()* @func, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 11] [def] [func]
-!6 = metadata !{metadata !"0x29", metadata !32} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x100\00my_foo\0012\000", metadata !11, metadata !6, metadata !12} ; [ DW_TAG_auto_variable ] [my_foo] [line 12]
-!11 = metadata !{metadata !"0xb\0011\000\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ] [/Volumes/Sandbox/llvm/test.c]
-!12 = metadata !{metadata !"0x13\00foo\001\0064\0032\000\000\000", metadata !32, null, null, metadata !13, null, i32 0, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 64, align 32, offset 0] [def] [from ]
-!13 = metadata !{metadata !14, metadata !15}
-!14 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !32, metadata !12, metadata !9} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!15 = metadata !{metadata !"0xd\00b\003\0032\0032\0032\000", metadata !32, metadata !12, metadata !16} ; [ DW_TAG_member ] [b] [line 3, size 32, align 32, offset 32] [from ]
-!16 = metadata !{metadata !"0x1\00\000\0032\0032\000\000", null, null, metadata !9, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 32, offset 0] [from int]
-!17 = metadata !{metadata !18}
-!18 = metadata !{metadata !"0x21\000\001"} ; [ DW_TAG_subrange_type ] [0, 1]
-!19 = metadata !{i32 12, i32 0, metadata !11, null}
-!20 = metadata !{metadata !"0x100\00my_bar\0013\000", metadata !11, metadata !6, metadata !21} ; [ DW_TAG_auto_variable ] [my_bar] [line 13]
-!21 = metadata !{metadata !"0x13\00bar\006\0032\0032\000\000\000", metadata !32, null, null, metadata !22, null, i32 0, null} ; [ DW_TAG_structure_type ] [bar] [line 6, size 32, align 32, offset 0] [def] [from ]
-!22 = metadata !{metadata !23, metadata !24}
-!23 = metadata !{metadata !"0xd\00a\007\0032\0032\000\000", metadata !32, metadata !21, metadata !9} ; [ DW_TAG_member ] [a] [line 7, size 32, align 32, offset 0] [from int]
-!24 = metadata !{metadata !"0xd\00b\008\000\0032\0032\000", metadata !32, metadata !21, metadata !25} ; [ DW_TAG_member ] [b] [line 8, size 0, align 32, offset 32] [from ]
-!25 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !9, metadata !26, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
-!26 = metadata !{metadata !27}
-!27 = metadata !{metadata !"0x21\000\000"} ; [ DW_TAG_subrange_type ] [0, 0]
-!28 = metadata !{i32 13, i32 0, metadata !11, null}
-!29 = metadata !{i32 15, i32 0, metadata !11, null}
-!30 = metadata !{i32 16, i32 0, metadata !11, null}
-!31 = metadata !{i32 17, i32 0, metadata !11, null}
-!32 = metadata !{metadata !"test.c", metadata !"/Volumes/Sandbox/llvm"}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.3 (trunk 169136)\000\00\000\00\000", !32, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/test.c] [DW_LANG_C99]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00func\00func\00\0011\000\001\000\006\000\000\0011", !6, !6, !7, null, i32 ()* @func, null, null, !1} ; [ DW_TAG_subprogram ] [line 11] [def] [func]
+!6 = !{!"0x29", !32} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x100\00my_foo\0012\000", !11, !6, !12} ; [ DW_TAG_auto_variable ] [my_foo] [line 12]
+!11 = !{!"0xb\0011\000\000", !6, !5} ; [ DW_TAG_lexical_block ] [/Volumes/Sandbox/llvm/test.c]
+!12 = !{!"0x13\00foo\001\0064\0032\000\000\000", !32, null, null, !13, null, i32 0, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 64, align 32, offset 0] [def] [from ]
+!13 = !{!14, !15}
+!14 = !{!"0xd\00a\002\0032\0032\000\000", !32, !12, !9} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!15 = !{!"0xd\00b\003\0032\0032\0032\000", !32, !12, !16} ; [ DW_TAG_member ] [b] [line 3, size 32, align 32, offset 32] [from ]
+!16 = !{!"0x1\00\000\0032\0032\000\000", null, null, !9, !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 32, offset 0] [from int]
+!17 = !{!18}
+!18 = !{!"0x21\000\001"} ; [ DW_TAG_subrange_type ] [0, 1]
+!19 = !MDLocation(line: 12, scope: !11)
+!20 = !{!"0x100\00my_bar\0013\000", !11, !6, !21} ; [ DW_TAG_auto_variable ] [my_bar] [line 13]
+!21 = !{!"0x13\00bar\006\0032\0032\000\000\000", !32, null, null, !22, null, i32 0, null} ; [ DW_TAG_structure_type ] [bar] [line 6, size 32, align 32, offset 0] [def] [from ]
+!22 = !{!23, !24}
+!23 = !{!"0xd\00a\007\0032\0032\000\000", !32, !21, !9} ; [ DW_TAG_member ] [a] [line 7, size 32, align 32, offset 0] [from int]
+!24 = !{!"0xd\00b\008\000\0032\0032\000", !32, !21, !25} ; [ DW_TAG_member ] [b] [line 8, size 0, align 32, offset 32] [from ]
+!25 = !{!"0x1\00\000\000\0032\000\000", null, null, !9, !26, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!26 = !{!27}
+!27 = !{!"0x21\000\000"} ; [ DW_TAG_subrange_type ] [0, 0]
+!28 = !MDLocation(line: 13, scope: !11)
+!29 = !MDLocation(line: 15, scope: !11)
+!30 = !MDLocation(line: 16, scope: !11)
+!31 = !MDLocation(line: 17, scope: !11)
+!32 = !{!"test.c", !"/Volumes/Sandbox/llvm"}
+!33 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/empty-array.ll b/test/DebugInfo/X86/empty-array.ll
index f334ed3..d5a521a 100644
--- a/test/DebugInfo/X86/empty-array.ll
+++ b/test/DebugInfo/X86/empty-array.ll
@@ -27,23 +27,23 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 169136)\000\00\000\00\000", metadata !20, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !6, metadata !7, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
-!6 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x2\00A\001\000\0032\000\000\000", metadata !20, null, null, metadata !8, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [def] [from ]
-!8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{metadata !"0xd\00x\001\000\000\000\001", metadata !20, metadata !7, metadata !10} ; [ DW_TAG_member ] [x] [line 1, size 0, align 0, offset 0] [private] [from ]
-!10 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x21\000\00-1"} ; [ DW_TAG_subrange_type ] [unbound]
-!14 = metadata !{metadata !"0x2e\00A\00A\00\001\000\000\000\006\00320\000\001", metadata !6, metadata !7, metadata !15, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 1] [A]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null, metadata !17}
-!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!18 = metadata !{metadata !19}
-!19 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!20 = metadata !{metadata !"t.cpp", metadata !"/Volumes/Sandbox/llvm"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.3 (trunk 169136)\000\00\000\00\000", !20, !1, !1, !1, !3,  !1} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x34\00a\00a\00\001\000\001", null, !6, !7, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!6 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!7 = !{!"0x2\00A\001\000\0032\000\000\000", !20, null, null, !8, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [def] [from ]
+!8 = !{!9, !14}
+!9 = !{!"0xd\00x\001\000\000\000\001", !20, !7, !10} ; [ DW_TAG_member ] [x] [line 1, size 0, align 0, offset 0] [private] [from ]
+!10 = !{!"0x1\00\000\000\0032\000\000", null, null, !11, !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{!13}
+!13 = !{!"0x21\000\00-1"} ; [ DW_TAG_subrange_type ] [unbound]
+!14 = !{!"0x2e\00A\00A\00\001\000\000\000\006\00320\000\001", !6, !7, !15, null, null, null, i32 0, !18} ; [ DW_TAG_subprogram ] [line 1] [A]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !17}
+!17 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!18 = !{!19}
+!19 = !{!"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = !{!"t.cpp", !"/Volumes/Sandbox/llvm"}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/ending-run.ll b/test/DebugInfo/X86/ending-run.ll
index 0fcfdf1..0b5c77f 100644
--- a/test/DebugInfo/X86/ending-run.ll
+++ b/test/DebugInfo/X86/ending-run.ll
@@ -13,8 +13,8 @@ entry:
   %x.addr = alloca i32, align 4
   %y = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !12, metadata !{metadata !"0x102"}), !dbg !13
-  call void @llvm.dbg.declare(metadata !{i32* %y}, metadata !14, metadata !{metadata !"0x102"}), !dbg !16
+  call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !12, metadata !{!"0x102"}), !dbg !13
+  call void @llvm.dbg.declare(metadata i32* %y, metadata !14, metadata !{!"0x102"}), !dbg !16
   %0 = load i32* %x.addr, align 4, !dbg !17
   %1 = load i32* %x.addr, align 4, !dbg !17
   %mul = mul nsw i32 %0, %1, !dbg !17
@@ -29,20 +29,20 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 153921) (llvm/trunk 153916)\000\00\000\00\000", metadata !19, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00callee\00callee\00\004\000\001\000\006\000\000\007", metadata !19, metadata !6, metadata !7, null, i32 (i32)* @callee, null, null, null} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!12 = metadata !{metadata !"0x101\00x\0016777221\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 5, i32 5, metadata !5, null}
-!14 = metadata !{metadata !"0x100\00y\008\000", metadata !15, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{metadata !"0xb\007\001\000", metadata !19, metadata !5} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 8, i32 9, metadata !15, null}
-!17 = metadata !{i32 8, i32 18, metadata !15, null}
-!18 = metadata !{i32 9, i32 5, metadata !15, null}
-!19 = metadata !{metadata !"ending-run.c", metadata !"/Users/echristo/tmp"}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.1 (trunk 153921) (llvm/trunk 153916)\000\00\000\00\000", !19, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00callee\00callee\00\004\000\001\000\006\000\000\007", !19, !6, !7, null, i32 (i32)* @callee, null, null, null} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !19} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!12 = !{!"0x101\00x\0016777221\000", !5, !6, !9} ; [ DW_TAG_arg_variable ]
+!13 = !MDLocation(line: 5, column: 5, scope: !5)
+!14 = !{!"0x100\00y\008\000", !15, !6, !9} ; [ DW_TAG_auto_variable ]
+!15 = !{!"0xb\007\001\000", !19, !5} ; [ DW_TAG_lexical_block ]
+!16 = !MDLocation(line: 8, column: 9, scope: !15)
+!17 = !MDLocation(line: 8, column: 18, scope: !15)
+!18 = !MDLocation(line: 9, column: 5, scope: !15)
+!19 = !{!"ending-run.c", !"/Users/echristo/tmp"}
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/enum-class.ll b/test/DebugInfo/X86/enum-class.ll
index 7520d08..c2975a9 100644
--- a/test/DebugInfo/X86/enum-class.ll
+++ b/test/DebugInfo/X86/enum-class.ll
@@ -8,26 +8,26 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!23}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 157269) (llvm/trunk 157264)\000\00\000\00\000", metadata !22, metadata !1, metadata !15, metadata !15, metadata !17,  metadata !15} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !3, metadata !8, metadata !12}
-!3 = metadata !{metadata !"0x4\00A\001\0032\0032\000\000\000", metadata !4, null, metadata !5, metadata !6, null, null, null} ; [ DW_TAG_enumeration_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from int]
-!4 = metadata !{metadata !"0x29", metadata !22} ; [ DW_TAG_file_type ]
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0x28\00A1\001"} ; [ DW_TAG_enumerator ]
-!8 = metadata !{metadata !"0x4\00B\002\0064\0064\000\000\000", metadata !4, null, metadata !9, metadata !10, null, null, null} ; [ DW_TAG_enumeration_type ] [B] [line 2, size 64, align 64, offset 0] [def] [from long unsigned int]
-!9 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0x28\00B1\001"} ; [ DW_TAG_enumerator ]
-!12 = metadata !{metadata !"0x4\00C\003\0032\0032\000\000\000", metadata !4, null, null, metadata !13, null, null, null} ; [ DW_TAG_enumeration_type ] [C] [line 3, size 32, align 32, offset 0] [def] [from ]
-!13 = metadata !{metadata !14}
-!14 = metadata !{metadata !"0x28\00C1\001"} ; [ DW_TAG_enumerator ]
-!15 = metadata !{}
-!17 = metadata !{metadata !19, metadata !20, metadata !21}
-!19 = metadata !{metadata !"0x34\00a\00a\00\004\000\001", null, metadata !4, metadata !3, i32* @a, null} ; [ DW_TAG_variable ]
-!20 = metadata !{metadata !"0x34\00b\00b\00\005\000\001", null, metadata !4, metadata !8, i64* @b, null} ; [ DW_TAG_variable ]
-!21 = metadata !{metadata !"0x34\00c\00c\00\006\000\001", null, metadata !4, metadata !12, i32* @c, null} ; [ DW_TAG_variable ]
-!22 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo/tmp"}
+!0 = !{!"0x11\004\00clang version 3.2 (trunk 157269) (llvm/trunk 157264)\000\00\000\00\000", !22, !1, !15, !15, !17,  !15} ; [ DW_TAG_compile_unit ]
+!1 = !{!3, !8, !12}
+!3 = !{!"0x4\00A\001\0032\0032\000\000\000", !4, null, !5, !6, null, null, null} ; [ DW_TAG_enumeration_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from int]
+!4 = !{!"0x29", !22} ; [ DW_TAG_file_type ]
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!6 = !{!7}
+!7 = !{!"0x28\00A1\001"} ; [ DW_TAG_enumerator ]
+!8 = !{!"0x4\00B\002\0064\0064\000\000\000", !4, null, !9, !10, null, null, null} ; [ DW_TAG_enumeration_type ] [B] [line 2, size 64, align 64, offset 0] [def] [from long unsigned int]
+!9 = !{!"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ]
+!10 = !{!11}
+!11 = !{!"0x28\00B1\001"} ; [ DW_TAG_enumerator ]
+!12 = !{!"0x4\00C\003\0032\0032\000\000\000", !4, null, null, !13, null, null, null} ; [ DW_TAG_enumeration_type ] [C] [line 3, size 32, align 32, offset 0] [def] [from ]
+!13 = !{!14}
+!14 = !{!"0x28\00C1\001"} ; [ DW_TAG_enumerator ]
+!15 = !{}
+!17 = !{!19, !20, !21}
+!19 = !{!"0x34\00a\00a\00\004\000\001", null, !4, !3, i32* @a, null} ; [ DW_TAG_variable ]
+!20 = !{!"0x34\00b\00b\00\005\000\001", null, !4, !8, i64* @b, null} ; [ DW_TAG_variable ]
+!21 = !{!"0x34\00c\00c\00\006\000\001", null, !4, !12, i32* @c, null} ; [ DW_TAG_variable ]
+!22 = !{!"foo.cpp", !"/Users/echristo/tmp"}
 
 ; CHECK: DW_TAG_enumeration_type [{{.*}}]
 ; CHECK: DW_AT_type [DW_FORM_ref4]
@@ -42,4 +42,4 @@
 ; CHECK: DW_TAG_enumeration_type [6]
 ; CHECK-NOT: DW_AT_enum_class
 ; CHECK: DW_AT_name [DW_FORM_strp]      ( .debug_str[{{.*}}] = "C")
-!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!23 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/enum-fwd-decl.ll b/test/DebugInfo/X86/enum-fwd-decl.ll
index 91472f2..832fdb0 100644
--- a/test/DebugInfo/X86/enum-fwd-decl.ll
+++ b/test/DebugInfo/X86/enum-fwd-decl.ll
@@ -6,16 +6,16 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 165274) (llvm/trunk 165272)\000\00\000\00\000", metadata !8, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/tmp/foo.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x34\00e\00e\00\002\000\001", null, metadata !6, metadata !7, i16* @e, null} ; [ DW_TAG_variable ] [e] [line 2] [def]
-!6 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x4\00E\001\0016\0016\000\004\000", metadata !8, null, null, null, null, null, null} ; [ DW_TAG_enumeration_type ] [E] [line 1, size 16, align 16, offset 0] [decl] [from ]
-!8 = metadata !{metadata !"foo.cpp", metadata !"/tmp"}
+!0 = !{!"0x11\004\00clang version 3.2 (trunk 165274) (llvm/trunk 165272)\000\00\000\00\000", !8, !1, !1, !1, !3,  !1} ; [ DW_TAG_compile_unit ] [/tmp/foo.cpp] [DW_LANG_C_plus_plus]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x34\00e\00e\00\002\000\001", null, !6, !7, i16* @e, null} ; [ DW_TAG_variable ] [e] [line 2] [def]
+!6 = !{!"0x29", !8} ; [ DW_TAG_file_type ]
+!7 = !{!"0x4\00E\001\0016\0016\000\004\000", !8, null, null, null, null, null, null} ; [ DW_TAG_enumeration_type ] [E] [line 1, size 16, align 16, offset 0] [decl] [from ]
+!8 = !{!"foo.cpp", !"/tmp"}
 
 ; CHECK: DW_TAG_enumeration_type
 ; CHECK-NEXT: DW_AT_name
 ; CHECK-NEXT: DW_AT_byte_size
 ; CHECK-NEXT: DW_AT_declaration
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/fission-cu.ll b/test/DebugInfo/X86/fission-cu.ll
index 58692b9..f1ee0fe 100644
--- a/test/DebugInfo/X86/fission-cu.ll
+++ b/test/DebugInfo/X86/fission-cu.ll
@@ -8,13 +8,13 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 169021) (llvm/trunk 169020)\000\00\000\00baz.dwo\000", metadata !8, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/baz.c] [DW_LANG_C99]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !6, metadata !7, i32* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
-!6 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{metadata !"baz.c", metadata !"/usr/local/google/home/echristo/tmp"}
+!0 = !{!"0x11\0012\00clang version 3.3 (trunk 169021) (llvm/trunk 169020)\000\00\000\00baz.dwo\000", !8, !1, !1, !1, !3,  !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/baz.c] [DW_LANG_C99]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x34\00a\00a\00\001\000\001", null, !6, !7, i32* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!6 = !{!"0x29", !8} ; [ DW_TAG_file_type ]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = !{!"baz.c", !"/usr/local/google/home/echristo/tmp"}
 
 ; Check that the skeleton compile unit contains the proper attributes:
 ; This DIE has the following attributes: DW_AT_comp_dir, DW_AT_stmt_list,
@@ -111,4 +111,4 @@
 ; HDR-NOT: .debug_aranges
 ; HDR-NOT: .rela.{{.*}}.dwo
 
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/fission-hash.ll b/test/DebugInfo/X86/fission-hash.ll
index 9831063..5002ac6 100644
--- a/test/DebugInfo/X86/fission-hash.ll
+++ b/test/DebugInfo/X86/fission-hash.ll
@@ -9,8 +9,8 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 188230) (llvm/trunk 188234)\000\00\000\00foo.dwo\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 (trunk 188230) (llvm/trunk 188234)\000\00\000\00foo.dwo\000", !1, !2, !2, !2, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!1 = !{!"foo.c", !"/usr/local/google/home/echristo/tmp"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 3}
+!4 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/fission-inline.ll b/test/DebugInfo/X86/fission-inline.ll
index 29c770d..6499966 100644
--- a/test/DebugInfo/X86/fission-inline.ll
+++ b/test/DebugInfo/X86/fission-inline.ll
@@ -87,33 +87,33 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.module.flags = !{!22, !23}
 !llvm.ident = !{!24}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00fission-inline.dwo\001", metadata !1, metadata !2, metadata !3, metadata !9, metadata !2, metadata !18} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/fission-inline.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"fission-inline.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x2e\00f3\00f3\00_ZN3foo2f3Ez\004\000\000\000\000\00256\000\004", metadata !1, metadata !"_ZTS3foo", metadata !7, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [f3]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, null}
-!9 = metadata !{metadata !10, metadata !11}
-!10 = metadata !{metadata !"0x2e\00f3\00f3\00_ZN3foo2f3Ez\0015\000\001\000\000\00256\000\0015", metadata !1, metadata !"_ZTS3foo", metadata !7, null, void (...)* @_ZN3foo2f3Ez, null, metadata !6, metadata !2} ; [ DW_TAG_subprogram ] [line 15] [def] [f3]
-!11 = metadata !{metadata !"0x2e\00f2<int>\00f2<int>\00_ZN3foo2f2IiEEvv\0010\000\001\000\000\00256\000\0010", metadata !1, metadata !"_ZTS3foo", metadata !12, null, null, metadata !14, metadata !17, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [f2<int>]
-!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!13 = metadata !{null}
-!14 = metadata !{metadata !15}
-!15 = metadata !{metadata !"0x2f\00T\000\000", null, metadata !16, null} ; [ DW_TAG_template_type_parameter ]
-!16 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!17 = metadata !{metadata !"0x2e\00f2<int>\00f2<int>\00_ZN3foo2f2IiEEvv\0010\000\000\000\000\00256\000\0010", metadata !1, metadata !"_ZTS3foo", metadata !12, null, null, metadata !14, null, null} ; [ DW_TAG_subprogram ] [line 10] [f2<int>]
-!18 = metadata !{metadata !19}
-!19 = metadata !{metadata !"0x8\0019\00", metadata !20, metadata !"_ZTS3foo"} ; [ DW_TAG_imported_declaration ]
-!20 = metadata !{metadata !"0xb\0016\0013\001", metadata !1, metadata !21} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/fission-inline.cpp]
-!21 = metadata !{metadata !"0xb\0016\007\000", metadata !1, metadata !10} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/fission-inline.cpp]
-!22 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!23 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!24 = metadata !{metadata !"clang version 3.6.0 "}
-!25 = metadata !{i32 17, i32 5, metadata !20, null}
-!26 = metadata !{i32 11, i32 3, metadata !11, metadata !27}
-!27 = metadata !{i32 18, i32 5, metadata !20, null}
-!28 = metadata !{i32 12, i32 3, metadata !11, metadata !27}
-!29 = metadata !{i32 21, i32 1, metadata !10, null}
+!0 = !{!"0x11\004\00clang version 3.6.0 \000\00\000\00fission-inline.dwo\001", !1, !2, !3, !9, !2, !18} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/fission-inline.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"fission-inline.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00foo\001\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0x2e\00f3\00f3\00_ZN3foo2f3Ez\004\000\000\000\000\00256\000\004", !1, !"_ZTS3foo", !7, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [f3]
+!7 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, null}
+!9 = !{!10, !11}
+!10 = !{!"0x2e\00f3\00f3\00_ZN3foo2f3Ez\0015\000\001\000\000\00256\000\0015", !1, !"_ZTS3foo", !7, null, void (...)* @_ZN3foo2f3Ez, null, !6, !2} ; [ DW_TAG_subprogram ] [line 15] [def] [f3]
+!11 = !{!"0x2e\00f2<int>\00f2<int>\00_ZN3foo2f2IiEEvv\0010\000\001\000\000\00256\000\0010", !1, !"_ZTS3foo", !12, null, null, !14, !17, !2} ; [ DW_TAG_subprogram ] [line 10] [def] [f2<int>]
+!12 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = !{null}
+!14 = !{!15}
+!15 = !{!"0x2f\00T\000\000", null, !16, null} ; [ DW_TAG_template_type_parameter ]
+!16 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!17 = !{!"0x2e\00f2<int>\00f2<int>\00_ZN3foo2f2IiEEvv\0010\000\000\000\000\00256\000\0010", !1, !"_ZTS3foo", !12, null, null, !14, null, null} ; [ DW_TAG_subprogram ] [line 10] [f2<int>]
+!18 = !{!19}
+!19 = !{!"0x8\0019\00", !20, !"_ZTS3foo"} ; [ DW_TAG_imported_declaration ]
+!20 = !{!"0xb\0016\0013\001", !1, !21} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/fission-inline.cpp]
+!21 = !{!"0xb\0016\007\000", !1, !10} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/fission-inline.cpp]
+!22 = !{i32 2, !"Dwarf Version", i32 4}
+!23 = !{i32 2, !"Debug Info Version", i32 2}
+!24 = !{!"clang version 3.6.0 "}
+!25 = !MDLocation(line: 17, column: 5, scope: !20)
+!26 = !MDLocation(line: 11, column: 3, scope: !11, inlinedAt: !27)
+!27 = !MDLocation(line: 18, column: 5, scope: !20)
+!28 = !MDLocation(line: 12, column: 3, scope: !11, inlinedAt: !27)
+!29 = !MDLocation(line: 21, column: 1, scope: !10)
diff --git a/test/DebugInfo/X86/fission-ranges.ll b/test/DebugInfo/X86/fission-ranges.ll
index f66382a..400998e 100644
--- a/test/DebugInfo/X86/fission-ranges.ll
+++ b/test/DebugInfo/X86/fission-ranges.ll
@@ -91,8 +91,8 @@ entry:
 ; Function Attrs: nounwind uwtable
 define internal fastcc void @foo() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !29, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !30
-  tail call void @llvm.dbg.value(metadata !44, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !31
+  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !31
   %c.promoted9 = load i32* @c, align 4, !dbg !32, !tbaa !33
   br label %for.cond1.preheader, !dbg !31
 
@@ -114,28 +114,28 @@ for.cond7.preheader:                              ; preds = %for.inc10, %for.con
 for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
   %and2 = phi i32 [ %and.lcssa5, %for.cond7.preheader ], [ %and, %for.body9 ], !dbg !40
   %e.01 = phi i32 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ]
-  tail call void @llvm.dbg.value(metadata !41, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !40
+  tail call void @llvm.dbg.value(metadata i32* @c, i64 0, metadata !19, metadata !{!"0x102"}), !dbg !40
   %and = and i32 %and2, 1, !dbg !32
   %inc = add i32 %e.01, 1, !dbg !39
-  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !39
+  tail call void @llvm.dbg.value(metadata i32 %inc, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !39
   %exitcond = icmp eq i32 %inc, 30, !dbg !39
   br i1 %exitcond, label %for.inc10, label %for.body9, !dbg !39
 
 for.inc10:                                        ; preds = %for.body9
   %inc11 = add nsw i32 %b.03, 1, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{i32 %inc11}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !38
+  tail call void @llvm.dbg.value(metadata i32 %inc11, i64 0, metadata !15, metadata !{!"0x102"}), !dbg !38
   %exitcond11 = icmp eq i32 %inc11, 30, !dbg !38
   br i1 %exitcond11, label %for.inc13, label %for.cond7.preheader, !dbg !38
 
 for.inc13:                                        ; preds = %for.inc10
   %inc14 = add i32 %d.06, 1, !dbg !37
-  tail call void @llvm.dbg.value(metadata !{i32 %inc14}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !37
+  tail call void @llvm.dbg.value(metadata i32 %inc14, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !37
   %exitcond12 = icmp eq i32 %inc14, 30, !dbg !37
   br i1 %exitcond12, label %for.inc16, label %for.cond4.preheader, !dbg !37
 
 for.inc16:                                        ; preds = %for.inc13
   %inc17 = add nsw i32 %a.08, 1, !dbg !31
-  tail call void @llvm.dbg.value(metadata !{i32 %inc17}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !31
+  tail call void @llvm.dbg.value(metadata i32 %inc17, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !31
   %exitcond13 = icmp eq i32 %inc17, 30, !dbg !31
   br i1 %exitcond13, label %for.end18, label %for.cond1.preheader, !dbg !31
 
@@ -153,48 +153,48 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!26, !43}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 191700) (llvm/trunk 191710)\001\00\000\00small.dwo\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/small.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"small.c", metadata !"/usr/local/google/home/echristo/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{metadata !"0x2e\00bar\00bar\00\0018\000\001\000\006\000\001\0019", metadata !1, metadata !5, metadata !6, null, void ()* @bar, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 18] [def] [scope 19] [bar]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/small.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{metadata !"0x2e\00foo\00foo\00\002\001\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !9, null, void ()* @foo, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [scope 3] [foo]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{null, metadata !11}
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !16, metadata !18, metadata !19}
-!13 = metadata !{metadata !"0x101\00p\0016777218\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [p] [line 2]
-!14 = metadata !{metadata !"0x100\00a\004\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [a] [line 4]
-!15 = metadata !{metadata !"0x100\00b\004\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [b] [line 4]
-!16 = metadata !{metadata !"0x100\00d\005\000", metadata !8, metadata !5, metadata !17} ; [ DW_TAG_auto_variable ] [d] [line 5]
-!17 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
-!18 = metadata !{metadata !"0x100\00e\005\000", metadata !8, metadata !5, metadata !17} ; [ DW_TAG_auto_variable ] [e] [line 5]
-!19 = metadata !{metadata !"0x100\00w\0012\000", metadata !20, metadata !5, metadata !25} ; [ DW_TAG_auto_variable ] [w] [line 12]
-!20 = metadata !{metadata !"0xb\0011\000\004", metadata !1, metadata !21} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
-!21 = metadata !{metadata !"0xb\0010\000\003", metadata !1, metadata !22} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
-!22 = metadata !{metadata !"0xb\009\000\002", metadata !1, metadata !23} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
-!23 = metadata !{metadata !"0xb\008\000\001", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
-!24 = metadata !{metadata !"0xb\007\000\000", metadata !1, metadata !8} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
-!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!26 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!27 = metadata !{i32 20, i32 0, metadata !4, null}
-!28 = metadata !{i32 21, i32 0, metadata !4, null}
-!29 = metadata !{i32 1}
-!30 = metadata !{i32 2, i32 0, metadata !8, null}
-!31 = metadata !{i32 7, i32 0, metadata !24, null}
-!32 = metadata !{i32 13, i32 0, metadata !20, null}
-!33 = metadata !{metadata !34, metadata !34, i64 0}
-!34 = metadata !{metadata !"int", metadata !35, i64 0}
-!35 = metadata !{metadata !"omnipotent char", metadata !36, i64 0}
-!36 = metadata !{metadata !"Simple C/C++ TBAA"}
-!37 = metadata !{i32 8, i32 0, metadata !23, null}
-!38 = metadata !{i32 9, i32 0, metadata !22, null}
-!39 = metadata !{i32 10, i32 0, metadata !21, null}
-!40 = metadata !{i32 12, i32 0, metadata !20, null}
-!41 = metadata !{i32* @c}
-!42 = metadata !{i32 15, i32 0, metadata !8, null}
-!43 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!44 = metadata !{i32 0}
+!0 = !{!"0x11\0012\00clang version 3.4 (trunk 191700) (llvm/trunk 191710)\001\00\000\00small.dwo\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/small.c] [DW_LANG_C99]
+!1 = !{!"small.c", !"/usr/local/google/home/echristo/tmp"}
+!2 = !{}
+!3 = !{!4, !8}
+!4 = !{!"0x2e\00bar\00bar\00\0018\000\001\000\006\000\001\0019", !1, !5, !6, null, void ()* @bar, null, null, !2} ; [ DW_TAG_subprogram ] [line 18] [def] [scope 19] [bar]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/small.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!"0x2e\00foo\00foo\00\002\001\001\000\006\00256\001\003", !1, !5, !9, null, void ()* @foo, null, null, !12} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [scope 3] [foo]
+!9 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{null, !11}
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{!13, !14, !15, !16, !18, !19}
+!13 = !{!"0x101\00p\0016777218\000", !8, !5, !11} ; [ DW_TAG_arg_variable ] [p] [line 2]
+!14 = !{!"0x100\00a\004\000", !8, !5, !11} ; [ DW_TAG_auto_variable ] [a] [line 4]
+!15 = !{!"0x100\00b\004\000", !8, !5, !11} ; [ DW_TAG_auto_variable ] [b] [line 4]
+!16 = !{!"0x100\00d\005\000", !8, !5, !17} ; [ DW_TAG_auto_variable ] [d] [line 5]
+!17 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!18 = !{!"0x100\00e\005\000", !8, !5, !17} ; [ DW_TAG_auto_variable ] [e] [line 5]
+!19 = !{!"0x100\00w\0012\000", !20, !5, !25} ; [ DW_TAG_auto_variable ] [w] [line 12]
+!20 = !{!"0xb\0011\000\004", !1, !21} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!21 = !{!"0xb\0010\000\003", !1, !22} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!22 = !{!"0xb\009\000\002", !1, !23} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!23 = !{!"0xb\008\000\001", !1, !24} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!24 = !{!"0xb\007\000\000", !1, !8} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!25 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!26 = !{i32 2, !"Dwarf Version", i32 4}
+!27 = !MDLocation(line: 20, scope: !4)
+!28 = !MDLocation(line: 21, scope: !4)
+!29 = !{i32 1}
+!30 = !MDLocation(line: 2, scope: !8)
+!31 = !MDLocation(line: 7, scope: !24)
+!32 = !MDLocation(line: 13, scope: !20)
+!33 = !{!34, !34, i64 0}
+!34 = !{!"int", !35, i64 0}
+!35 = !{!"omnipotent char", !36, i64 0}
+!36 = !{!"Simple C/C++ TBAA"}
+!37 = !MDLocation(line: 8, scope: !23)
+!38 = !MDLocation(line: 9, scope: !22)
+!39 = !MDLocation(line: 10, scope: !21)
+!40 = !MDLocation(line: 12, scope: !20)
+!41 = !{i32* @c}
+!42 = !MDLocation(line: 15, scope: !8)
+!43 = !{i32 1, !"Debug Info Version", i32 2}
+!44 = !{i32 0}
diff --git a/test/DebugInfo/X86/float_const.ll b/test/DebugInfo/X86/float_const.ll
new file mode 100644
index 0000000..c1590bb
--- /dev/null
+++ b/test/DebugInfo/X86/float_const.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -filetype=obj | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; from (at -Os):
+; void foo() {
+;   float a = 3.14;
+;   *(int *)&a = 0;
+; }
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: nounwind optsize readnone uwtable
+define void @foo() #0 {
+entry:
+  tail call void @llvm.dbg.declare(metadata float* undef, metadata !13, metadata !19), !dbg !20
+  tail call void @llvm.dbg.value(metadata i32 1078523331, i64 0, metadata !13, metadata !19), !dbg !20
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !13, metadata !19), !dbg !20
+; CHECK:  DW_AT_const_value [DW_FORM_sdata]    (0)
+; CHECK-NEXT: DW_AT_name {{.*}}"a"
+  ret void, !dbg !21
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind optsize readnone uwtable }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!15, !16, !17}
+!llvm.ident = !{!18}
+
+!0 = !{!"0x11\0012\00clang version 3.7.0 (trunk 227686)\001\00\000\00\001", !1, !2, !3, !6, !2, !2} ; [ DW_TAG_compile_unit ] [foo.c] [DW_LANG_C99]
+!1 = !{!"foo.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0xf\00\000\0064\0064\000\000", null, null, !5} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!6 = !{!7}
+!7 = !{!"0x2e\00foo\00foo\00\001\000\001\000\000\000\001\001", !8, !9, !10, null, void ()* @foo, null, null, !12} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!8 = !{!"foo.c", !""}
+!9 = !{!"0x29", !8}                               ; [ DW_TAG_file_type ]
+!10 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{null}
+!12 = !{!13}
+!13 = !{!"0x100\00a\002\000", !7, !9, !14}        ; [ DW_TAG_auto_variable ] [a] [line 2]
+!14 = !{!"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!15 = !{i32 2, !"Dwarf Version", i32 2}
+!16 = !{i32 2, !"Debug Info Version", i32 2}
+!17 = !{i32 1, !"PIC Level", i32 2}
+!18 = !{!"clang version 3.7.0 (trunk 227686)"}
+!19 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!20 = !MDLocation(line: 2, column: 9, scope: !7)
+!21 = !MDLocation(line: 4, column: 1, scope: !7)
diff --git a/test/DebugInfo/X86/formal_parameter.ll b/test/DebugInfo/X86/formal_parameter.ll
index 56891ec..9077c74 100644
--- a/test/DebugInfo/X86/formal_parameter.ll
+++ b/test/DebugInfo/X86/formal_parameter.ll
@@ -28,7 +28,7 @@ define void @foo(i32 %map) #0 {
 entry:
   %map.addr = alloca i32, align 4
   store i32 %map, i32* %map.addr, align 4, !tbaa !15
-  call void @llvm.dbg.declare(metadata !{i32* %map.addr}, metadata !10, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata i32* %map.addr, metadata !10, metadata !{!"0x102"}), !dbg !14
   %call = call i32 (i32*, ...)* bitcast (i32 (...)* @lookup to i32 (i32*, ...)*)(i32* %map.addr) #3, !dbg !19
   ; Ensure that all dbg intrinsics have the same scope after
   ; LowerDbgDeclare is finished with them.
@@ -59,26 +59,26 @@ attributes #3 = { nounwind }
 !llvm.module.flags = !{!11, !12}
 !llvm.ident = !{!13}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [formal_parameter.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"formal_parameter.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, void (i32)* @foo, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [formal_parameter.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x101\00map\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [map] [line 1]
-!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!13 = metadata !{metadata !"clang version 3.5.0 "}
-!14 = metadata !{i32 1, i32 0, metadata !4, null}
-!15 = metadata !{metadata !16, metadata !16, i64 0}
-!16 = metadata !{metadata !"int", metadata !17, i64 0}
-!17 = metadata !{metadata !"omnipotent char", metadata !18, i64 0}
-!18 = metadata !{metadata !"Simple C/C++ TBAA"}
-!19 = metadata !{i32 3, i32 0, metadata !4, null}
-!20 = metadata !{i32 4, i32 0, metadata !21, null}
-!21 = metadata !{metadata !"0xb\004\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [formal_parameter.c]
-!22 = metadata !{i32 5, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [formal_parameter.c] [DW_LANG_C99]
+!1 = !{!"formal_parameter.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\001\002", !1, !5, !6, null, void (i32)* @foo, null, null, !9} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [formal_parameter.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10}
+!10 = !{!"0x101\00map\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [map] [line 1]
+!11 = !{i32 2, !"Dwarf Version", i32 2}
+!12 = !{i32 1, !"Debug Info Version", i32 2}
+!13 = !{!"clang version 3.5.0 "}
+!14 = !MDLocation(line: 1, scope: !4)
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !17, i64 0}
+!17 = !{!"omnipotent char", !18, i64 0}
+!18 = !{!"Simple C/C++ TBAA"}
+!19 = !MDLocation(line: 3, scope: !4)
+!20 = !MDLocation(line: 4, scope: !21)
+!21 = !{!"0xb\004\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [formal_parameter.c]
+!22 = !MDLocation(line: 5, scope: !4)
diff --git a/test/DebugInfo/X86/generate-odr-hash.ll b/test/DebugInfo/X86/generate-odr-hash.ll
index e7a37ea..6d4fa86 100644
--- a/test/DebugInfo/X86/generate-odr-hash.ll
+++ b/test/DebugInfo/X86/generate-odr-hash.ll
@@ -183,7 +183,7 @@
 define void @_Z3foov() #0 {
 entry:
   %b = alloca %struct.baz, align 1
-  call void @llvm.dbg.declare(metadata !{%struct.baz* %b}, metadata !46, metadata !{metadata !"0x102"}), !dbg !48
+  call void @llvm.dbg.declare(metadata %struct.baz* %b, metadata !46, metadata !{!"0x102"}), !dbg !48
   ret void, !dbg !49
 }
 
@@ -201,7 +201,7 @@ define internal void @_ZN12_GLOBAL__N_16walrusC2Ev(%"struct.<anonymous namespace
 entry:
   %this.addr = alloca %"struct.<anonymous namespace>::walrus"*, align 8
   store %"struct.<anonymous namespace>::walrus"* %this, %"struct.<anonymous namespace>::walrus"** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%"struct.<anonymous namespace>::walrus"** %this.addr}, metadata !51, metadata !{metadata !"0x102"}), !dbg !53
+  call void @llvm.dbg.declare(metadata %"struct.<anonymous namespace>::walrus"** %this.addr, metadata !51, metadata !{!"0x102"}), !dbg !53
   %this1 = load %"struct.<anonymous namespace>::walrus"** %this.addr
   ret void, !dbg !54
 }
@@ -219,59 +219,59 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!43, !44}
 !llvm.ident = !{!45}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00bar.dwo\000", metadata !1, metadata !2, metadata !3, metadata !21, metadata !38, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/bar.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"bar.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !6, metadata !14, metadata !17}
-!4 = metadata !{metadata !"0x13\00bar\001\008\008\000\000\000", metadata !5, null, null, metadata !2, null, null, metadata !"_ZTS3bar"} ; [ DW_TAG_structure_type ] [bar] [line 1, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !"bar.h", metadata !"/tmp/dbginfo"}
-!6 = metadata !{metadata !"0x2\00fluffy\0013\0064\0032\000\000\000", metadata !1, metadata !7, null, metadata !10, null, null, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE"} ; [ DW_TAG_class_type ] [fluffy] [line 13, size 64, align 32, offset 0] [def] [from ]
-!7 = metadata !{metadata !"0x39\00mongoose\0012", metadata !1, metadata !8} ; [ DW_TAG_namespace ] [mongoose] [line 12]
-!8 = metadata !{metadata !"0x39\00capybara\0011", metadata !1, metadata !9} ; [ DW_TAG_namespace ] [capybara] [line 11]
-!9 = metadata !{metadata !"0x39\00echidna\0010", metadata !1, null} ; [ DW_TAG_namespace ] [echidna] [line 10]
-!10 = metadata !{metadata !11, metadata !13}
-!11 = metadata !{metadata !"0xd\00a\0014\0032\0032\000\001", metadata !1, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE", metadata !12} ; [ DW_TAG_member ] [a] [line 14, size 32, align 32, offset 0] [private] [from int]
-!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{metadata !"0xd\00b\0015\0032\0032\0032\001", metadata !1, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE", metadata !12} ; [ DW_TAG_member ] [b] [line 15, size 32, align 32, offset 32] [private] [from int]
-!14 = metadata !{metadata !"0x13\00wombat\0031\0064\0032\000\000\000", metadata !1, null, null, metadata !15, null, null, metadata !"_ZTS6wombat"} ; [ DW_TAG_structure_type ] [wombat] [line 31, size 64, align 32, offset 0] [def] [from ]
-!15 = metadata !{metadata !16}
-!16 = metadata !{metadata !"0xd\00a_b\0035\0064\0032\000\000", metadata !1, metadata !"_ZTS6wombat", metadata !"_ZTSN6wombatUt_E"} ; [ DW_TAG_member ] [a_b] [line 35, size 64, align 32, offset 0] [from _ZTSN6wombatUt_E]
-!17 = metadata !{metadata !"0x13\00\0032\0064\0032\000\000\000", metadata !1, metadata !"_ZTS6wombat", null, metadata !18, null, null, metadata !"_ZTSN6wombatUt_E"} ; [ DW_TAG_structure_type ] [line 32, size 64, align 32, offset 0] [def] [from ]
-!18 = metadata !{metadata !19, metadata !20}
-!19 = metadata !{metadata !"0xd\00a\0033\0032\0032\000\000", metadata !1, metadata !"_ZTSN6wombatUt_E", metadata !12} ; [ DW_TAG_member ] [a] [line 33, size 32, align 32, offset 0] [from int]
-!20 = metadata !{metadata !"0xd\00b\0034\0032\0032\0032\000", metadata !1, metadata !"_ZTSN6wombatUt_E", metadata !12} ; [ DW_TAG_member ] [b] [line 34, size 32, align 32, offset 32] [from int]
-!21 = metadata !{metadata !22, metadata !26, metadata !27, metadata !36}
-!22 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\005\000\001\000\006\00256\000\005", metadata !1, metadata !23, metadata !24, null, void ()* @_Z3foov, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
-!23 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/bar.cpp]
-!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!25 = metadata !{null}
-!26 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\0029\001\001\000\006\00256\000\0029", metadata !1, metadata !23, metadata !24, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 29] [local] [def] [__cxx_global_var_init]
-!27 = metadata !{metadata !"0x2e\00walrus\00walrus\00_ZN12_GLOBAL__N_16walrusC2Ev\0025\001\001\000\006\00256\000\0025", metadata !1, metadata !28, metadata !32, null, void (%"struct.<anonymous namespace>::walrus"*)* @_ZN12_GLOBAL__N_16walrusC2Ev, null, metadata !31, metadata !2} ; [ DW_TAG_subprogram ] [line 25] [local] [def] [walrus]
-!28 = metadata !{metadata !"0x13\00walrus\0024\008\008\000\000\000", metadata !1, metadata !29, null, metadata !30, null, null, null} ; [ DW_TAG_structure_type ] [walrus] [line 24, size 8, align 8, offset 0] [def] [from ]
-!29 = metadata !{metadata !"0x39\00\0023", metadata !1, null} ; [ DW_TAG_namespace ] [line 23]
-!30 = metadata !{metadata !31}
-!31 = metadata !{metadata !"0x2e\00walrus\00walrus\00\0025\000\000\000\006\00256\000\0025", metadata !1, metadata !28, metadata !32, null, null, null, i32 0, metadata !35} ; [ DW_TAG_subprogram ] [line 25] [walrus]
-!32 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!33 = metadata !{null, metadata !34}
-!34 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from walrus]
-!35 = metadata !{i32 786468}
-!36 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__I_a\0025\001\001\000\006\0064\000\0025", metadata !1, metadata !23, metadata !37, null, void ()* @_GLOBAL__I_a, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 25] [local] [def]
-!37 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!38 = metadata !{metadata !39, metadata !40, metadata !41, metadata !42}
-!39 = metadata !{metadata !"0x34\00b\00b\00\003\000\001", null, metadata !23, metadata !4, %struct.bar* @b, null} ; [ DW_TAG_variable ] [b] [line 3] [def]
-!40 = metadata !{metadata !"0x34\00animal\00animal\00_ZN7echidna8capybara8mongoose6animalE\0018\000\001", metadata !7, metadata !23, metadata !6, %"class.echidna::capybara::mongoose::fluffy"* @_ZN7echidna8capybara8mongoose6animalE, null} ; [ DW_TAG_variable ] [animal] [line 18] [def]
-!41 = metadata !{metadata !"0x34\00w\00w\00\0029\001\001", null, metadata !23, metadata !28, %"struct.<anonymous namespace>::walrus"* @w, null} ; [ DW_TAG_variable ] [w] [line 29] [local] [def]
-!42 = metadata !{metadata !"0x34\00wom\00wom\00\0038\000\001", null, metadata !23, metadata !14, %struct.wombat* @wom, null} ; [ DW_TAG_variable ] [wom] [line 38] [def]
-!43 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!44 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!45 = metadata !{metadata !"clang version 3.5 "}
-!46 = metadata !{metadata !"0x100\00b\007\000", metadata !22, metadata !23, metadata !47} ; [ DW_TAG_auto_variable ] [b] [line 7]
-!47 = metadata !{metadata !"0x13\00baz\006\008\008\000\000\000", metadata !1, metadata !22, null, metadata !2, null, null, null} ; [ DW_TAG_structure_type ] [baz] [line 6, size 8, align 8, offset 0] [def] [from ]
-!48 = metadata !{i32 7, i32 0, metadata !22, null}
-!49 = metadata !{i32 8, i32 0, metadata !22, null}
-!50 = metadata !{i32 29, i32 0, metadata !26, null}
-!51 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !27, null, metadata !52} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!52 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from walrus]
-!53 = metadata !{i32 0, i32 0, metadata !27, null}
-!54 = metadata !{i32 25, i32 0, metadata !27, null}
-!55 = metadata !{i32 25, i32 0, metadata !36, null}
+!0 = !{!"0x11\004\00clang version 3.5 \000\00\000\00bar.dwo\000", !1, !2, !3, !21, !38, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/bar.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"bar.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !6, !14, !17}
+!4 = !{!"0x13\00bar\001\008\008\000\000\000", !5, null, null, !2, null, null, !"_ZTS3bar"} ; [ DW_TAG_structure_type ] [bar] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!"bar.h", !"/tmp/dbginfo"}
+!6 = !{!"0x2\00fluffy\0013\0064\0032\000\000\000", !1, !7, null, !10, null, null, !"_ZTSN7echidna8capybara8mongoose6fluffyE"} ; [ DW_TAG_class_type ] [fluffy] [line 13, size 64, align 32, offset 0] [def] [from ]
+!7 = !{!"0x39\00mongoose\0012", !1, !8} ; [ DW_TAG_namespace ] [mongoose] [line 12]
+!8 = !{!"0x39\00capybara\0011", !1, !9} ; [ DW_TAG_namespace ] [capybara] [line 11]
+!9 = !{!"0x39\00echidna\0010", !1, null} ; [ DW_TAG_namespace ] [echidna] [line 10]
+!10 = !{!11, !13}
+!11 = !{!"0xd\00a\0014\0032\0032\000\001", !1, !"_ZTSN7echidna8capybara8mongoose6fluffyE", !12} ; [ DW_TAG_member ] [a] [line 14, size 32, align 32, offset 0] [private] [from int]
+!12 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = !{!"0xd\00b\0015\0032\0032\0032\001", !1, !"_ZTSN7echidna8capybara8mongoose6fluffyE", !12} ; [ DW_TAG_member ] [b] [line 15, size 32, align 32, offset 32] [private] [from int]
+!14 = !{!"0x13\00wombat\0031\0064\0032\000\000\000", !1, null, null, !15, null, null, !"_ZTS6wombat"} ; [ DW_TAG_structure_type ] [wombat] [line 31, size 64, align 32, offset 0] [def] [from ]
+!15 = !{!16}
+!16 = !{!"0xd\00a_b\0035\0064\0032\000\000", !1, !"_ZTS6wombat", !"_ZTSN6wombatUt_E"} ; [ DW_TAG_member ] [a_b] [line 35, size 64, align 32, offset 0] [from _ZTSN6wombatUt_E]
+!17 = !{!"0x13\00\0032\0064\0032\000\000\000", !1, !"_ZTS6wombat", null, !18, null, null, !"_ZTSN6wombatUt_E"} ; [ DW_TAG_structure_type ] [line 32, size 64, align 32, offset 0] [def] [from ]
+!18 = !{!19, !20}
+!19 = !{!"0xd\00a\0033\0032\0032\000\000", !1, !"_ZTSN6wombatUt_E", !12} ; [ DW_TAG_member ] [a] [line 33, size 32, align 32, offset 0] [from int]
+!20 = !{!"0xd\00b\0034\0032\0032\0032\000", !1, !"_ZTSN6wombatUt_E", !12} ; [ DW_TAG_member ] [b] [line 34, size 32, align 32, offset 32] [from int]
+!21 = !{!22, !26, !27, !36}
+!22 = !{!"0x2e\00foo\00foo\00_Z3foov\005\000\001\000\006\00256\000\005", !1, !23, !24, null, void ()* @_Z3foov, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!23 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/bar.cpp]
+!24 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!25 = !{null}
+!26 = !{!"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\0029\001\001\000\006\00256\000\0029", !1, !23, !24, null, void ()* @__cxx_global_var_init, null, null, !2} ; [ DW_TAG_subprogram ] [line 29] [local] [def] [__cxx_global_var_init]
+!27 = !{!"0x2e\00walrus\00walrus\00_ZN12_GLOBAL__N_16walrusC2Ev\0025\001\001\000\006\00256\000\0025", !1, !28, !32, null, void (%"struct.<anonymous namespace>::walrus"*)* @_ZN12_GLOBAL__N_16walrusC2Ev, null, !31, !2} ; [ DW_TAG_subprogram ] [line 25] [local] [def] [walrus]
+!28 = !{!"0x13\00walrus\0024\008\008\000\000\000", !1, !29, null, !30, null, null, null} ; [ DW_TAG_structure_type ] [walrus] [line 24, size 8, align 8, offset 0] [def] [from ]
+!29 = !{!"0x39\00\0023", !1, null} ; [ DW_TAG_namespace ] [line 23]
+!30 = !{!31}
+!31 = !{!"0x2e\00walrus\00walrus\00\0025\000\000\000\006\00256\000\0025", !1, !28, !32, null, null, null, i32 0, !35} ; [ DW_TAG_subprogram ] [line 25] [walrus]
+!32 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!33 = !{null, !34}
+!34 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from walrus]
+!35 = !{i32 786468}
+!36 = !{!"0x2e\00\00\00_GLOBAL__I_a\0025\001\001\000\006\0064\000\0025", !1, !23, !37, null, void ()* @_GLOBAL__I_a, null, null, !2} ; [ DW_TAG_subprogram ] [line 25] [local] [def]
+!37 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!38 = !{!39, !40, !41, !42}
+!39 = !{!"0x34\00b\00b\00\003\000\001", null, !23, !4, %struct.bar* @b, null} ; [ DW_TAG_variable ] [b] [line 3] [def]
+!40 = !{!"0x34\00animal\00animal\00_ZN7echidna8capybara8mongoose6animalE\0018\000\001", !7, !23, !6, %"class.echidna::capybara::mongoose::fluffy"* @_ZN7echidna8capybara8mongoose6animalE, null} ; [ DW_TAG_variable ] [animal] [line 18] [def]
+!41 = !{!"0x34\00w\00w\00\0029\001\001", null, !23, !28, %"struct.<anonymous namespace>::walrus"* @w, null} ; [ DW_TAG_variable ] [w] [line 29] [local] [def]
+!42 = !{!"0x34\00wom\00wom\00\0038\000\001", null, !23, !14, %struct.wombat* @wom, null} ; [ DW_TAG_variable ] [wom] [line 38] [def]
+!43 = !{i32 2, !"Dwarf Version", i32 4}
+!44 = !{i32 1, !"Debug Info Version", i32 2}
+!45 = !{!"clang version 3.5 "}
+!46 = !{!"0x100\00b\007\000", !22, !23, !47} ; [ DW_TAG_auto_variable ] [b] [line 7]
+!47 = !{!"0x13\00baz\006\008\008\000\000\000", !1, !22, null, !2, null, null, null} ; [ DW_TAG_structure_type ] [baz] [line 6, size 8, align 8, offset 0] [def] [from ]
+!48 = !MDLocation(line: 7, scope: !22)
+!49 = !MDLocation(line: 8, scope: !22)
+!50 = !MDLocation(line: 29, scope: !26)
+!51 = !{!"0x101\00this\0016777216\001088", !27, null, !52} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!52 = !{!"0xf\00\000\0064\0064\000\000", null, null, !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from walrus]
+!53 = !MDLocation(line: 0, scope: !27)
+!54 = !MDLocation(line: 25, scope: !27)
+!55 = !MDLocation(line: 25, scope: !36)
diff --git a/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll b/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll
index c430b3e..9391da6 100644
--- a/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll
+++ b/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll
@@ -31,27 +31,27 @@
 ; Function Attrs: nounwind ssp uwtable
 define i32 @foo(i32 %a) #0 {
 entry:
-  call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !16, metadata !17), !dbg !18
+  call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !16, metadata !17), !dbg !18
   %conv = trunc i32 %a to i16, !dbg !19
   %conv1 = sext i16 %conv to i32, !dbg !19
   %add = add nsw i32 %conv1, 8, !dbg !19
-  call void @llvm.dbg.value(metadata !{i32 %add}, i64 0, metadata !20, metadata !17), !dbg !21
+  call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !20, metadata !17), !dbg !21
   %conv2 = trunc i32 %add to i16, !dbg !22
   %conv3 = sext i16 %conv2 to i32, !dbg !22
   %add4 = add nsw i32 %conv3, 8, !dbg !22
-  call void @llvm.dbg.value(metadata !{i32 %add4}, i64 0, metadata !23, metadata !17), !dbg !24
+  call void @llvm.dbg.value(metadata i32 %add4, i64 0, metadata !23, metadata !17), !dbg !24
   %conv5 = trunc i32 %add4 to i16, !dbg !25
   %conv6 = sext i16 %conv5 to i32, !dbg !25
   %add7 = add nsw i32 %conv6, 8, !dbg !25
-  call void @llvm.dbg.value(metadata !{i32 %add7}, i64 0, metadata !26, metadata !17), !dbg !27
+  call void @llvm.dbg.value(metadata i32 %add7, i64 0, metadata !26, metadata !17), !dbg !27
   %conv8 = trunc i32 %add7 to i16, !dbg !28
   %conv9 = sext i16 %conv8 to i32, !dbg !28
   %add10 = add nsw i32 %conv9, 8, !dbg !28
-  call void @llvm.dbg.value(metadata !{i32 %add10}, i64 0, metadata !29, metadata !17), !dbg !30
+  call void @llvm.dbg.value(metadata i32 %add10, i64 0, metadata !29, metadata !17), !dbg !30
   %conv11 = trunc i32 %add10 to i16, !dbg !31
   %conv12 = sext i16 %conv11 to i32, !dbg !31
   %add13 = add nsw i32 %conv12, 8, !dbg !31
-  call void @llvm.dbg.value(metadata !{i32 %add13}, i64 0, metadata !32, metadata !17), !dbg !33
+  call void @llvm.dbg.value(metadata i32 %add13, i64 0, metadata !32, metadata !17), !dbg !33
   ret i32 %add13, !dbg !34
 }
 
@@ -68,38 +68,38 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!13, !14}
 !llvm.ident = !{!15}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !7, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/ghost-sdnode-dbgvalues.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"ghost-sdnode-dbgvalues.c", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x16\00int16_t\0030\000\000\000\000", metadata !5, null, metadata !6} ; [ DW_TAG_typedef ] [int16_t] [line 30, size 0, align 0, offset 0] [from short]
-!5 = metadata !{metadata !"/usr/include/sys/_types/_int16_t.h", metadata !"/tmp"}
-!6 = metadata !{metadata !"0x24\00short\000\0016\0016\000\000\005", null, null} ; [ DW_TAG_base_type ] [short] [line 0, size 16, align 16, offset 0, enc DW_ATE_signed]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\000\00256\000\003", metadata !1, metadata !9, metadata !10, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
-!9 = metadata !{metadata !"0x29", metadata !1}    ; [ DW_TAG_file_type ] [/tmp/ghost-sdnode-dbgvalues.c]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{metadata !12, metadata !12}
-!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!14 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!15 = metadata !{metadata !"clang version 3.6.0 "}
-!16 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_arg_variable ] [a] [line 3]
-!17 = metadata !{metadata !"0x102"}               ; [ DW_TAG_expression ]
-!18 = metadata !{i32 3, i32 13, metadata !8, null}
-!19 = metadata !{i32 4, i32 5, metadata !8, null}
-!20 = metadata !{metadata !"0x100\00b\004\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_auto_variable ] [b] [line 4]
-!21 = metadata !{i32 4, i32 9, metadata !8, null}
-!22 = metadata !{i32 5, i32 5, metadata !8, null}
-!23 = metadata !{metadata !"0x100\00c\005\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_auto_variable ] [c] [line 5]
-!24 = metadata !{i32 5, i32 9, metadata !8, null}
-!25 = metadata !{i32 6, i32 5, metadata !8, null}
-!26 = metadata !{metadata !"0x100\00d\006\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_auto_variable ] [d] [line 6]
-!27 = metadata !{i32 6, i32 9, metadata !8, null}
-!28 = metadata !{i32 7, i32 5, metadata !8, null}
-!29 = metadata !{metadata !"0x100\00e\007\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_auto_variable ] [e] [line 7]
-!30 = metadata !{i32 7, i32 9, metadata !8, null}
-!31 = metadata !{i32 8, i32 5, metadata !8, null}
-!32 = metadata !{metadata !"0x100\00f\008\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_auto_variable ] [f] [line 8]
-!33 = metadata !{i32 8, i32 9, metadata !8, null}
-!34 = metadata !{i32 9, i32 5, metadata !8, null}
+!0 = !{!"0x11\0012\00clang version 3.6.0 \000\00\000\00\001", !1, !2, !3, !7, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/ghost-sdnode-dbgvalues.c] [DW_LANG_C99]
+!1 = !{!"ghost-sdnode-dbgvalues.c", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x16\00int16_t\0030\000\000\000\000", !5, null, !6} ; [ DW_TAG_typedef ] [int16_t] [line 30, size 0, align 0, offset 0] [from short]
+!5 = !{!"/usr/include/sys/_types/_int16_t.h", !"/tmp"}
+!6 = !{!"0x24\00short\000\0016\0016\000\000\005", null, null} ; [ DW_TAG_base_type ] [short] [line 0, size 16, align 16, offset 0, enc DW_ATE_signed]
+!7 = !{!8}
+!8 = !{!"0x2e\00foo\00foo\00\003\000\001\000\000\00256\000\003", !1, !9, !10, null, i32 (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
+!9 = !{!"0x29", !1}    ; [ DW_TAG_file_type ] [/tmp/ghost-sdnode-dbgvalues.c]
+!10 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{!12, !12}
+!12 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = !{i32 2, !"Dwarf Version", i32 2}
+!14 = !{i32 2, !"Debug Info Version", i32 2}
+!15 = !{!"clang version 3.6.0 "}
+!16 = !{!"0x101\00a\0016777219\000", !8, !9, !12} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!17 = !{!"0x102"}               ; [ DW_TAG_expression ]
+!18 = !MDLocation(line: 3, column: 13, scope: !8)
+!19 = !MDLocation(line: 4, column: 5, scope: !8)
+!20 = !{!"0x100\00b\004\000", !8, !9, !12} ; [ DW_TAG_auto_variable ] [b] [line 4]
+!21 = !MDLocation(line: 4, column: 9, scope: !8)
+!22 = !MDLocation(line: 5, column: 5, scope: !8)
+!23 = !{!"0x100\00c\005\000", !8, !9, !12} ; [ DW_TAG_auto_variable ] [c] [line 5]
+!24 = !MDLocation(line: 5, column: 9, scope: !8)
+!25 = !MDLocation(line: 6, column: 5, scope: !8)
+!26 = !{!"0x100\00d\006\000", !8, !9, !12} ; [ DW_TAG_auto_variable ] [d] [line 6]
+!27 = !MDLocation(line: 6, column: 9, scope: !8)
+!28 = !MDLocation(line: 7, column: 5, scope: !8)
+!29 = !{!"0x100\00e\007\000", !8, !9, !12} ; [ DW_TAG_auto_variable ] [e] [line 7]
+!30 = !MDLocation(line: 7, column: 9, scope: !8)
+!31 = !MDLocation(line: 8, column: 5, scope: !8)
+!32 = !{!"0x100\00f\008\000", !8, !9, !12} ; [ DW_TAG_auto_variable ] [f] [line 8]
+!33 = !MDLocation(line: 8, column: 9, scope: !8)
+!34 = !MDLocation(line: 9, column: 5, scope: !8)
diff --git a/test/DebugInfo/X86/gnu-public-names-empty.ll b/test/DebugInfo/X86/gnu-public-names-empty.ll
index 4c97b3f..f9a0617 100644
--- a/test/DebugInfo/X86/gnu-public-names-empty.ll
+++ b/test/DebugInfo/X86/gnu-public-names-empty.ll
@@ -12,8 +12,8 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 191846) (llvm/trunk 191866)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 (trunk 191846) (llvm/trunk 191866)\000\00\000\00\000", !1, !2, !2, !2, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!1 = !{!"foo.c", !"/usr/local/google/home/echristo/tmp"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/gnu-public-names.ll b/test/DebugInfo/X86/gnu-public-names.ll
index 1696288..7e92b53 100644
--- a/test/DebugInfo/X86/gnu-public-names.ll
+++ b/test/DebugInfo/X86/gnu-public-names.ll
@@ -214,7 +214,7 @@ define void @_ZN1C15member_functionEv(%struct.C* %this) #0 align 2 {
 entry:
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !50, metadata !{metadata !"0x102"}), !dbg !52
+  call void @llvm.dbg.declare(metadata %struct.C** %this.addr, metadata !50, metadata !{!"0x102"}), !dbg !52
   %this1 = load %struct.C** %this.addr
   store i32 0, i32* @_ZN1C22static_member_variableE, align 4, !dbg !53
   ret void, !dbg !54
@@ -270,64 +270,64 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!47, !48}
 !llvm.ident = !{!49}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !19, metadata !32, metadata !45} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/pubnames.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"pubnames.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !15}
-!4 = metadata !{metadata !"0x13\00C\001\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !8, metadata !12}
-!6 = metadata !{metadata !"0xd\00static_member_variable\004\000\000\000\004096", metadata !1, metadata !"_ZTS1C", metadata !7, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\002\000\000\000\006\00256\000\002", metadata !1, metadata !"_ZTS1C", metadata !9, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 2] [member_function]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{null, metadata !11}
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
-!12 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\003\000\000\000\006\00256\000\003", metadata !1, metadata !"_ZTS1C", metadata !13, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
-!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{metadata !7}
-!15 = metadata !{metadata !"0x13\00D\0028\0032\0032\000\000\000", metadata !1, metadata !16, null, metadata !17, null, null, metadata !"_ZTSN2ns1DE"} ; [ DW_TAG_structure_type ] [D] [line 28, size 32, align 32, offset 0] [def] [from ]
-!16 = metadata !{metadata !"0x39\00ns\0023", metadata !1, null} ; [ DW_TAG_namespace ] [ns] [line 23]
-!17 = metadata !{metadata !18}
-!18 = metadata !{metadata !"0xd\00A\0029\0032\0032\000\000", metadata !1, metadata !"_ZTSN2ns1DE", metadata !7} ; [ DW_TAG_member ] [A] [line 29, size 32, align 32, offset 0] [from int]
-!19 = metadata !{metadata !20, metadata !21, metadata !22, metadata !24, metadata !27, metadata !31}
-!20 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\009\000\001\000\006\00256\000\009", metadata !1, metadata !"_ZTS1C", metadata !9, null, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !8, metadata !2} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
-!21 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\0013\000\001\000\006\00256\000\0013", metadata !1, metadata !"_ZTS1C", metadata !13, null, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !12, metadata !2} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
-!22 = metadata !{metadata !"0x2e\00global_function\00global_function\00_Z15global_functionv\0019\000\001\000\006\00256\000\0019", metadata !1, metadata !23, metadata !13, null, i32 ()* @_Z15global_functionv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
-!23 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/pubnames.cpp]
-!24 = metadata !{metadata !"0x2e\00global_namespace_function\00global_namespace_function\00_ZN2ns25global_namespace_functionEv\0024\000\001\000\006\00256\000\0024", metadata !1, metadata !16, metadata !25, null, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
-!25 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !26, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!26 = metadata !{null}
-!27 = metadata !{metadata !"0x2e\00f3\00f3\00_Z2f3v\0037\000\001\000\006\00256\000\0037", metadata !1, metadata !23, metadata !28, null, i32* ()* @_Z2f3v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 37] [def] [f3]
-!28 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!29 = metadata !{metadata !30}
-!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!31 = metadata !{metadata !"0x2e\00f7\00f7\00_Z2f7v\0054\000\001\000\006\00256\000\0054", metadata !1, metadata !23, metadata !13, null, i32 ()* @_Z2f7v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 54] [def] [f7]
-!32 = metadata !{metadata !33, metadata !34, metadata !35, metadata !36, metadata !37, metadata !38, metadata !41, metadata !44}
-!33 = metadata !{metadata !"0x34\00static_member_variable\00static_member_variable\00_ZN1C22static_member_variableE\007\000\001", null, metadata !23, metadata !7, i32* @_ZN1C22static_member_variableE, metadata !6} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
-!34 = metadata !{metadata !"0x34\00global_variable\00global_variable\00\0017\000\001", null, metadata !23, metadata !"_ZTS1C", %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
-!35 = metadata !{metadata !"0x34\00global_namespace_variable\00global_namespace_variable\00_ZN2ns25global_namespace_variableE\0027\000\001", metadata !16, metadata !23, metadata !7, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
-!36 = metadata !{metadata !"0x34\00d\00d\00_ZN2ns1dE\0030\000\001", metadata !16, metadata !23, metadata !"_ZTSN2ns1DE", %"struct.ns::D"* @_ZN2ns1dE, null} ; [ DW_TAG_variable ] [d] [line 30] [def]
-!37 = metadata !{metadata !"0x34\00z\00z\00\0038\001\001", metadata !27, metadata !23, metadata !7, i32* @_ZZ2f3vE1z, null} ; [ DW_TAG_variable ] [z] [line 38] [local] [def]
-!38 = metadata !{metadata !"0x34\00c\00c\00_ZN5outer12_GLOBAL__N_11cE\0050\001\001", metadata !39, metadata !23, metadata !7, i32* @_ZN5outer12_GLOBAL__N_11cE, null} ; [ DW_TAG_variable ] [c] [line 50] [local] [def]
-!39 = metadata !{metadata !"0x39\00\0049", metadata !1, metadata !40} ; [ DW_TAG_namespace ] [line 49]
-!40 = metadata !{metadata !"0x39\00outer\0048", metadata !1, null} ; [ DW_TAG_namespace ] [outer] [line 48]
-!41 = metadata !{metadata !"0x34\00b\00b\00_ZN12_GLOBAL__N_15inner1bE\0044\001\001", metadata !42, metadata !23, metadata !7, i32* @_ZN12_GLOBAL__N_15inner1bE, null} ; [ DW_TAG_variable ] [b] [line 44] [local] [def]
-!42 = metadata !{metadata !"0x39\00inner\0043", metadata !1, metadata !43} ; [ DW_TAG_namespace ] [inner] [line 43]
-!43 = metadata !{metadata !"0x39\00\0033", metadata !1, null} ; [ DW_TAG_namespace ] [line 33]
-!44 = metadata !{metadata !"0x34\00i\00i\00_ZN12_GLOBAL__N_11iE\0034\001\001", metadata !43, metadata !23, metadata !7, i32* @_ZN12_GLOBAL__N_11iE, null} ; [ DW_TAG_variable ] [i] [line 34] [local] [def]
-!45 = metadata !{metadata !46}
-!46 = metadata !{metadata !"0x3a\0040\00", metadata !40, metadata !39} ; [ DW_TAG_imported_module ]
-!47 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!48 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!49 = metadata !{metadata !"clang version 3.5.0 "}
-!50 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !20, null, metadata !51} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!51 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
-!52 = metadata !{i32 0, i32 0, metadata !20, null}
-!53 = metadata !{i32 10, i32 0, metadata !20, null}
-!54 = metadata !{i32 11, i32 0, metadata !20, null}
-!55 = metadata !{i32 14, i32 0, metadata !21, null}
-!56 = metadata !{i32 20, i32 0, metadata !22, null}
-!57 = metadata !{i32 25, i32 0, metadata !24, null}
-!58 = metadata !{i32 26, i32 0, metadata !24, null}
-!59 = metadata !{i32 39, i32 0, metadata !27, null}
-!60 = metadata !{i32 55, i32 0, metadata !31, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !3, !19, !32, !45} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/pubnames.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"pubnames.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !15}
+!4 = !{!"0x13\00C\001\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6, !8, !12}
+!6 = !{!"0xd\00static_member_variable\004\000\000\000\004096", !1, !"_ZTS1C", !7, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = !{!"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\002\000\000\000\006\00256\000\002", !1, !"_ZTS1C", !9, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 2] [member_function]
+!9 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{null, !11}
+!11 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!12 = !{!"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\003\000\000\000\006\00256\000\003", !1, !"_ZTS1C", !13, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
+!13 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{!7}
+!15 = !{!"0x13\00D\0028\0032\0032\000\000\000", !1, !16, null, !17, null, null, !"_ZTSN2ns1DE"} ; [ DW_TAG_structure_type ] [D] [line 28, size 32, align 32, offset 0] [def] [from ]
+!16 = !{!"0x39\00ns\0023", !1, null} ; [ DW_TAG_namespace ] [ns] [line 23]
+!17 = !{!18}
+!18 = !{!"0xd\00A\0029\0032\0032\000\000", !1, !"_ZTSN2ns1DE", !7} ; [ DW_TAG_member ] [A] [line 29, size 32, align 32, offset 0] [from int]
+!19 = !{!20, !21, !22, !24, !27, !31}
+!20 = !{!"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\009\000\001\000\006\00256\000\009", !1, !"_ZTS1C", !9, null, void (%struct.C*)* @_ZN1C15member_functionEv, null, !8, !2} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
+!21 = !{!"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\0013\000\001\000\006\00256\000\0013", !1, !"_ZTS1C", !13, null, i32 ()* @_ZN1C22static_member_functionEv, null, !12, !2} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
+!22 = !{!"0x2e\00global_function\00global_function\00_Z15global_functionv\0019\000\001\000\006\00256\000\0019", !1, !23, !13, null, i32 ()* @_Z15global_functionv, null, null, !2} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
+!23 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/pubnames.cpp]
+!24 = !{!"0x2e\00global_namespace_function\00global_namespace_function\00_ZN2ns25global_namespace_functionEv\0024\000\001\000\006\00256\000\0024", !1, !16, !25, null, void ()* @_ZN2ns25global_namespace_functionEv, null, null, !2} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
+!25 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !26, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!26 = !{null}
+!27 = !{!"0x2e\00f3\00f3\00_Z2f3v\0037\000\001\000\006\00256\000\0037", !1, !23, !28, null, i32* ()* @_Z2f3v, null, null, !2} ; [ DW_TAG_subprogram ] [line 37] [def] [f3]
+!28 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!29 = !{!30}
+!30 = !{!"0xf\00\000\0064\0064\000\000", null, null, !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!31 = !{!"0x2e\00f7\00f7\00_Z2f7v\0054\000\001\000\006\00256\000\0054", !1, !23, !13, null, i32 ()* @_Z2f7v, null, null, !2} ; [ DW_TAG_subprogram ] [line 54] [def] [f7]
+!32 = !{!33, !34, !35, !36, !37, !38, !41, !44}
+!33 = !{!"0x34\00static_member_variable\00static_member_variable\00_ZN1C22static_member_variableE\007\000\001", null, !23, !7, i32* @_ZN1C22static_member_variableE, !6} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
+!34 = !{!"0x34\00global_variable\00global_variable\00\0017\000\001", null, !23, !"_ZTS1C", %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
+!35 = !{!"0x34\00global_namespace_variable\00global_namespace_variable\00_ZN2ns25global_namespace_variableE\0027\000\001", !16, !23, !7, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
+!36 = !{!"0x34\00d\00d\00_ZN2ns1dE\0030\000\001", !16, !23, !"_ZTSN2ns1DE", %"struct.ns::D"* @_ZN2ns1dE, null} ; [ DW_TAG_variable ] [d] [line 30] [def]
+!37 = !{!"0x34\00z\00z\00\0038\001\001", !27, !23, !7, i32* @_ZZ2f3vE1z, null} ; [ DW_TAG_variable ] [z] [line 38] [local] [def]
+!38 = !{!"0x34\00c\00c\00_ZN5outer12_GLOBAL__N_11cE\0050\001\001", !39, !23, !7, i32* @_ZN5outer12_GLOBAL__N_11cE, null} ; [ DW_TAG_variable ] [c] [line 50] [local] [def]
+!39 = !{!"0x39\00\0049", !1, !40} ; [ DW_TAG_namespace ] [line 49]
+!40 = !{!"0x39\00outer\0048", !1, null} ; [ DW_TAG_namespace ] [outer] [line 48]
+!41 = !{!"0x34\00b\00b\00_ZN12_GLOBAL__N_15inner1bE\0044\001\001", !42, !23, !7, i32* @_ZN12_GLOBAL__N_15inner1bE, null} ; [ DW_TAG_variable ] [b] [line 44] [local] [def]
+!42 = !{!"0x39\00inner\0043", !1, !43} ; [ DW_TAG_namespace ] [inner] [line 43]
+!43 = !{!"0x39\00\0033", !1, null} ; [ DW_TAG_namespace ] [line 33]
+!44 = !{!"0x34\00i\00i\00_ZN12_GLOBAL__N_11iE\0034\001\001", !43, !23, !7, i32* @_ZN12_GLOBAL__N_11iE, null} ; [ DW_TAG_variable ] [i] [line 34] [local] [def]
+!45 = !{!46}
+!46 = !{!"0x3a\0040\00", !40, !39} ; [ DW_TAG_imported_module ]
+!47 = !{i32 2, !"Dwarf Version", i32 4}
+!48 = !{i32 2, !"Debug Info Version", i32 2}
+!49 = !{!"clang version 3.5.0 "}
+!50 = !{!"0x101\00this\0016777216\001088", !20, null, !51} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!51 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!52 = !MDLocation(line: 0, scope: !20)
+!53 = !MDLocation(line: 10, scope: !20)
+!54 = !MDLocation(line: 11, scope: !20)
+!55 = !MDLocation(line: 14, scope: !21)
+!56 = !MDLocation(line: 20, scope: !22)
+!57 = !MDLocation(line: 25, scope: !24)
+!58 = !MDLocation(line: 26, scope: !24)
+!59 = !MDLocation(line: 39, scope: !27)
+!60 = !MDLocation(line: 55, scope: !31)
diff --git a/test/DebugInfo/X86/inline-member-function.ll b/test/DebugInfo/X86/inline-member-function.ll
index 214fdba..68a211f 100644
--- a/test/DebugInfo/X86/inline-member-function.ll
+++ b/test/DebugInfo/X86/inline-member-function.ll
@@ -46,9 +46,9 @@ entry:
   store i32 0, i32* %retval
   %0 = load i32* @i, align 4, !dbg !23
   store %struct.foo* %tmp, %struct.foo** %this.addr.i, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr.i}, metadata !24, metadata !{metadata !"0x102"}), !dbg !26
+  call void @llvm.dbg.declare(metadata %struct.foo** %this.addr.i, metadata !24, metadata !{!"0x102"}), !dbg !26
   store i32 %0, i32* %x.addr.i, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr.i}, metadata !27, metadata !{metadata !"0x102"}), !dbg !28
+  call void @llvm.dbg.declare(metadata i32* %x.addr.i, metadata !27, metadata !{!"0x102"}), !dbg !28
   %this1.i = load %struct.foo** %this.addr.i
   %1 = load i32* %x.addr.i, align 4, !dbg !28
   %add.i = add nsw i32 %1, 2, !dbg !28
@@ -65,32 +65,32 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!20, !21}
 !llvm.ident = !{!22}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !12, metadata !18, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"inline.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x2e\00func\00func\00_ZN3foo4funcEi\002\000\000\000\006\00256\000\002", metadata !1, metadata !"_ZTS3foo", metadata !7, null, null, null, i32 0, metadata !11} ; [ DW_TAG_subprogram ] [line 2] [func]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !10, metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
-!11 = metadata !{i32 786468}
-!12 = metadata !{metadata !13, metadata !17}
-!13 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !1, metadata !14, metadata !15, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!14 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline.cpp]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{metadata !9}
-!17 = metadata !{metadata !"0x2e\00func\00func\00_ZN3foo4funcEi\002\000\001\000\006\00256\000\002", metadata !1, metadata !"_ZTS3foo", metadata !7, null, null, null, metadata !6, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [func]
-!18 = metadata !{metadata !19}
-!19 = metadata !{metadata !"0x34\00i\00i\00\005\000\001", null, metadata !14, metadata !9, i32* @i, null} ; [ DW_TAG_variable ] [i] [line 5] [def]
-!20 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!22 = metadata !{metadata !"clang version 3.5.0 "}
-!23 = metadata !{i32 8, i32 0, metadata !13, null}
-!24 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !17, null, metadata !25} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
-!26 = metadata !{i32 0, i32 0, metadata !17, metadata !23}
-!27 = metadata !{metadata !"0x101\00x\0033554434\000", metadata !17, metadata !14, metadata !9} ; [ DW_TAG_arg_variable ] [x] [line 2]
-!28 = metadata !{i32 2, i32 0, metadata !17, metadata !23}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !3, !12, !18, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"inline.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00foo\001\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0x2e\00func\00func\00_ZN3foo4funcEi\002\000\000\000\006\00256\000\002", !1, !"_ZTS3foo", !7, null, null, null, i32 0, !11} ; [ DW_TAG_subprogram ] [line 2] [func]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !10, !9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
+!11 = !{i32 786468}
+!12 = !{!13, !17}
+!13 = !{!"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", !1, !14, !15, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!14 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline.cpp]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{!9}
+!17 = !{!"0x2e\00func\00func\00_ZN3foo4funcEi\002\000\001\000\006\00256\000\002", !1, !"_ZTS3foo", !7, null, null, null, !6, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [func]
+!18 = !{!19}
+!19 = !{!"0x34\00i\00i\00\005\000\001", null, !14, !9, i32* @i, null} ; [ DW_TAG_variable ] [i] [line 5] [def]
+!20 = !{i32 2, !"Dwarf Version", i32 4}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
+!22 = !{!"clang version 3.5.0 "}
+!23 = !MDLocation(line: 8, scope: !13)
+!24 = !{!"0x101\00this\0016777216\001088", !17, null, !25} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!25 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
+!26 = !MDLocation(line: 0, scope: !17, inlinedAt: !23)
+!27 = !{!"0x101\00x\0033554434\000", !17, !14, !9} ; [ DW_TAG_arg_variable ] [x] [line 2]
+!28 = !MDLocation(line: 2, scope: !17, inlinedAt: !23)
diff --git a/test/DebugInfo/X86/inline-seldag-test.ll b/test/DebugInfo/X86/inline-seldag-test.ll
index 278604d..8c10e3a 100644
--- a/test/DebugInfo/X86/inline-seldag-test.ll
+++ b/test/DebugInfo/X86/inline-seldag-test.ll
@@ -27,10 +27,10 @@ define void @func() #0 {
 entry:
   %y.addr.i = alloca i32, align 4
   %x = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x}, metadata !15, metadata !{metadata !"0x102"}), !dbg !17
+  call void @llvm.dbg.declare(metadata i32* %x, metadata !15, metadata !{!"0x102"}), !dbg !17
   %0 = load volatile i32* %x, align 4, !dbg !18
   store i32 %0, i32* %y.addr.i, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %y.addr.i}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
+  call void @llvm.dbg.declare(metadata i32* %y.addr.i, metadata !19, metadata !{!"0x102"}), !dbg !20
   %1 = load i32* %y.addr.i, align 4, !dbg !21
   %tobool.i = icmp ne i32 %1, 0, !dbg !21
   %cond.i = select i1 %tobool.i, i32 4, i32 7, !dbg !21
@@ -48,26 +48,26 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!12, !13}
 !llvm.ident = !{!14}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline-seldag-test.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"inline-seldag-test.c", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{metadata !"0x2e\00func\00func\00\004\000\001\000\006\000\000\004", metadata !1, metadata !5, metadata !6, null, void ()* @func, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [func]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline-seldag-test.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !9, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{metadata !11, metadata !11}
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!14 = metadata !{metadata !"clang version 3.5.0 "}
-!15 = metadata !{metadata !"0x100\00x\005\000", metadata !4, metadata !5, metadata !16} ; [ DW_TAG_auto_variable ] [x] [line 5]
-!16 = metadata !{metadata !"0x35\00\000\000\000\000\000", null, null, metadata !11} ; [ DW_TAG_volatile_type ] [line 0, size 0, align 0, offset 0] [from int]
-!17 = metadata !{i32 5, i32 0, metadata !4, null}
-!18 = metadata !{i32 6, i32 7, metadata !4, null}
-!19 = metadata !{metadata !"0x101\00y\0016777217\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [y] [line 1]
-!20 = metadata !{i32 1, i32 0, metadata !8, metadata !18}
-!21 = metadata !{i32 2, i32 0, metadata !8, metadata !18}
-!22 = metadata !{i32 7, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline-seldag-test.c] [DW_LANG_C99]
+!1 = !{!"inline-seldag-test.c", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !8}
+!4 = !{!"0x2e\00func\00func\00\004\000\001\000\006\000\000\004", !1, !5, !6, null, void ()* @func, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [func]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline-seldag-test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!"0x2e\00f\00f\00\001\000\001\000\006\00256\000\001", !1, !5, !9, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!9 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{!11, !11}
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{i32 2, !"Dwarf Version", i32 4}
+!13 = !{i32 1, !"Debug Info Version", i32 2}
+!14 = !{!"clang version 3.5.0 "}
+!15 = !{!"0x100\00x\005\000", !4, !5, !16} ; [ DW_TAG_auto_variable ] [x] [line 5]
+!16 = !{!"0x35\00\000\000\000\000\000", null, null, !11} ; [ DW_TAG_volatile_type ] [line 0, size 0, align 0, offset 0] [from int]
+!17 = !MDLocation(line: 5, scope: !4)
+!18 = !MDLocation(line: 6, column: 7, scope: !4)
+!19 = !{!"0x101\00y\0016777217\000", !8, !5, !11} ; [ DW_TAG_arg_variable ] [y] [line 1]
+!20 = !MDLocation(line: 1, scope: !8, inlinedAt: !18)
+!21 = !MDLocation(line: 2, scope: !8, inlinedAt: !18)
+!22 = !MDLocation(line: 7, scope: !4)
diff --git a/test/DebugInfo/X86/instcombine-instrinsics.ll b/test/DebugInfo/X86/instcombine-instrinsics.ll
index a2cc35e..12b99a3 100644
--- a/test/DebugInfo/X86/instcombine-instrinsics.ll
+++ b/test/DebugInfo/X86/instcombine-instrinsics.ll
@@ -2,8 +2,8 @@
 ; Verify that we emit the same intrinsic at most once.
 ; rdar://problem/13056109
 ;
-; CHECK: call void @llvm.dbg.value(metadata !{%struct.i14** %p}
-; CHECK-NOT: call void @llvm.dbg.value(metadata !{%struct.i14** %p}
+; CHECK: call void @llvm.dbg.value(metadata %struct.i14** %p
+; CHECK-NOT: call void @llvm.dbg.value(metadata %struct.i14** %p
 ; CHECK-NEXT: call i32 @foo
 ; CHECK: ret
 ;
@@ -30,7 +30,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 ; Function Attrs: nounwind ssp uwtable
 define void @init() #0 {
   %p = alloca %struct.i14*, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.i14** %p}, metadata !11, metadata !{metadata !"0x102"}), !dbg !18
+  call void @llvm.dbg.declare(metadata %struct.i14** %p, metadata !11, metadata !{!"0x102"}), !dbg !18
   store %struct.i14* null, %struct.i14** %p, align 8, !dbg !18
   %1 = call i32 @foo(%struct.i14** %p), !dbg !19
   %2 = load %struct.i14** %p, align 8, !dbg !20
@@ -54,26 +54,26 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [instcombine_intrinsics.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"instcombine_intrinsics.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00init\00init\00\007\000\001\000\006\000\000\007", metadata !1, metadata !5, metadata !6, null, void ()* @init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [init]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [instcombine_intrinsics.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.5.0 "}
-!11 = metadata !{metadata !"0x100\00p\008\000", metadata !4, metadata !5, metadata !12} ; [ DW_TAG_auto_variable ] [p] [line 8]
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from i14]
-!13 = metadata !{metadata !"0x16\00i14\003\000\000\000\000", metadata !1, null, metadata !14} ; [ DW_TAG_typedef ] [i14] [line 3, size 0, align 0, offset 0] [from ]
-!14 = metadata !{metadata !"0x13\00\001\0064\0064\000\000\000", metadata !1, null, null, metadata !15, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 64, align 64, offset 0] [def] [from ]
-!15 = metadata !{metadata !16}
-!16 = metadata !{metadata !"0xd\00i\002\0064\0064\000\000", metadata !1, metadata !14, metadata !17} ; [ DW_TAG_member ] [i] [line 2, size 64, align 64, offset 0] [from long int]
-!17 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
-!18 = metadata !{i32 8, i32 0, metadata !4, null}
-!19 = metadata !{i32 9, i32 0, metadata !4, null}
-!20 = metadata !{i32 10, i32 0, metadata !4, null}
-!21 = metadata !{i32 11, i32 0, metadata !4, null}
-!22 = metadata !{i32 12, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [instcombine_intrinsics.c] [DW_LANG_C99]
+!1 = !{!"instcombine_intrinsics.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00init\00init\00\007\000\001\000\006\000\000\007", !1, !5, !6, null, void ()* @init, null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [init]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [instcombine_intrinsics.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{i32 2, !"Dwarf Version", i32 2}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.5.0 "}
+!11 = !{!"0x100\00p\008\000", !4, !5, !12} ; [ DW_TAG_auto_variable ] [p] [line 8]
+!12 = !{!"0xf\00\000\0064\0064\000\000", null, null, !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from i14]
+!13 = !{!"0x16\00i14\003\000\000\000\000", !1, null, !14} ; [ DW_TAG_typedef ] [i14] [line 3, size 0, align 0, offset 0] [from ]
+!14 = !{!"0x13\00\001\0064\0064\000\000\000", !1, null, null, !15, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 64, align 64, offset 0] [def] [from ]
+!15 = !{!16}
+!16 = !{!"0xd\00i\002\0064\0064\000\000", !1, !14, !17} ; [ DW_TAG_member ] [i] [line 2, size 64, align 64, offset 0] [from long int]
+!17 = !{!"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!18 = !MDLocation(line: 8, scope: !4)
+!19 = !MDLocation(line: 9, scope: !4)
+!20 = !MDLocation(line: 10, scope: !4)
+!21 = !MDLocation(line: 11, scope: !4)
+!22 = !MDLocation(line: 12, scope: !4)
diff --git a/test/DebugInfo/X86/lexical_block.ll b/test/DebugInfo/X86/lexical_block.ll
index e2832a0..a9e377a 100644
--- a/test/DebugInfo/X86/lexical_block.ll
+++ b/test/DebugInfo/X86/lexical_block.ll
@@ -25,7 +25,7 @@
 define void @_Z1bv() #0 {
 entry:
   %i = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !11, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata i32* %i, metadata !11, metadata !{!"0x102"}), !dbg !14
   store i32 3, i32* %i, align 4, !dbg !14
   %0 = load i32* %i, align 4, !dbg !14
   %tobool = icmp ne i32 %0, 0, !dbg !14
@@ -48,20 +48,20 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/lexical_block.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"lexical_block.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00b\00b\00_Z1bv\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @_Z1bv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [b]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/lexical_block.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.5.0 "}
-!11 = metadata !{metadata !"0x100\00i\002\000", metadata !12, metadata !5, metadata !13} ; [ DW_TAG_auto_variable ] [i] [line 2]
-!12 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/lexical_block.cpp]
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{i32 2, i32 0, metadata !12, null}
-!15 = metadata !{i32 3, i32 0, metadata !12, null}
-!16 = metadata !{i32 4, i32 0, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/lexical_block.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"lexical_block.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00b\00b\00_Z1bv\001\000\001\000\006\00256\000\001", !1, !5, !6, null, void ()* @_Z1bv, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [b]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/lexical_block.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.5.0 "}
+!11 = !{!"0x100\00i\002\000", !12, !5, !13} ; [ DW_TAG_auto_variable ] [i] [line 2]
+!12 = !{!"0xb\002\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/lexical_block.cpp]
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = !MDLocation(line: 2, scope: !12)
+!15 = !MDLocation(line: 3, scope: !12)
+!16 = !MDLocation(line: 4, scope: !4)
diff --git a/test/DebugInfo/X86/line-info.ll b/test/DebugInfo/X86/line-info.ll
index 8e0afee..e436426 100644
--- a/test/DebugInfo/X86/line-info.ll
+++ b/test/DebugInfo/X86/line-info.ll
@@ -18,7 +18,7 @@ define i32 @foo(i32 %x) #0 {
 entry:
   %x.addr = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !14, metadata !{metadata !"0x102"}), !dbg !15
+  call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !14, metadata !{!"0x102"}), !dbg !15
   %0 = load i32* %x.addr, align 4, !dbg !16
   %inc = add nsw i32 %0, 1, !dbg !16
   store i32 %inc, i32* %x.addr, align 4, !dbg !16
@@ -38,23 +38,23 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!19}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2,  metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/list0.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"list0.c", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !10}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !5, metadata !6, metadata !7, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"./list0.h", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/./list0.h]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x2e\00main\00main\00\002\000\001\000\006\000\000\002", metadata !1, metadata !11, metadata !12, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
-!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/list0.c]
-!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!13 = metadata !{metadata !9}
-!14 = metadata !{metadata !"0x101\00x\0016777217\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [x] [line 1]
-!15 = metadata !{i32 1, i32 0, metadata !4, null}
-!16 = metadata !{i32 2, i32 0, metadata !4, null}
-!17 = metadata !{i32 3, i32 0, metadata !18, null}
-!18 = metadata !{metadata !"0xb\000", metadata !11, metadata !10} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/list0.c]
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.3 \000\00\000\00\000", !1, !2, !2, !3, !2,  !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/list0.c] [DW_LANG_C99]
+!1 = !{!"list0.c", !"/usr/local/google/home/blaikie/dev/scratch"}
+!2 = !{}
+!3 = !{!4, !10}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !5, !6, !7, null, i32 (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"./list0.h", !"/usr/local/google/home/blaikie/dev/scratch"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/./list0.h]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x2e\00main\00main\00\002\000\001\000\006\000\000\002", !1, !11, !12, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
+!11 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/list0.c]
+!12 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = !{!9}
+!14 = !{!"0x101\00x\0016777217\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [x] [line 1]
+!15 = !MDLocation(line: 1, scope: !4)
+!16 = !MDLocation(line: 2, scope: !4)
+!17 = !MDLocation(line: 3, scope: !18)
+!18 = !{!"0xb\000", !11, !10} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/list0.c]
+!19 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/line.test b/test/DebugInfo/X86/line.test
new file mode 100644
index 0000000..24d9c5c
--- /dev/null
+++ b/test/DebugInfo/X86/line.test
@@ -0,0 +1 @@
+; RUN: llc -mtriple=x86_64-linux -O0 -filetype=asm < %S/../Inputs/line.ll | FileCheck %S/../Inputs/line.ll
diff --git a/test/DebugInfo/X86/linkage-name.ll b/test/DebugInfo/X86/linkage-name.ll
index f687078..187ff8b 100644
--- a/test/DebugInfo/X86/linkage-name.ll
+++ b/test/DebugInfo/X86/linkage-name.ll
@@ -14,9 +14,9 @@ entry:
   %this.addr = alloca %class.A*, align 8
   %b.addr = alloca i32, align 4
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !21, metadata !{metadata !"0x102"}), !dbg !23
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !21, metadata !{!"0x102"}), !dbg !23
   store i32 %b, i32* %b.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata i32* %b.addr, metadata !24, metadata !{!"0x102"}), !dbg !25
   %this1 = load %class.A** %this.addr
   %0 = load i32* %b.addr, align 4, !dbg !26
   ret i32 %0, !dbg !26
@@ -27,26 +27,26 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.1 (trunk 152691) (llvm/trunk 152692)\000\00\000\00\000", metadata !28, metadata !1, metadata !1, metadata !3, metadata !18,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00a\00a\00_ZN1A1aEi\005\000\001\000\006\00256\000\005", metadata !6, null, metadata !7, null, i32 (%class.A*, i32)* @_ZN1A1aEi, null, metadata !13, null} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !10, metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !"0x2\00A\001\008\008\000\000\000", metadata !28, null, null, metadata !12, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x2e\00a\00a\00_ZN1A1aEi\002\000\000\000\006\00257\000\000", metadata !6, metadata !11, metadata !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
-!18 = metadata !{metadata !20}
-!20 = metadata !{metadata !"0x34\00a\00a\00\009\000\001", null, metadata !6, metadata !11, %class.A* @a, null} ; [ DW_TAG_variable ]
-!21 = metadata !{metadata !"0x101\00this\0016777221\0064", metadata !5, metadata !6, metadata !22} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 5, i32 8, metadata !5, null}
-!24 = metadata !{metadata !"0x101\00b\0033554437\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
-!25 = metadata !{i32 5, i32 14, metadata !5, null}
-!26 = metadata !{i32 6, i32 4, metadata !27, null}
-!27 = metadata !{metadata !"0xb\005\0017\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ]
-!28 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo"}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.1 (trunk 152691) (llvm/trunk 152692)\000\00\000\00\000", !28, !1, !1, !3, !18,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00a\00a\00_ZN1A1aEi\005\000\001\000\006\00256\000\005", !6, null, !7, null, i32 (%class.A*, i32)* @_ZN1A1aEi, null, !13, null} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !28} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !10, !9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !11} ; [ DW_TAG_pointer_type ]
+!11 = !{!"0x2\00A\001\008\008\000\000\000", !28, null, null, !12, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
+!12 = !{!13}
+!13 = !{!"0x2e\00a\00a\00_ZN1A1aEi\002\000\000\000\006\00257\000\000", !6, !11, !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
+!18 = !{!20}
+!20 = !{!"0x34\00a\00a\00\009\000\001", null, !6, !11, %class.A* @a, null} ; [ DW_TAG_variable ]
+!21 = !{!"0x101\00this\0016777221\0064", !5, !6, !22} ; [ DW_TAG_arg_variable ]
+!22 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ]
+!23 = !MDLocation(line: 5, column: 8, scope: !5)
+!24 = !{!"0x101\00b\0033554437\000", !5, !6, !9} ; [ DW_TAG_arg_variable ]
+!25 = !MDLocation(line: 5, column: 14, scope: !5)
+!26 = !MDLocation(line: 6, column: 4, scope: !27)
+!27 = !{!"0xb\005\0017\000", !6, !5} ; [ DW_TAG_lexical_block ]
+!28 = !{!"foo.cpp", !"/Users/echristo"}
+!29 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/low-pc-cu.ll b/test/DebugInfo/X86/low-pc-cu.ll
index 7fd8f19..cac572e 100644
--- a/test/DebugInfo/X86/low-pc-cu.ll
+++ b/test/DebugInfo/X86/low-pc-cu.ll
@@ -32,15 +32,15 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/z.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"z.c", metadata !"/usr/local/google/home/echristo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00z\00z\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @z, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [z]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/z.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
-!11 = metadata !{i32 1, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/z.c] [DW_LANG_C99]
+!1 = !{!"z.c", !"/usr/local/google/home/echristo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00z\00z\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, void ()* @z, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [z]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/z.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
+!11 = !MDLocation(line: 1, scope: !4)
diff --git a/test/DebugInfo/X86/memberfnptr.ll b/test/DebugInfo/X86/memberfnptr.ll
new file mode 100644
index 0000000..6d34601
--- /dev/null
+++ b/test/DebugInfo/X86/memberfnptr.ll
@@ -0,0 +1,44 @@
+; struct A {
+;   void foo();
+; };
+;  
+; void (A::*p)() = &A::foo;
+;
+; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; Check that the member function pointer is emitted without a DW_AT_size attribute.
+; CHECK: DW_TAG_ptr_to_member_type
+; CHECK-NOT: DW_AT_{{.*}}size
+; CHECK: DW_TAG
+;
+; ModuleID = 'memberfnptr.cpp'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+%struct.A = type { i8 }
+
+@p = global { i64, i64 } { i64 ptrtoint (void (%struct.A*)* @_ZN1A3fooEv to i64), i64 0 }, align 8
+
+declare void @_ZN1A3fooEv(%struct.A*)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !{!"0x11\004\00clang version 3.6.0 \000\00\000\00\001", !1, !2, !3, !2, !10, !2} ; [ DW_TAG_compile_unit ] [/memberfnptr.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"memberfnptr.cpp", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00A\001\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0x2e\00foo\00foo\00_ZN1A3fooEv\002\000\000\000\000\00256\000\002", !1, !"_ZTS1A", !7, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [foo]
+!7 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9}
+!9 = !{!"0xf\00\000\0064\0064\000\001088\00", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!10 = !{!11}
+!11 = !{!"0x34\00p\00p\00\005\000\001", null, !12, !13, { i64, i64 }* @p, null} ; [ DW_TAG_variable ] [p] [line 5] [def]
+!12 = !{!"0x29", !1}                              ; [ DW_TAG_file_type ] [/memberfnptr.cpp]
+!13 = !{!"0x1f\00\000\0064\000\000\000", null, null, !7, !"_ZTS1A"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 64, align 0, offset 0] [from ]
+!14 = !{i32 2, !"Dwarf Version", i32 2}
+!15 = !{i32 2, !"Debug Info Version", i32 2}
+!16 = !{i32 1, !"PIC Level", i32 2}
+!17 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/X86/misched-dbg-value.ll b/test/DebugInfo/X86/misched-dbg-value.ll
index b2033a5..709b6b2 100644
--- a/test/DebugInfo/X86/misched-dbg-value.ll
+++ b/test/DebugInfo/X86/misched-dbg-value.ll
@@ -48,12 +48,12 @@
 
 define void @Proc8(i32* nocapture %Array1Par, [51 x i32]* nocapture %Array2Par, i32 %IntParI1, i32 %IntParI2) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32* %Array1Par}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !64
-  tail call void @llvm.dbg.value(metadata !{[51 x i32]* %Array2Par}, i64 0, metadata !24, metadata !{metadata !"0x102"}), !dbg !65
-  tail call void @llvm.dbg.value(metadata !{i32 %IntParI1}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !66
-  tail call void @llvm.dbg.value(metadata !{i32 %IntParI2}, i64 0, metadata !26, metadata !{metadata !"0x102"}), !dbg !67
+  tail call void @llvm.dbg.value(metadata i32* %Array1Par, i64 0, metadata !23, metadata !{!"0x102"}), !dbg !64
+  tail call void @llvm.dbg.value(metadata [51 x i32]* %Array2Par, i64 0, metadata !24, metadata !{!"0x102"}), !dbg !65
+  tail call void @llvm.dbg.value(metadata i32 %IntParI1, i64 0, metadata !25, metadata !{!"0x102"}), !dbg !66
+  tail call void @llvm.dbg.value(metadata i32 %IntParI2, i64 0, metadata !26, metadata !{!"0x102"}), !dbg !67
   %add = add i32 %IntParI1, 5, !dbg !68
-  tail call void @llvm.dbg.value(metadata !{i32 %add}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !68
+  tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !27, metadata !{!"0x102"}), !dbg !68
   %idxprom = sext i32 %add to i64, !dbg !69
   %arrayidx = getelementptr inbounds i32* %Array1Par, i64 %idxprom, !dbg !69
   store i32 %IntParI2, i32* %arrayidx, align 4, !dbg !69
@@ -65,7 +65,7 @@ entry:
   %idxprom7 = sext i32 %add6 to i64, !dbg !74
   %arrayidx8 = getelementptr inbounds i32* %Array1Par, i64 %idxprom7, !dbg !74
   store i32 %add, i32* %arrayidx8, align 4, !dbg !74
-  tail call void @llvm.dbg.value(metadata !{i32 %add}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !75
+  tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !28, metadata !{!"0x102"}), !dbg !75
   br label %for.body, !dbg !75
 
 for.body:                                         ; preds = %entry, %for.body
@@ -74,7 +74,7 @@ for.body:                                         ; preds = %entry, %for.body
   %arrayidx13 = getelementptr inbounds [51 x i32]* %Array2Par, i64 %idxprom, i64 %indvars.iv, !dbg !77
   store i32 %add, i32* %arrayidx13, align 4, !dbg !77
   %inc = add nsw i32 %IntIndex.046, 1, !dbg !75
-  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !75
+  tail call void @llvm.dbg.value(metadata i32 %inc, i64 0, metadata !28, metadata !{!"0x102"}), !dbg !75
   %cmp = icmp sgt i32 %inc, %add3, !dbg !75
   %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !75
   br i1 %cmp, label %for.end, label %for.body, !dbg !75
@@ -103,84 +103,84 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!83}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 175015)\001\00\000\00\001", metadata !82, metadata !1, metadata !10, metadata !11, metadata !29,  metadata !10} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/rdar_13183203/test2/dry.c] [DW_LANG_C99]
-!1 = metadata !{metadata !2}
-!2 = metadata !{metadata !"0x4\00\00128\0032\0032\000\000\000", metadata !82, null, null, metadata !4, null, null, null} ; [ DW_TAG_enumeration_type ] [line 128, size 32, align 32, offset 0] [def] [from ]
-!3 = metadata !{metadata !"0x29", metadata !82} ; [ DW_TAG_file_type ]
-!4 = metadata !{metadata !5, metadata !6, metadata !7, metadata !8, metadata !9}
-!5 = metadata !{metadata !"0x28\00Ident1\000"} ; [ DW_TAG_enumerator ] [Ident1 :: 0]
-!6 = metadata !{metadata !"0x28\00Ident2\0010000"} ; [ DW_TAG_enumerator ] [Ident2 :: 10000]
-!7 = metadata !{metadata !"0x28\00Ident3\0010001"} ; [ DW_TAG_enumerator ] [Ident3 :: 10001]
-!8 = metadata !{metadata !"0x28\00Ident4\0010002"} ; [ DW_TAG_enumerator ] [Ident4 :: 10002]
-!9 = metadata !{metadata !"0x28\00Ident5\0010003"} ; [ DW_TAG_enumerator ] [Ident5 :: 10003]
-!10 = metadata !{}
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0x2e\00Proc8\00Proc8\00\00180\000\001\000\006\000\001\00185", metadata !82, metadata !3, metadata !13, null, void (i32*, [51 x i32]*, i32, i32)* @Proc8, null, null, metadata !22} ; [ DW_TAG_subprogram ] [line 180] [def] [scope 185] [Proc8]
-!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{null, metadata !15, metadata !17, metadata !21, metadata !21}
-!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!16 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!18 = metadata !{metadata !"0x1\00\000\001632\0032\000\000", null, null, metadata !16, metadata !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1632, align 32, offset 0] [from int]
-!19 = metadata !{metadata !20}
-!20 = metadata !{metadata !"0x21\000\0051"}       ; [ DW_TAG_subrange_type ] [0, 50]
-!21 = metadata !{metadata !"0x16\00OneToFifty\00132\000\000\000\000", metadata !82, null, metadata !16} ; [ DW_TAG_typedef ] [OneToFifty] [line 132, size 0, align 0, offset 0] [from int]
-!22 = metadata !{metadata !23, metadata !24, metadata !25, metadata !26, metadata !27, metadata !28}
-!23 = metadata !{metadata !"0x101\00Array1Par\0016777397\000", metadata !12, metadata !3, metadata !15} ; [ DW_TAG_arg_variable ] [Array1Par] [line 181]
-!24 = metadata !{metadata !"0x101\00Array2Par\0033554614\000", metadata !12, metadata !3, metadata !17} ; [ DW_TAG_arg_variable ] [Array2Par] [line 182]
-!25 = metadata !{metadata !"0x101\00IntParI1\0050331831\000", metadata !12, metadata !3, metadata !21} ; [ DW_TAG_arg_variable ] [IntParI1] [line 183]
-!26 = metadata !{metadata !"0x101\00IntParI2\0067109048\000", metadata !12, metadata !3, metadata !21} ; [ DW_TAG_arg_variable ] [IntParI2] [line 184]
-!27 = metadata !{metadata !"0x100\00IntLoc\00186\000", metadata !12, metadata !3, metadata !21} ; [ DW_TAG_auto_variable ] [IntLoc] [line 186]
-!28 = metadata !{metadata !"0x100\00IntIndex\00187\000", metadata !12, metadata !3, metadata !21} ; [ DW_TAG_auto_variable ] [IntIndex] [line 187]
-!29 = metadata !{metadata !30, metadata !35, metadata !36, metadata !38, metadata !39, metadata !40, metadata !42, metadata !46, metadata !63}
-!30 = metadata !{metadata !"0x34\00Version\00Version\00\00111\000\001", null, metadata !3, metadata !31, [4 x i8]* @Version, null} ; [ DW_TAG_variable ] [Version] [line 111] [def]
-!31 = metadata !{metadata !"0x1\00\000\0032\008\000\000", null, null, metadata !32, metadata !33, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
-!32 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!33 = metadata !{metadata !34}
-!34 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
-!35 = metadata !{metadata !"0x34\00IntGlob\00IntGlob\00\00171\000\001", null, metadata !3, metadata !16, i32* @IntGlob, null} ; [ DW_TAG_variable ] [IntGlob] [line 171] [def]
-!36 = metadata !{metadata !"0x34\00BoolGlob\00BoolGlob\00\00172\000\001", null, metadata !3, metadata !37, i32* @BoolGlob, null} ; [ DW_TAG_variable ] [BoolGlob] [line 172] [def]
-!37 = metadata !{metadata !"0x16\00boolean\00149\000\000\000\000", metadata !82, null, metadata !16} ; [ DW_TAG_typedef ] [boolean] [line 149, size 0, align 0, offset 0] [from int]
-!38 = metadata !{metadata !"0x34\00Char1Glob\00Char1Glob\00\00173\000\001", null, metadata !3, metadata !32, i8* @Char1Glob, null} ; [ DW_TAG_variable ] [Char1Glob] [line 173] [def]
-!39 = metadata !{metadata !"0x34\00Char2Glob\00Char2Glob\00\00174\000\001", null, metadata !3, metadata !32, i8* @Char2Glob, null} ; [ DW_TAG_variable ] [Char2Glob] [line 174] [def]
-!40 = metadata !{metadata !"0x34\00Array1Glob\00Array1Glob\00\00175\000\001", null, metadata !3, metadata !41, [51 x i32]* @Array1Glob, null} ; [ DW_TAG_variable ] [Array1Glob] [line 175] [def]
-!41 = metadata !{metadata !"0x16\00Array1Dim\00135\000\000\000\000", metadata !82, null, metadata !18} ; [ DW_TAG_typedef ] [Array1Dim] [line 135, size 0, align 0, offset 0] [from ]
-!42 = metadata !{metadata !"0x34\00Array2Glob\00Array2Glob\00\00176\000\001", null, metadata !3, metadata !43, [51 x [51 x i32]]* @Array2Glob, null} ; [ DW_TAG_variable ] [Array2Glob] [line 176] [def]
-!43 = metadata !{metadata !"0x16\00Array2Dim\00136\000\000\000\000", metadata !82, null, metadata !44} ; [ DW_TAG_typedef ] [Array2Dim] [line 136, size 0, align 0, offset 0] [from ]
-!44 = metadata !{metadata !"0x1\00\000\0083232\0032\000\000", null, null, metadata !16, metadata !45, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 83232, align 32, offset 0] [from int]
-!45 = metadata !{metadata !20, metadata !20}
-!46 = metadata !{metadata !"0x34\00PtrGlb\00PtrGlb\00\00177\000\001", null, metadata !3, metadata !47, %struct.Record** @PtrGlb, null} ; [ DW_TAG_variable ] [PtrGlb] [line 177] [def]
-!47 = metadata !{metadata !"0x16\00RecordPtr\00148\000\000\000\000", metadata !82, null, metadata !48} ; [ DW_TAG_typedef ] [RecordPtr] [line 148, size 0, align 0, offset 0] [from ]
-!48 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !49} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from RecordType]
-!49 = metadata !{metadata !"0x16\00RecordType\00147\000\000\000\000", metadata !82, null, metadata !50} ; [ DW_TAG_typedef ] [RecordType] [line 147, size 0, align 0, offset 0] [from Record]
-!50 = metadata !{metadata !"0x13\00Record\00138\00448\0064\000\000\000", metadata !82, null, null, metadata !51, null, i32 0, null} ; [ DW_TAG_structure_type ] [Record] [line 138, size 448, align 64, offset 0] [def] [from ]
-!51 = metadata !{metadata !52, metadata !54, metadata !56, metadata !57, metadata !58}
-!52 = metadata !{metadata !"0xd\00PtrComp\00140\0064\0064\000\000", metadata !82, metadata !50, metadata !53} ; [ DW_TAG_member ] [PtrComp] [line 140, size 64, align 64, offset 0] [from ]
-!53 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !50} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Record]
-!54 = metadata !{metadata !"0xd\00Discr\00141\0032\0032\0064\000", metadata !82, metadata !50, metadata !55} ; [ DW_TAG_member ] [Discr] [line 141, size 32, align 32, offset 64] [from Enumeration]
-!55 = metadata !{metadata !"0x16\00Enumeration\00128\000\000\000\000", metadata !82, null, metadata !2} ; [ DW_TAG_typedef ] [Enumeration] [line 128, size 0, align 0, offset 0] [from ]
-!56 = metadata !{metadata !"0xd\00EnumComp\00142\0032\0032\0096\000", metadata !82, metadata !50, metadata !55} ; [ DW_TAG_member ] [EnumComp] [line 142, size 32, align 32, offset 96] [from Enumeration]
-!57 = metadata !{metadata !"0xd\00IntComp\00143\0032\0032\00128\000", metadata !82, metadata !50, metadata !21} ; [ DW_TAG_member ] [IntComp] [line 143, size 32, align 32, offset 128] [from OneToFifty]
-!58 = metadata !{metadata !"0xd\00StringComp\00144\00248\008\00160\000", metadata !82, metadata !50, metadata !59} ; [ DW_TAG_member ] [StringComp] [line 144, size 248, align 8, offset 160] [from String30]
-!59 = metadata !{metadata !"0x16\00String30\00134\000\000\000\000", metadata !82, null, metadata !60} ; [ DW_TAG_typedef ] [String30] [line 134, size 0, align 0, offset 0] [from ]
-!60 = metadata !{metadata !"0x1\00\000\00248\008\000\000", null, null, metadata !32, metadata !61, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 248, align 8, offset 0] [from char]
-!61 = metadata !{metadata !62}
-!62 = metadata !{metadata !"0x21\000\0031"}       ; [ DW_TAG_subrange_type ] [0, 30]
-!63 = metadata !{metadata !"0x34\00PtrGlbNext\00PtrGlbNext\00\00178\000\001", null, metadata !3, metadata !47, %struct.Record** @PtrGlbNext, null} ; [ DW_TAG_variable ] [PtrGlbNext] [line 178] [def]
-!64 = metadata !{i32 181, i32 0, metadata !12, null}
-!65 = metadata !{i32 182, i32 0, metadata !12, null}
-!66 = metadata !{i32 183, i32 0, metadata !12, null}
-!67 = metadata !{i32 184, i32 0, metadata !12, null}
-!68 = metadata !{i32 189, i32 0, metadata !12, null}
-!69 = metadata !{i32 190, i32 0, metadata !12, null}
-!73 = metadata !{i32 191, i32 0, metadata !12, null}
-!74 = metadata !{i32 192, i32 0, metadata !12, null}
-!75 = metadata !{i32 193, i32 0, metadata !76, null}
-!76 = metadata !{metadata !"0xb\00193\000\000", metadata !82, metadata !12} ; [ DW_TAG_lexical_block ] [/Users/manmanren/test-Nov/rdar_13183203/test2/dry.c]
-!77 = metadata !{i32 194, i32 0, metadata !76, null}
-!78 = metadata !{i32 195, i32 0, metadata !12, null}
-!79 = metadata !{i32 196, i32 0, metadata !12, null}
-!80 = metadata !{i32 197, i32 0, metadata !12, null}
-!81 = metadata !{i32 198, i32 0, metadata !12, null}
-!82 = metadata !{metadata !"dry.c", metadata !"/Users/manmanren/test-Nov/rdar_13183203/test2"}
-!83 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.3 (trunk 175015)\001\00\000\00\001", !82, !1, !10, !11, !29,  !10} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/rdar_13183203/test2/dry.c] [DW_LANG_C99]
+!1 = !{!2}
+!2 = !{!"0x4\00\00128\0032\0032\000\000\000", !82, null, null, !4, null, null, null} ; [ DW_TAG_enumeration_type ] [line 128, size 32, align 32, offset 0] [def] [from ]
+!3 = !{!"0x29", !82} ; [ DW_TAG_file_type ]
+!4 = !{!5, !6, !7, !8, !9}
+!5 = !{!"0x28\00Ident1\000"} ; [ DW_TAG_enumerator ] [Ident1 :: 0]
+!6 = !{!"0x28\00Ident2\0010000"} ; [ DW_TAG_enumerator ] [Ident2 :: 10000]
+!7 = !{!"0x28\00Ident3\0010001"} ; [ DW_TAG_enumerator ] [Ident3 :: 10001]
+!8 = !{!"0x28\00Ident4\0010002"} ; [ DW_TAG_enumerator ] [Ident4 :: 10002]
+!9 = !{!"0x28\00Ident5\0010003"} ; [ DW_TAG_enumerator ] [Ident5 :: 10003]
+!10 = !{}
+!11 = !{!12}
+!12 = !{!"0x2e\00Proc8\00Proc8\00\00180\000\001\000\006\000\001\00185", !82, !3, !13, null, void (i32*, [51 x i32]*, i32, i32)* @Proc8, null, null, !22} ; [ DW_TAG_subprogram ] [line 180] [def] [scope 185] [Proc8]
+!13 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{null, !15, !17, !21, !21}
+!15 = !{!"0xf\00\000\0064\0064\000\000", null, null, !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!16 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!17 = !{!"0xf\00\000\0064\0064\000\000", null, null, !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!18 = !{!"0x1\00\000\001632\0032\000\000", null, null, !16, !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1632, align 32, offset 0] [from int]
+!19 = !{!20}
+!20 = !{!"0x21\000\0051"}       ; [ DW_TAG_subrange_type ] [0, 50]
+!21 = !{!"0x16\00OneToFifty\00132\000\000\000\000", !82, null, !16} ; [ DW_TAG_typedef ] [OneToFifty] [line 132, size 0, align 0, offset 0] [from int]
+!22 = !{!23, !24, !25, !26, !27, !28}
+!23 = !{!"0x101\00Array1Par\0016777397\000", !12, !3, !15} ; [ DW_TAG_arg_variable ] [Array1Par] [line 181]
+!24 = !{!"0x101\00Array2Par\0033554614\000", !12, !3, !17} ; [ DW_TAG_arg_variable ] [Array2Par] [line 182]
+!25 = !{!"0x101\00IntParI1\0050331831\000", !12, !3, !21} ; [ DW_TAG_arg_variable ] [IntParI1] [line 183]
+!26 = !{!"0x101\00IntParI2\0067109048\000", !12, !3, !21} ; [ DW_TAG_arg_variable ] [IntParI2] [line 184]
+!27 = !{!"0x100\00IntLoc\00186\000", !12, !3, !21} ; [ DW_TAG_auto_variable ] [IntLoc] [line 186]
+!28 = !{!"0x100\00IntIndex\00187\000", !12, !3, !21} ; [ DW_TAG_auto_variable ] [IntIndex] [line 187]
+!29 = !{!30, !35, !36, !38, !39, !40, !42, !46, !63}
+!30 = !{!"0x34\00Version\00Version\00\00111\000\001", null, !3, !31, [4 x i8]* @Version, null} ; [ DW_TAG_variable ] [Version] [line 111] [def]
+!31 = !{!"0x1\00\000\0032\008\000\000", null, null, !32, !33, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
+!32 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!33 = !{!34}
+!34 = !{!"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
+!35 = !{!"0x34\00IntGlob\00IntGlob\00\00171\000\001", null, !3, !16, i32* @IntGlob, null} ; [ DW_TAG_variable ] [IntGlob] [line 171] [def]
+!36 = !{!"0x34\00BoolGlob\00BoolGlob\00\00172\000\001", null, !3, !37, i32* @BoolGlob, null} ; [ DW_TAG_variable ] [BoolGlob] [line 172] [def]
+!37 = !{!"0x16\00boolean\00149\000\000\000\000", !82, null, !16} ; [ DW_TAG_typedef ] [boolean] [line 149, size 0, align 0, offset 0] [from int]
+!38 = !{!"0x34\00Char1Glob\00Char1Glob\00\00173\000\001", null, !3, !32, i8* @Char1Glob, null} ; [ DW_TAG_variable ] [Char1Glob] [line 173] [def]
+!39 = !{!"0x34\00Char2Glob\00Char2Glob\00\00174\000\001", null, !3, !32, i8* @Char2Glob, null} ; [ DW_TAG_variable ] [Char2Glob] [line 174] [def]
+!40 = !{!"0x34\00Array1Glob\00Array1Glob\00\00175\000\001", null, !3, !41, [51 x i32]* @Array1Glob, null} ; [ DW_TAG_variable ] [Array1Glob] [line 175] [def]
+!41 = !{!"0x16\00Array1Dim\00135\000\000\000\000", !82, null, !18} ; [ DW_TAG_typedef ] [Array1Dim] [line 135, size 0, align 0, offset 0] [from ]
+!42 = !{!"0x34\00Array2Glob\00Array2Glob\00\00176\000\001", null, !3, !43, [51 x [51 x i32]]* @Array2Glob, null} ; [ DW_TAG_variable ] [Array2Glob] [line 176] [def]
+!43 = !{!"0x16\00Array2Dim\00136\000\000\000\000", !82, null, !44} ; [ DW_TAG_typedef ] [Array2Dim] [line 136, size 0, align 0, offset 0] [from ]
+!44 = !{!"0x1\00\000\0083232\0032\000\000", null, null, !16, !45, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 83232, align 32, offset 0] [from int]
+!45 = !{!20, !20}
+!46 = !{!"0x34\00PtrGlb\00PtrGlb\00\00177\000\001", null, !3, !47, %struct.Record** @PtrGlb, null} ; [ DW_TAG_variable ] [PtrGlb] [line 177] [def]
+!47 = !{!"0x16\00RecordPtr\00148\000\000\000\000", !82, null, !48} ; [ DW_TAG_typedef ] [RecordPtr] [line 148, size 0, align 0, offset 0] [from ]
+!48 = !{!"0xf\00\000\0064\0064\000\000", null, null, !49} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from RecordType]
+!49 = !{!"0x16\00RecordType\00147\000\000\000\000", !82, null, !50} ; [ DW_TAG_typedef ] [RecordType] [line 147, size 0, align 0, offset 0] [from Record]
+!50 = !{!"0x13\00Record\00138\00448\0064\000\000\000", !82, null, null, !51, null, i32 0, null} ; [ DW_TAG_structure_type ] [Record] [line 138, size 448, align 64, offset 0] [def] [from ]
+!51 = !{!52, !54, !56, !57, !58}
+!52 = !{!"0xd\00PtrComp\00140\0064\0064\000\000", !82, !50, !53} ; [ DW_TAG_member ] [PtrComp] [line 140, size 64, align 64, offset 0] [from ]
+!53 = !{!"0xf\00\000\0064\0064\000\000", null, null, !50} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Record]
+!54 = !{!"0xd\00Discr\00141\0032\0032\0064\000", !82, !50, !55} ; [ DW_TAG_member ] [Discr] [line 141, size 32, align 32, offset 64] [from Enumeration]
+!55 = !{!"0x16\00Enumeration\00128\000\000\000\000", !82, null, !2} ; [ DW_TAG_typedef ] [Enumeration] [line 128, size 0, align 0, offset 0] [from ]
+!56 = !{!"0xd\00EnumComp\00142\0032\0032\0096\000", !82, !50, !55} ; [ DW_TAG_member ] [EnumComp] [line 142, size 32, align 32, offset 96] [from Enumeration]
+!57 = !{!"0xd\00IntComp\00143\0032\0032\00128\000", !82, !50, !21} ; [ DW_TAG_member ] [IntComp] [line 143, size 32, align 32, offset 128] [from OneToFifty]
+!58 = !{!"0xd\00StringComp\00144\00248\008\00160\000", !82, !50, !59} ; [ DW_TAG_member ] [StringComp] [line 144, size 248, align 8, offset 160] [from String30]
+!59 = !{!"0x16\00String30\00134\000\000\000\000", !82, null, !60} ; [ DW_TAG_typedef ] [String30] [line 134, size 0, align 0, offset 0] [from ]
+!60 = !{!"0x1\00\000\00248\008\000\000", null, null, !32, !61, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 248, align 8, offset 0] [from char]
+!61 = !{!62}
+!62 = !{!"0x21\000\0031"}       ; [ DW_TAG_subrange_type ] [0, 30]
+!63 = !{!"0x34\00PtrGlbNext\00PtrGlbNext\00\00178\000\001", null, !3, !47, %struct.Record** @PtrGlbNext, null} ; [ DW_TAG_variable ] [PtrGlbNext] [line 178] [def]
+!64 = !MDLocation(line: 181, scope: !12)
+!65 = !MDLocation(line: 182, scope: !12)
+!66 = !MDLocation(line: 183, scope: !12)
+!67 = !MDLocation(line: 184, scope: !12)
+!68 = !MDLocation(line: 189, scope: !12)
+!69 = !MDLocation(line: 190, scope: !12)
+!73 = !MDLocation(line: 191, scope: !12)
+!74 = !MDLocation(line: 192, scope: !12)
+!75 = !MDLocation(line: 193, scope: !76)
+!76 = !{!"0xb\00193\000\000", !82, !12} ; [ DW_TAG_lexical_block ] [/Users/manmanren/test-Nov/rdar_13183203/test2/dry.c]
+!77 = !MDLocation(line: 194, scope: !76)
+!78 = !MDLocation(line: 195, scope: !12)
+!79 = !MDLocation(line: 196, scope: !12)
+!80 = !MDLocation(line: 197, scope: !12)
+!81 = !MDLocation(line: 198, scope: !12)
+!82 = !{!"dry.c", !"/Users/manmanren/test-Nov/rdar_13183203/test2"}
+!83 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/multiple-aranges.ll b/test/DebugInfo/X86/multiple-aranges.ll
index 47eef2d..88b8bc9 100644
--- a/test/DebugInfo/X86/multiple-aranges.ll
+++ b/test/DebugInfo/X86/multiple-aranges.ll
@@ -42,17 +42,17 @@ target triple = "x86_64-unknown-linux-gnu"
 !llvm.dbg.cu = !{!0, !7}
 !llvm.module.flags = !{!12, !13}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/home/kayamon/test1.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test1.c", metadata !"/home/kayamon"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x34\00kittens\00kittens\00\001\000\001", null, metadata !5, metadata !6, i32* @kittens, null} ; [ DW_TAG_variable ] [kittens] [line 1] [def]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/kayamon/test1.c]
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!7 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !8, metadata !2, metadata !2, metadata !2, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ] [/home/kayamon/test2.c] [DW_LANG_C99]
-!8 = metadata !{metadata !"test2.c", metadata !"/home/kayamon"}
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x34\00rainbows\00rainbows\00\001\000\001", null, metadata !11, metadata !6, i32* @rainbows, null} ; [ DW_TAG_variable ] [rainbows] [line 1] [def]
-!11 = metadata !{metadata !"0x29", metadata !8}         ; [ DW_TAG_file_type ] [/home/kayamon/test2.c]
-!12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 \000\00\000\00\000", !1, !2, !2, !2, !3, !2} ; [ DW_TAG_compile_unit ] [/home/kayamon/test1.c] [DW_LANG_C99]
+!1 = !{!"test1.c", !"/home/kayamon"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x34\00kittens\00kittens\00\001\000\001", null, !5, !6, i32* @kittens, null} ; [ DW_TAG_variable ] [kittens] [line 1] [def]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/home/kayamon/test1.c]
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!7 = !{!"0x11\0012\00clang version 3.4 \000\00\000\00\000", !8, !2, !2, !2, !9, !2} ; [ DW_TAG_compile_unit ] [/home/kayamon/test2.c] [DW_LANG_C99]
+!8 = !{!"test2.c", !"/home/kayamon"}
+!9 = !{!10}
+!10 = !{!"0x34\00rainbows\00rainbows\00\001\000\001", null, !11, !6, i32* @rainbows, null} ; [ DW_TAG_variable ] [rainbows] [line 1] [def]
+!11 = !{!"0x29", !8}         ; [ DW_TAG_file_type ] [/home/kayamon/test2.c]
+!12 = !{i32 2, !"Dwarf Version", i32 4}
+!13 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/multiple-at-const-val.ll b/test/DebugInfo/X86/multiple-at-const-val.ll
index 55991c1..17dc2c4 100644
--- a/test/DebugInfo/X86/multiple-at-const-val.ll
+++ b/test/DebugInfo/X86/multiple-at-const-val.ll
@@ -32,32 +32,32 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!1803}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 174207)\001\00\000\00\000", metadata !1802, metadata !1, metadata !955, metadata !956, metadata !1786,  metadata !955} ; [ DW_TAG_compile_unit ] [/privite/tmp/student2.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !26}
-!4 = metadata !{metadata !"0x39\00std\0048", null, metadata !5} ; [ DW_TAG_namespace ]
-!5 = metadata !{metadata !"0x29", metadata !1801} ; [ DW_TAG_file_type ]
-!25 = metadata !{metadata !"0x28\00_S_os_fmtflags_end\0065536"} ; [ DW_TAG_enumerator ]
-!26 = metadata !{metadata !"0x4\00_Ios_Iostate\00146\0032\0032\000\000\000", metadata !1801, metadata !4, null, metadata !27, null, null, null} ; [ DW_TAG_enumeration_type ] [_Ios_Iostate] [line 146, size 32, align 32, offset 0] [def] [from ]
-!27 = metadata !{metadata !28, metadata !29, metadata !30, metadata !31, metadata !32}
-!28 = metadata !{metadata !"0x28\00_S_goodbit\000"} ; [ DW_TAG_enumerator ] [_S_goodbit :: 0]
-!29 = metadata !{metadata !"0x28\00_S_badbit\001"} ; [ DW_TAG_enumerator ] [_S_badbit :: 1]
-!30 = metadata !{metadata !"0x28\00_S_eofbit\002"} ; [ DW_TAG_enumerator ] [_S_eofbit :: 2]
-!31 = metadata !{metadata !"0x28\00_S_failbit\004"} ; [ DW_TAG_enumerator ] [_S_failbit :: 4]
-!32 = metadata !{metadata !"0x28\00_S_os_ostate_end\0065536"} ; [ DW_TAG_enumerator ] [_S_os_ostate_end :: 65536]
-!49 = metadata !{metadata !"0x2\00os_base\00200\001728\0064\000\000\000", metadata !1801, metadata !4, null, metadata !50, metadata !49, null, null} ; [ DW_TAG_class_type ] [os_base] [line 200, size 1728, align 64, offset 0] [def] [from ]
-!50 = metadata !{metadata !77}
-!54 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !55, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!55 = metadata !{metadata !56}
-!56 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!77 = metadata !{metadata !"0xd\00badbit\00331\000\000\000\004096", metadata !1801, metadata !49, metadata !78, i32 1} ; [ DW_TAG_member ]
-!78 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !79} ; [ DW_TAG_const_type ]
-!79 = metadata !{metadata !"0x16\00ostate\00327\000\000\000\000", metadata !1801, metadata !49, metadata !26} ; [ DW_TAG_typedef ]
-!955 = metadata !{}
-!956 = metadata !{metadata !960}
-!960 = metadata !{metadata !"0x2e\00main\00main\00\0073\000\001\000\006\00256\001\0073", metadata !1802, null, metadata !54, null, i32 ()* @main, null, null, metadata !955} ; [ DW_TAG_subprogram ]
-!961 = metadata !{metadata !"0x29", metadata !1802} ; [ DW_TAG_file_type ]
-!1786 = metadata !{metadata !1800}
-!1800 = metadata !{metadata !"0x34\00badbit\00badbit\00badbit\00331\001\001", metadata !5, metadata !5, metadata !78, i32 1, metadata !77} ; [ DW_TAG_variable ]
-!1801 = metadata !{metadata !"os_base.h", metadata !"/privite/tmp"}
-!1802 = metadata !{metadata !"student2.cpp", metadata !"/privite/tmp"}
-!1803 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.3 (trunk 174207)\001\00\000\00\000", !1802, !1, !955, !956, !1786,  !955} ; [ DW_TAG_compile_unit ] [/privite/tmp/student2.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!26}
+!4 = !{!"0x39\00std\0048", null, !5} ; [ DW_TAG_namespace ]
+!5 = !{!"0x29", !1801} ; [ DW_TAG_file_type ]
+!25 = !{!"0x28\00_S_os_fmtflags_end\0065536"} ; [ DW_TAG_enumerator ]
+!26 = !{!"0x4\00_Ios_Iostate\00146\0032\0032\000\000\000", !1801, !4, null, !27, null, null, null} ; [ DW_TAG_enumeration_type ] [_Ios_Iostate] [line 146, size 32, align 32, offset 0] [def] [from ]
+!27 = !{!28, !29, !30, !31, !32}
+!28 = !{!"0x28\00_S_goodbit\000"} ; [ DW_TAG_enumerator ] [_S_goodbit :: 0]
+!29 = !{!"0x28\00_S_badbit\001"} ; [ DW_TAG_enumerator ] [_S_badbit :: 1]
+!30 = !{!"0x28\00_S_eofbit\002"} ; [ DW_TAG_enumerator ] [_S_eofbit :: 2]
+!31 = !{!"0x28\00_S_failbit\004"} ; [ DW_TAG_enumerator ] [_S_failbit :: 4]
+!32 = !{!"0x28\00_S_os_ostate_end\0065536"} ; [ DW_TAG_enumerator ] [_S_os_ostate_end :: 65536]
+!49 = !{!"0x2\00os_base\00200\001728\0064\000\000\000", !1801, !4, null, !50, !49, null, null} ; [ DW_TAG_class_type ] [os_base] [line 200, size 1728, align 64, offset 0] [def] [from ]
+!50 = !{!77}
+!54 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !55, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!55 = !{!56}
+!56 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!77 = !{!"0xd\00badbit\00331\000\000\000\004096", !1801, !49, !78, i32 1} ; [ DW_TAG_member ]
+!78 = !{!"0x26\00\000\000\000\000\000", null, null, !79} ; [ DW_TAG_const_type ]
+!79 = !{!"0x16\00ostate\00327\000\000\000\000", !1801, !49, !26} ; [ DW_TAG_typedef ]
+!955 = !{}
+!956 = !{!960}
+!960 = !{!"0x2e\00main\00main\00\0073\000\001\000\006\00256\001\0073", !1802, null, !54, null, i32 ()* @main, null, null, !955} ; [ DW_TAG_subprogram ]
+!961 = !{!"0x29", !1802} ; [ DW_TAG_file_type ]
+!1786 = !{!1800}
+!1800 = !{!"0x34\00badbit\00badbit\00badbit\00331\001\001", !5, !5, !78, i32 1, !77} ; [ DW_TAG_variable ]
+!1801 = !{!"os_base.h", !"/privite/tmp"}
+!1802 = !{!"student2.cpp", !"/privite/tmp"}
+!1803 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/nodebug_with_debug_loc.ll b/test/DebugInfo/X86/nodebug_with_debug_loc.ll
index 555abe6..cc9a874 100644
--- a/test/DebugInfo/X86/nodebug_with_debug_loc.ll
+++ b/test/DebugInfo/X86/nodebug_with_debug_loc.ll
@@ -58,8 +58,8 @@ entry:
 for.body:                                         ; preds = %for.body, %entry
   %iter.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   call void @llvm.lifetime.start(i64 4, i8* %0), !dbg !26
-  call void @llvm.dbg.value(metadata !{%struct.string* %str2.i}, i64 0, metadata !16, metadata !{metadata !"0x102"}) #3, !dbg !26
-  call void @llvm.dbg.value(metadata !{%struct.string* %str2.i}, i64 0, metadata !27, metadata !{metadata !"0x102"}) #3, !dbg !29
+  call void @llvm.dbg.value(metadata %struct.string* %str2.i, i64 0, metadata !16, metadata !{!"0x102"}) #3, !dbg !26
+  call void @llvm.dbg.value(metadata %struct.string* %str2.i, i64 0, metadata !27, metadata !{!"0x102"}) #3, !dbg !29
   call void @_Z4sinkPKv(i8* undef) #3, !dbg !29
   call void @_Z4sinkPKv(i8* %0) #3, !dbg !30
   call void @llvm.lifetime.end(i64 4, i8* %0), !dbg !31
@@ -97,43 +97,43 @@ attributes #3 = { nounwind }
 !llvm.module.flags = !{!23, !24}
 !llvm.ident = !{!25}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !10, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<stdin>", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00string\007\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS6string"} ; [ DW_TAG_structure_type ] [string] [line 7, size 32, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !"repro.cpp", metadata !"/tmp/dbginfo"}
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0xd\00mem\008\0032\0032\000\000", metadata !5, metadata !"_ZTS6string", metadata !8} ; [ DW_TAG_member ] [mem] [line 8, size 32, align 32, offset 0] [from ]
-!8 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from unsigned int]
-!9 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
-!10 = metadata !{metadata !11, metadata !17}
-!11 = metadata !{metadata !"0x2e\00f\00f\00_Z1fv\0014\000\001\000\006\00256\001\0014", metadata !5, metadata !12, metadata !13, null, null, null, null, metadata !15} ; [ DW_TAG_subprogram ] [line 14] [def] [f]
-!12 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/repro.cpp]
-!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{null}
-!15 = metadata !{metadata !16}
-!16 = metadata !{metadata !"0x100\00str2\0015\000", metadata !11, metadata !12, metadata !"_ZTS6string"} ; [ DW_TAG_auto_variable ] [str2] [line 15]
-!17 = metadata !{metadata !"0x2e\00s2\00s2\00_Z2s2P6string\0013\000\001\000\006\00256\001\0013", metadata !5, metadata !12, metadata !18, null, null, null, null, metadata !21} ; [ DW_TAG_subprogram ] [line 13] [def] [s2]
-!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!19 = metadata !{null, metadata !20}
-!20 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !"_ZTS6string"} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from _ZTS6string]
-!21 = metadata !{metadata !22}
-!22 = metadata !{metadata !"0x101\00lhs\0016777229\000", metadata !17, metadata !12, metadata !20} ; [ DW_TAG_arg_variable ] [lhs] [line 13]
-!23 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!24 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!25 = metadata !{metadata !"clang version 3.5.0 "}
-!26 = metadata !{i32 15, i32 0, metadata !11, null}
-!27 = metadata !{metadata !"0x101\00lhs\0016777229\000", metadata !17, metadata !12, metadata !20, metadata !28} ; [ DW_TAG_arg_variable ] [lhs] [line 13]
-!28 = metadata !{i32 16, i32 0, metadata !11, null}
-!29 = metadata !{i32 13, i32 0, metadata !17, metadata !28}
-!30 = metadata !{i32 17, i32 0, metadata !11, null}
-!31 = metadata !{i32 18, i32 0, metadata !11, null}
-!32 = metadata !{metadata !33, metadata !34, i64 0}
-!33 = metadata !{metadata !"_ZTS6string", metadata !34, i64 0}
-!34 = metadata !{metadata !"any pointer", metadata !35, i64 0}
-!35 = metadata !{metadata !"omnipotent char", metadata !36, i64 0}
-!36 = metadata !{metadata !"Simple C/C++ TBAA"}
-!37 = metadata !{metadata !38, metadata !38, i64 0}
-!38 = metadata !{metadata !"bool", metadata !35, i64 0}
-!39 = metadata !{i8 0, i8 2}
+!0 = !{!"0x11\004\00clang version 3.5.0 \001\00\000\00\001", !1, !2, !3, !10, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00string\007\0032\0032\000\000\000", !5, null, null, !6, null, null, !"_ZTS6string"} ; [ DW_TAG_structure_type ] [string] [line 7, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!"repro.cpp", !"/tmp/dbginfo"}
+!6 = !{!7}
+!7 = !{!"0xd\00mem\008\0032\0032\000\000", !5, !"_ZTS6string", !8} ; [ DW_TAG_member ] [mem] [line 8, size 32, align 32, offset 0] [from ]
+!8 = !{!"0xf\00\000\0032\0032\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from unsigned int]
+!9 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!10 = !{!11, !17}
+!11 = !{!"0x2e\00f\00f\00_Z1fv\0014\000\001\000\006\00256\001\0014", !5, !12, !13, null, null, null, null, !15} ; [ DW_TAG_subprogram ] [line 14] [def] [f]
+!12 = !{!"0x29", !5}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/repro.cpp]
+!13 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{null}
+!15 = !{!16}
+!16 = !{!"0x100\00str2\0015\000", !11, !12, !"_ZTS6string"} ; [ DW_TAG_auto_variable ] [str2] [line 15]
+!17 = !{!"0x2e\00s2\00s2\00_Z2s2P6string\0013\000\001\000\006\00256\001\0013", !5, !12, !18, null, null, null, null, !21} ; [ DW_TAG_subprogram ] [line 13] [def] [s2]
+!18 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = !{null, !20}
+!20 = !{!"0xf\00\000\0032\0032\000\000", null, null, !"_ZTS6string"} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from _ZTS6string]
+!21 = !{!22}
+!22 = !{!"0x101\00lhs\0016777229\000", !17, !12, !20} ; [ DW_TAG_arg_variable ] [lhs] [line 13]
+!23 = !{i32 2, !"Dwarf Version", i32 4}
+!24 = !{i32 2, !"Debug Info Version", i32 2}
+!25 = !{!"clang version 3.5.0 "}
+!26 = !MDLocation(line: 15, scope: !11)
+!27 = !{!"0x101\00lhs\0016777229\000", !17, !12, !20, !28} ; [ DW_TAG_arg_variable ] [lhs] [line 13]
+!28 = !MDLocation(line: 16, scope: !11)
+!29 = !MDLocation(line: 13, scope: !17, inlinedAt: !28)
+!30 = !MDLocation(line: 17, scope: !11)
+!31 = !MDLocation(line: 18, scope: !11)
+!32 = !{!33, !34, i64 0}
+!33 = !{!"_ZTS6string", !34, i64 0}
+!34 = !{!"any pointer", !35, i64 0}
+!35 = !{!"omnipotent char", !36, i64 0}
+!36 = !{!"Simple C/C++ TBAA"}
+!37 = !{!38, !38, i64 0}
+!38 = !{!"bool", !35, i64 0}
+!39 = !{i8 0, i8 2}
diff --git a/test/DebugInfo/X86/nondefault-subrange-array.ll b/test/DebugInfo/X86/nondefault-subrange-array.ll
index 212114f..3167d9f 100644
--- a/test/DebugInfo/X86/nondefault-subrange-array.ll
+++ b/test/DebugInfo/X86/nondefault-subrange-array.ll
@@ -30,23 +30,23 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 169136)\000\00\000\00\000", metadata !20, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !6, metadata !7, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
-!6 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x2\00A\001\000\0032\000\000\000", metadata !20, null, null, metadata !8, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [def] [from ]
-!8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{metadata !"0xd\00x\001\000\000\000\001", metadata !20, metadata !7, metadata !10} ; [ DW_TAG_member ] [x] [line 1, size 0, align 0, offset 0] [private] [from ]
-!10 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x21\00-3\0042"} ; [ DW_TAG_subrange_type ] [-3, 39]
-!14 = metadata !{metadata !"0x2e\00A\00A\00\001\000\000\000\006\00320\000\001", metadata !6, metadata !7, metadata !15, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 1] [A]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null, metadata !17}
-!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!18 = metadata !{metadata !19}
-!19 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!20 = metadata !{metadata !"t.cpp", metadata !"/Volumes/Sandbox/llvm"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.3 (trunk 169136)\000\00\000\00\000", !20, !1, !1, !1, !3,  !1} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x34\00a\00a\00\001\000\001", null, !6, !7, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!6 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!7 = !{!"0x2\00A\001\000\0032\000\000\000", !20, null, null, !8, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [def] [from ]
+!8 = !{!9, !14}
+!9 = !{!"0xd\00x\001\000\000\000\001", !20, !7, !10} ; [ DW_TAG_member ] [x] [line 1, size 0, align 0, offset 0] [private] [from ]
+!10 = !{!"0x1\00\000\000\0032\000\000", null, null, !11, !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{!13}
+!13 = !{!"0x21\00-3\0042"} ; [ DW_TAG_subrange_type ] [-3, 39]
+!14 = !{!"0x2e\00A\00A\00\001\000\000\000\006\00320\000\001", !6, !7, !15, null, null, null, i32 0, !18} ; [ DW_TAG_subprogram ] [line 1] [A]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !17}
+!17 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!18 = !{!19}
+!19 = !{!"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = !{!"t.cpp", !"/Volumes/Sandbox/llvm"}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/nophysreg.ll b/test/DebugInfo/X86/nophysreg.ll
new file mode 100644
index 0000000..6a8b2e6
--- /dev/null
+++ b/test/DebugInfo/X86/nophysreg.ll
@@ -0,0 +1,203 @@
+; RUN: llc -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+;
+; PR22296: In this testcase the DBG_VALUE describing "p5" becomes unavailable
+; because the register its address is in is clobbered and we (currently) aren't
+; smart enough to realize that the value is rematerialized immediately after the
+; DBG_VALUE and/or is actually a stack slot.
+;
+; Test that we handle this situation gracefully by omitting the DW_AT_location
+; and not asserting.
+; Note that this check may XPASS in the future if DbgValueHistoryCalculator
+; becoms smarter. That would be fine, too.
+;
+; CHECK: DW_TAG_subprogram
+; CHECK: linkage_name{{.*}}_Z2f21A
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: DW_AT_location
+; CHECK-NEXT: DW_AT_name {{.*}}"p5"
+;
+; // Compile at -O1
+; struct A {
+;   int *m1;
+;   int m2;
+; };
+;
+; void f1(int *p1, int p2);
+; void __attribute__((always_inline)) f2(A p5) { f1(p5.m1, p5.m2); }
+;
+; void func(void*);
+; void func(const int &, const int&);
+; int cond();
+; void f() {
+;   while (cond()) {
+;     int x;
+;     func(x, 0);
+;     while (cond()) {
+;       char y;
+;       func(&y);
+;       char j;
+;       func(&j);
+;       char I;
+;       func(&I);
+;       func(0, 0);
+;       A g;
+;       g.m1 = &x;
+;       f2(g);
+;     }
+;   }
+; }
+; ModuleID = 'test.cpp'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+%struct.A = type { i32*, i32 }
+
+; Function Attrs: alwaysinline ssp uwtable
+define void @_Z2f21A(i32* %p5.coerce0, i32 %p5.coerce1) #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32* %p5.coerce0, i64 0, metadata !16, metadata !33), !dbg !34
+  tail call void @llvm.dbg.value(metadata i32 %p5.coerce1, i64 0, metadata !16, metadata !35), !dbg !34
+  tail call void @llvm.dbg.declare(metadata %struct.A* undef, metadata !16, metadata !36), !dbg !34
+  tail call void @_Z2f1Pii(i32* %p5.coerce0, i32 %p5.coerce1), !dbg !37
+  ret void, !dbg !38
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare void @_Z2f1Pii(i32*, i32) #2
+
+; Function Attrs: ssp uwtable
+define void @_Z1fv() #3 {
+entry:
+  %x = alloca i32, align 4
+  %ref.tmp = alloca i32, align 4
+  %y = alloca i8, align 1
+  %j = alloca i8, align 1
+  %I = alloca i8, align 1
+  %ref.tmp5 = alloca i32, align 4
+  %ref.tmp6 = alloca i32, align 4
+  %call11 = call i32 @_Z4condv(), !dbg !39
+  %tobool12 = icmp eq i32 %call11, 0, !dbg !39
+  br i1 %tobool12, label %while.end7, label %while.body, !dbg !40
+
+while.cond.loopexit:                              ; preds = %while.body4, %while.body
+  %call = call i32 @_Z4condv(), !dbg !39
+  %tobool = icmp eq i32 %call, 0, !dbg !39
+  br i1 %tobool, label %while.end7, label %while.body, !dbg !40
+
+while.body:                                       ; preds = %entry, %while.cond.loopexit
+  store i32 0, i32* %ref.tmp, align 4, !dbg !41, !tbaa !42
+  call void @llvm.dbg.value(metadata i32* %x, i64 0, metadata !21, metadata !36), !dbg !46
+  call void @_Z4funcRKiS0_(i32* dereferenceable(4) %x, i32* dereferenceable(4) %ref.tmp), !dbg !47
+  %call29 = call i32 @_Z4condv(), !dbg !48
+  %tobool310 = icmp eq i32 %call29, 0, !dbg !48
+  br i1 %tobool310, label %while.cond.loopexit, label %while.body4, !dbg !49
+
+while.body4:                                      ; preds = %while.body, %while.body4
+  call void @llvm.dbg.value(metadata i8* %y, i64 0, metadata !23, metadata !36), !dbg !50
+  call void @_Z4funcPv(i8* %y), !dbg !51
+  call void @llvm.dbg.value(metadata i8* %j, i64 0, metadata !26, metadata !36), !dbg !52
+  call void @_Z4funcPv(i8* %j), !dbg !53
+  call void @llvm.dbg.value(metadata i8* %I, i64 0, metadata !27, metadata !36), !dbg !54
+  call void @_Z4funcPv(i8* %I), !dbg !55
+  store i32 0, i32* %ref.tmp5, align 4, !dbg !56, !tbaa !42
+  store i32 0, i32* %ref.tmp6, align 4, !dbg !57, !tbaa !42
+  call void @_Z4funcRKiS0_(i32* dereferenceable(4) %ref.tmp5, i32* dereferenceable(4) %ref.tmp6), !dbg !58
+  call void @llvm.dbg.declare(metadata %struct.A* undef, metadata !28, metadata !36), !dbg !59
+  call void @llvm.dbg.value(metadata i32* %x, i64 0, metadata !28, metadata !33), !dbg !59
+  call void @llvm.dbg.value(metadata i32* %x, i64 0, metadata !21, metadata !36), !dbg !46
+  call void @llvm.dbg.value(metadata i32* %x, i64 0, metadata !60, metadata !33), !dbg !62
+  call void @llvm.dbg.value(metadata i32 undef, i64 0, metadata !60, metadata !35), !dbg !62
+  call void @llvm.dbg.declare(metadata %struct.A* undef, metadata !60, metadata !36), !dbg !62
+  call void @_Z2f1Pii(i32* %x, i32 undef), !dbg !63
+  %call2 = call i32 @_Z4condv(), !dbg !48
+  %tobool3 = icmp eq i32 %call2, 0, !dbg !48
+  br i1 %tobool3, label %while.cond.loopexit, label %while.body4, !dbg !49
+
+while.end7:                                       ; preds = %while.cond.loopexit, %entry
+  ret void, !dbg !64
+}
+
+declare i32 @_Z4condv()
+
+declare void @_Z4funcRKiS0_(i32* dereferenceable(4), i32* dereferenceable(4))
+
+declare void @_Z4funcPv(i8*)
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { alwaysinline ssp uwtable }
+attributes #1 = { nounwind readnone }
+attributes #3 = { ssp uwtable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!29, !30, !31}
+!llvm.ident = !{!32}
+
+!0 = !{!"0x11\004\00clang version 3.7.0 (trunk 227088) (llvm/trunk 227091)\001\00\000\00\001", !1, !2, !3, !10, !2, !2} ; [ DW_TAG_compile_unit ] [/test.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"test.cpp", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00A\001\00128\0064\000\000\000", !1, null, null, !5, null, null, !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 128, align 64, offset 0] [def] [from ]
+!5 = !{!6, !9}
+!6 = !{!"0xd\00m1\002\0064\0064\000\000", !1, !"_ZTS1A", !7} ; [ DW_TAG_member ] [m1] [line 2, size 64, align 64, offset 0] [from ]
+!7 = !{!"0xf\00\000\0064\0064\000\000", null, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0xd\00m2\003\0032\0032\0064\000", !1, !"_ZTS1A", !8} ; [ DW_TAG_member ] [m2] [line 3, size 32, align 32, offset 64] [from int]
+!10 = !{!11, !17}
+!11 = !{!"0x2e\00f2\00f2\00_Z2f21A\007\000\001\000\000\00256\001\007", !1, !12, !13, null, void (i32*, i32)* @_Z2f21A, null, null, !15} ; [ DW_TAG_subprogram ] [line 7] [def] [f2]
+!12 = !{!"0x29", !1}                              ; [ DW_TAG_file_type ] [/test.cpp]
+!13 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{null, !"_ZTS1A"}
+!15 = !{!16}
+!16 = !{!"0x101\00p5\0016777223\000", !11, !12, !"_ZTS1A"} ; [ DW_TAG_arg_variable ] [p5] [line 7]
+!17 = !{!"0x2e\00f\00f\00_Z1fv\0012\000\001\000\000\00256\001\0012", !1, !12, !18, null, void ()* @_Z1fv, null, null, !20} ; [ DW_TAG_subprogram ] [line 12] [def] [f]
+!18 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = !{null}
+!20 = !{!21, !23, !26, !27, !28}
+!21 = !{!"0x100\00x\0014\000", !22, !12, !8}      ; [ DW_TAG_auto_variable ] [x] [line 14]
+!22 = !{!"0xb\0013\0018\000", !1, !17}            ; [ DW_TAG_lexical_block ] [/test.cpp]
+!23 = !{!"0x100\00y\0017\000", !24, !12, !25}     ; [ DW_TAG_auto_variable ] [y] [line 17]
+!24 = !{!"0xb\0016\0020\001", !1, !22}            ; [ DW_TAG_lexical_block ] [/test.cpp]
+!25 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!26 = !{!"0x100\00j\0019\000", !24, !12, !25}     ; [ DW_TAG_auto_variable ] [j] [line 19]
+!27 = !{!"0x100\00I\0021\000", !24, !12, !25}     ; [ DW_TAG_auto_variable ] [I] [line 21]
+!28 = !{!"0x100\00g\0024\000", !24, !12, !"_ZTS1A"} ; [ DW_TAG_auto_variable ] [g] [line 24]
+!29 = !{i32 2, !"Dwarf Version", i32 2}
+!30 = !{i32 2, !"Debug Info Version", i32 2}
+!31 = !{i32 1, !"PIC Level", i32 2}
+!32 = !{!"clang version 3.7.0 (trunk 227088) (llvm/trunk 227091)"}
+!33 = !{!"0x102\00157\000\008"}                   ; [ DW_TAG_expression ] [DW_OP_bit_piece offset=0, size=8]
+!34 = !MDLocation(line: 7, column: 42, scope: !11)
+!35 = !{!"0x102\00157\008\004"}                   ; [ DW_TAG_expression ] [DW_OP_bit_piece offset=8, size=4]
+!36 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!37 = !MDLocation(line: 7, column: 48, scope: !11)
+!38 = !MDLocation(line: 7, column: 66, scope: !11)
+!39 = !MDLocation(line: 13, column: 10, scope: !17)
+!40 = !MDLocation(line: 13, column: 3, scope: !17)
+!41 = !MDLocation(line: 15, column: 13, scope: !22)
+!42 = !{!43, !43, i64 0}
+!43 = !{!"int", !44, i64 0}
+!44 = !{!"omnipotent char", !45, i64 0}
+!45 = !{!"Simple C/C++ TBAA"}
+!46 = !MDLocation(line: 14, column: 9, scope: !22)
+!47 = !MDLocation(line: 15, column: 5, scope: !22)
+!48 = !MDLocation(line: 16, column: 12, scope: !22)
+!49 = !MDLocation(line: 16, column: 5, scope: !22)
+!50 = !MDLocation(line: 17, column: 12, scope: !24)
+!51 = !MDLocation(line: 18, column: 7, scope: !24)
+!52 = !MDLocation(line: 19, column: 12, scope: !24)
+!53 = !MDLocation(line: 20, column: 7, scope: !24)
+!54 = !MDLocation(line: 21, column: 12, scope: !24)
+!55 = !MDLocation(line: 22, column: 7, scope: !24)
+!56 = !MDLocation(line: 23, column: 12, scope: !24)
+!57 = !MDLocation(line: 23, column: 15, scope: !24)
+!58 = !MDLocation(line: 23, column: 7, scope: !24)
+!59 = !MDLocation(line: 24, column: 9, scope: !24)
+!60 = !{!"0x101\00p5\0016777223\000", !11, !12, !"_ZTS1A", !61} ; [ DW_TAG_arg_variable ] [p5] [line 7]
+!61 = distinct !MDLocation(line: 26, column: 7, scope: !24)
+!62 = !MDLocation(line: 7, column: 42, scope: !11, inlinedAt: !61)
+!63 = !MDLocation(line: 7, column: 48, scope: !11, inlinedAt: !61)
+!64 = !MDLocation(line: 29, column: 1, scope: !17)
diff --git a/test/DebugInfo/X86/objc-fwd-decl.ll b/test/DebugInfo/X86/objc-fwd-decl.ll
index e6144d0..cd71396 100644
--- a/test/DebugInfo/X86/objc-fwd-decl.ll
+++ b/test/DebugInfo/X86/objc-fwd-decl.ll
@@ -12,16 +12,16 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !10, !11, !12, !14}
 
-!0 = metadata !{metadata !"0x11\0016\00clang version 3.1 (trunk 152054 trunk 152094)\000\00\002\00\000", metadata !13, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x34\00a\00a\00\003\000\001", null, metadata !6, metadata !7, %0** @a, null} ; [ DW_TAG_variable ]
-!6 = metadata !{metadata !"0x29", metadata !13} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{metadata !"0x13\00FooBarBaz\001\000\000\000\004\0016", metadata !13, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [FooBarBaz] [line 1, size 0, align 0, offset 0] [decl] [from ]
-!9 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!10 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!11 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!12 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
-!13 = metadata !{metadata !"foo.m", metadata !"/Users/echristo"}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0016\00clang version 3.1 (trunk 152054 trunk 152094)\000\00\002\00\000", !13, !1, !1, !1, !3,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x34\00a\00a\00\003\000\001", null, !6, !7, %0** @a, null} ; [ DW_TAG_variable ]
+!6 = !{!"0x29", !13} ; [ DW_TAG_file_type ]
+!7 = !{!"0xf\00\000\0064\0064\000\000", null, null, !8} ; [ DW_TAG_pointer_type ]
+!8 = !{!"0x13\00FooBarBaz\001\000\000\000\004\0016", !13, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [FooBarBaz] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!9 = !{i32 1, !"Objective-C Version", i32 2}
+!10 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!11 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!12 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
+!13 = !{!"foo.m", !"/Users/echristo"}
+!14 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/objc-property-void.ll b/test/DebugInfo/X86/objc-property-void.ll
index 0f50869..0ae9e1a 100644
--- a/test/DebugInfo/X86/objc-property-void.ll
+++ b/test/DebugInfo/X86/objc-property-void.ll
@@ -56,9 +56,9 @@ entry:
   %self.addr = alloca %0*, align 8
   %_cmd.addr = alloca i8*, align 8
   store %0* %self, %0** %self.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%0** %self.addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !26
+  call void @llvm.dbg.declare(metadata %0** %self.addr, metadata !24, metadata !{!"0x102"}), !dbg !26
   store i8* %_cmd, i8** %_cmd.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %_cmd.addr}, metadata !27, metadata !{metadata !"0x102"}), !dbg !26
+  call void @llvm.dbg.declare(metadata i8** %_cmd.addr, metadata !27, metadata !{!"0x102"}), !dbg !26
   ret void, !dbg !29
 }
 
@@ -72,33 +72,33 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!17, !18, !19, !20, !21, !22}
 !llvm.ident = !{!23}
 
-!0 = metadata !{metadata !"0x11\0016\00\000\00\002\00\000", metadata !1, metadata !2, metadata !3, metadata !9, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [] [DW_LANG_ObjC]
-!1 = metadata !{metadata !"-", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00Foo\001\000\008\000\00512\0016", metadata !5, metadata !6, null, metadata !7, null, null, null} ; [ DW_TAG_structure_type ] [Foo] [line 1, size 0, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !"<stdin>", metadata !""}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] []
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x4200\00foo\002\00\00\002117", metadata !6, null} ; [ DW_TAG_APPLE_property ] [foo] [line 2, properties 2117]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x2e\00-[Foo foo]\00-[Foo foo]\00\005\001\001\000\006\00256\000\005", metadata !5, metadata !6, metadata !11, null, void (%0*, i8*)* @"\01-[Foo foo]", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [-[Foo foo]]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{null, metadata !13, metadata !14}
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Foo]
-!14 = metadata !{metadata !"0x16\00SEL\005\000\000\000\0064", metadata !5, null, metadata !15} ; [ DW_TAG_typedef ] [SEL] [line 5, size 0, align 0, offset 0] [artificial] [from ]
-!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
-!16 = metadata !{metadata !"0x13\00objc_selector\000\000\000\000\004\000", metadata !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!17 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!18 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!19 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!20 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
-!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!23 = metadata !{metadata !""}
-!24 = metadata !{metadata !"0x101\00self\0016777216\001088", metadata !10, null, metadata !25} ; [ DW_TAG_arg_variable ] [self] [line 0]
-!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Foo]
-!26 = metadata !{i32 0, i32 0, metadata !10, null}
-!27 = metadata !{metadata !"0x101\00_cmd\0033554432\0064", metadata !10, null, metadata !28} ; [ DW_TAG_arg_variable ] [_cmd] [line 0]
-!28 = metadata !{metadata !"0x16\00SEL\005\000\000\000\000", metadata !5, null, metadata !15} ; [ DW_TAG_typedef ] [SEL] [line 5, size 0, align 0, offset 0] [from ]
-!29 = metadata !{i32 5, i32 0, metadata !10, null}
+!0 = !{!"0x11\0016\00\000\00\002\00\000", !1, !2, !3, !9, !2, !2} ; [ DW_TAG_compile_unit ] [] [DW_LANG_ObjC]
+!1 = !{!"-", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00Foo\001\000\008\000\00512\0016", !5, !6, null, !7, null, null, null} ; [ DW_TAG_structure_type ] [Foo] [line 1, size 0, align 8, offset 0] [def] [from ]
+!5 = !{!"<stdin>", !""}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] []
+!7 = !{!8}
+!8 = !{!"0x4200\00foo\002\00\00\002117", !6, null} ; [ DW_TAG_APPLE_property ] [foo] [line 2, properties 2117]
+!9 = !{!10}
+!10 = !{!"0x2e\00-[Foo foo]\00-[Foo foo]\00\005\001\001\000\006\00256\000\005", !5, !6, !11, null, void (%0*, i8*)* @"\01-[Foo foo]", null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [-[Foo foo]]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{null, !13, !14}
+!13 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Foo]
+!14 = !{!"0x16\00SEL\005\000\000\000\0064", !5, null, !15} ; [ DW_TAG_typedef ] [SEL] [line 5, size 0, align 0, offset 0] [artificial] [from ]
+!15 = !{!"0xf\00\000\0064\0064\000\000", null, null, !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
+!16 = !{!"0x13\00objc_selector\000\000\000\000\004\000", !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!17 = !{i32 1, !"Objective-C Version", i32 2}
+!18 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!19 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!20 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
+!21 = !{i32 2, !"Dwarf Version", i32 2}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
+!23 = !{!""}
+!24 = !{!"0x101\00self\0016777216\001088", !10, null, !25} ; [ DW_TAG_arg_variable ] [self] [line 0]
+!25 = !{!"0xf\00\000\0064\0064\000\000", null, null, !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Foo]
+!26 = !MDLocation(line: 0, scope: !10)
+!27 = !{!"0x101\00_cmd\0033554432\0064", !10, null, !28} ; [ DW_TAG_arg_variable ] [_cmd] [line 0]
+!28 = !{!"0x16\00SEL\005\000\000\000\000", !5, null, !15} ; [ DW_TAG_typedef ] [SEL] [line 5, size 0, align 0, offset 0] [from ]
+!29 = !MDLocation(line: 5, scope: !10)
diff --git a/test/DebugInfo/X86/op_deref.ll b/test/DebugInfo/X86/op_deref.ll
index 18c4fc1..3f9a289 100644
--- a/test/DebugInfo/X86/op_deref.ll
+++ b/test/DebugInfo/X86/op_deref.ll
@@ -23,20 +23,23 @@
 ; ASM-CHECK: DEBUG_VALUE: vla <- RCX
 ; ASM-CHECK: DW_OP_breg2
 
+; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s --check-prefix=PRETTY-PRINT
+; PRETTY-PRINT: [ DW_TAG_expression ] [DW_OP_deref]
+
 define void @testVLAwithSize(i32 %s) nounwind uwtable ssp {
 entry:
   %s.addr = alloca i32, align 4
   %saved_stack = alloca i8*
   %i = alloca i32, align 4
   store i32 %s, i32* %s.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %s.addr}, metadata !10, metadata !{metadata !"0x102"}), !dbg !11
+  call void @llvm.dbg.declare(metadata i32* %s.addr, metadata !10, metadata !{!"0x102"}), !dbg !11
   %0 = load i32* %s.addr, align 4, !dbg !12
   %1 = zext i32 %0 to i64, !dbg !12
   %2 = call i8* @llvm.stacksave(), !dbg !12
   store i8* %2, i8** %saved_stack, !dbg !12
   %vla = alloca i32, i64 %1, align 16, !dbg !12
-  call void @llvm.dbg.declare(metadata !{i32* %vla}, metadata !14, metadata !30), !dbg !18
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
+  call void @llvm.dbg.declare(metadata i32* %vla, metadata !14, metadata !30), !dbg !18
+  call void @llvm.dbg.declare(metadata i32* %i, metadata !19, metadata !{!"0x102"}), !dbg !20
   store i32 0, i32* %i, align 4, !dbg !21
   br label %for.cond, !dbg !21
 
@@ -77,32 +80,32 @@ declare void @llvm.stackrestore(i8*) nounwind
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.2 (trunk 156005) (llvm/trunk 156000)\000\00\000\00\001", metadata !28, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00testVLAwithSize\00testVLAwithSize\00\001\000\001\000\006\00256\000\002", metadata !28, metadata !6, metadata !7, null, void (i32)* @testVLAwithSize, null, null, metadata !1} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0x101\00s\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 1, i32 26, metadata !5, null}
-!12 = metadata !{i32 3, i32 13, metadata !13, null}
-!13 = metadata !{metadata !"0xb\002\001\000", metadata !28, metadata !5} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{metadata !"0x100\00vla\003\008192", metadata !13, metadata !6, metadata !15} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !9, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
-!16 = metadata !{metadata !17}
-!17 = metadata !{metadata !"0x21\000\00-1"}        ; [ DW_TAG_subrange_type ]
-!18 = metadata !{i32 3, i32 7, metadata !13, null}
-!19 = metadata !{metadata !"0x100\00i\004\000", metadata !13, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
-!20 = metadata !{i32 4, i32 7, metadata !13, null}
-!21 = metadata !{i32 5, i32 8, metadata !22, null}
-!22 = metadata !{metadata !"0xb\005\003\001", metadata !28, metadata !13} ; [ DW_TAG_lexical_block ]
-!23 = metadata !{i32 6, i32 5, metadata !24, null}
-!24 = metadata !{metadata !"0xb\005\0027\002", metadata !28, metadata !22} ; [ DW_TAG_lexical_block ]
-!25 = metadata !{i32 7, i32 3, metadata !24, null}
-!26 = metadata !{i32 5, i32 22, metadata !22, null}
-!27 = metadata !{i32 8, i32 1, metadata !13, null}
-!28 = metadata !{metadata !"bar.c", metadata !"/Users/echristo/tmp"}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!30 = metadata !{metadata !"0x102\006"} ; [ DW_TAG_expression ] [DW_OP_deref]
+!0 = !{!"0x11\0012\00clang version 3.2 (trunk 156005) (llvm/trunk 156000)\000\00\000\00\001", !28, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00testVLAwithSize\00testVLAwithSize\00\001\000\001\000\006\00256\000\002", !28, !6, !7, null, void (i32)* @testVLAwithSize, null, null, !1} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !28} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = !{!"0x101\00s\0016777217\000", !5, !6, !9} ; [ DW_TAG_arg_variable ]
+!11 = !MDLocation(line: 1, column: 26, scope: !5)
+!12 = !MDLocation(line: 3, column: 13, scope: !13)
+!13 = !{!"0xb\002\001\000", !28, !5} ; [ DW_TAG_lexical_block ]
+!14 = !{!"0x100\00vla\003\000", !13, !6, !15} ; [ DW_TAG_auto_variable ]
+!15 = !{!"0x1\00\000\000\0032\000\000", null, null, !9, !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!16 = !{!17}
+!17 = !{!"0x21\000\00-1"}        ; [ DW_TAG_subrange_type ]
+!18 = !MDLocation(line: 3, column: 7, scope: !13)
+!19 = !{!"0x100\00i\004\000", !13, !6, !9} ; [ DW_TAG_auto_variable ]
+!20 = !MDLocation(line: 4, column: 7, scope: !13)
+!21 = !MDLocation(line: 5, column: 8, scope: !22)
+!22 = !{!"0xb\005\003\001", !28, !13} ; [ DW_TAG_lexical_block ]
+!23 = !MDLocation(line: 6, column: 5, scope: !24)
+!24 = !{!"0xb\005\0027\002", !28, !22} ; [ DW_TAG_lexical_block ]
+!25 = !MDLocation(line: 7, column: 3, scope: !24)
+!26 = !MDLocation(line: 5, column: 22, scope: !22)
+!27 = !MDLocation(line: 8, column: 1, scope: !13)
+!28 = !{!"bar.c", !"/Users/echristo/tmp"}
+!29 = !{i32 1, !"Debug Info Version", i32 2}
+!30 = !{!"0x102\006\006"} ; [ DW_TAG_expression ] [DW_OP_deref]
diff --git a/test/DebugInfo/X86/parameters.ll b/test/DebugInfo/X86/parameters.ll
index fde63e7..9e6ee4a 100644
--- a/test/DebugInfo/X86/parameters.ll
+++ b/test/DebugInfo/X86/parameters.ll
@@ -42,7 +42,7 @@
 ; Function Attrs: uwtable
 define void @_ZN7pr147634funcENS_3fooE(%"struct.pr14763::foo"* noalias sret %agg.result, %"struct.pr14763::foo"* %f) #0 {
 entry:
-  call void @llvm.dbg.declare(metadata !{%"struct.pr14763::foo"* %f}, metadata !22, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.declare(metadata %"struct.pr14763::foo"* %f, metadata !22, metadata !{!"0x102\006"}), !dbg !24
   call void @_ZN7pr147633fooC1ERKS0_(%"struct.pr14763::foo"* %agg.result, %"struct.pr14763::foo"* %f), !dbg !25
   ret void, !dbg !25
 }
@@ -58,8 +58,8 @@ entry:
   %b.addr = alloca i8, align 1
   %frombool = zext i1 %b to i8
   store i8 %frombool, i8* %b.addr, align 1
-  call void @llvm.dbg.declare(metadata !{i8* %b.addr}, metadata !26, metadata !{metadata !"0x102"}), !dbg !27
-  call void @llvm.dbg.declare(metadata !{%"struct.pr14763::foo"* %g}, metadata !28, metadata !{metadata !"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata i8* %b.addr, metadata !26, metadata !{!"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata %"struct.pr14763::foo"* %g, metadata !28, metadata !{!"0x102\006"}), !dbg !27
   %0 = load i8* %b.addr, align 1, !dbg !29
   %tobool = trunc i8 %0 to i1, !dbg !29
   br i1 %tobool, label %if.then, label %if.end, !dbg !29
@@ -82,37 +82,37 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21, !33}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/pass.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"pass.cpp", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !17}
-!4 = metadata !{metadata !"0x2e\00func\00func\00_ZN7pr147634funcENS_3fooE\006\000\001\000\006\00256\000\006", metadata !1, metadata !5, metadata !6, null, void (%"struct.pr14763::foo"*, %"struct.pr14763::foo"*)* @_ZN7pr147634funcENS_3fooE, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
-!5 = metadata !{metadata !"0x39\00pr14763\001", metadata !1, null} ; [ DW_TAG_namespace ] [pr14763] [line 1]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x13\00foo\002\008\008\000\000\000", metadata !1, metadata !5, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 2, size 8, align 8, offset 0] [def] [from ]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\000\000\006\00256\000\003", metadata !1, metadata !8, metadata !11, null, null, null, i32 0, metadata !16} ; [ DW_TAG_subprogram ] [line 3] [foo]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{null, metadata !13, metadata !14}
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
-!14 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !15} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo]
-!16 = metadata !{i32 786468}
-!17 = metadata !{metadata !"0x2e\00func2\00func2\00_ZN7pr147635func2EbNS_3fooE\0012\000\001\000\006\00256\000\0012", metadata !1, metadata !5, metadata !18, null, void (i1, %"struct.pr14763::foo"*)* @_ZN7pr147635func2EbNS_3fooE, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 12] [def] [func2]
-!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!19 = metadata !{null, metadata !20, metadata !8}
-!20 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
-!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!22 = metadata !{metadata !"0x101\00f\0016777222\008192", metadata !4, metadata !23, metadata !8} ; [ DW_TAG_arg_variable ] [f] [line 6]
-!23 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/pass.cpp]
-!24 = metadata !{i32 6, i32 0, metadata !4, null}
-!25 = metadata !{i32 7, i32 0, metadata !4, null}
-!26 = metadata !{metadata !"0x101\00b\0016777228\000", metadata !17, metadata !23, metadata !20} ; [ DW_TAG_arg_variable ] [b] [line 12]
-!27 = metadata !{i32 12, i32 0, metadata !17, null}
-!28 = metadata !{metadata !"0x101\00g\0033554444\008192", metadata !17, metadata !23, metadata !8} ; [ DW_TAG_arg_variable ] [g] [line 12]
-!29 = metadata !{i32 13, i32 0, metadata !30, null}
-!30 = metadata !{metadata !"0xb\0013\000\000", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [/tmp/pass.cpp]
-!31 = metadata !{i32 14, i32 0, metadata !30, null}
-!32 = metadata !{i32 15, i32 0, metadata !17, null}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/pass.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"pass.cpp", !"/tmp"}
+!2 = !{}
+!3 = !{!4, !17}
+!4 = !{!"0x2e\00func\00func\00_ZN7pr147634funcENS_3fooE\006\000\001\000\006\00256\000\006", !1, !5, !6, null, void (%"struct.pr14763::foo"*, %"struct.pr14763::foo"*)* @_ZN7pr147634funcENS_3fooE, null, null, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
+!5 = !{!"0x39\00pr14763\001", !1, null} ; [ DW_TAG_namespace ] [pr14763] [line 1]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x13\00foo\002\008\008\000\000\000", !1, !5, null, !9, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 2, size 8, align 8, offset 0] [def] [from ]
+!9 = !{!10}
+!10 = !{!"0x2e\00foo\00foo\00\003\000\000\000\006\00256\000\003", !1, !8, !11, null, null, null, i32 0, !16} ; [ DW_TAG_subprogram ] [line 3] [foo]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{null, !13, !14}
+!13 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
+!14 = !{!"0x10\00\000\000\000\000\000", null, null, !15} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{!"0x26\00\000\000\000\000\000", null, null, !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo]
+!16 = !{i32 786468}
+!17 = !{!"0x2e\00func2\00func2\00_ZN7pr147635func2EbNS_3fooE\0012\000\001\000\006\00256\000\0012", !1, !5, !18, null, void (i1, %"struct.pr14763::foo"*)* @_ZN7pr147635func2EbNS_3fooE, null, null, !2} ; [ DW_TAG_subprogram ] [line 12] [def] [func2]
+!18 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = !{null, !20, !8}
+!20 = !{!"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!21 = !{i32 2, !"Dwarf Version", i32 3}
+!22 = !{!"0x101\00f\0016777222\000", !4, !23, !8} ; [ DW_TAG_arg_variable ] [f] [line 6]
+!23 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/tmp/pass.cpp]
+!24 = !MDLocation(line: 6, scope: !4)
+!25 = !MDLocation(line: 7, scope: !4)
+!26 = !{!"0x101\00b\0016777228\000", !17, !23, !20} ; [ DW_TAG_arg_variable ] [b] [line 12]
+!27 = !MDLocation(line: 12, scope: !17)
+!28 = !{!"0x101\00g\0033554444\000", !17, !23, !8} ; [ DW_TAG_arg_variable ] [g] [line 12]
+!29 = !MDLocation(line: 13, scope: !30)
+!30 = !{!"0xb\0013\000\000", !1, !17} ; [ DW_TAG_lexical_block ] [/tmp/pass.cpp]
+!31 = !MDLocation(line: 14, scope: !30)
+!32 = !MDLocation(line: 15, scope: !17)
+!33 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/pieces-1.ll b/test/DebugInfo/X86/pieces-1.ll
index db36b03..5fbfedd 100644
--- a/test/DebugInfo/X86/pieces-1.ll
+++ b/test/DebugInfo/X86/pieces-1.ll
@@ -32,8 +32,8 @@ target triple = "x86_64-apple-macosx10.9.0"
 ; Function Attrs: nounwind ssp uwtable
 define i32 @foo(i64 %s.coerce0, i32 %s.coerce1) #0 {
 entry:
-  call void @llvm.dbg.value(metadata !{i64 %s.coerce0}, i64 0, metadata !20, metadata !24), !dbg !21
-  call void @llvm.dbg.value(metadata !{i32 %s.coerce1}, i64 0, metadata !22, metadata !27), !dbg !21
+  call void @llvm.dbg.value(metadata i64 %s.coerce0, i64 0, metadata !20, metadata !24), !dbg !21
+  call void @llvm.dbg.value(metadata i32 %s.coerce1, i64 0, metadata !22, metadata !27), !dbg !21
   ret i32 %s.coerce1, !dbg !23
 }
 
@@ -50,30 +50,30 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!17, !18}
 !llvm.ident = !{!19}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"pieces.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !6, null, i32 (i64, i32)* @foo, null, null, metadata !15} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/pieces.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !9}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x16\00S\001\000\000\000\000", metadata !1, null, metadata !10} ; [ DW_TAG_typedef ] [S] [line 1, size 0, align 0, offset 0] [from ]
-!10 = metadata !{metadata !"0x13\00\001\00128\0064\000\000\000", metadata !1, null, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 128, align 64, offset 0] [def] [from ]
-!11 = metadata !{metadata !12, metadata !14}
-!12 = metadata !{metadata !"0xd\00a\001\0064\0064\000\000", metadata !1, metadata !10, metadata !13} ; [ DW_TAG_member ] [a] [line 1, size 64, align 64, offset 0] [from long int]
-!13 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
-!14 = metadata !{metadata !"0xd\00b\001\0032\0032\0064\000", metadata !1, metadata !10, metadata !8} ; [ DW_TAG_member ] [b] [line 1, size 32, align 32, offset 64] [from int]
-!15 = metadata !{metadata !16}
-!16 = metadata !{metadata !"0x101\00s\0016777219\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
-!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!19 = metadata !{metadata !"clang version 3.5 "}
-!20 = metadata !{metadata !"0x101\00s\0016777219\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
-!21 = metadata !{i32 3, i32 0, metadata !4, null}
-!22 = metadata !{metadata !"0x101\00s\0016777219\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
-!23 = metadata !{i32 4, i32 0, metadata !4, null}
-!24 = metadata !{metadata !"0x102\00147\000\008"} ; [ DW_TAG_expression ] [DW_OP_piece 0 8] [piece, size 8, offset 0]
-!25 = metadata !{}
-!27 = metadata !{metadata !"0x102\00147\008\004"} ; [ DW_TAG_expression ] [DW_OP_piece 8 4] [piece, size 4, offset 8]
+!0 = !{!"0x11\0012\00clang version 3.5 \001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ]
+!1 = !{!"pieces.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\003\000\001\000\006\00256\001\003", !1, !5, !6, null, i32 (i64, i32)* @foo, null, null, !15} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/pieces.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !9}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x16\00S\001\000\000\000\000", !1, null, !10} ; [ DW_TAG_typedef ] [S] [line 1, size 0, align 0, offset 0] [from ]
+!10 = !{!"0x13\00\001\00128\0064\000\000\000", !1, null, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 128, align 64, offset 0] [def] [from ]
+!11 = !{!12, !14}
+!12 = !{!"0xd\00a\001\0064\0064\000\000", !1, !10, !13} ; [ DW_TAG_member ] [a] [line 1, size 64, align 64, offset 0] [from long int]
+!13 = !{!"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!14 = !{!"0xd\00b\001\0032\0032\0064\000", !1, !10, !8} ; [ DW_TAG_member ] [b] [line 1, size 32, align 32, offset 64] [from int]
+!15 = !{!16}
+!16 = !{!"0x101\00s\0016777219\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
+!17 = !{i32 2, !"Dwarf Version", i32 4}
+!18 = !{i32 1, !"Debug Info Version", i32 2}
+!19 = !{!"clang version 3.5 "}
+!20 = !{!"0x101\00s\0016777219\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
+!21 = !MDLocation(line: 3, scope: !4)
+!22 = !{!"0x101\00s\0016777219\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
+!23 = !MDLocation(line: 4, scope: !4)
+!24 = !{!"0x102\00157\000\0064"} ; [ DW_TAG_expression ] [DW_OP_bit_piece size=64, offset=0]
+!25 = !{}
+!27 = !{!"0x102\00157\0064\0032"} ; [ DW_TAG_expression ] [DW_OP_bit_piece size=32, offset=64]
diff --git a/test/DebugInfo/X86/pieces-2.ll b/test/DebugInfo/X86/pieces-2.ll
index 760c9f6..801a67d 100644
--- a/test/DebugInfo/X86/pieces-2.ll
+++ b/test/DebugInfo/X86/pieces-2.ll
@@ -31,10 +31,10 @@ target triple = "x86_64-apple-macosx10.9.0"
 ; Function Attrs: nounwind ssp uwtable
 define i32 @foo(%struct.Outer* byval align 8 %outer) #0 {
 entry:
-  call void @llvm.dbg.declare(metadata !{%struct.Outer* %outer}, metadata !25, metadata !{metadata !"0x102"}), !dbg !26
+  call void @llvm.dbg.declare(metadata %struct.Outer* %outer, metadata !25, metadata !{!"0x102"}), !dbg !26
   %i1.sroa.0.0..sroa_idx = getelementptr inbounds %struct.Outer* %outer, i64 0, i32 0, i64 1, i32 0, !dbg !27
   %i1.sroa.0.0.copyload = load i32* %i1.sroa.0.0..sroa_idx, align 8, !dbg !27
-  call void @llvm.dbg.value(metadata !{i32 %i1.sroa.0.0.copyload}, i64 0, metadata !28, metadata !29), !dbg !27
+  call void @llvm.dbg.value(metadata i32 %i1.sroa.0.0.copyload, i64 0, metadata !28, metadata !29), !dbg !27
   %i1.sroa.2.0..sroa_raw_cast = bitcast %struct.Outer* %outer to i8*, !dbg !27
   %i1.sroa.2.0..sroa_raw_idx = getelementptr inbounds i8* %i1.sroa.2.0..sroa_raw_cast, i64 20, !dbg !27
   ret i32 %i1.sroa.0.0.copyload, !dbg !32
@@ -57,35 +57,35 @@ attributes #2 = { nounwind }
 !llvm.module.flags = !{!22, !23}
 !llvm.ident = !{!24}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/sroasplit-1.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"sroasplit-1.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\0010\000\001\000\006\00256\000\0010", metadata !1, metadata !5, metadata !6, null, i32 (%struct.Outer*)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/sroasplit-1.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !9}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x16\00Outer\008\000\000\000\000", metadata !1, null, metadata !10} ; [ DW_TAG_typedef ] [Outer] [line 8, size 0, align 0, offset 0] [from ]
-!10 = metadata !{metadata !"0x13\00\006\00256\0064\000\000\000", metadata !1, null, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [line 6, size 256, align 64, offset 0] [def] [from ]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0xd\00inner\007\00256\0064\000\000", metadata !1, metadata !10, metadata !13} ; [ DW_TAG_member ] [inner] [line 7, size 256, align 64, offset 0] [from ]
-!13 = metadata !{metadata !"0x1\00\000\00256\0064\000\000", null, null, metadata !14, metadata !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from Inner]
-!14 = metadata !{metadata !"0x16\00Inner\004\000\000\000\000", metadata !1, null, metadata !15} ; [ DW_TAG_typedef ] [Inner] [line 4, size 0, align 0, offset 0] [from ]
-!15 = metadata !{metadata !"0x13\00\001\00128\0064\000\000\000", metadata !1, null, null, metadata !16, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 128, align 64, offset 0] [def] [from ]
-!16 = metadata !{metadata !17, metadata !18}
-!17 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !1, metadata !15, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!18 = metadata !{metadata !"0xd\00b\003\0064\0064\0064\000", metadata !1, metadata !15, metadata !19} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from long int]
-!19 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
-!20 = metadata !{metadata !21}
-!21 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
-!22 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!24 = metadata !{metadata !"clang version 3.5.0 "}
-!25 = metadata !{metadata !"0x101\00outer\0016777226\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
-!26 = metadata !{i32 10, i32 0, metadata !4, null}
-!27 = metadata !{i32 11, i32 0, metadata !4, null}
-!28 = metadata !{metadata !"0x100\00i1\0011\000", metadata !4, metadata !5, metadata !14} ; [ DW_TAG_auto_variable ] [i1] [line 11]
-!29 = metadata !{metadata !"0x102\00147\000\004"} ; [ DW_TAG_expression ] [DW_OP_piece 0 4] [piece, size 4, offset 0]
-!31 = metadata !{i32 3, i32 0, i32 12}
-!32 = metadata !{i32 12, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/sroasplit-1.c] [DW_LANG_C99]
+!1 = !{!"sroasplit-1.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\0010\000\001\000\006\00256\000\0010", !1, !5, !6, null, i32 (%struct.Outer*)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 10] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/sroasplit-1.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !9}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x16\00Outer\008\000\000\000\000", !1, null, !10} ; [ DW_TAG_typedef ] [Outer] [line 8, size 0, align 0, offset 0] [from ]
+!10 = !{!"0x13\00\006\00256\0064\000\000\000", !1, null, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [line 6, size 256, align 64, offset 0] [def] [from ]
+!11 = !{!12}
+!12 = !{!"0xd\00inner\007\00256\0064\000\000", !1, !10, !13} ; [ DW_TAG_member ] [inner] [line 7, size 256, align 64, offset 0] [from ]
+!13 = !{!"0x1\00\000\00256\0064\000\000", null, null, !14, !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from Inner]
+!14 = !{!"0x16\00Inner\004\000\000\000\000", !1, null, !15} ; [ DW_TAG_typedef ] [Inner] [line 4, size 0, align 0, offset 0] [from ]
+!15 = !{!"0x13\00\001\00128\0064\000\000\000", !1, null, null, !16, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 128, align 64, offset 0] [def] [from ]
+!16 = !{!17, !18}
+!17 = !{!"0xd\00a\002\0032\0032\000\000", !1, !15, !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!18 = !{!"0xd\00b\003\0064\0064\0064\000", !1, !15, !19} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from long int]
+!19 = !{!"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!20 = !{!21}
+!21 = !{!"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
+!22 = !{i32 2, !"Dwarf Version", i32 2}
+!23 = !{i32 1, !"Debug Info Version", i32 2}
+!24 = !{!"clang version 3.5.0 "}
+!25 = !{!"0x101\00outer\0016777226\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
+!26 = !MDLocation(line: 10, scope: !4)
+!27 = !MDLocation(line: 11, scope: !4)
+!28 = !{!"0x100\00i1\0011\000", !4, !5, !14} ; [ DW_TAG_auto_variable ] [i1] [line 11]
+!29 = !{!"0x102\00157\000\0032"} ; [ DW_TAG_expression ] [DW_OP_bit_piece size=32, offset=0]
+!31 = !{i32 3, i32 0, i32 12}
+!32 = !MDLocation(line: 12, scope: !4)
diff --git a/test/DebugInfo/X86/pieces-3.ll b/test/DebugInfo/X86/pieces-3.ll
index 5dd480d..7709400 100644
--- a/test/DebugInfo/X86/pieces-3.ll
+++ b/test/DebugInfo/X86/pieces-3.ll
@@ -36,16 +36,16 @@ target triple = "x86_64-apple-macosx10.9.0"
 
 ; Function Attrs: nounwind ssp uwtable
 define i32 @foo(i64 %outer.coerce0, i64 %outer.coerce1) #0 {
-  call void @llvm.dbg.value(metadata !{i64 %outer.coerce0}, i64 0, metadata !24, metadata !25), !dbg !26
+  call void @llvm.dbg.value(metadata i64 %outer.coerce0, i64 0, metadata !24, metadata !25), !dbg !26
   call void @llvm.dbg.declare(metadata !{null}, metadata !27, metadata !28), !dbg !26
-  call void @llvm.dbg.value(metadata !{i64 %outer.coerce1}, i64 0, metadata !29, metadata !30), !dbg !26
+  call void @llvm.dbg.value(metadata i64 %outer.coerce1, i64 0, metadata !29, metadata !30), !dbg !26
   call void @llvm.dbg.declare(metadata !{null}, metadata !31, metadata !32), !dbg !26
   %outer.sroa.1.8.extract.trunc = trunc i64 %outer.coerce1 to i32, !dbg !33
-  call void @llvm.dbg.value(metadata !{i32 %outer.sroa.1.8.extract.trunc}, i64 0, metadata !34, metadata !35), !dbg !33
+  call void @llvm.dbg.value(metadata i32 %outer.sroa.1.8.extract.trunc, i64 0, metadata !34, metadata !35), !dbg !33
   %outer.sroa.1.12.extract.shift = lshr i64 %outer.coerce1, 32, !dbg !33
   %outer.sroa.1.12.extract.trunc = trunc i64 %outer.sroa.1.12.extract.shift to i32, !dbg !33
-  call void @llvm.dbg.value(metadata !{i64 %outer.sroa.1.12.extract.shift}, i64 0, metadata !34, metadata !35), !dbg !33
-  call void @llvm.dbg.value(metadata !{i32 %outer.sroa.1.12.extract.trunc}, i64 0, metadata !34, metadata !35), !dbg !33
+  call void @llvm.dbg.value(metadata i64 %outer.sroa.1.12.extract.shift, i64 0, metadata !34, metadata !35), !dbg !33
+  call void @llvm.dbg.value(metadata i32 %outer.sroa.1.12.extract.trunc, i64 0, metadata !34, metadata !35), !dbg !33
   call void @llvm.dbg.declare(metadata !{null}, metadata !34, metadata !35), !dbg !33
   ret i32 %outer.sroa.1.8.extract.trunc, !dbg !36
 }
@@ -67,40 +67,40 @@ attributes #2 = { nounwind }
 !llvm.module.flags = !{!21, !22}
 !llvm.ident = !{!23}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/sroasplit-2.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"sroasplit-2.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\0010\000\001\000\006\00256\000\0010", metadata !1, metadata !5, metadata !6, null, i32 (i64, i64)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/sroasplit-2.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !9}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x16\00Outer\008\000\000\000\000", metadata !1, null, metadata !10} ; [ DW_TAG_typedef ] [Outer] [line 8, size 0, align 0, offset 0] [from ]
-!10 = metadata !{metadata !"0x13\00\006\00128\0032\000\000\000", metadata !1, null, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [line 6, size 128, align 32, offset 0] [def] [from ]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0xd\00inner\007\00128\0032\000\000", metadata !1, metadata !10, metadata !13} ; [ DW_TAG_member ] [inner] [line 7, size 128, align 32, offset 0] [from ]
-!13 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, null, metadata !14, metadata !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from Inner]
-!14 = metadata !{metadata !"0x16\00Inner\004\000\000\000\000", metadata !1, null, metadata !15} ; [ DW_TAG_typedef ] [Inner] [line 4, size 0, align 0, offset 0] [from ]
-!15 = metadata !{metadata !"0x13\00\001\0064\0032\000\000\000", metadata !1, null, null, metadata !16, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 64, align 32, offset 0] [def] [from ]
-!16 = metadata !{metadata !17, metadata !18}
-!17 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !1, metadata !15, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!18 = metadata !{metadata !"0xd\00b\003\0032\0032\0032\000", metadata !1, metadata !15, metadata !8} ; [ DW_TAG_member ] [b] [line 3, size 32, align 32, offset 32] [from int]
-!19 = metadata !{metadata !20}
-!20 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
-!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!23 = metadata !{metadata !"clang version 3.5.0 "}
-!24 = metadata !{metadata !"0x101\00outer\0016777226\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
-!25 = metadata !{metadata !"0x102\00147\000\008"} ; [ DW_TAG_expression ] [DW_OP_piece 0 8] [piece, size 8, offset 0]
-!26 = metadata !{i32 10, i32 0, metadata !4, null}
-!27 = metadata !{metadata !"0x101\00outer\0016777226\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
-!28 = metadata !{metadata !"0x102\00147\008\008"} ; [ DW_TAG_expression ] [DW_OP_piece 8 8] [piece, size 8, offset 8]
-!29 = metadata !{metadata !"0x101\00outer\0016777226\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
-!30 = metadata !{metadata !"0x102\00147\0012\004"} ; [ DW_TAG_expression ] [DW_OP_piece 12 4] [piece, size 4, offset 12]
-!31 = metadata !{metadata !"0x101\00outer\0016777226\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
-!32 = metadata !{metadata !"0x102\00147\008\004"} ; [ DW_TAG_expression ] [DW_OP_piece 8 4] [piece, size 4, offset 8]
-!33 = metadata !{i32 11, i32 0, metadata !4, null}
-!34 = metadata !{metadata !"0x100\00i1\0011\000", metadata !4, metadata !5, metadata !14} ; [ DW_TAG_auto_variable ] [i1] [line 11]
-!35 = metadata !{metadata !"0x102\00147\000\004"} ; [ DW_TAG_expression ] [DW_OP_piece 0 4] [piece, size 4, offset 0]
-!36 = metadata !{i32 12, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/sroasplit-2.c] [DW_LANG_C99]
+!1 = !{!"sroasplit-2.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\0010\000\001\000\006\00256\000\0010", !1, !5, !6, null, i32 (i64, i64)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 10] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/sroasplit-2.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !9}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x16\00Outer\008\000\000\000\000", !1, null, !10} ; [ DW_TAG_typedef ] [Outer] [line 8, size 0, align 0, offset 0] [from ]
+!10 = !{!"0x13\00\006\00128\0032\000\000\000", !1, null, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [line 6, size 128, align 32, offset 0] [def] [from ]
+!11 = !{!12}
+!12 = !{!"0xd\00inner\007\00128\0032\000\000", !1, !10, !13} ; [ DW_TAG_member ] [inner] [line 7, size 128, align 32, offset 0] [from ]
+!13 = !{!"0x1\00\000\00128\0032\000\000", null, null, !14, !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from Inner]
+!14 = !{!"0x16\00Inner\004\000\000\000\000", !1, null, !15} ; [ DW_TAG_typedef ] [Inner] [line 4, size 0, align 0, offset 0] [from ]
+!15 = !{!"0x13\00\001\0064\0032\000\000\000", !1, null, null, !16, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 64, align 32, offset 0] [def] [from ]
+!16 = !{!17, !18}
+!17 = !{!"0xd\00a\002\0032\0032\000\000", !1, !15, !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!18 = !{!"0xd\00b\003\0032\0032\0032\000", !1, !15, !8} ; [ DW_TAG_member ] [b] [line 3, size 32, align 32, offset 32] [from int]
+!19 = !{!20}
+!20 = !{!"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
+!21 = !{i32 2, !"Dwarf Version", i32 2}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
+!23 = !{!"clang version 3.5.0 "}
+!24 = !{!"0x101\00outer\0016777226\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
+!25 = !{!"0x102\00157\000\0064"} ; [ DW_TAG_expression ] [DW_OP_bit_piece size=64, offset=0]
+!26 = !MDLocation(line: 10, scope: !4)
+!27 = !{!"0x101\00outer\0016777226\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
+!28 = !{!"0x102\00157\0064\0064"} ; [ DW_TAG_expression ] [DW_OP_bit_piece size=64, offset=64]
+!29 = !{!"0x101\00outer\0016777226\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
+!30 = !{!"0x102\00157\0096\0032"} ; [ DW_TAG_expression ] [DW_OP_bit_piece size=32, offset=96]
+!31 = !{!"0x101\00outer\0016777226\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
+!32 = !{!"0x102\00157\0064\0032"} ; [ DW_TAG_expression ] [DW_OP_bit_piece size=32, offset=64]
+!33 = !MDLocation(line: 11, scope: !4)
+!34 = !{!"0x100\00i1\0011\000", !4, !5, !14} ; [ DW_TAG_auto_variable ] [i1] [line 11]
+!35 = !{!"0x102\00157\000\0032"} ; [ DW_TAG_expression ] [DW_OP_bit_piece size=32, offset=0]
+!36 = !MDLocation(line: 12, scope: !4)
diff --git a/test/DebugInfo/X86/pointer-type-size.ll b/test/DebugInfo/X86/pointer-type-size.ll
index 1280181..a7f569d 100644
--- a/test/DebugInfo/X86/pointer-type-size.ll
+++ b/test/DebugInfo/X86/pointer-type-size.ll
@@ -11,16 +11,16 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!14}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 147882)\000\00\000\00\000", metadata !13, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x34\00crass\00crass\00\001\000\001", null, metadata !6, metadata !7, %struct.crass* @crass, null} ; [ DW_TAG_variable ]
-!6 = metadata !{metadata !"0x29", metadata !13} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x13\00crass\001\0064\0064\000\000\000", metadata !13, null, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [crass] [line 1, size 64, align 64, offset 0] [def] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0xd\00ptr\001\0064\0064\000\000", metadata !13, metadata !7, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !11} ; [ DW_TAG_const_type ]
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
-!13 = metadata !{metadata !"foo.c", metadata !"/Users/echristo/tmp"}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.1 (trunk 147882)\000\00\000\00\000", !13, !1, !1, !1, !3,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x34\00crass\00crass\00\001\000\001", null, !6, !7, %struct.crass* @crass, null} ; [ DW_TAG_variable ]
+!6 = !{!"0x29", !13} ; [ DW_TAG_file_type ]
+!7 = !{!"0x13\00crass\001\0064\0064\000\000\000", !13, null, null, !8, null, null, null} ; [ DW_TAG_structure_type ] [crass] [line 1, size 64, align 64, offset 0] [def] [from ]
+!8 = !{!9}
+!9 = !{!"0xd\00ptr\001\0064\0064\000\000", !13, !7, !10} ; [ DW_TAG_member ]
+!10 = !{!"0x26\00\000\000\000\000\000", null, null, !11} ; [ DW_TAG_const_type ]
+!11 = !{!"0xf\00\000\0064\0064\000\000", null, null, !12} ; [ DW_TAG_pointer_type ]
+!12 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
+!13 = !{!"foo.c", !"/Users/echristo/tmp"}
+!14 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/pr11300.ll b/test/DebugInfo/X86/pr11300.ll
index 4fdbbed..53e85ac 100644
--- a/test/DebugInfo/X86/pr11300.ll
+++ b/test/DebugInfo/X86/pr11300.ll
@@ -18,7 +18,7 @@ define void @_Z3zedP3foo(%struct.foo* %x) uwtable {
 entry:
   %x.addr = alloca %struct.foo*, align 8
   store %struct.foo* %x, %struct.foo** %x.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %x.addr}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.declare(metadata %struct.foo** %x.addr, metadata !23, metadata !{!"0x102"}), !dbg !24
   %0 = load %struct.foo** %x.addr, align 8, !dbg !25
   call void @_ZN3foo3barEv(%struct.foo* %0), !dbg !25
   ret void, !dbg !27
@@ -30,7 +30,7 @@ define linkonce_odr void @_ZN3foo3barEv(%struct.foo* %this) nounwind uwtable ali
 entry:
   %this.addr = alloca %struct.foo*, align 8
   store %struct.foo* %this, %struct.foo** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !28, metadata !{metadata !"0x102"}), !dbg !29
+  call void @llvm.dbg.declare(metadata %struct.foo** %this.addr, metadata !28, metadata !{!"0x102"}), !dbg !29
   %this1 = load %struct.foo** %this.addr
   ret void, !dbg !30
 }
@@ -38,33 +38,33 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.0 ()\000\00\000\00\000", metadata !32, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5, metadata !20}
-!5 = metadata !{metadata !"0x2e\00zed\00zed\00_Z3zedP3foo\004\000\001\000\006\00256\000\004", metadata !6, metadata !6, metadata !7, null, void (%struct.foo*)* @_Z3zedP3foo, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [zed]
-!6 = metadata !{metadata !"0x29", metadata !32} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9}
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{metadata !"0x2\00foo\001\008\008\000\000\000", metadata !32, null, null, metadata !11, null, null, null} ; [ DW_TAG_class_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEv\002\000\000\000\006\00256\000\002", metadata !6, metadata !10, metadata !13, null, null, null, i32 0, metadata !16} ; [ DW_TAG_subprogram ]
-!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{null, metadata !15}
-!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !10} ; [ DW_TAG_pointer_type ]
-!16 = metadata !{metadata !17}
-!17 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!18 = metadata !{metadata !19}
-!19 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!20 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEv\002\000\001\000\006\00256\000\002", metadata !6, null, metadata !13, null, void (%struct.foo*)* @_ZN3foo3barEv, null, metadata !12, null} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
-!23 = metadata !{metadata !"0x101\00x\0016777220\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
-!24 = metadata !{i32 4, i32 15, metadata !5, null}
-!25 = metadata !{i32 4, i32 20, metadata !26, null}
-!26 = metadata !{metadata !"0xb\004\0018\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{i32 4, i32 30, metadata !26, null}
-!28 = metadata !{metadata !"0x101\00this\0016777218\0064", metadata !20, metadata !6, metadata !15} ; [ DW_TAG_arg_variable ]
-!29 = metadata !{i32 2, i32 8, metadata !20, null}
-!30 = metadata !{i32 2, i32 15, metadata !31, null}
-!31 = metadata !{metadata !"0xb\002\0014\001", metadata !6, metadata !20} ; [ DW_TAG_lexical_block ]
-!32 = metadata !{metadata !"/home/espindola/llvm/test.cc", metadata !"/home/espindola/tmpfs/build"}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.0 ()\000\00\000\00\000", !32, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5, !20}
+!5 = !{!"0x2e\00zed\00zed\00_Z3zedP3foo\004\000\001\000\006\00256\000\004", !6, !6, !7, null, void (%struct.foo*)* @_Z3zedP3foo, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [zed]
+!6 = !{!"0x29", !32} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9}
+!9 = !{!"0xf\00\000\0064\0064\000\000", null, null, !10} ; [ DW_TAG_pointer_type ]
+!10 = !{!"0x2\00foo\001\008\008\000\000\000", !32, null, null, !11, null, null, null} ; [ DW_TAG_class_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!11 = !{!12}
+!12 = !{!"0x2e\00bar\00bar\00_ZN3foo3barEv\002\000\000\000\006\00256\000\002", !6, !10, !13, null, null, null, i32 0, !16} ; [ DW_TAG_subprogram ]
+!13 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{null, !15}
+!15 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !10} ; [ DW_TAG_pointer_type ]
+!16 = !{!17}
+!17 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!18 = !{!19}
+!19 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!20 = !{!"0x2e\00bar\00bar\00_ZN3foo3barEv\002\000\001\000\006\00256\000\002", !6, null, !13, null, void (%struct.foo*)* @_ZN3foo3barEv, null, !12, null} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
+!23 = !{!"0x101\00x\0016777220\000", !5, !6, !9} ; [ DW_TAG_arg_variable ]
+!24 = !MDLocation(line: 4, column: 15, scope: !5)
+!25 = !MDLocation(line: 4, column: 20, scope: !26)
+!26 = !{!"0xb\004\0018\000", !6, !5} ; [ DW_TAG_lexical_block ]
+!27 = !MDLocation(line: 4, column: 30, scope: !26)
+!28 = !{!"0x101\00this\0016777218\0064", !20, !6, !15} ; [ DW_TAG_arg_variable ]
+!29 = !MDLocation(line: 2, column: 8, scope: !20)
+!30 = !MDLocation(line: 2, column: 15, scope: !31)
+!31 = !{!"0xb\002\0014\001", !6, !20} ; [ DW_TAG_lexical_block ]
+!32 = !{!"/home/espindola/llvm/test.cc", !"/home/espindola/tmpfs/build"}
+!33 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/pr12831.ll b/test/DebugInfo/X86/pr12831.ll
index 3951bbd..5adaeaf 100644
--- a/test/DebugInfo/X86/pr12831.ll
+++ b/test/DebugInfo/X86/pr12831.ll
@@ -20,7 +20,7 @@ entry:
   %agg.tmp4 = alloca %class.function, align 1
   %agg.tmp5 = alloca %class.anon.0, align 1
   store %class.BPLFunctionWriter* %this, %class.BPLFunctionWriter** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.BPLFunctionWriter** %this.addr}, metadata !133, metadata !{metadata !"0x102"}), !dbg !135
+  call void @llvm.dbg.declare(metadata %class.BPLFunctionWriter** %this.addr, metadata !133, metadata !{!"0x102"}), !dbg !135
   %this1 = load %class.BPLFunctionWriter** %this.addr
   %MW = getelementptr inbounds %class.BPLFunctionWriter* %this1, i32 0, i32 0, !dbg !136
   %0 = load %struct.BPLModuleWriter** %MW, align 8, !dbg !136
@@ -42,8 +42,8 @@ entry:
   %this.addr = alloca %class.function*, align 8
   %__f = alloca %class.anon.0, align 1
   store %class.function* %this, %class.function** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.function** %this.addr}, metadata !140, metadata !{metadata !"0x102"}), !dbg !142
-  call void @llvm.dbg.declare(metadata !{%class.anon.0* %__f}, metadata !143, metadata !{metadata !"0x102"}), !dbg !144
+  call void @llvm.dbg.declare(metadata %class.function** %this.addr, metadata !140, metadata !{!"0x102"}), !dbg !142
+  call void @llvm.dbg.declare(metadata %class.anon.0* %__f, metadata !143, metadata !{!"0x102"}), !dbg !144
   %this1 = load %class.function** %this.addr
   call void @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_"(%class.anon.0* %__f), !dbg !145
   ret void, !dbg !147
@@ -61,8 +61,8 @@ entry:
   %this.addr = alloca %class.function*, align 8
   %__f = alloca %class.anon, align 1
   store %class.function* %this, %class.function** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.function** %this.addr}, metadata !150, metadata !{metadata !"0x102"}), !dbg !151
-  call void @llvm.dbg.declare(metadata !{%class.anon* %__f}, metadata !152, metadata !{metadata !"0x102"}), !dbg !153
+  call void @llvm.dbg.declare(metadata %class.function** %this.addr, metadata !150, metadata !{!"0x102"}), !dbg !151
+  call void @llvm.dbg.declare(metadata %class.anon* %__f, metadata !152, metadata !{!"0x102"}), !dbg !153
   %this1 = load %class.function** %this.addr
   call void @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_"(%class.anon* %__f), !dbg !154
   ret void, !dbg !156
@@ -78,163 +78,163 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!162}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.2 \000\00\000\00\000", metadata !161, metadata !1, metadata !1, metadata !3, metadata !128, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5, metadata !106, metadata !107, metadata !126, metadata !127}
-!5 = metadata !{metadata !"0x2e\00writeExpr\00writeExpr\00_ZN17BPLFunctionWriter9writeExprEv\0019\000\001\000\006\00256\000\0019", metadata !6, null, metadata !7, null, void (%class.BPLFunctionWriter*)* @_ZN17BPLFunctionWriter9writeExprEv, null, metadata !103, metadata !1} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !160} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9}
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{metadata !"0x2\00BPLFunctionWriter\0015\0064\0064\000\000\000", metadata !160, null, null, metadata !11, null, null, null} ; [ DW_TAG_class_type ] [BPLFunctionWriter] [line 15, size 64, align 64, offset 0] [def] [from ]
-!11 = metadata !{metadata !12, metadata !103}
-!12 = metadata !{metadata !"0xd\00MW\0016\0064\0064\000\001", metadata !160, metadata !10, metadata !13} ; [ DW_TAG_member ]
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{metadata !"0x2\00BPLModuleWriter\0012\008\008\000\000\000", metadata !160, null, null, metadata !15, null, null, null} ; [ DW_TAG_class_type ] [BPLModuleWriter] [line 12, size 8, align 8, offset 0] [def] [from ]
-!15 = metadata !{metadata !16}
-!16 = metadata !{metadata !"0x2e\00writeIntrinsic\00writeIntrinsic\00_ZN15BPLModuleWriter14writeIntrinsicE8functionIFvvEE\0013\000\000\000\006\00256\000\0013", metadata !6, metadata !14, metadata !17, null, null, null, i32 0, metadata !101} ; [ DW_TAG_subprogram ]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{null, metadata !19, metadata !20}
-!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !14} ; [ DW_TAG_pointer_type ]
-!20 = metadata !{metadata !"0x2\00function<void ()>\006\008\008\000\000\000", metadata !160, null, null, metadata !21, null, metadata !97, null} ; [ DW_TAG_class_type ] [function<void ()>] [line 6, size 8, align 8, offset 0] [def] [from ]
-!21 = metadata !{metadata !22, metadata !51, metadata !58, metadata !86, metadata !92}
-!22 = metadata !{metadata !"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00\008\000\000\000\006\00256\000\008", metadata !6, metadata !20, metadata !23, null, null, metadata !47, i32 0, metadata !49} ; [ DW_TAG_subprogram ]
-!23 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !24, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!24 = metadata !{null, metadata !25, metadata !26}
-!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !20} ; [ DW_TAG_pointer_type ]
-!26 = metadata !{metadata !"0x2\00\0020\008\008\000\000\000", metadata !160, metadata !5, null, metadata !27, null, null, null} ; [ DW_TAG_class_type ] [line 20, size 8, align 8, offset 0] [def] [from ]
-!27 = metadata !{metadata !28, metadata !35, metadata !41}
-!28 = metadata !{metadata !"0x2e\00operator()\00operator()\00\0020\000\000\000\006\00256\000\0020", metadata !6, metadata !26, metadata !29, null, null, null, i32 0, metadata !33} ; [ DW_TAG_subprogram ]
-!29 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !30, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!30 = metadata !{null, metadata !31}
-!31 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !32} ; [ DW_TAG_pointer_type ]
-!32 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !26} ; [ DW_TAG_const_type ]
-!33 = metadata !{metadata !34}
-!34 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!35 = metadata !{metadata !"0x2e\00~\00~\00\0020\000\000\000\006\00320\000\0020", metadata !6, metadata !26, metadata !36, null, null, null, i32 0, metadata !39} ; [ DW_TAG_subprogram ]
-!36 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !37, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!37 = metadata !{null, metadata !38}
-!38 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !26} ; [ DW_TAG_pointer_type ]
-!39 = metadata !{metadata !40}
-!40 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!41 = metadata !{metadata !"0x2e\00\00\00\0020\000\000\000\006\00320\000\0020", metadata !6, metadata !26, metadata !42, null, null, null, i32 0, metadata !45} ; [ DW_TAG_subprogram ]
-!42 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !43, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!43 = metadata !{null, metadata !38, metadata !44}
-!44 = metadata !{metadata !"0x42\00\000\000\000\000\000", null, null, metadata !26} ; [ DW_TAG_rvalue_reference_type ]
-!45 = metadata !{metadata !46}
-!46 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!47 = metadata !{metadata !48}
-!48 = metadata !{metadata !"0x2f\00_Functor\000\000", null, metadata !26, null} ; [ DW_TAG_template_type_parameter ]
-!49 = metadata !{metadata !50}
-!50 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!51 = metadata !{metadata !"0x2e\00function<function<void ()> >\00function<function<void ()> >\00\008\000\000\000\006\00256\000\008", metadata !6, metadata !20, metadata !52, null, null, metadata !54, i32 0, metadata !56} ; [ DW_TAG_subprogram ]
-!52 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !53, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!53 = metadata !{null, metadata !25, metadata !20}
-!54 = metadata !{metadata !55}
-!55 = metadata !{metadata !"0x2f\00_Functor\000\000", null, metadata !20, null} ; [ DW_TAG_template_type_parameter ]
-!56 = metadata !{metadata !57}
-!57 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!58 = metadata !{metadata !"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00\008\000\000\000\006\00256\000\008", metadata !6, metadata !20, metadata !59, null, null, metadata !82, i32 0, metadata !84} ; [ DW_TAG_subprogram ]
-!59 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !60, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!60 = metadata !{null, metadata !25, metadata !61}
-!61 = metadata !{metadata !"0x2\00\0023\008\008\000\000\000", metadata !160, metadata !5, null, metadata !62, null, null, null} ; [ DW_TAG_class_type ] [line 23, size 8, align 8, offset 0] [def] [from ]
-!62 = metadata !{metadata !63, metadata !70, metadata !76}
-!63 = metadata !{metadata !"0x2e\00operator()\00operator()\00\0023\000\000\000\006\00256\000\0023", metadata !6, metadata !61, metadata !64, null, null, null, i32 0, metadata !68} ; [ DW_TAG_subprogram ]
-!64 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !65, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!65 = metadata !{null, metadata !66}
-!66 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !67} ; [ DW_TAG_pointer_type ]
-!67 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !61} ; [ DW_TAG_const_type ]
-!68 = metadata !{metadata !69}
-!69 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!70 = metadata !{metadata !"0x2e\00~\00~\00\0023\000\000\000\006\00320\000\0023", metadata !6, metadata !61, metadata !71, null, null, null, i32 0, metadata !74} ; [ DW_TAG_subprogram ]
-!71 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !72, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!72 = metadata !{null, metadata !73}
-!73 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !61} ; [ DW_TAG_pointer_type ]
-!74 = metadata !{metadata !75}
-!75 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!76 = metadata !{metadata !"0x2e\00\00\00\0023\000\000\000\006\00320\000\0023", metadata !6, metadata !61, metadata !77, null, null, null, i32 0, metadata !80} ; [ DW_TAG_subprogram ]
-!77 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !78, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!78 = metadata !{null, metadata !73, metadata !79}
-!79 = metadata !{metadata !"0x42\00\000\000\000\000\000", null, null, metadata !61} ; [ DW_TAG_rvalue_reference_type ]
-!80 = metadata !{metadata !81}
-!81 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!82 = metadata !{metadata !83}
-!83 = metadata !{metadata !"0x2f\00_Functor\000\000", null, metadata !61, null} ; [ DW_TAG_template_type_parameter ]
-!84 = metadata !{metadata !85}
-!85 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!86 = metadata !{metadata !"0x2e\00function\00function\00\006\000\000\000\006\00320\000\006", metadata !6, metadata !20, metadata !87, null, null, null, i32 0, metadata !90} ; [ DW_TAG_subprogram ]
-!87 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !88, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!88 = metadata !{null, metadata !25, metadata !89}
-!89 = metadata !{metadata !"0x42\00\000\000\000\000\000", null, null, metadata !20} ; [ DW_TAG_rvalue_reference_type ]
-!90 = metadata !{metadata !91}
-!91 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!92 = metadata !{metadata !"0x2e\00~function\00~function\00\006\000\000\000\006\00320\000\006", metadata !6, metadata !20, metadata !93, null, null, null, i32 0, metadata !95} ; [ DW_TAG_subprogram ]
-!93 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !94, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!94 = metadata !{null, metadata !25}
-!95 = metadata !{metadata !96}
-!96 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!97 = metadata !{metadata !98}
-!98 = metadata !{metadata !"0x2f\00T\000\000", null, metadata !99, null} ; [ DW_TAG_template_type_parameter ]
-!99 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !100, i32 0} ; [ DW_TAG_subroutine_type ]
-!100 = metadata !{null}
-!101 = metadata !{metadata !102}
-!102 = metadata !{metadata !"0x24"}                     ; [ DW_TAG_base_type ]
-!103 = metadata !{metadata !"0x2e\00writeExpr\00writeExpr\00_ZN17BPLFunctionWriter9writeExprEv\0017\000\000\000\006\00257\000\0017", metadata !6, metadata !10, metadata !7, null, null, null, i32 0, metadata !104} ; [ DW_TAG_subprogram ]
-!104 = metadata !{metadata !105}
-!105 = metadata !{metadata !"0x24"}                     ; [ DW_TAG_base_type ]
-!106 = metadata !{metadata !"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_\008\001\001\000\006\00256\000\008", metadata !6, null, metadata !59, null, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_", metadata !82, metadata !58, metadata !1} ; [ DW_TAG_subprogram ]
-!107 = metadata !{metadata !"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_\003\001\001\000\006\00256\000\003", metadata !6, null, metadata !108, null, void (%class.anon.0*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", metadata !111, metadata !113, metadata !1} ; [ DW_TAG_subprogram ]
-!108 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !109, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!109 = metadata !{null, metadata !110}
-!110 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !61} ; [ DW_TAG_reference_type ]
-!111 = metadata !{metadata !112}
-!112 = metadata !{metadata !"0x2f\00_Tp\000\000", null, metadata !61, null} ; [ DW_TAG_template_type_parameter ]
-!113 = metadata !{metadata !"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_\003\000\000\000\006\00256\000\003", metadata !6, metadata !114, metadata !108, null, null, metadata !111, i32 0, metadata !124} ; [ DW_TAG_subprogram ]
-!114 = metadata !{metadata !"0x2\00_Base_manager\001\008\008\000\000\000", metadata !160, null, null, metadata !115, null, null, null} ; [ DW_TAG_class_type ] [_Base_manager] [line 1, size 8, align 8, offset 0] [def] [from ]
-!115 = metadata !{metadata !116, metadata !113}
-!116 = metadata !{metadata !"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_\003\000\000\000\006\00256\000\003", metadata !6, metadata !114, metadata !117, null, null, metadata !120, i32 0, metadata !122} ; [ DW_TAG_subprogram ]
-!117 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !118, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!118 = metadata !{null, metadata !119}
-!119 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !26} ; [ DW_TAG_reference_type ]
-!120 = metadata !{metadata !121}
-!121 = metadata !{metadata !"0x2f\00_Tp\000\000", null, metadata !26, null} ; [ DW_TAG_template_type_parameter ]
-!122 = metadata !{metadata !123}
-!123 = metadata !{metadata !"0x24"}                     ; [ DW_TAG_base_type ]
-!124 = metadata !{metadata !125}
-!125 = metadata !{metadata !"0x24"}                     ; [ DW_TAG_base_type ]
-!126 = metadata !{metadata !"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_\008\001\001\000\006\00256\000\008", metadata !6, null, metadata !23, null, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_", metadata !47, metadata !22, metadata !1} ; [ DW_TAG_subprogram ]
-!127 = metadata !{metadata !"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_\003\001\001\000\006\00256\000\003", metadata !6, null, metadata !117, null, void (%class.anon*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", metadata !120, metadata !116, metadata !1} ; [ DW_TAG_subprogram ]
-!128 = metadata !{metadata !130}
-!130 = metadata !{metadata !"0x34\00__stored_locally\00__stored_locally\00__stored_locally\002\001\001", metadata !114, metadata !6, metadata !131, i1 1, null} ; [ DW_TAG_variable ]
-!131 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !132} ; [ DW_TAG_const_type ]
-!132 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ]
-!133 = metadata !{metadata !"0x101\00this\0016777235\0064", metadata !5, metadata !6, metadata !134} ; [ DW_TAG_arg_variable ]
-!134 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ]
-!135 = metadata !{i32 19, i32 39, metadata !5, null}
-!136 = metadata !{i32 20, i32 17, metadata !137, null}
-!137 = metadata !{metadata !"0xb\0019\0051\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ]
-!138 = metadata !{i32 23, i32 17, metadata !137, null}
-!139 = metadata !{i32 26, i32 15, metadata !137, null}
-!140 = metadata !{metadata !"0x101\00this\0016777224\0064", metadata !106, metadata !6, metadata !141} ; [ DW_TAG_arg_variable ]
-!141 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !20} ; [ DW_TAG_pointer_type ]
-!142 = metadata !{i32 8, i32 45, metadata !106, null}
-!143 = metadata !{metadata !"0x101\00__f\0033554440\000", metadata !106, metadata !6, metadata !61} ; [ DW_TAG_arg_variable ]
-!144 = metadata !{i32 8, i32 63, metadata !106, null}
-!145 = metadata !{i32 9, i32 9, metadata !146, null}
-!146 = metadata !{metadata !"0xb\008\0081\001", metadata !6, metadata !106} ; [ DW_TAG_lexical_block ]
-!147 = metadata !{i32 10, i32 13, metadata !146, null}
-!148 = metadata !{i32 4, i32 5, metadata !149, null}
-!149 = metadata !{metadata !"0xb\003\00105\002", metadata !6, metadata !107} ; [ DW_TAG_lexical_block ]
-!150 = metadata !{metadata !"0x101\00this\0016777224\0064", metadata !126, metadata !6, metadata !141} ; [ DW_TAG_arg_variable ]
-!151 = metadata !{i32 8, i32 45, metadata !126, null}
-!152 = metadata !{metadata !"0x101\00__f\0033554440\000", metadata !126, metadata !6, metadata !26} ; [ DW_TAG_arg_variable ]
-!153 = metadata !{i32 8, i32 63, metadata !126, null}
-!154 = metadata !{i32 9, i32 9, metadata !155, null}
-!155 = metadata !{metadata !"0xb\008\0081\003", metadata !6, metadata !126} ; [ DW_TAG_lexical_block ]
-!156 = metadata !{i32 10, i32 13, metadata !155, null}
-!157 = metadata !{i32 4, i32 5, metadata !158, null}
-!158 = metadata !{metadata !"0xb\003\00105\004", metadata !6, metadata !127} ; [ DW_TAG_lexical_block ]
-!159 = metadata !{metadata !"0x29", metadata !161} ; [ DW_TAG_file_type ]
-!160 = metadata !{metadata !"BPLFunctionWriter2.ii", metadata !"/home/peter/crashdelta"}
-!161 = metadata !{metadata !"BPLFunctionWriter.cpp", metadata !"/home/peter/crashdelta"}
-!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.2 \000\00\000\00\000", !161, !1, !1, !3, !128, null} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5, !106, !107, !126, !127}
+!5 = !{!"0x2e\00writeExpr\00writeExpr\00_ZN17BPLFunctionWriter9writeExprEv\0019\000\001\000\006\00256\000\0019", !6, null, !7, null, void (%class.BPLFunctionWriter*)* @_ZN17BPLFunctionWriter9writeExprEv, null, !103, !1} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !160} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9}
+!9 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !10} ; [ DW_TAG_pointer_type ]
+!10 = !{!"0x2\00BPLFunctionWriter\0015\0064\0064\000\000\000", !160, null, null, !11, null, null, null} ; [ DW_TAG_class_type ] [BPLFunctionWriter] [line 15, size 64, align 64, offset 0] [def] [from ]
+!11 = !{!12, !103}
+!12 = !{!"0xd\00MW\0016\0064\0064\000\001", !160, !10, !13} ; [ DW_TAG_member ]
+!13 = !{!"0xf\00\000\0064\0064\000\000", null, null, !14} ; [ DW_TAG_pointer_type ]
+!14 = !{!"0x2\00BPLModuleWriter\0012\008\008\000\000\000", !160, null, null, !15, null, null, null} ; [ DW_TAG_class_type ] [BPLModuleWriter] [line 12, size 8, align 8, offset 0] [def] [from ]
+!15 = !{!16}
+!16 = !{!"0x2e\00writeIntrinsic\00writeIntrinsic\00_ZN15BPLModuleWriter14writeIntrinsicE8functionIFvvEE\0013\000\000\000\006\00256\000\0013", !6, !14, !17, null, null, null, i32 0, !101} ; [ DW_TAG_subprogram ]
+!17 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{null, !19, !20}
+!19 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !14} ; [ DW_TAG_pointer_type ]
+!20 = !{!"0x2\00function<void ()>\006\008\008\000\000\000", !160, null, null, !21, null, !97, null} ; [ DW_TAG_class_type ] [function<void ()>] [line 6, size 8, align 8, offset 0] [def] [from ]
+!21 = !{!22, !51, !58, !86, !92}
+!22 = !{!"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00\008\000\000\000\006\00256\000\008", !6, !20, !23, null, null, !47, i32 0, !49} ; [ DW_TAG_subprogram ]
+!23 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !24, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = !{null, !25, !26}
+!25 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !20} ; [ DW_TAG_pointer_type ]
+!26 = !{!"0x2\00\0020\008\008\000\000\000", !160, !5, null, !27, null, null, null} ; [ DW_TAG_class_type ] [line 20, size 8, align 8, offset 0] [def] [from ]
+!27 = !{!28, !35, !41}
+!28 = !{!"0x2e\00operator()\00operator()\00\0020\000\000\000\006\00256\000\0020", !6, !26, !29, null, null, null, i32 0, !33} ; [ DW_TAG_subprogram ]
+!29 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !30, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = !{null, !31}
+!31 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !32} ; [ DW_TAG_pointer_type ]
+!32 = !{!"0x26\00\000\000\000\000\000", null, null, !26} ; [ DW_TAG_const_type ]
+!33 = !{!34}
+!34 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!35 = !{!"0x2e\00~\00~\00\0020\000\000\000\006\00320\000\0020", !6, !26, !36, null, null, null, i32 0, !39} ; [ DW_TAG_subprogram ]
+!36 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !37, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!37 = !{null, !38}
+!38 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !26} ; [ DW_TAG_pointer_type ]
+!39 = !{!40}
+!40 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!41 = !{!"0x2e\00\00\00\0020\000\000\000\006\00320\000\0020", !6, !26, !42, null, null, null, i32 0, !45} ; [ DW_TAG_subprogram ]
+!42 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !43, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!43 = !{null, !38, !44}
+!44 = !{!"0x42\00\000\000\000\000\000", null, null, !26} ; [ DW_TAG_rvalue_reference_type ]
+!45 = !{!46}
+!46 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!47 = !{!48}
+!48 = !{!"0x2f\00_Functor\000\000", null, !26, null} ; [ DW_TAG_template_type_parameter ]
+!49 = !{!50}
+!50 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!51 = !{!"0x2e\00function<function<void ()> >\00function<function<void ()> >\00\008\000\000\000\006\00256\000\008", !6, !20, !52, null, null, !54, i32 0, !56} ; [ DW_TAG_subprogram ]
+!52 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !53, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!53 = !{null, !25, !20}
+!54 = !{!55}
+!55 = !{!"0x2f\00_Functor\000\000", null, !20, null} ; [ DW_TAG_template_type_parameter ]
+!56 = !{!57}
+!57 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!58 = !{!"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00\008\000\000\000\006\00256\000\008", !6, !20, !59, null, null, !82, i32 0, !84} ; [ DW_TAG_subprogram ]
+!59 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !60, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!60 = !{null, !25, !61}
+!61 = !{!"0x2\00\0023\008\008\000\000\000", !160, !5, null, !62, null, null, null} ; [ DW_TAG_class_type ] [line 23, size 8, align 8, offset 0] [def] [from ]
+!62 = !{!63, !70, !76}
+!63 = !{!"0x2e\00operator()\00operator()\00\0023\000\000\000\006\00256\000\0023", !6, !61, !64, null, null, null, i32 0, !68} ; [ DW_TAG_subprogram ]
+!64 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !65, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!65 = !{null, !66}
+!66 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !67} ; [ DW_TAG_pointer_type ]
+!67 = !{!"0x26\00\000\000\000\000\000", null, null, !61} ; [ DW_TAG_const_type ]
+!68 = !{!69}
+!69 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!70 = !{!"0x2e\00~\00~\00\0023\000\000\000\006\00320\000\0023", !6, !61, !71, null, null, null, i32 0, !74} ; [ DW_TAG_subprogram ]
+!71 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !72, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!72 = !{null, !73}
+!73 = !{!"0xf\00\000\0064\0064\000\0064", i32 0, null, !61} ; [ DW_TAG_pointer_type ]
+!74 = !{!75}
+!75 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!76 = !{!"0x2e\00\00\00\0023\000\000\000\006\00320\000\0023", !6, !61, !77, null, null, null, i32 0, !80} ; [ DW_TAG_subprogram ]
+!77 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !78, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!78 = !{null, !73, !79}
+!79 = !{!"0x42\00\000\000\000\000\000", null, null, !61} ; [ DW_TAG_rvalue_reference_type ]
+!80 = !{!81}
+!81 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!82 = !{!83}
+!83 = !{!"0x2f\00_Functor\000\000", null, !61, null} ; [ DW_TAG_template_type_parameter ]
+!84 = !{!85}
+!85 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!86 = !{!"0x2e\00function\00function\00\006\000\000\000\006\00320\000\006", !6, !20, !87, null, null, null, i32 0, !90} ; [ DW_TAG_subprogram ]
+!87 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !88, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!88 = !{null, !25, !89}
+!89 = !{!"0x42\00\000\000\000\000\000", null, null, !20} ; [ DW_TAG_rvalue_reference_type ]
+!90 = !{!91}
+!91 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!92 = !{!"0x2e\00~function\00~function\00\006\000\000\000\006\00320\000\006", !6, !20, !93, null, null, null, i32 0, !95} ; [ DW_TAG_subprogram ]
+!93 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !94, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!94 = !{null, !25}
+!95 = !{!96}
+!96 = !{!"0x24"}                      ; [ DW_TAG_base_type ]
+!97 = !{!98}
+!98 = !{!"0x2f\00T\000\000", null, !99, null} ; [ DW_TAG_template_type_parameter ]
+!99 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !100, i32 0} ; [ DW_TAG_subroutine_type ]
+!100 = !{null}
+!101 = !{!102}
+!102 = !{!"0x24"}                     ; [ DW_TAG_base_type ]
+!103 = !{!"0x2e\00writeExpr\00writeExpr\00_ZN17BPLFunctionWriter9writeExprEv\0017\000\000\000\006\00257\000\0017", !6, !10, !7, null, null, null, i32 0, !104} ; [ DW_TAG_subprogram ]
+!104 = !{!105}
+!105 = !{!"0x24"}                     ; [ DW_TAG_base_type ]
+!106 = !{!"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_\008\001\001\000\006\00256\000\008", !6, null, !59, null, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_", !82, !58, !1} ; [ DW_TAG_subprogram ]
+!107 = !{!"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_\003\001\001\000\006\00256\000\003", !6, null, !108, null, void (%class.anon.0*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", !111, !113, !1} ; [ DW_TAG_subprogram ]
+!108 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !109, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!109 = !{null, !110}
+!110 = !{!"0x10\00\000\000\000\000\000", null, null, !61} ; [ DW_TAG_reference_type ]
+!111 = !{!112}
+!112 = !{!"0x2f\00_Tp\000\000", null, !61, null} ; [ DW_TAG_template_type_parameter ]
+!113 = !{!"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_\003\000\000\000\006\00256\000\003", !6, !114, !108, null, null, !111, i32 0, !124} ; [ DW_TAG_subprogram ]
+!114 = !{!"0x2\00_Base_manager\001\008\008\000\000\000", !160, null, null, !115, null, null, null} ; [ DW_TAG_class_type ] [_Base_manager] [line 1, size 8, align 8, offset 0] [def] [from ]
+!115 = !{!116, !113}
+!116 = !{!"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_\003\000\000\000\006\00256\000\003", !6, !114, !117, null, null, !120, i32 0, !122} ; [ DW_TAG_subprogram ]
+!117 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !118, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!118 = !{null, !119}
+!119 = !{!"0x10\00\000\000\000\000\000", null, null, !26} ; [ DW_TAG_reference_type ]
+!120 = !{!121}
+!121 = !{!"0x2f\00_Tp\000\000", null, !26, null} ; [ DW_TAG_template_type_parameter ]
+!122 = !{!123}
+!123 = !{!"0x24"}                     ; [ DW_TAG_base_type ]
+!124 = !{!125}
+!125 = !{!"0x24"}                     ; [ DW_TAG_base_type ]
+!126 = !{!"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_\008\001\001\000\006\00256\000\008", !6, null, !23, null, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_", !47, !22, !1} ; [ DW_TAG_subprogram ]
+!127 = !{!"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_\003\001\001\000\006\00256\000\003", !6, null, !117, null, void (%class.anon*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", !120, !116, !1} ; [ DW_TAG_subprogram ]
+!128 = !{!130}
+!130 = !{!"0x34\00__stored_locally\00__stored_locally\00__stored_locally\002\001\001", !114, !6, !131, i1 1, null} ; [ DW_TAG_variable ]
+!131 = !{!"0x26\00\000\000\000\000\000", null, null, !132} ; [ DW_TAG_const_type ]
+!132 = !{!"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ]
+!133 = !{!"0x101\00this\0016777235\0064", !5, !6, !134} ; [ DW_TAG_arg_variable ]
+!134 = !{!"0xf\00\000\0064\0064\000\000", null, null, !10} ; [ DW_TAG_pointer_type ]
+!135 = !MDLocation(line: 19, column: 39, scope: !5)
+!136 = !MDLocation(line: 20, column: 17, scope: !137)
+!137 = !{!"0xb\0019\0051\000", !6, !5} ; [ DW_TAG_lexical_block ]
+!138 = !MDLocation(line: 23, column: 17, scope: !137)
+!139 = !MDLocation(line: 26, column: 15, scope: !137)
+!140 = !{!"0x101\00this\0016777224\0064", !106, !6, !141} ; [ DW_TAG_arg_variable ]
+!141 = !{!"0xf\00\000\0064\0064\000\000", null, null, !20} ; [ DW_TAG_pointer_type ]
+!142 = !MDLocation(line: 8, column: 45, scope: !106)
+!143 = !{!"0x101\00__f\0033554440\000", !106, !6, !61} ; [ DW_TAG_arg_variable ]
+!144 = !MDLocation(line: 8, column: 63, scope: !106)
+!145 = !MDLocation(line: 9, column: 9, scope: !146)
+!146 = !{!"0xb\008\0081\001", !6, !106} ; [ DW_TAG_lexical_block ]
+!147 = !MDLocation(line: 10, column: 13, scope: !146)
+!148 = !MDLocation(line: 4, column: 5, scope: !149)
+!149 = !{!"0xb\003\00105\002", !6, !107} ; [ DW_TAG_lexical_block ]
+!150 = !{!"0x101\00this\0016777224\0064", !126, !6, !141} ; [ DW_TAG_arg_variable ]
+!151 = !MDLocation(line: 8, column: 45, scope: !126)
+!152 = !{!"0x101\00__f\0033554440\000", !126, !6, !26} ; [ DW_TAG_arg_variable ]
+!153 = !MDLocation(line: 8, column: 63, scope: !126)
+!154 = !MDLocation(line: 9, column: 9, scope: !155)
+!155 = !{!"0xb\008\0081\003", !6, !126} ; [ DW_TAG_lexical_block ]
+!156 = !MDLocation(line: 10, column: 13, scope: !155)
+!157 = !MDLocation(line: 4, column: 5, scope: !158)
+!158 = !{!"0xb\003\00105\004", !6, !127} ; [ DW_TAG_lexical_block ]
+!159 = !{!"0x29", !161} ; [ DW_TAG_file_type ]
+!160 = !{!"BPLFunctionWriter2.ii", !"/home/peter/crashdelta"}
+!161 = !{!"BPLFunctionWriter.cpp", !"/home/peter/crashdelta"}
+!162 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/pr13303.ll b/test/DebugInfo/X86/pr13303.ll
index 0b417bf..2ca697a 100644
--- a/test/DebugInfo/X86/pr13303.ll
+++ b/test/DebugInfo/X86/pr13303.ll
@@ -15,15 +15,15 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.2 (trunk 160143)\000\00\000\00\000", metadata !12, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/home/probinson/PR13303.c] [DW_LANG_C99]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\000\000\001", metadata !12, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
-!6 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 1, i32 14, metadata !11, null}
-!11 = metadata !{metadata !"0xb\001\0012\000", metadata !12, metadata !5} ; [ DW_TAG_lexical_block ] [/home/probinson/PR13303.c]
-!12 = metadata !{metadata !"PR13303.c", metadata !"/home/probinson"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.2 (trunk 160143)\000\00\000\00\000", !12, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ] [/home/probinson/PR13303.c] [DW_LANG_C99]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00main\00main\00\001\000\001\000\006\000\000\001", !12, !6, !7, null, i32 ()* @main, null, null, !1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
+!6 = !{!"0x29", !12} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !MDLocation(line: 1, column: 14, scope: !11)
+!11 = !{!"0xb\001\0012\000", !12, !5} ; [ DW_TAG_lexical_block ] [/home/probinson/PR13303.c]
+!12 = !{!"PR13303.c", !"/home/probinson"}
+!13 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/pr19307.ll b/test/DebugInfo/X86/pr19307.ll
index 4223cb7..38d8050 100644
--- a/test/DebugInfo/X86/pr19307.ll
+++ b/test/DebugInfo/X86/pr19307.ll
@@ -40,10 +40,10 @@ entry:
   %offset.addr = alloca i64*, align 8
   %limit.addr = alloca i64*, align 8
   store i64* %offset, i64** %offset.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64** %offset.addr}, metadata !45, metadata !{metadata !"0x102"}), !dbg !46
+  call void @llvm.dbg.declare(metadata i64** %offset.addr, metadata !45, metadata !{!"0x102"}), !dbg !46
   store i64* %limit, i64** %limit.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64** %limit.addr}, metadata !47, metadata !{metadata !"0x102"}), !dbg !46
-  call void @llvm.dbg.declare(metadata !{%"class.std::basic_string"* %range}, metadata !48, metadata !{metadata !"0x102"}), !dbg !49
+  call void @llvm.dbg.declare(metadata i64** %limit.addr, metadata !47, metadata !{!"0x102"}), !dbg !46
+  call void @llvm.dbg.declare(metadata %"class.std::basic_string"* %range, metadata !48, metadata !{!"0x102\006"}), !dbg !49
   %call = call i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"* %range, i64 0, i64 6, i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0)), !dbg !50
   %cmp = icmp ne i32 %call, 0, !dbg !50
   br i1 %cmp, label %if.then, label %lor.lhs.false, !dbg !50
@@ -84,62 +84,62 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.module.flags = !{!42, !43}
 !llvm.ident = !{!44}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (209308)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !12, metadata !2, metadata !21} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/pr19307.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"pr19307.cc", metadata !"/llvm_cmake_gcc"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !6, metadata !8}
-!4 = metadata !{metadata !"0x13\00\0083\000\000\000\004\000", metadata !5, null, null, null, null, null, metadata !"_ZTS11__mbstate_t"} ; [ DW_TAG_structure_type ] [line 83, size 0, align 0, offset 0] [decl] [from ]
-!5 = metadata !{metadata !"/usr/include/wchar.h", metadata !"/llvm_cmake_gcc"}
-!6 = metadata !{metadata !"0x13\00lconv\0054\000\000\000\004\000", metadata !7, null, null, null, null, null, metadata !"_ZTS5lconv"} ; [ DW_TAG_structure_type ] [lconv] [line 54, size 0, align 0, offset 0] [decl] [from ]
-!7 = metadata !{metadata !"/usr/include/locale.h", metadata !"/llvm_cmake_gcc"}
-!8 = metadata !{metadata !"0x2\00basic_string<char, std::char_traits<char>, std::allocator<char> >\001134\000\000\000\004\000", metadata !9, metadata !10, null, null, null, null, metadata !"_ZTSSs"} ; [ DW_TAG_class_type ] [basic_string<char, std::char_traits<char>, std::allocator<char> >] [line 1134, size 0, align 0, offset 0] [decl] [from ]
-!9 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/basic_string.tcc", metadata !"/llvm_cmake_gcc"}
-!10 = metadata !{metadata !"0x39\00std\00153", metadata !11, null} ; [ DW_TAG_namespace ] [std] [line 153]
-!11 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/x86_64-linux-gnu/bits/c++config.h", metadata !"/llvm_cmake_gcc"}
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x2e\00parse_range\00parse_range\00_Z11parse_rangeRyS_Ss\003\000\001\000\006\00256\000\004", metadata !1, metadata !14, metadata !15, null, void (i64*, i64*, %"class.std::basic_string"*)* @_Z11parse_rangeRyS_Ss, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [parse_range]
-!14 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/pr19307.cc]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null, metadata !17, metadata !17, metadata !19}
-!17 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !18} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
-!18 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!19 = metadata !{metadata !"0x16\00string\0065\000\000\000\000", metadata !20, metadata !10, metadata !"_ZTSSs"} ; [ DW_TAG_typedef ] [string] [line 65, size 0, align 0, offset 0] [from _ZTSSs]
-!20 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/stringfwd.h", metadata !"/llvm_cmake_gcc"}
-!21 = metadata !{metadata !22, metadata !26, metadata !29, metadata !33, metadata !38, metadata !41}
-!22 = metadata !{metadata !"0x3a\0057\00", metadata !23, metadata !25} ; [ DW_TAG_imported_module ]
-!23 = metadata !{metadata !"0x39\00__gnu_debug\0055", metadata !24, null} ; [ DW_TAG_namespace ] [__gnu_debug] [line 55]
-!24 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/debug/debug.h", metadata !"/llvm_cmake_gcc"}
-!25 = metadata !{metadata !"0x39\00__debug\0049", metadata !24, metadata !10} ; [ DW_TAG_namespace ] [__debug] [line 49]
-!26 = metadata !{metadata !"0x8\0066\00", metadata !10, metadata !27} ; [ DW_TAG_imported_declaration ]
-!27 = metadata !{metadata !"0x16\00mbstate_t\00106\000\000\000\000", metadata !5, null, metadata !28} ; [ DW_TAG_typedef ] [mbstate_t] [line 106, size 0, align 0, offset 0] [from __mbstate_t]
-!28 = metadata !{metadata !"0x16\00__mbstate_t\0095\000\000\000\000", metadata !5, null, metadata !"_ZTS11__mbstate_t"} ; [ DW_TAG_typedef ] [__mbstate_t] [line 95, size 0, align 0, offset 0] [from _ZTS11__mbstate_t]
-!29 = metadata !{metadata !"0x8\00141\00", metadata !10, metadata !30} ; [ DW_TAG_imported_declaration ]
-!30 = metadata !{metadata !"0x16\00wint_t\00141\000\000\000\000", metadata !31, null, metadata !32} ; [ DW_TAG_typedef ] [wint_t] [line 141, size 0, align 0, offset 0] [from unsigned int]
-!31 = metadata !{metadata !"/llvm_cmake_gcc/bin/../lib/clang/3.5.0/include/stddef.h", metadata !"/llvm_cmake_gcc"}
-!32 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
-!33 = metadata !{metadata !"0x8\0042\00", metadata !34, metadata !36} ; [ DW_TAG_imported_declaration ]
-!34 = metadata !{metadata !"0x39\00__gnu_cxx\0069", metadata !35, null} ; [ DW_TAG_namespace ] [__gnu_cxx] [line 69]
-!35 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/cpp_type_traits.h", metadata !"/llvm_cmake_gcc"}
-!36 = metadata !{metadata !"0x16\00size_t\00155\000\000\000\000", metadata !11, metadata !10, metadata !37} ; [ DW_TAG_typedef ] [size_t] [line 155, size 0, align 0, offset 0] [from long unsigned int]
-!37 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!38 = metadata !{metadata !"0x8\0043\00", metadata !34, metadata !39} ; [ DW_TAG_imported_declaration ]
-!39 = metadata !{metadata !"0x16\00ptrdiff_t\00156\000\000\000\000", metadata !11, metadata !10, metadata !40} ; [ DW_TAG_typedef ] [ptrdiff_t] [line 156, size 0, align 0, offset 0] [from long int]
-!40 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
-!41 = metadata !{metadata !"0x8\0055\00", metadata !10, metadata !"_ZTS5lconv"} ; [ DW_TAG_imported_declaration ]
-!42 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!43 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!44 = metadata !{metadata !"clang version 3.5.0 (209308)"}
-!45 = metadata !{metadata !"0x101\00offset\0016777219\000", metadata !13, metadata !14, metadata !17} ; [ DW_TAG_arg_variable ] [offset] [line 3]
-!46 = metadata !{i32 3, i32 0, metadata !13, null}
-!47 = metadata !{metadata !"0x101\00limit\0033554435\000", metadata !13, metadata !14, metadata !17} ; [ DW_TAG_arg_variable ] [limit] [line 3]
-!48 = metadata !{metadata !"0x101\00range\0050331652\008192", metadata !13, metadata !14, metadata !19} ; [ DW_TAG_arg_variable ] [range] [line 4]
-!49 = metadata !{i32 4, i32 0, metadata !13, null}
-!50 = metadata !{i32 5, i32 0, metadata !51, null}
-!51 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !13} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/pr19307.cc]
-!52 = metadata !{i32 5, i32 0, metadata !53, null}
-!53 = metadata !{metadata !"0xb\005\000\001", metadata !1, metadata !51} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/pr19307.cc]
-!54 = metadata !{i32 6, i32 0, metadata !51, null}
-!55 = metadata !{i32 7, i32 0, metadata !13, null}
-!56 = metadata !{i32 8, i32 0, metadata !13, null}
-!57 = metadata !{i32 9, i32 0, metadata !13, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 (209308)\000\00\000\00\001", !1, !2, !3, !12, !2, !21} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/pr19307.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"pr19307.cc", !"/llvm_cmake_gcc"}
+!2 = !{}
+!3 = !{!4, !6, !8}
+!4 = !{!"0x13\00\0083\000\000\000\004\000", !5, null, null, null, null, null, !"_ZTS11__mbstate_t"} ; [ DW_TAG_structure_type ] [line 83, size 0, align 0, offset 0] [decl] [from ]
+!5 = !{!"/usr/include/wchar.h", !"/llvm_cmake_gcc"}
+!6 = !{!"0x13\00lconv\0054\000\000\000\004\000", !7, null, null, null, null, null, !"_ZTS5lconv"} ; [ DW_TAG_structure_type ] [lconv] [line 54, size 0, align 0, offset 0] [decl] [from ]
+!7 = !{!"/usr/include/locale.h", !"/llvm_cmake_gcc"}
+!8 = !{!"0x2\00basic_string<char, std::char_traits<char>, std::allocator<char> >\001134\000\000\000\004\000", !9, !10, null, null, null, null, !"_ZTSSs"} ; [ DW_TAG_class_type ] [basic_string<char, std::char_traits<char>, std::allocator<char> >] [line 1134, size 0, align 0, offset 0] [decl] [from ]
+!9 = !{!"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/basic_string.tcc", !"/llvm_cmake_gcc"}
+!10 = !{!"0x39\00std\00153", !11, null} ; [ DW_TAG_namespace ] [std] [line 153]
+!11 = !{!"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/x86_64-linux-gnu/bits/c++config.h", !"/llvm_cmake_gcc"}
+!12 = !{!13}
+!13 = !{!"0x2e\00parse_range\00parse_range\00_Z11parse_rangeRyS_Ss\003\000\001\000\006\00256\000\004", !1, !14, !15, null, void (i64*, i64*, %"class.std::basic_string"*)* @_Z11parse_rangeRyS_Ss, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [parse_range]
+!14 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/pr19307.cc]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !17, !17, !19}
+!17 = !{!"0x10\00\000\000\000\000\000", null, null, !18} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!18 = !{!"0x24\00long long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!19 = !{!"0x16\00string\0065\000\000\000\000", !20, !10, !"_ZTSSs"} ; [ DW_TAG_typedef ] [string] [line 65, size 0, align 0, offset 0] [from _ZTSSs]
+!20 = !{!"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/stringfwd.h", !"/llvm_cmake_gcc"}
+!21 = !{!22, !26, !29, !33, !38, !41}
+!22 = !{!"0x3a\0057\00", !23, !25} ; [ DW_TAG_imported_module ]
+!23 = !{!"0x39\00__gnu_debug\0055", !24, null} ; [ DW_TAG_namespace ] [__gnu_debug] [line 55]
+!24 = !{!"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/debug/debug.h", !"/llvm_cmake_gcc"}
+!25 = !{!"0x39\00__debug\0049", !24, !10} ; [ DW_TAG_namespace ] [__debug] [line 49]
+!26 = !{!"0x8\0066\00", !10, !27} ; [ DW_TAG_imported_declaration ]
+!27 = !{!"0x16\00mbstate_t\00106\000\000\000\000", !5, null, !28} ; [ DW_TAG_typedef ] [mbstate_t] [line 106, size 0, align 0, offset 0] [from __mbstate_t]
+!28 = !{!"0x16\00__mbstate_t\0095\000\000\000\000", !5, null, !"_ZTS11__mbstate_t"} ; [ DW_TAG_typedef ] [__mbstate_t] [line 95, size 0, align 0, offset 0] [from _ZTS11__mbstate_t]
+!29 = !{!"0x8\00141\00", !10, !30} ; [ DW_TAG_imported_declaration ]
+!30 = !{!"0x16\00wint_t\00141\000\000\000\000", !31, null, !32} ; [ DW_TAG_typedef ] [wint_t] [line 141, size 0, align 0, offset 0] [from unsigned int]
+!31 = !{!"/llvm_cmake_gcc/bin/../lib/clang/3.5.0/include/stddef.h", !"/llvm_cmake_gcc"}
+!32 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!33 = !{!"0x8\0042\00", !34, !36} ; [ DW_TAG_imported_declaration ]
+!34 = !{!"0x39\00__gnu_cxx\0069", !35, null} ; [ DW_TAG_namespace ] [__gnu_cxx] [line 69]
+!35 = !{!"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/cpp_type_traits.h", !"/llvm_cmake_gcc"}
+!36 = !{!"0x16\00size_t\00155\000\000\000\000", !11, !10, !37} ; [ DW_TAG_typedef ] [size_t] [line 155, size 0, align 0, offset 0] [from long unsigned int]
+!37 = !{!"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!38 = !{!"0x8\0043\00", !34, !39} ; [ DW_TAG_imported_declaration ]
+!39 = !{!"0x16\00ptrdiff_t\00156\000\000\000\000", !11, !10, !40} ; [ DW_TAG_typedef ] [ptrdiff_t] [line 156, size 0, align 0, offset 0] [from long int]
+!40 = !{!"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!41 = !{!"0x8\0055\00", !10, !"_ZTS5lconv"} ; [ DW_TAG_imported_declaration ]
+!42 = !{i32 2, !"Dwarf Version", i32 4}
+!43 = !{i32 2, !"Debug Info Version", i32 2}
+!44 = !{!"clang version 3.5.0 (209308)"}
+!45 = !{!"0x101\00offset\0016777219\000", !13, !14, !17} ; [ DW_TAG_arg_variable ] [offset] [line 3]
+!46 = !MDLocation(line: 3, scope: !13)
+!47 = !{!"0x101\00limit\0033554435\000", !13, !14, !17} ; [ DW_TAG_arg_variable ] [limit] [line 3]
+!48 = !{!"0x101\00range\0050331652\000", !13, !14, !19} ; [ DW_TAG_arg_variable ] [range] [line 4]
+!49 = !MDLocation(line: 4, scope: !13)
+!50 = !MDLocation(line: 5, scope: !51)
+!51 = !{!"0xb\005\000\000", !1, !13} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/pr19307.cc]
+!52 = !MDLocation(line: 5, scope: !53)
+!53 = !{!"0xb\005\000\001", !1, !51} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/pr19307.cc]
+!54 = !MDLocation(line: 6, scope: !51)
+!55 = !MDLocation(line: 7, scope: !13)
+!56 = !MDLocation(line: 8, scope: !13)
+!57 = !MDLocation(line: 9, scope: !13)
 
diff --git a/test/DebugInfo/X86/processes-relocations.ll b/test/DebugInfo/X86/processes-relocations.ll
index 2a29be4..b60ffdf 100644
--- a/test/DebugInfo/X86/processes-relocations.ll
+++ b/test/DebugInfo/X86/processes-relocations.ll
@@ -13,9 +13,9 @@
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"empty.c", metadata !"/a"}
-!2 = metadata !{}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!5 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{i32 786449, !1, i32 12, !"clang version 3.6.0 ", i1 false, !"", i32 0, !2, !2, !2, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = !{!"empty.c", !"/a"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 2}
+!5 = !{!"clang version 3.6.0 "}
diff --git a/test/DebugInfo/X86/prologue-stack.ll b/test/DebugInfo/X86/prologue-stack.ll
index b6dbd41..1cc872a 100644
--- a/test/DebugInfo/X86/prologue-stack.ll
+++ b/test/DebugInfo/X86/prologue-stack.ll
@@ -21,16 +21,16 @@ declare i32 @callme(i32)
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!14}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.2 (trunk 164980) (llvm/trunk 164979)\000\00\000\00\000", metadata !13, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.c] [DW_LANG_C99]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00isel_line_test2\00isel_line_test2\00\003\000\001\000\006\000\000\004", metadata !13, metadata !6, metadata !7, null, i32 ()* @isel_line_test2, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [isel_line_test2]
-!6 = metadata !{metadata !"0x29", metadata !13} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 5, i32 3, metadata !11, null}
-!11 = metadata !{metadata !"0xb\004\001\000", metadata !13, metadata !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/bar.c]
-!12 = metadata !{i32 6, i32 3, metadata !11, null}
-!13 = metadata !{metadata !"bar.c", metadata !"/usr/local/google/home/echristo/tmp"}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.2 (trunk 164980) (llvm/trunk 164979)\000\00\000\00\000", !13, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.c] [DW_LANG_C99]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00isel_line_test2\00isel_line_test2\00\003\000\001\000\006\000\000\004", !13, !6, !7, null, i32 ()* @isel_line_test2, null, null, !1} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [isel_line_test2]
+!6 = !{!"0x29", !13} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !MDLocation(line: 5, column: 3, scope: !11)
+!11 = !{!"0xb\004\001\000", !13, !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/bar.c]
+!12 = !MDLocation(line: 6, column: 3, scope: !11)
+!13 = !{!"bar.c", !"/usr/local/google/home/echristo/tmp"}
+!14 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/recursive_inlining.ll b/test/DebugInfo/X86/recursive_inlining.ll
index 251f04e..6cd935c 100644
--- a/test/DebugInfo/X86/recursive_inlining.ll
+++ b/test/DebugInfo/X86/recursive_inlining.ll
@@ -95,7 +95,7 @@ define void @_Z3fn6v() #0 {
 entry:
   tail call void @_Z3fn8v() #3, !dbg !31
   %0 = load %struct.C** @x, align 8, !dbg !32, !tbaa !33
-  tail call void @llvm.dbg.value(metadata !{%struct.C* %0}, i64 0, metadata !37, metadata !{metadata !"0x102"}) #3, !dbg !38
+  tail call void @llvm.dbg.value(metadata %struct.C* %0, i64 0, metadata !37, metadata !{!"0x102"}) #3, !dbg !38
   tail call void @_Z3fn8v() #3, !dbg !39
   %b.i = getelementptr inbounds %struct.C* %0, i64 0, i32 0, !dbg !40
   %1 = load i32* %b.i, align 4, !dbg !40, !tbaa !42
@@ -116,7 +116,7 @@ declare void @_Z3fn8v() #1
 ; Function Attrs: nounwind
 define linkonce_odr void @_ZN1C5m_fn2Ev(%struct.C* nocapture readonly %this) #0 align 2 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.C* %this}, i64 0, metadata !24, metadata !{metadata !"0x102"}), !dbg !49
+  tail call void @llvm.dbg.value(metadata %struct.C* %this, i64 0, metadata !24, metadata !{!"0x102"}), !dbg !49
   tail call void @_Z3fn8v() #3, !dbg !50
   %b = getelementptr inbounds %struct.C* %this, i64 0, i32 0, !dbg !51
   %0 = load i32* %b, align 4, !dbg !51, !tbaa !42
@@ -130,7 +130,7 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %entry, %if.then
   tail call void @_Z3fn8v() #3, !dbg !53
   %1 = load %struct.C** @x, align 8, !dbg !56, !tbaa !33
-  tail call void @llvm.dbg.value(metadata !{%struct.C* %1}, i64 0, metadata !57, metadata !{metadata !"0x102"}) #3, !dbg !58
+  tail call void @llvm.dbg.value(metadata %struct.C* %1, i64 0, metadata !57, metadata !{!"0x102"}) #3, !dbg !58
   tail call void @_Z3fn8v() #3, !dbg !59
   %b.i.i = getelementptr inbounds %struct.C* %1, i64 0, i32 0, !dbg !60
   %2 = load i32* %b.i.i, align 4, !dbg !60, !tbaa !42
@@ -154,7 +154,7 @@ entry:
 tailrecurse:                                      ; preds = %tailrecurse.backedge, %entry
   tail call void @_Z3fn8v() #3, !dbg !64
   %0 = load %struct.C** @x, align 8, !dbg !66, !tbaa !33
-  tail call void @llvm.dbg.value(metadata !{%struct.C* %0}, i64 0, metadata !67, metadata !{metadata !"0x102"}) #3, !dbg !68
+  tail call void @llvm.dbg.value(metadata %struct.C* %0, i64 0, metadata !67, metadata !{!"0x102"}) #3, !dbg !68
   tail call void @_Z3fn8v() #3, !dbg !69
   %b.i.i = getelementptr inbounds %struct.C* %0, i64 0, i32 0, !dbg !70
   %1 = load i32* %b.i.i, align 4, !dbg !70, !tbaa !42
@@ -199,77 +199,77 @@ attributes #3 = { nounwind }
 !llvm.module.flags = !{!28, !29}
 !llvm.ident = !{!30}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !13, metadata !26, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/<stdin>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<stdin>", metadata !"/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00C\005\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 5, size 32, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !"recursive_inlining.cpp", metadata !"/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce"}
-!6 = metadata !{metadata !7, metadata !9}
-!7 = metadata !{metadata !"0xd\00b\006\0032\0032\000\000", metadata !5, metadata !"_ZTS1C", metadata !8} ; [ DW_TAG_member ] [b] [line 6, size 32, align 32, offset 0] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x2e\00m_fn2\00m_fn2\00_ZN1C5m_fn2Ev\007\000\000\000\006\00256\001\007", metadata !5, metadata !"_ZTS1C", metadata !10, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [m_fn2]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{null, metadata !12}
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
-!13 = metadata !{metadata !14, metadata !18, metadata !19, metadata !20, metadata !21, metadata !22}
-!14 = metadata !{metadata !"0x2e\00fn6\00fn6\00_Z3fn6v\0015\000\001\000\006\00256\001\0015", metadata !5, metadata !15, metadata !16, null, void ()* @_Z3fn6v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 15] [def] [fn6]
-!15 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/recursive_inlining.cpp]
-!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{null}
-!18 = metadata !{metadata !"0x2e\00fn3\00fn3\00_Z3fn3v\0020\000\001\000\006\00256\001\0020", metadata !5, metadata !15, metadata !16, null, void ()* @_Z3fn3v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 20] [def] [fn3]
-!19 = metadata !{metadata !"0x2e\00fn4\00fn4\00_Z3fn4v\0021\000\001\000\006\00256\001\0021", metadata !5, metadata !15, metadata !16, null, void ()* @_Z3fn4v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 21] [def] [fn4]
-!20 = metadata !{metadata !"0x2e\00fn5\00fn5\00_Z3fn5v\0022\000\001\000\006\00256\001\0022", metadata !5, metadata !15, metadata !16, null, void ()* @_Z3fn5v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 22] [def] [fn5]
-!21 = metadata !{metadata !"0x2e\00fn7\00fn7\00_Z3fn7v\0014\000\001\000\006\00256\001\0014", metadata !5, metadata !15, metadata !16, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 14] [def] [fn7]
-!22 = metadata !{metadata !"0x2e\00m_fn2\00m_fn2\00_ZN1C5m_fn2Ev\007\000\001\000\006\00256\001\007", metadata !5, metadata !"_ZTS1C", metadata !10, null, void (%struct.C*)* @_ZN1C5m_fn2Ev, null, metadata !9, metadata !23} ; [ DW_TAG_subprogram ] [line 7] [def] [m_fn2]
-!23 = metadata !{metadata !24}
-!24 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !22, null, metadata !25} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
-!26 = metadata !{metadata !27}
-!27 = metadata !{metadata !"0x34\00x\00x\00\0013\000\001", null, metadata !15, metadata !25, %struct.C** @x, null} ; [ DW_TAG_variable ] [x] [line 13] [def]
-!28 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!29 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!30 = metadata !{metadata !"clang version 3.6.0 "}
-!31 = metadata !{i32 16, i32 0, metadata !14, null}
-!32 = metadata !{i32 17, i32 0, metadata !14, null}
-!33 = metadata !{metadata !34, metadata !34, i64 0}
-!34 = metadata !{metadata !"any pointer", metadata !35, i64 0}
-!35 = metadata !{metadata !"omnipotent char", metadata !36, i64 0}
-!36 = metadata !{metadata !"Simple C/C++ TBAA"}
-!37 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !22, null, metadata !25, metadata !32} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!38 = metadata !{i32 0, i32 0, metadata !22, metadata !32}
-!39 = metadata !{i32 8, i32 0, metadata !22, metadata !32}
-!40 = metadata !{i32 9, i32 0, metadata !41, metadata !32}
-!41 = metadata !{metadata !"0xb\009\000\000", metadata !5, metadata !22} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/recursive_inlining.cpp]
-!42 = metadata !{metadata !43, metadata !44, i64 0}
-!43 = metadata !{metadata !"_ZTS1C", metadata !44, i64 0}
-!44 = metadata !{metadata !"int", metadata !35, i64 0}
-!45 = metadata !{i32 9, i32 0, metadata !46, metadata !32}
-!46 = metadata !{metadata !"0xb\009\000\001", metadata !5, metadata !41} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/recursive_inlining.cpp]
-!47 = metadata !{i32 10, i32 0, metadata !22, metadata !32}
-!48 = metadata !{i32 19, i32 0, metadata !14, null}
-!49 = metadata !{i32 0, i32 0, metadata !22, null}
-!50 = metadata !{i32 8, i32 0, metadata !22, null}
-!51 = metadata !{i32 9, i32 0, metadata !41, null}
-!52 = metadata !{i32 9, i32 0, metadata !46, null}
-!53 = metadata !{i32 16, i32 0, metadata !14, metadata !54}
-!54 = metadata !{i32 20, i32 0, metadata !18, metadata !55}
-!55 = metadata !{i32 10, i32 0, metadata !22, null}
-!56 = metadata !{i32 17, i32 0, metadata !14, metadata !54}
-!57 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !22, null, metadata !25, metadata !56} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!58 = metadata !{i32 0, i32 0, metadata !22, metadata !56}
-!59 = metadata !{i32 8, i32 0, metadata !22, metadata !56}
-!60 = metadata !{i32 9, i32 0, metadata !41, metadata !56}
-!61 = metadata !{i32 9, i32 0, metadata !46, metadata !56}
-!62 = metadata !{i32 10, i32 0, metadata !22, metadata !56}
-!63 = metadata !{i32 11, i32 0, metadata !22, null}
-!64 = metadata !{i32 16, i32 0, metadata !14, metadata !65}
-!65 = metadata !{i32 20, i32 0, metadata !18, null}
-!66 = metadata !{i32 17, i32 0, metadata !14, metadata !65}
-!67 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !22, null, metadata !25, metadata !66} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!68 = metadata !{i32 0, i32 0, metadata !22, metadata !66}
-!69 = metadata !{i32 8, i32 0, metadata !22, metadata !66}
-!70 = metadata !{i32 9, i32 0, metadata !41, metadata !66}
-!71 = metadata !{i32 9, i32 0, metadata !46, metadata !66}
-!72 = metadata !{i32 21, i32 0, metadata !19, null}
-!73 = metadata !{i32 22, i32 0, metadata !20, null}
+!0 = !{!"0x11\004\00clang version 3.6.0 \001\00\000\00\001", !1, !2, !3, !13, !26, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !"/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00C\005\0032\0032\000\000\000", !5, null, null, !6, null, null, !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 5, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!"recursive_inlining.cpp", !"/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce"}
+!6 = !{!7, !9}
+!7 = !{!"0xd\00b\006\0032\0032\000\000", !5, !"_ZTS1C", !8} ; [ DW_TAG_member ] [b] [line 6, size 32, align 32, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x2e\00m_fn2\00m_fn2\00_ZN1C5m_fn2Ev\007\000\000\000\006\00256\001\007", !5, !"_ZTS1C", !10, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [m_fn2]
+!10 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{null, !12}
+!12 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!13 = !{!14, !18, !19, !20, !21, !22}
+!14 = !{!"0x2e\00fn6\00fn6\00_Z3fn6v\0015\000\001\000\006\00256\001\0015", !5, !15, !16, null, void ()* @_Z3fn6v, null, null, !2} ; [ DW_TAG_subprogram ] [line 15] [def] [fn6]
+!15 = !{!"0x29", !5}         ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/recursive_inlining.cpp]
+!16 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = !{null}
+!18 = !{!"0x2e\00fn3\00fn3\00_Z3fn3v\0020\000\001\000\006\00256\001\0020", !5, !15, !16, null, void ()* @_Z3fn3v, null, null, !2} ; [ DW_TAG_subprogram ] [line 20] [def] [fn3]
+!19 = !{!"0x2e\00fn4\00fn4\00_Z3fn4v\0021\000\001\000\006\00256\001\0021", !5, !15, !16, null, void ()* @_Z3fn4v, null, null, !2} ; [ DW_TAG_subprogram ] [line 21] [def] [fn4]
+!20 = !{!"0x2e\00fn5\00fn5\00_Z3fn5v\0022\000\001\000\006\00256\001\0022", !5, !15, !16, null, void ()* @_Z3fn5v, null, null, !2} ; [ DW_TAG_subprogram ] [line 22] [def] [fn5]
+!21 = !{!"0x2e\00fn7\00fn7\00_Z3fn7v\0014\000\001\000\006\00256\001\0014", !5, !15, !16, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 14] [def] [fn7]
+!22 = !{!"0x2e\00m_fn2\00m_fn2\00_ZN1C5m_fn2Ev\007\000\001\000\006\00256\001\007", !5, !"_ZTS1C", !10, null, void (%struct.C*)* @_ZN1C5m_fn2Ev, null, !9, !23} ; [ DW_TAG_subprogram ] [line 7] [def] [m_fn2]
+!23 = !{!24}
+!24 = !{!"0x101\00this\0016777216\001088", !22, null, !25} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!25 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!26 = !{!27}
+!27 = !{!"0x34\00x\00x\00\0013\000\001", null, !15, !25, %struct.C** @x, null} ; [ DW_TAG_variable ] [x] [line 13] [def]
+!28 = !{i32 2, !"Dwarf Version", i32 4}
+!29 = !{i32 2, !"Debug Info Version", i32 2}
+!30 = !{!"clang version 3.6.0 "}
+!31 = !MDLocation(line: 16, scope: !14)
+!32 = !MDLocation(line: 17, scope: !14)
+!33 = !{!34, !34, i64 0}
+!34 = !{!"any pointer", !35, i64 0}
+!35 = !{!"omnipotent char", !36, i64 0}
+!36 = !{!"Simple C/C++ TBAA"}
+!37 = !{!"0x101\00this\0016777216\001088", !22, null, !25, !32} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!38 = !MDLocation(line: 0, scope: !22, inlinedAt: !32)
+!39 = !MDLocation(line: 8, scope: !22, inlinedAt: !32)
+!40 = !MDLocation(line: 9, scope: !41, inlinedAt: !32)
+!41 = !{!"0xb\009\000\000", !5, !22} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/recursive_inlining.cpp]
+!42 = !{!43, !44, i64 0}
+!43 = !{!"_ZTS1C", !44, i64 0}
+!44 = !{!"int", !35, i64 0}
+!45 = !MDLocation(line: 9, scope: !46, inlinedAt: !32)
+!46 = !{!"0xb\009\000\001", !5, !41} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/recursive_inlining.cpp]
+!47 = !MDLocation(line: 10, scope: !22, inlinedAt: !32)
+!48 = !MDLocation(line: 19, scope: !14)
+!49 = !MDLocation(line: 0, scope: !22)
+!50 = !MDLocation(line: 8, scope: !22)
+!51 = !MDLocation(line: 9, scope: !41)
+!52 = !MDLocation(line: 9, scope: !46)
+!53 = !MDLocation(line: 16, scope: !14, inlinedAt: !54)
+!54 = !MDLocation(line: 20, scope: !18, inlinedAt: !55)
+!55 = !MDLocation(line: 10, scope: !22)
+!56 = !MDLocation(line: 17, scope: !14, inlinedAt: !54)
+!57 = !{!"0x101\00this\0016777216\001088", !22, null, !25, !56} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!58 = !MDLocation(line: 0, scope: !22, inlinedAt: !56)
+!59 = !MDLocation(line: 8, scope: !22, inlinedAt: !56)
+!60 = !MDLocation(line: 9, scope: !41, inlinedAt: !56)
+!61 = !MDLocation(line: 9, scope: !46, inlinedAt: !56)
+!62 = !MDLocation(line: 10, scope: !22, inlinedAt: !56)
+!63 = !MDLocation(line: 11, scope: !22)
+!64 = !MDLocation(line: 16, scope: !14, inlinedAt: !65)
+!65 = !MDLocation(line: 20, scope: !18)
+!66 = !MDLocation(line: 17, scope: !14, inlinedAt: !65)
+!67 = !{!"0x101\00this\0016777216\001088", !22, null, !25, !66} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!68 = !MDLocation(line: 0, scope: !22, inlinedAt: !66)
+!69 = !MDLocation(line: 8, scope: !22, inlinedAt: !66)
+!70 = !MDLocation(line: 9, scope: !41, inlinedAt: !66)
+!71 = !MDLocation(line: 9, scope: !46, inlinedAt: !66)
+!72 = !MDLocation(line: 21, scope: !19)
+!73 = !MDLocation(line: 22, scope: !20)
diff --git a/test/DebugInfo/X86/ref_addr_relocation.ll b/test/DebugInfo/X86/ref_addr_relocation.ll
index 4d77322..7e39b8e 100644
--- a/test/DebugInfo/X86/ref_addr_relocation.ll
+++ b/test/DebugInfo/X86/ref_addr_relocation.ll
@@ -53,19 +53,19 @@
 !llvm.dbg.cu = !{!0, !9}
 !llvm.module.flags = !{!14, !15}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 191799)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !2, metadata !6, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu1.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"tu1.cpp", metadata !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !5, null, null, metadata !2, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !"./hdr.h", metadata !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0x34\00f\00f\00\002\000\001", null, metadata !8, metadata !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 2] [def]
-!8 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu1.cpp]
-!9 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 191799)\000\00\000\00\000", metadata !10, metadata !2, metadata !3, metadata !2, metadata !11, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu2.cpp] [DW_LANG_C_plus_plus]
-!10 = metadata !{metadata !"tu2.cpp", metadata !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0x34\00g\00g\00\002\000\001", null, metadata !13, metadata !4, %struct.foo* @g, null} ; [ DW_TAG_variable ] [g] [line 2] [def]
-!13 = metadata !{metadata !"0x29", metadata !10}        ; [ DW_TAG_file_type ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu2.cpp]
-!14 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 (trunk 191799)\000\00\000\00\000", !1, !2, !3, !2, !6, !2} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu1.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"tu1.cpp", !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00foo\001\008\008\000\000\000", !5, null, null, !2, null, null, !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!"./hdr.h", !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
+!6 = !{!7}
+!7 = !{!"0x34\00f\00f\00\002\000\001", null, !8, !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 2] [def]
+!8 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu1.cpp]
+!9 = !{!"0x11\004\00clang version 3.4 (trunk 191799)\000\00\000\00\000", !10, !2, !3, !2, !11, !2} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu2.cpp] [DW_LANG_C_plus_plus]
+!10 = !{!"tu2.cpp", !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
+!11 = !{!12}
+!12 = !{!"0x34\00g\00g\00\002\000\001", null, !13, !4, %struct.foo* @g, null} ; [ DW_TAG_variable ] [g] [line 2] [def]
+!13 = !{!"0x29", !10}        ; [ DW_TAG_file_type ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu2.cpp]
+!14 = !{i32 2, !"Dwarf Version", i32 2}
+!15 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/reference-argument.ll b/test/DebugInfo/X86/reference-argument.ll
index fe268e2..57ff994 100644
--- a/test/DebugInfo/X86/reference-argument.ll
+++ b/test/DebugInfo/X86/reference-argument.ll
@@ -20,8 +20,8 @@ define linkonce_odr void @_ZN1A3fooE4SVal(%class.A* %this, %class.SVal* %v) noun
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !59, metadata !{metadata !"0x102"}), !dbg !61
-  call void @llvm.dbg.declare(metadata !{%class.SVal* %v}, metadata !62, metadata !{metadata !"0x102"}), !dbg !61
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !59, metadata !{!"0x102"}), !dbg !61
+  call void @llvm.dbg.declare(metadata %class.SVal* %v, metadata !62, metadata !{!"0x102\006"}), !dbg !61
   %this1 = load %class.A** %this.addr
   call void @_Z3barR4SVal(%class.SVal* %v), !dbg !61
   ret void, !dbg !61
@@ -32,72 +32,72 @@ declare void @_ZN4SValD2Ev(%class.SVal* %this)
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!47, !68}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [aggregate-indirect-arg.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"aggregate-indirect-arg.cpp", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !29, metadata !33, metadata !34, metadata !35}
-!4 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barR4SVal\0019\000\001\000\006\00256\000\0019", metadata !1, metadata !5, metadata !6, null, void (%class.SVal*)* @_Z3barR4SVal, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 19] [def] [bar]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [aggregate-indirect-arg.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8}
-!8 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from SVal]
-!9 = metadata !{metadata !"0x2\00SVal\0012\00128\0064\000\000\000", metadata !1, null, null, metadata !10, null, null, null} ; [ DW_TAG_class_type ] [SVal] [line 12, size 128, align 64, offset 0] [def] [from ]
-!10 = metadata !{metadata !11, metadata !14, metadata !16, metadata !21, metadata !23}
-!11 = metadata !{metadata !"0xd\00Data\0015\0064\0064\000\000", metadata !1, metadata !9, metadata !12} ; [ DW_TAG_member ] [Data] [line 15, size 64, align 64, offset 0] [from ]
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!13 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, null} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{metadata !"0xd\00Kind\0016\0032\0032\0064\000", metadata !1, metadata !9, metadata !15} ; [ DW_TAG_member ] [Kind] [line 16, size 32, align 32, offset 64] [from unsigned int]
-!15 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
-!16 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00\0014\000\000\000\006\00256\000\0014", metadata !1, metadata !9, metadata !17, null, null, null, i32 0, metadata !20} ; [ DW_TAG_subprogram ] [line 14] [~SVal]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{null, metadata !19}
-!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from SVal]
-!20 = metadata !{i32 786468}
-!21 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0012\000\000\000\006\00320\000\0012", metadata !1, metadata !9, metadata !17, null, null, null, i32 0, metadata !22} ; [ DW_TAG_subprogram ] [line 12] [SVal]
-!22 = metadata !{i32 786468}
-!23 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0012\000\000\000\006\00320\000\0012", metadata !1, metadata !9, metadata !24, null, null, null, i32 0, metadata !28} ; [ DW_TAG_subprogram ] [line 12] [SVal]
-!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!25 = metadata !{null, metadata !19, metadata !26}
-!26 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !27} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!27 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from SVal]
-!28 = metadata !{i32 786468}
-!29 = metadata !{metadata !"0x2e\00main\00main\00\0025\000\001\000\006\00256\000\0025", metadata !1, metadata !5, metadata !30, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 25] [def] [main]
-!30 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !31, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!31 = metadata !{metadata !32}
-!32 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!33 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00_ZN4SValD1Ev\0014\000\001\000\006\00256\000\0014", metadata !1, null, metadata !17, null, void (%class.SVal*)* @_ZN4SValD1Ev, null, metadata !16, metadata !2} ; [ DW_TAG_subprogram ] [line 14] [def] [~SVal]
-!34 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00_ZN4SValD2Ev\0014\000\001\000\006\00256\000\0014", metadata !1, null, metadata !17, null, void (%class.SVal*)* @_ZN4SValD2Ev, null, metadata !16, metadata !2} ; [ DW_TAG_subprogram ] [line 14] [def] [~SVal]
-!35 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN1A3fooE4SVal\0022\000\001\000\006\00256\000\0022", metadata !1, null, metadata !36, null, void (%class.A*, %class.SVal*)* @_ZN1A3fooE4SVal, null, metadata !41, metadata !2} ; [ DW_TAG_subprogram ] [line 22] [def] [foo]
-!36 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !37, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!37 = metadata !{null, metadata !38, metadata !9}
-!38 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
-!39 = metadata !{metadata !"0x2\00A\0020\008\008\000\000\000", metadata !1, null, null, metadata !40, null, null, null} ; [ DW_TAG_class_type ] [A] [line 20, size 8, align 8, offset 0] [def] [from ]
-!40 = metadata !{metadata !41, metadata !43}
-!41 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN1A3fooE4SVal\0022\000\000\000\006\00256\000\0022", metadata !1, metadata !39, metadata !36, null, null, null, i32 0, metadata !42} ; [ DW_TAG_subprogram ] [line 22] [foo]
-!42 = metadata !{i32 786468}
-!43 = metadata !{metadata !"0x2e\00A\00A\00\0020\000\000\000\006\00320\000\0020", metadata !1, metadata !39, metadata !44, null, null, null, i32 0, metadata !46} ; [ DW_TAG_subprogram ] [line 20] [A]
-!44 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !45, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!45 = metadata !{null, metadata !38}
-!46 = metadata !{i32 786468}
-!47 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!48 = metadata !{metadata !"0x101\00v\0016777235\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [v] [line 19]
-!49 = metadata !{i32 19, i32 0, metadata !4, null}
-!50 = metadata !{metadata !"0x100\00v\0026\000", metadata !29, metadata !5, metadata !9} ; [ DW_TAG_auto_variable ] [v] [line 26]
-!51 = metadata !{i32 26, i32 0, metadata !29, null}
-!52 = metadata !{i32 27, i32 0, metadata !29, null}
-!53 = metadata !{i32 28, i32 0, metadata !29, null}
-!54 = metadata !{metadata !"0x100\00a\0029\000", metadata !29, metadata !5, metadata !39} ; [ DW_TAG_auto_variable ] [a] [line 29]
-!55 = metadata !{i32 29, i32 0, metadata !29, null}
-!56 = metadata !{i32 30, i32 0, metadata !29, null}
-!57 = metadata !{i32 31, i32 0, metadata !29, null}
-!58 = metadata !{i32 32, i32 0, metadata !29, null}
-!59 = metadata !{metadata !"0x101\00this\0016777238\001088", metadata !35, metadata !5, metadata !60} ; [ DW_TAG_arg_variable ] [this] [line 22]
-!60 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!61 = metadata !{i32 22, i32 0, metadata !35, null}
-!62 = metadata !{metadata !"0x101\00v\0033554454\008192", metadata !35, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [v] [line 22]
-!63 = metadata !{metadata !"0x101\00this\0016777230\001088", metadata !33, metadata !5, metadata !64} ; [ DW_TAG_arg_variable ] [this] [line 14]
-!64 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from SVal]
-!65 = metadata !{i32 14, i32 0, metadata !33, null}
-!66 = metadata !{metadata !"0x101\00this\0016777230\001088", metadata !34, metadata !5, metadata !64} ; [ DW_TAG_arg_variable ] [this] [line 14]
-!67 = metadata !{i32 14, i32 0, metadata !34, null}
-!68 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [aggregate-indirect-arg.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"aggregate-indirect-arg.cpp", !""}
+!2 = !{}
+!3 = !{!4, !29, !33, !34, !35}
+!4 = !{!"0x2e\00bar\00bar\00_Z3barR4SVal\0019\000\001\000\006\00256\000\0019", !1, !5, !6, null, void (%class.SVal*)* @_Z3barR4SVal, null, null, !2} ; [ DW_TAG_subprogram ] [line 19] [def] [bar]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [aggregate-indirect-arg.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8}
+!8 = !{!"0x10\00\000\000\000\000\000", null, null, !9} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from SVal]
+!9 = !{!"0x2\00SVal\0012\00128\0064\000\000\000", !1, null, null, !10, null, null, null} ; [ DW_TAG_class_type ] [SVal] [line 12, size 128, align 64, offset 0] [def] [from ]
+!10 = !{!11, !14, !16, !21, !23}
+!11 = !{!"0xd\00Data\0015\0064\0064\000\000", !1, !9, !12} ; [ DW_TAG_member ] [Data] [line 15, size 64, align 64, offset 0] [from ]
+!12 = !{!"0xf\00\000\0064\0064\000\000", null, null, !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!13 = !{!"0x26\00\000\000\000\000\000", null, null, null} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{!"0xd\00Kind\0016\0032\0032\0064\000", !1, !9, !15} ; [ DW_TAG_member ] [Kind] [line 16, size 32, align 32, offset 64] [from unsigned int]
+!15 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!16 = !{!"0x2e\00~SVal\00~SVal\00\0014\000\000\000\006\00256\000\0014", !1, !9, !17, null, null, null, i32 0, !20} ; [ DW_TAG_subprogram ] [line 14] [~SVal]
+!17 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{null, !19}
+!19 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from SVal]
+!20 = !{i32 786468}
+!21 = !{!"0x2e\00SVal\00SVal\00\0012\000\000\000\006\00320\000\0012", !1, !9, !17, null, null, null, i32 0, !22} ; [ DW_TAG_subprogram ] [line 12] [SVal]
+!22 = !{i32 786468}
+!23 = !{!"0x2e\00SVal\00SVal\00\0012\000\000\000\006\00320\000\0012", !1, !9, !24, null, null, null, i32 0, !28} ; [ DW_TAG_subprogram ] [line 12] [SVal]
+!24 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!25 = !{null, !19, !26}
+!26 = !{!"0x10\00\000\000\000\000\000", null, null, !27} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = !{!"0x26\00\000\000\000\000\000", null, null, !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from SVal]
+!28 = !{i32 786468}
+!29 = !{!"0x2e\00main\00main\00\0025\000\001\000\006\00256\000\0025", !1, !5, !30, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 25] [def] [main]
+!30 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !31, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!31 = !{!32}
+!32 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!33 = !{!"0x2e\00~SVal\00~SVal\00_ZN4SValD1Ev\0014\000\001\000\006\00256\000\0014", !1, null, !17, null, void (%class.SVal*)* @_ZN4SValD1Ev, null, !16, !2} ; [ DW_TAG_subprogram ] [line 14] [def] [~SVal]
+!34 = !{!"0x2e\00~SVal\00~SVal\00_ZN4SValD2Ev\0014\000\001\000\006\00256\000\0014", !1, null, !17, null, void (%class.SVal*)* @_ZN4SValD2Ev, null, !16, !2} ; [ DW_TAG_subprogram ] [line 14] [def] [~SVal]
+!35 = !{!"0x2e\00foo\00foo\00_ZN1A3fooE4SVal\0022\000\001\000\006\00256\000\0022", !1, null, !36, null, void (%class.A*, %class.SVal*)* @_ZN1A3fooE4SVal, null, !41, !2} ; [ DW_TAG_subprogram ] [line 22] [def] [foo]
+!36 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !37, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!37 = !{null, !38, !9}
+!38 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!39 = !{!"0x2\00A\0020\008\008\000\000\000", !1, null, null, !40, null, null, null} ; [ DW_TAG_class_type ] [A] [line 20, size 8, align 8, offset 0] [def] [from ]
+!40 = !{!41, !43}
+!41 = !{!"0x2e\00foo\00foo\00_ZN1A3fooE4SVal\0022\000\000\000\006\00256\000\0022", !1, !39, !36, null, null, null, i32 0, !42} ; [ DW_TAG_subprogram ] [line 22] [foo]
+!42 = !{i32 786468}
+!43 = !{!"0x2e\00A\00A\00\0020\000\000\000\006\00320\000\0020", !1, !39, !44, null, null, null, i32 0, !46} ; [ DW_TAG_subprogram ] [line 20] [A]
+!44 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !45, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!45 = !{null, !38}
+!46 = !{i32 786468}
+!47 = !{i32 2, !"Dwarf Version", i32 3}
+!48 = !{!"0x101\00v\0016777235\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [v] [line 19]
+!49 = !MDLocation(line: 19, scope: !4)
+!50 = !{!"0x100\00v\0026\000", !29, !5, !9} ; [ DW_TAG_auto_variable ] [v] [line 26]
+!51 = !MDLocation(line: 26, scope: !29)
+!52 = !MDLocation(line: 27, scope: !29)
+!53 = !MDLocation(line: 28, scope: !29)
+!54 = !{!"0x100\00a\0029\000", !29, !5, !39} ; [ DW_TAG_auto_variable ] [a] [line 29]
+!55 = !MDLocation(line: 29, scope: !29)
+!56 = !MDLocation(line: 30, scope: !29)
+!57 = !MDLocation(line: 31, scope: !29)
+!58 = !MDLocation(line: 32, scope: !29)
+!59 = !{!"0x101\00this\0016777238\001088", !35, !5, !60} ; [ DW_TAG_arg_variable ] [this] [line 22]
+!60 = !{!"0xf\00\000\0064\0064\000\000", null, null, !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!61 = !MDLocation(line: 22, scope: !35)
+!62 = !{!"0x101\00v\0033554454\000", !35, !5, !9} ; [ DW_TAG_arg_variable ] [v] [line 22]
+!63 = !{!"0x101\00this\0016777230\001088", !33, !5, !64} ; [ DW_TAG_arg_variable ] [this] [line 14]
+!64 = !{!"0xf\00\000\0064\0064\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from SVal]
+!65 = !MDLocation(line: 14, scope: !33)
+!66 = !{!"0x101\00this\0016777230\001088", !34, !5, !64} ; [ DW_TAG_arg_variable ] [this] [line 14]
+!67 = !MDLocation(line: 14, scope: !34)
+!68 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/rvalue-ref.ll b/test/DebugInfo/X86/rvalue-ref.ll
index bbee6a2..3829966 100644
--- a/test/DebugInfo/X86/rvalue-ref.ll
+++ b/test/DebugInfo/X86/rvalue-ref.ll
@@ -9,7 +9,7 @@ define void @_Z3fooOi(i32* %i) uwtable ssp {
 entry:
   %i.addr = alloca i32*, align 8
   store i32* %i, i32** %i.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i32** %i.addr}, metadata !11, metadata !{metadata !"0x102"}), !dbg !12
+  call void @llvm.dbg.declare(metadata i32** %i.addr, metadata !11, metadata !{!"0x102"}), !dbg !12
   %0 = load i32** %i.addr, align 8, !dbg !13
   %1 = load i32* %0, align 4, !dbg !13
   %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %1), !dbg !13
@@ -23,19 +23,19 @@ declare i32 @printf(i8*, ...)
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!17}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 157054) (llvm/trunk 157060)\000\00\000\00\000", metadata !16, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooOi\004\000\001\000\006\00256\000\005", metadata !16, metadata !6, metadata !7, null, void (i32*)* @_Z3fooOi, null, null, metadata !1} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9}
-!9 = metadata !{metadata !"0x42\00\000\000\000\000\000", null, null, metadata !10} ; [ DW_TAG_rvalue_reference_type ]
-!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!11 = metadata !{metadata !"0x101\00i\0016777220\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 4, i32 17, metadata !5, null}
-!13 = metadata !{i32 6, i32 3, metadata !14, null}
-!14 = metadata !{metadata !"0xb\005\001\000", metadata !16, metadata !5} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{i32 7, i32 1, metadata !14, null}
-!16 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo/tmp"}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.2 (trunk 157054) (llvm/trunk 157060)\000\00\000\00\000", !16, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00foo\00foo\00_Z3fooOi\004\000\001\000\006\00256\000\005", !16, !6, !7, null, void (i32*)* @_Z3fooOi, null, null, !1} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !16} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9}
+!9 = !{!"0x42\00\000\000\000\000\000", null, null, !10} ; [ DW_TAG_rvalue_reference_type ]
+!10 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!11 = !{!"0x101\00i\0016777220\000", !5, !6, !9} ; [ DW_TAG_arg_variable ]
+!12 = !MDLocation(line: 4, column: 17, scope: !5)
+!13 = !MDLocation(line: 6, column: 3, scope: !14)
+!14 = !{!"0xb\005\001\000", !16, !5} ; [ DW_TAG_lexical_block ]
+!15 = !MDLocation(line: 7, column: 1, scope: !14)
+!16 = !{!"foo.cpp", !"/Users/echristo/tmp"}
+!17 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/sret.ll b/test/DebugInfo/X86/sret.ll
index 7e51183..bddf533 100644
--- a/test/DebugInfo/X86/sret.ll
+++ b/test/DebugInfo/X86/sret.ll
@@ -23,9 +23,9 @@ entry:
   %this.addr = alloca %class.A*, align 8
   %i.addr = alloca i32, align 4
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !67, metadata !{metadata !"0x102"}), !dbg !69
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !67, metadata !{!"0x102"}), !dbg !69
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !70, metadata !{metadata !"0x102"}), !dbg !71
+  call void @llvm.dbg.declare(metadata i32* %i.addr, metadata !70, metadata !{!"0x102"}), !dbg !71
   %this1 = load %class.A** %this.addr
   %0 = bitcast %class.A* %this1 to i8***, !dbg !72
   store i8** getelementptr inbounds ([4 x i8*]* @_ZTV1A, i64 0, i64 2), i8*** %0, !dbg !72
@@ -44,9 +44,9 @@ entry:
   %this.addr = alloca %class.A*, align 8
   %rhs.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !74, metadata !{metadata !"0x102"}), !dbg !75
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !74, metadata !{!"0x102"}), !dbg !75
   store %class.A* %rhs, %class.A** %rhs.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %rhs.addr}, metadata !76, metadata !{metadata !"0x102"}), !dbg !77
+  call void @llvm.dbg.declare(metadata %class.A** %rhs.addr, metadata !76, metadata !{!"0x102"}), !dbg !77
   %this1 = load %class.A** %this.addr
   %0 = bitcast %class.A* %this1 to i8***, !dbg !78
   store i8** getelementptr inbounds ([4 x i8*]* @_ZTV1A, i64 0, i64 2), i8*** %0, !dbg !78
@@ -64,9 +64,9 @@ entry:
   %this.addr = alloca %class.A*, align 8
   %rhs.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !80, metadata !{metadata !"0x102"}), !dbg !81
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !80, metadata !{!"0x102"}), !dbg !81
   store %class.A* %rhs, %class.A** %rhs.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %rhs.addr}, metadata !82, metadata !{metadata !"0x102"}), !dbg !83
+  call void @llvm.dbg.declare(metadata %class.A** %rhs.addr, metadata !82, metadata !{!"0x102"}), !dbg !83
   %this1 = load %class.A** %this.addr
   %0 = load %class.A** %rhs.addr, align 8, !dbg !84
   %m_int = getelementptr inbounds %class.A* %0, i32 0, i32 1, !dbg !84
@@ -81,7 +81,7 @@ define i32 @_ZN1A7get_intEv(%class.A* %this) #0 align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !86, metadata !{metadata !"0x102"}), !dbg !87
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !86, metadata !{!"0x102"}), !dbg !87
   %this1 = load %class.A** %this.addr
   %m_int = getelementptr inbounds %class.A* %this1, i32 0, i32 1, !dbg !88
   %0 = load i32* %m_int, align 4, !dbg !88
@@ -95,10 +95,10 @@ entry:
   %nrvo = alloca i1
   %cleanup.dest.slot = alloca i32
   store %class.B* %this, %class.B** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.B** %this.addr}, metadata !89, metadata !{metadata !"0x102"}), !dbg !91
+  call void @llvm.dbg.declare(metadata %class.B** %this.addr, metadata !89, metadata !{!"0x102"}), !dbg !91
   %this1 = load %class.B** %this.addr
   store i1 false, i1* %nrvo, !dbg !92
-  call void @llvm.dbg.declare(metadata !{%class.A* %agg.result}, metadata !93, metadata !{metadata !"0x102"}), !dbg !92
+  call void @llvm.dbg.declare(metadata %class.A* %agg.result, metadata !93, metadata !{!"0x102\006"}), !dbg !92
   call void @_ZN1AC1Ei(%class.A* %agg.result, i32 12), !dbg !92
   store i1 true, i1* %nrvo, !dbg !94
   store i32 1, i32* %cleanup.dest.slot
@@ -118,7 +118,7 @@ define linkonce_odr void @_ZN1AD2Ev(%class.A* %this) unnamed_addr #0 align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !101, metadata !{metadata !"0x102"}), !dbg !102
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !101, metadata !{!"0x102"}), !dbg !102
   %this1 = load %class.A** %this.addr
   ret void, !dbg !103
 }
@@ -138,12 +138,12 @@ entry:
   %cleanup.dest.slot = alloca i32
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !104, metadata !{metadata !"0x102"}), !dbg !105
+  call void @llvm.dbg.declare(metadata i32* %argc.addr, metadata !104, metadata !{!"0x102"}), !dbg !105
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !106, metadata !{metadata !"0x102"}), !dbg !105
-  call void @llvm.dbg.declare(metadata !{%class.B* %b}, metadata !107, metadata !{metadata !"0x102"}), !dbg !108
+  call void @llvm.dbg.declare(metadata i8*** %argv.addr, metadata !106, metadata !{!"0x102"}), !dbg !105
+  call void @llvm.dbg.declare(metadata %class.B* %b, metadata !107, metadata !{!"0x102"}), !dbg !108
   call void @_ZN1BC2Ev(%class.B* %b), !dbg !108
-  call void @llvm.dbg.declare(metadata !{i32* %return_val}, metadata !109, metadata !{metadata !"0x102"}), !dbg !110
+  call void @llvm.dbg.declare(metadata i32* %return_val, metadata !109, metadata !{!"0x102"}), !dbg !110
   call void @_ZN1B9AInstanceEv(%class.A* sret %temp.lvalue, %class.B* %b), !dbg !110
   %call = invoke i32 @_ZN1A7get_intEv(%class.A* %temp.lvalue)
           to label %invoke.cont unwind label %lpad, !dbg !110
@@ -151,7 +151,7 @@ entry:
 invoke.cont:                                      ; preds = %entry
   call void @_ZN1AD2Ev(%class.A* %temp.lvalue), !dbg !111
   store i32 %call, i32* %return_val, align 4, !dbg !111
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !113, metadata !{metadata !"0x102"}), !dbg !114
+  call void @llvm.dbg.declare(metadata %class.A* %a, metadata !113, metadata !{!"0x102"}), !dbg !114
   call void @_ZN1B9AInstanceEv(%class.A* sret %a, %class.B* %b), !dbg !114
   %0 = load i32* %return_val, align 4, !dbg !115
   store i32 %0, i32* %retval, !dbg !115
@@ -193,7 +193,7 @@ define linkonce_odr void @_ZN1BC2Ev(%class.B* %this) unnamed_addr #0 align 2 {
 entry:
   %this.addr = alloca %class.B*, align 8
   store %class.B* %this, %class.B** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.B** %this.addr}, metadata !123, metadata !{metadata !"0x102"}), !dbg !124
+  call void @llvm.dbg.declare(metadata %class.B** %this.addr, metadata !123, metadata !{!"0x102"}), !dbg !124
   %this1 = load %class.B** %this.addr
   ret void, !dbg !125
 }
@@ -218,7 +218,7 @@ entry:
   %exn.slot = alloca i8*
   %ehselector.slot = alloca i32
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !126, metadata !{metadata !"0x102"}), !dbg !127
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !126, metadata !{!"0x102"}), !dbg !127
   %this1 = load %class.A** %this.addr
   invoke void @_ZN1AD2Ev(%class.A* %this1)
           to label %invoke.cont unwind label %lpad, !dbg !128
@@ -263,131 +263,131 @@ attributes #7 = { builtin nounwind }
 !llvm.module.flags = !{!64, !65}
 !llvm.ident = !{!66}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 203283) (llvm/trunk 203307)\000\00\000\00sret.dwo\001", metadata !1, metadata !2, metadata !3, metadata !48, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/sret.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"sret.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !37}
-!4 = metadata !{metadata !"0x2\00A\001\00128\0064\000\000\000", metadata !1, null, null, metadata !5, metadata !"_ZTS1A", null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 128, align 64, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !13, metadata !14, metadata !19, metadata !25, metadata !29, metadata !33}
-!6 = metadata !{metadata !"0xd\00_vptr$A\000\0064\000\000\0064", metadata !1, metadata !7, metadata !8} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
-!7 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!8 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
-!9 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{metadata !"0xd\00m_int\0013\0032\0032\0064\002", metadata !1, metadata !"_ZTS1A", metadata !12} ; [ DW_TAG_member ] [m_int] [line 13, size 32, align 32, offset 64] [protected] [from int]
-!14 = metadata !{metadata !"0x2e\00A\00A\00\004\000\000\000\006\00256\000\004", metadata !1, metadata !"_ZTS1A", metadata !15, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 4] [A]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null, metadata !17, metadata !12}
-!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!19 = metadata !{metadata !"0x2e\00A\00A\00\005\000\000\000\006\00256\000\005", metadata !1, metadata !"_ZTS1A", metadata !20, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [A]
-!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!21 = metadata !{null, metadata !17, metadata !22}
-!22 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !23} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!23 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS1A]
-!25 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN1AaSERKS_\007\000\000\000\006\00256\000\007", metadata !1, metadata !"_ZTS1A", metadata !26, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [operator=]
-!26 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!27 = metadata !{metadata !22, metadata !17, metadata !22}
-!29 = metadata !{metadata !"0x2e\00~A\00~A\00\008\000\000\001\006\00256\000\008", metadata !1, metadata !"_ZTS1A", metadata !30, metadata !"_ZTS1A", null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 8] [~A]
-!30 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !31, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!31 = metadata !{null, metadata !17}
-!33 = metadata !{metadata !"0x2e\00get_int\00get_int\00_ZN1A7get_intEv\0010\000\000\000\006\00256\000\0010", metadata !1, metadata !"_ZTS1A", metadata !34, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 10] [get_int]
-!34 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !35, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!35 = metadata !{metadata !12, metadata !17}
-!37 = metadata !{metadata !"0x2\00B\0038\008\008\000\000\000", metadata !1, null, null, metadata !38, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 38, size 8, align 8, offset 0] [def] [from ]
-!38 = metadata !{metadata !39, metadata !44}
-!39 = metadata !{metadata !"0x2e\00B\00B\00\0041\000\000\000\006\00256\000\0041", metadata !1, metadata !"_ZTS1B", metadata !40, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 41] [B]
-!40 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !41, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!41 = metadata !{null, metadata !42}
-!42 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
-!44 = metadata !{metadata !"0x2e\00AInstance\00AInstance\00_ZN1B9AInstanceEv\0043\000\000\000\006\00256\000\0043", metadata !1, metadata !"_ZTS1B", metadata !45, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 43] [AInstance]
-!45 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !46, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!46 = metadata !{metadata !4, metadata !42}
-!48 = metadata !{metadata !49, metadata !50, metadata !51, metadata !52, metadata !53, metadata !54, metadata !61, metadata !62, metadata !63}
-!49 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC2Ei\0016\000\001\000\006\00256\000\0018", metadata !1, metadata !"_ZTS1A", metadata !15, null, void (%class.A*, i32)* @_ZN1AC2Ei, null, metadata !14, metadata !2} ; [ DW_TAG_subprogram ] [line 16] [def] [scope 18] [A]
-!50 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC2ERKS_\0021\000\001\000\006\00256\000\0023", metadata !1, metadata !"_ZTS1A", metadata !20, null, void (%class.A*, %class.A*)* @_ZN1AC2ERKS_, null, metadata !19, metadata !2} ; [ DW_TAG_subprogram ] [line 21] [def] [scope 23] [A]
-!51 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN1AaSERKS_\0027\000\001\000\006\00256\000\0028", metadata !1, metadata !"_ZTS1A", metadata !26, null, %class.A* (%class.A*, %class.A*)* @_ZN1AaSERKS_, null, metadata !25, metadata !2} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [operator=]
-!52 = metadata !{metadata !"0x2e\00get_int\00get_int\00_ZN1A7get_intEv\0033\000\001\000\006\00256\000\0034", metadata !1, metadata !"_ZTS1A", metadata !34, null, i32 (%class.A*)* @_ZN1A7get_intEv, null, metadata !33, metadata !2} ; [ DW_TAG_subprogram ] [line 33] [def] [scope 34] [get_int]
-!53 = metadata !{metadata !"0x2e\00AInstance\00AInstance\00_ZN1B9AInstanceEv\0047\000\001\000\006\00256\000\0048", metadata !1, metadata !"_ZTS1B", metadata !45, null, void (%class.A*, %class.B*)* @_ZN1B9AInstanceEv, null, metadata !44, metadata !2} ; [ DW_TAG_subprogram ] [line 47] [def] [scope 48] [AInstance]
-!54 = metadata !{metadata !"0x2e\00main\00main\00\0053\000\001\000\006\00256\000\0054", metadata !1, metadata !7, metadata !55, null, i32 (i32, i8**)* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 53] [def] [scope 54] [main]
-!55 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !56, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!56 = metadata !{metadata !12, metadata !12, metadata !57}
-!57 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !58} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!58 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !59} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!59 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !60} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
-!60 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!61 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD0Ev\008\000\001\000\006\00256\000\008", metadata !1, metadata !"_ZTS1A", metadata !30, null, void (%class.A*)* @_ZN1AD0Ev, null, metadata !29, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [def] [~A]
-!62 = metadata !{metadata !"0x2e\00B\00B\00_ZN1BC2Ev\0041\000\001\000\006\00256\000\0041", metadata !1, metadata !"_ZTS1B", metadata !40, null, void (%class.B*)* @_ZN1BC2Ev, null, metadata !39, metadata !2} ; [ DW_TAG_subprogram ] [line 41] [def] [B]
-!63 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD2Ev\008\000\001\000\006\00256\000\008", metadata !1, metadata !"_ZTS1A", metadata !30, null, void (%class.A*)* @_ZN1AD2Ev, null, metadata !29, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [def] [~A]
-!64 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!65 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!66 = metadata !{metadata !"clang version 3.5.0 (trunk 203283) (llvm/trunk 203307)"}
-!67 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !49, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!68 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
-!69 = metadata !{i32 0, i32 0, metadata !49, null}
-!70 = metadata !{metadata !"0x101\00i\0033554448\000", metadata !49, metadata !7, metadata !12} ; [ DW_TAG_arg_variable ] [i] [line 16]
-!71 = metadata !{i32 16, i32 0, metadata !49, null}
-!72 = metadata !{i32 18, i32 0, metadata !49, null}
-!73 = metadata !{i32 19, i32 0, metadata !49, null}
-!74 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !50, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!75 = metadata !{i32 0, i32 0, metadata !50, null}
-!76 = metadata !{metadata !"0x101\00rhs\0033554453\000", metadata !50, metadata !7, metadata !22} ; [ DW_TAG_arg_variable ] [rhs] [line 21]
-!77 = metadata !{i32 21, i32 0, metadata !50, null}
-!78 = metadata !{i32 23, i32 0, metadata !50, null}
-!79 = metadata !{i32 24, i32 0, metadata !50, null}
-!80 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !51, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!81 = metadata !{i32 0, i32 0, metadata !51, null}
-!82 = metadata !{metadata !"0x101\00rhs\0033554459\000", metadata !51, metadata !7, metadata !22} ; [ DW_TAG_arg_variable ] [rhs] [line 27]
-!83 = metadata !{i32 27, i32 0, metadata !51, null}
-!84 = metadata !{i32 29, i32 0, metadata !51, null}
-!85 = metadata !{i32 30, i32 0, metadata !51, null}
-!86 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !52, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!87 = metadata !{i32 0, i32 0, metadata !52, null}
-!88 = metadata !{i32 35, i32 0, metadata !52, null}
-!89 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !53, null, metadata !90} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!90 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
-!91 = metadata !{i32 0, i32 0, metadata !53, null}
-!92 = metadata !{i32 49, i32 0, metadata !53, null}
-!93 = metadata !{metadata !"0x100\00a\0049\008192", metadata !53, metadata !7, metadata !4} ; [ DW_TAG_auto_variable ] [a] [line 49]
-!94 = metadata !{i32 50, i32 0, metadata !53, null}
-!95 = metadata !{i32 51, i32 0, metadata !53, null}
-!96 = metadata !{i32 51, i32 0, metadata !97, null}
-!97 = metadata !{metadata !"0xb\0051\000\002", metadata !1, metadata !53} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!98 = metadata !{i32 51, i32 0, metadata !99, null}
-!99 = metadata !{metadata !"0xb\0051\000\003", metadata !1, metadata !100} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!100 = metadata !{metadata !"0xb\0051\000\001", metadata !1, metadata !53} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!101 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !63, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!102 = metadata !{i32 0, i32 0, metadata !63, null}
-!103 = metadata !{i32 8, i32 0, metadata !63, null}
-!104 = metadata !{metadata !"0x101\00argc\0016777269\000", metadata !54, metadata !7, metadata !12} ; [ DW_TAG_arg_variable ] [argc] [line 53]
-!105 = metadata !{i32 53, i32 0, metadata !54, null}
-!106 = metadata !{metadata !"0x101\00argv\0033554485\000", metadata !54, metadata !7, metadata !57} ; [ DW_TAG_arg_variable ] [argv] [line 53]
-!107 = metadata !{metadata !"0x100\00b\0055\000", metadata !54, metadata !7, metadata !37} ; [ DW_TAG_auto_variable ] [b] [line 55]
-!108 = metadata !{i32 55, i32 0, metadata !54, null}
-!109 = metadata !{metadata !"0x100\00return_val\0056\000", metadata !54, metadata !7, metadata !12} ; [ DW_TAG_auto_variable ] [return_val] [line 56]
-!110 = metadata !{i32 56, i32 0, metadata !54, null}
-!111 = metadata !{i32 56, i32 0, metadata !112, null}
-!112 = metadata !{metadata !"0xb\0056\000\001", metadata !1, metadata !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!113 = metadata !{metadata !"0x100\00a\0058\000", metadata !54, metadata !7, metadata !4} ; [ DW_TAG_auto_variable ] [a] [line 58]
-!114 = metadata !{i32 58, i32 0, metadata !54, null}
-!115 = metadata !{i32 59, i32 0, metadata !54, null}
-!116 = metadata !{i32 60, i32 0, metadata !54, null}
-!117 = metadata !{i32 60, i32 0, metadata !118, null}
-!118 = metadata !{metadata !"0xb\0060\000\001", metadata !1, metadata !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!119 = metadata !{i32 60, i32 0, metadata !120, null}
-!120 = metadata !{metadata !"0xb\0060\000\003", metadata !1, metadata !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!121 = metadata !{i32 60, i32 0, metadata !122, null}
-!122 = metadata !{metadata !"0xb\0060\000\002", metadata !1, metadata !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!123 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !62, null, metadata !90} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!124 = metadata !{i32 0, i32 0, metadata !62, null}
-!125 = metadata !{i32 41, i32 0, metadata !62, null}
-!126 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !61, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!127 = metadata !{i32 0, i32 0, metadata !61, null}
-!128 = metadata !{i32 8, i32 0, metadata !61, null}
-!129 = metadata !{i32 8, i32 0, metadata !130, null}
-!130 = metadata !{metadata !"0xb\008\000\001", metadata !1, metadata !61} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!131 = metadata !{i32 8, i32 0, metadata !132, null}
-!132 = metadata !{metadata !"0xb\008\000\002", metadata !1, metadata !61} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!133 = metadata !{i32 8, i32 0, metadata !134, null}
-!134 = metadata !{metadata !"0xb\008\000\003", metadata !1, metadata !61} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!0 = !{!"0x11\004\00clang version 3.5.0 (trunk 203283) (llvm/trunk 203307)\000\00\000\00sret.dwo\001", !1, !2, !3, !48, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/sret.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"sret.cpp", !"/usr/local/google/home/echristo/tmp"}
+!2 = !{}
+!3 = !{!4, !37}
+!4 = !{!"0x2\00A\001\00128\0064\000\000\000", !1, null, null, !5, !"_ZTS1A", null, !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 128, align 64, offset 0] [def] [from ]
+!5 = !{!6, !13, !14, !19, !25, !29, !33}
+!6 = !{!"0xd\00_vptr$A\000\0064\000\000\0064", !1, !7, !8} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
+!7 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!8 = !{!"0xf\00\000\0064\000\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
+!9 = !{!"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, !10} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
+!10 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{!12}
+!12 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = !{!"0xd\00m_int\0013\0032\0032\0064\002", !1, !"_ZTS1A", !12} ; [ DW_TAG_member ] [m_int] [line 13, size 32, align 32, offset 64] [protected] [from int]
+!14 = !{!"0x2e\00A\00A\00\004\000\000\000\006\00256\000\004", !1, !"_ZTS1A", !15, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 4] [A]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !17, !12}
+!17 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!19 = !{!"0x2e\00A\00A\00\005\000\000\000\006\00256\000\005", !1, !"_ZTS1A", !20, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [A]
+!20 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = !{null, !17, !22}
+!22 = !{!"0x10\00\000\000\000\000\000", null, null, !23} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = !{!"0x26\00\000\000\000\000\000", null, null, !"_ZTS1A"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS1A]
+!25 = !{!"0x2e\00operator=\00operator=\00_ZN1AaSERKS_\007\000\000\000\006\00256\000\007", !1, !"_ZTS1A", !26, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [operator=]
+!26 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = !{!22, !17, !22}
+!29 = !{!"0x2e\00~A\00~A\00\008\000\000\001\006\00256\000\008", !1, !"_ZTS1A", !30, !"_ZTS1A", null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 8] [~A]
+!30 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !31, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!31 = !{null, !17}
+!33 = !{!"0x2e\00get_int\00get_int\00_ZN1A7get_intEv\0010\000\000\000\006\00256\000\0010", !1, !"_ZTS1A", !34, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 10] [get_int]
+!34 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !35, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!35 = !{!12, !17}
+!37 = !{!"0x2\00B\0038\008\008\000\000\000", !1, null, null, !38, null, null, !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 38, size 8, align 8, offset 0] [def] [from ]
+!38 = !{!39, !44}
+!39 = !{!"0x2e\00B\00B\00\0041\000\000\000\006\00256\000\0041", !1, !"_ZTS1B", !40, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 41] [B]
+!40 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !41, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!41 = !{null, !42}
+!42 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
+!44 = !{!"0x2e\00AInstance\00AInstance\00_ZN1B9AInstanceEv\0043\000\000\000\006\00256\000\0043", !1, !"_ZTS1B", !45, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 43] [AInstance]
+!45 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !46, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!46 = !{!4, !42}
+!48 = !{!49, !50, !51, !52, !53, !54, !61, !62, !63}
+!49 = !{!"0x2e\00A\00A\00_ZN1AC2Ei\0016\000\001\000\006\00256\000\0018", !1, !"_ZTS1A", !15, null, void (%class.A*, i32)* @_ZN1AC2Ei, null, !14, !2} ; [ DW_TAG_subprogram ] [line 16] [def] [scope 18] [A]
+!50 = !{!"0x2e\00A\00A\00_ZN1AC2ERKS_\0021\000\001\000\006\00256\000\0023", !1, !"_ZTS1A", !20, null, void (%class.A*, %class.A*)* @_ZN1AC2ERKS_, null, !19, !2} ; [ DW_TAG_subprogram ] [line 21] [def] [scope 23] [A]
+!51 = !{!"0x2e\00operator=\00operator=\00_ZN1AaSERKS_\0027\000\001\000\006\00256\000\0028", !1, !"_ZTS1A", !26, null, %class.A* (%class.A*, %class.A*)* @_ZN1AaSERKS_, null, !25, !2} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [operator=]
+!52 = !{!"0x2e\00get_int\00get_int\00_ZN1A7get_intEv\0033\000\001\000\006\00256\000\0034", !1, !"_ZTS1A", !34, null, i32 (%class.A*)* @_ZN1A7get_intEv, null, !33, !2} ; [ DW_TAG_subprogram ] [line 33] [def] [scope 34] [get_int]
+!53 = !{!"0x2e\00AInstance\00AInstance\00_ZN1B9AInstanceEv\0047\000\001\000\006\00256\000\0048", !1, !"_ZTS1B", !45, null, void (%class.A*, %class.B*)* @_ZN1B9AInstanceEv, null, !44, !2} ; [ DW_TAG_subprogram ] [line 47] [def] [scope 48] [AInstance]
+!54 = !{!"0x2e\00main\00main\00\0053\000\001\000\006\00256\000\0054", !1, !7, !55, null, i32 (i32, i8**)* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 53] [def] [scope 54] [main]
+!55 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !56, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!56 = !{!12, !12, !57}
+!57 = !{!"0xf\00\000\0064\0064\000\000", null, null, !58} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!58 = !{!"0xf\00\000\0064\0064\000\000", null, null, !59} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!59 = !{!"0x26\00\000\000\000\000\000", null, null, !60} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
+!60 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!61 = !{!"0x2e\00~A\00~A\00_ZN1AD0Ev\008\000\001\000\006\00256\000\008", !1, !"_ZTS1A", !30, null, void (%class.A*)* @_ZN1AD0Ev, null, !29, !2} ; [ DW_TAG_subprogram ] [line 8] [def] [~A]
+!62 = !{!"0x2e\00B\00B\00_ZN1BC2Ev\0041\000\001\000\006\00256\000\0041", !1, !"_ZTS1B", !40, null, void (%class.B*)* @_ZN1BC2Ev, null, !39, !2} ; [ DW_TAG_subprogram ] [line 41] [def] [B]
+!63 = !{!"0x2e\00~A\00~A\00_ZN1AD2Ev\008\000\001\000\006\00256\000\008", !1, !"_ZTS1A", !30, null, void (%class.A*)* @_ZN1AD2Ev, null, !29, !2} ; [ DW_TAG_subprogram ] [line 8] [def] [~A]
+!64 = !{i32 2, !"Dwarf Version", i32 4}
+!65 = !{i32 1, !"Debug Info Version", i32 2}
+!66 = !{!"clang version 3.5.0 (trunk 203283) (llvm/trunk 203307)"}
+!67 = !{!"0x101\00this\0016777216\001088", !49, null, !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!68 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!69 = !MDLocation(line: 0, scope: !49)
+!70 = !{!"0x101\00i\0033554448\000", !49, !7, !12} ; [ DW_TAG_arg_variable ] [i] [line 16]
+!71 = !MDLocation(line: 16, scope: !49)
+!72 = !MDLocation(line: 18, scope: !49)
+!73 = !MDLocation(line: 19, scope: !49)
+!74 = !{!"0x101\00this\0016777216\001088", !50, null, !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!75 = !MDLocation(line: 0, scope: !50)
+!76 = !{!"0x101\00rhs\0033554453\000", !50, !7, !22} ; [ DW_TAG_arg_variable ] [rhs] [line 21]
+!77 = !MDLocation(line: 21, scope: !50)
+!78 = !MDLocation(line: 23, scope: !50)
+!79 = !MDLocation(line: 24, scope: !50)
+!80 = !{!"0x101\00this\0016777216\001088", !51, null, !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!81 = !MDLocation(line: 0, scope: !51)
+!82 = !{!"0x101\00rhs\0033554459\000", !51, !7, !22} ; [ DW_TAG_arg_variable ] [rhs] [line 27]
+!83 = !MDLocation(line: 27, scope: !51)
+!84 = !MDLocation(line: 29, scope: !51)
+!85 = !MDLocation(line: 30, scope: !51)
+!86 = !{!"0x101\00this\0016777216\001088", !52, null, !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!87 = !MDLocation(line: 0, scope: !52)
+!88 = !MDLocation(line: 35, scope: !52)
+!89 = !{!"0x101\00this\0016777216\001088", !53, null, !90} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!90 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
+!91 = !MDLocation(line: 0, scope: !53)
+!92 = !MDLocation(line: 49, scope: !53)
+!93 = !{!"0x100\00a\0049\000", !53, !7, !4} ; [ DW_TAG_auto_variable ] [a] [line 49]
+!94 = !MDLocation(line: 50, scope: !53)
+!95 = !MDLocation(line: 51, scope: !53)
+!96 = !MDLocation(line: 51, scope: !97)
+!97 = !{!"0xb\0051\000\002", !1, !53} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!98 = !MDLocation(line: 51, scope: !99)
+!99 = !{!"0xb\0051\000\003", !1, !100} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!100 = !{!"0xb\0051\000\001", !1, !53} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!101 = !{!"0x101\00this\0016777216\001088", !63, null, !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!102 = !MDLocation(line: 0, scope: !63)
+!103 = !MDLocation(line: 8, scope: !63)
+!104 = !{!"0x101\00argc\0016777269\000", !54, !7, !12} ; [ DW_TAG_arg_variable ] [argc] [line 53]
+!105 = !MDLocation(line: 53, scope: !54)
+!106 = !{!"0x101\00argv\0033554485\000", !54, !7, !57} ; [ DW_TAG_arg_variable ] [argv] [line 53]
+!107 = !{!"0x100\00b\0055\000", !54, !7, !37} ; [ DW_TAG_auto_variable ] [b] [line 55]
+!108 = !MDLocation(line: 55, scope: !54)
+!109 = !{!"0x100\00return_val\0056\000", !54, !7, !12} ; [ DW_TAG_auto_variable ] [return_val] [line 56]
+!110 = !MDLocation(line: 56, scope: !54)
+!111 = !MDLocation(line: 56, scope: !112)
+!112 = !{!"0xb\0056\000\001", !1, !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!113 = !{!"0x100\00a\0058\000", !54, !7, !4} ; [ DW_TAG_auto_variable ] [a] [line 58]
+!114 = !MDLocation(line: 58, scope: !54)
+!115 = !MDLocation(line: 59, scope: !54)
+!116 = !MDLocation(line: 60, scope: !54)
+!117 = !MDLocation(line: 60, scope: !118)
+!118 = !{!"0xb\0060\000\001", !1, !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!119 = !MDLocation(line: 60, scope: !120)
+!120 = !{!"0xb\0060\000\003", !1, !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!121 = !MDLocation(line: 60, scope: !122)
+!122 = !{!"0xb\0060\000\002", !1, !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!123 = !{!"0x101\00this\0016777216\001088", !62, null, !90} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!124 = !MDLocation(line: 0, scope: !62)
+!125 = !MDLocation(line: 41, scope: !62)
+!126 = !{!"0x101\00this\0016777216\001088", !61, null, !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!127 = !MDLocation(line: 0, scope: !61)
+!128 = !MDLocation(line: 8, scope: !61)
+!129 = !MDLocation(line: 8, scope: !130)
+!130 = !{!"0xb\008\000\001", !1, !61} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!131 = !MDLocation(line: 8, scope: !132)
+!132 = !{!"0xb\008\000\002", !1, !61} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!133 = !MDLocation(line: 8, scope: !134)
+!134 = !{!"0xb\008\000\003", !1, !61} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
diff --git a/test/DebugInfo/X86/sroasplit-1.ll b/test/DebugInfo/X86/sroasplit-1.ll
new file mode 100644
index 0000000..ad0689b
--- /dev/null
+++ b/test/DebugInfo/X86/sroasplit-1.ll
@@ -0,0 +1,97 @@
+; RUN: opt %s -sroa -verify -S -o - | FileCheck %s
+;
+; Test that we can partial emit debug info for aggregates repeatedly
+; split up by SROA.
+;
+;    // Compile with -O1
+;    typedef struct {
+;      int a;
+;      long int b;
+;    } Inner;
+;
+;    typedef struct {
+;      Inner inner[2];
+;    } Outer;
+;
+;    int foo(Outer outer) {
+;      Inner i1 = outer.inner[1];
+;      return i1.a;
+;    }
+;
+
+; Verify that SROA creates a variable piece when splitting i1.
+; CHECK: %[[I1:.*]] = alloca [12 x i8], align 4
+; CHECK: call void @llvm.dbg.declare(metadata [12 x i8]* %[[I1]], metadata ![[VAR:[0-9]+]], metadata ![[PIECE1:[0-9]+]])
+; CHECK: call void @llvm.dbg.value(metadata i32 %[[A:.*]], i64 0, metadata ![[VAR]], metadata ![[PIECE2:[0-9]+]])
+; CHECK: ret i32 %[[A]]
+; Read Var and Piece:
+; CHECK: ![[VAR]] = {{.*}} ; [ DW_TAG_auto_variable ] [i1] [line 11]
+; CHECK: ![[PIECE1]] = {{.*}} ; [ DW_TAG_expression ] [DW_OP_bit_piece offset=32, size=96]
+; CHECK: ![[PIECE2]] = {{.*}} ; [ DW_TAG_expression ] [DW_OP_bit_piece offset=0, size=32]
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+%struct.Outer = type { [2 x %struct.Inner] }
+%struct.Inner = type { i32, i64 }
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo(%struct.Outer* byval align 8 %outer) #0 {
+entry:
+  %i1 = alloca %struct.Inner, align 8
+  call void @llvm.dbg.declare(metadata %struct.Outer* %outer, metadata !25, metadata !2), !dbg !26
+  call void @llvm.dbg.declare(metadata %struct.Inner* %i1, metadata !27, metadata !2), !dbg !28
+  %inner = getelementptr inbounds %struct.Outer* %outer, i32 0, i32 0, !dbg !28
+  %arrayidx = getelementptr inbounds [2 x %struct.Inner]* %inner, i32 0, i64 1, !dbg !28
+  %0 = bitcast %struct.Inner* %i1 to i8*, !dbg !28
+  %1 = bitcast %struct.Inner* %arrayidx to i8*, !dbg !28
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 8, i1 false), !dbg !28
+  %a = getelementptr inbounds %struct.Inner* %i1, i32 0, i32 0, !dbg !29
+  %2 = load i32* %a, align 4, !dbg !29
+  ret i32 %2, !dbg !29
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #2
+
+attributes #0 = { nounwind ssp uwtable }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22, !23}
+!llvm.ident = !{!24}
+
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [ DW_TAG_compile_unit ] [sroasplit-1.c] [DW_LANG_C99]
+!1 = !{!"sroasplit-1.c", !""}
+!2 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\0010\000\001\000\006\00256\000\0010", !1, !5, !6, null, i32 (%struct.Outer*)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [ DW_TAG_subprogram ] [line 10] [def] [foo]
+!5 = !{!"0x29", !1} ; [ DW_TAG_file_type ] [ DW_TAG_file_type ] [sroasplit-1.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !9}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x16\00Outer\008\000\000\000\000", !1, null, !10} ; [ DW_TAG_typedef ] [ DW_TAG_typedef ] [Outer] [line 8, size 0, align 0, offset 0] [from ]
+!10 = !{!"0x13\00\006\00256\0064\000\000\000", !1, null, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [line 6, size 256, align 64, offset 0] [def] [from ]
+!11 = !{!12}
+!12 = !{!"0xd\00inner\007\00256\0064\000\000", !1, !10, !13} ; [ DW_TAG_member ] [inner] [line 7, size 256, align 64, offset 0] [from ]
+!13 = !{!"0x1\00\000\00256\0064\000\000", null, null, !14, !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from Inner]
+!14 = !{!"0x16\00Inner\004\000\000\000\000", !1, null, !15} ; [ DW_TAG_typedef ] [ DW_TAG_typedef ] [Inner] [line 4, size 0, align 0, offset 0] [from ]
+!15 = !{!"0x13\00\001\00128\0064\000\000\000", !1, null, null, !16, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 128, align 64, offset 0] [def] [from ]
+!16 = !{!17, !18}
+!17 = !{!"0xd\00a\002\0032\0032\000\000", !1, !15, !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!18 = !{!"0xd\00b\003\0064\0064\0064\000", !1, !15, !19} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from long int]
+!19 = !{!"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!20 = !{!21}
+!21 = !{!"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
+!22 = !{i32 2, !"Dwarf Version", i32 2}
+!23 = !{i32 1, !"Debug Info Version", i32 2}
+!24 = !{!"clang version 3.5.0 "}
+!25 = !{!"0x101\00outer\0016777226\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [ DW_TAG_arg_variable ] [outer] [line 10]
+!26 = !MDLocation(line: 10, scope: !4)
+!27 = !{!"0x100\00i1\0011\000", !4, !5, !14} ; [ DW_TAG_auto_variable ] [ DW_TAG_auto_variable ] [i1] [line 11]
+!28 = !MDLocation(line: 11, scope: !4)
+!29 = !MDLocation(line: 12, scope: !4)
diff --git a/test/DebugInfo/X86/sroasplit-2.ll b/test/DebugInfo/X86/sroasplit-2.ll
new file mode 100644
index 0000000..e8226d9
--- /dev/null
+++ b/test/DebugInfo/X86/sroasplit-2.ll
@@ -0,0 +1,102 @@
+; RUN: opt %s -sroa -verify -S -o - | FileCheck %s
+;
+; Test that we can partial emit debug info for aggregates repeatedly
+; split up by SROA.
+;
+;    // Compile with -O1
+;    typedef struct {
+;      int a;
+;      int b;
+;    } Inner;
+;
+;    typedef struct {
+;      Inner inner[2];
+;    } Outer;
+;
+;    int foo(Outer outer) {
+;      Inner i1 = outer.inner[1];
+;      return i1.a;
+;    }
+;
+
+; Verify that SROA creates a variable piece when splitting i1.
+; CHECK:  call void @llvm.dbg.value(metadata i64 %outer.coerce0, i64 0, metadata ![[O:[0-9]+]], metadata ![[PIECE1:[0-9]+]]),
+; CHECK:  call void @llvm.dbg.value(metadata i64 %outer.coerce1, i64 0, metadata ![[O]], metadata ![[PIECE2:[0-9]+]]),
+; CHECK:  call void @llvm.dbg.value({{.*}}, i64 0, metadata ![[I1:[0-9]+]], metadata ![[PIECE3:[0-9]+]]),
+; CHECK-DAG: ![[O]] = {{.*}} [ DW_TAG_arg_variable ] [outer] [line 10]
+; CHECK-DAG: ![[PIECE1]] = {{.*}} [ DW_TAG_expression ] [DW_OP_bit_piece offset=0, size=64]
+; CHECK-DAG: ![[PIECE2]] = {{.*}} [ DW_TAG_expression ] [DW_OP_bit_piece offset=64, size=64]
+; CHECK-DAG: ![[I1]] = {{.*}} [ DW_TAG_auto_variable ] [i1] [line 11]
+; CHECK-DAG: ![[PIECE3]] = {{.*}} [ DW_TAG_expression ] [DW_OP_bit_piece offset=0, size=32]
+
+; ModuleID = 'sroasplit-2.c'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+%struct.Outer = type { [2 x %struct.Inner] }
+%struct.Inner = type { i32, i32 }
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo(i64 %outer.coerce0, i64 %outer.coerce1) #0 {
+  %outer = alloca %struct.Outer, align 8
+  %i1 = alloca %struct.Inner, align 4
+  %1 = bitcast %struct.Outer* %outer to { i64, i64 }*
+  %2 = getelementptr { i64, i64 }* %1, i32 0, i32 0
+  store i64 %outer.coerce0, i64* %2
+  %3 = getelementptr { i64, i64 }* %1, i32 0, i32 1
+  store i64 %outer.coerce1, i64* %3
+  call void @llvm.dbg.declare(metadata %struct.Outer* %outer, metadata !24, metadata !2), !dbg !25
+  call void @llvm.dbg.declare(metadata %struct.Inner* %i1, metadata !26, metadata !2), !dbg !27
+  %4 = getelementptr inbounds %struct.Outer* %outer, i32 0, i32 0, !dbg !27
+  %5 = getelementptr inbounds [2 x %struct.Inner]* %4, i32 0, i64 1, !dbg !27
+  %6 = bitcast %struct.Inner* %i1 to i8*, !dbg !27
+  %7 = bitcast %struct.Inner* %5 to i8*, !dbg !27
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* %7, i64 8, i32 4, i1 false), !dbg !27
+  %8 = getelementptr inbounds %struct.Inner* %i1, i32 0, i32 0, !dbg !28
+  %9 = load i32* %8, align 4, !dbg !28
+  ret i32 %9, !dbg !28
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #2
+
+attributes #0 = { nounwind ssp uwtable "no-frame-pointer-elim"="true" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21, !22}
+!llvm.ident = !{!23}
+
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [ DW_TAG_compile_unit ] [sroasplit-2.c] [DW_LANG_C99]
+!1 = !{!"sroasplit-2.c", !""}
+!2 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\0010\000\001\000\006\00256\000\0010", !1, !5, !6, null, i32 (i64, i64)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [ DW_TAG_subprogram ] [line 10] [def] [foo]
+!5 = !{!"0x29", !1} ; [ DW_TAG_file_type ] [ DW_TAG_file_type ] [sroasplit-2.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !9}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x16\00Outer\008\000\000\000\000", !1, null, !10} ; [ DW_TAG_typedef ] [ DW_TAG_typedef ] [Outer] [line 8, size 0, align 0, offset 0] [from ]
+!10 = !{!"0x13\00\006\00128\0032\000\000\000", !1, null, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [line 6, size 128, align 32, offset 0] [def] [from ]
+!11 = !{!12}
+!12 = !{!"0xd\00inner\007\00128\0032\000\000", !1, !10, !13} ; [ DW_TAG_member ] [inner] [line 7, size 128, align 32, offset 0] [from ]
+!13 = !{!"0x1\00\000\00128\0032\000\000", null, null, !14, !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from Inner]
+!14 = !{!"0x16\00Inner\004\000\000\000\000", !1, null, !15} ; [ DW_TAG_typedef ] [ DW_TAG_typedef ] [Inner] [line 4, size 0, align 0, offset 0] [from ]
+!15 = !{!"0x13\00\001\0064\0032\000\000\000", !1, null, null, !16, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 64, align 32, offset 0] [def] [from ]
+!16 = !{!17, !18}
+!17 = !{!"0xd\00a\002\0032\0032\000\000", !1, !15, !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!18 = !{!"0xd\00b\003\0032\0032\0032\000", !1, !15, !8} ; [ DW_TAG_member ] [b] [line 3, size 32, align 32, offset 32] [from int]
+!19 = !{!20}
+!20 = !{!"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
+!21 = !{i32 2, !"Dwarf Version", i32 2}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
+!23 = !{!"clang version 3.5.0 "}
+!24 = !{!"0x101\00outer\0016777226\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [ DW_TAG_arg_variable ] [outer] [line 10]
+!25 = !MDLocation(line: 10, scope: !4)
+!26 = !{!"0x100\00i1\0011\000", !4, !5, !14} ; [ DW_TAG_auto_variable ] [ DW_TAG_auto_variable ] [i1] [line 11]
+!27 = !MDLocation(line: 11, scope: !4)
+!28 = !MDLocation(line: 12, scope: !4)
diff --git a/test/DebugInfo/X86/sroasplit-3.ll b/test/DebugInfo/X86/sroasplit-3.ll
new file mode 100644
index 0000000..e51097a
--- /dev/null
+++ b/test/DebugInfo/X86/sroasplit-3.ll
@@ -0,0 +1,63 @@
+; RUN: opt %s -sroa -verify -S -o - | FileCheck %s
+; ModuleID = 'test.c'
+; Test that SROA updates the debug info correctly if an alloca was rewritten but
+; not partitioned into multiple allocas.
+;
+; CHECK: call void @llvm.dbg.value(metadata float %s.coerce, i64 0, metadata ![[VAR:[0-9]+]], metadata ![[EXPR:[0-9]+]])
+; CHECK: ![[VAR]] = {{.*}} [ DW_TAG_arg_variable ] [s] [line 3]
+; CHECK: ![[EXPR]] = {{.*}} [ DW_TAG_expression ]
+; CHECK-NOT: DW_OP_bit_piece
+
+;
+; struct S { float f; };
+;  
+; float foo(struct S s) {
+;   return s.f;
+; }
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+%struct.S = type { float }
+
+; Function Attrs: nounwind ssp uwtable
+define float @foo(float %s.coerce) #0 {
+entry:
+  %s = alloca %struct.S, align 4
+  %coerce.dive = getelementptr %struct.S* %s, i32 0, i32 0
+  store float %s.coerce, float* %coerce.dive, align 1
+  call void @llvm.dbg.declare(metadata %struct.S* %s, metadata !16, metadata !17), !dbg !18
+  %f = getelementptr inbounds %struct.S* %s, i32 0, i32 0, !dbg !19
+  %0 = load float* %f, align 4, !dbg !19
+  ret float %0, !dbg !19
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12, !13, !14}
+!llvm.ident = !{!15}
+
+!0 = !{!"0x11\0012\00clang version 3.6.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/Volumes/Data/llvm/_build.ninja.debug/test.c] [DW_LANG_C99]
+!1 = !{!"test.c", !"/Volumes/Data/llvm/_build.ninja.debug"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\003\000\001\000\000\00256\000\003", !1, !5, !6, null, float (float)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
+!5 = !{!"0x29", !1}                               ; [ DW_TAG_file_type ] [/Volumes/Data/llvm/_build.ninja.debug/test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !9}
+!8 = !{!"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!9 = !{!"0x13\00S\001\0032\0032\000\000\000", !1, null, null, !10, null, null, null} ; [ DW_TAG_structure_type ] [S] [line 1, size 32, align 32, offset 0] [def] [from ]
+!10 = !{!11}
+!11 = !{!"0xd\00f\001\0032\0032\000\000", !1, !9, !8} ; [ DW_TAG_member ] [f] [line 1, size 32, align 32, offset 0] [from float]
+!12 = !{i32 2, !"Dwarf Version", i32 2}
+!13 = !{i32 2, !"Debug Info Version", i32 2}
+!14 = !{i32 1, !"PIC Level", i32 2}
+!15 = !{!"clang version 3.6.0 "}
+!16 = !{!"0x101\00s\0016777219\000", !4, !5, !9}  ; [ DW_TAG_arg_variable ] [s] [line 3]
+!17 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!18 = !MDLocation(line: 3, column: 20, scope: !4)
+!19 = !MDLocation(line: 4, column: 2, scope: !4)
diff --git a/test/DebugInfo/X86/sroasplit-4.ll b/test/DebugInfo/X86/sroasplit-4.ll
new file mode 100644
index 0000000..b043476
--- /dev/null
+++ b/test/DebugInfo/X86/sroasplit-4.ll
@@ -0,0 +1,146 @@
+; RUN: opt -sroa < %s -S -o - | FileCheck %s
+;
+; Test that recursively splitting an alloca updates the debug info correctly.
+; CHECK: %[[T:.*]] = load i64* @t, align 8
+; CHECK: call void @llvm.dbg.value(metadata i64 %[[T]], i64 0, metadata ![[Y:.*]], metadata ![[P1:.*]])
+; CHECK: %[[T1:.*]] = load i64* @t, align 8
+; CHECK: call void @llvm.dbg.value(metadata i64 %[[T1]], i64 0, metadata ![[Y]], metadata ![[P2:.*]])
+; CHECK: call void @llvm.dbg.value(metadata i64 %[[T]], i64 0, metadata ![[R:.*]], metadata ![[P3:.*]])
+; CHECK: call void @llvm.dbg.value(metadata i64 %[[T1]], i64 0, metadata ![[R]], metadata ![[P4:.*]])
+; CHECK: ![[P1]] = {{.*}} [DW_OP_bit_piece offset=0, size=64]
+; CHECK: ![[P2]] = {{.*}} [DW_OP_bit_piece offset=64, size=64]
+; CHECK: ![[P3]] = {{.*}} [DW_OP_bit_piece offset=192, size=64]
+; CHECK: ![[P4]] = {{.*}} [DW_OP_bit_piece offset=256, size=64]
+; 
+; struct p {
+;   __SIZE_TYPE__ s;
+;   __SIZE_TYPE__ t;
+; };
+;  
+; struct r {
+;   int i;
+;   struct p x;
+;   struct p y;
+; };
+;  
+; extern int call_me(struct r);
+; extern int maybe();
+; extern __SIZE_TYPE__ t;
+;  
+; int test() {
+;   if (maybe())
+;     return 0;
+;   struct p y = {t, t};
+;   struct r r = {.y = y};
+;   return call_me(r);
+; }
+
+; ModuleID = 'pr22393.cc'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin"
+
+%struct.p = type { i64, i64 }
+%struct.r = type { i32, %struct.p, %struct.p }
+
+@t = external global i64
+
+; Function Attrs: nounwind
+define i32 @_Z4testv() #0 {
+entry:
+  %retval = alloca i32, align 4
+  %y = alloca %struct.p, align 8
+  %r = alloca %struct.r, align 8
+  %agg.tmp = alloca %struct.r, align 8
+  %call = call i32 @_Z5maybev(), !dbg !24
+  %tobool = icmp ne i32 %call, 0, !dbg !24
+  br i1 %tobool, label %if.then, label %if.end, !dbg !26
+
+if.then:                                          ; preds = %entry
+  store i32 0, i32* %retval, !dbg !27
+  br label %return, !dbg !27
+
+if.end:                                           ; preds = %entry
+  call void @llvm.dbg.declare(metadata %struct.p* %y, metadata !28, metadata !29), !dbg !30
+  %s = getelementptr inbounds %struct.p* %y, i32 0, i32 0, !dbg !30
+  %0 = load i64* @t, align 8, !dbg !30
+  store i64 %0, i64* %s, align 8, !dbg !30
+  %t = getelementptr inbounds %struct.p* %y, i32 0, i32 1, !dbg !30
+  %1 = load i64* @t, align 8, !dbg !30
+  store i64 %1, i64* %t, align 8, !dbg !30
+  call void @llvm.dbg.declare(metadata %struct.r* %r, metadata !31, metadata !29), !dbg !32
+  %i = getelementptr inbounds %struct.r* %r, i32 0, i32 0, !dbg !32
+  store i32 0, i32* %i, align 4, !dbg !32
+  %x = getelementptr inbounds %struct.r* %r, i32 0, i32 1, !dbg !32
+  %s1 = getelementptr inbounds %struct.p* %x, i32 0, i32 0, !dbg !32
+  store i64 0, i64* %s1, align 8, !dbg !32
+  %t2 = getelementptr inbounds %struct.p* %x, i32 0, i32 1, !dbg !32
+  store i64 0, i64* %t2, align 8, !dbg !32
+  %y3 = getelementptr inbounds %struct.r* %r, i32 0, i32 2, !dbg !32
+  %2 = bitcast %struct.p* %y3 to i8*, !dbg !32
+  %3 = bitcast %struct.p* %y to i8*, !dbg !32
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 8, i1 false), !dbg !32
+  %4 = bitcast %struct.r* %agg.tmp to i8*, !dbg !33
+  %5 = bitcast %struct.r* %r to i8*, !dbg !33
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 40, i32 8, i1 false), !dbg !33
+  %call4 = call i32 @_Z7call_me1r(%struct.r* byval align 8 %agg.tmp), !dbg !33
+  store i32 %call4, i32* %retval, !dbg !33
+  br label %return, !dbg !33
+
+return:                                           ; preds = %if.end, %if.then
+  %6 = load i32* %retval, !dbg !34
+  ret i32 %6, !dbg !34
+}
+
+declare i32 @_Z5maybev()
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #3
+
+declare i32 @_Z7call_me1r(%struct.r* byval align 8)
+
+attributes #0 = { nounwind }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21, !22}
+!llvm.ident = !{!23}
+
+!0 = !{!"0x11\004\00clang version 3.7.0 \000\00\000\00\001", !1, !2, !3, !16, !2, !2} ; [ DW_TAG_compile_unit ] [/<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !""}
+!2 = !{}
+!3 = !{!4, !10}
+!4 = !{!"0x13\00p\003\00128\0064\000\000\000", !5, null, null, !6, null, null, !"_ZTS1p"} ; [ DW_TAG_structure_type ] [p] [line 3, size 128, align 64, offset 0] [def] [from ]
+!5 = !{!"pr22393.cc", !""}
+!6 = !{!7, !9}
+!7 = !{!"0xd\00s\004\0064\0064\000\000", !5, !"_ZTS1p", !8} ; [ DW_TAG_member ] [s] [line 4, size 64, align 64, offset 0] [from long unsigned int]
+!8 = !{!"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!9 = !{!"0xd\00t\005\0064\0064\0064\000", !5, !"_ZTS1p", !8} ; [ DW_TAG_member ] [t] [line 5, size 64, align 64, offset 64] [from long unsigned int]
+!10 = !{!"0x13\00r\008\00320\0064\000\000\000", !5, null, null, !11, null, null, !"_ZTS1r"} ; [ DW_TAG_structure_type ] [r] [line 8, size 320, align 64, offset 0] [def] [from ]
+!11 = !{!12, !14, !15}
+!12 = !{!"0xd\00i\009\0032\0032\000\000", !5, !"_ZTS1r", !13} ; [ DW_TAG_member ] [i] [line 9, size 32, align 32, offset 0] [from int]
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = !{!"0xd\00x\0010\00128\0064\0064\000", !5, !"_ZTS1r", !"_ZTS1p"} ; [ DW_TAG_member ] [x] [line 10, size 128, align 64, offset 64] [from _ZTS1p]
+!15 = !{!"0xd\00y\0011\00128\0064\00192\000", !5, !"_ZTS1r", !"_ZTS1p"} ; [ DW_TAG_member ] [y] [line 11, size 128, align 64, offset 192] [from _ZTS1p]
+!16 = !{!17}
+!17 = !{!"0x2e\00test\00test\00_Z4testv\0018\000\001\000\000\00256\000\0018", !5, !18, !19, null, i32 ()* @_Z4testv, null, null, !2} ; [ DW_TAG_subprogram ] [line 18] [def] [test]
+!18 = !{!"0x29", !5}                              ; [ DW_TAG_file_type ] [/pr22393.cc]
+!19 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = !{!13}
+!21 = !{i32 2, !"Dwarf Version", i32 4}
+!22 = !{i32 2, !"Debug Info Version", i32 2}
+!23 = !{!"clang version 3.7.0 "}
+!24 = !MDLocation(line: 19, scope: !25)
+!25 = !{!"0xb\0019\000\000", !5, !17}             ; [ DW_TAG_lexical_block ] [/pr22393.cc]
+!26 = !MDLocation(line: 19, scope: !17)
+!27 = !MDLocation(line: 20, scope: !25)
+!28 = !{!"0x100\00y\0021\000", !17, !18, !"_ZTS1p"} ; [ DW_TAG_auto_variable ] [y] [line 21]
+!29 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!30 = !MDLocation(line: 21, scope: !17)
+!31 = !{!"0x100\00r\0022\000", !17, !18, !"_ZTS1r"} ; [ DW_TAG_auto_variable ] [r] [line 22]
+!32 = !MDLocation(line: 22, scope: !17)
+!33 = !MDLocation(line: 23, scope: !17)
+!34 = !MDLocation(line: 24, scope: !17)
diff --git a/test/DebugInfo/X86/sroasplit-5.ll b/test/DebugInfo/X86/sroasplit-5.ll
new file mode 100644
index 0000000..9c05e83
--- /dev/null
+++ b/test/DebugInfo/X86/sroasplit-5.ll
@@ -0,0 +1,91 @@
+; RUN: opt %s -sroa -verify -S -o - | FileCheck %s
+; From:
+; struct prog_src_register {
+;   unsigned : 4;       
+;   int Index : 12 + 1; 
+;   unsigned : 12;      
+;   unsigned : 4;       
+;   int : 12 + 1        
+; } src_reg_for_float() {
+;   struct prog_src_register a;
+;   memset(&a, 0, sizeof(a));
+;   int local = a.Index;
+;   return a;
+; }
+; ModuleID = 'pr22495.c'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; When SROA is creating new smaller allocas, it may add padding.
+;
+; There should be no debug info for the padding.
+; CHECK-NOT: DW_OP_bit_piece offset=56
+; CHECK: [ DW_TAG_expression ] [DW_OP_bit_piece offset=32, size=24]
+; CHECK-NOT: DW_OP_bit_piece offset=56
+; CHECK: [ DW_TAG_expression ] [DW_OP_bit_piece offset=0, size=32]
+; CHECK-NOT: DW_OP_bit_piece offset=56
+%struct.prog_src_register = type { i32, i24 }
+
+; Function Attrs: nounwind
+define i64 @src_reg_for_float() #0 {
+entry:
+  %retval = alloca %struct.prog_src_register, align 4
+  %a = alloca %struct.prog_src_register, align 4
+  %local = alloca i32, align 4
+  call void @llvm.dbg.declare(metadata %struct.prog_src_register* %a, metadata !16, metadata !17), !dbg !18
+  %0 = bitcast %struct.prog_src_register* %a to i8*, !dbg !19
+  call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 8, i32 4, i1 false), !dbg !19
+  call void @llvm.dbg.declare(metadata i32* %local, metadata !20, metadata !17), !dbg !21
+  %1 = bitcast %struct.prog_src_register* %a to i32*, !dbg !21
+  %bf.load = load i32* %1, align 4, !dbg !21
+  %bf.shl = shl i32 %bf.load, 15, !dbg !21
+  %bf.ashr = ashr i32 %bf.shl, 19, !dbg !21
+  store i32 %bf.ashr, i32* %local, align 4, !dbg !21
+  %2 = bitcast %struct.prog_src_register* %retval to i8*, !dbg !22
+  %3 = bitcast %struct.prog_src_register* %a to i8*, !dbg !22
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 8, i32 4, i1 false), !dbg !22
+  %4 = bitcast %struct.prog_src_register* %retval to i64*, !dbg !22
+  %5 = load i64* %4, align 1, !dbg !22
+  ret i64 %5, !dbg !22
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #2
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!13, !14}
+!llvm.ident = !{!15}
+
+!0 = !{!"0x11\0012\00clang version 3.7.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/<stdin>] [DW_LANG_C99]
+!1 = !{!"<stdin>", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00src_reg_for_float\00src_reg_for_float\00\007\000\001\000\000\000\000\007", !5, !6, !7, null, i64 ()* @src_reg_for_float, null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [src_reg_for_float]
+!5 = !{!"pr22495.c", !""}
+!6 = !{!"0x29", !5}                               ; [ DW_TAG_file_type ] [/pr22495.c]
+!7 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x13\00prog_src_register\001\0064\0032\000\000\000", !5, null, null, !10, null, null, null} ; [ DW_TAG_structure_type ] [prog_src_register] [line 1, size 64, align 32, offset 0] [def] [from ]
+!10 = !{!11}
+!11 = !{!"0xd\00Index\003\0013\0032\004\000", !5, !9, !12} ; [ DW_TAG_member ] [Index] [line 3, size 13, align 32, offset 4] [from int]
+!12 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = !{i32 2, !"Dwarf Version", i32 4}
+!14 = !{i32 2, !"Debug Info Version", i32 2}
+!15 = !{!"clang version 3.7.0 "}
+!16 = !{!"0x100\00a\008\000", !4, !6, !9}         ; [ DW_TAG_auto_variable ] [a] [line 8]
+!17 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!18 = !MDLocation(line: 8, scope: !4)
+!19 = !MDLocation(line: 9, scope: !4)
+!20 = !{!"0x100\00local\0010\000", !4, !6, !12}   ; [ DW_TAG_auto_variable ] [local] [line 10]
+!21 = !MDLocation(line: 10, scope: !4)
+!22 = !MDLocation(line: 11, scope: !4)
diff --git a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
index c98ef28..0b2c50e 100644
--- a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
+++ b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
@@ -60,7 +60,7 @@ define i32 @test(i32 %a) nounwind uwtable ssp {
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !15, metadata !{!"0x102"}), !dbg !16
   %0 = load i32* %a.addr, align 4, !dbg !17
   %call = call i32 @fn(i32 %0), !dbg !17
   ret i32 %call, !dbg !17
@@ -72,33 +72,33 @@ define i32 @fn(i32 %a) nounwind uwtable ssp {
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !19, metadata !{!"0x102"}), !dbg !20
   %0 = load i32* %a.addr, align 4, !dbg !21
   ret i32 %0, !dbg !21
 }
 
 !llvm.dbg.cu = !{!0, !10}
 !llvm.module.flags = !{!25}
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.3\000\00\000\00\001", metadata !23, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00test\00test\00\002\000\001\000\006\00256\000\003", metadata !23, metadata !6, metadata !7, null, i32 (i32)* @test, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [test]
-!6 = metadata !{metadata !"0x29", metadata !23} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 172862)\000\00\000\00\001", metadata !24, metadata !1, metadata !1, metadata !11, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
-!11 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x2e\00fn\00fn\00\001\000\001\000\006\00256\000\001", metadata !24, metadata !14, metadata !7, null, i32 (i32)* @fn, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 1] [def] [fn]
-!14 = metadata !{metadata !"0x29", metadata !24} ; [ DW_TAG_file_type ]
-!15 = metadata !{metadata !"0x101\00a\0016777218\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [a] [line 2]
-!16 = metadata !{i32 2, i32 0, metadata !5, null}
-!17 = metadata !{i32 4, i32 0, metadata !18, null}
-!18 = metadata !{metadata !"0xb\003\000\000", metadata !23, metadata !5} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !13, metadata !14, metadata !9} ; [ DW_TAG_arg_variable ] [a] [line 1]
-!20 = metadata !{i32 1, i32 0, metadata !13, null}
-!21 = metadata !{i32 2, i32 0, metadata !22, null}
-!22 = metadata !{metadata !"0xb\001\000\000", metadata !24, metadata !13} ; [ DW_TAG_lexical_block ]
-!23 = metadata !{metadata !"simple.c", metadata !"/private/tmp"}
-!24 = metadata !{metadata !"simple2.c", metadata !"/private/tmp"}
-!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.3\000\00\000\00\001", !23, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00test\00test\00\002\000\001\000\006\00256\000\003", !23, !6, !7, null, i32 (i32)* @test, null, null, !1} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [test]
+!6 = !{!"0x29", !23} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x11\0012\00clang version 3.3 (trunk 172862)\000\00\000\00\001", !24, !1, !1, !11, !1,  !1} ; [ DW_TAG_compile_unit ]
+!11 = !{!13}
+!13 = !{!"0x2e\00fn\00fn\00\001\000\001\000\006\00256\000\001", !24, !14, !7, null, i32 (i32)* @fn, null, null, !1} ; [ DW_TAG_subprogram ] [line 1] [def] [fn]
+!14 = !{!"0x29", !24} ; [ DW_TAG_file_type ]
+!15 = !{!"0x101\00a\0016777218\000", !5, !6, !9} ; [ DW_TAG_arg_variable ] [a] [line 2]
+!16 = !MDLocation(line: 2, scope: !5)
+!17 = !MDLocation(line: 4, scope: !18)
+!18 = !{!"0xb\003\000\000", !23, !5} ; [ DW_TAG_lexical_block ]
+!19 = !{!"0x101\00a\0016777217\000", !13, !14, !9} ; [ DW_TAG_arg_variable ] [a] [line 1]
+!20 = !MDLocation(line: 1, scope: !13)
+!21 = !MDLocation(line: 2, scope: !22)
+!22 = !{!"0xb\001\000\000", !24, !13} ; [ DW_TAG_lexical_block ]
+!23 = !{!"simple.c", !"/private/tmp"}
+!24 = !{!"simple2.c", !"/private/tmp"}
+!25 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/stmt-list.ll b/test/DebugInfo/X86/stmt-list.ll
index 2bf4339..abd9006 100644
--- a/test/DebugInfo/X86/stmt-list.ll
+++ b/test/DebugInfo/X86/stmt-list.ll
@@ -12,12 +12,12 @@ entry:
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!7}
-!5 = metadata !{metadata !0}
+!5 = !{!0}
 
-!0 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\001\001", metadata !6, metadata !1, metadata !3, null, void ()* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!1 = metadata !{metadata !"0x29", metadata !6} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", metadata !6, metadata !4, metadata !4, metadata !5, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !6, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!6 = metadata !{metadata !"test2.c", metadata !"/home/espindola/llvm"}
-!7 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00f\00f\00\001\000\001\000\006\00256\001\001", !6, !1, !3, null, void ()* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!1 = !{!"0x29", !6} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", !6, !4, !4, !5, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !6, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!6 = !{!"test2.c", !"/home/espindola/llvm"}
+!7 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/stringpool.ll b/test/DebugInfo/X86/stringpool.ll
index 9ff4a2a..3d5da31 100644
--- a/test/DebugInfo/X86/stringpool.ll
+++ b/test/DebugInfo/X86/stringpool.ll
@@ -6,13 +6,13 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 143009)\001\00\000\00\000", metadata !8, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x34\00yyyy\00yyyy\00\001\000\001", null, metadata !6, metadata !7, i32* @yyyy, null} ; [ DW_TAG_variable ]
-!6 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"z.c", metadata !"/home/nicholas"}
+!0 = !{!"0x11\0012\00clang version 3.1 (trunk 143009)\001\00\000\00\000", !8, !1, !1, !1, !3,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x34\00yyyy\00yyyy\00\001\000\001", null, !6, !7, i32* @yyyy, null} ; [ DW_TAG_variable ]
+!6 = !{!"0x29", !8} ; [ DW_TAG_file_type ]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!8 = !{!"z.c", !"/home/nicholas"}
 
 ; Verify that "yyyy" ended up in the stringpool.
 ; LINUX: .section .debug_str,"MS",@progbits,1
@@ -40,4 +40,4 @@
 ; DARWIN-NEXT:        .byte   9                       ## DW_AT_location
 ; DARWIN-NEXT:        .byte   3
 ; DARWIN-NEXT:        .quad   _yyyy
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/struct-loc.ll b/test/DebugInfo/X86/struct-loc.ll
index 4ce04a7..b2b7e64 100644
--- a/test/DebugInfo/X86/struct-loc.ll
+++ b/test/DebugInfo/X86/struct-loc.ll
@@ -14,14 +14,14 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 152837) (llvm/trunk 152845)\000\00\000\00\000", metadata !11, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x34\00f\00f\00\005\000\001", null, metadata !6, metadata !7, %struct.foo* @f, null} ; [ DW_TAG_variable ]
-!6 = metadata !{metadata !"0x29", metadata !11} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x13\00foo\001\0032\0032\000\000\000", metadata !11, null, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 32, align 32, offset 0] [def] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !11, metadata !7, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!11 = metadata !{metadata !"struct_bug.c", metadata !"/Users/echristo/tmp"}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.1 (trunk 152837) (llvm/trunk 152845)\000\00\000\00\000", !11, !1, !1, !1, !3,  !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x34\00f\00f\00\005\000\001", null, !6, !7, %struct.foo* @f, null} ; [ DW_TAG_variable ]
+!6 = !{!"0x29", !11} ; [ DW_TAG_file_type ]
+!7 = !{!"0x13\00foo\001\0032\0032\000\000\000", !11, null, null, !8, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 32, align 32, offset 0] [def] [from ]
+!8 = !{!9}
+!9 = !{!"0xd\00a\002\0032\0032\000\000", !11, !7, !10} ; [ DW_TAG_member ]
+!10 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!11 = !{!"struct_bug.c", !"/Users/echristo/tmp"}
+!12 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/subrange-type.ll b/test/DebugInfo/X86/subrange-type.ll
index 035e50b..65d19f2 100644
--- a/test/DebugInfo/X86/subrange-type.ll
+++ b/test/DebugInfo/X86/subrange-type.ll
@@ -12,7 +12,7 @@ entry:
   %retval = alloca i32, align 4
   %i = alloca [2 x i32], align 4
   store i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{[2 x i32]* %i}, metadata !10, metadata !{metadata !"0x102"}), !dbg !15
+  call void @llvm.dbg.declare(metadata [2 x i32]* %i, metadata !10, metadata !{!"0x102"}), !dbg !15
   ret i32 0, !dbg !16
 }
 
@@ -21,20 +21,20 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 171472) (llvm/trunk 171487)\000\00\000\00\000", metadata !17, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00main\00main\00\002\000\001\000\006\00256\000\003", metadata !6, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [main]
-!6 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x100\00i\004\000", metadata !11, metadata !6, metadata !12} ; [ DW_TAG_auto_variable ] [i] [line 4]
-!11 = metadata !{metadata !"0xb\003\000\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/foo.c]
-!12 = metadata !{metadata !"0x1\00\000\0064\0032\000\000", null, null, metadata !9, metadata !13, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from int]
-!13 = metadata !{metadata !14}
-!14 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
-!15 = metadata !{i32 4, i32 0, metadata !11, null}
-!16 = metadata !{i32 6, i32 0, metadata !11, null}
-!17 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.3 (trunk 171472) (llvm/trunk 171487)\000\00\000\00\000", !17, !1, !1, !3, !1,  !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00main\00main\00\002\000\001\000\006\00256\000\003", !6, !6, !7, null, i32 ()* @main, null, null, !1} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [main]
+!6 = !{!"0x29", !17} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x100\00i\004\000", !11, !6, !12} ; [ DW_TAG_auto_variable ] [i] [line 4]
+!11 = !{!"0xb\003\000\000", !6, !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/foo.c]
+!12 = !{!"0x1\00\000\0064\0032\000\000", null, null, !9, !13, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from int]
+!13 = !{!14}
+!14 = !{!"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
+!15 = !MDLocation(line: 4, scope: !11)
+!16 = !MDLocation(line: 6, scope: !11)
+!17 = !{!"foo.c", !"/usr/local/google/home/echristo/tmp"}
+!18 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/subreg.ll b/test/DebugInfo/X86/subreg.ll
index a9665cb..2a1de49 100644
--- a/test/DebugInfo/X86/subreg.ll
+++ b/test/DebugInfo/X86/subreg.ll
@@ -3,14 +3,13 @@
 ; We are testing that a value in a 16 bit register gets reported as
 ; being in its superregister.
 
-; CHECK: .byte   80                      # super-register
-; CHECK-NEXT:                            # DW_OP_reg0
+; CHECK: .byte   80                      # super-register DW_OP_reg0
 ; CHECK-NEXT: .byte   147                # DW_OP_piece
 ; CHECK-NEXT: .byte   2                  # 2
 
 define i16 @f(i16 signext %zzz) nounwind {
 entry:
-  call void @llvm.dbg.value(metadata !{i16 %zzz}, i64 0, metadata !0, metadata !{metadata !"0x102"})
+  call void @llvm.dbg.value(metadata i16 %zzz, i64 0, metadata !0, metadata !{!"0x102"})
   %conv = sext i16 %zzz to i32, !dbg !7
   %conv1 = trunc i32 %conv to i16
   ret i16 %conv1
@@ -20,16 +19,16 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!11}
-!9 = metadata !{metadata !1}
+!9 = !{!1}
 
-!0 = metadata !{metadata !"0x101\00zzz\0016777219\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", metadata !10, metadata !2, metadata !4, null, i16 (i16)* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!2 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\0012\00clang version 3.0 ()\000\00\000\00\001", metadata !10, metadata !5, metadata !5, metadata !9, null,  null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{null}
-!6 = metadata !{metadata !"0x24\00short\000\0016\0016\000\000\005", null, metadata !3} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 4, i32 22, metadata !8, null}
-!8 = metadata !{metadata !"0xb\003\0019\000", metadata !10, metadata !1} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build"}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00zzz\0016777219\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", !10, !2, !4, null, i16 (i16)* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!2 = !{!"0x29", !10} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\0012\00clang version 3.0 ()\000\00\000\00\001", !10, !5, !5, !9, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !10, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{null}
+!6 = !{!"0x24\00short\000\0016\0016\000\000\005", null, !3} ; [ DW_TAG_base_type ]
+!7 = !MDLocation(line: 4, column: 22, scope: !8)
+!8 = !{!"0xb\003\0019\000", !10, !1} ; [ DW_TAG_lexical_block ]
+!10 = !{!"/home/espindola/llvm/test.c", !"/home/espindola/tmpfs/build"}
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/subregisters.ll b/test/DebugInfo/X86/subregisters.ll
index dfad9c8..8e912ad 100644
--- a/test/DebugInfo/X86/subregisters.ll
+++ b/test/DebugInfo/X86/subregisters.ll
@@ -40,10 +40,10 @@ target triple = "x86_64-apple-macosx10.9.0"
 ; Function Attrs: noinline nounwind ssp uwtable
 define void @doSomething(%struct.bar* nocapture readonly %b) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.bar* %b}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !25
+  tail call void @llvm.dbg.value(metadata %struct.bar* %b, i64 0, metadata !15, metadata !{!"0x102"}), !dbg !25
   %a1 = getelementptr inbounds %struct.bar* %b, i64 0, i32 0, !dbg !26
   %0 = load i32* %a1, align 4, !dbg !26, !tbaa !27
-  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !26
   %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %0) #4, !dbg !32
   ret void, !dbg !33
 }
@@ -59,7 +59,7 @@ define i32 @main() #3 {
 entry:
   %myBar = alloca i64, align 8, !dbg !34
   %tmpcast = bitcast i64* %myBar to %struct.bar*, !dbg !34
-  tail call void @llvm.dbg.declare(metadata !{%struct.bar* %tmpcast}, metadata !21, metadata !{metadata !"0x102"}), !dbg !34
+  tail call void @llvm.dbg.declare(metadata %struct.bar* %tmpcast, metadata !21, metadata !{!"0x102"}), !dbg !34
   store i64 17179869187, i64* %myBar, align 8, !dbg !34
   call void @doSomething(%struct.bar* %tmpcast), !dbg !35
   ret i32 0, !dbg !36
@@ -78,40 +78,40 @@ attributes #4 = { nounwind }
 !llvm.module.flags = !{!22, !23}
 !llvm.ident = !{!24}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [subregisters.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"subregisters.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !17}
-!4 = metadata !{metadata !"0x2e\00doSomething\00doSomething\00\0010\000\001\000\006\00256\001\0011", metadata !1, metadata !5, metadata !6, null, void (%struct.bar*)* @doSomething, null, null, metadata !14} ; [ DW_TAG_subprogram ] [line 10] [def] [scope 11] [doSomething]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [subregisters.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8}
-!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from bar]
-!9 = metadata !{metadata !"0x13\00bar\003\0064\0032\000\000\000", metadata !1, null, null, metadata !10, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 3, size 64, align 32, offset 0] [def] [from ]
-!10 = metadata !{metadata !11, metadata !13}
-!11 = metadata !{metadata !"0xd\00a\004\0032\0032\000\000", metadata !1, metadata !9, metadata !12} ; [ DW_TAG_member ] [a] [line 4, size 32, align 32, offset 0] [from int]
-!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{metadata !"0xd\00b\005\0032\0032\0032\000", metadata !1, metadata !9, metadata !12} ; [ DW_TAG_member ] [b] [line 5, size 32, align 32, offset 32] [from int]
-!14 = metadata !{metadata !15, metadata !16}
-!15 = metadata !{metadata !"0x101\00b\0016777226\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [b] [line 10]
-!16 = metadata !{metadata !"0x100\00a\0012\000", metadata !4, metadata !5, metadata !12} ; [ DW_TAG_auto_variable ] [a] [line 12]
-!17 = metadata !{metadata !"0x2e\00main\00main\00\0016\000\001\000\006\000\001\0017", metadata !1, metadata !5, metadata !18, null, i32 ()* @main, null, null, metadata !20} ; [ DW_TAG_subprogram ] [line 16] [def] [scope 17] [main]
-!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!19 = metadata !{metadata !12}
-!20 = metadata !{metadata !21}
-!21 = metadata !{metadata !"0x100\00myBar\0018\000", metadata !17, metadata !5, metadata !9} ; [ DW_TAG_auto_variable ] [myBar] [line 18]
-!22 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!24 = metadata !{metadata !"clang version 3.5 "}
-!25 = metadata !{i32 10, i32 0, metadata !4, null}
-!26 = metadata !{i32 12, i32 0, metadata !4, null}
-!27 = metadata !{metadata !28, metadata !29, i64 0}
-!28 = metadata !{metadata !"bar", metadata !29, i64 0, metadata !29, i64 4}
-!29 = metadata !{metadata !"int", metadata !30, i64 0}
-!30 = metadata !{metadata !"omnipotent char", metadata !31, i64 0}
-!31 = metadata !{metadata !"Simple C/C++ TBAA"}
-!32 = metadata !{i32 13, i32 0, metadata !4, null}
-!33 = metadata !{i32 14, i32 0, metadata !4, null}
-!34 = metadata !{i32 18, i32 0, metadata !17, null}
-!35 = metadata !{i32 19, i32 0, metadata !17, null}
-!36 = metadata !{i32 20, i32 0, metadata !17, null}
+!0 = !{!"0x11\0012\00clang version 3.5 \001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [subregisters.c] [DW_LANG_C99]
+!1 = !{!"subregisters.c", !""}
+!2 = !{}
+!3 = !{!4, !17}
+!4 = !{!"0x2e\00doSomething\00doSomething\00\0010\000\001\000\006\00256\001\0011", !1, !5, !6, null, void (%struct.bar*)* @doSomething, null, null, !14} ; [ DW_TAG_subprogram ] [line 10] [def] [scope 11] [doSomething]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [subregisters.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8}
+!8 = !{!"0xf\00\000\0064\0064\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from bar]
+!9 = !{!"0x13\00bar\003\0064\0032\000\000\000", !1, null, null, !10, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 3, size 64, align 32, offset 0] [def] [from ]
+!10 = !{!11, !13}
+!11 = !{!"0xd\00a\004\0032\0032\000\000", !1, !9, !12} ; [ DW_TAG_member ] [a] [line 4, size 32, align 32, offset 0] [from int]
+!12 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = !{!"0xd\00b\005\0032\0032\0032\000", !1, !9, !12} ; [ DW_TAG_member ] [b] [line 5, size 32, align 32, offset 32] [from int]
+!14 = !{!15, !16}
+!15 = !{!"0x101\00b\0016777226\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [b] [line 10]
+!16 = !{!"0x100\00a\0012\000", !4, !5, !12} ; [ DW_TAG_auto_variable ] [a] [line 12]
+!17 = !{!"0x2e\00main\00main\00\0016\000\001\000\006\000\001\0017", !1, !5, !18, null, i32 ()* @main, null, null, !20} ; [ DW_TAG_subprogram ] [line 16] [def] [scope 17] [main]
+!18 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = !{!12}
+!20 = !{!21}
+!21 = !{!"0x100\00myBar\0018\000", !17, !5, !9} ; [ DW_TAG_auto_variable ] [myBar] [line 18]
+!22 = !{i32 2, !"Dwarf Version", i32 2}
+!23 = !{i32 1, !"Debug Info Version", i32 2}
+!24 = !{!"clang version 3.5 "}
+!25 = !MDLocation(line: 10, scope: !4)
+!26 = !MDLocation(line: 12, scope: !4)
+!27 = !{!28, !29, i64 0}
+!28 = !{!"bar", !29, i64 0, !29, i64 4}
+!29 = !{!"int", !30, i64 0}
+!30 = !{!"omnipotent char", !31, i64 0}
+!31 = !{!"Simple C/C++ TBAA"}
+!32 = !MDLocation(line: 13, scope: !4)
+!33 = !MDLocation(line: 14, scope: !4)
+!34 = !MDLocation(line: 18, scope: !17)
+!35 = !MDLocation(line: 19, scope: !17)
+!36 = !MDLocation(line: 20, scope: !17)
diff --git a/test/DebugInfo/X86/template.ll b/test/DebugInfo/X86/template.ll
index 9652973..5125b09 100644
--- a/test/DebugInfo/X86/template.ll
+++ b/test/DebugInfo/X86/template.ll
@@ -1,12 +1,11 @@
 ; REQUIRES: object-emission
 
-; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s > %t
-; RUN: llvm-dwarfdump %t | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
 
 ; IR generated with `clang++ -g -emit-llvm -S` from the following code:
-; template<int x, int*, template<typename> class y, int ...z>  int func() { return 3; }
+; template<int x, int*, template<typename> class y, decltype(nullptr) n, int ...z>  int func() { return 3; }
 ; template<typename> struct y_impl { struct nested { }; };
-; int glbl = func<3, &glbl, y_impl, 1, 2>();
+; int glbl = func<3, &glbl, y_impl, nullptr, 1, 2>();
 ; y_impl<int>::nested n;
 
 ; CHECK: [[INT:0x[0-9a-f]*]]:{{ *}}DW_TAG_base_type
@@ -17,16 +16,11 @@
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_template_type_parameter
 
-; CHECK: DW_AT_name{{.*}}"func<3, &glbl, y_impl, 1, 2>"
+; CHECK: DW_AT_name{{.*}}"func<3, &glbl, y_impl, nullptr, 1, 2>"
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_template_value_parameter
 ; CHECK-NEXT: DW_AT_type{{.*}}=> {[[INT]]}
 ; CHECK-NEXT: DW_AT_name{{.*}}= "x"
-
-; This could be made shorter by encoding it as _sdata rather than data4, or
-; even as data1. DWARF strongly urges implementations to prefer 
-; _sdata/_udata rather than dataN
-
 ; CHECK-NEXT: DW_AT_const_value [DW_FORM_sdata]{{.*}}(3)
 
 ; CHECK: DW_TAG_template_value_parameter
@@ -43,6 +37,11 @@
 ; CHECK-NEXT: DW_AT_GNU_template_name{{.*}}= "y_impl"
 ; CHECK-NOT: NULL
 
+; CHECK: DW_TAG_template_value_parameter
+; CHECK-NEXT: DW_AT_type{{.*}}=> {[[NULLPTR:0x[0-9a-f]*]]}
+; CHECK-NEXT: DW_AT_name{{.*}}= "n"
+; CHECK-NEXT: DW_AT_const_value [DW_FORM_udata]{{.*}}(0)
+
 ; CHECK: DW_TAG_GNU_template_parameter_pack
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_template_value_parameter
@@ -56,71 +55,76 @@
 ; CHECK: [[INTPTR]]:{{ *}}DW_TAG_pointer_type
 ; CHECK-NEXT: DW_AT_type{{.*}} => {[[INT]]}
 
+; CHECK: [[NULLPTR]]:{{ *}}DW_TAG_unspecified_type
+; CHECK-NEXT: DW_AT_name{{.*}}= "decltype(nullptr)"
+
 %"struct.y_impl<int>::nested" = type { i8 }
 
 @glbl = global i32 0, align 4
 @n = global %"struct.y_impl<int>::nested" zeroinitializer, align 1
-@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_template.cpp, i8* null }]
 
 define internal void @__cxx_global_var_init() section ".text.startup" {
 entry:
-  %call = call i32 @_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv(), !dbg !33
-  store i32 %call, i32* @glbl, align 4, !dbg !33
-  ret void, !dbg !33
+  %call = call i32 @_Z4funcILi3EXadL_Z4glblEE6y_implLDn0EJLi1ELi2EEEiv(), !dbg !36
+  store i32 %call, i32* @glbl, align 4, !dbg !36
+  ret void, !dbg !36
 }
 
 ; Function Attrs: nounwind uwtable
-define linkonce_odr i32 @_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv() #0 {
+define linkonce_odr i32 @_Z4funcILi3EXadL_Z4glblEE6y_implLDn0EJLi1ELi2EEEiv() #0 {
 entry:
-  ret i32 3, !dbg !34
+  ret i32 3, !dbg !37
 }
 
-define internal void @_GLOBAL__I_a() section ".text.startup" {
+define internal void @_GLOBAL__sub_I_template.cpp() section ".text.startup" {
 entry:
-  call void @__cxx_global_var_init(), !dbg !35
-  ret void, !dbg !35
+  call void @__cxx_global_var_init(), !dbg !38
+  ret void
 }
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!31, !36}
-!llvm.ident = !{!32}
-
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 192849) (llvm/trunk 192850)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !9, metadata !28, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"bar.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{metadata !"0x13\00y_impl<int>\002\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !5, metadata !"_ZTS6y_implIiE"} ; [ DW_TAG_structure_type ] [y_impl<int>] [line 2, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x2f\00\000\000", null, metadata !7, null} ; [ DW_TAG_template_type_parameter ]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{metadata !"0x13\00nested\002\008\008\000\000\000", metadata !1, metadata !"_ZTS6y_implIiE", null, metadata !2, null, null, metadata !"_ZTSN6y_implIiE6nestedE"} ; [ DW_TAG_structure_type ] [nested] [line 2, size 8, align 8, offset 0] [def] [from ]
-!9 = metadata !{metadata !10, metadata !14, metadata !26}
-!10 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\003\001\001\000\006\00256\000\003", metadata !1, metadata !11, metadata !12, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [local] [def] [__cxx_global_var_init]
-!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/bar.cpp]
-!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!13 = metadata !{null}
-!14 = metadata !{metadata !"0x2e\00func<3, &glbl, y_impl, 1, 2>\00func<3, &glbl, y_impl, 1, 2>\00_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv\001\000\001\000\006\00256\000\001", metadata !1, metadata !11, metadata !15, null, i32 ()* @_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv, metadata !17, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func<3, &glbl, y_impl, 1, 2>]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{metadata !7}
-!17 = metadata !{metadata !18, metadata !19, metadata !21, metadata !22}
-!18 = metadata !{metadata !"0x30\00x\000\000", null, metadata !7, i32 3, null} ; [ DW_TAG_template_value_parameter ]
-!19 = metadata !{metadata !"0x30\00\000\000", null, metadata !20, i32* @glbl, null} ; [ DW_TAG_template_value_parameter ]
-!20 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!21 = metadata !{metadata !"0x4106\00y\000\000", null, null, metadata !"y_impl", null} ; [ DW_TAG_GNU_template_template_param ]
-!22 = metadata !{metadata !"0x4107\00z\000\000", null, null, metadata !23, null} ; [ DW_TAG_GNU_template_parameter_pack ]
-!23 = metadata !{metadata !24, metadata !25}
-!24 = metadata !{metadata !"0x30\00\000\000", null, metadata !7, i32 1, null} ; [ DW_TAG_template_value_parameter ]
-!25 = metadata !{metadata !"0x30\00\000\000", null, metadata !7, i32 2, null} ; [ DW_TAG_template_value_parameter ]
-!26 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__I_a\001\001\001\000\006\0064\000\001", metadata !1, metadata !11, metadata !27, null, void ()* @_GLOBAL__I_a, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [local] [def]
-!27 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!28 = metadata !{metadata !29, metadata !30}
-!29 = metadata !{metadata !"0x34\00glbl\00glbl\00\003\000\001", null, metadata !11, metadata !7, i32* @glbl, null} ; [ DW_TAG_variable ] [glbl] [line 3] [def]
-!30 = metadata !{metadata !"0x34\00n\00n\00\004\000\001", null, metadata !11, metadata !8, %"struct.y_impl<int>::nested"* @n, null} ; [ DW_TAG_variable ] [n] [line 4] [def]
-!31 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!32 = metadata !{metadata !"clang version 3.4 (trunk 192849) (llvm/trunk 192850)"}
-!33 = metadata !{i32 3, i32 0, metadata !10, null}
-!34 = metadata !{i32 1, i32 0, metadata !14, null}
-!35 = metadata !{i32 1, i32 0, metadata !26, null}
-!36 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!llvm.module.flags = !{!33, !34}
+!llvm.ident = !{!35}
+
+!0 = !{!"0x11\004\00clang version 3.6.0 (trunk 224394) (llvm/trunk 224384)\000\00\000\00\001", !1, !2, !3, !9, !30, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/template.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"template.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !8}
+!4 = !{!"0x13\00y_impl<int>\002\008\008\000\000\000", !1, null, null, !2, null, !5, !"_ZTS6y_implIiE"} ; [ DW_TAG_structure_type ] [y_impl<int>] [line 2, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0x2f\00\000\000", null, !7, null}        ; [ DW_TAG_template_type_parameter ]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = !{!"0x13\00nested\002\008\008\000\000\000", !1, !"_ZTS6y_implIiE", null, !2, null, null, !"_ZTSN6y_implIiE6nestedE"} ; [ DW_TAG_structure_type ] [nested] [line 2, size 8, align 8, offset 0] [def] [from ]
+!9 = !{!10, !14, !28}
+!10 = !{!"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\003\001\001\000\000\00256\000\003", !1, !11, !12, null, void ()* @__cxx_global_var_init, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [local] [def] [__cxx_global_var_init]
+!11 = !{!"0x29", !1}                              ; [ DW_TAG_file_type ] [/tmp/dbginfo/template.cpp]
+!12 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = !{null}
+!14 = !{!"0x2e\00func<3, &glbl, y_impl, nullptr, 1, 2>\00func<3, &glbl, y_impl, nullptr, 1, 2>\00_Z4funcILi3EXadL_Z4glblEE6y_implLDn0EJLi1ELi2EEEiv\001\000\001\000\000\00256\000\001", !1, !11, !15, null, i32 ()* @_Z4funcILi3EXadL_Z4glblEE6y_implLDn0EJLi1ELi2EEEiv, !17, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func<3, &glbl, y_impl, nullptr, 1, 2>]
+!15 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{!7}
+!17 = !{!18, !19, !21, !22, !24}
+!18 = !{!"0x30\00x\000\000", null, !7, i32 3, null} ; [ DW_TAG_template_value_parameter ]
+!19 = !{!"0x30\00\000\000", null, !20, i32* @glbl, null} ; [ DW_TAG_template_value_parameter ]
+!20 = !{!"0xf\00\000\0064\0064\000\000", null, null, !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!21 = !{!"0x4106\00y\000\000", null, null, !"y_impl", null} ; [ DW_TAG_GNU_template_template_param ]
+!22 = !{!"0x30\00n\000\000", null, !23, i8 0, null} ; [ DW_TAG_template_value_parameter ]
+!23 = !{!"0x3b\00decltype(nullptr)\000\000\000\000\000\000", null, null} ; [ DW_TAG_unspecified_type ] [decltype(nullptr)] [line 0, size 0, align 0, offset 0]
+!24 = !{!"0x4107\00z\000\000", null, null, !25, null} ; [ DW_TAG_GNU_template_parameter_pack ]
+!25 = !{!26, !27}
+!26 = !{!"0x30\00\000\000", null, !7, i32 1, null} ; [ DW_TAG_template_value_parameter ]
+!27 = !{!"0x30\00\000\000", null, !7, i32 2, null} ; [ DW_TAG_template_value_parameter ]
+!28 = !{!"0x2e\00\00\00_GLOBAL__sub_I_template.cpp\000\001\001\000\000\0064\000\000", !1, !11, !29, null, void ()* @_GLOBAL__sub_I_template.cpp, null, null, !2} ; [ DW_TAG_subprogram ] [line 0] [local] [def]
+!29 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = !{!31, !32}
+!31 = !{!"0x34\00glbl\00glbl\00\003\000\001", null, !11, !7, i32* @glbl, null} ; [ DW_TAG_variable ] [glbl] [line 3] [def]
+!32 = !{!"0x34\00n\00n\00\004\000\001", null, !11, !"_ZTSN6y_implIiE6nestedE", %"struct.y_impl<int>::nested"* @n, null} ; [ DW_TAG_variable ] [n] [line 4] [def]
+!33 = !{i32 2, !"Dwarf Version", i32 4}
+!34 = !{i32 2, !"Debug Info Version", i32 2}
+!35 = !{!"clang version 3.6.0 (trunk 224394) (llvm/trunk 224384)"}
+!36 = !MDLocation(line: 3, column: 12, scope: !10)
+!37 = !MDLocation(line: 1, column: 96, scope: !14)
+!38 = !MDLocation(line: 0, scope: !28)
diff --git a/test/DebugInfo/X86/tls.ll b/test/DebugInfo/X86/tls.ll
index cb71797..6f673dd 100644
--- a/test/DebugInfo/X86/tls.ll
+++ b/test/DebugInfo/X86/tls.ll
@@ -81,22 +81,22 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.module.flags = !{!15, !16}
 !llvm.ident = !{!17}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00-.dwo\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !12, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/tls.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"tls.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00func<&glbl>\00func<&glbl>\00_Z4funcIXadL_Z4glblEEEiv\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, i32 ()* @_Z4funcIXadL_Z4glblEEEiv, metadata !9, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [func<&glbl>]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/tls.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x30\00I\000\000", null, metadata !11, i32* @glbl, null} ; [ DW_TAG_template_value_parameter ]
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!12 = metadata !{metadata !13, metadata !14}
-!13 = metadata !{metadata !"0x34\00tls\00tls\00\001\000\001", null, metadata !5, metadata !8, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
-!14 = metadata !{metadata !"0x34\00glbl\00glbl\00\002\000\001", null, metadata !5, metadata !8, i32* @glbl, null} ; [ DW_TAG_variable ] [glbl] [line 2] [def]
-!15 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!17 = metadata !{metadata !"clang version 3.5 "}
-!18 = metadata !{i32 6, i32 0, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5 \000\00\000\00-.dwo\000", !1, !2, !2, !3, !12, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/tls.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"tls.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00func<&glbl>\00func<&glbl>\00_Z4funcIXadL_Z4glblEEEiv\005\000\001\000\006\00256\000\005", !1, !5, !6, null, i32 ()* @_Z4funcIXadL_Z4glblEEEiv, !9, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [func<&glbl>]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/tls.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10}
+!10 = !{!"0x30\00I\000\000", null, !11, i32* @glbl, null} ; [ DW_TAG_template_value_parameter ]
+!11 = !{!"0xf\00\000\0064\0064\000\000", null, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!12 = !{!13, !14}
+!13 = !{!"0x34\00tls\00tls\00\001\000\001", null, !5, !8, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
+!14 = !{!"0x34\00glbl\00glbl\00\002\000\001", null, !5, !8, i32* @glbl, null} ; [ DW_TAG_variable ] [glbl] [line 2] [def]
+!15 = !{i32 2, !"Dwarf Version", i32 4}
+!16 = !{i32 1, !"Debug Info Version", i32 2}
+!17 = !{!"clang version 3.5 "}
+!18 = !MDLocation(line: 6, scope: !4)
diff --git a/test/DebugInfo/X86/type_units_with_addresses.ll b/test/DebugInfo/X86/type_units_with_addresses.ll
index de7e717..0cd5439 100644
--- a/test/DebugInfo/X86/type_units_with_addresses.ll
+++ b/test/DebugInfo/X86/type_units_with_addresses.ll
@@ -112,40 +112,40 @@
 !llvm.module.flags = !{!34, !35}
 !llvm.ident = !{!36}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00tu.dwo\001", metadata !1, metadata !2, metadata !3, metadata !2, metadata !27, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/tu.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"tu.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !9, metadata !12, metadata !13, metadata !17, metadata !18, metadata !19, metadata !23, metadata !24}
-!4 = metadata !{metadata !"0x13\00S1<&i>\004\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !5, metadata !"_ZTS2S1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S1<&i>] [line 4, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x30\00I\000\000", null, metadata !7, i32* @i, null} ; [ DW_TAG_template_value_parameter ]
-!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x13\00S2\0011\008\008\000\000\000", metadata !1, null, null, metadata !10, null, null, metadata !"_ZTS2S2"} ; [ DW_TAG_structure_type ] [S2] [line 11, size 8, align 8, offset 0] [def] [from ]
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0xd\00s2_1\0012\008\008\000\000", metadata !1, metadata !"_ZTS2S2", metadata !"_ZTS4S2_1IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s2_1] [line 12, size 8, align 8, offset 0] [from _ZTS4S2_1IXadL_Z1iEEE]
-!12 = metadata !{metadata !"0x13\00S2_1<&i>\009\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !5, metadata !"_ZTS4S2_1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S2_1<&i>] [line 9, size 8, align 8, offset 0] [def] [from ]
-!13 = metadata !{metadata !"0x13\00S3\0022\0016\008\000\000\000", metadata !1, null, null, metadata !14, null, null, metadata !"_ZTS2S3"} ; [ DW_TAG_structure_type ] [S3] [line 22, size 16, align 8, offset 0] [def] [from ]
-!14 = metadata !{metadata !15, metadata !16}
-!15 = metadata !{metadata !"0xd\00s3_1\0023\008\008\000\000", metadata !1, metadata !"_ZTS2S3", metadata !"_ZTS4S3_1IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s3_1] [line 23, size 8, align 8, offset 0] [from _ZTS4S3_1IXadL_Z1iEEE]
-!16 = metadata !{metadata !"0xd\00s3_2\0024\008\008\008\000", metadata !1, metadata !"_ZTS2S3", metadata !"_ZTS4S3_2"} ; [ DW_TAG_member ] [s3_2] [line 24, size 8, align 8, offset 8] [from _ZTS4S3_2]
-!17 = metadata !{metadata !"0x13\00S3_1<&i>\0018\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !5, metadata !"_ZTS4S3_1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S3_1<&i>] [line 18, size 8, align 8, offset 0] [def] [from ]
-!18 = metadata !{metadata !"0x13\00S3_2\0020\008\008\000\000\000", metadata !1, null, null, metadata !2, null, null, metadata !"_ZTS4S3_2"} ; [ DW_TAG_structure_type ] [S3_2] [line 20, size 8, align 8, offset 0] [def] [from ]
-!19 = metadata !{metadata !"0x13\00S4\0034\0016\008\000\000\000", metadata !1, null, null, metadata !20, null, null, metadata !"_ZTS2S4"} ; [ DW_TAG_structure_type ] [S4] [line 34, size 16, align 8, offset 0] [def] [from ]
-!20 = metadata !{metadata !21, metadata !22}
-!21 = metadata !{metadata !"0xd\00s4_1\0035\008\008\000\000", metadata !1, metadata !"_ZTS2S4", metadata !"_ZTS4S4_1"} ; [ DW_TAG_member ] [s4_1] [line 35, size 8, align 8, offset 0] [from _ZTS4S4_1]
-!22 = metadata !{metadata !"0xd\00s4_2\0036\008\008\008\000", metadata !1, metadata !"_ZTS2S4", metadata !"_ZTS4S4_2IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s4_2] [line 36, size 8, align 8, offset 8] [from _ZTS4S4_2IXadL_Z1iEEE]
-!23 = metadata !{metadata !"0x13\00S4_1\0029\008\008\000\000\000", metadata !1, null, null, metadata !2, null, null, metadata !"_ZTS4S4_1"} ; [ DW_TAG_structure_type ] [S4_1] [line 29, size 8, align 8, offset 0] [def] [from ]
-!24 = metadata !{metadata !"0x13\00S4_2<&i>\0032\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !25, metadata !"_ZTS4S4_2IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S4_2<&i>] [line 32, size 8, align 8, offset 0] [def] [from ]
-!25 = metadata !{metadata !26}
-!26 = metadata !{metadata !"0x30\00T\000\000", null, metadata !7, i32* @i, null} ; [ DW_TAG_template_value_parameter ]
-!27 = metadata !{metadata !28, metadata !30, metadata !31, metadata !32, metadata !33}
-!28 = metadata !{metadata !"0x34\00i\00i\00\001\000\001", null, metadata !29, metadata !8, i32* @i, null} ; [ DW_TAG_variable ] [i] [line 1] [def]
-!29 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/tu.cpp]
-!30 = metadata !{metadata !"0x34\00a\00a\00\006\000\001", null, metadata !29, metadata !"_ZTS2S1IXadL_Z1iEEE", %struct.S1* @a, null} ; [ DW_TAG_variable ] [a] [line 6] [def]
-!31 = metadata !{metadata !"0x34\00s2\00s2\00\0015\000\001", null, metadata !29, metadata !"_ZTS2S2", %struct.S2* @s2, null} ; [ DW_TAG_variable ] [s2] [line 15] [def]
-!32 = metadata !{metadata !"0x34\00s3\00s3\00\0027\000\001", null, metadata !29, metadata !"_ZTS2S3", %struct.S3* @s3, null} ; [ DW_TAG_variable ] [s3] [line 27] [def]
-!33 = metadata !{metadata !"0x34\00s4\00s4\00\0039\000\001", null, metadata !29, metadata !"_ZTS2S4", %struct.S4* @s4, null} ; [ DW_TAG_variable ] [s4] [line 39] [def]
-!34 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!36 = metadata !{metadata !"clang version 3.5.0 "}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00tu.dwo\001", !1, !2, !3, !2, !27, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/tu.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"tu.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !9, !12, !13, !17, !18, !19, !23, !24}
+!4 = !{!"0x13\00S1<&i>\004\008\008\000\000\000", !1, null, null, !2, null, !5, !"_ZTS2S1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S1<&i>] [line 4, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0x30\00I\000\000", null, !7, i32* @i, null} ; [ DW_TAG_template_value_parameter ]
+!7 = !{!"0xf\00\000\0064\0064\000\000", null, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x13\00S2\0011\008\008\000\000\000", !1, null, null, !10, null, null, !"_ZTS2S2"} ; [ DW_TAG_structure_type ] [S2] [line 11, size 8, align 8, offset 0] [def] [from ]
+!10 = !{!11}
+!11 = !{!"0xd\00s2_1\0012\008\008\000\000", !1, !"_ZTS2S2", !"_ZTS4S2_1IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s2_1] [line 12, size 8, align 8, offset 0] [from _ZTS4S2_1IXadL_Z1iEEE]
+!12 = !{!"0x13\00S2_1<&i>\009\008\008\000\000\000", !1, null, null, !2, null, !5, !"_ZTS4S2_1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S2_1<&i>] [line 9, size 8, align 8, offset 0] [def] [from ]
+!13 = !{!"0x13\00S3\0022\0016\008\000\000\000", !1, null, null, !14, null, null, !"_ZTS2S3"} ; [ DW_TAG_structure_type ] [S3] [line 22, size 16, align 8, offset 0] [def] [from ]
+!14 = !{!15, !16}
+!15 = !{!"0xd\00s3_1\0023\008\008\000\000", !1, !"_ZTS2S3", !"_ZTS4S3_1IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s3_1] [line 23, size 8, align 8, offset 0] [from _ZTS4S3_1IXadL_Z1iEEE]
+!16 = !{!"0xd\00s3_2\0024\008\008\008\000", !1, !"_ZTS2S3", !"_ZTS4S3_2"} ; [ DW_TAG_member ] [s3_2] [line 24, size 8, align 8, offset 8] [from _ZTS4S3_2]
+!17 = !{!"0x13\00S3_1<&i>\0018\008\008\000\000\000", !1, null, null, !2, null, !5, !"_ZTS4S3_1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S3_1<&i>] [line 18, size 8, align 8, offset 0] [def] [from ]
+!18 = !{!"0x13\00S3_2\0020\008\008\000\000\000", !1, null, null, !2, null, null, !"_ZTS4S3_2"} ; [ DW_TAG_structure_type ] [S3_2] [line 20, size 8, align 8, offset 0] [def] [from ]
+!19 = !{!"0x13\00S4\0034\0016\008\000\000\000", !1, null, null, !20, null, null, !"_ZTS2S4"} ; [ DW_TAG_structure_type ] [S4] [line 34, size 16, align 8, offset 0] [def] [from ]
+!20 = !{!21, !22}
+!21 = !{!"0xd\00s4_1\0035\008\008\000\000", !1, !"_ZTS2S4", !"_ZTS4S4_1"} ; [ DW_TAG_member ] [s4_1] [line 35, size 8, align 8, offset 0] [from _ZTS4S4_1]
+!22 = !{!"0xd\00s4_2\0036\008\008\008\000", !1, !"_ZTS2S4", !"_ZTS4S4_2IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s4_2] [line 36, size 8, align 8, offset 8] [from _ZTS4S4_2IXadL_Z1iEEE]
+!23 = !{!"0x13\00S4_1\0029\008\008\000\000\000", !1, null, null, !2, null, null, !"_ZTS4S4_1"} ; [ DW_TAG_structure_type ] [S4_1] [line 29, size 8, align 8, offset 0] [def] [from ]
+!24 = !{!"0x13\00S4_2<&i>\0032\008\008\000\000\000", !1, null, null, !2, null, !25, !"_ZTS4S4_2IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S4_2<&i>] [line 32, size 8, align 8, offset 0] [def] [from ]
+!25 = !{!26}
+!26 = !{!"0x30\00T\000\000", null, !7, i32* @i, null} ; [ DW_TAG_template_value_parameter ]
+!27 = !{!28, !30, !31, !32, !33}
+!28 = !{!"0x34\00i\00i\00\001\000\001", null, !29, !8, i32* @i, null} ; [ DW_TAG_variable ] [i] [line 1] [def]
+!29 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/tu.cpp]
+!30 = !{!"0x34\00a\00a\00\006\000\001", null, !29, !"_ZTS2S1IXadL_Z1iEEE", %struct.S1* @a, null} ; [ DW_TAG_variable ] [a] [line 6] [def]
+!31 = !{!"0x34\00s2\00s2\00\0015\000\001", null, !29, !"_ZTS2S2", %struct.S2* @s2, null} ; [ DW_TAG_variable ] [s2] [line 15] [def]
+!32 = !{!"0x34\00s3\00s3\00\0027\000\001", null, !29, !"_ZTS2S3", %struct.S3* @s3, null} ; [ DW_TAG_variable ] [s3] [line 27] [def]
+!33 = !{!"0x34\00s4\00s4\00\0039\000\001", null, !29, !"_ZTS2S4", %struct.S4* @s4, null} ; [ DW_TAG_variable ] [s4] [line 39] [def]
+!34 = !{i32 2, !"Dwarf Version", i32 4}
+!35 = !{i32 1, !"Debug Info Version", i32 2}
+!36 = !{!"clang version 3.5.0 "}
diff --git a/test/DebugInfo/X86/union-const.ll b/test/DebugInfo/X86/union-const.ll
new file mode 100644
index 0000000..552b52d
--- /dev/null
+++ b/test/DebugInfo/X86/union-const.ll
@@ -0,0 +1,66 @@
+; RUN: llc -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; CHECK: DW_TAG_variable
+; CHECK-NEXT: DW_AT_const_value [DW_FORM_udata]	(0)
+; CHECK-NEXT: DW_AT_name {{.*}}"a"
+;
+; ModuleID = 'union.c'
+; generated at -O1 from:
+; union mfi_evt {
+;   struct {
+;     int reserved;
+;   } members;
+; } mfi_aen_setup() {
+;   union mfi_evt a;
+;   a.members.reserved = 0;
+; }
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+%union.mfi_evt = type { %struct.anon }
+%struct.anon = type { i32 }
+
+; Function Attrs: nounwind readnone ssp uwtable
+define i32 @mfi_aen_setup() #0 {
+entry:
+  tail call void @llvm.dbg.declare(metadata %union.mfi_evt* undef, metadata !16, metadata !21), !dbg !22
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !21), !dbg !22
+  ret i32 undef, !dbg !23
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind readnone ssp uwtable }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!17, !18, !19}
+!llvm.ident = !{!20}
+
+!0 = !{!"0x11\0012\00clang version 3.7.0 (trunk 226915) (llvm/trunk 226905)\001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/union.c] [DW_LANG_C99]
+!1 = !{!"union.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00mfi_aen_setup\00mfi_aen_setup\00\005\000\001\000\000\000\001\005", !1, !5, !6, null, i32 ()* @mfi_aen_setup, null, null, !15} ; [ DW_TAG_subprogram ] [line 5] [def] [mfi_aen_setup]
+!5 = !{!"0x29", !1}                               ; [ DW_TAG_file_type ] [/union.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x17\00mfi_evt\001\0032\0032\000\000\000", !1, null, null, !9, null, null, null} ; [ DW_TAG_union_type ] [mfi_evt] [line 1, size 32, align 32, offset 0] [def] [from ]
+!9 = !{!10}
+!10 = !{!"0xd\00members\004\0032\0032\000\000", !1, !8, !11} ; [ DW_TAG_member ] [members] [line 4, size 32, align 32, offset 0] [from ]
+!11 = !{!"0x13\00\002\0032\0032\000\000\000", !1, !8, null, !12, null, null, null} ; [ DW_TAG_structure_type ] [line 2, size 32, align 32, offset 0] [def] [from ]
+!12 = !{!13}
+!13 = !{!"0xd\00reserved\003\0032\0032\000\000", !1, !11, !14} ; [ DW_TAG_member ] [reserved] [line 3, size 32, align 32, offset 0] [from int]
+!14 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!15 = !{!16}
+!16 = !{!"0x100\00a\006\000", !4, !5, !8}         ; [ DW_TAG_auto_variable ] [a] [line 6]
+!17 = !{i32 2, !"Dwarf Version", i32 2}
+!18 = !{i32 2, !"Debug Info Version", i32 2}
+!19 = !{i32 1, !"PIC Level", i32 2}
+!20 = !{!"clang version 3.7.0 (trunk 226915) (llvm/trunk 226905)"}
+!21 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!22 = !MDLocation(line: 6, column: 17, scope: !4)
+!23 = !MDLocation(line: 8, column: 1, scope: !4)
diff --git a/test/DebugInfo/X86/union-template.ll b/test/DebugInfo/X86/union-template.ll
index 6580a39..df07054 100644
--- a/test/DebugInfo/X86/union-template.ll
+++ b/test/DebugInfo/X86/union-template.ll
@@ -16,8 +16,8 @@ entry:
   %value.addr = alloca float, align 4
   %tempValue = alloca %"union.PR15637::Value", align 4
   store float %value, float* %value.addr, align 4
-  call void @llvm.dbg.declare(metadata !{float* %value.addr}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
-  call void @llvm.dbg.declare(metadata !{%"union.PR15637::Value"* %tempValue}, metadata !25, metadata !{metadata !"0x102"}), !dbg !26
+  call void @llvm.dbg.declare(metadata float* %value.addr, metadata !23, metadata !{!"0x102"}), !dbg !24
+  call void @llvm.dbg.declare(metadata %"union.PR15637::Value"* %tempValue, metadata !25, metadata !{!"0x102"}), !dbg !26
   ret void, !dbg !27
 }
 
@@ -29,32 +29,32 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!28}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 178499) (llvm/trunk 178472)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9,  metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"foo.cc", metadata !"/usr/local/google/home/echristo/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00g\00g\00_ZN7PR156371gEf\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, void (float)* @_ZN7PR156371gEf, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [g]
-!5 = metadata !{metadata !"0x39\00PR15637\001", metadata !1, null} ; [ DW_TAG_namespace ] [PR15637] [line 1]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8}
-!8 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x34\00f\00f\00_ZN7PR156371fE\006\000\001", metadata !5, metadata !11, metadata !12, %"union.PR15637::Value"* @_ZN7PR156371fE, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
-!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.cc]
-!12 = metadata !{metadata !"0x17\00Value<float>\002\0032\0032\000\000\000", metadata !1, metadata !5, null, metadata !13, null, metadata !21, null} ; [ DW_TAG_union_type ] [Value<float>] [line 2, size 32, align 32, offset 0] [def] [from ]
-!13 = metadata !{metadata !14, metadata !16}
-!14 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !1, metadata !12, metadata !15} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!15 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!16 = metadata !{metadata !"0x2e\00Value\00Value\00\002\000\000\000\006\00320\000\002", metadata !1, metadata !12, metadata !17, null, null, null, i32 0, metadata !20} ; [ DW_TAG_subprogram ] [line 2] [Value]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{null, metadata !19}
-!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Value<float>]
-!20 = metadata !{i32 786468}
-!21 = metadata !{metadata !22}
-!22 = metadata !{metadata !"0x2f\00T\000\000", null, metadata !8, null} ; [ DW_TAG_template_type_parameter ]
-!23 = metadata !{metadata !"0x101\00value\0016777219\000", metadata !4, metadata !11, metadata !8} ; [ DW_TAG_arg_variable ] [value] [line 3]
-!24 = metadata !{i32 3, i32 0, metadata !4, null}
-!25 = metadata !{metadata !"0x100\00tempValue\004\000", metadata !4, metadata !11, metadata !12} ; [ DW_TAG_auto_variable ] [tempValue] [line 4]
-!26 = metadata !{i32 4, i32 0, metadata !4, null}
-!27 = metadata !{i32 5, i32 0, metadata !4, null}
-!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.3 (trunk 178499) (llvm/trunk 178472)\000\00\000\00\000", !1, !2, !2, !3, !9,  !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"foo.cc", !"/usr/local/google/home/echristo/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00g\00g\00_ZN7PR156371gEf\003\000\001\000\006\00256\000\003", !1, !5, !6, null, void (float)* @_ZN7PR156371gEf, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [g]
+!5 = !{!"0x39\00PR15637\001", !1, null} ; [ DW_TAG_namespace ] [PR15637] [line 1]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8}
+!8 = !{!"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!9 = !{!10}
+!10 = !{!"0x34\00f\00f\00_ZN7PR156371fE\006\000\001", !5, !11, !12, %"union.PR15637::Value"* @_ZN7PR156371fE, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
+!11 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.cc]
+!12 = !{!"0x17\00Value<float>\002\0032\0032\000\000\000", !1, !5, null, !13, null, !21, null} ; [ DW_TAG_union_type ] [Value<float>] [line 2, size 32, align 32, offset 0] [def] [from ]
+!13 = !{!14, !16}
+!14 = !{!"0xd\00a\002\0032\0032\000\000", !1, !12, !15} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!15 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!16 = !{!"0x2e\00Value\00Value\00\002\000\000\000\006\00320\000\002", !1, !12, !17, null, null, null, i32 0, !20} ; [ DW_TAG_subprogram ] [line 2] [Value]
+!17 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{null, !19}
+!19 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Value<float>]
+!20 = !{i32 786468}
+!21 = !{!22}
+!22 = !{!"0x2f\00T\000\000", null, !8, null} ; [ DW_TAG_template_type_parameter ]
+!23 = !{!"0x101\00value\0016777219\000", !4, !11, !8} ; [ DW_TAG_arg_variable ] [value] [line 3]
+!24 = !MDLocation(line: 3, scope: !4)
+!25 = !{!"0x100\00tempValue\004\000", !4, !11, !12} ; [ DW_TAG_auto_variable ] [tempValue] [line 4]
+!26 = !MDLocation(line: 4, scope: !4)
+!27 = !MDLocation(line: 5, scope: !4)
+!28 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/vector.ll b/test/DebugInfo/X86/vector.ll
index cd9fcd0..fc17c06 100644
--- a/test/DebugInfo/X86/vector.ll
+++ b/test/DebugInfo/X86/vector.ll
@@ -12,19 +12,19 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 171825) (llvm/trunk 171822)\000\00\000\00\000", metadata !12, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/Users/echristo/foo.c] [DW_LANG_C99]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x34\00a\00a\00\003\000\001", null, metadata !6, metadata !7, <4 x i32>* @a, null} ; [ DW_TAG_variable ] [a] [line 3] [def]
-!6 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x16\00v4si\001\000\000\000\000", metadata !12, null, metadata !8} ; [ DW_TAG_typedef ] [v4si] [line 1, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !"0x1\00\000\00128\00128\000\002048", null, null, metadata !9, metadata !10, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int]
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
-!12 = metadata !{metadata !"foo.c", metadata !"/Users/echristo"}
+!0 = !{!"0x11\0012\00clang version 3.3 (trunk 171825) (llvm/trunk 171822)\000\00\000\00\000", !12, !1, !1, !1, !3,  !1} ; [ DW_TAG_compile_unit ] [/Users/echristo/foo.c] [DW_LANG_C99]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x34\00a\00a\00\003\000\001", null, !6, !7, <4 x i32>* @a, null} ; [ DW_TAG_variable ] [a] [line 3] [def]
+!6 = !{!"0x29", !12} ; [ DW_TAG_file_type ]
+!7 = !{!"0x16\00v4si\001\000\000\000\000", !12, null, !8} ; [ DW_TAG_typedef ] [v4si] [line 1, size 0, align 0, offset 0] [from ]
+!8 = !{!"0x1\00\000\00128\00128\000\002048", null, null, !9, !10, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int]
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!11}
+!11 = !{!"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
+!12 = !{!"foo.c", !"/Users/echristo"}
 
 ; Check that we get an array type with a vector attribute.
 ; CHECK: DW_TAG_array_type
 ; CHECK-NEXT: DW_AT_GNU_vector
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!13 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/X86/vla.ll b/test/DebugInfo/X86/vla.ll
index be05c3b..3d2ca5e 100644
--- a/test/DebugInfo/X86/vla.ll
+++ b/test/DebugInfo/X86/vla.ll
@@ -27,13 +27,13 @@ entry:
   %saved_stack = alloca i8*
   %cleanup.dest.slot = alloca i32
   store i32 %n, i32* %n.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %n.addr}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
+  call void @llvm.dbg.declare(metadata i32* %n.addr, metadata !15, metadata !{!"0x102"}), !dbg !16
   %0 = load i32* %n.addr, align 4, !dbg !17
   %1 = zext i32 %0 to i64, !dbg !17
   %2 = call i8* @llvm.stacksave(), !dbg !17
   store i8* %2, i8** %saved_stack, !dbg !17
   %vla = alloca i32, i64 %1, align 16, !dbg !17
-  call void @llvm.dbg.declare(metadata !{i32* %vla}, metadata !18, metadata !{metadata !"0x102"}), !dbg !17
+  call void @llvm.dbg.declare(metadata i32* %vla, metadata !18, metadata !{!"0x102\006"}), !dbg !17
   %arrayidx = getelementptr inbounds i32* %vla, i64 0, !dbg !22
   store i32 42, i32* %arrayidx, align 4, !dbg !22
   %3 = load i32* %n.addr, align 4, !dbg !23
@@ -64,9 +64,9 @@ entry:
   %argv.addr = alloca i8**, align 8
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !25, metadata !{metadata !"0x102"}), !dbg !26
+  call void @llvm.dbg.declare(metadata i32* %argc.addr, metadata !25, metadata !{!"0x102"}), !dbg !26
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !27, metadata !{metadata !"0x102"}), !dbg !26
+  call void @llvm.dbg.declare(metadata i8*** %argv.addr, metadata !27, metadata !{!"0x102"}), !dbg !26
   %0 = load i32* %argc.addr, align 4, !dbg !28
   %call = call i32 @vla(i32 %0), !dbg !28
   ret i32 %call, !dbg !28
@@ -75,33 +75,33 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/vla.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"vla.c", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !9}
-!4 = metadata !{metadata !"0x2e\00vla\00vla\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @vla, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [vla]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/vla.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !1, metadata !5, metadata !10, null, i32 (i32, i8**)* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{metadata !8, metadata !8, metadata !12}
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!14 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!15 = metadata !{metadata !"0x101\00n\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [n] [line 1]
-!16 = metadata !{i32 1, i32 0, metadata !4, null}
-!17 = metadata !{i32 2, i32 0, metadata !4, null}
-!18 = metadata !{metadata !"0x100\00a\002\008192", metadata !4, metadata !5, metadata !19} ; [ DW_TAG_auto_variable ] [a] [line 2]
-!19 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !8, metadata !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
-!20 = metadata !{metadata !21}
-!21 = metadata !{metadata !"0x21\000\00-1"}       ; [ DW_TAG_subrange_type ] [unbounded]
-!22 = metadata !{i32 3, i32 0, metadata !4, null}
-!23 = metadata !{i32 4, i32 0, metadata !4, null}
-!24 = metadata !{i32 5, i32 0, metadata !4, null}
-!25 = metadata !{metadata !"0x101\00argc\0016777223\000", metadata !9, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [argc] [line 7]
-!26 = metadata !{i32 7, i32 0, metadata !9, null}
-!27 = metadata !{metadata !"0x101\00argv\0033554439\000", metadata !9, metadata !5, metadata !12} ; [ DW_TAG_arg_variable ] [argv] [line 7]
-!28 = metadata !{i32 8, i32 0, metadata !9, null}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.3 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/vla.c] [DW_LANG_C99]
+!1 = !{!"vla.c", !""}
+!2 = !{}
+!3 = !{!4, !9}
+!4 = !{!"0x2e\00vla\00vla\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 (i32)* @vla, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [vla]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/vla.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", !1, !5, !10, null, i32 (i32, i8**)* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!10 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{!8, !8, !12}
+!12 = !{!"0xf\00\000\0064\0064\000\000", null, null, !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!13 = !{!"0xf\00\000\0064\0064\000\000", null, null, !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!14 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!15 = !{!"0x101\00n\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [n] [line 1]
+!16 = !MDLocation(line: 1, scope: !4)
+!17 = !MDLocation(line: 2, scope: !4)
+!18 = !{!"0x100\00a\002\000", !4, !5, !19} ; [ DW_TAG_auto_variable ] [a] [line 2]
+!19 = !{!"0x1\00\000\000\0032\000\000", null, null, !8, !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!20 = !{!21}
+!21 = !{!"0x21\000\00-1"}       ; [ DW_TAG_subrange_type ] [unbounded]
+!22 = !MDLocation(line: 3, scope: !4)
+!23 = !MDLocation(line: 4, scope: !4)
+!24 = !MDLocation(line: 5, scope: !4)
+!25 = !{!"0x101\00argc\0016777223\000", !9, !5, !8} ; [ DW_TAG_arg_variable ] [argc] [line 7]
+!26 = !MDLocation(line: 7, scope: !9)
+!27 = !{!"0x101\00argv\0033554439\000", !9, !5, !12} ; [ DW_TAG_arg_variable ] [argv] [line 7]
+!28 = !MDLocation(line: 8, scope: !9)
+!29 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/array.ll b/test/DebugInfo/array.ll
index 2c7195b..4439571 100644
--- a/test/DebugInfo/array.ll
+++ b/test/DebugInfo/array.ll
@@ -6,7 +6,7 @@ entry:
   %retval = alloca i32, align 4
   %a = alloca [0 x i32], align 4
   store i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{[0 x i32]* %a}, metadata !6, metadata !{metadata !"0x102"}), !dbg !11
+  call void @llvm.dbg.declare(metadata [0 x i32]* %a, metadata !6, metadata !{!"0x102"}), !dbg !11
   ret i32 0, !dbg !12
 }
 
@@ -15,25 +15,25 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!16}
 
-!0 = metadata !{metadata !"0x2e\00main\00main\00\003\000\001\000\006\000\000\003", metadata !14, metadata !1, metadata !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
-!1 = metadata !{metadata !"0x29", metadata !14} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 129138)\000\00\000\00\000", metadata !14, metadata !15, metadata !15, metadata !13, null,  null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !14, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x100\00a\004\000", metadata !7, metadata !1, metadata !8} ; [ DW_TAG_auto_variable ]
-!7 = metadata !{metadata !"0xb\003\0012\000", metadata !14, metadata !0} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{metadata !"0x1\00\000\000\0032\000\000", metadata !14, metadata !2, metadata !5, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
-!9 = metadata !{metadata !10}
+!0 = !{!"0x2e\00main\00main\00\003\000\001\000\006\000\000\003", !14, !1, !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
+!1 = !{!"0x29", !14} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 3.0 (trunk 129138)\000\00\000\00\000", !14, !15, !15, !13, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !14, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0x100\00a\004\000", !7, !1, !8} ; [ DW_TAG_auto_variable ]
+!7 = !{!"0xb\003\0012\000", !14, !0} ; [ DW_TAG_lexical_block ]
+!8 = !{!"0x1\00\000\000\0032\000\000", !14, !2, !5, !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!9 = !{!10}
 ;CHECK: DW_TAG_subrange_type
 ;CHECK-NEXT: DW_AT_type
 ;CHECK-NOT: DW_AT_lower_bound
 ;CHECK-NOT: DW_AT_upper_bound
 ;CHECK-NEXT: End Of Children Mark
-!10 = metadata !{metadata !"0x21\000\00-1"}        ; [ DW_TAG_subrange_type ]
-!11 = metadata !{i32 4, i32 7, metadata !7, null}
-!12 = metadata !{i32 5, i32 3, metadata !7, null}
-!13 = metadata !{metadata !0}
-!14 = metadata !{metadata !"array.c", metadata !"/private/tmp"}
-!15 = metadata !{i32 0}
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!10 = !{!"0x21\000\00-1"}        ; [ DW_TAG_subrange_type ]
+!11 = !MDLocation(line: 4, column: 7, scope: !7)
+!12 = !MDLocation(line: 5, column: 3, scope: !7)
+!13 = !{!0}
+!14 = !{!"array.c", !"/private/tmp"}
+!15 = !{i32 0}
+!16 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/block-asan.ll b/test/DebugInfo/block-asan.ll
new file mode 100644
index 0000000..b25aee1
--- /dev/null
+++ b/test/DebugInfo/block-asan.ll
@@ -0,0 +1,87 @@
+; RUN: opt -S -asan %s | FileCheck %s
+
+; The IR of this testcase is generated from the following C code:
+; void bar (int);
+;
+; void foo() {
+;   __block int x;
+;   bar(x);
+; }
+; by compiling it with 'clang -emit-llvm -g -S' and then by manually
+; adding the sanitize_address attribute to the @foo() function (so
+; that ASAN accepts to instrument the function in the above opt run).
+
+; Check that the location of the ASAN instrumented __block variable is
+; correct.
+; CHECK: [ DW_TAG_expression ] [DW_OP_deref] [DW_OP_plus 8] [DW_OP_deref] [DW_OP_plus 24]
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.__block_byref_x = type { i8*, %struct.__block_byref_x*, i32, i32, i32 }
+
+; Function Attrs: nounwind ssp uwtable
+define void @foo() #0 {
+entry:
+  %x = alloca %struct.__block_byref_x, align 8
+  call void @llvm.dbg.declare(metadata %struct.__block_byref_x* %x, metadata !12, metadata !22), !dbg !23
+  %byref.isa = getelementptr inbounds %struct.__block_byref_x* %x, i32 0, i32 0, !dbg !24
+  store i8* null, i8** %byref.isa, !dbg !24
+  %byref.forwarding = getelementptr inbounds %struct.__block_byref_x* %x, i32 0, i32 1, !dbg !24
+  store %struct.__block_byref_x* %x, %struct.__block_byref_x** %byref.forwarding, !dbg !24
+  %byref.flags = getelementptr inbounds %struct.__block_byref_x* %x, i32 0, i32 2, !dbg !24
+  store i32 0, i32* %byref.flags, !dbg !24
+  %byref.size = getelementptr inbounds %struct.__block_byref_x* %x, i32 0, i32 3, !dbg !24
+  store i32 32, i32* %byref.size, !dbg !24
+  %forwarding = getelementptr inbounds %struct.__block_byref_x* %x, i32 0, i32 1, !dbg !25
+  %0 = load %struct.__block_byref_x** %forwarding, !dbg !25
+  %x1 = getelementptr inbounds %struct.__block_byref_x* %0, i32 0, i32 4, !dbg !25
+  %1 = load i32* %x1, align 4, !dbg !25
+  call void @bar(i32 %1), !dbg !25
+  %2 = bitcast %struct.__block_byref_x* %x to i8*, !dbg !26
+  call void @_Block_object_dispose(i8* %2, i32 8) #3, !dbg !26
+  ret void, !dbg !26
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare void @bar(i32) #2
+
+declare void @_Block_object_dispose(i8*, i32)
+
+attributes #0 = { nounwind ssp uwtable sanitize_address "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9, !10}
+!llvm.ident = !{!11}
+
+!0 = !{!"0x11\0012\00clang version 3.6.0 (trunk 223120) (llvm/trunk 223119)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/block.c] [DW_LANG_C99]
+!1 = !{!"block.c", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\003\000\001\000\000\000\000\003", !1, !5, !6, null, void ()* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
+!5 = !{!"0x29", !1}    ; [ DW_TAG_file_type ] [/tmp/block.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{i32 2, !"Dwarf Version", i32 2}
+!9 = !{i32 2, !"Debug Info Version", i32 2}
+!10 = !{i32 1, !"PIC Level", i32 2}
+!11 = !{!"clang version 3.6.0 (trunk 223120) (llvm/trunk 223119)"}
+!12 = !{!"0x100\00x\004\000", !4, !5, !13} ; [ DW_TAG_auto_variable ] [x] [line 4]
+!13 = !{!"0x13\00\000\00224\000\000\0016\000", !1, !5, null, !14, null, null, null} ; [ DW_TAG_structure_type ] [line 0, size 224, align 0, offset 0] [def] [from ]
+!14 = !{!15, !17, !18, !20, !21}
+!15 = !{!"0xd\00__isa\000\0064\0064\000\000", !1, !5, !16} ; [ DW_TAG_member ] [__isa] [line 0, size 64, align 64, offset 0] [from ]
+!16 = !{!"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!17 = !{!"0xd\00__forwarding\000\0064\0064\0064\000", !1, !5, !16} ; [ DW_TAG_member ] [__forwarding] [line 0, size 64, align 64, offset 64] [from ]
+!18 = !{!"0xd\00__flags\000\0032\0032\00128\000", !1, !5, !19} ; [ DW_TAG_member ] [__flags] [line 0, size 32, align 32, offset 128] [from int]
+!19 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!20 = !{!"0xd\00__size\000\0032\0032\00160\000", !1, !5, !19} ; [ DW_TAG_member ] [__size] [line 0, size 32, align 32, offset 160] [from int]
+!21 = !{!"0xd\00x\000\0032\0032\00192\000", !1, !5, !19} ; [ DW_TAG_member ] [x] [line 0, size 32, align 32, offset 192] [from int]
+!22 = !{!"0x102\0034\008\006\0034\0024"} ; [ DW_TAG_expression ] [DW_OP_plus 8] [DW_OP_deref] [DW_OP_plus 24]
+!23 = !MDLocation(line: 4, column: 15, scope: !4)
+!24 = !MDLocation(line: 4, column: 3, scope: !4)
+!25 = !MDLocation(line: 5, column: 3, scope: !4)
+!26 = !MDLocation(line: 6, column: 1, scope: !4)
diff --git a/test/DebugInfo/bug_null_debuginfo.ll b/test/DebugInfo/bug_null_debuginfo.ll
index fd22fb3..784f17e 100644
--- a/test/DebugInfo/bug_null_debuginfo.ll
+++ b/test/DebugInfo/bug_null_debuginfo.ll
@@ -3,6 +3,6 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!2}
 
-!0 = metadata !{metadata !"0x11\0012\00\000\00\000\00\000", metadata !1, null, null, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"t", metadata !""}
-!2 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00\000\00\000\00\000", !1, null, null, null,  null, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"t", !""}
+!2 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/constant-pointers.ll b/test/DebugInfo/constant-pointers.ll
index e344fb8..add9780 100644
--- a/test/DebugInfo/constant-pointers.ll
+++ b/test/DebugInfo/constant-pointers.ll
@@ -30,22 +30,22 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.module.flags = !{!15, !16}
 !llvm.ident = !{!17}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/constant-pointers.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"constant-pointers.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00func<nullptr, nullptr, 42>\00func<nullptr, nullptr, 42>\00_Z4funcILPv0ELPFvvE0ELi42EEvv\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, void ()* @_Z4funcILPv0ELPFvvE0ELi42EEvv, metadata !8, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [func<nullptr, nullptr, 42>]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/constant-pointers.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{metadata !9, metadata !11, metadata !13}
-!9 = metadata !{metadata !"0x30\00V\000\000", null, metadata !10, i8 0, null} ; [ DW_TAG_template_value_parameter ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!11 = metadata !{metadata !"0x30\00F\000\000", null, metadata !12, i8 0, null} ; [ DW_TAG_template_value_parameter ]
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!13 = metadata !{metadata !"0x30\00i\000\000", null, metadata !14, i32 42, null} ; [ DW_TAG_template_value_parameter ]
-!14 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!15 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!16 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!17 = metadata !{metadata !"clang version 3.5.0 "}
-!18 = metadata !{i32 3, i32 0, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/constant-pointers.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"constant-pointers.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00func<nullptr, nullptr, 42>\00func<nullptr, nullptr, 42>\00_Z4funcILPv0ELPFvvE0ELi42EEvv\002\000\001\000\006\00256\000\002", !1, !5, !6, null, void ()* @_Z4funcILPv0ELPFvvE0ELi42EEvv, !8, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [func<nullptr, nullptr, 42>]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/constant-pointers.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!9, !11, !13}
+!9 = !{!"0x30\00V\000\000", null, !10, i8 0, null} ; [ DW_TAG_template_value_parameter ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!11 = !{!"0x30\00F\000\000", null, !12, i8 0, null} ; [ DW_TAG_template_value_parameter ]
+!12 = !{!"0xf\00\000\0064\0064\000\000", null, null, !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!13 = !{!"0x30\00i\000\000", null, !14, i32 42, null} ; [ DW_TAG_template_value_parameter ]
+!14 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!15 = !{i32 2, !"Dwarf Version", i32 4}
+!16 = !{i32 2, !"Debug Info Version", i32 2}
+!17 = !{!"clang version 3.5.0 "}
+!18 = !MDLocation(line: 3, scope: !4)
diff --git a/test/DebugInfo/cross-cu-inlining.ll b/test/DebugInfo/cross-cu-inlining.ll
index f262022..fafa3fa 100644
--- a/test/DebugInfo/cross-cu-inlining.ll
+++ b/test/DebugInfo/cross-cu-inlining.ll
@@ -75,7 +75,7 @@ entry:
   %1 = bitcast i32* %x.addr.i to i8*
   call void @llvm.lifetime.start(i64 4, i8* %1)
   store i32 %0, i32* %x.addr.i, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr.i}, metadata !20, metadata !{metadata !"0x102"}), !dbg !21
+  call void @llvm.dbg.declare(metadata i32* %x.addr.i, metadata !20, metadata !{!"0x102"}), !dbg !21
   %2 = load i32* %x.addr.i, align 4, !dbg !22
   %mul.i = mul nsw i32 %2, 2, !dbg !22
   %3 = bitcast i32* %x.addr.i to i8*, !dbg !22
@@ -88,7 +88,7 @@ define i32 @_Z4funci(i32 %x) #1 {
 entry:
   %x.addr = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !20, metadata !{metadata !"0x102"}), !dbg !23
+  call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !20, metadata !{!"0x102"}), !dbg !23
   %0 = load i32* %x.addr, align 4, !dbg !24
   %mul = mul nsw i32 %0, 2, !dbg !24
   ret i32 %mul, !dbg !24
@@ -112,29 +112,29 @@ attributes #3 = { nounwind }
 !llvm.module.flags = !{!16, !17}
 !llvm.ident = !{!18, !18}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"a.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !10, metadata !2, metadata !2, metadata !11, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
-!10 = metadata !{metadata !"b.cpp", metadata !"/tmp/dbginfo"}
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", metadata !10, metadata !13, metadata !14, null, i32 (i32)* @_Z4funci, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
-!13 = metadata !{metadata !"0x29", metadata !10}        ; [ DW_TAG_file_type ] [/tmp/dbginfo/b.cpp]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{metadata !8, metadata !8}
-!16 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!17 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!18 = metadata !{metadata !"clang version 3.5.0 "}
-!19 = metadata !{i32 4, i32 0, metadata !4, null}
-!20 = metadata !{metadata !"0x101\00x\0016777217\000", metadata !12, metadata !13, metadata !8} ; [ DW_TAG_arg_variable ] [x] [line 1]
-!21 = metadata !{i32 1, i32 0, metadata !12, metadata !19}
-!22 = metadata !{i32 2, i32 0, metadata !12, metadata !19}
-!23 = metadata !{i32 1, i32 0, metadata !12, null}
-!24 = metadata !{i32 2, i32 0, metadata !12, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"a.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\003\000\001\000\006\00256\000\003", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !10, !2, !2, !11, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
+!10 = !{!"b.cpp", !"/tmp/dbginfo"}
+!11 = !{!12}
+!12 = !{!"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", !10, !13, !14, null, i32 (i32)* @_Z4funci, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!13 = !{!"0x29", !10}        ; [ DW_TAG_file_type ] [/tmp/dbginfo/b.cpp]
+!14 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{!8, !8}
+!16 = !{i32 2, !"Dwarf Version", i32 4}
+!17 = !{i32 2, !"Debug Info Version", i32 2}
+!18 = !{!"clang version 3.5.0 "}
+!19 = !MDLocation(line: 4, scope: !4)
+!20 = !{!"0x101\00x\0016777217\000", !12, !13, !8} ; [ DW_TAG_arg_variable ] [x] [line 1]
+!21 = !MDLocation(line: 1, scope: !12, inlinedAt: !19)
+!22 = !MDLocation(line: 2, scope: !12, inlinedAt: !19)
+!23 = !MDLocation(line: 1, scope: !12)
+!24 = !MDLocation(line: 2, scope: !12)
 
diff --git a/test/DebugInfo/cross-cu-linkonce-distinct.ll b/test/DebugInfo/cross-cu-linkonce-distinct.ll
index e19f89c..2bd7c47 100644
--- a/test/DebugInfo/cross-cu-linkonce-distinct.ll
+++ b/test/DebugInfo/cross-cu-linkonce-distinct.ll
@@ -52,7 +52,7 @@
 define linkonce_odr i32 @_Z4funci(i32 %i) #0 {
   %1 = alloca i32, align 4
   store i32 %i, i32* %1, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !22, metadata !{metadata !"0x102"}), !dbg !23
+  call void @llvm.dbg.declare(metadata i32* %1, metadata !22, metadata !{!"0x102"}), !dbg !23
   %2 = load i32* %1, align 4, !dbg !24
   %3 = mul nsw i32 %2, 2, !dbg !24
   ret i32 %3, !dbg !24
@@ -68,28 +68,28 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!19, !20}
 !llvm.ident = !{!21, !21}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"a.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @_Z4funci, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x34\00x\00x\00\004\000\001", null, metadata !5, metadata !11, i32 (i32)** @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!12 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !13, metadata !2, metadata !2, metadata !14, metadata !17, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
-!13 = metadata !{metadata !"b.cpp", metadata !"/tmp/dbginfo"}
-!14 = metadata !{metadata !15}
-!15 = metadata !{metadata !"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", metadata !13, metadata !16, metadata !6, null, i32 (i32)* @_Z4funci, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
-!16 = metadata !{metadata !"0x29", metadata !13}        ; [ DW_TAG_file_type ] [/tmp/dbginfo/b.cpp]
-!17 = metadata !{metadata !18}
-!18 = metadata !{metadata !"0x34\00y\00y\00\004\000\001", null, metadata !16, metadata !11, i32 (i32)** @y, null} ; [ DW_TAG_variable ] [y] [line 4] [def]
-!19 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!21 = metadata !{metadata !"clang version 3.5.0 "}
-!22 = metadata !{metadata !"0x101\00i\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [i] [line 1]
-!23 = metadata !{i32 1, i32 0, metadata !4, null}
-!24 = metadata !{i32 2, i32 0, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !9, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"a.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 (i32)* @_Z4funci, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10}
+!10 = !{!"0x34\00x\00x\00\004\000\001", null, !5, !11, i32 (i32)** @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
+!11 = !{!"0xf\00\000\0064\0064\000\000", null, null, !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!12 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !13, !2, !2, !14, !17, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
+!13 = !{!"b.cpp", !"/tmp/dbginfo"}
+!14 = !{!15}
+!15 = !{!"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", !13, !16, !6, null, i32 (i32)* @_Z4funci, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!16 = !{!"0x29", !13}        ; [ DW_TAG_file_type ] [/tmp/dbginfo/b.cpp]
+!17 = !{!18}
+!18 = !{!"0x34\00y\00y\00\004\000\001", null, !16, !11, i32 (i32)** @y, null} ; [ DW_TAG_variable ] [y] [line 4] [def]
+!19 = !{i32 2, !"Dwarf Version", i32 4}
+!20 = !{i32 1, !"Debug Info Version", i32 2}
+!21 = !{!"clang version 3.5.0 "}
+!22 = !{!"0x101\00i\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [i] [line 1]
+!23 = !MDLocation(line: 1, scope: !4)
+!24 = !MDLocation(line: 2, scope: !4)
diff --git a/test/DebugInfo/cross-cu-linkonce.ll b/test/DebugInfo/cross-cu-linkonce.ll
index 8beb6fd..aaae4c1 100644
--- a/test/DebugInfo/cross-cu-linkonce.ll
+++ b/test/DebugInfo/cross-cu-linkonce.ll
@@ -32,7 +32,7 @@
 define linkonce_odr i32 @_Z4funci(i32 %i) #0 {
   %1 = alloca i32, align 4
   store i32 %i, i32* %1, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !20, metadata !{metadata !"0x102"}), !dbg !21
+  call void @llvm.dbg.declare(metadata i32* %1, metadata !20, metadata !{!"0x102"}), !dbg !21
   %2 = load i32* %1, align 4, !dbg !22
   %3 = mul nsw i32 %2, 2, !dbg !22
   ret i32 %3, !dbg !22
@@ -48,26 +48,26 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!17, !18}
 !llvm.ident = !{!19, !19}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !10, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"a.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", metadata !5, metadata !6, metadata !7, null, i32 (i32)* @_Z4funci, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
-!5 = metadata !{metadata !"func.h", metadata !"/tmp/dbginfo"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/func.h]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0x34\00x\00x\00\004\000\001", null, metadata !6, metadata !12, i32 (i32)** @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!13 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !14, metadata !2, metadata !2, metadata !3, metadata !15, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
-!14 = metadata !{metadata !"b.cpp", metadata !"/tmp/dbginfo"}
-!15 = metadata !{metadata !16}
-!16 = metadata !{metadata !"0x34\00y\00y\00\004\000\001", null, metadata !6, metadata !12, i32 (i32)** @y, null} ; [ DW_TAG_variable ] [y] [line 4] [def]
-!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!19 = metadata !{metadata !"clang version 3.5.0 "}
-!20 = metadata !{metadata !"0x101\00i\0016777217\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [i] [line 1]
-!21 = metadata !{i32 1, i32 0, metadata !4, null}
-!22 = metadata !{i32 2, i32 0, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !10, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"a.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", !5, !6, !7, null, i32 (i32)* @_Z4funci, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!5 = !{!"func.h", !"/tmp/dbginfo"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/func.h]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!11}
+!11 = !{!"0x34\00x\00x\00\004\000\001", null, !6, !12, i32 (i32)** @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
+!12 = !{!"0xf\00\000\0064\0064\000\000", null, null, !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!13 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !14, !2, !2, !3, !15, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
+!14 = !{!"b.cpp", !"/tmp/dbginfo"}
+!15 = !{!16}
+!16 = !{!"0x34\00y\00y\00\004\000\001", null, !6, !12, i32 (i32)** @y, null} ; [ DW_TAG_variable ] [y] [line 4] [def]
+!17 = !{i32 2, !"Dwarf Version", i32 4}
+!18 = !{i32 1, !"Debug Info Version", i32 2}
+!19 = !{!"clang version 3.5.0 "}
+!20 = !{!"0x101\00i\0016777217\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [i] [line 1]
+!21 = !MDLocation(line: 1, scope: !4)
+!22 = !MDLocation(line: 2, scope: !4)
diff --git a/test/DebugInfo/cu-range-hole.ll b/test/DebugInfo/cu-range-hole.ll
index 0bdabba..aa489b6 100644
--- a/test/DebugInfo/cu-range-hole.ll
+++ b/test/DebugInfo/cu-range-hole.ll
@@ -18,7 +18,7 @@ define i32 @b(i32 %c) #0 {
 entry:
   %c.addr = alloca i32, align 4
   store i32 %c, i32* %c.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %c.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata i32* %c.addr, metadata !13, metadata !{!"0x102"}), !dbg !14
   %0 = load i32* %c.addr, align 4, !dbg !14
   %add = add nsw i32 %0, 1, !dbg !14
   ret i32 %add, !dbg !14
@@ -42,7 +42,7 @@ define i32 @d(i32 %e) #0 {
 entry:
   %e.addr = alloca i32, align 4
   store i32 %e, i32* %e.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %e.addr}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
+  call void @llvm.dbg.declare(metadata i32* %e.addr, metadata !15, metadata !{!"0x102"}), !dbg !16
   %0 = load i32* %e.addr, align 4, !dbg !16
   %add = add nsw i32 %0, 1, !dbg !16
   ret i32 %add, !dbg !16
@@ -55,20 +55,20 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!1}
 !llvm.module.flags = !{!11, !12}
 
-!0 = metadata !{metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
-!1 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", metadata !2, metadata !3, metadata !3, metadata !4, metadata !3, metadata !3} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{metadata !"b.c", metadata !"/usr/local/google/home/echristo"}
-!3 = metadata !{}
-!4 = metadata !{metadata !5, metadata !10}
-!5 = metadata !{metadata !"0x2e\00b\00b\00\001\000\001\000\006\00256\000\001", metadata !2, metadata !6, metadata !7, null, i32 (i32)* @b, null, null, metadata !3} ; [ DW_TAG_subprogram ] [line 1] [def] [b]
-!6 = metadata !{metadata !"0x29", metadata !2}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/b.c]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x2e\00d\00d\00\003\000\001\000\006\00256\000\003", metadata !2, metadata !6, metadata !7, null, i32 (i32)* @d, null, null, metadata !3} ; [ DW_TAG_subprogram ] [line 3] [def] [d]
-!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!13 = metadata !{metadata !"0x101\00c\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [c] [line 1]
-!14 = metadata !{i32 1, i32 0, metadata !5, null}
-!15 = metadata !{metadata !"0x101\00e\0016777219\000", metadata !10, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [e] [line 3]
-!16 = metadata !{i32 3, i32 0, metadata !10, null}
+!0 = !{!"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
+!1 = !{!"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", !2, !3, !3, !4, !3, !3} ; [ DW_TAG_compile_unit ]
+!2 = !{!"b.c", !"/usr/local/google/home/echristo"}
+!3 = !{}
+!4 = !{!5, !10}
+!5 = !{!"0x2e\00b\00b\00\001\000\001\000\006\00256\000\001", !2, !6, !7, null, i32 (i32)* @b, null, null, !3} ; [ DW_TAG_subprogram ] [line 1] [def] [b]
+!6 = !{!"0x29", !2}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/b.c]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x2e\00d\00d\00\003\000\001\000\006\00256\000\003", !2, !6, !7, null, i32 (i32)* @d, null, null, !3} ; [ DW_TAG_subprogram ] [line 3] [def] [d]
+!11 = !{i32 2, !"Dwarf Version", i32 4}
+!12 = !{i32 1, !"Debug Info Version", i32 2}
+!13 = !{!"0x101\00c\0016777217\000", !5, !6, !9} ; [ DW_TAG_arg_variable ] [c] [line 1]
+!14 = !MDLocation(line: 1, scope: !5)
+!15 = !{!"0x101\00e\0016777219\000", !10, !6, !9} ; [ DW_TAG_arg_variable ] [e] [line 3]
+!16 = !MDLocation(line: 3, scope: !10)
diff --git a/test/DebugInfo/cu-ranges.ll b/test/DebugInfo/cu-ranges.ll
index 83d176a..6296b93 100644
--- a/test/DebugInfo/cu-ranges.ll
+++ b/test/DebugInfo/cu-ranges.ll
@@ -22,7 +22,7 @@ define i32 @foo(i32 %a) #0 section "__TEXT,__foo" {
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !13, metadata !{!"0x102"}), !dbg !14
   %0 = load i32* %a.addr, align 4, !dbg !15
   %add = add nsw i32 %0, 5, !dbg !15
   ret i32 %add, !dbg !15
@@ -36,7 +36,7 @@ define i32 @bar(i32 %a) #0 {
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !16, metadata !{metadata !"0x102"}), !dbg !17
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !16, metadata !{!"0x102"}), !dbg !17
   %0 = load i32* %a.addr, align 4, !dbg !18
   %add = add nsw i32 %0, 5, !dbg !18
   ret i32 %add, !dbg !18
@@ -49,23 +49,23 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !9}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/foo.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x2e\00bar\00bar\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @bar, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [bar]
-!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!12 = metadata !{metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
-!13 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 1]
-!14 = metadata !{i32 1, i32 0, metadata !4, null}
-!15 = metadata !{i32 2, i32 0, metadata !4, null}
-!16 = metadata !{metadata !"0x101\00a\0016777221\000", metadata !9, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 5]
-!17 = metadata !{i32 5, i32 0, metadata !9, null}
-!18 = metadata !{i32 6, i32 0, metadata !9, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.c] [DW_LANG_C99]
+!1 = !{!"foo.c", !"/usr/local/google/home/echristo"}
+!2 = !{}
+!3 = !{!4, !9}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/foo.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x2e\00bar\00bar\00\005\000\001\000\006\00256\000\005", !1, !5, !6, null, i32 (i32)* @bar, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [bar]
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 1, !"Debug Info Version", i32 2}
+!12 = !{!"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
+!13 = !{!"0x101\00a\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [a] [line 1]
+!14 = !MDLocation(line: 1, scope: !4)
+!15 = !MDLocation(line: 2, scope: !4)
+!16 = !{!"0x101\00a\0016777221\000", !9, !5, !8} ; [ DW_TAG_arg_variable ] [a] [line 5]
+!17 = !MDLocation(line: 5, scope: !9)
+!18 = !MDLocation(line: 6, scope: !9)
 
diff --git a/test/DebugInfo/dead-argument-order.ll b/test/DebugInfo/dead-argument-order.ll
index 2809ccc..d375412 100644
--- a/test/DebugInfo/dead-argument-order.ll
+++ b/test/DebugInfo/dead-argument-order.ll
@@ -38,8 +38,8 @@
 ; Function Attrs: nounwind readnone uwtable
 define i32 @_Z8function1Si(i32 %s.coerce, i32 %i) #0 {
 entry:
-  tail call void @llvm.dbg.declare(metadata !19, metadata !14, metadata !{metadata !"0x102"}), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !20
+  tail call void @llvm.dbg.declare(metadata %struct.S* undef, metadata !14, metadata !{!"0x102"}), !dbg !20
+  tail call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !15, metadata !{!"0x102"}), !dbg !20
   %add = add nsw i32 %i, %s.coerce, !dbg !20
   ret i32 %add, !dbg !20
 }
@@ -57,25 +57,25 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!16, !17}
 !llvm.ident = !{!18}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !8, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dead-argument-order.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"dead-argument-order.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00S\001\0032\0032\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1S"} ; [ DW_TAG_structure_type ] [S] [line 1, size 32, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0xd\00i\001\0032\0032\000\000", metadata !1, metadata !"_ZTS1S", metadata !7} ; [ DW_TAG_member ] [i] [line 1, size 32, align 32, offset 0] [from int]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x2e\00function\00function\00_Z8function1Si\002\000\001\000\006\00256\001\002", metadata !1, metadata !10, metadata !11, null, i32 (i32, i32)* @_Z8function1Si, null, null, metadata !13} ; [ DW_TAG_subprogram ] [line 2] [def] [function]
-!10 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/dead-argument-order.cpp]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !7, metadata !4, metadata !7}
-!13 = metadata !{metadata !14, metadata !15}
-!14 = metadata !{metadata !"0x101\00s\0016777218\000", metadata !9, metadata !10, metadata !"_ZTS1S"} ; [ DW_TAG_arg_variable ] [s] [line 2]
-!15 = metadata !{metadata !"0x101\00i\0033554434\000", metadata !9, metadata !10, metadata !7} ; [ DW_TAG_arg_variable ] [i] [line 2]
-!16 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!17 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!18 = metadata !{metadata !"clang version 3.5.0 "}
-!19 = metadata !{%struct.S* undef}
-!20 = metadata !{i32 2, i32 0, metadata !9, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \001\00\000\00\001", !1, !2, !3, !8, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dead-argument-order.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"dead-argument-order.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00S\001\0032\0032\000\000\000", !1, null, null, !5, null, null, !"_ZTS1S"} ; [ DW_TAG_structure_type ] [S] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0xd\00i\001\0032\0032\000\000", !1, !"_ZTS1S", !7} ; [ DW_TAG_member ] [i] [line 1, size 32, align 32, offset 0] [from int]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = !{!9}
+!9 = !{!"0x2e\00function\00function\00_Z8function1Si\002\000\001\000\006\00256\001\002", !1, !10, !11, null, i32 (i32, i32)* @_Z8function1Si, null, null, !13} ; [ DW_TAG_subprogram ] [line 2] [def] [function]
+!10 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/dead-argument-order.cpp]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!7, !4, !7}
+!13 = !{!14, !15}
+!14 = !{!"0x101\00s\0016777218\000", !9, !10, !"_ZTS1S"} ; [ DW_TAG_arg_variable ] [s] [line 2]
+!15 = !{!"0x101\00i\0033554434\000", !9, !10, !7} ; [ DW_TAG_arg_variable ] [i] [line 2]
+!16 = !{i32 2, !"Dwarf Version", i32 4}
+!17 = !{i32 2, !"Debug Info Version", i32 2}
+!18 = !{!"clang version 3.5.0 "}
+!19 = !{%struct.S* undef}
+!20 = !MDLocation(line: 2, scope: !9)
 
diff --git a/test/DebugInfo/debug-info-always-inline.ll b/test/DebugInfo/debug-info-always-inline.ll
index 57ae079..88ac4cb 100644
--- a/test/DebugInfo/debug-info-always-inline.ll
+++ b/test/DebugInfo/debug-info-always-inline.ll
@@ -77,10 +77,10 @@ define i32 @_Z3foov() #0 {
 entry:
   %arr = alloca [10 x i32], align 16
   %sum = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{[10 x i32]* %arr}, metadata !14), !dbg !18
+  call void @llvm.dbg.declare(metadata [10 x i32]* %arr, metadata !14), !dbg !18
   %arrayidx = getelementptr inbounds [10 x i32]* %arr, i32 0, i64 0, !dbg !19
   store i32 5, i32* %arrayidx, align 4, !dbg !19
-  call void @llvm.dbg.declare(metadata !{i32* %sum}, metadata !20), !dbg !21
+  call void @llvm.dbg.declare(metadata i32* %sum, metadata !20), !dbg !21
   store i32 4, i32* %sum, align 4, !dbg !21
   %0 = load i32* %sum, align 4, !dbg !22
   ret i32 %0, !dbg !22
@@ -96,7 +96,7 @@ entry:
   %i = alloca i32, align 4
   store i32 0, i32* %retval
   call void @_Z3barv(), !dbg !23
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !24), !dbg !25
+  call void @llvm.dbg.declare(metadata i32* %i, metadata !24), !dbg !25
   %call = call i32 @_Z3foov(), !dbg !25
   store i32 %call, i32* %i, align 4, !dbg !25
   %0 = load i32* %i, align 4, !dbg !26
@@ -114,30 +114,30 @@ attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "
 !llvm.module.flags = !{!11, !12}
 !llvm.ident = !{!13}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.6.0 (217844)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/home/user/test/<stdin>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<stdin>", metadata !"/home/user/test"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !10}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [foo]
-!5 = metadata !{metadata !"test.cpp", metadata !"/home/user/test"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/home/user/test/test.cpp]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 11, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 12} ; [ DW_TAG_subprogram ] [line 11] [def] [scope 12] [main]
-!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!12 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!13 = metadata !{metadata !"clang version 3.6.0 (217844)"}
-!14 = metadata !{i32 786688, metadata !4, metadata !"arr", metadata !6, i32 3, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [arr] [line 3]
-!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 320, i64 32, i32 0, i32 0, metadata !9, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
-!16 = metadata !{metadata !17}
-!17 = metadata !{i32 786465, i64 0, i64 10}       ; [ DW_TAG_subrange_type ] [0, 9]
-!18 = metadata !{i32 3, i32 0, metadata !4, null}
-!19 = metadata !{i32 4, i32 0, metadata !4, null}
-!20 = metadata !{i32 786688, metadata !4, metadata !"sum", metadata !6, i32 5, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [sum] [line 5]
-!21 = metadata !{i32 5, i32 0, metadata !4, null}
-!22 = metadata !{i32 6, i32 0, metadata !4, null}
-!23 = metadata !{i32 13, i32 0, metadata !10, null}
-!24 = metadata !{i32 786688, metadata !10, metadata !"i", metadata !6, i32 14, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 14]
-!25 = metadata !{i32 14, i32 0, metadata !10, null}
-!26 = metadata !{i32 15, i32 0, metadata !10, null}
+!0 = !{i32 786449, !1, i32 4, !"clang version 3.6.0 (217844)", i1 false, !"", i32 0, !2, !2, !3, !2, !2, !"", i32 1} ; [ DW_TAG_compile_unit ] [/home/user/test/<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !"/home/user/test"}
+!2 = !{}
+!3 = !{!4, !10}
+!4 = !{i32 786478, !5, !6, !"foo", !"foo", !"_Z3foov", i32 1, !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, !2, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [foo]
+!5 = !{!"test.cpp", !"/home/user/test"}
+!6 = !{i32 786473, !5}          ; [ DW_TAG_file_type ] [/home/user/test/test.cpp]
+!7 = !{i32 786453, i32 0, null, !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{i32 786468, null, null, !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{i32 786478, !5, !6, !"main", !"main", !"", i32 11, !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, !2, i32 12} ; [ DW_TAG_subprogram ] [line 11] [def] [scope 12] [main]
+!11 = !{i32 2, !"Dwarf Version", i32 4}
+!12 = !{i32 2, !"Debug Info Version", i32 1}
+!13 = !{!"clang version 3.6.0 (217844)"}
+!14 = !{i32 786688, !4, !"arr", !6, i32 3, !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [arr] [line 3]
+!15 = !{i32 786433, null, null, !"", i32 0, i64 320, i64 32, i32 0, i32 0, !9, !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
+!16 = !{!17}
+!17 = !{i32 786465, i64 0, i64 10}       ; [ DW_TAG_subrange_type ] [0, 9]
+!18 = !MDLocation(line: 3, scope: !4)
+!19 = !MDLocation(line: 4, scope: !4)
+!20 = !{i32 786688, !4, !"sum", !6, i32 5, !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [sum] [line 5]
+!21 = !MDLocation(line: 5, scope: !4)
+!22 = !MDLocation(line: 6, scope: !4)
+!23 = !MDLocation(line: 13, scope: !10)
+!24 = !{i32 786688, !10, !"i", !6, i32 14, !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 14]
+!25 = !MDLocation(line: 14, scope: !10)
+!26 = !MDLocation(line: 15, scope: !10)
diff --git a/test/DebugInfo/debug-info-qualifiers.ll b/test/DebugInfo/debug-info-qualifiers.ll
index 5b21225..7e53d89 100644
--- a/test/DebugInfo/debug-info-qualifiers.ll
+++ b/test/DebugInfo/debug-info-qualifiers.ll
@@ -39,10 +39,10 @@ define void @_Z1gv() #0 {
   %a = alloca %class.A, align 1
   %pl = alloca { i64, i64 }, align 8
   %pr = alloca { i64, i64 }, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
-  call void @llvm.dbg.declare(metadata !{{ i64, i64 }* %pl}, metadata !26, metadata !{metadata !"0x102"}), !dbg !31
+  call void @llvm.dbg.declare(metadata %class.A* %a, metadata !24, metadata !{!"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata { i64, i64 }* %pl, metadata !26, metadata !{!"0x102"}), !dbg !31
   store { i64, i64 } { i64 ptrtoint (void (%class.A*)* @_ZNKR1A1lEv to i64), i64 0 }, { i64, i64 }* %pl, align 8, !dbg !31
-  call void @llvm.dbg.declare(metadata !{{ i64, i64 }* %pr}, metadata !32, metadata !{metadata !"0x102"}), !dbg !35
+  call void @llvm.dbg.declare(metadata { i64, i64 }* %pr, metadata !32, metadata !{!"0x102"}), !dbg !35
   store { i64, i64 } { i64 ptrtoint (void (%class.A*)* @_ZNKO1A1rEv to i64), i64 0 }, { i64, i64 }* %pr, align 8, !dbg !35
   ret void, !dbg !36
 }
@@ -61,40 +61,40 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!21, !22}
 !llvm.ident = !{!23}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !16, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2\00A\002\008\008\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !"debug-info-qualifiers.cpp", metadata !""}
-!6 = metadata !{metadata !7, metadata !13}
-!7 = metadata !{metadata !"0x2e\00l\00l\00_ZNKR1A1lEv\005\000\000\000\006\0016640\000\005", metadata !5, metadata !"_ZTS1A", metadata !8, null, null, null, i32 0, metadata !12} ; [ DW_TAG_subprogram ] [line 5] [reference] [l]
-!8 = metadata !{metadata !"0x15\00\000\000\000\000\0016384\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [reference] [from ]
-!9 = metadata !{null, metadata !10}
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
-!11 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS1A]
-!12 = metadata !{i32 786468}
-!13 = metadata !{metadata !"0x2e\00r\00r\00_ZNKO1A1rEv\007\000\000\000\006\0033024\000\007", metadata !5, metadata !"_ZTS1A", metadata !14, null, null, null, i32 0, metadata !15} ; [ DW_TAG_subprogram ] [line 7] [rvalue reference] [r]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\0032768\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [rvalue reference] [from ]
-!15 = metadata !{i32 786468}
-!16 = metadata !{metadata !17}
-!17 = metadata !{metadata !"0x2e\00g\00g\00_Z1gv\0010\000\001\000\006\00256\000\0010", metadata !5, metadata !18, metadata !19, null, void ()* @_Z1gv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [g]
-!18 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ]
-!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!20 = metadata !{null}
-!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!23 = metadata !{metadata !"clang version 3.5 "}
-!24 = metadata !{metadata !"0x100\00a\0011\000", metadata !17, metadata !18, metadata !4} ; [ DW_TAG_auto_variable ] [a] [line 11]
-!25 = metadata !{i32 11, i32 0, metadata !17, null}
-!26 = metadata !{metadata !"0x100\00pl\0016\000", metadata !17, metadata !18, metadata !27} ; [ DW_TAG_auto_variable ] [pl] [line 16]
-!27 = metadata !{metadata !"0x1f\00\000\000\000\000\000", null, null, metadata !28, metadata !"_ZTS1A"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
-!28 = metadata !{metadata !"0x15\00\000\000\000\000\0016384\000", i32 0, null, null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [reference] [from ]
-!29 = metadata !{null, metadata !30}
-!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!31 = metadata !{i32 16, i32 0, metadata !17, null}
-!32 = metadata !{metadata !"0x100\00pr\0021\000", metadata !17, metadata !18, metadata !33} ; [ DW_TAG_auto_variable ] [pr] [line 21]
-!33 = metadata !{metadata !"0x1f\00\000\000\000\000\000", null, null, metadata !34, metadata !"_ZTS1A"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
-!34 = metadata !{metadata !"0x15\00\000\000\000\000\0032768\000", i32 0, null, null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [rvalue reference] [from ]
-!35 = metadata !{i32 21, i32 0, metadata !17, null}
-!36 = metadata !{i32 22, i32 0, metadata !17, null}
+!0 = !{!"0x11\004\00clang version 3.5 \000\00\000\00\000", !1, !2, !3, !16, !2, !2} ; [ DW_TAG_compile_unit ] [] [DW_LANG_C_plus_plus]
+!1 = !{!"", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2\00A\002\008\008\000\000\000", !5, null, null, !6, null, null, !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!"debug-info-qualifiers.cpp", !""}
+!6 = !{!7, !13}
+!7 = !{!"0x2e\00l\00l\00_ZNKR1A1lEv\005\000\000\000\006\008448\000\005", !5, !"_ZTS1A", !8, null, null, null, i32 0, !12} ; [ DW_TAG_subprogram ] [line 5] [reference] [l]
+!8 = !{!"0x15\00\000\000\000\000\008192\000", i32 0, null, null, !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [reference] [from ]
+!9 = !{null, !10}
+!10 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
+!11 = !{!"0x26\00\000\000\000\000\000", null, null, !"_ZTS1A"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS1A]
+!12 = !{i32 786468}
+!13 = !{!"0x2e\00r\00r\00_ZNKO1A1rEv\007\000\000\000\006\0017408\000\007", !5, !"_ZTS1A", !14, null, null, null, i32 0, !15} ; [ DW_TAG_subprogram ] [line 7] [rvalue reference] [r]
+!14 = !{!"0x15\00\000\000\000\000\0016384\000", i32 0, null, null, !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [rvalue reference] [from ]
+!15 = !{i32 786468}
+!16 = !{!17}
+!17 = !{!"0x2e\00g\00g\00_Z1gv\0010\000\001\000\006\00256\000\0010", !5, !18, !19, null, void ()* @_Z1gv, null, null, !2} ; [ DW_TAG_subprogram ] [line 10] [def] [g]
+!18 = !{!"0x29", !5}         ; [ DW_TAG_file_type ]
+!19 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = !{null}
+!21 = !{i32 2, !"Dwarf Version", i32 4}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
+!23 = !{!"clang version 3.5 "}
+!24 = !{!"0x100\00a\0011\000", !17, !18, !4} ; [ DW_TAG_auto_variable ] [a] [line 11]
+!25 = !MDLocation(line: 11, scope: !17)
+!26 = !{!"0x100\00pl\0016\000", !17, !18, !27} ; [ DW_TAG_auto_variable ] [pl] [line 16]
+!27 = !{!"0x1f\00\000\000\000\000\000", null, null, !28, !"_ZTS1A"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = !{!"0x15\00\000\000\000\000\008192\000", i32 0, null, null, !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [reference] [from ]
+!29 = !{null, !30}
+!30 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!31 = !MDLocation(line: 16, scope: !17)
+!32 = !{!"0x100\00pr\0021\000", !17, !18, !33} ; [ DW_TAG_auto_variable ] [pr] [line 21]
+!33 = !{!"0x1f\00\000\000\000\000\000", null, null, !34, !"_ZTS1A"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
+!34 = !{!"0x15\00\000\000\000\000\0016384\000", i32 0, null, null, !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [rvalue reference] [from ]
+!35 = !MDLocation(line: 21, scope: !17)
+!36 = !MDLocation(line: 22, scope: !17)
diff --git a/test/DebugInfo/debuginfofinder-multiple-cu.ll b/test/DebugInfo/debuginfofinder-multiple-cu.ll
index 7892306..0eba64d 100644
--- a/test/DebugInfo/debuginfofinder-multiple-cu.ll
+++ b/test/DebugInfo/debuginfofinder-multiple-cu.ll
@@ -22,20 +22,20 @@ define void @g() {
 !llvm.dbg.cu = !{!0, !8}
 !llvm.module.flags = !{!13, !16}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (192092)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/test1.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test1.c", metadata !"/tmp"}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\000\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/test1.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{metadata !"0x11\0012\00clang version 3.4 (192092)\000\00\000\00\000", metadata !9, metadata !2, metadata !2, metadata !10, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/test2.c] [DW_LANG_C99]
-!9 = metadata !{metadata !"test2.c", metadata !"/tmp"}
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0x2e\00g\00g\00\001\000\001\000\006\000\000\001", metadata !9, metadata !12, metadata !6, null, void ()* @g, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [g]
-!12 = metadata !{metadata !"0x29", metadata !9}         ; [ DW_TAG_file_type ] [/tmp/test2.c]
-!13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!14 = metadata !{i32 1, i32 0, metadata !4, null}
-!15 = metadata !{i32 1, i32 0, metadata !11, null}
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 (192092)\000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/test1.c] [DW_LANG_C99]
+!1 = !{!"test1.c", !"/tmp"}
+!2 = !{i32 0}
+!3 = !{!4}
+!4 = !{!"0x2e\00f\00f\00\001\000\001\000\006\000\000\001", !1, !5, !6, null, void ()* @f, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/test1.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!"0x11\0012\00clang version 3.4 (192092)\000\00\000\00\000", !9, !2, !2, !10, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/test2.c] [DW_LANG_C99]
+!9 = !{!"test2.c", !"/tmp"}
+!10 = !{!11}
+!11 = !{!"0x2e\00g\00g\00\001\000\001\000\006\000\000\001", !9, !12, !6, null, void ()* @g, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [g]
+!12 = !{!"0x29", !9}         ; [ DW_TAG_file_type ] [/tmp/test2.c]
+!13 = !{i32 2, !"Dwarf Version", i32 4}
+!14 = !MDLocation(line: 1, scope: !4)
+!15 = !MDLocation(line: 1, scope: !11)
+!16 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/duplicate_inline.ll b/test/DebugInfo/duplicate_inline.ll
deleted file mode 100644
index 008b52f..0000000
--- a/test/DebugInfo/duplicate_inline.ll
+++ /dev/null
@@ -1,117 +0,0 @@
-; REQUIRES: object-emission
-
-; RUN: %llc_dwarf < %s -filetype=obj | llvm-dwarfdump -debug-dump=info - | FileCheck %s
-
-; Built with clang from the following source:
-; void f1(int);
-; __attribute__((always_inline)) inline void f2(int i) { f1(i); }
-;
-; #define MULTICALL \
-;   f2(x);          \
-;   f2(y);
-;
-; void f3(int x, int y) { MULTICALL; }
-
-; FIXME: This produces only one inlined_subroutine, with two formal_parameters
-; (both named "this"), one for each of the actual inlined subroutines.  ;
-; Inlined scopes are differentiated by the combination of 'inlined at' (call)
-; location and the location within the function. If two calls to the same
-; function occur at the same location the scopes end up conflated and there
-; appears to be only one inlined function.
-; To fix this, we'd need to add some kind of unique metadata per call site, possibly something like:
-;
-; !42 = metadata !{i32 1, i32 0, metadata !43, metadata !44}
-; !44 = metadata !{i32 2, i32 0, metadata !45, null}
-;
-; ->
-;
-; !42 = metadata !{i32 1, i32 0, metadata !43, metadata !44}
-; !44 = metadata !{metadata !45, metadata !44}
-; !45 = metadata !{i32 2, i32 0, metadata !45, null}
-;
-; since cycles in metadata are not uniqued, the !44 node would not be shared
-; between calls to the same function from the same location, ensuring separate
-; inlined subroutines would be generated.
-;
-; Once this is done, the (insufficient) hack in clang that adds column
-; information to call sites to differentiate inlined callers can be removed as it
-; will no longer be necessary.
-;
-; While it might be nice to omit the duplicate parameter in this case (while
-; we wait/work on the real fix), it's actually better to leave it in because it
-; allows us to hold the invariant that every DbgVariable has a DIE, every time.
-; This has proved valuable in finding other bugs, so I want to avoid removing the
-; invariant/assertion. Besides, we don't know which one's the right one anyway...
-
-; CHECK: DW_TAG_subprogram
-; CHECK:   DW_TAG_inlined_subroutine
-; CHECK:     DW_TAG_formal_parameter
-; CHECK-NOT: DW_TAG
-; CHECK:     DW_TAG_formal_parameter
-; CHECK-NOT: DW_TAG
-; CHECK:     NULL
-; CHECK-NOT: DW_TAG
-; CHECK:   NULL
-
-; Function Attrs: uwtable
-define void @_Z2f3ii(i32 %x, i32 %y) #0 {
-entry:
-  %i.addr.i1 = alloca i32, align 4
-  %i.addr.i = alloca i32, align 4
-  %x.addr = alloca i32, align 4
-  %y.addr = alloca i32, align 4
-  store i32 %x, i32* %x.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !15, metadata !16), !dbg !17
-  store i32 %y, i32* %y.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %y.addr}, metadata !18, metadata !16), !dbg !19
-  %0 = load i32* %x.addr, align 4, !dbg !20
-  store i32 %0, i32* %i.addr.i, align 4, !dbg !20
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr.i}, metadata !21, metadata !16), !dbg !22
-  %1 = load i32* %i.addr.i, align 4, !dbg !23
-  call void @_Z2f1i(i32 %1), !dbg !23
-  %2 = load i32* %y.addr, align 4, !dbg !20
-  store i32 %2, i32* %i.addr.i1, align 4, !dbg !20
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr.i1}, metadata !21, metadata !16), !dbg !22
-  %3 = load i32* %i.addr.i1, align 4, !dbg !23
-  call void @_Z2f1i(i32 %3), !dbg !23
-  ret void, !dbg !24
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-declare void @_Z2f1i(i32) #2
-
-attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!12, !13}
-!llvm.ident = !{!14}
-
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/duplicate_inline.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"duplicate_inline.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !9}
-!4 = metadata !{metadata !"0x2e\00f3\00f3\00_Z2f3ii\008\000\001\000\000\00256\000\008", metadata !1, metadata !5, metadata !6, null, void (i32, i32)* @_Z2f3ii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [def] [f3]
-!5 = metadata !{metadata !"0x29", metadata !1}    ; [ DW_TAG_file_type ] [/tmp/dbginfo/duplicate_inline.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x2e\00f2\00f2\00_Z2f2i\002\000\001\000\000\00256\000\002", metadata !1, metadata !5, metadata !10, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f2]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{null, metadata !8}
-!12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!13 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!14 = metadata !{metadata !"clang version 3.6.0 "}
-!15 = metadata !{metadata !"0x101\00x\0016777224\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [x] [line 8]
-!16 = metadata !{metadata !"0x102"}               ; [ DW_TAG_expression ]
-!17 = metadata !{i32 8, i32 13, metadata !4, null}
-!18 = metadata !{metadata !"0x101\00y\0033554440\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [y] [line 8]
-!19 = metadata !{i32 8, i32 20, metadata !4, null}
-!20 = metadata !{i32 8, i32 25, metadata !4, null}
-!21 = metadata !{metadata !"0x101\00i\0016777218\000", metadata !9, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [i] [line 2]
-!22 = metadata !{i32 2, i32 51, metadata !9, metadata !20}
-!23 = metadata !{i32 2, i32 56, metadata !9, metadata !20}
-!24 = metadata !{i32 8, i32 36, metadata !4, null}
diff --git a/test/DebugInfo/dwarf-public-names.ll b/test/DebugInfo/dwarf-public-names.ll
index f6d8cd3..d2b8664 100644
--- a/test/DebugInfo/dwarf-public-names.ll
+++ b/test/DebugInfo/dwarf-public-names.ll
@@ -59,7 +59,7 @@ define void @_ZN1C15member_functionEv(%struct.C* %this) nounwind uwtable align 2
 entry:
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !28, metadata !{metadata !"0x102"}), !dbg !30
+  call void @llvm.dbg.declare(metadata %struct.C** %this.addr, metadata !28, metadata !{!"0x102"}), !dbg !30
   %this1 = load %struct.C** %this.addr
   store i32 0, i32* @_ZN1C22static_member_variableE, align 4, !dbg !31
   ret void, !dbg !32
@@ -90,42 +90,42 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!38}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (http://llvm.org/git/clang.git a09cd8103a6a719cb2628cdf0c91682250a17bd2) (http://llvm.org/git/llvm.git 47d03cec0afca0c01ae42b82916d1d731716cd20)\000\00\000\00\000", metadata !37, metadata !1, metadata !1, metadata !2, metadata !24,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{}
-!2 = metadata !{metadata !3, metadata !18, metadata !19, metadata !20}
-!3 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\009\000\001\000\006\00256\000\009", metadata !4, null, metadata !5, null, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !12, metadata !1} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
-!4 = metadata !{metadata !"0x29", metadata !37} ; [ DW_TAG_file_type ]
-!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!6 = metadata !{null, metadata !7}
-!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C]
-!8 = metadata !{metadata !"0x13\00C\001\008\008\000\000\000", metadata !37, null, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
-!9 = metadata !{metadata !10, metadata !12, metadata !14}
-!10 = metadata !{metadata !"0xd\00static_member_variable\004\000\000\000\004096", metadata !37, metadata !8, metadata !11, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\002\000\000\000\006\00256\000\002", metadata !4, metadata !8, metadata !5, null, null, null, i32 0, metadata !13} ; [ DW_TAG_subprogram ] [line 2] [member_function]
-!13 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!14 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\003\000\000\000\006\00256\000\003", metadata !4, metadata !8, metadata !15, null, null, null, i32 0, metadata !17} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{metadata !11}
-!17 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!18 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\0013\000\001\000\006\00256\000\0013", metadata !4, null, metadata !15, null, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !14, metadata !1} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
-!19 = metadata !{metadata !"0x2e\00global_function\00global_function\00_Z15global_functionv\0019\000\001\000\006\00256\000\0019", metadata !4, metadata !4, metadata !15, null, i32 ()* @_Z15global_functionv, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
-!20 = metadata !{metadata !"0x2e\00global_namespace_function\00global_namespace_function\00_ZN2ns25global_namespace_functionEv\0024\000\001\000\006\00256\000\0024", metadata !4, metadata !21, metadata !22, null, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
-!21 = metadata !{metadata !"0x39\00ns\0023", metadata !4, null} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
-!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!23 = metadata !{null}
-!24 = metadata !{metadata !25, metadata !26, metadata !27}
-!25 = metadata !{metadata !"0x34\00static_member_variable\00static_member_variable\00_ZN1C22static_member_variableE\007\000\001", metadata !8, metadata !4, metadata !11, i32* @_ZN1C22static_member_variableE, metadata !10} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
-!26 = metadata !{metadata !"0x34\00global_variable\00global_variable\00\0017\000\001", null, metadata !4, metadata !8, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
-!27 = metadata !{metadata !"0x34\00global_namespace_variable\00global_namespace_variable\00_ZN2ns25global_namespace_variableE\0027\000\001", metadata !21, metadata !4, metadata !11, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
-!28 = metadata !{metadata !"0x101\00this\0016777225\001088", metadata !3, metadata !4, metadata !29} ; [ DW_TAG_arg_variable ] [this] [line 9]
-!29 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from C]
-!30 = metadata !{i32 9, i32 0, metadata !3, null}
-!31 = metadata !{i32 10, i32 0, metadata !3, null}
-!32 = metadata !{i32 11, i32 0, metadata !3, null}
-!33 = metadata !{i32 14, i32 0, metadata !18, null}
-!34 = metadata !{i32 20, i32 0, metadata !19, null}
-!35 = metadata !{i32 25, i32 0, metadata !20, null}
-!36 = metadata !{i32 26, i32 0, metadata !20, null}
-!37 = metadata !{metadata !"dwarf-public-names.cpp", metadata !"/usr2/kparzysz/s.hex/t"}
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.3 (http://llvm.org/git/clang.git a09cd8103a6a719cb2628cdf0c91682250a17bd2) (http://llvm.org/git/llvm.git 47d03cec0afca0c01ae42b82916d1d731716cd20)\000\00\000\00\000", !37, !1, !1, !2, !24,  !1} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] [DW_LANG_C_plus_plus]
+!1 = !{}
+!2 = !{!3, !18, !19, !20}
+!3 = !{!"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\009\000\001\000\006\00256\000\009", !4, null, !5, null, void (%struct.C*)* @_ZN1C15member_functionEv, null, !12, !1} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
+!4 = !{!"0x29", !37} ; [ DW_TAG_file_type ]
+!5 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = !{null, !7}
+!7 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C]
+!8 = !{!"0x13\00C\001\008\008\000\000\000", !37, null, null, !9, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
+!9 = !{!10, !12, !14}
+!10 = !{!"0xd\00static_member_variable\004\000\000\000\004096", !37, !8, !11, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{!"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\002\000\000\000\006\00256\000\002", !4, !8, !5, null, null, null, i32 0, !13} ; [ DW_TAG_subprogram ] [line 2] [member_function]
+!13 = !{!"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!14 = !{!"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\003\000\000\000\006\00256\000\003", !4, !8, !15, null, null, null, i32 0, !17} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{!11}
+!17 = !{!"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!18 = !{!"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\0013\000\001\000\006\00256\000\0013", !4, null, !15, null, i32 ()* @_ZN1C22static_member_functionEv, null, !14, !1} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
+!19 = !{!"0x2e\00global_function\00global_function\00_Z15global_functionv\0019\000\001\000\006\00256\000\0019", !4, !4, !15, null, i32 ()* @_Z15global_functionv, null, null, !1} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
+!20 = !{!"0x2e\00global_namespace_function\00global_namespace_function\00_ZN2ns25global_namespace_functionEv\0024\000\001\000\006\00256\000\0024", !4, !21, !22, null, void ()* @_ZN2ns25global_namespace_functionEv, null, null, !1} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
+!21 = !{!"0x39\00ns\0023", !4, null} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
+!22 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = !{null}
+!24 = !{!25, !26, !27}
+!25 = !{!"0x34\00static_member_variable\00static_member_variable\00_ZN1C22static_member_variableE\007\000\001", !8, !4, !11, i32* @_ZN1C22static_member_variableE, !10} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
+!26 = !{!"0x34\00global_variable\00global_variable\00\0017\000\001", null, !4, !8, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
+!27 = !{!"0x34\00global_namespace_variable\00global_namespace_variable\00_ZN2ns25global_namespace_variableE\0027\000\001", !21, !4, !11, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
+!28 = !{!"0x101\00this\0016777225\001088", !3, !4, !29} ; [ DW_TAG_arg_variable ] [this] [line 9]
+!29 = !{!"0xf\00\000\0064\0064\000\000", null, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from C]
+!30 = !MDLocation(line: 9, scope: !3)
+!31 = !MDLocation(line: 10, scope: !3)
+!32 = !MDLocation(line: 11, scope: !3)
+!33 = !MDLocation(line: 14, scope: !18)
+!34 = !MDLocation(line: 20, scope: !19)
+!35 = !MDLocation(line: 25, scope: !20)
+!36 = !MDLocation(line: 26, scope: !20)
+!37 = !{!"dwarf-public-names.cpp", !"/usr2/kparzysz/s.hex/t"}
+!38 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/dwarfdump-debug-frame-simple.test b/test/DebugInfo/dwarfdump-debug-frame-simple.test
index c2427d8..b1a58e6 100644
--- a/test/DebugInfo/dwarfdump-debug-frame-simple.test
+++ b/test/DebugInfo/dwarfdump-debug-frame-simple.test
@@ -6,22 +6,22 @@
 
 ; FRAMES: 00000000 00000010 ffffffff CIE
 ; FRAMES: Version: 1
-; FRAMES:      DW_CFA_def_cfa
-; FRAMES-NEXT: DW_CFA_offset
-; FRAMES-NEXT: DW_CFA_nop
-; FRAMES-NEXT: DW_CFA_nop
+; FRAMES:      DW_CFA_def_cfa: reg4 +4
+; FRAMES-NEXT: DW_CFA_offset: reg8 -4
+; FRAMES-NEXT: DW_CFA_nop:
+; FRAMES-NEXT: DW_CFA_nop:
 
 ; FRAMES: 00000014 00000010 00000000 FDE cie=00000000 pc=00000000...00000022
-; FRAMES:      DW_CFA_advance_loc
-; FRAMES-NEXT: DW_CFA_def_cfa_offset
-; FRAMES-NEXT: DW_CFA_nop
+; FRAMES:      DW_CFA_advance_loc: 3
+; FRAMES-NEXT: DW_CFA_def_cfa_offset: +12
+; FRAMES-NEXT: DW_CFA_nop:
 
 ; FRAMES: 00000028 00000014 00000000 FDE cie=00000000 pc=00000030...00000080
-; FRAMES:      DW_CFA_advance_loc
-; FRAMES-NEXT: DW_CFA_def_cfa_offset
-; FRAMES-NEXT: DW_CFA_offset
-; FRAMES-NEXT: DW_CFA_advance_loc
-; FRAMES-NEXT: DW_CFA_def_cfa_register
+; FRAMES:      DW_CFA_advance_loc: 1
+; FRAMES-NEXT: DW_CFA_def_cfa_offset: +8
+; FRAMES-NEXT: DW_CFA_offset: reg5 -8
+; FRAMES-NEXT: DW_CFA_advance_loc: 2
+; FRAMES-NEXT: DW_CFA_def_cfa_register: reg5
 
 ; FRAMES-NOT: CIE
 ; FRAMES-NOT: FDE
diff --git a/test/DebugInfo/empty.ll b/test/DebugInfo/empty.ll
index 52211af..3f7f546 100644
--- a/test/DebugInfo/empty.ll
+++ b/test/DebugInfo/empty.ll
@@ -24,8 +24,8 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!5}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 143523)\001\00\000\00\000", metadata !4, metadata !2, metadata !2, metadata !2, metadata !2, null} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{}
-!3 = metadata !{metadata !"0x29", metadata !4} ; [ DW_TAG_file_type ]
-!4 = metadata !{metadata !"empty.c", metadata !"/home/nlewycky"}
-!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.1 (trunk 143523)\001\00\000\00\000", !4, !2, !2, !2, !2, null} ; [ DW_TAG_compile_unit ]
+!2 = !{}
+!3 = !{!"0x29", !4} ; [ DW_TAG_file_type ]
+!4 = !{!"empty.c", !"/home/nlewycky"}
+!5 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/enum-types.ll b/test/DebugInfo/enum-types.ll
index 787e5f5..3932535 100644
--- a/test/DebugInfo/enum-types.ll
+++ b/test/DebugInfo/enum-types.ll
@@ -25,7 +25,7 @@ define void @_Z4topA2EA(i32 %sa) #0 {
 entry:
   %sa.addr = alloca i32, align 4
   store i32 %sa, i32* %sa.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %sa.addr}, metadata !22, metadata !{metadata !"0x102"}), !dbg !23
+  call void @llvm.dbg.declare(metadata i32* %sa.addr, metadata !22, metadata !{!"0x102"}), !dbg !23
   ret void, !dbg !24
 }
 
@@ -37,7 +37,7 @@ define void @_Z4topB2EA(i32 %sa) #0 {
 entry:
   %sa.addr = alloca i32, align 4
   store i32 %sa, i32* %sa.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %sa.addr}, metadata !25, metadata !{metadata !"0x102"}), !dbg !26
+  call void @llvm.dbg.declare(metadata i32* %sa.addr, metadata !25, metadata !{!"0x102"}), !dbg !26
   ret void, !dbg !27
 }
 
@@ -48,31 +48,31 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!19, !20}
 !llvm.ident = !{!21, !21}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 214102:214133) (llvm/trunk 214102:214132)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !6, metadata !11, metadata !11} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"a.cpp", metadata !""}
-!2 = metadata !{metadata !3}
-!3 = metadata !{metadata !"0x4\00EA\001\0032\0032\000\000\000", metadata !1, null, null, metadata !4, null, null, metadata !"_ZTS2EA"} ; [ DW_TAG_enumeration_type ] [EA] [line 1, size 32, align 32, offset 0] [def] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x28\00EA_0\000"} ; [ DW_TAG_enumerator ] [EA_0 :: 0]
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0x2e\00topA\00topA\00_Z4topA2EA\005\000\001\000\006\00256\000\005", metadata !1, metadata !8, metadata !9, null, void (i32)* @_Z4topA2EA, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 5] [def] [topA]
-!8 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [a.cpp]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{null, metadata !"_ZTS2EA"}
-!11 = metadata !{}
-!12 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 214102:214133) (llvm/trunk 214102:214132)\000\00\000\00\001", metadata !13, metadata !14, metadata !14, metadata !16, metadata !11, metadata !11} ; [ DW_TAG_compile_unit ] [b.cpp] [DW_LANG_C_plus_plus]
-!13 = metadata !{metadata !"b.cpp", metadata !""}
-!14 = metadata !{metadata !15}
-!15 = metadata !{metadata !"0x4\00EA\001\0032\0032\000\000\000", metadata !13, null, null, metadata !4, null, null, metadata !"_ZTS2EA"} ; [ DW_TAG_enumeration_type ] [EA] [line 1, size 32, align 32, offset 0] [def] [from ]
-!16 = metadata !{metadata !17}
-!17 = metadata !{metadata !"0x2e\00topB\00topB\00_Z4topB2EA\005\000\001\000\006\00256\000\005", metadata !13, metadata !18, metadata !9, null, void (i32)* @_Z4topB2EA, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 5] [def] [topB]
-!18 = metadata !{metadata !"0x29", metadata !13}        ; [ DW_TAG_file_type ] [b.cpp]
-!19 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!20 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!21 = metadata !{metadata !"clang version 3.5.0 (trunk 214102:214133) (llvm/trunk 214102:214132)"}
-!22 = metadata !{metadata !"0x101\00sa\0016777221\000", metadata !7, metadata !8, metadata !"_ZTS2EA"} ; [ DW_TAG_arg_variable ] [sa] [line 5]
-!23 = metadata !{i32 5, i32 14, metadata !7, null}
-!24 = metadata !{i32 6, i32 1, metadata !7, null}
-!25 = metadata !{metadata !"0x101\00sa\0016777221\000", metadata !17, metadata !18, metadata !"_ZTS2EA"} ; [ DW_TAG_arg_variable ] [sa] [line 5]
-!26 = metadata !{i32 5, i32 14, metadata !17, null}
-!27 = metadata !{i32 6, i32 1, metadata !17, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 (trunk 214102:214133) (llvm/trunk 214102:214132)\000\00\000\00\001", !1, !2, !2, !6, !11, !11} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
+!1 = !{!"a.cpp", !""}
+!2 = !{!3}
+!3 = !{!"0x4\00EA\001\0032\0032\000\000\000", !1, null, null, !4, null, null, !"_ZTS2EA"} ; [ DW_TAG_enumeration_type ] [EA] [line 1, size 32, align 32, offset 0] [def] [from ]
+!4 = !{!5}
+!5 = !{!"0x28\00EA_0\000"} ; [ DW_TAG_enumerator ] [EA_0 :: 0]
+!6 = !{!7}
+!7 = !{!"0x2e\00topA\00topA\00_Z4topA2EA\005\000\001\000\006\00256\000\005", !1, !8, !9, null, void (i32)* @_Z4topA2EA, null, null, !11} ; [ DW_TAG_subprogram ] [line 5] [def] [topA]
+!8 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [a.cpp]
+!9 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{null, !"_ZTS2EA"}
+!11 = !{}
+!12 = !{!"0x11\004\00clang version 3.5.0 (trunk 214102:214133) (llvm/trunk 214102:214132)\000\00\000\00\001", !13, !14, !14, !16, !11, !11} ; [ DW_TAG_compile_unit ] [b.cpp] [DW_LANG_C_plus_plus]
+!13 = !{!"b.cpp", !""}
+!14 = !{!15}
+!15 = !{!"0x4\00EA\001\0032\0032\000\000\000", !13, null, null, !4, null, null, !"_ZTS2EA"} ; [ DW_TAG_enumeration_type ] [EA] [line 1, size 32, align 32, offset 0] [def] [from ]
+!16 = !{!17}
+!17 = !{!"0x2e\00topB\00topB\00_Z4topB2EA\005\000\001\000\006\00256\000\005", !13, !18, !9, null, void (i32)* @_Z4topB2EA, null, null, !11} ; [ DW_TAG_subprogram ] [line 5] [def] [topB]
+!18 = !{!"0x29", !13}        ; [ DW_TAG_file_type ] [b.cpp]
+!19 = !{i32 2, !"Dwarf Version", i32 2}
+!20 = !{i32 2, !"Debug Info Version", i32 2}
+!21 = !{!"clang version 3.5.0 (trunk 214102:214133) (llvm/trunk 214102:214132)"}
+!22 = !{!"0x101\00sa\0016777221\000", !7, !8, !"_ZTS2EA"} ; [ DW_TAG_arg_variable ] [sa] [line 5]
+!23 = !MDLocation(line: 5, column: 14, scope: !7)
+!24 = !MDLocation(line: 6, column: 1, scope: !7)
+!25 = !{!"0x101\00sa\0016777221\000", !17, !18, !"_ZTS2EA"} ; [ DW_TAG_arg_variable ] [sa] [line 5]
+!26 = !MDLocation(line: 5, column: 14, scope: !17)
+!27 = !MDLocation(line: 6, column: 1, scope: !17)
diff --git a/test/DebugInfo/enum.ll b/test/DebugInfo/enum.ll
index a64795c..4dd4c68 100644
--- a/test/DebugInfo/enum.ll
+++ b/test/DebugInfo/enum.ll
@@ -39,7 +39,7 @@
 define void @_Z4funcv() #0 {
 entry:
   %b = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %b}, metadata !20, metadata !{metadata !"0x102"}), !dbg !22
+  call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !{!"0x102"}), !dbg !22
   store i32 0, i32* %b, align 4, !dbg !22
   ret void, !dbg !23
 }
@@ -53,28 +53,28 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!19, !24}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !11, metadata !12, metadata !17, metadata !11} ; [ DW_TAG_compile_unit ] [/tmp/enum.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"enum.cpp", metadata !"/tmp"}
-!2 = metadata !{metadata !3, metadata !8}
-!3 = metadata !{metadata !"0x4\00e1\001\0064\0064\000\000\000", metadata !1, null, null, metadata !4, null, null, null} ; [ DW_TAG_enumeration_type ] [e1] [line 1, size 64, align 64, offset 0] [def] [from ]
-!4 = metadata !{metadata !5, metadata !6, metadata !7}
-!5 = metadata !{metadata !"0x28\00I\000"} ; [ DW_TAG_enumerator ] [I :: 0]
-!6 = metadata !{metadata !"0x28\00J\004294967295"} ; [ DW_TAG_enumerator ] [J :: 4294967295]
-!7 = metadata !{metadata !"0x28\00K\00-1152921504606846976"} ; [ DW_TAG_enumerator ] [K :: 17293822569102704640]
-!8 = metadata !{metadata !"0x4\00e2\002\0032\0032\000\000\000", metadata !1, null, null, metadata !9, null, null, null} ; [ DW_TAG_enumeration_type ] [e2] [line 2, size 32, align 32, offset 0] [def] [from ]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x28\00X\000"} ; [ DW_TAG_enumerator ] [X :: 0]
-!11 = metadata !{}
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x2e\00func\00func\00_Z4funcv\003\000\001\000\006\00256\000\003", metadata !1, metadata !14, metadata !15, null, void ()* @_Z4funcv, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 3] [def] [func]
-!14 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/enum.cpp]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null}
-!17 = metadata !{metadata !18}
-!18 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !14, metadata !3, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
-!19 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!20 = metadata !{metadata !"0x100\00b\004\000", metadata !13, metadata !14, metadata !21} ; [ DW_TAG_auto_variable ] [b] [line 4]
-!21 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!22 = metadata !{i32 4, i32 0, metadata !13, null}
-!23 = metadata !{i32 5, i32 0, metadata !13, null}
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 \000\00\000\00\000", !1, !2, !11, !12, !17, !11} ; [ DW_TAG_compile_unit ] [/tmp/enum.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"enum.cpp", !"/tmp"}
+!2 = !{!3, !8}
+!3 = !{!"0x4\00e1\001\0064\0064\000\000\000", !1, null, null, !4, null, null, null} ; [ DW_TAG_enumeration_type ] [e1] [line 1, size 64, align 64, offset 0] [def] [from ]
+!4 = !{!5, !6, !7}
+!5 = !{!"0x28\00I\000"} ; [ DW_TAG_enumerator ] [I :: 0]
+!6 = !{!"0x28\00J\004294967295"} ; [ DW_TAG_enumerator ] [J :: 4294967295]
+!7 = !{!"0x28\00K\00-1152921504606846976"} ; [ DW_TAG_enumerator ] [K :: 17293822569102704640]
+!8 = !{!"0x4\00e2\002\0032\0032\000\000\000", !1, null, null, !9, null, null, null} ; [ DW_TAG_enumeration_type ] [e2] [line 2, size 32, align 32, offset 0] [def] [from ]
+!9 = !{!10}
+!10 = !{!"0x28\00X\000"} ; [ DW_TAG_enumerator ] [X :: 0]
+!11 = !{}
+!12 = !{!13}
+!13 = !{!"0x2e\00func\00func\00_Z4funcv\003\000\001\000\006\00256\000\003", !1, !14, !15, null, void ()* @_Z4funcv, null, null, !11} ; [ DW_TAG_subprogram ] [line 3] [def] [func]
+!14 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/tmp/enum.cpp]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null}
+!17 = !{!18}
+!18 = !{!"0x34\00a\00a\00\001\000\001", null, !14, !3, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!19 = !{i32 2, !"Dwarf Version", i32 3}
+!20 = !{!"0x100\00b\004\000", !13, !14, !21} ; [ DW_TAG_auto_variable ] [b] [line 4]
+!21 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!22 = !MDLocation(line: 4, scope: !13)
+!23 = !MDLocation(line: 5, scope: !13)
+!24 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/global-with-type-context.ll b/test/DebugInfo/global-with-type-context.ll
deleted file mode 100644
index 10b98a7..0000000
--- a/test/DebugInfo/global-with-type-context.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; REQUIRES: object-emission
-
-; RUN: %llc_dwarf -filetype=obj -O0 < %s > %t
-; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
-
-; IR generated from clang -g with the following source:
-; struct F {
-;   static const int i = 2;
-;   virtual ~F();
-; };
-;
-; void f1() {
-;   int i = F::i;
-; }
-
-; Make sure we correctly handle context of a global variable being a type identifier.
-; CHECK:  [[STRUCT:.*]]: DW_TAG_structure_type
-; CHECK: DW_AT_name [DW_FORM_strp] {{.*}}= "F")
-; CHECK: DW_TAG_variable
-; CHECK-NEXT: DW_AT_specification {{.*}} "i"
-; CHECK-NEXT: DW_AT_const_value [DW_FORM_sdata] (2)
-
-; Function Attrs: nounwind
-define void @_Z2f1v() #0 {
-entry:
-  %i = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !29, metadata !30), !dbg !31
-  store i32 2, i32* %i, align 4, !dbg !31
-  ret void, !dbg !32
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!26, !27}
-!llvm.ident = !{!28}
-
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 (trunk 222175)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !20, metadata !24, metadata !2} ; [ DW_TAG_compile_unit ] [<stdin>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<stdin>", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00F\001\0064\0064\000\000\000", metadata !5, null, null, metadata !6, metadata !"_ZTS1F", null, metadata !"_ZTS1F"} ; [ DW_TAG_structure_type ] [F] [line 1, size 64, align 64, offset 0] [def] [from ]
-!5 = metadata !{metadata !"test.cpp", metadata !"."}
-!6 = metadata !{metadata !7, metadata !14, metadata !16}
-!7 = metadata !{metadata !"0xd\00_vptr$F\000\0064\000\000\0064", metadata !5, metadata !8, metadata !9} ; [ DW_TAG_member ] [_vptr$F] [line 0, size 64, align 0, offset 0] [artificial] [from ]
-!8 = metadata !{metadata !"0x29", metadata !5}    ; [ DW_TAG_file_type ]
-!9 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
-!10 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{metadata !"0xd\00i\002\000\000\000\004096", metadata !5, metadata !"_ZTS1F", metadata !15, i32 2} ; [ DW_TAG_member ] [i] [line 2, size 0, align 0, offset 0] [static] [from ]
-!15 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
-!16 = metadata !{metadata !"0x2e\00~F\00~F\00\003\000\000\001\000\00256\000\003", metadata !5, metadata !"_ZTS1F", metadata !17, metadata !"_ZTS1F", null, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [~F]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{null, metadata !19}
-!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088\00", null, null, metadata !"_ZTS1F"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1F]
-!20 = metadata !{metadata !21}
-!21 = metadata !{metadata !"0x2e\00f1\00f1\00_Z2f1v\006\000\001\000\000\00256\000\006", metadata !5, metadata !8, metadata !22, null, void ()* @_Z2f1v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
-!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!23 = metadata !{null}
-!24 = metadata !{metadata !25}
-!25 = metadata !{metadata !"0x34\00i\00i\00\002\001\001", metadata !"_ZTS1F", metadata !8, metadata !15, i32 2, metadata !14} ; [ DW_TAG_variable ] [i] [line 2] [local] [def]
-!26 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!27 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!28 = metadata !{metadata !"clang version 3.6.0 (trunk 222175)"}
-!29 = metadata !{metadata !"0x100\00i\007\000", metadata !21, metadata !8, metadata !13} ; [ DW_TAG_auto_variable ] [i] [line 7]
-!30 = metadata !{metadata !"0x102"}               ; [ DW_TAG_expression ]
-!31 = metadata !{i32 7, i32 0, metadata !21, null}
-!32 = metadata !{i32 8, i32 0, metadata !21, null}
diff --git a/test/DebugInfo/global.ll b/test/DebugInfo/global.ll
index 80f30c2..1715ca8 100644
--- a/test/DebugInfo/global.ll
+++ b/test/DebugInfo/global.ll
@@ -26,17 +26,17 @@ attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-fra
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!11, !13}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/global.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"global.cpp", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\002\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/global.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x34\00i\00i\00_ZL1i\001\001\001", null, metadata !5, metadata !8, null, null} ; [ DW_TAG_variable ]
-!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!12 = metadata !{i32 4, i32 0, metadata !4, null}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 \001\00\000\00\000", !1, !2, !2, !3, !9, !2} ; [ DW_TAG_compile_unit ] [/tmp/global.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"global.cpp", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\002\000\001\000\006\00256\001\002", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/global.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10}
+!10 = !{!"0x34\00i\00i\00_ZL1i\001\001\001", null, !5, !8, null, null} ; [ DW_TAG_variable ]
+!11 = !{i32 2, !"Dwarf Version", i32 3}
+!12 = !MDLocation(line: 4, scope: !4)
+!13 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/incorrect-variable-debugloc.ll b/test/DebugInfo/incorrect-variable-debugloc.ll
index 987521c..afccd67 100644
--- a/test/DebugInfo/incorrect-variable-debugloc.ll
+++ b/test/DebugInfo/incorrect-variable-debugloc.ll
@@ -110,7 +110,7 @@ entry:
 
 ; <label>:30                                      ; preds = %24, %5
   store i32 0, i32* %i.i, align 4, !dbg !39, !tbaa !41
-  tail call void @llvm.dbg.value(metadata !{%struct.C* %8}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !46
+  tail call void @llvm.dbg.value(metadata %struct.C* %8, i64 0, metadata !27, metadata !{!"0x102"}), !dbg !46
   call void @_ZN1C5m_fn3Ev(%struct.C* %8), !dbg !47
   unreachable, !dbg !47
 }
@@ -145,7 +145,7 @@ entry:
   %16 = add i64 %15, 0, !dbg !48
   %17 = inttoptr i64 %16 to i64*, !dbg !48
   store i64 -868083113472691727, i64* %17, !dbg !48
-  tail call void @llvm.dbg.value(metadata !{%struct.C* %this}, i64 0, metadata !30, metadata !{metadata !"0x102"}), !dbg !48
+  tail call void @llvm.dbg.value(metadata %struct.C* %this, i64 0, metadata !30, metadata !{!"0x102"}), !dbg !48
   %call = call i32 @_ZN1A5m_fn1Ev(%struct.A* %8), !dbg !49
   %i.i = getelementptr inbounds %struct.C* %this, i64 0, i32 1, i32 0, !dbg !50
   %18 = ptrtoint i32* %i.i to i64, !dbg !50
@@ -336,56 +336,56 @@ attributes #3 = { nounwind readnone }
 !llvm.module.flags = !{!36, !37}
 !llvm.ident = !{!38}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !21, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<stdin>", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !14}
-!4 = metadata !{metadata !"0x13\00C\0010\0064\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 10, size 64, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !"incorrect-variable-debug-loc.cpp", metadata !"/tmp/dbginfo"}
-!6 = metadata !{metadata !7, metadata !9, metadata !10}
-!7 = metadata !{metadata !"0xd\00j\0012\0032\0032\000\000", metadata !5, metadata !"_ZTS1C", metadata !8} ; [ DW_TAG_member ] [j] [line 12, size 32, align 32, offset 0] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0xd\00b\0013\0032\0032\0032\000", metadata !5, metadata !"_ZTS1C", metadata !"_ZTS1B"} ; [ DW_TAG_member ] [b] [line 13, size 32, align 32, offset 32] [from _ZTS1B]
-!10 = metadata !{metadata !"0x2e\00m_fn3\00m_fn3\00_ZN1C5m_fn3Ev\0011\000\000\000\006\00256\001\0011", metadata !5, metadata !"_ZTS1C", metadata !11, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 11] [m_fn3]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{null, metadata !13}
-!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
-!14 = metadata !{metadata !"0x13\00B\005\0032\0032\000\000\000", metadata !5, null, null, metadata !15, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_structure_type ] [B] [line 5, size 32, align 32, offset 0] [def] [from ]
-!15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{metadata !"0xd\00i\007\0032\0032\000\000", metadata !5, metadata !"_ZTS1B", metadata !8} ; [ DW_TAG_member ] [i] [line 7, size 32, align 32, offset 0] [from int]
-!17 = metadata !{metadata !"0x2e\00m_fn2\00m_fn2\00_ZN1B5m_fn2Ev\006\000\000\000\006\00256\001\006", metadata !5, metadata !"_ZTS1B", metadata !18, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 6] [m_fn2]
-!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!19 = metadata !{null, metadata !20}
-!20 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
-!21 = metadata !{metadata !22, metadata !28, metadata !32}
-!22 = metadata !{metadata !"0x2e\00fn1\00fn1\00_Z3fn1v\0016\000\001\000\006\00256\001\0016", metadata !5, metadata !23, metadata !24, null, i32 ()* @_Z3fn1v, null, null, metadata !26} ; [ DW_TAG_subprogram ] [line 16] [def] [fn1]
-!23 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/incorrect-variable-debug-loc.cpp]
-!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!25 = metadata !{metadata !8}
-!26 = metadata !{metadata !27}
-!27 = metadata !{metadata !"0x100\00A\0017\000", metadata !22, metadata !23, metadata !"_ZTS1C"} ; [ DW_TAG_auto_variable ] [A] [line 17]
-!28 = metadata !{metadata !"0x2e\00m_fn3\00m_fn3\00_ZN1C5m_fn3Ev\0021\000\001\000\006\00256\001\0021", metadata !5, metadata !"_ZTS1C", metadata !11, null, void (%struct.C*)* @_ZN1C5m_fn3Ev, null, metadata !10, metadata !29} ; [ DW_TAG_subprogram ] [line 21] [def] [m_fn3]
-!29 = metadata !{metadata !30}
-!30 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !28, null, metadata !31} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!31 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
-!32 = metadata !{metadata !"0x2e\00m_fn2\00m_fn2\00_ZN1B5m_fn2Ev\006\000\001\000\006\00256\001\006", metadata !5, metadata !"_ZTS1B", metadata !18, null, null, null, metadata !17, metadata !33} ; [ DW_TAG_subprogram ] [line 6] [def] [m_fn2]
-!33 = metadata !{metadata !34}
-!34 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !32, null, metadata !35} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!35 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
-!36 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!37 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!38 = metadata !{metadata !"clang version 3.5.0 "}
-!39 = metadata !{i32 6, i32 0, metadata !32, metadata !40}
-!40 = metadata !{i32 18, i32 0, metadata !22, null}
-!41 = metadata !{metadata !42, metadata !43, i64 0}
-!42 = metadata !{metadata !"_ZTS1B", metadata !43, i64 0}
-!43 = metadata !{metadata !"int", metadata !44, i64 0}
-!44 = metadata !{metadata !"omnipotent char", metadata !45, i64 0}
-!45 = metadata !{metadata !"Simple C/C++ TBAA"}
-!46 = metadata !{i32 17, i32 0, metadata !22, null}
-!47 = metadata !{i32 19, i32 0, metadata !22, null}
-!48 = metadata !{i32 0, i32 0, metadata !28, null}
-!49 = metadata !{i32 22, i32 0, metadata !28, null}
-!50 = metadata !{i32 6, i32 0, metadata !32, metadata !51}
-!51 = metadata !{i32 23, i32 0, metadata !28, null}
-!52 = metadata !{i32 24, i32 0, metadata !28, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \001\00\000\00\001", !1, !2, !3, !21, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !14}
+!4 = !{!"0x13\00C\0010\0064\0032\000\000\000", !5, null, null, !6, null, null, !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 10, size 64, align 32, offset 0] [def] [from ]
+!5 = !{!"incorrect-variable-debug-loc.cpp", !"/tmp/dbginfo"}
+!6 = !{!7, !9, !10}
+!7 = !{!"0xd\00j\0012\0032\0032\000\000", !5, !"_ZTS1C", !8} ; [ DW_TAG_member ] [j] [line 12, size 32, align 32, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0xd\00b\0013\0032\0032\0032\000", !5, !"_ZTS1C", !"_ZTS1B"} ; [ DW_TAG_member ] [b] [line 13, size 32, align 32, offset 32] [from _ZTS1B]
+!10 = !{!"0x2e\00m_fn3\00m_fn3\00_ZN1C5m_fn3Ev\0011\000\000\000\006\00256\001\0011", !5, !"_ZTS1C", !11, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 11] [m_fn3]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{null, !13}
+!13 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!14 = !{!"0x13\00B\005\0032\0032\000\000\000", !5, null, null, !15, null, null, !"_ZTS1B"} ; [ DW_TAG_structure_type ] [B] [line 5, size 32, align 32, offset 0] [def] [from ]
+!15 = !{!16, !17}
+!16 = !{!"0xd\00i\007\0032\0032\000\000", !5, !"_ZTS1B", !8} ; [ DW_TAG_member ] [i] [line 7, size 32, align 32, offset 0] [from int]
+!17 = !{!"0x2e\00m_fn2\00m_fn2\00_ZN1B5m_fn2Ev\006\000\000\000\006\00256\001\006", !5, !"_ZTS1B", !18, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 6] [m_fn2]
+!18 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = !{null, !20}
+!20 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
+!21 = !{!22, !28, !32}
+!22 = !{!"0x2e\00fn1\00fn1\00_Z3fn1v\0016\000\001\000\006\00256\001\0016", !5, !23, !24, null, i32 ()* @_Z3fn1v, null, null, !26} ; [ DW_TAG_subprogram ] [line 16] [def] [fn1]
+!23 = !{!"0x29", !5}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/incorrect-variable-debug-loc.cpp]
+!24 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!25 = !{!8}
+!26 = !{!27}
+!27 = !{!"0x100\00A\0017\000", !22, !23, !"_ZTS1C"} ; [ DW_TAG_auto_variable ] [A] [line 17]
+!28 = !{!"0x2e\00m_fn3\00m_fn3\00_ZN1C5m_fn3Ev\0021\000\001\000\006\00256\001\0021", !5, !"_ZTS1C", !11, null, void (%struct.C*)* @_ZN1C5m_fn3Ev, null, !10, !29} ; [ DW_TAG_subprogram ] [line 21] [def] [m_fn3]
+!29 = !{!30}
+!30 = !{!"0x101\00this\0016777216\001088", !28, null, !31} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!31 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!32 = !{!"0x2e\00m_fn2\00m_fn2\00_ZN1B5m_fn2Ev\006\000\001\000\006\00256\001\006", !5, !"_ZTS1B", !18, null, null, null, !17, !33} ; [ DW_TAG_subprogram ] [line 6] [def] [m_fn2]
+!33 = !{!34}
+!34 = !{!"0x101\00this\0016777216\001088", !32, null, !35} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!35 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
+!36 = !{i32 2, !"Dwarf Version", i32 4}
+!37 = !{i32 2, !"Debug Info Version", i32 2}
+!38 = !{!"clang version 3.5.0 "}
+!39 = !MDLocation(line: 6, scope: !32, inlinedAt: !40)
+!40 = !MDLocation(line: 18, scope: !22)
+!41 = !{!42, !43, i64 0}
+!42 = !{!"_ZTS1B", !43, i64 0}
+!43 = !{!"int", !44, i64 0}
+!44 = !{!"omnipotent char", !45, i64 0}
+!45 = !{!"Simple C/C++ TBAA"}
+!46 = !MDLocation(line: 17, scope: !22)
+!47 = !MDLocation(line: 19, scope: !22)
+!48 = !MDLocation(line: 0, scope: !28)
+!49 = !MDLocation(line: 22, scope: !28)
+!50 = !MDLocation(line: 6, scope: !32, inlinedAt: !51)
+!51 = !MDLocation(line: 23, scope: !28)
+!52 = !MDLocation(line: 24, scope: !28)
diff --git a/test/DebugInfo/incorrect-variable-debugloc1.ll b/test/DebugInfo/incorrect-variable-debugloc1.ll
new file mode 100644
index 0000000..18f2dc7
--- /dev/null
+++ b/test/DebugInfo/incorrect-variable-debugloc1.ll
@@ -0,0 +1,77 @@
+; REQUIRES: object-emission
+; This test is failing for powerpc64, because a location list for the
+; variable 'c' is not generated at all. Temporary marking this test as XFAIL 
+; for powerpc, until PR21881 is fixed.
+; XFAIL: powerpc64
+
+; RUN: %llc_dwarf -O2  -dwarf-version 2 -filetype=obj < %s | llvm-dwarfdump - | FileCheck %s  --check-prefix=DWARF23
+; RUN: %llc_dwarf -O2  -dwarf-version 3 -filetype=obj < %s | llvm-dwarfdump - | FileCheck %s  --check-prefix=DWARF23
+; RUN: %llc_dwarf -O2  -dwarf-version 4 -filetype=obj < %s | llvm-dwarfdump - | FileCheck %s  --check-prefix=DWARF4
+
+; This is a test for PR21176.
+; DW_OP_const <const> doesn't describe a constant value, but a value at a constant address. 
+; The proper way to describe a constant value is DW_OP_constu <const>, DW_OP_stack_value.
+
+; Generated with clang -S -emit-llvm -g -O2 test.cpp
+
+; extern int func();
+; 
+; int main()
+; {
+;   volatile int c = 13;
+;   c = func();
+;   return c;
+; }
+
+; DWARF23: Location description: 10 0d {{$}}
+; DWARF4: Location description: 10 0d 9f
+
+; Function Attrs: uwtable
+define i32 @main() #0 {
+entry:
+  %c = alloca i32, align 4
+  tail call void @llvm.dbg.value(metadata i32 13, i64 0, metadata !10, metadata !16), !dbg !17
+  store volatile i32 13, i32* %c, align 4, !dbg !18
+  %call = tail call i32 @_Z4funcv(), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 %call, i64 0, metadata !10, metadata !16), !dbg !17
+  store volatile i32 %call, i32* %c, align 4, !dbg !19
+  tail call void @llvm.dbg.value(metadata i32* %c, i64 0, metadata !10, metadata !16), !dbg !17
+  %c.0.c.0. = load volatile i32* %c, align 4, !dbg !20
+  ret i32 %c.0.c.0., !dbg !20
+}
+
+declare i32 @_Z4funcv() #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12, !13}
+!llvm.ident = !{!14}
+
+!0 = !{!"0x11\004\00clang version 3.6.0 (trunk 223522)\001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/home/kromanova/ngh/ToT_latest/llvm/test/DebugInfo/test.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"test.cpp", !"/home/kromanova/ngh/ToT_latest/llvm/test/DebugInfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\003\000\001\000\000\00256\001\004", !1, !5, !6, null, i32 ()* @main, null, null, !9} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [main]
+!5 = !{!"0x29", !1}    ; [ DW_TAG_file_type ] [/home/kromanova/ngh/ToT_latest/llvm/test/DebugInfo/test.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10}
+!10 = !{!"0x100\00c\005\000", !4, !5, !11} ; [ DW_TAG_auto_variable ] [c] [line 5]
+!11 = !{!"0x35\00\000\000\000\000\000", null, null, !8} ; [ DW_TAG_volatile_type ] [line 0, size 0, align 0, offset 0] [from int]
+!12 = !{i32 2, !"Dwarf Version", i32 2}
+!13 = !{i32 2, !"Debug Info Version", i32 2}
+!14 = !{!"clang version 3.6.0 (trunk 223522)"}
+!15 = !{i32 13}
+!16 = !{!"0x102"}               ; [ DW_TAG_expression ]
+!17 = !MDLocation(line: 5, column: 16, scope: !4)
+!18 = !MDLocation(line: 5, column: 3, scope: !4)
+!19 = !MDLocation(line: 6, column: 7, scope: !4)
+!20 = !MDLocation(line: 7, column: 3, scope: !4)
+
diff --git a/test/DebugInfo/inheritance.ll b/test/DebugInfo/inheritance.ll
index 514f828..ab71d25 100644
--- a/test/DebugInfo/inheritance.ll
+++ b/test/DebugInfo/inheritance.ll
@@ -16,7 +16,7 @@ entry:
   %0 = alloca i32                                 ; <i32*> [#uses=2]
   %tst = alloca %struct.test1                     ; <%struct.test1*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.test1* %tst}, metadata !0, metadata !{metadata !"0x102"}), !dbg !21
+  call void @llvm.dbg.declare(metadata %struct.test1* %tst, metadata !0, metadata !{!"0x102"}), !dbg !21
   call void @_ZN5test1C1Ev(%struct.test1* %tst) nounwind, !dbg !22
   store i32 0, i32* %0, align 4, !dbg !23
   %1 = load i32* %0, align 4, !dbg !23            ; <i32> [#uses=1]
@@ -32,7 +32,7 @@ define linkonce_odr void @_ZN5test1C1Ev(%struct.test1* %this) nounwind ssp align
 entry:
   %this_addr = alloca %struct.test1*              ; <%struct.test1**> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.test1** %this_addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !28
+  call void @llvm.dbg.declare(metadata %struct.test1** %this_addr, metadata !24, metadata !{!"0x102"}), !dbg !28
   store %struct.test1* %this, %struct.test1** %this_addr
   %0 = load %struct.test1** %this_addr, align 8, !dbg !28 ; <%struct.test1*> [#uses=1]
   %1 = getelementptr inbounds %struct.test1* %0, i32 0, i32 0, !dbg !28 ; <i32 (...)***> [#uses=1]
@@ -49,7 +49,7 @@ define linkonce_odr void @_ZN5test1D1Ev(%struct.test1* %this) nounwind ssp align
 entry:
   %this_addr = alloca %struct.test1*              ; <%struct.test1**> [#uses=3]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.test1** %this_addr}, metadata !32, metadata !{metadata !"0x102"}), !dbg !34
+  call void @llvm.dbg.declare(metadata %struct.test1** %this_addr, metadata !32, metadata !{!"0x102"}), !dbg !34
   store %struct.test1* %this, %struct.test1** %this_addr
   %0 = load %struct.test1** %this_addr, align 8, !dbg !35 ; <%struct.test1*> [#uses=1]
   %1 = getelementptr inbounds %struct.test1* %0, i32 0, i32 0, !dbg !35 ; <i32 (...)***> [#uses=1]
@@ -78,7 +78,7 @@ define linkonce_odr void @_ZN5test1D0Ev(%struct.test1* %this) nounwind ssp align
 entry:
   %this_addr = alloca %struct.test1*              ; <%struct.test1**> [#uses=3]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.test1** %this_addr}, metadata !38, metadata !{metadata !"0x102"}), !dbg !40
+  call void @llvm.dbg.declare(metadata %struct.test1** %this_addr, metadata !38, metadata !{!"0x102"}), !dbg !40
   store %struct.test1* %this, %struct.test1** %this_addr
   %0 = load %struct.test1** %this_addr, align 8, !dbg !41 ; <%struct.test1*> [#uses=1]
   %1 = getelementptr inbounds %struct.test1* %0, i32 0, i32 0, !dbg !41 ; <i32 (...)***> [#uses=1]
@@ -105,50 +105,50 @@ return:                                           ; preds = %bb2
 
 declare void @_ZdlPv(i8*) nounwind
 
-!0 = metadata !{metadata !"0x100\00tst\0013\000", metadata !1, metadata !4, metadata !8} ; [ DW_TAG_auto_variable ]
-!1 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !2} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !3} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{metadata !"0x2e\00main\00main\00main\0011\000\001\000\006\000\000\000", i32 0, metadata !4, metadata !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!4 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !44, metadata !45, metadata !45, null, null, null} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !4, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !4} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"0x13\00test1\001\0064\0064\000\000\000", metadata !44, metadata !4, null, metadata !9, metadata !8, null, null} ; [ DW_TAG_structure_type ] [test1] [line 1, size 64, align 64, offset 0] [def] [from ]
-!9 = metadata !{metadata !10, metadata !14, metadata !18}
-!10 = metadata !{metadata !"0xd\00_vptr$test1\001\0064\0064\000\000", metadata !44, metadata !8, metadata !11} ; [ DW_TAG_member ]
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !4, null, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\000\000\000\000", null, metadata !4, metadata !5} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !46, metadata !45, metadata !45, null, null, null} ; [ DW_TAG_compile_unit ]
-!14 = metadata !{metadata !"0x2e\00test1\00test1\00\001\000\000\000\006\001\000\000", i32 0, metadata !8, metadata !15, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !4, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null, metadata !17}
-!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !4, null, metadata !8} ; [ DW_TAG_pointer_type ]
-!18 = metadata !{metadata !"0x2e\00~test1\00~test1\00\004\000\000\001\006\000\000\000", i32 0, metadata !8, metadata !19, metadata !8, null, null, null, null} ; [ DW_TAG_subprogram ]
-!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !4, null, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!20 = metadata !{null, metadata !17, metadata !7}
-!21 = metadata !{i32 11, i32 0, metadata !1, null}
-!22 = metadata !{i32 13, i32 0, metadata !1, null}
-!23 = metadata !{i32 14, i32 0, metadata !1, null}
-!24 = metadata !{metadata !"0x101\00this\0013\000", metadata !25, metadata !4, metadata !26} ; [ DW_TAG_arg_variable ]
-!25 = metadata !{metadata !"0x2e\00test1\00test1\00_ZN5test1C1Ev\001\000\001\000\006\000\000\000", i32 0, metadata !4, metadata !15, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!26 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !4, null, metadata !27} ; [ DW_TAG_const_type ]
-!27 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !4, null, metadata !8} ; [ DW_TAG_pointer_type ]
-!28 = metadata !{i32 1, i32 0, metadata !25, null}
-!29 = metadata !{i32 1, i32 0, metadata !30, null}
-!30 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !31} ; [ DW_TAG_lexical_block ]
-!31 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !25} ; [ DW_TAG_lexical_block ]
-!32 = metadata !{metadata !"0x101\00this\004\000", metadata !33, metadata !4, metadata !26} ; [ DW_TAG_arg_variable ]
-!33 = metadata !{metadata !"0x2e\00~test1\00~test1\00_ZN5test1D1Ev\004\000\001\001\006\000\000\000", i32 0, metadata !8, metadata !15, metadata !8, null, null, null, null} ; [ DW_TAG_subprogram ]
-!34 = metadata !{i32 4, i32 0, metadata !33, null}
-!35 = metadata !{i32 5, i32 0, metadata !36, null}
-!36 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !33} ; [ DW_TAG_lexical_block ]
-!37 = metadata !{i32 6, i32 0, metadata !36, null}
-!38 = metadata !{metadata !"0x101\00this\004\000", metadata !39, metadata !4, metadata !26} ; [ DW_TAG_arg_variable ]
-!39 = metadata !{metadata !"0x2e\00~test1\00~test1\00_ZN5test1D0Ev\004\000\001\001\006\000\000\000", i32 0, metadata !8, metadata !15, metadata !8, null, null, null, null} ; [ DW_TAG_subprogram ]
-!40 = metadata !{i32 4, i32 0, metadata !39, null}
-!41 = metadata !{i32 5, i32 0, metadata !42, null}
-!42 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !39} ; [ DW_TAG_lexical_block ]
-!43 = metadata !{i32 6, i32 0, metadata !42, null}
-!44 = metadata !{metadata !"inheritance.cpp", metadata !"/tmp/"}
-!45 = metadata !{i32 0}
-!46 = metadata !{metadata !"<built-in>", metadata !"/tmp/"}
+!0 = !{!"0x100\00tst\0013\000", !1, !4, !8} ; [ DW_TAG_auto_variable ]
+!1 = !{!"0xb\000\000\000", !44, !2} ; [ DW_TAG_lexical_block ]
+!2 = !{!"0xb\000\000\000", !44, !3} ; [ DW_TAG_lexical_block ]
+!3 = !{!"0x2e\00main\00main\00main\0011\000\001\000\006\000\000\000", i32 0, !4, !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!4 = !{!"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !44, !45, !45, null, null, null} ; [ DW_TAG_compile_unit ]
+!5 = !{!"0x15\00\000\000\000\000\000\000", !4, null, null, !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = !{!7}
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !4} ; [ DW_TAG_base_type ]
+!8 = !{!"0x13\00test1\001\0064\0064\000\000\000", !44, !4, null, !9, !8, null, null} ; [ DW_TAG_structure_type ] [test1] [line 1, size 64, align 64, offset 0] [def] [from ]
+!9 = !{!10, !14, !18}
+!10 = !{!"0xd\00_vptr$test1\001\0064\0064\000\000", !44, !8, !11} ; [ DW_TAG_member ]
+!11 = !{!"0xf\00\000\0064\0064\000\000", !4, null, !12} ; [ DW_TAG_pointer_type ]
+!12 = !{!"0xf\00__vtbl_ptr_type\000\000\000\000\000", null, !4, !5} ; [ DW_TAG_pointer_type ]
+!13 = !{!"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", !46, !45, !45, null, null, null} ; [ DW_TAG_compile_unit ]
+!14 = !{!"0x2e\00test1\00test1\00\001\000\000\000\006\001\000\000", i32 0, !8, !15, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!15 = !{!"0x15\00\000\000\000\000\000\000", !4, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !17}
+!17 = !{!"0xf\00\000\0064\0064\000\0064", !4, null, !8} ; [ DW_TAG_pointer_type ]
+!18 = !{!"0x2e\00~test1\00~test1\00\004\000\000\001\006\000\000\000", i32 0, !8, !19, !8, null, null, null, null} ; [ DW_TAG_subprogram ]
+!19 = !{!"0x15\00\000\000\000\000\000\000", !4, null, null, !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = !{null, !17, !7}
+!21 = !MDLocation(line: 11, scope: !1)
+!22 = !MDLocation(line: 13, scope: !1)
+!23 = !MDLocation(line: 14, scope: !1)
+!24 = !{!"0x101\00this\0013\000", !25, !4, !26} ; [ DW_TAG_arg_variable ]
+!25 = !{!"0x2e\00test1\00test1\00_ZN5test1C1Ev\001\000\001\000\006\000\000\000", i32 0, !4, !15, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!26 = !{!"0x26\00\000\0064\0064\000\0064", !4, null, !27} ; [ DW_TAG_const_type ]
+!27 = !{!"0xf\00\000\0064\0064\000\000", !4, null, !8} ; [ DW_TAG_pointer_type ]
+!28 = !MDLocation(line: 1, scope: !25)
+!29 = !MDLocation(line: 1, scope: !30)
+!30 = !{!"0xb\000\000\000", !44, !31} ; [ DW_TAG_lexical_block ]
+!31 = !{!"0xb\000\000\000", !44, !25} ; [ DW_TAG_lexical_block ]
+!32 = !{!"0x101\00this\004\000", !33, !4, !26} ; [ DW_TAG_arg_variable ]
+!33 = !{!"0x2e\00~test1\00~test1\00_ZN5test1D1Ev\004\000\001\001\006\000\000\000", i32 0, !8, !15, !8, null, null, null, null} ; [ DW_TAG_subprogram ]
+!34 = !MDLocation(line: 4, scope: !33)
+!35 = !MDLocation(line: 5, scope: !36)
+!36 = !{!"0xb\000\000\000", !44, !33} ; [ DW_TAG_lexical_block ]
+!37 = !MDLocation(line: 6, scope: !36)
+!38 = !{!"0x101\00this\004\000", !39, !4, !26} ; [ DW_TAG_arg_variable ]
+!39 = !{!"0x2e\00~test1\00~test1\00_ZN5test1D0Ev\004\000\001\001\006\000\000\000", i32 0, !8, !15, !8, null, null, null, null} ; [ DW_TAG_subprogram ]
+!40 = !MDLocation(line: 4, scope: !39)
+!41 = !MDLocation(line: 5, scope: !42)
+!42 = !{!"0xb\000\000\000", !44, !39} ; [ DW_TAG_lexical_block ]
+!43 = !MDLocation(line: 6, scope: !42)
+!44 = !{!"inheritance.cpp", !"/tmp/"}
+!45 = !{i32 0}
+!46 = !{!"<built-in>", !"/tmp/"}
diff --git a/test/DebugInfo/inline-debug-info-multiret.ll b/test/DebugInfo/inline-debug-info-multiret.ll
index 05b429a..71f29ec 100644
--- a/test/DebugInfo/inline-debug-info-multiret.ll
+++ b/test/DebugInfo/inline-debug-info-multiret.ll
@@ -10,8 +10,8 @@
 ; CHECK: br label %invoke.cont, !dbg ![[MD]]
 ; The branch instruction has the source location of line 9 and its inlined location
 ; has the source location of line 14.
-; CHECK: ![[INL:[0-9]+]] = metadata !{i32 14, i32 0, metadata {{.*}}, null}
-; CHECK: ![[MD]] = metadata !{i32 9, i32 0, metadata {{.*}}, metadata ![[INL]]}
+; CHECK: ![[INL:[0-9]+]] = distinct !MDLocation(line: 14, scope: {{.*}})
+; CHECK: ![[MD]] = !MDLocation(line: 9, scope: {{.*}}, inlinedAt: ![[INL]])
 
 ; ModuleID = 'test.cpp'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -27,8 +27,8 @@ entry:
   %k.addr = alloca i32, align 4
   %k2 = alloca i32, align 4
   store i32 %k, i32* %k.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %k.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
-  call void @llvm.dbg.declare(metadata !{i32* %k2}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
+  call void @llvm.dbg.declare(metadata i32* %k.addr, metadata !13, metadata !{!"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata i32* %k2, metadata !15, metadata !{!"0x102"}), !dbg !16
   %0 = load i32* %k.addr, align 4, !dbg !16
   %call = call i32 @_Z8test_exti(i32 %0), !dbg !16
   store i32 %call, i32* %k2, align 4, !dbg !16
@@ -85,7 +85,7 @@ catch.dispatch:                                   ; preds = %lpad
   br i1 %matches, label %catch, label %eh.resume, !dbg !23
 
 catch:                                            ; preds = %catch.dispatch
-  call void @llvm.dbg.declare(metadata !{i32* %e}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata i32* %e, metadata !24, metadata !{!"0x102"}), !dbg !25
   %exn = load i8** %exn.slot, !dbg !23
   %5 = call i8* @__cxa_begin_catch(i8* %exn) #2, !dbg !23
   %6 = bitcast i8* %5 to i32*, !dbg !23
@@ -122,35 +122,35 @@ attributes #2 = { nounwind }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!31}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<unknown>", metadata !""}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4, metadata !10}
-!4 = metadata !{metadata !"0x2e\00test\00test\00_Z4testi\004\000\001\000\006\00256\000\004", metadata !5, metadata !6, metadata !7, null, i32 (i32)* @_Z4testi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [test]
-!5 = metadata !{metadata !"test.cpp", metadata !""}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [test.cpp]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x2e\00test2\00test2\00_Z5test2v\0011\000\001\000\006\00256\000\0011", metadata !5, metadata !6, metadata !11, null, i32 ()* @_Z5test2v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [test2]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !9}
-!13 = metadata !{metadata !"0x101\00k\0016777220\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [k] [line 4]
-!14 = metadata !{i32 4, i32 0, metadata !4, null}
-!15 = metadata !{metadata !"0x100\00k2\005\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [k2] [line 5]
-!16 = metadata !{i32 5, i32 0, metadata !4, null}
-!17 = metadata !{i32 6, i32 0, metadata !4, null}
-!18 = metadata !{i32 7, i32 0, metadata !4, null}
-!19 = metadata !{i32 8, i32 0, metadata !4, null}
-!20 = metadata !{i32 9, i32 0, metadata !4, null}
-!21 = metadata !{i32 14, i32 0, metadata !22, null}
-!22 = metadata !{metadata !"0xb\0013\000\000", metadata !5, metadata !10} ; [ DW_TAG_lexical_block ] [test.cpp]
-!23 = metadata !{i32 15, i32 0, metadata !22, null}
-!24 = metadata !{metadata !"0x100\00e\0016\000", metadata !10, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [e] [line 16]
-!25 = metadata !{i32 16, i32 0, metadata !10, null}
-!26 = metadata !{i32 17, i32 0, metadata !27, null}
-!27 = metadata !{metadata !"0xb\0016\000\001", metadata !5, metadata !10} ; [ DW_TAG_lexical_block ] [test.cpp]
-!28 = metadata !{i32 18, i32 0, metadata !27, null}
-!29 = metadata !{i32 19, i32 0, metadata !10, null}
-!30 = metadata !{i32 20, i32 0, metadata !10, null}
-!31 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.3 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
+!1 = !{!"<unknown>", !""}
+!2 = !{i32 0}
+!3 = !{!4, !10}
+!4 = !{!"0x2e\00test\00test\00_Z4testi\004\000\001\000\006\00256\000\004", !5, !6, !7, null, i32 (i32)* @_Z4testi, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [test]
+!5 = !{!"test.cpp", !""}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [test.cpp]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x2e\00test2\00test2\00_Z5test2v\0011\000\001\000\006\00256\000\0011", !5, !6, !11, null, i32 ()* @_Z5test2v, null, null, !2} ; [ DW_TAG_subprogram ] [line 11] [def] [test2]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!9}
+!13 = !{!"0x101\00k\0016777220\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [k] [line 4]
+!14 = !MDLocation(line: 4, scope: !4)
+!15 = !{!"0x100\00k2\005\000", !4, !6, !9} ; [ DW_TAG_auto_variable ] [k2] [line 5]
+!16 = !MDLocation(line: 5, scope: !4)
+!17 = !MDLocation(line: 6, scope: !4)
+!18 = !MDLocation(line: 7, scope: !4)
+!19 = !MDLocation(line: 8, scope: !4)
+!20 = !MDLocation(line: 9, scope: !4)
+!21 = !MDLocation(line: 14, scope: !22)
+!22 = !{!"0xb\0013\000\000", !5, !10} ; [ DW_TAG_lexical_block ] [test.cpp]
+!23 = !MDLocation(line: 15, scope: !22)
+!24 = !{!"0x100\00e\0016\000", !10, !6, !9} ; [ DW_TAG_auto_variable ] [e] [line 16]
+!25 = !MDLocation(line: 16, scope: !10)
+!26 = !MDLocation(line: 17, scope: !27)
+!27 = !{!"0xb\0016\000\001", !5, !10} ; [ DW_TAG_lexical_block ] [test.cpp]
+!28 = !MDLocation(line: 18, scope: !27)
+!29 = !MDLocation(line: 19, scope: !10)
+!30 = !MDLocation(line: 20, scope: !10)
+!31 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/inline-debug-info.ll b/test/DebugInfo/inline-debug-info.ll
index 3f971e4..9b9439b 100644
--- a/test/DebugInfo/inline-debug-info.ll
+++ b/test/DebugInfo/inline-debug-info.ll
@@ -28,11 +28,11 @@
 ; CHECK: _Z4testi.exit:
 ; Make sure the branch instruction created during inlining has a debug location,
 ; so the range of the inlined function is correct.
-; CHECK: br label %invoke.cont, !dbg ![[MD:[0-9]+]]
+; CHECK: br label %invoke.cont, !dbg [[MD:![0-9]+]]
 ; The branch instruction has the source location of line 9 and its inlined location
 ; has the source location of line 14.
-; CHECK: ![[INL:[0-9]+]] = metadata !{i32 14, i32 0, metadata {{.*}}, null}
-; CHECK: ![[MD]] = metadata !{i32 9, i32 0, metadata {{.*}}, metadata ![[INL]]}
+; CHECK: [[INL:![0-9]*]] = distinct !MDLocation(line: 14, scope: {{.*}})
+; CHECK: [[MD]] = !MDLocation(line: 9, scope: {{.*}}, inlinedAt: [[INL]])
 
 ; ModuleID = 'test.cpp'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -47,8 +47,8 @@ entry:
   %k.addr = alloca i32, align 4
   %k2 = alloca i32, align 4
   store i32 %k, i32* %k.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %k.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
-  call void @llvm.dbg.declare(metadata !{i32* %k2}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
+  call void @llvm.dbg.declare(metadata i32* %k.addr, metadata !13, metadata !{!"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata i32* %k2, metadata !15, metadata !{!"0x102"}), !dbg !16
   %0 = load i32* %k.addr, align 4, !dbg !16
   %call = call i32 @_Z8test_exti(i32 %0), !dbg !16
   store i32 %call, i32* %k2, align 4, !dbg !16
@@ -103,7 +103,7 @@ catch.dispatch:                                   ; preds = %lpad
   br i1 %matches, label %catch, label %eh.resume, !dbg !23
 
 catch:                                            ; preds = %catch.dispatch
-  call void @llvm.dbg.declare(metadata !{i32* %e}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata i32* %e, metadata !24, metadata !{!"0x102"}), !dbg !25
   %exn = load i8** %exn.slot, !dbg !23
   %5 = call i8* @__cxa_begin_catch(i8* %exn) #2, !dbg !23
   %6 = bitcast i8* %5 to i32*, !dbg !23
@@ -140,35 +140,35 @@ attributes #2 = { nounwind }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!31}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<unknown>", metadata !""}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4, metadata !10}
-!4 = metadata !{metadata !"0x2e\00test\00test\00_Z4testi\004\000\001\000\006\00256\000\004", metadata !5, metadata !6, metadata !7, null, i32 (i32)* @_Z4testi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [test]
-!5 = metadata !{metadata !"test.cpp", metadata !""}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [test.cpp]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x2e\00test2\00test2\00_Z5test2v\0011\000\001\000\006\00256\000\0011", metadata !5, metadata !6, metadata !11, null, i32 ()* @_Z5test2v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [test2]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !9}
-!13 = metadata !{metadata !"0x101\00k\0016777220\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [k] [line 4]
-!14 = metadata !{i32 4, i32 0, metadata !4, null}
-!15 = metadata !{metadata !"0x100\00k2\005\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [k2] [line 5]
-!16 = metadata !{i32 5, i32 0, metadata !4, null}
-!17 = metadata !{i32 6, i32 0, metadata !4, null}
-!18 = metadata !{i32 7, i32 0, metadata !4, null}
-!19 = metadata !{i32 8, i32 0, metadata !4, null}
-!20 = metadata !{i32 9, i32 0, metadata !4, null}
-!21 = metadata !{i32 14, i32 0, metadata !22, null}
-!22 = metadata !{metadata !"0xb\0013\000\000", metadata !5, metadata !10} ; [ DW_TAG_lexical_block ] [test.cpp]
-!23 = metadata !{i32 15, i32 0, metadata !22, null}
-!24 = metadata !{metadata !"0x100\00e\0016\000", metadata !10, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [e] [line 16]
-!25 = metadata !{i32 16, i32 0, metadata !10, null}
-!26 = metadata !{i32 17, i32 0, metadata !27, null}
-!27 = metadata !{metadata !"0xb\0016\000\001", metadata !5, metadata !10} ; [ DW_TAG_lexical_block ] [test.cpp]
-!28 = metadata !{i32 18, i32 0, metadata !27, null}
-!29 = metadata !{i32 19, i32 0, metadata !10, null}
-!30 = metadata !{i32 20, i32 0, metadata !10, null}
-!31 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.3 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
+!1 = !{!"<unknown>", !""}
+!2 = !{i32 0}
+!3 = !{!4, !10}
+!4 = !{!"0x2e\00test\00test\00_Z4testi\004\000\001\000\006\00256\000\004", !5, !6, !7, null, i32 (i32)* @_Z4testi, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [test]
+!5 = !{!"test.cpp", !""}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [test.cpp]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x2e\00test2\00test2\00_Z5test2v\0011\000\001\000\006\00256\000\0011", !5, !6, !11, null, i32 ()* @_Z5test2v, null, null, !2} ; [ DW_TAG_subprogram ] [line 11] [def] [test2]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!9}
+!13 = !{!"0x101\00k\0016777220\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [k] [line 4]
+!14 = !MDLocation(line: 4, scope: !4)
+!15 = !{!"0x100\00k2\005\000", !4, !6, !9} ; [ DW_TAG_auto_variable ] [k2] [line 5]
+!16 = !MDLocation(line: 5, scope: !4)
+!17 = !MDLocation(line: 6, scope: !4)
+!18 = !MDLocation(line: 7, scope: !4)
+!19 = !MDLocation(line: 8, scope: !4)
+!20 = !MDLocation(line: 9, scope: !4)
+!21 = !MDLocation(line: 14, scope: !22)
+!22 = !{!"0xb\0013\000\000", !5, !10} ; [ DW_TAG_lexical_block ] [test.cpp]
+!23 = !MDLocation(line: 15, scope: !22)
+!24 = !{!"0x100\00e\0016\000", !10, !6, !9} ; [ DW_TAG_auto_variable ] [e] [line 16]
+!25 = !MDLocation(line: 16, scope: !10)
+!26 = !MDLocation(line: 17, scope: !27)
+!27 = !{!"0xb\0016\000\001", !5, !10} ; [ DW_TAG_lexical_block ] [test.cpp]
+!28 = !MDLocation(line: 18, scope: !27)
+!29 = !MDLocation(line: 19, scope: !10)
+!30 = !MDLocation(line: 20, scope: !10)
+!31 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/inline-no-debug-info.ll b/test/DebugInfo/inline-no-debug-info.ll
index 2de6a49..6b58bcc 100644
--- a/test/DebugInfo/inline-no-debug-info.ll
+++ b/test/DebugInfo/inline-no-debug-info.ll
@@ -21,10 +21,11 @@
 
 ; Debug location of the code in caller() and of the inlined code that did not
 ; have any debug location before.
-; CHECK-DAG: [[A]] = metadata !{i32 4, i32 0, metadata !{{[01-9]+}}, null}
+; CHECK-DAG: [[A]] = !MDLocation(line: 4, scope: !{{[0-9]+}})
 
 ; Debug location of the inlined code.
-; CHECK-DAG: [[B]] = metadata !{i32 2, i32 0, metadata !{{[01-9]+}}, metadata [[A]]}
+; CHECK-DAG: [[B]] = !MDLocation(line: 2, scope: !{{[0-9]+}}, inlinedAt: [[A_INL:![0-9]*]])
+; CHECK-DAG: [[A_INL]] = distinct !MDLocation(line: 4, scope: !{{[0-9]+}})
 
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -54,16 +55,16 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (210174)\001\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/code/llvm/build0/test.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test.c", metadata !"/code/llvm/build0"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !7}
-!4 = metadata !{metadata !"0x2e\00caller\00caller\00\004\000\001\000\006\000\001\004", metadata !1, metadata !5, metadata !6, null, void ()* @caller, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [caller]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/code/llvm/build0/test.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !"0x2e\00callee2\00callee2\00\002\001\001\000\006\000\001\002", metadata !1, metadata !5, metadata !6, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [callee2]
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.5.0 (210174)"}
-!11 = metadata !{i32 2, i32 0, metadata !7, null}
-!12 = metadata !{i32 4, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 (210174)\001\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/code/llvm/build0/test.c] [DW_LANG_C99]
+!1 = !{!"test.c", !"/code/llvm/build0"}
+!2 = !{}
+!3 = !{!4, !7}
+!4 = !{!"0x2e\00caller\00caller\00\004\000\001\000\006\000\001\004", !1, !5, !6, null, void ()* @caller, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [caller]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/code/llvm/build0/test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!"0x2e\00callee2\00callee2\00\002\001\001\000\006\000\001\002", !1, !5, !6, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [callee2]
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.5.0 (210174)"}
+!11 = !MDLocation(line: 2, scope: !7)
+!12 = !MDLocation(line: 4, scope: !4)
diff --git a/test/DebugInfo/inline-scopes.ll b/test/DebugInfo/inline-scopes.ll
index cdcfaf5..ec36a2f 100644
--- a/test/DebugInfo/inline-scopes.ll
+++ b/test/DebugInfo/inline-scopes.ll
@@ -43,7 +43,7 @@ entry:
   %b.i3 = alloca i8, align 1
   %retval.i = alloca i32, align 4
   %b.i = alloca i8, align 1
-  call void @llvm.dbg.declare(metadata !{i8* %b.i}, metadata !16, metadata !{metadata !"0x102"}), !dbg !19
+  call void @llvm.dbg.declare(metadata i8* %b.i, metadata !16, metadata !{!"0x102"}), !dbg !19
   %call.i = call zeroext i1 @_Z1fv(), !dbg !19
   %frombool.i = zext i1 %call.i to i8, !dbg !19
   store i8 %frombool.i, i8* %b.i, align 1, !dbg !19
@@ -61,7 +61,7 @@ if.end.i:                                         ; preds = %entry
 
 _Z2f1v.exit:                                      ; preds = %if.then.i, %if.end.i
   %1 = load i32* %retval.i, !dbg !23
-  call void @llvm.dbg.declare(metadata !{i8* %b.i3}, metadata !24, metadata !{metadata !"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata i8* %b.i3, metadata !24, metadata !{!"0x102"}), !dbg !27
   %call.i4 = call zeroext i1 @_Z1fv(), !dbg !27
   %frombool.i5 = zext i1 %call.i4 to i8, !dbg !27
   store i8 %frombool.i5, i8* %b.i3, align 1, !dbg !27
@@ -95,36 +95,36 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.module.flags = !{!13, !14}
 !llvm.ident = !{!15}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline-scopes.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"inline-scopes.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !10, metadata !12}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !5, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!5 = metadata !{metadata !"y.cc", metadata !"/tmp/dbginfo"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/y.cc]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x2e\00f2\00f2\00_Z2f2v\008\000\001\000\006\00256\000\008", metadata !1, metadata !11, metadata !7, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [def] [f2]
-!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline-scopes.cpp]
-!12 = metadata !{metadata !"0x2e\00f1\00f1\00_Z2f1v\002\000\001\000\006\00256\000\002", metadata !1, metadata !11, metadata !7, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f1]
-!13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!15 = metadata !{metadata !"clang version 3.5.0 "}
-!16 = metadata !{metadata !"0x100\00b\003\000", metadata !17, metadata !11, metadata !18} ; [ DW_TAG_auto_variable ] [b] [line 3]
-!17 = metadata !{metadata !"0xb\003\000\001", metadata !1, metadata !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/inline-scopes.cpp]
-!18 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
-!19 = metadata !{i32 3, i32 0, metadata !17, metadata !20}
-!20 = metadata !{i32 8, i32 0, metadata !4, null}
-!21 = metadata !{i32 4, i32 0, metadata !17, metadata !20}
-!22 = metadata !{i32 5, i32 0, metadata !12, metadata !20}
-!23 = metadata !{i32 6, i32 0, metadata !12, metadata !20}
-!24 = metadata !{metadata !"0x100\00b\002\000", metadata !25, metadata !6, metadata !18} ; [ DW_TAG_auto_variable ] [b] [line 2]
-!25 = metadata !{metadata !"0xb\002\000\000", metadata !5, metadata !26} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/y.cc]
-!26 = metadata !{metadata !"0xb\000", metadata !5, metadata !10} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/y.cc]
-!27 = metadata !{i32 2, i32 0, metadata !25, metadata !28}
-!28 = metadata !{i32 9, i32 0, metadata !4, null}
-!29 = metadata !{i32 3, i32 0, metadata !25, metadata !28}
-!30 = metadata !{i32 4, i32 0, metadata !26, metadata !28}
-!31 = metadata !{i32 5, i32 0, metadata !26, metadata !28}
-!32 = metadata !{i32 10, i32 0, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline-scopes.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"inline-scopes.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !10, !12}
+!4 = !{!"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", !5, !6, !7, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!5 = !{!"y.cc", !"/tmp/dbginfo"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/y.cc]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x2e\00f2\00f2\00_Z2f2v\008\000\001\000\006\00256\000\008", !1, !11, !7, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 8] [def] [f2]
+!11 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline-scopes.cpp]
+!12 = !{!"0x2e\00f1\00f1\00_Z2f1v\002\000\001\000\006\00256\000\002", !1, !11, !7, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f1]
+!13 = !{i32 2, !"Dwarf Version", i32 4}
+!14 = !{i32 1, !"Debug Info Version", i32 2}
+!15 = !{!"clang version 3.5.0 "}
+!16 = !{!"0x100\00b\003\000", !17, !11, !18} ; [ DW_TAG_auto_variable ] [b] [line 3]
+!17 = !{!"0xb\003\000\001", !1, !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/inline-scopes.cpp]
+!18 = !{!"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!19 = !MDLocation(line: 3, scope: !17, inlinedAt: !20)
+!20 = !MDLocation(line: 8, scope: !4)
+!21 = !MDLocation(line: 4, scope: !17, inlinedAt: !20)
+!22 = !MDLocation(line: 5, scope: !12, inlinedAt: !20)
+!23 = !MDLocation(line: 6, scope: !12, inlinedAt: !20)
+!24 = !{!"0x100\00b\002\000", !25, !6, !18} ; [ DW_TAG_auto_variable ] [b] [line 2]
+!25 = !{!"0xb\002\000\000", !5, !26} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/y.cc]
+!26 = !{!"0xb\000", !5, !10} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/y.cc]
+!27 = !MDLocation(line: 2, scope: !25, inlinedAt: !28)
+!28 = !MDLocation(line: 9, scope: !4)
+!29 = !MDLocation(line: 3, scope: !25, inlinedAt: !28)
+!30 = !MDLocation(line: 4, scope: !26, inlinedAt: !28)
+!31 = !MDLocation(line: 5, scope: !26, inlinedAt: !28)
+!32 = !MDLocation(line: 10, scope: !4)
diff --git a/test/DebugInfo/inlined-arguments.ll b/test/DebugInfo/inlined-arguments.ll
index 71d4414..c705cf8 100644
--- a/test/DebugInfo/inlined-arguments.ll
+++ b/test/DebugInfo/inlined-arguments.ll
@@ -24,16 +24,16 @@
 
 ; Function Attrs: uwtable
 define void @_Z2f2v() #0 {
-  tail call void @llvm.dbg.value(metadata !15, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
-  tail call void @llvm.dbg.value(metadata !19, i64 0, metadata !20, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 undef, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 2, i64 0, metadata !20, metadata !{!"0x102"}), !dbg !18
   tail call void @_Z2f3i(i32 2), !dbg !21
   ret void, !dbg !22
 }
 
 ; Function Attrs: uwtable
 define void @_Z2f1ii(i32 %x, i32 %y) #0 {
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !23
-  tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !23
+  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !23
+  tail call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !23
   tail call void @_Z2f3i(i32 %y), !dbg !24
   ret void, !dbg !25
 }
@@ -50,30 +50,30 @@ attributes #2 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!26}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/exp.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"exp.cpp", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{metadata !"0x2e\00f2\00f2\00_Z2f2v\003\000\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f2v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f2]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/exp.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{metadata !"0x2e\00f1\00f1\00_Z2f1ii\006\000\001\000\006\00256\001\006", metadata !1, metadata !5, metadata !9, null, void (i32, i32)* @_Z2f1ii, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{null, metadata !11, metadata !11}
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{metadata !13, metadata !14}
-!13 = metadata !{metadata !"0x101\00x\0016777222\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [x] [line 6]
-!14 = metadata !{metadata !"0x101\00y\0033554438\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [y] [line 6]
-!15 = metadata !{i32 undef}
-!16 = metadata !{metadata !"0x101\00x\0016777222\000", metadata !8, metadata !5, metadata !11, metadata !17} ; [ DW_TAG_arg_variable ] [x] [line 6]
-!17 = metadata !{i32 4, i32 0, metadata !4, null}
-!18 = metadata !{i32 6, i32 0, metadata !8, metadata !17}
-!19 = metadata !{i32 2}
-!20 = metadata !{metadata !"0x101\00y\0033554438\000", metadata !8, metadata !5, metadata !11, metadata !17} ; [ DW_TAG_arg_variable ] [y] [line 6]
-!21 = metadata !{i32 7, i32 0, metadata !8, metadata !17}
-!22 = metadata !{i32 5, i32 0, metadata !4, null}
-!23 = metadata !{i32 6, i32 0, metadata !8, null}
-!24 = metadata !{i32 7, i32 0, metadata !8, null}
-!25 = metadata !{i32 8, i32 0, metadata !8, null}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 \001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/exp.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"exp.cpp", !"/usr/local/google/home/blaikie/dev/scratch"}
+!2 = !{}
+!3 = !{!4, !8}
+!4 = !{!"0x2e\00f2\00f2\00_Z2f2v\003\000\001\000\006\00256\001\003", !1, !5, !6, null, void ()* @_Z2f2v, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f2]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/exp.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!"0x2e\00f1\00f1\00_Z2f1ii\006\000\001\000\006\00256\001\006", !1, !5, !9, null, void (i32, i32)* @_Z2f1ii, null, null, !12} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
+!9 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{null, !11, !11}
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{!13, !14}
+!13 = !{!"0x101\00x\0016777222\000", !8, !5, !11} ; [ DW_TAG_arg_variable ] [x] [line 6]
+!14 = !{!"0x101\00y\0033554438\000", !8, !5, !11} ; [ DW_TAG_arg_variable ] [y] [line 6]
+!15 = !{i32 undef}
+!16 = !{!"0x101\00x\0016777222\000", !8, !5, !11, !17} ; [ DW_TAG_arg_variable ] [x] [line 6]
+!17 = !MDLocation(line: 4, scope: !4)
+!18 = !MDLocation(line: 6, scope: !8, inlinedAt: !17)
+!19 = !{i32 2}
+!20 = !{!"0x101\00y\0033554438\000", !8, !5, !11, !17} ; [ DW_TAG_arg_variable ] [y] [line 6]
+!21 = !MDLocation(line: 7, scope: !8, inlinedAt: !17)
+!22 = !MDLocation(line: 5, scope: !4)
+!23 = !MDLocation(line: 6, scope: !8)
+!24 = !MDLocation(line: 7, scope: !8)
+!25 = !MDLocation(line: 8, scope: !8)
+!26 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/inlined-vars.ll b/test/DebugInfo/inlined-vars.ll
index 1c540ec..b84e12f 100644
--- a/test/DebugInfo/inlined-vars.ll
+++ b/test/DebugInfo/inlined-vars.ll
@@ -4,8 +4,8 @@
 
 define i32 @main() uwtable {
 entry:
-  tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !21
-  tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !23
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !22, metadata !{!"0x102"}), !dbg !23
   tail call void @smth(i32 0), !dbg !24
   tail call void @smth(i32 0), !dbg !25
   ret i32 0, !dbg !19
@@ -18,39 +18,39 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!27}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 159419)\001\00\000\00\000", metadata !26, metadata !2, metadata !2, metadata !3, metadata !2,  metadata !2} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 0}
-!2 = metadata !{}
-!3 = metadata !{metadata !5, metadata !10}
-!5 = metadata !{metadata !"0x2e\00main\00main\00\0010\000\001\000\006\00256\001\0010", metadata !26, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0x2e\00f\00f\00_ZL1fi\003\001\001\000\006\00256\001\003", metadata !26, metadata !6, metadata !11, null, null, null, null, metadata !13} ; [ DW_TAG_subprogram ]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !9, metadata !9}
-!13 = metadata !{metadata !15, metadata !16}
-!15 = metadata !{metadata !"0x101\00argument\0016777219\000", metadata !10, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
+!0 = !{!"0x11\004\00clang version 3.2 (trunk 159419)\001\00\000\00\000", !26, !2, !2, !3, !2,  !2} ; [ DW_TAG_compile_unit ]
+!1 = !{i32 0}
+!2 = !{}
+!3 = !{!5, !10}
+!5 = !{!"0x2e\00main\00main\00\0010\000\001\000\006\00256\001\0010", !26, !6, !7, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !26} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = !{!"0x2e\00f\00f\00_ZL1fi\003\001\001\000\006\00256\001\003", !26, !6, !11, null, null, null, null, !13} ; [ DW_TAG_subprogram ]
+!11 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!9, !9}
+!13 = !{!15, !16}
+!15 = !{!"0x101\00argument\0016777219\000", !10, !6, !9} ; [ DW_TAG_arg_variable ]
 
 ; Two DW_TAG_formal_parameter: one abstract and one inlined.
 ; ARGUMENT: {{.*Abbrev.*DW_TAG_formal_parameter}}
 ; ARGUMENT: {{.*Abbrev.*DW_TAG_formal_parameter}}
 ; ARGUMENT-NOT: {{.*Abbrev.*DW_TAG_formal_parameter}}
 
-!16 = metadata !{metadata !"0x100\00local\004\000", metadata !10, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
+!16 = !{!"0x100\00local\004\000", !10, !6, !9} ; [ DW_TAG_auto_variable ]
 
 ; Two DW_TAG_variable: one abstract and one inlined.
 ; VARIABLE: {{.*Abbrev.*DW_TAG_variable}}
 ; VARIABLE: {{.*Abbrev.*DW_TAG_variable}}
 ; VARIABLE-NOT: {{.*Abbrev.*DW_TAG_variable}}
 
-!18 = metadata !{metadata !"0x101\00argument\0016777219\000", metadata !10, metadata !6, metadata !9, metadata !19} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 11, i32 10, metadata !5, null}
-!21 = metadata !{i32 3, i32 25, metadata !10, metadata !19}
-!22 = metadata !{metadata !"0x100\00local\004\000", metadata !10, metadata !6, metadata !9, metadata !19} ; [ DW_TAG_auto_variable ]
-!23 = metadata !{i32 4, i32 16, metadata !10, metadata !19}
-!24 = metadata !{i32 5, i32 3, metadata !10, metadata !19}
-!25 = metadata !{i32 6, i32 3, metadata !10, metadata !19}
-!26 = metadata !{metadata !"inline-bug.cc", metadata !"/tmp/dbginfo/pr13202"}
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!18 = !{!"0x101\00argument\0016777219\000", !10, !6, !9, !19} ; [ DW_TAG_arg_variable ]
+!19 = !MDLocation(line: 11, column: 10, scope: !5)
+!21 = !MDLocation(line: 3, column: 25, scope: !10, inlinedAt: !19)
+!22 = !{!"0x100\00local\004\000", !10, !6, !9, !19} ; [ DW_TAG_auto_variable ]
+!23 = !MDLocation(line: 4, column: 16, scope: !10, inlinedAt: !19)
+!24 = !MDLocation(line: 5, column: 3, scope: !10, inlinedAt: !19)
+!25 = !MDLocation(line: 6, column: 3, scope: !10, inlinedAt: !19)
+!26 = !{!"inline-bug.cc", !"/tmp/dbginfo/pr13202"}
+!27 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/location-verifier.ll b/test/DebugInfo/location-verifier.ll
new file mode 100644
index 0000000..0e56be4
--- /dev/null
+++ b/test/DebugInfo/location-verifier.ll
@@ -0,0 +1,33 @@
+; RUN: not llvm-as -disable-output -verify-debug-info < %s 2>&1 | FileCheck %s
+; ModuleID = 'test.c'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo() #0 {
+entry:
+  ret i32 42, !dbg !13
+}
+
+attributes #0 = { nounwind ssp uwtable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10, !11}
+!llvm.ident = !{!12}
+
+!0 = !{!"0x11\0012\00clang version 3.7.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/test.c] [DW_LANG_C99]
+!1 = !{!"test.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\000\000\000\001", !1, !5, !6, null, i32 ()* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}                               ; [ DW_TAG_file_type ] [/test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 2}
+!11 = !{i32 1, !"PIC Level", i32 2}
+!12 = !{!"clang version 3.7.0 "}
+; An old-style MDLocation should not pass verify.
+; CHECK: DISubprogram does not Verify
+!13 = !{i32 2, i32 2, !4, null}
diff --git a/test/DebugInfo/lto-comp-dir.ll b/test/DebugInfo/lto-comp-dir.ll
index f07b751..a79cf32 100644
--- a/test/DebugInfo/lto-comp-dir.ll
+++ b/test/DebugInfo/lto-comp-dir.ll
@@ -59,26 +59,26 @@ attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="
 !llvm.module.flags = !{!16, !17}
 !llvm.ident = !{!18, !18}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"a.cpp", metadata !"/tmp/dbginfo/a"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00func\00func\00_Z4funcv\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @_Z4funcv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a/a.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !9, metadata !2, metadata !2, metadata !10, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
-!9 = metadata !{metadata !"b.cpp", metadata !"/tmp/dbginfo/b"}
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0x2e\00main\00main\00\002\000\001\000\006\00256\000\002", metadata !9, metadata !12, metadata !13, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
-!12 = metadata !{metadata !"0x29", metadata !9}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/b/b.cpp]
-!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{metadata !15}
-!15 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!16 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!18 = metadata !{metadata !"clang version 3.5.0 "}
-!19 = metadata !{i32 2, i32 0, metadata !4, null}
-!20 = metadata !{i32 3, i32 0, metadata !11, null}
-!21 = metadata !{i32 4, i32 0, metadata !11, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ]
+!1 = !{!"a.cpp", !"/tmp/dbginfo/a"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00func\00func\00_Z4funcv\001\000\001\000\006\00256\000\001", !1, !5, !6, null, void ()* @_Z4funcv, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a/a.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !9, !2, !2, !10, !2, !2} ; [ DW_TAG_compile_unit ]
+!9 = !{!"b.cpp", !"/tmp/dbginfo/b"}
+!10 = !{!11}
+!11 = !{!"0x2e\00main\00main\00\002\000\001\000\006\00256\000\002", !9, !12, !13, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
+!12 = !{!"0x29", !9}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/b/b.cpp]
+!13 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{!15}
+!15 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!16 = !{i32 2, !"Dwarf Version", i32 4}
+!17 = !{i32 1, !"Debug Info Version", i32 2}
+!18 = !{!"clang version 3.5.0 "}
+!19 = !MDLocation(line: 2, scope: !4)
+!20 = !MDLocation(line: 3, scope: !11)
+!21 = !MDLocation(line: 4, scope: !11)
 
diff --git a/test/DebugInfo/member-order.ll b/test/DebugInfo/member-order.ll
index de485a6..ae84571 100644
--- a/test/DebugInfo/member-order.ll
+++ b/test/DebugInfo/member-order.ll
@@ -29,7 +29,7 @@ define void @_ZN3foo2f1Ev(%struct.foo* %this) #0 align 2 {
 entry:
   %this.addr = alloca %struct.foo*, align 8
   store %struct.foo* %this, %struct.foo** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
+  call void @llvm.dbg.declare(metadata %struct.foo** %this.addr, metadata !16, metadata !{!"0x102"}), !dbg !18
   %this1 = load %struct.foo** %this.addr
   ret void, !dbg !19
 }
@@ -43,24 +43,24 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!15, !20}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !13, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/member-order.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"member-order.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !11}
-!6 = metadata !{metadata !"0x2e\00f1\00f1\00_ZN3foo2f1Ev\002\000\000\000\006\00256\000\002", metadata !1, metadata !4, metadata !7, null, null, null, i32 0, metadata !10} ; [ DW_TAG_subprogram ] [line 2] [f1]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9}
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
-!10 = metadata !{i32 786468}
-!11 = metadata !{metadata !"0x2e\00f2\00f2\00_ZN3foo2f2Ev\003\000\000\000\006\00256\000\003", metadata !1, metadata !4, metadata !7, null, null, null, i32 0, metadata !12} ; [ DW_TAG_subprogram ] [line 3] [f2]
-!12 = metadata !{i32 786468}
-!13 = metadata !{metadata !14}
-!14 = metadata !{metadata !"0x2e\00f1\00f1\00_ZN3foo2f1Ev\006\000\001\000\006\00256\000\006", metadata !1, null, metadata !7, null, void (%struct.foo*)* @_ZN3foo2f1Ev, null, metadata !6, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
-!15 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!16 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !14, null, metadata !17} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
-!18 = metadata !{i32 0, i32 0, metadata !14, null}
-!19 = metadata !{i32 7, i32 0, metadata !14, null}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 \000\00\000\00\000", !1, !2, !3, !13, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/member-order.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"member-order.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00foo\001\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6, !11}
+!6 = !{!"0x2e\00f1\00f1\00_ZN3foo2f1Ev\002\000\000\000\006\00256\000\002", !1, !4, !7, null, null, null, i32 0, !10} ; [ DW_TAG_subprogram ] [line 2] [f1]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9}
+!9 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
+!10 = !{i32 786468}
+!11 = !{!"0x2e\00f2\00f2\00_ZN3foo2f2Ev\003\000\000\000\006\00256\000\003", !1, !4, !7, null, null, null, i32 0, !12} ; [ DW_TAG_subprogram ] [line 3] [f2]
+!12 = !{i32 786468}
+!13 = !{!14}
+!14 = !{!"0x2e\00f1\00f1\00_ZN3foo2f1Ev\006\000\001\000\006\00256\000\006", !1, null, !7, null, void (%struct.foo*)* @_ZN3foo2f1Ev, null, !6, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
+!15 = !{i32 2, !"Dwarf Version", i32 4}
+!16 = !{!"0x101\00this\0016777216\001088", !14, null, !17} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!17 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
+!18 = !MDLocation(line: 0, scope: !14)
+!19 = !MDLocation(line: 7, scope: !14)
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/member-pointers.ll b/test/DebugInfo/member-pointers.ll
index 4d45ba6..54bb0a5 100644
--- a/test/DebugInfo/member-pointers.ll
+++ b/test/DebugInfo/member-pointers.ll
@@ -23,18 +23,18 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!16}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 \000\00\000\00\000", metadata !15, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/home/blaikie/Development/scratch/simple.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{}
-!3 = metadata !{metadata !5, metadata !10}
-!5 = metadata !{metadata !"0x34\00x\00x\00\004\000\001", null, metadata !6, metadata !7, i64* @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
-!6 = metadata !{metadata !"0x29", metadata !15} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x1f\00\000\000\000\000\000", null, null, metadata !8, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x13\00S\001\008\008\000\000\000", metadata !15, null, null, metadata !1, null, null, null} ; [ DW_TAG_structure_type ] [S] [line 1, size 8, align 8, offset 0] [def] [from ]
-!10 = metadata !{metadata !"0x34\00y\00y\00\005\000\001", null, metadata !6, metadata !11, { i64, i64 }* @y, null} ; [ DW_TAG_variable ] [y] [line 5] [def]
-!11 = metadata !{metadata !"0x1f\00\000\000\000\000\000", null, null, metadata !12, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!13 = metadata !{null, metadata !14, metadata !8}
-!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from S]
-!15 = metadata !{metadata !"simple.cpp", metadata !"/home/blaikie/Development/scratch"}
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.3 \000\00\000\00\000", !15, !1, !1, !1, !3,  !1} ; [ DW_TAG_compile_unit ] [/home/blaikie/Development/scratch/simple.cpp] [DW_LANG_C_plus_plus]
+!1 = !{}
+!3 = !{!5, !10}
+!5 = !{!"0x34\00x\00x\00\004\000\001", null, !6, !7, i64* @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
+!6 = !{!"0x29", !15} ; [ DW_TAG_file_type ]
+!7 = !{!"0x1f\00\000\000\000\000\000", null, null, !8, !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x13\00S\001\008\008\000\000\000", !15, null, null, !1, null, null, null} ; [ DW_TAG_structure_type ] [S] [line 1, size 8, align 8, offset 0] [def] [from ]
+!10 = !{!"0x34\00y\00y\00\005\000\001", null, !6, !11, { i64, i64 }* @y, null} ; [ DW_TAG_variable ] [y] [line 5] [def]
+!11 = !{!"0x1f\00\000\000\000\000\000", null, null, !12, !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = !{null, !14, !8}
+!14 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from S]
+!15 = !{!"simple.cpp", !"/home/blaikie/Development/scratch"}
+!16 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/missing-abstract-variable.ll b/test/DebugInfo/missing-abstract-variable.ll
index 104080a..dcaa2db 100644
--- a/test/DebugInfo/missing-abstract-variable.ll
+++ b/test/DebugInfo/missing-abstract-variable.ll
@@ -99,7 +99,7 @@
 ; Function Attrs: uwtable
 define void @_Z1bv() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !24, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !27
+  tail call void @llvm.dbg.value(metadata i1 false, i64 0, metadata !25, metadata !{!"0x102"}), !dbg !27
   tail call void @_Z1fi(i32 0), !dbg !28
   ret void, !dbg !29
 }
@@ -107,13 +107,13 @@ entry:
 ; Function Attrs: uwtable
 define void @_Z1ab(i1 zeroext %u) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i1 %u}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i1 %u}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !33
+  tail call void @llvm.dbg.value(metadata i1 %u, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i1 %u, i64 0, metadata !31, metadata !{!"0x102"}), !dbg !33
   br i1 %u, label %if.then.i, label %_Z1xb.exit, !dbg !34
 
 if.then.i:                                        ; preds = %entry
   %0 = load i32* @t, align 4, !dbg !35, !tbaa !36
-  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !40, metadata !{metadata !"0x102"}), !dbg !35
+  tail call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !40, metadata !{!"0x102"}), !dbg !35
   tail call void @_Z1fi(i32 %0), !dbg !41
   br label %_Z1xb.exit, !dbg !42
 
@@ -135,48 +135,48 @@ attributes #2 = { nounwind readnone }
 !llvm.module.flags = !{!21, !22}
 !llvm.ident = !{!23}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/missing-abstract-variables.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"missing-abstract-variables.cc", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !8, metadata !14}
-!4 = metadata !{metadata !"0x2e\00b\00b\00_Z1bv\0013\000\001\000\006\00256\001\0013", metadata !1, metadata !5, metadata !6, null, void ()* @_Z1bv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 13] [def] [b]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/missing-abstract-variables.cc]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{metadata !"0x2e\00a\00a\00_Z1ab\0017\000\001\000\006\00256\001\0017", metadata !1, metadata !5, metadata !9, null, void (i1)* @_Z1ab, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 17] [def] [a]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{null, metadata !11}
-!11 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x101\00u\0016777233\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [u] [line 17]
-!14 = metadata !{metadata !"0x2e\00x\00x\00_Z1xb\005\000\001\000\006\00256\001\005", metadata !1, metadata !5, metadata !9, null, null, null, null, metadata !15} ; [ DW_TAG_subprogram ] [line 5] [def] [x]
-!15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{metadata !"0x101\00b\0016777221\000", metadata !14, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [b] [line 5]
-!17 = metadata !{metadata !"0x100\00s\007\000", metadata !18, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [s] [line 7]
-!18 = metadata !{metadata !"0xb\006\000\000", metadata !1, metadata !19} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/missing-abstract-variables.cc]
-!19 = metadata !{metadata !"0xb\006\000\000", metadata !1, metadata !14} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/missing-abstract-variables.cc]
-!20 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!22 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!23 = metadata !{metadata !"clang version 3.5.0 "}
-!24 = metadata !{i1 false}
-!25 = metadata !{metadata !"0x101\00b\0016777221\000", metadata !14, metadata !5, metadata !11, metadata !26} ; [ DW_TAG_arg_variable ] [b] [line 5]
-!26 = metadata !{i32 14, i32 0, metadata !4, null}
-!27 = metadata !{i32 5, i32 0, metadata !14, metadata !26}
-!28 = metadata !{i32 10, i32 0, metadata !14, metadata !26}
-!29 = metadata !{i32 15, i32 0, metadata !4, null}
-!30 = metadata !{i32 17, i32 0, metadata !8, null}
-!31 = metadata !{metadata !"0x101\00b\0016777221\000", metadata !14, metadata !5, metadata !11, metadata !32} ; [ DW_TAG_arg_variable ] [b] [line 5]
-!32 = metadata !{i32 18, i32 0, metadata !8, null}
-!33 = metadata !{i32 5, i32 0, metadata !14, metadata !32}
-!34 = metadata !{i32 6, i32 0, metadata !19, metadata !32}
-!35 = metadata !{i32 7, i32 0, metadata !18, metadata !32}
-!36 = metadata !{metadata !37, metadata !37, i64 0}
-!37 = metadata !{metadata !"int", metadata !38, i64 0}
-!38 = metadata !{metadata !"omnipotent char", metadata !39, i64 0}
-!39 = metadata !{metadata !"Simple C/C++ TBAA"}
-!40 = metadata !{metadata !"0x100\00s\007\000", metadata !18, metadata !5, metadata !20, metadata !32} ; [ DW_TAG_auto_variable ] [s] [line 7]
-!41 = metadata !{i32 8, i32 0, metadata !18, metadata !32}
-!42 = metadata !{i32 9, i32 0, metadata !18, metadata !32}
-!43 = metadata !{i32 10, i32 0, metadata !14, metadata !32}
-!44 = metadata !{i32 19, i32 0, metadata !8, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/missing-abstract-variables.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"missing-abstract-variables.cc", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !8, !14}
+!4 = !{!"0x2e\00b\00b\00_Z1bv\0013\000\001\000\006\00256\001\0013", !1, !5, !6, null, void ()* @_Z1bv, null, null, !2} ; [ DW_TAG_subprogram ] [line 13] [def] [b]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/missing-abstract-variables.cc]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!"0x2e\00a\00a\00_Z1ab\0017\000\001\000\006\00256\001\0017", !1, !5, !9, null, void (i1)* @_Z1ab, null, null, !12} ; [ DW_TAG_subprogram ] [line 17] [def] [a]
+!9 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{null, !11}
+!11 = !{!"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!12 = !{!13}
+!13 = !{!"0x101\00u\0016777233\000", !8, !5, !11} ; [ DW_TAG_arg_variable ] [u] [line 17]
+!14 = !{!"0x2e\00x\00x\00_Z1xb\005\000\001\000\006\00256\001\005", !1, !5, !9, null, null, null, null, !15} ; [ DW_TAG_subprogram ] [line 5] [def] [x]
+!15 = !{!16, !17}
+!16 = !{!"0x101\00b\0016777221\000", !14, !5, !11} ; [ DW_TAG_arg_variable ] [b] [line 5]
+!17 = !{!"0x100\00s\007\000", !18, !5, !20} ; [ DW_TAG_auto_variable ] [s] [line 7]
+!18 = !{!"0xb\006\000\000", !1, !19} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/missing-abstract-variables.cc]
+!19 = !{!"0xb\006\000\000", !1, !14} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/missing-abstract-variables.cc]
+!20 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!21 = !{i32 2, !"Dwarf Version", i32 4}
+!22 = !{i32 2, !"Debug Info Version", i32 2}
+!23 = !{!"clang version 3.5.0 "}
+!24 = !{i1 false}
+!25 = !{!"0x101\00b\0016777221\000", !14, !5, !11, !26} ; [ DW_TAG_arg_variable ] [b] [line 5]
+!26 = !MDLocation(line: 14, scope: !4)
+!27 = !MDLocation(line: 5, scope: !14, inlinedAt: !26)
+!28 = !MDLocation(line: 10, scope: !14, inlinedAt: !26)
+!29 = !MDLocation(line: 15, scope: !4)
+!30 = !MDLocation(line: 17, scope: !8)
+!31 = !{!"0x101\00b\0016777221\000", !14, !5, !11, !32} ; [ DW_TAG_arg_variable ] [b] [line 5]
+!32 = !MDLocation(line: 18, scope: !8)
+!33 = !MDLocation(line: 5, scope: !14, inlinedAt: !32)
+!34 = !MDLocation(line: 6, scope: !19, inlinedAt: !32)
+!35 = !MDLocation(line: 7, scope: !18, inlinedAt: !32)
+!36 = !{!37, !37, i64 0}
+!37 = !{!"int", !38, i64 0}
+!38 = !{!"omnipotent char", !39, i64 0}
+!39 = !{!"Simple C/C++ TBAA"}
+!40 = !{!"0x100\00s\007\000", !18, !5, !20, !32} ; [ DW_TAG_auto_variable ] [s] [line 7]
+!41 = !MDLocation(line: 8, scope: !18, inlinedAt: !32)
+!42 = !MDLocation(line: 9, scope: !18, inlinedAt: !32)
+!43 = !MDLocation(line: 10, scope: !14, inlinedAt: !32)
+!44 = !MDLocation(line: 19, scope: !8)
diff --git a/test/DebugInfo/multiline.ll b/test/DebugInfo/multiline.ll
new file mode 100644
index 0000000..e67af32
--- /dev/null
+++ b/test/DebugInfo/multiline.ll
@@ -0,0 +1,82 @@
+; RUN: llc -filetype=asm -asm-verbose=0 -O0 < %s | FileCheck %s
+; RUN: llc -filetype=obj -O0 < %s | llvm-dwarfdump -debug-dump=line - | FileCheck %s --check-prefix=INT
+; XFAIL: hexagon
+
+; Check that the assembly output properly handles is_stmt changes. And since
+; we're testing anyway, check the integrated assembler too.
+
+; Generated with clang from multiline.c:
+; void f1();
+; void f2() {
+;   f1(); f1(); f1();
+;   f1(); f1(); f1();
+; }
+
+
+; CHECK: .loc	1 2 0{{$}}
+; CHECK-NOT: .loc{{ }}
+; CHECK: .loc	1 3 3 prologue_end{{$}}
+; CHECK-NOT: .loc
+; CHECK: .loc	1 3 9 is_stmt 0{{$}}
+; CHECK-NOT: .loc
+; CHECK: .loc	1 3 15{{$}}
+; CHECK-NOT: .loc
+; CHECK: .loc	1 4 3 is_stmt 1{{$}}
+; CHECK-NOT: .loc
+; CHECK: .loc	1 4 9 is_stmt 0{{$}}
+; CHECK-NOT: .loc
+; CHECK: .loc	1 4 15{{$}}
+; CHECK-NOT: .loc
+; CHECK: .loc	1 5 1 is_stmt 1{{$}}
+
+; INT: {{^}}Address
+; INT: -----
+; INT-NEXT: 2 0 1 0 0 is_stmt{{$}}
+; INT-NEXT: 3 3 1 0 0 is_stmt prologue_end{{$}}
+; INT-NEXT: 3 9 1 0 0 {{$}}
+; INT-NEXT: 3 15 1 0 0 {{$}}
+; INT-NEXT: 4 3 1 0 0 is_stmt{{$}}
+; INT-NEXT: 4 9 1 0 0 {{$}}
+; INT-NEXT: 4 15 1 0 0 {{$}}
+; INT-NEXT: 5 1 1 0 0 is_stmt{{$}}
+
+
+; Function Attrs: nounwind uwtable
+define void @f2() #0 {
+entry:
+  call void (...)* @f1(), !dbg !11
+  call void (...)* @f1(), !dbg !12
+  call void (...)* @f1(), !dbg !13
+  call void (...)* @f1(), !dbg !14
+  call void (...)* @f1(), !dbg !15
+  call void (...)* @f1(), !dbg !16
+  ret void, !dbg !17
+}
+
+declare void @f1(...) #1
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = !{!"0x11\0012\00clang version 3.6.0 (trunk 225000) (llvm/trunk 224999)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/multiline.c] [DW_LANG_C99]
+!1 = !{!"multiline.c", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00f2\00f2\00\002\000\001\000\000\000\000\002", !1, !5, !6, null, void ()* @f2, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f2]
+!5 = !{!"0x29", !1}                               ; [ DW_TAG_file_type ] [/tmp/dbginfo/multiline.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.6.0 (trunk 225000) (llvm/trunk 224999)"}
+!11 = !MDLocation(line: 3, column: 3, scope: !4)
+!12 = !MDLocation(line: 3, column: 9, scope: !4)
+!13 = !MDLocation(line: 3, column: 15, scope: !4)
+!14 = !MDLocation(line: 4, column: 3, scope: !4)
+!15 = !MDLocation(line: 4, column: 9, scope: !4)
+!16 = !MDLocation(line: 4, column: 15, scope: !4)
+!17 = !MDLocation(line: 5, column: 1, scope: !4)
diff --git a/test/DebugInfo/namespace.ll b/test/DebugInfo/namespace.ll
index edbeed5..a4fdbd2 100644
--- a/test/DebugInfo/namespace.ll
+++ b/test/DebugInfo/namespace.ll
@@ -216,7 +216,7 @@ define void @_ZN1A1B2f1Ei(i32) #0 {
 entry:
   %.addr = alloca i32, align 4
   store i32 %0, i32* %.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %.addr}, metadata !61, metadata !62), !dbg !63
+  call void @llvm.dbg.declare(metadata i32* %.addr, metadata !61, metadata !62), !dbg !63
   ret void, !dbg !64
 }
 
@@ -237,7 +237,7 @@ entry:
   %b.addr = alloca i8, align 1
   %frombool = zext i1 %b to i8
   store i8 %frombool, i8* %b.addr, align 1
-  call void @llvm.dbg.declare(metadata !{i8* %b.addr}, metadata !66, metadata !62), !dbg !67
+  call void @llvm.dbg.declare(metadata i8* %b.addr, metadata !66, metadata !62), !dbg !67
   %0 = load i8* %b.addr, align 1, !dbg !68
   %tobool = trunc i8 %0 to i1, !dbg !68
   br i1 %tobool, label %if.then, label %if.end, !dbg !68
@@ -288,79 +288,79 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!57, !58}
 !llvm.ident = !{!59}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !9, metadata !30, metadata !33} ; [ DW_TAG_compile_unit ] [/tmp/debug-info-namespace.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"debug-info-namespace.cpp", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{metadata !"0x13\00foo\005\000\000\000\004\000", metadata !5, metadata !6, null, null, null, null, metadata !"_ZTSN1A1B3fooE"} ; [ DW_TAG_structure_type ] [foo] [line 5, size 0, align 0, offset 0] [decl] [from ]
-!5 = metadata !{metadata !"foo.cpp", metadata !"/tmp"}
-!6 = metadata !{metadata !"0x39\00B\001", metadata !5, metadata !7} ; [ DW_TAG_namespace ] [B] [line 1]
-!7 = metadata !{metadata !"0x39\00A\005", metadata !1, null} ; [ DW_TAG_namespace ] [A] [line 5]
-!8 = metadata !{metadata !"0x13\00bar\006\008\008\000\000\000", metadata !5, metadata !6, null, metadata !2, null, null, metadata !"_ZTSN1A1B3barE"} ; [ DW_TAG_structure_type ] [bar] [line 6, size 8, align 8, offset 0] [def] [from ]
-!9 = metadata !{metadata !10, metadata !14, metadata !17, metadata !21, metadata !25, metadata !26, metadata !27}
-!10 = metadata !{metadata !"0x2e\00f1\00f1\00_ZN1A1B2f1Ev\003\000\001\000\000\00256\000\003", metadata !5, metadata !6, metadata !11, null, i32 ()* @_ZN1A1B2f1Ev, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f1]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{metadata !"0x2e\00f1\00f1\00_ZN1A1B2f1Ei\004\000\001\000\000\00256\000\004", metadata !5, metadata !6, metadata !15, null, void (i32)* @_ZN1A1B2f1Ei, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [f1]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null, metadata !13}
-!17 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\0020\001\001\000\000\00256\000\0020", metadata !5, metadata !18, metadata !19, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 20] [local] [def] [__cxx_global_var_init]
-!18 = metadata !{metadata !"0x29", metadata !5}   ; [ DW_TAG_file_type ] [/tmp/foo.cpp]
-!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!20 = metadata !{null}
-!21 = metadata !{metadata !"0x2e\00func\00func\00_Z4funcb\0021\000\001\000\000\00256\000\0021", metadata !5, metadata !18, metadata !22, null, i32 (i1)* @_Z4funcb, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 21] [def] [func]
-!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!23 = metadata !{metadata !13, metadata !24}
-!24 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
-!25 = metadata !{metadata !"0x2e\00__cxx_global_var_init1\00__cxx_global_var_init1\00\0044\001\001\000\000\00256\000\0044", metadata !5, metadata !18, metadata !19, null, void ()* @__cxx_global_var_init1, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 44] [local] [def] [__cxx_global_var_init1]
-!26 = metadata !{metadata !"0x2e\00func_fwd\00func_fwd\00_ZN1A1B8func_fwdEv\0047\000\001\000\000\00256\000\0047", metadata !5, metadata !6, metadata !19, null, void ()* @_ZN1A1B8func_fwdEv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 47] [def] [func_fwd]
-!27 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__sub_I_debug_info_namespace.cpp\000\001\001\000\000\0064\000\000", metadata !1, metadata !28, metadata !29, null, void ()* @_GLOBAL__sub_I_debug_info_namespace.cpp, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 0] [local] [def]
-!28 = metadata !{metadata !"0x29", metadata !1}   ; [ DW_TAG_file_type ] [/tmp/debug-info-namespace.cpp]
-!29 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!30 = metadata !{metadata !31, metadata !32}
-!31 = metadata !{metadata !"0x34\00i\00i\00_ZN1A1B1iE\0020\000\001", metadata !6, metadata !18, metadata !13, i32* @_ZN1A1B1iE, null} ; [ DW_TAG_variable ] [i] [line 20] [def]
-!32 = metadata !{metadata !"0x34\00var_fwd\00var_fwd\00_ZN1A1B7var_fwdE\0044\000\001", metadata !6, metadata !18, metadata !13, i32* @_ZN1A1B7var_fwdE, null} ; [ DW_TAG_variable ] [var_fwd] [line 44] [def]
-!33 = metadata !{metadata !34, metadata !35, metadata !36, metadata !37, metadata !40, metadata !41, metadata !42, metadata !43, metadata !44, metadata !45, metadata !47, metadata !48, metadata !49, metadata !51, metadata !54, metadata !55, metadata !56}
-!34 = metadata !{metadata !"0x3a\0015\00", metadata !7, metadata !6} ; [ DW_TAG_imported_module ]
-!35 = metadata !{metadata !"0x3a\0018\00", metadata !0, metadata !7} ; [ DW_TAG_imported_module ]
-!36 = metadata !{metadata !"0x8\0019\00E", metadata !0, metadata !7} ; [ DW_TAG_imported_declaration ]
-!37 = metadata !{metadata !"0x3a\0023\00", metadata !38, metadata !6} ; [ DW_TAG_imported_module ]
-!38 = metadata !{metadata !"0xb\0022\0010\001", metadata !5, metadata !39} ; [ DW_TAG_lexical_block ] [/tmp/foo.cpp]
-!39 = metadata !{metadata !"0xb\0022\007\000", metadata !5, metadata !21} ; [ DW_TAG_lexical_block ] [/tmp/foo.cpp]
-!40 = metadata !{metadata !"0x3a\0026\00", metadata !21, metadata !7} ; [ DW_TAG_imported_module ]
-!41 = metadata !{metadata !"0x8\0027\00", metadata !21, metadata !"_ZTSN1A1B3fooE"} ; [ DW_TAG_imported_declaration ]
-!42 = metadata !{metadata !"0x8\0028\00", metadata !21, metadata !"_ZTSN1A1B3barE"} ; [ DW_TAG_imported_declaration ]
-!43 = metadata !{metadata !"0x8\0029\00", metadata !21, metadata !14} ; [ DW_TAG_imported_declaration ]
-!44 = metadata !{metadata !"0x8\0030\00", metadata !21, metadata !31} ; [ DW_TAG_imported_declaration ]
-!45 = metadata !{metadata !"0x8\0031\00", metadata !21, metadata !46} ; [ DW_TAG_imported_declaration ]
-!46 = metadata !{metadata !"0x16\00baz\007\000\000\000\000", metadata !5, metadata !6, metadata !"_ZTSN1A1B3barE"} ; [ DW_TAG_typedef ] [baz] [line 7, size 0, align 0, offset 0] [from _ZTSN1A1B3barE]
-!47 = metadata !{metadata !"0x8\0032\00X", metadata !21, metadata !7} ; [ DW_TAG_imported_declaration ]
-!48 = metadata !{metadata !"0x8\0033\00Y", metadata !21, metadata !47} ; [ DW_TAG_imported_declaration ]
-!49 = metadata !{metadata !"0x8\0034\00", metadata !21, metadata !50} ; [ DW_TAG_imported_declaration ]
-!50 = metadata !{metadata !"0x34\00var_decl\00var_decl\00_ZN1A1B8var_declE\008\000\000", metadata !6, metadata !18, metadata !13, null, null} ; [ DW_TAG_variable ] [var_decl] [line 8]
-!51 = metadata !{metadata !"0x8\0035\00", metadata !21, metadata !52} ; [ DW_TAG_imported_declaration ]
-!52 = metadata !{metadata !"0x2e\00func_decl\00func_decl\00_ZN1A1B9func_declEv\009\000\000\000\000\00256\000\000", metadata !5, metadata !6, metadata !19, null, null, null, null, metadata !53} ; [ DW_TAG_subprogram ] [line 9] [scope 0] [func_decl]
-!53 = metadata !{metadata !"0x24"}
-!54 = metadata !{metadata !"0x8\0036\00", metadata !21, metadata !32} ; [ DW_TAG_imported_declaration ]
-!55 = metadata !{metadata !"0x8\0037\00", metadata !21, metadata !26} ; [ DW_TAG_imported_declaration ]
-!56 = metadata !{metadata !"0x8\0042\00", metadata !7, metadata !31} ; [ DW_TAG_imported_declaration ]
-!57 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!58 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!59 = metadata !{metadata !"clang version 3.6.0 "}
-!60 = metadata !{i32 3, i32 12, metadata !10, null}
-!61 = metadata !{metadata !"0x101\00\0016777220\000", metadata !14, metadata !18, metadata !13} ; [ DW_TAG_arg_variable ] [line 4]
-!62 = metadata !{metadata !"0x102"}               ; [ DW_TAG_expression ]
-!63 = metadata !{i32 4, i32 12, metadata !14, null}
-!64 = metadata !{i32 4, i32 16, metadata !14, null}
-!65 = metadata !{i32 20, i32 12, metadata !17, null}
-!66 = metadata !{metadata !"0x101\00b\0016777237\000", metadata !21, metadata !18, metadata !24} ; [ DW_TAG_arg_variable ] [b] [line 21]
-!67 = metadata !{i32 21, i32 15, metadata !21, null}
-!68 = metadata !{i32 22, i32 7, metadata !21, null}
-!69 = metadata !{i32 24, i32 5, metadata !38, null}
-!70 = metadata !{i32 38, i32 3, metadata !21, null}
-!71 = metadata !{i32 39, i32 1, metadata !21, null}
-!72 = metadata !{i32 44, i32 15, metadata !25, null}
-!73 = metadata !{i32 47, i32 21, metadata !26, null}
-!74 = metadata !{i32 0, i32 0, metadata !75, null}
-!75 = metadata !{metadata !"0xb\000", metadata !5, metadata !27} ; [ DW_TAG_lexical_block ] [/tmp/foo.cpp]
+!0 = !{!"0x11\004\00clang version 3.6.0 \000\00\000\00\001", !1, !2, !3, !9, !30, !33} ; [ DW_TAG_compile_unit ] [/tmp/debug-info-namespace.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"debug-info-namespace.cpp", !"/tmp"}
+!2 = !{}
+!3 = !{!4, !8}
+!4 = !{!"0x13\00foo\005\000\000\000\004\000", !5, !6, null, null, null, null, !"_ZTSN1A1B3fooE"} ; [ DW_TAG_structure_type ] [foo] [line 5, size 0, align 0, offset 0] [decl] [from ]
+!5 = !{!"foo.cpp", !"/tmp"}
+!6 = !{!"0x39\00B\001", !5, !7} ; [ DW_TAG_namespace ] [B] [line 1]
+!7 = !{!"0x39\00A\005", !1, null} ; [ DW_TAG_namespace ] [A] [line 5]
+!8 = !{!"0x13\00bar\006\008\008\000\000\000", !5, !6, null, !2, null, null, !"_ZTSN1A1B3barE"} ; [ DW_TAG_structure_type ] [bar] [line 6, size 8, align 8, offset 0] [def] [from ]
+!9 = !{!10, !14, !17, !21, !25, !26, !27}
+!10 = !{!"0x2e\00f1\00f1\00_ZN1A1B2f1Ev\003\000\001\000\000\00256\000\003", !5, !6, !11, null, i32 ()* @_ZN1A1B2f1Ev, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f1]
+!11 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!13}
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = !{!"0x2e\00f1\00f1\00_ZN1A1B2f1Ei\004\000\001\000\000\00256\000\004", !5, !6, !15, null, void (i32)* @_ZN1A1B2f1Ei, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [f1]
+!15 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !13}
+!17 = !{!"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\0020\001\001\000\000\00256\000\0020", !5, !18, !19, null, void ()* @__cxx_global_var_init, null, null, !2} ; [ DW_TAG_subprogram ] [line 20] [local] [def] [__cxx_global_var_init]
+!18 = !{!"0x29", !5}   ; [ DW_TAG_file_type ] [/tmp/foo.cpp]
+!19 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = !{null}
+!21 = !{!"0x2e\00func\00func\00_Z4funcb\0021\000\001\000\000\00256\000\0021", !5, !18, !22, null, i32 (i1)* @_Z4funcb, null, null, !2} ; [ DW_TAG_subprogram ] [line 21] [def] [func]
+!22 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = !{!13, !24}
+!24 = !{!"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!25 = !{!"0x2e\00__cxx_global_var_init1\00__cxx_global_var_init1\00\0044\001\001\000\000\00256\000\0044", !5, !18, !19, null, void ()* @__cxx_global_var_init1, null, null, !2} ; [ DW_TAG_subprogram ] [line 44] [local] [def] [__cxx_global_var_init1]
+!26 = !{!"0x2e\00func_fwd\00func_fwd\00_ZN1A1B8func_fwdEv\0047\000\001\000\000\00256\000\0047", !5, !6, !19, null, void ()* @_ZN1A1B8func_fwdEv, null, null, !2} ; [ DW_TAG_subprogram ] [line 47] [def] [func_fwd]
+!27 = !{!"0x2e\00\00\00_GLOBAL__sub_I_debug_info_namespace.cpp\000\001\001\000\000\0064\000\000", !1, !28, !29, null, void ()* @_GLOBAL__sub_I_debug_info_namespace.cpp, null, null, !2} ; [ DW_TAG_subprogram ] [line 0] [local] [def]
+!28 = !{!"0x29", !1}   ; [ DW_TAG_file_type ] [/tmp/debug-info-namespace.cpp]
+!29 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = !{!31, !32}
+!31 = !{!"0x34\00i\00i\00_ZN1A1B1iE\0020\000\001", !6, !18, !13, i32* @_ZN1A1B1iE, null} ; [ DW_TAG_variable ] [i] [line 20] [def]
+!32 = !{!"0x34\00var_fwd\00var_fwd\00_ZN1A1B7var_fwdE\0044\000\001", !6, !18, !13, i32* @_ZN1A1B7var_fwdE, null} ; [ DW_TAG_variable ] [var_fwd] [line 44] [def]
+!33 = !{!34, !35, !36, !37, !40, !41, !42, !43, !44, !45, !47, !48, !49, !51, !54, !55, !56}
+!34 = !{!"0x3a\0015\00", !7, !6} ; [ DW_TAG_imported_module ]
+!35 = !{!"0x3a\0018\00", !0, !7} ; [ DW_TAG_imported_module ]
+!36 = !{!"0x8\0019\00E", !0, !7} ; [ DW_TAG_imported_declaration ]
+!37 = !{!"0x3a\0023\00", !38, !6} ; [ DW_TAG_imported_module ]
+!38 = !{!"0xb\0022\0010\001", !5, !39} ; [ DW_TAG_lexical_block ] [/tmp/foo.cpp]
+!39 = !{!"0xb\0022\007\000", !5, !21} ; [ DW_TAG_lexical_block ] [/tmp/foo.cpp]
+!40 = !{!"0x3a\0026\00", !21, !7} ; [ DW_TAG_imported_module ]
+!41 = !{!"0x8\0027\00", !21, !"_ZTSN1A1B3fooE"} ; [ DW_TAG_imported_declaration ]
+!42 = !{!"0x8\0028\00", !21, !"_ZTSN1A1B3barE"} ; [ DW_TAG_imported_declaration ]
+!43 = !{!"0x8\0029\00", !21, !14} ; [ DW_TAG_imported_declaration ]
+!44 = !{!"0x8\0030\00", !21, !31} ; [ DW_TAG_imported_declaration ]
+!45 = !{!"0x8\0031\00", !21, !46} ; [ DW_TAG_imported_declaration ]
+!46 = !{!"0x16\00baz\007\000\000\000\000", !5, !6, !"_ZTSN1A1B3barE"} ; [ DW_TAG_typedef ] [baz] [line 7, size 0, align 0, offset 0] [from _ZTSN1A1B3barE]
+!47 = !{!"0x8\0032\00X", !21, !7} ; [ DW_TAG_imported_declaration ]
+!48 = !{!"0x8\0033\00Y", !21, !47} ; [ DW_TAG_imported_declaration ]
+!49 = !{!"0x8\0034\00", !21, !50} ; [ DW_TAG_imported_declaration ]
+!50 = !{!"0x34\00var_decl\00var_decl\00_ZN1A1B8var_declE\008\000\000", !6, !18, !13, null, null} ; [ DW_TAG_variable ] [var_decl] [line 8]
+!51 = !{!"0x8\0035\00", !21, !52} ; [ DW_TAG_imported_declaration ]
+!52 = !{!"0x2e\00func_decl\00func_decl\00_ZN1A1B9func_declEv\009\000\000\000\000\00256\000\000", !5, !6, !19, null, null, null, null, !53} ; [ DW_TAG_subprogram ] [line 9] [scope 0] [func_decl]
+!53 = !{!"0x24"}
+!54 = !{!"0x8\0036\00", !21, !32} ; [ DW_TAG_imported_declaration ]
+!55 = !{!"0x8\0037\00", !21, !26} ; [ DW_TAG_imported_declaration ]
+!56 = !{!"0x8\0042\00", !7, !31} ; [ DW_TAG_imported_declaration ]
+!57 = !{i32 2, !"Dwarf Version", i32 2}
+!58 = !{i32 2, !"Debug Info Version", i32 2}
+!59 = !{!"clang version 3.6.0 "}
+!60 = !MDLocation(line: 3, column: 12, scope: !10)
+!61 = !{!"0x101\00\0016777220\000", !14, !18, !13} ; [ DW_TAG_arg_variable ] [line 4]
+!62 = !{!"0x102"}               ; [ DW_TAG_expression ]
+!63 = !MDLocation(line: 4, column: 12, scope: !14)
+!64 = !MDLocation(line: 4, column: 16, scope: !14)
+!65 = !MDLocation(line: 20, column: 12, scope: !17)
+!66 = !{!"0x101\00b\0016777237\000", !21, !18, !24} ; [ DW_TAG_arg_variable ] [b] [line 21]
+!67 = !MDLocation(line: 21, column: 15, scope: !21)
+!68 = !MDLocation(line: 22, column: 7, scope: !21)
+!69 = !MDLocation(line: 24, column: 5, scope: !38)
+!70 = !MDLocation(line: 38, column: 3, scope: !21)
+!71 = !MDLocation(line: 39, column: 1, scope: !21)
+!72 = !MDLocation(line: 44, column: 15, scope: !25)
+!73 = !MDLocation(line: 47, column: 21, scope: !26)
+!74 = !MDLocation(line: 0, scope: !75)
+!75 = !{!"0xb\000", !5, !27} ; [ DW_TAG_lexical_block ] [/tmp/foo.cpp]
diff --git a/test/DebugInfo/namespace_function_definition.ll b/test/DebugInfo/namespace_function_definition.ll
index 7a7e8b8..02c55bf 100644
--- a/test/DebugInfo/namespace_function_definition.ll
+++ b/test/DebugInfo/namespace_function_definition.ll
@@ -30,15 +30,15 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/namespace_function_definition.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"namespace_function_definition.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00func\00func\00_ZN2ns4funcEv\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, void ()* @_ZN2ns4funcEv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [func]
-!5 = metadata !{metadata !"0x39\00ns\001", metadata !1, null} ; [ DW_TAG_namespace ] [ns] [line 1]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.5.0 "}
-!11 = metadata !{i32 3, i32 0, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/namespace_function_definition.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"namespace_function_definition.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00func\00func\00_ZN2ns4funcEv\002\000\001\000\006\00256\000\002", !1, !5, !6, null, void ()* @_ZN2ns4funcEv, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [func]
+!5 = !{!"0x39\00ns\001", !1, null} ; [ DW_TAG_namespace ] [ns] [line 1]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.5.0 "}
+!11 = !MDLocation(line: 3, scope: !4)
diff --git a/test/DebugInfo/namespace_inline_function_definition.ll b/test/DebugInfo/namespace_inline_function_definition.ll
index 943a836..b6f1b5f 100644
--- a/test/DebugInfo/namespace_inline_function_definition.ll
+++ b/test/DebugInfo/namespace_inline_function_definition.ll
@@ -42,7 +42,7 @@ entry:
   store i32 0, i32* %retval
   %0 = load i32* @x, align 4, !dbg !16
   store i32 %0, i32* %i.addr.i, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr.i}, metadata !17, metadata !{metadata !"0x102"}), !dbg !18
+  call void @llvm.dbg.declare(metadata i32* %i.addr.i, metadata !17, metadata !{!"0x102"}), !dbg !18
   %1 = load i32* %i.addr.i, align 4, !dbg !18
   %mul.i = mul nsw i32 %1, 2, !dbg !18
   ret i32 %mul.i, !dbg !16
@@ -53,7 +53,7 @@ define i32 @_ZN2ns4funcEi(i32 %i) #1 {
 entry:
   %i.addr = alloca i32, align 4
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !17, metadata !{metadata !"0x102"}), !dbg !19
+  call void @llvm.dbg.declare(metadata i32* %i.addr, metadata !17, metadata !{!"0x102"}), !dbg !19
   %0 = load i32* %i.addr, align 4, !dbg !19
   %mul = mul nsw i32 %0, 2, !dbg !19
   ret i32 %mul, !dbg !19
@@ -70,23 +70,23 @@ attributes #2 = { nounwind readnone }
 !llvm.module.flags = !{!13, !14}
 !llvm.ident = !{!15}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/namespace_inline_function_definition.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"namespace_inline_function_definition.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !9}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/namespace_inline_function_definition.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x2e\00func\00func\00_ZN2ns4funcEi\006\000\001\000\006\00256\000\006", metadata !1, metadata !10, metadata !11, null, i32 (i32)* @_ZN2ns4funcEi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
-!10 = metadata !{metadata !"0x39\00ns\001", metadata !1, null} ; [ DW_TAG_namespace ] [ns] [line 1]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !8, metadata !8}
-!13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!14 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!15 = metadata !{metadata !"clang version 3.5.0 "}
-!16 = metadata !{i32 5, i32 0, metadata !4, null}
-!17 = metadata !{metadata !"0x101\00i\0016777222\000", metadata !9, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [i] [line 6]
-!18 = metadata !{i32 6, i32 0, metadata !9, metadata !16}
-!19 = metadata !{i32 6, i32 0, metadata !9, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/namespace_inline_function_definition.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"namespace_inline_function_definition.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !9}
+!4 = !{!"0x2e\00main\00main\00\005\000\001\000\006\00256\000\005", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/namespace_inline_function_definition.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x2e\00func\00func\00_ZN2ns4funcEi\006\000\001\000\006\00256\000\006", !1, !10, !11, null, i32 (i32)* @_ZN2ns4funcEi, null, null, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
+!10 = !{!"0x39\00ns\001", !1, null} ; [ DW_TAG_namespace ] [ns] [line 1]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!8, !8}
+!13 = !{i32 2, !"Dwarf Version", i32 4}
+!14 = !{i32 2, !"Debug Info Version", i32 2}
+!15 = !{!"clang version 3.5.0 "}
+!16 = !MDLocation(line: 5, scope: !4)
+!17 = !{!"0x101\00i\0016777222\000", !9, !5, !8} ; [ DW_TAG_arg_variable ] [i] [line 6]
+!18 = !MDLocation(line: 6, scope: !9, inlinedAt: !16)
+!19 = !MDLocation(line: 6, scope: !9)
diff --git a/test/DebugInfo/nodebug.ll b/test/DebugInfo/nodebug.ll
index acd3e82..83ce262 100644
--- a/test/DebugInfo/nodebug.ll
+++ b/test/DebugInfo/nodebug.ll
@@ -37,15 +37,15 @@ attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/nodebug.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"nodebug.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00f1\00f1\00_Z2f1v\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f1]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/nodebug.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.5.0 "}
-!11 = metadata !{i32 3, i32 0, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/nodebug.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"nodebug.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00f1\00f1\00_Z2f1v\002\000\001\000\006\00256\000\002", !1, !5, !6, null, null, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f1]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/nodebug.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.5.0 "}
+!11 = !MDLocation(line: 3, scope: !4)
diff --git a/test/DebugInfo/piece-verifier.ll b/test/DebugInfo/piece-verifier.ll
new file mode 100644
index 0000000..462bf27
--- /dev/null
+++ b/test/DebugInfo/piece-verifier.ll
@@ -0,0 +1,54 @@
+; RUN: not llvm-as -disable-output -verify-debug-info < %s 2>&1 | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo(i64 %s.coerce0, i32 %s.coerce1) #0 {
+entry:
+  call void @llvm.dbg.value(metadata i64 %s.coerce0, i64 0, metadata !20, metadata !24), !dbg !21
+  call void @llvm.dbg.value(metadata i32 %s.coerce1, i64 0, metadata !22, metadata !27), !dbg !21
+  ret i32 %s.coerce1, !dbg !23
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!17, !18}
+!llvm.ident = !{!19}
+
+!0 = !{!"0x11\0012\00clang version 3.5 \001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ]
+!1 = !{!"pieces.c", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\003\000\001\000\006\00256\001\003", !1, !5, !6, null, i32 (i64, i32)* @foo, null, null, !15} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/pieces.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !9}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x16\00S\001\000\000\000\000", !1, null, !10} ; [ DW_TAG_typedef ] [S] [line 1, size 0, align 0, offset 0] [from ]
+!10 = !{!"0x13\00\001\00128\0064\000\000\000", !1, null, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 128, align 64, offset 0] [def] [from ]
+!11 = !{!12, !14}
+!12 = !{!"0xd\00a\001\0064\0064\000\000", !1, !10, !13} ; [ DW_TAG_member ] [a] [line 1, size 64, align 64, offset 0] [from long int]
+!13 = !{!"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!14 = !{!"0xd\00b\001\0032\0032\0064\000", !1, !10, !8} ; [ DW_TAG_member ] [b] [line 1, size 32, align 32, offset 64] [from int]
+!15 = !{!16}
+!16 = !{!"0x101\00s\0016777219\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
+!17 = !{i32 2, !"Dwarf Version", i32 4}
+!18 = !{i32 1, !"Debug Info Version", i32 2}
+!19 = !{!"clang version 3.5 "}
+!20 = !{!"0x101\00s\0016777219\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
+!21 = !MDLocation(line: 3, scope: !4)
+!22 = !{!"0x101\00s\0016777219\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
+!23 = !MDLocation(line: 4, scope: !4)
+!24 = !{!"0x102\006\00147\000\008"} ; [ DW_TAG_expression ] [DW_OP_piece 0 8] [piece, size 8, offset 0]
+!25 = !{}
+; This expression has elements after DW_OP_piece.
+; CHECK: DIExpression does not Verify
+!27 = !{!"0x102\00147\008\004\006"} ; [ DW_TAG_expression ] [DW_OP_piece 8 4] [piece, size 4, offset 8]
diff --git a/test/DebugInfo/restrict.ll b/test/DebugInfo/restrict.ll
index 82d91a7..54bdec7 100644
--- a/test/DebugInfo/restrict.ll
+++ b/test/DebugInfo/restrict.ll
@@ -21,7 +21,7 @@ define void @_Z3fooPv(i8* noalias %dst) #0 {
 entry:
   %dst.addr = alloca i8*, align 8
   store i8* %dst, i8** %dst.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %dst.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata i8** %dst.addr, metadata !13, metadata !{!"0x102"}), !dbg !14
   ret void, !dbg !15
 }
 
@@ -35,19 +35,19 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/restrict.c] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"restrict.c", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooPv\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i8*)* @_Z3fooPv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/restrict.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8}
-!8 = metadata !{metadata !"0x37\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_restrict_type ] [line 0, size 0, align 0, offset 0] [from ]
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!12 = metadata !{metadata !"clang version 3.5.0 "}
-!13 = metadata !{metadata !"0x101\00dst\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [dst] [line 1]
-!14 = metadata !{i32 1, i32 0, metadata !4, null}
-!15 = metadata !{i32 2, i32 0, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/restrict.c] [DW_LANG_C_plus_plus]
+!1 = !{!"restrict.c", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00_Z3fooPv\001\000\001\000\006\00256\000\001", !1, !5, !6, null, void (i8*)* @_Z3fooPv, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/restrict.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8}
+!8 = !{!"0x37\00\000\000\000\000\000", null, null, !9} ; [ DW_TAG_restrict_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = !{!"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 1, !"Debug Info Version", i32 2}
+!12 = !{!"clang version 3.5.0 "}
+!13 = !{!"0x101\00dst\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [dst] [line 1]
+!14 = !MDLocation(line: 1, scope: !4)
+!15 = !MDLocation(line: 2, scope: !4)
diff --git a/test/DebugInfo/sugared-constants.ll b/test/DebugInfo/sugared-constants.ll
index 8f2a776..9e4f374 100644
--- a/test/DebugInfo/sugared-constants.ll
+++ b/test/DebugInfo/sugared-constants.ll
@@ -24,11 +24,11 @@
 ; Function Attrs: uwtable
 define i32 @main() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !20, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata i32 42, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !21
   tail call void @_Z4funci(i32 42), !dbg !22
-  tail call void @llvm.dbg.value(metadata !23, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !24
+  tail call void @llvm.dbg.value(metadata i32 117, i64 0, metadata !12, metadata !{!"0x102"}), !dbg !24
   tail call void @_Z4funcj(i32 117), !dbg !25
-  tail call void @llvm.dbg.value(metadata !26, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !27
+  tail call void @llvm.dbg.value(metadata i16 7, i64 0, metadata !15, metadata !{!"0x102"}), !dbg !27
   tail call void @_Z4funcDs(i16 zeroext 7), !dbg !28
   ret i32 0, !dbg !29
 }
@@ -50,33 +50,33 @@ attributes #2 = { nounwind readnone }
 !llvm.module.flags = !{!17, !18}
 !llvm.ident = !{!19}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/const.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"const.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\004\000\001\000\006\00256\001\004", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 4] [def] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/const.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10, metadata !12, metadata !15}
-!10 = metadata !{metadata !"0x100\00i\005\000", metadata !4, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [i] [line 5]
-!11 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
-!12 = metadata !{metadata !"0x100\00j\007\000", metadata !4, metadata !5, metadata !13} ; [ DW_TAG_auto_variable ] [j] [line 7]
-!13 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !14} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from unsigned int]
-!14 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
-!15 = metadata !{metadata !"0x100\00c\009\000", metadata !4, metadata !5, metadata !16} ; [ DW_TAG_auto_variable ] [c] [line 9]
-!16 = metadata !{metadata !"0x24\00char16_t\000\0016\0016\000\000\0016", null, null} ; [ DW_TAG_base_type ] [char16_t] [line 0, size 16, align 16, offset 0, enc DW_ATE_UTF]
-!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!19 = metadata !{metadata !"clang version 3.5.0 "}
-!20 = metadata !{i32 42}
-!21 = metadata !{i32 5, i32 0, metadata !4, null}
-!22 = metadata !{i32 6, i32 0, metadata !4, null}
-!23 = metadata !{i32 117}
-!24 = metadata !{i32 7, i32 0, metadata !4, null}
-!25 = metadata !{i32 8, i32 0, metadata !4, null}
-!26 = metadata !{i16 7}
-!27 = metadata !{i32 9, i32 0, metadata !4, null}
-!28 = metadata !{i32 10, i32 0, metadata !4, null}
-!29 = metadata !{i32 11, i32 0, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/const.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"const.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\004\000\001\000\006\00256\001\004", !1, !5, !6, null, i32 ()* @main, null, null, !9} ; [ DW_TAG_subprogram ] [line 4] [def] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/const.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10, !12, !15}
+!10 = !{!"0x100\00i\005\000", !4, !5, !11} ; [ DW_TAG_auto_variable ] [i] [line 5]
+!11 = !{!"0x26\00\000\000\000\000\000", null, null, !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
+!12 = !{!"0x100\00j\007\000", !4, !5, !13} ; [ DW_TAG_auto_variable ] [j] [line 7]
+!13 = !{!"0x26\00\000\000\000\000\000", null, null, !14} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from unsigned int]
+!14 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!15 = !{!"0x100\00c\009\000", !4, !5, !16} ; [ DW_TAG_auto_variable ] [c] [line 9]
+!16 = !{!"0x24\00char16_t\000\0016\0016\000\000\0016", null, null} ; [ DW_TAG_base_type ] [char16_t] [line 0, size 16, align 16, offset 0, enc DW_ATE_UTF]
+!17 = !{i32 2, !"Dwarf Version", i32 4}
+!18 = !{i32 1, !"Debug Info Version", i32 2}
+!19 = !{!"clang version 3.5.0 "}
+!20 = !{i32 42}
+!21 = !MDLocation(line: 5, scope: !4)
+!22 = !MDLocation(line: 6, scope: !4)
+!23 = !{i32 117}
+!24 = !MDLocation(line: 7, scope: !4)
+!25 = !MDLocation(line: 8, scope: !4)
+!26 = !{i16 7}
+!27 = !MDLocation(line: 9, scope: !4)
+!28 = !MDLocation(line: 10, scope: !4)
+!29 = !MDLocation(line: 11, scope: !4)
diff --git a/test/DebugInfo/template-recursive-void.ll b/test/DebugInfo/template-recursive-void.ll
index 155b3e8..b7e7244 100644
--- a/test/DebugInfo/template-recursive-void.ll
+++ b/test/DebugInfo/template-recursive-void.ll
@@ -25,41 +25,41 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!36, !37}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 187958) (llvm/trunk 187964)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/debug-info-template-recursive.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"debug-info-template-recursive.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x34\00filters\00filters\00\0010\000\001", null, metadata !5, metadata !6, %class.bar* @filters, null} ; [ DW_TAG_variable ] [filters] [line 10] [def]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/debug-info-template-recursive.cpp]
-!6 = metadata !{metadata !"0x2\00bar\009\008\008\000\000\000", metadata !1, null, null, metadata !7, null, null, null} ; [ DW_TAG_class_type ] [bar] [line 9, size 8, align 8, offset 0] [def] [from ]
-!7 = metadata !{metadata !8, metadata !31}
-!8 = metadata !{metadata !"0x1c\00\000\000\000\000\000", null, metadata !6, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from foo<void>]
-!9 = metadata !{metadata !"0x2\00foo<void>\005\008\008\000\000\000", metadata !1, null, null, metadata !10, null, metadata !29, null} ; [ DW_TAG_class_type ] [foo<void>] [line 5, size 8, align 8, offset 0] [def] [from ]
-!10 = metadata !{metadata !11, metadata !19, metadata !25}
-!11 = metadata !{metadata !"0x1c\00\000\000\000\000\000", null, metadata !9, metadata !12} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from base]
-!12 = metadata !{metadata !"0x2\00base\003\008\008\000\000\000", metadata !1, null, null, metadata !13, null, null, null} ; [ DW_TAG_class_type ] [base] [line 3, size 8, align 8, offset 0] [def] [from ]
-!13 = metadata !{metadata !14}
-!14 = metadata !{metadata !"0x2e\00base\00base\00\003\000\000\000\006\00320\000\003", metadata !1, metadata !12, metadata !15, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 3] [base]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null, metadata !17}
-!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from base]
-!18 = metadata !{i32 786468}
-!19 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN3fooIvEaSES0_\006\000\000\000\006\00257\000\006", metadata !1, metadata !9, metadata !20, null, null, null, i32 0, metadata !24} ; [ DW_TAG_subprogram ] [line 6] [private] [operator=]
-!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!21 = metadata !{null, metadata !22, metadata !23}
-!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo<void>]
-!23 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo<void>]
-!24 = metadata !{i32 786468}
-!25 = metadata !{metadata !"0x2e\00foo\00foo\00\005\000\000\000\006\00320\000\005", metadata !1, metadata !9, metadata !26, null, null, null, i32 0, metadata !28} ; [ DW_TAG_subprogram ] [line 5] [foo]
-!26 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!27 = metadata !{null, metadata !22}
-!28 = metadata !{i32 786468}
-!29 = metadata !{metadata !30}
-!30 = metadata !{metadata !"0x2f\00T\000\000", null, null, null} ; [ DW_TAG_template_type_parameter ]
-!31 = metadata !{metadata !"0x2e\00bar\00bar\00\009\000\000\000\006\00320\000\009", metadata !1, metadata !6, metadata !32, null, null, null, i32 0, metadata !35} ; [ DW_TAG_subprogram ] [line 9] [bar]
-!32 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!33 = metadata !{null, metadata !34}
-!34 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from bar]
-!35 = metadata !{i32 786468}
-!36 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!37 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 (trunk 187958) (llvm/trunk 187964)\000\00\000\00\000", !1, !2, !2, !2, !3, !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/debug-info-template-recursive.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"debug-info-template-recursive.cpp", !"/usr/local/google/home/echristo/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x34\00filters\00filters\00\0010\000\001", null, !5, !6, %class.bar* @filters, null} ; [ DW_TAG_variable ] [filters] [line 10] [def]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/debug-info-template-recursive.cpp]
+!6 = !{!"0x2\00bar\009\008\008\000\000\000", !1, null, null, !7, null, null, null} ; [ DW_TAG_class_type ] [bar] [line 9, size 8, align 8, offset 0] [def] [from ]
+!7 = !{!8, !31}
+!8 = !{!"0x1c\00\000\000\000\000\000", null, !6, !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from foo<void>]
+!9 = !{!"0x2\00foo<void>\005\008\008\000\000\000", !1, null, null, !10, null, !29, null} ; [ DW_TAG_class_type ] [foo<void>] [line 5, size 8, align 8, offset 0] [def] [from ]
+!10 = !{!11, !19, !25}
+!11 = !{!"0x1c\00\000\000\000\000\000", null, !9, !12} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from base]
+!12 = !{!"0x2\00base\003\008\008\000\000\000", !1, null, null, !13, null, null, null} ; [ DW_TAG_class_type ] [base] [line 3, size 8, align 8, offset 0] [def] [from ]
+!13 = !{!14}
+!14 = !{!"0x2e\00base\00base\00\003\000\000\000\006\00320\000\003", !1, !12, !15, null, null, null, i32 0, !18} ; [ DW_TAG_subprogram ] [line 3] [base]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !17}
+!17 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from base]
+!18 = !{i32 786468}
+!19 = !{!"0x2e\00operator=\00operator=\00_ZN3fooIvEaSES0_\006\000\000\000\006\00257\000\006", !1, !9, !20, null, null, null, i32 0, !24} ; [ DW_TAG_subprogram ] [line 6] [private] [operator=]
+!20 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = !{null, !22, !23}
+!22 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo<void>]
+!23 = !{!"0x26\00\000\000\000\000\000", null, null, !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo<void>]
+!24 = !{i32 786468}
+!25 = !{!"0x2e\00foo\00foo\00\005\000\000\000\006\00320\000\005", !1, !9, !26, null, null, null, i32 0, !28} ; [ DW_TAG_subprogram ] [line 5] [foo]
+!26 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = !{null, !22}
+!28 = !{i32 786468}
+!29 = !{!30}
+!30 = !{!"0x2f\00T\000\000", null, null, null} ; [ DW_TAG_template_type_parameter ]
+!31 = !{!"0x2e\00bar\00bar\00\009\000\000\000\006\00320\000\009", !1, !6, !32, null, null, null, i32 0, !35} ; [ DW_TAG_subprogram ] [line 9] [bar]
+!32 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!33 = !{null, !34}
+!34 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from bar]
+!35 = !{i32 786468}
+!36 = !{i32 2, !"Dwarf Version", i32 3}
+!37 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/tu-composite.ll b/test/DebugInfo/tu-composite.ll
index 036c683..6f052ee 100644
--- a/test/DebugInfo/tu-composite.ll
+++ b/test/DebugInfo/tu-composite.ll
@@ -91,7 +91,7 @@ define void @_ZN1C3fooEv(%struct.C* %this) unnamed_addr #0 align 2 {
 entry:
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !36, metadata !{metadata !"0x102"}), !dbg !38
+  call void @llvm.dbg.declare(metadata %struct.C** %this.addr, metadata !36, metadata !{!"0x102"}), !dbg !38
   %this1 = load %struct.C** %this.addr
   ret void, !dbg !39
 }
@@ -108,12 +108,12 @@ entry:
   %e = alloca %"struct.D::Nested", align 1
   %p = alloca %"struct.D::Nested2"*, align 8
   %t = alloca %"struct.D::virt", align 8
-  call void @llvm.dbg.declare(metadata !{%struct.bar* %B}, metadata !40, metadata !{metadata !"0x102"}), !dbg !42
-  call void @llvm.dbg.declare(metadata !{[3 x %struct.bar]* %A}, metadata !43, metadata !{metadata !"0x102"}), !dbg !47
-  call void @llvm.dbg.declare(metadata !{%struct.bar* %B2}, metadata !48, metadata !{metadata !"0x102"}), !dbg !50
-  call void @llvm.dbg.declare(metadata !{%"struct.D::Nested"* %e}, metadata !51, metadata !{metadata !"0x102"}), !dbg !52
-  call void @llvm.dbg.declare(metadata !{%"struct.D::Nested2"** %p}, metadata !53, metadata !{metadata !"0x102"}), !dbg !55
-  call void @llvm.dbg.declare(metadata !{%"struct.D::virt"* %t}, metadata !56, metadata !{metadata !"0x102"}), !dbg !57
+  call void @llvm.dbg.declare(metadata %struct.bar* %B, metadata !40, metadata !{!"0x102"}), !dbg !42
+  call void @llvm.dbg.declare(metadata [3 x %struct.bar]* %A, metadata !43, metadata !{!"0x102"}), !dbg !47
+  call void @llvm.dbg.declare(metadata %struct.bar* %B2, metadata !48, metadata !{!"0x102"}), !dbg !50
+  call void @llvm.dbg.declare(metadata %"struct.D::Nested"* %e, metadata !51, metadata !{!"0x102"}), !dbg !52
+  call void @llvm.dbg.declare(metadata %"struct.D::Nested2"** %p, metadata !53, metadata !{!"0x102"}), !dbg !55
+  call void @llvm.dbg.declare(metadata %"struct.D::virt"* %t, metadata !56, metadata !{!"0x102"}), !dbg !57
   ret void, !dbg !58
 }
 
@@ -123,63 +123,63 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!35, !59}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !30, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [tmp.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"tmp.cpp", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !18, metadata !19, metadata !22, metadata !23, metadata !24}
-!4 = metadata !{metadata !"0x13\00C\001\0064\0064\000\000\000", metadata !1, null, null, metadata !5, metadata !"_ZTS1C", null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 64, align 64, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !13}
-!6 = metadata !{metadata !"0xd\00_vptr$C\000\0064\000\000\0064", metadata !1, metadata !7, metadata !8} ; [ DW_TAG_member ] [_vptr$C] [line 0, size 64, align 0, offset 0] [artificial] [from ]
-!7 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [tmp.cpp]
-!8 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
-!9 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN1C3fooEv\002\000\000\001\006\00256\000\002", metadata !1, metadata !"_ZTS1C", metadata !14, metadata !"_ZTS1C", null, null, i32 0, metadata !17} ; [ DW_TAG_subprogram ] [line 2] [foo]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{null, metadata !16}
-!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
-!17 = metadata !{i32 786468}
-!18 = metadata !{metadata !"0x13\00bar\007\008\008\000\000\000", metadata !1, null, null, metadata !2, null, null, metadata !"_ZTS3bar"} ; [ DW_TAG_structure_type ] [bar] [line 7, size 8, align 8, offset 0] [def] [from ]
-!19 = metadata !{metadata !"0x13\00D\009\008\008\000\000\000", metadata !1, null, null, metadata !20, null, null, metadata !"_ZTS1D"} ; [ DW_TAG_structure_type ] [D] [line 9, size 8, align 8, offset 0] [def] [from ]
-!20 = metadata !{metadata !21}
-!21 = metadata !{metadata !"0xd\00a\0011\000\000\000\004096", metadata !1, metadata !"_ZTS1D", metadata !12, null} ; [ DW_TAG_member ] [a] [line 11, size 0, align 0, offset 0] [static] [from int]
-!22 = metadata !{metadata !"0x13\00Nested\0012\008\008\000\000\000", metadata !1, metadata !"_ZTS1D", null, metadata !2, null, null, metadata !"_ZTSN1D6NestedE"} ; [ DW_TAG_structure_type ] [Nested] [line 12, size 8, align 8, offset 0] [def] [from ]
-!23 = metadata !{metadata !"0x13\00Nested2\0013\000\000\000\004\000", metadata !1, metadata !"_ZTS1D", null, null, null, null, metadata !"_ZTSN1D7Nested2E"} ; [ DW_TAG_structure_type ] [Nested2] [line 13, size 0, align 0, offset 0] [decl] [from ]
-!24 = metadata !{metadata !"0x13\00virt<bar>\0015\0064\0064\000\000\000", metadata !1, metadata !"_ZTS1D", null, metadata !25, null, metadata !28, metadata !"_ZTSN1D4virtI3barEE"} ; [ DW_TAG_structure_type ] [virt<bar>] [line 15, size 64, align 64, offset 0] [def] [from ]
-!25 = metadata !{metadata !26}
-!26 = metadata !{metadata !"0xd\00values\0016\0064\0064\000\000", metadata !1, metadata !"_ZTSN1D4virtI3barEE", metadata !27} ; [ DW_TAG_member ] [values] [line 16, size 64, align 64, offset 0] [from ]
-!27 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS3bar"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3bar]
-!28 = metadata !{metadata !29}
-!29 = metadata !{metadata !"0x2f\00T\000\000", null, metadata !"_ZTS3bar", null} ; [ DW_TAG_template_type_parameter ]
-!30 = metadata !{metadata !31, metadata !32}
-!31 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN1C3fooEv\004\000\001\000\006\00256\000\004", metadata !1, null, metadata !14, null, void (%struct.C*)* @_ZN1C3fooEv, null, metadata !13, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [foo]
-!32 = metadata !{metadata !"0x2e\00test\00test\00_Z4testv\0020\000\001\000\006\00256\000\0020", metadata !1, metadata !7, metadata !33, null, void ()* @_Z4testv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 20] [def] [test]
-!33 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !34, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!34 = metadata !{null}
-!35 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!36 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !31, null, metadata !37} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!37 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
-!38 = metadata !{i32 0, i32 0, metadata !31, null}
-!39 = metadata !{i32 5, i32 0, metadata !31, null}
-!40 = metadata !{metadata !"0x100\00B\0021\000", metadata !32, metadata !7, metadata !41} ; [ DW_TAG_auto_variable ] [B] [line 21]
-!41 = metadata !{metadata !"0x16\00baz\008\000\000\000\000", metadata !1, null, metadata !"_ZTS3bar"} ; [ DW_TAG_typedef ] [baz] [line 8, size 0, align 0, offset 0] [from _ZTS3bar]
-!42 = metadata !{i32 21, i32 0, metadata !32, null}
-!43 = metadata !{metadata !"0x100\00A\0022\000", metadata !32, metadata !7, metadata !44} ; [ DW_TAG_auto_variable ] [A] [line 22]
-!44 = metadata !{metadata !"0x1\00\000\0024\008\000\000", null, null, metadata !"_ZTS3bar", metadata !45, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 24, align 8, offset 0] [from _ZTS3bar]
-!45 = metadata !{metadata !46}
-!46 = metadata !{metadata !"0x21\000\003"}        ; [ DW_TAG_subrange_type ] [0, 2]
-!47 = metadata !{i32 22, i32 0, metadata !32, null}
-!48 = metadata !{metadata !"0x100\00B2\0023\000", metadata !32, metadata !7, metadata !49} ; [ DW_TAG_auto_variable ] [B2] [line 23]
-!49 = metadata !{metadata !"0x16\00baz2\0010\000\000\000\000", metadata !1, metadata !"_ZTS1D", metadata !"_ZTS3bar"} ; [ DW_TAG_typedef ] [baz2] [line 10, size 0, align 0, offset 0] [from _ZTS3bar]
-!50 = metadata !{i32 23, i32 0, metadata !32, null}
-!51 = metadata !{metadata !"0x100\00e\0024\000", metadata !32, metadata !7, metadata !22} ; [ DW_TAG_auto_variable ] [e] [line 24]
-!52 = metadata !{i32 24, i32 0, metadata !32, null}
-!53 = metadata !{metadata !"0x100\00p\0025\000", metadata !32, metadata !7, metadata !54} ; [ DW_TAG_auto_variable ] [p] [line 25]
-!54 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTSN1D7Nested2E"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTSN1D7Nested2E]
-!55 = metadata !{i32 25, i32 0, metadata !32, null}
-!56 = metadata !{metadata !"0x100\00t\0026\000", metadata !32, metadata !7, metadata !24} ; [ DW_TAG_auto_variable ] [t] [line 26]
-!57 = metadata !{i32 26, i32 0, metadata !32, null}
-!58 = metadata !{i32 27, i32 0, metadata !32, null}
-!59 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4\000\00\000\00\000", !1, !2, !3, !30, !2, !2} ; [ DW_TAG_compile_unit ] [tmp.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"tmp.cpp", !"."}
+!2 = !{}
+!3 = !{!4, !18, !19, !22, !23, !24}
+!4 = !{!"0x13\00C\001\0064\0064\000\000\000", !1, null, null, !5, !"_ZTS1C", null, !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 64, align 64, offset 0] [def] [from ]
+!5 = !{!6, !13}
+!6 = !{!"0xd\00_vptr$C\000\0064\000\000\0064", !1, !7, !8} ; [ DW_TAG_member ] [_vptr$C] [line 0, size 64, align 0, offset 0] [artificial] [from ]
+!7 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [tmp.cpp]
+!8 = !{!"0xf\00\000\0064\000\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
+!9 = !{!"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, !10} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
+!10 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{!12}
+!12 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = !{!"0x2e\00foo\00foo\00_ZN1C3fooEv\002\000\000\001\006\00256\000\002", !1, !"_ZTS1C", !14, !"_ZTS1C", null, null, i32 0, !17} ; [ DW_TAG_subprogram ] [line 2] [foo]
+!14 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{null, !16}
+!16 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!17 = !{i32 786468}
+!18 = !{!"0x13\00bar\007\008\008\000\000\000", !1, null, null, !2, null, null, !"_ZTS3bar"} ; [ DW_TAG_structure_type ] [bar] [line 7, size 8, align 8, offset 0] [def] [from ]
+!19 = !{!"0x13\00D\009\008\008\000\000\000", !1, null, null, !20, null, null, !"_ZTS1D"} ; [ DW_TAG_structure_type ] [D] [line 9, size 8, align 8, offset 0] [def] [from ]
+!20 = !{!21}
+!21 = !{!"0xd\00a\0011\000\000\000\004096", !1, !"_ZTS1D", !12, null} ; [ DW_TAG_member ] [a] [line 11, size 0, align 0, offset 0] [static] [from int]
+!22 = !{!"0x13\00Nested\0012\008\008\000\000\000", !1, !"_ZTS1D", null, !2, null, null, !"_ZTSN1D6NestedE"} ; [ DW_TAG_structure_type ] [Nested] [line 12, size 8, align 8, offset 0] [def] [from ]
+!23 = !{!"0x13\00Nested2\0013\000\000\000\004\000", !1, !"_ZTS1D", null, null, null, null, !"_ZTSN1D7Nested2E"} ; [ DW_TAG_structure_type ] [Nested2] [line 13, size 0, align 0, offset 0] [decl] [from ]
+!24 = !{!"0x13\00virt<bar>\0015\0064\0064\000\000\000", !1, !"_ZTS1D", null, !25, null, !28, !"_ZTSN1D4virtI3barEE"} ; [ DW_TAG_structure_type ] [virt<bar>] [line 15, size 64, align 64, offset 0] [def] [from ]
+!25 = !{!26}
+!26 = !{!"0xd\00values\0016\0064\0064\000\000", !1, !"_ZTSN1D4virtI3barEE", !27} ; [ DW_TAG_member ] [values] [line 16, size 64, align 64, offset 0] [from ]
+!27 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS3bar"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3bar]
+!28 = !{!29}
+!29 = !{!"0x2f\00T\000\000", null, !"_ZTS3bar", null} ; [ DW_TAG_template_type_parameter ]
+!30 = !{!31, !32}
+!31 = !{!"0x2e\00foo\00foo\00_ZN1C3fooEv\004\000\001\000\006\00256\000\004", !1, null, !14, null, void (%struct.C*)* @_ZN1C3fooEv, null, !13, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [foo]
+!32 = !{!"0x2e\00test\00test\00_Z4testv\0020\000\001\000\006\00256\000\0020", !1, !7, !33, null, void ()* @_Z4testv, null, null, !2} ; [ DW_TAG_subprogram ] [line 20] [def] [test]
+!33 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !34, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!34 = !{null}
+!35 = !{i32 2, !"Dwarf Version", i32 2}
+!36 = !{!"0x101\00this\0016777216\001088", !31, null, !37} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!37 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!38 = !MDLocation(line: 0, scope: !31)
+!39 = !MDLocation(line: 5, scope: !31)
+!40 = !{!"0x100\00B\0021\000", !32, !7, !41} ; [ DW_TAG_auto_variable ] [B] [line 21]
+!41 = !{!"0x16\00baz\008\000\000\000\000", !1, null, !"_ZTS3bar"} ; [ DW_TAG_typedef ] [baz] [line 8, size 0, align 0, offset 0] [from _ZTS3bar]
+!42 = !MDLocation(line: 21, scope: !32)
+!43 = !{!"0x100\00A\0022\000", !32, !7, !44} ; [ DW_TAG_auto_variable ] [A] [line 22]
+!44 = !{!"0x1\00\000\0024\008\000\000", null, null, !"_ZTS3bar", !45, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 24, align 8, offset 0] [from _ZTS3bar]
+!45 = !{!46}
+!46 = !{!"0x21\000\003"}        ; [ DW_TAG_subrange_type ] [0, 2]
+!47 = !MDLocation(line: 22, scope: !32)
+!48 = !{!"0x100\00B2\0023\000", !32, !7, !49} ; [ DW_TAG_auto_variable ] [B2] [line 23]
+!49 = !{!"0x16\00baz2\0010\000\000\000\000", !1, !"_ZTS1D", !"_ZTS3bar"} ; [ DW_TAG_typedef ] [baz2] [line 10, size 0, align 0, offset 0] [from _ZTS3bar]
+!50 = !MDLocation(line: 23, scope: !32)
+!51 = !{!"0x100\00e\0024\000", !32, !7, !22} ; [ DW_TAG_auto_variable ] [e] [line 24]
+!52 = !MDLocation(line: 24, scope: !32)
+!53 = !{!"0x100\00p\0025\000", !32, !7, !54} ; [ DW_TAG_auto_variable ] [p] [line 25]
+!54 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTSN1D7Nested2E"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTSN1D7Nested2E]
+!55 = !MDLocation(line: 25, scope: !32)
+!56 = !{!"0x100\00t\0026\000", !32, !7, !24} ; [ DW_TAG_auto_variable ] [t] [line 26]
+!57 = !MDLocation(line: 26, scope: !32)
+!58 = !MDLocation(line: 27, scope: !32)
+!59 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/tu-member-pointer.ll b/test/DebugInfo/tu-member-pointer.ll
index 7f25f5a..0e4b15a 100644
--- a/test/DebugInfo/tu-member-pointer.ll
+++ b/test/DebugInfo/tu-member-pointer.ll
@@ -16,15 +16,15 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!10, !11}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !2, metadata !5, metadata !2} ; [ DW_TAG_compile_unit ] [foo.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"foo.cpp", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00Foo\001\000\000\000\004\000", metadata !1, null, null, null, null, null, metadata !"_ZTS3Foo"} ; [ DW_TAG_structure_type ] [Foo] [line 1, size 0, align 0, offset 0] [decl] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x34\00x\00x\00\004\000\001", null, metadata !7, metadata !8, i64* @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
-!7 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [foo.cpp]
-!8 = metadata !{metadata !"0x1f\00\000\000\000\000\000", null, null, metadata !9, metadata !"_ZTS3Foo"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int]
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4\000\00\000\00\000", !1, !2, !3, !2, !5, !2} ; [ DW_TAG_compile_unit ] [foo.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"foo.cpp", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00Foo\001\000\000\000\004\000", !1, null, null, null, null, null, !"_ZTS3Foo"} ; [ DW_TAG_structure_type ] [Foo] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!5 = !{!6}
+!6 = !{!"0x34\00x\00x\00\004\000\001", null, !7, !8, i64* @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
+!7 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [foo.cpp]
+!8 = !{!"0x1f\00\000\000\000\000\000", null, null, !9, !"_ZTS3Foo"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int]
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{i32 2, !"Dwarf Version", i32 2}
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/two-cus-from-same-file.ll b/test/DebugInfo/two-cus-from-same-file.ll
index d893319..b810a91 100644
--- a/test/DebugInfo/two-cus-from-same-file.ll
+++ b/test/DebugInfo/two-cus-from-same-file.ll
@@ -23,8 +23,8 @@ declare i32 @puts(i8* nocapture) nounwind
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !26
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !27
+  tail call void @llvm.dbg.value(metadata i32 %argc, i64 0, metadata !21, metadata !{!"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata i8** %argv, i64 0, metadata !22, metadata !{!"0x102"}), !dbg !27
   %puts = tail call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @str1, i32 0, i32 0)), !dbg !28
   tail call void @foo() nounwind, !dbg !30
   ret i32 0, !dbg !31
@@ -35,39 +35,39 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0, !9}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.2 (trunk 156513)\001\00\000\00\001", metadata !32, metadata !1, metadata !1, metadata !3, metadata !1, metadata !1} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00\005\000\001\000\006\00256\001\005", metadata !32, metadata !6, metadata !7, null, void ()* @foo, null, null, metadata !1} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !32} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!9 = metadata !{metadata !"0x11\0012\00clang version 3.2 (trunk 156513)\001\00\000\00\001", metadata !32, metadata !1, metadata !1, metadata !10, metadata !1, metadata !1} ; [ DW_TAG_compile_unit ]
-!10 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0x2e\00main\00main\00\0011\000\001\000\006\00256\001\0011", metadata !32, metadata !6, metadata !13, null, i32 (i32, i8**)* @main, null, null, metadata !19} ; [ DW_TAG_subprogram ]
-!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{metadata !15, metadata !15, metadata !16}
-!15 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!16 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !17} ; [ DW_TAG_pointer_type ]
-!17 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !18} ; [ DW_TAG_pointer_type ]
-!18 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
-!19 = metadata !{metadata !21, metadata !22}
-!21 = metadata !{metadata !"0x101\00argc\0016777227\000", metadata !12, metadata !6, metadata !15} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{metadata !"0x101\00argv\0033554443\000", metadata !12, metadata !6, metadata !16} ; [ DW_TAG_arg_variable ]
-!23 = metadata !{i32 6, i32 3, metadata !24, null}
-!24 = metadata !{metadata !"0xb\005\0016\000", metadata !32, metadata !5} ; [ DW_TAG_lexical_block ]
-!25 = metadata !{i32 7, i32 1, metadata !24, null}
-!26 = metadata !{i32 11, i32 14, metadata !12, null}
-!27 = metadata !{i32 11, i32 26, metadata !12, null}
-!28 = metadata !{i32 12, i32 3, metadata !29, null}
-!29 = metadata !{metadata !"0xb\0011\0034\000", metadata !32, metadata !12} ; [ DW_TAG_lexical_block ]
-!30 = metadata !{i32 13, i32 3, metadata !29, null}
-!31 = metadata !{i32 14, i32 3, metadata !29, null}
-!32 = metadata !{metadata !"foo.c", metadata !"/tmp"}
+!0 = !{!"0x11\0012\00clang version 3.2 (trunk 156513)\001\00\000\00\001", !32, !1, !1, !3, !1, !1} ; [ DW_TAG_compile_unit ]
+!1 = !{}
+!3 = !{!5}
+!5 = !{!"0x2e\00foo\00foo\00\005\000\001\000\006\00256\001\005", !32, !6, !7, null, void ()* @foo, null, null, !1} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !32} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{!"0x11\0012\00clang version 3.2 (trunk 156513)\001\00\000\00\001", !32, !1, !1, !10, !1, !1} ; [ DW_TAG_compile_unit ]
+!10 = !{!12}
+!12 = !{!"0x2e\00main\00main\00\0011\000\001\000\006\00256\001\0011", !32, !6, !13, null, i32 (i32, i8**)* @main, null, null, !19} ; [ DW_TAG_subprogram ]
+!13 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = !{!15, !15, !16}
+!15 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!16 = !{!"0xf\00\000\0032\0032\000\000", null, null, !17} ; [ DW_TAG_pointer_type ]
+!17 = !{!"0xf\00\000\0032\0032\000\000", null, null, !18} ; [ DW_TAG_pointer_type ]
+!18 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
+!19 = !{!21, !22}
+!21 = !{!"0x101\00argc\0016777227\000", !12, !6, !15} ; [ DW_TAG_arg_variable ]
+!22 = !{!"0x101\00argv\0033554443\000", !12, !6, !16} ; [ DW_TAG_arg_variable ]
+!23 = !MDLocation(line: 6, column: 3, scope: !24)
+!24 = !{!"0xb\005\0016\000", !32, !5} ; [ DW_TAG_lexical_block ]
+!25 = !MDLocation(line: 7, column: 1, scope: !24)
+!26 = !MDLocation(line: 11, column: 14, scope: !12)
+!27 = !MDLocation(line: 11, column: 26, scope: !12)
+!28 = !MDLocation(line: 12, column: 3, scope: !29)
+!29 = !{!"0xb\0011\0034\000", !32, !12} ; [ DW_TAG_lexical_block ]
+!30 = !MDLocation(line: 13, column: 3, scope: !29)
+!31 = !MDLocation(line: 14, column: 3, scope: !29)
+!32 = !{!"foo.c", !"/tmp"}
 
 ; This test is simple to be cross platform (many targets don't yet have
 ; sufficiently good DWARF emission and/or dumping)
 ; CHECK: {{DW_TAG_compile_unit}}
 ; CHECK: {{foo\.c}}
 
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!33 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/DebugInfo/typedef.ll b/test/DebugInfo/typedef.ll
index 941f5da..b8528be 100644
--- a/test/DebugInfo/typedef.ll
+++ b/test/DebugInfo/typedef.ll
@@ -18,15 +18,15 @@
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/typedef.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"typedef.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x34\00y\00y\00\002\000\001", null, metadata !5, metadata !6, i8** @y, null} ; [ DW_TAG_variable ] [y] [line 2] [def]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/typedef.cpp]
-!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from x]
-!7 = metadata !{metadata !"0x16\00x\001\000\000\000\000", metadata !1, null, null} ; [ DW_TAG_typedef ] [x] [line 1, size 0, align 0, offset 0] [from ]
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.5.0 "}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !2, !3, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/typedef.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"typedef.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x34\00y\00y\00\002\000\001", null, !5, !6, i8** @y, null} ; [ DW_TAG_variable ] [y] [line 2] [def]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/typedef.cpp]
+!6 = !{!"0xf\00\000\0064\0064\000\000", null, null, !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from x]
+!7 = !{!"0x16\00x\001\000\000\000\000", !1, null, null} ; [ DW_TAG_typedef ] [x] [line 1, size 0, align 0, offset 0] [from ]
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.5.0 "}
 
diff --git a/test/DebugInfo/unconditional-branch.ll b/test/DebugInfo/unconditional-branch.ll
index 95f5f9e..c82f1ba 100644
--- a/test/DebugInfo/unconditional-branch.ll
+++ b/test/DebugInfo/unconditional-branch.ll
@@ -22,16 +22,17 @@ define void @foo(i32 %i) #0 {
 entry:
   %i.addr = alloca i32, align 4
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !12, metadata !{metadata !"0x102"}), !dbg !13
+  call void @llvm.dbg.declare(metadata i32* %i.addr, metadata !12, metadata !{!"0x102"}), !dbg !13
   %0 = load i32* %i.addr, align 4, !dbg !14
   switch i32 %0, label %sw.default [
   ], !dbg !14
 
+sw.epilog:                                        ; preds = %sw.default
+  ret void, !dbg !17
+
 sw.default:                                       ; preds = %entry
   br label %sw.epilog, !dbg !15
 
-sw.epilog:                                        ; preds = %sw.default
-  ret void, !dbg !17
 }
 
 ; Function Attrs: nounwind readnone
@@ -44,21 +45,21 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (204712)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\work\EPRs\396363/test.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test.c", metadata !"D:\5Cwork\5CEPRs\5C396363"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [D:\work\EPRs\396363/test.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.5.0 (204712)"}
-!12 = metadata !{metadata !"0x101\00i\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [i] [line 1]
-!13 = metadata !{i32 1, i32 0, metadata !4, null}
-!14 = metadata !{i32 2, i32 0, metadata !4, null}
-!15 = metadata !{i32 4, i32 0, metadata !16, null}
-!16 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [D:\work\EPRs\396363/test.c]
-!17 = metadata !{i32 6, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 (204712)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [D:\work\EPRs\396363/test.c] [DW_LANG_C99]
+!1 = !{!"test.c", !"D:\5Cwork\5CEPRs\5C396363"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, void (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [D:\work\EPRs\396363/test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.5.0 (204712)"}
+!12 = !{!"0x101\00i\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [i] [line 1]
+!13 = !MDLocation(line: 1, scope: !4)
+!14 = !MDLocation(line: 2, scope: !4)
+!15 = !MDLocation(line: 4, scope: !16)
+!16 = !{!"0xb\002\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [D:\work\EPRs\396363/test.c]
+!17 = !MDLocation(line: 6, scope: !4)
diff --git a/test/DebugInfo/varargs.ll b/test/DebugInfo/varargs.ll
index 1fe598a..907d2e3 100644
--- a/test/DebugInfo/varargs.ll
+++ b/test/DebugInfo/varargs.ll
@@ -55,9 +55,9 @@ define void @_Z1biz(i32 %c, ...) #0 {
   %a = alloca %struct.A, align 1
   %fptr = alloca void (i32, ...)*, align 8
   store i32 %c, i32* %1, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !21, metadata !{metadata !"0x102"}), !dbg !22
-  call void @llvm.dbg.declare(metadata !{%struct.A* %a}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
-  call void @llvm.dbg.declare(metadata !{void (i32, ...)** %fptr}, metadata !25, metadata !{metadata !"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata i32* %1, metadata !21, metadata !{!"0x102"}), !dbg !22
+  call void @llvm.dbg.declare(metadata %struct.A* %a, metadata !23, metadata !{!"0x102"}), !dbg !24
+  call void @llvm.dbg.declare(metadata void (i32, ...)** %fptr, metadata !25, metadata !{!"0x102"}), !dbg !27
   store void (i32, ...)* @_Z1biz, void (i32, ...)** %fptr, align 8, !dbg !27
   ret void, !dbg !28
 }
@@ -72,31 +72,31 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!18, !19}
 !llvm.ident = !{!20}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !13, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenCXX/debug-info-varargs.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"llvm/tools/clang/test/CodeGenCXX/debug-info-varargs.cpp", metadata !"radar/13690847"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00A\003\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x2e\00a\00a\00_ZN1A1aEiz\006\000\000\000\006\00256\000\006", metadata !1, metadata !"_ZTS1A", metadata !7, null, null, null, i32 0, metadata !12} ; [ DW_TAG_subprogram ] [line 6] [a]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9, metadata !10, null}
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{i32 786468}
-!13 = metadata !{metadata !14}
-!14 = metadata !{metadata !"0x2e\00b\00b\00_Z1biz\0013\000\001\000\006\00256\000\0013", metadata !1, metadata !15, metadata !16, null, void (i32, ...)* @_Z1biz, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 13] [def] [b]
-!15 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenCXX/debug-info-varargs.cpp]
-!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{null, metadata !10, null}
-!18 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!20 = metadata !{metadata !"clang version 3.5 "}
-!21 = metadata !{metadata !"0x101\00c\0016777229\000", metadata !14, metadata !15, metadata !10} ; [ DW_TAG_arg_variable ] [c] [line 13]
-!22 = metadata !{i32 13, i32 0, metadata !14, null}
-!23 = metadata !{metadata !"0x100\00a\0016\000", metadata !14, metadata !15, metadata !4} ; [ DW_TAG_auto_variable ] [a] [line 16]
-!24 = metadata !{i32 16, i32 0, metadata !14, null}
-!25 = metadata !{metadata !"0x100\00fptr\0018\000", metadata !14, metadata !15, metadata !26} ; [ DW_TAG_auto_variable ] [fptr] [line 18]
-!26 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!27 = metadata !{i32 18, i32 0, metadata !14, null}
-!28 = metadata !{i32 22, i32 0, metadata !14, null}
+!0 = !{!"0x11\004\00clang version 3.5 \000\00\000\00\000", !1, !2, !3, !13, !2, !2} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenCXX/debug-info-varargs.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"llvm/tools/clang/test/CodeGenCXX/debug-info-varargs.cpp", !"radar/13690847"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00A\003\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0x2e\00a\00a\00_ZN1A1aEiz\006\000\000\000\006\00256\000\006", !1, !"_ZTS1A", !7, null, null, null, i32 0, !12} ; [ DW_TAG_subprogram ] [line 6] [a]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9, !10, null}
+!9 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!10 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{i32 786468}
+!13 = !{!14}
+!14 = !{!"0x2e\00b\00b\00_Z1biz\0013\000\001\000\006\00256\000\0013", !1, !15, !16, null, void (i32, ...)* @_Z1biz, null, null, !2} ; [ DW_TAG_subprogram ] [line 13] [def] [b]
+!15 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenCXX/debug-info-varargs.cpp]
+!16 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = !{null, !10, null}
+!18 = !{i32 2, !"Dwarf Version", i32 2}
+!19 = !{i32 1, !"Debug Info Version", i32 2}
+!20 = !{!"clang version 3.5 "}
+!21 = !{!"0x101\00c\0016777229\000", !14, !15, !10} ; [ DW_TAG_arg_variable ] [c] [line 13]
+!22 = !MDLocation(line: 13, scope: !14)
+!23 = !{!"0x100\00a\0016\000", !14, !15, !4} ; [ DW_TAG_auto_variable ] [a] [line 16]
+!24 = !MDLocation(line: 16, scope: !14)
+!25 = !{!"0x100\00fptr\0018\000", !14, !15, !26} ; [ DW_TAG_auto_variable ] [fptr] [line 18]
+!26 = !{!"0xf\00\000\0064\0064\000\000", null, null, !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!27 = !MDLocation(line: 18, scope: !14)
+!28 = !MDLocation(line: 22, scope: !14)
diff --git a/test/DebugInfo/version.ll b/test/DebugInfo/version.ll
index 73d62fa..6ee33b6 100644
--- a/test/DebugInfo/version.ll
+++ b/test/DebugInfo/version.ll
@@ -18,15 +18,15 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 185475)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"CodeGen/dwarf-version.c", metadata !"test"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\006\000\001\000\006\00256\000\006", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!10 = metadata !{i32 7, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 (trunk 185475)\000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ]
+!1 = !{!"CodeGen/dwarf-version.c", !"test"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\006\000\001\000\006\00256\000\006", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 3}
+!10 = !MDLocation(line: 7, scope: !4)
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/ExecutionEngine/2002-12-16-ArgTest.ll b/test/ExecutionEngine/2002-12-16-ArgTest.ll
deleted file mode 100644
index eb2fe8c..0000000
--- a/test/ExecutionEngine/2002-12-16-ArgTest.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-@.LC0 = internal global [10 x i8] c"argc: %d\0A\00"		; <[10 x i8]*> [#uses=1]
-
-declare i32 @puts(i8*)
-
-define void @getoptions(i32* %argc) {
-bb0:
-	ret void
-}
-
-declare i32 @printf(i8*, ...)
-
-define i32 @main(i32 %argc, i8** %argv) {
-bb0:
-	call i32 (i8*, ...)* @printf( i8* getelementptr ([10 x i8]* @.LC0, i64 0, i64 0), i32 %argc )		; <i32>:0 [#uses=0]
-	%cast224 = bitcast i8** %argv to i8*		; <i8*> [#uses=1]
-	%local = alloca i8*		; <i8**> [#uses=3]
-	store i8* %cast224, i8** %local
-	%cond226 = icmp sle i32 %argc, 0		; <i1> [#uses=1]
-	br i1 %cond226, label %bb3, label %bb2
-bb2:		; preds = %bb2, %bb0
-	%cann-indvar = phi i32 [ 0, %bb0 ], [ %add1-indvar, %bb2 ]		; <i32> [#uses=2]
-	%add1-indvar = add i32 %cann-indvar, 1		; <i32> [#uses=2]
-	%cann-indvar-idxcast = sext i32 %cann-indvar to i64		; <i64> [#uses=1]
-	%CT = bitcast i8** %local to i8***		; <i8***> [#uses=1]
-	%reg115 = load i8*** %CT		; <i8**> [#uses=1]
-	%cast235 = getelementptr i8** %reg115, i64 %cann-indvar-idxcast		; <i8**> [#uses=1]
-	%reg117 = load i8** %cast235		; <i8*> [#uses=1]
-	%reg236 = call i32 @puts( i8* %reg117 )		; <i32> [#uses=0]
-	%cond239 = icmp slt i32 %add1-indvar, %argc		; <i1> [#uses=1]
-	br i1 %cond239, label %bb2, label %bb3
-bb3:		; preds = %bb2, %bb0
-	%cast243 = bitcast i8** %local to i32*		; <i32*> [#uses=1]
-	call void @getoptions( i32* %cast243 )
-	ret i32 0
-}
diff --git a/test/ExecutionEngine/2003-01-04-ArgumentBug.ll b/test/ExecutionEngine/2003-01-04-ArgumentBug.ll
deleted file mode 100644
index 68fdefe..0000000
--- a/test/ExecutionEngine/2003-01-04-ArgumentBug.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @foo(i32 %X, i32 %Y, double %A) {
-	%cond212 = fcmp une double %A, 1.000000e+00		; <i1> [#uses=1]
-	%cast110 = zext i1 %cond212 to i32		; <i32> [#uses=1]
-	ret i32 %cast110
-}
-
-define i32 @main() {
-	%reg212 = call i32 @foo( i32 0, i32 1, double 1.000000e+00 )		; <i32> [#uses=1]
-	ret i32 %reg212
-}
-
diff --git a/test/ExecutionEngine/2003-01-04-LoopTest.ll b/test/ExecutionEngine/2003-01-04-LoopTest.ll
deleted file mode 100644
index 5a0311d..0000000
--- a/test/ExecutionEngine/2003-01-04-LoopTest.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @main() {
-	call i32 @mylog( i32 4 )		; <i32>:1 [#uses=0]
-	ret i32 0
-}
-
-define internal i32 @mylog(i32 %num) {
-bb0:
-	br label %bb2
-bb2:		; preds = %bb2, %bb0
-	%reg112 = phi i32 [ 10, %bb2 ], [ 1, %bb0 ]		; <i32> [#uses=1]
-	%cann-indvar = phi i32 [ %cann-indvar, %bb2 ], [ 0, %bb0 ]		; <i32> [#uses=1]
-	%reg114 = add i32 %reg112, 1		; <i32> [#uses=2]
-	%cond222 = icmp slt i32 %reg114, %num		; <i1> [#uses=1]
-	br i1 %cond222, label %bb2, label %bb3
-bb3:		; preds = %bb2
-	ret i32 %reg114
-}
-
diff --git a/test/ExecutionEngine/2003-01-04-PhiTest.ll b/test/ExecutionEngine/2003-01-04-PhiTest.ll
deleted file mode 100644
index 48576e7..0000000
--- a/test/ExecutionEngine/2003-01-04-PhiTest.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @main() {
-; <label>:0
-	br label %Loop
-Loop:		; preds = %Loop, %0
-	%X = phi i32 [ 0, %0 ], [ 1, %Loop ]		; <i32> [#uses=1]
-	br i1 true, label %Out, label %Loop
-Out:		; preds = %Loop
-	ret i32 %X
-}
-
diff --git a/test/ExecutionEngine/2003-01-09-SARTest.ll b/test/ExecutionEngine/2003-01-09-SARTest.ll
deleted file mode 100644
index ed58e11..0000000
--- a/test/ExecutionEngine/2003-01-09-SARTest.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-; We were accidentally inverting the signedness of right shifts.  Whoops.
-
-define i32 @main() {
-	%X = ashr i32 -1, 16		; <i32> [#uses=1]
-	%Y = ashr i32 %X, 16		; <i32> [#uses=1]
-	%Z = add i32 %Y, 1		; <i32> [#uses=1]
-	ret i32 %Z
-}
-
diff --git a/test/ExecutionEngine/2003-01-10-FUCOM.ll b/test/ExecutionEngine/2003-01-10-FUCOM.ll
deleted file mode 100644
index 4960e59..0000000
--- a/test/ExecutionEngine/2003-01-10-FUCOM.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @main() {
-	%X = fadd double 0.000000e+00, 1.000000e+00		; <double> [#uses=1]
-	%Y = fsub double 0.000000e+00, 1.000000e+00		; <double> [#uses=2]
-	%Z = fcmp oeq double %X, %Y		; <i1> [#uses=0]
-	fadd double %Y, 0.000000e+00		; <double>:1 [#uses=0]
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/2003-01-15-AlignmentTest.ll b/test/ExecutionEngine/2003-01-15-AlignmentTest.ll
deleted file mode 100644
index 038d750..0000000
--- a/test/ExecutionEngine/2003-01-15-AlignmentTest.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @bar(i8* %X) {
-        ; pointer should be 4 byte aligned!
-	%P = alloca double		; <double*> [#uses=1]
-	%R = ptrtoint double* %P to i32		; <i32> [#uses=1]
-	%A = and i32 %R, 3		; <i32> [#uses=1]
-	ret i32 %A
-}
-
-define i32 @main() {
-	%SP = alloca i8		; <i8*> [#uses=1]
-	%X = add i32 0, 0		; <i32> [#uses=1]
-	alloca i8, i32 %X		; <i8*>:1 [#uses=0]
-	call i32 @bar( i8* %SP )		; <i32>:2 [#uses=1]
-	ret i32 %2
-}
diff --git a/test/ExecutionEngine/2003-05-07-ArgumentTest.ll b/test/ExecutionEngine/2003-05-07-ArgumentTest.ll
deleted file mode 100644
index 42db5fe..0000000
--- a/test/ExecutionEngine/2003-05-07-ArgumentTest.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: %lli %s test
-
-declare i32 @puts(i8*)
-
-define i32 @main(i32 %argc.1, i8** %argv.1) {
-	%tmp.5 = getelementptr i8** %argv.1, i64 1		; <i8**> [#uses=1]
-	%tmp.6 = load i8** %tmp.5		; <i8*> [#uses=1]
-	%tmp.0 = call i32 @puts( i8* %tmp.6 )		; <i32> [#uses=0]
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/2003-05-11-PHIRegAllocBug.ll b/test/ExecutionEngine/2003-05-11-PHIRegAllocBug.ll
deleted file mode 100644
index 45279ad..0000000
--- a/test/ExecutionEngine/2003-05-11-PHIRegAllocBug.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-target datalayout = "e-p:32:32"
-
-define i32 @main() {
-entry:
-	br label %endif
-then:		; No predecessors!
-	br label %endif
-endif:		; preds = %then, %entry
-	%x = phi i32 [ 4, %entry ], [ 27, %then ]		; <i32> [#uses=0]
-	%result = phi i32 [ 32, %then ], [ 0, %entry ]		; <i32> [#uses=0]
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/2003-06-04-bzip2-bug.ll b/test/ExecutionEngine/2003-06-04-bzip2-bug.ll
deleted file mode 100644
index 4342aa4..0000000
--- a/test/ExecutionEngine/2003-06-04-bzip2-bug.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-; Testcase distilled from 256.bzip2.
-
-target datalayout = "e-p:32:32"
-
-define i32 @main() {
-entry:
-	br label %loopentry.0
-loopentry.0:		; preds = %loopentry.0, %entry
-	%h.0 = phi i32 [ %tmp.2, %loopentry.0 ], [ -1, %entry ]		; <i32> [#uses=1]
-	%tmp.2 = add i32 %h.0, 1		; <i32> [#uses=3]
-	%tmp.4 = icmp ne i32 %tmp.2, 0		; <i1> [#uses=1]
-	br i1 %tmp.4, label %loopentry.0, label %loopentry.1
-loopentry.1:		; preds = %loopentry.0
-	%h.1 = phi i32 [ %tmp.2, %loopentry.0 ]		; <i32> [#uses=1]
-	ret i32 %h.1
-}
-
diff --git a/test/ExecutionEngine/2003-06-05-PHIBug.ll b/test/ExecutionEngine/2003-06-05-PHIBug.ll
deleted file mode 100644
index 03b66c4..0000000
--- a/test/ExecutionEngine/2003-06-05-PHIBug.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-; Testcase distilled from 256.bzip2.
-
-target datalayout = "e-p:32:32"
-
-define i32 @main() {
-entry:
-	%X = add i32 1, -1		; <i32> [#uses=3]
-	br label %Next
-Next:		; preds = %entry
-	%A = phi i32 [ %X, %entry ]		; <i32> [#uses=0]
-	%B = phi i32 [ %X, %entry ]		; <i32> [#uses=0]
-	%C = phi i32 [ %X, %entry ]		; <i32> [#uses=1]
-	ret i32 %C
-}
-
diff --git a/test/ExecutionEngine/2003-08-15-AllocaAssertion.ll b/test/ExecutionEngine/2003-08-15-AllocaAssertion.ll
deleted file mode 100644
index bee409c..0000000
--- a/test/ExecutionEngine/2003-08-15-AllocaAssertion.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-; This testcase failed to work because two variable sized allocas confused the
-; local register allocator.
-
-define i32 @main(i32 %X) {
-	%A = alloca i32, i32 %X		; <i32*> [#uses=0]
-	%B = alloca float, i32 %X		; <float*> [#uses=0]
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/2003-08-21-EnvironmentTest.ll b/test/ExecutionEngine/2003-08-21-EnvironmentTest.ll
deleted file mode 100644
index 63303fc..0000000
--- a/test/ExecutionEngine/2003-08-21-EnvironmentTest.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-;
-; Regression Test: EnvironmentTest.ll
-;
-; Description:
-;	This is a regression test that verifies that the JIT passes the
-;	environment to the main() function.
-;
-
-
-declare i32 @strlen(i8*)
-
-define i32 @main(i32 %argc.1, i8** %argv.1, i8** %envp.1) {
-	%tmp.2 = load i8** %envp.1		; <i8*> [#uses=1]
-	%tmp.3 = call i32 @strlen( i8* %tmp.2 )		; <i32> [#uses=1]
-	%T = icmp eq i32 %tmp.3, 0		; <i1> [#uses=1]
-	%R = zext i1 %T to i32		; <i32> [#uses=1]
-	ret i32 %R
-}
-
diff --git a/test/ExecutionEngine/2003-08-23-RegisterAllocatePhysReg.ll b/test/ExecutionEngine/2003-08-23-RegisterAllocatePhysReg.ll
deleted file mode 100644
index 8fb1bbb..0000000
--- a/test/ExecutionEngine/2003-08-23-RegisterAllocatePhysReg.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-; This testcase exposes a bug in the local register allocator where it runs out
-; of registers (due to too many overlapping live ranges), but then attempts to
-; use the ESP register (which is not allocatable) to hold a value.
-
-define i32 @main(i32 %A) {
-        ; ESP gets used again...
-	%Ap2 = alloca i32, i32 %A		; <i32*> [#uses=11]
-	; Produce lots of overlapping live ranges
-        %B = add i32 %A, 1		; <i32> [#uses=1]
-	%C = add i32 %A, 2		; <i32> [#uses=1]
-	%D = add i32 %A, 3		; <i32> [#uses=1]
-	%E = add i32 %A, 4		; <i32> [#uses=1]
-	%F = add i32 %A, 5		; <i32> [#uses=1]
-	%G = add i32 %A, 6		; <i32> [#uses=1]
-	%H = add i32 %A, 7		; <i32> [#uses=1]
-	%I = add i32 %A, 8		; <i32> [#uses=1]
-	%J = add i32 %A, 9		; <i32> [#uses=1]
-	%K = add i32 %A, 10		; <i32> [#uses=1]
-        ; Uses of all of the values
-	store i32 %A, i32* %Ap2
-	store i32 %B, i32* %Ap2
-	store i32 %C, i32* %Ap2
-	store i32 %D, i32* %Ap2
-	store i32 %E, i32* %Ap2
-	store i32 %F, i32* %Ap2
-	store i32 %G, i32* %Ap2
-	store i32 %H, i32* %Ap2
-	store i32 %I, i32* %Ap2
-	store i32 %J, i32* %Ap2
-	store i32 %K, i32* %Ap2
-	ret i32 0
-}
diff --git a/test/ExecutionEngine/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll b/test/ExecutionEngine/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
deleted file mode 100644
index 6513540..0000000
--- a/test/ExecutionEngine/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-@A = global i32 0		; <i32*> [#uses=1]
-
-define i32 @main() {
-	%Ret = call i32 @test( i1 true, i32 0 )		; <i32> [#uses=1]
-	ret i32 %Ret
-}
-
-define i32 @test(i1 %c, i32 %A) {
-	br i1 %c, label %Taken1, label %NotTaken
-Cont:		; preds = %Taken1, %NotTaken
-	%V = phi i32 [ 0, %NotTaken ], [ sub (i32 ptrtoint (i32* @A to i32), i32 1234), %Taken1 ]		; <i32> [#uses=0]
-	ret i32 0
-NotTaken:		; preds = %0
-	br label %Cont
-Taken1:		; preds = %0
-	%B = icmp eq i32 %A, 0		; <i1> [#uses=1]
-	br i1 %B, label %Cont, label %ExitError
-ExitError:		; preds = %Taken1
-	ret i32 12
-}
-
diff --git a/test/ExecutionEngine/2005-12-02-TailCallBug.ll b/test/ExecutionEngine/2005-12-02-TailCallBug.ll
deleted file mode 100644
index 2ac8ad1..0000000
--- a/test/ExecutionEngine/2005-12-02-TailCallBug.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; PR672
-; RUN: %lli %s
-
-define i32 @main() {
-	%f = bitcast i32 (i32, i32*, i32)* @check_tail to i32*		; <i32*> [#uses=1]
-	%res = tail call fastcc i32 @check_tail( i32 10, i32* %f, i32 10 )		; <i32> [#uses=1]
-	ret i32 %res
-}
-
-define fastcc i32 @check_tail(i32 %x, i32* %f, i32 %g) {
-	%tmp1 = icmp sgt i32 %x, 0		; <i1> [#uses=1]
-	br i1 %tmp1, label %if-then, label %if-else
-if-then:		; preds = %0
-	%fun_ptr = bitcast i32* %f to i32 (i32, i32*, i32)*		; <i32 (i32, i32*, i32)*> [#uses=1]
-	%arg1 = add i32 %x, -1		; <i32> [#uses=1]
-	%res = tail call fastcc i32 %fun_ptr( i32 %arg1, i32* %f, i32 %g )		; <i32> [#uses=1]
-	ret i32 %res
-if-else:		; preds = %0
-	ret i32 %x
-}
-
diff --git a/test/ExecutionEngine/2007-12-10-APIntLoadStore.ll b/test/ExecutionEngine/2007-12-10-APIntLoadStore.ll
deleted file mode 100644
index 4183611..0000000
--- a/test/ExecutionEngine/2007-12-10-APIntLoadStore.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: %lli -force-interpreter %s
-; PR1836
-
-define i32 @main() {
-entry:
-    %retval = alloca i32        ; <i32*> [#uses=2]
-    %tmp = alloca i32       ; <i32*> [#uses=2]
-    %x = alloca i75, align 16       ; <i75*> [#uses=1]
-    %"alloca point" = bitcast i32 0 to i32      ; <i32> [#uses=0]
-    store i75 999, i75* %x, align 16
-    store i32 0, i32* %tmp, align 4
-    %tmp1 = load i32* %tmp, align 4     ; <i32> [#uses=1]
-    store i32 %tmp1, i32* %retval, align 4
-    br label %return
-
-return:     ; preds = %entry
-    %retval2 = load i32* %retval        ; <i32> [#uses=1]
-    ret i32 %retval2
-}
diff --git a/test/ExecutionEngine/2008-06-05-APInt-OverAShr.ll b/test/ExecutionEngine/2008-06-05-APInt-OverAShr.ll
deleted file mode 100644
index 349db69..0000000
--- a/test/ExecutionEngine/2008-06-05-APInt-OverAShr.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: %lli -force-interpreter=true %s | FileCheck %s
-; CHECK: 1
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i686-pc-linux-gnu"
-@.str = internal constant [10 x i8] c"MSB = %d\0A\00"		; <[10 x i8]*> [#uses=1]
-
-define i65 @foo(i65 %x) {
-entry:
-	%x_addr = alloca i65		; <i65*> [#uses=2]
-	%retval = alloca i65		; <i65*> [#uses=2]
-	%tmp = alloca i65		; <i65*> [#uses=2]
-	%"alloca point" = bitcast i65 0 to i65		; <i65> [#uses=0]
-	store i65 %x, i65* %x_addr
-	%tmp1 = load i65* %x_addr, align 4		; <i65> [#uses=1]
-	%tmp2 = ashr i65 %tmp1, 65		; <i65> [#uses=1]
-	store i65 %tmp2, i65* %tmp, align 4
-	%tmp3 = load i65* %tmp, align 4		; <i65> [#uses=1]
-	store i65 %tmp3, i65* %retval, align 4
-	br label %return
-
-return:		; preds = %entry
-	%retval4 = load i65* %retval		; <i65> [#uses=1]
-	ret i65 %retval4
-}
-
-define i32 @main() {
-entry:
-	%retval = alloca i32		; <i32*> [#uses=1]
-	%iftmp.0 = alloca i32		; <i32*> [#uses=3]
-	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
-	%tmp = call i65 @foo( i65 -9 )		; <i65> [#uses=1]
-	%tmp1 = lshr i65 %tmp, 64		; <i65> [#uses=1]
-	%tmp2 = xor i65 %tmp1, 1		; <i65> [#uses=1]
-	%tmp3 = and i65 %tmp2, 1		; <i65> [#uses=1]
-	%tmp34 = trunc i65 %tmp3 to i8		; <i8> [#uses=1]
-	%toBool = icmp ne i8 %tmp34, 0		; <i1> [#uses=1]
-	br i1 %toBool, label %cond_true, label %cond_false
-
-cond_true:		; preds = %entry
-	store i32 0, i32* %iftmp.0, align 4
-	br label %cond_next
-
-cond_false:		; preds = %entry
-	store i32 1, i32* %iftmp.0, align 4
-	br label %cond_next
-
-cond_next:		; preds = %cond_false, %cond_true
-	%tmp5 = getelementptr [10 x i8]* @.str, i32 0, i32 0		; <i8*> [#uses=1]
-	%tmp6 = load i32* %iftmp.0, align 4		; <i32> [#uses=1]
-	%tmp7 = call i32 (i8*, ...)* @printf( i8* noalias  %tmp5, i32 %tmp6 ) nounwind 		; <i32> [#uses=0]
-	br label %return
-
-return:		; preds = %cond_next
-    store i32 0, i32* %retval, align 4
-	%retval8 = load i32* %retval		; <i32> [#uses=1]
-	ret i32 %retval8
-}
-
-declare i32 @printf(i8* noalias , ...) nounwind 
diff --git a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
index 45279ad..2f9b143 100644
--- a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
@@ -1,7 +1,5 @@
 ; RUN: %lli %s > /dev/null
 
-target datalayout = "e-p:32:32"
-
 define i32 @main() {
 entry:
 	br label %endif
diff --git a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
index 4342aa4..3a25789 100644
--- a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
@@ -2,8 +2,6 @@
 
 ; Testcase distilled from 256.bzip2.
 
-target datalayout = "e-p:32:32"
-
 define i32 @main() {
 entry:
 	br label %loopentry.0
diff --git a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
index 03b66c4..8a62e06 100644
--- a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
@@ -2,8 +2,6 @@
 
 ; Testcase distilled from 256.bzip2.
 
-target datalayout = "e-p:32:32"
-
 define i32 @main() {
 entry:
 	%X = add i32 1, -1		; <i32> [#uses=3]
diff --git a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
deleted file mode 100644
index 8bf03de..0000000
--- a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: %lli -force-interpreter=true %s > /dev/null
-
-define i32 @main() {
-       %a = add i32 0, undef
-       %b = fadd float 0.0, undef
-       %c = fadd double 0.0, undef
-       ret i32 0
-}
diff --git a/test/ExecutionEngine/MCJIT/eh-lg-pic.ll b/test/ExecutionEngine/MCJIT/eh-lg-pic.ll
index bd097f2..9277ec4 100644
--- a/test/ExecutionEngine/MCJIT/eh-lg-pic.ll
+++ b/test/ExecutionEngine/MCJIT/eh-lg-pic.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -relocation-model=pic -code-model=large %s
-; XFAIL: cygwin, win32, mingw, mips, i686, i386, aarch64, arm
+; XFAIL: cygwin, win32, mingw, mips, i686, i386, aarch64, arm, asan, msan
 declare i8* @__cxa_allocate_exception(i64)
 declare void @__cxa_throw(i8*, i8*, i8*)
 declare i32 @__gxx_personality_v0(...)
diff --git a/test/ExecutionEngine/MCJIT/eh-sm-pic.ll b/test/ExecutionEngine/MCJIT/eh-sm-pic.ll
index f3e61dc..37fb628 100644
--- a/test/ExecutionEngine/MCJIT/eh-sm-pic.ll
+++ b/test/ExecutionEngine/MCJIT/eh-sm-pic.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -relocation-model=pic -code-model=small %s
-; XFAIL: cygwin, win32, mingw, mips, i686, i386, darwin, aarch64, arm
+; XFAIL: cygwin, win32, mingw, mips, i686, i386, darwin, aarch64, arm, asan, msan
 declare i8* @__cxa_allocate_exception(i64)
 declare void @__cxa_throw(i8*, i8*, i8*)
 declare i32 @__gxx_personality_v0(...)
diff --git a/test/ExecutionEngine/MCJIT/eh.ll b/test/ExecutionEngine/MCJIT/eh.ll
index aa81bb5..9f73e3a 100644
--- a/test/ExecutionEngine/MCJIT/eh.ll
+++ b/test/ExecutionEngine/MCJIT/eh.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli %s
-; XFAIL: arm, cygwin, win32, mingw
+; XFAIL: arm, cygwin, win32, mingw, asan, msan
 declare i8* @__cxa_allocate_exception(i64)
 declare void @__cxa_throw(i8*, i8*, i8*)
 declare i32 @__gxx_personality_v0(...)
diff --git a/test/ExecutionEngine/MCJIT/multi-module-eh-a.ll b/test/ExecutionEngine/MCJIT/multi-module-eh-a.ll
index 10cfdcd..8626626 100644
--- a/test/ExecutionEngine/MCJIT/multi-module-eh-a.ll
+++ b/test/ExecutionEngine/MCJIT/multi-module-eh-a.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -extra-module=%p/Inputs/multi-module-eh-b.ll %s
-; XFAIL: arm, cygwin, win32, mingw
+; XFAIL: arm, cygwin, win32, mingw, asan, msan
 declare i8* @__cxa_allocate_exception(i64)
 declare void @__cxa_throw(i8*, i8*, i8*)
 declare i32 @__gxx_personality_v0(...)
diff --git a/test/ExecutionEngine/MCJIT/non-extern-addend-smallcodemodel.ll b/test/ExecutionEngine/MCJIT/non-extern-addend-smallcodemodel.ll
deleted file mode 100644
index 03de30a..0000000
--- a/test/ExecutionEngine/MCJIT/non-extern-addend-smallcodemodel.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: %lli -code-model=small %s > /dev/null
-; XFAIL: mips
-;
-; FIXME: Merge this file with non-extern-addend.ll once AArch64 supports PC-rel
-;        relocations in ELF. (The code is identical, only the run line differs).
-;
-define i32 @foo(i32 %x, i32 %y, double %d) {
-entry:
-  %d.int64 = bitcast double %d to i64
-  %d.top64 = lshr i64 %d.int64, 32
-  %d.top   = trunc i64 %d.top64 to i32
-  %d.bottom = trunc i64 %d.int64 to i32
-  %topCorrect = icmp eq i32 %d.top, 3735928559
-  %bottomCorrect = icmp eq i32 %d.bottom, 4277009102
-  %right = and i1 %topCorrect, %bottomCorrect
-  %nRight = xor i1 %right, true
-  %retVal = zext i1 %nRight to i32
-  ret i32 %retVal
-}
-
-define i32 @main() {
-entry:
-  %call = call i32 @foo(i32 0, i32 1, double 0xDEADBEEFFEEDFACE)
-  ret i32 %call
-}
diff --git a/test/ExecutionEngine/OrcJIT/2002-12-16-ArgTest.ll b/test/ExecutionEngine/OrcJIT/2002-12-16-ArgTest.ll
new file mode 100644
index 0000000..e2fee8d
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2002-12-16-ArgTest.ll
@@ -0,0 +1,37 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+@.LC0 = internal global [10 x i8] c"argc: %d\0A\00"		; <[10 x i8]*> [#uses=1]
+
+declare i32 @puts(i8*)
+
+define void @getoptions(i32* %argc) {
+bb0:
+	ret void
+}
+
+declare i32 @printf(i8*, ...)
+
+define i32 @main(i32 %argc, i8** %argv) {
+bb0:
+	call i32 (i8*, ...)* @printf( i8* getelementptr ([10 x i8]* @.LC0, i64 0, i64 0), i32 %argc )		; <i32>:0 [#uses=0]
+	%cast224 = bitcast i8** %argv to i8*		; <i8*> [#uses=1]
+	%local = alloca i8*		; <i8**> [#uses=3]
+	store i8* %cast224, i8** %local
+	%cond226 = icmp sle i32 %argc, 0		; <i1> [#uses=1]
+	br i1 %cond226, label %bb3, label %bb2
+bb2:		; preds = %bb2, %bb0
+	%cann-indvar = phi i32 [ 0, %bb0 ], [ %add1-indvar, %bb2 ]		; <i32> [#uses=2]
+	%add1-indvar = add i32 %cann-indvar, 1		; <i32> [#uses=2]
+	%cann-indvar-idxcast = sext i32 %cann-indvar to i64		; <i64> [#uses=1]
+	%CT = bitcast i8** %local to i8***		; <i8***> [#uses=1]
+	%reg115 = load i8*** %CT		; <i8**> [#uses=1]
+	%cast235 = getelementptr i8** %reg115, i64 %cann-indvar-idxcast		; <i8**> [#uses=1]
+	%reg117 = load i8** %cast235		; <i8*> [#uses=1]
+	%reg236 = call i32 @puts( i8* %reg117 )		; <i32> [#uses=0]
+	%cond239 = icmp slt i32 %add1-indvar, %argc		; <i1> [#uses=1]
+	br i1 %cond239, label %bb2, label %bb3
+bb3:		; preds = %bb2, %bb0
+	%cast243 = bitcast i8** %local to i32*		; <i32*> [#uses=1]
+	call void @getoptions( i32* %cast243 )
+	ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcJIT/2003-01-04-ArgumentBug.ll b/test/ExecutionEngine/OrcJIT/2003-01-04-ArgumentBug.ll
new file mode 100644
index 0000000..67425a9
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-01-04-ArgumentBug.ll
@@ -0,0 +1,13 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @foo(i32 %X, i32 %Y, double %A) {
+	%cond212 = fcmp une double %A, 1.000000e+00		; <i1> [#uses=1]
+	%cast110 = zext i1 %cond212 to i32		; <i32> [#uses=1]
+	ret i32 %cast110
+}
+
+define i32 @main() {
+	%reg212 = call i32 @foo( i32 0, i32 1, double 1.000000e+00 )		; <i32> [#uses=1]
+	ret i32 %reg212
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2003-01-04-LoopTest.ll b/test/ExecutionEngine/OrcJIT/2003-01-04-LoopTest.ll
new file mode 100644
index 0000000..cf805ea
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-01-04-LoopTest.ll
@@ -0,0 +1,20 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() {
+	call i32 @mylog( i32 4 )		; <i32>:1 [#uses=0]
+	ret i32 0
+}
+
+define internal i32 @mylog(i32 %num) {
+bb0:
+	br label %bb2
+bb2:		; preds = %bb2, %bb0
+	%reg112 = phi i32 [ 10, %bb2 ], [ 1, %bb0 ]		; <i32> [#uses=1]
+	%cann-indvar = phi i32 [ %cann-indvar, %bb2 ], [ 0, %bb0 ]		; <i32> [#uses=1]
+	%reg114 = add i32 %reg112, 1		; <i32> [#uses=2]
+	%cond222 = icmp slt i32 %reg114, %num		; <i1> [#uses=1]
+	br i1 %cond222, label %bb2, label %bb3
+bb3:		; preds = %bb2
+	ret i32 %reg114
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2003-01-04-PhiTest.ll b/test/ExecutionEngine/OrcJIT/2003-01-04-PhiTest.ll
new file mode 100644
index 0000000..b8b8519
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-01-04-PhiTest.ll
@@ -0,0 +1,12 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() {
+; <label>:0
+	br label %Loop
+Loop:		; preds = %Loop, %0
+	%X = phi i32 [ 0, %0 ], [ 1, %Loop ]		; <i32> [#uses=1]
+	br i1 true, label %Out, label %Loop
+Out:		; preds = %Loop
+	ret i32 %X
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2003-01-09-SARTest.ll b/test/ExecutionEngine/OrcJIT/2003-01-09-SARTest.ll
new file mode 100644
index 0000000..85b0031
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-01-09-SARTest.ll
@@ -0,0 +1,11 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+; We were accidentally inverting the signedness of right shifts.  Whoops.
+
+define i32 @main() {
+	%X = ashr i32 -1, 16		; <i32> [#uses=1]
+	%Y = ashr i32 %X, 16		; <i32> [#uses=1]
+	%Z = add i32 %Y, 1		; <i32> [#uses=1]
+	ret i32 %Z
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2003-01-10-FUCOM.ll b/test/ExecutionEngine/OrcJIT/2003-01-10-FUCOM.ll
new file mode 100644
index 0000000..66b21c9
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-01-10-FUCOM.ll
@@ -0,0 +1,10 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() {
+	%X = fadd double 0.000000e+00, 1.000000e+00		; <double> [#uses=1]
+	%Y = fsub double 0.000000e+00, 1.000000e+00		; <double> [#uses=2]
+	%Z = fcmp oeq double %X, %Y		; <i1> [#uses=0]
+	fadd double %Y, 0.000000e+00		; <double>:1 [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2003-01-15-AlignmentTest.ll b/test/ExecutionEngine/OrcJIT/2003-01-15-AlignmentTest.ll
new file mode 100644
index 0000000..1f27c1f
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-01-15-AlignmentTest.ll
@@ -0,0 +1,17 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @bar(i8* %X) {
+        ; pointer should be 4 byte aligned!
+	%P = alloca double		; <double*> [#uses=1]
+	%R = ptrtoint double* %P to i32		; <i32> [#uses=1]
+	%A = and i32 %R, 3		; <i32> [#uses=1]
+	ret i32 %A
+}
+
+define i32 @main() {
+	%SP = alloca i8		; <i8*> [#uses=1]
+	%X = add i32 0, 0		; <i32> [#uses=1]
+	alloca i8, i32 %X		; <i8*>:1 [#uses=0]
+	call i32 @bar( i8* %SP )		; <i32>:2 [#uses=1]
+	ret i32 %2
+}
diff --git a/test/ExecutionEngine/2003-05-06-LivenessClobber.ll b/test/ExecutionEngine/OrcJIT/2003-05-06-LivenessClobber.ll
index 576ef7c..576ef7c 100644
--- a/test/ExecutionEngine/2003-05-06-LivenessClobber.ll
+++ b/test/ExecutionEngine/OrcJIT/2003-05-06-LivenessClobber.ll
diff --git a/test/ExecutionEngine/OrcJIT/2003-05-07-ArgumentTest.ll b/test/ExecutionEngine/OrcJIT/2003-05-07-ArgumentTest.ll
new file mode 100644
index 0000000..b45178e
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-05-07-ArgumentTest.ll
@@ -0,0 +1,11 @@
+; RUN: %lli -use-orcmcjit %s test
+
+declare i32 @puts(i8*)
+
+define i32 @main(i32 %argc.1, i8** %argv.1) {
+	%tmp.5 = getelementptr i8** %argv.1, i64 1		; <i8**> [#uses=1]
+	%tmp.6 = load i8** %tmp.5		; <i8*> [#uses=1]
+	%tmp.0 = call i32 @puts( i8* %tmp.6 )		; <i32> [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2003-05-11-PHIRegAllocBug.ll b/test/ExecutionEngine/OrcJIT/2003-05-11-PHIRegAllocBug.ll
new file mode 100644
index 0000000..68402d9
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-05-11-PHIRegAllocBug.ll
@@ -0,0 +1,13 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() {
+entry:
+	br label %endif
+then:		; No predecessors!
+	br label %endif
+endif:		; preds = %then, %entry
+	%x = phi i32 [ 4, %entry ], [ 27, %then ]		; <i32> [#uses=0]
+	%result = phi i32 [ 32, %then ], [ 0, %entry ]		; <i32> [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2003-06-04-bzip2-bug.ll b/test/ExecutionEngine/OrcJIT/2003-06-04-bzip2-bug.ll
new file mode 100644
index 0000000..0907993
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-06-04-bzip2-bug.ll
@@ -0,0 +1,17 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+; Testcase distilled from 256.bzip2.
+
+define i32 @main() {
+entry:
+	br label %loopentry.0
+loopentry.0:		; preds = %loopentry.0, %entry
+	%h.0 = phi i32 [ %tmp.2, %loopentry.0 ], [ -1, %entry ]		; <i32> [#uses=1]
+	%tmp.2 = add i32 %h.0, 1		; <i32> [#uses=3]
+	%tmp.4 = icmp ne i32 %tmp.2, 0		; <i1> [#uses=1]
+	br i1 %tmp.4, label %loopentry.0, label %loopentry.1
+loopentry.1:		; preds = %loopentry.0
+	%h.1 = phi i32 [ %tmp.2, %loopentry.0 ]		; <i32> [#uses=1]
+	ret i32 %h.1
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2003-06-05-PHIBug.ll b/test/ExecutionEngine/OrcJIT/2003-06-05-PHIBug.ll
new file mode 100644
index 0000000..2eb497b
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-06-05-PHIBug.ll
@@ -0,0 +1,15 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+; Testcase distilled from 256.bzip2.
+
+define i32 @main() {
+entry:
+	%X = add i32 1, -1		; <i32> [#uses=3]
+	br label %Next
+Next:		; preds = %entry
+	%A = phi i32 [ %X, %entry ]		; <i32> [#uses=0]
+	%B = phi i32 [ %X, %entry ]		; <i32> [#uses=0]
+	%C = phi i32 [ %X, %entry ]		; <i32> [#uses=1]
+	ret i32 %C
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2003-08-15-AllocaAssertion.ll b/test/ExecutionEngine/OrcJIT/2003-08-15-AllocaAssertion.ll
new file mode 100644
index 0000000..290d5a2
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-08-15-AllocaAssertion.ll
@@ -0,0 +1,11 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+; This testcase failed to work because two variable sized allocas confused the
+; local register allocator.
+
+define i32 @main(i32 %X) {
+	%A = alloca i32, i32 %X		; <i32*> [#uses=0]
+	%B = alloca float, i32 %X		; <float*> [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2003-08-21-EnvironmentTest.ll b/test/ExecutionEngine/OrcJIT/2003-08-21-EnvironmentTest.ll
new file mode 100644
index 0000000..f73f10e
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-08-21-EnvironmentTest.ll
@@ -0,0 +1,21 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+;
+; Regression Test: EnvironmentTest.ll
+;
+; Description:
+;	This is a regression test that verifies that the JIT passes the
+;	environment to the main() function.
+;
+
+
+declare i32 @strlen(i8*)
+
+define i32 @main(i32 %argc.1, i8** %argv.1, i8** %envp.1) {
+	%tmp.2 = load i8** %envp.1		; <i8*> [#uses=1]
+	%tmp.3 = call i32 @strlen( i8* %tmp.2 )		; <i32> [#uses=1]
+	%T = icmp eq i32 %tmp.3, 0		; <i1> [#uses=1]
+	%R = zext i1 %T to i32		; <i32> [#uses=1]
+	ret i32 %R
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2003-08-23-RegisterAllocatePhysReg.ll b/test/ExecutionEngine/OrcJIT/2003-08-23-RegisterAllocatePhysReg.ll
new file mode 100644
index 0000000..bd26c38
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-08-23-RegisterAllocatePhysReg.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+; This testcase exposes a bug in the local register allocator where it runs out
+; of registers (due to too many overlapping live ranges), but then attempts to
+; use the ESP register (which is not allocatable) to hold a value.
+
+define i32 @main(i32 %A) {
+        ; ESP gets used again...
+	%Ap2 = alloca i32, i32 %A		; <i32*> [#uses=11]
+	; Produce lots of overlapping live ranges
+        %B = add i32 %A, 1		; <i32> [#uses=1]
+	%C = add i32 %A, 2		; <i32> [#uses=1]
+	%D = add i32 %A, 3		; <i32> [#uses=1]
+	%E = add i32 %A, 4		; <i32> [#uses=1]
+	%F = add i32 %A, 5		; <i32> [#uses=1]
+	%G = add i32 %A, 6		; <i32> [#uses=1]
+	%H = add i32 %A, 7		; <i32> [#uses=1]
+	%I = add i32 %A, 8		; <i32> [#uses=1]
+	%J = add i32 %A, 9		; <i32> [#uses=1]
+	%K = add i32 %A, 10		; <i32> [#uses=1]
+        ; Uses of all of the values
+	store i32 %A, i32* %Ap2
+	store i32 %B, i32* %Ap2
+	store i32 %C, i32* %Ap2
+	store i32 %D, i32* %Ap2
+	store i32 %E, i32* %Ap2
+	store i32 %F, i32* %Ap2
+	store i32 %G, i32* %Ap2
+	store i32 %H, i32* %Ap2
+	store i32 %I, i32* %Ap2
+	store i32 %J, i32* %Ap2
+	store i32 %K, i32* %Ap2
+	ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll b/test/ExecutionEngine/OrcJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
new file mode 100644
index 0000000..c59ad32
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
@@ -0,0 +1,23 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+@A = global i32 0		; <i32*> [#uses=1]
+
+define i32 @main() {
+	%Ret = call i32 @test( i1 true, i32 0 )		; <i32> [#uses=1]
+	ret i32 %Ret
+}
+
+define i32 @test(i1 %c, i32 %A) {
+	br i1 %c, label %Taken1, label %NotTaken
+Cont:		; preds = %Taken1, %NotTaken
+	%V = phi i32 [ 0, %NotTaken ], [ sub (i32 ptrtoint (i32* @A to i32), i32 1234), %Taken1 ]		; <i32> [#uses=0]
+	ret i32 0
+NotTaken:		; preds = %0
+	br label %Cont
+Taken1:		; preds = %0
+	%B = icmp eq i32 %A, 0		; <i1> [#uses=1]
+	br i1 %B, label %Cont, label %ExitError
+ExitError:		; preds = %Taken1
+	ret i32 12
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2005-12-02-TailCallBug.ll b/test/ExecutionEngine/OrcJIT/2005-12-02-TailCallBug.ll
new file mode 100644
index 0000000..7f1d3b0
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2005-12-02-TailCallBug.ll
@@ -0,0 +1,22 @@
+; PR672
+; RUN: %lli -use-orcmcjit %s
+; XFAIL: mcjit-ia32
+
+define i32 @main() {
+	%f = bitcast i32 (i32, i32*, i32)* @check_tail to i32*		; <i32*> [#uses=1]
+	%res = tail call fastcc i32 @check_tail( i32 10, i32* %f, i32 10 )		; <i32> [#uses=1]
+	ret i32 %res
+}
+
+define fastcc i32 @check_tail(i32 %x, i32* %f, i32 %g) {
+	%tmp1 = icmp sgt i32 %x, 0		; <i1> [#uses=1]
+	br i1 %tmp1, label %if-then, label %if-else
+if-then:		; preds = %0
+	%fun_ptr = bitcast i32* %f to i32 (i32, i32*, i32)*		; <i32 (i32, i32*, i32)*> [#uses=1]
+	%arg1 = add i32 %x, -1		; <i32> [#uses=1]
+	%res = tail call fastcc i32 %fun_ptr( i32 %arg1, i32* %f, i32 %g )		; <i32> [#uses=1]
+	ret i32 %res
+if-else:		; preds = %0
+	ret i32 %x
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/2007-12-10-APIntLoadStore.ll b/test/ExecutionEngine/OrcJIT/2007-12-10-APIntLoadStore.ll
new file mode 100644
index 0000000..efe5d83
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2007-12-10-APIntLoadStore.ll
@@ -0,0 +1,19 @@
+; RUN: %lli -use-orcmcjit -force-interpreter %s
+; PR1836
+
+define i32 @main() {
+entry:
+    %retval = alloca i32        ; <i32*> [#uses=2]
+    %tmp = alloca i32       ; <i32*> [#uses=2]
+    %x = alloca i75, align 16       ; <i75*> [#uses=1]
+    %"alloca point" = bitcast i32 0 to i32      ; <i32> [#uses=0]
+    store i75 999, i75* %x, align 16
+    store i32 0, i32* %tmp, align 4
+    %tmp1 = load i32* %tmp, align 4     ; <i32> [#uses=1]
+    store i32 %tmp1, i32* %retval, align 4
+    br label %return
+
+return:     ; preds = %entry
+    %retval2 = load i32* %retval        ; <i32> [#uses=1]
+    ret i32 %retval2
+}
diff --git a/test/ExecutionEngine/OrcJIT/2008-06-05-APInt-OverAShr.ll b/test/ExecutionEngine/OrcJIT/2008-06-05-APInt-OverAShr.ll
new file mode 100644
index 0000000..6b27528
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2008-06-05-APInt-OverAShr.ll
@@ -0,0 +1,60 @@
+; RUN: %lli -use-orcmcjit -force-interpreter=true %s | FileCheck %s
+; CHECK: 1
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i686-pc-linux-gnu"
+@.str = internal constant [10 x i8] c"MSB = %d\0A\00"		; <[10 x i8]*> [#uses=1]
+
+define i65 @foo(i65 %x) {
+entry:
+	%x_addr = alloca i65		; <i65*> [#uses=2]
+	%retval = alloca i65		; <i65*> [#uses=2]
+	%tmp = alloca i65		; <i65*> [#uses=2]
+	%"alloca point" = bitcast i65 0 to i65		; <i65> [#uses=0]
+	store i65 %x, i65* %x_addr
+	%tmp1 = load i65* %x_addr, align 4		; <i65> [#uses=1]
+	%tmp2 = ashr i65 %tmp1, 65		; <i65> [#uses=1]
+	store i65 %tmp2, i65* %tmp, align 4
+	%tmp3 = load i65* %tmp, align 4		; <i65> [#uses=1]
+	store i65 %tmp3, i65* %retval, align 4
+	br label %return
+
+return:		; preds = %entry
+	%retval4 = load i65* %retval		; <i65> [#uses=1]
+	ret i65 %retval4
+}
+
+define i32 @main() {
+entry:
+	%retval = alloca i32		; <i32*> [#uses=1]
+	%iftmp.0 = alloca i32		; <i32*> [#uses=3]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	%tmp = call i65 @foo( i65 -9 )		; <i65> [#uses=1]
+	%tmp1 = lshr i65 %tmp, 64		; <i65> [#uses=1]
+	%tmp2 = xor i65 %tmp1, 1		; <i65> [#uses=1]
+	%tmp3 = and i65 %tmp2, 1		; <i65> [#uses=1]
+	%tmp34 = trunc i65 %tmp3 to i8		; <i8> [#uses=1]
+	%toBool = icmp ne i8 %tmp34, 0		; <i1> [#uses=1]
+	br i1 %toBool, label %cond_true, label %cond_false
+
+cond_true:		; preds = %entry
+	store i32 0, i32* %iftmp.0, align 4
+	br label %cond_next
+
+cond_false:		; preds = %entry
+	store i32 1, i32* %iftmp.0, align 4
+	br label %cond_next
+
+cond_next:		; preds = %cond_false, %cond_true
+	%tmp5 = getelementptr [10 x i8]* @.str, i32 0, i32 0		; <i8*> [#uses=1]
+	%tmp6 = load i32* %iftmp.0, align 4		; <i32> [#uses=1]
+	%tmp7 = call i32 (i8*, ...)* @printf( i8* noalias  %tmp5, i32 %tmp6 ) nounwind 		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %cond_next
+    store i32 0, i32* %retval, align 4
+	%retval8 = load i32* %retval		; <i32> [#uses=1]
+	ret i32 %retval8
+}
+
+declare i32 @printf(i8* noalias , ...) nounwind 
diff --git a/test/ExecutionEngine/OrcJIT/2013-04-04-RelocAddend.ll b/test/ExecutionEngine/OrcJIT/2013-04-04-RelocAddend.ll
new file mode 100644
index 0000000..199e948
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/2013-04-04-RelocAddend.ll
@@ -0,0 +1,25 @@
+; RUN: %lli -use-orcmcjit %s
+;
+; Verify relocations to global symbols with addend work correctly.
+;
+; Compiled from this C code:
+;
+; int test[2] = { -1, 0 };
+; int *p = &test[1];
+; 
+; int main (void)
+; {
+;   return *p;
+; }
+; 
+
+@test = global [2 x i32] [i32 -1, i32 0], align 4
+@p = global i32* getelementptr inbounds ([2 x i32]* @test, i64 0, i64 1), align 8
+
+define i32 @main() {
+entry:
+  %0 = load i32** @p, align 8
+  %1 = load i32* %0, align 4
+  ret i32 %1
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/Inputs/cross-module-b.ll b/test/ExecutionEngine/OrcJIT/Inputs/cross-module-b.ll
new file mode 100644
index 0000000..6870117
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/Inputs/cross-module-b.ll
@@ -0,0 +1,7 @@
+declare i32 @FA()
+
+define i32 @FB() {
+  %r = call i32 @FA( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/Inputs/multi-module-b.ll b/test/ExecutionEngine/OrcJIT/Inputs/multi-module-b.ll
new file mode 100644
index 0000000..103b601
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/Inputs/multi-module-b.ll
@@ -0,0 +1,7 @@
+declare i32 @FC()
+
+define i32 @FB() {
+  %r = call i32 @FC( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/Inputs/multi-module-c.ll b/test/ExecutionEngine/OrcJIT/Inputs/multi-module-c.ll
new file mode 100644
index 0000000..b39306b
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/Inputs/multi-module-c.ll
@@ -0,0 +1,4 @@
+define i32 @FC() {
+  ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/Inputs/multi-module-eh-b.ll b/test/ExecutionEngine/OrcJIT/Inputs/multi-module-eh-b.ll
new file mode 100644
index 0000000..d7dbb03
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/Inputs/multi-module-eh-b.ll
@@ -0,0 +1,30 @@
+declare i8* @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_end_catch()
+declare i8* @__cxa_begin_catch(i8*)
+
+@_ZTIi = external constant i8*
+
+define void @throwException_B() {
+  %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+}
+
+define i32 @FB() {
+entry:
+  invoke void @throwException_B()
+          to label %try.cont unwind label %lpad
+
+lpad:
+  %p = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %e = extractvalue { i8*, i32 } %p, 0
+  call i8* @__cxa_begin_catch(i8* %e)
+  call void @__cxa_end_catch()
+  br label %try.cont
+
+try.cont:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcJIT/cross-module-a.ll b/test/ExecutionEngine/OrcJIT/cross-module-a.ll
new file mode 100644
index 0000000..14a73f5
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/cross-module-a.ll
@@ -0,0 +1,13 @@
+; RUN: %lli -use-orcmcjit -extra-module=%p/Inputs/cross-module-b.ll %s > /dev/null
+
+declare i32 @FB()
+
+define i32 @FA() {
+  ret i32 0
+}
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/cross-module-sm-pic-a.ll b/test/ExecutionEngine/OrcJIT/cross-module-sm-pic-a.ll
new file mode 100644
index 0000000..50ad1c0
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/cross-module-sm-pic-a.ll
@@ -0,0 +1,14 @@
+; RUN: %lli -use-orcmcjit -extra-module=%p/Inputs/cross-module-b.ll -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386
+
+declare i32 @FB()
+
+define i32 @FA() {
+  ret i32 0
+}
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/eh-lg-pic.ll b/test/ExecutionEngine/OrcJIT/eh-lg-pic.ll
new file mode 100644
index 0000000..e5fa22c
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/eh-lg-pic.ll
@@ -0,0 +1,32 @@
+; RUN: %lli -use-orcmcjit -relocation-model=pic -code-model=large %s
+; XFAIL: cygwin, win32, mingw, mips, i686, i386, aarch64, arm, asan, msan
+declare i8* @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_end_catch()
+declare i8* @__cxa_begin_catch(i8*)
+
+@_ZTIi = external constant i8*
+
+define void @throwException() {
+  %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+}
+
+define i32 @main() {
+entry:
+  invoke void @throwException()
+          to label %try.cont unwind label %lpad
+
+lpad:
+  %p = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %e = extractvalue { i8*, i32 } %p, 0
+  call i8* @__cxa_begin_catch(i8* %e)
+  call void @__cxa_end_catch()
+  br label %try.cont
+
+try.cont:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcJIT/eh-sm-pic.ll b/test/ExecutionEngine/OrcJIT/eh-sm-pic.ll
new file mode 100644
index 0000000..f22cea9
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/eh-sm-pic.ll
@@ -0,0 +1,32 @@
+; RUN: %lli -use-orcmcjit -relocation-model=pic -code-model=small %s
+; XFAIL: cygwin, win32, mingw, mips, i686, i386, darwin, aarch64, arm, asan, msan
+declare i8* @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_end_catch()
+declare i8* @__cxa_begin_catch(i8*)
+
+@_ZTIi = external constant i8*
+
+define void @throwException() {
+  %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+}
+
+define i32 @main() {
+entry:
+  invoke void @throwException()
+          to label %try.cont unwind label %lpad
+
+lpad:
+  %p = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %e = extractvalue { i8*, i32 } %p, 0
+  call i8* @__cxa_begin_catch(i8* %e)
+  call void @__cxa_end_catch()
+  br label %try.cont
+
+try.cont:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcJIT/eh.ll b/test/ExecutionEngine/OrcJIT/eh.ll
new file mode 100644
index 0000000..130146b
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/eh.ll
@@ -0,0 +1,32 @@
+; RUN: %lli -use-orcmcjit %s
+; XFAIL: arm, cygwin, win32, mingw, asan, msan
+declare i8* @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_end_catch()
+declare i8* @__cxa_begin_catch(i8*)
+
+@_ZTIi = external constant i8*
+
+define void @throwException() {
+  %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+}
+
+define i32 @main() {
+entry:
+  invoke void @throwException()
+          to label %try.cont unwind label %lpad
+
+lpad:
+  %p = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %e = extractvalue { i8*, i32 } %p, 0
+  call i8* @__cxa_begin_catch(i8* %e)
+  call void @__cxa_end_catch()
+  br label %try.cont
+
+try.cont:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcJIT/fpbitcast.ll b/test/ExecutionEngine/OrcJIT/fpbitcast.ll
new file mode 100644
index 0000000..0e39f88
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/fpbitcast.ll
@@ -0,0 +1,21 @@
+; RUN: %lli -use-orcmcjit -force-interpreter=true %s | FileCheck %s
+; CHECK: 40091eb8
+
+define i32 @test(double %x) {
+entry:
+	%x46.i = bitcast double %x to i64	
+	%tmp343.i = lshr i64 %x46.i, 32	
+	%tmp344.i = trunc i64 %tmp343.i to i32
+        ret i32 %tmp344.i
+}
+
+define i32 @main()
+{
+       %res = call i32 @test(double 3.14)
+       %ptr = getelementptr [4 x i8]* @format, i32 0, i32 0
+       call i32 (i8*,...)* @printf(i8* %ptr, i32 %res)
+       ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
+@format = internal constant [4 x i8] c"%x\0A\00"
diff --git a/test/ExecutionEngine/OrcJIT/hello-sm-pic.ll b/test/ExecutionEngine/OrcJIT/hello-sm-pic.ll
new file mode 100644
index 0000000..ae98ae4
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/hello-sm-pic.ll
@@ -0,0 +1,12 @@
+; RUN: %lli -use-orcmcjit -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386, darwin, aarch64, arm
+
+@.LC0 = internal global [12 x i8] c"Hello World\00"		; <[12 x i8]*> [#uses=1]
+
+declare i32 @puts(i8*)
+
+define i32 @main() {
+	%reg210 = call i32 @puts( i8* getelementptr ([12 x i8]* @.LC0, i64 0, i64 0) )		; <i32> [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/hello.ll b/test/ExecutionEngine/OrcJIT/hello.ll
new file mode 100644
index 0000000..f96e3ee
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/hello.ll
@@ -0,0 +1,11 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+@.LC0 = internal global [12 x i8] c"Hello World\00"		; <[12 x i8]*> [#uses=1]
+
+declare i32 @puts(i8*)
+
+define i32 @main() {
+	%reg210 = call i32 @puts( i8* getelementptr ([12 x i8]* @.LC0, i64 0, i64 0) )		; <i32> [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/hello2.ll b/test/ExecutionEngine/OrcJIT/hello2.ll
new file mode 100644
index 0000000..9e7cf5b
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/hello2.ll
@@ -0,0 +1,17 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+@X = global i32 7		; <i32*> [#uses=0]
+@msg = internal global [13 x i8] c"Hello World\0A\00"		; <[13 x i8]*> [#uses=1]
+
+declare void @printf([13 x i8]*, ...)
+
+define void @bar() {
+	call void ([13 x i8]*, ...)* @printf( [13 x i8]* @msg )
+	ret void
+}
+
+define i32 @main() {
+	call void @bar( )
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/lit.local.cfg b/test/ExecutionEngine/OrcJIT/lit.local.cfg
new file mode 100644
index 0000000..f981403
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/lit.local.cfg
@@ -0,0 +1,26 @@
+root = config.root
+targets = root.targets
+if ('X86' in targets) | ('AArch64' in targets) | ('ARM' in targets) | \
+   ('Mips' in targets) | ('PowerPC' in targets) | ('SystemZ' in targets):
+    config.unsupported = False
+else:
+    config.unsupported = True
+
+# FIXME: autoconf and cmake produce different arch names. We should normalize
+# them before getting here.
+if root.host_arch not in ['i386', 'x86', 'x86_64', 'AMD64',
+                          'AArch64', 'ARM', 'Mips', 'PowerPC', 'ppc64', 'SystemZ']:
+    config.unsupported = True
+
+if 'armv7' in root.host_arch:
+    config.unsupported = False
+
+if 'i386-apple-darwin' in root.target_triple:
+    config.unsupported = True
+
+if 'powerpc' in root.target_triple and not 'powerpc64' in root.target_triple:
+    config.unsupported = True
+
+# ExecutionEngine tests are not expected to pass in a cross-compilation setup.
+if 'native' not in config.available_features:
+    config.unsupported = True
diff --git a/test/ExecutionEngine/OrcJIT/load-object-a.ll b/test/ExecutionEngine/OrcJIT/load-object-a.ll
new file mode 100644
index 0000000..080bf6c
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/load-object-a.ll
@@ -0,0 +1,24 @@
+; This first line will generate the .o files for the next run line
+; RUN: rm -rf %t.cachedir %t.cachedir2 %t.cachedir3
+; RUN: mkdir -p %t.cachedir %t.cachedir2 %t.cachedir3
+; RUN: %lli -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -enable-cache-manager -object-cache-dir=%t.cachedir %s
+
+; Collect generated objects.
+; RUN: find %t.cachedir -type f -name 'multi-module-?.o' -exec mv -v '{}' %t.cachedir2 ';'
+
+; This line tests MCJIT object loading
+; RUN: %lli -extra-object=%t.cachedir2/multi-module-b.o -extra-object=%t.cachedir2/multi-module-c.o %s
+
+; These lines put the object files into an archive
+; RUN: llvm-ar r %t.cachedir3/load-object.a %t.cachedir2/multi-module-b.o
+; RUN: llvm-ar r %t.cachedir3/load-object.a %t.cachedir2/multi-module-c.o
+
+; This line test MCJIT archive loading
+; RUN: %lli -extra-archive=%t.cachedir3/load-object.a %s
+
+declare i32 @FB()
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
diff --git a/test/ExecutionEngine/OrcJIT/multi-module-a.ll b/test/ExecutionEngine/OrcJIT/multi-module-a.ll
new file mode 100644
index 0000000..587a1e8
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/multi-module-a.ll
@@ -0,0 +1,9 @@
+; RUN: %lli -use-orcmcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll %s > /dev/null
+
+declare i32 @FB()
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/multi-module-eh-a.ll b/test/ExecutionEngine/OrcJIT/multi-module-eh-a.ll
new file mode 100644
index 0000000..6117e4c
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/multi-module-eh-a.ll
@@ -0,0 +1,35 @@
+; RUN: %lli -use-orcmcjit -extra-module=%p/Inputs/multi-module-eh-b.ll %s
+; XFAIL: arm, cygwin, win32, mingw, asan, msan
+declare i8* @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_end_catch()
+declare i8* @__cxa_begin_catch(i8*)
+
+@_ZTIi = external constant i8*
+
+declare i32 @FB()
+
+define void @throwException() {
+  %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+}
+
+define i32 @main() {
+entry:
+  invoke void @throwException()
+          to label %try.cont unwind label %lpad
+
+lpad:
+  %p = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %e = extractvalue { i8*, i32 } %p, 0
+  call i8* @__cxa_begin_catch(i8* %e)
+  call void @__cxa_end_catch()
+  br label %try.cont
+
+try.cont:
+  %r = call i32 @FB( )
+  ret i32 %r
+}
diff --git a/test/ExecutionEngine/OrcJIT/multi-module-sm-pic-a.ll b/test/ExecutionEngine/OrcJIT/multi-module-sm-pic-a.ll
new file mode 100644
index 0000000..b5ee3d1
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/multi-module-sm-pic-a.ll
@@ -0,0 +1,10 @@
+; RUN: %lli -use-orcmcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386
+
+declare i32 @FB()
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/non-extern-addend.ll b/test/ExecutionEngine/OrcJIT/non-extern-addend.ll
new file mode 100644
index 0000000..d768e2b
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/non-extern-addend.ll
@@ -0,0 +1,21 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @foo(i32 %x, i32 %y, double %d) {
+entry:
+  %d.int64 = bitcast double %d to i64
+  %d.top64 = lshr i64 %d.int64, 32
+  %d.top   = trunc i64 %d.top64 to i32
+  %d.bottom = trunc i64 %d.int64 to i32
+  %topCorrect = icmp eq i32 %d.top, 3735928559
+  %bottomCorrect = icmp eq i32 %d.bottom, 4277009102
+  %right = and i1 %topCorrect, %bottomCorrect
+  %nRight = xor i1 %right, true
+  %retVal = zext i1 %nRight to i32
+  ret i32 %retVal
+}
+
+define i32 @main() {
+entry:
+  %call = call i32 @foo(i32 0, i32 1, double 0xDEADBEEFFEEDFACE)
+  ret i32 %call
+}
diff --git a/test/ExecutionEngine/OrcJIT/pr13727.ll b/test/ExecutionEngine/OrcJIT/pr13727.ll
new file mode 100644
index 0000000..9c4f10b
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/pr13727.ll
@@ -0,0 +1,88 @@
+; RUN: %lli -use-orcmcjit -O0 -disable-lazy-compilation=false %s
+
+; The intention of this test is to verify that symbols mapped to COMMON in ELF
+; work as expected.
+;
+; Compiled from this C code:
+;
+; int zero_int;
+; double zero_double;
+; int zero_arr[10];
+; 
+; int main()
+; {
+;     zero_arr[zero_int + 5] = 40;
+; 
+;     if (zero_double < 1.1)
+;         zero_arr[zero_int + 2] = 70;
+; 
+;     for (int i = 1; i < 10; ++i) {
+;         zero_arr[i] = zero_arr[i - 1] + zero_arr[i];
+;     }
+;     return zero_arr[9] == 110 ? 0 : -1;
+; }
+
+@zero_int = common global i32 0, align 4
+@zero_arr = common global [10 x i32] zeroinitializer, align 16
+@zero_double = common global double 0.000000e+00, align 8
+
+define i32 @main() nounwind {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  %0 = load i32* @zero_int, align 4
+  %add = add nsw i32 %0, 5
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom
+  store i32 40, i32* %arrayidx, align 4
+  %1 = load double* @zero_double, align 8
+  %cmp = fcmp olt double %1, 1.100000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %2 = load i32* @zero_int, align 4
+  %add1 = add nsw i32 %2, 2
+  %idxprom2 = sext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom2
+  store i32 70, i32* %arrayidx3, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  store i32 1, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %if.end
+  %3 = load i32* %i, align 4
+  %cmp4 = icmp slt i32 %3, 10
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %4 = load i32* %i, align 4
+  %sub = sub nsw i32 %4, 1
+  %idxprom5 = sext i32 %sub to i64
+  %arrayidx6 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom5
+  %5 = load i32* %arrayidx6, align 4
+  %6 = load i32* %i, align 4
+  %idxprom7 = sext i32 %6 to i64
+  %arrayidx8 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom7
+  %7 = load i32* %arrayidx8, align 4
+  %add9 = add nsw i32 %5, %7
+  %8 = load i32* %i, align 4
+  %idxprom10 = sext i32 %8 to i64
+  %arrayidx11 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom10
+  store i32 %add9, i32* %arrayidx11, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %9 = load i32* %i, align 4
+  %inc = add nsw i32 %9, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %10 = load i32* getelementptr inbounds ([10 x i32]* @zero_arr, i32 0, i64 9), align 4
+  %cmp12 = icmp eq i32 %10, 110
+  %cond = select i1 %cmp12, i32 0, i32 -1
+  ret i32 %cond
+}
diff --git a/test/ExecutionEngine/OrcJIT/remote/Inputs/cross-module-b.ll b/test/ExecutionEngine/OrcJIT/remote/Inputs/cross-module-b.ll
new file mode 100644
index 0000000..bc13b1d
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/Inputs/cross-module-b.ll
@@ -0,0 +1,7 @@
+declare i32 @FA()
+
+define i32 @FB() nounwind {
+  %r = call i32 @FA( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/remote/Inputs/multi-module-b.ll b/test/ExecutionEngine/OrcJIT/remote/Inputs/multi-module-b.ll
new file mode 100644
index 0000000..0b8d5eb
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/Inputs/multi-module-b.ll
@@ -0,0 +1,7 @@
+declare i32 @FC()
+
+define i32 @FB() nounwind {
+  %r = call i32 @FC( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/remote/Inputs/multi-module-c.ll b/test/ExecutionEngine/OrcJIT/remote/Inputs/multi-module-c.ll
new file mode 100644
index 0000000..98350a8
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/Inputs/multi-module-c.ll
@@ -0,0 +1,4 @@
+define i32 @FC() nounwind {
+  ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/remote/cross-module-a.ll b/test/ExecutionEngine/OrcJIT/remote/cross-module-a.ll
new file mode 100644
index 0000000..c315723
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/cross-module-a.ll
@@ -0,0 +1,12 @@
+; RUN: %lli -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
+
+declare i32 @FB()
+
+define i32 @FA() nounwind {
+  ret i32 0
+}
+
+define i32 @main() nounwind {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
diff --git a/test/ExecutionEngine/OrcJIT/remote/cross-module-sm-pic-a.ll b/test/ExecutionEngine/OrcJIT/remote/cross-module-sm-pic-a.ll
new file mode 100644
index 0000000..d47fc6c
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/cross-module-sm-pic-a.ll
@@ -0,0 +1,14 @@
+; RUN: %lli -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386, arm
+
+declare i32 @FB()
+
+define i32 @FA() {
+  ret i32 0
+}
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/remote/lit.local.cfg b/test/ExecutionEngine/OrcJIT/remote/lit.local.cfg
new file mode 100644
index 0000000..625d82d
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/lit.local.cfg
@@ -0,0 +1,8 @@
+if 'armv4' in config.root.target_triple or \
+   'armv5' in config.root.target_triple:
+    config.unsupported = True
+
+# This is temporary, until Remote MCJIT works on ARM
+# See http://llvm.org/bugs/show_bug.cgi?id=18057
+#if 'armv7' in config.root.target_triple:
+#    config.unsupported = True
diff --git a/test/ExecutionEngine/OrcJIT/remote/multi-module-a.ll b/test/ExecutionEngine/OrcJIT/remote/multi-module-a.ll
new file mode 100644
index 0000000..0fd363b
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/multi-module-a.ll
@@ -0,0 +1,9 @@
+; RUN: %lli -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
+
+declare i32 @FB()
+
+define i32 @main() nounwind {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/remote/multi-module-sm-pic-a.ll b/test/ExecutionEngine/OrcJIT/remote/multi-module-sm-pic-a.ll
new file mode 100644
index 0000000..d248c4b
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/multi-module-sm-pic-a.ll
@@ -0,0 +1,10 @@
+; RUN: %lli -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386, arm
+
+declare i32 @FB()
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/remote/simpletest-remote.ll b/test/ExecutionEngine/OrcJIT/remote/simpletest-remote.ll
new file mode 100644
index 0000000..30b4dd8
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/simpletest-remote.ll
@@ -0,0 +1,10 @@
+; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
+
+define i32 @bar() nounwind {
+	ret i32 0
+}
+
+define i32 @main() nounwind {
+	%r = call i32 @bar( )		; <i32> [#uses=1]
+	ret i32 %r
+}
diff --git a/test/ExecutionEngine/OrcJIT/remote/stubs-remote.ll b/test/ExecutionEngine/OrcJIT/remote/stubs-remote.ll
new file mode 100644
index 0000000..da4ddc6
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/stubs-remote.ll
@@ -0,0 +1,37 @@
+; RUN: %lli -remote-mcjit -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s
+; XFAIL: *
+; This test should fail until remote symbol resolution is supported.
+
+define i32 @main() nounwind {
+entry:
+	call void @lazily_compiled_address_is_consistent()
+	ret i32 0
+}
+
+; Test PR3043: @test should have the same address before and after
+; it's JIT-compiled.
+@funcPtr = common global i1 ()* null, align 4
+@lcaic_failure = internal constant [46 x i8] c"@lazily_compiled_address_is_consistent failed\00"
+
+define void @lazily_compiled_address_is_consistent() nounwind {
+entry:
+	store i1 ()* @test, i1 ()** @funcPtr
+	%pass = tail call i1 @test()		; <i32> [#uses=1]
+	br i1 %pass, label %pass_block, label %fail_block
+pass_block:
+	ret void
+fail_block:
+	call i32 @puts(i8* getelementptr([46 x i8]* @lcaic_failure, i32 0, i32 0))
+	call void @exit(i32 1)
+	unreachable
+}
+
+define i1 @test() nounwind {
+entry:
+	%tmp = load i1 ()** @funcPtr
+	%eq = icmp eq i1 ()* %tmp, @test
+	ret i1 %eq
+}
+
+declare i32 @puts(i8*) noreturn
+declare void @exit(i32) noreturn
diff --git a/test/ExecutionEngine/OrcJIT/remote/stubs-sm-pic.ll b/test/ExecutionEngine/OrcJIT/remote/stubs-sm-pic.ll
new file mode 100644
index 0000000..f6a1607
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/stubs-sm-pic.ll
@@ -0,0 +1,37 @@
+; RUN: %lli -remote-mcjit -disable-lazy-compilation=false -relocation-model=pic -code-model=small %s
+; XFAIL: *
+; This function should fail until remote symbol resolution is supported.
+
+define i32 @main() nounwind {
+entry:
+	call void @lazily_compiled_address_is_consistent()
+	ret i32 0
+}
+
+; Test PR3043: @test should have the same address before and after
+; it's JIT-compiled.
+@funcPtr = common global i1 ()* null, align 4
+@lcaic_failure = internal constant [46 x i8] c"@lazily_compiled_address_is_consistent failed\00"
+
+define void @lazily_compiled_address_is_consistent() nounwind {
+entry:
+	store i1 ()* @test, i1 ()** @funcPtr
+	%pass = tail call i1 @test()		; <i32> [#uses=1]
+	br i1 %pass, label %pass_block, label %fail_block
+pass_block:
+	ret void
+fail_block:
+	call i32 @puts(i8* getelementptr([46 x i8]* @lcaic_failure, i32 0, i32 0))
+	call void @exit(i32 1)
+	unreachable
+}
+
+define i1 @test() nounwind {
+entry:
+	%tmp = load i1 ()** @funcPtr
+	%eq = icmp eq i1 ()* %tmp, @test
+	ret i1 %eq
+}
+
+declare i32 @puts(i8*) noreturn
+declare void @exit(i32) noreturn
diff --git a/test/ExecutionEngine/OrcJIT/remote/test-common-symbols-remote.ll b/test/ExecutionEngine/OrcJIT/remote/test-common-symbols-remote.ll
new file mode 100644
index 0000000..0f58710
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/test-common-symbols-remote.ll
@@ -0,0 +1,88 @@
+; RUN: %lli -remote-mcjit -O0 -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s
+
+; The intention of this test is to verify that symbols mapped to COMMON in ELF
+; work as expected.
+;
+; Compiled from this C code:
+;
+; int zero_int;
+; double zero_double;
+; int zero_arr[10];
+;
+; int main()
+; {
+;     zero_arr[zero_int + 5] = 40;
+;
+;     if (zero_double < 1.0)
+;         zero_arr[zero_int + 2] = 70;
+;
+;     for (int i = 1; i < 10; ++i) {
+;         zero_arr[i] = zero_arr[i - 1] + zero_arr[i];
+;     }
+;     return zero_arr[9] == 110 ? 0 : -1;
+; }
+
+@zero_int = common global i32 0, align 4
+@zero_arr = common global [10 x i32] zeroinitializer, align 16
+@zero_double = common global double 0.000000e+00, align 8
+
+define i32 @main() nounwind {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  %0 = load i32* @zero_int, align 4
+  %add = add nsw i32 %0, 5
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom
+  store i32 40, i32* %arrayidx, align 4
+  %1 = load double* @zero_double, align 8
+  %cmp = fcmp olt double %1, 1.000000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %2 = load i32* @zero_int, align 4
+  %add1 = add nsw i32 %2, 2
+  %idxprom2 = sext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom2
+  store i32 70, i32* %arrayidx3, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  store i32 1, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %if.end
+  %3 = load i32* %i, align 4
+  %cmp4 = icmp slt i32 %3, 10
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %4 = load i32* %i, align 4
+  %sub = sub nsw i32 %4, 1
+  %idxprom5 = sext i32 %sub to i64
+  %arrayidx6 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom5
+  %5 = load i32* %arrayidx6, align 4
+  %6 = load i32* %i, align 4
+  %idxprom7 = sext i32 %6 to i64
+  %arrayidx8 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom7
+  %7 = load i32* %arrayidx8, align 4
+  %add9 = add nsw i32 %5, %7
+  %8 = load i32* %i, align 4
+  %idxprom10 = sext i32 %8 to i64
+  %arrayidx11 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom10
+  store i32 %add9, i32* %arrayidx11, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %9 = load i32* %i, align 4
+  %inc = add nsw i32 %9, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %10 = load i32* getelementptr inbounds ([10 x i32]* @zero_arr, i32 0, i64 9), align 4
+  %cmp12 = icmp eq i32 %10, 110
+  %cond = select i1 %cmp12, i32 0, i32 -1
+  ret i32 %cond
+}
diff --git a/test/ExecutionEngine/OrcJIT/remote/test-data-align-remote.ll b/test/ExecutionEngine/OrcJIT/remote/test-data-align-remote.ll
new file mode 100644
index 0000000..435c21a
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/test-data-align-remote.ll
@@ -0,0 +1,15 @@
+; RUN:  %lli -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s
+
+; Check that a variable is always aligned as specified.
+
+@var = global i32 0, align 32
+define i32 @main() nounwind {
+  %addr = ptrtoint i32* @var to i64
+  %mask = and i64 %addr, 31
+  %tst = icmp eq i64 %mask, 0
+  br i1 %tst, label %good, label %bad
+good:
+  ret i32 0
+bad:
+  ret i32 1
+}
diff --git a/test/ExecutionEngine/OrcJIT/remote/test-fp-no-external-funcs-remote.ll b/test/ExecutionEngine/OrcJIT/remote/test-fp-no-external-funcs-remote.ll
new file mode 100644
index 0000000..9d11415
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/test-fp-no-external-funcs-remote.ll
@@ -0,0 +1,20 @@
+; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
+
+define double @test(double* %DP, double %Arg) nounwind {
+	%D = load double* %DP		; <double> [#uses=1]
+	%V = fadd double %D, 1.000000e+00		; <double> [#uses=2]
+	%W = fsub double %V, %V		; <double> [#uses=3]
+	%X = fmul double %W, %W		; <double> [#uses=2]
+	%Y = fdiv double %X, %X		; <double> [#uses=2]
+	%Q = fadd double %Y, %Arg		; <double> [#uses=1]
+	%R = bitcast double %Q to double		; <double> [#uses=1]
+	store double %Q, double* %DP
+	ret double %Y
+}
+
+define i32 @main() nounwind {
+	%X = alloca double		; <double*> [#uses=2]
+	store double 0.000000e+00, double* %X
+	call double @test( double* %X, double 2.000000e+00 )		; <double>:1 [#uses=0]
+	ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcJIT/remote/test-global-init-nonzero-remote.ll b/test/ExecutionEngine/OrcJIT/remote/test-global-init-nonzero-remote.ll
new file mode 100644
index 0000000..40b514f
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/test-global-init-nonzero-remote.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
+
+@count = global i32 1, align 4
+
+define i32 @main() nounwind {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 49
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* @count, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* @count, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32* %i, align 4
+  %inc1 = add nsw i32 %2, 1
+  store i32 %inc1, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %3 = load i32* @count, align 4
+  %sub = sub nsw i32 %3, 50
+  ret i32 %sub
+}
diff --git a/test/ExecutionEngine/OrcJIT/remote/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/OrcJIT/remote/test-global-init-nonzero-sm-pic.ll
new file mode 100644
index 0000000..5119b72
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/test-global-init-nonzero-sm-pic.ll
@@ -0,0 +1,35 @@
+; RUN: %lli -remote-mcjit -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, aarch64, arm, i686, i386
+
+@count = global i32 1, align 4
+
+define i32 @main() nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 49
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* @count, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* @count, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32* %i, align 4
+  %inc1 = add nsw i32 %2, 1
+  store i32 %inc1, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %3 = load i32* @count, align 4
+  %sub = sub nsw i32 %3, 50
+  ret i32 %sub
+}
diff --git a/test/ExecutionEngine/OrcJIT/remote/test-ptr-reloc-remote.ll b/test/ExecutionEngine/OrcJIT/remote/test-ptr-reloc-remote.ll
new file mode 100644
index 0000000..ba3ffff
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/test-ptr-reloc-remote.ll
@@ -0,0 +1,15 @@
+; RUN: %lli -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s
+
+@.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1
+@ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4
+@.str1 = private unnamed_addr constant [6 x i8] c"data2\00", align 1
+@ptr2 = global i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), align 4
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readonly {
+entry:
+  %0 = load i8** @ptr, align 4
+  %1 = load i8** @ptr2, align 4
+  %cmp = icmp eq i8* %0, %1
+  %. = zext i1 %cmp to i32
+  ret i32 %.
+}
diff --git a/test/ExecutionEngine/OrcJIT/remote/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/OrcJIT/remote/test-ptr-reloc-sm-pic.ll
new file mode 100644
index 0000000..bbc71af
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/remote/test-ptr-reloc-sm-pic.ll
@@ -0,0 +1,17 @@
+; RUN: %lli -remote-mcjit -O0 -relocation-model=pic -code-model=small %s
+; XFAIL: mips, aarch64, arm, i686, i386
+
+@.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1
+@ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4
+@.str1 = private unnamed_addr constant [6 x i8] c"data2\00", align 1
+@ptr2 = global i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), align 4
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readonly {
+entry:
+  %0 = load i8** @ptr, align 4
+  %1 = load i8** @ptr2, align 4
+  %cmp = icmp eq i8* %0, %1
+  %. = zext i1 %cmp to i32
+  ret i32 %.
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/simplesttest.ll b/test/ExecutionEngine/OrcJIT/simplesttest.ll
new file mode 100644
index 0000000..c2f24f6
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/simplesttest.ll
@@ -0,0 +1,6 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() {
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/simpletest.ll b/test/ExecutionEngine/OrcJIT/simpletest.ll
new file mode 100644
index 0000000..e99f615
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/simpletest.ll
@@ -0,0 +1,11 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @bar() {
+	ret i32 0
+}
+
+define i32 @main() {
+	%r = call i32 @bar( )		; <i32> [#uses=1]
+	ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/stubs-sm-pic.ll b/test/ExecutionEngine/OrcJIT/stubs-sm-pic.ll
new file mode 100644
index 0000000..28f8a76
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/stubs-sm-pic.ll
@@ -0,0 +1,36 @@
+; RUN: %lli -use-orcmcjit -disable-lazy-compilation=false -relocation-model=pic -code-model=small %s
+; XFAIL: mips, i686, i386, aarch64, arm
+
+define i32 @main() nounwind {
+entry:
+	call void @lazily_compiled_address_is_consistent()
+	ret i32 0
+}
+
+; Test PR3043: @test should have the same address before and after
+; it's JIT-compiled.
+@funcPtr = common global i1 ()* null, align 4
+@lcaic_failure = internal constant [46 x i8] c"@lazily_compiled_address_is_consistent failed\00"
+
+define void @lazily_compiled_address_is_consistent() nounwind {
+entry:
+	store i1 ()* @test, i1 ()** @funcPtr
+	%pass = tail call i1 @test()		; <i32> [#uses=1]
+	br i1 %pass, label %pass_block, label %fail_block
+pass_block:
+	ret void
+fail_block:
+	call i32 @puts(i8* getelementptr([46 x i8]* @lcaic_failure, i32 0, i32 0))
+	call void @exit(i32 1)
+	unreachable
+}
+
+define i1 @test() nounwind {
+entry:
+	%tmp = load i1 ()** @funcPtr
+	%eq = icmp eq i1 ()* %tmp, @test
+	ret i1 %eq
+}
+
+declare i32 @puts(i8*) noreturn
+declare void @exit(i32) noreturn
diff --git a/test/ExecutionEngine/OrcJIT/stubs.ll b/test/ExecutionEngine/OrcJIT/stubs.ll
new file mode 100644
index 0000000..ec3c458
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/stubs.ll
@@ -0,0 +1,35 @@
+; RUN: %lli -use-orcmcjit -disable-lazy-compilation=false %s
+
+define i32 @main() nounwind {
+entry:
+	call void @lazily_compiled_address_is_consistent()
+	ret i32 0
+}
+
+; Test PR3043: @test should have the same address before and after
+; it's JIT-compiled.
+@funcPtr = common global i1 ()* null, align 4
+@lcaic_failure = internal constant [46 x i8] c"@lazily_compiled_address_is_consistent failed\00"
+
+define void @lazily_compiled_address_is_consistent() nounwind {
+entry:
+	store i1 ()* @test, i1 ()** @funcPtr
+	%pass = tail call i1 @test()		; <i32> [#uses=1]
+	br i1 %pass, label %pass_block, label %fail_block
+pass_block:
+	ret void
+fail_block:
+	call i32 @puts(i8* getelementptr([46 x i8]* @lcaic_failure, i32 0, i32 0))
+	call void @exit(i32 1)
+	unreachable
+}
+
+define i1 @test() nounwind {
+entry:
+	%tmp = load i1 ()** @funcPtr
+	%eq = icmp eq i1 ()* %tmp, @test
+	ret i1 %eq
+}
+
+declare i32 @puts(i8*) noreturn
+declare void @exit(i32) noreturn
diff --git a/test/ExecutionEngine/OrcJIT/test-arith.ll b/test/ExecutionEngine/OrcJIT/test-arith.ll
new file mode 100644
index 0000000..b662567
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-arith.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() {
+	%A = add i8 0, 12		; <i8> [#uses=1]
+	%B = sub i8 %A, 1		; <i8> [#uses=2]
+	%C = mul i8 %B, %B		; <i8> [#uses=2]
+	%D = sdiv i8 %C, %C		; <i8> [#uses=2]
+	%E = srem i8 %D, %D		; <i8> [#uses=0]
+	%F = udiv i8 5, 6		; <i8> [#uses=0]
+	%G = urem i8 6, 5		; <i8> [#uses=0]
+	%A.upgrd.1 = add i16 0, 12		; <i16> [#uses=1]
+	%B.upgrd.2 = sub i16 %A.upgrd.1, 1		; <i16> [#uses=2]
+	%C.upgrd.3 = mul i16 %B.upgrd.2, %B.upgrd.2		; <i16> [#uses=2]
+	%D.upgrd.4 = sdiv i16 %C.upgrd.3, %C.upgrd.3		; <i16> [#uses=2]
+	%E.upgrd.5 = srem i16 %D.upgrd.4, %D.upgrd.4		; <i16> [#uses=0]
+	%F.upgrd.6 = udiv i16 5, 6		; <i16> [#uses=0]
+	%G.upgrd.7 = urem i32 6, 5		; <i32> [#uses=0]
+	%A.upgrd.8 = add i32 0, 12		; <i32> [#uses=1]
+	%B.upgrd.9 = sub i32 %A.upgrd.8, 1		; <i32> [#uses=2]
+	%C.upgrd.10 = mul i32 %B.upgrd.9, %B.upgrd.9		; <i32> [#uses=2]
+	%D.upgrd.11 = sdiv i32 %C.upgrd.10, %C.upgrd.10		; <i32> [#uses=2]
+	%E.upgrd.12 = srem i32 %D.upgrd.11, %D.upgrd.11		; <i32> [#uses=0]
+	%F.upgrd.13 = udiv i32 5, 6		; <i32> [#uses=0]
+	%G1 = urem i32 6, 5		; <i32> [#uses=0]
+	%A.upgrd.14 = add i64 0, 12		; <i64> [#uses=1]
+	%B.upgrd.15 = sub i64 %A.upgrd.14, 1		; <i64> [#uses=2]
+	%C.upgrd.16 = mul i64 %B.upgrd.15, %B.upgrd.15		; <i64> [#uses=2]
+	%D.upgrd.17 = sdiv i64 %C.upgrd.16, %C.upgrd.16		; <i64> [#uses=2]
+	%E.upgrd.18 = srem i64 %D.upgrd.17, %D.upgrd.17		; <i64> [#uses=0]
+	%F.upgrd.19 = udiv i64 5, 6		; <i64> [#uses=0]
+	%G.upgrd.20 = urem i64 6, 5		; <i64> [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/test-branch.ll b/test/ExecutionEngine/OrcJIT/test-branch.ll
new file mode 100644
index 0000000..b66cfaf
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-branch.ll
@@ -0,0 +1,12 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+; test unconditional branch
+define i32 @main() {
+	br label %Test
+Test:		; preds = %Test, %0
+	%X = icmp eq i32 0, 4		; <i1> [#uses=1]
+	br i1 %X, label %Test, label %Label
+Label:		; preds = %Test
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/test-call-no-external-funcs.ll b/test/ExecutionEngine/OrcJIT/test-call-no-external-funcs.ll
new file mode 100644
index 0000000..c536efe
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-call-no-external-funcs.ll
@@ -0,0 +1,14 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @_Z14func_exit_codev() nounwind uwtable {
+entry:
+  ret i32 0
+}
+
+define i32 @main() nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  %call = call i32 @_Z14func_exit_codev()
+  ret i32 %call
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-call.ll b/test/ExecutionEngine/OrcJIT/test-call.ll
new file mode 100644
index 0000000..8f50bdc
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-call.ll
@@ -0,0 +1,21 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+declare void @exit(i32)
+
+define i32 @test(i8 %C, i16 %S) {
+	%X = trunc i16 %S to i8		; <i8> [#uses=1]
+	%Y = zext i8 %X to i32		; <i32> [#uses=1]
+	ret i32 %Y
+}
+
+define void @FP(void (i32)* %F) {
+	%X = call i32 @test( i8 123, i16 1024 )		; <i32> [#uses=1]
+	call void %F( i32 %X )
+	ret void
+}
+
+define i32 @main() {
+	call void @FP( void (i32)* @exit )
+	ret i32 1
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/test-cast.ll b/test/ExecutionEngine/OrcJIT/test-cast.ll
new file mode 100644
index 0000000..4efd760
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-cast.ll
@@ -0,0 +1,109 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @foo() {
+	ret i32 0
+}
+
+define i32 @main() {
+	icmp ne i1 true, false		; <i1>:1 [#uses=0]
+	zext i1 true to i8		; <i8>:2 [#uses=0]
+	zext i1 true to i8		; <i8>:3 [#uses=0]
+	zext i1 true to i16		; <i16>:4 [#uses=0]
+	zext i1 true to i16		; <i16>:5 [#uses=0]
+	zext i1 true to i32		; <i32>:6 [#uses=0]
+	zext i1 true to i32		; <i32>:7 [#uses=0]
+	zext i1 true to i64		; <i64>:8 [#uses=0]
+	zext i1 true to i64		; <i64>:9 [#uses=0]
+	uitofp i1 true to float		; <float>:10 [#uses=0]
+	uitofp i1 true to double		; <double>:11 [#uses=0]
+	icmp ne i8 0, 0		; <i1>:12 [#uses=0]
+	icmp ne i8 1, 0		; <i1>:13 [#uses=0]
+	bitcast i8 0 to i8		; <i8>:14 [#uses=0]
+	bitcast i8 -1 to i8		; <i8>:15 [#uses=0]
+	sext i8 4 to i16		; <i16>:16 [#uses=0]
+	sext i8 4 to i16		; <i16>:17 [#uses=0]
+	sext i8 4 to i64		; <i64>:18 [#uses=0]
+	sext i8 4 to i64		; <i64>:19 [#uses=0]
+	sitofp i8 4 to float		; <float>:20 [#uses=0]
+	sitofp i8 4 to double		; <double>:21 [#uses=0]
+	icmp ne i8 0, 0		; <i1>:22 [#uses=0]
+	icmp ne i8 1, 0		; <i1>:23 [#uses=0]
+	bitcast i8 0 to i8		; <i8>:24 [#uses=0]
+	bitcast i8 1 to i8		; <i8>:25 [#uses=0]
+	zext i8 4 to i16		; <i16>:26 [#uses=0]
+	zext i8 4 to i16		; <i16>:27 [#uses=0]
+	zext i8 4 to i64		; <i64>:28 [#uses=0]
+	zext i8 4 to i64		; <i64>:29 [#uses=0]
+	uitofp i8 0 to float		; <float>:30 [#uses=0]
+	uitofp i8 0 to double		; <double>:31 [#uses=0]
+	icmp ne i16 1, 0		; <i1>:32 [#uses=0]
+	trunc i16 -1 to i8		; <i8>:33 [#uses=0]
+	trunc i16 255 to i8		; <i8>:34 [#uses=0]
+	bitcast i16 0 to i16		; <i16>:35 [#uses=0]
+	bitcast i16 0 to i16		; <i16>:36 [#uses=0]
+	sext i16 0 to i64		; <i64>:37 [#uses=0]
+	sext i16 0 to i64		; <i64>:38 [#uses=0]
+	sitofp i16 0 to float		; <float>:39 [#uses=0]
+	sitofp i16 0 to double		; <double>:40 [#uses=0]
+	icmp ne i16 1, 0		; <i1>:41 [#uses=0]
+	trunc i16 1 to i8		; <i8>:42 [#uses=0]
+	trunc i16 255 to i8		; <i8>:43 [#uses=0]
+	bitcast i16 0 to i16		; <i16>:44 [#uses=0]
+	bitcast i16 0 to i16		; <i16>:45 [#uses=0]
+	zext i16 0 to i64		; <i64>:46 [#uses=0]
+	zext i16 0 to i64		; <i64>:47 [#uses=0]
+	uitofp i16 0 to float		; <float>:48 [#uses=0]
+	uitofp i16 0 to double		; <double>:49 [#uses=0]
+	icmp ne i32 6, 0		; <i1>:50 [#uses=0]
+	trunc i32 -6 to i8		; <i8>:51 [#uses=0]
+	trunc i32 6 to i8		; <i8>:52 [#uses=0]
+	trunc i32 6 to i16		; <i16>:53 [#uses=0]
+	bitcast i32 0 to i32		; <i32>:54 [#uses=0]
+	sext i32 0 to i64		; <i64>:55 [#uses=0]
+	sext i32 0 to i64		; <i64>:56 [#uses=0]
+	sitofp i32 0 to float		; <float>:57 [#uses=0]
+	sitofp i32 0 to double		; <double>:58 [#uses=0]
+	icmp ne i32 6, 0		; <i1>:59 [#uses=0]
+	trunc i32 7 to i8		; <i8>:60 [#uses=0]
+	trunc i32 8 to i8		; <i8>:61 [#uses=0]
+	trunc i32 9 to i16		; <i16>:62 [#uses=0]
+	bitcast i32 10 to i32		; <i32>:63 [#uses=0]
+	zext i32 0 to i64		; <i64>:64 [#uses=0]
+	zext i32 0 to i64		; <i64>:65 [#uses=0]
+	uitofp i32 0 to float		; <float>:66 [#uses=0]
+	uitofp i32 0 to double		; <double>:67 [#uses=0]
+	icmp ne i64 0, 0		; <i1>:68 [#uses=0]
+	trunc i64 0 to i8		; <i8>:69 [#uses=0]
+	trunc i64 0 to i8		; <i8>:70 [#uses=0]
+	trunc i64 0 to i16		; <i16>:71 [#uses=0]
+	trunc i64 0 to i16		; <i16>:72 [#uses=0]
+	trunc i64 0 to i32		; <i32>:73 [#uses=0]
+	trunc i64 0 to i32		; <i32>:74 [#uses=0]
+	bitcast i64 0 to i64		; <i64>:75 [#uses=0]
+	bitcast i64 0 to i64		; <i64>:76 [#uses=0]
+	sitofp i64 0 to float		; <float>:77 [#uses=0]
+	sitofp i64 0 to double		; <double>:78 [#uses=0]
+	icmp ne i64 1, 0		; <i1>:79 [#uses=0]
+	trunc i64 1 to i8		; <i8>:80 [#uses=0]
+	trunc i64 1 to i8		; <i8>:81 [#uses=0]
+	trunc i64 1 to i16		; <i16>:82 [#uses=0]
+	trunc i64 1 to i16		; <i16>:83 [#uses=0]
+	trunc i64 1 to i32		; <i32>:84 [#uses=0]
+	trunc i64 1 to i32		; <i32>:85 [#uses=0]
+	bitcast i64 1 to i64		; <i64>:86 [#uses=0]
+	bitcast i64 1 to i64		; <i64>:87 [#uses=0]
+	uitofp i64 1 to float		; <float>:88 [#uses=0]
+	uitofp i64 0 to double		; <double>:89 [#uses=0]
+	bitcast float 0.000000e+00 to float		; <float>:90 [#uses=0]
+	fpext float 0.000000e+00 to double		; <double>:91 [#uses=0]
+	fptosi double 0.000000e+00 to i8		; <i8>:92 [#uses=0]
+	fptoui double 0.000000e+00 to i8		; <i8>:93 [#uses=0]
+	fptosi double 0.000000e+00 to i16		; <i16>:94 [#uses=0]
+	fptoui double 0.000000e+00 to i16		; <i16>:95 [#uses=0]
+	fptosi double 0.000000e+00 to i32		; <i32>:96 [#uses=0]
+	fptoui double 0.000000e+00 to i32		; <i32>:97 [#uses=0]
+	fptosi double 0.000000e+00 to i64		; <i64>:98 [#uses=0]
+	fptrunc double 0.000000e+00 to float		; <float>:99 [#uses=0]
+	bitcast double 0.000000e+00 to double		; <double>:100 [#uses=0]
+	ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-common-symbols-alignment.ll b/test/ExecutionEngine/OrcJIT/test-common-symbols-alignment.ll
new file mode 100644
index 0000000..35349e3
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-common-symbols-alignment.ll
@@ -0,0 +1,32 @@
+; RUN: %lli -use-orcmcjit -O0 %s
+
+; This test checks that common symbols have been allocated addresses honouring
+; the alignment requirement.
+
+@CS1 = common global i32 0, align 16
+@CS2 = common global i8 0, align 1
+@CS3 = common global i32 0, align 16
+
+define i32 @main() nounwind {
+entry:
+    %retval = alloca i32, align 4
+    %ptr = alloca i32, align 4
+    store i32 0, i32* %retval
+    store i32 ptrtoint (i32* @CS3 to i32), i32* %ptr, align 4
+    %0 = load i32* %ptr, align 4
+    %and = and i32 %0, 15
+    %tobool = icmp ne i32 %and, 0
+    br i1 %tobool, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+    store i32 1, i32* %retval
+    br label %return
+
+if.else:                                          ; preds = %entry
+    store i32 0, i32* %retval
+    br label %return
+
+return:                                           ; preds = %if.else, %if.then
+    %1 = load i32* %retval
+    ret i32 %1
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-common-symbols.ll b/test/ExecutionEngine/OrcJIT/test-common-symbols.ll
new file mode 100644
index 0000000..7129e14
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-common-symbols.ll
@@ -0,0 +1,88 @@
+; RUN: %lli -use-orcmcjit -O0 -disable-lazy-compilation=false %s
+
+; The intention of this test is to verify that symbols mapped to COMMON in ELF
+; work as expected.
+;
+; Compiled from this C code:
+;
+; int zero_int;
+; double zero_double;
+; int zero_arr[10];
+; 
+; int main()
+; {
+;     zero_arr[zero_int + 5] = 40;
+; 
+;     if (zero_double < 1.0)
+;         zero_arr[zero_int + 2] = 70;
+; 
+;     for (int i = 1; i < 10; ++i) {
+;         zero_arr[i] = zero_arr[i - 1] + zero_arr[i];
+;     }
+;     return zero_arr[9] == 110 ? 0 : -1;
+; }
+
+@zero_int = common global i32 0, align 4
+@zero_arr = common global [10 x i32] zeroinitializer, align 16
+@zero_double = common global double 0.000000e+00, align 8
+
+define i32 @main() nounwind {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  %0 = load i32* @zero_int, align 4
+  %add = add nsw i32 %0, 5
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom
+  store i32 40, i32* %arrayidx, align 4
+  %1 = load double* @zero_double, align 8
+  %cmp = fcmp olt double %1, 1.000000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %2 = load i32* @zero_int, align 4
+  %add1 = add nsw i32 %2, 2
+  %idxprom2 = sext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom2
+  store i32 70, i32* %arrayidx3, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  store i32 1, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %if.end
+  %3 = load i32* %i, align 4
+  %cmp4 = icmp slt i32 %3, 10
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %4 = load i32* %i, align 4
+  %sub = sub nsw i32 %4, 1
+  %idxprom5 = sext i32 %sub to i64
+  %arrayidx6 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom5
+  %5 = load i32* %arrayidx6, align 4
+  %6 = load i32* %i, align 4
+  %idxprom7 = sext i32 %6 to i64
+  %arrayidx8 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom7
+  %7 = load i32* %arrayidx8, align 4
+  %add9 = add nsw i32 %5, %7
+  %8 = load i32* %i, align 4
+  %idxprom10 = sext i32 %8 to i64
+  %arrayidx11 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom10
+  store i32 %add9, i32* %arrayidx11, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %9 = load i32* %i, align 4
+  %inc = add nsw i32 %9, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %10 = load i32* getelementptr inbounds ([10 x i32]* @zero_arr, i32 0, i64 9), align 4
+  %cmp12 = icmp eq i32 %10, 110
+  %cond = select i1 %cmp12, i32 0, i32 -1
+  ret i32 %cond
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-constantexpr.ll b/test/ExecutionEngine/OrcJIT/test-constantexpr.ll
new file mode 100644
index 0000000..380848c
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-constantexpr.ll
@@ -0,0 +1,12 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+; This tests to make sure that we can evaluate weird constant expressions
+
+@A = global i32 5		; <i32*> [#uses=1]
+@B = global i32 6		; <i32*> [#uses=1]
+
+define i32 @main() {
+	%A = or i1 false, icmp slt (i32* @A, i32* @B)		; <i1> [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/test-data-align.ll b/test/ExecutionEngine/OrcJIT/test-data-align.ll
new file mode 100644
index 0000000..f76dda9
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-data-align.ll
@@ -0,0 +1,15 @@
+; RUN: %lli -use-orcmcjit -O0 %s
+
+; Check that a variable is always aligned as specified.
+
+@var = global i32 0, align 32
+define i32 @main() {
+  %addr = ptrtoint i32* @var to i64
+  %mask = and i64 %addr, 31
+  %tst = icmp eq i64 %mask, 0
+  br i1 %tst, label %good, label %bad
+good:
+  ret i32 0
+bad:
+  ret i32 1
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-fp-no-external-funcs.ll b/test/ExecutionEngine/OrcJIT/test-fp-no-external-funcs.ll
new file mode 100644
index 0000000..cf8db4c
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-fp-no-external-funcs.ll
@@ -0,0 +1,21 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define double @test(double* %DP, double %Arg) {
+	%D = load double* %DP		; <double> [#uses=1]
+	%V = fadd double %D, 1.000000e+00		; <double> [#uses=2]
+	%W = fsub double %V, %V		; <double> [#uses=3]
+	%X = fmul double %W, %W		; <double> [#uses=2]
+	%Y = fdiv double %X, %X		; <double> [#uses=2]
+	%Q = fadd double %Y, %Arg		; <double> [#uses=1]
+	%R = bitcast double %Q to double		; <double> [#uses=1]
+	store double %Q, double* %DP
+	ret double %Y
+}
+
+define i32 @main() {
+	%X = alloca double		; <double*> [#uses=2]
+	store double 0.000000e+00, double* %X
+	call double @test( double* %X, double 2.000000e+00 )		; <double>:1 [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/test-fp.ll b/test/ExecutionEngine/OrcJIT/test-fp.ll
new file mode 100644
index 0000000..77a4c7e
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-fp.ll
@@ -0,0 +1,23 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define double @test(double* %DP, double %Arg) {
+	%D = load double* %DP		; <double> [#uses=1]
+	%V = fadd double %D, 1.000000e+00		; <double> [#uses=2]
+	%W = fsub double %V, %V		; <double> [#uses=3]
+	%X = fmul double %W, %W		; <double> [#uses=2]
+	%Y = fdiv double %X, %X		; <double> [#uses=2]
+	%Z = frem double %Y, %Y		; <double> [#uses=3]
+	%Z1 = fdiv double %Z, %W		; <double> [#uses=0]
+	%Q = fadd double %Z, %Arg		; <double> [#uses=1]
+	%R = bitcast double %Q to double		; <double> [#uses=1]
+	store double %R, double* %DP
+	ret double %Z
+}
+
+define i32 @main() {
+	%X = alloca double		; <double*> [#uses=2]
+	store double 0.000000e+00, double* %X
+	call double @test( double* %X, double 2.000000e+00 )		; <double>:1 [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/test-global-ctors.ll b/test/ExecutionEngine/OrcJIT/test-global-ctors.ll
new file mode 100644
index 0000000..bb00af6
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-global-ctors.ll
@@ -0,0 +1,22 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+; XFAIL: darwin
+@var = global i32 1, align 4
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @ctor_func }]
+@llvm.global_dtors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @dtor_func }]
+
+define i32 @main() nounwind {
+entry:
+  %0 = load i32* @var, align 4
+  ret i32 %0
+}
+
+define internal void @ctor_func() section ".text.startup" {
+entry:
+  store i32 0, i32* @var, align 4
+  ret void
+}
+
+define internal void @dtor_func() section ".text.startup" {
+entry:
+  ret void
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/OrcJIT/test-global-init-nonzero-sm-pic.ll
new file mode 100644
index 0000000..c8ef597
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-global-init-nonzero-sm-pic.ll
@@ -0,0 +1,35 @@
+; RUN: %lli -use-orcmcjit -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, aarch64, arm, i686, i386
+
+@count = global i32 1, align 4
+
+define i32 @main() nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 49
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* @count, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* @count, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32* %i, align 4
+  %inc1 = add nsw i32 %2, 1
+  store i32 %inc1, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %3 = load i32* @count, align 4
+  %sub = sub nsw i32 %3, 50
+  ret i32 %sub
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-global-init-nonzero.ll b/test/ExecutionEngine/OrcJIT/test-global-init-nonzero.ll
new file mode 100644
index 0000000..46b721d
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-global-init-nonzero.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+@count = global i32 1, align 4
+
+define i32 @main() nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 49
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* @count, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* @count, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32* %i, align 4
+  %inc1 = add nsw i32 %2, 1
+  store i32 %inc1, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %3 = load i32* @count, align 4
+  %sub = sub nsw i32 %3, 50
+  ret i32 %sub
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-global.ll b/test/ExecutionEngine/OrcJIT/test-global.ll
new file mode 100644
index 0000000..5ece354
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-global.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+@count = global i32 0, align 4
+
+define i32 @main() nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 50
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* @count, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* @count, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32* %i, align 4
+  %inc1 = add nsw i32 %2, 1
+  store i32 %inc1, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %3 = load i32* @count, align 4
+  %sub = sub nsw i32 %3, 50
+  ret i32 %sub
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-loadstore.ll b/test/ExecutionEngine/OrcJIT/test-loadstore.ll
new file mode 100644
index 0000000..24ddd7a
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-loadstore.ll
@@ -0,0 +1,31 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define void @test(i8* %P, i16* %P.upgrd.1, i32* %P.upgrd.2, i64* %P.upgrd.3) {
+	%V = load i8* %P		; <i8> [#uses=1]
+	store i8 %V, i8* %P
+	%V.upgrd.4 = load i16* %P.upgrd.1		; <i16> [#uses=1]
+	store i16 %V.upgrd.4, i16* %P.upgrd.1
+	%V.upgrd.5 = load i32* %P.upgrd.2		; <i32> [#uses=1]
+	store i32 %V.upgrd.5, i32* %P.upgrd.2
+	%V.upgrd.6 = load i64* %P.upgrd.3		; <i64> [#uses=1]
+	store i64 %V.upgrd.6, i64* %P.upgrd.3
+	ret void
+}
+
+define i32 @varalloca(i32 %Size) {
+        ;; Variable sized alloca
+	%X = alloca i32, i32 %Size		; <i32*> [#uses=2]
+	store i32 %Size, i32* %X
+	%Y = load i32* %X		; <i32> [#uses=1]
+	ret i32 %Y
+}
+
+define i32 @main() {
+	%A = alloca i8		; <i8*> [#uses=1]
+	%B = alloca i16		; <i16*> [#uses=1]
+	%C = alloca i32		; <i32*> [#uses=1]
+	%D = alloca i64		; <i64*> [#uses=1]
+	call void @test( i8* %A, i16* %B, i32* %C, i64* %D )
+	call i32 @varalloca( i32 7 )		; <i32>:1 [#uses=0]
+	ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-local.ll b/test/ExecutionEngine/OrcJIT/test-local.ll
new file mode 100644
index 0000000..b541650
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-local.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %count = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %count, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 50
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %count, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* %count, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32* %i, align 4
+  %inc1 = add nsw i32 %2, 1
+  store i32 %inc1, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %3 = load i32* %count, align 4
+  %sub = sub nsw i32 %3, 50
+  ret i32 %sub
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-logical.ll b/test/ExecutionEngine/OrcJIT/test-logical.ll
new file mode 100644
index 0000000..aa8e5de
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-logical.ll
@@ -0,0 +1,18 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() {
+	%A = and i8 4, 8		; <i8> [#uses=2]
+	%B = or i8 %A, 7		; <i8> [#uses=1]
+	%C = xor i8 %B, %A		; <i8> [#uses=0]
+	%A.upgrd.1 = and i16 4, 8		; <i16> [#uses=2]
+	%B.upgrd.2 = or i16 %A.upgrd.1, 7		; <i16> [#uses=1]
+	%C.upgrd.3 = xor i16 %B.upgrd.2, %A.upgrd.1		; <i16> [#uses=0]
+	%A.upgrd.4 = and i32 4, 8		; <i32> [#uses=2]
+	%B.upgrd.5 = or i32 %A.upgrd.4, 7		; <i32> [#uses=1]
+	%C.upgrd.6 = xor i32 %B.upgrd.5, %A.upgrd.4		; <i32> [#uses=0]
+	%A.upgrd.7 = and i64 4, 8		; <i64> [#uses=2]
+	%B.upgrd.8 = or i64 %A.upgrd.7, 7		; <i64> [#uses=1]
+	%C.upgrd.9 = xor i64 %B.upgrd.8, %A.upgrd.7		; <i64> [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/test-loop.ll b/test/ExecutionEngine/OrcJIT/test-loop.ll
new file mode 100644
index 0000000..5cb9353
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-loop.ll
@@ -0,0 +1,14 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() {
+; <label>:0
+	br label %Loop
+Loop:		; preds = %Loop, %0
+	%I = phi i32 [ 0, %0 ], [ %i2, %Loop ]		; <i32> [#uses=1]
+	%i2 = add i32 %I, 1		; <i32> [#uses=2]
+	%C = icmp eq i32 %i2, 10		; <i1> [#uses=1]
+	br i1 %C, label %Out, label %Loop
+Out:		; preds = %Loop
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/test-phi.ll b/test/ExecutionEngine/OrcJIT/test-phi.ll
new file mode 100644
index 0000000..880a916
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-phi.ll
@@ -0,0 +1,34 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+; test phi node
+@Y = global i32 6		; <i32*> [#uses=1]
+
+define void @blah(i32* %X) {
+; <label>:0
+	br label %T
+T:		; preds = %Dead, %0
+	phi i32* [ %X, %0 ], [ @Y, %Dead ]		; <i32*>:1 [#uses=0]
+	ret void
+Dead:		; No predecessors!
+	br label %T
+}
+
+define i32 @test(i1 %C) {
+; <label>:0
+	br i1 %C, label %T, label %T
+T:		; preds = %0, %0
+	%X = phi i32 [ 123, %0 ], [ 123, %0 ]		; <i32> [#uses=1]
+	ret i32 %X
+}
+
+define i32 @main() {
+; <label>:0
+	br label %Test
+Test:		; preds = %Dead, %0
+	%X = phi i32 [ 0, %0 ], [ %Y, %Dead ]		; <i32> [#uses=1]
+	ret i32 %X
+Dead:		; No predecessors!
+	%Y = ashr i32 12, 4		; <i32> [#uses=1]
+	br label %Test
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/OrcJIT/test-ptr-reloc-sm-pic.ll
new file mode 100644
index 0000000..d940adc
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-ptr-reloc-sm-pic.ll
@@ -0,0 +1,17 @@
+; RUN: %lli -use-orcmcjit -O0 -relocation-model=pic -code-model=small %s
+; XFAIL: mips, aarch64, arm, i686, i386
+
+@.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1
+@ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4
+@.str1 = private unnamed_addr constant [6 x i8] c"data2\00", align 1
+@ptr2 = global i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), align 4
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readonly {
+entry:
+  %0 = load i8** @ptr, align 4
+  %1 = load i8** @ptr2, align 4
+  %cmp = icmp eq i8* %0, %1
+  %. = zext i1 %cmp to i32
+  ret i32 %.
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/test-ptr-reloc.ll b/test/ExecutionEngine/OrcJIT/test-ptr-reloc.ll
new file mode 100644
index 0000000..95fa106
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-ptr-reloc.ll
@@ -0,0 +1,16 @@
+; RUN: %lli -use-orcmcjit -O0 %s
+
+@.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1
+@ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4
+@.str1 = private unnamed_addr constant [6 x i8] c"data2\00", align 1
+@ptr2 = global i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), align 4
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readonly {
+entry:
+  %0 = load i8** @ptr, align 4
+  %1 = load i8** @ptr2, align 4
+  %cmp = icmp eq i8* %0, %1
+  %. = zext i1 %cmp to i32
+  ret i32 %.
+}
+
diff --git a/test/ExecutionEngine/OrcJIT/test-ret.ll b/test/ExecutionEngine/OrcJIT/test-ret.ll
new file mode 100644
index 0000000..71ff452
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-ret.ll
@@ -0,0 +1,46 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+; test return instructions
+define void @test1() {
+	ret void
+}
+
+define i8 @test2() {
+	ret i8 1
+}
+
+define i8 @test3() {
+	ret i8 1
+}
+
+define i16 @test4() {
+	ret i16 -1
+}
+
+define i16 @test5() {
+	ret i16 -1
+}
+
+define i32 @main() {
+	ret i32 0
+}
+
+define i32 @test6() {
+	ret i32 4
+}
+
+define i64 @test7() {
+	ret i64 0
+}
+
+define i64 @test8() {
+	ret i64 0
+}
+
+define float @test9() {
+	ret float 1.000000e+00
+}
+
+define double @test10() {
+	ret double 2.000000e+00
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-return.ll b/test/ExecutionEngine/OrcJIT/test-return.ll
new file mode 100644
index 0000000..07e74b0
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-return.ll
@@ -0,0 +1,8 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-setcond-fp.ll b/test/ExecutionEngine/OrcJIT/test-setcond-fp.ll
new file mode 100644
index 0000000..d708b90
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-setcond-fp.ll
@@ -0,0 +1,24 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+
+define i32 @main() {
+	%double1 = fadd double 0.000000e+00, 0.000000e+00		; <double> [#uses=6]
+	%double2 = fadd double 0.000000e+00, 0.000000e+00		; <double> [#uses=6]
+	%float1 = fadd float 0.000000e+00, 0.000000e+00		; <float> [#uses=6]
+	%float2 = fadd float 0.000000e+00, 0.000000e+00		; <float> [#uses=6]
+	%test49 = fcmp oeq float %float1, %float2		; <i1> [#uses=0]
+	%test50 = fcmp oge float %float1, %float2		; <i1> [#uses=0]
+	%test51 = fcmp ogt float %float1, %float2		; <i1> [#uses=0]
+	%test52 = fcmp ole float %float1, %float2		; <i1> [#uses=0]
+	%test53 = fcmp olt float %float1, %float2		; <i1> [#uses=0]
+	%test54 = fcmp une float %float1, %float2		; <i1> [#uses=0]
+	%test55 = fcmp oeq double %double1, %double2		; <i1> [#uses=0]
+	%test56 = fcmp oge double %double1, %double2		; <i1> [#uses=0]
+	%test57 = fcmp ogt double %double1, %double2		; <i1> [#uses=0]
+	%test58 = fcmp ole double %double1, %double2		; <i1> [#uses=0]
+	%test59 = fcmp olt double %double1, %double2		; <i1> [#uses=0]
+	%test60 = fcmp une double %double1, %double2		; <i1> [#uses=0]
+	ret i32 0
+}
+
+
diff --git a/test/ExecutionEngine/OrcJIT/test-setcond-int.ll b/test/ExecutionEngine/OrcJIT/test-setcond-int.ll
new file mode 100644
index 0000000..b801d97
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-setcond-int.ll
@@ -0,0 +1,69 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() {
+	%int1 = add i32 0, 0		; <i32> [#uses=6]
+	%int2 = add i32 0, 0		; <i32> [#uses=6]
+	%long1 = add i64 0, 0		; <i64> [#uses=6]
+	%long2 = add i64 0, 0		; <i64> [#uses=6]
+	%sbyte1 = add i8 0, 0		; <i8> [#uses=6]
+	%sbyte2 = add i8 0, 0		; <i8> [#uses=6]
+	%short1 = add i16 0, 0		; <i16> [#uses=6]
+	%short2 = add i16 0, 0		; <i16> [#uses=6]
+	%ubyte1 = add i8 0, 0		; <i8> [#uses=6]
+	%ubyte2 = add i8 0, 0		; <i8> [#uses=6]
+	%uint1 = add i32 0, 0		; <i32> [#uses=6]
+	%uint2 = add i32 0, 0		; <i32> [#uses=6]
+	%ulong1 = add i64 0, 0		; <i64> [#uses=6]
+	%ulong2 = add i64 0, 0		; <i64> [#uses=6]
+	%ushort1 = add i16 0, 0		; <i16> [#uses=6]
+	%ushort2 = add i16 0, 0		; <i16> [#uses=6]
+	%test1 = icmp eq i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
+	%test2 = icmp uge i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
+	%test3 = icmp ugt i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
+	%test4 = icmp ule i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
+	%test5 = icmp ult i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
+	%test6 = icmp ne i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
+	%test7 = icmp eq i16 %ushort1, %ushort2		; <i1> [#uses=0]
+	%test8 = icmp uge i16 %ushort1, %ushort2		; <i1> [#uses=0]
+	%test9 = icmp ugt i16 %ushort1, %ushort2		; <i1> [#uses=0]
+	%test10 = icmp ule i16 %ushort1, %ushort2		; <i1> [#uses=0]
+	%test11 = icmp ult i16 %ushort1, %ushort2		; <i1> [#uses=0]
+	%test12 = icmp ne i16 %ushort1, %ushort2		; <i1> [#uses=0]
+	%test13 = icmp eq i32 %uint1, %uint2		; <i1> [#uses=0]
+	%test14 = icmp uge i32 %uint1, %uint2		; <i1> [#uses=0]
+	%test15 = icmp ugt i32 %uint1, %uint2		; <i1> [#uses=0]
+	%test16 = icmp ule i32 %uint1, %uint2		; <i1> [#uses=0]
+	%test17 = icmp ult i32 %uint1, %uint2		; <i1> [#uses=0]
+	%test18 = icmp ne i32 %uint1, %uint2		; <i1> [#uses=0]
+	%test19 = icmp eq i64 %ulong1, %ulong2		; <i1> [#uses=0]
+	%test20 = icmp uge i64 %ulong1, %ulong2		; <i1> [#uses=0]
+	%test21 = icmp ugt i64 %ulong1, %ulong2		; <i1> [#uses=0]
+	%test22 = icmp ule i64 %ulong1, %ulong2		; <i1> [#uses=0]
+	%test23 = icmp ult i64 %ulong1, %ulong2		; <i1> [#uses=0]
+	%test24 = icmp ne i64 %ulong1, %ulong2		; <i1> [#uses=0]
+	%test25 = icmp eq i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
+	%test26 = icmp sge i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
+	%test27 = icmp sgt i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
+	%test28 = icmp sle i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
+	%test29 = icmp slt i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
+	%test30 = icmp ne i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
+	%test31 = icmp eq i16 %short1, %short2		; <i1> [#uses=0]
+	%test32 = icmp sge i16 %short1, %short2		; <i1> [#uses=0]
+	%test33 = icmp sgt i16 %short1, %short2		; <i1> [#uses=0]
+	%test34 = icmp sle i16 %short1, %short2		; <i1> [#uses=0]
+	%test35 = icmp slt i16 %short1, %short2		; <i1> [#uses=0]
+	%test36 = icmp ne i16 %short1, %short2		; <i1> [#uses=0]
+	%test37 = icmp eq i32 %int1, %int2		; <i1> [#uses=0]
+	%test38 = icmp sge i32 %int1, %int2		; <i1> [#uses=0]
+	%test39 = icmp sgt i32 %int1, %int2		; <i1> [#uses=0]
+	%test40 = icmp sle i32 %int1, %int2		; <i1> [#uses=0]
+	%test41 = icmp slt i32 %int1, %int2		; <i1> [#uses=0]
+	%test42 = icmp ne i32 %int1, %int2		; <i1> [#uses=0]
+	%test43 = icmp eq i64 %long1, %long2		; <i1> [#uses=0]
+	%test44 = icmp sge i64 %long1, %long2		; <i1> [#uses=0]
+	%test45 = icmp sgt i64 %long1, %long2		; <i1> [#uses=0]
+	%test46 = icmp sle i64 %long1, %long2		; <i1> [#uses=0]
+	%test47 = icmp slt i64 %long1, %long2		; <i1> [#uses=0]
+	%test48 = icmp ne i64 %long1, %long2		; <i1> [#uses=0]
+	ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcJIT/test-shift.ll b/test/ExecutionEngine/OrcJIT/test-shift.ll
new file mode 100644
index 0000000..500987c
--- /dev/null
+++ b/test/ExecutionEngine/OrcJIT/test-shift.ll
@@ -0,0 +1,32 @@
+; RUN: %lli -use-orcmcjit %s > /dev/null
+
+define i32 @main() {
+	%shamt = add i8 0, 1		; <i8> [#uses=8]
+	%shift.upgrd.1 = zext i8 %shamt to i32		; <i32> [#uses=1]
+	%t1.s = shl i32 1, %shift.upgrd.1		; <i32> [#uses=0]
+	%t2.s = shl i32 1, 4		; <i32> [#uses=0]
+	%shift.upgrd.2 = zext i8 %shamt to i32		; <i32> [#uses=1]
+	%t1 = shl i32 1, %shift.upgrd.2		; <i32> [#uses=0]
+	%t2 = shl i32 1, 5		; <i32> [#uses=0]
+	%t2.s.upgrd.3 = shl i64 1, 4		; <i64> [#uses=0]
+	%t2.upgrd.4 = shl i64 1, 5		; <i64> [#uses=0]
+	%shift.upgrd.5 = zext i8 %shamt to i32		; <i32> [#uses=1]
+	%tr1.s = ashr i32 1, %shift.upgrd.5		; <i32> [#uses=0]
+	%tr2.s = ashr i32 1, 4		; <i32> [#uses=0]
+	%shift.upgrd.6 = zext i8 %shamt to i32		; <i32> [#uses=1]
+	%tr1 = lshr i32 1, %shift.upgrd.6		; <i32> [#uses=0]
+	%tr2 = lshr i32 1, 5		; <i32> [#uses=0]
+	%tr1.l = ashr i64 1, 4		; <i64> [#uses=0]
+	%shift.upgrd.7 = zext i8 %shamt to i64		; <i64> [#uses=1]
+	%tr2.l = ashr i64 1, %shift.upgrd.7		; <i64> [#uses=0]
+	%tr3.l = shl i64 1, 4		; <i64> [#uses=0]
+	%shift.upgrd.8 = zext i8 %shamt to i64		; <i64> [#uses=1]
+	%tr4.l = shl i64 1, %shift.upgrd.8		; <i64> [#uses=0]
+	%tr1.u = lshr i64 1, 5		; <i64> [#uses=0]
+	%shift.upgrd.9 = zext i8 %shamt to i64		; <i64> [#uses=1]
+	%tr2.u = lshr i64 1, %shift.upgrd.9		; <i64> [#uses=0]
+	%tr3.u = shl i64 1, 5		; <i64> [#uses=0]
+	%shift.upgrd.10 = zext i8 %shamt to i64		; <i64> [#uses=1]
+	%tr4.u = shl i64 1, %shift.upgrd.10		; <i64> [#uses=0]
+	ret i32 0
+}
diff --git a/test/ExecutionEngine/fpbitcast.ll b/test/ExecutionEngine/fpbitcast.ll
deleted file mode 100644
index e6d06f8..0000000
--- a/test/ExecutionEngine/fpbitcast.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: %lli -force-interpreter=true %s | FileCheck %s
-; CHECK: 40091eb8
-
-define i32 @test(double %x) {
-entry:
-	%x46.i = bitcast double %x to i64	
-	%tmp343.i = lshr i64 %x46.i, 32	
-	%tmp344.i = trunc i64 %tmp343.i to i32
-        ret i32 %tmp344.i
-}
-
-define i32 @main()
-{
-       %res = call i32 @test(double 3.14)
-       %ptr = getelementptr [4 x i8]* @format, i32 0, i32 0
-       call i32 (i8*,...)* @printf(i8* %ptr, i32 %res)
-       ret i32 0
-}
-
-declare i32 @printf(i8*, ...)
-@format = internal constant [4 x i8] c"%x\0A\00"
diff --git a/test/ExecutionEngine/frem.ll b/test/ExecutionEngine/frem.ll
index 7e0b606..ce83d20 100644
--- a/test/ExecutionEngine/frem.ll
+++ b/test/ExecutionEngine/frem.ll
@@ -8,12 +8,14 @@
 @str = internal constant [18 x i8] c"Double value: %f\0A\00"
 
 declare i32 @printf(i8* nocapture, ...) nounwind
+declare i32 @fflush(i8*) nounwind
 
 define i32 @main() {
   %flt = load float* @flt
   %float2 = frem float %flt, 5.0
   %double1 = fpext float %float2 to double
   call i32 (i8*, ...)* @printf(i8* getelementptr ([18 x i8]* @str, i32 0, i64 0), double %double1)
+  call i32 @fflush(i8* null)
   ret i32 0
 }
 
diff --git a/test/ExecutionEngine/hello.ll b/test/ExecutionEngine/hello.ll
deleted file mode 100644
index 47e36a5..0000000
--- a/test/ExecutionEngine/hello.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-@.LC0 = internal global [12 x i8] c"Hello World\00"		; <[12 x i8]*> [#uses=1]
-
-declare i32 @puts(i8*)
-
-define i32 @main() {
-	%reg210 = call i32 @puts( i8* getelementptr ([12 x i8]* @.LC0, i64 0, i64 0) )		; <i32> [#uses=0]
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/hello2.ll b/test/ExecutionEngine/hello2.ll
deleted file mode 100644
index 13b2588..0000000
--- a/test/ExecutionEngine/hello2.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-@X = global i32 7		; <i32*> [#uses=0]
-@msg = internal global [13 x i8] c"Hello World\0A\00"		; <[13 x i8]*> [#uses=1]
-
-declare void @printf([13 x i8]*, ...)
-
-define void @bar() {
-	call void ([13 x i8]*, ...)* @printf( [13 x i8]* @msg )
-	ret void
-}
-
-define i32 @main() {
-	call void @bar( )
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/simplesttest.ll b/test/ExecutionEngine/simplesttest.ll
deleted file mode 100644
index 85c1715..0000000
--- a/test/ExecutionEngine/simplesttest.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @main() {
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/simpletest.ll b/test/ExecutionEngine/simpletest.ll
deleted file mode 100644
index 167a0fd..0000000
--- a/test/ExecutionEngine/simpletest.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @bar() {
-	ret i32 0
-}
-
-define i32 @main() {
-	%r = call i32 @bar( )		; <i32> [#uses=1]
-	ret i32 %r
-}
-
diff --git a/test/ExecutionEngine/stubs.ll b/test/ExecutionEngine/stubs.ll
deleted file mode 100644
index b7d922f..0000000
--- a/test/ExecutionEngine/stubs.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: %lli -disable-lazy-compilation=false %s
-
-define i32 @main() nounwind {
-entry:
-	call void @lazily_compiled_address_is_consistent()
-	ret i32 0
-}
-
-; Test PR3043: @test should have the same address before and after
-; it's JIT-compiled.
-@funcPtr = common global i1 ()* null, align 4
-@lcaic_failure = internal constant [46 x i8] c"@lazily_compiled_address_is_consistent failed\00"
-
-define void @lazily_compiled_address_is_consistent() nounwind {
-entry:
-	store i1 ()* @test, i1 ()** @funcPtr
-	%pass = tail call i1 @test()		; <i32> [#uses=1]
-	br i1 %pass, label %pass_block, label %fail_block
-pass_block:
-	ret void
-fail_block:
-	call i32 @puts(i8* getelementptr([46 x i8]* @lcaic_failure, i32 0, i32 0))
-	call void @exit(i32 1)
-	unreachable
-}
-
-define i1 @test() nounwind {
-entry:
-	%tmp = load i1 ()** @funcPtr
-	%eq = icmp eq i1 ()* %tmp, @test
-	ret i1 %eq
-}
-
-declare i32 @puts(i8*) noreturn
-declare void @exit(i32) noreturn
diff --git a/test/ExecutionEngine/test-arith.ll b/test/ExecutionEngine/test-arith.ll
deleted file mode 100644
index 79f989f..0000000
--- a/test/ExecutionEngine/test-arith.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @main() {
-	%A = add i8 0, 12		; <i8> [#uses=1]
-	%B = sub i8 %A, 1		; <i8> [#uses=2]
-	%C = mul i8 %B, %B		; <i8> [#uses=2]
-	%D = sdiv i8 %C, %C		; <i8> [#uses=2]
-	%E = srem i8 %D, %D		; <i8> [#uses=0]
-	%F = udiv i8 5, 6		; <i8> [#uses=0]
-	%G = urem i8 6, 5		; <i8> [#uses=0]
-	%A.upgrd.1 = add i16 0, 12		; <i16> [#uses=1]
-	%B.upgrd.2 = sub i16 %A.upgrd.1, 1		; <i16> [#uses=2]
-	%C.upgrd.3 = mul i16 %B.upgrd.2, %B.upgrd.2		; <i16> [#uses=2]
-	%D.upgrd.4 = sdiv i16 %C.upgrd.3, %C.upgrd.3		; <i16> [#uses=2]
-	%E.upgrd.5 = srem i16 %D.upgrd.4, %D.upgrd.4		; <i16> [#uses=0]
-	%F.upgrd.6 = udiv i16 5, 6		; <i16> [#uses=0]
-	%G.upgrd.7 = urem i32 6, 5		; <i32> [#uses=0]
-	%A.upgrd.8 = add i32 0, 12		; <i32> [#uses=1]
-	%B.upgrd.9 = sub i32 %A.upgrd.8, 1		; <i32> [#uses=2]
-	%C.upgrd.10 = mul i32 %B.upgrd.9, %B.upgrd.9		; <i32> [#uses=2]
-	%D.upgrd.11 = sdiv i32 %C.upgrd.10, %C.upgrd.10		; <i32> [#uses=2]
-	%E.upgrd.12 = srem i32 %D.upgrd.11, %D.upgrd.11		; <i32> [#uses=0]
-	%F.upgrd.13 = udiv i32 5, 6		; <i32> [#uses=0]
-	%G1 = urem i32 6, 5		; <i32> [#uses=0]
-	%A.upgrd.14 = add i64 0, 12		; <i64> [#uses=1]
-	%B.upgrd.15 = sub i64 %A.upgrd.14, 1		; <i64> [#uses=2]
-	%C.upgrd.16 = mul i64 %B.upgrd.15, %B.upgrd.15		; <i64> [#uses=2]
-	%D.upgrd.17 = sdiv i64 %C.upgrd.16, %C.upgrd.16		; <i64> [#uses=2]
-	%E.upgrd.18 = srem i64 %D.upgrd.17, %D.upgrd.17		; <i64> [#uses=0]
-	%F.upgrd.19 = udiv i64 5, 6		; <i64> [#uses=0]
-	%G.upgrd.20 = urem i64 6, 5		; <i64> [#uses=0]
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/test-branch.ll b/test/ExecutionEngine/test-branch.ll
deleted file mode 100644
index 3ae55d0..0000000
--- a/test/ExecutionEngine/test-branch.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-; test unconditional branch
-define i32 @main() {
-	br label %Test
-Test:		; preds = %Test, %0
-	%X = icmp eq i32 0, 4		; <i1> [#uses=1]
-	br i1 %X, label %Test, label %Label
-Label:		; preds = %Test
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/test-call-no-external-funcs.ll b/test/ExecutionEngine/test-call-no-external-funcs.ll
deleted file mode 100644
index c3cb931..0000000
--- a/test/ExecutionEngine/test-call-no-external-funcs.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @_Z14func_exit_codev() nounwind uwtable {
-entry:
-  ret i32 0
-}
-
-define i32 @main() nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  store i32 0, i32* %retval
-  %call = call i32 @_Z14func_exit_codev()
-  ret i32 %call
-}
diff --git a/test/ExecutionEngine/test-call.ll b/test/ExecutionEngine/test-call.ll
deleted file mode 100644
index 313a6c5..0000000
--- a/test/ExecutionEngine/test-call.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-declare void @exit(i32)
-
-define i32 @test(i8 %C, i16 %S) {
-	%X = trunc i16 %S to i8		; <i8> [#uses=1]
-	%Y = zext i8 %X to i32		; <i32> [#uses=1]
-	ret i32 %Y
-}
-
-define void @FP(void (i32)* %F) {
-	%X = call i32 @test( i8 123, i16 1024 )		; <i32> [#uses=1]
-	call void %F( i32 %X )
-	ret void
-}
-
-define i32 @main() {
-	call void @FP( void (i32)* @exit )
-	ret i32 1
-}
-
diff --git a/test/ExecutionEngine/test-cast.ll b/test/ExecutionEngine/test-cast.ll
deleted file mode 100644
index 667fa80..0000000
--- a/test/ExecutionEngine/test-cast.ll
+++ /dev/null
@@ -1,109 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @foo() {
-	ret i32 0
-}
-
-define i32 @main() {
-	icmp ne i1 true, false		; <i1>:1 [#uses=0]
-	zext i1 true to i8		; <i8>:2 [#uses=0]
-	zext i1 true to i8		; <i8>:3 [#uses=0]
-	zext i1 true to i16		; <i16>:4 [#uses=0]
-	zext i1 true to i16		; <i16>:5 [#uses=0]
-	zext i1 true to i32		; <i32>:6 [#uses=0]
-	zext i1 true to i32		; <i32>:7 [#uses=0]
-	zext i1 true to i64		; <i64>:8 [#uses=0]
-	zext i1 true to i64		; <i64>:9 [#uses=0]
-	uitofp i1 true to float		; <float>:10 [#uses=0]
-	uitofp i1 true to double		; <double>:11 [#uses=0]
-	icmp ne i8 0, 0		; <i1>:12 [#uses=0]
-	icmp ne i8 1, 0		; <i1>:13 [#uses=0]
-	bitcast i8 0 to i8		; <i8>:14 [#uses=0]
-	bitcast i8 -1 to i8		; <i8>:15 [#uses=0]
-	sext i8 4 to i16		; <i16>:16 [#uses=0]
-	sext i8 4 to i16		; <i16>:17 [#uses=0]
-	sext i8 4 to i64		; <i64>:18 [#uses=0]
-	sext i8 4 to i64		; <i64>:19 [#uses=0]
-	sitofp i8 4 to float		; <float>:20 [#uses=0]
-	sitofp i8 4 to double		; <double>:21 [#uses=0]
-	icmp ne i8 0, 0		; <i1>:22 [#uses=0]
-	icmp ne i8 1, 0		; <i1>:23 [#uses=0]
-	bitcast i8 0 to i8		; <i8>:24 [#uses=0]
-	bitcast i8 1 to i8		; <i8>:25 [#uses=0]
-	zext i8 4 to i16		; <i16>:26 [#uses=0]
-	zext i8 4 to i16		; <i16>:27 [#uses=0]
-	zext i8 4 to i64		; <i64>:28 [#uses=0]
-	zext i8 4 to i64		; <i64>:29 [#uses=0]
-	uitofp i8 0 to float		; <float>:30 [#uses=0]
-	uitofp i8 0 to double		; <double>:31 [#uses=0]
-	icmp ne i16 1, 0		; <i1>:32 [#uses=0]
-	trunc i16 -1 to i8		; <i8>:33 [#uses=0]
-	trunc i16 255 to i8		; <i8>:34 [#uses=0]
-	bitcast i16 0 to i16		; <i16>:35 [#uses=0]
-	bitcast i16 0 to i16		; <i16>:36 [#uses=0]
-	sext i16 0 to i64		; <i64>:37 [#uses=0]
-	sext i16 0 to i64		; <i64>:38 [#uses=0]
-	sitofp i16 0 to float		; <float>:39 [#uses=0]
-	sitofp i16 0 to double		; <double>:40 [#uses=0]
-	icmp ne i16 1, 0		; <i1>:41 [#uses=0]
-	trunc i16 1 to i8		; <i8>:42 [#uses=0]
-	trunc i16 255 to i8		; <i8>:43 [#uses=0]
-	bitcast i16 0 to i16		; <i16>:44 [#uses=0]
-	bitcast i16 0 to i16		; <i16>:45 [#uses=0]
-	zext i16 0 to i64		; <i64>:46 [#uses=0]
-	zext i16 0 to i64		; <i64>:47 [#uses=0]
-	uitofp i16 0 to float		; <float>:48 [#uses=0]
-	uitofp i16 0 to double		; <double>:49 [#uses=0]
-	icmp ne i32 6, 0		; <i1>:50 [#uses=0]
-	trunc i32 -6 to i8		; <i8>:51 [#uses=0]
-	trunc i32 6 to i8		; <i8>:52 [#uses=0]
-	trunc i32 6 to i16		; <i16>:53 [#uses=0]
-	bitcast i32 0 to i32		; <i32>:54 [#uses=0]
-	sext i32 0 to i64		; <i64>:55 [#uses=0]
-	sext i32 0 to i64		; <i64>:56 [#uses=0]
-	sitofp i32 0 to float		; <float>:57 [#uses=0]
-	sitofp i32 0 to double		; <double>:58 [#uses=0]
-	icmp ne i32 6, 0		; <i1>:59 [#uses=0]
-	trunc i32 7 to i8		; <i8>:60 [#uses=0]
-	trunc i32 8 to i8		; <i8>:61 [#uses=0]
-	trunc i32 9 to i16		; <i16>:62 [#uses=0]
-	bitcast i32 10 to i32		; <i32>:63 [#uses=0]
-	zext i32 0 to i64		; <i64>:64 [#uses=0]
-	zext i32 0 to i64		; <i64>:65 [#uses=0]
-	uitofp i32 0 to float		; <float>:66 [#uses=0]
-	uitofp i32 0 to double		; <double>:67 [#uses=0]
-	icmp ne i64 0, 0		; <i1>:68 [#uses=0]
-	trunc i64 0 to i8		; <i8>:69 [#uses=0]
-	trunc i64 0 to i8		; <i8>:70 [#uses=0]
-	trunc i64 0 to i16		; <i16>:71 [#uses=0]
-	trunc i64 0 to i16		; <i16>:72 [#uses=0]
-	trunc i64 0 to i32		; <i32>:73 [#uses=0]
-	trunc i64 0 to i32		; <i32>:74 [#uses=0]
-	bitcast i64 0 to i64		; <i64>:75 [#uses=0]
-	bitcast i64 0 to i64		; <i64>:76 [#uses=0]
-	sitofp i64 0 to float		; <float>:77 [#uses=0]
-	sitofp i64 0 to double		; <double>:78 [#uses=0]
-	icmp ne i64 1, 0		; <i1>:79 [#uses=0]
-	trunc i64 1 to i8		; <i8>:80 [#uses=0]
-	trunc i64 1 to i8		; <i8>:81 [#uses=0]
-	trunc i64 1 to i16		; <i16>:82 [#uses=0]
-	trunc i64 1 to i16		; <i16>:83 [#uses=0]
-	trunc i64 1 to i32		; <i32>:84 [#uses=0]
-	trunc i64 1 to i32		; <i32>:85 [#uses=0]
-	bitcast i64 1 to i64		; <i64>:86 [#uses=0]
-	bitcast i64 1 to i64		; <i64>:87 [#uses=0]
-	uitofp i64 1 to float		; <float>:88 [#uses=0]
-	uitofp i64 0 to double		; <double>:89 [#uses=0]
-	bitcast float 0.000000e+00 to float		; <float>:90 [#uses=0]
-	fpext float 0.000000e+00 to double		; <double>:91 [#uses=0]
-	fptosi double 0.000000e+00 to i8		; <i8>:92 [#uses=0]
-	fptoui double 0.000000e+00 to i8		; <i8>:93 [#uses=0]
-	fptosi double 0.000000e+00 to i16		; <i16>:94 [#uses=0]
-	fptoui double 0.000000e+00 to i16		; <i16>:95 [#uses=0]
-	fptosi double 0.000000e+00 to i32		; <i32>:96 [#uses=0]
-	fptoui double 0.000000e+00 to i32		; <i32>:97 [#uses=0]
-	fptosi double 0.000000e+00 to i64		; <i64>:98 [#uses=0]
-	fptrunc double 0.000000e+00 to float		; <float>:99 [#uses=0]
-	bitcast double 0.000000e+00 to double		; <double>:100 [#uses=0]
-	ret i32 0
-}
diff --git a/test/ExecutionEngine/test-common-symbols.ll b/test/ExecutionEngine/test-common-symbols.ll
deleted file mode 100644
index 19e2ce5..0000000
--- a/test/ExecutionEngine/test-common-symbols.ll
+++ /dev/null
@@ -1,88 +0,0 @@
-; RUN: %lli -O0 -disable-lazy-compilation=false %s
-
-; The intention of this test is to verify that symbols mapped to COMMON in ELF
-; work as expected.
-;
-; Compiled from this C code:
-;
-; int zero_int;
-; double zero_double;
-; int zero_arr[10];
-; 
-; int main()
-; {
-;     zero_arr[zero_int + 5] = 40;
-; 
-;     if (zero_double < 1.0)
-;         zero_arr[zero_int + 2] = 70;
-; 
-;     for (int i = 1; i < 10; ++i) {
-;         zero_arr[i] = zero_arr[i - 1] + zero_arr[i];
-;     }
-;     return zero_arr[9] == 110 ? 0 : -1;
-; }
-
-@zero_int = common global i32 0, align 4
-@zero_arr = common global [10 x i32] zeroinitializer, align 16
-@zero_double = common global double 0.000000e+00, align 8
-
-define i32 @main() nounwind {
-entry:
-  %retval = alloca i32, align 4
-  %i = alloca i32, align 4
-  store i32 0, i32* %retval
-  %0 = load i32* @zero_int, align 4
-  %add = add nsw i32 %0, 5
-  %idxprom = sext i32 %add to i64
-  %arrayidx = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom
-  store i32 40, i32* %arrayidx, align 4
-  %1 = load double* @zero_double, align 8
-  %cmp = fcmp olt double %1, 1.000000e+00
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  %2 = load i32* @zero_int, align 4
-  %add1 = add nsw i32 %2, 2
-  %idxprom2 = sext i32 %add1 to i64
-  %arrayidx3 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom2
-  store i32 70, i32* %arrayidx3, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %entry
-  store i32 1, i32* %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %if.end
-  %3 = load i32* %i, align 4
-  %cmp4 = icmp slt i32 %3, 10
-  br i1 %cmp4, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %4 = load i32* %i, align 4
-  %sub = sub nsw i32 %4, 1
-  %idxprom5 = sext i32 %sub to i64
-  %arrayidx6 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom5
-  %5 = load i32* %arrayidx6, align 4
-  %6 = load i32* %i, align 4
-  %idxprom7 = sext i32 %6 to i64
-  %arrayidx8 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom7
-  %7 = load i32* %arrayidx8, align 4
-  %add9 = add nsw i32 %5, %7
-  %8 = load i32* %i, align 4
-  %idxprom10 = sext i32 %8 to i64
-  %arrayidx11 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom10
-  store i32 %add9, i32* %arrayidx11, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body
-  %9 = load i32* %i, align 4
-  %inc = add nsw i32 %9, 1
-  store i32 %inc, i32* %i, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %10 = load i32* getelementptr inbounds ([10 x i32]* @zero_arr, i32 0, i64 9), align 4
-  %cmp12 = icmp eq i32 %10, 110
-  %cond = select i1 %cmp12, i32 0, i32 -1
-  ret i32 %cond
-}
diff --git a/test/ExecutionEngine/test-constantexpr.ll b/test/ExecutionEngine/test-constantexpr.ll
deleted file mode 100644
index d01479a..0000000
--- a/test/ExecutionEngine/test-constantexpr.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-; This tests to make sure that we can evaluate weird constant expressions
-
-@A = global i32 5		; <i32*> [#uses=1]
-@B = global i32 6		; <i32*> [#uses=1]
-
-define i32 @main() {
-	%A = or i1 false, icmp slt (i32* @A, i32* @B)		; <i1> [#uses=0]
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/test-fp-no-external-funcs.ll b/test/ExecutionEngine/test-fp-no-external-funcs.ll
deleted file mode 100644
index 61b12c2..0000000
--- a/test/ExecutionEngine/test-fp-no-external-funcs.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: %lli  %s > /dev/null
-
-define double @test(double* %DP, double %Arg) {
-	%D = load double* %DP		; <double> [#uses=1]
-	%V = fadd double %D, 1.000000e+00		; <double> [#uses=2]
-	%W = fsub double %V, %V		; <double> [#uses=3]
-	%X = fmul double %W, %W		; <double> [#uses=2]
-	%Y = fdiv double %X, %X		; <double> [#uses=2]
-	%Q = fadd double %Y, %Arg		; <double> [#uses=1]
-	%R = bitcast double %Q to double		; <double> [#uses=1]
-	store double %Q, double* %DP
-	ret double %Y
-}
-
-define i32 @main() {
-	%X = alloca double		; <double*> [#uses=2]
-	store double 0.000000e+00, double* %X
-	call double @test( double* %X, double 2.000000e+00 )		; <double>:1 [#uses=0]
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/test-fp.ll b/test/ExecutionEngine/test-fp.ll
deleted file mode 100644
index 2bf0210..0000000
--- a/test/ExecutionEngine/test-fp.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define double @test(double* %DP, double %Arg) {
-	%D = load double* %DP		; <double> [#uses=1]
-	%V = fadd double %D, 1.000000e+00		; <double> [#uses=2]
-	%W = fsub double %V, %V		; <double> [#uses=3]
-	%X = fmul double %W, %W		; <double> [#uses=2]
-	%Y = fdiv double %X, %X		; <double> [#uses=2]
-	%Z = frem double %Y, %Y		; <double> [#uses=3]
-	%Z1 = fdiv double %Z, %W		; <double> [#uses=0]
-	%Q = fadd double %Z, %Arg		; <double> [#uses=1]
-	%R = bitcast double %Q to double		; <double> [#uses=1]
-	store double %R, double* %DP
-	ret double %Z
-}
-
-define i32 @main() {
-	%X = alloca double		; <double*> [#uses=2]
-	store double 0.000000e+00, double* %X
-	call double @test( double* %X, double 2.000000e+00 )		; <double>:1 [#uses=0]
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/test-global-init-nonzero.ll b/test/ExecutionEngine/test-global-init-nonzero.ll
deleted file mode 100644
index 749a485..0000000
--- a/test/ExecutionEngine/test-global-init-nonzero.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: %lli  %s > /dev/null
-
-@count = global i32 1, align 4
-
-define i32 @main() nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %i = alloca i32, align 4
-  store i32 0, i32* %retval
-  store i32 0, i32* %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32* %i, align 4
-  %cmp = icmp slt i32 %0, 49
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %1 = load i32* @count, align 4
-  %inc = add nsw i32 %1, 1
-  store i32 %inc, i32* @count, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body
-  %2 = load i32* %i, align 4
-  %inc1 = add nsw i32 %2, 1
-  store i32 %inc1, i32* %i, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %3 = load i32* @count, align 4
-  %sub = sub nsw i32 %3, 50
-  ret i32 %sub
-}
diff --git a/test/ExecutionEngine/test-global.ll b/test/ExecutionEngine/test-global.ll
deleted file mode 100644
index 69e5455..0000000
--- a/test/ExecutionEngine/test-global.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-@count = global i32 0, align 4
-
-define i32 @main() nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %i = alloca i32, align 4
-  store i32 0, i32* %retval
-  store i32 0, i32* %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32* %i, align 4
-  %cmp = icmp slt i32 %0, 50
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %1 = load i32* @count, align 4
-  %inc = add nsw i32 %1, 1
-  store i32 %inc, i32* @count, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body
-  %2 = load i32* %i, align 4
-  %inc1 = add nsw i32 %2, 1
-  store i32 %inc1, i32* %i, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %3 = load i32* @count, align 4
-  %sub = sub nsw i32 %3, 50
-  ret i32 %sub
-}
diff --git a/test/ExecutionEngine/test-loadstore.ll b/test/ExecutionEngine/test-loadstore.ll
deleted file mode 100644
index 1797599..0000000
--- a/test/ExecutionEngine/test-loadstore.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define void @test(i8* %P, i16* %P.upgrd.1, i32* %P.upgrd.2, i64* %P.upgrd.3) {
-	%V = load i8* %P		; <i8> [#uses=1]
-	store i8 %V, i8* %P
-	%V.upgrd.4 = load i16* %P.upgrd.1		; <i16> [#uses=1]
-	store i16 %V.upgrd.4, i16* %P.upgrd.1
-	%V.upgrd.5 = load i32* %P.upgrd.2		; <i32> [#uses=1]
-	store i32 %V.upgrd.5, i32* %P.upgrd.2
-	%V.upgrd.6 = load i64* %P.upgrd.3		; <i64> [#uses=1]
-	store i64 %V.upgrd.6, i64* %P.upgrd.3
-	ret void
-}
-
-define i32 @varalloca(i32 %Size) {
-        ;; Variable sized alloca
-	%X = alloca i32, i32 %Size		; <i32*> [#uses=2]
-	store i32 %Size, i32* %X
-	%Y = load i32* %X		; <i32> [#uses=1]
-	ret i32 %Y
-}
-
-define i32 @main() {
-	%A = alloca i8		; <i8*> [#uses=1]
-	%B = alloca i16		; <i16*> [#uses=1]
-	%C = alloca i32		; <i32*> [#uses=1]
-	%D = alloca i64		; <i64*> [#uses=1]
-	call void @test( i8* %A, i16* %B, i32* %C, i64* %D )
-	call i32 @varalloca( i32 7 )		; <i32>:1 [#uses=0]
-	ret i32 0
-}
diff --git a/test/ExecutionEngine/test-local.ll b/test/ExecutionEngine/test-local.ll
deleted file mode 100644
index ec5ba16..0000000
--- a/test/ExecutionEngine/test-local.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @main() nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %count = alloca i32, align 4
-  %i = alloca i32, align 4
-  store i32 0, i32* %retval
-  store i32 0, i32* %count, align 4
-  store i32 0, i32* %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32* %i, align 4
-  %cmp = icmp slt i32 %0, 50
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %1 = load i32* %count, align 4
-  %inc = add nsw i32 %1, 1
-  store i32 %inc, i32* %count, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body
-  %2 = load i32* %i, align 4
-  %inc1 = add nsw i32 %2, 1
-  store i32 %inc1, i32* %i, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %3 = load i32* %count, align 4
-  %sub = sub nsw i32 %3, 50
-  ret i32 %sub
-}
diff --git a/test/ExecutionEngine/test-logical.ll b/test/ExecutionEngine/test-logical.ll
deleted file mode 100644
index 05b381b..0000000
--- a/test/ExecutionEngine/test-logical.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @main() {
-	%A = and i8 4, 8		; <i8> [#uses=2]
-	%B = or i8 %A, 7		; <i8> [#uses=1]
-	%C = xor i8 %B, %A		; <i8> [#uses=0]
-	%A.upgrd.1 = and i16 4, 8		; <i16> [#uses=2]
-	%B.upgrd.2 = or i16 %A.upgrd.1, 7		; <i16> [#uses=1]
-	%C.upgrd.3 = xor i16 %B.upgrd.2, %A.upgrd.1		; <i16> [#uses=0]
-	%A.upgrd.4 = and i32 4, 8		; <i32> [#uses=2]
-	%B.upgrd.5 = or i32 %A.upgrd.4, 7		; <i32> [#uses=1]
-	%C.upgrd.6 = xor i32 %B.upgrd.5, %A.upgrd.4		; <i32> [#uses=0]
-	%A.upgrd.7 = and i64 4, 8		; <i64> [#uses=2]
-	%B.upgrd.8 = or i64 %A.upgrd.7, 7		; <i64> [#uses=1]
-	%C.upgrd.9 = xor i64 %B.upgrd.8, %A.upgrd.7		; <i64> [#uses=0]
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/test-loop.ll b/test/ExecutionEngine/test-loop.ll
deleted file mode 100644
index e951a14..0000000
--- a/test/ExecutionEngine/test-loop.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @main() {
-; <label>:0
-	br label %Loop
-Loop:		; preds = %Loop, %0
-	%I = phi i32 [ 0, %0 ], [ %i2, %Loop ]		; <i32> [#uses=1]
-	%i2 = add i32 %I, 1		; <i32> [#uses=2]
-	%C = icmp eq i32 %i2, 10		; <i1> [#uses=1]
-	br i1 %C, label %Out, label %Loop
-Out:		; preds = %Loop
-	ret i32 0
-}
-
diff --git a/test/ExecutionEngine/test-phi.ll b/test/ExecutionEngine/test-phi.ll
deleted file mode 100644
index c5bdfd5..0000000
--- a/test/ExecutionEngine/test-phi.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-; test phi node
-@Y = global i32 6		; <i32*> [#uses=1]
-
-define void @blah(i32* %X) {
-; <label>:0
-	br label %T
-T:		; preds = %Dead, %0
-	phi i32* [ %X, %0 ], [ @Y, %Dead ]		; <i32*>:1 [#uses=0]
-	ret void
-Dead:		; No predecessors!
-	br label %T
-}
-
-define i32 @test(i1 %C) {
-; <label>:0
-	br i1 %C, label %T, label %T
-T:		; preds = %0, %0
-	%X = phi i32 [ 123, %0 ], [ 123, %0 ]		; <i32> [#uses=1]
-	ret i32 %X
-}
-
-define i32 @main() {
-; <label>:0
-	br label %Test
-Test:		; preds = %Dead, %0
-	%X = phi i32 [ 0, %0 ], [ %Y, %Dead ]		; <i32> [#uses=1]
-	ret i32 %X
-Dead:		; No predecessors!
-	%Y = ashr i32 12, 4		; <i32> [#uses=1]
-	br label %Test
-}
-
diff --git a/test/ExecutionEngine/test-ret.ll b/test/ExecutionEngine/test-ret.ll
deleted file mode 100644
index 025f53e..0000000
--- a/test/ExecutionEngine/test-ret.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-; test return instructions
-define void @test1() {
-	ret void
-}
-
-define i8 @test2() {
-	ret i8 1
-}
-
-define i8 @test3() {
-	ret i8 1
-}
-
-define i16 @test4() {
-	ret i16 -1
-}
-
-define i16 @test5() {
-	ret i16 -1
-}
-
-define i32 @main() {
-	ret i32 0
-}
-
-define i32 @test6() {
-	ret i32 4
-}
-
-define i64 @test7() {
-	ret i64 0
-}
-
-define i64 @test8() {
-	ret i64 0
-}
-
-define float @test9() {
-	ret float 1.000000e+00
-}
-
-define double @test10() {
-	ret double 2.000000e+00
-}
diff --git a/test/ExecutionEngine/test-return.ll b/test/ExecutionEngine/test-return.ll
deleted file mode 100644
index d464a4b..0000000
--- a/test/ExecutionEngine/test-return.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @main() nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  store i32 0, i32* %retval
-  ret i32 0
-}
diff --git a/test/ExecutionEngine/test-setcond-fp.ll b/test/ExecutionEngine/test-setcond-fp.ll
deleted file mode 100644
index 68276e6..0000000
--- a/test/ExecutionEngine/test-setcond-fp.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-
-define i32 @main() {
-	%double1 = fadd double 0.000000e+00, 0.000000e+00		; <double> [#uses=6]
-	%double2 = fadd double 0.000000e+00, 0.000000e+00		; <double> [#uses=6]
-	%float1 = fadd float 0.000000e+00, 0.000000e+00		; <float> [#uses=6]
-	%float2 = fadd float 0.000000e+00, 0.000000e+00		; <float> [#uses=6]
-	%test49 = fcmp oeq float %float1, %float2		; <i1> [#uses=0]
-	%test50 = fcmp oge float %float1, %float2		; <i1> [#uses=0]
-	%test51 = fcmp ogt float %float1, %float2		; <i1> [#uses=0]
-	%test52 = fcmp ole float %float1, %float2		; <i1> [#uses=0]
-	%test53 = fcmp olt float %float1, %float2		; <i1> [#uses=0]
-	%test54 = fcmp une float %float1, %float2		; <i1> [#uses=0]
-	%test55 = fcmp oeq double %double1, %double2		; <i1> [#uses=0]
-	%test56 = fcmp oge double %double1, %double2		; <i1> [#uses=0]
-	%test57 = fcmp ogt double %double1, %double2		; <i1> [#uses=0]
-	%test58 = fcmp ole double %double1, %double2		; <i1> [#uses=0]
-	%test59 = fcmp olt double %double1, %double2		; <i1> [#uses=0]
-	%test60 = fcmp une double %double1, %double2		; <i1> [#uses=0]
-	ret i32 0
-}
-
-
diff --git a/test/ExecutionEngine/test-setcond-int.ll b/test/ExecutionEngine/test-setcond-int.ll
deleted file mode 100644
index 48dc021..0000000
--- a/test/ExecutionEngine/test-setcond-int.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @main() {
-	%int1 = add i32 0, 0		; <i32> [#uses=6]
-	%int2 = add i32 0, 0		; <i32> [#uses=6]
-	%long1 = add i64 0, 0		; <i64> [#uses=6]
-	%long2 = add i64 0, 0		; <i64> [#uses=6]
-	%sbyte1 = add i8 0, 0		; <i8> [#uses=6]
-	%sbyte2 = add i8 0, 0		; <i8> [#uses=6]
-	%short1 = add i16 0, 0		; <i16> [#uses=6]
-	%short2 = add i16 0, 0		; <i16> [#uses=6]
-	%ubyte1 = add i8 0, 0		; <i8> [#uses=6]
-	%ubyte2 = add i8 0, 0		; <i8> [#uses=6]
-	%uint1 = add i32 0, 0		; <i32> [#uses=6]
-	%uint2 = add i32 0, 0		; <i32> [#uses=6]
-	%ulong1 = add i64 0, 0		; <i64> [#uses=6]
-	%ulong2 = add i64 0, 0		; <i64> [#uses=6]
-	%ushort1 = add i16 0, 0		; <i16> [#uses=6]
-	%ushort2 = add i16 0, 0		; <i16> [#uses=6]
-	%test1 = icmp eq i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
-	%test2 = icmp uge i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
-	%test3 = icmp ugt i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
-	%test4 = icmp ule i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
-	%test5 = icmp ult i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
-	%test6 = icmp ne i8 %ubyte1, %ubyte2		; <i1> [#uses=0]
-	%test7 = icmp eq i16 %ushort1, %ushort2		; <i1> [#uses=0]
-	%test8 = icmp uge i16 %ushort1, %ushort2		; <i1> [#uses=0]
-	%test9 = icmp ugt i16 %ushort1, %ushort2		; <i1> [#uses=0]
-	%test10 = icmp ule i16 %ushort1, %ushort2		; <i1> [#uses=0]
-	%test11 = icmp ult i16 %ushort1, %ushort2		; <i1> [#uses=0]
-	%test12 = icmp ne i16 %ushort1, %ushort2		; <i1> [#uses=0]
-	%test13 = icmp eq i32 %uint1, %uint2		; <i1> [#uses=0]
-	%test14 = icmp uge i32 %uint1, %uint2		; <i1> [#uses=0]
-	%test15 = icmp ugt i32 %uint1, %uint2		; <i1> [#uses=0]
-	%test16 = icmp ule i32 %uint1, %uint2		; <i1> [#uses=0]
-	%test17 = icmp ult i32 %uint1, %uint2		; <i1> [#uses=0]
-	%test18 = icmp ne i32 %uint1, %uint2		; <i1> [#uses=0]
-	%test19 = icmp eq i64 %ulong1, %ulong2		; <i1> [#uses=0]
-	%test20 = icmp uge i64 %ulong1, %ulong2		; <i1> [#uses=0]
-	%test21 = icmp ugt i64 %ulong1, %ulong2		; <i1> [#uses=0]
-	%test22 = icmp ule i64 %ulong1, %ulong2		; <i1> [#uses=0]
-	%test23 = icmp ult i64 %ulong1, %ulong2		; <i1> [#uses=0]
-	%test24 = icmp ne i64 %ulong1, %ulong2		; <i1> [#uses=0]
-	%test25 = icmp eq i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
-	%test26 = icmp sge i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
-	%test27 = icmp sgt i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
-	%test28 = icmp sle i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
-	%test29 = icmp slt i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
-	%test30 = icmp ne i8 %sbyte1, %sbyte2		; <i1> [#uses=0]
-	%test31 = icmp eq i16 %short1, %short2		; <i1> [#uses=0]
-	%test32 = icmp sge i16 %short1, %short2		; <i1> [#uses=0]
-	%test33 = icmp sgt i16 %short1, %short2		; <i1> [#uses=0]
-	%test34 = icmp sle i16 %short1, %short2		; <i1> [#uses=0]
-	%test35 = icmp slt i16 %short1, %short2		; <i1> [#uses=0]
-	%test36 = icmp ne i16 %short1, %short2		; <i1> [#uses=0]
-	%test37 = icmp eq i32 %int1, %int2		; <i1> [#uses=0]
-	%test38 = icmp sge i32 %int1, %int2		; <i1> [#uses=0]
-	%test39 = icmp sgt i32 %int1, %int2		; <i1> [#uses=0]
-	%test40 = icmp sle i32 %int1, %int2		; <i1> [#uses=0]
-	%test41 = icmp slt i32 %int1, %int2		; <i1> [#uses=0]
-	%test42 = icmp ne i32 %int1, %int2		; <i1> [#uses=0]
-	%test43 = icmp eq i64 %long1, %long2		; <i1> [#uses=0]
-	%test44 = icmp sge i64 %long1, %long2		; <i1> [#uses=0]
-	%test45 = icmp sgt i64 %long1, %long2		; <i1> [#uses=0]
-	%test46 = icmp sle i64 %long1, %long2		; <i1> [#uses=0]
-	%test47 = icmp slt i64 %long1, %long2		; <i1> [#uses=0]
-	%test48 = icmp ne i64 %long1, %long2		; <i1> [#uses=0]
-	ret i32 0
-}
diff --git a/test/ExecutionEngine/test-shift.ll b/test/ExecutionEngine/test-shift.ll
deleted file mode 100644
index 590e262..0000000
--- a/test/ExecutionEngine/test-shift.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: %lli %s > /dev/null
-
-define i32 @main() {
-	%shamt = add i8 0, 1		; <i8> [#uses=8]
-	%shift.upgrd.1 = zext i8 %shamt to i32		; <i32> [#uses=1]
-	%t1.s = shl i32 1, %shift.upgrd.1		; <i32> [#uses=0]
-	%t2.s = shl i32 1, 4		; <i32> [#uses=0]
-	%shift.upgrd.2 = zext i8 %shamt to i32		; <i32> [#uses=1]
-	%t1 = shl i32 1, %shift.upgrd.2		; <i32> [#uses=0]
-	%t2 = shl i32 1, 5		; <i32> [#uses=0]
-	%t2.s.upgrd.3 = shl i64 1, 4		; <i64> [#uses=0]
-	%t2.upgrd.4 = shl i64 1, 5		; <i64> [#uses=0]
-	%shift.upgrd.5 = zext i8 %shamt to i32		; <i32> [#uses=1]
-	%tr1.s = ashr i32 1, %shift.upgrd.5		; <i32> [#uses=0]
-	%tr2.s = ashr i32 1, 4		; <i32> [#uses=0]
-	%shift.upgrd.6 = zext i8 %shamt to i32		; <i32> [#uses=1]
-	%tr1 = lshr i32 1, %shift.upgrd.6		; <i32> [#uses=0]
-	%tr2 = lshr i32 1, 5		; <i32> [#uses=0]
-	%tr1.l = ashr i64 1, 4		; <i64> [#uses=0]
-	%shift.upgrd.7 = zext i8 %shamt to i64		; <i64> [#uses=1]
-	%tr2.l = ashr i64 1, %shift.upgrd.7		; <i64> [#uses=0]
-	%tr3.l = shl i64 1, 4		; <i64> [#uses=0]
-	%shift.upgrd.8 = zext i8 %shamt to i64		; <i64> [#uses=1]
-	%tr4.l = shl i64 1, %shift.upgrd.8		; <i64> [#uses=0]
-	%tr1.u = lshr i64 1, 5		; <i64> [#uses=0]
-	%shift.upgrd.9 = zext i8 %shamt to i64		; <i64> [#uses=1]
-	%tr2.u = lshr i64 1, %shift.upgrd.9		; <i64> [#uses=0]
-	%tr3.u = shl i64 1, 5		; <i64> [#uses=0]
-	%shift.upgrd.10 = zext i8 %shamt to i64		; <i64> [#uses=1]
-	%tr4.u = shl i64 1, %shift.upgrd.10		; <i64> [#uses=0]
-	ret i32 0
-}
diff --git a/test/Feature/NamedMDNode.ll b/test/Feature/NamedMDNode.ll
index 0c6bcd9..c1e72a8 100644
--- a/test/Feature/NamedMDNode.ll
+++ b/test/Feature/NamedMDNode.ll
@@ -1,8 +1,8 @@
 ; RUN: llvm-as < %s | llvm-dis | grep "llvm.stuff = "
 
 ;; Simple NamedMDNode
-!0 = metadata !{i32 42}
-!1 = metadata !{metadata !"foo"}
+!0 = !{i32 42}
+!1 = !{!"foo"}
 !llvm.stuff = !{!0, !1}
 
 !samename = !{!0, !1}
diff --git a/test/Feature/NamedMDNode2.ll b/test/Feature/NamedMDNode2.ll
index 0524dd2..94e4458 100644
--- a/test/Feature/NamedMDNode2.ll
+++ b/test/Feature/NamedMDNode2.ll
@@ -3,5 +3,5 @@
 
 
 @foo = constant i1 false
-!0 = metadata !{i1 false}
+!0 = !{i1 false}
 !a = !{!0}
diff --git a/test/Feature/callingconventions.ll b/test/Feature/callingconventions.ll
index 192f07a..8b339d4 100644
--- a/test/Feature/callingconventions.ll
+++ b/test/Feature/callingconventions.ll
@@ -52,4 +52,11 @@ U:
   resume { i8*, i32 } %exn
 }
 
+declare ghccc void @ghc_callee()
+
+define void @ghc_caller() {
+  call ghccc void @ghc_callee()
+  ret void
+}
+
 declare i32 @__gxx_personality_v0(...)
diff --git a/test/Feature/comdat.ll b/test/Feature/comdat.ll
index 1e878bb..c2a9d63 100644
--- a/test/Feature/comdat.ll
+++ b/test/Feature/comdat.ll
@@ -6,16 +6,16 @@ $f = comdat any
 $f2 = comdat any
 ; CHECK-NOT: f2
 
-@v = global i32 0, comdat $f
-; CHECK: @v = global i32 0, comdat $f
+@v = global i32 0, comdat($f)
+; CHECK: @v = global i32 0, comdat($f)
 
 @a = alias i32* @v
 ; CHECK: @a = alias i32* @v{{$}}
 
-define void @f() comdat $f {
+define void @f() comdat($f) {
   ret void
 }
-; CHECK: define void @f() comdat $f
+; CHECK: define void @f() comdat {
 
 $i = comdat largest
-@i = internal global i32 0, comdat $i
+@i = internal global i32 0, comdat($i)
diff --git a/test/Feature/md_on_instruction.ll b/test/Feature/md_on_instruction.ll
index fe01162..511cc85 100644
--- a/test/Feature/md_on_instruction.ll
+++ b/test/Feature/md_on_instruction.ll
@@ -18,10 +18,10 @@ declare void @llvm.dbg.region.end(metadata) nounwind readnone
 
 !llvm.module.flags = !{!6}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\001\000\001\000\006\000\000\000", i32 0, metadata !1, metadata !2, null, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x11\0012\00clang 1.0\001\00\000\00\000", metadata !4, metadata !5, metadata !5, metadata !4, null, null} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !1} ; [ DW_TAG_base_type ]
-!3 = metadata !{i32 1, i32 13, metadata !1, metadata !1}
-!4 = metadata !{metadata !"foo.c", metadata !"/tmp"}
-!5 = metadata !{i32 0}
-!6 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00foo\00foo\00foo\001\000\001\000\006\000\000\000", i32 0, !1, !2, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x11\0012\00clang 1.0\001\00\000\00\000", !4, !5, !5, !4, null, null} ; [ DW_TAG_compile_unit ]
+!2 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !1} ; [ DW_TAG_base_type ]
+!3 = !MDLocation(line: 1, column: 13, scope: !1, inlinedAt: !1)
+!4 = !{!"foo.c", !"/tmp"}
+!5 = !{i32 0}
+!6 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Feature/metadata.ll b/test/Feature/metadata.ll
index 9856b37..612a79e 100644
--- a/test/Feature/metadata.ll
+++ b/test/Feature/metadata.ll
@@ -2,16 +2,16 @@
 ; PR7105
 
 define void @foo(i32 %x) {
-  call void @llvm.zonk(metadata !1, i64 0, metadata !1)
-  store i32 0, i32* null, !whatever !0, !whatever_else !{}, !more !{metadata !"hello"}
-  store i32 0, i32* null, !whatever !{i32 %x, metadata !"hello", metadata !1, metadata !{}, metadata !2}
-  ret void, !whatever !{i32 %x}
+  call void @llvm.zonk(metadata i32 %x, i64 0, metadata !1)
+  store i32 0, i32* null, !whatever !0, !whatever_else !{}, !more !{!"hello"}
+  store i32 0, i32* null, !whatever !{!"hello", !1, !{}, !2}
+  ret void, !_1 !0
 }
 
 declare void @llvm.zonk(metadata, i64, metadata) nounwind readnone
 
 !named = !{!0}
 !another_named = !{}
-!0 = metadata !{i8** null}
-!1 = metadata !{i8* null, metadata !2}
-!2 = metadata !{}
+!0 = !{i8** null}
+!1 = !{i8* null, !2}
+!2 = !{}
diff --git a/test/Feature/prologuedata.ll b/test/Feature/prologuedata.ll
new file mode 100644
index 0000000..63f424c
--- /dev/null
+++ b/test/Feature/prologuedata.ll
@@ -0,0 +1,18 @@
+; RUN: llvm-as < %s | llvm-dis > %t1.ll
+; RUN: FileCheck %s < %t1.ll
+; RUN: llvm-as < %t1.ll | llvm-dis > %t2.ll
+; RUN: diff %t1.ll %t2.ll
+; RUN: opt -O3 -S < %t1.ll | FileCheck %s
+
+; CHECK: @i
+@i = linkonce_odr global i32 1
+
+; CHECK: f(){{.*}}prologue i32 1
+define void @f() prologue i32 1 {
+  ret void
+}
+
+; CHECK: g(){{.*}}prologue i32* @i
+define void @g() prologue i32* @i {
+  ret void
+}
diff --git a/test/Feature/seh-nounwind.ll b/test/Feature/seh-nounwind.ll
new file mode 100644
index 0000000..2034716
--- /dev/null
+++ b/test/Feature/seh-nounwind.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -O2 < %s | FileCheck %s
+
+; Feature test that verifies that all optimizations leave asynch personality
+; invokes of nounwind functions alone.
+; The @div function in this test can fault, even though it can't
+; throw a synchronous exception.
+
+define i32 @div(i32 %n, i32 %d) nounwind noinline {
+entry:
+  %div = sdiv i32 %n, %d
+  ret i32 %div
+}
+
+define i32 @main() nounwind {
+entry:
+  %call = invoke i32 @div(i32 10, i32 0)
+          to label %__try.cont unwind label %lpad
+
+lpad:
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          catch i8* null
+  br label %__try.cont
+
+__try.cont:
+  %retval.0 = phi i32 [ %call, %entry ], [ 0, %lpad ]
+  ret i32 %retval.0
+}
+
+; CHECK-LABEL: define i32 @main()
+; CHECK: invoke i32 @div(i32 10, i32 0)
+
+declare i32 @__C_specific_handler(...)
diff --git a/test/FileCheck/same.txt b/test/FileCheck/same.txt
new file mode 100644
index 0000000..7160936
--- /dev/null
+++ b/test/FileCheck/same.txt
@@ -0,0 +1,23 @@
+foo bat bar
+baz
+
+RUN: FileCheck --input-file=%s --check-prefix=PASS1 %s
+PASS1: foo
+PASS1-SAME: bat
+PASS1-SAME: bar
+PASS1-NEXT: baz
+
+RUN: FileCheck --input-file=%s --check-prefix=PASS2 %s
+PASS2: foo
+PASS2-NOT: baz
+PASS2-SAME: bar
+PASS2-NEXT: baz
+
+RUN: not FileCheck --input-file=%s --check-prefix=FAIL1 %s
+FAIL1: foo
+FAIL1-SAME: baz
+
+RUN: not FileCheck --input-file=%s --check-prefix=FAIL2 %s
+FAIL2: foo
+FAIL2-NOT: bat
+FAIL2-SAME: bar
diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll b/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll
index 7f5d3b0..4162d9d 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll
@@ -145,8 +145,8 @@ entry:
 attributes #0 = { nounwind uwtable sanitize_address "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind }
 
-!0 = metadata !{i32 98, i32 122, i32 160}
-!1 = metadata !{i32 305, i32 329, i32 367}
-!2 = metadata !{i32 512, i32 537, i32 576}
-!3 = metadata !{i32 721, i32 746, i32 785}
-!4 = metadata !{i32 929, i32 957, i32 999}
+!0 = !{i32 98, i32 122, i32 160}
+!1 = !{i32 305, i32 329, i32 367}
+!2 = !{i32 512, i32 537, i32 576}
+!3 = !{i32 721, i32 746, i32 785}
+!4 = !{i32 929, i32 957, i32 999}
diff --git a/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll b/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll
index 63477aa..3f944c3 100644
--- a/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll
+++ b/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll
@@ -64,10 +64,10 @@ entry:
   ret void
 }
 
-!0 = metadata !{metadata !5, metadata !5, i64 0}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !6, metadata !6, i64 0}
-!4 = metadata !{i32 156132, i32 156164, i32 156205, i32 156238, i32 156282, i32 156332, i32 156370, i32 156408, i32 156447, i32 156486, i32 156536, i32 156574, i32 156612, i32 156651, i32 156690, i32 156740, i32 156778, i32 156816, i32 156855, i32 156894, i32 156944, i32 156982, i32 157020, i32 157059, i32 157098, i32 157148, i32 157186, i32 157224, i32 157263, i32 157302, i32 157352, i32 157390, i32 157428, i32 157467, i32 157506, i32 157556, i32 157594, i32 157632, i32 157671, i32 157710, i32 157760, i32 157798, i32 157836, i32 157875, i32 157914, i32 157952, i32 157996, i32 158046, i32 158099, i32 158140, i32 158179, i32 158218, i32 158268, i32 158321, i32 158362, i32 158401, i32 158440, i32 158490, i32 158543, i32 158584, i32 158623, i32 158662, i32 158712, i32 158765, i32 158806, i32 158845, i32 158884, i32 158922, i32 158963, i32 158996, i32 159029, i32 159062, i32 159109, i32 159154, i32 159199, i32 159243, i32 159286, i32 159329, i32 159375, i32 159422, i32 159478, i32 159522, i32 159566}
-!5 = metadata !{metadata !"any pointer", metadata !1}
-!6 = metadata !{metadata !"int", metadata !1}
+!0 = !{!5, !5, i64 0}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA", null}
+!3 = !{!6, !6, i64 0}
+!4 = !{i32 156132, i32 156164, i32 156205, i32 156238, i32 156282, i32 156332, i32 156370, i32 156408, i32 156447, i32 156486, i32 156536, i32 156574, i32 156612, i32 156651, i32 156690, i32 156740, i32 156778, i32 156816, i32 156855, i32 156894, i32 156944, i32 156982, i32 157020, i32 157059, i32 157098, i32 157148, i32 157186, i32 157224, i32 157263, i32 157302, i32 157352, i32 157390, i32 157428, i32 157467, i32 157506, i32 157556, i32 157594, i32 157632, i32 157671, i32 157710, i32 157760, i32 157798, i32 157836, i32 157875, i32 157914, i32 157952, i32 157996, i32 158046, i32 158099, i32 158140, i32 158179, i32 158218, i32 158268, i32 158321, i32 158362, i32 158401, i32 158440, i32 158490, i32 158543, i32 158584, i32 158623, i32 158662, i32 158712, i32 158765, i32 158806, i32 158845, i32 158884, i32 158922, i32 158963, i32 158996, i32 159029, i32 159062, i32 159109, i32 159154, i32 159199, i32 159243, i32 159286, i32 159329, i32 159375, i32 159422, i32 159478, i32 159522, i32 159566}
+!5 = !{!"any pointer", !1}
+!6 = !{!"int", !1}
diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll
index d9997e2..8020660 100644
--- a/test/Instrumentation/AddressSanitizer/basic.ll
+++ b/test/Instrumentation/AddressSanitizer/basic.ll
@@ -170,4 +170,4 @@ define void @memintr_test(i8* %a, i8* %b) nounwind uwtable sanitize_address {
 ; CHECK: ret void
 
 ; PROF
-; CHECK: ![[PROF]] = metadata !{metadata !"branch_weights", i32 1, i32 100000}
+; CHECK: ![[PROF]] = !{!"branch_weights", i32 1, i32 100000}
diff --git a/test/Instrumentation/AddressSanitizer/debug_info.ll b/test/Instrumentation/AddressSanitizer/debug_info.ll
index ea51551..c0939c5 100644
--- a/test/Instrumentation/AddressSanitizer/debug_info.ll
+++ b/test/Instrumentation/AddressSanitizer/debug_info.ll
@@ -11,8 +11,8 @@ entry:
   %p.addr = alloca i32, align 4
   %r = alloca i32, align 4
   store i32 %p, i32* %p.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %p.addr}, metadata !10, metadata !{metadata !"0x102"}), !dbg !11
-  call void @llvm.dbg.declare(metadata !{i32* %r}, metadata !12, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata i32* %p.addr, metadata !10, metadata !{!"0x102"}), !dbg !11
+  call void @llvm.dbg.declare(metadata i32* %r, metadata !12, metadata !{!"0x102"}), !dbg !14
   %0 = load i32* %p.addr, align 4, !dbg !14
   %add = add nsw i32 %0, 1, !dbg !14
   store i32 %add, i32* %r, align 4, !dbg !14
@@ -33,30 +33,30 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!17}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 169314)\001\00\000\00\000", metadata !16, metadata !1, metadata !1, metadata !3, metadata !1, null} ; [ DW_TAG_compile_unit ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00zzz\00zzz\00_Z3zzzi\001\000\001\000\006\00256\000\001", metadata !16, metadata !6, metadata !7, null, i32 (i32)* @_Z3zzzi, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 1] [def] [zzz]
-!6 = metadata !{metadata !"0x29", metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x101\00p\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [p] [line 1]
-!11 = metadata !{i32 1, i32 0, metadata !5, null}
-!12 = metadata !{metadata !"0x100\00r\002\000", metadata !13, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [r] [line 2]
+!0 = !{!"0x11\004\00clang version 3.3 (trunk 169314)\001\00\000\00\000", !16, !1, !1, !3, !1, null} ; [ DW_TAG_compile_unit ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc] [DW_LANG_C_plus_plus]
+!1 = !{i32 0}
+!3 = !{!5}
+!5 = !{!"0x2e\00zzz\00zzz\00_Z3zzzi\001\000\001\000\006\00256\000\001", !16, !6, !7, null, i32 (i32)* @_Z3zzzi, null, null, !1} ; [ DW_TAG_subprogram ] [line 1] [def] [zzz]
+!6 = !{!"0x29", !16} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x101\00p\0016777217\000", !5, !6, !9} ; [ DW_TAG_arg_variable ] [p] [line 1]
+!11 = !MDLocation(line: 1, scope: !5)
+!12 = !{!"0x100\00r\002\000", !13, !6, !9} ; [ DW_TAG_auto_variable ] [r] [line 2]
 
 ; Verify that debug descriptors for argument and local variable will be replaced
 ; with descriptors that end with OpDeref (encoded as 2).
 ;   CHECK: ![[ARG_ID]] = {{.*}} ; [ DW_TAG_arg_variable ] [p] [line 1]
-;   CHECK: ![[OPDEREF]] = metadata !{metadata !"0x102\006"}
+;   CHECK: ![[OPDEREF]] = !{!"0x102\006"}
 ;   CHECK: ![[VAR_ID]] = {{.*}} ; [ DW_TAG_auto_variable ] [r] [line 2]
 ; Verify that there are no more variable descriptors.
 ;   CHECK-NOT: DW_TAG_arg_variable
 ;   CHECK-NOT: DW_TAG_auto_variable
 
 
-!13 = metadata !{metadata !"0xb\001\000\000", metadata !16, metadata !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc]
-!14 = metadata !{i32 2, i32 0, metadata !13, null}
-!15 = metadata !{i32 3, i32 0, metadata !13, null}
-!16 = metadata !{metadata !"a.cc", metadata !"/usr/local/google/llvm_cmake_clang/tmp/debuginfo"}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!13 = !{!"0xb\001\000\000", !16, !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc]
+!14 = !MDLocation(line: 2, scope: !13)
+!15 = !MDLocation(line: 3, scope: !13)
+!16 = !{!"a.cc", !"/usr/local/google/llvm_cmake_clang/tmp/debuginfo"}
+!17 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Instrumentation/AddressSanitizer/do-not-instrument-cstring.ll b/test/Instrumentation/AddressSanitizer/do-not-instrument-cstring.ll
index de6a4de..f096ac1 100644
--- a/test/Instrumentation/AddressSanitizer/do-not-instrument-cstring.ll
+++ b/test/Instrumentation/AddressSanitizer/do-not-instrument-cstring.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -asan -asan-module -S | FileCheck %s
 
 target datalayout = "e"
+target triple = "x86_64-apple-darwin10.0.0"
 
 @foo = private global [19 x i8] c"scannerWithString:\00", section "__TEXT,__objc_methname,cstring_literals"
 
diff --git a/test/Instrumentation/AddressSanitizer/do-not-touch-comdat-global.ll b/test/Instrumentation/AddressSanitizer/do-not-touch-comdat-global.ll
index 8d14e83..fcc166e 100644
--- a/test/Instrumentation/AddressSanitizer/do-not-touch-comdat-global.ll
+++ b/test/Instrumentation/AddressSanitizer/do-not-touch-comdat-global.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 target triple = "i686-pc-windows-msvc"
 ; no action should be taken for these globals
 $global_noinst = comdat largest
-@aliasee = private unnamed_addr constant [2 x i8] [i8 1, i8 2], comdat $global_noinst
+@aliasee = private unnamed_addr constant [2 x i8] [i8 1, i8 2], comdat($global_noinst)
 @global_noinst = unnamed_addr alias [2 x i8]* @aliasee
 ; CHECK-NOT: {{asan_gen.*global_noinst}}
 ; CHECK-DAG: @global_noinst = unnamed_addr alias [2 x i8]* @aliasee
diff --git a/test/Instrumentation/AddressSanitizer/global_metadata.ll b/test/Instrumentation/AddressSanitizer/global_metadata.ll
index fd5a8c6..3901745 100644
--- a/test/Instrumentation/AddressSanitizer/global_metadata.ll
+++ b/test/Instrumentation/AddressSanitizer/global_metadata.ll
@@ -53,15 +53,15 @@ attributes #1 = { nounwind sanitize_address "less-precise-fpmad"="false" "no-fra
 !llvm.asan.globals = !{!0, !1, !2, !3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32* @global, metadata !6, metadata !"global", i1 false, i1 false}
-!1 = metadata !{i32* @dyn_init_global, metadata !7, metadata !"dyn_init_global", i1 true, i1 false}
-!2 = metadata !{i32* @blacklisted_global, null, null, i1 false, i1 true}
-!3 = metadata !{i32* @_ZZ4funcvE10static_var, metadata !8, metadata !"static_var", i1 false, i1 false}
-!4 = metadata !{[14 x i8]* @.str, metadata !9, metadata !"<string literal>", i1 false, i1 false}
+!0 = !{i32* @global, !6, !"global", i1 false, i1 false}
+!1 = !{i32* @dyn_init_global, !7, !"dyn_init_global", i1 true, i1 false}
+!2 = !{i32* @blacklisted_global, null, null, i1 false, i1 true}
+!3 = !{i32* @_ZZ4funcvE10static_var, !8, !"static_var", i1 false, i1 false}
+!4 = !{[14 x i8]* @.str, !9, !"<string literal>", i1 false, i1 false}
 
-!5 = metadata !{metadata !"clang version 3.5.0 (211282)"}
+!5 = !{!"clang version 3.5.0 (211282)"}
 
-!6 = metadata !{metadata !"/tmp/asan-globals.cpp", i32 5, i32 5}
-!7 = metadata !{metadata !"/tmp/asan-globals.cpp", i32 7, i32 5}
-!8 = metadata !{metadata !"/tmp/asan-globals.cpp", i32 12, i32 14}
-!9 = metadata !{metadata !"/tmp/asan-globals.cpp", i32 14, i32 25}
+!6 = !{!"/tmp/asan-globals.cpp", i32 5, i32 5}
+!7 = !{!"/tmp/asan-globals.cpp", i32 7, i32 5}
+!8 = !{!"/tmp/asan-globals.cpp", i32 12, i32 14}
+!9 = !{!"/tmp/asan-globals.cpp", i32 14, i32 25}
diff --git a/test/Instrumentation/AddressSanitizer/instrument-dynamic-allocas.ll b/test/Instrumentation/AddressSanitizer/instrument-dynamic-allocas.ll
new file mode 100644
index 0000000..25807bb
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/instrument-dynamic-allocas.ll
@@ -0,0 +1,24 @@
+; Test asan internal compiler flags:
+;   -asan-instrument-allocas=1
+
+; RUN: opt < %s -asan -asan-module -asan-instrument-allocas=1 -S | FileCheck %s --check-prefix=CHECK-ALLOCA
+; RUN: opt < %s -asan -asan-module -asan-instrument-allocas=0 -S | FileCheck %s --check-prefix=CHECK-NOALLOCA
+; RUN: opt < %s -asan -asan-module -S | FileCheck %s --check-prefix=CHECK-NOALLOCA
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i32 %len) sanitize_address {
+entry:
+; CHECK-ALLOCA: store i32 -892679478
+; CHECK-ALLOCA: store i32 -875836469
+; CHECK-NOALLOCA-NOT: store i32 -892679478
+; CHECK-NOALLOCA-NOT: store i32 -875836469
+  %0 = alloca i32, align 4
+  %1 = alloca i8*
+  store i32 %len, i32* %0, align 4
+  %2 = load i32* %0, align 4
+  %3 = zext i32 %2 to i64
+  %4 = alloca i8, i64 %3, align 32
+  ret void
+}
+
diff --git a/test/Instrumentation/AddressSanitizer/instrument_global.ll b/test/Instrumentation/AddressSanitizer/instrument_global.ll
index 80791d9..259c815 100644
--- a/test/Instrumentation/AddressSanitizer/instrument_global.ll
+++ b/test/Instrumentation/AddressSanitizer/instrument_global.ll
@@ -69,7 +69,7 @@ entry:
 
 
 !llvm.asan.globals = !{!0}
-!0 = metadata !{[10 x i32]* @GlobDy, null, null, i1 true, i1 false}
+!0 = !{[10 x i32]* @GlobDy, null, null, i1 true, i1 false}
 
 ; CHECK-LABEL: define internal void @asan.module_ctor
 ; CHECK-NOT: ret
diff --git a/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
index c2bb0aa..b89ca44 100644
--- a/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
+++ b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
@@ -7,10 +7,10 @@ target triple = "x86_64-unknown-linux-gnu"
 @YYY = global i32 0, align 4           ; W/o dynamic initializer.
 ; Clang will emit the following metadata identifying @xxx as dynamically
 ; initialized.
-!0 = metadata !{i32* @xxx, null, null, i1 true, i1 false}
-!1 = metadata !{i32* @XXX, null, null, i1 true, i1 false}
-!2 = metadata !{i32* @yyy, null, null, i1 false, i1 false}
-!3 = metadata !{i32* @YYY, null, null, i1 false, i1 false}
+!0 = !{i32* @xxx, null, null, i1 true, i1 false}
+!1 = !{i32* @XXX, null, null, i1 true, i1 false}
+!2 = !{i32* @yyy, null, null, i1 false, i1 false}
+!3 = !{i32* @YYY, null, null, i1 false, i1 false}
 !llvm.asan.globals = !{!0, !1, !2, !3}
 
 define i32 @initializer() uwtable {
diff --git a/test/Instrumentation/AddressSanitizer/keep-instrumented_functions.ll b/test/Instrumentation/AddressSanitizer/keep-instrumented_functions.ll
deleted file mode 100644
index 8726b8e..0000000
--- a/test/Instrumentation/AddressSanitizer/keep-instrumented_functions.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; Test the -asan-keep-uninstrumented-functions flag: FOO should get cloned
-; RUN: opt < %s -asan -asan-module -asan-keep-uninstrumented-functions -S | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@a = global i32 0, align 4
-
-define i32 @main() sanitize_address {
-entry:
-  tail call void @FOO(i32* @a)
-  ret i32 0
-}
-
-define void @FOO(i32* nocapture %x) sanitize_address {
-entry:
-  store i32 1, i32* %x, align 4
-  ret void
-}
-
-; main should not be cloned since it is not being instrumented by asan.
-; CHECK-NOT: NOASAN_main
-; CHECK: define void @FOO{{.*}} section "ASAN"
-; CHECK: define void @NOASAN_FOO{{.*}} section "NOASAN"
diff --git a/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll b/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll
new file mode 100644
index 0000000..43711b7
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -asan -asan-module -asan-stack-dynamic-alloca \
+; RUN:       -asan-use-after-return -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @Func1() sanitize_address {
+entry:
+; CHECK-LABEL: Func1
+
+; CHECK: entry:
+; CHECK: load i32* @__asan_option_detect_stack_use_after_return
+
+; CHECK: <label>:[[UAR_ENABLED_BB:[0-9]+]]
+; CHECK: [[FAKE_STACK_RT:%[0-9]+]] = call i64 @__asan_stack_malloc_
+
+; CHECK: <label>:[[FAKE_STACK_BB:[0-9]+]]
+; CHECK: [[FAKE_STACK:%[0-9]+]] = phi i64 [ 0, %entry ], [ [[FAKE_STACK_RT]], %[[UAR_ENABLED_BB]] ]
+; CHECK: icmp eq i64 [[FAKE_STACK]], 0
+
+; CHECK: <label>:[[NO_FAKE_STACK_BB:[0-9]+]]
+; CHECK: %MyAlloca = alloca i8, i64
+; CHECK: [[ALLOCA:%[0-9]+]] = ptrtoint i8* %MyAlloca
+
+; CHECK: phi i64 [ [[FAKE_STACK]], %[[FAKE_STACK_BB]] ], [ [[ALLOCA]], %[[NO_FAKE_STACK_BB]] ]
+
+; CHECK: ret void
+
+  %XXX = alloca [20 x i8], align 1
+  ret void
+}
+
+; Test that dynamic alloca is not used for functions with inline assembly.
+define void @Func2() sanitize_address {
+entry:
+; CHECK-LABEL: Func2
+; CHECK: alloca [96 x i8]
+; CHECK: ret void
+
+  %XXX = alloca [20 x i8], align 1
+  call void asm sideeffect "mov %%rbx, %%rcx", "~{dirflag},~{fpsr},~{flags}"() nounwind
+  ret void
+}
diff --git a/test/Instrumentation/AddressSanitizer/stack_layout.ll b/test/Instrumentation/AddressSanitizer/stack_layout.ll
index c027acf..97e3bbb 100644
--- a/test/Instrumentation/AddressSanitizer/stack_layout.ll
+++ b/test/Instrumentation/AddressSanitizer/stack_layout.ll
@@ -1,6 +1,9 @@
 ; Test the ASan's stack layout.
 ; More tests in tests/Transforms/Utils/ASanStackFrameLayoutTest.cpp
-; RUN: opt < %s -asan -asan-module -S | FileCheck %s
+; RUN: opt < %s -asan -asan-module -asan-stack-dynamic-alloca=0 -S \
+; RUN:     | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-STATIC
+; RUN: opt < %s -asan -asan-module -asan-stack-dynamic-alloca=1 -S \
+; RUN:     | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DYNAMIC
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -14,7 +17,10 @@ declare void @Use(i8*)
 define void @Func1() sanitize_address {
 entry:
 ; CHECK-LABEL: Func1
-; CHECK: alloca [192 x i8]
+
+; CHECK-STATIC: alloca [192 x i8]
+; CHECK-DYNAMIC: alloca i8, i64 192
+
 ; CHECK-NOT: alloca
 ; CHECK: ret void
   %XXX = alloca [10 x i8], align 1
@@ -26,7 +32,10 @@ entry:
 define void @Func2() sanitize_address {
 entry:
 ; CHECK-LABEL: Func2
-; CHECK: alloca [864 x i8]
+
+; CHECK-STATIC: alloca [864 x i8]
+; CHECK-DYNAMIC: alloca i8, i64 864
+
 ; CHECK-NOT: alloca
 ; CHECK: ret void
   %AAA = alloca [5 x i8], align 1
@@ -39,7 +48,10 @@ entry:
 define void @Func3() sanitize_address {
 entry:
 ; CHECK-LABEL: Func3
-; CHECK: alloca [768 x i8]
+
+; CHECK-STATIC: alloca [768 x i8]
+; CHECK-DYNAMIC: alloca i8, i64 768
+
 ; CHECK-NOT: alloca
 ; CHECK: ret void
   %AAA = alloca [128 x i8], align 16
diff --git a/test/Instrumentation/AddressSanitizer/ubsan.ll b/test/Instrumentation/AddressSanitizer/ubsan.ll
index 22e4172..5535efe 100644
--- a/test/Instrumentation/AddressSanitizer/ubsan.ll
+++ b/test/Instrumentation/AddressSanitizer/ubsan.ll
@@ -49,4 +49,4 @@ cont:                                             ; preds = %handler.dynamic_typ
   ret void
 }
 
-!0 = metadata !{}
+!0 = !{}
diff --git a/test/Instrumentation/AddressSanitizer/undecidable-dynamic-alloca-1.ll b/test/Instrumentation/AddressSanitizer/undecidable-dynamic-alloca-1.ll
new file mode 100644
index 0000000..c67fb50
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/undecidable-dynamic-alloca-1.ll
@@ -0,0 +1,23 @@
+; Test that undecidable dynamic allocas are skipped by ASan.
+
+; RUN: opt < %s -asan -asan-module -asan-instrument-allocas=1 -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @g(i64 %n) sanitize_address {
+entry:
+  %cmp = icmp sgt i64 %n, 100
+  br i1 %cmp, label %do_alloca, label %done
+
+do_alloca:
+; CHECK-NOT: store i32 -892679478
+  %0 = alloca i8, i64 %n, align 1
+  call void @f(i8* %0)
+  br label %done
+
+done:
+  ret void
+}
+
+declare void @f(i8*)
+
diff --git a/test/Instrumentation/DataFlowSanitizer/abilist.ll b/test/Instrumentation/DataFlowSanitizer/abilist.ll
index ebf55d9..9a45dbc 100644
--- a/test/Instrumentation/DataFlowSanitizer/abilist.ll
+++ b/test/Instrumentation/DataFlowSanitizer/abilist.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 ; CHECK: i32 @discard(i32 %a, i32 %b)
 define i32 @discard(i32 %a, i32 %b) {
diff --git a/test/Instrumentation/DataFlowSanitizer/args-unreachable-bb.ll b/test/Instrumentation/DataFlowSanitizer/args-unreachable-bb.ll
index a699f75..1133462 100644
--- a/test/Instrumentation/DataFlowSanitizer/args-unreachable-bb.ll
+++ b/test/Instrumentation/DataFlowSanitizer/args-unreachable-bb.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -verify -dfsan-args-abi -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 ; CHECK-LABEL: @"dfs$unreachable_bb1"
 define i8 @unreachable_bb1() {
diff --git a/test/Instrumentation/DataFlowSanitizer/arith.ll b/test/Instrumentation/DataFlowSanitizer/arith.ll
index dc61896..db33e45 100644
--- a/test/Instrumentation/DataFlowSanitizer/arith.ll
+++ b/test/Instrumentation/DataFlowSanitizer/arith.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 define i8 @add(i8 %a, i8 %b) {
   ; CHECK: @"dfs$add"
diff --git a/test/Instrumentation/DataFlowSanitizer/call.ll b/test/Instrumentation/DataFlowSanitizer/call.ll
index 813f4c1..dadb40f 100644
--- a/test/Instrumentation/DataFlowSanitizer/call.ll
+++ b/test/Instrumentation/DataFlowSanitizer/call.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 ; CHECK: @__dfsan_arg_tls = external thread_local(initialexec) global [64 x i16]
 ; CHECK: @__dfsan_retval_tls = external thread_local(initialexec) global i16
diff --git a/test/Instrumentation/DataFlowSanitizer/debug-nonzero-labels.ll b/test/Instrumentation/DataFlowSanitizer/debug-nonzero-labels.ll
index eb28c2c..16de9cc 100644
--- a/test/Instrumentation/DataFlowSanitizer/debug-nonzero-labels.ll
+++ b/test/Instrumentation/DataFlowSanitizer/debug-nonzero-labels.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -dfsan-args-abi -dfsan-debug-nonzero-labels -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 declare i32 @g()
 
diff --git a/test/Instrumentation/DataFlowSanitizer/debug.ll b/test/Instrumentation/DataFlowSanitizer/debug.ll
index cfc9dd9..837e953 100644
--- a/test/Instrumentation/DataFlowSanitizer/debug.ll
+++ b/test/Instrumentation/DataFlowSanitizer/debug.ll
@@ -21,16 +21,16 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/debug.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"debug.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\000\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}    ; [ DW_TAG_file_type ] [/tmp/dbginfo/debug.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.6.0 "}
-!12 = metadata !{i32 2, i32 1, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.6.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/debug.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"debug.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\001\000\001\000\000\00256\000\001", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
+!5 = !{!"0x29", !1}    ; [ DW_TAG_file_type ] [/tmp/dbginfo/debug.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.6.0 "}
+!12 = !MDLocation(line: 2, column: 1, scope: !4)
diff --git a/test/Instrumentation/DataFlowSanitizer/load.ll b/test/Instrumentation/DataFlowSanitizer/load.ll
index 8324224..4d36c09 100644
--- a/test/Instrumentation/DataFlowSanitizer/load.ll
+++ b/test/Instrumentation/DataFlowSanitizer/load.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -dfsan -dfsan-combine-pointer-labels-on-load=1 -S | FileCheck %s --check-prefix=COMBINE_PTR_LABEL
 ; RUN: opt < %s -dfsan -dfsan-combine-pointer-labels-on-load=0 -S | FileCheck %s --check-prefix=NO_COMBINE_PTR_LABEL
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 define {} @load0({}* %p) {
   ; COMBINE_PTR_LABEL: @"dfs$load0"
diff --git a/test/Instrumentation/DataFlowSanitizer/memset.ll b/test/Instrumentation/DataFlowSanitizer/memset.ll
index 062ef1a..7b3cb68 100644
--- a/test/Instrumentation/DataFlowSanitizer/memset.ll
+++ b/test/Instrumentation/DataFlowSanitizer/memset.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
 
diff --git a/test/Instrumentation/DataFlowSanitizer/prefix-rename.ll b/test/Instrumentation/DataFlowSanitizer/prefix-rename.ll
index f3c36b1..b14de5f 100644
--- a/test/Instrumentation/DataFlowSanitizer/prefix-rename.ll
+++ b/test/Instrumentation/DataFlowSanitizer/prefix-rename.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s
 ; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 ; CHECK: module asm ".symver dfs$f1,dfs$f@@version1"
 module asm ".symver f1,f@@version1"
diff --git a/test/Instrumentation/DataFlowSanitizer/store.ll b/test/Instrumentation/DataFlowSanitizer/store.ll
index d14bdb6..365b62d 100644
--- a/test/Instrumentation/DataFlowSanitizer/store.ll
+++ b/test/Instrumentation/DataFlowSanitizer/store.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -dfsan -dfsan-combine-pointer-labels-on-store=1 -S | FileCheck %s --check-prefix=COMBINE_PTR_LABEL
 ; RUN: opt < %s -dfsan -dfsan-combine-pointer-labels-on-store=0 -S | FileCheck %s --check-prefix=NO_COMBINE_PTR_LABEL
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 define void @store0({} %v, {}* %p) {
   ; COMBINE_PTR_LABEL: @"dfs$store0"
diff --git a/test/Instrumentation/DataFlowSanitizer/union-large.ll b/test/Instrumentation/DataFlowSanitizer/union-large.ll
index a388f73..0408239 100644
--- a/test/Instrumentation/DataFlowSanitizer/union-large.ll
+++ b/test/Instrumentation/DataFlowSanitizer/union-large.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 ; Check that we use dfsan_union in large functions instead of __dfsan_union.
 
diff --git a/test/Instrumentation/DataFlowSanitizer/union.ll b/test/Instrumentation/DataFlowSanitizer/union.ll
index 2b31081..7fcba2c 100644
--- a/test/Instrumentation/DataFlowSanitizer/union.ll
+++ b/test/Instrumentation/DataFlowSanitizer/union.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 @a = common global i32 0
 @b = common global i32 0
diff --git a/test/Instrumentation/InstrProfiling/linkage.ll b/test/Instrumentation/InstrProfiling/linkage.ll
new file mode 100644
index 0000000..0a92d5d
--- /dev/null
+++ b/test/Instrumentation/InstrProfiling/linkage.ll
@@ -0,0 +1,46 @@
+;; Check that runtime symbols get appropriate linkage.
+
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.10.0 -instrprof -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -instrprof -S | FileCheck %s
+
+@__llvm_profile_name_foo = hidden constant [3 x i8] c"foo"
+@__llvm_profile_name_foo_weak = weak hidden constant [8 x i8] c"foo_weak"
+@"__llvm_profile_name_linkage.ll:foo_internal" = internal constant [23 x i8] c"linkage.ll:foo_internal"
+@__llvm_profile_name_foo_inline = linkonce_odr hidden constant [10 x i8] c"foo_inline"
+
+; CHECK: @__llvm_profile_counters_foo = hidden global
+; CHECK: @__llvm_profile_data_foo = hidden constant
+define void @foo() {
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([3 x i8]* @__llvm_profile_name_foo, i32 0, i32 0), i64 0, i32 1, i32 0)
+  ret void
+}
+
+; CHECK: @__llvm_profile_counters_foo_weak = weak hidden global
+; CHECK: @__llvm_profile_data_foo_weak = weak hidden constant
+define weak void @foo_weak() {
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([8 x i8]* @__llvm_profile_name_foo_weak, i32 0, i32 0), i64 0, i32 1, i32 0)
+  ret void
+}
+
+; CHECK: @"__llvm_profile_counters_linkage.ll:foo_internal" = internal global
+; CHECK: @"__llvm_profile_data_linkage.ll:foo_internal" = internal constant
+define internal void @foo_internal() {
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([23 x i8]* @"__llvm_profile_name_linkage.ll:foo_internal", i32 0, i32 0), i64 0, i32 1, i32 0)
+  ret void
+}
+
+; CHECK: @__llvm_profile_counters_foo_inline = linkonce_odr hidden global
+; CHECK: @__llvm_profile_data_foo_inline = linkonce_odr hidden constant
+define linkonce_odr void @foo_inline() {
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([10 x i8]* @__llvm_profile_name_foo_inline, i32 0, i32 0), i64 0, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
+
+; CHECK: @__llvm_profile_runtime = external global i32
+
+; CHECK: define linkonce_odr hidden i32 @__llvm_profile_runtime_user() {{.*}} {
+; CHECK:   %[[REG:.*]] = load i32* @__llvm_profile_runtime
+; CHECK:   ret i32 %[[REG]]
+; CHECK: }
diff --git a/test/Instrumentation/InstrProfiling/no-counters.ll b/test/Instrumentation/InstrProfiling/no-counters.ll
new file mode 100644
index 0000000..0716b0d
--- /dev/null
+++ b/test/Instrumentation/InstrProfiling/no-counters.ll
@@ -0,0 +1,10 @@
+;; No instrumentation should be emitted if there are no counter increments.
+
+; RUN: opt < %s -instrprof -S | FileCheck %s
+; CHECK-NOT: @__llvm_profile_counters
+; CHECK-NOT: @__llvm_profile_data
+; CHECK-NOT: @__llvm_profile_runtime
+
+define void @foo() {
+  ret void
+}
diff --git a/test/Instrumentation/InstrProfiling/noruntime.ll b/test/Instrumentation/InstrProfiling/noruntime.ll
new file mode 100644
index 0000000..e69445d
--- /dev/null
+++ b/test/Instrumentation/InstrProfiling/noruntime.ll
@@ -0,0 +1,16 @@
+;; Check that we don't emit the runtime hooks if the user provided them.
+
+; RUN: opt < %s -instrprof -S | FileCheck %s
+; CHECK-NOT: define {{.*}} @__llvm_profile_runtime_user()
+; CHECK-NOT: load i32* @__llvm_profile_runtime
+
+@__llvm_profile_runtime = global i32 0, align 4
+
+@__llvm_profile_name_foo = hidden constant [3 x i8] c"foo"
+
+define void @foo() {
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([3 x i8]* @__llvm_profile_name_foo, i32 0, i32 0), i64 0, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
diff --git a/test/Instrumentation/InstrProfiling/platform.ll b/test/Instrumentation/InstrProfiling/platform.ll
new file mode 100644
index 0000000..e032768
--- /dev/null
+++ b/test/Instrumentation/InstrProfiling/platform.ll
@@ -0,0 +1,29 @@
+;; Checks for platform specific section names and initialization code.
+
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.10.0 -instrprof -S | FileCheck %s -check-prefix=MACHO
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -instrprof -S | FileCheck %s -check-prefix=ELF
+
+@__llvm_profile_name_foo = hidden constant [3 x i8] c"foo"
+; MACHO: @__llvm_profile_name_foo = hidden constant [3 x i8] c"foo", section "__DATA,__llvm_prf_names", align 1
+; ELF: @__llvm_profile_name_foo = hidden constant [3 x i8] c"foo", section "__llvm_prf_names", align 1
+
+; MACHO: @__llvm_profile_counters_foo = hidden global [1 x i64] zeroinitializer, section "__DATA,__llvm_prf_cnts", align 8
+; ELF: @__llvm_profile_counters_foo = hidden global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", align 8
+
+; MACHO: @__llvm_profile_data_foo = hidden constant {{.*}}, section "__DATA,__llvm_prf_data", align 8
+; ELF: @__llvm_profile_data_foo = hidden constant {{.*}}, section "__llvm_prf_data", align 8
+define void @foo() {
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([3 x i8]* @__llvm_profile_name_foo, i32 0, i32 0), i64 0, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
+
+;; Emit registration functions for platforms that don't find the
+;; symbols by their sections.
+
+; MACHO-NOT: define internal void @__llvm_profile_register_functions
+; ELF: define internal void @__llvm_profile_register_functions
+
+; MACHO-NOT: define internal void @__llvm_profile_init
+; ELF: define internal void @__llvm_profile_init
diff --git a/test/Instrumentation/InstrProfiling/profiling.ll b/test/Instrumentation/InstrProfiling/profiling.ll
new file mode 100644
index 0000000..246bf6b
--- /dev/null
+++ b/test/Instrumentation/InstrProfiling/profiling.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s -instrprof -S | FileCheck %s
+
+target triple = "x86_64-apple-macosx10.10.0"
+
+@__llvm_profile_name_foo = hidden constant [3 x i8] c"foo"
+; CHECK: @__llvm_profile_name_foo = hidden constant [3 x i8] c"foo", section "__DATA,__llvm_prf_names", align 1
+@__llvm_profile_name_bar = hidden constant [4 x i8] c"bar\00"
+; CHECK: @__llvm_profile_name_bar = hidden constant [4 x i8] c"bar\00", section "__DATA,__llvm_prf_names", align 1
+@baz_prof_name = hidden constant [3 x i8] c"baz"
+; CHECK: @baz_prof_name = hidden constant [3 x i8] c"baz", section "__DATA,__llvm_prf_names", align 1
+
+; CHECK: @__llvm_profile_counters_foo = hidden global [1 x i64] zeroinitializer, section "__DATA,__llvm_prf_cnts", align 8
+; CHECK: @__llvm_profile_data_foo = hidden constant {{.*}}, section "__DATA,__llvm_prf_data", align 8
+define void @foo() {
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([3 x i8]* @__llvm_profile_name_foo, i32 0, i32 0), i64 0, i32 1, i32 0)
+  ret void
+}
+
+; CHECK: @__llvm_profile_counters_bar = hidden global [1 x i64] zeroinitializer, section "__DATA,__llvm_prf_cnts", align 8
+; CHECK: @__llvm_profile_data_bar = hidden constant {{.*}}, section "__DATA,__llvm_prf_data", align 8
+define void @bar() {
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([4 x i8]* @__llvm_profile_name_bar, i32 0, i32 0), i64 0, i32 1, i32 0)
+  ret void
+}
+
+; CHECK: @__llvm_profile_counters_baz = hidden global [3 x i64] zeroinitializer, section "__DATA,__llvm_prf_cnts", align 8
+; CHECK: @__llvm_profile_data_baz = hidden constant {{.*}}, section "__DATA,__llvm_prf_data", align 8
+define void @baz() {
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([3 x i8]* @baz_prof_name, i32 0, i32 0), i64 0, i32 3, i32 0)
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([3 x i8]* @baz_prof_name, i32 0, i32 0), i64 0, i32 3, i32 1)
+  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([3 x i8]* @baz_prof_name, i32 0, i32 0), i64 0, i32 3, i32 2)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
+
+; CHECK: @__llvm_profile_runtime = external global i32
+; CHECK: @llvm.used = appending global {{.*}} @__llvm_profile_data_foo {{.*}} @__llvm_profile_data_bar {{.*}} @__llvm_profile_data_baz {{.*}} section "llvm.metadata"
diff --git a/test/Instrumentation/MemorySanitizer/atomics.ll b/test/Instrumentation/MemorySanitizer/atomics.ll
index c8f3b88..28736ad 100644
--- a/test/Instrumentation/MemorySanitizer/atomics.ll
+++ b/test/Instrumentation/MemorySanitizer/atomics.ll
@@ -1,4 +1,6 @@
 ; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-track-origins=1 -S | FileCheck %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-track-origins=2 -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Instrumentation/MemorySanitizer/check-constant-shadow.ll b/test/Instrumentation/MemorySanitizer/check-constant-shadow.ll
index 11e4410..f147944 100644
--- a/test/Instrumentation/MemorySanitizer/check-constant-shadow.ll
+++ b/test/Instrumentation/MemorySanitizer/check-constant-shadow.ll
@@ -1,10 +1,11 @@
-; RUN: opt < %s -msan -msan-check-constant-shadow=1 -S | FileCheck %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-check-constant-shadow=1 -msan-track-origins=1 -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Test that returning a literal undef from main() triggers an MSan warning.
 
+; main() is special: it inserts check for the return value
 define i32 @main() nounwind uwtable sanitize_memory {
 entry:
   ret i32 undef
@@ -13,3 +14,40 @@ entry:
 ; CHECK-LABEL: @main
 ; CHECK: call void @__msan_warning_noreturn
 ; CHECK: ret i32 undef
+
+
+; This function stores known initialized value.
+; Expect 2 stores: one for the shadow (0), one for the value (42), but no origin.
+define void @StoreConstant(i32* nocapture %p) nounwind uwtable sanitize_memory {
+entry:
+  store i32 42, i32* %p, align 4
+  ret void
+}
+
+; CHECK-LABEL: @StoreConstant
+; CHECK-NOT: store i32
+; CHECK: store i32 0,
+; CHECK-NOT: store i32
+; CHECK: store i32 42,
+; CHECK-NOT: store i32
+; CHECK: ret void
+
+
+; This function stores known uninitialized value.
+; Expect 3 stores: shadow, value and origin.
+; Expect no icmp(s): everything here is unconditional.
+define void @StoreUndef(i32* nocapture %p) nounwind uwtable sanitize_memory {
+entry:
+  store i32 undef, i32* %p, align 4
+  ret void
+}
+
+; CHECK-LABEL: @StoreUndef
+; CHECK-NOT: icmp
+; CHECK: store i32
+; CHECK-NOT: icmp
+; CHECK: store i32
+; CHECK-NOT: icmp
+; CHECK: store i32
+; CHECK-NOT: icmp
+; CHECK: ret void
diff --git a/test/Instrumentation/MemorySanitizer/do-not-emit-module-limits.ll b/test/Instrumentation/MemorySanitizer/do-not-emit-module-limits.ll
deleted file mode 100644
index 7d0a62a..0000000
--- a/test/Instrumentation/MemorySanitizer/do-not-emit-module-limits.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; Test that MSan does not emit undefined symbol __executable_start when it is
-; not needed (i.e. without -msan-wrap-indirect-calls).
-
-; RUN: opt < %s -msan -S | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: nounwind uwtable
-define void @_Z1fv() #0 {
-entry:
-  ret void
-}
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.ident = !{!0}
-
-!0 = metadata !{metadata !"clang version 3.5.0 (208165)"}
-
-; CHECK-NOT: __executable_start
diff --git a/test/Instrumentation/MemorySanitizer/missing_origin.ll b/test/Instrumentation/MemorySanitizer/missing_origin.ll
index 673e853..f7385b9 100644
--- a/test/Instrumentation/MemorySanitizer/missing_origin.ll
+++ b/test/Instrumentation/MemorySanitizer/missing_origin.ll
@@ -17,3 +17,17 @@ entry:
 ; CHECK: [[A:%.*]] = load i32* {{.*}}@__msan_param_origin_tls,
 ; CHECK: store i32 [[A]], i32* @__msan_retval_origin_tls
 ; CHECK: ret <4 x i32>
+
+
+; Regression test for origin propagation in "select i1, float, float".
+; https://code.google.com/p/memory-sanitizer/issues/detail?id=78
+
+define float @SelectFloat(i1 %b, float %x, float %y) nounwind uwtable sanitize_memory {
+entry:
+  %z = select i1 %b, float %x, float %y
+  ret float %z
+}
+
+; CHECK-LABEL: @SelectFloat(
+; CHECK-NOT: select {{.*}} i32 0, i32 0
+; CHECK: ret float
diff --git a/test/Instrumentation/MemorySanitizer/origin-alignment.ll b/test/Instrumentation/MemorySanitizer/origin-alignment.ll
new file mode 100644
index 0000000..ce0dbfc
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/origin-alignment.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-track-origins=1 -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS1 %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-track-origins=2 -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS2 %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+
+; Check origin instrumentation of stores.
+; Check that debug info for origin propagation code is set correctly.
+
+@a8 = global i8 0, align 8
+@a4 = global i8 0, align 4
+@a2 = global i8 0, align 2
+@a1 = global i8 0, align 1
+
+; 8-aligned store => 8-aligned origin store, origin address is not realigned
+define void @Store8(i8 %x) sanitize_memory {
+entry:
+  store i8 %x, i8* @a8, align 8
+  ret void
+}
+
+; CHECK-LABEL: @Store8
+; CHECK-ORIGINS1: [[ORIGIN:%[01-9a-z]+]] = load {{.*}} @__msan_param_origin_tls
+; CHECK-ORIGINS2: [[ORIGIN0:%[01-9a-z]+]] = load {{.*}} @__msan_param_origin_tls
+; CHECK-ORIGINS2: [[ORIGIN:%[01-9a-z]+]] = call i32 @__msan_chain_origin(i32 [[ORIGIN0]])
+; CHECK: store i32 [[ORIGIN]],  i32* inttoptr (i64 add (i64 and (i64 ptrtoint {{.*}} to i32*), align 8
+; CHECK: ret void
+
+
+; 4-aligned store => 4-aligned origin store, origin address is not realigned
+define void @Store4(i8 %x) sanitize_memory {
+entry:
+  store i8 %x, i8* @a4, align 4
+  ret void
+}
+
+; CHECK-LABEL: @Store4
+; CHECK-ORIGINS1: [[ORIGIN:%[01-9a-z]+]] = load {{.*}} @__msan_param_origin_tls
+; CHECK-ORIGINS2: [[ORIGIN0:%[01-9a-z]+]] = load {{.*}} @__msan_param_origin_tls
+; CHECK-ORIGINS2: [[ORIGIN:%[01-9a-z]+]] = call i32 @__msan_chain_origin(i32 [[ORIGIN0]])
+; CHECK: store i32 [[ORIGIN]],  i32* inttoptr (i64 add (i64 and (i64 ptrtoint {{.*}} to i32*), align 4
+; CHECK: ret void
+
+
+; 2-aligned store => 4-aligned origin store, origin address is realigned
+define void @Store2(i8 %x) sanitize_memory {
+entry:
+  store i8 %x, i8* @a2, align 2
+  ret void
+}
+
+; CHECK-LABEL: @Store2
+; CHECK-ORIGINS1: [[ORIGIN:%[01-9a-z]+]] = load {{.*}} @__msan_param_origin_tls
+; CHECK-ORIGINS2: [[ORIGIN0:%[01-9a-z]+]] = load {{.*}} @__msan_param_origin_tls
+; CHECK-ORIGINS2: [[ORIGIN:%[01-9a-z]+]] = call i32 @__msan_chain_origin(i32 [[ORIGIN0]])
+; CHECK: store i32 [[ORIGIN]],  i32* inttoptr (i64 and (i64 add (i64 and (i64 ptrtoint {{.*}} i64 -4) to i32*), align 4
+; CHECK: ret void
+
+
+; 1-aligned store => 4-aligned origin store, origin address is realigned
+define void @Store1(i8 %x) sanitize_memory {
+entry:
+  store i8 %x, i8* @a1, align 1
+  ret void
+}
+
+; CHECK-LABEL: @Store1
+; CHECK-ORIGINS1: [[ORIGIN:%[01-9a-z]+]] = load {{.*}} @__msan_param_origin_tls
+; CHECK-ORIGINS2: [[ORIGIN0:%[01-9a-z]+]] = load {{.*}} @__msan_param_origin_tls
+; CHECK-ORIGINS2: [[ORIGIN:%[01-9a-z]+]] = call i32 @__msan_chain_origin(i32 [[ORIGIN0]])
+; CHECK: store i32 [[ORIGIN]],  i32* inttoptr (i64 and (i64 add (i64 and (i64 ptrtoint {{.*}} i64 -4) to i32*), align 4
+; CHECK: ret void
diff --git a/test/Instrumentation/MemorySanitizer/store-long-origin.ll b/test/Instrumentation/MemorySanitizer/store-long-origin.ll
new file mode 100644
index 0000000..128f810
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/store-long-origin.ll
@@ -0,0 +1,89 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-track-origins=1 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+
+; Test origin for longer stores.
+
+define void @Store8(i64* nocapture %p, i64 %x) sanitize_memory {
+entry:
+  store i64 %x, i64* %p, align 8
+  ret void
+}
+
+; Single 8-byte origin store
+; CHECK-LABEL: define void @Store8(
+; CHECK: store i64 {{.*}}, align 8
+; CHECK: store i64 {{.*}}, align 8
+; CHECK: store i64 {{.*}}, align 8
+; CHECK: ret void
+
+define void @Store8_align4(i64* nocapture %p, i64 %x) sanitize_memory {
+entry:
+  store i64 %x, i64* %p, align 4
+  ret void
+}
+
+; Two 4-byte origin stores
+; CHECK-LABEL: define void @Store8_align4(
+; CHECK: store i64 {{.*}}, align 4
+; CHECK: store i32 {{.*}}, align 4
+; CHECK: getelementptr i32* {{.*}}, i32 1
+; CHECK: store i32 {{.*}}, align 4
+; CHECK: store i64 {{.*}}, align 4
+; CHECK: ret void
+
+%struct.S = type { i32, i32, i32 }
+
+define void @StoreAgg(%struct.S* nocapture %p, %struct.S %x) sanitize_memory {
+entry:
+  store %struct.S %x, %struct.S* %p, align 4
+  ret void
+}
+
+; Three 4-byte origin stores
+; CHECK-LABEL: define void @StoreAgg(
+; CHECK: store { i32, i32, i32 }  {{.*}}, align 4
+; CHECK: store i32 {{.*}}, align 4
+; CHECK: getelementptr i32* {{.*}}, i32 1
+; CHECK: store i32 {{.*}}, align 4
+; CHECK: getelementptr i32* {{.*}}, i32 2
+; CHECK: store i32 {{.*}}, align 4
+; CHECK: store %struct.S {{.*}}, align 4
+; CHECK: ret void
+
+
+define void @StoreAgg8(%struct.S* nocapture %p, %struct.S %x) sanitize_memory {
+entry:
+  store %struct.S %x, %struct.S* %p, align 8
+  ret void
+}
+
+; 8-byte + 4-byte origin stores
+; CHECK-LABEL: define void @StoreAgg8(
+; CHECK: store { i32, i32, i32 }  {{.*}}, align 8
+; CHECK: store i64 {{.*}}, align 8
+; CHECK: getelementptr i32* {{.*}}, i32 2
+; CHECK: store i32 {{.*}}, align 8
+; CHECK: store %struct.S {{.*}}, align 8
+; CHECK: ret void
+
+
+%struct.Q = type { i64, i64, i64 }
+define void @StoreAgg24(%struct.Q* nocapture %p, %struct.Q %x) sanitize_memory {
+entry:
+  store %struct.Q %x, %struct.Q* %p, align 8
+  ret void
+}
+
+; 3 8-byte origin stores
+; CHECK-LABEL: define void @StoreAgg24(
+; CHECK: store { i64, i64, i64 }  {{.*}}, align 8
+; CHECK: store i64 {{.*}}, align 8
+; CHECK: getelementptr i64* {{.*}}, i32 1
+; CHECK: store i64 {{.*}}, align 8
+; CHECK: getelementptr i64* {{.*}}, i32 2
+; CHECK: store i64 {{.*}}, align 8
+; CHECK: store %struct.Q {{.*}}, align 8
+; CHECK: ret void
diff --git a/test/Instrumentation/MemorySanitizer/store-origin.ll b/test/Instrumentation/MemorySanitizer/store-origin.ll
index bde4e90..c2948b1 100644
--- a/test/Instrumentation/MemorySanitizer/store-origin.ll
+++ b/test/Instrumentation/MemorySanitizer/store-origin.ll
@@ -11,8 +11,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; Function Attrs: nounwind
 define void @Store(i32* nocapture %p, i32 %x) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32* %p}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !16
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !16
+  tail call void @llvm.dbg.value(metadata i32* %p, i64 0, metadata !11, metadata !{!"0x102"}), !dbg !16
+  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !12, metadata !{!"0x102"}), !dbg !16
   store i32 %x, i32* %p, align 4, !dbg !17, !tbaa !18
   ret void, !dbg !22
 }
@@ -27,29 +27,29 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!13, !14}
 !llvm.ident = !{!15}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (204220)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/build0/../2.cc] [DW_LANG_C99]
-!1 = metadata !{metadata !"../2.cc", metadata !"/tmp/build0"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00Store\00Store\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*, i32)* @Store, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 1] [def] [Store]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/build0/../2.cc]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8, metadata !9}
-!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !11, metadata !12}
-!11 = metadata !{metadata !"0x101\00p\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [p] [line 1]
-!12 = metadata !{metadata !"0x101\00x\0033554433\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [x] [line 1]
-!13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!15 = metadata !{metadata !"clang version 3.5.0 (204220)"}
-!16 = metadata !{i32 1, i32 0, metadata !4, null}
-!17 = metadata !{i32 2, i32 0, metadata !4, null}
-!18 = metadata !{metadata !19, metadata !19, i64 0}
-!19 = metadata !{metadata !"int", metadata !20, i64 0}
-!20 = metadata !{metadata !"omnipotent char", metadata !21, i64 0}
-!21 = metadata !{metadata !"Simple C/C++ TBAA"}
-!22 = metadata !{i32 3, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 (204220)\001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/build0/../2.cc] [DW_LANG_C99]
+!1 = !{!"../2.cc", !"/tmp/build0"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00Store\00Store\00\001\000\001\000\006\00256\001\001", !1, !5, !6, null, void (i32*, i32)* @Store, null, null, !10} ; [ DW_TAG_subprogram ] [line 1] [def] [Store]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/build0/../2.cc]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8, !9}
+!8 = !{!"0xf\00\000\0064\0064\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!11, !12}
+!11 = !{!"0x101\00p\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [p] [line 1]
+!12 = !{!"0x101\00x\0033554433\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [x] [line 1]
+!13 = !{i32 2, !"Dwarf Version", i32 4}
+!14 = !{i32 1, !"Debug Info Version", i32 2}
+!15 = !{!"clang version 3.5.0 (204220)"}
+!16 = !MDLocation(line: 1, scope: !4)
+!17 = !MDLocation(line: 2, scope: !4)
+!18 = !{!19, !19, i64 0}
+!19 = !{!"int", !20, i64 0}
+!20 = !{!"omnipotent char", !21, i64 0}
+!21 = !{!"Simple C/C++ TBAA"}
+!22 = !MDLocation(line: 3, scope: !4)
 
 
 ; CHECK: @Store
diff --git a/test/Instrumentation/MemorySanitizer/wrap_indirect_calls.ll b/test/Instrumentation/MemorySanitizer/wrap_indirect_calls.ll
deleted file mode 100644
index 65037cb..0000000
--- a/test/Instrumentation/MemorySanitizer/wrap_indirect_calls.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: opt < %s -msan -msan-check-access-address=0 -msan-wrap-indirect-calls=zzz -msan-wrap-indirect-calls-fast=0 -S | FileCheck %s
-; RUN: opt < %s -msan -msan-check-access-address=0 -msan-wrap-indirect-calls=zzz -msan-wrap-indirect-calls-fast=1 -S | FileCheck -check-prefix=CHECK-FAST %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Test for -msan-wrap-indirect-calls functionality.
-; Replaces indirect call to %f with a call to whatever is returned from the
-; wrapper function.
-
-; This does not depend on the sanitize_memory attribute.
-define i32 @func1(i32 (i32, i32)* nocapture %f, i32 %x, i32 %y) {
-entry:
-  %call = tail call i32 %f(i32 %x, i32 %y)
-  ret i32 %call
-}
-
-; CHECK: @func1
-; CHECK: bitcast i32 (i32, i32)* %f to void ()*
-; CHECK: call void ()* (void ()*)* @zzz(void ()*
-; CHECK: [[A:%[01-9a-z_.]+]] = bitcast void ()* {{.*}} to i32 (i32, i32)*
-; CHECK: call i32 {{.*}}[[A]](i32 {{.*}}, i32 {{.*}})
-; CHECK: ret i32
-
-; CHECK-FAST: @func1
-; CHECK-FAST: bitcast i32 (i32, i32)* %f to void ()*
-; CHECK-FAST-DAG: icmp ult void ()* {{.*}}, bitcast (i32* @__executable_start to void ()*)
-; CHECK-FAST-DAG: icmp uge void ()* {{.*}}, bitcast (i32* @_end to void ()*)
-; CHECK-FAST: or i1
-; CHECK-FAST: br i1
-; CHECK-FAST: call void ()* (void ()*)* @zzz(void ()*
-; CHECK-FAST: br label
-; CHECK-FAST: [[A:%[01-9a-z_.]+]] = phi i32 (i32, i32)* [ %f, %entry ], [ {{.*}} ]
-; CHECK-FAST: call i32 {{.*}}[[A]](i32 {{.*}}, i32 {{.*}})
-; CHECK-FAST: ret i32
-
-
-; The same test, but with a complex expression as the call target.
-
-declare i8* @callee(i32)
-
-define i8* @func2(i64 %x) #1 {
-entry:
-  %call = tail call i8* bitcast (i8* (i32)* @callee to i8* (i64)*)(i64 %x)
-  ret i8* %call
-}
-
-; CHECK: @func2
-; CHECK: call {{.*}} @zzz
-; CHECK: [[A:%[01-9a-z_.]+]] = bitcast void ()* {{.*}} to i8* (i64)*
-; CHECK: call i8* {{.*}}[[A]](i64 {{.*}})
-; CHECK: ret i8*
-
-; CHECK-FAST: @func2
-; CHECK-FAST: {{br i1 or .* icmp ult .* bitcast .* @callee .* @__executable_start.* icmp uge .* bitcast .* @callee .* @_end}}
-; CHECK-FAST: {{call .* @zzz.* bitcast .*@callee}}
-; CHECK-FAST: bitcast void ()* {{.*}} to i8* (i64)*
-; CHECK-FAST: br label
-; CHECK-FAST: [[A:%[01-9a-z_.]+]] = phi i8* (i64)* [{{.*bitcast .* @callee.*, %entry.*}}], [ {{.*}} ]
-; CHECK-FAST: call i8* {{.*}}[[A]](i64 {{.*}})
-; CHECK-FAST: ret i8*
diff --git a/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll b/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll
index eea93b8..a1b23f1 100644
--- a/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll
+++ b/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll
@@ -15,8 +15,8 @@
 ; and add sanitize_address to @_ZN1A1fEv
 
 ; Test that __sanitizer_cov call has !dbg pointing to the opening { of A::f().
-; CHECK: call void @__sanitizer_cov(), !dbg [[A:!.*]]
-; CHECK: [[A]] = metadata !{i32 6, i32 0, metadata !{{.*}}, null}
+; CHECK: call void @__sanitizer_cov(i32*{{.*}}), !dbg [[A:!.*]]
+; CHECK: [[A]] = !MDLocation(line: 6, scope: !{{.*}})
 
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -27,7 +27,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; Function Attrs: nounwind readonly uwtable
 define i32 @_ZN1A1fEv(%struct.A* nocapture readonly %this) #0 align 2 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.A* %this}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !20
+  tail call void @llvm.dbg.value(metadata %struct.A* %this, i64 0, metadata !15, metadata !{!"0x102"}), !dbg !20
   %x = getelementptr inbounds %struct.A* %this, i64 0, i32 0, !dbg !21
   %0 = load i32* %x, align 4, !dbg !21
   ret i32 %0, !dbg !21
@@ -43,25 +43,25 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!17, !18}
 !llvm.ident = !{!19}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (210251)\001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !12, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/code/llvm/build0/../1.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"../1.cc", metadata !"/code/llvm/build0"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00A\001\0032\0032\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !8}
-!6 = metadata !{metadata !"0xd\00x\003\0032\0032\000\000", metadata !1, metadata !"_ZTS1A", metadata !7} ; [ DW_TAG_member ] [x] [line 3, size 32, align 32, offset 0] [from int]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{metadata !"0x2e\00f\00f\00_ZN1A1fEv\002\000\000\000\006\00256\001\002", metadata !1, metadata !"_ZTS1A", metadata !9, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 2] [f]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{metadata !7, metadata !11}
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x2e\00f\00f\00_ZN1A1fEv\006\000\001\000\006\00256\001\006", metadata !1, metadata !"_ZTS1A", metadata !9, null, i32 (%struct.A*)* @_ZN1A1fEv, null, metadata !8, metadata !14} ; [ DW_TAG_subprogram ] [line 6] [def] [f]
-!14 = metadata !{metadata !15}
-!15 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !13, null, metadata !16} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
-!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!18 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!19 = metadata !{metadata !"clang version 3.5.0 (210251)"}
-!20 = metadata !{i32 0, i32 0, metadata !13, null}
-!21 = metadata !{i32 7, i32 0, metadata !13, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 (210251)\001\00\000\00\001", !1, !2, !3, !12, !2, !2} ; [ DW_TAG_compile_unit ] [/code/llvm/build0/../1.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"../1.cc", !"/code/llvm/build0"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00A\001\0032\0032\000\000\000", !1, null, null, !5, null, null, !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!6, !8}
+!6 = !{!"0xd\00x\003\0032\0032\000\000", !1, !"_ZTS1A", !7} ; [ DW_TAG_member ] [x] [line 3, size 32, align 32, offset 0] [from int]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = !{!"0x2e\00f\00f\00_ZN1A1fEv\002\000\000\000\006\00256\001\002", !1, !"_ZTS1A", !9, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 2] [f]
+!9 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{!7, !11}
+!11 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!12 = !{!13}
+!13 = !{!"0x2e\00f\00f\00_ZN1A1fEv\006\000\001\000\006\00256\001\006", !1, !"_ZTS1A", !9, null, i32 (%struct.A*)* @_ZN1A1fEv, null, !8, !14} ; [ DW_TAG_subprogram ] [line 6] [def] [f]
+!14 = !{!15}
+!15 = !{!"0x101\00this\0016777216\001088", !13, null, !16} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!16 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!17 = !{i32 2, !"Dwarf Version", i32 4}
+!18 = !{i32 2, !"Debug Info Version", i32 2}
+!19 = !{!"clang version 3.5.0 (210251)"}
+!20 = !MDLocation(line: 0, scope: !13)
+!21 = !MDLocation(line: 7, scope: !13)
diff --git a/test/Instrumentation/SanitizerCoverage/coverage.ll b/test/Instrumentation/SanitizerCoverage/coverage.ll
index da0498d..1595727 100644
--- a/test/Instrumentation/SanitizerCoverage/coverage.ll
+++ b/test/Instrumentation/SanitizerCoverage/coverage.ll
@@ -2,7 +2,7 @@
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S | FileCheck %s --check-prefix=CHECK1
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S | FileCheck %s --check-prefix=CHECK2
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK2
-; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=1  -S | FileCheck %s --check-prefix=CHECK1
+; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=1  -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK3
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -S | FileCheck %s --check-prefix=CHECK4
 
@@ -12,7 +12,7 @@
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=10 \
 ; RUN:      -S | FileCheck %s --check-prefix=CHECK2
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=1 \
-; RUN:      -S | FileCheck %s --check-prefix=CHECK1
+; RUN:      -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-unknown-linux-gnu"
@@ -29,33 +29,44 @@ entry:
   ret void
 }
 
+; CHECK0-NOT: @llvm.global_ctors = {{.*}}{ i32 2, void ()* @sancov.module_ctor }
+; CHECK1: @llvm.global_ctors = {{.*}}{ i32 2, void ()* @sancov.module_ctor }
+; CHECK2: @llvm.global_ctors = {{.*}}{ i32 2, void ()* @sancov.module_ctor }
+
 ; CHECK0-NOT: call void @__sanitizer_cov(
 ; CHECK0-NOT: call void @__sanitizer_cov_module_init(
 
 ; CHECK1-LABEL: define void @foo
-; CHECK1: %0 = load atomic i8* @__sancov_gen_cov_foo monotonic, align 1
-; CHECK1: %1 = icmp eq i8 0, %0
+; CHECK1: %0 = load atomic i32* {{.*}} monotonic, align 4, !nosanitize
+; CHECK1: %1 = icmp sge i32 0, %0
 ; CHECK1: br i1 %1, label %2, label %3
-; CHECK1: call void @__sanitizer_cov
+; CHECK1: call void @__sanitizer_cov(i32*{{.*}})
+; CHECK1: call void asm sideeffect "", ""()
 ; CHECK1-NOT: call void @__sanitizer_cov
-; CHECK1: store atomic i8 1, i8* @__sancov_gen_cov_foo monotonic, align 1
+; CHECK1: ret void
 
 ; CHECK1-LABEL: define internal void @sancov.module_ctor
 ; CHECK1-NOT: ret
-; CHECK1: call void @__sanitizer_cov_module_init(i64 2)
+; CHECK1: call void @__sanitizer_cov_module_init({{.*}}, i64 2,
 ; CHECK1: ret
 
+; CHECK_WITH_CHECK-LABEL: define void @foo
+; CHECK_WITH_CHECK: __sanitizer_cov_with_check
+; CHECK_WITH_CHECK: ret void
 
 ; CHECK2-LABEL: define void @foo
 ; CHECK2: call void @__sanitizer_cov
+; CHECK2: call void asm sideeffect "", ""()
 ; CHECK2: call void @__sanitizer_cov
+; CHECK2: call void asm sideeffect "", ""()
 ; CHECK2: call void @__sanitizer_cov
+; CHECK2: call void asm sideeffect "", ""()
 ; CHECK2-NOT: call void @__sanitizer_cov
 ; CHECK2: ret void
 
 ; CHECK2-LABEL: define internal void @sancov.module_ctor
 ; CHECK2-NOT: ret
-; CHECK2: call void @__sanitizer_cov_module_init(i64 4)
+; CHECK2: call void @__sanitizer_cov_module_init({{.*}}, i64 4,
 ; CHECK2: ret
 
 ; CHECK3-LABEL: define void @foo
diff --git a/test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll b/test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll
index 9b26329..cb436a8 100644
--- a/test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll
+++ b/test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll
@@ -17,17 +17,17 @@ target triple = "x86_64-unknown-linux-gnu"
 ; Check that __sanitizer_cov call has !dgb pointing to the beginning
 ; of appropriate basic blocks.
 ; CHECK-LABEL:_Z3fooPi
-; CHECK: call void @__sanitizer_cov(), !dbg [[A:!.*]]
-; CHECK: call void @__sanitizer_cov(), !dbg [[B:!.*]]
-; CHECK: call void @__sanitizer_cov(), !dbg [[C:!.*]]
+; CHECK: call void @__sanitizer_cov(i32*{{.*}}), !dbg [[A:!.*]]
+; CHECK: call void @__sanitizer_cov(i32*{{.*}}), !dbg [[B:!.*]]
+; CHECK: call void @__sanitizer_cov(i32*{{.*}}), !dbg [[C:!.*]]
 ; CHECK: ret void
-; CHECK: [[A]] = metadata !{i32 1, i32 0, metadata !{{.*}}, null}
-; CHECK: [[B]] = metadata !{i32 3, i32 5, metadata !{{.*}}, null}
-; CHECK: [[C]] = metadata !{i32 4, i32 1, metadata !{{.*}}, null}
+; CHECK: [[A]] = !MDLocation(line: 1, scope: !{{.*}})
+; CHECK: [[B]] = !MDLocation(line: 3, column: 5, scope: !{{.*}})
+; CHECK: [[C]] = !MDLocation(line: 4, column: 1, scope: !{{.*}})
 
 define void @_Z3fooPi(i32* %a) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !15
+  tail call void @llvm.dbg.value(metadata i32* %a, i64 0, metadata !11, metadata !{!"0x102"}), !dbg !15
   %tobool = icmp eq i32* %a, null, !dbg !16
   br i1 %tobool, label %if.end, label %if.then, !dbg !16
 
@@ -49,27 +49,27 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!12, !13}
 !llvm.ident = !{!14}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 (217079)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [FOO/if.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"if.cc", metadata !"FOO"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooPi\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*)* @_Z3fooPi, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [FOO/if.cc]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8}
-!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 1]
-!12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!13 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!14 = metadata !{metadata !"clang version 3.6.0 (217079)"}
-!15 = metadata !{i32 1, i32 15, metadata !4, null}
-!16 = metadata !{i32 2, i32 7, metadata !17, null}
-!17 = metadata !{metadata !"0xb\002\007\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [FOO/if.cc]
-!18 = metadata !{i32 3, i32 5, metadata !17, null}
-!19 = metadata !{metadata !20, metadata !20, i64 0}
-!20 = metadata !{metadata !"int", metadata !21, i64 0}
-!21 = metadata !{metadata !"omnipotent char", metadata !22, i64 0}
-!22 = metadata !{metadata !"Simple C/C++ TBAA"}
-!23 = metadata !{i32 4, i32 1, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.6.0 (217079)\001\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [FOO/if.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"if.cc", !"FOO"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00_Z3fooPi\001\000\001\000\006\00256\001\001", !1, !5, !6, null, void (i32*)* @_Z3fooPi, null, null, !10} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [FOO/if.cc]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8}
+!8 = !{!"0xf\00\000\0064\0064\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!11}
+!11 = !{!"0x101\00a\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [a] [line 1]
+!12 = !{i32 2, !"Dwarf Version", i32 4}
+!13 = !{i32 2, !"Debug Info Version", i32 2}
+!14 = !{!"clang version 3.6.0 (217079)"}
+!15 = !MDLocation(line: 1, column: 15, scope: !4)
+!16 = !MDLocation(line: 2, column: 7, scope: !17)
+!17 = !{!"0xb\002\007\000", !1, !4} ; [ DW_TAG_lexical_block ] [FOO/if.cc]
+!18 = !MDLocation(line: 3, column: 5, scope: !17)
+!19 = !{!20, !20, i64 0}
+!20 = !{!"int", !21, i64 0}
+!21 = !{!"omnipotent char", !22, i64 0}
+!22 = !{!"Simple C/C++ TBAA"}
+!23 = !MDLocation(line: 4, column: 1, scope: !4)
diff --git a/test/Instrumentation/SanitizerCoverage/tracing.ll b/test/Instrumentation/SanitizerCoverage/tracing.ll
index c39cb1c..8c3a6df 100644
--- a/test/Instrumentation/SanitizerCoverage/tracing.ll
+++ b/test/Instrumentation/SanitizerCoverage/tracing.ll
@@ -1,5 +1,5 @@
 ; Test -sanitizer-coverage-experimental-tracing
-; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-experimental-tracing  -S | FileCheck %s --check-prefix=CHECK1
+; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-experimental-tracing  -S | FileCheck %s --check-prefix=CHECK1
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-experimental-tracing  -S | FileCheck %s --check-prefix=CHECK3
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/Instrumentation/ThreadSanitizer/capture.ll b/test/Instrumentation/ThreadSanitizer/capture.ll
new file mode 100644
index 0000000..d6c62f0
--- /dev/null
+++ b/test/Instrumentation/ThreadSanitizer/capture.ll
@@ -0,0 +1,91 @@
+; RUN: opt < %s -tsan -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare void @escape(i32*)
+
+@sink = global i32* null, align 4
+
+define void @captured0() nounwind uwtable sanitize_thread {
+entry:
+  %ptr = alloca i32, align 4
+  ; escapes due to call
+  call void @escape(i32* %ptr)
+  store i32 42, i32* %ptr, align 4
+  ret void
+}
+; CHECK-LABEL: define void @captured0
+; CHECK: __tsan_write
+; CHECK: ret void
+
+define void @captured1() nounwind uwtable sanitize_thread {
+entry:
+  %ptr = alloca i32, align 4
+  ; escapes due to store into global
+  store i32* %ptr, i32** @sink, align 8
+  store i32 42, i32* %ptr, align 4
+  ret void
+}
+; CHECK-LABEL: define void @captured1
+; CHECK: __tsan_write
+; CHECK: __tsan_write
+; CHECK: ret void
+
+define void @captured2() nounwind uwtable sanitize_thread {
+entry:
+  %ptr = alloca i32, align 4
+  %tmp = alloca i32*, align 8
+  ; transitive escape
+  store i32* %ptr, i32** %tmp, align 8
+  %0 = load i32** %tmp, align 8
+  store i32* %0, i32** @sink, align 8
+  store i32 42, i32* %ptr, align 4
+  ret void
+}
+; CHECK-LABEL: define void @captured2
+; CHECK: __tsan_write
+; CHECK: __tsan_write
+; CHECK: ret void
+
+define void @notcaptured0() nounwind uwtable sanitize_thread {
+entry:
+  %ptr = alloca i32, align 4
+  store i32 42, i32* %ptr, align 4
+  ; escapes due to call
+  call void @escape(i32* %ptr)
+  ret void
+}
+; CHECK-LABEL: define void @notcaptured0
+; CHECK: __tsan_write
+; CHECK: ret void
+
+define void @notcaptured1() nounwind uwtable sanitize_thread {
+entry:
+  %ptr = alloca i32, align 4
+  store i32 42, i32* %ptr, align 4
+  ; escapes due to store into global
+  store i32* %ptr, i32** @sink, align 8
+  ret void
+}
+; CHECK-LABEL: define void @notcaptured1
+; CHECK: __tsan_write
+; CHECK: __tsan_write
+; CHECK: ret void
+
+define void @notcaptured2() nounwind uwtable sanitize_thread {
+entry:
+  %ptr = alloca i32, align 4
+  %tmp = alloca i32*, align 8
+  store i32 42, i32* %ptr, align 4
+  ; transitive escape
+  store i32* %ptr, i32** %tmp, align 8
+  %0 = load i32** %tmp, align 8
+  store i32* %0, i32** @sink, align 8
+  ret void
+}
+; CHECK-LABEL: define void @notcaptured2
+; CHECK: __tsan_write
+; CHECK: __tsan_write
+; CHECK: ret void
+
+
diff --git a/test/Instrumentation/ThreadSanitizer/read_from_global.ll b/test/Instrumentation/ThreadSanitizer/read_from_global.ll
index 33614a3..037dd56 100644
--- a/test/Instrumentation/ThreadSanitizer/read_from_global.ll
+++ b/test/Instrumentation/ThreadSanitizer/read_from_global.ll
@@ -54,6 +54,6 @@ entry:
 ; CHECK: = load
 ; CHECK: ret void
 
-!0 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!1 = metadata !{metadata !"vtable pointer", metadata !0}
-!2 = metadata !{metadata !1, metadata !1, i64 0}
+!0 = !{!"Simple C/C++ TBAA", null}
+!1 = !{!"vtable pointer", !0}
+!2 = !{!1, !1, i64 0}
diff --git a/test/Instrumentation/ThreadSanitizer/unaligned.ll b/test/Instrumentation/ThreadSanitizer/unaligned.ll
new file mode 100644
index 0000000..7a240e3
--- /dev/null
+++ b/test/Instrumentation/ThreadSanitizer/unaligned.ll
@@ -0,0 +1,143 @@
+; RUN: opt < %s -tsan -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define i16 @test_unaligned_read2(i16* %a) sanitize_thread {
+entry:
+  %tmp1 = load i16* %a, align 1
+  ret i16 %tmp1
+}
+
+; CHECK-LABEL: define i16 @test_unaligned_read2(i16* %a)
+; CHECK:        call void @__tsan_func_entry(i8* %0)
+; CHECK-NEXT:   %1 = bitcast i16* %a to i8*
+; CHECK-NEXT:   call void @__tsan_unaligned_read2(i8* %1)
+; CHECK-NEXT:   %tmp1 = load i16* %a, align 1
+; CHECK-NEXT:   call void @__tsan_func_exit()
+; CHECK: ret i16
+
+define i32 @test_unaligned_read4(i32* %a) sanitize_thread {
+entry:
+  %tmp1 = load i32* %a, align 2
+  ret i32 %tmp1
+}
+
+; CHECK-LABEL: define i32 @test_unaligned_read4(i32* %a)
+; CHECK:        call void @__tsan_func_entry(i8* %0)
+; CHECK-NEXT:   %1 = bitcast i32* %a to i8*
+; CHECK-NEXT:   call void @__tsan_unaligned_read4(i8* %1)
+; CHECK-NEXT:   %tmp1 = load i32* %a, align 2
+; CHECK-NEXT:   call void @__tsan_func_exit()
+; CHECK: ret i32
+
+define i64 @test_unaligned_read8(i64* %a) sanitize_thread {
+entry:
+  %tmp1 = load i64* %a, align 4
+  ret i64 %tmp1
+}
+
+; CHECK-LABEL: define i64 @test_unaligned_read8(i64* %a)
+; CHECK:        call void @__tsan_func_entry(i8* %0)
+; CHECK-NEXT:   %1 = bitcast i64* %a to i8*
+; CHECK-NEXT:   call void @__tsan_unaligned_read8(i8* %1)
+; CHECK-NEXT:   %tmp1 = load i64* %a, align 4
+; CHECK-NEXT:   call void @__tsan_func_exit()
+; CHECK: ret i64
+
+define i128 @test_unaligned_read16(i128* %a) sanitize_thread {
+entry:
+  %tmp1 = load i128* %a, align 1
+  ret i128 %tmp1
+}
+
+; CHECK-LABEL: define i128 @test_unaligned_read16(i128* %a)
+; CHECK:        call void @__tsan_func_entry(i8* %0)
+; CHECK-NEXT:   %1 = bitcast i128* %a to i8*
+; CHECK-NEXT:   call void @__tsan_unaligned_read16(i8* %1)
+; CHECK-NEXT:   %tmp1 = load i128* %a, align 1
+; CHECK-NEXT:   call void @__tsan_func_exit()
+; CHECK: ret i128
+
+define i128 @test_aligned_read16(i128* %a) sanitize_thread {
+entry:
+  %tmp1 = load i128* %a, align 8
+  ret i128 %tmp1
+}
+
+; CHECK-LABEL: define i128 @test_aligned_read16(i128* %a)
+; CHECK:        call void @__tsan_func_entry(i8* %0)
+; CHECK-NEXT:   %1 = bitcast i128* %a to i8*
+; CHECK-NEXT:   call void @__tsan_read16(i8* %1)
+; CHECK-NEXT:   %tmp1 = load i128* %a, align 8
+; CHECK-NEXT:   call void @__tsan_func_exit()
+; CHECK: ret i128
+
+define void @test_unaligned_write2(i16* %a) sanitize_thread {
+entry:
+  store i16 1, i16* %a, align 1
+  ret void
+}
+
+; CHECK-LABEL: define void @test_unaligned_write2(i16* %a)
+; CHECK:        call void @__tsan_func_entry(i8* %0)
+; CHECK-NEXT:   %1 = bitcast i16* %a to i8*
+; CHECK-NEXT:   call void @__tsan_unaligned_write2(i8* %1)
+; CHECK-NEXT:   store i16 1, i16* %a, align 1
+; CHECK-NEXT:   call void @__tsan_func_exit()
+; CHECK: ret void
+
+define void @test_unaligned_write4(i32* %a) sanitize_thread {
+entry:
+  store i32 1, i32* %a, align 1
+  ret void
+}
+
+; CHECK-LABEL: define void @test_unaligned_write4(i32* %a)
+; CHECK:        call void @__tsan_func_entry(i8* %0)
+; CHECK-NEXT:   %1 = bitcast i32* %a to i8*
+; CHECK-NEXT:   call void @__tsan_unaligned_write4(i8* %1)
+; CHECK-NEXT:   store i32 1, i32* %a, align 1
+; CHECK-NEXT:   call void @__tsan_func_exit()
+; CHECK: ret void
+
+define void @test_unaligned_write8(i64* %a) sanitize_thread {
+entry:
+  store i64 1, i64* %a, align 1
+  ret void
+}
+
+; CHECK-LABEL: define void @test_unaligned_write8(i64* %a)
+; CHECK:        call void @__tsan_func_entry(i8* %0)
+; CHECK-NEXT:   %1 = bitcast i64* %a to i8*
+; CHECK-NEXT:   call void @__tsan_unaligned_write8(i8* %1)
+; CHECK-NEXT:   store i64 1, i64* %a, align 1
+; CHECK-NEXT:   call void @__tsan_func_exit()
+; CHECK: ret void
+
+define void @test_unaligned_write16(i128* %a) sanitize_thread {
+entry:
+  store i128 1, i128* %a, align 1
+  ret void
+}
+
+; CHECK-LABEL: define void @test_unaligned_write16(i128* %a)
+; CHECK:        call void @__tsan_func_entry(i8* %0)
+; CHECK-NEXT:   %1 = bitcast i128* %a to i8*
+; CHECK-NEXT:   call void @__tsan_unaligned_write16(i8* %1)
+; CHECK-NEXT:   store i128 1, i128* %a, align 1
+; CHECK-NEXT:   call void @__tsan_func_exit()
+; CHECK: ret void
+
+define void @test_aligned_write16(i128* %a) sanitize_thread {
+entry:
+  store i128 1, i128* %a, align 8
+  ret void
+}
+
+; CHECK-LABEL: define void @test_aligned_write16(i128* %a)
+; CHECK:        call void @__tsan_func_entry(i8* %0)
+; CHECK-NEXT:   %1 = bitcast i128* %a to i8*
+; CHECK-NEXT:   call void @__tsan_write16(i8* %1)
+; CHECK-NEXT:   store i128 1, i128* %a, align 8
+; CHECK-NEXT:   call void @__tsan_func_exit()
+; CHECK: ret void
diff --git a/test/Instrumentation/ThreadSanitizer/vptr_read.ll b/test/Instrumentation/ThreadSanitizer/vptr_read.ll
index 811ad8d..cccdeb8 100644
--- a/test/Instrumentation/ThreadSanitizer/vptr_read.ll
+++ b/test/Instrumentation/ThreadSanitizer/vptr_read.ll
@@ -8,6 +8,6 @@ entry:
   %0 = load i8* %a, align 8, !tbaa !0
   ret i8 %0
 }
-!0 = metadata !{metadata !2, metadata !2, i64 0}
-!1 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!2 = metadata !{metadata !"vtable pointer", metadata !1}
+!0 = !{!2, !2, i64 0}
+!1 = !{!"Simple C/C++ TBAA", null}
+!2 = !{!"vtable pointer", !1}
diff --git a/test/Instrumentation/ThreadSanitizer/vptr_update.ll b/test/Instrumentation/ThreadSanitizer/vptr_update.ll
index 83d28b6..78f7f31 100644
--- a/test/Instrumentation/ThreadSanitizer/vptr_update.ll
+++ b/test/Instrumentation/ThreadSanitizer/vptr_update.ll
@@ -35,6 +35,6 @@ entry:
   ret void
 }
 
-!0 = metadata !{metadata !2, metadata !2, i64 0}
-!1 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!2 = metadata !{metadata !"vtable pointer", metadata !1}
+!0 = !{!2, !2, i64 0}
+!1 = !{!"Simple C/C++ TBAA", null}
+!2 = !{!"vtable pointer", !1}
diff --git a/test/JitListener/lit.local.cfg b/test/JitListener/lit.local.cfg
index d995820..05f34a7 100644
--- a/test/JitListener/lit.local.cfg
+++ b/test/JitListener/lit.local.cfg
@@ -1,3 +1,3 @@
-if not config.root.llvm_use_intel_jitevents == "ON":
+if not config.root.llvm_use_intel_jitevents == "true":
     config.unsupported = True
 
diff --git a/test/JitListener/multiple.ll b/test/JitListener/multiple.ll
new file mode 100644
index 0000000..ae54608
--- /dev/null
+++ b/test/JitListener/multiple.ll
@@ -0,0 +1,167 @@
+; Verify the behavior of the IntelJITEventListener.
+; RUN: llvm-jitlistener %s | FileCheck %s
+
+; This test was created using the following file:
+;
+;  1: int foo(int a) {
+;  2:   return a;
+;  3: }
+;  4:
+;  5: int bar(int a) {
+;  6:   if (a == 0) {
+;  7:     return 0;
+;  8:   }
+;  9:   return 100/a;
+; 10: }
+; 11: 
+; 12: int fubar(int a) {
+; 13:   switch (a) {
+; 14:     case 0:
+; 15:       return 10;
+; 16:     case 1:
+; 17:       return 20;
+; 18:     default:
+; 19:       return 30;
+; 20:   }
+; 21: }
+;
+
+; CHECK: Method load [1]: bar, Size = {{[0-9]+}}
+; CHECK:   Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
+; CHECK:   Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
+; CHECK:   Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
+; CHECK:   Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
+
+; CHECK: Method load [2]: foo, Size = {{[0-9]+}}
+; CHECK:   Line info @ {{[0-9]+}}: multiple.c, line {{[1,2]}}
+; CHECK:   Line info @ {{[0-9]+}}: multiple.c, line {{[1,2]}}
+
+; CHECK: Method load [3]: fubar, Size = {{[0-9]+}}
+; CHECK:   Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
+; CHECK:   Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
+; CHECK:   Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
+; CHECK:   Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
+; CHECK:   Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
+
+; CHECK: Method unload [1]
+; CHECK: Method unload [2]
+; CHECK: Method unload [3]
+
+; ModuleID = 'multiple.c'
+
+; Function Attrs: nounwind uwtable
+define i32 @foo(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !15, metadata !16), !dbg !17
+  %0 = load i32* %a.addr, align 4, !dbg !18
+  ret i32 %0, !dbg !19
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @bar(i32 %a) #0 {
+entry:
+  %retval = alloca i32, align 4
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !20, metadata !16), !dbg !21
+  %0 = load i32* %a.addr, align 4, !dbg !22
+  %cmp = icmp eq i32 %0, 0, !dbg !22
+  br i1 %cmp, label %if.then, label %if.end, !dbg !24
+
+if.then:                                          ; preds = %entry
+  store i32 0, i32* %retval, !dbg !25
+  br label %return, !dbg !25
+
+if.end:                                           ; preds = %entry
+  %1 = load i32* %a.addr, align 4, !dbg !27
+  %div = sdiv i32 100, %1, !dbg !28
+  store i32 %div, i32* %retval, !dbg !29
+  br label %return, !dbg !29
+
+return:                                           ; preds = %if.end, %if.then
+  %2 = load i32* %retval, !dbg !30
+  ret i32 %2, !dbg !30
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @fubar(i32 %a) #0 {
+entry:
+  %retval = alloca i32, align 4
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !31, metadata !16), !dbg !32
+  %0 = load i32* %a.addr, align 4, !dbg !33
+  switch i32 %0, label %sw.default [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+  ], !dbg !34
+
+sw.bb:                                            ; preds = %entry
+  store i32 10, i32* %retval, !dbg !35
+  br label %return, !dbg !35
+
+sw.bb1:                                           ; preds = %entry
+  store i32 20, i32* %retval, !dbg !37
+  br label %return, !dbg !37
+
+sw.default:                                       ; preds = %entry
+  store i32 30, i32* %retval, !dbg !38
+  br label %return, !dbg !38
+
+return:                                           ; preds = %sw.default, %sw.bb1, %sw.bb
+  %1 = load i32* %retval, !dbg !39
+  ret i32 %1, !dbg !39
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11, !12, !13}
+!llvm.ident = !{!14}
+
+!0 = !{!"0x11\0012\00clang version 3.6.0 (trunk)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [F:\users\akaylor\llvm-s\llvm\test\JitListener/multiple.c] [DW_LANG_C99]
+!1 = !{!"multiple.c", !"F:\5Cusers\5Cakaylor\5Cllvm-s\5Cllvm\5Ctest\5CJitListener"}
+!2 = !{}
+!3 = !{!4, !9, !10}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\000\00256\000\001", !1, !5, !6, null, i32 (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}                               ; [ DW_TAG_file_type ] [F:\users\akaylor\llvm-s\llvm\test\JitListener/multiple.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x2e\00bar\00bar\00\005\000\001\000\000\00256\000\005", !1, !5, !6, null, i32 (i32)* @bar, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [bar]
+!10 = !{!"0x2e\00fubar\00fubar\00\0012\000\001\000\000\00256\000\0012", !1, !5, !6, null, i32 (i32)* @fubar, null, null, !2} ; [ DW_TAG_subprogram ] [line 12] [def] [fubar]
+!11 = !{i32 2, !"Dwarf Version", i32 4}
+!12 = !{i32 2, !"Debug Info Version", i32 2}
+!13 = !{i32 1, !"PIC Level", i32 2}
+!14 = !{!"clang version 3.6.0 (trunk)"}
+!15 = !{!"0x101\00a\0016777217\000", !4, !5, !8}  ; [ DW_TAG_arg_variable ] [a] [line 1]
+!16 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!17 = !MDLocation(line: 1, column: 13, scope: !4)
+!18 = !MDLocation(line: 2, column: 10, scope: !4)
+!19 = !MDLocation(line: 2, column: 3, scope: !4)
+!20 = !{!"0x101\00a\0016777221\000", !9, !5, !8}  ; [ DW_TAG_arg_variable ] [a] [line 5]
+!21 = !MDLocation(line: 5, column: 13, scope: !9)
+!22 = !MDLocation(line: 6, column: 7, scope: !23)
+!23 = !{!"0xb\006\007\000", !1, !9}               ; [ DW_TAG_lexical_block ] [F:\users\akaylor\llvm-s\llvm\test\JitListener/multiple.c]
+!24 = !MDLocation(line: 6, column: 7, scope: !9)
+!25 = !MDLocation(line: 7, column: 5, scope: !26)
+!26 = !{!"0xb\006\0015\001", !1, !23}             ; [ DW_TAG_lexical_block ] [F:\users\akaylor\llvm-s\llvm\test\JitListener/multiple.c]
+!27 = !MDLocation(line: 9, column: 14, scope: !9)
+!28 = !MDLocation(line: 9, column: 10, scope: !9)
+!29 = !MDLocation(line: 9, column: 3, scope: !9)
+!30 = !MDLocation(line: 10, column: 1, scope: !9)
+!31 = !{!"0x101\00a\0016777228\000", !10, !5, !8} ; [ DW_TAG_arg_variable ] [a] [line 12]
+!32 = !MDLocation(line: 12, column: 15, scope: !10)
+!33 = !MDLocation(line: 13, column: 11, scope: !10)
+!34 = !MDLocation(line: 13, column: 3, scope: !10)
+!35 = !MDLocation(line: 15, column: 7, scope: !36)
+!36 = !{!"0xb\0013\0014\002", !1, !10}            ; [ DW_TAG_lexical_block ] [F:\users\akaylor\llvm-s\llvm\test\JitListener/multiple.c]
+!37 = !MDLocation(line: 17, column: 7, scope: !36)
+!38 = !MDLocation(line: 19, column: 7, scope: !36)
+!39 = !MDLocation(line: 21, column: 1, scope: !10)
diff --git a/test/JitListener/simple.ll b/test/JitListener/simple.ll
new file mode 100644
index 0000000..1732170
--- /dev/null
+++ b/test/JitListener/simple.ll
@@ -0,0 +1,54 @@
+; Verify the behavior of the IntelJITEventListener.
+; RUN: llvm-jitlistener %s | FileCheck %s
+
+; This test was created using the following file:
+;
+; 1: int foo(int a) {
+; 2:   return a;
+; 3: }
+;
+
+; CHECK: Method load [1]: foo, Size = {{[0-9]+}}
+; CHECK:   Line info @ {{[0-9]+}}: simple.c, line 1
+; CHECK:   Line info @ {{[0-9]+}}: simple.c, line 2
+; CHECK: Method unload [1]
+
+; ModuleID = 'simple.c'
+
+; Function Attrs: nounwind uwtable
+define i32 @foo(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !12, metadata !13), !dbg !14
+  %0 = load i32* %a.addr, align 4, !dbg !15
+  ret i32 %0, !dbg !16
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = !{!"0x11\0012\00clang version 3.6.0 (trunk)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [F:\users\akaylor\llvm-s\llvm\test\JitListener/simple.c] [DW_LANG_C99]
+!1 = !{!"simple.c", !"F:\5Cusers\5Cakaylor\5Cllvm-s\5Cllvm\5Ctest\5CJitListener"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\000\00256\000\001", !1, !5, !6, null, i32 (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}                               ; [ DW_TAG_file_type ] [F:\users\akaylor\llvm-s\llvm\test\JitListener/simple.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.6.0 (trunk)"}
+!12 = !{!"0x101\00a\0016777217\000", !4, !5, !8}  ; [ DW_TAG_arg_variable ] [a] [line 1]
+!13 = !{!"0x102"}                                 ; [ DW_TAG_expression ]
+!14 = !MDLocation(line: 1, column: 13, scope: !4)
+!15 = !MDLocation(line: 2, column: 10, scope: !4)
+!16 = !MDLocation(line: 2, column: 3, scope: !4)
diff --git a/test/JitListener/test-common-symbols.ll b/test/JitListener/test-common-symbols.ll
deleted file mode 100644
index 3c8b9e3..0000000
--- a/test/JitListener/test-common-symbols.ll
+++ /dev/null
@@ -1,113 +0,0 @@
-; RUN: llvm-jitlistener %s | FileCheck %s
-
-; CHECK: Method load [1]: main, Size = 164
-; CHECK: Method unload [1]
-
-; ModuleID = '<stdin>'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@zero_int = common global i32 0, align 4
-@zero_arr = common global [10 x i32] zeroinitializer, align 16
-@zero_double = common global double 0.000000e+00, align 8
-
-define i32 @main() nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %i = alloca i32, align 4
-  store i32 0, i32* %retval
-  %0 = load i32* @zero_int, align 4, !dbg !21
-  %add = add nsw i32 %0, 5, !dbg !21
-  %idxprom = sext i32 %add to i64, !dbg !21
-  %arrayidx = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom, !dbg !21
-  store i32 40, i32* %arrayidx, align 4, !dbg !21
-  %1 = load double* @zero_double, align 8, !dbg !23
-  %cmp = fcmp olt double %1, 1.000000e+00, !dbg !23
-  br i1 %cmp, label %if.then, label %if.end, !dbg !23
-
-if.then:                                          ; preds = %entry
-  %2 = load i32* @zero_int, align 4, !dbg !24
-  %add1 = add nsw i32 %2, 2, !dbg !24
-  %idxprom2 = sext i32 %add1 to i64, !dbg !24
-  %arrayidx3 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom2, !dbg !24
-  store i32 70, i32* %arrayidx3, align 4, !dbg !24
-  br label %if.end, !dbg !24
-
-if.end:                                           ; preds = %if.then, %entry
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !25, metadata !{metadata !"0x102"}), !dbg !27
-  store i32 1, i32* %i, align 4, !dbg !28
-  br label %for.cond, !dbg !28
-
-for.cond:                                         ; preds = %for.inc, %if.end
-  %3 = load i32* %i, align 4, !dbg !28
-  %cmp4 = icmp slt i32 %3, 10, !dbg !28
-  br i1 %cmp4, label %for.body, label %for.end, !dbg !28
-
-for.body:                                         ; preds = %for.cond
-  %4 = load i32* %i, align 4, !dbg !29
-  %sub = sub nsw i32 %4, 1, !dbg !29
-  %idxprom5 = sext i32 %sub to i64, !dbg !29
-  %arrayidx6 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom5, !dbg !29
-  %5 = load i32* %arrayidx6, align 4, !dbg !29
-  %6 = load i32* %i, align 4, !dbg !29
-  %idxprom7 = sext i32 %6 to i64, !dbg !29
-  %arrayidx8 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom7, !dbg !29
-  %7 = load i32* %arrayidx8, align 4, !dbg !29
-  %add9 = add nsw i32 %5, %7, !dbg !29
-  %8 = load i32* %i, align 4, !dbg !29
-  %idxprom10 = sext i32 %8 to i64, !dbg !29
-  %arrayidx11 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom10, !dbg !29
-  store i32 %add9, i32* %arrayidx11, align 4, !dbg !29
-  br label %for.inc, !dbg !31
-
-for.inc:                                          ; preds = %for.body
-  %9 = load i32* %i, align 4, !dbg !32
-  %inc = add nsw i32 %9, 1, !dbg !32
-  store i32 %inc, i32* %i, align 4, !dbg !32
-  br label %for.cond, !dbg !32
-
-for.end:                                          ; preds = %for.cond
-  %10 = load i32* getelementptr inbounds ([10 x i32]* @zero_arr, i32 0, i64 9), align 4, !dbg !33
-  %cmp12 = icmp eq i32 %10, 110, !dbg !33
-  %cond = select i1 %cmp12, i32 0, i32 -1, !dbg !33
-  ret i32 %cond, !dbg !33
-}
-
-declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!35}
-
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 ()\001\00\000\00\000", metadata !34, metadata !1, metadata !1, metadata !3, metadata !12, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00main\00main\00\006\000\001\000\006\000\000\000", metadata !34, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !10} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !34} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
-!12 = metadata !{metadata !14, metadata !15, metadata !17}
-!14 = metadata !{metadata !"0x34\00zero_int\00zero_int\00\001\000\001", null, metadata !6, metadata !9, i32* @zero_int, null} ; [ DW_TAG_variable ]
-!15 = metadata !{metadata !"0x34\00zero_double\00zero_double\00\002\000\001", null, metadata !6, metadata !16, double* @zero_double, null} ; [ DW_TAG_variable ]
-!16 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ]
-!17 = metadata !{metadata !"0x34\00zero_arr\00zero_arr\00\003\000\001", null, metadata !6, metadata !18, [10 x i32]* @zero_arr, null} ; [ DW_TAG_variable ]
-!18 = metadata !{metadata !"0x1\00\000\00320\0032\000\000", null, metadata !"", metadata !9, metadata !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
-!19 = metadata !{metadata !20}
-!20 = metadata !{metadata !"0x21\000\0010"}        ; [ DW_TAG_subrange_type ]
-!21 = metadata !{i32 7, i32 5, metadata !22, null}
-!22 = metadata !{metadata !"0xb\006\001\000", metadata !34, metadata !5} ; [ DW_TAG_lexical_block ]
-!23 = metadata !{i32 9, i32 5, metadata !22, null}
-!24 = metadata !{i32 10, i32 9, metadata !22, null}
-!25 = metadata !{metadata !"0x100\00i\0012\000", metadata !26, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
-!26 = metadata !{metadata !"0xb\0012\005\001", metadata !34, metadata !22} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{i32 12, i32 14, metadata !26, null}
-!28 = metadata !{i32 12, i32 19, metadata !26, null}
-!29 = metadata !{i32 13, i32 9, metadata !30, null}
-!30 = metadata !{metadata !"0xb\0012\0034\002", metadata !34, metadata !26} ; [ DW_TAG_lexical_block ]
-!31 = metadata !{i32 14, i32 5, metadata !30, null}
-!32 = metadata !{i32 12, i32 29, metadata !26, null}
-!33 = metadata !{i32 15, i32 5, metadata !22, null}
-!34 = metadata !{metadata !"test-common-symbols.c", metadata !"/store/store/llvm/build"}
-!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/JitListener/test-inline.ll b/test/JitListener/test-inline.ll
deleted file mode 100644
index a600734..0000000
--- a/test/JitListener/test-inline.ll
+++ /dev/null
@@ -1,212 +0,0 @@
-; RUN: llvm-jitlistener %s | FileCheck %s
-
-; CHECK: Method load [1]: _Z15test_parametersPfPA2_dR11char_structPPitm, Size = 170
-; CHECK:   Line info @ 0: test-inline.cpp, line 33
-; CHECK:   Line info @ 35: test-inline.cpp, line 34
-; CHECK:   Line info @ 165: test-inline.cpp, line 35
-; CHECK: Method load [2]: _Z3foov, Size = 3
-; CHECK:   Line info @ 0: test-inline.cpp, line 28
-; CHECK:   Line info @ 2: test-inline.cpp, line 29
-; CHECK:   Line info @ 3: test-inline.cpp, line 29
-; CHECK: Method load [3]: main, Size = 146
-; CHECK:   Line info @ 0: test-inline.cpp, line 39
-; CHECK:   Line info @ 21: test-inline.cpp, line 41
-; CHECK:   Line info @ 39: test-inline.cpp, line 42
-; CHECK:   Line info @ 60: test-inline.cpp, line 44
-; CHECK:   Line info @ 80: test-inline.cpp, line 48
-; CHECK:   Line info @ 90: test-inline.cpp, line 45
-; CHECK:   Line info @ 95: test-inline.cpp, line 46
-; CHECK:   Line info @ 114: test-inline.cpp, line 48
-; CHECK:   Line info @ 141: test-inline.cpp, line 49
-; CHECK:   Line info @ 146: test-inline.cpp, line 49
-; CHECK: Method unload [1]
-; CHECK: Method unload [2]
-; CHECK: Method unload [3]
-
-; ModuleID = 'test-inline.cpp'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct.char_struct = type { i8, [2 x i8] }
-
-@compound_char = global %struct.char_struct zeroinitializer, align 1
-@_ZZ4mainE1d = private unnamed_addr constant [2 x [2 x double]] [[2 x double] [double 0.000000e+00, double 1.000000e+00], [2 x double] [double 2.000000e+00, double 3.000000e+00]], align 16
-
-define double @_Z15test_parametersPfPA2_dR11char_structPPitm(float* %pf, [2 x double]* %ppd, %struct.char_struct* %s, i32** %ppn, i16 zeroext %us, i64 %l) uwtable {
-entry:
-  %pf.addr = alloca float*, align 8
-  %ppd.addr = alloca [2 x double]*, align 8
-  %s.addr = alloca %struct.char_struct*, align 8
-  %ppn.addr = alloca i32**, align 8
-  %us.addr = alloca i16, align 2
-  %l.addr = alloca i64, align 8
-  %result = alloca double, align 8
-  store float* %pf, float** %pf.addr, align 8
-  call void @llvm.dbg.declare(metadata !{float** %pf.addr}, metadata !46, metadata !{metadata !"0x102"}), !dbg !47
-  store [2 x double]* %ppd, [2 x double]** %ppd.addr, align 8
-  call void @llvm.dbg.declare(metadata !{[2 x double]** %ppd.addr}, metadata !48, metadata !{metadata !"0x102"}), !dbg !47
-  store %struct.char_struct* %s, %struct.char_struct** %s.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.char_struct** %s.addr}, metadata !49, metadata !{metadata !"0x102"}), !dbg !47
-  store i32** %ppn, i32*** %ppn.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i32*** %ppn.addr}, metadata !50, metadata !{metadata !"0x102"}), !dbg !47
-  store i16 %us, i16* %us.addr, align 2
-  call void @llvm.dbg.declare(metadata !{i16* %us.addr}, metadata !51, metadata !{metadata !"0x102"}), !dbg !47
-  store i64 %l, i64* %l.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64* %l.addr}, metadata !52, metadata !{metadata !"0x102"}), !dbg !47
-  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !53, metadata !{metadata !"0x102"}), !dbg !55
-  %0 = load float** %pf.addr, align 8, !dbg !55
-  %arrayidx = getelementptr inbounds float* %0, i64 0, !dbg !55
-  %1 = load float* %arrayidx, align 4, !dbg !55
-  %conv = fpext float %1 to double, !dbg !55
-  %2 = load [2 x double]** %ppd.addr, align 8, !dbg !55
-  %arrayidx1 = getelementptr inbounds [2 x double]* %2, i64 1, !dbg !55
-  %arrayidx2 = getelementptr inbounds [2 x double]* %arrayidx1, i32 0, i64 1, !dbg !55
-  %3 = load double* %arrayidx2, align 8, !dbg !55
-  %mul = fmul double %conv, %3, !dbg !55
-  %4 = load %struct.char_struct** %s.addr, align 8, !dbg !55
-  %c = getelementptr inbounds %struct.char_struct* %4, i32 0, i32 0, !dbg !55
-  %5 = load i8* %c, align 1, !dbg !55
-  %conv3 = sext i8 %5 to i32, !dbg !55
-  %conv4 = sitofp i32 %conv3 to double, !dbg !55
-  %mul5 = fmul double %mul, %conv4, !dbg !55
-  %6 = load i16* %us.addr, align 2, !dbg !55
-  %conv6 = zext i16 %6 to i32, !dbg !55
-  %conv7 = sitofp i32 %conv6 to double, !dbg !55
-  %mul8 = fmul double %mul5, %conv7, !dbg !55
-  %7 = load i64* %l.addr, align 8, !dbg !55
-  %conv9 = uitofp i64 %7 to double, !dbg !55
-  %mul10 = fmul double %mul8, %conv9, !dbg !55
-  %call = call i32 @_Z3foov(), !dbg !55
-  %conv11 = sitofp i32 %call to double, !dbg !55
-  %add = fadd double %mul10, %conv11, !dbg !55
-  store double %add, double* %result, align 8, !dbg !55
-  %8 = load double* %result, align 8, !dbg !56
-  ret double %8, !dbg !56
-}
-
-declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-
-define linkonce_odr i32 @_Z3foov() nounwind uwtable inlinehint {
-entry:
-  ret i32 0, !dbg !57
-}
-
-define i32 @main(i32 %argc, i8** %argv) uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %argc.addr = alloca i32, align 4
-  %argv.addr = alloca i8**, align 8
-  %s = alloca %struct.char_struct, align 1
-  %f = alloca float, align 4
-  %d = alloca [2 x [2 x double]], align 16
-  %result = alloca double, align 8
-  store i32 0, i32* %retval
-  store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !59, metadata !{metadata !"0x102"}), !dbg !60
-  store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !61, metadata !{metadata !"0x102"}), !dbg !60
-  call void @llvm.dbg.declare(metadata !{%struct.char_struct* %s}, metadata !62, metadata !{metadata !"0x102"}), !dbg !64
-  call void @llvm.dbg.declare(metadata !{float* %f}, metadata !65, metadata !{metadata !"0x102"}), !dbg !66
-  store float 0.000000e+00, float* %f, align 4, !dbg !66
-  call void @llvm.dbg.declare(metadata !{[2 x [2 x double]]* %d}, metadata !67, metadata !{metadata !"0x102"}), !dbg !70
-  %0 = bitcast [2 x [2 x double]]* %d to i8*, !dbg !70
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([2 x [2 x double]]* @_ZZ4mainE1d to i8*), i64 32, i32 16, i1 false), !dbg !70
-  %c = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 0, !dbg !71
-  store i8 97, i8* %c, align 1, !dbg !71
-  %c2 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !72
-  %arrayidx = getelementptr inbounds [2 x i8]* %c2, i32 0, i64 0, !dbg !72
-  store i8 48, i8* %arrayidx, align 1, !dbg !72
-  %c21 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !73
-  %arrayidx2 = getelementptr inbounds [2 x i8]* %c21, i32 0, i64 1, !dbg !73
-  store i8 49, i8* %arrayidx2, align 1, !dbg !73
-  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !74, metadata !{metadata !"0x102"}), !dbg !75
-  %arraydecay = getelementptr inbounds [2 x [2 x double]]* %d, i32 0, i32 0, !dbg !75
-  %call = call double @_Z15test_parametersPfPA2_dR11char_structPPitm(float* %f, [2 x double]* %arraydecay, %struct.char_struct* %s, i32** null, i16 zeroext 10, i64 42), !dbg !75
-  store double %call, double* %result, align 8, !dbg !75
-  %1 = load double* %result, align 8, !dbg !76
-  %cmp = fcmp oeq double %1, 0.000000e+00, !dbg !76
-  %cond = select i1 %cmp, i32 0, i32 -1, !dbg !76
-  ret i32 %cond, !dbg !76
-}
-
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!78}
-
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-clang2 gitosis@miro.kw.intel.com:clang.git 39450d0469e0d5589ad39fd0b20b5742750619a0) (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-llvm gitosis@miro.kw.intel.com:llvm.git 376642ed620ecae05b68c7bc81f79aeb2065abe0)\001\00\000\00\000", metadata !77, metadata !1, metadata !1, metadata !3, metadata !43, null} ; [ DW_TAG_compile_unit ] [/home/akaylor/dev/test-inline.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{i32 0}
-!3 = metadata !{metadata !5, metadata !35, metadata !40}
-!5 = metadata !{metadata !"0x2e\00test_parameters\00test_parameters\00_Z15test_parametersPfPA2_dR11char_structPPitm\0032\000\001\000\006\00256\000\0033", metadata !77, metadata !6, metadata !7, null, double (float*, [2 x double]*, %struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 33] [test_parameters]
-!6 = metadata !{metadata !"0x29", metadata !77} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !10, metadata !12, metadata !16, metadata !29, metadata !32, metadata !33}
-!9 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from float]
-!11 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!13 = metadata !{metadata !"0x1\00\000\00128\0064\000\000", null, metadata !"", metadata !9, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double]
-!14 = metadata !{metadata !15}
-!15 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
-!16 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !17} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from char_struct]
-!17 = metadata !{metadata !"0x13\00char_struct\0022\0024\008\000\000\000", metadata !77, null, null, metadata !18, null, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [def] [from ]
-!18 = metadata !{metadata !19, metadata !21, metadata !23}
-!19 = metadata !{metadata !"0xd\00c\0023\008\008\000\000", metadata !77, metadata !17, metadata !20} ; [ DW_TAG_member ] [c] [line 23, size 8, align 8, offset 0] [from char]
-!20 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!21 = metadata !{metadata !"0xd\00c2\0024\0016\008\008\000", metadata !77, metadata !17, metadata !22} ; [ DW_TAG_member ] [c2] [line 24, size 16, align 8, offset 8] [from ]
-!22 = metadata !{metadata !"0x1\00\000\0016\008\000\000", null, metadata !"", metadata !20, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char]
-!23 = metadata !{metadata !"0x2e\00char_struct\00char_struct\00\0022\000\000\000\006\00320\000\0022", metadata !77, metadata !17, metadata !24, null, null, null, i32 0, metadata !27} ; [ DW_TAG_subprogram ] [line 22] [char_struct]
-!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!25 = metadata !{null, metadata !26}
-!26 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, metadata !"", metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char_struct]
-!27 = metadata !{metadata !28}
-!28 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!29 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !30} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !31} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!31 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!32 = metadata !{metadata !"0x24\00unsigned short\000\0016\0016\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
-!33 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, metadata !"", metadata !34} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long unsigned int]
-!34 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!35 = metadata !{metadata !"0x2e\00main\00main\00\0038\000\001\000\006\00256\000\0039", metadata !77, metadata !6, metadata !36, null, i32 (i32, i8**)* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 38] [def] [scope 39] [main]
-!36 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !37, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!37 = metadata !{metadata !31, metadata !31, metadata !38}
-!38 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!39 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!40 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\0027\000\001\000\006\00256\000\0028", metadata !77, metadata !6, metadata !41, null, i32 ()* @_Z3foov, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [foo]
-!41 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !42, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!42 = metadata !{metadata !31}
-!43 = metadata !{metadata !45}
-!45 = metadata !{metadata !"0x34\00compound_char\00compound_char\00\0025\000\001", null, metadata !6, metadata !17, %struct.char_struct* @compound_char, null} ; [ DW_TAG_variable ] [compound_char] [line 25] [def]
-!46 = metadata !{metadata !"0x101\00pf\0016777248\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ] [pf] [line 32]
-!47 = metadata !{i32 32, i32 0, metadata !5, null}
-!48 = metadata !{metadata !"0x101\00ppd\0033554464\000", metadata !5, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ] [ppd] [line 32]
-!49 = metadata !{metadata !"0x101\00s\0050331680\000", metadata !5, metadata !6, metadata !16} ; [ DW_TAG_arg_variable ] [s] [line 32]
-!50 = metadata !{metadata !"0x101\00ppn\0067108896\000", metadata !5, metadata !6, metadata !29} ; [ DW_TAG_arg_variable ] [ppn] [line 32]
-!51 = metadata !{metadata !"0x101\00us\0083886112\000", metadata !5, metadata !6, metadata !32} ; [ DW_TAG_arg_variable ] [us] [line 32]
-!52 = metadata !{metadata !"0x101\00l\00100663328\000", metadata !5, metadata !6, metadata !33} ; [ DW_TAG_arg_variable ] [l] [line 32]
-!53 = metadata !{metadata !"0x100\00result\0034\000", metadata !54, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [result] [line 34]
-!54 = metadata !{metadata !"0xb\0033\000\000", metadata !77, metadata !5} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp]
-!55 = metadata !{i32 34, i32 0, metadata !54, null}
-!56 = metadata !{i32 35, i32 0, metadata !54, null}
-!57 = metadata !{i32 29, i32 0, metadata !58, null}
-!58 = metadata !{metadata !"0xb\0028\000\002", metadata !77, metadata !40} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp]
-!59 = metadata !{metadata !"0x101\00argc\0016777254\000", metadata !35, metadata !6, metadata !31} ; [ DW_TAG_arg_variable ] [argc] [line 38]
-!60 = metadata !{i32 38, i32 0, metadata !35, null}
-!61 = metadata !{metadata !"0x101\00argv\0033554470\000", metadata !35, metadata !6, metadata !38} ; [ DW_TAG_arg_variable ] [argv] [line 38]
-!62 = metadata !{metadata !"0x100\00s\0040\000", metadata !63, metadata !6, metadata !17} ; [ DW_TAG_auto_variable ] [s] [line 40]
-!63 = metadata !{metadata !"0xb\0039\000\001", metadata !77, metadata !35} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp]
-!64 = metadata !{i32 40, i32 0, metadata !63, null}
-!65 = metadata !{metadata !"0x100\00f\0041\000", metadata !63, metadata !6, metadata !11} ; [ DW_TAG_auto_variable ] [f] [line 41]
-!66 = metadata !{i32 41, i32 0, metadata !63, null}
-!67 = metadata !{metadata !"0x100\00d\0042\000", metadata !63, metadata !6, metadata !68} ; [ DW_TAG_auto_variable ] [d] [line 42]
-!68 = metadata !{metadata !"0x1\00\000\00256\0064\000\000", null, metadata !"", metadata !9, metadata !69, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double]
-!69 = metadata !{metadata !15, metadata !15}
-!70 = metadata !{i32 42, i32 0, metadata !63, null}
-!71 = metadata !{i32 44, i32 0, metadata !63, null}
-!72 = metadata !{i32 45, i32 0, metadata !63, null}
-!73 = metadata !{i32 46, i32 0, metadata !63, null}
-!74 = metadata !{metadata !"0x100\00result\0048\000", metadata !63, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [result] [line 48]
-!75 = metadata !{i32 48, i32 0, metadata !63, null}
-!76 = metadata !{i32 49, i32 0, metadata !63, null}
-!77 = metadata !{metadata !"test-inline.cpp", metadata !"/home/akaylor/dev"}
-!78 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/JitListener/test-parameters.ll b/test/JitListener/test-parameters.ll
deleted file mode 100644
index d1f3b76..0000000
--- a/test/JitListener/test-parameters.ll
+++ /dev/null
@@ -1,211 +0,0 @@
-; RUN: llvm-jitlistener %s | FileCheck %s
-
-; CHECK: Method load [1]: _Z15test_parametersPfPA2_dR11char_structPPitm, Size = 170
-; CHECK:   Line info @ 0: test-parameters.cpp, line 33
-; CHECK:   Line info @ 35: test-parameters.cpp, line 34
-; CHECK:   Line info @ 165: test-parameters.cpp, line 35
-; CHECK: Method load [2]: _Z3foov, Size = 3
-; CHECK:   Line info @ 0: test-parameters.cpp, line 28
-; CHECK:   Line info @ 2: test-parameters.cpp, line 29
-; CHECK: Method load [3]: main, Size = 146
-; CHECK:   Line info @ 0: test-parameters.cpp, line 39
-; CHECK:   Line info @ 21: test-parameters.cpp, line 41
-; CHECK:   Line info @ 39: test-parameters.cpp, line 42
-; CHECK:   Line info @ 60: test-parameters.cpp, line 44
-; CHECK:   Line info @ 80: test-parameters.cpp, line 48
-; CHECK:   Line info @ 90: test-parameters.cpp, line 45
-; CHECK:   Line info @ 95: test-parameters.cpp, line 46
-; CHECK:   Line info @ 114: test-parameters.cpp, line 48
-; CHECK:   Line info @ 141: test-parameters.cpp, line 49
-; CHECK:   Line info @ 146: test-parameters.cpp, line 49
-; CHECK: Method unload [1]
-; CHECK: Method unload [2]
-; CHECK: Method unload [3]
-
-; ModuleID = 'test-parameters.cpp'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct.char_struct = type { i8, [2 x i8] }
-
-@compound_char = global %struct.char_struct zeroinitializer, align 1
-@_ZZ4mainE1d = private unnamed_addr constant [2 x [2 x double]] [[2 x double] [double 0.000000e+00, double 1.000000e+00], [2 x double] [double 2.000000e+00, double 3.000000e+00]], align 16
-
-define i32 @_Z3foov() nounwind uwtable {
-entry:
-  ret i32 0, !dbg !46
-}
-
-define double @_Z15test_parametersPfPA2_dR11char_structPPitm(float* %pf, [2 x double]* %ppd, %struct.char_struct* %s, i32** %ppn, i16 zeroext %us, i64 %l) nounwind uwtable {
-entry:
-  %pf.addr = alloca float*, align 8
-  %ppd.addr = alloca [2 x double]*, align 8
-  %s.addr = alloca %struct.char_struct*, align 8
-  %ppn.addr = alloca i32**, align 8
-  %us.addr = alloca i16, align 2
-  %l.addr = alloca i64, align 8
-  %result = alloca double, align 8
-  store float* %pf, float** %pf.addr, align 8
-  call void @llvm.dbg.declare(metadata !{float** %pf.addr}, metadata !48, metadata !{metadata !"0x102"}), !dbg !49
-  store [2 x double]* %ppd, [2 x double]** %ppd.addr, align 8
-  call void @llvm.dbg.declare(metadata !{[2 x double]** %ppd.addr}, metadata !50, metadata !{metadata !"0x102"}), !dbg !49
-  store %struct.char_struct* %s, %struct.char_struct** %s.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.char_struct** %s.addr}, metadata !51, metadata !{metadata !"0x102"}), !dbg !49
-  store i32** %ppn, i32*** %ppn.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i32*** %ppn.addr}, metadata !52, metadata !{metadata !"0x102"}), !dbg !49
-  store i16 %us, i16* %us.addr, align 2
-  call void @llvm.dbg.declare(metadata !{i16* %us.addr}, metadata !53, metadata !{metadata !"0x102"}), !dbg !49
-  store i64 %l, i64* %l.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64* %l.addr}, metadata !54, metadata !{metadata !"0x102"}), !dbg !49
-  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !55, metadata !{metadata !"0x102"}), !dbg !57
-  %0 = load float** %pf.addr, align 8, !dbg !57
-  %arrayidx = getelementptr inbounds float* %0, i64 0, !dbg !57
-  %1 = load float* %arrayidx, align 4, !dbg !57
-  %conv = fpext float %1 to double, !dbg !57
-  %2 = load [2 x double]** %ppd.addr, align 8, !dbg !57
-  %arrayidx1 = getelementptr inbounds [2 x double]* %2, i64 1, !dbg !57
-  %arrayidx2 = getelementptr inbounds [2 x double]* %arrayidx1, i32 0, i64 1, !dbg !57
-  %3 = load double* %arrayidx2, align 8, !dbg !57
-  %mul = fmul double %conv, %3, !dbg !57
-  %4 = load %struct.char_struct** %s.addr, align 8, !dbg !57
-  %c = getelementptr inbounds %struct.char_struct* %4, i32 0, i32 0, !dbg !57
-  %5 = load i8* %c, align 1, !dbg !57
-  %conv3 = sext i8 %5 to i32, !dbg !57
-  %conv4 = sitofp i32 %conv3 to double, !dbg !57
-  %mul5 = fmul double %mul, %conv4, !dbg !57
-  %6 = load i16* %us.addr, align 2, !dbg !57
-  %conv6 = zext i16 %6 to i32, !dbg !57
-  %conv7 = sitofp i32 %conv6 to double, !dbg !57
-  %mul8 = fmul double %mul5, %conv7, !dbg !57
-  %7 = load i64* %l.addr, align 8, !dbg !57
-  %conv9 = uitofp i64 %7 to double, !dbg !57
-  %mul10 = fmul double %mul8, %conv9, !dbg !57
-  %call = call i32 @_Z3foov(), !dbg !57
-  %conv11 = sitofp i32 %call to double, !dbg !57
-  %add = fadd double %mul10, %conv11, !dbg !57
-  store double %add, double* %result, align 8, !dbg !57
-  %8 = load double* %result, align 8, !dbg !58
-  ret double %8, !dbg !58
-}
-
-declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-
-define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %argc.addr = alloca i32, align 4
-  %argv.addr = alloca i8**, align 8
-  %s = alloca %struct.char_struct, align 1
-  %f = alloca float, align 4
-  %d = alloca [2 x [2 x double]], align 16
-  %result = alloca double, align 8
-  store i32 0, i32* %retval
-  store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !59, metadata !{metadata !"0x102"}), !dbg !60
-  store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !61, metadata !{metadata !"0x102"}), !dbg !60
-  call void @llvm.dbg.declare(metadata !{%struct.char_struct* %s}, metadata !62, metadata !{metadata !"0x102"}), !dbg !64
-  call void @llvm.dbg.declare(metadata !{float* %f}, metadata !65, metadata !{metadata !"0x102"}), !dbg !66
-  store float 0.000000e+00, float* %f, align 4, !dbg !66
-  call void @llvm.dbg.declare(metadata !{[2 x [2 x double]]* %d}, metadata !67, metadata !{metadata !"0x102"}), !dbg !70
-  %0 = bitcast [2 x [2 x double]]* %d to i8*, !dbg !70
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([2 x [2 x double]]* @_ZZ4mainE1d to i8*), i64 32, i32 16, i1 false), !dbg !70
-  %c = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 0, !dbg !71
-  store i8 97, i8* %c, align 1, !dbg !71
-  %c2 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !72
-  %arrayidx = getelementptr inbounds [2 x i8]* %c2, i32 0, i64 0, !dbg !72
-  store i8 48, i8* %arrayidx, align 1, !dbg !72
-  %c21 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !73
-  %arrayidx2 = getelementptr inbounds [2 x i8]* %c21, i32 0, i64 1, !dbg !73
-  store i8 49, i8* %arrayidx2, align 1, !dbg !73
-  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !74, metadata !{metadata !"0x102"}), !dbg !75
-  %arraydecay = getelementptr inbounds [2 x [2 x double]]* %d, i32 0, i32 0, !dbg !75
-  %call = call double @_Z15test_parametersPfPA2_dR11char_structPPitm(float* %f, [2 x double]* %arraydecay, %struct.char_struct* %s, i32** null, i16 zeroext 10, i64 42), !dbg !75
-  store double %call, double* %result, align 8, !dbg !75
-  %1 = load double* %result, align 8, !dbg !76
-  %cmp = fcmp oeq double %1, 0.000000e+00, !dbg !76
-  %cond = select i1 %cmp, i32 0, i32 -1, !dbg !76
-  ret i32 %cond, !dbg !76
-}
-
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!78}
-
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-clang2 gitosis@miro.kw.intel.com:clang.git 39450d0469e0d5589ad39fd0b20b5742750619a0) (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-llvm gitosis@miro.kw.intel.com:llvm.git 376642ed620ecae05b68c7bc81f79aeb2065abe0)\001\00\000\00\000", metadata !77, metadata !1, metadata !1, metadata !3, metadata !43, null} ; [ DW_TAG_compile_unit ] [/home/akaylor/dev/test-parameters.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{i32 0}
-!3 = metadata !{metadata !5, metadata !10, metadata !38}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\0027\000\001\000\006\00256\000\0028", metadata !77, metadata !6, metadata !7, null, i32 ()* @_Z3foov, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [foo]
-!6 = metadata !{metadata !"0x29", metadata !77} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x2e\00test_parameters\00test_parameters\00_Z15test_parametersPfPA2_dR11char_structPPitm\0032\000\001\000\006\00256\000\0033", metadata !77, metadata !6, metadata !11, null, double (float*, [2 x double]*, %struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 33] [test_parameters]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !13, metadata !14, metadata !16, metadata !20, metadata !33, metadata !35, metadata !36}
-!13 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !15} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from float]
-!15 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
-!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!17 = metadata !{metadata !"0x1\00\000\00128\0064\000\000", null, metadata !"", metadata !13, metadata !18, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double]
-!18 = metadata !{metadata !19}
-!19 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
-!20 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !21} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from char_struct]
-!21 = metadata !{metadata !"0x13\00char_struct\0022\0024\008\000\000\000", metadata !77, null, null, metadata !22, null, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [def] [from ]
-!22 = metadata !{metadata !23, metadata !25, metadata !27}
-!23 = metadata !{metadata !"0xd\00c\0023\008\008\000\000", metadata !77, metadata !21, metadata !24} ; [ DW_TAG_member ] [c] [line 23, size 8, align 8, offset 0] [from char]
-!24 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!25 = metadata !{metadata !"0xd\00c2\0024\0016\008\008\000", metadata !77, metadata !21, metadata !26} ; [ DW_TAG_member ] [c2] [line 24, size 16, align 8, offset 8] [from ]
-!26 = metadata !{metadata !"0x1\00\000\0016\008\000\000", null, metadata !"", metadata !24, metadata !18, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char]
-!27 = metadata !{metadata !"0x2e\00char_struct\00char_struct\00\0022\000\000\000\006\00320\000\0022", metadata !77, metadata !21, metadata !28, null, null, null, i32 0, metadata !31} ; [ DW_TAG_subprogram ] [line 22] [char_struct]
-!28 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!29 = metadata !{null, metadata !30}
-!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, metadata !"", metadata !21} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char_struct]
-!31 = metadata !{metadata !32}
-!32 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !34} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!34 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!35 = metadata !{metadata !"0x24\00unsigned short\000\0016\0016\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
-!36 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, metadata !"", metadata !37} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long unsigned int]
-!37 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!38 = metadata !{metadata !"0x2e\00main\00main\00\0038\000\001\000\006\00256\000\0039", metadata !77, metadata !6, metadata !39, null, i32 (i32, i8**)* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 38] [def] [scope 39] [main]
-!39 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !40, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!40 = metadata !{metadata !9, metadata !9, metadata !41}
-!41 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !42} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!42 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!43 = metadata !{metadata !45}
-!45 = metadata !{metadata !"0x34\00compound_char\00compound_char\00\0025\000\001", null, metadata !6, metadata !21, %struct.char_struct* @compound_char, null} ; [ DW_TAG_variable ] [compound_char] [line 25] [def]
-!46 = metadata !{i32 29, i32 0, metadata !47, null}
-!47 = metadata !{metadata !"0xb\0028\000\000", metadata !77, metadata !5} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp]
-!48 = metadata !{metadata !"0x101\00pf\0016777248\000", metadata !10, metadata !6, metadata !14} ; [ DW_TAG_arg_variable ] [pf] [line 32]
-!49 = metadata !{i32 32, i32 0, metadata !10, null}
-!50 = metadata !{metadata !"0x101\00ppd\0033554464\000", metadata !10, metadata !6, metadata !16} ; [ DW_TAG_arg_variable ] [ppd] [line 32]
-!51 = metadata !{metadata !"0x101\00s\0050331680\000", metadata !10, metadata !6, metadata !20} ; [ DW_TAG_arg_variable ] [s] [line 32]
-!52 = metadata !{metadata !"0x101\00ppn\0067108896\000", metadata !10, metadata !6, metadata !33} ; [ DW_TAG_arg_variable ] [ppn] [line 32]
-!53 = metadata !{metadata !"0x101\00us\0083886112\000", metadata !10, metadata !6, metadata !35} ; [ DW_TAG_arg_variable ] [us] [line 32]
-!54 = metadata !{metadata !"0x101\00l\00100663328\000", metadata !10, metadata !6, metadata !36} ; [ DW_TAG_arg_variable ] [l] [line 32]
-!55 = metadata !{metadata !"0x100\00result\0034\000", metadata !56, metadata !6, metadata !13} ; [ DW_TAG_auto_variable ] [result] [line 34]
-!56 = metadata !{metadata !"0xb\0033\000\001", metadata !77, metadata !10} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp]
-!57 = metadata !{i32 34, i32 0, metadata !56, null}
-!58 = metadata !{i32 35, i32 0, metadata !56, null}
-!59 = metadata !{metadata !"0x101\00argc\0016777254\000", metadata !38, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [argc] [line 38]
-!60 = metadata !{i32 38, i32 0, metadata !38, null}
-!61 = metadata !{metadata !"0x101\00argv\0033554470\000", metadata !38, metadata !6, metadata !41} ; [ DW_TAG_arg_variable ] [argv] [line 38]
-!62 = metadata !{metadata !"0x100\00s\0040\000", metadata !63, metadata !6, metadata !21} ; [ DW_TAG_auto_variable ] [s] [line 40]
-!63 = metadata !{metadata !"0xb\0039\000\002", metadata !77, metadata !38} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp]
-!64 = metadata !{i32 40, i32 0, metadata !63, null}
-!65 = metadata !{metadata !"0x100\00f\0041\000", metadata !63, metadata !6, metadata !15} ; [ DW_TAG_auto_variable ] [f] [line 41]
-!66 = metadata !{i32 41, i32 0, metadata !63, null}
-!67 = metadata !{metadata !"0x100\00d\0042\000", metadata !63, metadata !6, metadata !68} ; [ DW_TAG_auto_variable ] [d] [line 42]
-!68 = metadata !{metadata !"0x1\00\000\00256\0064\000\000", null, metadata !"", metadata !13, metadata !69, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double]
-!69 = metadata !{metadata !19, metadata !19}
-!70 = metadata !{i32 42, i32 0, metadata !63, null}
-!71 = metadata !{i32 44, i32 0, metadata !63, null}
-!72 = metadata !{i32 45, i32 0, metadata !63, null}
-!73 = metadata !{i32 46, i32 0, metadata !63, null}
-!74 = metadata !{metadata !"0x100\00result\0048\000", metadata !63, metadata !6, metadata !13} ; [ DW_TAG_auto_variable ] [result] [line 48]
-!75 = metadata !{i32 48, i32 0, metadata !63, null}
-!76 = metadata !{i32 49, i32 0, metadata !63, null}
-!77 = metadata !{metadata !"test-parameters.cpp", metadata !"/home/akaylor/dev"}
-!78 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/LTO/ARM/inline-asm.ll b/test/LTO/ARM/inline-asm.ll
new file mode 100644
index 0000000..23fb904
--- /dev/null
+++ b/test/LTO/ARM/inline-asm.ll
@@ -0,0 +1,9 @@
+; Check that we don't crash on target-specific inline asm directives.
+;
+; RUN: llvm-as < %s > %t
+; RUN: llvm-lto -o /dev/null %t -mcpu armv4t
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-unknown-linux"
+
+module asm ".fnstart"
diff --git a/test/LTO/ARM/lit.local.cfg b/test/LTO/ARM/lit.local.cfg
new file mode 100644
index 0000000..20e19ae
--- /dev/null
+++ b/test/LTO/ARM/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'ARM' in config.root.targets:
+  config.unsupported = True
diff --git a/test/LTO/ARM/runtime-library-subtarget.ll b/test/LTO/ARM/runtime-library-subtarget.ll
new file mode 100644
index 0000000..aab1d90
--- /dev/null
+++ b/test/LTO/ARM/runtime-library-subtarget.ll
@@ -0,0 +1,18 @@
+; Check that user-defined runtime library function __addsf3vfp is not removed
+;
+; RUN: llvm-as <%s >%t1
+; RUN: llvm-lto -o %t2 %t1 -mcpu arm1176jz-s
+; RUN: llvm-nm %t2 | FileCheck %s
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7-apple-ios"
+
+; CHECK: ___addsf3vfp
+
+define float @__addsf3vfp(float %a, float %b) #0 {
+entry:
+  %add = fadd float %a, %b
+  ret float %add
+}
+
+attributes #0 = { "target-cpu"="arm1176jzf-s"}
diff --git a/test/LTO/Inputs/bcsection.macho.s b/test/LTO/X86/Inputs/bcsection.macho.s
index cb7fe03..cb7fe03 100644
--- a/test/LTO/Inputs/bcsection.macho.s
+++ b/test/LTO/X86/Inputs/bcsection.macho.s
diff --git a/test/LTO/Inputs/bcsection.s b/test/LTO/X86/Inputs/bcsection.s
index ede1e5c..ede1e5c 100644
--- a/test/LTO/Inputs/bcsection.s
+++ b/test/LTO/X86/Inputs/bcsection.s
diff --git a/test/LTO/X86/Inputs/invalid.ll.bc b/test/LTO/X86/Inputs/invalid.ll.bc
new file mode 100644
index 0000000..a85c364
--- /dev/null
+++ b/test/LTO/X86/Inputs/invalid.ll.bc
diff --git a/test/LTO/X86/Inputs/list-symbols.ll b/test/LTO/X86/Inputs/list-symbols.ll
new file mode 100644
index 0000000..9443d53
--- /dev/null
+++ b/test/LTO/X86/Inputs/list-symbols.ll
@@ -0,0 +1,4 @@
+@glob = global i32 0
+define void @bar() {
+  ret void
+}
diff --git a/test/LTO/attrs.ll b/test/LTO/X86/attrs.ll
index d196747..d196747 100644
--- a/test/LTO/attrs.ll
+++ b/test/LTO/X86/attrs.ll
diff --git a/test/LTO/bcsection.ll b/test/LTO/X86/bcsection.ll
index e65ade6..e65ade6 100644
--- a/test/LTO/bcsection.ll
+++ b/test/LTO/X86/bcsection.ll
diff --git a/test/LTO/X86/cfi_endproc.ll b/test/LTO/X86/cfi_endproc.ll
new file mode 100644
index 0000000..1a69bf6
--- /dev/null
+++ b/test/LTO/X86/cfi_endproc.ll
@@ -0,0 +1,42 @@
+; RUN: llvm-as < %s >%t1
+; RUN: llvm-lto -o %t2 %t1
+; RUN: llvm-nm %t2 | FileCheck %s -check-prefix=NOEXPORT
+; RUN: llvm-lto -o %t3 -exported-symbol=main %t1
+; RUN: llvm-nm %t3 | FileCheck %s -check-prefix=EXPORT
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+module asm ".text"
+module asm ".align 16, 0x90"
+module asm ".type PR14512, @function"
+module asm "PR14512:.cfi_startproc"
+module asm "ret"
+module asm ".cfi_endproc"
+
+declare void @PR14512()
+
+; Without -exported-symbol, main should be eliminated by LTO.
+; With -exported-symbol=main, main should be preserved by LTO.
+define i32 @main(i32 %argc, i8** %argv) {
+; NOEXPORT-NOT: main
+; EXPORT: main
+  call void @PR14512()
+  ret i32 0
+}
+
+; RUN: llvm-lto -o %t -dso-symbol=zed1 -dso-symbol=zed2 %t1 -disable-opt
+; RUN: llvm-nm %t | FileCheck %s -check-prefix=ZED1_AND_ZED2
+; ZED1_AND_ZED2: V zed1
+@zed1 = linkonce_odr global i32 42
+define i32* @get_zed1() {
+  ret i32* @zed1
+}
+
+; ZED1_AND_ZED2: d zed2
+@zed2 = linkonce_odr unnamed_addr global i32 42
+
+define i32 @useZed2() {
+  %x = load i32* @zed2
+  ret i32 %x
+}
diff --git a/test/LTO/current-section.ll b/test/LTO/X86/current-section.ll
index f79b378..f79b378 100644
--- a/test/LTO/current-section.ll
+++ b/test/LTO/X86/current-section.ll
diff --git a/test/LTO/diagnostic-handler-remarks.ll b/test/LTO/X86/diagnostic-handler-remarks.ll
index 4da9101..4da9101 100644
--- a/test/LTO/diagnostic-handler-remarks.ll
+++ b/test/LTO/X86/diagnostic-handler-remarks.ll
diff --git a/test/LTO/X86/invalid.ll b/test/LTO/X86/invalid.ll
new file mode 100644
index 0000000..5b6996d
--- /dev/null
+++ b/test/LTO/X86/invalid.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-lto %S/Inputs/invalid.ll.bc 2>&1 | FileCheck %s
+
+
+; CHECK: llvm-lto{{.*}}: error loading file '{{.*}}/Inputs/invalid.ll.bc': Unknown attribute kind (48)
diff --git a/test/LTO/jump-table-type.ll b/test/LTO/X86/jump-table-type.ll
index a806c30..a806c30 100644
--- a/test/LTO/jump-table-type.ll
+++ b/test/LTO/X86/jump-table-type.ll
diff --git a/test/LTO/keep-used-puts-during-instcombine.ll b/test/LTO/X86/keep-used-puts-during-instcombine.ll
index 69ce3ee..69ce3ee 100644
--- a/test/LTO/keep-used-puts-during-instcombine.ll
+++ b/test/LTO/X86/keep-used-puts-during-instcombine.ll
diff --git a/test/LTO/X86/linkonce_odr_func.ll b/test/LTO/X86/linkonce_odr_func.ll
new file mode 100644
index 0000000..48da795
--- /dev/null
+++ b/test/LTO/X86/linkonce_odr_func.ll
@@ -0,0 +1,61 @@
+; RUN: llvm-as < %s >%t1
+; RUN: llvm-lto -o %t2 -dso-symbol=foo1 -dso-symbol=foo2 -dso-symbol=foo3 \
+; RUN:     -dso-symbol=foo4 -dso-symbol=v1 -dso-symbol=v2 %t1 -disable-opt
+; RUN: llvm-nm %t2 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: t foo1
+define linkonce_odr void @foo1() noinline {
+  ret void
+}
+
+; CHECK: W foo2
+define linkonce_odr void @foo2() noinline {
+  ret void
+}
+
+; CHECK: t foo3
+define linkonce_odr void @foo3() noinline {
+  ret void
+}
+
+; CHECK: W foo4
+define linkonce_odr void @foo4() noinline {
+  ret void
+}
+
+; CHECK: r v1
+@v1 = linkonce_odr constant i32 32
+
+define i32 @useV1() {
+  %x = load i32* @v1
+  ret i32 %x
+}
+
+; CHECK: V v2
+@v2 = linkonce_odr global i32 32
+
+define i32 @useV2() {
+  %x = load i32* @v2
+  ret i32 %x
+}
+
+declare void @f(void()*)
+
+declare void @p()
+
+define void @bar() {
+bb0:
+  call void @foo1()
+  call void @f(void()* @foo2)
+  invoke void @foo3() to label %bb1 unwind label %clean
+bb1:
+  invoke void @f(void()* @foo4) to label %bb2 unwind label %clean
+bb2:
+  ret void
+clean:
+  landingpad {i32, i32} personality void()* @p cleanup
+  ret void
+}
diff --git a/test/LTO/X86/list-symbols.ll b/test/LTO/X86/list-symbols.ll
new file mode 100644
index 0000000..41b7d00
--- /dev/null
+++ b/test/LTO/X86/list-symbols.ll
@@ -0,0 +1,15 @@
+; RUN: llvm-as -o %T/1.bc %s
+; RUN: llvm-as -o %T/2.bc %S/Inputs/list-symbols.ll
+; RUN: llvm-lto -list-symbols-only %T/1.bc %T/2.bc | FileCheck %s
+
+; CHECK-LABEL: 1.bc:
+; CHECK-DAG: foo
+; CHECK-DAG: glob
+; CHECK-LABEL: 2.bc:
+; CHECK-DAG: glob
+; CHECK-DAG: bar
+
+@glob = global i32 0
+define void @foo() {
+  ret void
+}
diff --git a/test/LTO/lit.local.cfg b/test/LTO/X86/lit.local.cfg
index afde89b..afde89b 100644
--- a/test/LTO/lit.local.cfg
+++ b/test/LTO/X86/lit.local.cfg
diff --git a/test/LTO/no-undefined-puts-when-implemented.ll b/test/LTO/X86/no-undefined-puts-when-implemented.ll
index 29db8a6..29db8a6 100644
--- a/test/LTO/no-undefined-puts-when-implemented.ll
+++ b/test/LTO/X86/no-undefined-puts-when-implemented.ll
diff --git a/test/LTO/private-symbol.ll b/test/LTO/X86/private-symbol.ll
index e13a393..e13a393 100644
--- a/test/LTO/private-symbol.ll
+++ b/test/LTO/X86/private-symbol.ll
diff --git a/test/LTO/runtime-library.ll b/test/LTO/X86/runtime-library.ll
index 76fc6f0..76fc6f0 100644
--- a/test/LTO/runtime-library.ll
+++ b/test/LTO/X86/runtime-library.ll
diff --git a/test/LTO/X86/set-merged.ll b/test/LTO/X86/set-merged.ll
new file mode 100644
index 0000000..0e2e1ea
--- /dev/null
+++ b/test/LTO/X86/set-merged.ll
@@ -0,0 +1,36 @@
+; RUN: llvm-as < %s >%t1
+; RUN: llvm-lto -exported-symbol=_main -set-merged-module -o %t2 %t1
+; RUN: llvm-objdump -d %t2 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK: _main
+; CHECK: movl $132
+define i32 @_Z3fooi(i32 %a) {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  %0 = load i32* %a.addr, align 4
+  %1 = load i32* %a.addr, align 4
+  %call = call i32 @_Z4bar2i(i32 %1)
+  %add = add nsw i32 %0, %call
+  ret i32 %add
+}
+
+define i32 @_Z4bar2i(i32 %a) {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  %0 = load i32* %a.addr, align 4
+  %mul = mul nsw i32 2, %0
+  ret i32 %mul
+}
+
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  %call = call i32 @_Z3fooi(i32 44)
+  ret i32 %call
+}
diff --git a/test/LTO/symver-asm.ll b/test/LTO/X86/symver-asm.ll
index 03dda2b..03dda2b 100644
--- a/test/LTO/symver-asm.ll
+++ b/test/LTO/X86/symver-asm.ll
diff --git a/test/LTO/triple-init.ll b/test/LTO/X86/triple-init.ll
index e0ad879..e0ad879 100644
--- a/test/LTO/triple-init.ll
+++ b/test/LTO/X86/triple-init.ll
diff --git a/test/LTO/cfi_endproc.ll b/test/LTO/cfi_endproc.ll
deleted file mode 100644
index a5cc649..0000000
--- a/test/LTO/cfi_endproc.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; RUN: llvm-as < %s >%t1
-; RUN: llvm-lto -o %t2 %t1
-; RUN: llvm-nm %t2 | FileCheck %s -check-prefix=NOEXPORT
-; RUN: llvm-lto -o %t3 -exported-symbol=main %t1
-; RUN: llvm-nm %t3 | FileCheck %s -check-prefix=EXPORT
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-module asm ".text"
-module asm ".align 16, 0x90"
-module asm ".type PR14512, @function"
-module asm "PR14512:.cfi_startproc"
-module asm "ret"
-module asm ".cfi_endproc"
-
-declare void @PR14512()
-
-; Without -exported-symbol, main should be eliminated by LTO.
-; With -exported-symbol=main, main should be preserved by LTO.
-define i32 @main(i32 %argc, i8** %argv) {
-; NOEXPORT-NOT: main
-; EXPORT: main
-  call void @PR14512()
-  ret i32 0
-}
-
-; RUN: llvm-lto -o %t -dso-symbol=zed1 -dso-symbol=zed2 %t1 -disable-opt
-; RUN: llvm-nm %t | FileCheck %s -check-prefix=ZED1_AND_ZED2
-; ZED1_AND_ZED2: V zed1
-@zed1 = linkonce_odr global i32 42
-define i32* @get_zed1() {
-  ret i32* @zed1
-}
-
-; ZED1_AND_ZED2: d zed2
-@zed2 = linkonce_odr unnamed_addr global i32 42
diff --git a/test/LTO/linkonce_odr_func.ll b/test/LTO/linkonce_odr_func.ll
deleted file mode 100644
index a67ffc0..0000000
--- a/test/LTO/linkonce_odr_func.ll
+++ /dev/null
@@ -1,51 +0,0 @@
-; RUN: llvm-as < %s >%t1
-; RUN: llvm-lto -o %t2 -dso-symbol=foo1 -dso-symbol=foo2 -dso-symbol=foo3 \
-; RUN:     -dso-symbol=foo4 -dso-symbol=v1 -dso-symbol=v2 %t1 -disable-opt
-; RUN: llvm-nm %t2 | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; CHECK: t foo1
-define linkonce_odr void @foo1() noinline {
-  ret void
-}
-
-; CHECK: W foo2
-define linkonce_odr void @foo2() noinline {
-  ret void
-}
-
-; CHECK: t foo3
-define linkonce_odr void @foo3() noinline {
-  ret void
-}
-
-; CHECK: W foo4
-define linkonce_odr void @foo4() noinline {
-  ret void
-}
-
-; CHECK: r v1
-@v1 = linkonce_odr constant i32 32
-
-; CHECK: V v2
-@v2 = linkonce_odr global i32 32
-
-declare void @f(void()*)
-
-declare void @p()
-
-define void @bar() {
-bb0:
-  call void @foo1()
-  call void @f(void()* @foo2)
-  invoke void @foo3() to label %bb1 unwind label %clean
-bb1:
-  invoke void @f(void()* @foo4) to label %bb2 unwind label %clean
-bb2:
-  ret void
-clean:
-  landingpad {i32, i32} personality void()* @p cleanup
-  ret void
-}
diff --git a/test/Linker/2006-06-15-GlobalVarAlignment.ll b/test/Linker/2006-06-15-GlobalVarAlignment.ll
deleted file mode 100644
index c9f9b0e..0000000
--- a/test/Linker/2006-06-15-GlobalVarAlignment.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; The linker should choose the largest alignment when linking.
-
-; RUN: echo "@X = global i32 7, align 8" | llvm-as > %t.2.bc
-; RUN: llvm-as < %s > %t.1.bc
-; RUN: llvm-link %t.1.bc %t.2.bc -S | FileCheck %s
-; CHECK: align 8
-
-@X = weak global i32 7, align 4
diff --git a/test/Linker/2009-09-03-mdnode.ll b/test/Linker/2009-09-03-mdnode.ll
index d9871b2..1f308e7 100644
--- a/test/Linker/2009-09-03-mdnode.ll
+++ b/test/Linker/2009-09-03-mdnode.ll
@@ -26,6 +26,6 @@ declare void @llvm.dbg.stoppoint(i32, i32, metadata) nounwind readnone
 
 declare void @llvm.dbg.region.end(metadata) nounwind readnone
 
-!0 = metadata !{metadata !"0x2e\00main\00main\00main\002\000\001\000\006\000\000\000", i32 0, metadata !1, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x11\0012\00ellcc 0.1.0\001\00\000\00\000", metadata !2, null, null, null, null, null} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{metadata !"a.c", metadata !"/home/rich/ellcc/test/source"}
+!0 = !{!"0x2e\00main\00main\00main\002\000\001\000\006\000\000\000", i32 0, !1, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x11\0012\00ellcc 0.1.0\001\00\000\00\000", !2, null, null, null, null, null} ; [ DW_TAG_compile_unit ]
+!2 = !{!"a.c", !"/home/rich/ellcc/test/source"}
diff --git a/test/Linker/2009-09-03-mdnode2.ll b/test/Linker/2009-09-03-mdnode2.ll
index b01f947..68e3294 100644
--- a/test/Linker/2009-09-03-mdnode2.ll
+++ b/test/Linker/2009-09-03-mdnode2.ll
@@ -21,6 +21,6 @@ declare void @llvm.dbg.stoppoint(i32, i32, metadata) nounwind readnone
 
 declare void @llvm.dbg.region.end(metadata) nounwind readnone
 
-!0 = metadata !{metadata !"0x2e\00f\00f\00f\001\000\001\000\006\000\000\000", i32 0, metadata !1, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x11\0012\00ellcc 0.1.0\001\00\000\00\000", metadata !2, null, null, null, null, null} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{metadata !"b.c", metadata !"/home/rich/ellcc/test/source"}
+!0 = !{!"0x2e\00f\00f\00f\001\000\001\000\006\000\000\000", i32 0, !1, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x11\0012\00ellcc 0.1.0\001\00\000\00\000", !2, null, null, null, null, null} ; [ DW_TAG_compile_unit ]
+!2 = !{!"b.c", !"/home/rich/ellcc/test/source"}
diff --git a/test/Linker/2011-08-04-DebugLoc.ll b/test/Linker/2011-08-04-DebugLoc.ll
index a9307af..85b9e17 100644
--- a/test/Linker/2011-08-04-DebugLoc.ll
+++ b/test/Linker/2011-08-04-DebugLoc.ll
@@ -17,15 +17,15 @@ define i32 @foo() nounwind ssp {
 !llvm.module.flags = !{!11}
 !llvm.dbg.sp = !{!1}
 
-!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-209.11) (based on LLVM 3.0svn)\001\00\000\00\000", metadata !8, metadata !9, metadata !9, metadata !10, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\000\000\000", metadata !8, metadata !2, metadata !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!2 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !8, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 2, i32 13, metadata !7, null}
-!7 = metadata !{metadata !"0xb\002\0011\000", metadata !8, metadata !1} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
-!9 = metadata !{i32 0}
-!10 = metadata !{metadata !1}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-209.11) (based on LLVM 3.0svn)\001\00\000\00\000", !8, !9, !9, !10, null, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00foo\00foo\00\002\000\001\000\006\000\000\000", !8, !2, !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!2 = !{!"0x29", !8} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !8, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!6 = !MDLocation(line: 2, column: 13, scope: !7)
+!7 = !{!"0xb\002\0011\000", !8, !1} ; [ DW_TAG_lexical_block ]
+!8 = !{!"a.c", !"/private/tmp"}
+!9 = !{i32 0}
+!10 = !{!1}
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/2011-08-04-DebugLoc2.ll b/test/Linker/2011-08-04-DebugLoc2.ll
index 948dd18..a3cd001 100644
--- a/test/Linker/2011-08-04-DebugLoc2.ll
+++ b/test/Linker/2011-08-04-DebugLoc2.ll
@@ -14,15 +14,15 @@ define i32 @bar() nounwind ssp {
 !llvm.module.flags = !{!11}
 !llvm.dbg.sp = !{!1}
 
-!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-209.11) (based on LLVM 3.0svn)\001\00\000\00\000", metadata !8, metadata !9, metadata !9, metadata !10, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00bar\00bar\00\001\000\001\000\006\000\000\000", metadata !8, metadata !2, metadata !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [bar]
-!2 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !8, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 1, i32 13, metadata !7, null}
-!7 = metadata !{metadata !"0xb\001\0011\000", metadata !8, metadata !1} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{metadata !"b.c", metadata !"/private/tmp"}
-!9 = metadata !{i32 0}
-!10 = metadata !{metadata !1}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-209.11) (based on LLVM 3.0svn)\001\00\000\00\000", !8, !9, !9, !10, null, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00bar\00bar\00\001\000\001\000\006\000\000\000", !8, !2, !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [bar]
+!2 = !{!"0x29", !8} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !8, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!6 = !MDLocation(line: 1, column: 13, scope: !7)
+!7 = !{!"0xb\001\0011\000", !8, !1} ; [ DW_TAG_lexical_block ]
+!8 = !{!"b.c", !"/private/tmp"}
+!9 = !{i32 0}
+!10 = !{!1}
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/2011-08-04-Metadata.ll b/test/Linker/2011-08-04-Metadata.ll
index 7bdbb33..da98f20 100644
--- a/test/Linker/2011-08-04-Metadata.ll
+++ b/test/Linker/2011-08-04-Metadata.ll
@@ -2,8 +2,8 @@
 ; RUN: llvm-dis < %t.bc | FileCheck %s
 ; Test if internal global variable's debug info is merged appropriately or not.
 
-;CHECK:  metadata !{metadata !"0x34\00x\00x\00\002\001\001", metadata !{{[0-9]+}}, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}, i32* @x}
-;CHECK:  metadata !{metadata !"0x34\00x\00x\00\001\001\001", metadata !{{[0-9]+}}, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}, i32* @x1}
+;CHECK:   !"0x34\00x\00x\00\002\001\001", !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, i32* @x}
+;CHECK:   !"0x34\00x\00x\00\001\001\001", !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, i32* @x1}
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-macosx10.7.0"
 
@@ -20,15 +20,15 @@ entry:
 !llvm.dbg.sp = !{!1}
 !llvm.dbg.gv = !{!5}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", metadata !9, metadata !4, metadata !4, metadata !10, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\000\000\000", metadata !9, metadata !2, metadata !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [foo]
-!2 = metadata !{metadata !"0x29", metadata !9} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !9, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{metadata !"0x34\00x\00x\00\002\001\001", metadata !0, metadata !2, metadata !6, i32* @x} ; [ DW_TAG_variable ]
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 3, i32 14, metadata !8, null}
-!8 = metadata !{metadata !"0xb\003\0012\000", metadata !9, metadata !1} ; [ DW_TAG_lexical_block ]
-!9 = metadata !{metadata !"/tmp/one.c", metadata !"/Volumes/Lalgate/Slate/D"}
-!10 = metadata !{metadata !1}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", !9, !4, !4, !10, null, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00foo\00foo\00\003\000\001\000\006\000\000\000", !9, !2, !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [foo]
+!2 = !{!"0x29", !9} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !9, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!5 = !{!"0x34\00x\00x\00\002\001\001", !0, !2, !6, i32* @x} ; [ DW_TAG_variable ]
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!7 = !MDLocation(line: 3, column: 14, scope: !8)
+!8 = !{!"0xb\003\0012\000", !9, !1} ; [ DW_TAG_lexical_block ]
+!9 = !{!"/tmp/one.c", !"/Volumes/Lalgate/Slate/D"}
+!10 = !{!1}
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/2011-08-04-Metadata2.ll b/test/Linker/2011-08-04-Metadata2.ll
index fcf72aa..fb196d9 100644
--- a/test/Linker/2011-08-04-Metadata2.ll
+++ b/test/Linker/2011-08-04-Metadata2.ll
@@ -19,15 +19,15 @@ entry:
 !llvm.dbg.sp = !{!1}
 !llvm.dbg.gv = !{!5}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", metadata !9, metadata !4, metadata !4, metadata !10, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00bar\00bar\00\002\000\001\000\006\000\000\000", metadata !9, metadata !2, metadata !3, null, void ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [bar]
-!2 = metadata !{metadata !"0x29", metadata !9} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !9, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{metadata !"0x34\00x\00x\00\001\001\001", metadata !0, metadata !2, metadata !6, i32* @x} ; [ DW_TAG_variable ]
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 2, i32 14, metadata !8, null}
-!8 = metadata !{metadata !"0xb\002\0012\000", metadata !9, metadata !1} ; [ DW_TAG_lexical_block ]
-!9 = metadata !{metadata !"/tmp/two.c", metadata !"/Volumes/Lalgate/Slate/D"}
-!10 = metadata !{metadata !1}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", !9, !4, !4, !10, null, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00bar\00bar\00\002\000\001\000\006\000\000\000", !9, !2, !3, null, void ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [bar]
+!2 = !{!"0x29", !9} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !9, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!5 = !{!"0x34\00x\00x\00\001\001\001", !0, !2, !6, i32* @x} ; [ DW_TAG_variable ]
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!7 = !MDLocation(line: 2, column: 14, scope: !8)
+!8 = !{!"0xb\002\0012\000", !9, !1} ; [ DW_TAG_lexical_block ]
+!9 = !{!"/tmp/two.c", !"/Volumes/Lalgate/Slate/D"}
+!10 = !{!1}
+!11 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/2011-08-18-unique-class-type.ll b/test/Linker/2011-08-18-unique-class-type.ll
index 6fa2126..a8f1350 100644
--- a/test/Linker/2011-08-18-unique-class-type.ll
+++ b/test/Linker/2011-08-18-unique-class-type.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.7.0"
 define void @_Z3fooN2N11AE() nounwind uwtable ssp {
 entry:
   %mya = alloca %"class.N1::A", align 1
-  call void @llvm.dbg.declare(metadata !{%"class.N1::A"* %mya}, metadata !9, metadata !{metadata !"0x102"}), !dbg !13
+  call void @llvm.dbg.declare(metadata %"class.N1::A"* %mya, metadata !9, metadata !{!"0x102"}), !dbg !13
   ret void, !dbg !14
 }
 
@@ -20,21 +20,21 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.0 (trunk 137954)\001\00\000\00\000", metadata !16, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooN2N11AE\004\000\001\000\006\00256\000\000", metadata !16, metadata !6, metadata !7, null, void ()* @_Z3fooN2N11AE, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [foo]
-!6 = metadata !{metadata !"0x29", metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !16, metadata !6, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!9 = metadata !{metadata !"0x101\00mya\0016777220\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{metadata !"0x2\00A\003\008\008\000\000\000", metadata !17, metadata !11, null, metadata !2, null, null, null} ; [ DW_TAG_class_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
-!11 = metadata !{metadata !"0x39\00N1\002", metadata !17, null} ; [ DW_TAG_namespace ]
-!12 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
-!13 = metadata !{i32 4, i32 12, metadata !5, null}
-!14 = metadata !{i32 4, i32 18, metadata !15, null}
-!15 = metadata !{metadata !"0xb\004\0017\000", metadata !16, metadata !5} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{metadata !"n1.c", metadata !"/private/tmp"}
-!17 = metadata !{metadata !"./n.h", metadata !"/private/tmp"}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.0 (trunk 137954)\001\00\000\00\000", !16, !2, !2, !3, !2, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!2}
+!2 = !{i32 0}
+!3 = !{!5}
+!5 = !{!"0x2e\00foo\00foo\00_Z3fooN2N11AE\004\000\001\000\006\00256\000\000", !16, !6, !7, null, void ()* @_Z3fooN2N11AE, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [foo]
+!6 = !{!"0x29", !16} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", !16, !6, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{!"0x101\00mya\0016777220\000", !5, !6, !10} ; [ DW_TAG_arg_variable ]
+!10 = !{!"0x2\00A\003\008\008\000\000\000", !17, !11, null, !2, null, null, null} ; [ DW_TAG_class_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
+!11 = !{!"0x39\00N1\002", !17, null} ; [ DW_TAG_namespace ]
+!12 = !{!"0x29", !17} ; [ DW_TAG_file_type ]
+!13 = !MDLocation(line: 4, column: 12, scope: !5)
+!14 = !MDLocation(line: 4, column: 18, scope: !15)
+!15 = !{!"0xb\004\0017\000", !16, !5} ; [ DW_TAG_lexical_block ]
+!16 = !{!"n1.c", !"/private/tmp"}
+!17 = !{!"./n.h", !"/private/tmp"}
+!18 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/2011-08-18-unique-class-type2.ll b/test/Linker/2011-08-18-unique-class-type2.ll
index 97fdcd0..dd0df58 100644
--- a/test/Linker/2011-08-18-unique-class-type2.ll
+++ b/test/Linker/2011-08-18-unique-class-type2.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.7.0"
 define void @_Z3barN2N11AE() nounwind uwtable ssp {
 entry:
   %youra = alloca %"class.N1::A", align 1
-  call void @llvm.dbg.declare(metadata !{%"class.N1::A"* %youra}, metadata !9, metadata !{metadata !"0x102"}), !dbg !13
+  call void @llvm.dbg.declare(metadata %"class.N1::A"* %youra, metadata !9, metadata !{!"0x102"}), !dbg !13
   ret void, !dbg !14
 }
 
@@ -18,21 +18,21 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.0 (trunk 137954)\001\00\000\00\000", metadata !16, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barN2N11AE\004\000\001\000\006\00256\000\000", i32 0, metadata !6, metadata !7, null, void ()* @_Z3barN2N11AE, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [bar]
-!6 = metadata !{metadata !"0x29", metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !16, metadata !6, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!9 = metadata !{metadata !"0x101\00youra\0016777220\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{metadata !"0x2\00A\003\008\008\000\000\000", metadata !17, metadata !11, null, metadata !2, null, null, null} ; [ DW_TAG_class_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
-!11 = metadata !{metadata !"0x39\00N1\002", metadata !17, null} ; [ DW_TAG_namespace ]
-!12 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
-!13 = metadata !{i32 4, i32 12, metadata !5, null}
-!14 = metadata !{i32 4, i32 20, metadata !15, null}
-!15 = metadata !{metadata !"0xb\004\0019\000", metadata !16, metadata !5} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{metadata !"n2.c", metadata !"/private/tmp"}
-!17 = metadata !{metadata !"./n.h", metadata !"/private/tmp"}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.0 (trunk 137954)\001\00\000\00\000", !16, !2, !2, !3, !2, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!2}
+!2 = !{i32 0}
+!3 = !{!5}
+!5 = !{!"0x2e\00bar\00bar\00_Z3barN2N11AE\004\000\001\000\006\00256\000\000", i32 0, !6, !7, null, void ()* @_Z3barN2N11AE, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [bar]
+!6 = !{!"0x29", !16} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", !16, !6, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{!"0x101\00youra\0016777220\000", !5, !6, !10} ; [ DW_TAG_arg_variable ]
+!10 = !{!"0x2\00A\003\008\008\000\000\000", !17, !11, null, !2, null, null, null} ; [ DW_TAG_class_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
+!11 = !{!"0x39\00N1\002", !17, null} ; [ DW_TAG_namespace ]
+!12 = !{!"0x29", !17} ; [ DW_TAG_file_type ]
+!13 = !MDLocation(line: 4, column: 12, scope: !5)
+!14 = !MDLocation(line: 4, column: 20, scope: !15)
+!15 = !{!"0xb\004\0019\000", !16, !5} ; [ DW_TAG_lexical_block ]
+!16 = !{!"n2.c", !"/private/tmp"}
+!17 = !{!"./n.h", !"/private/tmp"}
+!18 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/2011-08-18-unique-debug-type.ll b/test/Linker/2011-08-18-unique-debug-type.ll
index e9dcf87..c1b3a1d 100644
--- a/test/Linker/2011-08-18-unique-debug-type.ll
+++ b/test/Linker/2011-08-18-unique-debug-type.ll
@@ -12,16 +12,16 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 137954)\001\00\000\00\000", metadata !12, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\000\000\000", metadata !12, metadata !6, metadata !7, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
-!6 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !6, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 1, i32 13, metadata !11, null}
-!11 = metadata !{metadata !"0xb\001\0011\000", metadata !12, metadata !5} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.0 (trunk 137954)\001\00\000\00\000", !12, !2, !2, !3, !2, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!2}
+!2 = !{i32 0}
+!3 = !{!5}
+!5 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\000\000\000", !12, !6, !7, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
+!6 = !{!"0x29", !12} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", !12, !6, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = !MDLocation(line: 1, column: 13, scope: !11)
+!11 = !{!"0xb\001\0011\000", !12, !5} ; [ DW_TAG_lexical_block ]
+!12 = !{!"one.c", !"/private/tmp"}
+!13 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/2011-08-18-unique-debug-type2.ll b/test/Linker/2011-08-18-unique-debug-type2.ll
index 7bbed9f..49c0a5c 100644
--- a/test/Linker/2011-08-18-unique-debug-type2.ll
+++ b/test/Linker/2011-08-18-unique-debug-type2.ll
@@ -12,16 +12,16 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 137954)\001\00\000\00\000", metadata !12, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00bar\00bar\00\001\000\001\000\006\000\000\000", metadata !12, metadata !6, metadata !7, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [bar]
-!6 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !6, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 1, i32 13, metadata !11, null}
-!11 = metadata !{metadata !"0xb\001\0011\000", metadata !12, metadata !5} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{metadata !"two.c", metadata !"/private/tmp"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.0 (trunk 137954)\001\00\000\00\000", !12, !2, !2, !3, !2, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!2}
+!2 = !{i32 0}
+!3 = !{!5}
+!5 = !{!"0x2e\00bar\00bar\00\001\000\001\000\006\000\000\000", !12, !6, !7, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [bar]
+!6 = !{!"0x29", !12} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", !12, !6, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = !MDLocation(line: 1, column: 13, scope: !11)
+!11 = !{!"0xb\001\0011\000", !12, !5} ; [ DW_TAG_lexical_block ]
+!12 = !{!"two.c", !"/private/tmp"}
+!13 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/DbgDeclare.ll b/test/Linker/DbgDeclare.ll
index 3d39b30..de5ac9e 100644
--- a/test/Linker/DbgDeclare.ll
+++ b/test/Linker/DbgDeclare.ll
@@ -4,12 +4,12 @@
 
 ; rdar://13089880
 ; CHECK: define i32 @main(i32 %argc, i8** %argv)
-; CHECK: call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !{{[0-9]+}}, metadata {{.*}})
-; CHECK: call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !{{[0-9]+}}, metadata {{.*}})
+; CHECK: call void @llvm.dbg.declare(metadata i32* %argc.addr, metadata !{{[0-9]+}}, metadata {{.*}})
+; CHECK: call void @llvm.dbg.declare(metadata i8*** %argv.addr, metadata !{{[0-9]+}}, metadata {{.*}})
 ; CHECK: define void @test(i32 %argc, i8** %argv)
-; CHECK: call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !{{[0-9]+}}, metadata {{.*}})
-; CHECK: call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !{{[0-9]+}}, metadata {{.*}})
-; CHECK: call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !{{[0-9]+}}, metadata {{.*}})
+; CHECK: call void @llvm.dbg.declare(metadata i32* %argc.addr, metadata !{{[0-9]+}}, metadata {{.*}})
+; CHECK: call void @llvm.dbg.declare(metadata i8*** %argv.addr, metadata !{{[0-9]+}}, metadata {{.*}})
+; CHECK: call void @llvm.dbg.declare(metadata i32* %i, metadata !{{[0-9]+}}, metadata {{.*}})
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
@@ -21,9 +21,9 @@ entry:
   %argv.addr = alloca i8**, align 8
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !14, metadata !{metadata !"0x102"}), !dbg !15
+  call void @llvm.dbg.declare(metadata i32* %argc.addr, metadata !14, metadata !{!"0x102"}), !dbg !15
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !16, metadata !{metadata !"0x102"}), !dbg !15
+  call void @llvm.dbg.declare(metadata i8*** %argv.addr, metadata !16, metadata !{!"0x102"}), !dbg !15
   %0 = load i32* %argc.addr, align 4, !dbg !17
   %1 = load i8*** %argv.addr, align 8, !dbg !17
   call void @test(i32 %0, i8** %1), !dbg !17
@@ -37,24 +37,24 @@ declare void @test(i32, i8**)
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 173515)\001\00\000\00\000", metadata !20, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00main\00main\00\003\000\001\000\006\00256\000\004", metadata !20, null, metadata !7, null, i32 (i32, i8**)* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !9, metadata !10}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !13} ; [ DW_TAG_const_type ]
-!13 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
-!14 = metadata !{metadata !"0x101\00argc\0016777219\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 3, i32 0, metadata !5, null}
-!16 = metadata !{metadata !"0x101\00argv\0033554435\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 5, i32 0, metadata !18, null}
-!18 = metadata !{metadata !"0xb\004\000\000", metadata !20, metadata !5} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{i32 6, i32 0, metadata !18, null}
-!20 = metadata !{metadata !"main.cpp", metadata !"/private/tmp"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.3 (trunk 173515)\001\00\000\00\000", !20, !2, !2, !3, !2, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!2}
+!2 = !{i32 0}
+!3 = !{!5}
+!5 = !{!"0x2e\00main\00main\00\003\000\001\000\006\00256\000\004", !20, null, !7, null, i32 (i32, i8**)* @main, null, null, !1} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !9, !10}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ]
+!11 = !{!"0xf\00\000\0064\0064\000\000", null, null, !12} ; [ DW_TAG_pointer_type ]
+!12 = !{!"0x26\00\000\000\000\000\000", null, null, !13} ; [ DW_TAG_const_type ]
+!13 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
+!14 = !{!"0x101\00argc\0016777219\000", !5, !6, !9} ; [ DW_TAG_arg_variable ]
+!15 = !MDLocation(line: 3, scope: !5)
+!16 = !{!"0x101\00argv\0033554435\000", !5, !6, !10} ; [ DW_TAG_arg_variable ]
+!17 = !MDLocation(line: 5, scope: !18)
+!18 = !{!"0xb\004\000\000", !20, !5} ; [ DW_TAG_lexical_block ]
+!19 = !MDLocation(line: 6, scope: !18)
+!20 = !{!"main.cpp", !"/private/tmp"}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/DbgDeclare2.ll b/test/Linker/DbgDeclare2.ll
index d27ce53..2f01b0e 100644
--- a/test/Linker/DbgDeclare2.ll
+++ b/test/Linker/DbgDeclare2.ll
@@ -11,10 +11,10 @@ entry:
   %argv.addr = alloca i8**, align 8
   %i = alloca i32, align 4
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !14, metadata !{metadata !"0x102"}), !dbg !15
+  call void @llvm.dbg.declare(metadata i32* %argc.addr, metadata !14, metadata !{!"0x102"}), !dbg !15
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !16, metadata !{metadata !"0x102"}), !dbg !15
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !17, metadata !{metadata !"0x102"}), !dbg !20
+  call void @llvm.dbg.declare(metadata i8*** %argv.addr, metadata !16, metadata !{!"0x102"}), !dbg !15
+  call void @llvm.dbg.declare(metadata i32* %i, metadata !17, metadata !{!"0x102"}), !dbg !20
   store i32 0, i32* %i, align 4, !dbg !20
   br label %for.cond, !dbg !20
 
@@ -50,30 +50,30 @@ declare i32 @puts(i8*)
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!27}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 173515)\001\00\000\00\000", metadata !25, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00print_args\00print_args\00test\004\000\001\000\006\00256\000\005", metadata !26, null, metadata !7, null, void (i32, i8**)* @test, null, null, metadata !1} ; [ DW_TAG_subprogram ]
-!6 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9, metadata !10}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !13} ; [ DW_TAG_const_type ]
-!13 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
-!14 = metadata !{metadata !"0x101\00argc\0016777220\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 4, i32 0, metadata !5, null}
-!16 = metadata !{metadata !"0x101\00argv\0033554436\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{metadata !"0x100\00i\006\000", metadata !18, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{metadata !"0xb\006\000\001", metadata !26, metadata !19} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{metadata !"0xb\005\000\000", metadata !26, metadata !5} ; [ DW_TAG_lexical_block ]
-!20 = metadata !{i32 6, i32 0, metadata !18, null}
-!21 = metadata !{i32 8, i32 0, metadata !22, null}
-!22 = metadata !{metadata !"0xb\007\000\002", metadata !26, metadata !18} ; [ DW_TAG_lexical_block ]
-!23 = metadata !{i32 9, i32 0, metadata !22, null}
-!24 = metadata !{i32 10, i32 0, metadata !19, null}
-!25 = metadata !{metadata !"main.cpp", metadata !"/private/tmp"}
-!26 = metadata !{metadata !"test.cpp", metadata !"/private/tmp"}
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.3 (trunk 173515)\001\00\000\00\000", !25, !2, !2, !3, !2, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!2}
+!2 = !{i32 0}
+!3 = !{!5}
+!5 = !{!"0x2e\00print_args\00print_args\00test\004\000\001\000\006\00256\000\005", !26, null, !7, null, void (i32, i8**)* @test, null, null, !1} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x29", !26} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9, !10}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ]
+!11 = !{!"0xf\00\000\0064\0064\000\000", null, null, !12} ; [ DW_TAG_pointer_type ]
+!12 = !{!"0x26\00\000\000\000\000\000", null, null, !13} ; [ DW_TAG_const_type ]
+!13 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
+!14 = !{!"0x101\00argc\0016777220\000", !5, !6, !9} ; [ DW_TAG_arg_variable ]
+!15 = !MDLocation(line: 4, scope: !5)
+!16 = !{!"0x101\00argv\0033554436\000", !5, !6, !10} ; [ DW_TAG_arg_variable ]
+!17 = !{!"0x100\00i\006\000", !18, !6, !9} ; [ DW_TAG_auto_variable ]
+!18 = !{!"0xb\006\000\001", !26, !19} ; [ DW_TAG_lexical_block ]
+!19 = !{!"0xb\005\000\000", !26, !5} ; [ DW_TAG_lexical_block ]
+!20 = !MDLocation(line: 6, scope: !18)
+!21 = !MDLocation(line: 8, scope: !22)
+!22 = !{!"0xb\007\000\002", !26, !18} ; [ DW_TAG_lexical_block ]
+!23 = !MDLocation(line: 9, scope: !22)
+!24 = !MDLocation(line: 10, scope: !19)
+!25 = !{!"main.cpp", !"/private/tmp"}
+!26 = !{!"test.cpp", !"/private/tmp"}
+!27 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/Inputs/alignment.ll b/test/Linker/Inputs/alignment.ll
new file mode 100644
index 0000000..337cb8e
--- /dev/null
+++ b/test/Linker/Inputs/alignment.ll
@@ -0,0 +1,12 @@
+@A = global i32 7, align 8
+@B = global i32 7, align 4
+
+define void @C() align 8 {
+  ret void
+}
+
+define void @D() align 4 {
+  ret void
+}
+
+@E = common global i32 0, align 8
diff --git a/test/Linker/Inputs/apple-version/1.ll b/test/Linker/Inputs/apple-version/1.ll
new file mode 100644
index 0000000..5cafa5e
--- /dev/null
+++ b/test/Linker/Inputs/apple-version/1.ll
@@ -0,0 +1 @@
+target triple = "x86_64-apple-macosx10.10.0"
diff --git a/test/Linker/Inputs/apple-version/2.ll b/test/Linker/Inputs/apple-version/2.ll
new file mode 100644
index 0000000..b63ea1f
--- /dev/null
+++ b/test/Linker/Inputs/apple-version/2.ll
@@ -0,0 +1 @@
+target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Linker/Inputs/apple-version/3.ll b/test/Linker/Inputs/apple-version/3.ll
new file mode 100644
index 0000000..68e58ca
--- /dev/null
+++ b/test/Linker/Inputs/apple-version/3.ll
@@ -0,0 +1 @@
+target triple = "i386-apple-macosx10.9.0"
diff --git a/test/Linker/Inputs/apple-version/4.ll b/test/Linker/Inputs/apple-version/4.ll
new file mode 100644
index 0000000..467a634
--- /dev/null
+++ b/test/Linker/Inputs/apple-version/4.ll
@@ -0,0 +1 @@
+target triple = "x86_64h-apple-macosx10.9.0"
diff --git a/test/Linker/Inputs/comdat.ll b/test/Linker/Inputs/comdat.ll
index fdcca49..74a805c 100644
--- a/test/Linker/Inputs/comdat.ll
+++ b/test/Linker/Inputs/comdat.ll
@@ -2,19 +2,19 @@ target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 target triple = "i686-pc-windows-msvc"
 
 $foo = comdat largest
-@foo = global i64 43, comdat $foo
+@foo = global i64 43, comdat($foo)
 
-define i32 @bar() comdat $foo {
+define i32 @bar() comdat($foo) {
   ret i32 43
 }
 
 $qux = comdat largest
-@qux = global i32 13, comdat $qux
-@in_unselected_group = global i32 13, comdat $qux
+@qux = global i32 13, comdat($qux)
+@in_unselected_group = global i32 13, comdat($qux)
 
-define i32 @baz() comdat $qux {
+define i32 @baz() comdat($qux) {
   ret i32 13
 }
 
 $any = comdat any
-@any = global i64 7, comdat $any
+@any = global i64 7, comdat($any)
diff --git a/test/Linker/Inputs/comdat2.ll b/test/Linker/Inputs/comdat2.ll
index 9e18304..ed2af62 100644
--- a/test/Linker/Inputs/comdat2.ll
+++ b/test/Linker/Inputs/comdat2.ll
@@ -1,2 +1,2 @@
 $foo = comdat largest
-@foo = global i64 43, comdat $foo
+@foo = global i64 43, comdat($foo)
diff --git a/test/Linker/Inputs/comdat3.ll b/test/Linker/Inputs/comdat3.ll
index 06f08b9..a1b730f 100644
--- a/test/Linker/Inputs/comdat3.ll
+++ b/test/Linker/Inputs/comdat3.ll
@@ -1,2 +1,2 @@
 $foo = comdat noduplicates
-@foo = global i64 43, comdat $foo
+@foo = global i64 43, comdat($foo)
diff --git a/test/Linker/Inputs/comdat4.ll b/test/Linker/Inputs/comdat4.ll
index bbfe3f7..5b4b812 100644
--- a/test/Linker/Inputs/comdat4.ll
+++ b/test/Linker/Inputs/comdat4.ll
@@ -2,4 +2,4 @@ target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 target triple = "i686-pc-windows-msvc"
 
 $foo = comdat samesize
-@foo = global i64 42, comdat $foo
+@foo = global i64 42, comdat($foo)
diff --git a/test/Linker/Inputs/comdat5.ll b/test/Linker/Inputs/comdat5.ll
index 800af18..98c42b7 100644
--- a/test/Linker/Inputs/comdat5.ll
+++ b/test/Linker/Inputs/comdat5.ll
@@ -1,15 +1,9 @@
 target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
-target triple = "i686-pc-windows-msvc"
 
-%MSRTTICompleteObjectLocator = type { i32, i32, i32, i8*, %MSRTTIClassHierarchyDescriptor* }
-%MSRTTIClassHierarchyDescriptor = type { i32, i32, i32, %MSRTTIBaseClassDescriptor** }
-%MSRTTIBaseClassDescriptor = type { i8*, i32, i32, i32, i32, i32, %MSRTTIClassHierarchyDescriptor* }
-%struct.S = type { i32 (...)** }
+$foo = comdat largest
 
-$"\01??_7S@@6B@" = comdat largest
+@zed = external constant i8
+@some_name = private unnamed_addr constant [2 x i8*] [i8* @zed, i8* bitcast (void ()* @bar to i8*)], comdat($foo)
+@foo = alias getelementptr([2 x i8*]* @some_name, i32 0, i32 1)
 
-@"\01??_R4S@@6B@" = external constant %MSRTTICompleteObjectLocator
-@some_name = private unnamed_addr constant [2 x i8*] [i8* bitcast (%MSRTTICompleteObjectLocator* @"\01??_R4S@@6B@" to i8*), i8* bitcast (void (%struct.S*, i32)* @"\01??_GS@@UAEPAXI@Z" to i8*)], comdat $"\01??_7S@@6B@"
-@"\01??_7S@@6B@" = alias getelementptr([2 x i8*]* @some_name, i32 0, i32 1)
-
-declare x86_thiscallcc void @"\01??_GS@@UAEPAXI@Z"(%struct.S*, i32) unnamed_addr
+declare void @bar() unnamed_addr
diff --git a/test/Linker/Inputs/comdat8.ll b/test/Linker/Inputs/comdat8.ll
index eaa9625..a2833b0 100644
--- a/test/Linker/Inputs/comdat8.ll
+++ b/test/Linker/Inputs/comdat8.ll
@@ -1,4 +1,4 @@
 $c1 = comdat largest
 
-@some_name = private unnamed_addr constant i32 42, comdat $c1
+@some_name = private unnamed_addr constant i32 42, comdat($c1)
 @c1 = alias i32* @some_name
diff --git a/test/Linker/Inputs/comdat9.ll b/test/Linker/Inputs/comdat9.ll
deleted file mode 100644
index 679dbde..0000000
--- a/test/Linker/Inputs/comdat9.ll
+++ /dev/null
@@ -1,5 +0,0 @@
-$c = comdat any
-@a = alias void ()* @f
-define internal void @f() comdat $c {
-  ret void
-}
diff --git a/test/Linker/Inputs/distinct.ll b/test/Linker/Inputs/distinct.ll
new file mode 100644
index 0000000..07ae224
--- /dev/null
+++ b/test/Linker/Inputs/distinct.ll
@@ -0,0 +1,13 @@
+@global = linkonce global i32 0
+
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
+
+!0 = !{}
+!1 = !{!0}
+!2 = !{i32* @global}
+!3 = distinct !{}
+!4 = distinct !{!0}
+!5 = distinct !{i32* @global}
+!6 = !{!3}
+!7 = !{!4}
+!8 = !{!5}
diff --git a/test/Linker/Inputs/ident.a.ll b/test/Linker/Inputs/ident.a.ll
index ebda940..3c3fb19 100644
--- a/test/Linker/Inputs/ident.a.ll
+++ b/test/Linker/Inputs/ident.a.ll
@@ -1,3 +1,3 @@
 !llvm.ident = !{!0, !1}
-!0 = metadata !{metadata !"Compiler V1"}
-!1 = metadata !{metadata !"Compiler V2"}
+!0 = !{!"Compiler V1"}
+!1 = !{!"Compiler V2"}
diff --git a/test/Linker/Inputs/ident.b.ll b/test/Linker/Inputs/ident.b.ll
index 21ee1d8..d9daf85 100644
--- a/test/Linker/Inputs/ident.b.ll
+++ b/test/Linker/Inputs/ident.b.ll
@@ -1,2 +1,2 @@
 !llvm.ident = !{!0}
-!0 = metadata !{metadata !"Compiler V3"}
+!0 = !{!"Compiler V3"}
diff --git a/test/Linker/Inputs/mdlocation.ll b/test/Linker/Inputs/mdlocation.ll
new file mode 100644
index 0000000..f85c1dc
--- /dev/null
+++ b/test/Linker/Inputs/mdlocation.ll
@@ -0,0 +1,13 @@
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9}
+
+!0 = !{} ; Use this as a scope.
+!1 = !MDLocation(line: 3, column: 7, scope: !0)
+!2 = !MDLocation(line: 3, column: 7, scope: !0, inlinedAt: !1)
+!3 = !MDLocation(line: 3, column: 7, scope: !0, inlinedAt: !2)
+!4 = distinct !{} ; Test actual remapping.
+!5 = !MDLocation(line: 3, column: 7, scope: !4)
+!6 = !MDLocation(line: 3, column: 7, scope: !4, inlinedAt: !5)
+!7 = !MDLocation(line: 3, column: 7, scope: !4, inlinedAt: !6)
+; Test distinct nodes.
+!8 = distinct !MDLocation(line: 3, column: 7, scope: !0)
+!9 = distinct !MDLocation(line: 3, column: 7, scope: !0, inlinedAt: !8)
diff --git a/test/Linker/Inputs/module-flags-dont-change-others.ll b/test/Linker/Inputs/module-flags-dont-change-others.ll
new file mode 100644
index 0000000..61d57e5
--- /dev/null
+++ b/test/Linker/Inputs/module-flags-dont-change-others.ll
@@ -0,0 +1,8 @@
+!llvm.module.flags = !{!3, !4, !5}
+
+!0 = !{}
+!1 = !{!0}
+!2 = !{!0, !1}
+!3 = !{i32 4, !"foo", i32 37} ; Override the "foo" value.
+!4 = !{i32 5, !"bar", !1}
+!5 = !{i32 6, !"baz", !2}
diff --git a/test/Linker/Inputs/module-flags-pic-2-b.ll b/test/Linker/Inputs/module-flags-pic-2-b.ll
index 228e04a..0d78caf 100644
--- a/test/Linker/Inputs/module-flags-pic-2-b.ll
+++ b/test/Linker/Inputs/module-flags-pic-2-b.ll
@@ -1,3 +1,3 @@
-!0 = metadata !{ i32 1, metadata !"PIC Level", i32 2 }
+!0 = !{ i32 1, !"PIC Level", i32 2 }
 
 !llvm.module.flags = !{!0}
diff --git a/test/Linker/Inputs/opaque.ll b/test/Linker/Inputs/opaque.ll
new file mode 100644
index 0000000..2b0d7d3
--- /dev/null
+++ b/test/Linker/Inputs/opaque.ll
@@ -0,0 +1,13 @@
+%A = type { }
+%B = type { %D, %E, %B* }
+
+%D = type { %E }
+%E = type opaque
+
+@g2 = external global %A
+@g3 = external global %B
+
+define void @f1()  {
+  getelementptr %A* null, i32 0
+  ret void
+}
diff --git a/test/Linker/Inputs/pr21374.ll b/test/Linker/Inputs/pr21374.ll
new file mode 100644
index 0000000..fcddeaf
--- /dev/null
+++ b/test/Linker/Inputs/pr21374.ll
@@ -0,0 +1,4 @@
+%foo = type { i8* }
+define void @g(%foo* %x) {
+  ret void
+}
diff --git a/test/Linker/Inputs/replaced-function-matches-first-subprogram.ll b/test/Linker/Inputs/replaced-function-matches-first-subprogram.ll
new file mode 100644
index 0000000..a5de89f
--- /dev/null
+++ b/test/Linker/Inputs/replaced-function-matches-first-subprogram.ll
@@ -0,0 +1,27 @@
+%struct.Class = type { i8 }
+
+define weak_odr i32 @_ZN5ClassIiE3fooEv(%struct.Class* %this) align 2 {
+entry:
+  %this.addr = alloca %struct.Class*, align 8
+  store %struct.Class* %this, %struct.Class** %this.addr, align 8
+  %this1 = load %struct.Class** %this.addr
+  ret i32 0, !dbg !12
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9, !10}
+!llvm.ident = !{!11}
+
+!0 = !{!"0x11\004\00clang version 3.6.0 (trunk 224193) (llvm/trunk 224197)\000\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/Users/dexonsmith/data/llvm/staging/test/Linker/repro/d2/t2.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"t2.cpp", !"/Users/dexonsmith/data/llvm/staging/test/Linker/repro/d2"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\002\000\001\000\000\00256\000\002", !5, !6, !7, null, i32 (%struct.Class*)* @_ZN5ClassIiE3fooEv, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
+!5 = !{!"../t.h", !"/Users/dexonsmith/data/llvm/staging/test/Linker/repro/d2"}
+!6 = !{!"0x29", !5}    ; [ DW_TAG_file_type ] [/Users/dexonsmith/data/llvm/staging/test/Linker/repro/d2/../t.h]
+!7 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{i32 2, !"Dwarf Version", i32 2}
+!9 = !{i32 2, !"Debug Info Version", i32 2}
+!10 = !{i32 1, !"PIC Level", i32 2}
+!11 = !{!"clang version 3.6.0 (trunk 224193) (llvm/trunk 224197)"}
+!12 = !MDLocation(line: 2, column: 15, scope: !4)
diff --git a/test/Linker/Inputs/targettriple-a.ll b/test/Linker/Inputs/targettriple-a.ll
index 296d2df..b8c7901 100644
--- a/test/Linker/Inputs/targettriple-a.ll
+++ b/test/Linker/Inputs/targettriple-a.ll
@@ -1 +1 @@
-target triple = "e"
+target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Linker/Inputs/targettriple-b.ll b/test/Linker/Inputs/targettriple-b.ll
index cca872e..0122b4a 100644
--- a/test/Linker/Inputs/targettriple-b.ll
+++ b/test/Linker/Inputs/targettriple-b.ll
@@ -1 +1 @@
-target triple = "E"
+target triple = "i386-unknown-linux-gnu"
diff --git a/test/Linker/Inputs/targettriple-c.ll b/test/Linker/Inputs/targettriple-c.ll
new file mode 100644
index 0000000..e669a4c
--- /dev/null
+++ b/test/Linker/Inputs/targettriple-c.ll
@@ -0,0 +1 @@
+target triple = "x86_64h-unknown-linux-gnu"
diff --git a/test/Linker/Inputs/testlink.ll b/test/Linker/Inputs/testlink.ll
new file mode 100644
index 0000000..89f02ba
--- /dev/null
+++ b/test/Linker/Inputs/testlink.ll
@@ -0,0 +1,56 @@
+%intlist = type { %intlist*, i32 }
+
+
+%Ty1 = type { %Ty2* }
+%Ty2 = type opaque
+
+%VecSize = type { <10 x i32> }
+
+@GVTy1 = global %Ty1* null
+@GVTy2 = external global %Ty2*
+
+
+@MyVar = global i32 4
+@MyIntList = external global %intlist
+@AConst = constant i32 1234
+
+;; Intern in both testlink[12].ll
+@Intern1 = internal constant i32 52
+
+@Use2Intern1 = global i32* @Intern1
+
+;; Intern in one but not in other
+@Intern2 = constant i32 12345
+
+@MyIntListPtr = constant { %intlist* } { %intlist* @MyIntList }
+@MyVarPtr = linkonce global { i32* } { i32* @MyVar }
+@0 = constant i32 412
+
+; Provides definition of Struct1 and of S1GV.
+%Struct1 = type { i32 }
+@S1GV = global %Struct1* null
+
+define i32 @foo(i32 %blah) {
+  store i32 %blah, i32* @MyVar
+  %idx = getelementptr %intlist* @MyIntList, i64 0, i32 1
+  store i32 12, i32* %idx
+  %ack = load i32* @0
+  %fzo = add i32 %ack, %blah
+  ret i32 %fzo
+}
+
+declare void @unimp(float, double)
+
+define internal void @testintern() {
+  ret void
+}
+
+define void @Testintern() {
+  ret void
+}
+
+define internal void @testIntern() {
+  ret void
+}
+
+declare void @VecSizeCrash1(%VecSize)
diff --git a/test/Linker/Inputs/type-unique-alias.ll b/test/Linker/Inputs/type-unique-alias.ll
new file mode 100644
index 0000000..3ee162c
--- /dev/null
+++ b/test/Linker/Inputs/type-unique-alias.ll
@@ -0,0 +1,4 @@
+%u = type { i8 }
+
+@g2 = global %u zeroinitializer
+@a = weak alias %u* @g2
diff --git a/test/Linker/Inputs/type-unique-dst-types2.ll b/test/Linker/Inputs/type-unique-dst-types2.ll
new file mode 100644
index 0000000..b565c6d
--- /dev/null
+++ b/test/Linker/Inputs/type-unique-dst-types2.ll
@@ -0,0 +1,3 @@
+%A.11 = type { %B }
+%B = type { i8 }
+@g1 = external global %A.11
diff --git a/test/Linker/Inputs/type-unique-dst-types3.ll b/test/Linker/Inputs/type-unique-dst-types3.ll
new file mode 100644
index 0000000..c5794ad
--- /dev/null
+++ b/test/Linker/Inputs/type-unique-dst-types3.ll
@@ -0,0 +1,2 @@
+%A.11 = type opaque
+@g2 = external global %A.11
diff --git a/test/Linker/Inputs/type-unique-inheritance-a.ll b/test/Linker/Inputs/type-unique-inheritance-a.ll
index 31df5b2..c503919 100644
--- a/test/Linker/Inputs/type-unique-inheritance-a.ll
+++ b/test/Linker/Inputs/type-unique-inheritance-a.ll
@@ -52,8 +52,8 @@ entry:
   %a.addr = alloca i32, align 4
   %t = alloca %class.A, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !20, metadata !{metadata !"0x102"}), !dbg !21
-  call void @llvm.dbg.declare(metadata !{%class.A* %t}, metadata !22, metadata !{metadata !"0x102"}), !dbg !23
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !20, metadata !{!"0x102"}), !dbg !21
+  call void @llvm.dbg.declare(metadata %class.A* %t, metadata !22, metadata !{!"0x102"}), !dbg !23
   ret void, !dbg !24
 }
 
@@ -66,29 +66,29 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!19, !25}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git f54e02f969d02d640103db73efc30c45439fceab) (http://llvm.org/git/llvm.git 284353b55896cb1babfaa7add7c0a363245342d2)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/inher/foo.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"foo.cpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{metadata !"0x2\00A\003\0064\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 3, size 64, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
-!6 = metadata !{metadata !7, metadata !13}
-!7 = metadata !{metadata !"0x1c\00\000\000\000\000\001", null, metadata !"_ZTS1A", metadata !8} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [private] [from Base]
-!8 = metadata !{metadata !"0x2\00Base\003\0032\0032\000\000\000", metadata !9, null, null, metadata !10, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_class_type ] [Base] [line 3, size 32, align 32, offset 0] [def] [from ]
-!9 = metadata !{metadata !"./b.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !"0xd\00b\004\0032\0032\000\001", metadata !9, metadata !"_ZTS4Base", metadata !12} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 0] [private] [from int]
-!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{metadata !"0xd\00x\004\0032\0032\0032\001", metadata !5, metadata !"_ZTS1A", metadata !12} ; [ DW_TAG_member ] [x] [line 4, size 32, align 32, offset 32] [private] [from int]
-!14 = metadata !{metadata !15}
-!15 = metadata !{metadata !"0x2e\00f\00f\00_Z1fi\005\000\001\000\006\00256\000\005", metadata !1, metadata !16, metadata !17, null, void (i32)* @_Z1fi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [f]
-!16 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/inher/foo.cpp]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{null, metadata !12}
-!19 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!20 = metadata !{metadata !"0x101\00a\0016777221\000", metadata !15, metadata !16, metadata !12} ; [ DW_TAG_arg_variable ] [a] [line 5]
-!21 = metadata !{i32 5, i32 0, metadata !15, null}
-!22 = metadata !{metadata !"0x100\00t\006\000", metadata !15, metadata !16, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 6]
-!23 = metadata !{i32 6, i32 0, metadata !15, null}
-!24 = metadata !{i32 7, i32 0, metadata !15, null}
-!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git f54e02f969d02d640103db73efc30c45439fceab) (http://llvm.org/git/llvm.git 284353b55896cb1babfaa7add7c0a363245342d2)\000\00\000\00\000", !1, !2, !3, !14, !2, !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/inher/foo.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"foo.cpp", !"/Users/mren/c_testing/type_unique_air/inher"}
+!2 = !{i32 0}
+!3 = !{!4, !8}
+!4 = !{!"0x2\00A\003\0064\0032\000\000\000", !5, null, null, !6, null, null, !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 3, size 64, align 32, offset 0] [def] [from ]
+!5 = !{!"./a.hpp", !"/Users/mren/c_testing/type_unique_air/inher"}
+!6 = !{!7, !13}
+!7 = !{!"0x1c\00\000\000\000\000\001", null, !"_ZTS1A", !8} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [private] [from Base]
+!8 = !{!"0x2\00Base\003\0032\0032\000\000\000", !9, null, null, !10, null, null, !"_ZTS4Base"} ; [ DW_TAG_class_type ] [Base] [line 3, size 32, align 32, offset 0] [def] [from ]
+!9 = !{!"./b.hpp", !"/Users/mren/c_testing/type_unique_air/inher"}
+!10 = !{!11}
+!11 = !{!"0xd\00b\004\0032\0032\000\001", !9, !"_ZTS4Base", !12} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 0] [private] [from int]
+!12 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = !{!"0xd\00x\004\0032\0032\0032\001", !5, !"_ZTS1A", !12} ; [ DW_TAG_member ] [x] [line 4, size 32, align 32, offset 32] [private] [from int]
+!14 = !{!15}
+!15 = !{!"0x2e\00f\00f\00_Z1fi\005\000\001\000\006\00256\000\005", !1, !16, !17, null, void (i32)* @_Z1fi, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [f]
+!16 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/inher/foo.cpp]
+!17 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{null, !12}
+!19 = !{i32 2, !"Dwarf Version", i32 2}
+!20 = !{!"0x101\00a\0016777221\000", !15, !16, !12} ; [ DW_TAG_arg_variable ] [a] [line 5]
+!21 = !MDLocation(line: 5, scope: !15)
+!22 = !{!"0x100\00t\006\000", !15, !16, !4} ; [ DW_TAG_auto_variable ] [t] [line 6]
+!23 = !MDLocation(line: 6, scope: !15)
+!24 = !MDLocation(line: 7, scope: !15)
+!25 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/Inputs/type-unique-inheritance-b.ll b/test/Linker/Inputs/type-unique-inheritance-b.ll
index d915e45..d3b9dea 100644
--- a/test/Linker/Inputs/type-unique-inheritance-b.ll
+++ b/test/Linker/Inputs/type-unique-inheritance-b.ll
@@ -10,8 +10,8 @@ entry:
   %a.addr = alloca i32, align 4
   %t = alloca %class.B, align 8
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !28, metadata !{metadata !"0x102"}), !dbg !29
-  call void @llvm.dbg.declare(metadata !{%class.B* %t}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !28, metadata !{!"0x102"}), !dbg !29
+  call void @llvm.dbg.declare(metadata %class.B* %t, metadata !30, metadata !{!"0x102"}), !dbg !31
   ret void, !dbg !32
 }
 
@@ -24,7 +24,7 @@ entry:
   %retval = alloca i32, align 4
   %a = alloca %class.A, align 4
   store i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !33, metadata !{metadata !"0x102"}), !dbg !34
+  call void @llvm.dbg.declare(metadata %class.A* %a, metadata !33, metadata !{!"0x102"}), !dbg !34
   call void @_Z1fi(i32 0), !dbg !35
   call void @_Z1gi(i32 1), !dbg !36
   ret i32 0, !dbg !37
@@ -40,42 +40,42 @@ attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!27, !38}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git f54e02f969d02d640103db73efc30c45439fceab) (http://llvm.org/git/llvm.git 284353b55896cb1babfaa7add7c0a363245342d2)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !19, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/inher/bar.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"bar.cpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4, metadata !11, metadata !15}
-!4 = metadata !{metadata !"0x2\00B\007\00128\0064\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 7, size 128, align 64, offset 0] [def] [from ]
-!5 = metadata !{metadata !"./b.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
-!6 = metadata !{metadata !7, metadata !9}
-!7 = metadata !{metadata !"0xd\00bb\008\0032\0032\000\001", metadata !5, metadata !"_ZTS1B", metadata !8} ; [ DW_TAG_member ] [bb] [line 8, size 32, align 32, offset 0] [private] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0xd\00a\009\0064\0064\0064\001", metadata !5, metadata !"_ZTS1B", metadata !10} ; [ DW_TAG_member ] [a] [line 9, size 64, align 64, offset 64] [private] [from ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!11 = metadata !{metadata !"0x2\00A\003\0064\0032\000\000\000", metadata !12, null, null, metadata !13, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 3, size 64, align 32, offset 0] [def] [from ]
-!12 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
-!13 = metadata !{metadata !14, metadata !18}
-!14 = metadata !{metadata !"0x1c\00\000\000\000\000\001", null, metadata !"_ZTS1A", metadata !15} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [private] [from Base]
-!15 = metadata !{metadata !"0x2\00Base\003\0032\0032\000\000\000", metadata !5, null, null, metadata !16, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_class_type ] [Base] [line 3, size 32, align 32, offset 0] [def] [from ]
-!16 = metadata !{metadata !17}
-!17 = metadata !{metadata !"0xd\00b\004\0032\0032\000\001", metadata !5, metadata !"_ZTS4Base", metadata !8} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 0] [private] [from int]
-!18 = metadata !{metadata !"0xd\00x\004\0032\0032\0032\001", metadata !12, metadata !"_ZTS1A", metadata !8} ; [ DW_TAG_member ] [x] [line 4, size 32, align 32, offset 32] [private] [from int]
-!19 = metadata !{metadata !20, metadata !24}
-!20 = metadata !{metadata !"0x2e\00g\00g\00_Z1gi\004\000\001\000\006\00256\000\004", metadata !1, metadata !21, metadata !22, null, void (i32)* @_Z1gi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
-!21 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/inher/bar.cpp]
-!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!23 = metadata !{null, metadata !8}
-!24 = metadata !{metadata !"0x2e\00main\00main\00\009\000\001\000\006\00256\000\009", metadata !1, metadata !21, metadata !25, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 9] [def] [main]
-!25 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !26, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!26 = metadata !{metadata !8}
-!27 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!28 = metadata !{metadata !"0x101\00a\0016777220\000", metadata !20, metadata !21, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
-!29 = metadata !{i32 4, i32 0, metadata !20, null}
-!30 = metadata !{metadata !"0x100\00t\005\000", metadata !20, metadata !21, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 5]
-!31 = metadata !{i32 5, i32 0, metadata !20, null}
-!32 = metadata !{i32 6, i32 0, metadata !20, null}
-!33 = metadata !{metadata !"0x100\00a\0010\000", metadata !24, metadata !21, metadata !11} ; [ DW_TAG_auto_variable ] [a] [line 10]
-!34 = metadata !{i32 10, i32 0, metadata !24, null}
-!35 = metadata !{i32 11, i32 0, metadata !24, null}
-!36 = metadata !{i32 12, i32 0, metadata !24, null}
-!37 = metadata !{i32 13, i32 0, metadata !24, null}
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git f54e02f969d02d640103db73efc30c45439fceab) (http://llvm.org/git/llvm.git 284353b55896cb1babfaa7add7c0a363245342d2)\000\00\000\00\000", !1, !2, !3, !19, !2, !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/inher/bar.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"bar.cpp", !"/Users/mren/c_testing/type_unique_air/inher"}
+!2 = !{i32 0}
+!3 = !{!4, !11, !15}
+!4 = !{!"0x2\00B\007\00128\0064\000\000\000", !5, null, null, !6, null, null, !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 7, size 128, align 64, offset 0] [def] [from ]
+!5 = !{!"./b.hpp", !"/Users/mren/c_testing/type_unique_air/inher"}
+!6 = !{!7, !9}
+!7 = !{!"0xd\00bb\008\0032\0032\000\001", !5, !"_ZTS1B", !8} ; [ DW_TAG_member ] [bb] [line 8, size 32, align 32, offset 0] [private] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0xd\00a\009\0064\0064\0064\001", !5, !"_ZTS1B", !10} ; [ DW_TAG_member ] [a] [line 9, size 64, align 64, offset 64] [private] [from ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!11 = !{!"0x2\00A\003\0064\0032\000\000\000", !12, null, null, !13, null, null, !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 3, size 64, align 32, offset 0] [def] [from ]
+!12 = !{!"./a.hpp", !"/Users/mren/c_testing/type_unique_air/inher"}
+!13 = !{!14, !18}
+!14 = !{!"0x1c\00\000\000\000\000\001", null, !"_ZTS1A", !15} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [private] [from Base]
+!15 = !{!"0x2\00Base\003\0032\0032\000\000\000", !5, null, null, !16, null, null, !"_ZTS4Base"} ; [ DW_TAG_class_type ] [Base] [line 3, size 32, align 32, offset 0] [def] [from ]
+!16 = !{!17}
+!17 = !{!"0xd\00b\004\0032\0032\000\001", !5, !"_ZTS4Base", !8} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 0] [private] [from int]
+!18 = !{!"0xd\00x\004\0032\0032\0032\001", !12, !"_ZTS1A", !8} ; [ DW_TAG_member ] [x] [line 4, size 32, align 32, offset 32] [private] [from int]
+!19 = !{!20, !24}
+!20 = !{!"0x2e\00g\00g\00_Z1gi\004\000\001\000\006\00256\000\004", !1, !21, !22, null, void (i32)* @_Z1gi, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
+!21 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/inher/bar.cpp]
+!22 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = !{null, !8}
+!24 = !{!"0x2e\00main\00main\00\009\000\001\000\006\00256\000\009", !1, !21, !25, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 9] [def] [main]
+!25 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !26, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!26 = !{!8}
+!27 = !{i32 2, !"Dwarf Version", i32 2}
+!28 = !{!"0x101\00a\0016777220\000", !20, !21, !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!29 = !MDLocation(line: 4, scope: !20)
+!30 = !{!"0x100\00t\005\000", !20, !21, !4} ; [ DW_TAG_auto_variable ] [t] [line 5]
+!31 = !MDLocation(line: 5, scope: !20)
+!32 = !MDLocation(line: 6, scope: !20)
+!33 = !{!"0x100\00a\0010\000", !24, !21, !11} ; [ DW_TAG_auto_variable ] [a] [line 10]
+!34 = !MDLocation(line: 10, scope: !24)
+!35 = !MDLocation(line: 11, scope: !24)
+!36 = !MDLocation(line: 12, scope: !24)
+!37 = !MDLocation(line: 13, scope: !24)
+!38 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/Inputs/type-unique-name.ll b/test/Linker/Inputs/type-unique-name.ll
new file mode 100644
index 0000000..2553246
--- /dev/null
+++ b/test/Linker/Inputs/type-unique-name.ll
@@ -0,0 +1,5 @@
+%t = type { i8 }
+
+define %t* @f() {
+  ret %t* null
+}
diff --git a/test/Linker/Inputs/type-unique-opaque.ll b/test/Linker/Inputs/type-unique-opaque.ll
new file mode 100644
index 0000000..872b601
--- /dev/null
+++ b/test/Linker/Inputs/type-unique-opaque.ll
@@ -0,0 +1,6 @@
+%t = type { i8 }
+%t2 = type { %t*, i16 }
+
+define %t2* @f() {
+  ret %t2* null
+}
diff --git a/test/Linker/Inputs/type-unique-simple2-a.ll b/test/Linker/Inputs/type-unique-simple2-a.ll
index 5ed5c2a..6d6e93c 100644
--- a/test/Linker/Inputs/type-unique-simple2-a.ll
+++ b/test/Linker/Inputs/type-unique-simple2-a.ll
@@ -49,8 +49,8 @@ entry:
   %a.addr = alloca i32, align 4
   %t = alloca %struct.Base, align 8
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !17, metadata !{metadata !"0x102"}), !dbg !18
-  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !17, metadata !{!"0x102"}), !dbg !18
+  call void @llvm.dbg.declare(metadata %struct.Base* %t, metadata !19, metadata !{!"0x102"}), !dbg !20
   ret void, !dbg !21
 }
 
@@ -63,26 +63,26 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!16, !22}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git 8a3f9e46cb988d2c664395b21910091e3730ae82) (http://llvm.org/git/llvm.git 4699e9549358bc77824a59114548eecc3f7c523c)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [foo.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"foo.cpp", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00Base\001\00128\0064\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 128, align 64, offset 0] [def] [from ]
-!5 = metadata !{metadata !"./a.hpp", metadata !"."}
-!6 = metadata !{metadata !7, metadata !9}
-!7 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !5, metadata !"_ZTS4Base", metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0xd\00b\003\0064\0064\0064\000", metadata !5, metadata !"_ZTS4Base", metadata !10} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0x2e\00f\00f\00_Z1fi\003\000\001\000\006\00256\000\003", metadata !1, metadata !13, metadata !14, null, void (i32)* @_Z1fi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!13 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [foo.cpp]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{null, metadata !8}
-!16 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!17 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !12, metadata !13, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 3]
-!18 = metadata !{i32 3, i32 0, metadata !12, null}
-!19 = metadata !{metadata !"0x100\00t\004\000", metadata !12, metadata !13, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 4]
-!20 = metadata !{i32 4, i32 0, metadata !12, null}
-!21 = metadata !{i32 5, i32 0, metadata !12, null}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git 8a3f9e46cb988d2c664395b21910091e3730ae82) (http://llvm.org/git/llvm.git 4699e9549358bc77824a59114548eecc3f7c523c)\000\00\000\00\000", !1, !2, !3, !11, !2, !2} ; [ DW_TAG_compile_unit ] [foo.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"foo.cpp", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00Base\001\00128\0064\000\000\000", !5, null, null, !6, null, null, !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 128, align 64, offset 0] [def] [from ]
+!5 = !{!"./a.hpp", !"."}
+!6 = !{!7, !9}
+!7 = !{!"0xd\00a\002\0032\0032\000\000", !5, !"_ZTS4Base", !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0xd\00b\003\0064\0064\0064\000", !5, !"_ZTS4Base", !10} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS4Base"} ; [ DW_TAG_pointer_type ]
+!11 = !{!12}
+!12 = !{!"0x2e\00f\00f\00_Z1fi\003\000\001\000\006\00256\000\003", !1, !13, !14, null, void (i32)* @_Z1fi, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!13 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [foo.cpp]
+!14 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{null, !8}
+!16 = !{i32 2, !"Dwarf Version", i32 2}
+!17 = !{!"0x101\00a\0016777219\000", !12, !13, !8} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!18 = !MDLocation(line: 3, scope: !12)
+!19 = !{!"0x100\00t\004\000", !12, !13, !4} ; [ DW_TAG_auto_variable ] [t] [line 4]
+!20 = !MDLocation(line: 4, scope: !12)
+!21 = !MDLocation(line: 5, scope: !12)
+!22 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/Inputs/type-unique-simple2-b.ll b/test/Linker/Inputs/type-unique-simple2-b.ll
index 241218d..d3b2f7e 100644
--- a/test/Linker/Inputs/type-unique-simple2-b.ll
+++ b/test/Linker/Inputs/type-unique-simple2-b.ll
@@ -8,8 +8,8 @@ entry:
   %a.addr = alloca i32, align 4
   %t = alloca %struct.Base, align 8
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !20, metadata !{metadata !"0x102"}), !dbg !21
-  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !22, metadata !{metadata !"0x102"}), !dbg !23
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !20, metadata !{!"0x102"}), !dbg !21
+  call void @llvm.dbg.declare(metadata %struct.Base* %t, metadata !22, metadata !{!"0x102"}), !dbg !23
   ret void, !dbg !24
 }
 
@@ -36,32 +36,32 @@ attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!19, !28}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git 8a3f9e46cb988d2c664395b21910091e3730ae82) (http://llvm.org/git/llvm.git 4699e9549358bc77824a59114548eecc3f7c523c)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [bar.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"bar.cpp", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00Base\001\00128\0064\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 128, align 64, offset 0] [def] [from ]
-!5 = metadata !{metadata !"./a.hpp", metadata !"."}
-!6 = metadata !{metadata !7, metadata !9}
-!7 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !5, metadata !"_ZTS4Base", metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0xd\00b\003\0064\0064\0064\000", metadata !5, metadata !"_ZTS4Base", metadata !10} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{metadata !12, metadata !16}
-!12 = metadata !{metadata !"0x2e\00g\00g\00_Z1gi\004\000\001\000\006\00256\000\004", metadata !1, metadata !13, metadata !14, null, void (i32)* @_Z1gi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
-!13 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [bar.cpp]
-!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{null, metadata !8}
-!16 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !1, metadata !13, metadata !17, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{metadata !8}
-!19 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!20 = metadata !{metadata !"0x101\00a\0016777220\000", metadata !12, metadata !13, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
-!21 = metadata !{i32 4, i32 0, metadata !12, null}
-!22 = metadata !{metadata !"0x100\00t\005\000", metadata !12, metadata !13, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 5]
-!23 = metadata !{i32 5, i32 0, metadata !12, null}
-!24 = metadata !{i32 6, i32 0, metadata !12, null}
-!25 = metadata !{i32 8, i32 0, metadata !16, null}
-!26 = metadata !{i32 9, i32 0, metadata !16, null}
-!27 = metadata !{i32 10, i32 0, metadata !16, null}
-!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git 8a3f9e46cb988d2c664395b21910091e3730ae82) (http://llvm.org/git/llvm.git 4699e9549358bc77824a59114548eecc3f7c523c)\000\00\000\00\000", !1, !2, !3, !11, !2, !2} ; [ DW_TAG_compile_unit ] [bar.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"bar.cpp", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00Base\001\00128\0064\000\000\000", !5, null, null, !6, null, null, !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 128, align 64, offset 0] [def] [from ]
+!5 = !{!"./a.hpp", !"."}
+!6 = !{!7, !9}
+!7 = !{!"0xd\00a\002\0032\0032\000\000", !5, !"_ZTS4Base", !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0xd\00b\003\0064\0064\0064\000", !5, !"_ZTS4Base", !10} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS4Base"} ; [ DW_TAG_pointer_type ]
+!11 = !{!12, !16}
+!12 = !{!"0x2e\00g\00g\00_Z1gi\004\000\001\000\006\00256\000\004", !1, !13, !14, null, void (i32)* @_Z1gi, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
+!13 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [bar.cpp]
+!14 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = !{null, !8}
+!16 = !{!"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", !1, !13, !17, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!17 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{!8}
+!19 = !{i32 2, !"Dwarf Version", i32 2}
+!20 = !{!"0x101\00a\0016777220\000", !12, !13, !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!21 = !MDLocation(line: 4, scope: !12)
+!22 = !{!"0x100\00t\005\000", !12, !13, !4} ; [ DW_TAG_auto_variable ] [t] [line 5]
+!23 = !MDLocation(line: 5, scope: !12)
+!24 = !MDLocation(line: 6, scope: !12)
+!25 = !MDLocation(line: 8, scope: !16)
+!26 = !MDLocation(line: 9, scope: !16)
+!27 = !MDLocation(line: 10, scope: !16)
+!28 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/Inputs/type-unique-unrelated2.ll b/test/Linker/Inputs/type-unique-unrelated2.ll
new file mode 100644
index 0000000..b7c2cec
--- /dev/null
+++ b/test/Linker/Inputs/type-unique-unrelated2.ll
@@ -0,0 +1,7 @@
+%t = type { i8* }
+declare %t @g()
+
+define %t @g2() {
+ %x = call %t @g()
+ ret %t %x
+}
diff --git a/test/Linker/Inputs/type-unique-unrelated3.ll b/test/Linker/Inputs/type-unique-unrelated3.ll
new file mode 100644
index 0000000..e3f2dd9
--- /dev/null
+++ b/test/Linker/Inputs/type-unique-unrelated3.ll
@@ -0,0 +1,7 @@
+%t = type { i8* }
+declare %t @f()
+
+define %t @g() {
+ %x = call %t @f()
+ ret %t %x
+}
diff --git a/test/Linker/Inputs/unique-fwd-decl-b.ll b/test/Linker/Inputs/unique-fwd-decl-b.ll
index 240fbee..24099b8 100644
--- a/test/Linker/Inputs/unique-fwd-decl-b.ll
+++ b/test/Linker/Inputs/unique-fwd-decl-b.ll
@@ -1,3 +1,3 @@
 !b = !{!0}
-!0 = metadata !{metadata !1}
-!1 = metadata !{}
+!0 = !{!1}
+!1 = !{}
diff --git a/test/Linker/Inputs/unique-fwd-decl-order.ll b/test/Linker/Inputs/unique-fwd-decl-order.ll
new file mode 100644
index 0000000..e87ac84
--- /dev/null
+++ b/test/Linker/Inputs/unique-fwd-decl-order.ll
@@ -0,0 +1,6 @@
+!named = !{!0}
+
+; These nodes are intentionally in the opposite order from the test-driver.
+; However, they are numbered the same for the reader's convenience.
+!1 = !{}
+!0 = !{!1}
diff --git a/test/Linker/Inputs/visibility.ll b/test/Linker/Inputs/visibility.ll
index 2ab58fd..2cd112e 100644
--- a/test/Linker/Inputs/visibility.ll
+++ b/test/Linker/Inputs/visibility.ll
@@ -4,7 +4,7 @@ $c1 = comdat any
 @v1 = weak hidden global i32 0
 @v2 = weak protected global i32 0
 @v3 = weak hidden global i32 0
-@v4 = hidden global i32 1, comdat $c1
+@v4 = hidden global i32 1, comdat($c1)
 
 ; Aliases
 @a1 = weak hidden alias i32* @v1
diff --git a/test/Linker/alignment.ll b/test/Linker/alignment.ll
new file mode 100644
index 0000000..16cfe62
--- /dev/null
+++ b/test/Linker/alignment.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-link %p/alignment.ll %p/Inputs/alignment.ll -S | FileCheck %s
+; RUN: llvm-link %p/Inputs/alignment.ll %p/alignment.ll -S | FileCheck %s
+
+
+@A = weak global i32 7, align 4
+; CHECK-DAG: @A = global i32 7, align 8
+
+@B = weak global i32 7, align 8
+; CHECK-DAG: @B = global i32 7, align 4
+
+define weak void @C() align 4 {
+  ret void
+}
+; CHECK-DAG: define void @C() align 8 {
+
+define weak void @D() align 8 {
+  ret void
+}
+; CHECK-DAG: define void @D() align 4 {
+
+@E = common global i32 0, align 4
+; CHECK-DAG: @E = common global i32 0, align 8
diff --git a/test/Linker/apple-version.ll b/test/Linker/apple-version.ll
new file mode 100644
index 0000000..4dbc866
--- /dev/null
+++ b/test/Linker/apple-version.ll
@@ -0,0 +1,24 @@
+; RUN: llvm-link %s %S/Inputs/apple-version/1.ll -S -o - 2>%t.err | FileCheck %s -check-prefix=CHECK1
+; RUN: cat %t.err | FileCheck --check-prefix=WARN1 --allow-empty %s
+; RUN: llvm-link %s %S/Inputs/apple-version/2.ll -S -o - 2>%t.err | FileCheck %s -check-prefix=CHECK2
+; RUN: cat %t.err | FileCheck --check-prefix=WARN2 --allow-empty %s
+; RUN: llvm-link %s %S/Inputs/apple-version/3.ll -S -o /dev/null 2>%t.err
+; RUN: cat %t.err | FileCheck --check-prefix=WARN3 %s
+; RUN: llvm-link %s %S/Inputs/apple-version/4.ll -S -o /dev/null 2>%t.err
+; RUN: cat %t.err | FileCheck --check-prefix=WARN4 --allow-empty %s
+
+; Check that the triple that has the larger version number is chosen and no
+; warnings are issued when the Triples differ only in version numbers.
+
+; CHECK1: target triple = "x86_64-apple-macosx10.10.0"
+; WARN1-NOT: WARNING
+; CHECK2: target triple = "x86_64-apple-macosx10.9.0"
+; WARN2-NOT: WARNING
+
+; i386 and x86_64 map to different ArchType enums.
+; WARN3: WARNING: Linking two modules of different target triples
+
+; x86_64h and x86_64 map to the same ArchType enum.
+; WARN4-NOT: WARNING
+
+target triple = "x86_64-apple-macosx10.9.0"
diff --git a/test/Linker/comdat.ll b/test/Linker/comdat.ll
index 4d2aef7..2a2ec3b 100644
--- a/test/Linker/comdat.ll
+++ b/test/Linker/comdat.ll
@@ -3,30 +3,30 @@ target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 target triple = "i686-pc-windows-msvc"
 
 $foo = comdat largest
-@foo = global i32 42, comdat $foo
+@foo = global i32 42, comdat($foo)
 
-define i32 @bar() comdat $foo {
+define i32 @bar() comdat($foo) {
   ret i32 42
 }
 
 $qux = comdat largest
-@qux = global i64 12, comdat $qux
+@qux = global i64 12, comdat($qux)
 
-define i32 @baz() comdat $qux {
+define i32 @baz() comdat($qux) {
   ret i32 12
 }
 
 $any = comdat any
-@any = global i64 6, comdat $any
+@any = global i64 6, comdat($any)
 
 ; CHECK: $qux = comdat largest
 ; CHECK: $foo = comdat largest
 ; CHECK: $any = comdat any
 
-; CHECK: @qux = global i64 12, comdat $qux
-; CHECK: @any = global i64 6, comdat $any
-; CHECK: @foo = global i64 43, comdat $foo
+; CHECK: @qux = global i64 12, comdat{{$}}
+; CHECK: @any = global i64 6, comdat{{$}}
+; CHECK: @foo = global i64 43, comdat{{$}}
 ; CHECK-NOT: @in_unselected_group = global i32 13, comdat $qux
 
-; CHECK: define i32 @baz() comdat $qux
-; CHECK: define i32 @bar() comdat $foo
+; CHECK: define i32 @baz() comdat($qux)
+; CHECK: define i32 @bar() comdat($foo)
diff --git a/test/Linker/comdat2.ll b/test/Linker/comdat2.ll
index 60c3d7c..ba8115d 100644
--- a/test/Linker/comdat2.ll
+++ b/test/Linker/comdat2.ll
@@ -3,5 +3,5 @@ target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 target triple = "i686-pc-windows-msvc"
 
 $foo = comdat samesize
-@foo = global i32 42, comdat $foo
+@foo = global i32 42, comdat($foo)
 ; CHECK: Linking COMDATs named 'foo': invalid selection kinds!
diff --git a/test/Linker/comdat3.ll b/test/Linker/comdat3.ll
index f0d9a48..3b5db0a 100644
--- a/test/Linker/comdat3.ll
+++ b/test/Linker/comdat3.ll
@@ -1,5 +1,5 @@
 ; RUN: not llvm-link %s %p/Inputs/comdat2.ll -S -o - 2>&1 | FileCheck %s
 
 $foo = comdat largest
-@foo = global i32 43, comdat $foo
+@foo = global i32 43, comdat($foo)
 ; CHECK: Linking COMDATs named 'foo': can't do size dependent selection without DataLayout!
diff --git a/test/Linker/comdat4.ll b/test/Linker/comdat4.ll
index 50c1778..cf7ac5f 100644
--- a/test/Linker/comdat4.ll
+++ b/test/Linker/comdat4.ll
@@ -1,5 +1,5 @@
 ; RUN: not llvm-link %s %p/Inputs/comdat3.ll -S -o - 2>&1 | FileCheck %s
 
 $foo = comdat noduplicates
-@foo = global i64 43, comdat $foo
+@foo = global i64 43, comdat($foo)
 ; CHECK: Linking COMDATs named 'foo': noduplicates has been violated!
diff --git a/test/Linker/comdat5.ll b/test/Linker/comdat5.ll
index 011fb8c..759688e 100644
--- a/test/Linker/comdat5.ll
+++ b/test/Linker/comdat5.ll
@@ -3,5 +3,5 @@ target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 target triple = "i686-pc-windows-msvc"
 
 $foo = comdat samesize
-@foo = global i32 42, comdat $foo
+@foo = global i32 42, comdat($foo)
 ; CHECK: Linking COMDATs named 'foo': SameSize violated!
diff --git a/test/Linker/comdat6.ll b/test/Linker/comdat6.ll
index efa5dfb..b5360a7 100644
--- a/test/Linker/comdat6.ll
+++ b/test/Linker/comdat6.ll
@@ -1,13 +1,10 @@
-; RUN: llvm-link %s %p/Inputs/comdat5.ll -S -o - 2>&1 | FileCheck %s
-; RUN: llvm-link %p/Inputs/comdat5.ll %s -S -o - 2>&1 | FileCheck %s
+; RUN: llvm-link %s %p/Inputs/comdat5.ll -S -o - | FileCheck %s
+; RUN: llvm-link %p/Inputs/comdat5.ll %s -S -o - | FileCheck %s
 target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
-target triple = "i686-pc-windows-msvc"
 
-%struct.S = type { i32 (...)** }
+$foo = comdat largest
+@foo = linkonce_odr unnamed_addr constant [1 x i8*] [i8* bitcast (void ()* @bar to i8*)], comdat($foo)
 
-$"\01??_7S@@6B@" = comdat largest
-@"\01??_7S@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*] [i8* bitcast (void (%struct.S*, i32)* @"\01??_GS@@UAEPAXI@Z" to i8*)], comdat $"\01??_7S@@6B@"
+; CHECK: @foo = alias getelementptr inbounds ([2 x i8*]* @some_name, i32 0, i32 1)
 
-; CHECK: @"\01??_7S@@6B@" = alias getelementptr inbounds ([2 x i8*]* @some_name, i32 0, i32 1)
-
-declare x86_thiscallcc void @"\01??_GS@@UAEPAXI@Z"(%struct.S*, i32) unnamed_addr
+declare void @bar() unnamed_addr
diff --git a/test/Linker/comdat7.ll b/test/Linker/comdat7.ll
index d7e5e2d..ef938a7 100644
--- a/test/Linker/comdat7.ll
+++ b/test/Linker/comdat7.ll
@@ -2,7 +2,7 @@
 
 $c1 = comdat largest
 
-define void @c1() comdat $c1 {
+define void @c1() comdat($c1) {
   ret void
 }
 ; CHECK: GlobalVariable required for data dependent selection!
diff --git a/test/Linker/comdat8.ll b/test/Linker/comdat8.ll
index e6da583..5ca352a 100644
--- a/test/Linker/comdat8.ll
+++ b/test/Linker/comdat8.ll
@@ -2,7 +2,7 @@
 
 $c1 = comdat largest
 
-@some_name = private unnamed_addr constant i32 42, comdat $c1
+@some_name = private unnamed_addr constant i32 42, comdat($c1)
 @c1 = alias i8* inttoptr (i32 ptrtoint (i32* @some_name to i32) to i8*)
 
 ; CHECK: COMDAT key involves incomputable alias size.
diff --git a/test/Linker/comdat9.ll b/test/Linker/comdat9.ll
index eada8c6..f155a6e 100644
--- a/test/Linker/comdat9.ll
+++ b/test/Linker/comdat9.ll
@@ -1,7 +1,19 @@
-; RUN: llvm-link %s %p/Inputs/comdat9.ll -S -o - | FileCheck %s
+; RUN: llvm-link %s -S -o - | FileCheck %s
 
-; CHECK: $c = comdat any
-; CHECK: @a = alias void ()* @f
-; CHECK: define internal void @f() comdat $c {
-; CHECK:   ret void
-; CHECK: }
+$c = comdat any
+@a = alias void ()* @f
+define internal void @f() comdat($c) {
+  ret void
+}
+
+; CHECK-DAG: $c = comdat any
+; CHECK-DAG: @a = alias void ()* @f
+; CHECK-DAG: define internal void @f() comdat($c)
+
+$f2 = comdat largest
+define internal void @f2() comdat($f2) {
+  ret void
+}
+
+; CHECK-DAG: $f2 = comdat largest
+; CHECK-DAG: define internal void @f2() comdat {
diff --git a/test/Linker/constructor-comdat.ll b/test/Linker/constructor-comdat.ll
index 42e2d83..dfc8992 100644
--- a/test/Linker/constructor-comdat.ll
+++ b/test/Linker/constructor-comdat.ll
@@ -7,7 +7,7 @@ $_ZN3fooIiEC5Ev = comdat any
 @_ZN3fooIiEC1Ev = weak_odr alias void ()* @_ZN3fooIiEC2Ev
 ; CHECK: @_ZN3fooIiEC1Ev = weak_odr alias void ()* @_ZN3fooIiEC2Ev
 
-; CHECK: define weak_odr void @_ZN3fooIiEC2Ev() comdat $_ZN3fooIiEC5Ev {
-define weak_odr void @_ZN3fooIiEC2Ev() comdat $_ZN3fooIiEC5Ev {
+; CHECK: define weak_odr void @_ZN3fooIiEC2Ev() comdat($_ZN3fooIiEC5Ev) {
+define weak_odr void @_ZN3fooIiEC2Ev() comdat($_ZN3fooIiEC5Ev) {
   ret void
 }
diff --git a/test/Linker/debug-info-version-a.ll b/test/Linker/debug-info-version-a.ll
index 64a0583..352fcdc 100644
--- a/test/Linker/debug-info-version-a.ll
+++ b/test/Linker/debug-info-version-a.ll
@@ -4,13 +4,13 @@
 ; from the other file should be dropped.
 
 ; CHECK-NOT: metadata !{metadata !"b.c", metadata !""}
-; CHECK: metadata !{metadata !"a.c", metadata !""}
+; CHECK:  !"a.c", !""}
 ; CHECK-NOT: metadata !{metadata !"b.c", metadata !""}
 
 !llvm.module.flags = !{ !0 }
 !llvm.dbg.cu = !{!1}
 
-!0 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!1 = metadata !{metadata !"0x11\0012\00clang\001\00\000\00\000", metadata !2, metadata !3, metadata !3, metadata !3, null, null} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{metadata !"a.c", metadata !""}
-!3 = metadata !{}
+!0 = !{i32 2, !"Debug Info Version", i32 2}
+!1 = !{!"0x11\0012\00clang\001\00\000\00\000", !2, !3, !3, !3, null, null} ; [ DW_TAG_compile_unit ]
+!2 = !{!"a.c", !""}
+!3 = !{}
diff --git a/test/Linker/debug-info-version-b.ll b/test/Linker/debug-info-version-b.ll
index 515291f..e494420 100644
--- a/test/Linker/debug-info-version-b.ll
+++ b/test/Linker/debug-info-version-b.ll
@@ -4,7 +4,7 @@
 !llvm.module.flags = !{ !0 }
 !llvm.dbg.cu = !{!1}
 
-!0 = metadata !{i32 2, metadata !"Debug Info Version", i32 42}
-!1 = metadata !{metadata !"0x11\0012\00clang\000\00", metadata !"I AM UNEXPECTED!"} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{metadata !"b.c", metadata !""}
-!3 = metadata !{}
+!0 = !{i32 2, !"Debug Info Version", i32 42}
+!1 = !{!"0x11\0012\00clang\000\00", !"I AM UNEXPECTED!"} ; [ DW_TAG_compile_unit ]
+!2 = !{!"b.c", !""}
+!3 = !{}
diff --git a/test/Linker/distinct-cycles.ll b/test/Linker/distinct-cycles.ll
new file mode 100644
index 0000000..b9b496c
--- /dev/null
+++ b/test/Linker/distinct-cycles.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-link -o - -S %s | FileCheck %s
+; Crasher for PR22456: MapMetadata() should resolve all cycles.
+
+; CHECK: !named = !{!0}
+!named = !{!0}
+
+; CHECK: !0 = distinct !{!1}
+!0 = distinct !{!1}
+
+; CHECK-NEXT: !1 = !{!2}
+; CHECK-NEXT: !2 = !{!1}
+!1 = !{!2}
+!2 = !{!1}
diff --git a/test/Linker/distinct.ll b/test/Linker/distinct.ll
new file mode 100644
index 0000000..c8e5c89
--- /dev/null
+++ b/test/Linker/distinct.ll
@@ -0,0 +1,37 @@
+; RUN: llvm-link %s %S/Inputs/distinct.ll -o - -S | FileCheck %s
+
+; Test that distinct nodes from other modules remain distinct.  The @global
+; cases are the most interesting, since the operands actually need to be
+; remapped.
+
+; CHECK: @global = linkonce global i32 0
+@global = linkonce global i32 0
+
+; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !0, !1, !2, !9, !10, !11, !12, !13, !14}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
+
+; CHECK:      !0 = !{}
+; CHECK-NEXT: !1 = !{!0}
+; CHECK-NEXT: !2 = !{i32* @global}
+; CHECK-NEXT: !3 = distinct !{}
+; CHECK-NEXT: !4 = distinct !{!0}
+; CHECK-NEXT: !5 = distinct !{i32* @global}
+; CHECK-NEXT: !6 = !{!3}
+; CHECK-NEXT: !7 = !{!4}
+; CHECK-NEXT: !8 = !{!5}
+; CHECK-NEXT: !9 = distinct !{}
+; CHECK-NEXT: !10 = distinct !{!0}
+; CHECK-NEXT: !11 = distinct !{i32* @global}
+; CHECK-NEXT: !12 = !{!9}
+; CHECK-NEXT: !13 = !{!10}
+; CHECK-NEXT: !14 = !{!11}
+; CHECK-NOT:  !
+!0 = !{}
+!1 = !{!0}
+!2 = !{i32* @global}
+!3 = distinct !{}
+!4 = distinct !{!0}
+!5 = distinct !{i32* @global}
+!6 = !{!3}
+!7 = !{!4}
+!8 = !{!5}
diff --git a/test/Linker/linkmdnode.ll b/test/Linker/linkmdnode.ll
index 5f11580..2469c29 100644
--- a/test/Linker/linkmdnode.ll
+++ b/test/Linker/linkmdnode.ll
@@ -3,7 +3,7 @@
 ; RUN: llvm-link %t.bc %t2.bc
 
 
-!21 = metadata !{i32 42, metadata !"foobar"}
+!21 = !{i32 42, !"foobar"}
 
 declare i8 @llvm.something(metadata %a)
 define void @foo() {
diff --git a/test/Linker/linkmdnode2.ll b/test/Linker/linkmdnode2.ll
index a7d991a..5b2b58d 100644
--- a/test/Linker/linkmdnode2.ll
+++ b/test/Linker/linkmdnode2.ll
@@ -2,7 +2,7 @@
 ;
 ; RUN: true
 
-!22 = metadata !{i32 42, metadata !"foobar"}
+!22 = !{i32 42, !"foobar"}
 
 declare i8 @llvm.something(metadata %a)
 define void @foo1() {
@@ -18,5 +18,5 @@ define void @test() {
   ret void, !abc !0
 }
 
-!0 = metadata !{metadata !0, i32 42 }
+!0 = !{!0, i32 42 }
 
diff --git a/test/Linker/linknamedmdnode.ll b/test/Linker/linknamedmdnode.ll
index 73e7554..60f95a6 100644
--- a/test/Linker/linknamedmdnode.ll
+++ b/test/Linker/linknamedmdnode.ll
@@ -3,5 +3,5 @@
 ; RUN: llvm-link %t.bc %t2.bc -S | FileCheck %s
 ; CHECK: !llvm.stuff = !{!0, !1}
 
-!0 = metadata !{i32 42}
+!0 = !{i32 42}
 !llvm.stuff = !{!0}
diff --git a/test/Linker/linknamedmdnode2.ll b/test/Linker/linknamedmdnode2.ll
index d16f62a..ff4330d 100644
--- a/test/Linker/linknamedmdnode2.ll
+++ b/test/Linker/linknamedmdnode2.ll
@@ -2,5 +2,5 @@
 ;
 ; RUN: true
 
-!0 = metadata !{i32 41}
+!0 = !{i32 41}
 !llvm.stuff = !{!0}
diff --git a/test/Linker/lto-attributes.ll b/test/Linker/lto-attributes.ll
index 0dc78ad..e55029a 100644
--- a/test/Linker/lto-attributes.ll
+++ b/test/Linker/lto-attributes.ll
@@ -1,7 +1,10 @@
 ; RUN: llvm-link -S %s -o - | FileCheck %s
 
-; CHECK: @foo = private externally_initialized global i8* null
+; CHECK-DAG: @foo = private externally_initialized global i8* null
 @foo = private externally_initialized global i8* null
-; CHECK: @array = appending global [7 x i8] c"abcdefg", align 1
+
+@useFoo = global i8** @foo
+
+; CHECK-DAG: @array = appending global [7 x i8] c"abcdefg", align 1
 @array = appending global [7 x i8] c"abcdefg", align 1
 
diff --git a/test/Linker/mdlocation.ll b/test/Linker/mdlocation.ll
new file mode 100644
index 0000000..5ee366c
--- /dev/null
+++ b/test/Linker/mdlocation.ll
@@ -0,0 +1,34 @@
+; RUN: llvm-link %s %S/Inputs/mdlocation.ll -o - -S | FileCheck %s
+
+; Test that MDLocations are remapped properly.
+
+; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !0, !1, !2, !3, !10, !11, !12, !13, !14, !15}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9}
+
+; CHECK:      !0 = !{}
+; CHECK-NEXT: !1 = !MDLocation(line: 3, column: 7, scope: !0)
+; CHECK-NEXT: !2 = !MDLocation(line: 3, column: 7, scope: !0, inlinedAt: !1)
+; CHECK-NEXT: !3 = !MDLocation(line: 3, column: 7, scope: !0, inlinedAt: !2)
+; CHECK-NEXT: !4 = distinct !{}
+; CHECK-NEXT: !5 = !MDLocation(line: 3, column: 7, scope: !4)
+; CHECK-NEXT: !6 = !MDLocation(line: 3, column: 7, scope: !4, inlinedAt: !5)
+; CHECK-NEXT: !7 = !MDLocation(line: 3, column: 7, scope: !4, inlinedAt: !6)
+; CHECK-NEXT: !8 = distinct !MDLocation(line: 3, column: 7, scope: !0)
+; CHECK-NEXT: !9 = distinct !MDLocation(line: 3, column: 7, scope: !0, inlinedAt: !8)
+; CHECK-NEXT: !10 = distinct !{}
+; CHECK-NEXT: !11 = !MDLocation(line: 3, column: 7, scope: !10)
+; CHECK-NEXT: !12 = !MDLocation(line: 3, column: 7, scope: !10, inlinedAt: !11)
+; CHECK-NEXT: !13 = !MDLocation(line: 3, column: 7, scope: !10, inlinedAt: !12)
+; CHECK-NEXT: !14 = distinct !MDLocation(line: 3, column: 7, scope: !0)
+; CHECK-NEXT: !15 = distinct !MDLocation(line: 3, column: 7, scope: !0, inlinedAt: !14)
+!0 = !{} ; Use this as a scope.
+!1 = !MDLocation(line: 3, column: 7, scope: !0)
+!2 = !MDLocation(line: 3, column: 7, scope: !0, inlinedAt: !1)
+!3 = !MDLocation(line: 3, column: 7, scope: !0, inlinedAt: !2)
+!4 = distinct !{} ; Test actual remapping.
+!5 = !MDLocation(line: 3, column: 7, scope: !4)
+!6 = !MDLocation(line: 3, column: 7, scope: !4, inlinedAt: !5)
+!7 = !MDLocation(line: 3, column: 7, scope: !4, inlinedAt: !6)
+; Test distinct nodes.
+!8 = distinct !MDLocation(line: 3, column: 7, scope: !0)
+!9 = distinct !MDLocation(line: 3, column: 7, scope: !0, inlinedAt: !8)
diff --git a/test/Linker/metadata-a.ll b/test/Linker/metadata-a.ll
index 5a9d2e4..78715c6 100644
--- a/test/Linker/metadata-a.ll
+++ b/test/Linker/metadata-a.ll
@@ -1,15 +1,15 @@
 ; RUN: llvm-link %s %p/metadata-b.ll -S -o - | FileCheck %s
 
 ; CHECK: define void @foo(i32 %a)
-; CHECK: ret void, !attach !0, !also !{i32 %a}
+; CHECK: ret void, !attach !0
 ; CHECK: define void @goo(i32 %b)
-; CHECK: ret void, !attach !1, !and !{i32 %b}
-; CHECK: !0 = metadata !{i32 524334, void (i32)* @foo}
-; CHECK: !1 = metadata !{i32 524334, void (i32)* @goo}
+; CHECK: ret void, !attach !1
+; CHECK: !0 = !{i32 524334, void (i32)* @foo}
+; CHECK: !1 = !{i32 524334, void (i32)* @goo}
 
 define void @foo(i32 %a) nounwind {
 entry:
-  ret void, !attach !0, !also !{ i32 %a }
+  ret void, !attach !0
 }
 
-!0 = metadata !{i32 524334, void (i32)* @foo}
+!0 = !{i32 524334, void (i32)* @foo}
diff --git a/test/Linker/metadata-b.ll b/test/Linker/metadata-b.ll
index ef0270a..fd2fd28 100644
--- a/test/Linker/metadata-b.ll
+++ b/test/Linker/metadata-b.ll
@@ -3,7 +3,7 @@
 
 define void @goo(i32 %b) nounwind {
 entry:
-  ret void, !attach !0, !and !{ i32 %b }
+  ret void, !attach !0
 }
 
-!0 = metadata !{i32 524334, void (i32)* @goo}
+!0 = !{i32 524334, void (i32)* @goo}
diff --git a/test/Linker/module-flags-1-a.ll b/test/Linker/module-flags-1-a.ll
index 32f189c..fb3fcc1 100644
--- a/test/Linker/module-flags-1-a.ll
+++ b/test/Linker/module-flags-1-a.ll
@@ -2,15 +2,15 @@
 
 ; Test basic functionality of module flags.
 
-; CHECK: !0 = metadata !{i32 1, metadata !"foo", i32 37}
-; CHECK: !1 = metadata !{i32 2, metadata !"bar", i32 42}
-; CHECK: !2 = metadata !{i32 1, metadata !"mux", metadata !3}
-; CHECK: !3 = metadata !{metadata !"hello world", i32 927}
-; CHECK: !4 = metadata !{i32 1, metadata !"qux", i32 42}
+; CHECK: !0 = !{i32 1, !"foo", i32 37}
+; CHECK: !1 = !{i32 2, !"bar", i32 42}
+; CHECK: !2 = !{i32 1, !"mux", !3}
+; CHECK: !3 = !{!"hello world", i32 927}
+; CHECK: !4 = !{i32 1, !"qux", i32 42}
 ; CHECK: !llvm.module.flags = !{!0, !1, !2, !4}
 
-!0 = metadata !{ i32 1, metadata !"foo", i32 37 }
-!1 = metadata !{ i32 2, metadata !"bar", i32 42 }
-!2 = metadata !{ i32 1, metadata !"mux", metadata !{ metadata !"hello world", i32 927 } }
+!0 = !{ i32 1, !"foo", i32 37 }
+!1 = !{ i32 2, !"bar", i32 42 }
+!2 = !{ i32 1, !"mux", !{ !"hello world", i32 927 } }
 
 !llvm.module.flags = !{ !0, !1, !2 }
diff --git a/test/Linker/module-flags-1-b.ll b/test/Linker/module-flags-1-b.ll
index bf3f5e5..b3d1412 100644
--- a/test/Linker/module-flags-1-b.ll
+++ b/test/Linker/module-flags-1-b.ll
@@ -1,8 +1,8 @@
 ; This file is used with module-flags-1-a.ll
 ; RUN: true
 
-!0 = metadata !{ i32 1, metadata !"foo", i32 37 }
-!1 = metadata !{ i32 1, metadata !"qux", i32 42 }
-!2 = metadata !{ i32 1, metadata !"mux", metadata !{ metadata !"hello world", i32 927 } }
+!0 = !{ i32 1, !"foo", i32 37 }
+!1 = !{ i32 1, !"qux", i32 42 }
+!2 = !{ i32 1, !"mux", !{ !"hello world", i32 927 } }
 
 !llvm.module.flags = !{ !0, !1, !2 }
diff --git a/test/Linker/module-flags-2-a.ll b/test/Linker/module-flags-2-a.ll
index 3ae0288..5f03461 100644
--- a/test/Linker/module-flags-2-a.ll
+++ b/test/Linker/module-flags-2-a.ll
@@ -2,9 +2,9 @@
 
 ; Test the 'override' behavior.
 
-; CHECK: !0 = metadata !{i32 4, metadata !"foo", i32 37}
+; CHECK: !0 = !{i32 4, !"foo", i32 37}
 ; CHECK: !llvm.module.flags = !{!0}
 
-!0 = metadata !{ i32 1, metadata !"foo", i32 927 }
+!0 = !{ i32 1, !"foo", i32 927 }
 
 !llvm.module.flags = !{ !0 }
diff --git a/test/Linker/module-flags-2-b.ll b/test/Linker/module-flags-2-b.ll
index ab55e4b..93d7ac1 100644
--- a/test/Linker/module-flags-2-b.ll
+++ b/test/Linker/module-flags-2-b.ll
@@ -1,6 +1,6 @@
 ; This file is used with module-flags-2-a.ll
 ; RUN: true
 
-!0 = metadata !{ i32 4, metadata !"foo", i32 37 } ; Override the "foo" value.
+!0 = !{ i32 4, !"foo", i32 37 } ; Override the "foo" value.
 
 !llvm.module.flags = !{ !0 }
diff --git a/test/Linker/module-flags-3-a.ll b/test/Linker/module-flags-3-a.ll
index e7a720e..26a9391 100644
--- a/test/Linker/module-flags-3-a.ll
+++ b/test/Linker/module-flags-3-a.ll
@@ -2,13 +2,13 @@
 
 ; Test 'require' behavior.
 
-; CHECK: !0 = metadata !{i32 1, metadata !"foo", i32 37}
-; CHECK: !1 = metadata !{i32 1, metadata !"bar", i32 42}
-; CHECK: !2 = metadata !{i32 3, metadata !"foo", metadata !3}
-; CHECK: !3 = metadata !{metadata !"bar", i32 42}
+; CHECK: !0 = !{i32 1, !"foo", i32 37}
+; CHECK: !1 = !{i32 1, !"bar", i32 42}
+; CHECK: !2 = !{i32 3, !"foo", !3}
+; CHECK: !3 = !{!"bar", i32 42}
 ; CHECK: !llvm.module.flags = !{!0, !1, !2}
 
-!0 = metadata !{ i32 1, metadata !"foo", i32 37 }
-!1 = metadata !{ i32 1, metadata !"bar", i32 42 }
+!0 = !{ i32 1, !"foo", i32 37 }
+!1 = !{ i32 1, !"bar", i32 42 }
 
 !llvm.module.flags = !{ !0, !1 }
diff --git a/test/Linker/module-flags-3-b.ll b/test/Linker/module-flags-3-b.ll
index 76be802..2e6e529 100644
--- a/test/Linker/module-flags-3-b.ll
+++ b/test/Linker/module-flags-3-b.ll
@@ -1,8 +1,6 @@
 ; This file is used with module-flags-3-a.ll
 ; RUN: true
 
-!0 = metadata !{ i32 3, metadata !"foo",
-  metadata !{ metadata !"bar", i32 42 }
-}
+!0 = !{i32 3, !"foo", !{!"bar", i32 42}}
 
 !llvm.module.flags = !{ !0 }
diff --git a/test/Linker/module-flags-4-a.ll b/test/Linker/module-flags-4-a.ll
index a656c8b..14d850d 100644
--- a/test/Linker/module-flags-4-a.ll
+++ b/test/Linker/module-flags-4-a.ll
@@ -4,7 +4,7 @@
 
 ; CHECK: linking module flags 'bar': does not have the required value
 
-!0 = metadata !{ i32 1, metadata !"foo", i32 37 }
-!1 = metadata !{ i32 1, metadata !"bar", i32 927 }
+!0 = !{ i32 1, !"foo", i32 37 }
+!1 = !{ i32 1, !"bar", i32 927 }
 
 !llvm.module.flags = !{ !0, !1 }
diff --git a/test/Linker/module-flags-4-b.ll b/test/Linker/module-flags-4-b.ll
index 3a460bb..25aaf19 100644
--- a/test/Linker/module-flags-4-b.ll
+++ b/test/Linker/module-flags-4-b.ll
@@ -1,8 +1,6 @@
 ; This file is used with module-flags-4-a.ll
 ; RUN: true
 
-!0 = metadata !{ i32 3, metadata !"foo",
-  metadata !{ metadata !"bar", i32 42 }
-}
+!0 = !{i32 3, !"foo", !{!"bar", i32 42}}
 
 !llvm.module.flags = !{ !0 }
diff --git a/test/Linker/module-flags-5-a.ll b/test/Linker/module-flags-5-a.ll
index 8d625cd..00fb4d1 100644
--- a/test/Linker/module-flags-5-a.ll
+++ b/test/Linker/module-flags-5-a.ll
@@ -4,6 +4,6 @@
 
 ; CHECK: linking module flags 'foo': IDs have conflicting override values
 
-!0 = metadata !{ i32 4, metadata !"foo", i32 927 }
+!0 = !{ i32 4, !"foo", i32 927 }
 
 !llvm.module.flags = !{ !0 }
diff --git a/test/Linker/module-flags-5-b.ll b/test/Linker/module-flags-5-b.ll
index 1e99b20..7182317 100644
--- a/test/Linker/module-flags-5-b.ll
+++ b/test/Linker/module-flags-5-b.ll
@@ -1,6 +1,6 @@
 ; This file is used with module-flags-5-a.ll
 ; RUN: true
 
-!0 = metadata !{ i32 4, metadata !"foo", i32 37 } ; Override the "foo" value.
+!0 = !{ i32 4, !"foo", i32 37 } ; Override the "foo" value.
 
 !llvm.module.flags = !{ !0 }
diff --git a/test/Linker/module-flags-6-a.ll b/test/Linker/module-flags-6-a.ll
index 5329c43..e15fd34 100644
--- a/test/Linker/module-flags-6-a.ll
+++ b/test/Linker/module-flags-6-a.ll
@@ -4,6 +4,6 @@
 
 ; CHECK: linking module flags 'foo': IDs have conflicting values
 
-!0 = metadata !{ i32 1, metadata !"foo", i32 37 }
+!0 = !{ i32 1, !"foo", i32 37 }
 
 !llvm.module.flags = !{ !0 }
diff --git a/test/Linker/module-flags-6-b.ll b/test/Linker/module-flags-6-b.ll
index 2bc5a96..e277700 100644
--- a/test/Linker/module-flags-6-b.ll
+++ b/test/Linker/module-flags-6-b.ll
@@ -1,6 +1,6 @@
 ; This file is used with module-flags-6-a.ll
 ; RUN: true
 
-!0 = metadata !{ i32 1, metadata !"foo", i32 38 }
+!0 = !{ i32 1, !"foo", i32 38 }
 
 !llvm.module.flags = !{ !0 }
diff --git a/test/Linker/module-flags-7-a.ll b/test/Linker/module-flags-7-a.ll
index 976c8fe..46c9d26 100644
--- a/test/Linker/module-flags-7-a.ll
+++ b/test/Linker/module-flags-7-a.ll
@@ -4,6 +4,6 @@
 
 ; CHECK: linking module flags 'foo': IDs have conflicting behaviors
 
-!0 = metadata !{ i32 1, metadata !"foo", i32 37 }
+!0 = !{ i32 1, !"foo", i32 37 }
 
 !llvm.module.flags = !{ !0 }
diff --git a/test/Linker/module-flags-7-b.ll b/test/Linker/module-flags-7-b.ll
index 2bc7250..47d44e9 100644
--- a/test/Linker/module-flags-7-b.ll
+++ b/test/Linker/module-flags-7-b.ll
@@ -1,6 +1,6 @@
 ; This file is used with module-flags-7-a.ll
 ; RUN: true
 
-!0 = metadata !{ i32 2, metadata !"foo", i32 37 }
+!0 = !{ i32 2, !"foo", i32 37 }
 
 !llvm.module.flags = !{ !0 }
diff --git a/test/Linker/module-flags-8-a.ll b/test/Linker/module-flags-8-a.ll
index 146cae7..b65a50f 100644
--- a/test/Linker/module-flags-8-a.ll
+++ b/test/Linker/module-flags-8-a.ll
@@ -2,13 +2,13 @@
 
 ; Test append-type module flags.
 
-; CHECK: !0 = metadata !{i32 5, metadata !"flag-0", metadata !1}
-; CHECK: !1 = metadata !{i32 0, i32 0, i32 1}
-; CHECK: !2 = metadata !{i32 6, metadata !"flag-1", metadata !3}
-; CHECK: !3 = metadata !{i32 0, i32 1, i32 2}
+; CHECK: !0 = !{i32 5, !"flag-0", !1}
+; CHECK: !1 = !{i32 0, i32 0, i32 1}
+; CHECK: !2 = !{i32 6, !"flag-1", !3}
+; CHECK: !3 = !{i32 0, i32 1, i32 2}
 ; CHECK: !llvm.module.flags = !{!0, !2}
 
-!0 = metadata !{ i32 5, metadata !"flag-0", metadata !{ i32 0 } }
-!1 = metadata !{ i32 6, metadata !"flag-1", metadata !{ i32 0, i32 1 } }
+!0 = !{ i32 5, !"flag-0", !{ i32 0 } }
+!1 = !{ i32 6, !"flag-1", !{ i32 0, i32 1 } }
 
 !llvm.module.flags = !{ !0, !1 }
diff --git a/test/Linker/module-flags-8-b.ll b/test/Linker/module-flags-8-b.ll
index 08f9bc4..afd737a 100644
--- a/test/Linker/module-flags-8-b.ll
+++ b/test/Linker/module-flags-8-b.ll
@@ -1,7 +1,7 @@
 ; This file is used with module-flags-6-a.ll
 ; RUN: true
 
-!0 = metadata !{ i32 5, metadata !"flag-0", metadata !{ i32 0, i32 1 } }
-!1 = metadata !{ i32 6, metadata !"flag-1", metadata !{ i32 1, i32 2 } }
+!0 = !{ i32 5, !"flag-0", !{ i32 0, i32 1 } }
+!1 = !{ i32 6, !"flag-1", !{ i32 1, i32 2 } }
 
 !llvm.module.flags = !{ !0, !1 }
diff --git a/test/Linker/module-flags-dont-change-others.ll b/test/Linker/module-flags-dont-change-others.ll
new file mode 100644
index 0000000..4cc9f39
--- /dev/null
+++ b/test/Linker/module-flags-dont-change-others.ll
@@ -0,0 +1,26 @@
+; RUN: llvm-link %s %S/Inputs/module-flags-dont-change-others.ll -S -o - | FileCheck %s
+
+; Test that module-flag linking doesn't change other metadata.  In particular,
+; !named should still point at the unmodified tuples (!3, !4, and !5) that
+; happen to also serve as module flags.
+
+; CHECK: !named = !{!0, !1, !2, !3, !4, !5}
+; CHECK: !llvm.module.flags = !{!6, !7, !8}
+!named = !{!0, !1, !2, !3, !4, !5}
+!llvm.module.flags = !{!3, !4, !5}
+
+; CHECK: !0 = !{}
+; CHECK: !1 = !{!0}
+; CHECK: !2 = !{!0, !1}
+; CHECK: !3 = !{i32 1, !"foo", i32 927}
+; CHECK: !4 = !{i32 5, !"bar", !0}
+; CHECK: !5 = !{i32 6, !"baz", !1}
+; CHECK: !6 = !{i32 4, !"foo", i32 37}
+; CHECK: !7 = !{i32 5, !"bar", !1}
+; CHECK: !8 = !{i32 6, !"baz", !2}
+!0 = !{}
+!1 = !{!0}
+!2 = !{!0, !1}
+!3 = !{i32 1, !"foo", i32 927}
+!4 = !{i32 5, !"bar", !0}
+!5 = !{i32 6, !"baz", !1}
diff --git a/test/Linker/module-flags-pic-1-a.ll b/test/Linker/module-flags-pic-1-a.ll
index bc4da95..ea93335 100644
--- a/test/Linker/module-flags-pic-1-a.ll
+++ b/test/Linker/module-flags-pic-1-a.ll
@@ -2,8 +2,8 @@
 
 ; test linking modules with specified and default PIC levels
 
-!0 = metadata !{ i32 1, metadata !"PIC Level", i32 1 }
+!0 = !{ i32 1, !"PIC Level", i32 1 }
 
 !llvm.module.flags = !{!0}
 ; CHECK: !llvm.module.flags = !{!0}
-; CHECK: !0 = metadata !{i32 1, metadata !"PIC Level", i32 1}
+; CHECK: !0 = !{i32 1, !"PIC Level", i32 1}
diff --git a/test/Linker/module-flags-pic-2-a.ll b/test/Linker/module-flags-pic-2-a.ll
index 3ff9c8f..e09af6b 100644
--- a/test/Linker/module-flags-pic-2-a.ll
+++ b/test/Linker/module-flags-pic-2-a.ll
@@ -3,7 +3,7 @@
 
 ; test linking modules with two different PIC levels
 
-!0 = metadata !{ i32 1, metadata !"PIC Level", i32 1 }
+!0 = !{ i32 1, !"PIC Level", i32 1 }
 
 !llvm.module.flags = !{!0}
 
diff --git a/test/Linker/opaque.ll b/test/Linker/opaque.ll
new file mode 100644
index 0000000..1ba878c
--- /dev/null
+++ b/test/Linker/opaque.ll
@@ -0,0 +1,21 @@
+; RUN: llvm-link %p/opaque.ll %p/Inputs/opaque.ll -S -o - | FileCheck %s
+
+; CHECK-DAG: %A =   type {}
+; CHECK-DAG: %B =   type { %C, %C, %B* }
+; CHECK-DAG: %B.1 = type { %D, %E, %B.1* }
+; CHECK-DAG: %C =   type { %A }
+; CHECK-DAG: %D =   type { %E }
+; CHECK-DAG: %E =   type opaque
+
+; CHECK-DAG: @g1 = external global %B
+; CHECK-DAG: @g2 = external global %A
+; CHECK-DAG: @g3 = external global %B.1
+
+; CHECK-DAG: getelementptr %A* null, i32 0
+
+%A = type opaque
+%B = type { %C, %C, %B* }
+
+%C = type { %A }
+
+@g1 = external global %B
diff --git a/test/Linker/pr21374.ll b/test/Linker/pr21374.ll
new file mode 100644
index 0000000..d777971
--- /dev/null
+++ b/test/Linker/pr21374.ll
@@ -0,0 +1,20 @@
+; RUN: llvm-link -S -o - %p/pr21374.ll %p/Inputs/pr21374.ll | FileCheck %s
+; RUN: llvm-link -S -o - %p/Inputs/pr21374.ll %p/pr21374.ll | FileCheck %s
+
+; RUN: llvm-as -o %t1.bc %p/pr21374.ll
+; RUN: llvm-as -o %t2.bc %p/Inputs/pr21374.ll
+
+; RUN: llvm-link -S -o - %t1.bc %t2.bc | FileCheck %s
+; RUN: llvm-link -S -o - %t2.bc %t1.bc | FileCheck %s
+
+; Test that we get the same result with or without lazy loading.
+
+; CHECK: %foo = type { i8* }
+; CHECK-DAG: bitcast i32* null to %foo*
+; CHECK-DAG: define void @g(%foo* %x)
+
+%foo = type { i8* }
+define void @f() {
+  bitcast i32* null to %foo*
+  ret void
+}
diff --git a/test/Linker/pr21494.ll b/test/Linker/pr21494.ll
new file mode 100644
index 0000000..8a17233
--- /dev/null
+++ b/test/Linker/pr21494.ll
@@ -0,0 +1,23 @@
+; RUN: llvm-link %s -S -o - | FileCheck %s
+
+@g1 = private global i8 0
+; CHECK-NOT: @g1
+
+@g2 = linkonce_odr global i8 0
+; CHECK-NOT: @g2
+
+@a1 = private alias i8* @g1
+; CHECK-NOT: @a1
+
+@a2 = linkonce_odr alias i8* @g2
+; CHECK-NOT: @a2
+
+define private void @f1() {
+  ret void
+}
+; CHECK-NOT: @f1
+
+define linkonce_odr void @f2() {
+  ret void
+}
+; CHECK-NOT: @f2
diff --git a/test/Linker/prefixdata.ll b/test/Linker/prefixdata.ll
deleted file mode 100644
index 1f11dc7..0000000
--- a/test/Linker/prefixdata.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: echo > %t.ll
-; RUN: llvm-link %t.ll %s -S -o - | FileCheck %s
-
-@i = linkonce_odr global i32 1
-
-; CHECK: define void @f() prefix i32* @i
-define void @f() prefix i32* @i {
-  ret void
-}
diff --git a/test/Linker/prologuedata.ll b/test/Linker/prologuedata.ll
new file mode 100644
index 0000000..70204fd
--- /dev/null
+++ b/test/Linker/prologuedata.ll
@@ -0,0 +1,21 @@
+; RUN: llvm-link %s -S -o - | FileCheck %s
+
+@g1 = global void()* @f2
+; CHECK: @g1 = global void ()* @f2
+
+@p1 = global i8 42
+; CHECK: @p1 = global i8 42
+
+@p2 = internal global i8 43
+; CHECK: @p2 = internal global i8 43
+
+define void @f1() prologue i8* @p1 {
+  ret void
+}
+; CHECK: define void @f1() prologue i8* @p1 {
+
+define internal void @f2() prologue i8* @p2 {
+  ret void
+}
+
+; CHECK: define internal void @f2() prologue i8* @p2 {
diff --git a/test/Linker/replaced-function-matches-first-subprogram.ll b/test/Linker/replaced-function-matches-first-subprogram.ll
new file mode 100644
index 0000000..c0ec5f3
--- /dev/null
+++ b/test/Linker/replaced-function-matches-first-subprogram.ll
@@ -0,0 +1,75 @@
+; RUN: llvm-link %s %S/Inputs/replaced-function-matches-first-subprogram.ll -S | FileCheck %s
+
+; Generated from C++ source:
+;
+; // repro/t.h
+; template <class T> struct Class {
+;   int foo() { return 0; }
+; };
+; // repro/d1/t1.cpp
+; #include "t.h"
+; int foo() { return Class<int>().foo(); }
+; // repro/d2/t2.cpp
+; #include "t.h"
+; template struct Class<int>;
+
+%struct.Class = type { i8 }
+
+define i32 @_Z3foov() {
+entry:
+  %tmp = alloca %struct.Class, align 1
+  %call = call i32 @_ZN5ClassIiE3fooEv(%struct.Class* %tmp), !dbg !14
+  ret i32 %call, !dbg !14
+}
+
+; CHECK: define weak_odr i32 @_ZN5ClassIiE3fooEv(%struct.Class* %this){{.*}}{
+; CHECK-NOT: }
+; CHECK: !dbg ![[LOC:[0-9]+]]
+define linkonce_odr i32 @_ZN5ClassIiE3fooEv(%struct.Class* %this) align 2 {
+entry:
+  %this.addr = alloca %struct.Class*, align 8
+  store %struct.Class* %this, %struct.Class** %this.addr, align 8
+  %this1 = load %struct.Class** %this.addr
+  ret i32 0, !dbg !15
+}
+
+; CHECK: !llvm.dbg.cu = !{![[CU1:[0-9]+]], ![[CU2:[0-9]+]]}
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10, !11, !12}
+!llvm.ident = !{!13}
+
+; Extract out the list of subprograms from each compile unit.
+; CHECK-DAG: ![[CU1]] = !{!"0x11{{[^"]+}}", {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, ![[SPs1:[0-9]+]],
+; CHECK-DAG: ![[CU2]] = !{!"0x11{{[^"]+}}", {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, ![[SPs2:[0-9]+]],
+!0 = !{!"0x11\004\00clang version 3.6.0 (trunk 224193) (llvm/trunk 224197)\000\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/Users/dexonsmith/data/llvm/staging/test/Linker/repro/d1/t1.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"t1.cpp", !"/Users/dexonsmith/data/llvm/staging/test/Linker/repro/d1"}
+!2 = !{}
+
+; Extract out each compile unit's single subprogram.  The replaced subprogram
+; should be dropped by the first compile unit.
+; CHECK-DAG: ![[SPs1]] = !{![[SP1:[0-9]+]]}
+; CHECK-DAG: ![[SPs2]] = !{![[SP2:[0-9]+]]}
+!3 = !{!4, !7}
+!4 = !{!"0x2e\00foo\00foo\00\002\000\001\000\000\00256\000\002", !1, !5, !6, null, i32 ()* @_Z3foov, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
+!5 = !{!"0x29", !1}    ; [ DW_TAG_file_type ] [/Users/dexonsmith/data/llvm/staging/test/Linker/repro/d1/t1.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+
+; Extract out the file from the replaced subprogram.  Confirm that each
+; subprogram is pointing at the correct function.
+; CHECK-DAG: ![[SP1]] = !{!"0x2e{{[^"]+}}", {{.*}}, i32 ()* @_Z3foov,
+; CHECK-DAG: ![[SP2]] = !{!"0x2e{{[^"]+}}", ![[FILE:[0-9]+]], {{.*}}, i32 (%struct.Class*)* @_ZN5ClassIiE3fooEv,
+!7 = !{!"0x2e\00foo\00foo\00\002\000\001\000\000\00256\000\002", !8, !9, !6, null, i32 (%struct.Class*)* @_ZN5ClassIiE3fooEv, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
+
+; The new subprogram should be pointing at the new directory.
+; CHECK-DAG: ![[FILE]] = !{!"../t.h", !"/Users/dexonsmith/data/llvm/staging/test/Linker/repro/d2"}
+!8 = !{!"../t.h", !"/Users/dexonsmith/data/llvm/staging/test/Linker/repro/d1"}
+!9 = !{!"0x29", !8}    ; [ DW_TAG_file_type ] [/Users/dexonsmith/data/llvm/staging/test/Linker/repro/d1/../t.h]
+!10 = !{i32 2, !"Dwarf Version", i32 2}
+!11 = !{i32 2, !"Debug Info Version", i32 2}
+!12 = !{i32 1, !"PIC Level", i32 2}
+!13 = !{!"clang version 3.6.0 (trunk 224193) (llvm/trunk 224197)"}
+!14 = !MDLocation(line: 2, column: 20, scope: !4)
+
+; The same subprogram should be pointed to by inside the !dbg reference.
+; CHECK: ![[LOC]] = !MDLocation(line: 2, column: 15, scope: ![[SP2]])
+!15 = !MDLocation(line: 2, column: 15, scope: !7)
diff --git a/test/Linker/targettriple.ll b/test/Linker/targettriple.ll
index c544a14..8778706 100644
--- a/test/Linker/targettriple.ll
+++ b/test/Linker/targettriple.ll
@@ -1,17 +1,23 @@
 ; REQUIRES: shell
 ; RUN: llvm-link %s %S/Inputs/targettriple-a.ll -S -o - 2>%t.a.err | FileCheck %s
-; RUN: (echo foo ;cat %t.a.err) | FileCheck --check-prefix=WARN-A %s
+; RUN: cat %t.a.err | FileCheck --check-prefix=WARN-A %s --allow-empty
 
 ; RUN: llvm-link %s %S/Inputs/targettriple-b.ll -S -o - 2>%t.b.err | FileCheck %s
 ; RUN: cat %t.b.err | FileCheck --check-prefix=WARN-B %s
+; RUN: llvm-link %s %S/Inputs/targettriple-c.ll -S -o - 2>%t.c.err | FileCheck %s
+; RUN: cat %t.c.err | FileCheck --check-prefix=WARN-C %s --allow-empty
 
 ; RUN: llvm-link -suppress-warnings %s %S/Inputs/targettriple-b.ll -S -o - 2>%t.no-warn.err | FileCheck %s
-; RUN: (echo foo ;cat %t.no-warn.err) | FileCheck --check-prefix=WARN-A %s
+; RUN: cat %t.no-warn.err | FileCheck --check-prefix=WARN-A %s --allow-empty
 
-target triple = "e"
+target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK: target triple = "e"
+; CHECK: target triple = "x86_64-unknown-linux-gnu"
 
 ; WARN-A-NOT: WARNING
 
+; i386 and x86_64 map to different ArchType enums.
 ; WARN-B: WARNING: Linking two modules of different target triples:
+
+; x86_64h and x86_64 map to the same ArchType enum.
+; WARN-C-NOT: WARNING
diff --git a/test/Linker/testlink.ll b/test/Linker/testlink.ll
new file mode 100644
index 0000000..aad4b9b
--- /dev/null
+++ b/test/Linker/testlink.ll
@@ -0,0 +1,104 @@
+; RUN: llvm-link %s %S/Inputs/testlink.ll -S | FileCheck %s
+
+; CHECK: %Ty2 = type { %Ty1* }
+; CHECK: %Ty1 = type { %Ty2* }
+%Ty1 = type opaque
+%Ty2 = type { %Ty1* }
+
+; CHECK: %intlist = type { %intlist*, i32 }
+%intlist = type { %intlist*, i32 }
+
+; The uses of intlist in the other file should be remapped.
+; CHECK-NOT: {{%intlist.[0-9]}}
+
+; CHECK: %VecSize = type { <5 x i32> }
+; CHECK: %VecSize.{{[0-9]}} = type { <10 x i32> }
+%VecSize = type { <5 x i32> }
+
+%Struct1 = type opaque
+@S1GV = external global %Struct1*
+
+
+@GVTy1 = external global %Ty1*
+@GVTy2 = global %Ty2* null
+
+
+; This should stay the same
+; CHECK-DAG: @MyIntList = global %intlist { %intlist* null, i32 17 }
+@MyIntList = global %intlist { %intlist* null, i32 17 }
+
+
+; Nothing to link here.
+
+; CHECK-DAG: @0 = external global i32
+@0 = external global i32
+; CHECK-DAG: @Inte = global i32 1
+@Inte = global i32 1
+
+; Intern1 is intern in both files, rename testlink2's.
+; CHECK-DAG: @Intern1 = internal constant i32 42
+@Intern1 = internal constant i32 42
+
+@UseIntern1 = global i32* @Intern1
+
+; This should get renamed since there is a definition that is non-internal in
+; the other module.
+; CHECK-DAG: @Intern2{{[0-9]+}} = internal constant i32 792
+@Intern2 = internal constant i32 792
+
+@UseIntern2 = global i32* @Intern2
+
+; CHECK-DAG: @MyVarPtr = linkonce global { i32* } { i32* @MyVar }
+@MyVarPtr = linkonce global { i32* } { i32* @MyVar }
+
+@UseMyVarPtr = global { i32* }* @MyVarPtr
+
+; CHECK-DAG: @MyVar = global i32 4
+@MyVar = external global i32
+
+; Take value from other module.
+; CHECK-DAG: AConst = constant i32 1234
+@AConst = linkonce constant i32 123
+
+; Renamed version of Intern1.
+; CHECK-DAG: @Intern1{{[0-9]+}} = internal constant i32 52
+
+
+; Globals linked from testlink2.
+; CHECK-DAG: @Intern2 = constant i32 12345
+
+; CHECK-DAG: @MyIntListPtr = constant
+; CHECK-DAG: @1 = constant i32 412
+
+
+declare i32 @foo(i32)
+
+declare void @print(i32)
+
+define void @main() {
+  %v1 = load i32* @MyVar
+  call void @print(i32 %v1)
+  %idx = getelementptr %intlist* @MyIntList, i64 0, i32 1
+  %v2 = load i32* %idx
+  call void @print(i32 %v2)
+  %1 = call i32 @foo(i32 5)
+  %v3 = load i32* @MyVar
+  call void @print(i32 %v3)
+  %v4 = load i32* %idx
+  call void @print(i32 %v4)
+  ret void
+}
+
+define internal void @testintern() {
+  ret void
+}
+
+define internal void @Testintern() {
+  ret void
+}
+
+define void @testIntern() {
+  ret void
+}
+
+declare void @VecSizeCrash(%VecSize)
diff --git a/test/Linker/testlink1.ll b/test/Linker/testlink1.ll
deleted file mode 100644
index 6ba6fd5..0000000
--- a/test/Linker/testlink1.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; RUN: llvm-as < %s > %t.bc
-; RUN: llvm-as < %p/testlink2.ll > %t2.bc
-; RUN: llvm-link %t.bc %t2.bc -S | FileCheck %s
-
-; CHECK: %Ty2 = type { %Ty1* }
-; CHECK: %Ty1 = type { %Ty2* }
-%Ty1 = type opaque
-%Ty2 = type { %Ty1* }
-
-; CHECK: %intlist = type { %intlist*, i32 }
-%intlist = type { %intlist*, i32 }
-
-; The uses of intlist in the other file should be remapped.
-; CHECK-NOT: {{%intlist.[0-9]}}
-
-; CHECK: %VecSize = type { <5 x i32> }
-; CHECK: %VecSize.{{[0-9]}} = type { <10 x i32> }
-%VecSize = type { <5 x i32> }
-
-%Struct1 = type opaque
-@S1GV = external global %Struct1*
-
-
-@GVTy1 = external global %Ty1*
-@GVTy2 = global %Ty2* null
-
-
-; This should stay the same
-; CHECK: @MyIntList = global %intlist { %intlist* null, i32 17 }
-@MyIntList = global %intlist { %intlist* null, i32 17 }
-
-
-; Nothing to link here.
-
-; CHECK: @0 = external global i32
-@0 = external global i32
-; CHECK: @Inte = global i32 1
-@Inte = global i32 1
-
-; Intern1 is intern in both files, rename testlink2's.
-; CHECK: @Intern1 = internal constant i32 42
-@Intern1 = internal constant i32 42
-
-; This should get renamed since there is a definition that is non-internal in
-; the other module.
-; CHECK: @Intern2{{[0-9]+}} = internal constant i32 792
-@Intern2 = internal constant i32 792
-
-
-; CHECK: @MyVarPtr = linkonce global { i32* } { i32* @MyVar }
-@MyVarPtr = linkonce global { i32* } { i32* @MyVar }
-
-; CHECK: @MyVar = global i32 4
-@MyVar = external global i32
-
-; Take value from other module.
-; CHECK: AConst = constant i32 1234
-@AConst = linkonce constant i32 123
-
-; Renamed version of Intern1.
-; CHECK: @Intern1{{[0-9]+}} = internal constant i32 52
-
-
-; Globals linked from testlink2.
-; CHECK: @Intern2 = constant i32 12345
-
-; CHECK: @MyIntListPtr = constant 
-; CHECK: @1 = constant i32 412
-
-
-declare i32 @foo(i32)
-
-declare void @print(i32)
-
-define void @main() {
-  %v1 = load i32* @MyVar
-  call void @print(i32 %v1)
-  %idx = getelementptr %intlist* @MyIntList, i64 0, i32 1
-  %v2 = load i32* %idx
-  call void @print(i32 %v2)
-  %1 = call i32 @foo(i32 5)
-  %v3 = load i32* @MyVar
-  call void @print(i32 %v3)
-  %v4 = load i32* %idx
-  call void @print(i32 %v4)
-  ret void
-}
-
-define internal void @testintern() {
-  ret void
-}
-
-define internal void @Testintern() {
-  ret void
-}
-
-define void @testIntern() {
-  ret void
-}
-
-declare void @VecSizeCrash(%VecSize)
diff --git a/test/Linker/testlink2.ll b/test/Linker/testlink2.ll
deleted file mode 100644
index ff8e529..0000000
--- a/test/Linker/testlink2.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; This file is used by testlink1.ll, so it doesn't actually do anything itself
-;
-; RUN: true
-
-%intlist = type { %intlist*, i32 }
-
-
-%Ty1 = type { %Ty2* }
-%Ty2 = type opaque
-
-%VecSize = type { <10 x i32> }
-
-@GVTy1 = global %Ty1* null
-@GVTy2 = external global %Ty2*
-
-
-@MyVar = global i32 4
-@MyIntList = external global %intlist
-@AConst = constant i32 1234
-
-;; Intern in both testlink[12].ll
-@Intern1 = internal constant i32 52
-
-;; Intern in one but not in other
-@Intern2 = constant i32 12345
-
-@MyIntListPtr = constant { %intlist* } { %intlist* @MyIntList }
-@MyVarPtr = linkonce global { i32* } { i32* @MyVar }
-@0 = constant i32 412
-
-; Provides definition of Struct1 and of S1GV.
-%Struct1 = type { i32 }
-@S1GV = global %Struct1* null
-
-define i32 @foo(i32 %blah) {
-  store i32 %blah, i32* @MyVar
-  %idx = getelementptr %intlist* @MyIntList, i64 0, i32 1
-  store i32 12, i32* %idx
-  %ack = load i32* @0
-  %fzo = add i32 %ack, %blah
-  ret i32 %fzo
-}
-
-declare void @unimp(float, double)
-
-define internal void @testintern() {
-  ret void
-}
-
-define void @Testintern() {
-  ret void
-}
-
-define internal void @testIntern() {
-  ret void
-}
-
-declare void @VecSizeCrash1(%VecSize)
diff --git a/test/Linker/type-unique-alias.ll b/test/Linker/type-unique-alias.ll
new file mode 100644
index 0000000..e43450f
--- /dev/null
+++ b/test/Linker/type-unique-alias.ll
@@ -0,0 +1,10 @@
+; RUN: llvm-link -S %s %p/Inputs/type-unique-alias.ll | FileCheck %s
+
+%t = type { i8 }
+
+@g = global %t zeroinitializer
+@a = weak alias %t* @g
+
+; CHECK: @g = global %t zeroinitializer
+; CHECK: @g2 = global %t zeroinitializer
+; CHECK: @a = weak alias %t* @g
diff --git a/test/Linker/type-unique-dst-types.ll b/test/Linker/type-unique-dst-types.ll
new file mode 100644
index 0000000..30aecbb
--- /dev/null
+++ b/test/Linker/type-unique-dst-types.ll
@@ -0,0 +1,19 @@
+; RUN: llvm-link %p/type-unique-dst-types.ll \
+; RUN:           %p/Inputs/type-unique-dst-types2.ll \
+; RUN:           %p/Inputs/type-unique-dst-types3.ll -S -o - | FileCheck %s
+
+; This tests the importance of keeping track of which types are part of the
+; destination module.
+; When the second input is merged in, the context gets an unused A.11. When
+; the third module is then merged, we should pretend it doesn't exist.
+
+; CHECK: %A = type { %B }
+; CHECK-NEXT: %B = type { i8 }
+
+; CHECK: @g3 = external global %A
+; CHECK: @g1 = external global %A
+; CHECK: @g2 = external global %A
+
+%A = type { %B }
+%B = type { i8 }
+@g3 = external global %A
diff --git a/test/Linker/type-unique-name.ll b/test/Linker/type-unique-name.ll
new file mode 100644
index 0000000..191b6dd
--- /dev/null
+++ b/test/Linker/type-unique-name.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-link -S %s %p/Inputs/type-unique-name.ll | FileCheck %s
+
+; Test that we keep the type name
+; CHECK: %abc = type { i8 }
+
+%abc = type opaque
+
+declare %abc* @f()
+
+define %abc* @g() {
+  %x = call %abc* @f()
+  ret %abc* %x
+}
diff --git a/test/Linker/type-unique-odr-a.ll b/test/Linker/type-unique-odr-a.ll
index e17cd2b..900b035 100644
--- a/test/Linker/type-unique-odr-a.ll
+++ b/test/Linker/type-unique-odr-a.ll
@@ -59,7 +59,7 @@ entry:
 define internal void @_ZL3barv() #0 {
 entry:
   %a = alloca %class.A, align 4
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata %class.A* %a, metadata !24, metadata !{!"0x102"}), !dbg !25
   ret void, !dbg !26
 }
 
@@ -73,30 +73,30 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!20, !21}
 !llvm.ident = !{!22}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<unknown>", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2\00A\001\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !"type-unique-odr-a.cpp", metadata !""}
-!6 = metadata !{metadata !7, metadata !9}
-!7 = metadata !{metadata !"0xd\00data\002\0032\0032\000\001", metadata !5, metadata !"_ZTS1A", metadata !8} ; [ DW_TAG_member ] [data] [line 2, size 32, align 32, offset 0] [private] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\004\000\000\000\006\00258\000\004", metadata !5, metadata !"_ZTS1A", metadata !10, null, null, null, i32 0, metadata !13} ; [ DW_TAG_subprogram ] [line 4] [protected] [getFoo]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{null, metadata !12}
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!13 = metadata !{i32 786468}
-!14 = metadata !{metadata !15, metadata !19}
-!15 = metadata !{metadata !"0x2e\00baz\00baz\00_Z3bazv\0011\000\001\000\006\00256\000\0011", metadata !5, metadata !16, metadata !17, null, void ()* @_Z3bazv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [baz]
-!16 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [type-unique-odr-a.cpp]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{null}
-!19 = metadata !{metadata !"0x2e\00bar\00bar\00_ZL3barv\007\001\001\000\006\00256\000\007", metadata !5, metadata !16, metadata !17, null, void ()* @_ZL3barv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [local] [def] [bar]
-!20 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!22 = metadata !{metadata !"clang version 3.5.0 "}
-!23 = metadata !{i32 11, i32 0, metadata !15, null}
-!24 = metadata !{metadata !"0x100\00a\008\000", metadata !19, metadata !16, metadata !"_ZTS1A"} ; [ DW_TAG_auto_variable ] [a] [line 8]
-!25 = metadata !{i32 8, i32 0, metadata !19, null}
-!26 = metadata !{i32 9, i32 0, metadata !19, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !3, !14, !2, !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
+!1 = !{!"<unknown>", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2\00A\001\0032\0032\000\000\000", !5, null, null, !6, null, null, !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!"type-unique-odr-a.cpp", !""}
+!6 = !{!7, !9}
+!7 = !{!"0xd\00data\002\0032\0032\000\001", !5, !"_ZTS1A", !8} ; [ DW_TAG_member ] [data] [line 2, size 32, align 32, offset 0] [private] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\004\000\000\000\006\00258\000\004", !5, !"_ZTS1A", !10, null, null, null, i32 0, !13} ; [ DW_TAG_subprogram ] [line 4] [protected] [getFoo]
+!10 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{null, !12}
+!12 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!13 = !{i32 786468}
+!14 = !{!15, !19}
+!15 = !{!"0x2e\00baz\00baz\00_Z3bazv\0011\000\001\000\006\00256\000\0011", !5, !16, !17, null, void ()* @_Z3bazv, null, null, !2} ; [ DW_TAG_subprogram ] [line 11] [def] [baz]
+!16 = !{!"0x29", !5}         ; [ DW_TAG_file_type ] [type-unique-odr-a.cpp]
+!17 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{null}
+!19 = !{!"0x2e\00bar\00bar\00_ZL3barv\007\001\001\000\006\00256\000\007", !5, !16, !17, null, void ()* @_ZL3barv, null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [local] [def] [bar]
+!20 = !{i32 2, !"Dwarf Version", i32 4}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
+!22 = !{!"clang version 3.5.0 "}
+!23 = !MDLocation(line: 11, scope: !15)
+!24 = !{!"0x100\00a\008\000", !19, !16, !"_ZTS1A"} ; [ DW_TAG_auto_variable ] [a] [line 8]
+!25 = !MDLocation(line: 8, scope: !19)
+!26 = !MDLocation(line: 9, scope: !19)
diff --git a/test/Linker/type-unique-odr-b.ll b/test/Linker/type-unique-odr-b.ll
index e5f094e..b262191 100644
--- a/test/Linker/type-unique-odr-b.ll
+++ b/test/Linker/type-unique-odr-b.ll
@@ -26,7 +26,7 @@ define void @_ZN1A6getFooEv(%class.A* %this) #0 align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !26
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !24, metadata !{!"0x102"}), !dbg !26
   %this1 = load %class.A** %this.addr
   ret void, !dbg !27
 }
@@ -54,33 +54,33 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!21, !22}
 !llvm.ident = !{!23}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<unknown>", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2\00A\002\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 32, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !"type-unique-odr-b.cpp", metadata !""}
-!6 = metadata !{metadata !7, metadata !9}
-!7 = metadata !{metadata !"0xd\00data\003\0032\0032\000\001", metadata !5, metadata !"_ZTS1A", metadata !8} ; [ DW_TAG_member ] [data] [line 3, size 32, align 32, offset 0] [private] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\005\000\000\000\006\00258\000\005", metadata !5, metadata !"_ZTS1A", metadata !10, null, null, null, i32 0, metadata !13} ; [ DW_TAG_subprogram ] [line 5] [protected] [getFoo]
-!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{null, metadata !12}
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!13 = metadata !{i32 786468}
-!14 = metadata !{metadata !15, metadata !16, metadata !20}
-!15 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\008\000\001\000\006\00256\000\008", metadata !5, metadata !"_ZTS1A", metadata !10, null, void (%class.A*)* @_ZN1A6getFooEv, null, metadata !9, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [def] [getFoo]
-!16 = metadata !{metadata !"0x2e\00f\00f\00_Z1fv\0011\000\001\000\006\00256\000\0011", metadata !5, metadata !17, metadata !18, null, void ()* @_Z1fv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [f]
-!17 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [type-unique-odr-b.cpp]
-!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!19 = metadata !{null}
-!20 = metadata !{metadata !"0x2e\00bar\00bar\00_ZL3barv\0010\001\001\000\006\00256\000\0010", metadata !5, metadata !17, metadata !18, null, void ()* @_ZL3barv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [local] [def] [bar]
-!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!23 = metadata !{metadata !"clang version 3.5.0 "}
-!24 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !15, null, metadata !25} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
-!26 = metadata !{i32 0, i32 0, metadata !15, null}
-!27 = metadata !{i32 8, i32 0, metadata !15, null}
-!28 = metadata !{i32 11, i32 0, metadata !16, null}
-!29 = metadata !{i32 10, i32 0, metadata !20, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !3, !14, !2, !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
+!1 = !{!"<unknown>", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2\00A\002\0032\0032\000\000\000", !5, null, null, !6, null, null, !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!"type-unique-odr-b.cpp", !""}
+!6 = !{!7, !9}
+!7 = !{!"0xd\00data\003\0032\0032\000\001", !5, !"_ZTS1A", !8} ; [ DW_TAG_member ] [data] [line 3, size 32, align 32, offset 0] [private] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\005\000\000\000\006\00258\000\005", !5, !"_ZTS1A", !10, null, null, null, i32 0, !13} ; [ DW_TAG_subprogram ] [line 5] [protected] [getFoo]
+!10 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{null, !12}
+!12 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!13 = !{i32 786468}
+!14 = !{!15, !16, !20}
+!15 = !{!"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\008\000\001\000\006\00256\000\008", !5, !"_ZTS1A", !10, null, void (%class.A*)* @_ZN1A6getFooEv, null, !9, !2} ; [ DW_TAG_subprogram ] [line 8] [def] [getFoo]
+!16 = !{!"0x2e\00f\00f\00_Z1fv\0011\000\001\000\006\00256\000\0011", !5, !17, !18, null, void ()* @_Z1fv, null, null, !2} ; [ DW_TAG_subprogram ] [line 11] [def] [f]
+!17 = !{!"0x29", !5}         ; [ DW_TAG_file_type ] [type-unique-odr-b.cpp]
+!18 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = !{null}
+!20 = !{!"0x2e\00bar\00bar\00_ZL3barv\0010\001\001\000\006\00256\000\0010", !5, !17, !18, null, void ()* @_ZL3barv, null, null, !2} ; [ DW_TAG_subprogram ] [line 10] [local] [def] [bar]
+!21 = !{i32 2, !"Dwarf Version", i32 4}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
+!23 = !{!"clang version 3.5.0 "}
+!24 = !{!"0x101\00this\0016777216\001088", !15, null, !25} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!25 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!26 = !MDLocation(line: 0, scope: !15)
+!27 = !MDLocation(line: 8, scope: !15)
+!28 = !MDLocation(line: 11, scope: !16)
+!29 = !MDLocation(line: 10, scope: !20)
diff --git a/test/Linker/type-unique-opaque.ll b/test/Linker/type-unique-opaque.ll
new file mode 100644
index 0000000..b4f6966
--- /dev/null
+++ b/test/Linker/type-unique-opaque.ll
@@ -0,0 +1,16 @@
+; RUN: llvm-link -S %s %p/Inputs/type-unique-opaque.ll | FileCheck %s
+
+; Test that a failed attempt at merging %u2 and %t2 (for the other file) will
+; not cause %u and %t to get merged.
+
+; CHECK: %u = type opaque
+; CHECK: define %u* @g() {
+
+%u = type opaque
+%u2 = type { %u*, i8 }
+
+declare %u2* @f()
+
+define %u* @g() {
+  ret %u* null
+}
diff --git a/test/Linker/type-unique-simple-a.ll b/test/Linker/type-unique-simple-a.ll
index c01cd5c..ef29cd9 100644
--- a/test/Linker/type-unique-simple-a.ll
+++ b/test/Linker/type-unique-simple-a.ll
@@ -54,8 +54,8 @@ entry:
   %a.addr = alloca i32, align 4
   %t = alloca %struct.Base, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
-  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !17, metadata !{metadata !"0x102"}), !dbg !18
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !15, metadata !{!"0x102"}), !dbg !16
+  call void @llvm.dbg.declare(metadata %struct.Base* %t, metadata !17, metadata !{!"0x102"}), !dbg !18
   ret void, !dbg !19
 }
 
@@ -68,24 +68,24 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!14, !20}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git c23b1db6268c8e7ce64026d57d1510c1aac200a0) (http://llvm.org/git/llvm.git 09b98fe3978eddefc2145adc1056cf21580ce945)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !9, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/simple/foo.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"foo.cpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00Base\001\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 32, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !5, metadata !"_ZTS4Base", metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x2e\00f\00f\00_Z1fi\003\000\001\000\006\00256\000\003", metadata !1, metadata !11, metadata !12, null, void (i32)* @_Z1fi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/simple/foo.cpp]
-!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!13 = metadata !{null, metadata !8}
-!14 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!15 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !10, metadata !11, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 3]
-!16 = metadata !{i32 3, i32 0, metadata !10, null}
-!17 = metadata !{metadata !"0x100\00t\004\000", metadata !10, metadata !11, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 4]
-!18 = metadata !{i32 4, i32 0, metadata !10, null}
-!19 = metadata !{i32 5, i32 0, metadata !10, null}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git c23b1db6268c8e7ce64026d57d1510c1aac200a0) (http://llvm.org/git/llvm.git 09b98fe3978eddefc2145adc1056cf21580ce945)\000\00\000\00\000", !1, !2, !3, !9, !2, !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/simple/foo.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"foo.cpp", !"/Users/mren/c_testing/type_unique_air/simple"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00Base\001\0032\0032\000\000\000", !5, null, null, !6, null, null, !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!"./a.hpp", !"/Users/mren/c_testing/type_unique_air/simple"}
+!6 = !{!7}
+!7 = !{!"0xd\00a\002\0032\0032\000\000", !5, !"_ZTS4Base", !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10}
+!10 = !{!"0x2e\00f\00f\00_Z1fi\003\000\001\000\006\00256\000\003", !1, !11, !12, null, void (i32)* @_Z1fi, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!11 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/simple/foo.cpp]
+!12 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = !{null, !8}
+!14 = !{i32 2, !"Dwarf Version", i32 2}
+!15 = !{!"0x101\00a\0016777219\000", !10, !11, !8} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!16 = !MDLocation(line: 3, scope: !10)
+!17 = !{!"0x100\00t\004\000", !10, !11, !4} ; [ DW_TAG_auto_variable ] [t] [line 4]
+!18 = !MDLocation(line: 4, scope: !10)
+!19 = !MDLocation(line: 5, scope: !10)
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/type-unique-simple-b.ll b/test/Linker/type-unique-simple-b.ll
index fabdb03..fb1a529 100644
--- a/test/Linker/type-unique-simple-b.ll
+++ b/test/Linker/type-unique-simple-b.ll
@@ -10,8 +10,8 @@ entry:
   %a.addr = alloca i32, align 4
   %t = alloca %struct.Base, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !18, metadata !{metadata !"0x102"}), !dbg !19
-  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !20, metadata !{metadata !"0x102"}), !dbg !21
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !18, metadata !{!"0x102"}), !dbg !19
+  call void @llvm.dbg.declare(metadata %struct.Base* %t, metadata !20, metadata !{!"0x102"}), !dbg !21
   ret void, !dbg !22
 }
 
@@ -38,30 +38,30 @@ attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!17, !26}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git c23b1db6268c8e7ce64026d57d1510c1aac200a0) (http://llvm.org/git/llvm.git 09b98fe3978eddefc2145adc1056cf21580ce945)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !9, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/simple/bar.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"bar.cpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00Base\001\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 32, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !5, metadata !"_ZTS4Base", metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10, metadata !14}
-!10 = metadata !{metadata !"0x2e\00g\00g\00_Z1gi\004\000\001\000\006\00256\000\004", metadata !1, metadata !11, metadata !12, null, void (i32)* @_Z1gi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
-!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/simple/bar.cpp]
-!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!13 = metadata !{null, metadata !8}
-!14 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !1, metadata !11, metadata !15, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{metadata !8}
-!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!18 = metadata !{metadata !"0x101\00a\0016777220\000", metadata !10, metadata !11, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
-!19 = metadata !{i32 4, i32 0, metadata !10, null}
-!20 = metadata !{metadata !"0x100\00t\005\000", metadata !10, metadata !11, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 5]
-!21 = metadata !{i32 5, i32 0, metadata !10, null}
-!22 = metadata !{i32 6, i32 0, metadata !10, null}
-!23 = metadata !{i32 8, i32 0, metadata !14, null}
-!24 = metadata !{i32 9, i32 0, metadata !14, null}
-!25 = metadata !{i32 10, i32 0, metadata !14, null}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git c23b1db6268c8e7ce64026d57d1510c1aac200a0) (http://llvm.org/git/llvm.git 09b98fe3978eddefc2145adc1056cf21580ce945)\000\00\000\00\000", !1, !2, !3, !9, !2, !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/simple/bar.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"bar.cpp", !"/Users/mren/c_testing/type_unique_air/simple"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00Base\001\0032\0032\000\000\000", !5, null, null, !6, null, null, !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = !{!"./a.hpp", !"/Users/mren/c_testing/type_unique_air/simple"}
+!6 = !{!7}
+!7 = !{!"0xd\00a\002\0032\0032\000\000", !5, !"_ZTS4Base", !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10, !14}
+!10 = !{!"0x2e\00g\00g\00_Z1gi\004\000\001\000\006\00256\000\004", !1, !11, !12, null, void (i32)* @_Z1gi, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
+!11 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/simple/bar.cpp]
+!12 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = !{null, !8}
+!14 = !{!"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", !1, !11, !15, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{!8}
+!17 = !{i32 2, !"Dwarf Version", i32 2}
+!18 = !{!"0x101\00a\0016777220\000", !10, !11, !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!19 = !MDLocation(line: 4, scope: !10)
+!20 = !{!"0x100\00t\005\000", !10, !11, !4} ; [ DW_TAG_auto_variable ] [t] [line 5]
+!21 = !MDLocation(line: 5, scope: !10)
+!22 = !MDLocation(line: 6, scope: !10)
+!23 = !MDLocation(line: 8, scope: !14)
+!24 = !MDLocation(line: 9, scope: !14)
+!25 = !MDLocation(line: 10, scope: !14)
+!26 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Linker/type-unique-simple2-a.ll b/test/Linker/type-unique-simple2-a.ll
index 691c5c5..72a776b 100644
--- a/test/Linker/type-unique-simple2-a.ll
+++ b/test/Linker/type-unique-simple2-a.ll
@@ -48,7 +48,7 @@ define linkonce_odr void @_ZN1AC1Ev(%class.A* %this) unnamed_addr #2 align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !39, metadata !{metadata !"0x102"}), !dbg !41
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !39, metadata !{!"0x102"}), !dbg !41
   %this1 = load %class.A** %this.addr
   call void @_ZN1AC2Ev(%class.A* %this1) #1, !dbg !42
   ret void, !dbg !42
@@ -64,7 +64,7 @@ define linkonce_odr void @_ZN1AC2Ev(%class.A* %this) unnamed_addr #2 align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !44, metadata !{metadata !"0x102"}), !dbg !45
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !44, metadata !{!"0x102"}), !dbg !45
   %this1 = load %class.A** %this.addr
   %0 = bitcast %class.A* %this1 to i8***, !dbg !46
   store i8** getelementptr inbounds ([4 x i8*]* @_ZTV1A, i64 0, i64 2), i8*** %0, !dbg !46
@@ -80,50 +80,50 @@ attributes #4 = { nounwind readnone }
 !llvm.module.flags = !{!35, !36}
 !llvm.ident = !{!37}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !26, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/<unknown>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<unknown>", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2\00A\002\0064\0064\000\000\000", metadata !5, null, null, metadata !6, metadata !"_ZTS1A", null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 64, align 64, offset 0] [def] [from ]
-!5 = metadata !{metadata !"./ab.h", metadata !""}
-!6 = metadata !{metadata !7, metadata !14, metadata !19}
-!7 = metadata !{metadata !"0xd\00_vptr$A\000\0064\000\000\0064", metadata !5, metadata !8, metadata !9} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
-!8 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/./ab.h]
-!9 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
-!10 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{metadata !"0x2e\00setFoo\00setFoo\00_ZN1A6setFooEv\004\000\000\001\006\00259\000\004", metadata !5, metadata !"_ZTS1A", metadata !15, metadata !"_ZTS1A", null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 4] [setFoo]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null, metadata !17}
-!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!18 = metadata !{i32 786468}
-!19 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\005\000\000\001\006\00259\000\005", metadata !5, metadata !"_ZTS1A", metadata !20, metadata !"_ZTS1A", null, null, i32 0, metadata !25} ; [ DW_TAG_subprogram ] [line 5] [getFoo]
-!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!21 = metadata !{metadata !22, metadata !17}
-!22 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !23} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo_t]
-!23 = metadata !{metadata !"0x16\00foo_t\001\000\000\000\000", metadata !24, null, metadata !13} ; [ DW_TAG_typedef ] [foo_t] [line 1, size 0, align 0, offset 0] [from int]
-!24 = metadata !{metadata !"a.cpp", metadata !""}
-!25 = metadata !{i32 786468}
-!26 = metadata !{metadata !27, metadata !31, metadata !34}
-!27 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barv\002\000\001\000\006\00256\000\002", metadata !24, metadata !28, metadata !29, null, i32 ()* @_Z3barv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
-!28 = metadata !{metadata !"0x29", metadata !24}        ; [ DW_TAG_file_type ] [/a.cpp]
-!29 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !30, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!30 = metadata !{metadata !23}
-!31 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC1Ev\002\000\001\000\006\00320\000\002", metadata !5, metadata !"_ZTS1A", metadata !15, null, void (%class.A*)* @_ZN1AC1Ev, null, metadata !32, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [A]
-!32 = metadata !{metadata !"0x2e\00A\00A\00\000\000\000\000\006\00320\000\000", null, metadata !"_ZTS1A", metadata !15, null, null, null, i32 0, metadata !33} ; [ DW_TAG_subprogram ] [line 0] [A]
-!33 = metadata !{i32 786468}
-!34 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC2Ev\002\000\001\000\006\00320\000\002", metadata !5, metadata !"_ZTS1A", metadata !15, null, void (%class.A*)* @_ZN1AC2Ev, null, metadata !32, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [A]
-!35 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!36 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!37 = metadata !{metadata !"clang version 3.5 "}
-!38 = metadata !{i32 3, i32 0, metadata !27, null}
-!39 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !31, null, metadata !40} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!40 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
-!41 = metadata !{i32 0, i32 0, metadata !31, null}
-!42 = metadata !{i32 2, i32 0, metadata !43, null}
-!43 = metadata !{metadata !"0xb\000", metadata !5, metadata !31} ; [ DW_TAG_lexical_block ] [/./ab.h]
-!44 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !34, null, metadata !40} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!45 = metadata !{i32 0, i32 0, metadata !34, null}
-!46 = metadata !{i32 2, i32 0, metadata !34, null}
+!0 = !{!"0x11\004\00clang version 3.5 \000\00\000\00\000", !1, !2, !3, !26, !2, !2} ; [ DW_TAG_compile_unit ] [/<unknown>] [DW_LANG_C_plus_plus]
+!1 = !{!"<unknown>", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2\00A\002\0064\0064\000\000\000", !5, null, null, !6, !"_ZTS1A", null, !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 64, align 64, offset 0] [def] [from ]
+!5 = !{!"./ab.h", !""}
+!6 = !{!7, !14, !19}
+!7 = !{!"0xd\00_vptr$A\000\0064\000\000\0064", !5, !8, !9} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
+!8 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [/./ab.h]
+!9 = !{!"0xf\00\000\0064\000\000\000", null, null, !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
+!10 = !{!"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, !11} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!13}
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = !{!"0x2e\00setFoo\00setFoo\00_ZN1A6setFooEv\004\000\000\001\006\00259\000\004", !5, !"_ZTS1A", !15, !"_ZTS1A", null, null, i32 0, !18} ; [ DW_TAG_subprogram ] [line 4] [setFoo]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !17}
+!17 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!18 = !{i32 786468}
+!19 = !{!"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\005\000\000\001\006\00259\000\005", !5, !"_ZTS1A", !20, !"_ZTS1A", null, null, i32 0, !25} ; [ DW_TAG_subprogram ] [line 5] [getFoo]
+!20 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = !{!22, !17}
+!22 = !{!"0x26\00\000\000\000\000\000", null, null, !23} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo_t]
+!23 = !{!"0x16\00foo_t\001\000\000\000\000", !24, null, !13} ; [ DW_TAG_typedef ] [foo_t] [line 1, size 0, align 0, offset 0] [from int]
+!24 = !{!"a.cpp", !""}
+!25 = !{i32 786468}
+!26 = !{!27, !31, !34}
+!27 = !{!"0x2e\00bar\00bar\00_Z3barv\002\000\001\000\006\00256\000\002", !24, !28, !29, null, i32 ()* @_Z3barv, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
+!28 = !{!"0x29", !24}        ; [ DW_TAG_file_type ] [/a.cpp]
+!29 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !30, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = !{!23}
+!31 = !{!"0x2e\00A\00A\00_ZN1AC1Ev\002\000\001\000\006\00320\000\002", !5, !"_ZTS1A", !15, null, void (%class.A*)* @_ZN1AC1Ev, null, !32, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [A]
+!32 = !{!"0x2e\00A\00A\00\000\000\000\000\006\00320\000\000", null, !"_ZTS1A", !15, null, null, null, i32 0, !33} ; [ DW_TAG_subprogram ] [line 0] [A]
+!33 = !{i32 786468}
+!34 = !{!"0x2e\00A\00A\00_ZN1AC2Ev\002\000\001\000\006\00320\000\002", !5, !"_ZTS1A", !15, null, void (%class.A*)* @_ZN1AC2Ev, null, !32, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [A]
+!35 = !{i32 2, !"Dwarf Version", i32 2}
+!36 = !{i32 1, !"Debug Info Version", i32 2}
+!37 = !{!"clang version 3.5 "}
+!38 = !MDLocation(line: 3, scope: !27)
+!39 = !{!"0x101\00this\0016777216\001088", !31, null, !40} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!40 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!41 = !MDLocation(line: 0, scope: !31)
+!42 = !MDLocation(line: 2, scope: !43)
+!43 = !{!"0xb\000", !5, !31} ; [ DW_TAG_lexical_block ] [/./ab.h]
+!44 = !{!"0x101\00this\0016777216\001088", !34, null, !40} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!45 = !MDLocation(line: 0, scope: !34)
+!46 = !MDLocation(line: 2, scope: !34)
diff --git a/test/Linker/type-unique-simple2-b.ll b/test/Linker/type-unique-simple2-b.ll
index f851316..25e67d4 100644
--- a/test/Linker/type-unique-simple2-b.ll
+++ b/test/Linker/type-unique-simple2-b.ll
@@ -22,7 +22,7 @@ define void @_ZN1A6setFooEv(%class.A* %this) unnamed_addr #0 align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !32, metadata !{metadata !"0x102"}), !dbg !34
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !32, metadata !{!"0x102"}), !dbg !34
   %this1 = load %class.A** %this.addr
   ret void, !dbg !35
 }
@@ -35,7 +35,7 @@ define i32 @_ZN1A6getFooEv(%class.A* %this) unnamed_addr #0 align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !36, metadata !{metadata !"0x102"}), !dbg !37
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !36, metadata !{!"0x102"}), !dbg !37
   %this1 = load %class.A** %this.addr
   ret i32 1, !dbg !38
 }
@@ -47,42 +47,42 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!29, !30}
 !llvm.ident = !{!31}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !25, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/<unknown>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<unknown>", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2\00A\002\0064\0064\000\000\000", metadata !5, null, null, metadata !6, metadata !"_ZTS1A", null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 64, align 64, offset 0] [def] [from ]
-!5 = metadata !{metadata !"./ab.h", metadata !""}
-!6 = metadata !{metadata !7, metadata !14, metadata !19}
-!7 = metadata !{metadata !"0xd\00_vptr$A\000\0064\000\000\0064", metadata !5, metadata !8, metadata !9} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
-!8 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/./ab.h]
-!9 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
-!10 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{metadata !"0x2e\00setFoo\00setFoo\00_ZN1A6setFooEv\004\000\000\001\006\00259\000\004", metadata !5, metadata !"_ZTS1A", metadata !15, metadata !"_ZTS1A", null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 4] [setFoo]
-!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!16 = metadata !{null, metadata !17}
-!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!18 = metadata !{i32 786468}
-!19 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\005\000\000\001\006\00259\000\005", metadata !5, metadata !"_ZTS1A", metadata !20, metadata !"_ZTS1A", null, null, i32 0, metadata !24} ; [ DW_TAG_subprogram ] [line 5] [getFoo]
-!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!21 = metadata !{metadata !22, metadata !17}
-!22 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !23} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo_t]
-!23 = metadata !{metadata !"0x16\00foo_t\001\000\000\000\000", metadata !5, null, metadata !13} ; [ DW_TAG_typedef ] [foo_t] [line 1, size 0, align 0, offset 0] [from int]
-!24 = metadata !{i32 786468}
-!25 = metadata !{metadata !26, metadata !28}
-!26 = metadata !{metadata !"0x2e\00setFoo\00setFoo\00_ZN1A6setFooEv\002\000\001\000\006\00259\000\002", metadata !27, metadata !"_ZTS1A", metadata !15, null, void (%class.A*)* @_ZN1A6setFooEv, null, metadata !14, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [setFoo]
-!27 = metadata !{metadata !"b.cpp", metadata !""}
-!28 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\004\000\001\000\006\00259\000\004", metadata !27, metadata !"_ZTS1A", metadata !20, null, i32 (%class.A*)* @_ZN1A6getFooEv, null, metadata !19, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [getFoo]
-!29 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!31 = metadata !{metadata !"clang version 3.5 "}
-!32 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !26, null, metadata !33} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
-!34 = metadata !{i32 0, i32 0, metadata !26, null}
-!35 = metadata !{i32 2, i32 0, metadata !26, null}
-!36 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !28, null, metadata !33} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!37 = metadata !{i32 0, i32 0, metadata !28, null}
-!38 = metadata !{i32 4, i32 0, metadata !28, null}
+!0 = !{!"0x11\004\00clang version 3.5 \000\00\000\00\000", !1, !2, !3, !25, !2, !2} ; [ DW_TAG_compile_unit ] [/<unknown>] [DW_LANG_C_plus_plus]
+!1 = !{!"<unknown>", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2\00A\002\0064\0064\000\000\000", !5, null, null, !6, !"_ZTS1A", null, !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 64, align 64, offset 0] [def] [from ]
+!5 = !{!"./ab.h", !""}
+!6 = !{!7, !14, !19}
+!7 = !{!"0xd\00_vptr$A\000\0064\000\000\0064", !5, !8, !9} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
+!8 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [/./ab.h]
+!9 = !{!"0xf\00\000\0064\000\000\000", null, null, !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
+!10 = !{!"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, !11} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
+!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!13}
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = !{!"0x2e\00setFoo\00setFoo\00_ZN1A6setFooEv\004\000\000\001\006\00259\000\004", !5, !"_ZTS1A", !15, !"_ZTS1A", null, null, i32 0, !18} ; [ DW_TAG_subprogram ] [line 4] [setFoo]
+!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = !{null, !17}
+!17 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!18 = !{i32 786468}
+!19 = !{!"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\005\000\000\001\006\00259\000\005", !5, !"_ZTS1A", !20, !"_ZTS1A", null, null, i32 0, !24} ; [ DW_TAG_subprogram ] [line 5] [getFoo]
+!20 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = !{!22, !17}
+!22 = !{!"0x26\00\000\000\000\000\000", null, null, !23} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo_t]
+!23 = !{!"0x16\00foo_t\001\000\000\000\000", !5, null, !13} ; [ DW_TAG_typedef ] [foo_t] [line 1, size 0, align 0, offset 0] [from int]
+!24 = !{i32 786468}
+!25 = !{!26, !28}
+!26 = !{!"0x2e\00setFoo\00setFoo\00_ZN1A6setFooEv\002\000\001\000\006\00259\000\002", !27, !"_ZTS1A", !15, null, void (%class.A*)* @_ZN1A6setFooEv, null, !14, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [setFoo]
+!27 = !{!"b.cpp", !""}
+!28 = !{!"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\004\000\001\000\006\00259\000\004", !27, !"_ZTS1A", !20, null, i32 (%class.A*)* @_ZN1A6getFooEv, null, !19, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [getFoo]
+!29 = !{i32 2, !"Dwarf Version", i32 2}
+!30 = !{i32 1, !"Debug Info Version", i32 2}
+!31 = !{!"clang version 3.5 "}
+!32 = !{!"0x101\00this\0016777216\001088", !26, null, !33} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!33 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!34 = !MDLocation(line: 0, scope: !26)
+!35 = !MDLocation(line: 2, scope: !26)
+!36 = !{!"0x101\00this\0016777216\001088", !28, null, !33} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!37 = !MDLocation(line: 0, scope: !28)
+!38 = !MDLocation(line: 4, scope: !28)
diff --git a/test/Linker/type-unique-src-type.ll b/test/Linker/type-unique-src-type.ll
new file mode 100644
index 0000000..01b33a2
--- /dev/null
+++ b/test/Linker/type-unique-src-type.ll
@@ -0,0 +1,24 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-link -S %t.bc -o - | FileCheck %s
+; RUN: llvm-link -S %s -o - | FileCheck %s
+
+; Test that we don't try to map %C.0 and C and then try to map %C to a new type.
+; This used to happen when lazy loading since we wouldn't then identify %C
+; as a destination type until it was too late.
+
+; CHECK: %C.0 = type { %B }
+; CHECK-NEXT: %B = type { %A }
+; CHECK-NEXT: %A = type { i8 }
+
+; CHECK: @g1 = external global %C.0
+; CHECK:  getelementptr %C.0* null, i64 0, i32 0, i32 0
+
+%A   = type { i8 }
+%B   = type { %A }
+%C   = type { %B }
+%C.0 = type { %B }
+define void @f1() {
+  getelementptr %C* null, i64 0, i32 0, i32 0
+  ret void
+}
+@g1 = external global %C.0
diff --git a/test/Linker/type-unique-type-array-a.ll b/test/Linker/type-unique-type-array-a.ll
index 1b908c6..9e2de88 100644
--- a/test/Linker/type-unique-type-array-a.ll
+++ b/test/Linker/type-unique-type-array-a.ll
@@ -51,8 +51,8 @@ entry:
   %coerce.dive = getelementptr %struct.SA* %sa, i32 0, i32 0
   store i32 %sa.coerce, i32* %coerce.dive
   store %class.A* %a, %class.A** %a.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %a.addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
-  call void @llvm.dbg.declare(metadata !{%struct.SA* %sa}, metadata !26, metadata !{metadata !"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata %class.A** %a.addr, metadata !24, metadata !{!"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata %struct.SA* %sa, metadata !26, metadata !{!"0x102"}), !dbg !27
   %0 = load %class.A** %a.addr, align 8, !dbg !28
   %1 = bitcast %struct.SA* %agg.tmp to i8*, !dbg !28
   %2 = bitcast %struct.SA* %sa to i8*, !dbg !28
@@ -74,8 +74,8 @@ entry:
   %coerce.dive = getelementptr %struct.SA* %a, i32 0, i32 0
   store i32 %a.coerce, i32* %coerce.dive
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
-  call void @llvm.dbg.declare(metadata !{%struct.SA* %a}, metadata !32, metadata !{metadata !"0x102"}), !dbg !33
+  call void @llvm.dbg.declare(metadata %class.A** %this.addr, metadata !30, metadata !{!"0x102"}), !dbg !31
+  call void @llvm.dbg.declare(metadata %struct.SA* %a, metadata !32, metadata !{!"0x102"}), !dbg !33
   %this1 = load %class.A** %this.addr
   ret void, !dbg !34
 }
@@ -92,38 +92,38 @@ attributes #3 = { nounwind }
 !llvm.module.flags = !{!21, !22}
 !llvm.ident = !{!23}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [a.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"a.cpp", metadata !"/Users/manmanren/test-Nov/type_unique/rdar_di_array"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !10}
-!4 = metadata !{metadata !"0x2\00A\005\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 5, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x2e\00testA\00testA\00_ZN1A5testAE2SA\007\000\000\000\006\00256\000\007", metadata !1, metadata !"_ZTS1A", metadata !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [testA]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9, metadata !"_ZTS2SA"}
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!10 = metadata !{metadata !"0x13\00SA\001\0032\0032\000\000\000", metadata !1, null, null, metadata !11, null, null, metadata !"_ZTS2SA"} ; [ DW_TAG_structure_type ] [SA] [line 1, size 32, align 32, offset 0] [def] [from ]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !1, metadata !"_ZTS2SA", metadata !13} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{metadata !15, metadata !20}
-!15 = metadata !{metadata !"0x2e\00topA\00topA\00_Z4topAP1A2SA\0011\000\001\000\006\00256\000\0011", metadata !1, metadata !16, metadata !17, null, void (%class.A*, i32)* @_Z4topAP1A2SA, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [topA]
-!16 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [a.cpp]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{null, metadata !19, metadata !"_ZTS2SA"}
-!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
-!20 = metadata !{metadata !"0x2e\00testA\00testA\00_ZN1A5testAE2SA\007\000\001\000\006\00256\000\007", metadata !1, metadata !"_ZTS1A", metadata !7, null, void (%class.A*, i32)* @_ZN1A5testAE2SA, null, metadata !6, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [testA]
-!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!22 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!23 = metadata !{metadata !"clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)"}
-!24 = metadata !{metadata !"0x101\00a\0016777227\000", metadata !15, metadata !16, metadata !19} ; [ DW_TAG_arg_variable ] [a] [line 11]
-!25 = metadata !{i32 11, i32 14, metadata !15, null}
-!26 = metadata !{metadata !"0x101\00sa\0033554443\000", metadata !15, metadata !16, metadata !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [sa] [line 11]
-!27 = metadata !{i32 11, i32 20, metadata !15, null}
-!28 = metadata !{i32 12, i32 3, metadata !15, null}
-!29 = metadata !{i32 13, i32 1, metadata !15, null}
-!30 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !20, null, metadata !19} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!31 = metadata !{i32 0, i32 0, metadata !20, null}
-!32 = metadata !{metadata !"0x101\00a\0033554439\000", metadata !20, metadata !16, metadata !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [a] [line 7]
-!33 = metadata !{i32 7, i32 17, metadata !20, null}
-!34 = metadata !{i32 8, i32 3, metadata !20, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)\000\00\000\00\001", !1, !2, !3, !14, !2, !2} ; [ DW_TAG_compile_unit ] [a.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"a.cpp", !"/Users/manmanren/test-Nov/type_unique/rdar_di_array"}
+!2 = !{}
+!3 = !{!4, !10}
+!4 = !{!"0x2\00A\005\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 5, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0x2e\00testA\00testA\00_ZN1A5testAE2SA\007\000\000\000\006\00256\000\007", !1, !"_ZTS1A", !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [testA]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9, !"_ZTS2SA"}
+!9 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!10 = !{!"0x13\00SA\001\0032\0032\000\000\000", !1, null, null, !11, null, null, !"_ZTS2SA"} ; [ DW_TAG_structure_type ] [SA] [line 1, size 32, align 32, offset 0] [def] [from ]
+!11 = !{!12}
+!12 = !{!"0xd\00a\002\0032\0032\000\000", !1, !"_ZTS2SA", !13} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = !{!15, !20}
+!15 = !{!"0x2e\00topA\00topA\00_Z4topAP1A2SA\0011\000\001\000\006\00256\000\0011", !1, !16, !17, null, void (%class.A*, i32)* @_Z4topAP1A2SA, null, null, !2} ; [ DW_TAG_subprogram ] [line 11] [def] [topA]
+!16 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [a.cpp]
+!17 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{null, !19, !"_ZTS2SA"}
+!19 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!20 = !{!"0x2e\00testA\00testA\00_ZN1A5testAE2SA\007\000\001\000\006\00256\000\007", !1, !"_ZTS1A", !7, null, void (%class.A*, i32)* @_ZN1A5testAE2SA, null, !6, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [testA]
+!21 = !{i32 2, !"Dwarf Version", i32 2}
+!22 = !{i32 2, !"Debug Info Version", i32 2}
+!23 = !{!"clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)"}
+!24 = !{!"0x101\00a\0016777227\000", !15, !16, !19} ; [ DW_TAG_arg_variable ] [a] [line 11]
+!25 = !MDLocation(line: 11, column: 14, scope: !15)
+!26 = !{!"0x101\00sa\0033554443\000", !15, !16, !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [sa] [line 11]
+!27 = !MDLocation(line: 11, column: 20, scope: !15)
+!28 = !MDLocation(line: 12, column: 3, scope: !15)
+!29 = !MDLocation(line: 13, column: 1, scope: !15)
+!30 = !{!"0x101\00this\0016777216\001088", !20, null, !19} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!31 = !MDLocation(line: 0, scope: !20)
+!32 = !{!"0x101\00a\0033554439\000", !20, !16, !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [a] [line 7]
+!33 = !MDLocation(line: 7, column: 17, scope: !20)
+!34 = !MDLocation(line: 8, column: 3, scope: !20)
diff --git a/test/Linker/type-unique-type-array-b.ll b/test/Linker/type-unique-type-array-b.ll
index 85ee5a5..0fdae2a 100644
--- a/test/Linker/type-unique-type-array-b.ll
+++ b/test/Linker/type-unique-type-array-b.ll
@@ -30,8 +30,8 @@ entry:
   %coerce.dive = getelementptr %struct.SA* %sa, i32 0, i32 0
   store i32 %sa.coerce, i32* %coerce.dive
   store %class.B* %b, %class.B** %b.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.B** %b.addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
-  call void @llvm.dbg.declare(metadata !{%struct.SA* %sa}, metadata !26, metadata !{metadata !"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata %class.B** %b.addr, metadata !24, metadata !{!"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata %struct.SA* %sa, metadata !26, metadata !{!"0x102"}), !dbg !27
   %0 = load %class.B** %b.addr, align 8, !dbg !28
   %1 = bitcast %struct.SA* %agg.tmp to i8*, !dbg !28
   %2 = bitcast %struct.SA* %sa to i8*, !dbg !28
@@ -53,8 +53,8 @@ entry:
   %coerce.dive = getelementptr %struct.SA* %sa, i32 0, i32 0
   store i32 %sa.coerce, i32* %coerce.dive
   store %class.B* %this, %class.B** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.B** %this.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
-  call void @llvm.dbg.declare(metadata !{%struct.SA* %sa}, metadata !32, metadata !{metadata !"0x102"}), !dbg !33
+  call void @llvm.dbg.declare(metadata %class.B** %this.addr, metadata !30, metadata !{!"0x102"}), !dbg !31
+  call void @llvm.dbg.declare(metadata %struct.SA* %sa, metadata !32, metadata !{!"0x102"}), !dbg !33
   %this1 = load %class.B** %this.addr
   ret void, !dbg !34
 }
@@ -71,38 +71,38 @@ attributes #3 = { nounwind }
 !llvm.module.flags = !{!21, !22}
 !llvm.ident = !{!23}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [b.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"b.cpp", metadata !"/Users/manmanren/test-Nov/type_unique/rdar_di_array"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !10}
-!4 = metadata !{metadata !"0x2\00B\005\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 5, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0x2e\00testB\00testB\00_ZN1B5testBE2SA\007\000\000\000\006\00256\000\007", metadata !1, metadata !"_ZTS1B", metadata !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [testB]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9, metadata !"_ZTS2SA"}
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
-!10 = metadata !{metadata !"0x13\00SA\001\0032\0032\000\000\000", metadata !1, null, null, metadata !11, null, null, metadata !"_ZTS2SA"} ; [ DW_TAG_structure_type ] [SA] [line 1, size 32, align 32, offset 0] [def] [from ]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !1, metadata !"_ZTS2SA", metadata !13} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{metadata !15, metadata !20}
-!15 = metadata !{metadata !"0x2e\00topB\00topB\00_Z4topBP1B2SA\0011\000\001\000\006\00256\000\0011", metadata !1, metadata !16, metadata !17, null, void (%class.B*, i32)* @_Z4topBP1B2SA, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [topB]
-!16 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [b.cpp]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{null, metadata !19, metadata !"_ZTS2SA"}
-!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
-!20 = metadata !{metadata !"0x2e\00testB\00testB\00_ZN1B5testBE2SA\007\000\001\000\006\00256\000\007", metadata !1, metadata !"_ZTS1B", metadata !7, null, void (%class.B*, i32)* @_ZN1B5testBE2SA, null, metadata !6, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [testB]
-!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!22 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!23 = metadata !{metadata !"clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)"}
-!24 = metadata !{metadata !"0x101\00b\0016777227\000", metadata !15, metadata !16, metadata !19} ; [ DW_TAG_arg_variable ] [b] [line 11]
-!25 = metadata !{i32 11, i32 14, metadata !15, null}
-!26 = metadata !{metadata !"0x101\00sa\0033554443\000", metadata !15, metadata !16, metadata !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [sa] [line 11]
-!27 = metadata !{i32 11, i32 20, metadata !15, null}
-!28 = metadata !{i32 12, i32 3, metadata !15, null}
-!29 = metadata !{i32 13, i32 1, metadata !15, null}
-!30 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !20, null, metadata !19} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!31 = metadata !{i32 0, i32 0, metadata !20, null}
-!32 = metadata !{metadata !"0x101\00sa\0033554439\000", metadata !20, metadata !16, metadata !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [sa] [line 7]
-!33 = metadata !{i32 7, i32 17, metadata !20, null}
-!34 = metadata !{i32 8, i32 3, metadata !20, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)\000\00\000\00\001", !1, !2, !3, !14, !2, !2} ; [ DW_TAG_compile_unit ] [b.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"b.cpp", !"/Users/manmanren/test-Nov/type_unique/rdar_di_array"}
+!2 = !{}
+!3 = !{!4, !10}
+!4 = !{!"0x2\00B\005\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 5, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!6}
+!6 = !{!"0x2e\00testB\00testB\00_ZN1B5testBE2SA\007\000\000\000\006\00256\000\007", !1, !"_ZTS1B", !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [testB]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null, !9, !"_ZTS2SA"}
+!9 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
+!10 = !{!"0x13\00SA\001\0032\0032\000\000\000", !1, null, null, !11, null, null, !"_ZTS2SA"} ; [ DW_TAG_structure_type ] [SA] [line 1, size 32, align 32, offset 0] [def] [from ]
+!11 = !{!12}
+!12 = !{!"0xd\00a\002\0032\0032\000\000", !1, !"_ZTS2SA", !13} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = !{!15, !20}
+!15 = !{!"0x2e\00topB\00topB\00_Z4topBP1B2SA\0011\000\001\000\006\00256\000\0011", !1, !16, !17, null, void (%class.B*, i32)* @_Z4topBP1B2SA, null, null, !2} ; [ DW_TAG_subprogram ] [line 11] [def] [topB]
+!16 = !{!"0x29", !1}         ; [ DW_TAG_file_type ] [b.cpp]
+!17 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{null, !19, !"_ZTS2SA"}
+!19 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
+!20 = !{!"0x2e\00testB\00testB\00_ZN1B5testBE2SA\007\000\001\000\006\00256\000\007", !1, !"_ZTS1B", !7, null, void (%class.B*, i32)* @_ZN1B5testBE2SA, null, !6, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [testB]
+!21 = !{i32 2, !"Dwarf Version", i32 2}
+!22 = !{i32 2, !"Debug Info Version", i32 2}
+!23 = !{!"clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)"}
+!24 = !{!"0x101\00b\0016777227\000", !15, !16, !19} ; [ DW_TAG_arg_variable ] [b] [line 11]
+!25 = !MDLocation(line: 11, column: 14, scope: !15)
+!26 = !{!"0x101\00sa\0033554443\000", !15, !16, !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [sa] [line 11]
+!27 = !MDLocation(line: 11, column: 20, scope: !15)
+!28 = !MDLocation(line: 12, column: 3, scope: !15)
+!29 = !MDLocation(line: 13, column: 1, scope: !15)
+!30 = !{!"0x101\00this\0016777216\001088", !20, null, !19} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!31 = !MDLocation(line: 0, scope: !20)
+!32 = !{!"0x101\00sa\0033554439\000", !20, !16, !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [sa] [line 7]
+!33 = !MDLocation(line: 7, column: 17, scope: !20)
+!34 = !MDLocation(line: 8, column: 3, scope: !20)
diff --git a/test/Linker/type-unique-unrelated.ll b/test/Linker/type-unique-unrelated.ll
new file mode 100644
index 0000000..26d05bb
--- /dev/null
+++ b/test/Linker/type-unique-unrelated.ll
@@ -0,0 +1,31 @@
+; RUN: llvm-link -S %s %p/Inputs/type-unique-unrelated2.ll %p/Inputs/type-unique-unrelated3.ll | FileCheck %s
+
+; CHECK: %t = type { i8* }
+
+; CHECK: define %t @f2() {
+; CHECK-NEXT:   %x = call %t @f2()
+; CHECK-NEXT:   ret %t %x
+; CHECK-NEXT: }
+
+; CHECK: define %t @g2() {
+; CHECK-NEXT:   %x = call %t @g()
+; CHECK-NEXT:   ret %t %x
+; CHECK-NEXT: }
+
+; CHECK: define %t @g() {
+; CHECK-NEXT:  %x = call %t @f()
+; CHECK-NEXT:  ret %t %x
+; CHECK-NEXT: }
+
+; The idea of this test is that the %t in this file and the one in
+; type-unique-unrelated2.ll look unrelated until type-unique-unrelated3.ll
+; is merged in.
+
+%t = type { i8* }
+declare %t @f()
+
+define %t @f2() {
+ %x = call %t @f2()
+ ret %t %x
+}
+
diff --git a/test/Linker/unique-fwd-decl-a.ll b/test/Linker/unique-fwd-decl-a.ll
index b9c7b2f..c1a4b1d 100644
--- a/test/Linker/unique-fwd-decl-a.ll
+++ b/test/Linker/unique-fwd-decl-a.ll
@@ -5,5 +5,5 @@
 ; CHECK: !b = !{!0}
 
 !a = !{!0}
-!0 = metadata !{metadata !1}
-!1 = metadata !{}
+!0 = !{!1}
+!1 = !{}
diff --git a/test/Linker/unique-fwd-decl-order.ll b/test/Linker/unique-fwd-decl-order.ll
new file mode 100644
index 0000000..e1d8c2e
--- /dev/null
+++ b/test/Linker/unique-fwd-decl-order.ll
@@ -0,0 +1,20 @@
+; RUN: llvm-link %s %S/Inputs/unique-fwd-decl-order.ll -S -o - | FileCheck %s
+; RUN: llvm-link %S/Inputs/unique-fwd-decl-order.ll %s -S -o - | FileCheck %s
+
+; This test exercises MDNode hashing.  For the nodes to be correctly uniqued,
+; the hash of a to-be-created MDNode has to match the hash of an
+; operand-just-changed MDNode (with the same operands).
+;
+; Note that these two assembly files number the nodes identically, even though
+; the nodes are in a different order.  This is for the reader's convenience.
+
+; CHECK: !named = !{!0, !0}
+!named = !{!0}
+
+; CHECK: !0 = !{!1}
+!0 = !{!1}
+
+; CHECK: !1 = !{}
+!1 = !{}
+
+; CHECK-NOT: !2
diff --git a/test/Linker/visibility.ll b/test/Linker/visibility.ll
index 6436197..4938d7a 100644
--- a/test/Linker/visibility.ll
+++ b/test/Linker/visibility.ll
@@ -17,8 +17,8 @@ $c1 = comdat any
 ; CHECK-DAG: @v3 = hidden global i32 0
 @v3 = protected global i32 0
 
-; CHECK-DAG: @v4 = hidden global i32 1, comdat $c1
-@v4 = global i32 1, comdat $c1
+; CHECK-DAG: @v4 = hidden global i32 1, comdat($c1)
+@v4 = global i32 1, comdat($c1)
 
 ; Aliases
 ; CHECK: @a1 = hidden alias i32* @v1
diff --git a/test/Linker/weakextern.ll b/test/Linker/weakextern.ll
index b9f2584..8d479a0 100644
--- a/test/Linker/weakextern.ll
+++ b/test/Linker/weakextern.ll
@@ -1,5 +1,5 @@
 ; RUN: llvm-as < %s > %t.bc
-; RUN: llvm-as < %p/testlink1.ll > %t2.bc
+; RUN: llvm-as < %p/testlink.ll > %t2.bc
 ; RUN: llvm-link %t.bc %t.bc %t2.bc -o %t1.bc
 ; RUN: llvm-dis < %t1.bc | FileCheck %s
 ; CHECK: kallsyms_names = extern_weak
diff --git a/test/MC/AArch64/adrp-relocation.s b/test/MC/AArch64/adrp-relocation.s
index 3bcef34..3bc6039 100644
--- a/test/MC/AArch64/adrp-relocation.s
+++ b/test/MC/AArch64/adrp-relocation.s
@@ -15,4 +15,4 @@ sym:
 // CHECK: R_AARCH64_ADR_PREL_PG_HI21 sym
 // CHECK: R_AARCH64_ADR_GOT_PAGE sym
 // CHECK: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 sym
-// CHECK: R_AARCH64_TLSDESC_ADR_PAGE sym
+// CHECK: R_AARCH64_TLSDESC_ADR_PAGE21 sym
diff --git a/test/MC/AArch64/arm64-elf-relocs.s b/test/MC/AArch64/arm64-elf-relocs.s
index eb22cc2..612819c 100644
--- a/test/MC/AArch64/arm64-elf-relocs.s
+++ b/test/MC/AArch64/arm64-elf-relocs.s
@@ -77,7 +77,7 @@
 
    adrp x2, :tlsdesc:sym
 // CHECK: adrp x2, :tlsdesc:sym
-// CHECK-OBJ: 58 R_AARCH64_TLSDESC_ADR_PAGE sym
+// CHECK-OBJ: 58 R_AARCH64_TLSDESC_ADR_PAGE21 sym
 
    // LLVM is not competent enough to do this relocation because the
    // page boundary could occur anywhere after linking. A relocation
diff --git a/test/MC/AArch64/arm64-tls-relocs.s b/test/MC/AArch64/arm64-tls-relocs.s
index 96c2b55..be7e24a 100644
--- a/test/MC/AArch64/arm64-tls-relocs.s
+++ b/test/MC/AArch64/arm64-tls-relocs.s
@@ -304,7 +304,7 @@
 // CHECK: blr     x3                      // encoding: [0x60,0x00,0x3f,0xd6]
 
 
-// CHECK-ELF-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_TLSDESC_ADR_PAGE [[VARSYM]]
+// CHECK-ELF-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_TLSDESC_ADR_PAGE21 [[VARSYM]]
 // CHECK-ELF-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_TLSDESC_LD64_LO12_NC [[VARSYM]]
 // CHECK-ELF-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_TLSDESC_ADD_LO12_NC [[VARSYM]]
 // CHECK-ELF-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_TLSDESC_CALL [[VARSYM]]
diff --git a/test/MC/AArch64/dot-req.s b/test/MC/AArch64/dot-req.s
index 947f945..a557f0c 100644
--- a/test/MC/AArch64/dot-req.s
+++ b/test/MC/AArch64/dot-req.s
@@ -1,7 +1,9 @@
-// RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding < %s 2>&1 | FileCheck %s
 
 bar:
         fred .req x5
+// CHECK-NOT: ignoring redefinition of register alias 'fred'
+        fred .req x5
         mov fred, x11
         .unreq fred
         fred .req w6
diff --git a/test/MC/AArch64/inline-asm-modifiers.s b/test/MC/AArch64/inline-asm-modifiers.s
index cf34a95..c3ba1cf 100644
--- a/test/MC/AArch64/inline-asm-modifiers.s
+++ b/test/MC/AArch64/inline-asm-modifiers.s
@@ -73,7 +73,7 @@ test_inline_modifier_A:                 // @test_inline_modifier_A
 	.size	test_inline_modifier_A, .Ltmp2-test_inline_modifier_A
 // CHECK: R_AARCH64_ADR_PREL_PG_HI21 var_simple
 // CHECK: R_AARCH64_ADR_GOT_PAGE var_got
-// CHECK: R_AARCH64_TLSDESC_ADR_PAGE var_tlsgd
+// CHECK: R_AARCH64_TLSDESC_ADR_PAGE21 var_tlsgd
 // CHECK: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 var_tlsie
 
 	.globl	test_inline_modifier_wx
diff --git a/test/MC/AArch64/tls-relocs.s b/test/MC/AArch64/tls-relocs.s
index ebf0216..9e94a52 100644
--- a/test/MC/AArch64/tls-relocs.s
+++ b/test/MC/AArch64/tls-relocs.s
@@ -391,7 +391,7 @@
 // CHECK:                                 //   fixup A - offset: 0, value: var, kind: fixup_aarch64_tlsdesc_call
 // CHECK: blr    x3                      // encoding: [0x60,0x00,0x3f,0xd6]
 
-// CHECK-ELF-NEXT:     0x104 R_AARCH64_TLSDESC_ADR_PAGE [[VARSYM]]
+// CHECK-ELF-NEXT:     0x104 R_AARCH64_TLSDESC_ADR_PAGE21 [[VARSYM]]
 // CHECK-ELF-NEXT:     0x108 R_AARCH64_TLSDESC_LD64_LO12_NC [[VARSYM]]
 // CHECK-ELF-NEXT:     0x10C R_AARCH64_TLSDESC_ADD_LO12_NC [[VARSYM]]
 // CHECK-ELF-NEXT:     0x110 R_AARCH64_TLSDESC_CALL [[VARSYM]]
diff --git a/test/MC/ARM/Windows/invalid-relocation.s b/test/MC/ARM/Windows/invalid-relocation.s
new file mode 100644
index 0000000..4f4c598
--- /dev/null
+++ b/test/MC/ARM/Windows/invalid-relocation.s
@@ -0,0 +1,14 @@
+# RUN: not llvm-mc -triple thumbv7-windows -filetype obj -o /dev/null 2>&1 %s \
+# RUN:     | FileCheck %s
+
+	.def invalid_relocation
+		.type 32
+		.scl 2
+	.endef
+	.global invalid_relocation
+	.thumb_func
+invalid_relocation:
+	adr r0, invalid_relocation+1
+
+# CHECK: LLVM ERROR: unsupported relocation type: fixup_t2_adr_pcrel_12
+
diff --git a/test/MC/ARM/arm-elf-relocation-diagnostics.s b/test/MC/ARM/arm-elf-relocation-diagnostics.s
new file mode 100644
index 0000000..5fe903f
--- /dev/null
+++ b/test/MC/ARM/arm-elf-relocation-diagnostics.s
@@ -0,0 +1,27 @@
+@ RUN: not llvm-mc -triple armv7-eabi -filetype obj -o - %s 2>&1 \
+@ RUN:     | FileCheck %s
+@ RUN: not llvm-mc -triple thumbv7-eabi -filetype obj -o - %s 2>&1 \
+@ RUN:     | FileCheck %s
+
+	.byte target(sbrel)
+@ CHECK: error: relocated expression must be 32-bit
+@ CHECK: .byte target(sbrel)
+@ CHECK:       ^
+
+@ TODO: enable these negative test cases
+@ 	.hword target(sbrel)
+@ @ CHECK-SBREL-HWORD: error: relocated expression must be 32-bit
+@ @ CHECK-SBREL-HWORD: .hword target(sbrel)
+@ @ CHECK-SBREL-HWORD:        ^
+@
+@ 	.short target(sbrel)
+@ @ CHECK-SBREL-SHORT: error: relocated expression must be 32-bit
+@ @ CHECK-SBREL-SHORT: .short target(sbrel)
+@ @ CHECK-SBREL-SHORT:        ^
+@
+@ 	.quad target(sbrel)
+@ @ CHECK-SBREL-SHORT: error: relocated expression must be 32-bit
+@ @ CHECK-SBREL-SHORT: .quad target(sbrel)
+@ @ CHECK-SBREL-SHORT:        ^
+
+
diff --git a/test/MC/ARM/arm-elf-relocations.s b/test/MC/ARM/arm-elf-relocations.s
new file mode 100644
index 0000000..4059591
--- /dev/null
+++ b/test/MC/ARM/arm-elf-relocations.s
@@ -0,0 +1,37 @@
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s | llvm-readobj -r - \
+@ RUN:     | FileCheck %s
+@ RUN: llvm-mc -triple thumbv7-eabi -filetype obj -o - %s | llvm-readobj -r - \
+@ RUN:     | FileCheck %s
+
+	.syntax unified
+
+	.section .text.r_arm_abs8
+
+	.byte abs8_0 -128
+	.byte abs8_1 +255
+
+@ CHECK: Section {{.*}} .rel.text.r_arm_abs8 {
+@ CHECK:   0x0 R_ARM_ABS8 abs8_0 0x0
+@ CHECK:   0x1 R_ARM_ABS8 abs8_1 0x0
+@ CHECK: }
+
+	.section .text.r_arm_abs16
+
+	.short abs16_0 -32768
+	.short abs16_1 +65535
+
+@ CHECK: Section {{.*}} .rel.text.r_arm_abs16 {
+@ CHECK:   0x0 R_ARM_ABS16 abs16_0 0x0
+@ CHECK:   0x2 R_ARM_ABS16 abs16_1 0x0
+@ CHECK: }
+
+	.section .text.r_arm_sbrel32
+
+	.word target(sbrel)
+	.word target(SBREL)
+
+@ CHECK: Section {{.*}} .rel.text.r_arm_sbrel32 {
+@ CHECK:   0x0 R_ARM_SBREL32 target 0x0
+@ CHECK:   0x4 R_ARM_SBREL32 target 0x0
+@ CHECK: }
+
diff --git a/test/MC/ARM/arm-load-store-multiple-deprecated.s b/test/MC/ARM/arm-load-store-multiple-deprecated.s
new file mode 100644
index 0000000..9354822
--- /dev/null
+++ b/test/MC/ARM/arm-load-store-multiple-deprecated.s
@@ -0,0 +1,222 @@
+@ RUN: llvm-mc -triple armv6t2-linux-eabi -filetype asm -o - %s 2>&1 \
+@ RUN:   | FileCheck %s
+
+@ RUN: not llvm-mc -triple armv7-linux-eabi -filetype asm -o - %s 2>&1 \
+@ RUN:   | FileCheck %s -check-prefix CHECK -check-prefix CHECK-V7
+
+	.syntax unified
+	.arm
+
+	.global stm
+	.type stm,%function
+stm:
+	stm sp!, {r0, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stm sp!, {r0, pc}
+@ CHECK: ^
+	stm r0!, {r0, sp}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stm r0!, {r0, sp}
+@ CHECK: ^
+	stm r1!, {r0, sp, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stm r1!, {r0, sp, pc}
+@ CHECK: ^
+	stm r2!, {sp, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stm r2!, {sp, pc}
+@ CHECK: ^
+	stm sp!, {pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stm sp!, {pc}
+@ CHECK: ^
+	stm r0!, {sp}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stm r0!, {sp}
+@ CHECK: ^
+
+	.global stmda
+	.type stmda,%function
+stmda:
+	stmda sp!, {r0, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmda sp!, {r0, pc}
+@ CHECK: ^
+	stmda r0!, {r0, sp}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmda r0!, {r0, sp}
+@ CHECK: ^
+	stmda r1!, {r0, sp, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmda r1!, {r0, sp, pc}
+@ CHECK: ^
+	stmda r2!, {sp, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmda r2!, {sp, pc}
+@ CHECK: ^
+	stmda sp!, {pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmda sp!, {pc}
+@ CHECK: ^
+	stmda r0!, {sp}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmda r0!, {sp}
+@ CHECK: ^
+
+	.global stmdb
+	.type stmdb,%function
+stmdb:
+	stmdb sp!, {r0, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmdb sp!, {r0, pc}
+@ CHECK: ^
+	stmdb r0!, {r0, sp}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmdb r0!, {r0, sp}
+@ CHECK: ^
+	stmdb r1!, {r0, sp, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmdb r1!, {r0, sp, pc}
+@ CHECK: ^
+	stmdb r2!, {sp, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmdb r2!, {sp, pc}
+@ CHECK: ^
+	stmdb sp!, {pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmdb sp!, {pc}
+@ CHECK: ^
+	stmdb r0!, {sp}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmdb r0!, {sp}
+@ CHECK: ^
+
+	.global stmib
+	.type stmib,%function
+stmib:
+	stmib sp!, {r0, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmib sp!, {r0, pc}
+@ CHECK: ^
+	stmib r0!, {r0, sp}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmib r0!, {r0, sp}
+@ CHECK: ^
+	stmib r1!, {r0, sp, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmib r1!, {r0, sp, pc}
+@ CHECK: ^
+	stmib r2!, {sp, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmib r2!, {sp, pc}
+@ CHECK: ^
+	stmib sp!, {pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmib sp!, {pc}
+@ CHECK: ^
+	stmib r0!, {sp}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: stmib r0!, {sp}
+@ CHECK: ^
+
+
+	.global push
+	.type push,%function
+push:
+	push {r0, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: push {r0, pc}
+@ CHECK: ^
+	push {r0, sp}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: push {r0, sp}
+@ CHECK: ^
+	push {r0, sp, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: push {r0, sp, pc}
+@ CHECK: ^
+	push {sp, pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: push {sp, pc}
+@ CHECK: ^
+	push {pc}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: push {pc}
+@ CHECK: ^
+	push {sp}
+@ CHECK: warning: use of SP or PC in the list is deprecated
+@ CHECK: push {sp}
+@ CHECK: ^
+
+	.global ldm
+	.type ldm,%function
+ldm:
+	ldm r0!, {r1, sp}
+@ CHECK: warning: use of SP in the list is deprecated
+	ldm r0!, {sp}
+@ CHECK: warning: use of SP in the list is deprecated
+	ldm r0!, {r1, lr, pc}
+@ CHECK: warning: use of LR and PC simultaneously in the list is deprecated
+	ldm r0!, {lr, pc}
+@ CHECK: warning: use of LR and PC simultaneously in the list is deprecated
+
+	.global ldmda
+	.type ldmda,%function
+ldmda:
+	ldmda r0!, {r1, sp}
+@ CHECK: warning: use of SP in the list is deprecated
+	ldmda r0!, {sp}
+@ CHECK: warning: use of SP in the list is deprecated
+	ldmda r0!, {r1, lr, pc}
+@ CHECK: warning: use of LR and PC simultaneously in the list is deprecated
+	ldmda r0!, {lr, pc}
+@ CHECK: warning: use of LR and PC simultaneously in the list is deprecated
+
+	.global ldmdb
+	.type ldmdb,%function
+ldmdb:
+	ldmdb r0!, {r1, sp}
+@ CHECK: warning: use of SP in the list is deprecated
+	ldmdb r0!, {sp}
+@ CHECK: warning: use of SP in the list is deprecated
+	ldmdb r0!, {r1, lr, pc}
+@ CHECK: warning: use of LR and PC simultaneously in the list is deprecated
+	ldmdb r0!, {lr, pc}
+@ CHECK: warning: use of LR and PC simultaneously in the list is deprecated
+
+	.global ldmib
+	.type ldmib,%function
+ldmib:
+	ldmib r0!, {r1, sp}
+@ CHECK: warning: use of SP in the list is deprecated
+	ldmib r0!, {sp}
+@ CHECK: warning: use of SP in the list is deprecated
+	ldmib r0!, {r1, lr, pc}
+@ CHECK: warning: use of LR and PC simultaneously in the list is deprecated
+	ldmib r0!, {lr, pc}
+@ CHECK: warning: use of LR and PC simultaneously in the list is deprecated
+
+	.global pop
+	.type pop,%function
+pop:
+	pop {r0, sp}
+@ CHECK: warning: use of SP in the list is deprecated
+@ CHECK-V7: error: writeback register not allowed in register list
+	pop {sp}
+@ CHECK: warning: use of SP in the list is deprecated
+@ CHECK-V7: error: writeback register not allowed in register list
+	pop {r0, lr, pc}
+@ CHECK: warning: use of LR and PC simultaneously in the list is deprecated
+	pop {lr, pc}
+@ CHECK: warning: use of LR and PC simultaneously in the list is deprecated
+
+	.global valid
+	.type valid,%function
+valid:
+	stmdaeq r0, {r0}
+@ CHECK: stmdaeq r0, {r0}
+	ldmdaeq r0, {r0}
+@ CHECK: ldmdaeq r0, {r0}
+	pop {r0, pc}
+@ CHECK: pop {r0, pc}
+
diff --git a/test/MC/ARM/arm-thumb-cpus.s b/test/MC/ARM/arm-thumb-cpus.s
index 9005c7f..459b5c5 100644
--- a/test/MC/ARM/arm-thumb-cpus.s
+++ b/test/MC/ARM/arm-thumb-cpus.s
@@ -16,6 +16,9 @@
 @ RUN: not llvm-mc -show-encoding -triple=armv6m-eabi < %s 2>&1 \
 @ RUN:  | FileCheck %s --check-prefix=CHECK-THUMB-ONLY
 
+@ RUN: not llvm-mc -show-encoding -triple=armv6sm-eabi < %s 2>&1 \
+@ RUN:  | FileCheck %s --check-prefix=CHECK-THUMB-ONLY
+
         @ Make sure correct diagnostics are given for CPUs without support for
         @ one or other of the execution states.
         .thumb
diff --git a/test/MC/ARM/basic-arm-instructions.s b/test/MC/ARM/basic-arm-instructions.s
index e5e9617..a1f13b7 100644
--- a/test/MC/ARM/basic-arm-instructions.s
+++ b/test/MC/ARM/basic-arm-instructions.s
@@ -16,6 +16,15 @@ _func:
 @ ADC (immediate)
 @------------------------------------------------------------------------------
         adc r1, r2, #0xf
+        adc r1, r2, $0xf
+        adc r1, r2, 0xf
+        adc r7, r8, #(0xff << 16)
+        adc r7, r8, #-2147483638
+        adc r7, r8, #42, #2
+        adc r7, r8, #40, #2
+        adc r7, r8, $40, $2
+        adc r7, r8, 40, 2
+        adc r7, r8, (2 * 20), (1 << 1)
         adc r1, r2, #0xf0
         adc r1, r2, #0xf00
         adc r1, r2, #0xf000
@@ -25,20 +34,30 @@ _func:
         adc r1, r2, #0xf0000000
         adc r1, r2, #0xf000000f
         adcs r1, r2, #0xf00
+        adcs r7, r8, #40, #2
         adcseq r1, r2, #0xf00
         adceq r1, r2, #0xf00
 
 @ CHECK: adc	r1, r2, #15             @ encoding: [0x0f,0x10,0xa2,0xe2]
+@ CHECK: adc	r1, r2, #15             @ encoding: [0x0f,0x10,0xa2,0xe2]
+@ CHECK: adc	r1, r2, #15             @ encoding: [0x0f,0x10,0xa2,0xe2]
+@ CHECK: adc	r7, r8, #16711680       @ encoding: [0xff,0x78,0xa8,0xe2]
+@ CHECK: adc    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0xa8,0xe2]
+@ CHECK: adc    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0xa8,0xe2]
+@ CHECK: adc    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xa8,0xe2]
+@ CHECK: adc    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xa8,0xe2]
+@ CHECK: adc    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xa8,0xe2]
+@ CHECK: adc	r7, r8, #40, #2         @ encoding: [0x28,0x71,0xa8,0xe2]
 @ CHECK: adc	r1, r2, #240            @ encoding: [0xf0,0x10,0xa2,0xe2]
 @ CHECK: adc	r1, r2, #3840           @ encoding: [0x0f,0x1c,0xa2,0xe2]
 @ CHECK: adc	r1, r2, #61440          @ encoding: [0x0f,0x1a,0xa2,0xe2]
 @ CHECK: adc	r1, r2, #983040         @ encoding: [0x0f,0x18,0xa2,0xe2]
 @ CHECK: adc	r1, r2, #15728640       @ encoding: [0x0f,0x16,0xa2,0xe2]
 @ CHECK: adc	r1, r2, #251658240      @ encoding: [0x0f,0x14,0xa2,0xe2]
-@ CHECK: adc	r1, r2, #4026531840     @ encoding: [0x0f,0x12,0xa2,0xe2]
-@ CHECK: adc	r1, r2, #4026531855     @ encoding: [0xff,0x12,0xa2,0xe2]
-
+@ CHECK: adc	r1, r2, #-268435456     @ encoding: [0x0f,0x12,0xa2,0xe2]
+@ CHECK: adc	r1, r2, #-268435441     @ encoding: [0xff,0x12,0xa2,0xe2]
 @ CHECK: adcs	r1, r2, #3840           @ encoding: [0x0f,0x1c,0xb2,0xe2]
+@ CHECK: adcs   r7, r8, #40, #2         @ encoding: [0x28,0x71,0xb8,0xe2]
 @ CHECK: adcseq	r1, r2, #3840           @ encoding: [0x0f,0x1c,0xb2,0x02]
 @ CHECK: adceq	r1, r2, #3840           @ encoding: [0x0f,0x1c,0xa2,0x02]
 
@@ -162,6 +181,16 @@ Lforward:
 @ ADD
 @------------------------------------------------------------------------------
         add r4, r5, #0xf000
+        add r4, r5, $0xf000
+        add r4, r5, 0xf000
+        add r4, r5, -0xf000
+	add r7, r8, #(0xff << 16)
+        add r7, r8, #-2147483638
+        add r7, r8, #42, #2
+        add r7, r8, #40, #2
+        add r7, r8, $40, $2
+        add r7, r8, 40, 2
+        add r7, r8, (2 * 20), (1 << 1)
         add r4, r5, r6
         add r4, r5, r6, lsl #5
         add r4, r5, r6, lsr #5
@@ -177,6 +206,16 @@ Lforward:
 
         @ destination register is optional
         add r5, #0xf000
+        add r5, $0xf000
+        add r5, 0xf000
+        add r5, -0xf000
+	add r7, #(0xff << 16)
+        add r7, #-2147483638
+        add r7, #42, #2
+        add r7, #40, #2
+        add r7, $40, $2
+        add r7, 40, 2
+        add r7, (2 * 20), (1 << 1)
         add r4, r5
         add r4, r5, lsl #5
         add r4, r5, lsr #5
@@ -189,11 +228,25 @@ Lforward:
         add r6, r7, ror r9
         add r4, r5, rrx
 
-	add r0, #-4
-	add r4, r5, #-21
+        add r0, #-4
+        add r4, r5, #-21
         add r0, pc, #0xc0000000
+        addseq r0,pc,#0xc0000000
+
+
+        add r0, pc, #(Lback - .)
 
 @ CHECK: add	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x85,0xe2]
+@ CHECK: add	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x85,0xe2]
+@ CHECK: add	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x85,0xe2]
+@ CHECK: sub	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x45,0xe2]
+@ CHECK: add	r7, r8, #16711680       @ encoding: [0xff,0x78,0x88,0xe2]
+@ CHECK: add    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x88,0xe2]
+@ CHECK: add    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x88,0xe2]
+@ CHECK: add    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x88,0xe2]
+@ CHECK: add    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x88,0xe2]
+@ CHECK: add    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x88,0xe2]
+@ CHECK: add    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x88,0xe2]
 @ CHECK: add	r4, r5, r6              @ encoding: [0x06,0x40,0x85,0xe0]
 @ CHECK: add	r4, r5, r6, lsl #5      @ encoding: [0x86,0x42,0x85,0xe0]
 @ CHECK: add	r4, r5, r6, lsr #5      @ encoding: [0xa6,0x42,0x85,0xe0]
@@ -208,6 +261,16 @@ Lforward:
 @ CHECK: add	r4, r5, r6, rrx         @ encoding: [0x66,0x40,0x85,0xe0]
 
 @ CHECK: add	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x85,0xe2]
+@ CHECK: add	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x85,0xe2]
+@ CHECK: add	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x85,0xe2]
+@ CHECK: sub	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x45,0xe2]
+@ CHECK: add	r7, r7, #16711680       @ encoding: [0xff,0x78,0x87,0xe2]
+@ CHECK: add	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x87,0xe2]
+@ CHECK: add	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x87,0xe2]
+@ CHECK: add	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x87,0xe2]
+@ CHECK: add	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x87,0xe2]
+@ CHECK: add	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x87,0xe2]
+@ CHECK: add	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x87,0xe2]
 @ CHECK: add	r4, r4, r5              @ encoding: [0x05,0x40,0x84,0xe0]
 @ CHECK: add	r4, r4, r5, lsl #5      @ encoding: [0x85,0x42,0x84,0xe0]
 @ CHECK: add	r4, r4, r5, lsr #5      @ encoding: [0xa5,0x42,0x84,0xe0]
@@ -222,7 +285,12 @@ Lforward:
 
 @ CHECK: sub	r0, r0, #4              @ encoding: [0x04,0x00,0x40,0xe2]
 @ CHECK: sub	r4, r5, #21             @ encoding: [0x15,0x40,0x45,0xe2]
-@ CHECK: adr    r0, #-1073741824        @ encoding: [0x03,0x01,0x8f,0xe2]
+@ CHECK: adr	r0, #-1073741824        @ encoding: [0x03,0x01,0x8f,0xe2]
+@ CHECK: addseq r0, pc, #-1073741824    @ encoding: [0x03,0x01,0x9f,0x02]
+@ CHECK:        Ltmp0:
+@ CHECK-NEXT:   Ltmp1:
+@ CHECK-NEXT:   adr	r0, (Ltmp1+8)+(Lback-Ltmp0) @ encoding: [A,A,0x0f'A',0xe2'A']
+@ CHECK-NEXT:                           @   fixup A - offset: 0, value: (Ltmp1+8)+(Lback-Ltmp0), kind: fixup_arm_adr_pcrel_12
 
     @ Test right shift by 32, which is encoded as 0
     add r3, r1, r2, lsr #32
@@ -231,9 +299,44 @@ Lforward:
 @ CHECK: add	r3, r1, r2, asr #32     @ encoding: [0x42,0x30,0x81,0xe0]
 
 @------------------------------------------------------------------------------
+@ ADDS
+@------------------------------------------------------------------------------
+    adds r7, r8, #16711680
+    adds r7, r8, $16711680
+    adds r7, r8, 16711680
+    adds r7, r8, #(0xff << 16)
+    adds r7, r8, #-2147483638
+    adds r7, r8, #42, #2
+    adds r7, r8, #40, #2
+    adds r7, r8, $40, $2
+    adds r7, r8, 40, 2
+    adds r7, r8, (2 * 20), (1 << 1)
+
+@ CHECK: adds	r7, r8, #16711680         @ encoding: [0xff,0x78,0x98,0xe2]
+@ CHECK: adds	r7, r8, #16711680         @ encoding: [0xff,0x78,0x98,0xe2]
+@ CHECK: adds	r7, r8, #16711680         @ encoding: [0xff,0x78,0x98,0xe2]
+@ CHECK: adds	r7, r8, #16711680         @ encoding: [0xff,0x78,0x98,0xe2]
+@ CHECK: adds   r7, r8, #-2147483638      @ encoding: [0x2a,0x71,0x98,0xe2]
+@ CHECK: adds   r7, r8, #-2147483638      @ encoding: [0x2a,0x71,0x98,0xe2]
+@ CHECK: adds   r7, r8, #40, #2           @ encoding: [0x28,0x71,0x98,0xe2]
+@ CHECK: adds   r7, r8, #40, #2           @ encoding: [0x28,0x71,0x98,0xe2]
+@ CHECK: adds   r7, r8, #40, #2           @ encoding: [0x28,0x71,0x98,0xe2]
+@ CHECK: adds   r7, r8, #40, #2           @ encoding: [0x28,0x71,0x98,0xe2]
+
+@------------------------------------------------------------------------------
 @ AND
 @------------------------------------------------------------------------------
     and r10, r1, #0xf
+    and r10, r1, $0xf
+    and r10, r1, 0xf
+    and r10, r1, -0xf
+    and r7, r8, #(0xff << 16)
+    and r7, r8, #-2147483638
+    and r7, r8, #42, #2
+    and r7, r8, #40, #2
+    and r7, r8, $40, $2
+    and r7, r8, 40, 2
+    and r7, r8, (2 * 20), (1 << 1)
     and r10, r1, r6
     and r10, r1, r6, lsl #10
     and r10, r1, r6, lsr #10
@@ -249,6 +352,16 @@ Lforward:
 
     @ destination register is optional
     and r1, #0xf
+    and r1, $0xf
+    and r1, 0xf
+    and r1, -0xf
+    and r7, #(0xff << 16)
+    and r7, #-2147483638
+    and r7, #42, #2
+    and r7, #40, #2
+    and r7, $40, $2
+    and r7, 40, 2
+    and r7, (2 * 20), (1 << 1)
     and r10, r1
     and r10, r1, lsl #10
     and r10, r1, lsr #10
@@ -262,6 +375,16 @@ Lforward:
     and r10, r1, rrx
 
 @ CHECK: and	r10, r1, #15            @ encoding: [0x0f,0xa0,0x01,0xe2]
+@ CHECK: and	r10, r1, #15            @ encoding: [0x0f,0xa0,0x01,0xe2]
+@ CHECK: and	r10, r1, #15            @ encoding: [0x0f,0xa0,0x01,0xe2]
+@ CHECK: bic	r10, r1, #14            @ encoding: [0x0e,0xa0,0xc1,0xe3]
+@ CHECK: and	r7, r8, #16711680       @ encoding: [0xff,0x78,0x08,0xe2]
+@ CHECK: and    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x08,0xe2]
+@ CHECK: and    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x08,0xe2]
+@ CHECK: and    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x08,0xe2]
+@ CHECK: and    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x08,0xe2]
+@ CHECK: and    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x08,0xe2]
+@ CHECK: and    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x08,0xe2]
 @ CHECK: and	r10, r1, r6             @ encoding: [0x06,0xa0,0x01,0xe0]
 @ CHECK: and	r10, r1, r6, lsl #10    @ encoding: [0x06,0xa5,0x01,0xe0]
 @ CHECK: and	r10, r1, r6, lsr #10    @ encoding: [0x26,0xa5,0x01,0xe0]
@@ -276,6 +399,16 @@ Lforward:
 @ CHECK: bic	r2, r3, #-2147483648    @ encoding: [0x02,0x21,0xc3,0xe3]
 
 @ CHECK: and	r1, r1, #15             @ encoding: [0x0f,0x10,0x01,0xe2]
+@ CHECK: and	r1, r1, #15             @ encoding: [0x0f,0x10,0x01,0xe2]
+@ CHECK: and	r1, r1, #15             @ encoding: [0x0f,0x10,0x01,0xe2]
+@ CHECK: bic	r1, r1, #14             @ encoding: [0x0e,0x10,0xc1,0xe3]
+@ CHECK: and	r7, r7, #16711680       @ encoding: [0xff,0x78,0x07,0xe2]
+@ CHECK: and	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x07,0xe2]
+@ CHECK: and	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x07,0xe2]
+@ CHECK: and	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x07,0xe2]
+@ CHECK: and	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x07,0xe2]
+@ CHECK: and	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x07,0xe2]
+@ CHECK: and	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x07,0xe2]
 @ CHECK: and	r10, r10, r1            @ encoding: [0x01,0xa0,0x0a,0xe0]
 @ CHECK: and	r10, r10, r1, lsl #10   @ encoding: [0x01,0xa5,0x0a,0xe0]
 @ CHECK: and	r10, r10, r1, lsr #10   @ encoding: [0x21,0xa5,0x0a,0xe0]
@@ -348,6 +481,16 @@ Lforward:
 @ BIC
 @------------------------------------------------------------------------------
         bic r10, r1, #0xf
+        bic r10, r1, $0xf
+        bic r10, r1, 0xf
+        bic r10, r1, -0xf
+        bic r7, r8, #(0xff << 16)
+        bic r7, r8, #-2147483638
+        bic r7, r8, #42, #2
+        bic r7, r8, #40, #2
+        bic r7, r8, $40, $2
+        bic r7, r8, 40, 2
+        bic r7, r8, (2 * 20), (1 << 1)
         bic r10, r1, r6
         bic r10, r1, r6, lsl #10
         bic r10, r1, r6, lsr #10
@@ -362,6 +505,16 @@ Lforward:
 
         @ destination register is optional
         bic r1, #0xf
+        bic r1, $0xf
+        bic r1, 0xf
+        bic r1, -0xf
+        bic r7, #(0xff << 16)
+        bic r7, #-2147483638
+        bic r7, #42, #2
+        bic r7, #40, #2
+        bic r7, $40, $2
+        bic r7, 40, 2
+        bic r7, (2 * 20), (1 << 1)
         bic r10, r1
         bic r10, r1, lsl #10
         bic r10, r1, lsr #10
@@ -375,6 +528,15 @@ Lforward:
         bic r10, r1, rrx
 
 @ CHECK: bic	r10, r1, #15            @ encoding: [0x0f,0xa0,0xc1,0xe3]
+@ CHECK: bic	r10, r1, #15            @ encoding: [0x0f,0xa0,0xc1,0xe3]
+@ CHECK: bic	r10, r1, #15            @ encoding: [0x0f,0xa0,0xc1,0xe3]
+@ CHECK: and	r10, r1, #14            @ encoding: [0x0e,0xa0,0x01,0xe2]
+@ CHECK: bic	r7, r8, #16711680       @ encoding: [0xff,0x78,0xc8,0xe3]
+@ CHECK: bic    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0xc8,0xe3]
+@ CHECK: bic    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0xc8,0xe3]
+@ CHECK: bic    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xc8,0xe3]
+@ CHECK: bic    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xc8,0xe3]
+@ CHECK: bic    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xc8,0xe3]
 @ CHECK: bic	r10, r1, r6             @ encoding: [0x06,0xa0,0xc1,0xe1]
 @ CHECK: bic	r10, r1, r6, lsl #10    @ encoding: [0x06,0xa5,0xc1,0xe1]
 @ CHECK: bic	r10, r1, r6, lsr #10    @ encoding: [0x26,0xa5,0xc1,0xe1]
@@ -389,6 +551,16 @@ Lforward:
 
 
 @ CHECK: bic	r1, r1, #15             @ encoding: [0x0f,0x10,0xc1,0xe3]
+@ CHECK: bic	r1, r1, #15             @ encoding: [0x0f,0x10,0xc1,0xe3]
+@ CHECK: bic	r1, r1, #15             @ encoding: [0x0f,0x10,0xc1,0xe3]
+@ CHECK: and	r1, r1, #14             @ encoding: [0x0e,0x10,0x01,0xe2]
+@ CHECK: bic	r7, r7, #16711680       @ encoding: [0xff,0x78,0xc7,0xe3]
+@ CHECK: bic    r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0xc7,0xe3]
+@ CHECK: bic    r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0xc7,0xe3]
+@ CHECK: bic    r7, r7, #40, #2         @ encoding: [0x28,0x71,0xc7,0xe3]
+@ CHECK: bic    r7, r7, #40, #2         @ encoding: [0x28,0x71,0xc7,0xe3]
+@ CHECK: bic    r7, r7, #40, #2         @ encoding: [0x28,0x71,0xc7,0xe3]
+@ CHECK: bic    r7, r7, #40, #2         @ encoding: [0x28,0x71,0xc7,0xe3]
 @ CHECK: bic	r10, r10, r1            @ encoding: [0x01,0xa0,0xca,0xe1]
 @ CHECK: bic	r10, r10, r1, lsl #10   @ encoding: [0x01,0xa5,0xca,0xe1]
 @ CHECK: bic	r10, r10, r1, lsr #10   @ encoding: [0x21,0xa5,0xca,0xe1]
@@ -505,6 +677,16 @@ Lforward:
 @ CMN
 @------------------------------------------------------------------------------
         cmn r1, #0xf
+        cmn r1, $0xf
+        cmn r1, 0xf
+        cmn r1, -0xf
+        cmn r7, #(0xff << 16)
+        cmn r7, #-2147483638
+        cmn r7, #42, #2
+        cmn r7, #40, #2
+        cmn r7, $40, $2
+        cmn r7, 40, 2
+        cmn r7, (20 * 2), (1 << 1)
         cmn r1, r6
         cmn r1, r6, lsl #10
         cmn r1, r6, lsr #10
@@ -518,6 +700,16 @@ Lforward:
         cmn r1, r6, rrx
 
 @ CHECK: cmn	r1, #15                 @ encoding: [0x0f,0x00,0x71,0xe3]
+@ CHECK: cmn	r1, #15                 @ encoding: [0x0f,0x00,0x71,0xe3]
+@ CHECK: cmn	r1, #15                 @ encoding: [0x0f,0x00,0x71,0xe3]
+@ CHECK: cmp	r1, #15                 @ encoding: [0x0f,0x00,0x51,0xe3]
+@ CHECK: cmn	r7, #16711680           @ encoding: [0xff,0x08,0x77,0xe3]
+@ CHECK: cmn	r7, #-2147483638        @ encoding: [0x2a,0x01,0x77,0xe3]
+@ CHECK: cmn	r7, #-2147483638        @ encoding: [0x2a,0x01,0x77,0xe3]
+@ CHECK: cmn	r7, #40, #2             @ encoding: [0x28,0x01,0x77,0xe3]
+@ CHECK: cmn	r7, #40, #2             @ encoding: [0x28,0x01,0x77,0xe3]
+@ CHECK: cmn	r7, #40, #2             @ encoding: [0x28,0x01,0x77,0xe3]
+@ CHECK: cmn	r7, #40, #2             @ encoding: [0x28,0x01,0x77,0xe3]
 @ CHECK: cmn	r1, r6                  @ encoding: [0x06,0x00,0x71,0xe1]
 @ CHECK: cmn	r1, r6, lsl #10         @ encoding: [0x06,0x05,0x71,0xe1]
 @ CHECK: cmn	r1, r6, lsr #10         @ encoding: [0x26,0x05,0x71,0xe1]
@@ -534,6 +726,16 @@ Lforward:
 @ CMP
 @------------------------------------------------------------------------------
         cmp r1, #0xf
+        cmp r1, $0xf
+        cmp r1, 0xf
+        cmp r1, -0xf
+        cmp r7, #(0xff << 16)
+        cmp r7, #-2147483638
+        cmp r7, #42, #2
+        cmp r7, #40, #2
+        cmp r7, $40, $2
+        cmp r7, 40, 2
+        cmp r7, (2 * 20), (1 << 1)
         cmp r1, r6
         cmp r1, r6, lsl #10
         cmp r1, r6, lsr #10
@@ -549,6 +751,16 @@ Lforward:
         cmp lr, #0
 
 @ CHECK: cmp	r1, #15                 @ encoding: [0x0f,0x00,0x51,0xe3]
+@ CHECK: cmp	r1, #15                 @ encoding: [0x0f,0x00,0x51,0xe3]
+@ CHECK: cmp	r1, #15                 @ encoding: [0x0f,0x00,0x51,0xe3]
+@ CHECK: cmn	r1, #15                 @ encoding: [0x0f,0x00,0x71,0xe3]
+@ CHECK: cmp	r7, #16711680           @ encoding: [0xff,0x08,0x57,0xe3]
+@ CHECK: cmp	r7, #-2147483638        @ encoding: [0x2a,0x01,0x57,0xe3]
+@ CHECK: cmp	r7, #-2147483638        @ encoding: [0x2a,0x01,0x57,0xe3]
+@ CHECK: cmp    r7, #40, #2             @ encoding: [0x28,0x01,0x57,0xe3]
+@ CHECK: cmp    r7, #40, #2             @ encoding: [0x28,0x01,0x57,0xe3]
+@ CHECK: cmp    r7, #40, #2             @ encoding: [0x28,0x01,0x57,0xe3]
+@ CHECK: cmp    r7, #40, #2             @ encoding: [0x28,0x01,0x57,0xe3]
 @ CHECK: cmp	r1, r6                  @ encoding: [0x06,0x00,0x51,0xe1]
 @ CHECK: cmp	r1, r6, lsl #10         @ encoding: [0x06,0x05,0x51,0xe1]
 @ CHECK: cmp	r1, r6, lsr #10         @ encoding: [0x26,0x05,0x51,0xe1]
@@ -734,6 +946,15 @@ Lforward:
 @ EOR
 @------------------------------------------------------------------------------
         eor r4, r5, #0xf000
+        eor r4, r5, $0xf000
+        eor r4, r5, 0xf000
+        eor r7, r8, #(0xff << 16)
+        eor r7, r8, #-2147483638
+        eor r7, r8, #42, #2
+        eor r7, r8, #40, #2
+        eor r7, r8, $40, $2
+        eor r7, r8, 40, 2
+        eor r7, r8, (20 * 2), (1 << 1)
         eor r4, r5, r6
         eor r4, r5, r6, lsl #5
         eor r4, r5, r6, lsr #5
@@ -748,6 +969,15 @@ Lforward:
 
         @ destination register is optional
         eor r5, #0xf000
+        eor r5, $0xf000
+        eor r5, 0xf000
+        eor r7, #(0xff << 16)
+        eor r7, #-2147483638
+        eor r7, #42, #2
+        eor r7, #40, #2
+        eor r7, $40, $2
+        eor r7, 40, 2
+        eor r7, (20 * 2), (1 << 1)
         eor r4, r5
         eor r4, r5, lsl #5
         eor r4, r5, lsr #5
@@ -761,6 +991,15 @@ Lforward:
         eor r4, r5, rrx
 
 @ CHECK: eor	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x25,0xe2]
+@ CHECK: eor	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x25,0xe2]
+@ CHECK: eor	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x25,0xe2]
+@ CHECK: eor	r7, r8, #16711680       @ encoding: [0xff,0x78,0x28,0xe2]
+@ CHECK: eor    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x28,0xe2]
+@ CHECK: eor    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x28,0xe2]
+@ CHECK: eor    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x28,0xe2]
+@ CHECK: eor    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x28,0xe2]
+@ CHECK: eor    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x28,0xe2]
+@ CHECK: eor    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x28,0xe2]
 @ CHECK: eor	r4, r5, r6              @ encoding: [0x06,0x40,0x25,0xe0]
 @ CHECK: eor	r4, r5, r6, lsl #5      @ encoding: [0x86,0x42,0x25,0xe0]
 @ CHECK: eor	r4, r5, r6, lsr #5      @ encoding: [0xa6,0x42,0x25,0xe0]
@@ -775,6 +1014,15 @@ Lforward:
 
 
 @ CHECK: eor	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x25,0xe2]
+@ CHECK: eor	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x25,0xe2]
+@ CHECK: eor	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x25,0xe2]
+@ CHECK: eor	r7, r7, #16711680       @ encoding: [0xff,0x78,0x27,0xe2]
+@ CHECK: eor	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x27,0xe2]
+@ CHECK: eor	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x27,0xe2]
+@ CHECK: eor	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x27,0xe2]
+@ CHECK: eor	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x27,0xe2]
+@ CHECK: eor	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x27,0xe2]
+@ CHECK: eor	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x27,0xe2]
 @ CHECK: eor	r4, r4, r5              @ encoding: [0x05,0x40,0x24,0xe0]
 @ CHECK: eor	r4, r4, r5, lsl #5      @ encoding: [0x85,0x42,0x24,0xe0]
 @ CHECK: eor	r4, r4, r5, lsr #5      @ encoding: [0xa5,0x42,0x24,0xe0]
@@ -1028,8 +1276,23 @@ Lforward:
 @ MOV (immediate)
 @------------------------------------------------------------------------------
         mov r3, #7
+        mov r3, $7
+        mov r3, 7
+        mov r3, -7
         mov r4, #0xff0
         mov r5, #0xff0000
+        mov r7, #42, #0
+        mov r7, #42, #10
+	mov r7, #(0xff << 16)
+        mov r7, #-2147483638
+        mov r7, #42, #2
+        mov pc, #42, #2
+        mov r7, #0, #2
+        mov r7, #40, #2
+        mov r7, $40, $2
+        mov r7, 40, 2
+        mov r7, (2 * 20), (1 << 1)
+        mov r7, #42, #30
         mov r6, #0xffff
         movw r9, #0xffff
         movs r3, #7
@@ -1037,8 +1300,23 @@ Lforward:
         movseq r5, #0xff0000
 
 @ CHECK: mov	r3, #7                  @ encoding: [0x07,0x30,0xa0,0xe3]
+@ CHECK: mov	r3, #7                  @ encoding: [0x07,0x30,0xa0,0xe3]
+@ CHECK: mov	r3, #7                  @ encoding: [0x07,0x30,0xa0,0xe3]
+@ CHECK: mvn	r3, #6                  @ encoding: [0x06,0x30,0xe0,0xe3]
 @ CHECK: mov	r4, #4080               @ encoding: [0xff,0x4e,0xa0,0xe3]
 @ CHECK: mov	r5, #16711680           @ encoding: [0xff,0x58,0xa0,0xe3]
+@ CHECK: mov    r7, #42                 @ encoding: [0x2a,0x70,0xa0,0xe3]
+@ CHECK: mov    r7, #176160768          @ encoding: [0x2a,0x75,0xa0,0xe3]
+@ CHECK: mov	r7, #16711680           @ encoding: [0xff,0x78,0xa0,0xe3]
+@ CHECK: mov    r7, #-2147483638        @ encoding: [0x2a,0x71,0xa0,0xe3]
+@ CHECK: mov    r7, #-2147483638        @ encoding: [0x2a,0x71,0xa0,0xe3]
+@ CHECK: mov    pc, #2147483658         @ encoding: [0x2a,0xf1,0xa0,0xe3]
+@ CHECK: mov    r7, #0, #2              @ encoding: [0x00,0x71,0xa0,0xe3]
+@ CHECK: mov    r7, #40, #2             @ encoding: [0x28,0x71,0xa0,0xe3]
+@ CHECK: mov    r7, #40, #2             @ encoding: [0x28,0x71,0xa0,0xe3]
+@ CHECK: mov    r7, #40, #2             @ encoding: [0x28,0x71,0xa0,0xe3]
+@ CHECK: mov    r7, #40, #2             @ encoding: [0x28,0x71,0xa0,0xe3]
+@ CHECK: mov    r7, #42, #30            @ encoding: [0x2a,0x7f,0xa0,0xe3]
 @ CHECK: movw	r6, #65535              @ encoding: [0xff,0x6f,0x0f,0xe3]
 @ CHECK: movw	r9, #65535              @ encoding: [0xff,0x9f,0x0f,0xe3]
 @ CHECK: movs	r3, #7                  @ encoding: [0x07,0x30,0xb0,0xe3]
@@ -1132,6 +1410,8 @@ Lforward:
 @------------------------------------------------------------------------------
 
         msr  apsr, #5
+        msr  apsr, $5
+        msr  apsr, 5
         msr  apsr_g, #5
         msr  apsr_nzcvq, #5
         msr  APSR_nzcvq, #5
@@ -1145,8 +1425,17 @@ Lforward:
         msr  spsr_fc, #5
         msr  SPSR_fsxc, #5
         msr  cpsr_fsxc, #5
+	msr  apsr_nzcvqg, #(0xff << 16)
+        msr  APSR_nzcvq, #42, #2
+        msr  apsr_nzcvqg, #2147483658
+        msr  SPSR_fsxc, #40, #2
+        msr  SPSR_fsxc, $40, $2
+        msr  SPSR_fsxc, 40, 2
+        msr  SPSR_fsxc, (2 * 20), (1 << 1)
 
 @ CHECK: msr	APSR_nzcvq, #5          @ encoding: [0x05,0xf0,0x28,0xe3]
+@ CHECK: msr	APSR_nzcvq, #5          @ encoding: [0x05,0xf0,0x28,0xe3]
+@ CHECK: msr	APSR_nzcvq, #5          @ encoding: [0x05,0xf0,0x28,0xe3]
 @ CHECK: msr	APSR_g, #5              @ encoding: [0x05,0xf0,0x24,0xe3]
 @ CHECK: msr	APSR_nzcvq, #5          @ encoding: [0x05,0xf0,0x28,0xe3]
 @ CHECK: msr	APSR_nzcvq, #5          @ encoding: [0x05,0xf0,0x28,0xe3]
@@ -1160,6 +1449,13 @@ Lforward:
 @ CHECK: msr	SPSR_fc, #5             @ encoding: [0x05,0xf0,0x69,0xe3]
 @ CHECK: msr	SPSR_fsxc, #5           @ encoding: [0x05,0xf0,0x6f,0xe3]
 @ CHECK: msr	CPSR_fsxc, #5           @ encoding: [0x05,0xf0,0x2f,0xe3]
+@ CHECK: msr	APSR_nzcvqg, #16711680  @ encoding: [0xff,0xf8,0x2c,0xe3]
+@ CHECK: msr    APSR_nzcvq, #2147483658 @ encoding: [0x2a,0xf1,0x28,0xe3]
+@ CHECK: msr    APSR_nzcvqg, #2147483658 @ encoding: [0x2a,0xf1,0x2c,0xe3]
+@ CHECK: msr    SPSR_fsxc, #40, #2      @ encoding: [0x28,0xf1,0x6f,0xe3]
+@ CHECK: msr    SPSR_fsxc, #40, #2      @ encoding: [0x28,0xf1,0x6f,0xe3]
+@ CHECK: msr    SPSR_fsxc, #40, #2      @ encoding: [0x28,0xf1,0x6f,0xe3]
+@ CHECK: msr    SPSR_fsxc, #40, #2      @ encoding: [0x28,0xf1,0x6f,0xe3]
 
         msr  apsr, r0
         msr  apsr_g, r0
@@ -1210,15 +1506,37 @@ Lforward:
 @ MVN (immediate)
 @------------------------------------------------------------------------------
         mvn r3, #7
+        mvn r3, $7
+        mvn r3, 7
+        mvn r3, -7
+        mvn r7, #~0xffffff00
         mvn r4, #0xff0
         mvn r5, #0xff0000
+	mvn r7, #(0xff << 16)
+        mvn r7, #-2147483638
+        mvn r7, #42, #2
+        mvn r7, #40, #2
+        mvn r7, $40, $2
+        mvn r7, 40, 2
+        mvn r7, (2 * 20), (1 << 1)
         mvns r3, #7
         mvneq r4, #0xff0
         mvnseq r5, #0xff0000
 
 @ CHECK: mvn	r3, #7                  @ encoding: [0x07,0x30,0xe0,0xe3]
+@ CHECK: mvn	r3, #7                  @ encoding: [0x07,0x30,0xe0,0xe3]
+@ CHECK: mvn	r3, #7                  @ encoding: [0x07,0x30,0xe0,0xe3]
+@ CHECK: mov	r3, #6                  @ encoding: [0x06,0x30,0xa0,0xe3]
+@ CHECK: mvn    r7, #255                @ encoding: [0xff,0x70,0xe0,0xe3]
 @ CHECK: mvn	r4, #4080               @ encoding: [0xff,0x4e,0xe0,0xe3]
 @ CHECK: mvn	r5, #16711680           @ encoding: [0xff,0x58,0xe0,0xe3]
+@ CHECK: mvn	r7, #16711680           @ encoding: [0xff,0x78,0xe0,0xe3]
+@ CHECK: mvn    r7, #-2147483638        @ encoding: [0x2a,0x71,0xe0,0xe3]
+@ CHECK: mvn    r7, #-2147483638        @ encoding: [0x2a,0x71,0xe0,0xe3]
+@ CHECK: mvn    r7, #40, #2             @ encoding: [0x28,0x71,0xe0,0xe3]
+@ CHECK: mvn    r7, #40, #2             @ encoding: [0x28,0x71,0xe0,0xe3]
+@ CHECK: mvn    r7, #40, #2             @ encoding: [0x28,0x71,0xe0,0xe3]
+@ CHECK: mvn    r7, #40, #2             @ encoding: [0x28,0x71,0xe0,0xe3]
 @ CHECK: mvns	r3, #7                  @ encoding: [0x07,0x30,0xf0,0xe3]
 @ CHECK: mvneq	r4, #4080               @ encoding: [0xff,0x4e,0xe0,0x03]
 @ CHECK: mvnseq	r5, #16711680           @ encoding: [0xff,0x58,0xf0,0x03]
@@ -1285,6 +1603,15 @@ Lforward:
 @ ORR
 @------------------------------------------------------------------------------
         orr r4, r5, #0xf000
+        orr r4, r5, $0xf000
+        orr r4, r5, 0xf000
+	orr r7, r8, #(0xff << 16)
+        orr r7, r8, #-2147483638
+        orr r7, r8, #42, #2
+        orr r7, r8, #40, #2
+        orr r7, r8, $40, $2
+        orr r7, r8, 40, 2
+        orr r7, r8, (2 * 20), (1 << 1)
         orr r4, r5, r6
         orr r4, r5, r6, lsl #5
         orr r4, r5, r6, lsr #5
@@ -1299,6 +1626,17 @@ Lforward:
 
         @ destination register is optional
         orr r5, #0xf000
+        orr r5, $0xf000
+        orr r5, 0xf000
+
+        orr r7, #(0xff << 16)
+        orr r7, #-2147483638
+        orr r7, #42, #2
+        orr r7, #40, #2
+        orr r7, $40, $2
+        orr r7, 40, 2
+        orr r7, (2 * 20), (1 << 1)
+
         orr r4, r5
         orr r4, r5, lsl #5
         orr r4, r5, lsr #5
@@ -1312,6 +1650,15 @@ Lforward:
         orr r4, r5, rrx
 
 @ CHECK: orr	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x85,0xe3]
+@ CHECK: orr	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x85,0xe3]
+@ CHECK: orr	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x85,0xe3]
+@ CHECK: orr	r7, r8, #16711680       @ encoding: [0xff,0x78,0x88,0xe3]
+@ CHECK: orr    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x88,0xe3]
+@ CHECK: orr    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x88,0xe3]
+@ CHECK: orr    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x88,0xe3]
+@ CHECK: orr    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x88,0xe3]
+@ CHECK: orr    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x88,0xe3]
+@ CHECK: orr    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x88,0xe3]
 @ CHECK: orr	r4, r5, r6              @ encoding: [0x06,0x40,0x85,0xe1]
 @ CHECK: orr	r4, r5, r6, lsl #5      @ encoding: [0x86,0x42,0x85,0xe1]
 @ CHECK: orr	r4, r5, r6, lsr #5      @ encoding: [0xa6,0x42,0x85,0xe1]
@@ -1325,6 +1672,15 @@ Lforward:
 @ CHECK: orr	r4, r5, r6, rrx         @ encoding: [0x66,0x40,0x85,0xe1]
 
 @ CHECK: orr	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x85,0xe3]
+@ CHECK: orr	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x85,0xe3]
+@ CHECK: orr	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x85,0xe3]
+@ CHECK: orr	r7, r7, #16711680       @ encoding: [0xff,0x78,0x87,0xe3]
+@ CHECK: orr	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x87,0xe3]
+@ CHECK: orr	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x87,0xe3]
+@ CHECK: orr	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x87,0xe3]
+@ CHECK: orr 	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x87,0xe3]
+@ CHECK: orr 	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x87,0xe3]
+@ CHECK: orr 	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x87,0xe3]
 @ CHECK: orr	r4, r4, r5              @ encoding: [0x05,0x40,0x84,0xe1]
 @ CHECK: orr	r4, r4, r5, lsl #5      @ encoding: [0x85,0x42,0x84,0xe1]
 @ CHECK: orr	r4, r4, r5, lsr #5      @ encoding: [0xa5,0x42,0x84,0xe1]
@@ -1570,6 +1926,15 @@ Lforward:
 @ RSB
 @------------------------------------------------------------------------------
         rsb r4, r5, #0xf000
+        rsb r4, r5, $0xf000
+        rsb r4, r5, 0xf000
+        rsb r7, r8, #(0xff << 16)
+        rsb r7, r8, #-2147483638
+        rsb r7, r8, #42, #2
+        rsb r7, r8, #40, #2
+        rsb r7, r8, $40, $2
+        rsb r7, r8, 40, 2
+        rsb r7, r8, (2 * 20), (1 << 1)
         rsb r4, r5, r6
         rsb r4, r5, r6, lsl #5
         rsblo r4, r5, r6, lsr #5
@@ -1584,6 +1949,15 @@ Lforward:
 
         @ destination register is optional
         rsb r5, #0xf000
+        rsb r5, $0xf000
+        rsb r5, 0xf000
+        rsb r7, #(0xff << 16)
+        rsb r7, #-2147483638
+        rsb r7, #42, #2
+        rsb r7, #40, #2
+        rsb r7, $40, $2
+        rsb r7, 40, 2
+        rsb r7, (2 * 20), (1 << 1)
         rsb r4, r5
         rsb r4, r5, lsl #5
         rsb r4, r5, lsr #5
@@ -1597,6 +1971,15 @@ Lforward:
         rsb r4, r5, rrx
 
 @ CHECK: rsb	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x65,0xe2]
+@ CHECK: rsb	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x65,0xe2]
+@ CHECK: rsb	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x65,0xe2]
+@ CHECK: rsb	r7, r8, #16711680       @ encoding: [0xff,0x78,0x68,0xe2]
+@ CHECK: rsb    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x68,0xe2]
+@ CHECK: rsb    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x68,0xe2]
+@ CHECK: rsb    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x68,0xe2]
+@ CHECK: rsb    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x68,0xe2]
+@ CHECK: rsb    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x68,0xe2]
+@ CHECK: rsb    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x68,0xe2]
 @ CHECK: rsb	r4, r5, r6              @ encoding: [0x06,0x40,0x65,0xe0]
 @ CHECK: rsb	r4, r5, r6, lsl #5      @ encoding: [0x86,0x42,0x65,0xe0]
 @ CHECK: rsblo	r4, r5, r6, lsr #5      @ encoding: [0xa6,0x42,0x65,0x30]
@@ -1610,6 +1993,15 @@ Lforward:
 @ CHECK: rsb	r4, r5, r6, rrx         @ encoding: [0x66,0x40,0x65,0xe0]
 
 @ CHECK: rsb	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x65,0xe2]
+@ CHECK: rsb	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x65,0xe2]
+@ CHECK: rsb	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x65,0xe2]
+@ CHECK: rsb	r7, r7, #16711680       @ encoding: [0xff,0x78,0x67,0xe2]
+@ CHECK: rsb	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x67,0xe2]
+@ CHECK: rsb	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x67,0xe2]
+@ CHECK: rsb	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x67,0xe2]
+@ CHECK: rsb	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x67,0xe2]
+@ CHECK: rsb	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x67,0xe2]
+@ CHECK: rsb	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x67,0xe2]
 @ CHECK: rsb	r4, r4, r5              @ encoding: [0x05,0x40,0x64,0xe0]
 @ CHECK: rsb	r4, r4, r5, lsl #5      @ encoding: [0x85,0x42,0x64,0xe0]
 @ CHECK: rsb	r4, r4, r5, lsr #5      @ encoding: [0xa5,0x42,0x64,0xe0]
@@ -1623,9 +2015,43 @@ Lforward:
 @ CHECK: rsb	r4, r4, r5, rrx         @ encoding: [0x65,0x40,0x64,0xe0]
 
 @------------------------------------------------------------------------------
+@ RSBS
+@------------------------------------------------------------------------------
+    rsbs r7, #16711680
+    rsbs r7, $16711680
+    rsbs r7, 16711680
+    rsbs r7, #(0xff << 16)
+    rsbs r7, r8, #-2147483638
+    rsbs r7, r8, #42, #2
+    rsbs r7, r8, #40, #2
+    rsbs r7, r8, $40, $2
+    rsbs r7, r8, 40, 2
+    rsbs r7, r8, (2 * 20), (1 << 1)
+
+@ CHECK: rsbs	r7, r7, #16711680       @ encoding: [0xff,0x78,0x77,0xe2]
+@ CHECK: rsbs	r7, r7, #16711680       @ encoding: [0xff,0x78,0x77,0xe2]
+@ CHECK: rsbs	r7, r7, #16711680       @ encoding: [0xff,0x78,0x77,0xe2]
+@ CHECK: rsbs	r7, r7, #16711680       @ encoding: [0xff,0x78,0x77,0xe2]
+@ CHECK: rsbs   r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x78,0xe2]
+@ CHECK: rsbs   r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x78,0xe2]
+@ CHECK: rsbs   r7, r8, #40, #2         @ encoding: [0x28,0x71,0x78,0xe2]
+@ CHECK: rsbs   r7, r8, #40, #2         @ encoding: [0x28,0x71,0x78,0xe2]
+@ CHECK: rsbs   r7, r8, #40, #2         @ encoding: [0x28,0x71,0x78,0xe2]
+@ CHECK: rsbs   r7, r8, #40, #2         @ encoding: [0x28,0x71,0x78,0xe2]
+
+@------------------------------------------------------------------------------
 @ RSC
 @------------------------------------------------------------------------------
         rsc r4, r5, #0xf000
+        rsc r4, r5, $0xf000
+        rsc r4, r5, 0xf000
+        rsc r7, r8, #(0xff << 16)
+        rsc r7, r8, #-2147483638
+        rsc r7, r8, #42, #2
+        rsc r7, r8, #40, #2
+        rsc r7, r8, $40, $2
+        rsc r7, r8, 40, 2
+        rsc r7, r8, (2 * 20), (1 << 1)
         rsc r4, r5, r6
         rsc r4, r5, r6, lsl #5
         rsclo r4, r5, r6, lsr #5
@@ -1640,6 +2066,15 @@ Lforward:
 
         @ destination register is optional
         rsc r5, #0xf000
+        rsc r5, $0xf000
+        rsc r5, 0xf000
+        rsc r7, #(0xff << 16)
+        rsc r7, #-2147483638
+        rsc r7, #42, #2
+        rsc r7, #40, #2
+        rsc r7, $40, $2
+        rsc r7, 40, 2
+        rsc r7, (2 * 20), (1 << 1)
         rsc r4, r5
         rsc r4, r5, lsl #5
         rsc r4, r5, lsr #5
@@ -1652,6 +2087,15 @@ Lforward:
         rsc r6, r7, ror r9
 
 @ CHECK: rsc	r4, r5, #61440          @ encoding: [0x0f,0x4a,0xe5,0xe2]
+@ CHECK: rsc	r4, r5, #61440          @ encoding: [0x0f,0x4a,0xe5,0xe2]
+@ CHECK: rsc	r4, r5, #61440          @ encoding: [0x0f,0x4a,0xe5,0xe2]
+@ CHECK: rsc    r7, r8, #16711680       @ encoding: [0xff,0x78,0xe8,0xe2]
+@ CHECK: rsc    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0xe8,0xe2]
+@ CHECK: rsc    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0xe8,0xe2]
+@ CHECK: rsc    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xe8,0xe2]
+@ CHECK: rsc    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xe8,0xe2]
+@ CHECK: rsc    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xe8,0xe2]
+@ CHECK: rsc    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xe8,0xe2]
 @ CHECK: rsc	r4, r5, r6              @ encoding: [0x06,0x40,0xe5,0xe0]
 @ CHECK: rsc	r4, r5, r6, lsl #5      @ encoding: [0x86,0x42,0xe5,0xe0]
 @ CHECK: rsclo	r4, r5, r6, lsr #5      @ encoding: [0xa6,0x42,0xe5,0x30]
@@ -1665,6 +2109,15 @@ Lforward:
 @ CHECK: rscs	r1, r8, #4064           @ encoding: [0xfe,0x1e,0xf8,0xe2]
 
 @ CHECK: rsc	r5, r5, #61440          @ encoding: [0x0f,0x5a,0xe5,0xe2]
+@ CHECK: rsc	r5, r5, #61440          @ encoding: [0x0f,0x5a,0xe5,0xe2]
+@ CHECK: rsc	r5, r5, #61440          @ encoding: [0x0f,0x5a,0xe5,0xe2]
+@ CHECK: rsc	r7, r7, #16711680       @ encoding: [0xff,0x78,0xe7,0xe2]
+@ CHECK: rsc	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0xe7,0xe2]
+@ CHECK: rsc	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0xe7,0xe2]
+@ CHECK: rsc	r7, r7, #40, #2         @ encoding: [0x28,0x71,0xe7,0xe2]
+@ CHECK: rsc	r7, r7, #40, #2         @ encoding: [0x28,0x71,0xe7,0xe2]
+@ CHECK: rsc	r7, r7, #40, #2         @ encoding: [0x28,0x71,0xe7,0xe2]
+@ CHECK: rsc	r7, r7, #40, #2         @ encoding: [0x28,0x71,0xe7,0xe2]
 @ CHECK: rsc	r4, r4, r5              @ encoding: [0x05,0x40,0xe4,0xe0]
 @ CHECK: rsc	r4, r4, r5, lsl #5      @ encoding: [0x85,0x42,0xe4,0xe0]
 @ CHECK: rsc	r4, r4, r5, lsr #5      @ encoding: [0xa5,0x42,0xe4,0xe0]
@@ -1728,6 +2181,15 @@ Lforward:
 @ SBC
 @------------------------------------------------------------------------------
         sbc r4, r5, #0xf000
+        sbc r4, r5, $0xf000
+        sbc r4, r5, 0xf000
+        sbc r7, r8, #(0xff << 16)
+        sbc r7, r8, #-2147483638
+        sbc r7, r8, #42, #2
+        sbc r7, r8, #40, #2
+        sbc r7, r8, $40, $2
+        sbc r7, r8, 40, 2
+        sbc r7, r8, (20 * 2), (1 << 1)
         sbc r4, r5, r6
         sbc r4, r5, r6, lsl #5
         sbc r4, r5, r6, lsr #5
@@ -1741,6 +2203,15 @@ Lforward:
 
         @ destination register is optional
         sbc r5, #0xf000
+        sbc r5, $0xf000
+        sbc r5, 0xf000
+        sbc r7, #(0xff << 16)
+        sbc r7, #-2147483638
+        sbc r7, #42, #2
+        sbc r7, #40, #2
+        sbc r7, $40, $2
+        sbc r7, 40, 2
+        sbc r7, (20 * 2), (1 << 1)
         sbc r4, r5
         sbc r4, r5, lsl #5
         sbc r4, r5, lsr #5
@@ -1753,6 +2224,15 @@ Lforward:
         sbc r6, r7, ror r9
 
 @ CHECK: sbc	r4, r5, #61440          @ encoding: [0x0f,0x4a,0xc5,0xe2]
+@ CHECK: sbc	r4, r5, #61440          @ encoding: [0x0f,0x4a,0xc5,0xe2]
+@ CHECK: sbc	r4, r5, #61440          @ encoding: [0x0f,0x4a,0xc5,0xe2]
+@ CHECK: sbc	r7, r8, #16711680       @ encoding: [0xff,0x78,0xc8,0xe2]
+@ CHECK: sbc    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0xc8,0xe2]
+@ CHECK: sbc    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0xc8,0xe2]
+@ CHECK: sbc    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xc8,0xe2]
+@ CHECK: sbc    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xc8,0xe2]
+@ CHECK: sbc    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xc8,0xe2]
+@ CHECK: sbc    r7, r8, #40, #2         @ encoding: [0x28,0x71,0xc8,0xe2]
 @ CHECK: sbc	r4, r5, r6              @ encoding: [0x06,0x40,0xc5,0xe0]
 @ CHECK: sbc	r4, r5, r6, lsl #5      @ encoding: [0x86,0x42,0xc5,0xe0]
 @ CHECK: sbc	r4, r5, r6, lsr #5      @ encoding: [0xa6,0x42,0xc5,0xe0]
@@ -1765,6 +2245,15 @@ Lforward:
 @ CHECK: sbc	r6, r7, r8, ror r9      @ encoding: [0x78,0x69,0xc7,0xe0]
 
 @ CHECK: sbc	r5, r5, #61440          @ encoding: [0x0f,0x5a,0xc5,0xe2]
+@ CHECK: sbc	r5, r5, #61440          @ encoding: [0x0f,0x5a,0xc5,0xe2]
+@ CHECK: sbc	r5, r5, #61440          @ encoding: [0x0f,0x5a,0xc5,0xe2]
+@ CHECK: sbc	r7, r7, #16711680       @ encoding: [0xff,0x78,0xc7,0xe2]
+@ CHECK: sbc	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0xc7,0xe2]
+@ CHECK: sbc	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0xc7,0xe2]
+@ CHECK: sbc	r7, r7, #40, #2         @ encoding: [0x28,0x71,0xc7,0xe2]
+@ CHECK: sbc	r7, r7, #40, #2         @ encoding: [0x28,0x71,0xc7,0xe2]
+@ CHECK: sbc	r7, r7, #40, #2         @ encoding: [0x28,0x71,0xc7,0xe2]
+@ CHECK: sbc	r7, r7, #40, #2         @ encoding: [0x28,0x71,0xc7,0xe2]
 @ CHECK: sbc	r4, r4, r5              @ encoding: [0x05,0x40,0xc4,0xe0]
 @ CHECK: sbc	r4, r4, r5, lsl #5      @ encoding: [0x85,0x42,0xc4,0xe0]
 @ CHECK: sbc	r4, r4, r5, lsr #5      @ encoding: [0xa5,0x42,0xc4,0xe0]
@@ -2383,6 +2872,15 @@ Lforward:
 @ SUB
 @------------------------------------------------------------------------------
         sub r4, r5, #0xf000
+        sub r4, r5, $0xf000
+        sub r4, r5, 0xf000
+        sub r7, r8, #(0xff << 16)
+        sub r7, r8, #-2147483638
+        sub r7, r8, #42, #2
+        sub r7, r8, #40, #2
+        sub r7, r8, $40, $2
+        sub r7, r8, 40, 2
+        sub r7, r8, (20 * 2), (1 << 1)
         sub r4, r5, r6
         sub r4, r5, r6, lsl #5
         sub r4, r5, r6, lsr #5
@@ -2396,6 +2894,15 @@ Lforward:
 
         @ destination register is optional
         sub r5, #0xf000
+        sub r5, $0xf000
+        sub r5, 0xf000
+        sub r7, #(0xff << 16)
+        sub r7, #-2147483638
+        sub r7, #42, #2
+        sub r7, #40, #2
+        sub r7, $40, $2
+        sub r7, 40, 2
+        sub r7, (20 * 2), (1 << 1)
         sub r4, r5
         sub r4, r5, lsl #5
         sub r4, r5, lsr #5
@@ -2408,6 +2915,15 @@ Lforward:
         sub r6, r7, ror r9
 
 @ CHECK: sub	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x45,0xe2]
+@ CHECK: sub	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x45,0xe2]
+@ CHECK: sub	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x45,0xe2]
+@ CHECK: sub	r7, r8, #16711680       @ encoding: [0xff,0x78,0x48,0xe2]
+@ CHECK: sub    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x48,0xe2]
+@ CHECK: sub    r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x48,0xe2]
+@ CHECK: sub    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x48,0xe2]
+@ CHECK: sub    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x48,0xe2]
+@ CHECK: sub    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x48,0xe2]
+@ CHECK: sub    r7, r8, #40, #2         @ encoding: [0x28,0x71,0x48,0xe2]
 @ CHECK: sub	r4, r5, r6              @ encoding: [0x06,0x40,0x45,0xe0]
 @ CHECK: sub	r4, r5, r6, lsl #5      @ encoding: [0x86,0x42,0x45,0xe0]
 @ CHECK: sub	r4, r5, r6, lsr #5      @ encoding: [0xa6,0x42,0x45,0xe0]
@@ -2421,6 +2937,15 @@ Lforward:
 
 
 @ CHECK: sub	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x45,0xe2]
+@ CHECK: sub	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x45,0xe2]
+@ CHECK: sub	r5, r5, #61440          @ encoding: [0x0f,0x5a,0x45,0xe2]
+@ CHECK: sub	r7, r7, #16711680       @ encoding: [0xff,0x78,0x47,0xe2]
+@ CHECK: sub	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x47,0xe2]
+@ CHECK: sub	r7, r7, #-2147483638    @ encoding: [0x2a,0x71,0x47,0xe2]
+@ CHECK: sub	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x47,0xe2]
+@ CHECK: sub	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x47,0xe2
+@ CHECK: sub	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x47,0xe2]
+@ CHECK: sub	r7, r7, #40, #2         @ encoding: [0x28,0x71,0x47,0xe2]
 @ CHECK: sub	r4, r4, r5              @ encoding: [0x05,0x40,0x44,0xe0]
 @ CHECK: sub	r4, r4, r5, lsl #5      @ encoding: [0x85,0x42,0x44,0xe0]
 @ CHECK: sub	r4, r4, r5, lsr #5      @ encoding: [0xa5,0x42,0x44,0xe0]
@@ -2439,6 +2964,31 @@ Lforward:
 @ CHECK: sub	r3, r1, r2, asr #32     @ encoding: [0x42,0x30,0x41,0xe0]
 
 @------------------------------------------------------------------------------
+@ SUBS
+@------------------------------------------------------------------------------
+    subs r7, r8, #16711680
+    subs r7, r8, $16711680
+    subs r7, r8, 16711680
+    subs r7, r8, #(0xff << 16)
+    subs r7, r8, #-2147483638
+    subs r7, r8, #42, #2
+    subs r7, r8, #40, #2
+    subs r7, r8, $40, $2
+    subs r7, r8, 40, 2
+    subs r7, r8, (20 * 2), (1 << 1)
+
+@ CHECK: subs	r7, r8, #16711680       @ encoding: [0xff,0x78,0x58,0xe2]
+@ CHECK: subs	r7, r8, #16711680       @ encoding: [0xff,0x78,0x58,0xe2]
+@ CHECK: subs	r7, r8, #16711680       @ encoding: [0xff,0x78,0x58,0xe2]
+@ CHECK: subs	r7, r8, #16711680       @ encoding: [0xff,0x78,0x58,0xe2]
+@ CHECK: subs   r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x58,0xe2]
+@ CHECK: subs   r7, r8, #-2147483638    @ encoding: [0x2a,0x71,0x58,0xe2]
+@ CHECK: subs   r7, r8, #40, #2         @ encoding: [0x28,0x71,0x58,0xe2]
+@ CHECK: subs   r7, r8, #40, #2         @ encoding: [0x28,0x71,0x58,0xe2]
+@ CHECK: subs   r7, r8, #40, #2         @ encoding: [0x28,0x71,0x58,0xe2]
+@ CHECK: subs   r7, r8, #40, #2         @ encoding: [0x28,0x71,0x58,0xe2]
+
+@------------------------------------------------------------------------------
 @ SVC
 @------------------------------------------------------------------------------
         svc #16
@@ -2560,6 +3110,15 @@ Lforward:
 @ TEQ
 @------------------------------------------------------------------------------
         teq r5, #0xf000
+        teq r5, $0xf000
+        teq r5, 0xf000
+        teq r7, #(0xff << 16)
+        teq r7, #-2147483638
+        teq r7, #42, #2
+        teq r7, #40, #2
+        teq r7, $40, $2
+        teq r7, 40, 2
+        teq r7, (20 * 2), (1 << 1)
         teq r4, r5
         teq r4, r5, lsl #5
         teq r4, r5, lsr #5
@@ -2572,6 +3131,15 @@ Lforward:
         teq r6, r7, ror r9
 
 @ CHECK: teq	r5, #61440              @ encoding: [0x0f,0x0a,0x35,0xe3]
+@ CHECK: teq	r5, #61440              @ encoding: [0x0f,0x0a,0x35,0xe3]
+@ CHECK: teq	r5, #61440              @ encoding: [0x0f,0x0a,0x35,0xe3]
+@ CHECK: teq	r7, #16711680           @ encoding: [0xff,0x08,0x37,0xe3]
+@ CHECK: teq    r7, #-2147483638        @ encoding: [0x2a,0x01,0x37,0xe3]
+@ CHECK: teq    r7, #-2147483638        @ encoding: [0x2a,0x01,0x37,0xe3]
+@ CHECK: teq    r7, #40, #2             @ encoding: [0x28,0x01,0x37,0xe3]
+@ CHECK: teq    r7, #40, #2             @ encoding: [0x28,0x01,0x37,0xe3]
+@ CHECK: teq    r7, #40, #2             @ encoding: [0x28,0x01,0x37,0xe3]
+@ CHECK: teq    r7, #40, #2             @ encoding: [0x28,0x01,0x37,0xe3]
 @ CHECK: teq	r4, r5                  @ encoding: [0x05,0x00,0x34,0xe1]
 @ CHECK: teq	r4, r5, lsl #5          @ encoding: [0x85,0x02,0x34,0xe1]
 @ CHECK: teq	r4, r5, lsr #5          @ encoding: [0xa5,0x02,0x34,0xe1]
@@ -2588,6 +3156,15 @@ Lforward:
 @ TST
 @------------------------------------------------------------------------------
         tst r5, #0xf000
+        tst r5, $0xf000
+        tst r5, 0xf000
+        tst r7, #(0xff << 16)
+        tst r7, #-2147483638
+        tst r7, #42, #2
+        tst r7, #40, #2
+        tst r7, $40, $2
+        tst r7, 40, 2
+        tst r7, (20 * 2), (1 << 1)
         tst r4, r5
         tst r4, r5, lsl #5
         tst r4, r5, lsr #5
@@ -2600,6 +3177,15 @@ Lforward:
         tst r6, r7, ror r9
 
 @ CHECK: tst	r5, #61440              @ encoding: [0x0f,0x0a,0x15,0xe3]
+@ CHECK: tst	r5, #61440              @ encoding: [0x0f,0x0a,0x15,0xe3]
+@ CHECK: tst	r5, #61440              @ encoding: [0x0f,0x0a,0x15,0xe3]
+@ CHECK: tst    r7, #16711680           @ encoding: [0xff,0x08,0x17,0xe3]
+@ CHECK: tst    r7, #-2147483638        @ encoding: [0x2a,0x01,0x17,0xe3]
+@ CHECK: tst    r7, #-2147483638        @ encoding: [0x2a,0x01,0x17,0xe3]
+@ CHECK: tst    r7, #40, #2             @ encoding: [0x28,0x01,0x17,0xe3]
+@ CHECK: tst    r7, #40, #2             @ encoding: [0x28,0x01,0x17,0xe3]
+@ CHECK: tst    r7, #40, #2             @ encoding: [0x28,0x01,0x17,0xe3]
+@ CHECK: tst    r7, #40, #2             @ encoding: [0x28,0x01,0x17,0xe3]
 @ CHECK: tst	r4, r5                  @ encoding: [0x05,0x00,0x14,0xe1]
 @ CHECK: tst	r4, r5, lsl #5          @ encoding: [0x85,0x02,0x14,0xe1]
 @ CHECK: tst	r4, r5, lsr #5          @ encoding: [0xa5,0x02,0x14,0xe1]
diff --git a/test/MC/ARM/coff-debugging-secrel.ll b/test/MC/ARM/coff-debugging-secrel.ll
index 0e5c8e6..7323fc6 100644
--- a/test/MC/ARM/coff-debugging-secrel.ll
+++ b/test/MC/ARM/coff-debugging-secrel.ll
@@ -16,17 +16,17 @@ entry:
 !llvm.dbg.cu = !{!7}
 !llvm.module.flags = !{!9, !10}
 
-!0 = metadata !{i32 1, i32 0, metadata !1, null}
-!1 = metadata !{metadata !"0x2e\00function\00function\00\001\000\001\000\006\000\000\001", metadata !2, metadata !3, metadata !4, null, void ()* @function, null, null, metadata !6} ; [ DW_TAG_subprogram ], [line 1], [def], [function]
-!2 = metadata !{metadata !"/Users/compnerd/work/llvm/test/MC/ARM/reduced.c", metadata !"/Users/compnerd/work/llvm"}
-!3 = metadata !{metadata !"0x29", metadata !2} ; [ DW_TAG_file_type] [/Users/compnerd/work/llvm/test/MC/ARM/reduced.c]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ], [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{null}
-!6 = metadata !{}
-!7 = metadata !{metadata !"0x11\0012\00clang version 3.5.0\000\00\000\00\001", metadata !2, metadata !6, metadata !6, metadata !8, metadata !6, metadata !6} ; [ DW_TAG_compile_unit ] [/Users/compnerd/work/llvm/test/MC/ARM/reduced.c] [DW_LANG_C99]
-!8 = metadata !{metadata !1}
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !MDLocation(line: 1, scope: !1)
+!1 = !{!"0x2e\00function\00function\00\001\000\001\000\006\000\000\001", !2, !3, !4, null, void ()* @function, null, null, !6} ; [ DW_TAG_subprogram ], [line 1], [def], [function]
+!2 = !{!"/Users/compnerd/work/llvm/test/MC/ARM/reduced.c", !"/Users/compnerd/work/llvm"}
+!3 = !{!"0x29", !2} ; [ DW_TAG_file_type] [/Users/compnerd/work/llvm/test/MC/ARM/reduced.c]
+!4 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ], [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{null}
+!6 = !{}
+!7 = !{!"0x11\0012\00clang version 3.5.0\000\00\000\00\001", !2, !6, !6, !8, !6, !6} ; [ DW_TAG_compile_unit ] [/Users/compnerd/work/llvm/test/MC/ARM/reduced.c] [DW_LANG_C99]
+!8 = !{!1}
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
 
 ; CHECK-ITANIUM: Relocations [
 ; CHECK-ITANIUM:   Section {{.*}} .debug_info {
diff --git a/test/MC/ARM/cpu-test.s b/test/MC/ARM/cpu-test.s
new file mode 100644
index 0000000..7a61907
--- /dev/null
+++ b/test/MC/ARM/cpu-test.s
@@ -0,0 +1,17 @@
+// RUN: not llvm-mc -o - -triple arm-gnueabi-freebsd11.0 < %s > %t 2> %t2
+// RUN: FileCheck %s < %t
+// RUN: FileCheck %s --check-prefix=CHECK-ERROR < %t2
+
+// CHECK: .cpu cortex-a8
+.cpu cortex-a8
+// CHECK: dsb     sy
+dsb
+.cpu arm9       
+// CHECK-ERROR: error: instruction requires: data-barriers
+dsb
+// CHECK-ERROR: error: Unknown CPU name
+.cpu foobar
+// CHECK: .cpu cortex-m3
+.cpu cortex-m3
+// CHECK: sub sp, #16
+sub sp,#16
diff --git a/test/MC/ARM/diagnostics.s b/test/MC/ARM/diagnostics.s
index 6b9574b..6f66dc3 100644
--- a/test/MC/ARM/diagnostics.s
+++ b/test/MC/ARM/diagnostics.s
@@ -621,3 +621,83 @@ foo2:
 @ CHECK-ERRORS: error: destination register and base register can't be identical
 @ CHECK-ERRORS: ldrsb r0, [r0], r1
 @ CHECK-ERRORS:           ^
+
+        @ Out of range modified immediate values
+        mov  r5, #-256, #6
+        mov  r6, #42, #7
+        mvn  r5, #256, #6
+        mvn  r6, #42, #298
+        cmp  r5, #65535, #6
+        cmp  r6, #42, #31
+        cmn  r5, #-1, #6
+        cmn  r6, #42, #32
+	msr  APSR_nzcvq, #-128, #2
+	msr  apsr_nzcvqg, #0, #1
+        adc  r7, r8, #-256, #2
+        adc  r7, r8, #128, #1
+        sbc  r7, r8, #-256, #2
+        sbc  r7, r8, #128, #1
+        add  r7, r8, #-2149, #0
+        add  r7, r8, #100, #1
+        sub  r7, r8, #-2149, #0
+        sub  r7, r8, #100, #1
+        and  r7, r8, #-2149, #0
+        and  r7, r8, #100, #1
+        orr  r7, r8, #-2149, #0
+        orr  r7, r8, #100, #1
+        eor  r7, r8, #-2149, #0
+        eor  r7, r8, #100, #1
+        bic  r7, r8, #-2149, #0
+        bic  r7, r8, #100, #1
+        rsb  r7, r8, #-2149, #0
+        rsb  r7, r8, #100, #1
+        adds r7, r8, #-2149, #0
+        adds r7, r8, #100, #1
+        subs r7, r8, #-2149, #0
+        subs r7, r8, #100, #1
+        rsbs r7, r8, #-2149, #0
+        rsbs r7, r8, #100, #1
+        rsc r7, r8, #-2149, #0
+        rsc r7, r8, #100, #1
+        TST r7, #-2149, #0
+        TST r7, #100, #1
+        TEQ r7, #-2149, #0
+        TEQ r7, #100, #1
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
+@ CHECK-ERRORS: error: immediate operand must a number in the range [0, 255]
+@ CHECK-ERRORS: error: immediate operand must an even number in the range [0, 30]
diff --git a/test/MC/ARM/directive-arch-iwmmxt.s b/test/MC/ARM/directive-arch-iwmmxt.s
index db25ec6..c54846d 100644
--- a/test/MC/ARM/directive-arch-iwmmxt.s
+++ b/test/MC/ARM/directive-arch-iwmmxt.s
@@ -16,7 +16,7 @@
 @ CHECK-ATTR: FileAttributes {
 @ CHECK-ATTR:   Attribute {
 @ CHECK-ATTR:     TagName: CPU_name
-@ CHECK-ATTR:     Value: IWMMXT
+@ CHECK-ATTR:     Value: iwmmxt
 @ CHECK-ATTR:   }
 @ CHECK-ATTR:   Attribute {
 @ CHECK-ATTR:     TagName: CPU_arch
diff --git a/test/MC/ARM/directive-arch-iwmmxt2.s b/test/MC/ARM/directive-arch-iwmmxt2.s
index de94f97..a4e59b5 100644
--- a/test/MC/ARM/directive-arch-iwmmxt2.s
+++ b/test/MC/ARM/directive-arch-iwmmxt2.s
@@ -16,7 +16,7 @@
 @ CHECK-ATTR: FileAttributes {
 @ CHECK-ATTR:   Attribute {
 @ CHECK-ATTR:     TagName: CPU_name
-@ CHECK-ATTR:     Value: IWMMXT2
+@ CHECK-ATTR:     Value: iwmmxt2
 @ CHECK-ATTR:   }
 @ CHECK-ATTR:   Attribute {
 @ CHECK-ATTR:     TagName: CPU_arch
diff --git a/test/MC/ARM/directive-cpu.s b/test/MC/ARM/directive-cpu.s
index 952dd93..d81a03e 100644
--- a/test/MC/ARM/directive-cpu.s
+++ b/test/MC/ARM/directive-cpu.s
@@ -20,7 +20,6 @@
 @ CHECK: 10000000
 
 	.cpu	cortex-a8
-@ CHECK: 05
-@ CHECK: 434F52 5445582D 413800
+@ CHECK: 05636F72 7465782D 613800
 
 @ CHECK: )
diff --git a/test/MC/ARM/directive-eabi_attribute-diagnostics.s b/test/MC/ARM/directive-eabi_attribute-diagnostics.s
index d1ae352..2b0375e 100644
--- a/test/MC/ARM/directive-eabi_attribute-diagnostics.s
+++ b/test/MC/ARM/directive-eabi_attribute-diagnostics.s
@@ -29,6 +29,11 @@
 @ CHECK: 	.eabi_attribute 0
 @ CHECK:                         ^
 
+        .eabi_attribute Tag_compatibility, 1
+@ CHECK: error: comma expected
+@ CHECK: .eabi_attribute Tag_compatibility, 1
+@ CHECK:                                     ^
+
 	.eabi_attribute Tag_MPextension_use_old, 0
 @ CHECK: error: attribute name not recognised: Tag_MPextension_use_old
 @ CHECK: 	.eabi_attribute Tag_MPextension_use_old, 0
diff --git a/test/MC/ARM/directive-eabi_attribute-overwrite.s b/test/MC/ARM/directive-eabi_attribute-overwrite.s
index 6fdded3..e2c5099 100644
--- a/test/MC/ARM/directive-eabi_attribute-overwrite.s
+++ b/test/MC/ARM/directive-eabi_attribute-overwrite.s
@@ -3,13 +3,11 @@
 
 	.syntax unified
 	.thumb
-
-	.eabi_attribute Tag_compatibility, 1
 	.eabi_attribute Tag_compatibility, 1, "aeabi"
 
 @ CHECK-ATTR: FileAttributes {
 @ CHECK-ATTR:   Attribute {
-@ CHECK-ATTR:     Value: 1, AEABI
+@ CHECK-ATTR:     Value: 1, aeabi
 @ CHECK-ATTR:     TagName: compatibility
 @ CHECK-ATTR:     Description: AEABI Conformant
 @ CHECK-ATTR:   }
diff --git a/test/MC/ARM/directive-eabi_attribute.s b/test/MC/ARM/directive-eabi_attribute.s
index e2f1f9b..74a51ab 100644
--- a/test/MC/ARM/directive-eabi_attribute.s
+++ b/test/MC/ARM/directive-eabi_attribute.s
@@ -5,16 +5,24 @@
         .syntax unified
         .thumb
 
+        .eabi_attribute Tag_conformance, "2.09"
+@ CHECK: .eabi_attribute 67, "2.09"
+@ Tag_conformance should be be emitted first in a file-scope
+@ sub-subsection of the first public subsection of the attributes
+@ section. 2.3.7.4 of ABI Addenda.
+@ CHECK-OBJ:        Tag: 67
+@ CHECK-OBJ-NEXT:   TagName: conformance
+@ CHECK-OBJ-NEXT:   Value: 2.09
 	.eabi_attribute Tag_CPU_raw_name, "Cortex-A9"
 @ CHECK: .eabi_attribute 4, "Cortex-A9"
 @ CHECK-OBJ:        Tag: 4
 @ CHECK-OBJ-NEXT:   TagName: CPU_raw_name
-@ CHECK-OBJ-NEXT:   Value: CORTEX-A9
+@ CHECK-OBJ-NEXT:   Value: Cortex-A9
 	.eabi_attribute Tag_CPU_name, "cortex-a9"
 @ CHECK: .cpu cortex-a9
 @ CHECK-OBJ:        Tag: 5
 @ CHECK-OBJ-NEXT:   TagName: CPU_name
-@ CHECK-OBJ-NEXT:   Value: CORTEX-A9
+@ CHECK-OBJ-NEXT:   Value: cortex-a9
 	.eabi_attribute Tag_CPU_arch, 10
 @ CHECK: .eabi_attribute 6, 10
 @ CHECK-OBJ:        Tag: 6
@@ -165,12 +173,10 @@
 @ CHECK-OBJ-NEXT:   Value: 1
 @ CHECK-OBJ-NEXT:   TagName: ABI_FP_optimization_goals
 @ CHECK-OBJ-NEXT:   Description: Speed
-	.eabi_attribute Tag_compatibility, 1
-@ CHECK: .eabi_attribute 32, 1
 	.eabi_attribute Tag_compatibility, 1, "aeabi"
 @ CHECK: .eabi_attribute 32, 1, "aeabi"
 @ CHECK-OBJ:        Tag: 32
-@ CHECK-OBJ-NEXT:   Value: 1, AEABI
+@ CHECK-OBJ-NEXT:   Value: 1, aeabi
 @ CHECK-OBJ-NEXT:   TagName: compatibility
 @ CHECK-OBJ-NEXT:   Description: AEABI Conformant
 	.eabi_attribute Tag_CPU_unaligned_access, 0
@@ -213,18 +219,13 @@
 @ CHECK: .eabi_attribute 65, "gnu"
 @ CHECK-OBJ:        Tag: 65
 @ CHECK-OBJ-NEXT:   TagName: also_compatible_with
-@ CHECK-OBJ-NEXT:   Value: GNU
+@ CHECK-OBJ-NEXT:   Value: gnu
 	.eabi_attribute Tag_T2EE_use, 0
 @ CHECK: .eabi_attribute 66, 0
 @ CHECK-OBJ:        Tag: 66
 @ CHECK-OBJ-NEXT:   Value: 0
 @ CHECK-OBJ-NEXT:   TagName: T2EE_use
 @ CHECK-OBJ-NEXT:   Description: Not Permitted
-	.eabi_attribute Tag_conformance, "2.09"
-@ CHECK: .eabi_attribute 67, "2.09"
-@ CHECK-OBJ:        Tag: 67
-@ CHECK-OBJ-NEXT:   TagName: conformance
-@ CHECK-OBJ-NEXT:   Value: 2.09
 	.eabi_attribute Tag_Virtualization_use, 0
 @ CHECK: .eabi_attribute 68, 0
 @ CHECK-OBJ:        Tag: 68
diff --git a/test/MC/ARM/directive-fpu-diagnostics.s b/test/MC/ARM/directive-fpu-diagnostics.s
new file mode 100644
index 0000000..67c6129
--- /dev/null
+++ b/test/MC/ARM/directive-fpu-diagnostics.s
@@ -0,0 +1,10 @@
+@ RUN: not llvm-mc -triple armv7 -filetype asm -o /dev/null %s 2>&1 \
+@ RUN:     | FileCheck %s -strict-whitespace
+
+	.text
+	.thumb
+
+	.fpu invalid
+@ CHECK: error: Unknown FPU name
+@ CHECK: .fpu invalid
+@ CHECK:      ^
diff --git a/test/MC/ARM/dot-req.s b/test/MC/ARM/dot-req.s
index 3b4cf5c..848c124 100644
--- a/test/MC/ARM/dot-req.s
+++ b/test/MC/ARM/dot-req.s
@@ -1,6 +1,9 @@
 @ RUN: llvm-mc -triple=armv7-apple-darwin -show-encoding < %s | FileCheck %s
         .syntax unified
 bar:
+@ The line is duplicated on purpose, it is legal to redefine a req with
+@ the same value.
+fred .req r5
 fred .req r5
         mov r11, fred
 .unreq fred
diff --git a/test/MC/ARM/ldr-pseudo-parse-errors.s b/test/MC/ARM/ldr-pseudo-parse-errors.s
index 2e6114d..2516239 100644
--- a/test/MC/ARM/ldr-pseudo-parse-errors.s
+++ b/test/MC/ARM/ldr-pseudo-parse-errors.s
@@ -4,7 +4,7 @@
 .text
 bar:
   mov r0, =0x101
-@ CHECK: error: unexpected token in operand
+@ CHECK: error: unknown token in expression
 @ CHECK: mov r0, =0x101
 @ CHECK:         ^
 
diff --git a/test/MC/ARM/move-banked-regs.s b/test/MC/ARM/move-banked-regs.s
index 3fac846..b3b91f5 100644
--- a/test/MC/ARM/move-banked-regs.s
+++ b/test/MC/ARM/move-banked-regs.s
@@ -8,13 +8,13 @@
         mrs r11, r12_usr
         mrs r1, sp_usr
         mrs r2, lr_usr
-@ CHECK-ARM:         mrs     r2, r8_usr              @ encoding: [0x00,0x22,0x20,0xe1]
-@ CHECK-ARM:         mrs     r3, r9_usr              @ encoding: [0x00,0x32,0x21,0xe1]
-@ CHECK-ARM:         mrs     r5, r10_usr             @ encoding: [0x00,0x52,0x22,0xe1]
-@ CHECK-ARM:         mrs     r7, r11_usr             @ encoding: [0x00,0x72,0x23,0xe1]
-@ CHECK-ARM:         mrs     r11, r12_usr            @ encoding: [0x00,0xb2,0x24,0xe1]
-@ CHECK-ARM:         mrs     r1, sp_usr              @ encoding: [0x00,0x12,0x25,0xe1]
-@ CHECK-ARM:         mrs     r2, lr_usr              @ encoding: [0x00,0x22,0x26,0xe1]
+@ CHECK-ARM:         mrs     r2, r8_usr              @ encoding: [0x00,0x22,0x00,0xe1]
+@ CHECK-ARM:         mrs     r3, r9_usr              @ encoding: [0x00,0x32,0x01,0xe1]
+@ CHECK-ARM:         mrs     r5, r10_usr             @ encoding: [0x00,0x52,0x02,0xe1]
+@ CHECK-ARM:         mrs     r7, r11_usr             @ encoding: [0x00,0x72,0x03,0xe1]
+@ CHECK-ARM:         mrs     r11, r12_usr            @ encoding: [0x00,0xb2,0x04,0xe1]
+@ CHECK-ARM:         mrs     r1, sp_usr              @ encoding: [0x00,0x12,0x05,0xe1]
+@ CHECK-ARM:         mrs     r2, lr_usr              @ encoding: [0x00,0x22,0x06,0xe1]
 @ CHECK-THUMB:         mrs     r2, r8_usr              @ encoding: [0xe0,0xf3,0x20,0x82]
 @ CHECK-THUMB:         mrs     r3, r9_usr              @ encoding: [0xe1,0xf3,0x20,0x83]
 @ CHECK-THUMB:         mrs     r5, r10_usr             @ encoding: [0xe2,0xf3,0x20,0x85]
@@ -31,14 +31,14 @@
         mrs r1, sp_fiq
         mrs r2, lr_fiq
         mrs r3, spsr_fiq
-@ CHECK-ARM:         mrs     r2, r8_fiq              @ encoding: [0x00,0x22,0x28,0xe1]
-@ CHECK-ARM:         mrs     r3, r9_fiq              @ encoding: [0x00,0x32,0x29,0xe1]
-@ CHECK-ARM:         mrs     r5, r10_fiq             @ encoding: [0x00,0x52,0x2a,0xe1]
-@ CHECK-ARM:         mrs     r7, r11_fiq             @ encoding: [0x00,0x72,0x2b,0xe1]
-@ CHECK-ARM:         mrs     r11, r12_fiq            @ encoding: [0x00,0xb2,0x2c,0xe1]
-@ CHECK-ARM:         mrs     r1, sp_fiq              @ encoding: [0x00,0x12,0x2d,0xe1]
-@ CHECK-ARM:         mrs     r2, lr_fiq              @ encoding: [0x00,0x22,0x2e,0xe1]
-@ CHECK-ARM:         mrs     r3, SPSR_fiq            @ encoding: [0x00,0x32,0x6e,0xe1]
+@ CHECK-ARM:         mrs     r2, r8_fiq              @ encoding: [0x00,0x22,0x08,0xe1]
+@ CHECK-ARM:         mrs     r3, r9_fiq              @ encoding: [0x00,0x32,0x09,0xe1]
+@ CHECK-ARM:         mrs     r5, r10_fiq             @ encoding: [0x00,0x52,0x0a,0xe1]
+@ CHECK-ARM:         mrs     r7, r11_fiq             @ encoding: [0x00,0x72,0x0b,0xe1]
+@ CHECK-ARM:         mrs     r11, r12_fiq            @ encoding: [0x00,0xb2,0x0c,0xe1]
+@ CHECK-ARM:         mrs     r1, sp_fiq              @ encoding: [0x00,0x12,0x0d,0xe1]
+@ CHECK-ARM:         mrs     r2, lr_fiq              @ encoding: [0x00,0x22,0x0e,0xe1]
+@ CHECK-ARM:         mrs     r3, SPSR_fiq            @ encoding: [0x00,0x32,0x4e,0xe1]
 @ CHECK-THUMB:         mrs     r2, r8_fiq              @ encoding: [0xe8,0xf3,0x20,0x82]
 @ CHECK-THUMB:         mrs     r3, r9_fiq              @ encoding: [0xe9,0xf3,0x20,0x83]
 @ CHECK-THUMB:         mrs     r5, r10_fiq             @ encoding: [0xea,0xf3,0x20,0x85]
@@ -51,9 +51,9 @@
         mrs r4, lr_irq
         mrs r9, sp_irq
         mrs r1, spsr_irq
-@ CHECK-ARM:         mrs     r4, lr_irq              @ encoding: [0x00,0x43,0x20,0xe1]
-@ CHECK-ARM:         mrs     r9, sp_irq              @ encoding: [0x00,0x93,0x21,0xe1]
-@ CHECK-ARM:         mrs     r1, SPSR_irq            @ encoding: [0x00,0x13,0x60,0xe1]
+@ CHECK-ARM:         mrs     r4, lr_irq              @ encoding: [0x00,0x43,0x00,0xe1]
+@ CHECK-ARM:         mrs     r9, sp_irq              @ encoding: [0x00,0x93,0x01,0xe1]
+@ CHECK-ARM:         mrs     r1, SPSR_irq            @ encoding: [0x00,0x13,0x40,0xe1]
 @ CHECK-THUMB:         mrs     r4, lr_irq              @ encoding: [0xe0,0xf3,0x30,0x84]
 @ CHECK-THUMB:         mrs     r9, sp_irq              @ encoding: [0xe1,0xf3,0x30,0x89]
 @ CHECK-THUMB:         mrs     r1, SPSR_irq            @ encoding: [0xf0,0xf3,0x30,0x81]
@@ -61,9 +61,9 @@
         mrs r1, lr_svc
         mrs r3, sp_svc
         mrs r5, spsr_svc
-@ CHECK-ARM:         mrs     r1, lr_svc              @ encoding: [0x00,0x13,0x22,0xe1]
-@ CHECK-ARM:         mrs     r3, sp_svc              @ encoding: [0x00,0x33,0x23,0xe1]
-@ CHECK-ARM:         mrs     r5, SPSR_svc            @ encoding: [0x00,0x53,0x62,0xe1]
+@ CHECK-ARM:         mrs     r1, lr_svc              @ encoding: [0x00,0x13,0x02,0xe1]
+@ CHECK-ARM:         mrs     r3, sp_svc              @ encoding: [0x00,0x33,0x03,0xe1]
+@ CHECK-ARM:         mrs     r5, SPSR_svc            @ encoding: [0x00,0x53,0x42,0xe1]
 @ CHECK-THUMB:         mrs     r1, lr_svc              @ encoding: [0xe2,0xf3,0x30,0x81]
 @ CHECK-THUMB:         mrs     r3, sp_svc              @ encoding: [0xe3,0xf3,0x30,0x83]
 @ CHECK-THUMB:         mrs     r5, SPSR_svc            @ encoding: [0xf2,0xf3,0x30,0x85]
@@ -71,9 +71,9 @@
         mrs r5, lr_abt
         mrs r7, sp_abt
         mrs r9, spsr_abt
-@ CHECK-ARM:         mrs     r5, lr_abt              @ encoding: [0x00,0x53,0x24,0xe1]
-@ CHECK-ARM:         mrs     r7, sp_abt              @ encoding: [0x00,0x73,0x25,0xe1]
-@ CHECK-ARM:         mrs     r9, SPSR_abt            @ encoding: [0x00,0x93,0x64,0xe1]
+@ CHECK-ARM:         mrs     r5, lr_abt              @ encoding: [0x00,0x53,0x04,0xe1]
+@ CHECK-ARM:         mrs     r7, sp_abt              @ encoding: [0x00,0x73,0x05,0xe1]
+@ CHECK-ARM:         mrs     r9, SPSR_abt            @ encoding: [0x00,0x93,0x44,0xe1]
 @ CHECK-THUMB:         mrs     r5, lr_abt              @ encoding: [0xe4,0xf3,0x30,0x85]
 @ CHECK-THUMB:         mrs     r7, sp_abt              @ encoding: [0xe5,0xf3,0x30,0x87]
 @ CHECK-THUMB:         mrs     r9, SPSR_abt            @ encoding: [0xf4,0xf3,0x30,0x89]
@@ -81,9 +81,9 @@
         mrs r9, lr_und
         mrs r11, sp_und
         mrs r12, spsr_und
-@ CHECK-ARM:         mrs     r9, lr_und              @ encoding: [0x00,0x93,0x26,0xe1]
-@ CHECK-ARM:         mrs     r11, sp_und             @ encoding: [0x00,0xb3,0x27,0xe1]
-@ CHECK-ARM:         mrs     r12, SPSR_und           @ encoding: [0x00,0xc3,0x66,0xe1]
+@ CHECK-ARM:         mrs     r9, lr_und              @ encoding: [0x00,0x93,0x06,0xe1]
+@ CHECK-ARM:         mrs     r11, sp_und             @ encoding: [0x00,0xb3,0x07,0xe1]
+@ CHECK-ARM:         mrs     r12, SPSR_und           @ encoding: [0x00,0xc3,0x46,0xe1]
 @ CHECK-THUMB:         mrs     r9, lr_und              @ encoding: [0xe6,0xf3,0x30,0x89]
 @ CHECK-THUMB:         mrs     r11, sp_und             @ encoding: [0xe7,0xf3,0x30,0x8b]
 @ CHECK-THUMB:         mrs     r12, SPSR_und           @ encoding: [0xf6,0xf3,0x30,0x8c]
@@ -92,9 +92,9 @@
         mrs r2, lr_mon
         mrs r4, sp_mon
         mrs r6, spsr_mon
-@ CHECK-ARM:         mrs     r2, lr_mon              @ encoding: [0x00,0x23,0x2c,0xe1]
-@ CHECK-ARM:         mrs     r4, sp_mon              @ encoding: [0x00,0x43,0x2d,0xe1]
-@ CHECK-ARM:         mrs     r6, SPSR_mon            @ encoding: [0x00,0x63,0x6c,0xe1]
+@ CHECK-ARM:         mrs     r2, lr_mon              @ encoding: [0x00,0x23,0x0c,0xe1]
+@ CHECK-ARM:         mrs     r4, sp_mon              @ encoding: [0x00,0x43,0x0d,0xe1]
+@ CHECK-ARM:         mrs     r6, SPSR_mon            @ encoding: [0x00,0x63,0x4c,0xe1]
 @ CHECK-THUMB:         mrs     r2, lr_mon              @ encoding: [0xec,0xf3,0x30,0x82]
 @ CHECK-THUMB:         mrs     r4, sp_mon              @ encoding: [0xed,0xf3,0x30,0x84]
 @ CHECK-THUMB:         mrs     r6, SPSR_mon            @ encoding: [0xfc,0xf3,0x30,0x86]
@@ -103,9 +103,9 @@
         mrs r6, elr_hyp
         mrs r8, sp_hyp
         mrs r10, spsr_hyp
-@ CHECK-ARM:         mrs     r6, elr_hyp             @ encoding: [0x00,0x63,0x2e,0xe1]
-@ CHECK-ARM:         mrs     r8, sp_hyp              @ encoding: [0x00,0x83,0x2f,0xe1]
-@ CHECK-ARM:         mrs     r10, SPSR_hyp            @ encoding: [0x00,0xa3,0x6e,0xe1]
+@ CHECK-ARM:         mrs     r6, elr_hyp             @ encoding: [0x00,0x63,0x0e,0xe1]
+@ CHECK-ARM:         mrs     r8, sp_hyp              @ encoding: [0x00,0x83,0x0f,0xe1]
+@ CHECK-ARM:         mrs     r10, SPSR_hyp            @ encoding: [0x00,0xa3,0x4e,0xe1]
 @ CHECK-THUMB:         mrs     r6, elr_hyp             @ encoding: [0xee,0xf3,0x30,0x86]
 @ CHECK-THUMB:         mrs     r8, sp_hyp              @ encoding: [0xef,0xf3,0x30,0x88]
 @ CHECK-THUMB:         mrs     r10, SPSR_hyp            @ encoding: [0xfe,0xf3,0x30,0x8a]
diff --git a/test/MC/ARM/pr22395-2.s b/test/MC/ARM/pr22395-2.s
new file mode 100644
index 0000000..3d2a10d
--- /dev/null
+++ b/test/MC/ARM/pr22395-2.s
@@ -0,0 +1,37 @@
+@ RUN: llvm-mc -triple armv4t-eabi -mattr +d16 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+	.text
+	.thumb
+
+	.p2align 2
+
+	.fpu vfpv3
+	vldmia r0, {d16-d31}
+@ CHECK: vldmia	r0, {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+@ CHECK-NOT: error: register expected
+
+	.fpu vfpv4
+	vldmia r0, {d16-d31}
+@ CHECK: vldmia	r0, {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+@ CHECK-NOT: error: register expected
+
+	.fpu neon
+	vldmia r0, {d16-d31}
+@ CHECK: vldmia	r0, {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+@ CHECK-NOT: error: register expected
+
+	.fpu neon-vfpv4
+	vldmia r0, {d16-d31}
+@ CHECK: vldmia	r0, {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+@ CHECK-NOT: error: register expected
+
+	.fpu neon-fp-armv8
+	vldmia r0, {d16-d31}
+@ CHECK: vldmia	r0, {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+@ CHECK-NOT: error: register expected
+
+	.fpu crypto-neon-fp-armv8
+	vldmia r0, {d16-d31}
+@ CHECK: vldmia	r0, {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+@ CHECK-NOT: error: register expected
+
diff --git a/test/MC/ARM/pr22395.s b/test/MC/ARM/pr22395.s
new file mode 100644
index 0000000..5da5d96
--- /dev/null
+++ b/test/MC/ARM/pr22395.s
@@ -0,0 +1,63 @@
+@ RUN: llvm-mc -triple armv4t-eabi -filetype asm -o - %s 2>&1 | FileCheck %s
+
+	.text
+	.thumb
+
+	.p2align 2
+
+	.fpu neon
+	vldmia r0, {d16-d31}
+
+@ CHECK: vldmia	r0, {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+@ CHECK-NOT: error: instruction requires: VFP2
+
+	.fpu vfpv3
+	vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VPF2
+
+	.fpu vfpv3-d16
+	vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VPF2
+
+	.fpu vfpv4
+	vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VPF2
+
+	.fpu vfpv4-d16
+	vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VPF2
+
+	.fpu fpv5-d16
+	vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VPF2
+
+	.fpu fp-armv8
+	vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VPF2
+
+	.fpu fp-armv8
+	vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VPF2
+
+	.fpu neon
+	vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VPF2
+
+	.fpu neon-vfpv4
+	vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VPF2
+
+	.fpu crypto-neon-fp-armv8
+	vadd.f32 s1, s2, s3
+@ CHECK: vadd.f32 s1, s2, s3
+@ CHECK-NOT: error: instruction requires: VPF2
+
diff --git a/test/MC/ARM/thumb-diagnostics.s b/test/MC/ARM/thumb-diagnostics.s
index 2a79132..bd26d06 100644
--- a/test/MC/ARM/thumb-diagnostics.s
+++ b/test/MC/ARM/thumb-diagnostics.s
@@ -1,11 +1,11 @@
-@ RUN: not llvm-mc -triple=thumbv6-apple-darwin < %s 2> %t
-@ RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
-@ RUN: not llvm-mc -triple=thumbv5-apple-darwin < %s 2> %t
-@ RUN: FileCheck --check-prefix=CHECK-ERRORS-V5 < %t %s
-@ RUN: not llvm-mc -triple=thumbv7m < %s 2> %t
-@ RUN: FileCheck --check-prefix=CHECK-ERRORS-V7M < %t %s
-@ RUN: not llvm-mc -triple=thumbv8 < %s 2> %t
-@ RUN: FileCheck --check-prefix=CHECK-ERRORS-V8 < %t %s
+@ RUN: not llvm-mc -triple=thumbv6-apple-darwin -o /dev/null < %s 2>&1 \
+@ RUN:     | FileCheck --check-prefix=CHECK-ERRORS %s
+@ RUN: not llvm-mc -triple=thumbv5-apple-darwin -o /dev/null < %s 2>&1 \
+@ RUN:     | FileCheck --check-prefix=CHECK-ERRORS-V5 %s
+@ RUN: not llvm-mc -triple=thumbv7m -o /dev/null < %s 2>&1 \
+@ RUN:     | FileCheck --check-prefix=CHECK-ERRORS-V7M %s
+@ RUN: not llvm-mc -triple=thumbv8 -o /dev/null < %s 2>&1 \
+@ RUN:     | FileCheck --check-prefix=CHECK-ERRORS-V8 %s
 
 @ Check for various assembly diagnostic messages on invalid input.
 
@@ -83,25 +83,25 @@ error: invalid operand for instruction
 @ CHECK-ERRORS-V8: error: writeback register not allowed in register list
 @ CHECK-ERRORS-V8:         ldmdb r2!, {r2, r3, r4}
 @ CHECK-ERRORS-V8:                 ^
-@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M: error: SP may not be in the register list
 @ CHECK-ERRORS-V7M:         ldm r0, {r2, sp}
 @ CHECK-ERRORS-V7M:                 ^
-@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M: error: SP may not be in the register list
 @ CHECK-ERRORS-V7M:         ldmia r0, {r2-r3, sp}
 @ CHECK-ERRORS-V7M:                   ^
-@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M: error: SP may not be in the register list
 @ CHECK-ERRORS-V7M:         ldmia r0!, {r2-r3, sp}
 @ CHECK-ERRORS-V7M:                    ^
-@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M: error: SP may not be in the register list
 @ CHECK-ERRORS-V7M:         ldmfd r2, {r1, r3-r6, sp}
 @ CHECK-ERRORS-V7M:                   ^
-@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M: error: SP may not be in the register list
 @ CHECK-ERRORS-V7M:         ldmfd r2!, {r1, r3-r6, sp}
 @ CHECK-ERRORS-V7M:                    ^
-@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M: error: SP may not be in the register list
 @ CHECK-ERRORS-V7M:         ldmdb r1, {r2, r3, sp}
 @ CHECK-ERRORS-V7M:                   ^
-@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M: error: SP may not be in the register list
 @ CHECK-ERRORS-V7M:         ldmdb r1!, {r2, r3, sp}
 @ CHECK-ERRORS-V7M:                    ^
 
@@ -137,16 +137,16 @@ error: invalid operand for instruction
 @ CHECK-ERRORS-V8: error: writeback register not allowed in register list
 @ CHECK-ERRORS-V8:         stmdb r2!, {r0, r2}
 @ CHECK-ERRORS-V8:                  ^
-@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M: error: SP may not be in the register list
 @ CHECK-ERRORS-V7M:         stm r1!, {r2, sp}
 @ CHECK-ERRORS-V7M:                  ^
-@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M: error: SP may not be in the register list
 @ CHECK-ERRORS-V7M:         stmia r4!, {r0-r3, sp}
 @ CHECK-ERRORS-V7M:                    ^
-@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M: error: SP may not be in the register list
 @ CHECK-ERRORS-V7M:         stmdb r1, {r2, r3, sp}
 @ CHECK-ERRORS-V7M:                   ^
-@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M: error: SP may not be in the register list
 @ CHECK-ERRORS-V7M:         stmdb r1!, {r2, r3, sp}
 @ CHECK-ERRORS-V7M:                    ^
 
@@ -206,7 +206,7 @@ error: invalid operand for instruction
 @ CHECK-ERRORS: error: instruction requires: thumb2
 @ CHECK-ERRORS:         add sp, sp, #512
 @ CHECK-ERRORS:                     ^
-@ CHECK-ERRORS: error: instruction requires: arm-mode
+@ CHECK-ERRORS: error: instruction requires: thumb2
 @ CHECK-ERRORS:         add r2, sp, #1024
 @ CHECK-ERRORS:         ^
 
diff --git a/test/MC/ARM/thumb-load-store-multiple.s b/test/MC/ARM/thumb-load-store-multiple.s
new file mode 100644
index 0000000..6958450
--- /dev/null
+++ b/test/MC/ARM/thumb-load-store-multiple.s
@@ -0,0 +1,100 @@
+@ RUN: not llvm-mc -triple thumbv7-eabi -filetype asm -o - %s 2>&1 \
+@ RUN:     | FileCheck %s
+@ RUN: not llvm-mc -triple thumbv7a-eabi -filetype asm -o - %s 2>&1 \
+@ RUN: | FileCheck --check-prefix=CHECK --check-prefix=CHECK-V7A %s
+@ RUN: not llvm-mc -triple thumbv7m-eabi -filetype asm -o - %s 2>&1 \
+@ RUN: | FileCheck --check-prefix=CHECK --check-prefix=CHECK-V7M %s
+
+	.syntax unified
+	.thumb
+
+	.global ldm
+	.type ldm,%function
+ldm:
+	ldm r0!, {r1, sp}
+@ CHECK: error: SP may not be in the register list
+@ CHECK: ldm r0!, {r1, sp}
+@ CHECK:          ^
+	ldm r0!, {lr, pc}
+@ CHECK: error: PC and LR may not be in the register list simultaneously
+@ CHECK: ldm r0!, {lr, pc}
+@ CHECK:          ^
+	itt eq
+	ldmeq r0!, {r1, pc}
+	ldmeq r0!, {r2, lr}
+@ CHECK: error: instruction must be outside of IT block or the last instruction in an IT block
+@ CHECK: ldmeq r0!, {r1, pc}
+@ CHECK:            ^
+
+	.global ldmdb
+	.type ldmdb,%function
+ldmdb:
+	ldmdb r0!, {r1, sp}
+@ CHECK: error: SP may not be in the register list
+	ldmdb r0!, {lr, pc}
+@ error: PC and LR may not be in the register list simultaneously
+	itt eq
+	ldmeq r0!, {r1, pc}
+	ldmeq r0!, {r2, lr}
+@ CHECK: error: instruction must be outside of IT block or the last instruction in an IT block
+@ CHECK: ldmeq r0!, {r1, pc}
+@ CHECK:            ^
+
+	.global stm
+	.type stm,%function
+stm:
+	stm r0!, {r1, sp}
+@ CHECK: error: SP may not be in the register list
+	stm r0!, {r2, pc}
+@ CHECK: error: PC may not be in the register list
+	stm r0!, {sp, pc}
+@ CHECK: error: SP and PC may not be in the register list
+
+	.global stmdb
+	.type stmdb,%function
+stmdb:
+	stmdb r0!, {r1, sp}
+@ CHECK: error: SP may not be in the register list
+	stmdb r0!, {r2, pc}
+@ CHECK: error: PC may not be in the register list
+	stmdb r0!, {sp, pc}
+@ CHECK: error: SP and PC may not be in the register list
+
+	.global push
+	.type push,%function
+push:
+	push {sp}
+@ CHECK: error: SP may not be in the register list
+	push {pc}
+@ CHECK: error: PC may not be in the register list
+	push {sp, pc}
+@ CHECK: error: SP and PC may not be in the register list
+
+	.global pop
+	.type pop,%function
+pop:
+        pop {sp}
+@ CHECK-V7M: error: SP may not be in the register list
+	pop {lr, pc}
+@ CHECK: error: PC and LR may not be in the register list simultaneously
+@ CHECK: pop {lr, pc}
+@ CHECK:     ^
+	itt eq
+	popeq {r1, pc}
+	popeq {r2, lr}
+@ CHECK: error: instruction must be outside of IT block or the last instruction in an IT block
+@ CHECK: popeq {r1, pc}
+@ CHECK:     ^
+
+	.global valid
+	.type valid,%function
+valid:
+	pop {sp}
+@ CHECK-V7A: ldr sp, [sp], #4
+	pop {sp, pc}
+@ CHECK-V7A: pop.w {sp, pc}
+	push.w {r0}
+@ CHECK: str r0, [sp, #-4]
+	pop.w {r0}
+@ CHECK: ldr r0, [sp], #4
+
diff --git a/test/MC/ARM/thumb2-diagnostics.s b/test/MC/ARM/thumb2-diagnostics.s
index b2b14bc..8fd161c 100644
--- a/test/MC/ARM/thumb2-diagnostics.s
+++ b/test/MC/ARM/thumb2-diagnostics.s
@@ -87,4 +87,7 @@ foo2:
 @ CHECK-ERRORS: error: invalid operand for instruction
 @ CHECK-ERRORS: error: invalid operand for instruction
 
-
+ssat r0, #1, r0, asr #32
+usat r0, #1, r0, asr #32
+@ CHECK-ERRORS: error: 'asr #32' shift amount not allowed in Thumb mode
+@ CHECK-ERRORS: error: 'asr #32' shift amount not allowed in Thumb mode
diff --git a/test/MC/ARM/thumb2-dsp-diag.s b/test/MC/ARM/thumb2-dsp-diag.s
new file mode 100644
index 0000000..cb0e774
--- /dev/null
+++ b/test/MC/ARM/thumb2-dsp-diag.s
@@ -0,0 +1,24 @@
+; RUN: not llvm-mc -triple=thumbv7m < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+sxtab r0, r0, r0
+sxtah r0, r0, r0
+sxtab16 r0, r0, r0
+sxtb16 r0, r0
+sxtb16 r0, r0, ror #8
+; CHECK-ERRORS: error: instruction requires: arm-mode
+; CHECK-ERRORS: error: instruction requires: arm-mode
+; CHECK-ERRORS: error: instruction requires: arm-mode
+; CHECK-ERRORS: error: instruction requires: arm-mode
+; CHECK-ERRORS: error: invalid operand for instruction
+
+uxtab r0, r0, r0
+uxtah r0, r0, r0
+uxtab16 r0, r0, r0
+uxtb16 r0, r0
+uxtb16 r0, r0, ror #8
+; CHECK-ERRORS: error: instruction requires: arm-mode
+; CHECK-ERRORS: error: instruction requires: arm-mode
+; CHECK-ERRORS: error: instruction requires: arm-mode
+; CHECK-ERRORS: error: instruction requires: arm-mode
+; CHECK-ERRORS: error: invalid operand for instruction
diff --git a/test/MC/ARM/v8_IT_manual.s b/test/MC/ARM/v8_IT_manual.s
index 4b63aa8..160e98c 100644
--- a/test/MC/ARM/v8_IT_manual.s
+++ b/test/MC/ARM/v8_IT_manual.s
@@ -554,11 +554,11 @@ pushge {r1, r3, r7}
 @ PUSH, encoding T2 (32-bit)
 @ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
 it ge
-pushge {r1, r13, r7}
+pushge {r1, r3, r7}
 @ PUSH, encoding T3 (32-bit)
 @ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
 it ge
-pushge {r13}
+pushge {r3}
 
 @ REV, encoding T1
 @ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
@@ -614,9 +614,10 @@ stmge r1!, {r2, r3}
 @ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
 it ge
 stmge r1, {r2, r3}
+@ STM, encoding T3 (32-bit)
 @ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
 it ge
-stmge r1!, {r2, r13}
+stmge r1!, {r2, r3}
 
 @ LDM, encoding T1
 @ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
diff --git a/test/MC/ARM/virtexts-arm.s b/test/MC/ARM/virtexts-arm.s
new file mode 100644
index 0000000..a67a8fe
--- /dev/null
+++ b/test/MC/ARM/virtexts-arm.s
@@ -0,0 +1,42 @@
+# RUN: llvm-mc -triple armv7 -mattr=virtualization -show-encoding %s | FileCheck %s --check-prefix=CHECK-ARM
+
+    hvc    #1
+    hvc    #7
+    hvc    #257
+    hvc    #65535
+# CHECK-ARM: [0x71,0x00,0x40,0xe1]
+# CHECK-ARM: [0x77,0x00,0x40,0xe1]
+# CHECK-ARM: [0x71,0x10,0x40,0xe1]
+# CHECK-ARM: [0x7f,0xff,0x4f,0xe1]
+
+    eret
+    ereteq
+    eretne
+    ereths
+    eretlo
+    eretmi
+    eretpl
+    eretvs
+    eretvc
+    erethi
+    eretls
+    eretge
+    eretlt
+    eretgt
+    eretle
+# CHECK-ARM: [0x6e,0x00,0x60,0xe1]
+# CHECK-ARM: [0x6e,0x00,0x60,0x01]
+# CHECK-ARM: [0x6e,0x00,0x60,0x11]
+# CHECK-ARM: [0x6e,0x00,0x60,0x21]
+# CHECK-ARM: [0x6e,0x00,0x60,0x31]
+# CHECK-ARM: [0x6e,0x00,0x60,0x41]
+# CHECK-ARM: [0x6e,0x00,0x60,0x51]
+# CHECK-ARM: [0x6e,0x00,0x60,0x61]
+# CHECK-ARM: [0x6e,0x00,0x60,0x71]
+# CHECK-ARM: [0x6e,0x00,0x60,0x81]
+# CHECK-ARM: [0x6e,0x00,0x60,0x91]
+# CHECK-ARM: [0x6e,0x00,0x60,0xa1]
+# CHECK-ARM: [0x6e,0x00,0x60,0xb1]
+# CHECK-ARM: [0x6e,0x00,0x60,0xc1]
+# CHECK-ARM: [0x6e,0x00,0x60,0xd1]
+
diff --git a/test/MC/ARM/virtexts-thumb.s b/test/MC/ARM/virtexts-thumb.s
new file mode 100644
index 0000000..d911e1d
--- /dev/null
+++ b/test/MC/ARM/virtexts-thumb.s
@@ -0,0 +1,59 @@
+# RUN: llvm-mc -triple thumbv7 -mattr=virtualization -show-encoding %s | FileCheck %s --check-prefix=CHECK-THUMB
+
+    hvc    #1
+    hvc    #7
+    hvc    #257
+    hvc    #65535
+# CHECK-THUMB: [0xe0,0xf7,0x01,0x80]
+# CHECK-THUMB: [0xe0,0xf7,0x07,0x80]
+# CHECK-THUMB: [0xe0,0xf7,0x01,0x81]
+# CHECK-THUMB: [0xef,0xf7,0xff,0x8f]
+
+    hvc.w    #1
+    hvc.w    #7
+    hvc.w    #257
+    hvc.w    #65535
+# CHECK-THUMB: [0xe0,0xf7,0x01,0x80]
+# CHECK-THUMB: [0xe0,0xf7,0x07,0x80]
+# CHECK-THUMB: [0xe0,0xf7,0x01,0x81]
+# CHECK-THUMB: [0xef,0xf7,0xff,0x8f]
+
+    eret
+    it eq; ereteq
+    it ne; eretne
+    it hs; ereths
+    it lo; eretlo
+    it mi; eretmi
+    it pl; eretpl
+    it vs; eretvs
+    it vc; eretvc
+    it hi; erethi
+    it ls; eretls
+    it ge; eretge
+    it lt; eretlt
+    it gt; eretgt
+    it le; eretle
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
+
+# SUBS PC, LR, #0 should have the same encoding as ERET.
+# The conditional forms can't be tested becuse the ARM assembler parser doesn't
+# accept SUBS<cond> PC, LR, #<imm>, only the unconditonal form is allowed. This
+# is due to the way that the custom parser handles optional operands; see the
+# FIXME in ARM/AsmParser/ARMAsmParser.cpp.
+
+    subs pc, lr, #0
+# CHECK-THUMB: [0xde,0xf3,0x00,0x8f]
diff --git a/test/MC/AsmParser/directive_set.s b/test/MC/AsmParser/directive_set.s
index 69abce0..8d4180a 100644
--- a/test/MC/AsmParser/directive_set.s
+++ b/test/MC/AsmParser/directive_set.s
@@ -1,12 +1,14 @@
-# RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
+# RUN: llvm-mc -triple i386-unknown-elf %s | FileCheck %s
 
 # CHECK: TEST0:
 # CHECK: a = 0
+# CHECK-NOT: .no_dead_strip a
 TEST0:  
         .set a, 0
         
 # CHECK: TEST1:
 # CHECK: a = 0
+# CHECK-NOT: .no_dead_strip a
 TEST1:  
         .equ a, 0
 
diff --git a/test/MC/COFF/bss_section.ll b/test/MC/COFF/bss_section.ll
index 1921eeb..d3a8aec 100644
--- a/test/MC/COFF/bss_section.ll
+++ b/test/MC/COFF/bss_section.ll
@@ -5,5 +5,6 @@
 @"\01?thingy@@3Ufoo@@B" = global %struct.foo zeroinitializer, align 4
 ; CHECK: .bss
 
-@thingy_linkonce = linkonce_odr global %struct.foo zeroinitializer, align 4
-; CHECK: .section .bss,"wb",discard,_thingy_linkonce
+$thingy_linkonce = comdat any
+@thingy_linkonce = linkonce_odr global %struct.foo zeroinitializer, comdat, align 4
+; CHECK: .section .bss,"bw",discard,_thingy_linkonce
diff --git a/test/MC/COFF/const-gv-with-rel-init.ll b/test/MC/COFF/const-gv-with-rel-init.ll
index 7d3c5f6..fde5a17 100644
--- a/test/MC/COFF/const-gv-with-rel-init.ll
+++ b/test/MC/COFF/const-gv-with-rel-init.ll
@@ -5,7 +5,7 @@ define void @f() {
 }
 
 @ptr = constant void ()* @f, section ".CRT$XLB", align 8
-; CHECK:  .section  .CRT$XLB,"rd"
+; CHECK:  .section  .CRT$XLB,"dr"
 
 @weak_array = weak_odr unnamed_addr constant [1 x i8*] [i8* bitcast (void ()* @f to i8*)]
-; CHECK:  .section  .rdata,"rd",discard,weak_array
+; CHECK:  .section  .rdata,"dr"
diff --git a/test/MC/COFF/diff.s b/test/MC/COFF/diff.s
index 820272a..5111600 100644
--- a/test/MC/COFF/diff.s
+++ b/test/MC/COFF/diff.s
@@ -1,5 +1,23 @@
 // RUN: llvm-mc -filetype=obj -triple i686-pc-mingw32 %s | llvm-readobj -s -sr -sd | FileCheck %s
 
+.section baz, "xr"
+	.def	X
+	.scl	2;
+	.type	32;
+	.endef
+	.globl	X
+X:
+	mov	Y-X+42,	%eax
+	retl
+
+	.def	Y
+	.scl	2;
+	.type	32;
+	.endef
+	.globl	Y
+Y:
+	retl
+
 	.def	 _foobar;
 	.scl	2;
 	.type	32;
@@ -30,3 +48,10 @@ _rust_crate:
 // CHECK:        SectionData (
 // CHECK-NEXT:     0000: 00000000 00000000 1C000000 20000000
 // CHECK-NEXT:   )
+
+// CHECK:        Name: baz
+// CHECK:        Relocations [
+// CHECK-NEXT:   ]
+// CHECK:        SectionData (
+// CHECK-NEXT:     0000: A1300000 00C3C3
+// CHECK-NEXT:   )
diff --git a/test/MC/COFF/directive-section-characteristics.ll b/test/MC/COFF/directive-section-characteristics.ll
index ca8102a..a44c81d 100644
--- a/test/MC/COFF/directive-section-characteristics.ll
+++ b/test/MC/COFF/directive-section-characteristics.ll
@@ -7,7 +7,13 @@ entry:
 }
 
 ; CHECK: Section {
+; CHECK:   Name: .text
+; CHECK:   PointerToRawData: 0xB4
+; CHECK: }
+
+; CHECK: Section {
 ; CHECK:   Name: .drectve
+; CHECK:   PointerToRawData: 0xB8
 ; CHECK:   Characteristics [
 ; CHECK:     IMAGE_SCN_ALIGN_1BYTES
 ; CHECK:     IMAGE_SCN_LNK_INFO
diff --git a/test/MC/COFF/global_ctors_dtors.ll b/test/MC/COFF/global_ctors_dtors.ll
index ca17f24..be92c27 100644
--- a/test/MC/COFF/global_ctors_dtors.ll
+++ b/test/MC/COFF/global_ctors_dtors.ll
@@ -49,17 +49,17 @@ define i32 @main() nounwind {
   ret i32 0
 }
 
-; WIN32: .section .CRT$XCU,"rd"
+; WIN32: .section .CRT$XCU,"dr"
 ; WIN32: a_global_ctor
-; WIN32: .section .CRT$XCU,"rd",associative,{{_?}}b
+; WIN32: .section .CRT$XCU,"dr",associative,{{_?}}b
 ; WIN32: b_global_ctor
 ; WIN32-NOT: c_global_ctor
-; WIN32: .section .CRT$XTX,"rd"
+; WIN32: .section .CRT$XTX,"dr"
 ; WIN32: a_global_dtor
-; MINGW32: .section .ctors,"wd"
+; MINGW32: .section .ctors,"dw"
 ; MINGW32: a_global_ctor
-; MINGW32: .section .ctors,"wd",associative,{{_?}}b
+; MINGW32: .section .ctors,"dw",associative,{{_?}}b
 ; MINGW32: b_global_ctor
 ; MINGW32-NOT: c_global_ctor
-; MINGW32: .section .dtors,"wd"
+; MINGW32: .section .dtors,"dw"
 ; MINGW32: a_global_dtor
diff --git a/test/MC/COFF/initialised-data.ll b/test/MC/COFF/initialised-data.ll
index c428469..a2faac7 100644
--- a/test/MC/COFF/initialised-data.ll
+++ b/test/MC/COFF/initialised-data.ll
@@ -3,5 +3,5 @@
 
 @data = dllexport constant [5 x i8] c"data\00", align 1
 
-; CHECK: .section	.rdata,"rd"
+; CHECK: .section	.rdata,"dr"
 
diff --git a/test/MC/COFF/linker-options.ll b/test/MC/COFF/linker-options.ll
index 60baccf..afc55af 100755
--- a/test/MC/COFF/linker-options.ll
+++ b/test/MC/COFF/linker-options.ll
@@ -1,12 +1,6 @@
 ; RUN: llc -O0 -mtriple=i386-pc-win32 -filetype=asm -o - %s | FileCheck %s
 
-!0 = metadata !{ i32 6, metadata !"Linker Options",
-   metadata !{
-      metadata !{ metadata !"/DEFAULTLIB:msvcrt.lib" },
-      metadata !{ metadata !"/DEFAULTLIB:msvcrt.lib",
-                  metadata !"/DEFAULTLIB:secur32.lib" },
-      metadata !{ metadata !"/DEFAULTLIB:C:\5Cpath to\5Casan_rt.lib" },
-      metadata !{ metadata !"/with spaces" } } }
+!0 = !{i32 6, !"Linker Options", !{!{!"/DEFAULTLIB:msvcrt.lib"}, !{!"/DEFAULTLIB:msvcrt.lib", !"/DEFAULTLIB:secur32.lib"}, !{!"/DEFAULTLIB:\22C:\5Cpath to\5Casan_rt.lib\22"}, !{!"\22/with spaces\22"}}}
 
 !llvm.module.flags = !{ !0 }
 
@@ -18,6 +12,6 @@ define dllexport void @foo() {
 ; CHECK: .ascii   " /DEFAULTLIB:msvcrt.lib"
 ; CHECK: .ascii   " /DEFAULTLIB:msvcrt.lib"
 ; CHECK: .ascii   " /DEFAULTLIB:secur32.lib"
-; CHECK: .ascii   " \"/DEFAULTLIB:C:\\path to\\asan_rt.lib\""
+; CHECK: .ascii   " /DEFAULTLIB:\"C:\\path to\\asan_rt.lib\""
 ; CHECK: .ascii   " \"/with spaces\""
 ; CHECK: .ascii   " /EXPORT:_foo"
diff --git a/test/MC/COFF/section-passthru-flags.s b/test/MC/COFF/section-passthru-flags.s
index 3bd061b..96e42d2 100644
--- a/test/MC/COFF/section-passthru-flags.s
+++ b/test/MC/COFF/section-passthru-flags.s
@@ -3,5 +3,5 @@
 // CHECK: .section .klaatu,"wn"
 .section .barada,"y"
 // CHECK: .section .barada,"y"
-.section .nikto,"wds"
-// CHECK: .section .nikto,"wds"
+.section .nikto,"dws"
+// CHECK: .section .nikto,"dws"
diff --git a/test/MC/COFF/seh-section.s b/test/MC/COFF/seh-section.s
index 026c0d7..c95eece 100644
--- a/test/MC/COFF/seh-section.s
+++ b/test/MC/COFF/seh-section.s
@@ -1,5 +1,7 @@
-// This test ensures that, if the section containing a function has a suffix
-// (e.g. .text$foo), its unwind info section also has a suffix (.xdata$foo).
+// This test ensures functions in custom sections get unwind info emitted in a
+// distinct .xdata section. Ideally we'd just emit a second .xdata section with
+// the same name and characteristics, but MC uniques sections by name and
+// characteristics, so that is not possible.
 // RUN: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s | llvm-readobj -s -sd | FileCheck %s
 
 // CHECK:      Name: .xdata$foo
@@ -20,6 +22,44 @@
 // CHECK-NEXT:   0000: 01050200 05500402
 // CHECK-NEXT: )
 
+// CHECK:      Name: .xdata$.mytext
+// CHECK-NEXT: VirtualSize
+// CHECK-NEXT: VirtualAddress
+// CHECK-NEXT: RawDataSize: 8
+// CHECK-NEXT: PointerToRawData
+// CHECK-NEXT: PointerToRelocations
+// CHECK-NEXT: PointerToLineNumbers
+// CHECK-NEXT: RelocationCount: 0
+// CHECK-NEXT: LineNumberCount: 0
+// CHECK-NEXT: Characteristics [
+// CHECK-NEXT:   IMAGE_SCN_ALIGN_4BYTES
+// CHECK-NEXT:   IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK-NEXT:   IMAGE_SCN_MEM_READ
+// CHECK-NEXT: ]
+// CHECK-NEXT: SectionData (
+// CHECK-NEXT:   0000: 01050200 05500402
+// CHECK-NEXT: )
+
+// CHECK:      Name: .xdata
+// CHECK-NEXT: VirtualSize
+// CHECK-NEXT: VirtualAddress
+// CHECK-NEXT: RawDataSize: 8
+// CHECK-NEXT: PointerToRawData
+// CHECK-NEXT: PointerToRelocations
+// CHECK-NEXT: PointerToLineNumbers
+// CHECK-NEXT: RelocationCount: 0
+// CHECK-NEXT: LineNumberCount: 0
+// CHECK-NEXT: Characteristics [
+// CHECK-NEXT:   IMAGE_SCN_ALIGN_4BYTES
+// CHECK-NEXT:   IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK-NEXT:   IMAGE_SCN_MEM_READ
+// CHECK-NEXT: ]
+// CHECK-NEXT: SectionData (
+// CHECK-NEXT:   0000: 01050200 05500402
+// CHECK-NEXT: )
+
+
+
     .section .text$foo,"x"
     .globl foo
     .def foo; .scl 2; .type 32; .endef
@@ -35,3 +75,33 @@ foo:
     ret
     .seh_endproc
 
+    .section .mytext,"x"
+    .globl bar
+    .def bar; .scl 2; .type 32; .endef
+    .seh_proc bar
+bar:
+    subq $8, %rsp
+    .seh_stackalloc 8
+    pushq %rbp
+    .seh_pushreg %rbp
+    .seh_endprologue
+    popq %rbp
+    addq $8, %rsp
+    ret
+    .seh_endproc
+
+    .section .text
+    .globl baz
+    .def baz; .scl 2; .type 32; .endef
+    .seh_proc baz
+baz:
+    subq $8, %rsp
+    .seh_stackalloc 8
+    pushq %rbp
+    .seh_pushreg %rbp
+    .seh_endprologue
+    popq %rbp
+    addq $8, %rsp
+    ret
+    .seh_endproc
+
diff --git a/test/MC/COFF/weak-symbol.ll b/test/MC/COFF/weak-symbol.ll
deleted file mode 100644
index fd78307..0000000
--- a/test/MC/COFF/weak-symbol.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; Test that weak functions and globals are placed into selectany COMDAT
-; sections with the mangled name as suffix. Ensure that the weak linkage
-; type is not ignored by the backend if the section was specialized.
-;
-; RUN: llc -mtriple=i686-pc-win32 %s     -o - | FileCheck %s --check-prefix=X86
-; RUN: llc -mtriple=i686-pc-mingw32 %s   -o - | FileCheck %s --check-prefix=X86
-; RUN: llc -mtriple=x86_64-pc-win32 %s   -o - | FileCheck %s --check-prefix=X64
-; RUN: llc -mtriple=x86_64-pc-mingw32 %s -o - | FileCheck %s --check-prefix=X64
-
-; Mangled function
-; X86: .section .text,"xr",discard,__Z3foo
-; X86: .globl __Z3foo
-;
-; X64: .section .text,"xr",discard,_Z3foo
-; X64: .globl _Z3foo
-define weak void @_Z3foo() {
-  ret void
-}
-
-; Unmangled function
-; X86: .section .sect,"xr",discard,_f
-; X86: .globl _f
-;
-; X64: .section .sect,"xr",discard,f
-; X64: .globl f
-define weak void @f() section ".sect" {
-  ret void
-}
-
-; Weak global
-; X86: .section .data,"rd",discard,_a
-; X86: .globl _a
-; X86: .zero 12
-;
-; X64: .section .data,"rd",discard,a
-; X64: .globl a
-; X64: .zero 12
-@a = weak unnamed_addr constant { i32, i32, i32 } { i32 0, i32 0, i32 0}, section ".data"
-
-; X86:  .section        .tls$,"wd",discard,_b
-; X86:  .globl  _b
-; X86:  .long   0
-;
-; X64:  .section        .tls$,"wd",discard,b
-; X64:  .globl  b
-; X64:  .long   0
-
-@b = weak_odr thread_local global i32 0, align 4
diff --git a/test/MC/Disassembler/ARM/arm-tests.txt b/test/MC/Disassembler/ARM/arm-tests.txt
index e82f75a..008bb11 100644
--- a/test/MC/Disassembler/ARM/arm-tests.txt
+++ b/test/MC/Disassembler/ARM/arm-tests.txt
@@ -1,6 +1,6 @@
 # RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 -mcpu=cortex-a9 | FileCheck %s
 
-# CHECK:	addpl	r4, pc, #318767104
+# CHECK:	addpl	r4, pc, #76, #10
 0x4c 0x45 0x8f 0x52
 
 # CHECK:	b	#0
diff --git a/test/MC/Disassembler/ARM/basic-arm-instructions.txt b/test/MC/Disassembler/ARM/basic-arm-instructions.txt
index 8bcf4e6..335e69f 100644
--- a/test/MC/Disassembler/ARM/basic-arm-instructions.txt
+++ b/test/MC/Disassembler/ARM/basic-arm-instructions.txt
@@ -10,9 +10,12 @@
 # CHECK: adc r1, r2, #983040
 # CHECK: adc r1, r2, #15728640
 # CHECK: adc r1, r2, #251658240
-# CHECK: adc r1, r2, #4026531840
-# CHECK: adc r1, r2, #4026531855
+# CHECK: adc r1, r2, #-268435456
+# CHECK: adc r1, r2, #-268435441
+# CHECK: adc r7, r8, #-2147483638
+# CHECK: adc r7, r8, #40, #2
 # CHECK: adcs r1, r2, #3840
+# CHECK: adcs r7, r8, #40, #2
 # CHECK: adcseq r1, r2, #3840
 # CHECK: adceq r1, r2, #3840
 
@@ -25,8 +28,11 @@
 0x0f 0x14 0xa2 0xe2
 0x0f 0x12 0xa2 0xe2
 0xff 0x12 0xa2 0xe2
+0x2a 0x71 0xa8 0xe2
+0x28 0x71 0xa8 0xe2
 
 0x0f 0x1c 0xb2 0xe2
+0x28 0x71 0xb8 0xe2
 0x0f 0x1c 0xb2 0x02
 0x0f 0x1c 0xa2 0x02
 
@@ -112,6 +118,8 @@
 # ADD
 #------------------------------------------------------------------------------
 # CHECK: add r4, r5, #61440
+# CHECK: add r7, r8, #-2147483638
+# CHECK: add r7, r8, #40, #2
 # CHECK: add r4, r5, r6
 # CHECK: add r4, r5, r6, lsl #5
 # CHECK: add r4, r5, r6, lsr #5
@@ -138,6 +146,8 @@
 # CHECK: add r4, r4, r5, rrx
 
 0x0f 0x4a 0x85 0xe2
+0x2a 0x71 0x88 0xe2
+0x28 0x71 0x88 0xe2
 0x06 0x40 0x85 0xe0
 0x86 0x42 0x85 0xe0
 0xa6 0x42 0x85 0xe0
@@ -165,6 +175,15 @@
 0x65 0x40 0x84 0xe0
 
 #------------------------------------------------------------------------------
+# ADDS
+#------------------------------------------------------------------------------
+# CHECK: adds   r7, r8, #-2147483638
+# CHECK: adds   r7, r8, #40, #2
+
+0x2a 0x71 0x98 0xe2
+0x28 0x71 0x98 0xe2
+
+#------------------------------------------------------------------------------
 # ADR
 #------------------------------------------------------------------------------
 # CHECK: add	r2, pc, #3
@@ -183,6 +202,8 @@
 # AND
 #------------------------------------------------------------------------------
 # CHECK: and r10, r1, #15
+# CHECK: and r7, r8, #-2147483638
+# CHECK: and r7, r8, #40, #2
 # CHECK: and r10, r1, r6
 # CHECK: and r10, r1, r6, lsl #10
 # CHECK: and r10, r1, r6, lsr #10
@@ -209,6 +230,8 @@
 # CHECK: and r10, r10, r1, rrx
 
 0x0f 0xa0 0x01 0xe2
+0x2a 0x71 0x08 0xe2
+0x28 0x71 0x08 0xe2
 0x06 0xa0 0x01 0xe0
 0x06 0xa5 0x01 0xe0
 0x26 0xa5 0x01 0xe0
@@ -262,6 +285,8 @@
 # BIC
 #------------------------------------------------------------------------------
 # CHECK: bic r10, r1, #15
+# CHECK: bic r7, r8, #-2147483638
+# CHECK: bic r7, r8, #40, #2
 # CHECK: bic r10, r1, r6
 # CHECK: bic r10, r1, r6, lsl #10
 # CHECK: bic r10, r1, r6, lsr #10
@@ -288,6 +313,8 @@
 # CHECK: bic r10, r10, r1, rrx
 
 0x0f 0xa0 0xc1 0xe3
+0x2a 0x71 0xc8 0xe3
+0x28 0x71 0xc8 0xe3
 0x06 0xa0 0xc1 0xe1
 0x06 0xa5 0xc1 0xe1
 0x26 0xa5 0xc1 0xe1
@@ -393,6 +420,8 @@
 # CMN
 #------------------------------------------------------------------------------
 # CHECK: cmn r1, #15
+# CHECK: cmn r7, #40, #2
+# CHECK: cmn r7, #-2147483638
 # CHECK: cmn r1, r6
 # CHECK: cmn r1, r6, lsl #10
 # CHECK: cmn r1, r6, lsr #10
@@ -406,6 +435,8 @@
 # CHECK: cmn r1, r6, rrx
 
 0x0f 0x00 0x71 0xe3
+0x28 0x01 0x77 0xe3
+0x2a 0x01 0x77 0xe3
 0x06 0x00 0x71 0xe1
 0x06 0x05 0x71 0xe1
 0x26 0x05 0x71 0xe1
@@ -422,6 +453,8 @@
 # CMP
 #------------------------------------------------------------------------------
 # CHECK: cmp r1, #15
+# CHECK: cmp r7, #40, #2
+# CHECK: cmp r7, #-2147483638
 # CHECK: cmp r1, r6
 # CHECK: cmp r1, r6, lsl #10
 # CHECK: cmp r1, r6, lsr #10
@@ -435,6 +468,8 @@
 # CHECK: cmp r1, r6, rrx
 
 0x0f 0x00 0x51 0xe3
+0x28 0x01 0x57 0xe3
+0x2a 0x01 0x57 0xe3
 0x06 0x00 0x51 0xe1
 0x06 0x05 0x51 0xe1
 0x26 0x05 0x51 0xe1
@@ -556,6 +591,8 @@
 # EOR
 #------------------------------------------------------------------------------
 # CHECK: eor r4, r5, #61440
+# CHECK: eor r7, r8, #-2147483638
+# CHECK: eor r7, r8, #40, #2
 # CHECK: eor r4, r5, r6
 # CHECK: eor r4, r5, r6, lsl #5
 # CHECK: eor r4, r5, r6, lsr #5
@@ -582,6 +619,8 @@
 # CHECK: eor r4, r4, r5, rrx
 
 0x0f 0x4a 0x25 0xe2
+0x2a 0x71 0x28 0xe2
+0x28 0x71 0x28 0xe2
 0x06 0x40 0x25 0xe0
 0x86 0x42 0x25 0xe0
 0xa6 0x42 0x25 0xe0
@@ -714,10 +753,15 @@
 # CHECK: mov r4, #4080
 # CHECK: mov r5, #16711680
 # CHECK: mov sp, #35
+# CHECK: mov r9, #240, #30
+# CHECK: mov r7, #-2147483638
+# CHECK: mov pc, #2147483658
 # CHECK: movw r6, #65535
 # CHECK: movw r9, #65535
 # CHECK: movw sp, #1193
 # CHECK: movs r3, #7
+# CHECK: movs r11, #99
+# CHECK: movs r11, #240, #30
 # CHECK: moveq r4, #4080
 # CHECK: movseq r5, #16711680
 
@@ -725,10 +769,15 @@
 0xff 0x4e 0xa0 0xe3
 0xff 0x58 0xa0 0xe3
 0x23 0xd0 0xa0 0xe3
+0xf0 0x9f 0xa0 0xe3
+0x2a 0x71 0xa0 0xe3
+0x2a 0xf1 0xa0 0xe3
 0xff 0x6f 0x0f 0xe3
 0xff 0x9f 0x0f 0xe3
 0xa9 0xd4 0x00 0xe3
 0x07 0x30 0xb0 0xe3
+0x63 0xb0 0xb0 0xe3
+0xf0 0xbf 0xb0 0xe3
 0xff 0x4e 0xa0 0x03
 0xff 0x58 0xb0 0x03
 
@@ -810,6 +859,8 @@
 # CHECK: msr  SPSR_fc, #5
 # CHECK: msr  SPSR_fsxc, #5
 # CHECK: msr  CPSR_fsxc, #5
+# CHECK: msr  APSR_nzcvq, #2147483658
+# CHECK: msr  SPSR_fsxc, #40, #2
 
 0x05 0xf0 0x29 0xe3
 0x05 0xf0 0x24 0xe3
@@ -825,6 +876,8 @@
 0x05 0xf0 0x69 0xe3
 0x05 0xf0 0x6f 0xe3
 0x05 0xf0 0x2f 0xe3
+0x2a 0xf1 0x28 0xe3
+0x28 0xf1 0x6f 0xe3
 
 # CHECK: msr  CPSR_fc, r0
 # CHECK: msr  APSR_g, r0
@@ -877,14 +930,22 @@
 # CHECK: mvn r3, #7
 # CHECK: mvn r4, #4080
 # CHECK: mvn r5, #16711680
+# CHECK: mvn r7, #40, #2
+# CHECK: mvn r7, #-2147483638
 # CHECK: mvns r3, #7
+# CHECK: mvns r11, #240, #30
+# CHECK: mvns r11, #-2147483638
 # CHECK: mvneq r4, #4080
 # CHECK: mvnseq r5, #16711680
 
 0x07 0x30 0xe0 0xe3
 0xff 0x4e 0xe0 0xe3
 0xff 0x58 0xe0 0xe3
+0x28 0x71 0xe0 0xe3
+0x2a 0x71 0xe0 0xe3
 0x07 0x30 0xf0 0xe3
+0xf0 0xbf 0xf0 0xe3
+0x2a 0xb1 0xf0 0xe3
 0xff 0x4e 0xe0 0x03
 0xff 0x58 0xf0 0x03
 
@@ -940,6 +1001,8 @@
 # ORR
 #------------------------------------------------------------------------------
 # CHECK: orr r4, r5, #61440
+# CHECK: orr r7, r8, #-2147483638
+# CHECK: orr r7, r8, #40, #2
 # CHECK: orr r4, r5, r6
 # CHECK: orr r4, r5, r6, lsl #5
 # CHECK: orr r4, r5, r6, lsr #5
@@ -966,6 +1029,8 @@
 # CHECK: orr r4, r4, r5, rrx
 
 0x0f 0x4a 0x85 0xe3
+0x2a 0x71 0x88 0xe3
+0x28 0x71 0x88 0xe3
 0x06 0x40 0x85 0xe1
 0x86 0x42 0x85 0xe1
 0xa6 0x42 0x85 0xe1
@@ -1204,6 +1269,8 @@
 # RSB
 #------------------------------------------------------------------------------
 # CHECK: rsb r4, r5, #61440
+# CHECK: rsb r7, r8, #-2147483638
+# CHECK: rsb r7, r8, #40, #2
 # CHECK: rsb r4, r5, r6
 # CHECK: rsb r4, r5, r6, lsl #5
 # CHECK: rsblo r4, r5, r6, lsr #5
@@ -1230,6 +1297,8 @@
 # CHECK: rsb r4, r4, r5, rrx
 
 0x0f 0x4a 0x65 0xe2
+0x2a 0x71 0x68 0xe2
+0x28 0x71 0x68 0xe2
 0x06 0x40 0x65 0xe0
 0x86 0x42 0x65 0xe0
 0xa6 0x42 0x65 0x30
@@ -1256,9 +1325,20 @@
 0x65 0x40 0x64 0xe0
 
 #------------------------------------------------------------------------------
+# RSBS
+#------------------------------------------------------------------------------
+# CHECK: rsbs   r7, r8, #-2147483638
+# CHECK: rsbs   r7, r8, #40, #2
+
+0x2a 0x71 0x78 0xe2
+0x28 0x71 0x78 0xe2
+
+#------------------------------------------------------------------------------
 # RSC
 #------------------------------------------------------------------------------
 # CHECK: rsc r4, r5, #61440
+# CHECK: rsc r7, r8, #-2147483638
+# CHECK: rsc r7, r8, #40, #2
 # CHECK: rsc r4, r5, r6
 # CHECK: rsc r4, r5, r6, lsl #5
 # CHECK: rsclo r4, r5, r6, lsr #5
@@ -1283,6 +1363,8 @@
 # CHECK: rsc r6, r6, r7, ror r9
 
 0x0f 0x4a 0xe5 0xe2
+0x2a 0x71 0xe8 0xe2
+0x28 0x71 0xe8 0xe2
 0x06 0x40 0xe5 0xe0
 0x86 0x42 0xe5 0xe0
 0xa6 0x42 0xe5 0x30
@@ -1357,6 +1439,8 @@
 # SBC
 #------------------------------------------------------------------------------
 # CHECK: sbc r4, r5, #61440
+# CHECK: sbc r7, r8, #-2147483638
+# CHECK: sbc r7, r8, #40, #2
 # CHECK: sbc r4, r5, r6
 # CHECK: sbc r4, r5, r6, lsl #5
 # CHECK: sbc r4, r5, r6, lsr #5
@@ -1381,6 +1465,8 @@
 # CHECK: sbc r6, r6, r7, ror r9
 
 0x0f 0x4a 0xc5 0xe2
+0x2a 0x71 0xc8 0xe2
+0x28 0x71 0xc8 0xe2
 0x06 0x40 0xc5 0xe0
 0x86 0x42 0xc5 0xe0
 0xa6 0x42 0xc5 0xe0
@@ -1868,6 +1954,8 @@
 # SUB
 #------------------------------------------------------------------------------
 # CHECK: sub r4, r5, #61440
+# CHECK: sub r7, r8, #-2147483638
+# CHECK: sub r7, r8, #40, #2
 # CHECK: sub r4, r5, r6
 # CHECK: sub r4, r5, r6, lsl #5
 # CHECK: sub r4, r5, r6, lsr #5
@@ -1892,6 +1980,8 @@
 # CHECK: sub r6, r6, r7, ror r9
 
 0x0f 0x4a 0x45 0xe2
+0x2a 0x71 0x48 0xe2
+0x28 0x71 0x48 0xe2
 0x06 0x40 0x45 0xe0
 0x86 0x42 0x45 0xe0
 0xa6 0x42 0x45 0xe0
@@ -1916,6 +2006,14 @@
 0x57 0x69 0x46 0xe0
 0x77 0x69 0x46 0xe0
 
+#------------------------------------------------------------------------------
+# SUBS
+#------------------------------------------------------------------------------
+# CHECK: subs   r7, r8, #-2147483638
+# CHECK: subs   r7, r8, #40, #2
+
+0x2a 0x71 0x58 0xe2
+0x28 0x71 0x58 0xe2
 
 #------------------------------------------------------------------------------
 # SVC
@@ -2044,6 +2142,8 @@
 # TEQ
 #------------------------------------------------------------------------------
 # CHECK: teq r5, #61440
+# CHECK: teq r7, #-2147483638
+# CHECK: teq r7, #40, #2
 # CHECK: teq r4, r5
 # CHECK: teq r4, r5, lsl #5
 # CHECK: teq r4, r5, lsr #5
@@ -2056,6 +2156,8 @@
 # CHECK: teq r6, r7, ror r9
 
 0x0f 0x0a 0x35 0xe3
+0x2a 0x01 0x37 0xe3
+0x28 0x01 0x37 0xe3
 0x05 0x00 0x34 0xe1
 0x85 0x02 0x34 0xe1
 0xa5 0x02 0x34 0xe1
@@ -2072,6 +2174,8 @@
 # TST
 #------------------------------------------------------------------------------
 # CHECK: tst r5, #61440
+# CHECK: tst r7, #-2147483638
+# CHECK: tst r7, #40, #2
 # CHECK: tst r4, r5
 # CHECK: tst r4, r5, lsl #5
 # CHECK: tst r4, r5, lsr #5
@@ -2084,6 +2188,8 @@
 # CHECK: tst r6, r7, ror r9
 
 0x0f 0x0a 0x15 0xe3
+0x2a 0x01 0x17 0xe3
+0x28 0x01 0x17 0xe3
 0x05 0x00 0x14 0xe1
 0x85 0x02 0x14 0xe1
 0xa5 0x02 0x14 0xe1
diff --git a/test/MC/Disassembler/ARM/invalid-virtexts.arm.txt b/test/MC/Disassembler/ARM/invalid-virtexts.arm.txt
new file mode 100644
index 0000000..1daada9
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-virtexts.arm.txt
@@ -0,0 +1,10 @@
+# RUN: not llvm-mc -disassemble -triple armv7a -mcpu=cortex-a15 %s 2>&1 | FileCheck --check-prefix=CHECK-ARM %s
+
+# HVC (ARM)
+[0x7f,0xff,0x4f,0xf1]
+# CHECK-ARM:   warning: invalid instruction encoding
+
+[0x70,0xff,0x4f,0x01]
+[0x7f,0xff,0x4f,0xd1]
+# CHECK-ARM:   warning: potentially undefined instruction encoding
+# CHECK-ARM:   warning: potentially undefined instruction encoding
diff --git a/test/MC/Disassembler/ARM/move-banked-regs-arm.txt b/test/MC/Disassembler/ARM/move-banked-regs-arm.txt
index dd1d463..b92c577 100644
--- a/test/MC/Disassembler/ARM/move-banked-regs-arm.txt
+++ b/test/MC/Disassembler/ARM/move-banked-regs-arm.txt
@@ -1,13 +1,13 @@
 @ RUN: llvm-mc -disassemble -triple armv7 -mcpu=cyclone %s | FileCheck %s
 
 
-[0x00,0x22,0x20,0xe1]
-[0x00,0x32,0x21,0xe1]
-[0x00,0x52,0x22,0xe1]
-[0x00,0x72,0x23,0xe1]
-[0x00,0xb2,0x24,0xe1]
-[0x00,0x12,0x25,0xe1]
-[0x00,0x22,0x26,0xe1]
+[0x00,0x22,0x00,0xe1]
+[0x00,0x32,0x01,0xe1]
+[0x00,0x52,0x02,0xe1]
+[0x00,0x72,0x03,0xe1]
+[0x00,0xb2,0x04,0xe1]
+[0x00,0x12,0x05,0xe1]
+[0x00,0x22,0x06,0xe1]
 @ CHECK:         mrs     r2, r8_usr
 @ CHECK:         mrs     r3, r9_usr
 @ CHECK:         mrs     r5, r10_usr
@@ -16,14 +16,14 @@
 @ CHECK:         mrs     r1, sp_usr
 @ CHECK:         mrs     r2, lr_usr
 
-[0x00,0x22,0x28,0xe1]
-[0x00,0x32,0x29,0xe1]
-[0x00,0x52,0x2a,0xe1]
-[0x00,0x72,0x2b,0xe1]
-[0x00,0xb2,0x2c,0xe1]
-[0x00,0x12,0x2d,0xe1]
-[0x00,0x22,0x2e,0xe1]
-[0x00,0x32,0x6e,0xe1]
+[0x00,0x22,0x08,0xe1]
+[0x00,0x32,0x09,0xe1]
+[0x00,0x52,0x0a,0xe1]
+[0x00,0x72,0x0b,0xe1]
+[0x00,0xb2,0x0c,0xe1]
+[0x00,0x12,0x0d,0xe1]
+[0x00,0x22,0x0e,0xe1]
+[0x00,0x32,0x4e,0xe1]
 @ CHECK:         mrs     r2, r8_fiq
 @ CHECK:         mrs     r3, r9_fiq
 @ CHECK:         mrs     r5, r10_fiq
@@ -33,44 +33,44 @@
 @ CHECK:         mrs     r2, lr_fiq
 @ CHECK:         mrs     r3, SPSR_fiq
 
-[0x00,0x43,0x20,0xe1]
-[0x00,0x93,0x21,0xe1]
-[0x00,0x13,0x60,0xe1]
+[0x00,0x43,0x00,0xe1]
+[0x00,0x93,0x01,0xe1]
+[0x00,0x13,0x40,0xe1]
 @ CHECK:         mrs     r4, lr_irq
 @ CHECK:         mrs     r9, sp_irq
 @ CHECK:         mrs     r1, SPSR_irq
 
-[0x00,0x13,0x22,0xe1]
-[0x00,0x33,0x23,0xe1]
-[0x00,0x53,0x62,0xe1]
+[0x00,0x13,0x02,0xe1]
+[0x00,0x33,0x03,0xe1]
+[0x00,0x53,0x42,0xe1]
 @ CHECK:         mrs     r1, lr_svc
 @ CHECK:         mrs     r3, sp_svc
 @ CHECK:         mrs     r5, SPSR_svc
 
-[0x00,0x53,0x24,0xe1]
-[0x00,0x73,0x25,0xe1]
-[0x00,0x93,0x64,0xe1]
+[0x00,0x53,0x04,0xe1]
+[0x00,0x73,0x05,0xe1]
+[0x00,0x93,0x44,0xe1]
 @ CHECK:         mrs     r5, lr_abt
 @ CHECK:         mrs     r7, sp_abt
 @ CHECK:         mrs     r9, SPSR_abt
 
-[0x00,0x93,0x26,0xe1]
-[0x00,0xb3,0x27,0xe1]
-[0x00,0xc3,0x66,0xe1]
+[0x00,0x93,0x06,0xe1]
+[0x00,0xb3,0x07,0xe1]
+[0x00,0xc3,0x46,0xe1]
 @ CHECK:         mrs     r9, lr_und
 @ CHECK:         mrs     r11, sp_und
 @ CHECK:         mrs     r12, SPSR_und
 
-[0x00,0x23,0x2c,0xe1]
-[0x00,0x43,0x2d,0xe1]
-[0x00,0x63,0x6c,0xe1]
+[0x00,0x23,0x0c,0xe1]
+[0x00,0x43,0x0d,0xe1]
+[0x00,0x63,0x4c,0xe1]
 @ CHECK:         mrs     r2, lr_mon
 @ CHECK:         mrs     r4, sp_mon
 @ CHECK:         mrs     r6, SPSR_mon
 
-[0x00,0x63,0x2e,0xe1]
-[0x00,0x83,0x2f,0xe1]
-[0x00,0xa3,0x6e,0xe1]
+[0x00,0x63,0x0e,0xe1]
+[0x00,0x83,0x0f,0xe1]
+[0x00,0xa3,0x4e,0xe1]
 @ CHECK:         mrs     r6, elr_hyp
 @ CHECK:         mrs     r8, sp_hyp
 @ CHECK:         mrs     r10, SPSR_hyp
diff --git a/test/MC/Disassembler/ARM/virtexts-arm.txt b/test/MC/Disassembler/ARM/virtexts-arm.txt
new file mode 100644
index 0000000..a18466f
--- /dev/null
+++ b/test/MC/Disassembler/ARM/virtexts-arm.txt
@@ -0,0 +1,41 @@
+# RUN: llvm-mc -disassemble -triple armv7a -mcpu=cortex-a15 %s | FileCheck %s
+
+[0x71,0x00,0x40,0xe1]
+[0x77,0x00,0x40,0xe1]
+[0x71,0x10,0x40,0xe1]
+[0x7f,0xff,0x4f,0xe1]
+# CHECK:    hvc    #1
+# CHECK:    hvc    #7
+# CHECK:    hvc    #257
+# CHECK:    hvc    #65535
+
+[0x6e,0x00,0x60,0xe1]
+[0x6e,0x00,0x60,0x01]
+[0x6e,0x00,0x60,0x11]
+[0x6e,0x00,0x60,0x21]
+[0x6e,0x00,0x60,0x31]
+[0x6e,0x00,0x60,0x41]
+[0x6e,0x00,0x60,0x51]
+[0x6e,0x00,0x60,0x61]
+[0x6e,0x00,0x60,0x71]
+[0x6e,0x00,0x60,0x81]
+[0x6e,0x00,0x60,0x91]
+[0x6e,0x00,0x60,0xa1]
+[0x6e,0x00,0x60,0xb1]
+[0x6e,0x00,0x60,0xc1]
+[0x6e,0x00,0x60,0xd1]
+# CHECK:    eret
+# CHECK:    ereteq
+# CHECK:    eretne
+# CHECK:    ereths
+# CHECK:    eretlo
+# CHECK:    eretmi
+# CHECK:    eretpl
+# CHECK:    eretvs
+# CHECK:    eretvc
+# CHECK:    erethi
+# CHECK:    eretls
+# CHECK:    eretge
+# CHECK:    eretlt
+# CHECK:    eretgt
+# CHECK:    eretle
diff --git a/test/MC/Disassembler/ARM/virtexts-thumb.txt b/test/MC/Disassembler/ARM/virtexts-thumb.txt
new file mode 100644
index 0000000..da0f621
--- /dev/null
+++ b/test/MC/Disassembler/ARM/virtexts-thumb.txt
@@ -0,0 +1,61 @@
+# RUN: llvm-mc -disassemble -triple thumbv7 -mcpu=cortex-a15 %s | FileCheck %s --check-prefix=CHECK-THUMB
+# RUN: not llvm-mc -disassemble -triple thumbv7 -mcpu=cortex-a9 %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOVIRT
+
+[0xe0,0xf7,0x01,0x80]
+[0xe0,0xf7,0x07,0x80]
+[0xe0,0xf7,0x01,0x81]
+[0xef,0xf7,0xff,0x8f]
+# CHECK-THUMB:    hvc.w    #1
+# CHECK-THUMB:    hvc.w    #7
+# CHECK-THUMB:    hvc.w    #257
+# CHECK-THUMB:    hvc.w    #65535
+# CHECK-NOVIRT:    warning: invalid instruction encoding
+# CHECK-NOVIRT:    warning: invalid instruction encoding
+# CHECK-NOVIRT:    warning: invalid instruction encoding
+# CHECK-NOVIRT:    warning: invalid instruction encoding
+
+[0xde,0xf3,0x00,0x8f]
+[0x08,0xbf] [0xde,0xf3,0x00,0x8f]
+[0x18,0xbf] [0xde,0xf3,0x00,0x8f]
+[0x28,0xbf] [0xde,0xf3,0x00,0x8f]
+[0x38,0xbf] [0xde,0xf3,0x00,0x8f]
+[0x48,0xbf] [0xde,0xf3,0x00,0x8f]
+[0x58,0xbf] [0xde,0xf3,0x00,0x8f]
+[0x68,0xbf] [0xde,0xf3,0x00,0x8f]
+[0x78,0xbf] [0xde,0xf3,0x00,0x8f]
+[0x88,0xbf] [0xde,0xf3,0x00,0x8f]
+[0x98,0xbf] [0xde,0xf3,0x00,0x8f]
+[0xa8,0xbf] [0xde,0xf3,0x00,0x8f]
+[0xb8,0xbf] [0xde,0xf3,0x00,0x8f]
+[0xc8,0xbf] [0xde,0xf3,0x00,0x8f]
+[0xd8,0xbf] [0xde,0xf3,0x00,0x8f]
+# CHECK-THUMB:    eret
+# CHECK-THUMB:    ereteq
+# CHECK-THUMB:    eretne
+# CHECK-THUMB:    ereths
+# CHECK-THUMB:    eretlo
+# CHECK-THUMB:    eretmi
+# CHECK-THUMB:    eretpl
+# CHECK-THUMB:    eretvs
+# CHECK-THUMB:    eretvc
+# CHECK-THUMB:    erethi
+# CHECK-THUMB:    eretls
+# CHECK-THUMB:    eretge
+# CHECK-THUMB:    eretlt
+# CHECK-THUMB:    eretgt
+# CHECK-THUMB:    eretle
+# CHECK-NOVIRT:    subs    pc, lr, #0
+# CHECK-NOVIRT:    subseq  pc, lr, #0
+# CHECK-NOVIRT:    subsne  pc, lr, #0
+# CHECK-NOVIRT:    subshs  pc, lr, #0
+# CHECK-NOVIRT:    subslo  pc, lr, #0
+# CHECK-NOVIRT:    subsmi  pc, lr, #0
+# CHECK-NOVIRT:    subspl  pc, lr, #0
+# CHECK-NOVIRT:    subsvs  pc, lr, #0
+# CHECK-NOVIRT:    subsvc  pc, lr, #0
+# CHECK-NOVIRT:    subshi  pc, lr, #0
+# CHECK-NOVIRT:    subsls  pc, lr, #0
+# CHECK-NOVIRT:    subsge  pc, lr, #0
+# CHECK-NOVIRT:    subslt  pc, lr, #0
+# CHECK-NOVIRT:    subsgt  pc, lr, #0
+# CHECK-NOVIRT:    subsle  pc, lr, #0
diff --git a/test/MC/Disassembler/Hexagon/alu32_alu.txt b/test/MC/Disassembler/Hexagon/alu32_alu.txt
new file mode 100644
index 0000000..4dde7df
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/alu32_alu.txt
@@ -0,0 +1,84 @@
+# RUN: llvm-mc -triple hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.1.1 ALU32/ALU
+
+# Add
+0xf1 0xc3 0x15 0xb0
+# CHECK: r17 = add(r21, #31)
+0x11 0xdf 0x15 0xf3
+# CHECK: r17 = add(r21, r31)
+0x11 0xdf 0x55 0xf6
+# CHECK: r17 = add(r21, r31):sat
+
+# And
+0xf1 0xc3 0x15 0x76
+# CHECK: r17 = and(r21, #31)
+0xf1 0xc3 0x95 0x76
+# CHECK: r17 = or(r21, #31)
+0x11 0xdf 0x15 0xf1
+# CHECK: r17 = and(r21, r31)
+0x11 0xdf 0x35 0xf1
+# CHECK: r17 = or(r21, r31)
+0x11 0xdf 0x75 0xf1
+# CHECK: r17 = xor(r21, r31)
+0x11 0xd5 0x9f 0xf1
+# CHECK: r17 = and(r21, ~r31)
+0x11 0xd5 0xbf 0xf1
+# CHECK: r17 = or(r21, ~r31)
+
+# Nop
+0x00 0xc0 0x00 0x7f
+# CHECK: nop
+
+# Subtract
+0xb1 0xc2 0x5f 0x76
+# CHECK: r17 = sub(#21, r31)
+0x11 0xdf 0x35 0xf3
+# CHECK: r17 = sub(r31, r21)
+0x11 0xdf 0xd5 0xf6
+# CHECK: r17 = sub(r31, r21):sat
+
+# Sign extend
+0x11 0xc0 0xbf 0x70
+# CHECK: r17 = sxtb(r31)
+
+# Transfer immediate
+0x15 0xc0 0x31 0x72
+# CHECK: r17.h = #21
+0x15 0xc0 0x31 0x71
+# CHECK: r17.l = #21
+0xf1 0xff 0x5f 0x78
+# CHECK: r17 = #32767
+0xf1 0xff 0xdf 0x78
+# CHECK: r17 = ##65535
+
+# Transfer register
+0x11 0xc0 0x75 0x70
+# CHECK: r17 = r21
+
+# Vector add halfwords
+0x11 0xdf 0x15 0xf6
+# CHECK: r17 = vaddh(r21, r31)
+0x11 0xdf 0x35 0xf6
+# CHECK: r17 = vaddh(r21, r31):sat
+0x11 0xdf 0x75 0xf6
+# CHECK: r17 = vadduh(r21, r31):sat
+
+# Vector average halfwords
+0x11 0xdf 0x15 0xf7
+# CHECK: r17 = vavgh(r21, r31)
+0x11 0xdf 0x35 0xf7
+# CHECK: r17 = vavgh(r21, r31):rnd
+0x11 0xdf 0x75 0xf7
+# CHECK: r17 = vnavgh(r31, r21)
+
+# Vector subtract halfwords
+0x11 0xdf 0x95 0xf6
+# CHECK: r17 = vsubh(r31, r21)
+0x11 0xdf 0xb5 0xf6
+# CHECK: r17 = vsubh(r31, r21):sat
+0x11 0xdf 0xf5 0xf6
+# CHECK: r17 = vsubuh(r31, r21):sat
+
+# Zero extend
+0x11 0xc0 0xd5 0x70
+# CHECK: r17 = zxth(r21)
diff --git a/test/MC/Disassembler/Hexagon/alu32_perm.txt b/test/MC/Disassembler/Hexagon/alu32_perm.txt
new file mode 100644
index 0000000..a295350
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/alu32_perm.txt
@@ -0,0 +1,40 @@
+# RUN: llvm-mc -triple hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.1.2 ALU32/PERM
+
+# Combine words in to doublewords
+0x11 0xdf 0x95 0xf3
+# CHECK: r17 = combine(r31.h, r21.h)
+0x11 0xdf 0xb5 0xf3
+# CHECK: r17 = combine(r31.h, r21.l)
+0x11 0xdf 0xd5 0xf3
+# CHECK: r17 = combine(r31.l, r21.h)
+0x11 0xdf 0xf5 0xf3
+# CHECK: r17 = combine(r31.l, r21.l)
+0xb0 0xe2 0x0f 0x7c
+# CHECK: r17:16 = combine(#21, #31)
+0xb0 0xe2 0x3f 0x73
+# CHECK: r17:16 = combine(#21, r31)
+0xf0 0xe3 0x15 0x73
+# CHECK: r17:16 = combine(r21, #31)
+0x10 0xdf 0x15 0xf5
+# CHECK: r17:16 = combine(r21, r31)
+
+# Mux
+0xf1 0xc3 0x75 0x73
+# CHECK: r17 = mux(p3, r21, #31)
+0xb1 0xc2 0xff 0x73
+# CHECK: r17 = mux(p3, #21, r31)
+0xb1 0xe2 0x8f 0x7b
+# CHECK: r17 = mux(p3, #21, #31)
+0x71 0xdf 0x15 0xf4
+# CHECK: r17 = mux(p3, r21, r31)
+
+# Shift word by 16
+0x11 0xc0 0x15 0x70
+# CHECK: r17 = aslh(r21)
+0x11 0xc0 0x35 0x70
+# CHECK: r17 = asrh(r21)
+
+# Pack high and low halfwords
+0x10 0xdf 0x95 0xf5
+# CHECK: r17:16 = packhl(r21, r31)
diff --git a/test/MC/Disassembler/Hexagon/alu32_pred.txt b/test/MC/Disassembler/Hexagon/alu32_pred.txt
new file mode 100644
index 0000000..084b39d
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/alu32_pred.txt
@@ -0,0 +1,194 @@
+# RUN: llvm-mc -triple hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.1.3 ALU32/PRED
+
+# Conditional add
+0xf1 0xc3 0x75 0x74
+# CHECK: if (p3) r17 = add(r21, #31)
+0x03 0x40 0x45 0x85 0xf1 0xe3 0x75 0x74
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = add(r21, #31)
+0xf1 0xc3 0xf5 0x74
+# CHECK: if (!p3) r17 = add(r21, #31)
+0x03 0x40 0x45 0x85 0xf1 0xe3 0xf5 0x74
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = add(r21, #31)
+0x71 0xdf 0x15 0xfb
+# CHECK: if (p3) r17 = add(r21, r31)
+0x03 0x40 0x45 0x85 0x71 0xff 0x15 0xfb
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = add(r21, r31)
+0xf1 0xdf 0x15 0xfb
+# CHECK: if (!p3) r17 = add(r21, r31)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x15 0xfb
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = add(r21, r31)
+
+# Conditional shift halfword
+0x11 0xe3 0x15 0x70
+# CHECK: if (p3) r17 = aslh(r21)
+0x03 0x40 0x45 0x85 0x11 0xe7 0x15 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = aslh(r21)
+0x11 0xeb 0x15 0x70
+# CHECK: if (!p3) r17 = aslh(r21)
+0x03 0x40 0x45 0x85 0x11 0xef 0x15 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = aslh(r21)
+0x11 0xe3 0x35 0x70
+# CHECK: if (p3) r17 = asrh(r21)
+0x03 0x40 0x45 0x85 0x11 0xe7 0x35 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = asrh(r21)
+0x11 0xeb 0x35 0x70
+# CHECK: if (!p3) r17 = asrh(r21)
+0x03 0x40 0x45 0x85 0x11 0xef 0x35 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = asrh(r21)
+
+# Conditional combine
+0x70 0xdf 0x15 0xfd
+# CHECK: if (p3) r17:16 = combine(r21, r31)
+0xf0 0xdf 0x15 0xfd
+# CHECK: if (!p3) r17:16 = combine(r21, r31)
+0x03 0x40 0x45 0x85 0x70 0xff 0x15 0xfd
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17:16 = combine(r21, r31)
+0x03 0x40 0x45 0x85 0xf0 0xff 0x15 0xfd
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17:16 = combine(r21, r31)
+
+# Conditional logical operations
+0x71 0xdf 0x15 0xf9
+# CHECK: if (p3) r17 = and(r21, r31)
+0xf1 0xdf 0x15 0xf9
+# CHECK: if (!p3) r17 = and(r21, r31)
+0x03 0x40 0x45 0x85 0x71 0xff 0x15 0xf9
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = and(r21, r31)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x15 0xf9
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = and(r21, r31)
+0x71 0xdf 0x35 0xf9
+# CHECK: if (p3) r17 = or(r21, r31)
+0xf1 0xdf 0x35 0xf9
+# CHECK: if (!p3) r17 = or(r21, r31)
+0x03 0x40 0x45 0x85 0x71 0xff 0x35 0xf9
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = or(r21, r31)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x35 0xf9
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = or(r21, r31)
+0x71 0xdf 0x75 0xf9
+# CHECK: if (p3) r17 = xor(r21, r31)
+0xf1 0xdf 0x75 0xf9
+# CHECK: if (!p3) r17 = xor(r21, r31)
+0x03 0x40 0x45 0x85 0x71 0xff 0x75 0xf9
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = xor(r21, r31)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x75 0xf9
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = xor(r21, r31)
+
+# Conditional subtract
+0x71 0xdf 0x35 0xfb
+# CHECK: if (p3) r17 = sub(r31, r21)
+0xf1 0xdf 0x35 0xfb
+# CHECK: if (!p3) r17 = sub(r31, r21)
+0x03 0x40 0x45 0x85 0x71 0xff 0x35 0xfb
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = sub(r31, r21)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x35 0xfb
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = sub(r31, r21)
+
+# Conditional sign extend
+0x11 0xe3 0xb5 0x70
+# CHECK: if (p3) r17 = sxtb(r21)
+0x11 0xeb 0xb5 0x70
+# CHECK: if (!p3) r17 = sxtb(r21)
+0x03 0x40 0x45 0x85 0x11 0xe7 0xb5 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = sxtb(r21)
+0x03 0x40 0x45 0x85 0x11 0xef 0xb5 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = sxtb(r21)
+0x11 0xe3 0xf5 0x70
+# CHECK: if (p3) r17 = sxth(r21)
+0x11 0xeb 0xf5 0x70
+# CHECK: if (!p3) r17 = sxth(r21)
+0x03 0x40 0x45 0x85 0x11 0xe7 0xf5 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = sxth(r21)
+0x03 0x40 0x45 0x85 0x11 0xef 0xf5 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = sxth(r21)
+
+# Conditional transfer
+0xb1 0xc2 0x60 0x7e
+# CHECK: if (p3) r17 = #21
+0xb1 0xc2 0xe0 0x7e
+# CHECK: if (!p3) r17 = #21
+0x03 0x40 0x45 0x85 0xb1 0xe2 0x60 0x7e
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = #21
+0x03 0x40 0x45 0x85 0xb1 0xe2 0xe0 0x7e
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = #21
+
+# Conditional zero extend
+0x11 0xe3 0x95 0x70
+# CHECK: if (p3) r17 = zxtb(r21)
+0x11 0xeb 0x95 0x70
+# CHECK: if (!p3) r17 = zxtb(r21)
+0x03 0x40 0x45 0x85 0x11 0xe7 0x95 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = zxtb(r21)
+0x03 0x40 0x45 0x85 0x11 0xef 0x95 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = zxtb(r21)
+0x11 0xe3 0xd5 0x70
+# CHECK: if (p3) r17 = zxth(r21)
+0x11 0xeb 0xd5 0x70
+# CHECK: if (!p3) r17 = zxth(r21)
+0x03 0x40 0x45 0x85 0x11 0xe7 0xd5 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = zxth(r21)
+0x03 0x40 0x45 0x85 0x11 0xef 0xd5 0x70
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = zxth(r21)
+
+# Compare
+0xe3 0xc3 0x15 0x75
+# CHECK: p3 = cmp.eq(r21, #31)
+0xf3 0xc3 0x15 0x75
+# CHECK: p3 = !cmp.eq(r21, #31)
+0xe3 0xc3 0x55 0x75
+# CHECK: p3 = cmp.gt(r21, #31)
+0xf3 0xc3 0x55 0x75
+# CHECK: p3 = !cmp.gt(r21, #31)
+0xe3 0xc3 0x95 0x75
+# CHECK: p3 = cmp.gtu(r21, #31)
+0xf3 0xc3 0x95 0x75
+# CHECK: p3 = !cmp.gtu(r21, #31)
+0x03 0xdf 0x15 0xf2
+# CHECK: p3 = cmp.eq(r21, r31)
+0x13 0xdf 0x15 0xf2
+# CHECK: p3 = !cmp.eq(r21, r31)
+0x03 0xdf 0x55 0xf2
+# CHECK: p3 = cmp.gt(r21, r31)
+0x13 0xdf 0x55 0xf2
+# CHECK: p3 = !cmp.gt(r21, r31)
+0x03 0xdf 0x75 0xf2
+# CHECK: p3 = cmp.gtu(r21, r31)
+0x13 0xdf 0x75 0xf2
+# CHECK: p3 = !cmp.gtu(r21, r31)
+
+# Compare to general register
+0xf1 0xe3 0x55 0x73
+# CHECK: r17 = cmp.eq(r21, #31)
+0xf1 0xe3 0x75 0x73
+# CHECK: r17 = !cmp.eq(r21, #31)
+0x11 0xdf 0x55 0xf3
+# CHECK: r17 = cmp.eq(r21, r31)
+0x11 0xdf 0x75 0xf3
+# CHECK: r17 = !cmp.eq(r21, r31)
diff --git a/test/MC/Disassembler/Hexagon/cr.txt b/test/MC/Disassembler/Hexagon/cr.txt
new file mode 100644
index 0000000..6cf2b5f
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/cr.txt
@@ -0,0 +1,78 @@
+# RUN: llvm-mc --triple hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.2 CR
+
+# Corner detection acceleration
+0x93 0xe1 0x12 0x6b
+# CHECK: p3 = !fastcorner9(p2, p1)
+0x91 0xe3 0x02 0x6b
+# CHECK: p1 = fastcorner9(p2, p3)
+
+# Logical reductions on predicates
+0x01 0xc0 0x82 0x6b
+# CHECK: p1 = any8(p2)
+0x01 0xc0 0xa2 0x6b
+# CHECK: p1 = all8(p2)
+
+# Looping instructions
+0x08 0xc4 0x15 0x60
+# CHECK: loop0
+0x08 0xc4 0x35 0x60
+# CHECK: loop1
+0x68 0xc4 0x00 0x69
+# CHECK: loop0
+0x68 0xc4 0x20 0x69
+# CHECK: loop1
+
+# Add to PC
+0x91 0xca 0x49 0x6a
+# CHECK: r17 = add(pc, #21)
+
+# Pipelined loop instructions
+0x08 0xc4 0xb5 0x60
+# CHECK: p3 = sp1loop0
+0x08 0xc4 0xd5 0x60
+# CHECK: p3 = sp2loop0
+0x08 0xc4 0xf5 0x60
+# CHECK: p3 = sp3loop0
+0xa9 0xc4 0xa0 0x69
+# CHECK: p3 = sp1loop0
+0xa9 0xc4 0xc0 0x69
+# CHECK: p3 = sp2loop0
+0xa9 0xc4 0xe0 0x69
+# CHECK: p3 = sp3loop0
+
+# Logical operations on predicates
+0x01 0xc3 0x02 0x6b
+# CHECK: p1 = and(p3, p2)
+0xc1 0xc3 0x12 0x6b
+# CHECK: p1 = and(p2, and(p3, p3))
+0x01 0xc3 0x22 0x6b
+# CHECK: p1 = or(p3, p2)
+0xc1 0xc3 0x32 0x6b
+# CHECK: p1 = and(p2, or(p3, p3))
+0x01 0xc3 0x42 0x6b
+# CHECK: p1 = xor(p2, p3)
+0xc1 0xc3 0x52 0x6b
+# CHECK: p1 = or(p2, and(p3, p3))
+0x01 0xc2 0x63 0x6b
+# CHECK: p1 = and(p2, !p3)
+0xc1 0xc3 0x72 0x6b
+# CHECK: p1 = or(p2, or(p3, p3))
+0xc1 0xc3 0x92 0x6b
+# CHECK: p1 = and(p2, and(p3, !p3))
+0xc1 0xc3 0xb2 0x6b
+# CHECK: p1 = and(p2, or(p3, !p3))
+0x01 0xc0 0xc2 0x6b
+# CHECK: p1 = not(p2)
+0xc1 0xc3 0xd2 0x6b
+# CHECK: p1 = or(p2, and(p3, !p3))
+0x01 0xc2 0xe3 0x6b
+# CHECK: p1 = or(p2, !p3)
+0xc1 0xc3 0xf2 0x6b
+# CHECK: p1 = or(p2, or(p3, !p3))
+
+# User control register transfer
+0x0d 0xc0 0x35 0x62
+# CHECK: cs1 = r21
+0x11 0xc0 0x0d 0x6a
+# CHECK: r17 = cs1
diff --git a/test/MC/Disassembler/Hexagon/j.txt b/test/MC/Disassembler/Hexagon/j.txt
new file mode 100644
index 0000000..0c2cc7a
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/j.txt
@@ -0,0 +1,202 @@
+# RUN: llvm-mc -triple hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.4 J
+
+# Call subroutine
+0x22 0xc0 0x00 0x5a
+# CHECK: call
+0x22 0xc3 0x00 0x5d
+# CHECK: if (p3) call
+0x22 0xc3 0x20 0x5d
+# CHECK: if (!p3) call
+
+# Compare and jump
+0x00 0xc0 0x89 0x11
+# CHECK: p0 = cmp.eq(r9,#-1); if (p0.new) jump:nt
+0x00 0xc1 0x89 0x11
+# CHECK: p0 = cmp.gt(r9,#-1); if (p0.new) jump:nt
+0x00 0xc3 0x89 0x11
+# CHECK: p0 = tstbit(r9, #0); if (p0.new) jump:nt
+0x00 0xe0 0x89 0x11
+# CHECK: p0 = cmp.eq(r9,#-1); if (p0.new) jump:t
+0x00 0xe1 0x89 0x11
+# CHECK: p0 = cmp.gt(r9,#-1); if (p0.new) jump:t
+0x00 0xe3 0x89 0x11
+# CHECK: p0 = tstbit(r9, #0); if (p0.new) jump:t
+0x00 0xc0 0xc9 0x11
+# CHECK: p0 = cmp.eq(r9,#-1); if (!p0.new) jump:nt
+0x00 0xc1 0xc9 0x11
+# CHECK: p0 = cmp.gt(r9,#-1); if (!p0.new) jump:nt
+0x00 0xc3 0xc9 0x11
+# CHECK: p0 = tstbit(r9, #0); if (!p0.new) jump:nt
+0x00 0xe0 0xc9 0x11
+# CHECK: p0 = cmp.eq(r9,#-1); if (!p0.new) jump:t
+0x00 0xe1 0xc9 0x11
+# CHECK: p0 = cmp.gt(r9,#-1); if (!p0.new) jump:t
+0x00 0xe3 0xc9 0x11
+# CHECK: p0 = tstbit(r9, #0); if (!p0.new) jump:t
+0x00 0xd5 0x09 0x10
+# CHECK: p0 = cmp.eq(r9, #21); if (p0.new) jump:nt
+0x00 0xf5 0x09 0x10
+# CHECK: p0 = cmp.eq(r9, #21); if (p0.new) jump:t
+0x00 0xd5 0x49 0x10
+# CHECK: p0 = cmp.eq(r9, #21); if (!p0.new) jump:nt
+0x00 0xf5 0x49 0x10
+# CHECK: p0 = cmp.eq(r9, #21); if (!p0.new) jump:t
+0x00 0xd5 0x89 0x10
+# CHECK: p0 = cmp.gt(r9, #21); if (p0.new) jump:nt
+0x00 0xf5 0x89 0x10
+# CHECK: p0 = cmp.gt(r9, #21); if (p0.new) jump:t
+0x00 0xd5 0xc9 0x10
+# CHECK: p0 = cmp.gt(r9, #21); if (!p0.new) jump:nt
+0x00 0xf5 0xc9 0x10
+# CHECK: p0 = cmp.gt(r9, #21); if (!p0.new) jump:t
+0x00 0xd5 0x09 0x11
+# CHECK: p0 = cmp.gtu(r9, #21); if (p0.new) jump:nt
+0x00 0xf5 0x09 0x11
+# CHECK: p0 = cmp.gtu(r9, #21); if (p0.new) jump:t
+0x00 0xd5 0x49 0x11
+# CHECK: p0 = cmp.gtu(r9, #21); if (!p0.new) jump:nt
+0x00 0xf5 0x49 0x11
+# CHECK: p0 = cmp.gtu(r9, #21); if (!p0.new) jump:t
+0x00 0xc0 0x89 0x13
+# CHECK: p1 = cmp.eq(r9,#-1); if (p1.new) jump:nt
+0x00 0xc1 0x89 0x13
+# CHECK: p1 = cmp.gt(r9,#-1); if (p1.new) jump:nt
+0x00 0xc3 0x89 0x13
+# CHECK: p1 = tstbit(r9, #0); if (p1.new) jump:nt
+0x00 0xe0 0x89 0x13
+# CHECK: p1 = cmp.eq(r9,#-1); if (p1.new) jump:t
+0x00 0xe1 0x89 0x13
+# CHECK: p1 = cmp.gt(r9,#-1); if (p1.new) jump:t
+0x00 0xe3 0x89 0x13
+# CHECK: p1 = tstbit(r9, #0); if (p1.new) jump:t
+0x00 0xc0 0xc9 0x13
+# CHECK: p1 = cmp.eq(r9,#-1); if (!p1.new) jump:nt
+0x00 0xc1 0xc9 0x13
+# CHECK: p1 = cmp.gt(r9,#-1); if (!p1.new) jump:nt
+0x00 0xc3 0xc9 0x13
+# CHECK: p1 = tstbit(r9, #0); if (!p1.new) jump:nt
+0x00 0xe0 0xc9 0x13
+# CHECK: p1 = cmp.eq(r9,#-1); if (!p1.new) jump:t
+0x00 0xe1 0xc9 0x13
+# CHECK: p1 = cmp.gt(r9,#-1); if (!p1.new) jump:t
+0x00 0xe3 0xc9 0x13
+# CHECK: p1 = tstbit(r9, #0); if (!p1.new) jump:t
+0x00 0xd5 0x09 0x12
+# CHECK: p1 = cmp.eq(r9, #21); if (p1.new) jump:nt
+0x00 0xf5 0x09 0x12
+# CHECK: p1 = cmp.eq(r9, #21); if (p1.new) jump:t
+0x00 0xd5 0x49 0x12
+# CHECK: p1 = cmp.eq(r9, #21); if (!p1.new) jump:nt
+0x00 0xf5 0x49 0x12
+# CHECK: p1 = cmp.eq(r9, #21); if (!p1.new) jump:t
+0x00 0xd5 0x89 0x12
+# CHECK: p1 = cmp.gt(r9, #21); if (p1.new) jump:nt
+0x00 0xf5 0x89 0x12
+# CHECK: p1 = cmp.gt(r9, #21); if (p1.new) jump:t
+0x00 0xd5 0xc9 0x12
+# CHECK: p1 = cmp.gt(r9, #21); if (!p1.new) jump:nt
+0x00 0xf5 0xc9 0x12
+# CHECK: p1 = cmp.gt(r9, #21); if (!p1.new) jump:t
+0x00 0xd5 0x09 0x13
+# CHECK: p1 = cmp.gtu(r9, #21); if (p1.new) jump:nt
+0x00 0xf5 0x09 0x13
+# CHECK: p1 = cmp.gtu(r9, #21); if (p1.new) jump:t
+0x00 0xd5 0x49 0x13
+# CHECK: p1 = cmp.gtu(r9, #21); if (!p1.new) jump:nt
+0x00 0xf5 0x49 0x13
+# CHECK: p1 = cmp.gtu(r9, #21); if (!p1.new) jump:t
+0x00 0xcd 0x09 0x14
+# CHECK: p0 = cmp.eq(r9, r13); if (p0.new) jump:nt
+0x00 0xdd 0x09 0x14
+# CHECK: p1 = cmp.eq(r9, r13); if (p1.new) jump:nt
+0x00 0xed 0x09 0x14
+# CHECK: p0 = cmp.eq(r9, r13); if (p0.new) jump:t
+0x00 0xfd 0x09 0x14
+# CHECK: p1 = cmp.eq(r9, r13); if (p1.new) jump:t
+0x00 0xcd 0x49 0x14
+# CHECK: p0 = cmp.eq(r9, r13); if (!p0.new) jump:nt
+0x00 0xdd 0x49 0x14
+# CHECK: p1 = cmp.eq(r9, r13); if (!p1.new) jump:nt
+0x00 0xed 0x49 0x14
+# CHECK: p0 = cmp.eq(r9, r13); if (!p0.new) jump:t
+0x00 0xfd 0x49 0x14
+# CHECK: p1 = cmp.eq(r9, r13); if (!p1.new) jump:t
+0x00 0xcd 0x89 0x14
+# CHECK: p0 = cmp.gt(r9, r13); if (p0.new) jump:nt
+0x00 0xdd 0x89 0x14
+# CHECK: p1 = cmp.gt(r9, r13); if (p1.new) jump:nt
+0x00 0xed 0x89 0x14
+# CHECK: p0 = cmp.gt(r9, r13); if (p0.new) jump:t
+0x00 0xfd 0x89 0x14
+# CHECK: p1 = cmp.gt(r9, r13); if (p1.new) jump:t
+0x00 0xcd 0xc9 0x14
+# CHECK: p0 = cmp.gt(r9, r13); if (!p0.new) jump:nt
+0x00 0xdd 0xc9 0x14
+# CHECK: p1 = cmp.gt(r9, r13); if (!p1.new) jump:nt
+0x00 0xed 0xc9 0x14
+# CHECK: p0 = cmp.gt(r9, r13); if (!p0.new) jump:t
+0x00 0xfd 0xc9 0x14
+# CHECK: p1 = cmp.gt(r9, r13); if (!p1.new) jump:t
+0x00 0xcd 0x09 0x15
+# CHECK: p0 = cmp.gtu(r9, r13); if (p0.new) jump:nt
+0x00 0xdd 0x09 0x15
+# CHECK: p1 = cmp.gtu(r9, r13); if (p1.new) jump:nt
+0x00 0xed 0x09 0x15
+# CHECK: p0 = cmp.gtu(r9, r13); if (p0.new) jump:t
+0x00 0xfd 0x09 0x15
+# CHECK: p1 = cmp.gtu(r9, r13); if (p1.new) jump:t
+0x00 0xcd 0x49 0x15
+# CHECK: p0 = cmp.gtu(r9, r13); if (!p0.new) jump:nt
+0x00 0xdd 0x49 0x15
+# CHECK: p1 = cmp.gtu(r9, r13); if (!p1.new) jump:nt
+0x00 0xed 0x49 0x15
+# CHECK: p0 = cmp.gtu(r9, r13); if (!p0.new) jump:t
+0x00 0xfd 0x49 0x15
+# CHECK: p1 = cmp.gtu(r9, r13); if (!p1.new) jump:t
+
+# Jump to address
+0x22 0xc0 0x00 0x58
+# CHECK: jump
+0x22 0xc3 0x00 0x5c
+# CHECK: if (p3) jump
+0x22 0xc3 0x20 0x5c
+# CHECK: if (!p3) jump
+
+# Jump to address conditioned on new predicate
+0x03 0x40 0x45 0x85 0x00 0xcb 0x00 0x5c
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) jump:nt
+0x03 0x40 0x45 0x85 0x00 0xdb 0x00 0x5c
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) jump:t
+0x03 0x40 0x45 0x85 0x00 0xcb 0x20 0x5c
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) jump:nt
+0x03 0x40 0x45 0x85 0x00 0xdb 0x20 0x5c
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) jump:t
+
+# Jump to address conditioned on register value
+0x00 0xc0 0x11 0x61
+# CHECK: if (r17!=#0) jump:nt
+0x00 0xd0 0x11 0x61
+# CHECK: if (r17!=#0) jump:t
+0x00 0xc0 0x51 0x61
+# CHECK: if (r17>=#0) jump:nt
+0x00 0xd0 0x51 0x61
+# CHECK: if (r17>=#0) jump:t
+0x00 0xc0 0x91 0x61
+# CHECK: if (r17==#0) jump:nt
+0x00 0xd0 0x91 0x61
+# CHECK: if (r17==#0) jump:t
+0x00 0xc0 0xd1 0x61
+# CHECK: if (r17<=#0) jump:nt
+0x00 0xd0 0xd1 0x61
+# CHECK: if (r17<=#0) jump:t
+
+# Transfer and jump
+0x00 0xd5 0x09 0x16
+# CHECK: r9 = #21 ; jump
+0x00 0xc9 0x0d 0x17
+# CHECK: r9 = r13 ; jump
diff --git a/test/MC/Disassembler/Hexagon/jr.txt b/test/MC/Disassembler/Hexagon/jr.txt
new file mode 100644
index 0000000..c9deb5f
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/jr.txt
@@ -0,0 +1,34 @@
+# RUN: llvm-mc -triple hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.3 JR
+
+# Call subroutine from register
+0x00 0xc0 0xb5 0x50
+# CHECK: callr r21
+0x00 0xc1 0x15 0x51
+# CHECK: if (p1) callr r21
+0x00 0xc3 0x35 0x51
+# CHECK: if (!p3) callr r21
+
+# Hint an indirect jump address
+0x00 0xc0 0xb5 0x52
+# CHECK: hintjr(r21)
+
+# Jump to address from register
+0x00 0xc0 0x95 0x52
+# CHECK: jumpr r21
+0x00 0xc1 0x55 0x53
+# CHECK: if (p1) jumpr r21
+0x03 0x40 0x45 0x85 0x00 0xcb 0x55 0x53
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) jumpr:nt r21
+0x03 0x40 0x45 0x85 0x00 0xdb 0x55 0x53
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) jumpr:t r21
+0x00 0xc3 0x75 0x53
+# CHECK: if (!p3) jumpr r21
+0x03 0x40 0x45 0x85 0x00 0xcb 0x75 0x53
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) jumpr:nt r21
+0x03 0x40 0x45 0x85 0x00 0xdb 0x75 0x53
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) jumpr:t r21
diff --git a/test/MC/Disassembler/Hexagon/ld.txt b/test/MC/Disassembler/Hexagon/ld.txt
new file mode 100644
index 0000000..15c23b6
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/ld.txt
@@ -0,0 +1,364 @@
+# RUN: llvm-mc -triple hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.5 LD
+
+# Load doubleword
+0x90 0xff 0xd5 0x3a
+# CHECK: r17:16 = memd(r21 + r31<<#3)
+0x10 0xc5 0xc0 0x49
+# CHECK: r17:16 = memd(##320)
+0xb0 0xe0 0xd5 0x99
+# CHECK: r17:16 = memd(r21 ++ #40:circ(m1))
+0x10 0xe2 0xd5 0x99
+# CHECK: r17:16 = memd(r21 ++ I:circ(m1))
+0xb0 0xc0 0xd5 0x9b
+# CHECK: r17:16 = memd(r21++#40)
+0x10 0xe0 0xd5 0x9d
+# CHECK: r17:16 = memd(r21++m1)
+0x10 0xe0 0xd5 0x9f
+# CHECK: r17:16 = memd(r21 ++ m1:brev)
+
+# Load doubleword conditionally
+0xf0 0xff 0xd5 0x30
+# CHECK: if (p3) r17:16 = memd(r21+r31<<#3)
+0xf0 0xff 0xd5 0x31
+# CHECK: if (!p3) r17:16 = memd(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf0 0xff 0xd5 0x32
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17:16 = memd(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf0 0xff 0xd5 0x33
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17:16 = memd(r21+r31<<#3)
+0x70 0xd8 0xd5 0x41
+# CHECK: if (p3) r17:16 = memd(r21 + #24)
+0x03 0x40 0x45 0x85 0x70 0xd8 0xd5 0x43
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17:16 = memd(r21 + #24)
+0x70 0xd8 0xd5 0x45
+# CHECK: if (!p3) r17:16 = memd(r21 + #24)
+0x03 0x40 0x45 0x85 0x70 0xd8 0xd5 0x47
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17:16 = memd(r21 + #24)
+0xb0 0xe6 0xd5 0x9b
+# CHECK: if (p3) r17:16 = memd(r21++#40)
+0xb0 0xee 0xd5 0x9b
+# CHECK: if (!p3) r17:16 = memd(r21++#40)
+0x03 0x40 0x45 0x85 0xb0 0xf6 0xd5 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17:16 = memd(r21++#40)
+0x03 0x40 0x45 0x85 0xb0 0xfe 0xd5 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17:16 = memd(r21++#40)
+
+# Load byte
+0x91 0xff 0x15 0x3a
+# CHECK: r17 = memb(r21 + r31<<#3)
+0xb1 0xc2 0x00 0x49
+# CHECK: r17 = memb(##21)
+0xf1 0xc3 0x15 0x91
+# CHECK: r17 = memb(r21 + #31)
+0xb1 0xe0 0x15 0x99
+# CHECK: r17 = memb(r21 ++ #5:circ(m1))
+0x11 0xe2 0x15 0x99
+# CHECK: r17 = memb(r21 ++ I:circ(m1))
+0xb1 0xc0 0x15 0x9b
+# CHECK: r17 = memb(r21++#5)
+0x11 0xe0 0x15 0x9d
+# CHECK: r17 = memb(r21++m1)
+0x11 0xe0 0x15 0x9f
+# CHECK: r17 = memb(r21 ++ m1:brev)
+
+# Load byte conditionally
+0xf1 0xff 0x15 0x30
+# CHECK: if (p3) r17 = memb(r21+r31<<#3)
+0xf1 0xff 0x15 0x31
+# CHECK: if (!p3) r17 = memb(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x15 0x32
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memb(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x15 0x33
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memb(r21+r31<<#3)
+0x91 0xdd 0x15 0x41
+# CHECK: if (p3) r17 = memb(r21 + #44)
+0x03 0x40 0x45 0x85 0x91 0xdd 0x15 0x43
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memb(r21 + #44)
+0x91 0xdd 0x15 0x45
+# CHECK: if (!p3) r17 = memb(r21 + #44)
+0x03 0x40 0x45 0x85 0x91 0xdd 0x15 0x47
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memb(r21 + #44)
+0xb1 0xe6 0x15 0x9b
+# CHECK: if (p3) r17 = memb(r21++#5)
+0xb1 0xee 0x15 0x9b
+# CHECK: if (!p3) r17 = memb(r21++#5)
+0x03 0x40 0x45 0x85 0xb1 0xf6 0x15 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memb(r21++#5)
+0x03 0x40 0x45 0x85 0xb1 0xfe 0x15 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memb(r21++#5)
+
+# Load halfword
+0x91 0xff 0x55 0x3a
+# CHECK: r17 = memh(r21 + r31<<#3)
+0x51 0xc5 0x40 0x49
+# CHECK: r17 = memh(##84)
+0xf1 0xc3 0x55 0x91
+# CHECK: r17 = memh(r21 + #62)
+0xb1 0xe0 0x55 0x99
+# CHECK: r17 = memh(r21 ++ #10:circ(m1))
+0x11 0xe2 0x55 0x99
+# CHECK: r17 = memh(r21 ++ I:circ(m1))
+0xb1 0xc0 0x55 0x9b
+# CHECK: r17 = memh(r21++#10)
+0x11 0xe0 0x55 0x9d
+# CHECK: r17 = memh(r21++m1)
+0x11 0xe0 0x55 0x9f
+# CHECK: r17 = memh(r21 ++ m1:brev)
+
+# Load halfword conditionally
+0xf1 0xff 0x55 0x30
+# CHECK: if (p3) r17 = memh(r21+r31<<#3)
+0xf1 0xff 0x55 0x31
+# CHECK: if (!p3) r17 = memh(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x55 0x32
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memh(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x55 0x33
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memh(r21+r31<<#3)
+0xb1 0xe6 0x55 0x9b
+# CHECK: if (p3) r17 = memh(r21++#10)
+0xb1 0xee 0x55 0x9b
+# CHECK: if (!p3) r17 = memh(r21++#10)
+0x03 0x40 0x45 0x85 0xb1 0xf6 0x55 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memh(r21++#10)
+0x03 0x40 0x45 0x85 0xb1 0xfe 0x55 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memh(r21++#10)
+
+# Load unsigned byte
+0x91 0xff 0x35 0x3a
+# CHECK: r17 = memub(r21 + r31<<#3)
+0xb1 0xc2 0x20 0x49
+# CHECK: r17 = memub(##21)
+0xf1 0xc3 0x35 0x91
+# CHECK: r17 = memub(r21 + #31)
+0xb1 0xe0 0x35 0x99
+# CHECK: r17 = memub(r21 ++ #5:circ(m1))
+0x11 0xe2 0x35 0x99
+# CHECK: r17 = memub(r21 ++ I:circ(m1))
+0xb1 0xc0 0x35 0x9b
+# CHECK: r17 = memub(r21++#5)
+0x11 0xe0 0x35 0x9d
+# CHECK: r17 = memub(r21++m1)
+0x11 0xe0 0x35 0x9f
+# CHECK: r17 = memub(r21 ++ m1:brev)
+
+# Load unsigned byte conditionally
+0xf1 0xff 0x35 0x30
+# CHECK: if (p3) r17 = memub(r21+r31<<#3)
+0xf1 0xff 0x35 0x31
+# CHECK: if (!p3) r17 = memub(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x35 0x32
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memub(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x35 0x33
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memub(r21+r31<<#3)
+0xf1 0xdb 0x35 0x41
+# CHECK: if (p3) r17 = memub(r21 + #31)
+0x03 0x40 0x45 0x85 0xf1 0xdb 0x35 0x43
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memub(r21 + #31)
+0xf1 0xdb 0x35 0x45
+# CHECK: if (!p3) r17 = memub(r21 + #31)
+0x03 0x40 0x45 0x85 0xf1 0xdb 0x35 0x47
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memub(r21 + #31)
+0xb1 0xe6 0x35 0x9b
+# CHECK: if (p3) r17 = memub(r21++#5)
+0xb1 0xee 0x35 0x9b
+# CHECK: if (!p3) r17 = memub(r21++#5)
+0x03 0x40 0x45 0x85 0xb1 0xf6 0x35 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memub(r21++#5)
+0x03 0x40 0x45 0x85 0xb1 0xfe 0x35 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memub(r21++#5)
+
+# Load unsigned halfword
+0x91 0xff 0x75 0x3a
+# CHECK: r17 = memuh(r21 + r31<<#3)
+0x51 0xc5 0x60 0x49
+# CHECK: r17 = memuh(##84)
+0xb1 0xc2 0x75 0x91
+# CHECK: r17 = memuh(r21 + #42)
+0xb1 0xe0 0x75 0x99
+# CHECK: r17 = memuh(r21 ++ #10:circ(m1))
+0x11 0xe2 0x75 0x99
+# CHECK: r17 = memuh(r21 ++ I:circ(m1))
+0xb1 0xc0 0x75 0x9b
+# CHECK: r17 = memuh(r21++#10)
+0x11 0xe0 0x75 0x9d
+# CHECK: r17 = memuh(r21++m1)
+0x11 0xe0 0x75 0x9f
+# CHECK: r17 = memuh(r21 ++ m1:brev)
+
+# Load unsigned halfword conditionally
+0xf1 0xff 0x75 0x30
+# CHECK: if (p3) r17 = memuh(r21+r31<<#3)
+0xf1 0xff 0x75 0x31
+# CHECK: if (!p3) r17 = memuh(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x75 0x32
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memuh(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x75 0x33
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memuh(r21+r31<<#3)
+0xb1 0xda 0x75 0x41
+# CHECK: if (p3) r17 = memuh(r21 + #42)
+0xb1 0xda 0x75 0x45
+# CHECK: if (!p3) r17 = memuh(r21 + #42)
+0x03 0x40 0x45 0x85 0xb1 0xda 0x75 0x43
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memuh(r21 + #42)
+0x03 0x40 0x45 0x85 0xb1 0xda 0x75 0x47
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memuh(r21 + #42)
+0xb1 0xe6 0x75 0x9b
+# CHECK: if (p3) r17 = memuh(r21++#10)
+0xb1 0xee 0x75 0x9b
+# CHECK: if (!p3) r17 = memuh(r21++#10)
+0x03 0x40 0x45 0x85 0xb1 0xf6 0x75 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memuh(r21++#10)
+0x03 0x40 0x45 0x85 0xb1 0xfe 0x75 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memuh(r21++#10)
+
+# Load word
+0x91 0xff 0x95 0x3a
+# CHECK: r17 = memw(r21 + r31<<#3)
+0x91 0xc2 0x80 0x49
+# CHECK: r17 = memw(##80)
+0xb1 0xc2 0x95 0x91
+# CHECK: r17 = memw(r21 + #84)
+0xb1 0xe0 0x95 0x99
+# CHECK: r17 = memw(r21 ++ #20:circ(m1))
+0x11 0xe2 0x95 0x99
+# CHECK: r17 = memw(r21 ++ I:circ(m1))
+0xb1 0xc0 0x95 0x9b
+# CHECK: r17 = memw(r21++#20)
+0x11 0xe0 0x95 0x9d
+# CHECK: r17 = memw(r21++m1)
+0x11 0xe0 0x95 0x9f
+# CHECK: r17 = memw(r21 ++ m1:brev)
+
+# Load word conditionally
+0xf1 0xff 0x95 0x30
+# CHECK: if (p3) r17 = memw(r21+r31<<#3)
+0xf1 0xff 0x95 0x31
+# CHECK: if (!p3) r17 = memw(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x95 0x32
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memw(r21+r31<<#3)
+0x03 0x40 0x45 0x85 0xf1 0xff 0x95 0x33
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memw(r21+r31<<#3)
+0xb1 0xda 0x95 0x41
+# CHECK: if (p3) r17 = memw(r21 + #84)
+0xb1 0xda 0x95 0x45
+# CHECK: if (!p3) r17 = memw(r21 + #84)
+0x03 0x40 0x45 0x85 0xb1 0xda 0x95 0x43
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memw(r21 + #84)
+0x03 0x40 0x45 0x85 0xb1 0xda 0x95 0x47
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memw(r21 + #84)
+0xb1 0xe6 0x95 0x9b
+# CHECK: if (p3) r17 = memw(r21++#20)
+0xb1 0xee 0x95 0x9b
+# CHECK: if (!p3) r17 = memw(r21++#20)
+0x03 0x40 0x45 0x85 0xb1 0xf6 0x95 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) r17 = memw(r21++#20)
+0x03 0x40 0x45 0x85 0xb1 0xfe 0x95 0x9b
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) r17 = memw(r21++#20)
+
+# Deallocate stack frame
+0x1e 0xc0 0x1e 0x90
+# CHECK: deallocframe
+
+# Deallocate stack frame and return
+0x1e 0xc0 0x1e 0x96
+# CHECK: dealloc_return
+0x03 0x40 0x45 0x85 0x1e 0xcb 0x1e 0x96
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) dealloc_return:nt
+0x1e 0xd3 0x1e 0x96
+# CHECK: if (p3) dealloc_return
+0x03 0x40 0x45 0x85 0x1e 0xdb 0x1e 0x96
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) dealloc_return:t
+0x03 0x40 0x45 0x85 0x1e 0xeb 0x1e 0x96
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) dealloc_return:nt
+0x1e 0xf3 0x1e 0x96
+# CHECK: if (!p3) dealloc_return
+0x03 0x40 0x45 0x85 0x1e 0xfb 0x1e 0x96
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) dealloc_return:t
+
+# Load and unpack bytes to halfwords
+0xf1 0xc3 0x35 0x90
+# CHECK: r17 = membh(r21 + #62)
+0xf1 0xc3 0x75 0x90
+# CHECK: r17 = memubh(r21 + #62)
+0xf0 0xc3 0xb5 0x90
+# CHECK: r17:16 = memubh(r21 + #124)
+0xf0 0xc3 0xf5 0x90
+# CHECK: r17:16 = membh(r21 + #124)
+0xb1 0xe0 0x35 0x98
+# CHECK: r17 = membh(r21 ++ #10:circ(m1))
+0x11 0xe2 0x35 0x98
+# CHECK: r17 = membh(r21 ++ I:circ(m1))
+0xb1 0xe0 0x75 0x98
+# CHECK: r17 = memubh(r21 ++ #10:circ(m1))
+0x11 0xe2 0x75 0x98
+# CHECK: r17 = memubh(r21 ++ I:circ(m1))
+0xb0 0xe0 0xf5 0x98
+# CHECK: r17:16 = membh(r21 ++ #20:circ(m1))
+0x10 0xe2 0xf5 0x98
+# CHECK: r17:16 = membh(r21 ++ I:circ(m1))
+0xb0 0xe0 0xb5 0x98
+# CHECK: r17:16 = memubh(r21 ++ #20:circ(m1))
+0x10 0xe2 0xb5 0x98
+# CHECK: r17:16 = memubh(r21 ++ I:circ(m1))
+0xb1 0xc0 0x35 0x9a
+# CHECK: r17 = membh(r21++#10)
+0xb1 0xc0 0x75 0x9a
+# CHECK: r17 = memubh(r21++#10)
+0xb0 0xc0 0xb5 0x9a
+# CHECK: r17:16 = memubh(r21++#20)
+0xb0 0xc0 0xf5 0x9a
+# CHECK: r17:16 = membh(r21++#20)
+0x11 0xe0 0x35 0x9c
+# CHECK: r17 = membh(r21++m1)
+0x11 0xe0 0x75 0x9c
+# CHECK: r17 = memubh(r21++m1)
+0x10 0xe0 0xf5 0x9c
+# CHECK: r17:16 = membh(r21++m1)
+0x10 0xe0 0xb5 0x9c
+# CHECK: r17:16 = memubh(r21++m1)
+0x11 0xe0 0x35 0x9e
+# CHECK: r17 = membh(r21 ++ m1:brev)
+0x11 0xe0 0x75 0x9e
+# CHECK: r17 = memubh(r21 ++ m1:brev)
+0x10 0xe0 0xb5 0x9e
+# CHECK: r17:16 = memubh(r21 ++ m1:brev)
+0x10 0xe0 0xf5 0x9e
+# CHECK: r17:16 = membh(r21 ++ m1:brev)
diff --git a/test/MC/Disassembler/Hexagon/lit.local.cfg b/test/MC/Disassembler/Hexagon/lit.local.cfg
new file mode 100644
index 0000000..6500d4d
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'Hexagon' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/MC/Disassembler/Hexagon/memop.txt b/test/MC/Disassembler/Hexagon/memop.txt
new file mode 100644
index 0000000..47dfd93
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/memop.txt
@@ -0,0 +1,56 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.6 MEMOP
+
+# Operation on memory byte
+0x95 0xd9 0x11 0x3e
+# CHECK: memb(r17+#51) += r21
+0xb5 0xd9 0x11 0x3e
+# CHECK: memb(r17+#51) -= r21
+0xd5 0xd9 0x11 0x3e
+# CHECK: memb(r17+#51) &= r21
+0xf5 0xd9 0x11 0x3e
+# CHECK: memb(r17+#51) |= r21
+0x95 0xd9 0x11 0x3f
+# CHECK: memb(r17+#51) += #21
+0xb5 0xd9 0x11 0x3f
+# CHECK: memb(r17+#51) -= #21
+0xd5 0xd9 0x11 0x3f
+# CHECK: memb(r17+#51) = clrbit(#21)
+0xf5 0xd9 0x11 0x3f
+# CHECK: memb(r17+#51) = setbit(#21)
+
+# Operation on memory halfword
+0x95 0xd9 0x31 0x3e
+# CHECK: memh(r17+#102) += r21
+0xb5 0xd9 0x31 0x3e
+# CHECK: memh(r17+#102) -= r21
+0xd5 0xd9 0x31 0x3e
+# CHECK: memh(r17+#102) &= r21
+0xf5 0xd9 0x31 0x3e
+# CHECK: memh(r17+#102) |= r21
+0x95 0xd9 0x31 0x3f
+# CHECK: memh(r17+#102) += #21
+0xb5 0xd9 0x31 0x3f
+# CHECK: memh(r17+#102) -= #21
+0xd5 0xd9 0x31 0x3f
+# CHECK: memh(r17+#102) = clrbit(#21)
+0xf5 0xd9 0x31 0x3f
+# CHECK: memh(r17+#102) = setbit(#21)
+
+# Operation on memory word
+0x95 0xd9 0x51 0x3e
+# CHECK: memw(r17+#204) += r21
+0xb5 0xd9 0x51 0x3e
+# CHECK: memw(r17+#204) -= r21
+0xd5 0xd9 0x51 0x3e
+# CHECK: memw(r17+#204) &= r21
+0xf5 0xd9 0x51 0x3e
+# CHECK: memw(r17+#204) |= r21
+0x95 0xd9 0x51 0x3f
+# CHECK: memw(r17+#204) += #21
+0xb5 0xd9 0x51 0x3f
+# CHECK: memw(r17+#204) -= #21
+0xd5 0xd9 0x51 0x3f
+# CHECK: memw(r17+#204) = clrbit(#21)
+0xf5 0xd9 0x51 0x3f
+# CHECK: memw(r17+#204) = setbit(#21)
diff --git a/test/MC/Disassembler/Hexagon/nv_j.txt b/test/MC/Disassembler/Hexagon/nv_j.txt
new file mode 100644
index 0000000..a6773c3
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/nv_j.txt
@@ -0,0 +1,136 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.7.1 NV/J
+
+# Jump to address conditioned on new register value
+0x11 0x40 0x71 0x70 0x92 0xd5 0x02 0x20
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.eq(r2.new, r21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x02 0x20
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.eq(r2.new, r21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x42 0x20
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.eq(r2.new, r21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x42 0x20
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.eq(r2.new, r21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x82 0x20
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gt(r2.new, r21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x82 0x20
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gt(r2.new, r21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0xc2 0x20
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gt(r2.new, r21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0xc2 0x20
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gt(r2.new, r21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x02 0x21
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gtu(r2.new, r21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x02 0x21
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gtu(r2.new, r21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x42 0x21
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gtu(r2.new, r21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x42 0x21
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gtu(r2.new, r21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x82 0x21
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gt(r21, r2.new)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x82 0x21
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gt(r21, r2.new)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0xc2 0x21
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gt(r21, r2.new)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0xc2 0x21
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gt(r21, r2.new)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x02 0x22
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gtu(r21, r2.new)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x02 0x22
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gtu(r21, r2.new)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x42 0x22
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gtu(r21, r2.new)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x42 0x22
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gtu(r21, r2.new)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x02 0x24
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.eq(r2.new, #21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x02 0x24
+# CHECK: r17 = r17
+# CHECK-NETX: if (cmp.eq(r2.new, #21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x42 0x24
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.eq(r2.new, #21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x42 0x24
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.eq(r2.new, #21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x82 0x24
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gt(r2.new, #21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x82 0x24
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gt(r2.new, #21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0xc2 0x24
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gt(r2.new, #21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0xc2 0x24
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gt(r2.new, #21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x02 0x25
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gtu(r2.new, #21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x02 0x25
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gtu(r2.new, #21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xd5 0x42 0x25
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gtu(r2.new, #21)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xf5 0x42 0x25
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gtu(r2.new, #21)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xc0 0x82 0x25
+# CHECK: r17 = r17
+# CHECK-NEXT: if (tstbit(r2.new, #0)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xe0 0x82 0x25
+# CHECK: r17 = r17
+# CHECK-NEXT: if (tstbit(r2.new, #0)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xc0 0xc2 0x25
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!tstbit(r2.new, #0)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xe0 0xc2 0x25
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!tstbit(r2.new, #0)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xc0 0x02 0x26
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.eq(r2.new, #-1)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xe0 0x02 0x26
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.eq(r2.new, #-1)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xc0 0x42 0x26
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.eq(r2.new, #-1)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xe0 0x42 0x26
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.eq(r2.new, #-1)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xc0 0x82 0x26
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gt(r2.new, #-1)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xe0 0x82 0x26
+# CHECK: r17 = r17
+# CHECK-NEXT: if (cmp.gt(r2.new, #-1)) jump:t
+0x11 0x40 0x71 0x70 0x92 0xc0 0xc2 0x26
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gt(r2.new, #-1)) jump:nt
+0x11 0x40 0x71 0x70 0x92 0xe0 0xc2 0x26
+# CHECK: r17 = r17
+# CHECK-NEXT: if (!cmp.gt(r2.new, #-1)) jump:t
diff --git a/test/MC/Disassembler/Hexagon/nv_st.txt b/test/MC/Disassembler/Hexagon/nv_st.txt
new file mode 100644
index 0000000..ef49455
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/nv_st.txt
@@ -0,0 +1,203 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.7.2 NV/ST
+
+# Store new-value byte
+0x1f 0x40 0x7f 0x70 0x82 0xf5 0xb1 0x3b
+# CHECK: r31 = r31
+# CHECK-NEXT: memb(r17 + r21<<#3) = r2.new
+0x1f 0x40 0x7f 0x70 0x15 0xc2 0xb1 0xa1
+# CHECK: r31 = r31
+# CHECK-NEXT: memb(r17+#21) = r2.new
+0x1f 0x40 0x7f 0x70 0x02 0xe2 0xb1 0xa9
+# CHECK: r31 = r31
+# CHECK-NEXT: memb(r17 ++ I:circ(m1)) = r2.new
+0x1f 0x40 0x7f 0x70 0x28 0xe2 0xb1 0xa9
+# CHECK: r31 = r31
+# CHECK-NEXT: memb(r17 ++ #5:circ(m1)) = r2.new
+0x1f 0x40 0x7f 0x70 0x28 0xc2 0xb1 0xab
+# CHECK: r31 = r31
+# CHECK-NEXT: memb(r17++#5) = r2.new
+0x1f 0x40 0x7f 0x70 0x00 0xe2 0xb1 0xad
+# CHECK: r31 = r31
+# CHECK-NEXT: memb(r17++m1) = r2.new
+0x1f 0x40 0x7f 0x70 0x00 0xe2 0xb1 0xaf
+# CHECK: r31 = r31
+# CHECK-NEXT: memb(r17 ++ m1:brev) = r2.new
+
+# Store new-value byte conditionally
+0x1f 0x40 0x7f 0x70 0xe2 0xf5 0xb1 0x34
+# CHECK: r31 = r31
+# CHECK-NEXT: if (p3) memb(r17+r21<<#3) = r2.new
+0x1f 0x40 0x7f 0x70 0xe2 0xf5 0xb1 0x35
+# CHECK: r31 = r31
+# CHECK-NEXT: if (!p3) memb(r17+r21<<#3) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xe2 0xf5 0xb1 0x36
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (p3.new) memb(r17+r21<<#3) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xe2 0xf5 0xb1 0x37
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (!p3.new) memb(r17+r21<<#3) = r2.new
+0x1f 0x40 0x7f 0x70 0xab 0xc2 0xb1 0x40
+# CHECK: r31 = r31
+# CHECK-NEXT: if (p3) memb(r17+#21) = r2.new
+0x1f 0x40 0x7f 0x70 0xab 0xc2 0xb1 0x44
+# CHECK: r31 = r31
+# CHECK-NEXT: if (!p3) memb(r17+#21) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xab 0xc2 0xb1 0x42
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (p3.new) memb(r17+#21) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xab 0xc2 0xb1 0x46
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (!p3.new) memb(r17+#21) = r2.new
+0x1f 0x40 0x7f 0x70 0x2b 0xe2 0xb1 0xab
+# CHECK: r31 = r31
+# CHECK-NEXT: if (p3) memb(r17++#5) = r2.new
+0x1f 0x40 0x7f 0x70 0x2f 0xe2 0xb1 0xab
+# CHECK: r31 = r31
+# CHECK-NEXT: if (!p3) memb(r17++#5) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xab 0xe2 0xb1 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (p3.new) memb(r17++#5) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xaf 0xe2 0xb1 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (!p3.new) memb(r17++#5) = r2.new
+
+# Store new-value halfword
+0x1f 0x40 0x7f 0x70 0x8a 0xf5 0xb1 0x3b
+# CHECK: r31 = r31
+# CHECK-NEXT: memh(r17 + r21<<#3) = r2.new
+0x1f 0x40 0x7f 0x70 0x15 0xca 0xb1 0xa1
+# CHECK: r31 = r31
+# CHECK-NEXT: memh(r17+#42) = r2.new
+0x1f 0x40 0x7f 0x70 0x02 0xea 0xb1 0xa9
+# CHECK: r31 = r31
+# CHECK-NEXT: memh(r17 ++ I:circ(m1)) = r2.new
+0x1f 0x40 0x7f 0x70 0x28 0xea 0xb1 0xa9
+# CHECK: r31 = r31
+# CHECK-NEXT: memh(r17 ++ #10:circ(m1)) = r2.new
+0x1f 0x40 0x7f 0x70 0x28 0xca 0xb1 0xab
+# CHECK: r31 = r31
+# CHECK-NEXT: memh(r17++#10) = r2.new
+0x1f 0x40 0x7f 0x70 0x00 0xea 0xb1 0xad
+# CHECK: r31 = r31
+# CHECK-NEXT: memh(r17++m1) = r2.new
+0x1f 0x40 0x7f 0x70 0x00 0xea 0xb1 0xaf
+# CHECK: r31 = r31
+# CHECK-NEXT: memh(r17 ++ m1:brev) = r2.new
+
+# Store new-value halfword conditionally
+0x1f 0x40 0x7f 0x70 0xea 0xf5 0xb1 0x34
+# CHECK: r31 = r31
+# CHECK-NEXT: if (p3) memh(r17+r21<<#3) = r2.new
+0x1f 0x40 0x7f 0x70 0xea 0xf5 0xb1 0x35
+# CHECK: r31 = r31
+# CHECK-NEXT: if (!p3) memh(r17+r21<<#3) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xea 0xf5 0xb1 0x36
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (p3.new) memh(r17+r21<<#3) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xea 0xf5 0xb1 0x37
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (!p3.new) memh(r17+r21<<#3) = r2.new
+0x1f 0x40 0x7f 0x70 0xab 0xca 0xb1 0x40
+# CHECK: r31 = r31
+# CHECK-NEXT: if (p3) memh(r17+#42) = r2.new
+0x1f 0x40 0x7f 0x70 0xab 0xca 0xb1 0x44
+# CHECK: r31 = r31
+# CHECK-NEXT: if (!p3) memh(r17+#42) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xab 0xca 0xb1 0x42
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (p3.new) memh(r17+#42) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xab 0xca 0xb1 0x46
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (!p3.new) memh(r17+#42) = r2.new
+0x1f 0x40 0x7f 0x70 0x2b 0xea 0xb1 0xab
+# CHECK: r31 = r31
+# CHECK-NEXT: if (p3) memh(r17++#10) = r2.new
+0x1f 0x40 0x7f 0x70 0x2f 0xea 0xb1 0xab
+# CHECK: r31 = r31
+# CHECK-NEXT: if (!p3) memh(r17++#10) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xab 0xea 0xb1 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (p3.new) memh(r17++#10) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xaf 0xea 0xb1 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (!p3.new) memh(r17++#10) = r2.new
+
+# Store new-value word
+0x1f 0x40 0x7f 0x70 0x92 0xf5 0xb1 0x3b
+# CHECK: r31 = r31
+# CHECK-NEXT: memw(r17 + r21<<#3) = r2.new
+0x1f 0x40 0x7f 0x70 0x15 0xd2 0xb1 0xa1
+# CHECK: r31 = r31
+# CHECK-NEXT: memw(r17+#84) = r2.new
+0x1f 0x40 0x7f 0x70 0x28 0xf2 0xb1 0xa9
+# CHECK: r31 = r31
+# CHECK-NEXT: memw(r17 ++ #20:circ(m1)) = r2.new
+0x1f 0x40 0x7f 0x70 0x02 0xf2 0xb1 0xa9
+# CHECK: r31 = r31
+# CHECK-NEXT: memw(r17 ++ I:circ(m1)) = r2.new
+0x1f 0x40 0x7f 0x70 0x28 0xd2 0xb1 0xab
+# CHECK: r31 = r31
+# CHECK-NEXT: memw(r17++#20) = r2.new
+0x1f 0x40 0x7f 0x70 0x00 0xf2 0xb1 0xad
+# CHECK: r31 = r31
+# CHECK-NEXT: memw(r17++m1) = r2.new
+0x1f 0x40 0x7f 0x70 0x00 0xf2 0xb1 0xaf
+# CHECK: r31 = r31
+# CHECK-NEXT: memw(r17 ++ m1:brev) = r2.new
+
+# Store new-value word conditionally
+0x1f 0x40 0x7f 0x70 0xf2 0xf5 0xb1 0x34
+# CHECK: r31 = r31
+# CHECK-NEXT: if (p3) memw(r17+r21<<#3) = r2.new
+0x1f 0x40 0x7f 0x70 0xf2 0xf5 0xb1 0x35
+# CHECK: r31 = r31
+# CHECK-NEXT: if (!p3) memw(r17+r21<<#3) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xf2 0xf5 0xb1 0x36
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (p3.new) memw(r17+r21<<#3) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xf2 0xf5 0xb1 0x37
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (!p3.new) memw(r17+r21<<#3) = r2.new
+0x1f 0x40 0x7f 0x70 0xab 0xd2 0xb1 0x40
+# CHECK: r31 = r31
+# CHECK-NEXT: if (p3) memw(r17+#84) = r2.new
+0x1f 0x40 0x7f 0x70 0xab 0xd2 0xb1 0x44
+# CHECK: r31 = r31
+# CHECK-NEXT: if (!p3) memw(r17+#84) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xab 0xd2 0xb1 0x42
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (p3.new) memw(r17+#84) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xab 0xd2 0xb1 0x46
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (!p3.new) memw(r17+#84) = r2.new
+0x1f 0x40 0x7f 0x70 0x2b 0xf2 0xb1 0xab
+# CHECK: r31 = r31
+# CHECK-NEXT: if (p3) memw(r17++#20) = r2.new
+0x1f 0x40 0x7f 0x70 0x2f 0xf2 0xb1 0xab
+# CHECK: r31 = r31
+# CHECK-NEXT: if (!p3) memw(r17++#20) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xab 0xf2 0xb1 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (p3.new) memw(r17++#20) = r2.new
+0x03 0x40 0x45 0x85 0x1f 0x40 0x7f 0x70 0xaf 0xf2 0xb1 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: r31 = r31
+# CHECK-NEXT: if (!p3.new) memw(r17++#20) = r2.new
diff --git a/test/MC/Disassembler/Hexagon/st.txt b/test/MC/Disassembler/Hexagon/st.txt
new file mode 100644
index 0000000..3b809d3
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/st.txt
@@ -0,0 +1,288 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.8 ST
+
+# Store doubleword
+0x9e 0xf5 0xd1 0x3b
+# CHECK: memd(r17 + r21<<#3) = r31:30
+0x28 0xd4 0xc0 0x48
+# CHECK: memd(##320) = r21:20
+0x15 0xd4 0xd1 0xa1
+# CHECK: memd(r17+#168) = r21:20
+0x02 0xf4 0xd1 0xa9
+# CHECK: memd(r17 ++ I:circ(m1)) = r21:20
+0x28 0xf4 0xd1 0xa9
+# CHECK: memd(r17 ++ #40:circ(m1)) = r21:20
+0x28 0xd4 0xd1 0xab
+# CHECK: memd(r17++#40) = r21:20
+0x00 0xf4 0xd1 0xad
+# CHECK: memd(r17++m1) = r21:20
+0x00 0xf4 0xd1 0xaf
+# CHECK: memd(r17 ++ m1:brev) = r21:20
+
+# Store doubleword conditionally
+0xfe 0xf5 0xd1 0x34
+# CHECK: if (p3) memd(r17+r21<<#3) = r31:30
+0xfe 0xf5 0xd1 0x35
+# CHECK: if (!p3) memd(r17+r21<<#3) = r31:30
+0x03 0x40 0x45 0x85 0xfe 0xf5 0xd1 0x36
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memd(r17+r21<<#3) = r31:30
+0x03 0x40 0x45 0x85 0xfe 0xf5 0xd1 0x37
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memd(r17+r21<<#3) = r31:30
+0xab 0xde 0xd1 0x40
+# CHECK: if (p3) memd(r17+#168) = r31:30
+0xab 0xde 0xd1 0x44
+# CHECK: if (!p3) memd(r17+#168) = r31:30
+0x03 0x40 0x45 0x85 0xab 0xde 0xd1 0x42
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memd(r17+#168) = r31:30
+0x03 0x40 0x45 0x85 0xab 0xde 0xd1 0x46
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memd(r17+#168) = r31:30
+0x2b 0xf4 0xd1 0xab
+# CHECK: if (p3) memd(r17++#40) = r21:20
+0x2f 0xf4 0xd1 0xab
+# CHECK: if (!p3) memd(r17++#40) = r21:20
+0x03 0x40 0x45 0x85 0xab 0xf4 0xd1 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memd(r17++#40) = r21:20
+0x03 0x40 0x45 0x85 0xaf 0xf4 0xd1 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memd(r17++#40) = r21:20
+
+# Store byte
+0x9f 0xf5 0x11 0x3b
+# CHECK: memb(r17 + r21<<#3) = r31
+0x9f 0xca 0x11 0x3c
+# CHECK: memb(r17+#21)=#31
+0x15 0xd5 0x00 0x48
+# CHECK: memb(##21) = r21
+0x15 0xd5 0x11 0xa1
+# CHECK: memb(r17+#21) = r21
+0x02 0xf5 0x11 0xa9
+# CHECK: memb(r17 ++ I:circ(m1)) = r21
+0x28 0xf5 0x11 0xa9
+# CHECK: memb(r17 ++ #5:circ(m1)) = r21
+0x28 0xd5 0x11 0xab
+# CHECK: memb(r17++#5) = r21
+0x00 0xf5 0x11 0xad
+# CHECK: memb(r17++m1) = r21
+0x00 0xf5 0x11 0xaf
+# CHECK: memb(r17 ++ m1:brev) = r21
+
+# Store byte conditionally
+0xff 0xf5 0x11 0x34
+# CHECK: if (p3) memb(r17+r21<<#3) = r31
+0xff 0xf5 0x11 0x35
+# CHECK: if (!p3) memb(r17+r21<<#3) = r31
+0x03 0x40 0x45 0x85 0xff 0xf5 0x11 0x36
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memb(r17+r21<<#3) = r31
+0x03 0x40 0x45 0x85 0xff 0xf5 0x11 0x37
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memb(r17+r21<<#3) = r31
+0xff 0xca 0x11 0x38
+# CHECK: if (p3) memb(r17+#21)=#31
+0xff 0xca 0x91 0x38
+# CHECK: if (!p3) memb(r17+#21)=#31
+0x03 0x40 0x45 0x85 0xff 0xca 0x11 0x39
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memb(r17+#21)=#31
+0x03 0x40 0x45 0x85 0xff 0xca 0x91 0x39
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memb(r17+#21)=#31
+0xab 0xdf 0x11 0x40
+# CHECK: if (p3) memb(r17+#21) = r31
+0xab 0xdf 0x11 0x44
+# CHECK: if (!p3) memb(r17+#21) = r31
+0x03 0x40 0x45 0x85 0xab 0xdf 0x11 0x42
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memb(r17+#21) = r31
+0x03 0x40 0x45 0x85 0xab 0xdf 0x11 0x46
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memb(r17+#21) = r31
+0x2b 0xf5 0x11 0xab
+# CHECK: if (p3) memb(r17++#5) = r21
+0x2f 0xf5 0x11 0xab
+# CHECK: if (!p3) memb(r17++#5) = r21
+0x03 0x40 0x45 0x85 0xab 0xf5 0x11 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memb(r17++#5) = r21
+0x03 0x40 0x45 0x85 0xaf 0xf5 0x11 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memb(r17++#5) = r21
+
+# Store halfword
+0x9f 0xf5 0x51 0x3b
+# CHECK: memh(r17 + r21<<#3) = r31
+0x9f 0xf5 0x71 0x3b
+# CHECK: memh(r17 + r21<<#3) = r31.h
+0x95 0xcf 0x31 0x3c
+# CHECK: memh(r17+#62)=#21
+0x2a 0xd5 0x40 0x48
+# CHECK: memh(##84) = r21
+0x2a 0xd5 0x60 0x48
+# CHECK: memh(##84) = r21.h
+0x15 0xdf 0x51 0xa1
+# CHECK: memh(r17+#42) = r31
+0x15 0xdf 0x71 0xa1
+# CHECK: memh(r17+#42) = r31.h
+0x02 0xf5 0x51 0xa9
+# CHECK: memh(r17 ++ I:circ(m1)) = r21
+0x28 0xf5 0x51 0xa9
+# CHECK: memh(r17 ++ #10:circ(m1)) = r21
+0x02 0xf5 0x71 0xa9
+# CHECK: memh(r17 ++ I:circ(m1)) = r21.h
+0x28 0xf5 0x71 0xa9
+# CHECK: memh(r17 ++ #10:circ(m1)) = r21.h
+0x28 0xd5 0x51 0xab
+# CHECK: memh(r17++#10) = r21
+0x28 0xd5 0x71 0xab
+# CHECK: memh(r17++#10) = r21.h
+0x00 0xf5 0x51 0xad
+# CHECK: memh(r17++m1) = r21
+0x00 0xf5 0x71 0xad
+# CHECK: memh(r17++m1) = r21.h
+0x00 0xf5 0x51 0xaf
+# CHECK: memh(r17 ++ m1:brev) = r21
+0x00 0xf5 0x71 0xaf
+# CHECK: memh(r17 ++ m1:brev) = r21.h
+
+# Store halfword conditionally
+0xff 0xf5 0x51 0x34
+# CHECK: if (p3) memh(r17+r21<<#3) = r31
+0xff 0xf5 0x71 0x34
+# CHECK: if (p3) memh(r17+r21<<#3) = r31.h
+0xff 0xf5 0x51 0x35
+# CHECK: if (!p3) memh(r17+r21<<#3) = r31
+0xff 0xf5 0x71 0x35
+# CHECK: if (!p3) memh(r17+r21<<#3) = r31.h
+0x03 0x40 0x45 0x85 0xff 0xf5 0x51 0x36
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memh(r17+r21<<#3) = r31
+0x03 0x40 0x45 0x85 0xff 0xf5 0x71 0x36
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memh(r17+r21<<#3) = r31.h
+0x03 0x40 0x45 0x85 0xff 0xf5 0x51 0x37
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memh(r17+r21<<#3) = r31
+0x03 0x40 0x45 0x85 0xff 0xf5 0x71 0x37
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memh(r17+r21<<#3) = r31.h
+0xf5 0xcf 0x31 0x38
+# CHECK: if (p3) memh(r17+#62)=#21
+0xf5 0xcf 0xb1 0x38
+# CHECK: if (!p3) memh(r17+#62)=#21
+0x03 0x40 0x45 0x85 0xf5 0xcf 0x31 0x39
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memh(r17+#62)=#21
+0x03 0x40 0x45 0x85 0xf5 0xcf 0xb1 0x39
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memh(r17+#62)=#21
+0xfb 0xd5 0x51 0x40
+# CHECK: if (p3) memh(r17+#62) = r21
+0xfb 0xd5 0x71 0x40
+# CHECK: if (p3) memh(r17+#62) = r21.h
+0xfb 0xd5 0x51 0x44
+# CHECK: if (!p3) memh(r17+#62) = r21
+0xfb 0xd5 0x71 0x44
+# CHECK: if (!p3) memh(r17+#62) = r21.h
+0x03 0x40 0x45 0x85 0xfb 0xd5 0x51 0x42
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memh(r17+#62) = r21
+0x03 0x40 0x45 0x85 0xfb 0xd5 0x71 0x42
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memh(r17+#62) = r21.h
+0x03 0x40 0x45 0x85 0xfb 0xd5 0x51 0x46
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memh(r17+#62) = r21
+0x03 0x40 0x45 0x85 0xfb 0xd5 0x71 0x46
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memh(r17+#62) = r21.h
+0x2b 0xf5 0x51 0xab
+# CHECK: if (p3) memh(r17++#10) = r21
+0x2f 0xf5 0x51 0xab
+# CHECK: if (!p3) memh(r17++#10) = r21
+0x03 0x40 0x45 0x85 0xab 0xf5 0x51 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memh(r17++#10) = r21
+0x03 0x40 0x45 0x85 0xaf 0xf5 0x51 0xab 
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memh(r17++#10) = r21
+0x2b 0xf5 0x71 0xab
+# CHECK: if (p3) memh(r17++#10) = r21.h
+0x2f 0xf5 0x71 0xab
+# CHECK: if (!p3) memh(r17++#10) = r21.h
+0x03 0x40 0x45 0x85 0xab 0xf5 0x71 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memh(r17++#10) = r21.h
+0x03 0x40 0x45 0x85 0xaf 0xf5 0x71 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memh(r17++#10) = r21.h
+
+# Store word
+0x9f 0xf5 0x91 0x3b
+# CHECK: memw(r17 + r21<<#3) = r31
+0x9f 0xca 0x51 0x3c
+# CHECK: memw(r17+#84)=#31
+0x15 0xdf 0x91 0xa1
+# CHECK: memw(r17+#84) = r31
+0x14 0xd5 0x80 0x48
+# CHECK: memw(##80) = r21
+0x02 0xf5 0x91 0xa9
+# CHECK: memw(r17 ++ I:circ(m1)) = r21
+0x28 0xf5 0x91 0xa9
+# CHECK: memw(r17 ++ #20:circ(m1)) = r21
+0x28 0xd5 0x91 0xab
+# CHECK: memw(r17++#20) = r21
+0x00 0xf5 0x91 0xad
+# CHECK: memw(r17++m1) = r21
+0x00 0xf5 0x91 0xaf
+# CHECK: memw(r17 ++ m1:brev) = r21
+
+# Store word conditionally
+0xff 0xf5 0x91 0x34
+# CHECK: if (p3) memw(r17+r21<<#3) = r31
+0xff 0xf5 0x91 0x35
+# CHECK: if (!p3) memw(r17+r21<<#3) = r31
+0x03 0x40 0x45 0x85 0xff 0xf5 0x91 0x36
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memw(r17+r21<<#3) = r31
+0x03 0x40 0x45 0x85 0xff 0xf5 0x91 0x37
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memw(r17+r21<<#3) = r31
+0xff 0xca 0x51 0x38
+# CHECK: if (p3) memw(r17+#84)=#31
+0xff 0xca 0xd1 0x38
+# CHECK: if (!p3) memw(r17+#84)=#31
+0x03 0x40 0x45 0x85 0xff 0xca 0x51 0x39
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memw(r17+#84)=#31
+0x03 0x40 0x45 0x85 0xff 0xca 0xd1 0x39
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memw(r17+#84)=#31
+0xab 0xdf 0x91 0x40
+# CHECK: if (p3) memw(r17+#84) = r31
+0xab 0xdf 0x91 0x44 
+# CHECK: if (!p3) memw(r17+#84) = r31
+0x03 0x40 0x45 0x85 0xab 0xdf 0x91 0x42
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memw(r17+#84) = r31
+0x03 0x40 0x45 0x85 0xab 0xdf 0x91 0x46
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memw(r17+#84) = r31
+0x2b 0xf5 0x91 0xab
+# CHECK: if (p3) memw(r17++#20) = r21
+0x2f 0xf5 0x91 0xab
+# CHECK: if (!p3) memw(r17++#20) = r21
+0x03 0x40 0x45 0x85 0xaf 0xf5 0x91 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: if (!p3.new) memw(r17++#20) = r21
+0x03 0x40 0x45 0x85 0xab 0xf5 0x91 0xab
+# CHECK: p3 = r5
+# CHECK-NEXT: if (p3.new) memw(r17++#20) = r21
+
+# Allocate stack frame
+0x1f 0xc0 0x9d 0xa0
+# CHECK: allocframe(#248)
+\ No newline at end of file
diff --git a/test/MC/Disassembler/Hexagon/system_user.txt b/test/MC/Disassembler/Hexagon/system_user.txt
new file mode 100644
index 0000000..d55a94e
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/system_user.txt
@@ -0,0 +1,26 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.9.1 SYSTEM/USER
+
+# Load locked
+0x11 0xc0 0x15 0x92
+# CHECK: r17 = memw_locked(r21)
+0x10 0xd0 0x15 0x92
+# CHECK: r17:16 = memd_locked(r21)
+
+# Store conditional
+0x03 0xd5 0xb1 0xa0
+# CHECK: memw_locked(r17, p3) = r21
+0x03 0xd4 0xf1 0xa0
+# CHECK: memd_locked(r17, p3) = r21:20
+
+# Memory barrier
+0x00 0xc0 0x00 0xa8
+# CHECK: barrier
+
+# Data cache prefetch
+0x15 0xc0 0x11 0x94
+# CHECK: dcfetch(r17 + #168)
+
+# Send value to ETM trace
+0x00 0xc0 0x51 0x62
+# CHECK: trace(r17)
diff --git a/test/MC/Disassembler/Hexagon/xtype_alu.txt b/test/MC/Disassembler/Hexagon/xtype_alu.txt
new file mode 100644
index 0000000..03d0f05
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/xtype_alu.txt
@@ -0,0 +1,395 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.10.1 XTYPE/ALU
+
+# Absolute value doubleword
+0xd0 0xc0 0x94 0x80
+# CHECK: r17:16 = abs(r21:20)
+0x91 0xc0 0x95 0x8c
+# CHECK: r17 = abs(r21)
+0xb1 0xc0 0x95 0x8c
+# CHECK: r17 = abs(r21):sat
+
+# Add and accumulate
+0xff 0xd1 0x35 0xdb
+# CHECK: r17 = add(r21, add(r31, #23))
+0xff 0xd1 0xb5 0xdb
+# CHECK: r17 = add(r21, sub(#23, r31))
+0xf1 0xc2 0x15 0xe2
+# CHECK: r17 += add(r21, #23)
+0xf1 0xc2 0x95 0xe2
+# CHECK: r17 -= add(r21, #23)
+0x31 0xdf 0x15 0xef
+# CHECK: r17 += add(r21, r31)
+0x31 0xdf 0x95 0xef
+# CHECK: r17 -= add(r21, r31)
+
+# Add doublewords
+0xf0 0xde 0x14 0xd3
+# CHECK: r17:16 = add(r21:20, r31:30)
+0xb0 0xde 0x74 0xd3
+# CHECK: r17:16 = add(r21:20, r31:30):sat
+0xd0 0xde 0x74 0xd3
+# CHECK: r17:16 = add(r21:20, r31:30):raw:lo
+0xf0 0xde 0x74 0xd3
+# CHECK: r17:16 = add(r21:20, r31:30):raw:hi
+
+# Add halfword
+0x11 0xd5 0x1f 0xd5
+# CHECK: r17 = add(r21.l, r31.l)
+0x51 0xd5 0x1f 0xd5
+# CHECK: r17 = add(r21.l, r31.h)
+0x91 0xd5 0x1f 0xd5
+# CHECK: r17 = add(r21.l, r31.l):sat
+0xd1 0xd5 0x1f 0xd5
+# CHECK: r17 = add(r21.l, r31.h):sat
+0x11 0xd5 0x5f 0xd5
+# CHECK: r17 = add(r21.l, r31.l):<<16
+0x31 0xd5 0x5f 0xd5
+# CHECK: r17 = add(r21.l, r31.h):<<16
+0x51 0xd5 0x5f 0xd5
+# CHECK: r17 = add(r21.h, r31.l):<<16
+0x71 0xd5 0x5f 0xd5
+# CHECK: r17 = add(r21.h, r31.h):<<16
+0x91 0xd5 0x5f 0xd5
+# CHECK: r17 = add(r21.l, r31.l):sat:<<16
+0xb1 0xd5 0x5f 0xd5
+# CHECK: r17 = add(r21.l, r31.h):sat:<<16
+0xd1 0xd5 0x5f 0xd5
+# CHECK: r17 = add(r21.h, r31.l):sat:<<16
+0xf1 0xd5 0x5f 0xd5
+# CHECK: r17 = add(r21.h, r31.h):sat:<<16
+
+# Add or subtract doublewords with carry
+0x70 0xde 0xd4 0xc2
+# CHECK: r17:16 = add(r21:20, r31:30, p3):carry
+0x70 0xde 0xf4 0xc2
+# CHECK: r17:16 = sub(r21:20, r31:30, p3):carry
+
+# Logical doublewords
+0x90 0xc0 0x94 0x80
+# CHECK: r17:16 = not(r21:20)
+0x10 0xde 0xf4 0xd3
+# CHECK: r17:16 = and(r21:20, r31:30)
+0x30 0xd4 0xfe 0xd3
+# CHECK: r17:16 = and(r21:20, ~r31:30)
+0x50 0xde 0xf4 0xd3
+# CHECK: r17:16 = or(r21:20, r31:30)
+0x70 0xd4 0xfe 0xd3
+# CHECK: r17:16 = or(r21:20, ~r31:30)
+0x90 0xde 0xf4 0xd3
+# CHECK: r17:16 = xor(r21:20, r31:30)
+
+# Logical-logical doublewords
+0x10 0xde 0x94 0xca
+# CHECK: r17:16 ^= xor(r21:20, r31:30)
+
+# Logical-logical words
+0xf1 0xc3 0x15 0xda
+# CHECK: r17 |= and(r21, #31)
+0xf5 0xc3 0x51 0xda
+# CHECK: r17 = or(r21, and(r17, #31))
+0xf1 0xc3 0x95 0xda
+# CHECK: r17 |= or(r21, #31)
+0x11 0xdf 0x35 0xef
+# CHECK: r17 |= and(r21, ~r31)
+0x31 0xdf 0x35 0xef
+# CHECK: r17 &= and(r21, ~r31)
+0x51 0xdf 0x35 0xef
+# CHECK: r17 ^= and(r21, ~r31)
+0x11 0xdf 0x55 0xef
+# CHECK: r17 &= and(r21, r31)
+0x31 0xdf 0x55 0xef
+# CHECK: r17 &= or(r21, r31)
+0x51 0xdf 0x55 0xef
+# CHECK: r17 &= xor(r21, r31)
+0x71 0xdf 0x55 0xef
+# CHECK: r17 |= and(r21, r31)
+0x71 0xdf 0x95 0xef
+# CHECK: r17 ^= xor(r21, r31)
+0x11 0xdf 0xd5 0xef
+# CHECK: r17 |= or(r21, r31)
+0x31 0xdf 0xd5 0xef
+# CHECK: r17 |= xor(r21, r31)
+0x51 0xdf 0xd5 0xef
+# CHECK: r17 ^= and(r21, r31)
+0x71 0xdf 0xd5 0xef
+# CHECK: r17 ^= or(r21, r31)
+
+# Maximum words
+0x11 0xdf 0xd5 0xd5
+# CHECK: r17 = max(r21, r31)
+0x91 0xdf 0xd5 0xd5
+# CHECK: r17 = maxu(r21, r31)
+
+# Maximum doublewords
+0x90 0xde 0xd4 0xd3
+# CHECK: r17:16 = max(r21:20, r31:30)
+0xb0 0xde 0xd4 0xd3
+# CHECK: r17:16 = maxu(r21:20, r31:30)
+
+# Minimum words
+0x11 0xd5 0xbf 0xd5
+# CHECK: r17 = min(r21, r31)
+0x91 0xd5 0xbf 0xd5
+# CHECK: r17 = minu(r21, r31)
+
+# Minimum doublewords
+0xd0 0xd4 0xbe 0xd3
+# CHECK: r17:16 = min(r21:20, r31:30)
+0xf0 0xd4 0xbe 0xd3
+# CHECK: r17:16 = minu(r21:20, r31:30)
+
+# Module wrap
+0xf1 0xdf 0xf5 0xd3
+# CHECK: r17 = modwrap(r21, r31)
+
+# Negate
+0xb0 0xc0 0x94 0x80
+# CHECK: r17:16 = neg(r21:20)
+0xd1 0xc0 0x95 0x8c
+# CHECK: r17 = neg(r21):sat
+
+# Round
+0x31 0xc0 0xd4 0x88
+# CHECK: r17 = round(r21:20):sat
+0x11 0xdf 0xf5 0x8c
+# CHECK: r17 = cround(r21, #31)
+0x91 0xdf 0xf5 0x8c
+# CHECK: r17 = round(r21, #31)
+0xd1 0xdf 0xf5 0x8c
+# CHECK: r17 = round(r21, #31):sat
+0x11 0xdf 0xd5 0xc6
+# CHECK: r17 = cround(r21, r31)
+0x91 0xdf 0xd5 0xc6
+# CHECK: r17 = round(r21, r31)
+0xd1 0xdf 0xd5 0xc6
+# CHECK: r17 = round(r21, r31):sat
+
+# Subtract doublewords
+0xf0 0xd4 0x3e 0xd3
+# CHECK: r17:16 = sub(r21:20, r31:30)
+
+# Subtract and accumulate words
+0x71 0xd5 0x1f 0xef
+# CHECK: r17 += sub(r21, r31)
+
+# Subtract halfword
+0x11 0xd5 0x3f 0xd5
+# CHECK: r17 = sub(r21.l, r31.l)
+0x51 0xd5 0x3f 0xd5
+# CHECK: r17 = sub(r21.l, r31.h)
+0x91 0xd5 0x3f 0xd5
+# CHECK: r17 = sub(r21.l, r31.l):sat
+0xd1 0xd5 0x3f 0xd5
+# CHECK: r17 = sub(r21.l, r31.h):sat
+0x11 0xd5 0x7f 0xd5
+# CHECK: r17 = sub(r21.l, r31.l):<<16
+0x31 0xd5 0x7f 0xd5
+# CHECK: r17 = sub(r21.l, r31.h):<<16
+0x51 0xd5 0x7f 0xd5
+# CHECK: r17 = sub(r21.h, r31.l):<<16
+0x71 0xd5 0x7f 0xd5
+# CHECK: r17 = sub(r21.h, r31.h):<<16
+0x91 0xd5 0x7f 0xd5
+# CHECK: r17 = sub(r21.l, r31.l):sat:<<16
+0xb1 0xd5 0x7f 0xd5
+# CHECK: r17 = sub(r21.l, r31.h):sat:<<16
+0xd1 0xd5 0x7f 0xd5
+# CHECK: r17 = sub(r21.h, r31.l):sat:<<16
+0xf1 0xd5 0x7f 0xd5
+# CHECK: r17 = sub(r21.h, r31.h):sat:<<16
+
+# Sign extend word to doubleword
+0x10 0xc0 0x55 0x84
+# CHECK: r17:16 = sxtw(r21)
+
+# Vector absolute value halfwords
+0x90 0xc0 0x54 0x80
+# CHECK: r17:16 = vabsh(r21:20)
+0xb0 0xc0 0x54 0x80
+# CHECK: r17:16 = vabsh(r21:20):sat
+
+# Vector absolute value words
+0xd0 0xc0 0x54 0x80
+# CHECK: r17:16 = vabsw(r21:20)
+0xf0 0xc0 0x54 0x80
+# CHECK: r17:16 = vabsw(r21:20):sat
+
+# Vector absolute difference halfwords
+0x10 0xd4 0x7e 0xe8
+# CHECK: r17:16 = vabsdiffh(r21:20, r31:30)
+
+# Vector absolute difference words
+0x10 0xd4 0x3e 0xe8
+# CHECK: r17:16 = vabsdiffw(r21:20, r31:30)
+
+# Vector add halfwords
+0x50 0xde 0x14 0xd3
+# CHECK: r17:16 = vaddh(r21:20, r31:30)
+0x70 0xde 0x14 0xd3
+# CHECK: r17:16 = vaddh(r21:20, r31:30):sat
+0x90 0xde 0x14 0xd3
+# CHECK: r17:16 = vadduh(r21:20, r31:30):sat
+
+# Vector add halfwords with saturate and pack to unsigned bytes
+0x31 0xde 0x54 0xc1
+# CHECK: r17 = vaddhub(r21:20, r31:30):sat
+
+# Vector reduce add unsigned bytes
+0x30 0xde 0x54 0xe8
+# CHECK: r17:16 = vraddub(r21:20, r31:30)
+0x30 0xde 0x54 0xea
+# CHECK: r17:16 += vraddub(r21:20, r31:30)
+
+# Vector reduce add halfwords
+0x31 0xde 0x14 0xe9
+# CHECK: r17 = vradduh(r21:20, r31:30)
+0xf1 0xde 0x34 0xe9
+# CHECK: r17 = vraddh(r21:20, r31:30)
+
+# Vector add bytes
+0x10 0xde 0x14 0xd3
+# CHECK: r17:16 = vaddub(r21:20, r31:30)
+0x30 0xde 0x14 0xd3
+# CHECK: r17:16 = vaddub(r21:20, r31:30):sat
+
+# Vector add words
+0xb0 0xde 0x14 0xd3
+# CHECK: r17:16 = vaddw(r21:20, r31:30)
+0xd0 0xde 0x14 0xd3
+# CHECK: r17:16 = vaddw(r21:20, r31:30):sat
+
+# Vector average halfwords
+0x50 0xde 0x54 0xd3
+# CHECK: r17:16 = vavgh(r21:20, r31:30)
+0x70 0xde 0x54 0xd3
+# CHECK: r17:16 = vavgh(r21:20, r31:30):rnd
+0x90 0xde 0x54 0xd3
+# CHECK: r17:16 = vavgh(r21:20, r31:30):crnd
+0xb0 0xde 0x54 0xd3
+# CHECK: r17:16 = vavguh(r21:20, r31:30)
+0xd0 0xde 0x54 0xd3
+# CHECK: r17:16 = vavguh(r21:20, r31:30):rnd
+0x10 0xd4 0x9e 0xd3
+# CHECK: r17:16 = vnavgh(r21:20, r31:30)
+0x30 0xd4 0x9e 0xd3
+# CHECK: r17:16 = vnavgh(r21:20, r31:30):rnd:sat
+0x50 0xd4 0x9e 0xd3
+# CHECK: r17:16 = vnavgh(r21:20, r31:30):crnd:sat
+
+# Vector average unsigned bytes
+0x10 0xde 0x54 0xd3
+# CHECK: r17:16 = vavgub(r21:20, r31:30)
+0x30 0xde 0x54 0xd3
+# CHECK: r17:16 = vavgub(r21:20, r31:30):rnd
+
+# Vector average words
+0x10 0xde 0x74 0xd3
+# CHECK: r17:16 = vavgw(r21:20, r31:30)
+0x30 0xde 0x74 0xd3
+# CHECK: r17:16 = vavgw(r21:20, r31:30):rnd
+0x50 0xde 0x74 0xd3
+# CHECK: r17:16 = vavgw(r21:20, r31:30):crnd
+0x70 0xde 0x74 0xd3
+# CHECK: r17:16 = vavguw(r21:20, r31:30)
+0x90 0xde 0x74 0xd3
+# CHECK: r17:16 = vavguw(r21:20, r31:30):rnd
+0x70 0xd4 0x9e 0xd3
+# CHECK: r17:16 = vnavgw(r21:20, r31:30)
+0x90 0xd4 0x9e 0xd3
+# CHECK: r17:16 = vnavgw(r21:20, r31:30):rnd:sat
+0xd0 0xd4 0x9e 0xd3
+# CHECK: r17:16 = vnavgw(r21:20, r31:30):crnd:sat
+
+# Vector conditional negate
+0x50 0xdf 0xd4 0xc3
+# CHECK: r17:16 = vcnegh(r21:20, r31)
+
+0xf0 0xff 0x34 0xcb
+# CHECK: r17:16 += vrcnegh(r21:20, r31)
+
+# Vector maximum bytes
+0x10 0xd4 0xde 0xd3
+# CHECK: r17:16 = vmaxub(r21:20, r31:30)
+0xd0 0xd4 0xde 0xd3
+# CHECK: r17:16 = vmaxb(r21:20, r31:30)
+
+# Vector maximum halfwords
+0x30 0xd4 0xde 0xd3
+# CHECK: r17:16 = vmaxh(r21:20, r31:30)
+0x50 0xd4 0xde 0xd3
+# CHECK: r17:16 = vmaxuh(r21:20, r31:30)
+
+# Vector reduce maximum halfwords
+0x3f 0xd0 0x34 0xcb
+# CHECK: r17:16 = vrmaxh(r21:20, r31)
+0x3f 0xf0 0x34 0xcb
+# CHECK: r17:16 = vrmaxuh(r21:20, r31)
+
+# Vector reduce maximum words
+0x5f 0xd0 0x34 0xcb
+# CHECK: r17:16 = vrmaxw(r21:20, r31)
+0x5f 0xf0 0x34 0xcb
+# CHECK: r17:16 = vrmaxuw(r21:20, r31)
+
+# Vector maximum words
+0xb0 0xd4 0xbe 0xd3
+# CHECK: r17:16 = vmaxuw(r21:20, r31:30)
+0x70 0xd4 0xde 0xd3
+# CHECK: r17:16 = vmaxw(r21:20, r31:30)
+
+# Vector minimum bytes
+0x10 0xd4 0xbe 0xd3
+# CHECK: r17:16 = vminub(r21:20, r31:30)
+0xf0 0xd4 0xde 0xd3
+# CHECK: r17:16 = vminb(r21:20, r31:30)
+
+# Vector minimum halfwords
+0x30 0xd4 0xbe 0xd3
+# CHECK: r17:16 = vminh(r21:20, r31:30)
+0x50 0xd4 0xbe 0xd3
+# CHECK: r17:16 = vminuh(r21:20, r31:30)
+
+# Vector reduce minimum halfwords
+0xbf 0xd0 0x34 0xcb
+# CHECK: r17:16 = vrminh(r21:20, r31)
+0xbf 0xf0 0x34 0xcb
+# CHECK: r17:16 = vrminuh(r21:20, r31)
+
+# Vector reduce minimum words
+0xdf 0xd0 0x34 0xcb
+# CHECK: r17:16 = vrminw(r21:20, r31)
+0xdf 0xf0 0x34 0xcb
+# CHECK: r17:16 = vrminuw(r21:20, r31)
+
+# Vector minimum words
+0x70 0xd4 0xbe 0xd3
+# CHECK: r17:16 = vminw(r21:20, r31:30)
+0x90 0xd4 0xbe 0xd3
+# CHECK: r17:16 = vminuw(r21:20, r31:30)
+
+# Vector sum of absolute differences unsigned bytes
+0x50 0xde 0x54 0xe8
+# CHECK: r17:16 = vrsadub(r21:20, r31:30)
+0x50 0xde 0x54 0xea
+# CHECK: r17:16 += vrsadub(r21:20, r31:30)
+
+# Vector subtract halfwords
+0x50 0xd4 0x3e 0xd3
+# CHECK: r17:16 = vsubh(r21:20, r31:30)
+0x70 0xd4 0x3e 0xd3
+# CHECK: r17:16 = vsubh(r21:20, r31:30):sat
+0x90 0xd4 0x3e 0xd3
+# CHECK: r17:16 = vsubuh(r21:20, r31:30):sat
+
+# Vector subtract bytes
+0x10 0xd4 0x3e 0xd3
+# CHECK: r17:16 = vsubub(r21:20, r31:30)
+0x30 0xd4 0x3e 0xd3
+# CHECK: r17:16 = vsubub(r21:20, r31:30):sat
+
+# Vector subtract words
+0xb0 0xd4 0x3e 0xd3
+# CHECK: r17:16 = vsubw(r21:20, r31:30)
+0xd0 0xd4 0x3e 0xd3
+# CHECK: r17:16 = vsubw(r21:20, r31:30):sat
diff --git a/test/MC/Disassembler/Hexagon/xtype_bit.txt b/test/MC/Disassembler/Hexagon/xtype_bit.txt
new file mode 100644
index 0000000..89b6906
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/xtype_bit.txt
@@ -0,0 +1,118 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.10.2 XTYPE/BIT
+
+# Count leading
+0x11 0xc0 0x54 0x88
+# CHECK: r17 = clb(r21:20)
+0x51 0xc0 0x54 0x88
+# CHECK: r17 = cl0(r21:20)
+0x91 0xc0 0x54 0x88
+# CHECK: r17 = cl1(r21:20)
+0x11 0xc0 0x74 0x88
+# CHECK: r17 = normamt(r21:20)
+0x51 0xd7 0x74 0x88
+# CHECK: r17 = add(clb(r21:20), #23)
+0x11 0xd7 0x35 0x8c
+# CHECK: r17 = add(clb(r21), #23)
+0x91 0xc0 0x15 0x8c
+# CHECK: r17 = clb(r21)
+0xb1 0xc0 0x15 0x8c
+# CHECK: r17 = cl0(r21)
+0xd1 0xc0 0x15 0x8c
+# CHECK: r17 = cl1(r21)
+0xf1 0xc0 0x15 0x8c
+# CHECK: r17 = normamt(r21)
+
+# Count population
+0x71 0xc0 0x74 0x88
+# CHECK: r17 = popcount(r21:20)
+
+# Count trailing
+0x51 0xc0 0xf4 0x88
+# CHECK: r17 = ct0(r21:20)
+0x91 0xc0 0xf4 0x88
+# CHECK: r17 = ct1(r21:20)
+0x91 0xc0 0x55 0x8c
+# CHECK: r17 = ct0(r21)
+0xb1 0xc0 0x55 0x8c
+# CHECK: r17 = ct1(r21)
+
+# Extract bitfield
+0xf0 0xdf 0x54 0x81
+# CHECK: r17:16 = extractu(r21:20, #31, #23)
+0xf0 0xdf 0x54 0x8a
+# CHECK: r17:16 = extract(r21:20, #31, #23)
+0xf1 0xdf 0x55 0x8d
+# CHECK: r17 = extractu(r21, #31, #23)
+0xf1 0xdf 0xd5 0x8d
+# CHECK: r17 = extract(r21, #31, #23)
+0x10 0xde 0x14 0xc1
+# CHECK: r17:16 = extractu(r21:20, r31:30)
+0x90 0xde 0xd4 0xc1
+# CHECK: r17:16 = extract(r21:20, r31:30)
+0x11 0xde 0x15 0xc9
+# CHECK: r17 = extractu(r21, r31:30)
+0x51 0xde 0x15 0xc9
+# CHECK: r17 = extract(r21, r31:30)
+
+# Insert bitfield
+0xf0 0xdf 0x54 0x83
+# CHECK: r17:16 = insert(r21:20, #31, #23)
+0xf1 0xdf 0x55 0x8f
+# CHECK: r17 = insert(r21, #31, #23)
+0x11 0xde 0x15 0xc8
+# CHECK: r17 = insert(r21, r31:30)
+0x10 0xde 0x14 0xca
+# CHECK: r17:16 = insert(r21:20, r31:30)
+
+# Interleave/deinterleave
+0x90 0xc0 0xd4 0x80
+# CHECK: r17:16 = deinterleave(r21:20)
+0xb0 0xc0 0xd4 0x80
+# CHECK: r17:16 = interleave(r21:20)
+
+# Linear feedback-shift iteration
+0xd0 0xde 0x94 0xc1
+# CHECK: r17:16 = lfs(r21:20, r31:30)
+
+# Masked parity
+0x11 0xde 0x14 0xd0
+# CHECK: r17 = parity(r21:20, r31:30)
+0x11 0xdf 0xf5 0xd5
+# CHECK: r17 = parity(r21, r31)
+
+# Bit reverse
+0xd0 0xc0 0xd4 0x80
+# CHECK: r17:16 = brev(r21:20)
+0xd1 0xc0 0x55 0x8c
+# CHECK: r17 = brev(r21)
+
+# Set/clear/toggle bit
+0x11 0xdf 0xd5 0x8c
+# CHECK: r17 = setbit(r21, #31)
+0x31 0xdf 0xd5 0x8c
+# CHECK: r17 = clrbit(r21, #31)
+0x51 0xdf 0xd5 0x8c
+# CHECK: r17 = togglebit(r21, #31)
+0x11 0xdf 0x95 0xc6
+# CHECK: r17 = setbit(r21, r31)
+0x51 0xdf 0x95 0xc6
+# CHECK: r17 = clrbit(r21, r31)
+0x91 0xdf 0x95 0xc6
+# CHECK: r17 = togglebit(r21, r31)
+
+# Split bitfield
+0x90 0xdf 0xd5 0x88
+# CHECK: r17:16 = bitsplit(r21, #31)
+0x10 0xdf 0x35 0xd4
+# CHECK: r17:16 = bitsplit(r21, r31)
+
+# Table index
+0xf1 0xcd 0x15 0x87
+# CHECK: r17 = tableidxb(r21, #7, #13):raw
+0xf1 0xcd 0x55 0x87
+# CHECK: r17 = tableidxh(r21, #7, #13):raw
+0xf1 0xcd 0x95 0x87
+# CHECK: r17 = tableidxw(r21, #7, #13):raw
+0xf1 0xcd 0xd5 0x87
+# CHECK: r17 = tableidxd(r21, #7, #13):raw
diff --git a/test/MC/Disassembler/Hexagon/xtype_complex.txt b/test/MC/Disassembler/Hexagon/xtype_complex.txt
new file mode 100644
index 0000000..2332082
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/xtype_complex.txt
@@ -0,0 +1,128 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.10.3 XTYPE/COMPLEX
+
+# Complex add/sub halfwords
+0x90 0xde 0x54 0xc1
+# CHECK: r17:16 = vxaddsubh(r21:20, r31:30):sat
+0xd0 0xde 0x54 0xc1
+# CHECK: r17:16 = vxsubaddh(r21:20, r31:30):sat
+0x10 0xde 0xd4 0xc1
+# CHECK: r17:16 = vxaddsubh(r21:20, r31:30):rnd:>>1:sat
+0x50 0xde 0xd4 0xc1
+# CHECK: r17:16 = vxsubaddh(r21:20, r31:30):rnd:>>1:sat
+
+# Complex add/sub words
+0x10 0xde 0x54 0xc1
+# CHECK: r17:16 = vxaddsubw(r21:20, r31:30):sat
+0x50 0xde 0x54 0xc1
+# CHECK: r17:16 = vxsubaddw(r21:20, r31:30):sat
+
+# Complex multiply
+0xd0 0xdf 0x15 0xe5
+# CHECK: r17:16 = cmpy(r21, r31):sat
+0xd0 0xdf 0x95 0xe5
+# CHECK: r17:16 = cmpy(r21, r31):<<1:sat
+0xd0 0xdf 0x55 0xe5
+# CHECK: r17:16 = cmpy(r21, r31*):sat
+0xd0 0xdf 0xd5 0xe5
+# CHECK: r17:16 = cmpy(r21, r31*):<<1:sat
+0xd0 0xdf 0x15 0xe7
+# CHECK: r17:16 += cmpy(r21, r31):sat
+0xd0 0xdf 0x95 0xe7
+# CHECK: r17:16 += cmpy(r21, r31):<<1:sat
+0xf0 0xdf 0x15 0xe7
+# CHECK: r17:16 -= cmpy(r21, r31):sat
+0xf0 0xdf 0x95 0xe7
+# CHECK: r17:16 -= cmpy(r21, r31):<<1:sat
+0xd0 0xdf 0x55 0xe7
+# CHECK: r17:16 += cmpy(r21, r31*):sat
+0xd0 0xdf 0xd5 0xe7
+# CHECK: r17:16 += cmpy(r21, r31*):<<1:sat
+0xf0 0xdf 0x55 0xe7
+# CHECK: r17:16 -= cmpy(r21, r31*):sat
+0xf0 0xdf 0xd5 0xe7
+# CHECK: r17:16 -= cmpy(r21, r31*):<<1:sat
+
+# Complex multiply real or imaginary
+0x30 0xdf 0x15 0xe5
+# CHECK: r17:16 = cmpyi(r21, r31)
+0x50 0xdf 0x15 0xe5
+# CHECK: r17:16 = cmpyr(r21, r31)
+0x30 0xdf 0x15 0xe7
+# CHECK: r17:16 += cmpyi(r21, r31)
+0x50 0xdf 0x15 0xe7
+# CHECK: r17:16 += cmpyr(r21, r31)
+
+# Complex multiply with round and pack
+0xd1 0xdf 0x35 0xed
+# CHECK: r17 = cmpy(r21, r31):rnd:sat
+0xd1 0xdf 0xb5 0xed
+# CHECK: r17 = cmpy(r21, r31):<<1:rnd:sat
+0xd1 0xdf 0x75 0xed
+# CHECK: r17 = cmpy(r21, r31*):rnd:sat
+0xd1 0xdf 0xf5 0xed
+# CHECK: r17 = cmpy(r21, r31*):<<1:rnd:sat
+
+# Complex multiply 32x16
+0x91 0xdf 0x14 0xc5
+# CHECK: r17 = cmpyiwh(r21:20, r31):<<1:rnd:sat
+0xb1 0xdf 0x14 0xc5
+# CHECK: r17 = cmpyiwh(r21:20, r31*):<<1:rnd:sat
+0xd1 0xdf 0x14 0xc5
+# CHECK: r17 = cmpyrwh(r21:20, r31):<<1:rnd:sat
+0xf1 0xdf 0x14 0xc5
+# CHECK: r17 = cmpyrwh(r21:20, r31*):<<1:rnd:sat
+
+# Vector complex multiply real or imaginary
+0xd0 0xde 0x34 0xe8
+# CHECK: r17:16 = vcmpyr(r21:20, r31:30):sat
+0xd0 0xde 0xb4 0xe8
+# CHECK: r17:16 = vcmpyr(r21:20, r31:30):<<1:sat
+0xd0 0xde 0x54 0xe8
+# CHECK: r17:16 = vcmpyi(r21:20, r31:30):sat
+0xd0 0xde 0xd4 0xe8
+# CHECK: r17:16 = vcmpyi(r21:20, r31:30):<<1:sat
+0x90 0xde 0x34 0xea
+# CHECK: r17:16 += vcmpyr(r21:20, r31:30):sat
+0x90 0xde 0x54 0xea
+# CHECK: r17:16 += vcmpyi(r21:20, r31:30):sat
+
+# Vector complex conjugate
+0xf0 0xc0 0x94 0x80
+# CHECK: r17:16 = vconj(r21:20):sat
+
+# Vector complex rotate
+0x10 0xdf 0xd4 0xc3
+# CHECK: r17:16 = vcrotate(r21:20, r31)
+
+# Vector reduce complex multiply real or imaginary
+0x10 0xde 0x14 0xe8
+# CHECK: r17:16 = vrcmpyi(r21:20, r31:30)
+0x30 0xde 0x14 0xe8
+# CHECK: r17:16 = vrcmpyr(r21:20, r31:30)
+0x10 0xde 0x54 0xe8
+# CHECK: r17:16 = vrcmpyi(r21:20, r31:30*)
+0x30 0xde 0x74 0xe8
+# CHECK: r17:16 = vrcmpyr(r21:20, r31:30*)
+
+# Vector reduce complex multiply by scalar
+0x90 0xde 0xb4 0xe8
+# CHECK: r17:16 = vrcmpys(r21:20, r31:30):<<1:sat:raw:hi
+0x90 0xde 0xf4 0xe8
+# CHECK: r17:16 = vrcmpys(r21:20, r31:30):<<1:sat:raw:lo
+0x90 0xde 0xb4 0xea
+# CHECK: r17:16 += vrcmpys(r21:20, r31:30):<<1:sat:raw:hi
+0x90 0xde 0xf4 0xea
+# CHECK: r17:16 += vrcmpys(r21:20, r31:30):<<1:sat:raw:lo
+
+# Vector reduce complex multiply by scalar with round and pack
+0xd1 0xde 0xb4 0xe9
+# CHECK: r17 = vrcmpys(r21:20, r31:30):<<1:rnd:sat:raw:hi
+0xf1 0xde 0xb4 0xe9
+# CHECK: r17 = vrcmpys(r21:20, r31:30):<<1:rnd:sat:raw:lo
+
+# Vector reduce complex rotate
+0xf0 0xff 0xd4 0xc3
+# CHECK: r17:16 = vrcrotate(r21:20, r31, #3)
+0x30 0xff 0xb4 0xcb
+# CHECK: r17:16 += vrcrotate(r21:20, r31, #3)
diff --git a/test/MC/Disassembler/Hexagon/xtype_fp.txt b/test/MC/Disassembler/Hexagon/xtype_fp.txt
new file mode 100644
index 0000000..7007420
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/xtype_fp.txt
@@ -0,0 +1,146 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.10.4 XTYPE/FP
+
+# Floating point addition
+0x11 0xdf 0x15 0xeb
+# CHECK: r17 = sfadd(r21, r31)
+
+# Classify floating-point value
+0x03 0xd5 0xf1 0x85
+# CHECK: p3 = sfclass(r17, #21)
+0xb3 0xc2 0x90 0xdc
+# CHECK: p3 = dfclass(r17:16, #21)
+
+# Compare floating-point value
+0x03 0xd5 0xf1 0xc7
+# CHECK: p3 = sfcmp.ge(r17, r21)
+0x23 0xd5 0xf1 0xc7
+# CHECK: p3 = sfcmp.uo(r17, r21)
+0x63 0xd5 0xf1 0xc7
+# CHECK: p3 = sfcmp.eq(r17, r21)
+0x83 0xd5 0xf1 0xc7
+# CHECK: p3 = sfcmp.gt(r17, r21)
+0x03 0xd4 0xf0 0xd2
+# CHECK: p3 = dfcmp.eq(r17:16, r21:20)
+0x23 0xd4 0xf0 0xd2
+# CHECK: p3 = dfcmp.gt(r17:16, r21:20)
+0x43 0xd4 0xf0 0xd2
+# CHECK: p3 = dfcmp.ge(r17:16, r21:20)
+0x63 0xd4 0xf0 0xd2
+# CHECK: p3 = dfcmp.uo(r17:16, r21:20)
+
+# Convert floating-point value to other format
+0x10 0xc0 0x95 0x84
+# CHECK: r17:16 = convert_sf2df(r21)
+0x31 0xc0 0x14 0x88
+# CHECK: r17 = convert_df2sf(r21:20)
+
+# Convert integer to floating-point value
+0x50 0xc0 0xf4 0x80
+# CHECK: r17:16 = convert_ud2df(r21:20)
+0x70 0xc0 0xf4 0x80
+# CHECK: r17:16 = convert_d2df(r21:20)
+0x30 0xc0 0x95 0x84
+# CHECK: r17:16 = convert_uw2df(r21)
+0x50 0xc0 0x95 0x84
+# CHECK: r17:16 = convert_w2df(r21)
+0x31 0xc0 0x34 0x88
+# CHECK: r17 = convert_ud2sf(r21:20)
+0x31 0xc0 0x54 0x88
+# CHECK: r17 = convert_d2sf(r21:20)
+0x11 0xc0 0x35 0x8b
+# CHECK: r17 = convert_uw2sf(r21)
+0x11 0xc0 0x55 0x8b
+# CHECK: r17 = convert_w2sf(r21)
+
+# Convert floating-point value to integer
+0x10 0xc0 0xf4 0x80
+# CHECK: r17:16 = convert_df2d(r21:20)
+0x30 0xc0 0xf4 0x80
+# CHECK: r17:16 = convert_df2ud(r21:20)
+0xd0 0xc0 0xf4 0x80
+# CHECK: r17:16 = convert_df2d(r21:20):chop
+0xf0 0xc0 0xf4 0x80
+# CHECK: r17:16 = convert_df2ud(r21:20):chop
+0x70 0xc0 0x95 0x84
+# CHECK: r17:16 = convert_sf2ud(r21)
+0x90 0xc0 0x95 0x84
+# CHECK: r17:16 = convert_sf2d(r21)
+0xb0 0xc0 0x95 0x84
+# CHECK: r17:16 = convert_sf2ud(r21):chop
+0xd0 0xc0 0x95 0x84
+# CHECK: r17:16 = convert_sf2d(r21):chop
+0x31 0xc0 0x74 0x88
+# CHECK: r17 = convert_df2uw(r21:20)
+0x31 0xc0 0x94 0x88
+# CHECK: r17 = convert_df2w(r21:20)
+0x31 0xc0 0xb4 0x88
+# CHECK: r17 = convert_df2uw(r21:20):chop
+0x31 0xc0 0xf4 0x88
+# CHECK: r17 = convert_df2w(r21:20):chop
+0x11 0xc0 0x75 0x8b
+# CHECK: r17 = convert_sf2uw(r21)
+0x31 0xc0 0x75 0x8b
+# CHECK: r17 = convert_sf2uw(r21):chop
+0x11 0xc0 0x95 0x8b
+# CHECK: r17 = convert_sf2w(r21)
+0x31 0xc0 0x95 0x8b
+# CHECK: r17 = convert_sf2w(r21):chop
+
+# Floating point extreme value assistance
+0x11 0xc0 0xb5 0x8b
+# CHECK: r17 = sffixupr(r21)
+0x11 0xdf 0xd5 0xeb
+# CHECK: r17 = sffixupn(r21, r31)
+0x31 0xdf 0xd5 0xeb
+# CHECK: r17 = sffixupd(r21, r31)
+
+# Floating point fused multiply-add
+0x91 0xdf 0x15 0xef
+# CHECK: r17 += sfmpy(r21, r31)
+0xb1 0xdf 0x15 0xef
+# CHECK: r17 -= sfmpy(r21, r31)
+
+# Floating point fused multiply-add with scaling
+0xf1 0xdf 0x75 0xef
+# CHECK: r17 += sfmpy(r21, r31, p3):scale
+
+# Floating point reciprocal square root approximation
+0x71 0xc0 0xf5 0x8b
+# CHECK: r17, p3 = sfinvsqrta(r21)
+
+# Floating point fused multiply-add for library routines
+0xd1 0xdf 0x15 0xef
+# CHECK: r17 += sfmpy(r21, r31):lib
+0xf1 0xdf 0x15 0xef
+# CHECK: r17 -= sfmpy(r21, r31):lib
+
+# Create floating-point constant
+0xb1 0xc2 0x00 0xd6
+# CHECK: r17 = sfmake(#21):pos
+0xb1 0xc2 0x40 0xd6
+# CHECK: r17 = sfmake(#21):neg
+0xb0 0xc2 0x00 0xd9
+# CHECK: r17:16 = dfmake(#21):pos
+0xb0 0xc2 0x40 0xd9
+# CHECK: r17:16 = dfmake(#21):neg
+
+# Floating point maximum
+0x11 0xdf 0x95 0xeb
+# CHECK: r17 = sfmax(r21, r31)
+
+# Floating point minimum
+0x31 0xdf 0x95 0xeb
+# CHECK: r17 = sfmin(r21, r31)
+
+# Floating point multiply
+0x11 0xdf 0x55 0xeb
+# CHECK: r17 = sfmpy(r21, r31)
+
+# Floating point reciprocal approximation
+0xf1 0xdf 0xf5 0xeb
+# CHECK: r17, p3 = sfrecipa(r21, r31)
+
+# Floating point subtraction
+0x31 0xdf 0x15 0xeb
+# CHECK: r17 = sfsub(r21, r31)
diff --git a/test/MC/Disassembler/Hexagon/xtype_mpy.txt b/test/MC/Disassembler/Hexagon/xtype_mpy.txt
new file mode 100644
index 0000000..ada3216
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/xtype_mpy.txt
@@ -0,0 +1,400 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.10.5 XTYPE/MPY
+
+# Multiply and use lower result
+0xb1 0xdf 0x35 0xd7
+# CHECK: r17 = add(#21, mpyi(r21, r31))
+0xbf 0xd1 0x35 0xd8
+# CHECK: r17 = add(#21, mpyi(r21, #31))
+0xb5 0xd1 0x3f 0xdf
+# CHECK: r17 = add(r21, mpyi(#84, r31))
+0xf5 0xf1 0xb5 0xdf
+# CHECK: r17 = add(r21, mpyi(r21, #31))
+0x15 0xd1 0x1f 0xe3
+# CHECK: r17 = add(r21, mpyi(r17, r31))
+0xf1 0xc3 0x15 0xe0
+# CHECK: r17 =+ mpyi(r21, #31)
+0xf1 0xc3 0x95 0xe0
+# CHECK: r17 =- mpyi(r21, #31)
+0xf1 0xc3 0x15 0xe1
+# CHECK: r17 += mpyi(r21, #31)
+0xf1 0xc3 0x95 0xe1
+# CHECK: r17 -= mpyi(r21, #31)
+0x11 0xdf 0x15 0xed
+# CHECK: r17 = mpyi(r21, r31)
+0x11 0xdf 0x15 0xef
+# CHECK: r17 += mpyi(r21, r31)
+
+# Vector multiply word by signed half (32x16)
+0xb0 0xde 0x14 0xe8
+# CHECK: r17:16 = vmpyweh(r21:20, r31:30):sat
+0xb0 0xde 0x94 0xe8
+# CHECK: r17:16 = vmpyweh(r21:20, r31:30):<<1:sat
+0xf0 0xde 0x14 0xe8
+# CHECK: r17:16 = vmpywoh(r21:20, r31:30):sat
+0xf0 0xde 0x94 0xe8
+# CHECK: r17:16 = vmpywoh(r21:20, r31:30):<<1:sat
+0xb0 0xde 0x34 0xe8
+# CHECK: r17:16 = vmpyweh(r21:20, r31:30):rnd:sat
+0xb0 0xde 0xb4 0xe8
+# CHECK: r17:16 = vmpyweh(r21:20, r31:30):<<1:rnd:sat
+0xf0 0xde 0x34 0xe8
+# CHECK: r17:16 = vmpywoh(r21:20, r31:30):rnd:sat
+0xf0 0xde 0xb4 0xe8
+# CHECK: r17:16 = vmpywoh(r21:20, r31:30):<<1:rnd:sat
+0xb0 0xde 0x14 0xea
+# CHECK: r17:16 += vmpyweh(r21:20, r31:30):sat
+0xb0 0xde 0x94 0xea
+# CHECK: r17:16 += vmpyweh(r21:20, r31:30):<<1:sat
+0xf0 0xde 0x14 0xea
+# CHECK: r17:16 += vmpywoh(r21:20, r31:30):sat
+0xf0 0xde 0x94 0xea
+# CHECK: r17:16 += vmpywoh(r21:20, r31:30):<<1:sat
+0xb0 0xde 0x34 0xea
+# CHECK: r17:16 += vmpyweh(r21:20, r31:30):rnd:sat
+0xb0 0xde 0xb4 0xea
+# CHECK: r17:16 += vmpyweh(r21:20, r31:30):<<1:rnd:sat
+0xf0 0xde 0x34 0xea
+# CHECK: r17:16 += vmpywoh(r21:20, r31:30):rnd:sat
+0xf0 0xde 0xb4 0xea
+# CHECK: r17:16 += vmpywoh(r21:20, r31:30):<<1:rnd:sat
+
+# Vector multiply word by unsigned half (32x16)
+0xb0 0xde 0x54 0xe8
+# CHECK: r17:16 = vmpyweuh(r21:20, r31:30):sat
+0xb0 0xde 0xd4 0xe8
+# CHECK: r17:16 = vmpyweuh(r21:20, r31:30):<<1:sat
+0xf0 0xde 0x54 0xe8
+# CHECK: r17:16 = vmpywouh(r21:20, r31:30):sat
+0xf0 0xde 0xd4 0xe8
+# CHECK: r17:16 = vmpywouh(r21:20, r31:30):<<1:sat
+0xb0 0xde 0x74 0xe8
+# CHECK: r17:16 = vmpyweuh(r21:20, r31:30):rnd:sat
+0xb0 0xde 0xf4 0xe8
+# CHECK: r17:16 = vmpyweuh(r21:20, r31:30):<<1:rnd:sat
+0xf0 0xde 0x74 0xe8
+# CHECK: r17:16 = vmpywouh(r21:20, r31:30):rnd:sat
+0xf0 0xde 0xf4 0xe8
+# CHECK: r17:16 = vmpywouh(r21:20, r31:30):<<1:rnd:sat
+0xb0 0xde 0x54 0xea
+# CHECK: r17:16 += vmpyweuh(r21:20, r31:30):sat
+0xb0 0xde 0xd4 0xea
+# CHECK: r17:16 += vmpyweuh(r21:20, r31:30):<<1:sat
+0xf0 0xde 0x54 0xea
+# CHECK: r17:16 += vmpywouh(r21:20, r31:30):sat
+0xf0 0xde 0xd4 0xea
+# CHECK: r17:16 += vmpywouh(r21:20, r31:30):<<1:sat
+0xb0 0xde 0x74 0xea
+# CHECK: r17:16 += vmpyweuh(r21:20, r31:30):rnd:sat
+0xb0 0xde 0xf4 0xea
+# CHECK: r17:16 += vmpyweuh(r21:20, r31:30):<<1:rnd:sat
+0xf0 0xde 0x74 0xea
+# CHECK: r17:16 += vmpywouh(r21:20, r31:30):rnd:sat
+0xf0 0xde 0xf4 0xea
+# CHECK: r17:16 += vmpywouh(r21:20, r31:30):<<1:rnd:sat
+
+# Multiply signed halfwords
+0x10 0xdf 0x95 0xe4
+# CHECK: r17:16 = mpy(r21.l, r31.l):<<1
+0x30 0xdf 0x95 0xe4
+# CHECK: r17:16 = mpy(r21.l, r31.h):<<1
+0x50 0xdf 0x95 0xe4
+# CHECK: r17:16 = mpy(r21.h, r31.l):<<1
+0x70 0xdf 0x95 0xe4
+# CHECK: r17:16 = mpy(r21.h, r31.h):<<1
+0x10 0xdf 0xb5 0xe4
+# CHECK: r17:16 = mpy(r21.l, r31.l):<<1:rnd
+0x30 0xdf 0xb5 0xe4
+# CHECK: r17:16 = mpy(r21.l, r31.h):<<1:rnd
+0x50 0xdf 0xb5 0xe4
+# CHECK: r17:16 = mpy(r21.h, r31.l):<<1:rnd
+0x70 0xdf 0xb5 0xe4
+# CHECK: r17:16 = mpy(r21.h, r31.h):<<1:rnd
+0x10 0xdf 0x95 0xe6
+# CHECK: r17:16 += mpy(r21.l, r31.l):<<1
+0x30 0xdf 0x95 0xe6
+# CHECK: r17:16 += mpy(r21.l, r31.h):<<1
+0x50 0xdf 0x95 0xe6
+# CHECK: r17:16 += mpy(r21.h, r31.l):<<1
+0x70 0xdf 0x95 0xe6
+# CHECK: r17:16 += mpy(r21.h, r31.h):<<1
+0x10 0xdf 0xb5 0xe6
+# CHECK: r17:16 -= mpy(r21.l, r31.l):<<1
+0x30 0xdf 0xb5 0xe6
+# CHECK: r17:16 -= mpy(r21.l, r31.h):<<1
+0x50 0xdf 0xb5 0xe6
+# CHECK: r17:16 -= mpy(r21.h, r31.l):<<1
+0x70 0xdf 0xb5 0xe6
+# CHECK: r17:16 -= mpy(r21.h, r31.h):<<1
+0x11 0xdf 0x95 0xec
+# CHECK: r17 = mpy(r21.l, r31.l):<<1
+0x31 0xdf 0x95 0xec
+# CHECK: r17 = mpy(r21.l, r31.h):<<1
+0x51 0xdf 0x95 0xec
+# CHECK: r17 = mpy(r21.h, r31.l):<<1
+0x71 0xdf 0x95 0xec
+# CHECK: r17 = mpy(r21.h, r31.h):<<1
+0x91 0xdf 0x95 0xec
+# CHECK: r17 = mpy(r21.l, r31.l):<<1:sat
+0xb1 0xdf 0x95 0xec
+# CHECK: r17 = mpy(r21.l, r31.h):<<1:sat
+0xd1 0xdf 0x95 0xec
+# CHECK: r17 = mpy(r21.h, r31.l):<<1:sat
+0xf1 0xdf 0x95 0xec
+# CHECK: r17 = mpy(r21.h, r31.h):<<1:sat
+0x11 0xdf 0xb5 0xec
+# CHECK: r17 = mpy(r21.l, r31.l):<<1:rnd
+0x31 0xdf 0xb5 0xec
+# CHECK: r17 = mpy(r21.l, r31.h):<<1:rnd
+0x51 0xdf 0xb5 0xec
+# CHECK: r17 = mpy(r21.h, r31.l):<<1:rnd
+0x71 0xdf 0xb5 0xec
+# CHECK: r17 = mpy(r21.h, r31.h):<<1:rnd
+0x91 0xdf 0xb5 0xec
+# CHECK: r17 = mpy(r21.l, r31.l):<<1:rnd:sat
+0xb1 0xdf 0xb5 0xec
+# CHECK: r17 = mpy(r21.l, r31.h):<<1:rnd:sat
+0xd1 0xdf 0xb5 0xec
+# CHECK: r17 = mpy(r21.h, r31.l):<<1:rnd:sat
+0xf1 0xdf 0xb5 0xec
+# CHECK: r17 = mpy(r21.h, r31.h):<<1:rnd:sat
+0x11 0xdf 0x95 0xee
+# CHECK: r17 += mpy(r21.l, r31.l):<<1
+0x31 0xdf 0x95 0xee
+# CHECK: r17 += mpy(r21.l, r31.h):<<1
+0x51 0xdf 0x95 0xee
+# CHECK: r17 += mpy(r21.h, r31.l):<<1
+0x71 0xdf 0x95 0xee
+# CHECK: r17 += mpy(r21.h, r31.h):<<1
+0x91 0xdf 0x95 0xee
+# CHECK: r17 += mpy(r21.l, r31.l):<<1:sat
+0xb1 0xdf 0x95 0xee
+# CHECK: r17 += mpy(r21.l, r31.h):<<1:sat
+0xd1 0xdf 0x95 0xee
+# CHECK: r17 += mpy(r21.h, r31.l):<<1:sat
+0xf1 0xdf 0x95 0xee
+# CHECK: r17 += mpy(r21.h, r31.h):<<1:sat
+0x11 0xdf 0xb5 0xee
+# CHECK: r17 -= mpy(r21.l, r31.l):<<1
+0x31 0xdf 0xb5 0xee
+# CHECK: r17 -= mpy(r21.l, r31.h):<<1
+0x51 0xdf 0xb5 0xee
+# CHECK: r17 -= mpy(r21.h, r31.l):<<1
+0x71 0xdf 0xb5 0xee
+# CHECK: r17 -= mpy(r21.h, r31.h):<<1
+0x91 0xdf 0xb5 0xee
+# CHECK: r17 -= mpy(r21.l, r31.l):<<1:sat
+0xb1 0xdf 0xb5 0xee
+# CHECK: r17 -= mpy(r21.l, r31.h):<<1:sat
+0xd1 0xdf 0xb5 0xee
+# CHECK: r17 -= mpy(r21.h, r31.l):<<1:sat
+0xf1 0xdf 0xb5 0xee
+# CHECK: r17 -= mpy(r21.h, r31.h):<<1:sat
+
+# Multiply unsigned halfwords
+0x10 0xdf 0xd5 0xe4
+# CHECK: r17:16 = mpyu(r21.l, r31.l):<<1
+0x30 0xdf 0xd5 0xe4
+# CHECK: r17:16 = mpyu(r21.l, r31.h):<<1
+0x50 0xdf 0xd5 0xe4
+# CHECK: r17:16 = mpyu(r21.h, r31.l):<<1
+0x70 0xdf 0xd5 0xe4
+# CHECK: r17:16 = mpyu(r21.h, r31.h):<<1
+0x10 0xdf 0xd5 0xe6
+# CHECK: r17:16 += mpyu(r21.l, r31.l):<<1
+0x30 0xdf 0xd5 0xe6
+# CHECK: r17:16 += mpyu(r21.l, r31.h):<<1
+0x50 0xdf 0xd5 0xe6
+# CHECK: r17:16 += mpyu(r21.h, r31.l):<<1
+0x70 0xdf 0xd5 0xe6
+# CHECK: r17:16 += mpyu(r21.h, r31.h):<<1
+0x10 0xdf 0xf5 0xe6
+# CHECK: r17:16 -= mpyu(r21.l, r31.l):<<1
+0x30 0xdf 0xf5 0xe6
+# CHECK: r17:16 -= mpyu(r21.l, r31.h):<<1
+0x50 0xdf 0xf5 0xe6
+# CHECK: r17:16 -= mpyu(r21.h, r31.l):<<1
+0x70 0xdf 0xf5 0xe6
+# CHECK: r17:16 -= mpyu(r21.h, r31.h):<<1
+0x11 0xdf 0xd5 0xec
+# CHECK: r17 = mpyu(r21.l, r31.l):<<1
+0x31 0xdf 0xd5 0xec
+# CHECK: r17 = mpyu(r21.l, r31.h):<<1
+0x51 0xdf 0xd5 0xec
+# CHECK: r17 = mpyu(r21.h, r31.l):<<1
+0x71 0xdf 0xd5 0xec
+# CHECK: r17 = mpyu(r21.h, r31.h):<<1
+0x11 0xdf 0xd5 0xee
+# CHECK: r17 += mpyu(r21.l, r31.l):<<1
+0x31 0xdf 0xd5 0xee
+# CHECK: r17 += mpyu(r21.l, r31.h):<<1
+0x51 0xdf 0xd5 0xee
+# CHECK: r17 += mpyu(r21.h, r31.l):<<1
+0x71 0xdf 0xd5 0xee
+# CHECK: r17 += mpyu(r21.h, r31.h):<<1
+0x11 0xdf 0xf5 0xee
+# CHECK: r17 -= mpyu(r21.l, r31.l):<<1
+0x31 0xdf 0xf5 0xee
+# CHECK: r17 -= mpyu(r21.l, r31.h):<<1
+0x51 0xdf 0xf5 0xee
+# CHECK: r17 -= mpyu(r21.h, r31.l):<<1
+0x71 0xdf 0xf5 0xee
+# CHECK: r17 -= mpyu(r21.h, r31.h):<<1
+
+# Polynomial multiply words
+0xf0 0xdf 0x55 0xe5
+# CHECK: r17:16 = pmpyw(r21, r31)
+0xf0 0xdf 0x35 0xe7
+# CHECK: r17:16 ^= pmpyw(r21, r31)
+
+# Vector reduce multiply word by signed half (32x16)
+0x50 0xde 0x34 0xe8
+# CHECK: r17:16 = vrmpywoh(r21:20, r31:30)
+0x50 0xde 0xb4 0xe8
+# CHECK: r17:16 = vrmpywoh(r21:20, r31:30):<<1
+0x90 0xde 0x54 0xe8
+# CHECK: r17:16 = vrmpyweh(r21:20, r31:30)
+0x90 0xde 0xd4 0xe8
+# CHECK: r17:16 = vrmpyweh(r21:20, r31:30):<<1
+0xd0 0xde 0x74 0xea
+# CHECK: r17:16 += vrmpywoh(r21:20, r31:30)
+0xd0 0xde 0xf4 0xea
+# CHECK: r17:16 += vrmpywoh(r21:20, r31:30):<<1
+0xd0 0xde 0x34 0xea
+# CHECK: r17:16 += vrmpyweh(r21:20, r31:30)
+0xd0 0xde 0xb4 0xea
+# CHECK: r17:16 += vrmpyweh(r21:20, r31:30):<<1
+
+# Multiply and use upper result
+0x31 0xdf 0x15 0xed
+# CHECK: r17 = mpy(r21, r31)
+0x31 0xdf 0x35 0xed
+# CHECK: r17 = mpy(r21, r31):rnd
+0x31 0xdf 0x55 0xed
+# CHECK: r17 = mpyu(r21, r31)
+0x31 0xdf 0x75 0xed
+# CHECK: r17 = mpysu(r21, r31)
+0x11 0xdf 0xb5 0xed
+# CHECK: r17 = mpy(r21, r31.h):<<1:sat
+0x31 0xdf 0xb5 0xed
+# CHECK: r17 = mpy(r21, r31.l):<<1:sat
+0x91 0xdf 0xb5 0xed
+# CHECK: r17 = mpy(r21, r31.h):<<1:rnd:sat
+0x11 0xdf 0xf5 0xed
+# CHECK: r17 = mpy(r21, r31):<<1:sat
+0x91 0xdf 0xf5 0xed
+# CHECK: r17 = mpy(r21, r31.l):<<1:rnd:sat
+0x51 0xdf 0xb5 0xed
+# CHECK: r17 = mpy(r21, r31):<<1
+0x11 0xdf 0x75 0xef
+# CHECK: r17 += mpy(r21, r31):<<1:sat
+0x31 0xdf 0x75 0xef
+# CHECK: r17 -= mpy(r21, r31):<<1:sat
+
+# Multiply and use full result
+0x10 0xdf 0x15 0xe5
+# CHECK: r17:16 = mpy(r21, r31)
+0x10 0xdf 0x55 0xe5
+# CHECK: r17:16 = mpyu(r21, r31)
+0x10 0xdf 0x15 0xe7
+# CHECK: r17:16 += mpy(r21, r31)
+0x10 0xdf 0x35 0xe7
+# CHECK: r17:16 -= mpy(r21, r31)
+0x10 0xdf 0x55 0xe7
+# CHECK: r17:16 += mpyu(r21, r31)
+0x10 0xdf 0x75 0xe7
+# CHECK: r17:16 -= mpyu(r21, r31)
+
+# Vector dual multiply
+0x90 0xde 0x14 0xe8
+# CHECK: r17:16 = vdmpy(r21:20, r31:30):sat
+0x90 0xde 0x94 0xe8
+# CHECK: r17:16 = vdmpy(r21:20, r31:30):<<1:sat
+0x90 0xde 0x14 0xea
+# CHECK: r17:16 += vdmpy(r21:20, r31:30):sat
+0x90 0xde 0x94 0xea
+# CHECK: r17:16 += vdmpy(r21:20, r31:30):<<1:sat
+
+# Vector dual multiply with round and pack
+0x11 0xde 0x14 0xe9
+# CHECK: r17 = vdmpy(r21:20, r31:30):rnd:sat
+0x11 0xde 0x94 0xe9
+# CHECK: r17 = vdmpy(r21:20, r31:30):<<1:rnd:sat
+
+# Vector reduce multiply bytes
+0x30 0xde 0x94 0xe8
+# CHECK: r17:16 = vrmpybu(r21:20, r31:30)
+0x30 0xde 0xd4 0xe8
+# CHECK: r17:16 = vrmpybsu(r21:20, r31:30)
+0x30 0xde 0x94 0xea
+# CHECK: r17:16 += vrmpybu(r21:20, r31:30)
+0x30 0xde 0xd4 0xea
+# CHECK: r17:16 += vrmpybsu(r21:20, r31:30)
+
+# Vector dual multiply signed by unsigned bytes
+0x30 0xde 0xb4 0xe8
+# CHECK: r17:16 = vdmpybsu(r21:20, r31:30):sat
+0x30 0xde 0x34 0xea
+# CHECK: r17:16 += vdmpybsu(r21:20, r31:30):sat
+
+# Vector multiply even haldwords
+0xd0 0xde 0x14 0xe8
+# CHECK: r17:16 = vmpyeh(r21:20, r31:30):sat
+0xd0 0xde 0x94 0xe8
+# CHECK: r17:16 = vmpyeh(r21:20, r31:30):<<1:sat
+0x50 0xde 0x34 0xea
+# CHECK: r17:16 += vmpyeh(r21:20, r31:30)
+0xd0 0xde 0x14 0xea
+# CHECK: r17:16 += vmpyeh(r21:20, r31:30):sat
+0xd0 0xde 0x94 0xea
+# CHECK: r17:16 += vmpyeh(r21:20, r31:30):<<1:sat
+
+# Vector multiply halfwords
+0xb0 0xdf 0x15 0xe5
+# CHECK: r17:16 = vmpyh(r21, r31):sat
+0xb0 0xdf 0x95 0xe5
+# CHECK: r17:16 = vmpyh(r21, r31):<<1:sat
+0x30 0xdf 0x35 0xe7
+# CHECK: r17:16 += vmpyh(r21, r31)
+0xb0 0xdf 0x15 0xe7
+# CHECK: r17:16 += vmpyh(r21, r31):sat
+0xb0 0xdf 0x95 0xe7
+# CHECK: r17:16 += vmpyh(r21, r31):<<1:sat
+
+# Vector multiply halfwords with round and pack
+0xf1 0xdf 0x35 0xed
+# CHECK: r17 = vmpyh(r21, r31):rnd:sat
+0xf1 0xdf 0xb5 0xed
+# CHECK: r17 = vmpyh(r21, r31):<<1:rnd:sat
+
+# Vector multiply halfwords signed by unsigned
+0xf0 0xdf 0x15 0xe5
+# CHECK: r17:16 = vmpyhsu(r21, r31):sat
+0xf0 0xdf 0x95 0xe5
+# CHECK: r17:16 = vmpyhsu(r21, r31):<<1:sat
+0xb0 0xdf 0x75 0xe7
+# CHECK: r17:16 += vmpyhsu(r21, r31):sat
+0xb0 0xdf 0xf5 0xe7
+# CHECK: r17:16 += vmpyhsu(r21, r31):<<1:sat
+
+# Vector reduce multiply halfwords
+0x50 0xde 0x14 0xe8
+# CHECK: r17:16 = vrmpyh(r21:20, r31:30)
+0x50 0xde 0x14 0xea
+# CHECK: r17:16 += vrmpyh(r21:20, r31:30)
+
+# Vector multiply bytes
+0x30 0xdf 0x55 0xe5
+# CHECK: r17:16 = vmpybsu(r21, r31)
+0x30 0xdf 0x95 0xe5
+# CHECK: r17:16 = vmpybu(r21, r31)
+0x30 0xdf 0x95 0xe7
+# CHECK: r17:16 += vmpybu(r21, r31)
+0x30 0xdf 0xd5 0xe7
+# CHECK: r17:16 += vmpybsu(r21, r31)
+
+# Vector polynomial multiply halfwords
+0xf0 0xdf 0xd5 0xe5
+# CHECK: r17:16 = vpmpyh(r21, r31)
+0xf0 0xdf 0xb5 0xe7
+# CHECK: r17:16 ^= vpmpyh(r21, r31)
diff --git a/test/MC/Disassembler/Hexagon/xtype_perm.txt b/test/MC/Disassembler/Hexagon/xtype_perm.txt
new file mode 100644
index 0000000..91d2fc5
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/xtype_perm.txt
@@ -0,0 +1,104 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.10.6 XTYPE/PERM
+
+# CABAC decode bin
+0xd0 0xde 0xd4 0xc1
+# CHECK: r17:16 = decbin(r21:20, r31:30)
+
+# Saturate
+0x11 0xc0 0xd4 0x88
+# CHECK: r17 = sat(r21:20)
+0x91 0xc0 0xd5 0x8c
+# CHECK: r17 = sath(r21)
+0xb1 0xc0 0xd5 0x8c
+# CHECK: r17 = satuh(r21)
+0xd1 0xc0 0xd5 0x8c
+# CHECK: r17 = satub(r21)
+0xf1 0xc0 0xd5 0x8c
+# CHECK: r17 = satb(r21)
+
+# Swizzle bytes
+0xf1 0xc0 0x95 0x8c
+# CHECK: r17 = swiz(r21)
+
+# Vector align
+0x70 0xd4 0x1e 0xc2
+# CHECK: r17:16 = valignb(r21:20, r31:30, p3)
+0x70 0xde 0x94 0xc2
+# CHECK: r17:16 = vspliceb(r21:20, r31:30, p3)
+
+# Vector round and pack
+0x91 0xc0 0x94 0x88
+# CHECK: r17 = vrndwh(r21:20)
+0xd1 0xc0 0x94 0x88
+# CHECK: r17 = vrndwh(r21:20):sat
+
+# Vector saturate and pack
+0x11 0xc0 0x14 0x88
+# CHECK: r17 = vsathub(r21:20)
+0x51 0xc0 0x14 0x88
+# CHECK: r17 = vsatwh(r21:20)
+0x91 0xc0 0x14 0x88
+# CHECK: r17 = vsatwuh(r21:20)
+0xd1 0xc0 0x14 0x88
+# CHECK: r17 = vsathb(r21:20)
+0x11 0xc0 0x95 0x8c
+# CHECK: r17 = vsathb(r21)
+0x51 0xc0 0x95 0x8c
+# CHECK: r17 = vsathub(r21)
+
+# Vector saturate without pack
+0x90 0xc0 0x14 0x80
+# CHECK: r17:16 = vsathub(r21:20)
+0xb0 0xc0 0x14 0x80
+# CHECK: r17:16 = vsatwuh(r21:20)
+0xd0 0xc0 0x14 0x80
+# CHECK: r17:16 = vsatwh(r21:20)
+0xf0 0xc0 0x14 0x80
+# CHECK: r17:16 = vsathb(r21:20)
+
+# Vector shuffle
+0x50 0xde 0x14 0xc1
+# CHECK: r17:16 = shuffeb(r21:20, r31:30)
+0x90 0xd4 0x1e 0xc1
+# CHECK: r17:16 = shuffob(r21:20, r31:30)
+0xd0 0xde 0x14 0xc1
+# CHECK: r17:16 = shuffeh(r21:20, r31:30)
+0x10 0xd4 0x9e 0xc1
+# CHECK: r17:16 = shuffoh(r21:20, r31:30)
+
+# Vector splat bytes
+0xf1 0xc0 0x55 0x8c
+# CHECK: r17 = vsplatb(r21)
+
+# Vector splat halfwords
+0x50 0xc0 0x55 0x84
+# CHECK: r17:16 = vsplath(r21)
+
+# Vector splice
+0x70 0xde 0x94 0xc0
+# CHECK: r17:16 = vspliceb(r21:20, r31:30, #3)
+0x70 0xde 0x94 0xc2
+# CHECK: r17:16 = vspliceb(r21:20, r31:30, p3)
+
+# Vector sign extend
+0x10 0xc0 0x15 0x84
+# CHECK: r17:16 = vsxtbh(r21)
+0x90 0xc0 0x15 0x84
+# CHECK: r17:16 = vsxthw(r21)
+
+# Vector truncate
+0x11 0xc0 0x94 0x88
+# CHECK: r17 = vtrunohb(r21:20)
+0x51 0xc0 0x94 0x88
+# CHECK: r17 = vtrunehb(r21:20)
+0x50 0xde 0x94 0xc1
+# CHECK: r17:16 = vtrunewh(r21:20, r31:30)
+0x90 0xde 0x94 0xc1
+# CHECK: r17:16 = vtrunowh(r21:20, r31:30)
+
+# Vector zero extend
+0x50 0xc0 0x15 0x84
+# CHECK: r17:16 = vzxtbh(r21)
+0xd0 0xc0 0x15 0x84
+# CHECK: r17:16 = vzxthw(r21)
diff --git a/test/MC/Disassembler/Hexagon/xtype_pred.txt b/test/MC/Disassembler/Hexagon/xtype_pred.txt
new file mode 100644
index 0000000..cec6d1b
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/xtype_pred.txt
@@ -0,0 +1,136 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.10.7 XTYPE/PRED
+
+# Bounds check
+0x83 0xf4 0x10 0xd2
+# CHECK: p3 = boundscheck(r17:16, r21:20):raw:lo
+0xa3 0xf4 0x10 0xd2
+# CHECK: p3 = boundscheck(r17:16, r21:20):raw:hi
+
+# Compare byte
+0x43 0xd5 0xd1 0xc7
+# CHECK: p3 = cmpb.gt(r17, r21)
+0xc3 0xd5 0xd1 0xc7
+# CHECK: p3 = cmpb.eq(r17, r21)
+0xe3 0xd5 0xd1 0xc7
+# CHECK: p3 = cmpb.gtu(r17, r21)
+0xa3 0xc2 0x11 0xdd
+# CHECK: p3 = cmpb.eq(r17, #21)
+0xa3 0xc2 0x31 0xdd
+# CHECK: p3 = cmpb.gt(r17, #21)
+0xa3 0xc2 0x51 0xdd
+# CHECK: p3 = cmpb.gtu(r17, #21)
+
+# Compare half
+0x63 0xd5 0xd1 0xc7
+# CHECK: p3 = cmph.eq(r17, r21)
+0x83 0xd5 0xd1 0xc7
+# CHECK: p3 = cmph.gt(r17, r21)
+0xa3 0xd5 0xd1 0xc7
+# CHECK: p3 = cmph.gtu(r17, r21)
+0xab 0xc2 0x11 0xdd
+# CHECK: p3 = cmph.eq(r17, #21)
+0xab 0xc2 0x31 0xdd
+# CHECK: p3 = cmph.gt(r17, #21)
+0xab 0xc2 0x51 0xdd
+# CHECK: p3 = cmph.gtu(r17, #21)
+
+# Compare doublewords
+0x03 0xde 0x94 0xd2
+# CHECK: p3 = cmp.eq(r21:20, r31:30)
+0x43 0xde 0x94 0xd2
+# CHECK: p3 = cmp.gt(r21:20, r31:30)
+0x83 0xde 0x94 0xd2
+# CHECK: p3 = cmp.gtu(r21:20, r31:30)
+
+# Compare bitmask
+0x03 0xd5 0x91 0x85
+# CHECK: p3 = bitsclr(r17, #21)
+0x03 0xd5 0xb1 0x85
+# CHECK: p3 = !bitsclr(r17, #21)
+0x03 0xd5 0x51 0xc7
+# CHECK: p3 = bitsset(r17, r21)
+0x03 0xd5 0x71 0xc7
+# CHECK: p3 = !bitsset(r17, r21)
+0x03 0xd5 0x91 0xc7
+# CHECK: p3 = bitsclr(r17, r21)
+0x03 0xd5 0xb1 0xc7
+# CHECK: p3 = !bitsclr(r17, r21)
+
+# mask generate from predicate
+0x10 0xc3 0x00 0x86
+# CHECK: r17:16 = mask(p3)
+
+# Check for TLB match
+0x63 0xf5 0x10 0xd2
+# CHECK: p3 = tlbmatch(r17:16, r21)
+
+# Predicate Transfer
+0x03 0xc0 0x45 0x85
+# CHECK: p3 = r5
+0x05 0xc0 0x43 0x89
+# CHECK: r5 = p3
+
+# Test bit
+0x03 0xd5 0x11 0x85
+# CHECK: p3 = tstbit(r17, #21)
+0x03 0xd5 0x31 0x85
+# CHECK: p3 = !tstbit(r17, #21)
+0x03 0xd5 0x11 0xc7
+# CHECK: p3 = tstbit(r17, r21)
+0x03 0xd5 0x31 0xc7
+# CHECK: p3 = !tstbit(r17, r21)
+
+# Vector compare halfwords
+0x63 0xde 0x14 0xd2
+# CHECK: p3 = vcmph.eq(r21:20, r31:30)
+0x83 0xde 0x14 0xd2
+# CHECK: p3 = vcmph.gt(r21:20, r31:30)
+0xa3 0xde 0x14 0xd2
+# CHECK: p3 = vcmph.gtu(r21:20, r31:30)
+0xeb 0xc3 0x14 0xdc
+# CHECK: p3 = vcmph.eq(r21:20, #31)
+0xeb 0xc3 0x34 0xdc
+# CHECK: p3 = vcmph.gt(r21:20, #31)
+0xeb 0xc3 0x54 0xdc
+# CHECK: p3 = vcmph.gtu(r21:20, #31)
+
+# Vector compare bytes for any match
+0x03 0xfe 0x14 0xd2
+# CHECK: p3 = any8(vcmpb.eq(r21:20, r31:30))
+
+# Vector compare bytes
+0x63 0xde 0x14 0xd2
+# CHECK: p3 = vcmph.eq(r21:20, r31:30)
+0x83 0xde 0x14 0xd2
+# CHECK: p3 = vcmph.gt(r21:20, r31:30)
+0xa3 0xde 0x14 0xd2
+# CHECK: p3 = vcmph.gtu(r21:20, r31:30)
+0xeb 0xc3 0x14 0xdc
+# CHECK: p3 = vcmph.eq(r21:20, #31)
+0xeb 0xc3 0x34 0xdc
+# CHECK: p3 = vcmph.gt(r21:20, #31)
+0xeb 0xc3 0x54 0xdc
+# CHECK: p3 = vcmph.gtu(r21:20, #31)
+
+# Vector compare words
+0x03 0xde 0x14 0xd2
+# CHECK: p3 = vcmpw.eq(r21:20, r31:30)
+0x23 0xde 0x14 0xd2
+# CHECK: p3 = vcmpw.gt(r21:20, r31:30)
+0x43 0xde 0x14 0xd2
+# CHECK: p3 = vcmpw.gtu(r21:20, r31:30)
+0xf3 0xc3 0x14 0xdc
+# CHECK: p3 = vcmpw.eq(r21:20, #31)
+0xf3 0xc3 0x34 0xdc
+# CHECK: p3 = vcmpw.gt(r21:20, #31)
+0xf3 0xc3 0x54 0xdc
+# CHECK: p3 = vcmpw.gtu(r21:20, #31)
+
+# Viterbi pack even and odd predicate bits
+0x11 0xc2 0x03 0x89
+# CHECK: r17 = vitpack(p3, p2)
+
+# Vector mux
+0x70 0xde 0x14 0xd1
+# CHECK: r17:16 = vmux(p3, r21:20, r31:30)
diff --git a/test/MC/Disassembler/Hexagon/xtype_shift.txt b/test/MC/Disassembler/Hexagon/xtype_shift.txt
new file mode 100644
index 0000000..e2d6816
--- /dev/null
+++ b/test/MC/Disassembler/Hexagon/xtype_shift.txt
@@ -0,0 +1,260 @@
+# RUN: llvm-mc -triple=hexagon -disassemble < %s | FileCheck %s
+# Hexagon Programmer's Reference Manual 11.10.8 XTYPE/SHIFT
+
+# Shift by immediate
+0x10 0xdf 0x14 0x80
+# CHECK: r17:16 = asr(r21:20, #31)
+0x30 0xdf 0x14 0x80
+# CHECK: r17:16 = lsr(r21:20, #31)
+0x50 0xdf 0x14 0x80
+# CHECK: r17:16 = asl(r21:20, #31)
+0x11 0xdf 0x15 0x8c
+# CHECK: r17 = asr(r21, #31)
+0x31 0xdf 0x15 0x8c
+# CHECK: r17 = lsr(r21, #31)
+0x51 0xdf 0x15 0x8c
+# CHECK: r17 = asl(r21, #31)
+
+# Shift by immediate and accumulate
+0x10 0xdf 0x14 0x82
+# CHECK: r17:16 -= asr(r21:20, #31)
+0x30 0xdf 0x14 0x82
+# CHECK: r17:16 -= lsr(r21:20, #31)
+0x50 0xdf 0x14 0x82
+# CHECK: r17:16 -= asl(r21:20, #31)
+0x90 0xdf 0x14 0x82
+# CHECK: r17:16 += asr(r21:20, #31)
+0xb0 0xdf 0x14 0x82
+# CHECK: r17:16 += lsr(r21:20, #31)
+0xd0 0xdf 0x14 0x82
+# CHECK: r17:16 += asl(r21:20, #31)
+0x11 0xdf 0x15 0x8e
+# CHECK: r17 -= asr(r21, #31)
+0x31 0xdf 0x15 0x8e
+# CHECK: r17 -= lsr(r21, #31)
+0x51 0xdf 0x15 0x8e
+# CHECK: r17 -= asl(r21, #31)
+0x91 0xdf 0x15 0x8e
+# CHECK: r17 += asr(r21, #31)
+0xb1 0xdf 0x15 0x8e
+# CHECK: r17 += lsr(r21, #31)
+0xd1 0xdf 0x15 0x8e
+# CHECK: r17 += asl(r21, #31)
+0x4c 0xf7 0x11 0xde
+# CHECK: r17 = add(#21, asl(r17, #23))
+0x4e 0xf7 0x11 0xde
+# CHECK: r17 = sub(#21, asl(r17, #23))
+0x5c 0xf7 0x11 0xde
+# CHECK: r17 = add(#21, lsr(r17, #23))
+0x5e 0xf7 0x11 0xde
+# CHECK: r17 = sub(#21, lsr(r17, #23))
+
+# Shift by immediate and add
+0xf1 0xd5 0x1f 0xc4
+# CHECK: r17 = addasl(r21, r31, #7)
+
+# Shift by immediate and logical
+0x10 0xdf 0x54 0x82
+# CHECK: r17:16 &= asr(r21:20, #31)
+0x30 0xdf 0x54 0x82
+# CHECK: r17:16 &= lsr(r21:20, #31)
+0x50 0xdf 0x54 0x82
+# CHECK: r17:16 &= asl(r21:20, #31)
+0x90 0xdf 0x54 0x82
+# CHECK: r17:16 |= asr(r21:20, #31)
+0xb0 0xdf 0x54 0x82
+# CHECK: r17:16 |= lsr(r21:20, #31)
+0xd0 0xdf 0x54 0x82
+# CHECK: r17:16 |= asl(r21:20, #31)
+0x30 0xdf 0x94 0x82
+# CHECK: r17:16 ^= lsr(r21:20, #31)
+0x50 0xdf 0x94 0x82
+# CHECK: r17:16 ^= asl(r21:20, #31)
+0x11 0xdf 0x55 0x8e
+# CHECK: r17 &= asr(r21, #31)
+0x31 0xdf 0x55 0x8e
+# CHECK: r17 &= lsr(r21, #31)
+0x51 0xdf 0x55 0x8e
+# CHECK: r17 &= asl(r21, #31)
+0x91 0xdf 0x55 0x8e
+# CHECK: r17 |= asr(r21, #31)
+0xb1 0xdf 0x55 0x8e
+# CHECK: r17 |= lsr(r21, #31)
+0xd1 0xdf 0x55 0x8e
+# CHECK: r17 |= asl(r21, #31)
+0x31 0xdf 0x95 0x8e
+# CHECK: r17 ^= lsr(r21, #31)
+0x51 0xdf 0x95 0x8e
+# CHECK: r17 ^= asl(r21, #31)
+0x48 0xff 0x11 0xde
+# CHECK: r17 = and(#21, asl(r17, #31))
+0x4a 0xff 0x11 0xde
+# CHECK: r17 = or(#21, asl(r17, #31))
+0x58 0xff 0x11 0xde
+# CHECK: r17 = and(#21, lsr(r17, #31))
+0x5a 0xff 0x11 0xde
+# CHECK: r17 = or(#21, lsr(r17, #31))
+
+# Shift right by immediate with rounding
+0xf0 0xdf 0xd4 0x80
+# CHECK: r17:16 = asr(r21:20, #31):rnd
+0x11 0xdf 0x55 0x8c
+# CHECK: r17 = asr(r21, #31):rnd
+
+# Shift left by immediate with saturation
+0x51 0xdf 0x55 0x8c
+# CHECK: r17 = asl(r21, #31):sat
+
+# Shift by register
+0x10 0xdf 0x94 0xc3
+# CHECK: r17:16 = asr(r21:20, r31)
+0x50 0xdf 0x94 0xc3
+# CHECK: r17:16 = lsr(r21:20, r31)
+0x90 0xdf 0x94 0xc3
+# CHECK: r17:16 = asl(r21:20, r31)
+0xd0 0xdf 0x94 0xc3
+# CHECK: r17:16 = lsl(r21:20, r31)
+0x11 0xdf 0x55 0xc6
+# CHECK: r17 = asr(r21, r31)
+0x51 0xdf 0x55 0xc6
+# CHECK: r17 = lsr(r21, r31)
+0x91 0xdf 0x55 0xc6
+# CHECK: r17 = asl(r21, r31)
+0xd1 0xdf 0x55 0xc6
+# CHECK: r17 = lsl(r21, r31)
+0xf1 0xdf 0x8a 0xc6
+# CHECK: r17 = lsl(#21, r31)
+
+# Shift by register and accumulate
+0x10 0xdf 0x94 0xcb
+# CHECK: r17:16 -= asr(r21:20, r31)
+0x50 0xdf 0x94 0xcb
+# CHECK: r17:16 -= lsr(r21:20, r31)
+0x90 0xdf 0x94 0xcb
+# CHECK: r17:16 -= asl(r21:20, r31)
+0xd0 0xdf 0x94 0xcb
+# CHECK: r17:16 -= lsl(r21:20, r31)
+0x10 0xdf 0xd4 0xcb
+# CHECK: r17:16 += asr(r21:20, r31)
+0x50 0xdf 0xd4 0xcb
+# CHECK: r17:16 += lsr(r21:20, r31)
+0x90 0xdf 0xd4 0xcb
+# CHECK: r17:16 += asl(r21:20, r31)
+0xd0 0xdf 0xd4 0xcb
+# CHECK: r17:16 += lsl(r21:20, r31)
+0x11 0xdf 0x95 0xcc
+# CHECK: r17 -= asr(r21, r31)
+0x51 0xdf 0x95 0xcc
+# CHECK: r17 -= lsr(r21, r31)
+0x91 0xdf 0x95 0xcc
+# CHECK: r17 -= asl(r21, r31)
+0xd1 0xdf 0x95 0xcc
+# CHECK: r17 -= lsl(r21, r31)
+0x11 0xdf 0xd5 0xcc
+# CHECK: r17 += asr(r21, r31)
+0x51 0xdf 0xd5 0xcc
+# CHECK: r17 += lsr(r21, r31)
+0x91 0xdf 0xd5 0xcc
+# CHECK: r17 += asl(r21, r31)
+0xd1 0xdf 0xd5 0xcc
+# CHECK: r17 += lsl(r21, r31)
+
+# Shift by register and logical
+0x10 0xdf 0x14 0xcb
+# CHECK: r17:16 |= asr(r21:20, r31)
+0x50 0xdf 0x14 0xcb
+# CHECK: r17:16 |= lsr(r21:20, r31)
+0x90 0xdf 0x14 0xcb
+# CHECK: r17:16 |= asl(r21:20, r31)
+0xd0 0xdf 0x14 0xcb
+# CHECK: r17:16 |= lsl(r21:20, r31)
+0x10 0xdf 0x54 0xcb
+# CHECK: r17:16 &= asr(r21:20, r31)
+0x50 0xdf 0x54 0xcb
+# CHECK: r17:16 &= lsr(r21:20, r31)
+0x90 0xdf 0x54 0xcb
+# CHECK: r17:16 &= asl(r21:20, r31)
+0xd0 0xdf 0x54 0xcb
+# CHECK: r17:16 &= lsl(r21:20, r31)
+0x10 0xdf 0x74 0xcb
+# CHECK: r17:16 ^= asr(r21:20, r31)
+0x50 0xdf 0x74 0xcb
+# CHECK: r17:16 ^= lsr(r21:20, r31)
+0x90 0xdf 0x74 0xcb
+# CHECK: r17:16 ^= asl(r21:20, r31)
+0xd0 0xdf 0x74 0xcb
+# CHECK: r17:16 ^= lsl(r21:20, r31)
+0x11 0xdf 0x15 0xcc
+# CHECK: r17 |= asr(r21, r31)
+0x51 0xdf 0x15 0xcc
+# CHECK: r17 |= lsr(r21, r31)
+0x91 0xdf 0x15 0xcc
+# CHECK: r17 |= asl(r21, r31)
+0xd1 0xdf 0x15 0xcc
+# CHECK: r17 |= lsl(r21, r31)
+0x11 0xdf 0x55 0xcc
+# CHECK: r17 &= asr(r21, r31)
+0x51 0xdf 0x55 0xcc
+# CHECK: r17 &= lsr(r21, r31)
+0x91 0xdf 0x55 0xcc
+# CHECK: r17 &= asl(r21, r31)
+0xd1 0xdf 0x55 0xcc
+# CHECK: r17 &= lsl(r21, r31)
+
+# Shift by register with saturation
+0x11 0xdf 0x15 0xc6
+# CHECK: r17 = asr(r21, r31):sat
+0x91 0xdf 0x15 0xc6
+# CHECK: r17 = asl(r21, r31):sat
+
+# Vector shift halfwords by immediate
+0x10 0xc5 0x94 0x80
+# CHECK: r17:16 = vasrh(r21:20, #5)
+0x30 0xc5 0x94 0x80
+# CHECK: r17:16 = vlsrh(r21:20, #5)
+0x50 0xc5 0x94 0x80
+# CHECK: r17:16 = vaslh(r21:20, #5)
+
+# Vector arithmetic shift halfwords with round
+0x10 0xc5 0x34 0x80
+# CHECK: r17:16 = vasrh(r21:20, #5):raw
+
+# Vector arithmetic shift halfwords with saturate and pack
+0x91 0xc5 0x74 0x88
+# CHECK: r17 = vasrhub(r21:20, #5):raw
+0xb1 0xc5 0x74 0x88
+# CHECK: r17 = vasrhub(r21:20, #5):sat
+
+# Vector shift halfwords by register
+0x10 0xdf 0x54 0xc3
+# CHECK: r17:16 = vasrh(r21:20, r31)
+0x50 0xdf 0x54 0xc3
+# CHECK: r17:16 = vlsrh(r21:20, r31)
+0x90 0xdf 0x54 0xc3
+# CHECK: r17:16 = vaslh(r21:20, r31)
+0xd0 0xdf 0x54 0xc3
+# CHECK: r17:16 = vlslh(r21:20, r31)
+
+# Vector shift words by immediate
+0x10 0xdf 0x54 0x80
+# CHECK: r17:16 = vasrw(r21:20, #31)
+0x30 0xdf 0x54 0x80
+# CHECK: r17:16 = vlsrw(r21:20, #31)
+0x50 0xdf 0x54 0x80
+# CHECK: r17:16 = vaslw(r21:20, #31)
+
+# Vector shift words by register
+0x10 0xdf 0x14 0xc3
+# CHECK: r17:16 = vasrw(r21:20, r31)
+0x50 0xdf 0x14 0xc3
+# CHECK: r17:16 = vlsrw(r21:20, r31)
+0x90 0xdf 0x14 0xc3
+# CHECK: r17:16 = vaslw(r21:20, r31)
+0xd0 0xdf 0x14 0xc3
+# CHECK: r17:16 = vlslw(r21:20, r31)
+
+# Vector shift words with truncate and pack
+0x51 0xdf 0xd4 0x88
+# CHECK: r17 = vasrw(r21:20, #31)
+0x51 0xdf 0x14 0xc5
+# CHECK: r17 = vasrw(r21:20, r31)
diff --git a/test/MC/Disassembler/Mips/micromips.txt b/test/MC/Disassembler/Mips/micromips.txt
index 6464824..2b75d46 100644
--- a/test/MC/Disassembler/Mips/micromips.txt
+++ b/test/MC/Disassembler/Mips/micromips.txt
@@ -16,6 +16,21 @@
 # CHECK: addiu $9, $6, -15001
 0x31 0x26 0xc5 0x67
 
+# CHECK: addiusp -16
+0x4f 0xf9
+
+# CHECK: addiusp -1028
+0x4f 0xff
+
+# CHECK: addiusp -1032
+0x4f 0xfd
+
+# CHECK: addiusp 1024
+0x4c 0x01
+
+# CHECK: addiusp 1028
+0x4c 0x03
+
 # CHECK: addu $9, $6, $7
 0x00 0xe6 0x49 0x50
 
@@ -61,6 +76,9 @@
 # CHECK: andi $9, $6, 17767
 0xd1 0x26 0x45 0x67
 
+# CHECK: andi16 $16, $2, 31
+0x2c 0x29
+
 # CHECK: or $3, $4, $5
 0x00 0xa4 0x1a 0x90
 
@@ -136,6 +154,9 @@
 # CHECK: lw  $6, 4($5)
 0xfc 0xc5 0x00 0x04
 
+# CHECK: lw $6, 123($sp)
+0xfc 0xdd 0x00 0x7b
+
 # CHECK: sb $5, 8($4)
 0x18 0xa4 0x00 0x08
 
@@ -145,6 +166,9 @@
 # CHECK: sw  $5, 4($6)
 0xf8 0xa6 0x00 0x04
 
+# CHECK: sw $5, 123($sp)
+0xf8 0xbd 0x00 0x7b
+
 # CHECK: lwu $2, 8($4)
 0x60 0x44 0xe0 0x08
 
@@ -229,6 +253,9 @@
 # CHECK: jr $7
 0x00 0x07 0x0f 0x3c
 
+# CHECK: jraddiusp 20
+0x47 0x05
+
 # CHECK: beq $9, $6, 1332
 0x94 0xc9 0x02 0x9a
 
@@ -289,6 +316,21 @@
 # CHECK: tnei $9, 17767
 0x41 0x89 0x45 0x67
 
+# CHECK: cache 1, 8($5)
+0x20 0x25 0x60 0x08
+
+# CHECK: pref 1, 8($5)
+0x60 0x25 0x20 0x08
+
+# CHECK: ssnop
+0x00 0x00 0x08 0x00
+
+# CHECK: ehb
+0x00 0x00 0x18 0x00
+
+# CHECK: pause
+0x00 0x00 0x28 0x00
+
 # CHECK: ll $2, 8($4)
 0x60 0x44 0x30 0x08
 
@@ -321,3 +363,144 @@
 
 # CHECK: swm32 $16, $17, 8($4)
 0x20 0x44 0xd0 0x08
+
+# CHECK: swp $16, 8($4)
+0x22 0x04 0x90 0x08
+
+# CHECK: lwp $16, 8($4)
+0x22 0x04 0x10 0x08
+
+# CHECK: nop
+0x00 0x00 0x00 0x00
+
+# CHECK: addiupc $2, 20
+0x79 0x00 0x00 0x05
+
+# CHECK: addiupc $7, 16777212
+0x7b 0xbf 0xff 0xff
+
+# CHECK: addiupc $7, -16777216
+0x7b 0xc0 0x00 0x00
+
+# CHECK: addu16 $6, $17, $4
+0x07 0x42
+
+# CHECK: subu16 $5, $16, $3
+0x06 0xb1
+
+# CHECK: and16 $16, $2
+0x44 0x82
+
+# CHECK: not16 $17, $3
+0x44 0x0b
+
+# CHECK: or16 $16, $4
+0x44 0xc4
+
+# CHECK: xor16 $17, $5
+0x44 0x4d
+
+# CHECK: sll16 $3, $16, 5
+0x25 0x8a
+
+# CHECK: srl16 $4, $17, 6
+0x26 0x1d
+
+# CHECK: lbu16 $3, 4($17)
+0x09 0x94
+
+# CHECK: lbu16 $3, -1($16)
+0x09 0x8f
+
+# CHECK: lhu16 $3, 4($16)
+0x29 0x82
+
+# CHECK: lw16 $4, 8($17)
+0x6a 0x12
+
+# CHECK: sb16 $3, 4($16)
+0x89 0x84
+
+# CHECK: sh16 $4, 8($17)
+0xaa 0x14
+
+# CHECK: sw16 $4, 4($17)
+0xea 0x11
+
+# CHECK: sw16 $zero, 4($17)
+0xe8 0x11
+
+# CHECK: mfhi $9
+0x46 0x09
+
+# CHECK: mflo $9
+0x46 0x49
+
+# CHECK: move $25, $1
+0x0f 0x21
+
+# CHECK: jrc $9
+0x45 0xa9
+
+# CHECK: jalr $9
+0x45 0xc9
+
+# CHECK: jalrs16 $9
+0x45 0xe9
+
+# CHECK: jr16 $9
+0x45 0x89
+
+# CHECK: li16 $3, -1
+0xed 0xff
+
+# CHECK: li16 $3, 126
+0xed 0xfe
+
+# CHECK: addiur1sp $7, 4
+0x6f 0x83
+
+# CHECK: addiur2 $6, $7, -1
+0x6f 0x7e
+
+# CHECK: addiur2 $6, $7, 12
+0x6f 0x76
+
+# CHECK: addius5 $7, -2
+0x4c 0xfc
+
+# CHECK: nop
+0x0c 0x00
+
+# CHECK: lw $3, 32($sp)
+0x48 0x68
+
+# CHECK: sw $4, 124($sp)
+0xc8 0x9f
+
+# CHECK: beqz16 $6, 20
+0x8f 0x0a
+
+# CHECK: bnez16 $6, 20
+0xaf 0x0a
+
+# CHECK: b16 132
+0xcc 0x42
+
+# CHECK: lw $3, 32($gp)
+0x65 0x88
+
+# CHECK: lwm16 $16, $17, $ra, 8($sp)
+0x45 0x12
+
+# CHECK: swm16 $16, $17, $ra, 8($sp)
+0x45 0x52
+
+# CHECK: break16 8
+0x46 0x88
+
+# CHECK: sdbbp16 14
+0x46 0xce
+
+# CHECK: movep $5, $6, $2, $3
+0x84 0x34
diff --git a/test/MC/Disassembler/Mips/micromips_le.txt b/test/MC/Disassembler/Mips/micromips_le.txt
index d4dbc46..3f3b325 100644
--- a/test/MC/Disassembler/Mips/micromips_le.txt
+++ b/test/MC/Disassembler/Mips/micromips_le.txt
@@ -16,9 +16,27 @@
 # CHECK: addiu $9, $6, -15001
 0x26 0x31 0x67 0xc5
 
+# CHECK: addiusp -16
+0xf9 0x4f
+
+# CHECK: addiusp -1028
+0xff 0x4f
+
+# CHECK: addiusp -1032
+0xfd 0x4f
+
+# CHECK: addiusp 1024
+0x01 0x4c
+
+# CHECK: addiusp 1028
+0x03 0x4c
+
 # CHECK: addu $9, $6, $7
 0xe6 0x00 0x50 0x49
 
+# CHECK: andi16 $16, $2, 31
+0x29 0x2c
+
 # CHECK: sub $9, $6, $7
 0xe6 0x00 0x90 0x49
 
@@ -136,6 +154,9 @@
 # CHECK: lw $6, 4($5)
 0xc5 0xfc 0x04 0x00
 
+# CHECK: lw $6, 123($sp)
+0xdd 0xfc 0x7b 0x00
+
 # CHECK: sb $5, 8($4)
 0xa4 0x18 0x08 0x00
 
@@ -145,6 +166,9 @@
 # CHECK: sw $5, 4($6)
 0xa6 0xf8 0x04 0x00
 
+# CHECK: sw $5, 123($sp)
+0xbd 0xf8 0x7b 0x00
+
 # CHECK: lwu $2, 8($4)
 0x44 0x60 0x08 0xe0
 
@@ -229,6 +253,9 @@
 # CHECK: jr $7
 0x07 0x00 0x3c 0x0f
 
+# CHECK: jraddiusp 20
+0x05 0x47
+
 # CHECK: beq $9, $6, 1332
 0xc9 0x94 0x9a 0x02
 
@@ -289,6 +316,21 @@
 # CHECK: tnei $9, 17767
 0x89 0x41 0x67 0x45
 
+# CHECK: cache 1, 8($5)
+0x25 0x20 0x08 0x60
+
+# CHECK: pref 1, 8($5)
+0x25 0x60 0x08 0x20
+
+# CHECK: ssnop
+0x00 0x00 0x00 0x08
+
+# CHECK: ehb
+0x00 0x00 0x00 0x18
+
+# CHECK: pause
+0x00 0x00 0x00 0x28
+
 # CHECK: ll $2, 8($4)
 0x44 0x60 0x08 0x30
 
@@ -321,3 +363,144 @@
 
 # CHECK: swm32 $16, $17, 8($4)
 0x44 0x20 0x08 0xd0
+
+# CHECK: swp $16, 8($4)
+0x04 0x22  0x08 0x90
+
+# CHECK: lwp $16, 8($4)
+0x04 0x22 0x08 0x10
+
+# CHECK: nop
+0x00 0x00 0x00 0x00
+
+# CHECK: addiupc $2, 20
+0x00 0x79 0x05 0x00
+
+# CHECK: addiupc $7, 16777212
+0xbf 0x7b 0xff 0xff
+
+# CHECK: addiupc $7, -16777216
+0xc0 0x7b 0x00 0x00
+
+# CHECK: addu16 $6, $17, $4
+0x42 0x07
+
+# CHECK: subu16 $5, $16, $3
+0xb1 0x06
+
+# CHECK: and16 $16, $2
+0x82 0x44
+
+# CHECK: not16 $17, $3
+0x0b 0x44
+
+# CHECK: or16 $16, $4
+0xc4 0x44
+
+# CHECK: xor16 $17, $5
+0x4d 0x44
+
+# CHECK: sll16 $3, $16, 5
+0x8a 0x25
+
+# CHECK: srl16 $4, $17, 6
+0x1d 0x26
+
+# CHECK: lbu16 $3, 4($17)
+0x94 0x09
+
+# CHECK: lbu16 $3, -1($16)
+0x8f 0x09
+
+# CHECK: lhu16 $3, 4($16)
+0x82 0x29
+
+# CHECK: lw16 $4, 8($17)
+0x12 0x6a
+
+# CHECK: sb16 $3, 4($16)
+0x84 0x89
+
+# CHECK: sh16 $4, 8($17)
+0x14 0xaa
+
+# CHECK: sw16 $4, 4($17)
+0x11 0xea
+
+# CHECK: sw16 $zero, 4($17)
+0x11 0xe8
+
+# CHECK: mfhi $9
+0x09 0x46
+
+# CHECK: mflo $9
+0x49 0x46
+
+# CHECK: move $25, $1
+0x21 0x0f
+
+# CHECK: jrc $9
+0xa9 0x45
+
+# CHECK: jalr $9
+0xc9 0x45
+
+# CHECK: jalrs16 $9
+0xe9 0x45
+
+# CHECK: jr16 $9
+0x89 0x45
+
+# CHECK: li16 $3, -1
+0xff 0xed
+
+# CHECK: li16 $3, 126
+0xfe 0xed
+
+# CHECK: addiur1sp $7, 4
+0x83 0x6f
+
+# CHECK: addiur2 $6, $7, -1
+0x7e 0x6f
+
+# CHECK: addiur2 $6, $7, 12
+0x76 0x6f
+
+# CHECK: addius5 $7, -2
+0xfc 0x4c
+
+# CHECK: nop
+0x00 0x0c
+
+# CHECK: lw $3, 32($sp)
+0x68 0x48
+
+# CHECK: sw $4, 124($sp)
+0x9f 0xc8
+
+# CHECK: beqz16 $6, 20
+0x0a 0x8f
+
+# CHECK: bnez16 $6, 20
+0x0a 0xaf
+
+# CHECK: b16 132
+0x42 0xcc
+
+# CHECK: lw $3, 32($gp)
+0x88 0x65
+
+# CHECK: lwm16 $16, $17, $ra, 8($sp)
+0x12 0x45
+
+# CHECK: swm16 $16, $17, $ra, 8($sp)
+0x52 0x45
+
+# CHECK: break16 8
+0x88 0x46
+
+# CHECK: sdbbp16 14
+0xce 0x46
+
+# CHECK: movep $5, $6, $2, $3
+0x34 0x84
diff --git a/test/MC/Disassembler/Mips/mips1/valid-mips1-el.txt b/test/MC/Disassembler/Mips/mips1/valid-mips1-el.txt
new file mode 100644
index 0000000..dba949a
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips1/valid-mips1-el.txt
@@ -0,0 +1,116 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -disassemble -mcpu=mips1 | FileCheck %s
+# CHECK: .text
+0x85 0xc1 0x20 0x46 # CHECK: abs.d $f6, $f24
+0x45 0x82 0x00 0x46 # CHECK: abs.s $f9, $f16
+0x20 0xb8 0x45 0x02 # CHECK: add $23, $18, $5
+0x00 0x30 0x3c 0x46 # CHECK: add.d $f0, $f6, $f28
+0x00 0xaa 0x18 0x46 # CHECK: add.s $f8, $f21, $f24
+0xd2 0x66 0x2d 0x21 # CHECK: addi $13, $9, 26322
+0xfe 0xff 0x08 0x21 # CHECK: addi $8, $8, -2
+0x48 0x3b 0xc9 0x21 # CHECK: addi $9, $14, 15176
+0xe7 0xe3 0x18 0x23 # CHECK: addi $24, $24, -7193
+0x21 0x48 0x86 0x00 # CHECK: addu $9, $4, $6
+0x0a 0x00 0x29 0x25 # CHECK: addiu $9, $9, 10
+0x24 0xb8 0x4c 0x00 # CHECK: and $23, $2, $12
+0x01 0x00 0x00 0x45 # CHECK: bc1f 8
+0x04 0x00 0x42 0x30 # CHECK: andi $2, $2, 4
+0x01 0x00 0x01 0x45 # CHECK: bc1t 8
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x9b 0x14 0x11 0x04 # CHECK: bal 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x9b 0x14 0x11 0x04 # CHECK: bal 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x9b 0x14 0xd1 0x04 # CHECK: bgezal $6, 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x9b 0x14 0xd0 0x04 # CHECK: bltzal $6, 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x3b 0xe0 0x3c 0x46 # CHECK: c.ngl.d $f28, $f28
+0x39 0x00 0x30 0x46 # CHECK: c.ngle.d $f0, $f16
+0x38 0xf0 0x20 0x46 # CHECK: c.sf.d $f30, $f0
+0x38 0x70 0x16 0x46 # CHECK: c.sf.s $f14, $f22
+0x00 0xa8 0x51 0x44 # CHECK: cfc1 $17, $21
+0x00 0xd0 0xc6 0x44 # CHECK: ctc1 $6, $26
+0xa1 0xe5 0x00 0x46 # CHECK: cvt.d.s $f22, $f28
+0xa1 0x5e 0x80 0x46 # CHECK: cvt.d.w $f26, $f11
+0xa0 0x46 0x20 0x46 # CHECK: cvt.s.d $f26, $f8
+0xa0 0x7d 0x80 0x46 # CHECK: cvt.s.w $f22, $f15
+0x24 0x75 0x20 0x46 # CHECK: cvt.w.d $f20, $f14
+0x24 0xc5 0x00 0x46 # CHECK: cvt.w.s $f20, $f24
+0x1a 0x00 0x2b 0x03 # CHECK: div $zero, $25, $11
+0x03 0xa7 0x3a 0x46 # CHECK: div.d $f28, $f20, $f26
+0x03 0x29 0x0f 0x46 # CHECK: div.s $f4, $f5, $f15
+0x1b 0x00 0x2f 0x03 # CHECK: divu $zero, $25, $15
+0x4d 0xc7 0x58 0x81 # CHECK: lb $24, -14515($10)
+0xf3 0x75 0x68 0x90 # CHECK: lbu $8, 30195($3)
+0x94 0xde 0xab 0x86 # CHECK: lh $11, -8556($21)
+0xbd 0xa6 0x53 0x94 # CHECK: lhu $19, -22851($2)
+0xb3 0x8b 0x01 0x24 # CHECK: addiu $1, $zero, -29773
+0x3f 0x8b 0x00 0x24 # CHECK: addiu $zero, $zero, -29889
+0x2a 0x16 0xa8 0x8c # CHECK: lw $8, 5674($5)
+0xf1 0x27 0x50 0xc7 # CHECK: lwc1 $f16, 10225($26)
+0xb7 0xfc 0xd2 0xc8 # CHECK: lwc2 $18, -841($6)
+0xf7 0x81 0x4a 0xcf # CHECK: lwc3 $10, -32265($26)
+0x79 0xef 0xf4 0x89 # CHECK: lwl $20, -4231($15)
+0x35 0xb5 0x80 0x9b # CHECK: lwr $zero, -19147($gp)
+0x00 0xd8 0x07 0x44 # CHECK: mfc1 $7, $f27
+0x10 0x98 0x00 0x00 # CHECK: mfhi $19
+0x10 0xe8 0x00 0x00 # CHECK: mfhi $sp
+0x12 0x88 0x00 0x00 # CHECK: mflo $17
+0x06 0x75 0x20 0x46 # CHECK: mov.d $f20, $f14
+0x86 0xd8 0x00 0x46 # CHECK: mov.s $f2, $f27
+0x21 0xf0 0x80 0x00 # CHECK: move $fp, $4
+0x21 0xc8 0xc0 0x00 # CHECK: move $25, $6
+0x00 0x48 0x9e 0x44 # CHECK: mtc1 $fp, $f9
+0x11 0x00 0x20 0x02 # CHECK: mthi $17
+0x13 0x00 0xa0 0x03 # CHECK: mtlo $sp
+0x13 0x00 0x20 0x03 # CHECK: mtlo $25
+0x02 0xa5 0x30 0x46 # CHECK: mul.d $f20, $f20, $f16
+0x82 0x57 0x02 0x46 # CHECK: mul.s $f30, $f10, $f2
+0x18 0x00 0xb4 0x03 # CHECK: mult $sp, $20
+0x18 0x00 0xa2 0x03 # CHECK: mult $sp, $2
+0x19 0x00 0x9a 0x03 # CHECK: multu $gp, $26
+0x19 0x00 0x32 0x01 # CHECK: multu $9, $18
+0x23 0x10 0x02 0x00 # CHECK: negu $2, $2
+0x23 0x10 0x03 0x00 # CHECK: negu $2, $3
+0x87 0x96 0x20 0x46 # CHECK: neg.d $f26, $f18
+0x47 0x78 0x00 0x46 # CHECK: neg.s $f1, $f15
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x38 0x07 0x00 # CHECK: nor $7, $zero, $7
+0x25 0x60 0x1d 0x02 # CHECK: or $12, $16, $sp
+0x04 0x00 0x42 0x34 # CHECK: ori $2, $2, 4
+0x6f 0xb2 0xd6 0xa1 # CHECK: sb $22, -19857($14)
+0xd0 0xe5 0xee 0xa5 # CHECK: sh $14, -6704($15)
+0x80 0x3c 0x07 0x00 # CHECK: sll $7, $7, 18
+0x80 0x3c 0x00 0x00 # CHECK: sll $7, $zero, 18
+0x04 0x38 0x20 0x01 # CHECK: sllv $7, $zero, $9
+0x04 0x38 0x20 0x01 # CHECK: sllv $7, $zero, $9
+0x2a 0xb8 0x7b 0x01 # CHECK: slt $23, $11, $27
+0x11 0x25 0x51 0x29 # CHECK: slti $17, $10, 9489
+0x55 0xc3 0x39 0x2f # CHECK: sltiu $25, $25, -15531
+0x2b 0xa0 0xab 0x02 # CHECK: sltu $20, $21, $11
+0x55 0xc3 0x38 0x2f # CHECK: sltiu $24, $25, -15531
+0xc3 0x8b 0x11 0x00 # CHECK: sra $17, $17, 15
+0xc3 0x8b 0x17 0x00 # CHECK: sra $17, $23, 15
+0x07 0x88 0xb7 0x03 # CHECK: srav $17, $23, $sp
+0x07 0x88 0xb7 0x03 # CHECK: srav $17, $23, $sp
+0xc2 0x11 0x02 0x00 # CHECK: srl $2, $2, 7
+0xc2 0x11 0x02 0x00 # CHECK: srl $2, $2, 7
+0x06 0xc8 0x94 0x00 # CHECK: srlv $25, $20, $4
+0x06 0xc8 0x94 0x00 # CHECK: srlv $25, $20, $4
+0x40 0x00 0x00 0x00 # CHECK: ssnop
+0x22 0xb0 0x6c 0x02 # CHECK: sub $22, $19, $12
+0x36 0x0c 0x36 0x22 # CHECK: addi $22, $17, 3126
+0x90 0xe6 0xad 0x21 # CHECK: addi $13, $13, -6512
+0x81 0x14 0x30 0x46 # CHECK: sub.d $f18, $f2, $f16
+0xc1 0xb5 0x16 0x46 # CHECK: sub.s $f23, $f22, $f22
+0x23 0xe8 0xd6 0x02 # CHECK: subu $sp, $22, $22
+0x50 0xd8 0xbf 0xaf # CHECK: sw $ra, -10160($sp)
+0xef 0xde 0x06 0xe7 # CHECK: swc1 $f6, -8465($24)
+0x30 0x61 0x19 0xea # CHECK: swc2 $25, 24880($16)
+0x7e 0x35 0x6f 0xaa # CHECK: swl $15, 13694($19)
+0x22 0x98 0xd1 0xb9 # CHECK: swr $17, -26590($14)
+0x08 0x00 0x00 0x42 # CHECK: tlbp
+0x01 0x00 0x00 0x42 # CHECK: tlbr
+0x02 0x00 0x00 0x42 # CHECK: tlbwi
+0x06 0x00 0x00 0x42 # CHECK: tlbwr
+0x26 0x90 0x9e 0x00 # CHECK: xor $18, $4, $fp
diff --git a/test/MC/Disassembler/Mips/mips1/valid-mips1.txt b/test/MC/Disassembler/Mips/mips1/valid-mips1.txt
new file mode 100644
index 0000000..1a4f94f
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips1/valid-mips1.txt
@@ -0,0 +1,116 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -disassemble -mcpu=mips1 | FileCheck %s
+# CHECK: .text
+0x46 0x20 0xc1 0x85 # CHECK: abs.d $f6, $f24
+0x46 0x00 0x82 0x45 # CHECK: abs.s $f9, $f16
+0x02 0x45 0xb8 0x20 # CHECK: add $23, $18, $5
+0x46 0x3c 0x30 0x00 # CHECK: add.d $f0, $f6, $f28
+0x46 0x18 0xaa 0x00 # CHECK: add.s $f8, $f21, $f24
+0x21 0x2d 0x66 0xd2 # CHECK: addi $13, $9, 26322
+0x21 0x08 0xff 0xfe # CHECK: addi $8, $8, -2
+0x21 0xc9 0x3b 0x48 # CHECK: addi $9, $14, 15176
+0x23 0x18 0xe3 0xe7 # CHECK: addi $24, $24, -7193
+0x00 0x86 0x48 0x21 # CHECK: addu $9, $4, $6
+0x25 0x29 0x00 0x0a # CHECK: addiu $9, $9, 10
+0x00 0x4c 0xb8 0x24 # CHECK: and $23, $2, $12
+0x45 0x00 0x00 0x01 # CHECK: bc1f 8
+0x30 0x42 0x00 0x04 # CHECK: andi $2, $2, 4
+0x45 0x01 0x00 0x01 # CHECK: bc1t 8
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x04 0x11 0x14 0x9b # CHECK: bal 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x04 0x11 0x14 0x9b # CHECK: bal 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x04 0xd1 0x14 0x9b # CHECK: bgezal $6, 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x04 0xd0 0x14 0x9b # CHECK: bltzal $6, 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x46 0x3c 0xe0 0x3b # CHECK: c.ngl.d $f28, $f28
+0x46 0x30 0x00 0x39 # CHECK: c.ngle.d $f0, $f16
+0x46 0x20 0xf0 0x38 # CHECK: c.sf.d $f30, $f0
+0x46 0x16 0x70 0x38 # CHECK: c.sf.s $f14, $f22
+0x44 0x51 0xa8 0x00 # CHECK: cfc1 $17, $21
+0x44 0xc6 0xd0 0x00 # CHECK: ctc1 $6, $26
+0x46 0x00 0xe5 0xa1 # CHECK: cvt.d.s $f22, $f28
+0x46 0x80 0x5e 0xa1 # CHECK: cvt.d.w $f26, $f11
+0x46 0x20 0x46 0xa0 # CHECK: cvt.s.d $f26, $f8
+0x46 0x80 0x7d 0xa0 # CHECK: cvt.s.w $f22, $f15
+0x46 0x20 0x75 0x24 # CHECK: cvt.w.d $f20, $f14
+0x46 0x00 0xc5 0x24 # CHECK: cvt.w.s $f20, $f24
+0x03 0x2b 0x00 0x1a # CHECK: div $zero, $25, $11
+0x46 0x3a 0xa7 0x03 # CHECK: div.d $f28, $f20, $f26
+0x46 0x0f 0x29 0x03 # CHECK: div.s $f4, $f5, $f15
+0x03 0x2f 0x00 0x1b # CHECK: divu $zero, $25, $15
+0x81 0x58 0xc7 0x4d # CHECK: lb $24, -14515($10)
+0x90 0x68 0x75 0xf3 # CHECK: lbu $8, 30195($3)
+0x86 0xab 0xde 0x94 # CHECK: lh $11, -8556($21)
+0x94 0x53 0xa6 0xbd # CHECK: lhu $19, -22851($2)
+0x24 0x01 0x8b 0xb3 # CHECK: addiu $1, $zero, -29773
+0x24 0x00 0x8b 0x3f # CHECK: addiu $zero, $zero, -29889
+0x8c 0xa8 0x16 0x2a # CHECK: lw $8, 5674($5)
+0xc7 0x50 0x27 0xf1 # CHECK: lwc1 $f16, 10225($26)
+0xc8 0xd2 0xfc 0xb7 # CHECK: lwc2 $18, -841($6)
+0xcf 0x4a 0x81 0xf7 # CHECK: lwc3 $10, -32265($26)
+0x89 0xf4 0xef 0x79 # CHECK: lwl $20, -4231($15)
+0x9b 0x80 0xb5 0x35 # CHECK: lwr $zero, -19147($gp)
+0x44 0x07 0xd8 0x00 # CHECK: mfc1 $7, $f27
+0x00 0x00 0x98 0x10 # CHECK: mfhi $19
+0x00 0x00 0xe8 0x10 # CHECK: mfhi $sp
+0x00 0x00 0x88 0x12 # CHECK: mflo $17
+0x46 0x20 0x75 0x06 # CHECK: mov.d $f20, $f14
+0x46 0x00 0xd8 0x86 # CHECK: mov.s $f2, $f27
+0x00 0x80 0xf0 0x21 # CHECK: move $fp, $4
+0x00 0xc0 0xc8 0x21 # CHECK: move $25, $6
+0x44 0x9e 0x48 0x00 # CHECK: mtc1 $fp, $f9
+0x02 0x20 0x00 0x11 # CHECK: mthi $17
+0x03 0xa0 0x00 0x13 # CHECK: mtlo $sp
+0x03 0x20 0x00 0x13 # CHECK: mtlo $25
+0x46 0x30 0xa5 0x02 # CHECK: mul.d $f20, $f20, $f16
+0x46 0x02 0x57 0x82 # CHECK: mul.s $f30, $f10, $f2
+0x03 0xb4 0x00 0x18 # CHECK: mult $sp, $20
+0x03 0xa2 0x00 0x18 # CHECK: mult $sp, $2
+0x03 0x9a 0x00 0x19 # CHECK: multu $gp, $26
+0x01 0x32 0x00 0x19 # CHECK: multu $9, $18
+0x00 0x02 0x10 0x23 # CHECK: negu $2, $2
+0x00 0x03 0x10 0x23 # CHECK: negu $2, $3
+0x46 0x20 0x96 0x87 # CHECK: neg.d $f26, $f18
+0x46 0x00 0x78 0x47 # CHECK: neg.s $f1, $f15
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0x07 0x38 0x27 # CHECK: nor $7, $zero, $7
+0x02 0x1d 0x60 0x25 # CHECK: or $12, $16, $sp
+0x34 0x42 0x00 0x04 # CHECK: ori $2, $2, 4
+0xa1 0xd6 0xb2 0x6f # CHECK: sb $22, -19857($14)
+0xa5 0xee 0xe5 0xd0 # CHECK: sh $14, -6704($15)
+0x00 0x07 0x3c 0x80 # CHECK: sll $7, $7, 18
+0x00 0x00 0x3c 0x80 # CHECK: sll $7, $zero, 18
+0x01 0x20 0x38 0x04 # CHECK: sllv $7, $zero, $9
+0x01 0x20 0x38 0x04 # CHECK: sllv $7, $zero, $9
+0x01 0x7b 0xb8 0x2a # CHECK: slt $23, $11, $27
+0x29 0x51 0x25 0x11 # CHECK: slti $17, $10, 9489
+0x2f 0x39 0xc3 0x55 # CHECK: sltiu $25, $25, -15531
+0x02 0xab 0xa0 0x2b # CHECK: sltu $20, $21, $11
+0x2f 0x38 0xc3 0x55 # CHECK: sltiu $24, $25, -15531
+0x00 0x11 0x8b 0xc3 # CHECK: sra $17, $17, 15
+0x00 0x17 0x8b 0xc3 # CHECK: sra $17, $23, 15
+0x03 0xb7 0x88 0x07 # CHECK: srav $17, $23, $sp
+0x03 0xb7 0x88 0x07 # CHECK: srav $17, $23, $sp
+0x00 0x02 0x11 0xc2 # CHECK: srl $2, $2, 7
+0x00 0x02 0x11 0xc2 # CHECK: srl $2, $2, 7
+0x00 0x94 0xc8 0x06 # CHECK: srlv $25, $20, $4
+0x00 0x94 0xc8 0x06 # CHECK: srlv $25, $20, $4
+0x00 0x00 0x00 0x40 # CHECK: ssnop
+0x02 0x6c 0xb0 0x22 # CHECK: sub $22, $19, $12
+0x22 0x36 0x0c 0x36 # CHECK: addi $22, $17, 3126
+0x21 0xad 0xe6 0x90 # CHECK: addi $13, $13, -6512
+0x46 0x30 0x14 0x81 # CHECK: sub.d $f18, $f2, $f16
+0x46 0x16 0xb5 0xc1 # CHECK: sub.s $f23, $f22, $f22
+0x02 0xd6 0xe8 0x23 # CHECK: subu $sp, $22, $22
+0xaf 0xbf 0xd8 0x50 # CHECK: sw $ra, -10160($sp)
+0xe7 0x06 0xde 0xef # CHECK: swc1 $f6, -8465($24)
+0xea 0x19 0x61 0x30 # CHECK: swc2 $25, 24880($16)
+0xaa 0x6f 0x35 0x7e # CHECK: swl $15, 13694($19)
+0xb9 0xd1 0x98 0x22 # CHECK: swr $17, -26590($14)
+0x42 0x00 0x00 0x08 # CHECK: tlbp
+0x42 0x00 0x00 0x01 # CHECK: tlbr
+0x42 0x00 0x00 0x02 # CHECK: tlbwi
+0x42 0x00 0x00 0x06 # CHECK: tlbwr
+0x00 0x9e 0x90 0x26 # CHECK: xor $18, $4, $fp
diff --git a/test/MC/Disassembler/Mips/mips1/valid-xfail.txt b/test/MC/Disassembler/Mips/mips1/valid-xfail.txt
new file mode 100644
index 0000000..759097c
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips1/valid-xfail.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -disassemble -mcpu=mips1 | FileCheck %s
+# XFAIL: *
+0xc2 0x44 0xe3 0x67 # CHECK: lwc0 $4, -7321($18)
+0xe2 0x64 0x49 0xd8 # CHECK: swc0 $4, 18904($19)
diff --git a/test/MC/Disassembler/Mips/mips2/valid-mips2-el.txt b/test/MC/Disassembler/Mips/mips2/valid-mips2-el.txt
new file mode 100644
index 0000000..8060409
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips2/valid-mips2-el.txt
@@ -0,0 +1,159 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -disassemble -mcpu=mips2 | FileCheck %s
+# CHECK: .text
+0x85 0xc1 0x20 0x46 # CHECK: abs.d $f6, $f24
+0x45 0x82 0x00 0x46 # CHECK: abs.s $f9, $f16
+0x20 0xb8 0x45 0x02 # CHECK: add $23, $18, $5
+0x48 0x3b 0xc9 0x21 # CHECK: addi $9, $14, 15176
+0xe7 0xe3 0x18 0x23 # CHECK: addi $24, $24, -7193
+0x00 0x30 0x3c 0x46 # CHECK: add.d $f0, $f6, $f28
+0x00 0xaa 0x18 0x46 # CHECK: add.s $f8, $f21, $f24
+0xd2 0x66 0x2d 0x21 # CHECK: addi $13, $9, 26322
+0xfe 0xff 0x08 0x21 # CHECK: addi $8, $8, -2
+0x21 0x48 0x86 0x00 # CHECK: addu $9, $4, $6
+0x0a 0x00 0x29 0x25 # CHECK: addiu $9, $9, 10
+0x24 0xb8 0x4c 0x00 # CHECK: and $23, $2, $12
+0x04 0x00 0x42 0x30 # CHECK: andi $2, $2, 4
+0x01 0x00 0x00 0x45 # CHECK: bc1f 8
+0x0c 0x00 0x02 0x45 # CHECK: bc1fl 52
+0x01 0x00 0x01 0x45 # CHECK: bc1t 8
+0xf4 0xf7 0x03 0x45 # CHECK: bc1tl -8236
+0x9b 0x14 0x11 0x04 # CHECK: bal 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x9b 0x14 0xd0 0x04 # CHECK: bltzal $6, 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x40 0x0c 0xd3 0x51 # CHECK: beql $14, $19, 12548
+0x1f 0x07 0x93 0x05 # CHECK: bgezall $12, 7296
+0x4d 0xf9 0x83 0x04 # CHECK: bgezl $4, -6856
+0x59 0xfc 0x40 0x5d # CHECK: bgtzl $10, -3736
+0xe7 0x02 0xc0 0x58 # CHECK: blezl $6, 2976
+0x7a 0x00 0xd2 0x04 # CHECK: bltzall $6, 492
+0x45 0xf6 0x22 0x06 # CHECK: bltzl $17, -9960
+0xfc 0x04 0x94 0x57 # CHECK: bnel $gp, $20, 5108
+0x3b 0xe0 0x3c 0x46 # CHECK: c.ngl.d $f28, $f28
+0x39 0x00 0x30 0x46 # CHECK: c.ngle.d $f0, $f16
+0x38 0xf0 0x20 0x46 # CHECK: c.sf.d $f30, $f0
+0x38 0x70 0x16 0x46 # CHECK: c.sf.s $f14, $f22
+0xce 0xc2 0x20 0x46 # CHECK: ceil.w.d $f11, $f24
+0x8e 0xa1 0x00 0x46 # CHECK: ceil.w.s $f6, $f20
+0x00 0xa8 0x51 0x44 # CHECK: cfc1 $17, $21
+0x00 0xd0 0xc6 0x44 # CHECK: ctc1 $6, $26
+0xa1 0xe5 0x00 0x46 # CHECK: cvt.d.s $f22, $f28
+0xa1 0x5e 0x80 0x46 # CHECK: cvt.d.w $f26, $f11
+0xa0 0x46 0x20 0x46 # CHECK: cvt.s.d $f26, $f8
+0xa0 0x7d 0x80 0x46 # CHECK: cvt.s.w $f22, $f15
+0x24 0x75 0x20 0x46 # CHECK: cvt.w.d $f20, $f14
+0x24 0xc5 0x00 0x46 # CHECK: cvt.w.s $f20, $f24
+0x1a 0x00 0x2b 0x03 # CHECK: div $zero, $25, $11
+0x03 0xa7 0x3a 0x46 # CHECK: div.d $f28, $f20, $f26
+0x03 0x29 0x0f 0x46 # CHECK: div.s $f4, $f5, $f15
+0x1b 0x00 0x2f 0x03 # CHECK: divu $zero, $25, $15
+0xc0 0x00 0x00 0x00 # CHECK: ehb
+0x8f 0x53 0x20 0x46 # CHECK: floor.w.d $f14, $f10
+0x0f 0x4a 0x00 0x46 # CHECK: floor.w.s $f8, $f9
+0x4d 0xc7 0x58 0x81 # CHECK: lb $24, -14515($10)
+0xf3 0x75 0x68 0x90 # CHECK: lbu $8, 30195($3)
+0x07 0x40 0x0a 0xd6 # CHECK: ldc1 $f10, 16391($16)
+0x43 0xad 0x28 0xd8 # CHECK: ldc2 $8, -21181($1)
+0x1b 0x90 0x3d 0xde # CHECK: ldc3 $29, -28645($17)
+0x94 0xde 0xab 0x86 # CHECK: lh $11, -8556($21)
+0xbd 0xa6 0x53 0x94 # CHECK: lhu $19, -22851($2)
+0xb3 0x8b 0x01 0x24 # CHECK: addiu $1, $zero, -29773
+0x3f 0x8b 0x00 0x24 # CHECK: addiu $zero, $zero, -29889
+0x67 0xe3 0x42 0xc2 # CHECK: ll $2, -7321($18)
+0x2a 0x16 0xa8 0x8c # CHECK: lw $8, 5674($5)
+0xf1 0x27 0x50 0xc7 # CHECK: lwc1 $f16, 10225($26)
+0xb7 0xfc 0xd2 0xc8 # CHECK: lwc2 $18, -841($6)
+0xf7 0x81 0x4a 0xcf # CHECK: lwc3 $10, -32265($26)
+0x79 0xef 0xf4 0x89 # CHECK: lwl $20, -4231($15)
+0x35 0xb5 0x80 0x9b # CHECK: lwr $zero, -19147($gp)
+0x00 0xd8 0x07 0x44 # CHECK: mfc1 $7, $f27
+0x10 0x98 0x00 0x00 # CHECK: mfhi $19
+0x10 0xe8 0x00 0x00 # CHECK: mfhi $sp
+0x12 0x88 0x00 0x00 # CHECK: mflo $17
+0x06 0x75 0x20 0x46 # CHECK: mov.d $f20, $f14
+0x86 0xd8 0x00 0x46 # CHECK: mov.s $f2, $f27
+0x21 0xf0 0x80 0x00 # CHECK: move $fp, $4
+0x21 0xc8 0xc0 0x00 # CHECK: move $25, $6
+0x00 0x48 0x9e 0x44 # CHECK: mtc1 $fp, $f9
+0x11 0x00 0x20 0x02 # CHECK: mthi $17
+0x13 0x00 0xa0 0x03 # CHECK: mtlo $sp
+0x13 0x00 0x20 0x03 # CHECK: mtlo $25
+0x02 0xa5 0x30 0x46 # CHECK: mul.d $f20, $f20, $f16
+0x82 0x57 0x02 0x46 # CHECK: mul.s $f30, $f10, $f2
+0x18 0x00 0xb4 0x03 # CHECK: mult $sp, $20
+0x18 0x00 0xa2 0x03 # CHECK: mult $sp, $2
+0x19 0x00 0x9a 0x03 # CHECK: multu $gp, $26
+0x19 0x00 0x32 0x01 # CHECK: multu $9, $18
+0x23 0x10 0x02 0x00 # CHECK: negu $2, $2
+0x23 0x10 0x03 0x00 # CHECK: negu $2, $3
+0x87 0x96 0x20 0x46 # CHECK: neg.d $f26, $f18
+0x47 0x78 0x00 0x46 # CHECK: neg.s $f1, $f15
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x38 0x07 0x00 # CHECK: nor $7, $zero, $7
+0x25 0x60 0x1d 0x02 # CHECK: or $12, $16, $sp
+0x04 0x00 0x42 0x34 # CHECK: ori $2, $2, 4
+0x8c 0x21 0x20 0x46 # CHECK: round.w.d $f6, $f4
+0xcc 0xe6 0x00 0x46 # CHECK: round.w.s $f27, $f28
+0x6f 0xb2 0xd6 0xa1 # CHECK: sb $22, -19857($14)
+0xd8 0x49 0x6f 0xe2 # CHECK: sc $15, 18904($19)
+0x6e 0x77 0xbe 0xf5 # CHECK: sdc1 $f30, 30574($13)
+0x75 0x5a 0x54 0xfa # CHECK: sdc2 $20, 23157($18)
+0xcb 0x16 0x4c 0xfd # CHECK: sdc3 $12, 5835($10)
+0xd0 0xe5 0xee 0xa5 # CHECK: sh $14, -6704($15)
+0x80 0x3c 0x07 0x00 # CHECK: sll $7, $7, 18
+0x80 0x3c 0x00 0x00 # CHECK: sll $7, $zero, 18
+0x04 0x38 0x20 0x01 # CHECK: sllv $7, $zero, $9
+0x04 0x38 0x20 0x01 # CHECK: sllv $7, $zero, $9
+0x2a 0xb8 0x7b 0x01 # CHECK: slt $23, $11, $27
+0x11 0x25 0x51 0x29 # CHECK: slti $17, $10, 9489
+0x55 0xc3 0x39 0x2f # CHECK: sltiu $25, $25, -15531
+0x2b 0xa0 0xab 0x02 # CHECK: sltu $20, $21, $11
+0x55 0xc3 0x38 0x2f # CHECK: sltiu $24, $25, -15531
+0x04 0xb4 0x20 0x46 # CHECK: sqrt.d $f16, $f22
+0x04 0x08 0x00 0x46 # CHECK: sqrt.s $f0, $f1
+0xc3 0x8b 0x11 0x00 # CHECK: sra $17, $17, 15
+0xc3 0x8b 0x17 0x00 # CHECK: sra $17, $23, 15
+0x07 0x88 0xb7 0x03 # CHECK: srav $17, $23, $sp
+0x07 0x88 0xb7 0x03 # CHECK: srav $17, $23, $sp
+0xc2 0x11 0x02 0x00 # CHECK: srl $2, $2, 7
+0xc2 0x11 0x02 0x00 # CHECK: srl $2, $2, 7
+0x06 0xc8 0x94 0x00 # CHECK: srlv $25, $20, $4
+0x06 0xc8 0x94 0x00 # CHECK: srlv $25, $20, $4
+0x40 0x00 0x00 0x00 # CHECK: ssnop
+0x22 0xb0 0x6c 0x02 # CHECK: sub $22, $19, $12
+0x36 0x0c 0x36 0x22 # CHECK: addi $22, $17, 3126
+0x90 0xe6 0xad 0x21 # CHECK: addi $13, $13, -6512
+0x81 0x14 0x30 0x46 # CHECK: sub.d $f18, $f2, $f16
+0xc1 0xb5 0x16 0x46 # CHECK: sub.s $f23, $f22, $f22
+0x23 0xe8 0xd6 0x02 # CHECK: subu $sp, $22, $22
+0x50 0xd8 0xbf 0xaf # CHECK: sw $ra, -10160($sp)
+0xef 0xde 0x06 0xe7 # CHECK: swc1 $f6, -8465($24)
+0x30 0x61 0x19 0xea # CHECK: swc2 $25, 24880($16)
+0xf7 0x81 0x4a 0xef # CHECK: swc3 $10, -32265($26)
+0x7e 0x35 0x6f 0xaa # CHECK: swl $15, 13694($19)
+0x22 0x98 0xd1 0xb9 # CHECK: swr $17, -26590($14)
+0x34 0x00 0x03 0x00 # CHECK: teq $zero, $3
+0x34 0x9b 0xa7 0x00 # CHECK: teq $5, $7, 620
+0xa0 0xbb 0xac 0x06 # CHECK: teqi $21, 48032
+0x30 0x00 0xea 0x00 # CHECK: tge $7, $10
+0x30 0x55 0xb3 0x00 # CHECK: tge $5, $19, 340
+0xa1 0x13 0x28 0x06 # CHECK: tgei $17, 5025
+0x33 0x90 0xa9 0x07 # CHECK: tgeiu $sp, 36915
+0x31 0x00 0xdc 0x02 # CHECK: tgeu $22, $gp
+0xf1 0x5e 0x8e 0x02 # CHECK: tgeu $20, $14, 379
+0x08 0x00 0x00 0x42 # CHECK: tlbp
+0x01 0x00 0x00 0x42 # CHECK: tlbr
+0x02 0x00 0x00 0x42 # CHECK: tlbwi
+0x06 0x00 0x00 0x42 # CHECK: tlbwr
+0x32 0x00 0xed 0x01 # CHECK: tlt $15, $13
+0x72 0x21 0x53 0x00 # CHECK: tlt $2, $19, 133
+0xbd 0xad 0xca 0x05 # CHECK: tlti $14, 44477
+0x2c 0xec 0xeb 0x07 # CHECK: tltiu $ra, 60460
+0x33 0x00 0x70 0x01 # CHECK: tltu $11, $16
+0x33 0xfe 0x1d 0x02 # CHECK: tltu $16, $sp, 1016
+0x36 0x00 0xd1 0x00 # CHECK: tne $6, $17
+0x76 0xdd 0xe8 0x00 # CHECK: tne $7, $8, 885
+0x31 0x8c 0x8e 0x05 # CHECK: tnei $12, 35889
+0x8d 0x75 0x20 0x46 # CHECK: trunc.w.d $f22, $f14
+0x0d 0xf7 0x00 0x46 # CHECK: trunc.w.s $f28, $f30
+0x26 0x90 0x9e 0x00 # CHECK: xor $18, $4, $fp
diff --git a/test/MC/Disassembler/Mips/mips2/valid-mips2.txt b/test/MC/Disassembler/Mips/mips2/valid-mips2.txt
new file mode 100644
index 0000000..3dc5231
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips2/valid-mips2.txt
@@ -0,0 +1,159 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -disassemble -mcpu=mips2 | FileCheck %s
+# CHECK: .text
+0x46 0x20 0xc1 0x85 # CHECK: abs.d $f6, $f24
+0x46 0x00 0x82 0x45 # CHECK: abs.s $f9, $f16
+0x02 0x45 0xb8 0x20 # CHECK: add $23, $18, $5
+0x21 0xc9 0x3b 0x48 # CHECK: addi $9, $14, 15176
+0x23 0x18 0xe3 0xe7 # CHECK: addi $24, $24, -7193
+0x46 0x3c 0x30 0x00 # CHECK: add.d $f0, $f6, $f28
+0x46 0x18 0xaa 0x00 # CHECK: add.s $f8, $f21, $f24
+0x21 0x2d 0x66 0xd2 # CHECK: addi $13, $9, 26322
+0x21 0x08 0xff 0xfe # CHECK: addi $8, $8, -2
+0x00 0x86 0x48 0x21 # CHECK: addu $9, $4, $6
+0x25 0x29 0x00 0x0a # CHECK: addiu $9, $9, 10
+0x00 0x4c 0xb8 0x24 # CHECK: and $23, $2, $12
+0x30 0x42 0x00 0x04 # CHECK: andi $2, $2, 4
+0x45 0x00 0x00 0x01 # CHECK: bc1f 8
+0x45 0x02 0x00 0x0c # CHECK: bc1fl 52
+0x45 0x01 0x00 0x01 # CHECK: bc1t 8
+0x45 0x03 0xf7 0xf4 # CHECK: bc1tl -8236
+0x04 0x11 0x14 0x9b # CHECK: bal 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x04 0xd0 0x14 0x9b # CHECK: bltzal $6, 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x51 0xd3 0x0c 0x40 # CHECK: beql $14, $19, 12548
+0x05 0x93 0x07 0x1f # CHECK: bgezall $12, 7296
+0x04 0x83 0xf9 0x4d # CHECK: bgezl $4, -6856
+0x5d 0x40 0xfc 0x59 # CHECK: bgtzl $10, -3736
+0x58 0xc0 0x02 0xe7 # CHECK: blezl $6, 2976
+0x04 0xd2 0x00 0x7a # CHECK: bltzall $6, 492
+0x06 0x22 0xf6 0x45 # CHECK: bltzl $17, -9960
+0x57 0x94 0x04 0xfc # CHECK: bnel $gp, $20, 5108
+0x46 0x3c 0xe0 0x3b # CHECK: c.ngl.d $f28, $f28
+0x46 0x30 0x00 0x39 # CHECK: c.ngle.d $f0, $f16
+0x46 0x20 0xf0 0x38 # CHECK: c.sf.d $f30, $f0
+0x46 0x16 0x70 0x38 # CHECK: c.sf.s $f14, $f22
+0x46 0x20 0xc2 0xce # CHECK: ceil.w.d $f11, $f24
+0x46 0x00 0xa1 0x8e # CHECK: ceil.w.s $f6, $f20
+0x44 0x51 0xa8 0x00 # CHECK: cfc1 $17, $21
+0x44 0xc6 0xd0 0x00 # CHECK: ctc1 $6, $26
+0x46 0x00 0xe5 0xa1 # CHECK: cvt.d.s $f22, $f28
+0x46 0x80 0x5e 0xa1 # CHECK: cvt.d.w $f26, $f11
+0x46 0x20 0x46 0xa0 # CHECK: cvt.s.d $f26, $f8
+0x46 0x80 0x7d 0xa0 # CHECK: cvt.s.w $f22, $f15
+0x46 0x20 0x75 0x24 # CHECK: cvt.w.d $f20, $f14
+0x46 0x00 0xc5 0x24 # CHECK: cvt.w.s $f20, $f24
+0x03 0x2b 0x00 0x1a # CHECK: div $zero, $25, $11
+0x46 0x3a 0xa7 0x03 # CHECK: div.d $f28, $f20, $f26
+0x46 0x0f 0x29 0x03 # CHECK: div.s $f4, $f5, $f15
+0x03 0x2f 0x00 0x1b # CHECK: divu $zero, $25, $15
+0x00 0x00 0x00 0xc0 # CHECK: ehb
+0x46 0x20 0x53 0x8f # CHECK: floor.w.d $f14, $f10
+0x46 0x00 0x4a 0x0f # CHECK: floor.w.s $f8, $f9
+0x81 0x58 0xc7 0x4d # CHECK: lb $24, -14515($10)
+0x90 0x68 0x75 0xf3 # CHECK: lbu $8, 30195($3)
+0xd6 0x0a 0x40 0x07 # CHECK: ldc1 $f10, 16391($16)
+0xd8 0x28 0xad 0x43 # CHECK: ldc2 $8, -21181($1)
+0xde 0x3d 0x90 0x1b # CHECK: ldc3 $29, -28645($17)
+0x86 0xab 0xde 0x94 # CHECK: lh $11, -8556($21)
+0x94 0x53 0xa6 0xbd # CHECK: lhu $19, -22851($2)
+0x24 0x01 0x8b 0xb3 # CHECK: addiu $1, $zero, -29773
+0x24 0x00 0x8b 0x3f # CHECK: addiu $zero, $zero, -29889
+0xc2 0x42 0xe3 0x67 # CHECK: ll $2, -7321($18)
+0x8c 0xa8 0x16 0x2a # CHECK: lw $8, 5674($5)
+0xc7 0x50 0x27 0xf1 # CHECK: lwc1 $f16, 10225($26)
+0xc8 0xd2 0xfc 0xb7 # CHECK: lwc2 $18, -841($6)
+0xcf 0x4a 0x81 0xf7 # CHECK: lwc3 $10, -32265($26)
+0x89 0xf4 0xef 0x79 # CHECK: lwl $20, -4231($15)
+0x9b 0x80 0xb5 0x35 # CHECK: lwr $zero, -19147($gp)
+0x44 0x07 0xd8 0x00 # CHECK: mfc1 $7, $f27
+0x00 0x00 0x98 0x10 # CHECK: mfhi $19
+0x00 0x00 0xe8 0x10 # CHECK: mfhi $sp
+0x00 0x00 0x88 0x12 # CHECK: mflo $17
+0x46 0x20 0x75 0x06 # CHECK: mov.d $f20, $f14
+0x46 0x00 0xd8 0x86 # CHECK: mov.s $f2, $f27
+0x00 0x80 0xf0 0x21 # CHECK: move $fp, $4
+0x00 0xc0 0xc8 0x21 # CHECK: move $25, $6
+0x44 0x9e 0x48 0x00 # CHECK: mtc1 $fp, $f9
+0x02 0x20 0x00 0x11 # CHECK: mthi $17
+0x03 0xa0 0x00 0x13 # CHECK: mtlo $sp
+0x03 0x20 0x00 0x13 # CHECK: mtlo $25
+0x46 0x30 0xa5 0x02 # CHECK: mul.d $f20, $f20, $f16
+0x46 0x02 0x57 0x82 # CHECK: mul.s $f30, $f10, $f2
+0x03 0xb4 0x00 0x18 # CHECK: mult $sp, $20
+0x03 0xa2 0x00 0x18 # CHECK: mult $sp, $2
+0x03 0x9a 0x00 0x19 # CHECK: multu $gp, $26
+0x01 0x32 0x00 0x19 # CHECK: multu $9, $18
+0x00 0x02 0x10 0x23 # CHECK: negu $2, $2
+0x00 0x03 0x10 0x23 # CHECK: negu $2, $3
+0x46 0x20 0x96 0x87 # CHECK: neg.d $f26, $f18
+0x46 0x00 0x78 0x47 # CHECK: neg.s $f1, $f15
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0x07 0x38 0x27 # CHECK: nor $7, $zero, $7
+0x02 0x1d 0x60 0x25 # CHECK: or $12, $16, $sp
+0x34 0x42 0x00 0x04 # CHECK: ori $2, $2, 4
+0x46 0x20 0x21 0x8c # CHECK: round.w.d $f6, $f4
+0x46 0x00 0xe6 0xcc # CHECK: round.w.s $f27, $f28
+0xa1 0xd6 0xb2 0x6f # CHECK: sb $22, -19857($14)
+0xe2 0x6f 0x49 0xd8 # CHECK: sc $15, 18904($19)
+0xf5 0xbe 0x77 0x6e # CHECK: sdc1 $f30, 30574($13)
+0xfa 0x54 0x5a 0x75 # CHECK: sdc2 $20, 23157($18)
+0xfd 0x4c 0x16 0xcb # CHECK: sdc3 $12, 5835($10)
+0xa5 0xee 0xe5 0xd0 # CHECK: sh $14, -6704($15)
+0x00 0x07 0x3c 0x80 # CHECK: sll $7, $7, 18
+0x00 0x00 0x3c 0x80 # CHECK: sll $7, $zero, 18
+0x01 0x20 0x38 0x04 # CHECK: sllv $7, $zero, $9
+0x01 0x20 0x38 0x04 # CHECK: sllv $7, $zero, $9
+0x01 0x7b 0xb8 0x2a # CHECK: slt $23, $11, $27
+0x29 0x51 0x25 0x11 # CHECK: slti $17, $10, 9489
+0x2f 0x39 0xc3 0x55 # CHECK: sltiu $25, $25, -15531
+0x02 0xab 0xa0 0x2b # CHECK: sltu $20, $21, $11
+0x2f 0x38 0xc3 0x55 # CHECK: sltiu $24, $25, -15531
+0x46 0x20 0xb4 0x04 # CHECK: sqrt.d $f16, $f22
+0x46 0x00 0x08 0x04 # CHECK: sqrt.s $f0, $f1
+0x00 0x11 0x8b 0xc3 # CHECK: sra $17, $17, 15
+0x00 0x17 0x8b 0xc3 # CHECK: sra $17, $23, 15
+0x03 0xb7 0x88 0x07 # CHECK: srav $17, $23, $sp
+0x03 0xb7 0x88 0x07 # CHECK: srav $17, $23, $sp
+0x00 0x02 0x11 0xc2 # CHECK: srl $2, $2, 7
+0x00 0x02 0x11 0xc2 # CHECK: srl $2, $2, 7
+0x00 0x94 0xc8 0x06 # CHECK: srlv $25, $20, $4
+0x00 0x94 0xc8 0x06 # CHECK: srlv $25, $20, $4
+0x00 0x00 0x00 0x40 # CHECK: ssnop
+0x02 0x6c 0xb0 0x22 # CHECK: sub $22, $19, $12
+0x22 0x36 0x0c 0x36 # CHECK: addi $22, $17, 3126
+0x21 0xad 0xe6 0x90 # CHECK: addi $13, $13, -6512
+0x46 0x30 0x14 0x81 # CHECK: sub.d $f18, $f2, $f16
+0x46 0x16 0xb5 0xc1 # CHECK: sub.s $f23, $f22, $f22
+0x02 0xd6 0xe8 0x23 # CHECK: subu $sp, $22, $22
+0xaf 0xbf 0xd8 0x50 # CHECK: sw $ra, -10160($sp)
+0xe7 0x06 0xde 0xef # CHECK: swc1 $f6, -8465($24)
+0xea 0x19 0x61 0x30 # CHECK: swc2 $25, 24880($16)
+0xef 0x4a 0x81 0xf7 # CHECK: swc3 $10, -32265($26)
+0xaa 0x6f 0x35 0x7e # CHECK: swl $15, 13694($19)
+0xb9 0xd1 0x98 0x22 # CHECK: swr $17, -26590($14)
+0x00 0x03 0x00 0x34 # CHECK: teq $zero, $3
+0x00 0xa7 0x9b 0x34 # CHECK: teq $5, $7, 620
+0x06 0xac 0xbb 0xa0 # CHECK: teqi $21, 48032
+0x00 0xea 0x00 0x30 # CHECK: tge $7, $10
+0x00 0xb3 0x55 0x30 # CHECK: tge $5, $19, 340
+0x06 0x28 0x13 0xa1 # CHECK: tgei $17, 5025
+0x07 0xa9 0x90 0x33 # CHECK: tgeiu $sp, 36915
+0x02 0xdc 0x00 0x31 # CHECK: tgeu $22, $gp
+0x02 0x8e 0x5e 0xf1 # CHECK: tgeu $20, $14, 379
+0x42 0x00 0x00 0x08 # CHECK: tlbp
+0x42 0x00 0x00 0x01 # CHECK: tlbr
+0x42 0x00 0x00 0x02 # CHECK: tlbwi
+0x42 0x00 0x00 0x06 # CHECK: tlbwr
+0x01 0xed 0x00 0x32 # CHECK: tlt $15, $13
+0x00 0x53 0x21 0x72 # CHECK: tlt $2, $19, 133
+0x05 0xca 0xad 0xbd # CHECK: tlti $14, 44477
+0x07 0xeb 0xec 0x2c # CHECK: tltiu $ra, 60460
+0x01 0x70 0x00 0x33 # CHECK: tltu $11, $16
+0x02 0x1d 0xfe 0x33 # CHECK: tltu $16, $sp, 1016
+0x00 0xd1 0x00 0x36 # CHECK: tne $6, $17
+0x00 0xe8 0xdd 0x76 # CHECK: tne $7, $8, 885
+0x05 0x8e 0x8c 0x31 # CHECK: tnei $12, 35889
+0x46 0x20 0x75 0x8d # CHECK: trunc.w.d $f22, $f14
+0x46 0x00 0xf7 0x0d # CHECK: trunc.w.s $f28, $f30
+0x00 0x9e 0x90 0x26 # CHECK: xor $18, $4, $fp
diff --git a/test/MC/Disassembler/Mips/mips3/valid-mips3-el.txt b/test/MC/Disassembler/Mips/mips3/valid-mips3-el.txt
new file mode 100644
index 0000000..98ce16b
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips3/valid-mips3-el.txt
@@ -0,0 +1,209 @@
+# RUN: llvm-mc %s -triple=mips64el-unknown-linux -disassemble -mcpu=mips3 | FileCheck %s
+# CHECK: .text
+0x85 0xc1 0x20 0x46 # CHECK: abs.d $f6, $f24
+0x45 0x82 0x00 0x46 # CHECK: abs.s $f9, $f16
+0x20 0xb8 0x45 0x02 # CHECK: add $23, $18, $5
+0x48 0x3b 0xc9 0x21 # CHECK: addi $9, $14, 15176
+0xe7 0xe3 0x18 0x23 # CHECK: addi $24, $24, -7193
+0x00 0x30 0x3c 0x46 # CHECK: add.d $f0, $f6, $f28
+0x00 0xaa 0x18 0x46 # CHECK: add.s $f8, $f21, $f24
+0xd2 0x66 0x2d 0x21 # CHECK: addi $13, $9, 26322
+0xfe 0xff 0x08 0x21 # CHECK: addi $8, $8, -2
+0x21 0x48 0x86 0x00 # CHECK: addu $9, $4, $6
+0x0a 0x00 0x29 0x25 # CHECK: addiu $9, $9, 10
+0x24 0xb8 0x4c 0x00 # CHECK: and $23, $2, $12
+0x04 0x00 0x42 0x30 # CHECK: andi $2, $2, 4
+0x01 0x00 0x00 0x45 # CHECK: bc1f 8
+0x0c 0x00 0x02 0x45 # CHECK: bc1fl 52
+0x01 0x00 0x01 0x45 # CHECK: bc1t 8
+0xf4 0xf7 0x03 0x45 # CHECK: bc1tl -8236
+0x9b 0x14 0x11 0x04 # CHECK: bal 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x9b 0x14 0xd0 0x04 # CHECK: bltzal $6, 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x40 0x0c 0xd3 0x51 # CHECK: beql $14, $19, 12548
+0x1f 0x07 0x93 0x05 # CHECK: bgezall $12, 7296
+0x4d 0xf9 0x83 0x04 # CHECK: bgezl $4, -6856
+0x59 0xfc 0x40 0x5d # CHECK: bgtzl $10, -3736
+0xe7 0x02 0xc0 0x58 # CHECK: blezl $6, 2976
+0x7a 0x00 0xd2 0x04 # CHECK: bltzall $6, 492
+0x45 0xf6 0x22 0x06 # CHECK: bltzl $17, -9960
+0xfc 0x04 0x94 0x57 # CHECK: bnel $gp, $20, 5108
+0x08 0x00 0xa1 0xbc # CHECK: cache 1, 8($5)
+0x3b 0xe0 0x3c 0x46 # CHECK: c.ngl.d $f28, $f28
+0x39 0x00 0x30 0x46 # CHECK: c.ngle.d $f0, $f16
+0x38 0xf0 0x20 0x46 # CHECK: c.sf.d $f30, $f0
+0x38 0x70 0x16 0x46 # CHECK: c.sf.s $f14, $f22
+0x4a 0x18 0x20 0x46 # CHECK: ceil.l.d $f1, $f3
+0x8a 0x6c 0x00 0x46 # CHECK: ceil.l.s $f18, $f13
+0xce 0xc2 0x20 0x46 # CHECK: ceil.w.d $f11, $f24
+0x8e 0xa1 0x00 0x46 # CHECK: ceil.w.s $f6, $f20
+0x00 0xa8 0x51 0x44 # CHECK: cfc1 $17, $21
+0x00 0xd0 0xc6 0x44 # CHECK: ctc1 $6, $26
+0xa1 0xe5 0x00 0x46 # CHECK: cvt.d.s $f22, $f28
+0xa1 0x5e 0x80 0x46 # CHECK: cvt.d.w $f26, $f11
+0x21 0x81 0xa0 0x46 # CHECK: cvt.d.l $f4, $f16
+0x25 0x7e 0x20 0x46 # CHECK: cvt.l.d $f24, $f15
+0xe5 0xea 0x00 0x46 # CHECK: cvt.l.s $f11, $f29
+0xe0 0xf3 0xa0 0x46 # CHECK: cvt.s.l $f15, $f30
+0xa0 0x46 0x20 0x46 # CHECK: cvt.s.d $f26, $f8
+0xa0 0x7d 0x80 0x46 # CHECK: cvt.s.w $f22, $f15
+0x24 0x75 0x20 0x46 # CHECK: cvt.w.d $f20, $f14
+0x24 0xc5 0x00 0x46 # CHECK: cvt.w.s $f20, $f24
+0x2c 0x98 0x3f 0x00 # CHECK: dadd $19, $1, $ra
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0x16 0xee 0xda 0x66 # CHECK: daddiu $26, $22, -4586
+0x2d 0x98 0x3f 0x00 # CHECK: daddu $19, $1, $ra
+0x9f 0x46 0x58 0x64 # CHECK: daddiu $24, $2, 18079
+0x3f 0x69 0x73 0x66 # CHECK: daddiu $19, $19, 26943
+0x1e 0x00 0x53 0x03 # CHECK: ddiv $zero, $26, $19
+0x1f 0x00 0x11 0x02 # CHECK: ddivu $zero, $16, $17
+0x1a 0x00 0x2b 0x03 # CHECK: div $zero, $25, $11
+0x03 0xa7 0x3a 0x46 # CHECK: div.d $f28, $f20, $f26
+0x03 0x29 0x0f 0x46 # CHECK: div.s $f4, $f5, $f15
+0x1b 0x00 0x2f 0x03 # CHECK: divu $zero, $25, $15
+0x00 0x68 0x2c 0x44 # CHECK: dmfc1 $12, $f13
+0x00 0x70 0xb0 0x44 # CHECK: dmtc1 $16, $f14
+0x1c 0x00 0xe9 0x02 # CHECK: dmult $23, $9
+0x1d 0x00 0xa6 0x00 # CHECK: dmultu $5, $6
+0xb8 0x04 0x00 0x00 # CHECK: dsll $zero, $zero, 18
+0xb8 0x04 0x14 0x00 # CHECK: dsll $zero, $20, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbb 0xe2 0x1c 0x00 # CHECK: dsra $gp, $gp, 10
+0xbb 0xe2 0x12 0x00 # CHECK: dsra $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xbf 0xe2 0x1c 0x00 # CHECK: dsra32 $gp, $gp, 10
+0xbf 0xe2 0x12 0x00 # CHECK: dsra32 $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xfa 0x9d 0x13 0x00 # CHECK: dsrl $19, $19, 23
+0xfa 0x9d 0x06 0x00 # CHECK: dsrl $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0xfe 0x9d 0x13 0x00 # CHECK: dsrl32 $19, $19, 23
+0xfe 0x9d 0x06 0x00 # CHECK: dsrl32 $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0x2e 0x38 0xc8 0x02 # CHECK: dsub $7, $22, $8
+0x2f 0x28 0xba 0x00 # CHECK: dsubu $5, $5, $26
+0xc0 0x00 0x00 0x00 # CHECK: ehb
+0x18 0x00 0x00 0x42 # CHECK: eret
+0x8f 0x53 0x20 0x46 # CHECK: floor.w.d $f14, $f10
+0x0f 0x4a 0x00 0x46 # CHECK: floor.w.s $f8, $f9
+0x8b 0x3e 0x20 0x46 # CHECK: floor.l.d $f26, $f7
+0x0b 0x2b 0x00 0x46 # CHECK: floor.l.s $f12, $f5
+0x4d 0xc7 0x58 0x81 # CHECK: lb $24, -14515($10)
+0xf3 0x75 0x68 0x90 # CHECK: lbu $8, 30195($3)
+0x07 0x40 0x0a 0xd6 # CHECK: ldc1 $f10, 16391($16)
+0x43 0xad 0x28 0xd8 # CHECK: ldc2 $8, -21181($1)
+0x94 0xde 0xab 0x86 # CHECK: lh $11, -8556($21)
+0xbd 0xa6 0x53 0x94 # CHECK: lhu $19, -22851($2)
+0xb3 0x8b 0x01 0x24 # CHECK: addiu $1, $zero, -29773
+0x3f 0x8b 0x00 0x24 # CHECK: addiu $zero, $zero, -29889
+0x67 0xe3 0x42 0xc2 # CHECK: ll $2, -7321($18)
+0x2a 0x16 0xa8 0x8c # CHECK: lw $8, 5674($5)
+0xf1 0x27 0x50 0xc7 # CHECK: lwc1 $f16, 10225($26)
+0xb7 0xfc 0xd2 0xc8 # CHECK: lwc2 $18, -841($6)
+0x79 0xef 0xf4 0x89 # CHECK: lwl $20, -4231($15)
+0x35 0xb5 0x80 0x9b # CHECK: lwr $zero, -19147($gp)
+0x00 0xd8 0x07 0x44 # CHECK: mfc1 $7, $f27
+0x10 0x98 0x00 0x00 # CHECK: mfhi $19
+0x10 0xe8 0x00 0x00 # CHECK: mfhi $sp
+0x12 0x88 0x00 0x00 # CHECK: mflo $17
+0x06 0x75 0x20 0x46 # CHECK: mov.d $f20, $f14
+0x86 0xd8 0x00 0x46 # CHECK: mov.s $f2, $f27
+0x21 0xf0 0x80 0x00 # CHECK: move $fp, $4
+0x21 0xc8 0xc0 0x00 # CHECK: move $25, $6
+0x00 0x48 0x9e 0x44 # CHECK: mtc1 $fp, $f9
+0x11 0x00 0x20 0x02 # CHECK: mthi $17
+0x13 0x00 0xa0 0x03 # CHECK: mtlo $sp
+0x13 0x00 0x20 0x03 # CHECK: mtlo $25
+0x02 0xa5 0x30 0x46 # CHECK: mul.d $f20, $f20, $f16
+0x82 0x57 0x02 0x46 # CHECK: mul.s $f30, $f10, $f2
+0x18 0x00 0xb4 0x03 # CHECK: mult $sp, $20
+0x18 0x00 0xa2 0x03 # CHECK: mult $sp, $2
+0x19 0x00 0x9a 0x03 # CHECK: multu $gp, $26
+0x19 0x00 0x32 0x01 # CHECK: multu $9, $18
+0x23 0x10 0x02 0x00 # CHECK: negu $2, $2
+0x23 0x10 0x03 0x00 # CHECK: negu $2, $3
+0x87 0x96 0x20 0x46 # CHECK: neg.d $f26, $f18
+0x47 0x78 0x00 0x46 # CHECK: neg.s $f1, $f15
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x38 0x07 0x00 # CHECK: nor $7, $zero, $7
+0x25 0x60 0x1d 0x02 # CHECK: or $12, $16, $sp
+0x04 0x00 0x42 0x34 # CHECK: ori $2, $2, 4
+0x08 0x0b 0x20 0x46 # CHECK: round.l.d $f12, $f1
+0x48 0x2e 0x00 0x46 # CHECK: round.l.s $f25, $f5
+0x8c 0x21 0x20 0x46 # CHECK: round.w.d $f6, $f4
+0xcc 0xe6 0x00 0x46 # CHECK: round.w.s $f27, $f28
+0x6f 0xb2 0xd6 0xa1 # CHECK: sb $22, -19857($14)
+0xd8 0x49 0x6f 0xe2 # CHECK: sc $15, 18904($19)
+0xcd 0xdf 0xaf 0xf3 # CHECK: scd $15, -8243($sp)
+0xcb 0x16 0x4c 0xfd # CHECK: sd $12, 5835($10)
+0x1f 0xae 0xc7 0xb3 # CHECK: sdl $7, -20961($fp)
+0x39 0xb0 0x8b 0xb5 # CHECK: sdr $11, -20423($12)
+0x6e 0x77 0xbe 0xf5 # CHECK: sdc1 $f30, 30574($13)
+0x75 0x5a 0x54 0xfa # CHECK: sdc2 $20, 23157($18)
+0xd0 0xe5 0xee 0xa5 # CHECK: sh $14, -6704($15)
+0x80 0x3c 0x07 0x00 # CHECK: sll $7, $7, 18
+0x80 0x3c 0x00 0x00 # CHECK: sll $7, $zero, 18
+0x04 0x38 0x20 0x01 # CHECK: sllv $7, $zero, $9
+0x04 0x38 0x20 0x01 # CHECK: sllv $7, $zero, $9
+0x2a 0xb8 0x7b 0x01 # CHECK: slt $23, $11, $27
+0x11 0x25 0x51 0x29 # CHECK: slti $17, $10, 9489
+0x55 0xc3 0x39 0x2f # CHECK: sltiu $25, $25, -15531
+0x2b 0xa0 0xab 0x02 # CHECK: sltu $20, $21, $11
+0x55 0xc3 0x38 0x2f # CHECK: sltiu $24, $25, -15531
+0x04 0xb4 0x20 0x46 # CHECK: sqrt.d $f16, $f22
+0x04 0x08 0x00 0x46 # CHECK: sqrt.s $f0, $f1
+0xc3 0x8b 0x11 0x00 # CHECK: sra $17, $17, 15
+0xc3 0x8b 0x17 0x00 # CHECK: sra $17, $23, 15
+0x07 0x88 0xb7 0x03 # CHECK: srav $17, $23, $sp
+0x07 0x88 0xb7 0x03 # CHECK: srav $17, $23, $sp
+0xc2 0x11 0x02 0x00 # CHECK: srl $2, $2, 7
+0xc2 0x11 0x02 0x00 # CHECK: srl $2, $2, 7
+0x06 0xc8 0x94 0x00 # CHECK: srlv $25, $20, $4
+0x06 0xc8 0x94 0x00 # CHECK: srlv $25, $20, $4
+0x40 0x00 0x00 0x00 # CHECK: ssnop
+0x22 0xb0 0x6c 0x02 # CHECK: sub $22, $19, $12
+0x36 0x0c 0x36 0x22 # CHECK: addi $22, $17, 3126
+0x90 0xe6 0xad 0x21 # CHECK: addi $13, $13, -6512
+0x81 0x14 0x30 0x46 # CHECK: sub.d $f18, $f2, $f16
+0xc1 0xb5 0x16 0x46 # CHECK: sub.s $f23, $f22, $f22
+0x23 0xe8 0xd6 0x02 # CHECK: subu $sp, $22, $22
+0x50 0xd8 0xbf 0xaf # CHECK: sw $ra, -10160($sp)
+0xef 0xde 0x06 0xe7 # CHECK: swc1 $f6, -8465($24)
+0x30 0x61 0x19 0xea # CHECK: swc2 $25, 24880($16)
+0x7e 0x35 0x6f 0xaa # CHECK: swl $15, 13694($19)
+0x22 0x98 0xd1 0xb9 # CHECK: swr $17, -26590($14)
+0x34 0x00 0x03 0x00 # CHECK: teq $zero, $3
+0x34 0x9b 0xa7 0x00 # CHECK: teq $5, $7, 620
+0xa0 0xbb 0xac 0x06 # CHECK: teqi $21, 48032
+0x30 0x00 0xea 0x00 # CHECK: tge $7, $10
+0x30 0x55 0xb3 0x00 # CHECK: tge $5, $19, 340
+0xa1 0x13 0x28 0x06 # CHECK: tgei $17, 5025
+0x33 0x90 0xa9 0x07 # CHECK: tgeiu $sp, 36915
+0x31 0x00 0xdc 0x02 # CHECK: tgeu $22, $gp
+0xf1 0x5e 0x8e 0x02 # CHECK: tgeu $20, $14, 379
+0x08 0x00 0x00 0x42 # CHECK: tlbp
+0x01 0x00 0x00 0x42 # CHECK: tlbr
+0x02 0x00 0x00 0x42 # CHECK: tlbwi
+0x06 0x00 0x00 0x42 # CHECK: tlbwr
+0x32 0x00 0xed 0x01 # CHECK: tlt $15, $13
+0x72 0x21 0x53 0x00 # CHECK: tlt $2, $19, 133
+0xbd 0xad 0xca 0x05 # CHECK: tlti $14, 44477
+0x2c 0xec 0xeb 0x07 # CHECK: tltiu $ra, 60460
+0x33 0x00 0x70 0x01 # CHECK: tltu $11, $16
+0x33 0xfe 0x1d 0x02 # CHECK: tltu $16, $sp, 1016
+0x36 0x00 0xd1 0x00 # CHECK: tne $6, $17
+0x76 0xdd 0xe8 0x00 # CHECK: tne $7, $8, 885
+0x31 0x8c 0x8e 0x05 # CHECK: tnei $12, 35889
+0xc9 0xbd 0x20 0x46 # CHECK: trunc.l.d $f23, $f23
+0x09 0xff 0x00 0x46 # CHECK: trunc.l.s $f28, $f31
+0x8d 0x75 0x20 0x46 # CHECK: trunc.w.d $f22, $f14
+0x0d 0xf7 0x00 0x46 # CHECK: trunc.w.s $f28, $f30
+0x26 0x90 0x9e 0x00 # CHECK: xor $18, $4, $fp
diff --git a/test/MC/Disassembler/Mips/mips3/valid-mips3.txt b/test/MC/Disassembler/Mips/mips3/valid-mips3.txt
new file mode 100644
index 0000000..cb602a3
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips3/valid-mips3.txt
@@ -0,0 +1,209 @@
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -disassemble -mcpu=mips3 | FileCheck %s
+# CHECK: .text
+0x46 0x20 0xc1 0x85 # CHECK: abs.d $f6, $f24
+0x46 0x00 0x82 0x45 # CHECK: abs.s $f9, $f16
+0x02 0x45 0xb8 0x20 # CHECK: add $23, $18, $5
+0x21 0xc9 0x3b 0x48 # CHECK: addi $9, $14, 15176
+0x23 0x18 0xe3 0xe7 # CHECK: addi $24, $24, -7193
+0x46 0x3c 0x30 0x00 # CHECK: add.d $f0, $f6, $f28
+0x46 0x18 0xaa 0x00 # CHECK: add.s $f8, $f21, $f24
+0x21 0x2d 0x66 0xd2 # CHECK: addi $13, $9, 26322
+0x21 0x08 0xff 0xfe # CHECK: addi $8, $8, -2
+0x00 0x86 0x48 0x21 # CHECK: addu $9, $4, $6
+0x25 0x29 0x00 0x0a # CHECK: addiu $9, $9, 10
+0x00 0x4c 0xb8 0x24 # CHECK: and $23, $2, $12
+0x30 0x42 0x00 0x04 # CHECK: andi $2, $2, 4
+0x45 0x00 0x00 0x01 # CHECK: bc1f 8
+0x45 0x02 0x00 0x0c # CHECK: bc1fl 52
+0x45 0x01 0x00 0x01 # CHECK: bc1t 8
+0x45 0x03 0xf7 0xf4 # CHECK: bc1tl -8236
+0x04 0x11 0x14 0x9b # CHECK: bal 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x04 0xd0 0x14 0x9b # CHECK: bltzal $6, 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x51 0xd3 0x0c 0x40 # CHECK: beql $14, $19, 12548
+0x05 0x93 0x07 0x1f # CHECK: bgezall $12, 7296
+0x04 0x83 0xf9 0x4d # CHECK: bgezl $4, -6856
+0x5d 0x40 0xfc 0x59 # CHECK: bgtzl $10, -3736
+0x58 0xc0 0x02 0xe7 # CHECK: blezl $6, 2976
+0x04 0xd2 0x00 0x7a # CHECK: bltzall $6, 492
+0x06 0x22 0xf6 0x45 # CHECK: bltzl $17, -9960
+0x57 0x94 0x04 0xfc # CHECK: bnel $gp, $20, 5108
+0xbc 0xa1 0x00 0x08 # CHECK: cache 1, 8($5)
+0x46 0x3c 0xe0 0x3b # CHECK: c.ngl.d $f28, $f28
+0x46 0x30 0x00 0x39 # CHECK: c.ngle.d $f0, $f16
+0x46 0x20 0xf0 0x38 # CHECK: c.sf.d $f30, $f0
+0x46 0x16 0x70 0x38 # CHECK: c.sf.s $f14, $f22
+0x46 0x20 0x18 0x4a # CHECK: ceil.l.d $f1, $f3
+0x46 0x00 0x6c 0x8a # CHECK: ceil.l.s $f18, $f13
+0x46 0x20 0xc2 0xce # CHECK: ceil.w.d $f11, $f24
+0x46 0x00 0xa1 0x8e # CHECK: ceil.w.s $f6, $f20
+0x44 0x51 0xa8 0x00 # CHECK: cfc1 $17, $21
+0x44 0xc6 0xd0 0x00 # CHECK: ctc1 $6, $26
+0x46 0x00 0xe5 0xa1 # CHECK: cvt.d.s $f22, $f28
+0x46 0x80 0x5e 0xa1 # CHECK: cvt.d.w $f26, $f11
+0x46 0xa0 0x81 0x21 # CHECK: cvt.d.l $f4, $f16
+0x46 0x20 0x7e 0x25 # CHECK: cvt.l.d $f24, $f15
+0x46 0x00 0xea 0xe5 # CHECK: cvt.l.s $f11, $f29
+0x46 0xa0 0xf3 0xe0 # CHECK: cvt.s.l $f15, $f30
+0x46 0x20 0x46 0xa0 # CHECK: cvt.s.d $f26, $f8
+0x46 0x80 0x7d 0xa0 # CHECK: cvt.s.w $f22, $f15
+0x46 0x20 0x75 0x24 # CHECK: cvt.w.d $f20, $f14
+0x46 0x00 0xc5 0x24 # CHECK: cvt.w.s $f20, $f24
+0x00 0x3f 0x98 0x2c # CHECK: dadd $19, $1, $ra
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x66 0xda 0xee 0x16 # CHECK: daddiu $26, $22, -4586
+0x00 0x3f 0x98 0x2d # CHECK: daddu $19, $1, $ra
+0x64 0x58 0x46 0x9f # CHECK: daddiu $24, $2, 18079
+0x66 0x73 0x69 0x3f # CHECK: daddiu $19, $19, 26943
+0x03 0x53 0x00 0x1e # CHECK: ddiv $zero, $26, $19
+0x02 0x11 0x00 0x1f # CHECK: ddivu $zero, $16, $17
+0x03 0x2b 0x00 0x1a # CHECK: div $zero, $25, $11
+0x46 0x3a 0xa7 0x03 # CHECK: div.d $f28, $f20, $f26
+0x46 0x0f 0x29 0x03 # CHECK: div.s $f4, $f5, $f15
+0x03 0x2f 0x00 0x1b # CHECK: divu $zero, $25, $15
+0x44 0x2c 0x68 0x00 # CHECK: dmfc1 $12, $f13
+0x44 0xb0 0x70 0x00 # CHECK: dmtc1 $16, $f14
+0x02 0xe9 0x00 0x1c # CHECK: dmult $23, $9
+0x00 0xa6 0x00 0x1d # CHECK: dmultu $5, $6
+0x00 0x00 0x04 0xb8 # CHECK: dsll $zero, $zero, 18
+0x00 0x14 0x04 0xb8 # CHECK: dsll $zero, $20, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x1c 0xe2 0xbb # CHECK: dsra $gp, $gp, 10
+0x00 0x12 0xe2 0xbb # CHECK: dsra $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x1c 0xe2 0xbf # CHECK: dsra32 $gp, $gp, 10
+0x00 0x12 0xe2 0xbf # CHECK: dsra32 $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x13 0x9d 0xfa # CHECK: dsrl $19, $19, 23
+0x00 0x06 0x9d 0xfa # CHECK: dsrl $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x00 0x13 0x9d 0xfe # CHECK: dsrl32 $19, $19, 23
+0x00 0x06 0x9d 0xfe # CHECK: dsrl32 $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x02 0xc8 0x38 0x2e # CHECK: dsub $7, $22, $8
+0x00 0xba 0x28 0x2f # CHECK: dsubu $5, $5, $26
+0x00 0x00 0x00 0xc0 # CHECK: ehb
+0x42 0x00 0x00 0x18 # CHECK: eret
+0x46 0x20 0x53 0x8f # CHECK: floor.w.d $f14, $f10
+0x46 0x00 0x4a 0x0f # CHECK: floor.w.s $f8, $f9
+0x46 0x20 0x3e 0x8b # CHECK: floor.l.d $f26, $f7
+0x46 0x00 0x2b 0x0b # CHECK: floor.l.s $f12, $f5
+0x81 0x58 0xc7 0x4d # CHECK: lb $24, -14515($10)
+0x90 0x68 0x75 0xf3 # CHECK: lbu $8, 30195($3)
+0xd6 0x0a 0x40 0x07 # CHECK: ldc1 $f10, 16391($16)
+0xd8 0x28 0xad 0x43 # CHECK: ldc2 $8, -21181($1)
+0x86 0xab 0xde 0x94 # CHECK: lh $11, -8556($21)
+0x94 0x53 0xa6 0xbd # CHECK: lhu $19, -22851($2)
+0x24 0x01 0x8b 0xb3 # CHECK: addiu $1, $zero, -29773
+0x24 0x00 0x8b 0x3f # CHECK: addiu $zero, $zero, -29889
+0xc2 0x42 0xe3 0x67 # CHECK: ll $2, -7321($18)
+0x8c 0xa8 0x16 0x2a # CHECK: lw $8, 5674($5)
+0xc7 0x50 0x27 0xf1 # CHECK: lwc1 $f16, 10225($26)
+0xc8 0xd2 0xfc 0xb7 # CHECK: lwc2 $18, -841($6)
+0x89 0xf4 0xef 0x79 # CHECK: lwl $20, -4231($15)
+0x9b 0x80 0xb5 0x35 # CHECK: lwr $zero, -19147($gp)
+0x44 0x07 0xd8 0x00 # CHECK: mfc1 $7, $f27
+0x00 0x00 0x98 0x10 # CHECK: mfhi $19
+0x00 0x00 0xe8 0x10 # CHECK: mfhi $sp
+0x00 0x00 0x88 0x12 # CHECK: mflo $17
+0x46 0x20 0x75 0x06 # CHECK: mov.d $f20, $f14
+0x46 0x00 0xd8 0x86 # CHECK: mov.s $f2, $f27
+0x00 0x80 0xf0 0x21 # CHECK: move $fp, $4
+0x00 0xc0 0xc8 0x21 # CHECK: move $25, $6
+0x44 0x9e 0x48 0x00 # CHECK: mtc1 $fp, $f9
+0x02 0x20 0x00 0x11 # CHECK: mthi $17
+0x03 0xa0 0x00 0x13 # CHECK: mtlo $sp
+0x03 0x20 0x00 0x13 # CHECK: mtlo $25
+0x46 0x30 0xa5 0x02 # CHECK: mul.d $f20, $f20, $f16
+0x46 0x02 0x57 0x82 # CHECK: mul.s $f30, $f10, $f2
+0x03 0xb4 0x00 0x18 # CHECK: mult $sp, $20
+0x03 0xa2 0x00 0x18 # CHECK: mult $sp, $2
+0x03 0x9a 0x00 0x19 # CHECK: multu $gp, $26
+0x01 0x32 0x00 0x19 # CHECK: multu $9, $18
+0x00 0x02 0x10 0x23 # CHECK: negu $2, $2
+0x00 0x03 0x10 0x23 # CHECK: negu $2, $3
+0x46 0x20 0x96 0x87 # CHECK: neg.d $f26, $f18
+0x46 0x00 0x78 0x47 # CHECK: neg.s $f1, $f15
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0x07 0x38 0x27 # CHECK: nor $7, $zero, $7
+0x02 0x1d 0x60 0x25 # CHECK: or $12, $16, $sp
+0x34 0x42 0x00 0x04 # CHECK: ori $2, $2, 4
+0x46 0x20 0x0b 0x08 # CHECK: round.l.d $f12, $f1
+0x46 0x00 0x2e 0x48 # CHECK: round.l.s $f25, $f5
+0x46 0x20 0x21 0x8c # CHECK: round.w.d $f6, $f4
+0x46 0x00 0xe6 0xcc # CHECK: round.w.s $f27, $f28
+0xa1 0xd6 0xb2 0x6f # CHECK: sb $22, -19857($14)
+0xe2 0x6f 0x49 0xd8 # CHECK: sc $15, 18904($19)
+0xf3 0xaf 0xdf 0xcd # CHECK: scd $15, -8243($sp)
+0xfd 0x4c 0x16 0xcb # CHECK: sd $12, 5835($10)
+0xb3 0xc7 0xae 0x1f # CHECK: sdl $7, -20961($fp)
+0xb5 0x8b 0xb0 0x39 # CHECK: sdr $11, -20423($12)
+0xf5 0xbe 0x77 0x6e # CHECK: sdc1 $f30, 30574($13)
+0xfa 0x54 0x5a 0x75 # CHECK: sdc2 $20, 23157($18)
+0xa5 0xee 0xe5 0xd0 # CHECK: sh $14, -6704($15)
+0x00 0x07 0x3c 0x80 # CHECK: sll $7, $7, 18
+0x00 0x00 0x3c 0x80 # CHECK: sll $7, $zero, 18
+0x01 0x20 0x38 0x04 # CHECK: sllv $7, $zero, $9
+0x01 0x20 0x38 0x04 # CHECK: sllv $7, $zero, $9
+0x01 0x7b 0xb8 0x2a # CHECK: slt $23, $11, $27
+0x29 0x51 0x25 0x11 # CHECK: slti $17, $10, 9489
+0x2f 0x39 0xc3 0x55 # CHECK: sltiu $25, $25, -15531
+0x02 0xab 0xa0 0x2b # CHECK: sltu $20, $21, $11
+0x2f 0x38 0xc3 0x55 # CHECK: sltiu $24, $25, -15531
+0x46 0x20 0xb4 0x04 # CHECK: sqrt.d $f16, $f22
+0x46 0x00 0x08 0x04 # CHECK: sqrt.s $f0, $f1
+0x00 0x11 0x8b 0xc3 # CHECK: sra $17, $17, 15
+0x00 0x17 0x8b 0xc3 # CHECK: sra $17, $23, 15
+0x03 0xb7 0x88 0x07 # CHECK: srav $17, $23, $sp
+0x03 0xb7 0x88 0x07 # CHECK: srav $17, $23, $sp
+0x00 0x02 0x11 0xc2 # CHECK: srl $2, $2, 7
+0x00 0x02 0x11 0xc2 # CHECK: srl $2, $2, 7
+0x00 0x94 0xc8 0x06 # CHECK: srlv $25, $20, $4
+0x00 0x94 0xc8 0x06 # CHECK: srlv $25, $20, $4
+0x00 0x00 0x00 0x40 # CHECK: ssnop
+0x02 0x6c 0xb0 0x22 # CHECK: sub $22, $19, $12
+0x22 0x36 0x0c 0x36 # CHECK: addi $22, $17, 3126
+0x21 0xad 0xe6 0x90 # CHECK: addi $13, $13, -6512
+0x46 0x30 0x14 0x81 # CHECK: sub.d $f18, $f2, $f16
+0x46 0x16 0xb5 0xc1 # CHECK: sub.s $f23, $f22, $f22
+0x02 0xd6 0xe8 0x23 # CHECK: subu $sp, $22, $22
+0xaf 0xbf 0xd8 0x50 # CHECK: sw $ra, -10160($sp)
+0xe7 0x06 0xde 0xef # CHECK: swc1 $f6, -8465($24)
+0xea 0x19 0x61 0x30 # CHECK: swc2 $25, 24880($16)
+0xaa 0x6f 0x35 0x7e # CHECK: swl $15, 13694($19)
+0xb9 0xd1 0x98 0x22 # CHECK: swr $17, -26590($14)
+0x00 0x03 0x00 0x34 # CHECK: teq $zero, $3
+0x00 0xa7 0x9b 0x34 # CHECK: teq $5, $7, 620
+0x06 0xac 0xbb 0xa0 # CHECK: teqi $21, 48032
+0x00 0xea 0x00 0x30 # CHECK: tge $7, $10
+0x00 0xb3 0x55 0x30 # CHECK: tge $5, $19, 340
+0x06 0x28 0x13 0xa1 # CHECK: tgei $17, 5025
+0x07 0xa9 0x90 0x33 # CHECK: tgeiu $sp, 36915
+0x02 0xdc 0x00 0x31 # CHECK: tgeu $22, $gp
+0x02 0x8e 0x5e 0xf1 # CHECK: tgeu $20, $14, 379
+0x42 0x00 0x00 0x08 # CHECK: tlbp
+0x42 0x00 0x00 0x01 # CHECK: tlbr
+0x42 0x00 0x00 0x02 # CHECK: tlbwi
+0x42 0x00 0x00 0x06 # CHECK: tlbwr
+0x01 0xed 0x00 0x32 # CHECK: tlt $15, $13
+0x00 0x53 0x21 0x72 # CHECK: tlt $2, $19, 133
+0x05 0xca 0xad 0xbd # CHECK: tlti $14, 44477
+0x07 0xeb 0xec 0x2c # CHECK: tltiu $ra, 60460
+0x01 0x70 0x00 0x33 # CHECK: tltu $11, $16
+0x02 0x1d 0xfe 0x33 # CHECK: tltu $16, $sp, 1016
+0x00 0xd1 0x00 0x36 # CHECK: tne $6, $17
+0x00 0xe8 0xdd 0x76 # CHECK: tne $7, $8, 885
+0x05 0x8e 0x8c 0x31 # CHECK: tnei $12, 35889
+0x46 0x20 0xbd 0xc9 # CHECK: trunc.l.d $f23, $f23
+0x46 0x00 0xff 0x09 # CHECK: trunc.l.s $f28, $f31
+0x46 0x20 0x75 0x8d # CHECK: trunc.w.d $f22, $f14
+0x46 0x00 0xf7 0x0d # CHECK: trunc.w.s $f28, $f30
+0x00 0x9e 0x90 0x26 # CHECK: xor $18, $4, $fp
diff --git a/test/MC/Disassembler/Mips/mips32/valid-mips32-el.txt b/test/MC/Disassembler/Mips/mips32/valid-mips32-el.txt
new file mode 100644
index 0000000..ea209d1
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32/valid-mips32-el.txt
@@ -0,0 +1,149 @@
+# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux | FileCheck %s
+0x05 0x73 0x20 0x46 # CHECK: abs.d $f12, $f14
+0x85 0x39 0x00 0x46 # CHECK: abs.s $f6, $f7
+0x20 0x48 0xc7 0x00 # CHECK: add $9, $6, $7
+0x00 0x62 0x2e 0x46 # CHECK: add.d $f8, $f12, $f14
+0x40 0x32 0x07 0x46 # CHECK: add.s $f9, $f6, $f7
+0x67 0x45 0xc9 0x20 # CHECK: addi $9, $6, 17767
+0x67 0xc5 0xc9 0x24 # CHECK: addiu $9, $6, -15001
+0x21 0x48 0xc7 0x00 # CHECK: addu $9, $6, $7
+0x24 0x48 0xc7 0x00 # CHECK: and $9, $6, $7
+0x67 0x45 0xc9 0x30 # CHECK: andi $9, $6, 17767
+0x4c 0x01 0x00 0x10 # CHECK: b 1332
+0x4c 0x01 0x00 0x45 # CHECK: bc1f 1332
+0x4c 0x01 0x1c 0x45 # CHECK: bc1f $fcc7, 1332
+0x4c 0x01 0x01 0x45 # CHECK: bc1t 1332
+0x4c 0x01 0x1d 0x45 # CHECK: bc1t $fcc7, 1332
+0x4c 0x01 0x26 0x11 # CHECK: beq $9, $6, 1332
+0x4c 0x01 0xc1 0x04 # CHECK: bgez $6, 1332
+0x4c 0x01 0xd1 0x04 # CHECK: bgezal $6, 1332
+0x4c 0x01 0xc0 0x1c # CHECK: bgtz $6, 1332
+0x4c 0x01 0xc0 0x18 # CHECK: blez $6, 1332
+0x4c 0x01 0x26 0x15 # CHECK: bne $9, $6, 1332
+0x32 0x60 0x2e 0x46 # CHECK: c.eq.d $f12, $f14
+0x32 0x30 0x07 0x46 # CHECK: c.eq.s $f6, $f7
+0x30 0x60 0x2e 0x46 # CHECK: c.f.d $f12, $f14
+0x30 0x30 0x07 0x46 # CHECK: c.f.s $f6, $f7
+0x3e 0x60 0x2e 0x46 # CHECK: c.le.d $f12, $f14
+0x3e 0x30 0x07 0x46 # CHECK: c.le.s $f6, $f7
+0x3c 0x60 0x2e 0x46 # CHECK: c.lt.d $f12, $f14
+0x3c 0x30 0x07 0x46 # CHECK: c.lt.s $f6, $f7
+0x3d 0x60 0x2e 0x46 # CHECK: c.nge.d $f12, $f14
+0x3d 0x30 0x07 0x46 # CHECK: c.nge.s $f6, $f7
+0x3b 0x60 0x2e 0x46 # CHECK: c.ngl.d $f12, $f14
+0x3b 0x30 0x07 0x46 # CHECK: c.ngl.s $f6, $f7
+0x39 0x60 0x2e 0x46 # CHECK: c.ngle.d $f12, $f14
+0x39 0x30 0x07 0x46 # CHECK: c.ngle.s $f6, $f7
+0x3f 0x60 0x2e 0x46 # CHECK: c.ngt.d $f12, $f14
+0x3f 0x30 0x07 0x46 # CHECK: c.ngt.s $f6, $f7
+0x36 0x60 0x2e 0x46 # CHECK: c.ole.d $f12, $f14
+0x36 0x30 0x07 0x46 # CHECK: c.ole.s $f6, $f7
+0x34 0x60 0x2e 0x46 # CHECK: c.olt.d $f12, $f14
+0x34 0x30 0x07 0x46 # CHECK: c.olt.s $f6, $f7
+0x3a 0x60 0x2e 0x46 # CHECK: c.seq.d $f12, $f14
+0x3a 0x30 0x07 0x46 # CHECK: c.seq.s $f6, $f7
+0x38 0x60 0x2e 0x46 # CHECK: c.sf.d $f12, $f14
+0x38 0x30 0x07 0x46 # CHECK: c.sf.s $f6, $f7
+0x33 0x60 0x2e 0x46 # CHECK: c.ueq.d $f12, $f14
+0x33 0xe0 0x12 0x46 # CHECK: c.ueq.s $f28, $f18
+0x37 0x60 0x2e 0x46 # CHECK: c.ule.d $f12, $f14
+0x37 0x30 0x07 0x46 # CHECK: c.ule.s $f6, $f7
+0x35 0x60 0x2e 0x46 # CHECK: c.ult.d $f12, $f14
+0x35 0x30 0x07 0x46 # CHECK: c.ult.s $f6, $f7
+0x31 0x60 0x2e 0x46 # CHECK: c.un.d $f12, $f14
+0x31 0x30 0x07 0x46 # CHECK: c.un.s $f6, $f7
+0x0e 0x73 0x20 0x46 # CHECK: ceil.w.d $f12, $f14
+0x8e 0x39 0x00 0x46 # CHECK: ceil.w.s $f6, $f7
+0x00 0x38 0x46 0x44 # CHECK: cfc1 $6, $7
+0x21 0x30 0xe6 0x70 # CHECK: clo $6, $7
+0x20 0x30 0xe6 0x70 # CHECK: clz $6, $7
+0x00 0x38 0xc6 0x44 # CHECK: ctc1 $6, $7
+0xa1 0x39 0x00 0x46 # CHECK: cvt.d.s $f6, $f7
+0x21 0x73 0x80 0x46 # CHECK: cvt.d.w $f12, $f14
+0x20 0x73 0x20 0x46 # CHECK: cvt.s.d $f12, $f14
+0xa0 0x39 0x80 0x46 # CHECK: cvt.s.w $f6, $f7
+0x24 0x73 0x20 0x46 # CHECK: cvt.w.d $f12, $f14
+0xa4 0x39 0x00 0x46 # CHECK: cvt.w.s $f6, $f7
+0x0f 0x73 0x20 0x46 # CHECK: floor.w.d $f12, $f14
+0x8f 0x39 0x00 0x46 # CHECK: floor.w.s $f6, $f7
+0x4c 0x01 0x00 0x08 # CHECK: j 1328
+0x4c 0x01 0x00 0x0c # CHECK: jal 1328
+0x4c 0x01 0x00 0x74 # CHECK: jalx 1328
+0x09 0xf8 0xe0 0x00 # CHECK: jalr $7
+0x09 0xfc 0x80 0x00 # CHECK: jalr.hb $4
+0x09 0x24 0xa0 0x00 # CHECK: jalr.hb $4, $5
+0x08 0x00 0xe0 0x00 # CHECK: jr $7
+0xc6 0x23 0xa4 0x80 # CHECK: lb $4, 9158($5)
+0x06 0x00 0xa4 0x90 # CHECK: lbu $4, 6($5)
+0xc6 0x23 0xe9 0xd4 # CHECK: ldc1 $f9, 9158($7)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0xc6 0x23 0xe9 0xc0 # CHECK: ll $9, 9158($7)
+0x67 0x45 0x06 0x3c # CHECK: lui $6, 17767
+0x18 0x00 0xa4 0x8c # CHECK: lw $4, 24($5)
+0xc6 0x23 0xe9 0xc4 # CHECK: lwc1 $f9, 9158($7)
+0x03 0x00 0x82 0x88 # CHECK: lwl $2, 3($4)
+0x10 0x00 0xa3 0x98 # CHECK: lwr $3, 16($5)
+0x00 0x00 0xc7 0x70 # CHECK: madd $6, $7
+0x01 0x00 0xc7 0x70 # CHECK: maddu $6, $7
+0x00 0x38 0x06 0x44 # CHECK: mfc1 $6, $f7
+0x10 0x28 0x00 0x00 # CHECK: mfhi $5
+0x12 0x28 0x00 0x00 # CHECK: mflo $5
+0x86 0x41 0x20 0x46 # CHECK: mov.d $f6, $f8
+0x86 0x39 0x00 0x46 # CHECK: mov.s $f6, $f7
+0x04 0x00 0xc7 0x70 # CHECK: msub $6, $7
+0x05 0x00 0xc7 0x70 # CHECK: msubu $6, $7
+0x00 0x38 0x86 0x44 # CHECK: mtc1 $6, $f7
+0x11 0x00 0xe0 0x00 # CHECK: mthi $7
+0x13 0x00 0xe0 0x00 # CHECK: mtlo $7
+0x02 0x62 0x2e 0x46 # CHECK: mul.d $f8, $f12, $f14
+0x42 0x32 0x07 0x46 # CHECK: mul.s $f9, $f6, $f7
+0x02 0x48 0xc7 0x70 # CHECK: mul $9, $6, $7
+0x18 0x00 0x65 0x00 # CHECK: mult $3, $5
+0x19 0x00 0x65 0x00 # CHECK: multu $3, $5
+0x07 0x73 0x20 0x46 # CHECK: neg.d $f12, $f14
+0x87 0x39 0x00 0x46 # CHECK: neg.s $f6, $f7
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x48 0xc7 0x00 # CHECK: nor $9, $6, $7
+0x25 0x18 0x65 0x00 # CHECK: or $3, $3, $5
+0x67 0x45 0xc9 0x34 # CHECK: ori $9, $6, 17767
+0x0c 0x73 0x20 0x46 # CHECK: round.w.d $f12, $f14
+0x8c 0x39 0x00 0x46 # CHECK: round.w.s $f6, $f7
+0xc6 0x23 0xa4 0xa0 # CHECK: sb $4, 9158($5)
+0x06 0x00 0xa4 0xa0 # CHECK: sb $4, 6($5)
+0xc6 0x23 0xe9 0xe0 # CHECK: sc $9, 9158($7)
+0xc6 0x23 0xe9 0xf4 # CHECK: sdc1 $f9, 9158($7)
+0xc6 0x23 0xa4 0xa4 # CHECK: sh $4, 9158($5)
+0xc0 0x21 0x03 0x00 # CHECK: sll $4, $3, 7
+0x04 0x10 0xa3 0x00 # CHECK: sllv $2, $3, $5
+0x2a 0x18 0x65 0x00 # CHECK: slt $3, $3, $5
+0x67 0x00 0x63 0x28 # CHECK: slti $3, $3, 103
+0x67 0x00 0x63 0x2c # CHECK: sltiu $3, $3, 103
+0x2b 0x18 0x65 0x00 # CHECK: sltu $3, $3, $5
+0x04 0x73 0x20 0x46 # CHECK: sqrt.d $f12, $f14
+0x84 0x39 0x00 0x46 # CHECK: sqrt.s $f6, $f7
+0xc3 0x21 0x03 0x00 # CHECK: sra $4, $3, 7
+0x07 0x10 0xa3 0x00 # CHECK: srav $2, $3, $5
+0xc2 0x21 0x03 0x00 # CHECK: srl $4, $3, 7
+0x06 0x10 0xa3 0x00 # CHECK: srlv $2, $3, $5
+0x01 0x62 0x2e 0x46 # CHECK: sub.d $f8, $f12, $f14
+0x41 0x32 0x07 0x46 # CHECK: sub.s $f9, $f6, $f7
+0x22 0x48 0xc7 0x00 # CHECK: sub $9, $6, $7
+0x23 0x20 0x65 0x00 # CHECK: subu $4, $3, $5
+0x18 0x00 0xa4 0xac # CHECK: sw $4, 24($5)
+0xc6 0x23 0xe9 0xe4 # CHECK: swc1 $f9, 9158($7)
+0x10 0x00 0xa4 0xa8 # CHECK: swl $4, 16($5)
+0x10 0x00 0xe6 0xb8 # CHECK: swr $6, 16($7)
+0xcf 0x01 0x00 0x00 # CHECK: sync 7
+0x0d 0x73 0x20 0x46 # CHECK: trunc.w.d $f12, $f14
+0x8d 0x39 0x00 0x46 # CHECK: trunc.w.s $f6, $f7
+0x26 0x18 0x65 0x00 # CHECK: xor $3, $3, $5
+0x67 0x45 0xc9 0x38 # CHECK: xori $9, $6, 17767
+0x3b 0xe8 0x05 0x7c # CHECK: .set push
+                    # CHECK: .set mips32r2
+                    # CHECK: rdhwr $5, $29
+                    # CHECK: .set pop
+0x02 0x00 0x61 0xbc # CHECK: cache 1, 2($3)
+0x04 0x00 0x43 0xcc # CHECK: pref 3, 4($2)
+0xc6 0x23 0xe9 0xe8 # CHECK: swc2 $9, 9158($7)
+0xca 0x23 0xc8 0xc8 # CHECK: lwc2 $8, 9162($6)
diff --git a/test/MC/Disassembler/Mips/mips32/valid-mips32.txt b/test/MC/Disassembler/Mips/mips32/valid-mips32.txt
new file mode 100644
index 0000000..45b672b
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32/valid-mips32.txt
@@ -0,0 +1,149 @@
+# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux | FileCheck %s
+0x46 0x20 0x73 0x05 # CHECK: abs.d $f12, $f14
+0x46 0x00 0x39 0x85 # CHECK: abs.s $f6, $f7
+0x00 0xc7 0x48 0x20 # CHECK: add $9, $6, $7
+0x46 0x2e 0x62 0x00 # CHECK: add.d $f8, $f12, $f14
+0x46 0x07 0x32 0x40 # CHECK: add.s $f9, $f6, $f7
+0x20 0xc9 0x45 0x67 # CHECK: addi $9, $6, 17767
+0x24 0xc9 0xc5 0x67 # CHECK: addiu $9, $6, -15001
+0x00 0xc7 0x48 0x21 # CHECK: addu $9, $6, $7
+0x00 0xc7 0x48 0x24 # CHECK: and $9, $6, $7
+0x30 0xc9 0x45 0x67 # CHECK: andi $9, $6, 17767
+0x10 0x00 0x01 0x4c # CHECK: b 1332
+0x45 0x00 0x01 0x4c # CHECK: bc1f 1332
+0x45 0x1c 0x01 0x4c # CHECK: bc1f $fcc7, 1332
+0x45 0x01 0x01 0x4c # CHECK: bc1t 1332
+0x45 0x1d 0x01 0x4c # CHECK: bc1t $fcc7, 1332
+0x11 0x26 0x01 0x4c # CHECK: beq $9, $6, 1332
+0x04 0xc1 0x01 0x4c # CHECK: bgez $6, 1332
+0x04 0xd1 0x01 0x4c # CHECK: bgezal $6, 1332
+0x1c 0xc0 0x01 0x4c # CHECK: bgtz $6, 1332
+0x18 0xc0 0x01 0x4c # CHECK: blez $6, 1332
+0x15 0x26 0x01 0x4c # CHECK: bne $9, $6, 1332
+0x46 0x2e 0x60 0x32 # CHECK: c.eq.d $f12, $f14
+0x46 0x07 0x30 0x32 # CHECK: c.eq.s $f6, $f7
+0x46 0x2e 0x60 0x30 # CHECK: c.f.d $f12, $f14
+0x46 0x07 0x30 0x30 # CHECK: c.f.s $f6, $f7
+0x46 0x2e 0x60 0x3e # CHECK: c.le.d $f12, $f14
+0x46 0x07 0x30 0x3e # CHECK: c.le.s $f6, $f7
+0x46 0x2e 0x60 0x3c # CHECK: c.lt.d $f12, $f14
+0x46 0x07 0x30 0x3c # CHECK: c.lt.s $f6, $f7
+0x46 0x2e 0x60 0x3d # CHECK: c.nge.d $f12, $f14
+0x46 0x07 0x30 0x3d # CHECK: c.nge.s $f6, $f7
+0x46 0x2e 0x60 0x3b # CHECK: c.ngl.d $f12, $f14
+0x46 0x07 0x30 0x3b # CHECK: c.ngl.s $f6, $f7
+0x46 0x2e 0x60 0x39 # CHECK: c.ngle.d $f12, $f14
+0x46 0x07 0x30 0x39 # CHECK: c.ngle.s $f6, $f7
+0x46 0x2e 0x60 0x3f # CHECK: c.ngt.d $f12, $f14
+0x46 0x07 0x30 0x3f # CHECK: c.ngt.s $f6, $f7
+0x46 0x2e 0x60 0x36 # CHECK: c.ole.d $f12, $f14
+0x46 0x07 0x30 0x36 # CHECK: c.ole.s $f6, $f7
+0x46 0x2e 0x60 0x34 # CHECK: c.olt.d $f12, $f14
+0x46 0x07 0x30 0x34 # CHECK: c.olt.s $f6, $f7
+0x46 0x2e 0x60 0x3a # CHECK: c.seq.d $f12, $f14
+0x46 0x07 0x30 0x3a # CHECK: c.seq.s $f6, $f7
+0x46 0x2e 0x60 0x38 # CHECK: c.sf.d $f12, $f14
+0x46 0x07 0x30 0x38 # CHECK: c.sf.s $f6, $f7
+0x46 0x2e 0x60 0x33 # CHECK: c.ueq.d $f12, $f14
+0x46 0x12 0xe0 0x33 # CHECK: c.ueq.s $f28, $f18
+0x46 0x2e 0x60 0x37 # CHECK: c.ule.d $f12, $f14
+0x46 0x07 0x30 0x37 # CHECK: c.ule.s $f6, $f7
+0x46 0x2e 0x60 0x35 # CHECK: c.ult.d $f12, $f14
+0x46 0x07 0x30 0x35 # CHECK: c.ult.s $f6, $f7
+0x46 0x2e 0x60 0x31 # CHECK: c.un.d $f12, $f14
+0x46 0x07 0x30 0x31 # CHECK: c.un.s $f6, $f7
+0x46 0x20 0x73 0x0e # CHECK: ceil.w.d $f12, $f14
+0x46 0x00 0x39 0x8e # CHECK: ceil.w.s $f6, $f7
+0x44 0x46 0x38 0x00 # CHECK: cfc1 $6, $7
+0x70 0xe6 0x30 0x21 # CHECK: clo $6, $7
+0x70 0xe6 0x30 0x20 # CHECK: clz $6, $7
+0x44 0xc6 0x38 0x00 # CHECK: ctc1 $6, $7
+0x46 0x00 0x39 0xa1 # CHECK: cvt.d.s $f6, $f7
+0x46 0x80 0x73 0x21 # CHECK: cvt.d.w $f12, $f14
+0x46 0x20 0x73 0x20 # CHECK: cvt.s.d $f12, $f14
+0x46 0x80 0x39 0xa0 # CHECK: cvt.s.w $f6, $f7
+0x46 0x20 0x73 0x24 # CHECK: cvt.w.d $f12, $f14
+0x46 0x00 0x39 0xa4 # CHECK: cvt.w.s $f6, $f7
+0x46 0x20 0x73 0x0f # CHECK: floor.w.d $f12, $f14
+0x46 0x00 0x39 0x8f # CHECK: floor.w.s $f6, $f7
+0x08 0x00 0x01 0x4c # CHECK: j 1328
+0x0c 0x00 0x01 0x4c # CHECK: jal 1328
+0x74 0x00 0x01 0x4c # CHECK: jalx 1328
+0x00 0xe0 0xf8 0x09 # CHECK: jalr $7
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x00 0xe0 0x00 0x08 # CHECK: jr $7
+0x80 0xa4 0x23 0xc6 # CHECK: lb $4, 9158($5)
+0x90 0xa4 0x00 0x06 # CHECK: lbu $4, 6($5)
+0xd4 0xe9 0x23 0xc6 # CHECK: ldc1 $f9, 9158($7)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0xc0 0xe9 0x23 0xc6 # CHECK: ll $9, 9158($7)
+0x3c 0x06 0x45 0x67 # CHECK: lui $6, 17767
+0x8c 0xa4 0x00 0x18 # CHECK: lw $4, 24($5)
+0xc4 0xe9 0x23 0xc6 # CHECK: lwc1 $f9, 9158($7)
+0x88 0x82 0x00 0x03 # CHECK: lwl $2, 3($4)
+0x98 0xa3 0x00 0x10 # CHECK: lwr $3, 16($5)
+0x70 0xc7 0x00 0x00 # CHECK: madd $6, $7
+0x70 0xc7 0x00 0x01 # CHECK: maddu $6, $7
+0x44 0x06 0x38 0x00 # CHECK: mfc1 $6, $f7
+0x00 0x00 0x28 0x10 # CHECK: mfhi $5
+0x00 0x00 0x28 0x12 # CHECK: mflo $5
+0x46 0x20 0x41 0x86 # CHECK: mov.d $f6, $f8
+0x46 0x00 0x39 0x86 # CHECK: mov.s $f6, $f7
+0x70 0xc7 0x00 0x04 # CHECK: msub $6, $7
+0x70 0xc7 0x00 0x05 # CHECK: msubu $6, $7
+0x44 0x86 0x38 0x00 # CHECK: mtc1 $6, $f7
+0x00 0xe0 0x00 0x11 # CHECK: mthi $7
+0x00 0xe0 0x00 0x13 # CHECK: mtlo $7
+0x46 0x2e 0x62 0x02 # CHECK: mul.d $f8, $f12, $f14
+0x46 0x07 0x32 0x42 # CHECK: mul.s $f9, $f6, $f7
+0x70 0xc7 0x48 0x02 # CHECK: mul $9, $6, $7
+0x00 0x65 0x00 0x18 # CHECK: mult $3, $5
+0x00 0x65 0x00 0x19 # CHECK: multu $3, $5
+0x46 0x20 0x73 0x07 # CHECK: neg.d $f12, $f14
+0x46 0x00 0x39 0x87 # CHECK: neg.s $f6, $f7
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0xc7 0x48 0x27 # CHECK: nor $9, $6, $7
+0x00 0x65 0x18 0x25 # CHECK: or $3, $3, $5
+0x34 0xc9 0x45 0x67 # CHECK: ori $9, $6, 17767
+0x46 0x20 0x73 0x0c # CHECK: round.w.d $f12, $f14
+0x46 0x00 0x39 0x8c # CHECK: round.w.s $f6, $f7
+0xa0 0xa4 0x23 0xc6 # CHECK: sb $4, 9158($5)
+0xa0 0xa4 0x00 0x06 # CHECK: sb $4, 6($5)
+0xe0 0xe9 0x23 0xc6 # CHECK: sc $9, 9158($7)
+0xf4 0xe9 0x23 0xc6 # CHECK: sdc1 $f9, 9158($7)
+0xa4 0xa4 0x23 0xc6 # CHECK: sh $4, 9158($5)
+0x00 0x03 0x21 0xc0 # CHECK: sll $4, $3, 7
+0x00 0xa3 0x10 0x04 # CHECK: sllv $2, $3, $5
+0x00 0x65 0x18 0x2a # CHECK: slt $3, $3, $5
+0x28 0x63 0x00 0x67 # CHECK: slti $3, $3, 103
+0x2c 0x63 0x00 0x67 # CHECK: sltiu $3, $3, 103
+0x00 0x65 0x18 0x2b # CHECK: sltu $3, $3, $5
+0x46 0x20 0x73 0x04 # CHECK: sqrt.d $f12, $f14
+0x46 0x00 0x39 0x84 # CHECK: sqrt.s $f6, $f7
+0x00 0x03 0x21 0xc3 # CHECK: sra $4, $3, 7
+0x00 0xa3 0x10 0x07 # CHECK: srav $2, $3, $5
+0x00 0x03 0x21 0xc2 # CHECK: srl $4, $3, 7
+0x00 0xa3 0x10 0x06 # CHECK: srlv $2, $3, $5
+0x46 0x2e 0x62 0x01 # CHECK: sub.d $f8, $f12, $f14
+0x46 0x07 0x32 0x41 # CHECK: sub.s $f9, $f6, $f7
+0x00 0xc7 0x48 0x22 # CHECK: sub $9, $6, $7
+0x00 0x65 0x20 0x23 # CHECK: subu $4, $3, $5
+0xac 0xa4 0x00 0x18 # CHECK: sw $4, 24($5)
+0xe4 0xe9 0x23 0xc6 # CHECK: swc1 $f9, 9158($7)
+0xa8 0xa4 0x00 0x10 # CHECK: swl $4, 16($5)
+0xb8 0xe6 0x00 0x10 # CHECK: swr $6, 16($7)
+0x00 0x00 0x01 0xcf # CHECK: sync 7
+0x46 0x20 0x73 0x0d # CHECK: trunc.w.d $f12, $f14
+0x46 0x00 0x39 0x8d # CHECK: trunc.w.s $f6, $f7
+0x00 0x65 0x18 0x26 # CHECK: xor $3, $3, $5
+0x38 0xc9 0x45 0x67 # CHECK: xori $9, $6, 17767
+0x7c 0x05 0xe8 0x3b # CHECK: .set push
+                    # CHECK: .set mips32r2
+                    # CHECK: rdhwr $5, $29
+                    # CHECK: .set pop
+0xbc 0x61 0x00 0x02 # CHECK: cache 1, 2($3)
+0xcc 0x43 0x00 0x04 # CHECK: pref 3, 4($2)
+0xe8 0xe9 0x23 0xc6 # CHECK: swc2 $9, 9158($7)
+0xc8 0xc8 0x23 0xca # CHECK: lwc2 $8, 9162($6)
diff --git a/test/MC/Disassembler/Mips/mips32/valid-xfail-mips32.txt b/test/MC/Disassembler/Mips/mips32/valid-xfail-mips32.txt
new file mode 100644
index 0000000..f614271
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32/valid-xfail-mips32.txt
@@ -0,0 +1,30 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -disassemble | FileCheck %s
+# XFAIL: *
+0x46 0x2f 0x79 0x32 # CHECK: c.eq.d $fcc1, $f15, $f15
+0x46 0x11 0xc5 0x32 # CHECK: c.eq.s $fcc5, $f24, $f17
+0x46 0x35 0x5c 0x30 # CHECK: c.f.d $fcc4, $f11, $f21
+0x46 0x07 0xf4 0x30 # CHECK: c.f.s $fcc4, $f30, $f7
+0x46 0x21 0x94 0x3e # CHECK: c.le.d $fcc4, $f18, $f1
+0x46 0x04 0xc6 0x3e # CHECK: c.le.s $fcc6, $f24, $f4
+0x46 0x23 0x4b 0x3c # CHECK: c.lt.d $fcc3, $f9, $f3
+0x46 0x0e 0x8a 0x3c # CHECK: c.lt.s $fcc2, $f17, $f14
+0x46 0x30 0xad 0x3d # CHECK: c.nge.d $fcc5, $f21, $f16
+0x46 0x08 0x5b 0x3d # CHECK: c.nge.s $fcc3, $f11, $f8
+0x46 0x17 0xfa 0x3b # CHECK: c.ngl.s $fcc2, $f31, $f23
+0x46 0x17 0x92 0x39 # CHECK: c.ngle.s $fcc2, $f18, $f23
+0x46 0x27 0xc4 0x3f # CHECK: c.ngt.d $fcc4, $f24, $f7
+0x46 0x0d 0x45 0x3f # CHECK: c.ngt.s $fcc5, $f8, $f13
+0x46 0x3f 0x82 0x36 # CHECK: c.ole.d $fcc2, $f16, $f31
+0x46 0x14 0x3b 0x36 # CHECK: c.ole.s $fcc3, $f7, $f20
+0x46 0x3c 0x9c 0x34 # CHECK: c.olt.d $fcc4, $f19, $f28
+0x46 0x07 0xa6 0x34 # CHECK: c.olt.s $fcc6, $f20, $f7
+0x46 0x27 0xfc 0x3a # CHECK: c.seq.d $fcc4, $f31, $f7
+0x46 0x19 0x0f 0x3a # CHECK: c.seq.s $fcc7, $f1, $f25
+0x46 0x39 0x6c 0x33 # CHECK: c.ueq.d $fcc4, $f13, $f25
+0x46 0x1e 0x1e 0x33 # CHECK: c.ueq.s $fcc6, $f3, $f30
+0x46 0x32 0xcf 0x37 # CHECK: c.ule.d $fcc7, $f25, $f18
+0x46 0x1e 0xaf 0x37 # CHECK: c.ule.s $fcc7, $f21, $f30
+0x46 0x31 0x36 0x35 # CHECK: c.ult.d $fcc6, $f6, $f17
+0x46 0x0a 0xc7 0x35 # CHECK: c.ult.s $fcc7, $f24, $f10
+0x46 0x38 0xbe 0x31 # CHECK: c.un.d $fcc6, $f23, $f24
+0x46 0x04 0xf1 0x31 # CHECK: c.un.s $fcc1, $f30, $f4
diff --git a/test/MC/Disassembler/Mips/mips32r2.txt b/test/MC/Disassembler/Mips/mips32r2.txt
index 299f6f0..354ef74 100644
--- a/test/MC/Disassembler/Mips/mips32r2.txt
+++ b/test/MC/Disassembler/Mips/mips32r2.txt
@@ -448,3 +448,6 @@
 
 # CHECK: xori  $9,  $6, 17767
 0x38 0xc9 0x45 0x67
+
+# CHECK: synci -6137($fp)
+0x07 0xdf 0xe8 0x07
diff --git a/test/MC/Disassembler/Mips/mips32r2/valid-mips32r2-le.txt b/test/MC/Disassembler/Mips/mips32r2/valid-mips32r2-le.txt
new file mode 100644
index 0000000..d0eb13c
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r2/valid-mips32r2-le.txt
@@ -0,0 +1,172 @@
+# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux -mcpu=mips32r2 | FileCheck %s
+# Try a mips64* triple to confirm that mips* vs mips64* triples no longer have
+# an effect on the disassembler behaviour.
+# RUN: llvm-mc --disassemble %s -triple=mips64el-unknown-linux -mcpu=mips32r2 | FileCheck %s
+0x05 0x73 0x20 0x46 # CHECK: abs.d $f12, $f14
+0x85 0x39 0x00 0x46 # CHECK: abs.s $f6, $f7
+0x20 0x48 0xc7 0x00 # CHECK: add $9, $6, $7
+0x00 0x62 0x2e 0x46 # CHECK: add.d $f8, $f12, $f14
+0x40 0x32 0x07 0x46 # CHECK: add.s $f9, $f6, $f7
+0x67 0x45 0xc9 0x20 # CHECK: addi $9, $6, 17767
+0x67 0xc5 0xc9 0x24 # CHECK: addiu $9, $6, -15001
+0x21 0x48 0xc7 0x00 # CHECK: addu $9, $6, $7
+0x24 0x48 0xc7 0x00 # CHECK: and $9, $6, $7
+0x67 0x45 0xc9 0x30 # CHECK: andi $9, $6, 17767
+0x4c 0x01 0x00 0x10 # CHECK: b 1332
+0x4c 0x01 0x00 0x45 # CHECK: bc1f 1332
+0x4c 0x01 0x1c 0x45 # CHECK: bc1f $fcc7, 1332
+0x4c 0x01 0x01 0x45 # CHECK: bc1t 1332
+0x4c 0x01 0x1d 0x45 # CHECK: bc1t $fcc7, 1332
+0x4c 0x01 0x26 0x11 # CHECK: beq $9, $6, 1332
+0x4c 0x01 0xc1 0x04 # CHECK: bgez $6, 1332
+0x4c 0x01 0xd1 0x04 # CHECK: bgezal $6, 1332
+0x4c 0x01 0xc0 0x1c # CHECK: bgtz $6, 1332
+0x4c 0x01 0xc0 0x18 # CHECK: blez $6, 1332
+0x4c 0x01 0x26 0x15 # CHECK: bne $9, $6, 1332
+0x32 0x60 0x2e 0x46 # CHECK: c.eq.d $f12, $f14
+0x32 0x30 0x07 0x46 # CHECK: c.eq.s $f6, $f7
+0x30 0x60 0x2e 0x46 # CHECK: c.f.d $f12, $f14
+0x30 0x30 0x07 0x46 # CHECK: c.f.s $f6, $f7
+0x3e 0x60 0x2e 0x46 # CHECK: c.le.d $f12, $f14
+0x3e 0x30 0x07 0x46 # CHECK: c.le.s $f6, $f7
+0x3c 0x60 0x2e 0x46 # CHECK: c.lt.d $f12, $f14
+0x3c 0x30 0x07 0x46 # CHECK: c.lt.s $f6, $f7
+0x3d 0x60 0x2e 0x46 # CHECK: c.nge.d $f12, $f14
+0x3d 0x30 0x07 0x46 # CHECK: c.nge.s $f6, $f7
+0x3b 0x60 0x2e 0x46 # CHECK: c.ngl.d $f12, $f14
+0x3b 0x30 0x07 0x46 # CHECK: c.ngl.s $f6, $f7
+0x39 0x60 0x2e 0x46 # CHECK: c.ngle.d $f12, $f14
+0x39 0x30 0x07 0x46 # CHECK: c.ngle.s $f6, $f7
+0x3f 0x60 0x2e 0x46 # CHECK: c.ngt.d $f12, $f14
+0x3f 0x30 0x07 0x46 # CHECK: c.ngt.s $f6, $f7
+0x36 0x60 0x2e 0x46 # CHECK: c.ole.d $f12, $f14
+0x36 0x30 0x07 0x46 # CHECK: c.ole.s $f6, $f7
+0x34 0x60 0x2e 0x46 # CHECK: c.olt.d $f12, $f14
+0x34 0x30 0x07 0x46 # CHECK: c.olt.s $f6, $f7
+0x3a 0x60 0x2e 0x46 # CHECK: c.seq.d $f12, $f14
+0x3a 0x30 0x07 0x46 # CHECK: c.seq.s $f6, $f7
+0x38 0x60 0x2e 0x46 # CHECK: c.sf.d $f12, $f14
+0x38 0x30 0x07 0x46 # CHECK: c.sf.s $f6, $f7
+0x33 0x60 0x2e 0x46 # CHECK: c.ueq.d $f12, $f14
+0x33 0xe0 0x12 0x46 # CHECK: c.ueq.s $f28, $f18
+0x37 0x60 0x2e 0x46 # CHECK: c.ule.d $f12, $f14
+0x37 0x30 0x07 0x46 # CHECK: c.ule.s $f6, $f7
+0x35 0x60 0x2e 0x46 # CHECK: c.ult.d $f12, $f14
+0x35 0x30 0x07 0x46 # CHECK: c.ult.s $f6, $f7
+0x31 0x60 0x2e 0x46 # CHECK: c.un.d $f12, $f14
+0x31 0x30 0x07 0x46 # CHECK: c.un.s $f6, $f7
+0x0e 0x73 0x20 0x46 # CHECK: ceil.w.d $f12, $f14
+0x8e 0x39 0x00 0x46 # CHECK: ceil.w.s $f6, $f7
+0x00 0x38 0x46 0x44 # CHECK: cfc1 $6, $7
+0x21 0x30 0xe6 0x70 # CHECK: clo $6, $7
+0x20 0x30 0xe6 0x70 # CHECK: clz $6, $7
+0x00 0x38 0xc6 0x44 # CHECK: ctc1 $6, $7
+0xa1 0x39 0x00 0x46 # CHECK: cvt.d.s $f6, $f7
+0x21 0x73 0x80 0x46 # CHECK: cvt.d.w $f12, $f14
+0x25 0x73 0x20 0x46 # CHECK: cvt.l.d $f12, $f14
+0xa5 0x39 0x00 0x46 # CHECK: cvt.l.s $f6, $f7
+0x20 0x73 0x20 0x46 # CHECK: cvt.s.d $f12, $f14
+0xa0 0x39 0x80 0x46 # CHECK: cvt.s.w $f6, $f7
+0x24 0x73 0x20 0x46 # CHECK: cvt.w.d $f12, $f14
+0xa4 0x39 0x00 0x46 # CHECK: cvt.w.s $f6, $f7
+0x00 0x60 0x7e 0x41 # CHECK: di $fp
+0x00 0x60 0x60 0x41 # CHECK: di
+0x20 0x60 0x6e 0x41 # CHECK: ei $14
+0x20 0x60 0x60 0x41 # CHECK: ei
+0x0f 0x73 0x20 0x46 # CHECK: floor.w.d $f12, $f14
+0x8f 0x39 0x00 0x46 # CHECK: floor.w.s $f6, $f7
+0x84 0x61 0x33 0x7d # CHECK: ins $19, $9, 6, 7
+0x4c 0x01 0x00 0x08 # CHECK: j 1328
+0x4c 0x01 0x00 0x0c # CHECK: jal 1328
+0x4c 0x01 0x00 0x74 # CHECK: jalx 1328
+0x09 0xf8 0xe0 0x00 # CHECK: jalr $7
+0x09 0xfc 0x80 0x00 # CHECK: jalr.hb $4
+0x09 0x24 0xa0 0x00 # CHECK: jalr.hb $4, $5
+0x08 0x00 0xe0 0x00 # CHECK: jr $7
+0xc6 0x23 0xa4 0x80 # CHECK: lb $4, 9158($5)
+0x06 0x00 0xa4 0x90 # CHECK: lbu $4, 6($5)
+0xc6 0x23 0xe9 0xd4 # CHECK: ldc1 $f9, 9158($7)
+0x01 0x02 0xf7 0x4d # CHECK: ldxc1 $f8, $23($15)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0xc6 0x23 0xe9 0xc0 # CHECK: ll $9, 9158($7)
+0x67 0x45 0x06 0x3c # CHECK: lui $6, 17767
+0x05 0x00 0xa6 0x4c # CHECK: luxc1 $f0, $6($5)
+0x18 0x00 0xa4 0x8c # CHECK: lw $4, 24($5)
+0xc6 0x23 0xe9 0xc4 # CHECK: lwc1 $f9, 9158($7)
+0x03 0x00 0x82 0x88 # CHECK: lwl $2, 3($4)
+0x10 0x00 0xa3 0x98 # CHECK: lwr $3, 16($5)
+0x00 0x05 0xcc 0x4d # CHECK: lwxc1 $f20, $12($14)
+0x00 0x00 0xc7 0x70 # CHECK: madd $6, $7
+0xa1 0xd4 0x94 0x4e # CHECK: madd.d $f18, $f20, $f26, $f20
+0x60 0x98 0xf9 0x4f # CHECK: madd.s $f1, $f31, $f19, $f25
+0x01 0x00 0xc7 0x70 # CHECK: maddu $6, $7
+0x00 0x38 0x06 0x44 # CHECK: mfc1 $6, $f7
+0x10 0x28 0x00 0x00 # CHECK: mfhi $5
+0x00 0xc0 0x7e 0x44 # CHECK: mfhc1 $fp, $f24
+0x12 0x28 0x00 0x00 # CHECK: mflo $5
+0x86 0x41 0x20 0x46 # CHECK: mov.d $f6, $f8
+0x86 0x39 0x00 0x46 # CHECK: mov.s $f6, $f7
+0x04 0x00 0xc7 0x70 # CHECK: msub $6, $7
+0xa9 0xf2 0x52 0x4c # CHECK: msub.d $f10, $f2, $f30, $f18
+0x28 0x53 0x70 0x4e # CHECK: msub.s $f12, $f19, $f10, $f16
+0x05 0x00 0xc7 0x70 # CHECK: msubu $6, $7
+0x00 0x38 0x86 0x44 # CHECK: mtc1 $6, $f7
+0x11 0x00 0xe0 0x00 # CHECK: mthi $7
+0x00 0x80 0xe0 0x44 # CHECK: mthc1 $zero, $f16
+0x13 0x00 0xe0 0x00 # CHECK: mtlo $7
+0x02 0x62 0x2e 0x46 # CHECK: mul.d $f8, $f12, $f14
+0x42 0x32 0x07 0x46 # CHECK: mul.s $f9, $f6, $f7
+0x02 0x48 0xc7 0x70 # CHECK: mul $9, $6, $7
+0x18 0x00 0x65 0x00 # CHECK: mult $3, $5
+0x19 0x00 0x65 0x00 # CHECK: multu $3, $5
+0x07 0x73 0x20 0x46 # CHECK: neg.d $f12, $f14
+0x87 0x39 0x00 0x46 # CHECK: neg.s $f6, $f7
+0xb1 0x74 0x54 0x4d # CHECK: nmadd.d $f18, $f10, $f14, $f20
+0x30 0xc8 0xac 0x4c # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x48 0xc7 0x00 # CHECK: nor $9, $6, $7
+0xb9 0x87 0x1e 0x4d # CHECK: nmsub.d $f30, $f8, $f16, $f30
+0x78 0x98 0x04 0x4f # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x25 0x18 0x65 0x00 # CHECK: or $3, $3, $5
+0x67 0x45 0xc9 0x34 # CHECK: ori $9, $6, 17767
+0xc2 0x49 0x26 0x00 # CHECK: rotr $9, $6, 7
+0x46 0x48 0xe6 0x00 # CHECK: rotrv $9, $6, $7
+0x0c 0x73 0x20 0x46 # CHECK: round.w.d $f12, $f14
+0x8c 0x39 0x00 0x46 # CHECK: round.w.s $f6, $f7
+0xc6 0x23 0xa4 0xa0 # CHECK: sb $4, 9158($5)
+0x06 0x00 0xa4 0xa0 # CHECK: sb $4, 6($5)
+0xc6 0x23 0xe9 0xe0 # CHECK: sc $9, 9158($7)
+0xc6 0x23 0xe9 0xf4 # CHECK: sdc1 $f9, 9158($7)
+0x09 0x40 0x24 0x4f # CHECK: sdxc1 $f8, $4($25)
+0x20 0x34 0x07 0x7c # CHECK: seb $6, $7
+0x20 0x36 0x07 0x7c # CHECK: seh $6, $7
+0xc6 0x23 0xa4 0xa4 # CHECK: sh $4, 9158($5)
+0xc0 0x21 0x03 0x00 # CHECK: sll $4, $3, 7
+0x04 0x10 0xa3 0x00 # CHECK: sllv $2, $3, $5
+0x2a 0x18 0x65 0x00 # CHECK: slt $3, $3, $5
+0x67 0x00 0x63 0x28 # CHECK: slti $3, $3, 103
+0x67 0x00 0x63 0x2c # CHECK: sltiu $3, $3, 103
+0x2b 0x18 0x65 0x00 # CHECK: sltu $3, $3, $5
+0x04 0x73 0x20 0x46 # CHECK: sqrt.d $f12, $f14
+0x84 0x39 0x00 0x46 # CHECK: sqrt.s $f6, $f7
+0xc3 0x21 0x03 0x00 # CHECK: sra $4, $3, 7
+0x07 0x10 0xa3 0x00 # CHECK: srav $2, $3, $5
+0xc2 0x21 0x03 0x00 # CHECK: srl $4, $3, 7
+0x06 0x10 0xa3 0x00 # CHECK: srlv $2, $3, $5
+0x01 0x62 0x2e 0x46 # CHECK: sub.d $f8, $f12, $f14
+0x41 0x32 0x07 0x46 # CHECK: sub.s $f9, $f6, $f7
+0x22 0x48 0xc7 0x00 # CHECK: sub $9, $6, $7
+0x23 0x20 0x65 0x00 # CHECK: subu $4, $3, $5
+0x0d 0x20 0xb8 0x4c # CHECK: suxc1 $f4, $24($5)
+0x18 0x00 0xa4 0xac # CHECK: sw $4, 24($5)
+0xc6 0x23 0xe9 0xe4 # CHECK: swc1 $f9, 9158($7)
+0x10 0x00 0xa4 0xa8 # CHECK: swl $4, 16($5)
+0x10 0x00 0xe6 0xb8 # CHECK: swr $6, 16($7)
+0x08 0xd0 0xd2 0x4e # CHECK: swxc1 $f26, $18($22)
+0xcf 0x01 0x00 0x00 # CHECK: sync 7
+0x0d 0x73 0x20 0x46 # CHECK: trunc.w.d $f12, $f14
+0x8d 0x39 0x00 0x46 # CHECK: trunc.w.s $f6, $f7
+0xa0 0x30 0x07 0x7c # CHECK: wsbh $6, $7
+0x26 0x18 0x65 0x00 # CHECK: xor $3, $3, $5
+0x67 0x45 0xc9 0x38 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips32r2/valid-mips32r2.txt b/test/MC/Disassembler/Mips/mips32r2/valid-mips32r2.txt
new file mode 100644
index 0000000..9637835
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r2/valid-mips32r2.txt
@@ -0,0 +1,172 @@
+# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mcpu=mips32r2 | FileCheck %s
+# Try a mips64* triple to confirm that mips* vs mips64* triples no longer have
+# an effect on the disassembler behaviour.
+# RUN: llvm-mc --disassemble %s -triple=mips64-unknown-linux -mcpu=mips32r2 | FileCheck %s
+0x46 0x20 0x73 0x05 # CHECK: abs.d $f12, $f14
+0x46 0x00 0x39 0x85 # CHECK: abs.s $f6, $f7
+0x00 0xc7 0x48 0x20 # CHECK: add $9, $6, $7
+0x46 0x2e 0x62 0x00 # CHECK: add.d $f8, $f12, $f14
+0x46 0x07 0x32 0x40 # CHECK: add.s $f9, $f6, $f7
+0x20 0xc9 0x45 0x67 # CHECK: addi $9, $6, 17767
+0x24 0xc9 0xc5 0x67 # CHECK: addiu $9, $6, -15001
+0x00 0xc7 0x48 0x21 # CHECK: addu $9, $6, $7
+0x00 0xc7 0x48 0x24 # CHECK: and $9, $6, $7
+0x30 0xc9 0x45 0x67 # CHECK: andi $9, $6, 17767
+0x10 0x00 0x01 0x4c # CHECK: b 1332
+0x45 0x00 0x01 0x4c # CHECK: bc1f 1332
+0x45 0x1c 0x01 0x4c # CHECK: bc1f $fcc7, 1332
+0x45 0x01 0x01 0x4c # CHECK: bc1t 1332
+0x45 0x1d 0x01 0x4c # CHECK: bc1t $fcc7, 1332
+0x11 0x26 0x01 0x4c # CHECK: beq $9, $6, 1332
+0x04 0xc1 0x01 0x4c # CHECK: bgez $6, 1332
+0x04 0xd1 0x01 0x4c # CHECK: bgezal $6, 1332
+0x1c 0xc0 0x01 0x4c # CHECK: bgtz $6, 1332
+0x18 0xc0 0x01 0x4c # CHECK: blez $6, 1332
+0x15 0x26 0x01 0x4c # CHECK: bne $9, $6, 1332
+0x46 0x2e 0x60 0x32 # CHECK: c.eq.d $f12, $f14
+0x46 0x07 0x30 0x32 # CHECK: c.eq.s $f6, $f7
+0x46 0x2e 0x60 0x30 # CHECK: c.f.d $f12, $f14
+0x46 0x07 0x30 0x30 # CHECK: c.f.s $f6, $f7
+0x46 0x2e 0x60 0x3e # CHECK: c.le.d $f12, $f14
+0x46 0x07 0x30 0x3e # CHECK: c.le.s $f6, $f7
+0x46 0x2e 0x60 0x3c # CHECK: c.lt.d $f12, $f14
+0x46 0x07 0x30 0x3c # CHECK: c.lt.s $f6, $f7
+0x46 0x2e 0x60 0x3d # CHECK: c.nge.d $f12, $f14
+0x46 0x07 0x30 0x3d # CHECK: c.nge.s $f6, $f7
+0x46 0x2e 0x60 0x3b # CHECK: c.ngl.d $f12, $f14
+0x46 0x07 0x30 0x3b # CHECK: c.ngl.s $f6, $f7
+0x46 0x2e 0x60 0x39 # CHECK: c.ngle.d $f12, $f14
+0x46 0x07 0x30 0x39 # CHECK: c.ngle.s $f6, $f7
+0x46 0x2e 0x60 0x3f # CHECK: c.ngt.d $f12, $f14
+0x46 0x07 0x30 0x3f # CHECK: c.ngt.s $f6, $f7
+0x46 0x2e 0x60 0x36 # CHECK: c.ole.d $f12, $f14
+0x46 0x07 0x30 0x36 # CHECK: c.ole.s $f6, $f7
+0x46 0x2e 0x60 0x34 # CHECK: c.olt.d $f12, $f14
+0x46 0x07 0x30 0x34 # CHECK: c.olt.s $f6, $f7
+0x46 0x2e 0x60 0x3a # CHECK: c.seq.d $f12, $f14
+0x46 0x07 0x30 0x3a # CHECK: c.seq.s $f6, $f7
+0x46 0x2e 0x60 0x38 # CHECK: c.sf.d $f12, $f14
+0x46 0x07 0x30 0x38 # CHECK: c.sf.s $f6, $f7
+0x46 0x2e 0x60 0x33 # CHECK: c.ueq.d $f12, $f14
+0x46 0x12 0xe0 0x33 # CHECK: c.ueq.s $f28, $f18
+0x46 0x2e 0x60 0x37 # CHECK: c.ule.d $f12, $f14
+0x46 0x07 0x30 0x37 # CHECK: c.ule.s $f6, $f7
+0x46 0x2e 0x60 0x35 # CHECK: c.ult.d $f12, $f14
+0x46 0x07 0x30 0x35 # CHECK: c.ult.s $f6, $f7
+0x46 0x2e 0x60 0x31 # CHECK: c.un.d $f12, $f14
+0x46 0x07 0x30 0x31 # CHECK: c.un.s $f6, $f7
+0x46 0x20 0x73 0x0e # CHECK: ceil.w.d $f12, $f14
+0x46 0x00 0x39 0x8e # CHECK: ceil.w.s $f6, $f7
+0x44 0x46 0x38 0x00 # CHECK: cfc1 $6, $7
+0x70 0xe6 0x30 0x21 # CHECK: clo $6, $7
+0x70 0xe6 0x30 0x20 # CHECK: clz $6, $7
+0x44 0xc6 0x38 0x00 # CHECK: ctc1 $6, $7
+0x46 0x00 0x39 0xa1 # CHECK: cvt.d.s $f6, $f7
+0x46 0x80 0x73 0x21 # CHECK: cvt.d.w $f12, $f14
+0x46 0x20 0x73 0x25 # CHECK: cvt.l.d $f12, $f14
+0x46 0x00 0x39 0xa5 # CHECK: cvt.l.s $f6, $f7
+0x46 0x20 0x73 0x20 # CHECK: cvt.s.d $f12, $f14
+0x46 0x80 0x39 0xa0 # CHECK: cvt.s.w $f6, $f7
+0x46 0x20 0x73 0x24 # CHECK: cvt.w.d $f12, $f14
+0x46 0x00 0x39 0xa4 # CHECK: cvt.w.s $f6, $f7
+0x41 0x7e 0x60 0x00 # CHECK: di $fp
+0x41 0x60 0x60 0x00 # CHECK: di
+0x41 0x6e 0x60 0x20 # CHECK: ei $14
+0x41 0x60 0x60 0x20 # CHECK: ei
+0x46 0x20 0x73 0x0f # CHECK: floor.w.d $f12, $f14
+0x46 0x00 0x39 0x8f # CHECK: floor.w.s $f6, $f7
+0x7d 0x33 0x61 0x84 # CHECK: ins $19, $9, 6, 7
+0x08 0x00 0x01 0x4c # CHECK: j 1328
+0x0c 0x00 0x01 0x4c # CHECK: jal 1328
+0x74 0x00 0x01 0x4c # CHECK: jalx 1328
+0x00 0xe0 0xf8 0x09 # CHECK: jalr $7
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x00 0xe0 0x00 0x08 # CHECK: jr $7
+0x80 0xa4 0x23 0xc6 # CHECK: lb $4, 9158($5)
+0x90 0xa4 0x00 0x06 # CHECK: lbu $4, 6($5)
+0xd4 0xe9 0x23 0xc6 # CHECK: ldc1 $f9, 9158($7)
+0x4d 0xf7 0x02 0x01 # CHECK: ldxc1 $f8, $23($15)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0xc0 0xe9 0x23 0xc6 # CHECK: ll $9, 9158($7)
+0x3c 0x06 0x45 0x67 # CHECK: lui $6, 17767
+0x4c 0xa6 0x00 0x05 # CHECK: luxc1 $f0, $6($5)
+0x8c 0xa4 0x00 0x18 # CHECK: lw $4, 24($5)
+0xc4 0xe9 0x23 0xc6 # CHECK: lwc1 $f9, 9158($7)
+0x88 0x82 0x00 0x03 # CHECK: lwl $2, 3($4)
+0x98 0xa3 0x00 0x10 # CHECK: lwr $3, 16($5)
+0x4d 0xcc 0x05 0x00 # CHECK: lwxc1 $f20, $12($14)
+0x70 0xc7 0x00 0x00 # CHECK: madd $6, $7
+0x4e 0x94 0xd4 0xa1 # CHECK: madd.d $f18, $f20, $f26, $f20
+0x4f 0xf9 0x98 0x60 # CHECK: madd.s $f1, $f31, $f19, $f25
+0x70 0xc7 0x00 0x01 # CHECK: maddu $6, $7
+0x44 0x06 0x38 0x00 # CHECK: mfc1 $6, $f7
+0x00 0x00 0x28 0x10 # CHECK: mfhi $5
+0x44 0x7e 0xc0 0x00 # CHECK: mfhc1 $fp, $f24
+0x00 0x00 0x28 0x12 # CHECK: mflo $5
+0x46 0x20 0x41 0x86 # CHECK: mov.d $f6, $f8
+0x46 0x00 0x39 0x86 # CHECK: mov.s $f6, $f7
+0x70 0xc7 0x00 0x04 # CHECK: msub $6, $7
+0x4c 0x52 0xf2 0xa9 # CHECK: msub.d $f10, $f2, $f30, $f18
+0x4e 0x70 0x53 0x28 # CHECK: msub.s $f12, $f19, $f10, $f16
+0x70 0xc7 0x00 0x05 # CHECK: msubu $6, $7
+0x44 0x86 0x38 0x00 # CHECK: mtc1 $6, $f7
+0x00 0xe0 0x00 0x11 # CHECK: mthi $7
+0x44 0xe0 0x80 0x00 # CHECK: mthc1 $zero, $f16
+0x00 0xe0 0x00 0x13 # CHECK: mtlo $7
+0x46 0x2e 0x62 0x02 # CHECK: mul.d $f8, $f12, $f14
+0x46 0x07 0x32 0x42 # CHECK: mul.s $f9, $f6, $f7
+0x70 0xc7 0x48 0x02 # CHECK: mul $9, $6, $7
+0x00 0x65 0x00 0x18 # CHECK: mult $3, $5
+0x00 0x65 0x00 0x19 # CHECK: multu $3, $5
+0x46 0x20 0x73 0x07 # CHECK: neg.d $f12, $f14
+0x46 0x00 0x39 0x87 # CHECK: neg.s $f6, $f7
+0x4d 0x54 0x74 0xb1 # CHECK: nmadd.d $f18, $f10, $f14, $f20
+0x4c 0xac 0xc8 0x30 # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0xc7 0x48 0x27 # CHECK: nor $9, $6, $7
+0x4d 0x1e 0x87 0xb9 # CHECK: nmsub.d $f30, $f8, $f16, $f30
+0x4f 0x04 0x98 0x78 # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x00 0x65 0x18 0x25 # CHECK: or $3, $3, $5
+0x34 0xc9 0x45 0x67 # CHECK: ori $9, $6, 17767
+0x00 0x26 0x49 0xc2 # CHECK: rotr $9, $6, 7
+0x00 0xe6 0x48 0x46 # CHECK: rotrv $9, $6, $7
+0x46 0x20 0x73 0x0c # CHECK: round.w.d $f12, $f14
+0x46 0x00 0x39 0x8c # CHECK: round.w.s $f6, $f7
+0xa0 0xa4 0x23 0xc6 # CHECK: sb $4, 9158($5)
+0xa0 0xa4 0x00 0x06 # CHECK: sb $4, 6($5)
+0xe0 0xe9 0x23 0xc6 # CHECK: sc $9, 9158($7)
+0xf4 0xe9 0x23 0xc6 # CHECK: sdc1 $f9, 9158($7)
+0x4f 0x24 0x40 0x09 # CHECK: sdxc1 $f8, $4($25)
+0x7c 0x07 0x34 0x20 # CHECK: seb $6, $7
+0x7c 0x07 0x36 0x20 # CHECK: seh $6, $7
+0xa4 0xa4 0x23 0xc6 # CHECK: sh $4, 9158($5)
+0x00 0x03 0x21 0xc0 # CHECK: sll $4, $3, 7
+0x00 0xa3 0x10 0x04 # CHECK: sllv $2, $3, $5
+0x00 0x65 0x18 0x2a # CHECK: slt $3, $3, $5
+0x28 0x63 0x00 0x67 # CHECK: slti $3, $3, 103
+0x2c 0x63 0x00 0x67 # CHECK: sltiu $3, $3, 103
+0x00 0x65 0x18 0x2b # CHECK: sltu $3, $3, $5
+0x46 0x20 0x73 0x04 # CHECK: sqrt.d $f12, $f14
+0x46 0x00 0x39 0x84 # CHECK: sqrt.s $f6, $f7
+0x00 0x03 0x21 0xc3 # CHECK: sra $4, $3, 7
+0x00 0xa3 0x10 0x07 # CHECK: srav $2, $3, $5
+0x00 0x03 0x21 0xc2 # CHECK: srl $4, $3, 7
+0x00 0xa3 0x10 0x06 # CHECK: srlv $2, $3, $5
+0x46 0x2e 0x62 0x01 # CHECK: sub.d $f8, $f12, $f14
+0x46 0x07 0x32 0x41 # CHECK: sub.s $f9, $f6, $f7
+0x00 0xc7 0x48 0x22 # CHECK: sub $9, $6, $7
+0x00 0x65 0x20 0x23 # CHECK: subu $4, $3, $5
+0x4c 0xb8 0x20 0x0d # CHECK: suxc1 $f4, $24($5)
+0xac 0xa4 0x00 0x18 # CHECK: sw $4, 24($5)
+0xe4 0xe9 0x23 0xc6 # CHECK: swc1 $f9, 9158($7)
+0xa8 0xa4 0x00 0x10 # CHECK: swl $4, 16($5)
+0xb8 0xe6 0x00 0x10 # CHECK: swr $6, 16($7)
+0x4e 0xd2 0xd0 0x08 # CHECK: swxc1 $f26, $18($22)
+0x00 0x00 0x01 0xcf # CHECK: sync 7
+0x46 0x20 0x73 0x0d # CHECK: trunc.w.d $f12, $f14
+0x46 0x00 0x39 0x8d # CHECK: trunc.w.s $f6, $f7
+0x7c 0x07 0x30 0xa0 # CHECK: wsbh $6, $7
+0x00 0x65 0x18 0x26 # CHECK: xor $3, $3, $5
+0x38 0xc9 0x45 0x67 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips32r2/valid-xfail-mips32r2.txt b/test/MC/Disassembler/Mips/mips32r2/valid-xfail-mips32r2.txt
new file mode 100644
index 0000000..da8130c
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r2/valid-xfail-mips32r2.txt
@@ -0,0 +1,83 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -disassemble -mcpu=mips32r2 | FileCheck %s
+# XFAIL: *
+0x46 0x2f 0x79 0x32 # CHECK: c.eq.d $fcc1, $f15, $f15
+0x46 0x11 0xc5 0x32 # CHECK: c.eq.s $fcc5, $f24, $f17
+0x46 0x35 0x5c 0x30 # CHECK: c.f.d $fcc4, $f11, $f21
+0x46 0x07 0xf4 0x30 # CHECK: c.f.s $fcc4, $f30, $f7
+0x46 0x21 0x94 0x3e # CHECK: c.le.d $fcc4, $f18, $f1
+0x46 0x04 0xc6 0x3e # CHECK: c.le.s $fcc6, $f24, $f4
+0x46 0x23 0x4b 0x3c # CHECK: c.lt.d $fcc3, $f9, $f3
+0x46 0x0e 0x8a 0x3c # CHECK: c.lt.s $fcc2, $f17, $f14
+0x46 0x30 0xad 0x3d # CHECK: c.nge.d $fcc5, $f21, $f16
+0x46 0x08 0x5b 0x3d # CHECK: c.nge.s $fcc3, $f11, $f8
+0x46 0x17 0xfa 0x3b # CHECK: c.ngl.s $fcc2, $f31, $f23
+0x46 0x17 0x92 0x39 # CHECK: c.ngle.s $fcc2, $f18, $f23
+0x46 0x27 0xc4 0x3f # CHECK: c.ngt.d $fcc4, $f24, $f7
+0x46 0x0d 0x45 0x3f # CHECK: c.ngt.s $fcc5, $f8, $f13
+0x46 0x3f 0x82 0x36 # CHECK: c.ole.d $fcc2, $f16, $f31
+0x46 0x14 0x3b 0x36 # CHECK: c.ole.s $fcc3, $f7, $f20
+0x46 0x3c 0x9c 0x34 # CHECK: c.olt.d $fcc4, $f19, $f28
+0x46 0x07 0xa6 0x34 # CHECK: c.olt.s $fcc6, $f20, $f7
+0x46 0x27 0xfc 0x3a # CHECK: c.seq.d $fcc4, $f31, $f7
+0x46 0x19 0x0f 0x3a # CHECK: c.seq.s $fcc7, $f1, $f25
+0x46 0x39 0x6c 0x33 # CHECK: c.ueq.d $fcc4, $f13, $f25
+0x46 0x1e 0x1e 0x33 # CHECK: c.ueq.s $fcc6, $f3, $f30
+0x46 0x32 0xcf 0x37 # CHECK: c.ule.d $fcc7, $f25, $f18
+0x46 0x1e 0xaf 0x37 # CHECK: c.ule.s $fcc7, $f21, $f30
+0x46 0x31 0x36 0x35 # CHECK: c.ult.d $fcc6, $f6, $f17
+0x46 0x0a 0xc7 0x35 # CHECK: c.ult.s $fcc7, $f24, $f10
+0x46 0x38 0xbe 0x31 # CHECK: c.un.d $fcc6, $f23, $f24
+0x46 0x04 0xf1 0x31 # CHECK: c.un.s $fcc1, $f30, $f4
+0x46 0xc0 0x45 0x85 # CHECK: abs.ps $f22, $f8
+0x46 0xcc 0xc6 0x00 # CHECK: add.ps $f24, $f24, $f12
+0x46 0xca 0x04 0x32 # CHECK: c.eq.ps $fcc4, $f0, $f10
+0x46 0xcc 0x66 0x30 # CHECK: c.f.ps $fcc6, $f12, $f12
+0x46 0xd4 0x42 0x3e # CHECK: c.le.ps $fcc2, $f8, $f20
+0x46 0xc4 0x90 0x3c # CHECK: c.lt.ps $f18, $f4
+0x46 0xda 0x10 0x3d # CHECK: c.nge.ps $f2, $f26
+0x46 0xde 0xb0 0x3b # CHECK: c.ngl.ps $f22, $f30
+0x46 0xd4 0x66 0x39 # CHECK: c.ngle.ps $fcc6, $f12, $f20
+0x46 0xc6 0xf6 0x3f # CHECK: c.ngt.ps $fcc6, $f30, $f6
+0x46 0xc8 0xa6 0x36 # CHECK: c.ole.ps $fcc6, $f20, $f8
+0x46 0xd0 0x32 0x34 # CHECK: c.olt.ps $fcc2, $f6, $f16
+0x46 0xce 0xf6 0x3a # CHECK: c.seq.ps $fcc6, $f30, $f14
+0x46 0xc6 0x26 0x38 # CHECK: c.sf.ps $fcc6, $f4, $f6
+0x46 0xdc 0x20 0x33 # CHECK: c.ueq.ps $f4, $f28
+0x46 0xc2 0x86 0x37 # CHECK: c.ule.ps $fcc6, $f16, $f2
+0x46 0xc0 0x76 0x35 # CHECK: c.ult.ps $fcc6, $f14, $f0
+0x46 0xda 0x14 0x31 # CHECK: c.un.ps $fcc4, $f2, $f26
+0x46 0x20 0x20 0x8a # CHECK: ceil.l.d $f2, $f4
+0x46 0x00 0x6c 0x8a # CHECK: ceil.l.s $f18, $f13
+0x46 0xa0 0x81 0x21 # CHECK: cvt.d.l $f4, $f16
+0x46 0x14 0x90 0xa6 # CHECK: cvt.ps.s $f2, $f18, $f20
+0x46 0xa0 0xf3 0xe0 # CHECK: cvt.s.l $f15, $f30
+0x46 0xc0 0x17 0xa8 # CHECK: cvt.s.pl $f30, $f2
+0x46 0xc0 0xd3 0xa0 # CHECK: cvt.s.pu $f14, $f26
+0x46 0x20 0x36 0x8b # CHECK: floor.l.d $f26, $f6
+0x46 0x00 0x23 0x0b # CHECK: floor.l.s $f12, $f4
+0x4c 0x42 0x75 0xa6 # CHECK: madd.ps $f22, $f2, $f14, $f2
+0x46 0xc0 0x85 0x86 # CHECK: mov.ps $f22, $f16
+0x46 0xd8 0xe2 0x91 # CHECK: movf.ps $f10, $f28, $fcc6
+0x46 0xd3 0xf7 0x93 # CHECK: movn.ps $f30, $f30, s3
+0x46 0xc9 0xc5 0x11 # CHECK: movt.ps $f20, $f24, $fcc2
+0x46 0xdf 0x84 0x92 # CHECK: movz.ps $f18, $f16, ra
+0x4d 0xd0 0xe3 0x2e # CHECK: msub.ps $f12, $f14, $f28, $f16
+0x46 0xc0 0x64 0x87 # CHECK: neg.ps $f18, $f12
+0x4c 0x98 0x46 0xb6 # CHECK: nmadd.ps $f26, $f4, $f8, $f24
+0x4d 0x90 0x71 0xbe # CHECK: nmsub.ps $f6, $f12, $f14, $f16
+0x46 0xde 0x46 0x2c # CHECK: pll.ps $f24, $f8, $f30
+0x46 0xdc 0xd0 0x2d # CHECK: plu.ps $f0, $f26, $f28
+0x46 0xda 0xf2 0x2e # CHECK: pul.ps $f8, $f30, $f26
+0x46 0xc2 0x46 0x2f # CHECK: puu.ps $f24, $f8, $f2
+0x41 0x49 0x98 0x00 # CHECK: rdpgpr s3, t1
+0x46 0x20 0x34 0x95 # CHECK: recip.d $f18, $f6
+0x46 0x00 0xf0 0xd5 # CHECK: recip.s $f3, $f30
+0x02 0xa7 0x68 0x46 # CHECK: rorv t5, a3, s5
+0x46 0x20 0x03 0x08 # CHECK: round.l.d $f12, $f0
+0x46 0x00 0x2e 0x08 # CHECK: round.l.s $f24, $f5
+0x46 0x20 0xe0 0x96 # CHECK: rsqrt.d $f2, $f28
+0x46 0x00 0x41 0x16 # CHECK: rsqrt.s $f4, $f8
+0x46 0xda 0x71 0x01 # CHECK: sub.ps $f4, $f14, $f26
+0x46 0x20 0xb5 0x89 # CHECK: trunc.l.d $f22, $f22
+0x46 0x00 0xff 0x09 # CHECK: trunc.l.s $f28, $f31
+0x41 0xcd 0x00 0x00 # CHECK: wrpgpr zero, t5
diff --git a/test/MC/Disassembler/Mips/mips32r2_le.txt b/test/MC/Disassembler/Mips/mips32r2_le.txt
index 0362ca6..81a05b3 100644
--- a/test/MC/Disassembler/Mips/mips32r2_le.txt
+++ b/test/MC/Disassembler/Mips/mips32r2_le.txt
@@ -448,3 +448,6 @@
 
 # CHECK: xori  $9,  $6, 17767
 0x67 0x45 0xc9 0x38
+
+# CHECK: synci 7500($19)
+0x4c 0x1d 0x7f 0x06
diff --git a/test/MC/Disassembler/Mips/mips32r3/valid-mips32r3-le.txt b/test/MC/Disassembler/Mips/mips32r3/valid-mips32r3-le.txt
new file mode 100644
index 0000000..1909e2a
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r3/valid-mips32r3-le.txt
@@ -0,0 +1,169 @@
+# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux -mcpu=mips32r3 | FileCheck %s
+0x05 0x73 0x20 0x46 # CHECK: abs.d $f12, $f14
+0x85 0x39 0x00 0x46 # CHECK: abs.s $f6, $f7
+0x20 0x48 0xc7 0x00 # CHECK: add $9, $6, $7
+0x00 0x62 0x2e 0x46 # CHECK: add.d $f8, $f12, $f14
+0x40 0x32 0x07 0x46 # CHECK: add.s $f9, $f6, $f7
+0x67 0x45 0xc9 0x20 # CHECK: addi $9, $6, 17767
+0x67 0xc5 0xc9 0x24 # CHECK: addiu $9, $6, -15001
+0x21 0x48 0xc7 0x00 # CHECK: addu $9, $6, $7
+0x24 0x48 0xc7 0x00 # CHECK: and $9, $6, $7
+0x67 0x45 0xc9 0x30 # CHECK: andi $9, $6, 17767
+0x4c 0x01 0x00 0x10 # CHECK: b 1332
+0x4c 0x01 0x00 0x45 # CHECK: bc1f 1332
+0x4c 0x01 0x1c 0x45 # CHECK: bc1f $fcc7, 1332
+0x4c 0x01 0x01 0x45 # CHECK: bc1t 1332
+0x4c 0x01 0x1d 0x45 # CHECK: bc1t $fcc7, 1332
+0x4c 0x01 0x26 0x11 # CHECK: beq $9, $6, 1332
+0x4c 0x01 0xc1 0x04 # CHECK: bgez $6, 1332
+0x4c 0x01 0xd1 0x04 # CHECK: bgezal $6, 1332
+0x4c 0x01 0xc0 0x1c # CHECK: bgtz $6, 1332
+0x4c 0x01 0xc0 0x18 # CHECK: blez $6, 1332
+0x4c 0x01 0x26 0x15 # CHECK: bne $9, $6, 1332
+0x32 0x60 0x2e 0x46 # CHECK: c.eq.d $f12, $f14
+0x32 0x30 0x07 0x46 # CHECK: c.eq.s $f6, $f7
+0x30 0x60 0x2e 0x46 # CHECK: c.f.d $f12, $f14
+0x30 0x30 0x07 0x46 # CHECK: c.f.s $f6, $f7
+0x3e 0x60 0x2e 0x46 # CHECK: c.le.d $f12, $f14
+0x3e 0x30 0x07 0x46 # CHECK: c.le.s $f6, $f7
+0x3c 0x60 0x2e 0x46 # CHECK: c.lt.d $f12, $f14
+0x3c 0x30 0x07 0x46 # CHECK: c.lt.s $f6, $f7
+0x3d 0x60 0x2e 0x46 # CHECK: c.nge.d $f12, $f14
+0x3d 0x30 0x07 0x46 # CHECK: c.nge.s $f6, $f7
+0x3b 0x60 0x2e 0x46 # CHECK: c.ngl.d $f12, $f14
+0x3b 0x30 0x07 0x46 # CHECK: c.ngl.s $f6, $f7
+0x39 0x60 0x2e 0x46 # CHECK: c.ngle.d $f12, $f14
+0x39 0x30 0x07 0x46 # CHECK: c.ngle.s $f6, $f7
+0x3f 0x60 0x2e 0x46 # CHECK: c.ngt.d $f12, $f14
+0x3f 0x30 0x07 0x46 # CHECK: c.ngt.s $f6, $f7
+0x36 0x60 0x2e 0x46 # CHECK: c.ole.d $f12, $f14
+0x36 0x30 0x07 0x46 # CHECK: c.ole.s $f6, $f7
+0x34 0x60 0x2e 0x46 # CHECK: c.olt.d $f12, $f14
+0x34 0x30 0x07 0x46 # CHECK: c.olt.s $f6, $f7
+0x3a 0x60 0x2e 0x46 # CHECK: c.seq.d $f12, $f14
+0x3a 0x30 0x07 0x46 # CHECK: c.seq.s $f6, $f7
+0x38 0x60 0x2e 0x46 # CHECK: c.sf.d $f12, $f14
+0x38 0x30 0x07 0x46 # CHECK: c.sf.s $f6, $f7
+0x33 0x60 0x2e 0x46 # CHECK: c.ueq.d $f12, $f14
+0x33 0xe0 0x12 0x46 # CHECK: c.ueq.s $f28, $f18
+0x37 0x60 0x2e 0x46 # CHECK: c.ule.d $f12, $f14
+0x37 0x30 0x07 0x46 # CHECK: c.ule.s $f6, $f7
+0x35 0x60 0x2e 0x46 # CHECK: c.ult.d $f12, $f14
+0x35 0x30 0x07 0x46 # CHECK: c.ult.s $f6, $f7
+0x31 0x60 0x2e 0x46 # CHECK: c.un.d $f12, $f14
+0x31 0x30 0x07 0x46 # CHECK: c.un.s $f6, $f7
+0x0e 0x73 0x20 0x46 # CHECK: ceil.w.d $f12, $f14
+0x8e 0x39 0x00 0x46 # CHECK: ceil.w.s $f6, $f7
+0x00 0x38 0x46 0x44 # CHECK: cfc1 $6, $7
+0x21 0x30 0xe6 0x70 # CHECK: clo $6, $7
+0x20 0x30 0xe6 0x70 # CHECK: clz $6, $7
+0x00 0x38 0xc6 0x44 # CHECK: ctc1 $6, $7
+0xa1 0x39 0x00 0x46 # CHECK: cvt.d.s $f6, $f7
+0x21 0x73 0x80 0x46 # CHECK: cvt.d.w $f12, $f14
+0x25 0x73 0x20 0x46 # CHECK: cvt.l.d $f12, $f14
+0xa5 0x39 0x00 0x46 # CHECK: cvt.l.s $f6, $f7
+0x20 0x73 0x20 0x46 # CHECK: cvt.s.d $f12, $f14
+0xa0 0x39 0x80 0x46 # CHECK: cvt.s.w $f6, $f7
+0x24 0x73 0x20 0x46 # CHECK: cvt.w.d $f12, $f14
+0xa4 0x39 0x00 0x46 # CHECK: cvt.w.s $f6, $f7
+0x00 0x60 0x7e 0x41 # CHECK: di $fp
+0x00 0x60 0x60 0x41 # CHECK: di
+0x20 0x60 0x6e 0x41 # CHECK: ei $14
+0x20 0x60 0x60 0x41 # CHECK: ei
+0x0f 0x73 0x20 0x46 # CHECK: floor.w.d $f12, $f14
+0x8f 0x39 0x00 0x46 # CHECK: floor.w.s $f6, $f7
+0x84 0x61 0x33 0x7d # CHECK: ins $19, $9, 6, 7
+0x4c 0x01 0x00 0x08 # CHECK: j 1328
+0x4c 0x01 0x00 0x0c # CHECK: jal 1328
+0x4c 0x01 0x00 0x74 # CHECK: jalx 1328
+0x09 0xf8 0xe0 0x00 # CHECK: jalr $7
+0x09 0xfc 0x80 0x00 # CHECK: jalr.hb $4
+0x09 0x24 0xa0 0x00 # CHECK: jalr.hb $4, $5
+0x08 0x00 0xe0 0x00 # CHECK: jr $7
+0xc6 0x23 0xa4 0x80 # CHECK: lb $4, 9158($5)
+0x06 0x00 0xa4 0x90 # CHECK: lbu $4, 6($5)
+0xc6 0x23 0xe9 0xd4 # CHECK: ldc1 $f9, 9158($7)
+0x01 0x02 0xf7 0x4d # CHECK: ldxc1 $f8, $23($15)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0xc6 0x23 0xe9 0xc0 # CHECK: ll $9, 9158($7)
+0x67 0x45 0x06 0x3c # CHECK: lui $6, 17767
+0x05 0x00 0xa6 0x4c # CHECK: luxc1 $f0, $6($5)
+0x18 0x00 0xa4 0x8c # CHECK: lw $4, 24($5)
+0xc6 0x23 0xe9 0xc4 # CHECK: lwc1 $f9, 9158($7)
+0x03 0x00 0x82 0x88 # CHECK: lwl $2, 3($4)
+0x10 0x00 0xa3 0x98 # CHECK: lwr $3, 16($5)
+0x00 0x05 0xcc 0x4d # CHECK: lwxc1 $f20, $12($14)
+0x00 0x00 0xc7 0x70 # CHECK: madd $6, $7
+0xa1 0xd4 0x94 0x4e # CHECK: madd.d $f18, $f20, $f26, $f20
+0x60 0x98 0xf9 0x4f # CHECK: madd.s $f1, $f31, $f19, $f25
+0x01 0x00 0xc7 0x70 # CHECK: maddu $6, $7
+0x00 0x38 0x06 0x44 # CHECK: mfc1 $6, $f7
+0x10 0x28 0x00 0x00 # CHECK: mfhi $5
+0x00 0xc0 0x7e 0x44 # CHECK: mfhc1 $fp, $f24
+0x12 0x28 0x00 0x00 # CHECK: mflo $5
+0x86 0x41 0x20 0x46 # CHECK: mov.d $f6, $f8
+0x86 0x39 0x00 0x46 # CHECK: mov.s $f6, $f7
+0x04 0x00 0xc7 0x70 # CHECK: msub $6, $7
+0xa9 0xf2 0x52 0x4c # CHECK: msub.d $f10, $f2, $f30, $f18
+0x28 0x53 0x70 0x4e # CHECK: msub.s $f12, $f19, $f10, $f16
+0x05 0x00 0xc7 0x70 # CHECK: msubu $6, $7
+0x00 0x38 0x86 0x44 # CHECK: mtc1 $6, $f7
+0x11 0x00 0xe0 0x00 # CHECK: mthi $7
+0x00 0x80 0xe0 0x44 # CHECK: mthc1 $zero, $f16
+0x13 0x00 0xe0 0x00 # CHECK: mtlo $7
+0x02 0x62 0x2e 0x46 # CHECK: mul.d $f8, $f12, $f14
+0x42 0x32 0x07 0x46 # CHECK: mul.s $f9, $f6, $f7
+0x02 0x48 0xc7 0x70 # CHECK: mul $9, $6, $7
+0x18 0x00 0x65 0x00 # CHECK: mult $3, $5
+0x19 0x00 0x65 0x00 # CHECK: multu $3, $5
+0x07 0x73 0x20 0x46 # CHECK: neg.d $f12, $f14
+0x87 0x39 0x00 0x46 # CHECK: neg.s $f6, $f7
+0xb1 0x74 0x54 0x4d # CHECK: nmadd.d $f18, $f10, $f14, $f20
+0x30 0xc8 0xac 0x4c # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x48 0xc7 0x00 # CHECK: nor $9, $6, $7
+0xb9 0x87 0x1e 0x4d # CHECK: nmsub.d $f30, $f8, $f16, $f30
+0x78 0x98 0x04 0x4f # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x25 0x18 0x65 0x00 # CHECK: or $3, $3, $5
+0x67 0x45 0xc9 0x34 # CHECK: ori $9, $6, 17767
+0xc2 0x49 0x26 0x00 # CHECK: rotr $9, $6, 7
+0x46 0x48 0xe6 0x00 # CHECK: rotrv $9, $6, $7
+0x0c 0x73 0x20 0x46 # CHECK: round.w.d $f12, $f14
+0x8c 0x39 0x00 0x46 # CHECK: round.w.s $f6, $f7
+0xc6 0x23 0xa4 0xa0 # CHECK: sb $4, 9158($5)
+0x06 0x00 0xa4 0xa0 # CHECK: sb $4, 6($5)
+0xc6 0x23 0xe9 0xe0 # CHECK: sc $9, 9158($7)
+0xc6 0x23 0xe9 0xf4 # CHECK: sdc1 $f9, 9158($7)
+0x09 0x40 0x24 0x4f # CHECK: sdxc1 $f8, $4($25)
+0x20 0x34 0x07 0x7c # CHECK: seb $6, $7
+0x20 0x36 0x07 0x7c # CHECK: seh $6, $7
+0xc6 0x23 0xa4 0xa4 # CHECK: sh $4, 9158($5)
+0xc0 0x21 0x03 0x00 # CHECK: sll $4, $3, 7
+0x04 0x10 0xa3 0x00 # CHECK: sllv $2, $3, $5
+0x2a 0x18 0x65 0x00 # CHECK: slt $3, $3, $5
+0x67 0x00 0x63 0x28 # CHECK: slti $3, $3, 103
+0x67 0x00 0x63 0x2c # CHECK: sltiu $3, $3, 103
+0x2b 0x18 0x65 0x00 # CHECK: sltu $3, $3, $5
+0x04 0x73 0x20 0x46 # CHECK: sqrt.d $f12, $f14
+0x84 0x39 0x00 0x46 # CHECK: sqrt.s $f6, $f7
+0xc3 0x21 0x03 0x00 # CHECK: sra $4, $3, 7
+0x07 0x10 0xa3 0x00 # CHECK: srav $2, $3, $5
+0xc2 0x21 0x03 0x00 # CHECK: srl $4, $3, 7
+0x06 0x10 0xa3 0x00 # CHECK: srlv $2, $3, $5
+0x01 0x62 0x2e 0x46 # CHECK: sub.d $f8, $f12, $f14
+0x41 0x32 0x07 0x46 # CHECK: sub.s $f9, $f6, $f7
+0x22 0x48 0xc7 0x00 # CHECK: sub $9, $6, $7
+0x23 0x20 0x65 0x00 # CHECK: subu $4, $3, $5
+0x0d 0x20 0xb8 0x4c # CHECK: suxc1 $f4, $24($5)
+0x18 0x00 0xa4 0xac # CHECK: sw $4, 24($5)
+0xc6 0x23 0xe9 0xe4 # CHECK: swc1 $f9, 9158($7)
+0x10 0x00 0xa4 0xa8 # CHECK: swl $4, 16($5)
+0x10 0x00 0xe6 0xb8 # CHECK: swr $6, 16($7)
+0x08 0xd0 0xd2 0x4e # CHECK: swxc1 $f26, $18($22)
+0xcf 0x01 0x00 0x00 # CHECK: sync 7
+0x0d 0x73 0x20 0x46 # CHECK: trunc.w.d $f12, $f14
+0x8d 0x39 0x00 0x46 # CHECK: trunc.w.s $f6, $f7
+0xa0 0x30 0x07 0x7c # CHECK: wsbh $6, $7
+0x26 0x18 0x65 0x00 # CHECK: xor $3, $3, $5
+0x67 0x45 0xc9 0x38 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips32r3/valid-mips32r3.txt b/test/MC/Disassembler/Mips/mips32r3/valid-mips32r3.txt
new file mode 100644
index 0000000..a273c24
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r3/valid-mips32r3.txt
@@ -0,0 +1,169 @@
+# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mcpu=mips32r3 | FileCheck %s
+0x46 0x20 0x73 0x05 # CHECK: abs.d $f12, $f14
+0x46 0x00 0x39 0x85 # CHECK: abs.s $f6, $f7
+0x00 0xc7 0x48 0x20 # CHECK: add $9, $6, $7
+0x46 0x2e 0x62 0x00 # CHECK: add.d $f8, $f12, $f14
+0x46 0x07 0x32 0x40 # CHECK: add.s $f9, $f6, $f7
+0x20 0xc9 0x45 0x67 # CHECK: addi $9, $6, 17767
+0x24 0xc9 0xc5 0x67 # CHECK: addiu $9, $6, -15001
+0x00 0xc7 0x48 0x21 # CHECK: addu $9, $6, $7
+0x00 0xc7 0x48 0x24 # CHECK: and $9, $6, $7
+0x30 0xc9 0x45 0x67 # CHECK: andi $9, $6, 17767
+0x10 0x00 0x01 0x4c # CHECK: b 1332
+0x45 0x00 0x01 0x4c # CHECK: bc1f 1332
+0x45 0x1c 0x01 0x4c # CHECK: bc1f $fcc7, 1332
+0x45 0x01 0x01 0x4c # CHECK: bc1t 1332
+0x45 0x1d 0x01 0x4c # CHECK: bc1t $fcc7, 1332
+0x11 0x26 0x01 0x4c # CHECK: beq $9, $6, 1332
+0x04 0xc1 0x01 0x4c # CHECK: bgez $6, 1332
+0x04 0xd1 0x01 0x4c # CHECK: bgezal $6, 1332
+0x1c 0xc0 0x01 0x4c # CHECK: bgtz $6, 1332
+0x18 0xc0 0x01 0x4c # CHECK: blez $6, 1332
+0x15 0x26 0x01 0x4c # CHECK: bne $9, $6, 1332
+0x46 0x2e 0x60 0x32 # CHECK: c.eq.d $f12, $f14
+0x46 0x07 0x30 0x32 # CHECK: c.eq.s $f6, $f7
+0x46 0x2e 0x60 0x30 # CHECK: c.f.d $f12, $f14
+0x46 0x07 0x30 0x30 # CHECK: c.f.s $f6, $f7
+0x46 0x2e 0x60 0x3e # CHECK: c.le.d $f12, $f14
+0x46 0x07 0x30 0x3e # CHECK: c.le.s $f6, $f7
+0x46 0x2e 0x60 0x3c # CHECK: c.lt.d $f12, $f14
+0x46 0x07 0x30 0x3c # CHECK: c.lt.s $f6, $f7
+0x46 0x2e 0x60 0x3d # CHECK: c.nge.d $f12, $f14
+0x46 0x07 0x30 0x3d # CHECK: c.nge.s $f6, $f7
+0x46 0x2e 0x60 0x3b # CHECK: c.ngl.d $f12, $f14
+0x46 0x07 0x30 0x3b # CHECK: c.ngl.s $f6, $f7
+0x46 0x2e 0x60 0x39 # CHECK: c.ngle.d $f12, $f14
+0x46 0x07 0x30 0x39 # CHECK: c.ngle.s $f6, $f7
+0x46 0x2e 0x60 0x3f # CHECK: c.ngt.d $f12, $f14
+0x46 0x07 0x30 0x3f # CHECK: c.ngt.s $f6, $f7
+0x46 0x2e 0x60 0x36 # CHECK: c.ole.d $f12, $f14
+0x46 0x07 0x30 0x36 # CHECK: c.ole.s $f6, $f7
+0x46 0x2e 0x60 0x34 # CHECK: c.olt.d $f12, $f14
+0x46 0x07 0x30 0x34 # CHECK: c.olt.s $f6, $f7
+0x46 0x2e 0x60 0x3a # CHECK: c.seq.d $f12, $f14
+0x46 0x07 0x30 0x3a # CHECK: c.seq.s $f6, $f7
+0x46 0x2e 0x60 0x38 # CHECK: c.sf.d $f12, $f14
+0x46 0x07 0x30 0x38 # CHECK: c.sf.s $f6, $f7
+0x46 0x2e 0x60 0x33 # CHECK: c.ueq.d $f12, $f14
+0x46 0x12 0xe0 0x33 # CHECK: c.ueq.s $f28, $f18
+0x46 0x2e 0x60 0x37 # CHECK: c.ule.d $f12, $f14
+0x46 0x07 0x30 0x37 # CHECK: c.ule.s $f6, $f7
+0x46 0x2e 0x60 0x35 # CHECK: c.ult.d $f12, $f14
+0x46 0x07 0x30 0x35 # CHECK: c.ult.s $f6, $f7
+0x46 0x2e 0x60 0x31 # CHECK: c.un.d $f12, $f14
+0x46 0x07 0x30 0x31 # CHECK: c.un.s $f6, $f7
+0x46 0x20 0x73 0x0e # CHECK: ceil.w.d $f12, $f14
+0x46 0x00 0x39 0x8e # CHECK: ceil.w.s $f6, $f7
+0x44 0x46 0x38 0x00 # CHECK: cfc1 $6, $7
+0x70 0xe6 0x30 0x21 # CHECK: clo $6, $7
+0x70 0xe6 0x30 0x20 # CHECK: clz $6, $7
+0x44 0xc6 0x38 0x00 # CHECK: ctc1 $6, $7
+0x46 0x00 0x39 0xa1 # CHECK: cvt.d.s $f6, $f7
+0x46 0x80 0x73 0x21 # CHECK: cvt.d.w $f12, $f14
+0x46 0x20 0x73 0x25 # CHECK: cvt.l.d $f12, $f14
+0x46 0x00 0x39 0xa5 # CHECK: cvt.l.s $f6, $f7
+0x46 0x20 0x73 0x20 # CHECK: cvt.s.d $f12, $f14
+0x46 0x80 0x39 0xa0 # CHECK: cvt.s.w $f6, $f7
+0x46 0x20 0x73 0x24 # CHECK: cvt.w.d $f12, $f14
+0x46 0x00 0x39 0xa4 # CHECK: cvt.w.s $f6, $f7
+0x41 0x7e 0x60 0x00 # CHECK: di $fp
+0x41 0x60 0x60 0x00 # CHECK: di
+0x41 0x6e 0x60 0x20 # CHECK: ei $14
+0x41 0x60 0x60 0x20 # CHECK: ei
+0x46 0x20 0x73 0x0f # CHECK: floor.w.d $f12, $f14
+0x46 0x00 0x39 0x8f # CHECK: floor.w.s $f6, $f7
+0x7d 0x33 0x61 0x84 # CHECK: ins $19, $9, 6, 7
+0x08 0x00 0x01 0x4c # CHECK: j 1328
+0x0c 0x00 0x01 0x4c # CHECK: jal 1328
+0x74 0x00 0x01 0x4c # CHECK: jalx 1328
+0x00 0xe0 0xf8 0x09 # CHECK: jalr $7
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x00 0xe0 0x00 0x08 # CHECK: jr $7
+0x80 0xa4 0x23 0xc6 # CHECK: lb $4, 9158($5)
+0x90 0xa4 0x00 0x06 # CHECK: lbu $4, 6($5)
+0xd4 0xe9 0x23 0xc6 # CHECK: ldc1 $f9, 9158($7)
+0x4d 0xf7 0x02 0x01 # CHECK: ldxc1 $f8, $23($15)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0xc0 0xe9 0x23 0xc6 # CHECK: ll $9, 9158($7)
+0x3c 0x06 0x45 0x67 # CHECK: lui $6, 17767
+0x4c 0xa6 0x00 0x05 # CHECK: luxc1 $f0, $6($5)
+0x8c 0xa4 0x00 0x18 # CHECK: lw $4, 24($5)
+0xc4 0xe9 0x23 0xc6 # CHECK: lwc1 $f9, 9158($7)
+0x88 0x82 0x00 0x03 # CHECK: lwl $2, 3($4)
+0x98 0xa3 0x00 0x10 # CHECK: lwr $3, 16($5)
+0x4d 0xcc 0x05 0x00 # CHECK: lwxc1 $f20, $12($14)
+0x70 0xc7 0x00 0x00 # CHECK: madd $6, $7
+0x4e 0x94 0xd4 0xa1 # CHECK: madd.d $f18, $f20, $f26, $f20
+0x4f 0xf9 0x98 0x60 # CHECK: madd.s $f1, $f31, $f19, $f25
+0x70 0xc7 0x00 0x01 # CHECK: maddu $6, $7
+0x44 0x06 0x38 0x00 # CHECK: mfc1 $6, $f7
+0x00 0x00 0x28 0x10 # CHECK: mfhi $5
+0x44 0x7e 0xc0 0x00 # CHECK: mfhc1 $fp, $f24
+0x00 0x00 0x28 0x12 # CHECK: mflo $5
+0x46 0x20 0x41 0x86 # CHECK: mov.d $f6, $f8
+0x46 0x00 0x39 0x86 # CHECK: mov.s $f6, $f7
+0x70 0xc7 0x00 0x04 # CHECK: msub $6, $7
+0x4c 0x52 0xf2 0xa9 # CHECK: msub.d $f10, $f2, $f30, $f18
+0x4e 0x70 0x53 0x28 # CHECK: msub.s $f12, $f19, $f10, $f16
+0x70 0xc7 0x00 0x05 # CHECK: msubu $6, $7
+0x44 0x86 0x38 0x00 # CHECK: mtc1 $6, $f7
+0x00 0xe0 0x00 0x11 # CHECK: mthi $7
+0x44 0xe0 0x80 0x00 # CHECK: mthc1 $zero, $f16
+0x00 0xe0 0x00 0x13 # CHECK: mtlo $7
+0x46 0x2e 0x62 0x02 # CHECK: mul.d $f8, $f12, $f14
+0x46 0x07 0x32 0x42 # CHECK: mul.s $f9, $f6, $f7
+0x70 0xc7 0x48 0x02 # CHECK: mul $9, $6, $7
+0x00 0x65 0x00 0x18 # CHECK: mult $3, $5
+0x00 0x65 0x00 0x19 # CHECK: multu $3, $5
+0x46 0x20 0x73 0x07 # CHECK: neg.d $f12, $f14
+0x46 0x00 0x39 0x87 # CHECK: neg.s $f6, $f7
+0x4d 0x54 0x74 0xb1 # CHECK: nmadd.d $f18, $f10, $f14, $f20
+0x4c 0xac 0xc8 0x30 # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0xc7 0x48 0x27 # CHECK: nor $9, $6, $7
+0x4d 0x1e 0x87 0xb9 # CHECK: nmsub.d $f30, $f8, $f16, $f30
+0x4f 0x04 0x98 0x78 # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x00 0x65 0x18 0x25 # CHECK: or $3, $3, $5
+0x34 0xc9 0x45 0x67 # CHECK: ori $9, $6, 17767
+0x00 0x26 0x49 0xc2 # CHECK: rotr $9, $6, 7
+0x00 0xe6 0x48 0x46 # CHECK: rotrv $9, $6, $7
+0x46 0x20 0x73 0x0c # CHECK: round.w.d $f12, $f14
+0x46 0x00 0x39 0x8c # CHECK: round.w.s $f6, $f7
+0xa0 0xa4 0x23 0xc6 # CHECK: sb $4, 9158($5)
+0xa0 0xa4 0x00 0x06 # CHECK: sb $4, 6($5)
+0xe0 0xe9 0x23 0xc6 # CHECK: sc $9, 9158($7)
+0xf4 0xe9 0x23 0xc6 # CHECK: sdc1 $f9, 9158($7)
+0x4f 0x24 0x40 0x09 # CHECK: sdxc1 $f8, $4($25)
+0x7c 0x07 0x34 0x20 # CHECK: seb $6, $7
+0x7c 0x07 0x36 0x20 # CHECK: seh $6, $7
+0xa4 0xa4 0x23 0xc6 # CHECK: sh $4, 9158($5)
+0x00 0x03 0x21 0xc0 # CHECK: sll $4, $3, 7
+0x00 0xa3 0x10 0x04 # CHECK: sllv $2, $3, $5
+0x00 0x65 0x18 0x2a # CHECK: slt $3, $3, $5
+0x28 0x63 0x00 0x67 # CHECK: slti $3, $3, 103
+0x2c 0x63 0x00 0x67 # CHECK: sltiu $3, $3, 103
+0x00 0x65 0x18 0x2b # CHECK: sltu $3, $3, $5
+0x46 0x20 0x73 0x04 # CHECK: sqrt.d $f12, $f14
+0x46 0x00 0x39 0x84 # CHECK: sqrt.s $f6, $f7
+0x00 0x03 0x21 0xc3 # CHECK: sra $4, $3, 7
+0x00 0xa3 0x10 0x07 # CHECK: srav $2, $3, $5
+0x00 0x03 0x21 0xc2 # CHECK: srl $4, $3, 7
+0x00 0xa3 0x10 0x06 # CHECK: srlv $2, $3, $5
+0x46 0x2e 0x62 0x01 # CHECK: sub.d $f8, $f12, $f14
+0x46 0x07 0x32 0x41 # CHECK: sub.s $f9, $f6, $f7
+0x00 0xc7 0x48 0x22 # CHECK: sub $9, $6, $7
+0x00 0x65 0x20 0x23 # CHECK: subu $4, $3, $5
+0x4c 0xb8 0x20 0x0d # CHECK: suxc1 $f4, $24($5)
+0xac 0xa4 0x00 0x18 # CHECK: sw $4, 24($5)
+0xe4 0xe9 0x23 0xc6 # CHECK: swc1 $f9, 9158($7)
+0xa8 0xa4 0x00 0x10 # CHECK: swl $4, 16($5)
+0xb8 0xe6 0x00 0x10 # CHECK: swr $6, 16($7)
+0x4e 0xd2 0xd0 0x08 # CHECK: swxc1 $f26, $18($22)
+0x00 0x00 0x01 0xcf # CHECK: sync 7
+0x46 0x20 0x73 0x0d # CHECK: trunc.w.d $f12, $f14
+0x46 0x00 0x39 0x8d # CHECK: trunc.w.s $f6, $f7
+0x7c 0x07 0x30 0xa0 # CHECK: wsbh $6, $7
+0x00 0x65 0x18 0x26 # CHECK: xor $3, $3, $5
+0x38 0xc9 0x45 0x67 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips32r3/valid-xfail-mips32r3.txt b/test/MC/Disassembler/Mips/mips32r3/valid-xfail-mips32r3.txt
new file mode 100644
index 0000000..7623bba
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r3/valid-xfail-mips32r3.txt
@@ -0,0 +1,83 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -disassemble -mcpu=mips32r3 | FileCheck %s
+# XFAIL: *
+0x46 0x2f 0x79 0x32 # CHECK: c.eq.d $fcc1, $f15, $f15
+0x46 0x11 0xc5 0x32 # CHECK: c.eq.s $fcc5, $f24, $f17
+0x46 0x35 0x5c 0x30 # CHECK: c.f.d $fcc4, $f11, $f21
+0x46 0x07 0xf4 0x30 # CHECK: c.f.s $fcc4, $f30, $f7
+0x46 0x21 0x94 0x3e # CHECK: c.le.d $fcc4, $f18, $f1
+0x46 0x04 0xc6 0x3e # CHECK: c.le.s $fcc6, $f24, $f4
+0x46 0x23 0x4b 0x3c # CHECK: c.lt.d $fcc3, $f9, $f3
+0x46 0x0e 0x8a 0x3c # CHECK: c.lt.s $fcc2, $f17, $f14
+0x46 0x30 0xad 0x3d # CHECK: c.nge.d $fcc5, $f21, $f16
+0x46 0x08 0x5b 0x3d # CHECK: c.nge.s $fcc3, $f11, $f8
+0x46 0x17 0xfa 0x3b # CHECK: c.ngl.s $fcc2, $f31, $f23
+0x46 0x17 0x92 0x39 # CHECK: c.ngle.s $fcc2, $f18, $f23
+0x46 0x27 0xc4 0x3f # CHECK: c.ngt.d $fcc4, $f24, $f7
+0x46 0x0d 0x45 0x3f # CHECK: c.ngt.s $fcc5, $f8, $f13
+0x46 0x3f 0x82 0x36 # CHECK: c.ole.d $fcc2, $f16, $f31
+0x46 0x14 0x3b 0x36 # CHECK: c.ole.s $fcc3, $f7, $f20
+0x46 0x3c 0x9c 0x34 # CHECK: c.olt.d $fcc4, $f19, $f28
+0x46 0x07 0xa6 0x34 # CHECK: c.olt.s $fcc6, $f20, $f7
+0x46 0x27 0xfc 0x3a # CHECK: c.seq.d $fcc4, $f31, $f7
+0x46 0x19 0x0f 0x3a # CHECK: c.seq.s $fcc7, $f1, $f25
+0x46 0x39 0x6c 0x33 # CHECK: c.ueq.d $fcc4, $f13, $f25
+0x46 0x1e 0x1e 0x33 # CHECK: c.ueq.s $fcc6, $f3, $f30
+0x46 0x32 0xcf 0x37 # CHECK: c.ule.d $fcc7, $f25, $f18
+0x46 0x1e 0xaf 0x37 # CHECK: c.ule.s $fcc7, $f21, $f30
+0x46 0x31 0x36 0x35 # CHECK: c.ult.d $fcc6, $f6, $f17
+0x46 0x0a 0xc7 0x35 # CHECK: c.ult.s $fcc7, $f24, $f10
+0x46 0x38 0xbe 0x31 # CHECK: c.un.d $fcc6, $f23, $f24
+0x46 0x04 0xf1 0x31 # CHECK: c.un.s $fcc1, $f30, $f4
+0x46 0xc0 0x45 0x85 # CHECK: abs.ps $f22, $f8
+0x46 0xcc 0xc6 0x00 # CHECK: add.ps $f24, $f24, $f12
+0x46 0xca 0x04 0x32 # CHECK: c.eq.ps $fcc4, $f0, $f10
+0x46 0xcc 0x66 0x30 # CHECK: c.f.ps $fcc6, $f12, $f12
+0x46 0xd4 0x42 0x3e # CHECK: c.le.ps $fcc2, $f8, $f20
+0x46 0xc4 0x90 0x3c # CHECK: c.lt.ps $f18, $f4
+0x46 0xda 0x10 0x3d # CHECK: c.nge.ps $f2, $f26
+0x46 0xde 0xb0 0x3b # CHECK: c.ngl.ps $f22, $f30
+0x46 0xd4 0x66 0x39 # CHECK: c.ngle.ps $fcc6, $f12, $f20
+0x46 0xc6 0xf6 0x3f # CHECK: c.ngt.ps $fcc6, $f30, $f6
+0x46 0xc8 0xa6 0x36 # CHECK: c.ole.ps $fcc6, $f20, $f8
+0x46 0xd0 0x32 0x34 # CHECK: c.olt.ps $fcc2, $f6, $f16
+0x46 0xce 0xf6 0x3a # CHECK: c.seq.ps $fcc6, $f30, $f14
+0x46 0xc6 0x26 0x38 # CHECK: c.sf.ps $fcc6, $f4, $f6
+0x46 0xdc 0x20 0x33 # CHECK: c.ueq.ps $f4, $f28
+0x46 0xc2 0x86 0x37 # CHECK: c.ule.ps $fcc6, $f16, $f2
+0x46 0xc0 0x76 0x35 # CHECK: c.ult.ps $fcc6, $f14, $f0
+0x46 0xda 0x14 0x31 # CHECK: c.un.ps $fcc4, $f2, $f26
+0x46 0x20 0x20 0x8a # CHECK: ceil.l.d $f2, $f4
+0x46 0x00 0x6c 0x8a # CHECK: ceil.l.s $f18, $f13
+0x46 0xa0 0x81 0x21 # CHECK: cvt.d.l $f4, $f16
+0x46 0x14 0x90 0xa6 # CHECK: cvt.ps.s $f2, $f18, $f20
+0x46 0xa0 0xf3 0xe0 # CHECK: cvt.s.l $f15, $f30
+0x46 0xc0 0x17 0xa8 # CHECK: cvt.s.pl $f30, $f2
+0x46 0xc0 0xd3 0xa0 # CHECK: cvt.s.pu $f14, $f26
+0x46 0x20 0x36 0x8b # CHECK: floor.l.d $f26, $f6
+0x46 0x00 0x23 0x0b # CHECK: floor.l.s $f12, $f4
+0x4c 0x42 0x75 0xa6 # CHECK: madd.ps $f22, $f2, $f14, $f2
+0x46 0xc0 0x85 0x86 # CHECK: mov.ps $f22, $f16
+0x46 0xd8 0xe2 0x91 # CHECK: movf.ps $f10, $f28, $fcc6
+0x46 0xd3 0xf7 0x93 # CHECK: movn.ps $f30, $f30, s3
+0x46 0xc9 0xc5 0x11 # CHECK: movt.ps $f20, $f24, $fcc2
+0x46 0xdf 0x84 0x92 # CHECK: movz.ps $f18, $f16, ra
+0x4d 0xd0 0xe3 0x2e # CHECK: msub.ps $f12, $f14, $f28, $f16
+0x46 0xc0 0x64 0x87 # CHECK: neg.ps $f18, $f12
+0x4c 0x98 0x46 0xb6 # CHECK: nmadd.ps $f26, $f4, $f8, $f24
+0x4d 0x90 0x71 0xbe # CHECK: nmsub.ps $f6, $f12, $f14, $f16
+0x46 0xde 0x46 0x2c # CHECK: pll.ps $f24, $f8, $f30
+0x46 0xdc 0xd0 0x2d # CHECK: plu.ps $f0, $f26, $f28
+0x46 0xda 0xf2 0x2e # CHECK: pul.ps $f8, $f30, $f26
+0x46 0xc2 0x46 0x2f # CHECK: puu.ps $f24, $f8, $f2
+0x41 0x49 0x98 0x00 # CHECK: rdpgpr s3, t1
+0x46 0x20 0x34 0x95 # CHECK: recip.d $f18, $f6
+0x46 0x00 0xf0 0xd5 # CHECK: recip.s $f3, $f30
+0x02 0xa7 0x68 0x46 # CHECK: rorv t5, a3, s5
+0x46 0x20 0x03 0x08 # CHECK: round.l.d $f12, $f0
+0x46 0x00 0x2e 0x08 # CHECK: round.l.s $f24, $f5
+0x46 0x20 0xe0 0x96 # CHECK: rsqrt.d $f2, $f28
+0x46 0x00 0x41 0x16 # CHECK: rsqrt.s $f4, $f8
+0x46 0xda 0x71 0x01 # CHECK: sub.ps $f4, $f14, $f26
+0x46 0x20 0xb5 0x89 # CHECK: trunc.l.d $f22, $f22
+0x46 0x00 0xff 0x09 # CHECK: trunc.l.s $f28, $f31
+0x41 0xcd 0x00 0x00 # CHECK: wrpgpr zero, t5
diff --git a/test/MC/Disassembler/Mips/mips32r5/valid-mips32r5-le.txt b/test/MC/Disassembler/Mips/mips32r5/valid-mips32r5-le.txt
new file mode 100644
index 0000000..62977dc
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r5/valid-mips32r5-le.txt
@@ -0,0 +1,169 @@
+# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux -mcpu=mips32r5 | FileCheck %s
+0x05 0x73 0x20 0x46 # CHECK: abs.d $f12, $f14
+0x85 0x39 0x00 0x46 # CHECK: abs.s $f6, $f7
+0x20 0x48 0xc7 0x00 # CHECK: add $9, $6, $7
+0x00 0x62 0x2e 0x46 # CHECK: add.d $f8, $f12, $f14
+0x40 0x32 0x07 0x46 # CHECK: add.s $f9, $f6, $f7
+0x67 0x45 0xc9 0x20 # CHECK: addi $9, $6, 17767
+0x67 0xc5 0xc9 0x24 # CHECK: addiu $9, $6, -15001
+0x21 0x48 0xc7 0x00 # CHECK: addu $9, $6, $7
+0x24 0x48 0xc7 0x00 # CHECK: and $9, $6, $7
+0x67 0x45 0xc9 0x30 # CHECK: andi $9, $6, 17767
+0x4c 0x01 0x00 0x10 # CHECK: b 1332
+0x4c 0x01 0x00 0x45 # CHECK: bc1f 1332
+0x4c 0x01 0x1c 0x45 # CHECK: bc1f $fcc7, 1332
+0x4c 0x01 0x01 0x45 # CHECK: bc1t 1332
+0x4c 0x01 0x1d 0x45 # CHECK: bc1t $fcc7, 1332
+0x4c 0x01 0x26 0x11 # CHECK: beq $9, $6, 1332
+0x4c 0x01 0xc1 0x04 # CHECK: bgez $6, 1332
+0x4c 0x01 0xd1 0x04 # CHECK: bgezal $6, 1332
+0x4c 0x01 0xc0 0x1c # CHECK: bgtz $6, 1332
+0x4c 0x01 0xc0 0x18 # CHECK: blez $6, 1332
+0x4c 0x01 0x26 0x15 # CHECK: bne $9, $6, 1332
+0x32 0x60 0x2e 0x46 # CHECK: c.eq.d $f12, $f14
+0x32 0x30 0x07 0x46 # CHECK: c.eq.s $f6, $f7
+0x30 0x60 0x2e 0x46 # CHECK: c.f.d $f12, $f14
+0x30 0x30 0x07 0x46 # CHECK: c.f.s $f6, $f7
+0x3e 0x60 0x2e 0x46 # CHECK: c.le.d $f12, $f14
+0x3e 0x30 0x07 0x46 # CHECK: c.le.s $f6, $f7
+0x3c 0x60 0x2e 0x46 # CHECK: c.lt.d $f12, $f14
+0x3c 0x30 0x07 0x46 # CHECK: c.lt.s $f6, $f7
+0x3d 0x60 0x2e 0x46 # CHECK: c.nge.d $f12, $f14
+0x3d 0x30 0x07 0x46 # CHECK: c.nge.s $f6, $f7
+0x3b 0x60 0x2e 0x46 # CHECK: c.ngl.d $f12, $f14
+0x3b 0x30 0x07 0x46 # CHECK: c.ngl.s $f6, $f7
+0x39 0x60 0x2e 0x46 # CHECK: c.ngle.d $f12, $f14
+0x39 0x30 0x07 0x46 # CHECK: c.ngle.s $f6, $f7
+0x3f 0x60 0x2e 0x46 # CHECK: c.ngt.d $f12, $f14
+0x3f 0x30 0x07 0x46 # CHECK: c.ngt.s $f6, $f7
+0x36 0x60 0x2e 0x46 # CHECK: c.ole.d $f12, $f14
+0x36 0x30 0x07 0x46 # CHECK: c.ole.s $f6, $f7
+0x34 0x60 0x2e 0x46 # CHECK: c.olt.d $f12, $f14
+0x34 0x30 0x07 0x46 # CHECK: c.olt.s $f6, $f7
+0x3a 0x60 0x2e 0x46 # CHECK: c.seq.d $f12, $f14
+0x3a 0x30 0x07 0x46 # CHECK: c.seq.s $f6, $f7
+0x38 0x60 0x2e 0x46 # CHECK: c.sf.d $f12, $f14
+0x38 0x30 0x07 0x46 # CHECK: c.sf.s $f6, $f7
+0x33 0x60 0x2e 0x46 # CHECK: c.ueq.d $f12, $f14
+0x33 0xe0 0x12 0x46 # CHECK: c.ueq.s $f28, $f18
+0x37 0x60 0x2e 0x46 # CHECK: c.ule.d $f12, $f14
+0x37 0x30 0x07 0x46 # CHECK: c.ule.s $f6, $f7
+0x35 0x60 0x2e 0x46 # CHECK: c.ult.d $f12, $f14
+0x35 0x30 0x07 0x46 # CHECK: c.ult.s $f6, $f7
+0x31 0x60 0x2e 0x46 # CHECK: c.un.d $f12, $f14
+0x31 0x30 0x07 0x46 # CHECK: c.un.s $f6, $f7
+0x0e 0x73 0x20 0x46 # CHECK: ceil.w.d $f12, $f14
+0x8e 0x39 0x00 0x46 # CHECK: ceil.w.s $f6, $f7
+0x00 0x38 0x46 0x44 # CHECK: cfc1 $6, $7
+0x21 0x30 0xe6 0x70 # CHECK: clo $6, $7
+0x20 0x30 0xe6 0x70 # CHECK: clz $6, $7
+0x00 0x38 0xc6 0x44 # CHECK: ctc1 $6, $7
+0xa1 0x39 0x00 0x46 # CHECK: cvt.d.s $f6, $f7
+0x21 0x73 0x80 0x46 # CHECK: cvt.d.w $f12, $f14
+0x25 0x73 0x20 0x46 # CHECK: cvt.l.d $f12, $f14
+0xa5 0x39 0x00 0x46 # CHECK: cvt.l.s $f6, $f7
+0x20 0x73 0x20 0x46 # CHECK: cvt.s.d $f12, $f14
+0xa0 0x39 0x80 0x46 # CHECK: cvt.s.w $f6, $f7
+0x24 0x73 0x20 0x46 # CHECK: cvt.w.d $f12, $f14
+0xa4 0x39 0x00 0x46 # CHECK: cvt.w.s $f6, $f7
+0x00 0x60 0x7e 0x41 # CHECK: di $fp
+0x00 0x60 0x60 0x41 # CHECK: di
+0x20 0x60 0x6e 0x41 # CHECK: ei $14
+0x20 0x60 0x60 0x41 # CHECK: ei
+0x0f 0x73 0x20 0x46 # CHECK: floor.w.d $f12, $f14
+0x8f 0x39 0x00 0x46 # CHECK: floor.w.s $f6, $f7
+0x84 0x61 0x33 0x7d # CHECK: ins $19, $9, 6, 7
+0x4c 0x01 0x00 0x08 # CHECK: j 1328
+0x4c 0x01 0x00 0x0c # CHECK: jal 1328
+0x4c 0x01 0x00 0x74 # CHECK: jalx 1328
+0x09 0xf8 0xe0 0x00 # CHECK: jalr $7
+0x09 0xfc 0x80 0x00 # CHECK: jalr.hb $4
+0x09 0x24 0xa0 0x00 # CHECK: jalr.hb $4, $5
+0x08 0x00 0xe0 0x00 # CHECK: jr $7
+0xc6 0x23 0xa4 0x80 # CHECK: lb $4, 9158($5)
+0x06 0x00 0xa4 0x90 # CHECK: lbu $4, 6($5)
+0xc6 0x23 0xe9 0xd4 # CHECK: ldc1 $f9, 9158($7)
+0x01 0x02 0xf7 0x4d # CHECK: ldxc1 $f8, $23($15)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0xc6 0x23 0xe9 0xc0 # CHECK: ll $9, 9158($7)
+0x67 0x45 0x06 0x3c # CHECK: lui $6, 17767
+0x05 0x00 0xa6 0x4c # CHECK: luxc1 $f0, $6($5)
+0x18 0x00 0xa4 0x8c # CHECK: lw $4, 24($5)
+0xc6 0x23 0xe9 0xc4 # CHECK: lwc1 $f9, 9158($7)
+0x03 0x00 0x82 0x88 # CHECK: lwl $2, 3($4)
+0x10 0x00 0xa3 0x98 # CHECK: lwr $3, 16($5)
+0x00 0x05 0xcc 0x4d # CHECK: lwxc1 $f20, $12($14)
+0x00 0x00 0xc7 0x70 # CHECK: madd $6, $7
+0xa1 0xd4 0x94 0x4e # CHECK: madd.d $f18, $f20, $f26, $f20
+0x60 0x98 0xf9 0x4f # CHECK: madd.s $f1, $f31, $f19, $f25
+0x01 0x00 0xc7 0x70 # CHECK: maddu $6, $7
+0x00 0x38 0x06 0x44 # CHECK: mfc1 $6, $f7
+0x10 0x28 0x00 0x00 # CHECK: mfhi $5
+0x00 0xc0 0x7e 0x44 # CHECK: mfhc1 $fp, $f24
+0x12 0x28 0x00 0x00 # CHECK: mflo $5
+0x86 0x41 0x20 0x46 # CHECK: mov.d $f6, $f8
+0x86 0x39 0x00 0x46 # CHECK: mov.s $f6, $f7
+0x04 0x00 0xc7 0x70 # CHECK: msub $6, $7
+0xa9 0xf2 0x52 0x4c # CHECK: msub.d $f10, $f2, $f30, $f18
+0x28 0x53 0x70 0x4e # CHECK: msub.s $f12, $f19, $f10, $f16
+0x05 0x00 0xc7 0x70 # CHECK: msubu $6, $7
+0x00 0x38 0x86 0x44 # CHECK: mtc1 $6, $f7
+0x11 0x00 0xe0 0x00 # CHECK: mthi $7
+0x00 0x80 0xe0 0x44 # CHECK: mthc1 $zero, $f16
+0x13 0x00 0xe0 0x00 # CHECK: mtlo $7
+0x02 0x62 0x2e 0x46 # CHECK: mul.d $f8, $f12, $f14
+0x42 0x32 0x07 0x46 # CHECK: mul.s $f9, $f6, $f7
+0x02 0x48 0xc7 0x70 # CHECK: mul $9, $6, $7
+0x18 0x00 0x65 0x00 # CHECK: mult $3, $5
+0x19 0x00 0x65 0x00 # CHECK: multu $3, $5
+0x07 0x73 0x20 0x46 # CHECK: neg.d $f12, $f14
+0x87 0x39 0x00 0x46 # CHECK: neg.s $f6, $f7
+0xb1 0x74 0x54 0x4d # CHECK: nmadd.d $f18, $f10, $f14, $f20
+0x30 0xc8 0xac 0x4c # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x48 0xc7 0x00 # CHECK: nor $9, $6, $7
+0xb9 0x87 0x1e 0x4d # CHECK: nmsub.d $f30, $f8, $f16, $f30
+0x78 0x98 0x04 0x4f # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x25 0x18 0x65 0x00 # CHECK: or $3, $3, $5
+0x67 0x45 0xc9 0x34 # CHECK: ori $9, $6, 17767
+0xc2 0x49 0x26 0x00 # CHECK: rotr $9, $6, 7
+0x46 0x48 0xe6 0x00 # CHECK: rotrv $9, $6, $7
+0x0c 0x73 0x20 0x46 # CHECK: round.w.d $f12, $f14
+0x8c 0x39 0x00 0x46 # CHECK: round.w.s $f6, $f7
+0xc6 0x23 0xa4 0xa0 # CHECK: sb $4, 9158($5)
+0x06 0x00 0xa4 0xa0 # CHECK: sb $4, 6($5)
+0xc6 0x23 0xe9 0xe0 # CHECK: sc $9, 9158($7)
+0xc6 0x23 0xe9 0xf4 # CHECK: sdc1 $f9, 9158($7)
+0x09 0x40 0x24 0x4f # CHECK: sdxc1 $f8, $4($25)
+0x20 0x34 0x07 0x7c # CHECK: seb $6, $7
+0x20 0x36 0x07 0x7c # CHECK: seh $6, $7
+0xc6 0x23 0xa4 0xa4 # CHECK: sh $4, 9158($5)
+0xc0 0x21 0x03 0x00 # CHECK: sll $4, $3, 7
+0x04 0x10 0xa3 0x00 # CHECK: sllv $2, $3, $5
+0x2a 0x18 0x65 0x00 # CHECK: slt $3, $3, $5
+0x67 0x00 0x63 0x28 # CHECK: slti $3, $3, 103
+0x67 0x00 0x63 0x2c # CHECK: sltiu $3, $3, 103
+0x2b 0x18 0x65 0x00 # CHECK: sltu $3, $3, $5
+0x04 0x73 0x20 0x46 # CHECK: sqrt.d $f12, $f14
+0x84 0x39 0x00 0x46 # CHECK: sqrt.s $f6, $f7
+0xc3 0x21 0x03 0x00 # CHECK: sra $4, $3, 7
+0x07 0x10 0xa3 0x00 # CHECK: srav $2, $3, $5
+0xc2 0x21 0x03 0x00 # CHECK: srl $4, $3, 7
+0x06 0x10 0xa3 0x00 # CHECK: srlv $2, $3, $5
+0x01 0x62 0x2e 0x46 # CHECK: sub.d $f8, $f12, $f14
+0x41 0x32 0x07 0x46 # CHECK: sub.s $f9, $f6, $f7
+0x22 0x48 0xc7 0x00 # CHECK: sub $9, $6, $7
+0x23 0x20 0x65 0x00 # CHECK: subu $4, $3, $5
+0x0d 0x20 0xb8 0x4c # CHECK: suxc1 $f4, $24($5)
+0x18 0x00 0xa4 0xac # CHECK: sw $4, 24($5)
+0xc6 0x23 0xe9 0xe4 # CHECK: swc1 $f9, 9158($7)
+0x10 0x00 0xa4 0xa8 # CHECK: swl $4, 16($5)
+0x10 0x00 0xe6 0xb8 # CHECK: swr $6, 16($7)
+0x08 0xd0 0xd2 0x4e # CHECK: swxc1 $f26, $18($22)
+0xcf 0x01 0x00 0x00 # CHECK: sync 7
+0x0d 0x73 0x20 0x46 # CHECK: trunc.w.d $f12, $f14
+0x8d 0x39 0x00 0x46 # CHECK: trunc.w.s $f6, $f7
+0xa0 0x30 0x07 0x7c # CHECK: wsbh $6, $7
+0x26 0x18 0x65 0x00 # CHECK: xor $3, $3, $5
+0x67 0x45 0xc9 0x38 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips32r5/valid-mips32r5.txt b/test/MC/Disassembler/Mips/mips32r5/valid-mips32r5.txt
new file mode 100644
index 0000000..39c4644
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r5/valid-mips32r5.txt
@@ -0,0 +1,169 @@
+# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mcpu=mips32r5 | FileCheck %s
+0x46 0x20 0x73 0x05 # CHECK: abs.d $f12, $f14
+0x46 0x00 0x39 0x85 # CHECK: abs.s $f6, $f7
+0x00 0xc7 0x48 0x20 # CHECK: add $9, $6, $7
+0x46 0x2e 0x62 0x00 # CHECK: add.d $f8, $f12, $f14
+0x46 0x07 0x32 0x40 # CHECK: add.s $f9, $f6, $f7
+0x20 0xc9 0x45 0x67 # CHECK: addi $9, $6, 17767
+0x24 0xc9 0xc5 0x67 # CHECK: addiu $9, $6, -15001
+0x00 0xc7 0x48 0x21 # CHECK: addu $9, $6, $7
+0x00 0xc7 0x48 0x24 # CHECK: and $9, $6, $7
+0x30 0xc9 0x45 0x67 # CHECK: andi $9, $6, 17767
+0x10 0x00 0x01 0x4c # CHECK: b 1332
+0x45 0x00 0x01 0x4c # CHECK: bc1f 1332
+0x45 0x1c 0x01 0x4c # CHECK: bc1f $fcc7, 1332
+0x45 0x01 0x01 0x4c # CHECK: bc1t 1332
+0x45 0x1d 0x01 0x4c # CHECK: bc1t $fcc7, 1332
+0x11 0x26 0x01 0x4c # CHECK: beq $9, $6, 1332
+0x04 0xc1 0x01 0x4c # CHECK: bgez $6, 1332
+0x04 0xd1 0x01 0x4c # CHECK: bgezal $6, 1332
+0x1c 0xc0 0x01 0x4c # CHECK: bgtz $6, 1332
+0x18 0xc0 0x01 0x4c # CHECK: blez $6, 1332
+0x15 0x26 0x01 0x4c # CHECK: bne $9, $6, 1332
+0x46 0x2e 0x60 0x32 # CHECK: c.eq.d $f12, $f14
+0x46 0x07 0x30 0x32 # CHECK: c.eq.s $f6, $f7
+0x46 0x2e 0x60 0x30 # CHECK: c.f.d $f12, $f14
+0x46 0x07 0x30 0x30 # CHECK: c.f.s $f6, $f7
+0x46 0x2e 0x60 0x3e # CHECK: c.le.d $f12, $f14
+0x46 0x07 0x30 0x3e # CHECK: c.le.s $f6, $f7
+0x46 0x2e 0x60 0x3c # CHECK: c.lt.d $f12, $f14
+0x46 0x07 0x30 0x3c # CHECK: c.lt.s $f6, $f7
+0x46 0x2e 0x60 0x3d # CHECK: c.nge.d $f12, $f14
+0x46 0x07 0x30 0x3d # CHECK: c.nge.s $f6, $f7
+0x46 0x2e 0x60 0x3b # CHECK: c.ngl.d $f12, $f14
+0x46 0x07 0x30 0x3b # CHECK: c.ngl.s $f6, $f7
+0x46 0x2e 0x60 0x39 # CHECK: c.ngle.d $f12, $f14
+0x46 0x07 0x30 0x39 # CHECK: c.ngle.s $f6, $f7
+0x46 0x2e 0x60 0x3f # CHECK: c.ngt.d $f12, $f14
+0x46 0x07 0x30 0x3f # CHECK: c.ngt.s $f6, $f7
+0x46 0x2e 0x60 0x36 # CHECK: c.ole.d $f12, $f14
+0x46 0x07 0x30 0x36 # CHECK: c.ole.s $f6, $f7
+0x46 0x2e 0x60 0x34 # CHECK: c.olt.d $f12, $f14
+0x46 0x07 0x30 0x34 # CHECK: c.olt.s $f6, $f7
+0x46 0x2e 0x60 0x3a # CHECK: c.seq.d $f12, $f14
+0x46 0x07 0x30 0x3a # CHECK: c.seq.s $f6, $f7
+0x46 0x2e 0x60 0x38 # CHECK: c.sf.d $f12, $f14
+0x46 0x07 0x30 0x38 # CHECK: c.sf.s $f6, $f7
+0x46 0x2e 0x60 0x33 # CHECK: c.ueq.d $f12, $f14
+0x46 0x12 0xe0 0x33 # CHECK: c.ueq.s $f28, $f18
+0x46 0x2e 0x60 0x37 # CHECK: c.ule.d $f12, $f14
+0x46 0x07 0x30 0x37 # CHECK: c.ule.s $f6, $f7
+0x46 0x2e 0x60 0x35 # CHECK: c.ult.d $f12, $f14
+0x46 0x07 0x30 0x35 # CHECK: c.ult.s $f6, $f7
+0x46 0x2e 0x60 0x31 # CHECK: c.un.d $f12, $f14
+0x46 0x07 0x30 0x31 # CHECK: c.un.s $f6, $f7
+0x46 0x20 0x73 0x0e # CHECK: ceil.w.d $f12, $f14
+0x46 0x00 0x39 0x8e # CHECK: ceil.w.s $f6, $f7
+0x44 0x46 0x38 0x00 # CHECK: cfc1 $6, $7
+0x70 0xe6 0x30 0x21 # CHECK: clo $6, $7
+0x70 0xe6 0x30 0x20 # CHECK: clz $6, $7
+0x44 0xc6 0x38 0x00 # CHECK: ctc1 $6, $7
+0x46 0x00 0x39 0xa1 # CHECK: cvt.d.s $f6, $f7
+0x46 0x80 0x73 0x21 # CHECK: cvt.d.w $f12, $f14
+0x46 0x20 0x73 0x25 # CHECK: cvt.l.d $f12, $f14
+0x46 0x00 0x39 0xa5 # CHECK: cvt.l.s $f6, $f7
+0x46 0x20 0x73 0x20 # CHECK: cvt.s.d $f12, $f14
+0x46 0x80 0x39 0xa0 # CHECK: cvt.s.w $f6, $f7
+0x46 0x20 0x73 0x24 # CHECK: cvt.w.d $f12, $f14
+0x46 0x00 0x39 0xa4 # CHECK: cvt.w.s $f6, $f7
+0x41 0x7e 0x60 0x00 # CHECK: di $fp
+0x41 0x60 0x60 0x00 # CHECK: di
+0x41 0x6e 0x60 0x20 # CHECK: ei $14
+0x41 0x60 0x60 0x20 # CHECK: ei
+0x46 0x20 0x73 0x0f # CHECK: floor.w.d $f12, $f14
+0x46 0x00 0x39 0x8f # CHECK: floor.w.s $f6, $f7
+0x7d 0x33 0x61 0x84 # CHECK: ins $19, $9, 6, 7
+0x08 0x00 0x01 0x4c # CHECK: j 1328
+0x0c 0x00 0x01 0x4c # CHECK: jal 1328
+0x74 0x00 0x01 0x4c # CHECK: jalx 1328
+0x00 0xe0 0xf8 0x09 # CHECK: jalr $7
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x00 0xe0 0x00 0x08 # CHECK: jr $7
+0x80 0xa4 0x23 0xc6 # CHECK: lb $4, 9158($5)
+0x90 0xa4 0x00 0x06 # CHECK: lbu $4, 6($5)
+0xd4 0xe9 0x23 0xc6 # CHECK: ldc1 $f9, 9158($7)
+0x4d 0xf7 0x02 0x01 # CHECK: ldxc1 $f8, $23($15)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0xc0 0xe9 0x23 0xc6 # CHECK: ll $9, 9158($7)
+0x3c 0x06 0x45 0x67 # CHECK: lui $6, 17767
+0x4c 0xa6 0x00 0x05 # CHECK: luxc1 $f0, $6($5)
+0x8c 0xa4 0x00 0x18 # CHECK: lw $4, 24($5)
+0xc4 0xe9 0x23 0xc6 # CHECK: lwc1 $f9, 9158($7)
+0x88 0x82 0x00 0x03 # CHECK: lwl $2, 3($4)
+0x98 0xa3 0x00 0x10 # CHECK: lwr $3, 16($5)
+0x4d 0xcc 0x05 0x00 # CHECK: lwxc1 $f20, $12($14)
+0x70 0xc7 0x00 0x00 # CHECK: madd $6, $7
+0x4e 0x94 0xd4 0xa1 # CHECK: madd.d $f18, $f20, $f26, $f20
+0x4f 0xf9 0x98 0x60 # CHECK: madd.s $f1, $f31, $f19, $f25
+0x70 0xc7 0x00 0x01 # CHECK: maddu $6, $7
+0x44 0x06 0x38 0x00 # CHECK: mfc1 $6, $f7
+0x00 0x00 0x28 0x10 # CHECK: mfhi $5
+0x44 0x7e 0xc0 0x00 # CHECK: mfhc1 $fp, $f24
+0x00 0x00 0x28 0x12 # CHECK: mflo $5
+0x46 0x20 0x41 0x86 # CHECK: mov.d $f6, $f8
+0x46 0x00 0x39 0x86 # CHECK: mov.s $f6, $f7
+0x70 0xc7 0x00 0x04 # CHECK: msub $6, $7
+0x4c 0x52 0xf2 0xa9 # CHECK: msub.d $f10, $f2, $f30, $f18
+0x4e 0x70 0x53 0x28 # CHECK: msub.s $f12, $f19, $f10, $f16
+0x70 0xc7 0x00 0x05 # CHECK: msubu $6, $7
+0x44 0x86 0x38 0x00 # CHECK: mtc1 $6, $f7
+0x00 0xe0 0x00 0x11 # CHECK: mthi $7
+0x44 0xe0 0x80 0x00 # CHECK: mthc1 $zero, $f16
+0x00 0xe0 0x00 0x13 # CHECK: mtlo $7
+0x46 0x2e 0x62 0x02 # CHECK: mul.d $f8, $f12, $f14
+0x46 0x07 0x32 0x42 # CHECK: mul.s $f9, $f6, $f7
+0x70 0xc7 0x48 0x02 # CHECK: mul $9, $6, $7
+0x00 0x65 0x00 0x18 # CHECK: mult $3, $5
+0x00 0x65 0x00 0x19 # CHECK: multu $3, $5
+0x46 0x20 0x73 0x07 # CHECK: neg.d $f12, $f14
+0x46 0x00 0x39 0x87 # CHECK: neg.s $f6, $f7
+0x4d 0x54 0x74 0xb1 # CHECK: nmadd.d $f18, $f10, $f14, $f20
+0x4c 0xac 0xc8 0x30 # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0xc7 0x48 0x27 # CHECK: nor $9, $6, $7
+0x4d 0x1e 0x87 0xb9 # CHECK: nmsub.d $f30, $f8, $f16, $f30
+0x4f 0x04 0x98 0x78 # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x00 0x65 0x18 0x25 # CHECK: or $3, $3, $5
+0x34 0xc9 0x45 0x67 # CHECK: ori $9, $6, 17767
+0x00 0x26 0x49 0xc2 # CHECK: rotr $9, $6, 7
+0x00 0xe6 0x48 0x46 # CHECK: rotrv $9, $6, $7
+0x46 0x20 0x73 0x0c # CHECK: round.w.d $f12, $f14
+0x46 0x00 0x39 0x8c # CHECK: round.w.s $f6, $f7
+0xa0 0xa4 0x23 0xc6 # CHECK: sb $4, 9158($5)
+0xa0 0xa4 0x00 0x06 # CHECK: sb $4, 6($5)
+0xe0 0xe9 0x23 0xc6 # CHECK: sc $9, 9158($7)
+0xf4 0xe9 0x23 0xc6 # CHECK: sdc1 $f9, 9158($7)
+0x4f 0x24 0x40 0x09 # CHECK: sdxc1 $f8, $4($25)
+0x7c 0x07 0x34 0x20 # CHECK: seb $6, $7
+0x7c 0x07 0x36 0x20 # CHECK: seh $6, $7
+0xa4 0xa4 0x23 0xc6 # CHECK: sh $4, 9158($5)
+0x00 0x03 0x21 0xc0 # CHECK: sll $4, $3, 7
+0x00 0xa3 0x10 0x04 # CHECK: sllv $2, $3, $5
+0x00 0x65 0x18 0x2a # CHECK: slt $3, $3, $5
+0x28 0x63 0x00 0x67 # CHECK: slti $3, $3, 103
+0x2c 0x63 0x00 0x67 # CHECK: sltiu $3, $3, 103
+0x00 0x65 0x18 0x2b # CHECK: sltu $3, $3, $5
+0x46 0x20 0x73 0x04 # CHECK: sqrt.d $f12, $f14
+0x46 0x00 0x39 0x84 # CHECK: sqrt.s $f6, $f7
+0x00 0x03 0x21 0xc3 # CHECK: sra $4, $3, 7
+0x00 0xa3 0x10 0x07 # CHECK: srav $2, $3, $5
+0x00 0x03 0x21 0xc2 # CHECK: srl $4, $3, 7
+0x00 0xa3 0x10 0x06 # CHECK: srlv $2, $3, $5
+0x46 0x2e 0x62 0x01 # CHECK: sub.d $f8, $f12, $f14
+0x46 0x07 0x32 0x41 # CHECK: sub.s $f9, $f6, $f7
+0x00 0xc7 0x48 0x22 # CHECK: sub $9, $6, $7
+0x00 0x65 0x20 0x23 # CHECK: subu $4, $3, $5
+0x4c 0xb8 0x20 0x0d # CHECK: suxc1 $f4, $24($5)
+0xac 0xa4 0x00 0x18 # CHECK: sw $4, 24($5)
+0xe4 0xe9 0x23 0xc6 # CHECK: swc1 $f9, 9158($7)
+0xa8 0xa4 0x00 0x10 # CHECK: swl $4, 16($5)
+0xb8 0xe6 0x00 0x10 # CHECK: swr $6, 16($7)
+0x4e 0xd2 0xd0 0x08 # CHECK: swxc1 $f26, $18($22)
+0x00 0x00 0x01 0xcf # CHECK: sync 7
+0x46 0x20 0x73 0x0d # CHECK: trunc.w.d $f12, $f14
+0x46 0x00 0x39 0x8d # CHECK: trunc.w.s $f6, $f7
+0x7c 0x07 0x30 0xa0 # CHECK: wsbh $6, $7
+0x00 0x65 0x18 0x26 # CHECK: xor $3, $3, $5
+0x38 0xc9 0x45 0x67 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips32r5/valid-xfail-mips32r5.txt b/test/MC/Disassembler/Mips/mips32r5/valid-xfail-mips32r5.txt
new file mode 100644
index 0000000..27f5498
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r5/valid-xfail-mips32r5.txt
@@ -0,0 +1,83 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -disassemble -mcpu=mips32r5 | FileCheck %s
+# XFAIL: *
+0x46 0x2f 0x79 0x32 # CHECK: c.eq.d $fcc1, $f15, $f15
+0x46 0x11 0xc5 0x32 # CHECK: c.eq.s $fcc5, $f24, $f17
+0x46 0x35 0x5c 0x30 # CHECK: c.f.d $fcc4, $f11, $f21
+0x46 0x07 0xf4 0x30 # CHECK: c.f.s $fcc4, $f30, $f7
+0x46 0x21 0x94 0x3e # CHECK: c.le.d $fcc4, $f18, $f1
+0x46 0x04 0xc6 0x3e # CHECK: c.le.s $fcc6, $f24, $f4
+0x46 0x23 0x4b 0x3c # CHECK: c.lt.d $fcc3, $f9, $f3
+0x46 0x0e 0x8a 0x3c # CHECK: c.lt.s $fcc2, $f17, $f14
+0x46 0x30 0xad 0x3d # CHECK: c.nge.d $fcc5, $f21, $f16
+0x46 0x08 0x5b 0x3d # CHECK: c.nge.s $fcc3, $f11, $f8
+0x46 0x17 0xfa 0x3b # CHECK: c.ngl.s $fcc2, $f31, $f23
+0x46 0x17 0x92 0x39 # CHECK: c.ngle.s $fcc2, $f18, $f23
+0x46 0x27 0xc4 0x3f # CHECK: c.ngt.d $fcc4, $f24, $f7
+0x46 0x0d 0x45 0x3f # CHECK: c.ngt.s $fcc5, $f8, $f13
+0x46 0x3f 0x82 0x36 # CHECK: c.ole.d $fcc2, $f16, $f31
+0x46 0x14 0x3b 0x36 # CHECK: c.ole.s $fcc3, $f7, $f20
+0x46 0x3c 0x9c 0x34 # CHECK: c.olt.d $fcc4, $f19, $f28
+0x46 0x07 0xa6 0x34 # CHECK: c.olt.s $fcc6, $f20, $f7
+0x46 0x27 0xfc 0x3a # CHECK: c.seq.d $fcc4, $f31, $f7
+0x46 0x19 0x0f 0x3a # CHECK: c.seq.s $fcc7, $f1, $f25
+0x46 0x39 0x6c 0x33 # CHECK: c.ueq.d $fcc4, $f13, $f25
+0x46 0x1e 0x1e 0x33 # CHECK: c.ueq.s $fcc6, $f3, $f30
+0x46 0x32 0xcf 0x37 # CHECK: c.ule.d $fcc7, $f25, $f18
+0x46 0x1e 0xaf 0x37 # CHECK: c.ule.s $fcc7, $f21, $f30
+0x46 0x31 0x36 0x35 # CHECK: c.ult.d $fcc6, $f6, $f17
+0x46 0x0a 0xc7 0x35 # CHECK: c.ult.s $fcc7, $f24, $f10
+0x46 0x38 0xbe 0x31 # CHECK: c.un.d $fcc6, $f23, $f24
+0x46 0x04 0xf1 0x31 # CHECK: c.un.s $fcc1, $f30, $f4
+0x46 0xc0 0x45 0x85 # CHECK: abs.ps $f22, $f8
+0x46 0xcc 0xc6 0x00 # CHECK: add.ps $f24, $f24, $f12
+0x46 0xca 0x04 0x32 # CHECK: c.eq.ps $fcc4, $f0, $f10
+0x46 0xcc 0x66 0x30 # CHECK: c.f.ps $fcc6, $f12, $f12
+0x46 0xd4 0x42 0x3e # CHECK: c.le.ps $fcc2, $f8, $f20
+0x46 0xc4 0x90 0x3c # CHECK: c.lt.ps $f18, $f4
+0x46 0xda 0x10 0x3d # CHECK: c.nge.ps $f2, $f26
+0x46 0xde 0xb0 0x3b # CHECK: c.ngl.ps $f22, $f30
+0x46 0xd4 0x66 0x39 # CHECK: c.ngle.ps $fcc6, $f12, $f20
+0x46 0xc6 0xf6 0x3f # CHECK: c.ngt.ps $fcc6, $f30, $f6
+0x46 0xc8 0xa6 0x36 # CHECK: c.ole.ps $fcc6, $f20, $f8
+0x46 0xd0 0x32 0x34 # CHECK: c.olt.ps $fcc2, $f6, $f16
+0x46 0xce 0xf6 0x3a # CHECK: c.seq.ps $fcc6, $f30, $f14
+0x46 0xc6 0x26 0x38 # CHECK: c.sf.ps $fcc6, $f4, $f6
+0x46 0xdc 0x20 0x33 # CHECK: c.ueq.ps $f4, $f28
+0x46 0xc2 0x86 0x37 # CHECK: c.ule.ps $fcc6, $f16, $f2
+0x46 0xc0 0x76 0x35 # CHECK: c.ult.ps $fcc6, $f14, $f0
+0x46 0xda 0x14 0x31 # CHECK: c.un.ps $fcc4, $f2, $f26
+0x46 0x20 0x20 0x8a # CHECK: ceil.l.d $f2, $f4
+0x46 0x00 0x6c 0x8a # CHECK: ceil.l.s $f18, $f13
+0x46 0xa0 0x81 0x21 # CHECK: cvt.d.l $f4, $f16
+0x46 0x14 0x90 0xa6 # CHECK: cvt.ps.s $f2, $f18, $f20
+0x46 0xa0 0xf3 0xe0 # CHECK: cvt.s.l $f15, $f30
+0x46 0xc0 0x17 0xa8 # CHECK: cvt.s.pl $f30, $f2
+0x46 0xc0 0xd3 0xa0 # CHECK: cvt.s.pu $f14, $f26
+0x46 0x20 0x36 0x8b # CHECK: floor.l.d $f26, $f6
+0x46 0x00 0x23 0x0b # CHECK: floor.l.s $f12, $f4
+0x4c 0x42 0x75 0xa6 # CHECK: madd.ps $f22, $f2, $f14, $f2
+0x46 0xc0 0x85 0x86 # CHECK: mov.ps $f22, $f16
+0x46 0xd8 0xe2 0x91 # CHECK: movf.ps $f10, $f28, $fcc6
+0x46 0xd3 0xf7 0x93 # CHECK: movn.ps $f30, $f30, s3
+0x46 0xc9 0xc5 0x11 # CHECK: movt.ps $f20, $f24, $fcc2
+0x46 0xdf 0x84 0x92 # CHECK: movz.ps $f18, $f16, ra
+0x4d 0xd0 0xe3 0x2e # CHECK: msub.ps $f12, $f14, $f28, $f16
+0x46 0xc0 0x64 0x87 # CHECK: neg.ps $f18, $f12
+0x4c 0x98 0x46 0xb6 # CHECK: nmadd.ps $f26, $f4, $f8, $f24
+0x4d 0x90 0x71 0xbe # CHECK: nmsub.ps $f6, $f12, $f14, $f16
+0x46 0xde 0x46 0x2c # CHECK: pll.ps $f24, $f8, $f30
+0x46 0xdc 0xd0 0x2d # CHECK: plu.ps $f0, $f26, $f28
+0x46 0xda 0xf2 0x2e # CHECK: pul.ps $f8, $f30, $f26
+0x46 0xc2 0x46 0x2f # CHECK: puu.ps $f24, $f8, $f2
+0x41 0x49 0x98 0x00 # CHECK: rdpgpr s3, t1
+0x46 0x20 0x34 0x95 # CHECK: recip.d $f18, $f6
+0x46 0x00 0xf0 0xd5 # CHECK: recip.s $f3, $f30
+0x02 0xa7 0x68 0x46 # CHECK: rorv t5, a3, s5
+0x46 0x20 0x03 0x08 # CHECK: round.l.d $f12, $f0
+0x46 0x00 0x2e 0x08 # CHECK: round.l.s $f24, $f5
+0x46 0x20 0xe0 0x96 # CHECK: rsqrt.d $f2, $f28
+0x46 0x00 0x41 0x16 # CHECK: rsqrt.s $f4, $f8
+0x46 0xda 0x71 0x01 # CHECK: sub.ps $f4, $f14, $f26
+0x46 0x20 0xb5 0x89 # CHECK: trunc.l.d $f22, $f22
+0x46 0x00 0xff 0x09 # CHECK: trunc.l.s $f28, $f31
+0x41 0xcd 0x00 0x00 # CHECK: wrpgpr zero, t5
diff --git a/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt b/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt
new file mode 100644
index 0000000..c10d166
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt
@@ -0,0 +1,148 @@
+# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux -mcpu=mips32r6 | FileCheck %s
+0x04 0x00 0x42 0x30 # CHECK: andi $2, $2, 4
+0x19 0x00 0x80 0xec # CHECK: addiupc $4, 100
+0x0a 0x00 0x29 0x25 # CHECK: addiu $9, $9, 10
+0xa0 0x22 0x43 0x7c # CHECK: align $4, $2, $3, 2
+0x38 0x00 0x7f 0xec # CHECK: aluipc $3, 56
+0xe9 0xff 0x62 0x3c # CHECK: aui $3, $2, -23
+0xff 0xff 0x7e 0xec # CHECK: auipc $3, -1
+0x9b 0x14 0x11 0x04 # CHECK: bal 21104
+0xb8 0x96 0x37 0xe8 # CHECK: balc 14572256
+0xb8 0x96 0x37 0xc8 # CHECK: bc 14572256
+0x01 0x00 0x20 0x45 # CHECK: bc1eqz $f0, 8
+0x01 0x00 0x3f 0x45 # CHECK: bc1eqz $f31, 8
+0x01 0x00 0xa0 0x45 # CHECK: bc1nez $f0, 8
+0x01 0x00 0xbf 0x45 # CHECK: bc1nez $f31, 8
+0x02 0x00 0x20 0x49 # CHECK: bc2eqz $0, 12
+0x02 0x00 0x3f 0x49 # CHECK: bc2eqz $31, 12
+0x02 0x00 0xa0 0x49 # CHECK: bc2nez $0, 12
+0x02 0x00 0xbf 0x49 # CHECK: bc2nez $31, 12
+0x40 0x00 0xa6 0x20 # CHECK: beqc $5, $6, 256
+0x4d 0x01 0x02 0x20 # CHECK: beqzalc $2, 1332
+0x40 0x00 0xa6 0x60 # CHECK: bnec $5, $6, 256
+0x4d 0x01 0x02 0x60 # CHECK: bnezalc $2, 1332
+0x90 0x46 0xa0 0xd8 # CHECK: beqzc $5, 72256
+0x40 0x00 0x43 0x58 # CHECK: bgec $2, $3, 256
+0x40 0x00 0x43 0x18 # CHECK: bgeuc $2, $3, 256
+0x4d 0x01 0x42 0x18 # CHECK: bgezalc $2, 1332
+0x90 0x46 0xa0 0xf8 # CHECK: bnezc $5, 72256
+0x40 0x00 0xa5 0x5c # CHECK: bltzc $5, 256
+0x40 0x00 0xa5 0x58 # CHECK: bgezc $5, 256
+0x4d 0x01 0x02 0x1c # CHECK: bgtzalc $2, 1332
+0x40 0x00 0x05 0x58 # CHECK: blezc $5, 256
+0x4d 0x01 0x42 0x1c # CHECK: bltzalc $2, 1332
+0x40 0x00 0x05 0x5c # CHECK: bgtzc $5, 256
+0x20 0x20 0x02 0x7c # CHECK: bitswap $4, $2
+0x4d 0x01 0x02 0x18 # CHECK: blezalc $2, 1332
+0x40 0x00 0xa6 0x5c # CHECK: bltc $5, $6, 256
+0x40 0x00 0xa6 0x1c # CHECK: bltuc $5, $6, 256
+0x01 0x00 0x00 0x60 # CHECK: bnvc $zero, $zero, 4
+0x01 0x00 0x40 0x60 # CHECK: bnvc $2, $zero, 4
+0x01 0x00 0x82 0x60 # CHECK: bnvc $4, $2, 4
+0x01 0x00 0x00 0x20 # CHECK: bovc $zero, $zero, 4
+0x01 0x00 0x40 0x20 # CHECK: bovc $2, $zero, 4
+0x01 0x00 0x82 0x20 # CHECK: bovc $4, $2, 4
+0x80 0x18 0x84 0x46 # CHECK: cmp.af.s $f2, $f3, $f4
+0x80 0x18 0xa4 0x46 # CHECK: cmp.af.d $f2, $f3, $f4
+0x81 0x18 0x84 0x46 # CHECK: cmp.un.s $f2, $f3, $f4
+0x81 0x18 0xa4 0x46 # CHECK: cmp.un.d $f2, $f3, $f4
+0x82 0x18 0x84 0x46 # CHECK: cmp.eq.s $f2, $f3, $f4
+0x82 0x18 0xa4 0x46 # CHECK: cmp.eq.d $f2, $f3, $f4
+0x83 0x18 0x84 0x46 # CHECK: cmp.ueq.s $f2, $f3, $f4
+0x83 0x18 0xa4 0x46 # CHECK: cmp.ueq.d $f2, $f3, $f4
+0x84 0x18 0x84 0x46 # CHECK: cmp.lt.s $f2, $f3, $f4
+0x84 0x18 0xa4 0x46 # CHECK: cmp.lt.d $f2, $f3, $f4
+0x85 0x18 0x84 0x46 # CHECK: cmp.ult.s $f2, $f3, $f4
+0x85 0x18 0xa4 0x46 # CHECK: cmp.ult.d $f2, $f3, $f4
+0x86 0x18 0x84 0x46 # CHECK: cmp.le.s $f2, $f3, $f4
+0x86 0x18 0xa4 0x46 # CHECK: cmp.le.d $f2, $f3, $f4
+0x87 0x18 0x84 0x46 # CHECK: cmp.ule.s $f2, $f3, $f4
+0x87 0x18 0xa4 0x46 # CHECK: cmp.ule.d $f2, $f3, $f4
+0x88 0x18 0x84 0x46 # CHECK: cmp.saf.s $f2, $f3, $f4
+0x88 0x18 0xa4 0x46 # CHECK: cmp.saf.d $f2, $f3, $f4
+0x89 0x18 0x84 0x46 # CHECK: cmp.sun.s $f2, $f3, $f4
+0x89 0x18 0xa4 0x46 # CHECK: cmp.sun.d $f2, $f3, $f4
+0x8a 0x18 0x84 0x46 # CHECK: cmp.seq.s $f2, $f3, $f4
+0x8a 0x18 0xa4 0x46 # CHECK: cmp.seq.d $f2, $f3, $f4
+0x8b 0x18 0x84 0x46 # CHECK: cmp.sueq.s $f2, $f3, $f4
+0x8b 0x18 0xa4 0x46 # CHECK: cmp.sueq.d $f2, $f3, $f4
+0x8c 0x18 0x84 0x46 # CHECK: cmp.slt.s $f2, $f3, $f4
+0x8c 0x18 0xa4 0x46 # CHECK: cmp.slt.d $f2, $f3, $f4
+0x8d 0x18 0x84 0x46 # CHECK: cmp.sult.s $f2, $f3, $f4
+0x8d 0x18 0xa4 0x46 # CHECK: cmp.sult.d $f2, $f3, $f4
+0x8e 0x18 0x84 0x46 # CHECK: cmp.sle.s $f2, $f3, $f4
+0x8e 0x18 0xa4 0x46 # CHECK: cmp.sle.d $f2, $f3, $f4
+0x8f 0x18 0x84 0x46 # CHECK: cmp.sule.s $f2, $f3, $f4
+0x8f 0x18 0xa4 0x46 # CHECK: cmp.sule.d $f2, $f3, $f4
+0x00 0x60 0x7e 0x41 # CHECK: di $fp
+0x00 0x60 0x60 0x41 # CHECK: di
+0x9a 0x10 0x64 0x00 # CHECK: div $2, $3, $4
+0x9b 0x10 0x64 0x00 # CHECK: divu $2, $3, $4
+0x20 0x60 0x6e 0x41 # CHECK: ei $14
+0x20 0x60 0x60 0x41 # CHECK: ei
+0xc5 0x10 0x64 0x00 # CHECK: lsa $2, $3, $4, 3
+0x43 0x00 0x48 0xec # CHECK: lwpc $2, 268
+0x43 0x00 0x50 0xec # CHECK: lwupc $2, 268
+0xda 0x10 0x64 0x00 # CHECK: mod $2, $3, $4
+0xdb 0x10 0x64 0x00 # CHECK: modu $2, $3, $4
+0x98 0x10 0x64 0x00 # CHECK: mul $2, $3, $4
+0xd8 0x10 0x64 0x00 # CHECK: muh $2, $3, $4
+0x99 0x10 0x64 0x00 # CHECK: mulu $2, $3, $4
+0xd9 0x10 0x64 0x00 # CHECK: muhu $2, $3, $4
+0x98 0x18 0x04 0x46 # CHECK: maddf.s $f2, $f3, $f4
+0x98 0x18 0x24 0x46 # CHECK: maddf.d $f2, $f3, $f4
+0x99 0x18 0x04 0x46 # CHECK: msubf.s $f2, $f3, $f4
+0x99 0x18 0x24 0x46 # CHECK: msubf.d $f2, $f3, $f4
+0x10 0x08 0x22 0x46 # CHECK: sel.d $f0, $f1, $f2
+0x10 0x08 0x02 0x46 # CHECK: sel.s $f0, $f1, $f2
+0x35 0x10 0x64 0x00 # CHECK: seleqz $2, $3, $4
+0x37 0x10 0x64 0x00 # CHECK: selnez $2, $3, $4
+0x1d 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4
+0x1d 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4
+0x1c 0x10 0x04 0x46 # CHECK: min.s $f0, $f2, $f4
+0x1c 0x10 0x24 0x46 # CHECK: min.d $f0, $f2, $f4
+0x1f 0x10 0x04 0x46 # CHECK: maxa.s $f0, $f2, $f4
+0x1f 0x10 0x24 0x46 # CHECK: maxa.d $f0, $f2, $f4
+0x1e 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4
+0x1e 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4
+0x04 0x00 0x42 0x34 # CHECK: ori $2, $2, 4
+0x14 0x10 0x04 0x46 # CHECK: seleqz.s $f0, $f2, $f4
+0x14 0x10 0x24 0x46 # CHECK: seleqz.d $f0, $f2, $f4
+0x17 0x10 0x04 0x46 # CHECK: selnez.s $f0, $f2, $f4
+0x17 0x10 0x24 0x46 # CHECK: selnez.d $f0, $f2, $f4
+0x9a 0x20 0x00 0x46 # CHECK: rint.s $f2, $f4
+0x9a 0x20 0x20 0x46 # CHECK: rint.d $f2, $f4
+0x9b 0x20 0x00 0x46 # CHECK: class.s $f2, $f4
+0x9b 0x20 0x20 0x46 # CHECK: class.d $f2, $f4
+0x09 0x04 0x80 0x00 # CHECK: jr.hb $4
+0x09 0xfc 0x80 0x00 # CHECK: jalr.hb $4
+0x09 0x24 0xa0 0x00 # CHECK: jalr.hb $4, $5
+0xb6 0xb3 0x42 0x7e # CHECK: ll $2, -153($18)
+0x26 0xec 0x6f 0x7e # CHECK: sc $15, -40($19)
+0x51 0x58 0xa0 0x00 # CHECK: clo $11, $5
+0x50 0xe8 0x80 0x03 # CHECK: clz $sp, $gp
+0x40 0x00 0x00 0x00 # CHECK: ssnop
+0x0e 0x00 0x00 0x00 # CHECK: sdbbp
+0x8e 0x08 0x00 0x00 # CHECK: sdbbp 34
+0x0f 0x00 0x00 0x00 # CHECK: sync
+0x4f 0x00 0x00 0x00 # CHECK: sync 1
+0x34 0x00 0x03 0x00 # CHECK: teq $zero, $3
+0x34 0x9b 0xa7 0x00 # CHECK: teq $5, $7, 620
+0x30 0x00 0xea 0x00 # CHECK: tge $7, $10
+0x30 0x55 0xb3 0x00 # CHECK: tge $5, $19, 340
+0x31 0x00 0xdc 0x02 # CHECK: tgeu $22, $gp
+0xf1 0x5e 0x8e 0x02 # CHECK: tgeu $20, $14, 379
+0x32 0x00 0xed 0x01 # CHECK: tlt $15, $13
+0x72 0x21 0x53 0x00 # CHECK: tlt $2, $19, 133
+0x33 0x00 0x70 0x01 # CHECK: tltu $11, $16
+0x33 0xfe 0x1d 0x02 # CHECK: tltu $16, $sp, 1016
+0x36 0x00 0xd1 0x00 # CHECK: tne $6, $17
+0x76 0xdd 0xe8 0x00 # CHECK: tne $7, $8, 885
+0x43 0x0d 0xc8 0x49 # CHECK: ldc2 $8, -701($1)
+0xb7 0x34 0x52 0x49 # CHECK: lwc2 $18, -841($6)
+0x75 0x92 0xf4 0x49 # CHECK: sdc2 $20, 629($18)
+0x30 0x81 0x79 0x49 # CHECK: swc2 $25, 304($16)
+0x00 0x01 0x05 0xf8 # CHECK: jialc $5, 256
+0x00 0x01 0x05 0xd8 # CHECK: jic $5, 256
+0x25 0x04 0xa1 0x7c # CHECK: cache 1, 8($5)
+0x35 0x04 0xa1 0x7c # CHECK: pref 1, 8($5
diff --git a/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt b/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt
new file mode 100644
index 0000000..0b78003
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt
@@ -0,0 +1,148 @@
+# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mcpu=mips32r6 | FileCheck %s
+0x30 0x42 0x00 0x04 # CHECK: andi $2, $2, 4
+0xec 0x80 0x00 0x19 # CHECK: addiupc $4, 100
+0x25 0x29 0x00 0x0a # CHECK: addiu $9, $9, 10
+0x7c 0x43 0x22 0xa0 # CHECK: align $4, $2, $3, 2
+0xec 0x7f 0x00 0x38 # CHECK: aluipc $3, 56
+0x3c 0x62 0xff 0xe9 # CHECK: aui $3, $2, -23
+0xec 0x7e 0xff 0xff # CHECK: auipc $3, -1
+0x04 0x11 0x14 0x9b # CHECK: bal 21104
+0xe8 0x37 0x96 0xb8 # CHECK: balc 14572256
+0xc8 0x37 0x96 0xb8 # CHECK: bc 14572256
+0x45 0x20 0x00 0x01 # CHECK: bc1eqz $f0, 8
+0x45 0x3f 0x00 0x01 # CHECK: bc1eqz $f31, 8
+0x45 0xa0 0x00 0x01 # CHECK: bc1nez $f0, 8
+0x45 0xbf 0x00 0x01 # CHECK: bc1nez $f31, 8
+0x49 0x20 0x00 0x02 # CHECK: bc2eqz $0, 12
+0x49 0x3f 0x00 0x02 # CHECK: bc2eqz $31, 12
+0x49 0xa0 0x00 0x02 # CHECK: bc2nez $0, 12
+0x49 0xbf 0x00 0x02 # CHECK: bc2nez $31, 12
+0x20 0xa6 0x00 0x40 # CHECK: beqc $5, $6, 256
+0x20 0x02 0x01 0x4d # CHECK: beqzalc $2, 1332
+0x60 0xa6 0x00 0x40 # CHECK: bnec $5, $6, 256
+0x60 0x02 0x01 0x4d # CHECK: bnezalc $2, 1332
+0xd8 0xa0 0x46 0x90 # CHECK: beqzc $5, 72256
+0x58 0x43 0x00 0x40 # CHECK: bgec $2, $3, 256
+0x18 0x43 0x00 0x40 # CHECK: bgeuc $2, $3, 256
+0x18 0x42 0x01 0x4d # CHECK: bgezalc $2, 1332
+0xf8 0xa0 0x46 0x90 # CHECK: bnezc $5, 72256
+0x5c 0xa5 0x00 0x40 # CHECK: bltzc $5, 256
+0x58 0xa5 0x00 0x40 # CHECK: bgezc $5, 256
+0x1c 0x02 0x01 0x4d # CHECK: bgtzalc $2, 1332
+0x58 0x05 0x00 0x40 # CHECK: blezc $5, 256
+0x1c 0x42 0x01 0x4d # CHECK: bltzalc $2, 1332
+0x5c 0x05 0x00 0x40 # CHECK: bgtzc $5, 256
+0x7c 0x02 0x20 0x20 # CHECK: bitswap $4, $2
+0x18 0x02 0x01 0x4d # CHECK: blezalc $2, 1332
+0x5c 0xa6 0x00 0x40 # CHECK: bltc $5, $6, 256
+0x1c 0xa6 0x00 0x40 # CHECK: bltuc $5, $6, 256
+0x60 0x00 0x00 0x01 # CHECK: bnvc $zero, $zero, 4
+0x60 0x40 0x00 0x01 # CHECK: bnvc $2, $zero, 4
+0x60 0x82 0x00 0x01 # CHECK: bnvc $4, $2, 4
+0x20 0x00 0x00 0x01 # CHECK: bovc $zero, $zero, 4
+0x20 0x40 0x00 0x01 # CHECK: bovc $2, $zero, 4
+0x20 0x82 0x00 0x01 # CHECK: bovc $4, $2, 4
+0x46 0x84 0x18 0x80 # CHECK: cmp.af.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x80 # CHECK: cmp.af.d $f2, $f3, $f4
+0x46 0x84 0x18 0x81 # CHECK: cmp.un.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x81 # CHECK: cmp.un.d $f2, $f3, $f4
+0x46 0x84 0x18 0x82 # CHECK: cmp.eq.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x82 # CHECK: cmp.eq.d $f2, $f3, $f4
+0x46 0x84 0x18 0x83 # CHECK: cmp.ueq.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x83 # CHECK: cmp.ueq.d $f2, $f3, $f4
+0x46 0x84 0x18 0x84 # CHECK: cmp.lt.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x84 # CHECK: cmp.lt.d $f2, $f3, $f4
+0x46 0x84 0x18 0x85 # CHECK: cmp.ult.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x85 # CHECK: cmp.ult.d $f2, $f3, $f4
+0x46 0x84 0x18 0x86 # CHECK: cmp.le.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x86 # CHECK: cmp.le.d $f2, $f3, $f4
+0x46 0x84 0x18 0x87 # CHECK: cmp.ule.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x87 # CHECK: cmp.ule.d $f2, $f3, $f4
+0x46 0x84 0x18 0x88 # CHECK: cmp.saf.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x88 # CHECK: cmp.saf.d $f2, $f3, $f4
+0x46 0x84 0x18 0x89 # CHECK: cmp.sun.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x89 # CHECK: cmp.sun.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8a # CHECK: cmp.seq.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8a # CHECK: cmp.seq.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8b # CHECK: cmp.sueq.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8b # CHECK: cmp.sueq.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8c # CHECK: cmp.slt.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8c # CHECK: cmp.slt.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8d # CHECK: cmp.sult.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8d # CHECK: cmp.sult.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8e # CHECK: cmp.sle.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8e # CHECK: cmp.sle.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8f # CHECK: cmp.sule.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8f # CHECK: cmp.sule.d $f2, $f3, $f4
+0x41 0x7e 0x60 0x00 # CHECK: di $fp
+0x41 0x60 0x60 0x00 # CHECK: di
+0x00 0x64 0x10 0x9a # CHECK: div $2, $3, $4
+0x00 0x64 0x10 0x9b # CHECK: divu $2, $3, $4
+0x41 0x6e 0x60 0x20 # CHECK: ei $14
+0x41 0x60 0x60 0x20 # CHECK: ei
+0x00 0x64 0x10 0xc5 # CHECK: lsa $2, $3, $4, 3
+0xec 0x48 0x00 0x43 # CHECK: lwpc $2, 268
+0xec 0x50 0x00 0x43 # CHECK: lwupc $2, 268
+0x00 0x64 0x10 0xda # CHECK: mod $2, $3, $4
+0x00 0x64 0x10 0xdb # CHECK: modu $2, $3, $4
+0x00 0x64 0x10 0x98 # CHECK: mul $2, $3, $4
+0x00 0x64 0x10 0xd8 # CHECK: muh $2, $3, $4
+0x00 0x64 0x10 0x99 # CHECK: mulu $2, $3, $4
+0x00 0x64 0x10 0xd9 # CHECK: muhu $2, $3, $4
+0x46 0x04 0x18 0x98 # CHECK: maddf.s $f2, $f3, $f4
+0x46 0x24 0x18 0x98 # CHECK: maddf.d $f2, $f3, $f4
+0x46 0x04 0x18 0x99 # CHECK: msubf.s $f2, $f3, $f4
+0x46 0x24 0x18 0x99 # CHECK: msubf.d $f2, $f3, $f4
+0x46 0x22 0x08 0x10 # CHECK: sel.d $f0, $f1, $f2
+0x46 0x02 0x08 0x10 # CHECK: sel.s $f0, $f1, $f2
+0x00 0x64 0x10 0x35 # CHECK: seleqz $2, $3, $4
+0x00 0x64 0x10 0x37 # CHECK: selnez $2, $3, $4
+0x46 0x04 0x10 0x1d # CHECK: max.s $f0, $f2, $f4
+0x46 0x24 0x10 0x1d # CHECK: max.d $f0, $f2, $f4
+0x46 0x04 0x10 0x1c # CHECK: min.s $f0, $f2, $f4
+0x46 0x24 0x10 0x1c # CHECK: min.d $f0, $f2, $f4
+0x46 0x04 0x10 0x1f # CHECK: maxa.s $f0, $f2, $f4
+0x46 0x24 0x10 0x1f # CHECK: maxa.d $f0, $f2, $f4
+0x46 0x04 0x10 0x1e # CHECK: mina.s $f0, $f2, $f4
+0x46 0x24 0x10 0x1e # CHECK: mina.d $f0, $f2, $f4
+0x34 0x42 0x00 0x04 # CHECK: ori $2, $2, 4
+0x46 0x04 0x10 0x14 # CHECK: seleqz.s $f0, $f2, $f4
+0x46 0x24 0x10 0x14 # CHECK: seleqz.d $f0, $f2, $f4
+0x46 0x04 0x10 0x17 # CHECK: selnez.s $f0, $f2, $f4
+0x46 0x24 0x10 0x17 # CHECK: selnez.d $f0, $f2, $f4
+0x46 0x00 0x20 0x9a # CHECK: rint.s $f2, $f4
+0x46 0x20 0x20 0x9a # CHECK: rint.d $f2, $f4
+0x46 0x00 0x20 0x9b # CHECK: class.s $f2, $f4
+0x46 0x20 0x20 0x9b # CHECK: class.d $f2, $f4
+0x00 0x80 0x04 0x09 # CHECK: jr.hb $4
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x7e 0x42 0xb3 0xb6 # CHECK: ll $2, -153($18)
+0x7e 0x6f 0xec 0x26 # CHECK: sc $15, -40($19)
+0x00 0xa0 0x58 0x51 # CHECK: clo $11, $5
+0x03 0x80 0xe8 0x50 # CHECK: clz $sp, $gp
+0x00 0x00 0x00 0x40 # CHECK: ssnop
+0x00 0x00 0x00 0x0e # CHECK: sdbbp
+0x00 0x00 0x08 0x8e # CHECK: sdbbp 34
+0x00 0x00 0x00 0x0f # CHECK: sync
+0x00 0x00 0x00 0x4f # CHECK: sync 1
+0x00 0x03 0x00 0x34 # CHECK: teq $zero, $3
+0x00 0xa7 0x9b 0x34 # CHECK: teq $5, $7, 620
+0x00 0xea 0x00 0x30 # CHECK: tge $7, $10
+0x00 0xb3 0x55 0x30 # CHECK: tge $5, $19, 340
+0x02 0xdc 0x00 0x31 # CHECK: tgeu $22, $gp
+0x02 0x8e 0x5e 0xf1 # CHECK: tgeu $20, $14, 379
+0x01 0xed 0x00 0x32 # CHECK: tlt $15, $13
+0x00 0x53 0x21 0x72 # CHECK: tlt $2, $19, 133
+0x01 0x70 0x00 0x33 # CHECK: tltu $11, $16
+0x02 0x1d 0xfe 0x33 # CHECK: tltu $16, $sp, 1016
+0x00 0xd1 0x00 0x36 # CHECK: tne $6, $17
+0x00 0xe8 0xdd 0x76 # CHECK: tne $7, $8, 885
+0x49 0xc8 0x0d 0x43 # CHECK: ldc2 $8, -701($1)
+0x49 0x52 0x34 0xb7 # CHECK: lwc2 $18, -841($6)
+0x49 0xf4 0x92 0x75 # CHECK: sdc2 $20, 629($18)
+0x49 0x79 0x81 0x30 # CHECK: swc2 $25, 304($16)
+0xf8 0x05 0x01 0x00 # CHECK: jialc $5, 256
+0xd8 0x05 0x01 0x00 # CHECK: jic $5, 256
+0x7c 0xa1 0x04 0x25 # CHECK: cache 1, 8($5)
+0x7c 0xa1 0x04 0x35 # CHECK: pref 1, 8($5)
diff --git a/test/MC/Disassembler/Mips/mips32r6/valid-xfail-mips32r6.txt b/test/MC/Disassembler/Mips/mips32r6/valid-xfail-mips32r6.txt
new file mode 100644
index 0000000..e9afd03
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips32r6/valid-xfail-mips32r6.txt
@@ -0,0 +1,15 @@
+# Instructions that should be valid but currently fail for known reasons (e.g.
+# they aren't implemented yet).
+#
+# RUN: llvm-mc %s -disassemble -triple=mips-unknown-linux -mcpu=mips32r6 | FileCheck %s
+# XFAIL: *
+0x20 0x40 0x00 0x01 # CHECK: bovc $0, $2, 4
+0x20 0x82 0x00 0x01 # CHECK: bovc $2, $4, 4
+0x60 0x40 0x00 0x01 # CHECK: bnvc $0, $2, 4
+0x60 0x82 0x00 0x01 # CHECK: bnvc $2, $4, 4
+0x20 0xc0 0x00 0x40 # CHECK: beqc $6, $zero, 256
+0x20 0xa0 0x00 0x40 # CHECK: beqc $5, $zero, 256
+0x20 0xa6 0x00 0x40 # CHECK: beqc $5, $6, 256
+0x60 0xc0 0x00 0x40 # CHECK: bnec $6, $zero, 256
+0x60 0xa0 0x00 0x40 # CHECK: bnec $5, $zero, 256
+0x60 0xa6 0x00 0x40 # CHECK: bnec $5, $6, 256
diff --git a/test/MC/Disassembler/Mips/mips4/valid-mips4-el.txt b/test/MC/Disassembler/Mips/mips4/valid-mips4-el.txt
new file mode 100644
index 0000000..0c9e2f1
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips4/valid-mips4-el.txt
@@ -0,0 +1,229 @@
+# RUN: llvm-mc %s -triple=mips64el-unknown-linux -disassemble -mcpu=mips4 | FileCheck %s
+# CHECK: .text
+0x85 0xc1 0x20 0x46 # CHECK: abs.d $f6, $f24
+0x45 0x82 0x00 0x46 # CHECK: abs.s $f9, $f16
+0x20 0xb8 0x45 0x02 # CHECK: add $23, $18, $5
+0x48 0x3b 0xc9 0x21 # CHECK: addi $9, $14, 15176
+0xe7 0xe3 0x18 0x23 # CHECK: addi $24, $24, -7193
+0x00 0x30 0x3c 0x46 # CHECK: add.d $f0, $f6, $f28
+0x00 0xaa 0x18 0x46 # CHECK: add.s $f8, $f21, $f24
+0xd2 0x66 0x2d 0x21 # CHECK: addi $13, $9, 26322
+0xfe 0xff 0x08 0x21 # CHECK: addi $8, $8, -2
+0x21 0x48 0x86 0x00 # CHECK: addu $9, $4, $6
+0x0a 0x00 0x29 0x25 # CHECK: addiu $9, $9, 10
+0x24 0xb8 0x4c 0x00 # CHECK: and $23, $2, $12
+0x04 0x00 0x42 0x30 # CHECK: andi $2, $2, 4
+0x01 0x00 0x00 0x45 # CHECK: bc1f 8
+0x00 0x00 0x04 0x45 # CHECK: bc1f $fcc1, 4
+0x06 0x00 0x1e 0x45 # CHECK: bc1fl $fcc7, 28
+0x0c 0x00 0x02 0x45 # CHECK: bc1fl 52
+0x01 0x00 0x01 0x45 # CHECK: bc1t 8
+0x00 0x00 0x05 0x45 # CHECK: bc1t $fcc1, 4
+0xf4 0xf7 0x03 0x45 # CHECK: bc1tl -8236
+0x06 0x00 0x1f 0x45 # CHECK: bc1tl $fcc7, 28
+0x9b 0x14 0x11 0x04 # CHECK: bal 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x9b 0x14 0xd0 0x04 # CHECK: bltzal $6, 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x40 0x0c 0xd3 0x51 # CHECK: beql $14, $19, 12548
+0x1f 0x07 0x93 0x05 # CHECK: bgezall $12, 7296
+0x4d 0xf9 0x83 0x04 # CHECK: bgezl $4, -6856
+0x59 0xfc 0x40 0x5d # CHECK: bgtzl $10, -3736
+0xe7 0x02 0xc0 0x58 # CHECK: blezl $6, 2976
+0x7a 0x00 0xd2 0x04 # CHECK: bltzall $6, 492
+0x45 0xf6 0x22 0x06 # CHECK: bltzl $17, -9960
+0xfc 0x04 0x94 0x57 # CHECK: bnel $gp, $20, 5108
+0x08 0x00 0xa1 0xbc # CHECK: cache 1, 8($5)
+0x3b 0xe0 0x3c 0x46 # CHECK: c.ngl.d $f28, $f28
+0x39 0x00 0x30 0x46 # CHECK: c.ngle.d $f0, $f16
+0x38 0xf0 0x20 0x46 # CHECK: c.sf.d $f30, $f0
+0x38 0x70 0x16 0x46 # CHECK: c.sf.s $f14, $f22
+0x4a 0x18 0x20 0x46 # CHECK: ceil.l.d $f1, $f3
+0x8a 0x6c 0x00 0x46 # CHECK: ceil.l.s $f18, $f13
+0xce 0xc2 0x20 0x46 # CHECK: ceil.w.d $f11, $f24
+0x8e 0xa1 0x00 0x46 # CHECK: ceil.w.s $f6, $f20
+0x00 0xa8 0x51 0x44 # CHECK: cfc1 $17, $21
+0x00 0xd0 0xc6 0x44 # CHECK: ctc1 $6, $26
+0xa1 0xe5 0x00 0x46 # CHECK: cvt.d.s $f22, $f28
+0xa1 0x5e 0x80 0x46 # CHECK: cvt.d.w $f26, $f11
+0x21 0x81 0xa0 0x46 # CHECK: cvt.d.l $f4, $f16
+0x25 0x7e 0x20 0x46 # CHECK: cvt.l.d $f24, $f15
+0xe5 0xea 0x00 0x46 # CHECK: cvt.l.s $f11, $f29
+0xe0 0xf3 0xa0 0x46 # CHECK: cvt.s.l $f15, $f30
+0xa0 0x46 0x20 0x46 # CHECK: cvt.s.d $f26, $f8
+0xa0 0x7d 0x80 0x46 # CHECK: cvt.s.w $f22, $f15
+0x24 0x75 0x20 0x46 # CHECK: cvt.w.d $f20, $f14
+0x24 0xc5 0x00 0x46 # CHECK: cvt.w.s $f20, $f24
+0x2c 0x98 0x3f 0x00 # CHECK: dadd $19, $1, $ra
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0x16 0xee 0xda 0x66 # CHECK: daddiu $26, $22, -4586
+0x2d 0x98 0x3f 0x00 # CHECK: daddu $19, $1, $ra
+0x9f 0x46 0x58 0x64 # CHECK: daddiu $24, $2, 18079
+0x3f 0x69 0x73 0x66 # CHECK: daddiu $19, $19, 26943
+0x1e 0x00 0x53 0x03 # CHECK: ddiv $zero, $26, $19
+0x1f 0x00 0x11 0x02 # CHECK: ddivu $zero, $16, $17
+0x1a 0x00 0x2b 0x03 # CHECK: div $zero, $25, $11
+0x03 0xa7 0x3a 0x46 # CHECK: div.d $f28, $f20, $f26
+0x03 0x29 0x0f 0x46 # CHECK: div.s $f4, $f5, $f15
+0x1b 0x00 0x2f 0x03 # CHECK: divu $zero, $25, $15
+0x00 0x68 0x2c 0x44 # CHECK: dmfc1 $12, $f13
+0x00 0x70 0xb0 0x44 # CHECK: dmtc1 $16, $f14
+0x1c 0x00 0xe9 0x02 # CHECK: dmult $23, $9
+0x1d 0x00 0xa6 0x00 # CHECK: dmultu $5, $6
+0xb8 0x04 0x00 0x00 # CHECK: dsll $zero, $zero, 18
+0xb8 0x04 0x14 0x00 # CHECK: dsll $zero, $20, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbb 0xe2 0x1c 0x00 # CHECK: dsra $gp, $gp, 10
+0xbb 0xe2 0x12 0x00 # CHECK: dsra $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xbf 0xe2 0x1c 0x00 # CHECK: dsra32 $gp, $gp, 10
+0xbf 0xe2 0x12 0x00 # CHECK: dsra32 $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xfa 0x9d 0x13 0x00 # CHECK: dsrl $19, $19, 23
+0xfa 0x9d 0x06 0x00 # CHECK: dsrl $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0xfe 0x9d 0x13 0x00 # CHECK: dsrl32 $19, $19, 23
+0xfe 0x9d 0x06 0x00 # CHECK: dsrl32 $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0x2e 0x38 0xc8 0x02 # CHECK: dsub $7, $22, $8
+0x2f 0x28 0xba 0x00 # CHECK: dsubu $5, $5, $26
+0xc0 0x00 0x00 0x00 # CHECK: ehb
+0x18 0x00 0x00 0x42 # CHECK: eret
+0x8f 0x53 0x20 0x46 # CHECK: floor.w.d $f14, $f10
+0x0f 0x4a 0x00 0x46 # CHECK: floor.w.s $f8, $f9
+0x8b 0x3e 0x20 0x46 # CHECK: floor.l.d $f26, $f7
+0x0b 0x2b 0x00 0x46 # CHECK: floor.l.s $f12, $f5
+0x4d 0xc7 0x58 0x81 # CHECK: lb $24, -14515($10)
+0xf3 0x75 0x68 0x90 # CHECK: lbu $8, 30195($3)
+0x07 0x40 0x0a 0xd6 # CHECK: ldc1 $f10, 16391($16)
+0x43 0xad 0x28 0xd8 # CHECK: ldc2 $8, -21181($1)
+0x94 0xde 0xab 0x86 # CHECK: lh $11, -8556($21)
+0xbd 0xa6 0x53 0x94 # CHECK: lhu $19, -22851($2)
+0xb3 0x8b 0x01 0x24 # CHECK: addiu $1, $zero, -29773
+0x3f 0x8b 0x00 0x24 # CHECK: addiu $zero, $zero, -29889
+0x67 0xe3 0x42 0xc2 # CHECK: ll $2, -7321($18)
+0x2a 0x16 0xa8 0x8c # CHECK: lw $8, 5674($5)
+0xf1 0x27 0x50 0xc7 # CHECK: lwc1 $f16, 10225($26)
+0xb7 0xfc 0xd2 0xc8 # CHECK: lwc2 $18, -841($6)
+0x79 0xef 0xf4 0x89 # CHECK: lwl $20, -4231($15)
+0x35 0xb5 0x80 0x9b # CHECK: lwr $zero, -19147($gp)
+0x00 0x03 0xd1 0x4f # CHECK: lwxc1 $f12, $17($fp)
+0x00 0xd8 0x07 0x44 # CHECK: mfc1 $7, $f27
+0x10 0x98 0x00 0x00 # CHECK: mfhi $19
+0x10 0xe8 0x00 0x00 # CHECK: mfhi $sp
+0x12 0x88 0x00 0x00 # CHECK: mflo $17
+0x06 0x75 0x20 0x46 # CHECK: mov.d $f20, $f14
+0x86 0xd8 0x00 0x46 # CHECK: mov.s $f2, $f27
+0x01 0xe0 0x1c 0x01 # CHECK: movf $gp, $8, $fcc7
+0x91 0x59 0x34 0x46 # CHECK: movf.d $f6, $f11, $fcc5
+0xd1 0x2d 0x18 0x46 # CHECK: movf.s $f23, $f5, $fcc6
+0x21 0xf0 0x80 0x00 # CHECK: move $fp, $4
+0x21 0xc8 0xc0 0x00 # CHECK: move $25, $6
+0x0b 0x18 0x30 0x02 # CHECK: movn $3, $17, $16
+0xd3 0xae 0x3a 0x46 # CHECK: movn.d $f27, $f21, $26
+0x13 0x03 0x17 0x46 # CHECK: movn.s $f12, $f0, $23
+0x01 0x00 0x95 0x02 # CHECK: movt $zero, $20, $fcc5
+0x11 0x10 0x21 0x46 # CHECK: movt.d $f0, $f2, $fcc0
+0x91 0x17 0x05 0x46 # CHECK: movt.s $f30, $f2, $fcc1
+0x0a 0x28 0xc9 0x02 # CHECK: movz $5, $22, $9
+0x12 0xeb 0x29 0x46 # CHECK: movz.d $f12, $f29, $9
+0x52 0x3e 0x03 0x46 # CHECK: movz.s $f25, $f7, $3
+0x00 0x48 0x9e 0x44 # CHECK: mtc1 $fp, $f9
+0x11 0x00 0x20 0x02 # CHECK: mthi $17
+0x13 0x00 0xa0 0x03 # CHECK: mtlo $sp
+0x13 0x00 0x20 0x03 # CHECK: mtlo $25
+0x02 0xa5 0x30 0x46 # CHECK: mul.d $f20, $f20, $f16
+0x82 0x57 0x02 0x46 # CHECK: mul.s $f30, $f10, $f2
+0x18 0x00 0xb4 0x03 # CHECK: mult $sp, $20
+0x18 0x00 0xa2 0x03 # CHECK: mult $sp, $2
+0x19 0x00 0x9a 0x03 # CHECK: multu $gp, $26
+0x19 0x00 0x32 0x01 # CHECK: multu $9, $18
+0x23 0x10 0x02 0x00 # CHECK: negu $2, $2
+0x23 0x10 0x03 0x00 # CHECK: negu $2, $3
+0x87 0x96 0x20 0x46 # CHECK: neg.d $f26, $f18
+0x47 0x78 0x00 0x46 # CHECK: neg.s $f1, $f15
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x38 0x07 0x00 # CHECK: nor $7, $zero, $7
+0x25 0x60 0x1d 0x02 # CHECK: or $12, $16, $sp
+0x04 0x00 0x42 0x34 # CHECK: ori $2, $2, 4
+0x08 0x00 0xa1 0xcc # CHECK: pref 1, 8($5) 
+0x08 0x0b 0x20 0x46 # CHECK: round.l.d $f12, $f1
+0x48 0x2e 0x00 0x46 # CHECK: round.l.s $f25, $f5
+0x8c 0x21 0x20 0x46 # CHECK: round.w.d $f6, $f4
+0xcc 0xe6 0x00 0x46 # CHECK: round.w.s $f27, $f28
+0x6f 0xb2 0xd6 0xa1 # CHECK: sb $22, -19857($14)
+0xd8 0x49 0x6f 0xe2 # CHECK: sc $15, 18904($19)
+0xcd 0xdf 0xaf 0xf3 # CHECK: scd $15, -8243($sp)
+0xcb 0x16 0x4c 0xfd # CHECK: sd $12, 5835($10)
+0x1f 0xae 0xc7 0xb3 # CHECK: sdl $7, -20961($fp)
+0x39 0xb0 0x8b 0xb5 # CHECK: sdr $11, -20423($12)
+0x6e 0x77 0xbe 0xf5 # CHECK: sdc1 $f30, 30574($13)
+0x75 0x5a 0x54 0xfa # CHECK: sdc2 $20, 23157($18)
+0x09 0x58 0xca 0x4d # CHECK: sdxc1 $f11, $10($14)
+0xd0 0xe5 0xee 0xa5 # CHECK: sh $14, -6704($15)
+0x80 0x3c 0x07 0x00 # CHECK: sll $7, $7, 18
+0x80 0x3c 0x00 0x00 # CHECK: sll $7, $zero, 18
+0x04 0x38 0x20 0x01 # CHECK: sllv $7, $zero, $9
+0x04 0x38 0x20 0x01 # CHECK: sllv $7, $zero, $9
+0x2a 0xb8 0x7b 0x01 # CHECK: slt $23, $11, $27
+0x11 0x25 0x51 0x29 # CHECK: slti $17, $10, 9489
+0x55 0xc3 0x39 0x2f # CHECK: sltiu $25, $25, -15531
+0x2b 0xa0 0xab 0x02 # CHECK: sltu $20, $21, $11
+0x55 0xc3 0x38 0x2f # CHECK: sltiu $24, $25, -15531
+0x04 0xb4 0x20 0x46 # CHECK: sqrt.d $f16, $f22
+0x04 0x08 0x00 0x46 # CHECK: sqrt.s $f0, $f1
+0xc3 0x8b 0x11 0x00 # CHECK: sra $17, $17, 15
+0xc3 0x8b 0x17 0x00 # CHECK: sra $17, $23, 15
+0x07 0x88 0xb7 0x03 # CHECK: srav $17, $23, $sp
+0x07 0x88 0xb7 0x03 # CHECK: srav $17, $23, $sp
+0xc2 0x11 0x02 0x00 # CHECK: srl $2, $2, 7
+0xc2 0x11 0x02 0x00 # CHECK: srl $2, $2, 7
+0x06 0xc8 0x94 0x00 # CHECK: srlv $25, $20, $4
+0x06 0xc8 0x94 0x00 # CHECK: srlv $25, $20, $4
+0x40 0x00 0x00 0x00 # CHECK: ssnop
+0x22 0xb0 0x6c 0x02 # CHECK: sub $22, $19, $12
+0x36 0x0c 0x36 0x22 # CHECK: addi $22, $17, 3126
+0x90 0xe6 0xad 0x21 # CHECK: addi $13, $13, -6512
+0x81 0x14 0x30 0x46 # CHECK: sub.d $f18, $f2, $f16
+0xc1 0xb5 0x16 0x46 # CHECK: sub.s $f23, $f22, $f22
+0x23 0xe8 0xd6 0x02 # CHECK: subu $sp, $22, $22
+0x50 0xd8 0xbf 0xaf # CHECK: sw $ra, -10160($sp)
+0xef 0xde 0x06 0xe7 # CHECK: swc1 $f6, -8465($24)
+0x30 0x61 0x19 0xea # CHECK: swc2 $25, 24880($16)
+0x7e 0x35 0x6f 0xaa # CHECK: swl $15, 13694($19)
+0x22 0x98 0xd1 0xb9 # CHECK: swr $17, -26590($14)
+0x08 0x98 0x4c 0x4f # CHECK: swxc1 $f19, $12($26)
+0x34 0x00 0x03 0x00 # CHECK: teq $zero, $3
+0x34 0x9b 0xa7 0x00 # CHECK: teq $5, $7, 620
+0xa0 0xbb 0xac 0x06 # CHECK: teqi $21, 48032
+0x30 0x00 0xea 0x00 # CHECK: tge $7, $10
+0x30 0x55 0xb3 0x00 # CHECK: tge $5, $19, 340
+0xa1 0x13 0x28 0x06 # CHECK: tgei $17, 5025
+0x33 0x90 0xa9 0x07 # CHECK: tgeiu $sp, 36915
+0x31 0x00 0xdc 0x02 # CHECK: tgeu $22, $gp
+0xf1 0x5e 0x8e 0x02 # CHECK: tgeu $20, $14, 379
+0x08 0x00 0x00 0x42 # CHECK: tlbp
+0x01 0x00 0x00 0x42 # CHECK: tlbr
+0x02 0x00 0x00 0x42 # CHECK: tlbwi
+0x06 0x00 0x00 0x42 # CHECK: tlbwr
+0x32 0x00 0xed 0x01 # CHECK: tlt $15, $13
+0x72 0x21 0x53 0x00 # CHECK: tlt $2, $19, 133
+0xbd 0xad 0xca 0x05 # CHECK: tlti $14, 44477
+0x2c 0xec 0xeb 0x07 # CHECK: tltiu $ra, 60460
+0x33 0x00 0x70 0x01 # CHECK: tltu $11, $16
+0x33 0xfe 0x1d 0x02 # CHECK: tltu $16, $sp, 1016
+0x36 0x00 0xd1 0x00 # CHECK: tne $6, $17
+0x76 0xdd 0xe8 0x00 # CHECK: tne $7, $8, 885
+0x31 0x8c 0x8e 0x05 # CHECK: tnei $12, 35889
+0xc9 0xbd 0x20 0x46 # CHECK: trunc.l.d $f23, $f23
+0x09 0xff 0x00 0x46 # CHECK: trunc.l.s $f28, $f31
+0x8d 0x75 0x20 0x46 # CHECK: trunc.w.d $f22, $f14
+0x0d 0xf7 0x00 0x46 # CHECK: trunc.w.s $f28, $f30
+0x26 0x90 0x9e 0x00 # CHECK: xor $18, $4, $fp
diff --git a/test/MC/Disassembler/Mips/mips4/valid-mips4.txt b/test/MC/Disassembler/Mips/mips4/valid-mips4.txt
new file mode 100644
index 0000000..c8c35e9
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips4/valid-mips4.txt
@@ -0,0 +1,229 @@
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -disassemble -mcpu=mips4 | FileCheck %s
+# CHECK: .text
+0x46 0x20 0xc1 0x85 # CHECK: abs.d $f6, $f24
+0x46 0x00 0x82 0x45 # CHECK: abs.s $f9, $f16
+0x02 0x45 0xb8 0x20 # CHECK: add $23, $18, $5
+0x21 0xc9 0x3b 0x48 # CHECK: addi $9, $14, 15176
+0x23 0x18 0xe3 0xe7 # CHECK: addi $24, $24, -7193
+0x46 0x3c 0x30 0x00 # CHECK: add.d $f0, $f6, $f28
+0x46 0x18 0xaa 0x00 # CHECK: add.s $f8, $f21, $f24
+0x21 0x2d 0x66 0xd2 # CHECK: addi $13, $9, 26322
+0x21 0x08 0xff 0xfe # CHECK: addi $8, $8, -2
+0x00 0x86 0x48 0x21 # CHECK: addu $9, $4, $6
+0x25 0x29 0x00 0x0a # CHECK: addiu $9, $9, 10
+0x00 0x4c 0xb8 0x24 # CHECK: and $23, $2, $12
+0x30 0x42 0x00 0x04 # CHECK: andi $2, $2, 4
+0x45 0x00 0x00 0x01 # CHECK: bc1f 8
+0x45 0x04 0x00 0x00 # CHECK: bc1f $fcc1, 4
+0x45 0x1e 0x00 0x06 # CHECK: bc1fl $fcc7, 28
+0x45 0x02 0x00 0x0c # CHECK: bc1fl 52
+0x45 0x01 0x00 0x01 # CHECK: bc1t 8
+0x45 0x05 0x00 0x00 # CHECK: bc1t $fcc1, 4
+0x45 0x03 0xf7 0xf4 # CHECK: bc1tl -8236
+0x45 0x1f 0x00 0x06 # CHECK: bc1tl $fcc7, 28
+0x04 0x11 0x14 0x9b # CHECK: bal 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x04 0xd0 0x14 0x9b # CHECK: bltzal $6, 21104
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x51 0xd3 0x0c 0x40 # CHECK: beql $14, $19, 12548
+0x05 0x93 0x07 0x1f # CHECK: bgezall $12, 7296
+0x04 0x83 0xf9 0x4d # CHECK: bgezl $4, -6856
+0x5d 0x40 0xfc 0x59 # CHECK: bgtzl $10, -3736
+0x58 0xc0 0x02 0xe7 # CHECK: blezl $6, 2976
+0x04 0xd2 0x00 0x7a # CHECK: bltzall $6, 492
+0x06 0x22 0xf6 0x45 # CHECK: bltzl $17, -9960
+0x57 0x94 0x04 0xfc # CHECK: bnel $gp, $20, 5108
+0xbc 0xa1 0x00 0x08 # CHECK: cache 1, 8($5)
+0x46 0x3c 0xe0 0x3b # CHECK: c.ngl.d $f28, $f28
+0x46 0x30 0x00 0x39 # CHECK: c.ngle.d $f0, $f16
+0x46 0x20 0xf0 0x38 # CHECK: c.sf.d $f30, $f0
+0x46 0x16 0x70 0x38 # CHECK: c.sf.s $f14, $f22
+0x46 0x20 0x18 0x4a # CHECK: ceil.l.d $f1, $f3
+0x46 0x00 0x6c 0x8a # CHECK: ceil.l.s $f18, $f13
+0x46 0x20 0xc2 0xce # CHECK: ceil.w.d $f11, $f24
+0x46 0x00 0xa1 0x8e # CHECK: ceil.w.s $f6, $f20
+0x44 0x51 0xa8 0x00 # CHECK: cfc1 $17, $21
+0x44 0xc6 0xd0 0x00 # CHECK: ctc1 $6, $26
+0x46 0x00 0xe5 0xa1 # CHECK: cvt.d.s $f22, $f28
+0x46 0x80 0x5e 0xa1 # CHECK: cvt.d.w $f26, $f11
+0x46 0xa0 0x81 0x21 # CHECK: cvt.d.l $f4, $f16
+0x46 0x20 0x7e 0x25 # CHECK: cvt.l.d $f24, $f15
+0x46 0x00 0xea 0xe5 # CHECK: cvt.l.s $f11, $f29
+0x46 0xa0 0xf3 0xe0 # CHECK: cvt.s.l $f15, $f30
+0x46 0x20 0x46 0xa0 # CHECK: cvt.s.d $f26, $f8
+0x46 0x80 0x7d 0xa0 # CHECK: cvt.s.w $f22, $f15
+0x46 0x20 0x75 0x24 # CHECK: cvt.w.d $f20, $f14
+0x46 0x00 0xc5 0x24 # CHECK: cvt.w.s $f20, $f24
+0x00 0x3f 0x98 0x2c # CHECK: dadd $19, $1, $ra
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x66 0xda 0xee 0x16 # CHECK: daddiu $26, $22, -4586
+0x00 0x3f 0x98 0x2d # CHECK: daddu $19, $1, $ra
+0x64 0x58 0x46 0x9f # CHECK: daddiu $24, $2, 18079
+0x66 0x73 0x69 0x3f # CHECK: daddiu $19, $19, 26943
+0x03 0x53 0x00 0x1e # CHECK: ddiv $zero, $26, $19
+0x02 0x11 0x00 0x1f # CHECK: ddivu $zero, $16, $17
+0x03 0x2b 0x00 0x1a # CHECK: div $zero, $25, $11
+0x46 0x3a 0xa7 0x03 # CHECK: div.d $f28, $f20, $f26
+0x46 0x0f 0x29 0x03 # CHECK: div.s $f4, $f5, $f15
+0x03 0x2f 0x00 0x1b # CHECK: divu $zero, $25, $15
+0x44 0x2c 0x68 0x00 # CHECK: dmfc1 $12, $f13
+0x44 0xb0 0x70 0x00 # CHECK: dmtc1 $16, $f14
+0x02 0xe9 0x00 0x1c # CHECK: dmult $23, $9
+0x00 0xa6 0x00 0x1d # CHECK: dmultu $5, $6
+0x00 0x00 0x04 0xb8 # CHECK: dsll $zero, $zero, 18
+0x00 0x14 0x04 0xb8 # CHECK: dsll $zero, $20, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x1c 0xe2 0xbb # CHECK: dsra $gp, $gp, 10
+0x00 0x12 0xe2 0xbb # CHECK: dsra $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x1c 0xe2 0xbf # CHECK: dsra32 $gp, $gp, 10
+0x00 0x12 0xe2 0xbf # CHECK: dsra32 $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x13 0x9d 0xfa # CHECK: dsrl $19, $19, 23
+0x00 0x06 0x9d 0xfa # CHECK: dsrl $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x00 0x13 0x9d 0xfe # CHECK: dsrl32 $19, $19, 23
+0x00 0x06 0x9d 0xfe # CHECK: dsrl32 $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x02 0xc8 0x38 0x2e # CHECK: dsub $7, $22, $8
+0x00 0xba 0x28 0x2f # CHECK: dsubu $5, $5, $26
+0x00 0x00 0x00 0xc0 # CHECK: ehb
+0x42 0x00 0x00 0x18 # CHECK: eret
+0x46 0x20 0x53 0x8f # CHECK: floor.w.d $f14, $f10
+0x46 0x00 0x4a 0x0f # CHECK: floor.w.s $f8, $f9
+0x46 0x20 0x3e 0x8b # CHECK: floor.l.d $f26, $f7
+0x46 0x00 0x2b 0x0b # CHECK: floor.l.s $f12, $f5
+0x81 0x58 0xc7 0x4d # CHECK: lb $24, -14515($10)
+0x90 0x68 0x75 0xf3 # CHECK: lbu $8, 30195($3)
+0xd6 0x0a 0x40 0x07 # CHECK: ldc1 $f10, 16391($16)
+0xd8 0x28 0xad 0x43 # CHECK: ldc2 $8, -21181($1)
+0x86 0xab 0xde 0x94 # CHECK: lh $11, -8556($21)
+0x94 0x53 0xa6 0xbd # CHECK: lhu $19, -22851($2)
+0x24 0x01 0x8b 0xb3 # CHECK: addiu $1, $zero, -29773
+0x24 0x00 0x8b 0x3f # CHECK: addiu $zero, $zero, -29889
+0xc2 0x42 0xe3 0x67 # CHECK: ll $2, -7321($18)
+0x8c 0xa8 0x16 0x2a # CHECK: lw $8, 5674($5)
+0xc7 0x50 0x27 0xf1 # CHECK: lwc1 $f16, 10225($26)
+0xc8 0xd2 0xfc 0xb7 # CHECK: lwc2 $18, -841($6)
+0x89 0xf4 0xef 0x79 # CHECK: lwl $20, -4231($15)
+0x9b 0x80 0xb5 0x35 # CHECK: lwr $zero, -19147($gp)
+0x4f 0xd1 0x03 0x00 # CHECK: lwxc1 $f12, $17($fp)
+0x44 0x07 0xd8 0x00 # CHECK: mfc1 $7, $f27
+0x00 0x00 0x98 0x10 # CHECK: mfhi $19
+0x00 0x00 0xe8 0x10 # CHECK: mfhi $sp
+0x00 0x00 0x88 0x12 # CHECK: mflo $17
+0x46 0x20 0x75 0x06 # CHECK: mov.d $f20, $f14
+0x46 0x00 0xd8 0x86 # CHECK: mov.s $f2, $f27
+0x01 0x1c 0xe0 0x01 # CHECK: movf $gp, $8, $fcc7
+0x46 0x34 0x59 0x91 # CHECK: movf.d $f6, $f11, $fcc5
+0x46 0x18 0x2d 0xd1 # CHECK: movf.s $f23, $f5, $fcc6
+0x00 0x80 0xf0 0x21 # CHECK: move $fp, $4
+0x00 0xc0 0xc8 0x21 # CHECK: move $25, $6
+0x02 0x30 0x18 0x0b # CHECK: movn $3, $17, $16
+0x46 0x3a 0xae 0xd3 # CHECK: movn.d $f27, $f21, $26
+0x46 0x17 0x03 0x13 # CHECK: movn.s $f12, $f0, $23
+0x02 0x95 0x00 0x01 # CHECK: movt $zero, $20, $fcc5
+0x46 0x21 0x10 0x11 # CHECK: movt.d $f0, $f2, $fcc0
+0x46 0x05 0x17 0x91 # CHECK: movt.s $f30, $f2, $fcc1
+0x02 0xc9 0x28 0x0a # CHECK: movz $5, $22, $9
+0x46 0x29 0xeb 0x12 # CHECK: movz.d $f12, $f29, $9
+0x46 0x03 0x3e 0x52 # CHECK: movz.s $f25, $f7, $3
+0x44 0x9e 0x48 0x00 # CHECK: mtc1 $fp, $f9
+0x02 0x20 0x00 0x11 # CHECK: mthi $17
+0x03 0xa0 0x00 0x13 # CHECK: mtlo $sp
+0x03 0x20 0x00 0x13 # CHECK: mtlo $25
+0x46 0x30 0xa5 0x02 # CHECK: mul.d $f20, $f20, $f16
+0x46 0x02 0x57 0x82 # CHECK: mul.s $f30, $f10, $f2
+0x03 0xb4 0x00 0x18 # CHECK: mult $sp, $20
+0x03 0xa2 0x00 0x18 # CHECK: mult $sp, $2
+0x03 0x9a 0x00 0x19 # CHECK: multu $gp, $26
+0x01 0x32 0x00 0x19 # CHECK: multu $9, $18
+0x00 0x02 0x10 0x23 # CHECK: negu $2, $2
+0x00 0x03 0x10 0x23 # CHECK: negu $2, $3
+0x46 0x20 0x96 0x87 # CHECK: neg.d $f26, $f18
+0x46 0x00 0x78 0x47 # CHECK: neg.s $f1, $f15
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0x07 0x38 0x27 # CHECK: nor $7, $zero, $7
+0x02 0x1d 0x60 0x25 # CHECK: or $12, $16, $sp
+0x34 0x42 0x00 0x04 # CHECK: ori $2, $2, 4
+0xcc 0xa1 0x00 0x08 # CHECK: pref 1, 8($5) 
+0x46 0x20 0x0b 0x08 # CHECK: round.l.d $f12, $f1
+0x46 0x00 0x2e 0x48 # CHECK: round.l.s $f25, $f5
+0x46 0x20 0x21 0x8c # CHECK: round.w.d $f6, $f4
+0x46 0x00 0xe6 0xcc # CHECK: round.w.s $f27, $f28
+0xa1 0xd6 0xb2 0x6f # CHECK: sb $22, -19857($14)
+0xe2 0x6f 0x49 0xd8 # CHECK: sc $15, 18904($19)
+0xf3 0xaf 0xdf 0xcd # CHECK: scd $15, -8243($sp)
+0xfd 0x4c 0x16 0xcb # CHECK: sd $12, 5835($10)
+0xb3 0xc7 0xae 0x1f # CHECK: sdl $7, -20961($fp)
+0xb5 0x8b 0xb0 0x39 # CHECK: sdr $11, -20423($12)
+0xf5 0xbe 0x77 0x6e # CHECK: sdc1 $f30, 30574($13)
+0xfa 0x54 0x5a 0x75 # CHECK: sdc2 $20, 23157($18)
+0x4d 0xca 0x58 0x09 # CHECK: sdxc1 $f11, $10($14)
+0xa5 0xee 0xe5 0xd0 # CHECK: sh $14, -6704($15)
+0x00 0x07 0x3c 0x80 # CHECK: sll $7, $7, 18
+0x00 0x00 0x3c 0x80 # CHECK: sll $7, $zero, 18
+0x01 0x20 0x38 0x04 # CHECK: sllv $7, $zero, $9
+0x01 0x20 0x38 0x04 # CHECK: sllv $7, $zero, $9
+0x01 0x7b 0xb8 0x2a # CHECK: slt $23, $11, $27
+0x29 0x51 0x25 0x11 # CHECK: slti $17, $10, 9489
+0x2f 0x39 0xc3 0x55 # CHECK: sltiu $25, $25, -15531
+0x02 0xab 0xa0 0x2b # CHECK: sltu $20, $21, $11
+0x2f 0x38 0xc3 0x55 # CHECK: sltiu $24, $25, -15531
+0x46 0x20 0xb4 0x04 # CHECK: sqrt.d $f16, $f22
+0x46 0x00 0x08 0x04 # CHECK: sqrt.s $f0, $f1
+0x00 0x11 0x8b 0xc3 # CHECK: sra $17, $17, 15
+0x00 0x17 0x8b 0xc3 # CHECK: sra $17, $23, 15
+0x03 0xb7 0x88 0x07 # CHECK: srav $17, $23, $sp
+0x03 0xb7 0x88 0x07 # CHECK: srav $17, $23, $sp
+0x00 0x02 0x11 0xc2 # CHECK: srl $2, $2, 7
+0x00 0x02 0x11 0xc2 # CHECK: srl $2, $2, 7
+0x00 0x94 0xc8 0x06 # CHECK: srlv $25, $20, $4
+0x00 0x94 0xc8 0x06 # CHECK: srlv $25, $20, $4
+0x00 0x00 0x00 0x40 # CHECK: ssnop
+0x02 0x6c 0xb0 0x22 # CHECK: sub $22, $19, $12
+0x22 0x36 0x0c 0x36 # CHECK: addi $22, $17, 3126
+0x21 0xad 0xe6 0x90 # CHECK: addi $13, $13, -6512
+0x46 0x30 0x14 0x81 # CHECK: sub.d $f18, $f2, $f16
+0x46 0x16 0xb5 0xc1 # CHECK: sub.s $f23, $f22, $f22
+0x02 0xd6 0xe8 0x23 # CHECK: subu $sp, $22, $22
+0xaf 0xbf 0xd8 0x50 # CHECK: sw $ra, -10160($sp)
+0xe7 0x06 0xde 0xef # CHECK: swc1 $f6, -8465($24)
+0xea 0x19 0x61 0x30 # CHECK: swc2 $25, 24880($16)
+0xaa 0x6f 0x35 0x7e # CHECK: swl $15, 13694($19)
+0xb9 0xd1 0x98 0x22 # CHECK: swr $17, -26590($14)
+0x4f 0x4c 0x98 0x08 # CHECK: swxc1 $f19, $12($26)
+0x00 0x03 0x00 0x34 # CHECK: teq $zero, $3
+0x00 0xa7 0x9b 0x34 # CHECK: teq $5, $7, 620
+0x06 0xac 0xbb 0xa0 # CHECK: teqi $21, 48032
+0x00 0xea 0x00 0x30 # CHECK: tge $7, $10
+0x00 0xb3 0x55 0x30 # CHECK: tge $5, $19, 340
+0x06 0x28 0x13 0xa1 # CHECK: tgei $17, 5025
+0x07 0xa9 0x90 0x33 # CHECK: tgeiu $sp, 36915
+0x02 0xdc 0x00 0x31 # CHECK: tgeu $22, $gp
+0x02 0x8e 0x5e 0xf1 # CHECK: tgeu $20, $14, 379
+0x42 0x00 0x00 0x08 # CHECK: tlbp
+0x42 0x00 0x00 0x01 # CHECK: tlbr
+0x42 0x00 0x00 0x02 # CHECK: tlbwi
+0x42 0x00 0x00 0x06 # CHECK: tlbwr
+0x01 0xed 0x00 0x32 # CHECK: tlt $15, $13
+0x00 0x53 0x21 0x72 # CHECK: tlt $2, $19, 133
+0x05 0xca 0xad 0xbd # CHECK: tlti $14, 44477
+0x07 0xeb 0xec 0x2c # CHECK: tltiu $ra, 60460
+0x01 0x70 0x00 0x33 # CHECK: tltu $11, $16
+0x02 0x1d 0xfe 0x33 # CHECK: tltu $16, $sp, 1016
+0x00 0xd1 0x00 0x36 # CHECK: tne $6, $17
+0x00 0xe8 0xdd 0x76 # CHECK: tne $7, $8, 885
+0x05 0x8e 0x8c 0x31 # CHECK: tnei $12, 35889
+0x46 0x20 0xbd 0xc9 # CHECK: trunc.l.d $f23, $f23
+0x46 0x00 0xff 0x09 # CHECK: trunc.l.s $f28, $f31
+0x46 0x20 0x75 0x8d # CHECK: trunc.w.d $f22, $f14
+0x46 0x00 0xf7 0x0d # CHECK: trunc.w.s $f28, $f30
+0x00 0x9e 0x90 0x26 # CHECK: xor $18, $4, $fp
diff --git a/test/MC/Disassembler/Mips/mips4/valid-xfail-mips4.txt b/test/MC/Disassembler/Mips/mips4/valid-xfail-mips4.txt
new file mode 100644
index 0000000..910de16
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips4/valid-xfail-mips4.txt
@@ -0,0 +1,42 @@
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -disassemble -mcpu=mips4 | FileCheck %s
+# XFAIL: *
+0x46 0x2f 0x79 0x32 # CHECK: c.eq.d $fcc1, $f15, $f15
+0x46 0x11 0xc5 0x32 # CHECK: c.eq.s $fcc5, $f24, $f17
+0x46 0x35 0x5c 0x30 # CHECK: c.f.d $fcc4, $f11, $f21
+0x46 0x07 0xf4 0x30 # CHECK: c.f.s $fcc4, $f30, $f7
+0x46 0x21 0x94 0x3e # CHECK: c.le.d $fcc4, $f18, $f1
+0x46 0x04 0xc6 0x3e # CHECK: c.le.s $fcc6, $f24, $f4
+0x46 0x23 0x4b 0x3c # CHECK: c.lt.d $fcc3, $f9, $f3
+0x46 0x0e 0x8a 0x3c # CHECK: c.lt.s $fcc2, $f17, $f14
+0x46 0x30 0xad 0x3d # CHECK: c.nge.d $fcc5, $f21, $f16
+0x46 0x08 0x5b 0x3d # CHECK: c.nge.s $fcc3, $f11, $f8
+0x46 0x17 0xfa 0x3b # CHECK: c.ngl.s $fcc2, $f31, $f23
+0x46 0x17 0x92 0x39 # CHECK: c.ngle.s $fcc2, $f18, $f23
+0x46 0x27 0xc4 0x3f # CHECK: c.ngt.d $fcc4, $f24, $f7
+0x46 0x0d 0x45 0x3f # CHECK: c.ngt.s $fcc5, $f8, $f13
+0x46 0x3f 0x82 0x36 # CHECK: c.ole.d $fcc2, $f16, $f31
+0x46 0x14 0x3b 0x36 # CHECK: c.ole.s $fcc3, $f7, $f20
+0x46 0x3c 0x9c 0x34 # CHECK: c.olt.d $fcc4, $f19, $f28
+0x46 0x07 0xa6 0x34 # CHECK: c.olt.s $fcc6, $f20, $f7
+0x46 0x27 0xfc 0x3a # CHECK: c.seq.d $fcc4, $f31, $f7
+0x46 0x19 0x0f 0x3a # CHECK: c.seq.s $fcc7, $f1, $f25
+0x46 0x39 0x6c 0x33 # CHECK: c.ueq.d $fcc4, $f13, $f25
+0x46 0x1e 0x1e 0x33 # CHECK: c.ueq.s $fcc6, $f3, $f30
+0x46 0x32 0xcf 0x37 # CHECK: c.ule.d $fcc7, $f25, $f18
+0x46 0x1e 0xaf 0x37 # CHECK: c.ule.s $fcc7, $f21, $f30
+0x46 0x31 0x36 0x35 # CHECK: c.ult.d $fcc6, $f6, $f17
+0x46 0x0a 0xc7 0x35 # CHECK: c.ult.s $fcc7, $f24, $f10
+0x46 0x38 0xbe 0x31 # CHECK: c.un.d $fcc6, $f23, $f24
+0x46 0x04 0xf1 0x31 # CHECK: c.un.s $fcc1, $f30, $f4
+0x4e 0x74 0xd4 0xa1 # CHECK: madd.d $f18, $f19, $f26, $f20
+0x4f 0xf9 0x98 0x60 # CHECK: madd.s $f1, $f31, $f19, $f25
+0x4c 0x32 0xfa 0xa9 # CHECK: msub.d $f10, $f1, $f31, $f18
+0x4e 0x70 0x53 0x28 # CHECK: msub.s $f12, $f19, $f10, $f16
+0x4d 0x33 0x74 0xb1 # CHECK: nmadd.d $f18, $f9, $f14, $f19
+0x4c 0xac 0xc8 0x30 # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x4d 0x1e 0x87 0xb9 # CHECK: nmsub.d $f30, $f8, $f16, $f30
+0x4f 0x04 0x98 0x78 # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x46 0x20 0x34 0xd5 # CHECK: recip.d $f19, $f6
+0x46 0x00 0xf0 0xd5 # CHECK: recip.s $f3, $f30
+0x46 0x20 0xe0 0xd6 # CHECK: rsqrt.d $f3, $f28
+0x46 0x00 0x41 0x16 # CHECK: rsqrt.s $f4, $f8
diff --git a/test/MC/Disassembler/Mips/mips64/valid-mips64-el.txt b/test/MC/Disassembler/Mips/mips64/valid-mips64-el.txt
new file mode 100644
index 0000000..698ebfb
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64/valid-mips64-el.txt
@@ -0,0 +1,216 @@
+# RUN: llvm-mc --disassemble %s -triple=mips64el-unknown-linux | FileCheck %s
+0x05 0x73 0x20 0x46 # CHECK: abs.d $f12, $f14
+0x85 0x39 0x00 0x46 # CHECK: abs.s $f6, $f7
+0x20 0x48 0xc7 0x00 # CHECK: add $9, $6, $7
+0x00 0x62 0x2e 0x46 # CHECK: add.d $f8, $f12, $f14
+0x40 0x32 0x07 0x46 # CHECK: add.s $f9, $f6, $f7
+0x67 0x45 0xc9 0x20 # CHECK: addi $9, $6, 17767
+0x67 0xc5 0xc9 0x24 # CHECK: addiu $9, $6, -15001
+0x21 0x48 0xc7 0x00 # CHECK: addu $9, $6, $7
+0x24 0x48 0xc7 0x00 # CHECK: and $9, $6, $7
+0x67 0x45 0xc9 0x30 # CHECK: andi $9, $6, 17767
+0x4c 0x01 0x00 0x10 # CHECK: b 1332
+0x4c 0x01 0x00 0x45 # CHECK: bc1f 1332
+0x4c 0x01 0x1c 0x45 # CHECK: bc1f $fcc7, 1332
+0x4c 0x01 0x01 0x45 # CHECK: bc1t 1332
+0x4c 0x01 0x1d 0x45 # CHECK: bc1t $fcc7, 1332
+0x4c 0x01 0x26 0x11 # CHECK: beq $9, $6, 1332
+0x4c 0x01 0xc1 0x04 # CHECK: bgez $6, 1332
+0x4c 0x01 0xd1 0x04 # CHECK: bgezal $6, 1332
+0x4c 0x01 0xc0 0x1c # CHECK: bgtz $6, 1332
+0x4c 0x01 0xc0 0x18 # CHECK: blez $6, 1332
+0x4c 0x01 0x26 0x15 # CHECK: bne $9, $6, 1332
+0x32 0x60 0x2e 0x46 # CHECK: c.eq.d $f12, $f14
+0x32 0x30 0x07 0x46 # CHECK: c.eq.s $f6, $f7
+0x30 0x60 0x2e 0x46 # CHECK: c.f.d $f12, $f14
+0x30 0x30 0x07 0x46 # CHECK: c.f.s $f6, $f7
+0x3e 0x60 0x2e 0x46 # CHECK: c.le.d $f12, $f14
+0x3e 0x30 0x07 0x46 # CHECK: c.le.s $f6, $f7
+0x3c 0x60 0x2e 0x46 # CHECK: c.lt.d $f12, $f14
+0x3c 0x30 0x07 0x46 # CHECK: c.lt.s $f6, $f7
+0x3d 0x60 0x2e 0x46 # CHECK: c.nge.d $f12, $f14
+0x3d 0x30 0x07 0x46 # CHECK: c.nge.s $f6, $f7
+0x3b 0x60 0x2e 0x46 # CHECK: c.ngl.d $f12, $f14
+0x3b 0x30 0x07 0x46 # CHECK: c.ngl.s $f6, $f7
+0x39 0x60 0x2e 0x46 # CHECK: c.ngle.d $f12, $f14
+0x39 0x30 0x07 0x46 # CHECK: c.ngle.s $f6, $f7
+0x3f 0x60 0x2e 0x46 # CHECK: c.ngt.d $f12, $f14
+0x3f 0x30 0x07 0x46 # CHECK: c.ngt.s $f6, $f7
+0x36 0x60 0x2e 0x46 # CHECK: c.ole.d $f12, $f14
+0x36 0x30 0x07 0x46 # CHECK: c.ole.s $f6, $f7
+0x34 0x60 0x2e 0x46 # CHECK: c.olt.d $f12, $f14
+0x34 0x30 0x07 0x46 # CHECK: c.olt.s $f6, $f7
+0x3a 0x60 0x2e 0x46 # CHECK: c.seq.d $f12, $f14
+0x3a 0x30 0x07 0x46 # CHECK: c.seq.s $f6, $f7
+0x38 0x60 0x2e 0x46 # CHECK: c.sf.d $f12, $f14
+0x38 0x30 0x07 0x46 # CHECK: c.sf.s $f6, $f7
+0x33 0x60 0x2e 0x46 # CHECK: c.ueq.d $f12, $f14
+0x33 0xe0 0x12 0x46 # CHECK: c.ueq.s $f28, $f18
+0x37 0x60 0x2e 0x46 # CHECK: c.ule.d $f12, $f14
+0x37 0x30 0x07 0x46 # CHECK: c.ule.s $f6, $f7
+0x35 0x60 0x2e 0x46 # CHECK: c.ult.d $f12, $f14
+0x35 0x30 0x07 0x46 # CHECK: c.ult.s $f6, $f7
+0x31 0x60 0x2e 0x46 # CHECK: c.un.d $f12, $f14
+0x31 0x30 0x07 0x46 # CHECK: c.un.s $f6, $f7
+0x0e 0x73 0x20 0x46 # CHECK: ceil.w.d $f12, $f14
+0x8e 0x39 0x00 0x46 # CHECK: ceil.w.s $f6, $f7
+0x4a 0x18 0x20 0x46 # CHECK: ceil.l.d $f1, $f3
+0x8a 0x6c 0x00 0x46 # CHECK: ceil.l.s $f18, $f13
+0x00 0x38 0x46 0x44 # CHECK: cfc1 $6, $7
+0x21 0x30 0xe6 0x70 # CHECK: clo $6, $7
+0x20 0x30 0xe6 0x70 # CHECK: clz $6, $7
+0x00 0x38 0xc6 0x44 # CHECK: ctc1 $6, $7
+0xa1 0x39 0x00 0x46 # CHECK: cvt.d.s $f6, $f7
+0x21 0x81 0xa0 0x46 # CHECK: cvt.d.l $f4, $f16
+0x21 0x73 0x80 0x46 # CHECK: cvt.d.w $f12, $f14
+0x20 0x73 0x20 0x46 # CHECK: cvt.s.d $f12, $f14
+0xe0 0xf3 0xa0 0x46 # CHECK: cvt.s.l $f15, $f30
+0xa0 0x39 0x80 0x46 # CHECK: cvt.s.w $f6, $f7
+0x24 0x73 0x20 0x46 # CHECK: cvt.w.d $f12, $f14
+0xa4 0x39 0x00 0x46 # CHECK: cvt.w.s $f6, $f7
+0x2c 0x98 0x3f 0x00 # CHECK: dadd $19, $1, $ra
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0x16 0xee 0xda 0x66 # CHECK: daddiu $26, $22, -4586
+0x2d 0x98 0x3f 0x00 # CHECK: daddu $19, $1, $ra
+0x9f 0x46 0x58 0x64 # CHECK: daddiu $24, $2, 18079
+0x3f 0x69 0x73 0x66 # CHECK: daddiu $19, $19, 26943
+0x25 0x90 0xd2 0x70 # CHECK: dclo $18, $6
+0x24 0x80 0x30 0x73 # CHECK: dclz $16, $25
+0x1e 0x00 0x53 0x03 # CHECK: ddiv $zero, $26, $19
+0x1f 0x00 0x11 0x02 # CHECK: ddivu $zero, $16, $17
+0x00 0x68 0x2c 0x44 # CHECK: dmfc1 $12, $f13
+0x00 0x70 0xb0 0x44 # CHECK: dmtc1 $16, $f14
+0x1c 0x00 0xe9 0x02 # CHECK: dmult $23, $9
+0x1d 0x00 0xa6 0x00 # CHECK: dmultu $5, $6
+0xb8 0x04 0x00 0x00 # CHECK: dsll $zero, $zero, 18
+0xb8 0x04 0x14 0x00 # CHECK: dsll $zero, $20, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbb 0xe2 0x1c 0x00 # CHECK: dsra $gp, $gp, 10
+0xbb 0xe2 0x12 0x00 # CHECK: dsra $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xbf 0xe2 0x1c 0x00 # CHECK: dsra32 $gp, $gp, 10
+0xbf 0xe2 0x12 0x00 # CHECK: dsra32 $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xfa 0x9d 0x13 0x00 # CHECK: dsrl $19, $19, 23
+0xfa 0x9d 0x06 0x00 # CHECK: dsrl $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0xfe 0x9d 0x13 0x00 # CHECK: dsrl32 $19, $19, 23
+0xfe 0x9d 0x06 0x00 # CHECK: dsrl32 $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0x2e 0x38 0xc8 0x02 # CHECK: dsub $7, $22, $8
+0x39 0x6c 0x9d 0x62 # CHECK: daddi $sp, $20, 27705
+0x39 0x6c 0xbd 0x63 # CHECK: daddi $sp, $sp, 27705
+0x2f 0x28 0xba 0x00 # CHECK: dsubu $5, $5, $26
+0x5f 0xec 0x6f 0x65 # CHECK: daddiu $15, $11, -5025
+0xea 0x11 0xce 0x65 # CHECK: daddiu $14, $14, 4586
+0x8b 0x3e 0x20 0x46 # CHECK: floor.l.d $f26, $f7
+0x0b 0x2b 0x00 0x46 # CHECK: floor.l.s $f12, $f5
+0x0f 0x73 0x20 0x46 # CHECK: floor.w.d $f12, $f14
+0x8f 0x39 0x00 0x46 # CHECK: floor.w.s $f6, $f7
+0x4c 0x01 0x00 0x08 # CHECK: j 1328
+0x4c 0x01 0x00 0x0c # CHECK: jal 1328
+0x4c 0x01 0x00 0x74 # CHECK: jalx 1328
+0x09 0xf8 0xe0 0x00 # CHECK: jalr $7
+0x09 0xfc 0x80 0x00 # CHECK: jalr.hb $4
+0x09 0x24 0xa0 0x00 # CHECK: jalr.hb $4, $5
+0x08 0x00 0xe0 0x00 # CHECK: jr $7
+0xc6 0x23 0xa4 0x80 # CHECK: lb $4, 9158($5)
+0x06 0x00 0xa4 0x90 # CHECK: lbu $4, 6($5)
+0xc6 0x23 0xe9 0xd4 # CHECK: ldc1 $f9, 9158($7)
+0x1b 0x90 0x3d 0xde # CHECK: ld $sp, -28645($17)
+0xb9 0xef 0x18 0x6b # CHECK: ldl $24, -4167($24)
+0x6a 0x89 0x8e 0x6e # CHECK: ldr $14, -30358($20)
+0x01 0x02 0xf7 0x4d # CHECK: ldxc1 $f8, $23($15)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0xc6 0x23 0xe9 0xc0 # CHECK: ll $9, 9158($7)
+0x70 0xc6 0xe0 0xd3 # CHECK: lld $zero, -14736($ra)
+0x67 0x45 0x06 0x3c # CHECK: lui $6, 17767
+0xc5 0x04 0xb6 0x4e # CHECK: luxc1 $f19, $22($21)
+0xea 0xa1 0x73 0x9c # CHECK: lwu $19, -24086($3)
+0x00 0x03 0xd1 0x4f # CHECK: lwxc1 $f12, $17($fp)
+0x18 0x00 0xa4 0x8c # CHECK: lw $4, 24($5)
+0xc6 0x23 0xe9 0xc4 # CHECK: lwc1 $f9, 9158($7)
+0x03 0x00 0x82 0x88 # CHECK: lwl $2, 3($4)
+0x10 0x00 0xa3 0x98 # CHECK: lwr $3, 16($5)
+0x00 0x00 0xc7 0x70 # CHECK: madd $6, $7
+0x01 0x00 0xc7 0x70 # CHECK: maddu $6, $7
+0x00 0x38 0x06 0x44 # CHECK: mfc1 $6, $f7
+0x10 0x28 0x00 0x00 # CHECK: mfhi $5
+0x12 0x28 0x00 0x00 # CHECK: mflo $5
+0x86 0x41 0x20 0x46 # CHECK: mov.d $f6, $f8
+0x86 0x39 0x00 0x46 # CHECK: mov.s $f6, $f7
+0x04 0x00 0xc7 0x70 # CHECK: msub $6, $7
+0x05 0x00 0xc7 0x70 # CHECK: msubu $6, $7
+0x00 0x38 0x86 0x44 # CHECK: mtc1 $6, $f7
+0x11 0x00 0xe0 0x00 # CHECK: mthi $7
+0x13 0x00 0xe0 0x00 # CHECK: mtlo $7
+0x02 0x62 0x2e 0x46 # CHECK: mul.d $f8, $f12, $f14
+0x42 0x32 0x07 0x46 # CHECK: mul.s $f9, $f6, $f7
+0x02 0x48 0xc7 0x70 # CHECK: mul $9, $6, $7
+0x18 0x00 0x65 0x00 # CHECK: mult $3, $5
+0x19 0x00 0x65 0x00 # CHECK: multu $3, $5
+0x07 0x73 0x20 0x46 # CHECK: neg.d $f12, $f14
+0x87 0x39 0x00 0x46 # CHECK: neg.s $f6, $f7
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x48 0xc7 0x00 # CHECK: nor $9, $6, $7
+0x25 0x18 0x65 0x00 # CHECK: or $3, $3, $5
+0x67 0x45 0xc9 0x34 # CHECK: ori $9, $6, 17767
+0x08 0x0b 0x20 0x46 # CHECK: round.l.d $f12, $f1
+0x48 0x2e 0x00 0x46 # CHECK: round.l.s $f25, $f5
+0x0c 0x73 0x20 0x46 # CHECK: round.w.d $f12, $f14
+0x8c 0x39 0x00 0x46 # CHECK: round.w.s $f6, $f7
+0xc6 0x23 0xa4 0xa0 # CHECK: sb $4, 9158($5)
+0x06 0x00 0xa4 0xa0 # CHECK: sb $4, 6($5)
+0xc6 0x23 0xe9 0xe0 # CHECK: sc $9, 9158($7)
+0xcd 0xdf 0xaf 0xf3 # CHECK: scd $15, -8243($sp)
+0xcb 0x16 0x4c 0xfd # CHECK: sd $12, 5835($10)
+0xc6 0x23 0xe9 0xf4 # CHECK: sdc1 $f9, 9158($7)
+0x1f 0xae 0xc7 0xb3 # CHECK: sdl $7, -20961($fp)
+0x39 0xb0 0x8b 0xb5 # CHECK: sdr $11, -20423($12)
+0x09 0x58 0xca 0x4d # CHECK: sdxc1 $f11, $10($14)
+0xc6 0x23 0xa4 0xa4 # CHECK: sh $4, 9158($5)
+0xc0 0x21 0x03 0x00 # CHECK: sll $4, $3, 7
+0x04 0x10 0xa3 0x00 # CHECK: sllv $2, $3, $5
+0x2a 0x18 0x65 0x00 # CHECK: slt $3, $3, $5
+0x67 0x00 0x63 0x28 # CHECK: slti $3, $3, 103
+0x67 0x00 0x63 0x2c # CHECK: sltiu $3, $3, 103
+0x2b 0x18 0x65 0x00 # CHECK: sltu $3, $3, $5
+0x04 0x73 0x20 0x46 # CHECK: sqrt.d $f12, $f14
+0x84 0x39 0x00 0x46 # CHECK: sqrt.s $f6, $f7
+0xc3 0x21 0x03 0x00 # CHECK: sra $4, $3, 7
+0x07 0x10 0xa3 0x00 # CHECK: srav $2, $3, $5
+0xc2 0x21 0x03 0x00 # CHECK: srl $4, $3, 7
+0x06 0x10 0xa3 0x00 # CHECK: srlv $2, $3, $5
+0x01 0x62 0x2e 0x46 # CHECK: sub.d $f8, $f12, $f14
+0x41 0x32 0x07 0x46 # CHECK: sub.s $f9, $f6, $f7
+0x22 0x48 0xc7 0x00 # CHECK: sub $9, $6, $7
+0x23 0x20 0x65 0x00 # CHECK: subu $4, $3, $5
+0x18 0x00 0xa4 0xac # CHECK: sw $4, 24($5)
+0xc6 0x23 0xe9 0xe4 # CHECK: swc1 $f9, 9158($7)
+0x0d 0x60 0xbb 0x4d # CHECK: suxc1 $f12, $27($13)
+0x08 0x98 0x4c 0x4f # CHECK: swxc1 $f19, $12($26)
+0x10 0x00 0xa4 0xa8 # CHECK: swl $4, 16($5)
+0x10 0x00 0xe6 0xb8 # CHECK: swr $6, 16($7)
+0xcf 0x01 0x00 0x00 # CHECK: sync 7
+0xc9 0xbd 0x20 0x46 # CHECK: trunc.l.d $f23, $f23
+0x09 0xff 0x00 0x46 # CHECK: trunc.l.s $f28, $f31
+0x0d 0x73 0x20 0x46 # CHECK: trunc.w.d $f12, $f14
+0x8d 0x39 0x00 0x46 # CHECK: trunc.w.s $f6, $f7
+0x26 0x18 0x65 0x00 # CHECK: xor $3, $3, $5
+0x67 0x45 0xc9 0x38 # CHECK: xori $9, $6, 17767
+0x3b 0xe8 0x05 0x7c # CHECK: .set push
+                    # CHECK: .set mips32r2
+                    # CHECK: rdhwr $5, $29
+                    # CHECK: .set pop
+0x02 0x00 0x61 0xbc # CHECK: cache 1, 2($3)
+0x04 0x00 0x43 0xcc # CHECK: pref 3, 4($2)
+0xc6 0x23 0xe9 0xe8 # CHECK: swc2 $9, 9158($7)
+0xca 0x23 0xc8 0xc8 # CHECK: lwc2 $8, 9162($6)
diff --git a/test/MC/Disassembler/Mips/mips64/valid-mips64-xfail.txt b/test/MC/Disassembler/Mips/mips64/valid-mips64-xfail.txt
new file mode 100644
index 0000000..dd97bcd
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64/valid-mips64-xfail.txt
@@ -0,0 +1,80 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -disassemble | FileCheck %s
+# XFAIL: *
+0x46 0x2f 0x79 0x32 # CHECK: c.eq.d $fcc1, $f15, $f15
+0x46 0x11 0xc5 0x32 # CHECK: c.eq.s $fcc5, $f24, $f17
+0x46 0x35 0x5c 0x30 # CHECK: c.f.d $fcc4, $f11, $f21
+0x46 0x07 0xf4 0x30 # CHECK: c.f.s $fcc4, $f30, $f7
+0x46 0x21 0x94 0x3e # CHECK: c.le.d $fcc4, $f18, $f1
+0x46 0x04 0xc6 0x3e # CHECK: c.le.s $fcc6, $f24, $f4
+0x46 0x23 0x4b 0x3c # CHECK: c.lt.d $fcc3, $f9, $f3
+0x46 0x0e 0x8a 0x3c # CHECK: c.lt.s $fcc2, $f17, $f14
+0x46 0x30 0xad 0x3d # CHECK: c.nge.d $fcc5, $f21, $f16
+0x46 0x08 0x5b 0x3d # CHECK: c.nge.s $fcc3, $f11, $f8
+0x46 0x17 0xfa 0x3b # CHECK: c.ngl.s $fcc2, $f31, $f23
+0x46 0x17 0x92 0x39 # CHECK: c.ngle.s $fcc2, $f18, $f23
+0x46 0x27 0xc4 0x3f # CHECK: c.ngt.d $fcc4, $f24, $f7
+0x46 0x0d 0x45 0x3f # CHECK: c.ngt.s $fcc5, $f8, $f13
+0x46 0x3f 0x82 0x36 # CHECK: c.ole.d $fcc2, $f16, $f31
+0x46 0x14 0x3b 0x36 # CHECK: c.ole.s $fcc3, $f7, $f20
+0x46 0x3c 0x9c 0x34 # CHECK: c.olt.d $fcc4, $f19, $f28
+0x46 0x07 0xa6 0x34 # CHECK: c.olt.s $fcc6, $f20, $f7
+0x46 0x27 0xfc 0x3a # CHECK: c.seq.d $fcc4, $f31, $f7
+0x46 0x19 0x0f 0x3a # CHECK: c.seq.s $fcc7, $f1, $f25
+0x46 0x39 0x6c 0x33 # CHECK: c.ueq.d $fcc4, $f13, $f25
+0x46 0x1e 0x1e 0x33 # CHECK: c.ueq.s $fcc6, $f3, $f30
+0x46 0x32 0xcf 0x37 # CHECK: c.ule.d $fcc7, $f25, $f18
+0x46 0x1e 0xaf 0x37 # CHECK: c.ule.s $fcc7, $f21, $f30
+0x46 0x31 0x36 0x35 # CHECK: c.ult.d $fcc6, $f6, $f17
+0x46 0x0a 0xc7 0x35 # CHECK: c.ult.s $fcc7, $f24, $f10
+0x46 0x38 0xbe 0x31 # CHECK: c.un.d $fcc6, $f23, $f24
+0x46 0x04 0xf1 0x31 # CHECK: c.un.s $fcc1, $f30, $f4
+0x46 0xc0 0x45 0x85 # CHECK: abs.ps $f22, $f8
+0x46 0xcd 0xde 0x40 # CHECK: add.ps $f25, $f27, $f13
+0x4d 0x9e 0x93 0x1e # CHECK: alnv.ps $f12, $f18, $f30, $8
+0x46 0xc9 0x05 0x32 # CHECK: c.eq.ps $fcc5, $f0, $f9
+0x46 0xcb 0x5e 0x30 # CHECK: c.f.ps $fcc6, $f11, $f11
+0x46 0xd4 0x39 0x3e # CHECK: c.le.ps $fcc1, $f7, $f20
+0x46 0xc5 0x98 0x3c # CHECK: c.lt.ps $f19, $f5
+0x46 0xda 0x08 0x3d # CHECK: c.nge.ps $f1, $f26
+0x46 0xde 0xa8 0x3b # CHECK: c.ngl.ps $f21, $f30
+0x46 0xd4 0x67 0x39 # CHECK: c.ngle.ps $fcc7, $f12, $f20
+0x46 0xc6 0xf5 0x3f # CHECK: c.ngt.ps $fcc5, $f30, $f6
+0x46 0xc8 0xaf 0x36 # CHECK: c.ole.ps $fcc7, $f21, $f8
+0x46 0xd0 0x3b 0x34 # CHECK: c.olt.ps $fcc3, $f7, $f16
+0x46 0xce 0xfe 0x3a # CHECK: c.seq.ps $fcc6, $f31, $f14
+0x46 0xc6 0x26 0x38 # CHECK: c.sf.ps $fcc6, $f4, $f6
+0x46 0xdd 0x29 0x33 # CHECK: c.ueq.ps $fcc1, $f5, $f29
+0x46 0xc3 0x8e 0x37 # CHECK: c.ule.ps $fcc6, $f17, $f3
+0x46 0xc0 0x77 0x35 # CHECK: c.ult.ps $fcc7, $f14, $f0
+0x46 0xda 0x14 0x31 # CHECK: c.un.ps $fcc4, $f2, $f26
+0x46 0x13 0x90 0xe6 # CHECK: cvt.ps.s $f3, $f18, $f19
+0x46 0xc0 0x0f 0xa8 # CHECK: cvt.s.pl $f30, $f1
+0x46 0xc0 0xcb 0xa0 # CHECK: cvt.s.pu $f14, $f25
+0x4e 0x74 0xd4 0xa1 # CHECK: madd.d $f18, $f19, $f26, $f20
+0x4c 0x63 0x75 0xa6 # CHECK: madd.ps $f22, $f3, $f14, $f3
+0x4f 0xf9 0x98 0x60 # CHECK: madd.s $f1, $f31, $f19, $f25
+0x46 0xc0 0x8d 0x86 # CHECK: mov.ps $f22, $f17
+0x46 0xd8 0xe2 0x91 # CHECK: movf.ps $f10, $f28, $fcc6
+0x46 0xd3 0xff 0xd3 # CHECK: movn.ps $f31, $f31, $19
+0x46 0xc9 0xcd 0x11 # CHECK: movt.ps $f20, $f25, $fcc2
+0x46 0xdf 0x8c 0x92 # CHECK: movz.ps $f18, $f17, ra
+0x4c 0x32 0xfa 0xa9 # CHECK: msub.d $f10, $f1, $f31, $f18
+0x4d 0xd1 0xeb 0x2e # CHECK: msub.ps $f12, $f14, $f29, $f17
+0x4e 0x70 0x53 0x28 # CHECK: msub.s $f12, $f19, $f10, $f16
+0x46 0xd0 0x03 0x82 # CHECK: mul.ps $f14, $f0, $f16
+0x46 0xc0 0x6c 0xc7 # CHECK: neg.ps $f19, $f13
+0x4d 0x33 0x74 0xb1 # CHECK: nmadd.d $f18, $f9, $f14, $f19
+0x4c 0x99 0x4e 0xf6 # CHECK: nmadd.ps $f27, $f4, $f9, $f25
+0x4c 0xac 0xc8 0x30 # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x4d 0x1e 0x87 0xb9 # CHECK: nmsub.d $f30, $f8, $f16, $f30
+0x4d 0x91 0x71 0xbe # CHECK: nmsub.ps $f6, $f12, $f14, $f17
+0x4f 0x04 0x98 0x78 # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x46 0xde 0x4e 0x6c # CHECK: pll.ps $f25, $f9, $f30
+0x46 0xdd 0xd0 0x6d # CHECK: plu.ps $f1, $f26, $f29
+0x46 0xda 0xf2 0x6e # CHECK: pul.ps $f9, $f30, $f26
+0x46 0xc2 0x4e 0x2f # CHECK: puu.ps $f24, $f9, $f2
+0x46 0x20 0x34 0xd5 # CHECK: recip.d $f19, $f6
+0x46 0x00 0xf0 0xd5 # CHECK: recip.s $f3, $f30
+0x46 0x20 0xe0 0xd6 # CHECK: rsqrt.d $f3, $f28
+0x46 0x00 0x41 0x16 # CHECK: rsqrt.s $f4, $f8
+0x46 0xda 0x71 0x41 # CHECK: sub.ps $f5, $f14, $f26
diff --git a/test/MC/Disassembler/Mips/mips64/valid-mips64.txt b/test/MC/Disassembler/Mips/mips64/valid-mips64.txt
new file mode 100644
index 0000000..b0809d8
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64/valid-mips64.txt
@@ -0,0 +1,216 @@
+# RUN: llvm-mc --disassemble %s -triple=mips64-unknown-linux | FileCheck %s
+0x46 0x20 0x73 0x05 # CHECK: abs.d $f12, $f14
+0x46 0x00 0x39 0x85 # CHECK: abs.s $f6, $f7
+0x00 0xc7 0x48 0x20 # CHECK: add $9, $6, $7
+0x46 0x2e 0x62 0x00 # CHECK: add.d $f8, $f12, $f14
+0x46 0x07 0x32 0x40 # CHECK: add.s $f9, $f6, $f7
+0x20 0xc9 0x45 0x67 # CHECK: addi $9, $6, 17767
+0x24 0xc9 0xc5 0x67 # CHECK: addiu $9, $6, -15001
+0x00 0xc7 0x48 0x21 # CHECK: addu $9, $6, $7
+0x00 0xc7 0x48 0x24 # CHECK: and $9, $6, $7
+0x30 0xc9 0x45 0x67 # CHECK: andi $9, $6, 17767
+0x10 0x00 0x01 0x4c # CHECK: b 1332
+0x45 0x00 0x01 0x4c # CHECK: bc1f 1332
+0x45 0x1c 0x01 0x4c # CHECK: bc1f $fcc7, 1332
+0x45 0x01 0x01 0x4c # CHECK: bc1t 1332
+0x45 0x1d 0x01 0x4c # CHECK: bc1t $fcc7, 1332
+0x11 0x26 0x01 0x4c # CHECK: beq $9, $6, 1332
+0x04 0xc1 0x01 0x4c # CHECK: bgez $6, 1332
+0x04 0xd1 0x01 0x4c # CHECK: bgezal $6, 1332
+0x1c 0xc0 0x01 0x4c # CHECK: bgtz $6, 1332
+0x18 0xc0 0x01 0x4c # CHECK: blez $6, 1332
+0x15 0x26 0x01 0x4c # CHECK: bne $9, $6, 1332
+0x46 0x2e 0x60 0x32 # CHECK: c.eq.d $f12, $f14
+0x46 0x07 0x30 0x32 # CHECK: c.eq.s $f6, $f7
+0x46 0x2e 0x60 0x30 # CHECK: c.f.d $f12, $f14
+0x46 0x07 0x30 0x30 # CHECK: c.f.s $f6, $f7
+0x46 0x2e 0x60 0x3e # CHECK: c.le.d $f12, $f14
+0x46 0x07 0x30 0x3e # CHECK: c.le.s $f6, $f7
+0x46 0x2e 0x60 0x3c # CHECK: c.lt.d $f12, $f14
+0x46 0x07 0x30 0x3c # CHECK: c.lt.s $f6, $f7
+0x46 0x2e 0x60 0x3d # CHECK: c.nge.d $f12, $f14
+0x46 0x07 0x30 0x3d # CHECK: c.nge.s $f6, $f7
+0x46 0x2e 0x60 0x3b # CHECK: c.ngl.d $f12, $f14
+0x46 0x07 0x30 0x3b # CHECK: c.ngl.s $f6, $f7
+0x46 0x2e 0x60 0x39 # CHECK: c.ngle.d $f12, $f14
+0x46 0x07 0x30 0x39 # CHECK: c.ngle.s $f6, $f7
+0x46 0x2e 0x60 0x3f # CHECK: c.ngt.d $f12, $f14
+0x46 0x07 0x30 0x3f # CHECK: c.ngt.s $f6, $f7
+0x46 0x2e 0x60 0x36 # CHECK: c.ole.d $f12, $f14
+0x46 0x07 0x30 0x36 # CHECK: c.ole.s $f6, $f7
+0x46 0x2e 0x60 0x34 # CHECK: c.olt.d $f12, $f14
+0x46 0x07 0x30 0x34 # CHECK: c.olt.s $f6, $f7
+0x46 0x2e 0x60 0x3a # CHECK: c.seq.d $f12, $f14
+0x46 0x07 0x30 0x3a # CHECK: c.seq.s $f6, $f7
+0x46 0x2e 0x60 0x38 # CHECK: c.sf.d $f12, $f14
+0x46 0x07 0x30 0x38 # CHECK: c.sf.s $f6, $f7
+0x46 0x2e 0x60 0x33 # CHECK: c.ueq.d $f12, $f14
+0x46 0x12 0xe0 0x33 # CHECK: c.ueq.s $f28, $f18
+0x46 0x2e 0x60 0x37 # CHECK: c.ule.d $f12, $f14
+0x46 0x07 0x30 0x37 # CHECK: c.ule.s $f6, $f7
+0x46 0x2e 0x60 0x35 # CHECK: c.ult.d $f12, $f14
+0x46 0x07 0x30 0x35 # CHECK: c.ult.s $f6, $f7
+0x46 0x2e 0x60 0x31 # CHECK: c.un.d $f12, $f14
+0x46 0x07 0x30 0x31 # CHECK: c.un.s $f6, $f7
+0x46 0x20 0x73 0x0e # CHECK: ceil.w.d $f12, $f14
+0x46 0x00 0x39 0x8e # CHECK: ceil.w.s $f6, $f7
+0x46 0x20 0x18 0x4a # CHECK: ceil.l.d $f1, $f3
+0x46 0x00 0x6c 0x8a # CHECK: ceil.l.s $f18, $f13
+0x44 0x46 0x38 0x00 # CHECK: cfc1 $6, $7
+0x70 0xe6 0x30 0x21 # CHECK: clo $6, $7
+0x70 0xe6 0x30 0x20 # CHECK: clz $6, $7
+0x44 0xc6 0x38 0x00 # CHECK: ctc1 $6, $7
+0x46 0x00 0x39 0xa1 # CHECK: cvt.d.s $f6, $f7
+0x46 0xa0 0x81 0x21 # CHECK: cvt.d.l $f4, $f16
+0x46 0x80 0x73 0x21 # CHECK: cvt.d.w $f12, $f14
+0x46 0x20 0x73 0x20 # CHECK: cvt.s.d $f12, $f14
+0x46 0xa0 0xf3 0xe0 # CHECK: cvt.s.l $f15, $f30
+0x46 0x80 0x39 0xa0 # CHECK: cvt.s.w $f6, $f7
+0x46 0x20 0x73 0x24 # CHECK: cvt.w.d $f12, $f14
+0x46 0x00 0x39 0xa4 # CHECK: cvt.w.s $f6, $f7
+0x00 0x3f 0x98 0x2c # CHECK: dadd $19, $1, $ra
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x66 0xda 0xee 0x16 # CHECK: daddiu $26, $22, -4586
+0x00 0x3f 0x98 0x2d # CHECK: daddu $19, $1, $ra
+0x64 0x58 0x46 0x9f # CHECK: daddiu $24, $2, 18079
+0x66 0x73 0x69 0x3f # CHECK: daddiu $19, $19, 26943
+0x70 0xd2 0x90 0x25 # CHECK: dclo $18, $6
+0x73 0x30 0x80 0x24 # CHECK: dclz $16, $25
+0x03 0x53 0x00 0x1e # CHECK: ddiv $zero, $26, $19
+0x02 0x11 0x00 0x1f # CHECK: ddivu $zero, $16, $17
+0x44 0x2c 0x68 0x00 # CHECK: dmfc1 $12, $f13
+0x44 0xb0 0x70 0x00 # CHECK: dmtc1 $16, $f14
+0x02 0xe9 0x00 0x1c # CHECK: dmult $23, $9
+0x00 0xa6 0x00 0x1d # CHECK: dmultu $5, $6
+0x00 0x00 0x04 0xb8 # CHECK: dsll $zero, $zero, 18
+0x00 0x14 0x04 0xb8 # CHECK: dsll $zero, $20, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x1c 0xe2 0xbb # CHECK: dsra $gp, $gp, 10
+0x00 0x12 0xe2 0xbb # CHECK: dsra $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x1c 0xe2 0xbf # CHECK: dsra32 $gp, $gp, 10
+0x00 0x12 0xe2 0xbf # CHECK: dsra32 $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x13 0x9d 0xfa # CHECK: dsrl $19, $19, 23
+0x00 0x06 0x9d 0xfa # CHECK: dsrl $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x00 0x13 0x9d 0xfe # CHECK: dsrl32 $19, $19, 23
+0x00 0x06 0x9d 0xfe # CHECK: dsrl32 $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x02 0xc8 0x38 0x2e # CHECK: dsub $7, $22, $8
+0x62 0x9d 0x6c 0x39 # CHECK: daddi $sp, $20, 27705
+0x63 0xbd 0x6c 0x39 # CHECK: daddi $sp, $sp, 27705
+0x00 0xba 0x28 0x2f # CHECK: dsubu $5, $5, $26
+0x65 0x6f 0xec 0x5f # CHECK: daddiu $15, $11, -5025
+0x65 0xce 0x11 0xea # CHECK: daddiu $14, $14, 4586
+0x46 0x20 0x3e 0x8b # CHECK: floor.l.d $f26, $f7
+0x46 0x00 0x2b 0x0b # CHECK: floor.l.s $f12, $f5
+0x46 0x20 0x73 0x0f # CHECK: floor.w.d $f12, $f14
+0x46 0x00 0x39 0x8f # CHECK: floor.w.s $f6, $f7
+0x08 0x00 0x01 0x4c # CHECK: j 1328
+0x0c 0x00 0x01 0x4c # CHECK: jal 1328
+0x74 0x00 0x01 0x4c # CHECK: jalx 1328
+0x00 0xe0 0xf8 0x09 # CHECK: jalr $7
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x00 0xe0 0x00 0x08 # CHECK: jr $7
+0x80 0xa4 0x23 0xc6 # CHECK: lb $4, 9158($5)
+0x90 0xa4 0x00 0x06 # CHECK: lbu $4, 6($5)
+0xd4 0xe9 0x23 0xc6 # CHECK: ldc1 $f9, 9158($7)
+0xde 0x3d 0x90 0x1b # CHECK: ld $sp, -28645($17)
+0x6b 0x18 0xef 0xb9 # CHECK: ldl $24, -4167($24)
+0x6e 0x8e 0x89 0x6a # CHECK: ldr $14, -30358($20)
+0x4d 0xf7 0x02 0x01 # CHECK: ldxc1 $f8, $23($15)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0xc0 0xe9 0x23 0xc6 # CHECK: ll $9, 9158($7)
+0xd3 0xe0 0xc6 0x70 # CHECK: lld $zero, -14736($ra)
+0x3c 0x06 0x45 0x67 # CHECK: lui $6, 17767
+0x4e 0xb6 0x04 0xc5 # CHECK: luxc1 $f19, $22($21)
+0x9c 0x73 0xa1 0xea # CHECK: lwu $19, -24086($3)
+0x4f 0xd1 0x03 0x00 # CHECK: lwxc1 $f12, $17($fp)
+0x8c 0xa4 0x00 0x18 # CHECK: lw $4, 24($5)
+0xc4 0xe9 0x23 0xc6 # CHECK: lwc1 $f9, 9158($7)
+0x88 0x82 0x00 0x03 # CHECK: lwl $2, 3($4)
+0x98 0xa3 0x00 0x10 # CHECK: lwr $3, 16($5)
+0x70 0xc7 0x00 0x00 # CHECK: madd $6, $7
+0x70 0xc7 0x00 0x01 # CHECK: maddu $6, $7
+0x44 0x06 0x38 0x00 # CHECK: mfc1 $6, $f7
+0x00 0x00 0x28 0x10 # CHECK: mfhi $5
+0x00 0x00 0x28 0x12 # CHECK: mflo $5
+0x46 0x20 0x41 0x86 # CHECK: mov.d $f6, $f8
+0x46 0x00 0x39 0x86 # CHECK: mov.s $f6, $f7
+0x70 0xc7 0x00 0x04 # CHECK: msub $6, $7
+0x70 0xc7 0x00 0x05 # CHECK: msubu $6, $7
+0x44 0x86 0x38 0x00 # CHECK: mtc1 $6, $f7
+0x00 0xe0 0x00 0x11 # CHECK: mthi $7
+0x00 0xe0 0x00 0x13 # CHECK: mtlo $7
+0x46 0x2e 0x62 0x02 # CHECK: mul.d $f8, $f12, $f14
+0x46 0x07 0x32 0x42 # CHECK: mul.s $f9, $f6, $f7
+0x70 0xc7 0x48 0x02 # CHECK: mul $9, $6, $7
+0x00 0x65 0x00 0x18 # CHECK: mult $3, $5
+0x00 0x65 0x00 0x19 # CHECK: multu $3, $5
+0x46 0x20 0x73 0x07 # CHECK: neg.d $f12, $f14
+0x46 0x00 0x39 0x87 # CHECK: neg.s $f6, $f7
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0xc7 0x48 0x27 # CHECK: nor $9, $6, $7
+0x00 0x65 0x18 0x25 # CHECK: or $3, $3, $5
+0x34 0xc9 0x45 0x67 # CHECK: ori $9, $6, 17767
+0x46 0x20 0x0b 0x08 # CHECK: round.l.d $f12, $f1
+0x46 0x00 0x2e 0x48 # CHECK: round.l.s $f25, $f5
+0x46 0x20 0x73 0x0c # CHECK: round.w.d $f12, $f14
+0x46 0x00 0x39 0x8c # CHECK: round.w.s $f6, $f7
+0xa0 0xa4 0x23 0xc6 # CHECK: sb $4, 9158($5)
+0xa0 0xa4 0x00 0x06 # CHECK: sb $4, 6($5)
+0xe0 0xe9 0x23 0xc6 # CHECK: sc $9, 9158($7)
+0xf3 0xaf 0xdf 0xcd # CHECK: scd $15, -8243($sp)
+0xfd 0x4c 0x16 0xcb # CHECK: sd $12, 5835($10)
+0xf4 0xe9 0x23 0xc6 # CHECK: sdc1 $f9, 9158($7)
+0xb3 0xc7 0xae 0x1f # CHECK: sdl $7, -20961($fp)
+0xb5 0x8b 0xb0 0x39 # CHECK: sdr $11, -20423($12)
+0x4d 0xca 0x58 0x09 # CHECK: sdxc1 $f11, $10($14)
+0xa4 0xa4 0x23 0xc6 # CHECK: sh $4, 9158($5)
+0x00 0x03 0x21 0xc0 # CHECK: sll $4, $3, 7
+0x00 0xa3 0x10 0x04 # CHECK: sllv $2, $3, $5
+0x00 0x65 0x18 0x2a # CHECK: slt $3, $3, $5
+0x28 0x63 0x00 0x67 # CHECK: slti $3, $3, 103
+0x2c 0x63 0x00 0x67 # CHECK: sltiu $3, $3, 103
+0x00 0x65 0x18 0x2b # CHECK: sltu $3, $3, $5
+0x46 0x20 0x73 0x04 # CHECK: sqrt.d $f12, $f14
+0x46 0x00 0x39 0x84 # CHECK: sqrt.s $f6, $f7
+0x00 0x03 0x21 0xc3 # CHECK: sra $4, $3, 7
+0x00 0xa3 0x10 0x07 # CHECK: srav $2, $3, $5
+0x00 0x03 0x21 0xc2 # CHECK: srl $4, $3, 7
+0x00 0xa3 0x10 0x06 # CHECK: srlv $2, $3, $5
+0x46 0x2e 0x62 0x01 # CHECK: sub.d $f8, $f12, $f14
+0x46 0x07 0x32 0x41 # CHECK: sub.s $f9, $f6, $f7
+0x00 0xc7 0x48 0x22 # CHECK: sub $9, $6, $7
+0x00 0x65 0x20 0x23 # CHECK: subu $4, $3, $5
+0xac 0xa4 0x00 0x18 # CHECK: sw $4, 24($5)
+0xe4 0xe9 0x23 0xc6 # CHECK: swc1 $f9, 9158($7)
+0x4d 0xbb 0x60 0x0d # CHECK: suxc1 $f12, $27($13)
+0x4f 0x4c 0x98 0x08 # CHECK: swxc1 $f19, $12($26)
+0xa8 0xa4 0x00 0x10 # CHECK: swl $4, 16($5)
+0xb8 0xe6 0x00 0x10 # CHECK: swr $6, 16($7)
+0x00 0x00 0x01 0xcf # CHECK: sync 7
+0x46 0x20 0xbd 0xc9 # CHECK: trunc.l.d $f23, $f23
+0x46 0x00 0xff 0x09 # CHECK: trunc.l.s $f28, $f31
+0x46 0x20 0x73 0x0d # CHECK: trunc.w.d $f12, $f14
+0x46 0x00 0x39 0x8d # CHECK: trunc.w.s $f6, $f7
+0x00 0x65 0x18 0x26 # CHECK: xor $3, $3, $5
+0x38 0xc9 0x45 0x67 # CHECK: xori $9, $6, 17767
+0x7c 0x05 0xe8 0x3b # CHECK: .set push
+                    # CHECK: .set mips32r2
+                    # CHECK: rdhwr $5, $29
+                    # CHECK: .set pop
+0xbc 0x61 0x00 0x02 # CHECK: cache 1, 2($3)
+0xcc 0x43 0x00 0x04 # CHECK: pref 3, 4($2)
+0xe8 0xe9 0x23 0xc6 # CHECK: swc2 $9, 9158($7)
+0xc8 0xc8 0x23 0xca # CHECK: lwc2 $8, 9162($6)
diff --git a/test/MC/Disassembler/Mips/mips64r2/valid-mips64r2-el.txt b/test/MC/Disassembler/Mips/mips64r2/valid-mips64r2-el.txt
new file mode 100644
index 0000000..6509456
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r2/valid-mips64r2-el.txt
@@ -0,0 +1,237 @@
+# RUN: llvm-mc --disassemble %s -triple=mips64el-unknown-linux -mcpu=mips64r2 | FileCheck %s
+# Try a mips* triple to confirm that mips* vs mips64* triples no longer have
+# an effect on the disassembler behaviour.
+# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux -mcpu=mips64r2 | FileCheck %s
+# CHECK: .text
+0x05 0x73 0x20 0x46 # CHECK: abs.d $f12, $f14
+0x85 0x39 0x00 0x46 # CHECK: abs.s $f6, $f7
+0x20 0x48 0xc7 0x00 # CHECK: add $9, $6, $7
+0x00 0x62 0x2e 0x46 # CHECK: add.d $f8, $f12, $f14
+0x40 0x32 0x07 0x46 # CHECK: add.s $f9, $f6, $f7
+0x67 0x45 0xc9 0x20 # CHECK: addi $9, $6, 17767
+0x67 0xc5 0xc9 0x24 # CHECK: addiu $9, $6, -15001
+0x21 0x48 0xc7 0x00 # CHECK: addu $9, $6, $7
+0x24 0x48 0xc7 0x00 # CHECK: and $9, $6, $7
+0x67 0x45 0xc9 0x30 # CHECK: andi $9, $6, 17767
+0x4c 0x01 0x00 0x10 # CHECK: b 1332
+0x4c 0x01 0x00 0x45 # CHECK: bc1f 1332
+0x4c 0x01 0x1c 0x45 # CHECK: bc1f $fcc7, 1332
+0x4c 0x01 0x01 0x45 # CHECK: bc1t 1332
+0x4c 0x01 0x1d 0x45 # CHECK: bc1t $fcc7, 1332
+0x4c 0x01 0x26 0x11 # CHECK: beq $9, $6, 1332
+0x4c 0x01 0xc1 0x04 # CHECK: bgez $6, 1332
+0x4c 0x01 0xd1 0x04 # CHECK: bgezal $6, 1332
+0x4c 0x01 0xc0 0x1c # CHECK: bgtz $6, 1332
+0x4c 0x01 0xc0 0x18 # CHECK: blez $6, 1332
+0x4c 0x01 0x26 0x15 # CHECK: bne $9, $6, 1332
+0x32 0x60 0x2e 0x46 # CHECK: c.eq.d $f12, $f14
+0x32 0x30 0x07 0x46 # CHECK: c.eq.s $f6, $f7
+0x30 0x60 0x2e 0x46 # CHECK: c.f.d $f12, $f14
+0x30 0x30 0x07 0x46 # CHECK: c.f.s $f6, $f7
+0x3e 0x60 0x2e 0x46 # CHECK: c.le.d $f12, $f14
+0x3e 0x30 0x07 0x46 # CHECK: c.le.s $f6, $f7
+0x3c 0x60 0x2e 0x46 # CHECK: c.lt.d $f12, $f14
+0x3c 0x30 0x07 0x46 # CHECK: c.lt.s $f6, $f7
+0x3d 0x60 0x2e 0x46 # CHECK: c.nge.d $f12, $f14
+0x3d 0x30 0x07 0x46 # CHECK: c.nge.s $f6, $f7
+0x3b 0x60 0x2e 0x46 # CHECK: c.ngl.d $f12, $f14
+0x3b 0x30 0x07 0x46 # CHECK: c.ngl.s $f6, $f7
+0x39 0x60 0x2e 0x46 # CHECK: c.ngle.d $f12, $f14
+0x39 0x30 0x07 0x46 # CHECK: c.ngle.s $f6, $f7
+0x3f 0x60 0x2e 0x46 # CHECK: c.ngt.d $f12, $f14
+0x3f 0x30 0x07 0x46 # CHECK: c.ngt.s $f6, $f7
+0x36 0x60 0x2e 0x46 # CHECK: c.ole.d $f12, $f14
+0x36 0x30 0x07 0x46 # CHECK: c.ole.s $f6, $f7
+0x34 0x60 0x2e 0x46 # CHECK: c.olt.d $f12, $f14
+0x34 0x30 0x07 0x46 # CHECK: c.olt.s $f6, $f7
+0x3a 0x60 0x2e 0x46 # CHECK: c.seq.d $f12, $f14
+0x3a 0x30 0x07 0x46 # CHECK: c.seq.s $f6, $f7
+0x38 0x60 0x2e 0x46 # CHECK: c.sf.d $f12, $f14
+0x38 0x30 0x07 0x46 # CHECK: c.sf.s $f6, $f7
+0x33 0x60 0x2e 0x46 # CHECK: c.ueq.d $f12, $f14
+0x33 0xe0 0x12 0x46 # CHECK: c.ueq.s $f28, $f18
+0x37 0x60 0x2e 0x46 # CHECK: c.ule.d $f12, $f14
+0x37 0x30 0x07 0x46 # CHECK: c.ule.s $f6, $f7
+0x35 0x60 0x2e 0x46 # CHECK: c.ult.d $f12, $f14
+0x35 0x30 0x07 0x46 # CHECK: c.ult.s $f6, $f7
+0x31 0x60 0x2e 0x46 # CHECK: c.un.d $f12, $f14
+0x31 0x30 0x07 0x46 # CHECK: c.un.s $f6, $f7
+0x4a 0x18 0x20 0x46 # CHECK: ceil.l.d $f1, $f3
+0x8a 0x6c 0x00 0x46 # CHECK: ceil.l.s $f18, $f13
+0x0e 0x73 0x20 0x46 # CHECK: ceil.w.d $f12, $f14
+0x8e 0x39 0x00 0x46 # CHECK: ceil.w.s $f6, $f7
+0x00 0x38 0x46 0x44 # CHECK: cfc1 $6, $7
+0x21 0x30 0xe6 0x70 # CHECK: clo $6, $7
+0x20 0x30 0xe6 0x70 # CHECK: clz $6, $7
+0x00 0x38 0xc6 0x44 # CHECK: ctc1 $6, $7
+0x21 0x81 0xa0 0x46 # CHECK: cvt.d.l $f4, $f16
+0xa1 0x39 0x00 0x46 # CHECK: cvt.d.s $f6, $f7
+0x21 0x73 0x80 0x46 # CHECK: cvt.d.w $f12, $f14
+0x25 0x73 0x20 0x46 # CHECK: cvt.l.d $f12, $f14
+0xa5 0x39 0x00 0x46 # CHECK: cvt.l.s $f6, $f7
+0xe0 0xf3 0xa0 0x46 # CHECK: cvt.s.l $f15, $f30
+0x20 0x73 0x20 0x46 # CHECK: cvt.s.d $f12, $f14
+0xa0 0x39 0x80 0x46 # CHECK: cvt.s.w $f6, $f7
+0x24 0x73 0x20 0x46 # CHECK: cvt.w.d $f12, $f14
+0xa4 0x39 0x00 0x46 # CHECK: cvt.w.s $f6, $f7
+0x2c 0x98 0x3f 0x00 # CHECK: dadd $19, $1, $ra
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0x16 0xee 0xda 0x66 # CHECK: daddiu $26, $22, -4586
+0x2d 0x98 0x3f 0x00 # CHECK: daddu $19, $1, $ra
+0x9f 0x46 0x58 0x64 # CHECK: daddiu $24, $2, 18079
+0x3f 0x69 0x73 0x66 # CHECK: daddiu $19, $19, 26943
+0x25 0x90 0xd2 0x70 # CHECK: dclo $18, $6
+0x24 0x80 0x30 0x73 # CHECK: dclz $16, $25
+0x1e 0x00 0x53 0x03 # CHECK: ddiv $zero, $26, $19
+0x1f 0x00 0x11 0x02 # CHECK: ddivu $zero, $16, $17
+0x00 0x68 0x2c 0x44 # CHECK: dmfc1 $12, $f13
+0x00 0x70 0xb0 0x44 # CHECK: dmtc1 $16, $f14
+0x1c 0x00 0xe9 0x02 # CHECK: dmult $23, $9
+0x1d 0x00 0xa6 0x00 # CHECK: dmultu $5, $6
+0xb8 0x04 0x00 0x00 # CHECK: dsll $zero, $zero, 18
+0xb8 0x04 0x14 0x00 # CHECK: dsll $zero, $20, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbb 0xe2 0x1c 0x00 # CHECK: dsra $gp, $gp, 10
+0xbb 0xe2 0x12 0x00 # CHECK: dsra $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xbf 0xe2 0x1c 0x00 # CHECK: dsra32 $gp, $gp, 10
+0xbf 0xe2 0x12 0x00 # CHECK: dsra32 $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xfa 0x9d 0x13 0x00 # CHECK: dsrl $19, $19, 23
+0xfa 0x9d 0x06 0x00 # CHECK: dsrl $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0xfe 0x9d 0x13 0x00 # CHECK: dsrl32 $19, $19, 23
+0xfe 0x9d 0x06 0x00 # CHECK: dsrl32 $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0x2e 0x38 0xc8 0x02 # CHECK: dsub $7, $22, $8
+0xa4 0x18 0x0e 0x7c # CHECK: dsbh $3, $14
+0x64 0x11 0x1d 0x7c # CHECK: dshd $2, $sp
+0x39 0x6c 0x9d 0x62 # CHECK: daddi $sp, $20, 27705
+0x39 0x6c 0xbd 0x63 # CHECK: daddi $sp, $sp, 27705
+0x2f 0x28 0xba 0x00 # CHECK: dsubu $5, $5, $26
+0x5f 0xec 0x6f 0x65 # CHECK: daddiu $15, $11, -5025
+0xea 0x11 0xce 0x65 # CHECK: daddiu $14, $14, 4586
+0x00 0x60 0x7e 0x41 # CHECK: di $fp
+0x00 0x60 0x60 0x41 # CHECK: di
+0xfa 0x0b 0x21 0x00 # CHECK: drotr $1, $1, 15
+0xfa 0x0b 0x2e 0x00 # CHECK: drotr $1, $14, 15
+0xfe 0x0b 0x21 0x00 # CHECK: drotr32 $1, $1, 15
+0xfe 0x0b 0x2e 0x00 # CHECK: drotr32 $1, $14, 15
+0x56 0x08 0xee 0x01 # CHECK: drotrv $1, $14, $15
+0x20 0x60 0x6e 0x41 # CHECK: ei $14
+0x20 0x60 0x60 0x41 # CHECK: ei
+0x8b 0x3e 0x20 0x46 # CHECK: floor.l.d $f26, $f7
+0x0b 0x2b 0x00 0x46 # CHECK: floor.l.s $f12, $f5
+0x0f 0x73 0x20 0x46 # CHECK: floor.w.d $f12, $f14
+0x8f 0x39 0x00 0x46 # CHECK: floor.w.s $f6, $f7
+0x84 0x61 0x33 0x7d # CHECK: ins $19, $9, 6, 7
+0x4c 0x01 0x00 0x08 # CHECK: j 1328
+0x4c 0x01 0x00 0x0c # CHECK: jal 1328
+0x4c 0x01 0x00 0x74 # CHECK: jalx 1328
+0x09 0xf8 0xe0 0x00 # CHECK: jalr $7
+0x09 0xfc 0x80 0x00 # CHECK: jalr.hb $4
+0x09 0x24 0xa0 0x00 # CHECK: jalr.hb $4, $5
+0x08 0x00 0xe0 0x00 # CHECK: jr $7
+0xc6 0x23 0xa4 0x80 # CHECK: lb $4, 9158($5)
+0x06 0x00 0xa4 0x90 # CHECK: lbu $4, 6($5)
+0x1b 0x90 0x3d 0xde # CHECK: ld $sp, -28645($17)
+0xb9 0xef 0x18 0x6b # CHECK: ldl $24, -4167($24)
+0x6a 0x89 0x8e 0x6e # CHECK: ldr $14, -30358($20)
+0xc6 0x23 0xe9 0xd4 # CHECK: ldc1 $f9, 9158($7)
+0x01 0x02 0xf7 0x4d # CHECK: ldxc1 $f8, $23($15)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0xc6 0x23 0xe9 0xc0 # CHECK: ll $9, 9158($7)
+0x70 0xc6 0xe0 0xd3 # CHECK: lld $zero, -14736($ra)
+0x67 0x45 0x06 0x3c # CHECK: lui $6, 17767
+0x05 0x00 0xa6 0x4c # CHECK: luxc1 $f0, $6($5)
+0x18 0x00 0xa4 0x8c # CHECK: lw $4, 24($5)
+0xc6 0x23 0xe9 0xc4 # CHECK: lwc1 $f9, 9158($7)
+0x03 0x00 0x82 0x88 # CHECK: lwl $2, 3($4)
+0x10 0x00 0xa3 0x98 # CHECK: lwr $3, 16($5)
+0x00 0x05 0xcc 0x4d # CHECK: lwxc1 $f20, $12($14)
+0xea 0xa1 0x73 0x9c # CHECK: lwu $19, -24086($3)
+0x00 0x00 0xc7 0x70 # CHECK: madd $6, $7
+0x60 0x98 0xf9 0x4f # CHECK: madd.s $f1, $f31, $f19, $f25
+0x01 0x00 0xc7 0x70 # CHECK: maddu $6, $7
+0x00 0x38 0x06 0x44 # CHECK: mfc1 $6, $f7
+0x10 0x28 0x00 0x00 # CHECK: mfhi $5
+0x00 0xc0 0x7e 0x44 # CHECK: mfhc1 $fp, $f24
+0x12 0x28 0x00 0x00 # CHECK: mflo $5
+0x86 0x41 0x20 0x46 # CHECK: mov.d $f6, $f8
+0x86 0x39 0x00 0x46 # CHECK: mov.s $f6, $f7
+0x04 0x00 0xc7 0x70 # CHECK: msub $6, $7
+0x28 0x53 0x70 0x4e # CHECK: msub.s $f12, $f19, $f10, $f16
+0x05 0x00 0xc7 0x70 # CHECK: msubu $6, $7
+0x00 0x38 0x86 0x44 # CHECK: mtc1 $6, $f7
+0x11 0x00 0xe0 0x00 # CHECK: mthi $7
+0x00 0x80 0xe0 0x44 # CHECK: mthc1 $zero, $f16
+0x13 0x00 0xe0 0x00 # CHECK: mtlo $7
+0x02 0x62 0x2e 0x46 # CHECK: mul.d $f8, $f12, $f14
+0x42 0x32 0x07 0x46 # CHECK: mul.s $f9, $f6, $f7
+0x02 0x48 0xc7 0x70 # CHECK: mul $9, $6, $7
+0x18 0x00 0x65 0x00 # CHECK: mult $3, $5
+0x19 0x00 0x65 0x00 # CHECK: multu $3, $5
+0x07 0x73 0x20 0x46 # CHECK: neg.d $f12, $f14
+0x87 0x39 0x00 0x46 # CHECK: neg.s $f6, $f7
+0x30 0xc8 0xac 0x4c # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x48 0xc7 0x00 # CHECK: nor $9, $6, $7
+0x78 0x98 0x04 0x4f # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x25 0x18 0x65 0x00 # CHECK: or $3, $3, $5
+0x67 0x45 0xc9 0x34 # CHECK: ori $9, $6, 17767
+0xc2 0x49 0x26 0x00 # CHECK: rotr $9, $6, 7
+0x46 0x48 0xe6 0x00 # CHECK: rotrv $9, $6, $7
+0x08 0x0b 0x20 0x46 # CHECK: round.l.d $f12, $f1
+0x48 0x2e 0x00 0x46 # CHECK: round.l.s $f25, $f5
+0x0c 0x73 0x20 0x46 # CHECK: round.w.d $f12, $f14
+0x8c 0x39 0x00 0x46 # CHECK: round.w.s $f6, $f7
+0xc6 0x23 0xa4 0xa0 # CHECK: sb $4, 9158($5)
+0x06 0x00 0xa4 0xa0 # CHECK: sb $4, 6($5)
+0xc6 0x23 0xe9 0xe0 # CHECK: sc $9, 9158($7)
+0xcd 0xdf 0xaf 0xf3 # CHECK: scd $15, -8243($sp)
+0xcb 0x16 0x4c 0xfd # CHECK: sd $12, 5835($10)
+0xc6 0x23 0xe9 0xf4 # CHECK: sdc1 $f9, 9158($7)
+0x1f 0xae 0xc7 0xb3 # CHECK: sdl $7, -20961($fp)
+0x39 0xb0 0x8b 0xb5 # CHECK: sdr $11, -20423($12)
+0x09 0x40 0x24 0x4f # CHECK: sdxc1 $f8, $4($25)
+0x20 0x34 0x07 0x7c # CHECK: seb $6, $7
+0x20 0x36 0x07 0x7c # CHECK: seh $6, $7
+0xc6 0x23 0xa4 0xa4 # CHECK: sh $4, 9158($5)
+0xc0 0x21 0x03 0x00 # CHECK: sll $4, $3, 7
+0x04 0x10 0xa3 0x00 # CHECK: sllv $2, $3, $5
+0x2a 0x18 0x65 0x00 # CHECK: slt $3, $3, $5
+0x67 0x00 0x63 0x28 # CHECK: slti $3, $3, 103
+0x67 0x00 0x63 0x2c # CHECK: sltiu $3, $3, 103
+0x2b 0x18 0x65 0x00 # CHECK: sltu $3, $3, $5
+0x04 0x73 0x20 0x46 # CHECK: sqrt.d $f12, $f14
+0x84 0x39 0x00 0x46 # CHECK: sqrt.s $f6, $f7
+0xc3 0x21 0x03 0x00 # CHECK: sra $4, $3, 7
+0x07 0x10 0xa3 0x00 # CHECK: srav $2, $3, $5
+0xc2 0x21 0x03 0x00 # CHECK: srl $4, $3, 7
+0x06 0x10 0xa3 0x00 # CHECK: srlv $2, $3, $5
+0x01 0x62 0x2e 0x46 # CHECK: sub.d $f8, $f12, $f14
+0x41 0x32 0x07 0x46 # CHECK: sub.s $f9, $f6, $f7
+0x22 0x48 0xc7 0x00 # CHECK: sub $9, $6, $7
+0x23 0x20 0x65 0x00 # CHECK: subu $4, $3, $5
+0x0d 0x20 0xb8 0x4c # CHECK: suxc1 $f4, $24($5)
+0x18 0x00 0xa4 0xac # CHECK: sw $4, 24($5)
+0xc6 0x23 0xe9 0xe4 # CHECK: swc1 $f9, 9158($7)
+0x10 0x00 0xa4 0xa8 # CHECK: swl $4, 16($5)
+0x10 0x00 0xe6 0xb8 # CHECK: swr $6, 16($7)
+0x08 0xd0 0xd2 0x4e # CHECK: swxc1 $f26, $18($22)
+0xcf 0x01 0x00 0x00 # CHECK: sync 7
+0xc9 0xbd 0x20 0x46 # CHECK: trunc.l.d $f23, $f23
+0x09 0xff 0x00 0x46 # CHECK: trunc.l.s $f28, $f31
+0x0d 0x73 0x20 0x46 # CHECK: trunc.w.d $f12, $f14
+0x8d 0x39 0x00 0x46 # CHECK: trunc.w.s $f6, $f7
+0xa0 0x30 0x07 0x7c # CHECK: wsbh $6, $7
+0x26 0x18 0x65 0x00 # CHECK: xor $3, $3, $5
+0x67 0x45 0xc9 0x38 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips64r2/valid-mips64r2.txt b/test/MC/Disassembler/Mips/mips64r2/valid-mips64r2.txt
new file mode 100644
index 0000000..9fc9f6e
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r2/valid-mips64r2.txt
@@ -0,0 +1,237 @@
+# RUN: llvm-mc --disassemble %s -triple=mips64-unknown-linux -mcpu=mips64r2 | FileCheck %s
+# Try a mips* triple to confirm that mips* vs mips64* triples no longer have
+# an effect on the disassembler behaviour.
+# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mcpu=mips64r2 | FileCheck %s
+# CHECK: .text
+0x46 0x20 0x73 0x05 # CHECK: abs.d $f12, $f14
+0x46 0x00 0x39 0x85 # CHECK: abs.s $f6, $f7
+0x00 0xc7 0x48 0x20 # CHECK: add $9, $6, $7
+0x46 0x2e 0x62 0x00 # CHECK: add.d $f8, $f12, $f14
+0x46 0x07 0x32 0x40 # CHECK: add.s $f9, $f6, $f7
+0x20 0xc9 0x45 0x67 # CHECK: addi $9, $6, 17767
+0x24 0xc9 0xc5 0x67 # CHECK: addiu $9, $6, -15001
+0x00 0xc7 0x48 0x21 # CHECK: addu $9, $6, $7
+0x00 0xc7 0x48 0x24 # CHECK: and $9, $6, $7
+0x30 0xc9 0x45 0x67 # CHECK: andi $9, $6, 17767
+0x10 0x00 0x01 0x4c # CHECK: b 1332
+0x45 0x00 0x01 0x4c # CHECK: bc1f 1332
+0x45 0x1c 0x01 0x4c # CHECK: bc1f $fcc7, 1332
+0x45 0x01 0x01 0x4c # CHECK: bc1t 1332
+0x45 0x1d 0x01 0x4c # CHECK: bc1t $fcc7, 1332
+0x11 0x26 0x01 0x4c # CHECK: beq $9, $6, 1332
+0x04 0xc1 0x01 0x4c # CHECK: bgez $6, 1332
+0x04 0xd1 0x01 0x4c # CHECK: bgezal $6, 1332
+0x1c 0xc0 0x01 0x4c # CHECK: bgtz $6, 1332
+0x18 0xc0 0x01 0x4c # CHECK: blez $6, 1332
+0x15 0x26 0x01 0x4c # CHECK: bne $9, $6, 1332
+0x46 0x2e 0x60 0x32 # CHECK: c.eq.d $f12, $f14
+0x46 0x07 0x30 0x32 # CHECK: c.eq.s $f6, $f7
+0x46 0x2e 0x60 0x30 # CHECK: c.f.d $f12, $f14
+0x46 0x07 0x30 0x30 # CHECK: c.f.s $f6, $f7
+0x46 0x2e 0x60 0x3e # CHECK: c.le.d $f12, $f14
+0x46 0x07 0x30 0x3e # CHECK: c.le.s $f6, $f7
+0x46 0x2e 0x60 0x3c # CHECK: c.lt.d $f12, $f14
+0x46 0x07 0x30 0x3c # CHECK: c.lt.s $f6, $f7
+0x46 0x2e 0x60 0x3d # CHECK: c.nge.d $f12, $f14
+0x46 0x07 0x30 0x3d # CHECK: c.nge.s $f6, $f7
+0x46 0x2e 0x60 0x3b # CHECK: c.ngl.d $f12, $f14
+0x46 0x07 0x30 0x3b # CHECK: c.ngl.s $f6, $f7
+0x46 0x2e 0x60 0x39 # CHECK: c.ngle.d $f12, $f14
+0x46 0x07 0x30 0x39 # CHECK: c.ngle.s $f6, $f7
+0x46 0x2e 0x60 0x3f # CHECK: c.ngt.d $f12, $f14
+0x46 0x07 0x30 0x3f # CHECK: c.ngt.s $f6, $f7
+0x46 0x2e 0x60 0x36 # CHECK: c.ole.d $f12, $f14
+0x46 0x07 0x30 0x36 # CHECK: c.ole.s $f6, $f7
+0x46 0x2e 0x60 0x34 # CHECK: c.olt.d $f12, $f14
+0x46 0x07 0x30 0x34 # CHECK: c.olt.s $f6, $f7
+0x46 0x2e 0x60 0x3a # CHECK: c.seq.d $f12, $f14
+0x46 0x07 0x30 0x3a # CHECK: c.seq.s $f6, $f7
+0x46 0x2e 0x60 0x38 # CHECK: c.sf.d $f12, $f14
+0x46 0x07 0x30 0x38 # CHECK: c.sf.s $f6, $f7
+0x46 0x2e 0x60 0x33 # CHECK: c.ueq.d $f12, $f14
+0x46 0x12 0xe0 0x33 # CHECK: c.ueq.s $f28, $f18
+0x46 0x2e 0x60 0x37 # CHECK: c.ule.d $f12, $f14
+0x46 0x07 0x30 0x37 # CHECK: c.ule.s $f6, $f7
+0x46 0x2e 0x60 0x35 # CHECK: c.ult.d $f12, $f14
+0x46 0x07 0x30 0x35 # CHECK: c.ult.s $f6, $f7
+0x46 0x2e 0x60 0x31 # CHECK: c.un.d $f12, $f14
+0x46 0x07 0x30 0x31 # CHECK: c.un.s $f6, $f7
+0x46 0x20 0x18 0x4a # CHECK: ceil.l.d $f1, $f3
+0x46 0x00 0x6c 0x8a # CHECK: ceil.l.s $f18, $f13
+0x46 0x20 0x73 0x0e # CHECK: ceil.w.d $f12, $f14
+0x46 0x00 0x39 0x8e # CHECK: ceil.w.s $f6, $f7
+0x44 0x46 0x38 0x00 # CHECK: cfc1 $6, $7
+0x70 0xe6 0x30 0x21 # CHECK: clo $6, $7
+0x70 0xe6 0x30 0x20 # CHECK: clz $6, $7
+0x44 0xc6 0x38 0x00 # CHECK: ctc1 $6, $7
+0x46 0xa0 0x81 0x21 # CHECK: cvt.d.l $f4, $f16
+0x46 0x00 0x39 0xa1 # CHECK: cvt.d.s $f6, $f7
+0x46 0x80 0x73 0x21 # CHECK: cvt.d.w $f12, $f14
+0x46 0x20 0x73 0x25 # CHECK: cvt.l.d $f12, $f14
+0x46 0x00 0x39 0xa5 # CHECK: cvt.l.s $f6, $f7
+0x46 0xa0 0xf3 0xe0 # CHECK: cvt.s.l $f15, $f30
+0x46 0x20 0x73 0x20 # CHECK: cvt.s.d $f12, $f14
+0x46 0x80 0x39 0xa0 # CHECK: cvt.s.w $f6, $f7
+0x46 0x20 0x73 0x24 # CHECK: cvt.w.d $f12, $f14
+0x46 0x00 0x39 0xa4 # CHECK: cvt.w.s $f6, $f7
+0x00 0x3f 0x98 0x2c # CHECK: dadd $19, $1, $ra
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x66 0xda 0xee 0x16 # CHECK: daddiu $26, $22, -4586
+0x00 0x3f 0x98 0x2d # CHECK: daddu $19, $1, $ra
+0x64 0x58 0x46 0x9f # CHECK: daddiu $24, $2, 18079
+0x66 0x73 0x69 0x3f # CHECK: daddiu $19, $19, 26943
+0x70 0xd2 0x90 0x25 # CHECK: dclo $18, $6
+0x73 0x30 0x80 0x24 # CHECK: dclz $16, $25
+0x03 0x53 0x00 0x1e # CHECK: ddiv $zero, $26, $19
+0x02 0x11 0x00 0x1f # CHECK: ddivu $zero, $16, $17
+0x44 0x2c 0x68 0x00 # CHECK: dmfc1 $12, $f13
+0x44 0xb0 0x70 0x00 # CHECK: dmtc1 $16, $f14
+0x02 0xe9 0x00 0x1c # CHECK: dmult $23, $9
+0x00 0xa6 0x00 0x1d # CHECK: dmultu $5, $6
+0x00 0x00 0x04 0xb8 # CHECK: dsll $zero, $zero, 18
+0x00 0x14 0x04 0xb8 # CHECK: dsll $zero, $20, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x1c 0xe2 0xbb # CHECK: dsra $gp, $gp, 10
+0x00 0x12 0xe2 0xbb # CHECK: dsra $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x1c 0xe2 0xbf # CHECK: dsra32 $gp, $gp, 10
+0x00 0x12 0xe2 0xbf # CHECK: dsra32 $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x13 0x9d 0xfa # CHECK: dsrl $19, $19, 23
+0x00 0x06 0x9d 0xfa # CHECK: dsrl $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x00 0x13 0x9d 0xfe # CHECK: dsrl32 $19, $19, 23
+0x00 0x06 0x9d 0xfe # CHECK: dsrl32 $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x02 0xc8 0x38 0x2e # CHECK: dsub $7, $22, $8
+0x7c 0x0e 0x18 0xa4 # CHECK: dsbh $3, $14
+0x7c 0x1d 0x11 0x64 # CHECK: dshd $2, $sp
+0x62 0x9d 0x6c 0x39 # CHECK: daddi $sp, $20, 27705
+0x63 0xbd 0x6c 0x39 # CHECK: daddi $sp, $sp, 27705
+0x00 0xba 0x28 0x2f # CHECK: dsubu $5, $5, $26
+0x65 0x6f 0xec 0x5f # CHECK: daddiu $15, $11, -5025
+0x65 0xce 0x11 0xea # CHECK: daddiu $14, $14, 4586
+0x41 0x7e 0x60 0x00 # CHECK: di $fp
+0x41 0x60 0x60 0x00 # CHECK: di
+0x00 0x21 0x0b 0xfa # CHECK: drotr $1, $1, 15
+0x00 0x2e 0x0b 0xfa # CHECK: drotr $1, $14, 15
+0x00 0x21 0x0b 0xfe # CHECK: drotr32 $1, $1, 15
+0x00 0x2e 0x0b 0xfe # CHECK: drotr32 $1, $14, 15
+0x01 0xee 0x08 0x56 # CHECK: drotrv $1, $14, $15
+0x41 0x6e 0x60 0x20 # CHECK: ei $14
+0x41 0x60 0x60 0x20 # CHECK: ei
+0x46 0x20 0x3e 0x8b # CHECK: floor.l.d $f26, $f7
+0x46 0x00 0x2b 0x0b # CHECK: floor.l.s $f12, $f5
+0x46 0x20 0x73 0x0f # CHECK: floor.w.d $f12, $f14
+0x46 0x00 0x39 0x8f # CHECK: floor.w.s $f6, $f7
+0x7d 0x33 0x61 0x84 # CHECK: ins $19, $9, 6, 7
+0x08 0x00 0x01 0x4c # CHECK: j 1328
+0x0c 0x00 0x01 0x4c # CHECK: jal 1328
+0x74 0x00 0x01 0x4c # CHECK: jalx 1328
+0x00 0xe0 0xf8 0x09 # CHECK: jalr $7
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x00 0xe0 0x00 0x08 # CHECK: jr $7
+0x80 0xa4 0x23 0xc6 # CHECK: lb $4, 9158($5)
+0x90 0xa4 0x00 0x06 # CHECK: lbu $4, 6($5)
+0xde 0x3d 0x90 0x1b # CHECK: ld $sp, -28645($17)
+0x6b 0x18 0xef 0xb9 # CHECK: ldl $24, -4167($24)
+0x6e 0x8e 0x89 0x6a # CHECK: ldr $14, -30358($20)
+0xd4 0xe9 0x23 0xc6 # CHECK: ldc1 $f9, 9158($7)
+0x4d 0xf7 0x02 0x01 # CHECK: ldxc1 $f8, $23($15)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0xc0 0xe9 0x23 0xc6 # CHECK: ll $9, 9158($7)
+0xd3 0xe0 0xc6 0x70 # CHECK: lld $zero, -14736($ra)
+0x3c 0x06 0x45 0x67 # CHECK: lui $6, 17767
+0x4c 0xa6 0x00 0x05 # CHECK: luxc1 $f0, $6($5)
+0x8c 0xa4 0x00 0x18 # CHECK: lw $4, 24($5)
+0xc4 0xe9 0x23 0xc6 # CHECK: lwc1 $f9, 9158($7)
+0x88 0x82 0x00 0x03 # CHECK: lwl $2, 3($4)
+0x98 0xa3 0x00 0x10 # CHECK: lwr $3, 16($5)
+0x4d 0xcc 0x05 0x00 # CHECK: lwxc1 $f20, $12($14)
+0x9c 0x73 0xa1 0xea # CHECK: lwu $19, -24086($3)
+0x70 0xc7 0x00 0x00 # CHECK: madd $6, $7
+0x4f 0xf9 0x98 0x60 # CHECK: madd.s $f1, $f31, $f19, $f25
+0x70 0xc7 0x00 0x01 # CHECK: maddu $6, $7
+0x44 0x06 0x38 0x00 # CHECK: mfc1 $6, $f7
+0x00 0x00 0x28 0x10 # CHECK: mfhi $5
+0x44 0x7e 0xc0 0x00 # CHECK: mfhc1 $fp, $f24
+0x00 0x00 0x28 0x12 # CHECK: mflo $5
+0x46 0x20 0x41 0x86 # CHECK: mov.d $f6, $f8
+0x46 0x00 0x39 0x86 # CHECK: mov.s $f6, $f7
+0x70 0xc7 0x00 0x04 # CHECK: msub $6, $7
+0x4e 0x70 0x53 0x28 # CHECK: msub.s $f12, $f19, $f10, $f16
+0x70 0xc7 0x00 0x05 # CHECK: msubu $6, $7
+0x44 0x86 0x38 0x00 # CHECK: mtc1 $6, $f7
+0x00 0xe0 0x00 0x11 # CHECK: mthi $7
+0x44 0xe0 0x80 0x00 # CHECK: mthc1 $zero, $f16
+0x00 0xe0 0x00 0x13 # CHECK: mtlo $7
+0x46 0x2e 0x62 0x02 # CHECK: mul.d $f8, $f12, $f14
+0x46 0x07 0x32 0x42 # CHECK: mul.s $f9, $f6, $f7
+0x70 0xc7 0x48 0x02 # CHECK: mul $9, $6, $7
+0x00 0x65 0x00 0x18 # CHECK: mult $3, $5
+0x00 0x65 0x00 0x19 # CHECK: multu $3, $5
+0x46 0x20 0x73 0x07 # CHECK: neg.d $f12, $f14
+0x46 0x00 0x39 0x87 # CHECK: neg.s $f6, $f7
+0x4c 0xac 0xc8 0x30 # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0xc7 0x48 0x27 # CHECK: nor $9, $6, $7
+0x4f 0x04 0x98 0x78 # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x00 0x65 0x18 0x25 # CHECK: or $3, $3, $5
+0x34 0xc9 0x45 0x67 # CHECK: ori $9, $6, 17767
+0x00 0x26 0x49 0xc2 # CHECK: rotr $9, $6, 7
+0x00 0xe6 0x48 0x46 # CHECK: rotrv $9, $6, $7
+0x46 0x20 0x0b 0x08 # CHECK: round.l.d $f12, $f1
+0x46 0x00 0x2e 0x48 # CHECK: round.l.s $f25, $f5
+0x46 0x20 0x73 0x0c # CHECK: round.w.d $f12, $f14
+0x46 0x00 0x39 0x8c # CHECK: round.w.s $f6, $f7
+0xa0 0xa4 0x23 0xc6 # CHECK: sb $4, 9158($5)
+0xa0 0xa4 0x00 0x06 # CHECK: sb $4, 6($5)
+0xe0 0xe9 0x23 0xc6 # CHECK: sc $9, 9158($7)
+0xf3 0xaf 0xdf 0xcd # CHECK: scd $15, -8243($sp)
+0xfd 0x4c 0x16 0xcb # CHECK: sd $12, 5835($10)
+0xf4 0xe9 0x23 0xc6 # CHECK: sdc1 $f9, 9158($7)
+0xb3 0xc7 0xae 0x1f # CHECK: sdl $7, -20961($fp)
+0xb5 0x8b 0xb0 0x39 # CHECK: sdr $11, -20423($12)
+0x4f 0x24 0x40 0x09 # CHECK: sdxc1 $f8, $4($25)
+0x7c 0x07 0x34 0x20 # CHECK: seb $6, $7
+0x7c 0x07 0x36 0x20 # CHECK: seh $6, $7
+0xa4 0xa4 0x23 0xc6 # CHECK: sh $4, 9158($5)
+0x00 0x03 0x21 0xc0 # CHECK: sll $4, $3, 7
+0x00 0xa3 0x10 0x04 # CHECK: sllv $2, $3, $5
+0x00 0x65 0x18 0x2a # CHECK: slt $3, $3, $5
+0x28 0x63 0x00 0x67 # CHECK: slti $3, $3, 103
+0x2c 0x63 0x00 0x67 # CHECK: sltiu $3, $3, 103
+0x00 0x65 0x18 0x2b # CHECK: sltu $3, $3, $5
+0x46 0x20 0x73 0x04 # CHECK: sqrt.d $f12, $f14
+0x46 0x00 0x39 0x84 # CHECK: sqrt.s $f6, $f7
+0x00 0x03 0x21 0xc3 # CHECK: sra $4, $3, 7
+0x00 0xa3 0x10 0x07 # CHECK: srav $2, $3, $5
+0x00 0x03 0x21 0xc2 # CHECK: srl $4, $3, 7
+0x00 0xa3 0x10 0x06 # CHECK: srlv $2, $3, $5
+0x46 0x2e 0x62 0x01 # CHECK: sub.d $f8, $f12, $f14
+0x46 0x07 0x32 0x41 # CHECK: sub.s $f9, $f6, $f7
+0x00 0xc7 0x48 0x22 # CHECK: sub $9, $6, $7
+0x00 0x65 0x20 0x23 # CHECK: subu $4, $3, $5
+0x4c 0xb8 0x20 0x0d # CHECK: suxc1 $f4, $24($5)
+0xac 0xa4 0x00 0x18 # CHECK: sw $4, 24($5)
+0xe4 0xe9 0x23 0xc6 # CHECK: swc1 $f9, 9158($7)
+0xa8 0xa4 0x00 0x10 # CHECK: swl $4, 16($5)
+0xb8 0xe6 0x00 0x10 # CHECK: swr $6, 16($7)
+0x4e 0xd2 0xd0 0x08 # CHECK: swxc1 $f26, $18($22)
+0x00 0x00 0x01 0xcf # CHECK: sync 7
+0x46 0x20 0xbd 0xc9 # CHECK: trunc.l.d $f23, $f23
+0x46 0x00 0xff 0x09 # CHECK: trunc.l.s $f28, $f31
+0x46 0x20 0x73 0x0d # CHECK: trunc.w.d $f12, $f14
+0x46 0x00 0x39 0x8d # CHECK: trunc.w.s $f6, $f7
+0x7c 0x07 0x30 0xa0 # CHECK: wsbh $6, $7
+0x00 0x65 0x18 0x26 # CHECK: xor $3, $3, $5
+0x38 0xc9 0x45 0x67 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips64r2/valid-xfail-mips64r2.txt b/test/MC/Disassembler/Mips/mips64r2/valid-xfail-mips64r2.txt
new file mode 100644
index 0000000..cbc2eee
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r2/valid-xfail-mips64r2.txt
@@ -0,0 +1,76 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -disassemble -mcpu=mips64r2 | FileCheck %s
+# XFAIL: *
+0x46 0x2f 0x79 0x32 # CHECK: c.eq.d $fcc1, $f15, $f15
+0x46 0x11 0xc5 0x32 # CHECK: c.eq.s $fcc5, $f24, $f17
+0x46 0x35 0x5c 0x30 # CHECK: c.f.d $fcc4, $f11, $f21
+0x46 0x07 0xf4 0x30 # CHECK: c.f.s $fcc4, $f30, $f7
+0x46 0x21 0x94 0x3e # CHECK: c.le.d $fcc4, $f18, $f1
+0x46 0x04 0xc6 0x3e # CHECK: c.le.s $fcc6, $f24, $f4
+0x46 0x23 0x4b 0x3c # CHECK: c.lt.d $fcc3, $f9, $f3
+0x46 0x0e 0x8a 0x3c # CHECK: c.lt.s $fcc2, $f17, $f14
+0x46 0x30 0xad 0x3d # CHECK: c.nge.d $fcc5, $f21, $f16
+0x46 0x08 0x5b 0x3d # CHECK: c.nge.s $fcc3, $f11, $f8
+0x46 0x17 0xfa 0x3b # CHECK: c.ngl.s $fcc2, $f31, $f23
+0x46 0x17 0x92 0x39 # CHECK: c.ngle.s $fcc2, $f18, $f23
+0x46 0x27 0xc4 0x3f # CHECK: c.ngt.d $fcc4, $f24, $f7
+0x46 0x0d 0x45 0x3f # CHECK: c.ngt.s $fcc5, $f8, $f13
+0x46 0x3f 0x82 0x36 # CHECK: c.ole.d $fcc2, $f16, $f31
+0x46 0x14 0x3b 0x36 # CHECK: c.ole.s $fcc3, $f7, $f20
+0x46 0x3c 0x9c 0x34 # CHECK: c.olt.d $fcc4, $f19, $f28
+0x46 0x07 0xa6 0x34 # CHECK: c.olt.s $fcc6, $f20, $f7
+0x46 0x27 0xfc 0x3a # CHECK: c.seq.d $fcc4, $f31, $f7
+0x46 0x19 0x0f 0x3a # CHECK: c.seq.s $fcc7, $f1, $f25
+0x46 0x39 0x6c 0x33 # CHECK: c.ueq.d $fcc4, $f13, $f25
+0x46 0x1e 0x1e 0x33 # CHECK: c.ueq.s $fcc6, $f3, $f30
+0x46 0x32 0xcf 0x37 # CHECK: c.ule.d $fcc7, $f25, $f18
+0x46 0x1e 0xaf 0x37 # CHECK: c.ule.s $fcc7, $f21, $f30
+0x46 0x31 0x36 0x35 # CHECK: c.ult.d $fcc6, $f6, $f17
+0x46 0x0a 0xc7 0x35 # CHECK: c.ult.s $fcc7, $f24, $f10
+0x46 0x38 0xbe 0x31 # CHECK: c.un.d $fcc6, $f23, $f24
+0x46 0x04 0xf1 0x31 # CHECK: c.un.s $fcc1, $f30, $f4
+0x46 0xc0 0x45 0x85 # CHECK: abs.ps $f22, $f8
+0x46 0xcc 0xc6 0x00 # CHECK: add.ps $f24, $f24, $f12
+0x46 0xca 0x04 0x32 # CHECK: c.eq.ps $fcc4, $f0, $f10
+0x46 0xcc 0x66 0x30 # CHECK: c.f.ps $fcc6, $f12, $f12
+0x46 0xd4 0x42 0x3e # CHECK: c.le.ps $fcc2, $f8, $f20
+0x46 0xc4 0x90 0x3c # CHECK: c.lt.ps $f18, $f4
+0x46 0xda 0x10 0x3d # CHECK: c.nge.ps $f2, $f26
+0x46 0xde 0xb0 0x3b # CHECK: c.ngl.ps $f22, $f30
+0x46 0xd4 0x66 0x39 # CHECK: c.ngle.ps $fcc6, $f12, $f20
+0x46 0xc6 0xf6 0x3f # CHECK: c.ngt.ps $fcc6, $f30, $f6
+0x46 0xc8 0xa6 0x36 # CHECK: c.ole.ps $fcc6, $f20, $f8
+0x46 0xd0 0x32 0x34 # CHECK: c.olt.ps $fcc2, $f6, $f16
+0x46 0xce 0xf6 0x3a # CHECK: c.seq.ps $fcc6, $f30, $f14
+0x46 0xc6 0x26 0x38 # CHECK: c.sf.ps $fcc6, $f4, $f6
+0x46 0xdc 0x20 0x33 # CHECK: c.ueq.ps $f4, $f28
+0x46 0xc2 0x86 0x37 # CHECK: c.ule.ps $fcc6, $f16, $f2
+0x46 0xc0 0x76 0x35 # CHECK: c.ult.ps $fcc6, $f14, $f0
+0x46 0xda 0x14 0x31 # CHECK: c.un.ps $fcc4, $f2, $f26
+0x46 0xc0 0x17 0xa8 # CHECK: cvt.s.pl $f30, $f2
+0x46 0xc0 0xd3 0xa0 # CHECK: cvt.s.pu $f14, $f26
+0x4e 0x94 0xd4 0xa1 # CHECK: madd.d $f18, $f20, $f26, $f20
+0x4c 0x42 0x75 0xa6 # CHECK: madd.ps $f22, $f2, $f14, $f2
+0x46 0xc0 0x85 0x86 # CHECK: mov.ps $f22, $f16
+0x46 0xd8 0xe2 0x91 # CHECK: movf.ps $f10, $f28, $fcc6
+0x46 0xd3 0xf7 0x93 # CHECK: movn.ps $f30, $f30, s3
+0x46 0xc9 0xc5 0x11 # CHECK: movt.ps $f20, $f24, $fcc2
+0x46 0xdf 0x84 0x92 # CHECK: movz.ps $f18, $f16, ra
+0x4c 0x52 0xf2 0xa9 # CHECK: msub.d $f10, $f2, $f30, $f18
+0x4d 0xd0 0xe3 0x2e # CHECK: msub.ps $f12, $f14, $f28, $f16
+0x46 0xc0 0x64 0x87 # CHECK: neg.ps $f18, $f12
+0x4d 0x54 0x74 0xb1 # CHECK: nmadd.d $f18, $f10, $f14, $f20
+0x4c 0x98 0x46 0xb6 # CHECK: nmadd.ps $f26, $f4, $f8, $f24
+0x4d 0x1e 0x87 0xb9 # CHECK: nmsub.d $f30, $f8, $f16, $f30
+0x4d 0x90 0x71 0xbe # CHECK: nmsub.ps $f6, $f12, $f14, $f16
+0x46 0xde 0x46 0x2c # CHECK: pll.ps $f24, $f8, $f30
+0x46 0xdc 0xd0 0x2d # CHECK: plu.ps $f0, $f26, $f28
+0x46 0xda 0xf2 0x2e # CHECK: pul.ps $f8, $f30, $f26
+0x46 0xc2 0x46 0x2f # CHECK: puu.ps $f24, $f8, $f2
+0x41 0x49 0x98 0x00 # CHECK: rdpgpr s3, t1
+0x46 0x20 0x34 0x95 # CHECK: recip.d $f18, $f6
+0x46 0x00 0xf0 0xd5 # CHECK: recip.s $f3, $f30
+0x02 0xa7 0x68 0x46 # CHECK: rorv t5, a3, s5
+0x46 0x20 0xe0 0x96 # CHECK: rsqrt.d $f2, $f28
+0x46 0x00 0x41 0x16 # CHECK: rsqrt.s $f4, $f8
+0x46 0xda 0x71 0x01 # CHECK: sub.ps $f4, $f14, $f26
+0x41 0xcd 0x00 0x00 # CHECK: wrpgpr zero, t5
diff --git a/test/MC/Disassembler/Mips/mips64r3/valid-mips64r3-el.txt b/test/MC/Disassembler/Mips/mips64r3/valid-mips64r3-el.txt
new file mode 100644
index 0000000..52374af2
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r3/valid-mips64r3-el.txt
@@ -0,0 +1,234 @@
+# RUN: llvm-mc --disassemble %s -triple=mips64el-unknown-linux -mcpu=mips64r3 | FileCheck %s
+# CHECK: .text
+0x05 0x73 0x20 0x46 # CHECK: abs.d $f12, $f14
+0x85 0x39 0x00 0x46 # CHECK: abs.s $f6, $f7
+0x20 0x48 0xc7 0x00 # CHECK: add $9, $6, $7
+0x00 0x62 0x2e 0x46 # CHECK: add.d $f8, $f12, $f14
+0x40 0x32 0x07 0x46 # CHECK: add.s $f9, $f6, $f7
+0x67 0x45 0xc9 0x20 # CHECK: addi $9, $6, 17767
+0x67 0xc5 0xc9 0x24 # CHECK: addiu $9, $6, -15001
+0x21 0x48 0xc7 0x00 # CHECK: addu $9, $6, $7
+0x24 0x48 0xc7 0x00 # CHECK: and $9, $6, $7
+0x67 0x45 0xc9 0x30 # CHECK: andi $9, $6, 17767
+0x4c 0x01 0x00 0x10 # CHECK: b 1332
+0x4c 0x01 0x00 0x45 # CHECK: bc1f 1332
+0x4c 0x01 0x1c 0x45 # CHECK: bc1f $fcc7, 1332
+0x4c 0x01 0x01 0x45 # CHECK: bc1t 1332
+0x4c 0x01 0x1d 0x45 # CHECK: bc1t $fcc7, 1332
+0x4c 0x01 0x26 0x11 # CHECK: beq $9, $6, 1332
+0x4c 0x01 0xc1 0x04 # CHECK: bgez $6, 1332
+0x4c 0x01 0xd1 0x04 # CHECK: bgezal $6, 1332
+0x4c 0x01 0xc0 0x1c # CHECK: bgtz $6, 1332
+0x4c 0x01 0xc0 0x18 # CHECK: blez $6, 1332
+0x4c 0x01 0x26 0x15 # CHECK: bne $9, $6, 1332
+0x32 0x60 0x2e 0x46 # CHECK: c.eq.d $f12, $f14
+0x32 0x30 0x07 0x46 # CHECK: c.eq.s $f6, $f7
+0x30 0x60 0x2e 0x46 # CHECK: c.f.d $f12, $f14
+0x30 0x30 0x07 0x46 # CHECK: c.f.s $f6, $f7
+0x3e 0x60 0x2e 0x46 # CHECK: c.le.d $f12, $f14
+0x3e 0x30 0x07 0x46 # CHECK: c.le.s $f6, $f7
+0x3c 0x60 0x2e 0x46 # CHECK: c.lt.d $f12, $f14
+0x3c 0x30 0x07 0x46 # CHECK: c.lt.s $f6, $f7
+0x3d 0x60 0x2e 0x46 # CHECK: c.nge.d $f12, $f14
+0x3d 0x30 0x07 0x46 # CHECK: c.nge.s $f6, $f7
+0x3b 0x60 0x2e 0x46 # CHECK: c.ngl.d $f12, $f14
+0x3b 0x30 0x07 0x46 # CHECK: c.ngl.s $f6, $f7
+0x39 0x60 0x2e 0x46 # CHECK: c.ngle.d $f12, $f14
+0x39 0x30 0x07 0x46 # CHECK: c.ngle.s $f6, $f7
+0x3f 0x60 0x2e 0x46 # CHECK: c.ngt.d $f12, $f14
+0x3f 0x30 0x07 0x46 # CHECK: c.ngt.s $f6, $f7
+0x36 0x60 0x2e 0x46 # CHECK: c.ole.d $f12, $f14
+0x36 0x30 0x07 0x46 # CHECK: c.ole.s $f6, $f7
+0x34 0x60 0x2e 0x46 # CHECK: c.olt.d $f12, $f14
+0x34 0x30 0x07 0x46 # CHECK: c.olt.s $f6, $f7
+0x3a 0x60 0x2e 0x46 # CHECK: c.seq.d $f12, $f14
+0x3a 0x30 0x07 0x46 # CHECK: c.seq.s $f6, $f7
+0x38 0x60 0x2e 0x46 # CHECK: c.sf.d $f12, $f14
+0x38 0x30 0x07 0x46 # CHECK: c.sf.s $f6, $f7
+0x33 0x60 0x2e 0x46 # CHECK: c.ueq.d $f12, $f14
+0x33 0xe0 0x12 0x46 # CHECK: c.ueq.s $f28, $f18
+0x37 0x60 0x2e 0x46 # CHECK: c.ule.d $f12, $f14
+0x37 0x30 0x07 0x46 # CHECK: c.ule.s $f6, $f7
+0x35 0x60 0x2e 0x46 # CHECK: c.ult.d $f12, $f14
+0x35 0x30 0x07 0x46 # CHECK: c.ult.s $f6, $f7
+0x31 0x60 0x2e 0x46 # CHECK: c.un.d $f12, $f14
+0x31 0x30 0x07 0x46 # CHECK: c.un.s $f6, $f7
+0x4a 0x18 0x20 0x46 # CHECK: ceil.l.d $f1, $f3
+0x8a 0x6c 0x00 0x46 # CHECK: ceil.l.s $f18, $f13
+0x0e 0x73 0x20 0x46 # CHECK: ceil.w.d $f12, $f14
+0x8e 0x39 0x00 0x46 # CHECK: ceil.w.s $f6, $f7
+0x00 0x38 0x46 0x44 # CHECK: cfc1 $6, $7
+0x21 0x30 0xe6 0x70 # CHECK: clo $6, $7
+0x20 0x30 0xe6 0x70 # CHECK: clz $6, $7
+0x00 0x38 0xc6 0x44 # CHECK: ctc1 $6, $7
+0x21 0x81 0xa0 0x46 # CHECK: cvt.d.l $f4, $f16
+0xa1 0x39 0x00 0x46 # CHECK: cvt.d.s $f6, $f7
+0x21 0x73 0x80 0x46 # CHECK: cvt.d.w $f12, $f14
+0x25 0x73 0x20 0x46 # CHECK: cvt.l.d $f12, $f14
+0xa5 0x39 0x00 0x46 # CHECK: cvt.l.s $f6, $f7
+0xe0 0xf3 0xa0 0x46 # CHECK: cvt.s.l $f15, $f30
+0x20 0x73 0x20 0x46 # CHECK: cvt.s.d $f12, $f14
+0xa0 0x39 0x80 0x46 # CHECK: cvt.s.w $f6, $f7
+0x24 0x73 0x20 0x46 # CHECK: cvt.w.d $f12, $f14
+0xa4 0x39 0x00 0x46 # CHECK: cvt.w.s $f6, $f7
+0x2c 0x98 0x3f 0x00 # CHECK: dadd $19, $1, $ra
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0x16 0xee 0xda 0x66 # CHECK: daddiu $26, $22, -4586
+0x2d 0x98 0x3f 0x00 # CHECK: daddu $19, $1, $ra
+0x9f 0x46 0x58 0x64 # CHECK: daddiu $24, $2, 18079
+0x3f 0x69 0x73 0x66 # CHECK: daddiu $19, $19, 26943
+0x25 0x90 0xd2 0x70 # CHECK: dclo $18, $6
+0x24 0x80 0x30 0x73 # CHECK: dclz $16, $25
+0x1e 0x00 0x53 0x03 # CHECK: ddiv $zero, $26, $19
+0x1f 0x00 0x11 0x02 # CHECK: ddivu $zero, $16, $17
+0x00 0x68 0x2c 0x44 # CHECK: dmfc1 $12, $f13
+0x00 0x70 0xb0 0x44 # CHECK: dmtc1 $16, $f14
+0x1c 0x00 0xe9 0x02 # CHECK: dmult $23, $9
+0x1d 0x00 0xa6 0x00 # CHECK: dmultu $5, $6
+0xb8 0x04 0x00 0x00 # CHECK: dsll $zero, $zero, 18
+0xb8 0x04 0x14 0x00 # CHECK: dsll $zero, $20, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbb 0xe2 0x1c 0x00 # CHECK: dsra $gp, $gp, 10
+0xbb 0xe2 0x12 0x00 # CHECK: dsra $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xbf 0xe2 0x1c 0x00 # CHECK: dsra32 $gp, $gp, 10
+0xbf 0xe2 0x12 0x00 # CHECK: dsra32 $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xfa 0x9d 0x13 0x00 # CHECK: dsrl $19, $19, 23
+0xfa 0x9d 0x06 0x00 # CHECK: dsrl $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0xfe 0x9d 0x13 0x00 # CHECK: dsrl32 $19, $19, 23
+0xfe 0x9d 0x06 0x00 # CHECK: dsrl32 $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0x2e 0x38 0xc8 0x02 # CHECK: dsub $7, $22, $8
+0xa4 0x18 0x0e 0x7c # CHECK: dsbh $3, $14
+0x64 0x11 0x1d 0x7c # CHECK: dshd $2, $sp
+0x39 0x6c 0x9d 0x62 # CHECK: daddi $sp, $20, 27705
+0x39 0x6c 0xbd 0x63 # CHECK: daddi $sp, $sp, 27705
+0x2f 0x28 0xba 0x00 # CHECK: dsubu $5, $5, $26
+0x5f 0xec 0x6f 0x65 # CHECK: daddiu $15, $11, -5025
+0xea 0x11 0xce 0x65 # CHECK: daddiu $14, $14, 4586
+0x00 0x60 0x7e 0x41 # CHECK: di $fp
+0x00 0x60 0x60 0x41 # CHECK: di
+0xfa 0x0b 0x21 0x00 # CHECK: drotr $1, $1, 15
+0xfa 0x0b 0x2e 0x00 # CHECK: drotr $1, $14, 15
+0xfe 0x0b 0x21 0x00 # CHECK: drotr32 $1, $1, 15
+0xfe 0x0b 0x2e 0x00 # CHECK: drotr32 $1, $14, 15
+0x56 0x08 0xee 0x01 # CHECK: drotrv $1, $14, $15
+0x20 0x60 0x6e 0x41 # CHECK: ei $14
+0x20 0x60 0x60 0x41 # CHECK: ei
+0x8b 0x3e 0x20 0x46 # CHECK: floor.l.d $f26, $f7
+0x0b 0x2b 0x00 0x46 # CHECK: floor.l.s $f12, $f5
+0x0f 0x73 0x20 0x46 # CHECK: floor.w.d $f12, $f14
+0x8f 0x39 0x00 0x46 # CHECK: floor.w.s $f6, $f7
+0x84 0x61 0x33 0x7d # CHECK: ins $19, $9, 6, 7
+0x4c 0x01 0x00 0x08 # CHECK: j 1328
+0x4c 0x01 0x00 0x0c # CHECK: jal 1328
+0x4c 0x01 0x00 0x74 # CHECK: jalx 1328
+0x09 0xf8 0xe0 0x00 # CHECK: jalr $7
+0x09 0xfc 0x80 0x00 # CHECK: jalr.hb $4
+0x09 0x24 0xa0 0x00 # CHECK: jalr.hb $4, $5
+0x08 0x00 0xe0 0x00 # CHECK: jr $7
+0xc6 0x23 0xa4 0x80 # CHECK: lb $4, 9158($5)
+0x06 0x00 0xa4 0x90 # CHECK: lbu $4, 6($5)
+0x1b 0x90 0x3d 0xde # CHECK: ld $sp, -28645($17)
+0xb9 0xef 0x18 0x6b # CHECK: ldl $24, -4167($24)
+0x6a 0x89 0x8e 0x6e # CHECK: ldr $14, -30358($20)
+0xc6 0x23 0xe9 0xd4 # CHECK: ldc1 $f9, 9158($7)
+0x01 0x02 0xf7 0x4d # CHECK: ldxc1 $f8, $23($15)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0xc6 0x23 0xe9 0xc0 # CHECK: ll $9, 9158($7)
+0x70 0xc6 0xe0 0xd3 # CHECK: lld $zero, -14736($ra)
+0x67 0x45 0x06 0x3c # CHECK: lui $6, 17767
+0x05 0x00 0xa6 0x4c # CHECK: luxc1 $f0, $6($5)
+0x18 0x00 0xa4 0x8c # CHECK: lw $4, 24($5)
+0xc6 0x23 0xe9 0xc4 # CHECK: lwc1 $f9, 9158($7)
+0x03 0x00 0x82 0x88 # CHECK: lwl $2, 3($4)
+0x10 0x00 0xa3 0x98 # CHECK: lwr $3, 16($5)
+0x00 0x05 0xcc 0x4d # CHECK: lwxc1 $f20, $12($14)
+0xea 0xa1 0x73 0x9c # CHECK: lwu $19, -24086($3)
+0x00 0x00 0xc7 0x70 # CHECK: madd $6, $7
+0x60 0x98 0xf9 0x4f # CHECK: madd.s $f1, $f31, $f19, $f25
+0x01 0x00 0xc7 0x70 # CHECK: maddu $6, $7
+0x00 0x38 0x06 0x44 # CHECK: mfc1 $6, $f7
+0x10 0x28 0x00 0x00 # CHECK: mfhi $5
+0x00 0xc0 0x7e 0x44 # CHECK: mfhc1 $fp, $f24
+0x12 0x28 0x00 0x00 # CHECK: mflo $5
+0x86 0x41 0x20 0x46 # CHECK: mov.d $f6, $f8
+0x86 0x39 0x00 0x46 # CHECK: mov.s $f6, $f7
+0x04 0x00 0xc7 0x70 # CHECK: msub $6, $7
+0x28 0x53 0x70 0x4e # CHECK: msub.s $f12, $f19, $f10, $f16
+0x05 0x00 0xc7 0x70 # CHECK: msubu $6, $7
+0x00 0x38 0x86 0x44 # CHECK: mtc1 $6, $f7
+0x11 0x00 0xe0 0x00 # CHECK: mthi $7
+0x00 0x80 0xe0 0x44 # CHECK: mthc1 $zero, $f16
+0x13 0x00 0xe0 0x00 # CHECK: mtlo $7
+0x02 0x62 0x2e 0x46 # CHECK: mul.d $f8, $f12, $f14
+0x42 0x32 0x07 0x46 # CHECK: mul.s $f9, $f6, $f7
+0x02 0x48 0xc7 0x70 # CHECK: mul $9, $6, $7
+0x18 0x00 0x65 0x00 # CHECK: mult $3, $5
+0x19 0x00 0x65 0x00 # CHECK: multu $3, $5
+0x07 0x73 0x20 0x46 # CHECK: neg.d $f12, $f14
+0x87 0x39 0x00 0x46 # CHECK: neg.s $f6, $f7
+0x30 0xc8 0xac 0x4c # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x48 0xc7 0x00 # CHECK: nor $9, $6, $7
+0x78 0x98 0x04 0x4f # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x25 0x18 0x65 0x00 # CHECK: or $3, $3, $5
+0x67 0x45 0xc9 0x34 # CHECK: ori $9, $6, 17767
+0xc2 0x49 0x26 0x00 # CHECK: rotr $9, $6, 7
+0x46 0x48 0xe6 0x00 # CHECK: rotrv $9, $6, $7
+0x08 0x0b 0x20 0x46 # CHECK: round.l.d $f12, $f1
+0x48 0x2e 0x00 0x46 # CHECK: round.l.s $f25, $f5
+0x0c 0x73 0x20 0x46 # CHECK: round.w.d $f12, $f14
+0x8c 0x39 0x00 0x46 # CHECK: round.w.s $f6, $f7
+0xc6 0x23 0xa4 0xa0 # CHECK: sb $4, 9158($5)
+0x06 0x00 0xa4 0xa0 # CHECK: sb $4, 6($5)
+0xc6 0x23 0xe9 0xe0 # CHECK: sc $9, 9158($7)
+0xcd 0xdf 0xaf 0xf3 # CHECK: scd $15, -8243($sp)
+0xcb 0x16 0x4c 0xfd # CHECK: sd $12, 5835($10)
+0xc6 0x23 0xe9 0xf4 # CHECK: sdc1 $f9, 9158($7)
+0x1f 0xae 0xc7 0xb3 # CHECK: sdl $7, -20961($fp)
+0x39 0xb0 0x8b 0xb5 # CHECK: sdr $11, -20423($12)
+0x09 0x40 0x24 0x4f # CHECK: sdxc1 $f8, $4($25)
+0x20 0x34 0x07 0x7c # CHECK: seb $6, $7
+0x20 0x36 0x07 0x7c # CHECK: seh $6, $7
+0xc6 0x23 0xa4 0xa4 # CHECK: sh $4, 9158($5)
+0xc0 0x21 0x03 0x00 # CHECK: sll $4, $3, 7
+0x04 0x10 0xa3 0x00 # CHECK: sllv $2, $3, $5
+0x2a 0x18 0x65 0x00 # CHECK: slt $3, $3, $5
+0x67 0x00 0x63 0x28 # CHECK: slti $3, $3, 103
+0x67 0x00 0x63 0x2c # CHECK: sltiu $3, $3, 103
+0x2b 0x18 0x65 0x00 # CHECK: sltu $3, $3, $5
+0x04 0x73 0x20 0x46 # CHECK: sqrt.d $f12, $f14
+0x84 0x39 0x00 0x46 # CHECK: sqrt.s $f6, $f7
+0xc3 0x21 0x03 0x00 # CHECK: sra $4, $3, 7
+0x07 0x10 0xa3 0x00 # CHECK: srav $2, $3, $5
+0xc2 0x21 0x03 0x00 # CHECK: srl $4, $3, 7
+0x06 0x10 0xa3 0x00 # CHECK: srlv $2, $3, $5
+0x01 0x62 0x2e 0x46 # CHECK: sub.d $f8, $f12, $f14
+0x41 0x32 0x07 0x46 # CHECK: sub.s $f9, $f6, $f7
+0x22 0x48 0xc7 0x00 # CHECK: sub $9, $6, $7
+0x23 0x20 0x65 0x00 # CHECK: subu $4, $3, $5
+0x0d 0x20 0xb8 0x4c # CHECK: suxc1 $f4, $24($5)
+0x18 0x00 0xa4 0xac # CHECK: sw $4, 24($5)
+0xc6 0x23 0xe9 0xe4 # CHECK: swc1 $f9, 9158($7)
+0x10 0x00 0xa4 0xa8 # CHECK: swl $4, 16($5)
+0x10 0x00 0xe6 0xb8 # CHECK: swr $6, 16($7)
+0x08 0xd0 0xd2 0x4e # CHECK: swxc1 $f26, $18($22)
+0xcf 0x01 0x00 0x00 # CHECK: sync 7
+0xc9 0xbd 0x20 0x46 # CHECK: trunc.l.d $f23, $f23
+0x09 0xff 0x00 0x46 # CHECK: trunc.l.s $f28, $f31
+0x0d 0x73 0x20 0x46 # CHECK: trunc.w.d $f12, $f14
+0x8d 0x39 0x00 0x46 # CHECK: trunc.w.s $f6, $f7
+0xa0 0x30 0x07 0x7c # CHECK: wsbh $6, $7
+0x26 0x18 0x65 0x00 # CHECK: xor $3, $3, $5
+0x67 0x45 0xc9 0x38 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips64r3/valid-mips64r3.txt b/test/MC/Disassembler/Mips/mips64r3/valid-mips64r3.txt
new file mode 100644
index 0000000..5a427df
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r3/valid-mips64r3.txt
@@ -0,0 +1,234 @@
+# RUN: llvm-mc --disassemble %s -triple=mips64-unknown-linux -mcpu=mips64r3 | FileCheck %s
+# CHECK: .text
+0x46 0x20 0x73 0x05 # CHECK: abs.d $f12, $f14
+0x46 0x00 0x39 0x85 # CHECK: abs.s $f6, $f7
+0x00 0xc7 0x48 0x20 # CHECK: add $9, $6, $7
+0x46 0x2e 0x62 0x00 # CHECK: add.d $f8, $f12, $f14
+0x46 0x07 0x32 0x40 # CHECK: add.s $f9, $f6, $f7
+0x20 0xc9 0x45 0x67 # CHECK: addi $9, $6, 17767
+0x24 0xc9 0xc5 0x67 # CHECK: addiu $9, $6, -15001
+0x00 0xc7 0x48 0x21 # CHECK: addu $9, $6, $7
+0x00 0xc7 0x48 0x24 # CHECK: and $9, $6, $7
+0x30 0xc9 0x45 0x67 # CHECK: andi $9, $6, 17767
+0x10 0x00 0x01 0x4c # CHECK: b 1332
+0x45 0x00 0x01 0x4c # CHECK: bc1f 1332
+0x45 0x1c 0x01 0x4c # CHECK: bc1f $fcc7, 1332
+0x45 0x01 0x01 0x4c # CHECK: bc1t 1332
+0x45 0x1d 0x01 0x4c # CHECK: bc1t $fcc7, 1332
+0x11 0x26 0x01 0x4c # CHECK: beq $9, $6, 1332
+0x04 0xc1 0x01 0x4c # CHECK: bgez $6, 1332
+0x04 0xd1 0x01 0x4c # CHECK: bgezal $6, 1332
+0x1c 0xc0 0x01 0x4c # CHECK: bgtz $6, 1332
+0x18 0xc0 0x01 0x4c # CHECK: blez $6, 1332
+0x15 0x26 0x01 0x4c # CHECK: bne $9, $6, 1332
+0x46 0x2e 0x60 0x32 # CHECK: c.eq.d $f12, $f14
+0x46 0x07 0x30 0x32 # CHECK: c.eq.s $f6, $f7
+0x46 0x2e 0x60 0x30 # CHECK: c.f.d $f12, $f14
+0x46 0x07 0x30 0x30 # CHECK: c.f.s $f6, $f7
+0x46 0x2e 0x60 0x3e # CHECK: c.le.d $f12, $f14
+0x46 0x07 0x30 0x3e # CHECK: c.le.s $f6, $f7
+0x46 0x2e 0x60 0x3c # CHECK: c.lt.d $f12, $f14
+0x46 0x07 0x30 0x3c # CHECK: c.lt.s $f6, $f7
+0x46 0x2e 0x60 0x3d # CHECK: c.nge.d $f12, $f14
+0x46 0x07 0x30 0x3d # CHECK: c.nge.s $f6, $f7
+0x46 0x2e 0x60 0x3b # CHECK: c.ngl.d $f12, $f14
+0x46 0x07 0x30 0x3b # CHECK: c.ngl.s $f6, $f7
+0x46 0x2e 0x60 0x39 # CHECK: c.ngle.d $f12, $f14
+0x46 0x07 0x30 0x39 # CHECK: c.ngle.s $f6, $f7
+0x46 0x2e 0x60 0x3f # CHECK: c.ngt.d $f12, $f14
+0x46 0x07 0x30 0x3f # CHECK: c.ngt.s $f6, $f7
+0x46 0x2e 0x60 0x36 # CHECK: c.ole.d $f12, $f14
+0x46 0x07 0x30 0x36 # CHECK: c.ole.s $f6, $f7
+0x46 0x2e 0x60 0x34 # CHECK: c.olt.d $f12, $f14
+0x46 0x07 0x30 0x34 # CHECK: c.olt.s $f6, $f7
+0x46 0x2e 0x60 0x3a # CHECK: c.seq.d $f12, $f14
+0x46 0x07 0x30 0x3a # CHECK: c.seq.s $f6, $f7
+0x46 0x2e 0x60 0x38 # CHECK: c.sf.d $f12, $f14
+0x46 0x07 0x30 0x38 # CHECK: c.sf.s $f6, $f7
+0x46 0x2e 0x60 0x33 # CHECK: c.ueq.d $f12, $f14
+0x46 0x12 0xe0 0x33 # CHECK: c.ueq.s $f28, $f18
+0x46 0x2e 0x60 0x37 # CHECK: c.ule.d $f12, $f14
+0x46 0x07 0x30 0x37 # CHECK: c.ule.s $f6, $f7
+0x46 0x2e 0x60 0x35 # CHECK: c.ult.d $f12, $f14
+0x46 0x07 0x30 0x35 # CHECK: c.ult.s $f6, $f7
+0x46 0x2e 0x60 0x31 # CHECK: c.un.d $f12, $f14
+0x46 0x07 0x30 0x31 # CHECK: c.un.s $f6, $f7
+0x46 0x20 0x18 0x4a # CHECK: ceil.l.d $f1, $f3
+0x46 0x00 0x6c 0x8a # CHECK: ceil.l.s $f18, $f13
+0x46 0x20 0x73 0x0e # CHECK: ceil.w.d $f12, $f14
+0x46 0x00 0x39 0x8e # CHECK: ceil.w.s $f6, $f7
+0x44 0x46 0x38 0x00 # CHECK: cfc1 $6, $7
+0x70 0xe6 0x30 0x21 # CHECK: clo $6, $7
+0x70 0xe6 0x30 0x20 # CHECK: clz $6, $7
+0x44 0xc6 0x38 0x00 # CHECK: ctc1 $6, $7
+0x46 0xa0 0x81 0x21 # CHECK: cvt.d.l $f4, $f16
+0x46 0x00 0x39 0xa1 # CHECK: cvt.d.s $f6, $f7
+0x46 0x80 0x73 0x21 # CHECK: cvt.d.w $f12, $f14
+0x46 0x20 0x73 0x25 # CHECK: cvt.l.d $f12, $f14
+0x46 0x00 0x39 0xa5 # CHECK: cvt.l.s $f6, $f7
+0x46 0xa0 0xf3 0xe0 # CHECK: cvt.s.l $f15, $f30
+0x46 0x20 0x73 0x20 # CHECK: cvt.s.d $f12, $f14
+0x46 0x80 0x39 0xa0 # CHECK: cvt.s.w $f6, $f7
+0x46 0x20 0x73 0x24 # CHECK: cvt.w.d $f12, $f14
+0x46 0x00 0x39 0xa4 # CHECK: cvt.w.s $f6, $f7
+0x00 0x3f 0x98 0x2c # CHECK: dadd $19, $1, $ra
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x66 0xda 0xee 0x16 # CHECK: daddiu $26, $22, -4586
+0x00 0x3f 0x98 0x2d # CHECK: daddu $19, $1, $ra
+0x64 0x58 0x46 0x9f # CHECK: daddiu $24, $2, 18079
+0x66 0x73 0x69 0x3f # CHECK: daddiu $19, $19, 26943
+0x70 0xd2 0x90 0x25 # CHECK: dclo $18, $6
+0x73 0x30 0x80 0x24 # CHECK: dclz $16, $25
+0x03 0x53 0x00 0x1e # CHECK: ddiv $zero, $26, $19
+0x02 0x11 0x00 0x1f # CHECK: ddivu $zero, $16, $17
+0x44 0x2c 0x68 0x00 # CHECK: dmfc1 $12, $f13
+0x44 0xb0 0x70 0x00 # CHECK: dmtc1 $16, $f14
+0x02 0xe9 0x00 0x1c # CHECK: dmult $23, $9
+0x00 0xa6 0x00 0x1d # CHECK: dmultu $5, $6
+0x00 0x00 0x04 0xb8 # CHECK: dsll $zero, $zero, 18
+0x00 0x14 0x04 0xb8 # CHECK: dsll $zero, $20, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x1c 0xe2 0xbb # CHECK: dsra $gp, $gp, 10
+0x00 0x12 0xe2 0xbb # CHECK: dsra $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x1c 0xe2 0xbf # CHECK: dsra32 $gp, $gp, 10
+0x00 0x12 0xe2 0xbf # CHECK: dsra32 $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x13 0x9d 0xfa # CHECK: dsrl $19, $19, 23
+0x00 0x06 0x9d 0xfa # CHECK: dsrl $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x00 0x13 0x9d 0xfe # CHECK: dsrl32 $19, $19, 23
+0x00 0x06 0x9d 0xfe # CHECK: dsrl32 $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x02 0xc8 0x38 0x2e # CHECK: dsub $7, $22, $8
+0x7c 0x0e 0x18 0xa4 # CHECK: dsbh $3, $14
+0x7c 0x1d 0x11 0x64 # CHECK: dshd $2, $sp
+0x62 0x9d 0x6c 0x39 # CHECK: daddi $sp, $20, 27705
+0x63 0xbd 0x6c 0x39 # CHECK: daddi $sp, $sp, 27705
+0x00 0xba 0x28 0x2f # CHECK: dsubu $5, $5, $26
+0x65 0x6f 0xec 0x5f # CHECK: daddiu $15, $11, -5025
+0x65 0xce 0x11 0xea # CHECK: daddiu $14, $14, 4586
+0x41 0x7e 0x60 0x00 # CHECK: di $fp
+0x41 0x60 0x60 0x00 # CHECK: di
+0x00 0x21 0x0b 0xfa # CHECK: drotr $1, $1, 15
+0x00 0x2e 0x0b 0xfa # CHECK: drotr $1, $14, 15
+0x00 0x21 0x0b 0xfe # CHECK: drotr32 $1, $1, 15
+0x00 0x2e 0x0b 0xfe # CHECK: drotr32 $1, $14, 15
+0x01 0xee 0x08 0x56 # CHECK: drotrv $1, $14, $15
+0x41 0x6e 0x60 0x20 # CHECK: ei $14
+0x41 0x60 0x60 0x20 # CHECK: ei
+0x46 0x20 0x3e 0x8b # CHECK: floor.l.d $f26, $f7
+0x46 0x00 0x2b 0x0b # CHECK: floor.l.s $f12, $f5
+0x46 0x20 0x73 0x0f # CHECK: floor.w.d $f12, $f14
+0x46 0x00 0x39 0x8f # CHECK: floor.w.s $f6, $f7
+0x7d 0x33 0x61 0x84 # CHECK: ins $19, $9, 6, 7
+0x08 0x00 0x01 0x4c # CHECK: j 1328
+0x0c 0x00 0x01 0x4c # CHECK: jal 1328
+0x74 0x00 0x01 0x4c # CHECK: jalx 1328
+0x00 0xe0 0xf8 0x09 # CHECK: jalr $7
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x00 0xe0 0x00 0x08 # CHECK: jr $7
+0x80 0xa4 0x23 0xc6 # CHECK: lb $4, 9158($5)
+0x90 0xa4 0x00 0x06 # CHECK: lbu $4, 6($5)
+0xde 0x3d 0x90 0x1b # CHECK: ld $sp, -28645($17)
+0x6b 0x18 0xef 0xb9 # CHECK: ldl $24, -4167($24)
+0x6e 0x8e 0x89 0x6a # CHECK: ldr $14, -30358($20)
+0xd4 0xe9 0x23 0xc6 # CHECK: ldc1 $f9, 9158($7)
+0x4d 0xf7 0x02 0x01 # CHECK: ldxc1 $f8, $23($15)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0xc0 0xe9 0x23 0xc6 # CHECK: ll $9, 9158($7)
+0xd3 0xe0 0xc6 0x70 # CHECK: lld $zero, -14736($ra)
+0x3c 0x06 0x45 0x67 # CHECK: lui $6, 17767
+0x4c 0xa6 0x00 0x05 # CHECK: luxc1 $f0, $6($5)
+0x8c 0xa4 0x00 0x18 # CHECK: lw $4, 24($5)
+0xc4 0xe9 0x23 0xc6 # CHECK: lwc1 $f9, 9158($7)
+0x88 0x82 0x00 0x03 # CHECK: lwl $2, 3($4)
+0x98 0xa3 0x00 0x10 # CHECK: lwr $3, 16($5)
+0x4d 0xcc 0x05 0x00 # CHECK: lwxc1 $f20, $12($14)
+0x9c 0x73 0xa1 0xea # CHECK: lwu $19, -24086($3)
+0x70 0xc7 0x00 0x00 # CHECK: madd $6, $7
+0x4f 0xf9 0x98 0x60 # CHECK: madd.s $f1, $f31, $f19, $f25
+0x70 0xc7 0x00 0x01 # CHECK: maddu $6, $7
+0x44 0x06 0x38 0x00 # CHECK: mfc1 $6, $f7
+0x00 0x00 0x28 0x10 # CHECK: mfhi $5
+0x44 0x7e 0xc0 0x00 # CHECK: mfhc1 $fp, $f24
+0x00 0x00 0x28 0x12 # CHECK: mflo $5
+0x46 0x20 0x41 0x86 # CHECK: mov.d $f6, $f8
+0x46 0x00 0x39 0x86 # CHECK: mov.s $f6, $f7
+0x70 0xc7 0x00 0x04 # CHECK: msub $6, $7
+0x4e 0x70 0x53 0x28 # CHECK: msub.s $f12, $f19, $f10, $f16
+0x70 0xc7 0x00 0x05 # CHECK: msubu $6, $7
+0x44 0x86 0x38 0x00 # CHECK: mtc1 $6, $f7
+0x00 0xe0 0x00 0x11 # CHECK: mthi $7
+0x44 0xe0 0x80 0x00 # CHECK: mthc1 $zero, $f16
+0x00 0xe0 0x00 0x13 # CHECK: mtlo $7
+0x46 0x2e 0x62 0x02 # CHECK: mul.d $f8, $f12, $f14
+0x46 0x07 0x32 0x42 # CHECK: mul.s $f9, $f6, $f7
+0x70 0xc7 0x48 0x02 # CHECK: mul $9, $6, $7
+0x00 0x65 0x00 0x18 # CHECK: mult $3, $5
+0x00 0x65 0x00 0x19 # CHECK: multu $3, $5
+0x46 0x20 0x73 0x07 # CHECK: neg.d $f12, $f14
+0x46 0x00 0x39 0x87 # CHECK: neg.s $f6, $f7
+0x4c 0xac 0xc8 0x30 # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0xc7 0x48 0x27 # CHECK: nor $9, $6, $7
+0x4f 0x04 0x98 0x78 # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x00 0x65 0x18 0x25 # CHECK: or $3, $3, $5
+0x34 0xc9 0x45 0x67 # CHECK: ori $9, $6, 17767
+0x00 0x26 0x49 0xc2 # CHECK: rotr $9, $6, 7
+0x00 0xe6 0x48 0x46 # CHECK: rotrv $9, $6, $7
+0x46 0x20 0x0b 0x08 # CHECK: round.l.d $f12, $f1
+0x46 0x00 0x2e 0x48 # CHECK: round.l.s $f25, $f5
+0x46 0x20 0x73 0x0c # CHECK: round.w.d $f12, $f14
+0x46 0x00 0x39 0x8c # CHECK: round.w.s $f6, $f7
+0xa0 0xa4 0x23 0xc6 # CHECK: sb $4, 9158($5)
+0xa0 0xa4 0x00 0x06 # CHECK: sb $4, 6($5)
+0xe0 0xe9 0x23 0xc6 # CHECK: sc $9, 9158($7)
+0xf3 0xaf 0xdf 0xcd # CHECK: scd $15, -8243($sp)
+0xfd 0x4c 0x16 0xcb # CHECK: sd $12, 5835($10)
+0xf4 0xe9 0x23 0xc6 # CHECK: sdc1 $f9, 9158($7)
+0xb3 0xc7 0xae 0x1f # CHECK: sdl $7, -20961($fp)
+0xb5 0x8b 0xb0 0x39 # CHECK: sdr $11, -20423($12)
+0x4f 0x24 0x40 0x09 # CHECK: sdxc1 $f8, $4($25)
+0x7c 0x07 0x34 0x20 # CHECK: seb $6, $7
+0x7c 0x07 0x36 0x20 # CHECK: seh $6, $7
+0xa4 0xa4 0x23 0xc6 # CHECK: sh $4, 9158($5)
+0x00 0x03 0x21 0xc0 # CHECK: sll $4, $3, 7
+0x00 0xa3 0x10 0x04 # CHECK: sllv $2, $3, $5
+0x00 0x65 0x18 0x2a # CHECK: slt $3, $3, $5
+0x28 0x63 0x00 0x67 # CHECK: slti $3, $3, 103
+0x2c 0x63 0x00 0x67 # CHECK: sltiu $3, $3, 103
+0x00 0x65 0x18 0x2b # CHECK: sltu $3, $3, $5
+0x46 0x20 0x73 0x04 # CHECK: sqrt.d $f12, $f14
+0x46 0x00 0x39 0x84 # CHECK: sqrt.s $f6, $f7
+0x00 0x03 0x21 0xc3 # CHECK: sra $4, $3, 7
+0x00 0xa3 0x10 0x07 # CHECK: srav $2, $3, $5
+0x00 0x03 0x21 0xc2 # CHECK: srl $4, $3, 7
+0x00 0xa3 0x10 0x06 # CHECK: srlv $2, $3, $5
+0x46 0x2e 0x62 0x01 # CHECK: sub.d $f8, $f12, $f14
+0x46 0x07 0x32 0x41 # CHECK: sub.s $f9, $f6, $f7
+0x00 0xc7 0x48 0x22 # CHECK: sub $9, $6, $7
+0x00 0x65 0x20 0x23 # CHECK: subu $4, $3, $5
+0x4c 0xb8 0x20 0x0d # CHECK: suxc1 $f4, $24($5)
+0xac 0xa4 0x00 0x18 # CHECK: sw $4, 24($5)
+0xe4 0xe9 0x23 0xc6 # CHECK: swc1 $f9, 9158($7)
+0xa8 0xa4 0x00 0x10 # CHECK: swl $4, 16($5)
+0xb8 0xe6 0x00 0x10 # CHECK: swr $6, 16($7)
+0x4e 0xd2 0xd0 0x08 # CHECK: swxc1 $f26, $18($22)
+0x00 0x00 0x01 0xcf # CHECK: sync 7
+0x46 0x20 0xbd 0xc9 # CHECK: trunc.l.d $f23, $f23
+0x46 0x00 0xff 0x09 # CHECK: trunc.l.s $f28, $f31
+0x46 0x20 0x73 0x0d # CHECK: trunc.w.d $f12, $f14
+0x46 0x00 0x39 0x8d # CHECK: trunc.w.s $f6, $f7
+0x7c 0x07 0x30 0xa0 # CHECK: wsbh $6, $7
+0x00 0x65 0x18 0x26 # CHECK: xor $3, $3, $5
+0x38 0xc9 0x45 0x67 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips64r3/valid-xfail-mips64r3.txt b/test/MC/Disassembler/Mips/mips64r3/valid-xfail-mips64r3.txt
new file mode 100644
index 0000000..8c58eb1
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r3/valid-xfail-mips64r3.txt
@@ -0,0 +1,76 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -disassemble -mcpu=mips64r3 | FileCheck %s
+# XFAIL: *
+0x46 0x2f 0x79 0x32 # CHECK: c.eq.d $fcc1, $f15, $f15
+0x46 0x11 0xc5 0x32 # CHECK: c.eq.s $fcc5, $f24, $f17
+0x46 0x35 0x5c 0x30 # CHECK: c.f.d $fcc4, $f11, $f21
+0x46 0x07 0xf4 0x30 # CHECK: c.f.s $fcc4, $f30, $f7
+0x46 0x21 0x94 0x3e # CHECK: c.le.d $fcc4, $f18, $f1
+0x46 0x04 0xc6 0x3e # CHECK: c.le.s $fcc6, $f24, $f4
+0x46 0x23 0x4b 0x3c # CHECK: c.lt.d $fcc3, $f9, $f3
+0x46 0x0e 0x8a 0x3c # CHECK: c.lt.s $fcc2, $f17, $f14
+0x46 0x30 0xad 0x3d # CHECK: c.nge.d $fcc5, $f21, $f16
+0x46 0x08 0x5b 0x3d # CHECK: c.nge.s $fcc3, $f11, $f8
+0x46 0x17 0xfa 0x3b # CHECK: c.ngl.s $fcc2, $f31, $f23
+0x46 0x17 0x92 0x39 # CHECK: c.ngle.s $fcc2, $f18, $f23
+0x46 0x27 0xc4 0x3f # CHECK: c.ngt.d $fcc4, $f24, $f7
+0x46 0x0d 0x45 0x3f # CHECK: c.ngt.s $fcc5, $f8, $f13
+0x46 0x3f 0x82 0x36 # CHECK: c.ole.d $fcc2, $f16, $f31
+0x46 0x14 0x3b 0x36 # CHECK: c.ole.s $fcc3, $f7, $f20
+0x46 0x3c 0x9c 0x34 # CHECK: c.olt.d $fcc4, $f19, $f28
+0x46 0x07 0xa6 0x34 # CHECK: c.olt.s $fcc6, $f20, $f7
+0x46 0x27 0xfc 0x3a # CHECK: c.seq.d $fcc4, $f31, $f7
+0x46 0x19 0x0f 0x3a # CHECK: c.seq.s $fcc7, $f1, $f25
+0x46 0x39 0x6c 0x33 # CHECK: c.ueq.d $fcc4, $f13, $f25
+0x46 0x1e 0x1e 0x33 # CHECK: c.ueq.s $fcc6, $f3, $f30
+0x46 0x32 0xcf 0x37 # CHECK: c.ule.d $fcc7, $f25, $f18
+0x46 0x1e 0xaf 0x37 # CHECK: c.ule.s $fcc7, $f21, $f30
+0x46 0x31 0x36 0x35 # CHECK: c.ult.d $fcc6, $f6, $f17
+0x46 0x0a 0xc7 0x35 # CHECK: c.ult.s $fcc7, $f24, $f10
+0x46 0x38 0xbe 0x31 # CHECK: c.un.d $fcc6, $f23, $f24
+0x46 0x04 0xf1 0x31 # CHECK: c.un.s $fcc1, $f30, $f4
+0x46 0xc0 0x45 0x85 # CHECK: abs.ps $f22, $f8
+0x46 0xcc 0xc6 0x00 # CHECK: add.ps $f24, $f24, $f12
+0x46 0xca 0x04 0x32 # CHECK: c.eq.ps $fcc4, $f0, $f10
+0x46 0xcc 0x66 0x30 # CHECK: c.f.ps $fcc6, $f12, $f12
+0x46 0xd4 0x42 0x3e # CHECK: c.le.ps $fcc2, $f8, $f20
+0x46 0xc4 0x90 0x3c # CHECK: c.lt.ps $f18, $f4
+0x46 0xda 0x10 0x3d # CHECK: c.nge.ps $f2, $f26
+0x46 0xde 0xb0 0x3b # CHECK: c.ngl.ps $f22, $f30
+0x46 0xd4 0x66 0x39 # CHECK: c.ngle.ps $fcc6, $f12, $f20
+0x46 0xc6 0xf6 0x3f # CHECK: c.ngt.ps $fcc6, $f30, $f6
+0x46 0xc8 0xa6 0x36 # CHECK: c.ole.ps $fcc6, $f20, $f8
+0x46 0xd0 0x32 0x34 # CHECK: c.olt.ps $fcc2, $f6, $f16
+0x46 0xce 0xf6 0x3a # CHECK: c.seq.ps $fcc6, $f30, $f14
+0x46 0xc6 0x26 0x38 # CHECK: c.sf.ps $fcc6, $f4, $f6
+0x46 0xdc 0x20 0x33 # CHECK: c.ueq.ps $f4, $f28
+0x46 0xc2 0x86 0x37 # CHECK: c.ule.ps $fcc6, $f16, $f2
+0x46 0xc0 0x76 0x35 # CHECK: c.ult.ps $fcc6, $f14, $f0
+0x46 0xda 0x14 0x31 # CHECK: c.un.ps $fcc4, $f2, $f26
+0x46 0xc0 0x17 0xa8 # CHECK: cvt.s.pl $f30, $f2
+0x46 0xc0 0xd3 0xa0 # CHECK: cvt.s.pu $f14, $f26
+0x4e 0x94 0xd4 0xa1 # CHECK: madd.d $f18, $f20, $f26, $f20
+0x4c 0x42 0x75 0xa6 # CHECK: madd.ps $f22, $f2, $f14, $f2
+0x46 0xc0 0x85 0x86 # CHECK: mov.ps $f22, $f16
+0x46 0xd8 0xe2 0x91 # CHECK: movf.ps $f10, $f28, $fcc6
+0x46 0xd3 0xf7 0x93 # CHECK: movn.ps $f30, $f30, s3
+0x46 0xc9 0xc5 0x11 # CHECK: movt.ps $f20, $f24, $fcc2
+0x46 0xdf 0x84 0x92 # CHECK: movz.ps $f18, $f16, ra
+0x4c 0x52 0xf2 0xa9 # CHECK: msub.d $f10, $f2, $f30, $f18
+0x4d 0xd0 0xe3 0x2e # CHECK: msub.ps $f12, $f14, $f28, $f16
+0x46 0xc0 0x64 0x87 # CHECK: neg.ps $f18, $f12
+0x4d 0x54 0x74 0xb1 # CHECK: nmadd.d $f18, $f10, $f14, $f20
+0x4c 0x98 0x46 0xb6 # CHECK: nmadd.ps $f26, $f4, $f8, $f24
+0x4d 0x1e 0x87 0xb9 # CHECK: nmsub.d $f30, $f8, $f16, $f30
+0x4d 0x90 0x71 0xbe # CHECK: nmsub.ps $f6, $f12, $f14, $f16
+0x46 0xde 0x46 0x2c # CHECK: pll.ps $f24, $f8, $f30
+0x46 0xdc 0xd0 0x2d # CHECK: plu.ps $f0, $f26, $f28
+0x46 0xda 0xf2 0x2e # CHECK: pul.ps $f8, $f30, $f26
+0x46 0xc2 0x46 0x2f # CHECK: puu.ps $f24, $f8, $f2
+0x41 0x49 0x98 0x00 # CHECK: rdpgpr s3, t1
+0x46 0x20 0x34 0x95 # CHECK: recip.d $f18, $f6
+0x46 0x00 0xf0 0xd5 # CHECK: recip.s $f3, $f30
+0x02 0xa7 0x68 0x46 # CHECK: rorv t5, a3, s5
+0x46 0x20 0xe0 0x96 # CHECK: rsqrt.d $f2, $f28
+0x46 0x00 0x41 0x16 # CHECK: rsqrt.s $f4, $f8
+0x46 0xda 0x71 0x01 # CHECK: sub.ps $f4, $f14, $f26
+0x41 0xcd 0x00 0x00 # CHECK: wrpgpr zero, t5
diff --git a/test/MC/Disassembler/Mips/mips64r5/valid-mips64r5-el.txt b/test/MC/Disassembler/Mips/mips64r5/valid-mips64r5-el.txt
new file mode 100644
index 0000000..3d97a2b
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r5/valid-mips64r5-el.txt
@@ -0,0 +1,234 @@
+# RUN: llvm-mc --disassemble %s -triple=mips64el-unknown-linux -mcpu=mips64r5 | FileCheck %s
+# CHECK: .text
+0x05 0x73 0x20 0x46 # CHECK: abs.d $f12, $f14
+0x85 0x39 0x00 0x46 # CHECK: abs.s $f6, $f7
+0x20 0x48 0xc7 0x00 # CHECK: add $9, $6, $7
+0x00 0x62 0x2e 0x46 # CHECK: add.d $f8, $f12, $f14
+0x40 0x32 0x07 0x46 # CHECK: add.s $f9, $f6, $f7
+0x67 0x45 0xc9 0x20 # CHECK: addi $9, $6, 17767
+0x67 0xc5 0xc9 0x24 # CHECK: addiu $9, $6, -15001
+0x21 0x48 0xc7 0x00 # CHECK: addu $9, $6, $7
+0x24 0x48 0xc7 0x00 # CHECK: and $9, $6, $7
+0x67 0x45 0xc9 0x30 # CHECK: andi $9, $6, 17767
+0x4c 0x01 0x00 0x10 # CHECK: b 1332
+0x4c 0x01 0x00 0x45 # CHECK: bc1f 1332
+0x4c 0x01 0x1c 0x45 # CHECK: bc1f $fcc7, 1332
+0x4c 0x01 0x01 0x45 # CHECK: bc1t 1332
+0x4c 0x01 0x1d 0x45 # CHECK: bc1t $fcc7, 1332
+0x4c 0x01 0x26 0x11 # CHECK: beq $9, $6, 1332
+0x4c 0x01 0xc1 0x04 # CHECK: bgez $6, 1332
+0x4c 0x01 0xd1 0x04 # CHECK: bgezal $6, 1332
+0x4c 0x01 0xc0 0x1c # CHECK: bgtz $6, 1332
+0x4c 0x01 0xc0 0x18 # CHECK: blez $6, 1332
+0x4c 0x01 0x26 0x15 # CHECK: bne $9, $6, 1332
+0x32 0x60 0x2e 0x46 # CHECK: c.eq.d $f12, $f14
+0x32 0x30 0x07 0x46 # CHECK: c.eq.s $f6, $f7
+0x30 0x60 0x2e 0x46 # CHECK: c.f.d $f12, $f14
+0x30 0x30 0x07 0x46 # CHECK: c.f.s $f6, $f7
+0x3e 0x60 0x2e 0x46 # CHECK: c.le.d $f12, $f14
+0x3e 0x30 0x07 0x46 # CHECK: c.le.s $f6, $f7
+0x3c 0x60 0x2e 0x46 # CHECK: c.lt.d $f12, $f14
+0x3c 0x30 0x07 0x46 # CHECK: c.lt.s $f6, $f7
+0x3d 0x60 0x2e 0x46 # CHECK: c.nge.d $f12, $f14
+0x3d 0x30 0x07 0x46 # CHECK: c.nge.s $f6, $f7
+0x3b 0x60 0x2e 0x46 # CHECK: c.ngl.d $f12, $f14
+0x3b 0x30 0x07 0x46 # CHECK: c.ngl.s $f6, $f7
+0x39 0x60 0x2e 0x46 # CHECK: c.ngle.d $f12, $f14
+0x39 0x30 0x07 0x46 # CHECK: c.ngle.s $f6, $f7
+0x3f 0x60 0x2e 0x46 # CHECK: c.ngt.d $f12, $f14
+0x3f 0x30 0x07 0x46 # CHECK: c.ngt.s $f6, $f7
+0x36 0x60 0x2e 0x46 # CHECK: c.ole.d $f12, $f14
+0x36 0x30 0x07 0x46 # CHECK: c.ole.s $f6, $f7
+0x34 0x60 0x2e 0x46 # CHECK: c.olt.d $f12, $f14
+0x34 0x30 0x07 0x46 # CHECK: c.olt.s $f6, $f7
+0x3a 0x60 0x2e 0x46 # CHECK: c.seq.d $f12, $f14
+0x3a 0x30 0x07 0x46 # CHECK: c.seq.s $f6, $f7
+0x38 0x60 0x2e 0x46 # CHECK: c.sf.d $f12, $f14
+0x38 0x30 0x07 0x46 # CHECK: c.sf.s $f6, $f7
+0x33 0x60 0x2e 0x46 # CHECK: c.ueq.d $f12, $f14
+0x33 0xe0 0x12 0x46 # CHECK: c.ueq.s $f28, $f18
+0x37 0x60 0x2e 0x46 # CHECK: c.ule.d $f12, $f14
+0x37 0x30 0x07 0x46 # CHECK: c.ule.s $f6, $f7
+0x35 0x60 0x2e 0x46 # CHECK: c.ult.d $f12, $f14
+0x35 0x30 0x07 0x46 # CHECK: c.ult.s $f6, $f7
+0x31 0x60 0x2e 0x46 # CHECK: c.un.d $f12, $f14
+0x31 0x30 0x07 0x46 # CHECK: c.un.s $f6, $f7
+0x4a 0x18 0x20 0x46 # CHECK: ceil.l.d $f1, $f3
+0x8a 0x6c 0x00 0x46 # CHECK: ceil.l.s $f18, $f13
+0x0e 0x73 0x20 0x46 # CHECK: ceil.w.d $f12, $f14
+0x8e 0x39 0x00 0x46 # CHECK: ceil.w.s $f6, $f7
+0x00 0x38 0x46 0x44 # CHECK: cfc1 $6, $7
+0x21 0x30 0xe6 0x70 # CHECK: clo $6, $7
+0x20 0x30 0xe6 0x70 # CHECK: clz $6, $7
+0x00 0x38 0xc6 0x44 # CHECK: ctc1 $6, $7
+0x21 0x81 0xa0 0x46 # CHECK: cvt.d.l $f4, $f16
+0xa1 0x39 0x00 0x46 # CHECK: cvt.d.s $f6, $f7
+0x21 0x73 0x80 0x46 # CHECK: cvt.d.w $f12, $f14
+0x25 0x73 0x20 0x46 # CHECK: cvt.l.d $f12, $f14
+0xa5 0x39 0x00 0x46 # CHECK: cvt.l.s $f6, $f7
+0xe0 0xf3 0xa0 0x46 # CHECK: cvt.s.l $f15, $f30
+0x20 0x73 0x20 0x46 # CHECK: cvt.s.d $f12, $f14
+0xa0 0x39 0x80 0x46 # CHECK: cvt.s.w $f6, $f7
+0x24 0x73 0x20 0x46 # CHECK: cvt.w.d $f12, $f14
+0xa4 0x39 0x00 0x46 # CHECK: cvt.w.s $f6, $f7
+0x2c 0x98 0x3f 0x00 # CHECK: dadd $19, $1, $ra
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0x9d 0x62 # CHECK: daddi $sp, $20, -27705
+0xc7 0x93 0xbd 0x63 # CHECK: daddi $sp, $sp, -27705
+0x16 0xee 0xda 0x66 # CHECK: daddiu $26, $22, -4586
+0x2d 0x98 0x3f 0x00 # CHECK: daddu $19, $1, $ra
+0x9f 0x46 0x58 0x64 # CHECK: daddiu $24, $2, 18079
+0x3f 0x69 0x73 0x66 # CHECK: daddiu $19, $19, 26943
+0x25 0x90 0xd2 0x70 # CHECK: dclo $18, $6
+0x24 0x80 0x30 0x73 # CHECK: dclz $16, $25
+0x1e 0x00 0x53 0x03 # CHECK: ddiv $zero, $26, $19
+0x1f 0x00 0x11 0x02 # CHECK: ddivu $zero, $16, $17
+0x00 0x68 0x2c 0x44 # CHECK: dmfc1 $12, $f13
+0x00 0x70 0xb0 0x44 # CHECK: dmtc1 $16, $f14
+0x1c 0x00 0xe9 0x02 # CHECK: dmult $23, $9
+0x1d 0x00 0xa6 0x00 # CHECK: dmultu $5, $6
+0xb8 0x04 0x00 0x00 # CHECK: dsll $zero, $zero, 18
+0xb8 0x04 0x14 0x00 # CHECK: dsll $zero, $20, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0xbc 0x04 0x00 0x00 # CHECK: dsll32 $zero, $zero, 18
+0x14 0x00 0x94 0x01 # CHECK: dsllv $zero, $20, $12
+0xbb 0xe2 0x1c 0x00 # CHECK: dsra $gp, $gp, 10
+0xbb 0xe2 0x12 0x00 # CHECK: dsra $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xbf 0xe2 0x1c 0x00 # CHECK: dsra32 $gp, $gp, 10
+0xbf 0xe2 0x12 0x00 # CHECK: dsra32 $gp, $18, 10
+0x17 0xe0 0x72 0x02 # CHECK: dsrav $gp, $18, $19
+0xfa 0x9d 0x13 0x00 # CHECK: dsrl $19, $19, 23
+0xfa 0x9d 0x06 0x00 # CHECK: dsrl $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0xfe 0x9d 0x13 0x00 # CHECK: dsrl32 $19, $19, 23
+0xfe 0x9d 0x06 0x00 # CHECK: dsrl32 $19, $6, 23
+0x16 0x98 0x86 0x02 # CHECK: dsrlv $19, $6, $20
+0x2e 0x38 0xc8 0x02 # CHECK: dsub $7, $22, $8
+0xa4 0x18 0x0e 0x7c # CHECK: dsbh $3, $14
+0x64 0x11 0x1d 0x7c # CHECK: dshd $2, $sp
+0x39 0x6c 0x9d 0x62 # CHECK: daddi $sp, $20, 27705
+0x39 0x6c 0xbd 0x63 # CHECK: daddi $sp, $sp, 27705
+0x2f 0x28 0xba 0x00 # CHECK: dsubu $5, $5, $26
+0x5f 0xec 0x6f 0x65 # CHECK: daddiu $15, $11, -5025
+0xea 0x11 0xce 0x65 # CHECK: daddiu $14, $14, 4586
+0x00 0x60 0x7e 0x41 # CHECK: di $fp
+0x00 0x60 0x60 0x41 # CHECK: di
+0xfa 0x0b 0x21 0x00 # CHECK: drotr $1, $1, 15
+0xfa 0x0b 0x2e 0x00 # CHECK: drotr $1, $14, 15
+0xfe 0x0b 0x21 0x00 # CHECK: drotr32 $1, $1, 15
+0xfe 0x0b 0x2e 0x00 # CHECK: drotr32 $1, $14, 15
+0x56 0x08 0xee 0x01 # CHECK: drotrv $1, $14, $15
+0x20 0x60 0x6e 0x41 # CHECK: ei $14
+0x20 0x60 0x60 0x41 # CHECK: ei
+0x8b 0x3e 0x20 0x46 # CHECK: floor.l.d $f26, $f7
+0x0b 0x2b 0x00 0x46 # CHECK: floor.l.s $f12, $f5
+0x0f 0x73 0x20 0x46 # CHECK: floor.w.d $f12, $f14
+0x8f 0x39 0x00 0x46 # CHECK: floor.w.s $f6, $f7
+0x84 0x61 0x33 0x7d # CHECK: ins $19, $9, 6, 7
+0x4c 0x01 0x00 0x08 # CHECK: j 1328
+0x4c 0x01 0x00 0x0c # CHECK: jal 1328
+0x4c 0x01 0x00 0x74 # CHECK: jalx 1328
+0x09 0xf8 0xe0 0x00 # CHECK: jalr $7
+0x09 0xfc 0x80 0x00 # CHECK: jalr.hb $4
+0x09 0x24 0xa0 0x00 # CHECK: jalr.hb $4, $5
+0x08 0x00 0xe0 0x00 # CHECK: jr $7
+0xc6 0x23 0xa4 0x80 # CHECK: lb $4, 9158($5)
+0x06 0x00 0xa4 0x90 # CHECK: lbu $4, 6($5)
+0x1b 0x90 0x3d 0xde # CHECK: ld $sp, -28645($17)
+0xb9 0xef 0x18 0x6b # CHECK: ldl $24, -4167($24)
+0x6a 0x89 0x8e 0x6e # CHECK: ldr $14, -30358($20)
+0xc6 0x23 0xe9 0xd4 # CHECK: ldc1 $f9, 9158($7)
+0x01 0x02 0xf7 0x4d # CHECK: ldxc1 $f8, $23($15)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0x0c 0x00 0xa4 0x84 # CHECK: lh $4, 12($5)
+0xc6 0x23 0xe9 0xc0 # CHECK: ll $9, 9158($7)
+0x70 0xc6 0xe0 0xd3 # CHECK: lld $zero, -14736($ra)
+0x67 0x45 0x06 0x3c # CHECK: lui $6, 17767
+0x05 0x00 0xa6 0x4c # CHECK: luxc1 $f0, $6($5)
+0x18 0x00 0xa4 0x8c # CHECK: lw $4, 24($5)
+0xc6 0x23 0xe9 0xc4 # CHECK: lwc1 $f9, 9158($7)
+0x03 0x00 0x82 0x88 # CHECK: lwl $2, 3($4)
+0x10 0x00 0xa3 0x98 # CHECK: lwr $3, 16($5)
+0x00 0x05 0xcc 0x4d # CHECK: lwxc1 $f20, $12($14)
+0xea 0xa1 0x73 0x9c # CHECK: lwu $19, -24086($3)
+0x00 0x00 0xc7 0x70 # CHECK: madd $6, $7
+0x60 0x98 0xf9 0x4f # CHECK: madd.s $f1, $f31, $f19, $f25
+0x01 0x00 0xc7 0x70 # CHECK: maddu $6, $7
+0x00 0x38 0x06 0x44 # CHECK: mfc1 $6, $f7
+0x10 0x28 0x00 0x00 # CHECK: mfhi $5
+0x00 0xc0 0x7e 0x44 # CHECK: mfhc1 $fp, $f24
+0x12 0x28 0x00 0x00 # CHECK: mflo $5
+0x86 0x41 0x20 0x46 # CHECK: mov.d $f6, $f8
+0x86 0x39 0x00 0x46 # CHECK: mov.s $f6, $f7
+0x04 0x00 0xc7 0x70 # CHECK: msub $6, $7
+0x28 0x53 0x70 0x4e # CHECK: msub.s $f12, $f19, $f10, $f16
+0x05 0x00 0xc7 0x70 # CHECK: msubu $6, $7
+0x00 0x38 0x86 0x44 # CHECK: mtc1 $6, $f7
+0x11 0x00 0xe0 0x00 # CHECK: mthi $7
+0x00 0x80 0xe0 0x44 # CHECK: mthc1 $zero, $f16
+0x13 0x00 0xe0 0x00 # CHECK: mtlo $7
+0x02 0x62 0x2e 0x46 # CHECK: mul.d $f8, $f12, $f14
+0x42 0x32 0x07 0x46 # CHECK: mul.s $f9, $f6, $f7
+0x02 0x48 0xc7 0x70 # CHECK: mul $9, $6, $7
+0x18 0x00 0x65 0x00 # CHECK: mult $3, $5
+0x19 0x00 0x65 0x00 # CHECK: multu $3, $5
+0x07 0x73 0x20 0x46 # CHECK: neg.d $f12, $f14
+0x87 0x39 0x00 0x46 # CHECK: neg.s $f6, $f7
+0x30 0xc8 0xac 0x4c # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x27 0x48 0xc7 0x00 # CHECK: nor $9, $6, $7
+0x78 0x98 0x04 0x4f # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x25 0x18 0x65 0x00 # CHECK: or $3, $3, $5
+0x67 0x45 0xc9 0x34 # CHECK: ori $9, $6, 17767
+0xc2 0x49 0x26 0x00 # CHECK: rotr $9, $6, 7
+0x46 0x48 0xe6 0x00 # CHECK: rotrv $9, $6, $7
+0x08 0x0b 0x20 0x46 # CHECK: round.l.d $f12, $f1
+0x48 0x2e 0x00 0x46 # CHECK: round.l.s $f25, $f5
+0x0c 0x73 0x20 0x46 # CHECK: round.w.d $f12, $f14
+0x8c 0x39 0x00 0x46 # CHECK: round.w.s $f6, $f7
+0xc6 0x23 0xa4 0xa0 # CHECK: sb $4, 9158($5)
+0x06 0x00 0xa4 0xa0 # CHECK: sb $4, 6($5)
+0xc6 0x23 0xe9 0xe0 # CHECK: sc $9, 9158($7)
+0xcd 0xdf 0xaf 0xf3 # CHECK: scd $15, -8243($sp)
+0xcb 0x16 0x4c 0xfd # CHECK: sd $12, 5835($10)
+0xc6 0x23 0xe9 0xf4 # CHECK: sdc1 $f9, 9158($7)
+0x1f 0xae 0xc7 0xb3 # CHECK: sdl $7, -20961($fp)
+0x39 0xb0 0x8b 0xb5 # CHECK: sdr $11, -20423($12)
+0x09 0x40 0x24 0x4f # CHECK: sdxc1 $f8, $4($25)
+0x20 0x34 0x07 0x7c # CHECK: seb $6, $7
+0x20 0x36 0x07 0x7c # CHECK: seh $6, $7
+0xc6 0x23 0xa4 0xa4 # CHECK: sh $4, 9158($5)
+0xc0 0x21 0x03 0x00 # CHECK: sll $4, $3, 7
+0x04 0x10 0xa3 0x00 # CHECK: sllv $2, $3, $5
+0x2a 0x18 0x65 0x00 # CHECK: slt $3, $3, $5
+0x67 0x00 0x63 0x28 # CHECK: slti $3, $3, 103
+0x67 0x00 0x63 0x2c # CHECK: sltiu $3, $3, 103
+0x2b 0x18 0x65 0x00 # CHECK: sltu $3, $3, $5
+0x04 0x73 0x20 0x46 # CHECK: sqrt.d $f12, $f14
+0x84 0x39 0x00 0x46 # CHECK: sqrt.s $f6, $f7
+0xc3 0x21 0x03 0x00 # CHECK: sra $4, $3, 7
+0x07 0x10 0xa3 0x00 # CHECK: srav $2, $3, $5
+0xc2 0x21 0x03 0x00 # CHECK: srl $4, $3, 7
+0x06 0x10 0xa3 0x00 # CHECK: srlv $2, $3, $5
+0x01 0x62 0x2e 0x46 # CHECK: sub.d $f8, $f12, $f14
+0x41 0x32 0x07 0x46 # CHECK: sub.s $f9, $f6, $f7
+0x22 0x48 0xc7 0x00 # CHECK: sub $9, $6, $7
+0x23 0x20 0x65 0x00 # CHECK: subu $4, $3, $5
+0x0d 0x20 0xb8 0x4c # CHECK: suxc1 $f4, $24($5)
+0x18 0x00 0xa4 0xac # CHECK: sw $4, 24($5)
+0xc6 0x23 0xe9 0xe4 # CHECK: swc1 $f9, 9158($7)
+0x10 0x00 0xa4 0xa8 # CHECK: swl $4, 16($5)
+0x10 0x00 0xe6 0xb8 # CHECK: swr $6, 16($7)
+0x08 0xd0 0xd2 0x4e # CHECK: swxc1 $f26, $18($22)
+0xcf 0x01 0x00 0x00 # CHECK: sync 7
+0xc9 0xbd 0x20 0x46 # CHECK: trunc.l.d $f23, $f23
+0x09 0xff 0x00 0x46 # CHECK: trunc.l.s $f28, $f31
+0x0d 0x73 0x20 0x46 # CHECK: trunc.w.d $f12, $f14
+0x8d 0x39 0x00 0x46 # CHECK: trunc.w.s $f6, $f7
+0xa0 0x30 0x07 0x7c # CHECK: wsbh $6, $7
+0x26 0x18 0x65 0x00 # CHECK: xor $3, $3, $5
+0x67 0x45 0xc9 0x38 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips64r5/valid-mips64r5.txt b/test/MC/Disassembler/Mips/mips64r5/valid-mips64r5.txt
new file mode 100644
index 0000000..1f52bd1
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r5/valid-mips64r5.txt
@@ -0,0 +1,234 @@
+# RUN: llvm-mc --disassemble %s -triple=mips64-unknown-linux -mcpu=mips64r5 | FileCheck %s
+# CHECK: .text
+0x46 0x20 0x73 0x05 # CHECK: abs.d $f12, $f14
+0x46 0x00 0x39 0x85 # CHECK: abs.s $f6, $f7
+0x00 0xc7 0x48 0x20 # CHECK: add $9, $6, $7
+0x46 0x2e 0x62 0x00 # CHECK: add.d $f8, $f12, $f14
+0x46 0x07 0x32 0x40 # CHECK: add.s $f9, $f6, $f7
+0x20 0xc9 0x45 0x67 # CHECK: addi $9, $6, 17767
+0x24 0xc9 0xc5 0x67 # CHECK: addiu $9, $6, -15001
+0x00 0xc7 0x48 0x21 # CHECK: addu $9, $6, $7
+0x00 0xc7 0x48 0x24 # CHECK: and $9, $6, $7
+0x30 0xc9 0x45 0x67 # CHECK: andi $9, $6, 17767
+0x10 0x00 0x01 0x4c # CHECK: b 1332
+0x45 0x00 0x01 0x4c # CHECK: bc1f 1332
+0x45 0x1c 0x01 0x4c # CHECK: bc1f $fcc7, 1332
+0x45 0x01 0x01 0x4c # CHECK: bc1t 1332
+0x45 0x1d 0x01 0x4c # CHECK: bc1t $fcc7, 1332
+0x11 0x26 0x01 0x4c # CHECK: beq $9, $6, 1332
+0x04 0xc1 0x01 0x4c # CHECK: bgez $6, 1332
+0x04 0xd1 0x01 0x4c # CHECK: bgezal $6, 1332
+0x1c 0xc0 0x01 0x4c # CHECK: bgtz $6, 1332
+0x18 0xc0 0x01 0x4c # CHECK: blez $6, 1332
+0x15 0x26 0x01 0x4c # CHECK: bne $9, $6, 1332
+0x46 0x2e 0x60 0x32 # CHECK: c.eq.d $f12, $f14
+0x46 0x07 0x30 0x32 # CHECK: c.eq.s $f6, $f7
+0x46 0x2e 0x60 0x30 # CHECK: c.f.d $f12, $f14
+0x46 0x07 0x30 0x30 # CHECK: c.f.s $f6, $f7
+0x46 0x2e 0x60 0x3e # CHECK: c.le.d $f12, $f14
+0x46 0x07 0x30 0x3e # CHECK: c.le.s $f6, $f7
+0x46 0x2e 0x60 0x3c # CHECK: c.lt.d $f12, $f14
+0x46 0x07 0x30 0x3c # CHECK: c.lt.s $f6, $f7
+0x46 0x2e 0x60 0x3d # CHECK: c.nge.d $f12, $f14
+0x46 0x07 0x30 0x3d # CHECK: c.nge.s $f6, $f7
+0x46 0x2e 0x60 0x3b # CHECK: c.ngl.d $f12, $f14
+0x46 0x07 0x30 0x3b # CHECK: c.ngl.s $f6, $f7
+0x46 0x2e 0x60 0x39 # CHECK: c.ngle.d $f12, $f14
+0x46 0x07 0x30 0x39 # CHECK: c.ngle.s $f6, $f7
+0x46 0x2e 0x60 0x3f # CHECK: c.ngt.d $f12, $f14
+0x46 0x07 0x30 0x3f # CHECK: c.ngt.s $f6, $f7
+0x46 0x2e 0x60 0x36 # CHECK: c.ole.d $f12, $f14
+0x46 0x07 0x30 0x36 # CHECK: c.ole.s $f6, $f7
+0x46 0x2e 0x60 0x34 # CHECK: c.olt.d $f12, $f14
+0x46 0x07 0x30 0x34 # CHECK: c.olt.s $f6, $f7
+0x46 0x2e 0x60 0x3a # CHECK: c.seq.d $f12, $f14
+0x46 0x07 0x30 0x3a # CHECK: c.seq.s $f6, $f7
+0x46 0x2e 0x60 0x38 # CHECK: c.sf.d $f12, $f14
+0x46 0x07 0x30 0x38 # CHECK: c.sf.s $f6, $f7
+0x46 0x2e 0x60 0x33 # CHECK: c.ueq.d $f12, $f14
+0x46 0x12 0xe0 0x33 # CHECK: c.ueq.s $f28, $f18
+0x46 0x2e 0x60 0x37 # CHECK: c.ule.d $f12, $f14
+0x46 0x07 0x30 0x37 # CHECK: c.ule.s $f6, $f7
+0x46 0x2e 0x60 0x35 # CHECK: c.ult.d $f12, $f14
+0x46 0x07 0x30 0x35 # CHECK: c.ult.s $f6, $f7
+0x46 0x2e 0x60 0x31 # CHECK: c.un.d $f12, $f14
+0x46 0x07 0x30 0x31 # CHECK: c.un.s $f6, $f7
+0x46 0x20 0x18 0x4a # CHECK: ceil.l.d $f1, $f3
+0x46 0x00 0x6c 0x8a # CHECK: ceil.l.s $f18, $f13
+0x46 0x20 0x73 0x0e # CHECK: ceil.w.d $f12, $f14
+0x46 0x00 0x39 0x8e # CHECK: ceil.w.s $f6, $f7
+0x44 0x46 0x38 0x00 # CHECK: cfc1 $6, $7
+0x70 0xe6 0x30 0x21 # CHECK: clo $6, $7
+0x70 0xe6 0x30 0x20 # CHECK: clz $6, $7
+0x44 0xc6 0x38 0x00 # CHECK: ctc1 $6, $7
+0x46 0xa0 0x81 0x21 # CHECK: cvt.d.l $f4, $f16
+0x46 0x00 0x39 0xa1 # CHECK: cvt.d.s $f6, $f7
+0x46 0x80 0x73 0x21 # CHECK: cvt.d.w $f12, $f14
+0x46 0x20 0x73 0x25 # CHECK: cvt.l.d $f12, $f14
+0x46 0x00 0x39 0xa5 # CHECK: cvt.l.s $f6, $f7
+0x46 0xa0 0xf3 0xe0 # CHECK: cvt.s.l $f15, $f30
+0x46 0x20 0x73 0x20 # CHECK: cvt.s.d $f12, $f14
+0x46 0x80 0x39 0xa0 # CHECK: cvt.s.w $f6, $f7
+0x46 0x20 0x73 0x24 # CHECK: cvt.w.d $f12, $f14
+0x46 0x00 0x39 0xa4 # CHECK: cvt.w.s $f6, $f7
+0x00 0x3f 0x98 0x2c # CHECK: dadd $19, $1, $ra
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x62 0x9d 0x93 0xc7 # CHECK: daddi $sp, $20, -27705
+0x63 0xbd 0x93 0xc7 # CHECK: daddi $sp, $sp, -27705
+0x66 0xda 0xee 0x16 # CHECK: daddiu $26, $22, -4586
+0x00 0x3f 0x98 0x2d # CHECK: daddu $19, $1, $ra
+0x64 0x58 0x46 0x9f # CHECK: daddiu $24, $2, 18079
+0x66 0x73 0x69 0x3f # CHECK: daddiu $19, $19, 26943
+0x70 0xd2 0x90 0x25 # CHECK: dclo $18, $6
+0x73 0x30 0x80 0x24 # CHECK: dclz $16, $25
+0x03 0x53 0x00 0x1e # CHECK: ddiv $zero, $26, $19
+0x02 0x11 0x00 0x1f # CHECK: ddivu $zero, $16, $17
+0x44 0x2c 0x68 0x00 # CHECK: dmfc1 $12, $f13
+0x44 0xb0 0x70 0x00 # CHECK: dmtc1 $16, $f14
+0x02 0xe9 0x00 0x1c # CHECK: dmult $23, $9
+0x00 0xa6 0x00 0x1d # CHECK: dmultu $5, $6
+0x00 0x00 0x04 0xb8 # CHECK: dsll $zero, $zero, 18
+0x00 0x14 0x04 0xb8 # CHECK: dsll $zero, $20, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x00 0x00 0x04 0xbc # CHECK: dsll32 $zero, $zero, 18
+0x01 0x94 0x00 0x14 # CHECK: dsllv $zero, $20, $12
+0x00 0x1c 0xe2 0xbb # CHECK: dsra $gp, $gp, 10
+0x00 0x12 0xe2 0xbb # CHECK: dsra $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x1c 0xe2 0xbf # CHECK: dsra32 $gp, $gp, 10
+0x00 0x12 0xe2 0xbf # CHECK: dsra32 $gp, $18, 10
+0x02 0x72 0xe0 0x17 # CHECK: dsrav $gp, $18, $19
+0x00 0x13 0x9d 0xfa # CHECK: dsrl $19, $19, 23
+0x00 0x06 0x9d 0xfa # CHECK: dsrl $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x00 0x13 0x9d 0xfe # CHECK: dsrl32 $19, $19, 23
+0x00 0x06 0x9d 0xfe # CHECK: dsrl32 $19, $6, 23
+0x02 0x86 0x98 0x16 # CHECK: dsrlv $19, $6, $20
+0x02 0xc8 0x38 0x2e # CHECK: dsub $7, $22, $8
+0x7c 0x0e 0x18 0xa4 # CHECK: dsbh $3, $14
+0x7c 0x1d 0x11 0x64 # CHECK: dshd $2, $sp
+0x62 0x9d 0x6c 0x39 # CHECK: daddi $sp, $20, 27705
+0x63 0xbd 0x6c 0x39 # CHECK: daddi $sp, $sp, 27705
+0x00 0xba 0x28 0x2f # CHECK: dsubu $5, $5, $26
+0x65 0x6f 0xec 0x5f # CHECK: daddiu $15, $11, -5025
+0x65 0xce 0x11 0xea # CHECK: daddiu $14, $14, 4586
+0x41 0x7e 0x60 0x00 # CHECK: di $fp
+0x41 0x60 0x60 0x00 # CHECK: di
+0x00 0x21 0x0b 0xfa # CHECK: drotr $1, $1, 15
+0x00 0x2e 0x0b 0xfa # CHECK: drotr $1, $14, 15
+0x00 0x21 0x0b 0xfe # CHECK: drotr32 $1, $1, 15
+0x00 0x2e 0x0b 0xfe # CHECK: drotr32 $1, $14, 15
+0x01 0xee 0x08 0x56 # CHECK: drotrv $1, $14, $15
+0x41 0x6e 0x60 0x20 # CHECK: ei $14
+0x41 0x60 0x60 0x20 # CHECK: ei
+0x46 0x20 0x3e 0x8b # CHECK: floor.l.d $f26, $f7
+0x46 0x00 0x2b 0x0b # CHECK: floor.l.s $f12, $f5
+0x46 0x20 0x73 0x0f # CHECK: floor.w.d $f12, $f14
+0x46 0x00 0x39 0x8f # CHECK: floor.w.s $f6, $f7
+0x7d 0x33 0x61 0x84 # CHECK: ins $19, $9, 6, 7
+0x08 0x00 0x01 0x4c # CHECK: j 1328
+0x0c 0x00 0x01 0x4c # CHECK: jal 1328
+0x74 0x00 0x01 0x4c # CHECK: jalx 1328
+0x00 0xe0 0xf8 0x09 # CHECK: jalr $7
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x00 0xe0 0x00 0x08 # CHECK: jr $7
+0x80 0xa4 0x23 0xc6 # CHECK: lb $4, 9158($5)
+0x90 0xa4 0x00 0x06 # CHECK: lbu $4, 6($5)
+0xde 0x3d 0x90 0x1b # CHECK: ld $sp, -28645($17)
+0x6b 0x18 0xef 0xb9 # CHECK: ldl $24, -4167($24)
+0x6e 0x8e 0x89 0x6a # CHECK: ldr $14, -30358($20)
+0xd4 0xe9 0x23 0xc6 # CHECK: ldc1 $f9, 9158($7)
+0x4d 0xf7 0x02 0x01 # CHECK: ldxc1 $f8, $23($15)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0x84 0xa4 0x00 0x0c # CHECK: lh $4, 12($5)
+0xc0 0xe9 0x23 0xc6 # CHECK: ll $9, 9158($7)
+0xd3 0xe0 0xc6 0x70 # CHECK: lld $zero, -14736($ra)
+0x3c 0x06 0x45 0x67 # CHECK: lui $6, 17767
+0x4c 0xa6 0x00 0x05 # CHECK: luxc1 $f0, $6($5)
+0x8c 0xa4 0x00 0x18 # CHECK: lw $4, 24($5)
+0xc4 0xe9 0x23 0xc6 # CHECK: lwc1 $f9, 9158($7)
+0x88 0x82 0x00 0x03 # CHECK: lwl $2, 3($4)
+0x98 0xa3 0x00 0x10 # CHECK: lwr $3, 16($5)
+0x4d 0xcc 0x05 0x00 # CHECK: lwxc1 $f20, $12($14)
+0x9c 0x73 0xa1 0xea # CHECK: lwu $19, -24086($3)
+0x70 0xc7 0x00 0x00 # CHECK: madd $6, $7
+0x4f 0xf9 0x98 0x60 # CHECK: madd.s $f1, $f31, $f19, $f25
+0x70 0xc7 0x00 0x01 # CHECK: maddu $6, $7
+0x44 0x06 0x38 0x00 # CHECK: mfc1 $6, $f7
+0x00 0x00 0x28 0x10 # CHECK: mfhi $5
+0x44 0x7e 0xc0 0x00 # CHECK: mfhc1 $fp, $f24
+0x00 0x00 0x28 0x12 # CHECK: mflo $5
+0x46 0x20 0x41 0x86 # CHECK: mov.d $f6, $f8
+0x46 0x00 0x39 0x86 # CHECK: mov.s $f6, $f7
+0x70 0xc7 0x00 0x04 # CHECK: msub $6, $7
+0x4e 0x70 0x53 0x28 # CHECK: msub.s $f12, $f19, $f10, $f16
+0x70 0xc7 0x00 0x05 # CHECK: msubu $6, $7
+0x44 0x86 0x38 0x00 # CHECK: mtc1 $6, $f7
+0x00 0xe0 0x00 0x11 # CHECK: mthi $7
+0x44 0xe0 0x80 0x00 # CHECK: mthc1 $zero, $f16
+0x00 0xe0 0x00 0x13 # CHECK: mtlo $7
+0x46 0x2e 0x62 0x02 # CHECK: mul.d $f8, $f12, $f14
+0x46 0x07 0x32 0x42 # CHECK: mul.s $f9, $f6, $f7
+0x70 0xc7 0x48 0x02 # CHECK: mul $9, $6, $7
+0x00 0x65 0x00 0x18 # CHECK: mult $3, $5
+0x00 0x65 0x00 0x19 # CHECK: multu $3, $5
+0x46 0x20 0x73 0x07 # CHECK: neg.d $f12, $f14
+0x46 0x00 0x39 0x87 # CHECK: neg.s $f6, $f7
+0x4c 0xac 0xc8 0x30 # CHECK: nmadd.s $f0, $f5, $f25, $f12
+0x00 0x00 0x00 0x00 # CHECK: nop
+0x00 0xc7 0x48 0x27 # CHECK: nor $9, $6, $7
+0x4f 0x04 0x98 0x78 # CHECK: nmsub.s $f1, $f24, $f19, $f4
+0x00 0x65 0x18 0x25 # CHECK: or $3, $3, $5
+0x34 0xc9 0x45 0x67 # CHECK: ori $9, $6, 17767
+0x00 0x26 0x49 0xc2 # CHECK: rotr $9, $6, 7
+0x00 0xe6 0x48 0x46 # CHECK: rotrv $9, $6, $7
+0x46 0x20 0x0b 0x08 # CHECK: round.l.d $f12, $f1
+0x46 0x00 0x2e 0x48 # CHECK: round.l.s $f25, $f5
+0x46 0x20 0x73 0x0c # CHECK: round.w.d $f12, $f14
+0x46 0x00 0x39 0x8c # CHECK: round.w.s $f6, $f7
+0xa0 0xa4 0x23 0xc6 # CHECK: sb $4, 9158($5)
+0xa0 0xa4 0x00 0x06 # CHECK: sb $4, 6($5)
+0xe0 0xe9 0x23 0xc6 # CHECK: sc $9, 9158($7)
+0xf3 0xaf 0xdf 0xcd # CHECK: scd $15, -8243($sp)
+0xfd 0x4c 0x16 0xcb # CHECK: sd $12, 5835($10)
+0xf4 0xe9 0x23 0xc6 # CHECK: sdc1 $f9, 9158($7)
+0xb3 0xc7 0xae 0x1f # CHECK: sdl $7, -20961($fp)
+0xb5 0x8b 0xb0 0x39 # CHECK: sdr $11, -20423($12)
+0x4f 0x24 0x40 0x09 # CHECK: sdxc1 $f8, $4($25)
+0x7c 0x07 0x34 0x20 # CHECK: seb $6, $7
+0x7c 0x07 0x36 0x20 # CHECK: seh $6, $7
+0xa4 0xa4 0x23 0xc6 # CHECK: sh $4, 9158($5)
+0x00 0x03 0x21 0xc0 # CHECK: sll $4, $3, 7
+0x00 0xa3 0x10 0x04 # CHECK: sllv $2, $3, $5
+0x00 0x65 0x18 0x2a # CHECK: slt $3, $3, $5
+0x28 0x63 0x00 0x67 # CHECK: slti $3, $3, 103
+0x2c 0x63 0x00 0x67 # CHECK: sltiu $3, $3, 103
+0x00 0x65 0x18 0x2b # CHECK: sltu $3, $3, $5
+0x46 0x20 0x73 0x04 # CHECK: sqrt.d $f12, $f14
+0x46 0x00 0x39 0x84 # CHECK: sqrt.s $f6, $f7
+0x00 0x03 0x21 0xc3 # CHECK: sra $4, $3, 7
+0x00 0xa3 0x10 0x07 # CHECK: srav $2, $3, $5
+0x00 0x03 0x21 0xc2 # CHECK: srl $4, $3, 7
+0x00 0xa3 0x10 0x06 # CHECK: srlv $2, $3, $5
+0x46 0x2e 0x62 0x01 # CHECK: sub.d $f8, $f12, $f14
+0x46 0x07 0x32 0x41 # CHECK: sub.s $f9, $f6, $f7
+0x00 0xc7 0x48 0x22 # CHECK: sub $9, $6, $7
+0x00 0x65 0x20 0x23 # CHECK: subu $4, $3, $5
+0x4c 0xb8 0x20 0x0d # CHECK: suxc1 $f4, $24($5)
+0xac 0xa4 0x00 0x18 # CHECK: sw $4, 24($5)
+0xe4 0xe9 0x23 0xc6 # CHECK: swc1 $f9, 9158($7)
+0xa8 0xa4 0x00 0x10 # CHECK: swl $4, 16($5)
+0xb8 0xe6 0x00 0x10 # CHECK: swr $6, 16($7)
+0x4e 0xd2 0xd0 0x08 # CHECK: swxc1 $f26, $18($22)
+0x00 0x00 0x01 0xcf # CHECK: sync 7
+0x46 0x20 0xbd 0xc9 # CHECK: trunc.l.d $f23, $f23
+0x46 0x00 0xff 0x09 # CHECK: trunc.l.s $f28, $f31
+0x46 0x20 0x73 0x0d # CHECK: trunc.w.d $f12, $f14
+0x46 0x00 0x39 0x8d # CHECK: trunc.w.s $f6, $f7
+0x7c 0x07 0x30 0xa0 # CHECK: wsbh $6, $7
+0x00 0x65 0x18 0x26 # CHECK: xor $3, $3, $5
+0x38 0xc9 0x45 0x67 # CHECK: xori $9, $6, 17767
diff --git a/test/MC/Disassembler/Mips/mips64r5/valid-xfail-mips64r5.txt b/test/MC/Disassembler/Mips/mips64r5/valid-xfail-mips64r5.txt
new file mode 100644
index 0000000..b8a98bd
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r5/valid-xfail-mips64r5.txt
@@ -0,0 +1,76 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -disassemble -mcpu=mips64r5 | FileCheck %s
+# XFAIL: *
+0x46 0x2f 0x79 0x32 # CHECK: c.eq.d $fcc1, $f15, $f15
+0x46 0x11 0xc5 0x32 # CHECK: c.eq.s $fcc5, $f24, $f17
+0x46 0x35 0x5c 0x30 # CHECK: c.f.d $fcc4, $f11, $f21
+0x46 0x07 0xf4 0x30 # CHECK: c.f.s $fcc4, $f30, $f7
+0x46 0x21 0x94 0x3e # CHECK: c.le.d $fcc4, $f18, $f1
+0x46 0x04 0xc6 0x3e # CHECK: c.le.s $fcc6, $f24, $f4
+0x46 0x23 0x4b 0x3c # CHECK: c.lt.d $fcc3, $f9, $f3
+0x46 0x0e 0x8a 0x3c # CHECK: c.lt.s $fcc2, $f17, $f14
+0x46 0x30 0xad 0x3d # CHECK: c.nge.d $fcc5, $f21, $f16
+0x46 0x08 0x5b 0x3d # CHECK: c.nge.s $fcc3, $f11, $f8
+0x46 0x17 0xfa 0x3b # CHECK: c.ngl.s $fcc2, $f31, $f23
+0x46 0x17 0x92 0x39 # CHECK: c.ngle.s $fcc2, $f18, $f23
+0x46 0x27 0xc4 0x3f # CHECK: c.ngt.d $fcc4, $f24, $f7
+0x46 0x0d 0x45 0x3f # CHECK: c.ngt.s $fcc5, $f8, $f13
+0x46 0x3f 0x82 0x36 # CHECK: c.ole.d $fcc2, $f16, $f31
+0x46 0x14 0x3b 0x36 # CHECK: c.ole.s $fcc3, $f7, $f20
+0x46 0x3c 0x9c 0x34 # CHECK: c.olt.d $fcc4, $f19, $f28
+0x46 0x07 0xa6 0x34 # CHECK: c.olt.s $fcc6, $f20, $f7
+0x46 0x27 0xfc 0x3a # CHECK: c.seq.d $fcc4, $f31, $f7
+0x46 0x19 0x0f 0x3a # CHECK: c.seq.s $fcc7, $f1, $f25
+0x46 0x39 0x6c 0x33 # CHECK: c.ueq.d $fcc4, $f13, $f25
+0x46 0x1e 0x1e 0x33 # CHECK: c.ueq.s $fcc6, $f3, $f30
+0x46 0x32 0xcf 0x37 # CHECK: c.ule.d $fcc7, $f25, $f18
+0x46 0x1e 0xaf 0x37 # CHECK: c.ule.s $fcc7, $f21, $f30
+0x46 0x31 0x36 0x35 # CHECK: c.ult.d $fcc6, $f6, $f17
+0x46 0x0a 0xc7 0x35 # CHECK: c.ult.s $fcc7, $f24, $f10
+0x46 0x38 0xbe 0x31 # CHECK: c.un.d $fcc6, $f23, $f24
+0x46 0x04 0xf1 0x31 # CHECK: c.un.s $fcc1, $f30, $f4
+0x46 0xc0 0x45 0x85 # CHECK: abs.ps $f22, $f8
+0x46 0xcc 0xc6 0x00 # CHECK: add.ps $f24, $f24, $f12
+0x46 0xca 0x04 0x32 # CHECK: c.eq.ps $fcc4, $f0, $f10
+0x46 0xcc 0x66 0x30 # CHECK: c.f.ps $fcc6, $f12, $f12
+0x46 0xd4 0x42 0x3e # CHECK: c.le.ps $fcc2, $f8, $f20
+0x46 0xc4 0x90 0x3c # CHECK: c.lt.ps $f18, $f4
+0x46 0xda 0x10 0x3d # CHECK: c.nge.ps $f2, $f26
+0x46 0xde 0xb0 0x3b # CHECK: c.ngl.ps $f22, $f30
+0x46 0xd4 0x66 0x39 # CHECK: c.ngle.ps $fcc6, $f12, $f20
+0x46 0xc6 0xf6 0x3f # CHECK: c.ngt.ps $fcc6, $f30, $f6
+0x46 0xc8 0xa6 0x36 # CHECK: c.ole.ps $fcc6, $f20, $f8
+0x46 0xd0 0x32 0x34 # CHECK: c.olt.ps $fcc2, $f6, $f16
+0x46 0xce 0xf6 0x3a # CHECK: c.seq.ps $fcc6, $f30, $f14
+0x46 0xc6 0x26 0x38 # CHECK: c.sf.ps $fcc6, $f4, $f6
+0x46 0xdc 0x20 0x33 # CHECK: c.ueq.ps $f4, $f28
+0x46 0xc2 0x86 0x37 # CHECK: c.ule.ps $fcc6, $f16, $f2
+0x46 0xc0 0x76 0x35 # CHECK: c.ult.ps $fcc6, $f14, $f0
+0x46 0xda 0x14 0x31 # CHECK: c.un.ps $fcc4, $f2, $f26
+0x46 0xc0 0x17 0xa8 # CHECK: cvt.s.pl $f30, $f2
+0x46 0xc0 0xd3 0xa0 # CHECK: cvt.s.pu $f14, $f26
+0x4e 0x94 0xd4 0xa1 # CHECK: madd.d $f18, $f20, $f26, $f20
+0x4c 0x42 0x75 0xa6 # CHECK: madd.ps $f22, $f2, $f14, $f2
+0x46 0xc0 0x85 0x86 # CHECK: mov.ps $f22, $f16
+0x46 0xd8 0xe2 0x91 # CHECK: movf.ps $f10, $f28, $fcc6
+0x46 0xd3 0xf7 0x93 # CHECK: movn.ps $f30, $f30, s3
+0x46 0xc9 0xc5 0x11 # CHECK: movt.ps $f20, $f24, $fcc2
+0x46 0xdf 0x84 0x92 # CHECK: movz.ps $f18, $f16, ra
+0x4c 0x52 0xf2 0xa9 # CHECK: msub.d $f10, $f2, $f30, $f18
+0x4d 0xd0 0xe3 0x2e # CHECK: msub.ps $f12, $f14, $f28, $f16
+0x46 0xc0 0x64 0x87 # CHECK: neg.ps $f18, $f12
+0x4d 0x54 0x74 0xb1 # CHECK: nmadd.d $f18, $f10, $f14, $f20
+0x4c 0x98 0x46 0xb6 # CHECK: nmadd.ps $f26, $f4, $f8, $f24
+0x4d 0x1e 0x87 0xb9 # CHECK: nmsub.d $f30, $f8, $f16, $f30
+0x4d 0x90 0x71 0xbe # CHECK: nmsub.ps $f6, $f12, $f14, $f16
+0x46 0xde 0x46 0x2c # CHECK: pll.ps $f24, $f8, $f30
+0x46 0xdc 0xd0 0x2d # CHECK: plu.ps $f0, $f26, $f28
+0x46 0xda 0xf2 0x2e # CHECK: pul.ps $f8, $f30, $f26
+0x46 0xc2 0x46 0x2f # CHECK: puu.ps $f24, $f8, $f2
+0x41 0x49 0x98 0x00 # CHECK: rdpgpr s3, t1
+0x46 0x20 0x34 0x95 # CHECK: recip.d $f18, $f6
+0x46 0x00 0xf0 0xd5 # CHECK: recip.s $f3, $f30
+0x02 0xa7 0x68 0x46 # CHECK: rorv t5, a3, s5
+0x46 0x20 0xe0 0x96 # CHECK: rsqrt.d $f2, $f28
+0x46 0x00 0x41 0x16 # CHECK: rsqrt.s $f4, $f8
+0x46 0xda 0x71 0x01 # CHECK: sub.ps $f4, $f14, $f26
+0x41 0xcd 0x00 0x00 # CHECK: wrpgpr zero, t5
diff --git a/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt b/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt
new file mode 100644
index 0000000..b92c793
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt
@@ -0,0 +1,166 @@
+# RUN: llvm-mc %s -disassemble -triple=mipsel-unknown-linux -mcpu=mips64r6 | FileCheck %s
+0x04 0x00 0x42 0x30 # CHECK: andi $2, $2, 4
+0x19 0x00 0x80 0xec # CHECK: addiupc $4, 100
+0x0a 0x00 0x29 0x25 # CHECK: addiu $9, $9, 10
+0xa0 0x22 0x43 0x7c # CHECK: align $4, $2, $3, 2
+0x38 0x00 0x7f 0xec # CHECK: aluipc $3, 56
+0xe9 0xff 0x62 0x3c # CHECK: aui $3, $2, -23
+0xff 0xff 0x7e 0xec # CHECK: auipc $3, -1
+0x9b 0x14 0x11 0x04 # CHECK: bal 21104
+0xb8 0x96 0x37 0xe8 # CHECK: balc 14572256
+0xb8 0x96 0x37 0xc8 # CHECK: bc 14572256
+0x01 0x00 0x20 0x45 # CHECK: bc1eqz $f0, 8
+0x01 0x00 0x3f 0x45 # CHECK: bc1eqz $f31, 8
+0x01 0x00 0xa0 0x45 # CHECK: bc1nez $f0, 8
+0x01 0x00 0xbf 0x45 # CHECK: bc1nez $f31, 8
+0x02 0x00 0x20 0x49 # CHECK: bc2eqz $0, 12
+0x02 0x00 0x3f 0x49 # CHECK: bc2eqz $31, 12
+0x02 0x00 0xa0 0x49 # CHECK: bc2nez $0, 12
+0x02 0x00 0xbf 0x49 # CHECK: bc2nez $31, 12
+0x40 0x00 0xa6 0x20 # CHECK: beqc $5, $6, 256
+0x4d 0x01 0x02 0x20 # CHECK: beqzalc $2, 1332
+0x40 0x00 0xa6 0x60 # CHECK: bnec $5, $6, 256
+0x4d 0x01 0x02 0x60 # CHECK: bnezalc $2, 1332
+0x90 0x46 0xa0 0xd8 # CHECK: beqzc $5, 72256
+0x40 0x00 0x43 0x58 # CHECK: bgec $2, $3, 256
+0x40 0x00 0x43 0x18 # CHECK: bgeuc $2, $3, 256
+0x4d 0x01 0x42 0x18 # CHECK: bgezalc $2, 1332
+0x90 0x46 0xa0 0xf8 # CHECK: bnezc $5, 72256
+0x40 0x00 0xa5 0x5c # CHECK: bltzc $5, 256
+0x40 0x00 0xa5 0x58 # CHECK: bgezc $5, 256
+0x4d 0x01 0x02 0x1c # CHECK: bgtzalc $2, 1332
+0x40 0x00 0x05 0x58 # CHECK: blezc $5, 256
+0x4d 0x01 0x42 0x1c # CHECK: bltzalc $2, 1332
+0x40 0x00 0x05 0x5c # CHECK: bgtzc $5, 256
+0x20 0x20 0x02 0x7c # CHECK: bitswap $4, $2
+0x4d 0x01 0x02 0x18 # CHECK: blezalc $2, 1332
+0x40 0x00 0xa6 0x5c # CHECK: bltc $5, $6, 256
+0x40 0x00 0xa6 0x1c # CHECK: bltuc $5, $6, 256
+0x01 0x00 0x00 0x60 # CHECK: bnvc $zero, $zero, 4
+0x01 0x00 0x40 0x60 # CHECK: bnvc $2, $zero, 4
+0x01 0x00 0x82 0x60 # CHECK: bnvc $4, $2, 4
+0x01 0x00 0x00 0x20 # CHECK: bovc $zero, $zero, 4
+0x01 0x00 0x40 0x20 # CHECK: bovc $2, $zero, 4
+0x01 0x00 0x82 0x20 # CHECK: bovc $4, $2, 4
+0x80 0x18 0x84 0x46 # CHECK: cmp.af.s $f2, $f3, $f4
+0x80 0x18 0xa4 0x46 # CHECK: cmp.af.d $f2, $f3, $f4
+0x81 0x18 0x84 0x46 # CHECK: cmp.un.s $f2, $f3, $f4
+0x81 0x18 0xa4 0x46 # CHECK: cmp.un.d $f2, $f3, $f4
+0x82 0x18 0x84 0x46 # CHECK: cmp.eq.s $f2, $f3, $f4
+0x82 0x18 0xa4 0x46 # CHECK: cmp.eq.d $f2, $f3, $f4
+0x83 0x18 0x84 0x46 # CHECK: cmp.ueq.s $f2, $f3, $f4
+0x83 0x18 0xa4 0x46 # CHECK: cmp.ueq.d $f2, $f3, $f4
+0x84 0x18 0x84 0x46 # CHECK: cmp.lt.s $f2, $f3, $f4
+0x84 0x18 0xa4 0x46 # CHECK: cmp.lt.d $f2, $f3, $f4
+0x85 0x18 0x84 0x46 # CHECK: cmp.ult.s $f2, $f3, $f4
+0x85 0x18 0xa4 0x46 # CHECK: cmp.ult.d $f2, $f3, $f4
+0x86 0x18 0x84 0x46 # CHECK: cmp.le.s $f2, $f3, $f4
+0x86 0x18 0xa4 0x46 # CHECK: cmp.le.d $f2, $f3, $f4
+0x87 0x18 0x84 0x46 # CHECK: cmp.ule.s $f2, $f3, $f4
+0x87 0x18 0xa4 0x46 # CHECK: cmp.ule.d $f2, $f3, $f4
+0x88 0x18 0x84 0x46 # CHECK: cmp.saf.s $f2, $f3, $f4
+0x88 0x18 0xa4 0x46 # CHECK: cmp.saf.d $f2, $f3, $f4
+0x89 0x18 0x84 0x46 # CHECK: cmp.sun.s $f2, $f3, $f4
+0x89 0x18 0xa4 0x46 # CHECK: cmp.sun.d $f2, $f3, $f4
+0x8a 0x18 0x84 0x46 # CHECK: cmp.seq.s $f2, $f3, $f4
+0x8a 0x18 0xa4 0x46 # CHECK: cmp.seq.d $f2, $f3, $f4
+0x8b 0x18 0x84 0x46 # CHECK: cmp.sueq.s $f2, $f3, $f4
+0x8b 0x18 0xa4 0x46 # CHECK: cmp.sueq.d $f2, $f3, $f4
+0x8c 0x18 0x84 0x46 # CHECK: cmp.slt.s $f2, $f3, $f4
+0x8c 0x18 0xa4 0x46 # CHECK: cmp.slt.d $f2, $f3, $f4
+0x8d 0x18 0x84 0x46 # CHECK: cmp.sult.s $f2, $f3, $f4
+0x8d 0x18 0xa4 0x46 # CHECK: cmp.sult.d $f2, $f3, $f4
+0x8e 0x18 0x84 0x46 # CHECK: cmp.sle.s $f2, $f3, $f4
+0x8e 0x18 0xa4 0x46 # CHECK: cmp.sle.d $f2, $f3, $f4
+0x8f 0x18 0x84 0x46 # CHECK: cmp.sule.s $f2, $f3, $f4
+0x8f 0x18 0xa4 0x46 # CHECK: cmp.sule.d $f2, $f3, $f4
+0x64 0x23 0x43 0x7c # CHECK: dalign $4, $2, $3, 5
+0x34 0x12 0x62 0x74 # CHECK: daui $3, $2, 4660
+0x78 0x56 0x66 0x04 # CHECK: dahi $3, 22136
+0x24 0x20 0x02 0x7c # CHECK: dbitswap $4, $2
+0x00 0x60 0x7e 0x41 # CHECK: di $fp
+0x00 0x60 0x60 0x41 # CHECK: di
+0x9a 0x10 0x64 0x00 # CHECK: div $2, $3, $4
+0x9b 0x10 0x64 0x00 # CHECK: divu $2, $3, $4
+0x20 0x60 0x6e 0x41 # CHECK: ei $14
+0x20 0x60 0x60 0x41 # CHECK: ei
+0xda 0x10 0x64 0x00 # CHECK: mod $2, $3, $4
+0xdb 0x10 0x64 0x00 # CHECK: modu $2, $3, $4
+0x9e 0x10 0x64 0x00 # CHECK: ddiv $2, $3, $4
+0x9f 0x10 0x64 0x00 # CHECK: ddivu $2, $3, $4
+0xde 0x10 0x64 0x00 # CHECK: dmod $2, $3, $4
+0xdf 0x10 0x64 0x00 # CHECK: dmodu $2, $3, $4
+0xc5 0x10 0x64 0x00 # CHECK: lsa $2, $3, $4, 3
+0xd5 0x10 0x64 0x00 # CHECK: dlsa $2, $3, $4, 3
+0x48 0x3c 0x58 0xec # CHECK: ldpc $2, 123456
+0x43 0x00 0x48 0xec # CHECK: lwpc $2, 268
+0x43 0x00 0x50 0xec # CHECK: lwupc $2, 268
+0x98 0x10 0x64 0x00 # CHECK: mul $2, $3, $4
+0xd8 0x10 0x64 0x00 # CHECK: muh $2, $3, $4
+0x99 0x10 0x64 0x00 # CHECK: mulu $2, $3, $4
+0xd9 0x10 0x64 0x00 # CHECK: muhu $2, $3, $4
+0x9c 0x10 0x64 0x00 # CHECK: dmul $2, $3, $4
+0xdc 0x10 0x64 0x00 # CHECK: dmuh $2, $3, $4
+0x9d 0x10 0x64 0x00 # CHECK: dmulu $2, $3, $4
+0xdd 0x10 0x64 0x00 # CHECK: dmuhu $2, $3, $4
+0x98 0x18 0x04 0x46 # CHECK: maddf.s $f2, $f3, $f4
+0x98 0x18 0x24 0x46 # CHECK: maddf.d $f2, $f3, $f4
+0x99 0x18 0x04 0x46 # CHECK: msubf.s $f2, $f3, $f4
+0x99 0x18 0x24 0x46 # CHECK: msubf.d $f2, $f3, $f4
+0x10 0x08 0x22 0x46 # CHECK: sel.d $f0, $f1, $f2
+0x10 0x08 0x02 0x46 # CHECK: sel.s $f0, $f1, $f2
+0x35 0x10 0x64 0x00 # CHECK: seleqz $2, $3, $4
+0x37 0x10 0x64 0x00 # CHECK: selnez $2, $3, $4
+0x1d 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4
+0x1d 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4
+0x1c 0x10 0x04 0x46 # CHECK: min.s $f0, $f2, $f4
+0x1c 0x10 0x24 0x46 # CHECK: min.d $f0, $f2, $f4
+0x1f 0x10 0x04 0x46 # CHECK: maxa.s $f0, $f2, $f4
+0x1f 0x10 0x24 0x46 # CHECK: maxa.d $f0, $f2, $f4
+0x1e 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4
+0x1e 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4
+0x04 0x00 0x42 0x34 # CHECK: ori $2, $2, 4
+0x14 0x10 0x04 0x46 # CHECK: seleqz.s $f0, $f2, $f4
+0x14 0x10 0x24 0x46 # CHECK: seleqz.d $f0, $f2, $f4
+0x17 0x10 0x04 0x46 # CHECK: selnez.s $f0, $f2, $f4
+0x17 0x10 0x24 0x46 # CHECK: selnez.d $f0, $f2, $f4
+0x9a 0x20 0x00 0x46 # CHECK: rint.s $f2, $f4
+0x9a 0x20 0x20 0x46 # CHECK: rint.d $f2, $f4
+0x9b 0x20 0x00 0x46 # CHECK: class.s $f2, $f4
+0x9b 0x20 0x20 0x46 # CHECK: class.d $f2, $f4
+0x09 0x04 0x80 0x00 # CHECK: jr.hb $4
+0x09 0xfc 0x80 0x00 # CHECK: jalr.hb $4
+0x09 0x24 0xa0 0x00 # CHECK: jalr.hb $4, $5
+0xb6 0xb3 0x42 0x7e # CHECK: ll $2, -153($18)
+0x37 0x38 0xe0 0x7f # CHECK: lld $zero, 112($ra)
+0x26 0xec 0x6f 0x7e # CHECK: sc $15, -40($19)
+0xa7 0xe6 0xaf 0x7f # CHECK: scd $15, -51($sp)
+0x51 0x58 0xa0 0x00 # CHECK: clo $11, $5
+0x50 0xe8 0x80 0x03 # CHECK: clz $sp, $gp
+0x53 0x90 0xc0 0x00 # CHECK: dclo $18, $6
+0x52 0x80 0x20 0x03 # CHECK: dclz $16, $25
+0x40 0x00 0x00 0x00 # CHECK: ssnop
+0x0e 0x00 0x00 0x00 # CHECK: sdbbp
+0x8e 0x08 0x00 0x00 # CHECK: sdbbp 34
+0x0f 0x00 0x00 0x00 # CHECK: sync
+0x4f 0x00 0x00 0x00 # CHECK: sync 1
+0x34 0x00 0x03 0x00 # CHECK: teq $zero, $3
+0x34 0x9b 0xa7 0x00 # CHECK: teq $5, $7, 620
+0x30 0x00 0xea 0x00 # CHECK: tge $7, $10
+0x30 0x55 0xb3 0x00 # CHECK: tge $5, $19, 340
+0x31 0x00 0xdc 0x02 # CHECK: tgeu $22, $gp
+0xf1 0x5e 0x8e 0x02 # CHECK: tgeu $20, $14, 379
+0x32 0x00 0xed 0x01 # CHECK: tlt $15, $13
+0x72 0x21 0x53 0x00 # CHECK: tlt $2, $19, 133
+0x33 0x00 0x70 0x01 # CHECK: tltu $11, $16
+0x33 0xfe 0x1d 0x02 # CHECK: tltu $16, $sp, 1016
+0x36 0x00 0xd1 0x00 # CHECK: tne $6, $17
+0x76 0xdd 0xe8 0x00 # CHECK: tne $7, $8, 885
+0x43 0x0d 0xc8 0x49 # CHECK: ldc2 $8, -701($1)
+0xb7 0x34 0x52 0x49 # CHECK: lwc2 $18, -841($6)
+0x75 0x92 0xf4 0x49 # CHECK: sdc2 $20, 629($18)
+0x30 0x81 0x79 0x49 # CHECK: swc2 $25, 304($16)
+0x00 0x01 0x05 0xf8 # CHECK: jialc $5, 256
+0x00 0x01 0x05 0xd8 # CHECK: jic $5, 256
+0x25 0x04 0xa1 0x7c # CHECK: cache 1, 8($5)
+0x35 0x04 0xa1 0x7c # CHECK: pref 1, 8($5)
diff --git a/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt b/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt
new file mode 100644
index 0000000..debbe50
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt
@@ -0,0 +1,166 @@
+# RUN: llvm-mc %s -disassemble -triple=mips-unknown-linux -mcpu=mips64r6 | FileCheck %s
+0x30 0x42 0x00 0x04 # CHECK: andi $2, $2, 4
+0xec 0x80 0x00 0x19 # CHECK: addiupc $4, 100
+0x25 0x29 0x00 0x0a # CHECK: addiu $9, $9, 10
+0x7c 0x43 0x22 0xa0 # CHECK: align $4, $2, $3, 2
+0xec 0x7f 0x00 0x38 # CHECK: aluipc $3, 56
+0x3c 0x62 0xff 0xe9 # CHECK: aui $3, $2, -23
+0xec 0x7e 0xff 0xff # CHECK: auipc $3, -1
+0x04 0x11 0x14 0x9b # CHECK: bal 21104
+0xe8 0x37 0x96 0xb8 # CHECK: balc 14572256
+0xc8 0x37 0x96 0xb8 # CHECK: bc 14572256
+0x45 0x20 0x00 0x01 # CHECK: bc1eqz $f0, 8
+0x45 0x3f 0x00 0x01 # CHECK: bc1eqz $f31, 8
+0x45 0xa0 0x00 0x01 # CHECK: bc1nez $f0, 8
+0x45 0xbf 0x00 0x01 # CHECK: bc1nez $f31, 8
+0x49 0x20 0x00 0x02 # CHECK: bc2eqz $0, 12
+0x49 0x3f 0x00 0x02 # CHECK: bc2eqz $31, 12
+0x49 0xa0 0x00 0x02 # CHECK: bc2nez $0, 12
+0x49 0xbf 0x00 0x02 # CHECK: bc2nez $31, 12
+0x20 0xa6 0x00 0x40 # CHECK: beqc $5, $6, 256
+0x20 0x02 0x01 0x4d # CHECK: beqzalc $2, 1332
+0x60 0xa6 0x00 0x40 # CHECK: bnec $5, $6, 256
+0x60 0x02 0x01 0x4d # CHECK: bnezalc $2, 1332
+0xd8 0xa0 0x46 0x90 # CHECK: beqzc $5, 72256
+0x58 0x43 0x00 0x40 # CHECK: bgec $2, $3, 256
+0x18 0x43 0x00 0x40 # CHECK: bgeuc $2, $3, 256
+0x18 0x42 0x01 0x4d # CHECK: bgezalc $2, 1332
+0xf8 0xa0 0x46 0x90 # CHECK: bnezc $5, 72256
+0x5c 0xa5 0x00 0x40 # CHECK: bltzc $5, 256
+0x58 0xa5 0x00 0x40 # CHECK: bgezc $5, 256
+0x1c 0x02 0x01 0x4d # CHECK: bgtzalc $2, 1332
+0x58 0x05 0x00 0x40 # CHECK: blezc $5, 256
+0x1c 0x42 0x01 0x4d # CHECK: bltzalc $2, 1332
+0x5c 0x05 0x00 0x40 # CHECK: bgtzc $5, 256
+0x7c 0x02 0x20 0x20 # CHECK: bitswap $4, $2
+0x18 0x02 0x01 0x4d # CHECK: blezalc $2, 1332
+0x5c 0xa6 0x00 0x40 # CHECK: bltc $5, $6, 256
+0x1c 0xa6 0x00 0x40 # CHECK: bltuc $5, $6, 256
+0x60 0x00 0x00 0x01 # CHECK: bnvc $zero, $zero, 4
+0x60 0x40 0x00 0x01 # CHECK: bnvc $2, $zero, 4
+0x60 0x82 0x00 0x01 # CHECK: bnvc $4, $2, 4
+0x20 0x00 0x00 0x01 # CHECK: bovc $zero, $zero, 4
+0x20 0x40 0x00 0x01 # CHECK: bovc $2, $zero, 4
+0x20 0x82 0x00 0x01 # CHECK: bovc $4, $2, 4
+0x46 0x84 0x18 0x80 # CHECK: cmp.af.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x80 # CHECK: cmp.af.d $f2, $f3, $f4
+0x46 0x84 0x18 0x81 # CHECK: cmp.un.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x81 # CHECK: cmp.un.d $f2, $f3, $f4
+0x46 0x84 0x18 0x82 # CHECK: cmp.eq.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x82 # CHECK: cmp.eq.d $f2, $f3, $f4
+0x46 0x84 0x18 0x83 # CHECK: cmp.ueq.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x83 # CHECK: cmp.ueq.d $f2, $f3, $f4
+0x46 0x84 0x18 0x84 # CHECK: cmp.lt.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x84 # CHECK: cmp.lt.d $f2, $f3, $f4
+0x46 0x84 0x18 0x85 # CHECK: cmp.ult.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x85 # CHECK: cmp.ult.d $f2, $f3, $f4
+0x46 0x84 0x18 0x86 # CHECK: cmp.le.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x86 # CHECK: cmp.le.d $f2, $f3, $f4
+0x46 0x84 0x18 0x87 # CHECK: cmp.ule.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x87 # CHECK: cmp.ule.d $f2, $f3, $f4
+0x46 0x84 0x18 0x88 # CHECK: cmp.saf.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x88 # CHECK: cmp.saf.d $f2, $f3, $f4
+0x46 0x84 0x18 0x89 # CHECK: cmp.sun.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x89 # CHECK: cmp.sun.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8a # CHECK: cmp.seq.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8a # CHECK: cmp.seq.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8b # CHECK: cmp.sueq.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8b # CHECK: cmp.sueq.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8c # CHECK: cmp.slt.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8c # CHECK: cmp.slt.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8d # CHECK: cmp.sult.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8d # CHECK: cmp.sult.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8e # CHECK: cmp.sle.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8e # CHECK: cmp.sle.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8f # CHECK: cmp.sule.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8f # CHECK: cmp.sule.d $f2, $f3, $f4
+0x7c 0x43 0x23 0x64 # CHECK: dalign $4, $2, $3, 5
+0x74 0x62 0x12 0x34 # CHECK: daui $3, $2, 4660
+0x04 0x66 0x56 0x78 # CHECK: dahi $3, 22136
+0x7c 0x02 0x20 0x24 # CHECK: dbitswap $4, $2
+0x41 0x7e 0x60 0x00 # CHECK: di $fp
+0x41 0x60 0x60 0x00 # CHECK: di
+0x00 0x64 0x10 0x9a # CHECK: div $2, $3, $4
+0x00 0x64 0x10 0x9b # CHECK: divu $2, $3, $4
+0x41 0x6e 0x60 0x20 # CHECK: ei $14
+0x41 0x60 0x60 0x20 # CHECK: ei
+0x00 0x64 0x10 0xda # CHECK: mod $2, $3, $4
+0x00 0x64 0x10 0xdb # CHECK: modu $2, $3, $4
+0x00 0x64 0x10 0x9e # CHECK: ddiv $2, $3, $4
+0x00 0x64 0x10 0x9f # CHECK: ddivu $2, $3, $4
+0x00 0x64 0x10 0xde # CHECK: dmod $2, $3, $4
+0x00 0x64 0x10 0xdf # CHECK: dmodu $2, $3, $4
+0x00 0x64 0x10 0xc5 # CHECK: lsa $2, $3, $4, 3
+0x00 0x64 0x10 0xd5 # CHECK: dlsa $2, $3, $4, 3
+0xec 0x58 0x3c 0x48 # CHECK: ldpc $2, 123456
+0xec 0x48 0x00 0x43 # CHECK: lwpc $2, 268
+0xec 0x50 0x00 0x43 # CHECK: lwupc $2, 268
+0x00 0x64 0x10 0x98 # CHECK: mul $2, $3, $4
+0x00 0x64 0x10 0xd8 # CHECK: muh $2, $3, $4
+0x00 0x64 0x10 0x99 # CHECK: mulu $2, $3, $4
+0x00 0x64 0x10 0xd9 # CHECK: muhu $2, $3, $4
+0x00 0x64 0x10 0x9c # CHECK: dmul $2, $3, $4
+0x00 0x64 0x10 0xdc # CHECK: dmuh $2, $3, $4
+0x00 0x64 0x10 0x9d # CHECK: dmulu $2, $3, $4
+0x00 0x64 0x10 0xdd # CHECK: dmuhu $2, $3, $4
+0x46 0x04 0x18 0x98 # CHECK: maddf.s $f2, $f3, $f4
+0x46 0x24 0x18 0x98 # CHECK: maddf.d $f2, $f3, $f4
+0x46 0x04 0x18 0x99 # CHECK: msubf.s $f2, $f3, $f4
+0x46 0x24 0x18 0x99 # CHECK: msubf.d $f2, $f3, $f4
+0x46 0x22 0x08 0x10 # CHECK: sel.d $f0, $f1, $f2
+0x46 0x02 0x08 0x10 # CHECK: sel.s $f0, $f1, $f2
+0x00 0x64 0x10 0x35 # CHECK: seleqz $2, $3, $4
+0x00 0x64 0x10 0x37 # CHECK: selnez $2, $3, $4
+0x46 0x04 0x10 0x1d # CHECK: max.s $f0, $f2, $f4
+0x46 0x24 0x10 0x1d # CHECK: max.d $f0, $f2, $f4
+0x46 0x04 0x10 0x1c # CHECK: min.s $f0, $f2, $f4
+0x46 0x24 0x10 0x1c # CHECK: min.d $f0, $f2, $f4
+0x46 0x04 0x10 0x1f # CHECK: maxa.s $f0, $f2, $f4
+0x46 0x24 0x10 0x1f # CHECK: maxa.d $f0, $f2, $f4
+0x46 0x04 0x10 0x1e # CHECK: mina.s $f0, $f2, $f4
+0x46 0x24 0x10 0x1e # CHECK: mina.d $f0, $f2, $f4
+0x34 0x42 0x00 0x04 # CHECK: ori $2, $2, 4
+0x46 0x04 0x10 0x14 # CHECK: seleqz.s $f0, $f2, $f4
+0x46 0x24 0x10 0x14 # CHECK: seleqz.d $f0, $f2, $f4
+0x46 0x04 0x10 0x17 # CHECK: selnez.s $f0, $f2, $f4
+0x46 0x24 0x10 0x17 # CHECK: selnez.d $f0, $f2, $f4
+0x46 0x00 0x20 0x9a # CHECK: rint.s $f2, $f4
+0x46 0x20 0x20 0x9a # CHECK: rint.d $f2, $f4
+0x46 0x00 0x20 0x9b # CHECK: class.s $f2, $f4
+0x46 0x20 0x20 0x9b # CHECK: class.d $f2, $f4
+0x00 0x80 0x04 0x09 # CHECK: jr.hb $4
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x7e 0x42 0xb3 0xb6 # CHECK: ll $2, -153($18)
+0x7f 0xe0 0x38 0x37 # CHECK: lld $zero, 112($ra)
+0x7e 0x6f 0xec 0x26 # CHECK: sc $15, -40($19)
+0x7f 0xaf 0xe6 0xa7 # CHECK: scd $15, -51($sp)
+0x00 0xa0 0x58 0x51 # CHECK: clo $11, $5
+0x03 0x80 0xe8 0x50 # CHECK: clz $sp, $gp
+0x00 0xc0 0x90 0x53 # CHECK: dclo $18, $6
+0x03 0x20 0x80 0x52 # CHECK: dclz $16, $25
+0x00 0x00 0x00 0x40 # CHECK: ssnop
+0x00 0x00 0x00 0x0e # CHECK: sdbbp
+0x00 0x00 0x08 0x8e # CHECK: sdbbp 34
+0x00 0x00 0x00 0x0f # CHECK: sync
+0x00 0x00 0x00 0x4f # CHECK: sync 1
+0x00 0x03 0x00 0x34 # CHECK: teq $zero, $3
+0x00 0xa7 0x9b 0x34 # CHECK: teq $5, $7, 620
+0x00 0xea 0x00 0x30 # CHECK: tge $7, $10
+0x00 0xb3 0x55 0x30 # CHECK: tge $5, $19, 340
+0x02 0xdc 0x00 0x31 # CHECK: tgeu $22, $gp
+0x02 0x8e 0x5e 0xf1 # CHECK: tgeu $20, $14, 379
+0x01 0xed 0x00 0x32 # CHECK: tlt $15, $13
+0x00 0x53 0x21 0x72 # CHECK: tlt $2, $19, 133
+0x01 0x70 0x00 0x33 # CHECK: tltu $11, $16
+0x02 0x1d 0xfe 0x33 # CHECK: tltu $16, $sp, 1016
+0x00 0xd1 0x00 0x36 # CHECK: tne $6, $17
+0x00 0xe8 0xdd 0x76 # CHECK: tne $7, $8, 885
+0x49 0xc8 0x0d 0x43 # CHECK: ldc2 $8, -701($1)
+0x49 0x52 0x34 0xb7 # CHECK: lwc2 $18, -841($6)
+0x49 0xf4 0x92 0x75 # CHECK: sdc2 $20, 629($18)
+0x49 0x79 0x81 0x30 # CHECK: swc2 $25, 304($16)
+0xf8 0x05 0x01 0x00 # CHECK: jialc $5, 256
+0xd8 0x05 0x01 0x00 # CHECK: jic $5, 256
+0x7c 0xa1 0x04 0x25 # CHECK: cache 1, 8($5)
+0x7c 0xa1 0x04 0x35 # CHECK: pref 1, 8($5)
diff --git a/test/MC/Disassembler/Mips/mips64r6/valid-xfail-mips64r6.txt b/test/MC/Disassembler/Mips/mips64r6/valid-xfail-mips64r6.txt
new file mode 100644
index 0000000..8ca8b81
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips64r6/valid-xfail-mips64r6.txt
@@ -0,0 +1,20 @@
+# Instructions that should be valid but currently fail for known reasons (e.g.
+# they aren't implemented yet).
+#
+# RUN: llvm-mc %s -disassemble -triple=mips-unknown-linux -mcpu=mips64r6 | FileCheck %s
+# XFAIL: *
+0x20 0x40 0x00 0x01 # CHECK: bovc $0, $2, 4
+0x20 0x82 0x00 0x01 # CHECK: bovc $2, $4, 4
+0x60 0x40 0x00 0x01 # CHECK: bnvc $0, $2, 4
+0x60 0x82 0x00 0x01 # CHECK: bnvc $2, $4, 4
+0x20 0xc0 0x00 0x40 # CHECK: beqc $6, $zero, 256
+0x20 0xa0 0x00 0x40 # CHECK: beqc $5, $zero, 256
+0x20 0xa6 0x00 0x40 # CHECK: beqc $5, $6, 256
+0x60 0xc0 0x00 0x40 # CHECK: bnec $6, $zero, 256
+0x60 0xa0 0x00 0x40 # CHECK: bnec $5, $zero, 256
+0x60 0xa6 0x00 0x40 # CHECK: bnec $5, $6, 256
+0x64 0x58 0x46 0x9f # CHECK: daddiu $24, $2, 18079
+0x66 0x73 0x69 0x3f # CHECK: daddiu $19, $19, 26943
+0x65 0x6f 0xec 0x5f # CHECK: daddiu $15, $11, -5025
+0x65 0xce 0x11 0xea # CHECK: daddiu $14, $14, 4586
+0x04 0x7e 0xab 0xcd # CHECK: dati $3, 43981
diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding-ext.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding-ext.txt
index 3c2f935..4f10c74 100644
--- a/test/MC/Disassembler/PowerPC/ppc64-encoding-ext.txt
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding-ext.txt
@@ -2274,3 +2274,24 @@
 
 # CHECK: rfid
 0x4c 0x00 0x00 0x24
+
+# CHECK: lbzcix 21, 5, 7
+0x7e 0xa5 0x3e 0xaa
+# CHECK: lhzcix 21, 5, 7
+0x7e 0xa5 0x3e 0x6a
+# CHECK: lwzcix 21, 5, 7
+0x7e 0xa5 0x3e 0x2a
+# CHECK: ldcix  21, 5, 7
+0x7e 0xa5 0x3e 0xea
+# CHECK: stbcix 21, 5, 7
+0x7e 0xa5 0x3f 0xaa
+# CHECK: sthcix 21, 5, 7
+0x7e 0xa5 0x3f 0x6a
+# CHECK: stwcix 21, 5, 7
+0x7e 0xa5 0x3f 0x2a
+# CHECK: stdcix 21, 5, 7
+0x7e 0xa5 0x3f 0xea
+
+# CHECK: attn
+0x00 0x00 0x02 0x00
+
diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding-fp.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding-fp.txt
index 1c01c9d..0487e3f 100644
--- a/test/MC/Disassembler/PowerPC/ppc64-encoding-fp.txt
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding-fp.txt
@@ -321,6 +321,24 @@
 # CHECK: mffs 2                          
 0xfc 0x40 0x04 0x8e
 
+# CHECK: mffs. 7
+0xfc 0xe0 0x04 0x8f
+
+# CHECK: mcrfs 4, 5
+0xfe 0x14 0x00 0x80
+
+# CHECK: mtfsfi 5, 2, 1
+0xfe 0x81 0x21 0x0c
+
+# CHECK: mtfsfi. 5, 2, 1
+0xfe 0x81 0x21 0x0d
+
+# CHECK: mtfsf 127, 8, 1, 1
+0xfe 0xff 0x45 0x8e
+
+# CHECK: mtfsf. 125, 8, 1, 1
+0xfe 0xfb 0x45 0x8f
+
 # CHECK: mtfsb0 31                       
 0xff 0xe0 0x00 0x8c
 
diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding-vmx.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding-vmx.txt
index 3896bf7..fe62fdf 100644
--- a/test/MC/Disassembler/PowerPC/ppc64-encoding-vmx.txt
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding-vmx.txt
@@ -378,6 +378,15 @@
 # CHECK: vandc 2, 3, 4                   
 0x10 0x43 0x24 0x44
 
+# CHECK: veqv 2, 3, 4
+0x10 0x43 0x26 0x84
+
+# CHECK: vnand 2, 3, 4
+0x10 0x43 0x25 0x84
+
+# CHECK: vorc 2, 3, 4
+0x10 0x43 0x25 0x44
+
 # CHECK: vnor 2, 3, 4                    
 0x10 0x43 0x25 0x04
 
@@ -501,6 +510,30 @@
 # CHECK: vrsqrtefp 2, 3                  
 0x10 0x40 0x19 0x4a
 
+# CHECK: vclzb 2, 3
+0x10 0x40 0x1f 0x02
+
+# CHECK: vclzh 2, 3
+0x10 0x40 0x1f 0x42
+
+# CHECK: vclzw 2, 3
+0x10 0x40 0x1f 0x82
+
+# CHECK: vclzd 2, 3
+0x10 0x40 0x1f 0xc2
+
+# CHECK: vpopcntb 2, 3
+0x10 0x40 0x1f 0x03
+
+# CHECK: vpopcnth 2, 3
+0x10 0x40 0x1f 0x43
+
+# CHECK: vpopcntw 2, 3
+0x10 0x40 0x1f 0x83
+
+# CHECK: vpopcntd 2, 3
+0x10 0x40 0x1f 0xc3
+
 # CHECK: mtvscr 2                        
 0x10 0x00 0x16 0x44
 
diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding.txt
index 2e2e7c1..e99d49b 100644
--- a/test/MC/Disassembler/PowerPC/ppc64-encoding.txt
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding.txt
@@ -499,6 +499,9 @@
 # CHECK: popcntd 2, 3                    
 0x7c 0x62 0x03 0xf4
 
+# CHECK: cmpb 7, 21, 4
+0x7e 0xa7 0x23 0xf8
+
 # CHECK: rlwinm 2, 3, 4, 5, 6            
 0x54 0x62 0x21 0x4c
 
diff --git a/test/MC/Disassembler/PowerPC/qpx.txt b/test/MC/Disassembler/PowerPC/qpx.txt
new file mode 100644
index 0000000..b53bb4c
--- /dev/null
+++ b/test/MC/Disassembler/PowerPC/qpx.txt
@@ -0,0 +1,383 @@
+# RUN: llvm-mc --disassemble %s -triple powerpc64-bgq-linux -mcpu=a2q | FileCheck %s
+
+# CHECK: qvfabs 3, 5
+0x10 0x60 0x2a 0x10
+
+# CHECK: qvfadd 3, 4, 5
+0x10 0x64 0x28 0x2a
+
+# CHECK: qvfadds 3, 4, 5
+0x00 0x64 0x28 0x2a
+
+# FIXME: decode as qvfandc 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 4
+0x10 0x64 0x2a 0x08
+
+# FIXME: decode as qvfand 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 1
+0x10 0x64 0x28 0x88
+
+# CHECK: qvfcfid 3, 5
+0x10 0x60 0x2e 0x9c
+
+# CHECK: qvfcfids 3, 5
+0x00 0x60 0x2e 0x9c
+
+# CHECK: qvfcfidu 3, 5
+0x10 0x60 0x2f 0x9c
+
+# CHECK: qvfcfidus 3, 5
+0x00 0x60 0x2f 0x9c
+
+# FIXME: decode as qvfclr 3
+# CHECK: qvflogical 3, 3, 3, 0
+0x10 0x63 0x18 0x08
+
+# CHECK: qvfcpsgn 3, 4, 5
+0x10 0x64 0x28 0x10
+
+# FIXME: decode as qvfctfb 3, 4
+# CHECK: qvflogical 3, 4, 4, 5
+0x10 0x64 0x22 0x88
+
+# CHECK: qvfctid 3, 5
+0x10 0x60 0x2e 0x5c
+
+# CHECK: qvfctidu 3, 5
+0x10 0x60 0x2f 0x5c
+
+# CHECK: qvfctiduz 3, 5
+0x10 0x60 0x2f 0x5e
+
+# CHECK: qvfctidz 3, 5
+0x10 0x60 0x2e 0x5e
+
+# CHECK: qvfctiw 3, 5
+0x10 0x60 0x28 0x1c
+
+# CHECK: qvfctiwu 3, 5
+0x10 0x60 0x29 0x1c
+
+# CHECK: qvfctiwuz 3, 5
+0x10 0x60 0x29 0x1e
+
+# CHECK: qvfctiwz 3, 5
+0x10 0x60 0x28 0x1e
+
+# FIXME: decode as qvfequ 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 9
+0x10 0x64 0x2c 0x88
+
+# CHECK: qvflogical 3, 4, 5, 12
+0x10 0x64 0x2e 0x08
+
+# CHECK: qvfmadd 3, 4, 6, 5
+0x10 0x64 0x29 0xba
+
+# CHECK: qvfmadds 3, 4, 6, 5
+0x00 0x64 0x29 0xba
+
+# CHECK: qvfmr 3, 5
+0x10 0x60 0x28 0x90
+
+# CHECK: qvfmsub 3, 4, 6, 5
+0x10 0x64 0x29 0xb8
+
+# CHECK: qvfmsubs 3, 4, 6, 5
+0x00 0x64 0x29 0xb8
+
+# CHECK: qvfmul 3, 4, 6
+0x10 0x64 0x01 0xb2
+
+# CHECK: qvfmuls 3, 4, 6
+0x00 0x64 0x01 0xb2
+
+# CHECK: qvfnabs 3, 5
+0x10 0x60 0x29 0x10
+
+# FIXME: decode as qvfnand 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 14
+0x10 0x64 0x2f 0x08
+
+# CHECK: qvfneg 3, 5
+0x10 0x60 0x28 0x50
+
+# CHECK: qvfnmadd 3, 4, 6, 5
+0x10 0x64 0x29 0xbe
+
+# CHECK: qvfnmadds 3, 4, 6, 5
+0x00 0x64 0x29 0xbe
+
+# CHECK: qvfnmsub 3, 4, 6, 5
+0x10 0x64 0x29 0xbc
+
+# CHECK: qvfnmsubs 3, 4, 6, 5
+0x00 0x64 0x29 0xbc
+
+# FIXME: decode as qvfnor 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 8
+0x10 0x64 0x2c 0x08
+
+# FIXME: decode as qvfnot 3, 4
+# CHECK: qvflogical 3, 4, 4, 10
+0x10 0x64 0x25 0x08
+
+# FIXME: decode as qvforc 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 13
+0x10 0x64 0x2e 0x88
+
+# FIXME: decode as qvfor 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 7
+0x10 0x64 0x2b 0x88
+
+# CHECK: qvfperm 3, 4, 5, 6
+0x10 0x64 0x29 0x8c
+
+# CHECK: qvfre 3, 5
+0x10 0x60 0x28 0x30
+
+# CHECK: qvfres 3, 5
+0x00 0x60 0x28 0x30
+
+# CHECK: qvfrim 3, 5
+0x10 0x60 0x2b 0xd0
+
+# CHECK: qvfrin 3, 5
+0x10 0x60 0x2b 0x10
+
+# CHECK: qvfrip 3, 5
+0x10 0x60 0x2b 0x90
+
+# CHECK: qvfriz 3, 5
+0x10 0x60 0x2b 0x50
+
+# CHECK: qvfrsp 3, 5
+0x10 0x60 0x28 0x18
+
+# CHECK: qvfrsqrte 3, 5
+0x10 0x60 0x28 0x34
+
+# CHECK: qvfrsqrtes 3, 5
+0x00 0x60 0x28 0x34
+
+# CHECK: qvfsel 3, 4, 6, 5
+0x10 0x64 0x29 0xae
+
+# FIXME: decode as qvfset 3
+# CHECK: qvflogical 3, 3, 3, 15
+0x10 0x63 0x1f 0x88
+
+# CHECK: qvfsub 3, 4, 5
+0x10 0x64 0x28 0x28
+
+# CHECK: qvfsubs 3, 4, 5
+0x00 0x64 0x28 0x28
+
+# CHECK: qvfxmadd 3, 4, 6, 5
+0x10 0x64 0x29 0x92
+
+# CHECK: qvfxmadds 3, 4, 6, 5
+0x00 0x64 0x29 0x92
+
+# CHECK: qvfxmul 3, 4, 6
+0x10 0x64 0x01 0xa2
+
+# CHECK: qvfxmuls 3, 4, 6
+0x00 0x64 0x01 0xa2
+
+# FIXME: decode as qvfxor 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 6
+0x10 0x64 0x2b 0x08
+
+# CHECK: qvfxxcpnmadd 3, 4, 6, 5
+0x10 0x64 0x29 0x86
+
+# CHECK: qvfxxcpnmadds 3, 4, 6, 5
+0x00 0x64 0x29 0x86
+
+# CHECK: qvfxxmadd 3, 4, 6, 5
+0x10 0x64 0x29 0x82
+
+# CHECK: qvfxxmadds 3, 4, 6, 5
+0x00 0x64 0x29 0x82
+
+# CHECK: qvfxxnpmadd 3, 4, 6, 5
+0x10 0x64 0x29 0x96
+
+# CHECK: qvfxxnpmadds 3, 4, 6, 5
+0x00 0x64 0x29 0x96
+
+# CHECK: qvlfcduxa 3, 9, 11
+0x7c 0x69 0x58 0xcf
+
+# CHECK: qvlfcdux 3, 9, 11
+0x7c 0x69 0x58 0xce
+
+# CHECK: qvlfcdxa 3, 10, 11
+0x7c 0x6a 0x58 0x8f
+
+# CHECK: qvlfcdx 3, 10, 11
+0x7c 0x6a 0x58 0x8e
+
+# CHECK: qvlfcsuxa 3, 9, 11
+0x7c 0x69 0x58 0x4f
+
+# CHECK: qvlfcsux 3, 9, 11
+0x7c 0x69 0x58 0x4e
+
+# CHECK: qvlfcsxa 3, 10, 11
+0x7c 0x6a 0x58 0x0f
+
+# CHECK: qvlfcsx 3, 10, 11
+0x7c 0x6a 0x58 0x0e
+
+# CHECK: qvlfduxa 3, 9, 11
+0x7c 0x69 0x5c 0xcf
+
+# CHECK: qvlfdux 3, 9, 11
+0x7c 0x69 0x5c 0xce
+
+# CHECK: qvlfdxa 3, 10, 11
+0x7c 0x6a 0x5c 0x8f
+
+# CHECK: qvlfdx 3, 10, 11
+0x7c 0x6a 0x5c 0x8e
+
+# CHECK: qvlfiwaxa 3, 10, 11
+0x7c 0x6a 0x5e 0xcf
+
+# CHECK: qvlfiwax 3, 10, 11
+0x7c 0x6a 0x5e 0xce
+
+# CHECK: qvlfiwzxa 3, 10, 11
+0x7c 0x6a 0x5e 0x8f
+
+# CHECK: qvlfiwzx 3, 10, 11
+0x7c 0x6a 0x5e 0x8e
+
+# CHECK: qvlfsuxa 3, 9, 11
+0x7c 0x69 0x5c 0x4f
+
+# CHECK: qvlfsux 3, 9, 11
+0x7c 0x69 0x5c 0x4e
+
+# CHECK: qvlfsxa 3, 10, 11
+0x7c 0x6a 0x5c 0x0f
+
+# CHECK: qvlfsx 3, 10, 11
+0x7c 0x6a 0x5c 0x0e
+
+# CHECK: qvlpcldx 3, 10, 11
+0x7c 0x6a 0x5c 0x8c
+
+# CHECK: qvlpclsx 3, 10, 11
+0x7c 0x6a 0x5c 0x0c
+
+# CHECK: qvlpcrdx 3, 10, 11
+0x7c 0x6a 0x58 0x8c
+
+# CHECK: qvlpcrsx 3, 10, 11
+0x7c 0x6a 0x58 0x0c
+
+# CHECK: qvstfcduxa 2, 9, 11
+0x7c 0x49 0x59 0xcf
+
+# CHECK: qvstfcduxia 2, 9, 11
+0x7c 0x49 0x59 0xcb
+
+# CHECK: qvstfcduxi 2, 9, 11
+0x7c 0x49 0x59 0xca
+
+# CHECK: qvstfcdux 2, 9, 11
+0x7c 0x49 0x59 0xce
+
+# CHECK: qvstfcdxa 2, 10, 11
+0x7c 0x4a 0x59 0x8f
+
+# CHECK: qvstfcdxia 2, 10, 11
+0x7c 0x4a 0x59 0x8b
+
+# CHECK: qvstfcdxi 2, 10, 11
+0x7c 0x4a 0x59 0x8a
+
+# CHECK: qvstfcdx 2, 10, 11
+0x7c 0x4a 0x59 0x8e
+
+# CHECK: qvstfcsuxa 2, 9, 11
+0x7c 0x49 0x59 0x4f
+
+# CHECK: qvstfcsuxia 2, 9, 11
+0x7c 0x49 0x59 0x4b
+
+# CHECK: qvstfcsuxi 2, 9, 11
+0x7c 0x49 0x59 0x4a
+
+# CHECK: qvstfcsux 2, 9, 11
+0x7c 0x49 0x59 0x4e
+
+# CHECK: qvstfcsxa 2, 10, 11
+0x7c 0x4a 0x59 0x0f
+
+# CHECK: qvstfcsxia 2, 10, 11
+0x7c 0x4a 0x59 0x0b
+
+# CHECK: qvstfcsxi 2, 10, 11
+0x7c 0x4a 0x59 0x0a
+
+# CHECK: qvstfcsx 2, 10, 11
+0x7c 0x4a 0x59 0x0e
+
+# CHECK: qvstfduxa 2, 9, 11
+0x7c 0x49 0x5d 0xcf
+
+# CHECK: qvstfduxia 2, 9, 11
+0x7c 0x49 0x5d 0xcb
+
+# CHECK: qvstfduxi 2, 9, 11
+0x7c 0x49 0x5d 0xca
+
+# CHECK: qvstfdux 2, 9, 11
+0x7c 0x49 0x5d 0xce
+
+# CHECK: qvstfdxa 2, 10, 11
+0x7c 0x4a 0x5d 0x8f
+
+# CHECK: qvstfdxia 2, 10, 11
+0x7c 0x4a 0x5d 0x8b
+
+# CHECK: qvstfdxi 2, 10, 11
+0x7c 0x4a 0x5d 0x8a
+
+# CHECK: qvstfdx 2, 10, 11
+0x7c 0x4a 0x5d 0x8e
+
+# CHECK: qvstfiwxa 2, 10, 11
+0x7c 0x4a 0x5f 0x8f
+
+# CHECK: qvstfiwx 2, 10, 11
+0x7c 0x4a 0x5f 0x8e
+
+# CHECK: qvstfsuxa 2, 9, 11
+0x7c 0x49 0x5d 0x4f
+
+# CHECK: qvstfsuxia 2, 9, 11
+0x7c 0x49 0x5d 0x4b
+
+# CHECK: qvstfsuxi 2, 9, 11
+0x7c 0x49 0x5d 0x4a
+
+# CHECK: qvstfsux 2, 9, 11
+0x7c 0x49 0x5d 0x4e
+
+# CHECK: qvstfsxa 2, 10, 11
+0x7c 0x4a 0x5d 0x0f
+
+# CHECK: qvstfsxia 2, 10, 11
+0x7c 0x4a 0x5d 0x0b
+
+# CHECK: qvstfsxi 2, 10, 11
+0x7c 0x4a 0x5d 0x0a
+
+# CHECK: qvstfsx 2, 10, 11
+0x7c 0x4a 0x5d 0x0e
+
diff --git a/test/MC/Disassembler/PowerPC/vsx.txt b/test/MC/Disassembler/PowerPC/vsx.txt
index b5e2751..5e65482 100644
--- a/test/MC/Disassembler/PowerPC/vsx.txt
+++ b/test/MC/Disassembler/PowerPC/vsx.txt
@@ -404,6 +404,15 @@
 # CHECK: xxland 7, 63, 27
 0xf0 0xff 0xdc 0x14
 
+# CHECK: xxleqv 7, 63, 27
+0xf0 0xff 0xdd 0xd4
+
+# CHECK: xxlnand 7, 63, 27
+0xf0 0xff 0xdd 0x94
+
+# CHECK: xxlorc 7, 63, 27
+0xf0 0xff 0xdd 0x54
+
 # CHECK: xxlandc 7, 63, 27
 0xf0 0xff 0xdc 0x54
 
diff --git a/test/MC/Disassembler/X86/avx-512.txt b/test/MC/Disassembler/X86/avx-512.txt
index 62fc35b..d24a68d 100644
--- a/test/MC/Disassembler/X86/avx-512.txt
+++ b/test/MC/Disassembler/X86/avx-512.txt
@@ -109,3 +109,30 @@
 
 # CHECK: vaddss 1024(%rdx), %xmm0, %xmm16
 0x62 0xe1 0x7e 0x08 0x58 0x82 0x00 0x04 0x00 0x00
+
+# CHECK: vpcmpeqd %zmm10, %zmm25, %k5
+0x62 0xd3 0x35 0x40 0x1f 0xea 0x0
+
+# CHECK: vpcmpltd %zmm10, %zmm25, %k5
+0x62 0xd3 0x35 0x40 0x1f 0xea 0x1
+
+# CHECK: vpcmpled %zmm10, %zmm25, %k5
+0x62 0xd3 0x35 0x40 0x1f 0xea 0x2
+
+# CHECK: vpcmpd $3, %zmm10, %zmm25, %k5
+0x62 0xd3 0x35 0x40 0x1f 0xea 0x3
+
+# CHECK: vpcmpneqd %zmm10, %zmm25, %k5
+0x62 0xd3 0x35 0x40 0x1f 0xea 0x4
+
+# CHECK: vpcmpnltd %zmm10, %zmm25, %k5
+0x62 0xd3 0x35 0x40 0x1f 0xea 0x5
+
+# CHECK: vpcmpnled %zmm10, %zmm25, %k5
+0x62 0xd3 0x35 0x40 0x1f 0xea 0x6
+
+# CHECK: vpcmpd $7, %zmm10, %zmm25, %k5
+0x62 0xd3 0x35 0x40 0x1f 0xea 0x7
+
+# CHECK: vpcmpd $8, %zmm10, %zmm25, %k5
+0x62 0xd3 0x35 0x40 0x1f 0xea 0x8
diff --git a/test/MC/Disassembler/X86/intel-syntax-32.txt b/test/MC/Disassembler/X86/intel-syntax-32.txt
index 2298823..66c87b8 100644
--- a/test/MC/Disassembler/X86/intel-syntax-32.txt
+++ b/test/MC/Disassembler/X86/intel-syntax-32.txt
@@ -29,3 +29,15 @@
 
 # CHECK: mov dword ptr [878082192], eax
 0xa3 0x90 0x78 0x56 0x34
+
+# CHECK: lea	cx, [si + 4]
+0x67 0x66 0x8d 0x4c 0x04
+
+# CHECK: lea	ecx, [si + 4]
+0x67 0x8d 0x4c 0x04 
+
+# CHECK: lea	cx, [esp + 4]
+0x66 0x8d 0x4c 0x24 0x04 
+
+# CHECK: lea	ecx, [esp + 4]
+0x8d 0x4c 0x24 0x04 
diff --git a/test/MC/Disassembler/X86/intel-syntax.txt b/test/MC/Disassembler/X86/intel-syntax.txt
index 3689525..0a628af 100644
--- a/test/MC/Disassembler/X86/intel-syntax.txt
+++ b/test/MC/Disassembler/X86/intel-syntax.txt
@@ -152,3 +152,23 @@
 
 # CHECK: movabs qword ptr [-6066930261531658096], rax
 0x48 0xa3 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
+
+# CHECK: lea	cx, [esp + 4]
+0x67 0x66 0x8d 0x4c 0x24 0x04 
+
+# CHECK: lea	ecx, [esp + 4]
+0x67 0x8d 0x4c 0x24 0x04 
+
+# CHECK: lea	rcx, [esp + 4]
+0x67 0x48 0x8d 0x4c 0x24 0x04 
+
+# CHECK: lea	cx, [rsp + 4]
+0x66 0x8d 0x4c 0x24 0x04 
+
+# CHECK: lea	ecx, [rsp + 4]
+0x8d 0x4c 0x24 0x04 
+
+# CHECK: lea	rcx, [rsp + 4]
+0x48 0x8d 0x4c 0x24 0x04 
+
+
diff --git a/test/MC/Disassembler/X86/invalid-cmp-imm.txt b/test/MC/Disassembler/X86/invalid-cmp-imm.txt
deleted file mode 100644
index 7b2ea2a..0000000
--- a/test/MC/Disassembler/X86/invalid-cmp-imm.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 2>&1 | grep "invalid instruction encoding"
-
-# This instruction would decode as cmpordps if the immediate byte was less than 8.
-0x0f 0xc2 0xc7 0x08
-# This instruction would decode as cmpordpd if the immediate byte was less than 8.
-0x66 0x0f 0xc2 0xc7 0x08
-# This instruction would decode as cmpordss if the immediate byte was less than 8.
-0xf3 0x0f 0xc2 0xc7 0x08
-# This instruction would decode as cmpordsd if the immediate byte was less than 8.
-0xf2 0x0f 0xc2 0xc7 0x08
diff --git a/test/MC/Disassembler/X86/moffs.txt b/test/MC/Disassembler/X86/moffs.txt
index dd2664c..ac18859 100644
--- a/test/MC/Disassembler/X86/moffs.txt
+++ b/test/MC/Disassembler/X86/moffs.txt
@@ -1,86 +1,86 @@
-# RUN: llvm-mc --disassemble --print-imm-hex %s -triple=i686-linux-gnu-code16 | FileCheck --check-prefix=16 %s
-# RUN: llvm-mc --disassemble --print-imm-hex %s -triple=i686-linux-gnu | FileCheck --check-prefix=32 %s
-# RUN: llvm-mc --disassemble --print-imm-hex %s -triple=x86_64-linux-gnu | FileCheck --check-prefix=64 %s
+# RUN: llvm-mc --disassemble --show-encoding --print-imm-hex %s -triple=i686-linux-gnu-code16 | FileCheck --check-prefix=16 %s
+# RUN: llvm-mc --disassemble --show-encoding --print-imm-hex %s -triple=i686-linux-gnu | FileCheck --check-prefix=32 %s
+# RUN: llvm-mc --disassemble --show-encoding --print-imm-hex %s -triple=x86_64-linux-gnu | FileCheck --check-prefix=64 %s
 
-# 16: movb 0x5a5a, %al
-# 32: movb 0x5a5a5a5a, %al
-# 64: movabsb 0x5a5a5a5a5a5a5a5a, %al
+# 16: movb 0x5a5a, %al # encoding: [0xa0,0x5a,0x5a]
+# 32: movb 0x5a5a5a5a, %al # encoding: [0xa0,0x5a,0x5a,0x5a,0x5a]
+# 64: movabsb 0x5a5a5a5a5a5a5a5a, %al # encoding: [0xa0,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a]
 0xa0 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movb 0x5a5a5a5a, %al
-# 32: movb 0x5a5a, %al
-# 64: movabsb 0x5a5a5a5a, %al
+# 16: movb 0x5a5a5a5a, %al # encoding: [0x67,0xa0,0x5a,0x5a,0x5a,0x5a]
+# 32: movb 0x5a5a, %al # encoding: [0x67,0xa0,0x5a,0x5a]
+# 64: movb 0x5a5a5a5a, %al # encoding: [0x67,0xa0,0x5a,0x5a,0x5a,0x5a]
 0x67 0xa0 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movw 0x5a5a, %ax
-# 32: movl 0x5a5a5a5a, %eax
-# 64: movabsl 0x5a5a5a5a5a5a5a5a, %eax
+# 16: movw 0x5a5a, %ax # encoding: [0xa1,0x5a,0x5a]
+# 32: movl 0x5a5a5a5a, %eax # encoding: [0xa1,0x5a,0x5a,0x5a,0x5a]
+# 64: movabsl 0x5a5a5a5a5a5a5a5a, %eax # encoding: [0xa1,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a]
 0xa1 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movw 0x5a5a5a5a, %ax
-# 32: movl 0x5a5a, %eax
-# 64: movabsl 0x5a5a5a5a, %eax
+# 16: movw 0x5a5a5a5a, %ax # encoding: [0x67,0xa1,0x5a,0x5a,0x5a,0x5a]
+# 32: movl 0x5a5a, %eax # encoding: [0x67,0xa1,0x5a,0x5a]
+# 64: movl 0x5a5a5a5a, %eax # encoding: [0x67,0xa1,0x5a,0x5a,0x5a,0x5a]
 0x67 0xa1 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movl 0x5a5a, %eax
-# 32: movw 0x5a5a5a5a, %ax
-# 64: movabsw 0x5a5a5a5a5a5a5a5a, %ax
+# 16: movl 0x5a5a, %eax # encoding: [0x66,0xa1,0x5a,0x5a]
+# 32: movw 0x5a5a5a5a, %ax # encoding: [0x66,0xa1,0x5a,0x5a,0x5a,0x5a]
+# 64: movabsw 0x5a5a5a5a5a5a5a5a, %ax # encoding: [0x66,0xa1,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a]
 0x66 0xa1 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movl 0x5a5a5a5a, %eax
-# 32: movw 0x5a5a, %ax
-# 64: movabsw 0x5a5a5a5a, %ax
+# 16: movl 0x5a5a5a5a, %eax # encoding: [0x67,0x66,0xa1,0x5a,0x5a,0x5a,0x5a]
+# 32: movw 0x5a5a, %ax # encoding: [0x67,0x66,0xa1,0x5a,0x5a]
+# 64: movw 0x5a5a5a5a, %ax # encoding: [0x67,0x66,0xa1,0x5a,0x5a,0x5a,0x5a]
 0x66 0x67 0xa1 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movl 0x5a5a5a5a, %eax
-# 32: movw 0x5a5a, %ax
-# 64: movabsw 0x5a5a5a5a, %ax
+# 16: movl 0x5a5a5a5a, %eax # encoding: [0x67,0x66,0xa1,0x5a,0x5a,0x5a,0x5a]
+# 32: movw 0x5a5a, %ax # encoding: [0x67,0x66,0xa1,0x5a,0x5a]
+# 64: movw 0x5a5a5a5a, %ax # encoding: [0x67,0x66,0xa1,0x5a,0x5a,0x5a,0x5a]
 0x67 0x66 0xa1 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movl %es:0x5a5a5a5a, %eax
-# 32: movw %es:0x5a5a, %ax
-# 64: movabsw %es:0x5a5a5a5a, %ax
-0x67 0x26 0x66 0xa1 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
+# 16: movl %es:0x5a5a5a5a, %eax # encoding: [0x67,0x66,0x26,0xa1,0x5a,0x5a,0x5a,0x5a]
+# 32: movw %es:0x5a5a, %ax # encoding: [0x67,0x66,0x26,0xa1,0x5a,0x5a]
+# 64: movw %es:0x5a5a5a5a, %ax # encoding: [0x67,0x66,0x26,0xa1,0x5a,0x5a,0x5a,0x5a]
+0x67 0x26 0x66 0xa1 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a # encoding: [0xa0,0x5a,0x5a]
 
 
 
-# 16: movb %al, 0x5a5a
-# 32: movb %al, 0x5a5a5a5a
-# 64: movabsb %al, 0x5a5a5a5a5a5a5a5a
-0xa2 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
+# 16: movb %al, 0x5a5a # encoding: [0xa2,0x5a,0x5a]
+# 32: movb %al, 0x5a5a5a5a # encoding: [0xa2,0x5a,0x5a,0x5a,0x5a]
+# 64: movabsb %al, 0x5a5a5a5a5a5a5a5a # encoding: [0xa2,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a]
+0xa2 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a # encoding: [0xa0,0x5a,0x5a]
 
-# 16: movb %al, 0x5a5a5a5a
-# 32: movb %al, 0x5a5a
-# 64: movabsb %al, 0x5a5a5a5a
+# 16: movb %al, 0x5a5a5a5a # encoding: [0x67,0xa2,0x5a,0x5a,0x5a,0x5a]
+# 32: movb %al, 0x5a5a # encoding: [0x67,0xa2,0x5a,0x5a]
+# 64: movb %al, 0x5a5a5a5a # encoding: [0x67,0xa2,0x5a,0x5a,0x5a,0x5a]
 0x67 0xa2 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movw %ax, 0x5a5a
-# 32: movl %eax, 0x5a5a5a5a
-# 64: movabsl %eax, 0x5a5a5a5a5a5a5a5a
+# 16: movw %ax, 0x5a5a # encoding: [0xa3,0x5a,0x5a]
+# 32: movl %eax, 0x5a5a5a5a # encoding: [0xa3,0x5a,0x5a,0x5a,0x5a]
+# 64: movabsl %eax, 0x5a5a5a5a5a5a5a5a # encoding: [0xa3,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a]
 0xa3 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movw %ax, %gs:0x5a5a5a5a
-# 32: movl %eax, %gs:0x5a5a
-# 64: movabsl %eax, %gs:0x5a5a5a5a
+# 16: movw %ax, %gs:0x5a5a5a5a # encoding: [0x67,0x65,0xa3,0x5a,0x5a,0x5a,0x5a]
+# 32: movl %eax, %gs:0x5a5a # encoding: [0x67,0x65,0xa3,0x5a,0x5a]
+# 64: movl %eax, %gs:0x5a5a5a5a # encoding: [0x67,0x65,0xa3,0x5a,0x5a,0x5a,0x5a]
 0x65 0x67 0xa3 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movl %eax, 0x5a5a
-# 32: movw %ax, 0x5a5a5a5a
-# 64: movabsw %ax, 0x5a5a5a5a5a5a5a5a
+# 16: movl %eax, 0x5a5a # encoding: [0x66,0xa3,0x5a,0x5a]
+# 32: movw %ax, 0x5a5a5a5a # encoding: [0x66,0xa3,0x5a,0x5a,0x5a,0x5a]
+# 64: movabsw %ax, 0x5a5a5a5a5a5a5a5a # encoding: [0x66,0xa3,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a,0x5a]
 0x66 0xa3 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movl %eax, 0x5a5a5a5a
-# 32: movw %ax, 0x5a5a
-# 64: movabsw %ax, 0x5a5a5a5a
+# 16: movl %eax, 0x5a5a5a5a # encoding: [0x67,0x66,0xa3,0x5a,0x5a,0x5a,0x5a]
+# 32: movw %ax, 0x5a5a # encoding: [0x67,0x66,0xa3,0x5a,0x5a]
+# 64: movw %ax, 0x5a5a5a5a # encoding: [0x67,0x66,0xa3,0x5a,0x5a,0x5a,0x5a]
 0x66 0x67 0xa3 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movl %eax, 0x5a5a5a5a
-# 32: movw %ax, 0x5a5a
-# 64: movabsw %ax, 0x5a5a5a5a
+# 16: movl %eax, 0x5a5a5a5a # encoding: [0x67,0x66,0xa3,0x5a,0x5a,0x5a,0x5a]
+# 32: movw %ax, 0x5a5a # encoding: [0x67,0x66,0xa3,0x5a,0x5a]
+# 64: movw %ax, 0x5a5a5a5a # encoding: [0x67,0x66,0xa3,0x5a,0x5a,0x5a,0x5a]
 0x67 0x66 0xa3 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
-# 16: movl %eax, %es:0x5a5a5a5a
-# 32: movw %ax, %es:0x5a5a
-# 64: movabsw %ax, %es:0x5a5a5a5a
+# 16: movl %eax, %es:0x5a5a5a5a # encoding: [0x67,0x66,0x26,0xa3,0x5a,0x5a,0x5a,0x5a]
+# 32: movw %ax, %es:0x5a5a # encoding: [0x67,0x66,0x26,0xa3,0x5a,0x5a]
+# 64: movw %ax, %es:0x5a5a5a5a # encoding: [0x67,0x66,0x26,0xa3,0x5a,0x5a,0x5a,0x5a]
 0x67 0x26 0x66 0xa3 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
 
diff --git a/test/MC/Disassembler/X86/prefixes.txt b/test/MC/Disassembler/X86/prefixes.txt
index b8830dc..9e002fa 100644
--- a/test/MC/Disassembler/X86/prefixes.txt
+++ b/test/MC/Disassembler/X86/prefixes.txt
@@ -8,11 +8,11 @@
 0x64 0x48 0x8b 0x3c 0x25 0x00 0x03 0x00 0x00
 
 # CHECK: rep
-# CHECK-NEXT:		stosq
+# CHECK-NEXT:		stosq %rax, %es:(%rdi)
 0xf3 0x48 0xab
 
 # CHECK: rep
-# CHECK-NEXT:		stosl
+# CHECK-NEXT:		stosq %rax, %es:(%edi)
 0xf3 0x67 0x48 0xab
 
 # CHECK: movl 32(%rbp), %eax
@@ -54,6 +54,17 @@
 # CHECK-NEXT: stosq
 0xf3 0xf3 0x48 0xab
 
+
+# Test that we can disassembler control registers above CR8
+# CHECK: movq %cr15, %rax
+0x44 0x0f 0x20 0xf8
+# CHECK: movq %dr15, %rax
+0x44 0x0f 0x21 0xf8
+
+# Test that MMX ignore REX.R and REX.B.
+# CHECK: movq %mm0, %mm1
+0x46 0x0f 0x7f 0xc1
+
 # Test that a prefix on it's own works. It's debatable as to if this is 
 # something that is considered valid, but however as LLVM's own disassembler
 # has decided to disassemble prefixes as being separate opcodes, it therefore 
diff --git a/test/MC/Disassembler/X86/simple-tests.txt b/test/MC/Disassembler/X86/simple-tests.txt
index e6e9c7b..875c5b7 100644
--- a/test/MC/Disassembler/X86/simple-tests.txt
+++ b/test/MC/Disassembler/X86/simple-tests.txt
@@ -90,9 +90,24 @@
 # CHECK: movq	%cr0, %rcx
 0x0f 0x20 0xc1
 
+# CHECK: leaw	4(%esp), %cx
+0x67 0x66 0x8d 0x4c 0x24 0x04 
+
+# CHECK: leal	4(%esp), %ecx
+0x67 0x8d 0x4c 0x24 0x04 
+
+# CHECK: leaq	4(%esp), %rcx
+0x67 0x48 0x8d 0x4c 0x24 0x04 
+
+# CHECK: leaw	4(%rsp), %cx
+0x66 0x8d 0x4c 0x24 0x04 
+
 # CHECK: leal	4(%rsp), %ecx
 0x8d 0x4c 0x24 0x04 
 
+# CHECK: leaq	4(%rsp), %rcx
+0x48 0x8d 0x4c 0x24 0x04 
+
 # CHECK: enter	$1, $2
 0xc8 0x01 0x00 0x02
 
@@ -896,6 +911,12 @@
 # CHECK: vpcmov %ymm1, (%rax), %ymm3, %ymm4
 0x8f 0xe8 0x64 0xa2 0x20 0x10
 
+# CHECK: vpcomeqb  %xmm6, %xmm4, %xmm2
+0x8f 0xe8 0x58 0xcc 0xd6 0x04
+
+# CHECK: vpcomneqb 8(%rax), %xmm3, %xmm2
+0x8f 0xe8 0x60 0xcc 0x50 0x08 0x05
+
 # CHECK: vpcomb $55, %xmm6, %xmm4, %xmm2
 0x8f 0xe8 0x58 0xcc 0xd6 0x37
 
diff --git a/test/MC/Disassembler/X86/x86-32.txt b/test/MC/Disassembler/X86/x86-32.txt
index 79577c6..830b830 100644
--- a/test/MC/Disassembler/X86/x86-32.txt
+++ b/test/MC/Disassembler/X86/x86-32.txt
@@ -490,6 +490,27 @@
 # CHECK: xsaveopt (%eax)
 0x0f 0xae 0x30
 
+# CHECK: xsaves (%eax)
+0x0f 0xc7 0x28
+
+# CHECK: xrstors (%eax)
+0x0f 0xc7 0x18
+
+# CHECK: xsavec (%eax)
+0x0f 0xc7 0x20
+
+# CHECK: clflush (%eax)
+0x0f 0xae 0x38
+
+# CHECK: clflushopt (%eax)
+0x66 0x0f 0xae 0x38
+
+# CHECK: clwb (%eax)
+0x66 0x0f 0xae 0x30
+
+# CHECK: pcommit
+0x66 0x0f 0xae 0xf8
+
 # CHECK: vcvtph2ps %xmm0, %xmm0
 0xc4 0xe2 0x79 0x13 0xc0
 
@@ -712,5 +733,34 @@
 # CHECK: movq %mm0, %mm1
 0x0f 0x7f 0xc1
 
-# CHECK: vpermq $-18, %ymm2, %ymm2
+# CHECK: vpermq $238, %ymm2, %ymm2
 0xc4 0xe3 0xfd 0x00 0xd2 0xee
+
+# CHECK: cmpps $8, %xmm7, %xmm0
+0x0f 0xc2 0xc7 0x08
+# CHECK: cmppd $8, %xmm7, %xmm0
+0x66 0x0f 0xc2 0xc7 0x08
+# CHECK: cmpss $8, %xmm7, %xmm0
+0xf3 0x0f 0xc2 0xc7 0x08
+# CHECK: cmpsd $8, %xmm7, %xmm0
+0xf2 0x0f 0xc2 0xc7 0x08
+
+# CHECK: addb $38, 5277496
+0x82 0x05 0x38 0x87 0x50 0x00 0x26
+# CHECK: orb $38, 5277496
+0x82 0x0d 0x38 0x87 0x50 0x00 0x26
+# CHECK: adcb $38, 5277496
+0x82 0x15 0x38 0x87 0x50 0x00 0x26
+# CHECK: sbbb $38, 5277496
+0x82 0x1d 0x38 0x87 0x50 0x00 0x26
+# CHECK: andb $38, 5277496
+0x82 0x25 0x38 0x87 0x50 0x00 0x26
+# CHECK: subb $38, 5277496
+0x82 0x2D 0x38 0x87 0x50 0x00 0x26
+# CHECK: xorb $38, 5277496
+0x82 0x35 0x38 0x87 0x50 0x00 0x26
+# CHECK: cmpb $38, 5277496
+0x82 0x3d 0x38 0x87 0x50 0x00 0x26
+
+#CHECK: getsec
+0x0f 0x37
diff --git a/test/MC/Disassembler/X86/x86-64.txt b/test/MC/Disassembler/X86/x86-64.txt
index 6f072df..f000d15 100644
--- a/test/MC/Disassembler/X86/x86-64.txt
+++ b/test/MC/Disassembler/X86/x86-64.txt
@@ -107,19 +107,22 @@
 # CHECK: xbegin	53
 0xc7 0xf8 0x35 0x00 0x00 0x00
 
+# CHECK: xbegin	53
+0x66 0xc7 0xf8 0x35 0x00
+
 # CHECK: xend
 0x0f 0x01 0xd5
 
 # CHECK: xabort $13
 0xc6 0xf8 0x0d
 
-# CHECK: xsaveq (%rax)
+# CHECK: xsave64 (%rax)
 0x48 0x0f 0xae 0x20
 
-# CHECK: xrstorq (%rax)
+# CHECK: xrstor64 (%rax)
 0x48 0x0f 0xae 0x28
 
-# CHECK: xsaveoptq (%rax)
+# CHECK: xsaveopt64 (%rax)
 0x48 0x0f 0xae 0x30
 
 # CHECK: clac
@@ -233,6 +236,27 @@
 # CHECK: vmovq %xmm0, %rax
 0xc4 0xe1 0xf9 0x7e 0xc0
 
+# CHECK: movd (%rax), %mm0
+0x48 0x0f 0x6e 0x00
+
+# CHECK: movd %rax, %mm0
+0x48 0x0f 0x6e 0xc0
+
+# CHECK: movd %mm0, (%rax)
+0x48 0x0f 0x7e 0x00
+
+# CHECK: movd %mm0, %rax
+0x48 0x0f 0x7e 0xc0
+
+# CHECK: movd (%rax), %xmm0
+0x66 0x48 0x0f 0x6e 0x00
+
+# CHECK: movd %rax, %xmm0
+0x66 0x48 0x0f 0x6e 0xc0
+
+# CHECK: movd %xmm0, (%rax)
+0x66 0x48 0x0f 0x7e 0x00
+
 # CHECK: movd %xmm0, %rax
 0x66 0x48 0x0f 0x7e 0xc0
 
@@ -265,3 +289,15 @@
 
 # CHECK: $0, 305419896(%rbp)
 0x80 0x84 0x25 0x78 0x56 0x34 0x12 0x00
+
+# CHECK: movabsq 6510615555426900570, %rax
+0x48 0xa1 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
+
+# CHECK: movq 1515870810, %rax
+0x67, 0x48 0xa1 0x5a 0x5a 0x5a 0x5a
+
+# CHECK: movabsq %rax, 6510615555426900570
+0x48 0xa3 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a 0x5a
+
+# CHECK: movq %rax, 1515870810
+0x67, 0x48 0xa3 0x5a 0x5a 0x5a 0x5a
diff --git a/test/MC/ELF/alias.s b/test/MC/ELF/alias.s
index 2e65ace..8e13182 100644
--- a/test/MC/ELF/alias.s
+++ b/test/MC/ELF/alias.s
@@ -20,6 +20,10 @@ bar5 = bar4
 
         .long foo2
 
+// Test that bar6 is a function that doesn't have the same value as foo4.
+bar6 = bar5
+bar6:
+
 // CHECK:      Symbols [
 // CHECK-NEXT:   Symbol {
 // CHECK-NEXT:     Name:  (0)
@@ -58,6 +62,15 @@ bar5 = bar4
 // CHECK-NEXT:     Section: .text
 // CHECK-NEXT:   }
 // CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: bar6
+// CHECK-NEXT:     Value: 0x5
+// CHECK-NEXT:     Size: 0
+// CHECK-NEXT:     Binding: Local
+// CHECK-NEXT:     Type: None
+// CHECK-NEXT:     Other: 0
+// CHECK-NEXT:     Section: .text
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
 // CHECK-NEXT:     Name: foo
 // CHECK-NEXT:     Value: 0x0
 // CHECK-NEXT:     Size: 0
diff --git a/test/MC/ELF/cfi-large-model.s b/test/MC/ELF/cfi-large-model.s
new file mode 100644
index 0000000..16073ad
--- /dev/null
+++ b/test/MC/ELF/cfi-large-model.s
@@ -0,0 +1,27 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -code-model=large %s \
+// RUN:   -o - | llvm-readobj -s -sd | FileCheck %s
+
+// CHECK:      Section {
+// CHECK:        Index: 
+// CHECK:        Name: .eh_frame
+// CHECK-NEXT:   Type: SHT_PROGBITS
+// CHECK-NEXT:   Flags [
+// CHECK-NEXT:     SHF_ALLOC
+// CHECK-NEXT:   ]
+// CHECK-NEXT:   Address: 0x0
+// CHECK-NEXT:   Offset: 0x40
+// CHECK-NEXT:   Size: 56
+// CHECK-NEXT:   Link: 0
+// CHECK-NEXT:   Info: 0
+// CHECK-NEXT:   AddressAlignment: 8
+// CHECK-NEXT:   EntrySize: 0
+// CHECK-NEXT:   SectionData (
+// CHECK-NEXT:     0000: 14000000 00000000 037A5200 01781001  |.........zR..x..|
+// CHECK-NEXT:     0010: 1C0C0708 90010000 1C000000 1C000000  |................|
+// CHECK-NEXT:     0020: 00000000 00000000 00000000 00000000  |................|
+// CHECK-NEXT:     0030: 00000000 00000000                    |........|
+// CHECK-NEXT:   )
+
+f:
+    .cfi_startproc
+    .cfi_endproc
diff --git a/test/MC/ELF/cfi-version.ll b/test/MC/ELF/cfi-version.ll
index 2938dc7..c8a9978 100644
--- a/test/MC/ELF/cfi-version.ll
+++ b/test/MC/ELF/cfi-version.ll
@@ -22,19 +22,19 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/test.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test.c", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, i32 ()* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/test.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.5.0 "}
-!12 = metadata !{i32 2, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/test.c] [DW_LANG_C99]
+!1 = !{!"test.c", !"/tmp"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\002\000\001\000\006\00256\000\002", !1, !5, !6, null, i32 ()* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp/test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.5.0 "}
+!12 = !MDLocation(line: 2, scope: !4)
 
 ; DWARF2:      .debug_frame contents:
 ; DWARF2:        Version:               1
diff --git a/test/MC/ELF/common-error1.s b/test/MC/ELF/common-error1.s
new file mode 100644
index 0000000..a413885
--- /dev/null
+++ b/test/MC/ELF/common-error1.s
@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux < %s 2>&1 | FileCheck %s
+
+        .comm   C,4,4
+        .set    A,C
+
+// CHECK: Common symbol C cannot be used in assignment expr
diff --git a/test/MC/ELF/common-error2.s b/test/MC/ELF/common-error2.s
new file mode 100644
index 0000000..d666fee
--- /dev/null
+++ b/test/MC/ELF/common-error2.s
@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux < %s 2>&1 | FileCheck %s
+
+        .set    A,C
+        .comm   C,4,4
+
+// CHECK: Common symbol C cannot be used in assignment expr
diff --git a/test/MC/ELF/relocation-386.s b/test/MC/ELF/relocation-386.s
index ba12df0..83c524b 100644
--- a/test/MC/ELF/relocation-386.s
+++ b/test/MC/ELF/relocation-386.s
@@ -63,6 +63,8 @@
 // Relocation 28 (und_symbol-bar2) is of type R_386_PC8
 // CHECK-NEXT:     0xA0         R_386_PC8        und_symbol 0x0
 // CHECK-NEXT:     0xA3         R_386_GOTOFF     und_symbol 0x0
+// Relocation 29 (zed@PLT) is of type R_386_PLT32 and uses the symbol
+// CHECK-NEXT:     0xA9         R_386_PLT32      zed 0x0
 // CHECK-NEXT:   }
 // CHECK-NEXT: ]
 
@@ -129,6 +131,7 @@ bar2:
         .byte und_symbol-bar2
 
         leal 1 + und_symbol@GOTOFF, %edi
+        movl zed@PLT(%eax), %eax
 
         .section        zedsec,"awT",@progbits
 zed:
diff --git a/test/MC/ELF/section-unique.s b/test/MC/ELF/section-unique.s
new file mode 100644
index 0000000..b482af3
--- /dev/null
+++ b/test/MC/ELF/section-unique.s
@@ -0,0 +1,39 @@
+// RUN: llvm-mc -triple x86_64-pc-linux-gnu %s -o - | FileCheck %s
+// RUN: llvm-mc -triple x86_64-pc-linux-gnu %s -filetype=obj -o - | llvm-readobj -t | FileCheck %s --check-prefix=OBJ
+
+	.section	.text,"ax",@progbits,unique
+        .globl	f
+f:
+        nop
+
+	.section	.text,"ax",@progbits,unique
+        .globl	g
+g:
+        nop
+
+// test that f and g are in different sections.
+
+// CHECK: .section	.text,"ax",@progbits,unique
+// CHECK: f:
+
+// CHECK: .section	.text,"ax",@progbits,unique
+// CHECK: g:
+
+// OBJ: Symbol {
+// OBJ:   Name:    f
+// OBJ:   Value:   0x0
+// OBJ:   Size:    0
+// OBJ:   Binding: Global
+// OBJ:   Type:    None
+// OBJ:   Other:   0
+// OBJ:   Section: .text (0x4)
+// OBJ: }
+// OBJ: Symbol {
+// OBJ:   Name:    g
+// OBJ:   Value:   0x0
+// OBJ:   Size:    0
+// OBJ:   Binding: Global
+// OBJ:   Type:    None
+// OBJ:   Other:   0
+// OBJ:   Section: .text (0x5)
+// OBJ: }
diff --git a/test/MC/ELF/symver-msvc.s b/test/MC/ELF/symver-msvc.s
new file mode 100644
index 0000000..d6730ca
--- /dev/null
+++ b/test/MC/ELF/symver-msvc.s
@@ -0,0 +1,59 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-windows-elf %s -o - | llvm-readobj -r -t | FileCheck %s
+
+// Verify that MSVC C++ mangled symbols are not affected by the ELF
+// GNU-style symbol versioning. The ELF format is used on Windows by
+// the MCJIT execution engine.
+
+        .long "??_R0?AVexception@std@@@8"
+        .long "@??_R0?AVinvalid_argument@std@@@8"
+        .long "__imp_??_R0?AVlogic_error@std@@@8"
+        .long "__imp_@??_R0PAVexception@std@@@8"
+
+
+// CHECK:       Relocations [
+// CHECK-NEXT:    Section (2) .rela.text {
+// CHECK-NEXT:      0x0 R_X86_64_32 ??_R0?AVexception@std@@@8 0x0
+// CHECK-NEXT:      0x4 R_X86_64_32 @??_R0?AVinvalid_argument@std@@@8 0x0
+// CHECK-NEXT:      0x8 R_X86_64_32 __imp_??_R0?AVlogic_error@std@@@8 0x0
+// CHECK-NEXT:      0xC R_X86_64_32 __imp_@??_R0PAVexception@std@@@8 0x0
+// CHECK-NEXT:    }
+// CHECK-NEXT:  ]
+
+// CHECK:       Symbols [
+// CHECK:         Symbol {
+// CHECK:           Name: ??_R0?AVexception@std@@@8 (102)
+// CHECK-NEXT:      Value: 0x0
+// CHECK-NEXT:      Size: 0
+// CHECK-NEXT:      Binding: Global (0x1)
+// CHECK-NEXT:      Type: None (0x0)
+// CHECK-NEXT:      Other: 0
+// CHECK-NEXT:      Section: Undefined (0x0)
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Symbol {
+// CHECK-NEXT:      Name: @??_R0?AVinvalid_argument@std@@@8 (1)
+// CHECK-NEXT:      Value: 0x0
+// CHECK-NEXT:      Size: 0
+// CHECK-NEXT:      Binding: Global (0x1)
+// CHECK-NEXT:      Type: None (0x0)
+// CHECK-NEXT:      Other: 0
+// CHECK-NEXT:      Section: Undefined (0x0)
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Symbol {
+// CHECK-NEXT:      Name: __imp_??_R0?AVlogic_error@std@@@8 (35)
+// CHECK-NEXT:      Value: 0x0
+// CHECK-NEXT:      Size: 0
+// CHECK-NEXT:      Binding: Global (0x1)
+// CHECK-NEXT:      Type: None (0x0)
+// CHECK-NEXT:      Other: 0
+// CHECK-NEXT:      Section: Undefined (0x0)
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Symbol {
+// CHECK-NEXT:      Name: __imp_@??_R0PAVexception@std@@@8 (69)
+// CHECK-NEXT:      Value: 0x0
+// CHECK-NEXT:      Size: 0
+// CHECK-NEXT:      Binding: Global (0x1)
+// CHECK-NEXT:      Type: None (0x0)
+// CHECK-NEXT:      Other: 0
+// CHECK-NEXT:      Section: Undefined (0x0)
+// CHECK-NEXT:    }
+// CHECK-NEXT:  ]
diff --git a/test/MC/ELF/type.s b/test/MC/ELF/type.s
index c82d300..f7745d8 100644
--- a/test/MC/ELF/type.s
+++ b/test/MC/ELF/type.s
@@ -9,8 +9,8 @@ foo:
         .type bar,@object
 bar:
 
-// Test that gnu_unique_object is accepted.
         .type zed,@gnu_unique_object
+zed:
 
 obj:
         .global obj
@@ -310,3 +310,13 @@ alias12:
 // CHECK-NEXT:     Other: 0
 // CHECK-NEXT:     Section: .text (0x1)
 // CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: zed (32)
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:     Size: 0
+// CHECK-NEXT:     Binding: Unique (0xA)
+// CHECK-NEXT:     Type: Object (0x1)
+// CHECK-NEXT:     Other: 0
+// CHECK-NEXT:     Section: .text (0x1)
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
diff --git a/test/MC/ELF/uleb.s b/test/MC/ELF/uleb.s
index d755cc2..5d203a9 100644
--- a/test/MC/ELF/uleb.s
+++ b/test/MC/ELF/uleb.s
@@ -11,16 +11,17 @@ foo:
 	.uleb128	128
 	.uleb128	16383
 	.uleb128	16384
+        .uleb128	23, 42
 
 // ELF_32:   Name: .text
 // ELF_32:   SectionData (
-// ELF_32:     0000: 00017F80 01FF7F80 8001
+// ELF_32:     0000: 00017F80 01FF7F80 8001172A
 // ELF_32:   )
 // ELF_64:   Name: .text
 // ELF_64:   SectionData (
-// ELF_64:     0000: 00017F80 01FF7F80 8001
+// ELF_64:     0000: 00017F80 01FF7F80 8001172A
 // ELF_64:   )
 // MACHO_32: ('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// MACHO_32: ('_section_data', '00017f80 01ff7f80 8001')
+// MACHO_32: ('_section_data', '00017f80 01ff7f80 8001172a')
 // MACHO_64: ('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// MACHO_64: ('_section_data', '00017f80 01ff7f80 8001')
+// MACHO_64: ('_section_data', '00017f80 01ff7f80 8001172a')
diff --git a/test/MC/Hexagon/inst_add.ll b/test/MC/Hexagon/inst_add.ll
index 5377d94..20a7b31 100644
--- a/test/MC/Hexagon/inst_add.ll
+++ b/test/MC/Hexagon/inst_add.ll
@@ -7,4 +7,4 @@ define i32 @foo (i32 %a, i32 %b)
   ret i32 %1
 }
 
-; CHECK:  0000 004100f3 00c09f52
-\ No newline at end of file
+; CHECK:  0000 004100f3 00c09f52
diff --git a/test/MC/Hexagon/inst_add64.ll b/test/MC/Hexagon/inst_add64.ll
new file mode 100644
index 0000000..7216067
--- /dev/null
+++ b/test/MC/Hexagon/inst_add64.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i64 @foo (i64 %a, i64 %b)
+{
+  %1 = add i64 %a, %b
+  ret i64 %1
+}
+
+; CHECK:  0000 e04200d3 00c09f52
diff --git a/test/MC/Hexagon/inst_and.ll b/test/MC/Hexagon/inst_and.ll
index 16bf304..854adf0 100644
--- a/test/MC/Hexagon/inst_and.ll
+++ b/test/MC/Hexagon/inst_and.ll
@@ -7,4 +7,4 @@ define i32 @foo (i32 %a, i32 %b)
   ret i32 %1
 }
 
-; CHECK:  0000 004100f1 00c09f52
-\ No newline at end of file
+; CHECK:  0000 004100f1 00c09f52
diff --git a/test/MC/Hexagon/inst_and64.ll b/test/MC/Hexagon/inst_and64.ll
new file mode 100644
index 0000000..0b83074
--- /dev/null
+++ b/test/MC/Hexagon/inst_and64.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i64 @foo (i64 %a, i64 %b)
+{
+  %1 = and i64 %a, %b
+  ret i64 %1
+}
+
+; CHECK:  0000 0042e0d3 00c09f52
diff --git a/test/MC/Hexagon/inst_aslh.ll b/test/MC/Hexagon/inst_aslh.ll
new file mode 100644
index 0000000..6809438
--- /dev/null
+++ b/test/MC/Hexagon/inst_aslh.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i32 %a)
+{
+  %1 = shl i32 %a, 16
+  ret i32 %1
+}
+
+; CHECK:   0000 00400070 00c09f52
diff --git a/test/MC/Hexagon/inst_asrh.ll b/test/MC/Hexagon/inst_asrh.ll
new file mode 100644
index 0000000..f16f0a1
--- /dev/null
+++ b/test/MC/Hexagon/inst_asrh.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i32 %a)
+{
+  %1 = ashr i32 %a, 16
+  ret i32 %1
+}
+
+; CHECK:   0000 00402070 00c09f52
diff --git a/test/MC/Hexagon/inst_cmp_eq.ll b/test/MC/Hexagon/inst_cmp_eq.ll
new file mode 100644
index 0000000..113db63
--- /dev/null
+++ b/test/MC/Hexagon/inst_cmp_eq.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i1 @foo (i32 %a, i32 %b)
+{
+  %1 = icmp eq i32 %a, %b
+  ret i1 %1
+}
+
+; CHECK:  0000 004100f2 00404089 00c09f52
diff --git a/test/MC/Hexagon/inst_cmp_eqi.ll b/test/MC/Hexagon/inst_cmp_eqi.ll
new file mode 100644
index 0000000..70c4c30
--- /dev/null
+++ b/test/MC/Hexagon/inst_cmp_eqi.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i1 @foo (i32 %a)
+{
+  %1 = icmp eq i32 %a, 42
+  ret i1 %1
+}
+
+; CHECK:  0000 40450075 00404089 00c09f52
diff --git a/test/MC/Hexagon/inst_cmp_gt.ll b/test/MC/Hexagon/inst_cmp_gt.ll
new file mode 100644
index 0000000..85fedbf
--- /dev/null
+++ b/test/MC/Hexagon/inst_cmp_gt.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i1 @foo (i32 %a, i32 %b)
+{
+  %1 = icmp sgt i32 %a, %b
+  ret i1 %1
+}
+
+; CHECK:  0000 004140f2 00404089 00c09f52
diff --git a/test/MC/Hexagon/inst_cmp_gti.ll b/test/MC/Hexagon/inst_cmp_gti.ll
new file mode 100644
index 0000000..18ba3e4
--- /dev/null
+++ b/test/MC/Hexagon/inst_cmp_gti.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i1 @foo (i32 %a)
+{
+  %1 = icmp sgt i32 %a, 42
+  ret i1 %1
+}
+
+; CHECK:  0000 40454075 00404089 00c09f52
diff --git a/test/MC/Hexagon/inst_cmp_lt.ll b/test/MC/Hexagon/inst_cmp_lt.ll
new file mode 100644
index 0000000..3a76184
--- /dev/null
+++ b/test/MC/Hexagon/inst_cmp_lt.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i1 @foo (i32 %a, i32 %b)
+{
+  %1 = icmp slt i32 %a, %b
+  ret i1 %1
+}
+
+; CHECK:  0000 004041f2 00404089 00c09f52
diff --git a/test/MC/Hexagon/inst_cmp_ugt.ll b/test/MC/Hexagon/inst_cmp_ugt.ll
new file mode 100644
index 0000000..096536f
--- /dev/null
+++ b/test/MC/Hexagon/inst_cmp_ugt.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i1 @foo (i32 %a, i32 %b)
+{
+  %1 = icmp ugt i32 %a, %b
+  ret i1 %1
+}
+
+; CHECK:  0000 004160f2 00404089 00c09f52
diff --git a/test/MC/Hexagon/inst_cmp_ugti.ll b/test/MC/Hexagon/inst_cmp_ugti.ll
new file mode 100644
index 0000000..a835834
--- /dev/null
+++ b/test/MC/Hexagon/inst_cmp_ugti.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i1 @foo (i32 %a)
+{
+  %1 = icmp ugt i32 %a, 42
+  ret i1 %1
+}
+
+; CHECK:  0000 40458075 00404089 00c09f52
diff --git a/test/MC/Hexagon/inst_cmp_ult.ll b/test/MC/Hexagon/inst_cmp_ult.ll
new file mode 100644
index 0000000..4323fa0
--- /dev/null
+++ b/test/MC/Hexagon/inst_cmp_ult.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i1 @foo (i32 %a, i32 %b)
+{
+  %1 = icmp ult i32 %a, %b
+  ret i1 %1
+}
+
+; CHECK:  0000 004061f2 00404089 00c09f52
diff --git a/test/MC/Hexagon/inst_or.ll b/test/MC/Hexagon/inst_or.ll
index fe8152b..1b966fc 100644
--- a/test/MC/Hexagon/inst_or.ll
+++ b/test/MC/Hexagon/inst_or.ll
@@ -7,4 +7,4 @@ define i32 @foo (i32 %a, i32 %b)
   ret i32 %1
 }
 
-; CHECK:   0000 004120f1 00c09f52
-\ No newline at end of file
+; CHECK:   0000 004120f1 00c09f52
diff --git a/test/MC/Hexagon/inst_or64.ll b/test/MC/Hexagon/inst_or64.ll
new file mode 100644
index 0000000..ea10430
--- /dev/null
+++ b/test/MC/Hexagon/inst_or64.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i64 @foo (i64 %a, i64 %b)
+{
+  %1 = or i64 %a, %b
+  ret i64 %1
+}
+
+; CHECK:  0000 4042e0d3 00c09f52
diff --git a/test/MC/Hexagon/inst_select.ll b/test/MC/Hexagon/inst_select.ll
new file mode 100644
index 0000000..7e88c65
--- /dev/null
+++ b/test/MC/Hexagon/inst_select.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i1 %a, i32 %b, i32 %c)
+{
+  %1 = select i1 %a, i32 %b, i32 %c
+  ret i32 %1
+}
+
+; CHECK:  0000 00400085 004201f4 00c09f52
diff --git a/test/MC/Hexagon/inst_sub.ll b/test/MC/Hexagon/inst_sub.ll
index 7523aa6..51d38c9 100644
--- a/test/MC/Hexagon/inst_sub.ll
+++ b/test/MC/Hexagon/inst_sub.ll
@@ -7,4 +7,4 @@ define i32 @foo (i32 %a, i32 %b)
   ret i32 %1
 }
 
-; CHECK:  0000 004021f3 00c09f52
-\ No newline at end of file
+; CHECK:  0000 004021f3 00c09f52
diff --git a/test/MC/Hexagon/inst_sub64.ll b/test/MC/Hexagon/inst_sub64.ll
new file mode 100644
index 0000000..99f102d
--- /dev/null
+++ b/test/MC/Hexagon/inst_sub64.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i64 @foo (i64 %a, i64 %b)
+{
+  %1 = sub i64 %a, %b
+  ret i64 %1
+}
+
+; CHECK:  0000 e04022d3 00c09f52
diff --git a/test/MC/Hexagon/inst_sxtb.ll b/test/MC/Hexagon/inst_sxtb.ll
new file mode 100644
index 0000000..4a21742
--- /dev/null
+++ b/test/MC/Hexagon/inst_sxtb.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i8 %a)
+{
+  %1 = sext i8 %a to i32
+  ret i32 %1
+}
+
+; CHECK:   0000 0040a070 00c09f52
diff --git a/test/MC/Hexagon/inst_sxth.ll b/test/MC/Hexagon/inst_sxth.ll
new file mode 100644
index 0000000..f0bcf58
--- /dev/null
+++ b/test/MC/Hexagon/inst_sxth.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i16 %a)
+{
+  %1 = sext i16 %a to i32
+  ret i32 %1
+}
+
+; CHECK:   0000 0040e070 00c09f52
diff --git a/test/MC/Hexagon/inst_xor.ll b/test/MC/Hexagon/inst_xor.ll
index fe989e5..f54f3ba 100644
--- a/test/MC/Hexagon/inst_xor.ll
+++ b/test/MC/Hexagon/inst_xor.ll
@@ -7,4 +7,4 @@ define i32 @foo (i32 %a, i32 %b)
   ret i32 %1
 }
 
-; CHECK:   0000 004160f1 00c09f52
-\ No newline at end of file
+; CHECK:   0000 004160f1 00c09f52
diff --git a/test/MC/Hexagon/inst_xor64.ll b/test/MC/Hexagon/inst_xor64.ll
new file mode 100644
index 0000000..7f77c46
--- /dev/null
+++ b/test/MC/Hexagon/inst_xor64.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i64 @foo (i64 %a, i64 %b)
+{
+  %1 = xor i64 %a, %b
+  ret i64 %1
+}
+
+; CHECK:  0000 8042e0d3 00c09f52
diff --git a/test/MC/Hexagon/inst_zxtb.ll b/test/MC/Hexagon/inst_zxtb.ll
new file mode 100644
index 0000000..622c036
--- /dev/null
+++ b/test/MC/Hexagon/inst_zxtb.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i8 %a)
+{
+  %1 = zext i8 %a to i32
+  ret i32 %1
+}
+
+; CHECK:   0000 e05f0076 00c09f52
diff --git a/test/MC/Hexagon/inst_zxth.ll b/test/MC/Hexagon/inst_zxth.ll
new file mode 100644
index 0000000..962210b
--- /dev/null
+++ b/test/MC/Hexagon/inst_zxth.ll
@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i16 %a)
+{
+  %1 = zext i16 %a to i32
+  ret i32 %1
+}
+
+; CHECK:   0000 0040c070 00c09f52
diff --git a/test/MC/MachO/AArch64/cfstring.s b/test/MC/MachO/AArch64/cfstring.s
new file mode 100644
index 0000000..19b5067
--- /dev/null
+++ b/test/MC/MachO/AArch64/cfstring.s
@@ -0,0 +1,24 @@
+; RUN: llvm-mc -triple arm64-apple-darwin10 %s -filetype=obj -o - | llvm-readobj -r --expand-relocs | FileCheck %s
+
+; Test that we produce an external relocation. There is no apparent need for it, but
+; ld64 (241.9) produces a corrupt output if we don't.
+
+// CHECK:      Relocations [
+// CHECK-NEXT:   Section __data {
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x0
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: Lfoo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+
+        .section        __DATA,__cfstring
+Lfoo:
+
+        .section        __DATA,__data
+        .quad   Lfoo
diff --git a/test/MC/MachO/AArch64/classrefs.s b/test/MC/MachO/AArch64/classrefs.s
new file mode 100644
index 0000000..5edc82c
--- /dev/null
+++ b/test/MC/MachO/AArch64/classrefs.s
@@ -0,0 +1,25 @@
+; RUN: llvm-mc -triple arm64-apple-darwin10 %s -filetype=obj -o - | llvm-readobj -r --expand-relocs | FileCheck %s
+
+; Test that we produce an external relocation with Lbar. We could also produce
+; an internal relocation. We just have to be careful to not use another symbol.
+
+// CHECK:      Relocations [
+// CHECK-NEXT:   Section __data {
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x0
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: Lbar
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+
+        .section        __DATA,__objc_classrefs,regular,no_dead_strip
+Lbar:
+
+        .section        __DATA,__data
+        .quad   Lbar
+
diff --git a/test/MC/MachO/AArch64/darwin-ARM64-reloc.s b/test/MC/MachO/AArch64/darwin-ARM64-reloc.s
index 7f586ae..07d5252 100644
--- a/test/MC/MachO/AArch64/darwin-ARM64-reloc.s
+++ b/test/MC/MachO/AArch64/darwin-ARM64-reloc.s
@@ -1,4 +1,4 @@
-; RUN: llvm-mc -n -triple arm64-apple-darwin10 %s -filetype=obj -o - | macho-dump --dump-section-data | FileCheck %s
+; RUN: llvm-mc -n -triple arm64-apple-darwin10 %s -filetype=obj -o - | llvm-readobj -r --expand-relocs | FileCheck %s
 
 	.text
 _fred:
@@ -15,6 +15,7 @@ _fred:
 
 	adrp	x3, _data_ext@gotpage
         ldr	w2, [x3, _data_ext@gotpageoff]
+        adrp    x0, L_.str@PAGE
 
 	.data
 _data:
@@ -28,130 +29,230 @@ _data:
         .quad _foo@got
         .long _foo@got - .
 
+        .section __TEXT,__cstring,cstring_literals
+L_.str:
+        .asciz "foo"
 
-; CHECK: ('cputype', 16777228)
-; CHECK: ('cpusubtype', 0)
-; CHECK: ('filetype', 1)
-; CHECK: ('num_load_commands', 3)
-; CHECK: ('load_commands_size', 336)
-; CHECK: ('flag', 0)
-; CHECK: ('reserved', 0)
-; CHECK: ('load_commands', [
-; CHECK:   # Load Command 0
-; CHECK:  (('command', 25)
-; CHECK:   ('size', 232)
-; CHECK:   ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-; CHECK:   ('vm_addr', 0)
-; CHECK:   ('vm_size', 84)
-; CHECK:   ('file_offset', 368)
-; CHECK:   ('file_size', 84)
-; CHECK:   ('maxprot', 7)
-; CHECK:   ('initprot', 7)
-; CHECK:   ('num_sections', 2)
-; CHECK:   ('flags', 0)
-; CHECK:   ('sections', [
-; CHECK:     # Section 0
-; CHECK:    (('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-; CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-; CHECK:     ('address', 0)
-; CHECK:     ('size', 36)
-; CHECK:     ('offset', 368)
-; CHECK:     ('alignment', 0)
-; CHECK:     ('reloc_offset', 452)
-; CHECK:     ('num_reloc', 13)
-; CHECK:     ('flags', 0x80000400)
-; CHECK:     ('reserved1', 0)
-; CHECK:     ('reserved2', 0)
-; CHECK:     ('reserved3', 0)
-; CHECK:    ),
-; CHECK:   ('_relocations', [
-; CHECK:     # Relocation 0
-; CHECK:     (('word-0', 0x20),
-; CHECK:      ('word-1', 0x6c000005)),
-; CHECK:     # Relocation 1
-; CHECK:     (('word-0', 0x1c),
-; CHECK:      ('word-1', 0x5d000005)),
-; CHECK:     # Relocation 2
-; CHECK:     (('word-0', 0x18),
-; CHECK:      ('word-1', 0xa4000004)),
-; CHECK:     # Relocation 3
-; CHECK:     (('word-0', 0x18),
-; CHECK:      ('word-1', 0x4c000002)),
-; CHECK:     # Relocation 4
-; CHECK:     (('word-0', 0x14),
-; CHECK:      ('word-1', 0xa4000001)),
-; CHECK:     # Relocation 5
-; CHECK:     (('word-0', 0x14),
-; CHECK:      ('word-1', 0x3d000002)),
-; CHECK:     # Relocation 6
-; CHECK:     (('word-0', 0x10),
-; CHECK:      ('word-1', 0xa4000004)),
-; CHECK:     # Relocation 7
-; CHECK:     (('word-0', 0x10),
-; CHECK:      ('word-1', 0x4c000002)),
-; CHECK:     # Relocation 8
-; CHECK:     (('word-0', 0xc),
-; CHECK:      ('word-1', 0x4c000002)),
-; CHECK:     # Relocation 9
-; CHECK:     (('word-0', 0x8),
-; CHECK:      ('word-1', 0x3d000002)),
-; CHECK:     # Relocation 10
-; CHECK:     (('word-0', 0x4),
-; CHECK:      ('word-1', 0xa4000014)),
-; CHECK:     # Relocation 11
-; CHECK:     (('word-0', 0x4),
-; CHECK:      ('word-1', 0x2d000007)),
-; CHECK:     # Relocation 12
-; CHECK:     (('word-0', 0x0),
-; CHECK:      ('word-1', 0x2d000007)),
-; CHECK:   ])
-; CHECK:   ('_section_data', '00000094 00000094 03000090 620040b9 63000091 03000090 620040b9 03000090 620040b9')
-; CHECK:     # Section 1
-; CHECK:    (('section_name', '__data\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-; CHECK:     ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-; CHECK:     ('address', 36)
-; CHECK:     ('size', 48)
-; CHECK:     ('offset', 404)
-; CHECK:     ('alignment', 0)
-; CHECK:     ('reloc_offset', 556)
-; CHECK:     ('num_reloc', 10)
-; CHECK:     ('flags', 0x0)
-; CHECK:     ('reserved1', 0)
-; CHECK:     ('reserved2', 0)
-; CHECK:     ('reserved3', 0)
-; CHECK:    ),
-; CHECK:   ('_relocations', [
-; CHECK:     # Relocation 0
-; CHECK:     (('word-0', 0x2c),
-; CHECK:      ('word-1', 0x7d000006)),
-; CHECK:     # Relocation 1
-; CHECK:     (('word-0', 0x24),
-; CHECK:      ('word-1', 0x7e000006)),
-; CHECK:     # Relocation 2
-; CHECK:     (('word-0', 0x20),
-; CHECK:      ('word-1', 0x1c000004)),
-; CHECK:     # Relocation 3
-; CHECK:     (('word-0', 0x20),
-; CHECK:      ('word-1', 0xc000006)),
-; CHECK:     # Relocation 4
-; CHECK:     (('word-0', 0x18),
-; CHECK:      ('word-1', 0x1e000004)),
-; CHECK:     # Relocation 5
-; CHECK:     (('word-0', 0x18),
-; CHECK:      ('word-1', 0xe000006)),
-; CHECK:     # Relocation 6
-; CHECK:     (('word-0', 0x10),
-; CHECK:      ('word-1', 0x1e000004)),
-; CHECK:     # Relocation 7
-; CHECK:     (('word-0', 0x10),
-; CHECK:      ('word-1', 0xe000006)),
-; CHECK:     # Relocation 8
-; CHECK:     (('word-0', 0x8),
-; CHECK:      ('word-1', 0xe000006)),
-; CHECK:     # Relocation 9
-; CHECK:     (('word-0', 0x0),
-; CHECK:      ('word-1', 0xe000006)),
-; CHECK:   ])
-; CHECK:   ('_section_data', '00000000 00000000 04000000 00000000 00000000 00000000 04000000 00000000 00000000 00000000 00000000 d4ffffff')
-; CHECK:   ])
-; CHECK:  ),
+
+; CHECK:     Relocations [
+; CHECK-NEXT:  Section __text {
+; CHECK-NEXT:    Relocation {
+; CHECK-NEXT:       Offset: 0x24
+; CHECK-NEXT:       PCRel: 1
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_PAGE21 (3)
+; CHECK-NEXT:       Symbol: L_.str
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:    Relocation {
+; CHECK-NEXT:       Offset: 0x20
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_GOT_LOAD_PAGEOFF12 (6)
+; CHECK-NEXT:       Symbol: _data_ext
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x1C
+; CHECK-NEXT:       PCRel: 1
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_GOT_LOAD_PAGE21 (5)
+; CHECK-NEXT:       Symbol: _data_ext
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x18
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 0
+; CHECK-NEXT:       Type: ARM64_RELOC_ADDEND (10)
+; CHECK-NEXT:       Symbol: 0x4
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x18
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_PAGEOFF12 (4)
+; CHECK-NEXT:       Symbol: _data
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x14
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 0
+; CHECK-NEXT:       Type: ARM64_RELOC_ADDEND (10)
+; CHECK-NEXT:       Symbol: 0x1
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x14
+; CHECK-NEXT:       PCRel: 1
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_PAGE21 (3)
+; CHECK-NEXT:       Symbol: _data
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x10
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 0
+; CHECK-NEXT:       Type: ARM64_RELOC_ADDEND (10)
+; CHECK-NEXT:       Symbol: 0x4
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x10
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_PAGEOFF12 (4)
+; CHECK-NEXT:       Symbol: _data
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0xC
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_PAGEOFF12 (4)
+; CHECK-NEXT:       Symbol: _data
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x8
+; CHECK-NEXT:       PCRel: 1
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_PAGE21 (3)
+; CHECK-NEXT:       Symbol: _data
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x4
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 0
+; CHECK-NEXT:       Type: ARM64_RELOC_ADDEND (10)
+; CHECK-NEXT:       Symbol: 0x14
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x4
+; CHECK-NEXT:       PCRel: 1
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_BRANCH26 (2)
+; CHECK-NEXT:       Symbol: _func
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x0
+; CHECK-NEXT:       PCRel: 1
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_BRANCH26 (2)
+; CHECK-NEXT:       Symbol: _func
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:   }
+; CHECK-NEXT:   Section __data {
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x2C
+; CHECK-NEXT:       PCRel: 1
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_POINTER_TO_GOT (7)
+; CHECK-NEXT:       Symbol: _foo
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x24
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 3
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_POINTER_TO_GOT (7)
+; CHECK-NEXT:       Symbol: _foo
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x20
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_SUBTRACTOR (1)
+; CHECK-NEXT:       Symbol: _bar
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x20
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+; CHECK-NEXT:       Symbol: _foo
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x18
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 3
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_SUBTRACTOR (1)
+; CHECK-NEXT:       Symbol: _bar
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x18
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 3
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+; CHECK-NEXT:       Symbol: _foo
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x10
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 3
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_SUBTRACTOR (1)
+; CHECK-NEXT:       Symbol: _bar
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x10
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 3
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+; CHECK-NEXT:       Symbol: _foo
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x8
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 3
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+; CHECK-NEXT:       Symbol: _foo
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x0
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 3
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+; CHECK-NEXT:       Symbol: _foo
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:   }
+; CHECK-NEXT: ]
diff --git a/test/MC/MachO/AArch64/mergeable.s b/test/MC/MachO/AArch64/mergeable.s
new file mode 100644
index 0000000..fc6ec04
--- /dev/null
+++ b/test/MC/MachO/AArch64/mergeable.s
@@ -0,0 +1,59 @@
+// RUN: llvm-mc -triple aarch64-apple-darwin14 %s -filetype=obj -o - | llvm-readobj -r --expand-relocs | FileCheck %s
+
+// Test that we "S + K" produce a relocation with a symbol, but just S produces
+// a relocation with the section.
+
+	.section	__TEXT,__literal4,4byte_literals
+L0:
+	.long	42
+
+	.section	__TEXT,__cstring,cstring_literals
+L1:
+	.asciz	"42"
+
+	.section	__DATA,__data
+	.quad	L0
+	.quad	L0 + 1
+	.quad	L1
+	.quad	L1 + 1
+
+// CHECK:      Relocations [
+// CHECK-NEXT:   Section __data {
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x18
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: L1
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x10
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: L1
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x8
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: L0
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x0
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 0
+// CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: 0x2
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
diff --git a/test/MC/MachO/AArch64/reloc-crash.s b/test/MC/MachO/AArch64/reloc-crash.s
new file mode 100644
index 0000000..4984947
--- /dev/null
+++ b/test/MC/MachO/AArch64/reloc-crash.s
@@ -0,0 +1,27 @@
+; RUN: llvm-mc -triple arm64-apple-darwin10 %s -filetype=obj -o - | llvm-readobj -r --expand-relocs | FileCheck %s
+
+; Test tha we produce an external relocation. There is no apparent need for it, but
+; ld64 (241.9) crashes if we don't.
+
+; CHECK:      Relocations [
+; CHECK-NEXT:   Section __bar {
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x0
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 3
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_UNSIGNED (0)
+; CHECK-NEXT:       Symbol: Lbar
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:   }
+; CHECK-NEXT: ]
+
+        .section        __TEXT,__cstring
+Lfoo:
+        .asciz  "Hello World!"
+Lbar:
+        .asciz  "cString"
+
+        .section        __foo,__bar,literal_pointers
+        .quad   Lbar
diff --git a/test/MC/MachO/AArch64/reloc-crash2.s b/test/MC/MachO/AArch64/reloc-crash2.s
new file mode 100644
index 0000000..6ae4471
--- /dev/null
+++ b/test/MC/MachO/AArch64/reloc-crash2.s
@@ -0,0 +1,24 @@
+; RUN: llvm-mc -triple arm64-apple-darwin10 %s -filetype=obj -o - | llvm-readobj -r --expand-relocs | FileCheck %s
+
+; This is a regression test making sure we don't crash.
+
+; CHECK:      Relocations [
+; CHECK-NEXT:   Section __text {
+; CHECK-NEXT:     Relocation {
+; CHECK-NEXT:       Offset: 0x0
+; CHECK-NEXT:       PCRel: 0
+; CHECK-NEXT:       Length: 2
+; CHECK-NEXT:       Extern: 1
+; CHECK-NEXT:       Type: ARM64_RELOC_PAGEOFF12 (4)
+; CHECK-NEXT:       Symbol: ltmp1
+; CHECK-NEXT:       Scattered: 0
+; CHECK-NEXT:     }
+; CHECK-NEXT:   }
+; CHECK-NEXT: ]
+
+
+        ldr     x0, [x8, L_bar@PAGEOFF]
+
+        .section        __foo,__bar,regular,no_dead_strip
+L_bar:
+        .quad   0
diff --git a/test/MC/MachO/ARM/static-movt-relocs.s b/test/MC/MachO/ARM/static-movt-relocs.s
index dce5683..4385549 100644
--- a/test/MC/MachO/ARM/static-movt-relocs.s
+++ b/test/MC/MachO/ARM/static-movt-relocs.s
@@ -1,4 +1,4 @@
-@ RUN: llvm-mc -mcpu=cortex-a8 -triple thumbv7-apple-darwin10 -filetype=obj -o - < %s | macho-dump | FileCheck %s
+@ RUN: llvm-mc -mcpu=cortex-a8 -triple thumbv7-apple-darwin10 -filetype=obj -o - < %s | llvm-readobj -r --expand-relocs | FileCheck %s
         .thumb
         .thumb_func foo
 foo:
@@ -6,18 +6,43 @@ foo:
         movt r0, :upper16:(bar + 16)
         bx r0
 
-
-@ CHECK:  ('_relocations', [
-@ CHECK:    # Relocation 0
-@ CHECK:    (('word-0', 0x4),
-@ CHECK:     ('word-1', 0x8e000001)),
-@ CHECK:    # Relocation 1
-@ CHECK:    (('word-0', 0x10),
-@ CHECK:     ('word-1', 0x16ffffff)),
-@ CHECK:    # Relocation 2
-@ CHECK:    (('word-0', 0x0),
-@ CHECK:     ('word-1', 0x8c000001)),
-@ CHECK:    # Relocation 3
-@ CHECK:    (('word-0', 0x0),
-@ CHECK:     ('word-1', 0x14ffffff)),
-@ CHECK:  ])
+@ CHECK:      Relocations [
+@ CHECK-NEXT:   Section __text {
+@ CHECK-NEXT:     Relocation {
+@ CHECK-NEXT:       Offset: 0x4
+@ CHECK-NEXT:       PCRel: 0
+@ CHECK-NEXT:       Length: 3
+@ CHECK-NEXT:       Extern: 1
+@ CHECK-NEXT:       Type: ARM_RELOC_HALF (8)
+@ CHECK-NEXT:       Symbol: bar
+@ CHECK-NEXT:       Scattered: 0
+@ CHECK-NEXT:     }
+@ CHECK-NEXT:     Relocation {
+@ CHECK-NEXT:       Offset: 0x10
+@ CHECK-NEXT:       PCRel: 0
+@ CHECK-NEXT:       Length: 3
+@ CHECK-NEXT:       Extern: 0
+@ CHECK-NEXT:       Type: ARM_RELOC_PAIR (1)
+@ CHECK-NEXT:       Symbol: 0xFFFFFF
+@ CHECK-NEXT:       Scattered: 0
+@ CHECK-NEXT:     }
+@ CHECK-NEXT:     Relocation {
+@ CHECK-NEXT:       Offset: 0x0
+@ CHECK-NEXT:       PCRel: 0
+@ CHECK-NEXT:       Length: 2
+@ CHECK-NEXT:       Extern: 1
+@ CHECK-NEXT:       Type: ARM_RELOC_HALF (8)
+@ CHECK-NEXT:       Symbol: bar
+@ CHECK-NEXT:       Scattered: 0
+@ CHECK-NEXT:     }
+@ CHECK-NEXT:     Relocation {
+@ CHECK-NEXT:       Offset: 0x0
+@ CHECK-NEXT:       PCRel: 0
+@ CHECK-NEXT:       Length: 2
+@ CHECK-NEXT:       Extern: 0
+@ CHECK-NEXT:       Type: ARM_RELOC_PAIR (1)
+@ CHECK-NEXT:       Symbol: 0xFFFFFF
+@ CHECK-NEXT:       Scattered: 0
+@ CHECK-NEXT:     }
+@ CHECK-NEXT:   }
+@ CHECK-NEXT: ]
diff --git a/test/MC/MachO/darwin-x86_64-reloc.s b/test/MC/MachO/darwin-x86_64-reloc.s
index 1dfb982..48dd6b4 100644
--- a/test/MC/MachO/darwin-x86_64-reloc.s
+++ b/test/MC/MachO/darwin-x86_64-reloc.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -n -triple x86_64-apple-darwin9 %s -filetype=obj -o - | macho-dump --dump-section-data | FileCheck %s
+// RUN: llvm-mc -n -triple x86_64-apple-darwin9 %s -filetype=obj -o - | llvm-readobj -r --expand-relocs | FileCheck %s
 
 // These examples are taken from <mach-o/x86_64/reloc.h>.
 
@@ -86,320 +86,363 @@ L6:
 
         .text
         cmpq $0, _foo@GOTPCREL(%rip)
-        
-// CHECK: ('cputype', 16777223)
-// CHECK: ('cpusubtype', 3)
-// CHECK: ('filetype', 1)
-// CHECK: ('num_load_commands', 3)
-// CHECK: ('load_commands_size', 496)
-// CHECK: ('flag', 0)
-// CHECK: ('reserved', 0)
-// CHECK: ('load_commands', [
-// CHECK:   # Load Command 0
-// CHECK:  (('command', 25)
-// CHECK:   ('size', 392)
-// CHECK:   ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:   ('vm_addr', 0)
-// CHECK:   ('vm_size', 311)
-// CHECK:   ('file_offset', 528)
-// CHECK:   ('file_size', 311)
-// CHECK:   ('maxprot', 7)
-// CHECK:   ('initprot', 7)
-// CHECK:   ('num_sections', 4)
-// CHECK:   ('flags', 0)
-// CHECK:   ('sections', [
-// CHECK:     # Section 0
-// CHECK:    (('section_name', '__data\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 40)
-// CHECK:     ('offset', 528)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 840)
-// CHECK:     ('num_reloc', 5)
-// CHECK:     ('flags', 0x0)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:     # Relocation 0
-// CHECK:     (('word-0', 0x20),
-// CHECK:      ('word-1', 0x6000004)),
-// CHECK:     # Relocation 1
-// CHECK:     (('word-0', 0x18),
-// CHECK:      ('word-1', 0xe000006)),
-// CHECK:     # Relocation 2
-// CHECK:     (('word-0', 0x10),
-// CHECK:      ('word-1', 0x6000004)),
-// CHECK:     # Relocation 3
-// CHECK:     (('word-0', 0x8),
-// CHECK:      ('word-1', 0x4d000000)),
-// CHECK:     # Relocation 4
-// CHECK:     (('word-0', 0x4),
-// CHECK:      ('word-1', 0x4d000008)),
-// CHECK:   ])
-// CHECK:   ('_section_data', '00000000 04000000 04000000 00000000 1f010000 00000000 00000000 00000000 2f010000 00000000')
-// CHECK:     # Section 1
-// CHECK:    (('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 40)
-// CHECK:     ('size', 223)
-// CHECK:     ('offset', 568)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 880)
-// CHECK:     ('num_reloc', 32)
-// CHECK:     ('flags', 0x80000400)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:     # Relocation 0
-// CHECK:     (('word-0', 0xda),
-// CHECK:      ('word-1', 0x4d000000)),
-// CHECK:     # Relocation 1
-// CHECK:     (('word-0', 0xd3),
-// CHECK:      ('word-1', 0x15000004)),
-// CHECK:     # Relocation 2
-// CHECK:     (('word-0', 0xcd),
-// CHECK:      ('word-1', 0x1d000006)),
-// CHECK:     # Relocation 3
-// CHECK:     (('word-0', 0xc7),
-// CHECK:      ('word-1', 0x15000004)),
-// CHECK:     # Relocation 4
-// CHECK:     (('word-0', 0xc1),
-// CHECK:      ('word-1', 0x15000001)),
-// CHECK:     # Relocation 5
-// CHECK:     (('word-0', 0xa5),
-// CHECK:      ('word-1', 0x5e000003)),
-// CHECK:     # Relocation 6
-// CHECK:     (('word-0', 0xa5),
-// CHECK:      ('word-1', 0xe000000)),
-// CHECK:     # Relocation 7
-// CHECK:     (('word-0', 0x9d),
-// CHECK:      ('word-1', 0x5e000003)),
-// CHECK:     # Relocation 8
-// CHECK:     (('word-0', 0x9d),
-// CHECK:      ('word-1', 0xe000000)),
-// CHECK:     # Relocation 9
-// CHECK:     (('word-0', 0x95),
-// CHECK:      ('word-1', 0xe000003)),
-// CHECK:     # Relocation 10
-// CHECK:     (('word-0', 0x8d),
-// CHECK:      ('word-1', 0xe000003)),
-// CHECK:     # Relocation 11
-// CHECK:     (('word-0', 0x79),
-// CHECK:      ('word-1', 0x8d000003)),
-// CHECK:     # Relocation 12
-// CHECK:     (('word-0', 0x71),
-// CHECK:      ('word-1', 0x7d000003)),
-// CHECK:     # Relocation 13
-// CHECK:     (('word-0', 0x69),
-// CHECK:      ('word-1', 0x6d000003)),
-// CHECK:     # Relocation 14
-// CHECK:     (('word-0', 0x63),
-// CHECK:      ('word-1', 0x1d000003)),
-// CHECK:     # Relocation 15
-// CHECK:     (('word-0', 0x5c),
-// CHECK:      ('word-1', 0x1d000003)),
-// CHECK:     # Relocation 16
-// CHECK:     (('word-0', 0x55),
-// CHECK:      ('word-1', 0x5c000002)),
-// CHECK:     # Relocation 17
-// CHECK:     (('word-0', 0x55),
-// CHECK:      ('word-1', 0xc000000)),
-// CHECK:     # Relocation 18
-// CHECK:     (('word-0', 0x4d),
-// CHECK:      ('word-1', 0x5e000002)),
-// CHECK:     # Relocation 19
-// CHECK:     (('word-0', 0x4d),
-// CHECK:      ('word-1', 0xe000000)),
-// CHECK:     # Relocation 20
-// CHECK:     (('word-0', 0x45),
-// CHECK:      ('word-1', 0x5e000002)),
-// CHECK:     # Relocation 21
-// CHECK:     (('word-0', 0x45),
-// CHECK:      ('word-1', 0xe000000)),
-// CHECK:     # Relocation 22
-// CHECK:     (('word-0', 0x3d),
-// CHECK:      ('word-1', 0xe000000)),
-// CHECK:     # Relocation 23
-// CHECK:     (('word-0', 0x35),
-// CHECK:      ('word-1', 0xe000000)),
-// CHECK:     # Relocation 24
-// CHECK:     (('word-0', 0x2d),
-// CHECK:      ('word-1', 0x8d000000)),
-// CHECK:     # Relocation 25
-// CHECK:     (('word-0', 0x26),
-// CHECK:      ('word-1', 0x6d000000)),
-// CHECK:     # Relocation 26
-// CHECK:     (('word-0', 0x20),
-// CHECK:      ('word-1', 0x1d000000)),
-// CHECK:     # Relocation 27
-// CHECK:     (('word-0', 0x1a),
-// CHECK:      ('word-1', 0x1d000000)),
-// CHECK:     # Relocation 28
-// CHECK:     (('word-0', 0x14),
-// CHECK:      ('word-1', 0x4d000000)),
-// CHECK:     # Relocation 29
-// CHECK:     (('word-0', 0xe),
-// CHECK:      ('word-1', 0x3d000000)),
-// CHECK:     # Relocation 30
-// CHECK:     (('word-0', 0x7),
-// CHECK:      ('word-1', 0x2d000000)),
-// CHECK:     # Relocation 31
-// CHECK:     (('word-0', 0x2),
-// CHECK:      ('word-1', 0x2d000000)),
-// CHECK:   ])
-// CHECK:   ('_section_data', 'c3e80000 0000e804 00000048 8b050000 0000ff35 00000000 8b050000 00008b05 04000000 c605ffff ffff12c7 05fcffff ff785634 12000000 00000000 00040000 00000000 00000000 00000000 00040000 00000000 00000000 00488d05 2c000000 488d0514 00000083 05130000 00066681 05120000 00f40181 05100000 00f40100 00909090 90909090 90909090 902c0000 00000000 00140000 00000000 00e4ffff ffffffff ffd4ffff ffffffff ff2c0000 00000000 0083c000 03042503 0000008b 051fffff ff8b052c 0000008b 05000000 008b0530 00000048 833dffff ffff00')
-// CHECK:     # Section 2
-// CHECK:    (('section_name', '__debug_frame\x00\x00\x00')
-// CHECK:     ('segment_name', '__DWARF\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 263)
-// CHECK:     ('size', 16)
-// CHECK:     ('offset', 791)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 1136)
-// CHECK:     ('num_reloc', 2)
-// CHECK:     ('flags', 0x2000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:     # Relocation 0
-// CHECK:     (('word-0', 0x8),
-// CHECK:      ('word-1', 0xe000007)),
-// CHECK:     # Relocation 1
-// CHECK:     (('word-0', 0x0),
-// CHECK:      ('word-1', 0x6000002)),
-// CHECK:   ])
-// CHECK:   ('_section_data', 'd5000000 00000000 00000000 00000000')
-// CHECK:     # Section 3
-// CHECK:    (('section_name', '__literal8\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 279)
-// CHECK:     ('size', 32)
-// CHECK:     ('offset', 807)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x4)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:   ('_section_data', '00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000')
-// CHECK:   ])
-// CHECK:  ),
-// CHECK:   # Load Command 1
-// CHECK:  (('command', 2)
-// CHECK:   ('size', 24)
-// CHECK:   ('symoff', 1152)
-// CHECK:   ('nsyms', 9)
-// CHECK:   ('stroff', 1296)
-// CHECK:   ('strsize', 48)
-// CHECK:   ('_string_data', '\x00_baz\x00_prev\x00_foobar\x00_bar\x00_ext_foo\x00f6\x00_f3\x00_f2\x00\x00\x00\x00')
-// CHECK:   ('_symbols', [
-// CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 29)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 40)
-// CHECK:     ('_string', '_foo')
-// CHECK:    ),
-// CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 1)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 41)
-// CHECK:     ('_string', '_baz')
-// CHECK:    ),
-// CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 20)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 101)
-// CHECK:     ('_string', '_bar')
-// CHECK:    ),
-// CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 6)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 169)
-// CHECK:     ('_string', '_prev')
-// CHECK:    ),
-// CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 41)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 221)
-// CHECK:     ('_string', '_f2')
-// CHECK:    ),
-// CHECK:     # Symbol 5
-// CHECK:    (('n_strx', 37)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 224)
-// CHECK:     ('_string', '_f3')
-// CHECK:    ),
-// CHECK:     # Symbol 6
-// CHECK:    (('n_strx', 34)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 4)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 295)
-// CHECK:     ('_string', 'f6')
-// CHECK:    ),
-// CHECK:     # Symbol 7
-// CHECK:    (('n_strx', 25)
-// CHECK:     ('n_type', 0x1)
-// CHECK:     ('n_sect', 0)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', '_ext_foo')
-// CHECK:    ),
-// CHECK:     # Symbol 8
-// CHECK:    (('n_strx', 12)
-// CHECK:     ('n_type', 0x1)
-// CHECK:     ('n_sect', 0)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', '_foobar')
-// CHECK:    ),
-// CHECK:   ])
-// CHECK:  ),
-// CHECK:   # Load Command 2
-// CHECK:  (('command', 11)
-// CHECK:   ('size', 80)
-// CHECK:   ('ilocalsym', 0)
-// CHECK:   ('nlocalsym', 7)
-// CHECK:   ('iextdefsym', 7)
-// CHECK:   ('nextdefsym', 0)
-// CHECK:   ('iundefsym', 7)
-// CHECK:   ('nundefsym', 2)
-// CHECK:   ('tocoff', 0)
-// CHECK:   ('ntoc', 0)
-// CHECK:   ('modtaboff', 0)
-// CHECK:   ('nmodtab', 0)
-// CHECK:   ('extrefsymoff', 0)
-// CHECK:   ('nextrefsyms', 0)
-// CHECK:   ('indirectsymoff', 0)
-// CHECK:   ('nindirectsyms', 0)
-// CHECK:   ('extreloff', 0)
-// CHECK:   ('nextrel', 0)
-// CHECK:   ('locreloff', 0)
-// CHECK:   ('nlocrel', 0)
-// CHECK:   ('_indirect_symbols', [
-// CHECK:   ])
-// CHECK:  ),
-// CHECK: ])
+
+// CHECK:      Relocations [
+// CHECK-NEXT:   Section __data {
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x20
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 0
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: 0x4
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x18
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: f6
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x10
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 0
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: 0x4
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x8
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_GOT (4)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x4
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_GOT (4)
+// CHECK-NEXT:       Symbol: _foobar
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Section __text {
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0xDA
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_GOT (4)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0xD3
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 0
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED (1)
+// CHECK-NEXT:       Symbol: 0x4
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0xCD
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED (1)
+// CHECK-NEXT:       Symbol: f6
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0xC7
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 0
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED (1)
+// CHECK-NEXT:       Symbol: 0x4
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0xC1
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 0
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED (1)
+// CHECK-NEXT:       Symbol: 0x1
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0xA5
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SUBTRACTOR (5)
+// CHECK-NEXT:       Symbol: _prev
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0xA5
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x9D
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SUBTRACTOR (5)
+// CHECK-NEXT:       Symbol: _prev
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x9D
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x95
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: _prev
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x8D
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: _prev
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x79
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED_4 (8)
+// CHECK-NEXT:       Symbol: _prev
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x71
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED_2 (7)
+// CHECK-NEXT:       Symbol: _prev
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x69
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED_1 (6)
+// CHECK-NEXT:       Symbol: _prev
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x63
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED (1)
+// CHECK-NEXT:       Symbol: _prev
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x5C
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED (1)
+// CHECK-NEXT:       Symbol: _prev
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x55
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SUBTRACTOR (5)
+// CHECK-NEXT:       Symbol: _bar
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x55
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x4D
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SUBTRACTOR (5)
+// CHECK-NEXT:       Symbol: _bar
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x4D
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x45
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SUBTRACTOR (5)
+// CHECK-NEXT:       Symbol: _bar
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x45
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x3D
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x35
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x2D
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED_4 (8)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x26
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED_1 (6)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x20
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED (1)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x1A
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_SIGNED (1)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x14
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_GOT (4)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0xE
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_GOT_LOAD (3)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x7
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_BRANCH (2)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x2
+// CHECK-NEXT:       PCRel: 1
+// CHECK-NEXT:       Length: 2
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_BRANCH (2)
+// CHECK-NEXT:       Symbol: _foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Section __debug_frame {
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x8
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: _ext_foo
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x0
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 0
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: 0x2
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
diff --git a/test/MC/MachO/linker-options.ll b/test/MC/MachO/linker-options.ll
index 827adfd..2cda835 100644
--- a/test/MC/MachO/linker-options.ll
+++ b/test/MC/MachO/linker-options.ll
@@ -34,10 +34,6 @@
 ; CHECK-OBJ:  ),
 ; CHECK-OBJ: ])
 
-!0 = metadata !{ i32 6, metadata !"Linker Options", 
-   metadata !{
-      metadata !{ metadata !"-lz" },
-      metadata !{ metadata !"-framework", metadata !"Cocoa" },
-      metadata !{ metadata !"-lmath" } } }
+!0 = !{i32 6, !"Linker Options", !{!{!"-lz"}, !{!"-framework", !"Cocoa"}, !{!"-lmath"}}}
 
 !llvm.module.flags = !{ !0 }
diff --git a/test/MC/MachO/reloc.s b/test/MC/MachO/reloc.s
index 2a6d5db..55c9940 100644
--- a/test/MC/MachO/reloc.s
+++ b/test/MC/MachO/reloc.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | macho-dump --dump-section-data | FileCheck %s
+// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | llvm-readobj -r -expand-relocs | FileCheck %s
 
         .data
         .long undef
@@ -53,240 +53,164 @@ _f1:
         .long _f1
         .long _f1 + 4
 
-// CHECK: ('cputype', 7)
-// CHECK: ('cpusubtype', 3)
-// CHECK: ('filetype', 1)
-// CHECK: ('num_load_commands', 3)
-// CHECK: ('load_commands_size', 364)
-// CHECK: ('flag', 0)
-// CHECK: ('load_commands', [
-// CHECK:   # Load Command 0
-// CHECK:  (('command', 1)
-// CHECK:   ('size', 260)
-// CHECK:   ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:   ('vm_addr', 0)
-// CHECK:   ('vm_size', 76)
-// CHECK:   ('file_offset', 392)
-// CHECK:   ('file_size', 76)
-// CHECK:   ('maxprot', 7)
-// CHECK:   ('initprot', 7)
-// CHECK:   ('num_sections', 3)
-// CHECK:   ('flags', 0)
-// CHECK:   ('sections', [
-// CHECK:     # Section 0
-// CHECK:    (('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 13)
-// CHECK:     ('offset', 392)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 468)
-// CHECK:     ('num_reloc', 2)
-// CHECK:     ('flags', 0x80000400)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:     # Relocation 0
-// CHECK:     (('word-0', 0x6),
-// CHECK:      ('word-1', 0x5000003)),
-// CHECK:     # Relocation 1
-// CHECK:     (('word-0', 0x1),
-// CHECK:      ('word-1', 0x5000000)),
-// CHECK:   ])
-// CHECK:   ('_section_data', 'e9f9cabe bae93a00 0000ebf4 c3')
-// CHECK:     # Section 1
-// CHECK:    (('section_name', '__data\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 13)
-// CHECK:     ('size', 51)
-// CHECK:     ('offset', 405)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 484)
-// CHECK:     ('num_reloc', 11)
-// CHECK:     ('flags', 0x0)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:     # Relocation 0
-// CHECK:     (('word-0', 0x2f),
-// CHECK:      ('word-1', 0xc000007)),
-// CHECK:     # Relocation 1
-// CHECK:     (('word-0', 0x2b),
-// CHECK:      ('word-1', 0xc000007)),
-// CHECK:     # Relocation 2
-// CHECK:     (('word-0', 0x8000002a),
-// CHECK:      ('word-1', 0x1d)),
-// CHECK:     # Relocation 3
-// CHECK:     (('word-0', 0x90000028),
-// CHECK:      ('word-1', 0x1d)),
-// CHECK:     # Relocation 4
-// CHECK:     (('word-0', 0xa0000024),
-// CHECK:      ('word-1', 0x1d)),
-// CHECK:     # Relocation 5
-// CHECK:     (('word-0', 0xa0000020),
-// CHECK:      ('word-1', 0x1d)),
-// CHECK:     # Relocation 6
-// CHECK:     (('word-0', 0xa4000014),
-// CHECK:      ('word-1', 0x21)),
-// CHECK:     # Relocation 7
-// CHECK:     (('word-0', 0xa1000000),
-// CHECK:      ('word-1', 0x29)),
-// CHECK:     # Relocation 8
-// CHECK:     (('word-0', 0x8),
-// CHECK:      ('word-1', 0x4000002)),
-// CHECK:     # Relocation 9
-// CHECK:     (('word-0', 0x4),
-// CHECK:      ('word-1', 0xc000009)),
-// CHECK:     # Relocation 10
-// CHECK:     (('word-0', 0x0),
-// CHECK:      ('word-1', 0xc000009)),
-// CHECK:   ])
-// CHECK:   ('_section_data', '00000000 04000000 15000000 00000000 00000000 ed000000 00000000 00000000 1e000000 27000000 31007600 00000004 000000')
-// CHECK:     # Section 2
-// CHECK:    (('section_name', '__const\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 64)
-// CHECK:     ('size', 12)
-// CHECK:     ('offset', 456)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 572)
-// CHECK:     ('num_reloc', 4)
-// CHECK:     ('flags', 0x0)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:     # Relocation 0
-// CHECK:     (('word-0', 0x8),
-// CHECK:      ('word-1', 0x4000001)),
-// CHECK:     # Relocation 1
-// CHECK:     (('word-0', 0x4),
-// CHECK:      ('word-1', 0x4000003)),
-// CHECK:     # Relocation 2
-// CHECK:     (('word-0', 0xa4000000),
-// CHECK:      ('word-1', 0x1d)),
-// CHECK:     # Relocation 3
-// CHECK:     (('word-0', 0xa1000000),
-// CHECK:      ('word-1', 0x40)),
-// CHECK:   ])
-// CHECK:   ('_section_data', 'feffffff 44000000 00000000')
-// CHECK:   ])
-// CHECK:  ),
-// CHECK:   # Load Command 1
-// CHECK:  (('command', 2)
-// CHECK:   ('size', 24)
-// CHECK:   ('symoff', 604)
-// CHECK:   ('nsyms', 10)
-// CHECK:   ('stroff', 724)
-// CHECK:   ('strsize', 88)
-// CHECK:   ('_string_data', '\x00local_a_ext\x00local_a_elt\x00bar\x00undef\x00local_c\x00local_b\x00local_a\x00.objc_class_name_A\x00_f1\x00_f0\x00\x00\x00')
-// CHECK:   ('_symbols', [
-// CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 51)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 25)
-// CHECK:     ('_string', 'local_a')
-// CHECK:    ),
-// CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 13)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 29)
-// CHECK:     ('_string', 'local_a_elt')
-// CHECK:    ),
-// CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 43)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 33)
-// CHECK:     ('_string', 'local_b')
-// CHECK:    ),
-// CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 35)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 41)
-// CHECK:     ('_string', 'local_c')
-// CHECK:    ),
-// CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 25)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 3)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 64)
-// CHECK:     ('_string', 'bar')
-// CHECK:    ),
-// CHECK:     # Symbol 5
-// CHECK:    (('n_strx', 82)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 1)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', '_f0')
-// CHECK:    ),
-// CHECK:     # Symbol 6
-// CHECK:    (('n_strx', 59)
-// CHECK:     ('n_type', 0x3)
-// CHECK:     ('n_sect', 0)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', '.objc_class_name_A')
-// CHECK:    ),
-// CHECK:     # Symbol 7
-// CHECK:    (('n_strx', 78)
-// CHECK:     ('n_type', 0xf)
-// CHECK:     ('n_sect', 1)
-// CHECK:     ('n_desc', 128)
-// CHECK:     ('n_value', 13)
-// CHECK:     ('_string', '_f1')
-// CHECK:    ),
-// CHECK:     # Symbol 8
-// CHECK:    (('n_strx', 1)
-// CHECK:     ('n_type', 0xf)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 21)
-// CHECK:     ('_string', 'local_a_ext')
-// CHECK:    ),
-// CHECK:     # Symbol 9
-// CHECK:    (('n_strx', 29)
-// CHECK:     ('n_type', 0x1)
-// CHECK:     ('n_sect', 0)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'undef')
-// CHECK:    ),
-// CHECK:   ])
-// CHECK:  ),
-// CHECK:   # Load Command 2
-// CHECK:  (('command', 11)
-// CHECK:   ('size', 80)
-// CHECK:   ('ilocalsym', 0)
-// CHECK:   ('nlocalsym', 6)
-// CHECK:   ('iextdefsym', 6)
-// CHECK:   ('nextdefsym', 3)
-// CHECK:   ('iundefsym', 9)
-// CHECK:   ('nundefsym', 1)
-// CHECK:   ('tocoff', 0)
-// CHECK:   ('ntoc', 0)
-// CHECK:   ('modtaboff', 0)
-// CHECK:   ('nmodtab', 0)
-// CHECK:   ('extrefsymoff', 0)
-// CHECK:   ('nextrefsyms', 0)
-// CHECK:   ('indirectsymoff', 0)
-// CHECK:   ('nindirectsyms', 0)
-// CHECK:   ('extreloff', 0)
-// CHECK:   ('nextrel', 0)
-// CHECK:   ('locreloff', 0)
-// CHECK:   ('nlocrel', 0)
-// CHECK:   ('_indirect_symbols', [
-// CHECK:   ])
-// CHECK:  ),
-// CHECK: ])
+// CHECK:     Relocations [
+// CHECK-NEXT:  Section __text {
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x6
+// CHECK-NEXT:      PCRel: 1
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: 0
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: 0x3
+// CHECK-NEXT:      Scattered: 0
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x1
+// CHECK-NEXT:      PCRel: 1
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: 0
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: 0x0
+// CHECK-NEXT:      Scattered: 0
+// CHECK-NEXT:    }
+// CHECK-NEXT:  }
+// CHECK-NEXT:  Section __data {
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x2F
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: 1
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: _f1
+// CHECK-NEXT:      Scattered: 0
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x2B
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: 1
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: _f1
+// CHECK-NEXT:      Scattered: 0
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x2A
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 0
+// CHECK-NEXT:      Extern: N/A
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: 0x1D
+// CHECK-NEXT:      Scattered: 1
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x28
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 1
+// CHECK-NEXT:      Extern: N/A
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: 0x1D
+// CHECK-NEXT:      Scattered: 1
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x24
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: N/A
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: 0x1D
+// CHECK-NEXT:      Scattered: 1
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x20
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: N/A
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: 0x1D
+// CHECK-NEXT:      Scattered: 1
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x14
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: N/A
+// CHECK-NEXT:      Type: GENERIC_RELOC_LOCAL_SECTDIFF (4)
+// CHECK-NEXT:      Symbol: 0x21
+// CHECK-NEXT:      Scattered: 1
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x0
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: N/A
+// CHECK-NEXT:      Type: GENERIC_RELOC_PAIR (1)
+// CHECK-NEXT:      Symbol: 0x29
+// CHECK-NEXT:      Scattered: 1
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x8
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: 0
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: 0x2
+// CHECK-NEXT:      Scattered: 0
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x4
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: 1
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: undef
+// CHECK-NEXT:      Scattered: 0
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x0
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: 1
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: undef
+// CHECK-NEXT:      Scattered: 0
+// CHECK-NEXT:    }
+// CHECK-NEXT:  }
+// CHECK-NEXT:  Section __const {
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x8
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: 0
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: 0x1
+// CHECK-NEXT:      Scattered: 0
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x4
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: 0
+// CHECK-NEXT:      Type: GENERIC_RELOC_VANILLA (0)
+// CHECK-NEXT:      Symbol: 0x3
+// CHECK-NEXT:      Scattered: 0
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x0
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: N/A
+// CHECK-NEXT:      Type: GENERIC_RELOC_LOCAL_SECTDIFF (4)
+// CHECK-NEXT:      Symbol: 0x1D
+// CHECK-NEXT:      Scattered: 1
+// CHECK-NEXT:    }
+// CHECK-NEXT:    Relocation {
+// CHECK-NEXT:      Offset: 0x0
+// CHECK-NEXT:      PCRel: 0
+// CHECK-NEXT:      Length: 2
+// CHECK-NEXT:      Extern: N/A
+// CHECK-NEXT:      Type: GENERIC_RELOC_PAIR (1)
+// CHECK-NEXT:      Symbol: 0x40
+// CHECK-NEXT:      Scattered: 1
+// CHECK-NEXT:    }
+// CHECK-NEXT:  }
+// CHECK-NEXT:]
diff --git a/test/MC/MachO/x86_64-mergeable.s b/test/MC/MachO/x86_64-mergeable.s
new file mode 100644
index 0000000..9724776
--- /dev/null
+++ b/test/MC/MachO/x86_64-mergeable.s
@@ -0,0 +1,59 @@
+// RUN: llvm-mc -triple x86_64-apple-darwin14 %s -filetype=obj -o - | llvm-readobj -r --expand-relocs | FileCheck %s
+
+// Test that we "S + K" produce a relocation with a symbol, but just S produces
+// a relocation with the section.
+
+	.section	__TEXT,__literal4,4byte_literals
+L0:
+	.long	42
+
+	.section	__TEXT,__cstring,cstring_literals
+L1:
+	.asciz	"42"
+
+	.section	__DATA,__data
+	.quad	L0
+	.quad	L0 + 1
+	.quad	L1
+	.quad	L1 + 1
+
+// CHECK:      Relocations [
+// CHECK-NEXT:   Section __data {
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x18
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: L1
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x10
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 0
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: 0x3
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x8
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 1
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: L0
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset: 0x0
+// CHECK-NEXT:       PCRel: 0
+// CHECK-NEXT:       Length: 3
+// CHECK-NEXT:       Extern: 0
+// CHECK-NEXT:       Type: X86_64_RELOC_UNSIGNED (0)
+// CHECK-NEXT:       Symbol: 0x2
+// CHECK-NEXT:       Scattered: 0
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
diff --git a/test/MC/MachO/x86_64-symbols.s b/test/MC/MachO/x86_64-symbols.s
index 9788feb..f40183d 100644
--- a/test/MC/MachO/x86_64-symbols.s
+++ b/test/MC/MachO/x86_64-symbols.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o - | macho-dump | FileCheck %s
+// RUN: llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o - | llvm-readobj -t | FileCheck %s
 
         .text
 L0:
@@ -121,878 +121,372 @@ D38:
 //L39:
 //D39:
 
-// CHECK: ('cputype', 16777223)
-// CHECK: ('cpusubtype', 3)
-// CHECK: ('filetype', 1)
-// CHECK: ('num_load_commands', 3)
-// CHECK: ('load_commands_size', 2656)
-// CHECK: ('flag', 0)
-// CHECK: ('reserved', 0)
-// CHECK: ('load_commands', [
-// CHECK:   # Load Command 0
-// CHECK:  (('command', 25)
-// CHECK:   ('size', 2552)
-// CHECK:   ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:   ('vm_addr', 0)
-// CHECK:   ('vm_size', 0)
-// CHECK:   ('file_offset', 2688)
-// CHECK:   ('file_size', 0)
-// CHECK:   ('maxprot', 7)
-// CHECK:   ('initprot', 7)
-// CHECK:   ('num_sections', 31)
-// CHECK:   ('flags', 0)
-// CHECK:   ('sections', [
-// CHECK:     # Section 0
-// CHECK:    (('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x80000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 1
-// CHECK:    (('section_name', '__const\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x0)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 2
-// CHECK:    (('section_name', '__static_const\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x0)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 3
-// CHECK:    (('section_name', '__cstring\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x2)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 4
-// CHECK:    (('section_name', '__literal4\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 2)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x3)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 5
-// CHECK:    (('section_name', '__literal8\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 3)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x4)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 6
-// CHECK:    (('section_name', '__literal16\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 4)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0xe)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 7
-// CHECK:    (('section_name', '__constructor\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x0)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 8
-// CHECK:    (('section_name', '__destructor\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x0)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 9
-// CHECK:    (('section_name', '__data\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x0)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 10
-// CHECK:    (('section_name', '__static_data\x00\x00\x00')
-// CHECK:     ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x0)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 11
-// CHECK:    (('section_name', '__dyld\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x0)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 12
-// CHECK:    (('section_name', '__mod_init_func\x00')
-// CHECK:     ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 2)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x9)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 13
-// CHECK:    (('section_name', '__mod_term_func\x00')
-// CHECK:     ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 2)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0xa)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 14
-// CHECK:    (('section_name', '__const\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x0)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 15
-// CHECK:    (('section_name', '__class\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 16
-// CHECK:    (('section_name', '__meta_class\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 17
-// CHECK:    (('section_name', '__cat_cls_meth\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 18
-// CHECK:    (('section_name', '__cat_inst_meth\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 19
-// CHECK:    (('section_name', '__protocol\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 20
-// CHECK:    (('section_name', '__string_object\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 21
-// CHECK:    (('section_name', '__cls_meth\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 22
-// CHECK:    (('section_name', '__inst_meth\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 23
-// CHECK:    (('section_name', '__cls_refs\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 2)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000005)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 24
-// CHECK:    (('section_name', '__message_refs\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 2)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000005)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 25
-// CHECK:    (('section_name', '__symbols\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 26
-// CHECK:    (('section_name', '__category\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 27
-// CHECK:    (('section_name', '__class_vars\x00\x00\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 28
-// CHECK:    (('section_name', '__instance_vars\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 29
-// CHECK:    (('section_name', '__module_info\x00\x00\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x10000000)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:     # Section 30
-// CHECK:    (('section_name', '__selector_strs\x00')
-// CHECK:     ('segment_name', '__OBJC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK:     ('address', 0)
-// CHECK:     ('size', 0)
-// CHECK:     ('offset', 2688)
-// CHECK:     ('alignment', 0)
-// CHECK:     ('reloc_offset', 0)
-// CHECK:     ('num_reloc', 0)
-// CHECK:     ('flags', 0x2)
-// CHECK:     ('reserved1', 0)
-// CHECK:     ('reserved2', 0)
-// CHECK:     ('reserved3', 0)
-// CHECK:    ),
-// CHECK:   ('_relocations', [
-// CHECK:   ])
-// CHECK:   ])
-// CHECK:  ),
-// CHECK:   # Load Command 1
-// CHECK:  (('command', 2)
-// CHECK:   ('size', 24)
-// CHECK:   ('symoff', 2688)
-// CHECK:   ('nsyms', 40)
-// CHECK:   ('stroff', 3328)
-// CHECK:   ('strsize', 152)
-// CHECK:   ('_string_data', '\x00D9\x00D29\x00D19\x00D8\x00L38\x00D38\x00D28\x00D18\x00D7\x00L37\x00D37\x00D27\x00D17\x00D6\x00L36\x00D36\x00D26\x00D16\x00D5\x00L35\x00D35\x00D25\x00L4\x00D4\x00D34\x00D24\x00D3\x00D33\x00D23\x00D13\x00D2\x00D32\x00D22\x00D12\x00D1\x00D31\x00D21\x00D0\x00D30\x00D20\x00\x00\x00')
-// CHECK:   ('_symbols', [
-// CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 139)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 1)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D0')
-// CHECK:    ),
-// CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 128)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 1)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D1')
-// CHECK:    ),
-// CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 113)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 2)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D2')
-// CHECK:    ),
-// CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 98)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 3)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D3')
-// CHECK:    ),
-// CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 84)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 4)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'L4')
-// CHECK:    ),
-// CHECK:     # Symbol 5
-// CHECK:    (('n_strx', 87)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 4)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D4')
-// CHECK:    ),
-// CHECK:     # Symbol 6
-// CHECK:    (('n_strx', 69)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 5)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D5')
-// CHECK:    ),
-// CHECK:     # Symbol 7
-// CHECK:    (('n_strx', 50)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 6)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D6')
-// CHECK:    ),
-// CHECK:     # Symbol 8
-// CHECK:    (('n_strx', 31)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 7)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D7')
-// CHECK:    ),
-// CHECK:     # Symbol 9
-// CHECK:    (('n_strx', 12)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 8)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D8')
-// CHECK:    ),
-// CHECK:     # Symbol 10
-// CHECK:    (('n_strx', 1)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 9)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D9')
-// CHECK:    ),
-// CHECK:     # Symbol 11
-// CHECK:    (('n_strx', 124)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 10)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D12')
-// CHECK:    ),
-// CHECK:     # Symbol 12
-// CHECK:    (('n_strx', 109)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 11)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D13')
-// CHECK:    ),
-// CHECK:     # Symbol 13
-// CHECK:    (('n_strx', 65)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 12)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D16')
-// CHECK:    ),
-// CHECK:     # Symbol 14
-// CHECK:    (('n_strx', 46)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 13)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D17')
-// CHECK:    ),
-// CHECK:     # Symbol 15
-// CHECK:    (('n_strx', 27)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 14)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D18')
-// CHECK:    ),
-// CHECK:     # Symbol 16
-// CHECK:    (('n_strx', 8)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 15)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D19')
-// CHECK:    ),
-// CHECK:     # Symbol 17
-// CHECK:    (('n_strx', 146)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 16)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D20')
-// CHECK:    ),
-// CHECK:     # Symbol 18
-// CHECK:    (('n_strx', 135)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 17)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D21')
-// CHECK:    ),
-// CHECK:     # Symbol 19
-// CHECK:    (('n_strx', 120)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 18)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D22')
-// CHECK:    ),
-// CHECK:     # Symbol 20
-// CHECK:    (('n_strx', 105)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 19)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D23')
-// CHECK:    ),
-// CHECK:     # Symbol 21
-// CHECK:    (('n_strx', 94)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 20)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D24')
-// CHECK:    ),
-// CHECK:     # Symbol 22
-// CHECK:    (('n_strx', 80)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 21)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D25')
-// CHECK:    ),
-// CHECK:     # Symbol 23
-// CHECK:    (('n_strx', 61)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 22)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D26')
-// CHECK:    ),
-// CHECK:     # Symbol 24
-// CHECK:    (('n_strx', 42)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 23)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D27')
-// CHECK:    ),
-// CHECK:     # Symbol 25
-// CHECK:    (('n_strx', 23)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 24)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D28')
-// CHECK:    ),
-// CHECK:     # Symbol 26
-// CHECK:    (('n_strx', 4)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 25)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D29')
-// CHECK:    ),
-// CHECK:     # Symbol 27
-// CHECK:    (('n_strx', 142)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 26)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D30')
-// CHECK:    ),
-// CHECK:     # Symbol 28
-// CHECK:    (('n_strx', 131)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 27)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D31')
-// CHECK:    ),
-// CHECK:     # Symbol 29
-// CHECK:    (('n_strx', 116)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 28)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D32')
-// CHECK:    ),
-// CHECK:     # Symbol 30
-// CHECK:    (('n_strx', 101)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 29)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D33')
-// CHECK:    ),
-// CHECK:     # Symbol 31
-// CHECK:    (('n_strx', 90)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 30)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D34')
-// CHECK:    ),
-// CHECK:     # Symbol 32
-// CHECK:    (('n_strx', 72)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 4)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'L35')
-// CHECK:    ),
-// CHECK:     # Symbol 33
-// CHECK:    (('n_strx', 76)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 4)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D35')
-// CHECK:    ),
-// CHECK:     # Symbol 34
-// CHECK:    (('n_strx', 53)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 4)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'L36')
-// CHECK:    ),
-// CHECK:     # Symbol 35
-// CHECK:    (('n_strx', 57)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 4)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D36')
-// CHECK:    ),
-// CHECK:     # Symbol 36
-// CHECK:    (('n_strx', 34)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 4)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'L37')
-// CHECK:    ),
-// CHECK:     # Symbol 37
-// CHECK:    (('n_strx', 38)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 4)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D37')
-// CHECK:    ),
-// CHECK:     # Symbol 38
-// CHECK:    (('n_strx', 15)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 31)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'L38')
-// CHECK:    ),
-// CHECK:     # Symbol 39
-// CHECK:    (('n_strx', 19)
-// CHECK:     ('n_type', 0xe)
-// CHECK:     ('n_sect', 31)
-// CHECK:     ('n_desc', 0)
-// CHECK:     ('n_value', 0)
-// CHECK:     ('_string', 'D38')
-// CHECK:    ),
-// CHECK:   ])
-// CHECK:  ),
-// CHECK:   # Load Command 2
-// CHECK:  (('command', 11)
-// CHECK:   ('size', 80)
-// CHECK:   ('ilocalsym', 0)
-// CHECK:   ('nlocalsym', 40)
-// CHECK:   ('iextdefsym', 40)
-// CHECK:   ('nextdefsym', 0)
-// CHECK:   ('iundefsym', 40)
-// CHECK:   ('nundefsym', 0)
-// CHECK:   ('tocoff', 0)
-// CHECK:   ('ntoc', 0)
-// CHECK:   ('modtaboff', 0)
-// CHECK:   ('nmodtab', 0)
-// CHECK:   ('extrefsymoff', 0)
-// CHECK:   ('nextrefsyms', 0)
-// CHECK:   ('indirectsymoff', 0)
-// CHECK:   ('nindirectsyms', 0)
-// CHECK:   ('extreloff', 0)
-// CHECK:   ('nextrel', 0)
-// CHECK:   ('locreloff', 0)
-// CHECK:   ('nlocrel', 0)
-// CHECK:   ('_indirect_symbols', [
-// CHECK:   ])
-// CHECK:  ),
-// CHECK: ])
+        .section foo, bar
+        .long L4 + 1
+        .long L35 + 1
+        .long L36 + 1
+        .long L37 + 1
+        .long L38 + 1
+
+// CHECK: Symbols [
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D0 (139)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __text (0x1)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D1 (128)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __text (0x1)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D2 (113)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __const (0x2)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D3 (98)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __static_const (0x3)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: L4 (84)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cstring (0x4)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D4 (87)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cstring (0x4)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D5 (69)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __literal4 (0x5)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D6 (50)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __literal8 (0x6)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D7 (31)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __literal16 (0x7)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D8 (12)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __constructor (0x8)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D9 (1)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __destructor (0x9)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D12 (124)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __data (0xA)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D13 (109)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __static_data (0xB)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D16 (65)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __dyld (0xC)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D17 (46)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __mod_init_func (0xD)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D18 (27)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __mod_term_func (0xE)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D19 (8)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __const (0xF)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D20 (146)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __class (0x10)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D21 (135)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __meta_class (0x11)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D22 (120)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cat_cls_meth (0x12)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D23 (105)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cat_inst_meth (0x13)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D24 (94)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __protocol (0x14)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D25 (80)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __string_object (0x15)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D26 (61)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cls_meth (0x16)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D27 (42)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __inst_meth (0x17)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D28 (23)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cls_refs (0x18)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D29 (4)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __message_refs (0x19)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D30 (142)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __symbols (0x1A)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D31 (131)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __category (0x1B)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D32 (116)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __class_vars (0x1C)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D33 (101)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __instance_vars (0x1D)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D34 (90)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __module_info (0x1E)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: L35 (72)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cstring (0x4)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D35 (76)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cstring (0x4)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: L36 (53)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cstring (0x4)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D36 (57)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cstring (0x4)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: L37 (34)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cstring (0x4)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D37 (38)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __cstring (0x4)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: L38 (15)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __selector_strs (0x1F)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: D38 (19)
+// CHECK-NEXT:     Type: Section (0xE)
+// CHECK-NEXT:     Section: __selector_strs (0x1F)
+// CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
diff --git a/test/MC/Mips/cpload.s b/test/MC/Mips/cpload.s
index 46b3ee4..842e0c7 100644
--- a/test/MC/Mips/cpload.s
+++ b/test/MC/Mips/cpload.s
@@ -4,7 +4,7 @@
 # RUN:  llvm-objdump -d -r -arch=mips - | \
 # RUN:    FileCheck %s -check-prefix=OBJ-O32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -mattr=-n64,+n32 -filetype=obj -o -| \
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -target-abi n32 -filetype=obj -o -| \
 # RUN:  llvm-objdump -d -r -arch=mips - | \
 # RUN:    FileCheck %s -check-prefix=OBJ-N32
 
diff --git a/test/MC/Mips/cpsetup-bad.s b/test/MC/Mips/cpsetup-bad.s
index 09252a1..ec6525a 100644
--- a/test/MC/Mips/cpsetup-bad.s
+++ b/test/MC/Mips/cpsetup-bad.s
@@ -12,3 +12,11 @@ t1:
 # ASM: :[[@LINE-1]]:23: error: expected save register or stack offset
         .cpsetup $31, $32, __cerror
 # ASM: :[[@LINE-1]]:23: error: invalid register
+        .cpsetup $25, $2, $3
+# ASM: :[[@LINE-1]]:28: error: expected expression
+        .cpsetup $25, $2, 4
+# ASM: :[[@LINE-1]]:28: error: expected symbol
+        .cpsetup $25, $2, 4+65
+# ASM: :[[@LINE-1]]:31: error: expected symbol
+        .cpsetup $25, $2, foo+4
+# ASM: :[[@LINE-1]]:32: error: expected symbol
diff --git a/test/MC/Mips/cpsetup.s b/test/MC/Mips/cpsetup.s
index a21a1e3..a3ffae6 100644
--- a/test/MC/Mips/cpsetup.s
+++ b/test/MC/Mips/cpsetup.s
@@ -1,19 +1,19 @@
-# RUN: llvm-mc -triple mips64-unknown-unknown -mattr=-n64,+o32 -filetype=obj -o - %s | \
+# RUN: llvm-mc -triple mips64-unknown-unknown -target-abi o32 -filetype=obj -o - %s | \
 # RUN:   llvm-objdump -d -r -arch=mips64 - | \
 # RUN:     FileCheck -check-prefix=O32 %s
 
-# RUN: llvm-mc -triple mips64-unknown-unknown -mattr=-n64,+o32 %s | \
+# RUN: llvm-mc -triple mips64-unknown-unknown -target-abi o32 %s | \
 # RUN:   FileCheck -check-prefix=ASM %s
 
-# RUN: llvm-mc -triple mips64-unknown-unknown -mattr=-n64,+n32 -filetype=obj -o - %s | \
-# RUN:   llvm-objdump -d -r -arch=mips64 - | \
+# RUN: llvm-mc -triple mips64-unknown-unknown -target-abi n32 -filetype=obj -o - %s | \
+# RUN:   llvm-objdump -d -r -t -arch=mips64 - | \
 # RUN:     FileCheck -check-prefix=NXX -check-prefix=N32 %s
 
-# RUN: llvm-mc -triple mips64-unknown-unknown -mattr=-n64,+n32 %s | \
+# RUN: llvm-mc -triple mips64-unknown-unknown -target-abi n32 %s | \
 # RUN:   FileCheck -check-prefix=ASM %s
 
 # RUN: llvm-mc -triple mips64-unknown-unknown %s -filetype=obj -o - | \
-# RUN:   llvm-objdump -d -r -arch=mips64 - | \
+# RUN:   llvm-objdump -d -r -t -arch=mips64 - | \
 # RUN:     FileCheck -check-prefix=NXX -check-prefix=N64 %s
 
 # RUN: llvm-mc -triple mips64-unknown-unknown %s | \
@@ -61,6 +61,35 @@ t2:
 
 # ASM: .cpsetup $25, $2, __cerror
 
+# .cpsetup with local labels (PR22518):
+1:
+        .cpsetup $25, $2, 1b
+        nop
+        sub $3, $3, $2
+        nop
+
+# O32: t2:
+# O32:   nop
+# O32:   sub $3, $3, $2
+# O32:   nop
+
+# FIXME: Direct object emission for N32 is still under development.
+# N32 doesn't allow 3 operations to be specified in the same relocation
+# record like N64 does.
+
+# NXX: move     $2, $gp
+# NXX: lui      $gp, 0
+# NXX: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16  $tmp0
+# NXX: addiu    $gp, $gp, 0
+# NXX: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16  $tmp0
+# N32: addu     $gp, $gp, $25
+# N64: daddu    $gp, $gp, $25
+# NXX: nop
+# NXX: sub $3, $3, $2
+# NXX: nop
+
+# ASM: .cpsetup $25, $2, $tmp0
+
 t3:
         .option pic0
         nop
@@ -76,3 +105,7 @@ t3:
 # ASM: nop
 # ASM: .cpsetup $25, 8, __cerror
 # ASM: nop
+
+# For .cpsetup with local labels, we need to check if $tmp0 is in the symbol
+# table:
+# NXX: .text  00000000 $tmp0
diff --git a/test/MC/Mips/do_switch3.s b/test/MC/Mips/do_switch3.s
index 02ad087..c0d9dc6 100644
--- a/test/MC/Mips/do_switch3.s
+++ b/test/MC/Mips/do_switch3.s
@@ -2,7 +2,7 @@
 // produced. This was not handled for direct object and an assertion
 // to occur. This is a variation on test case test/CodeGen/Mips/do_switch.ll
 
-// RUN: llvm-mc < %s -filetype=obj -triple=mips64-pc-linux -relocation-model=pic -mcpu=mips64 -mattr=n64
+// RUN: llvm-mc < %s -filetype=obj -triple=mips64-pc-linux -relocation-model=pic -mcpu=mips64 -target-abi=n64
 
 	.text
 	.abicalls
diff --git a/test/MC/Mips/elf_eflags.s b/test/MC/Mips/elf_eflags.s
index 1f28ee0..708894f 100644
--- a/test/MC/Mips/elf_eflags.s
+++ b/test/MC/Mips/elf_eflags.s
@@ -8,9 +8,13 @@
 # MIPSEL-MIPS64R6-NAN2008: Flags [ (0xA0000406)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r2 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R2 %s
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r3 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R2 %s
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r5 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R2 %s
 # MIPSEL-MIPS64R2: Flags [ (0x80000006)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r2 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R2-NAN2008 %s
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r3 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R2-NAN2008 %s
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r5 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R2-NAN2008 %s
 # MIPSEL-MIPS64R2-NAN2008: Flags [ (0x80000406)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64 %s
@@ -26,9 +30,13 @@
 # MIPSEL-MIPS32R6-NAN2008: Flags [ (0x90001404)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r2 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R2 %s
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r3 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R2 %s
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r5 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R2 %s
 # MIPSEL-MIPS32R2: Flags [ (0x70001004)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R2-NAN2008 %s
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r3 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R2-NAN2008 %s
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r5 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R2-NAN2008 %s
 # MIPSEL-MIPS32R2-NAN2008: Flags [ (0x70001404)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32 %s
@@ -37,35 +45,35 @@
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32-NAN2008 %s
 # MIPSEL-MIPS32-NAN2008: Flags [ (0x50001404)
 
-# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r2 -mattr=-n64,n32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N32 %s
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=-n64,n32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N32 %s
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r2 -target-abi n32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N32 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -target-abi n32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N32 %s
 # MIPS64EL-MIPS64R2-N32: Flags [ (0x80000024)
 
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=-n64,n32,+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N32-NAN2008 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -target-abi n32 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N32-NAN2008 %s
 # MIPS64EL-MIPS64R2-N32-NAN2008: Flags [ (0x80000424)
 
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 -mattr=-n64,n32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N32 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 -target-abi n32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N32 %s
 # MIPS64EL-MIPS64-N32: Flags [ (0x60000024)
 
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 -mattr=-n64,n32,+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N32-NAN2008 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 -target-abi n32 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N32-NAN2008 %s
 # MIPS64EL-MIPS64-N32-NAN2008: Flags [ (0x60000424)
 
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=n64 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N64 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -target-abi n64 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N64 %s
 # MIPS64EL-MIPS64R2-N64: Flags [ (0x80000006)
 
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=n64,+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N64-NAN2008 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -target-abi n64 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N64-NAN2008 %s
 # MIPS64EL-MIPS64R2-N64-NAN2008: Flags [ (0x80000406)
 
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -mattr=n64 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N64 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -target-abi n64 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N64 %s
 # MIPS64EL-MIPS64-N64: Flags [ (0x60000006)
 
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -mattr=n64,+nan2008 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N64-NAN2008 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -target-abi n64 -mattr=+nan2008 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N64-NAN2008 %s
 # MIPS64EL-MIPS64-N64-NAN2008: Flags [ (0x60000406)
 
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=-n64,o32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-O32 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -target-abi o32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-O32 %s
 # MIPS64EL-MIPS64R2-O32: Flags [ (0x80001104)
 
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=-n64,o32,+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-O32-NAN2008 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -target-abi o32 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-O32-NAN2008 %s
 # MIPS64EL-MIPS64R2-O32-NAN2008: Flags [ (0x80001504)
 
 # RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips5 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS5 %s
@@ -98,10 +106,10 @@
  # RUN: llvm-mc -filetype=obj -triple mips-unknown-linux -mcpu=mips1 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS1-NAN2008 %s
 # MIPS1-NAN2008: Flags [ (0x1404)
 
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -mattr=-n64,o32 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-O32 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -target-abi o32 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-O32 %s
 # MIPS64EL-MIPS64-O32: Flags [ (0x60001104)
 
-# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -mattr=-n64,o32,+nan2008 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-O32-NAN2008 %s
+# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -target-abi o32 -mattr=+nan2008 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-O32-NAN2008 %s
 # MIPS64EL-MIPS64-O32-NAN2008: Flags [ (0x60001504)
 
 # Default ABI for MIPS64 is N64 as opposed to GCC/GAS (N32)
diff --git a/test/MC/Mips/elf_reginfo.s b/test/MC/Mips/elf_reginfo.s
index ba4788a..8a4ed3b 100644
--- a/test/MC/Mips/elf_reginfo.s
+++ b/test/MC/Mips/elf_reginfo.s
@@ -1,9 +1,9 @@
 # These *MUST* match the output of gas compiled with the same triple and
 # corresponding options (-mabi=64 -> -mattr=+n64 for example).
 
-# RUN: llvm-mc -filetype=obj -triple=mips64el-linux -mattr=-n64,+n64 %s -o - \
+# RUN: llvm-mc -filetype=obj -triple=mips64el-linux -target-abi n64 %s -o - \
 # RUN: | llvm-readobj -s | FileCheck --check-prefix=CHECK_64 %s
-# RUN: llvm-mc -filetype=obj -triple=mipsel %s -mattr=-o32,+n32 -o - \
+# RUN: llvm-mc -filetype=obj -triple=mipsel %s -target-abi n32 -o - \
 # RUN: | llvm-readobj -s | FileCheck --check-prefix=CHECK_32 %s
 
 # Check for register information sections.
diff --git a/test/MC/Mips/micromips-16-bit-instructions.s b/test/MC/Mips/micromips-16-bit-instructions.s
index 35855e1..25fbfdb 100644
--- a/test/MC/Mips/micromips-16-bit-instructions.s
+++ b/test/MC/Mips/micromips-16-bit-instructions.s
@@ -18,24 +18,49 @@
 # CHECK-EL: xor16   $17, $5         # encoding: [0x4d,0x44]
 # CHECK-EL: sll16   $3, $16, 5      # encoding: [0x8a,0x25]
 # CHECK-EL: srl16   $4, $17, 6      # encoding: [0x1d,0x26]
+# CHECK-EL: lbu16   $3, 4($17)      # encoding: [0x94,0x09]
+# CHECK-EL: lbu16   $3, -1($16)     # encoding: [0x8f,0x09]
+# CHECK-EL: lhu16   $3, 4($16)      # encoding: [0x82,0x29]
+# CHECK-EL: lw16    $4, 8($17)      # encoding: [0x12,0x6a]
+# CHECK-EL: sb16    $3, 4($16)      # encoding: [0x84,0x89]
+# CHECK-EL: sh16    $4, 8($17)      # encoding: [0x14,0xaa]
+# CHECK-EL: sw16    $4, 4($17)      # encoding: [0x11,0xea]
+# CHECK-EL: sw16    $zero, 4($17)   # encoding: [0x11,0xe8]
+# CHECK-EL: lw      $3, 32($gp)     # encoding: [0x88,0x65]
+# CHECK-EL: lw      $3, 32($sp)     # encoding: [0x68,0x48]
+# CHECK-EL: sw      $4, 124($sp)    # encoding: [0x9f,0xc8]
 # CHECK-EL: li16    $3, -1          # encoding: [0xff,0xed]
 # CHECK-EL: li16    $3, 126         # encoding: [0xfe,0xed]
 # CHECK-EL: addiur1sp $7, 4         # encoding: [0x83,0x6f]
 # CHECK-EL: addiur2 $6, $7, -1      # encoding: [0x7e,0x6f]
 # CHECK-EL: addiur2 $6, $7, 12      # encoding: [0x76,0x6f]
 # CHECK-EL: addius5 $7, -2          # encoding: [0xfc,0x4c]
+# CHECK-EL: addiusp -1028           # encoding: [0xff,0x4f]
+# CHECK-EL: addiusp -1032           # encoding: [0xfd,0x4f]
+# CHECK-EL: addiusp 1024            # encoding: [0x01,0x4c]
+# CHECK-EL: addiusp 1028            # encoding: [0x03,0x4c]
 # CHECK-EL: addiusp -16             # encoding: [0xf9,0x4f]
 # CHECK-EL: mfhi    $9              # encoding: [0x09,0x46]
 # CHECK-EL: mflo    $9              # encoding: [0x49,0x46]
 # CHECK-EL: move    $25, $1         # encoding: [0x21,0x0f]
+# CHECK-EL: movep   $5, $6, $2, $3  # encoding: [0x34,0x84]
 # CHECK-EL: jrc     $9              # encoding: [0xa9,0x45]
 # CHECK-NEXT: jalr    $9            # encoding: [0xc9,0x45]
 # CHECK-EL: jraddiusp 20            # encoding: [0x05,0x47]
-# CHECK-EL: nop                     # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-EL: jalrs16 $9              # encoding: [0xe9,0x45]
-# CHECK-EL: move    $zero, $zero    # encoding: [0x00,0x0c]
+# CHECK-NEXT: jalrs16 $9            # encoding: [0xe9,0x45]
+# CHECK-EL: nop                     # encoding: [0x00,0x0c]
 # CHECK-EL: jr16    $9              # encoding: [0x89,0x45]
 # CHECK-EL: nop                     # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: beqz16 $6, 20           # encoding: [0x0a,0x8f]
+# CHECK-EL: nop                     # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: bnez16 $6, 20           # encoding: [0x0a,0xaf]
+# CHECK-EL: nop                     # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: b16 132                 # encoding: [0x42,0xcc]
+# CHECK-EL: nop
+# CHECK-EL: b16 132                 # encoding: [0x42,0xcc]
+# CHECK-EL: nop
+# CHECK-EL: break16 8               # encoding: [0x88,0x46]
+# CHECK-EL: sdbbp16 14              # encoding: [0xce,0x46]
 #------------------------------------------------------------------------------
 # Big endian
 #------------------------------------------------------------------------------
@@ -48,24 +73,49 @@
 # CHECK-EB: xor16   $17, $5         # encoding: [0x44,0x4d]
 # CHECK-EB: sll16   $3, $16, 5      # encoding: [0x25,0x8a]
 # CHECK-EB: srl16   $4, $17, 6      # encoding: [0x26,0x1d]
+# CHECK-EB: lbu16   $3, 4($17)      # encoding: [0x09,0x94]
+# CHECK-EB: lbu16   $3, -1($16)     # encoding: [0x09,0x8f]
+# CHECK-EB: lhu16   $3, 4($16)      # encoding: [0x29,0x82]
+# CHECK-EB: lw16    $4, 8($17)      # encoding: [0x6a,0x12]
+# CHECK-EB: sb16    $3, 4($16)      # encoding: [0x89,0x84]
+# CHECK-EB: sh16    $4, 8($17)      # encoding: [0xaa,0x14]
+# CHECK-EB: sw16    $4, 4($17)      # encoding: [0xea,0x11]
+# CHECK-EB: sw16    $zero, 4($17)   # encoding: [0xe8,0x11]
+# CHECK-EB: lw      $3, 32($gp)     # encoding: [0x65,0x88]
+# CHECK-EB: lw      $3, 32($sp)     # encoding: [0x48,0x68]
+# CHECK-EB: sw      $4, 124($sp)    # encoding: [0xc8,0x9f]
 # CHECK-EB: li16    $3, -1          # encoding: [0xed,0xff]
 # CHECK-EB: li16    $3, 126         # encoding: [0xed,0xfe]
 # CHECK-EB: addiur1sp $7, 4         # encoding: [0x6f,0x83]
 # CHECK-EB: addiur2 $6, $7, -1      # encoding: [0x6f,0x7e]
 # CHECK-EB: addiur2 $6, $7, 12      # encoding: [0x6f,0x76]
 # CHECK-EB: addius5 $7, -2          # encoding: [0x4c,0xfc]
+# CHECK-EB: addiusp -1028           # encoding: [0x4f,0xff]
+# CHECK-EB: addiusp -1032           # encoding: [0x4f,0xfd]
+# CHECK-EB: addiusp 1024            # encoding: [0x4c,0x01]
+# CHECK-EB: addiusp 1028            # encoding: [0x4c,0x03]
 # CHECK-EB: addiusp -16             # encoding: [0x4f,0xf9]
 # CHECK-EB: mfhi    $9              # encoding: [0x46,0x09]
 # CHECK-EB: mflo    $9              # encoding: [0x46,0x49]
 # CHECK-EB: move    $25, $1         # encoding: [0x0f,0x21]
+# CHECK-EB: movep   $5, $6, $2, $3  # encoding: [0x84,0x34]
 # CHECK-EB: jrc     $9              # encoding: [0x45,0xa9]
 # CHECK-NEXT: jalr    $9            # encoding: [0x45,0xc9]
 # CHECK-EB: jraddiusp 20            # encoding: [0x47,0x05]
-# CHECK-EB: nop                     # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-EB: jalrs16 $9              # encoding: [0x45,0xe9]
-# CHECK-EB: move    $zero, $zero    # encoding: [0x0c,0x00]
+# CHECK-NEXT: jalrs16 $9            # encoding: [0x45,0xe9]
+# CHECK-EB: nop                     # encoding: [0x0c,0x00]
 # CHECK-EB: jr16    $9              # encoding: [0x45,0x89]
 # CHECK-EB: nop                     # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: beqz16 $6, 20           # encoding: [0x8f,0x0a]
+# CHECK-EB: nop                     # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: bnez16 $6, 20           # encoding: [0xaf,0x0a]
+# CHECK-EB: nop                     # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: b16 132                 # encoding: [0xcc,0x42]
+# CHECK-EB: nop
+# CHECK-EB: b16 132                 # encoding: [0xcc,0x42]
+# CHECK-EB: nop
+# CHECK-EB: break16 8               # encoding: [0x46,0x88]
+# CHECK-EB: sdbbp16 14              # encoding: [0x46,0xce]
 
     addu16  $6, $17, $4
     subu16  $5, $16, $3
@@ -76,18 +126,40 @@
     xor16   $17, $5
     sll16   $3, $16, 5
     srl16   $4, $17, 6
+    lbu16   $3, 4($17)
+    lbu16   $3, -1($16)
+    lhu16   $3, 4($16)
+    lw16    $4, 8($17)
+    sb16    $3, 4($16)
+    sh16    $4, 8($17)
+    sw16    $4, 4($17)
+    sw16    $0, 4($17)
+    lw      $3, 32($gp)
+    lw      $3, 32($sp)
+    sw      $4, 124($sp)
     li16    $3, -1
     li16    $3, 126
     addiur1sp $7, 4
     addiur2 $6, $7, -1
     addiur2 $6, $7, 12
     addius5 $7, -2
+    addiusp -1028
+    addiusp -1032
+    addiusp 1024
+    addiusp 1028
     addiusp -16
     mfhi    $9
     mflo    $9
     move    $25, $1
+    movep   $5, $6, $2, $3
     jrc     $9
     jalr    $9
     jraddiusp 20
     jalrs16 $9
     jr16    $9
+    beqz16 $6, 20
+    bnez16 $6, 20
+    b   132
+    b16 132
+    break16 8
+    sdbbp16 14
diff --git a/test/MC/Mips/micromips-alu-instructions.s b/test/MC/Mips/micromips-alu-instructions.s
index 1131d1f..aeab09e 100644
--- a/test/MC/Mips/micromips-alu-instructions.s
+++ b/test/MC/Mips/micromips-alu-instructions.s
@@ -38,6 +38,9 @@
 # CHECK-EL: multu  $9, $7         # encoding: [0xe9,0x00,0x3c,0x9b]
 # CHECK-EL: div    $zero, $9, $7  # encoding: [0xe9,0x00,0x3c,0xab]
 # CHECK-EL: divu   $zero, $9, $7  # encoding: [0xe9,0x00,0x3c,0xbb]
+# CHECK-EL: addiupc $2, 20        # encoding: [0x00,0x79,0x05,0x00]
+# CHECK-EL: addiupc $7, 16777212  # encoding: [0xbf,0x7b,0xff,0xff]
+# CHECK-EL: addiupc $7, -16777216 # encoding: [0xc0,0x7b,0x00,0x00]
 #------------------------------------------------------------------------------
 # Big endian
 #------------------------------------------------------------------------------
@@ -72,6 +75,9 @@
 # CHECK-EB:  multu $9, $7         # encoding: [0x00,0xe9,0x9b,0x3c]
 # CHECK-EB: div  $zero, $9, $7    # encoding: [0x00,0xe9,0xab,0x3c]
 # CHECK-EB: divu $zero, $9, $7    # encoding: [0x00,0xe9,0xbb,0x3c]
+# CHECK-EB: addiupc $2, 20        # encoding: [0x79,0x00,0x00,0x05]
+# CHECK-EB: addiupc $7, 16777212  # encoding: [0x7b,0xbf,0xff,0xff]
+# CHECK-EB: addiupc $7, -16777216 # encoding: [0x7b,0xc0,0x00,0x00]
     add    $9, $6, $7
     add    $9, $6, 17767
     addu   $9, $6, -15001
@@ -104,3 +110,6 @@
     multu  $9, $7
     div    $0, $9, $7
     divu   $0, $9, $7
+    addiupc $2, 20
+    addiupc $7, 16777212
+    addiupc $7, -16777216
diff --git a/test/MC/Mips/micromips-bad-branches.s b/test/MC/Mips/micromips-bad-branches.s
index 573605e..f64cd9f 100644
--- a/test/MC/Mips/micromips-bad-branches.s
+++ b/test/MC/Mips/micromips-bad-branches.s
@@ -126,6 +126,11 @@
 # CHECK: error: branch target out of range
 # CHECK:        bc1t $fcc0, 65536
 
+# CHECK: error: branch to misaligned address
+# CHECK:        beqz16 $6, 31
+# CHECK: error: branch target out of range
+# CHECK:        beqz16 $6, 130
+
         b -65535
         b -65536
         b -65537
@@ -223,3 +228,6 @@
         bc1t $fcc0, 65534
         bc1t $fcc0, 65535
         bc1t $fcc0, 65536
+
+        beqz16 $6, 31
+        beqz16 $6, 130
diff --git a/test/MC/Mips/micromips-branch-fixup.s b/test/MC/Mips/micromips-branch-fixup.s
new file mode 100644
index 0000000..98b4842
--- /dev/null
+++ b/test/MC/Mips/micromips-branch-fixup.s
@@ -0,0 +1,91 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding \
+# RUN: -mattr=micromips | FileCheck %s -check-prefix=CHECK-FIXUP
+# RUN: llvm-mc %s -filetype=obj -triple=mipsel-unknown-linux \
+# RUN: -mattr=micromips | llvm-readobj -r \
+# RUN: | FileCheck %s -check-prefix=CHECK-ELF
+#------------------------------------------------------------------------------
+# Check that the assembler can handle the documented syntax
+# for relocations.
+#------------------------------------------------------------------------------
+# CHECK-FIXUP: beqz16 $6, bar  # encoding: [0b0AAAAAAA,0x8f]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC7_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bnez16 $6, bar  # encoding: [0b0AAAAAAA,0xaf]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC7_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: b16         bar # encoding: [A,0b110011AA]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC10_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: b   bar         # encoding: [A,0x94'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x0c]
+# CHECK-FIXUP: beq $3, $4, bar # encoding: [0x83'A',0x94'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bne $3, $4, bar # encoding: [0x83'A',0xb4'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bgez    $4, bar # encoding: [0x44'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bgtz    $4, bar # encoding: [0xc4'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: blez    $4, bar # encoding: [0x84'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bltz    $4, bar # encoding: [0x04'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bgezal  $4, bar # encoding: [0x64'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bltzal  $4, bar # encoding: [0x24'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+#------------------------------------------------------------------------------
+# Check that the appropriate relocations were created.
+#------------------------------------------------------------------------------
+# CHECK-ELF: Relocations [
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC7_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC7_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC10_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF: ]
+
+  .text
+  .type main, @function
+  .set micromips
+main:
+  beqz16  $6, bar
+  bnez16  $6, bar
+  b16     bar
+  b       bar
+  beq     $3, $4, bar
+  bne     $3, $4, bar
+  bgez    $4, bar
+  bgtz    $4, bar
+  blez    $4, bar
+  bltz    $4, bar
+  bgezal  $4, bar
+  bltzal  $4, bar
diff --git a/test/MC/Mips/micromips-branch-instructions.s b/test/MC/Mips/micromips-branch-instructions.s
index cf0aab7..e85b925 100644
--- a/test/MC/Mips/micromips-branch-instructions.s
+++ b/test/MC/Mips/micromips-branch-instructions.s
@@ -9,8 +9,8 @@
 #------------------------------------------------------------------------------
 # Little endian
 #------------------------------------------------------------------------------
-# CHECK-EL: b 1332               # encoding: [0x00,0x94,0x9a,0x02]
-# CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: b   1332             # encoding: [0x00,0x94,0x9a,0x02]
+# CHECK-EL: nop                  # encoding: [0x00,0x0c]
 # CHECK-EL: beq $9, $6, 1332     # encoding: [0xc9,0x94,0x9a,0x02]
 # CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EL: bgez $6, 1332        # encoding: [0x46,0x40,0x9a,0x02]
@@ -30,14 +30,14 @@
 # CHECK-EL: bltz $6, 1332        # encoding: [0x06,0x40,0x9a,0x02]
 # CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EL: bgezals $6, 1332     # encoding: [0x66,0x42,0x9a,0x02]
-# CHECK-EL: move $zero, $zero    # encoding: [0x00,0x0c]
+# CHECK-EL: nop                  # encoding: [0x00,0x0c]
 # CHECK-EL: bltzals $6, 1332     # encoding: [0x26,0x42,0x9a,0x02]
-# CHECK-EL: move $zero, $zero    # encoding: [0x00,0x0c]
+# CHECK-EL: nop                  # encoding: [0x00,0x0c]
 #------------------------------------------------------------------------------
 # Big endian
 #------------------------------------------------------------------------------
-# CHECK-EB: b 1332               # encoding: [0x94,0x00,0x02,0x9a]
-# CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: b   1332             # encoding: [0x94,0x00,0x02,0x9a]
+# CHECK-EB: nop                  # encoding: [0x0c,0x00]
 # CHECK-EB: beq $9, $6, 1332     # encoding: [0x94,0xc9,0x02,0x9a]
 # CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EB: bgez $6, 1332        # encoding: [0x40,0x46,0x02,0x9a]
@@ -57,10 +57,14 @@
 # CHECK-EB: bltz $6, 1332        # encoding: [0x40,0x06,0x02,0x9a]
 # CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EB: bgezals $6, 1332     # encoding: [0x42,0x66,0x02,0x9a]
-# CHECK-EB: move $zero, $zero    # encoding: [0x0c,0x00]
+# CHECK-EB: nop                  # encoding: [0x0c,0x00]
 # CHECK-EB: bltzals $6, 1332     # encoding: [0x42,0x26,0x02,0x9a]
-# CHECK-EB: move $zero, $zero    # encoding: [0x0c,0x00]
+# CHECK-EB: nop                  # encoding: [0x0c,0x00]
 
+    .text
+    .type main, @function
+    .set micromips
+main:
      b      1332
      beq    $9,$6,1332
      bgez   $6,1332
diff --git a/test/MC/Mips/micromips-branch16.s b/test/MC/Mips/micromips-branch16.s
deleted file mode 100644
index 321ee86..0000000
--- a/test/MC/Mips/micromips-branch16.s
+++ /dev/null
@@ -1,69 +0,0 @@
-# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding \
-# RUN: -mattr=micromips | FileCheck %s -check-prefix=CHECK-FIXUP
-# RUN: llvm-mc %s -filetype=obj -triple=mipsel-unknown-linux \
-# RUN: -mattr=micromips | llvm-readobj -r \
-# RUN: | FileCheck %s -check-prefix=CHECK-ELF
-#------------------------------------------------------------------------------
-# Check that the assembler can handle the documented syntax
-# for relocations.
-#------------------------------------------------------------------------------
-# CHECK-FIXUP: b           bar # encoding: [A,0x94'A',0x00,0x00]
-# CHECK-FIXUP:                 #   fixup A - offset: 0,
-# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
-# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-FIXUP: beq $3, $4, bar # encoding: [0x83'A',0x94'A',0x00,0x00]
-# CHECK-FIXUP:                 #   fixup A - offset: 0,
-# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
-# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-FIXUP: bne $3, $4, bar # encoding: [0x83'A',0xb4'A',0x00,0x00]
-# CHECK-FIXUP:                 #   fixup A - offset: 0,
-# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
-# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-FIXUP: bgez    $4, bar # encoding: [0x44'A',0x40'A',0x00,0x00]
-# CHECK-FIXUP:                 #   fixup A - offset: 0,
-# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
-# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-FIXUP: bgtz    $4, bar # encoding: [0xc4'A',0x40'A',0x00,0x00]
-# CHECK-FIXUP:                 #   fixup A - offset: 0,
-# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
-# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-FIXUP: blez    $4, bar # encoding: [0x84'A',0x40'A',0x00,0x00]
-# CHECK-FIXUP:                 #   fixup A - offset: 0,
-# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
-# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-FIXUP: bltz    $4, bar # encoding: [0x04'A',0x40'A',0x00,0x00]
-# CHECK-FIXUP:                 #   fixup A - offset: 0,
-# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
-# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-FIXUP: bgezal  $4, bar # encoding: [0x64'A',0x40'A',0x00,0x00]
-# CHECK-FIXUP:                 #   fixup A - offset: 0,
-# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
-# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-FIXUP: bltzal  $4, bar # encoding: [0x24'A',0x40'A',0x00,0x00]
-# CHECK-FIXUP:                 #   fixup A - offset: 0,
-# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
-# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
-#------------------------------------------------------------------------------
-# Check that the appropriate relocations were created.
-#------------------------------------------------------------------------------
-# CHECK-ELF: Relocations [
-# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
-# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
-# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
-# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
-# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
-# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
-# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
-# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
-# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
-# CHECK-ELF: ]
-
-  b       bar
-  beq     $3, $4, bar
-  bne     $3, $4, bar
-  bgez    $4, bar
-  bgtz    $4, bar
-  blez    $4, bar
-  bltz    $4, bar
-  bgezal  $4, bar
-  bltzal  $4, bar
diff --git a/test/MC/Mips/micromips-control-instructions.s b/test/MC/Mips/micromips-control-instructions.s
index e79896d..76c953f 100644
--- a/test/MC/Mips/micromips-control-instructions.s
+++ b/test/MC/Mips/micromips-control-instructions.s
@@ -15,6 +15,11 @@
 # CHECK-EL:    .set mips32r2
 # CHECK-EL:    rdhwr $5, $29
 # CHECK-EL:    .set pop                   # encoding: [0xbd,0x00,0x3c,0x6b]
+# CHECK-EL:    cache 1, 8($5)             # encoding: [0x25,0x20,0x08,0x60]
+# CHECK-EL:    pref 1, 8($5)              # encoding: [0x25,0x60,0x08,0x20]
+# CHECK-EL:    ssnop                      # encoding: [0x00,0x00,0x00,0x08]
+# CHECK-EL:    ehb                        # encoding: [0x00,0x00,0x00,0x18]
+# CHECK-EL:    pause                      # encoding: [0x00,0x00,0x00,0x28]
 # CHECK-EL:    break                      # encoding: [0x00,0x00,0x07,0x00]
 # CHECK-EL:    break 7                    # encoding: [0x07,0x00,0x07,0x00]
 # CHECK-EL:    break 7, 5                 # encoding: [0x07,0x00,0x47,0x01]
@@ -43,6 +48,11 @@
 # CHECK-EB:   .set mips32r2
 # CHECK-EB:   rdhwr $5, $29
 # CHECK-EB:   .set pop                    # encoding: [0x00,0xbd,0x6b,0x3c]
+# CHECK-EB:   cache 1, 8($5)              # encoding: [0x20,0x25,0x60,0x08]
+# CHECK-EB:   pref 1, 8($5)               # encoding: [0x60,0x25,0x20,0x08]
+# CHECK-EB:   ssnop                       # encoding: [0x00,0x00,0x08,0x00]
+# CHECK-EB:   ehb                         # encoding: [0x00,0x00,0x18,0x00]
+# CHECK-EB:   pause                       # encoding: [0x00,0x00,0x28,0x00]
 # CHECK-EB:   break                       # encoding: [0x00,0x00,0x00,0x07]
 # CHECK-EB:   break 7                     # encoding: [0x00,0x07,0x00,0x07]
 # CHECK-EB:   break 7, 5                  # encoding: [0x00,0x07,0x01,0x47]
@@ -66,6 +76,11 @@
     sdbbp
     sdbbp 34
     rdhwr $5, $29
+    cache 1, 8($5)
+    pref 1, 8($5)
+    ssnop
+    ehb
+    pause
     break
     break 7
     break 7,5
diff --git a/test/MC/Mips/micromips-diagnostic-fixup.s b/test/MC/Mips/micromips-diagnostic-fixup.s
index f8fe447..041338a 100644
--- a/test/MC/Mips/micromips-diagnostic-fixup.s
+++ b/test/MC/Mips/micromips-diagnostic-fixup.s
@@ -4,7 +4,7 @@
 
 .text
   b foo
-  .space 65536 - 8, 1   # -8 = size of b instr plus size of automatically inserted nop
+  .space 65536 - 6, 1   # -6 = size of b instr plus size of automatically inserted nop
   nop                   # This instr makes the branch too long to fit into a 17-bit offset
 foo:
   add $0,$0,$0
diff --git a/test/MC/Mips/micromips-func-addr.s b/test/MC/Mips/micromips-func-addr.s
new file mode 100644
index 0000000..e2a4d23
--- /dev/null
+++ b/test/MC/Mips/micromips-func-addr.s
@@ -0,0 +1,16 @@
+# RUN: llvm-mc %s -filetype=obj -triple=mipsel-unknown-linux \
+# RUN: -mattr=micromips | llvm-readobj -r \
+# RUN: | FileCheck %s -check-prefix=CHECK
+# CHECK: Relocations [
+# CHECK:     0x0 R_MIPS_32 bar 0x0
+# CHECK:     0x4 R_MIPS_32 L1 0x0
+
+  .set    micromips
+  .type   bar,@function
+bar:
+L1:
+  nop
+  .data
+  .4byte bar 
+  .4byte L1 
+
diff --git a/test/MC/Mips/micromips-invalid.s b/test/MC/Mips/micromips-invalid.s
index 779e66e..4321574 100644
--- a/test/MC/Mips/micromips-invalid.s
+++ b/test/MC/Mips/micromips-invalid.s
@@ -22,6 +22,15 @@
   li16  $4, -2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
   addiur2 $9, $7, -1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
   addiur2 $6, $7, 10 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  lwm16   $5, $6, $ra, 8($sp)    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: $16 or $31 expected
+  lwm16   $16, $19, $ra, 8($sp)  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: consecutive register numbers expected
+  lwm16   $16-$25, $ra, 8($sp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid register operand
+  lwm16   $16, 8($sp)            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  lwm16   $16, $17, 8($sp)       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  lwm16   $16-$20, 8($sp)        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  swm16   $5, $6, $ra, 8($sp)    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: $16 or $31 expected
+  swm16   $16, $19, $ra, 8($sp)  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: consecutive register numbers expected
+  swm16   $16-$25, $ra, 8($sp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid register operand
   lwm32   $5, $6, 8($4)    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: $16 or $31 expected
   lwm32   $16, $19, 8($4)  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: consecutive register numbers expected
   lwm32   $16-$25, 8($4)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid register operand
@@ -29,3 +38,38 @@
   swm32   $16, $19, 8($4)  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: consecutive register numbers expected
   swm32   $16-$25, 8($4)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid register operand
   lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $24, 8($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid register operand
+  addiupc $7, 16777216  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  addiupc $6, -16777220 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  addiupc $3, 3         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  lbu16 $9, 8($16) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  lhu16 $9, 4($16) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  lw16  $9, 8($17) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  sb16  $9, 4($16) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  sh16  $9, 8($17) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  sw16  $9, 4($17) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  lbu16 $3, -2($16) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  lhu16 $3, 64($16) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  lw16  $4, 68($17) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  sb16  $3, 64($16) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  sh16  $4, 68($17) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  sw16  $4, 64($17) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  lbu16 $3, -2($16) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  lhu16 $3, 64($16) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  lw16  $4, 68($17) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  sb16  $16, 4($16) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  sh16  $16, 8($17) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  sw16  $16, 4($17) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  lbu16 $16, 8($9)  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  lhu16 $16, 4($9)  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  lw16  $17, 8($10) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  sb16  $7, 4($9)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  sh16  $7, 8($9)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  sw16  $7, 4($10)  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  cache 256, 8($5)  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  pref 256, 8($5)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  beqz16 $9, 20 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  bnez16 $9, 20 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  movep   $5, $21, $2, $3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  movep   $8, $6, $2, $3  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  movep   $5, $6, $5, $3  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  movep   $5, $6, $2, $9  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/test/MC/Mips/micromips-jump-instructions.s b/test/MC/Mips/micromips-jump-instructions.s
index aed18dc..3147a3f 100644
--- a/test/MC/Mips/micromips-jump-instructions.s
+++ b/test/MC/Mips/micromips-jump-instructions.s
@@ -19,10 +19,16 @@
 # CHECK-EL: nop         # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EL: jr $7       # encoding: [0x07,0x00,0x3c,0x0f]
 # CHECK-EL: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: jalx 1328   # encoding: [0x00,0xf0,0x4c,0x01]
+# CHECK-EL: nop         # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EL: jals 1328         # encoding: [0x00,0x74,0x98,0x02]
-# CHECK-EL: move $zero, $zero # encoding: [0x00,0x0c]
+# CHECK-EL: nop               # encoding: [0x00,0x0c]
 # CHECK-EL: jalrs $ra, $6     # encoding: [0xe6,0x03,0x3c,0x4f]
-# CHECK-EL: move $zero, $zero # encoding: [0x00,0x0c]
+# CHECK-EL: nop               # encoding: [0x00,0x0c]
+# CHECK-EL: jalr $25          # encoding: [0xd9,0x45]
+# CHECK-EL: nop               # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: jalr $4, $25      # encoding: [0x99,0x00,0x3c,0x0f]
+# CHECK-EL: nop               # encoding: [0x00,0x00,0x00,0x00]
 #------------------------------------------------------------------------------
 # Big endian
 #------------------------------------------------------------------------------
@@ -36,15 +42,24 @@
 # CHECK-EB: nop         # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EB: jr $7       # encoding: [0x00,0x07,0x0f,0x3c]
 # CHECK-EB: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: jalx 1328   # encoding: [0xf0,0x00,0x01,0x4c]
+# CHECK-EB: nop         # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EB: jals 1328         # encoding: [0x74,0x00,0x02,0x98]
-# CHECK-EB: move $zero, $zero # encoding: [0x0c,0x00]
+# CHECK-EB: nop               # encoding: [0x0c,0x00]
 # CHECK-EB: jalrs $ra, $6     # encoding: [0x03,0xe6,0x4f,0x3c]
-# CHECK-EB: move $zero, $zero # encoding: [0x0c,0x00]
+# CHECK-EB: nop               # encoding: [0x0c,0x00]
+# CHECK-EB: jalr $25          # encoding: [0x45,0xd9]
+# CHECK-EB: nop               # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: jalr $4, $25      # encoding: [0x00,0x99,0x0f,0x3c]
+# CHECK-EB: nop               # encoding: [0x00,0x00,0x00,0x00]
 
      j 1328
      jal 1328
      jalr $ra, $6
      jr $7
      j $7
+     jalx 1328
      jals 1328
      jalrs $ra, $6
+     jal $25
+     jal $4, $25
diff --git a/test/MC/Mips/micromips-loadstore-instructions.s b/test/MC/Mips/micromips-loadstore-instructions.s
index 62fa101..f22719d 100644
--- a/test/MC/Mips/micromips-loadstore-instructions.s
+++ b/test/MC/Mips/micromips-loadstore-instructions.s
@@ -14,9 +14,12 @@
 # CHECK-EL: lh     $2, 8($4)                  # encoding: [0x44,0x3c,0x08,0x00]
 # CHECK-EL: lhu    $4, 8($2)                  # encoding: [0x82,0x34,0x08,0x00]
 # CHECK-EL: lw     $6, 4($5)                  # encoding: [0xc5,0xfc,0x04,0x00]
+# CHECK-EL: lw     $6, 123($sp)               # encoding: [0xdd,0xfc,0x7b,0x00]
 # CHECK-EL: sb     $5, 8($4)                  # encoding: [0xa4,0x18,0x08,0x00]
 # CHECK-EL: sh     $2, 8($4)                  # encoding: [0x44,0x38,0x08,0x00]
 # CHECK-EL: sw     $5, 4($6)                  # encoding: [0xa6,0xf8,0x04,0x00]
+# CHECK-EL: sw     $5, 123($sp)               # encoding: [0xbd,0xf8,0x7b,0x00]
+# CHECK-EL: sw     $3, 32($gp)                # encoding: [0x7c,0xf8,0x20,0x00]
 # CHECK-EL: ll     $2, 8($4)                  # encoding: [0x44,0x60,0x08,0x30]
 # CHECK-EL: sc     $2, 8($4)                  # encoding: [0x44,0x60,0x08,0xb0]
 # CHECK-EL: lwu    $2, 8($4)                  # encoding: [0x44,0x60,0x08,0xe0]
@@ -29,6 +32,18 @@
 # CHECK-EL: lwm32  $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # encoding: [0x24,0x23,0x08,0x50]
 # CHECK-EL: swm32  $16, $17, 8($4)            # encoding: [0x44,0x20,0x08,0xd0]
 # CHECK-EL: swm32  $16, $17, $18, $19, 8($4)  # encoding: [0x84,0x20,0x08,0xd0]
+# CHECK-EL: lwm16  $16, $17, $ra, 8($sp)      # encoding: [0x12,0x45]
+# CHECK-EL: swm16  $16, $17, $ra, 8($sp)      # encoding: [0x52,0x45]
+# CHECK-EL: lwm16  $16, $17, $ra, 8($sp)      # encoding: [0x12,0x45]
+# CHECK-EL: lwm32  $16, $17, $ra, 64($sp)     # encoding: [0x5d,0x22,0x40,0x50]
+# CHECK-EL: lwm32  $16, $17, $ra, 8($4)       # encoding: [0x44,0x22,0x08,0x50]
+# CHECK-EL: lwm32  $16, $17, 8($sp)           # encoding: [0x5d,0x20,0x08,0x50]
+# CHECK-EL: swm16  $16, $17, $ra, 8($sp)      # encoding: [0x52,0x45]
+# CHECK-EL: swm32  $16, $17, $ra, 64($sp)     # encoding: [0x5d,0x22,0x40,0xd0]
+# CHECK-EL: swm32  $16, $17, $ra, 8($4)       # encoding: [0x44,0x22,0x08,0xd0]
+# CHECK-EL: swm32  $16, $17, 8($sp)           # encoding: [0x5d,0x20,0x08,0xd0]
+# CHECK-EL: swp    $16, 8($4)                 # encoding: [0x04,0x22,0x08,0x90]
+# CHECK-EL: lwp    $16, 8($4)                 # encoding: [0x04,0x22,0x08,0x10]
 #------------------------------------------------------------------------------
 # Big endian
 #------------------------------------------------------------------------------
@@ -37,9 +52,12 @@
 # CHECK-EB: lh     $2, 8($4)                 # encoding: [0x3c,0x44,0x00,0x08]
 # CHECK-EB: lhu    $4, 8($2)                 # encoding: [0x34,0x82,0x00,0x08]
 # CHECK-EB: lw     $6, 4($5)                 # encoding: [0xfc,0xc5,0x00,0x04]
+# CHECK-EB: lw     $6, 123($sp)              # encoding: [0xfc,0xdd,0x00,0x7b]
 # CHECK-EB: sb     $5, 8($4)                 # encoding: [0x18,0xa4,0x00,0x08]
 # CHECK-EB: sh     $2, 8($4)                 # encoding: [0x38,0x44,0x00,0x08]
 # CHECK-EB: sw     $5, 4($6)                 # encoding: [0xf8,0xa6,0x00,0x04]
+# CHECK-EB: sw     $5, 123($sp)              # encoding: [0xf8,0xbd,0x00,0x7b]
+# CHECK-EB: sw     $3, 32($gp)               # encoding: [0xf8,0x7c,0x00,0x20]
 # CHECK-EB: ll     $2, 8($4)                 # encoding: [0x60,0x44,0x30,0x08]
 # CHECK-EB: sc     $2, 8($4)                 # encoding: [0x60,0x44,0xb0,0x08]
 # CHECK-EB: lwu    $2, 8($4)                 # encoding: [0x60,0x44,0xe0,0x08]
@@ -52,14 +70,29 @@
 # CHECK-EB: lwm32  $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # encoding: [0x23,0x24,0x50,0x08]
 # CHECK-EB: swm32  $16, $17, 8($4)           # encoding: [0x20,0x44,0xd0,0x08]
 # CHECK-EB: swm32  $16, $17, $18, $19, 8($4) # encoding: [0x20,0x84,0xd0,0x08]
+# CHECK-EB: lwm16  $16, $17, $ra, 8($sp)     # encoding: [0x45,0x12]
+# CHECK-EB: swm16  $16, $17, $ra, 8($sp)     # encoding: [0x45,0x52]
+# CHECK-EB: lwm16  $16, $17, $ra, 8($sp)     # encoding: [0x45,0x12]
+# CHECK-EB: lwm32  $16, $17, $ra, 64($sp)    # encoding: [0x22,0x5d,0x50,0x40]
+# CHECK-EB: lwm32  $16, $17, $ra, 8($4)      # encoding: [0x22,0x44,0x50,0x08]
+# CHECK-EB: lwm32  $16, $17, 8($sp)          # encoding: [0x20,0x5d,0x50,0x08]
+# CHECK-EB: swm16  $16, $17, $ra, 8($sp)     # encoding: [0x45,0x52]
+# CHECK-EB: swm32  $16, $17, $ra, 64($sp)    # encoding: [0x22,0x5d,0xd0,0x40]
+# CHECK-EB: swm32  $16, $17, $ra, 8($4)      # encoding: [0x22,0x44,0xd0,0x08]
+# CHECK-EB: swm32  $16, $17, 8($sp)          # encoding: [0x20,0x5d,0xd0,0x08]
+# CHECK-EB: swp    $16, 8($4)                # encoding: [0x22,0x04,0x90,0x08]
+# CHECK-EB: lwp    $16, 8($4)                # encoding: [0x22,0x04,0x10,0x08]
      lb     $5, 8($4)
      lbu    $6, 8($4)
      lh     $2, 8($4)
      lhu    $4, 8($2)
      lw     $6, 4($5)
+     lw     $6, 123($sp)
      sb     $5, 8($4)
      sh     $2, 8($4)
      sw     $5, 4($6)
+     sw     $5, 123($sp)
+     sw     $3, 32($gp)
      ll     $2, 8($4)
      sc     $2, 8($4)
      lwu    $2, 8($4)
@@ -72,3 +105,15 @@
      lwm32  $16-$23, $30 - $31, 8($4)
      swm32  $16, $17, 8($4)
      swm32  $16 - $19, 8($4)
+     lwm16  $16, $17, $ra, 8($sp)
+     swm16  $16, $17, $ra, 8($sp)
+     lwm    $16, $17, $ra, 8($sp)
+     lwm    $16, $17, $ra, 64($sp)
+     lwm    $16, $17, $ra, 8($4)
+     lwm    $16, $17, 8($sp)
+     swm    $16, $17, $ra, 8($sp)
+     swm    $16, $17, $ra, 64($sp)
+     swm    $16, $17, $ra, 8($4)
+     swm    $16, $17, 8($sp)
+     swp    $16, 8($4)
+     lwp    $16, 8($4)
diff --git a/test/MC/Mips/mips-abi-bad.s b/test/MC/Mips/mips-abi-bad.s
index c4653cf..ba6564f 100644
--- a/test/MC/Mips/mips-abi-bad.s
+++ b/test/MC/Mips/mips-abi-bad.s
@@ -1,20 +1,30 @@
-# Error checking for malformed abi related directives
 # RUN: not llvm-mc -triple mips-unknown-unknown %s 2>&1 | FileCheck %s
-# CHECK: .text
+
+# Error checking for malformed .module directives (and .set fp=...).
+
     .module fp=3
-# CHECK      : mips-abi-bad.s:4:16: error: unsupported option
-# CHECK-NEXT : .module fp=3
-# CHECK-NEXT :           ^
+# CHECK: :[[@LINE-1]]:17: error: unsupported value, expected 'xx', '32' or '64'
+# CHECK-NEXT: .module fp=3
+# CHECK-NEXT:             ^
 
+# FIXME: Add separate test for .set fp=xx/32/64.
     .set fp=xx,6
-# CHECK      :mips-abi-bad.s:5:15: error: unexpected token in statement
-# CHECK-NEXT :    .set fp=xx,6
-# CHECK-NEXT :              ^
+# CHECK: :[[@LINE-1]]:15: error: unexpected token, expected end of statement
+# CHECK-NEXT: .set fp=xx,6
+# CHECK-NEXT:           ^
+
+    .module
+# CHECK: :[[@LINE-1]]:12: error: expected .module option identifier
+# CHECK-NEXT: .module
+# CHECK-NEXT:        ^
+
+    .module 34
+# CHECK: :[[@LINE-1]]:13: error: expected .module option identifier
+# CHECK-NEXT: .module 34
+# CHECK-NEXT:         ^
 
-# CHECK       :.set mips16
     .set mips16
     .module fp=32
-
-# CHECK      :mips-abi-bad.s:14:13: error: .module directive must come before any code
-# CHECK-NEXT :    .module fp=32
-# CHECK-NEXT :            ^
+# CHECK: :[[@LINE-1]]:13: error: .module directive must appear before any code
+# CHECK-NEXT: .module fp=32
+# CHECK-NEXT:         ^
diff --git a/test/MC/Mips/mips-noat.s b/test/MC/Mips/mips-noat.s
index f9d4efd..3a937c7 100644
--- a/test/MC/Mips/mips-noat.s
+++ b/test/MC/Mips/mips-noat.s
@@ -7,6 +7,8 @@
 # CHECK:  lui   $1, 1
 # CHECK:  addu  $1, $1, $2
 # CHECK:  lw    $2, 0($1)
+# CHECK-LABEL: test2:
+# CHECK:  .set noat
 test1:
         lw      $2, 65536($2)
 
@@ -20,6 +22,8 @@ test2:
 # CHECK:  lui   $1, 1
 # CHECK:  addu  $1, $1, $2
 # CHECK:  lw    $2, 0($1)
+# CHECK-LABEL: test4:
+# CHECK:  .set  at=$0
 test3:
         .set at
         lw      $2, 65536($2)
diff --git a/test/MC/Mips/mips-reginfo-fp64.s b/test/MC/Mips/mips-reginfo-fp64.s
index b60e54e..ba2bf79 100644
--- a/test/MC/Mips/mips-reginfo-fp64.s
+++ b/test/MC/Mips/mips-reginfo-fp64.s
@@ -2,11 +2,11 @@
 # RUN:   llvm-readobj -s -section-data | \
 # RUN:     FileCheck %s -check-prefix=ELF32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64,-n64,+n32 -filetype=obj -o - | \
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 -target-abi n32 -filetype=obj -o - | \
 # RUN:   llvm-readobj -s -section-data | \
 # RUN:     FileCheck %s -check-prefix=ELF32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64,+n64 -filetype=obj -o - | \
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 -target-abi n64 -filetype=obj -o - | \
 # RUN:   llvm-readobj -s -section-data | \
 # RUN:     FileCheck %s -check-prefix=ELF64
 
diff --git a/test/MC/Mips/mips32r2/valid-xfail.s b/test/MC/Mips/mips32r2/valid-xfail.s
index ef02d51..13385d0 100644
--- a/test/MC/Mips/mips32r2/valid-xfail.s
+++ b/test/MC/Mips/mips32r2/valid-xfail.s
@@ -293,7 +293,6 @@
         swe             $24,94($k0)
         swle            $v1,-209($gp)
         swre            $k0,-202($s2)
-        synci           20023($s0)
         tlbginv
         tlbginvf
         tlbgp
diff --git a/test/MC/Mips/mips32r2/valid.s b/test/MC/Mips/mips32r2/valid.s
index 4ef5aab..97cfa36 100644
--- a/test/MC/Mips/mips32r2/valid.s
+++ b/test/MC/Mips/mips32r2/valid.s
@@ -233,3 +233,4 @@
         trunc.w.s $f28,$f30
         wsbh      $k1,$9
         xor       $s2,$a0,$s8
+        synci     -15842($a2)          # CHECK: synci -15842($6)        # encoding: [0x04,0xdf,0xc2,0x1e]
diff --git a/test/MC/Mips/mips32r3/abiflags.s b/test/MC/Mips/mips32r3/abiflags.s
new file mode 100644
index 0000000..a6c7057
--- /dev/null
+++ b/test/MC/Mips/mips32r3/abiflags.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r3 | \
+# RUN:   FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r3 -filetype=obj -o - | \
+# RUN:   llvm-readobj -sections -section-data -section-relocations - | \
+# RUN:     FileCheck %s -check-prefix=CHECK-OBJ
+
+# CHECK-ASM: .module fp=32
+# CHECK-ASM: .set fp=64
+
+# Checking if the Mips.abiflags were correctly emitted.
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags (12)
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00002003 01010001 00000000 00000000  |.. .............|
+# CHECK-OBJ:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
+
+        .module fp=32
+        .set fp=64
+# FIXME: Test should include gnu_attributes directive when implemented.
+#        An explicit .gnu_attribute must be checked against the effective
+#        command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/mips32r3/invalid-mips64r2.s b/test/MC/Mips/mips32r3/invalid-mips64r2.s
new file mode 100644
index 0000000..0f007e2
--- /dev/null
+++ b/test/MC/Mips/mips32r3/invalid-mips64r2.s
@@ -0,0 +1,10 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding \
+# RUN:     -mcpu=mips32r3 2>%t1
+# RUN: FileCheck %s < %t1
+
+	.set noat
+	dsbh	$v1,$t6    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+	dshd	$v0,$sp    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+
diff --git a/test/MC/Mips/mips32r3/invalid.s b/test/MC/Mips/mips32r3/invalid.s
new file mode 100644
index 0000000..f67f4c5
--- /dev/null
+++ b/test/MC/Mips/mips32r3/invalid.s
@@ -0,0 +1,10 @@
+# Instructions that are valid for the current ISA but should be rejected by the assembler (e.g.
+# invalid set of operands or operand's restrictions not met).
+
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -mcpu=mips32r3 2>%t1
+# RUN: FileCheck %s < %t1 -check-prefix=ASM
+
+        .text
+        .set noreorder
+        jalr.hb $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
+        jalr.hb $31, $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
diff --git a/test/MC/Mips/mips32r3/valid-xfail.s b/test/MC/Mips/mips32r3/valid-xfail.s
new file mode 100644
index 0000000..b0fc3a1
--- /dev/null
+++ b/test/MC/Mips/mips32r3/valid-xfail.s
@@ -0,0 +1,308 @@
+# Instructions that should be valid but currently fail for known reasons (e.g.
+# they aren't implemented yet).
+# This test is set up to XPASS if any instruction generates an encoding.
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r3 | not FileCheck %s
+# CHECK-NOT: encoding
+# XFAIL: *
+
+        .set noat
+        abs.ps          $f22,$f8
+        absq_s.ph       $8,$a0
+        absq_s.qb       $15,$s1
+        absq_s.w        $s3,$ra
+        add.ps          $f25,$f27,$f13
+        addq.ph         $s1,$15,$at
+        addq_s.ph       $s3,$s6,$s2
+        addq_s.w        $a2,$8,$at
+        addqh.ph        $s4,$14,$s1
+        addqh.w         $s7,$s7,$k1
+        addqh_r.ph      $sp,$25,$s8
+        addqh_r.w       $8,$v1,$zero
+        addsc           $s8,$15,$12
+        addu.ph         $a2,$14,$s3
+        addu.qb         $s6,$v1,$v1
+        addu_s.ph       $a3,$s3,$gp
+        addu_s.qb       $s4,$s8,$s1
+        adduh.qb        $a1,$a1,$at
+        adduh_r.qb      $a0,$9,$12
+        addwc           $k0,$s6,$s7
+        alnv.ps         $f12,$f18,$f30,$12
+        and.v           $w10,$w25,$w29
+        bitrev          $14,$at
+        bmnz.v          $w15,$w2,$w28
+        bmz.v           $w13,$w11,$w21
+        bsel.v          $w28,$w7,$w0
+        c.eq.d          $fcc1,$f15,$f15
+        c.eq.ps         $fcc5,$f0,$f9
+        c.eq.s          $fcc5,$f24,$f17
+        c.f.d           $fcc4,$f11,$f21
+        c.f.ps          $fcc6,$f11,$f11
+        c.f.s           $fcc4,$f30,$f7
+        c.le.d          $fcc4,$f18,$f1
+        c.le.ps         $fcc1,$f7,$f20
+        c.le.s          $fcc6,$f24,$f4
+        c.lt.d          $fcc3,$f9,$f3
+        c.lt.ps         $f19,$f5
+        c.lt.s          $fcc2,$f17,$f14
+        c.nge.d         $fcc5,$f21,$f16
+        c.nge.ps        $f1,$f26
+        c.nge.s         $fcc3,$f11,$f8
+        c.ngl.ps        $f21,$f30
+        c.ngl.s         $fcc2,$f31,$f23
+        c.ngle.ps       $fcc7,$f12,$f20
+        c.ngle.s        $fcc2,$f18,$f23
+        c.ngt.d         $fcc4,$f24,$f7
+        c.ngt.ps        $fcc5,$f30,$f6
+        c.ngt.s         $fcc5,$f8,$f13
+        c.ole.d         $fcc2,$f16,$f31
+        c.ole.ps        $fcc7,$f21,$f8
+        c.ole.s         $fcc3,$f7,$f20
+        c.olt.d         $fcc4,$f19,$f28
+        c.olt.ps        $fcc3,$f7,$f16
+        c.olt.s         $fcc6,$f20,$f7
+        c.seq.d         $fcc4,$f31,$f7
+        c.seq.ps        $fcc6,$f31,$f14
+        c.seq.s         $fcc7,$f1,$f25
+        c.sf.ps         $fcc6,$f4,$f6
+        c.ueq.d         $fcc4,$f13,$f25
+        c.ueq.ps        $fcc1,$f5,$f29
+        c.ueq.s         $fcc6,$f3,$f30
+        c.ule.d         $fcc7,$f25,$f18
+        c.ule.ps        $fcc6,$f17,$f3
+        c.ule.s         $fcc7,$f21,$f30
+        c.ult.d         $fcc6,$f6,$f17
+        c.ult.ps        $fcc7,$f14,$f0
+        c.ult.s         $fcc7,$f24,$f10
+        c.un.d          $fcc6,$f23,$f24
+        c.un.ps         $fcc4,$f2,$f26
+        c.un.s          $fcc1,$f30,$f4
+        ceil.l.d        $f1,$f3
+        ceil.l.s        $f18,$f13
+        cfcmsa          $s6,$19
+        cmp.eq.ph       $s7,$14
+        cmp.le.ph       $8,$14
+        cmp.lt.ph       $k0,$sp
+        cmpgdu.eq.qb    $s3,$zero,$k0
+        cmpgdu.le.qb    $v1,$15,$s2
+        cmpgdu.lt.qb    $s0,$gp,$sp
+        cmpgu.eq.qb     $14,$s6,$s8
+        cmpgu.le.qb     $9,$a3,$s4
+        cmpgu.lt.qb     $sp,$at,$8
+        cmpu.eq.qb      $v0,$24
+        cmpu.le.qb      $s1,$a1
+        cmpu.lt.qb      $at,$a3
+        ctcmsa          $31,$s7
+        cvt.d.l         $f4,$f16
+        cvt.ps.s        $f3,$f18,$f19
+        cvt.s.l         $f15,$f30
+        cvt.s.pl        $f30,$f1
+        cvt.s.pu        $f14,$f25
+        dmt $k0
+        dpa.w.ph        $ac1,$s7,$k0
+        dpaq_s.w.ph     $ac2,$a0,$13
+        dpaq_sa.l.w     $ac0,$a2,$14
+        dpaqx_s.w.ph    $ac3,$a0,$24
+        dpaqx_sa.w.ph   $ac1,$zero,$s5
+        dpau.h.qbl      $ac1,$10,$24
+        dpau.h.qbr      $ac1,$s7,$s6
+        dpax.w.ph       $ac3,$a0,$k0
+        dps.w.ph        $ac1,$a3,$a1
+        dpsq_s.w.ph     $ac0,$gp,$k0
+        dpsq_sa.l.w     $ac0,$a3,$15
+        dpsqx_s.w.ph    $ac3,$13,$a3
+        dpsqx_sa.w.ph   $ac3,$sp,$s2
+        dpsu.h.qbl      $ac2,$14,$10
+        dpsu.h.qbr      $ac2,$a1,$s6
+        dpsx.w.ph       $ac0,$s7,$gp
+        dvpe            $s6
+        emt $8
+        evpe            $v0
+        extpdpv         $s6,$ac0,$s8
+        extpv           $13,$ac0,$14
+        extrv.w         $8,$ac3,$at
+        extrv_r.w       $8,$ac1,$s6
+        extrv_rs.w      $gp,$ac1,$s6
+        extrv_s.h       $s2,$ac1,$14
+        fclass.d        $w14,$w27
+        fclass.w        $w19,$w28
+        fexupl.d        $w10,$w29
+        fexupl.w        $w12,$w27
+        fexupr.d        $w31,$w15
+        fexupr.w        $w29,$w12
+        ffint_s.d       $w1,$w30
+        ffint_s.w       $w16,$w14
+        ffint_u.d       $w23,$w18
+        ffint_u.w       $w19,$w12
+        ffql.d          $w2,$w3
+        ffql.w          $w9,$w0
+        ffqr.d          $w25,$w24
+        ffqr.w          $w10,$w6
+        fill.b          $w9,$v1
+        fill.h          $w9,$8
+        fill.w          $w31,$15
+        flog2.d         $w12,$w16
+        flog2.w         $w19,$w23
+        floor.l.d       $f26,$f7
+        floor.l.s       $f12,$f5
+        fork            $s2,$8,$a0
+        frcp.d          $w12,$w4
+        frcp.w          $w30,$w8
+        frint.d         $w20,$w8
+        frint.w         $w11,$w29
+        frsqrt.d        $w29,$w2
+        frsqrt.w        $w9,$w8
+        fsqrt.d         $w3,$w1
+        fsqrt.w         $w5,$w15
+        ftint_s.d       $w31,$w26
+        ftint_s.w       $w27,$w14
+        ftint_u.d       $w5,$w31
+        ftint_u.w       $w12,$w29
+        ftrunc_s.d      $w4,$w22
+        ftrunc_s.w      $w24,$w7
+        ftrunc_u.d      $w20,$w25
+        ftrunc_u.w      $w7,$w26
+        insv            $s2,$at
+        iret
+        lbe             $14,122($9)
+        lbue            $11,-108($10)
+        lbux            $9,$14($v0)
+        lhe             $s6,219($v1)
+        lhue            $gp,118($11)
+        lhx             $sp,$k0($15)
+        lle             $gp,-237($ra)
+        lwe             $ra,-145($14)
+        lwle            $11,-42($11)
+        lwre            $sp,-152($24)
+        lwx             $12,$12($s4)
+        madd.ps         $f22,$f3,$f14,$f3
+        maq_s.w.phl     $ac2,$25,$11
+        maq_s.w.phr     $ac0,$10,$25
+        maq_sa.w.phl    $ac3,$a1,$v1
+        maq_sa.w.phr    $ac1,$at,$10
+        mfgc0           $s6,c0_datahi1
+        mflo            $9,$ac2
+        modsub          $a3,$12,$a3
+        mov.ps          $f22,$f17
+        move.v          $w8,$w17
+        movf.ps         $f10,$f28,$fcc6
+        movn.ps         $f31,$f31,$s3
+        movt.ps         $f20,$f25,$fcc2
+        movz.ps         $f18,$f17,$ra
+        msub            $ac2,$sp,$14
+        msub.ps         $f12,$f14,$f29,$f17
+        msubu           $ac2,$a1,$24
+        mtc0            $9,c0_datahi1
+        mtgc0           $s4,$21,7
+        mthi            $v0,$ac1
+        mthlip          $a3,$ac0
+        mul.ph          $s4,$24,$s0
+        mul.ps          $f14,$f0,$f16
+        mul_s.ph        $10,$14,$15
+        muleq_s.w.phl   $11,$s4,$s4
+        muleq_s.w.phr   $s6,$a0,$s8
+        muleu_s.ph.qbl  $a2,$14,$8
+        muleu_s.ph.qbr  $a1,$ra,$9
+        mulq_rs.ph      $s2,$14,$15
+        mulq_rs.w       $at,$s4,$25
+        mulq_s.ph       $s0,$k1,$15
+        mulq_s.w        $9,$a3,$s0
+        mulsa.w.ph      $ac1,$s4,$s6
+        mulsaq_s.w.ph   $ac0,$ra,$s2
+        neg.ps          $f19,$f13
+        nloc.b          $w12,$w30
+        nloc.d          $w16,$w7
+        nloc.h          $w21,$w17
+        nloc.w          $w17,$w16
+        nlzc.b          $w12,$w7
+        nlzc.d          $w14,$w14
+        nlzc.h          $w24,$w24
+        nlzc.w          $w10,$w4
+        nmadd.ps        $f27,$f4,$f9,$f25
+        nmsub.ps        $f6,$f12,$f14,$f17
+        nor.v           $w20,$w20,$w15
+        or.v            $w13,$w23,$w12
+        packrl.ph       $ra,$24,$14
+        pcnt.b          $w30,$w15
+        pcnt.d          $w5,$w16
+        pcnt.h          $w20,$w24
+        pcnt.w          $w22,$w20
+        pick.ph         $ra,$a2,$gp
+        pick.qb         $11,$a0,$gp
+        pll.ps          $f25,$f9,$f30
+        plu.ps          $f1,$f26,$f29
+        preceq.w.phl    $s8,$gp
+        preceq.w.phr    $s5,$15
+        precequ.ph.qbl  $s7,$ra
+        precequ.ph.qbla $a0,$9
+        precequ.ph.qbr  $ra,$s3
+        precequ.ph.qbra $24,$8
+        preceu.ph.qbl   $sp,$8
+        preceu.ph.qbla  $s6,$11
+        preceu.ph.qbr   $gp,$s1
+        preceu.ph.qbra  $k1,$s0
+        precr.qb.ph     $v0,$12,$s8
+        precrq.ph.w     $14,$s8,$24
+        precrq.qb.ph    $a2,$12,$12
+        precrq_rs.ph.w  $a1,$k0,$a3
+        precrqu_s.qb.ph $zero,$gp,$s5
+        pul.ps          $f9,$f30,$f26
+        puu.ps          $f24,$f9,$f2
+        raddu.w.qb      $25,$s3
+        rdpgpr          $s3,$9
+        recip.d         $f19,$f6
+        recip.s         $f3,$f30
+        repl.ph         $at,-307
+        replv.ph        $v1,$s7
+        replv.qb        $25,$12
+        rorv            $13,$a3,$s5
+        round.l.d       $f12,$f1
+        round.l.s       $f25,$f5
+        rsqrt.d         $f3,$f28
+        rsqrt.s         $f4,$f8
+        sbe             $s7,33($s1)
+        sce             $sp,189($10)
+        she             $24,105($v0)
+        shilo           $ac1,26
+        shilov          $ac2,$10
+        shllv.ph        $10,$s0,$s0
+        shllv.qb        $gp,$v1,$zero
+        shllv_s.ph      $k1,$at,$13
+        shllv_s.w       $s1,$ra,$k0
+        shrav.ph        $25,$s2,$s1
+        shrav.qb        $zero,$24,$11
+        shrav_r.ph      $s3,$11,$25
+        shrav_r.qb      $a0,$sp,$s5
+        shrav_r.w       $s7,$s4,$s6
+        shrlv.ph        $14,$10,$9
+        shrlv.qb        $a2,$s2,$11
+        sub.ps          $f5,$f14,$f26
+        subq.ph         $ra,$9,$s8
+        subq_s.ph       $13,$s8,$s5
+        subq_s.w        $k1,$a2,$a3
+        subqh.ph        $10,$at,$9
+        subqh.w         $v0,$a2,$zero
+        subqh_r.ph      $a0,$12,$s6
+        subqh_r.w       $10,$a2,$gp
+        subu.ph         $9,$s6,$s4
+        subu.qb         $s6,$a2,$s6
+        subu_s.ph       $v1,$a1,$s3
+        subu_s.qb       $s1,$at,$ra
+        subuh.qb        $zero,$gp,$gp
+        subuh_r.qb      $s4,$s8,$s6
+        swe             $24,94($k0)
+        swle            $v1,-209($gp)
+        swre            $k0,-202($s2)
+        tlbginv
+        tlbginvf
+        tlbgp
+        tlbgr
+        tlbgwi
+        tlbgwr
+        tlbinv
+        tlbinvf
+        trunc.l.d       $f23,$f23
+        trunc.l.s       $f28,$f31
+        wrpgpr          $zero,$13
+        xor.v           $w20,$w21,$w30
+        yield           $v1,$s0
diff --git a/test/MC/Mips/mips32r3/valid.s b/test/MC/Mips/mips32r3/valid.s
new file mode 100644
index 0000000..4280de5
--- /dev/null
+++ b/test/MC/Mips/mips32r3/valid.s
@@ -0,0 +1,236 @@
+# Instructions that are valid
+#
+# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r3 | FileCheck %s
+
+        .set noat
+        abs.d     $f7,$f25             # CHECK: encoding:
+        abs.s     $f9,$f16
+        add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
+        add.d     $f1,$f7,$f29
+        add.s     $f8,$f21,$f24
+        addi      $13,$9,26322
+        addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
+        addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
+        and       $s7,$v0,$12
+        and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+        bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1f      $fcc1, 4             # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
+        bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,4688           # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     4688                 # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     $fcc7,27             # CHECK: bc1fl $fcc7, 27 # encoding: [0x45,0x1e,0x00,0x06]
+        bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1t      $fcc1, 4             # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
+        bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,4688           # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     4688                 # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     $fcc7,27             # CHECK: bc1tl $fcc7, 27 # encoding: [0x45,0x1f,0x00,0x06]
+        bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
+        bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
+        bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+        bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
+        cache     1, 8($5)             # CHECK: cache 1, 8($5)   # encoding: [0xbc,0xa1,0x00,0x08]
+        c.ngl.d   $f29,$f29
+        c.ngle.d  $f0,$f16
+        c.sf.d    $f30,$f0
+        c.sf.s    $f14,$f22
+        ceil.w.d  $f11,$f25
+        ceil.w.s  $f6,$f20
+        cfc1      $s1,$21
+        clo       $11,$a1              # CHECK: clo $11, $5   # encoding: [0x70,0xab,0x58,0x21]
+        clz       $sp,$gp              # CHECK: clz $sp, $gp  # encoding: [0x73,0x9d,0xe8,0x20]
+        ctc1      $a2,$26
+        cvt.d.s   $f22,$f28
+        cvt.d.w   $f26,$f11
+        cvt.l.d   $f24,$f15
+        cvt.l.s   $f11,$f29
+        cvt.s.d   $f26,$f8
+        cvt.s.w   $f22,$f15
+        cvt.w.d   $f20,$f14
+        cvt.w.s   $f20,$f24
+        deret
+        di        $s8                  # CHECK: di  $fp       # encoding: [0x41,0x7e,0x60,0x00]
+        di                             # CHECK: di            # encoding: [0x41,0x60,0x60,0x00]
+        div       $zero,$25,$11
+        div.d     $f29,$f20,$f27
+        div.s     $f4,$f5,$f15
+        divu      $zero,$25,$15
+        ehb                            # CHECK: ehb # encoding:  [0x00,0x00,0x00,0xc0]
+        ei        $14                  # CHECK: ei  $14       # encoding: [0x41,0x6e,0x60,0x20]
+        ei                             # CHECK: ei            # encoding: [0x41,0x60,0x60,0x20]
+        eret
+        floor.w.d $f14,$f11
+        floor.w.s $f8,$f9
+        jr.hb     $4                   # CHECK: jr.hb  $4 # encoding: [0x00,0x80,0x04,0x08]
+        jalr.hb   $4                   # CHECK: jalr.hb  $4 # encoding: [0x00,0x80,0xfc,0x09]
+        jalr.hb   $4, $5               # CHECK: jalr.hb  $4, $5 # encoding: [0x00,0xa0,0x24,0x09]
+        lb        $24,-14515($10)
+        lbu       $8,30195($v1)
+        ldc1      $f11,16391($s0)
+        ldc2      $8,-21181($at)        # CHECK: ldc2 $8, -21181($1)   # encoding: [0xd8,0x28,0xad,0x43]
+        ldxc1     $f8,$s7($15)
+        lh        $11,-8556($s5)
+        lhu       $s3,-22851($v0)
+        li        $at,-29773
+        li        $zero,-29889
+        ll        $v0,-7321($s2)       # CHECK: ll $2, -7321($18)     # encoding: [0xc2,0x42,0xe3,0x67]
+        luxc1     $f19,$s6($s5)
+        lw        $8,5674($a1)
+        lwc1      $f16,10225($k0)
+        lwc2      $18,-841($a2)        # CHECK: lwc2 $18, -841($6)     # encoding: [0xc8,0xd2,0xfc,0xb7]
+        lwl       $s4,-4231($15)
+        lwr       $zero,-19147($gp)
+        lwxc1     $f12,$s1($s8)
+        madd      $s6,$13
+        madd      $zero,$9
+        madd.d    $f18,$f19,$f26,$f20
+        madd.s    $f1,$f31,$f19,$f25
+        maddu     $s3,$gp
+        maddu     $24,$s2
+        mfc0      $a2,$14,1
+        mfc1      $a3,$f27
+        mfhc1     $s8,$f24
+        mfhi      $s3
+        mfhi      $sp
+        mflo      $s1
+        mov.d     $f20,$f14
+        mov.s     $f2,$f27
+        move      $s8,$a0
+        move      $25,$a2
+        movf      $gp,$8,$fcc7
+        movf.d    $f6,$f11,$fcc5
+        movf.s    $f23,$f5,$fcc6
+        movn      $v1,$s1,$s0
+        movn.d    $f27,$f21,$k0
+        movn.s    $f12,$f0,$s7
+        movt      $zero,$s4,$fcc5
+        movt.d    $f0,$f2,$fcc0
+        movt.s    $f30,$f2,$fcc1
+        movz      $a1,$s6,$9
+        movz.d    $f12,$f29,$9
+        movz.s    $f25,$f7,$v1
+        msub      $s7,$k1
+        msub.d    $f10,$f1,$f31,$f18
+        msub.s    $f12,$f19,$f10,$f16
+        msubu     $15,$a1
+        mtc0      $9,$29,3
+        mtc1      $s8,$f9
+        mthc1     $zero,$f16
+        mthi      $s1
+        mtlo      $sp
+        mtlo      $25
+        mul       $s0,$s4,$at
+        mul.d     $f20,$f20,$f16
+        mul.s     $f30,$f10,$f2
+        mult      $sp,$s4
+        mult      $sp,$v0
+        multu     $gp,$k0
+        multu     $9,$s2
+        negu      $2                   # CHECK: negu $2, $2            # encoding: [0x00,0x02,0x10,0x23]
+        negu      $2,$3                # CHECK: negu $2, $3            # encoding: [0x00,0x03,0x10,0x23]
+        neg.d     $f27,$f18
+        neg.s     $f1,$f15
+        nmadd.d   $f18,$f9,$f14,$f19
+        nmadd.s   $f0,$f5,$f25,$f12
+        nmsub.d   $f30,$f8,$f16,$f30
+        nmsub.s   $f1,$f24,$f19,$f4
+        nop
+        nor       $a3,$zero,$a3
+        or        $12,$s0,$sp
+        or        $2, 4                # CHECK: ori $2, $2, 4           # encoding: [0x34,0x42,0x00,0x04]
+        pause                          # CHECK: pause # encoding:  [0x00,0x00,0x01,0x40]
+        pref      1, 8($5)             # CHECK: pref 1, 8($5)           # encoding: [0xcc,0xa1,0x00,0x08]
+        # FIXME: Use the code generator in order to print the .set directives
+        #        instead of the instruction printer.
+        rdhwr     $sp,$11              # CHECK:      .set  push
+                                       # CHECK-NEXT: .set  mips32r2
+                                       # CHECK-NEXT: rdhwr $sp, $11
+                                       # CHECK-NEXT: .set  pop          # encoding: [0x7c,0x1d,0x58,0x3b]
+        rotr      $1,15                # CHECK: rotr $1, $1, 15         # encoding: [0x00,0x21,0x0b,0xc2]
+        rotr      $1,$14,15            # CHECK: rotr $1, $14, 15        # encoding: [0x00,0x2e,0x0b,0xc2]
+        rotrv     $1,$14,$15           # CHECK: rotrv $1, $14, $15      # encoding: [0x01,0xee,0x08,0x46]
+        round.w.d $f6,$f4
+        round.w.s $f27,$f28
+        sb        $s6,-19857($14)
+        sc        $15,18904($s3)       # CHECK: sc $15, 18904($19)     # encoding: [0xe2,0x6f,0x49,0xd8]
+        sdbbp                          # CHECK: sdbbp                  # encoding: [0x70,0x00,0x00,0x3f]
+        sdbbp     34                   # CHECK: sdbbp 34               # encoding: [0x70,0x00,0x08,0xbf]
+        sdc1      $f31,30574($13)
+        sdc2      $20,23157($s2)       # CHECK: sdc2 $20, 23157($18)   # encoding: [0xfa,0x54,0x5a,0x75]
+        sdxc1     $f11,$10($14)
+        seb       $25,$15
+        seh       $v1,$12
+        sh        $14,-6704($15)
+        sll       $a3,18               # CHECK: sll $7, $7, 18         # encoding: [0x00,0x07,0x3c,0x80]
+        sll       $a3,$zero,18         # CHECK: sll $7, $zero, 18      # encoding: [0x00,0x00,0x3c,0x80]
+        sll       $a3,$zero,$9         # CHECK: sllv $7, $zero, $9     # encoding: [0x01,0x20,0x38,0x04]
+        sllv      $a3,$zero,$9         # CHECK: sllv $7, $zero, $9     # encoding: [0x01,0x20,0x38,0x04]
+        slt       $s7,$11,$k1          # CHECK: slt $23, $11, $27      # encoding: [0x01,0x7b,0xb8,0x2a]
+        slti      $s1,$10,9489         # CHECK: slti $17, $10, 9489    # encoding: [0x29,0x51,0x25,0x11]
+        sltiu     $25,$25,-15531       # CHECK: sltiu $25, $25, -15531 # encoding: [0x2f,0x39,0xc3,0x55]
+        sltu      $s4,$s5,$11          # CHECK: sltu  $20, $21, $11    # encoding: [0x02,0xab,0xa0,0x2b]
+        sltu      $24,$25,-15531       # CHECK: sltiu $24, $25, -15531 # encoding: [0x2f,0x38,0xc3,0x55]
+        sqrt.d    $f17,$f22
+        sqrt.s    $f0,$f1
+        sra       $s1,15               # CHECK: sra $17, $17, 15       # encoding: [0x00,0x11,0x8b,0xc3]
+        sra       $s1,$s7,15           # CHECK: sra $17, $23, 15       # encoding: [0x00,0x17,0x8b,0xc3]
+        sra       $s1,$s7,$sp          # CHECK: srav $17, $23, $sp     # encoding: [0x03,0xb7,0x88,0x07]
+        srav      $s1,$s7,$sp          # CHECK: srav $17, $23, $sp     # encoding: [0x03,0xb7,0x88,0x07]
+        srl       $2,7                 # CHECK: srl $2, $2, 7          # encoding: [0x00,0x02,0x11,0xc2]
+        srl       $2,$2,7              # CHECK: srl $2, $2, 7          # encoding: [0x00,0x02,0x11,0xc2]
+        srl       $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
+        srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
+        ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
+        sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
+        sub.d     $f18,$f3,$f17
+        sub.s     $f23,$f22,$f22
+        subu      $sp,$s6,$s6
+        suxc1     $f12,$k1($13)
+        sw        $ra,-10160($sp)
+        swc1      $f6,-8465($24)
+        swc2      $25,24880($s0)       # CHECK: swc2 $25, 24880($16)   # encoding: [0xea,0x19,0x61,0x30]
+        swl       $15,13694($s3)
+        swr       $s1,-26590($14)
+        swxc1     $f19,$12($k0)
+        sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
+        sync      1                    # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
+        teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
+        tgei      $s1,5025
+        tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
+        tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
+        tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
+        tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
+        tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
+        tlti      $14,-21059
+        tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
+        tnei      $12,-29647
+        trunc.w.d $f22,$f15
+        trunc.w.s $f28,$f30
+        wsbh      $k1,$9
+        xor       $s2,$a0,$s8
+        synci     -15842($a2)          # CHECK: synci -15842($6)        # encoding: [0x04,0xdf,0xc2,0x1e]
diff --git a/test/MC/Mips/mips32r5/abiflags.s b/test/MC/Mips/mips32r5/abiflags.s
new file mode 100644
index 0000000..fa5befe
--- /dev/null
+++ b/test/MC/Mips/mips32r5/abiflags.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r5 | \
+# RUN:   FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r5 -filetype=obj -o - | \
+# RUN:   llvm-readobj -sections -section-data -section-relocations - | \
+# RUN:     FileCheck %s -check-prefix=CHECK-OBJ
+
+# CHECK-ASM: .module fp=32
+# CHECK-ASM: .set fp=64
+
+# Checking if the Mips.abiflags were correctly emitted.
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags (12)
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00002005 01010001 00000000 00000000  |.. .............|
+# CHECK-OBJ:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
+
+        .module fp=32
+        .set fp=64
+# FIXME: Test should include gnu_attributes directive when implemented.
+#        An explicit .gnu_attribute must be checked against the effective
+#        command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/mips32r5/invalid-mips64r2.s b/test/MC/Mips/mips32r5/invalid-mips64r2.s
new file mode 100644
index 0000000..ed8f5c7
--- /dev/null
+++ b/test/MC/Mips/mips32r5/invalid-mips64r2.s
@@ -0,0 +1,10 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding \
+# RUN:     -mcpu=mips32r5 2>%t1
+# RUN: FileCheck %s < %t1
+
+	.set noat
+	dsbh	$v1,$t6    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+	dshd	$v0,$sp    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+
diff --git a/test/MC/Mips/mips32r5/invalid.s b/test/MC/Mips/mips32r5/invalid.s
new file mode 100644
index 0000000..fec30e1
--- /dev/null
+++ b/test/MC/Mips/mips32r5/invalid.s
@@ -0,0 +1,10 @@
+# Instructions that are valid for the current ISA but should be rejected by the assembler (e.g.
+# invalid set of operands or operand's restrictions not met).
+
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -mcpu=mips32r5 2>%t1
+# RUN: FileCheck %s < %t1 -check-prefix=ASM
+
+        .text
+        .set noreorder
+        jalr.hb $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
+        jalr.hb $31, $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
diff --git a/test/MC/Mips/mips32r5/valid-xfail.s b/test/MC/Mips/mips32r5/valid-xfail.s
new file mode 100644
index 0000000..a821ddd
--- /dev/null
+++ b/test/MC/Mips/mips32r5/valid-xfail.s
@@ -0,0 +1,308 @@
+# Instructions that should be valid but currently fail for known reasons (e.g.
+# they aren't implemented yet).
+# This test is set up to XPASS if any instruction generates an encoding.
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r5 | not FileCheck %s
+# CHECK-NOT: encoding
+# XFAIL: *
+
+        .set noat
+        abs.ps          $f22,$f8
+        absq_s.ph       $8,$a0
+        absq_s.qb       $15,$s1
+        absq_s.w        $s3,$ra
+        add.ps          $f25,$f27,$f13
+        addq.ph         $s1,$15,$at
+        addq_s.ph       $s3,$s6,$s2
+        addq_s.w        $a2,$8,$at
+        addqh.ph        $s4,$14,$s1
+        addqh.w         $s7,$s7,$k1
+        addqh_r.ph      $sp,$25,$s8
+        addqh_r.w       $8,$v1,$zero
+        addsc           $s8,$15,$12
+        addu.ph         $a2,$14,$s3
+        addu.qb         $s6,$v1,$v1
+        addu_s.ph       $a3,$s3,$gp
+        addu_s.qb       $s4,$s8,$s1
+        adduh.qb        $a1,$a1,$at
+        adduh_r.qb      $a0,$9,$12
+        addwc           $k0,$s6,$s7
+        alnv.ps         $f12,$f18,$f30,$12
+        and.v           $w10,$w25,$w29
+        bitrev          $14,$at
+        bmnz.v          $w15,$w2,$w28
+        bmz.v           $w13,$w11,$w21
+        bsel.v          $w28,$w7,$w0
+        c.eq.d          $fcc1,$f15,$f15
+        c.eq.ps         $fcc5,$f0,$f9
+        c.eq.s          $fcc5,$f24,$f17
+        c.f.d           $fcc4,$f11,$f21
+        c.f.ps          $fcc6,$f11,$f11
+        c.f.s           $fcc4,$f30,$f7
+        c.le.d          $fcc4,$f18,$f1
+        c.le.ps         $fcc1,$f7,$f20
+        c.le.s          $fcc6,$f24,$f4
+        c.lt.d          $fcc3,$f9,$f3
+        c.lt.ps         $f19,$f5
+        c.lt.s          $fcc2,$f17,$f14
+        c.nge.d         $fcc5,$f21,$f16
+        c.nge.ps        $f1,$f26
+        c.nge.s         $fcc3,$f11,$f8
+        c.ngl.ps        $f21,$f30
+        c.ngl.s         $fcc2,$f31,$f23
+        c.ngle.ps       $fcc7,$f12,$f20
+        c.ngle.s        $fcc2,$f18,$f23
+        c.ngt.d         $fcc4,$f24,$f7
+        c.ngt.ps        $fcc5,$f30,$f6
+        c.ngt.s         $fcc5,$f8,$f13
+        c.ole.d         $fcc2,$f16,$f31
+        c.ole.ps        $fcc7,$f21,$f8
+        c.ole.s         $fcc3,$f7,$f20
+        c.olt.d         $fcc4,$f19,$f28
+        c.olt.ps        $fcc3,$f7,$f16
+        c.olt.s         $fcc6,$f20,$f7
+        c.seq.d         $fcc4,$f31,$f7
+        c.seq.ps        $fcc6,$f31,$f14
+        c.seq.s         $fcc7,$f1,$f25
+        c.sf.ps         $fcc6,$f4,$f6
+        c.ueq.d         $fcc4,$f13,$f25
+        c.ueq.ps        $fcc1,$f5,$f29
+        c.ueq.s         $fcc6,$f3,$f30
+        c.ule.d         $fcc7,$f25,$f18
+        c.ule.ps        $fcc6,$f17,$f3
+        c.ule.s         $fcc7,$f21,$f30
+        c.ult.d         $fcc6,$f6,$f17
+        c.ult.ps        $fcc7,$f14,$f0
+        c.ult.s         $fcc7,$f24,$f10
+        c.un.d          $fcc6,$f23,$f24
+        c.un.ps         $fcc4,$f2,$f26
+        c.un.s          $fcc1,$f30,$f4
+        ceil.l.d        $f1,$f3
+        ceil.l.s        $f18,$f13
+        cfcmsa          $s6,$19
+        cmp.eq.ph       $s7,$14
+        cmp.le.ph       $8,$14
+        cmp.lt.ph       $k0,$sp
+        cmpgdu.eq.qb    $s3,$zero,$k0
+        cmpgdu.le.qb    $v1,$15,$s2
+        cmpgdu.lt.qb    $s0,$gp,$sp
+        cmpgu.eq.qb     $14,$s6,$s8
+        cmpgu.le.qb     $9,$a3,$s4
+        cmpgu.lt.qb     $sp,$at,$8
+        cmpu.eq.qb      $v0,$24
+        cmpu.le.qb      $s1,$a1
+        cmpu.lt.qb      $at,$a3
+        ctcmsa          $31,$s7
+        cvt.d.l         $f4,$f16
+        cvt.ps.s        $f3,$f18,$f19
+        cvt.s.l         $f15,$f30
+        cvt.s.pl        $f30,$f1
+        cvt.s.pu        $f14,$f25
+        dmt $k0
+        dpa.w.ph        $ac1,$s7,$k0
+        dpaq_s.w.ph     $ac2,$a0,$13
+        dpaq_sa.l.w     $ac0,$a2,$14
+        dpaqx_s.w.ph    $ac3,$a0,$24
+        dpaqx_sa.w.ph   $ac1,$zero,$s5
+        dpau.h.qbl      $ac1,$10,$24
+        dpau.h.qbr      $ac1,$s7,$s6
+        dpax.w.ph       $ac3,$a0,$k0
+        dps.w.ph        $ac1,$a3,$a1
+        dpsq_s.w.ph     $ac0,$gp,$k0
+        dpsq_sa.l.w     $ac0,$a3,$15
+        dpsqx_s.w.ph    $ac3,$13,$a3
+        dpsqx_sa.w.ph   $ac3,$sp,$s2
+        dpsu.h.qbl      $ac2,$14,$10
+        dpsu.h.qbr      $ac2,$a1,$s6
+        dpsx.w.ph       $ac0,$s7,$gp
+        dvpe            $s6
+        emt $8
+        evpe            $v0
+        extpdpv         $s6,$ac0,$s8
+        extpv           $13,$ac0,$14
+        extrv.w         $8,$ac3,$at
+        extrv_r.w       $8,$ac1,$s6
+        extrv_rs.w      $gp,$ac1,$s6
+        extrv_s.h       $s2,$ac1,$14
+        fclass.d        $w14,$w27
+        fclass.w        $w19,$w28
+        fexupl.d        $w10,$w29
+        fexupl.w        $w12,$w27
+        fexupr.d        $w31,$w15
+        fexupr.w        $w29,$w12
+        ffint_s.d       $w1,$w30
+        ffint_s.w       $w16,$w14
+        ffint_u.d       $w23,$w18
+        ffint_u.w       $w19,$w12
+        ffql.d          $w2,$w3
+        ffql.w          $w9,$w0
+        ffqr.d          $w25,$w24
+        ffqr.w          $w10,$w6
+        fill.b          $w9,$v1
+        fill.h          $w9,$8
+        fill.w          $w31,$15
+        flog2.d         $w12,$w16
+        flog2.w         $w19,$w23
+        floor.l.d       $f26,$f7
+        floor.l.s       $f12,$f5
+        fork            $s2,$8,$a0
+        frcp.d          $w12,$w4
+        frcp.w          $w30,$w8
+        frint.d         $w20,$w8
+        frint.w         $w11,$w29
+        frsqrt.d        $w29,$w2
+        frsqrt.w        $w9,$w8
+        fsqrt.d         $w3,$w1
+        fsqrt.w         $w5,$w15
+        ftint_s.d       $w31,$w26
+        ftint_s.w       $w27,$w14
+        ftint_u.d       $w5,$w31
+        ftint_u.w       $w12,$w29
+        ftrunc_s.d      $w4,$w22
+        ftrunc_s.w      $w24,$w7
+        ftrunc_u.d      $w20,$w25
+        ftrunc_u.w      $w7,$w26
+        insv            $s2,$at
+        iret
+        lbe             $14,122($9)
+        lbue            $11,-108($10)
+        lbux            $9,$14($v0)
+        lhe             $s6,219($v1)
+        lhue            $gp,118($11)
+        lhx             $sp,$k0($15)
+        lle             $gp,-237($ra)
+        lwe             $ra,-145($14)
+        lwle            $11,-42($11)
+        lwre            $sp,-152($24)
+        lwx             $12,$12($s4)
+        madd.ps         $f22,$f3,$f14,$f3
+        maq_s.w.phl     $ac2,$25,$11
+        maq_s.w.phr     $ac0,$10,$25
+        maq_sa.w.phl    $ac3,$a1,$v1
+        maq_sa.w.phr    $ac1,$at,$10
+        mfgc0           $s6,c0_datahi1
+        mflo            $9,$ac2
+        modsub          $a3,$12,$a3
+        mov.ps          $f22,$f17
+        move.v          $w8,$w17
+        movf.ps         $f10,$f28,$fcc6
+        movn.ps         $f31,$f31,$s3
+        movt.ps         $f20,$f25,$fcc2
+        movz.ps         $f18,$f17,$ra
+        msub            $ac2,$sp,$14
+        msub.ps         $f12,$f14,$f29,$f17
+        msubu           $ac2,$a1,$24
+        mtc0            $9,c0_datahi1
+        mtgc0           $s4,$21,7
+        mthi            $v0,$ac1
+        mthlip          $a3,$ac0
+        mul.ph          $s4,$24,$s0
+        mul.ps          $f14,$f0,$f16
+        mul_s.ph        $10,$14,$15
+        muleq_s.w.phl   $11,$s4,$s4
+        muleq_s.w.phr   $s6,$a0,$s8
+        muleu_s.ph.qbl  $a2,$14,$8
+        muleu_s.ph.qbr  $a1,$ra,$9
+        mulq_rs.ph      $s2,$14,$15
+        mulq_rs.w       $at,$s4,$25
+        mulq_s.ph       $s0,$k1,$15
+        mulq_s.w        $9,$a3,$s0
+        mulsa.w.ph      $ac1,$s4,$s6
+        mulsaq_s.w.ph   $ac0,$ra,$s2
+        neg.ps          $f19,$f13
+        nloc.b          $w12,$w30
+        nloc.d          $w16,$w7
+        nloc.h          $w21,$w17
+        nloc.w          $w17,$w16
+        nlzc.b          $w12,$w7
+        nlzc.d          $w14,$w14
+        nlzc.h          $w24,$w24
+        nlzc.w          $w10,$w4
+        nmadd.ps        $f27,$f4,$f9,$f25
+        nmsub.ps        $f6,$f12,$f14,$f17
+        nor.v           $w20,$w20,$w15
+        or.v            $w13,$w23,$w12
+        packrl.ph       $ra,$24,$14
+        pcnt.b          $w30,$w15
+        pcnt.d          $w5,$w16
+        pcnt.h          $w20,$w24
+        pcnt.w          $w22,$w20
+        pick.ph         $ra,$a2,$gp
+        pick.qb         $11,$a0,$gp
+        pll.ps          $f25,$f9,$f30
+        plu.ps          $f1,$f26,$f29
+        preceq.w.phl    $s8,$gp
+        preceq.w.phr    $s5,$15
+        precequ.ph.qbl  $s7,$ra
+        precequ.ph.qbla $a0,$9
+        precequ.ph.qbr  $ra,$s3
+        precequ.ph.qbra $24,$8
+        preceu.ph.qbl   $sp,$8
+        preceu.ph.qbla  $s6,$11
+        preceu.ph.qbr   $gp,$s1
+        preceu.ph.qbra  $k1,$s0
+        precr.qb.ph     $v0,$12,$s8
+        precrq.ph.w     $14,$s8,$24
+        precrq.qb.ph    $a2,$12,$12
+        precrq_rs.ph.w  $a1,$k0,$a3
+        precrqu_s.qb.ph $zero,$gp,$s5
+        pul.ps          $f9,$f30,$f26
+        puu.ps          $f24,$f9,$f2
+        raddu.w.qb      $25,$s3
+        rdpgpr          $s3,$9
+        recip.d         $f19,$f6
+        recip.s         $f3,$f30
+        repl.ph         $at,-307
+        replv.ph        $v1,$s7
+        replv.qb        $25,$12
+        rorv            $13,$a3,$s5
+        round.l.d       $f12,$f1
+        round.l.s       $f25,$f5
+        rsqrt.d         $f3,$f28
+        rsqrt.s         $f4,$f8
+        sbe             $s7,33($s1)
+        sce             $sp,189($10)
+        she             $24,105($v0)
+        shilo           $ac1,26
+        shilov          $ac2,$10
+        shllv.ph        $10,$s0,$s0
+        shllv.qb        $gp,$v1,$zero
+        shllv_s.ph      $k1,$at,$13
+        shllv_s.w       $s1,$ra,$k0
+        shrav.ph        $25,$s2,$s1
+        shrav.qb        $zero,$24,$11
+        shrav_r.ph      $s3,$11,$25
+        shrav_r.qb      $a0,$sp,$s5
+        shrav_r.w       $s7,$s4,$s6
+        shrlv.ph        $14,$10,$9
+        shrlv.qb        $a2,$s2,$11
+        sub.ps          $f5,$f14,$f26
+        subq.ph         $ra,$9,$s8
+        subq_s.ph       $13,$s8,$s5
+        subq_s.w        $k1,$a2,$a3
+        subqh.ph        $10,$at,$9
+        subqh.w         $v0,$a2,$zero
+        subqh_r.ph      $a0,$12,$s6
+        subqh_r.w       $10,$a2,$gp
+        subu.ph         $9,$s6,$s4
+        subu.qb         $s6,$a2,$s6
+        subu_s.ph       $v1,$a1,$s3
+        subu_s.qb       $s1,$at,$ra
+        subuh.qb        $zero,$gp,$gp
+        subuh_r.qb      $s4,$s8,$s6
+        swe             $24,94($k0)
+        swle            $v1,-209($gp)
+        swre            $k0,-202($s2)
+        tlbginv
+        tlbginvf
+        tlbgp
+        tlbgr
+        tlbgwi
+        tlbgwr
+        tlbinv
+        tlbinvf
+        trunc.l.d       $f23,$f23
+        trunc.l.s       $f28,$f31
+        wrpgpr          $zero,$13
+        xor.v           $w20,$w21,$w30
+        yield           $v1,$s0
diff --git a/test/MC/Mips/mips32r5/valid.s b/test/MC/Mips/mips32r5/valid.s
new file mode 100644
index 0000000..13341d5
--- /dev/null
+++ b/test/MC/Mips/mips32r5/valid.s
@@ -0,0 +1,236 @@
+# Instructions that are valid
+#
+# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r5 | FileCheck %s
+
+        .set noat
+        abs.d     $f7,$f25             # CHECK: encoding:
+        abs.s     $f9,$f16
+        add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
+        add.d     $f1,$f7,$f29
+        add.s     $f8,$f21,$f24
+        addi      $13,$9,26322
+        addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
+        addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
+        and       $s7,$v0,$12
+        and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+        bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1f      $fcc1, 4             # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
+        bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,4688           # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     4688                 # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     $fcc7,27             # CHECK: bc1fl $fcc7, 27 # encoding: [0x45,0x1e,0x00,0x06]
+        bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1t      $fcc1, 4             # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
+        bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,4688           # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     4688                 # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     $fcc7,27             # CHECK: bc1tl $fcc7, 27 # encoding: [0x45,0x1f,0x00,0x06]
+        bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
+        bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
+        bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+        bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
+        cache     1, 8($5)             # CHECK: cache 1, 8($5)   # encoding: [0xbc,0xa1,0x00,0x08]
+        c.ngl.d   $f29,$f29
+        c.ngle.d  $f0,$f16
+        c.sf.d    $f30,$f0
+        c.sf.s    $f14,$f22
+        ceil.w.d  $f11,$f25
+        ceil.w.s  $f6,$f20
+        cfc1      $s1,$21
+        clo       $11,$a1              # CHECK: clo $11, $5   # encoding: [0x70,0xab,0x58,0x21]
+        clz       $sp,$gp              # CHECK: clz $sp, $gp  # encoding: [0x73,0x9d,0xe8,0x20]
+        ctc1      $a2,$26
+        cvt.d.s   $f22,$f28
+        cvt.d.w   $f26,$f11
+        cvt.l.d   $f24,$f15
+        cvt.l.s   $f11,$f29
+        cvt.s.d   $f26,$f8
+        cvt.s.w   $f22,$f15
+        cvt.w.d   $f20,$f14
+        cvt.w.s   $f20,$f24
+        deret
+        di        $s8                  # CHECK: di  $fp       # encoding: [0x41,0x7e,0x60,0x00]
+        di                             # CHECK: di            # encoding: [0x41,0x60,0x60,0x00]
+        div       $zero,$25,$11
+        div.d     $f29,$f20,$f27
+        div.s     $f4,$f5,$f15
+        divu      $zero,$25,$15
+        ehb                            # CHECK: ehb # encoding:  [0x00,0x00,0x00,0xc0]
+        ei        $14                  # CHECK: ei  $14       # encoding: [0x41,0x6e,0x60,0x20]
+        ei                             # CHECK: ei            # encoding: [0x41,0x60,0x60,0x20]
+        eret
+        floor.w.d $f14,$f11
+        floor.w.s $f8,$f9
+        jr.hb     $4                   # CHECK: jr.hb  $4 # encoding: [0x00,0x80,0x04,0x08]
+        jalr.hb   $4                   # CHECK: jalr.hb  $4 # encoding: [0x00,0x80,0xfc,0x09]
+        jalr.hb   $4, $5               # CHECK: jalr.hb  $4, $5 # encoding: [0x00,0xa0,0x24,0x09]
+        lb        $24,-14515($10)
+        lbu       $8,30195($v1)
+        ldc1      $f11,16391($s0)
+        ldc2      $8,-21181($at)        # CHECK: ldc2 $8, -21181($1)   # encoding: [0xd8,0x28,0xad,0x43]
+        ldxc1     $f8,$s7($15)
+        lh        $11,-8556($s5)
+        lhu       $s3,-22851($v0)
+        li        $at,-29773
+        li        $zero,-29889
+        ll        $v0,-7321($s2)       # CHECK: ll $2, -7321($18)     # encoding: [0xc2,0x42,0xe3,0x67]
+        luxc1     $f19,$s6($s5)
+        lw        $8,5674($a1)
+        lwc1      $f16,10225($k0)
+        lwc2      $18,-841($a2)        # CHECK: lwc2 $18, -841($6)     # encoding: [0xc8,0xd2,0xfc,0xb7]
+        lwl       $s4,-4231($15)
+        lwr       $zero,-19147($gp)
+        lwxc1     $f12,$s1($s8)
+        madd      $s6,$13
+        madd      $zero,$9
+        madd.d    $f18,$f19,$f26,$f20
+        madd.s    $f1,$f31,$f19,$f25
+        maddu     $s3,$gp
+        maddu     $24,$s2
+        mfc0      $a2,$14,1
+        mfc1      $a3,$f27
+        mfhc1     $s8,$f24
+        mfhi      $s3
+        mfhi      $sp
+        mflo      $s1
+        mov.d     $f20,$f14
+        mov.s     $f2,$f27
+        move      $s8,$a0
+        move      $25,$a2
+        movf      $gp,$8,$fcc7
+        movf.d    $f6,$f11,$fcc5
+        movf.s    $f23,$f5,$fcc6
+        movn      $v1,$s1,$s0
+        movn.d    $f27,$f21,$k0
+        movn.s    $f12,$f0,$s7
+        movt      $zero,$s4,$fcc5
+        movt.d    $f0,$f2,$fcc0
+        movt.s    $f30,$f2,$fcc1
+        movz      $a1,$s6,$9
+        movz.d    $f12,$f29,$9
+        movz.s    $f25,$f7,$v1
+        msub      $s7,$k1
+        msub.d    $f10,$f1,$f31,$f18
+        msub.s    $f12,$f19,$f10,$f16
+        msubu     $15,$a1
+        mtc0      $9,$29,3
+        mtc1      $s8,$f9
+        mthc1     $zero,$f16
+        mthi      $s1
+        mtlo      $sp
+        mtlo      $25
+        mul       $s0,$s4,$at
+        mul.d     $f20,$f20,$f16
+        mul.s     $f30,$f10,$f2
+        mult      $sp,$s4
+        mult      $sp,$v0
+        multu     $gp,$k0
+        multu     $9,$s2
+        negu      $2                   # CHECK: negu $2, $2            # encoding: [0x00,0x02,0x10,0x23]
+        negu      $2,$3                # CHECK: negu $2, $3            # encoding: [0x00,0x03,0x10,0x23]
+        neg.d     $f27,$f18
+        neg.s     $f1,$f15
+        nmadd.d   $f18,$f9,$f14,$f19
+        nmadd.s   $f0,$f5,$f25,$f12
+        nmsub.d   $f30,$f8,$f16,$f30
+        nmsub.s   $f1,$f24,$f19,$f4
+        nop
+        nor       $a3,$zero,$a3
+        or        $12,$s0,$sp
+        or        $2, 4                # CHECK: ori $2, $2, 4           # encoding: [0x34,0x42,0x00,0x04]
+        pause                          # CHECK: pause # encoding:  [0x00,0x00,0x01,0x40]
+        pref      1, 8($5)             # CHECK: pref 1, 8($5)           # encoding: [0xcc,0xa1,0x00,0x08]
+        # FIXME: Use the code generator in order to print the .set directives
+        #        instead of the instruction printer.
+        rdhwr     $sp,$11              # CHECK:      .set  push
+                                       # CHECK-NEXT: .set  mips32r2
+                                       # CHECK-NEXT: rdhwr $sp, $11
+                                       # CHECK-NEXT: .set  pop          # encoding: [0x7c,0x1d,0x58,0x3b]
+        rotr      $1,15                # CHECK: rotr $1, $1, 15         # encoding: [0x00,0x21,0x0b,0xc2]
+        rotr      $1,$14,15            # CHECK: rotr $1, $14, 15        # encoding: [0x00,0x2e,0x0b,0xc2]
+        rotrv     $1,$14,$15           # CHECK: rotrv $1, $14, $15      # encoding: [0x01,0xee,0x08,0x46]
+        round.w.d $f6,$f4
+        round.w.s $f27,$f28
+        sb        $s6,-19857($14)
+        sc        $15,18904($s3)       # CHECK: sc $15, 18904($19)     # encoding: [0xe2,0x6f,0x49,0xd8]
+        sdbbp                          # CHECK: sdbbp                  # encoding: [0x70,0x00,0x00,0x3f]
+        sdbbp     34                   # CHECK: sdbbp 34               # encoding: [0x70,0x00,0x08,0xbf]
+        sdc1      $f31,30574($13)
+        sdc2      $20,23157($s2)       # CHECK: sdc2 $20, 23157($18)   # encoding: [0xfa,0x54,0x5a,0x75]
+        sdxc1     $f11,$10($14)
+        seb       $25,$15
+        seh       $v1,$12
+        sh        $14,-6704($15)
+        sll       $a3,18               # CHECK: sll $7, $7, 18         # encoding: [0x00,0x07,0x3c,0x80]
+        sll       $a3,$zero,18         # CHECK: sll $7, $zero, 18      # encoding: [0x00,0x00,0x3c,0x80]
+        sll       $a3,$zero,$9         # CHECK: sllv $7, $zero, $9     # encoding: [0x01,0x20,0x38,0x04]
+        sllv      $a3,$zero,$9         # CHECK: sllv $7, $zero, $9     # encoding: [0x01,0x20,0x38,0x04]
+        slt       $s7,$11,$k1          # CHECK: slt $23, $11, $27      # encoding: [0x01,0x7b,0xb8,0x2a]
+        slti      $s1,$10,9489         # CHECK: slti $17, $10, 9489    # encoding: [0x29,0x51,0x25,0x11]
+        sltiu     $25,$25,-15531       # CHECK: sltiu $25, $25, -15531 # encoding: [0x2f,0x39,0xc3,0x55]
+        sltu      $s4,$s5,$11          # CHECK: sltu  $20, $21, $11    # encoding: [0x02,0xab,0xa0,0x2b]
+        sltu      $24,$25,-15531       # CHECK: sltiu $24, $25, -15531 # encoding: [0x2f,0x38,0xc3,0x55]
+        sqrt.d    $f17,$f22
+        sqrt.s    $f0,$f1
+        sra       $s1,15               # CHECK: sra $17, $17, 15       # encoding: [0x00,0x11,0x8b,0xc3]
+        sra       $s1,$s7,15           # CHECK: sra $17, $23, 15       # encoding: [0x00,0x17,0x8b,0xc3]
+        sra       $s1,$s7,$sp          # CHECK: srav $17, $23, $sp     # encoding: [0x03,0xb7,0x88,0x07]
+        srav      $s1,$s7,$sp          # CHECK: srav $17, $23, $sp     # encoding: [0x03,0xb7,0x88,0x07]
+        srl       $2,7                 # CHECK: srl $2, $2, 7          # encoding: [0x00,0x02,0x11,0xc2]
+        srl       $2,$2,7              # CHECK: srl $2, $2, 7          # encoding: [0x00,0x02,0x11,0xc2]
+        srl       $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
+        srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
+        ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
+        sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
+        sub.d     $f18,$f3,$f17
+        sub.s     $f23,$f22,$f22
+        subu      $sp,$s6,$s6
+        suxc1     $f12,$k1($13)
+        sw        $ra,-10160($sp)
+        swc1      $f6,-8465($24)
+        swc2      $25,24880($s0)       # CHECK: swc2 $25, 24880($16)   # encoding: [0xea,0x19,0x61,0x30]
+        swl       $15,13694($s3)
+        swr       $s1,-26590($14)
+        swxc1     $f19,$12($k0)
+        sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
+        sync      1                    # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
+        teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
+        tgei      $s1,5025
+        tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
+        tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
+        tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
+        tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
+        tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
+        tlti      $14,-21059
+        tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
+        tnei      $12,-29647
+        trunc.w.d $f22,$f15
+        trunc.w.s $f28,$f30
+        wsbh      $k1,$9
+        xor       $s2,$a0,$s8
+        synci     -15842($a2)          # CHECK: synci -15842($6)        # encoding: [0x04,0xdf,0xc2,0x1e]
diff --git a/test/MC/Mips/mips4/invalid-mips64r2.s b/test/MC/Mips/mips4/invalid-mips64r2.s
index b259706..70a8261 100644
--- a/test/MC/Mips/mips4/invalid-mips64r2.s
+++ b/test/MC/Mips/mips4/invalid-mips64r2.s
@@ -17,19 +17,15 @@
         luxc1   $f19,$s6($s5)       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         madd    $s6,$t5             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         madd    $zero,$t1           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        madd.s  $f1,$f31,$f19,$f25  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         maddu   $s3,$gp             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         maddu   $t8,$s2             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mfc0    $a2,$14,1           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mfhc1   $s8,$f24            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         msub    $s7,$k1             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        msub.s  $f12,$f19,$f10,$f16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         msubu   $t7,$a1             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mtc0    $t1,$29,3           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mthc1   $zero,$f16          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mul     $s0,$s4,$at         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        nmadd.s $f0,$f5,$f25,$f12   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        nmsub.s $f1,$f24,$f19,$f4   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         pause                       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         seb     $t9,$t7             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         seh     $v1,$t4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips4/valid-xfail.s b/test/MC/Mips/mips4/valid-xfail.s
index ff6f457..9c647d1 100644
--- a/test/MC/Mips/mips4/valid-xfail.s
+++ b/test/MC/Mips/mips4/valid-xfail.s
@@ -35,14 +35,6 @@
         c.ult.s         $fcc7,$f24,$f10
         c.un.d          $fcc6,$f23,$f24
         c.un.s          $fcc1,$f30,$f4
-        madd.d          $f18,$f19,$f26,$f20
-        madd.s          $f1,$f31,$f19,$f25
-        msub.d          $f10,$f1,$f31,$f18
-        msub.s          $f12,$f19,$f10,$f16
-        nmadd.d         $f18,$f9,$f14,$f19
-        nmadd.s         $f0,$f5,$f25,$f12
-        nmsub.d         $f30,$f8,$f16,$f30
-        nmsub.s         $f1,$f24,$f19,$f4
         recip.d         $f19,$f6
         recip.s         $f3,$f30
         rsqrt.d         $f3,$f28
diff --git a/test/MC/Mips/mips4/valid.s b/test/MC/Mips/mips4/valid.s
index c221b76..fc747a5 100644
--- a/test/MC/Mips/mips4/valid.s
+++ b/test/MC/Mips/mips4/valid.s
@@ -134,6 +134,8 @@
         lwr       $zero,-19147($gp)
         lwu       $s3,-24086($v1)
         lwxc1     $f12,$s1($s8)
+        madd.d    $f18, $f22, $f26, $f20  # encoding: [0x4e,0xd4,0xd4,0xa1]
+        madd.s    $f2, $f30, $f18, $f24   # encoding: [0x4f,0xd8,0x90,0xa0]        
         mfc1      $a3,$f27
         mfhi      $s3
         mfhi      $sp
@@ -156,6 +158,8 @@
         movz      $a1,$s6,$9
         movz.d    $f12,$f29,$9
         movz.s    $f25,$f7,$v1
+        msub.d    $f10, $f2, $f30, $f18   # encoding: [0x4c,0x52,0xf2,0xa9]
+        msub.s    $f12, $f18, $f10, $f16  # encoding: [0x4e,0x50,0x53,0x28]
         mtc1      $s8,$f9
         mthi      $s1
         mtlo      $sp
@@ -170,6 +174,10 @@
         negu      $2,$3                # CHECK: negu $2, $3            # encoding: [0x00,0x03,0x10,0x23]
         neg.d     $f27,$f18
         neg.s     $f1,$f15
+        nmadd.d   $f18, $f8, $f14, $f20   # encoding: [0x4d,0x14,0x74,0xb1]
+        nmadd.s   $f0, $f4, $f24, $f12    # encoding: [0x4c,0x8c,0xc0,0x30]
+        nmsub.d   $f30, $f8, $f16, $f30   # encoding: [0x4d,0x1e,0x87,0xb9]
+        nmsub.s   $f0, $f24, $f20, $f4    # encoding: [0x4f,0x04,0xa0,0x38]
         nop
         nor       $a3,$zero,$a3
         or        $12,$s0,$sp
diff --git a/test/MC/Mips/mips5/invalid-mips64r2.s b/test/MC/Mips/mips5/invalid-mips64r2.s
index b91e520..a96f4b3 100644
--- a/test/MC/Mips/mips5/invalid-mips64r2.s
+++ b/test/MC/Mips/mips5/invalid-mips64r2.s
@@ -21,19 +21,15 @@
         ei        $14                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         madd      $s6,$13             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         madd      $zero,$9            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        madd.s    $f1,$f31,$f19,$f25  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         maddu     $s3,$gp             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         maddu     $24,$s2             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mfc0      $a2,$14,1           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mfhc1     $s8,$f24            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         msub      $s7,$k1             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        msub.s    $f12,$f19,$f10,$f16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         msubu     $15,$a1             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mtc0      $9,$29,3            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mthc1     $zero,$f16          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mul       $s0,$s4,$at         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        nmadd.s   $f0,$f5,$f25,$f12   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        nmsub.s   $f1,$f24,$f19,$f4   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         pause                         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         rotr      $1,15               # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         rotr      $1,$14,15           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips5/valid-xfail.s b/test/MC/Mips/mips5/valid-xfail.s
index 8d1d0d7..d761189 100644
--- a/test/MC/Mips/mips5/valid-xfail.s
+++ b/test/MC/Mips/mips5/valid-xfail.s
@@ -57,25 +57,17 @@
         cvt.ps.s        $f3,$f18,$f19
         cvt.s.pl        $f30,$f1
         cvt.s.pu        $f14,$f25
-        madd.d          $f18,$f19,$f26,$f20
         madd.ps         $f22,$f3,$f14,$f3
-        madd.s          $f1,$f31,$f19,$f25
         mov.ps          $f22,$f17
         movf.ps         $f10,$f28,$fcc6
         movn.ps         $f31,$f31,$s3
         movt.ps         $f20,$f25,$fcc2
         movz.ps         $f18,$f17,$ra
-        msub.d          $f10,$f1,$f31,$f18
         msub.ps         $f12,$f14,$f29,$f17
-        msub.s          $f12,$f19,$f10,$f16
         mul.ps          $f14,$f0,$f16
         neg.ps          $f19,$f13
-        nmadd.d         $f18,$f9,$f14,$f19
         nmadd.ps        $f27,$f4,$f9,$f25
-        nmadd.s         $f0,$f5,$f25,$f12
-        nmsub.d         $f30,$f8,$f16,$f30
         nmsub.ps        $f6,$f12,$f14,$f17
-        nmsub.s         $f1,$f24,$f19,$f4
         pll.ps          $f25,$f9,$f30
         plu.ps          $f1,$f26,$f29
         pul.ps          $f9,$f30,$f26
diff --git a/test/MC/Mips/mips5/valid.s b/test/MC/Mips/mips5/valid.s
index b93b22f..995d1a5 100644
--- a/test/MC/Mips/mips5/valid.s
+++ b/test/MC/Mips/mips5/valid.s
@@ -135,6 +135,8 @@
         lwr       $zero,-19147($gp)
         lwu       $s3,-24086($v1)
         lwxc1     $f12,$s1($s8)
+        madd.d    $f18, $f22, $f26, $f20  # encoding: [0x4e,0xd4,0xd4,0xa1]
+        madd.s    $f2, $f30, $f18, $f24   # encoding: [0x4f,0xd8,0x90,0xa0]
         mfc1      $a3,$f27
         mfhi      $s3
         mfhi      $sp
@@ -157,6 +159,8 @@
         movz      $a1,$s6,$9
         movz.d    $f12,$f29,$9
         movz.s    $f25,$f7,$v1
+        msub.d    $f10, $f2, $f30, $f18   # encoding: [0x4c,0x52,0xf2,0xa9]
+        msub.s    $f12, $f18, $f10, $f16  # encoding: [0x4e,0x50,0x53,0x28]
         mtc1      $s8,$f9
         mthi      $s1
         mtlo      $sp
@@ -171,6 +175,10 @@
         negu      $2,$3                # CHECK: negu $2, $3            # encoding: [0x00,0x03,0x10,0x23]
         neg.d     $f27,$f18
         neg.s     $f1,$f15
+        nmadd.d   $f18, $f8, $f14, $f20   # encoding: [0x4d,0x14,0x74,0xb1]
+        nmadd.s   $f0, $f4, $f24, $f12    # encoding: [0x4c,0x8c,0xc0,0x30]
+        nmsub.d   $f30, $f8, $f16, $f30   # encoding: [0x4d,0x1e,0x87,0xb9]
+        nmsub.s   $f0, $f24, $f20, $f4    # encoding: [0x4f,0x04,0xa0,0x38]
         nop
         nor       $a3,$zero,$a3
         or        $12,$s0,$sp
diff --git a/test/MC/Mips/mips64-register-names-n32-n64.s b/test/MC/Mips/mips64-register-names-n32-n64.s
index efe1cdb..fb2c426 100644
--- a/test/MC/Mips/mips64-register-names-n32-n64.s
+++ b/test/MC/Mips/mips64-register-names-n32-n64.s
@@ -3,7 +3,7 @@
 # RUN: FileCheck -check-prefix=WARNING %s < %t0
 #
 # RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding \
-# RUN:     -mattr=-n64,+n32 2>%t1 | FileCheck %s
+# RUN:     -target-abi n32 2>%t1 | FileCheck %s
 # RUN: FileCheck -check-prefix=WARNING %s < %t1
 #
 # Check that the register names are mapped to their correct numbers for n32/n64
diff --git a/test/MC/Mips/mips64-register-names-o32.s b/test/MC/Mips/mips64-register-names-o32.s
index c170578..f5df527 100644
--- a/test/MC/Mips/mips64-register-names-o32.s
+++ b/test/MC/Mips/mips64-register-names-o32.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding \
-# RUN:     -mattr=-n64,+o32 | FileCheck %s
+# RUN:     -target-abi o32 | FileCheck %s
 
 # Check that the register names are mapped to their correct numbers for o32
 # Second byte of daddiu with $zero at rt contains the number of the source
diff --git a/test/MC/Mips/mips64/invalid-mips64r2.s b/test/MC/Mips/mips64/invalid-mips64r2.s
index 1a5abb6..1caa2bd 100644
--- a/test/MC/Mips/mips64/invalid-mips64r2.s
+++ b/test/MC/Mips/mips64/invalid-mips64r2.s
@@ -14,12 +14,8 @@
         dsbh      $v1,$14             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         dshd      $v0,$sp             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ei        $14                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        madd.s    $f1,$f31,$f19,$f25  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mfhc1     $s8,$f24            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        msub.s    $f12,$f19,$f10,$f16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mthc1     $zero,$f16          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        nmadd.s   $f0,$f5,$f25,$f12   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
-        nmsub.s   $f1,$f24,$f19,$f4   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         pause                         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         rotr      $1,15               # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         rotr      $1,$14,15           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips64/valid-xfail.s b/test/MC/Mips/mips64/valid-xfail.s
index e5455f5..7d1eb92 100644
--- a/test/MC/Mips/mips64/valid-xfail.s
+++ b/test/MC/Mips/mips64/valid-xfail.s
@@ -62,9 +62,7 @@
         cvt.s.pu        $f14,$f25
         dmfc0           $10,c0_watchhi,2
         dmtc0           $15,c0_datalo
-        madd.d          $f18,$f19,$f26,$f20
         madd.ps         $f22,$f3,$f14,$f3
-        madd.s          $f1,$f31,$f19,$f25
         mov.ps          $f22,$f17
         movf.ps         $f10,$f28,$fcc6
         movn.ps         $f31,$f31,$s3
@@ -72,17 +70,11 @@
         movz.ps         $f18,$f17,$ra
         msgn.qh         $v0,$v24,$v20
         msgn.qh         $v12,$v21,$v0[1]
-        msub.d          $f10,$f1,$f31,$f18
         msub.ps         $f12,$f14,$f29,$f17
-        msub.s          $f12,$f19,$f10,$f16
         mul.ps          $f14,$f0,$f16
         neg.ps          $f19,$f13
-        nmadd.d         $f18,$f9,$f14,$f19
         nmadd.ps        $f27,$f4,$f9,$f25
-        nmadd.s         $f0,$f5,$f25,$f12
-        nmsub.d         $f30,$f8,$f16,$f30
         nmsub.ps        $f6,$f12,$f14,$f17
-        nmsub.s         $f1,$f24,$f19,$f4
         pll.ps          $f25,$f9,$f30
         plu.ps          $f1,$f26,$f29
         pul.ps          $f9,$f30,$f26
diff --git a/test/MC/Mips/mips64/valid.s b/test/MC/Mips/mips64/valid.s
index 032777e..f481a28 100644
--- a/test/MC/Mips/mips64/valid.s
+++ b/test/MC/Mips/mips64/valid.s
@@ -144,6 +144,8 @@
         madd      $zero,$9
         maddu     $s3,$gp
         maddu     $24,$s2
+        madd.d    $f18, $f22, $f26, $f20  # encoding: [0x4e,0xd4,0xd4,0xa1]
+        madd.s    $f2, $f30, $f18, $f24   # encoding: [0x4f,0xd8,0x90,0xa0]
         mfc0      $a2,$14,1
         mfc1      $a3,$f27
         mfhi      $s3
@@ -169,6 +171,8 @@
         movz.s    $f25,$f7,$v1
         msub      $s7,$k1
         msubu     $15,$a1
+        msub.d    $f10, $f2, $f30, $f18   # encoding: [0x4c,0x52,0xf2,0xa9]
+        msub.s    $f12, $f18, $f10, $f16  # encoding: [0x4e,0x50,0x53,0x28]
         mtc0      $9,$29,3
         mtc1      $s8,$f9
         mthi      $s1
@@ -185,6 +189,10 @@
         negu      $2,$3                # CHECK: negu $2, $3            # encoding: [0x00,0x03,0x10,0x23]
         neg.d     $f27,$f18
         neg.s     $f1,$f15
+        nmadd.d   $f18, $f8, $f14, $f20   # encoding: [0x4d,0x14,0x74,0xb1]
+        nmadd.s   $f0, $f4, $f24, $f12    # encoding: [0x4c,0x8c,0xc0,0x30]
+        nmsub.d   $f30, $f8, $f16, $f30   # encoding: [0x4d,0x1e,0x87,0xb9]
+        nmsub.s   $f0, $f24, $f20, $f4    # encoding: [0x4f,0x04,0xa0,0x38]
         nop
         nor       $a3,$zero,$a3
         or        $12,$s0,$sp
diff --git a/test/MC/Mips/mips64extins.ll b/test/MC/Mips/mips64extins.ll
index ebe8f86..093bc87 100644
--- a/test/MC/Mips/mips64extins.ll
+++ b/test/MC/Mips/mips64extins.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -mattr=n64 %s -o - \
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -target-abi=n64 %s -o - \
 ; RUN: | llvm-objdump -disassemble -triple mips64el -mattr +mips64r2 - \
 ; RUN: | FileCheck %s
 
diff --git a/test/MC/Mips/mips64r2/abi-bad.s b/test/MC/Mips/mips64r2/abi-bad.s
index 31d13ab..7070d45 100644
--- a/test/MC/Mips/mips64r2/abi-bad.s
+++ b/test/MC/Mips/mips64r2/abi-bad.s
@@ -1,9 +1,5 @@
-# RUN: not llvm-mc %s -triple mips-unknown-unknown -mcpu=mips64r2 2>&1 | FileCheck %s
-# CHECK: .text
-
-
-
+# RUN: not llvm-mc %s -triple mips-unknown-linux -mcpu=mips64r2 2>&1 | FileCheck %s
         .set fp=xx
-# CHECK     : error: 'set fp=xx'option requires O32 ABI
-# CHECK     : .set fp=xx
-# CHECK     :          ^
+# CHECK: error: '.set fp=xx' requires the O32 ABI
+# CHECK: .set fp=xx
+# CHECK:          ^
diff --git a/test/MC/Mips/mips64r2/valid-xfail.s b/test/MC/Mips/mips64r2/valid-xfail.s
index 9ac47f6..148758c 100644
--- a/test/MC/Mips/mips64r2/valid-xfail.s
+++ b/test/MC/Mips/mips64r2/valid-xfail.s
@@ -176,7 +176,6 @@
         lwle            $11,-42($11)
         lwre            $sp,-152($24)
         lwx $12,$12($s4)
-        madd.d          $f18,$f19,$f26,$f20
         madd.ps         $f22,$f3,$f14,$f3
         maq_s.w.phl     $ac2,$25,$11
         maq_s.w.phr     $ac0,$10,$25
@@ -193,7 +192,6 @@
         msgn.qh         $v0,$v24,$v20
         msgn.qh         $v12,$v21,$v0[1]
         msub            $ac2,$sp,$14
-        msub.d          $f10,$f1,$f31,$f18
         msub.ps         $f12,$f14,$f29,$f17
         msubu           $ac2,$a1,$24
         mtc0            $9,c0_datahi1
@@ -222,9 +220,7 @@
         nlzc.d          $w14,$w14
         nlzc.h          $w24,$w24
         nlzc.w          $w10,$w4
-        nmadd.d         $f18,$f9,$f14,$f19
         nmadd.ps        $f27,$f4,$f9,$f25
-        nmsub.d         $f30,$f8,$f16,$f30
         nmsub.ps        $f6,$f12,$f14,$f17
         nor.v           $w20,$w20,$w15
         or.v            $w13,$w23,$w12
@@ -297,7 +293,6 @@
         swe $24,94($k0)
         swle            $v1,-209($gp)
         swre            $k0,-202($s2)
-        synci           20023($s0)
         tlbginv
         tlbginvf
         tlbgp
diff --git a/test/MC/Mips/mips64r3/abi-bad.s b/test/MC/Mips/mips64r3/abi-bad.s
new file mode 100644
index 0000000..7691601
--- /dev/null
+++ b/test/MC/Mips/mips64r3/abi-bad.s
@@ -0,0 +1,5 @@
+# RUN: not llvm-mc %s -triple mips-unknown-linux -mcpu=mips64r3 2>&1 | FileCheck %s
+        .set fp=xx
+# CHECK: error: '.set fp=xx' requires the O32 ABI
+# CHECK: .set fp=xx
+# CHECK:          ^
diff --git a/test/MC/Mips/mips64r3/abiflags.s b/test/MC/Mips/mips64r3/abiflags.s
new file mode 100644
index 0000000..e89be29
--- /dev/null
+++ b/test/MC/Mips/mips64r3/abiflags.s
@@ -0,0 +1,36 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips64r3 | \
+# RUN:   FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips64r3 -filetype=obj -o - | \
+# RUN:   llvm-readobj -sections -section-data -section-relocations - | \
+# RUN:     FileCheck %s -check-prefix=CHECK-OBJ
+
+# CHECK-ASM: .module fp=64
+
+# Checking if the Mips.abiflags were correctly emitted.
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00004003 02020001 00000000 00000000  |..@.............|
+# CHECK-OBJ:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
+
+        .module fp=64
+
+# FIXME: Test should include gnu_attributes directive when implemented.
+#        An explicit .gnu_attribute must be checked against the effective
+#        command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/mips64r3/invalid.s b/test/MC/Mips/mips64r3/invalid.s
new file mode 100644
index 0000000..99cd080
--- /dev/null
+++ b/test/MC/Mips/mips64r3/invalid.s
@@ -0,0 +1,10 @@
+# Instructions that are valid for the current ISA but should be rejected by the assembler (e.g.
+# invalid set of operands or operand's restrictions not met).
+
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -mcpu=mips64r3 2>%t1
+# RUN: FileCheck %s < %t1 -check-prefix=ASM
+
+        .text
+        .set noreorder
+        jalr.hb $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
+        jalr.hb $31, $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
diff --git a/test/MC/Mips/mips64r3/valid-xfail.s b/test/MC/Mips/mips64r3/valid-xfail.s
new file mode 100644
index 0000000..f2949c4
--- /dev/null
+++ b/test/MC/Mips/mips64r3/valid-xfail.s
@@ -0,0 +1,306 @@
+# Instructions that should be valid but currently fail for known reasons (e.g.
+# they aren't implemented yet).
+# This test is set up to XPASS if any instruction generates an encoding.
+#
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r3 | not FileCheck %s
+# CHECK-NOT: encoding
+# XFAIL: *
+
+        .set noat
+        abs.ps          $f22,$f8
+        absq_s.ph       $8,$a0
+        absq_s.qb       $15,$s1
+        absq_s.w        $s3,$ra
+        add.ps          $f25,$f27,$f13
+        addq.ph         $s1,$15,$at
+        addq_s.ph       $s3,$s6,$s2
+        addq_s.w        $a2,$8,$at
+        addqh.ph        $s4,$14,$s1
+        addqh.w         $s7,$s7,$k1
+        addqh_r.ph      $sp,$25,$s8
+        addqh_r.w       $8,$v1,$zero
+        addsc           $s8,$15,$12
+        addu.ph         $a2,$14,$s3
+        addu.qb         $s6,$v1,$v1
+        addu_s.ph       $a3,$s3,$gp
+        addu_s.qb       $s4,$s8,$s1
+        adduh.qb        $a1,$a1,$at
+        adduh_r.qb      $a0,$9,$12
+        addwc           $k0,$s6,$s7
+        alnv.ob         $v22,$v19,$v30,$v1
+        alnv.ob         $v31,$v23,$v30,$at
+        alnv.ob         $v8,$v17,$v30,$a1
+        alnv.ps         $f12,$f18,$f30,$12
+        and.v           $w10,$w25,$w29
+        bitrev          $14,$at
+        bmnz.v          $w15,$w2,$w28
+        bmz.v           $w13,$w11,$w21
+        bsel.v          $w28,$w7,$w0
+        c.eq.d          $fcc1,$f15,$f15
+        c.eq.ps         $fcc5,$f0,$f9
+        c.eq.s          $fcc5,$f24,$f17
+        c.f.d           $fcc4,$f11,$f21
+        c.f.ps          $fcc6,$f11,$f11
+        c.f.s           $fcc4,$f30,$f7
+        c.le.d          $fcc4,$f18,$f1
+        c.le.ps         $fcc1,$f7,$f20
+        c.le.s          $fcc6,$f24,$f4
+        c.lt.d          $fcc3,$f9,$f3
+        c.lt.ps         $f19,$f5
+        c.lt.s          $fcc2,$f17,$f14
+        c.nge.d         $fcc5,$f21,$f16
+        c.nge.ps        $f1,$f26
+        c.nge.s         $fcc3,$f11,$f8
+        c.ngl.ps        $f21,$f30
+        c.ngl.s         $fcc2,$f31,$f23
+        c.ngle.ps       $fcc7,$f12,$f20
+        c.ngle.s        $fcc2,$f18,$f23
+        c.ngt.d         $fcc4,$f24,$f7
+        c.ngt.ps        $fcc5,$f30,$f6
+        c.ngt.s         $fcc5,$f8,$f13
+        c.ole.d         $fcc2,$f16,$f31
+        c.ole.ps        $fcc7,$f21,$f8
+        c.ole.s         $fcc3,$f7,$f20
+        c.olt.d         $fcc4,$f19,$f28
+        c.olt.ps        $fcc3,$f7,$f16
+        c.olt.s         $fcc6,$f20,$f7
+        c.seq.d         $fcc4,$f31,$f7
+        c.seq.ps        $fcc6,$f31,$f14
+        c.seq.s         $fcc7,$f1,$f25
+        c.sf.ps         $fcc6,$f4,$f6
+        c.ueq.d         $fcc4,$f13,$f25
+        c.ueq.ps        $fcc1,$f5,$f29
+        c.ueq.s         $fcc6,$f3,$f30
+        c.ule.d         $fcc7,$f25,$f18
+        c.ule.ps        $fcc6,$f17,$f3
+        c.ule.s         $fcc7,$f21,$f30
+        c.ult.d         $fcc6,$f6,$f17
+        c.ult.ps        $fcc7,$f14,$f0
+        c.ult.s         $fcc7,$f24,$f10
+        c.un.d          $fcc6,$f23,$f24
+        c.un.ps         $fcc4,$f2,$f26
+        c.un.s          $fcc1,$f30,$f4
+        cvt.ps.s        $f3,$f18,$f19
+        cmp.eq.ph       $s7,$14
+        cmp.le.ph       $8,$14
+        cmp.lt.ph       $k0,$sp
+        cmpgdu.eq.qb    $s3,$zero,$k0
+        cmpgdu.le.qb    $v1,$15,$s2
+        cmpgdu.lt.qb    $s0,$gp,$sp
+        cmpgu.eq.qb     $14,$s6,$s8
+        cmpgu.le.qb     $9,$a3,$s4
+        cmpgu.lt.qb     $sp,$at,$8
+        cmpu.eq.qb      $v0,$24
+        cmpu.le.qb      $s1,$a1
+        cmpu.lt.qb      $at,$a3
+        cvt.s.pl        $f30,$f1
+        cvt.s.pu        $f14,$f25
+        dmfc0           $10,c0_watchhi,2
+        dmfgc0          $gp,c0_perfcnt,6
+        dmt $k0
+        dmtc0           $15,c0_datalo
+        dmtgc0          $a2,c0_watchlo,2
+        dpa.w.ph        $ac1,$s7,$k0
+        dpaq_s.w.ph     $ac2,$a0,$13
+        dpaq_sa.l.w     $ac0,$a2,$14
+        dpaqx_s.w.ph    $ac3,$a0,$24
+        dpaqx_sa.w.ph   $ac1,$zero,$s5
+        dpau.h.qbl      $ac1,$10,$24
+        dpau.h.qbr      $ac1,$s7,$s6
+        dpax.w.ph       $ac3,$a0,$k0
+        dps.w.ph        $ac1,$a3,$a1
+        dpsq_s.w.ph     $ac0,$gp,$k0
+        dpsq_sa.l.w     $ac0,$a3,$15
+        dpsqx_s.w.ph    $ac3,$13,$a3
+        dpsqx_sa.w.ph   $ac3,$sp,$s2
+        dpsu.h.qbl      $ac2,$14,$10
+        dpsu.h.qbr      $ac2,$a1,$s6
+        dpsx.w.ph       $ac0,$s7,$gp
+        drorv           $at,$a1,$s7
+        dvpe            $s6
+        emt $8
+        evpe            $v0
+        extpdpv         $s6,$ac0,$s8
+        extpv           $13,$ac0,$14
+        extrv.w         $8,$ac3,$at
+        extrv_r.w       $8,$ac1,$s6
+        extrv_rs.w      $gp,$ac1,$s6
+        extrv_s.h       $s2,$ac1,$14
+        fclass.d        $w14,$w27
+        fclass.w        $w19,$w28
+        fexupl.d        $w10,$w29
+        fexupl.w        $w12,$w27
+        fexupr.d        $w31,$w15
+        fexupr.w        $w29,$w12
+        ffint_s.d       $w1,$w30
+        ffint_s.w       $w16,$w14
+        ffint_u.d       $w23,$w18
+        ffint_u.w       $w19,$w12
+        ffql.d          $w2,$w3
+        ffql.w          $w9,$w0
+        ffqr.d          $w25,$w24
+        ffqr.w          $w10,$w6
+        fill.b          $w9,$v1
+        fill.d          $w28,$8
+        fill.h          $w9,$8
+        fill.w          $w31,$15
+        flog2.d         $w12,$w16
+        flog2.w         $w19,$w23
+        fork            $s2,$8,$a0
+        frcp.d          $w12,$w4
+        frcp.w          $w30,$w8
+        frint.d         $w20,$w8
+        frint.w         $w11,$w29
+        frsqrt.d        $w29,$w2
+        frsqrt.w        $w9,$w8
+        fsqrt.d         $w3,$w1
+        fsqrt.w         $w5,$w15
+        ftint_s.d       $w31,$w26
+        ftint_s.w       $w27,$w14
+        ftint_u.d       $w5,$w31
+        ftint_u.w       $w12,$w29
+        ftrunc_s.d      $w4,$w22
+        ftrunc_s.w      $w24,$w7
+        ftrunc_u.d      $w20,$w25
+        ftrunc_u.w      $w7,$w26
+        insv            $s2,$at
+        iret
+        lbe $14,122($9)
+        lbue            $11,-108($10)
+        lbux            $9,$14($v0)
+        lhe $s6,219($v1)
+        lhue            $gp,118($11)
+        lhx $sp,$k0($15)
+        lle $gp,-237($ra)
+        lwe $ra,-145($14)
+        lwle            $11,-42($11)
+        lwre            $sp,-152($24)
+        lwx $12,$12($s4)
+        madd.ps         $f22,$f3,$f14,$f3
+        maq_s.w.phl     $ac2,$25,$11
+        maq_s.w.phr     $ac0,$10,$25
+        maq_sa.w.phl    $ac3,$a1,$v1
+        maq_sa.w.phr    $ac1,$at,$10
+        mfgc0           $s6,c0_datahi1
+        mflo            $9,$ac2
+        modsub          $a3,$12,$a3
+        mov.ps          $f22,$f17
+        movf.ps         $f10,$f28,$fcc6
+        movn.ps         $f31,$f31,$s3
+        movt.ps         $f20,$f25,$fcc2
+        movz.ps         $f18,$f17,$ra
+        msgn.qh         $v0,$v24,$v20
+        msgn.qh         $v12,$v21,$v0[1]
+        msub            $ac2,$sp,$14
+        msub.ps         $f12,$f14,$f29,$f17
+        msubu           $ac2,$a1,$24
+        mtc0            $9,c0_datahi1
+        mtgc0           $s4,$21,7
+        mthi            $v0,$ac1
+        mthlip          $a3,$ac0
+        mul.ph          $s4,$24,$s0
+        mul.ps          $f14,$f0,$f16
+        mul_s.ph        $10,$14,$15
+        muleq_s.w.phl   $11,$s4,$s4
+        muleq_s.w.phr   $s6,$a0,$s8
+        muleu_s.ph.qbl  $a2,$14,$8
+        muleu_s.ph.qbr  $a1,$ra,$9
+        mulq_rs.ph      $s2,$14,$15
+        mulq_rs.w       $at,$s4,$25
+        mulq_s.ph       $s0,$k1,$15
+        mulq_s.w        $9,$a3,$s0
+        mulsa.w.ph      $ac1,$s4,$s6
+        mulsaq_s.w.ph   $ac0,$ra,$s2
+        neg.ps          $f19,$f13
+        nloc.b          $w12,$w30
+        nloc.d          $w16,$w7
+        nloc.h          $w21,$w17
+        nloc.w          $w17,$w16
+        nlzc.b          $w12,$w7
+        nlzc.d          $w14,$w14
+        nlzc.h          $w24,$w24
+        nlzc.w          $w10,$w4
+        nmadd.ps        $f27,$f4,$f9,$f25
+        nmsub.ps        $f6,$f12,$f14,$f17
+        nor.v           $w20,$w20,$w15
+        or.v            $w13,$w23,$w12
+        packrl.ph       $ra,$24,$14
+        pcnt.b          $w30,$w15
+        pcnt.d          $w5,$w16
+        pcnt.h          $w20,$w24
+        pcnt.w          $w22,$w20
+        pick.ph         $ra,$a2,$gp
+        pick.qb         $11,$a0,$gp
+        pll.ps          $f25,$f9,$f30
+        plu.ps          $f1,$f26,$f29
+        preceq.w.phl    $s8,$gp
+        preceq.w.phr    $s5,$15
+        precequ.ph.qbl  $s7,$ra
+        precequ.ph.qbla $a0,$9
+        precequ.ph.qbr  $ra,$s3
+        precequ.ph.qbra $24,$8
+        preceu.ph.qbl   $sp,$8
+        preceu.ph.qbla  $s6,$11
+        preceu.ph.qbr   $gp,$s1
+        preceu.ph.qbra  $k1,$s0
+        precr.qb.ph     $v0,$12,$s8
+        precrq.ph.w     $14,$s8,$24
+        precrq.qb.ph    $a2,$12,$12
+        precrq_rs.ph.w  $a1,$k0,$a3
+        precrqu_s.qb.ph $zero,$gp,$s5
+        pul.ps          $f9,$f30,$f26
+        puu.ps          $f24,$f9,$f2
+        raddu.w.qb      $25,$s3
+        rdpgpr          $s3,$9
+        recip.d         $f19,$f6
+        recip.s         $f3,$f30
+        repl.ph         $at,-307
+        replv.ph        $v1,$s7
+        replv.qb        $25,$12
+        rorv            $13,$a3,$s5
+        rsqrt.d         $f3,$f28
+        rsqrt.s         $f4,$f8
+        sbe $s7,33($s1)
+        sce $sp,189($10)
+        she $24,105($v0)
+        shilo           $ac1,26
+        shilov          $ac2,$10
+        shllv.ph        $10,$s0,$s0
+        shllv.qb        $gp,$v1,$zero
+        shllv_s.ph      $k1,$at,$13
+        shllv_s.w       $s1,$ra,$k0
+        shrav.ph        $25,$s2,$s1
+        shrav.qb        $zero,$24,$11
+        shrav_r.ph      $s3,$11,$25
+        shrav_r.qb      $a0,$sp,$s5
+        shrav_r.w       $s7,$s4,$s6
+        shrlv.ph        $14,$10,$9
+        shrlv.qb        $a2,$s2,$11
+        sub.ps          $f5,$f14,$f26
+        subq.ph         $ra,$9,$s8
+        subq_s.ph       $13,$s8,$s5
+        subq_s.w        $k1,$a2,$a3
+        subqh.ph        $10,$at,$9
+        subqh.w         $v0,$a2,$zero
+        subqh_r.ph      $a0,$12,$s6
+        subqh_r.w       $10,$a2,$gp
+        subu.ph         $9,$s6,$s4
+        subu.qb         $s6,$a2,$s6
+        subu_s.ph       $v1,$a1,$s3
+        subu_s.qb       $s1,$at,$ra
+        subuh.qb        $zero,$gp,$gp
+        subuh_r.qb      $s4,$s8,$s6
+        swe $24,94($k0)
+        swle            $v1,-209($gp)
+        swre            $k0,-202($s2)
+        tlbginv
+        tlbginvf
+        tlbgp
+        tlbgr
+        tlbgwi
+        tlbgwr
+        tlbinv
+        tlbinvf
+        wrpgpr          $zero,$13
+        xor.v           $w20,$w21,$w30
+        yield           $v1,$s0
diff --git a/test/MC/Mips/mips64r3/valid.s b/test/MC/Mips/mips64r3/valid.s
new file mode 100644
index 0000000..d8f1721
--- /dev/null
+++ b/test/MC/Mips/mips64r3/valid.s
@@ -0,0 +1,305 @@
+# Instructions that are valid
+#
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r3 | FileCheck %s
+
+        .set noat
+        abs.d     $f7,$f25             # CHECK: encoding:
+        abs.s     $f9,$f16
+        add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
+        add.d     $f1,$f7,$f29
+        add.s     $f8,$f21,$f24
+        addi      $13,$9,26322
+        addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
+        addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
+        and       $s7,$v0,$12
+        and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+        bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1f      $fcc1, 4             # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
+        bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,4688           # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     4688                 # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     $fcc7,27             # CHECK: bc1fl $fcc7, 27 # encoding: [0x45,0x1e,0x00,0x06]
+        bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1t      $fcc1, 4             # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
+        bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,4688           # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     4688                 # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     $fcc7,27             # CHECK: bc1tl $fcc7, 27 # encoding: [0x45,0x1f,0x00,0x06]
+        bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
+        bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
+        bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+        bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
+        cache     1, 8($5)             # CHECK: cache 1, 8($5)   # encoding: [0xbc,0xa1,0x00,0x08]
+        c.ngl.d   $f29,$f29
+        c.ngle.d  $f0,$f16
+        c.sf.d    $f30,$f0
+        c.sf.s    $f14,$f22
+        ceil.l.d  $f1,$f3
+        ceil.l.s  $f18,$f13
+        ceil.w.d  $f11,$f25
+        ceil.w.s  $f6,$f20
+        cfc1      $s1,$21
+        clo       $11,$a1              # CHECK: clo $11, $5   # encoding: [0x70,0xab,0x58,0x21]
+        clz       $sp,$gp              # CHECK: clz $sp, $gp  # encoding: [0x73,0x9d,0xe8,0x20]
+        ctc1      $a2,$26
+        cvt.d.l   $f4,$f16
+        cvt.d.s   $f22,$f28
+        cvt.d.w   $f26,$f11
+        cvt.l.d   $f24,$f15
+        cvt.l.s   $f11,$f29
+        cvt.s.d   $f26,$f8
+        cvt.s.l   $f15,$f30
+        cvt.s.w   $f22,$f15
+        cvt.w.d   $f20,$f14
+        cvt.w.s   $f20,$f24
+        dadd      $s3,$at,$ra
+        dadd      $sp,$s4,-27705       # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+        dadd      $sp,-27705           # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
+        daddi     $sp,$s4,-27705
+        daddi     $sp,$s4,-27705       # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+        daddi     $sp,-27705           # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
+        daddiu    $k0,$s6,-4586
+        daddu     $s3,$at,$ra
+        daddu     $24,$2,18079         # CHECK: daddiu $24, $2, 18079  # encoding: [0x64,0x58,0x46,0x9f]
+        daddu     $19,26943            # CHECK: daddiu $19, $19, 26943 # encoding: [0x66,0x73,0x69,0x3f]
+        dclo      $s2,$a2              # CHECK: dclo $18, $6   # encoding: [0x70,0xd2,0x90,0x25]
+        dclz      $s0,$25              # CHECK: dclz $16, $25  # encoding: [0x73,0x30,0x80,0x24]
+        deret
+        di        $s8                  # CHECK: di  $fp        # encoding: [0x41,0x7e,0x60,0x00]
+        di                             # CHECK: di             # encoding: [0x41,0x60,0x60,0x00]
+        ddiv      $zero,$k0,$s3
+        ddivu     $zero,$s0,$s1
+        div       $zero,$25,$11
+        div.d     $f29,$f20,$f27
+        div.s     $f4,$f5,$f15
+        divu      $zero,$25,$15
+        dmfc1     $12,$f13
+        dmtc1     $s0,$f14
+        dmult     $s7,$9
+        dmultu    $a1,$a2
+        drotr     $1,15                # CHECK: drotr $1, $1, 15            # encoding: [0x00,0x21,0x0b,0xfa]
+        drotr     $1,$14,15            # CHECK: drotr $1, $14, 15           # encoding: [0x00,0x2e,0x0b,0xfa]
+        drotr32   $1,15                # CHECK: drotr32 $1, $1, 15          # encoding: [0x00,0x21,0x0b,0xfe]
+        drotr32   $1,$14,15            # CHECK: drotr32 $1, $14, 15         # encoding: [0x00,0x2e,0x0b,0xfe]
+        drotrv    $1,$14,$15           # CHECK: drotrv $1, $14, $15         # encoding: [0x01,0xee,0x08,0x56]
+        dsbh      $v1,$14
+        dshd      $v0,$sp
+        dsll      $zero,18             # CHECK: dsll $zero, $zero, 18       # encoding: [0x00,0x00,0x04,0xb8]
+        dsll      $zero,$s4,18         # CHECK: dsll $zero, $20, 18         # encoding: [0x00,0x14,0x04,0xb8]
+        dsll      $zero,$s4,$12        # CHECK: dsllv $zero, $20, $12       # encoding: [0x01,0x94,0x00,0x14]
+        dsll32    $zero,18             # CHECK: dsll32 $zero, $zero, 18     # encoding: [0x00,0x00,0x04,0xbc]
+        dsll32    $zero,$zero,18       # CHECK: dsll32 $zero, $zero, 18     # encoding: [0x00,0x00,0x04,0xbc]
+        dsllv     $zero,$s4,$12        # CHECK: dsllv $zero, $20, $12       # encoding: [0x01,0x94,0x00,0x14]
+        dsra      $gp,10               # CHECK: dsra $gp, $gp, 10           # encoding: [0x00,0x1c,0xe2,0xbb]
+        dsra      $gp,$s2,10           # CHECK: dsra $gp, $18, 10           # encoding: [0x00,0x12,0xe2,0xbb]
+        dsra      $gp,$s2,$s3          # CHECK: dsrav $gp, $18, $19         # encoding: [0x02,0x72,0xe0,0x17]
+        dsra32    $gp,10               # CHECK: dsra32 $gp, $gp, 10         # encoding: [0x00,0x1c,0xe2,0xbf]
+        dsra32    $gp,$s2,10           # CHECK: dsra32 $gp, $18, 10         # encoding: [0x00,0x12,0xe2,0xbf]
+        dsrav     $gp,$s2,$s3          # CHECK: dsrav $gp, $18, $19         # encoding: [0x02,0x72,0xe0,0x17]
+        dsrl      $s3,23               # CHECK: dsrl $19, $19, 23           # encoding: [0x00,0x13,0x9d,0xfa]
+        dsrl      $s3,$6,23            # CHECK: dsrl $19, $6, 23            # encoding: [0x00,0x06,0x9d,0xfa]
+        dsrl      $s3,$6,$s4           # CHECK: dsrlv $19, $6, $20          # encoding: [0x02,0x86,0x98,0x16]
+        dsrl32    $s3,23               # CHECK: dsrl32 $19, $19, 23         # encoding: [0x00,0x13,0x9d,0xfe]
+        dsrl32    $s3,$6,23            # CHECK: dsrl32 $19, $6, 23          # encoding: [0x00,0x06,0x9d,0xfe]
+        dsrlv     $s3,$6,$s4           # CHECK: dsrlv $19, $6, $20          # encoding: [0x02,0x86,0x98,0x16]
+        dsub      $a3,$s6,$8
+        dsub      $a3,$s6,$8
+        dsub      $sp,$s4,-27705       # CHECK: daddi $sp, $20, 27705  # encoding: [0x62,0x9d,0x6c,0x39]
+        dsub      $sp,-27705           # CHECK: daddi $sp, $sp, 27705  # encoding: [0x63,0xbd,0x6c,0x39]
+        dsubi     $sp,$s4,-27705       # CHECK: daddi $sp, $20, 27705  # encoding: [0x62,0x9d,0x6c,0x39]
+        dsubi     $sp,-27705           # CHECK: daddi $sp, $sp, 27705  # encoding: [0x63,0xbd,0x6c,0x39]
+        dsubu     $a1,$a1,$k0
+        dsubu     $a1,$a1,$k0
+        dsubu     $15,$11,5025         # CHECK: daddiu $15, $11, -5025 # encoding: [0x65,0x6f,0xec,0x5f]
+        dsubu     $14,-4586            # CHECK: daddiu $14, $14, 4586  # encoding: [0x65,0xce,0x11,0xea]
+        ehb                            # CHECK: ehb # encoding:  [0x00,0x00,0x00,0xc0]
+        ei        $14                  # CHECK: ei  $14       # encoding: [0x41,0x6e,0x60,0x20]
+        ei                             # CHECK: ei            # encoding: [0x41,0x60,0x60,0x20]
+        eret
+        floor.l.d $f26,$f7
+        floor.l.s $f12,$f5
+        floor.w.d $f14,$f11
+        floor.w.s $f8,$f9
+        jr.hb     $4                   # CHECK: jr.hb  $4 # encoding: [0x00,0x80,0x04,0x08]
+        jalr.hb   $4                   # CHECK: jalr.hb  $4 # encoding: [0x00,0x80,0xfc,0x09]
+        jalr.hb   $4, $5               # CHECK: jalr.hb  $4, $5 # encoding: [0x00,0xa0,0x24,0x09]
+        lb        $24,-14515($10)
+        lbu       $8,30195($v1)
+        ld        $sp,-28645($s1)
+        ldc1      $f11,16391($s0)
+        ldc2      $8,-21181($at)        # CHECK: ldc2 $8, -21181($1)   # encoding: [0xd8,0x28,0xad,0x43]
+        ldl       $24,-4167($24)
+        ldr       $14,-30358($s4)
+        ldxc1     $f8,$s7($15)
+        lh        $11,-8556($s5)
+        lhu       $s3,-22851($v0)
+        li        $at,-29773
+        li        $zero,-29889
+        ll        $v0,-7321($s2)       # CHECK: ll $2, -7321($18)     # encoding: [0xc2,0x42,0xe3,0x67]
+        lld       $zero,-14736($ra)    # CHECK: lld $zero, -14736($ra) # encoding: [0xd3,0xe0,0xc6,0x70]
+        luxc1     $f19,$s6($s5)
+        lw        $8,5674($a1)
+        lwc1      $f16,10225($k0)
+        lwc2      $18,-841($a2)        # CHECK: lwc2 $18, -841($6)     # encoding: [0xc8,0xd2,0xfc,0xb7]
+        lwl       $s4,-4231($15)
+        lwr       $zero,-19147($gp)
+        lwu       $s3,-24086($v1)
+        lwxc1     $f12,$s1($s8)
+        madd      $s6,$13
+        madd      $zero,$9
+        madd.s    $f1,$f31,$f19,$f25
+        maddu     $s3,$gp
+        maddu     $24,$s2
+        mfc0      $a2,$14,1
+        mfc1      $a3,$f27
+        mfhc1     $s8,$f24
+        mfhi      $s3
+        mfhi      $sp
+        mflo      $s1
+        mov.d     $f20,$f14
+        mov.s     $f2,$f27
+        move      $a0,$a3
+        move      $s5,$a0
+        move      $s8,$a0
+        move      $25,$a2
+        movf      $gp,$8,$fcc7
+        movf.d    $f6,$f11,$fcc5
+        movf.s    $f23,$f5,$fcc6
+        movn      $v1,$s1,$s0
+        movn.d    $f27,$f21,$k0
+        movn.s    $f12,$f0,$s7
+        movt      $zero,$s4,$fcc5
+        movt.d    $f0,$f2,$fcc0
+        movt.s    $f30,$f2,$fcc1
+        movz      $a1,$s6,$9
+        movz.d    $f12,$f29,$9
+        movz.s    $f25,$f7,$v1
+        msub      $s7,$k1
+        msub.s    $f12,$f19,$f10,$f16
+        msubu     $15,$a1
+        mtc0      $9,$29,3
+        mtc1      $s8,$f9
+        mthc1     $zero,$f16
+        mthi      $s1
+        mtlo      $sp
+        mtlo      $25
+        mul       $s0,$s4,$at
+        mul.d     $f20,$f20,$f16
+        mul.s     $f30,$f10,$f2
+        mult      $sp,$s4
+        mult      $sp,$v0
+        multu     $gp,$k0
+        multu     $9,$s2
+        negu      $2                   # CHECK: negu $2, $2            # encoding: [0x00,0x02,0x10,0x23]
+        negu      $2,$3                # CHECK: negu $2, $3            # encoding: [0x00,0x03,0x10,0x23]
+        neg.d     $f27,$f18
+        neg.s     $f1,$f15
+        nmadd.s   $f0,$f5,$f25,$f12
+        nmsub.s   $f1,$f24,$f19,$f4
+        nop
+        nor       $a3,$zero,$a3
+        or        $12,$s0,$sp
+        or        $2, 4                # CHECK: ori $2, $2, 4           # encoding: [0x34,0x42,0x00,0x04]
+        pause                          # CHECK: pause # encoding:  [0x00,0x00,0x01,0x40]
+        pref      1, 8($5)             # CHECK: pref 1, 8($5)           # encoding: [0xcc,0xa1,0x00,0x08]
+        # FIXME: Use the code generator in order to print the .set directives
+        #        instead of the instruction printer.
+        rdhwr     $sp,$11              # CHECK:      .set  push
+                                       # CHECK-NEXT: .set  mips32r2
+                                       # CHECK-NEXT: rdhwr $sp, $11
+                                       # CHECK-NEXT: .set  pop          # encoding: [0x7c,0x1d,0x58,0x3b]
+        rotr      $1,15                # CHECK: rotr $1, $1, 15         # encoding: [0x00,0x21,0x0b,0xc2]
+        rotr      $1,$14,15            # CHECK: rotr $1, $14, 15        # encoding: [0x00,0x2e,0x0b,0xc2]
+        rotrv     $1,$14,$15           # CHECK: rotrv $1, $14, $15      # encoding: [0x01,0xee,0x08,0x46]
+        round.l.d $f12,$f1
+        round.l.s $f25,$f5
+        round.w.d $f6,$f4
+        round.w.s $f27,$f28
+        sb        $s6,-19857($14)
+        sc        $15,18904($s3)       # CHECK: sc $15, 18904($19)     # encoding: [0xe2,0x6f,0x49,0xd8]
+        scd       $15,-8243($sp)       # CHECK: scd $15, -8243($sp)    # encoding: [0xf3,0xaf,0xdf,0xcd]
+        sdbbp                          # CHECK: sdbbp                  # encoding: [0x70,0x00,0x00,0x3f]
+        sdbbp     34                   # CHECK: sdbbp 34               # encoding: [0x70,0x00,0x08,0xbf]
+        sd        $12,5835($10)
+        sdc1      $f31,30574($13)
+        sdc2      $20,23157($s2)       # CHECK: sdc2 $20, 23157($18)   # encoding: [0xfa,0x54,0x5a,0x75]
+        sdl       $a3,-20961($s8)
+        sdr       $11,-20423($12)
+        sdxc1     $f11,$10($14)
+        seb       $25,$15
+        seh       $v1,$12
+        sh        $14,-6704($15)
+        sll       $a3,18               # CHECK: sll $7, $7, 18         # encoding: [0x00,0x07,0x3c,0x80]
+        sll       $a3,$zero,18         # CHECK: sll $7, $zero, 18      # encoding: [0x00,0x00,0x3c,0x80]
+        sll       $a3,$zero,$9         # CHECK: sllv $7, $zero, $9     # encoding: [0x01,0x20,0x38,0x04]
+        sllv      $a3,$zero,$9         # CHECK: sllv $7, $zero, $9     # encoding: [0x01,0x20,0x38,0x04]
+        slt       $s7,$11,$k1          # CHECK: slt $23, $11, $27      # encoding: [0x01,0x7b,0xb8,0x2a]
+        slti      $s1,$10,9489         # CHECK: slti $17, $10, 9489    # encoding: [0x29,0x51,0x25,0x11]
+        sltiu     $25,$25,-15531       # CHECK: sltiu $25, $25, -15531 # encoding: [0x2f,0x39,0xc3,0x55]
+        sltu      $s4,$s5,$11          # CHECK: sltu  $20, $21, $11    # encoding: [0x02,0xab,0xa0,0x2b]
+        sltu      $24,$25,-15531       # CHECK: sltiu $24, $25, -15531 # encoding: [0x2f,0x38,0xc3,0x55]
+        sqrt.d    $f17,$f22
+        sqrt.s    $f0,$f1
+        sra       $s1,15               # CHECK: sra $17, $17, 15       # encoding: [0x00,0x11,0x8b,0xc3]
+        sra       $s1,$s7,15           # CHECK: sra $17, $23, 15       # encoding: [0x00,0x17,0x8b,0xc3]
+        sra       $s1,$s7,$sp          # CHECK: srav $17, $23, $sp     # encoding: [0x03,0xb7,0x88,0x07]
+        srav      $s1,$s7,$sp          # CHECK: srav $17, $23, $sp     # encoding: [0x03,0xb7,0x88,0x07]
+        srl       $2,7                 # CHECK: srl $2, $2, 7          # encoding: [0x00,0x02,0x11,0xc2]
+        srl       $2,$2,7              # CHECK: srl $2, $2, 7          # encoding: [0x00,0x02,0x11,0xc2]
+        srl       $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
+        srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
+        ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
+        sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
+        sub.d     $f18,$f3,$f17
+        sub.s     $f23,$f22,$f22
+        subu      $sp,$s6,$s6
+        suxc1     $f12,$k1($13)
+        sw        $ra,-10160($sp)
+        swc1      $f6,-8465($24)
+        swc2      $25,24880($s0)       # CHECK: swc2 $25, 24880($16)   # encoding: [0xea,0x19,0x61,0x30]
+        swl       $15,13694($s3)
+        swr       $s1,-26590($14)
+        swxc1     $f19,$12($k0)
+        sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
+        sync      1                    # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
+        teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
+        tgei      $s1,5025
+        tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
+        tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
+        tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
+        tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
+        tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
+        tlti      $14,-21059
+        tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
+        tnei      $12,-29647
+        trunc.l.d $f23,$f23
+        trunc.l.s $f28,$f31
+        trunc.w.d $f22,$f15
+        trunc.w.s $f28,$f30
+        xor       $s2,$a0,$s8
+        wsbh      $k1,$9
diff --git a/test/MC/Mips/mips64r5/abi-bad.s b/test/MC/Mips/mips64r5/abi-bad.s
new file mode 100644
index 0000000..c6bb29a
--- /dev/null
+++ b/test/MC/Mips/mips64r5/abi-bad.s
@@ -0,0 +1,5 @@
+# RUN: not llvm-mc %s -triple mips-unknown-linux -mcpu=mips64r5 2>&1 | FileCheck %s
+        .set fp=xx
+# CHECK: error: '.set fp=xx' requires the O32 ABI
+# CHECK: .set fp=xx
+# CHECK:          ^
diff --git a/test/MC/Mips/mips64r5/abiflags.s b/test/MC/Mips/mips64r5/abiflags.s
new file mode 100644
index 0000000..43a5fe6
--- /dev/null
+++ b/test/MC/Mips/mips64r5/abiflags.s
@@ -0,0 +1,36 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips64r5 | \
+# RUN:   FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips64r5 -filetype=obj -o - | \
+# RUN:   llvm-readobj -sections -section-data -section-relocations - | \
+# RUN:     FileCheck %s -check-prefix=CHECK-OBJ
+
+# CHECK-ASM: .module fp=64
+
+# Checking if the Mips.abiflags were correctly emitted.
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00004005 02020001 00000000 00000000  |..@.............|
+# CHECK-OBJ:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
+
+        .module fp=64
+
+# FIXME: Test should include gnu_attributes directive when implemented.
+#        An explicit .gnu_attribute must be checked against the effective
+#        command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/mips64r5/invalid.s b/test/MC/Mips/mips64r5/invalid.s
new file mode 100644
index 0000000..8319deb
--- /dev/null
+++ b/test/MC/Mips/mips64r5/invalid.s
@@ -0,0 +1,10 @@
+# Instructions that are valid for the current ISA but should be rejected by the assembler (e.g.
+# invalid set of operands or operand's restrictions not met).
+
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -mcpu=mips64r5 2>%t1
+# RUN: FileCheck %s < %t1 -check-prefix=ASM
+
+        .text
+        .set noreorder
+        jalr.hb $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
+        jalr.hb $31, $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
diff --git a/test/MC/Mips/mips64r5/valid-xfail.s b/test/MC/Mips/mips64r5/valid-xfail.s
new file mode 100644
index 0000000..04221dd
--- /dev/null
+++ b/test/MC/Mips/mips64r5/valid-xfail.s
@@ -0,0 +1,306 @@
+# Instructions that should be valid but currently fail for known reasons (e.g.
+# they aren't implemented yet).
+# This test is set up to XPASS if any instruction generates an encoding.
+#
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r5 | not FileCheck %s
+# CHECK-NOT: encoding
+# XFAIL: *
+
+        .set noat
+        abs.ps          $f22,$f8
+        absq_s.ph       $8,$a0
+        absq_s.qb       $15,$s1
+        absq_s.w        $s3,$ra
+        add.ps          $f25,$f27,$f13
+        addq.ph         $s1,$15,$at
+        addq_s.ph       $s3,$s6,$s2
+        addq_s.w        $a2,$8,$at
+        addqh.ph        $s4,$14,$s1
+        addqh.w         $s7,$s7,$k1
+        addqh_r.ph      $sp,$25,$s8
+        addqh_r.w       $8,$v1,$zero
+        addsc           $s8,$15,$12
+        addu.ph         $a2,$14,$s3
+        addu.qb         $s6,$v1,$v1
+        addu_s.ph       $a3,$s3,$gp
+        addu_s.qb       $s4,$s8,$s1
+        adduh.qb        $a1,$a1,$at
+        adduh_r.qb      $a0,$9,$12
+        addwc           $k0,$s6,$s7
+        alnv.ob         $v22,$v19,$v30,$v1
+        alnv.ob         $v31,$v23,$v30,$at
+        alnv.ob         $v8,$v17,$v30,$a1
+        alnv.ps         $f12,$f18,$f30,$12
+        and.v           $w10,$w25,$w29
+        bitrev          $14,$at
+        bmnz.v          $w15,$w2,$w28
+        bmz.v           $w13,$w11,$w21
+        bsel.v          $w28,$w7,$w0
+        c.eq.d          $fcc1,$f15,$f15
+        c.eq.ps         $fcc5,$f0,$f9
+        c.eq.s          $fcc5,$f24,$f17
+        c.f.d           $fcc4,$f11,$f21
+        c.f.ps          $fcc6,$f11,$f11
+        c.f.s           $fcc4,$f30,$f7
+        c.le.d          $fcc4,$f18,$f1
+        c.le.ps         $fcc1,$f7,$f20
+        c.le.s          $fcc6,$f24,$f4
+        c.lt.d          $fcc3,$f9,$f3
+        c.lt.ps         $f19,$f5
+        c.lt.s          $fcc2,$f17,$f14
+        c.nge.d         $fcc5,$f21,$f16
+        c.nge.ps        $f1,$f26
+        c.nge.s         $fcc3,$f11,$f8
+        c.ngl.ps        $f21,$f30
+        c.ngl.s         $fcc2,$f31,$f23
+        c.ngle.ps       $fcc7,$f12,$f20
+        c.ngle.s        $fcc2,$f18,$f23
+        c.ngt.d         $fcc4,$f24,$f7
+        c.ngt.ps        $fcc5,$f30,$f6
+        c.ngt.s         $fcc5,$f8,$f13
+        c.ole.d         $fcc2,$f16,$f31
+        c.ole.ps        $fcc7,$f21,$f8
+        c.ole.s         $fcc3,$f7,$f20
+        c.olt.d         $fcc4,$f19,$f28
+        c.olt.ps        $fcc3,$f7,$f16
+        c.olt.s         $fcc6,$f20,$f7
+        c.seq.d         $fcc4,$f31,$f7
+        c.seq.ps        $fcc6,$f31,$f14
+        c.seq.s         $fcc7,$f1,$f25
+        c.sf.ps         $fcc6,$f4,$f6
+        c.ueq.d         $fcc4,$f13,$f25
+        c.ueq.ps        $fcc1,$f5,$f29
+        c.ueq.s         $fcc6,$f3,$f30
+        c.ule.d         $fcc7,$f25,$f18
+        c.ule.ps        $fcc6,$f17,$f3
+        c.ule.s         $fcc7,$f21,$f30
+        c.ult.d         $fcc6,$f6,$f17
+        c.ult.ps        $fcc7,$f14,$f0
+        c.ult.s         $fcc7,$f24,$f10
+        c.un.d          $fcc6,$f23,$f24
+        c.un.ps         $fcc4,$f2,$f26
+        c.un.s          $fcc1,$f30,$f4
+        cvt.ps.s        $f3,$f18,$f19
+        cmp.eq.ph       $s7,$14
+        cmp.le.ph       $8,$14
+        cmp.lt.ph       $k0,$sp
+        cmpgdu.eq.qb    $s3,$zero,$k0
+        cmpgdu.le.qb    $v1,$15,$s2
+        cmpgdu.lt.qb    $s0,$gp,$sp
+        cmpgu.eq.qb     $14,$s6,$s8
+        cmpgu.le.qb     $9,$a3,$s4
+        cmpgu.lt.qb     $sp,$at,$8
+        cmpu.eq.qb      $v0,$24
+        cmpu.le.qb      $s1,$a1
+        cmpu.lt.qb      $at,$a3
+        cvt.s.pl        $f30,$f1
+        cvt.s.pu        $f14,$f25
+        dmfc0           $10,c0_watchhi,2
+        dmfgc0          $gp,c0_perfcnt,6
+        dmt $k0
+        dmtc0           $15,c0_datalo
+        dmtgc0          $a2,c0_watchlo,2
+        dpa.w.ph        $ac1,$s7,$k0
+        dpaq_s.w.ph     $ac2,$a0,$13
+        dpaq_sa.l.w     $ac0,$a2,$14
+        dpaqx_s.w.ph    $ac3,$a0,$24
+        dpaqx_sa.w.ph   $ac1,$zero,$s5
+        dpau.h.qbl      $ac1,$10,$24
+        dpau.h.qbr      $ac1,$s7,$s6
+        dpax.w.ph       $ac3,$a0,$k0
+        dps.w.ph        $ac1,$a3,$a1
+        dpsq_s.w.ph     $ac0,$gp,$k0
+        dpsq_sa.l.w     $ac0,$a3,$15
+        dpsqx_s.w.ph    $ac3,$13,$a3
+        dpsqx_sa.w.ph   $ac3,$sp,$s2
+        dpsu.h.qbl      $ac2,$14,$10
+        dpsu.h.qbr      $ac2,$a1,$s6
+        dpsx.w.ph       $ac0,$s7,$gp
+        drorv           $at,$a1,$s7
+        dvpe            $s6
+        emt $8
+        evpe            $v0
+        extpdpv         $s6,$ac0,$s8
+        extpv           $13,$ac0,$14
+        extrv.w         $8,$ac3,$at
+        extrv_r.w       $8,$ac1,$s6
+        extrv_rs.w      $gp,$ac1,$s6
+        extrv_s.h       $s2,$ac1,$14
+        fclass.d        $w14,$w27
+        fclass.w        $w19,$w28
+        fexupl.d        $w10,$w29
+        fexupl.w        $w12,$w27
+        fexupr.d        $w31,$w15
+        fexupr.w        $w29,$w12
+        ffint_s.d       $w1,$w30
+        ffint_s.w       $w16,$w14
+        ffint_u.d       $w23,$w18
+        ffint_u.w       $w19,$w12
+        ffql.d          $w2,$w3
+        ffql.w          $w9,$w0
+        ffqr.d          $w25,$w24
+        ffqr.w          $w10,$w6
+        fill.b          $w9,$v1
+        fill.d          $w28,$8
+        fill.h          $w9,$8
+        fill.w          $w31,$15
+        flog2.d         $w12,$w16
+        flog2.w         $w19,$w23
+        fork            $s2,$8,$a0
+        frcp.d          $w12,$w4
+        frcp.w          $w30,$w8
+        frint.d         $w20,$w8
+        frint.w         $w11,$w29
+        frsqrt.d        $w29,$w2
+        frsqrt.w        $w9,$w8
+        fsqrt.d         $w3,$w1
+        fsqrt.w         $w5,$w15
+        ftint_s.d       $w31,$w26
+        ftint_s.w       $w27,$w14
+        ftint_u.d       $w5,$w31
+        ftint_u.w       $w12,$w29
+        ftrunc_s.d      $w4,$w22
+        ftrunc_s.w      $w24,$w7
+        ftrunc_u.d      $w20,$w25
+        ftrunc_u.w      $w7,$w26
+        insv            $s2,$at
+        iret
+        lbe $14,122($9)
+        lbue            $11,-108($10)
+        lbux            $9,$14($v0)
+        lhe $s6,219($v1)
+        lhue            $gp,118($11)
+        lhx $sp,$k0($15)
+        lle $gp,-237($ra)
+        lwe $ra,-145($14)
+        lwle            $11,-42($11)
+        lwre            $sp,-152($24)
+        lwx $12,$12($s4)
+        madd.ps         $f22,$f3,$f14,$f3
+        maq_s.w.phl     $ac2,$25,$11
+        maq_s.w.phr     $ac0,$10,$25
+        maq_sa.w.phl    $ac3,$a1,$v1
+        maq_sa.w.phr    $ac1,$at,$10
+        mfgc0           $s6,c0_datahi1
+        mflo            $9,$ac2
+        modsub          $a3,$12,$a3
+        mov.ps          $f22,$f17
+        movf.ps         $f10,$f28,$fcc6
+        movn.ps         $f31,$f31,$s3
+        movt.ps         $f20,$f25,$fcc2
+        movz.ps         $f18,$f17,$ra
+        msgn.qh         $v0,$v24,$v20
+        msgn.qh         $v12,$v21,$v0[1]
+        msub            $ac2,$sp,$14
+        msub.ps         $f12,$f14,$f29,$f17
+        msubu           $ac2,$a1,$24
+        mtc0            $9,c0_datahi1
+        mtgc0           $s4,$21,7
+        mthi            $v0,$ac1
+        mthlip          $a3,$ac0
+        mul.ph          $s4,$24,$s0
+        mul.ps          $f14,$f0,$f16
+        mul_s.ph        $10,$14,$15
+        muleq_s.w.phl   $11,$s4,$s4
+        muleq_s.w.phr   $s6,$a0,$s8
+        muleu_s.ph.qbl  $a2,$14,$8
+        muleu_s.ph.qbr  $a1,$ra,$9
+        mulq_rs.ph      $s2,$14,$15
+        mulq_rs.w       $at,$s4,$25
+        mulq_s.ph       $s0,$k1,$15
+        mulq_s.w        $9,$a3,$s0
+        mulsa.w.ph      $ac1,$s4,$s6
+        mulsaq_s.w.ph   $ac0,$ra,$s2
+        neg.ps          $f19,$f13
+        nloc.b          $w12,$w30
+        nloc.d          $w16,$w7
+        nloc.h          $w21,$w17
+        nloc.w          $w17,$w16
+        nlzc.b          $w12,$w7
+        nlzc.d          $w14,$w14
+        nlzc.h          $w24,$w24
+        nlzc.w          $w10,$w4
+        nmadd.ps        $f27,$f4,$f9,$f25
+        nmsub.ps        $f6,$f12,$f14,$f17
+        nor.v           $w20,$w20,$w15
+        or.v            $w13,$w23,$w12
+        packrl.ph       $ra,$24,$14
+        pcnt.b          $w30,$w15
+        pcnt.d          $w5,$w16
+        pcnt.h          $w20,$w24
+        pcnt.w          $w22,$w20
+        pick.ph         $ra,$a2,$gp
+        pick.qb         $11,$a0,$gp
+        pll.ps          $f25,$f9,$f30
+        plu.ps          $f1,$f26,$f29
+        preceq.w.phl    $s8,$gp
+        preceq.w.phr    $s5,$15
+        precequ.ph.qbl  $s7,$ra
+        precequ.ph.qbla $a0,$9
+        precequ.ph.qbr  $ra,$s3
+        precequ.ph.qbra $24,$8
+        preceu.ph.qbl   $sp,$8
+        preceu.ph.qbla  $s6,$11
+        preceu.ph.qbr   $gp,$s1
+        preceu.ph.qbra  $k1,$s0
+        precr.qb.ph     $v0,$12,$s8
+        precrq.ph.w     $14,$s8,$24
+        precrq.qb.ph    $a2,$12,$12
+        precrq_rs.ph.w  $a1,$k0,$a3
+        precrqu_s.qb.ph $zero,$gp,$s5
+        pul.ps          $f9,$f30,$f26
+        puu.ps          $f24,$f9,$f2
+        raddu.w.qb      $25,$s3
+        rdpgpr          $s3,$9
+        recip.d         $f19,$f6
+        recip.s         $f3,$f30
+        repl.ph         $at,-307
+        replv.ph        $v1,$s7
+        replv.qb        $25,$12
+        rorv            $13,$a3,$s5
+        rsqrt.d         $f3,$f28
+        rsqrt.s         $f4,$f8
+        sbe $s7,33($s1)
+        sce $sp,189($10)
+        she $24,105($v0)
+        shilo           $ac1,26
+        shilov          $ac2,$10
+        shllv.ph        $10,$s0,$s0
+        shllv.qb        $gp,$v1,$zero
+        shllv_s.ph      $k1,$at,$13
+        shllv_s.w       $s1,$ra,$k0
+        shrav.ph        $25,$s2,$s1
+        shrav.qb        $zero,$24,$11
+        shrav_r.ph      $s3,$11,$25
+        shrav_r.qb      $a0,$sp,$s5
+        shrav_r.w       $s7,$s4,$s6
+        shrlv.ph        $14,$10,$9
+        shrlv.qb        $a2,$s2,$11
+        sub.ps          $f5,$f14,$f26
+        subq.ph         $ra,$9,$s8
+        subq_s.ph       $13,$s8,$s5
+        subq_s.w        $k1,$a2,$a3
+        subqh.ph        $10,$at,$9
+        subqh.w         $v0,$a2,$zero
+        subqh_r.ph      $a0,$12,$s6
+        subqh_r.w       $10,$a2,$gp
+        subu.ph         $9,$s6,$s4
+        subu.qb         $s6,$a2,$s6
+        subu_s.ph       $v1,$a1,$s3
+        subu_s.qb       $s1,$at,$ra
+        subuh.qb        $zero,$gp,$gp
+        subuh_r.qb      $s4,$s8,$s6
+        swe $24,94($k0)
+        swle            $v1,-209($gp)
+        swre            $k0,-202($s2)
+        tlbginv
+        tlbginvf
+        tlbgp
+        tlbgr
+        tlbgwi
+        tlbgwr
+        tlbinv
+        tlbinvf
+        wrpgpr          $zero,$13
+        xor.v           $w20,$w21,$w30
+        yield           $v1,$s0
diff --git a/test/MC/Mips/mips64r5/valid.s b/test/MC/Mips/mips64r5/valid.s
new file mode 100644
index 0000000..1706852
--- /dev/null
+++ b/test/MC/Mips/mips64r5/valid.s
@@ -0,0 +1,305 @@
+# Instructions that are valid
+#
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r5 | FileCheck %s
+
+        .set noat
+        abs.d     $f7,$f25             # CHECK: encoding:
+        abs.s     $f9,$f16
+        add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
+        add.d     $f1,$f7,$f29
+        add.s     $f8,$f21,$f24
+        addi      $13,$9,26322
+        addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
+        addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
+        and       $s7,$v0,$12
+        and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+        bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1f      $fcc1, 4             # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
+        bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,4688           # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     4688                 # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     $fcc7,27             # CHECK: bc1fl $fcc7, 27 # encoding: [0x45,0x1e,0x00,0x06]
+        bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1t      $fcc1, 4             # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
+        bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,4688           # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     4688                 # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     $fcc7,27             # CHECK: bc1tl $fcc7, 27 # encoding: [0x45,0x1f,0x00,0x06]
+        bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
+        bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
+        bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+        bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
+        cache     1, 8($5)             # CHECK: cache 1, 8($5)   # encoding: [0xbc,0xa1,0x00,0x08]
+        c.ngl.d   $f29,$f29
+        c.ngle.d  $f0,$f16
+        c.sf.d    $f30,$f0
+        c.sf.s    $f14,$f22
+        ceil.l.d  $f1,$f3
+        ceil.l.s  $f18,$f13
+        ceil.w.d  $f11,$f25
+        ceil.w.s  $f6,$f20
+        cfc1      $s1,$21
+        clo       $11,$a1              # CHECK: clo $11, $5   # encoding: [0x70,0xab,0x58,0x21]
+        clz       $sp,$gp              # CHECK: clz $sp, $gp  # encoding: [0x73,0x9d,0xe8,0x20]
+        ctc1      $a2,$26
+        cvt.d.l   $f4,$f16
+        cvt.d.s   $f22,$f28
+        cvt.d.w   $f26,$f11
+        cvt.l.d   $f24,$f15
+        cvt.l.s   $f11,$f29
+        cvt.s.d   $f26,$f8
+        cvt.s.l   $f15,$f30
+        cvt.s.w   $f22,$f15
+        cvt.w.d   $f20,$f14
+        cvt.w.s   $f20,$f24
+        dadd      $s3,$at,$ra
+        dadd      $sp,$s4,-27705       # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+        dadd      $sp,-27705           # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
+        daddi     $sp,$s4,-27705
+        daddi     $sp,$s4,-27705       # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+        daddi     $sp,-27705           # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
+        daddiu    $k0,$s6,-4586
+        daddu     $s3,$at,$ra
+        daddu     $24,$2,18079         # CHECK: daddiu $24, $2, 18079  # encoding: [0x64,0x58,0x46,0x9f]
+        daddu     $19,26943            # CHECK: daddiu $19, $19, 26943 # encoding: [0x66,0x73,0x69,0x3f]
+        dclo      $s2,$a2              # CHECK: dclo $18, $6   # encoding: [0x70,0xd2,0x90,0x25]
+        dclz      $s0,$25              # CHECK: dclz $16, $25  # encoding: [0x73,0x30,0x80,0x24]
+        deret
+        di        $s8                  # CHECK: di  $fp        # encoding: [0x41,0x7e,0x60,0x00]
+        di                             # CHECK: di             # encoding: [0x41,0x60,0x60,0x00]
+        ddiv      $zero,$k0,$s3
+        ddivu     $zero,$s0,$s1
+        div       $zero,$25,$11
+        div.d     $f29,$f20,$f27
+        div.s     $f4,$f5,$f15
+        divu      $zero,$25,$15
+        dmfc1     $12,$f13
+        dmtc1     $s0,$f14
+        dmult     $s7,$9
+        dmultu    $a1,$a2
+        drotr     $1,15                # CHECK: drotr $1, $1, 15            # encoding: [0x00,0x21,0x0b,0xfa]
+        drotr     $1,$14,15            # CHECK: drotr $1, $14, 15           # encoding: [0x00,0x2e,0x0b,0xfa]
+        drotr32   $1,15                # CHECK: drotr32 $1, $1, 15          # encoding: [0x00,0x21,0x0b,0xfe]
+        drotr32   $1,$14,15            # CHECK: drotr32 $1, $14, 15         # encoding: [0x00,0x2e,0x0b,0xfe]
+        drotrv    $1,$14,$15           # CHECK: drotrv $1, $14, $15         # encoding: [0x01,0xee,0x08,0x56]
+        dsbh      $v1,$14
+        dshd      $v0,$sp
+        dsll      $zero,18             # CHECK: dsll $zero, $zero, 18       # encoding: [0x00,0x00,0x04,0xb8]
+        dsll      $zero,$s4,18         # CHECK: dsll $zero, $20, 18         # encoding: [0x00,0x14,0x04,0xb8]
+        dsll      $zero,$s4,$12        # CHECK: dsllv $zero, $20, $12       # encoding: [0x01,0x94,0x00,0x14]
+        dsll32    $zero,18             # CHECK: dsll32 $zero, $zero, 18     # encoding: [0x00,0x00,0x04,0xbc]
+        dsll32    $zero,$zero,18       # CHECK: dsll32 $zero, $zero, 18     # encoding: [0x00,0x00,0x04,0xbc]
+        dsllv     $zero,$s4,$12        # CHECK: dsllv $zero, $20, $12       # encoding: [0x01,0x94,0x00,0x14]
+        dsra      $gp,10               # CHECK: dsra $gp, $gp, 10           # encoding: [0x00,0x1c,0xe2,0xbb]
+        dsra      $gp,$s2,10           # CHECK: dsra $gp, $18, 10           # encoding: [0x00,0x12,0xe2,0xbb]
+        dsra      $gp,$s2,$s3          # CHECK: dsrav $gp, $18, $19         # encoding: [0x02,0x72,0xe0,0x17]
+        dsra32    $gp,10               # CHECK: dsra32 $gp, $gp, 10         # encoding: [0x00,0x1c,0xe2,0xbf]
+        dsra32    $gp,$s2,10           # CHECK: dsra32 $gp, $18, 10         # encoding: [0x00,0x12,0xe2,0xbf]
+        dsrav     $gp,$s2,$s3          # CHECK: dsrav $gp, $18, $19         # encoding: [0x02,0x72,0xe0,0x17]
+        dsrl      $s3,23               # CHECK: dsrl $19, $19, 23           # encoding: [0x00,0x13,0x9d,0xfa]
+        dsrl      $s3,$6,23            # CHECK: dsrl $19, $6, 23            # encoding: [0x00,0x06,0x9d,0xfa]
+        dsrl      $s3,$6,$s4           # CHECK: dsrlv $19, $6, $20          # encoding: [0x02,0x86,0x98,0x16]
+        dsrl32    $s3,23               # CHECK: dsrl32 $19, $19, 23         # encoding: [0x00,0x13,0x9d,0xfe]
+        dsrl32    $s3,$6,23            # CHECK: dsrl32 $19, $6, 23          # encoding: [0x00,0x06,0x9d,0xfe]
+        dsrlv     $s3,$6,$s4           # CHECK: dsrlv $19, $6, $20          # encoding: [0x02,0x86,0x98,0x16]
+        dsub      $a3,$s6,$8
+        dsub      $a3,$s6,$8
+        dsub      $sp,$s4,-27705       # CHECK: daddi $sp, $20, 27705  # encoding: [0x62,0x9d,0x6c,0x39]
+        dsub      $sp,-27705           # CHECK: daddi $sp, $sp, 27705  # encoding: [0x63,0xbd,0x6c,0x39]
+        dsubi     $sp,$s4,-27705       # CHECK: daddi $sp, $20, 27705  # encoding: [0x62,0x9d,0x6c,0x39]
+        dsubi     $sp,-27705           # CHECK: daddi $sp, $sp, 27705  # encoding: [0x63,0xbd,0x6c,0x39]
+        dsubu     $a1,$a1,$k0
+        dsubu     $a1,$a1,$k0
+        dsubu     $15,$11,5025         # CHECK: daddiu $15, $11, -5025 # encoding: [0x65,0x6f,0xec,0x5f]
+        dsubu     $14,-4586            # CHECK: daddiu $14, $14, 4586  # encoding: [0x65,0xce,0x11,0xea]
+        ehb                            # CHECK: ehb # encoding:  [0x00,0x00,0x00,0xc0]
+        ei        $14                  # CHECK: ei  $14       # encoding: [0x41,0x6e,0x60,0x20]
+        ei                             # CHECK: ei            # encoding: [0x41,0x60,0x60,0x20]
+        eret
+        floor.l.d $f26,$f7
+        floor.l.s $f12,$f5
+        floor.w.d $f14,$f11
+        floor.w.s $f8,$f9
+        jr.hb     $4                   # CHECK: jr.hb  $4 # encoding: [0x00,0x80,0x04,0x08]
+        jalr.hb   $4                   # CHECK: jalr.hb  $4 # encoding: [0x00,0x80,0xfc,0x09]
+        jalr.hb   $4, $5               # CHECK: jalr.hb  $4, $5 # encoding: [0x00,0xa0,0x24,0x09]
+        lb        $24,-14515($10)
+        lbu       $8,30195($v1)
+        ld        $sp,-28645($s1)
+        ldc1      $f11,16391($s0)
+        ldc2      $8,-21181($at)        # CHECK: ldc2 $8, -21181($1)   # encoding: [0xd8,0x28,0xad,0x43]
+        ldl       $24,-4167($24)
+        ldr       $14,-30358($s4)
+        ldxc1     $f8,$s7($15)
+        lh        $11,-8556($s5)
+        lhu       $s3,-22851($v0)
+        li        $at,-29773
+        li        $zero,-29889
+        ll        $v0,-7321($s2)       # CHECK: ll $2, -7321($18)     # encoding: [0xc2,0x42,0xe3,0x67]
+        lld       $zero,-14736($ra)    # CHECK: lld $zero, -14736($ra) # encoding: [0xd3,0xe0,0xc6,0x70]
+        luxc1     $f19,$s6($s5)
+        lw        $8,5674($a1)
+        lwc1      $f16,10225($k0)
+        lwc2      $18,-841($a2)        # CHECK: lwc2 $18, -841($6)     # encoding: [0xc8,0xd2,0xfc,0xb7]
+        lwl       $s4,-4231($15)
+        lwr       $zero,-19147($gp)
+        lwu       $s3,-24086($v1)
+        lwxc1     $f12,$s1($s8)
+        madd      $s6,$13
+        madd      $zero,$9
+        madd.s    $f1,$f31,$f19,$f25
+        maddu     $s3,$gp
+        maddu     $24,$s2
+        mfc0      $a2,$14,1
+        mfc1      $a3,$f27
+        mfhc1     $s8,$f24
+        mfhi      $s3
+        mfhi      $sp
+        mflo      $s1
+        mov.d     $f20,$f14
+        mov.s     $f2,$f27
+        move      $a0,$a3
+        move      $s5,$a0
+        move      $s8,$a0
+        move      $25,$a2
+        movf      $gp,$8,$fcc7
+        movf.d    $f6,$f11,$fcc5
+        movf.s    $f23,$f5,$fcc6
+        movn      $v1,$s1,$s0
+        movn.d    $f27,$f21,$k0
+        movn.s    $f12,$f0,$s7
+        movt      $zero,$s4,$fcc5
+        movt.d    $f0,$f2,$fcc0
+        movt.s    $f30,$f2,$fcc1
+        movz      $a1,$s6,$9
+        movz.d    $f12,$f29,$9
+        movz.s    $f25,$f7,$v1
+        msub      $s7,$k1
+        msub.s    $f12,$f19,$f10,$f16
+        msubu     $15,$a1
+        mtc0      $9,$29,3
+        mtc1      $s8,$f9
+        mthc1     $zero,$f16
+        mthi      $s1
+        mtlo      $sp
+        mtlo      $25
+        mul       $s0,$s4,$at
+        mul.d     $f20,$f20,$f16
+        mul.s     $f30,$f10,$f2
+        mult      $sp,$s4
+        mult      $sp,$v0
+        multu     $gp,$k0
+        multu     $9,$s2
+        negu      $2                   # CHECK: negu $2, $2            # encoding: [0x00,0x02,0x10,0x23]
+        negu      $2,$3                # CHECK: negu $2, $3            # encoding: [0x00,0x03,0x10,0x23]
+        neg.d     $f27,$f18
+        neg.s     $f1,$f15
+        nmadd.s   $f0,$f5,$f25,$f12
+        nmsub.s   $f1,$f24,$f19,$f4
+        nop
+        nor       $a3,$zero,$a3
+        or        $12,$s0,$sp
+        or        $2, 4                # CHECK: ori $2, $2, 4           # encoding: [0x34,0x42,0x00,0x04]
+        pause                          # CHECK: pause # encoding:  [0x00,0x00,0x01,0x40]
+        pref      1, 8($5)             # CHECK: pref 1, 8($5)           # encoding: [0xcc,0xa1,0x00,0x08]
+        # FIXME: Use the code generator in order to print the .set directives
+        #        instead of the instruction printer.
+        rdhwr     $sp,$11              # CHECK:      .set  push
+                                       # CHECK-NEXT: .set  mips32r2
+                                       # CHECK-NEXT: rdhwr $sp, $11
+                                       # CHECK-NEXT: .set  pop          # encoding: [0x7c,0x1d,0x58,0x3b]
+        rotr      $1,15                # CHECK: rotr $1, $1, 15         # encoding: [0x00,0x21,0x0b,0xc2]
+        rotr      $1,$14,15            # CHECK: rotr $1, $14, 15        # encoding: [0x00,0x2e,0x0b,0xc2]
+        rotrv     $1,$14,$15           # CHECK: rotrv $1, $14, $15      # encoding: [0x01,0xee,0x08,0x46]
+        round.l.d $f12,$f1
+        round.l.s $f25,$f5
+        round.w.d $f6,$f4
+        round.w.s $f27,$f28
+        sb        $s6,-19857($14)
+        sc        $15,18904($s3)       # CHECK: sc $15, 18904($19)     # encoding: [0xe2,0x6f,0x49,0xd8]
+        scd       $15,-8243($sp)       # CHECK: scd $15, -8243($sp)    # encoding: [0xf3,0xaf,0xdf,0xcd]
+        sdbbp                          # CHECK: sdbbp                  # encoding: [0x70,0x00,0x00,0x3f]
+        sdbbp     34                   # CHECK: sdbbp 34               # encoding: [0x70,0x00,0x08,0xbf]
+        sd        $12,5835($10)
+        sdc1      $f31,30574($13)
+        sdc2      $20,23157($s2)       # CHECK: sdc2 $20, 23157($18)   # encoding: [0xfa,0x54,0x5a,0x75]
+        sdl       $a3,-20961($s8)
+        sdr       $11,-20423($12)
+        sdxc1     $f11,$10($14)
+        seb       $25,$15
+        seh       $v1,$12
+        sh        $14,-6704($15)
+        sll       $a3,18               # CHECK: sll $7, $7, 18         # encoding: [0x00,0x07,0x3c,0x80]
+        sll       $a3,$zero,18         # CHECK: sll $7, $zero, 18      # encoding: [0x00,0x00,0x3c,0x80]
+        sll       $a3,$zero,$9         # CHECK: sllv $7, $zero, $9     # encoding: [0x01,0x20,0x38,0x04]
+        sllv      $a3,$zero,$9         # CHECK: sllv $7, $zero, $9     # encoding: [0x01,0x20,0x38,0x04]
+        slt       $s7,$11,$k1          # CHECK: slt $23, $11, $27      # encoding: [0x01,0x7b,0xb8,0x2a]
+        slti      $s1,$10,9489         # CHECK: slti $17, $10, 9489    # encoding: [0x29,0x51,0x25,0x11]
+        sltiu     $25,$25,-15531       # CHECK: sltiu $25, $25, -15531 # encoding: [0x2f,0x39,0xc3,0x55]
+        sltu      $s4,$s5,$11          # CHECK: sltu  $20, $21, $11    # encoding: [0x02,0xab,0xa0,0x2b]
+        sltu      $24,$25,-15531       # CHECK: sltiu $24, $25, -15531 # encoding: [0x2f,0x38,0xc3,0x55]
+        sqrt.d    $f17,$f22
+        sqrt.s    $f0,$f1
+        sra       $s1,15               # CHECK: sra $17, $17, 15       # encoding: [0x00,0x11,0x8b,0xc3]
+        sra       $s1,$s7,15           # CHECK: sra $17, $23, 15       # encoding: [0x00,0x17,0x8b,0xc3]
+        sra       $s1,$s7,$sp          # CHECK: srav $17, $23, $sp     # encoding: [0x03,0xb7,0x88,0x07]
+        srav      $s1,$s7,$sp          # CHECK: srav $17, $23, $sp     # encoding: [0x03,0xb7,0x88,0x07]
+        srl       $2,7                 # CHECK: srl $2, $2, 7          # encoding: [0x00,0x02,0x11,0xc2]
+        srl       $2,$2,7              # CHECK: srl $2, $2, 7          # encoding: [0x00,0x02,0x11,0xc2]
+        srl       $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
+        srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
+        ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
+        sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
+        sub.d     $f18,$f3,$f17
+        sub.s     $f23,$f22,$f22
+        subu      $sp,$s6,$s6
+        suxc1     $f12,$k1($13)
+        sw        $ra,-10160($sp)
+        swc1      $f6,-8465($24)
+        swc2      $25,24880($s0)       # CHECK: swc2 $25, 24880($16)   # encoding: [0xea,0x19,0x61,0x30]
+        swl       $15,13694($s3)
+        swr       $s1,-26590($14)
+        swxc1     $f19,$12($k0)
+        sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
+        sync      1                    # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
+        teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
+        tgei      $s1,5025
+        tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
+        tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
+        tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
+        tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
+        tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
+        tlti      $14,-21059
+        tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
+        tnei      $12,-29647
+        trunc.l.d $f23,$f23
+        trunc.l.s $f28,$f31
+        trunc.w.d $f22,$f15
+        trunc.w.s $f28,$f30
+        xor       $s2,$a0,$s8
+        wsbh      $k1,$9
diff --git a/test/MC/Mips/nabi-regs.s b/test/MC/Mips/nabi-regs.s
index d79df4e..c265809 100644
--- a/test/MC/Mips/nabi-regs.s
+++ b/test/MC/Mips/nabi-regs.s
@@ -7,10 +7,10 @@
 # RUN:   -mcpu=mips64r2 -arch=mips64 | FileCheck %s
 #
 # RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding \
-# RUN:   -mcpu=mips64r2 -arch=mips64 -mattr=-n64,+n32 | FileCheck %s
+# RUN:   -mcpu=mips64r2 -arch=mips64 -target-abi n32 | FileCheck %s
 #
 # RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding \
-# RUN:   -mcpu=mips64r2 -arch=mips64 -mattr=-n64,+n64 | FileCheck %s
+# RUN:   -mcpu=mips64r2 -arch=mips64 -target-abi n64 | FileCheck %s
 
     .text
 foo:
diff --git a/test/MC/Mips/nooddspreg-cmdarg.s b/test/MC/Mips/nooddspreg-cmdarg.s
index 52b040e..74c9d4d 100644
--- a/test/MC/Mips/nooddspreg-cmdarg.s
+++ b/test/MC/Mips/nooddspreg-cmdarg.s
@@ -5,7 +5,7 @@
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ
 
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -mattr=-n64,+n32,+nooddspreg 2> %t0
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n32 -mattr=+nooddspreg 2> %t0
 # RUN: FileCheck %s -check-prefix=INVALID < %t0
 #
 # RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -mattr=+nooddspreg 2> %t0
diff --git a/test/MC/Mips/nooddspreg.s b/test/MC/Mips/nooddspreg.s
index f268ef4..6332b70 100644
--- a/test/MC/Mips/nooddspreg.s
+++ b/test/MC/Mips/nooddspreg.s
@@ -5,7 +5,7 @@
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ
 
-# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -mattr=-n64,n32 2> %t1
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n32 2> %t1
 # RUN: FileCheck %s -check-prefix=INVALID < %t1
 #
 # RUN: not llvm-mc %s -arch=mips -mcpu=mips64 2> %t2
diff --git a/test/MC/Mips/octeon-instructions.s b/test/MC/Mips/octeon-instructions.s
index 2922744..34830c0 100644
--- a/test/MC/Mips/octeon-instructions.s
+++ b/test/MC/Mips/octeon-instructions.s
@@ -3,10 +3,18 @@
 # CHECK: baddu $9, $6, $7             # encoding: [0x70,0xc7,0x48,0x28]
 # CHECK: baddu $17, $18, $19          # encoding: [0x72,0x53,0x88,0x28]
 # CHECK: baddu $2, $2, $3             # encoding: [0x70,0x43,0x10,0x28]
+# CHECK: bbit0 $19, 22, foo           # encoding: [0xca,0x76,A,A]
+# CHECK: bbit032 $fp, 11, foo         # encoding: [0xdb,0xcb,A,A]
+# CHECK: bbit032 $8, 10, foo          # encoding: [0xd9,0x0a,A,A]
+# CHECK: bbit1 $3, 31, foo            # encoding: [0xe8,0x7f,A,A]
+# CHECK: bbit132 $24, 10, foo         # encoding: [0xfb,0x0a,A,A]
+# CHECK: bbit132 $14, 14, foo         # encoding: [0xf9,0xce,A,A]
 # CHECK: cins  $25, $10, 22, 2        # encoding: [0x71,0x59,0x15,0xb2]
 # CHECK: cins  $9, $9, 17, 29         # encoding: [0x71,0x29,0xec,0x72]
 # CHECK: cins32 $15, $2, 18, 8        # encoding: [0x70,0x4f,0x44,0xb3]
 # CHECK: cins32 $22, $22, 9, 22       # encoding: [0x72,0xd6,0xb2,0x73]
+# CHECK: cins32 $24, $ra, 0, 31       # encoding: [0x73,0xf8,0xf8,0x33]
+# CHECK: cins32 $15, $15, 5, 5        # encoding: [0x71,0xef,0x29,0x73]
 # CHECK: dmul  $9, $6, $7             # encoding: [0x70,0xc7,0x48,0x03]
 # CHECK: dmul  $19, $24, $25          # encoding: [0x73,0x19,0x98,0x03]
 # CHECK: dmul  $9, $9, $6             # encoding: [0x71,0x26,0x48,0x03]
@@ -18,6 +26,8 @@
 # CHECK: exts  $15, $15, 17, 6        # encoding: [0x71,0xef,0x34,0x7a]
 # CHECK: exts32 $4, $13, 10, 8        # encoding: [0x71,0xa4,0x42,0xbb]
 # CHECK: exts32 $15, $15, 11, 20      # encoding: [0x71,0xef,0xa2,0xfb]
+# CHECK: exts32 $7, $4, 22, 9         # encoding: [0x70,0x87,0x4d,0xbb]
+# CHECK: exts32 $25, $25, 5, 25       # encoding: [0x73,0x39,0xc9,0x7b]
 # CHECK: mtm0  $15                    # encoding: [0x71,0xe0,0x00,0x08]
 # CHECK: mtm1  $16                    # encoding: [0x72,0x00,0x00,0x0c]
 # CHECK: mtm2  $17                    # encoding: [0x72,0x20,0x00,0x0d]
@@ -46,13 +56,22 @@
 # CHECK: vmulu $sp, $10, $17          # encoding: [0x71,0x51,0xe8,0x0f]
 # CHECK: vmulu $27, $27, $6           # encoding: [0x73,0x66,0xd8,0x0f]
 
+foo:
   baddu $9, $6, $7
   baddu $17, $18, $19
   baddu $2, $3
+  bbit0 $19, 22, foo
+  bbit032 $30, 11, foo
+  bbit0 $8, 42, foo
+  bbit1 $3, 31, foo
+  bbit132 $24, 10, foo
+  bbit1 $14, 46, foo
   cins  $25, $10, 22, 2
   cins  $9, 17, 29
   cins32 $15, $2, 18, 8
   cins32 $22, 9, 22
+  cins  $24, $31, 32, 31
+  cins  $15, 37, 5
   dmul  $9, $6, $7
   dmul  $19, $24, $25
   dmul  $9, $6
@@ -64,6 +83,8 @@
   exts  $15, 17, 6
   exts32 $4, $13, 10, 8
   exts32 $15, 11, 20
+  exts  $7, $4, 54, 9
+  exts  $25, 37, 25
   mtm0  $15
   mtm1  $16
   mtm2  $17
diff --git a/test/MC/Mips/oddspreg.s b/test/MC/Mips/oddspreg.s
index 32ba9e0..a3902f6 100644
--- a/test/MC/Mips/oddspreg.s
+++ b/test/MC/Mips/oddspreg.s
@@ -5,10 +5,10 @@
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ-ALL -check-prefix=CHECK-OBJ-O32
 #
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -mattr=-n64,+n32 | \
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n32 | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -mattr=-n64,+n32 -filetype=obj -o - | \
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n32 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ-ALL -check-prefix=CHECK-OBJ-N32
 
@@ -25,7 +25,7 @@
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ-ALL -check-prefix=CHECK-OBJ-O32
 #
-# RUN: llvm-mc /dev/null -arch=mips64 -mcpu=mips64 -mattr=-n64,+n32 -filetype=obj -o - | \
+# RUN: llvm-mc /dev/null -arch=mips64 -mcpu=mips64 -target-abi n32 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ-ALL -check-prefix=CHECK-OBJ-N32
 
diff --git a/test/MC/Mips/set-arch.s b/test/MC/Mips/set-arch.s
index 6267468..834718c 100644
--- a/test/MC/Mips/set-arch.s
+++ b/test/MC/Mips/set-arch.s
@@ -16,12 +16,24 @@
     clo         $2, $2
     .set arch=mips32r2
     rotr        $2, $2, 15
+    .set arch=mips32
+    .set arch=mips32r3
+    rotr        $2, $2, 15
+    .set arch=mips32
+    .set arch=mips32r5
+    rotr        $2, $2, 15
     .set arch=mips32r6
     mod         $2, $4, $6
     .set arch=mips64
     daddi       $2, $2, 10
     .set arch=mips64r2
     drotr32     $1, $14, 15
+    .set arch=mips64
+    .set arch=mips64r3
+    drotr32     $1, $14, 15
+    .set arch=mips64
+    .set arch=mips64r5
+    drotr32     $1, $14, 15
     .set arch=mips64r6
     mod         $2, $4, $6
     .set arch=cnmips
diff --git a/test/MC/Mips/set-at-directive-explicit-at.s b/test/MC/Mips/set-at-directive-explicit-at.s
index 797a2b7..28a7091 100644
--- a/test/MC/Mips/set-at-directive-explicit-at.s
+++ b/test/MC/Mips/set-at-directive-explicit-at.s
@@ -15,6 +15,16 @@ foo:
 # WARNINGS: :[[@LINE+2]]:11: warning: used $at without ".set noat"
     .set    at=$1
     jr    $1
+
+# CHECK:   jr    $1                      # encoding: [0x08,0x00,0x20,0x00]
+# WARNINGS: :[[@LINE+2]]:11: warning: used $at without ".set noat"
+    .set    at=$at
+    jr    $at
+
+# CHECK:   jr    $1                      # encoding: [0x08,0x00,0x20,0x00]
+# WARNINGS: :[[@LINE+2]]:11: warning: used $at without ".set noat"
+    .set    at=$at
+    jr    $1
 # WARNINGS-NOT: warning: used $at without ".set noat"
 
 # CHECK:   jr    $1                      # encoding: [0x08,0x00,0x20,0x00]
diff --git a/test/MC/Mips/set-at-directive.s b/test/MC/Mips/set-at-directive.s
index 7e93f76..1bfc473 100644
--- a/test/MC/Mips/set-at-directive.s
+++ b/test/MC/Mips/set-at-directive.s
@@ -4,156 +4,187 @@
 # for ".set at" and set the correct value.
     .text
 foo:
+# CHECK: .set at=$1
 # CHECK: lui  $1, 1
 # CHECK: addu $1, $1, $2
 # CHECK: lw   $2, 0($1)
     .set    at=$1
         lw      $2, 65536($2)
+# CHECK: .set at=$2
 # CHECK: lui  $2, 1
 # CHECK: addu $2, $2, $1
 # CHECK: lw   $1, 0($2)
     .set    at=$2
         lw      $1, 65536($1)
+# CHECK: .set at=$3
 # CHECK: lui  $3, 1
 # CHECK: addu $3, $3, $1
 # CHECK: lw   $1, 0($3)
     .set    at=$3
         lw      $1, 65536($1)
+# CHECK: .set at=$4
 # CHECK: lui  $4, 1
 # CHECK: addu $4, $4, $1
 # CHECK: lw   $1, 0($4)
     .set    at=$a0
         lw      $1, 65536($1)
+# CHECK: .set at=$5
 # CHECK: lui  $5, 1
 # CHECK: addu $5, $5, $1
 # CHECK: lw   $1, 0($5)
     .set    at=$a1
         lw      $1, 65536($1)
+# CHECK: .set at=$6
 # CHECK: lui  $6, 1
 # CHECK: addu $6, $6, $1
 # CHECK: lw   $1, 0($6)
     .set    at=$a2
         lw      $1, 65536($1)
+# CHECK: .set at=$7
 # CHECK: lui  $7, 1
 # CHECK: addu $7, $7, $1
 # CHECK: lw   $1, 0($7)
     .set    at=$a3
         lw      $1, 65536($1)
+# CHECK: .set at=$8
 # CHECK: lui  $8, 1
 # CHECK: addu $8, $8, $1
 # CHECK: lw   $1, 0($8)
     .set    at=$8
         lw      $1, 65536($1)
+# CHECK: .set at=$9
 # CHECK: lui  $9, 1
 # CHECK: addu $9, $9, $1
 # CHECK: lw   $1, 0($9)
     .set    at=$9
         lw      $1, 65536($1)
+# CHECK: .set at=$10
 # CHECK: lui  $10, 1
 # CHECK: addu $10, $10, $1
 # CHECK: lw   $1, 0($10)
     .set    at=$10
         lw      $1, 65536($1)
+# CHECK: .set at=$11
 # CHECK: lui  $11, 1
 # CHECK: addu $11, $11, $1
 # CHECK: lw   $1, 0($11)
     .set    at=$11
         lw      $1, 65536($1)
+# CHECK: .set at=$12
 # CHECK: lui  $12, 1
 # CHECK: addu $12, $12, $1
 # CHECK: lw   $1, 0($12)
     .set    at=$12
         lw      $1, 65536($1)
+# CHECK: .set at=$13
 # CHECK: lui  $13, 1
 # CHECK: addu $13, $13, $1
 # CHECK: lw   $1, 0($13)
     .set    at=$13
         lw      $1, 65536($1)
+# CHECK: .set at=$14
 # CHECK: lui  $14, 1
 # CHECK: addu $14, $14, $1
 # CHECK: lw   $1, 0($14)
     .set    at=$14
         lw      $1, 65536($1)
+# CHECK: .set at=$15
 # CHECK: lui  $15, 1
 # CHECK: addu $15, $15, $1
 # CHECK: lw   $1, 0($15)
     .set    at=$15
         lw      $1, 65536($1)
+# CHECK: .set at=$16
 # CHECK: lui  $16, 1
 # CHECK: addu $16, $16, $1
 # CHECK: lw   $1, 0($16)
     .set    at=$s0
         lw      $1, 65536($1)
+# CHECK: .set at=$17
 # CHECK: lui  $17, 1
 # CHECK: addu $17, $17, $1
 # CHECK: lw   $1, 0($17)
     .set    at=$s1
         lw      $1, 65536($1)
+# CHECK: .set at=$18
 # CHECK: lui  $18, 1
 # CHECK: addu $18, $18, $1
 # CHECK: lw   $1, 0($18)
     .set    at=$s2
         lw      $1, 65536($1)
+# CHECK: .set at=$19
 # CHECK: lui  $19, 1
 # CHECK: addu $19, $19, $1
 # CHECK: lw   $1, 0($19)
     .set    at=$s3
         lw      $1, 65536($1)
+# CHECK: .set at=$20
 # CHECK: lui  $20, 1
 # CHECK: addu $20, $20, $1
 # CHECK: lw   $1, 0($20)
     .set    at=$s4
         lw      $1, 65536($1)
+# CHECK: .set at=$21
 # CHECK: lui  $21, 1
 # CHECK: addu $21, $21, $1
 # CHECK: lw   $1, 0($21)
     .set    at=$s5
         lw      $1, 65536($1)
+# CHECK: .set at=$22
 # CHECK: lui  $22, 1
 # CHECK: addu $22, $22, $1
 # CHECK: lw   $1, 0($22)
     .set    at=$s6
         lw      $1, 65536($1)
+# CHECK: .set at=$23
 # CHECK: lui  $23, 1
 # CHECK: addu $23, $23, $1
 # CHECK: lw   $1, 0($23)
     .set    at=$s7
         lw      $1, 65536($1)
+# CHECK: .set at=$24
 # CHECK: lui  $24, 1
 # CHECK: addu $24, $24, $1
 # CHECK: lw   $1, 0($24)
     .set    at=$24
         lw      $1, 65536($1)
+# CHECK: .set at=$25
 # CHECK: lui  $25, 1
 # CHECK: addu $25, $25, $1
 # CHECK: lw   $1, 0($25)
     .set    at=$25
         lw      $1, 65536($1)
+# CHECK: .set at=$26
 # CHECK: lui  $26, 1
 # CHECK: addu $26, $26, $1
 # CHECK: lw   $1, 0($26)
     .set    at=$26
         lw      $1, 65536($1)
+# CHECK: .set at=$27
 # CHECK: lui  $27, 1
 # CHECK: addu $27, $27, $1
 # CHECK: lw   $1, 0($27)
     .set    at=$27
         lw      $1, 65536($1)
+# CHECK: .set at=$28
 # CHECK: lui  $gp, 1
 # CHECK: addu $gp, $gp, $1
 # CHECK: lw   $1, 0($gp)
     .set    at=$gp
         lw      $1, 65536($1)
+# CHECK: .set at=$30
 # CHECK: lui  $fp, 1
 # CHECK: addu $fp, $fp, $1
 # CHECK: lw   $1, 0($fp)
     .set    at=$fp
         lw      $1, 65536($1)
+# CHECK: .set at=$29
 # CHECK: lui  $sp, 1
 # CHECK: addu $sp, $sp, $1
 # CHECK: lw   $1, 0($sp)
     .set    at=$sp
         lw      $1, 65536($1)
+# CHECK: .set at=$31
 # CHECK: lui  $ra, 1
 # CHECK: addu $ra, $ra, $1
 # CHECK: lw   $1, 0($ra)
diff --git a/test/MC/Mips/set-at-noat-bad-syntax.s b/test/MC/Mips/set-at-noat-bad-syntax.s
new file mode 100644
index 0000000..47f5be7
--- /dev/null
+++ b/test/MC/Mips/set-at-noat-bad-syntax.s
@@ -0,0 +1,29 @@
+# RUN: not llvm-mc %s -triple=mips-unknown-unknown -mcpu=mips32 2>%t1
+# RUN: FileCheck %s < %t1
+
+.set at~
+# CHECK: error: unexpected token, expected equals sign
+
+.set at=
+# CHECK: error: no register specified
+
+.set at=~
+# CHECK: error: unexpected token, expected dollar sign '$'
+
+.set at=$
+# CHECK: error: unexpected token, expected identifier or integer
+
+.set at=$-4
+# CHECK: error: unexpected token, expected identifier or integer
+
+.set at=$1000
+# CHECK: error: invalid register
+
+.set at=$foo
+# CHECK: error: invalid register
+
+.set at=$2bar
+# CHECK: error: unexpected token, expected end of statement
+
+.set noat bar
+# CHECK: error: unexpected token, expected end of statement
diff --git a/test/MC/Mips/set-mips-directives-bad.s b/test/MC/Mips/set-mips-directives-bad.s
index 6726987..68a0da8 100644
--- a/test/MC/Mips/set-mips-directives-bad.s
+++ b/test/MC/Mips/set-mips-directives-bad.s
@@ -21,10 +21,22 @@
         rotr    $2,15 # CHECK: error: instruction requires a CPU feature not currently enabled
         .set mips32r2
         mod $2, $4, $6 # CHECK: error:instruction requires a CPU feature not currently enabled
+        .set mips64r3
+        .set mips32r3
+        daddi $2, $2, 10 # CHECK: error: instruction requires a CPU feature not currently enabled
+        .set mips64r3
+        .set mips32r5
+        daddi $2, $2, 10 # CHECK: error: instruction requires a CPU feature not currently enabled
         .set mips32r6
         daddi $2, $2, 10 # CHECK: error: instruction requires a CPU feature not currently enabled
         .set mips64
         drotr32 $1,$14,15 # CHECK: error: instruction requires a CPU feature not currently enabled
         .set mips64r2
         mod $2, $4, $6 # CHECK: error: instruction requires a CPU feature not currently enabled
+        .set mips64r6
+        .set mips64r3
+        mod $2, $4, $6 # CHECK: error: instruction requires a CPU feature not currently enabled
+        .set mips64r6
+        .set mips64r5
+        mod $2, $4, $6 # CHECK: error: instruction requires a CPU feature not currently enabled
 
diff --git a/test/MC/Mips/set-mips-directives.s b/test/MC/Mips/set-mips-directives.s
index 96c2308..5225968 100644
--- a/test/MC/Mips/set-mips-directives.s
+++ b/test/MC/Mips/set-mips-directives.s
@@ -17,12 +17,24 @@
         clo  $2,$2
         .set mips32r2
         rotr    $2,15
+        .set mips32
+        .set mips32r3
+        rotr    $2,15
+        .set mips32
+        .set mips32r5
+        rotr    $2,15
         .set mips32r6
         mod $2, $4, $6
         .set mips64
         daddi $2, $2, 10
         .set mips64r2
         drotr32 $1,$14,15
+        .set mips64
+        .set mips64r3
+        drotr32 $1,$14,15
+        .set mips64
+        .set mips64r5
+        drotr32 $1,$14,15
         .set mips64r6
         mod $2, $4, $6
 
@@ -41,11 +53,23 @@
 # CHECK: clo $2, $2
 # CHECK: .set mips32r2
 # CHECK: rotr $2, $2, 15
+# CHECK: .set mips32
+# CHECK: .set mips32r3
+# CHECK: rotr $2, $2, 15
+# CHECK: .set mips32
+# CHECK: .set mips32r5
+# CHECK: rotr $2, $2, 15
 # CHECK: .set mips32r6
 # CHECK: mod $2, $4, $6
 # CHECK: .set mips64
 # CHECK: daddi $2, $2, 10
 # CHECK: .set mips64r2
 # CHECK:  drotr32 $1, $14, 15
+# CHECK: .set mips64
+# CHECK: .set mips64r3
+# CHECK:  drotr32 $1, $14, 15
+# CHECK: .set mips64
+# CHECK: .set mips64r5
+# CHECK:  drotr32 $1, $14, 15
 # CHECK: .set mips64r6
 # CHECK: mod $2, $4, $6
diff --git a/test/MC/PowerPC/ppc-reloc.s b/test/MC/PowerPC/ppc-reloc.s
index 19dd2a3..e7dd1e2 100644
--- a/test/MC/PowerPC/ppc-reloc.s
+++ b/test/MC/PowerPC/ppc-reloc.s
@@ -7,11 +7,13 @@
 	.align 2
 foo:
 	bl printf@plt
+	bl _GLOBAL_OFFSET_TABLE_@local-4
 .LC1:
 	.size foo, . - foo
 
 # CHECK:      Relocations [
 # CHECK-NEXT:   Section (2) .rela.text {
 # CHECK-NEXT:     0x0 R_PPC_PLTREL24 printf 0x0
+# CHECK-NEXT:     0x4 R_PPC_LOCAL24PC _GLOBAL_OFFSET_TABLE_ 0xFFFFFFFC
 # CHECK-NEXT:   }
 # CHECK-NEXT: ]
diff --git a/test/MC/PowerPC/ppc64-encoding-ext.s b/test/MC/PowerPC/ppc64-encoding-ext.s
index 0ffe0bf..dca8a65 100644
--- a/test/MC/PowerPC/ppc64-encoding-ext.s
+++ b/test/MC/PowerPC/ppc64-encoding-ext.s
@@ -3633,3 +3633,36 @@
 # CHECK-BE: mtspr 280, 2                     # encoding: [0x7c,0x58,0x43,0xa6]
 # CHECK-LE: mtspr 280, 2                     # encoding: [0xa6,0x43,0x58,0x7c]
             mtasr 2
+
+# Load and Store Caching Inhibited Instructions
+# CHECK-BE: lbzcix 21, 5, 7                  # encoding: [0x7e,0xa5,0x3e,0xaa]
+# CHECK-LE: lbzcix 21, 5, 7                  # encoding: [0xaa,0x3e,0xa5,0x7e]
+            lbzcix 21, 5, 7
+# CHECK-BE: lhzcix 21, 5, 7                  # encoding: [0x7e,0xa5,0x3e,0x6a]
+# CHECK-LE: lhzcix 21, 5, 7                  # encoding: [0x6a,0x3e,0xa5,0x7e]
+            lhzcix 21, 5, 7
+# CHECK-BE: lwzcix 21, 5, 7                  # encoding: [0x7e,0xa5,0x3e,0x2a]
+# CHECK-LE: lwzcix 21, 5, 7                  # encoding: [0x2a,0x3e,0xa5,0x7e]
+            lwzcix 21, 5, 7
+# CHECK-BE: ldcix  21, 5, 7                  # encoding: [0x7e,0xa5,0x3e,0xea]
+# CHECK-LE: ldcix  21, 5, 7                  # encoding: [0xea,0x3e,0xa5,0x7e]
+            ldcix  21, 5, 7
+            
+# CHECK-BE: stbcix 21, 5, 7                  # encoding: [0x7e,0xa5,0x3f,0xaa]
+# CHECK-LE: stbcix 21, 5, 7                  # encoding: [0xaa,0x3f,0xa5,0x7e]
+            stbcix 21, 5, 7
+# CHECK-BE: sthcix 21, 5, 7                  # encoding: [0x7e,0xa5,0x3f,0x6a]
+# CHECK-LE: sthcix 21, 5, 7                  # encoding: [0x6a,0x3f,0xa5,0x7e]
+            sthcix 21, 5, 7
+# CHECK-BE: stwcix 21, 5, 7                  # encoding: [0x7e,0xa5,0x3f,0x2a]
+# CHECK-LE: stwcix 21, 5, 7                  # encoding: [0x2a,0x3f,0xa5,0x7e]
+            stwcix 21, 5, 7
+# CHECK-BE: stdcix 21, 5, 7                  # encoding: [0x7e,0xa5,0x3f,0xea]
+# CHECK-LE: stdcix 21, 5, 7                  # encoding: [0xea,0x3f,0xa5,0x7e]
+            stdcix 21, 5, 7
+
+# Processor-Specific Instructions
+# CHECK-BE: attn                             # encoding: [0x00,0x00,0x02,0x00]
+# CHECK-LE: attn                             # encoding: [0x00,0x02,0x00,0x00]
+            attn
+
diff --git a/test/MC/PowerPC/ppc64-encoding-fp.s b/test/MC/PowerPC/ppc64-encoding-fp.s
index c19f9b3..2f4f828 100644
--- a/test/MC/PowerPC/ppc64-encoding-fp.s
+++ b/test/MC/PowerPC/ppc64-encoding-fp.s
@@ -359,15 +359,36 @@
 # CHECK-BE: mffs 2                          # encoding: [0xfc,0x40,0x04,0x8e]
 # CHECK-LE: mffs 2                          # encoding: [0x8e,0x04,0x40,0xfc]
             mffs 2
-# FIXME:    mffs. 2
-
-# FIXME:    mcrfs 2, 3
-
-# FIXME:    mtfsfi 2, 3, 1
-# FIXME:    mtfsfi. 2, 3, 1
-# FIXME:    mtfsf 2, 3, 1, 1
-# FIXME:    mtfsf. 2, 3, 1, 1
-
+# CHECK-BE: mffs. 7                         # encoding: [0xfc,0xe0,0x04,0x8f]
+# CHECK-LE: mffs. 7                         # encoding: [0x8f,0x04,0xe0,0xfc]
+            mffs. 7
+# CHECK-BE: mcrfs 4, 5                      # encoding: [0xfe,0x14,0x00,0x80]
+# CHECK-LE: mcrfs 4, 5                      # encoding: [0x80,0x00,0x14,0xfe]
+            mcrfs 4, 5
+# CHECK-BE: mtfsfi 5, 2, 1                  # encoding: [0xfe,0x81,0x21,0x0c]
+# CHECK-LE: mtfsfi 5, 2, 1                  # encoding: [0x0c,0x21,0x81,0xfe]
+            mtfsfi 5, 2, 1
+# CHECK-BE: mtfsfi. 5, 2, 1                 # encoding: [0xfe,0x81,0x21,0x0d]
+# CHECK-LE: mtfsfi. 5, 2, 1                 # encoding: [0x0d,0x21,0x81,0xfe]
+            mtfsfi. 5, 2, 1
+# CHECK-BE: mtfsfi 6, 2, 0                  # encoding: [0xff,0x00,0x21,0x0c]
+# CHECK-LE: mtfsfi 6, 2, 0                  # encoding: [0x0c,0x21,0x00,0xff]
+            mtfsfi 6, 2
+# CHECK-BE: mtfsfi. 6, 2, 0                 # encoding: [0xff,0x00,0x21,0x0d]
+# CHECK-LE: mtfsfi. 6, 2, 0                 # encoding: [0x0d,0x21,0x00,0xff]
+            mtfsfi. 6, 2
+# CHECK-BE: mtfsf 127, 8, 1, 1              # encoding: [0xfe,0xff,0x45,0x8e]
+# CHECK-LE: mtfsf 127, 8, 1, 1              # encoding: [0x8e,0x45,0xff,0xfe]
+            mtfsf 127, 8, 1, 1
+# CHECK-BE: mtfsf. 125, 8, 1, 1             # encoding: [0xfe,0xfb,0x45,0x8f]
+# CHECK-LE: mtfsf. 125, 8, 1, 1             # encoding: [0x8f,0x45,0xfb,0xfe]
+            mtfsf. 125, 8, 1, 1
+# CHECK-BE: mtfsf 127, 6, 0, 0              # encoding: [0xfc,0xfe,0x35,0x8e]
+# CHECK-LE: mtfsf 127, 6, 0, 0              # encoding: [0x8e,0x35,0xfe,0xfc]
+            mtfsf 127, 6
+# CHECK-BE: mtfsf. 125, 6, 0, 0             # encoding: [0xfc,0xfa,0x35,0x8f]
+# CHECK-LE: mtfsf. 125, 6, 0, 0             # encoding: [0x8f,0x35,0xfa,0xfc]
+            mtfsf. 125, 6
 # CHECK-BE: mtfsb0 31                       # encoding: [0xff,0xe0,0x00,0x8c]
 # CHECK-LE: mtfsb0 31                       # encoding: [0x8c,0x00,0xe0,0xff]
             mtfsb0 31
diff --git a/test/MC/PowerPC/ppc64-encoding-vmx.s b/test/MC/PowerPC/ppc64-encoding-vmx.s
index 3d2df84..7641d1d 100644
--- a/test/MC/PowerPC/ppc64-encoding-vmx.s
+++ b/test/MC/PowerPC/ppc64-encoding-vmx.s
@@ -408,6 +408,15 @@
 # CHECK-BE: vandc 2, 3, 4                   # encoding: [0x10,0x43,0x24,0x44]
 # CHECK-LE: vandc 2, 3, 4                   # encoding: [0x44,0x24,0x43,0x10]
             vandc 2, 3, 4
+# CHECK-BE: veqv 2, 3, 4                    # encoding: [0x10,0x43,0x26,0x84]
+# CHECK-LE: veqv 2, 3, 4                    # encoding: [0x84,0x26,0x43,0x10]
+            veqv 2, 3, 4
+# CHECK-BE: vnand 2, 3, 4                   # encoding: [0x10,0x43,0x25,0x84]
+# CHECK-LE: vnand 2, 3, 4                   # encoding: [0x84,0x25,0x43,0x10]
+            vnand 2, 3, 4
+# CHECK-BE: vorc 2, 3, 4                    # encoding: [0x10,0x43,0x25,0x44]
+# CHECK-LE: vorc 2, 3, 4                    # encoding: [0x44,0x25,0x43,0x10]
+            vorc 2, 3, 4
 # CHECK-BE: vnor 2, 3, 4                    # encoding: [0x10,0x43,0x25,0x04]
 # CHECK-LE: vnor 2, 3, 4                    # encoding: [0x04,0x25,0x43,0x10]
             vnor 2, 3, 4
@@ -543,6 +552,40 @@
 # CHECK-LE: vrsqrtefp 2, 3                  # encoding: [0x4a,0x19,0x40,0x10]
             vrsqrtefp 2, 3
 
+# Vector count leading zero instructions
+# CHECK-BE: vclzb 2, 3                      # encoding: [0x10,0x40,0x1f,0x02]
+# CHECK-LE: vclzb 2, 3                      # encoding: [0x02,0x1f,0x40,0x10]
+            vclzb 2, 3
+
+# CHECK-BE: vclzh 2, 3                      # encoding: [0x10,0x40,0x1f,0x42]
+# CHECK-LE: vclzh 2, 3                      # encoding: [0x42,0x1f,0x40,0x10]
+            vclzh 2, 3
+
+# CHECK-BE: vclzw 2, 3                      # encoding: [0x10,0x40,0x1f,0x82]
+# CHECK-LE: vclzw 2, 3                      # encoding: [0x82,0x1f,0x40,0x10]
+            vclzw 2, 3
+
+# CHECK-BE: vclzd 2, 3                      # encoding: [0x10,0x40,0x1f,0xc2]
+# CHECK-LE: vclzd 2, 3                      # encoding: [0xc2,0x1f,0x40,0x10]
+            vclzd 2, 3                      
+
+# Vector population count instructions
+# CHECK-BE: vpopcntb 2, 3                   # encoding: [0x10,0x40,0x1f,0x03]
+# CHECK-LE: vpopcntb 2, 3                   # encoding: [0x03,0x1f,0x40,0x10]
+            vpopcntb 2, 3
+
+# CHECK-BE: vpopcnth 2, 3                   # encoding: [0x10,0x40,0x1f,0x43]
+# CHECK-LE: vpopcnth 2, 3                   # encoding: [0x43,0x1f,0x40,0x10]
+            vpopcnth 2, 3	
+
+# CHECK-BE: vpopcntw 2, 3                   # encoding: [0x10,0x40,0x1f,0x83]
+# CHECK-LE: vpopcntw 2, 3                   # encoding: [0x83,0x1f,0x40,0x10]
+            vpopcntw 2, 3
+	
+# BCHECK-BE: vpopcntd 2, 3                   # encoding: [0x10,0x40,0x1f,0xC3]
+# BCHECK-LE: vpopcntd 2, 3                   # encoding: [0xC3,0x1f,0x40,0x10]
+#            vpopcntd 2, 3
+	
 # Vector status and control register instructions
 
 # CHECK-BE: mtvscr 2                        # encoding: [0x10,0x00,0x16,0x44]
diff --git a/test/MC/PowerPC/ppc64-encoding.s b/test/MC/PowerPC/ppc64-encoding.s
index d483f9d..d2ac669 100644
--- a/test/MC/PowerPC/ppc64-encoding.s
+++ b/test/MC/PowerPC/ppc64-encoding.s
@@ -612,7 +612,15 @@
 # CHECK-BE: cntlzw. 2, 3                    # encoding: [0x7c,0x62,0x00,0x35]
 # CHECK-LE: cntlzw. 2, 3                    # encoding: [0x35,0x00,0x62,0x7c]
             cntlzw. 2, 3
-# FIXME:    cmpb 2, 3, 4
+# CHECK-BE: cntlzw 2, 3                     # encoding: [0x7c,0x62,0x00,0x34]
+# CHECK-LE: cntlzw 2, 3                     # encoding: [0x34,0x00,0x62,0x7c]
+            cntlz  2, 3
+# CHECK-BE: cntlzw. 2, 3                    # encoding: [0x7c,0x62,0x00,0x35]
+# CHECK-LE: cntlzw. 2, 3                    # encoding: [0x35,0x00,0x62,0x7c]
+            cntlz.  2, 3
+            cmpb 7, 21, 4
+# CHECK-BE: cmpb 7, 21, 4                   # encoding: [0x7e,0xa7,0x23,0xf8]
+# CHECK-LE: cmpb 7, 21, 4                   # encoding: [0xf8,0x23,0xa7,0x7e]
 # FIXME:    popcntb 2, 3
 # CHECK-BE: popcntw 2, 3                    # encoding: [0x7c,0x62,0x02,0xf4]
 # CHECK-LE: popcntw 2, 3                    # encoding: [0xf4,0x02,0x62,0x7c]
diff --git a/test/MC/PowerPC/ppc64-localentry.s b/test/MC/PowerPC/ppc64-localentry.s
index 6d2c120..03f760e 100644
--- a/test/MC/PowerPC/ppc64-localentry.s
+++ b/test/MC/PowerPC/ppc64-localentry.s
@@ -35,6 +35,9 @@ caller_other:
 	nop
 	.size caller_other, .-caller_other
 
+copy1 = callee1
+copy2 = callee2
+
 # Verify that use of .localentry implies ABI version 2
 # CHECK: ElfHeader {
 # CHECK: Flags [ (0x2)
@@ -68,3 +71,19 @@ caller_other:
 # CHECK-NEXT:  Other: 0
 # CHECK-NEXT:  Section: .text
 
+# Verify that symbol assignment copies the Other bits.
+# CHECK:       Name: copy1
+# CHECK-NEXT:  Value:
+# CHECK-NEXT:  Size: 16
+# CHECK-NEXT:  Binding: Local
+# CHECK-NEXT:  Type: Function
+# CHECK-NEXT:  Other: 96
+# CHECK-NEXT:  Section: .text
+# CHECK:       Name: copy2
+# CHECK-NEXT:  Value:
+# CHECK-NEXT:  Size: 8
+# CHECK-NEXT:  Binding: Local
+# CHECK-NEXT:  Type: Function
+# CHECK-NEXT:  Other: 0
+# CHECK-NEXT:  Section: .text
+
diff --git a/test/MC/PowerPC/qpx.s b/test/MC/PowerPC/qpx.s
new file mode 100644
index 0000000..6c92d71
--- /dev/null
+++ b/test/MC/PowerPC/qpx.s
@@ -0,0 +1,251 @@
+# RUN: llvm-mc -triple powerpc64-bgq-linux --show-encoding %s | FileCheck %s
+
+# FIXME: print qvflogical aliases.
+
+# CHECK: qvfabs 3, 5                     # encoding: [0x10,0x60,0x2a,0x10]
+         qvfabs 3, 5
+# CHECK: qvfadd 3, 4, 5                  # encoding: [0x10,0x64,0x28,0x2a]
+         qvfadd 3, 4, 5
+# CHECK: qvfadds 3, 4, 5                 # encoding: [0x00,0x64,0x28,0x2a]
+         qvfadds 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 4           # encoding: [0x10,0x64,0x2a,0x08]
+         qvfandc 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 1           # encoding: [0x10,0x64,0x28,0x88]
+         qvfand 3, 4, 5
+# CHECK: qvfcfid 3, 5                    # encoding: [0x10,0x60,0x2e,0x9c]
+         qvfcfid 3, 5
+# CHECK: qvfcfids 3, 5                   # encoding: [0x00,0x60,0x2e,0x9c]
+         qvfcfids 3, 5
+# CHECK: qvfcfidu 3, 5                   # encoding: [0x10,0x60,0x2f,0x9c]
+         qvfcfidu 3, 5
+# CHECK: qvfcfidus 3, 5                  # encoding: [0x00,0x60,0x2f,0x9c]
+         qvfcfidus 3, 5
+# CHECK: qvflogical 3, 3, 3, 0           # encoding: [0x10,0x63,0x18,0x08]
+         qvfclr 3
+# CHECK: qvfcpsgn 3, 4, 5                # encoding: [0x10,0x64,0x28,0x10]
+         qvfcpsgn 3, 4, 5
+# CHECK: qvflogical 3, 4, 4, 5           # encoding: [0x10,0x64,0x22,0x88]
+         qvfctfb 3, 4
+# CHECK: qvfctid 3, 5                    # encoding: [0x10,0x60,0x2e,0x5c]
+         qvfctid 3, 5
+# CHECK: qvfctidu 3, 5                   # encoding: [0x10,0x60,0x2f,0x5c]
+         qvfctidu 3, 5
+# CHECK: qvfctiduz 3, 5                  # encoding: [0x10,0x60,0x2f,0x5e]
+         qvfctiduz 3, 5
+# CHECK: qvfctidz 3, 5                   # encoding: [0x10,0x60,0x2e,0x5e]
+         qvfctidz 3, 5
+# CHECK: qvfctiw 3, 5                    # encoding: [0x10,0x60,0x28,0x1c]
+         qvfctiw 3, 5
+# CHECK: qvfctiwu 3, 5                   # encoding: [0x10,0x60,0x29,0x1c]
+         qvfctiwu 3, 5
+# CHECK: qvfctiwuz 3, 5                  # encoding: [0x10,0x60,0x29,0x1e]
+         qvfctiwuz 3, 5
+# CHECK: qvfctiwz 3, 5                   # encoding: [0x10,0x60,0x28,0x1e]
+         qvfctiwz 3, 5
+# CHECK: qvflogical 3, 4, 5, 9           # encoding: [0x10,0x64,0x2c,0x88]
+         qvfequ 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 12          # encoding: [0x10,0x64,0x2e,0x08]
+         qvflogical 3, 4, 5, 12
+# CHECK: qvfmadd 3, 4, 6, 5              # encoding: [0x10,0x64,0x29,0xba]
+         qvfmadd 3, 4, 6, 5
+# CHECK: qvfmadds 3, 4, 6, 5             # encoding: [0x00,0x64,0x29,0xba]
+         qvfmadds 3, 4, 6, 5
+# CHECK: qvfmr 3, 5                      # encoding: [0x10,0x60,0x28,0x90]
+         qvfmr 3, 5
+# CHECK: qvfmsub 3, 4, 6, 5              # encoding: [0x10,0x64,0x29,0xb8]
+         qvfmsub 3, 4, 6, 5
+# CHECK: qvfmsubs 3, 4, 6, 5             # encoding: [0x00,0x64,0x29,0xb8]
+         qvfmsubs 3, 4, 6, 5
+# CHECK: qvfmul 3, 4, 6                  # encoding: [0x10,0x64,0x01,0xb2]
+         qvfmul 3, 4, 6
+# CHECK: qvfmuls 3, 4, 6                 # encoding: [0x00,0x64,0x01,0xb2]
+         qvfmuls 3, 4, 6
+# CHECK: qvfnabs 3, 5                    # encoding: [0x10,0x60,0x29,0x10]
+         qvfnabs 3, 5
+# CHECK: qvflogical 3, 4, 5, 14          # encoding: [0x10,0x64,0x2f,0x08]
+         qvfnand 3, 4, 5
+# CHECK: qvfneg 3, 5                     # encoding: [0x10,0x60,0x28,0x50]
+         qvfneg 3, 5
+# CHECK: qvfnmadd 3, 4, 6, 5             # encoding: [0x10,0x64,0x29,0xbe]
+         qvfnmadd 3, 4, 6, 5
+# CHECK: qvfnmadds 3, 4, 6, 5            # encoding: [0x00,0x64,0x29,0xbe]
+         qvfnmadds 3, 4, 6, 5
+# CHECK: qvfnmsub 3, 4, 6, 5             # encoding: [0x10,0x64,0x29,0xbc]
+         qvfnmsub 3, 4, 6, 5
+# CHECK: qvfnmsubs 3, 4, 6, 5            # encoding: [0x00,0x64,0x29,0xbc]
+         qvfnmsubs 3, 4, 6, 5
+# CHECK: qvflogical 3, 4, 5, 8           # encoding: [0x10,0x64,0x2c,0x08]
+         qvfnor 3, 4, 5
+# CHECK: qvflogical 3, 4, 4, 10          # encoding: [0x10,0x64,0x25,0x08]
+         qvfnot 3, 4
+# CHECK: qvflogical 3, 4, 5, 13          # encoding: [0x10,0x64,0x2e,0x88]
+         qvforc 3, 4, 5
+# CHECK: qvflogical 3, 4, 5, 7           # encoding: [0x10,0x64,0x2b,0x88]
+         qvfor 3, 4, 5
+# CHECK: qvfperm 3, 4, 5, 6              # encoding: [0x10,0x64,0x29,0x8c]
+         qvfperm 3, 4, 5, 6
+# CHECK: qvfre 3, 5                      # encoding: [0x10,0x60,0x28,0x30]
+         qvfre 3, 5
+# CHECK: qvfres 3, 5                     # encoding: [0x00,0x60,0x28,0x30]
+         qvfres 3, 5
+# CHECK: qvfrim 3, 5                     # encoding: [0x10,0x60,0x2b,0xd0]
+         qvfrim 3, 5
+# CHECK: qvfrin 3, 5                     # encoding: [0x10,0x60,0x2b,0x10]
+         qvfrin 3, 5
+# CHECK: qvfrip 3, 5                     # encoding: [0x10,0x60,0x2b,0x90]
+         qvfrip 3, 5
+# CHECK: qvfriz 3, 5                     # encoding: [0x10,0x60,0x2b,0x50]
+         qvfriz 3, 5
+# CHECK: qvfrsp 3, 5                     # encoding: [0x10,0x60,0x28,0x18]
+         qvfrsp 3, 5
+# CHECK: qvfrsqrte 3, 5                  # encoding: [0x10,0x60,0x28,0x34]
+         qvfrsqrte 3, 5
+# CHECK: qvfrsqrtes 3, 5                 # encoding: [0x00,0x60,0x28,0x34]
+         qvfrsqrtes 3, 5
+# CHECK: qvfsel 3, 4, 6, 5               # encoding: [0x10,0x64,0x29,0xae]
+         qvfsel 3, 4, 6, 5
+# CHECK: qvflogical 3, 3, 3, 15          # encoding: [0x10,0x63,0x1f,0x88]
+         qvfset 3
+# CHECK: qvfsub 3, 4, 5                  # encoding: [0x10,0x64,0x28,0x28]
+         qvfsub 3, 4, 5
+# CHECK: qvfsubs 3, 4, 5                 # encoding: [0x00,0x64,0x28,0x28]
+         qvfsubs 3, 4, 5
+# CHECK: qvfxmadd 3, 4, 6, 5             # encoding: [0x10,0x64,0x29,0x92]
+         qvfxmadd 3, 4, 6, 5
+# CHECK: qvfxmadds 3, 4, 6, 5            # encoding: [0x00,0x64,0x29,0x92]
+         qvfxmadds 3, 4, 6, 5
+# CHECK: qvfxmul 3, 4, 6                 # encoding: [0x10,0x64,0x01,0xa2]
+         qvfxmul 3, 4, 6
+# CHECK: qvfxmuls 3, 4, 6                # encoding: [0x00,0x64,0x01,0xa2]
+         qvfxmuls 3, 4, 6
+# CHECK: qvflogical 3, 4, 5, 6           # encoding: [0x10,0x64,0x2b,0x08]
+         qvfxor 3, 4, 5
+# CHECK: qvfxxcpnmadd 3, 4, 6, 5         # encoding: [0x10,0x64,0x29,0x86]
+         qvfxxcpnmadd 3, 4, 6, 5
+# CHECK: qvfxxcpnmadds 3, 4, 6, 5        # encoding: [0x00,0x64,0x29,0x86]
+         qvfxxcpnmadds 3, 4, 6, 5
+# CHECK: qvfxxmadd 3, 4, 6, 5            # encoding: [0x10,0x64,0x29,0x82]
+         qvfxxmadd 3, 4, 6, 5
+# CHECK: qvfxxmadds 3, 4, 6, 5           # encoding: [0x00,0x64,0x29,0x82]
+         qvfxxmadds 3, 4, 6, 5
+# CHECK: qvfxxnpmadd 3, 4, 6, 5          # encoding: [0x10,0x64,0x29,0x96]
+         qvfxxnpmadd 3, 4, 6, 5
+# CHECK: qvfxxnpmadds 3, 4, 6, 5         # encoding: [0x00,0x64,0x29,0x96]
+         qvfxxnpmadds 3, 4, 6, 5
+# CHECK: qvlfcduxa 3, 9, 11              # encoding: [0x7c,0x69,0x58,0xcf]
+         qvlfcduxa 3, 9, 11
+# CHECK: qvlfcdux 3, 9, 11               # encoding: [0x7c,0x69,0x58,0xce]
+         qvlfcdux 3, 9, 11
+# CHECK: qvlfcdxa 3, 10, 11              # encoding: [0x7c,0x6a,0x58,0x8f]
+         qvlfcdxa 3, 10, 11
+# CHECK: qvlfcdx 3, 10, 11               # encoding: [0x7c,0x6a,0x58,0x8e]
+         qvlfcdx 3, 10, 11
+# CHECK: qvlfcsuxa 3, 9, 11              # encoding: [0x7c,0x69,0x58,0x4f]
+         qvlfcsuxa 3, 9, 11
+# CHECK: qvlfcsux 3, 9, 11               # encoding: [0x7c,0x69,0x58,0x4e]
+         qvlfcsux 3, 9, 11
+# CHECK: qvlfcsxa 3, 10, 11              # encoding: [0x7c,0x6a,0x58,0x0f]
+         qvlfcsxa 3, 10, 11
+# CHECK: qvlfcsx 3, 10, 11               # encoding: [0x7c,0x6a,0x58,0x0e]
+         qvlfcsx 3, 10, 11
+# CHECK: qvlfduxa 3, 9, 11               # encoding: [0x7c,0x69,0x5c,0xcf]
+         qvlfduxa 3, 9, 11
+# CHECK: qvlfdux 3, 9, 11                # encoding: [0x7c,0x69,0x5c,0xce]
+         qvlfdux 3, 9, 11
+# CHECK: qvlfdxa 3, 10, 11               # encoding: [0x7c,0x6a,0x5c,0x8f]
+         qvlfdxa 3, 10, 11
+# CHECK: qvlfdx 3, 10, 11                # encoding: [0x7c,0x6a,0x5c,0x8e]
+         qvlfdx 3, 10, 11
+# CHECK: qvlfiwaxa 3, 10, 11             # encoding: [0x7c,0x6a,0x5e,0xcf]
+         qvlfiwaxa 3, 10, 11
+# CHECK: qvlfiwax 3, 10, 11              # encoding: [0x7c,0x6a,0x5e,0xce]
+         qvlfiwax 3, 10, 11
+# CHECK: qvlfiwzxa 3, 10, 11             # encoding: [0x7c,0x6a,0x5e,0x8f]
+         qvlfiwzxa 3, 10, 11
+# CHECK: qvlfiwzx 3, 10, 11              # encoding: [0x7c,0x6a,0x5e,0x8e]
+         qvlfiwzx 3, 10, 11
+# CHECK: qvlfsuxa 3, 9, 11               # encoding: [0x7c,0x69,0x5c,0x4f]
+         qvlfsuxa 3, 9, 11
+# CHECK: qvlfsux 3, 9, 11                # encoding: [0x7c,0x69,0x5c,0x4e]
+         qvlfsux 3, 9, 11
+# CHECK: qvlfsxa 3, 10, 11               # encoding: [0x7c,0x6a,0x5c,0x0f]
+         qvlfsxa 3, 10, 11
+# CHECK: qvlfsx 3, 10, 11                # encoding: [0x7c,0x6a,0x5c,0x0e]
+         qvlfsx 3, 10, 11
+# CHECK: qvlpcldx 3, 10, 11              # encoding: [0x7c,0x6a,0x5c,0x8c]
+         qvlpcldx 3, 10, 11
+# CHECK: qvlpclsx 3, 10, 11              # encoding: [0x7c,0x6a,0x5c,0x0c]
+         qvlpclsx 3, 10, 11
+# CHECK: qvlpcrdx 3, 10, 11              # encoding: [0x7c,0x6a,0x58,0x8c]
+         qvlpcrdx 3, 10, 11
+# CHECK: qvlpcrsx 3, 10, 11              # encoding: [0x7c,0x6a,0x58,0x0c]
+         qvlpcrsx 3, 10, 11
+# CHECK: qvstfcduxa 2, 9, 11             # encoding: [0x7c,0x49,0x59,0xcf]
+         qvstfcduxa 2, 9, 11
+# CHECK: qvstfcduxia 2, 9, 11            # encoding: [0x7c,0x49,0x59,0xcb]
+         qvstfcduxia 2, 9, 11
+# CHECK: qvstfcduxi 2, 9, 11             # encoding: [0x7c,0x49,0x59,0xca]
+         qvstfcduxi 2, 9, 11
+# CHECK: qvstfcdux 2, 9, 11              # encoding: [0x7c,0x49,0x59,0xce]
+         qvstfcdux 2, 9, 11
+# CHECK: qvstfcdxa 2, 10, 11             # encoding: [0x7c,0x4a,0x59,0x8f]
+         qvstfcdxa 2, 10, 11
+# CHECK: qvstfcdxia 2, 10, 11            # encoding: [0x7c,0x4a,0x59,0x8b]
+         qvstfcdxia 2, 10, 11
+# CHECK: qvstfcdxi 2, 10, 11             # encoding: [0x7c,0x4a,0x59,0x8a]
+         qvstfcdxi 2, 10, 11
+# CHECK: qvstfcdx 2, 10, 11              # encoding: [0x7c,0x4a,0x59,0x8e]
+         qvstfcdx 2, 10, 11
+# CHECK: qvstfcsuxa 2, 9, 11             # encoding: [0x7c,0x49,0x59,0x4f]
+         qvstfcsuxa 2, 9, 11
+# CHECK: qvstfcsuxia 2, 9, 11            # encoding: [0x7c,0x49,0x59,0x4b]
+         qvstfcsuxia 2, 9, 11
+# CHECK: qvstfcsuxi 2, 9, 11             # encoding: [0x7c,0x49,0x59,0x4a]
+         qvstfcsuxi 2, 9, 11
+# CHECK: qvstfcsux 2, 9, 11              # encoding: [0x7c,0x49,0x59,0x4e]
+         qvstfcsux 2, 9, 11
+# CHECK: qvstfcsxa 2, 10, 11             # encoding: [0x7c,0x4a,0x59,0x0f]
+         qvstfcsxa 2, 10, 11
+# CHECK: qvstfcsxia 2, 10, 11            # encoding: [0x7c,0x4a,0x59,0x0b]
+         qvstfcsxia 2, 10, 11
+# CHECK: qvstfcsxi 2, 10, 11             # encoding: [0x7c,0x4a,0x59,0x0a]
+         qvstfcsxi 2, 10, 11
+# CHECK: qvstfcsx 2, 10, 11              # encoding: [0x7c,0x4a,0x59,0x0e]
+         qvstfcsx 2, 10, 11
+# CHECK: qvstfduxa 2, 9, 11              # encoding: [0x7c,0x49,0x5d,0xcf]
+         qvstfduxa 2, 9, 11
+# CHECK: qvstfduxia 2, 9, 11             # encoding: [0x7c,0x49,0x5d,0xcb]
+         qvstfduxia 2, 9, 11
+# CHECK: qvstfduxi 2, 9, 11              # encoding: [0x7c,0x49,0x5d,0xca]
+         qvstfduxi 2, 9, 11
+# CHECK: qvstfdux 2, 9, 11               # encoding: [0x7c,0x49,0x5d,0xce]
+         qvstfdux 2, 9, 11
+# CHECK: qvstfdxa 2, 10, 11              # encoding: [0x7c,0x4a,0x5d,0x8f]
+         qvstfdxa 2, 10, 11
+# CHECK: qvstfdxia 2, 10, 11             # encoding: [0x7c,0x4a,0x5d,0x8b]
+         qvstfdxia 2, 10, 11
+# CHECK: qvstfdxi 2, 10, 11              # encoding: [0x7c,0x4a,0x5d,0x8a]
+         qvstfdxi 2, 10, 11
+# CHECK: qvstfdx 2, 10, 11               # encoding: [0x7c,0x4a,0x5d,0x8e]
+         qvstfdx 2, 10, 11
+# CHECK: qvstfiwxa 2, 10, 11             # encoding: [0x7c,0x4a,0x5f,0x8f]
+         qvstfiwxa 2, 10, 11
+# CHECK: qvstfiwx 2, 10, 11              # encoding: [0x7c,0x4a,0x5f,0x8e]
+         qvstfiwx 2, 10, 11
+# CHECK: qvstfsuxa 2, 9, 11              # encoding: [0x7c,0x49,0x5d,0x4f]
+         qvstfsuxa 2, 9, 11
+# CHECK: qvstfsuxia 2, 9, 11             # encoding: [0x7c,0x49,0x5d,0x4b]
+         qvstfsuxia 2, 9, 11
+# CHECK: qvstfsuxi 2, 9, 11              # encoding: [0x7c,0x49,0x5d,0x4a]
+         qvstfsuxi 2, 9, 11
+# CHECK: qvstfsux 2, 9, 11               # encoding: [0x7c,0x49,0x5d,0x4e]
+         qvstfsux 2, 9, 11
+# CHECK: qvstfsxa 2, 10, 11              # encoding: [0x7c,0x4a,0x5d,0x0f]
+         qvstfsxa 2, 10, 11
+# CHECK: qvstfsxia 2, 10, 11             # encoding: [0x7c,0x4a,0x5d,0x0b]
+         qvstfsxia 2, 10, 11
+# CHECK: qvstfsxi 2, 10, 11              # encoding: [0x7c,0x4a,0x5d,0x0a]
+         qvstfsxi 2, 10, 11
+# CHECK: qvstfsx 2, 10, 11               # encoding: [0x7c,0x4a,0x5d,0x0e]
+         qvstfsx 2, 10, 11
+
diff --git a/test/MC/PowerPC/vsx.s b/test/MC/PowerPC/vsx.s
index b355ba3..4a0053d 100644
--- a/test/MC/PowerPC/vsx.s
+++ b/test/MC/PowerPC/vsx.s
@@ -403,6 +403,15 @@
 # CHECK-BE: xxland 7, 63, 27                   # encoding: [0xf0,0xff,0xdc,0x14]
 # CHECK-LE: xxland 7, 63, 27                   # encoding: [0x14,0xdc,0xff,0xf0]
             xxland 7, 63, 27
+# CHECK-BE: xxleqv 7, 63, 27                   # encoding: [0xf0,0xff,0xdd,0xd4]
+# CHECK-LE: xxleqv 7, 63, 27                   # encoding: [0xd4,0xdd,0xff,0xf0]
+            xxleqv 7, 63, 27
+# CHECK-BE: xxlnand 7, 63, 27                  # encoding: [0xf0,0xff,0xdd,0x94]
+# CHECK-LE: xxlnand 7, 63, 27                  # encoding: [0x94,0xdd,0xff,0xf0]
+            xxlnand 7, 63, 27
+# CHECK-BE: xxlorc 7, 63, 27                   # encoding: [0xf0,0xff,0xdd,0x54]
+# CHECK-LE: xxlorc 7, 63, 27                   # encoding: [0x54,0xdd,0xff,0xf0]
+            xxlorc 7, 63, 27
 # CHECK-BE: xxlandc 7, 63, 27                  # encoding: [0xf0,0xff,0xdc,0x54]
 # CHECK-LE: xxlandc 7, 63, 27                  # encoding: [0x54,0xdc,0xff,0xf0]
             xxlandc 7, 63, 27
diff --git a/test/MC/R600/sopp.s b/test/MC/R600/sopp.s
index 65fc97b..0f186b1 100644
--- a/test/MC/R600/sopp.s
+++ b/test/MC/R600/sopp.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -arch=r600 -mcpu=SI  -show-encoding %s | FileCheck %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=SI  -show-encoding %s | FileCheck %s
 
   s_nop 1            // CHECK: s_nop 1 ; encoding: [0x01,0x00,0x80,0xbf]
   s_endpgm           // CHECK: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
diff --git a/test/MC/SystemZ/fixups.s b/test/MC/SystemZ/fixups.s
new file mode 100644
index 0000000..8354121
--- /dev/null
+++ b/test/MC/SystemZ/fixups.s
@@ -0,0 +1,119 @@
+
+# RUN: llvm-mc -triple s390x-unknown-unknown --show-encoding %s | FileCheck %s
+
+# RUN: llvm-mc -triple s390x-unknown-unknown -filetype=obj %s | \
+# RUN: llvm-readobj -r | FileCheck %s -check-prefix=CHECK-REL
+
+# CHECK: larl %r14, target                      # encoding: [0xc0,0xe0,A,A,A,A]
+# CHECK-NEXT:                                   # fixup A - offset: 2, value: target+2, kind: FK_390_PC32DBL
+# CHECK-REL:                                    0x{{[0-9A-F]*2}} R_390_PC32DBL target 0x2
+	.align 16
+	larl %r14, target
+
+# CHECK: larl %r14, target@GOT                  # encoding: [0xc0,0xe0,A,A,A,A]
+# CHECK-NEXT:                                   # fixup A - offset: 2, value: target@GOT+2, kind: FK_390_PC32DBL
+# CHECK-REL:                                    0x{{[0-9A-F]*2}} R_390_GOTENT target 0x2
+	.align 16
+	larl %r14, target@got
+
+# CHECK: larl %r14, target@INDNTPOFF            # encoding: [0xc0,0xe0,A,A,A,A]
+# CHECK-NEXT:                                   # fixup A - offset: 2, value: target@INDNTPOFF+2, kind: FK_390_PC32DBL
+# CHECK-REL:                                    0x{{[0-9A-F]*2}} R_390_TLS_IEENT target 0x2
+	.align 16
+	larl %r14, target@indntpoff
+
+# CHECK: brasl %r14, target                     # encoding: [0xc0,0xe5,A,A,A,A]
+# CHECK-NEXT:                                   # fixup A - offset: 2, value: target+2, kind: FK_390_PC32DBL
+# CHECK-REL:                                    0x{{[0-9A-F]*2}} R_390_PC32DBL target 0x2
+	.align 16
+	brasl %r14, target
+
+# CHECK: brasl %r14, target@PLT                 # encoding: [0xc0,0xe5,A,A,A,A]
+# CHECK-NEXT:                                   # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC32DBL
+# CHECK-REL:                                    0x{{[0-9A-F]*2}} R_390_PLT32DBL target 0x2
+	.align 16
+	brasl %r14, target@plt
+
+# CHECK: brasl %r14, target@PLT:tls_gdcall:sym  # encoding: [0xc0,0xe5,A,A,A,A]
+# CHECK-NEXT:                                   # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC32DBL
+# CHECK-NEXT:                                   # fixup B - offset: 0, value: sym@TLSGD, kind: FK_390_TLS_CALL
+# CHECK-REL:                                    0x{{[0-9A-F]*0}} R_390_TLS_GDCALL sym 0x0
+# CHECK-REL:                                    0x{{[0-9A-F]*2}} R_390_PLT32DBL target 0x2
+	.align 16
+	brasl %r14, target@plt:tls_gdcall:sym
+
+# CHECK: brasl %r14, target@PLT:tls_ldcall:sym  # encoding: [0xc0,0xe5,A,A,A,A]
+# CHECK-NEXT:                                   # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC32DBL
+# CHECK-NEXT:                                   # fixup B - offset: 0, value: sym@TLSLDM, kind: FK_390_TLS_CALL
+# CHECK-REL:                                    0x{{[0-9A-F]*0}} R_390_TLS_LDCALL sym 0x0
+# CHECK-REL:                                    0x{{[0-9A-F]*2}} R_390_PLT32DBL target 0x2
+	.align 16
+	brasl %r14, target@plt:tls_ldcall:sym
+
+# CHECK: bras %r14, target                      # encoding: [0xa7,0xe5,A,A]
+# CHECK-NEXT:                                   # fixup A - offset: 2, value: target+2, kind: FK_390_PC16DBL
+# CHECK-REL:                                    0x{{[0-9A-F]*2}} R_390_PC16DBL target 0x2
+	.align 16
+	bras %r14, target
+
+# CHECK: bras %r14, target@PLT                  # encoding: [0xa7,0xe5,A,A]
+# CHECK-NEXT:                                   # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC16DBL
+# CHECK-REL:                                    0x{{[0-9A-F]*2}} R_390_PLT16DBL target 0x2
+	.align 16
+	bras %r14, target@plt
+
+# CHECK: bras %r14, target@PLT:tls_gdcall:sym   # encoding: [0xa7,0xe5,A,A]
+# CHECK-NEXT:                                   # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC16DBL
+# CHECK-NEXT:                                   # fixup B - offset: 0, value: sym@TLSGD, kind: FK_390_TLS_CALL
+# CHECK-REL:                                    0x{{[0-9A-F]*0}} R_390_TLS_GDCALL sym 0x0
+# CHECK-REL:                                    0x{{[0-9A-F]*2}} R_390_PLT16DBL target 0x2
+	.align 16
+	bras %r14, target@plt:tls_gdcall:sym
+
+# CHECK: bras %r14, target@PLT:tls_ldcall:sym   # encoding: [0xa7,0xe5,A,A]
+# CHECK-NEXT:                                   # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC16DBL
+# CHECK-NEXT:                                   # fixup B - offset: 0, value: sym@TLSLDM, kind: FK_390_TLS_CALL
+# CHECK-REL:                                    0x{{[0-9A-F]*0}} R_390_TLS_LDCALL sym 0x0
+# CHECK-REL:                                    0x{{[0-9A-F]*2}} R_390_PLT16DBL target 0x2
+	.align 16
+	bras %r14, target@plt:tls_ldcall:sym
+
+
+# Data relocs
+# llvm-mc does not show any "encoding" string for data, so we just check the relocs
+
+# CHECK-REL: .rela.data
+	.data
+
+# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LE64 target 0x0
+	.align 16
+	.quad target@ntpoff
+
+# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LDO64 target 0x0
+	.align 16
+	.quad target@dtpoff
+
+# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LDM64 target 0x0
+	.align 16
+	.quad target@tlsldm
+
+# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_GD64 target 0x0
+	.align 16
+	.quad target@tlsgd
+
+# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LE32 target 0x0
+	.align 16
+	.long target@ntpoff
+
+# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LDO32 target 0x0
+	.align 16
+	.long target@dtpoff
+
+# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LDM32 target 0x0
+	.align 16
+	.long target@tlsldm
+
+# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_GD32 target 0x0
+	.align 16
+	.long target@tlsgd
+
diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index c734da8..9c2b175 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple x86_64-unknown-unknown -mcpu=knl --show-encoding %s 2> %t.err | FileCheck %s
+// RUN: not llvm-mc -triple x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq --show-encoding %s 2> %t.err | FileCheck %s
 // RUN: FileCheck --check-prefix=ERR < %t.err %s
 
 // CHECK: vaddpd %zmm6, %zmm27, %zmm8
@@ -1513,6 +1513,42 @@
 // CHECK:  encoding: [0x62,0xe1,0xdd,0x58,0xdb,0x8a,0xf8,0xfb,0xff,0xff]
           vpandq -1032(%rdx){1to8}, %zmm4, %zmm17
 
+// CHECK: vpbroadcastd %eax, %zmm11
+// CHECK:  encoding: [0x62,0x72,0x7d,0x48,0x7c,0xd8]
+          vpbroadcastd %eax, %zmm11
+
+// CHECK: vpbroadcastd %eax, %zmm11 {%k6}
+// CHECK:  encoding: [0x62,0x72,0x7d,0x4e,0x7c,0xd8]
+          vpbroadcastd %eax, %zmm11 {%k6}
+
+// CHECK: vpbroadcastd %eax, %zmm11 {%k6} {z}
+// CHECK:  encoding: [0x62,0x72,0x7d,0xce,0x7c,0xd8]
+          vpbroadcastd %eax, %zmm11 {%k6} {z}
+
+// CHECK: vpbroadcastd %ebp, %zmm11
+// CHECK:  encoding: [0x62,0x72,0x7d,0x48,0x7c,0xdd]
+          vpbroadcastd %ebp, %zmm11
+
+// CHECK: vpbroadcastd %r13d, %zmm11
+// CHECK:  encoding: [0x62,0x52,0x7d,0x48,0x7c,0xdd]
+          vpbroadcastd %r13d, %zmm11
+
+// CHECK: vpbroadcastq %rax, %zmm1
+// CHECK:  encoding: [0x62,0xf2,0xfd,0x48,0x7c,0xc8]
+          vpbroadcastq %rax, %zmm1
+
+// CHECK: vpbroadcastq %rax, %zmm1 {%k6}
+// CHECK:  encoding: [0x62,0xf2,0xfd,0x4e,0x7c,0xc8]
+          vpbroadcastq %rax, %zmm1 {%k6}
+
+// CHECK: vpbroadcastq %rax, %zmm1 {%k6} {z}
+// CHECK:  encoding: [0x62,0xf2,0xfd,0xce,0x7c,0xc8]
+          vpbroadcastq %rax, %zmm1 {%k6} {z}
+
+// CHECK: vpbroadcastq %r8, %zmm1
+// CHECK:  encoding: [0x62,0xd2,0xfd,0x48,0x7c,0xc8]
+          vpbroadcastq %r8, %zmm1
+
 // CHECK: vpcmpd $171, %zmm10, %zmm25, %k5
 // CHECK:  encoding: [0x62,0xd3,0x35,0x40,0x1f,0xea,0xab]
           vpcmpd $171, %zmm10, %zmm25, %k5
@@ -1569,6 +1605,266 @@
 // CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0xaa,0xfc,0xfd,0xff,0xff,0x7b]
           vpcmpd $123, -516(%rdx){1to16}, %zmm25, %k5
 
+// CHECK: vpcmpltd %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0xd7,0x01]
+          vpcmpltd %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpltd %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1f,0xd7,0x01]
+          vpcmpltd %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpltd (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x11,0x01]
+          vpcmpltd (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpltd 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1f,0x94,0xf0,0x23,0x01,0x00,0x00,0x01]
+          vpcmpltd 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpltd (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x11,0x01]
+          vpcmpltd (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpltd 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x52,0x7f,0x01]
+          vpcmpltd 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpltd 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x92,0x00,0x20,0x00,0x00,0x01]
+          vpcmpltd 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpltd -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x52,0x80,0x01]
+          vpcmpltd -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpltd -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x92,0xc0,0xdf,0xff,0xff,0x01]
+          vpcmpltd -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpltd 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x52,0x7f,0x01]
+          vpcmpltd 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpltd 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x92,0x00,0x02,0x00,0x00,0x01]
+          vpcmpltd 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpltd -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x52,0x80,0x01]
+          vpcmpltd -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpltd -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x92,0xfc,0xfd,0xff,0xff,0x01]
+          vpcmpltd -516(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpled %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0xd7,0x02]
+          vpcmpled %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpled %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1f,0xd7,0x02]
+          vpcmpled %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpled (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x11,0x02]
+          vpcmpled (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpled 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1f,0x94,0xf0,0x23,0x01,0x00,0x00,0x02]
+          vpcmpled 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpled (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x11,0x02]
+          vpcmpled (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpled 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x52,0x7f,0x02]
+          vpcmpled 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpled 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x92,0x00,0x20,0x00,0x00,0x02]
+          vpcmpled 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpled -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x52,0x80,0x02]
+          vpcmpled -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpled -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x92,0xc0,0xdf,0xff,0xff,0x02]
+          vpcmpled -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpled 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x52,0x7f,0x02]
+          vpcmpled 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpled 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x92,0x00,0x02,0x00,0x00,0x02]
+          vpcmpled 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpled -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x52,0x80,0x02]
+          vpcmpled -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpled -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x92,0xfc,0xfd,0xff,0xff,0x02]
+          vpcmpled -516(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpneqd %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0xd7,0x04]
+          vpcmpneqd %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpneqd %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1f,0xd7,0x04]
+          vpcmpneqd %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpneqd (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x11,0x04]
+          vpcmpneqd (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpneqd 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1f,0x94,0xf0,0x23,0x01,0x00,0x00,0x04]
+          vpcmpneqd 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpneqd (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x11,0x04]
+          vpcmpneqd (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpneqd 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x52,0x7f,0x04]
+          vpcmpneqd 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpneqd 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x92,0x00,0x20,0x00,0x00,0x04]
+          vpcmpneqd 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpneqd -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x52,0x80,0x04]
+          vpcmpneqd -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpneqd -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x92,0xc0,0xdf,0xff,0xff,0x04]
+          vpcmpneqd -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpneqd 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x52,0x7f,0x04]
+          vpcmpneqd 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpneqd 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x92,0x00,0x02,0x00,0x00,0x04]
+          vpcmpneqd 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpneqd -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x52,0x80,0x04]
+          vpcmpneqd -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpneqd -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x92,0xfc,0xfd,0xff,0xff,0x04]
+          vpcmpneqd -516(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnltd %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0xd7,0x05]
+          vpcmpnltd %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpnltd %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1f,0xd7,0x05]
+          vpcmpnltd %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpnltd (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x11,0x05]
+          vpcmpnltd (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpnltd 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1f,0x94,0xf0,0x23,0x01,0x00,0x00,0x05]
+          vpcmpnltd 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpnltd (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x11,0x05]
+          vpcmpnltd (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnltd 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x52,0x7f,0x05]
+          vpcmpnltd 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnltd 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x92,0x00,0x20,0x00,0x00,0x05]
+          vpcmpnltd 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnltd -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x52,0x80,0x05]
+          vpcmpnltd -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnltd -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x92,0xc0,0xdf,0xff,0xff,0x05]
+          vpcmpnltd -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnltd 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x52,0x7f,0x05]
+          vpcmpnltd 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnltd 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x92,0x00,0x02,0x00,0x00,0x05]
+          vpcmpnltd 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnltd -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x52,0x80,0x05]
+          vpcmpnltd -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnltd -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x92,0xfc,0xfd,0xff,0xff,0x05]
+          vpcmpnltd -516(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnled %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0xd7,0x06]
+          vpcmpnled %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpnled %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1f,0xd7,0x06]
+          vpcmpnled %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpnled (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x11,0x06]
+          vpcmpnled (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpnled 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1f,0x94,0xf0,0x23,0x01,0x00,0x00,0x06]
+          vpcmpnled 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpnled (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x11,0x06]
+          vpcmpnled (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnled 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x52,0x7f,0x06]
+          vpcmpnled 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnled 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x92,0x00,0x20,0x00,0x00,0x06]
+          vpcmpnled 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnled -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x52,0x80,0x06]
+          vpcmpnled -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnled -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x92,0xc0,0xdf,0xff,0xff,0x06]
+          vpcmpnled -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnled 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x52,0x7f,0x06]
+          vpcmpnled 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnled 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x92,0x00,0x02,0x00,0x00,0x06]
+          vpcmpnled 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnled -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x52,0x80,0x06]
+          vpcmpnled -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnled -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x92,0xfc,0xfd,0xff,0xff,0x06]
+          vpcmpnled -516(%rdx){1to16}, %zmm25, %k2
+
 // CHECK: vpcmpeqd %zmm10, %zmm2, %k5
 // CHECK:  encoding: [0x62,0xd1,0x6d,0x48,0x76,0xea]
           vpcmpeqd %zmm10, %zmm2, %k5
@@ -1833,6 +2129,266 @@
 // CHECK:  encoding: [0x62,0xf3,0x9d,0x50,0x1f,0xaa,0xf8,0xfb,0xff,0xff,0x7b]
           vpcmpq $123, -1032(%rdx){1to8}, %zmm28, %k5
 
+// CHECK: vpcmpltq %zmm7, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0xd7,0x01]
+          vpcmpltq %zmm7, %zmm14, %k2
+
+// CHECK: vpcmpltq %zmm7, %zmm14, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x49,0x1f,0xd7,0x01]
+          vpcmpltq %zmm7, %zmm14, %k2 {%k1}
+
+// CHECK: vpcmpltq (%rcx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x11,0x01]
+          vpcmpltq (%rcx), %zmm14, %k2
+
+// CHECK: vpcmpltq 291(%rax,%r14,8), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1f,0x94,0xf0,0x23,0x01,0x00,0x00,0x01]
+          vpcmpltq 291(%rax,%r14,8), %zmm14, %k2
+
+// CHECK: vpcmpltq (%rcx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x11,0x01]
+          vpcmpltq (%rcx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpltq 8128(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x52,0x7f,0x01]
+          vpcmpltq 8128(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpltq 8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x92,0x00,0x20,0x00,0x00,0x01]
+          vpcmpltq 8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpltq -8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x52,0x80,0x01]
+          vpcmpltq -8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpltq -8256(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x92,0xc0,0xdf,0xff,0xff,0x01]
+          vpcmpltq -8256(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpltq 1016(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x52,0x7f,0x01]
+          vpcmpltq 1016(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpltq 1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x92,0x00,0x04,0x00,0x00,0x01]
+          vpcmpltq 1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpltq -1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x52,0x80,0x01]
+          vpcmpltq -1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpltq -1032(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x92,0xf8,0xfb,0xff,0xff,0x01]
+          vpcmpltq -1032(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpleq %zmm7, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0xd7,0x02]
+          vpcmpleq %zmm7, %zmm14, %k2
+
+// CHECK: vpcmpleq %zmm7, %zmm14, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x49,0x1f,0xd7,0x02]
+          vpcmpleq %zmm7, %zmm14, %k2 {%k1}
+
+// CHECK: vpcmpleq (%rcx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x11,0x02]
+          vpcmpleq (%rcx), %zmm14, %k2
+
+// CHECK: vpcmpleq 291(%rax,%r14,8), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1f,0x94,0xf0,0x23,0x01,0x00,0x00,0x02]
+          vpcmpleq 291(%rax,%r14,8), %zmm14, %k2
+
+// CHECK: vpcmpleq (%rcx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x11,0x02]
+          vpcmpleq (%rcx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpleq 8128(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x52,0x7f,0x02]
+          vpcmpleq 8128(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpleq 8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x92,0x00,0x20,0x00,0x00,0x02]
+          vpcmpleq 8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpleq -8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x52,0x80,0x02]
+          vpcmpleq -8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpleq -8256(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x92,0xc0,0xdf,0xff,0xff,0x02]
+          vpcmpleq -8256(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpleq 1016(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x52,0x7f,0x02]
+          vpcmpleq 1016(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpleq 1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x92,0x00,0x04,0x00,0x00,0x02]
+          vpcmpleq 1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpleq -1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x52,0x80,0x02]
+          vpcmpleq -1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpleq -1032(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x92,0xf8,0xfb,0xff,0xff,0x02]
+          vpcmpleq -1032(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpneqq %zmm7, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0xd7,0x04]
+          vpcmpneqq %zmm7, %zmm14, %k2
+
+// CHECK: vpcmpneqq %zmm7, %zmm14, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x49,0x1f,0xd7,0x04]
+          vpcmpneqq %zmm7, %zmm14, %k2 {%k1}
+
+// CHECK: vpcmpneqq (%rcx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x11,0x04]
+          vpcmpneqq (%rcx), %zmm14, %k2
+
+// CHECK: vpcmpneqq 291(%rax,%r14,8), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1f,0x94,0xf0,0x23,0x01,0x00,0x00,0x04]
+          vpcmpneqq 291(%rax,%r14,8), %zmm14, %k2
+
+// CHECK: vpcmpneqq (%rcx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x11,0x04]
+          vpcmpneqq (%rcx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpneqq 8128(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x52,0x7f,0x04]
+          vpcmpneqq 8128(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpneqq 8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x92,0x00,0x20,0x00,0x00,0x04]
+          vpcmpneqq 8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpneqq -8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x52,0x80,0x04]
+          vpcmpneqq -8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpneqq -8256(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x92,0xc0,0xdf,0xff,0xff,0x04]
+          vpcmpneqq -8256(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpneqq 1016(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x52,0x7f,0x04]
+          vpcmpneqq 1016(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpneqq 1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x92,0x00,0x04,0x00,0x00,0x04]
+          vpcmpneqq 1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpneqq -1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x52,0x80,0x04]
+          vpcmpneqq -1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpneqq -1032(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x92,0xf8,0xfb,0xff,0xff,0x04]
+          vpcmpneqq -1032(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnltq %zmm7, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0xd7,0x05]
+          vpcmpnltq %zmm7, %zmm14, %k2
+
+// CHECK: vpcmpnltq %zmm7, %zmm14, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x49,0x1f,0xd7,0x05]
+          vpcmpnltq %zmm7, %zmm14, %k2 {%k1}
+
+// CHECK: vpcmpnltq (%rcx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x11,0x05]
+          vpcmpnltq (%rcx), %zmm14, %k2
+
+// CHECK: vpcmpnltq 291(%rax,%r14,8), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1f,0x94,0xf0,0x23,0x01,0x00,0x00,0x05]
+          vpcmpnltq 291(%rax,%r14,8), %zmm14, %k2
+
+// CHECK: vpcmpnltq (%rcx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x11,0x05]
+          vpcmpnltq (%rcx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnltq 8128(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x52,0x7f,0x05]
+          vpcmpnltq 8128(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnltq 8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x92,0x00,0x20,0x00,0x00,0x05]
+          vpcmpnltq 8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnltq -8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x52,0x80,0x05]
+          vpcmpnltq -8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnltq -8256(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x92,0xc0,0xdf,0xff,0xff,0x05]
+          vpcmpnltq -8256(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnltq 1016(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x52,0x7f,0x05]
+          vpcmpnltq 1016(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnltq 1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x92,0x00,0x04,0x00,0x00,0x05]
+          vpcmpnltq 1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnltq -1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x52,0x80,0x05]
+          vpcmpnltq -1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnltq -1032(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x92,0xf8,0xfb,0xff,0xff,0x05]
+          vpcmpnltq -1032(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnleq %zmm7, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0xd7,0x06]
+          vpcmpnleq %zmm7, %zmm14, %k2
+
+// CHECK: vpcmpnleq %zmm7, %zmm14, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x49,0x1f,0xd7,0x06]
+          vpcmpnleq %zmm7, %zmm14, %k2 {%k1}
+
+// CHECK: vpcmpnleq (%rcx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x11,0x06]
+          vpcmpnleq (%rcx), %zmm14, %k2
+
+// CHECK: vpcmpnleq 291(%rax,%r14,8), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1f,0x94,0xf0,0x23,0x01,0x00,0x00,0x06]
+          vpcmpnleq 291(%rax,%r14,8), %zmm14, %k2
+
+// CHECK: vpcmpnleq (%rcx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x11,0x06]
+          vpcmpnleq (%rcx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnleq 8128(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x52,0x7f,0x06]
+          vpcmpnleq 8128(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnleq 8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x92,0x00,0x20,0x00,0x00,0x06]
+          vpcmpnleq 8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnleq -8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x52,0x80,0x06]
+          vpcmpnleq -8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnleq -8256(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1f,0x92,0xc0,0xdf,0xff,0xff,0x06]
+          vpcmpnleq -8256(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnleq 1016(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x52,0x7f,0x06]
+          vpcmpnleq 1016(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnleq 1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x92,0x00,0x04,0x00,0x00,0x06]
+          vpcmpnleq 1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnleq -1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x52,0x80,0x06]
+          vpcmpnleq -1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnleq -1032(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1f,0x92,0xf8,0xfb,0xff,0xff,0x06]
+          vpcmpnleq -1032(%rdx){1to8}, %zmm14, %k2
+
 // CHECK: vpcmpud $171, %zmm7, %zmm25, %k2
 // CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0xd7,0xab]
           vpcmpud $171, %zmm7, %zmm25, %k2
@@ -1889,6 +2445,318 @@
 // CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0xfc,0xfd,0xff,0xff,0x7b]
           vpcmpud $123, -516(%rdx){1to16}, %zmm25, %k2
 
+// CHECK: vpcmpequd %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0xd7,0x00]
+          vpcmpequd %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpequd %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1e,0xd7,0x00]
+          vpcmpequd %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpequd (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x11,0x00]
+          vpcmpequd (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpequd 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x00]
+          vpcmpequd 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpequd (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x11,0x00]
+          vpcmpequd (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpequd 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x7f,0x00]
+          vpcmpequd 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpequd 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0x00,0x20,0x00,0x00,0x00]
+          vpcmpequd 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpequd -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x80,0x00]
+          vpcmpequd -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpequd -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x00]
+          vpcmpequd -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpequd 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x7f,0x00]
+          vpcmpequd 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpequd 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0x00,0x02,0x00,0x00,0x00]
+          vpcmpequd 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpequd -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x80,0x00]
+          vpcmpequd -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpequd -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0xfc,0xfd,0xff,0xff,0x00]
+          vpcmpequd -516(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpltud %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0xd7,0x01]
+          vpcmpltud %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpltud %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1e,0xd7,0x01]
+          vpcmpltud %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpltud (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x11,0x01]
+          vpcmpltud (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpltud 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x01]
+          vpcmpltud 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpltud (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x11,0x01]
+          vpcmpltud (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpltud 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x7f,0x01]
+          vpcmpltud 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpltud 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0x00,0x20,0x00,0x00,0x01]
+          vpcmpltud 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpltud -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x80,0x01]
+          vpcmpltud -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpltud -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x01]
+          vpcmpltud -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpltud 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x7f,0x01]
+          vpcmpltud 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpltud 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0x00,0x02,0x00,0x00,0x01]
+          vpcmpltud 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpltud -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x80,0x01]
+          vpcmpltud -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpltud -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0xfc,0xfd,0xff,0xff,0x01]
+          vpcmpltud -516(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpleud %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0xd7,0x02]
+          vpcmpleud %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpleud %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1e,0xd7,0x02]
+          vpcmpleud %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpleud (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x11,0x02]
+          vpcmpleud (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpleud 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x02]
+          vpcmpleud 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpleud (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x11,0x02]
+          vpcmpleud (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpleud 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x7f,0x02]
+          vpcmpleud 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpleud 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0x00,0x20,0x00,0x00,0x02]
+          vpcmpleud 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpleud -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x80,0x02]
+          vpcmpleud -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpleud -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x02]
+          vpcmpleud -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpleud 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x7f,0x02]
+          vpcmpleud 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpleud 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0x00,0x02,0x00,0x00,0x02]
+          vpcmpleud 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpleud -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x80,0x02]
+          vpcmpleud -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpleud -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0xfc,0xfd,0xff,0xff,0x02]
+          vpcmpleud -516(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnequd %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0xd7,0x04]
+          vpcmpnequd %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpnequd %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1e,0xd7,0x04]
+          vpcmpnequd %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpnequd (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x11,0x04]
+          vpcmpnequd (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpnequd 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x04]
+          vpcmpnequd 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpnequd (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x11,0x04]
+          vpcmpnequd (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnequd 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x7f,0x04]
+          vpcmpnequd 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnequd 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0x00,0x20,0x00,0x00,0x04]
+          vpcmpnequd 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnequd -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x80,0x04]
+          vpcmpnequd -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnequd -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x04]
+          vpcmpnequd -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnequd 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x7f,0x04]
+          vpcmpnequd 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnequd 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0x00,0x02,0x00,0x00,0x04]
+          vpcmpnequd 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnequd -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x80,0x04]
+          vpcmpnequd -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnequd -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0xfc,0xfd,0xff,0xff,0x04]
+          vpcmpnequd -516(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnltud %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0xd7,0x05]
+          vpcmpnltud %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpnltud %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1e,0xd7,0x05]
+          vpcmpnltud %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpnltud (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x11,0x05]
+          vpcmpnltud (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpnltud 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x05]
+          vpcmpnltud 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpnltud (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x11,0x05]
+          vpcmpnltud (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnltud 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x7f,0x05]
+          vpcmpnltud 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnltud 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0x00,0x20,0x00,0x00,0x05]
+          vpcmpnltud 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnltud -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x80,0x05]
+          vpcmpnltud -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnltud -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x05]
+          vpcmpnltud -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnltud 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x7f,0x05]
+          vpcmpnltud 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnltud 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0x00,0x02,0x00,0x00,0x05]
+          vpcmpnltud 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnltud -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x80,0x05]
+          vpcmpnltud -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnltud -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0xfc,0xfd,0xff,0xff,0x05]
+          vpcmpnltud -516(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnleud %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0xd7,0x06]
+          vpcmpnleud %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpnleud %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1e,0xd7,0x06]
+          vpcmpnleud %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpnleud (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x11,0x06]
+          vpcmpnleud (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpnleud 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x06]
+          vpcmpnleud 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpnleud (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x11,0x06]
+          vpcmpnleud (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnleud 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x7f,0x06]
+          vpcmpnleud 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnleud 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0x00,0x20,0x00,0x00,0x06]
+          vpcmpnleud 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnleud -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x80,0x06]
+          vpcmpnleud -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnleud -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x06]
+          vpcmpnleud -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpnleud 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x7f,0x06]
+          vpcmpnleud 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnleud 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0x00,0x02,0x00,0x00,0x06]
+          vpcmpnleud 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnleud -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x80,0x06]
+          vpcmpnleud -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpnleud -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0xfc,0xfd,0xff,0xff,0x06]
+          vpcmpnleud -516(%rdx){1to16}, %zmm25, %k2
+
 // CHECK: vpcmpuq $171, %zmm8, %zmm14, %k3
 // CHECK:  encoding: [0x62,0xd3,0x8d,0x48,0x1e,0xd8,0xab]
           vpcmpuq $171, %zmm8, %zmm14, %k3
@@ -1945,6 +2813,318 @@
 // CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x9a,0xf8,0xfb,0xff,0xff,0x7b]
           vpcmpuq $123, -1032(%rdx){1to8}, %zmm14, %k3
 
+// CHECK: vpcmpequq %zmm7, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0xd7,0x00]
+          vpcmpequq %zmm7, %zmm14, %k2
+
+// CHECK: vpcmpequq %zmm7, %zmm14, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x49,0x1e,0xd7,0x00]
+          vpcmpequq %zmm7, %zmm14, %k2 {%k1}
+
+// CHECK: vpcmpequq (%rcx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x11,0x00]
+          vpcmpequq (%rcx), %zmm14, %k2
+
+// CHECK: vpcmpequq 291(%rax,%r14,8), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x00]
+          vpcmpequq 291(%rax,%r14,8), %zmm14, %k2
+
+// CHECK: vpcmpequq (%rcx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x11,0x00]
+          vpcmpequq (%rcx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpequq 8128(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x7f,0x00]
+          vpcmpequq 8128(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpequq 8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0x00,0x20,0x00,0x00,0x00]
+          vpcmpequq 8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpequq -8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x80,0x00]
+          vpcmpequq -8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpequq -8256(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x00]
+          vpcmpequq -8256(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpequq 1016(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x7f,0x00]
+          vpcmpequq 1016(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpequq 1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0x00,0x04,0x00,0x00,0x00]
+          vpcmpequq 1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpequq -1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x80,0x00]
+          vpcmpequq -1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpequq -1032(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0xf8,0xfb,0xff,0xff,0x00]
+          vpcmpequq -1032(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpltuq %zmm7, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0xd7,0x01]
+          vpcmpltuq %zmm7, %zmm14, %k2
+
+// CHECK: vpcmpltuq %zmm7, %zmm14, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x49,0x1e,0xd7,0x01]
+          vpcmpltuq %zmm7, %zmm14, %k2 {%k1}
+
+// CHECK: vpcmpltuq (%rcx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x11,0x01]
+          vpcmpltuq (%rcx), %zmm14, %k2
+
+// CHECK: vpcmpltuq 291(%rax,%r14,8), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x01]
+          vpcmpltuq 291(%rax,%r14,8), %zmm14, %k2
+
+// CHECK: vpcmpltuq (%rcx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x11,0x01]
+          vpcmpltuq (%rcx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpltuq 8128(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x7f,0x01]
+          vpcmpltuq 8128(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpltuq 8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0x00,0x20,0x00,0x00,0x01]
+          vpcmpltuq 8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpltuq -8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x80,0x01]
+          vpcmpltuq -8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpltuq -8256(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x01]
+          vpcmpltuq -8256(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpltuq 1016(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x7f,0x01]
+          vpcmpltuq 1016(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpltuq 1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0x00,0x04,0x00,0x00,0x01]
+          vpcmpltuq 1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpltuq -1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x80,0x01]
+          vpcmpltuq -1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpltuq -1032(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0xf8,0xfb,0xff,0xff,0x01]
+          vpcmpltuq -1032(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpleuq %zmm7, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0xd7,0x02]
+          vpcmpleuq %zmm7, %zmm14, %k2
+
+// CHECK: vpcmpleuq %zmm7, %zmm14, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x49,0x1e,0xd7,0x02]
+          vpcmpleuq %zmm7, %zmm14, %k2 {%k1}
+
+// CHECK: vpcmpleuq (%rcx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x11,0x02]
+          vpcmpleuq (%rcx), %zmm14, %k2
+
+// CHECK: vpcmpleuq 291(%rax,%r14,8), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x02]
+          vpcmpleuq 291(%rax,%r14,8), %zmm14, %k2
+
+// CHECK: vpcmpleuq (%rcx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x11,0x02]
+          vpcmpleuq (%rcx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpleuq 8128(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x7f,0x02]
+          vpcmpleuq 8128(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpleuq 8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0x00,0x20,0x00,0x00,0x02]
+          vpcmpleuq 8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpleuq -8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x80,0x02]
+          vpcmpleuq -8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpleuq -8256(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x02]
+          vpcmpleuq -8256(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpleuq 1016(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x7f,0x02]
+          vpcmpleuq 1016(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpleuq 1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0x00,0x04,0x00,0x00,0x02]
+          vpcmpleuq 1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpleuq -1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x80,0x02]
+          vpcmpleuq -1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpleuq -1032(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0xf8,0xfb,0xff,0xff,0x02]
+          vpcmpleuq -1032(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnequq %zmm7, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0xd7,0x04]
+          vpcmpnequq %zmm7, %zmm14, %k2
+
+// CHECK: vpcmpnequq %zmm7, %zmm14, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x49,0x1e,0xd7,0x04]
+          vpcmpnequq %zmm7, %zmm14, %k2 {%k1}
+
+// CHECK: vpcmpnequq (%rcx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x11,0x04]
+          vpcmpnequq (%rcx), %zmm14, %k2
+
+// CHECK: vpcmpnequq 291(%rax,%r14,8), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x04]
+          vpcmpnequq 291(%rax,%r14,8), %zmm14, %k2
+
+// CHECK: vpcmpnequq (%rcx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x11,0x04]
+          vpcmpnequq (%rcx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnequq 8128(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x7f,0x04]
+          vpcmpnequq 8128(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnequq 8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0x00,0x20,0x00,0x00,0x04]
+          vpcmpnequq 8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnequq -8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x80,0x04]
+          vpcmpnequq -8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnequq -8256(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x04]
+          vpcmpnequq -8256(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnequq 1016(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x7f,0x04]
+          vpcmpnequq 1016(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnequq 1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0x00,0x04,0x00,0x00,0x04]
+          vpcmpnequq 1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnequq -1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x80,0x04]
+          vpcmpnequq -1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnequq -1032(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0xf8,0xfb,0xff,0xff,0x04]
+          vpcmpnequq -1032(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnltuq %zmm7, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0xd7,0x05]
+          vpcmpnltuq %zmm7, %zmm14, %k2
+
+// CHECK: vpcmpnltuq %zmm7, %zmm14, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x49,0x1e,0xd7,0x05]
+          vpcmpnltuq %zmm7, %zmm14, %k2 {%k1}
+
+// CHECK: vpcmpnltuq (%rcx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x11,0x05]
+          vpcmpnltuq (%rcx), %zmm14, %k2
+
+// CHECK: vpcmpnltuq 291(%rax,%r14,8), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x05]
+          vpcmpnltuq 291(%rax,%r14,8), %zmm14, %k2
+
+// CHECK: vpcmpnltuq (%rcx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x11,0x05]
+          vpcmpnltuq (%rcx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnltuq 8128(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x7f,0x05]
+          vpcmpnltuq 8128(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnltuq 8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0x00,0x20,0x00,0x00,0x05]
+          vpcmpnltuq 8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnltuq -8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x80,0x05]
+          vpcmpnltuq -8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnltuq -8256(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x05]
+          vpcmpnltuq -8256(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnltuq 1016(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x7f,0x05]
+          vpcmpnltuq 1016(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnltuq 1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0x00,0x04,0x00,0x00,0x05]
+          vpcmpnltuq 1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnltuq -1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x80,0x05]
+          vpcmpnltuq -1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnltuq -1032(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0xf8,0xfb,0xff,0xff,0x05]
+          vpcmpnltuq -1032(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnleuq %zmm7, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0xd7,0x06]
+          vpcmpnleuq %zmm7, %zmm14, %k2
+
+// CHECK: vpcmpnleuq %zmm7, %zmm14, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x49,0x1e,0xd7,0x06]
+          vpcmpnleuq %zmm7, %zmm14, %k2 {%k1}
+
+// CHECK: vpcmpnleuq (%rcx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x11,0x06]
+          vpcmpnleuq (%rcx), %zmm14, %k2
+
+// CHECK: vpcmpnleuq 291(%rax,%r14,8), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x06]
+          vpcmpnleuq 291(%rax,%r14,8), %zmm14, %k2
+
+// CHECK: vpcmpnleuq (%rcx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x11,0x06]
+          vpcmpnleuq (%rcx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnleuq 8128(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x7f,0x06]
+          vpcmpnleuq 8128(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnleuq 8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0x00,0x20,0x00,0x00,0x06]
+          vpcmpnleuq 8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnleuq -8192(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x52,0x80,0x06]
+          vpcmpnleuq -8192(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnleuq -8256(%rdx), %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x06]
+          vpcmpnleuq -8256(%rdx), %zmm14, %k2
+
+// CHECK: vpcmpnleuq 1016(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x7f,0x06]
+          vpcmpnleuq 1016(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnleuq 1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0x00,0x04,0x00,0x00,0x06]
+          vpcmpnleuq 1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnleuq -1024(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x52,0x80,0x06]
+          vpcmpnleuq -1024(%rdx){1to8}, %zmm14, %k2
+
+// CHECK: vpcmpnleuq -1032(%rdx){1to8}, %zmm14, %k2
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x92,0xf8,0xfb,0xff,0xff,0x06]
+          vpcmpnleuq -1032(%rdx){1to8}, %zmm14, %k2
+
 // CHECK: vpmaxsd %zmm16, %zmm8, %zmm6
 // CHECK:  encoding: [0x62,0xb2,0x3d,0x48,0x3d,0xf0]
           vpmaxsd %zmm16, %zmm8, %zmm6
@@ -4725,11 +5905,11 @@ vmovntpd %zmm6, 4(%rdx)
 // CHECK: encoding: [0x62,0x51,0x7c,0x48,0x2b,0x5c,0x8d,0x00]
 vmovntps %zmm11, (%r13,%rcx,4)
 
-// CHECK: vcmpps $14
+// CHECK: vcmpgtps %zmm17, %zmm5, %k2
 // CHECK: encoding: [0x62,0xb1,0x54,0x48,0xc2,0xd1,0x0e]
 vcmpgtps %zmm17, %zmm5, %k2
 
-// CHECK: vcmppd $13
+// CHECK: vcmpgepd 128(%r14), %zmm17, %k6
 // CHECK: encoding: [0x62,0xd1,0xf5,0x40,0xc2,0x76,0x02,0x0d]
 vcmpgepd 0x80(%r14), %zmm17, %k6
 
diff --git a/test/MC/X86/avx512bw-encoding.s b/test/MC/X86/avx512bw-encoding.s
new file mode 100644
index 0000000..06397fb
--- /dev/null
+++ b/test/MC/X86/avx512bw-encoding.s
@@ -0,0 +1,73 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=skx  --show-encoding %s | FileCheck %s
+
+// CHECK: vpblendmb %zmm25, %zmm18, %zmm17
+// CHECK:  encoding: [0x62,0x82,0x6d,0x40,0x66,0xc9]
+          vpblendmb %zmm25, %zmm18, %zmm17
+
+// CHECK: vpblendmb %zmm25, %zmm18, %zmm17 {%k5}
+// CHECK:  encoding: [0x62,0x82,0x6d,0x45,0x66,0xc9]
+          vpblendmb %zmm25, %zmm18, %zmm17 {%k5}
+
+// CHECK: vpblendmb %zmm25, %zmm18, %zmm17 {%k5} {z}
+// CHECK:  encoding: [0x62,0x82,0x6d,0xc5,0x66,0xc9]
+          vpblendmb %zmm25, %zmm18, %zmm17 {%k5} {z}
+
+// CHECK: vpblendmb (%rcx), %zmm18, %zmm17
+// CHECK:  encoding: [0x62,0xe2,0x6d,0x40,0x66,0x09]
+          vpblendmb (%rcx), %zmm18, %zmm17
+
+// CHECK: vpblendmb 291(%rax,%r14,8), %zmm18, %zmm17
+// CHECK:  encoding: [0x62,0xa2,0x6d,0x40,0x66,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpblendmb 291(%rax,%r14,8), %zmm18, %zmm17
+
+// CHECK: vpblendmb 8128(%rdx), %zmm18, %zmm17
+// CHECK:  encoding: [0x62,0xe2,0x6d,0x40,0x66,0x4a,0x7f]
+          vpblendmb 8128(%rdx), %zmm18, %zmm17
+
+// CHECK: vpblendmb 8192(%rdx), %zmm18, %zmm17
+// CHECK:  encoding: [0x62,0xe2,0x6d,0x40,0x66,0x8a,0x00,0x20,0x00,0x00]
+          vpblendmb 8192(%rdx), %zmm18, %zmm17
+
+// CHECK: vpblendmb -8192(%rdx), %zmm18, %zmm17
+// CHECK:  encoding: [0x62,0xe2,0x6d,0x40,0x66,0x4a,0x80]
+          vpblendmb -8192(%rdx), %zmm18, %zmm17
+
+// CHECK: vpblendmb -8256(%rdx), %zmm18, %zmm17
+// CHECK:  encoding: [0x62,0xe2,0x6d,0x40,0x66,0x8a,0xc0,0xdf,0xff,0xff]
+          vpblendmb -8256(%rdx), %zmm18, %zmm17
+
+// CHECK: vpblendmw %zmm17, %zmm20, %zmm26
+// CHECK:  encoding: [0x62,0x22,0xdd,0x40,0x66,0xd1]
+          vpblendmw %zmm17, %zmm20, %zmm26
+
+// CHECK: vpblendmw %zmm17, %zmm20, %zmm26 {%k7}
+// CHECK:  encoding: [0x62,0x22,0xdd,0x47,0x66,0xd1]
+          vpblendmw %zmm17, %zmm20, %zmm26 {%k7}
+
+// CHECK: vpblendmw %zmm17, %zmm20, %zmm26 {%k7} {z}
+// CHECK:  encoding: [0x62,0x22,0xdd,0xc7,0x66,0xd1]
+          vpblendmw %zmm17, %zmm20, %zmm26 {%k7} {z}
+
+// CHECK: vpblendmw (%rcx), %zmm20, %zmm26
+// CHECK:  encoding: [0x62,0x62,0xdd,0x40,0x66,0x11]
+          vpblendmw (%rcx), %zmm20, %zmm26
+
+// CHECK: vpblendmw 291(%rax,%r14,8), %zmm20, %zmm26
+// CHECK:  encoding: [0x62,0x22,0xdd,0x40,0x66,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpblendmw 291(%rax,%r14,8), %zmm20, %zmm26
+
+// CHECK: vpblendmw 8128(%rdx), %zmm20, %zmm26
+// CHECK:  encoding: [0x62,0x62,0xdd,0x40,0x66,0x52,0x7f]
+          vpblendmw 8128(%rdx), %zmm20, %zmm26
+
+// CHECK: vpblendmw 8192(%rdx), %zmm20, %zmm26
+// CHECK:  encoding: [0x62,0x62,0xdd,0x40,0x66,0x92,0x00,0x20,0x00,0x00]
+          vpblendmw 8192(%rdx), %zmm20, %zmm26
+
+// CHECK: vpblendmw -8192(%rdx), %zmm20, %zmm26
+// CHECK:  encoding: [0x62,0x62,0xdd,0x40,0x66,0x52,0x80]
+          vpblendmw -8192(%rdx), %zmm20, %zmm26
+
+// CHECK: vpblendmw -8256(%rdx), %zmm20, %zmm26
+// CHECK:  encoding: [0x62,0x62,0xdd,0x40,0x66,0x92,0xc0,0xdf,0xff,0xff]
+          vpblendmw -8256(%rdx), %zmm20, %zmm26
diff --git a/test/MC/X86/avx512vl-encoding.s b/test/MC/X86/avx512vl-encoding.s
new file mode 100644
index 0000000..acea70c
--- /dev/null
+++ b/test/MC/X86/avx512vl-encoding.s
@@ -0,0 +1,449 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=skx  --show-encoding %s | FileCheck %s
+
+// CHECK: vblendmpd %xmm19, %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x22,0xdd,0x00,0x65,0xdb]
+          vblendmpd %xmm19, %xmm20, %xmm27
+
+// CHECK: vblendmpd %xmm19, %xmm20, %xmm27 {%k7}
+// CHECK:  encoding: [0x62,0x22,0xdd,0x07,0x65,0xdb]
+          vblendmpd %xmm19, %xmm20, %xmm27 {%k7}
+
+// CHECK: vblendmpd %xmm19, %xmm20, %xmm27 {%k7} {z}
+// CHECK:  encoding: [0x62,0x22,0xdd,0x87,0x65,0xdb]
+          vblendmpd %xmm19, %xmm20, %xmm27 {%k7} {z}
+
+// CHECK: vblendmpd (%rcx), %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x62,0xdd,0x00,0x65,0x19]
+          vblendmpd (%rcx), %xmm20, %xmm27
+
+// CHECK: vblendmpd 291(%rax,%r14,8), %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x22,0xdd,0x00,0x65,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vblendmpd 291(%rax,%r14,8), %xmm20, %xmm27
+
+// CHECK: vblendmpd (%rcx){1to2}, %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x62,0xdd,0x10,0x65,0x19]
+          vblendmpd (%rcx){1to2}, %xmm20, %xmm27
+
+// CHECK: vblendmpd 2032(%rdx), %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x62,0xdd,0x00,0x65,0x5a,0x7f]
+          vblendmpd 2032(%rdx), %xmm20, %xmm27
+
+// CHECK: vblendmpd 2048(%rdx), %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x62,0xdd,0x00,0x65,0x9a,0x00,0x08,0x00,0x00]
+          vblendmpd 2048(%rdx), %xmm20, %xmm27
+
+// CHECK: vblendmpd -2048(%rdx), %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x62,0xdd,0x00,0x65,0x5a,0x80]
+          vblendmpd -2048(%rdx), %xmm20, %xmm27
+
+// CHECK: vblendmpd -2064(%rdx), %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x62,0xdd,0x00,0x65,0x9a,0xf0,0xf7,0xff,0xff]
+          vblendmpd -2064(%rdx), %xmm20, %xmm27
+
+// CHECK: vblendmpd 1016(%rdx){1to2}, %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x62,0xdd,0x10,0x65,0x5a,0x7f]
+          vblendmpd 1016(%rdx){1to2}, %xmm20, %xmm27
+
+// CHECK: vblendmpd 1024(%rdx){1to2}, %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x62,0xdd,0x10,0x65,0x9a,0x00,0x04,0x00,0x00]
+          vblendmpd 1024(%rdx){1to2}, %xmm20, %xmm27
+
+// CHECK: vblendmpd -1024(%rdx){1to2}, %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x62,0xdd,0x10,0x65,0x5a,0x80]
+          vblendmpd -1024(%rdx){1to2}, %xmm20, %xmm27
+
+// CHECK: vblendmpd -1032(%rdx){1to2}, %xmm20, %xmm27
+// CHECK:  encoding: [0x62,0x62,0xdd,0x10,0x65,0x9a,0xf8,0xfb,0xff,0xff]
+          vblendmpd -1032(%rdx){1to2}, %xmm20, %xmm27
+
+// CHECK: vblendmpd %ymm23, %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x22,0xd5,0x20,0x65,0xe7]
+          vblendmpd %ymm23, %ymm21, %ymm28
+
+// CHECK: vblendmpd %ymm23, %ymm21, %ymm28 {%k3}
+// CHECK:  encoding: [0x62,0x22,0xd5,0x23,0x65,0xe7]
+          vblendmpd %ymm23, %ymm21, %ymm28 {%k3}
+
+// CHECK: vblendmpd %ymm23, %ymm21, %ymm28 {%k3} {z}
+// CHECK:  encoding: [0x62,0x22,0xd5,0xa3,0x65,0xe7]
+          vblendmpd %ymm23, %ymm21, %ymm28 {%k3} {z}
+
+// CHECK: vblendmpd (%rcx), %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x62,0xd5,0x20,0x65,0x21]
+          vblendmpd (%rcx), %ymm21, %ymm28
+
+// CHECK: vblendmpd 291(%rax,%r14,8), %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x22,0xd5,0x20,0x65,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vblendmpd 291(%rax,%r14,8), %ymm21, %ymm28
+
+// CHECK: vblendmpd (%rcx){1to4}, %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x62,0xd5,0x30,0x65,0x21]
+          vblendmpd (%rcx){1to4}, %ymm21, %ymm28
+
+// CHECK: vblendmpd 4064(%rdx), %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x62,0xd5,0x20,0x65,0x62,0x7f]
+          vblendmpd 4064(%rdx), %ymm21, %ymm28
+
+// CHECK: vblendmpd 4096(%rdx), %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x62,0xd5,0x20,0x65,0xa2,0x00,0x10,0x00,0x00]
+          vblendmpd 4096(%rdx), %ymm21, %ymm28
+
+// CHECK: vblendmpd -4096(%rdx), %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x62,0xd5,0x20,0x65,0x62,0x80]
+          vblendmpd -4096(%rdx), %ymm21, %ymm28
+
+// CHECK: vblendmpd -4128(%rdx), %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x62,0xd5,0x20,0x65,0xa2,0xe0,0xef,0xff,0xff]
+          vblendmpd -4128(%rdx), %ymm21, %ymm28
+
+// CHECK: vblendmpd 1016(%rdx){1to4}, %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x62,0xd5,0x30,0x65,0x62,0x7f]
+          vblendmpd 1016(%rdx){1to4}, %ymm21, %ymm28
+
+// CHECK: vblendmpd 1024(%rdx){1to4}, %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x62,0xd5,0x30,0x65,0xa2,0x00,0x04,0x00,0x00]
+          vblendmpd 1024(%rdx){1to4}, %ymm21, %ymm28
+
+// CHECK: vblendmpd -1024(%rdx){1to4}, %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x62,0xd5,0x30,0x65,0x62,0x80]
+          vblendmpd -1024(%rdx){1to4}, %ymm21, %ymm28
+
+// CHECK: vblendmpd -1032(%rdx){1to4}, %ymm21, %ymm28
+// CHECK:  encoding: [0x62,0x62,0xd5,0x30,0x65,0xa2,0xf8,0xfb,0xff,0xff]
+          vblendmpd -1032(%rdx){1to4}, %ymm21, %ymm28
+
+// CHECK: vblendmps %xmm20, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x22,0x5d,0x00,0x65,0xc4]
+          vblendmps %xmm20, %xmm20, %xmm24
+
+// CHECK: vblendmps %xmm20, %xmm20, %xmm24 {%k1}
+// CHECK:  encoding: [0x62,0x22,0x5d,0x01,0x65,0xc4]
+          vblendmps %xmm20, %xmm20, %xmm24 {%k1}
+
+// CHECK: vblendmps %xmm20, %xmm20, %xmm24 {%k1} {z}
+// CHECK:  encoding: [0x62,0x22,0x5d,0x81,0x65,0xc4]
+          vblendmps %xmm20, %xmm20, %xmm24 {%k1} {z}
+
+// CHECK: vblendmps (%rcx), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x5d,0x00,0x65,0x01]
+          vblendmps (%rcx), %xmm20, %xmm24
+
+// CHECK: vblendmps 291(%rax,%r14,8), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x22,0x5d,0x00,0x65,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vblendmps 291(%rax,%r14,8), %xmm20, %xmm24
+
+// CHECK: vblendmps (%rcx){1to4}, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x5d,0x10,0x65,0x01]
+          vblendmps (%rcx){1to4}, %xmm20, %xmm24
+
+// CHECK: vblendmps 2032(%rdx), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x5d,0x00,0x65,0x42,0x7f]
+          vblendmps 2032(%rdx), %xmm20, %xmm24
+
+// CHECK: vblendmps 2048(%rdx), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x5d,0x00,0x65,0x82,0x00,0x08,0x00,0x00]
+          vblendmps 2048(%rdx), %xmm20, %xmm24
+
+// CHECK: vblendmps -2048(%rdx), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x5d,0x00,0x65,0x42,0x80]
+          vblendmps -2048(%rdx), %xmm20, %xmm24
+
+// CHECK: vblendmps -2064(%rdx), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x5d,0x00,0x65,0x82,0xf0,0xf7,0xff,0xff]
+          vblendmps -2064(%rdx), %xmm20, %xmm24
+
+// CHECK: vblendmps 508(%rdx){1to4}, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x5d,0x10,0x65,0x42,0x7f]
+          vblendmps 508(%rdx){1to4}, %xmm20, %xmm24
+
+// CHECK: vblendmps 512(%rdx){1to4}, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x5d,0x10,0x65,0x82,0x00,0x02,0x00,0x00]
+          vblendmps 512(%rdx){1to4}, %xmm20, %xmm24
+
+// CHECK: vblendmps -512(%rdx){1to4}, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x5d,0x10,0x65,0x42,0x80]
+          vblendmps -512(%rdx){1to4}, %xmm20, %xmm24
+
+// CHECK: vblendmps -516(%rdx){1to4}, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x5d,0x10,0x65,0x82,0xfc,0xfd,0xff,0xff]
+          vblendmps -516(%rdx){1to4}, %xmm20, %xmm24
+
+// CHECK: vblendmps %ymm24, %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0x82,0x45,0x20,0x65,0xc8]
+          vblendmps %ymm24, %ymm23, %ymm17
+
+// CHECK: vblendmps %ymm24, %ymm23, %ymm17 {%k6}
+// CHECK:  encoding: [0x62,0x82,0x45,0x26,0x65,0xc8]
+          vblendmps %ymm24, %ymm23, %ymm17 {%k6}
+
+// CHECK: vblendmps %ymm24, %ymm23, %ymm17 {%k6} {z}
+// CHECK:  encoding: [0x62,0x82,0x45,0xa6,0x65,0xc8]
+          vblendmps %ymm24, %ymm23, %ymm17 {%k6} {z}
+
+// CHECK: vblendmps (%rcx), %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0x45,0x20,0x65,0x09]
+          vblendmps (%rcx), %ymm23, %ymm17
+
+// CHECK: vblendmps 291(%rax,%r14,8), %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0xa2,0x45,0x20,0x65,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vblendmps 291(%rax,%r14,8), %ymm23, %ymm17
+
+// CHECK: vblendmps (%rcx){1to8}, %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0x45,0x30,0x65,0x09]
+          vblendmps (%rcx){1to8}, %ymm23, %ymm17
+
+// CHECK: vblendmps 4064(%rdx), %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0x45,0x20,0x65,0x4a,0x7f]
+          vblendmps 4064(%rdx), %ymm23, %ymm17
+
+// CHECK: vblendmps 4096(%rdx), %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0x45,0x20,0x65,0x8a,0x00,0x10,0x00,0x00]
+          vblendmps 4096(%rdx), %ymm23, %ymm17
+
+// CHECK: vblendmps -4096(%rdx), %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0x45,0x20,0x65,0x4a,0x80]
+          vblendmps -4096(%rdx), %ymm23, %ymm17
+
+// CHECK: vblendmps -4128(%rdx), %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0x45,0x20,0x65,0x8a,0xe0,0xef,0xff,0xff]
+          vblendmps -4128(%rdx), %ymm23, %ymm17
+
+// CHECK: vblendmps 508(%rdx){1to8}, %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0x45,0x30,0x65,0x4a,0x7f]
+          vblendmps 508(%rdx){1to8}, %ymm23, %ymm17
+
+// CHECK: vblendmps 512(%rdx){1to8}, %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0x45,0x30,0x65,0x8a,0x00,0x02,0x00,0x00]
+          vblendmps 512(%rdx){1to8}, %ymm23, %ymm17
+
+// CHECK: vblendmps -512(%rdx){1to8}, %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0x45,0x30,0x65,0x4a,0x80]
+          vblendmps -512(%rdx){1to8}, %ymm23, %ymm17
+
+// CHECK: vblendmps -516(%rdx){1to8}, %ymm23, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0x45,0x30,0x65,0x8a,0xfc,0xfd,0xff,0xff]
+          vblendmps -516(%rdx){1to8}, %ymm23, %ymm17
+
+// CHECK: vpblendmd %xmm26, %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0x82,0x35,0x00,0x64,0xca]
+          vpblendmd %xmm26, %xmm25, %xmm17
+
+// CHECK: vpblendmd %xmm26, %xmm25, %xmm17 {%k5}
+// CHECK:  encoding: [0x62,0x82,0x35,0x05,0x64,0xca]
+          vpblendmd %xmm26, %xmm25, %xmm17 {%k5}
+
+// CHECK: vpblendmd %xmm26, %xmm25, %xmm17 {%k5} {z}
+// CHECK:  encoding: [0x62,0x82,0x35,0x85,0x64,0xca]
+          vpblendmd %xmm26, %xmm25, %xmm17 {%k5} {z}
+
+// CHECK: vpblendmd (%rcx), %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0xe2,0x35,0x00,0x64,0x09]
+          vpblendmd (%rcx), %xmm25, %xmm17
+
+// CHECK: vpblendmd 291(%rax,%r14,8), %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0xa2,0x35,0x00,0x64,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpblendmd 291(%rax,%r14,8), %xmm25, %xmm17
+
+// CHECK: vpblendmd (%rcx){1to4}, %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0xe2,0x35,0x10,0x64,0x09]
+          vpblendmd (%rcx){1to4}, %xmm25, %xmm17
+
+// CHECK: vpblendmd 2032(%rdx), %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0xe2,0x35,0x00,0x64,0x4a,0x7f]
+          vpblendmd 2032(%rdx), %xmm25, %xmm17
+
+// CHECK: vpblendmd 2048(%rdx), %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0xe2,0x35,0x00,0x64,0x8a,0x00,0x08,0x00,0x00]
+          vpblendmd 2048(%rdx), %xmm25, %xmm17
+
+// CHECK: vpblendmd -2048(%rdx), %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0xe2,0x35,0x00,0x64,0x4a,0x80]
+          vpblendmd -2048(%rdx), %xmm25, %xmm17
+
+// CHECK: vpblendmd -2064(%rdx), %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0xe2,0x35,0x00,0x64,0x8a,0xf0,0xf7,0xff,0xff]
+          vpblendmd -2064(%rdx), %xmm25, %xmm17
+
+// CHECK: vpblendmd 508(%rdx){1to4}, %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0xe2,0x35,0x10,0x64,0x4a,0x7f]
+          vpblendmd 508(%rdx){1to4}, %xmm25, %xmm17
+
+// CHECK: vpblendmd 512(%rdx){1to4}, %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0xe2,0x35,0x10,0x64,0x8a,0x00,0x02,0x00,0x00]
+          vpblendmd 512(%rdx){1to4}, %xmm25, %xmm17
+
+// CHECK: vpblendmd -512(%rdx){1to4}, %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0xe2,0x35,0x10,0x64,0x4a,0x80]
+          vpblendmd -512(%rdx){1to4}, %xmm25, %xmm17
+
+// CHECK: vpblendmd -516(%rdx){1to4}, %xmm25, %xmm17
+// CHECK:  encoding: [0x62,0xe2,0x35,0x10,0x64,0x8a,0xfc,0xfd,0xff,0xff]
+          vpblendmd -516(%rdx){1to4}, %xmm25, %xmm17
+
+// CHECK: vpblendmd %ymm23, %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x22,0x15,0x20,0x64,0xd7]
+          vpblendmd %ymm23, %ymm29, %ymm26
+
+// CHECK: vpblendmd %ymm23, %ymm29, %ymm26 {%k7}
+// CHECK:  encoding: [0x62,0x22,0x15,0x27,0x64,0xd7]
+          vpblendmd %ymm23, %ymm29, %ymm26 {%k7}
+
+// CHECK: vpblendmd %ymm23, %ymm29, %ymm26 {%k7} {z}
+// CHECK:  encoding: [0x62,0x22,0x15,0xa7,0x64,0xd7]
+          vpblendmd %ymm23, %ymm29, %ymm26 {%k7} {z}
+
+// CHECK: vpblendmd (%rcx), %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x62,0x15,0x20,0x64,0x11]
+          vpblendmd (%rcx), %ymm29, %ymm26
+
+// CHECK: vpblendmd 291(%rax,%r14,8), %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x22,0x15,0x20,0x64,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpblendmd 291(%rax,%r14,8), %ymm29, %ymm26
+
+// CHECK: vpblendmd (%rcx){1to8}, %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x62,0x15,0x30,0x64,0x11]
+          vpblendmd (%rcx){1to8}, %ymm29, %ymm26
+
+// CHECK: vpblendmd 4064(%rdx), %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x62,0x15,0x20,0x64,0x52,0x7f]
+          vpblendmd 4064(%rdx), %ymm29, %ymm26
+
+// CHECK: vpblendmd 4096(%rdx), %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x62,0x15,0x20,0x64,0x92,0x00,0x10,0x00,0x00]
+          vpblendmd 4096(%rdx), %ymm29, %ymm26
+
+// CHECK: vpblendmd -4096(%rdx), %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x62,0x15,0x20,0x64,0x52,0x80]
+          vpblendmd -4096(%rdx), %ymm29, %ymm26
+
+// CHECK: vpblendmd -4128(%rdx), %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x62,0x15,0x20,0x64,0x92,0xe0,0xef,0xff,0xff]
+          vpblendmd -4128(%rdx), %ymm29, %ymm26
+
+// CHECK: vpblendmd 508(%rdx){1to8}, %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x62,0x15,0x30,0x64,0x52,0x7f]
+          vpblendmd 508(%rdx){1to8}, %ymm29, %ymm26
+
+// CHECK: vpblendmd 512(%rdx){1to8}, %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x62,0x15,0x30,0x64,0x92,0x00,0x02,0x00,0x00]
+          vpblendmd 512(%rdx){1to8}, %ymm29, %ymm26
+
+// CHECK: vpblendmd -512(%rdx){1to8}, %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x62,0x15,0x30,0x64,0x52,0x80]
+          vpblendmd -512(%rdx){1to8}, %ymm29, %ymm26
+
+// CHECK: vpblendmd -516(%rdx){1to8}, %ymm29, %ymm26
+// CHECK:  encoding: [0x62,0x62,0x15,0x30,0x64,0x92,0xfc,0xfd,0xff,0xff]
+          vpblendmd -516(%rdx){1to8}, %ymm29, %ymm26
+
+// CHECK: vpblendmq %xmm17, %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x22,0xa5,0x00,0x64,0xe9]
+          vpblendmq %xmm17, %xmm27, %xmm29
+
+// CHECK: vpblendmq %xmm17, %xmm27, %xmm29 {%k6}
+// CHECK:  encoding: [0x62,0x22,0xa5,0x06,0x64,0xe9]
+          vpblendmq %xmm17, %xmm27, %xmm29 {%k6}
+
+// CHECK: vpblendmq %xmm17, %xmm27, %xmm29 {%k6} {z}
+// CHECK:  encoding: [0x62,0x22,0xa5,0x86,0x64,0xe9]
+          vpblendmq %xmm17, %xmm27, %xmm29 {%k6} {z}
+
+// CHECK: vpblendmq (%rcx), %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x62,0xa5,0x00,0x64,0x29]
+          vpblendmq (%rcx), %xmm27, %xmm29
+
+// CHECK: vpblendmq 291(%rax,%r14,8), %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x22,0xa5,0x00,0x64,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpblendmq 291(%rax,%r14,8), %xmm27, %xmm29
+
+// CHECK: vpblendmq (%rcx){1to2}, %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x62,0xa5,0x10,0x64,0x29]
+          vpblendmq (%rcx){1to2}, %xmm27, %xmm29
+
+// CHECK: vpblendmq 2032(%rdx), %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x62,0xa5,0x00,0x64,0x6a,0x7f]
+          vpblendmq 2032(%rdx), %xmm27, %xmm29
+
+// CHECK: vpblendmq 2048(%rdx), %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x62,0xa5,0x00,0x64,0xaa,0x00,0x08,0x00,0x00]
+          vpblendmq 2048(%rdx), %xmm27, %xmm29
+
+// CHECK: vpblendmq -2048(%rdx), %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x62,0xa5,0x00,0x64,0x6a,0x80]
+          vpblendmq -2048(%rdx), %xmm27, %xmm29
+
+// CHECK: vpblendmq -2064(%rdx), %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x62,0xa5,0x00,0x64,0xaa,0xf0,0xf7,0xff,0xff]
+          vpblendmq -2064(%rdx), %xmm27, %xmm29
+
+// CHECK: vpblendmq 1016(%rdx){1to2}, %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x62,0xa5,0x10,0x64,0x6a,0x7f]
+          vpblendmq 1016(%rdx){1to2}, %xmm27, %xmm29
+
+// CHECK: vpblendmq 1024(%rdx){1to2}, %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x62,0xa5,0x10,0x64,0xaa,0x00,0x04,0x00,0x00]
+          vpblendmq 1024(%rdx){1to2}, %xmm27, %xmm29
+
+// CHECK: vpblendmq -1024(%rdx){1to2}, %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x62,0xa5,0x10,0x64,0x6a,0x80]
+          vpblendmq -1024(%rdx){1to2}, %xmm27, %xmm29
+
+// CHECK: vpblendmq -1032(%rdx){1to2}, %xmm27, %xmm29
+// CHECK:  encoding: [0x62,0x62,0xa5,0x10,0x64,0xaa,0xf8,0xfb,0xff,0xff]
+          vpblendmq -1032(%rdx){1to2}, %xmm27, %xmm29
+
+// CHECK: vpblendmq %ymm21, %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xa2,0xc5,0x20,0x64,0xed]
+          vpblendmq %ymm21, %ymm23, %ymm21
+
+// CHECK: vpblendmq %ymm21, %ymm23, %ymm21 {%k3}
+// CHECK:  encoding: [0x62,0xa2,0xc5,0x23,0x64,0xed]
+          vpblendmq %ymm21, %ymm23, %ymm21 {%k3}
+
+// CHECK: vpblendmq %ymm21, %ymm23, %ymm21 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa2,0xc5,0xa3,0x64,0xed]
+          vpblendmq %ymm21, %ymm23, %ymm21 {%k3} {z}
+
+// CHECK: vpblendmq (%rcx), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x20,0x64,0x29]
+          vpblendmq (%rcx), %ymm23, %ymm21
+
+// CHECK: vpblendmq 291(%rax,%r14,8), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xa2,0xc5,0x20,0x64,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpblendmq 291(%rax,%r14,8), %ymm23, %ymm21
+
+// CHECK: vpblendmq (%rcx){1to4}, %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x30,0x64,0x29]
+          vpblendmq (%rcx){1to4}, %ymm23, %ymm21
+
+// CHECK: vpblendmq 4064(%rdx), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x20,0x64,0x6a,0x7f]
+          vpblendmq 4064(%rdx), %ymm23, %ymm21
+
+// CHECK: vpblendmq 4096(%rdx), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x20,0x64,0xaa,0x00,0x10,0x00,0x00]
+          vpblendmq 4096(%rdx), %ymm23, %ymm21
+
+// CHECK: vpblendmq -4096(%rdx), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x20,0x64,0x6a,0x80]
+          vpblendmq -4096(%rdx), %ymm23, %ymm21
+
+// CHECK: vpblendmq -4128(%rdx), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x20,0x64,0xaa,0xe0,0xef,0xff,0xff]
+          vpblendmq -4128(%rdx), %ymm23, %ymm21
+
+// CHECK: vpblendmq 1016(%rdx){1to4}, %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x30,0x64,0x6a,0x7f]
+          vpblendmq 1016(%rdx){1to4}, %ymm23, %ymm21
+
+// CHECK: vpblendmq 1024(%rdx){1to4}, %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x30,0x64,0xaa,0x00,0x04,0x00,0x00]
+          vpblendmq 1024(%rdx){1to4}, %ymm23, %ymm21
+
+// CHECK: vpblendmq -1024(%rdx){1to4}, %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x30,0x64,0x6a,0x80]
+          vpblendmq -1024(%rdx){1to4}, %ymm23, %ymm21
+
+// CHECK: vpblendmq -1032(%rdx){1to4}, %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x30,0x64,0xaa,0xf8,0xfb,0xff,0xff]
+          vpblendmq -1032(%rdx){1to4}, %ymm23, %ymm21
diff --git a/test/MC/X86/compact-unwind.s b/test/MC/X86/compact-unwind.s
new file mode 100644
index 0000000..82be239
--- /dev/null
+++ b/test/MC/X86/compact-unwind.s
@@ -0,0 +1,72 @@
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin10.0 %s | llvm-objdump -unwind-info - | FileCheck %s
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.macosx_version_min 10, 10
+
+# Check that we emit compact-unwind info with UNWIND_X86_MODE_STACK_IND encoding
+
+# CHECK: Contents of __compact_unwind section:
+# CHECK-NEXT:   Entry at offset 0x0:
+# CHECK-NEXT:     start:                0x0 _test0
+# CHECK-NEXT:     length:               0x15
+# CHECK-NEXT:     compact encoding:     0x03056804
+	.globl	_test0
+_test0:                                  ## @test0
+	.cfi_startproc
+## BB#0:                                ## %entry
+	pushq	%rbp
+Ltmp0:
+	.cfi_def_cfa_offset 16
+	pushq	%rbx
+Ltmp1:
+	.cfi_def_cfa_offset 24
+	subq	$14408, %rsp            ## imm = 0x3848
+Ltmp2:
+	.cfi_def_cfa_offset 14432
+Ltmp3:
+	.cfi_offset %rbx, -24
+Ltmp4:
+	.cfi_offset %rbp, -16
+	xorl	%eax, %eax
+	addq	$14408, %rsp            ## imm = 0x3848
+	popq	%rbx
+	popq	%rbp
+	retq
+	.cfi_endproc
+
+# Check that we emit compact-unwind info with UNWIND_X86_MODE_STACK_IMMD encoding
+
+# CHECK:   Entry at offset 0x20:
+# CHECK-NEXT:     start:                0x15 _test1
+# CHECK-NEXT:     length:               0x15
+# CHECK-NEXT:     compact encoding:     0x02360804
+	.globl	_test1
+_test1:                                  ## @test1
+	.cfi_startproc
+## BB#0:                                ## %entry
+	pushq	%rbp
+Ltmp10:
+	.cfi_def_cfa_offset 16
+	pushq	%rbx
+Ltmp11:
+	.cfi_def_cfa_offset 24
+	subq	$408, %rsp              ## imm = 0x198
+Ltmp12:
+	.cfi_def_cfa_offset 432
+Ltmp13:
+	.cfi_offset %rbx, -24
+Ltmp14:
+	.cfi_offset %rbp, -16
+	xorl	%eax, %eax
+	addq	$408, %rsp              ## imm = 0x198
+	popq	%rbx
+	popq	%rbp
+	retq
+	.cfi_endproc
+
+	.section	__TEXT,__cstring,cstring_literals
+L_.str:                                 ## @.str
+	.asciz	"%d\n"
+
+
+.subsections_via_symbols
diff --git a/test/MC/X86/cstexpr-gotpcrel.ll b/test/MC/X86/cstexpr-gotpcrel.ll
new file mode 100644
index 0000000..82da870
--- /dev/null
+++ b/test/MC/X86/cstexpr-gotpcrel.ll
@@ -0,0 +1,78 @@
+; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t
+; RUN: FileCheck %s < %t
+; RUN: FileCheck %s -check-prefix=GOT-EQUIV < %t
+
+; GOT equivalent globals references can be replaced by the GOT entry of the
+; final symbol instead.
+
+%struct.data = type { i32, %struct.anon }
+%struct.anon = type { i32, i32 }
+
+; Check that these got equivalent symbols are never emitted or used
+; GOT-EQUIV-NOT: _localgotequiv
+; GOT-EQUIV-NOT: _extgotequiv
+@localfoo = global i32 42
+@localgotequiv = private unnamed_addr constant i32* @localfoo
+
+@extfoo = external global i32
+@extgotequiv = private unnamed_addr constant i32* @extfoo
+
+; Don't replace GOT equivalent usage within instructions and emit the GOT
+; equivalent since it can't be replaced by the GOT entry. @bargotequiv is
+; used by an instruction inside @t0.
+;
+; CHECK: l_bargotequiv:
+; CHECK-NEXT:  .quad   _extbar
+@extbar = external global i32
+@bargotequiv = private unnamed_addr constant i32* @extbar
+
+@table = global [4 x %struct.data] [
+; CHECK-LABEL: _table
+  %struct.data { i32 1, %struct.anon { i32 2, i32 3 } },
+; Test GOT equivalent usage inside nested constant arrays.
+; CHECK: .long   5
+; CHECK-NOT: .long   _localgotequiv-(_table+20)
+; CHECK-NEXT: .long   _localfoo@GOTPCREL+4
+  %struct.data { i32 4, %struct.anon { i32 5,
+    i32 trunc (i64 sub (i64 ptrtoint (i32** @localgotequiv to i64),
+                        i64 ptrtoint (i32* getelementptr inbounds ([4 x %struct.data]* @table, i32 0, i64 1, i32 1, i32 1) to i64))
+                        to i32)}
+  },
+; CHECK: .long   5
+; CHECK-NOT: _extgotequiv-(_table+32)
+; CHECK-NEXT: .long   _extfoo@GOTPCREL+4
+  %struct.data { i32 4, %struct.anon { i32 5,
+    i32 trunc (i64 sub (i64 ptrtoint (i32** @extgotequiv to i64),
+                        i64 ptrtoint (i32* getelementptr inbounds ([4 x %struct.data]* @table, i32 0, i64 2, i32 1, i32 1) to i64))
+                        to i32)}
+  },
+; Test support for arbitrary constants into the GOTPCREL offset
+; CHECK: .long   5
+; CHECK-NOT: _extgotequiv-(_table+44)
+; CHECK-NEXT: .long   _extfoo@GOTPCREL+28
+  %struct.data { i32 4, %struct.anon { i32 5,
+    i32 add (i32 trunc (i64 sub (i64 ptrtoint (i32** @extgotequiv to i64),
+                                 i64 ptrtoint (i32* getelementptr inbounds ([4 x %struct.data]* @table, i32 0, i64 3, i32 1, i32 1) to i64))
+                                 to i32), i32 24)}
+  }
+], align 16
+
+; Test multiple uses of GOT equivalents.
+; CHECK-LABEL: _delta
+; CHECK: .long   _extfoo@GOTPCREL+4
+@delta = global i32 trunc (i64 sub (i64 ptrtoint (i32** @extgotequiv to i64),
+                                    i64 ptrtoint (i32* @delta to i64))
+                           to i32)
+
+; CHECK-LABEL: _deltaplus:
+; CHECK: .long   _localfoo@GOTPCREL+59
+@deltaplus = global i32 add (i32 trunc (i64 sub (i64 ptrtoint (i32** @localgotequiv to i64),
+                                        i64 ptrtoint (i32* @deltaplus to i64))
+                                        to i32), i32 55)
+
+define i32 @t0(i32 %a) {
+  %x = add i32 trunc (i64 sub (i64 ptrtoint (i32** @bargotequiv to i64),
+                               i64 ptrtoint (i32 (i32)* @t0 to i64))
+                           to i32), %a
+  ret i32 %x
+}
diff --git a/test/MC/X86/i386-darwin-frame-register.ll b/test/MC/X86/i386-darwin-frame-register.ll
new file mode 100644
index 0000000..dd8c88d
--- /dev/null
+++ b/test/MC/X86/i386-darwin-frame-register.ll
@@ -0,0 +1,38 @@
+; RUN: llc -filetype=obj %s -o - | llvm-dwarfdump -debug-dump=frames - | FileCheck %s
+
+; IR reduced from a dummy:
+; void foo() {}
+
+; x86 Darwin uses different register mappings for eh_frame and debug_frame
+; sections. Check that the right mapping is used in debug_frame.
+; In the debug_frame mapping, regsiter 4 is ESP, thus the below tests that
+; the CFA is ESP+4 upon function entry.
+
+; CHECK: .debug_frame contents:
+; CHECK: ffffffff CIE
+; CHECK-NOT: {{CIE|FDE}}
+; CHECK:   DW_CFA_def_cfa: reg4 +4
+
+; ModuleID = 'foo.c'
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.10.0"
+
+; Function Attrs: nounwind ssp
+define void @foo() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = !{!"0x11\0012\00clang version 3.7.0 (trunk 230514) (llvm/trunk 230518)\000\00\000\00\001", !1, !2, !2, !2, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/foo.c] [DW_LANG_C99]
+!1 = !{!"foo.c", !"/tmp"}
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 2}
+!4 = !{i32 2, !"Debug Info Version", i32 2}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 3.7.0 (trunk 230514) (llvm/trunk 230518)"}
diff --git a/test/MC/X86/intel-syntax-unsized-memory.s b/test/MC/X86/intel-syntax-unsized-memory.s
new file mode 100644
index 0000000..c9d8e2a
--- /dev/null
+++ b/test/MC/X86/intel-syntax-unsized-memory.s
@@ -0,0 +1,29 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -mcpu=knl %s | FileCheck %s
+
+// Check that we deduce unsized memory operands in the general, unambiguous, case.
+// We can't deduce xword memory operands, because there is no instruction
+// unambiguously accessing 80-bit memory.
+
+// CHECK: movb %al, (%rax)
+mov [rax], al
+
+// CHECK: movw %ax, (%rax)
+mov [rax], ax
+
+// CHECK: movl %eax, (%rax)
+mov [rax], eax
+
+// CHECK: movq %rax, (%rax)
+mov [rax], rax
+
+// CHECK: movdqa %xmm0, (%rax)
+movdqa [rax], xmm0
+
+// CHECK: vmovdqa %ymm0, (%rax)
+vmovdqa [rax], ymm0
+
+// CHECK: vaddps (%rax), %zmm1, %zmm1
+vaddps zmm1, zmm1, [rax]
+
+// CHECK: leal 1(%r15d), %r9d
+lea r9d, [r15d+1]
diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s
index c027aa4..fce0c65 100644
--- a/test/MC/X86/intel-syntax.s
+++ b/test/MC/X86/intel-syntax.s
@@ -586,8 +586,8 @@ fdiv ST(1)
 fdivr ST(1)
 
 
-// CHECK: fxsaveq (%rax)
-// CHECK: fxrstorq (%rax)
+// CHECK: fxsave64 (%rax)
+// CHECK: fxrstor64 (%rax)
 fxsave64 opaque ptr [rax]
 fxrstor64 opaque ptr [rax]
 
diff --git a/test/MC/X86/shuffle-comments.s b/test/MC/X86/shuffle-comments.s
index 20fd4eb..2f22bff 100644
--- a/test/MC/X86/shuffle-comments.s
+++ b/test/MC/X86/shuffle-comments.s
@@ -269,3 +269,8 @@ vshufpd $11, %ymm0, %ymm1, %ymm2
 # CHECK: ymm2 = ymm1[1],ymm0[1],ymm1[2],ymm0[3]
 vshufpd $11, (%rax), %ymm1, %ymm2
 # CHECK: ymm2 = ymm1[1],mem[1],ymm1[2],mem[3]
+
+vinsertps $16, %xmm0, %xmm1, %xmm2
+# CHECK: xmm2 = xmm1[0],xmm0[0],xmm1[2,3]
+vinsertps $16, (%rax), %xmm1, %xmm2
+# CHECK: xmm2 = xmm1[0],mem[0],xmm1[2,3]
diff --git a/test/MC/X86/validate-inst-att.s b/test/MC/X86/validate-inst-att.s
new file mode 100644
index 0000000..dec8bfd
--- /dev/null
+++ b/test/MC/X86/validate-inst-att.s
@@ -0,0 +1,7 @@
+# RUN: not llvm-mc -triple i686 -filetype asm -o /dev/null %s 2>&1 | FileCheck %s
+
+	.text
+	int $65535
+# CHECK: error: interrupt vector must be in range [0-255]
+# CHECK:	int $65535
+# CHECK:            ^
diff --git a/test/MC/X86/validate-inst-intel.s b/test/MC/X86/validate-inst-intel.s
new file mode 100644
index 0000000..9a7d122
--- /dev/null
+++ b/test/MC/X86/validate-inst-intel.s
@@ -0,0 +1,9 @@
+# RUN: not llvm-mc -x86-asm-syntax intel -triple i686 -filetype asm -o /dev/null %s 2>&1 \
+# RUN:    | FileCheck %s
+
+	.text
+	int 65535
+# CHECK: error: interrupt vector must be in range [0-255]
+# CHECK:	int 65535
+# CHECK:            ^
+
diff --git a/test/MC/X86/x86-32-avx.s b/test/MC/X86/x86-32-avx.s
index ec4abdb..a2f47d4 100644
--- a/test/MC/X86/x86-32-avx.s
+++ b/test/MC/X86/x86-32-avx.s
@@ -343,131 +343,131 @@
 // CHECK: encoding: [0xc5,0xe9,0xc6,0x5c,0xcb,0xfc,0x08]
           vshufpd  $8, -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpps  $0, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpeqps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x00]
           vcmpeqps   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $2, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpleps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x02]
           vcmpleps   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $1, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpltps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x01]
           vcmpltps   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $4, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpneqps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x04]
           vcmpneqps   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $6, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpnleps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x06]
           vcmpnleps   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $5, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpnltps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x05]
           vcmpnltps   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $7, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpordps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x07]
           vcmpordps   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $3, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpunordps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x03]
           vcmpunordps   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $0, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpeqps  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0x5c,0xcb,0xfc,0x00]
           vcmpeqps   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpps  $2, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpleps  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0x5c,0xcb,0xfc,0x02]
           vcmpleps   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpps  $1, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpltps  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0x5c,0xcb,0xfc,0x01]
           vcmpltps   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpps  $4, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpneqps  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0x5c,0xcb,0xfc,0x04]
           vcmpneqps   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpps  $6, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpnleps  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0x5c,0xcb,0xfc,0x06]
           vcmpnleps   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpps  $5, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpnltps  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0x5c,0xcb,0xfc,0x05]
           vcmpnltps   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpps  $7, -4(%ebx,%ecx,8), %xmm6, %xmm2
+// CHECK: vcmpordps  -4(%ebx,%ecx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xc8,0xc2,0x54,0xcb,0xfc,0x07]
           vcmpordps   -4(%ebx,%ecx,8), %xmm6, %xmm2
 
-// CHECK: vcmpps  $3, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpunordps  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0x5c,0xcb,0xfc,0x03]
           vcmpunordps   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmppd  $0, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpeqpd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0xd9,0x00]
           vcmpeqpd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmppd  $2, %xmm1, %xmm2, %xmm3
+// CHECK: vcmplepd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0xd9,0x02]
           vcmplepd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmppd  $1, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpltpd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0xd9,0x01]
           vcmpltpd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmppd  $4, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpneqpd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0xd9,0x04]
           vcmpneqpd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmppd  $6, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpnlepd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0xd9,0x06]
           vcmpnlepd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmppd  $5, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpnltpd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0xd9,0x05]
           vcmpnltpd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmppd  $7, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpordpd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0xd9,0x07]
           vcmpordpd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmppd  $3, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpunordpd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0xd9,0x03]
           vcmpunordpd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmppd  $0, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpeqpd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0x5c,0xcb,0xfc,0x00]
           vcmpeqpd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmppd  $2, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmplepd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0x5c,0xcb,0xfc,0x02]
           vcmplepd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmppd  $1, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpltpd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0x5c,0xcb,0xfc,0x01]
           vcmpltpd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmppd  $4, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpneqpd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0x5c,0xcb,0xfc,0x04]
           vcmpneqpd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmppd  $6, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpnlepd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0x5c,0xcb,0xfc,0x06]
           vcmpnlepd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmppd  $5, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpnltpd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0x5c,0xcb,0xfc,0x05]
           vcmpnltpd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmppd  $7, -4(%ebx,%ecx,8), %xmm6, %xmm2
+// CHECK: vcmpordpd  -4(%ebx,%ecx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xc9,0xc2,0x54,0xcb,0xfc,0x07]
           vcmpordpd   -4(%ebx,%ecx,8), %xmm6, %xmm2
 
-// CHECK: vcmppd  $3, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpunordpd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe9,0xc2,0x5c,0xcb,0xfc,0x03]
           vcmpunordpd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
@@ -487,131 +487,131 @@
 // CHECK: encoding: [0xc5,0xfd,0x50,0xc2]
           vmovmskpd  %ymm2, %eax
 
-// CHECK: vcmpss  $0, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpeqss  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0xd9,0x00]
           vcmpeqss   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpss  $2, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpless   %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0xd9,0x02]
           vcmpless   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpss  $1, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpltss  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0xd9,0x01]
           vcmpltss   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpss  $4, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpneqss  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0xd9,0x04]
           vcmpneqss   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpss  $6, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpnless  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0xd9,0x06]
           vcmpnless   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpss  $5, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpnltss  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0xd9,0x05]
           vcmpnltss   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpss  $7, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpordss  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0xd9,0x07]
           vcmpordss   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpss  $3, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpunordss  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0xd9,0x03]
           vcmpunordss   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpss  $0, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpeqss  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0x5c,0xcb,0xfc,0x00]
           vcmpeqss   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpss  $2, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpless  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0x5c,0xcb,0xfc,0x02]
           vcmpless   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpss  $1, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpltss  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0x5c,0xcb,0xfc,0x01]
           vcmpltss   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpss  $4, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpneqss  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0x5c,0xcb,0xfc,0x04]
           vcmpneqss   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpss  $6, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpnless  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0x5c,0xcb,0xfc,0x06]
           vcmpnless   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpss  $5, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpnltss  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0x5c,0xcb,0xfc,0x05]
           vcmpnltss   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpss  $7, -4(%ebx,%ecx,8), %xmm6, %xmm2
+// CHECK: vcmpordss  -4(%ebx,%ecx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xca,0xc2,0x54,0xcb,0xfc,0x07]
           vcmpordss   -4(%ebx,%ecx,8), %xmm6, %xmm2
 
-// CHECK: vcmpss  $3, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpunordss  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xea,0xc2,0x5c,0xcb,0xfc,0x03]
           vcmpunordss   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpsd  $0, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpeqsd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0xd9,0x00]
           vcmpeqsd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpsd  $2, %xmm1, %xmm2, %xmm3
+// CHECK: vcmplesd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0xd9,0x02]
           vcmplesd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpsd  $1, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpltsd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0xd9,0x01]
           vcmpltsd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpsd  $4, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpneqsd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0xd9,0x04]
           vcmpneqsd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpsd  $6, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpnlesd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0xd9,0x06]
           vcmpnlesd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpsd  $5, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpnltsd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0xd9,0x05]
           vcmpnltsd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpsd  $7, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpordsd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0xd9,0x07]
           vcmpordsd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpsd  $3, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpunordsd  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0xd9,0x03]
           vcmpunordsd   %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpsd  $0, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpeqsd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0x5c,0xcb,0xfc,0x00]
           vcmpeqsd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpsd  $2, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmplesd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0x5c,0xcb,0xfc,0x02]
           vcmplesd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpsd  $1, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpltsd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0x5c,0xcb,0xfc,0x01]
           vcmpltsd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpsd  $4, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpneqsd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0x5c,0xcb,0xfc,0x04]
           vcmpneqsd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpsd  $6, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpnlesd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0x5c,0xcb,0xfc,0x06]
           vcmpnlesd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpsd  $5, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpnltsd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0x5c,0xcb,0xfc,0x05]
           vcmpnltsd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
-// CHECK: vcmpsd  $7, -4(%ebx,%ecx,8), %xmm6, %xmm2
+// CHECK: vcmpordsd  -4(%ebx,%ecx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xcb,0xc2,0x54,0xcb,0xfc,0x07]
           vcmpordsd   -4(%ebx,%ecx,8), %xmm6, %xmm2
 
-// CHECK: vcmpsd  $3, -4(%ebx,%ecx,8), %xmm2, %xmm3
+// CHECK: vcmpunordsd  -4(%ebx,%ecx,8), %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xeb,0xc2,0x5c,0xcb,0xfc,0x03]
           vcmpunordsd   -4(%ebx,%ecx,8), %xmm2, %xmm3
 
@@ -2195,99 +2195,99 @@
 // CHECK: encoding: [0xc4,0xe3,0x79,0xdf,0x28,0x07]
           vaeskeygenassist  $7, (%eax), %xmm5
 
-// CHECK: vcmpps  $8, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpeq_uqps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x08]
           vcmpeq_uqps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $9, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpngeps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x09]
           vcmpngeps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $10, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpngtps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x0a]
           vcmpngtps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $11, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpfalseps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x0b]
           vcmpfalseps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $12, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpneq_oqps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x0c]
           vcmpneq_oqps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $13, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpgeps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x0d]
           vcmpgeps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $14, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpgtps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x0e]
           vcmpgtps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $15, %xmm1, %xmm2, %xmm3
+// CHECK: vcmptrueps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x0f]
           vcmptrueps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $16, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpeq_osps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x10]
           vcmpeq_osps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $17, %xmm1, %xmm2, %xmm3
+// CHECK: vcmplt_oqps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x11]
           vcmplt_oqps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $18, %xmm1, %xmm2, %xmm3
+// CHECK: vcmple_oqps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x12]
           vcmple_oqps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $19, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpunord_sps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x13]
           vcmpunord_sps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $20, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpneq_usps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x14]
           vcmpneq_usps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $21, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpnlt_uqps   %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x15]
           vcmpnlt_uqps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $22, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpnle_uqps   %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x16]
           vcmpnle_uqps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $23, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpord_sps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x17]
           vcmpord_sps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $24, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpeq_usps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x18]
           vcmpeq_usps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $25, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpnge_uqps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x19]
           vcmpnge_uqps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $26, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpngt_uqps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x1a]
           vcmpngt_uqps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $27, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpfalse_osps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x1b]
           vcmpfalse_osps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $28, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpneq_osps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x1c]
           vcmpneq_osps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $29, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpge_oqps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x1d]
           vcmpge_oqps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $30, %xmm1, %xmm2, %xmm3
+// CHECK: vcmpgt_oqps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x1e]
           vcmpgt_oqps %xmm1, %xmm2, %xmm3
 
-// CHECK: vcmpps  $31, %xmm1, %xmm2, %xmm3
+// CHECK: vcmptrue_usps  %xmm1, %xmm2, %xmm3
 // CHECK: encoding: [0xc5,0xe8,0xc2,0xd9,0x1f]
           vcmptrue_usps %xmm1, %xmm2, %xmm3
 
@@ -2687,227 +2687,227 @@
 // CHECK: encoding: [0xc5,0xfb,0xe6,0x08]
           vcvtpd2dqx  (%eax), %xmm1
 
-// CHECK: vcmpps  $0, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpeqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x00]
           vcmpeqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $2, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpleps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x02]
           vcmpleps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $1, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpltps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x01]
           vcmpltps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $4, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpneqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x04]
           vcmpneqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $6, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpnleps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x06]
           vcmpnleps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $5, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpnltps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x05]
           vcmpnltps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $7, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpordps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x07]
           vcmpordps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $3, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpunordps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x03]
           vcmpunordps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $0, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpeqps  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0x5c,0xcb,0xfc,0x00]
           vcmpeqps -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmpps  $2, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpleps  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0x5c,0xcb,0xfc,0x02]
           vcmpleps -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmpps  $1, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpltps  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0x5c,0xcb,0xfc,0x01]
           vcmpltps -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmpps  $4, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpneqps  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0x5c,0xcb,0xfc,0x04]
           vcmpneqps -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmpps  $6, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpnleps  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0x5c,0xcb,0xfc,0x06]
           vcmpnleps -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmpps  $5, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpnltps  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0x5c,0xcb,0xfc,0x05]
           vcmpnltps -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmpps  $7, -4(%ebx,%ecx,8), %ymm6, %ymm2
+// CHECK: vcmpordps  -4(%ebx,%ecx,8), %ymm6, %ymm2
 // CHECK: encoding: [0xc5,0xcc,0xc2,0x54,0xcb,0xfc,0x07]
           vcmpordps -4(%ebx,%ecx,8), %ymm6, %ymm2
 
-// CHECK: vcmpps  $3, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpunordps  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0x5c,0xcb,0xfc,0x03]
           vcmpunordps -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmppd  $0, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpeqpd  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0xd9,0x00]
           vcmpeqpd %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmppd  $2, %ymm1, %ymm2, %ymm3
+// CHECK: vcmplepd  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0xd9,0x02]
           vcmplepd %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmppd  $1, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpltpd  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0xd9,0x01]
           vcmpltpd %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmppd  $4, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpneqpd  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0xd9,0x04]
           vcmpneqpd %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmppd  $6, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpnlepd  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0xd9,0x06]
           vcmpnlepd %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmppd  $5, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpnltpd  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0xd9,0x05]
           vcmpnltpd %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmppd  $7, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpordpd  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0xd9,0x07]
           vcmpordpd %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmppd  $3, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpunordpd  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0xd9,0x03]
           vcmpunordpd %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmppd  $0, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpeqpd  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0x5c,0xcb,0xfc,0x00]
           vcmpeqpd -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmppd  $2, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmplepd  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0x5c,0xcb,0xfc,0x02]
           vcmplepd -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmppd  $1, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpltpd  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0x5c,0xcb,0xfc,0x01]
           vcmpltpd -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmppd  $4, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpneqpd  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0x5c,0xcb,0xfc,0x04]
           vcmpneqpd -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmppd  $6, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpnlepd  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0x5c,0xcb,0xfc,0x06]
           vcmpnlepd -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmppd  $5, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpnltpd  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0x5c,0xcb,0xfc,0x05]
           vcmpnltpd -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmppd  $7, -4(%ebx,%ecx,8), %ymm6, %ymm2
+// CHECK: vcmpordpd  -4(%ebx,%ecx,8), %ymm6, %ymm2
 // CHECK: encoding: [0xc5,0xcd,0xc2,0x54,0xcb,0xfc,0x07]
           vcmpordpd -4(%ebx,%ecx,8), %ymm6, %ymm2
 
-// CHECK: vcmppd  $3, -4(%ebx,%ecx,8), %ymm2, %ymm3
+// CHECK: vcmpunordpd  -4(%ebx,%ecx,8), %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xed,0xc2,0x5c,0xcb,0xfc,0x03]
           vcmpunordpd -4(%ebx,%ecx,8), %ymm2, %ymm3
 
-// CHECK: vcmpps  $8, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpeq_uqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x08]
           vcmpeq_uqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $9, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpngeps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x09]
           vcmpngeps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $10, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpngtps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x0a]
           vcmpngtps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $11, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpfalseps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x0b]
           vcmpfalseps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $12, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpneq_oqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x0c]
           vcmpneq_oqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $13, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpgeps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x0d]
           vcmpgeps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $14, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpgtps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x0e]
           vcmpgtps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $15, %ymm1, %ymm2, %ymm3
+// CHECK: vcmptrueps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x0f]
           vcmptrueps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $16, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpeq_osps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x10]
           vcmpeq_osps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $17, %ymm1, %ymm2, %ymm3
+// CHECK: vcmplt_oqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x11]
           vcmplt_oqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $18, %ymm1, %ymm2, %ymm3
+// CHECK: vcmple_oqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x12]
           vcmple_oqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $19, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpunord_sps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x13]
           vcmpunord_sps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $20, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpneq_usps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x14]
           vcmpneq_usps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $21, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpnlt_uqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x15]
           vcmpnlt_uqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $22, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpnle_uqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x16]
           vcmpnle_uqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $23, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpord_sps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x17]
           vcmpord_sps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $24, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpeq_usps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x18]
           vcmpeq_usps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $25, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpnge_uqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x19]
           vcmpnge_uqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $26, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpngt_uqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x1a]
           vcmpngt_uqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $27, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpfalse_osps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x1b]
           vcmpfalse_osps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $28, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpneq_osps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x1c]
           vcmpneq_osps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $29, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpge_oqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x1d]
           vcmpge_oqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $30, %ymm1, %ymm2, %ymm3
+// CHECK: vcmpgt_oqps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x1e]
           vcmpgt_oqps %ymm1, %ymm2, %ymm3
 
-// CHECK: vcmpps  $31, %ymm1, %ymm2, %ymm3
+// CHECK: vcmptrue_usps  %ymm1, %ymm2, %ymm3
 // CHECK: encoding: [0xc5,0xec,0xc2,0xd9,0x1f]
           vcmptrue_usps %ymm1, %ymm2, %ymm3
 
@@ -3351,3 +3351,15 @@
           vdppd $0x81, %xmm2, %xmm5, %xmm1
 // CHECK: vinsertps $129, %xmm3, %xmm2, %xmm1
           vinsertps $0x81, %xmm3, %xmm2, %xmm1
+
+// CHECK: vcmpps $128, %xmm2, %xmm1, %xmm0
+// CHECK: encoding: [0xc5,0xf0,0xc2,0xc2,0x80]
+vcmpps $-128, %xmm2, %xmm1, %xmm0
+
+// CHECK: vcmpps $128, %xmm2, %xmm1, %xmm0
+// CHECK: encoding: [0xc5,0xf0,0xc2,0xc2,0x80]
+vcmpps $128, %xmm2, %xmm1, %xmm0
+
+// CHECK: vcmpps $255, %xmm2, %xmm1, %xmm0
+// CHECK: encoding: [0xc5,0xf0,0xc2,0xc2,0xff]
+vcmpps $255, %xmm2, %xmm1, %xmm0
diff --git a/test/MC/X86/x86-32-coverage.s b/test/MC/X86/x86-32-coverage.s
index 80c34ec..e14031d 100644
--- a/test/MC/X86/x86-32-coverage.s
+++ b/test/MC/X86/x86-32-coverage.s
@@ -1,377 +1,5 @@
 // RUN: llvm-mc -triple i386-unknown-unknown %s --show-encoding  | FileCheck %s
 
-// CHECK: 	movb	$127, 3735928559(%ebx,%ecx,8)
-        	movb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movw	$31438, 3735928559(%ebx,%ecx,8)
-        	movw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	movl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movl	$324478056, 3735928559(%ebx,%ecx,8)
-        	movl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movsbl	3735928559(%ebx,%ecx,8), %ecx
-        	movsbl	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	movswl	3735928559(%ebx,%ecx,8), %ecx
-        	movswl	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	movzbl	3735928559(%ebx,%ecx,8), %ecx
-        	movzbl	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	movzwl	3735928559(%ebx,%ecx,8), %ecx
-        	movzwl	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	pushl	3735928559(%ebx,%ecx,8)
-        	pushl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	popl	3735928559(%ebx,%ecx,8)
-        	popl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	lahf
-        	lahf
-
-// CHECK: 	sahf
-        	sahf
-
-// CHECK: 	addb	$254, 3735928559(%ebx,%ecx,8)
-        	addb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	addb	$127, 3735928559(%ebx,%ecx,8)
-        	addb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	addw	$31438, 3735928559(%ebx,%ecx,8)
-        	addw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	addl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	addl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	addl	$324478056, 3735928559(%ebx,%ecx,8)
-        	addl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	incl	3735928559(%ebx,%ecx,8)
-        	incl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	subb	$254, 3735928559(%ebx,%ecx,8)
-        	subb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	subb	$127, 3735928559(%ebx,%ecx,8)
-        	subb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	subw	$31438, 3735928559(%ebx,%ecx,8)
-        	subw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	subl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	subl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	subl	$324478056, 3735928559(%ebx,%ecx,8)
-        	subl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	decl	3735928559(%ebx,%ecx,8)
-        	decl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sbbw	$31438, 3735928559(%ebx,%ecx,8)
-        	sbbw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sbbl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	sbbl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sbbl	$324478056, 3735928559(%ebx,%ecx,8)
-        	sbbl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cmpb	$254, 3735928559(%ebx,%ecx,8)
-        	cmpb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cmpb	$127, 3735928559(%ebx,%ecx,8)
-        	cmpb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cmpw	$31438, 3735928559(%ebx,%ecx,8)
-        	cmpw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cmpl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	cmpl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cmpl	$324478056, 3735928559(%ebx,%ecx,8)
-        	cmpl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	testb	$127, 3735928559(%ebx,%ecx,8)
-        	testb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	testw	$31438, 3735928559(%ebx,%ecx,8)
-        	testw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	testl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	testl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	testl	$324478056, 3735928559(%ebx,%ecx,8)
-        	testl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	andb	$254, 3735928559(%ebx,%ecx,8)
-        	andb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	andb	$127, 3735928559(%ebx,%ecx,8)
-        	andb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	andw	$31438, 3735928559(%ebx,%ecx,8)
-        	andw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	andl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	andl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	andl	$324478056, 3735928559(%ebx,%ecx,8)
-        	andl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	orb	$254, 3735928559(%ebx,%ecx,8)
-        	orb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	orb	$127, 3735928559(%ebx,%ecx,8)
-        	orb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	orw	$31438, 3735928559(%ebx,%ecx,8)
-        	orw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	orl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	orl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	orl	$324478056, 3735928559(%ebx,%ecx,8)
-        	orl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	xorb	$254, 3735928559(%ebx,%ecx,8)
-        	xorb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	xorb	$127, 3735928559(%ebx,%ecx,8)
-        	xorb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	xorw	$31438, 3735928559(%ebx,%ecx,8)
-        	xorw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	xorl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	xorl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	xorl	$324478056, 3735928559(%ebx,%ecx,8)
-        	xorl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	adcb	$254, 3735928559(%ebx,%ecx,8)
-        	adcb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	adcb	$127, 3735928559(%ebx,%ecx,8)
-        	adcb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	adcw	$31438, 3735928559(%ebx,%ecx,8)
-        	adcw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	adcl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	adcl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	adcl	$324478056, 3735928559(%ebx,%ecx,8)
-        	adcl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	negl	3735928559(%ebx,%ecx,8)
-        	negl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	notl	3735928559(%ebx,%ecx,8)
-        	notl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cbtw
-        	cbtw
-
-// CHECK: 	cwtl
-        	cwtl
-
-// CHECK: 	cwtd
-        	cwtd
-
-// CHECK: 	cltd
-        	cltd
-
-// CHECK: 	mull	3735928559(%ebx,%ecx,8)
-        	mull	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	imull	3735928559(%ebx,%ecx,8)
-        	imull	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	divl	3735928559(%ebx,%ecx,8)
-        	divl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	idivl	3735928559(%ebx,%ecx,8)
-        	idivl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	roll	$0, 3735928559(%ebx,%ecx,8)
-        	roll	$0,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	rolb	$127, 3735928559(%ebx,%ecx,8)
-        	rolb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	roll	3735928559(%ebx,%ecx,8)
-        	roll	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	rorl	$0, 3735928559(%ebx,%ecx,8)
-        	rorl	$0,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	rorb	$127, 3735928559(%ebx,%ecx,8)
-        	rorb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	rorl	3735928559(%ebx,%ecx,8)
-        	rorl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shll	$0, 3735928559(%ebx,%ecx,8)
-        	shll	$0,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shlb	$127, 3735928559(%ebx,%ecx,8)
-        	shlb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shll	3735928559(%ebx,%ecx,8)
-        	shll	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shrl	$0, 3735928559(%ebx,%ecx,8)
-        	shrl	$0,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shrb	$127, 3735928559(%ebx,%ecx,8)
-        	shrb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shrl	3735928559(%ebx,%ecx,8)
-        	shrl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sarl	$0, 3735928559(%ebx,%ecx,8)
-        	sarl	$0,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sarb	$127, 3735928559(%ebx,%ecx,8)
-        	sarb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sarl	3735928559(%ebx,%ecx,8)
-        	sarl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	calll	*%ecx
-        	call	*%ecx
-
-// CHECK: 	calll	*3735928559(%ebx,%ecx,8)
-        	call	*0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	calll	*3735928559(%ebx,%ecx,8)
-        	call	*0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	jmpl	*3735928559(%ebx,%ecx,8)
-        	jmp	*0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	jmpl	*3735928559(%ebx,%ecx,8)
-        	jmp	*0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	ljmpl	*3735928559(%ebx,%ecx,8)
-        	ljmpl	*0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	lret
-        	lret
-
-// CHECK: 	leave
-        	leave
-
-// CHECK: 	leave
-        	leavel
-
-// CHECK: 	seto	%bl
-        	seto	%bl
-
-// CHECK: 	seto	3735928559(%ebx,%ecx,8)
-        	seto	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setno	%bl
-        	setno	%bl
-
-// CHECK: 	setno	3735928559(%ebx,%ecx,8)
-        	setno	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setb	%bl
-        	setb	%bl
-
-// CHECK: 	setb	3735928559(%ebx,%ecx,8)
-        	setb	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setae	%bl
-        	setae	%bl
-
-// CHECK: 	setae	3735928559(%ebx,%ecx,8)
-        	setae	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sete	%bl
-        	sete	%bl
-
-// CHECK: 	sete	3735928559(%ebx,%ecx,8)
-        	sete	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setne	%bl
-        	setne	%bl
-
-// CHECK: 	setne	3735928559(%ebx,%ecx,8)
-        	setne	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setbe	%bl
-        	setbe	%bl
-
-// CHECK: 	setbe	3735928559(%ebx,%ecx,8)
-        	setbe	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	seta	%bl
-        	seta	%bl
-
-// CHECK: 	seta	3735928559(%ebx,%ecx,8)
-        	seta	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sets	%bl
-        	sets	%bl
-
-// CHECK: 	sets	3735928559(%ebx,%ecx,8)
-        	sets	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setns	%bl
-        	setns	%bl
-
-// CHECK: 	setns	3735928559(%ebx,%ecx,8)
-        	setns	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setp	%bl
-        	setp	%bl
-
-// CHECK: 	setp	3735928559(%ebx,%ecx,8)
-        	setp	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setnp	%bl
-        	setnp	%bl
-
-// CHECK: 	setnp	3735928559(%ebx,%ecx,8)
-        	setnp	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setl	%bl
-        	setl	%bl
-
-// CHECK: 	setl	3735928559(%ebx,%ecx,8)
-        	setl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setge	%bl
-        	setge	%bl
-
-// CHECK: 	setge	3735928559(%ebx,%ecx,8)
-        	setge	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setle	%bl
-        	setle	%bl
-
-// CHECK: 	setle	3735928559(%ebx,%ecx,8)
-        	setle	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setg	%bl
-        	setg	%bl
-
-// CHECK: 	setg	3735928559(%ebx,%ecx,8)
-        	setg	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	nopl	3735928559(%ebx,%ecx,8)
-        	nopl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	nop
-        	nop
-
 // CHECK: flds	(%edi)
 // CHECK:  encoding: [0xd9,0x07]
         	flds	(%edi)
@@ -380,1270 +8,6 @@
 // CHECK:  encoding: [0xdf,0x07]
         	filds	(%edi)
 
-// CHECK: 	fldl	3735928559(%ebx,%ecx,8)
-        	fldl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fildl	3735928559(%ebx,%ecx,8)
-        	fildl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fildll	3735928559(%ebx,%ecx,8)
-        	fildll	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fldt	3735928559(%ebx,%ecx,8)
-        	fldt	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fbld	3735928559(%ebx,%ecx,8)
-        	fbld	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fstl	3735928559(%ebx,%ecx,8)
-        	fstl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fistl	3735928559(%ebx,%ecx,8)
-        	fistl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fstpl	3735928559(%ebx,%ecx,8)
-        	fstpl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fistpl	3735928559(%ebx,%ecx,8)
-        	fistpl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fistpll	3735928559(%ebx,%ecx,8)
-        	fistpll	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fstpt	3735928559(%ebx,%ecx,8)
-        	fstpt	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fbstp	3735928559(%ebx,%ecx,8)
-        	fbstp	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	ficoml	3735928559(%ebx,%ecx,8)
-        	ficoml	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	ficompl	3735928559(%ebx,%ecx,8)
-        	ficompl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fucompp
-        	fucompp
-
-// CHECK: 	ftst
-        	ftst
-
-// CHECK: 	fld1
-        	fld1
-
-// CHECK: 	fldz
-        	fldz
-
-// CHECK: 	faddl	3735928559(%ebx,%ecx,8)
-        	faddl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fiaddl	3735928559(%ebx,%ecx,8)
-        	fiaddl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fsubl	3735928559(%ebx,%ecx,8)
-        	fsubl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fisubl	3735928559(%ebx,%ecx,8)
-        	fisubl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fsubrl	3735928559(%ebx,%ecx,8)
-        	fsubrl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fisubrl	3735928559(%ebx,%ecx,8)
-        	fisubrl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fmull	3735928559(%ebx,%ecx,8)
-        	fmull	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fimull	3735928559(%ebx,%ecx,8)
-        	fimull	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fdivl	3735928559(%ebx,%ecx,8)
-        	fdivl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fidivl	3735928559(%ebx,%ecx,8)
-        	fidivl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fdivrl	3735928559(%ebx,%ecx,8)
-        	fdivrl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fidivrl	3735928559(%ebx,%ecx,8)
-        	fidivrl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fsqrt
-        	fsqrt
-
-// CHECK: 	fsin
-        	fsin
-
-// CHECK: 	fcos
-        	fcos
-
-// CHECK: 	fchs
-        	fchs
-
-// CHECK: 	fabs
-        	fabs
-
-// CHECK: 	fldcw	3735928559(%ebx,%ecx,8)
-        	fldcw	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fnstcw	3735928559(%ebx,%ecx,8)
-        	fnstcw	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	rdtsc
-        	rdtsc
-
-// CHECK: 	sysenter
-        	sysenter
-
-// CHECK: 	sysexit
-        	sysexit
-
-// CHECK: 	sysexitl
-        	sysexitl
-
-// CHECK: 	ud2
-        	ud2
-
-// CHECK: 	movntil	%ecx, 3735928559(%ebx,%ecx,8)
-        	movnti	%ecx,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	clflush	3735928559(%ebx,%ecx,8)
-        	clflush	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	emms
-        	emms
-
-// CHECK: 	movd	%ecx, %mm3
-        	movd	%ecx,%mm3
-
-// CHECK: 	movd	3735928559(%ebx,%ecx,8), %mm3
-        	movd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	movd	%ecx, %xmm5
-        	movd	%ecx,%xmm5
-
-// CHECK: 	movd	3735928559(%ebx,%ecx,8), %xmm5
-        	movd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movd	%xmm5, %ecx
-        	movd	%xmm5,%ecx
-
-// CHECK: 	movd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movq	3735928559(%ebx,%ecx,8), %mm3
-        	movq	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	movq	%mm3, %mm3
-        	movq	%mm3,%mm3
-
-// CHECK: 	movq	%mm3, %mm3
-        	movq	%mm3,%mm3
-
-// CHECK: 	movq	%xmm5, %xmm5
-        	movq	%xmm5,%xmm5
-
-// CHECK: 	movq	%xmm5, %xmm5
-        	movq	%xmm5,%xmm5
-
-// CHECK: 	packssdw	%mm3, %mm3
-        	packssdw	%mm3,%mm3
-
-// CHECK: 	packssdw	%xmm5, %xmm5
-        	packssdw	%xmm5,%xmm5
-
-// CHECK: 	packsswb	%mm3, %mm3
-        	packsswb	%mm3,%mm3
-
-// CHECK: 	packsswb	%xmm5, %xmm5
-        	packsswb	%xmm5,%xmm5
-
-// CHECK: 	packuswb	%mm3, %mm3
-        	packuswb	%mm3,%mm3
-
-// CHECK: 	packuswb	%xmm5, %xmm5
-        	packuswb	%xmm5,%xmm5
-
-// CHECK: 	paddb	%mm3, %mm3
-        	paddb	%mm3,%mm3
-
-// CHECK: 	paddb	%xmm5, %xmm5
-        	paddb	%xmm5,%xmm5
-
-// CHECK: 	paddw	%mm3, %mm3
-        	paddw	%mm3,%mm3
-
-// CHECK: 	paddw	%xmm5, %xmm5
-        	paddw	%xmm5,%xmm5
-
-// CHECK: 	paddd	%mm3, %mm3
-        	paddd	%mm3,%mm3
-
-// CHECK: 	paddd	%xmm5, %xmm5
-        	paddd	%xmm5,%xmm5
-
-// CHECK: 	paddq	%mm3, %mm3
-        	paddq	%mm3,%mm3
-
-// CHECK: 	paddq	%xmm5, %xmm5
-        	paddq	%xmm5,%xmm5
-
-// CHECK: 	paddsb	%mm3, %mm3
-        	paddsb	%mm3,%mm3
-
-// CHECK: 	paddsb	%xmm5, %xmm5
-        	paddsb	%xmm5,%xmm5
-
-// CHECK: 	paddsw	%mm3, %mm3
-        	paddsw	%mm3,%mm3
-
-// CHECK: 	paddsw	%xmm5, %xmm5
-        	paddsw	%xmm5,%xmm5
-
-// CHECK: 	paddusb	%mm3, %mm3
-        	paddusb	%mm3,%mm3
-
-// CHECK: 	paddusb	%xmm5, %xmm5
-        	paddusb	%xmm5,%xmm5
-
-// CHECK: 	paddusw	%mm3, %mm3
-        	paddusw	%mm3,%mm3
-
-// CHECK: 	paddusw	%xmm5, %xmm5
-        	paddusw	%xmm5,%xmm5
-
-// CHECK: 	pand	%mm3, %mm3
-        	pand	%mm3,%mm3
-
-// CHECK: 	pand	%xmm5, %xmm5
-        	pand	%xmm5,%xmm5
-
-// CHECK: 	pandn	%mm3, %mm3
-        	pandn	%mm3,%mm3
-
-// CHECK: 	pandn	%xmm5, %xmm5
-        	pandn	%xmm5,%xmm5
-
-// CHECK: 	pcmpeqb	%mm3, %mm3
-        	pcmpeqb	%mm3,%mm3
-
-// CHECK: 	pcmpeqb	%xmm5, %xmm5
-        	pcmpeqb	%xmm5,%xmm5
-
-// CHECK: 	pcmpeqw	%mm3, %mm3
-        	pcmpeqw	%mm3,%mm3
-
-// CHECK: 	pcmpeqw	%xmm5, %xmm5
-        	pcmpeqw	%xmm5,%xmm5
-
-// CHECK: 	pcmpeqd	%mm3, %mm3
-        	pcmpeqd	%mm3,%mm3
-
-// CHECK: 	pcmpeqd	%xmm5, %xmm5
-        	pcmpeqd	%xmm5,%xmm5
-
-// CHECK: 	pcmpgtb	%mm3, %mm3
-        	pcmpgtb	%mm3,%mm3
-
-// CHECK: 	pcmpgtb	%xmm5, %xmm5
-        	pcmpgtb	%xmm5,%xmm5
-
-// CHECK: 	pcmpgtw	%mm3, %mm3
-        	pcmpgtw	%mm3,%mm3
-
-// CHECK: 	pcmpgtw	%xmm5, %xmm5
-        	pcmpgtw	%xmm5,%xmm5
-
-// CHECK: 	pcmpgtd	%mm3, %mm3
-        	pcmpgtd	%mm3,%mm3
-
-// CHECK: 	pcmpgtd	%xmm5, %xmm5
-        	pcmpgtd	%xmm5,%xmm5
-
-// CHECK: 	pmaddwd	%mm3, %mm3
-        	pmaddwd	%mm3,%mm3
-
-// CHECK: 	pmaddwd	%xmm5, %xmm5
-        	pmaddwd	%xmm5,%xmm5
-
-// CHECK: 	pmulhw	%mm3, %mm3
-        	pmulhw	%mm3,%mm3
-
-// CHECK: 	pmulhw	%xmm5, %xmm5
-        	pmulhw	%xmm5,%xmm5
-
-// CHECK: 	pmullw	%mm3, %mm3
-        	pmullw	%mm3,%mm3
-
-// CHECK: 	pmullw	%xmm5, %xmm5
-        	pmullw	%xmm5,%xmm5
-
-// CHECK: 	por	%mm3, %mm3
-        	por	%mm3,%mm3
-
-// CHECK: 	por	%xmm5, %xmm5
-        	por	%xmm5,%xmm5
-
-// CHECK: 	psllw	%mm3, %mm3
-        	psllw	%mm3,%mm3
-
-// CHECK: 	psllw	%xmm5, %xmm5
-        	psllw	%xmm5,%xmm5
-
-// CHECK: 	psllw	$127, %mm3
-        	psllw	$0x7f,%mm3
-
-// CHECK: 	psllw	$127, %xmm5
-        	psllw	$0x7f,%xmm5
-
-// CHECK: 	pslld	%mm3, %mm3
-        	pslld	%mm3,%mm3
-
-// CHECK: 	pslld	%xmm5, %xmm5
-        	pslld	%xmm5,%xmm5
-
-// CHECK: 	pslld	$127, %mm3
-        	pslld	$0x7f,%mm3
-
-// CHECK: 	pslld	$127, %xmm5
-        	pslld	$0x7f,%xmm5
-
-// CHECK: 	psllq	%mm3, %mm3
-        	psllq	%mm3,%mm3
-
-// CHECK: 	psllq	%xmm5, %xmm5
-        	psllq	%xmm5,%xmm5
-
-// CHECK: 	psllq	$127, %mm3
-        	psllq	$0x7f,%mm3
-
-// CHECK: 	psllq	$127, %xmm5
-        	psllq	$0x7f,%xmm5
-
-// CHECK: 	psraw	%mm3, %mm3
-        	psraw	%mm3,%mm3
-
-// CHECK: 	psraw	%xmm5, %xmm5
-        	psraw	%xmm5,%xmm5
-
-// CHECK: 	psraw	$127, %mm3
-        	psraw	$0x7f,%mm3
-
-// CHECK: 	psraw	$127, %xmm5
-        	psraw	$0x7f,%xmm5
-
-// CHECK: 	psrad	%mm3, %mm3
-        	psrad	%mm3,%mm3
-
-// CHECK: 	psrad	%xmm5, %xmm5
-        	psrad	%xmm5,%xmm5
-
-// CHECK: 	psrad	$127, %mm3
-        	psrad	$0x7f,%mm3
-
-// CHECK: 	psrad	$127, %xmm5
-        	psrad	$0x7f,%xmm5
-
-// CHECK: 	psrlw	%mm3, %mm3
-        	psrlw	%mm3,%mm3
-
-// CHECK: 	psrlw	%xmm5, %xmm5
-        	psrlw	%xmm5,%xmm5
-
-// CHECK: 	psrlw	$127, %mm3
-        	psrlw	$0x7f,%mm3
-
-// CHECK: 	psrlw	$127, %xmm5
-        	psrlw	$0x7f,%xmm5
-
-// CHECK: 	psrld	%mm3, %mm3
-        	psrld	%mm3,%mm3
-
-// CHECK: 	psrld	%xmm5, %xmm5
-        	psrld	%xmm5,%xmm5
-
-// CHECK: 	psrld	$127, %mm3
-        	psrld	$0x7f,%mm3
-
-// CHECK: 	psrld	$127, %xmm5
-        	psrld	$0x7f,%xmm5
-
-// CHECK: 	psrlq	%mm3, %mm3
-        	psrlq	%mm3,%mm3
-
-// CHECK: 	psrlq	%xmm5, %xmm5
-        	psrlq	%xmm5,%xmm5
-
-// CHECK: 	psrlq	$127, %mm3
-        	psrlq	$0x7f,%mm3
-
-// CHECK: 	psrlq	$127, %xmm5
-        	psrlq	$0x7f,%xmm5
-
-// CHECK: 	psubb	%mm3, %mm3
-        	psubb	%mm3,%mm3
-
-// CHECK: 	psubb	%xmm5, %xmm5
-        	psubb	%xmm5,%xmm5
-
-// CHECK: 	psubw	%mm3, %mm3
-        	psubw	%mm3,%mm3
-
-// CHECK: 	psubw	%xmm5, %xmm5
-        	psubw	%xmm5,%xmm5
-
-// CHECK: 	psubd	%mm3, %mm3
-        	psubd	%mm3,%mm3
-
-// CHECK: 	psubd	%xmm5, %xmm5
-        	psubd	%xmm5,%xmm5
-
-// CHECK: 	psubq	%mm3, %mm3
-        	psubq	%mm3,%mm3
-
-// CHECK: 	psubq	%xmm5, %xmm5
-        	psubq	%xmm5,%xmm5
-
-// CHECK: 	psubsb	%mm3, %mm3
-        	psubsb	%mm3,%mm3
-
-// CHECK: 	psubsb	%xmm5, %xmm5
-        	psubsb	%xmm5,%xmm5
-
-// CHECK: 	psubsw	%mm3, %mm3
-        	psubsw	%mm3,%mm3
-
-// CHECK: 	psubsw	%xmm5, %xmm5
-        	psubsw	%xmm5,%xmm5
-
-// CHECK: 	psubusb	%mm3, %mm3
-        	psubusb	%mm3,%mm3
-
-// CHECK: 	psubusb	%xmm5, %xmm5
-        	psubusb	%xmm5,%xmm5
-
-// CHECK: 	psubusw	%mm3, %mm3
-        	psubusw	%mm3,%mm3
-
-// CHECK: 	psubusw	%xmm5, %xmm5
-        	psubusw	%xmm5,%xmm5
-
-// CHECK: 	punpckhbw	%mm3, %mm3
-        	punpckhbw	%mm3,%mm3
-
-// CHECK: 	punpckhbw	%xmm5, %xmm5
-        	punpckhbw	%xmm5,%xmm5
-
-// CHECK: 	punpckhwd	%mm3, %mm3
-        	punpckhwd	%mm3,%mm3
-
-// CHECK: 	punpckhwd	%xmm5, %xmm5
-        	punpckhwd	%xmm5,%xmm5
-
-// CHECK: 	punpckhdq	%mm3, %mm3
-        	punpckhdq	%mm3,%mm3
-
-// CHECK: 	punpckhdq	%xmm5, %xmm5
-        	punpckhdq	%xmm5,%xmm5
-
-// CHECK: 	punpcklbw	%mm3, %mm3
-        	punpcklbw	%mm3,%mm3
-
-// CHECK: 	punpcklbw	%xmm5, %xmm5
-        	punpcklbw	%xmm5,%xmm5
-
-// CHECK: 	punpcklwd	%mm3, %mm3
-        	punpcklwd	%mm3,%mm3
-
-// CHECK: 	punpcklwd	%xmm5, %xmm5
-        	punpcklwd	%xmm5,%xmm5
-
-// CHECK: 	punpckldq	%mm3, %mm3
-        	punpckldq	%mm3,%mm3
-
-// CHECK: 	punpckldq	%xmm5, %xmm5
-        	punpckldq	%xmm5,%xmm5
-
-// CHECK: 	pxor	%mm3, %mm3
-        	pxor	%mm3,%mm3
-
-// CHECK: 	pxor	%xmm5, %xmm5
-        	pxor	%xmm5,%xmm5
-
-// CHECK: 	addps	%xmm5, %xmm5
-        	addps	%xmm5,%xmm5
-
-// CHECK: 	addss	%xmm5, %xmm5
-        	addss	%xmm5,%xmm5
-
-// CHECK: 	andnps	%xmm5, %xmm5
-        	andnps	%xmm5,%xmm5
-
-// CHECK: 	andps	%xmm5, %xmm5
-        	andps	%xmm5,%xmm5
-
-// CHECK: 	cvtpi2ps	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtpi2ps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtpi2ps	%mm3, %xmm5
-        	cvtpi2ps	%mm3,%xmm5
-
-// CHECK: 	cvtps2pi	3735928559(%ebx,%ecx,8), %mm3
-        	cvtps2pi	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	cvtps2pi	%xmm5, %mm3
-        	cvtps2pi	%xmm5,%mm3
-
-// CHECK: 	cvtsi2ssl	%ecx, %xmm5
-        	cvtsi2ssl	%ecx,%xmm5
-
-// CHECK: 	cvtsi2ssl	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtsi2ssl	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvttps2pi	3735928559(%ebx,%ecx,8), %mm3
-        	cvttps2pi	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	cvttps2pi	%xmm5, %mm3
-        	cvttps2pi	%xmm5,%mm3
-
-// CHECK: 	cvttss2si	3735928559(%ebx,%ecx,8), %ecx
-        	cvttss2si	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	cvttss2si	%xmm5, %ecx
-        	cvttss2si	%xmm5,%ecx
-
-// CHECK: 	divps	%xmm5, %xmm5
-        	divps	%xmm5,%xmm5
-
-// CHECK: 	divss	%xmm5, %xmm5
-        	divss	%xmm5,%xmm5
-
-// CHECK: 	ldmxcsr	3735928559(%ebx,%ecx,8)
-        	ldmxcsr	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	maskmovq	%mm3, %mm3
-        	maskmovq	%mm3,%mm3
-
-// CHECK: 	maxps	%xmm5, %xmm5
-        	maxps	%xmm5,%xmm5
-
-// CHECK: 	maxss	%xmm5, %xmm5
-        	maxss	%xmm5,%xmm5
-
-// CHECK: 	minps	%xmm5, %xmm5
-        	minps	%xmm5,%xmm5
-
-// CHECK: 	minss	%xmm5, %xmm5
-        	minss	%xmm5,%xmm5
-
-// CHECK: 	movaps	3735928559(%ebx,%ecx,8), %xmm5
-        	movaps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movaps	%xmm5, %xmm5
-        	movaps	%xmm5,%xmm5
-
-// CHECK: 	movaps	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movaps	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movaps	%xmm5, %xmm5
-        	movaps	%xmm5,%xmm5
-
-// CHECK: 	movhlps	%xmm5, %xmm5
-        	movhlps	%xmm5,%xmm5
-
-// CHECK: 	movhps	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movhps	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movlhps	%xmm5, %xmm5
-        	movlhps	%xmm5,%xmm5
-
-// CHECK: 	movlps	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movlps	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movmskps	%xmm5, %ecx
-        	movmskps	%xmm5,%ecx
-
-// CHECK: 	movntps	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movntps	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movntq	%mm3, 3735928559(%ebx,%ecx,8)
-        	movntq	%mm3,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movntdq	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movntdq	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movss	3735928559(%ebx,%ecx,8), %xmm5
-        	movss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movss	%xmm5, %xmm5
-        	movss	%xmm5,%xmm5
-
-// CHECK: 	movss	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movss	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movss	%xmm5, %xmm5
-        	movss	%xmm5,%xmm5
-
-// CHECK: 	movups	3735928559(%ebx,%ecx,8), %xmm5
-        	movups	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movups	%xmm5, %xmm5
-        	movups	%xmm5,%xmm5
-
-// CHECK: 	movups	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movups	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movups	%xmm5, %xmm5
-        	movups	%xmm5,%xmm5
-
-// CHECK: 	mulps	%xmm5, %xmm5
-        	mulps	%xmm5,%xmm5
-
-// CHECK: 	mulss	%xmm5, %xmm5
-        	mulss	%xmm5,%xmm5
-
-// CHECK: 	orps	%xmm5, %xmm5
-        	orps	%xmm5,%xmm5
-
-// CHECK: 	pavgb	%mm3, %mm3
-        	pavgb	%mm3,%mm3
-
-// CHECK: 	pavgb	%xmm5, %xmm5
-        	pavgb	%xmm5,%xmm5
-
-// CHECK: 	pavgw	%mm3, %mm3
-        	pavgw	%mm3,%mm3
-
-// CHECK: 	pavgw	%xmm5, %xmm5
-        	pavgw	%xmm5,%xmm5
-
-// CHECK: 	pmaxsw	%mm3, %mm3
-        	pmaxsw	%mm3,%mm3
-
-// CHECK: 	pmaxsw	%xmm5, %xmm5
-        	pmaxsw	%xmm5,%xmm5
-
-// CHECK: 	pmaxub	%mm3, %mm3
-        	pmaxub	%mm3,%mm3
-
-// CHECK: 	pmaxub	%xmm5, %xmm5
-        	pmaxub	%xmm5,%xmm5
-
-// CHECK: 	pminsw	%mm3, %mm3
-        	pminsw	%mm3,%mm3
-
-// CHECK: 	pminsw	%xmm5, %xmm5
-        	pminsw	%xmm5,%xmm5
-
-// CHECK: 	pminub	%mm3, %mm3
-        	pminub	%mm3,%mm3
-
-// CHECK: 	pminub	%xmm5, %xmm5
-        	pminub	%xmm5,%xmm5
-
-// CHECK: 	pmovmskb	%mm3, %ecx
-        	pmovmskb	%mm3,%ecx
-
-// CHECK: 	pmovmskb	%xmm5, %ecx
-        	pmovmskb	%xmm5,%ecx
-
-// CHECK: 	pmulhuw	%mm3, %mm3
-        	pmulhuw	%mm3,%mm3
-
-// CHECK: 	pmulhuw	%xmm5, %xmm5
-        	pmulhuw	%xmm5,%xmm5
-
-// CHECK: 	prefetchnta	3735928559(%ebx,%ecx,8)
-        	prefetchnta	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	prefetcht0	3735928559(%ebx,%ecx,8)
-        	prefetcht0	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	prefetcht1	3735928559(%ebx,%ecx,8)
-        	prefetcht1	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	prefetcht2	3735928559(%ebx,%ecx,8)
-        	prefetcht2	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	psadbw	%mm3, %mm3
-        	psadbw	%mm3,%mm3
-
-// CHECK: 	psadbw	%xmm5, %xmm5
-        	psadbw	%xmm5,%xmm5
-
-// CHECK: 	rcpps	3735928559(%ebx,%ecx,8), %xmm5
-        	rcpps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	rcpps	%xmm5, %xmm5
-        	rcpps	%xmm5,%xmm5
-
-// CHECK: 	rcpss	3735928559(%ebx,%ecx,8), %xmm5
-        	rcpss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	rcpss	%xmm5, %xmm5
-        	rcpss	%xmm5,%xmm5
-
-// CHECK: 	rsqrtps	3735928559(%ebx,%ecx,8), %xmm5
-        	rsqrtps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	rsqrtps	%xmm5, %xmm5
-        	rsqrtps	%xmm5,%xmm5
-
-// CHECK: 	rsqrtss	3735928559(%ebx,%ecx,8), %xmm5
-        	rsqrtss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	rsqrtss	%xmm5, %xmm5
-        	rsqrtss	%xmm5,%xmm5
-
-// CHECK: 	sqrtps	3735928559(%ebx,%ecx,8), %xmm5
-        	sqrtps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	sqrtps	%xmm5, %xmm5
-        	sqrtps	%xmm5,%xmm5
-
-// CHECK: 	sqrtss	3735928559(%ebx,%ecx,8), %xmm5
-        	sqrtss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	sqrtss	%xmm5, %xmm5
-        	sqrtss	%xmm5,%xmm5
-
-// CHECK: 	stmxcsr	3735928559(%ebx,%ecx,8)
-        	stmxcsr	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	subps	%xmm5, %xmm5
-        	subps	%xmm5,%xmm5
-
-// CHECK: 	subss	%xmm5, %xmm5
-        	subss	%xmm5,%xmm5
-
-// CHECK: 	ucomiss	3735928559(%ebx,%ecx,8), %xmm5
-        	ucomiss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	ucomiss	%xmm5, %xmm5
-        	ucomiss	%xmm5,%xmm5
-
-// CHECK: 	unpckhps	%xmm5, %xmm5
-        	unpckhps	%xmm5,%xmm5
-
-// CHECK: 	unpcklps	%xmm5, %xmm5
-        	unpcklps	%xmm5,%xmm5
-
-// CHECK: 	xorps	%xmm5, %xmm5
-        	xorps	%xmm5,%xmm5
-
-// CHECK: 	addpd	%xmm5, %xmm5
-        	addpd	%xmm5,%xmm5
-
-// CHECK: 	addsd	%xmm5, %xmm5
-        	addsd	%xmm5,%xmm5
-
-// CHECK: 	andnpd	%xmm5, %xmm5
-        	andnpd	%xmm5,%xmm5
-
-// CHECK: 	andpd	%xmm5, %xmm5
-        	andpd	%xmm5,%xmm5
-
-// CHECK: 	comisd	3735928559(%ebx,%ecx,8), %xmm5
-        	comisd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	comisd	%xmm5, %xmm5
-        	comisd	%xmm5,%xmm5
-
-// CHECK: 	cvtpi2pd	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtpi2pd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtpi2pd	%mm3, %xmm5
-        	cvtpi2pd	%mm3,%xmm5
-
-// CHECK: 	cvtsi2sdl	%ecx, %xmm5
-        	cvtsi2sdl	%ecx,%xmm5
-
-// CHECK: 	cvtsi2sdl	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtsi2sdl	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	divpd	%xmm5, %xmm5
-        	divpd	%xmm5,%xmm5
-
-// CHECK: 	divsd	%xmm5, %xmm5
-        	divsd	%xmm5,%xmm5
-
-// CHECK: 	maxpd	%xmm5, %xmm5
-        	maxpd	%xmm5,%xmm5
-
-// CHECK: 	maxsd	%xmm5, %xmm5
-        	maxsd	%xmm5,%xmm5
-
-// CHECK: 	minpd	%xmm5, %xmm5
-        	minpd	%xmm5,%xmm5
-
-// CHECK: 	minsd	%xmm5, %xmm5
-        	minsd	%xmm5,%xmm5
-
-// CHECK: 	movapd	3735928559(%ebx,%ecx,8), %xmm5
-        	movapd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movapd	%xmm5, %xmm5
-        	movapd	%xmm5,%xmm5
-
-// CHECK: 	movapd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movapd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movapd	%xmm5, %xmm5
-        	movapd	%xmm5,%xmm5
-
-// CHECK: 	movhpd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movhpd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movlpd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movlpd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movmskpd	%xmm5, %ecx
-        	movmskpd	%xmm5,%ecx
-
-// CHECK: 	movntpd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movntpd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movsd	3735928559(%ebx,%ecx,8), %xmm5
-        	movsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movsd	%xmm5, %xmm5
-        	movsd	%xmm5,%xmm5
-
-// CHECK: 	movsd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movsd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movsd	%xmm5, %xmm5
-        	movsd	%xmm5,%xmm5
-
-// CHECK: 	movupd	3735928559(%ebx,%ecx,8), %xmm5
-        	movupd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movupd	%xmm5, %xmm5
-        	movupd	%xmm5,%xmm5
-
-// CHECK: 	movupd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movupd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movupd	%xmm5, %xmm5
-        	movupd	%xmm5,%xmm5
-
-// CHECK: 	mulpd	%xmm5, %xmm5
-        	mulpd	%xmm5,%xmm5
-
-// CHECK: 	mulsd	%xmm5, %xmm5
-        	mulsd	%xmm5,%xmm5
-
-// CHECK: 	orpd	%xmm5, %xmm5
-        	orpd	%xmm5,%xmm5
-
-// CHECK: 	sqrtpd	3735928559(%ebx,%ecx,8), %xmm5
-        	sqrtpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	sqrtpd	%xmm5, %xmm5
-        	sqrtpd	%xmm5,%xmm5
-
-// CHECK: 	sqrtsd	3735928559(%ebx,%ecx,8), %xmm5
-        	sqrtsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	sqrtsd	%xmm5, %xmm5
-        	sqrtsd	%xmm5,%xmm5
-
-// CHECK: 	subpd	%xmm5, %xmm5
-        	subpd	%xmm5,%xmm5
-
-// CHECK: 	subsd	%xmm5, %xmm5
-        	subsd	%xmm5,%xmm5
-
-// CHECK: 	ucomisd	3735928559(%ebx,%ecx,8), %xmm5
-        	ucomisd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	ucomisd	%xmm5, %xmm5
-        	ucomisd	%xmm5,%xmm5
-
-// CHECK: 	unpckhpd	%xmm5, %xmm5
-        	unpckhpd	%xmm5,%xmm5
-
-// CHECK: 	unpcklpd	%xmm5, %xmm5
-        	unpcklpd	%xmm5,%xmm5
-
-// CHECK: 	xorpd	%xmm5, %xmm5
-        	xorpd	%xmm5,%xmm5
-
-// CHECK: 	cvtdq2pd	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtdq2pd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtdq2pd	%xmm5, %xmm5
-        	cvtdq2pd	%xmm5,%xmm5
-
-// CHECK: 	cvtpd2dq	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtpd2dq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtpd2dq	%xmm5, %xmm5
-        	cvtpd2dq	%xmm5,%xmm5
-
-// CHECK: 	cvtdq2ps	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtdq2ps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtdq2ps	%xmm5, %xmm5
-        	cvtdq2ps	%xmm5,%xmm5
-
-// CHECK: 	cvtpd2pi	3735928559(%ebx,%ecx,8), %mm3
-        	cvtpd2pi	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	cvtpd2pi	%xmm5, %mm3
-        	cvtpd2pi	%xmm5,%mm3
-
-// CHECK: 	cvtps2dq	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtps2dq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtps2dq	%xmm5, %xmm5
-        	cvtps2dq	%xmm5,%xmm5
-
-// CHECK: 	cvtsd2ss	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtsd2ss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtsd2ss	%xmm5, %xmm5
-        	cvtsd2ss	%xmm5,%xmm5
-
-// CHECK: 	cvtss2sd	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtss2sd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtss2sd	%xmm5, %xmm5
-        	cvtss2sd	%xmm5,%xmm5
-
-// CHECK: 	cvttpd2pi	3735928559(%ebx,%ecx,8), %mm3
-        	cvttpd2pi	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	cvttpd2pi	%xmm5, %mm3
-        	cvttpd2pi	%xmm5,%mm3
-
-// CHECK: 	cvttsd2si	3735928559(%ebx,%ecx,8), %ecx
-        	cvttsd2si	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	cvttsd2si	%xmm5, %ecx
-        	cvttsd2si	%xmm5,%ecx
-
-// CHECK: 	maskmovdqu	%xmm5, %xmm5
-        	maskmovdqu	%xmm5,%xmm5
-
-// CHECK: 	movdqa	3735928559(%ebx,%ecx,8), %xmm5
-        	movdqa	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movdqa	%xmm5, %xmm5
-        	movdqa	%xmm5,%xmm5
-
-// CHECK: 	movdqa	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movdqa	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movdqa	%xmm5, %xmm5
-        	movdqa	%xmm5,%xmm5
-
-// CHECK: 	movdqu	3735928559(%ebx,%ecx,8), %xmm5
-        	movdqu	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movdqu	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movdqu	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movdq2q	%xmm5, %mm3
-        	movdq2q	%xmm5,%mm3
-
-// CHECK: 	movq2dq	%mm3, %xmm5
-        	movq2dq	%mm3,%xmm5
-
-// CHECK: 	pmuludq	%mm3, %mm3
-        	pmuludq	%mm3,%mm3
-
-// CHECK: 	pmuludq	%xmm5, %xmm5
-        	pmuludq	%xmm5,%xmm5
-
-// CHECK: 	pslldq	$127, %xmm5
-        	pslldq	$0x7f,%xmm5
-
-// CHECK: 	psrldq	$127, %xmm5
-        	psrldq	$0x7f,%xmm5
-
-// CHECK: 	punpckhqdq	%xmm5, %xmm5
-        	punpckhqdq	%xmm5,%xmm5
-
-// CHECK: 	punpcklqdq	%xmm5, %xmm5
-        	punpcklqdq	%xmm5,%xmm5
-
-// CHECK: 	addsubpd	%xmm5, %xmm5
-        	addsubpd	%xmm5,%xmm5
-
-// CHECK: 	addsubps	%xmm5, %xmm5
-        	addsubps	%xmm5,%xmm5
-
-// CHECK: 	haddpd	%xmm5, %xmm5
-        	haddpd	%xmm5,%xmm5
-
-// CHECK: 	haddps	%xmm5, %xmm5
-        	haddps	%xmm5,%xmm5
-
-// CHECK: 	hsubpd	%xmm5, %xmm5
-        	hsubpd	%xmm5,%xmm5
-
-// CHECK: 	hsubps	%xmm5, %xmm5
-        	hsubps	%xmm5,%xmm5
-
-// CHECK: 	lddqu	3735928559(%ebx,%ecx,8), %xmm5
-        	lddqu	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movddup	3735928559(%ebx,%ecx,8), %xmm5
-        	movddup	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movddup	%xmm5, %xmm5
-        	movddup	%xmm5,%xmm5
-
-// CHECK: 	movshdup	3735928559(%ebx,%ecx,8), %xmm5
-        	movshdup	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movshdup	%xmm5, %xmm5
-        	movshdup	%xmm5,%xmm5
-
-// CHECK: 	movsldup	3735928559(%ebx,%ecx,8), %xmm5
-        	movsldup	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movsldup	%xmm5, %xmm5
-        	movsldup	%xmm5,%xmm5
-
-// CHECK: 	phaddw	%mm3, %mm3
-        	phaddw	%mm3,%mm3
-
-// CHECK: 	phaddw	%xmm5, %xmm5
-        	phaddw	%xmm5,%xmm5
-
-// CHECK: 	phaddd	%mm3, %mm3
-        	phaddd	%mm3,%mm3
-
-// CHECK: 	phaddd	%xmm5, %xmm5
-        	phaddd	%xmm5,%xmm5
-
-// CHECK: 	phaddsw	%mm3, %mm3
-        	phaddsw	%mm3,%mm3
-
-// CHECK: 	phaddsw	%xmm5, %xmm5
-        	phaddsw	%xmm5,%xmm5
-
-// CHECK: 	phsubw	%mm3, %mm3
-        	phsubw	%mm3,%mm3
-
-// CHECK: 	phsubw	%xmm5, %xmm5
-        	phsubw	%xmm5,%xmm5
-
-// CHECK: 	phsubd	%mm3, %mm3
-        	phsubd	%mm3,%mm3
-
-// CHECK: 	phsubd	%xmm5, %xmm5
-        	phsubd	%xmm5,%xmm5
-
-// CHECK: 	phsubsw	%mm3, %mm3
-        	phsubsw	%mm3,%mm3
-
-// CHECK: 	phsubsw	%xmm5, %xmm5
-        	phsubsw	%xmm5,%xmm5
-
-// CHECK: 	pmaddubsw	%mm3, %mm3
-        	pmaddubsw	%mm3,%mm3
-
-// CHECK: 	pmaddubsw	%xmm5, %xmm5
-        	pmaddubsw	%xmm5,%xmm5
-
-// CHECK: 	pmulhrsw	%mm3, %mm3
-        	pmulhrsw	%mm3,%mm3
-
-// CHECK: 	pmulhrsw	%xmm5, %xmm5
-        	pmulhrsw	%xmm5,%xmm5
-
-// CHECK: 	pshufb	%mm3, %mm3
-        	pshufb	%mm3,%mm3
-
-// CHECK: 	pshufb	%xmm5, %xmm5
-        	pshufb	%xmm5,%xmm5
-
-// CHECK: 	psignb	%mm3, %mm3
-        	psignb	%mm3,%mm3
-
-// CHECK: 	psignb	%xmm5, %xmm5
-        	psignb	%xmm5,%xmm5
-
-// CHECK: 	psignw	%mm3, %mm3
-        	psignw	%mm3,%mm3
-
-// CHECK: 	psignw	%xmm5, %xmm5
-        	psignw	%xmm5,%xmm5
-
-// CHECK: 	psignd	%mm3, %mm3
-        	psignd	%mm3,%mm3
-
-// CHECK: 	psignd	%xmm5, %xmm5
-        	psignd	%xmm5,%xmm5
-
-// CHECK: 	pabsb	3735928559(%ebx,%ecx,8), %mm3
-        	pabsb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pabsb	%mm3, %mm3
-        	pabsb	%mm3,%mm3
-
-// CHECK: 	pabsb	3735928559(%ebx,%ecx,8), %xmm5
-        	pabsb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pabsb	%xmm5, %xmm5
-        	pabsb	%xmm5,%xmm5
-
-// CHECK: 	pabsw	3735928559(%ebx,%ecx,8), %mm3
-        	pabsw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pabsw	%mm3, %mm3
-        	pabsw	%mm3,%mm3
-
-// CHECK: 	pabsw	3735928559(%ebx,%ecx,8), %xmm5
-        	pabsw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pabsw	%xmm5, %xmm5
-        	pabsw	%xmm5,%xmm5
-
-// CHECK: 	pabsd	3735928559(%ebx,%ecx,8), %mm3
-        	pabsd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pabsd	%mm3, %mm3
-        	pabsd	%mm3,%mm3
-
-// CHECK: 	pabsd	3735928559(%ebx,%ecx,8), %xmm5
-        	pabsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pabsd	%xmm5, %xmm5
-        	pabsd	%xmm5,%xmm5
-
-// CHECK: 	femms
-        	femms
-
-// CHECK: 	packusdw	%xmm5, %xmm5
-        	packusdw	%xmm5,%xmm5
-
-// CHECK: 	pcmpeqq	%xmm5, %xmm5
-        	pcmpeqq	%xmm5,%xmm5
-
-// CHECK: 	phminposuw	3735928559(%ebx,%ecx,8), %xmm5
-        	phminposuw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	phminposuw	%xmm5, %xmm5
-        	phminposuw	%xmm5,%xmm5
-
-// CHECK: 	pmaxsb	%xmm5, %xmm5
-        	pmaxsb	%xmm5,%xmm5
-
-// CHECK: 	pmaxsd	%xmm5, %xmm5
-        	pmaxsd	%xmm5,%xmm5
-
-// CHECK: 	pmaxud	%xmm5, %xmm5
-        	pmaxud	%xmm5,%xmm5
-
-// CHECK: 	pmaxuw	%xmm5, %xmm5
-        	pmaxuw	%xmm5,%xmm5
-
-// CHECK: 	pminsb	%xmm5, %xmm5
-        	pminsb	%xmm5,%xmm5
-
-// CHECK: 	pminsd	%xmm5, %xmm5
-        	pminsd	%xmm5,%xmm5
-
-// CHECK: 	pminud	%xmm5, %xmm5
-        	pminud	%xmm5,%xmm5
-
-// CHECK: 	pminuw	%xmm5, %xmm5
-        	pminuw	%xmm5,%xmm5
-
-// CHECK: 	pmovsxbw	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxbw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxbw	%xmm5, %xmm5
-        	pmovsxbw	%xmm5,%xmm5
-
-// CHECK: 	pmovsxbd	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxbd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxbd	%xmm5, %xmm5
-        	pmovsxbd	%xmm5,%xmm5
-
-// CHECK: 	pmovsxbq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxbq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxbq	%xmm5, %xmm5
-        	pmovsxbq	%xmm5,%xmm5
-
-// CHECK: 	pmovsxwd	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxwd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxwd	%xmm5, %xmm5
-        	pmovsxwd	%xmm5,%xmm5
-
-// CHECK: 	pmovsxwq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxwq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxwq	%xmm5, %xmm5
-        	pmovsxwq	%xmm5,%xmm5
-
-// CHECK: 	pmovsxdq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxdq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxdq	%xmm5, %xmm5
-        	pmovsxdq	%xmm5,%xmm5
-
-// CHECK: 	pmovzxbw	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxbw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxbw	%xmm5, %xmm5
-        	pmovzxbw	%xmm5,%xmm5
-
-// CHECK: 	pmovzxbd	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxbd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxbd	%xmm5, %xmm5
-        	pmovzxbd	%xmm5,%xmm5
-
-// CHECK: 	pmovzxbq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxbq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxbq	%xmm5, %xmm5
-        	pmovzxbq	%xmm5,%xmm5
-
-// CHECK: 	pmovzxwd	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxwd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxwd	%xmm5, %xmm5
-        	pmovzxwd	%xmm5,%xmm5
-
-// CHECK: 	pmovzxwq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxwq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxwq	%xmm5, %xmm5
-        	pmovzxwq	%xmm5,%xmm5
-
-// CHECK: 	pmovzxdq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxdq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxdq	%xmm5, %xmm5
-        	pmovzxdq	%xmm5,%xmm5
-
-// CHECK: 	pmuldq	%xmm5, %xmm5
-        	pmuldq	%xmm5,%xmm5
-
-// CHECK: 	pmulld	%xmm5, %xmm5
-        	pmulld	%xmm5,%xmm5
-
-// CHECK: 	ptest 	3735928559(%ebx,%ecx,8), %xmm5
-        	ptest	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	ptest 	%xmm5, %xmm5
-        	ptest	%xmm5,%xmm5
-
-// CHECK: 	pcmpgtq	%xmm5, %xmm5
-        	pcmpgtq	%xmm5,%xmm5
-
-
 // CHECK: movb	$127, 3735928559(%ebx,%ecx,8)
 // CHECK:  encoding: [0xc6,0x84,0xcb,0xef,0xbe,0xad,0xde,0x7f]
         	movb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
@@ -4708,10 +3072,6 @@
 // CHECK:  encoding: [0x66,0x0f,0xd6,0x2d,0x78,0x56,0x34,0x12]
         	movq	%xmm5,0x12345678
 
-// CHECK: movq	%xmm5, %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x7e,0xed]
-        	movq	%xmm5,%xmm5
-
 // CHECK: packssdw	3735928559(%ebx,%ecx,8), %mm3
 // CHECK:  encoding: [0x0f,0x6b,0x9c,0xcb,0xef,0xbe,0xad,0xde]
         	packssdw	0xdeadbeef(%ebx,%ecx,8),%mm3
@@ -7432,10 +5792,6 @@
 // CHECK:  encoding: [0x0f,0x29,0x2d,0x78,0x56,0x34,0x12]
         	movaps	%xmm5,0x12345678
 
-// CHECK: movaps	%xmm5, %xmm5
-// CHECK:  encoding: [0x0f,0x28,0xed]
-        	movaps	%xmm5,%xmm5
-
 // CHECK: movhlps	%xmm5, %xmm5
 // CHECK:  encoding: [0x0f,0x12,0xed]
         	movhlps	%xmm5,%xmm5
@@ -9986,205 +8342,205 @@
 
 // CHECK: hsubps	%xmm5, %xmm5
 // CHECK:  encoding: [0xf2,0x0f,0x7d,0xed]
-        	hsubps	%xmm5,%xmm5
+                  hsubps	%xmm5,%xmm5
 
-// CHECK: lddqu	3735928559(%ebx,%ecx,8), %xmm5
-// CHECK:  encoding: [0xf2,0x0f,0xf0,0xac,0xcb,0xef,0xbe,0xad,0xde]
-        	lddqu	0xdeadbeef(%ebx,%ecx,8),%xmm5
+  // CHECK: lddqu	3735928559(%ebx,%ecx,8), %xmm5
+  // CHECK:  encoding: [0xf2,0x0f,0xf0,0xac,0xcb,0xef,0xbe,0xad,0xde]
+                  lddqu	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
-// CHECK: lddqu	69, %xmm5
-// CHECK:  encoding: [0xf2,0x0f,0xf0,0x2d,0x45,0x00,0x00,0x00]
-        	lddqu	0x45,%xmm5
+  // CHECK: lddqu	69, %xmm5
+  // CHECK:  encoding: [0xf2,0x0f,0xf0,0x2d,0x45,0x00,0x00,0x00]
+                  lddqu	0x45,%xmm5
 
-// CHECK: lddqu	32493, %xmm5
-// CHECK:  encoding: [0xf2,0x0f,0xf0,0x2d,0xed,0x7e,0x00,0x00]
-        	lddqu	0x7eed,%xmm5
+  // CHECK: lddqu	32493, %xmm5
+  // CHECK:  encoding: [0xf2,0x0f,0xf0,0x2d,0xed,0x7e,0x00,0x00]
+                  lddqu	0x7eed,%xmm5
 
-// CHECK: lddqu	3133065982, %xmm5
-// CHECK:  encoding: [0xf2,0x0f,0xf0,0x2d,0xfe,0xca,0xbe,0xba]
-        	lddqu	0xbabecafe,%xmm5
+  // CHECK: lddqu	3133065982, %xmm5
+  // CHECK:  encoding: [0xf2,0x0f,0xf0,0x2d,0xfe,0xca,0xbe,0xba]
+                  lddqu	0xbabecafe,%xmm5
 
-// CHECK: lddqu	305419896, %xmm5
-// CHECK:  encoding: [0xf2,0x0f,0xf0,0x2d,0x78,0x56,0x34,0x12]
-        	lddqu	0x12345678,%xmm5
+  // CHECK: lddqu	305419896, %xmm5
+  // CHECK:  encoding: [0xf2,0x0f,0xf0,0x2d,0x78,0x56,0x34,0x12]
+                  lddqu	0x12345678,%xmm5
 
-// CHECK: movddup	3735928559(%ebx,%ecx,8), %xmm5
-// CHECK:  encoding: [0xf2,0x0f,0x12,0xac,0xcb,0xef,0xbe,0xad,0xde]
-        	movddup	0xdeadbeef(%ebx,%ecx,8),%xmm5
+  // CHECK: movddup	3735928559(%ebx,%ecx,8), %xmm5
+  // CHECK:  encoding: [0xf2,0x0f,0x12,0xac,0xcb,0xef,0xbe,0xad,0xde]
+                  movddup	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
-// CHECK: movddup	69, %xmm5
-// CHECK:  encoding: [0xf2,0x0f,0x12,0x2d,0x45,0x00,0x00,0x00]
-        	movddup	0x45,%xmm5
+  // CHECK: movddup	69, %xmm5
+  // CHECK:  encoding: [0xf2,0x0f,0x12,0x2d,0x45,0x00,0x00,0x00]
+                  movddup	0x45,%xmm5
 
-// CHECK: movddup	32493, %xmm5
-// CHECK:  encoding: [0xf2,0x0f,0x12,0x2d,0xed,0x7e,0x00,0x00]
-        	movddup	0x7eed,%xmm5
+  // CHECK: movddup	32493, %xmm5
+  // CHECK:  encoding: [0xf2,0x0f,0x12,0x2d,0xed,0x7e,0x00,0x00]
+                  movddup	0x7eed,%xmm5
 
-// CHECK: movddup	3133065982, %xmm5
-// CHECK:  encoding: [0xf2,0x0f,0x12,0x2d,0xfe,0xca,0xbe,0xba]
-        	movddup	0xbabecafe,%xmm5
+  // CHECK: movddup	3133065982, %xmm5
+  // CHECK:  encoding: [0xf2,0x0f,0x12,0x2d,0xfe,0xca,0xbe,0xba]
+                  movddup	0xbabecafe,%xmm5
 
-// CHECK: movddup	305419896, %xmm5
-// CHECK:  encoding: [0xf2,0x0f,0x12,0x2d,0x78,0x56,0x34,0x12]
-        	movddup	0x12345678,%xmm5
+  // CHECK: movddup	305419896, %xmm5
+  // CHECK:  encoding: [0xf2,0x0f,0x12,0x2d,0x78,0x56,0x34,0x12]
+                  movddup	0x12345678,%xmm5
 
-// CHECK: movddup	%xmm5, %xmm5
-// CHECK:  encoding: [0xf2,0x0f,0x12,0xed]
-        	movddup	%xmm5,%xmm5
+  // CHECK: movddup	%xmm5, %xmm5
+  // CHECK:  encoding: [0xf2,0x0f,0x12,0xed]
+                  movddup	%xmm5,%xmm5
 
-// CHECK: movshdup	3735928559(%ebx,%ecx,8), %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x16,0xac,0xcb,0xef,0xbe,0xad,0xde]
-        	movshdup	0xdeadbeef(%ebx,%ecx,8),%xmm5
+  // CHECK: movshdup	3735928559(%ebx,%ecx,8), %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x16,0xac,0xcb,0xef,0xbe,0xad,0xde]
+                  movshdup	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
-// CHECK: movshdup	69, %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x16,0x2d,0x45,0x00,0x00,0x00]
-        	movshdup	0x45,%xmm5
+  // CHECK: movshdup	69, %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x16,0x2d,0x45,0x00,0x00,0x00]
+                  movshdup	0x45,%xmm5
 
-// CHECK: movshdup	32493, %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x16,0x2d,0xed,0x7e,0x00,0x00]
-        	movshdup	0x7eed,%xmm5
+  // CHECK: movshdup	32493, %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x16,0x2d,0xed,0x7e,0x00,0x00]
+                  movshdup	0x7eed,%xmm5
 
-// CHECK: movshdup	3133065982, %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x16,0x2d,0xfe,0xca,0xbe,0xba]
-        	movshdup	0xbabecafe,%xmm5
+  // CHECK: movshdup	3133065982, %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x16,0x2d,0xfe,0xca,0xbe,0xba]
+                  movshdup	0xbabecafe,%xmm5
 
-// CHECK: movshdup	305419896, %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x16,0x2d,0x78,0x56,0x34,0x12]
-        	movshdup	0x12345678,%xmm5
+  // CHECK: movshdup	305419896, %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x16,0x2d,0x78,0x56,0x34,0x12]
+                  movshdup	0x12345678,%xmm5
 
-// CHECK: movshdup	%xmm5, %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x16,0xed]
-        	movshdup	%xmm5,%xmm5
+  // CHECK: movshdup	%xmm5, %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x16,0xed]
+                  movshdup	%xmm5,%xmm5
 
-// CHECK: movsldup	3735928559(%ebx,%ecx,8), %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x12,0xac,0xcb,0xef,0xbe,0xad,0xde]
-        	movsldup	0xdeadbeef(%ebx,%ecx,8),%xmm5
+  // CHECK: movsldup	3735928559(%ebx,%ecx,8), %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x12,0xac,0xcb,0xef,0xbe,0xad,0xde]
+                  movsldup	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
-// CHECK: movsldup	69, %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x12,0x2d,0x45,0x00,0x00,0x00]
-        	movsldup	0x45,%xmm5
+  // CHECK: movsldup	69, %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x12,0x2d,0x45,0x00,0x00,0x00]
+                  movsldup	0x45,%xmm5
 
-// CHECK: movsldup	32493, %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x12,0x2d,0xed,0x7e,0x00,0x00]
-        	movsldup	0x7eed,%xmm5
+  // CHECK: movsldup	32493, %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x12,0x2d,0xed,0x7e,0x00,0x00]
+                  movsldup	0x7eed,%xmm5
 
-// CHECK: movsldup	3133065982, %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x12,0x2d,0xfe,0xca,0xbe,0xba]
-        	movsldup	0xbabecafe,%xmm5
+  // CHECK: movsldup	3133065982, %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x12,0x2d,0xfe,0xca,0xbe,0xba]
+                  movsldup	0xbabecafe,%xmm5
 
-// CHECK: movsldup	305419896, %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x12,0x2d,0x78,0x56,0x34,0x12]
-        	movsldup	0x12345678,%xmm5
+  // CHECK: movsldup	305419896, %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x12,0x2d,0x78,0x56,0x34,0x12]
+                  movsldup	0x12345678,%xmm5
 
-// CHECK: movsldup	%xmm5, %xmm5
-// CHECK:  encoding: [0xf3,0x0f,0x12,0xed]
-        	movsldup	%xmm5,%xmm5
+  // CHECK: movsldup	%xmm5, %xmm5
+  // CHECK:  encoding: [0xf3,0x0f,0x12,0xed]
+                  movsldup	%xmm5,%xmm5
 
-// CHECK: vmclear	3735928559(%ebx,%ecx,8)
-// CHECK:  encoding: [0x66,0x0f,0xc7,0xb4,0xcb,0xef,0xbe,0xad,0xde]
-        	vmclear	0xdeadbeef(%ebx,%ecx,8)
+  // CHECK: vmclear	3735928559(%ebx,%ecx,8)
+  // CHECK:  encoding: [0x66,0x0f,0xc7,0xb4,0xcb,0xef,0xbe,0xad,0xde]
+                  vmclear	0xdeadbeef(%ebx,%ecx,8)
 
-// CHECK: vmclear	32493
-// CHECK:  encoding: [0x66,0x0f,0xc7,0x35,0xed,0x7e,0x00,0x00]
-        	vmclear	0x7eed
+  // CHECK: vmclear	32493
+  // CHECK:  encoding: [0x66,0x0f,0xc7,0x35,0xed,0x7e,0x00,0x00]
+                  vmclear	0x7eed
 
-// CHECK: vmclear	3133065982
-// CHECK:  encoding: [0x66,0x0f,0xc7,0x35,0xfe,0xca,0xbe,0xba]
-        	vmclear	0xbabecafe
+  // CHECK: vmclear	3133065982
+  // CHECK:  encoding: [0x66,0x0f,0xc7,0x35,0xfe,0xca,0xbe,0xba]
+                  vmclear	0xbabecafe
 
-// CHECK: vmclear	305419896
-// CHECK:  encoding: [0x66,0x0f,0xc7,0x35,0x78,0x56,0x34,0x12]
-        	vmclear	0x12345678
+  // CHECK: vmclear	305419896
+  // CHECK:  encoding: [0x66,0x0f,0xc7,0x35,0x78,0x56,0x34,0x12]
+                  vmclear	0x12345678
 
-// CHECK: vmptrld	3735928559(%ebx,%ecx,8)
-// CHECK:  encoding: [0x0f,0xc7,0xb4,0xcb,0xef,0xbe,0xad,0xde]
-        	vmptrld	0xdeadbeef(%ebx,%ecx,8)
+  // CHECK: vmptrld	3735928559(%ebx,%ecx,8)
+  // CHECK:  encoding: [0x0f,0xc7,0xb4,0xcb,0xef,0xbe,0xad,0xde]
+                  vmptrld	0xdeadbeef(%ebx,%ecx,8)
 
-// CHECK: vmptrld	32493
-// CHECK:  encoding: [0x0f,0xc7,0x35,0xed,0x7e,0x00,0x00]
-        	vmptrld	0x7eed
+  // CHECK: vmptrld	32493
+  // CHECK:  encoding: [0x0f,0xc7,0x35,0xed,0x7e,0x00,0x00]
+                  vmptrld	0x7eed
 
-// CHECK: vmptrld	3133065982
-// CHECK:  encoding: [0x0f,0xc7,0x35,0xfe,0xca,0xbe,0xba]
-        	vmptrld	0xbabecafe
+  // CHECK: vmptrld	3133065982
+  // CHECK:  encoding: [0x0f,0xc7,0x35,0xfe,0xca,0xbe,0xba]
+                  vmptrld	0xbabecafe
 
-// CHECK: vmptrld	305419896
-// CHECK:  encoding: [0x0f,0xc7,0x35,0x78,0x56,0x34,0x12]
-        	vmptrld	0x12345678
+  // CHECK: vmptrld	305419896
+  // CHECK:  encoding: [0x0f,0xc7,0x35,0x78,0x56,0x34,0x12]
+                  vmptrld	0x12345678
 
-// CHECK: vmptrst	3735928559(%ebx,%ecx,8)
-// CHECK:  encoding: [0x0f,0xc7,0xbc,0xcb,0xef,0xbe,0xad,0xde]
-        	vmptrst	0xdeadbeef(%ebx,%ecx,8)
+  // CHECK: vmptrst	3735928559(%ebx,%ecx,8)
+  // CHECK:  encoding: [0x0f,0xc7,0xbc,0xcb,0xef,0xbe,0xad,0xde]
+                  vmptrst	0xdeadbeef(%ebx,%ecx,8)
 
-// CHECK: vmptrst	32493
-// CHECK:  encoding: [0x0f,0xc7,0x3d,0xed,0x7e,0x00,0x00]
-        	vmptrst	0x7eed
+  // CHECK: vmptrst	32493
+  // CHECK:  encoding: [0x0f,0xc7,0x3d,0xed,0x7e,0x00,0x00]
+                  vmptrst	0x7eed
 
-// CHECK: vmptrst	3133065982
-// CHECK:  encoding: [0x0f,0xc7,0x3d,0xfe,0xca,0xbe,0xba]
-        	vmptrst	0xbabecafe
+  // CHECK: vmptrst	3133065982
+  // CHECK:  encoding: [0x0f,0xc7,0x3d,0xfe,0xca,0xbe,0xba]
+                  vmptrst	0xbabecafe
 
-// CHECK: vmptrst	305419896
-// CHECK:  encoding: [0x0f,0xc7,0x3d,0x78,0x56,0x34,0x12]
-        	vmptrst	0x12345678
+  // CHECK: vmptrst	305419896
+  // CHECK:  encoding: [0x0f,0xc7,0x3d,0x78,0x56,0x34,0x12]
+                  vmptrst	0x12345678
 
-// CHECK: phaddw	3735928559(%ebx,%ecx,8), %mm3
-// CHECK:  encoding: [0x0f,0x38,0x01,0x9c,0xcb,0xef,0xbe,0xad,0xde]
-        	phaddw	0xdeadbeef(%ebx,%ecx,8),%mm3
+  // CHECK: phaddw	3735928559(%ebx,%ecx,8), %mm3
+  // CHECK:  encoding: [0x0f,0x38,0x01,0x9c,0xcb,0xef,0xbe,0xad,0xde]
+                  phaddw	0xdeadbeef(%ebx,%ecx,8),%mm3
 
-// CHECK: phaddw	69, %mm3
-// CHECK:  encoding: [0x0f,0x38,0x01,0x1d,0x45,0x00,0x00,0x00]
-        	phaddw	0x45,%mm3
+  // CHECK: phaddw	69, %mm3
+  // CHECK:  encoding: [0x0f,0x38,0x01,0x1d,0x45,0x00,0x00,0x00]
+                  phaddw	0x45,%mm3
 
-// CHECK: phaddw	32493, %mm3
-// CHECK:  encoding: [0x0f,0x38,0x01,0x1d,0xed,0x7e,0x00,0x00]
-        	phaddw	0x7eed,%mm3
+  // CHECK: phaddw	32493, %mm3
+  // CHECK:  encoding: [0x0f,0x38,0x01,0x1d,0xed,0x7e,0x00,0x00]
+                  phaddw	0x7eed,%mm3
 
-// CHECK: phaddw	3133065982, %mm3
-// CHECK:  encoding: [0x0f,0x38,0x01,0x1d,0xfe,0xca,0xbe,0xba]
-        	phaddw	0xbabecafe,%mm3
+  // CHECK: phaddw	3133065982, %mm3
+  // CHECK:  encoding: [0x0f,0x38,0x01,0x1d,0xfe,0xca,0xbe,0xba]
+                  phaddw	0xbabecafe,%mm3
 
-// CHECK: phaddw	305419896, %mm3
-// CHECK:  encoding: [0x0f,0x38,0x01,0x1d,0x78,0x56,0x34,0x12]
-        	phaddw	0x12345678,%mm3
+  // CHECK: phaddw	305419896, %mm3
+  // CHECK:  encoding: [0x0f,0x38,0x01,0x1d,0x78,0x56,0x34,0x12]
+                  phaddw	0x12345678,%mm3
 
-// CHECK: phaddw	%mm3, %mm3
-// CHECK:  encoding: [0x0f,0x38,0x01,0xdb]
-        	phaddw	%mm3,%mm3
+  // CHECK: phaddw	%mm3, %mm3
+  // CHECK:  encoding: [0x0f,0x38,0x01,0xdb]
+                  phaddw	%mm3,%mm3
 
-// CHECK: phaddw	3735928559(%ebx,%ecx,8), %xmm5
-// CHECK:  encoding: [0x66,0x0f,0x38,0x01,0xac,0xcb,0xef,0xbe,0xad,0xde]
-        	phaddw	0xdeadbeef(%ebx,%ecx,8),%xmm5
+  // CHECK: phaddw	3735928559(%ebx,%ecx,8), %xmm5
+  // CHECK:  encoding: [0x66,0x0f,0x38,0x01,0xac,0xcb,0xef,0xbe,0xad,0xde]
+                  phaddw	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
-// CHECK: phaddw	69, %xmm5
-// CHECK:  encoding: [0x66,0x0f,0x38,0x01,0x2d,0x45,0x00,0x00,0x00]
-        	phaddw	0x45,%xmm5
+  // CHECK: phaddw	69, %xmm5
+  // CHECK:  encoding: [0x66,0x0f,0x38,0x01,0x2d,0x45,0x00,0x00,0x00]
+                  phaddw	0x45,%xmm5
 
-// CHECK: phaddw	32493, %xmm5
-// CHECK:  encoding: [0x66,0x0f,0x38,0x01,0x2d,0xed,0x7e,0x00,0x00]
-        	phaddw	0x7eed,%xmm5
+  // CHECK: phaddw	32493, %xmm5
+  // CHECK:  encoding: [0x66,0x0f,0x38,0x01,0x2d,0xed,0x7e,0x00,0x00]
+                  phaddw	0x7eed,%xmm5
 
-// CHECK: phaddw	3133065982, %xmm5
-// CHECK:  encoding: [0x66,0x0f,0x38,0x01,0x2d,0xfe,0xca,0xbe,0xba]
-        	phaddw	0xbabecafe,%xmm5
+  // CHECK: phaddw	3133065982, %xmm5
+  // CHECK:  encoding: [0x66,0x0f,0x38,0x01,0x2d,0xfe,0xca,0xbe,0xba]
+                  phaddw	0xbabecafe,%xmm5
 
-// CHECK: phaddw	305419896, %xmm5
-// CHECK:  encoding: [0x66,0x0f,0x38,0x01,0x2d,0x78,0x56,0x34,0x12]
-        	phaddw	0x12345678,%xmm5
+  // CHECK: phaddw	305419896, %xmm5
+  // CHECK:  encoding: [0x66,0x0f,0x38,0x01,0x2d,0x78,0x56,0x34,0x12]
+                  phaddw	0x12345678,%xmm5
 
-// CHECK: phaddw	%xmm5, %xmm5
-// CHECK:  encoding: [0x66,0x0f,0x38,0x01,0xed]
-        	phaddw	%xmm5,%xmm5
+  // CHECK: phaddw	%xmm5, %xmm5
+  // CHECK:  encoding: [0x66,0x0f,0x38,0x01,0xed]
+                  phaddw	%xmm5,%xmm5
 
-// CHECK: phaddd	3735928559(%ebx,%ecx,8), %mm3
-// CHECK:  encoding: [0x0f,0x38,0x02,0x9c,0xcb,0xef,0xbe,0xad,0xde]
-        	phaddd	0xdeadbeef(%ebx,%ecx,8),%mm3
+  // CHECK: phaddd	3735928559(%ebx,%ecx,8), %mm3
+  // CHECK:  encoding: [0x0f,0x38,0x02,0x9c,0xcb,0xef,0xbe,0xad,0xde]
+                  phaddd	0xdeadbeef(%ebx,%ecx,8),%mm3
 
-// CHECK: phaddd	69, %mm3
-// CHECK:  encoding: [0x0f,0x38,0x02,0x1d,0x45,0x00,0x00,0x00]
-        	phaddd	0x45,%mm3
+  // CHECK: phaddd	69, %mm3
+  // CHECK:  encoding: [0x0f,0x38,0x02,0x1d,0x45,0x00,0x00,0x00]
+                  phaddd	0x45,%mm3
 
-// CHECK: phaddd	32493, %mm3
+  // CHECK: phaddd	32493, %mm3
 // CHECK:  encoding: [0x0f,0x38,0x02,0x1d,0xed,0x7e,0x00,0x00]
         	phaddd	0x7eed,%mm3
 
@@ -11732,162 +10088,6 @@
         addw $0xFFFF, %ax
 
 
-// CHECK: 	movb	$127, 3735928559(%ebx,%ecx,8)
-        	movb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movb	$127, 69
-        	movb	$0x7f,0x45
-
-// CHECK: 	movb	$127, 32493
-        	movb	$0x7f,0x7eed
-
-// CHECK: 	movb	$127, 3133065982
-        	movb	$0x7f,0xbabecafe
-
-// CHECK: 	movb	$127, 305419896
-        	movb	$0x7f,0x12345678
-
-// CHECK: 	movw	$31438, 3735928559(%ebx,%ecx,8)
-        	movw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movw	$31438, 69
-        	movw	$0x7ace,0x45
-
-// CHECK: 	movw	$31438, 32493
-        	movw	$0x7ace,0x7eed
-
-// CHECK: 	movw	$31438, 3133065982
-        	movw	$0x7ace,0xbabecafe
-
-// CHECK: 	movw	$31438, 305419896
-        	movw	$0x7ace,0x12345678
-
-// CHECK: 	movl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	movl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movl	$2063514302, 69
-        	movl	$0x7afebabe,0x45
-
-// CHECK: 	movl	$2063514302, 32493
-        	movl	$0x7afebabe,0x7eed
-
-// CHECK: 	movl	$2063514302, 3133065982
-        	movl	$0x7afebabe,0xbabecafe
-
-// CHECK: 	movl	$2063514302, 305419896
-        	movl	$0x7afebabe,0x12345678
-
-// CHECK: 	movl	$324478056, 3735928559(%ebx,%ecx,8)
-        	movl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movl	$324478056, 69
-        	movl	$0x13572468,0x45
-
-// CHECK: 	movl	$324478056, 32493
-        	movl	$0x13572468,0x7eed
-
-// CHECK: 	movl	$324478056, 3133065982
-        	movl	$0x13572468,0xbabecafe
-
-// CHECK: 	movl	$324478056, 305419896
-        	movl	$0x13572468,0x12345678
-
-// CHECK: 	movsbl	3735928559(%ebx,%ecx,8), %ecx
-        	movsbl	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	movsbl	69, %ecx
-        	movsbl	0x45,%ecx
-
-// CHECK: 	movsbl	32493, %ecx
-        	movsbl	0x7eed,%ecx
-
-// CHECK: 	movsbl	3133065982, %ecx
-        	movsbl	0xbabecafe,%ecx
-
-// CHECK: 	movsbl	305419896, %ecx
-        	movsbl	0x12345678,%ecx
-
-// CHECK: 	movsbw	3735928559(%ebx,%ecx,8), %bx
-        	movsbw	0xdeadbeef(%ebx,%ecx,8),%bx
-
-// CHECK: 	movsbw	69, %bx
-        	movsbw	0x45,%bx
-
-// CHECK: 	movsbw	32493, %bx
-        	movsbw	0x7eed,%bx
-
-// CHECK: 	movsbw	3133065982, %bx
-        	movsbw	0xbabecafe,%bx
-
-// CHECK: 	movsbw	305419896, %bx
-        	movsbw	0x12345678,%bx
-
-// CHECK: 	movswl	3735928559(%ebx,%ecx,8), %ecx
-        	movswl	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	movswl	69, %ecx
-        	movswl	0x45,%ecx
-
-// CHECK: 	movswl	32493, %ecx
-        	movswl	0x7eed,%ecx
-
-// CHECK: 	movswl	3133065982, %ecx
-        	movswl	0xbabecafe,%ecx
-
-// CHECK: 	movswl	305419896, %ecx
-        	movswl	0x12345678,%ecx
-
-// CHECK: 	movzbl	3735928559(%ebx,%ecx,8), %ecx
-        	movzbl	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	movzbl	69, %ecx
-        	movzbl	0x45,%ecx
-
-// CHECK: 	movzbl	32493, %ecx
-        	movzbl	0x7eed,%ecx
-
-// CHECK: 	movzbl	3133065982, %ecx
-        	movzbl	0xbabecafe,%ecx
-
-// CHECK: 	movzbl	305419896, %ecx
-        	movzbl	0x12345678,%ecx
-
-// CHECK: 	movzbw	3735928559(%ebx,%ecx,8), %bx
-        	movzbw	0xdeadbeef(%ebx,%ecx,8),%bx
-
-// CHECK: 	movzbw	69, %bx
-        	movzbw	0x45,%bx
-
-// CHECK: 	movzbw	32493, %bx
-        	movzbw	0x7eed,%bx
-
-// CHECK: 	movzbw	3133065982, %bx
-        	movzbw	0xbabecafe,%bx
-
-// CHECK: 	movzbw	305419896, %bx
-        	movzbw	0x12345678,%bx
-
-// CHECK: 	movzwl	3735928559(%ebx,%ecx,8), %ecx
-        	movzwl	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	movzwl	69, %ecx
-        	movzwl	0x45,%ecx
-
-// CHECK: 	movzwl	32493, %ecx
-        	movzwl	0x7eed,%ecx
-
-// CHECK: 	movzwl	3133065982, %ecx
-        	movzwl	0xbabecafe,%ecx
-
-// CHECK: 	movzwl	305419896, %ecx
-        	movzwl	0x12345678,%ecx
-
-// CHECK: 	pushw	32493
-        	pushw	0x7eed
-
-// CHECK: 	popw	32493
-        	popw	0x7eed
-
 // CHECK: 	pushf
         	pushfl
 
@@ -11900,888 +10100,6 @@
 // CHECK: 	popfl
         	popfl
 
-// CHECK: 	clc
-        	clc
-
-// CHECK: 	cld
-        	cld
-
-// CHECK: 	cli
-        	cli
-
-// CHECK: 	clts
-        	clts
-
-// CHECK: 	cmc
-        	cmc
-
-// CHECK: 	lahf
-        	lahf
-
-// CHECK: 	sahf
-        	sahf
-
-// CHECK: 	stc
-        	stc
-
-// CHECK: 	std
-        	std
-
-// CHECK: 	sti
-        	sti
-
-// CHECK: 	addb	$254, 3735928559(%ebx,%ecx,8)
-        	addb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	addb	$254, 69
-        	addb	$0xfe,0x45
-
-// CHECK: 	addb	$254, 32493
-        	addb	$0xfe,0x7eed
-
-// CHECK: 	addb	$254, 3133065982
-        	addb	$0xfe,0xbabecafe
-
-// CHECK: 	addb	$254, 305419896
-        	addb	$0xfe,0x12345678
-
-// CHECK: 	addb	$127, 3735928559(%ebx,%ecx,8)
-        	addb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	addb	$127, 69
-        	addb	$0x7f,0x45
-
-// CHECK: 	addb	$127, 32493
-        	addb	$0x7f,0x7eed
-
-// CHECK: 	addb	$127, 3133065982
-        	addb	$0x7f,0xbabecafe
-
-// CHECK: 	addb	$127, 305419896
-        	addb	$0x7f,0x12345678
-
-// CHECK: 	addw	$31438, 3735928559(%ebx,%ecx,8)
-        	addw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	addw	$31438, 69
-        	addw	$0x7ace,0x45
-
-// CHECK: 	addw	$31438, 32493
-        	addw	$0x7ace,0x7eed
-
-// CHECK: 	addw	$31438, 3133065982
-        	addw	$0x7ace,0xbabecafe
-
-// CHECK: 	addw	$31438, 305419896
-        	addw	$0x7ace,0x12345678
-
-// CHECK: 	addl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	addl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	addl	$2063514302, 69
-        	addl	$0x7afebabe,0x45
-
-// CHECK: 	addl	$2063514302, 32493
-        	addl	$0x7afebabe,0x7eed
-
-// CHECK: 	addl	$2063514302, 3133065982
-        	addl	$0x7afebabe,0xbabecafe
-
-// CHECK: 	addl	$2063514302, 305419896
-        	addl	$0x7afebabe,0x12345678
-
-// CHECK: 	addl	$324478056, 3735928559(%ebx,%ecx,8)
-        	addl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	addl	$324478056, 69
-        	addl	$0x13572468,0x45
-
-// CHECK: 	addl	$324478056, 32493
-        	addl	$0x13572468,0x7eed
-
-// CHECK: 	addl	$324478056, 3133065982
-        	addl	$0x13572468,0xbabecafe
-
-// CHECK: 	addl	$324478056, 305419896
-        	addl	$0x13572468,0x12345678
-
-// CHECK: 	incl	3735928559(%ebx,%ecx,8)
-        	incl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	incw	32493
-        	incw	0x7eed
-
-// CHECK: 	incl	3133065982
-        	incl	0xbabecafe
-
-// CHECK: 	incl	305419896
-        	incl	0x12345678
-
-// CHECK: 	subb	$254, 3735928559(%ebx,%ecx,8)
-        	subb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	subb	$254, 69
-        	subb	$0xfe,0x45
-
-// CHECK: 	subb	$254, 32493
-        	subb	$0xfe,0x7eed
-
-// CHECK: 	subb	$254, 3133065982
-        	subb	$0xfe,0xbabecafe
-
-// CHECK: 	subb	$254, 305419896
-        	subb	$0xfe,0x12345678
-
-// CHECK: 	subb	$127, 3735928559(%ebx,%ecx,8)
-        	subb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	subb	$127, 69
-        	subb	$0x7f,0x45
-
-// CHECK: 	subb	$127, 32493
-        	subb	$0x7f,0x7eed
-
-// CHECK: 	subb	$127, 3133065982
-        	subb	$0x7f,0xbabecafe
-
-// CHECK: 	subb	$127, 305419896
-        	subb	$0x7f,0x12345678
-
-// CHECK: 	subw	$31438, 3735928559(%ebx,%ecx,8)
-        	subw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	subw	$31438, 69
-        	subw	$0x7ace,0x45
-
-// CHECK: 	subw	$31438, 32493
-        	subw	$0x7ace,0x7eed
-
-// CHECK: 	subw	$31438, 3133065982
-        	subw	$0x7ace,0xbabecafe
-
-// CHECK: 	subw	$31438, 305419896
-        	subw	$0x7ace,0x12345678
-
-// CHECK: 	subl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	subl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	subl	$2063514302, 69
-        	subl	$0x7afebabe,0x45
-
-// CHECK: 	subl	$2063514302, 32493
-        	subl	$0x7afebabe,0x7eed
-
-// CHECK: 	subl	$2063514302, 3133065982
-        	subl	$0x7afebabe,0xbabecafe
-
-// CHECK: 	subl	$2063514302, 305419896
-        	subl	$0x7afebabe,0x12345678
-
-// CHECK: 	subl	$324478056, 3735928559(%ebx,%ecx,8)
-        	subl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	subl	$324478056, 69
-        	subl	$0x13572468,0x45
-
-// CHECK: 	subl	$324478056, 32493
-        	subl	$0x13572468,0x7eed
-
-// CHECK: 	subl	$324478056, 3133065982
-        	subl	$0x13572468,0xbabecafe
-
-// CHECK: 	subl	$324478056, 305419896
-        	subl	$0x13572468,0x12345678
-
-// CHECK: 	decl	3735928559(%ebx,%ecx,8)
-        	decl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	decw	32493
-        	decw	0x7eed
-
-// CHECK: 	decl	3133065982
-        	decl	0xbabecafe
-
-// CHECK: 	decl	305419896
-        	decl	0x12345678
-
-// CHECK: 	sbbb	$254, 3735928559(%ebx,%ecx,8)
-        	sbbb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sbbb	$254, 69
-        	sbbb	$0xfe,0x45
-
-// CHECK: 	sbbb	$254, 32493
-        	sbbb	$0xfe,0x7eed
-
-// CHECK: 	sbbb	$254, 3133065982
-        	sbbb	$0xfe,0xbabecafe
-
-// CHECK: 	sbbb	$254, 305419896
-        	sbbb	$0xfe,0x12345678
-
-// CHECK: 	sbbb	$127, 3735928559(%ebx,%ecx,8)
-        	sbbb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sbbb	$127, 69
-        	sbbb	$0x7f,0x45
-
-// CHECK: 	sbbb	$127, 32493
-        	sbbb	$0x7f,0x7eed
-
-// CHECK: 	sbbb	$127, 3133065982
-        	sbbb	$0x7f,0xbabecafe
-
-// CHECK: 	sbbb	$127, 305419896
-        	sbbb	$0x7f,0x12345678
-
-// CHECK: 	sbbw	$31438, 3735928559(%ebx,%ecx,8)
-        	sbbw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sbbw	$31438, 69
-        	sbbw	$0x7ace,0x45
-
-// CHECK: 	sbbw	$31438, 32493
-        	sbbw	$0x7ace,0x7eed
-
-// CHECK: 	sbbw	$31438, 3133065982
-        	sbbw	$0x7ace,0xbabecafe
-
-// CHECK: 	sbbw	$31438, 305419896
-        	sbbw	$0x7ace,0x12345678
-
-// CHECK: 	sbbl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	sbbl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sbbl	$2063514302, 69
-        	sbbl	$0x7afebabe,0x45
-
-// CHECK: 	sbbl	$2063514302, 32493
-        	sbbl	$0x7afebabe,0x7eed
-
-// CHECK: 	sbbl	$2063514302, 3133065982
-        	sbbl	$0x7afebabe,0xbabecafe
-
-// CHECK: 	sbbl	$2063514302, 305419896
-        	sbbl	$0x7afebabe,0x12345678
-
-// CHECK: 	sbbl	$324478056, 3735928559(%ebx,%ecx,8)
-        	sbbl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sbbl	$324478056, 69
-        	sbbl	$0x13572468,0x45
-
-// CHECK: 	sbbl	$324478056, 32493
-        	sbbl	$0x13572468,0x7eed
-
-// CHECK: 	sbbl	$324478056, 3133065982
-        	sbbl	$0x13572468,0xbabecafe
-
-// CHECK: 	sbbl	$324478056, 305419896
-        	sbbl	$0x13572468,0x12345678
-
-// CHECK: 	cmpb	$254, 3735928559(%ebx,%ecx,8)
-        	cmpb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cmpb	$254, 69
-        	cmpb	$0xfe,0x45
-
-// CHECK: 	cmpb	$254, 32493
-        	cmpb	$0xfe,0x7eed
-
-// CHECK: 	cmpb	$254, 3133065982
-        	cmpb	$0xfe,0xbabecafe
-
-// CHECK: 	cmpb	$254, 305419896
-        	cmpb	$0xfe,0x12345678
-
-// CHECK: 	cmpb	$127, 3735928559(%ebx,%ecx,8)
-        	cmpb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cmpb	$127, 69
-        	cmpb	$0x7f,0x45
-
-// CHECK: 	cmpb	$127, 32493
-        	cmpb	$0x7f,0x7eed
-
-// CHECK: 	cmpb	$127, 3133065982
-        	cmpb	$0x7f,0xbabecafe
-
-// CHECK: 	cmpb	$127, 305419896
-        	cmpb	$0x7f,0x12345678
-
-// CHECK: 	cmpw	$31438, 3735928559(%ebx,%ecx,8)
-        	cmpw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cmpw	$31438, 69
-        	cmpw	$0x7ace,0x45
-
-// CHECK: 	cmpw	$31438, 32493
-        	cmpw	$0x7ace,0x7eed
-
-// CHECK: 	cmpw	$31438, 3133065982
-        	cmpw	$0x7ace,0xbabecafe
-
-// CHECK: 	cmpw	$31438, 305419896
-        	cmpw	$0x7ace,0x12345678
-
-// CHECK: 	cmpl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	cmpl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cmpl	$2063514302, 69
-        	cmpl	$0x7afebabe,0x45
-
-// CHECK: 	cmpl	$2063514302, 32493
-        	cmpl	$0x7afebabe,0x7eed
-
-// CHECK: 	cmpl	$2063514302, 3133065982
-        	cmpl	$0x7afebabe,0xbabecafe
-
-// CHECK: 	cmpl	$2063514302, 305419896
-        	cmpl	$0x7afebabe,0x12345678
-
-// CHECK: 	cmpl	$324478056, 3735928559(%ebx,%ecx,8)
-        	cmpl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cmpl	$324478056, 69
-        	cmpl	$0x13572468,0x45
-
-// CHECK: 	cmpl	$324478056, 32493
-        	cmpl	$0x13572468,0x7eed
-
-// CHECK: 	cmpl	$324478056, 3133065982
-        	cmpl	$0x13572468,0xbabecafe
-
-// CHECK: 	cmpl	$324478056, 305419896
-        	cmpl	$0x13572468,0x12345678
-
-// CHECK: 	testb	$127, 3735928559(%ebx,%ecx,8)
-        	testb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	testb	$127, 69
-        	testb	$0x7f,0x45
-
-// CHECK: 	testb	$127, 32493
-        	testb	$0x7f,0x7eed
-
-// CHECK: 	testb	$127, 3133065982
-        	testb	$0x7f,0xbabecafe
-
-// CHECK: 	testb	$127, 305419896
-        	testb	$0x7f,0x12345678
-
-// CHECK: 	testw	$31438, 3735928559(%ebx,%ecx,8)
-        	testw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	testw	$31438, 69
-        	testw	$0x7ace,0x45
-
-// CHECK: 	testw	$31438, 32493
-        	testw	$0x7ace,0x7eed
-
-// CHECK: 	testw	$31438, 3133065982
-        	testw	$0x7ace,0xbabecafe
-
-// CHECK: 	testw	$31438, 305419896
-        	testw	$0x7ace,0x12345678
-
-// CHECK: 	testl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	testl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	testl	$2063514302, 69
-        	testl	$0x7afebabe,0x45
-
-// CHECK: 	testl	$2063514302, 32493
-        	testl	$0x7afebabe,0x7eed
-
-// CHECK: 	testl	$2063514302, 3133065982
-        	testl	$0x7afebabe,0xbabecafe
-
-// CHECK: 	testl	$2063514302, 305419896
-        	testl	$0x7afebabe,0x12345678
-
-// CHECK: 	testl	$324478056, 3735928559(%ebx,%ecx,8)
-        	testl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	testl	$324478056, 69
-        	testl	$0x13572468,0x45
-
-// CHECK: 	testl	$324478056, 32493
-        	testl	$0x13572468,0x7eed
-
-// CHECK: 	testl	$324478056, 3133065982
-        	testl	$0x13572468,0xbabecafe
-
-// CHECK: 	testl	$324478056, 305419896
-        	testl	$0x13572468,0x12345678
-
-// CHECK: 	andb	$254, 3735928559(%ebx,%ecx,8)
-        	andb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	andb	$254, 69
-        	andb	$0xfe,0x45
-
-// CHECK: 	andb	$254, 32493
-        	andb	$0xfe,0x7eed
-
-// CHECK: 	andb	$254, 3133065982
-        	andb	$0xfe,0xbabecafe
-
-// CHECK: 	andb	$254, 305419896
-        	andb	$0xfe,0x12345678
-
-// CHECK: 	andb	$127, 3735928559(%ebx,%ecx,8)
-        	andb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	andb	$127, 69
-        	andb	$0x7f,0x45
-
-// CHECK: 	andb	$127, 32493
-        	andb	$0x7f,0x7eed
-
-// CHECK: 	andb	$127, 3133065982
-        	andb	$0x7f,0xbabecafe
-
-// CHECK: 	andb	$127, 305419896
-        	andb	$0x7f,0x12345678
-
-// CHECK: 	andw	$31438, 3735928559(%ebx,%ecx,8)
-        	andw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	andw	$31438, 69
-        	andw	$0x7ace,0x45
-
-// CHECK: 	andw	$31438, 32493
-        	andw	$0x7ace,0x7eed
-
-// CHECK: 	andw	$31438, 3133065982
-        	andw	$0x7ace,0xbabecafe
-
-// CHECK: 	andw	$31438, 305419896
-        	andw	$0x7ace,0x12345678
-
-// CHECK: 	andl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	andl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	andl	$2063514302, 69
-        	andl	$0x7afebabe,0x45
-
-// CHECK: 	andl	$2063514302, 32493
-        	andl	$0x7afebabe,0x7eed
-
-// CHECK: 	andl	$2063514302, 3133065982
-        	andl	$0x7afebabe,0xbabecafe
-
-// CHECK: 	andl	$2063514302, 305419896
-        	andl	$0x7afebabe,0x12345678
-
-// CHECK: 	andl	$324478056, 3735928559(%ebx,%ecx,8)
-        	andl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	andl	$324478056, 69
-        	andl	$0x13572468,0x45
-
-// CHECK: 	andl	$324478056, 32493
-        	andl	$0x13572468,0x7eed
-
-// CHECK: 	andl	$324478056, 3133065982
-        	andl	$0x13572468,0xbabecafe
-
-// CHECK: 	andl	$324478056, 305419896
-        	andl	$0x13572468,0x12345678
-
-// CHECK: 	orb	$254, 3735928559(%ebx,%ecx,8)
-        	orb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	orb	$254, 69
-        	orb	$0xfe,0x45
-
-// CHECK: 	orb	$254, 32493
-        	orb	$0xfe,0x7eed
-
-// CHECK: 	orb	$254, 3133065982
-        	orb	$0xfe,0xbabecafe
-
-// CHECK: 	orb	$254, 305419896
-        	orb	$0xfe,0x12345678
-
-// CHECK: 	orb	$127, 3735928559(%ebx,%ecx,8)
-        	orb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	orb	$127, 69
-        	orb	$0x7f,0x45
-
-// CHECK: 	orb	$127, 32493
-        	orb	$0x7f,0x7eed
-
-// CHECK: 	orb	$127, 3133065982
-        	orb	$0x7f,0xbabecafe
-
-// CHECK: 	orb	$127, 305419896
-        	orb	$0x7f,0x12345678
-
-// CHECK: 	orw	$31438, 3735928559(%ebx,%ecx,8)
-        	orw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	orw	$31438, 69
-        	orw	$0x7ace,0x45
-
-// CHECK: 	orw	$31438, 32493
-        	orw	$0x7ace,0x7eed
-
-// CHECK: 	orw	$31438, 3133065982
-        	orw	$0x7ace,0xbabecafe
-
-// CHECK: 	orw	$31438, 305419896
-        	orw	$0x7ace,0x12345678
-
-// CHECK: 	orl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	orl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	orl	$2063514302, 69
-        	orl	$0x7afebabe,0x45
-
-// CHECK: 	orl	$2063514302, 32493
-        	orl	$0x7afebabe,0x7eed
-
-// CHECK: 	orl	$2063514302, 3133065982
-        	orl	$0x7afebabe,0xbabecafe
-
-// CHECK: 	orl	$2063514302, 305419896
-        	orl	$0x7afebabe,0x12345678
-
-// CHECK: 	orl	$324478056, 3735928559(%ebx,%ecx,8)
-        	orl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	orl	$324478056, 69
-        	orl	$0x13572468,0x45
-
-// CHECK: 	orl	$324478056, 32493
-        	orl	$0x13572468,0x7eed
-
-// CHECK: 	orl	$324478056, 3133065982
-        	orl	$0x13572468,0xbabecafe
-
-// CHECK: 	orl	$324478056, 305419896
-        	orl	$0x13572468,0x12345678
-
-// CHECK: 	xorb	$254, 3735928559(%ebx,%ecx,8)
-        	xorb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	xorb	$254, 69
-        	xorb	$0xfe,0x45
-
-// CHECK: 	xorb	$254, 32493
-        	xorb	$0xfe,0x7eed
-
-// CHECK: 	xorb	$254, 3133065982
-        	xorb	$0xfe,0xbabecafe
-
-// CHECK: 	xorb	$254, 305419896
-        	xorb	$0xfe,0x12345678
-
-// CHECK: 	xorb	$127, 3735928559(%ebx,%ecx,8)
-        	xorb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	xorb	$127, 69
-        	xorb	$0x7f,0x45
-
-// CHECK: 	xorb	$127, 32493
-        	xorb	$0x7f,0x7eed
-
-// CHECK: 	xorb	$127, 3133065982
-        	xorb	$0x7f,0xbabecafe
-
-// CHECK: 	xorb	$127, 305419896
-        	xorb	$0x7f,0x12345678
-
-// CHECK: 	xorw	$31438, 3735928559(%ebx,%ecx,8)
-        	xorw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	xorw	$31438, 69
-        	xorw	$0x7ace,0x45
-
-// CHECK: 	xorw	$31438, 32493
-        	xorw	$0x7ace,0x7eed
-
-// CHECK: 	xorw	$31438, 3133065982
-        	xorw	$0x7ace,0xbabecafe
-
-// CHECK: 	xorw	$31438, 305419896
-        	xorw	$0x7ace,0x12345678
-
-// CHECK: 	xorl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	xorl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	xorl	$2063514302, 69
-        	xorl	$0x7afebabe,0x45
-
-// CHECK: 	xorl	$2063514302, 32493
-        	xorl	$0x7afebabe,0x7eed
-
-// CHECK: 	xorl	$2063514302, 3133065982
-        	xorl	$0x7afebabe,0xbabecafe
-
-// CHECK: 	xorl	$2063514302, 305419896
-        	xorl	$0x7afebabe,0x12345678
-
-// CHECK: 	xorl	$324478056, 3735928559(%ebx,%ecx,8)
-        	xorl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	xorl	$324478056, 69
-        	xorl	$0x13572468,0x45
-
-// CHECK: 	xorl	$324478056, 32493
-        	xorl	$0x13572468,0x7eed
-
-// CHECK: 	xorl	$324478056, 3133065982
-        	xorl	$0x13572468,0xbabecafe
-
-// CHECK: 	xorl	$324478056, 305419896
-        	xorl	$0x13572468,0x12345678
-
-// CHECK: 	adcb	$254, 3735928559(%ebx,%ecx,8)
-        	adcb	$0xfe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	adcb	$254, 69
-        	adcb	$0xfe,0x45
-
-// CHECK: 	adcb	$254, 32493
-        	adcb	$0xfe,0x7eed
-
-// CHECK: 	adcb	$254, 3133065982
-        	adcb	$0xfe,0xbabecafe
-
-// CHECK: 	adcb	$254, 305419896
-        	adcb	$0xfe,0x12345678
-
-// CHECK: 	adcb	$127, 3735928559(%ebx,%ecx,8)
-        	adcb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	adcb	$127, 69
-        	adcb	$0x7f,0x45
-
-// CHECK: 	adcb	$127, 32493
-        	adcb	$0x7f,0x7eed
-
-// CHECK: 	adcb	$127, 3133065982
-        	adcb	$0x7f,0xbabecafe
-
-// CHECK: 	adcb	$127, 305419896
-        	adcb	$0x7f,0x12345678
-
-// CHECK: 	adcw	$31438, 3735928559(%ebx,%ecx,8)
-        	adcw	$0x7ace,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	adcw	$31438, 69
-        	adcw	$0x7ace,0x45
-
-// CHECK: 	adcw	$31438, 32493
-        	adcw	$0x7ace,0x7eed
-
-// CHECK: 	adcw	$31438, 3133065982
-        	adcw	$0x7ace,0xbabecafe
-
-// CHECK: 	adcw	$31438, 305419896
-        	adcw	$0x7ace,0x12345678
-
-// CHECK: 	adcl	$2063514302, 3735928559(%ebx,%ecx,8)
-        	adcl	$0x7afebabe,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	adcl	$2063514302, 69
-        	adcl	$0x7afebabe,0x45
-
-// CHECK: 	adcl	$2063514302, 32493
-        	adcl	$0x7afebabe,0x7eed
-
-// CHECK: 	adcl	$2063514302, 3133065982
-        	adcl	$0x7afebabe,0xbabecafe
-
-// CHECK: 	adcl	$2063514302, 305419896
-        	adcl	$0x7afebabe,0x12345678
-
-// CHECK: 	adcl	$324478056, 3735928559(%ebx,%ecx,8)
-        	adcl	$0x13572468,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	adcl	$324478056, 69
-        	adcl	$0x13572468,0x45
-
-// CHECK: 	adcl	$324478056, 32493
-        	adcl	$0x13572468,0x7eed
-
-// CHECK: 	adcl	$324478056, 3133065982
-        	adcl	$0x13572468,0xbabecafe
-
-// CHECK: 	adcl	$324478056, 305419896
-        	adcl	$0x13572468,0x12345678
-
-// CHECK: 	negl	3735928559(%ebx,%ecx,8)
-        	negl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	negw	32493
-        	negw	0x7eed
-
-// CHECK: 	negl	3133065982
-        	negl	0xbabecafe
-
-// CHECK: 	negl	305419896
-        	negl	0x12345678
-
-// CHECK: 	notl	3735928559(%ebx,%ecx,8)
-        	notl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	notw	32493
-        	notw	0x7eed
-
-// CHECK: 	notl	3133065982
-        	notl	0xbabecafe
-
-// CHECK: 	notl	305419896
-        	notl	0x12345678
-
-// CHECK: 	cbtw
-        	cbtw
-
-// CHECK: 	cwtl
-        	cwtl
-
-// CHECK: 	cwtd
-        	cwtd
-
-// CHECK: 	cltd
-        	cltd
-
-// CHECK: 	mull	3735928559(%ebx,%ecx,8)
-        	mull	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	mulw	32493
-        	mulw	0x7eed
-
-// CHECK: 	mull	3133065982
-        	mull	0xbabecafe
-
-// CHECK: 	mull	305419896
-        	mull	0x12345678
-
-// CHECK: 	imull	3735928559(%ebx,%ecx,8)
-        	imull	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	imulw	32493
-        	imulw	0x7eed
-
-// CHECK: 	imull	3133065982
-        	imull	0xbabecafe
-
-// CHECK: 	imull	305419896
-        	imull	0x12345678
-
-// CHECK: 	divl	3735928559(%ebx,%ecx,8)
-        	divl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	divw	32493
-        	divw	0x7eed
-
-// CHECK: 	divl	3133065982
-        	divl	0xbabecafe
-
-// CHECK: 	divl	305419896
-        	divl	0x12345678
-
-// CHECK: 	idivl	3735928559(%ebx,%ecx,8)
-        	idivl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	idivw	32493
-        	idivw	0x7eed
-
-// CHECK: 	idivl	3133065982
-        	idivl	0xbabecafe
-
-// CHECK: 	idivl	305419896
-        	idivl	0x12345678
-
-// CHECK: 	roll	$0, 3735928559(%ebx,%ecx,8)
-        	roll	$0,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	roll	$0, 69
-        	roll	$0,0x45
-
-// CHECK: 	roll	$0, 32493
-        	roll	$0,0x7eed
-
-// CHECK: 	roll	$0, 3133065982
-        	roll	$0,0xbabecafe
-
-// CHECK: 	roll	$0, 305419896
-        	roll	$0,0x12345678
-
-// CHECK: 	rolb	$127, 3735928559(%ebx,%ecx,8)
-        	rolb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	rolb	$127, 69
-        	rolb	$0x7f,0x45
-
-// CHECK: 	rolb	$127, 32493
-        	rolb	$0x7f,0x7eed
-
-// CHECK: 	rolb	$127, 3133065982
-        	rolb	$0x7f,0xbabecafe
-
-// CHECK: 	rolb	$127, 305419896
-        	rolb	$0x7f,0x12345678
-
-// CHECK: 	roll	3735928559(%ebx,%ecx,8)
-        	roll	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	rolw	32493
-        	rolw	0x7eed
-
-// CHECK: 	roll	3133065982
-        	roll	0xbabecafe
-
-// CHECK: 	roll	305419896
-        	roll	0x12345678
-
-// CHECK: 	rorl	$0, 3735928559(%ebx,%ecx,8)
-        	rorl	$0,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	rorl	$0, 69
-        	rorl	$0,0x45
-
-// CHECK: 	rorl	$0, 32493
-        	rorl	$0,0x7eed
-
-// CHECK: 	rorl	$0, 3133065982
-        	rorl	$0,0xbabecafe
-
-// CHECK: 	rorl	$0, 305419896
-        	rorl	$0,0x12345678
-
-// CHECK: 	rorb	$127, 3735928559(%ebx,%ecx,8)
-        	rorb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	rorb	$127, 69
-        	rorb	$0x7f,0x45
-
-// CHECK: 	rorb	$127, 32493
-        	rorb	$0x7f,0x7eed
-
-// CHECK: 	rorb	$127, 3133065982
-        	rorb	$0x7f,0xbabecafe
-
-// CHECK: 	rorb	$127, 305419896
-        	rorb	$0x7f,0x12345678
-
-// CHECK: 	rorl	3735928559(%ebx,%ecx,8)
-        	rorl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	rorw	32493
-        	rorw	0x7eed
-
-// CHECK: 	rorl	3133065982
-        	rorl	0xbabecafe
-
-// CHECK: 	rorl	305419896
-        	rorl	0x12345678
-
 // CHECK: 	rcll	$0, 3735928559(%ebx,%ecx,8)
         	rcll	$0,0xdeadbeef(%ebx,%ecx,8)
 
@@ -12842,183 +10160,12 @@
 // CHECK: 	rcrb	$127, 305419896
         	rcrb	$0x7f,0x12345678
 
-// CHECK: 	shll	$0, 3735928559(%ebx,%ecx,8)
-        	sall	$0,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shll	$0, 69
-        	sall	$0,0x45
-
-// CHECK: 	shll	$0, 32493
-        	sall	$0,0x7eed
-
-// CHECK: 	shll	$0, 3133065982
-        	sall	$0,0xbabecafe
-
-// CHECK: 	shll	$0, 305419896
-        	sall	$0,0x12345678
-
-// CHECK: 	shlb	$127, 3735928559(%ebx,%ecx,8)
-        	salb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shlb	$127, 69
-        	salb	$0x7f,0x45
-
-// CHECK: 	shlb	$127, 32493
-        	salb	$0x7f,0x7eed
-
-// CHECK: 	shlb	$127, 3133065982
-        	salb	$0x7f,0xbabecafe
-
-// CHECK: 	shlb	$127, 305419896
-        	salb	$0x7f,0x12345678
-
-// CHECK: 	shll	3735928559(%ebx,%ecx,8)
-        	sall	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shlw	32493
-        	salw	0x7eed
-
-// CHECK: 	shll	3133065982
-        	sall	0xbabecafe
-
-// CHECK: 	shll	305419896
-        	sall	0x12345678
-
-// CHECK: 	shll	$0, 3735928559(%ebx,%ecx,8)
-        	shll	$0,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shll	$0, 69
-        	shll	$0,0x45
-
-// CHECK: 	shll	$0, 32493
-        	shll	$0,0x7eed
-
-// CHECK: 	shll	$0, 3133065982
-        	shll	$0,0xbabecafe
-
-// CHECK: 	shll	$0, 305419896
-        	shll	$0,0x12345678
-
-// CHECK: 	shlb	$127, 3735928559(%ebx,%ecx,8)
-        	shlb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shlb	$127, 69
-        	shlb	$0x7f,0x45
-
-// CHECK: 	shlb	$127, 32493
-        	shlb	$0x7f,0x7eed
-
-// CHECK: 	shlb	$127, 3133065982
-        	shlb	$0x7f,0xbabecafe
-
-// CHECK: 	shlb	$127, 305419896
-        	shlb	$0x7f,0x12345678
-
-// CHECK: 	shll	3735928559(%ebx,%ecx,8)
-        	shll	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shlw	32493
-        	shlw	0x7eed
-
-// CHECK: 	shll	3133065982
-        	shll	0xbabecafe
-
-// CHECK: 	shll	305419896
-        	shll	0x12345678
-
-// CHECK: 	shrl	$0, 3735928559(%ebx,%ecx,8)
-        	shrl	$0,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shrl	$0, 69
-        	shrl	$0,0x45
-
-// CHECK: 	shrl	$0, 32493
-        	shrl	$0,0x7eed
-
-// CHECK: 	shrl	$0, 3133065982
-        	shrl	$0,0xbabecafe
-
-// CHECK: 	shrl	$0, 305419896
-        	shrl	$0,0x12345678
-
-// CHECK: 	shrb	$127, 3735928559(%ebx,%ecx,8)
-        	shrb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shrb	$127, 69
-        	shrb	$0x7f,0x45
-
-// CHECK: 	shrb	$127, 32493
-        	shrb	$0x7f,0x7eed
-
-// CHECK: 	shrb	$127, 3133065982
-        	shrb	$0x7f,0xbabecafe
-
-// CHECK: 	shrb	$127, 305419896
-        	shrb	$0x7f,0x12345678
-
-// CHECK: 	shrl	3735928559(%ebx,%ecx,8)
-        	shrl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	shrw	32493
-        	shrw	0x7eed
-
-// CHECK: 	shrl	3133065982
-        	shrl	0xbabecafe
-
-// CHECK: 	shrl	305419896
-        	shrl	0x12345678
-
-// CHECK: 	sarl	$0, 3735928559(%ebx,%ecx,8)
-        	sarl	$0,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sarl	$0, 69
-        	sarl	$0,0x45
-
-// CHECK: 	sarl	$0, 32493
-        	sarl	$0,0x7eed
-
-// CHECK: 	sarl	$0, 3133065982
-        	sarl	$0,0xbabecafe
-
-// CHECK: 	sarl	$0, 305419896
-        	sarl	$0,0x12345678
-
-// CHECK: 	sarb	$127, 3735928559(%ebx,%ecx,8)
-        	sarb	$0x7f,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sarb	$127, 69
-        	sarb	$0x7f,0x45
-
-// CHECK: 	sarb	$127, 32493
-        	sarb	$0x7f,0x7eed
-
-// CHECK: 	sarb	$127, 3133065982
-        	sarb	$0x7f,0xbabecafe
-
-// CHECK: 	sarb	$127, 305419896
-        	sarb	$0x7f,0x12345678
-
-// CHECK: 	sarl	3735928559(%ebx,%ecx,8)
-        	sarl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sarw	32493
-        	sarw	0x7eed
-
-// CHECK: 	sarl	3133065982
-        	sarl	0xbabecafe
-
-// CHECK: 	sarl	305419896
-        	sarl	0x12345678
-
 // CHECK: 	calll	3133065982
         	calll	0xbabecafe
 
 // CHECK: 	calll	*3735928559(%ebx,%ecx,8)
         	calll	*0xdeadbeef(%ebx,%ecx,8)
 
-// CHECK: 	calll	3133065982
-        	calll	0xbabecafe
-
 // CHECK: 	calll	305419896
         	calll	0x12345678
 
@@ -13097,18 +10244,9 @@
 // CHECK: 	ljmpl	*305419896
         	ljmpl	*0x12345678
 
-// CHECK: 	ret
-        	ret
-
-// CHECK: 	lret
-        	lret
-
 // CHECK: 	enter	$31438, $127
         	enter	$0x7ace,$0x7f
 
-// CHECK: 	leave
-        	leave
-
 // CHECK: 	jo	32493
         	jo	0x7eed
 
@@ -13301,924 +10439,9 @@
 // CHECK: 	jg	-77129852792157442
         	jg	0xfeedfacebabecafe
 
-// CHECK: 	seto	%bl
-        	seto	%bl
-
-// CHECK: 	seto	3735928559(%ebx,%ecx,8)
-        	seto	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	seto	32493
-        	seto	0x7eed
-
-// CHECK: 	seto	3133065982
-        	seto	0xbabecafe
-
-// CHECK: 	seto	305419896
-        	seto	0x12345678
-
-// CHECK: 	setno	%bl
-        	setno	%bl
-
-// CHECK: 	setno	3735928559(%ebx,%ecx,8)
-        	setno	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setno	32493
-        	setno	0x7eed
-
-// CHECK: 	setno	3133065982
-        	setno	0xbabecafe
-
-// CHECK: 	setno	305419896
-        	setno	0x12345678
-
-// CHECK: 	setb	%bl
-        	setb	%bl
-
-// CHECK: 	setb	3735928559(%ebx,%ecx,8)
-        	setb	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setb	32493
-        	setb	0x7eed
-
-// CHECK: 	setb	3133065982
-        	setb	0xbabecafe
-
-// CHECK: 	setb	305419896
-        	setb	0x12345678
-
-// CHECK: 	setae	%bl
-        	setae	%bl
-
-// CHECK: 	setae	3735928559(%ebx,%ecx,8)
-        	setae	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setae	32493
-        	setae	0x7eed
-
-// CHECK: 	setae	3133065982
-        	setae	0xbabecafe
-
-// CHECK: 	setae	305419896
-        	setae	0x12345678
-
-// CHECK: 	sete	%bl
-        	sete	%bl
-
-// CHECK: 	sete	3735928559(%ebx,%ecx,8)
-        	sete	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sete	32493
-        	sete	0x7eed
-
-// CHECK: 	sete	3133065982
-        	sete	0xbabecafe
-
-// CHECK: 	sete	305419896
-        	sete	0x12345678
-
-// CHECK: 	setne	%bl
-        	setne	%bl
-
-// CHECK: 	setne	3735928559(%ebx,%ecx,8)
-        	setne	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setne	32493
-        	setne	0x7eed
-
-// CHECK: 	setne	3133065982
-        	setne	0xbabecafe
-
-// CHECK: 	setne	305419896
-        	setne	0x12345678
-
-// CHECK: 	setbe	%bl
-        	setbe	%bl
-
-// CHECK: 	setbe	3735928559(%ebx,%ecx,8)
-        	setbe	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setbe	32493
-        	setbe	0x7eed
-
-// CHECK: 	setbe	3133065982
-        	setbe	0xbabecafe
-
-// CHECK: 	setbe	305419896
-        	setbe	0x12345678
-
-// CHECK: 	seta	%bl
-        	seta	%bl
-
-// CHECK: 	seta	3735928559(%ebx,%ecx,8)
-        	seta	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	seta	32493
-        	seta	0x7eed
-
-// CHECK: 	seta	3133065982
-        	seta	0xbabecafe
-
-// CHECK: 	seta	305419896
-        	seta	0x12345678
-
-// CHECK: 	sets	%bl
-        	sets	%bl
-
-// CHECK: 	sets	3735928559(%ebx,%ecx,8)
-        	sets	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	sets	32493
-        	sets	0x7eed
-
-// CHECK: 	sets	3133065982
-        	sets	0xbabecafe
-
-// CHECK: 	sets	305419896
-        	sets	0x12345678
-
-// CHECK: 	setns	%bl
-        	setns	%bl
-
-// CHECK: 	setns	3735928559(%ebx,%ecx,8)
-        	setns	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setns	32493
-        	setns	0x7eed
-
-// CHECK: 	setns	3133065982
-        	setns	0xbabecafe
-
-// CHECK: 	setns	305419896
-        	setns	0x12345678
-
-// CHECK: 	setp	%bl
-        	setp	%bl
-
-// CHECK: 	setp	3735928559(%ebx,%ecx,8)
-        	setp	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setp	32493
-        	setp	0x7eed
-
-// CHECK: 	setp	3133065982
-        	setp	0xbabecafe
-
-// CHECK: 	setp	305419896
-        	setp	0x12345678
-
-// CHECK: 	setnp	%bl
-        	setnp	%bl
-
-// CHECK: 	setnp	3735928559(%ebx,%ecx,8)
-        	setnp	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setnp	32493
-        	setnp	0x7eed
-
-// CHECK: 	setnp	3133065982
-        	setnp	0xbabecafe
-
-// CHECK: 	setnp	305419896
-        	setnp	0x12345678
-
-// CHECK: 	setl	%bl
-        	setl	%bl
-
-// CHECK: 	setl	3735928559(%ebx,%ecx,8)
-        	setl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setl	32493
-        	setl	0x7eed
-
-// CHECK: 	setl	3133065982
-        	setl	0xbabecafe
-
-// CHECK: 	setl	305419896
-        	setl	0x12345678
-
-// CHECK: 	setge	%bl
-        	setge	%bl
-
-// CHECK: 	setge	3735928559(%ebx,%ecx,8)
-        	setge	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setge	32493
-        	setge	0x7eed
-
-// CHECK: 	setge	3133065982
-        	setge	0xbabecafe
-
-// CHECK: 	setge	305419896
-        	setge	0x12345678
-
-// CHECK: 	setle	%bl
-        	setle	%bl
-
-// CHECK: 	setle	3735928559(%ebx,%ecx,8)
-        	setle	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setle	32493
-        	setle	0x7eed
-
-// CHECK: 	setle	3133065982
-        	setle	0xbabecafe
-
-// CHECK: 	setle	305419896
-        	setle	0x12345678
-
-// CHECK: 	setg	%bl
-        	setg	%bl
-
-// CHECK: 	setg	3735928559(%ebx,%ecx,8)
-        	setg	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	setg	32493
-        	setg	0x7eed
-
-// CHECK: 	setg	3133065982
-        	setg	0xbabecafe
-
-// CHECK: 	setg	305419896
-        	setg	0x12345678
-
 // CHECK: 	int	$127
         	int	$0x7f
 
-// CHECK: 	rsm
-        	rsm
-
-// CHECK: 	hlt
-        	hlt
-
-// CHECK: 	nopl	3735928559(%ebx,%ecx,8)
-        	nopl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	nopw	32493
-        	nopw	0x7eed
-
-// CHECK: 	nopl	3133065982
-        	nopl	0xbabecafe
-
-// CHECK: 	nopl	305419896
-        	nopl	0x12345678
-
-// CHECK: 	nop
-        	nop
-
-// CHECK: 	lldtw	32493
-        	lldtw	0x7eed
-
-// CHECK: 	lmsww	32493
-        	lmsww	0x7eed
-
-// CHECK: 	ltrw	32493
-        	ltrw	0x7eed
-
-// CHECK: 	sldtw	32493
-        	sldtw	0x7eed
-
-// CHECK: 	smsww	32493
-        	smsww	0x7eed
-
-// CHECK: 	strw	32493
-        	strw	0x7eed
-
-// CHECK: 	verr	%bx
-        	verr	%bx
-
-// CHECK: 	verr	3735928559(%ebx,%ecx,8)
-        	verr	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	verr	3133065982
-        	verr	0xbabecafe
-
-// CHECK: 	verr	305419896
-        	verr	0x12345678
-
-// CHECK: 	verw	%bx
-        	verw	%bx
-
-// CHECK: 	verw	3735928559(%ebx,%ecx,8)
-        	verw	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	verw	3133065982
-        	verw	0xbabecafe
-
-// CHECK: 	verw	305419896
-        	verw	0x12345678
-
-// CHECK: 	fld	%st(2)
-        	fld	%st(2)
-
-// CHECK: 	fldl	3735928559(%ebx,%ecx,8)
-        	fldl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fldl	3133065982
-        	fldl	0xbabecafe
-
-// CHECK: 	fldl	305419896
-        	fldl	0x12345678
-
-// CHECK: 	fld	%st(2)
-        	fld	%st(2)
-
-// CHECK: 	fildl	3735928559(%ebx,%ecx,8)
-        	fildl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fildl	3133065982
-        	fildl	0xbabecafe
-
-// CHECK: 	fildl	305419896
-        	fildl	0x12345678
-
-// CHECK: 	fildll	3735928559(%ebx,%ecx,8)
-        	fildll	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fildll	32493
-        	fildll	0x7eed
-
-// CHECK: 	fildll	3133065982
-        	fildll	0xbabecafe
-
-// CHECK: 	fildll	305419896
-        	fildll	0x12345678
-
-// CHECK: 	fldt	3735928559(%ebx,%ecx,8)
-        	fldt	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fldt	32493
-        	fldt	0x7eed
-
-// CHECK: 	fldt	3133065982
-        	fldt	0xbabecafe
-
-// CHECK: 	fldt	305419896
-        	fldt	0x12345678
-
-// CHECK: 	fbld	3735928559(%ebx,%ecx,8)
-        	fbld	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fbld	32493
-        	fbld	0x7eed
-
-// CHECK: 	fbld	3133065982
-        	fbld	0xbabecafe
-
-// CHECK: 	fbld	305419896
-        	fbld	0x12345678
-
-// CHECK: 	fst	%st(2)
-        	fst	%st(2)
-
-// CHECK: 	fstl	3735928559(%ebx,%ecx,8)
-        	fstl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fstl	3133065982
-        	fstl	0xbabecafe
-
-// CHECK: 	fstl	305419896
-        	fstl	0x12345678
-
-// CHECK: 	fst	%st(2)
-        	fst	%st(2)
-
-// CHECK: 	fistl	3735928559(%ebx,%ecx,8)
-        	fistl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fistl	3133065982
-        	fistl	0xbabecafe
-
-// CHECK: 	fistl	305419896
-        	fistl	0x12345678
-
-// CHECK: 	fstp	%st(2)
-        	fstp	%st(2)
-
-// CHECK: 	fstpl	3735928559(%ebx,%ecx,8)
-        	fstpl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fstpl	3133065982
-        	fstpl	0xbabecafe
-
-// CHECK: 	fstpl	305419896
-        	fstpl	0x12345678
-
-// CHECK: 	fstp	%st(2)
-        	fstp	%st(2)
-
-// CHECK: 	fistpl	3735928559(%ebx,%ecx,8)
-        	fistpl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fistpl	3133065982
-        	fistpl	0xbabecafe
-
-// CHECK: 	fistpl	305419896
-        	fistpl	0x12345678
-
-// CHECK: 	fistpll	3735928559(%ebx,%ecx,8)
-        	fistpll	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fistpll	32493
-        	fistpll	0x7eed
-
-// CHECK: 	fistpll	3133065982
-        	fistpll	0xbabecafe
-
-// CHECK: 	fistpll	305419896
-        	fistpll	0x12345678
-
-// CHECK: 	fstpt	3735928559(%ebx,%ecx,8)
-        	fstpt	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fstpt	32493
-        	fstpt	0x7eed
-
-// CHECK: 	fstpt	3133065982
-        	fstpt	0xbabecafe
-
-// CHECK: 	fstpt	305419896
-        	fstpt	0x12345678
-
-// CHECK: 	fbstp	3735928559(%ebx,%ecx,8)
-        	fbstp	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fbstp	32493
-        	fbstp	0x7eed
-
-// CHECK: 	fbstp	3133065982
-        	fbstp	0xbabecafe
-
-// CHECK: 	fbstp	305419896
-        	fbstp	0x12345678
-
-// CHECK: 	fxch	%st(2)
-        	fxch	%st(2)
-
-// CHECK: 	fcom	%st(2)
-        	fcom	%st(2)
-
-// CHECK: 	fcoml	3735928559(%ebx,%ecx,8)
-        	fcoml	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fcoml	3133065982
-        	fcoml	0xbabecafe
-
-// CHECK: 	fcoml	305419896
-        	fcoml	0x12345678
-
-// CHECK: 	fcom	%st(2)
-        	fcom	%st(2)
-
-// CHECK: 	ficoml	3735928559(%ebx,%ecx,8)
-        	ficoml	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	ficoml	3133065982
-        	ficoml	0xbabecafe
-
-// CHECK: 	ficoml	305419896
-        	ficoml	0x12345678
-
-// CHECK: 	fcomp	%st(2)
-        	fcomp	%st(2)
-
-// CHECK: 	fcompl	3735928559(%ebx,%ecx,8)
-        	fcompl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fcompl	3133065982
-        	fcompl	0xbabecafe
-
-// CHECK: 	fcompl	305419896
-        	fcompl	0x12345678
-
-// CHECK: 	fcomp	%st(2)
-        	fcomp	%st(2)
-
-// CHECK: 	ficompl	3735928559(%ebx,%ecx,8)
-        	ficompl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	ficompl	3133065982
-        	ficompl	0xbabecafe
-
-// CHECK: 	ficompl	305419896
-        	ficompl	0x12345678
-
-// CHECK: 	fcompp
-        	fcompp
-
-// CHECK: 	fucom	%st(2)
-        	fucom	%st(2)
-
-// CHECK: 	fucomp	%st(2)
-        	fucomp	%st(2)
-
-// CHECK: 	fucompp
-        	fucompp
-
-// CHECK: 	ftst
-        	ftst
-
-// CHECK: 	fxam
-        	fxam
-
-// CHECK: 	fld1
-        	fld1
-
-// CHECK: 	fldl2t
-        	fldl2t
-
-// CHECK: 	fldl2e
-        	fldl2e
-
-// CHECK: 	fldpi
-        	fldpi
-
-// CHECK: 	fldlg2
-        	fldlg2
-
-// CHECK: 	fldln2
-        	fldln2
-
-// CHECK: 	fldz
-        	fldz
-
-// CHECK: 	fadd	%st(2)
-        	fadd	%st(2)
-
-// CHECK: 	faddl	3735928559(%ebx,%ecx,8)
-        	faddl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	faddl	3133065982
-        	faddl	0xbabecafe
-
-// CHECK: 	faddl	305419896
-        	faddl	0x12345678
-
-// CHECK: 	fiaddl	3735928559(%ebx,%ecx,8)
-        	fiaddl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fiaddl	3133065982
-        	fiaddl	0xbabecafe
-
-// CHECK: 	fiaddl	305419896
-        	fiaddl	0x12345678
-
-// CHECK: 	faddp	%st(2)
-        	faddp	%st(2)
-
-// CHECK: 	fsub	%st(2)
-        	fsub	%st(2)
-
-// CHECK: 	fsubl	3735928559(%ebx,%ecx,8)
-        	fsubl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fsubl	3133065982
-        	fsubl	0xbabecafe
-
-// CHECK: 	fsubl	305419896
-        	fsubl	0x12345678
-
-// CHECK: 	fisubl	3735928559(%ebx,%ecx,8)
-        	fisubl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fisubl	3133065982
-        	fisubl	0xbabecafe
-
-// CHECK: 	fisubl	305419896
-        	fisubl	0x12345678
-
-// CHECK: 	fsubp	%st(2)
-        	fsubp	%st(2)
-
-// CHECK: 	fsubr	%st(2)
-        	fsubr	%st(2)
-
-// CHECK: 	fsubrl	3735928559(%ebx,%ecx,8)
-        	fsubrl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fsubrl	3133065982
-        	fsubrl	0xbabecafe
-
-// CHECK: 	fsubrl	305419896
-        	fsubrl	0x12345678
-
-// CHECK: 	fisubrl	3735928559(%ebx,%ecx,8)
-        	fisubrl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fisubrl	3133065982
-        	fisubrl	0xbabecafe
-
-// CHECK: 	fisubrl	305419896
-        	fisubrl	0x12345678
-
-// CHECK: 	fsubrp	%st(2)
-        	fsubrp	%st(2)
-
-// CHECK: 	fmul	%st(2)
-        	fmul	%st(2)
-
-// CHECK: 	fmull	3735928559(%ebx,%ecx,8)
-        	fmull	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fmull	3133065982
-        	fmull	0xbabecafe
-
-// CHECK: 	fmull	305419896
-        	fmull	0x12345678
-
-// CHECK: 	fimull	3735928559(%ebx,%ecx,8)
-        	fimull	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fimull	3133065982
-        	fimull	0xbabecafe
-
-// CHECK: 	fimull	305419896
-        	fimull	0x12345678
-
-// CHECK: 	fmulp	%st(2)
-        	fmulp	%st(2)
-
-// CHECK: 	fdiv	%st(2)
-        	fdiv	%st(2)
-
-// CHECK: 	fdivl	3735928559(%ebx,%ecx,8)
-        	fdivl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fdivl	3133065982
-        	fdivl	0xbabecafe
-
-// CHECK: 	fdivl	305419896
-        	fdivl	0x12345678
-
-// CHECK: 	fidivl	3735928559(%ebx,%ecx,8)
-        	fidivl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fidivl	3133065982
-        	fidivl	0xbabecafe
-
-// CHECK: 	fidivl	305419896
-        	fidivl	0x12345678
-
-// CHECK: 	fdivp	%st(2)
-        	fdivp	%st(2)
-
-// CHECK: 	fdivr	%st(2)
-        	fdivr	%st(2)
-
-// CHECK: 	fdivrl	3735928559(%ebx,%ecx,8)
-        	fdivrl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fdivrl	3133065982
-        	fdivrl	0xbabecafe
-
-// CHECK: 	fdivrl	305419896
-        	fdivrl	0x12345678
-
-// CHECK: 	fidivrl	3735928559(%ebx,%ecx,8)
-        	fidivrl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fidivrl	3133065982
-        	fidivrl	0xbabecafe
-
-// CHECK: 	fidivrl	305419896
-        	fidivrl	0x12345678
-
-// CHECK: 	fdivrp	%st(2)
-        	fdivrp	%st(2)
-
-// CHECK: 	f2xm1
-        	f2xm1
-
-// CHECK: 	fyl2x
-        	fyl2x
-
-// CHECK: 	fptan
-        	fptan
-
-// CHECK: 	fpatan
-        	fpatan
-
-// CHECK: 	fxtract
-        	fxtract
-
-// CHECK: 	fprem1
-        	fprem1
-
-// CHECK: 	fdecstp
-        	fdecstp
-
-// CHECK: 	fincstp
-        	fincstp
-
-// CHECK: 	fprem
-        	fprem
-
-// CHECK: 	fyl2xp1
-        	fyl2xp1
-
-// CHECK: 	fsqrt
-        	fsqrt
-
-// CHECK: 	fsincos
-        	fsincos
-
-// CHECK: 	frndint
-        	frndint
-
-// CHECK: 	fscale
-        	fscale
-
-// CHECK: 	fsin
-        	fsin
-
-// CHECK: 	fcos
-        	fcos
-
-// CHECK: 	fchs
-        	fchs
-
-// CHECK: 	fabs
-        	fabs
-
-// CHECK: 	fninit
-        	fninit
-
-// CHECK: 	fldcw	3735928559(%ebx,%ecx,8)
-        	fldcw	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fldcw	3133065982
-        	fldcw	0xbabecafe
-
-// CHECK: 	fldcw	305419896
-        	fldcw	0x12345678
-
-// CHECK: 	fnstcw	3735928559(%ebx,%ecx,8)
-        	fnstcw	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fnstcw	3133065982
-        	fnstcw	0xbabecafe
-
-// CHECK: 	fnstcw	305419896
-        	fnstcw	0x12345678
-
-// CHECK: 	fnstsw	3735928559(%ebx,%ecx,8)
-        	fnstsw	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fnstsw	3133065982
-        	fnstsw	0xbabecafe
-
-// CHECK: 	fnstsw	305419896
-        	fnstsw	0x12345678
-
-// CHECK: 	fnclex
-        	fnclex
-
-// CHECK: 	fnstenv	32493
-        	fnstenv	0x7eed
-
-// CHECK: 	fldenv	32493
-        	fldenv	0x7eed
-
-// CHECK: 	fnsave	32493
-        	fnsave	0x7eed
-
-// CHECK: 	frstor	32493
-        	frstor	0x7eed
-
-// CHECK: 	ffree	%st(2)
-        	ffree	%st(2)
-
-// CHECK: 	fnop
-        	fnop
-
-// CHECK: 	invd
-        	invd
-
-// CHECK: 	wbinvd
-        	wbinvd
-
-// CHECK: 	cpuid
-        	cpuid
-
-// CHECK: 	wrmsr
-        	wrmsr
-
-// CHECK: 	rdtsc
-        	rdtsc
-
-// CHECK: 	rdmsr
-        	rdmsr
-
-// CHECK: 	cmpxchg8b	3735928559(%ebx,%ecx,8)
-        	cmpxchg8b	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	cmpxchg8b	32493
-        	cmpxchg8b	0x7eed
-
-// CHECK: 	cmpxchg8b	3133065982
-        	cmpxchg8b	0xbabecafe
-
-// CHECK: 	cmpxchg8b	305419896
-        	cmpxchg8b	0x12345678
-
-// CHECK: 	sysenter
-        	sysenter
-
-// CHECK: 	sysexit
-        	sysexit
-
-// CHECK: 	fxsave	3735928559(%ebx,%ecx,8)
-        	fxsave	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fxsave	32493
-        	fxsave	0x7eed
-
-// CHECK: 	fxsave	3133065982
-        	fxsave	0xbabecafe
-
-// CHECK: 	fxsave	305419896
-        	fxsave	0x12345678
-
-// CHECK: 	fxrstor	3735928559(%ebx,%ecx,8)
-        	fxrstor	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fxrstor	32493
-        	fxrstor	0x7eed
-
-// CHECK: 	fxrstor	3133065982
-        	fxrstor	0xbabecafe
-
-// CHECK: 	fxrstor	305419896
-        	fxrstor	0x12345678
-
-// CHECK: 	rdpmc
-        	rdpmc
-
-// CHECK: 	ud2
-        	ud2
-
-// CHECK: 	fcmovb	%st(2), %st(0)
-        	fcmovb	%st(2),%st
-
-// CHECK: 	fcmove	%st(2), %st(0)
-        	fcmove	%st(2),%st
-
-// CHECK: 	fcmovbe	%st(2), %st(0)
-        	fcmovbe	%st(2),%st
-
-// CHECK: 	fcmovu	 %st(2), %st(0)
-        	fcmovu	%st(2),%st
-
-// CHECK: 	fcmovnb	%st(2), %st(0)
-        	fcmovnb	%st(2),%st
-
-// CHECK: 	fcmovne	%st(2), %st(0)
-        	fcmovne	%st(2),%st
-
-// CHECK: 	fcmovnbe	%st(2), %st(0)
-        	fcmovnbe	%st(2),%st
-
-// CHECK: 	fcmovnu	%st(2), %st(0)
-        	fcmovnu	%st(2),%st
-
-// CHECK: 	fcomi	%st(2)
-        	fcomi	%st(2),%st
-
-// CHECK: 	fucomi	%st(2)
-        	fucomi	%st(2),%st
-
-// CHECK: 	fcompi	%st(2)
-        	fcomip	%st(2),%st
-
-// CHECK: 	fucompi	%st(2)
-        	fucomip	%st(2),%st
-
-// CHECK: 	movntil	%ecx, 3735928559(%ebx,%ecx,8)
-        	movnti	%ecx,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movntil	%ecx, 69
-        	movntil	%ecx,0x45
-
-// CHECK: 	movntil	%ecx, 32493
-        	movnti	%ecx,0x7eed
-
-// CHECK: 	movntil	%ecx, 3133065982
-        	movnti	%ecx,0xbabecafe
-
-// CHECK: 	movntil	%ecx, 305419896
-        	movnti	%ecx,0x12345678
-
-// CHECK: 	clflush	3735928559(%ebx,%ecx,8)
-        	clflush	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	clflush	32493
-        	clflush	0x7eed
-
-// CHECK: 	clflush	3133065982
-        	clflush	0xbabecafe
-
-// CHECK: 	clflush	305419896
-        	clflush	0x12345678
-
 // CHECK: 	pause
         	pause
 
@@ -14231,4185 +10454,9 @@
 // CHECK: 	mfence
         	mfence
 
-// CHECK: 	emms
-        	emms
-
-// CHECK: 	movd	%ecx, %mm3
-        	movd	%ecx,%mm3
-
-// CHECK: 	movd	3735928559(%ebx,%ecx,8), %mm3
-        	movd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	movd	69, %mm3
-        	movd	0x45,%mm3
-
-// CHECK: 	movd	32493, %mm3
-        	movd	0x7eed,%mm3
-
-// CHECK: 	movd	3133065982, %mm3
-        	movd	0xbabecafe,%mm3
-
-// CHECK: 	movd	305419896, %mm3
-        	movd	0x12345678,%mm3
-
-// CHECK: 	movd	%mm3, %ecx
-        	movd	%mm3,%ecx
-
-// CHECK: 	movd	%mm3, 3735928559(%ebx,%ecx,8)
-        	movd	%mm3,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movd	%mm3, 69
-        	movd	%mm3,0x45
-
-// CHECK: 	movd	%mm3, 32493
-        	movd	%mm3,0x7eed
-
-// CHECK: 	movd	%mm3, 3133065982
-        	movd	%mm3,0xbabecafe
-
-// CHECK: 	movd	%mm3, 305419896
-        	movd	%mm3,0x12345678
-
-// CHECK: 	movd	%ecx, %xmm5
-        	movd	%ecx,%xmm5
-
-// CHECK: 	movd	3735928559(%ebx,%ecx,8), %xmm5
-        	movd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movd	69, %xmm5
-        	movd	0x45,%xmm5
-
-// CHECK: 	movd	32493, %xmm5
-        	movd	0x7eed,%xmm5
-
-// CHECK: 	movd	3133065982, %xmm5
-        	movd	0xbabecafe,%xmm5
-
-// CHECK: 	movd	305419896, %xmm5
-        	movd	0x12345678,%xmm5
-
-// CHECK: 	movd	%xmm5, %ecx
-        	movd	%xmm5,%ecx
-
-// CHECK: 	movd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movd	%xmm5, 69
-        	movd	%xmm5,0x45
-
-// CHECK: 	movd	%xmm5, 32493
-        	movd	%xmm5,0x7eed
-
-// CHECK: 	movd	%xmm5, 3133065982
-        	movd	%xmm5,0xbabecafe
-
-// CHECK: 	movd	%xmm5, 305419896
-        	movd	%xmm5,0x12345678
-
-// CHECK: 	movq	3735928559(%ebx,%ecx,8), %mm3
-        	movq	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	movq	69, %mm3
-        	movq	0x45,%mm3
-
-// CHECK: 	movq	32493, %mm3
-        	movq	0x7eed,%mm3
-
-// CHECK: 	movq	3133065982, %mm3
-        	movq	0xbabecafe,%mm3
-
-// CHECK: 	movq	305419896, %mm3
-        	movq	0x12345678,%mm3
-
-// CHECK: 	movq	%mm3, %mm3
-        	movq	%mm3,%mm3
-
-// CHECK: 	movq	%mm3, 3735928559(%ebx,%ecx,8)
-        	movq	%mm3,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movq	%mm3, 69
-        	movq	%mm3,0x45
-
-// CHECK: 	movq	%mm3, 32493
-        	movq	%mm3,0x7eed
-
-// CHECK: 	movq	%mm3, 3133065982
-        	movq	%mm3,0xbabecafe
-
-// CHECK: 	movq	%mm3, 305419896
-        	movq	%mm3,0x12345678
-
-// CHECK: 	movq	%mm3, %mm3
-        	movq	%mm3,%mm3
-
-// CHECK: 	movq	3735928559(%ebx,%ecx,8), %xmm5
-        	movq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movq	69, %xmm5
-        	movq	0x45,%xmm5
-
-// CHECK: 	movq	32493, %xmm5
-        	movq	0x7eed,%xmm5
-
-// CHECK: 	movq	3133065982, %xmm5
-        	movq	0xbabecafe,%xmm5
-
-// CHECK: 	movq	305419896, %xmm5
-        	movq	0x12345678,%xmm5
-
-// CHECK: 	movq	%xmm5, %xmm5
-        	movq	%xmm5,%xmm5
-
-// CHECK: 	movq	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movq	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movq	%xmm5, 69
-        	movq	%xmm5,0x45
-
-// CHECK: 	movq	%xmm5, 32493
-        	movq	%xmm5,0x7eed
-
-// CHECK: 	movq	%xmm5, 3133065982
-        	movq	%xmm5,0xbabecafe
-
-// CHECK: 	movq	%xmm5, 305419896
-        	movq	%xmm5,0x12345678
-
-// CHECK: 	movq	%xmm5, %xmm5
-        	movq	%xmm5,%xmm5
-
-// CHECK: 	packssdw	3735928559(%ebx,%ecx,8), %mm3
-        	packssdw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	packssdw	69, %mm3
-        	packssdw	0x45,%mm3
-
-// CHECK: 	packssdw	32493, %mm3
-        	packssdw	0x7eed,%mm3
-
-// CHECK: 	packssdw	3133065982, %mm3
-        	packssdw	0xbabecafe,%mm3
-
-// CHECK: 	packssdw	305419896, %mm3
-        	packssdw	0x12345678,%mm3
-
-// CHECK: 	packssdw	%mm3, %mm3
-        	packssdw	%mm3,%mm3
-
-// CHECK: 	packssdw	3735928559(%ebx,%ecx,8), %xmm5
-        	packssdw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	packssdw	69, %xmm5
-        	packssdw	0x45,%xmm5
-
-// CHECK: 	packssdw	32493, %xmm5
-        	packssdw	0x7eed,%xmm5
-
-// CHECK: 	packssdw	3133065982, %xmm5
-        	packssdw	0xbabecafe,%xmm5
-
-// CHECK: 	packssdw	305419896, %xmm5
-        	packssdw	0x12345678,%xmm5
-
-// CHECK: 	packssdw	%xmm5, %xmm5
-        	packssdw	%xmm5,%xmm5
-
-// CHECK: 	packsswb	3735928559(%ebx,%ecx,8), %mm3
-        	packsswb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	packsswb	69, %mm3
-        	packsswb	0x45,%mm3
-
-// CHECK: 	packsswb	32493, %mm3
-        	packsswb	0x7eed,%mm3
-
-// CHECK: 	packsswb	3133065982, %mm3
-        	packsswb	0xbabecafe,%mm3
-
-// CHECK: 	packsswb	305419896, %mm3
-        	packsswb	0x12345678,%mm3
-
-// CHECK: 	packsswb	%mm3, %mm3
-        	packsswb	%mm3,%mm3
-
-// CHECK: 	packsswb	3735928559(%ebx,%ecx,8), %xmm5
-        	packsswb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	packsswb	69, %xmm5
-        	packsswb	0x45,%xmm5
-
-// CHECK: 	packsswb	32493, %xmm5
-        	packsswb	0x7eed,%xmm5
-
-// CHECK: 	packsswb	3133065982, %xmm5
-        	packsswb	0xbabecafe,%xmm5
-
-// CHECK: 	packsswb	305419896, %xmm5
-        	packsswb	0x12345678,%xmm5
-
-// CHECK: 	packsswb	%xmm5, %xmm5
-        	packsswb	%xmm5,%xmm5
-
-// CHECK: 	packuswb	3735928559(%ebx,%ecx,8), %mm3
-        	packuswb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	packuswb	69, %mm3
-        	packuswb	0x45,%mm3
-
-// CHECK: 	packuswb	32493, %mm3
-        	packuswb	0x7eed,%mm3
-
-// CHECK: 	packuswb	3133065982, %mm3
-        	packuswb	0xbabecafe,%mm3
-
-// CHECK: 	packuswb	305419896, %mm3
-        	packuswb	0x12345678,%mm3
-
-// CHECK: 	packuswb	%mm3, %mm3
-        	packuswb	%mm3,%mm3
-
-// CHECK: 	packuswb	3735928559(%ebx,%ecx,8), %xmm5
-        	packuswb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	packuswb	69, %xmm5
-        	packuswb	0x45,%xmm5
-
-// CHECK: 	packuswb	32493, %xmm5
-        	packuswb	0x7eed,%xmm5
-
-// CHECK: 	packuswb	3133065982, %xmm5
-        	packuswb	0xbabecafe,%xmm5
-
-// CHECK: 	packuswb	305419896, %xmm5
-        	packuswb	0x12345678,%xmm5
-
-// CHECK: 	packuswb	%xmm5, %xmm5
-        	packuswb	%xmm5,%xmm5
-
-// CHECK: 	paddb	3735928559(%ebx,%ecx,8), %mm3
-        	paddb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	paddb	69, %mm3
-        	paddb	0x45,%mm3
-
-// CHECK: 	paddb	32493, %mm3
-        	paddb	0x7eed,%mm3
-
-// CHECK: 	paddb	3133065982, %mm3
-        	paddb	0xbabecafe,%mm3
-
-// CHECK: 	paddb	305419896, %mm3
-        	paddb	0x12345678,%mm3
-
-// CHECK: 	paddb	%mm3, %mm3
-        	paddb	%mm3,%mm3
-
-// CHECK: 	paddb	3735928559(%ebx,%ecx,8), %xmm5
-        	paddb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	paddb	69, %xmm5
-        	paddb	0x45,%xmm5
-
-// CHECK: 	paddb	32493, %xmm5
-        	paddb	0x7eed,%xmm5
-
-// CHECK: 	paddb	3133065982, %xmm5
-        	paddb	0xbabecafe,%xmm5
-
-// CHECK: 	paddb	305419896, %xmm5
-        	paddb	0x12345678,%xmm5
-
-// CHECK: 	paddb	%xmm5, %xmm5
-        	paddb	%xmm5,%xmm5
-
-// CHECK: 	paddw	3735928559(%ebx,%ecx,8), %mm3
-        	paddw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	paddw	69, %mm3
-        	paddw	0x45,%mm3
-
-// CHECK: 	paddw	32493, %mm3
-        	paddw	0x7eed,%mm3
-
-// CHECK: 	paddw	3133065982, %mm3
-        	paddw	0xbabecafe,%mm3
-
-// CHECK: 	paddw	305419896, %mm3
-        	paddw	0x12345678,%mm3
-
-// CHECK: 	paddw	%mm3, %mm3
-        	paddw	%mm3,%mm3
-
-// CHECK: 	paddw	3735928559(%ebx,%ecx,8), %xmm5
-        	paddw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	paddw	69, %xmm5
-        	paddw	0x45,%xmm5
-
-// CHECK: 	paddw	32493, %xmm5
-        	paddw	0x7eed,%xmm5
-
-// CHECK: 	paddw	3133065982, %xmm5
-        	paddw	0xbabecafe,%xmm5
-
-// CHECK: 	paddw	305419896, %xmm5
-        	paddw	0x12345678,%xmm5
-
-// CHECK: 	paddw	%xmm5, %xmm5
-        	paddw	%xmm5,%xmm5
-
-// CHECK: 	paddd	3735928559(%ebx,%ecx,8), %mm3
-        	paddd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	paddd	69, %mm3
-        	paddd	0x45,%mm3
-
-// CHECK: 	paddd	32493, %mm3
-        	paddd	0x7eed,%mm3
-
-// CHECK: 	paddd	3133065982, %mm3
-        	paddd	0xbabecafe,%mm3
-
-// CHECK: 	paddd	305419896, %mm3
-        	paddd	0x12345678,%mm3
-
-// CHECK: 	paddd	%mm3, %mm3
-        	paddd	%mm3,%mm3
-
-// CHECK: 	paddd	3735928559(%ebx,%ecx,8), %xmm5
-        	paddd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	paddd	69, %xmm5
-        	paddd	0x45,%xmm5
-
-// CHECK: 	paddd	32493, %xmm5
-        	paddd	0x7eed,%xmm5
-
-// CHECK: 	paddd	3133065982, %xmm5
-        	paddd	0xbabecafe,%xmm5
-
-// CHECK: 	paddd	305419896, %xmm5
-        	paddd	0x12345678,%xmm5
-
-// CHECK: 	paddd	%xmm5, %xmm5
-        	paddd	%xmm5,%xmm5
-
-// CHECK: 	paddq	3735928559(%ebx,%ecx,8), %mm3
-        	paddq	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	paddq	69, %mm3
-        	paddq	0x45,%mm3
-
-// CHECK: 	paddq	32493, %mm3
-        	paddq	0x7eed,%mm3
-
-// CHECK: 	paddq	3133065982, %mm3
-        	paddq	0xbabecafe,%mm3
-
-// CHECK: 	paddq	305419896, %mm3
-        	paddq	0x12345678,%mm3
-
-// CHECK: 	paddq	%mm3, %mm3
-        	paddq	%mm3,%mm3
-
-// CHECK: 	paddq	3735928559(%ebx,%ecx,8), %xmm5
-        	paddq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	paddq	69, %xmm5
-        	paddq	0x45,%xmm5
-
-// CHECK: 	paddq	32493, %xmm5
-        	paddq	0x7eed,%xmm5
-
-// CHECK: 	paddq	3133065982, %xmm5
-        	paddq	0xbabecafe,%xmm5
-
-// CHECK: 	paddq	305419896, %xmm5
-        	paddq	0x12345678,%xmm5
-
-// CHECK: 	paddq	%xmm5, %xmm5
-        	paddq	%xmm5,%xmm5
-
-// CHECK: 	paddsb	3735928559(%ebx,%ecx,8), %mm3
-        	paddsb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	paddsb	69, %mm3
-        	paddsb	0x45,%mm3
-
-// CHECK: 	paddsb	32493, %mm3
-        	paddsb	0x7eed,%mm3
-
-// CHECK: 	paddsb	3133065982, %mm3
-        	paddsb	0xbabecafe,%mm3
-
-// CHECK: 	paddsb	305419896, %mm3
-        	paddsb	0x12345678,%mm3
-
-// CHECK: 	paddsb	%mm3, %mm3
-        	paddsb	%mm3,%mm3
-
-// CHECK: 	paddsb	3735928559(%ebx,%ecx,8), %xmm5
-        	paddsb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	paddsb	69, %xmm5
-        	paddsb	0x45,%xmm5
-
-// CHECK: 	paddsb	32493, %xmm5
-        	paddsb	0x7eed,%xmm5
-
-// CHECK: 	paddsb	3133065982, %xmm5
-        	paddsb	0xbabecafe,%xmm5
-
-// CHECK: 	paddsb	305419896, %xmm5
-        	paddsb	0x12345678,%xmm5
-
-// CHECK: 	paddsb	%xmm5, %xmm5
-        	paddsb	%xmm5,%xmm5
-
-// CHECK: 	paddsw	3735928559(%ebx,%ecx,8), %mm3
-        	paddsw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	paddsw	69, %mm3
-        	paddsw	0x45,%mm3
-
-// CHECK: 	paddsw	32493, %mm3
-        	paddsw	0x7eed,%mm3
-
-// CHECK: 	paddsw	3133065982, %mm3
-        	paddsw	0xbabecafe,%mm3
-
-// CHECK: 	paddsw	305419896, %mm3
-        	paddsw	0x12345678,%mm3
-
-// CHECK: 	paddsw	%mm3, %mm3
-        	paddsw	%mm3,%mm3
-
-// CHECK: 	paddsw	3735928559(%ebx,%ecx,8), %xmm5
-        	paddsw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	paddsw	69, %xmm5
-        	paddsw	0x45,%xmm5
-
-// CHECK: 	paddsw	32493, %xmm5
-        	paddsw	0x7eed,%xmm5
-
-// CHECK: 	paddsw	3133065982, %xmm5
-        	paddsw	0xbabecafe,%xmm5
-
-// CHECK: 	paddsw	305419896, %xmm5
-        	paddsw	0x12345678,%xmm5
-
-// CHECK: 	paddsw	%xmm5, %xmm5
-        	paddsw	%xmm5,%xmm5
-
-// CHECK: 	paddusb	3735928559(%ebx,%ecx,8), %mm3
-        	paddusb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	paddusb	69, %mm3
-        	paddusb	0x45,%mm3
-
-// CHECK: 	paddusb	32493, %mm3
-        	paddusb	0x7eed,%mm3
-
-// CHECK: 	paddusb	3133065982, %mm3
-        	paddusb	0xbabecafe,%mm3
-
-// CHECK: 	paddusb	305419896, %mm3
-        	paddusb	0x12345678,%mm3
-
-// CHECK: 	paddusb	%mm3, %mm3
-        	paddusb	%mm3,%mm3
-
-// CHECK: 	paddusb	3735928559(%ebx,%ecx,8), %xmm5
-        	paddusb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	paddusb	69, %xmm5
-        	paddusb	0x45,%xmm5
-
-// CHECK: 	paddusb	32493, %xmm5
-        	paddusb	0x7eed,%xmm5
-
-// CHECK: 	paddusb	3133065982, %xmm5
-        	paddusb	0xbabecafe,%xmm5
-
-// CHECK: 	paddusb	305419896, %xmm5
-        	paddusb	0x12345678,%xmm5
-
-// CHECK: 	paddusb	%xmm5, %xmm5
-        	paddusb	%xmm5,%xmm5
-
-// CHECK: 	paddusw	3735928559(%ebx,%ecx,8), %mm3
-        	paddusw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	paddusw	69, %mm3
-        	paddusw	0x45,%mm3
-
-// CHECK: 	paddusw	32493, %mm3
-        	paddusw	0x7eed,%mm3
-
-// CHECK: 	paddusw	3133065982, %mm3
-        	paddusw	0xbabecafe,%mm3
-
-// CHECK: 	paddusw	305419896, %mm3
-        	paddusw	0x12345678,%mm3
-
-// CHECK: 	paddusw	%mm3, %mm3
-        	paddusw	%mm3,%mm3
-
-// CHECK: 	paddusw	3735928559(%ebx,%ecx,8), %xmm5
-        	paddusw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	paddusw	69, %xmm5
-        	paddusw	0x45,%xmm5
-
-// CHECK: 	paddusw	32493, %xmm5
-        	paddusw	0x7eed,%xmm5
-
-// CHECK: 	paddusw	3133065982, %xmm5
-        	paddusw	0xbabecafe,%xmm5
-
-// CHECK: 	paddusw	305419896, %xmm5
-        	paddusw	0x12345678,%xmm5
-
-// CHECK: 	paddusw	%xmm5, %xmm5
-        	paddusw	%xmm5,%xmm5
-
-// CHECK: 	pand	3735928559(%ebx,%ecx,8), %mm3
-        	pand	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pand	69, %mm3
-        	pand	0x45,%mm3
-
-// CHECK: 	pand	32493, %mm3
-        	pand	0x7eed,%mm3
-
-// CHECK: 	pand	3133065982, %mm3
-        	pand	0xbabecafe,%mm3
-
-// CHECK: 	pand	305419896, %mm3
-        	pand	0x12345678,%mm3
-
-// CHECK: 	pand	%mm3, %mm3
-        	pand	%mm3,%mm3
-
-// CHECK: 	pand	3735928559(%ebx,%ecx,8), %xmm5
-        	pand	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pand	69, %xmm5
-        	pand	0x45,%xmm5
-
-// CHECK: 	pand	32493, %xmm5
-        	pand	0x7eed,%xmm5
-
-// CHECK: 	pand	3133065982, %xmm5
-        	pand	0xbabecafe,%xmm5
-
-// CHECK: 	pand	305419896, %xmm5
-        	pand	0x12345678,%xmm5
-
-// CHECK: 	pand	%xmm5, %xmm5
-        	pand	%xmm5,%xmm5
-
-// CHECK: 	pandn	3735928559(%ebx,%ecx,8), %mm3
-        	pandn	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pandn	69, %mm3
-        	pandn	0x45,%mm3
-
-// CHECK: 	pandn	32493, %mm3
-        	pandn	0x7eed,%mm3
-
-// CHECK: 	pandn	3133065982, %mm3
-        	pandn	0xbabecafe,%mm3
-
-// CHECK: 	pandn	305419896, %mm3
-        	pandn	0x12345678,%mm3
-
-// CHECK: 	pandn	%mm3, %mm3
-        	pandn	%mm3,%mm3
-
-// CHECK: 	pandn	3735928559(%ebx,%ecx,8), %xmm5
-        	pandn	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pandn	69, %xmm5
-        	pandn	0x45,%xmm5
-
-// CHECK: 	pandn	32493, %xmm5
-        	pandn	0x7eed,%xmm5
-
-// CHECK: 	pandn	3133065982, %xmm5
-        	pandn	0xbabecafe,%xmm5
-
-// CHECK: 	pandn	305419896, %xmm5
-        	pandn	0x12345678,%xmm5
-
-// CHECK: 	pandn	%xmm5, %xmm5
-        	pandn	%xmm5,%xmm5
-
-// CHECK: 	pcmpeqb	3735928559(%ebx,%ecx,8), %mm3
-        	pcmpeqb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pcmpeqb	69, %mm3
-        	pcmpeqb	0x45,%mm3
-
-// CHECK: 	pcmpeqb	32493, %mm3
-        	pcmpeqb	0x7eed,%mm3
-
-// CHECK: 	pcmpeqb	3133065982, %mm3
-        	pcmpeqb	0xbabecafe,%mm3
-
-// CHECK: 	pcmpeqb	305419896, %mm3
-        	pcmpeqb	0x12345678,%mm3
-
-// CHECK: 	pcmpeqb	%mm3, %mm3
-        	pcmpeqb	%mm3,%mm3
-
-// CHECK: 	pcmpeqb	3735928559(%ebx,%ecx,8), %xmm5
-        	pcmpeqb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pcmpeqb	69, %xmm5
-        	pcmpeqb	0x45,%xmm5
-
-// CHECK: 	pcmpeqb	32493, %xmm5
-        	pcmpeqb	0x7eed,%xmm5
-
-// CHECK: 	pcmpeqb	3133065982, %xmm5
-        	pcmpeqb	0xbabecafe,%xmm5
-
-// CHECK: 	pcmpeqb	305419896, %xmm5
-        	pcmpeqb	0x12345678,%xmm5
-
-// CHECK: 	pcmpeqb	%xmm5, %xmm5
-        	pcmpeqb	%xmm5,%xmm5
-
-// CHECK: 	pcmpeqw	3735928559(%ebx,%ecx,8), %mm3
-        	pcmpeqw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pcmpeqw	69, %mm3
-        	pcmpeqw	0x45,%mm3
-
-// CHECK: 	pcmpeqw	32493, %mm3
-        	pcmpeqw	0x7eed,%mm3
-
-// CHECK: 	pcmpeqw	3133065982, %mm3
-        	pcmpeqw	0xbabecafe,%mm3
-
-// CHECK: 	pcmpeqw	305419896, %mm3
-        	pcmpeqw	0x12345678,%mm3
-
-// CHECK: 	pcmpeqw	%mm3, %mm3
-        	pcmpeqw	%mm3,%mm3
-
-// CHECK: 	pcmpeqw	3735928559(%ebx,%ecx,8), %xmm5
-        	pcmpeqw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pcmpeqw	69, %xmm5
-        	pcmpeqw	0x45,%xmm5
-
-// CHECK: 	pcmpeqw	32493, %xmm5
-        	pcmpeqw	0x7eed,%xmm5
-
-// CHECK: 	pcmpeqw	3133065982, %xmm5
-        	pcmpeqw	0xbabecafe,%xmm5
-
-// CHECK: 	pcmpeqw	305419896, %xmm5
-        	pcmpeqw	0x12345678,%xmm5
-
-// CHECK: 	pcmpeqw	%xmm5, %xmm5
-        	pcmpeqw	%xmm5,%xmm5
-
-// CHECK: 	pcmpeqd	3735928559(%ebx,%ecx,8), %mm3
-        	pcmpeqd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pcmpeqd	69, %mm3
-        	pcmpeqd	0x45,%mm3
-
-// CHECK: 	pcmpeqd	32493, %mm3
-        	pcmpeqd	0x7eed,%mm3
-
-// CHECK: 	pcmpeqd	3133065982, %mm3
-        	pcmpeqd	0xbabecafe,%mm3
-
-// CHECK: 	pcmpeqd	305419896, %mm3
-        	pcmpeqd	0x12345678,%mm3
-
-// CHECK: 	pcmpeqd	%mm3, %mm3
-        	pcmpeqd	%mm3,%mm3
-
-// CHECK: 	pcmpeqd	3735928559(%ebx,%ecx,8), %xmm5
-        	pcmpeqd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pcmpeqd	69, %xmm5
-        	pcmpeqd	0x45,%xmm5
-
-// CHECK: 	pcmpeqd	32493, %xmm5
-        	pcmpeqd	0x7eed,%xmm5
-
-// CHECK: 	pcmpeqd	3133065982, %xmm5
-        	pcmpeqd	0xbabecafe,%xmm5
-
-// CHECK: 	pcmpeqd	305419896, %xmm5
-        	pcmpeqd	0x12345678,%xmm5
-
-// CHECK: 	pcmpeqd	%xmm5, %xmm5
-        	pcmpeqd	%xmm5,%xmm5
-
-// CHECK: 	pcmpgtb	3735928559(%ebx,%ecx,8), %mm3
-        	pcmpgtb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pcmpgtb	69, %mm3
-        	pcmpgtb	0x45,%mm3
-
-// CHECK: 	pcmpgtb	32493, %mm3
-        	pcmpgtb	0x7eed,%mm3
-
-// CHECK: 	pcmpgtb	3133065982, %mm3
-        	pcmpgtb	0xbabecafe,%mm3
-
-// CHECK: 	pcmpgtb	305419896, %mm3
-        	pcmpgtb	0x12345678,%mm3
-
-// CHECK: 	pcmpgtb	%mm3, %mm3
-        	pcmpgtb	%mm3,%mm3
-
-// CHECK: 	pcmpgtb	3735928559(%ebx,%ecx,8), %xmm5
-        	pcmpgtb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pcmpgtb	69, %xmm5
-        	pcmpgtb	0x45,%xmm5
-
-// CHECK: 	pcmpgtb	32493, %xmm5
-        	pcmpgtb	0x7eed,%xmm5
-
-// CHECK: 	pcmpgtb	3133065982, %xmm5
-        	pcmpgtb	0xbabecafe,%xmm5
-
-// CHECK: 	pcmpgtb	305419896, %xmm5
-        	pcmpgtb	0x12345678,%xmm5
-
-// CHECK: 	pcmpgtb	%xmm5, %xmm5
-        	pcmpgtb	%xmm5,%xmm5
-
-// CHECK: 	pcmpgtw	3735928559(%ebx,%ecx,8), %mm3
-        	pcmpgtw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pcmpgtw	69, %mm3
-        	pcmpgtw	0x45,%mm3
-
-// CHECK: 	pcmpgtw	32493, %mm3
-        	pcmpgtw	0x7eed,%mm3
-
-// CHECK: 	pcmpgtw	3133065982, %mm3
-        	pcmpgtw	0xbabecafe,%mm3
-
-// CHECK: 	pcmpgtw	305419896, %mm3
-        	pcmpgtw	0x12345678,%mm3
-
-// CHECK: 	pcmpgtw	%mm3, %mm3
-        	pcmpgtw	%mm3,%mm3
-
-// CHECK: 	pcmpgtw	3735928559(%ebx,%ecx,8), %xmm5
-        	pcmpgtw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pcmpgtw	69, %xmm5
-        	pcmpgtw	0x45,%xmm5
-
-// CHECK: 	pcmpgtw	32493, %xmm5
-        	pcmpgtw	0x7eed,%xmm5
-
-// CHECK: 	pcmpgtw	3133065982, %xmm5
-        	pcmpgtw	0xbabecafe,%xmm5
-
-// CHECK: 	pcmpgtw	305419896, %xmm5
-        	pcmpgtw	0x12345678,%xmm5
-
-// CHECK: 	pcmpgtw	%xmm5, %xmm5
-        	pcmpgtw	%xmm5,%xmm5
-
-// CHECK: 	pcmpgtd	3735928559(%ebx,%ecx,8), %mm3
-        	pcmpgtd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pcmpgtd	69, %mm3
-        	pcmpgtd	0x45,%mm3
-
-// CHECK: 	pcmpgtd	32493, %mm3
-        	pcmpgtd	0x7eed,%mm3
-
-// CHECK: 	pcmpgtd	3133065982, %mm3
-        	pcmpgtd	0xbabecafe,%mm3
-
-// CHECK: 	pcmpgtd	305419896, %mm3
-        	pcmpgtd	0x12345678,%mm3
-
-// CHECK: 	pcmpgtd	%mm3, %mm3
-        	pcmpgtd	%mm3,%mm3
-
-// CHECK: 	pcmpgtd	3735928559(%ebx,%ecx,8), %xmm5
-        	pcmpgtd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pcmpgtd	69, %xmm5
-        	pcmpgtd	0x45,%xmm5
-
-// CHECK: 	pcmpgtd	32493, %xmm5
-        	pcmpgtd	0x7eed,%xmm5
-
-// CHECK: 	pcmpgtd	3133065982, %xmm5
-        	pcmpgtd	0xbabecafe,%xmm5
-
-// CHECK: 	pcmpgtd	305419896, %xmm5
-        	pcmpgtd	0x12345678,%xmm5
-
-// CHECK: 	pcmpgtd	%xmm5, %xmm5
-        	pcmpgtd	%xmm5,%xmm5
-
-// CHECK: 	pmaddwd	3735928559(%ebx,%ecx,8), %mm3
-        	pmaddwd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pmaddwd	69, %mm3
-        	pmaddwd	0x45,%mm3
-
-// CHECK: 	pmaddwd	32493, %mm3
-        	pmaddwd	0x7eed,%mm3
-
-// CHECK: 	pmaddwd	3133065982, %mm3
-        	pmaddwd	0xbabecafe,%mm3
-
-// CHECK: 	pmaddwd	305419896, %mm3
-        	pmaddwd	0x12345678,%mm3
-
-// CHECK: 	pmaddwd	%mm3, %mm3
-        	pmaddwd	%mm3,%mm3
-
-// CHECK: 	pmaddwd	3735928559(%ebx,%ecx,8), %xmm5
-        	pmaddwd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmaddwd	69, %xmm5
-        	pmaddwd	0x45,%xmm5
-
-// CHECK: 	pmaddwd	32493, %xmm5
-        	pmaddwd	0x7eed,%xmm5
-
-// CHECK: 	pmaddwd	3133065982, %xmm5
-        	pmaddwd	0xbabecafe,%xmm5
-
-// CHECK: 	pmaddwd	305419896, %xmm5
-        	pmaddwd	0x12345678,%xmm5
-
-// CHECK: 	pmaddwd	%xmm5, %xmm5
-        	pmaddwd	%xmm5,%xmm5
-
-// CHECK: 	pmulhw	3735928559(%ebx,%ecx,8), %mm3
-        	pmulhw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pmulhw	69, %mm3
-        	pmulhw	0x45,%mm3
-
-// CHECK: 	pmulhw	32493, %mm3
-        	pmulhw	0x7eed,%mm3
-
-// CHECK: 	pmulhw	3133065982, %mm3
-        	pmulhw	0xbabecafe,%mm3
-
-// CHECK: 	pmulhw	305419896, %mm3
-        	pmulhw	0x12345678,%mm3
-
-// CHECK: 	pmulhw	%mm3, %mm3
-        	pmulhw	%mm3,%mm3
-
-// CHECK: 	pmulhw	3735928559(%ebx,%ecx,8), %xmm5
-        	pmulhw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmulhw	69, %xmm5
-        	pmulhw	0x45,%xmm5
-
-// CHECK: 	pmulhw	32493, %xmm5
-        	pmulhw	0x7eed,%xmm5
-
-// CHECK: 	pmulhw	3133065982, %xmm5
-        	pmulhw	0xbabecafe,%xmm5
-
-// CHECK: 	pmulhw	305419896, %xmm5
-        	pmulhw	0x12345678,%xmm5
-
-// CHECK: 	pmulhw	%xmm5, %xmm5
-        	pmulhw	%xmm5,%xmm5
-
-// CHECK: 	pmullw	3735928559(%ebx,%ecx,8), %mm3
-        	pmullw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pmullw	69, %mm3
-        	pmullw	0x45,%mm3
-
-// CHECK: 	pmullw	32493, %mm3
-        	pmullw	0x7eed,%mm3
-
-// CHECK: 	pmullw	3133065982, %mm3
-        	pmullw	0xbabecafe,%mm3
-
-// CHECK: 	pmullw	305419896, %mm3
-        	pmullw	0x12345678,%mm3
-
-// CHECK: 	pmullw	%mm3, %mm3
-        	pmullw	%mm3,%mm3
-
-// CHECK: 	pmullw	3735928559(%ebx,%ecx,8), %xmm5
-        	pmullw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmullw	69, %xmm5
-        	pmullw	0x45,%xmm5
-
-// CHECK: 	pmullw	32493, %xmm5
-        	pmullw	0x7eed,%xmm5
-
-// CHECK: 	pmullw	3133065982, %xmm5
-        	pmullw	0xbabecafe,%xmm5
-
-// CHECK: 	pmullw	305419896, %xmm5
-        	pmullw	0x12345678,%xmm5
-
-// CHECK: 	pmullw	%xmm5, %xmm5
-        	pmullw	%xmm5,%xmm5
-
-// CHECK: 	por	3735928559(%ebx,%ecx,8), %mm3
-        	por	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	por	69, %mm3
-        	por	0x45,%mm3
-
-// CHECK: 	por	32493, %mm3
-        	por	0x7eed,%mm3
-
-// CHECK: 	por	3133065982, %mm3
-        	por	0xbabecafe,%mm3
-
-// CHECK: 	por	305419896, %mm3
-        	por	0x12345678,%mm3
-
-// CHECK: 	por	%mm3, %mm3
-        	por	%mm3,%mm3
-
-// CHECK: 	por	3735928559(%ebx,%ecx,8), %xmm5
-        	por	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	por	69, %xmm5
-        	por	0x45,%xmm5
-
-// CHECK: 	por	32493, %xmm5
-        	por	0x7eed,%xmm5
-
-// CHECK: 	por	3133065982, %xmm5
-        	por	0xbabecafe,%xmm5
-
-// CHECK: 	por	305419896, %xmm5
-        	por	0x12345678,%xmm5
-
-// CHECK: 	por	%xmm5, %xmm5
-        	por	%xmm5,%xmm5
-
-// CHECK: 	psllw	3735928559(%ebx,%ecx,8), %mm3
-        	psllw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psllw	69, %mm3
-        	psllw	0x45,%mm3
-
-// CHECK: 	psllw	32493, %mm3
-        	psllw	0x7eed,%mm3
-
-// CHECK: 	psllw	3133065982, %mm3
-        	psllw	0xbabecafe,%mm3
-
-// CHECK: 	psllw	305419896, %mm3
-        	psllw	0x12345678,%mm3
-
-// CHECK: 	psllw	%mm3, %mm3
-        	psllw	%mm3,%mm3
-
-// CHECK: 	psllw	3735928559(%ebx,%ecx,8), %xmm5
-        	psllw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psllw	69, %xmm5
-        	psllw	0x45,%xmm5
-
-// CHECK: 	psllw	32493, %xmm5
-        	psllw	0x7eed,%xmm5
-
-// CHECK: 	psllw	3133065982, %xmm5
-        	psllw	0xbabecafe,%xmm5
-
-// CHECK: 	psllw	305419896, %xmm5
-        	psllw	0x12345678,%xmm5
-
-// CHECK: 	psllw	%xmm5, %xmm5
-        	psllw	%xmm5,%xmm5
-
-// CHECK: 	psllw	$127, %mm3
-        	psllw	$0x7f,%mm3
-
-// CHECK: 	psllw	$127, %xmm5
-        	psllw	$0x7f,%xmm5
-
-// CHECK: 	pslld	3735928559(%ebx,%ecx,8), %mm3
-        	pslld	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pslld	69, %mm3
-        	pslld	0x45,%mm3
-
-// CHECK: 	pslld	32493, %mm3
-        	pslld	0x7eed,%mm3
-
-// CHECK: 	pslld	3133065982, %mm3
-        	pslld	0xbabecafe,%mm3
-
-// CHECK: 	pslld	305419896, %mm3
-        	pslld	0x12345678,%mm3
-
-// CHECK: 	pslld	%mm3, %mm3
-        	pslld	%mm3,%mm3
-
-// CHECK: 	pslld	3735928559(%ebx,%ecx,8), %xmm5
-        	pslld	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pslld	69, %xmm5
-        	pslld	0x45,%xmm5
-
-// CHECK: 	pslld	32493, %xmm5
-        	pslld	0x7eed,%xmm5
-
-// CHECK: 	pslld	3133065982, %xmm5
-        	pslld	0xbabecafe,%xmm5
-
-// CHECK: 	pslld	305419896, %xmm5
-        	pslld	0x12345678,%xmm5
-
-// CHECK: 	pslld	%xmm5, %xmm5
-        	pslld	%xmm5,%xmm5
-
-// CHECK: 	pslld	$127, %mm3
-        	pslld	$0x7f,%mm3
-
-// CHECK: 	pslld	$127, %xmm5
-        	pslld	$0x7f,%xmm5
-
-// CHECK: 	psllq	3735928559(%ebx,%ecx,8), %mm3
-        	psllq	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psllq	69, %mm3
-        	psllq	0x45,%mm3
-
-// CHECK: 	psllq	32493, %mm3
-        	psllq	0x7eed,%mm3
-
-// CHECK: 	psllq	3133065982, %mm3
-        	psllq	0xbabecafe,%mm3
-
-// CHECK: 	psllq	305419896, %mm3
-        	psllq	0x12345678,%mm3
-
-// CHECK: 	psllq	%mm3, %mm3
-        	psllq	%mm3,%mm3
-
-// CHECK: 	psllq	3735928559(%ebx,%ecx,8), %xmm5
-        	psllq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psllq	69, %xmm5
-        	psllq	0x45,%xmm5
-
-// CHECK: 	psllq	32493, %xmm5
-        	psllq	0x7eed,%xmm5
-
-// CHECK: 	psllq	3133065982, %xmm5
-        	psllq	0xbabecafe,%xmm5
-
-// CHECK: 	psllq	305419896, %xmm5
-        	psllq	0x12345678,%xmm5
-
-// CHECK: 	psllq	%xmm5, %xmm5
-        	psllq	%xmm5,%xmm5
-
-// CHECK: 	psllq	$127, %mm3
-        	psllq	$0x7f,%mm3
-
-// CHECK: 	psllq	$127, %xmm5
-        	psllq	$0x7f,%xmm5
-
-// CHECK: 	psraw	3735928559(%ebx,%ecx,8), %mm3
-        	psraw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psraw	69, %mm3
-        	psraw	0x45,%mm3
-
-// CHECK: 	psraw	32493, %mm3
-        	psraw	0x7eed,%mm3
-
-// CHECK: 	psraw	3133065982, %mm3
-        	psraw	0xbabecafe,%mm3
-
-// CHECK: 	psraw	305419896, %mm3
-        	psraw	0x12345678,%mm3
-
-// CHECK: 	psraw	%mm3, %mm3
-        	psraw	%mm3,%mm3
-
-// CHECK: 	psraw	3735928559(%ebx,%ecx,8), %xmm5
-        	psraw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psraw	69, %xmm5
-        	psraw	0x45,%xmm5
-
-// CHECK: 	psraw	32493, %xmm5
-        	psraw	0x7eed,%xmm5
-
-// CHECK: 	psraw	3133065982, %xmm5
-        	psraw	0xbabecafe,%xmm5
-
-// CHECK: 	psraw	305419896, %xmm5
-        	psraw	0x12345678,%xmm5
-
-// CHECK: 	psraw	%xmm5, %xmm5
-        	psraw	%xmm5,%xmm5
-
-// CHECK: 	psraw	$127, %mm3
-        	psraw	$0x7f,%mm3
-
-// CHECK: 	psraw	$127, %xmm5
-        	psraw	$0x7f,%xmm5
-
-// CHECK: 	psrad	3735928559(%ebx,%ecx,8), %mm3
-        	psrad	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psrad	69, %mm3
-        	psrad	0x45,%mm3
-
-// CHECK: 	psrad	32493, %mm3
-        	psrad	0x7eed,%mm3
-
-// CHECK: 	psrad	3133065982, %mm3
-        	psrad	0xbabecafe,%mm3
-
-// CHECK: 	psrad	305419896, %mm3
-        	psrad	0x12345678,%mm3
-
-// CHECK: 	psrad	%mm3, %mm3
-        	psrad	%mm3,%mm3
-
-// CHECK: 	psrad	3735928559(%ebx,%ecx,8), %xmm5
-        	psrad	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psrad	69, %xmm5
-        	psrad	0x45,%xmm5
-
-// CHECK: 	psrad	32493, %xmm5
-        	psrad	0x7eed,%xmm5
-
-// CHECK: 	psrad	3133065982, %xmm5
-        	psrad	0xbabecafe,%xmm5
-
-// CHECK: 	psrad	305419896, %xmm5
-        	psrad	0x12345678,%xmm5
-
-// CHECK: 	psrad	%xmm5, %xmm5
-        	psrad	%xmm5,%xmm5
-
-// CHECK: 	psrad	$127, %mm3
-        	psrad	$0x7f,%mm3
-
-// CHECK: 	psrad	$127, %xmm5
-        	psrad	$0x7f,%xmm5
-
-// CHECK: 	psrlw	3735928559(%ebx,%ecx,8), %mm3
-        	psrlw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psrlw	69, %mm3
-        	psrlw	0x45,%mm3
-
-// CHECK: 	psrlw	32493, %mm3
-        	psrlw	0x7eed,%mm3
-
-// CHECK: 	psrlw	3133065982, %mm3
-        	psrlw	0xbabecafe,%mm3
-
-// CHECK: 	psrlw	305419896, %mm3
-        	psrlw	0x12345678,%mm3
-
-// CHECK: 	psrlw	%mm3, %mm3
-        	psrlw	%mm3,%mm3
-
-// CHECK: 	psrlw	3735928559(%ebx,%ecx,8), %xmm5
-        	psrlw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psrlw	69, %xmm5
-        	psrlw	0x45,%xmm5
-
-// CHECK: 	psrlw	32493, %xmm5
-        	psrlw	0x7eed,%xmm5
-
-// CHECK: 	psrlw	3133065982, %xmm5
-        	psrlw	0xbabecafe,%xmm5
-
-// CHECK: 	psrlw	305419896, %xmm5
-        	psrlw	0x12345678,%xmm5
-
-// CHECK: 	psrlw	%xmm5, %xmm5
-        	psrlw	%xmm5,%xmm5
-
-// CHECK: 	psrlw	$127, %mm3
-        	psrlw	$0x7f,%mm3
-
-// CHECK: 	psrlw	$127, %xmm5
-        	psrlw	$0x7f,%xmm5
-
-// CHECK: 	psrld	3735928559(%ebx,%ecx,8), %mm3
-        	psrld	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psrld	69, %mm3
-        	psrld	0x45,%mm3
-
-// CHECK: 	psrld	32493, %mm3
-        	psrld	0x7eed,%mm3
-
-// CHECK: 	psrld	3133065982, %mm3
-        	psrld	0xbabecafe,%mm3
-
-// CHECK: 	psrld	305419896, %mm3
-        	psrld	0x12345678,%mm3
-
-// CHECK: 	psrld	%mm3, %mm3
-        	psrld	%mm3,%mm3
-
-// CHECK: 	psrld	3735928559(%ebx,%ecx,8), %xmm5
-        	psrld	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psrld	69, %xmm5
-        	psrld	0x45,%xmm5
-
-// CHECK: 	psrld	32493, %xmm5
-        	psrld	0x7eed,%xmm5
-
-// CHECK: 	psrld	3133065982, %xmm5
-        	psrld	0xbabecafe,%xmm5
-
-// CHECK: 	psrld	305419896, %xmm5
-        	psrld	0x12345678,%xmm5
-
-// CHECK: 	psrld	%xmm5, %xmm5
-        	psrld	%xmm5,%xmm5
-
-// CHECK: 	psrld	$127, %mm3
-        	psrld	$0x7f,%mm3
-
-// CHECK: 	psrld	$127, %xmm5
-        	psrld	$0x7f,%xmm5
-
-// CHECK: 	psrlq	3735928559(%ebx,%ecx,8), %mm3
-        	psrlq	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psrlq	69, %mm3
-        	psrlq	0x45,%mm3
-
-// CHECK: 	psrlq	32493, %mm3
-        	psrlq	0x7eed,%mm3
-
-// CHECK: 	psrlq	3133065982, %mm3
-        	psrlq	0xbabecafe,%mm3
-
-// CHECK: 	psrlq	305419896, %mm3
-        	psrlq	0x12345678,%mm3
-
-// CHECK: 	psrlq	%mm3, %mm3
-        	psrlq	%mm3,%mm3
-
-// CHECK: 	psrlq	3735928559(%ebx,%ecx,8), %xmm5
-        	psrlq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psrlq	69, %xmm5
-        	psrlq	0x45,%xmm5
-
-// CHECK: 	psrlq	32493, %xmm5
-        	psrlq	0x7eed,%xmm5
-
-// CHECK: 	psrlq	3133065982, %xmm5
-        	psrlq	0xbabecafe,%xmm5
-
-// CHECK: 	psrlq	305419896, %xmm5
-        	psrlq	0x12345678,%xmm5
-
-// CHECK: 	psrlq	%xmm5, %xmm5
-        	psrlq	%xmm5,%xmm5
-
-// CHECK: 	psrlq	$127, %mm3
-        	psrlq	$0x7f,%mm3
-
-// CHECK: 	psrlq	$127, %xmm5
-        	psrlq	$0x7f,%xmm5
-
-// CHECK: 	psubb	3735928559(%ebx,%ecx,8), %mm3
-        	psubb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psubb	69, %mm3
-        	psubb	0x45,%mm3
-
-// CHECK: 	psubb	32493, %mm3
-        	psubb	0x7eed,%mm3
-
-// CHECK: 	psubb	3133065982, %mm3
-        	psubb	0xbabecafe,%mm3
-
-// CHECK: 	psubb	305419896, %mm3
-        	psubb	0x12345678,%mm3
-
-// CHECK: 	psubb	%mm3, %mm3
-        	psubb	%mm3,%mm3
-
-// CHECK: 	psubb	3735928559(%ebx,%ecx,8), %xmm5
-        	psubb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psubb	69, %xmm5
-        	psubb	0x45,%xmm5
-
-// CHECK: 	psubb	32493, %xmm5
-        	psubb	0x7eed,%xmm5
-
-// CHECK: 	psubb	3133065982, %xmm5
-        	psubb	0xbabecafe,%xmm5
-
-// CHECK: 	psubb	305419896, %xmm5
-        	psubb	0x12345678,%xmm5
-
-// CHECK: 	psubb	%xmm5, %xmm5
-        	psubb	%xmm5,%xmm5
-
-// CHECK: 	psubw	3735928559(%ebx,%ecx,8), %mm3
-        	psubw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psubw	69, %mm3
-        	psubw	0x45,%mm3
-
-// CHECK: 	psubw	32493, %mm3
-        	psubw	0x7eed,%mm3
-
-// CHECK: 	psubw	3133065982, %mm3
-        	psubw	0xbabecafe,%mm3
-
-// CHECK: 	psubw	305419896, %mm3
-        	psubw	0x12345678,%mm3
-
-// CHECK: 	psubw	%mm3, %mm3
-        	psubw	%mm3,%mm3
-
-// CHECK: 	psubw	3735928559(%ebx,%ecx,8), %xmm5
-        	psubw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psubw	69, %xmm5
-        	psubw	0x45,%xmm5
-
-// CHECK: 	psubw	32493, %xmm5
-        	psubw	0x7eed,%xmm5
-
-// CHECK: 	psubw	3133065982, %xmm5
-        	psubw	0xbabecafe,%xmm5
-
-// CHECK: 	psubw	305419896, %xmm5
-        	psubw	0x12345678,%xmm5
-
-// CHECK: 	psubw	%xmm5, %xmm5
-        	psubw	%xmm5,%xmm5
-
-// CHECK: 	psubd	3735928559(%ebx,%ecx,8), %mm3
-        	psubd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psubd	69, %mm3
-        	psubd	0x45,%mm3
-
-// CHECK: 	psubd	32493, %mm3
-        	psubd	0x7eed,%mm3
-
-// CHECK: 	psubd	3133065982, %mm3
-        	psubd	0xbabecafe,%mm3
-
-// CHECK: 	psubd	305419896, %mm3
-        	psubd	0x12345678,%mm3
-
-// CHECK: 	psubd	%mm3, %mm3
-        	psubd	%mm3,%mm3
-
-// CHECK: 	psubd	3735928559(%ebx,%ecx,8), %xmm5
-        	psubd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psubd	69, %xmm5
-        	psubd	0x45,%xmm5
-
-// CHECK: 	psubd	32493, %xmm5
-        	psubd	0x7eed,%xmm5
-
-// CHECK: 	psubd	3133065982, %xmm5
-        	psubd	0xbabecafe,%xmm5
-
-// CHECK: 	psubd	305419896, %xmm5
-        	psubd	0x12345678,%xmm5
-
-// CHECK: 	psubd	%xmm5, %xmm5
-        	psubd	%xmm5,%xmm5
-
-// CHECK: 	psubq	3735928559(%ebx,%ecx,8), %mm3
-        	psubq	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psubq	69, %mm3
-        	psubq	0x45,%mm3
-
-// CHECK: 	psubq	32493, %mm3
-        	psubq	0x7eed,%mm3
-
-// CHECK: 	psubq	3133065982, %mm3
-        	psubq	0xbabecafe,%mm3
-
-// CHECK: 	psubq	305419896, %mm3
-        	psubq	0x12345678,%mm3
-
-// CHECK: 	psubq	%mm3, %mm3
-        	psubq	%mm3,%mm3
-
-// CHECK: 	psubq	3735928559(%ebx,%ecx,8), %xmm5
-        	psubq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psubq	69, %xmm5
-        	psubq	0x45,%xmm5
-
-// CHECK: 	psubq	32493, %xmm5
-        	psubq	0x7eed,%xmm5
-
-// CHECK: 	psubq	3133065982, %xmm5
-        	psubq	0xbabecafe,%xmm5
-
-// CHECK: 	psubq	305419896, %xmm5
-        	psubq	0x12345678,%xmm5
-
-// CHECK: 	psubq	%xmm5, %xmm5
-        	psubq	%xmm5,%xmm5
-
-// CHECK: 	psubsb	3735928559(%ebx,%ecx,8), %mm3
-        	psubsb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psubsb	69, %mm3
-        	psubsb	0x45,%mm3
-
-// CHECK: 	psubsb	32493, %mm3
-        	psubsb	0x7eed,%mm3
-
-// CHECK: 	psubsb	3133065982, %mm3
-        	psubsb	0xbabecafe,%mm3
-
-// CHECK: 	psubsb	305419896, %mm3
-        	psubsb	0x12345678,%mm3
-
-// CHECK: 	psubsb	%mm3, %mm3
-        	psubsb	%mm3,%mm3
-
-// CHECK: 	psubsb	3735928559(%ebx,%ecx,8), %xmm5
-        	psubsb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psubsb	69, %xmm5
-        	psubsb	0x45,%xmm5
-
-// CHECK: 	psubsb	32493, %xmm5
-        	psubsb	0x7eed,%xmm5
-
-// CHECK: 	psubsb	3133065982, %xmm5
-        	psubsb	0xbabecafe,%xmm5
-
-// CHECK: 	psubsb	305419896, %xmm5
-        	psubsb	0x12345678,%xmm5
-
-// CHECK: 	psubsb	%xmm5, %xmm5
-        	psubsb	%xmm5,%xmm5
-
-// CHECK: 	psubsw	3735928559(%ebx,%ecx,8), %mm3
-        	psubsw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psubsw	69, %mm3
-        	psubsw	0x45,%mm3
-
-// CHECK: 	psubsw	32493, %mm3
-        	psubsw	0x7eed,%mm3
-
-// CHECK: 	psubsw	3133065982, %mm3
-        	psubsw	0xbabecafe,%mm3
-
-// CHECK: 	psubsw	305419896, %mm3
-        	psubsw	0x12345678,%mm3
-
-// CHECK: 	psubsw	%mm3, %mm3
-        	psubsw	%mm3,%mm3
-
-// CHECK: 	psubsw	3735928559(%ebx,%ecx,8), %xmm5
-        	psubsw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psubsw	69, %xmm5
-        	psubsw	0x45,%xmm5
-
-// CHECK: 	psubsw	32493, %xmm5
-        	psubsw	0x7eed,%xmm5
-
-// CHECK: 	psubsw	3133065982, %xmm5
-        	psubsw	0xbabecafe,%xmm5
-
-// CHECK: 	psubsw	305419896, %xmm5
-        	psubsw	0x12345678,%xmm5
-
-// CHECK: 	psubsw	%xmm5, %xmm5
-        	psubsw	%xmm5,%xmm5
-
-// CHECK: 	psubusb	3735928559(%ebx,%ecx,8), %mm3
-        	psubusb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psubusb	69, %mm3
-        	psubusb	0x45,%mm3
-
-// CHECK: 	psubusb	32493, %mm3
-        	psubusb	0x7eed,%mm3
-
-// CHECK: 	psubusb	3133065982, %mm3
-        	psubusb	0xbabecafe,%mm3
-
-// CHECK: 	psubusb	305419896, %mm3
-        	psubusb	0x12345678,%mm3
-
-// CHECK: 	psubusb	%mm3, %mm3
-        	psubusb	%mm3,%mm3
-
-// CHECK: 	psubusb	3735928559(%ebx,%ecx,8), %xmm5
-        	psubusb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psubusb	69, %xmm5
-        	psubusb	0x45,%xmm5
-
-// CHECK: 	psubusb	32493, %xmm5
-        	psubusb	0x7eed,%xmm5
-
-// CHECK: 	psubusb	3133065982, %xmm5
-        	psubusb	0xbabecafe,%xmm5
-
-// CHECK: 	psubusb	305419896, %xmm5
-        	psubusb	0x12345678,%xmm5
-
-// CHECK: 	psubusb	%xmm5, %xmm5
-        	psubusb	%xmm5,%xmm5
-
-// CHECK: 	psubusw	3735928559(%ebx,%ecx,8), %mm3
-        	psubusw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psubusw	69, %mm3
-        	psubusw	0x45,%mm3
-
-// CHECK: 	psubusw	32493, %mm3
-        	psubusw	0x7eed,%mm3
-
-// CHECK: 	psubusw	3133065982, %mm3
-        	psubusw	0xbabecafe,%mm3
-
-// CHECK: 	psubusw	305419896, %mm3
-        	psubusw	0x12345678,%mm3
-
-// CHECK: 	psubusw	%mm3, %mm3
-        	psubusw	%mm3,%mm3
-
-// CHECK: 	psubusw	3735928559(%ebx,%ecx,8), %xmm5
-        	psubusw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psubusw	69, %xmm5
-        	psubusw	0x45,%xmm5
-
-// CHECK: 	psubusw	32493, %xmm5
-        	psubusw	0x7eed,%xmm5
-
-// CHECK: 	psubusw	3133065982, %xmm5
-        	psubusw	0xbabecafe,%xmm5
-
-// CHECK: 	psubusw	305419896, %xmm5
-        	psubusw	0x12345678,%xmm5
-
-// CHECK: 	psubusw	%xmm5, %xmm5
-        	psubusw	%xmm5,%xmm5
-
-// CHECK: 	punpckhbw	3735928559(%ebx,%ecx,8), %mm3
-        	punpckhbw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	punpckhbw	69, %mm3
-        	punpckhbw	0x45,%mm3
-
-// CHECK: 	punpckhbw	32493, %mm3
-        	punpckhbw	0x7eed,%mm3
-
-// CHECK: 	punpckhbw	3133065982, %mm3
-        	punpckhbw	0xbabecafe,%mm3
-
-// CHECK: 	punpckhbw	305419896, %mm3
-        	punpckhbw	0x12345678,%mm3
-
-// CHECK: 	punpckhbw	%mm3, %mm3
-        	punpckhbw	%mm3,%mm3
-
-// CHECK: 	punpckhbw	3735928559(%ebx,%ecx,8), %xmm5
-        	punpckhbw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	punpckhbw	69, %xmm5
-        	punpckhbw	0x45,%xmm5
-
-// CHECK: 	punpckhbw	32493, %xmm5
-        	punpckhbw	0x7eed,%xmm5
-
-// CHECK: 	punpckhbw	3133065982, %xmm5
-        	punpckhbw	0xbabecafe,%xmm5
-
-// CHECK: 	punpckhbw	305419896, %xmm5
-        	punpckhbw	0x12345678,%xmm5
-
-// CHECK: 	punpckhbw	%xmm5, %xmm5
-        	punpckhbw	%xmm5,%xmm5
-
-// CHECK: 	punpckhwd	3735928559(%ebx,%ecx,8), %mm3
-        	punpckhwd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	punpckhwd	69, %mm3
-        	punpckhwd	0x45,%mm3
-
-// CHECK: 	punpckhwd	32493, %mm3
-        	punpckhwd	0x7eed,%mm3
-
-// CHECK: 	punpckhwd	3133065982, %mm3
-        	punpckhwd	0xbabecafe,%mm3
-
-// CHECK: 	punpckhwd	305419896, %mm3
-        	punpckhwd	0x12345678,%mm3
-
-// CHECK: 	punpckhwd	%mm3, %mm3
-        	punpckhwd	%mm3,%mm3
-
-// CHECK: 	punpckhwd	3735928559(%ebx,%ecx,8), %xmm5
-        	punpckhwd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	punpckhwd	69, %xmm5
-        	punpckhwd	0x45,%xmm5
-
-// CHECK: 	punpckhwd	32493, %xmm5
-        	punpckhwd	0x7eed,%xmm5
-
-// CHECK: 	punpckhwd	3133065982, %xmm5
-        	punpckhwd	0xbabecafe,%xmm5
-
-// CHECK: 	punpckhwd	305419896, %xmm5
-        	punpckhwd	0x12345678,%xmm5
-
-// CHECK: 	punpckhwd	%xmm5, %xmm5
-        	punpckhwd	%xmm5,%xmm5
-
-// CHECK: 	punpckhdq	3735928559(%ebx,%ecx,8), %mm3
-        	punpckhdq	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	punpckhdq	69, %mm3
-        	punpckhdq	0x45,%mm3
-
-// CHECK: 	punpckhdq	32493, %mm3
-        	punpckhdq	0x7eed,%mm3
-
-// CHECK: 	punpckhdq	3133065982, %mm3
-        	punpckhdq	0xbabecafe,%mm3
-
-// CHECK: 	punpckhdq	305419896, %mm3
-        	punpckhdq	0x12345678,%mm3
-
-// CHECK: 	punpckhdq	%mm3, %mm3
-        	punpckhdq	%mm3,%mm3
-
-// CHECK: 	punpckhdq	3735928559(%ebx,%ecx,8), %xmm5
-        	punpckhdq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	punpckhdq	69, %xmm5
-        	punpckhdq	0x45,%xmm5
-
-// CHECK: 	punpckhdq	32493, %xmm5
-        	punpckhdq	0x7eed,%xmm5
-
-// CHECK: 	punpckhdq	3133065982, %xmm5
-        	punpckhdq	0xbabecafe,%xmm5
-
-// CHECK: 	punpckhdq	305419896, %xmm5
-        	punpckhdq	0x12345678,%xmm5
-
-// CHECK: 	punpckhdq	%xmm5, %xmm5
-        	punpckhdq	%xmm5,%xmm5
-
-// CHECK: 	punpcklbw	3735928559(%ebx,%ecx,8), %mm3
-        	punpcklbw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	punpcklbw	69, %mm3
-        	punpcklbw	0x45,%mm3
-
-// CHECK: 	punpcklbw	32493, %mm3
-        	punpcklbw	0x7eed,%mm3
-
-// CHECK: 	punpcklbw	3133065982, %mm3
-        	punpcklbw	0xbabecafe,%mm3
-
-// CHECK: 	punpcklbw	305419896, %mm3
-        	punpcklbw	0x12345678,%mm3
-
-// CHECK: 	punpcklbw	%mm3, %mm3
-        	punpcklbw	%mm3,%mm3
-
-// CHECK: 	punpcklbw	3735928559(%ebx,%ecx,8), %xmm5
-        	punpcklbw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	punpcklbw	69, %xmm5
-        	punpcklbw	0x45,%xmm5
-
-// CHECK: 	punpcklbw	32493, %xmm5
-        	punpcklbw	0x7eed,%xmm5
-
-// CHECK: 	punpcklbw	3133065982, %xmm5
-        	punpcklbw	0xbabecafe,%xmm5
-
-// CHECK: 	punpcklbw	305419896, %xmm5
-        	punpcklbw	0x12345678,%xmm5
-
-// CHECK: 	punpcklbw	%xmm5, %xmm5
-        	punpcklbw	%xmm5,%xmm5
-
-// CHECK: 	punpcklwd	3735928559(%ebx,%ecx,8), %mm3
-        	punpcklwd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	punpcklwd	69, %mm3
-        	punpcklwd	0x45,%mm3
-
-// CHECK: 	punpcklwd	32493, %mm3
-        	punpcklwd	0x7eed,%mm3
-
-// CHECK: 	punpcklwd	3133065982, %mm3
-        	punpcklwd	0xbabecafe,%mm3
-
-// CHECK: 	punpcklwd	305419896, %mm3
-        	punpcklwd	0x12345678,%mm3
-
-// CHECK: 	punpcklwd	%mm3, %mm3
-        	punpcklwd	%mm3,%mm3
-
-// CHECK: 	punpcklwd	3735928559(%ebx,%ecx,8), %xmm5
-        	punpcklwd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	punpcklwd	69, %xmm5
-        	punpcklwd	0x45,%xmm5
-
-// CHECK: 	punpcklwd	32493, %xmm5
-        	punpcklwd	0x7eed,%xmm5
-
-// CHECK: 	punpcklwd	3133065982, %xmm5
-        	punpcklwd	0xbabecafe,%xmm5
-
-// CHECK: 	punpcklwd	305419896, %xmm5
-        	punpcklwd	0x12345678,%xmm5
-
-// CHECK: 	punpcklwd	%xmm5, %xmm5
-        	punpcklwd	%xmm5,%xmm5
-
-// CHECK: 	punpckldq	3735928559(%ebx,%ecx,8), %mm3
-        	punpckldq	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	punpckldq	69, %mm3
-        	punpckldq	0x45,%mm3
-
-// CHECK: 	punpckldq	32493, %mm3
-        	punpckldq	0x7eed,%mm3
-
-// CHECK: 	punpckldq	3133065982, %mm3
-        	punpckldq	0xbabecafe,%mm3
-
-// CHECK: 	punpckldq	305419896, %mm3
-        	punpckldq	0x12345678,%mm3
-
-// CHECK: 	punpckldq	%mm3, %mm3
-        	punpckldq	%mm3,%mm3
-
-// CHECK: 	punpckldq	3735928559(%ebx,%ecx,8), %xmm5
-        	punpckldq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	punpckldq	69, %xmm5
-        	punpckldq	0x45,%xmm5
-
-// CHECK: 	punpckldq	32493, %xmm5
-        	punpckldq	0x7eed,%xmm5
-
-// CHECK: 	punpckldq	3133065982, %xmm5
-        	punpckldq	0xbabecafe,%xmm5
-
-// CHECK: 	punpckldq	305419896, %xmm5
-        	punpckldq	0x12345678,%xmm5
-
-// CHECK: 	punpckldq	%xmm5, %xmm5
-        	punpckldq	%xmm5,%xmm5
-
-// CHECK: 	pxor	3735928559(%ebx,%ecx,8), %mm3
-        	pxor	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pxor	69, %mm3
-        	pxor	0x45,%mm3
-
-// CHECK: 	pxor	32493, %mm3
-        	pxor	0x7eed,%mm3
-
-// CHECK: 	pxor	3133065982, %mm3
-        	pxor	0xbabecafe,%mm3
-
-// CHECK: 	pxor	305419896, %mm3
-        	pxor	0x12345678,%mm3
-
-// CHECK: 	pxor	%mm3, %mm3
-        	pxor	%mm3,%mm3
-
-// CHECK: 	pxor	3735928559(%ebx,%ecx,8), %xmm5
-        	pxor	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pxor	69, %xmm5
-        	pxor	0x45,%xmm5
-
-// CHECK: 	pxor	32493, %xmm5
-        	pxor	0x7eed,%xmm5
-
-// CHECK: 	pxor	3133065982, %xmm5
-        	pxor	0xbabecafe,%xmm5
-
-// CHECK: 	pxor	305419896, %xmm5
-        	pxor	0x12345678,%xmm5
-
-// CHECK: 	pxor	%xmm5, %xmm5
-        	pxor	%xmm5,%xmm5
-
-// CHECK: 	addps	3735928559(%ebx,%ecx,8), %xmm5
-        	addps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	addps	69, %xmm5
-        	addps	0x45,%xmm5
-
-// CHECK: 	addps	32493, %xmm5
-        	addps	0x7eed,%xmm5
-
-// CHECK: 	addps	3133065982, %xmm5
-        	addps	0xbabecafe,%xmm5
-
-// CHECK: 	addps	305419896, %xmm5
-        	addps	0x12345678,%xmm5
-
-// CHECK: 	addps	%xmm5, %xmm5
-        	addps	%xmm5,%xmm5
-
-// CHECK: 	addss	3735928559(%ebx,%ecx,8), %xmm5
-        	addss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	addss	69, %xmm5
-        	addss	0x45,%xmm5
-
-// CHECK: 	addss	32493, %xmm5
-        	addss	0x7eed,%xmm5
-
-// CHECK: 	addss	3133065982, %xmm5
-        	addss	0xbabecafe,%xmm5
-
-// CHECK: 	addss	305419896, %xmm5
-        	addss	0x12345678,%xmm5
-
-// CHECK: 	addss	%xmm5, %xmm5
-        	addss	%xmm5,%xmm5
-
-// CHECK: 	andnps	3735928559(%ebx,%ecx,8), %xmm5
-        	andnps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	andnps	69, %xmm5
-        	andnps	0x45,%xmm5
-
-// CHECK: 	andnps	32493, %xmm5
-        	andnps	0x7eed,%xmm5
-
-// CHECK: 	andnps	3133065982, %xmm5
-        	andnps	0xbabecafe,%xmm5
-
-// CHECK: 	andnps	305419896, %xmm5
-        	andnps	0x12345678,%xmm5
-
-// CHECK: 	andnps	%xmm5, %xmm5
-        	andnps	%xmm5,%xmm5
-
-// CHECK: 	andps	3735928559(%ebx,%ecx,8), %xmm5
-        	andps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	andps	69, %xmm5
-        	andps	0x45,%xmm5
-
-// CHECK: 	andps	32493, %xmm5
-        	andps	0x7eed,%xmm5
-
-// CHECK: 	andps	3133065982, %xmm5
-        	andps	0xbabecafe,%xmm5
-
-// CHECK: 	andps	305419896, %xmm5
-        	andps	0x12345678,%xmm5
-
-// CHECK: 	andps	%xmm5, %xmm5
-        	andps	%xmm5,%xmm5
-
-// CHECK: 	comiss	3735928559(%ebx,%ecx,8), %xmm5
-        	comiss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	comiss	69, %xmm5
-        	comiss	0x45,%xmm5
-
-// CHECK: 	comiss	32493, %xmm5
-        	comiss	0x7eed,%xmm5
-
-// CHECK: 	comiss	3133065982, %xmm5
-        	comiss	0xbabecafe,%xmm5
-
-// CHECK: 	comiss	305419896, %xmm5
-        	comiss	0x12345678,%xmm5
-
-// CHECK: 	comiss	%xmm5, %xmm5
-        	comiss	%xmm5,%xmm5
-
-// CHECK: 	cvtpi2ps	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtpi2ps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtpi2ps	69, %xmm5
-        	cvtpi2ps	0x45,%xmm5
-
-// CHECK: 	cvtpi2ps	32493, %xmm5
-        	cvtpi2ps	0x7eed,%xmm5
-
-// CHECK: 	cvtpi2ps	3133065982, %xmm5
-        	cvtpi2ps	0xbabecafe,%xmm5
-
-// CHECK: 	cvtpi2ps	305419896, %xmm5
-        	cvtpi2ps	0x12345678,%xmm5
-
-// CHECK: 	cvtpi2ps	%mm3, %xmm5
-        	cvtpi2ps	%mm3,%xmm5
-
-// CHECK: 	cvtps2pi	3735928559(%ebx,%ecx,8), %mm3
-        	cvtps2pi	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	cvtps2pi	69, %mm3
-        	cvtps2pi	0x45,%mm3
-
-// CHECK: 	cvtps2pi	32493, %mm3
-        	cvtps2pi	0x7eed,%mm3
-
-// CHECK: 	cvtps2pi	3133065982, %mm3
-        	cvtps2pi	0xbabecafe,%mm3
-
-// CHECK: 	cvtps2pi	305419896, %mm3
-        	cvtps2pi	0x12345678,%mm3
-
-// CHECK: 	cvtps2pi	%xmm5, %mm3
-        	cvtps2pi	%xmm5,%mm3
-
-// CHECK: 	cvtsi2ssl	%ecx, %xmm5
-        	cvtsi2ssl	%ecx,%xmm5
-
-// CHECK: 	cvtsi2ssl	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtsi2ssl	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtsi2ssl	69, %xmm5
-        	cvtsi2ssl	0x45,%xmm5
-
-// CHECK: 	cvtsi2ssl	32493, %xmm5
-        	cvtsi2ssl	0x7eed,%xmm5
-
-// CHECK: 	cvtsi2ssl	3133065982, %xmm5
-        	cvtsi2ssl	0xbabecafe,%xmm5
-
-// CHECK: 	cvtsi2ssl	305419896, %xmm5
-        	cvtsi2ssl	0x12345678,%xmm5
-
-// CHECK: 	cvttps2pi	3735928559(%ebx,%ecx,8), %mm3
-        	cvttps2pi	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	cvttps2pi	69, %mm3
-        	cvttps2pi	0x45,%mm3
-
-// CHECK: 	cvttps2pi	32493, %mm3
-        	cvttps2pi	0x7eed,%mm3
-
-// CHECK: 	cvttps2pi	3133065982, %mm3
-        	cvttps2pi	0xbabecafe,%mm3
-
-// CHECK: 	cvttps2pi	305419896, %mm3
-        	cvttps2pi	0x12345678,%mm3
-
-// CHECK: 	cvttps2pi	%xmm5, %mm3
-        	cvttps2pi	%xmm5,%mm3
-
-// CHECK: 	cvttss2si	3735928559(%ebx,%ecx,8), %ecx
-        	cvttss2si	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	cvttss2si	69, %ecx
-        	cvttss2si	0x45,%ecx
-
-// CHECK: 	cvttss2si	32493, %ecx
-        	cvttss2si	0x7eed,%ecx
-
-// CHECK: 	cvttss2si	3133065982, %ecx
-        	cvttss2si	0xbabecafe,%ecx
-
-// CHECK: 	cvttss2si	305419896, %ecx
-        	cvttss2si	0x12345678,%ecx
-
-// CHECK: 	cvttss2si	%xmm5, %ecx
-        	cvttss2si	%xmm5,%ecx
-
-// CHECK: 	divps	3735928559(%ebx,%ecx,8), %xmm5
-        	divps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	divps	69, %xmm5
-        	divps	0x45,%xmm5
-
-// CHECK: 	divps	32493, %xmm5
-        	divps	0x7eed,%xmm5
-
-// CHECK: 	divps	3133065982, %xmm5
-        	divps	0xbabecafe,%xmm5
-
-// CHECK: 	divps	305419896, %xmm5
-        	divps	0x12345678,%xmm5
-
-// CHECK: 	divps	%xmm5, %xmm5
-        	divps	%xmm5,%xmm5
-
-// CHECK: 	divss	3735928559(%ebx,%ecx,8), %xmm5
-        	divss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	divss	69, %xmm5
-        	divss	0x45,%xmm5
-
-// CHECK: 	divss	32493, %xmm5
-        	divss	0x7eed,%xmm5
-
-// CHECK: 	divss	3133065982, %xmm5
-        	divss	0xbabecafe,%xmm5
-
-// CHECK: 	divss	305419896, %xmm5
-        	divss	0x12345678,%xmm5
-
-// CHECK: 	divss	%xmm5, %xmm5
-        	divss	%xmm5,%xmm5
-
-// CHECK: 	ldmxcsr	3735928559(%ebx,%ecx,8)
-        	ldmxcsr	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	ldmxcsr	32493
-        	ldmxcsr	0x7eed
-
-// CHECK: 	ldmxcsr	3133065982
-        	ldmxcsr	0xbabecafe
-
-// CHECK: 	ldmxcsr	305419896
-        	ldmxcsr	0x12345678
-
-// CHECK: 	maskmovq	%mm3, %mm3
-        	maskmovq	%mm3,%mm3
-
-// CHECK: 	maxps	3735928559(%ebx,%ecx,8), %xmm5
-        	maxps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	maxps	69, %xmm5
-        	maxps	0x45,%xmm5
-
-// CHECK: 	maxps	32493, %xmm5
-        	maxps	0x7eed,%xmm5
-
-// CHECK: 	maxps	3133065982, %xmm5
-        	maxps	0xbabecafe,%xmm5
-
-// CHECK: 	maxps	305419896, %xmm5
-        	maxps	0x12345678,%xmm5
-
-// CHECK: 	maxps	%xmm5, %xmm5
-        	maxps	%xmm5,%xmm5
-
-// CHECK: 	maxss	3735928559(%ebx,%ecx,8), %xmm5
-        	maxss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	maxss	69, %xmm5
-        	maxss	0x45,%xmm5
-
-// CHECK: 	maxss	32493, %xmm5
-        	maxss	0x7eed,%xmm5
-
-// CHECK: 	maxss	3133065982, %xmm5
-        	maxss	0xbabecafe,%xmm5
-
-// CHECK: 	maxss	305419896, %xmm5
-        	maxss	0x12345678,%xmm5
-
-// CHECK: 	maxss	%xmm5, %xmm5
-        	maxss	%xmm5,%xmm5
-
-// CHECK: 	minps	3735928559(%ebx,%ecx,8), %xmm5
-        	minps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	minps	69, %xmm5
-        	minps	0x45,%xmm5
-
-// CHECK: 	minps	32493, %xmm5
-        	minps	0x7eed,%xmm5
-
-// CHECK: 	minps	3133065982, %xmm5
-        	minps	0xbabecafe,%xmm5
-
-// CHECK: 	minps	305419896, %xmm5
-        	minps	0x12345678,%xmm5
-
-// CHECK: 	minps	%xmm5, %xmm5
-        	minps	%xmm5,%xmm5
-
-// CHECK: 	minss	3735928559(%ebx,%ecx,8), %xmm5
-        	minss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	minss	69, %xmm5
-        	minss	0x45,%xmm5
-
-// CHECK: 	minss	32493, %xmm5
-        	minss	0x7eed,%xmm5
-
-// CHECK: 	minss	3133065982, %xmm5
-        	minss	0xbabecafe,%xmm5
-
-// CHECK: 	minss	305419896, %xmm5
-        	minss	0x12345678,%xmm5
-
-// CHECK: 	minss	%xmm5, %xmm5
-        	minss	%xmm5,%xmm5
-
-// CHECK: 	movaps	3735928559(%ebx,%ecx,8), %xmm5
-        	movaps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movaps	69, %xmm5
-        	movaps	0x45,%xmm5
-
-// CHECK: 	movaps	32493, %xmm5
-        	movaps	0x7eed,%xmm5
-
-// CHECK: 	movaps	3133065982, %xmm5
-        	movaps	0xbabecafe,%xmm5
-
-// CHECK: 	movaps	305419896, %xmm5
-        	movaps	0x12345678,%xmm5
-
-// CHECK: 	movaps	%xmm5, %xmm5
-        	movaps	%xmm5,%xmm5
-
-// CHECK: 	movaps	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movaps	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movaps	%xmm5, 69
-        	movaps	%xmm5,0x45
-
-// CHECK: 	movaps	%xmm5, 32493
-        	movaps	%xmm5,0x7eed
-
-// CHECK: 	movaps	%xmm5, 3133065982
-        	movaps	%xmm5,0xbabecafe
-
-// CHECK: 	movaps	%xmm5, 305419896
-        	movaps	%xmm5,0x12345678
-
-// CHECK: 	movaps	%xmm5, %xmm5
-        	movaps	%xmm5,%xmm5
-
-// CHECK: 	movhlps	%xmm5, %xmm5
-        	movhlps	%xmm5,%xmm5
-
-// CHECK: 	movhps	3735928559(%ebx,%ecx,8), %xmm5
-        	movhps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movhps	69, %xmm5
-        	movhps	0x45,%xmm5
-
-// CHECK: 	movhps	32493, %xmm5
-        	movhps	0x7eed,%xmm5
-
-// CHECK: 	movhps	3133065982, %xmm5
-        	movhps	0xbabecafe,%xmm5
-
-// CHECK: 	movhps	305419896, %xmm5
-        	movhps	0x12345678,%xmm5
-
-// CHECK: 	movhps	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movhps	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movhps	%xmm5, 69
-        	movhps	%xmm5,0x45
-
-// CHECK: 	movhps	%xmm5, 32493
-        	movhps	%xmm5,0x7eed
-
-// CHECK: 	movhps	%xmm5, 3133065982
-        	movhps	%xmm5,0xbabecafe
-
-// CHECK: 	movhps	%xmm5, 305419896
-        	movhps	%xmm5,0x12345678
-
-// CHECK: 	movlhps	%xmm5, %xmm5
-        	movlhps	%xmm5,%xmm5
-
-// CHECK: 	movlps	3735928559(%ebx,%ecx,8), %xmm5
-        	movlps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movlps	69, %xmm5
-        	movlps	0x45,%xmm5
-
-// CHECK: 	movlps	32493, %xmm5
-        	movlps	0x7eed,%xmm5
-
-// CHECK: 	movlps	3133065982, %xmm5
-        	movlps	0xbabecafe,%xmm5
-
-// CHECK: 	movlps	305419896, %xmm5
-        	movlps	0x12345678,%xmm5
-
-// CHECK: 	movlps	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movlps	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movlps	%xmm5, 69
-        	movlps	%xmm5,0x45
-
-// CHECK: 	movlps	%xmm5, 32493
-        	movlps	%xmm5,0x7eed
-
-// CHECK: 	movlps	%xmm5, 3133065982
-        	movlps	%xmm5,0xbabecafe
-
-// CHECK: 	movlps	%xmm5, 305419896
-        	movlps	%xmm5,0x12345678
-
-// CHECK: 	movmskps	%xmm5, %ecx
-        	movmskps	%xmm5,%ecx
-
-// CHECK: 	movntps	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movntps	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movntps	%xmm5, 69
-        	movntps	%xmm5,0x45
-
-// CHECK: 	movntps	%xmm5, 32493
-        	movntps	%xmm5,0x7eed
-
-// CHECK: 	movntps	%xmm5, 3133065982
-        	movntps	%xmm5,0xbabecafe
-
-// CHECK: 	movntps	%xmm5, 305419896
-        	movntps	%xmm5,0x12345678
-
-// CHECK: 	movntq	%mm3, 3735928559(%ebx,%ecx,8)
-        	movntq	%mm3,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movntq	%mm3, 69
-        	movntq	%mm3,0x45
-
-// CHECK: 	movntq	%mm3, 32493
-        	movntq	%mm3,0x7eed
-
-// CHECK: 	movntq	%mm3, 3133065982
-        	movntq	%mm3,0xbabecafe
-
-// CHECK: 	movntq	%mm3, 305419896
-        	movntq	%mm3,0x12345678
-
-// CHECK: 	movntdq	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movntdq	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movntdq	%xmm5, 69
-        	movntdq	%xmm5,0x45
-
-// CHECK: 	movntdq	%xmm5, 32493
-        	movntdq	%xmm5,0x7eed
-
-// CHECK: 	movntdq	%xmm5, 3133065982
-        	movntdq	%xmm5,0xbabecafe
-
-// CHECK: 	movntdq	%xmm5, 305419896
-        	movntdq	%xmm5,0x12345678
-
-// CHECK: 	movss	3735928559(%ebx,%ecx,8), %xmm5
-        	movss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movss	69, %xmm5
-        	movss	0x45,%xmm5
-
-// CHECK: 	movss	32493, %xmm5
-        	movss	0x7eed,%xmm5
-
-// CHECK: 	movss	3133065982, %xmm5
-        	movss	0xbabecafe,%xmm5
-
-// CHECK: 	movss	305419896, %xmm5
-        	movss	0x12345678,%xmm5
-
-// CHECK: 	movss	%xmm5, %xmm5
-        	movss	%xmm5,%xmm5
-
-// CHECK: 	movss	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movss	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movss	%xmm5, 69
-        	movss	%xmm5,0x45
-
-// CHECK: 	movss	%xmm5, 32493
-        	movss	%xmm5,0x7eed
-
-// CHECK: 	movss	%xmm5, 3133065982
-        	movss	%xmm5,0xbabecafe
-
-// CHECK: 	movss	%xmm5, 305419896
-        	movss	%xmm5,0x12345678
-
-// CHECK: 	movss	%xmm5, %xmm5
-        	movss	%xmm5,%xmm5
-
-// CHECK: 	movups	3735928559(%ebx,%ecx,8), %xmm5
-        	movups	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movups	69, %xmm5
-        	movups	0x45,%xmm5
-
-// CHECK: 	movups	32493, %xmm5
-        	movups	0x7eed,%xmm5
-
-// CHECK: 	movups	3133065982, %xmm5
-        	movups	0xbabecafe,%xmm5
-
-// CHECK: 	movups	305419896, %xmm5
-        	movups	0x12345678,%xmm5
-
-// CHECK: 	movups	%xmm5, %xmm5
-        	movups	%xmm5,%xmm5
-
-// CHECK: 	movups	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movups	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movups	%xmm5, 69
-        	movups	%xmm5,0x45
-
-// CHECK: 	movups	%xmm5, 32493
-        	movups	%xmm5,0x7eed
-
-// CHECK: 	movups	%xmm5, 3133065982
-        	movups	%xmm5,0xbabecafe
-
-// CHECK: 	movups	%xmm5, 305419896
-        	movups	%xmm5,0x12345678
-
-// CHECK: 	movups	%xmm5, %xmm5
-        	movups	%xmm5,%xmm5
-
-// CHECK: 	mulps	3735928559(%ebx,%ecx,8), %xmm5
-        	mulps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	mulps	69, %xmm5
-        	mulps	0x45,%xmm5
-
-// CHECK: 	mulps	32493, %xmm5
-        	mulps	0x7eed,%xmm5
-
-// CHECK: 	mulps	3133065982, %xmm5
-        	mulps	0xbabecafe,%xmm5
-
-// CHECK: 	mulps	305419896, %xmm5
-        	mulps	0x12345678,%xmm5
-
-// CHECK: 	mulps	%xmm5, %xmm5
-        	mulps	%xmm5,%xmm5
-
-// CHECK: 	mulss	3735928559(%ebx,%ecx,8), %xmm5
-        	mulss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	mulss	69, %xmm5
-        	mulss	0x45,%xmm5
-
-// CHECK: 	mulss	32493, %xmm5
-        	mulss	0x7eed,%xmm5
-
-// CHECK: 	mulss	3133065982, %xmm5
-        	mulss	0xbabecafe,%xmm5
-
-// CHECK: 	mulss	305419896, %xmm5
-        	mulss	0x12345678,%xmm5
-
-// CHECK: 	mulss	%xmm5, %xmm5
-        	mulss	%xmm5,%xmm5
-
-// CHECK: 	orps	3735928559(%ebx,%ecx,8), %xmm5
-        	orps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	orps	69, %xmm5
-        	orps	0x45,%xmm5
-
-// CHECK: 	orps	32493, %xmm5
-        	orps	0x7eed,%xmm5
-
-// CHECK: 	orps	3133065982, %xmm5
-        	orps	0xbabecafe,%xmm5
-
-// CHECK: 	orps	305419896, %xmm5
-        	orps	0x12345678,%xmm5
-
-// CHECK: 	orps	%xmm5, %xmm5
-        	orps	%xmm5,%xmm5
-
-// CHECK: 	pavgb	3735928559(%ebx,%ecx,8), %mm3
-        	pavgb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pavgb	69, %mm3
-        	pavgb	0x45,%mm3
-
-// CHECK: 	pavgb	32493, %mm3
-        	pavgb	0x7eed,%mm3
-
-// CHECK: 	pavgb	3133065982, %mm3
-        	pavgb	0xbabecafe,%mm3
-
-// CHECK: 	pavgb	305419896, %mm3
-        	pavgb	0x12345678,%mm3
-
-// CHECK: 	pavgb	%mm3, %mm3
-        	pavgb	%mm3,%mm3
-
-// CHECK: 	pavgb	3735928559(%ebx,%ecx,8), %xmm5
-        	pavgb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pavgb	69, %xmm5
-        	pavgb	0x45,%xmm5
-
-// CHECK: 	pavgb	32493, %xmm5
-        	pavgb	0x7eed,%xmm5
-
-// CHECK: 	pavgb	3133065982, %xmm5
-        	pavgb	0xbabecafe,%xmm5
-
-// CHECK: 	pavgb	305419896, %xmm5
-        	pavgb	0x12345678,%xmm5
-
-// CHECK: 	pavgb	%xmm5, %xmm5
-        	pavgb	%xmm5,%xmm5
-
-// CHECK: 	pavgw	3735928559(%ebx,%ecx,8), %mm3
-        	pavgw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pavgw	69, %mm3
-        	pavgw	0x45,%mm3
-
-// CHECK: 	pavgw	32493, %mm3
-        	pavgw	0x7eed,%mm3
-
-// CHECK: 	pavgw	3133065982, %mm3
-        	pavgw	0xbabecafe,%mm3
-
-// CHECK: 	pavgw	305419896, %mm3
-        	pavgw	0x12345678,%mm3
-
-// CHECK: 	pavgw	%mm3, %mm3
-        	pavgw	%mm3,%mm3
-
-// CHECK: 	pavgw	3735928559(%ebx,%ecx,8), %xmm5
-        	pavgw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pavgw	69, %xmm5
-        	pavgw	0x45,%xmm5
-
-// CHECK: 	pavgw	32493, %xmm5
-        	pavgw	0x7eed,%xmm5
-
-// CHECK: 	pavgw	3133065982, %xmm5
-        	pavgw	0xbabecafe,%xmm5
-
-// CHECK: 	pavgw	305419896, %xmm5
-        	pavgw	0x12345678,%xmm5
-
-// CHECK: 	pavgw	%xmm5, %xmm5
-        	pavgw	%xmm5,%xmm5
-
-// CHECK: 	pmaxsw	3735928559(%ebx,%ecx,8), %mm3
-        	pmaxsw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pmaxsw	69, %mm3
-        	pmaxsw	0x45,%mm3
-
-// CHECK: 	pmaxsw	32493, %mm3
-        	pmaxsw	0x7eed,%mm3
-
-// CHECK: 	pmaxsw	3133065982, %mm3
-        	pmaxsw	0xbabecafe,%mm3
-
-// CHECK: 	pmaxsw	305419896, %mm3
-        	pmaxsw	0x12345678,%mm3
-
-// CHECK: 	pmaxsw	%mm3, %mm3
-        	pmaxsw	%mm3,%mm3
-
-// CHECK: 	pmaxsw	3735928559(%ebx,%ecx,8), %xmm5
-        	pmaxsw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmaxsw	69, %xmm5
-        	pmaxsw	0x45,%xmm5
-
-// CHECK: 	pmaxsw	32493, %xmm5
-        	pmaxsw	0x7eed,%xmm5
-
-// CHECK: 	pmaxsw	3133065982, %xmm5
-        	pmaxsw	0xbabecafe,%xmm5
-
-// CHECK: 	pmaxsw	305419896, %xmm5
-        	pmaxsw	0x12345678,%xmm5
-
-// CHECK: 	pmaxsw	%xmm5, %xmm5
-        	pmaxsw	%xmm5,%xmm5
-
-// CHECK: 	pmaxub	3735928559(%ebx,%ecx,8), %mm3
-        	pmaxub	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pmaxub	69, %mm3
-        	pmaxub	0x45,%mm3
-
-// CHECK: 	pmaxub	32493, %mm3
-        	pmaxub	0x7eed,%mm3
-
-// CHECK: 	pmaxub	3133065982, %mm3
-        	pmaxub	0xbabecafe,%mm3
-
-// CHECK: 	pmaxub	305419896, %mm3
-        	pmaxub	0x12345678,%mm3
-
-// CHECK: 	pmaxub	%mm3, %mm3
-        	pmaxub	%mm3,%mm3
-
-// CHECK: 	pmaxub	3735928559(%ebx,%ecx,8), %xmm5
-        	pmaxub	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmaxub	69, %xmm5
-        	pmaxub	0x45,%xmm5
-
-// CHECK: 	pmaxub	32493, %xmm5
-        	pmaxub	0x7eed,%xmm5
-
-// CHECK: 	pmaxub	3133065982, %xmm5
-        	pmaxub	0xbabecafe,%xmm5
-
-// CHECK: 	pmaxub	305419896, %xmm5
-        	pmaxub	0x12345678,%xmm5
-
-// CHECK: 	pmaxub	%xmm5, %xmm5
-        	pmaxub	%xmm5,%xmm5
-
-// CHECK: 	pminsw	3735928559(%ebx,%ecx,8), %mm3
-        	pminsw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pminsw	69, %mm3
-        	pminsw	0x45,%mm3
-
-// CHECK: 	pminsw	32493, %mm3
-        	pminsw	0x7eed,%mm3
-
-// CHECK: 	pminsw	3133065982, %mm3
-        	pminsw	0xbabecafe,%mm3
-
-// CHECK: 	pminsw	305419896, %mm3
-        	pminsw	0x12345678,%mm3
-
-// CHECK: 	pminsw	%mm3, %mm3
-        	pminsw	%mm3,%mm3
-
-// CHECK: 	pminsw	3735928559(%ebx,%ecx,8), %xmm5
-        	pminsw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pminsw	69, %xmm5
-        	pminsw	0x45,%xmm5
-
-// CHECK: 	pminsw	32493, %xmm5
-        	pminsw	0x7eed,%xmm5
-
-// CHECK: 	pminsw	3133065982, %xmm5
-        	pminsw	0xbabecafe,%xmm5
-
-// CHECK: 	pminsw	305419896, %xmm5
-        	pminsw	0x12345678,%xmm5
-
-// CHECK: 	pminsw	%xmm5, %xmm5
-        	pminsw	%xmm5,%xmm5
-
-// CHECK: 	pminub	3735928559(%ebx,%ecx,8), %mm3
-        	pminub	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pminub	69, %mm3
-        	pminub	0x45,%mm3
-
-// CHECK: 	pminub	32493, %mm3
-        	pminub	0x7eed,%mm3
-
-// CHECK: 	pminub	3133065982, %mm3
-        	pminub	0xbabecafe,%mm3
-
-// CHECK: 	pminub	305419896, %mm3
-        	pminub	0x12345678,%mm3
-
-// CHECK: 	pminub	%mm3, %mm3
-        	pminub	%mm3,%mm3
-
-// CHECK: 	pminub	3735928559(%ebx,%ecx,8), %xmm5
-        	pminub	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pminub	69, %xmm5
-        	pminub	0x45,%xmm5
-
-// CHECK: 	pminub	32493, %xmm5
-        	pminub	0x7eed,%xmm5
-
-// CHECK: 	pminub	3133065982, %xmm5
-        	pminub	0xbabecafe,%xmm5
-
-// CHECK: 	pminub	305419896, %xmm5
-        	pminub	0x12345678,%xmm5
-
-// CHECK: 	pminub	%xmm5, %xmm5
-        	pminub	%xmm5,%xmm5
-
-// CHECK: 	pmovmskb	%mm3, %ecx
-        	pmovmskb	%mm3,%ecx
-
-// CHECK: 	pmovmskb	%xmm5, %ecx
-        	pmovmskb	%xmm5,%ecx
-
-// CHECK: 	pmulhuw	3735928559(%ebx,%ecx,8), %mm3
-        	pmulhuw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pmulhuw	69, %mm3
-        	pmulhuw	0x45,%mm3
-
-// CHECK: 	pmulhuw	32493, %mm3
-        	pmulhuw	0x7eed,%mm3
-
-// CHECK: 	pmulhuw	3133065982, %mm3
-        	pmulhuw	0xbabecafe,%mm3
-
-// CHECK: 	pmulhuw	305419896, %mm3
-        	pmulhuw	0x12345678,%mm3
-
-// CHECK: 	pmulhuw	%mm3, %mm3
-        	pmulhuw	%mm3,%mm3
-
-// CHECK: 	pmulhuw	3735928559(%ebx,%ecx,8), %xmm5
-        	pmulhuw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmulhuw	69, %xmm5
-        	pmulhuw	0x45,%xmm5
-
-// CHECK: 	pmulhuw	32493, %xmm5
-        	pmulhuw	0x7eed,%xmm5
-
-// CHECK: 	pmulhuw	3133065982, %xmm5
-        	pmulhuw	0xbabecafe,%xmm5
-
-// CHECK: 	pmulhuw	305419896, %xmm5
-        	pmulhuw	0x12345678,%xmm5
-
-// CHECK: 	pmulhuw	%xmm5, %xmm5
-        	pmulhuw	%xmm5,%xmm5
-
-// CHECK: 	prefetchnta	3735928559(%ebx,%ecx,8)
-        	prefetchnta	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	prefetchnta	32493
-        	prefetchnta	0x7eed
-
-// CHECK: 	prefetchnta	3133065982
-        	prefetchnta	0xbabecafe
-
-// CHECK: 	prefetchnta	305419896
-        	prefetchnta	0x12345678
-
-// CHECK: 	prefetcht0	3735928559(%ebx,%ecx,8)
-        	prefetcht0	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	prefetcht0	32493
-        	prefetcht0	0x7eed
-
-// CHECK: 	prefetcht0	3133065982
-        	prefetcht0	0xbabecafe
-
-// CHECK: 	prefetcht0	305419896
-        	prefetcht0	0x12345678
-
-// CHECK: 	prefetcht1	3735928559(%ebx,%ecx,8)
-        	prefetcht1	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	prefetcht1	32493
-        	prefetcht1	0x7eed
-
-// CHECK: 	prefetcht1	3133065982
-        	prefetcht1	0xbabecafe
-
-// CHECK: 	prefetcht1	305419896
-        	prefetcht1	0x12345678
-
-// CHECK: 	prefetcht2	3735928559(%ebx,%ecx,8)
-        	prefetcht2	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	prefetcht2	32493
-        	prefetcht2	0x7eed
-
-// CHECK: 	prefetcht2	3133065982
-        	prefetcht2	0xbabecafe
-
-// CHECK: 	prefetcht2	305419896
-        	prefetcht2	0x12345678
-
-// CHECK: 	psadbw	3735928559(%ebx,%ecx,8), %mm3
-        	psadbw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psadbw	69, %mm3
-        	psadbw	0x45,%mm3
-
-// CHECK: 	psadbw	32493, %mm3
-        	psadbw	0x7eed,%mm3
-
-// CHECK: 	psadbw	3133065982, %mm3
-        	psadbw	0xbabecafe,%mm3
-
-// CHECK: 	psadbw	305419896, %mm3
-        	psadbw	0x12345678,%mm3
-
-// CHECK: 	psadbw	%mm3, %mm3
-        	psadbw	%mm3,%mm3
-
-// CHECK: 	psadbw	3735928559(%ebx,%ecx,8), %xmm5
-        	psadbw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psadbw	69, %xmm5
-        	psadbw	0x45,%xmm5
-
-// CHECK: 	psadbw	32493, %xmm5
-        	psadbw	0x7eed,%xmm5
-
-// CHECK: 	psadbw	3133065982, %xmm5
-        	psadbw	0xbabecafe,%xmm5
-
-// CHECK: 	psadbw	305419896, %xmm5
-        	psadbw	0x12345678,%xmm5
-
-// CHECK: 	psadbw	%xmm5, %xmm5
-        	psadbw	%xmm5,%xmm5
-
-// CHECK: 	rcpps	3735928559(%ebx,%ecx,8), %xmm5
-        	rcpps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	rcpps	69, %xmm5
-        	rcpps	0x45,%xmm5
-
-// CHECK: 	rcpps	32493, %xmm5
-        	rcpps	0x7eed,%xmm5
-
-// CHECK: 	rcpps	3133065982, %xmm5
-        	rcpps	0xbabecafe,%xmm5
-
-// CHECK: 	rcpps	305419896, %xmm5
-        	rcpps	0x12345678,%xmm5
-
-// CHECK: 	rcpps	%xmm5, %xmm5
-        	rcpps	%xmm5,%xmm5
-
-// CHECK: 	rcpss	3735928559(%ebx,%ecx,8), %xmm5
-        	rcpss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	rcpss	69, %xmm5
-        	rcpss	0x45,%xmm5
-
-// CHECK: 	rcpss	32493, %xmm5
-        	rcpss	0x7eed,%xmm5
-
-// CHECK: 	rcpss	3133065982, %xmm5
-        	rcpss	0xbabecafe,%xmm5
-
-// CHECK: 	rcpss	305419896, %xmm5
-        	rcpss	0x12345678,%xmm5
-
-// CHECK: 	rcpss	%xmm5, %xmm5
-        	rcpss	%xmm5,%xmm5
-
-// CHECK: 	rsqrtps	3735928559(%ebx,%ecx,8), %xmm5
-        	rsqrtps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	rsqrtps	69, %xmm5
-        	rsqrtps	0x45,%xmm5
-
-// CHECK: 	rsqrtps	32493, %xmm5
-        	rsqrtps	0x7eed,%xmm5
-
-// CHECK: 	rsqrtps	3133065982, %xmm5
-        	rsqrtps	0xbabecafe,%xmm5
-
-// CHECK: 	rsqrtps	305419896, %xmm5
-        	rsqrtps	0x12345678,%xmm5
-
-// CHECK: 	rsqrtps	%xmm5, %xmm5
-        	rsqrtps	%xmm5,%xmm5
-
-// CHECK: 	rsqrtss	3735928559(%ebx,%ecx,8), %xmm5
-        	rsqrtss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	rsqrtss	69, %xmm5
-        	rsqrtss	0x45,%xmm5
-
-// CHECK: 	rsqrtss	32493, %xmm5
-        	rsqrtss	0x7eed,%xmm5
-
-// CHECK: 	rsqrtss	3133065982, %xmm5
-        	rsqrtss	0xbabecafe,%xmm5
-
-// CHECK: 	rsqrtss	305419896, %xmm5
-        	rsqrtss	0x12345678,%xmm5
-
-// CHECK: 	rsqrtss	%xmm5, %xmm5
-        	rsqrtss	%xmm5,%xmm5
-
-// CHECK: 	sfence
-        	sfence
-
-// CHECK: 	sqrtps	3735928559(%ebx,%ecx,8), %xmm5
-        	sqrtps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	sqrtps	69, %xmm5
-        	sqrtps	0x45,%xmm5
-
-// CHECK: 	sqrtps	32493, %xmm5
-        	sqrtps	0x7eed,%xmm5
-
-// CHECK: 	sqrtps	3133065982, %xmm5
-        	sqrtps	0xbabecafe,%xmm5
-
-// CHECK: 	sqrtps	305419896, %xmm5
-        	sqrtps	0x12345678,%xmm5
-
-// CHECK: 	sqrtps	%xmm5, %xmm5
-        	sqrtps	%xmm5,%xmm5
-
-// CHECK: 	sqrtss	3735928559(%ebx,%ecx,8), %xmm5
-        	sqrtss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	sqrtss	69, %xmm5
-        	sqrtss	0x45,%xmm5
-
-// CHECK: 	sqrtss	32493, %xmm5
-        	sqrtss	0x7eed,%xmm5
-
-// CHECK: 	sqrtss	3133065982, %xmm5
-        	sqrtss	0xbabecafe,%xmm5
-
-// CHECK: 	sqrtss	305419896, %xmm5
-        	sqrtss	0x12345678,%xmm5
-
-// CHECK: 	sqrtss	%xmm5, %xmm5
-        	sqrtss	%xmm5,%xmm5
-
-// CHECK: 	stmxcsr	3735928559(%ebx,%ecx,8)
-        	stmxcsr	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	stmxcsr	32493
-        	stmxcsr	0x7eed
-
-// CHECK: 	stmxcsr	3133065982
-        	stmxcsr	0xbabecafe
-
-// CHECK: 	stmxcsr	305419896
-        	stmxcsr	0x12345678
-
-// CHECK: 	subps	3735928559(%ebx,%ecx,8), %xmm5
-        	subps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	subps	69, %xmm5
-        	subps	0x45,%xmm5
-
-// CHECK: 	subps	32493, %xmm5
-        	subps	0x7eed,%xmm5
-
-// CHECK: 	subps	3133065982, %xmm5
-        	subps	0xbabecafe,%xmm5
-
-// CHECK: 	subps	305419896, %xmm5
-        	subps	0x12345678,%xmm5
-
-// CHECK: 	subps	%xmm5, %xmm5
-        	subps	%xmm5,%xmm5
-
-// CHECK: 	subss	3735928559(%ebx,%ecx,8), %xmm5
-        	subss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	subss	69, %xmm5
-        	subss	0x45,%xmm5
-
-// CHECK: 	subss	32493, %xmm5
-        	subss	0x7eed,%xmm5
-
-// CHECK: 	subss	3133065982, %xmm5
-        	subss	0xbabecafe,%xmm5
-
-// CHECK: 	subss	305419896, %xmm5
-        	subss	0x12345678,%xmm5
-
-// CHECK: 	subss	%xmm5, %xmm5
-        	subss	%xmm5,%xmm5
-
-// CHECK: 	ucomiss	3735928559(%ebx,%ecx,8), %xmm5
-        	ucomiss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	ucomiss	69, %xmm5
-        	ucomiss	0x45,%xmm5
-
-// CHECK: 	ucomiss	32493, %xmm5
-        	ucomiss	0x7eed,%xmm5
-
-// CHECK: 	ucomiss	3133065982, %xmm5
-        	ucomiss	0xbabecafe,%xmm5
-
-// CHECK: 	ucomiss	305419896, %xmm5
-        	ucomiss	0x12345678,%xmm5
-
-// CHECK: 	ucomiss	%xmm5, %xmm5
-        	ucomiss	%xmm5,%xmm5
-
-// CHECK: 	unpckhps	3735928559(%ebx,%ecx,8), %xmm5
-        	unpckhps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	unpckhps	69, %xmm5
-        	unpckhps	0x45,%xmm5
-
-// CHECK: 	unpckhps	32493, %xmm5
-        	unpckhps	0x7eed,%xmm5
-
-// CHECK: 	unpckhps	3133065982, %xmm5
-        	unpckhps	0xbabecafe,%xmm5
-
-// CHECK: 	unpckhps	305419896, %xmm5
-        	unpckhps	0x12345678,%xmm5
-
-// CHECK: 	unpckhps	%xmm5, %xmm5
-        	unpckhps	%xmm5,%xmm5
-
-// CHECK: 	unpcklps	3735928559(%ebx,%ecx,8), %xmm5
-        	unpcklps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	unpcklps	69, %xmm5
-        	unpcklps	0x45,%xmm5
-
-// CHECK: 	unpcklps	32493, %xmm5
-        	unpcklps	0x7eed,%xmm5
-
-// CHECK: 	unpcklps	3133065982, %xmm5
-        	unpcklps	0xbabecafe,%xmm5
-
-// CHECK: 	unpcklps	305419896, %xmm5
-        	unpcklps	0x12345678,%xmm5
-
-// CHECK: 	unpcklps	%xmm5, %xmm5
-        	unpcklps	%xmm5,%xmm5
-
-// CHECK: 	xorps	3735928559(%ebx,%ecx,8), %xmm5
-        	xorps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	xorps	69, %xmm5
-        	xorps	0x45,%xmm5
-
-// CHECK: 	xorps	32493, %xmm5
-        	xorps	0x7eed,%xmm5
-
-// CHECK: 	xorps	3133065982, %xmm5
-        	xorps	0xbabecafe,%xmm5
-
-// CHECK: 	xorps	305419896, %xmm5
-        	xorps	0x12345678,%xmm5
-
-// CHECK: 	xorps	%xmm5, %xmm5
-        	xorps	%xmm5,%xmm5
-
-// CHECK: 	addpd	3735928559(%ebx,%ecx,8), %xmm5
-        	addpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	addpd	69, %xmm5
-        	addpd	0x45,%xmm5
-
-// CHECK: 	addpd	32493, %xmm5
-        	addpd	0x7eed,%xmm5
-
-// CHECK: 	addpd	3133065982, %xmm5
-        	addpd	0xbabecafe,%xmm5
-
-// CHECK: 	addpd	305419896, %xmm5
-        	addpd	0x12345678,%xmm5
-
-// CHECK: 	addpd	%xmm5, %xmm5
-        	addpd	%xmm5,%xmm5
-
-// CHECK: 	addsd	3735928559(%ebx,%ecx,8), %xmm5
-        	addsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	addsd	69, %xmm5
-        	addsd	0x45,%xmm5
-
-// CHECK: 	addsd	32493, %xmm5
-        	addsd	0x7eed,%xmm5
-
-// CHECK: 	addsd	3133065982, %xmm5
-        	addsd	0xbabecafe,%xmm5
-
-// CHECK: 	addsd	305419896, %xmm5
-        	addsd	0x12345678,%xmm5
-
-// CHECK: 	addsd	%xmm5, %xmm5
-        	addsd	%xmm5,%xmm5
-
-// CHECK: 	andnpd	3735928559(%ebx,%ecx,8), %xmm5
-        	andnpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	andnpd	69, %xmm5
-        	andnpd	0x45,%xmm5
-
-// CHECK: 	andnpd	32493, %xmm5
-        	andnpd	0x7eed,%xmm5
-
-// CHECK: 	andnpd	3133065982, %xmm5
-        	andnpd	0xbabecafe,%xmm5
-
-// CHECK: 	andnpd	305419896, %xmm5
-        	andnpd	0x12345678,%xmm5
-
-// CHECK: 	andnpd	%xmm5, %xmm5
-        	andnpd	%xmm5,%xmm5
-
-// CHECK: 	andpd	3735928559(%ebx,%ecx,8), %xmm5
-        	andpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	andpd	69, %xmm5
-        	andpd	0x45,%xmm5
-
-// CHECK: 	andpd	32493, %xmm5
-        	andpd	0x7eed,%xmm5
-
-// CHECK: 	andpd	3133065982, %xmm5
-        	andpd	0xbabecafe,%xmm5
-
-// CHECK: 	andpd	305419896, %xmm5
-        	andpd	0x12345678,%xmm5
-
-// CHECK: 	andpd	%xmm5, %xmm5
-        	andpd	%xmm5,%xmm5
-
-// CHECK: 	comisd	3735928559(%ebx,%ecx,8), %xmm5
-        	comisd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	comisd	69, %xmm5
-        	comisd	0x45,%xmm5
-
-// CHECK: 	comisd	32493, %xmm5
-        	comisd	0x7eed,%xmm5
-
-// CHECK: 	comisd	3133065982, %xmm5
-        	comisd	0xbabecafe,%xmm5
-
-// CHECK: 	comisd	305419896, %xmm5
-        	comisd	0x12345678,%xmm5
-
-// CHECK: 	comisd	%xmm5, %xmm5
-        	comisd	%xmm5,%xmm5
-
-// CHECK: 	cvtpi2pd	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtpi2pd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtpi2pd	69, %xmm5
-        	cvtpi2pd	0x45,%xmm5
-
-// CHECK: 	cvtpi2pd	32493, %xmm5
-        	cvtpi2pd	0x7eed,%xmm5
-
-// CHECK: 	cvtpi2pd	3133065982, %xmm5
-        	cvtpi2pd	0xbabecafe,%xmm5
-
-// CHECK: 	cvtpi2pd	305419896, %xmm5
-        	cvtpi2pd	0x12345678,%xmm5
-
-// CHECK: 	cvtpi2pd	%mm3, %xmm5
-        	cvtpi2pd	%mm3,%xmm5
-
-// CHECK: 	cvtsi2sdl	%ecx, %xmm5
-        	cvtsi2sdl	%ecx,%xmm5
-
-// CHECK: 	cvtsi2sdl	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtsi2sdl	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtsi2sdl	69, %xmm5
-        	cvtsi2sdl	0x45,%xmm5
-
-// CHECK: 	cvtsi2sdl	32493, %xmm5
-        	cvtsi2sdl	0x7eed,%xmm5
-
-// CHECK: 	cvtsi2sdl	3133065982, %xmm5
-        	cvtsi2sdl	0xbabecafe,%xmm5
-
-// CHECK: 	cvtsi2sdl	305419896, %xmm5
-        	cvtsi2sdl	0x12345678,%xmm5
-
-// CHECK: 	divpd	3735928559(%ebx,%ecx,8), %xmm5
-        	divpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	divpd	69, %xmm5
-        	divpd	0x45,%xmm5
-
-// CHECK: 	divpd	32493, %xmm5
-        	divpd	0x7eed,%xmm5
-
-// CHECK: 	divpd	3133065982, %xmm5
-        	divpd	0xbabecafe,%xmm5
-
-// CHECK: 	divpd	305419896, %xmm5
-        	divpd	0x12345678,%xmm5
-
-// CHECK: 	divpd	%xmm5, %xmm5
-        	divpd	%xmm5,%xmm5
-
-// CHECK: 	divsd	3735928559(%ebx,%ecx,8), %xmm5
-        	divsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	divsd	69, %xmm5
-        	divsd	0x45,%xmm5
-
-// CHECK: 	divsd	32493, %xmm5
-        	divsd	0x7eed,%xmm5
-
-// CHECK: 	divsd	3133065982, %xmm5
-        	divsd	0xbabecafe,%xmm5
-
-// CHECK: 	divsd	305419896, %xmm5
-        	divsd	0x12345678,%xmm5
-
-// CHECK: 	divsd	%xmm5, %xmm5
-        	divsd	%xmm5,%xmm5
-
-// CHECK: 	maxpd	3735928559(%ebx,%ecx,8), %xmm5
-        	maxpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	maxpd	69, %xmm5
-        	maxpd	0x45,%xmm5
-
-// CHECK: 	maxpd	32493, %xmm5
-        	maxpd	0x7eed,%xmm5
-
-// CHECK: 	maxpd	3133065982, %xmm5
-        	maxpd	0xbabecafe,%xmm5
-
-// CHECK: 	maxpd	305419896, %xmm5
-        	maxpd	0x12345678,%xmm5
-
-// CHECK: 	maxpd	%xmm5, %xmm5
-        	maxpd	%xmm5,%xmm5
-
-// CHECK: 	maxsd	3735928559(%ebx,%ecx,8), %xmm5
-        	maxsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	maxsd	69, %xmm5
-        	maxsd	0x45,%xmm5
-
-// CHECK: 	maxsd	32493, %xmm5
-        	maxsd	0x7eed,%xmm5
-
-// CHECK: 	maxsd	3133065982, %xmm5
-        	maxsd	0xbabecafe,%xmm5
-
-// CHECK: 	maxsd	305419896, %xmm5
-        	maxsd	0x12345678,%xmm5
-
-// CHECK: 	maxsd	%xmm5, %xmm5
-        	maxsd	%xmm5,%xmm5
-
-// CHECK: 	minpd	3735928559(%ebx,%ecx,8), %xmm5
-        	minpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	minpd	69, %xmm5
-        	minpd	0x45,%xmm5
-
-// CHECK: 	minpd	32493, %xmm5
-        	minpd	0x7eed,%xmm5
-
-// CHECK: 	minpd	3133065982, %xmm5
-        	minpd	0xbabecafe,%xmm5
-
-// CHECK: 	minpd	305419896, %xmm5
-        	minpd	0x12345678,%xmm5
-
-// CHECK: 	minpd	%xmm5, %xmm5
-        	minpd	%xmm5,%xmm5
-
-// CHECK: 	minsd	3735928559(%ebx,%ecx,8), %xmm5
-        	minsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	minsd	69, %xmm5
-        	minsd	0x45,%xmm5
-
-// CHECK: 	minsd	32493, %xmm5
-        	minsd	0x7eed,%xmm5
-
-// CHECK: 	minsd	3133065982, %xmm5
-        	minsd	0xbabecafe,%xmm5
-
-// CHECK: 	minsd	305419896, %xmm5
-        	minsd	0x12345678,%xmm5
-
-// CHECK: 	minsd	%xmm5, %xmm5
-        	minsd	%xmm5,%xmm5
-
-// CHECK: 	movapd	3735928559(%ebx,%ecx,8), %xmm5
-        	movapd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movapd	69, %xmm5
-        	movapd	0x45,%xmm5
-
-// CHECK: 	movapd	32493, %xmm5
-        	movapd	0x7eed,%xmm5
-
-// CHECK: 	movapd	3133065982, %xmm5
-        	movapd	0xbabecafe,%xmm5
-
-// CHECK: 	movapd	305419896, %xmm5
-        	movapd	0x12345678,%xmm5
-
-// CHECK: 	movapd	%xmm5, %xmm5
-        	movapd	%xmm5,%xmm5
-
-// CHECK: 	movapd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movapd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movapd	%xmm5, 69
-        	movapd	%xmm5,0x45
-
-// CHECK: 	movapd	%xmm5, 32493
-        	movapd	%xmm5,0x7eed
-
-// CHECK: 	movapd	%xmm5, 3133065982
-        	movapd	%xmm5,0xbabecafe
-
-// CHECK: 	movapd	%xmm5, 305419896
-        	movapd	%xmm5,0x12345678
-
-// CHECK: 	movapd	%xmm5, %xmm5
-        	movapd	%xmm5,%xmm5
-
-// CHECK: 	movhpd	3735928559(%ebx,%ecx,8), %xmm5
-        	movhpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movhpd	69, %xmm5
-        	movhpd	0x45,%xmm5
-
-// CHECK: 	movhpd	32493, %xmm5
-        	movhpd	0x7eed,%xmm5
-
-// CHECK: 	movhpd	3133065982, %xmm5
-        	movhpd	0xbabecafe,%xmm5
-
-// CHECK: 	movhpd	305419896, %xmm5
-        	movhpd	0x12345678,%xmm5
-
-// CHECK: 	movhpd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movhpd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movhpd	%xmm5, 69
-        	movhpd	%xmm5,0x45
-
-// CHECK: 	movhpd	%xmm5, 32493
-        	movhpd	%xmm5,0x7eed
-
-// CHECK: 	movhpd	%xmm5, 3133065982
-        	movhpd	%xmm5,0xbabecafe
-
-// CHECK: 	movhpd	%xmm5, 305419896
-        	movhpd	%xmm5,0x12345678
-
-// CHECK: 	movlpd	3735928559(%ebx,%ecx,8), %xmm5
-        	movlpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movlpd	69, %xmm5
-        	movlpd	0x45,%xmm5
-
-// CHECK: 	movlpd	32493, %xmm5
-        	movlpd	0x7eed,%xmm5
-
-// CHECK: 	movlpd	3133065982, %xmm5
-        	movlpd	0xbabecafe,%xmm5
-
-// CHECK: 	movlpd	305419896, %xmm5
-        	movlpd	0x12345678,%xmm5
-
-// CHECK: 	movlpd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movlpd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movlpd	%xmm5, 69
-        	movlpd	%xmm5,0x45
-
-// CHECK: 	movlpd	%xmm5, 32493
-        	movlpd	%xmm5,0x7eed
-
-// CHECK: 	movlpd	%xmm5, 3133065982
-        	movlpd	%xmm5,0xbabecafe
-
-// CHECK: 	movlpd	%xmm5, 305419896
-        	movlpd	%xmm5,0x12345678
-
-// CHECK: 	movmskpd	%xmm5, %ecx
-        	movmskpd	%xmm5,%ecx
-
-// CHECK: 	movntpd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movntpd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movntpd	%xmm5, 69
-        	movntpd	%xmm5,0x45
-
-// CHECK: 	movntpd	%xmm5, 32493
-        	movntpd	%xmm5,0x7eed
-
-// CHECK: 	movntpd	%xmm5, 3133065982
-        	movntpd	%xmm5,0xbabecafe
-
-// CHECK: 	movntpd	%xmm5, 305419896
-        	movntpd	%xmm5,0x12345678
-
-// CHECK: 	movsd	3735928559(%ebx,%ecx,8), %xmm5
-        	movsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movsd	69, %xmm5
-        	movsd	0x45,%xmm5
-
-// CHECK: 	movsd	32493, %xmm5
-        	movsd	0x7eed,%xmm5
-
-// CHECK: 	movsd	3133065982, %xmm5
-        	movsd	0xbabecafe,%xmm5
-
-// CHECK: 	movsd	305419896, %xmm5
-        	movsd	0x12345678,%xmm5
-
-// CHECK: 	movsd	%xmm5, %xmm5
-        	movsd	%xmm5,%xmm5
-
-// CHECK: 	movsd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movsd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movsd	%xmm5, 69
-        	movsd	%xmm5,0x45
-
-// CHECK: 	movsd	%xmm5, 32493
-        	movsd	%xmm5,0x7eed
-
-// CHECK: 	movsd	%xmm5, 3133065982
-        	movsd	%xmm5,0xbabecafe
-
-// CHECK: 	movsd	%xmm5, 305419896
-        	movsd	%xmm5,0x12345678
-
-// CHECK: 	movsd	%xmm5, %xmm5
-        	movsd	%xmm5,%xmm5
-
-// CHECK: 	movupd	3735928559(%ebx,%ecx,8), %xmm5
-        	movupd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movupd	69, %xmm5
-        	movupd	0x45,%xmm5
-
-// CHECK: 	movupd	32493, %xmm5
-        	movupd	0x7eed,%xmm5
-
-// CHECK: 	movupd	3133065982, %xmm5
-        	movupd	0xbabecafe,%xmm5
-
-// CHECK: 	movupd	305419896, %xmm5
-        	movupd	0x12345678,%xmm5
-
-// CHECK: 	movupd	%xmm5, %xmm5
-        	movupd	%xmm5,%xmm5
-
-// CHECK: 	movupd	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movupd	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movupd	%xmm5, 69
-        	movupd	%xmm5,0x45
-
-// CHECK: 	movupd	%xmm5, 32493
-        	movupd	%xmm5,0x7eed
-
-// CHECK: 	movupd	%xmm5, 3133065982
-        	movupd	%xmm5,0xbabecafe
-
-// CHECK: 	movupd	%xmm5, 305419896
-        	movupd	%xmm5,0x12345678
-
-// CHECK: 	movupd	%xmm5, %xmm5
-        	movupd	%xmm5,%xmm5
-
-// CHECK: 	mulpd	3735928559(%ebx,%ecx,8), %xmm5
-        	mulpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	mulpd	69, %xmm5
-        	mulpd	0x45,%xmm5
-
-// CHECK: 	mulpd	32493, %xmm5
-        	mulpd	0x7eed,%xmm5
-
-// CHECK: 	mulpd	3133065982, %xmm5
-        	mulpd	0xbabecafe,%xmm5
-
-// CHECK: 	mulpd	305419896, %xmm5
-        	mulpd	0x12345678,%xmm5
-
-// CHECK: 	mulpd	%xmm5, %xmm5
-        	mulpd	%xmm5,%xmm5
-
-// CHECK: 	mulsd	3735928559(%ebx,%ecx,8), %xmm5
-        	mulsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	mulsd	69, %xmm5
-        	mulsd	0x45,%xmm5
-
-// CHECK: 	mulsd	32493, %xmm5
-        	mulsd	0x7eed,%xmm5
-
-// CHECK: 	mulsd	3133065982, %xmm5
-        	mulsd	0xbabecafe,%xmm5
-
-// CHECK: 	mulsd	305419896, %xmm5
-        	mulsd	0x12345678,%xmm5
-
-// CHECK: 	mulsd	%xmm5, %xmm5
-        	mulsd	%xmm5,%xmm5
-
-// CHECK: 	orpd	3735928559(%ebx,%ecx,8), %xmm5
-        	orpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	orpd	69, %xmm5
-        	orpd	0x45,%xmm5
-
-// CHECK: 	orpd	32493, %xmm5
-        	orpd	0x7eed,%xmm5
-
-// CHECK: 	orpd	3133065982, %xmm5
-        	orpd	0xbabecafe,%xmm5
-
-// CHECK: 	orpd	305419896, %xmm5
-        	orpd	0x12345678,%xmm5
-
-// CHECK: 	orpd	%xmm5, %xmm5
-        	orpd	%xmm5,%xmm5
-
-// CHECK: 	sqrtpd	3735928559(%ebx,%ecx,8), %xmm5
-        	sqrtpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	sqrtpd	69, %xmm5
-        	sqrtpd	0x45,%xmm5
-
-// CHECK: 	sqrtpd	32493, %xmm5
-        	sqrtpd	0x7eed,%xmm5
-
-// CHECK: 	sqrtpd	3133065982, %xmm5
-        	sqrtpd	0xbabecafe,%xmm5
-
-// CHECK: 	sqrtpd	305419896, %xmm5
-        	sqrtpd	0x12345678,%xmm5
-
-// CHECK: 	sqrtpd	%xmm5, %xmm5
-        	sqrtpd	%xmm5,%xmm5
-
-// CHECK: 	sqrtsd	3735928559(%ebx,%ecx,8), %xmm5
-        	sqrtsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	sqrtsd	69, %xmm5
-        	sqrtsd	0x45,%xmm5
-
-// CHECK: 	sqrtsd	32493, %xmm5
-        	sqrtsd	0x7eed,%xmm5
-
-// CHECK: 	sqrtsd	3133065982, %xmm5
-        	sqrtsd	0xbabecafe,%xmm5
-
-// CHECK: 	sqrtsd	305419896, %xmm5
-        	sqrtsd	0x12345678,%xmm5
-
-// CHECK: 	sqrtsd	%xmm5, %xmm5
-        	sqrtsd	%xmm5,%xmm5
-
-// CHECK: 	subpd	3735928559(%ebx,%ecx,8), %xmm5
-        	subpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	subpd	69, %xmm5
-        	subpd	0x45,%xmm5
-
-// CHECK: 	subpd	32493, %xmm5
-        	subpd	0x7eed,%xmm5
-
-// CHECK: 	subpd	3133065982, %xmm5
-        	subpd	0xbabecafe,%xmm5
-
-// CHECK: 	subpd	305419896, %xmm5
-        	subpd	0x12345678,%xmm5
-
-// CHECK: 	subpd	%xmm5, %xmm5
-        	subpd	%xmm5,%xmm5
-
-// CHECK: 	subsd	3735928559(%ebx,%ecx,8), %xmm5
-        	subsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	subsd	69, %xmm5
-        	subsd	0x45,%xmm5
-
-// CHECK: 	subsd	32493, %xmm5
-        	subsd	0x7eed,%xmm5
-
-// CHECK: 	subsd	3133065982, %xmm5
-        	subsd	0xbabecafe,%xmm5
-
-// CHECK: 	subsd	305419896, %xmm5
-        	subsd	0x12345678,%xmm5
-
-// CHECK: 	subsd	%xmm5, %xmm5
-        	subsd	%xmm5,%xmm5
-
-// CHECK: 	ucomisd	3735928559(%ebx,%ecx,8), %xmm5
-        	ucomisd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	ucomisd	69, %xmm5
-        	ucomisd	0x45,%xmm5
-
-// CHECK: 	ucomisd	32493, %xmm5
-        	ucomisd	0x7eed,%xmm5
-
-// CHECK: 	ucomisd	3133065982, %xmm5
-        	ucomisd	0xbabecafe,%xmm5
-
-// CHECK: 	ucomisd	305419896, %xmm5
-        	ucomisd	0x12345678,%xmm5
-
-// CHECK: 	ucomisd	%xmm5, %xmm5
-        	ucomisd	%xmm5,%xmm5
-
-// CHECK: 	unpckhpd	3735928559(%ebx,%ecx,8), %xmm5
-        	unpckhpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	unpckhpd	69, %xmm5
-        	unpckhpd	0x45,%xmm5
-
-// CHECK: 	unpckhpd	32493, %xmm5
-        	unpckhpd	0x7eed,%xmm5
-
-// CHECK: 	unpckhpd	3133065982, %xmm5
-        	unpckhpd	0xbabecafe,%xmm5
-
-// CHECK: 	unpckhpd	305419896, %xmm5
-        	unpckhpd	0x12345678,%xmm5
-
-// CHECK: 	unpckhpd	%xmm5, %xmm5
-        	unpckhpd	%xmm5,%xmm5
-
-// CHECK: 	unpcklpd	3735928559(%ebx,%ecx,8), %xmm5
-        	unpcklpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	unpcklpd	69, %xmm5
-        	unpcklpd	0x45,%xmm5
-
-// CHECK: 	unpcklpd	32493, %xmm5
-        	unpcklpd	0x7eed,%xmm5
-
-// CHECK: 	unpcklpd	3133065982, %xmm5
-        	unpcklpd	0xbabecafe,%xmm5
-
-// CHECK: 	unpcklpd	305419896, %xmm5
-        	unpcklpd	0x12345678,%xmm5
-
-// CHECK: 	unpcklpd	%xmm5, %xmm5
-        	unpcklpd	%xmm5,%xmm5
-
-// CHECK: 	xorpd	3735928559(%ebx,%ecx,8), %xmm5
-        	xorpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	xorpd	69, %xmm5
-        	xorpd	0x45,%xmm5
-
-// CHECK: 	xorpd	32493, %xmm5
-        	xorpd	0x7eed,%xmm5
-
-// CHECK: 	xorpd	3133065982, %xmm5
-        	xorpd	0xbabecafe,%xmm5
-
-// CHECK: 	xorpd	305419896, %xmm5
-        	xorpd	0x12345678,%xmm5
-
-// CHECK: 	xorpd	%xmm5, %xmm5
-        	xorpd	%xmm5,%xmm5
-
-// CHECK: 	cvtdq2pd	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtdq2pd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtdq2pd	69, %xmm5
-        	cvtdq2pd	0x45,%xmm5
-
-// CHECK: 	cvtdq2pd	32493, %xmm5
-        	cvtdq2pd	0x7eed,%xmm5
-
-// CHECK: 	cvtdq2pd	3133065982, %xmm5
-        	cvtdq2pd	0xbabecafe,%xmm5
-
-// CHECK: 	cvtdq2pd	305419896, %xmm5
-        	cvtdq2pd	0x12345678,%xmm5
-
-// CHECK: 	cvtdq2pd	%xmm5, %xmm5
-        	cvtdq2pd	%xmm5,%xmm5
-
-// CHECK: 	cvtpd2dq	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtpd2dq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtpd2dq	69, %xmm5
-        	cvtpd2dq	0x45,%xmm5
-
-// CHECK: 	cvtpd2dq	32493, %xmm5
-        	cvtpd2dq	0x7eed,%xmm5
-
-// CHECK: 	cvtpd2dq	3133065982, %xmm5
-        	cvtpd2dq	0xbabecafe,%xmm5
-
-// CHECK: 	cvtpd2dq	305419896, %xmm5
-        	cvtpd2dq	0x12345678,%xmm5
-
-// CHECK: 	cvtpd2dq	%xmm5, %xmm5
-        	cvtpd2dq	%xmm5,%xmm5
-
-// CHECK: 	cvtdq2ps	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtdq2ps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtdq2ps	69, %xmm5
-        	cvtdq2ps	0x45,%xmm5
-
-// CHECK: 	cvtdq2ps	32493, %xmm5
-        	cvtdq2ps	0x7eed,%xmm5
-
-// CHECK: 	cvtdq2ps	3133065982, %xmm5
-        	cvtdq2ps	0xbabecafe,%xmm5
-
-// CHECK: 	cvtdq2ps	305419896, %xmm5
-        	cvtdq2ps	0x12345678,%xmm5
-
-// CHECK: 	cvtdq2ps	%xmm5, %xmm5
-        	cvtdq2ps	%xmm5,%xmm5
-
-// CHECK: 	cvtpd2pi	3735928559(%ebx,%ecx,8), %mm3
-        	cvtpd2pi	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	cvtpd2pi	69, %mm3
-        	cvtpd2pi	0x45,%mm3
-
-// CHECK: 	cvtpd2pi	32493, %mm3
-        	cvtpd2pi	0x7eed,%mm3
-
-// CHECK: 	cvtpd2pi	3133065982, %mm3
-        	cvtpd2pi	0xbabecafe,%mm3
-
-// CHECK: 	cvtpd2pi	305419896, %mm3
-        	cvtpd2pi	0x12345678,%mm3
-
-// CHECK: 	cvtpd2pi	%xmm5, %mm3
-        	cvtpd2pi	%xmm5,%mm3
-
-// CHECK: 	cvtpd2ps	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtpd2ps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtpd2ps	69, %xmm5
-        	cvtpd2ps	0x45,%xmm5
-
-// CHECK: 	cvtpd2ps	32493, %xmm5
-        	cvtpd2ps	0x7eed,%xmm5
-
-// CHECK: 	cvtpd2ps	3133065982, %xmm5
-        	cvtpd2ps	0xbabecafe,%xmm5
-
-// CHECK: 	cvtpd2ps	305419896, %xmm5
-        	cvtpd2ps	0x12345678,%xmm5
-
-// CHECK: 	cvtpd2ps	%xmm5, %xmm5
-        	cvtpd2ps	%xmm5,%xmm5
-
-// CHECK: 	cvtps2pd	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtps2pd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtps2pd	69, %xmm5
-        	cvtps2pd	0x45,%xmm5
-
-// CHECK: 	cvtps2pd	32493, %xmm5
-        	cvtps2pd	0x7eed,%xmm5
-
-// CHECK: 	cvtps2pd	3133065982, %xmm5
-        	cvtps2pd	0xbabecafe,%xmm5
-
-// CHECK: 	cvtps2pd	305419896, %xmm5
-        	cvtps2pd	0x12345678,%xmm5
-
-// CHECK: 	cvtps2pd	%xmm5, %xmm5
-        	cvtps2pd	%xmm5,%xmm5
-
-// CHECK: 	cvtps2dq	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtps2dq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtps2dq	69, %xmm5
-        	cvtps2dq	0x45,%xmm5
-
-// CHECK: 	cvtps2dq	32493, %xmm5
-        	cvtps2dq	0x7eed,%xmm5
-
-// CHECK: 	cvtps2dq	3133065982, %xmm5
-        	cvtps2dq	0xbabecafe,%xmm5
-
-// CHECK: 	cvtps2dq	305419896, %xmm5
-        	cvtps2dq	0x12345678,%xmm5
-
-// CHECK: 	cvtps2dq	%xmm5, %xmm5
-        	cvtps2dq	%xmm5,%xmm5
-
-// CHECK: 	cvtsd2ss	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtsd2ss	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtsd2ss	69, %xmm5
-        	cvtsd2ss	0x45,%xmm5
-
-// CHECK: 	cvtsd2ss	32493, %xmm5
-        	cvtsd2ss	0x7eed,%xmm5
-
-// CHECK: 	cvtsd2ss	3133065982, %xmm5
-        	cvtsd2ss	0xbabecafe,%xmm5
-
-// CHECK: 	cvtsd2ss	305419896, %xmm5
-        	cvtsd2ss	0x12345678,%xmm5
-
-// CHECK: 	cvtsd2ss	%xmm5, %xmm5
-        	cvtsd2ss	%xmm5,%xmm5
-
-// CHECK: 	cvtss2sd	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtss2sd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvtss2sd	69, %xmm5
-        	cvtss2sd	0x45,%xmm5
-
-// CHECK: 	cvtss2sd	32493, %xmm5
-        	cvtss2sd	0x7eed,%xmm5
-
-// CHECK: 	cvtss2sd	3133065982, %xmm5
-        	cvtss2sd	0xbabecafe,%xmm5
-
-// CHECK: 	cvtss2sd	305419896, %xmm5
-        	cvtss2sd	0x12345678,%xmm5
-
-// CHECK: 	cvtss2sd	%xmm5, %xmm5
-        	cvtss2sd	%xmm5,%xmm5
-
-// CHECK: 	cvttpd2pi	3735928559(%ebx,%ecx,8), %mm3
-        	cvttpd2pi	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	cvttpd2pi	69, %mm3
-        	cvttpd2pi	0x45,%mm3
-
-// CHECK: 	cvttpd2pi	32493, %mm3
-        	cvttpd2pi	0x7eed,%mm3
-
-// CHECK: 	cvttpd2pi	3133065982, %mm3
-        	cvttpd2pi	0xbabecafe,%mm3
-
-// CHECK: 	cvttpd2pi	305419896, %mm3
-        	cvttpd2pi	0x12345678,%mm3
-
-// CHECK: 	cvttpd2pi	%xmm5, %mm3
-        	cvttpd2pi	%xmm5,%mm3
-
-// CHECK: 	cvttsd2si	3735928559(%ebx,%ecx,8), %ecx
-        	cvttsd2si	0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	cvttsd2si	69, %ecx
-        	cvttsd2si	0x45,%ecx
-
-// CHECK: 	cvttsd2si	32493, %ecx
-        	cvttsd2si	0x7eed,%ecx
-
-// CHECK: 	cvttsd2si	3133065982, %ecx
-        	cvttsd2si	0xbabecafe,%ecx
-
-// CHECK: 	cvttsd2si	305419896, %ecx
-        	cvttsd2si	0x12345678,%ecx
-
-// CHECK: 	cvttsd2si	%xmm5, %ecx
-        	cvttsd2si	%xmm5,%ecx
-
-// CHECK: 	cvttps2dq	3735928559(%ebx,%ecx,8), %xmm5
-        	cvttps2dq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	cvttps2dq	69, %xmm5
-        	cvttps2dq	0x45,%xmm5
-
-// CHECK: 	cvttps2dq	32493, %xmm5
-        	cvttps2dq	0x7eed,%xmm5
-
-// CHECK: 	cvttps2dq	3133065982, %xmm5
-        	cvttps2dq	0xbabecafe,%xmm5
-
-// CHECK: 	cvttps2dq	305419896, %xmm5
-        	cvttps2dq	0x12345678,%xmm5
-
-// CHECK: 	cvttps2dq	%xmm5, %xmm5
-        	cvttps2dq	%xmm5,%xmm5
-
-// CHECK: 	maskmovdqu	%xmm5, %xmm5
-        	maskmovdqu	%xmm5,%xmm5
-
-// CHECK: 	movdqa	3735928559(%ebx,%ecx,8), %xmm5
-        	movdqa	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movdqa	69, %xmm5
-        	movdqa	0x45,%xmm5
-
-// CHECK: 	movdqa	32493, %xmm5
-        	movdqa	0x7eed,%xmm5
-
-// CHECK: 	movdqa	3133065982, %xmm5
-        	movdqa	0xbabecafe,%xmm5
-
-// CHECK: 	movdqa	305419896, %xmm5
-        	movdqa	0x12345678,%xmm5
-
-// CHECK: 	movdqa	%xmm5, %xmm5
-        	movdqa	%xmm5,%xmm5
-
-// CHECK: 	movdqa	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movdqa	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movdqa	%xmm5, 69
-        	movdqa	%xmm5,0x45
-
-// CHECK: 	movdqa	%xmm5, 32493
-        	movdqa	%xmm5,0x7eed
-
-// CHECK: 	movdqa	%xmm5, 3133065982
-        	movdqa	%xmm5,0xbabecafe
-
-// CHECK: 	movdqa	%xmm5, 305419896
-        	movdqa	%xmm5,0x12345678
-
-// CHECK: 	movdqa	%xmm5, %xmm5
-        	movdqa	%xmm5,%xmm5
-
-// CHECK: 	movdqu	3735928559(%ebx,%ecx,8), %xmm5
-        	movdqu	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movdqu	69, %xmm5
-        	movdqu	0x45,%xmm5
-
-// CHECK: 	movdqu	32493, %xmm5
-        	movdqu	0x7eed,%xmm5
-
-// CHECK: 	movdqu	3133065982, %xmm5
-        	movdqu	0xbabecafe,%xmm5
-
-// CHECK: 	movdqu	305419896, %xmm5
-        	movdqu	0x12345678,%xmm5
-
-// CHECK: 	movdqu	%xmm5, 3735928559(%ebx,%ecx,8)
-        	movdqu	%xmm5,0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	movdqu	%xmm5, 69
-        	movdqu	%xmm5,0x45
-
-// CHECK: 	movdqu	%xmm5, 32493
-        	movdqu	%xmm5,0x7eed
-
-// CHECK: 	movdqu	%xmm5, 3133065982
-        	movdqu	%xmm5,0xbabecafe
-
-// CHECK: 	movdqu	%xmm5, 305419896
-        	movdqu	%xmm5,0x12345678
-
-// CHECK: 	movdq2q	%xmm5, %mm3
-        	movdq2q	%xmm5,%mm3
-
-// CHECK: 	movq2dq	%mm3, %xmm5
-        	movq2dq	%mm3,%xmm5
-
-// CHECK: 	pmuludq	3735928559(%ebx,%ecx,8), %mm3
-        	pmuludq	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pmuludq	69, %mm3
-        	pmuludq	0x45,%mm3
-
-// CHECK: 	pmuludq	32493, %mm3
-        	pmuludq	0x7eed,%mm3
-
-// CHECK: 	pmuludq	3133065982, %mm3
-        	pmuludq	0xbabecafe,%mm3
-
-// CHECK: 	pmuludq	305419896, %mm3
-        	pmuludq	0x12345678,%mm3
-
-// CHECK: 	pmuludq	%mm3, %mm3
-        	pmuludq	%mm3,%mm3
-
-// CHECK: 	pmuludq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmuludq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmuludq	69, %xmm5
-        	pmuludq	0x45,%xmm5
-
-// CHECK: 	pmuludq	32493, %xmm5
-        	pmuludq	0x7eed,%xmm5
-
-// CHECK: 	pmuludq	3133065982, %xmm5
-        	pmuludq	0xbabecafe,%xmm5
-
-// CHECK: 	pmuludq	305419896, %xmm5
-        	pmuludq	0x12345678,%xmm5
-
-// CHECK: 	pmuludq	%xmm5, %xmm5
-        	pmuludq	%xmm5,%xmm5
-
-// CHECK: 	pslldq	$127, %xmm5
-        	pslldq	$0x7f,%xmm5
-
-// CHECK: 	psrldq	$127, %xmm5
-        	psrldq	$0x7f,%xmm5
-
-// CHECK: 	punpckhqdq	3735928559(%ebx,%ecx,8), %xmm5
-        	punpckhqdq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	punpckhqdq	69, %xmm5
-        	punpckhqdq	0x45,%xmm5
-
-// CHECK: 	punpckhqdq	32493, %xmm5
-        	punpckhqdq	0x7eed,%xmm5
-
-// CHECK: 	punpckhqdq	3133065982, %xmm5
-        	punpckhqdq	0xbabecafe,%xmm5
-
-// CHECK: 	punpckhqdq	305419896, %xmm5
-        	punpckhqdq	0x12345678,%xmm5
-
-// CHECK: 	punpckhqdq	%xmm5, %xmm5
-        	punpckhqdq	%xmm5,%xmm5
-
-// CHECK: 	punpcklqdq	3735928559(%ebx,%ecx,8), %xmm5
-        	punpcklqdq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	punpcklqdq	69, %xmm5
-        	punpcklqdq	0x45,%xmm5
-
-// CHECK: 	punpcklqdq	32493, %xmm5
-        	punpcklqdq	0x7eed,%xmm5
-
-// CHECK: 	punpcklqdq	3133065982, %xmm5
-        	punpcklqdq	0xbabecafe,%xmm5
-
-// CHECK: 	punpcklqdq	305419896, %xmm5
-        	punpcklqdq	0x12345678,%xmm5
-
-// CHECK: 	punpcklqdq	%xmm5, %xmm5
-        	punpcklqdq	%xmm5,%xmm5
-
-// CHECK: 	addsubpd	3735928559(%ebx,%ecx,8), %xmm5
-        	addsubpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	addsubpd	69, %xmm5
-        	addsubpd	0x45,%xmm5
-
-// CHECK: 	addsubpd	32493, %xmm5
-        	addsubpd	0x7eed,%xmm5
-
-// CHECK: 	addsubpd	3133065982, %xmm5
-        	addsubpd	0xbabecafe,%xmm5
-
-// CHECK: 	addsubpd	305419896, %xmm5
-        	addsubpd	0x12345678,%xmm5
-
-// CHECK: 	addsubpd	%xmm5, %xmm5
-        	addsubpd	%xmm5,%xmm5
-
-// CHECK: 	addsubps	3735928559(%ebx,%ecx,8), %xmm5
-        	addsubps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	addsubps	69, %xmm5
-        	addsubps	0x45,%xmm5
-
-// CHECK: 	addsubps	32493, %xmm5
-        	addsubps	0x7eed,%xmm5
-
-// CHECK: 	addsubps	3133065982, %xmm5
-        	addsubps	0xbabecafe,%xmm5
-
-// CHECK: 	addsubps	305419896, %xmm5
-        	addsubps	0x12345678,%xmm5
-
-// CHECK: 	addsubps	%xmm5, %xmm5
-        	addsubps	%xmm5,%xmm5
-
-// CHECK: 	fisttpl	3735928559(%ebx,%ecx,8)
-        	fisttpl	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	fisttpl	3133065982
-        	fisttpl	0xbabecafe
-
-// CHECK: 	fisttpl	305419896
-        	fisttpl	0x12345678
-
-// CHECK: 	haddpd	3735928559(%ebx,%ecx,8), %xmm5
-        	haddpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	haddpd	69, %xmm5
-        	haddpd	0x45,%xmm5
-
-// CHECK: 	haddpd	32493, %xmm5
-        	haddpd	0x7eed,%xmm5
-
-// CHECK: 	haddpd	3133065982, %xmm5
-        	haddpd	0xbabecafe,%xmm5
-
-// CHECK: 	haddpd	305419896, %xmm5
-        	haddpd	0x12345678,%xmm5
-
-// CHECK: 	haddpd	%xmm5, %xmm5
-        	haddpd	%xmm5,%xmm5
-
-// CHECK: 	haddps	3735928559(%ebx,%ecx,8), %xmm5
-        	haddps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	haddps	69, %xmm5
-        	haddps	0x45,%xmm5
-
-// CHECK: 	haddps	32493, %xmm5
-        	haddps	0x7eed,%xmm5
-
-// CHECK: 	haddps	3133065982, %xmm5
-        	haddps	0xbabecafe,%xmm5
-
-// CHECK: 	haddps	305419896, %xmm5
-        	haddps	0x12345678,%xmm5
-
-// CHECK: 	haddps	%xmm5, %xmm5
-        	haddps	%xmm5,%xmm5
-
-// CHECK: 	hsubpd	3735928559(%ebx,%ecx,8), %xmm5
-        	hsubpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	hsubpd	69, %xmm5
-        	hsubpd	0x45,%xmm5
-
-// CHECK: 	hsubpd	32493, %xmm5
-        	hsubpd	0x7eed,%xmm5
-
-// CHECK: 	hsubpd	3133065982, %xmm5
-        	hsubpd	0xbabecafe,%xmm5
-
-// CHECK: 	hsubpd	305419896, %xmm5
-        	hsubpd	0x12345678,%xmm5
-
-// CHECK: 	hsubpd	%xmm5, %xmm5
-        	hsubpd	%xmm5,%xmm5
-
-// CHECK: 	hsubps	3735928559(%ebx,%ecx,8), %xmm5
-        	hsubps	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	hsubps	69, %xmm5
-        	hsubps	0x45,%xmm5
-
-// CHECK: 	hsubps	32493, %xmm5
-        	hsubps	0x7eed,%xmm5
-
-// CHECK: 	hsubps	3133065982, %xmm5
-        	hsubps	0xbabecafe,%xmm5
-
-// CHECK: 	hsubps	305419896, %xmm5
-        	hsubps	0x12345678,%xmm5
-
-// CHECK: 	hsubps	%xmm5, %xmm5
-        	hsubps	%xmm5,%xmm5
-
-// CHECK: 	lddqu	3735928559(%ebx,%ecx,8), %xmm5
-        	lddqu	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	lddqu	69, %xmm5
-        	lddqu	0x45,%xmm5
-
-// CHECK: 	lddqu	32493, %xmm5
-        	lddqu	0x7eed,%xmm5
-
-// CHECK: 	lddqu	3133065982, %xmm5
-        	lddqu	0xbabecafe,%xmm5
-
-// CHECK: 	lddqu	305419896, %xmm5
-        	lddqu	0x12345678,%xmm5
-
 // CHECK: 	monitor
         	monitor
 
-// CHECK: 	movddup	3735928559(%ebx,%ecx,8), %xmm5
-        	movddup	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movddup	69, %xmm5
-        	movddup	0x45,%xmm5
-
-// CHECK: 	movddup	32493, %xmm5
-        	movddup	0x7eed,%xmm5
-
-// CHECK: 	movddup	3133065982, %xmm5
-        	movddup	0xbabecafe,%xmm5
-
-// CHECK: 	movddup	305419896, %xmm5
-        	movddup	0x12345678,%xmm5
-
-// CHECK: 	movddup	%xmm5, %xmm5
-        	movddup	%xmm5,%xmm5
-
-// CHECK: 	movshdup	3735928559(%ebx,%ecx,8), %xmm5
-        	movshdup	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movshdup	69, %xmm5
-        	movshdup	0x45,%xmm5
-
-// CHECK: 	movshdup	32493, %xmm5
-        	movshdup	0x7eed,%xmm5
-
-// CHECK: 	movshdup	3133065982, %xmm5
-        	movshdup	0xbabecafe,%xmm5
-
-// CHECK: 	movshdup	305419896, %xmm5
-        	movshdup	0x12345678,%xmm5
-
-// CHECK: 	movshdup	%xmm5, %xmm5
-        	movshdup	%xmm5,%xmm5
-
-// CHECK: 	movsldup	3735928559(%ebx,%ecx,8), %xmm5
-        	movsldup	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movsldup	69, %xmm5
-        	movsldup	0x45,%xmm5
-
-// CHECK: 	movsldup	32493, %xmm5
-        	movsldup	0x7eed,%xmm5
-
-// CHECK: 	movsldup	3133065982, %xmm5
-        	movsldup	0xbabecafe,%xmm5
-
-// CHECK: 	movsldup	305419896, %xmm5
-        	movsldup	0x12345678,%xmm5
-
-// CHECK: 	movsldup	%xmm5, %xmm5
-        	movsldup	%xmm5,%xmm5
-
 // CHECK: 	mwait
         	mwait
 
@@ -18419,48 +10466,12 @@
 // CHECK: 	vmfunc
         	vmfunc
 
-// CHECK: 	vmclear	3735928559(%ebx,%ecx,8)
-        	vmclear	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	vmclear	32493
-        	vmclear	0x7eed
-
-// CHECK: 	vmclear	3133065982
-        	vmclear	0xbabecafe
-
-// CHECK: 	vmclear	305419896
-        	vmclear	0x12345678
-
 // CHECK: 	vmlaunch
         	vmlaunch
 
 // CHECK: 	vmresume
         	vmresume
 
-// CHECK: 	vmptrld	3735928559(%ebx,%ecx,8)
-        	vmptrld	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	vmptrld	32493
-        	vmptrld	0x7eed
-
-// CHECK: 	vmptrld	3133065982
-        	vmptrld	0xbabecafe
-
-// CHECK: 	vmptrld	305419896
-        	vmptrld	0x12345678
-
-// CHECK: 	vmptrst	3735928559(%ebx,%ecx,8)
-        	vmptrst	0xdeadbeef(%ebx,%ecx,8)
-
-// CHECK: 	vmptrst	32493
-        	vmptrst	0x7eed
-
-// CHECK: 	vmptrst	3133065982
-        	vmptrst	0xbabecafe
-
-// CHECK: 	vmptrst	305419896
-        	vmptrst	0x12345678
-
 // CHECK: 	vmxoff
         	vmxoff
 
@@ -18500,1119 +10511,6 @@
 // CHECK: 	invlpga %ecx, %eax
         	invlpga %ecx, %eax
 
-// CHECK: 	phaddw	3735928559(%ebx,%ecx,8), %mm3
-        	phaddw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	phaddw	69, %mm3
-        	phaddw	0x45,%mm3
-
-// CHECK: 	phaddw	32493, %mm3
-        	phaddw	0x7eed,%mm3
-
-// CHECK: 	phaddw	3133065982, %mm3
-        	phaddw	0xbabecafe,%mm3
-
-// CHECK: 	phaddw	305419896, %mm3
-        	phaddw	0x12345678,%mm3
-
-// CHECK: 	phaddw	%mm3, %mm3
-        	phaddw	%mm3,%mm3
-
-// CHECK: 	phaddw	3735928559(%ebx,%ecx,8), %xmm5
-        	phaddw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	phaddw	69, %xmm5
-        	phaddw	0x45,%xmm5
-
-// CHECK: 	phaddw	32493, %xmm5
-        	phaddw	0x7eed,%xmm5
-
-// CHECK: 	phaddw	3133065982, %xmm5
-        	phaddw	0xbabecafe,%xmm5
-
-// CHECK: 	phaddw	305419896, %xmm5
-        	phaddw	0x12345678,%xmm5
-
-// CHECK: 	phaddw	%xmm5, %xmm5
-        	phaddw	%xmm5,%xmm5
-
-// CHECK: 	phaddd	3735928559(%ebx,%ecx,8), %mm3
-        	phaddd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	phaddd	69, %mm3
-        	phaddd	0x45,%mm3
-
-// CHECK: 	phaddd	32493, %mm3
-        	phaddd	0x7eed,%mm3
-
-// CHECK: 	phaddd	3133065982, %mm3
-        	phaddd	0xbabecafe,%mm3
-
-// CHECK: 	phaddd	305419896, %mm3
-        	phaddd	0x12345678,%mm3
-
-// CHECK: 	phaddd	%mm3, %mm3
-        	phaddd	%mm3,%mm3
-
-// CHECK: 	phaddd	3735928559(%ebx,%ecx,8), %xmm5
-        	phaddd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	phaddd	69, %xmm5
-        	phaddd	0x45,%xmm5
-
-// CHECK: 	phaddd	32493, %xmm5
-        	phaddd	0x7eed,%xmm5
-
-// CHECK: 	phaddd	3133065982, %xmm5
-        	phaddd	0xbabecafe,%xmm5
-
-// CHECK: 	phaddd	305419896, %xmm5
-        	phaddd	0x12345678,%xmm5
-
-// CHECK: 	phaddd	%xmm5, %xmm5
-        	phaddd	%xmm5,%xmm5
-
-// CHECK: 	phaddsw	3735928559(%ebx,%ecx,8), %mm3
-        	phaddsw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	phaddsw	69, %mm3
-        	phaddsw	0x45,%mm3
-
-// CHECK: 	phaddsw	32493, %mm3
-        	phaddsw	0x7eed,%mm3
-
-// CHECK: 	phaddsw	3133065982, %mm3
-        	phaddsw	0xbabecafe,%mm3
-
-// CHECK: 	phaddsw	305419896, %mm3
-        	phaddsw	0x12345678,%mm3
-
-// CHECK: 	phaddsw	%mm3, %mm3
-        	phaddsw	%mm3,%mm3
-
-// CHECK: 	phaddsw	3735928559(%ebx,%ecx,8), %xmm5
-        	phaddsw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	phaddsw	69, %xmm5
-        	phaddsw	0x45,%xmm5
-
-// CHECK: 	phaddsw	32493, %xmm5
-        	phaddsw	0x7eed,%xmm5
-
-// CHECK: 	phaddsw	3133065982, %xmm5
-        	phaddsw	0xbabecafe,%xmm5
-
-// CHECK: 	phaddsw	305419896, %xmm5
-        	phaddsw	0x12345678,%xmm5
-
-// CHECK: 	phaddsw	%xmm5, %xmm5
-        	phaddsw	%xmm5,%xmm5
-
-// CHECK: 	phsubw	3735928559(%ebx,%ecx,8), %mm3
-        	phsubw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	phsubw	69, %mm3
-        	phsubw	0x45,%mm3
-
-// CHECK: 	phsubw	32493, %mm3
-        	phsubw	0x7eed,%mm3
-
-// CHECK: 	phsubw	3133065982, %mm3
-        	phsubw	0xbabecafe,%mm3
-
-// CHECK: 	phsubw	305419896, %mm3
-        	phsubw	0x12345678,%mm3
-
-// CHECK: 	phsubw	%mm3, %mm3
-        	phsubw	%mm3,%mm3
-
-// CHECK: 	phsubw	3735928559(%ebx,%ecx,8), %xmm5
-        	phsubw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	phsubw	69, %xmm5
-        	phsubw	0x45,%xmm5
-
-// CHECK: 	phsubw	32493, %xmm5
-        	phsubw	0x7eed,%xmm5
-
-// CHECK: 	phsubw	3133065982, %xmm5
-        	phsubw	0xbabecafe,%xmm5
-
-// CHECK: 	phsubw	305419896, %xmm5
-        	phsubw	0x12345678,%xmm5
-
-// CHECK: 	phsubw	%xmm5, %xmm5
-        	phsubw	%xmm5,%xmm5
-
-// CHECK: 	phsubd	3735928559(%ebx,%ecx,8), %mm3
-        	phsubd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	phsubd	69, %mm3
-        	phsubd	0x45,%mm3
-
-// CHECK: 	phsubd	32493, %mm3
-        	phsubd	0x7eed,%mm3
-
-// CHECK: 	phsubd	3133065982, %mm3
-        	phsubd	0xbabecafe,%mm3
-
-// CHECK: 	phsubd	305419896, %mm3
-        	phsubd	0x12345678,%mm3
-
-// CHECK: 	phsubd	%mm3, %mm3
-        	phsubd	%mm3,%mm3
-
-// CHECK: 	phsubd	3735928559(%ebx,%ecx,8), %xmm5
-        	phsubd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	phsubd	69, %xmm5
-        	phsubd	0x45,%xmm5
-
-// CHECK: 	phsubd	32493, %xmm5
-        	phsubd	0x7eed,%xmm5
-
-// CHECK: 	phsubd	3133065982, %xmm5
-        	phsubd	0xbabecafe,%xmm5
-
-// CHECK: 	phsubd	305419896, %xmm5
-        	phsubd	0x12345678,%xmm5
-
-// CHECK: 	phsubd	%xmm5, %xmm5
-        	phsubd	%xmm5,%xmm5
-
-// CHECK: 	phsubsw	3735928559(%ebx,%ecx,8), %mm3
-        	phsubsw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	phsubsw	69, %mm3
-        	phsubsw	0x45,%mm3
-
-// CHECK: 	phsubsw	32493, %mm3
-        	phsubsw	0x7eed,%mm3
-
-// CHECK: 	phsubsw	3133065982, %mm3
-        	phsubsw	0xbabecafe,%mm3
-
-// CHECK: 	phsubsw	305419896, %mm3
-        	phsubsw	0x12345678,%mm3
-
-// CHECK: 	phsubsw	%mm3, %mm3
-        	phsubsw	%mm3,%mm3
-
-// CHECK: 	phsubsw	3735928559(%ebx,%ecx,8), %xmm5
-        	phsubsw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	phsubsw	69, %xmm5
-        	phsubsw	0x45,%xmm5
-
-// CHECK: 	phsubsw	32493, %xmm5
-        	phsubsw	0x7eed,%xmm5
-
-// CHECK: 	phsubsw	3133065982, %xmm5
-        	phsubsw	0xbabecafe,%xmm5
-
-// CHECK: 	phsubsw	305419896, %xmm5
-        	phsubsw	0x12345678,%xmm5
-
-// CHECK: 	phsubsw	%xmm5, %xmm5
-        	phsubsw	%xmm5,%xmm5
-
-// CHECK: 	pmaddubsw	3735928559(%ebx,%ecx,8), %mm3
-        	pmaddubsw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pmaddubsw	69, %mm3
-        	pmaddubsw	0x45,%mm3
-
-// CHECK: 	pmaddubsw	32493, %mm3
-        	pmaddubsw	0x7eed,%mm3
-
-// CHECK: 	pmaddubsw	3133065982, %mm3
-        	pmaddubsw	0xbabecafe,%mm3
-
-// CHECK: 	pmaddubsw	305419896, %mm3
-        	pmaddubsw	0x12345678,%mm3
-
-// CHECK: 	pmaddubsw	%mm3, %mm3
-        	pmaddubsw	%mm3,%mm3
-
-// CHECK: 	pmaddubsw	3735928559(%ebx,%ecx,8), %xmm5
-        	pmaddubsw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmaddubsw	69, %xmm5
-        	pmaddubsw	0x45,%xmm5
-
-// CHECK: 	pmaddubsw	32493, %xmm5
-        	pmaddubsw	0x7eed,%xmm5
-
-// CHECK: 	pmaddubsw	3133065982, %xmm5
-        	pmaddubsw	0xbabecafe,%xmm5
-
-// CHECK: 	pmaddubsw	305419896, %xmm5
-        	pmaddubsw	0x12345678,%xmm5
-
-// CHECK: 	pmaddubsw	%xmm5, %xmm5
-        	pmaddubsw	%xmm5,%xmm5
-
-// CHECK: 	pmulhrsw	3735928559(%ebx,%ecx,8), %mm3
-        	pmulhrsw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pmulhrsw	69, %mm3
-        	pmulhrsw	0x45,%mm3
-
-// CHECK: 	pmulhrsw	32493, %mm3
-        	pmulhrsw	0x7eed,%mm3
-
-// CHECK: 	pmulhrsw	3133065982, %mm3
-        	pmulhrsw	0xbabecafe,%mm3
-
-// CHECK: 	pmulhrsw	305419896, %mm3
-        	pmulhrsw	0x12345678,%mm3
-
-// CHECK: 	pmulhrsw	%mm3, %mm3
-        	pmulhrsw	%mm3,%mm3
-
-// CHECK: 	pmulhrsw	3735928559(%ebx,%ecx,8), %xmm5
-        	pmulhrsw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmulhrsw	69, %xmm5
-        	pmulhrsw	0x45,%xmm5
-
-// CHECK: 	pmulhrsw	32493, %xmm5
-        	pmulhrsw	0x7eed,%xmm5
-
-// CHECK: 	pmulhrsw	3133065982, %xmm5
-        	pmulhrsw	0xbabecafe,%xmm5
-
-// CHECK: 	pmulhrsw	305419896, %xmm5
-        	pmulhrsw	0x12345678,%xmm5
-
-// CHECK: 	pmulhrsw	%xmm5, %xmm5
-        	pmulhrsw	%xmm5,%xmm5
-
-// CHECK: 	pshufb	3735928559(%ebx,%ecx,8), %mm3
-        	pshufb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pshufb	69, %mm3
-        	pshufb	0x45,%mm3
-
-// CHECK: 	pshufb	32493, %mm3
-        	pshufb	0x7eed,%mm3
-
-// CHECK: 	pshufb	3133065982, %mm3
-        	pshufb	0xbabecafe,%mm3
-
-// CHECK: 	pshufb	305419896, %mm3
-        	pshufb	0x12345678,%mm3
-
-// CHECK: 	pshufb	%mm3, %mm3
-        	pshufb	%mm3,%mm3
-
-// CHECK: 	pshufb	3735928559(%ebx,%ecx,8), %xmm5
-        	pshufb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pshufb	69, %xmm5
-        	pshufb	0x45,%xmm5
-
-// CHECK: 	pshufb	32493, %xmm5
-        	pshufb	0x7eed,%xmm5
-
-// CHECK: 	pshufb	3133065982, %xmm5
-        	pshufb	0xbabecafe,%xmm5
-
-// CHECK: 	pshufb	305419896, %xmm5
-        	pshufb	0x12345678,%xmm5
-
-// CHECK: 	pshufb	%xmm5, %xmm5
-        	pshufb	%xmm5,%xmm5
-
-// CHECK: 	psignb	3735928559(%ebx,%ecx,8), %mm3
-        	psignb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psignb	69, %mm3
-        	psignb	0x45,%mm3
-
-// CHECK: 	psignb	32493, %mm3
-        	psignb	0x7eed,%mm3
-
-// CHECK: 	psignb	3133065982, %mm3
-        	psignb	0xbabecafe,%mm3
-
-// CHECK: 	psignb	305419896, %mm3
-        	psignb	0x12345678,%mm3
-
-// CHECK: 	psignb	%mm3, %mm3
-        	psignb	%mm3,%mm3
-
-// CHECK: 	psignb	3735928559(%ebx,%ecx,8), %xmm5
-        	psignb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psignb	69, %xmm5
-        	psignb	0x45,%xmm5
-
-// CHECK: 	psignb	32493, %xmm5
-        	psignb	0x7eed,%xmm5
-
-// CHECK: 	psignb	3133065982, %xmm5
-        	psignb	0xbabecafe,%xmm5
-
-// CHECK: 	psignb	305419896, %xmm5
-        	psignb	0x12345678,%xmm5
-
-// CHECK: 	psignb	%xmm5, %xmm5
-        	psignb	%xmm5,%xmm5
-
-// CHECK: 	psignw	3735928559(%ebx,%ecx,8), %mm3
-        	psignw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psignw	69, %mm3
-        	psignw	0x45,%mm3
-
-// CHECK: 	psignw	32493, %mm3
-        	psignw	0x7eed,%mm3
-
-// CHECK: 	psignw	3133065982, %mm3
-        	psignw	0xbabecafe,%mm3
-
-// CHECK: 	psignw	305419896, %mm3
-        	psignw	0x12345678,%mm3
-
-// CHECK: 	psignw	%mm3, %mm3
-        	psignw	%mm3,%mm3
-
-// CHECK: 	psignw	3735928559(%ebx,%ecx,8), %xmm5
-        	psignw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psignw	69, %xmm5
-        	psignw	0x45,%xmm5
-
-// CHECK: 	psignw	32493, %xmm5
-        	psignw	0x7eed,%xmm5
-
-// CHECK: 	psignw	3133065982, %xmm5
-        	psignw	0xbabecafe,%xmm5
-
-// CHECK: 	psignw	305419896, %xmm5
-        	psignw	0x12345678,%xmm5
-
-// CHECK: 	psignw	%xmm5, %xmm5
-        	psignw	%xmm5,%xmm5
-
-// CHECK: 	psignd	3735928559(%ebx,%ecx,8), %mm3
-        	psignd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	psignd	69, %mm3
-        	psignd	0x45,%mm3
-
-// CHECK: 	psignd	32493, %mm3
-        	psignd	0x7eed,%mm3
-
-// CHECK: 	psignd	3133065982, %mm3
-        	psignd	0xbabecafe,%mm3
-
-// CHECK: 	psignd	305419896, %mm3
-        	psignd	0x12345678,%mm3
-
-// CHECK: 	psignd	%mm3, %mm3
-        	psignd	%mm3,%mm3
-
-// CHECK: 	psignd	3735928559(%ebx,%ecx,8), %xmm5
-        	psignd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	psignd	69, %xmm5
-        	psignd	0x45,%xmm5
-
-// CHECK: 	psignd	32493, %xmm5
-        	psignd	0x7eed,%xmm5
-
-// CHECK: 	psignd	3133065982, %xmm5
-        	psignd	0xbabecafe,%xmm5
-
-// CHECK: 	psignd	305419896, %xmm5
-        	psignd	0x12345678,%xmm5
-
-// CHECK: 	psignd	%xmm5, %xmm5
-        	psignd	%xmm5,%xmm5
-
-// CHECK: 	pabsb	3735928559(%ebx,%ecx,8), %mm3
-        	pabsb	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pabsb	69, %mm3
-        	pabsb	0x45,%mm3
-
-// CHECK: 	pabsb	32493, %mm3
-        	pabsb	0x7eed,%mm3
-
-// CHECK: 	pabsb	3133065982, %mm3
-        	pabsb	0xbabecafe,%mm3
-
-// CHECK: 	pabsb	305419896, %mm3
-        	pabsb	0x12345678,%mm3
-
-// CHECK: 	pabsb	%mm3, %mm3
-        	pabsb	%mm3,%mm3
-
-// CHECK: 	pabsb	3735928559(%ebx,%ecx,8), %xmm5
-        	pabsb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pabsb	69, %xmm5
-        	pabsb	0x45,%xmm5
-
-// CHECK: 	pabsb	32493, %xmm5
-        	pabsb	0x7eed,%xmm5
-
-// CHECK: 	pabsb	3133065982, %xmm5
-        	pabsb	0xbabecafe,%xmm5
-
-// CHECK: 	pabsb	305419896, %xmm5
-        	pabsb	0x12345678,%xmm5
-
-// CHECK: 	pabsb	%xmm5, %xmm5
-        	pabsb	%xmm5,%xmm5
-
-// CHECK: 	pabsw	3735928559(%ebx,%ecx,8), %mm3
-        	pabsw	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pabsw	69, %mm3
-        	pabsw	0x45,%mm3
-
-// CHECK: 	pabsw	32493, %mm3
-        	pabsw	0x7eed,%mm3
-
-// CHECK: 	pabsw	3133065982, %mm3
-        	pabsw	0xbabecafe,%mm3
-
-// CHECK: 	pabsw	305419896, %mm3
-        	pabsw	0x12345678,%mm3
-
-// CHECK: 	pabsw	%mm3, %mm3
-        	pabsw	%mm3,%mm3
-
-// CHECK: 	pabsw	3735928559(%ebx,%ecx,8), %xmm5
-        	pabsw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pabsw	69, %xmm5
-        	pabsw	0x45,%xmm5
-
-// CHECK: 	pabsw	32493, %xmm5
-        	pabsw	0x7eed,%xmm5
-
-// CHECK: 	pabsw	3133065982, %xmm5
-        	pabsw	0xbabecafe,%xmm5
-
-// CHECK: 	pabsw	305419896, %xmm5
-        	pabsw	0x12345678,%xmm5
-
-// CHECK: 	pabsw	%xmm5, %xmm5
-        	pabsw	%xmm5,%xmm5
-
-// CHECK: 	pabsd	3735928559(%ebx,%ecx,8), %mm3
-        	pabsd	0xdeadbeef(%ebx,%ecx,8),%mm3
-
-// CHECK: 	pabsd	69, %mm3
-        	pabsd	0x45,%mm3
-
-// CHECK: 	pabsd	32493, %mm3
-        	pabsd	0x7eed,%mm3
-
-// CHECK: 	pabsd	3133065982, %mm3
-        	pabsd	0xbabecafe,%mm3
-
-// CHECK: 	pabsd	305419896, %mm3
-        	pabsd	0x12345678,%mm3
-
-// CHECK: 	pabsd	%mm3, %mm3
-        	pabsd	%mm3,%mm3
-
-// CHECK: 	pabsd	3735928559(%ebx,%ecx,8), %xmm5
-        	pabsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pabsd	69, %xmm5
-        	pabsd	0x45,%xmm5
-
-// CHECK: 	pabsd	32493, %xmm5
-        	pabsd	0x7eed,%xmm5
-
-// CHECK: 	pabsd	3133065982, %xmm5
-        	pabsd	0xbabecafe,%xmm5
-
-// CHECK: 	pabsd	305419896, %xmm5
-        	pabsd	0x12345678,%xmm5
-
-// CHECK: 	pabsd	%xmm5, %xmm5
-        	pabsd	%xmm5,%xmm5
-
-// CHECK: 	femms
-        	femms
-
-// CHECK: 	movntdqa	3735928559(%ebx,%ecx,8), %xmm5
-        	movntdqa	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	movntdqa	69, %xmm5
-        	movntdqa	0x45,%xmm5
-
-// CHECK: 	movntdqa	32493, %xmm5
-        	movntdqa	0x7eed,%xmm5
-
-// CHECK: 	movntdqa	3133065982, %xmm5
-        	movntdqa	0xbabecafe,%xmm5
-
-// CHECK: 	movntdqa	305419896, %xmm5
-        	movntdqa	0x12345678,%xmm5
-
-// CHECK: 	packusdw	3735928559(%ebx,%ecx,8), %xmm5
-        	packusdw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	packusdw	69, %xmm5
-        	packusdw	0x45,%xmm5
-
-// CHECK: 	packusdw	32493, %xmm5
-        	packusdw	0x7eed,%xmm5
-
-// CHECK: 	packusdw	3133065982, %xmm5
-        	packusdw	0xbabecafe,%xmm5
-
-// CHECK: 	packusdw	305419896, %xmm5
-        	packusdw	0x12345678,%xmm5
-
-// CHECK: 	packusdw	%xmm5, %xmm5
-        	packusdw	%xmm5,%xmm5
-
-// CHECK: 	pcmpeqq	3735928559(%ebx,%ecx,8), %xmm5
-        	pcmpeqq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pcmpeqq	69, %xmm5
-        	pcmpeqq	0x45,%xmm5
-
-// CHECK: 	pcmpeqq	32493, %xmm5
-        	pcmpeqq	0x7eed,%xmm5
-
-// CHECK: 	pcmpeqq	3133065982, %xmm5
-        	pcmpeqq	0xbabecafe,%xmm5
-
-// CHECK: 	pcmpeqq	305419896, %xmm5
-        	pcmpeqq	0x12345678,%xmm5
-
-// CHECK: 	pcmpeqq	%xmm5, %xmm5
-        	pcmpeqq	%xmm5,%xmm5
-
-// CHECK: 	phminposuw	3735928559(%ebx,%ecx,8), %xmm5
-        	phminposuw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	phminposuw	69, %xmm5
-        	phminposuw	0x45,%xmm5
-
-// CHECK: 	phminposuw	32493, %xmm5
-        	phminposuw	0x7eed,%xmm5
-
-// CHECK: 	phminposuw	3133065982, %xmm5
-        	phminposuw	0xbabecafe,%xmm5
-
-// CHECK: 	phminposuw	305419896, %xmm5
-        	phminposuw	0x12345678,%xmm5
-
-// CHECK: 	phminposuw	%xmm5, %xmm5
-        	phminposuw	%xmm5,%xmm5
-
-// CHECK: 	pmaxsb	3735928559(%ebx,%ecx,8), %xmm5
-        	pmaxsb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmaxsb	69, %xmm5
-        	pmaxsb	0x45,%xmm5
-
-// CHECK: 	pmaxsb	32493, %xmm5
-        	pmaxsb	0x7eed,%xmm5
-
-// CHECK: 	pmaxsb	3133065982, %xmm5
-        	pmaxsb	0xbabecafe,%xmm5
-
-// CHECK: 	pmaxsb	305419896, %xmm5
-        	pmaxsb	0x12345678,%xmm5
-
-// CHECK: 	pmaxsb	%xmm5, %xmm5
-        	pmaxsb	%xmm5,%xmm5
-
-// CHECK: 	pmaxsd	3735928559(%ebx,%ecx,8), %xmm5
-        	pmaxsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmaxsd	69, %xmm5
-        	pmaxsd	0x45,%xmm5
-
-// CHECK: 	pmaxsd	32493, %xmm5
-        	pmaxsd	0x7eed,%xmm5
-
-// CHECK: 	pmaxsd	3133065982, %xmm5
-        	pmaxsd	0xbabecafe,%xmm5
-
-// CHECK: 	pmaxsd	305419896, %xmm5
-        	pmaxsd	0x12345678,%xmm5
-
-// CHECK: 	pmaxsd	%xmm5, %xmm5
-        	pmaxsd	%xmm5,%xmm5
-
-// CHECK: 	pmaxud	3735928559(%ebx,%ecx,8), %xmm5
-        	pmaxud	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmaxud	69, %xmm5
-        	pmaxud	0x45,%xmm5
-
-// CHECK: 	pmaxud	32493, %xmm5
-        	pmaxud	0x7eed,%xmm5
-
-// CHECK: 	pmaxud	3133065982, %xmm5
-        	pmaxud	0xbabecafe,%xmm5
-
-// CHECK: 	pmaxud	305419896, %xmm5
-        	pmaxud	0x12345678,%xmm5
-
-// CHECK: 	pmaxud	%xmm5, %xmm5
-        	pmaxud	%xmm5,%xmm5
-
-// CHECK: 	pmaxuw	3735928559(%ebx,%ecx,8), %xmm5
-        	pmaxuw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmaxuw	69, %xmm5
-        	pmaxuw	0x45,%xmm5
-
-// CHECK: 	pmaxuw	32493, %xmm5
-        	pmaxuw	0x7eed,%xmm5
-
-// CHECK: 	pmaxuw	3133065982, %xmm5
-        	pmaxuw	0xbabecafe,%xmm5
-
-// CHECK: 	pmaxuw	305419896, %xmm5
-        	pmaxuw	0x12345678,%xmm5
-
-// CHECK: 	pmaxuw	%xmm5, %xmm5
-        	pmaxuw	%xmm5,%xmm5
-
-// CHECK: 	pminsb	3735928559(%ebx,%ecx,8), %xmm5
-        	pminsb	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pminsb	69, %xmm5
-        	pminsb	0x45,%xmm5
-
-// CHECK: 	pminsb	32493, %xmm5
-        	pminsb	0x7eed,%xmm5
-
-// CHECK: 	pminsb	3133065982, %xmm5
-        	pminsb	0xbabecafe,%xmm5
-
-// CHECK: 	pminsb	305419896, %xmm5
-        	pminsb	0x12345678,%xmm5
-
-// CHECK: 	pminsb	%xmm5, %xmm5
-        	pminsb	%xmm5,%xmm5
-
-// CHECK: 	pminsd	3735928559(%ebx,%ecx,8), %xmm5
-        	pminsd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pminsd	69, %xmm5
-        	pminsd	0x45,%xmm5
-
-// CHECK: 	pminsd	32493, %xmm5
-        	pminsd	0x7eed,%xmm5
-
-// CHECK: 	pminsd	3133065982, %xmm5
-        	pminsd	0xbabecafe,%xmm5
-
-// CHECK: 	pminsd	305419896, %xmm5
-        	pminsd	0x12345678,%xmm5
-
-// CHECK: 	pminsd	%xmm5, %xmm5
-        	pminsd	%xmm5,%xmm5
-
-// CHECK: 	pminud	3735928559(%ebx,%ecx,8), %xmm5
-        	pminud	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pminud	69, %xmm5
-        	pminud	0x45,%xmm5
-
-// CHECK: 	pminud	32493, %xmm5
-        	pminud	0x7eed,%xmm5
-
-// CHECK: 	pminud	3133065982, %xmm5
-        	pminud	0xbabecafe,%xmm5
-
-// CHECK: 	pminud	305419896, %xmm5
-        	pminud	0x12345678,%xmm5
-
-// CHECK: 	pminud	%xmm5, %xmm5
-        	pminud	%xmm5,%xmm5
-
-// CHECK: 	pminuw	3735928559(%ebx,%ecx,8), %xmm5
-        	pminuw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pminuw	69, %xmm5
-        	pminuw	0x45,%xmm5
-
-// CHECK: 	pminuw	32493, %xmm5
-        	pminuw	0x7eed,%xmm5
-
-// CHECK: 	pminuw	3133065982, %xmm5
-        	pminuw	0xbabecafe,%xmm5
-
-// CHECK: 	pminuw	305419896, %xmm5
-        	pminuw	0x12345678,%xmm5
-
-// CHECK: 	pminuw	%xmm5, %xmm5
-        	pminuw	%xmm5,%xmm5
-
-// CHECK: 	pmovsxbw	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxbw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxbw	69, %xmm5
-        	pmovsxbw	0x45,%xmm5
-
-// CHECK: 	pmovsxbw	32493, %xmm5
-        	pmovsxbw	0x7eed,%xmm5
-
-// CHECK: 	pmovsxbw	3133065982, %xmm5
-        	pmovsxbw	0xbabecafe,%xmm5
-
-// CHECK: 	pmovsxbw	305419896, %xmm5
-        	pmovsxbw	0x12345678,%xmm5
-
-// CHECK: 	pmovsxbw	%xmm5, %xmm5
-        	pmovsxbw	%xmm5,%xmm5
-
-// CHECK: 	pmovsxbd	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxbd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxbd	69, %xmm5
-        	pmovsxbd	0x45,%xmm5
-
-// CHECK: 	pmovsxbd	32493, %xmm5
-        	pmovsxbd	0x7eed,%xmm5
-
-// CHECK: 	pmovsxbd	3133065982, %xmm5
-        	pmovsxbd	0xbabecafe,%xmm5
-
-// CHECK: 	pmovsxbd	305419896, %xmm5
-        	pmovsxbd	0x12345678,%xmm5
-
-// CHECK: 	pmovsxbd	%xmm5, %xmm5
-        	pmovsxbd	%xmm5,%xmm5
-
-// CHECK: 	pmovsxbq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxbq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxbq	69, %xmm5
-        	pmovsxbq	0x45,%xmm5
-
-// CHECK: 	pmovsxbq	32493, %xmm5
-        	pmovsxbq	0x7eed,%xmm5
-
-// CHECK: 	pmovsxbq	3133065982, %xmm5
-        	pmovsxbq	0xbabecafe,%xmm5
-
-// CHECK: 	pmovsxbq	305419896, %xmm5
-        	pmovsxbq	0x12345678,%xmm5
-
-// CHECK: 	pmovsxbq	%xmm5, %xmm5
-        	pmovsxbq	%xmm5,%xmm5
-
-// CHECK: 	pmovsxwd	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxwd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxwd	69, %xmm5
-        	pmovsxwd	0x45,%xmm5
-
-// CHECK: 	pmovsxwd	32493, %xmm5
-        	pmovsxwd	0x7eed,%xmm5
-
-// CHECK: 	pmovsxwd	3133065982, %xmm5
-        	pmovsxwd	0xbabecafe,%xmm5
-
-// CHECK: 	pmovsxwd	305419896, %xmm5
-        	pmovsxwd	0x12345678,%xmm5
-
-// CHECK: 	pmovsxwd	%xmm5, %xmm5
-        	pmovsxwd	%xmm5,%xmm5
-
-// CHECK: 	pmovsxwq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxwq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxwq	69, %xmm5
-        	pmovsxwq	0x45,%xmm5
-
-// CHECK: 	pmovsxwq	32493, %xmm5
-        	pmovsxwq	0x7eed,%xmm5
-
-// CHECK: 	pmovsxwq	3133065982, %xmm5
-        	pmovsxwq	0xbabecafe,%xmm5
-
-// CHECK: 	pmovsxwq	305419896, %xmm5
-        	pmovsxwq	0x12345678,%xmm5
-
-// CHECK: 	pmovsxwq	%xmm5, %xmm5
-        	pmovsxwq	%xmm5,%xmm5
-
-// CHECK: 	pmovsxdq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovsxdq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovsxdq	69, %xmm5
-        	pmovsxdq	0x45,%xmm5
-
-// CHECK: 	pmovsxdq	32493, %xmm5
-        	pmovsxdq	0x7eed,%xmm5
-
-// CHECK: 	pmovsxdq	3133065982, %xmm5
-        	pmovsxdq	0xbabecafe,%xmm5
-
-// CHECK: 	pmovsxdq	305419896, %xmm5
-        	pmovsxdq	0x12345678,%xmm5
-
-// CHECK: 	pmovsxdq	%xmm5, %xmm5
-        	pmovsxdq	%xmm5,%xmm5
-
-// CHECK: 	pmovzxbw	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxbw	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxbw	69, %xmm5
-        	pmovzxbw	0x45,%xmm5
-
-// CHECK: 	pmovzxbw	32493, %xmm5
-        	pmovzxbw	0x7eed,%xmm5
-
-// CHECK: 	pmovzxbw	3133065982, %xmm5
-        	pmovzxbw	0xbabecafe,%xmm5
-
-// CHECK: 	pmovzxbw	305419896, %xmm5
-        	pmovzxbw	0x12345678,%xmm5
-
-// CHECK: 	pmovzxbw	%xmm5, %xmm5
-        	pmovzxbw	%xmm5,%xmm5
-
-// CHECK: 	pmovzxbd	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxbd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxbd	69, %xmm5
-        	pmovzxbd	0x45,%xmm5
-
-// CHECK: 	pmovzxbd	32493, %xmm5
-        	pmovzxbd	0x7eed,%xmm5
-
-// CHECK: 	pmovzxbd	3133065982, %xmm5
-        	pmovzxbd	0xbabecafe,%xmm5
-
-// CHECK: 	pmovzxbd	305419896, %xmm5
-        	pmovzxbd	0x12345678,%xmm5
-
-// CHECK: 	pmovzxbd	%xmm5, %xmm5
-        	pmovzxbd	%xmm5,%xmm5
-
-// CHECK: 	pmovzxbq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxbq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxbq	69, %xmm5
-        	pmovzxbq	0x45,%xmm5
-
-// CHECK: 	pmovzxbq	32493, %xmm5
-        	pmovzxbq	0x7eed,%xmm5
-
-// CHECK: 	pmovzxbq	3133065982, %xmm5
-        	pmovzxbq	0xbabecafe,%xmm5
-
-// CHECK: 	pmovzxbq	305419896, %xmm5
-        	pmovzxbq	0x12345678,%xmm5
-
-// CHECK: 	pmovzxbq	%xmm5, %xmm5
-        	pmovzxbq	%xmm5,%xmm5
-
-// CHECK: 	pmovzxwd	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxwd	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxwd	69, %xmm5
-        	pmovzxwd	0x45,%xmm5
-
-// CHECK: 	pmovzxwd	32493, %xmm5
-        	pmovzxwd	0x7eed,%xmm5
-
-// CHECK: 	pmovzxwd	3133065982, %xmm5
-        	pmovzxwd	0xbabecafe,%xmm5
-
-// CHECK: 	pmovzxwd	305419896, %xmm5
-        	pmovzxwd	0x12345678,%xmm5
-
-// CHECK: 	pmovzxwd	%xmm5, %xmm5
-        	pmovzxwd	%xmm5,%xmm5
-
-// CHECK: 	pmovzxwq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxwq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxwq	69, %xmm5
-        	pmovzxwq	0x45,%xmm5
-
-// CHECK: 	pmovzxwq	32493, %xmm5
-        	pmovzxwq	0x7eed,%xmm5
-
-// CHECK: 	pmovzxwq	3133065982, %xmm5
-        	pmovzxwq	0xbabecafe,%xmm5
-
-// CHECK: 	pmovzxwq	305419896, %xmm5
-        	pmovzxwq	0x12345678,%xmm5
-
-// CHECK: 	pmovzxwq	%xmm5, %xmm5
-        	pmovzxwq	%xmm5,%xmm5
-
-// CHECK: 	pmovzxdq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmovzxdq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmovzxdq	69, %xmm5
-        	pmovzxdq	0x45,%xmm5
-
-// CHECK: 	pmovzxdq	32493, %xmm5
-        	pmovzxdq	0x7eed,%xmm5
-
-// CHECK: 	pmovzxdq	3133065982, %xmm5
-        	pmovzxdq	0xbabecafe,%xmm5
-
-// CHECK: 	pmovzxdq	305419896, %xmm5
-        	pmovzxdq	0x12345678,%xmm5
-
-// CHECK: 	pmovzxdq	%xmm5, %xmm5
-        	pmovzxdq	%xmm5,%xmm5
-
-// CHECK: 	pmuldq	3735928559(%ebx,%ecx,8), %xmm5
-        	pmuldq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmuldq	69, %xmm5
-        	pmuldq	0x45,%xmm5
-
-// CHECK: 	pmuldq	32493, %xmm5
-        	pmuldq	0x7eed,%xmm5
-
-// CHECK: 	pmuldq	3133065982, %xmm5
-        	pmuldq	0xbabecafe,%xmm5
-
-// CHECK: 	pmuldq	305419896, %xmm5
-        	pmuldq	0x12345678,%xmm5
-
-// CHECK: 	pmuldq	%xmm5, %xmm5
-        	pmuldq	%xmm5,%xmm5
-
-// CHECK: 	pmulld	3735928559(%ebx,%ecx,8), %xmm5
-        	pmulld	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pmulld	69, %xmm5
-        	pmulld	0x45,%xmm5
-
-// CHECK: 	pmulld	32493, %xmm5
-        	pmulld	0x7eed,%xmm5
-
-// CHECK: 	pmulld	3133065982, %xmm5
-        	pmulld	0xbabecafe,%xmm5
-
-// CHECK: 	pmulld	305419896, %xmm5
-        	pmulld	0x12345678,%xmm5
-
-// CHECK: 	pmulld	%xmm5, %xmm5
-        	pmulld	%xmm5,%xmm5
-
-// CHECK: 	ptest 	3735928559(%ebx,%ecx,8), %xmm5
-        	ptest	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	ptest 	69, %xmm5
-        	ptest	0x45,%xmm5
-
-// CHECK: 	ptest 	32493, %xmm5
-        	ptest	0x7eed,%xmm5
-
-// CHECK: 	ptest 	3133065982, %xmm5
-        	ptest	0xbabecafe,%xmm5
-
-// CHECK: 	ptest 	305419896, %xmm5
-        	ptest	0x12345678,%xmm5
-
-// CHECK: 	ptest 	%xmm5, %xmm5
-        	ptest	%xmm5,%xmm5
-
-// CHECK: 	crc32b 	%bl, %eax
-                crc32b %bl, %eax
-
-// CHECK: 	crc32b 	4(%ebx), %eax
-                crc32b 4(%ebx), %eax
-
-// CHECK: 	crc32w 	%bx, %eax
-                crc32w %bx, %eax
-
-// CHECK: 	crc32w 	4(%ebx), %eax
-                crc32w 4(%ebx), %eax
-
-// CHECK: 	crc32l 	%ebx, %eax
-                crc32l %ebx, %eax
-
-// CHECK: 	crc32l 	4(%ebx), %eax
-                crc32l 4(%ebx), %eax
-
-// CHECK: 	crc32l 	3735928559(%ebx,%ecx,8), %ecx
-                crc32l 0xdeadbeef(%ebx,%ecx,8),%ecx
-
-// CHECK: 	crc32l 	69, %ecx
-                crc32l 0x45,%ecx
-
-// CHECK: 	crc32l 	32493, %ecx
-                crc32l 0x7eed,%ecx
-
-// CHECK: 	crc32l 	3133065982, %ecx
-                crc32l 0xbabecafe,%ecx
-
-// CHECK: 	crc32l 	%ecx, %ecx
-                crc32l %ecx,%ecx
-
-// CHECK: 	pcmpgtq	3735928559(%ebx,%ecx,8), %xmm5
-        	pcmpgtq	0xdeadbeef(%ebx,%ecx,8),%xmm5
-
-// CHECK: 	pcmpgtq	69, %xmm5
-        	pcmpgtq	0x45,%xmm5
-
-// CHECK: 	pcmpgtq	32493, %xmm5
-        	pcmpgtq	0x7eed,%xmm5
-
-// CHECK: 	pcmpgtq	3133065982, %xmm5
-        	pcmpgtq	0xbabecafe,%xmm5
-
-// CHECK: 	pcmpgtq	305419896, %xmm5
-        	pcmpgtq	0x12345678,%xmm5
-
-// CHECK: 	pcmpgtq	%xmm5, %xmm5
-        	pcmpgtq	%xmm5,%xmm5
-
-// CHECK: 	aesimc	%xmm0, %xmm1
-                aesimc %xmm0,%xmm1
-
-// CHECK: 	aesimc	(%eax), %xmm1
-                aesimc (%eax),%xmm1
-
-// CHECK: 	aesenc	%xmm1, %xmm2
-                aesenc %xmm1,%xmm2
-
-// CHECK: 	aesenc	4(%ebx), %xmm2
-                aesenc 4(%ebx),%xmm2
-
-// CHECK: 	aesenclast	%xmm3, %xmm4
-                aesenclast %xmm3,%xmm4
-
-// CHECK: 	aesenclast	4(%edx,%edi), %xmm4
-                aesenclast 4(%edx,%edi),%xmm4
-
-// CHECK: 	aesdec	%xmm5, %xmm6
-                aesdec %xmm5,%xmm6
-
-// CHECK: 	aesdec	4(%ecx,%eax,8), %xmm6
-                aesdec 4(%ecx,%eax,8),%xmm6
-
-// CHECK: 	aesdeclast	%xmm7, %xmm0
-                aesdeclast %xmm7,%xmm0
-
-// CHECK: 	aesdeclast	3405691582, %xmm0
-                aesdeclast 0xcafebabe,%xmm0
-
-// CHECK: 	aeskeygenassist	$125, %xmm1, %xmm2
-                aeskeygenassist $125, %xmm1, %xmm2
-
-// CHECK: 	aeskeygenassist	$125, (%edx,%eax,4), %xmm2
-                aeskeygenassist $125, (%edx,%eax,4), %xmm2
-
 // CHECK:   blendvps	(%eax), %xmm1   # encoding: [0x66,0x0f,0x38,0x14,0x08]
             blendvps (%eax), %xmm1
 // CHECK:   blendvps	%xmm2, %xmm1    # encoding: [0x66,0x0f,0x38,0x14,0xca]
@@ -19622,31 +10520,31 @@
 // immediate. Check both forms here.
 // CHECK: blendps $129, %xmm2, %xmm1
           blendps $0x81, %xmm2, %xmm1
-// CHECK: blendps $-64, %xmm2, %xmm1
+// CHECK: blendps $192, %xmm2, %xmm1
           blendps $-64, %xmm2, %xmm1
 // CHECK: blendpd $129, %xmm2, %xmm1
           blendpd $0x81, %xmm2, %xmm1
-// CHECK: blendpd $-64, %xmm2, %xmm1
+// CHECK: blendpd $192, %xmm2, %xmm1
           blendpd $-64, %xmm2, %xmm1
 // CHECK: pblendw $129, %xmm2, %xmm1
           pblendw $0x81, %xmm2, %xmm1
-// CHECK: pblendw $-64, %xmm2, %xmm1
+// CHECK: pblendw $192, %xmm2, %xmm1
           pblendw $-64, %xmm2, %xmm1
 // CHECK: mpsadbw $129, %xmm2, %xmm1
           mpsadbw $0x81, %xmm2, %xmm1
-// CHECK: mpsadbw $-64, %xmm2, %xmm1
+// CHECK: mpsadbw $192, %xmm2, %xmm1
           mpsadbw $-64, %xmm2, %xmm1
 // CHECK: dpps $129, %xmm2, %xmm1
           dpps $0x81, %xmm2, %xmm1
-// CHECK: dpps $-64, %xmm2, %xmm1
+// CHECK: dpps $192, %xmm2, %xmm1
           dpps $-64, %xmm2, %xmm1
 // CHECK: dppd $129, %xmm2, %xmm1
           dppd $0x81, %xmm2, %xmm1
-// CHECK: dppd $-64, %xmm2, %xmm1
+// CHECK: dppd $192, %xmm2, %xmm1
           dppd $-64, %xmm2, %xmm1
 // CHECK: insertps $129, %xmm2, %xmm1
           insertps $0x81, %xmm2, %xmm1
-// CHECK: insertps $-64, %xmm2, %xmm1
+// CHECK: insertps $192, %xmm2, %xmm1
           insertps $-64, %xmm2, %xmm1
 
 // PR13253 handle implicit optional third argument that must always be xmm0
@@ -19708,3 +10606,139 @@ btc $4, (%eax)
 btcw $4, (%eax)
 btcl $4, (%eax)
 btcq $4, (%eax)
+
+// CHECK: clflushopt	3735928559(%ebx,%ecx,8)
+// CHECK:  encoding: [0x66,0x0f,0xae,0xbc,0xcb,0xef,0xbe,0xad,0xde]
+        	clflushopt	0xdeadbeef(%ebx,%ecx,8)
+
+// CHECK: clflushopt	32493
+// CHECK:  encoding: [0x66,0x0f,0xae,0x3d,0xed,0x7e,0x00,0x00]
+        	clflushopt	0x7eed
+
+// CHECK: clflushopt	3133065982
+// CHECK:  encoding: [0x66,0x0f,0xae,0x3d,0xfe,0xca,0xbe,0xba]
+        	clflushopt	0xbabecafe
+
+// CHECK: clflushopt	305419896
+// CHECK:  encoding: [0x66,0x0f,0xae,0x3d,0x78,0x56,0x34,0x12]
+        	clflushopt	0x12345678
+
+// CHECK: clwb	3735928559(%ebx,%ecx,8)
+// CHECK:  encoding: [0x66,0x0f,0xae,0xb4,0xcb,0xef,0xbe,0xad,0xde]
+        	clwb	0xdeadbeef(%ebx,%ecx,8)
+
+// CHECK: clwb	32493
+// CHECK:  encoding: [0x66,0x0f,0xae,0x35,0xed,0x7e,0x00,0x00]
+        	clwb	0x7eed
+
+// CHECK: clwb	3133065982
+// CHECK:  encoding: [0x66,0x0f,0xae,0x35,0xfe,0xca,0xbe,0xba]
+        	clwb	0xbabecafe
+
+// CHECK: clwb	305419896
+// CHECK:  encoding: [0x66,0x0f,0xae,0x35,0x78,0x56,0x34,0x12]
+        	clwb	0x12345678
+
+// CHECK: pcommit
+// CHECK:  encoding: [0x66,0x0f,0xae,0xf8]
+        	pcommit
+
+// CHECK: xsave	3735928559(%ebx,%ecx,8)
+// CHECK:  encoding: [0x0f,0xae,0xa4,0xcb,0xef,0xbe,0xad,0xde]
+        	xsave	0xdeadbeef(%ebx,%ecx,8)
+
+// CHECK: xsave	32493
+// CHECK:  encoding: [0x0f,0xae,0x25,0xed,0x7e,0x00,0x00]
+        	xsave	0x7eed
+
+// CHECK: xsave	3133065982
+// CHECK:  encoding: [0x0f,0xae,0x25,0xfe,0xca,0xbe,0xba]
+        	xsave	0xbabecafe
+
+// CHECK: xsave	305419896
+// CHECK:  encoding: [0x0f,0xae,0x25,0x78,0x56,0x34,0x12]
+        	xsave	0x12345678
+
+// CHECK: xrstor	3735928559(%ebx,%ecx,8)
+// CHECK:  encoding: [0x0f,0xae,0xac,0xcb,0xef,0xbe,0xad,0xde]
+        	xrstor	0xdeadbeef(%ebx,%ecx,8)
+
+// CHECK: xrstor	32493
+// CHECK:  encoding: [0x0f,0xae,0x2d,0xed,0x7e,0x00,0x00]
+        	xrstor	0x7eed
+
+// CHECK: xrstor	3133065982
+// CHECK:  encoding: [0x0f,0xae,0x2d,0xfe,0xca,0xbe,0xba]
+        	xrstor	0xbabecafe
+
+// CHECK: xrstor	305419896
+// CHECK:  encoding: [0x0f,0xae,0x2d,0x78,0x56,0x34,0x12]
+        	xrstor	0x12345678
+
+// CHECK: xsaveopt	3735928559(%ebx,%ecx,8)
+// CHECK:  encoding: [0x0f,0xae,0xb4,0xcb,0xef,0xbe,0xad,0xde]
+        	xsaveopt	0xdeadbeef(%ebx,%ecx,8)
+
+// CHECK: xsaveopt	32493
+// CHECK:  encoding: [0x0f,0xae,0x35,0xed,0x7e,0x00,0x00]
+        	xsaveopt	0x7eed
+
+// CHECK: xsaveopt	3133065982
+// CHECK:  encoding: [0x0f,0xae,0x35,0xfe,0xca,0xbe,0xba]
+        	xsaveopt	0xbabecafe
+
+// CHECK: xsaveopt	305419896
+// CHECK:  encoding: [0x0f,0xae,0x35,0x78,0x56,0x34,0x12]
+        	xsaveopt	0x12345678
+
+// CHECK: xsaves	3735928559(%ebx,%ecx,8)
+// CHECK:  encoding: [0x0f,0xc7,0xac,0xcb,0xef,0xbe,0xad,0xde]
+        	xsaves	0xdeadbeef(%ebx,%ecx,8)
+
+// CHECK: xsaves	32493
+// CHECK:  encoding: [0x0f,0xc7,0x2d,0xed,0x7e,0x00,0x00]
+        	xsaves	0x7eed
+
+// CHECK: xsaves	3133065982
+// CHECK:  encoding: [0x0f,0xc7,0x2d,0xfe,0xca,0xbe,0xba]
+        	xsaves	0xbabecafe
+
+// CHECK: xsaves	305419896
+// CHECK:  encoding: [0x0f,0xc7,0x2d,0x78,0x56,0x34,0x12]
+        	xsaves	0x12345678
+
+// CHECK: xsavec	3735928559(%ebx,%ecx,8)
+// CHECK:  encoding: [0x0f,0xc7,0xa4,0xcb,0xef,0xbe,0xad,0xde]
+        	xsavec	0xdeadbeef(%ebx,%ecx,8)
+
+// CHECK: xsavec	32493
+// CHECK:  encoding: [0x0f,0xc7,0x25,0xed,0x7e,0x00,0x00]
+        	xsavec	0x7eed
+
+// CHECK: xsavec	3133065982
+// CHECK:  encoding: [0x0f,0xc7,0x25,0xfe,0xca,0xbe,0xba]
+        	xsavec	0xbabecafe
+
+// CHECK: xsavec	305419896
+// CHECK:  encoding: [0x0f,0xc7,0x25,0x78,0x56,0x34,0x12]
+        	xsavec	0x12345678
+
+// CHECK: xrstors	3735928559(%ebx,%ecx,8)
+// CHECK:  encoding: [0x0f,0xc7,0x9c,0xcb,0xef,0xbe,0xad,0xde]
+        	xrstors	0xdeadbeef(%ebx,%ecx,8)
+
+// CHECK: xrstors	32493
+// CHECK:  encoding: [0x0f,0xc7,0x1d,0xed,0x7e,0x00,0x00]
+        	xrstors	0x7eed
+
+// CHECK: xrstors	3133065982
+// CHECK:  encoding: [0x0f,0xc7,0x1d,0xfe,0xca,0xbe,0xba]
+        	xrstors	0xbabecafe
+
+// CHECK: xrstors	305419896
+// CHECK:  encoding: [0x0f,0xc7,0x1d,0x78,0x56,0x34,0x12]
+        	xrstors	0x12345678
+
+// CHECK: getsec
+// CHECK:  encoding: [0x0f,0x37]
+        	getsec
diff --git a/test/MC/X86/x86-32.s b/test/MC/X86/x86-32.s
index bebaa65..648eb5a 100644
--- a/test/MC/X86/x86-32.s
+++ b/test/MC/X86/x86-32.s
@@ -289,35 +289,35 @@ cmovnae	%bx,%bx
 
 // Check matching of instructions which embed the SSE comparison code.
 
-// CHECK: cmpps $0, %xmm0, %xmm1
+// CHECK: cmpeqps %xmm0, %xmm1
 // CHECK: encoding: [0x0f,0xc2,0xc8,0x00]
         cmpeqps %xmm0, %xmm1
 
-// CHECK: cmppd $1, %xmm0, %xmm1
+// CHECK: cmpltpd %xmm0, %xmm1
 // CHECK: encoding: [0x66,0x0f,0xc2,0xc8,0x01]
         cmpltpd %xmm0, %xmm1
 
-// CHECK: cmpss $2, %xmm0, %xmm1
+// CHECK: cmpless %xmm0, %xmm1
 // CHECK: encoding: [0xf3,0x0f,0xc2,0xc8,0x02]
         cmpless %xmm0, %xmm1
 
-// CHECK: cmppd $3, %xmm0, %xmm1
+// CHECK: cmpunordpd %xmm0, %xmm1
 // CHECK: encoding: [0x66,0x0f,0xc2,0xc8,0x03]
         cmpunordpd %xmm0, %xmm1
 
-// CHECK: cmpps $4, %xmm0, %xmm1
+// CHECK: cmpneqps %xmm0, %xmm1
 // CHECK: encoding: [0x0f,0xc2,0xc8,0x04]
         cmpneqps %xmm0, %xmm1
 
-// CHECK: cmppd $5, %xmm0, %xmm1
+// CHECK: cmpnltpd %xmm0, %xmm1
 // CHECK: encoding: [0x66,0x0f,0xc2,0xc8,0x05]
         cmpnltpd %xmm0, %xmm1
 
-// CHECK: cmpss $6, %xmm0, %xmm1
+// CHECK: cmpnless %xmm0, %xmm1
 // CHECK: encoding: [0xf3,0x0f,0xc2,0xc8,0x06]
         cmpnless %xmm0, %xmm1
 
-// CHECK: cmpsd $7, %xmm0, %xmm1
+// CHECK: cmpordsd %xmm0, %xmm1
 // CHECK: encoding: [0xf2,0x0f,0xc2,0xc8,0x07]
         cmpordsd %xmm0, %xmm1
 
diff --git a/test/MC/X86/x86-64-avx512bw.s b/test/MC/X86/x86-64-avx512bw.s
index 5155504..7aa7afa 100644
--- a/test/MC/X86/x86-64-avx512bw.s
+++ b/test/MC/X86/x86-64-avx512bw.s
@@ -72,6 +72,29 @@
 // CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xfd,0x8a,0xc0,0xdf,0xff,0xff]
           vpaddw -8256(%rdx), %zmm24, %zmm17
 
+// CHECK: vpbroadcastb %eax, %zmm19
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x48,0x7a,0xd8]
+          vpbroadcastb %eax, %zmm19
+
+// CHECK: vpbroadcastb %eax, %zmm19 {%k7}
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x4f,0x7a,0xd8]
+          vpbroadcastb %eax, %zmm19 {%k7}
+
+// CHECK: vpbroadcastb %eax, %zmm19 {%k7} {z}
+// CHECK:  encoding: [0x62,0xe2,0x7d,0xcf,0x7a,0xd8]
+          vpbroadcastb %eax, %zmm19 {%k7} {z}
+
+// CHECK: vpbroadcastw %eax, %zmm24
+// CHECK:  encoding: [0x62,0x62,0x7d,0x48,0x7b,0xc0]
+          vpbroadcastw %eax, %zmm24
+
+// CHECK: vpbroadcastw %eax, %zmm24 {%k1}
+// CHECK:  encoding: [0x62,0x62,0x7d,0x49,0x7b,0xc0]
+          vpbroadcastw %eax, %zmm24 {%k1}
+
+// CHECK: vpbroadcastw %eax, %zmm24 {%k1} {z}
+// CHECK:  encoding: [0x62,0x62,0x7d,0xc9,0x7b,0xc0]
+          vpbroadcastw %eax, %zmm24 {%k1} {z}
 // CHECK: vpcmpeqb %zmm26, %zmm26, %k4
 // CHECK:  encoding: [0x62,0x91,0x2d,0x40,0x74,0xe2]
           vpcmpeqb %zmm26, %zmm26, %k4
@@ -852,6 +875,7 @@
 // CHECK:  encoding: [0x62,0x61,0xff,0x48,0x7f,0xa2,0xc0,0xdf,0xff,0xff]
           vmovdqu16 %zmm28, -8256(%rdx)
 
+
 // CHECK: vpcmpb $171, %zmm25, %zmm26, %k3
 // CHECK:  encoding: [0x62,0x93,0x2d,0x40,0x3f,0xd9,0xab]
           vpcmpb $171, %zmm25, %zmm26, %k3
@@ -888,6 +912,166 @@
 // CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x7b]
           vpcmpb $123, -8256(%rdx), %zmm26, %k3
 
+// CHECK: vpcmpltb %zmm25, %zmm26, %k3
+// CHECK:  encoding: [0x62,0x93,0x2d,0x40,0x3f,0xd9,0x01]
+          vpcmpltb %zmm25, %zmm26, %k3
+
+// CHECK: vpcmpltb %zmm25, %zmm26, %k3 {%k7}
+// CHECK:  encoding: [0x62,0x93,0x2d,0x47,0x3f,0xd9,0x01]
+          vpcmpltb %zmm25, %zmm26, %k3 {%k7}
+
+// CHECK: vpcmpltb (%rcx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x19,0x01]
+          vpcmpltb (%rcx), %zmm26, %k3
+
+// CHECK: vpcmpltb 291(%rax,%r14,8), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xb3,0x2d,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x01]
+          vpcmpltb 291(%rax,%r14,8), %zmm26, %k3
+
+// CHECK: vpcmpltb 8128(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x7f,0x01]
+          vpcmpltb 8128(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpltb 8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x01]
+          vpcmpltb 8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpltb -8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x80,0x01]
+          vpcmpltb -8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpltb -8256(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x01]
+          vpcmpltb -8256(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpleb %zmm25, %zmm26, %k3
+// CHECK:  encoding: [0x62,0x93,0x2d,0x40,0x3f,0xd9,0x02]
+          vpcmpleb %zmm25, %zmm26, %k3
+
+// CHECK: vpcmpleb %zmm25, %zmm26, %k3 {%k7}
+// CHECK:  encoding: [0x62,0x93,0x2d,0x47,0x3f,0xd9,0x02]
+          vpcmpleb %zmm25, %zmm26, %k3 {%k7}
+
+// CHECK: vpcmpleb (%rcx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x19,0x02]
+          vpcmpleb (%rcx), %zmm26, %k3
+
+// CHECK: vpcmpleb 291(%rax,%r14,8), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xb3,0x2d,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x02]
+          vpcmpleb 291(%rax,%r14,8), %zmm26, %k3
+
+// CHECK: vpcmpleb 8128(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x7f,0x02]
+          vpcmpleb 8128(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpleb 8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x02]
+          vpcmpleb 8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpleb -8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x80,0x02]
+          vpcmpleb -8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpleb -8256(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x02]
+          vpcmpleb -8256(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpneqb %zmm25, %zmm26, %k3
+// CHECK:  encoding: [0x62,0x93,0x2d,0x40,0x3f,0xd9,0x04]
+          vpcmpneqb %zmm25, %zmm26, %k3
+
+// CHECK: vpcmpneqb %zmm25, %zmm26, %k3 {%k7}
+// CHECK:  encoding: [0x62,0x93,0x2d,0x47,0x3f,0xd9,0x04]
+          vpcmpneqb %zmm25, %zmm26, %k3 {%k7}
+
+// CHECK: vpcmpneqb (%rcx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x19,0x04]
+          vpcmpneqb (%rcx), %zmm26, %k3
+
+// CHECK: vpcmpneqb 291(%rax,%r14,8), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xb3,0x2d,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x04]
+          vpcmpneqb 291(%rax,%r14,8), %zmm26, %k3
+
+// CHECK: vpcmpneqb 8128(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x7f,0x04]
+          vpcmpneqb 8128(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpneqb 8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x04]
+          vpcmpneqb 8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpneqb -8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x80,0x04]
+          vpcmpneqb -8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpneqb -8256(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x04]
+          vpcmpneqb -8256(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpnltb %zmm25, %zmm26, %k3
+// CHECK:  encoding: [0x62,0x93,0x2d,0x40,0x3f,0xd9,0x05]
+          vpcmpnltb %zmm25, %zmm26, %k3
+
+// CHECK: vpcmpnltb %zmm25, %zmm26, %k3 {%k7}
+// CHECK:  encoding: [0x62,0x93,0x2d,0x47,0x3f,0xd9,0x05]
+          vpcmpnltb %zmm25, %zmm26, %k3 {%k7}
+
+// CHECK: vpcmpnltb (%rcx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x19,0x05]
+          vpcmpnltb (%rcx), %zmm26, %k3
+
+// CHECK: vpcmpnltb 291(%rax,%r14,8), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xb3,0x2d,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x05]
+          vpcmpnltb 291(%rax,%r14,8), %zmm26, %k3
+
+// CHECK: vpcmpnltb 8128(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x7f,0x05]
+          vpcmpnltb 8128(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpnltb 8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x05]
+          vpcmpnltb 8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpnltb -8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x80,0x05]
+          vpcmpnltb -8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpnltb -8256(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x05]
+          vpcmpnltb -8256(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpnleb %zmm25, %zmm26, %k3
+// CHECK:  encoding: [0x62,0x93,0x2d,0x40,0x3f,0xd9,0x06]
+          vpcmpnleb %zmm25, %zmm26, %k3
+
+// CHECK: vpcmpnleb %zmm25, %zmm26, %k3 {%k7}
+// CHECK:  encoding: [0x62,0x93,0x2d,0x47,0x3f,0xd9,0x06]
+          vpcmpnleb %zmm25, %zmm26, %k3 {%k7}
+
+// CHECK: vpcmpnleb (%rcx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x19,0x06]
+          vpcmpnleb (%rcx), %zmm26, %k3
+
+// CHECK: vpcmpnleb 291(%rax,%r14,8), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xb3,0x2d,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x06]
+          vpcmpnleb 291(%rax,%r14,8), %zmm26, %k3
+
+// CHECK: vpcmpnleb 8128(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x7f,0x06]
+          vpcmpnleb 8128(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpnleb 8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x06]
+          vpcmpnleb 8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpnleb -8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x80,0x06]
+          vpcmpnleb -8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpnleb -8256(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x06]
+          vpcmpnleb -8256(%rdx), %zmm26, %k3
+
 // CHECK: vpcmpw $171, %zmm25, %zmm29, %k3
 // CHECK:  encoding: [0x62,0x93,0x95,0x40,0x3f,0xd9,0xab]
           vpcmpw $171, %zmm25, %zmm29, %k3
@@ -924,6 +1108,166 @@
 // CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x7b]
           vpcmpw $123, -8256(%rdx), %zmm29, %k3
 
+// CHECK: vpcmpltw %zmm25, %zmm29, %k3
+// CHECK:  encoding: [0x62,0x93,0x95,0x40,0x3f,0xd9,0x01]
+          vpcmpltw %zmm25, %zmm29, %k3
+
+// CHECK: vpcmpltw %zmm25, %zmm29, %k3 {%k6}
+// CHECK:  encoding: [0x62,0x93,0x95,0x46,0x3f,0xd9,0x01]
+          vpcmpltw %zmm25, %zmm29, %k3 {%k6}
+
+// CHECK: vpcmpltw (%rcx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x19,0x01]
+          vpcmpltw (%rcx), %zmm29, %k3
+
+// CHECK: vpcmpltw 291(%rax,%r14,8), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xb3,0x95,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x01]
+          vpcmpltw 291(%rax,%r14,8), %zmm29, %k3
+
+// CHECK: vpcmpltw 8128(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x7f,0x01]
+          vpcmpltw 8128(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpltw 8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x01]
+          vpcmpltw 8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpltw -8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x80,0x01]
+          vpcmpltw -8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpltw -8256(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x01]
+          vpcmpltw -8256(%rdx), %zmm29, %k3
+
+// CHECK: vpcmplew %zmm25, %zmm29, %k3
+// CHECK:  encoding: [0x62,0x93,0x95,0x40,0x3f,0xd9,0x02]
+          vpcmplew %zmm25, %zmm29, %k3
+
+// CHECK: vpcmplew %zmm25, %zmm29, %k3 {%k6}
+// CHECK:  encoding: [0x62,0x93,0x95,0x46,0x3f,0xd9,0x02]
+          vpcmplew %zmm25, %zmm29, %k3 {%k6}
+
+// CHECK: vpcmplew (%rcx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x19,0x02]
+          vpcmplew (%rcx), %zmm29, %k3
+
+// CHECK: vpcmplew 291(%rax,%r14,8), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xb3,0x95,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x02]
+          vpcmplew 291(%rax,%r14,8), %zmm29, %k3
+
+// CHECK: vpcmplew 8128(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x7f,0x02]
+          vpcmplew 8128(%rdx), %zmm29, %k3
+
+// CHECK: vpcmplew 8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x02]
+          vpcmplew 8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmplew -8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x80,0x02]
+          vpcmplew -8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmplew -8256(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x02]
+          vpcmplew -8256(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpneqw %zmm25, %zmm29, %k3
+// CHECK:  encoding: [0x62,0x93,0x95,0x40,0x3f,0xd9,0x04]
+          vpcmpneqw %zmm25, %zmm29, %k3
+
+// CHECK: vpcmpneqw %zmm25, %zmm29, %k3 {%k6}
+// CHECK:  encoding: [0x62,0x93,0x95,0x46,0x3f,0xd9,0x04]
+          vpcmpneqw %zmm25, %zmm29, %k3 {%k6}
+
+// CHECK: vpcmpneqw (%rcx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x19,0x04]
+          vpcmpneqw (%rcx), %zmm29, %k3
+
+// CHECK: vpcmpneqw 291(%rax,%r14,8), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xb3,0x95,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x04]
+          vpcmpneqw 291(%rax,%r14,8), %zmm29, %k3
+
+// CHECK: vpcmpneqw 8128(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x7f,0x04]
+          vpcmpneqw 8128(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpneqw 8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x04]
+          vpcmpneqw 8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpneqw -8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x80,0x04]
+          vpcmpneqw -8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpneqw -8256(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x04]
+          vpcmpneqw -8256(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpnltw %zmm25, %zmm29, %k3
+// CHECK:  encoding: [0x62,0x93,0x95,0x40,0x3f,0xd9,0x05]
+          vpcmpnltw %zmm25, %zmm29, %k3
+
+// CHECK: vpcmpnltw %zmm25, %zmm29, %k3 {%k6}
+// CHECK:  encoding: [0x62,0x93,0x95,0x46,0x3f,0xd9,0x05]
+          vpcmpnltw %zmm25, %zmm29, %k3 {%k6}
+
+// CHECK: vpcmpnltw (%rcx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x19,0x05]
+          vpcmpnltw (%rcx), %zmm29, %k3
+
+// CHECK: vpcmpnltw 291(%rax,%r14,8), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xb3,0x95,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x05]
+          vpcmpnltw 291(%rax,%r14,8), %zmm29, %k3
+
+// CHECK: vpcmpnltw 8128(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x7f,0x05]
+          vpcmpnltw 8128(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpnltw 8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x05]
+          vpcmpnltw 8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpnltw -8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x80,0x05]
+          vpcmpnltw -8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpnltw -8256(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x05]
+          vpcmpnltw -8256(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpnlew %zmm25, %zmm29, %k3
+// CHECK:  encoding: [0x62,0x93,0x95,0x40,0x3f,0xd9,0x06]
+          vpcmpnlew %zmm25, %zmm29, %k3
+
+// CHECK: vpcmpnlew %zmm25, %zmm29, %k3 {%k6}
+// CHECK:  encoding: [0x62,0x93,0x95,0x46,0x3f,0xd9,0x06]
+          vpcmpnlew %zmm25, %zmm29, %k3 {%k6}
+
+// CHECK: vpcmpnlew (%rcx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x19,0x06]
+          vpcmpnlew (%rcx), %zmm29, %k3
+
+// CHECK: vpcmpnlew 291(%rax,%r14,8), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xb3,0x95,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x06]
+          vpcmpnlew 291(%rax,%r14,8), %zmm29, %k3
+
+// CHECK: vpcmpnlew 8128(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x7f,0x06]
+          vpcmpnlew 8128(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpnlew 8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x06]
+          vpcmpnlew 8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpnlew -8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x80,0x06]
+          vpcmpnlew -8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpnlew -8256(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x06]
+          vpcmpnlew -8256(%rdx), %zmm29, %k3
+
 // CHECK: vpcmpub $171, %zmm22, %zmm29, %k4
 // CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0xab]
           vpcmpub $171, %zmm22, %zmm29, %k4
@@ -960,6 +1304,222 @@
 // CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x7b]
           vpcmpub $123, -8256(%rdx), %zmm29, %k4
 
+// CHECK: vpcmpequb %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x00]
+          vpcmpequb %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpequb %zmm22, %zmm29, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0x15,0x47,0x3e,0xe6,0x00]
+          vpcmpequb %zmm22, %zmm29, %k4 {%k7}
+
+// CHECK: vpcmpequb %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x00]
+          vpcmpequb %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpequb (%rcx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x21,0x00]
+          vpcmpequb (%rcx), %zmm29, %k4
+
+// CHECK: vpcmpequb 291(%rax,%r14,8), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x00]
+          vpcmpequb 291(%rax,%r14,8), %zmm29, %k4
+
+// CHECK: vpcmpequb 8128(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x7f,0x00]
+          vpcmpequb 8128(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpequb 8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x00]
+          vpcmpequb 8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpequb -8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x80,0x00]
+          vpcmpequb -8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpequb -8256(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x00]
+          vpcmpequb -8256(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpltub %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x01]
+          vpcmpltub %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpltub %zmm22, %zmm29, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0x15,0x47,0x3e,0xe6,0x01]
+          vpcmpltub %zmm22, %zmm29, %k4 {%k7}
+
+// CHECK: vpcmpltub %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x01]
+          vpcmpltub %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpltub (%rcx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x21,0x01]
+          vpcmpltub (%rcx), %zmm29, %k4
+
+// CHECK: vpcmpltub 291(%rax,%r14,8), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x01]
+          vpcmpltub 291(%rax,%r14,8), %zmm29, %k4
+
+// CHECK: vpcmpltub 8128(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x7f,0x01]
+          vpcmpltub 8128(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpltub 8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x01]
+          vpcmpltub 8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpltub -8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x80,0x01]
+          vpcmpltub -8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpltub -8256(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x01]
+          vpcmpltub -8256(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpleub %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x02]
+          vpcmpleub %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpleub %zmm22, %zmm29, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0x15,0x47,0x3e,0xe6,0x02]
+          vpcmpleub %zmm22, %zmm29, %k4 {%k7}
+
+// CHECK: vpcmpleub %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x02]
+          vpcmpleub %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpleub (%rcx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x21,0x02]
+          vpcmpleub (%rcx), %zmm29, %k4
+
+// CHECK: vpcmpleub 291(%rax,%r14,8), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x02]
+          vpcmpleub 291(%rax,%r14,8), %zmm29, %k4
+
+// CHECK: vpcmpleub 8128(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x7f,0x02]
+          vpcmpleub 8128(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpleub 8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x02]
+          vpcmpleub 8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpleub -8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x80,0x02]
+          vpcmpleub -8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpleub -8256(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x02]
+          vpcmpleub -8256(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnequb %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x04]
+          vpcmpnequb %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpnequb %zmm22, %zmm29, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0x15,0x47,0x3e,0xe6,0x04]
+          vpcmpnequb %zmm22, %zmm29, %k4 {%k7}
+
+// CHECK: vpcmpnequb %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x04]
+          vpcmpnequb %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpnequb (%rcx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x21,0x04]
+          vpcmpnequb (%rcx), %zmm29, %k4
+
+// CHECK: vpcmpnequb 291(%rax,%r14,8), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x04]
+          vpcmpnequb 291(%rax,%r14,8), %zmm29, %k4
+
+// CHECK: vpcmpnequb 8128(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x7f,0x04]
+          vpcmpnequb 8128(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnequb 8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x04]
+          vpcmpnequb 8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnequb -8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x80,0x04]
+          vpcmpnequb -8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnequb -8256(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x04]
+          vpcmpnequb -8256(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnltub %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x05]
+          vpcmpnltub %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpnltub %zmm22, %zmm29, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0x15,0x47,0x3e,0xe6,0x05]
+          vpcmpnltub %zmm22, %zmm29, %k4 {%k7}
+
+// CHECK: vpcmpnltub %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x05]
+          vpcmpnltub %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpnltub (%rcx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x21,0x05]
+          vpcmpnltub (%rcx), %zmm29, %k4
+
+// CHECK: vpcmpnltub 291(%rax,%r14,8), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x05]
+          vpcmpnltub 291(%rax,%r14,8), %zmm29, %k4
+
+// CHECK: vpcmpnltub 8128(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x7f,0x05]
+          vpcmpnltub 8128(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnltub 8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x05]
+          vpcmpnltub 8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnltub -8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x80,0x05]
+          vpcmpnltub -8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnltub -8256(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x05]
+          vpcmpnltub -8256(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnleub %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x06]
+          vpcmpnleub %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpnleub %zmm22, %zmm29, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0x15,0x47,0x3e,0xe6,0x06]
+          vpcmpnleub %zmm22, %zmm29, %k4 {%k7}
+
+// CHECK: vpcmpnleub %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x06]
+          vpcmpnleub %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpnleub (%rcx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x21,0x06]
+          vpcmpnleub (%rcx), %zmm29, %k4
+
+// CHECK: vpcmpnleub 291(%rax,%r14,8), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x06]
+          vpcmpnleub 291(%rax,%r14,8), %zmm29, %k4
+
+// CHECK: vpcmpnleub 8128(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x7f,0x06]
+          vpcmpnleub 8128(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnleub 8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x06]
+          vpcmpnleub 8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnleub -8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x80,0x06]
+          vpcmpnleub -8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpnleub -8256(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x06]
+          vpcmpnleub -8256(%rdx), %zmm29, %k4
+
 // CHECK: vpcmpuw $171, %zmm22, %zmm22, %k4
 // CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xe6,0xab]
           vpcmpuw $171, %zmm22, %zmm22, %k4
@@ -995,3 +1555,195 @@
 // CHECK: vpcmpuw $123, -8256(%rdx), %zmm22, %k4
 // CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x7b]
           vpcmpuw $123, -8256(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpequw %zmm22, %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xe6,0x00]
+          vpcmpequw %zmm22, %zmm22, %k4
+
+// CHECK: vpcmpequw %zmm22, %zmm22, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x47,0x3e,0xe6,0x00]
+          vpcmpequw %zmm22, %zmm22, %k4 {%k7}
+
+// CHECK: vpcmpequw (%rcx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x21,0x00]
+          vpcmpequw (%rcx), %zmm22, %k4
+
+// CHECK: vpcmpequw 291(%rax,%r14,8), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x00]
+          vpcmpequw 291(%rax,%r14,8), %zmm22, %k4
+
+// CHECK: vpcmpequw 8128(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x7f,0x00]
+          vpcmpequw 8128(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpequw 8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x00]
+          vpcmpequw 8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpequw -8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x80,0x00]
+          vpcmpequw -8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpequw -8256(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x00]
+          vpcmpequw -8256(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpltuw %zmm22, %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xe6,0x01]
+          vpcmpltuw %zmm22, %zmm22, %k4
+
+// CHECK: vpcmpltuw %zmm22, %zmm22, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x47,0x3e,0xe6,0x01]
+          vpcmpltuw %zmm22, %zmm22, %k4 {%k7}
+
+// CHECK: vpcmpltuw (%rcx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x21,0x01]
+          vpcmpltuw (%rcx), %zmm22, %k4
+
+// CHECK: vpcmpltuw 291(%rax,%r14,8), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x01]
+          vpcmpltuw 291(%rax,%r14,8), %zmm22, %k4
+
+// CHECK: vpcmpltuw 8128(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x7f,0x01]
+          vpcmpltuw 8128(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpltuw 8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x01]
+          vpcmpltuw 8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpltuw -8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x80,0x01]
+          vpcmpltuw -8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpltuw -8256(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x01]
+          vpcmpltuw -8256(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpleuw %zmm22, %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xe6,0x02]
+          vpcmpleuw %zmm22, %zmm22, %k4
+
+// CHECK: vpcmpleuw %zmm22, %zmm22, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x47,0x3e,0xe6,0x02]
+          vpcmpleuw %zmm22, %zmm22, %k4 {%k7}
+
+// CHECK: vpcmpleuw (%rcx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x21,0x02]
+          vpcmpleuw (%rcx), %zmm22, %k4
+
+// CHECK: vpcmpleuw 291(%rax,%r14,8), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x02]
+          vpcmpleuw 291(%rax,%r14,8), %zmm22, %k4
+
+// CHECK: vpcmpleuw 8128(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x7f,0x02]
+          vpcmpleuw 8128(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpleuw 8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x02]
+          vpcmpleuw 8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpleuw -8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x80,0x02]
+          vpcmpleuw -8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpleuw -8256(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x02]
+          vpcmpleuw -8256(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnequw %zmm22, %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xe6,0x04]
+          vpcmpnequw %zmm22, %zmm22, %k4
+
+// CHECK: vpcmpnequw %zmm22, %zmm22, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x47,0x3e,0xe6,0x04]
+          vpcmpnequw %zmm22, %zmm22, %k4 {%k7}
+
+// CHECK: vpcmpnequw (%rcx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x21,0x04]
+          vpcmpnequw (%rcx), %zmm22, %k4
+
+// CHECK: vpcmpnequw 291(%rax,%r14,8), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x04]
+          vpcmpnequw 291(%rax,%r14,8), %zmm22, %k4
+
+// CHECK: vpcmpnequw 8128(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x7f,0x04]
+          vpcmpnequw 8128(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnequw 8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x04]
+          vpcmpnequw 8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnequw -8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x80,0x04]
+          vpcmpnequw -8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnequw -8256(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x04]
+          vpcmpnequw -8256(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnltuw %zmm22, %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xe6,0x05]
+          vpcmpnltuw %zmm22, %zmm22, %k4
+
+// CHECK: vpcmpnltuw %zmm22, %zmm22, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x47,0x3e,0xe6,0x05]
+          vpcmpnltuw %zmm22, %zmm22, %k4 {%k7}
+
+// CHECK: vpcmpnltuw (%rcx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x21,0x05]
+          vpcmpnltuw (%rcx), %zmm22, %k4
+
+// CHECK: vpcmpnltuw 291(%rax,%r14,8), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x05]
+          vpcmpnltuw 291(%rax,%r14,8), %zmm22, %k4
+
+// CHECK: vpcmpnltuw 8128(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x7f,0x05]
+          vpcmpnltuw 8128(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnltuw 8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x05]
+          vpcmpnltuw 8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnltuw -8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x80,0x05]
+          vpcmpnltuw -8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnltuw -8256(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x05]
+          vpcmpnltuw -8256(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnleuw %zmm22, %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xe6,0x06]
+          vpcmpnleuw %zmm22, %zmm22, %k4
+
+// CHECK: vpcmpnleuw %zmm22, %zmm22, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x47,0x3e,0xe6,0x06]
+          vpcmpnleuw %zmm22, %zmm22, %k4 {%k7}
+
+// CHECK: vpcmpnleuw (%rcx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x21,0x06]
+          vpcmpnleuw (%rcx), %zmm22, %k4
+
+// CHECK: vpcmpnleuw 291(%rax,%r14,8), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x06]
+          vpcmpnleuw 291(%rax,%r14,8), %zmm22, %k4
+
+// CHECK: vpcmpnleuw 8128(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x7f,0x06]
+          vpcmpnleuw 8128(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnleuw 8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x06]
+          vpcmpnleuw 8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnleuw -8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x80,0x06]
+          vpcmpnleuw -8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpnleuw -8256(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x06]
+          vpcmpnleuw -8256(%rdx), %zmm22, %k4
diff --git a/test/MC/X86/x86-64-avx512bw_vl.s b/test/MC/X86/x86-64-avx512bw_vl.s
index c3761de..e847550 100644
--- a/test/MC/X86/x86-64-avx512bw_vl.s
+++ b/test/MC/X86/x86-64-avx512bw_vl.s
@@ -144,6 +144,54 @@
 // CHECK:  encoding: [0x62,0xe1,0x55,0x20,0xfd,0xba,0xe0,0xef,0xff,0xff]
           vpaddw -4128(%rdx), %ymm21, %ymm23
 
+// CHECK: vpbroadcastb %eax, %xmm22
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x7a,0xf0]
+          vpbroadcastb %eax, %xmm22
+
+// CHECK: vpbroadcastb %eax, %xmm22 {%k3}
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x0b,0x7a,0xf0]
+          vpbroadcastb %eax, %xmm22 {%k3}
+
+// CHECK: vpbroadcastb %eax, %xmm22 {%k3} {z}
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x8b,0x7a,0xf0]
+          vpbroadcastb %eax, %xmm22 {%k3} {z}
+
+// CHECK: vpbroadcastb %eax, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x28,0x7a,0xc8]
+          vpbroadcastb %eax, %ymm17
+
+// CHECK: vpbroadcastb %eax, %ymm17 {%k1}
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x29,0x7a,0xc8]
+          vpbroadcastb %eax, %ymm17 {%k1}
+
+// CHECK: vpbroadcastb %eax, %ymm17 {%k1} {z}
+// CHECK:  encoding: [0x62,0xe2,0x7d,0xa9,0x7a,0xc8]
+          vpbroadcastb %eax, %ymm17 {%k1} {z}
+
+// CHECK: vpbroadcastw %eax, %xmm29
+// CHECK:  encoding: [0x62,0x62,0x7d,0x08,0x7b,0xe8]
+          vpbroadcastw %eax, %xmm29
+
+// CHECK: vpbroadcastw %eax, %xmm29 {%k1}
+// CHECK:  encoding: [0x62,0x62,0x7d,0x09,0x7b,0xe8]
+          vpbroadcastw %eax, %xmm29 {%k1}
+
+// CHECK: vpbroadcastw %eax, %xmm29 {%k1} {z}
+// CHECK:  encoding: [0x62,0x62,0x7d,0x89,0x7b,0xe8]
+          vpbroadcastw %eax, %xmm29 {%k1} {z}
+
+// CHECK: vpbroadcastw %eax, %ymm28
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x7b,0xe0]
+          vpbroadcastw %eax, %ymm28
+
+// CHECK: vpbroadcastw %eax, %ymm28 {%k4}
+// CHECK:  encoding: [0x62,0x62,0x7d,0x2c,0x7b,0xe0]
+          vpbroadcastw %eax, %ymm28 {%k4}
+
+// CHECK: vpbroadcastw %eax, %ymm28 {%k4} {z}
+// CHECK:  encoding: [0x62,0x62,0x7d,0xac,0x7b,0xe0]
+          vpbroadcastw %eax, %ymm28 {%k4} {z}
+
 // CHECK: vpcmpeqb %xmm21, %xmm21, %k4
 // CHECK:  encoding: [0x62,0xb1,0x55,0x00,0x74,0xe5]
           vpcmpeqb %xmm21, %xmm21, %k4
diff --git a/test/MC/X86/x86-64-avx512f_vl.s b/test/MC/X86/x86-64-avx512f_vl.s
index 973a553..ad121dc 100644
--- a/test/MC/X86/x86-64-avx512f_vl.s
+++ b/test/MC/X86/x86-64-avx512f_vl.s
@@ -2692,6 +2692,78 @@
 // CHECK:  encoding: [0x62,0x61,0xad,0x30,0xdb,0x8a,0xf8,0xfb,0xff,0xff]
           vpandq -1032(%rdx){1to4}, %ymm26, %ymm25
 
+// CHECK: vpbroadcastd %eax, %xmm22
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x7c,0xf0]
+          vpbroadcastd %eax, %xmm22
+
+// CHECK: vpbroadcastd %eax, %xmm22 {%k5}
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x0d,0x7c,0xf0]
+          vpbroadcastd %eax, %xmm22 {%k5}
+
+// CHECK: vpbroadcastd %eax, %xmm22 {%k5} {z}
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x8d,0x7c,0xf0]
+          vpbroadcastd %eax, %xmm22 {%k5} {z}
+
+// CHECK: vpbroadcastd %ebp, %xmm22
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x7c,0xf5]
+          vpbroadcastd %ebp, %xmm22
+
+// CHECK: vpbroadcastd %r13d, %xmm22
+// CHECK:  encoding: [0x62,0xc2,0x7d,0x08,0x7c,0xf5]
+          vpbroadcastd %r13d, %xmm22
+
+// CHECK: vpbroadcastd %eax, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x7c,0xc8]
+          vpbroadcastd %eax, %ymm25
+
+// CHECK: vpbroadcastd %eax, %ymm25 {%k5}
+// CHECK:  encoding: [0x62,0x62,0x7d,0x2d,0x7c,0xc8]
+          vpbroadcastd %eax, %ymm25 {%k5}
+
+// CHECK: vpbroadcastd %eax, %ymm25 {%k5} {z}
+// CHECK:  encoding: [0x62,0x62,0x7d,0xad,0x7c,0xc8]
+          vpbroadcastd %eax, %ymm25 {%k5} {z}
+
+// CHECK: vpbroadcastd %ebp, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x7c,0xcd]
+          vpbroadcastd %ebp, %ymm25
+
+// CHECK: vpbroadcastd %r13d, %ymm25
+// CHECK:  encoding: [0x62,0x42,0x7d,0x28,0x7c,0xcd]
+          vpbroadcastd %r13d, %ymm25
+
+// CHECK: vpbroadcastq %rax, %xmm22
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x08,0x7c,0xf0]
+          vpbroadcastq %rax, %xmm22
+
+// CHECK: vpbroadcastq %rax, %xmm22 {%k2}
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x0a,0x7c,0xf0]
+          vpbroadcastq %rax, %xmm22 {%k2}
+
+// CHECK: vpbroadcastq %rax, %xmm22 {%k2} {z}
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x8a,0x7c,0xf0]
+          vpbroadcastq %rax, %xmm22 {%k2} {z}
+
+// CHECK: vpbroadcastq %r8, %xmm22
+// CHECK:  encoding: [0x62,0xc2,0xfd,0x08,0x7c,0xf0]
+          vpbroadcastq %r8, %xmm22
+
+// CHECK: vpbroadcastq %rax, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x7c,0xd8]
+          vpbroadcastq %rax, %ymm19
+
+// CHECK: vpbroadcastq %rax, %ymm19 {%k5}
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x2d,0x7c,0xd8]
+          vpbroadcastq %rax, %ymm19 {%k5}
+
+// CHECK: vpbroadcastq %rax, %ymm19 {%k5} {z}
+// CHECK:  encoding: [0x62,0xe2,0xfd,0xad,0x7c,0xd8]
+          vpbroadcastq %rax, %ymm19 {%k5} {z}
+
+// CHECK: vpbroadcastq %r8, %ymm19
+// CHECK:  encoding: [0x62,0xc2,0xfd,0x28,0x7c,0xd8]
+          vpbroadcastq %r8, %ymm19
+
 // CHECK: vpcmpd $171, %xmm20, %xmm23, %k4
 // CHECK:  encoding: [0x62,0xb3,0x45,0x00,0x1f,0xe4,0xab]
           vpcmpd $171, %xmm20, %xmm23, %k4
diff --git a/test/MC/X86/x86_64-avx-encoding.s b/test/MC/X86/x86_64-avx-encoding.s
index 1704b94..9da08df 100644
--- a/test/MC/X86/x86_64-avx-encoding.s
+++ b/test/MC/X86/x86_64-avx-encoding.s
@@ -344,1027 +344,1027 @@ vdivpd  -4(%rcx,%rbx,8), %xmm10, %xmm11
 // CHECK: encoding: [0xc5,0x19,0xc6,0x6c,0xcb,0xfc,0x08]
           vshufpd  $8, -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $0, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x00]
           vcmpeqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $2, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpleps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x02]
           vcmpleps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $1, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpltps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x01]
           vcmpltps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $4, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x04]
           vcmpneqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $6, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnleps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x06]
           vcmpnleps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $5, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnltps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x05]
           vcmpnltps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $7, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpordps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x07]
           vcmpordps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $3, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpunordps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x03]
           vcmpunordps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $0, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeqps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x00]
           vcmpeqps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $2, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpleps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x02]
           vcmpleps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $1, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpltps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x01]
           vcmpltps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $4, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneqps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x04]
           vcmpneqps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $6, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnleps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x06]
           vcmpnleps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $5, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnltps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x05]
           vcmpnltps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $7, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpordps  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xc8,0xc2,0x54,0xcb,0xfc,0x07]
           vcmpordps   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpps  $3, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpunordps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x03]
           vcmpunordps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $0, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x00]
           vcmpeqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $2, %xmm11, %xmm12, %xmm13
+// CHECK: vcmplepd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x02]
           vcmplepd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $1, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpltpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x01]
           vcmpltpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $4, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x04]
           vcmpneqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $6, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnlepd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x06]
           vcmpnlepd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $5, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnltpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x05]
           vcmpnltpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $7, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpordpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x07]
           vcmpordpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $3, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpunordpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x03]
           vcmpunordpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $0, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeqpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x00]
           vcmpeqpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $2, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmplepd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x02]
           vcmplepd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $1, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpltpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x01]
           vcmpltpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $4, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneqpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x04]
           vcmpneqpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $6, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnlepd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x06]
           vcmpnlepd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $5, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnltpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x05]
           vcmpnltpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $7, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpordpd  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xc9,0xc2,0x54,0xcb,0xfc,0x07]
           vcmpordpd   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmppd  $3, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpunordpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x03]
           vcmpunordpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $0, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x00]
           vcmpeqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $2, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpless  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x02]
           vcmpless   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $1, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpltss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x01]
           vcmpltss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $4, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x04]
           vcmpneqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $6, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnless  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x06]
           vcmpnless   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $5, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnltss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x05]
           vcmpnltss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $7, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpordss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x07]
           vcmpordss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $3, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpunordss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x03]
           vcmpunordss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $0, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeqss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x00]
           vcmpeqss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $2, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpless  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x02]
           vcmpless   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $1, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpltss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x01]
           vcmpltss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $4, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneqss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x04]
           vcmpneqss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $6, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnless  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x06]
           vcmpnless   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $5, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnltss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x05]
           vcmpnltss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $7, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpordss  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xca,0xc2,0x54,0xcb,0xfc,0x07]
           vcmpordss   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpss  $3, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpunordss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x03]
           vcmpunordss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $0, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x00]
           vcmpeqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $2, %xmm11, %xmm12, %xmm13
+// CHECK: vcmplesd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x02]
           vcmplesd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $1, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpltsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x01]
           vcmpltsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $4, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x04]
           vcmpneqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $6, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnlesd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x06]
           vcmpnlesd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $5, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnltsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x05]
           vcmpnltsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $7, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpordsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x07]
           vcmpordsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $3, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpunordsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x03]
           vcmpunordsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $0, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeqsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x00]
           vcmpeqsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $2, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmplesd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x02]
           vcmplesd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $1, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpltsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x01]
           vcmpltsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $4, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneqsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x04]
           vcmpneqsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $6, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnlesd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x06]
           vcmpnlesd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $5, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnltsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x05]
           vcmpnltsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $7, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpordsd  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xcb,0xc2,0x54,0xcb,0xfc,0x07]
           vcmpordsd   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpsd  $3, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpunordsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x03]
           vcmpunordsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $8, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_uqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x08]
           vcmpeq_uqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $9, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngeps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x09]
           vcmpngeps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $10, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngtps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0a]
           vcmpngtps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $11, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpfalseps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0b]
           vcmpfalseps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $12, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_oqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0c]
           vcmpneq_oqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $13, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgeps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0d]
           vcmpgeps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $14, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgtps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0e]
           vcmpgtps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $15, %xmm11, %xmm12, %xmm13
+// CHECK: vcmptrueps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0f]
           vcmptrueps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $16, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_osps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x10]
           vcmpeq_osps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $17, %xmm11, %xmm12, %xmm13
+// CHECK: vcmplt_oqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x11]
           vcmplt_oqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $18, %xmm11, %xmm12, %xmm13
+// CHECK: vcmple_oqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x12]
           vcmple_oqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $19, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpunord_sps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x13]
           vcmpunord_sps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $20, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_usps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x14]
           vcmpneq_usps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $21, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnlt_uqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x15]
           vcmpnlt_uqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $22, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnle_uqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x16]
           vcmpnle_uqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $23, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpord_sps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x17]
           vcmpord_sps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $24, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_usps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x18]
           vcmpeq_usps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $25, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnge_uqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x19]
           vcmpnge_uqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $26, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngt_uqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1a]
           vcmpngt_uqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $27, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpfalse_osps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1b]
           vcmpfalse_osps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $28, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_osps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1c]
           vcmpneq_osps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $29, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpge_oqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1d]
           vcmpge_oqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $30, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgt_oqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1e]
           vcmpgt_oqps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $31, %xmm11, %xmm12, %xmm13
+// CHECK: vcmptrue_usps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1f]
           vcmptrue_usps   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $8, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_uqps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x08]
           vcmpeq_uqps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $9, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngeps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x09]
           vcmpngeps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $10, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngtps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x0a]
           vcmpngtps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $11, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpfalseps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x0b]
           vcmpfalseps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $12, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_oqps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x0c]
           vcmpneq_oqps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $13, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpgeps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x0d]
           vcmpgeps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $14, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpgtps  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xc8,0xc2,0x54,0xcb,0xfc,0x0e]
           vcmpgtps   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpps  $15, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmptrueps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x0f]
           vcmptrueps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $16, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_osps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x10]
           vcmpeq_osps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $17, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmplt_oqps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x11]
           vcmplt_oqps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $18, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmple_oqps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x12]
           vcmple_oqps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $19, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpunord_sps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x13]
           vcmpunord_sps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $20, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_usps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x14]
           vcmpneq_usps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $21, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnlt_uqps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x15]
           vcmpnlt_uqps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $22, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpnle_uqps  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xc8,0xc2,0x54,0xcb,0xfc,0x16]
           vcmpnle_uqps   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpps  $23, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpord_sps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x17]
           vcmpord_sps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $24, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_usps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x18]
           vcmpeq_usps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $25, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnge_uqps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x19]
           vcmpnge_uqps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $26, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngt_uqps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x1a]
           vcmpngt_uqps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $27, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpfalse_osps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x1b]
           vcmpfalse_osps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $28, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_osps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x1c]
           vcmpneq_osps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $29, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpge_oqps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x1d]
           vcmpge_oqps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpps  $30, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpgt_oqps  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xc8,0xc2,0x54,0xcb,0xfc,0x1e]
           vcmpgt_oqps   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpps  $31, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmptrue_usps  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x18,0xc2,0x6c,0xcb,0xfc,0x1f]
           vcmptrue_usps   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $8, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_uqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x08]
           vcmpeq_uqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $9, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngepd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x09]
           vcmpngepd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $10, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngtpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x0a]
           vcmpngtpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $11, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpfalsepd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x0b]
           vcmpfalsepd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $12, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_oqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x0c]
           vcmpneq_oqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $13, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgepd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x0d]
           vcmpgepd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $14, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgtpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x0e]
           vcmpgtpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $15, %xmm11, %xmm12, %xmm13
+// CHECK: vcmptruepd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x0f]
           vcmptruepd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $16, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_ospd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x10]
           vcmpeq_ospd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $17, %xmm11, %xmm12, %xmm13
+// CHECK: vcmplt_oqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x11]
           vcmplt_oqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $18, %xmm11, %xmm12, %xmm13
+// CHECK: vcmple_oqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x12]
           vcmple_oqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $19, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpunord_spd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x13]
           vcmpunord_spd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $20, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_uspd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x14]
           vcmpneq_uspd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $21, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnlt_uqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x15]
           vcmpnlt_uqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $22, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnle_uqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x16]
           vcmpnle_uqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $23, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpord_spd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x17]
           vcmpord_spd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $24, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_uspd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x18]
           vcmpeq_uspd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $25, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnge_uqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x19]
           vcmpnge_uqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $26, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngt_uqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x1a]
           vcmpngt_uqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $27, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpfalse_ospd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x1b]
           vcmpfalse_ospd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $28, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_ospd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x1c]
           vcmpneq_ospd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $29, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpge_oqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x1d]
           vcmpge_oqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $30, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgt_oqpd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x1e]
           vcmpgt_oqpd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $31, %xmm11, %xmm12, %xmm13
+// CHECK: vcmptrue_uspd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x19,0xc2,0xeb,0x1f]
           vcmptrue_uspd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmppd  $8, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_uqpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x08]
           vcmpeq_uqpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $9, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngepd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x09]
           vcmpngepd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $10, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngtpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x0a]
           vcmpngtpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $11, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpfalsepd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x0b]
           vcmpfalsepd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $12, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_oqpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x0c]
           vcmpneq_oqpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $13, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpgepd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x0d]
           vcmpgepd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $14, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpgtpd  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xc9,0xc2,0x54,0xcb,0xfc,0x0e]
           vcmpgtpd   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmppd  $15, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmptruepd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x0f]
           vcmptruepd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $16, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_ospd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x10]
           vcmpeq_ospd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $17, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmplt_oqpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x11]
           vcmplt_oqpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $18, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmple_oqpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x12]
           vcmple_oqpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $19, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpunord_spd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x13]
           vcmpunord_spd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $20, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_uspd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x14]
           vcmpneq_uspd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $21, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnlt_uqpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x15]
           vcmpnlt_uqpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $22, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpnle_uqpd  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xc9,0xc2,0x54,0xcb,0xfc,0x16]
           vcmpnle_uqpd   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmppd  $23, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpord_spd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x17]
           vcmpord_spd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $24, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_uspd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x18]
           vcmpeq_uspd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $25, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnge_uqpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x19]
           vcmpnge_uqpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $26, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngt_uqpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x1a]
           vcmpngt_uqpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $27, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpfalse_ospd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x1b]
           vcmpfalse_ospd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $28, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_ospd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x1c]
           vcmpneq_ospd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $29, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpge_oqpd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x1d]
           vcmpge_oqpd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmppd  $30, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpgt_oqpd  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xc9,0xc2,0x54,0xcb,0xfc,0x1e]
           vcmpgt_oqpd   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmppd  $31, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmptrue_uspd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x19,0xc2,0x6c,0xcb,0xfc,0x1f]
           vcmptrue_uspd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $8, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_uqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x08]
           vcmpeq_uqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $9, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngess  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x09]
           vcmpngess   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $10, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngtss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x0a]
           vcmpngtss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $11, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpfalsess  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x0b]
           vcmpfalsess   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $12, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_oqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x0c]
           vcmpneq_oqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $13, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgess  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x0d]
           vcmpgess   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $14, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgtss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x0e]
           vcmpgtss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $15, %xmm11, %xmm12, %xmm13
+// CHECK: vcmptruess  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x0f]
           vcmptruess   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $16, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_osss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x10]
           vcmpeq_osss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $17, %xmm11, %xmm12, %xmm13
+// CHECK: vcmplt_oqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x11]
           vcmplt_oqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $18, %xmm11, %xmm12, %xmm13
+// CHECK: vcmple_oqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x12]
           vcmple_oqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $19, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpunord_sss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x13]
           vcmpunord_sss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $20, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_usss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x14]
           vcmpneq_usss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $21, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnlt_uqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x15]
           vcmpnlt_uqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $22, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnle_uqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x16]
           vcmpnle_uqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $23, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpord_sss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x17]
           vcmpord_sss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $24, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_usss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x18]
           vcmpeq_usss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $25, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnge_uqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x19]
           vcmpnge_uqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $26, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngt_uqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x1a]
           vcmpngt_uqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $27, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpfalse_osss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x1b]
           vcmpfalse_osss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $28, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_osss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x1c]
           vcmpneq_osss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $29, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpge_oqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x1d]
           vcmpge_oqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $30, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgt_oqss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x1e]
           vcmpgt_oqss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $31, %xmm11, %xmm12, %xmm13
+// CHECK: vcmptrue_usss  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1a,0xc2,0xeb,0x1f]
           vcmptrue_usss   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpss  $8, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_uqss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x08]
           vcmpeq_uqss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $9, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngess  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x09]
           vcmpngess   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $10, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngtss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x0a]
           vcmpngtss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $11, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpfalsess  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x0b]
           vcmpfalsess   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $12, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_oqss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x0c]
           vcmpneq_oqss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $13, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpgess  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x0d]
           vcmpgess   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $14, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpgtss  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xca,0xc2,0x54,0xcb,0xfc,0x0e]
           vcmpgtss   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpss  $15, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmptruess  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x0f]
           vcmptruess   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $16, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_osss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x10]
           vcmpeq_osss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $17, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmplt_oqss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x11]
           vcmplt_oqss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $18, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmple_oqss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x12]
           vcmple_oqss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $19, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpunord_sss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x13]
           vcmpunord_sss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $20, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_usss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x14]
           vcmpneq_usss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $21, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnlt_uqss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x15]
           vcmpnlt_uqss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $22, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpnle_uqss  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xca,0xc2,0x54,0xcb,0xfc,0x16]
           vcmpnle_uqss   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpss  $23, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpord_sss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x17]
           vcmpord_sss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $24, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_usss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x18]
           vcmpeq_usss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $25, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnge_uqss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x19]
           vcmpnge_uqss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $26, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngt_uqss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x1a]
           vcmpngt_uqss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $27, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpfalse_osss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x1b]
           vcmpfalse_osss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $28, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_osss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x1c]
           vcmpneq_osss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $29, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpge_oqss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x1d]
           vcmpge_oqss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpss  $30, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpgt_oqss  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xca,0xc2,0x54,0xcb,0xfc,0x1e]
           vcmpgt_oqss   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpss  $31, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmptrue_usss  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1a,0xc2,0x6c,0xcb,0xfc,0x1f]
           vcmptrue_usss   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $8, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_uqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x08]
           vcmpeq_uqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $9, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngesd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x09]
           vcmpngesd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $10, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngtsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x0a]
           vcmpngtsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $11, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpfalsesd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x0b]
           vcmpfalsesd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $12, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_oqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x0c]
           vcmpneq_oqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $13, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgesd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x0d]
           vcmpgesd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $14, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgtsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x0e]
           vcmpgtsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $15, %xmm11, %xmm12, %xmm13
+// CHECK: vcmptruesd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x0f]
           vcmptruesd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $16, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_ossd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x10]
           vcmpeq_ossd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $17, %xmm11, %xmm12, %xmm13
+// CHECK: vcmplt_oqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x11]
           vcmplt_oqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $18, %xmm11, %xmm12, %xmm13
+// CHECK: vcmple_oqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x12]
           vcmple_oqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $19, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpunord_ssd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x13]
           vcmpunord_ssd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $20, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_ussd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x14]
           vcmpneq_ussd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $21, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnlt_uqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x15]
           vcmpnlt_uqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $22, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnle_uqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x16]
           vcmpnle_uqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $23, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpord_ssd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x17]
           vcmpord_ssd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $24, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_ussd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x18]
           vcmpeq_ussd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $25, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnge_uqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x19]
           vcmpnge_uqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $26, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngt_uqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x1a]
           vcmpngt_uqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $27, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpfalse_ossd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x1b]
           vcmpfalse_ossd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $28, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_ossd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x1c]
           vcmpneq_ossd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $29, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpge_oqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x1d]
           vcmpge_oqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $30, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgt_oqsd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x1e]
           vcmpgt_oqsd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $31, %xmm11, %xmm12, %xmm13
+// CHECK: vcmptrue_ussd  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x1b,0xc2,0xeb,0x1f]
           vcmptrue_ussd   %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpsd  $8, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_uqsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x08]
           vcmpeq_uqsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $9, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngesd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x09]
           vcmpngesd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $10, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngtsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x0a]
           vcmpngtsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $11, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpfalsesd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x0b]
           vcmpfalsesd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $12, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_oqsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x0c]
           vcmpneq_oqsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $13, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpgesd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x0d]
           vcmpgesd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $14, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpgtsd  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xcb,0xc2,0x54,0xcb,0xfc,0x0e]
           vcmpgtsd   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpsd  $15, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmptruesd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x0f]
           vcmptruesd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $16, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_ossd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x10]
           vcmpeq_ossd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $17, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmplt_oqsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x11]
           vcmplt_oqsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $18, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmple_oqsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x12]
           vcmple_oqsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $19, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpunord_ssd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x13]
           vcmpunord_ssd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $20, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_ussd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x14]
           vcmpneq_ussd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $21, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnlt_uqsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x15]
           vcmpnlt_uqsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $22, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpnle_uqsd  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xcb,0xc2,0x54,0xcb,0xfc,0x16]
           vcmpnle_uqsd   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpsd  $23, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpord_ssd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x17]
           vcmpord_ssd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $24, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpeq_ussd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x18]
           vcmpeq_ussd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $25, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpnge_uqsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x19]
           vcmpnge_uqsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $26, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpngt_uqsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x1a]
           vcmpngt_uqsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $27, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpfalse_ossd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x1b]
           vcmpfalse_ossd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $28, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpneq_ossd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x1c]
           vcmpneq_ossd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $29, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmpge_oqsd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x1d]
           vcmpge_oqsd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
-// CHECK: vcmpsd  $30, -4(%rbx,%rcx,8), %xmm6, %xmm2
+// CHECK: vcmpgt_oqsd  -4(%rbx,%rcx,8), %xmm6, %xmm2
 // CHECK: encoding: [0xc5,0xcb,0xc2,0x54,0xcb,0xfc,0x1e]
           vcmpgt_oqsd   -4(%rbx,%rcx,8), %xmm6, %xmm2
 
-// CHECK: vcmpsd  $31, -4(%rbx,%rcx,8), %xmm12, %xmm13
+// CHECK: vcmptrue_ussd  -4(%rbx,%rcx,8), %xmm12, %xmm13
 // CHECK: encoding: [0xc5,0x1b,0xc2,0x6c,0xcb,0xfc,0x1f]
           vcmptrue_ussd   -4(%rbx,%rcx,8), %xmm12, %xmm13
 
@@ -2936,99 +2936,99 @@ vdivpd  -4(%rcx,%rbx,8), %xmm10, %xmm11
 // CHECK: encoding: [0xc4,0x63,0x79,0xdf,0x10,0x07]
           vaeskeygenassist  $7, (%rax), %xmm10
 
-// CHECK: vcmpps  $8, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_uqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x08]
           vcmpeq_uqps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $9, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngeps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x09]
           vcmpngeps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $10, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngtps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0a]
           vcmpngtps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $11, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpfalseps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0b]
           vcmpfalseps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $12, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_oqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0c]
           vcmpneq_oqps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $13, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgeps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0d]
           vcmpgeps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $14, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgtps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0e]
           vcmpgtps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $15, %xmm11, %xmm12, %xmm13
+// CHECK: vcmptrueps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x0f]
           vcmptrueps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $16, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_osps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x10]
           vcmpeq_osps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $17, %xmm11, %xmm12, %xmm13
+// CHECK: vcmplt_oqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x11]
           vcmplt_oqps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $18, %xmm11, %xmm12, %xmm13
+// CHECK: vcmple_oqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x12]
           vcmple_oqps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $19, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpunord_sps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x13]
           vcmpunord_sps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $20, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_usps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x14]
           vcmpneq_usps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $21, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnlt_uqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x15]
           vcmpnlt_uqps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $22, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnle_uqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x16]
           vcmpnle_uqps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $23, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpord_sps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x17]
           vcmpord_sps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $24, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpeq_usps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x18]
           vcmpeq_usps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $25, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpnge_uqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x19]
           vcmpnge_uqps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $26, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpngt_uqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1a]
           vcmpngt_uqps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $27, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpfalse_osps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1b]
           vcmpfalse_osps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $28, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpneq_osps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1c]
           vcmpneq_osps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $29, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpge_oqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1d]
           vcmpge_oqps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $30, %xmm11, %xmm12, %xmm13
+// CHECK: vcmpgt_oqps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1e]
           vcmpgt_oqps %xmm11, %xmm12, %xmm13
 
-// CHECK: vcmpps  $31, %xmm11, %xmm12, %xmm13
+// CHECK: vcmptrue_usps  %xmm11, %xmm12, %xmm13
 // CHECK: encoding: [0xc4,0x41,0x18,0xc2,0xeb,0x1f]
           vcmptrue_usps %xmm11, %xmm12, %xmm13
 
@@ -3428,227 +3428,227 @@ vdivpd  -4(%rcx,%rbx,8), %xmm10, %xmm11
 // CHECK: encoding: [0xc5,0x7b,0xe6,0x18]
           vcvtpd2dqx  (%rax), %xmm11
 
-// CHECK: vcmpps  $0, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpeqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x00]
           vcmpeqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $2, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpleps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x02]
           vcmpleps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $1, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpltps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x01]
           vcmpltps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $4, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpneqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x04]
           vcmpneqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $6, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpnleps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x06]
           vcmpnleps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $5, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpnltps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x05]
           vcmpnltps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $7, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpordps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x07]
           vcmpordps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $3, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpunordps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x03]
           vcmpunordps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $0, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpeqps  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1c,0xc2,0x6c,0xcb,0xfc,0x00]
           vcmpeqps -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmpps  $2, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpleps  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1c,0xc2,0x6c,0xcb,0xfc,0x02]
           vcmpleps -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmpps  $1, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpltps  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1c,0xc2,0x6c,0xcb,0xfc,0x01]
           vcmpltps -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmpps  $4, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpneqps  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1c,0xc2,0x6c,0xcb,0xfc,0x04]
           vcmpneqps -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmpps  $6, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpnleps  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1c,0xc2,0x6c,0xcb,0xfc,0x06]
           vcmpnleps -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmpps  $5, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpnltps  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1c,0xc2,0x6c,0xcb,0xfc,0x05]
           vcmpnltps -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmpps  $7, -4(%rbx,%rcx,8), %ymm6, %ymm12
+// CHECK: vcmpordps  -4(%rbx,%rcx,8), %ymm6, %ymm12
 // CHECK: encoding: [0xc5,0x4c,0xc2,0x64,0xcb,0xfc,0x07]
           vcmpordps -4(%rbx,%rcx,8), %ymm6, %ymm12
 
-// CHECK: vcmpps  $3, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpunordps  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1c,0xc2,0x6c,0xcb,0xfc,0x03]
           vcmpunordps -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmppd  $0, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpeqpd  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1d,0xc2,0xeb,0x00]
           vcmpeqpd %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmppd  $2, %ymm11, %ymm12, %ymm13
+// CHECK: vcmplepd  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1d,0xc2,0xeb,0x02]
           vcmplepd %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmppd  $1, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpltpd  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1d,0xc2,0xeb,0x01]
           vcmpltpd %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmppd  $4, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpneqpd  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1d,0xc2,0xeb,0x04]
           vcmpneqpd %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmppd  $6, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpnlepd  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1d,0xc2,0xeb,0x06]
           vcmpnlepd %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmppd  $5, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpnltpd  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1d,0xc2,0xeb,0x05]
           vcmpnltpd %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmppd  $7, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpordpd  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1d,0xc2,0xeb,0x07]
           vcmpordpd %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmppd  $3, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpunordpd  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1d,0xc2,0xeb,0x03]
           vcmpunordpd %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmppd  $0, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpeqpd  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1d,0xc2,0x6c,0xcb,0xfc,0x00]
           vcmpeqpd -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmppd  $2, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmplepd  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1d,0xc2,0x6c,0xcb,0xfc,0x02]
           vcmplepd -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmppd  $1, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpltpd  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1d,0xc2,0x6c,0xcb,0xfc,0x01]
           vcmpltpd -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmppd  $4, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpneqpd  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1d,0xc2,0x6c,0xcb,0xfc,0x04]
           vcmpneqpd -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmppd  $6, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpnlepd  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1d,0xc2,0x6c,0xcb,0xfc,0x06]
           vcmpnlepd -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmppd  $5, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpnltpd  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1d,0xc2,0x6c,0xcb,0xfc,0x05]
           vcmpnltpd -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmppd  $7, -4(%rbx,%rcx,8), %ymm6, %ymm12
+// CHECK: vcmpordpd  -4(%rbx,%rcx,8), %ymm6, %ymm12
 // CHECK: encoding: [0xc5,0x4d,0xc2,0x64,0xcb,0xfc,0x07]
           vcmpordpd -4(%rbx,%rcx,8), %ymm6, %ymm12
 
-// CHECK: vcmppd  $3, -4(%rbx,%rcx,8), %ymm12, %ymm13
+// CHECK: vcmpunordpd  -4(%rbx,%rcx,8), %ymm12, %ymm13
 // CHECK: encoding: [0xc5,0x1d,0xc2,0x6c,0xcb,0xfc,0x03]
           vcmpunordpd -4(%rbx,%rcx,8), %ymm12, %ymm13
 
-// CHECK: vcmpps  $8, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpeq_uqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x08]
           vcmpeq_uqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $9, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpngeps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x09]
           vcmpngeps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $10, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpngtps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x0a]
           vcmpngtps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $11, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpfalseps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x0b]
           vcmpfalseps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $12, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpneq_oqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x0c]
           vcmpneq_oqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $13, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpgeps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x0d]
           vcmpgeps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $14, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpgtps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x0e]
           vcmpgtps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $15, %ymm11, %ymm12, %ymm13
+// CHECK: vcmptrueps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x0f]
           vcmptrueps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $16, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpeq_osps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x10]
           vcmpeq_osps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $17, %ymm11, %ymm12, %ymm13
+// CHECK: vcmplt_oqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x11]
           vcmplt_oqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $18, %ymm11, %ymm12, %ymm13
+// CHECK: vcmple_oqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x12]
           vcmple_oqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $19, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpunord_sps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x13]
           vcmpunord_sps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $20, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpneq_usps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x14]
           vcmpneq_usps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $21, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpnlt_uqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x15]
           vcmpnlt_uqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $22, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpnle_uqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x16]
           vcmpnle_uqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $23, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpord_sps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x17]
           vcmpord_sps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $24, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpeq_usps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x18]
           vcmpeq_usps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $25, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpnge_uqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x19]
           vcmpnge_uqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $26, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpngt_uqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x1a]
           vcmpngt_uqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $27, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpfalse_osps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x1b]
           vcmpfalse_osps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $28, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpneq_osps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x1c]
           vcmpneq_osps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $29, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpge_oqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x1d]
           vcmpge_oqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $30, %ymm11, %ymm12, %ymm13
+// CHECK: vcmpgt_oqps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x1e]
           vcmpgt_oqps %ymm11, %ymm12, %ymm13
 
-// CHECK: vcmpps  $31, %ymm11, %ymm12, %ymm13
+// CHECK: vcmptrue_usps  %ymm11, %ymm12, %ymm13
 // CHECK: encoding: [0xc4,0x41,0x1c,0xc2,0xeb,0x1f]
           vcmptrue_usps %ymm11, %ymm12, %ymm13
 
diff --git a/test/MC/X86/x86_64-encoding.s b/test/MC/X86/x86_64-encoding.s
index 40b93f0..62af1bd 100644
--- a/test/MC/X86/x86_64-encoding.s
+++ b/test/MC/X86/x86_64-encoding.s
@@ -200,14 +200,22 @@ sha256msg2 (%rax), %xmm2
 // CHECK: encoding: [0x48,0x8b,0x04,0xe1]
           movq  (%rcx,%riz,8), %rax
 
-// CHECK: fxsaveq (%rax)
+// CHECK: fxsave64 (%rax)
 // CHECK: encoding: [0x48,0x0f,0xae,0x00]
           fxsaveq (%rax)
 
-// CHECK: fxrstorq (%rax)
+// CHECK: fxsave64 (%rax)
+// CHECK: encoding: [0x48,0x0f,0xae,0x00]
+          fxsave64 (%rax)
+
+// CHECK: fxrstor64 (%rax)
 // CHECK: encoding: [0x48,0x0f,0xae,0x08]
           fxrstorq (%rax)
 
+// CHECK: fxrstor64 (%rax)
+// CHECK: encoding: [0x48,0x0f,0xae,0x08]
+          fxrstor64 (%rax)
+
 // CHECK: leave
 // CHECK:  encoding: [0xc9]
         	leave
diff --git a/test/MC/X86/x86_64-xop-encoding.s b/test/MC/X86/x86_64-xop-encoding.s
index 1137b71..48f30ec 100644
--- a/test/MC/X86/x86_64-xop-encoding.s
+++ b/test/MC/X86/x86_64-xop-encoding.s
@@ -506,6 +506,70 @@
 // CHECK: encoding: [0x8f,0xe8,0x60,0xcc,0x50,0x08,0x38]
           vpcomb $56, 8(%rax), %xmm3, %xmm2
 
+// CHECK: vpcomltw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xcd,0xe2,0x00]
+          vpcomltw %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomlew %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xcd,0xe2,0x01]
+          vpcomlew %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomgtw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xcd,0xe2,0x02]
+          vpcomgtw %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomgew %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xcd,0xe2,0x03]
+          vpcomgew %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomeqw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xcd,0xe2,0x04]
+          vpcomeqw %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomneqw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xcd,0xe2,0x05]
+          vpcomneqw %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomfalsew %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xcd,0xe2,0x06]
+          vpcomfalsew %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomtruew %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xcd,0xe2,0x07]
+          vpcomtruew %xmm2, %xmm3, %xmm4
+
+
+// CHECK: vpcomltuw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xed,0xe2,0x00]
+          vpcomltuw %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomleuw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xed,0xe2,0x01]
+          vpcomleuw %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomgtuw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xed,0xe2,0x02]
+          vpcomgtuw %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomgeuw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xed,0xe2,0x03]
+          vpcomgeuw %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomequw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xed,0xe2,0x04]
+          vpcomequw %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomnequw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xed,0xe2,0x05]
+          vpcomnequw %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomfalseuw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xed,0xe2,0x06]
+          vpcomfalseuw %xmm2, %xmm3, %xmm4
+
+// CHECK: vpcomtrueuw %xmm2, %xmm3, %xmm4
+// CHECK: encoding: [0x8f,0xe8,0x60,0xed,0xe2,0x07]
+          vpcomtrueuw %xmm2, %xmm3, %xmm4
 
 // vpperm
 // CHECK: vpperm %xmm1, %xmm2, %xmm3, %xmm4
diff --git a/test/MC/X86/x86_errors.s b/test/MC/X86/x86_errors.s
index 0b3bc7f..fa87ef6 100644
--- a/test/MC/X86/x86_errors.s
+++ b/test/MC/X86/x86_errors.s
@@ -50,3 +50,11 @@ outb al, 4
 // 32: error: invalid segment register
 // 64: error: invalid segment register
 movl %eax:0x00, %ebx
+
+// 32: error: invalid operand for instruction
+// 64: error: invalid operand for instruction
+cmpps $-129, %xmm0, %xmm0
+
+// 32: error: invalid operand for instruction
+// 64: error: invalid operand for instruction
+cmppd $256, %xmm0, %xmm0
diff --git a/test/Makefile b/test/Makefile
index 38aba65..f4ed151 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -85,13 +85,13 @@ endif
 # just removing them here won't work.
 # Solaris does not have the -m flag for ulimit
 ifeq ($(HOST_OS),SunOS)
-ULIMIT=ulimit -t 600 ; ulimit -d 512000 ; ulimit -v 512000 ;
+ULIMIT=ulimit -t 1200 ; ulimit -d 512000 ; ulimit -v 512000 ;
 else # !SunOS
 # Newer versions of python try to allocate an insane amount of address space for
 # its thread-local storage, don't set a limit here.
 # When -v is not used, then -s has to be used to limit the stack size.
 # FIXME: Those limits should be enforced by lit instead of globally.
-ULIMIT=ulimit -t 600 ; ulimit -d 512000 ; ulimit -m 512000 ; ulimit -s 8192 ;
+ULIMIT=ulimit -t 1200 ; ulimit -d 512000 ; ulimit -m 512000 ; ulimit -s 8192 ;
 endif # SunOS
 
 check-local:: lit.site.cfg Unit/lit.site.cfg
@@ -123,11 +123,12 @@ lit.site.cfg: FORCE
 	@$(ECHOPATH) s=@LLVM_SOURCE_DIR@=$(LLVM_SRC_ROOT)=g >> lit.tmp
 	@$(ECHOPATH) s=@LLVM_BINARY_DIR@=$(LLVM_OBJ_ROOT)=g >> lit.tmp
 	@$(ECHOPATH) s=@LLVM_TOOLS_DIR@=$(ToolDir)=g >> lit.tmp
-	@$(ECHOPATH) s=@LIBDIR@=$(LibDir)=g >> lit.tmp
+	@$(ECHOPATH) s=@LLVM_LIBRARY_DIR@=$(LibDir)=g >> lit.tmp
 	@$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> lit.tmp
 	@$(ECHOPATH) s=@SHLIBEXT@=$(SHLIBEXT)=g >> lit.tmp
 	@$(ECHOPATH) s=@EXEEXT@=$(EXEEXT)=g >> lit.tmp
 	@$(ECHOPATH) s=@PYTHON_EXECUTABLE@=$(PYTHON)=g >> lit.tmp
+	@$(ECHOPATH) s=@GOLD_EXECUTABLE@=ld=g >> lit.tmp
 	@$(ECHOPATH) s=@OCAMLFIND@=$(OCAMLFIND)=g >> lit.tmp
 	@$(ECHOPATH) s!@OCAMLFLAGS@!$(addprefix -cclib ,$(LDFLAGS))!g >> lit.tmp
 	@$(ECHOPATH) s=@HAVE_OCAMLOPT@=$(HAVE_OCAMLOPT)=g >> lit.tmp
@@ -143,6 +144,7 @@ lit.site.cfg: FORCE
 	@$(ECHOPATH) s=@HOST_OS@=$(HOST_OS)=g >> lit.tmp
 	@$(ECHOPATH) s=@HOST_ARCH@=$(HOST_ARCH)=g >> lit.tmp
 	@$(ECHOPATH) s=@HAVE_LIBZ@=$(HAVE_LIBZ)=g >> lit.tmp
+	@$(ECHOPATH) s=@HAVE_DIA_SDK@=0=g >> lit.tmp
 	@sed -f lit.tmp $(PROJ_SRC_DIR)/lit.site.cfg.in > $@
 	@-rm -f lit.tmp
 
diff --git a/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml b/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml
index 6147025..c27e888 100644
--- a/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml
+++ b/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml
@@ -10,7 +10,9 @@
 # CHECK-NEXT:      - Offset:          0x0000000000000000
 # CHECK-NEXT:        Symbol:          main
 # CHECK-NEXT:        Type:            R_AARCH64_ABS64
-# CHECK-NEXT:        Addend:          0
+# CHECK-NEXT:      - Offset:          0x0000000000000008
+# CHECK-NEXT:        Symbol:          main
+# CHECK-NEXT:        Type:            R_AARCH64_TLSGD_ADR_PREL21
 
 FileHeader:
   Class:           ELFCLASS64
@@ -22,7 +24,7 @@ Sections:
     Name:            .text
     Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
     AddressAlign:    0x04
-    Content:         0000000000000000
+    Content:         00000000000000000000000000000000
   - Type:            SHT_RELA
     Name:            .rela.text
     Link:            .symtab
@@ -33,6 +35,10 @@ Sections:
         Symbol:          main
         Type:            R_AARCH64_ABS64
         Addend:          0
+      - Offset:          8
+        Symbol:          main
+        Type:            R_AARCH64_TLSGD_ADR_PREL21
+        Addend:          0
 
 Symbols:
   Local:
diff --git a/test/Object/Inputs/archive-test.a-irix6-mips64el b/test/Object/Inputs/archive-test.a-irix6-mips64el
new file mode 100644
index 0000000..ccc2634
--- /dev/null
+++ b/test/Object/Inputs/archive-test.a-irix6-mips64el
diff --git a/test/Object/Inputs/macho-invalid-bad-symbol-index b/test/Object/Inputs/macho-invalid-bad-symbol-index
new file mode 100644
index 0000000..294bbde
--- /dev/null
+++ b/test/Object/Inputs/macho-invalid-bad-symbol-index
diff --git a/test/Object/Inputs/macho-invalid-getsection-index b/test/Object/Inputs/macho-invalid-getsection-index
new file mode 100644
index 0000000..b7e4b95
--- /dev/null
+++ b/test/Object/Inputs/macho-invalid-getsection-index
diff --git a/test/Object/Inputs/macho-invalid-no-size-for-sections b/test/Object/Inputs/macho-invalid-no-size-for-sections
new file mode 100644
index 0000000..89fa95a
--- /dev/null
+++ b/test/Object/Inputs/macho-invalid-no-size-for-sections
diff --git a/test/Object/Inputs/macho-invalid-section-index-getSectionRawFinalSegmentName b/test/Object/Inputs/macho-invalid-section-index-getSectionRawFinalSegmentName
new file mode 100644
index 0000000..e3f6586
--- /dev/null
+++ b/test/Object/Inputs/macho-invalid-section-index-getSectionRawFinalSegmentName
diff --git a/test/Object/Inputs/macho-invalid-section-index-getSectionRawName b/test/Object/Inputs/macho-invalid-section-index-getSectionRawName
new file mode 100644
index 0000000..9cd3e1c
--- /dev/null
+++ b/test/Object/Inputs/macho-invalid-section-index-getSectionRawName
diff --git a/test/Object/Inputs/macho-invalid-symbol-name-past-eof b/test/Object/Inputs/macho-invalid-symbol-name-past-eof
new file mode 100644
index 0000000..8747884
--- /dev/null
+++ b/test/Object/Inputs/macho-invalid-symbol-name-past-eof
diff --git a/test/Object/Inputs/macho-invalid-too-small-load-command b/test/Object/Inputs/macho-invalid-too-small-load-command
new file mode 100644
index 0000000..3602169
--- /dev/null
+++ b/test/Object/Inputs/macho-invalid-too-small-load-command
diff --git a/test/Object/Inputs/macho-invalid-too-small-segment-load-command b/test/Object/Inputs/macho-invalid-too-small-segment-load-command
new file mode 100644
index 0000000..8cbfbf9
--- /dev/null
+++ b/test/Object/Inputs/macho-invalid-too-small-segment-load-command
diff --git a/test/Object/Inputs/macho-invalid-zero-ncmds b/test/Object/Inputs/macho-invalid-zero-ncmds
new file mode 100644
index 0000000..0505419
--- /dev/null
+++ b/test/Object/Inputs/macho-invalid-zero-ncmds
diff --git a/test/Object/Inputs/macho-no-exports.dylib b/test/Object/Inputs/macho-no-exports.dylib
new file mode 100755
index 0000000..6e1be6c
--- /dev/null
+++ b/test/Object/Inputs/macho-no-exports.dylib
diff --git a/test/Object/Inputs/macho-rpath-x86_64 b/test/Object/Inputs/macho-rpath-x86_64
new file mode 100755
index 0000000..c518fcd
--- /dev/null
+++ b/test/Object/Inputs/macho-rpath-x86_64
diff --git a/test/Object/Inputs/macho64-invalid-getsection-index b/test/Object/Inputs/macho64-invalid-getsection-index
new file mode 100644
index 0000000..a2a7bc1
--- /dev/null
+++ b/test/Object/Inputs/macho64-invalid-getsection-index
diff --git a/test/Object/Inputs/macho64-invalid-incomplete-load-command b/test/Object/Inputs/macho64-invalid-incomplete-load-command
new file mode 100644
index 0000000..a569c9e
--- /dev/null
+++ b/test/Object/Inputs/macho64-invalid-incomplete-load-command
diff --git a/test/Object/Inputs/macho64-invalid-no-size-for-sections b/test/Object/Inputs/macho64-invalid-no-size-for-sections
new file mode 100644
index 0000000..5aae5ff
--- /dev/null
+++ b/test/Object/Inputs/macho64-invalid-no-size-for-sections
diff --git a/test/Object/Inputs/macho64-invalid-too-small-load-command b/test/Object/Inputs/macho64-invalid-too-small-load-command
new file mode 100644
index 0000000..0028451
--- /dev/null
+++ b/test/Object/Inputs/macho64-invalid-too-small-load-command
diff --git a/test/Object/Inputs/macho64-invalid-too-small-segment-load-command b/test/Object/Inputs/macho64-invalid-too-small-segment-load-command
new file mode 100644
index 0000000..ce6a201
--- /dev/null
+++ b/test/Object/Inputs/macho64-invalid-too-small-segment-load-command
diff --git a/test/Object/Inputs/micro-mips.elf-mipsel b/test/Object/Inputs/micro-mips.elf-mipsel
new file mode 100755
index 0000000..80b8472
--- /dev/null
+++ b/test/Object/Inputs/micro-mips.elf-mipsel
diff --git a/test/Object/Inputs/sectionGroup.elf.x86-64 b/test/Object/Inputs/sectionGroup.elf.x86-64
new file mode 100644
index 0000000..b88ad0c
--- /dev/null
+++ b/test/Object/Inputs/sectionGroup.elf.x86-64
diff --git a/test/Object/Inputs/thin.a b/test/Object/Inputs/thin.a
new file mode 100644
index 0000000..bb9d532
--- /dev/null
+++ b/test/Object/Inputs/thin.a
diff --git a/test/Object/Mips/elf-mips64-rel.yaml b/test/Object/Mips/elf-mips64-rel.yaml
new file mode 100644
index 0000000..8b59509
--- /dev/null
+++ b/test/Object/Mips/elf-mips64-rel.yaml
@@ -0,0 +1,113 @@
+# RUN: yaml2obj -format=elf %s > %t
+# RUN: llvm-readobj -r %t | FileCheck -check-prefix=OBJ %s
+# RUN: obj2yaml %t | FileCheck -check-prefix=YAML %s
+
+# OBJ:      Relocations [
+# OBJ-NEXT:   Section (2) .rela.text {
+# OBJ-NEXT:     0x14 R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 main 0x4
+# OBJ-NEXT:     0x1C R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 main 0x8
+# OBJ-NEXT:     0x20 R_MIPS_GOT_PAGE/R_MIPS_NONE/R_MIPS_NONE .rodata 0x0
+# OBJ-NEXT:     0x24 R_MIPS_GOT_OFST/R_MIPS_NONE/R_MIPS_NONE .rodata 0x0
+# OBJ-NEXT:     0x28 R_MIPS_CALL16/R_MIPS_NONE/R_MIPS_NONE printf 0x0
+# OBJ-NEXT:     0x30 R_MIPS_GPREL16/R_MIPS_LO16/R_MIPS_NONE printf 0x0
+# OBJ-NEXT:   }
+# OBJ-NEXT: ]
+
+# YAML:      Relocations:
+# YAML-NEXT:   - Offset:      0x0000000000000014
+# YAML-NEXT:     Symbol:      main
+# YAML-NEXT:     Type:        R_MIPS_GPREL16
+# YAML-NEXT:     Type2:       R_MIPS_SUB
+# YAML-NEXT:     Type3:       R_MIPS_HI16
+# YAML-NEXT:     Addend:      4
+# YAML-NEXT:   - Offset:      0x000000000000001C
+# YAML-NEXT:     Symbol:      main
+# YAML-NEXT:     Type:        R_MIPS_GPREL16
+# YAML-NEXT:     Type2:       R_MIPS_SUB
+# YAML-NEXT:     Type3:       R_MIPS_LO16
+# YAML-NEXT:     Addend:      8
+# YAML-NEXT:   - Offset:      0x0000000000000020
+# YAML-NEXT:     Symbol:      .rodata
+# YAML-NEXT:     Type:        R_MIPS_GOT_PAGE
+# YAML-NEXT:   - Offset:      0x0000000000000024
+# YAML-NEXT:     Symbol:      .rodata
+# YAML-NEXT:     Type:        R_MIPS_GOT_OFST
+# YAML-NEXT:   - Offset:      0x0000000000000028
+# YAML-NEXT:     Symbol:      printf
+# YAML-NEXT:     Type:        R_MIPS_CALL16
+# YAML-NEXT:   - Offset:      0x0000000000000030
+# YAML-NEXT:     Symbol:      printf
+# YAML-NEXT:     Type:        R_MIPS_GPREL16
+# YAML-NEXT:     Type2:       R_MIPS_LO16
+# YAML-NEXT:     SpecSym:     RSS_GP0
+
+---
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_MIPS
+  Flags:           [ EF_MIPS_PIC, EF_MIPS_CPIC,
+                     EF_MIPS_NOREORDER, EF_MIPS_ARCH_64R2 ]
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x10
+    Size:            0x60
+  - Name:            .rela.text
+    Type:            SHT_RELA
+    Flags:           [ SHF_INFO_LINK ]
+    Info:            .text
+    Relocations:
+      - Offset:      0x14
+        Symbol:      main
+        Type:        R_MIPS_GPREL16
+        Type2:       R_MIPS_SUB
+        Type3:       R_MIPS_HI16
+        Addend:      4
+      - Offset:      0x1C
+        Symbol:      main
+        Type:        R_MIPS_GPREL16
+        Type2:       R_MIPS_SUB
+        Type3:       R_MIPS_LO16
+        Addend:      8
+      - Offset:      0x20
+        Symbol:      .rodata
+        Type:        R_MIPS_GOT_PAGE
+        Addend:      0
+      - Offset:      0x24
+        Symbol:      .rodata
+        Type:        R_MIPS_GOT_OFST
+        Addend:      0
+      - Offset:      0x28
+        Symbol:      printf
+        Type:        R_MIPS_CALL16
+        Addend:      0
+      - Offset:      0x30
+        Symbol:      printf
+        Type:        R_MIPS_GPREL16
+        Type2:       R_MIPS_LO16
+        SpecSym:     RSS_GP0
+        Addend:      0
+  - Name:            .rodata
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x10
+    Size:            0x0F
+
+Symbols:
+  Local:
+    - Name:            .text
+      Type:            STT_SECTION
+      Section:         .text
+    - Name:            .rodata
+      Type:            STT_SECTION
+      Section:         .rodata
+  Global:
+    - Name:            main
+      Type:            STT_FUNC
+      Section:         .text
+      Size:            0x58
+    - Name:            printf
+...
diff --git a/test/Object/Mips/objdump-micro-mips.test b/test/Object/Mips/objdump-micro-mips.test
new file mode 100644
index 0000000..0f28dc1
--- /dev/null
+++ b/test/Object/Mips/objdump-micro-mips.test
@@ -0,0 +1,12 @@
+RUN: llvm-objdump -d -mattr=micromips %p/../Inputs/micro-mips.elf-mipsel \
+RUN:   | FileCheck %s
+
+CHECK:      foo:
+CHECK-NEXT:      330:   bd 33 f8 ff   addiu   $sp, $sp, -8
+CHECK-NEXT:      334:   dd fb 04 00   sw      $fp, 4($sp)
+CHECK-NEXT:      338:   1d 00 50 f1   addu    $fp, $sp, $zero
+
+CHECK:      bar:
+CHECK-NEXT:      350:   a2 41 02 00   lui     $2, 2
+CHECK-NEXT:      354:   42 30 8f 80   addiu   $2, $2, -32625
+CHECK-NEXT:      358:   bd 33 e8 ff   addiu   $sp, $sp, -24
diff --git a/test/Object/archive-toc.test b/test/Object/archive-toc.test
index 4195c40..79a6e0e 100644
--- a/test/Object/archive-toc.test
+++ b/test/Object/archive-toc.test
@@ -26,3 +26,11 @@ CHECK:      rw-r--r-- 1002/102      8 2004-11-19 03:24:02.000000000 evenlen
 CHECK-NEXT: rw-r--r-- 1002/102      7 2004-11-19 03:24:02.000000000 oddlen
 CHECK-NEXT: rwxr-xr-x 1002/102   1465 2004-11-19 03:24:02.000000000 very_long_bytecode_file_name.bc
 CHECK-NEXT: rw-r--r-- 1002/102   2280 2004-11-19 03:24:02.000000000 IsNAN.o
+
+Test reading a thin archive created by gnu ar
+RUN: env TZ=GMT llvm-ar tv %p/Inputs/thin.a | FileCheck %s --check-prefix=THIN -strict-whitespace
+
+THIN:      rw-r--r-- 1000/1000      8 2014-12-16 00:56:27.000000000 evenlen
+THIN-NEXT: rw-r--r-- 1000/1000      7 2014-12-16 00:56:27.000000000 oddlen
+THIN-NEXT: rwxr-xr-x 1000/1000   1465 2014-12-16 00:56:27.000000000 very_long_bytecode_file_name.bc
+THIN-NEXT: rw-r--r-- 1000/1000   2280 2014-12-16 00:56:27.000000000 IsNAN.o
diff --git a/test/Object/elf-unknown-type.test b/test/Object/elf-unknown-type.test
new file mode 100644
index 0000000..9993c09
--- /dev/null
+++ b/test/Object/elf-unknown-type.test
@@ -0,0 +1,10 @@
+# RUN: yaml2obj -format=elf %s | llvm-readobj -file-headers - | FileCheck %s
+
+!ELF
+FileHeader: !FileHeader
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: 42
+  Machine: EM_X86_64
+
+# CHECK: Type: 0x2A
diff --git a/test/Object/macho-invalid.test b/test/Object/macho-invalid.test
new file mode 100644
index 0000000..ac4bbeb
--- /dev/null
+++ b/test/Object/macho-invalid.test
@@ -0,0 +1,51 @@
+// No crash, might not be totally invalid
+RUN: llvm-objdump -private-headers %p/Inputs/macho-invalid-zero-ncmds
+
+RUN: not llvm-objdump -private-headers %p/Inputs/macho64-invalid-incomplete-load-command 2>&1 \
+RUN:      | FileCheck -check-prefix INCOMPLETE-LOADC %s
+
+RUN: not llvm-objdump -private-headers %p/Inputs/macho-invalid-too-small-load-command 2>&1 \
+RUN:      | FileCheck -check-prefix SMALL-LOADC-SIZE %s
+RUN: not llvm-objdump -private-headers %p/Inputs/macho64-invalid-too-small-load-command 2>&1 \
+RUN:      | FileCheck -check-prefix SMALL-LOADC-SIZE %s
+
+RUN: not llvm-objdump -private-headers %p/Inputs/macho-invalid-too-small-segment-load-command 2>&1 \
+RUN:      | FileCheck -check-prefix SMALL-SEGLOADC-SIZE %s
+RUN: not llvm-objdump -private-headers %p/Inputs/macho64-invalid-too-small-segment-load-command 2>&1 \
+RUN:      | FileCheck -check-prefix SMALL-SEGLOADC-SIZE %s
+
+RUN: not llvm-objdump -private-headers %p/Inputs/macho-invalid-no-size-for-sections 2>&1 \
+RUN:      | FileCheck -check-prefix TOO-MANY-SECTS %s
+RUN: not llvm-objdump -private-headers %p/Inputs/macho64-invalid-no-size-for-sections 2>&1 \
+RUN:      | FileCheck -check-prefix TOO-MANY-SECTS %s
+
+RUN: not llvm-objdump -t %p/Inputs/macho-invalid-bad-symbol-index 2>&1 \
+RUN:      | FileCheck -check-prefix BAD-SYMBOL %s
+
+RUN: not llvm-objdump -t %p/Inputs/macho-invalid-symbol-name-past-eof 2>&1 \
+RUN:      | FileCheck -check-prefix NAME-PAST-EOF %s
+
+RUN: not llvm-objdump -t %p/Inputs/macho-invalid-section-index-getSectionRawFinalSegmentName 2>&1 \
+RUN:      | FileCheck -check-prefix INVALID-SECTION-IDX-SEG-NAME %s
+
+RUN: not llvm-nm %p/Inputs/macho-invalid-section-index-getSectionRawName 2>&1 \
+RUN:      | FileCheck -check-prefix INVALID-SECTION-IDX-SECT-NAME %s
+
+RUN: not llvm-objdump -t %p/Inputs/macho-invalid-getsection-index 2>&1 \
+RUN:      | FileCheck -check-prefix INVALID-SECTION-IDX-GETSECT %s
+
+RUN: not llvm-objdump -t %p/Inputs/macho64-invalid-getsection-index 2>&1 \
+RUN:      | FileCheck -check-prefix INVALID-SECTION-IDX-GETSECT64 %s
+
+
+SMALL-LOADC-SIZE: Load command with size < 8 bytes
+SMALL-SEGLOADC-SIZE: Segment load command size is too small
+INCOMPLETE-LOADC: Malformed MachO file
+TOO-MANY-SECTS: Number of sections too large for size of load command
+BAD-SYMBOL: Requested symbol index is out of range
+NAME-PAST-EOF: Symbol name entry points before beginning or past end of file
+
+INVALID-SECTION-IDX-SEG-NAME: getSectionRawFinalSegmentName: Invalid section index
+INVALID-SECTION-IDX-SECT-NAME: getSectionRawName: Invalid section index
+INVALID-SECTION-IDX-GETSECT: getSection: Invalid section index
+INVALID-SECTION-IDX-GETSECT64: getSection64: Invalid section index
diff --git a/test/Object/nm-irix6.test b/test/Object/nm-irix6.test
new file mode 100644
index 0000000..047665c
--- /dev/null
+++ b/test/Object/nm-irix6.test
@@ -0,0 +1,27 @@
+# Check reading IRIX 6.0 64-bit archive file.
+RUN: llvm-nm %p/Inputs/archive-test.a-irix6-mips64el | FileCheck %s
+
+CHECK:      f1.o:
+CHECK-NEXT: 00000028 T f1
+CHECK-NEXT: 00000000 d s_d
+CHECK-NEXT: 00000000 t s_foo
+
+CHECK:      f2.o:
+CHECK-NEXT: 00000028 T f2
+CHECK-NEXT: 00000000 d s_d
+CHECK-NEXT: 00000000 t s_foo
+
+CHECK:      f3.o:
+CHECK-NEXT: 00000028 T f3
+CHECK-NEXT: 00000000 d s_d
+CHECK-NEXT: 00000000 t s_foo
+
+CHECK:      f4.o:
+CHECK-NEXT: 00000028 T f4
+CHECK-NEXT: 00000000 d s_d
+CHECK-NEXT: 00000000 t s_foo
+
+CHECK:      f5.o:
+CHECK-NEXT: 00000028 T f5
+CHECK-NEXT: 00000000 d s_d
+CHECK-NEXT: 00000000 t s_foo
diff --git a/test/Object/obj2yaml-sectiongroup.test b/test/Object/obj2yaml-sectiongroup.test
new file mode 100644
index 0000000..66e8e38
--- /dev/null
+++ b/test/Object/obj2yaml-sectiongroup.test
@@ -0,0 +1,26 @@
+# Checks that the tool is able to read section groups with ELF.
+RUN: obj2yaml %p/Inputs/sectionGroup.elf.x86-64 > %t1.sectiongroup.yaml
+RUN: FileCheck %s --check-prefix ELF-GROUP < %t1.sectiongroup.yaml
+RUN: yaml2obj -format=elf %t1.sectiongroup.yaml -o %t2.o.elf
+RUN: llvm-readobj -sections %t2.o.elf | FileCheck %s -check-prefix=SECTIONS
+#ELF-GROUP:  - Name:            .group
+#ELF-GROUP:    Type:            SHT_GROUP
+#ELF-GROUP:    Link:            .symtab
+#ELF-GROUP:    Info:            a
+#ELF-GROUP:    Members:
+#ELF-GROUP:      - SectionOrType:   GRP_COMDAT
+#ELF-GROUP:      - SectionOrType:   .rodata.a
+#SECTIONS: Format: ELF64-x86-64
+#SECTIONS: Arch: x86_64
+#SECTIONS: AddressSize: 64bit
+#SECTIONS:   Section {
+#SECTIONS:     Index: 1
+#SECTIONS:     Name: .group (21)
+#SECTIONS:     Type: SHT_GROUP (0x11)
+#SECTIONS:     Flags [ (0x0)
+#SECTIONS:     ]
+#SECTIONS:     Address: 0x0
+#SECTIONS:     Size: 8
+#SECTIONS:     AddressAlignment: 4
+#SECTIONS:     EntrySize: 4
+#SECTIONS:   }
diff --git a/test/Object/obj2yaml.test b/test/Object/obj2yaml.test
index 1c79e98..e654dcd 100644
--- a/test/Object/obj2yaml.test
+++ b/test/Object/obj2yaml.test
@@ -210,27 +210,21 @@ ELF-MIPSEL-NEXT:     Relocations:
 ELF-MIPSEL-NEXT:       - Offset:          0x0000000000000000
 ELF-MIPSEL-NEXT:         Symbol:          _gp_disp
 ELF-MIPSEL-NEXT:         Type:            R_MIPS_HI16
-ELF-MIPSEL-NEXT:         Addend:          0
 ELF-MIPSEL-NEXT:       - Offset:          0x0000000000000004
 ELF-MIPSEL-NEXT:         Symbol:          _gp_disp
 ELF-MIPSEL-NEXT:         Type:            R_MIPS_LO16
-ELF-MIPSEL-NEXT:         Addend:          0
 ELF-MIPSEL-NEXT:       - Offset:          0x0000000000000018
 ELF-MIPSEL-NEXT:         Symbol:          '$.str'
 ELF-MIPSEL-NEXT:         Type:            R_MIPS_GOT16
-ELF-MIPSEL-NEXT:         Addend:          0
 ELF-MIPSEL-NEXT:       - Offset:          0x000000000000001C
 ELF-MIPSEL-NEXT:         Symbol:          '$.str'
 ELF-MIPSEL-NEXT:         Type:            R_MIPS_LO16
-ELF-MIPSEL-NEXT:         Addend:          0
 ELF-MIPSEL-NEXT:       - Offset:          0x0000000000000020
 ELF-MIPSEL-NEXT:         Symbol:          puts
 ELF-MIPSEL-NEXT:         Type:            R_MIPS_CALL16
-ELF-MIPSEL-NEXT:         Addend:          0
 ELF-MIPSEL-NEXT:       - Offset:          0x000000000000002C
 ELF-MIPSEL-NEXT:         Symbol:          SomeOtherFunction
 ELF-MIPSEL-NEXT:         Type:            R_MIPS_CALL16
-ELF-MIPSEL-NEXT:         Addend:          0
 ELF-MIPSEL-NEXT:   - Name:            .data
 ELF-MIPSEL-NEXT:     Type:            SHT_PROGBITS
 ELF-MIPSEL-NEXT:     Flags:           [ SHF_WRITE, SHF_ALLOC ]
@@ -328,7 +322,6 @@ ELF-MIPS64EL-NEXT:     Relocations:
 ELF-MIPS64EL-NEXT:       - Offset:          0
 ELF-MIPS64EL-NEXT:         Symbol:          zed
 ELF-MIPS64EL-NEXT:         Type:            R_MIPS_64
-ELF-MIPS64EL-NEXT:         Addend:          0
 ELF-MIPS64EL-NEXT:   - Name:            .bss
 ELF-MIPS64EL-NEXT:     Type:            SHT_NOBITS
 ELF-MIPS64EL-NEXT:     Flags:           [ SHF_WRITE, SHF_ALLOC ]
@@ -398,7 +391,6 @@ ELF-X86-64-NEXT:     Relocations:
 ELF-X86-64-NEXT:       - Offset:          0x000000000000000D
 ELF-X86-64-NEXT:         Symbol:          .rodata.str1.1
 ELF-X86-64-NEXT:         Type:            R_X86_64_32S
-ELF-X86-64-NEXT:         Addend:          0
 ELF-X86-64-NEXT:       - Offset:          0x0000000000000012
 ELF-X86-64-NEXT:         Symbol:          puts
 ELF-X86-64-NEXT:         Type:            R_X86_64_PC32
diff --git a/test/Object/objdump-export-list.test b/test/Object/objdump-export-list.test
new file mode 100644
index 0000000..74344c1
--- /dev/null
+++ b/test/Object/objdump-export-list.test
@@ -0,0 +1,4 @@
+RUN: llvm-objdump -exports-trie %p/Inputs/macho-no-exports.dylib | FileCheck %s
+
+; Test that we don't crash with an empty export list.
+CHECK: macho-no-exports.dylib: file format Mach-O 64-bit x86-64
diff --git a/test/Object/objdump-private-headers.test b/test/Object/objdump-private-headers.test
index c562044..d311bec 100644
--- a/test/Object/objdump-private-headers.test
+++ b/test/Object/objdump-private-headers.test
@@ -2,6 +2,8 @@ RUN: llvm-objdump -p %p/Inputs/program-headers.elf-i386 \
 RUN:              | FileCheck %s -check-prefix ELF-i386
 RUN: llvm-objdump -p %p/Inputs/program-headers.elf-x86-64 \
 RUN:              | FileCheck %s -check-prefix ELF-x86-64
+RUN: llvm-objdump -p %p/Inputs/macho-rpath-x86_64 \
+RUN:              | FileCheck %s -check-prefix MACHO-x86_64
 
 ELF-i386: Program Header:
 ELF-i386:     LOAD off    0x00000000 vaddr 0x08048000 paddr 0x08048000 align 2**12
@@ -16,3 +18,8 @@ ELF-x86-64: EH_FRAME off    0x00000000000000f4 vaddr 0x00000000004000f4 paddr 0x
 ELF-x86-64:          filesz 0x0000000000000014 memsz 0x0000000000000014 flags r--
 ELF-x86-64:    STACK off    0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**3
 ELF-x86-64:          filesz 0x0000000000000000 memsz 0x0000000000000000 flags rw-
+
+MACHO-x86_64: Load command 12
+MACHO-x86_64:          cmd LC_RPATH
+MACHO-x86_64:      cmdsize 32
+MACHO-x86_64:         path @executable_path/. (offset 12)
diff --git a/test/Other/2009-03-31-CallGraph.ll b/test/Other/2009-03-31-CallGraph.ll
index 864903c..1e17830 100644
--- a/test/Other/2009-03-31-CallGraph.ll
+++ b/test/Other/2009-03-31-CallGraph.ll
@@ -7,6 +7,8 @@ ok1:
     ret void
 
 lpad1:
+    landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0
+            cleanup
     invoke void @f4()
         to label %ok2 unwind label %lpad2
 
diff --git a/test/Other/Inputs/utf8-bom-response b/test/Other/Inputs/utf8-bom-response
new file mode 100644
index 0000000..9dae315
--- /dev/null
+++ b/test/Other/Inputs/utf8-bom-response
@@ -0,0 +1 @@
+-help
diff --git a/test/Other/Inputs/utf8-response b/test/Other/Inputs/utf8-response
new file mode 100644
index 0000000..97f455a
--- /dev/null
+++ b/test/Other/Inputs/utf8-response
@@ -0,0 +1 @@
+-help
diff --git a/test/Other/ResponseFile.ll b/test/Other/ResponseFile.ll
index 914e548..92648b8 100644
--- a/test/Other/ResponseFile.ll
+++ b/test/Other/ResponseFile.ll
@@ -6,6 +6,11 @@
 ; RUN: llvm-as @%t.list2 -o %t.bc
 ; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s
 
+; When the response file begins with UTF8 BOM sequence, we shall remove them.
+; Neither command below should return a "Could not open input file" error.
+; RUN: llvm-as @%S/Inputs/utf8-response > /dev/null
+; RUN: llvm-as @%S/Inputs/utf8-bom-response > /dev/null
+
 ; CHECK: T foobar
 
 define void @foobar() {
diff --git a/test/Other/new-pass-manager.ll b/test/Other/new-pass-manager.ll
index cec01b5..fb84e78 100644
--- a/test/Other/new-pass-manager.ll
+++ b/test/Other/new-pass-manager.ll
@@ -5,27 +5,85 @@
 ; files, but for now this is just going to step the new process through its
 ; paces.
 
+; RUN: opt -disable-output -disable-verify -debug-pass-manager \
+; RUN:     -passes=no-op-module %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-MODULE-PASS
+; CHECK-MODULE-PASS: Starting pass manager
+; CHECK-MODULE-PASS-NEXT: Running pass: NoOpModulePass
+; CHECK-MODULE-PASS-NEXT: Finished pass manager
+
+; RUN: opt -disable-output -disable-verify -debug-pass-manager \
+; RUN:     -passes=no-op-cgscc %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-CGSCC-PASS
+; RUN: opt -disable-output -disable-verify -debug-pass-manager \
+; RUN:     -passes='cgscc(no-op-cgscc)' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-CGSCC-PASS
+; CHECK-CGSCC-PASS: Starting pass manager
+; CHECK-CGSCC-PASS-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor
+; CHECK-CGSCC-PASS-NEXT: Running analysis: CGSCCAnalysisManagerModuleProxy
+; CHECK-CGSCC-PASS-NEXT: Running analysis: Lazy CallGraph Analysis
+; CHECK-CGSCC-PASS-NEXT: Starting pass manager
+; CHECK-CGSCC-PASS-NEXT: Running pass: NoOpCGSCCPass
+; CHECK-CGSCC-PASS-NEXT: Finished pass manager
+; CHECK-CGSCC-PASS-NEXT: Finished pass manager
+
+; RUN: opt -disable-output -disable-verify -debug-pass-manager \
+; RUN:     -passes=no-op-function %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-FUNCTION-PASS
+; RUN: opt -disable-output -disable-verify -debug-pass-manager \
+; RUN:     -passes='function(no-op-function)' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-FUNCTION-PASS
+; CHECK-FUNCTION-PASS: Starting pass manager
+; CHECK-FUNCTION-PASS-NEXT: Running pass: ModuleToFunctionPassAdaptor
+; CHECK-FUNCTION-PASS-NEXT: Running analysis: FunctionAnalysisManagerModuleProxy
+; CHECK-FUNCTION-PASS-NEXT: Starting pass manager
+; CHECK-FUNCTION-PASS-NEXT: Running pass: NoOpFunctionPass
+; CHECK-FUNCTION-PASS-NEXT: Finished pass manager
+; CHECK-FUNCTION-PASS-NEXT: Finished pass manager
+
 ; RUN: opt -disable-output -debug-pass-manager -passes=print %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-MODULE-PRINT
-; CHECK-MODULE-PRINT: Starting module pass manager
-; CHECK-MODULE-PRINT: Running module pass: VerifierPass
-; CHECK-MODULE-PRINT: Running module pass: PrintModulePass
+; CHECK-MODULE-PRINT: Starting pass manager
+; CHECK-MODULE-PRINT: Running pass: VerifierPass
+; CHECK-MODULE-PRINT: Running pass: PrintModulePass
 ; CHECK-MODULE-PRINT: ModuleID
 ; CHECK-MODULE-PRINT: define void @foo()
-; CHECK-MODULE-PRINT: Running module pass: VerifierPass
-; CHECK-MODULE-PRINT: Finished module pass manager
+; CHECK-MODULE-PRINT: Running pass: VerifierPass
+; CHECK-MODULE-PRINT: Finished pass manager
+
+; RUN: opt -disable-output -debug-pass-manager -disable-verify -passes='print,verify' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-MODULE-VERIFY
+; CHECK-MODULE-VERIFY: Starting pass manager
+; CHECK-MODULE-VERIFY: Running pass: PrintModulePass
+; CHECK-MODULE-VERIFY: ModuleID
+; CHECK-MODULE-VERIFY: define void @foo()
+; CHECK-MODULE-VERIFY: Running pass: VerifierPass
+; CHECK-MODULE-VERIFY: Finished pass manager
 
 ; RUN: opt -disable-output -debug-pass-manager -passes='function(print)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-FUNCTION-PRINT
-; CHECK-FUNCTION-PRINT: Starting module pass manager
-; CHECK-FUNCTION-PRINT: Running module pass: VerifierPass
-; CHECK-FUNCTION-PRINT: Starting function pass manager
-; CHECK-FUNCTION-PRINT: Running function pass: PrintFunctionPass
+; CHECK-FUNCTION-PRINT: Starting pass manager
+; CHECK-FUNCTION-PRINT: Running pass: VerifierPass
+; CHECK-FUNCTION-PRINT: Running pass: ModuleToFunctionPassAdaptor
+; CHECK-FUNCTION-PRINT: Running analysis: FunctionAnalysisManagerModuleProxy
+; CHECK-FUNCTION-PRINT: Starting pass manager
+; CHECK-FUNCTION-PRINT: Running pass: PrintFunctionPass
 ; CHECK-FUNCTION-PRINT-NOT: ModuleID
 ; CHECK-FUNCTION-PRINT: define void @foo()
-; CHECK-FUNCTION-PRINT: Finished function pass manager
-; CHECK-FUNCTION-PRINT: Running module pass: VerifierPass
-; CHECK-FUNCTION-PRINT: Finished module pass manager
+; CHECK-FUNCTION-PRINT: Finished pass manager
+; CHECK-FUNCTION-PRINT: Running pass: VerifierPass
+; CHECK-FUNCTION-PRINT: Finished pass manager
+
+; RUN: opt -disable-output -debug-pass-manager -disable-verify -passes='function(print,verify)' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-FUNCTION-VERIFY
+; CHECK-FUNCTION-VERIFY: Starting pass manager
+; CHECK-FUNCTION-VERIFY: Starting pass manager
+; CHECK-FUNCTION-VERIFY: Running pass: PrintFunctionPass
+; CHECK-FUNCTION-VERIFY-NOT: ModuleID
+; CHECK-FUNCTION-VERIFY: define void @foo()
+; CHECK-FUNCTION-VERIFY: Running pass: VerifierPass
+; CHECK-FUNCTION-VERIFY: Finished pass manager
+; CHECK-FUNCTION-VERIFY: Finished pass manager
 
 ; RUN: opt -S -o - -passes='no-op-module,no-op-module' %s \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-NOOP
@@ -40,30 +98,208 @@
 
 ; RUN: opt -disable-output -debug-pass-manager -verify-each -passes='no-op-module,function(no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-VERIFY-EACH
-; CHECK-VERIFY-EACH: Starting module pass manager
-; CHECK-VERIFY-EACH: Running module pass: VerifierPass
-; CHECK-VERIFY-EACH: Running module pass: NoOpModulePass
-; CHECK-VERIFY-EACH: Running module pass: VerifierPass
-; CHECK-VERIFY-EACH: Starting function pass manager
-; CHECK-VERIFY-EACH: Running function pass: NoOpFunctionPass
-; CHECK-VERIFY-EACH: Running function pass: VerifierPass
-; CHECK-VERIFY-EACH: Finished function pass manager
-; CHECK-VERIFY-EACH: Running module pass: VerifierPass
-; CHECK-VERIFY-EACH: Finished module pass manager
+; CHECK-VERIFY-EACH: Starting pass manager
+; CHECK-VERIFY-EACH: Running pass: VerifierPass
+; CHECK-VERIFY-EACH: Running pass: NoOpModulePass
+; CHECK-VERIFY-EACH: Running pass: VerifierPass
+; CHECK-VERIFY-EACH: Starting pass manager
+; CHECK-VERIFY-EACH: Running pass: NoOpFunctionPass
+; CHECK-VERIFY-EACH: Running pass: VerifierPass
+; CHECK-VERIFY-EACH: Finished pass manager
+; CHECK-VERIFY-EACH: Running pass: VerifierPass
+; CHECK-VERIFY-EACH: Finished pass manager
 
 ; RUN: opt -disable-output -debug-pass-manager -disable-verify -passes='no-op-module,function(no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-NO-VERIFY
-; CHECK-NO-VERIFY: Starting module pass manager
+; CHECK-NO-VERIFY: Starting pass manager
 ; CHECK-NO-VERIFY-NOT: VerifierPass
-; CHECK-NO-VERIFY: Running module pass: NoOpModulePass
+; CHECK-NO-VERIFY: Running pass: NoOpModulePass
 ; CHECK-NO-VERIFY-NOT: VerifierPass
-; CHECK-NO-VERIFY: Starting function pass manager
-; CHECK-NO-VERIFY: Running function pass: NoOpFunctionPass
+; CHECK-NO-VERIFY: Starting pass manager
+; CHECK-NO-VERIFY: Running pass: NoOpFunctionPass
 ; CHECK-NO-VERIFY-NOT: VerifierPass
-; CHECK-NO-VERIFY: Finished function pass manager
+; CHECK-NO-VERIFY: Finished pass manager
 ; CHECK-NO-VERIFY-NOT: VerifierPass
-; CHECK-NO-VERIFY: Finished module pass manager
+; CHECK-NO-VERIFY: Finished pass manager
+
+; RUN: opt -disable-output -debug-pass-manager \
+; RUN:     -passes='require<no-op-module>,cgscc(require<no-op-cgscc>,function(require<no-op-function>))' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-ANALYSES
+; CHECK-ANALYSES: Starting pass manager
+; CHECK-ANALYSES: Running pass: RequireAnalysisPass
+; CHECK-ANALYSES: Running analysis: NoOpModuleAnalysis
+; CHECK-ANALYSES: Starting pass manager
+; CHECK-ANALYSES: Running pass: RequireAnalysisPass
+; CHECK-ANALYSES: Running analysis: NoOpCGSCCAnalysis
+; CHECK-ANALYSES: Starting pass manager
+; CHECK-ANALYSES: Running pass: RequireAnalysisPass
+; CHECK-ANALYSES: Running analysis: NoOpFunctionAnalysis
+
+; Make sure no-op passes that preserve all analyses don't even try to do any
+; analysis invalidation.
+; RUN: opt -disable-output -debug-pass-manager \
+; RUN:     -passes='require<no-op-module>,cgscc(require<no-op-cgscc>,function(require<no-op-function>))' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-NO-OP-INVALIDATION
+; CHECK-NO-OP-INVALIDATION: Starting pass manager
+; CHECK-NO-OP-INVALIDATION-NOT: Invalidating all non-preserved analyses
+
+; RUN: opt -disable-output -debug-pass-manager \
+; RUN:     -passes='require<no-op-module>,require<no-op-module>,require<no-op-module>' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-DO-CACHE-MODULE-ANALYSIS-RESULTS
+; CHECK-DO-CACHE-MODULE-ANALYSIS-RESULTS: Starting pass manager
+; CHECK-DO-CACHE-MODULE-ANALYSIS-RESULTS: Running pass: RequireAnalysisPass
+; CHECK-DO-CACHE-MODULE-ANALYSIS-RESULTS: Running analysis: NoOpModuleAnalysis
+; CHECK-DO-CACHE-MODULE-ANALYSIS-RESULTS-NOT: Running analysis: NoOpModuleAnalysis
+
+; RUN: opt -disable-output -debug-pass-manager \
+; RUN:     -passes='require<no-op-module>,invalidate<no-op-module>,require<no-op-module>' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-DO-INVALIDATE-MODULE-ANALYSIS-RESULTS
+; CHECK-DO-INVALIDATE-MODULE-ANALYSIS-RESULTS: Starting pass manager
+; CHECK-DO-INVALIDATE-MODULE-ANALYSIS-RESULTS: Running pass: RequireAnalysisPass
+; CHECK-DO-INVALIDATE-MODULE-ANALYSIS-RESULTS: Running analysis: NoOpModuleAnalysis
+; CHECK-DO-INVALIDATE-MODULE-ANALYSIS-RESULTS: Invalidating analysis: NoOpModuleAnalysis
+; CHECK-DO-INVALIDATE-MODULE-ANALYSIS-RESULTS: Running analysis: NoOpModuleAnalysis
+
+; RUN: opt -disable-output -debug-pass-manager \
+; RUN:     -passes='cgscc(require<no-op-cgscc>,require<no-op-cgscc>,require<no-op-cgscc>)' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-DO-CACHE-CGSCC-ANALYSIS-RESULTS
+; CHECK-DO-CACHE-CGSCC-ANALYSIS-RESULTS: Starting pass manager
+; CHECK-DO-CACHE-CGSCC-ANALYSIS-RESULTS: Running pass: RequireAnalysisPass
+; CHECK-DO-CACHE-CGSCC-ANALYSIS-RESULTS: Running analysis: NoOpCGSCCAnalysis
+; CHECK-DO-CACHE-CGSCC-ANALYSIS-RESULTS-NOT: Running analysis: NoOpCGSCCAnalysis
+
+; RUN: opt -disable-output -debug-pass-manager \
+; RUN:     -passes='cgscc(require<no-op-cgscc>,invalidate<no-op-cgscc>,require<no-op-cgscc>)' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-DO-INVALIDATE-CGSCC-ANALYSIS-RESULTS
+; CHECK-DO-INVALIDATE-CGSCC-ANALYSIS-RESULTS: Starting pass manager
+; CHECK-DO-INVALIDATE-CGSCC-ANALYSIS-RESULTS: Running pass: RequireAnalysisPass
+; CHECK-DO-INVALIDATE-CGSCC-ANALYSIS-RESULTS: Running analysis: NoOpCGSCCAnalysis
+; CHECK-DO-INVALIDATE-CGSCC-ANALYSIS-RESULTS: Invalidating analysis: NoOpCGSCCAnalysis
+; CHECK-DO-INVALIDATE-CGSCC-ANALYSIS-RESULTS: Running analysis: NoOpCGSCCAnalysis
+
+; RUN: opt -disable-output -debug-pass-manager \
+; RUN:     -passes='function(require<no-op-function>,require<no-op-function>,require<no-op-function>)' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-DO-CACHE-FUNCTION-ANALYSIS-RESULTS
+; CHECK-DO-CACHE-FUNCTION-ANALYSIS-RESULTS: Starting pass manager
+; CHECK-DO-CACHE-FUNCTION-ANALYSIS-RESULTS: Running pass: RequireAnalysisPass
+; CHECK-DO-CACHE-FUNCTION-ANALYSIS-RESULTS: Running analysis: NoOpFunctionAnalysis
+; CHECK-DO-CACHE-FUNCTION-ANALYSIS-RESULTS-NOT: Running analysis: NoOpFunctionAnalysis
+
+; RUN: opt -disable-output -debug-pass-manager \
+; RUN:     -passes='function(require<no-op-function>,invalidate<no-op-function>,require<no-op-function>)' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-DO-INVALIDATE-FUNCTION-ANALYSIS-RESULTS
+; CHECK-DO-INVALIDATE-FUNCTION-ANALYSIS-RESULTS: Starting pass manager
+; CHECK-DO-INVALIDATE-FUNCTION-ANALYSIS-RESULTS: Running pass: RequireAnalysisPass
+; CHECK-DO-INVALIDATE-FUNCTION-ANALYSIS-RESULTS: Running analysis: NoOpFunctionAnalysis
+; CHECK-DO-INVALIDATE-FUNCTION-ANALYSIS-RESULTS: Invalidating analysis: NoOpFunctionAnalysis
+; CHECK-DO-INVALIDATE-FUNCTION-ANALYSIS-RESULTS: Running analysis: NoOpFunctionAnalysis
+
+; RUN: opt -disable-output -disable-verify -debug-pass-manager \
+; RUN:     -passes='require<no-op-module>,module(require<no-op-module>,function(require<no-op-function>,invalidate<all>,require<no-op-function>),require<no-op-module>),require<no-op-module>' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-INVALIDATE-ALL
+; CHECK-INVALIDATE-ALL: Starting pass manager
+; CHECK-INVALIDATE-ALL: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL: Running analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL: Starting pass manager
+; CHECK-INVALIDATE-ALL: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL-NOT: Running analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL: Starting pass manager
+; CHECK-INVALIDATE-ALL: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL: Running analysis: NoOpFunctionAnalysis
+; CHECK-INVALIDATE-ALL: Running pass: InvalidateAllAnalysesPass
+; CHECK-INVALIDATE-ALL: Invalidating all non-preserved analyses
+; CHECK-INVALIDATE-ALL: Invalidating analysis: NoOpFunctionAnalysis
+; CHECK-INVALIDATE-ALL: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL: Running analysis: NoOpFunctionAnalysis
+; CHECK-INVALIDATE-ALL: Finished pass manager
+; CHECK-INVALIDATE-ALL: Invalidating all non-preserved analyses
+; CHECK-INVALIDATE-ALL-NOT: Running analysis: NoOpFunctionAnalysis
+; CHECK-INVALIDATE-ALL: Invalidating all non-preserved analyses
+; CHECK-INVALIDATE-ALL: Invalidating analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL: Running analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL: Finished pass manager
+; CHECK-INVALIDATE-ALL: Invalidating all non-preserved analyses
+; CHECK-INVALIDATE-ALL-NOT: Invalidating analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL-NOT: Running analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL: Finished pass manager
+
+; RUN: opt -disable-output -disable-verify -debug-pass-manager \
+; RUN:     -passes='require<no-op-module>,module(require<no-op-module>,cgscc(require<no-op-cgscc>,function(require<no-op-function>,invalidate<all>,require<no-op-function>),require<no-op-cgscc>),require<no-op-module>),require<no-op-module>' %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=CHECK-INVALIDATE-ALL-CG
+; CHECK-INVALIDATE-ALL-CG: Starting pass manager
+; CHECK-INVALIDATE-ALL-CG: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL-CG: Running analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL-CG: Starting pass manager
+; CHECK-INVALIDATE-ALL-CG: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL-CG-NOT: Running analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL-CG: Starting pass manager
+; CHECK-INVALIDATE-ALL-CG: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL-CG: Running analysis: NoOpCGSCCAnalysis
+; CHECK-INVALIDATE-ALL-CG: Starting pass manager
+; CHECK-INVALIDATE-ALL-CG: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL-CG: Running analysis: NoOpFunctionAnalysis
+; CHECK-INVALIDATE-ALL-CG: Running pass: InvalidateAllAnalysesPass
+; CHECK-INVALIDATE-ALL-CG: Invalidating all non-preserved analyses
+; CHECK-INVALIDATE-ALL-CG: Invalidating analysis: NoOpFunctionAnalysis
+; CHECK-INVALIDATE-ALL-CG: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL-CG: Running analysis: NoOpFunctionAnalysis
+; CHECK-INVALIDATE-ALL-CG: Finished pass manager
+; CHECK-INVALIDATE-ALL-CG: Invalidating all non-preserved analyses
+; CHECK-INVALIDATE-ALL-CG-NOT: Running analysis: NoOpFunctionAnalysis
+; CHECK-INVALIDATE-ALL-CG: Invalidating all non-preserved analyses
+; CHECK-INVALIDATE-ALL-CG: Invalidating analysis: NoOpCGSCCAnalysis
+; CHECK-INVALIDATE-ALL-CG: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL-CG: Running analysis: NoOpCGSCCAnalysis
+; CHECK-INVALIDATE-ALL-CG: Finished pass manager
+; CHECK-INVALIDATE-ALL-CG: Invalidating all non-preserved analyses
+; CHECK-INVALIDATE-ALL-CG-NOT: Invalidating analysis: NoOpCGSCCAnalysis
+; CHECK-INVALIDATE-ALL-CG: Invalidating all non-preserved analyses
+; CHECK-INVALIDATE-ALL-CG: Invalidating analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL-CG: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL-CG: Running analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL-CG: Finished pass manager
+; CHECK-INVALIDATE-ALL-CG: Invalidating all non-preserved analyses
+; CHECK-INVALIDATE-ALL-CG-NOT: Invalidating analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL-CG: Running pass: RequireAnalysisPass
+; CHECK-INVALIDATE-ALL-CG-NOT: Running analysis: NoOpModuleAnalysis
+; CHECK-INVALIDATE-ALL-CG: Finished pass manager
+
+; RUN: opt -disable-output -disable-verify -debug-pass-manager %s 2>&1 \
+; RUN:     -passes='require<targetlibinfo>,invalidate<all>,require<targetlibinfo>' \
+; RUN:     | FileCheck %s --check-prefix=CHECK-TLI
+; CHECK-TLI: Starting pass manager
+; CHECK-TLI: Running pass: RequireAnalysisPass
+; CHECK-TLI: Running analysis: TargetLibraryAnalysis
+; CHECK-TLI: Running pass: InvalidateAllAnalysesPass
+; CHECK-TLI-NOT: Invalidating analysis: TargetLibraryAnalysis
+; CHECK-TLI: Running pass: RequireAnalysisPass
+; CHECK-TLI-NOT: Running analysis: TargetLibraryAnalysis
+; CHECK-TLI: Finished pass manager
+
+; RUN: opt -disable-output -disable-verify -debug-pass-manager %s 2>&1 \
+; RUN:     -passes='require<targetir>,invalidate<all>,require<targetir>' \
+; RUN:     | FileCheck %s --check-prefix=CHECK-TIRA
+; CHECK-TIRA: Starting pass manager
+; CHECK-TIRA: Running pass: RequireAnalysisPass
+; CHECK-TIRA: Running analysis: TargetIRAnalysis
+; CHECK-TIRA: Running pass: InvalidateAllAnalysesPass
+; CHECK-TIRA-NOT: Invalidating analysis: TargetIRAnalysis
+; CHECK-TIRA: Running pass: RequireAnalysisPass
+; CHECK-TIRA-NOT: Running analysis: TargetIRAnalysis
+; CHECK-TIRA: Finished pass manager
+
+; RUN: opt -disable-output -disable-verify -debug-pass-manager %s 2>&1 \
+; RUN:     -passes='require<domtree>' \
+; RUN:     | FileCheck %s --check-prefix=CHECK-DT
+; CHECK-DT: Starting pass manager
+; CHECK-DT: Running pass: RequireAnalysisPass
+; CHECK-DT: Running analysis: DominatorTreeAnalysis
+; CHECK-DT: Finished pass manager
 
 define void @foo() {
   ret void
 }
+
+declare void @bar()
diff --git a/test/Other/pass-pipeline-parsing.ll b/test/Other/pass-pipeline-parsing.ll
index 4ec4162..da0e760 100644
--- a/test/Other/pass-pipeline-parsing.ll
+++ b/test/Other/pass-pipeline-parsing.ll
@@ -1,59 +1,55 @@
 ; RUN: opt -disable-output -debug-pass-manager \
 ; RUN:     -passes=no-op-module,no-op-module %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-TWO-NOOP-MP
-; CHECK-TWO-NOOP-MP: Starting module pass manager
-; CHECK-TWO-NOOP-MP: Running module pass: NoOpModulePass
-; CHECK-TWO-NOOP-MP: Running module pass: NoOpModulePass
-; CHECK-TWO-NOOP-MP: Finished module pass manager
+; CHECK-TWO-NOOP-MP: Starting pass manager
+; CHECK-TWO-NOOP-MP: Running pass: NoOpModulePass
+; CHECK-TWO-NOOP-MP: Running pass: NoOpModulePass
+; CHECK-TWO-NOOP-MP: Finished pass manager
 
 ; RUN: opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='module(no-op-module,no-op-module)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-NESTED-TWO-NOOP-MP
-; CHECK-NESTED-TWO-NOOP-MP: Starting module pass manager
-; CHECK-NESTED-TWO-NOOP-MP: Running module pass: ModulePassManager
-; CHECK-NESTED-TWO-NOOP-MP: Starting module pass manager
-; CHECK-NESTED-TWO-NOOP-MP: Running module pass: NoOpModulePass
-; CHECK-NESTED-TWO-NOOP-MP: Running module pass: NoOpModulePass
-; CHECK-NESTED-TWO-NOOP-MP: Finished module pass manager
-; CHECK-NESTED-TWO-NOOP-MP: Finished module pass manager
+; CHECK-NESTED-TWO-NOOP-MP: Starting pass manager
+; CHECK-NESTED-TWO-NOOP-MP: Starting pass manager
+; CHECK-NESTED-TWO-NOOP-MP: Running pass: NoOpModulePass
+; CHECK-NESTED-TWO-NOOP-MP: Running pass: NoOpModulePass
+; CHECK-NESTED-TWO-NOOP-MP: Finished pass manager
+; CHECK-NESTED-TWO-NOOP-MP: Finished pass manager
 
 ; RUN: opt -disable-output -debug-pass-manager \
 ; RUN:     -passes=no-op-function,no-op-function %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-TWO-NOOP-FP
-; CHECK-TWO-NOOP-FP: Starting module pass manager
-; CHECK-TWO-NOOP-FP: Running module pass: ModuleToFunctionPassAdaptor
-; CHECK-TWO-NOOP-FP: Starting function pass manager
-; CHECK-TWO-NOOP-FP: Running function pass: NoOpFunctionPass
-; CHECK-TWO-NOOP-FP: Running function pass: NoOpFunctionPass
-; CHECK-TWO-NOOP-FP: Finished function pass manager
-; CHECK-TWO-NOOP-FP: Finished module pass manager
+; CHECK-TWO-NOOP-FP: Starting pass manager
+; CHECK-TWO-NOOP-FP: Running pass: ModuleToFunctionPassAdaptor
+; CHECK-TWO-NOOP-FP: Starting pass manager
+; CHECK-TWO-NOOP-FP: Running pass: NoOpFunctionPass
+; CHECK-TWO-NOOP-FP: Running pass: NoOpFunctionPass
+; CHECK-TWO-NOOP-FP: Finished pass manager
+; CHECK-TWO-NOOP-FP: Finished pass manager
 
 ; RUN: opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function,no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-NESTED-TWO-NOOP-FP
-; CHECK-NESTED-TWO-NOOP-FP: Starting module pass manager
-; CHECK-NESTED-TWO-NOOP-FP: Running module pass: ModuleToFunctionPassAdaptor
-; CHECK-NESTED-TWO-NOOP-FP: Starting function pass manager
-; CHECK-NESTED-TWO-NOOP-FP: Running function pass: FunctionPassManager
-; CHECK-NESTED-TWO-NOOP-FP: Starting function pass manager
-; CHECK-NESTED-TWO-NOOP-FP: Running function pass: NoOpFunctionPass
-; CHECK-NESTED-TWO-NOOP-FP: Running function pass: NoOpFunctionPass
-; CHECK-NESTED-TWO-NOOP-FP: Finished function pass manager
-; CHECK-NESTED-TWO-NOOP-FP: Finished function pass manager
-; CHECK-NESTED-TWO-NOOP-FP: Finished module pass manager
+; CHECK-NESTED-TWO-NOOP-FP: Starting pass manager
+; CHECK-NESTED-TWO-NOOP-FP: Running pass: ModuleToFunctionPassAdaptor
+; CHECK-NESTED-TWO-NOOP-FP: Starting pass manager
+; CHECK-NESTED-TWO-NOOP-FP: Running pass: NoOpFunctionPass
+; CHECK-NESTED-TWO-NOOP-FP: Running pass: NoOpFunctionPass
+; CHECK-NESTED-TWO-NOOP-FP: Finished pass manager
+; CHECK-NESTED-TWO-NOOP-FP: Finished pass manager
 
 ; RUN: opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module,function(no-op-function,no-op-function),no-op-module' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-MIXED-FP-AND-MP
-; CHECK-MIXED-FP-AND-MP: Starting module pass manager
-; CHECK-MIXED-FP-AND-MP: Running module pass: NoOpModulePass
-; CHECK-MIXED-FP-AND-MP: Running module pass: ModuleToFunctionPassAdaptor
-; CHECK-MIXED-FP-AND-MP: Starting function pass manager
-; CHECK-MIXED-FP-AND-MP: Running function pass: NoOpFunctionPass
-; CHECK-MIXED-FP-AND-MP: Running function pass: NoOpFunctionPass
-; CHECK-MIXED-FP-AND-MP: Finished function pass manager
-; CHECK-MIXED-FP-AND-MP: Running module pass: NoOpModulePass
-; CHECK-MIXED-FP-AND-MP: Finished module pass manager
+; CHECK-MIXED-FP-AND-MP: Starting pass manager
+; CHECK-MIXED-FP-AND-MP: Running pass: NoOpModulePass
+; CHECK-MIXED-FP-AND-MP: Running pass: ModuleToFunctionPassAdaptor
+; CHECK-MIXED-FP-AND-MP: Starting pass manager
+; CHECK-MIXED-FP-AND-MP: Running pass: NoOpFunctionPass
+; CHECK-MIXED-FP-AND-MP: Running pass: NoOpFunctionPass
+; CHECK-MIXED-FP-AND-MP: Finished pass manager
+; CHECK-MIXED-FP-AND-MP: Running pass: NoOpModulePass
+; CHECK-MIXED-FP-AND-MP: Finished pass manager
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module)' %s 2>&1 \
@@ -105,41 +101,41 @@
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED10
 ; CHECK-UNBALANCED10: unable to parse pass pipeline description
 
-; RUN: opt -disable-output -debug-pass-manager -debug-cgscc-pass-manager \
+; RUN: opt -disable-output -debug-pass-manager \
 ; RUN:     -passes=no-op-cgscc,no-op-cgscc %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-TWO-NOOP-CG
-; CHECK-TWO-NOOP-CG: Starting module pass manager
-; CHECK-TWO-NOOP-CG: Running module pass: ModuleToPostOrderCGSCCPassAdaptor
-; CHECK-TWO-NOOP-CG: Starting CGSCC pass manager
-; CHECK-TWO-NOOP-CG: Running CGSCC pass: NoOpCGSCCPass
-; CHECK-TWO-NOOP-CG: Running CGSCC pass: NoOpCGSCCPass
-; CHECK-TWO-NOOP-CG: Finished CGSCC pass manager
-; CHECK-TWO-NOOP-CG: Finished module pass manager
-
-; RUN: opt -disable-output -debug-pass-manager -debug-cgscc-pass-manager \
+; CHECK-TWO-NOOP-CG: Starting pass manager
+; CHECK-TWO-NOOP-CG: Running pass: ModuleToPostOrderCGSCCPassAdaptor
+; CHECK-TWO-NOOP-CG: Starting pass manager
+; CHECK-TWO-NOOP-CG: Running pass: NoOpCGSCCPass
+; CHECK-TWO-NOOP-CG: Running pass: NoOpCGSCCPass
+; CHECK-TWO-NOOP-CG: Finished pass manager
+; CHECK-TWO-NOOP-CG: Finished pass manager
+
+; RUN: opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='module(function(no-op-function),cgscc(no-op-cgscc,function(no-op-function),no-op-cgscc),function(no-op-function))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-NESTED-MP-CG-FP
-; CHECK-NESTED-MP-CG-FP: Starting module pass manager
-; CHECK-NESTED-MP-CG-FP: Starting module pass manager
-; CHECK-NESTED-MP-CG-FP: Running module pass: ModuleToFunctionPassAdaptor
-; CHECK-NESTED-MP-CG-FP: Starting function pass manager
-; CHECK-NESTED-MP-CG-FP: Running function pass: NoOpFunctionPass
-; CHECK-NESTED-MP-CG-FP: Finished function pass manager
-; CHECK-NESTED-MP-CG-FP: Running module pass: ModuleToPostOrderCGSCCPassAdaptor
-; CHECK-NESTED-MP-CG-FP: Starting CGSCC pass manager
-; CHECK-NESTED-MP-CG-FP: Running CGSCC pass: NoOpCGSCCPass
-; CHECK-NESTED-MP-CG-FP: Running CGSCC pass: CGSCCToFunctionPassAdaptor
-; CHECK-NESTED-MP-CG-FP: Starting function pass manager
-; CHECK-NESTED-MP-CG-FP: Running function pass: NoOpFunctionPass
-; CHECK-NESTED-MP-CG-FP: Finished function pass manager
-; CHECK-NESTED-MP-CG-FP: Running CGSCC pass: NoOpCGSCCPass
-; CHECK-NESTED-MP-CG-FP: Finished CGSCC pass manager
-; CHECK-NESTED-MP-CG-FP: Running module pass: ModuleToFunctionPassAdaptor
-; CHECK-NESTED-MP-CG-FP: Starting function pass manager
-; CHECK-NESTED-MP-CG-FP: Running function pass: NoOpFunctionPass
-; CHECK-NESTED-MP-CG-FP: Finished function pass manager
-; CHECK-NESTED-MP-CG-FP: Finished module pass manager
-; CHECK-NESTED-MP-CG-FP: Finished module pass manager
+; CHECK-NESTED-MP-CG-FP: Starting pass manager
+; CHECK-NESTED-MP-CG-FP: Starting pass manager
+; CHECK-NESTED-MP-CG-FP: Running pass: ModuleToFunctionPassAdaptor
+; CHECK-NESTED-MP-CG-FP: Starting pass manager
+; CHECK-NESTED-MP-CG-FP: Running pass: NoOpFunctionPass
+; CHECK-NESTED-MP-CG-FP: Finished pass manager
+; CHECK-NESTED-MP-CG-FP: Running pass: ModuleToPostOrderCGSCCPassAdaptor
+; CHECK-NESTED-MP-CG-FP: Starting pass manager
+; CHECK-NESTED-MP-CG-FP: Running pass: NoOpCGSCCPass
+; CHECK-NESTED-MP-CG-FP: Running pass: CGSCCToFunctionPassAdaptor
+; CHECK-NESTED-MP-CG-FP: Starting pass manager
+; CHECK-NESTED-MP-CG-FP: Running pass: NoOpFunctionPass
+; CHECK-NESTED-MP-CG-FP: Finished pass manager
+; CHECK-NESTED-MP-CG-FP: Running pass: NoOpCGSCCPass
+; CHECK-NESTED-MP-CG-FP: Finished pass manager
+; CHECK-NESTED-MP-CG-FP: Running pass: ModuleToFunctionPassAdaptor
+; CHECK-NESTED-MP-CG-FP: Starting pass manager
+; CHECK-NESTED-MP-CG-FP: Running pass: NoOpFunctionPass
+; CHECK-NESTED-MP-CG-FP: Finished pass manager
+; CHECK-NESTED-MP-CG-FP: Finished pass manager
+; CHECK-NESTED-MP-CG-FP: Finished pass manager
 
 define void @f() {
  ret void
diff --git a/test/SymbolRewriter/rewrite.ll b/test/SymbolRewriter/rewrite.ll
index 716fff9..e8a0db6 100644
--- a/test/SymbolRewriter/rewrite.ll
+++ b/test/SymbolRewriter/rewrite.ll
@@ -28,12 +28,40 @@ entry:
   ret void
 }
 
+$source_comdat_function = comdat any
+define dllexport void @source_comdat_function() comdat($source_comdat_function) {
+entry:
+  ret void
+}
+
+$source_comdat_function_1 = comdat exactmatch
+define dllexport void @source_comdat_function_1() comdat($source_comdat_function_1) {
+entry:
+  ret void
+}
+
+$source_comdat_variable = comdat largest
+@source_comdat_variable = global i32 32, comdat($source_comdat_variable)
+
+$source_comdat_variable_1 = comdat noduplicates
+@source_comdat_variable_1 = global i32 64, comdat($source_comdat_variable_1)
+
+; CHECK: $target_comdat_function = comdat any
+; CHECK: $target_comdat_function_1 = comdat exactmatch
+; CHECK: $target_comdat_variable = comdat largest
+; CHECK: $target_comdat_variable_1 = comdat noduplicates
+
 ; CHECK: @target_variable = external global i32
 ; CHECK-NOT: @source_variable = external global i32
 ; CHECK: @target_pattern_variable = external global i32
 ; CHECK-NOT: @source_pattern_variable = external global i32
 ; CHECK: @target_pattern_multiple_variable_matches = external global i32
 ; CHECK-NOT: @source_pattern_multiple_variable_matches = external global i32
+; CHECK: @target_comdat_variable = global i32 32, comdat
+; CHECK-NOT: @source_comdat_variable = global i32 32, comdat
+; CHECK: @target_comdat_variable_1 = global i32 64, comdat
+; CHECK-NOT: @source_comdat_variable_1 = global i32 64, comdat
+
 ; CHECK: declare void @target_function()
 ; CHECK-NOT: declare void @source_function()
 ; CHECK: declare void @target_pattern_function()
@@ -57,3 +85,8 @@ entry:
 ; CHECK:   ret i32 %res
 ; CHECK: }
 
+; CHECK: define dllexport void @target_comdat_function() comdat
+; CHECK-NOT: define dllexport void @source_comdat_function() comdat
+; CHECK: define dllexport void @target_comdat_function_1() comdat
+; CHECK-NOT: define dllexport void @source_comdat_function_1() comdat
+
diff --git a/test/SymbolRewriter/rewrite.map b/test/SymbolRewriter/rewrite.map
index ef6dfc8..8094939 100644
--- a/test/SymbolRewriter/rewrite.map
+++ b/test/SymbolRewriter/rewrite.map
@@ -44,3 +44,23 @@ global alias: {
   target: _ZN1SD1Ev,
 }
 
+function: {
+  source: source_comdat_function,
+  target: target_comdat_function,
+}
+
+function: {
+  source: source_comdat_function_(.*),
+  transform: target_comdat_function_\1,
+}
+
+global variable: {
+  source: source_comdat_variable,
+  target: target_comdat_variable,
+}
+
+global variable: {
+  source: source_comdat_variable_(.*),
+  transform: target_comdat_variable_\1,
+}
+
diff --git a/test/Transforms/AddDiscriminators/basic.ll b/test/Transforms/AddDiscriminators/basic.ll
index 6c1e532..7c8b3d3 100644
--- a/test/Transforms/AddDiscriminators/basic.ll
+++ b/test/Transforms/AddDiscriminators/basic.ll
@@ -40,20 +40,20 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [basic.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"basic.c", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [basic.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5 "}
-!10 = metadata !{i32 3, i32 0, metadata !11, null}
-!11 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [basic.c]
-!12 = metadata !{i32 4, i32 0, metadata !4, null}
-
-; CHECK: !12 = metadata !{i32 3, i32 0, metadata !13, null}
-; CHECK: !13 = metadata !{metadata !"0xb\001", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ] [./basic.c]
-; CHECK: !14 = metadata !{i32 4, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [basic.c] [DW_LANG_C99]
+!1 = !{!"basic.c", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, void (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [basic.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5 "}
+!10 = !MDLocation(line: 3, scope: !11)
+!11 = !{!"0xb\003\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [basic.c]
+!12 = !MDLocation(line: 4, scope: !4)
+
+; CHECK: !12 = !MDLocation(line: 3, scope: !13)
+; CHECK: !13 = !{!"0xb\001", !1, !11} ; [ DW_TAG_lexical_block ] [./basic.c]
+; CHECK: !14 = !MDLocation(line: 4, scope: !4)
diff --git a/test/Transforms/AddDiscriminators/first-only.ll b/test/Transforms/AddDiscriminators/first-only.ll
index e15a80a..153cfc8 100644
--- a/test/Transforms/AddDiscriminators/first-only.ll
+++ b/test/Transforms/AddDiscriminators/first-only.ll
@@ -50,33 +50,33 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 (trunk 199750) (llvm/trunk 199751)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [first-only.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"first-only.c", metadata !"."}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [first-only.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5 (trunk 199750) (llvm/trunk 199751)"}
-!10 = metadata !{i32 3, i32 0, metadata !11, null}
-
-!11 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [first-only.c]
-; CHECK: !11 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4}
-
-!12 = metadata !{i32 3, i32 0, metadata !13, null}
-
-!13 = metadata !{metadata !"0xb\003\000\001", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ] [first-only.c]
-; CHECK: !13 = metadata !{metadata !"0xb\001", metadata !1, metadata !14} ; [ DW_TAG_lexical_block ] [./first-only.c]
-
-!14 = metadata !{i32 4, i32 0, metadata !13, null}
-; CHECK: !14 = metadata !{metadata !"0xb\003\000\001", metadata !1, metadata !11}
-
-!15 = metadata !{i32 5, i32 0, metadata !13, null}
-; CHECK: !15 = metadata !{i32 4, i32 0, metadata !14, null}
-
-!16 = metadata !{i32 6, i32 0, metadata !4, null}
-; CHECK: !16 = metadata !{i32 5, i32 0, metadata !14, null}
-; CHECK: !17 = metadata !{i32 6, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5 (trunk 199750) (llvm/trunk 199751)\000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [first-only.c] [DW_LANG_C99]
+!1 = !{!"first-only.c", !"."}
+!2 = !{i32 0}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, void (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [first-only.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5 (trunk 199750) (llvm/trunk 199751)"}
+!10 = !MDLocation(line: 3, scope: !11)
+
+!11 = !{!"0xb\003\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [first-only.c]
+; CHECK: !11 = !{!"0xb\003\000\000", !1, !4}
+
+!12 = !MDLocation(line: 3, scope: !13)
+
+!13 = !{!"0xb\003\000\001", !1, !11} ; [ DW_TAG_lexical_block ] [first-only.c]
+; CHECK: !13 = !{!"0xb\001", !1, !14} ; [ DW_TAG_lexical_block ] [./first-only.c]
+
+!14 = !MDLocation(line: 4, scope: !13)
+; CHECK: !14 = !{!"0xb\003\000\001", !1, !11}
+
+!15 = !MDLocation(line: 5, scope: !13)
+; CHECK: !15 = !MDLocation(line: 4, scope: !14)
+
+!16 = !MDLocation(line: 6, scope: !4)
+; CHECK: !16 = !MDLocation(line: 5, scope: !14)
+; CHECK: !17 = !MDLocation(line: 6, scope: !4)
 
diff --git a/test/Transforms/AddDiscriminators/multiple.ll b/test/Transforms/AddDiscriminators/multiple.ll
index 8418c9e..5e552a8 100644
--- a/test/Transforms/AddDiscriminators/multiple.ll
+++ b/test/Transforms/AddDiscriminators/multiple.ll
@@ -51,21 +51,21 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 (trunk 199750) (llvm/trunk 199751)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [multiple.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"multiple.c", metadata !"."}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [multiple.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5 (trunk 199750) (llvm/trunk 199751)"}
-!10 = metadata !{i32 3, i32 0, metadata !11, null}
-!11 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [multiple.c]
-!12 = metadata !{i32 4, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5 (trunk 199750) (llvm/trunk 199751)\000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [multiple.c] [DW_LANG_C99]
+!1 = !{!"multiple.c", !"."}
+!2 = !{i32 0}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, void (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [multiple.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5 (trunk 199750) (llvm/trunk 199751)"}
+!10 = !MDLocation(line: 3, scope: !11)
+!11 = !{!"0xb\003\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [multiple.c]
+!12 = !MDLocation(line: 4, scope: !4)
 
-; CHECK: !12 = metadata !{i32 3, i32 0, metadata !13, null}
-; CHECK: !13 = metadata !{metadata !"0xb\001", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ] [./multiple.c]
-; CHECK: !14 = metadata !{i32 3, i32 0, metadata !15, null}
-; CHECK: !15 = metadata !{metadata !"0xb\002", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ] [./multiple.c]
+; CHECK: !12 = !MDLocation(line: 3, scope: !13)
+; CHECK: !13 = !{!"0xb\001", !1, !11} ; [ DW_TAG_lexical_block ] [./multiple.c]
+; CHECK: !14 = !MDLocation(line: 3, scope: !15)
+; CHECK: !15 = !{!"0xb\002", !1, !11} ; [ DW_TAG_lexical_block ] [./multiple.c]
diff --git a/test/Transforms/AddDiscriminators/no-discriminators.ll b/test/Transforms/AddDiscriminators/no-discriminators.ll
index 66a2c4e..dd7faf0 100644
--- a/test/Transforms/AddDiscriminators/no-discriminators.ll
+++ b/test/Transforms/AddDiscriminators/no-discriminators.ll
@@ -17,7 +17,7 @@ entry:
   %retval = alloca i32, align 4
   %i.addr = alloca i64, align 8
   store i64 %i, i64* %i.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64* %i.addr}, metadata !13, metadata !{}), !dbg !14
+  call void @llvm.dbg.declare(metadata i64* %i.addr, metadata !13, metadata !{}), !dbg !14
   %0 = load i64* %i.addr, align 8, !dbg !15
 ; CHECK:  %0 = load i64* %i.addr, align 8, !dbg !15
   %cmp = icmp slt i64 %0, 5, !dbg !15
@@ -48,24 +48,24 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./no-discriminators] [DW_LANG_C99]
-!1 = metadata !{metadata !"no-discriminators", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i64)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./no-discriminators]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !9}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-; CHECK: !10 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!12 = metadata !{metadata !"clang version 3.5.0 "}
-!13 = metadata !{metadata !"0x101\00i\0016777217\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [i] [line 1]
-!14 = metadata !{i32 1, i32 0, metadata !4, null}
-!15 = metadata !{i32 2, i32 0, metadata !16, null}
-; CHECK: !15 = metadata !{i32 2, i32 0, metadata !16, null}
-!16 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./no-discriminators]
-; CHECK: !16 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./no-discriminators]
-!17 = metadata !{i32 3, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [./no-discriminators] [DW_LANG_C99]
+!1 = !{!"no-discriminators", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 (i64)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [./no-discriminators]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !9}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!10 = !{i32 2, !"Dwarf Version", i32 2}
+; CHECK: !10 = !{i32 2, !"Dwarf Version", i32 2}
+!11 = !{i32 1, !"Debug Info Version", i32 2}
+!12 = !{!"clang version 3.5.0 "}
+!13 = !{!"0x101\00i\0016777217\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [i] [line 1]
+!14 = !MDLocation(line: 1, scope: !4)
+!15 = !MDLocation(line: 2, scope: !16)
+; CHECK: !15 = !MDLocation(line: 2, scope: !16)
+!16 = !{!"0xb\002\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [./no-discriminators]
+; CHECK: !16 = !{!"0xb\002\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [./no-discriminators]
+!17 = !MDLocation(line: 3, scope: !4)
diff --git a/test/Transforms/ArgumentPromotion/control-flow2.ll b/test/Transforms/ArgumentPromotion/control-flow2.ll
index 2543218..db63584 100644
--- a/test/Transforms/ArgumentPromotion/control-flow2.ll
+++ b/test/Transforms/ArgumentPromotion/control-flow2.ll
@@ -1,5 +1,6 @@
-; RUN: opt < %s -argpromotion -S | \
-; RUN:   grep "load i32\* %A"
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+
+; CHECK: load i32* %A
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
 define internal i32 @callee(i1 %C, i32* %P) {
diff --git a/test/Transforms/ArgumentPromotion/dbg.ll b/test/Transforms/ArgumentPromotion/dbg.ll
index d155750..65cf367 100644
--- a/test/Transforms/ArgumentPromotion/dbg.ll
+++ b/test/Transforms/ArgumentPromotion/dbg.ll
@@ -19,8 +19,8 @@ define void @caller(i32** %Y) {
 !llvm.module.flags = !{!0}
 !llvm.dbg.cu = !{!3}
 
-!0 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!1 = metadata !{i32 8, i32 0, metadata !2, null}
-!2 = metadata !{metadata !"0x2e\00test\00test\00\003\001\001\000\006\00256\000\003", null, null, null, null, void (i32**)* @test, null, null, null} ; [ DW_TAG_subprogram ]
-!3 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\002", null, null, null, metadata !4, null, null} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/pr20038/reduce/<stdin>] [DW_LANG_C_plus_plus]
-!4 = metadata !{metadata !2}
+!0 = !{i32 2, !"Debug Info Version", i32 2}
+!1 = !MDLocation(line: 8, scope: !2)
+!2 = !{!"0x2e\00test\00test\00\003\001\001\000\006\00256\000\003", null, null, null, null, void (i32**)* @test, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = !{!"0x11\004\00clang version 3.5.0 \000\00\000\00\002", null, null, null, !4, null, null} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/pr20038/reduce/<stdin>] [DW_LANG_C_plus_plus]
+!4 = !{!2}
diff --git a/test/Transforms/ArgumentPromotion/reserve-tbaa.ll b/test/Transforms/ArgumentPromotion/reserve-tbaa.ll
index 4688a83..db9d70d 100644
--- a/test/Transforms/ArgumentPromotion/reserve-tbaa.ll
+++ b/test/Transforms/ArgumentPromotion/reserve-tbaa.ll
@@ -37,16 +37,16 @@ entry:
   ret i32 0
 }
 
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"long", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
-!5 = metadata !{metadata !6, metadata !6, i64 0}
-!6 = metadata !{metadata !"int", metadata !3, i64 0}
-!7 = metadata !{metadata !3, metadata !3, i64 0}
-!8 = metadata !{metadata !9, metadata !9, i64 0}
-!9 = metadata !{metadata !"any pointer", metadata !3, i64 0}
-; CHECK: ![[I32]] = metadata !{metadata ![[I32_TYPE:[0-9]+]], metadata ![[I32_TYPE]], i64 0}
-; CHECK: ![[I32_TYPE]] = metadata !{metadata !"int", metadata !{{.*}}, i64 0}
-; CHECK: ![[LONG]] = metadata !{metadata ![[LONG_TYPE:[0-9]+]], metadata ![[LONG_TYPE]], i64 0}
-; CHECK: ![[LONG_TYPE]] = metadata !{metadata !"long", metadata !{{.*}}, i64 0}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"long", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"int", !3, i64 0}
+!7 = !{!3, !3, i64 0}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"any pointer", !3, i64 0}
+; CHECK: ![[I32]] = !{![[I32_TYPE:[0-9]+]], ![[I32_TYPE]], i64 0}
+; CHECK: ![[I32_TYPE]] = !{!"int", !{{.*}}, i64 0}
+; CHECK: ![[LONG]] = !{![[LONG_TYPE:[0-9]+]], ![[LONG_TYPE]], i64 0}
+; CHECK: ![[LONG_TYPE]] = !{!"long", !{{.*}}, i64 0}
diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll
index ed7be15..ca36170 100644
--- a/test/Transforms/BBVectorize/loop1.ll
+++ b/test/Transforms/BBVectorize/loop1.ll
@@ -83,7 +83,7 @@ for.body:                                         ; preds = %for.body, %entry
 ; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
 ; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>*
 ; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8
-; CHECK-UNRL: %indvars.iv.next.1 = add i64 %indvars.iv, 2
+; CHECK-UNRL: %indvars.iv.next.1 = add nsw i64 %indvars.iv, 2
 ; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32
 ; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10
 ; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body
diff --git a/test/Transforms/BBVectorize/metadata.ll b/test/Transforms/BBVectorize/metadata.ll
index ac7297d..874fbb8 100644
--- a/test/Transforms/BBVectorize/metadata.ll
+++ b/test/Transforms/BBVectorize/metadata.ll
@@ -41,9 +41,9 @@ entry:
 ; CHECK: ret void
 }
 
-!0 = metadata !{i64 0, i64 2}
-!1 = metadata !{i64 3, i64 5}
+!0 = !{i64 0, i64 2}
+!1 = !{i64 3, i64 5}
 
-!2 = metadata !{ float 5.0 }
-!3 = metadata !{ float 2.5 }
+!2 = !{ float 5.0 }
+!3 = !{ float 2.5 }
 
diff --git a/test/Transforms/BDCE/basic.ll b/test/Transforms/BDCE/basic.ll
new file mode 100644
index 0000000..6e748c6
--- /dev/null
+++ b/test/Transforms/BDCE/basic.ll
@@ -0,0 +1,348 @@
+; RUN: opt -S -bdce -instsimplify < %s | FileCheck %s
+; RUN: opt -S -instsimplify < %s | FileCheck %s -check-prefix=CHECK-IO
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define signext i32 @bar(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 4
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 8
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %shr = ashr i32 %or15, 4
+  ret i32 %shr
+
+; CHECK-LABEL: @bar
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+
+; Check that instsimplify is not doing this all on its own.
+; CHECK-IO-LABEL: @bar
+; CHECK-IO: tail call signext i32 @foo(i32 signext 5)
+; CHECK-IO: tail call signext i32 @foo(i32 signext 3)
+; CHECK-IO: tail call signext i32 @foo(i32 signext 2)
+; CHECK-IO: tail call signext i32 @foo(i32 signext 1)
+; CHECK-IO: tail call signext i32 @foo(i32 signext 0)
+; CHECK-IO: tail call signext i32 @foo(i32 signext 4)
+; CHECK-IO: ret i32
+}
+
+; Function Attrs: nounwind readnone
+declare signext i32 @foo(i32 signext) #0
+
+; Function Attrs: nounwind readnone
+define signext i32 @far(i32 signext %x) #1 {
+entry:
+  %call = tail call signext i32 @goo(i32 signext 5) #1
+  %and = and i32 %call, 4
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @goo(i32 signext 3) #1
+  %and2 = and i32 %call1, 8
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @goo(i32 signext 2) #1
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @goo(i32 signext 1) #1
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @goo(i32 signext 0) #1
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @goo(i32 signext 4) #1
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %shr = ashr i32 %or15, 4
+  ret i32 %shr
+
+; CHECK-LABEL: @far
+; Calls to foo(5) and foo(3) are still there, but their results are not used.
+; CHECK: tail call signext i32 @goo(i32 signext 5)
+; CHECK-NEXT: tail call signext i32 @goo(i32 signext 3)
+; CHECK-NEXT: tail call signext i32 @goo(i32 signext 2)
+; CHECK: tail call signext i32 @goo(i32 signext 1)
+; CHECK: tail call signext i32 @goo(i32 signext 0)
+; CHECK: tail call signext i32 @goo(i32 signext 4)
+; CHECK: ret i32
+
+; Check that instsimplify is not doing this all on its own.
+; CHECK-IO-LABEL: @far
+; CHECK-IO: tail call signext i32 @goo(i32 signext 5)
+; CHECK-IO: tail call signext i32 @goo(i32 signext 3)
+; CHECK-IO: tail call signext i32 @goo(i32 signext 2)
+; CHECK-IO: tail call signext i32 @goo(i32 signext 1)
+; CHECK-IO: tail call signext i32 @goo(i32 signext 0)
+; CHECK-IO: tail call signext i32 @goo(i32 signext 4)
+; CHECK-IO: ret i32
+}
+
+declare signext i32 @goo(i32 signext) #1
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar1(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %bs = tail call i32 @llvm.bswap.i32(i32 %or15) #0
+  %shr = ashr i32 %bs, 4
+  ret i32 %shr
+
+; CHECK-LABEL: @tar1
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.bswap.i32(i32) #0
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar2(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %shl = shl i32 %or15, 10
+  ret i32 %shl
+
+; CHECK-LABEL: @tar2
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar3(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %add = add i32 %or15, 5
+  %shl = shl i32 %add, 10
+  ret i32 %shl
+
+; CHECK-LABEL: @tar3
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar4(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %sub = sub i32 %or15, 5
+  %shl = shl i32 %sub, 10
+  ret i32 %shl
+
+; CHECK-LABEL: @tar4
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar5(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %xor = xor i32 %or15, 5
+  %shl = shl i32 %xor, 10
+  ret i32 %shl
+
+; CHECK-LABEL: @tar5
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar7(i32 signext %x, i1 %b) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %v = select i1 %b, i32 %or15, i32 5
+  %shl = shl i32 %v, 10
+  ret i32 %shl
+
+; CHECK-LABEL: @tar7
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+define signext i16 @tar8(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %tr = trunc i32 %or15 to i16
+  ret i16 %tr
+
+; CHECK-LABEL: @tar8
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i16
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
diff --git a/test/Transforms/BDCE/dce-pure.ll b/test/Transforms/BDCE/dce-pure.ll
new file mode 100644
index 0000000..6a432fc
--- /dev/null
+++ b/test/Transforms/BDCE/dce-pure.ll
@@ -0,0 +1,33 @@
+; RUN: opt -bdce -S < %s | FileCheck %s
+
+declare i32 @strlen(i8*) readonly nounwind
+
+define void @test1() {
+  call i32 @strlen( i8* null )
+  ret void
+
+; CHECK-LABEL: @test1
+; CHECK-NOT: call
+; CHECK: ret void
+}
+
+define i32 @test2() {
+  ; invoke of pure function should not be deleted!
+  invoke i32 @strlen( i8* null ) readnone
+                  to label %Cont unwind label %Other
+
+Cont:           ; preds = %0
+  ret i32 0
+
+Other:          ; preds = %0
+   %exn = landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0
+            cleanup
+  ret i32 1
+
+; CHECK-LABEL: @test2
+; CHECK: invoke
+; CHECK: ret i32 1
+}
+
+declare i32 @__gxx_personality_v0(...)
+
diff --git a/test/Transforms/BDCE/order.ll b/test/Transforms/BDCE/order.ll
new file mode 100644
index 0000000..301f447
--- /dev/null
+++ b/test/Transforms/BDCE/order.ll
@@ -0,0 +1,37 @@
+; RUN: opt -bdce -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @__gxx_personality_v0(...)
+
+define fastcc void @_ZN11__sanitizerL12TestRegistryEPNS_14ThreadRegistryEb() #0 {
+entry:
+  br i1 undef, label %if.else, label %entry.if.end_crit_edge
+
+if.else:
+  ret void
+
+invoke.cont70:
+  store i32 %call71, i32* undef, align 4
+  br label %if.else
+
+; CHECK-LABEL: @_ZN11__sanitizerL12TestRegistryEPNS_14ThreadRegistryEb
+; CHECK: store i32 %call71
+
+lpad65.loopexit.split-lp.loopexit.split-lp:
+  br label %if.else
+
+lpad65.loopexit.split-lp.loopexit.split-lp.loopexit:
+  %lpad.loopexit1121 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %lpad65.loopexit.split-lp.loopexit.split-lp
+
+entry.if.end_crit_edge:
+  %call71 = invoke i32 @_ZN11__sanitizer14ThreadRegistry12CreateThreadEmbjPv()
+          to label %invoke.cont70 unwind label %lpad65.loopexit.split-lp.loopexit.split-lp.loopexit
+}
+
+declare i32 @_ZN11__sanitizer14ThreadRegistry12CreateThreadEmbjPv()
+
+attributes #0 = { uwtable }
+
diff --git a/test/Transforms/CodeGenPrepare/statepoint-relocate.ll b/test/Transforms/CodeGenPrepare/statepoint-relocate.ll
new file mode 100644
index 0000000..cf411bc
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/statepoint-relocate.ll
@@ -0,0 +1,88 @@
+; RUN: opt -codegenprepare -S < %s | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare zeroext i1 @return_i1()
+
+define i32 @test_sor_basic(i32* %base) {
+; CHECK: getelementptr i32* %base, i32 15
+; CHECK: getelementptr i32* %base-new, i32 15
+entry:
+       %ptr = getelementptr i32* %base, i32 15
+       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32* %base, i32* %ptr)
+       %base-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 4)
+       %ptr-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 5)
+       %ret = load i32* %ptr-new
+       ret i32 %ret
+}
+
+define i32 @test_sor_two_derived(i32* %base) {
+; CHECK: getelementptr i32* %base, i32 15
+; CHECK: getelementptr i32* %base, i32 12
+; CHECK: getelementptr i32* %base-new, i32 15
+; CHECK: getelementptr i32* %base-new, i32 12
+entry:
+       %ptr = getelementptr i32* %base, i32 15
+       %ptr2 = getelementptr i32* %base, i32 12
+       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32* %base, i32* %ptr, i32* %ptr2)
+       %base-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 4)
+       %ptr-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 5)
+       %ptr2-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 6)
+       %ret = load i32* %ptr-new
+       ret i32 %ret
+}
+
+define i32 @test_sor_ooo(i32* %base) {
+; CHECK: getelementptr i32* %base, i32 15
+; CHECK: getelementptr i32* %base-new, i32 15
+entry:
+       %ptr = getelementptr i32* %base, i32 15
+       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32* %base, i32* %ptr)
+       %ptr-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 5)
+       %base-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 4)
+       %ret = load i32* %ptr-new
+       ret i32 %ret
+}
+
+define i32 @test_sor_gep_smallint([3 x i32]* %base) {
+; CHECK: getelementptr [3 x i32]* %base, i32 0, i32 2
+; CHECK: getelementptr [3 x i32]* %base-new, i32 0, i32 2
+entry:
+       %ptr = getelementptr [3 x i32]* %base, i32 0, i32 2
+       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, [3 x i32]* %base, i32* %ptr)
+       %base-new = call [3 x i32]* @llvm.experimental.gc.relocate.p0a3i32(i32 %tok, i32 4, i32 4)
+       %ptr-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 5)
+       %ret = load i32* %ptr-new
+       ret i32 %ret
+}
+
+define i32 @test_sor_gep_largeint([3 x i32]* %base) {
+; CHECK: getelementptr [3 x i32]* %base, i32 0, i32 21
+; CHECK-NOT: getelementptr [3 x i32]* %base-new, i32 0, i32 21
+entry:
+       %ptr = getelementptr [3 x i32]* %base, i32 0, i32 21
+       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, [3 x i32]* %base, i32* %ptr)
+       %base-new = call [3 x i32]* @llvm.experimental.gc.relocate.p0a3i32(i32 %tok, i32 4, i32 4)
+       %ptr-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 5)
+       %ret = load i32* %ptr-new
+       ret i32 %ret
+}
+
+define i32 @test_sor_noop(i32* %base) {
+; CHECK: getelementptr i32* %base, i32 15
+; CHECK: call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 5)
+; CHECK: call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 6)
+entry:
+       %ptr = getelementptr i32* %base, i32 15
+       %ptr2 = getelementptr i32* %base, i32 12
+       %tok = call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32* %base, i32* %ptr, i32* %ptr2)
+       %ptr-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 5)
+       %ptr2-new = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %tok, i32 4, i32 6)
+       %ret = load i32* %ptr-new
+       ret i32 %ret
+}
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()*, i32, i32, ...)
+declare i32* @llvm.experimental.gc.relocate.p0i32(i32, i32, i32)
+declare [3 x i32]* @llvm.experimental.gc.relocate.p0a3i32(i32, i32, i32)
diff --git a/test/Transforms/ConstProp/InsertElement.ll b/test/Transforms/ConstProp/InsertElement.ll
new file mode 100644
index 0000000..d249c2e
--- /dev/null
+++ b/test/Transforms/ConstProp/InsertElement.ll
@@ -0,0 +1,12 @@
+; RUN: opt < %s -constprop -S | FileCheck %s
+
+define i32 @test1() {
+  %A = bitcast i32 2139171423 to float
+  %B = insertelement <1 x float> undef, float %A, i32 0
+  %C = extractelement <1 x float> %B, i32 0
+  %D = bitcast float %C to i32
+  ret i32 %D
+; CHECK: @test1
+; CHECK: ret i32 2139171423
+}
+
diff --git a/test/Transforms/ConstProp/insertvalue.ll b/test/Transforms/ConstProp/insertvalue.ll
index 0d288b3..dce2b72 100644
--- a/test/Transforms/ConstProp/insertvalue.ll
+++ b/test/Transforms/ConstProp/insertvalue.ll
@@ -65,3 +65,12 @@ define [3 x %struct] @undef-test3() {
 ; CHECK: ret [3 x %struct] [%struct undef, %struct { i32 0, [4 x i8] undef }, %struct undef]
 }
 
+define i32 @test-float-Nan() {
+  %A = bitcast i32 2139171423 to float
+  %B = insertvalue [1 x float] undef, float %A, 0
+  %C = extractvalue [1 x float] %B, 0
+  %D = bitcast float %C to i32
+  ret i32 %D
+; CHECK: @test-float-Nan
+; CHECK: ret i32 2139171423
+}
diff --git a/test/Transforms/CorrelatedValuePropagation/icmp.ll b/test/Transforms/CorrelatedValuePropagation/icmp.ll
new file mode 100644
index 0000000..c2863ff
--- /dev/null
+++ b/test/Transforms/CorrelatedValuePropagation/icmp.ll
@@ -0,0 +1,63 @@
+; RUN: opt -correlated-propagation -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: noreturn
+declare void @check1(i1) #1
+
+; Function Attrs: noreturn
+declare void @check2(i1) #1
+
+; Make sure we propagate the value of %tmp35 to the true/false cases
+; CHECK-LABEL: @test1
+; CHECK: call void @check1(i1 false)
+; CHECK: call void @check2(i1 true)
+define void @test1(i64 %tmp35) {
+bb:
+  %tmp36 = icmp sgt i64 %tmp35, 0
+  br i1 %tmp36, label %bb_true, label %bb_false
+
+bb_true:
+  %tmp47 = icmp slt i64 %tmp35, 0
+  tail call void @check1(i1 %tmp47) #4
+  unreachable
+
+bb_false:
+  %tmp48 = icmp sle i64 %tmp35, 0
+  tail call void @check2(i1 %tmp48) #4
+  unreachable
+}
+
+; Function Attrs: noreturn
+; This is the same as test1 but with a diamond to ensure we
+; get %tmp36 from both true and false BBs.
+; CHECK-LABEL: @test2
+; CHECK: call void @check1(i1 false)
+; CHECK: call void @check2(i1 true)
+define void @test2(i64 %tmp35, i1 %inner_cmp) {
+bb:
+  %tmp36 = icmp sgt i64 %tmp35, 0
+  br i1 %tmp36, label %bb_true, label %bb_false
+
+bb_true:
+  br i1 %inner_cmp, label %inner_true, label %inner_false
+
+inner_true:
+  br label %merge
+
+inner_false:
+  br label %merge
+
+merge:
+  %tmp47 = icmp slt i64 %tmp35, 0
+  tail call void @check1(i1 %tmp47) #0
+  unreachable
+
+bb_false:
+  %tmp48 = icmp sle i64 %tmp35, 0
+  tail call void @check2(i1 %tmp48) #4
+  unreachable
+}
+
+attributes #4 = { noreturn }
diff --git a/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
index 077394f..dd283ae 100644
--- a/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
+++ b/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
@@ -4,10 +4,10 @@
 
 define i8* @vfs_addname(i8* %name, i32 %len, i32 %hash, i32 %flags) nounwind ssp {
 entry:
-  call void @llvm.dbg.value(metadata !{i8* %name}, i64 0, metadata !0, metadata !{})
-  call void @llvm.dbg.value(metadata !{i32 %len}, i64 0, metadata !10, metadata !{})
-  call void @llvm.dbg.value(metadata !{i32 %hash}, i64 0, metadata !11, metadata !{})
-  call void @llvm.dbg.value(metadata !{i32 %flags}, i64 0, metadata !12, metadata !{})
+  call void @llvm.dbg.value(metadata i8* %name, i64 0, metadata !0, metadata !{})
+  call void @llvm.dbg.value(metadata i32 %len, i64 0, metadata !10, metadata !{})
+  call void @llvm.dbg.value(metadata i32 %hash, i64 0, metadata !11, metadata !{})
+  call void @llvm.dbg.value(metadata i32 %flags, i64 0, metadata !12, metadata !{})
 ; CHECK:  call fastcc i8* @add_name_internal(i8* %name, i32 %hash) [[NUW:#[0-9]+]], !dbg !{{[0-9]+}}
   %0 = call fastcc i8* @add_name_internal(i8* %name, i32 %len, i32 %hash, i8 zeroext 0, i32 %flags) nounwind, !dbg !13 ; <i8*> [#uses=1]
   ret i8* %0, !dbg !13
@@ -17,11 +17,11 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define internal fastcc i8* @add_name_internal(i8* %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) noinline nounwind ssp {
 entry:
-  call void @llvm.dbg.value(metadata !{i8* %name}, i64 0, metadata !15, metadata !{})
-  call void @llvm.dbg.value(metadata !{i32 %len}, i64 0, metadata !20, metadata !{})
-  call void @llvm.dbg.value(metadata !{i32 %hash}, i64 0, metadata !21, metadata !{})
-  call void @llvm.dbg.value(metadata !{i8 %extra}, i64 0, metadata !22, metadata !{})
-  call void @llvm.dbg.value(metadata !{i32 %flags}, i64 0, metadata !23, metadata !{})
+  call void @llvm.dbg.value(metadata i8* %name, i64 0, metadata !15, metadata !{})
+  call void @llvm.dbg.value(metadata i32 %len, i64 0, metadata !20, metadata !{})
+  call void @llvm.dbg.value(metadata i32 %hash, i64 0, metadata !21, metadata !{})
+  call void @llvm.dbg.value(metadata i8 %extra, i64 0, metadata !22, metadata !{})
+  call void @llvm.dbg.value(metadata i32 %flags, i64 0, metadata !23, metadata !{})
   %0 = icmp eq i32 %hash, 0, !dbg !24             ; <i1> [#uses=1]
   br i1 %0, label %bb, label %bb1, !dbg !24
 
@@ -45,34 +45,34 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!30}
-!0 = metadata !{metadata !"0x101\00name\008\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00vfs_addname\00vfs_addname\00vfs_addname\0012\000\001\000\006\000\000\000", metadata !28, metadata !2, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", metadata !28, metadata !29, metadata !29, null, null, null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !28, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6, metadata !6, metadata !9, metadata !9, metadata !9}
-!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !28, metadata !2, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{metadata !"0x26\00\000\008\008\000\000", metadata !28, metadata !2, metadata !8} ; [ DW_TAG_const_type ]
-!8 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !28, metadata !2} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !28, metadata !2} ; [ DW_TAG_base_type ]
-!10 = metadata !{metadata !"0x101\00len\009\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{metadata !"0x101\00hash\0010\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{metadata !"0x101\00flags\0011\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 13, i32 0, metadata !14, null}
-!14 = metadata !{metadata !"0xb\0012\000\000", metadata !28, metadata !1} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{metadata !"0x101\00name\0017\000", metadata !16, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{metadata !"0x2e\00add_name_internal\00add_name_internal\00add_name_internal\0022\001\001\000\006\000\000\000", metadata !28, metadata !2, metadata !17, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !28, metadata !2, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{metadata !6, metadata !6, metadata !9, metadata !9, metadata !19, metadata !9}
-!19 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", metadata !28, metadata !2} ; [ DW_TAG_base_type ]
-!20 = metadata !{metadata !"0x101\00len\0018\000", metadata !16, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!21 = metadata !{metadata !"0x101\00hash\0019\000", metadata !16, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{metadata !"0x101\00extra\0020\000", metadata !16, metadata !2, metadata !19} ; [ DW_TAG_arg_variable ]
-!23 = metadata !{metadata !"0x101\00flags\0021\000", metadata !16, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
-!24 = metadata !{i32 23, i32 0, metadata !25, null}
-!25 = metadata !{metadata !"0xb\0022\000\000", metadata !28, metadata !16} ; [ DW_TAG_lexical_block ]
-!26 = metadata !{i32 24, i32 0, metadata !25, null}
-!27 = metadata !{i32 26, i32 0, metadata !25, null}
-!28 = metadata !{metadata !"tail.c", metadata !"/Users/echeng/LLVM/radars/r7927803/"}
-!29 = metadata !{i32 0}
-!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00name\008\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00vfs_addname\00vfs_addname\00vfs_addname\0012\000\001\000\006\000\000\000", !28, !2, !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !28} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", !28, !29, !29, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !28, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6, !6, !9, !9, !9}
+!6 = !{!"0xf\00\000\0064\0064\000\000", !28, !2, !7} ; [ DW_TAG_pointer_type ]
+!7 = !{!"0x26\00\000\008\008\000\000", !28, !2, !8} ; [ DW_TAG_const_type ]
+!8 = !{!"0x24\00char\000\008\008\000\000\006", !28, !2} ; [ DW_TAG_base_type ]
+!9 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", !28, !2} ; [ DW_TAG_base_type ]
+!10 = !{!"0x101\00len\009\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!11 = !{!"0x101\00hash\0010\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!12 = !{!"0x101\00flags\0011\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!13 = !MDLocation(line: 13, scope: !14)
+!14 = !{!"0xb\0012\000\000", !28, !1} ; [ DW_TAG_lexical_block ]
+!15 = !{!"0x101\00name\0017\000", !16, !2, !6} ; [ DW_TAG_arg_variable ]
+!16 = !{!"0x2e\00add_name_internal\00add_name_internal\00add_name_internal\0022\001\001\000\006\000\000\000", !28, !2, !17, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = !{!"0x15\00\000\000\000\000\000\000", !28, !2, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{!6, !6, !9, !9, !19, !9}
+!19 = !{!"0x24\00unsigned char\000\008\008\000\000\008", !28, !2} ; [ DW_TAG_base_type ]
+!20 = !{!"0x101\00len\0018\000", !16, !2, !9} ; [ DW_TAG_arg_variable ]
+!21 = !{!"0x101\00hash\0019\000", !16, !2, !9} ; [ DW_TAG_arg_variable ]
+!22 = !{!"0x101\00extra\0020\000", !16, !2, !19} ; [ DW_TAG_arg_variable ]
+!23 = !{!"0x101\00flags\0021\000", !16, !2, !9} ; [ DW_TAG_arg_variable ]
+!24 = !MDLocation(line: 23, scope: !25)
+!25 = !{!"0xb\0022\000\000", !28, !16} ; [ DW_TAG_lexical_block ]
+!26 = !MDLocation(line: 24, scope: !25)
+!27 = !MDLocation(line: 26, scope: !25)
+!28 = !{!"tail.c", !"/Users/echeng/LLVM/radars/r7927803/"}
+!29 = !{i32 0}
+!30 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/DeadArgElim/aggregates.ll b/test/Transforms/DeadArgElim/aggregates.ll
new file mode 100644
index 0000000..f54c6c9
--- /dev/null
+++ b/test/Transforms/DeadArgElim/aggregates.ll
@@ -0,0 +1,162 @@
+; RUN: opt -S -deadargelim %s | FileCheck %s
+
+; Case 0: the basic example: an entire aggregate use is returned, but it's
+; actually only used in ways we can eliminate. We gain benefit from analysing
+; the "use" and applying its results to all sub-values.
+
+; CHECK-LABEL: define internal void @agguse_dead()
+
+define internal { i32, i32 } @agguse_dead() {
+  ret { i32, i32 } { i32 0, i32 1 }
+}
+
+define internal { i32, i32 } @test_agguse_dead() {
+  %val = call { i32, i32 } @agguse_dead()
+  ret { i32, i32 } %val
+}
+
+
+
+; Case 1: an opaque use of the aggregate exists (in this case dead). Otherwise
+; only one value is used, so function can be simplified.
+
+; CHECK-LABEL: define internal i32 @rets_independent_if_agguse_dead()
+; CHECK: [[RET:%.*]] = extractvalue { i32, i32 } { i32 0, i32 1 }, 1
+; CHECK: ret i32 [[RET]]
+
+define internal { i32, i32 } @rets_independent_if_agguse_dead() {
+  ret { i32, i32 } { i32 0, i32 1 }
+}
+
+define internal { i32, i32 } @test_rets_independent_if_agguse_dead(i1 %tst) {
+  %val = call { i32, i32 } @rets_independent_if_agguse_dead()
+  br i1 %tst, label %use_1, label %use_aggregate
+
+use_1:
+  ; This use can be classified as applying only to ret 1.
+  %val0 = extractvalue { i32, i32 } %val, 1
+  call void @callee(i32 %val0)
+  ret { i32, i32 } undef
+
+use_aggregate:
+  ; This use is assumed to apply to both 0 and 1.
+  ret { i32, i32 } %val
+}
+
+; Case 2: an opaque use of the aggregate exists (in this case *live*). Other
+; uses shouldn't matter.
+
+; CHECK-LABEL: define internal { i32, i32 } @rets_live_agguse()
+; CHECK: ret { i32, i32 } { i32 0, i32 1 }
+
+define internal { i32, i32 } @rets_live_agguse() {
+  ret { i32, i32} { i32 0, i32 1 }
+}
+
+define { i32, i32 } @test_rets_live_aggues(i1 %tst) {
+  %val = call { i32, i32 } @rets_live_agguse()
+  br i1 %tst, label %use_1, label %use_aggregate
+
+use_1:
+  ; This use can be classified as applying only to ret 1.
+  %val0 = extractvalue { i32, i32 } %val, 1
+  call void @callee(i32 %val0)
+  ret { i32, i32 } undef
+
+use_aggregate:
+  ; This use is assumed to apply to both 0 and 1.
+  ret { i32, i32 } %val
+}
+
+declare void @callee(i32)
+
+; Case 3: the insertvalue meant %in was live if ret-slot-1 was, but we were only
+; tracking multiple ret-slots for struct types. So %in was eliminated
+; incorrectly.
+
+; CHECK-LABEL: define internal [2 x i32] @array_rets_have_multiple_slots(i32 %in)
+
+define internal [2 x i32] @array_rets_have_multiple_slots(i32 %in) {
+  %ret = insertvalue [2 x i32] undef, i32 %in, 1
+  ret [2 x i32] %ret
+}
+
+define [2 x i32] @test_array_rets_have_multiple_slots() {
+  %res = call [2 x i32] @array_rets_have_multiple_slots(i32 42)
+  ret [2 x i32] %res
+}
+
+; Case 4: we can remove some retvals from the array. It's nice to produce an
+; array again having done so (rather than converting it to a struct).
+
+; CHECK-LABEL: define internal [2 x i32] @can_shrink_arrays()
+; CHECK: [[VAL0:%.*]] = extractvalue [3 x i32] [i32 42, i32 43, i32 44], 0
+; CHECK: [[RESTMP:%.*]] = insertvalue [2 x i32] undef, i32 [[VAL0]], 0
+; CHECK: [[VAL2:%.*]] = extractvalue [3 x i32] [i32 42, i32 43, i32 44], 2
+; CHECK: [[RES:%.*]] = insertvalue [2 x i32] [[RESTMP]], i32 [[VAL2]], 1
+; CHECK: ret [2 x i32] [[RES]]
+
+; CHECK-LABEL: define void @test_can_shrink_arrays()
+
+define internal [3 x i32] @can_shrink_arrays() {
+  ret [3 x i32] [i32 42, i32 43, i32 44]
+}
+
+define void @test_can_shrink_arrays() {
+  %res = call [3 x i32] @can_shrink_arrays()
+
+  %res.0 = extractvalue [3 x i32] %res, 0
+  call void @callee(i32 %res.0)
+
+  %res.2 = extractvalue [3 x i32] %res, 2
+  call void @callee(i32 %res.2)
+
+  ret void
+}
+
+; Case 5: %in gets passed directly to the return. It should mark be marked as
+; used if *any* of the return values are, not just if value 0 is.
+
+; CHECK-LABEL: define internal i32 @ret_applies_to_all({ i32, i32 } %in)
+; CHECK: [[RET:%.*]] = extractvalue { i32, i32 } %in, 1
+; CHECK: ret i32 [[RET]]
+
+define internal {i32, i32} @ret_applies_to_all({i32, i32} %in) {
+  ret {i32, i32} %in
+}
+
+define i32 @test_ret_applies_to_all() {
+  %val = call {i32, i32} @ret_applies_to_all({i32, i32} {i32 42, i32 43})
+  %ret = extractvalue {i32, i32} %val, 1
+  ret i32 %ret
+}
+
+; Case 6: When considering @mid, the return instruciton has sub-value 0
+; unconditionally live, but 1 only conditionally live. Since at that level we're
+; applying the results to the whole of %res, this means %res is live and cannot
+; be reduced. There is scope for further optimisation here (though not visible
+; in this test-case).
+
+; CHECK-LABEL: define internal { i8*, i32 } @inner()
+
+define internal {i8*, i32} @mid() {
+  %res = call {i8*, i32} @inner()
+  %intval = extractvalue {i8*, i32} %res, 1
+  %tst = icmp eq i32 %intval, 42
+  br i1 %tst, label %true, label %true
+
+true:
+  ret {i8*, i32} %res
+}
+
+define internal {i8*, i32} @inner() {
+  ret {i8*, i32} {i8* null, i32 42}
+}
+
+define internal i8 @outer() {
+  %res = call {i8*, i32} @mid()
+  %resptr = extractvalue {i8*, i32} %res, 0
+
+  %val = load i8* %resptr
+  ret i8 %val
+}
+\ No newline at end of file
diff --git a/test/Transforms/DeadArgElim/dbginfo.ll b/test/Transforms/DeadArgElim/dbginfo.ll
index b457f01..5bbf821 100644
--- a/test/Transforms/DeadArgElim/dbginfo.ll
+++ b/test/Transforms/DeadArgElim/dbginfo.ll
@@ -29,7 +29,7 @@ entry:
 ; Function Attrs: nounwind uwtable
 define internal void @_ZL2f1iz(i32, ...) #1 {
 entry:
-  call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !17, metadata !18), !dbg !19
+  call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !17, metadata !18), !dbg !19
   ret void, !dbg !20
 }
 
@@ -47,24 +47,24 @@ attributes #2 = { nounwind readnone }
 !llvm.module.flags = !{!12, !13}
 !llvm.ident = !{!14}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"dbg.cpp", metadata !"/tmp/dbginfo"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{metadata !"0x2e\00f2\00f2\00_Z2f2v\004\000\001\000\000\00256\000\004", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f2v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [f2]
-!5 = metadata !{metadata !"0x29", metadata !1}    ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{metadata !"0x2e\00f1\00f1\00_ZL2f1iz\001\001\001\000\000\00256\000\001", metadata !1, metadata !5, metadata !9, null, void (i32, ...)* @_ZL2f1iz, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [local] [def] [f1]
-!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{null, metadata !11, null}
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!13 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!14 = metadata !{metadata !"clang version 3.6.0 "}
-!15 = metadata !{i32 5, i32 3, metadata !4, null}
-!16 = metadata !{i32 6, i32 1, metadata !4, null}
-!17 = metadata !{metadata !"0x101\00\0016777217\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [line 1]
-!18 = metadata !{metadata !"0x102"}               ; [ DW_TAG_expression ]
-!19 = metadata !{i32 1, i32 19, metadata !8, null}
-!20 = metadata !{i32 2, i32 1, metadata !8, null}
+!0 = !{!"0x11\004\00clang version 3.6.0 \000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"dbg.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !8}
+!4 = !{!"0x2e\00f2\00f2\00_Z2f2v\004\000\001\000\000\00256\000\004", !1, !5, !6, null, void ()* @_Z2f2v, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [f2]
+!5 = !{!"0x29", !1}    ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!"0x2e\00f1\00f1\00_ZL2f1iz\001\001\001\000\000\00256\000\001", !1, !5, !9, null, void (i32, ...)* @_ZL2f1iz, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [local] [def] [f1]
+!9 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{null, !11, null}
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{i32 2, !"Dwarf Version", i32 4}
+!13 = !{i32 2, !"Debug Info Version", i32 2}
+!14 = !{!"clang version 3.6.0 "}
+!15 = !MDLocation(line: 5, column: 3, scope: !4)
+!16 = !MDLocation(line: 6, column: 1, scope: !4)
+!17 = !{!"0x101\00\0016777217\000", !8, !5, !11} ; [ DW_TAG_arg_variable ] [line 1]
+!18 = !{!"0x102"}               ; [ DW_TAG_expression ]
+!19 = !MDLocation(line: 1, column: 19, scope: !8)
+!20 = !MDLocation(line: 2, column: 1, scope: !8)
diff --git a/test/Transforms/DeadStoreElimination/2011-03-25-DSEMiscompile.ll b/test/Transforms/DeadStoreElimination/2011-03-25-DSEMiscompile.ll
index 079eec4..39d5358 100644
--- a/test/Transforms/DeadStoreElimination/2011-03-25-DSEMiscompile.ll
+++ b/test/Transforms/DeadStoreElimination/2011-03-25-DSEMiscompile.ll
@@ -5,9 +5,9 @@ target triple = "i386-apple-darwin9.8"
 
 @A = external global [0 x i32]
 
-declare cc10 void @Func2(i32*, i32*, i32*, i32)
+declare ghccc void @Func2(i32*, i32*, i32*, i32)
 
-define cc10 void @Func1(i32* noalias %Arg1, i32* noalias %Arg2, i32* %Arg3, i32 %Arg4) {
+define ghccc void @Func1(i32* noalias %Arg1, i32* noalias %Arg2, i32* %Arg3, i32 %Arg4) {
 entry:
   store i32 add (i32 ptrtoint ([0 x i32]* @A to i32), i32 1), i32* %Arg2
 ; CHECK: store i32 add (i32 ptrtoint ([0 x i32]* @A to i32), i32 1), i32* %Arg2
@@ -18,6 +18,6 @@ entry:
   %ln2gE = bitcast i32* %ln2gD to double*
   store double %ln2gB, double* %ln2gE
 ; CHECK: store double %ln2gB, double* %ln2gE
-  tail call cc10 void @Func2(i32* %Arg1, i32* %Arg2, i32* %Arg3, i32 %Arg4) nounwind
+  tail call ghccc void @Func2(i32* %Arg1, i32* %Arg2, i32* %Arg3, i32 %Arg4) nounwind
   ret void
 }
diff --git a/test/Transforms/DeadStoreElimination/inst-limits.ll b/test/Transforms/DeadStoreElimination/inst-limits.ll
index 3d78bb5..3ef5607 100644
--- a/test/Transforms/DeadStoreElimination/inst-limits.ll
+++ b/test/Transforms/DeadStoreElimination/inst-limits.ll
@@ -118,7 +118,7 @@ entry:
 
   ; Insert a meaningless dbg.value intrinsic; it should have no
   ; effect on the working of DSE in any way.
-  call void @llvm.dbg.value(metadata !12, i64 0, metadata !10, metadata !{})
+  call void @llvm.dbg.value(metadata i32* undef, i64 0, metadata !10, metadata !{})
 
   ; CHECK:  store i32 -1, i32* @x, align 4
   store i32 -1, i32* @x, align 4
@@ -245,18 +245,18 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!11, !13}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ] [/home/tmp/test.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test.c", metadata !"/home/tmp"}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00test_within_limit\00test_within_limit\00\003\000\001\000\006\00256\000\004", metadata !1, metadata !5, metadata !6, null, i32 ()* @test_within_limit, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [test]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/tmp/test.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !10}
-!10 = metadata !{metadata !"0x34\00x\00x\00\001\000\001", null, metadata !5, metadata !8, i32* @x, null} ; [ DW_TAG_variable ] [x] [line 1] [def]
-!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!12 = metadata !{i32* undef}
+!0 = !{!"0x11\004\00clang version 3.4\001\00\000\00\000", !1, !2, !2, !3, !9, !2} ; [ DW_TAG_compile_unit ] [/home/tmp/test.c] [DW_LANG_C99]
+!1 = !{!"test.c", !"/home/tmp"}
+!2 = !{i32 0}
+!3 = !{!4}
+!4 = !{!"0x2e\00test_within_limit\00test_within_limit\00\003\000\001\000\006\00256\000\004", !1, !5, !6, null, i32 ()* @test_within_limit, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [test]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/home/tmp/test.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!10}
+!10 = !{!"0x34\00x\00x\00\001\000\001", null, !5, !8, i32* @x, null} ; [ DW_TAG_variable ] [x] [line 1] [def]
+!11 = !{i32 2, !"Dwarf Version", i32 4}
+!12 = !{i32* undef}
 
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!13 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/DebugIR/crash.ll b/test/Transforms/DebugIR/crash.ll
deleted file mode 100644
index f4a88d7..0000000
--- a/test/Transforms/DebugIR/crash.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; ModuleID = 'crash.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-linux-gnu"
-
-@.str = private unnamed_addr constant [18 x i8] c"Hello, segfault!\0A\00", align 1
-@.str1 = private unnamed_addr constant [14 x i8] c"Now crash %d\0A\00", align 1
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** %argv) #0 {
-  %1 = alloca i32, align 4                                        ;CHECK: !dbg
-  %2 = alloca i32, align 4                                        ;CHECK-NEXT: !dbg
-  %3 = alloca i8**, align 8                                       ;CHECK-NEXT: !dbg
-  %null_ptr = alloca i32*, align 8                                ;CHECK-NEXT: !dbg
-  store i32 0, i32* %1                                            ;CHECK-NEXT: !dbg
-  store i32 %argc, i32* %2, align 4                               ;CHECK-NEXT: !dbg
-  store i8** %argv, i8*** %3, align 8                             ;CHECK-NEXT: !dbg
-  store i32* null, i32** %null_ptr, align 8                       ;CHECK-NEXT: !dbg
-  %4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8]* @.str, i32 0, i32 0)) ;CHECK-NEXT: !dbg
-  %5 = load i32** %null_ptr, align 8                              ;CHECK-NEXT: !dbg
-  %6 = load i32* %5, align 4                                      ;CHECK-NEXT: !dbg
-  %7 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([14 x i8]* @.str1, i32 0, i32 0), i32 %6) ;CHECK-NEXT: !dbg
-  %8 = load i32* %2, align 4                                      ;CHECK-NEXT: !dbg
-  ret i32 %8                                                      ;CHECK-NEXT: !dbg
-}
-
-declare i32 @printf(i8*, ...) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-; CHECK: = metadata !{i32 14,
-; CHECK-NEXT: = metadata !{i32 15,
-; CHECK-NEXT: = metadata !{i32 16,
-; CHECK-NEXT: = metadata !{i32 17,
-; CHECK-NEXT: = metadata !{i32 18,
-; CHECK-NEXT: = metadata !{i32 19,
-; CHECK-NEXT: = metadata !{i32 20,
-; CHECK-NEXT: = metadata !{i32 21,
-; CHECK-NEXT: = metadata !{i32 22,
-; CHECK-NEXT: = metadata !{i32 23,
-
-; RUN: opt %s -debug-ir -S | FileCheck %s
diff --git a/test/Transforms/DebugIR/exception.ll b/test/Transforms/DebugIR/exception.ll
deleted file mode 100644
index 2436d38..0000000
--- a/test/Transforms/DebugIR/exception.ll
+++ /dev/null
@@ -1,127 +0,0 @@
-; ModuleID = 'exception.cpp'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-linux-gnu"
-
-@_ZTIi = external constant i8*
-
-; Function Attrs: uwtable
-define i32 @main(i32 %argc, i8** %argv) #0 {
-  %1 = alloca i32, align 4                        ; CHECK: !dbg
-  %2 = alloca i32, align 4                        ; CHECK-NEXT: !dbg
-  %3 = alloca i8**, align 8                       ; CHECK-NEXT: !dbg
-  %4 = alloca i8*                                 ; CHECK-NEXT: !dbg
-  %5 = alloca i32                                 ; CHECK-NEXT: !dbg
-  %e = alloca i32, align 4                        ; CHECK-NEXT: !dbg
-  %6 = alloca i32                                 ; CHECK-NEXT: !dbg
-  store i32 0, i32* %1                            ; CHECK-NEXT: !dbg
-  store i32 %argc, i32* %2, align 4               ; CHECK-NEXT: !dbg
-  store i8** %argv, i8*** %3, align 8             ; CHECK-NEXT: !dbg
-  %7 = call i8* @__cxa_allocate_exception(i64 4) #2 ; CHECK-NEXT: !dbg
-  %8 = bitcast i8* %7 to i32*                     ; CHECK-NEXT: !dbg
-  %9 = load i32* %2, align 4                      ; CHECK-NEXT: !dbg
-  store i32 %9, i32* %8                           ; CHECK-NEXT: !dbg
-  invoke void @__cxa_throw(i8* %7, i8* bitcast (i8** @_ZTIi to i8*), i8* null) #3
-          to label %31 unwind label %10           ; CHECK: !dbg
-
-; <label>:10                                      ; preds = %0
-  %11 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
-          catch i8* bitcast (i8** @_ZTIi to i8*)  ; CHECK: !dbg
-  %12 = extractvalue { i8*, i32 } %11, 0          ; CHECK-NEXT: !dbg
-  store i8* %12, i8** %4                          ; CHECK-NEXT: !dbg
-  %13 = extractvalue { i8*, i32 } %11, 1          ; CHECK-NEXT: !dbg
-  store i32 %13, i32* %5                          ; CHECK-NEXT: !dbg
-  br label %14                                    ; CHECK-NEXT: !dbg
-
-; <label>:14                                      ; preds = %10
-  %15 = load i32* %5                              ; CHECK: !dbg
-  %16 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) #2   ; CHECK-NEXT: !dbg
-  %17 = icmp eq i32 %15, %16                      ; CHECK-NEXT: !dbg
-  br i1 %17, label %18, label %26                 ; CHECK-NEXT: !dbg
-
-; <label>:18                                      ; preds = %14
-  %19 = load i8** %4                              ; CHECK: !dbg
-  %20 = call i8* @__cxa_begin_catch(i8* %19) #2   ; CHECK-NEXT: !dbg
-  %21 = bitcast i8* %20 to i32*                   ; CHECK-NEXT: !dbg
-  %22 = load i32* %21, align 4                    ; CHECK-NEXT: !dbg
-  store i32 %22, i32* %e, align 4                 ; CHECK-NEXT: !dbg
-  %23 = load i32* %e, align 4                     ; CHECK-NEXT: !dbg
-  store i32 %23, i32* %1                          ; CHECK-NEXT: !dbg
-  store i32 1, i32* %6                            ; CHECK-NEXT: !dbg
-  call void @__cxa_end_catch() #2                 ; CHECK-NEXT: !dbg
-  br label %24                                    ; CHECK-NEXT: !dbg
-
-; <label>:24                                      ; preds = %18
-  %25 = load i32* %1                              ; CHECK: !dbg
-  ret i32 %25                                     ; CHECK-NEXT: !dbg
-
-; <label>:26                                      ; preds = %14
-  %27 = load i8** %4                              ; CHECK: !dbg
-  %28 = load i32* %5                              ; CHECK-NEXT: !dbg
-  %29 = insertvalue { i8*, i32 } undef, i8* %27, 0 ; CHECK-NEXT: !dbg
-  %30 = insertvalue { i8*, i32 } %29, i32 %28, 1   ; CHECK-NEXT: !dbg
-  resume { i8*, i32 } %30                         ; CHECK-NEXT: !dbg
-
-; <label>:31                                      ; preds = %0
-  unreachable                                     ; CHECK: !dbg
-}
-
-declare i8* @__cxa_allocate_exception(i64)
-
-declare void @__cxa_throw(i8*, i8*, i8*)
-
-declare i32 @__gxx_personality_v0(...)
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.eh.typeid.for(i8*) #1
-
-declare i8* @__cxa_begin_catch(i8*)
-
-declare void @__cxa_end_catch()
-
-attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind }
-attributes #3 = { noreturn }
-; CHECK: = metadata !{i32 16,
-; CHECK-NEXT: = metadata !{i32 17,
-; CHECK-NEXT: = metadata !{i32 18,
-; CHECK-NEXT: = metadata !{i32 19,
-; CHECK-NEXT: = metadata !{i32 20,
-; CHECK-NEXT: = metadata !{i32 21,
-; CHECK-NEXT: = metadata !{i32 22,
-; CHECK-NEXT: = metadata !{i32 24,
-
-; CHECK-NEXT: = metadata !{i32 28,
-; CHECK-NEXT: = metadata !{i32 29,
-; CHECK-NEXT: = metadata !{i32 30,
-; CHECK-NEXT: = metadata !{i32 31,
-; CHECK-NEXT: = metadata !{i32 32,
-; CHECK-NEXT: = metadata !{i32 33,
-
-; CHECK-NEXT: = metadata !{i32 36,
-; CHECK-NEXT: = metadata !{i32 37,
-; CHECK-NEXT: = metadata !{i32 38,
-; CHECK-NEXT: = metadata !{i32 39,
-
-; CHECK-NEXT: = metadata !{i32 42,
-; CHECK-NEXT: = metadata !{i32 43,
-; CHECK-NEXT: = metadata !{i32 44,
-; CHECK-NEXT: = metadata !{i32 45,
-; CHECK-NEXT: = metadata !{i32 46,
-; CHECK-NEXT: = metadata !{i32 47,
-; CHECK-NEXT: = metadata !{i32 48,
-; CHECK-NEXT: = metadata !{i32 49,
-; CHECK-NEXT: = metadata !{i32 50,
-; CHECK-NEXT: = metadata !{i32 51,
-
-; CHECK-NEXT: = metadata !{i32 54,
-; CHECK-NEXT: = metadata !{i32 55,
-
-; CHECK-NEXT: = metadata !{i32 58,
-; CHECK-NEXT: = metadata !{i32 59,
-; CHECK-NEXT: = metadata !{i32 60,
-; CHECK-NEXT: = metadata !{i32 61,
-; CHECK-NEXT: = metadata !{i32 62,
-; CHECK-NEXT: = metadata !{i32 65,
-
-; RUN: opt %s -debug-ir -S | FileCheck %s
diff --git a/test/Transforms/DebugIR/function.ll b/test/Transforms/DebugIR/function.ll
deleted file mode 100644
index dba073d..0000000
--- a/test/Transforms/DebugIR/function.ll
+++ /dev/null
@@ -1,51 +0,0 @@
-; ModuleID = 'function.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-linux-gnu"
-
-; Function Attrs: nounwind uwtable
-define void @blah(i32* %i) #0 {
-  %1 = alloca i32*, align 8                   ; CHECK: !dbg
-  store i32* %i, i32** %1, align 8            ; CHECK-NEXT: !dbg
-  %2 = load i32** %1, align 8                 ; CHECK-NEXT: !dbg
-  %3 = load i32* %2, align 4                  ; CHECK-NEXT: !dbg
-  %4 = add nsw i32 %3, 1                      ; CHECK-NEXT: !dbg
-  store i32 %4, i32* %2, align 4              ; CHECK-NEXT: !dbg
-  ret void                                    ; CHECK-NEXT: !dbg
-}
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** %argv) #0 {
-  %1 = alloca i32, align 4                    ; CHECK: !dbg
-  %2 = alloca i32, align 4                    ; CHECK-NEXT: !dbg
-  %3 = alloca i8**, align 8                   ; CHECK-NEXT: !dbg
-  %i = alloca i32, align 4                    ; CHECK-NEXT: !dbg
-  store i32 0, i32* %1                        ; CHECK-NEXT: !dbg
-  store i32 %argc, i32* %2, align 4           ; CHECK-NEXT: !dbg
-  store i8** %argv, i8*** %3, align 8         ; CHECK-NEXT: !dbg
-  store i32 7, i32* %i, align 4               ; CHECK-NEXT: !dbg
-  call void @blah(i32* %i)                    ; CHECK-NEXT: !dbg
-  %4 = load i32* %i, align 4                  ; CHECK-NEXT: !dbg
-  ret i32 %4                                  ; CHECK-NEXT: !dbg
-}
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-; CHECK: = metadata !{i32 8,
-; CHECK-NEXT: = metadata !{i32 9,
-; CHECK-NEXT: = metadata !{i32 10,
-; CHECK-NEXT: = metadata !{i32 11,
-; CHECK-NEXT: = metadata !{i32 12,
-; CHECK-NEXT: = metadata !{i32 13,
-
-; CHECK-NEXT: = metadata !{i32 18,
-; CHECK-NEXT: = metadata !{i32 19,
-; CHECK-NEXT: = metadata !{i32 20,
-; CHECK-NEXT: = metadata !{i32 21,
-; CHECK-NEXT: = metadata !{i32 22,
-; CHECK-NEXT: = metadata !{i32 23,
-; CHECK-NEXT: = metadata !{i32 24,
-; CHECK-NEXT: = metadata !{i32 25,
-; CHECK-NEXT: = metadata !{i32 26,
-; CHECK-NEXT: = metadata !{i32 27,
-; CHECK-NEXT: = metadata !{i32 28,
-
-; RUN: opt %s -debug-ir -S | FileCheck %s
diff --git a/test/Transforms/DebugIR/simple-addrspace.ll b/test/Transforms/DebugIR/simple-addrspace.ll
deleted file mode 100644
index 6539c8a..0000000
--- a/test/Transforms/DebugIR/simple-addrspace.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: opt -debug-ir -S %s -o - | FileCheck %s
-
-target datalayout = "e-p:64:64:64-p1:16:16:16"
-
-define void @foo(i32 addrspace(1)*) nounwind {
-  ret void
-}
-
-; Make sure the pointer size is 16
-
-; CHECK: metadata !"0xf\00i32 addrspace(1)*\000\0016\002\000\000"
diff --git a/test/Transforms/DebugIR/simple.ll b/test/Transforms/DebugIR/simple.ll
deleted file mode 100644
index 3b18895..0000000
--- a/test/Transforms/DebugIR/simple.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; ModuleID = 'simple.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-linux-gnu"
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** %argv) #0 {
-  %1 = alloca i32, align 4                  ; CHECK: !dbg
-  %2 = alloca i32, align 4                  ; CHECK-NEXT: !dbg
-  %3 = alloca i8**, align 8                 ; CHECK-NEXT: !dbg
-  store i32 0, i32* %1                      ; CHECK-NEXT: !dbg
-  store i32 %argc, i32* %2, align 4         ; CHECK-NEXT: !dbg
-  store i8** %argv, i8*** %3, align 8       ; CHECK-NEXT: !dbg
-  %4 = load i32* %2, align 4                ; CHECK-NEXT: !dbg
-  ret i32 %4                                ; CHECK-NEXT: !dbg
-}
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-; CHECK: = metadata !{i32 10,
-; CHECK-NEXT: = metadata !{i32 11,
-; CHECK-NEXT: = metadata !{i32 12,
-; CHECK-NEXT: = metadata !{i32 13,
-; CHECK-NEXT: = metadata !{i32 14,
-
-; RUN: opt %s -debug-ir -S | FileCheck %s
diff --git a/test/Transforms/DebugIR/struct.ll b/test/Transforms/DebugIR/struct.ll
deleted file mode 100644
index 8db3dbe..0000000
--- a/test/Transforms/DebugIR/struct.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; ModuleID = 'struct.cpp'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-linux-gnu"
-
-%struct.blah = type { i32, float, i8 }
-
-; Function Attrs: nounwind uwtable
-define i32 @main() #0 {
-  %1 = alloca i32, align 4                                    ; CHECK: !dbg
-  %b = alloca %struct.blah, align 4                           ; CHECK-NEXT: !dbg
-  store i32 0, i32* %1                                        ; CHECK-NEXT: !dbg
-  %2 = getelementptr inbounds %struct.blah* %b, i32 0, i32 0  ; CHECK-NEXT: !dbg
-  %3 = load i32* %2, align 4                                  ; CHECK-NEXT: !dbg
-  ret i32 %3                                                  ; CHECK-NEXT: !dbg
-}
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-; CHECK: = metadata !{i32 11,
-; CHECK-NEXT: = metadata !{i32 12,
-; CHECK-NEXT: = metadata !{i32 13,
-; CHECK-NEXT: = metadata !{i32 14,
-
-; RUN: opt %s -debug-ir -S | FileCheck %s
diff --git a/test/Transforms/DebugIR/vector.ll b/test/Transforms/DebugIR/vector.ll
deleted file mode 100644
index 50d99ac..0000000
--- a/test/Transforms/DebugIR/vector.ll
+++ /dev/null
@@ -1,93 +0,0 @@
-; ModuleID = 'vector.cpp'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-linux-gnu"
-
-; Function Attrs: noinline nounwind uwtable
-define <4 x float> @_Z3fooDv2_fS_(double %a.coerce, double %b.coerce) #0 {
-  %1 = alloca <2 x float>, align 8                    ; CHECK: !dbg
-  %2 = alloca <2 x float>, align 8                    ; CHECK-NEXT: !dbg
-  %3 = alloca <2 x float>, align 8                    ; CHECK-NEXT: !dbg
-  %4 = alloca <2 x float>, align 8                    ; CHECK-NEXT: !dbg
-  %c = alloca <4 x float>, align 16                   ; CHECK-NEXT: !dbg
-  %5 = bitcast <2 x float>* %1 to double*             ; CHECK-NEXT: !dbg
-  store double %a.coerce, double* %5, align 1         ; CHECK-NEXT: !dbg
-  %a = load <2 x float>* %1, align 8                  ; CHECK-NEXT: !dbg
-  store <2 x float> %a, <2 x float>* %2, align 8      ; CHECK-NEXT: !dbg
-  %6 = bitcast <2 x float>* %3 to double*             ; CHECK-NEXT: !dbg
-  store double %b.coerce, double* %6, align 1         ; CHECK-NEXT: !dbg
-  %b = load <2 x float>* %3, align 8                  ; CHECK-NEXT: !dbg
-  store <2 x float> %b, <2 x float>* %4, align 8      ; CHECK-NEXT: !dbg
-  %7 = load <2 x float>* %2, align 8                  ; CHECK-NEXT: !dbg
-  %8 = load <4 x float>* %c, align 16                 ; CHECK-NEXT: !dbg
-  %9 = shufflevector <2 x float> %7, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>   ; CHECK-NEXT: !dbg
-  %10 = shufflevector <4 x float> %8, <4 x float> %9, <4 x i32> <i32 4, i32 1, i32 5, i32 3>             ; CHECK-NEXT: !dbg
-  store <4 x float> %10, <4 x float>* %c, align 16    ; CHECK-NEXT: !dbg
-  %11 = load <2 x float>* %4, align 8                 ; CHECK-NEXT: !dbg
-  %12 = load <4 x float>* %c, align 16                ; CHECK-NEXT: !dbg
-  %13 = shufflevector <2 x float> %11, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; CHECK-NEXT: !dbg
-  %14 = shufflevector <4 x float> %12, <4 x float> %13, <4 x i32> <i32 0, i32 4, i32 2, i32 5>           ; CHECK-NEXT: !dbg
-  store <4 x float> %14, <4 x float>* %c, align 16    ; CHECK-NEXT: !dbg
-  %15 = load <4 x float>* %c, align 16                ; CHECK-NEXT: !dbg
-  ret <4 x float> %15                                 ; CHECK-NEXT: !dbg
-}
-
-; Function Attrs: nounwind uwtable
-define i32 @main() #1 {
-  %1 = alloca i32, align 4                            ; CHECK: !dbg
-  %a = alloca <2 x float>, align 8                    ; CHECK-NEXT: !dbg
-  %b = alloca <2 x float>, align 8                    ; CHECK-NEXT: !dbg
-  %x = alloca <4 x float>, align 16                   ; CHECK-NEXT: !dbg
-  %2 = alloca <2 x float>, align 8                    ; CHECK-NEXT: !dbg
-  %3 = alloca <2 x float>, align 8                    ; CHECK-NEXT: !dbg
-  store i32 0, i32* %1                                ; CHECK-NEXT: !dbg
-  store <2 x float> <float 1.000000e+00, float 2.000000e+00>, <2 x float>* %a, align 8                   ; CHECK-NEXT: !dbg
-  store <2 x float> <float 1.000000e+00, float 2.000000e+00>, <2 x float>* %b, align 8                   ; CHECK-NEXT: !dbg
-  %4 = load <2 x float>* %a, align 8                  ; CHECK-NEXT: !dbg
-  %5 = load <2 x float>* %b, align 8                  ; CHECK-NEXT: !dbg
-  store <2 x float> %4, <2 x float>* %2, align 8      ; CHECK-NEXT: !dbg
-  %6 = bitcast <2 x float>* %2 to double*             ; CHECK-NEXT: !dbg
-  %7 = load double* %6, align 1                       ; CHECK-NEXT: !dbg
-  store <2 x float> %5, <2 x float>* %3, align 8      ; CHECK-NEXT: !dbg
-  %8 = bitcast <2 x float>* %3 to double*             ; CHECK-NEXT: !dbg
-  %9 = load double* %8, align 1                       ; CHECK-NEXT: !dbg
-  %10 = call <4 x float> @_Z3fooDv2_fS_(double %7, double %9)                                            ; CHECK-NEXT: !dbg
-  store <4 x float> %10, <4 x float>* %x, align 16    ; CHECK-NEXT: !dbg
-  %11 = load <4 x float>* %x, align 16                ; CHECK-NEXT: !dbg
-  %12 = extractelement <4 x float> %11, i32 0         ; CHECK-NEXT: !dbg
-  %13 = load <4 x float>* %x, align 16                ; CHECK-NEXT: !dbg
-  %14 = extractelement <4 x float> %13, i32 1         ; CHECK-NEXT: !dbg
-  %15 = fadd float %12, %14                           ; CHECK-NEXT: !dbg
-  %16 = load <4 x float>* %x, align 16                ; CHECK-NEXT: !dbg
-  %17 = extractelement <4 x float> %16, i32 2         ; CHECK-NEXT: !dbg
-  %18 = fadd float %15, %17                           ; CHECK-NEXT: !dbg
-  %19 = load <4 x float>* %x, align 16                ; CHECK-NEXT: !dbg
-  %20 = extractelement <4 x float> %19, i32 3         ; CHECK-NEXT: !dbg
-  %21 = fadd float %18, %20                           ; CHECK-NEXT: !dbg
-  %22 = fptosi float %21 to i32                       ; CHECK-NEXT: !dbg
-  ret i32 %22                                         ; CHECK-NEXT: !dbg
-}
-
-attributes #0 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-; CHECK: = metadata !{i32 13,
-; CHECK-NEXT: = metadata !{i32 14,
-; CHECK-NEXT: = metadata !{i32 15,
-; CHECK-NEXT: = metadata !{i32 16,
-; CHECK-NEXT: = metadata !{i32 17,
-; CHECK-NEXT: = metadata !{i32 18,
-; CHECK-NEXT: = metadata !{i32 19,
-; CHECK-NEXT: = metadata !{i32 20,
-; CHECK-NEXT: = metadata !{i32 21,
-; CHECK-NEXT: = metadata !{i32 22,
-; CHECK-NEXT: = metadata !{i32 23,
-; CHECK-NEXT: = metadata !{i32 24,
-; CHECK-NEXT: = metadata !{i32 25,
-; CHECK-NEXT: = metadata !{i32 26,
-; CHECK-NEXT: = metadata !{i32 27,
-; CHECK-NEXT: = metadata !{i32 28,
-; CHECK-NEXT: = metadata !{i32 29,
-; CHECK-NEXT: = metadata !{i32 30,
-; CHECK-NEXT: = metadata !{i32 31,
-
-; RUN: opt %s -debug-ir -S | FileCheck %s
diff --git a/test/Transforms/EarlyCSE/AArch64/intrinsics.ll b/test/Transforms/EarlyCSE/AArch64/intrinsics.ll
new file mode 100644
index 0000000..d166ff1
--- /dev/null
+++ b/test/Transforms/EarlyCSE/AArch64/intrinsics.ll
@@ -0,0 +1,232 @@
+; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
+; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s
+
+define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
+; CHECK-LABEL: @test_cse
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
+; CHECK-LABEL: @test_cse2
+; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
+entry:
+; Check that the first @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
+; CHECK-LABEL: @test_cse3
+; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %1 = bitcast i32* %a to i8*
+  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
+  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
+  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+
+define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
+; away by Early CSE.
+; CHECK-LABEL: @test_nocse
+; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  store i32 0, i32* %b, align 4
+  %5 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
+; to mismatch between st2 and ld3.
+; CHECK-LABEL: @test_nocse2
+; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
+  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
+  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
+; mismatch between st2 and st3.
+; CHECK-LABEL: @test_nocse3
+; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
+; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
+  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)
+
+; Function Attrs: nounwind
+declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)
+
+; Function Attrs: nounwind readonly
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)
+
+; Function Attrs: nounwind readonly
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)
+
+define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
+entry:
+  %add = add <4 x i32> %__p0, %__p1
+  ret <4 x i32> %add
+}
diff --git a/test/Transforms/EarlyCSE/AArch64/lit.local.cfg b/test/Transforms/EarlyCSE/AArch64/lit.local.cfg
new file mode 100644
index 0000000..6642d28
--- /dev/null
+++ b/test/Transforms/EarlyCSE/AArch64/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll']
+
+targets = set(config.root.targets_to_build.split())
+if not 'AArch64' in targets:
+    config.unsupported = True
diff --git a/test/Transforms/EarlyCSE/basic.ll b/test/Transforms/EarlyCSE/basic.ll
index 155d36f..a36a103 100644
--- a/test/Transforms/EarlyCSE/basic.ll
+++ b/test/Transforms/EarlyCSE/basic.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -S -early-cse | FileCheck %s
+; RUN: opt < %s -S -passes=early-cse | FileCheck %s
 
 declare void @llvm.assume(i1) nounwind
 
@@ -192,4 +193,13 @@ define void @test11(i32 *%P) {
   ; CHECK-NEXT: ret void
 }
 
-
+; CHECK-LABEL: @test12(
+define i32 @test12(i1 %B, i32* %P1, i32* %P2) {
+  %load0 = load i32* %P1
+  %1 = load atomic i32* %P2 seq_cst, align 4
+  %load1 = load i32* %P1
+  %sel = select i1 %B, i32 %load0, i32 %load1
+  ret i32 %sel
+  ; CHECK: load i32* %P1
+  ; CHECK: load i32* %P1
+}
diff --git a/test/Transforms/GCOVProfiling/function-numbering.ll b/test/Transforms/GCOVProfiling/function-numbering.ll
index 2480820..487f4ca 100644
--- a/test/Transforms/GCOVProfiling/function-numbering.ll
+++ b/test/Transforms/GCOVProfiling/function-numbering.ll
@@ -2,7 +2,7 @@
 ; functions aren't emitted.
 
 ; Inject metadata to set the .gcno file location
-; RUN: echo '!14 = metadata !{metadata !"%/T/function-numbering.ll", metadata !0}' > %t1
+; RUN: echo '!14 = !{!"%/T/function-numbering.ll", !0}' > %t1
 ; RUN: cat %s %t1 > %t2
 
 ; RUN: opt -insert-gcov-profiling -S < %t2 | FileCheck --check-prefix GCDA %s
@@ -40,17 +40,17 @@ define void @baz() {
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [function-numbering.ll] [DW_LANG_C99]
-!1 = metadata !{metadata !".../llvm/test/Transforms/GCOVProfiling/function-numbering.ll", metadata !""}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !7, metadata !8}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\000\000\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}    ; [ DW_TAG_file_type ] [/Users/bogner/build/llvm-debug//tmp/foo.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !"0x2e\00bar\00bar\00\002\000\001\000\000\000\000\002", metadata !1, metadata !5, metadata !6, null, void ()* @bar, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
-!8 = metadata !{metadata !"0x2e\00baz\00baz\00\003\000\001\000\000\000\000\003", metadata !1, metadata !5, metadata !6, null, void ()* @baz, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [baz]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.6.0 "}
-!12 = metadata !{i32 1, i32 13, metadata !4, null}
-!13 = metadata !{i32 3, i32 13, metadata !8, null}
+!0 = !{!"0x11\0012\00clang version 3.6.0 \000\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [function-numbering.ll] [DW_LANG_C99]
+!1 = !{!".../llvm/test/Transforms/GCOVProfiling/function-numbering.ll", !""}
+!2 = !{}
+!3 = !{!4, !7, !8}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\000\000\000\001", !1, !5, !6, null, void ()* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}    ; [ DW_TAG_file_type ] [/Users/bogner/build/llvm-debug//tmp/foo.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!"0x2e\00bar\00bar\00\002\000\001\000\000\000\000\002", !1, !5, !6, null, void ()* @bar, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
+!8 = !{!"0x2e\00baz\00baz\00\003\000\001\000\000\000\000\003", !1, !5, !6, null, void ()* @baz, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [baz]
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.6.0 "}
+!12 = !MDLocation(line: 1, column: 13, scope: !4)
+!13 = !MDLocation(line: 3, column: 13, scope: !8)
diff --git a/test/Transforms/GCOVProfiling/global-ctor.ll b/test/Transforms/GCOVProfiling/global-ctor.ll
index 1dff3f0..9a9b7ce 100644
--- a/test/Transforms/GCOVProfiling/global-ctor.ll
+++ b/test/Transforms/GCOVProfiling/global-ctor.ll
@@ -1,4 +1,4 @@
-; RUN: echo '!16 = metadata !{metadata !"%/T/global-ctor.ll", metadata !0}' > %t1
+; RUN: echo '!16 = !{!"%/T/global-ctor.ll", !0}' > %t1
 ; RUN: cat %s %t1 > %t2
 ; RUN: opt -insert-gcov-profiling -disable-output < %t2
 ; RUN: not grep '_GLOBAL__sub_I_global-ctor' %T/global-ctor.gcno
@@ -38,19 +38,19 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "
 !llvm.gcov = !{!16}
 !llvm.ident = !{!12}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 210217)\000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/home/nlewycky/<stdin>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<stdin>", metadata !"/home/nlewycky"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\002\001\001\000\006\00256\000\002", metadata !5, metadata !6, metadata !7, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [__cxx_global_var_init]
-!5 = metadata !{metadata !"global-ctor.ll", metadata !"/home/nlewycky"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/home/nlewycky/global-ctor.ll]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__sub_I_global-ctor.ll\000\001\001\000\006\0064\000\000", metadata !1, metadata !9, metadata !7, null, void ()* @_GLOBAL__sub_I_global-ctor.ll, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 0] [local] [def]
-!9 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/nlewycky/<stdin>]
-!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!12 = metadata !{metadata !"clang version 3.5.0 (trunk 210217)"}
-!13 = metadata !{i32 2, i32 0, metadata !4, null}
-!14 = metadata !{i32 0, i32 0, metadata !15, null}
-!15 = metadata !{metadata !"0xb\000", metadata !5, metadata !8} ; [ DW_TAG_lexical_block ] [/home/nlewycky/global-ctor.ll]
+!0 = !{!"0x11\004\00clang version 3.5.0 (trunk 210217)\000\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/home/nlewycky/<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !"/home/nlewycky"}
+!2 = !{}
+!3 = !{!4, !8}
+!4 = !{!"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\002\001\001\000\006\00256\000\002", !5, !6, !7, null, void ()* @__cxx_global_var_init, null, null, !2} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [__cxx_global_var_init]
+!5 = !{!"global-ctor.ll", !"/home/nlewycky"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [/home/nlewycky/global-ctor.ll]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!"0x2e\00\00\00_GLOBAL__sub_I_global-ctor.ll\000\001\001\000\006\0064\000\000", !1, !9, !7, null, void ()* @_GLOBAL__sub_I_global-ctor.ll, null, null, !2} ; [ DW_TAG_subprogram ] [line 0] [local] [def]
+!9 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/home/nlewycky/<stdin>]
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 2, !"Debug Info Version", i32 2}
+!12 = !{!"clang version 3.5.0 (trunk 210217)"}
+!13 = !MDLocation(line: 2, scope: !4)
+!14 = !MDLocation(line: 0, scope: !15)
+!15 = !{!"0xb\000", !5, !8} ; [ DW_TAG_lexical_block ] [/home/nlewycky/global-ctor.ll]
diff --git a/test/Transforms/GCOVProfiling/linezero.ll b/test/Transforms/GCOVProfiling/linezero.ll
index 50e026c..cf0fcd2 100644
--- a/test/Transforms/GCOVProfiling/linezero.ll
+++ b/test/Transforms/GCOVProfiling/linezero.ll
@@ -19,17 +19,17 @@ entry:
   %__begin = alloca i8*, align 8
   %__end = alloca i8*, align 8
   %spec = alloca i8, align 1
-  call void @llvm.dbg.declare(metadata !{%struct.vector** %__range}, metadata !27, metadata !{}), !dbg !30
+  call void @llvm.dbg.declare(metadata %struct.vector** %__range, metadata !27, metadata !{}), !dbg !30
   br label %0
 
 ; <label>:0                                       ; preds = %entry
   call void @_Z13TagFieldSpecsv(), !dbg !31
   store %struct.vector* %ref.tmp, %struct.vector** %__range, align 8, !dbg !31
-  call void @llvm.dbg.declare(metadata !{i8** %__begin}, metadata !32, metadata !{}), !dbg !30
+  call void @llvm.dbg.declare(metadata i8** %__begin, metadata !32, metadata !{}), !dbg !30
   %1 = load %struct.vector** %__range, align 8, !dbg !31
   %call = call i8* @_ZN6vector5beginEv(%struct.vector* %1), !dbg !31
   store i8* %call, i8** %__begin, align 8, !dbg !31
-  call void @llvm.dbg.declare(metadata !{i8** %__end}, metadata !33, metadata !{}), !dbg !30
+  call void @llvm.dbg.declare(metadata i8** %__end, metadata !33, metadata !{}), !dbg !30
   %2 = load %struct.vector** %__range, align 8, !dbg !31
   %call1 = call i8* @_ZN6vector3endEv(%struct.vector* %2), !dbg !31
   store i8* %call1, i8** %__end, align 8, !dbg !31
@@ -42,7 +42,7 @@ for.cond:                                         ; preds = %for.inc, %0
   br i1 %cmp, label %for.body, label %for.end, !dbg !34
 
 for.body:                                         ; preds = %for.cond
-  call void @llvm.dbg.declare(metadata !{i8* %spec}, metadata !37, metadata !{}), !dbg !31
+  call void @llvm.dbg.declare(metadata i8* %spec, metadata !37, metadata !{}), !dbg !31
   %5 = load i8** %__begin, align 8, !dbg !38
   %6 = load i8* %5, align 1, !dbg !38
   store i8 %6, i8* %spec, align 1, !dbg !38
@@ -94,49 +94,49 @@ attributes #3 = { noreturn nounwind }
 !llvm.gcov = !{!25}
 !llvm.ident = !{!26}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 209871)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [<stdin>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<stdin>", metadata !"PATTERN"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x13\00vector\0021\008\008\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS6vector"} ; [ DW_TAG_structure_type ] [vector] [line 21, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !"linezero.cc", metadata !"PATTERN"}
-!6 = metadata !{metadata !7, metadata !13}
-!7 = metadata !{metadata !"0x2e\00begin\00begin\00_ZN6vector5beginEv\0025\000\000\000\006\00256\000\0025", metadata !5, metadata !"_ZTS6vector", metadata !8, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 25] [begin]
-!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!9 = metadata !{metadata !10, metadata !12}
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!11 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS6vector"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS6vector]
-!13 = metadata !{metadata !"0x2e\00end\00end\00_ZN6vector3endEv\0026\000\000\000\006\00256\000\0026", metadata !5, metadata !"_ZTS6vector", metadata !8, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 26] [end]
-!14 = metadata !{metadata !15, metadata !20}
-!15 = metadata !{metadata !"0x2e\00test\00test\00_Z4testv\0050\000\001\000\006\00256\000\0050", metadata !5, metadata !16, metadata !17, null, i32 ()* @_Z4testv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 50] [def] [test]
-!16 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [./linezero.cc]
-!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!18 = metadata !{metadata !19}
-!19 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!20 = metadata !{metadata !"0x2e\00f1\00f1\00_Z2f1v\0054\000\001\000\006\00256\000\0054", metadata !5, metadata !16, metadata !21, null, void ()* @_Z2f1v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 54] [def] [f1]
-!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!22 = metadata !{null}
-!23 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!24 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!25 = metadata !{metadata !"PATTERN/linezero.o", metadata !0}
-!26 = metadata !{metadata !"clang version 3.5.0 (trunk 209871)"}
-!27 = metadata !{metadata !"0x100\00__range\000\0064", metadata !28, null, metadata !29} ; [ DW_TAG_auto_variable ] [__range] [line 0]
-!28 = metadata !{metadata !"0xb\0051\000\000", metadata !5, metadata !15} ; [ DW_TAG_lexical_block ] [./linezero.cc]
-!29 = metadata !{metadata !"0x42\00\000\000\000\000\000", null, null, metadata !"_ZTS6vector"} ; [ DW_TAG_rvalue_reference_type ] [line 0, size 0, align 0, offset 0] [from _ZTS6vector]
-!30 = metadata !{i32 0, i32 0, metadata !28, null}
-!31 = metadata !{i32 51, i32 0, metadata !28, null}
-!32 = metadata !{metadata !"0x100\00__begin\000\0064", metadata !28, null, metadata !10} ; [ DW_TAG_auto_variable ] [__begin] [line 0]
-!33 = metadata !{metadata !"0x100\00__end\000\0064", metadata !28, null, metadata !10} ; [ DW_TAG_auto_variable ] [__end] [line 0]
-!34 = metadata !{i32 51, i32 0, metadata !35, null}
-!35 = metadata !{metadata !"0xb\0051\000\005", metadata !5, metadata !36} ; [ DW_TAG_lexical_block ] [./linezero.cc]
-!36 = metadata !{metadata !"0xb\0051\000\001", metadata !5, metadata !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
-!37 = metadata !{metadata !"0x100\00spec\0051\000", metadata !28, metadata !16, metadata !11} ; [ DW_TAG_auto_variable ] [spec] [line 51]
-!38 = metadata !{i32 51, i32 0, metadata !39, null}
-!39 = metadata !{metadata !"0xb\0051\000\002", metadata !5, metadata !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
-!40 = metadata !{i32 51, i32 0, metadata !41, null}
-!41 = metadata !{metadata !"0xb\0051\000\004", metadata !5, metadata !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
-!42 = metadata !{i32 51, i32 0, metadata !43, null}
-!43 = metadata !{metadata !"0xb\0051\000\003", metadata !5, metadata !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
-!44 = metadata !{i32 52, i32 0, metadata !15, null}
-!45 = metadata !{i32 54, i32 0, metadata !20, null}
+!0 = !{!"0x11\004\00clang version 3.5.0 (trunk 209871)\000\00\000\00\001", !1, !2, !3, !14, !2, !2} ; [ DW_TAG_compile_unit ] [<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !"PATTERN"}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00vector\0021\008\008\000\000\000", !5, null, null, !6, null, null, !"_ZTS6vector"} ; [ DW_TAG_structure_type ] [vector] [line 21, size 8, align 8, offset 0] [def] [from ]
+!5 = !{!"linezero.cc", !"PATTERN"}
+!6 = !{!7, !13}
+!7 = !{!"0x2e\00begin\00begin\00_ZN6vector5beginEv\0025\000\000\000\006\00256\000\0025", !5, !"_ZTS6vector", !8, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 25] [begin]
+!8 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = !{!10, !12}
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!11 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!12 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !"_ZTS6vector"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS6vector]
+!13 = !{!"0x2e\00end\00end\00_ZN6vector3endEv\0026\000\000\000\006\00256\000\0026", !5, !"_ZTS6vector", !8, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 26] [end]
+!14 = !{!15, !20}
+!15 = !{!"0x2e\00test\00test\00_Z4testv\0050\000\001\000\006\00256\000\0050", !5, !16, !17, null, i32 ()* @_Z4testv, null, null, !2} ; [ DW_TAG_subprogram ] [line 50] [def] [test]
+!16 = !{!"0x29", !5}         ; [ DW_TAG_file_type ] [./linezero.cc]
+!17 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{!19}
+!19 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!20 = !{!"0x2e\00f1\00f1\00_Z2f1v\0054\000\001\000\006\00256\000\0054", !5, !16, !21, null, void ()* @_Z2f1v, null, null, !2} ; [ DW_TAG_subprogram ] [line 54] [def] [f1]
+!21 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = !{null}
+!23 = !{i32 2, !"Dwarf Version", i32 4}
+!24 = !{i32 2, !"Debug Info Version", i32 2}
+!25 = !{!"PATTERN/linezero.o", !0}
+!26 = !{!"clang version 3.5.0 (trunk 209871)"}
+!27 = !{!"0x100\00__range\000\0064", !28, null, !29} ; [ DW_TAG_auto_variable ] [__range] [line 0]
+!28 = !{!"0xb\0051\000\000", !5, !15} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!29 = !{!"0x42\00\000\000\000\000\000", null, null, !"_ZTS6vector"} ; [ DW_TAG_rvalue_reference_type ] [line 0, size 0, align 0, offset 0] [from _ZTS6vector]
+!30 = !MDLocation(line: 0, scope: !28)
+!31 = !MDLocation(line: 51, scope: !28)
+!32 = !{!"0x100\00__begin\000\0064", !28, null, !10} ; [ DW_TAG_auto_variable ] [__begin] [line 0]
+!33 = !{!"0x100\00__end\000\0064", !28, null, !10} ; [ DW_TAG_auto_variable ] [__end] [line 0]
+!34 = !MDLocation(line: 51, scope: !35)
+!35 = !{!"0xb\0051\000\005", !5, !36} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!36 = !{!"0xb\0051\000\001", !5, !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!37 = !{!"0x100\00spec\0051\000", !28, !16, !11} ; [ DW_TAG_auto_variable ] [spec] [line 51]
+!38 = !MDLocation(line: 51, scope: !39)
+!39 = !{!"0xb\0051\000\002", !5, !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!40 = !MDLocation(line: 51, scope: !41)
+!41 = !{!"0xb\0051\000\004", !5, !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!42 = !MDLocation(line: 51, scope: !43)
+!43 = !{!"0xb\0051\000\003", !5, !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!44 = !MDLocation(line: 52, scope: !15)
+!45 = !MDLocation(line: 54, scope: !20)
diff --git a/test/Transforms/GCOVProfiling/linkagename.ll b/test/Transforms/GCOVProfiling/linkagename.ll
index 04281b2..c30d4a6 100644
--- a/test/Transforms/GCOVProfiling/linkagename.ll
+++ b/test/Transforms/GCOVProfiling/linkagename.ll
@@ -1,4 +1,4 @@
-; RUN: echo '!9 = metadata !{metadata !"%/T/linkagename.ll", metadata !0}' > %t1
+; RUN: echo '!9 = !{!"%/T/linkagename.ll", !0}' > %t1
 ; RUN: cat %s %t1 > %t2
 ; RUN: opt -insert-gcov-profiling -disable-output < %t2
 ; RUN: grep _Z3foov %T/linkagename.gcno
@@ -13,15 +13,15 @@ entry:
 !llvm.module.flags = !{!10}
 !llvm.gcov = !{!9}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 177323)\000\00\000\00\000", metadata !2, metadata !3, metadata !3, metadata !4, metadata !3,  metadata !3} ; [ DW_TAG_compile_unit ] [/home/nlewycky/hello.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"0x29", metadata !2}          ; [ DW_TAG_file_type ] [/home/nlewycky/hello.cc]
-!2 = metadata !{metadata !"hello.cc", metadata !"/home/nlewycky"}
-!3 = metadata !{i32 0}
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\001\000\001\000\006\00256\000\001", metadata !1, metadata !1, metadata !6, null, void ()* @_Z3foov, null, null, metadata !3} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 1, i32 0, metadata !5, null}
+!0 = !{!"0x11\004\00clang version 3.3 (trunk 177323)\000\00\000\00\000", !2, !3, !3, !4, !3,  !3} ; [ DW_TAG_compile_unit ] [/home/nlewycky/hello.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"0x29", !2}          ; [ DW_TAG_file_type ] [/home/nlewycky/hello.cc]
+!2 = !{!"hello.cc", !"/home/nlewycky"}
+!3 = !{i32 0}
+!4 = !{!5}
+!5 = !{!"0x2e\00foo\00foo\00_Z3foov\001\000\001\000\006\00256\000\001", !1, !1, !6, null, void ()* @_Z3foov, null, null, !3} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !MDLocation(line: 1, scope: !5)
 
 
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/GCOVProfiling/return-block.ll b/test/Transforms/GCOVProfiling/return-block.ll
new file mode 100644
index 0000000..f0be3d2
--- /dev/null
+++ b/test/Transforms/GCOVProfiling/return-block.ll
@@ -0,0 +1,66 @@
+; Inject metadata to set the .gcno file location
+; RUN: echo '!19 = !{!"%/T/return-block.ll", !0}' > %t1
+; RUN: cat %s %t1 > %t2
+; RUN: opt -insert-gcov-profiling -disable-output %t2
+; RUN: llvm-cov gcov -n -dump %T/return-block.gcno 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define void @test() #0 {
+entry:
+  tail call void (...)* @f() #2, !dbg !14
+  %0 = load i32* @A, align 4, !dbg !15
+  %tobool = icmp eq i32 %0, 0, !dbg !15
+  br i1 %tobool, label %if.end, label %if.then, !dbg !15
+
+if.then:                                          ; preds = %entry
+  tail call void (...)* @g() #2, !dbg !16
+  br label %if.end, !dbg !16
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void, !dbg !18
+}
+
+declare void @f(...) #1
+
+declare void @g(...) #1
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.gcov = !{!19}
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11, !12}
+!llvm.ident = !{!13}
+
+!0 = !{!"0x11\0012\00clang version 3.6.0 (trunk 223182)\001\00\000\00\001", !1, !2, !2, !3, !8, !2} ; [ DW_TAG_compile_unit ] [return-block.c] [DW_LANG_C99]
+!1 = !{!".../llvm/test/Transforms/GCOVProfiling/return-block.ll", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00test\00test\00\005\000\001\000\000\000\001\005", !1, !5, !6, null, void ()* @test, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [test]
+!5 = !{!"0x29", !1}    ; [ DW_TAG_file_type ] [return-block.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null}
+!8 = !{!9}
+!9 = !{!"0x34\00A\00A\00\003\000\001", null, !5, !10, i32* @A, null} ; [ DW_TAG_variable ] [A] [line 3] [def]
+!10 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = !{i32 2, !"Dwarf Version", i32 4}
+!12 = !{i32 2, !"Debug Info Version", i32 2}
+!13 = !{!"clang version 3.6.0 (trunk 223182)"}
+!14 = !MDLocation(line: 6, column: 3, scope: !4)
+!15 = !MDLocation(line: 7, column: 7, scope: !4)
+!16 = !MDLocation(line: 8, column: 5, scope: !17)
+!17 = !{!"0xb\007\007\000", !1, !4} ; [ DW_TAG_lexical_block ] [return-block.c]
+!18 = !MDLocation(line: 9, column: 1, scope: !4)
+
+; There should be no destination edges for block 1.
+; CHECK: Block : 0 Counter : 0
+; CHECK-NEXT:         Destination Edges : 2 (0), 
+; CHECK-NEXT: Block : 1 Counter : 0
+; CHECK-NEXT:         Source Edges : 4 (0), 
+; CHECK-NEXT: Block : 2 Counter : 0
diff --git a/test/Transforms/GCOVProfiling/version.ll b/test/Transforms/GCOVProfiling/version.ll
index 1af684e..9436bd6 100644
--- a/test/Transforms/GCOVProfiling/version.ll
+++ b/test/Transforms/GCOVProfiling/version.ll
@@ -1,4 +1,4 @@
-; RUN: echo '!9 = metadata !{metadata !"%/T/version.ll", metadata !0}' > %t1
+; RUN: echo '!9 = !{!"%/T/version.ll", !0}' > %t1
 ; RUN: cat %s %t1 > %t2
 ; RUN: opt -insert-gcov-profiling -disable-output < %t2
 ; RUN: head -c8 %T/version.gcno | grep '^oncg.204'
@@ -16,15 +16,15 @@ define void @test() {
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 176994)\000\00\000\00\000", metadata !11, metadata !3, metadata !3, metadata !4, metadata !3, null} ; [ DW_TAG_compile_unit ] [./version] [DW_LANG_C_plus_plus]
-!2 = metadata !{metadata !"0x29", metadata !11} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 0}
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x2e\00test\00test\00\001\000\001\000\006\00256\000\001", metadata !10, metadata !6, metadata !7, null, void ()* @test, null, null, metadata !3} ; [ DW_TAG_subprogram ] [line 1] [def] [test]
-!6 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !3, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{i32 1, i32 0, metadata !5, null}
+!0 = !{!"0x11\004\00clang version 3.3 (trunk 176994)\000\00\000\00\000", !11, !3, !3, !4, !3, null} ; [ DW_TAG_compile_unit ] [./version] [DW_LANG_C_plus_plus]
+!2 = !{!"0x29", !11} ; [ DW_TAG_file_type ]
+!3 = !{i32 0}
+!4 = !{!5}
+!5 = !{!"0x2e\00test\00test\00\001\000\001\000\006\00256\000\001", !10, !6, !7, null, void ()* @test, null, null, !3} ; [ DW_TAG_subprogram ] [line 1] [def] [test]
+!6 = !{!"0x29", !10} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !3, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !MDLocation(line: 1, scope: !5)
 ;; !9 is added through the echo line at the top.
-!10 = metadata !{metadata !"<stdin>", metadata !"."}
-!11 = metadata !{metadata !"version", metadata !"/usr/local/google/home/nlewycky"}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!10 = !{!"<stdin>", !"."}
+!11 = !{!"version", !"/usr/local/google/home/nlewycky"}
+!12 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/GVN/cond_br2.ll b/test/Transforms/GVN/cond_br2.ll
index 27e6f75..a7ca219 100644
--- a/test/Transforms/GVN/cond_br2.ll
+++ b/test/Transforms/GVN/cond_br2.ll
@@ -132,9 +132,9 @@ attributes #1 = { nounwind }
 attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #3 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"int", metadata !1}
-!4 = metadata !{metadata !0, metadata !0, i64 0}
-!5 = metadata !{metadata !3, metadata !3, i64 0}
+!0 = !{!"any pointer", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"int", !1}
+!4 = !{!0, !0, i64 0}
+!5 = !{!3, !3, i64 0}
diff --git a/test/Transforms/GVN/condprop.ll b/test/Transforms/GVN/condprop.ll
index 708e4b2..845f88e 100644
--- a/test/Transforms/GVN/condprop.ll
+++ b/test/Transforms/GVN/condprop.ll
@@ -144,6 +144,22 @@ different:
   ret i1 %cmp3
 }
 
+; CHECK-LABEL: @test6_fp(
+define i1 @test6_fp(float %x, float %y) {
+  %cmp2 = fcmp une float %x, %y
+  %cmp = fcmp oeq float %x, %y
+  %cmp3 = fcmp oeq float  %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+; CHECK: ret i1 false
+  ret i1 %cmp2
+
+different:
+; CHECK: ret i1 false
+  ret i1 %cmp3
+}
+
 ; CHECK-LABEL: @test7(
 define i1 @test7(i32 %x, i32 %y) {
   %cmp = icmp sgt i32 %x, %y
@@ -160,6 +176,22 @@ different:
   ret i1 %cmp3
 }
 
+; CHECK-LABEL: @test7_fp(
+define i1 @test7_fp(float %x, float %y) {
+  %cmp = fcmp ogt float %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+  %cmp2 = fcmp ule float %x, %y
+; CHECK: ret i1 false
+  ret i1 %cmp2
+
+different:
+  %cmp3 = fcmp ogt float %x, %y
+; CHECK: ret i1 false
+  ret i1 %cmp3
+}
+
 ; CHECK-LABEL: @test8(
 define i1 @test8(i32 %x, i32 %y) {
   %cmp2 = icmp sle i32 %x, %y
@@ -176,6 +208,22 @@ different:
   ret i1 %cmp3
 }
 
+; CHECK-LABEL: @test8_fp(
+define i1 @test8_fp(float %x, float %y) {
+  %cmp2 = fcmp ule float %x, %y
+  %cmp = fcmp ogt float %x, %y
+  %cmp3 = fcmp ogt float %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+; CHECK: ret i1 false
+  ret i1 %cmp2
+
+different:
+; CHECK: ret i1 false
+  ret i1 %cmp3
+}
+
 ; PR1768
 ; CHECK-LABEL: @test9(
 define i32 @test9(i32 %i, i32 %j) {
diff --git a/test/Transforms/GVN/edge.ll b/test/Transforms/GVN/edge.ll
index 646e10c..0c1a3fb 100644
--- a/test/Transforms/GVN/edge.ll
+++ b/test/Transforms/GVN/edge.ll
@@ -58,3 +58,113 @@ bb2:
 ; CHECK: call void @g(i1 %y)
   ret void
 }
+
+define double @fcmp_oeq_not_zero(double %x, double %y) {
+entry:
+  %cmp = fcmp oeq double %y, 2.0
+  br i1 %cmp, label %if, label %return
+
+if:
+  %div = fdiv double %x, %y
+  br label %return
+
+return:
+  %retval = phi double [ %div, %if ], [ %x, %entry ]
+  ret double %retval
+
+; CHECK-LABEL: define double @fcmp_oeq_not_zero(
+; CHECK: %div = fdiv double %x, 2.0
+}
+
+define double @fcmp_une_not_zero(double %x, double %y) {
+entry:
+  %cmp = fcmp une double %y, 2.0
+  br i1 %cmp, label %return, label %else
+
+else:
+  %div = fdiv double %x, %y
+  br label %return
+
+return:
+  %retval = phi double [ %div, %else ], [ %x, %entry ]
+  ret double %retval
+
+; CHECK-LABEL: define double @fcmp_une_not_zero(
+; CHECK: %div = fdiv double %x, 2.0
+}
+
+; PR22376 - We can't propagate zero constants because -0.0 
+; compares equal to 0.0. If %y is -0.0 in this test case,
+; we would produce the wrong sign on the infinity return value.
+define double @fcmp_oeq_zero(double %x, double %y) {
+entry:
+  %cmp = fcmp oeq double %y, 0.0
+  br i1 %cmp, label %if, label %return
+
+if:
+  %div = fdiv double %x, %y
+  br label %return
+
+return:
+  %retval = phi double [ %div, %if ], [ %x, %entry ]
+  ret double %retval
+
+; CHECK-LABEL: define double @fcmp_oeq_zero(
+; CHECK: %div = fdiv double %x, %y
+}
+
+define double @fcmp_une_zero(double %x, double %y) {
+entry:
+  %cmp = fcmp une double %y, -0.0
+  br i1 %cmp, label %return, label %else
+
+else:
+  %div = fdiv double %x, %y
+  br label %return
+
+return:
+  %retval = phi double [ %div, %else ], [ %x, %entry ]
+  ret double %retval
+
+; CHECK-LABEL: define double @fcmp_une_zero(
+; CHECK: %div = fdiv double %x, %y
+}
+
+; We also cannot propagate a value if it's not a constant.
+; This is because the value could be 0.0 or -0.0.
+
+define double @fcmp_oeq_maybe_zero(double %x, double %y, double %z1, double %z2) {
+entry:
+ %z = fadd double %z1, %z2
+ %cmp = fcmp oeq double %y, %z
+ br i1 %cmp, label %if, label %return
+
+if:
+ %div = fdiv double %x, %z
+ br label %return
+
+return:
+ %retval = phi double [ %div, %if ], [ %x, %entry ]
+ ret double %retval
+
+; CHECK-LABEL: define double @fcmp_oeq_maybe_zero(
+; CHECK: %div = fdiv double %x, %z
+}
+
+define double @fcmp_une_maybe_zero(double %x, double %y, double %z1, double %z2) {
+entry:
+ %z = fadd double %z1, %z2
+ %cmp = fcmp une double %y, %z
+ br i1 %cmp, label %return, label %else
+
+else:
+ %div = fdiv double %x, %z
+ br label %return
+
+return:
+ %retval = phi double [ %div, %else ], [ %x, %entry ]
+ ret double %retval
+
+; CHECK-LABEL: define double @fcmp_une_maybe_zero(
+; CHECK: %div = fdiv double %x, %z
+}
diff --git a/test/Transforms/GVN/fpmath.ll b/test/Transforms/GVN/fpmath.ll
index 403df5c..d164fb5 100644
--- a/test/Transforms/GVN/fpmath.ll
+++ b/test/Transforms/GVN/fpmath.ll
@@ -41,5 +41,5 @@ define double @test4(double %x, double %y) {
   ret double %foo
 }
 
-!0 = metadata !{ float 5.0 }
-!1 = metadata !{ float 2.5 }
+!0 = !{ float 5.0 }
+!1 = !{ float 2.5 }
diff --git a/test/Transforms/GVN/invariant-load.ll b/test/Transforms/GVN/invariant-load.ll
index 80e2226..2a83c45 100644
--- a/test/Transforms/GVN/invariant-load.ll
+++ b/test/Transforms/GVN/invariant-load.ll
@@ -27,5 +27,43 @@ entry:
   ret i32 %add
 }
 
-!0 = metadata !{ }
+; With the invariant.load metadata, what would otherwise
+; be a case for PRE becomes a full redundancy.
+define i32 @test3(i1 %cnd, i32* %p, i32* %q) {
+; CHECK-LABEL: test3
+; CHECK-NOT: load
+entry:
+  %v1 = load i32* %p
+  br i1 %cnd, label %bb1, label %bb2
+
+bb1:
+  store i32 5, i32* %q
+  br label %bb2
+
+bb2:
+  %v2 = load i32* %p, !invariant.load !0
+  %res = sub i32 %v1, %v2
+  ret i32 %res
+}
+
+; This test is here to document a case which doesn't optimize
+; as well as it could.  
+define i32 @test4(i1 %cnd, i32* %p, i32* %q) {
+; CHECK-LABEL: test4
+; %v2 is redundant, but GVN currently doesn't catch that
+entry:
+  %v1 = load i32* %p, !invariant.load !0
+  br i1 %cnd, label %bb1, label %bb2
+
+bb1:
+  store i32 5, i32* %q
+  br label %bb2
+
+bb2:
+  %v2 = load i32* %p
+  %res = sub i32 %v1, %v2
+  ret i32 %res
+}
+
+!0 = !{ }
 
diff --git a/test/Transforms/GVN/load-from-unreachable-predecessor.ll b/test/Transforms/GVN/load-from-unreachable-predecessor.ll
new file mode 100644
index 0000000..b676d95
--- /dev/null
+++ b/test/Transforms/GVN/load-from-unreachable-predecessor.ll
@@ -0,0 +1,20 @@
+; RUN: opt -gvn -S < %s | FileCheck %s
+
+; Check that an unreachable predecessor to a PHI node doesn't cause a crash.
+; PR21625.
+
+define i32 @f(i32** %f) {
+; CHECK: bb0:
+; Load should be removed, since it's ignored.
+; CHECK-NEXT: br label
+bb0:
+  %bar = load i32** %f
+  br label %bb2
+bb1:
+  %zed = load i32** %f
+  br i1 false, label %bb1, label %bb2
+bb2:
+  %foo = phi i32* [ null, %bb0 ], [ %zed, %bb1 ]
+  %storemerge = load i32* %foo
+  ret i32 %storemerge
+}
diff --git a/test/Transforms/GVN/load-pre-nonlocal.ll b/test/Transforms/GVN/load-pre-nonlocal.ll
index 7bac1b7..ae508b9 100644
--- a/test/Transforms/GVN/load-pre-nonlocal.ll
+++ b/test/Transforms/GVN/load-pre-nonlocal.ll
@@ -79,9 +79,9 @@ if.end:
   ret i32 %add1
 }
 
-!1 = metadata !{metadata !2, metadata !2, i64 0}
-!2 = metadata !{metadata !"any pointer", metadata !3, i64 0}
-!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
-!5 = metadata !{metadata !6, metadata !6, i64 0}
-!6 = metadata !{metadata !"int", metadata !3, i64 0}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"any pointer", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"int", !3, i64 0}
diff --git a/test/Transforms/GVN/noalias.ll b/test/Transforms/GVN/noalias.ll
index a774f38..6c310fa 100644
--- a/test/Transforms/GVN/noalias.ll
+++ b/test/Transforms/GVN/noalias.ll
@@ -37,7 +37,7 @@ define i32 @test3(i32* %p, i32* %q) {
 
 declare i32 @foo(i32*) readonly
 
-!0 = metadata !{metadata !0}
-!1 = metadata !{metadata !1}
-!2 = metadata !{metadata !0, metadata !1}
+!0 = !{!0}
+!1 = !{!1}
+!2 = !{!0, !1}
 
diff --git a/test/Transforms/GVN/pre-gep-load.ll b/test/Transforms/GVN/pre-gep-load.ll
new file mode 100644
index 0000000..3ee3a37
--- /dev/null
+++ b/test/Transforms/GVN/pre-gep-load.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -basicaa -gvn -enable-load-pre -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define double @foo(i32 %stat, i32 %i, double** %p) {
+; CHECK-LABEL: @foo(
+entry:
+  switch i32 %stat, label %sw.default [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+  ]
+
+sw.bb:                                            ; preds = %entry, %entry
+  %idxprom = sext i32 %i to i64
+  %arrayidx = getelementptr inbounds double** %p, i64 0
+  %0 = load double** %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds double* %0, i64 %idxprom
+  %1 = load double* %arrayidx1, align 8
+  %sub = fsub double %1, 1.000000e+00
+  %cmp = fcmp olt double %sub, 0.000000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %sw.bb
+  br label %return
+
+if.end:                                           ; preds = %sw.bb
+  br label %sw.bb2
+
+sw.bb2:                                           ; preds = %if.end, %entry
+  %idxprom3 = sext i32 %i to i64
+  %arrayidx4 = getelementptr inbounds double** %p, i64 0
+  %2 = load double** %arrayidx4, align 8
+  %arrayidx5 = getelementptr inbounds double* %2, i64 %idxprom3
+  %3 = load double* %arrayidx5, align 8
+; CHECK: sw.bb2:
+; CHECK-NEXT-NOT: sext
+; CHECK-NEXT: phi double [
+; CHECK-NOT: load
+  %sub6 = fsub double 3.000000e+00, %3
+  br label %return
+
+sw.default:                                       ; preds = %entry
+  br label %return
+
+return:                                           ; preds = %sw.default, %sw.bb2, %if.then
+  %retval.0 = phi double [ 0.000000e+00, %sw.default ], [ %sub6, %sw.bb2 ], [ %sub, %if.then ]
+  ret double %retval.0
+}
diff --git a/test/Transforms/GVN/pre-no-cost-phi.ll b/test/Transforms/GVN/pre-no-cost-phi.ll
new file mode 100644
index 0000000..4c5afa1
--- /dev/null
+++ b/test/Transforms/GVN/pre-no-cost-phi.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -gvn -S | FileCheck %s
+; This testcase tests insertion of no-cost phis.  That is,
+; when the value is already available in every predecessor,
+; and we just need to insert a phi node to merge the available values.
+
+@c = global i32 0, align 4
+@d = global i32 0, align 4
+
+
+define i32 @mai(i32 %foo, i32 %a, i32 %b) {
+  %1 = icmp ne i32 %foo, 0
+  br i1 %1, label %bb1, label %bb2
+
+bb1:
+  %2 = add nsw i32 %a, %b
+  store i32 %2, i32* @c, align 4
+  br label %mergeblock
+
+bb2:
+  %3 = add nsw i32 %a, %b
+  store i32 %3, i32* @d, align 4
+  br label %mergeblock
+
+mergeblock:
+; CHECK: pre-phi = phi i32 [ %3, %bb2 ], [ %2, %bb1 ]
+; CHECK-NEXT: ret i32 %.pre-phi
+  %4 = add nsw i32 %a, %b
+  ret i32 %4
+}
+
+
diff --git a/test/Transforms/GVN/preserve-tbaa.ll b/test/Transforms/GVN/preserve-tbaa.ll
index c52ed96..587d463 100644
--- a/test/Transforms/GVN/preserve-tbaa.ll
+++ b/test/Transforms/GVN/preserve-tbaa.ll
@@ -25,7 +25,7 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
-!0 = metadata !{metadata !3, metadata !3, i64 0}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"short", metadata !1}
+!0 = !{!3, !3, i64 0}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA", null}
+!3 = !{!"short", !1}
diff --git a/test/Transforms/GVN/range.ll b/test/Transforms/GVN/range.ll
index 2115fe8..3720232 100644
--- a/test/Transforms/GVN/range.ll
+++ b/test/Transforms/GVN/range.ll
@@ -82,20 +82,20 @@ define i32 @test8(i32* %p) {
   ret i32 %c
 }
 
-; CHECK: ![[DISJOINT_RANGE]] = metadata !{i32 0, i32 2, i32 3, i32 5}
-; CHECK: ![[MERGED_RANGE]] = metadata !{i32 0, i32 5}
-; CHECK: ![[MERGED_SIGNED_RANGE]] = metadata !{i32 -3, i32 -2, i32 1, i32 2}
-; CHECK: ![[MERGED_TEST6]] = metadata !{i32 10, i32 1}
-; CHECK: ![[MERGED_TEST7]] = metadata !{i32 3, i32 4, i32 5, i32 2}
+; CHECK: ![[DISJOINT_RANGE]] = !{i32 0, i32 2, i32 3, i32 5}
+; CHECK: ![[MERGED_RANGE]] = !{i32 0, i32 5}
+; CHECK: ![[MERGED_SIGNED_RANGE]] = !{i32 -3, i32 -2, i32 1, i32 2}
+; CHECK: ![[MERGED_TEST6]] = !{i32 10, i32 1}
+; CHECK: ![[MERGED_TEST7]] = !{i32 3, i32 4, i32 5, i32 2}
 
-!0 = metadata !{i32 0, i32 2}
-!1 = metadata !{i32 3, i32 5}
-!2 = metadata !{i32 2, i32 5}
-!3 = metadata !{i32 -3, i32 -2}
-!4 = metadata !{i32 1, i32 2}
-!5 = metadata !{i32 10, i32 1}
-!6 = metadata !{i32 12, i32 13}
-!7 = metadata !{i32 1, i32 2, i32 3, i32 4}
-!8 = metadata !{i32 5, i32 1}
-!9 = metadata !{i32 1, i32 5}
-!10 = metadata !{i32 5, i32 1}
+!0 = !{i32 0, i32 2}
+!1 = !{i32 3, i32 5}
+!2 = !{i32 2, i32 5}
+!3 = !{i32 -3, i32 -2}
+!4 = !{i32 1, i32 2}
+!5 = !{i32 10, i32 1}
+!6 = !{i32 12, i32 13}
+!7 = !{i32 1, i32 2, i32 3, i32 4}
+!8 = !{i32 5, i32 1}
+!9 = !{i32 1, i32 5}
+!10 = !{i32 5, i32 1}
diff --git a/test/Transforms/GVN/tbaa.ll b/test/Transforms/GVN/tbaa.ll
index d6412fc..71fbed41 100644
--- a/test/Transforms/GVN/tbaa.ll
+++ b/test/Transforms/GVN/tbaa.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
+; RUN: opt -tbaa -basicaa -gvn -S < %s | FileCheck %s
 
 define i32 @test1(i8* %p, i8* %q) {
 ; CHECK: @test1(i8* %p, i8* %q)
@@ -72,20 +72,57 @@ define i32 @test7(i8* %p, i8* %q) {
   ret i32 %c
 }
 
+
+
+define i32 @test8(i32* %p, i32* %q) {
+; CHECK-LABEL: test8
+; CHECK-NEXT: store i32 15, i32* %p
+; CHECK-NEXT: ret i32 0
+; Since we know the location is invariant, we can forward the
+; load across the potentially aliasing store.
+
+  %a = load i32* %q, !tbaa !10
+  store i32 15, i32* %p
+  %b = load i32* %q, !tbaa !10
+  %c = sub i32 %a, %b
+  ret i32 %c
+}
+define i32 @test9(i32* %p, i32* %q) {
+; CHECK-LABEL: test9
+; CHECK-NEXT: call void @clobber()
+; CHECK-NEXT: ret i32 0
+; Since we know the location is invariant, we can forward the
+; load across the potentially aliasing store (within the call).
+
+  %a = load i32* %q, !tbaa !10
+  call void @clobber()
+  %b = load i32* %q, !tbaa !10
+  %c = sub i32 %a, %b
+  ret i32 %c
+}
+
+
+declare void @clobber()
 declare i32 @foo(i8*) readonly
 
-; CHECK: [[TAGC]] = metadata !{metadata [[TYPEC:!.*]], metadata [[TYPEC]], i64 0}
-; CHECK: [[TYPEC]] = metadata !{metadata !"C", metadata [[TYPEA:!.*]]}
-; CHECK: [[TYPEA]] = metadata !{metadata !"A", metadata !{{.*}}}
-; CHECK: [[TAGB]] = metadata !{metadata [[TYPEB:!.*]], metadata [[TYPEB]], i64 0}
-; CHECK: [[TYPEB]] = metadata !{metadata !"B", metadata [[TYPEA]]}
-; CHECK: [[TAGA]] = metadata !{metadata [[TYPEA]], metadata [[TYPEA]], i64 0}
-!0 = metadata !{metadata !5, metadata !5, i64 0}
-!1 = metadata !{metadata !6, metadata !6, i64 0}
-!2 = metadata !{metadata !"tbaa root", null}
-!3 = metadata !{metadata !7, metadata !7, i64 0}
-!4 = metadata !{metadata !8, metadata !8, i64 0}
-!5 = metadata !{metadata !"C", metadata !6}
-!6 = metadata !{metadata !"A", metadata !2}
-!7 = metadata !{metadata !"B", metadata !6}
-!8 = metadata !{metadata !"another root", null}
+; CHECK: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
+; CHECK: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
+; CHECK: [[TYPEA]] = !{!"A", !{{.*}}}
+; CHECK: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
+; CHECK: [[TYPEB]] = !{!"B", [[TYPEA]]}
+; CHECK: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
+!0 = !{!5, !5, i64 0}
+!1 = !{!6, !6, i64 0}
+!2 = !{!"tbaa root", null}
+!3 = !{!7, !7, i64 0}
+!4 = !{!8, !8, i64 0}
+!5 = !{!"C", !6}
+!6 = !{!"A", !2}
+!7 = !{!"B", !6}
+!8 = !{!"another root", null}
+
+
+;; A TBAA structure who's only point is to have a constant location
+!9 = !{!"yet another root"}
+!10 = !{!"node", !9, i64 1}
+
diff --git a/test/Transforms/GVN/volatile.ll b/test/Transforms/GVN/volatile.ll
new file mode 100644
index 0000000..5ba03d9
--- /dev/null
+++ b/test/Transforms/GVN/volatile.ll
@@ -0,0 +1,157 @@
+; Tests that check our handling of volatile instructions encountered
+; when scanning for dependencies
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
+
+; Check that we can bypass a volatile load when searching
+; for dependencies of a non-volatile load
+define i32 @test1(i32* nocapture %p, i32* nocapture %q) {
+; CHECK-LABEL: test1
+; CHECK:      %0 = load volatile i32* %q
+; CHECK-NEXT: ret i32 0
+entry:
+  %x = load i32* %p
+  load volatile i32* %q
+  %y = load i32* %p
+  %add = sub i32 %y, %x
+  ret i32 %add
+}
+
+; We can not value forward if the query instruction is 
+; volatile, this would be (in effect) removing the volatile load
+define i32 @test2(i32* nocapture %p, i32* nocapture %q) {
+; CHECK-LABEL: test2
+; CHECK:      %x = load i32* %p
+; CHECK-NEXT: %y = load volatile i32* %p
+; CHECK-NEXT: %add = sub i32 %y, %x
+entry:
+  %x = load i32* %p
+  %y = load volatile i32* %p
+  %add = sub i32 %y, %x
+  ret i32 %add
+}
+
+; If the query instruction is itself volatile, we *cannot*
+; reorder it even if p and q are noalias
+define i32 @test3(i32* noalias nocapture %p, i32* noalias nocapture %q) {
+; CHECK-LABEL: test3
+; CHECK:      %x = load i32* %p
+; CHECK-NEXT: %0 = load volatile i32* %q
+; CHECK-NEXT: %y = load volatile i32* %p
+entry:
+  %x = load i32* %p
+  load volatile i32* %q
+  %y = load volatile i32* %p
+  %add = sub i32 %y, %x
+  ret i32 %add
+}
+
+; If an encountered instruction is both volatile and ordered, 
+; we need to use the strictest ordering of either.  In this 
+; case, the ordering prevents forwarding.
+define i32 @test4(i32* noalias nocapture %p, i32* noalias nocapture %q) {
+; CHECK-LABEL: test4
+; CHECK:      %x = load i32* %p
+; CHECK-NEXT: %0 = load atomic volatile i32* %q seq_cst 
+; CHECK-NEXT: %y = load atomic i32* %p seq_cst
+entry:
+  %x = load i32* %p
+  load atomic volatile i32* %q seq_cst, align 4
+  %y = load atomic i32* %p seq_cst, align 4
+  %add = sub i32 %y, %x
+  ret i32 %add
+}
+
+; Value forwarding from a volatile load is perfectly legal
+define i32 @test5(i32* nocapture %p, i32* nocapture %q) {
+; CHECK-LABEL: test5
+; CHECK:      %x = load volatile i32* %p
+; CHECK-NEXT: ret i32 0
+entry:
+  %x = load volatile i32* %p
+  %y = load i32* %p
+  %add = sub i32 %y, %x
+  ret i32 %add
+}
+
+; Does cross block redundancy elimination work with volatiles?
+define i32 @test6(i32* noalias nocapture %p, i32* noalias nocapture %q) {
+; CHECK-LABEL: test6
+; CHECK:      %y1 = load i32* %p
+; CHECK-LABEL: header
+; CHECK:      %x = load volatile i32* %q
+; CHECK-NEXT: %add = sub i32 %y1, %x
+entry:
+  %y1 = load i32* %p
+  call void @use(i32 %y1)
+  br label %header
+header:
+  %x = load volatile i32* %q
+  %y = load i32* %p
+  %add = sub i32 %y, %x
+  %cnd = icmp eq i32 %add, 0
+  br i1 %cnd, label %exit, label %header
+exit:
+  ret i32 %add
+}
+
+; Does cross block PRE work with volatiles?
+define i32 @test7(i1 %c, i32* noalias nocapture %p, i32* noalias nocapture %q) {
+; CHECK-LABEL: test7
+; CHECK-LABEL: entry.header_crit_edge:
+; CHECK:       %y.pre = load i32* %p
+; CHECK-LABEL: skip:
+; CHECK:       %y1 = load i32* %p
+; CHECK-LABEL: header:
+; CHECK:      %y = phi i32
+; CHECK-NEXT: %x = load volatile i32* %q
+; CHECK-NEXT: %add = sub i32 %y, %x
+entry:
+  br i1 %c, label %header, label %skip
+skip:
+  %y1 = load i32* %p
+  call void @use(i32 %y1)
+  br label %header
+header:
+  %x = load volatile i32* %q
+  %y = load i32* %p
+  %add = sub i32 %y, %x
+  %cnd = icmp eq i32 %add, 0
+  br i1 %cnd, label %exit, label %header
+exit:
+  ret i32 %add
+}
+
+; Another volatile PRE case - two paths through a loop
+; load in preheader, one path read only, one not
+define i32 @test8(i1 %b, i1 %c, i32* noalias %p, i32* noalias %q) {
+; CHECK-LABEL: test8
+; CHECK-LABEL: entry
+; CHECK:       %y1 = load i32* %p
+; CHECK-LABEL: header:
+; CHECK:      %y = phi i32
+; CHECK-NEXT: %x = load volatile i32* %q
+; CHECK-NOT:  load
+; CHECK-LABEL: skip.header_crit_edge:
+; CHECK:       %y.pre = load i32* %p
+entry:
+  %y1 = load i32* %p
+  call void @use(i32 %y1)
+  br label %header
+header:
+  %x = load volatile i32* %q
+  %y = load i32* %p
+  call void @use(i32 %y)
+  br i1 %b, label %skip, label %header
+skip:
+  ; escaping the arguments is explicitly required since we marked 
+  ; them noalias
+  call void @clobber(i32* %p, i32* %q)
+  br i1 %c, label %header, label %exit
+exit:
+  %add = sub i32 %y, %x
+  ret i32 %add
+}
+
+declare void @use(i32) readonly
+declare void @clobber(i32* %p, i32* %q)
+
diff --git a/test/Transforms/GlobalDCE/pr20981.ll b/test/Transforms/GlobalDCE/pr20981.ll
index 92d2840..0eaa6b8 100644
--- a/test/Transforms/GlobalDCE/pr20981.ll
+++ b/test/Transforms/GlobalDCE/pr20981.ll
@@ -6,10 +6,10 @@ $c1 = comdat any
 @a1 = linkonce_odr alias void ()* @f1
 ; CHECK: @a1 = linkonce_odr alias void ()* @f1
 
-define linkonce_odr void @f1() comdat $c1 {
+define linkonce_odr void @f1() comdat($c1) {
   ret void
 }
-; CHECK: define linkonce_odr void @f1() comdat $c1
+; CHECK: define linkonce_odr void @f1() comdat($c1)
 
 define void @g() {
   call void @f1()
diff --git a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
index 0513829..049eef1 100644
--- a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
+++ b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
@@ -6,14 +6,14 @@
 define i32 @foo(i32 %i) nounwind ssp {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !3, metadata !{})
+  call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !3, metadata !{})
   %0 = icmp eq i32 %i, 1, !dbg !7                 ; <i1> [#uses=1]
   br i1 %0, label %bb, label %bb1, !dbg !7
 
 bb:                                               ; preds = %entry
   store i32 0, i32* @Stop, align 4, !dbg !9
   %1 = mul nsw i32 %i, 42, !dbg !10               ; <i32> [#uses=1]
-  call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !3, metadata !{}), !dbg !10
+  call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !3, metadata !{}), !dbg !10
   br label %bb2, !dbg !10
 
 bb1:                                              ; preds = %entry
@@ -55,25 +55,25 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 !llvm.dbg.gv = !{!0}
 
-!0 = metadata !{metadata !"0x34\00Stop\00Stop\00\002\001\001", metadata !1, metadata !1, metadata !2, i32* @Stop} ; [ DW_TAG_variable ]
-!1 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !20, metadata !21, metadata !21, null, null, null} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !1} ; [ DW_TAG_base_type ]
-!3 = metadata !{metadata !"0x101\00i\004\000", metadata !4, metadata !1, metadata !2} ; [ DW_TAG_arg_variable ]
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00foo\004\000\001\000\006\000\000\000", i32 0, metadata !1, metadata !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !1, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!6 = metadata !{metadata !2, metadata !2}
-!7 = metadata !{i32 5, i32 0, metadata !8, null}
-!8 = metadata !{metadata !"0xb\000\000\000", metadata !20, metadata !4} ; [ DW_TAG_lexical_block ]
-!9 = metadata !{i32 6, i32 0, metadata !8, null}
-!10 = metadata !{i32 7, i32 0, metadata !8, null}
-!11 = metadata !{i32 9, i32 0, metadata !8, null}
-!12 = metadata !{i32 11, i32 0, metadata !8, null}
-!13 = metadata !{i32 14, i32 0, metadata !14, null}
-!14 = metadata !{metadata !"0xb\000\000\000", metadata !20, metadata !15} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{metadata !"0x2e\00bar\00bar\00bar\0013\000\001\000\006\000\000\000", i32 0, metadata !1, metadata !16, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !1, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{metadata !2}
-!18 = metadata !{i32 15, i32 0, metadata !14, null}
-!19 = metadata !{i32 16, i32 0, metadata !14, null}
-!20 = metadata !{metadata !"g.c", metadata !"/tmp"}
-!21 = metadata !{i32 0}
+!0 = !{!"0x34\00Stop\00Stop\00\002\001\001", !1, !1, !2, i32* @Stop} ; [ DW_TAG_variable ]
+!1 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !20, !21, !21, null, null, null} ; [ DW_TAG_compile_unit ]
+!2 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !1} ; [ DW_TAG_base_type ]
+!3 = !{!"0x101\00i\004\000", !4, !1, !2} ; [ DW_TAG_arg_variable ]
+!4 = !{!"0x2e\00foo\00foo\00foo\004\000\001\000\006\000\000\000", i32 0, !1, !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!5 = !{!"0x15\00\000\000\000\000\000\000", !1, null, null, !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = !{!2, !2}
+!7 = !MDLocation(line: 5, scope: !8)
+!8 = !{!"0xb\000\000\000", !20, !4} ; [ DW_TAG_lexical_block ]
+!9 = !MDLocation(line: 6, scope: !8)
+!10 = !MDLocation(line: 7, scope: !8)
+!11 = !MDLocation(line: 9, scope: !8)
+!12 = !MDLocation(line: 11, scope: !8)
+!13 = !MDLocation(line: 14, scope: !14)
+!14 = !{!"0xb\000\000\000", !20, !15} ; [ DW_TAG_lexical_block ]
+!15 = !{!"0x2e\00bar\00bar\00bar\0013\000\001\000\006\000\000\000", i32 0, !1, !16, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!16 = !{!"0x15\00\000\000\000\000\000\000", !1, null, null, !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = !{!2}
+!18 = !MDLocation(line: 15, scope: !14)
+!19 = !MDLocation(line: 16, scope: !14)
+!20 = !{!"g.c", !"/tmp"}
+!21 = !{i32 0}
diff --git a/test/Transforms/GlobalOpt/externally-initialized-global-ctr.ll b/test/Transforms/GlobalOpt/externally-initialized-global-ctr.ll
index 9295c20..675211b 100644
--- a/test/Transforms/GlobalOpt/externally-initialized-global-ctr.ll
+++ b/test/Transforms/GlobalOpt/externally-initialized-global-ctr.ll
@@ -32,4 +32,4 @@ define void @print() {
   ret void
 }
 
-!2009 = metadata !{}
+!2009 = !{}
diff --git a/test/Transforms/GlobalOpt/metadata.ll b/test/Transforms/GlobalOpt/metadata.ll
index ecf3f94..fb60b66 100644
--- a/test/Transforms/GlobalOpt/metadata.ll
+++ b/test/Transforms/GlobalOpt/metadata.ll
@@ -13,14 +13,20 @@ define i32 @main(i32 %argc, i8** %argv) {
 }
 
 define void @foo(i32 %x) {
-  call void @llvm.foo(metadata !{i8*** @G, i32 %x})
-; CHECK: call void @llvm.foo(metadata !{null, i32 %x})
+; Note: these arguments look like MDNodes, but they're really syntactic sugar
+; for 'MetadataAsValue::get(ValueAsMetadata::get(Value*))'.  When @G drops to
+; null, the ValueAsMetadata instance gets replaced by metadata !{}, or
+; MDNode::get({}).
+  call void @llvm.foo(metadata i8*** @G, metadata i32 %x)
+; CHECK: call void @llvm.foo(metadata ![[EMPTY:[0-9]+]], metadata i32 %x)
   ret void
 }
 
-declare void @llvm.foo(metadata) nounwind readnone
+declare void @llvm.foo(metadata, metadata) nounwind readnone
 
 !named = !{!0}
+; CHECK: !named = !{![[NULL:[0-9]+]]}
 
-!0 = metadata !{i8*** @G}
-; CHECK: !0 = metadata !{null}
+!0 = !{i8*** @G}
+; CHECK-DAG: ![[NULL]] = !{null}
+; CHECK-DAG: ![[EMPTY]] = !{}
diff --git a/test/Transforms/GlobalOpt/pr21191.ll b/test/Transforms/GlobalOpt/pr21191.ll
index 39b8eee..34e15cb 100644
--- a/test/Transforms/GlobalOpt/pr21191.ll
+++ b/test/Transforms/GlobalOpt/pr21191.ll
@@ -3,15 +3,15 @@
 $c = comdat any
 ; CHECK: $c = comdat any
 
-define linkonce_odr void @foo() comdat $c {
+define linkonce_odr void @foo() comdat($c) {
   ret void
 }
-; CHECK: define linkonce_odr void @foo() comdat $c
+; CHECK: define linkonce_odr void @foo() comdat($c)
 
-define linkonce_odr void @bar() comdat $c {
+define linkonce_odr void @bar() comdat($c) {
   ret void
 }
-; CHECK: define linkonce_odr void @bar() comdat $c
+; CHECK: define linkonce_odr void @bar() comdat($c)
 
 define void @zed()  {
   call void @foo()
diff --git a/test/Transforms/GlobalOpt/preserve-comdats.ll b/test/Transforms/GlobalOpt/preserve-comdats.ll
index 08188b9..0148f00 100644
--- a/test/Transforms/GlobalOpt/preserve-comdats.ll
+++ b/test/Transforms/GlobalOpt/preserve-comdats.ll
@@ -2,9 +2,9 @@
 
 $comdat_global = comdat any
 
-@comdat_global = weak_odr global i8 0, comdat $comdat_global
+@comdat_global = weak_odr global i8 0, comdat($comdat_global)
 @simple_global = internal global i8 0
-; CHECK: @comdat_global = weak_odr global i8 0, comdat $comdat_global
+; CHECK: @comdat_global = weak_odr global i8 0, comdat{{$}}
 ; CHECK: @simple_global = internal global i8 42
 
 @llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [
@@ -20,7 +20,7 @@ define void @init_comdat_global() {
 }
 ; CHECK: define void @init_comdat_global()
 
-define internal void @init_simple_global() comdat $comdat_global {
+define internal void @init_simple_global() comdat($comdat_global) {
   store i8 42, i8* @simple_global
   ret void
 }
diff --git a/test/Transforms/IRCE/bug-mismatched-types.ll b/test/Transforms/IRCE/bug-mismatched-types.ll
new file mode 100644
index 0000000..9ff7249
--- /dev/null
+++ b/test/Transforms/IRCE/bug-mismatched-types.ll
@@ -0,0 +1,66 @@
+; RUN: opt -irce -S < %s
+
+; These test cases don't check the correctness of the transform, but
+; that the -irce does not crash in the presence of certain things in
+; the IR:
+
+define void @mismatched_types_1() {
+; In this test case, the safe range for the only range check in the
+; loop is of type [i32, i32) while the backedge taken count is of type
+; i64.
+
+; CHECK-LABEL: mismatched_types_1
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %0 = trunc i64 %indvars.iv to i32
+  %1 = icmp ult i32 %0, 7
+  br i1 %1, label %switch.lookup, label %for.inc
+
+switch.lookup:
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp55 = icmp slt i64 %indvars.iv.next, 11
+  br i1 %cmp55, label %for.body, label %for.end
+
+for.end:
+  unreachable
+}
+
+define void @mismatched_types_2() {
+; In this test case, there are two range check in the loop, one with a
+; safe range of type [i32, i32) and one with a safe range of type
+; [i64, i64).
+
+; CHECK-LABEL: mismatched_types_2
+entry:
+  br label %for.body.a
+
+for.body.a:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %cond.a = icmp ult i64 %indvars.iv, 7
+  br i1 %cond.a, label %switch.lookup.a, label %for.body.b
+
+switch.lookup.a:
+  br label %for.body.b
+
+for.body.b:
+  %truncated = trunc i64 %indvars.iv to i32
+  %cond.b = icmp ult i32 %truncated, 7
+  br i1 %cond.b, label %switch.lookup.b, label %for.inc
+
+switch.lookup.b:
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp55 = icmp slt i64 %indvars.iv.next, 11
+  br i1 %cmp55, label %for.body.a, label %for.end
+
+for.end:
+  unreachable
+}
diff --git a/test/Transforms/IRCE/decrementing-loop.ll b/test/Transforms/IRCE/decrementing-loop.ll
new file mode 100644
index 0000000..877a2c2
--- /dev/null
+++ b/test/Transforms/IRCE/decrementing-loop.ll
@@ -0,0 +1,43 @@
+; RUN: opt -irce -S < %s | FileCheck %s
+
+define void @decrementing_loop(i32 *%arr, i32 *%a_len_ptr, i32 %n) {
+ entry:
+  %len = load i32* %a_len_ptr, !range !0
+  %first.itr.check = icmp sgt i32 %n, 0
+  %start = sub i32 %n, 1
+  br i1 %first.itr.check, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ %start, %entry ] , [ %idx.dec, %in.bounds ]
+  %idx.dec = sub i32 %idx, 1
+  %abc.high = icmp slt i32 %idx, %len
+  %abc.low = icmp sge i32 %idx, 0
+  %abc = and i1 %abc.low, %abc.high
+  br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1
+
+ in.bounds:
+  %addr = getelementptr i32* %arr, i32 %idx
+  store i32 0, i32* %addr
+  %next = icmp sgt i32 %idx.dec, -1
+  br i1 %next, label %loop, label %exit
+
+ out.of.bounds:
+  ret void
+
+ exit:
+  ret void
+
+; CHECK: loop.preheader:
+; CHECK:   [[indvar_start:[^ ]+]] = add i32 %n, -1
+; CHECK:   [[not_len:[^ ]+]] = sub i32 -1, %len
+; CHECK:   [[not_n:[^ ]+]] = sub i32 -1, %n
+; CHECK:   [[not_len_hiclamp_cmp:[^ ]+]] = icmp sgt i32 [[not_len]], [[not_n]]
+; CHECK:   [[not_len_hiclamp:[^ ]+]] = select i1 [[not_len_hiclamp_cmp]], i32 [[not_len]], i32 [[not_n]]
+; CHECK:   [[len_hiclamp:[^ ]+]] = sub i32 -1, [[not_len_hiclamp]]
+; CHECK:   [[not_exit_preloop_at_cmp:[^ ]+]] = icmp sgt i32 [[len_hiclamp]], 0
+; CHECK:   [[not_exit_preloop_at:[^ ]+]] = select i1 [[not_exit_preloop_at_cmp]], i32 [[len_hiclamp]], i32 0
+; CHECK:   %exit.preloop.at = add i32 [[not_exit_preloop_at]], -1
+}
+
+!0 = !{i32 0, i32 2147483647}
+!1 = !{!"branch_weights", i32 64, i32 4}
diff --git a/test/Transforms/IRCE/low-becount.ll b/test/Transforms/IRCE/low-becount.ll
new file mode 100644
index 0000000..2ddaf19
--- /dev/null
+++ b/test/Transforms/IRCE/low-becount.ll
@@ -0,0 +1,32 @@
+; RUN: opt -irce-print-changed-loops -irce -S < %s 2>&1 | FileCheck %s
+
+; CHECK-NOT: constrained Loop
+
+define void @low_profiled_be_count(i32 *%arr, i32 *%a_len_ptr, i32 %n) {
+ entry:
+  %len = load i32* %a_len_ptr, !range !0
+  %first.itr.check = icmp sgt i32 %n, 0
+  br i1 %first.itr.check, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ] , [ %idx.next, %in.bounds ]
+  %idx.next = add i32 %idx, 1
+  %abc = icmp slt i32 %idx, %len
+  br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1
+
+ in.bounds:
+  %addr = getelementptr i32* %arr, i32 %idx
+  store i32 0, i32* %addr
+  %next = icmp slt i32 %idx.next, %n
+  br i1 %next, label %loop, label %exit, !prof !2
+
+ out.of.bounds:
+  ret void
+
+ exit:
+  ret void
+}
+
+!0 = !{i32 0, i32 2147483647}
+!1 = !{!"branch_weights", i32 64, i32 4}
+!2 = !{!"branch_weights", i32 4, i32 64}
diff --git a/test/Transforms/IRCE/multiple-access-no-preloop.ll b/test/Transforms/IRCE/multiple-access-no-preloop.ll
new file mode 100644
index 0000000..304bb4d
--- /dev/null
+++ b/test/Transforms/IRCE/multiple-access-no-preloop.ll
@@ -0,0 +1,66 @@
+; RUN: opt -irce -S < %s | FileCheck %s
+
+define void @multiple_access_no_preloop(
+    i32* %arr_a, i32* %a_len_ptr, i32* %arr_b, i32* %b_len_ptr, i32 %n) {
+
+ entry:
+  %len.a = load i32* %a_len_ptr, !range !0
+  %len.b = load i32* %b_len_ptr, !range !0
+  %first.itr.check = icmp sgt i32 %n, 0
+  br i1 %first.itr.check, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ] , [ %idx.next, %in.bounds.b ]
+  %idx.next = add i32 %idx, 1
+  %abc.a = icmp slt i32 %idx, %len.a
+  br i1 %abc.a, label %in.bounds.a, label %out.of.bounds, !prof !1
+
+ in.bounds.a:
+  %addr.a = getelementptr i32* %arr_a, i32 %idx
+  store i32 0, i32* %addr.a
+  %abc.b = icmp slt i32 %idx, %len.b
+  br i1 %abc.b, label %in.bounds.b, label %out.of.bounds, !prof !1
+
+ in.bounds.b:
+  %addr.b = getelementptr i32* %arr_b, i32 %idx
+  store i32 -1, i32* %addr.b
+  %next = icmp slt i32 %idx.next, %n
+  br i1 %next, label %loop, label %exit
+
+ out.of.bounds:
+  ret void
+
+ exit:
+  ret void
+}
+
+; CHECK-LABEL: multiple_access_no_preloop
+
+; CHECK-LABEL: loop.preheader:
+; CHECK: [[not_len_b:[^ ]+]] = sub i32 -1, %len.b
+; CHECK: [[not_len_a:[^ ]+]] = sub i32 -1, %len.a
+; CHECK: [[smax_not_len_cond:[^ ]+]] = icmp sgt i32 [[not_len_b]], [[not_len_a]]
+; CHECK: [[smax_not_len:[^ ]+]] = select i1 [[smax_not_len_cond]], i32 [[not_len_b]], i32 [[not_len_a]]
+; CHECK: [[not_n:[^ ]+]] = sub i32 -1, %n
+; CHECK: [[not_upper_limit_cond_loclamp:[^ ]+]] = icmp sgt i32 [[smax_not_len]], [[not_n]]
+; CHECK: [[not_upper_limit_loclamp:[^ ]+]] = select i1 [[not_upper_limit_cond_loclamp]], i32 [[smax_not_len]], i32 [[not_n]]
+; CHECK: [[upper_limit_loclamp:[^ ]+]] = sub i32 -1, [[not_upper_limit_loclamp]]
+; CHECK: [[upper_limit_cmp:[^ ]+]] = icmp sgt i32 [[upper_limit_loclamp]], 0
+; CHECK: [[upper_limit:[^ ]+]] = select i1 [[upper_limit_cmp]], i32 [[upper_limit_loclamp]], i32 0
+
+; CHECK-LABEL: loop:
+; CHECK: br i1 true, label %in.bounds.a, label %out.of.bounds
+
+; CHECK-LABEL: in.bounds.a:
+; CHECK: br i1 true, label %in.bounds.b, label %out.of.bounds
+
+; CHECK-LABEL: in.bounds.b:
+; CHECK: [[main_loop_cond:[^ ]+]] = icmp slt i32 %idx.next, [[upper_limit]]
+; CHECK: br i1 [[main_loop_cond]], label %loop, label %main.exit.selector
+
+; CHECK-LABEL: in.bounds.b.postloop:
+; CHECK: %next.postloop = icmp slt i32 %idx.next.postloop, %n
+; CHECK: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit
+
+!0 = !{i32 0, i32 2147483647}
+!1 = !{!"branch_weights", i32 64, i32 4}
diff --git a/test/Transforms/IRCE/not-likely-taken.ll b/test/Transforms/IRCE/not-likely-taken.ll
new file mode 100644
index 0000000..c044530
--- /dev/null
+++ b/test/Transforms/IRCE/not-likely-taken.ll
@@ -0,0 +1,40 @@
+; RUN: opt -verify-loop-info -irce-print-changed-loops -irce < %s 2>&1 | FileCheck %s
+
+; CHECK-NOT: constrained Loop
+
+define void @multiple_access_no_preloop(
+    i32* %arr_a, i32* %a_len_ptr, i32* %arr_b, i32* %b_len_ptr, i32 %n) {
+
+ entry:
+  %len.a = load i32* %a_len_ptr, !range !0
+  %len.b = load i32* %b_len_ptr, !range !0
+  %first.itr.check = icmp sgt i32 %n, 0
+  br i1 %first.itr.check, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ] , [ %idx.next, %in.bounds.b ]
+  %idx.next = add i32 %idx, 1
+  %abc.a = icmp slt i32 %idx, %len.a
+  br i1 %abc.a, label %in.bounds.a, label %out.of.bounds, !prof !1
+
+ in.bounds.a:
+  %addr.a = getelementptr i32* %arr_a, i32 %idx
+  store i32 0, i32* %addr.a
+  %abc.b = icmp slt i32 %idx, %len.b
+  br i1 %abc.b, label %in.bounds.b, label %out.of.bounds, !prof !1
+
+ in.bounds.b:
+  %addr.b = getelementptr i32* %arr_b, i32 %idx
+  store i32 -1, i32* %addr.b
+  %next = icmp slt i32 %idx.next, %n
+  br i1 %next, label %loop, label %exit
+
+ out.of.bounds:
+  ret void
+
+ exit:
+  ret void
+}
+
+!0 = !{i32 0, i32 2147483647}
+!1 = !{!"branch_weights", i32 1, i32 1}
+\ No newline at end of file
diff --git a/test/Transforms/IRCE/single-access-no-preloop.ll b/test/Transforms/IRCE/single-access-no-preloop.ll
new file mode 100644
index 0000000..4d47ba8
--- /dev/null
+++ b/test/Transforms/IRCE/single-access-no-preloop.ll
@@ -0,0 +1,116 @@
+; RUN: opt -irce -S < %s | FileCheck %s
+
+define void @single_access_no_preloop_no_offset(i32 *%arr, i32 *%a_len_ptr, i32 %n) {
+ entry:
+  %len = load i32* %a_len_ptr, !range !0
+  %first.itr.check = icmp sgt i32 %n, 0
+  br i1 %first.itr.check, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ] , [ %idx.next, %in.bounds ]
+  %idx.next = add i32 %idx, 1
+  %abc = icmp slt i32 %idx, %len
+  br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1
+
+ in.bounds:
+  %addr = getelementptr i32* %arr, i32 %idx
+  store i32 0, i32* %addr
+  %next = icmp slt i32 %idx.next, %n
+  br i1 %next, label %loop, label %exit
+
+ out.of.bounds:
+  ret void
+
+ exit:
+  ret void
+}
+
+; CHECK-LABEL: single_access_no_preloop
+
+; CHECK-LABEL: loop:
+; CHECK: br i1 true, label %in.bounds, label %out.of.bounds
+
+; CHECK-LABEL: main.exit.selector:
+; CHECK-NEXT: [[continue:%[^ ]+]] = icmp slt i32 %idx.next, %n
+; CHECK-NEXT: br i1 [[continue]], label %main.pseudo.exit, label %exit.loopexit
+
+; CHECK-LABEL: main.pseudo.exit:
+; CHECK-NEXT: %idx.copy = phi i32 [ 0, %loop.preheader ], [ %idx.next, %main.exit.selector ]
+; CHECK-NEXT: %indvar.end = phi i32 [ 0, %loop.preheader ], [ %idx.next, %main.exit.selector ]
+; CHECK-NEXT: br label %postloop
+
+; CHECK-LABEL: postloop:
+; CHECK-NEXT: br label %loop.postloop
+
+; CHECK-LABEL: loop.postloop:
+; CHECK-NEXT: %idx.postloop = phi i32 [ %idx.next.postloop, %in.bounds.postloop ], [ %idx.copy, %postloop ]
+; CHECK-NEXT: %idx.next.postloop = add i32 %idx.postloop, 1
+; CHECK-NEXT: %abc.postloop = icmp slt i32 %idx.postloop, %len
+; CHECK-NEXT: br i1 %abc.postloop, label %in.bounds.postloop, label %out.of.bounds
+
+; CHECK-LABEL: in.bounds.postloop:
+; CHECK-NEXT: %addr.postloop = getelementptr i32* %arr, i32 %idx.postloop
+; CHECK-NEXT: store i32 0, i32* %addr.postloop
+; CHECK-NEXT: %next.postloop = icmp slt i32 %idx.next.postloop, %n
+; CHECK-NEXT: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit
+
+
+define void @single_access_no_preloop_with_offset(i32 *%arr, i32 *%a_len_ptr, i32 %n) {
+ entry:
+  %len = load i32* %a_len_ptr, !range !0
+  %first.itr.check = icmp sgt i32 %n, 0
+  br i1 %first.itr.check, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ] , [ %idx.next, %in.bounds ]
+  %idx.next = add i32 %idx, 1
+  %idx.for.abc = add i32 %idx, 4
+  %abc = icmp slt i32 %idx.for.abc, %len
+  br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1
+
+ in.bounds:
+  %addr = getelementptr i32* %arr, i32 %idx.for.abc
+  store i32 0, i32* %addr
+  %next = icmp slt i32 %idx.next, %n
+  br i1 %next, label %loop, label %exit
+
+ out.of.bounds:
+  ret void
+
+ exit:
+  ret void
+}
+
+; CHECK-LABEL: single_access_no_preloop_with_offset
+
+; CHECK-LABEL: loop.preheader:
+; CHECK: [[not_n:[^ ]+]] = sub i32 -1, %n
+; CHECK: [[not_safe_range_end:[^ ]+]] = sub i32 3, %len
+; CHECK: [[not_exit_main_loop_at_hiclamp_cmp:[^ ]+]] = icmp sgt i32 [[not_n]], [[not_safe_range_end]]
+; CHECK: [[not_exit_main_loop_at_hiclamp:[^ ]+]] = select i1 [[not_exit_main_loop_at_hiclamp_cmp]], i32 [[not_n]], i32 [[not_safe_range_end]]
+; CHECK: [[exit_main_loop_at_hiclamp:[^ ]+]] = sub i32 -1, [[not_exit_main_loop_at_hiclamp]]
+; CHECK: [[exit_main_loop_at_loclamp_cmp:[^ ]+]] = icmp sgt i32 [[exit_main_loop_at_hiclamp]], 0
+; CHECK: [[exit_main_loop_at_loclamp:[^ ]+]] = select i1 [[exit_main_loop_at_loclamp_cmp]], i32 [[exit_main_loop_at_hiclamp]], i32 0
+; CHECK: [[enter_main_loop:[^ ]+]] = icmp slt i32 0, [[exit_main_loop_at_loclamp]]
+; CHECK: br i1 [[enter_main_loop]], label %loop, label %main.pseudo.exit
+
+; CHECK-LABEL: loop:
+; CHECK: br i1 true, label %in.bounds, label %out.of.bounds
+
+; CHECK-LABEL: in.bounds:
+; CHECK: [[continue_main_loop:[^ ]+]] = icmp slt i32 %idx.next, [[exit_main_loop_at_loclamp]]
+; CHECK: br i1 [[continue_main_loop]], label %loop, label %main.exit.selector
+
+; CHECK-LABEL: main.pseudo.exit:
+; CHECK:  %idx.copy = phi i32 [ 0, %loop.preheader ], [ %idx.next, %main.exit.selector ]
+; CHECK:  br label %postloop
+
+; CHECK-LABEL: loop.postloop:
+; CHECK: %idx.postloop = phi i32 [ %idx.next.postloop, %in.bounds.postloop ], [ %idx.copy, %postloop ]
+
+; CHECK-LABEL: in.bounds.postloop:
+; CHECK: %next.postloop = icmp slt i32 %idx.next.postloop, %n
+; CHECK: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit
+
+!0 = !{i32 0, i32 2147483647}
+!1 = !{!"branch_weights", i32 64, i32 4}
diff --git a/test/Transforms/IRCE/single-access-with-preloop.ll b/test/Transforms/IRCE/single-access-with-preloop.ll
new file mode 100644
index 0000000..16426b8
--- /dev/null
+++ b/test/Transforms/IRCE/single-access-with-preloop.ll
@@ -0,0 +1,71 @@
+; RUN: opt -irce -S < %s | FileCheck %s
+
+define void @single_access_with_preloop(i32 *%arr, i32 *%a_len_ptr, i32 %n, i32 %offset) {
+ entry:
+  %len = load i32* %a_len_ptr, !range !0
+  %first.itr.check = icmp sgt i32 %n, 0
+  br i1 %first.itr.check, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ] , [ %idx.next, %in.bounds ]
+  %idx.next = add i32 %idx, 1
+  %array.idx = add i32 %idx, %offset
+  %abc.high = icmp slt i32 %array.idx, %len
+  %abc.low = icmp sge i32 %array.idx, 0
+  %abc = and i1 %abc.low, %abc.high
+  br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1
+
+ in.bounds:
+  %addr = getelementptr i32* %arr, i32 %array.idx
+  store i32 0, i32* %addr
+  %next = icmp slt i32 %idx.next, %n
+  br i1 %next, label %loop, label %exit
+
+ out.of.bounds:
+  ret void
+
+ exit:
+  ret void
+}
+
+; CHECK-LABEL: loop.preheader:
+; CHECK: [[not_safe_start:[^ ]+]] = add i32 %offset, -1
+; CHECK: [[not_n:[^ ]+]] = sub i32 -1, %n
+; CHECK: [[not_exit_preloop_at_cond_loclamp:[^ ]+]] = icmp sgt i32 [[not_safe_start]], [[not_n]]
+; CHECK: [[not_exit_preloop_at_loclamp:[^ ]+]] = select i1 [[not_exit_preloop_at_cond_loclamp]], i32 [[not_safe_start]], i32 [[not_n]]
+; CHECK: [[exit_preloop_at_loclamp:[^ ]+]] = sub i32 -1, [[not_exit_preloop_at_loclamp]]
+; CHECK: [[exit_preloop_at_cond:[^ ]+]] = icmp sgt i32 [[exit_preloop_at_loclamp]], 0
+; CHECK: [[exit_preloop_at:[^ ]+]] = select i1 [[exit_preloop_at_cond]], i32 [[exit_preloop_at_loclamp]], i32 0
+
+
+; CHECK: [[not_safe_start_2:[^ ]+]] = add i32 %offset, -1
+; CHECK: [[not_safe_end:[^ ]+]] = sub i32 [[not_safe_start_2]], %len
+; CHECK: [[not_exit_mainloop_at_cond_loclamp:[^ ]+]] = icmp sgt i32 [[not_safe_end]], [[not_n]]
+; CHECK: [[not_exit_mainloop_at_loclamp:[^ ]+]] = select i1 [[not_exit_mainloop_at_cond_loclamp]], i32 [[not_safe_end]], i32 [[not_n]]
+; CHECK: [[exit_mainloop_at_loclamp:[^ ]+]] = sub i32 -1, [[not_exit_mainloop_at_loclamp]]
+; CHECK: [[exit_mainloop_at_cmp:[^ ]+]] = icmp sgt i32 [[exit_mainloop_at_loclamp]], 0
+; CHECK: [[exit_mainloop_at:[^ ]+]] = select i1 [[exit_mainloop_at_cmp]], i32 [[exit_mainloop_at_loclamp]], i32 0
+
+
+; CHECK-LABEL: in.bounds:
+; CHECK: [[continue_mainloop_cond:[^ ]+]] = icmp slt i32 %idx.next, [[exit_mainloop_at]]
+; CHECK: br i1 [[continue_mainloop_cond]], label %loop, label %main.exit.selector
+
+; CHECK-LABEL: main.exit.selector:
+; CHECK: [[mainloop_its_left:[^ ]+]] = icmp slt i32 %idx.next, %n
+; CHECK: br i1 [[mainloop_its_left]], label %main.pseudo.exit, label %exit.loopexit
+
+; CHECK-LABEL: in.bounds.preloop:
+; CHECK: [[continue_preloop_cond:[^ ]+]] = icmp slt i32 %idx.next.preloop, [[exit_preloop_at]]
+; CHECK: br i1 [[continue_preloop_cond]], label %loop.preloop, label %preloop.exit.selector
+
+; CHECK-LABEL: preloop.exit.selector:
+; CHECK: [[preloop_its_left:[^ ]+]] = icmp slt i32 %idx.next.preloop, %n
+; CHECK: br i1 [[preloop_its_left]], label %preloop.pseudo.exit, label %exit.loopexit
+
+; CHECK-LABEL: in.bounds.postloop:
+; CHECK: %next.postloop = icmp slt i32 %idx.next.postloop, %n
+; CHECK: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit
+
+!0 = !{i32 0, i32 2147483647}
+!1 = !{!"branch_weights", i32 64, i32 4}
diff --git a/test/Transforms/IRCE/unhandled.ll b/test/Transforms/IRCE/unhandled.ll
new file mode 100644
index 0000000..3531c48
--- /dev/null
+++ b/test/Transforms/IRCE/unhandled.ll
@@ -0,0 +1,37 @@
+; RUN: opt -irce-print-changed-loops -irce -S < %s 2>&1 | FileCheck %s
+
+; Demonstrates that we don't currently handle the general expression
+; `A * I + B'.
+
+define void @general_affine_expressions(i32 *%arr, i32 *%a_len_ptr, i32 %n,
+                                        i32 %scale, i32 %offset) {
+; CHECK-NOT: constrained Loop at depth
+ entry:
+  %len = load i32* %a_len_ptr, !range !0
+  %first.itr.check = icmp sgt i32 %n, 0
+  br i1 %first.itr.check, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ] , [ %idx.next, %in.bounds ]
+  %idx.next = add i32 %idx, 1
+  %idx.mul = mul i32 %idx, %scale
+  %array.idx = add i32 %idx.mul, %offset
+  %abc.high = icmp slt i32 %array.idx, %len
+  %abc.low = icmp sge i32 %array.idx, 0
+  %abc = and i1 %abc.low, %abc.high
+  br i1 %abc, label %in.bounds, label %out.of.bounds
+
+ in.bounds:
+  %addr = getelementptr i32* %arr, i32 %array.idx
+  store i32 0, i32* %addr
+  %next = icmp slt i32 %idx.next, %n
+  br i1 %next, label %loop, label %exit
+
+ out.of.bounds:
+  ret void
+
+ exit:
+  ret void
+}
+
+!0 = !{i32 0, i32 2147483647}
diff --git a/test/Transforms/IRCE/with-parent-loops.ll b/test/Transforms/IRCE/with-parent-loops.ll
new file mode 100644
index 0000000..f8d6c83
--- /dev/null
+++ b/test/Transforms/IRCE/with-parent-loops.ll
@@ -0,0 +1,345 @@
+; RUN: opt -verify-loop-info -irce-print-changed-loops -irce < %s 2>&1 | FileCheck %s
+
+; This test checks if we update the LoopInfo correctly in the presence
+; of parents, uncles and cousins.
+
+; Function Attrs: alwaysinline
+define void @inner_loop(i32* %arr, i32* %a_len_ptr, i32 %n) #0 {
+; CHECK: irce: in function inner_loop: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting>
+
+entry:
+  %len = load i32* %a_len_ptr, !range !0
+  %first.itr.check = icmp sgt i32 %n, 0
+  br i1 %first.itr.check, label %loop, label %exit
+
+loop:                                             ; preds = %in.bounds, %entry
+  %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ]
+  %idx.next = add i32 %idx, 1
+  %abc = icmp slt i32 %idx, %len
+  br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1
+
+in.bounds:                                        ; preds = %loop
+  %addr = getelementptr i32* %arr, i32 %idx
+  store i32 0, i32* %addr
+  %next = icmp slt i32 %idx.next, %n
+  br i1 %next, label %loop, label %exit
+
+out.of.bounds:                                    ; preds = %loop
+  ret void
+
+exit:                                             ; preds = %in.bounds, %entry
+  ret void
+}
+
+; Function Attrs: alwaysinline
+define void @with_parent(i32* %arr, i32* %a_len_ptr, i32 %n, i32 %parent.count) #0 {
+; CHECK: irce: in function with_parent: constrained Loop at depth 2 containing: %loop.i<header><exiting>,%in.bounds.i<latch><exiting>
+
+entry:
+  br label %loop
+
+loop:                                             ; preds = %inner_loop.exit, %entry
+  %idx = phi i32 [ 0, %entry ], [ %idx.next, %inner_loop.exit ]
+  %idx.next = add i32 %idx, 1
+  %next = icmp ult i32 %idx.next, %parent.count
+  %len.i = load i32* %a_len_ptr, !range !0
+  %first.itr.check.i = icmp sgt i32 %n, 0
+  br i1 %first.itr.check.i, label %loop.i, label %exit.i
+
+loop.i:                                           ; preds = %in.bounds.i, %loop
+  %idx.i = phi i32 [ 0, %loop ], [ %idx.next.i, %in.bounds.i ]
+  %idx.next.i = add i32 %idx.i, 1
+  %abc.i = icmp slt i32 %idx.i, %len.i
+  br i1 %abc.i, label %in.bounds.i, label %out.of.bounds.i, !prof !1
+
+in.bounds.i:                                      ; preds = %loop.i
+  %addr.i = getelementptr i32* %arr, i32 %idx.i
+  store i32 0, i32* %addr.i
+  %next.i = icmp slt i32 %idx.next.i, %n
+  br i1 %next.i, label %loop.i, label %exit.i
+
+out.of.bounds.i:                                  ; preds = %loop.i
+  br label %inner_loop.exit
+
+exit.i:                                           ; preds = %in.bounds.i, %loop
+  br label %inner_loop.exit
+
+inner_loop.exit:                                  ; preds = %exit.i, %out.of.bounds.i
+  br i1 %next, label %loop, label %exit
+
+exit:                                             ; preds = %inner_loop.exit
+  ret void
+}
+
+; Function Attrs: alwaysinline
+define void @with_grandparent(i32* %arr, i32* %a_len_ptr, i32 %n, i32 %parent.count, i32 %grandparent.count) #0 {
+; CHECK: irce: in function with_grandparent: constrained Loop at depth 3 containing: %loop.i.i<header><exiting>,%in.bounds.i.i<latch><exiting>
+
+entry:
+  br label %loop
+
+loop:                                             ; preds = %with_parent.exit, %entry
+  %idx = phi i32 [ 0, %entry ], [ %idx.next, %with_parent.exit ]
+  %idx.next = add i32 %idx, 1
+  %next = icmp ult i32 %idx.next, %grandparent.count
+  br label %loop.i
+
+loop.i:                                           ; preds = %inner_loop.exit.i, %loop
+  %idx.i = phi i32 [ 0, %loop ], [ %idx.next.i, %inner_loop.exit.i ]
+  %idx.next.i = add i32 %idx.i, 1
+  %next.i = icmp ult i32 %idx.next.i, %parent.count
+  %len.i.i = load i32* %a_len_ptr, !range !0
+  %first.itr.check.i.i = icmp sgt i32 %n, 0
+  br i1 %first.itr.check.i.i, label %loop.i.i, label %exit.i.i
+
+loop.i.i:                                         ; preds = %in.bounds.i.i, %loop.i
+  %idx.i.i = phi i32 [ 0, %loop.i ], [ %idx.next.i.i, %in.bounds.i.i ]
+  %idx.next.i.i = add i32 %idx.i.i, 1
+  %abc.i.i = icmp slt i32 %idx.i.i, %len.i.i
+  br i1 %abc.i.i, label %in.bounds.i.i, label %out.of.bounds.i.i, !prof !1
+
+in.bounds.i.i:                                    ; preds = %loop.i.i
+  %addr.i.i = getelementptr i32* %arr, i32 %idx.i.i
+  store i32 0, i32* %addr.i.i
+  %next.i.i = icmp slt i32 %idx.next.i.i, %n
+  br i1 %next.i.i, label %loop.i.i, label %exit.i.i
+
+out.of.bounds.i.i:                                ; preds = %loop.i.i
+  br label %inner_loop.exit.i
+
+exit.i.i:                                         ; preds = %in.bounds.i.i, %loop.i
+  br label %inner_loop.exit.i
+
+inner_loop.exit.i:                                ; preds = %exit.i.i, %out.of.bounds.i.i
+  br i1 %next.i, label %loop.i, label %with_parent.exit
+
+with_parent.exit:                                 ; preds = %inner_loop.exit.i
+  br i1 %next, label %loop, label %exit
+
+exit:                                             ; preds = %with_parent.exit
+  ret void
+}
+
+; Function Attrs: alwaysinline
+define void @with_sibling(i32* %arr, i32* %a_len_ptr, i32 %n, i32 %parent.count) #0 {
+; CHECK: irce: in function with_sibling: constrained Loop at depth 2 containing: %loop.i<header><exiting>,%in.bounds.i<latch><exiting>
+; CHECK: irce: in function with_sibling: constrained Loop at depth 2 containing: %loop.i6<header><exiting>,%in.bounds.i9<latch><exiting>
+
+entry:
+  br label %loop
+
+loop:                                             ; preds = %inner_loop.exit12, %entry
+  %idx = phi i32 [ 0, %entry ], [ %idx.next, %inner_loop.exit12 ]
+  %idx.next = add i32 %idx, 1
+  %next = icmp ult i32 %idx.next, %parent.count
+  %len.i = load i32* %a_len_ptr, !range !0
+  %first.itr.check.i = icmp sgt i32 %n, 0
+  br i1 %first.itr.check.i, label %loop.i, label %exit.i
+
+loop.i:                                           ; preds = %in.bounds.i, %loop
+  %idx.i = phi i32 [ 0, %loop ], [ %idx.next.i, %in.bounds.i ]
+  %idx.next.i = add i32 %idx.i, 1
+  %abc.i = icmp slt i32 %idx.i, %len.i
+  br i1 %abc.i, label %in.bounds.i, label %out.of.bounds.i, !prof !1
+
+in.bounds.i:                                      ; preds = %loop.i
+  %addr.i = getelementptr i32* %arr, i32 %idx.i
+  store i32 0, i32* %addr.i
+  %next.i = icmp slt i32 %idx.next.i, %n
+  br i1 %next.i, label %loop.i, label %exit.i
+
+out.of.bounds.i:                                  ; preds = %loop.i
+  br label %inner_loop.exit
+
+exit.i:                                           ; preds = %in.bounds.i, %loop
+  br label %inner_loop.exit
+
+inner_loop.exit:                                  ; preds = %exit.i, %out.of.bounds.i
+  %len.i1 = load i32* %a_len_ptr, !range !0
+  %first.itr.check.i2 = icmp sgt i32 %n, 0
+  br i1 %first.itr.check.i2, label %loop.i6, label %exit.i11
+
+loop.i6:                                          ; preds = %in.bounds.i9, %inner_loop.exit
+  %idx.i3 = phi i32 [ 0, %inner_loop.exit ], [ %idx.next.i4, %in.bounds.i9 ]
+  %idx.next.i4 = add i32 %idx.i3, 1
+  %abc.i5 = icmp slt i32 %idx.i3, %len.i1
+  br i1 %abc.i5, label %in.bounds.i9, label %out.of.bounds.i10, !prof !1
+
+in.bounds.i9:                                     ; preds = %loop.i6
+  %addr.i7 = getelementptr i32* %arr, i32 %idx.i3
+  store i32 0, i32* %addr.i7
+  %next.i8 = icmp slt i32 %idx.next.i4, %n
+  br i1 %next.i8, label %loop.i6, label %exit.i11
+
+out.of.bounds.i10:                                ; preds = %loop.i6
+  br label %inner_loop.exit12
+
+exit.i11:                                         ; preds = %in.bounds.i9, %inner_loop.exit
+  br label %inner_loop.exit12
+
+inner_loop.exit12:                                ; preds = %exit.i11, %out.of.bounds.i10
+  br i1 %next, label %loop, label %exit
+
+exit:                                             ; preds = %inner_loop.exit12
+  ret void
+}
+
+; Function Attrs: alwaysinline
+define void @with_cousin(i32* %arr, i32* %a_len_ptr, i32 %n, i32 %parent.count, i32 %grandparent.count) #0 {
+; CHECK: irce: in function with_cousin: constrained Loop at depth 3 containing: %loop.i.i<header><exiting>,%in.bounds.i.i<latch><exiting>
+; CHECK: irce: in function with_cousin: constrained Loop at depth 3 containing: %loop.i.i10<header><exiting>,%in.bounds.i.i13<latch><exiting>
+
+entry:
+  br label %loop
+
+loop:                                             ; preds = %with_parent.exit17, %entry
+  %idx = phi i32 [ 0, %entry ], [ %idx.next, %with_parent.exit17 ]
+  %idx.next = add i32 %idx, 1
+  %next = icmp ult i32 %idx.next, %grandparent.count
+  br label %loop.i
+
+loop.i:                                           ; preds = %inner_loop.exit.i, %loop
+  %idx.i = phi i32 [ 0, %loop ], [ %idx.next.i, %inner_loop.exit.i ]
+  %idx.next.i = add i32 %idx.i, 1
+  %next.i = icmp ult i32 %idx.next.i, %parent.count
+  %len.i.i = load i32* %a_len_ptr, !range !0
+  %first.itr.check.i.i = icmp sgt i32 %n, 0
+  br i1 %first.itr.check.i.i, label %loop.i.i, label %exit.i.i
+
+loop.i.i:                                         ; preds = %in.bounds.i.i, %loop.i
+  %idx.i.i = phi i32 [ 0, %loop.i ], [ %idx.next.i.i, %in.bounds.i.i ]
+  %idx.next.i.i = add i32 %idx.i.i, 1
+  %abc.i.i = icmp slt i32 %idx.i.i, %len.i.i
+  br i1 %abc.i.i, label %in.bounds.i.i, label %out.of.bounds.i.i, !prof !1
+
+in.bounds.i.i:                                    ; preds = %loop.i.i
+  %addr.i.i = getelementptr i32* %arr, i32 %idx.i.i
+  store i32 0, i32* %addr.i.i
+  %next.i.i = icmp slt i32 %idx.next.i.i, %n
+  br i1 %next.i.i, label %loop.i.i, label %exit.i.i
+
+out.of.bounds.i.i:                                ; preds = %loop.i.i
+  br label %inner_loop.exit.i
+
+exit.i.i:                                         ; preds = %in.bounds.i.i, %loop.i
+  br label %inner_loop.exit.i
+
+inner_loop.exit.i:                                ; preds = %exit.i.i, %out.of.bounds.i.i
+  br i1 %next.i, label %loop.i, label %with_parent.exit
+
+with_parent.exit:                                 ; preds = %inner_loop.exit.i
+  br label %loop.i6
+
+loop.i6:                                          ; preds = %inner_loop.exit.i16, %with_parent.exit
+  %idx.i1 = phi i32 [ 0, %with_parent.exit ], [ %idx.next.i2, %inner_loop.exit.i16 ]
+  %idx.next.i2 = add i32 %idx.i1, 1
+  %next.i3 = icmp ult i32 %idx.next.i2, %parent.count
+  %len.i.i4 = load i32* %a_len_ptr, !range !0
+  %first.itr.check.i.i5 = icmp sgt i32 %n, 0
+  br i1 %first.itr.check.i.i5, label %loop.i.i10, label %exit.i.i15
+
+loop.i.i10:                                       ; preds = %in.bounds.i.i13, %loop.i6
+  %idx.i.i7 = phi i32 [ 0, %loop.i6 ], [ %idx.next.i.i8, %in.bounds.i.i13 ]
+  %idx.next.i.i8 = add i32 %idx.i.i7, 1
+  %abc.i.i9 = icmp slt i32 %idx.i.i7, %len.i.i4
+  br i1 %abc.i.i9, label %in.bounds.i.i13, label %out.of.bounds.i.i14, !prof !1
+
+in.bounds.i.i13:                                  ; preds = %loop.i.i10
+  %addr.i.i11 = getelementptr i32* %arr, i32 %idx.i.i7
+  store i32 0, i32* %addr.i.i11
+  %next.i.i12 = icmp slt i32 %idx.next.i.i8, %n
+  br i1 %next.i.i12, label %loop.i.i10, label %exit.i.i15
+
+out.of.bounds.i.i14:                              ; preds = %loop.i.i10
+  br label %inner_loop.exit.i16
+
+exit.i.i15:                                       ; preds = %in.bounds.i.i13, %loop.i6
+  br label %inner_loop.exit.i16
+
+inner_loop.exit.i16:                              ; preds = %exit.i.i15, %out.of.bounds.i.i14
+  br i1 %next.i3, label %loop.i6, label %with_parent.exit17
+
+with_parent.exit17:                               ; preds = %inner_loop.exit.i16
+  br i1 %next, label %loop, label %exit
+
+exit:                                             ; preds = %with_parent.exit17
+  ret void
+}
+
+; Function Attrs: alwaysinline
+define void @with_uncle(i32* %arr, i32* %a_len_ptr, i32 %n, i32 %parent.count, i32 %grandparent.count) #0 {
+; CHECK: irce: in function with_uncle: constrained Loop at depth 2 containing: %loop.i<header><exiting>,%in.bounds.i<latch><exiting>
+; CHECK: irce: in function with_uncle: constrained Loop at depth 3 containing: %loop.i.i<header><exiting>,%in.bounds.i.i<latch><exiting>
+
+entry:
+  br label %loop
+
+loop:                                             ; preds = %with_parent.exit, %entry
+  %idx = phi i32 [ 0, %entry ], [ %idx.next, %with_parent.exit ]
+  %idx.next = add i32 %idx, 1
+  %next = icmp ult i32 %idx.next, %grandparent.count
+  %len.i = load i32* %a_len_ptr, !range !0
+  %first.itr.check.i = icmp sgt i32 %n, 0
+  br i1 %first.itr.check.i, label %loop.i, label %exit.i
+
+loop.i:                                           ; preds = %in.bounds.i, %loop
+  %idx.i = phi i32 [ 0, %loop ], [ %idx.next.i, %in.bounds.i ]
+  %idx.next.i = add i32 %idx.i, 1
+  %abc.i = icmp slt i32 %idx.i, %len.i
+  br i1 %abc.i, label %in.bounds.i, label %out.of.bounds.i, !prof !1
+
+in.bounds.i:                                      ; preds = %loop.i
+  %addr.i = getelementptr i32* %arr, i32 %idx.i
+  store i32 0, i32* %addr.i
+  %next.i = icmp slt i32 %idx.next.i, %n
+  br i1 %next.i, label %loop.i, label %exit.i
+
+out.of.bounds.i:                                  ; preds = %loop.i
+  br label %inner_loop.exit
+
+exit.i:                                           ; preds = %in.bounds.i, %loop
+  br label %inner_loop.exit
+
+inner_loop.exit:                                  ; preds = %exit.i, %out.of.bounds.i
+  br label %loop.i4
+
+loop.i4:                                          ; preds = %inner_loop.exit.i, %inner_loop.exit
+  %idx.i1 = phi i32 [ 0, %inner_loop.exit ], [ %idx.next.i2, %inner_loop.exit.i ]
+  %idx.next.i2 = add i32 %idx.i1, 1
+  %next.i3 = icmp ult i32 %idx.next.i2, %parent.count
+  %len.i.i = load i32* %a_len_ptr, !range !0
+  %first.itr.check.i.i = icmp sgt i32 %n, 0
+  br i1 %first.itr.check.i.i, label %loop.i.i, label %exit.i.i
+
+loop.i.i:                                         ; preds = %in.bounds.i.i, %loop.i4
+  %idx.i.i = phi i32 [ 0, %loop.i4 ], [ %idx.next.i.i, %in.bounds.i.i ]
+  %idx.next.i.i = add i32 %idx.i.i, 1
+  %abc.i.i = icmp slt i32 %idx.i.i, %len.i.i
+  br i1 %abc.i.i, label %in.bounds.i.i, label %out.of.bounds.i.i, !prof !1
+
+in.bounds.i.i:                                    ; preds = %loop.i.i
+  %addr.i.i = getelementptr i32* %arr, i32 %idx.i.i
+  store i32 0, i32* %addr.i.i
+  %next.i.i = icmp slt i32 %idx.next.i.i, %n
+  br i1 %next.i.i, label %loop.i.i, label %exit.i.i
+
+out.of.bounds.i.i:                                ; preds = %loop.i.i
+  br label %inner_loop.exit.i
+
+exit.i.i:                                         ; preds = %in.bounds.i.i, %loop.i4
+  br label %inner_loop.exit.i
+
+inner_loop.exit.i:                                ; preds = %exit.i.i, %out.of.bounds.i.i
+  br i1 %next.i3, label %loop.i4, label %with_parent.exit
+
+with_parent.exit:                                 ; preds = %inner_loop.exit.i
+  br i1 %next, label %loop, label %exit
+
+exit:                                             ; preds = %with_parent.exit
+  ret void
+}
+
+attributes #0 = { alwaysinline }
+
+!0 = !{i32 0, i32 2147483647}
+!1 = !{!"branch_weights", i32 64, i32 4}
diff --git a/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll b/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll
index 64fef10..82b2120 100644
--- a/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll
+++ b/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll
@@ -17,7 +17,7 @@ for.body11:                                       ; preds = %entry
 for.body153:                                      ; preds = %for.body153, %for.body11
   br i1 undef, label %for.body170, label %for.body153
 
-; CHECK: add nsw i64 %indvars.iv, 1
+; CHECK: add nuw nsw i64 %indvars.iv, 1
 ; CHECK: sub nsw i64 %indvars.iv, 2
 ; CHECK: sub nsw i64 4, %indvars.iv
 ; CHECK: mul nsw i64 %indvars.iv, 8
diff --git a/test/Transforms/IndVarSimplify/backedge-on-min-max.ll b/test/Transforms/IndVarSimplify/backedge-on-min-max.ll
new file mode 100644
index 0000000..250ff9a
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/backedge-on-min-max.ll
@@ -0,0 +1,453 @@
+; RUN: opt < %s -indvars -S | FileCheck %s
+
+;; --- signed ---
+
+define void @min.signed.1(i32* %a, i32 %a_len, i32 %n) {
+; CHECK-LABEL: @min.signed.1
+ entry:
+  %smin.cmp = icmp slt i32 %a_len, %n
+  %smin = select i1 %smin.cmp, i32 %a_len, i32 %n
+  %entry.cond = icmp slt i32 0, %smin
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp slt i32 %idx, %a_len
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp slt i32 %idx.inc, %smin
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @min.signed.2(i32* %a, i32 %a_len, i32 %n) {
+; CHECK-LABEL: @min.signed.2
+ entry:
+  %smin.cmp = icmp slt i32 %a_len, %n
+  %smin = select i1 %smin.cmp, i32 %a_len, i32 %n
+  %entry.cond = icmp slt i32 0, %smin
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp sgt i32 %a_len, %idx
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp slt i32 %idx.inc, %smin
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @min.signed.3(i32* %a, i32 %n) {
+; CHECK-LABEL: @min.signed.3
+ entry:
+  %smin.cmp = icmp slt i32 42, %n
+  %smin = select i1 %smin.cmp, i32 42, i32 %n
+  %entry.cond = icmp slt i32 0, %smin
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp slt i32 %idx, 42
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp slt i32 %idx.inc, %smin
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @min.signed.4(i32* %a, i32 %n) {
+; CHECK-LABEL: @min.signed.4
+ entry:
+  %smin.cmp = icmp slt i32 42, %n
+  %smin = select i1 %smin.cmp, i32 42, i32 %n
+  %entry.cond = icmp slt i32 0, %smin
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp sgt i32 42, %idx
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp slt i32 %idx.inc, %smin
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @max.signed.1(i32* %a, i32 %a_len, i32 %n) {
+; CHECK-LABEL: @max.signed.1
+ entry:
+  %smax.cmp = icmp sgt i32 %a_len, %n
+  %smax = select i1 %smax.cmp, i32 %a_len, i32 %n
+  %entry.cond = icmp sgt i32 0, %smax
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp sgt i32 %idx, %a_len
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp sgt i32 %idx.inc, %smax
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @max.signed.2(i32* %a, i32 %a_len, i32 %n) {
+; CHECK-LABEL: @max.signed.2
+ entry:
+  %smax.cmp = icmp sgt i32 %a_len, %n
+  %smax = select i1 %smax.cmp, i32 %a_len, i32 %n
+  %entry.cond = icmp sgt i32 0, %smax
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp slt i32 %a_len, %idx
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp sgt i32 %idx.inc, %smax
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @max.signed.3(i32* %a, i32 %n, i32 %init) {
+; CHECK-LABEL: @max.signed.3
+ entry:
+  %smax.cmp = icmp sgt i32 42, %n
+  %smax = select i1 %smax.cmp, i32 42, i32 %n
+  %entry.cond = icmp sgt i32 %init, %smax
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ %init, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp sgt i32 %idx, 42
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp sgt i32 %idx.inc, %smax
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @max.signed.4(i32* %a, i32 %n, i32 %init) {
+; CHECK-LABEL: @max.signed.4
+ entry:
+  %smax.cmp = icmp sgt i32 42, %n
+  %smax = select i1 %smax.cmp, i32 42, i32 %n
+  %entry.cond = icmp sgt i32 %init, %smax
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ %init, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp slt i32 42, %idx
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp sgt i32 %idx.inc, %smax
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+;; --- unsigned ---
+
+define void @min.unsigned.1(i32* %a, i32 %a_len, i32 %n) {
+; CHECK-LABEL: @min.unsigned.1
+ entry:
+  %umin.cmp = icmp ult i32 %a_len, %n
+  %umin = select i1 %umin.cmp, i32 %a_len, i32 %n
+  %entry.cond = icmp ult i32 5, %umin
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 5, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp ult i32 %idx, %a_len
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp ult i32 %idx.inc, %umin
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @min.unsigned.2(i32* %a, i32 %a_len, i32 %n) {
+; CHECK-LABEL: @min.unsigned.2
+ entry:
+  %umin.cmp = icmp ult i32 %a_len, %n
+  %umin = select i1 %umin.cmp, i32 %a_len, i32 %n
+  %entry.cond = icmp ult i32 5, %umin
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 5, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp ugt i32 %a_len, %idx
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp ult i32 %idx.inc, %umin
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @min.unsigned.3(i32* %a, i32 %n) {
+; CHECK-LABEL: @min.unsigned.3
+ entry:
+  %umin.cmp = icmp ult i32 42, %n
+  %umin = select i1 %umin.cmp, i32 42, i32 %n
+  %entry.cond = icmp ult i32 5, %umin
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 5, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp ult i32 %idx, 42
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp ult i32 %idx.inc, %umin
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @min.unsigned.4(i32* %a, i32 %n) {
+; CHECK-LABEL: @min.unsigned.4
+ entry:
+  %umin.cmp = icmp ult i32 42, %n
+  %umin = select i1 %umin.cmp, i32 42, i32 %n
+  %entry.cond = icmp ult i32 5, %umin
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 5, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp ugt i32 42, %idx
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp ult i32 %idx.inc, %umin
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @max.unsigned.1(i32* %a, i32 %a_len, i32 %n) {
+; CHECK-LABEL: @max.unsigned.1
+ entry:
+  %umax.cmp = icmp ugt i32 %a_len, %n
+  %umax = select i1 %umax.cmp, i32 %a_len, i32 %n
+  %entry.cond = icmp ugt i32 5, %umax
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 5, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp ugt i32 %idx, %a_len
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp ugt i32 %idx.inc, %umax
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @max.unsigned.2(i32* %a, i32 %a_len, i32 %n) {
+; CHECK-LABEL: @max.unsigned.2
+ entry:
+  %umax.cmp = icmp ugt i32 %a_len, %n
+  %umax = select i1 %umax.cmp, i32 %a_len, i32 %n
+  %entry.cond = icmp ugt i32 5, %umax
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ 5, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp ult i32 %a_len, %idx
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp ugt i32 %idx.inc, %umax
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @max.unsigned.3(i32* %a, i32 %n, i32 %init) {
+; CHECK-LABEL: @max.unsigned.3
+ entry:
+  %umax.cmp = icmp ugt i32 42, %n
+  %umax = select i1 %umax.cmp, i32 42, i32 %n
+  %entry.cond = icmp ugt i32 %init, %umax
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ %init, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp ugt i32 %idx, 42
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp ugt i32 %idx.inc, %umax
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @max.unsigned.4(i32* %a, i32 %n, i32 %init) {
+; CHECK-LABEL: @max.unsigned.4
+ entry:
+  %umax.cmp = icmp ugt i32 42, %n
+  %umax = select i1 %umax.cmp, i32 42, i32 %n
+  %entry.cond = icmp ugt i32 %init, %umax
+  br i1 %entry.cond, label %loop, label %exit
+
+ loop:
+  %idx = phi i32 [ %init, %entry ], [ %idx.inc, %latch ]
+  %idx.inc = add i32 %idx, 1
+  %in.bounds = icmp ult i32 42, %idx
+  br i1 %in.bounds, label %ok, label %latch
+; CHECK: br i1 true, label %ok, label %latch
+
+ ok:
+  %addr = getelementptr i32* %a, i32 %idx
+  store i32 %idx, i32* %addr
+  br label %latch
+
+ latch:
+  %be.cond = icmp ugt i32 %idx.inc, %umax
+  br i1 %be.cond, label %loop, label %exit
+
+ exit:
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/overflowcheck.ll b/test/Transforms/IndVarSimplify/overflowcheck.ll
index 2603f36..3864c6c 100644
--- a/test/Transforms/IndVarSimplify/overflowcheck.ll
+++ b/test/Transforms/IndVarSimplify/overflowcheck.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx"
 ; CHECK: @llvm.sadd.with.overflow
 ; CHECK-LABEL: loop2:
 ; CHECK-NOT: extractvalue
-; CHECK: add nuw nsw
+; CHECK: add nuw
 ; CHECK: @llvm.sadd.with.overflow
 ; CHECK-LABEL: loop3:
 ; CHECK-NOT: extractvalue
diff --git a/test/Transforms/IndVarSimplify/pr20680.ll b/test/Transforms/IndVarSimplify/pr20680.ll
index 88a7fd7..716e013 100644
--- a/test/Transforms/IndVarSimplify/pr20680.ll
+++ b/test/Transforms/IndVarSimplify/pr20680.ll
@@ -204,8 +204,8 @@ for.cond2.for.inc13_crit_edge:                    ; preds = %for.cond2.for.inc13
   br label %for.inc13
 
 ; CHECK: [[for_inc13]]:
-; CHECK-NEXT: %[[indvars_iv_next]] = add nuw nsw i32 %[[indvars_iv]], 1
-; CHECK-NEXT: %[[exitcond4:.*]] = icmp ne i32 %[[indvars_iv]], -1
+; CHECK-NEXT: %[[indvars_iv_next]] = add nsw i32 %[[indvars_iv]], 1
+; CHECK-NEXT: %[[exitcond4:.*]] = icmp ne i32 %[[indvars_iv_next]], 0
 ; CHECK-NEXT: br i1 %[[exitcond4]], label %[[for_cond2_preheader]], label %[[for_end15:.*]]
 for.inc13:                                        ; preds = %for.cond2.for.inc13_crit_edge, %for.cond2.preheader
   %inc14 = add i8 %storemerge15, 1
diff --git a/test/Transforms/IndVarSimplify/pr22222.ll b/test/Transforms/IndVarSimplify/pr22222.ll
new file mode 100644
index 0000000..ccdfe53
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/pr22222.ll
@@ -0,0 +1,46 @@
+; RUN: opt -indvars -S < %s | FileCheck %s
+
+@b = common global i32 0, align 4
+@c = common global i32 0, align 4
+@a = common global i32 0, align 4
+
+declare void @abort() #1
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @main() {
+entry:
+  %a.promoted13 = load i32* @a, align 4
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.end
+  %or.lcssa14 = phi i32 [ %a.promoted13, %entry ], [ %or.lcssa, %for.end ]
+  %d.010 = phi i32 [ 1, %entry ], [ 0, %for.end ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %inc12 = phi i32 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %or11 = phi i32 [ %or.lcssa14, %for.cond1.preheader ], [ %or, %for.body3 ]
+; CHECK-NOT: sub nuw i32 %inc12, %d.010
+; CHECK: sub i32 %inc12, %d.010
+  %add = sub i32 %inc12, %d.010
+  %or = or i32 %or11, %add
+  %inc = add i32 %inc12, 1
+  br i1 false, label %for.body3, label %for.end
+
+for.end:                                          ; preds = %for.body3
+  %or.lcssa = phi i32 [ %or, %for.body3 ]
+  br i1 false, label %for.cond1.preheader, label %for.end6
+
+for.end6:                                         ; preds = %for.end
+  %or.lcssa.lcssa = phi i32 [ %or.lcssa, %for.end ]
+  store i32 %or.lcssa.lcssa, i32* @a, align 4
+  %cmp7 = icmp eq i32 %or.lcssa.lcssa, -1
+  br i1 %cmp7, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.end6
+  tail call void @abort() #2
+  unreachable
+
+if.end:                                           ; preds = %for.end6
+  ret i32 0
+}
diff --git a/test/Transforms/IndVarSimplify/sharpen-range.ll b/test/Transforms/IndVarSimplify/sharpen-range.ll
index 6a9d352..5392dbc 100644
--- a/test/Transforms/IndVarSimplify/sharpen-range.ll
+++ b/test/Transforms/IndVarSimplify/sharpen-range.ll
@@ -110,4 +110,4 @@ exit:
   ret void
 }
 
-!0 = metadata !{i32 0, i32 100}
+!0 = !{i32 0, i32 100}
diff --git a/test/Transforms/IndVarSimplify/strengthen-overflow.ll b/test/Transforms/IndVarSimplify/strengthen-overflow.ll
new file mode 100644
index 0000000..2bafe96
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/strengthen-overflow.ll
@@ -0,0 +1,108 @@
+; RUN: opt < %s -indvars -S | FileCheck %s
+
+define i32 @test.signed.add.0(i32* %array, i32 %length, i32 %init) {
+; CHECK-LABEL: @test.signed.add.0
+ entry:
+  %upper = icmp slt i32 %init, %length
+  br i1 %upper, label %loop, label %exit
+
+ loop:
+; CHECK-LABEL: loop
+  %civ = phi i32 [ %init, %entry ], [ %civ.inc, %latch ]
+  %civ.inc = add i32 %civ, 1
+; CHECK: %civ.inc = add nsw i32 %civ, 1
+  %cmp = icmp slt i32 %civ.inc, %length
+  br i1 %cmp, label %latch, label %break
+
+ latch:
+  store i32 0, i32* %array
+  %check = icmp slt i32 %civ.inc, %length
+  br i1 %check, label %loop, label %break
+
+ break:
+  ret i32 %civ.inc
+
+ exit:
+  ret i32 42
+}
+
+define i32 @test.signed.add.1(i32* %array, i32 %length, i32 %init) {
+; CHECK-LABEL: @test.signed.add.1
+ entry:
+  %upper = icmp sle i32 %init, %length
+  br i1 %upper, label %loop, label %exit
+
+ loop:
+; CHECK-LABEL: loop
+  %civ = phi i32 [ %init, %entry ], [ %civ.inc, %latch ]
+  %civ.inc = add i32 %civ, 1
+; CHECK: %civ.inc = add i32 %civ, 1
+  %cmp = icmp slt i32 %civ.inc, %length
+  br i1 %cmp, label %latch, label %break
+
+ latch:
+  store i32 0, i32* %array
+  %check = icmp slt i32 %civ.inc, %length
+  br i1 %check, label %loop, label %break
+
+ break:
+  ret i32 %civ.inc
+
+ exit:
+  ret i32 42
+}
+
+define i32 @test.unsigned.add.0(i32* %array, i32 %length, i32 %init) {
+; CHECK-LABEL: @test.unsigned.add.0
+ entry:
+  %upper = icmp ult i32 %init, %length
+  br i1 %upper, label %loop, label %exit
+
+ loop:
+; CHECK-LABEL: loop
+  %civ = phi i32 [ %init, %entry ], [ %civ.inc, %latch ]
+  %civ.inc = add i32 %civ, 1
+; CHECK: %civ.inc = add nuw i32 %civ, 1
+  %cmp = icmp slt i32 %civ.inc, %length
+  br i1 %cmp, label %latch, label %break
+
+ latch:
+  store i32 0, i32* %array
+  %check = icmp ult i32 %civ.inc, %length
+  br i1 %check, label %loop, label %break
+
+ break:
+  ret i32 %civ.inc
+
+ exit:
+  ret i32 42
+}
+
+define i32 @test.unsigned.add.1(i32* %array, i32 %length, i32 %init) {
+; CHECK-LABEL: @test.unsigned.add.1
+ entry:
+  %upper = icmp ule i32 %init, %length
+  br i1 %upper, label %loop, label %exit
+
+ loop:
+; CHECK-LABEL: loop
+  %civ = phi i32 [ %init, %entry ], [ %civ.inc, %latch ]
+  %civ.inc = add i32 %civ, 1
+; CHECK: %civ.inc = add i32 %civ, 1
+  %cmp = icmp slt i32 %civ.inc, %length
+  br i1 %cmp, label %latch, label %break
+
+ latch:
+  store i32 0, i32* %array
+  %check = icmp ult i32 %civ.inc, %length
+  br i1 %check, label %loop, label %break
+
+ break:
+  ret i32 %civ.inc
+
+ exit:
+  ret i32 42
+}
+
+!0 = !{i32 0, i32 2}
+!1 = !{i32 0, i32 42}
diff --git a/test/Transforms/IndVarSimplify/use-range-metadata.ll b/test/Transforms/IndVarSimplify/use-range-metadata.ll
index 7ac4f11..ea3b12d 100644
--- a/test/Transforms/IndVarSimplify/use-range-metadata.ll
+++ b/test/Transforms/IndVarSimplify/use-range-metadata.ll
@@ -34,4 +34,4 @@ oob:
   ret i1 false
 }
 
-!0 = metadata !{i32 1, i32 100}
+!0 = !{i32 1, i32 100}
diff --git a/test/Transforms/Inline/alloca-dbgdeclare.ll b/test/Transforms/Inline/alloca-dbgdeclare.ll
new file mode 100644
index 0000000..6809e41
--- /dev/null
+++ b/test/Transforms/Inline/alloca-dbgdeclare.ll
@@ -0,0 +1,141 @@
+; RUN: opt -inline -S < %s | FileCheck %s
+; struct A {
+;   int arg0;
+;   double arg1[2];
+; } a, b;
+;  
+; void fn3(A p1) {
+;   if (p1.arg0)
+;     a = p1;
+; }
+;  
+; void fn4() { fn3(b); }
+;  
+; void fn5() {
+;   while (1)
+;     fn4();
+; }
+; ModuleID = 'test.cpp'
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-apple-darwin"
+
+%struct.A = type { i32, [2 x double] }
+
+@a = global %struct.A zeroinitializer, align 8
+@b = global %struct.A zeroinitializer, align 8
+
+; Function Attrs: nounwind
+declare void @_Z3fn31A(%struct.A* nocapture readonly %p1) #0
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #2
+
+; Function Attrs: nounwind
+define void @_Z3fn4v() #0 {
+entry:
+; Test that the dbg.declare is moved together with the alloca.
+; CHECK: define void @_Z3fn5v()
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %agg.tmp.sroa.3.i = alloca [20 x i8], align 4
+; CHECK-NEXT:   call void @llvm.dbg.declare(metadata [20 x i8]* %agg.tmp.sroa.3.i,
+  %agg.tmp.sroa.3 = alloca [20 x i8], align 4
+  tail call void @llvm.dbg.declare(metadata [20 x i8]* %agg.tmp.sroa.3, metadata !46, metadata !48), !dbg !49
+  %agg.tmp.sroa.0.0.copyload = load i32* getelementptr inbounds (%struct.A* @b, i64 0, i32 0), align 8, !dbg !50
+  tail call void @llvm.dbg.value(metadata i32 %agg.tmp.sroa.0.0.copyload, i64 0, metadata !46, metadata !51), !dbg !49
+  %agg.tmp.sroa.3.0..sroa_idx = getelementptr inbounds [20 x i8]* %agg.tmp.sroa.3, i64 0, i64 0, !dbg !50
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %agg.tmp.sroa.3.0..sroa_idx, i8* getelementptr (i8* bitcast (%struct.A* @b to i8*), i64 4), i64 20, i32 4, i1 false), !dbg !50
+  tail call void @llvm.dbg.declare(metadata %struct.A* undef, metadata !46, metadata !31) #2, !dbg !49
+  %tobool.i = icmp eq i32 %agg.tmp.sroa.0.0.copyload, 0, !dbg !52
+  br i1 %tobool.i, label %_Z3fn31A.exit, label %if.then.i, !dbg !53
+
+if.then.i:                                        ; preds = %entry
+  store i32 %agg.tmp.sroa.0.0.copyload, i32* getelementptr inbounds (%struct.A* @a, i64 0, i32 0), align 8, !dbg !54
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr (i8* bitcast (%struct.A* @a to i8*), i64 4), i8* %agg.tmp.sroa.3.0..sroa_idx, i64 20, i32 4, i1 false), !dbg !54
+  br label %_Z3fn31A.exit, !dbg !54
+
+_Z3fn31A.exit:                                    ; preds = %entry, %if.then.i
+  ret void, !dbg !50
+}
+
+; Function Attrs: noreturn nounwind
+define void @_Z3fn5v() #3 {
+entry:
+  br label %while.body, !dbg !55
+
+while.body:                                       ; preds = %entry, %while.body
+  call void @_Z3fn4v(), !dbg !56
+  br label %while.body, !dbg !55
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+attributes #3 = { noreturn nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!28, !29}
+!llvm.ident = !{!30}
+
+!0 = !{!"0x11\004\00clang version 3.7.0 (trunk 227480) (llvm/trunk 227517)\001\00\000\00\001", !1, !2, !3, !14, !25, !2} ; [ DW_TAG_compile_unit ] [/<stdin>] [DW_LANG_C_plus_plus]
+!1 = !{!"<stdin>", !""}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x13\00A\001\00192\0064\000\000\000", !5, null, null, !6, null, null, !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 192, align 64, offset 0] [def] [from ]
+!5 = !{!"test.cpp", !""}
+!6 = !{!7, !9}
+!7 = !{!"0xd\00arg0\002\0032\0032\000\000", !5, !"_ZTS1A", !8} ; [ DW_TAG_member ] [arg0] [line 2, size 32, align 32, offset 0] [from int]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0xd\00arg1\003\00128\0064\0064\000", !5, !"_ZTS1A", !10} ; [ DW_TAG_member ] [arg1] [line 3, size 128, align 64, offset 64] [from ]
+!10 = !{!"0x1\00\000\00128\0064\000\000\000", null, null, !11, !12, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double]
+!11 = !{!"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!12 = !{!13}
+!13 = !{!"0x21\000\002"}                          ; [ DW_TAG_subrange_type ] [0, 1]
+!14 = !{!15, !21, !24}
+!15 = !{!"0x2e\00fn3\00fn3\00_Z3fn31A\006\000\001\000\000\00256\001\006", !5, !16, !17, null, void (%struct.A*)* @_Z3fn31A, null, null, !19} ; [ DW_TAG_subprogram ] [line 6] [def] [fn3]
+!16 = !{!"0x29", !5}                              ; [ DW_TAG_file_type ] [/test.cpp]
+!17 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = !{null, !"_ZTS1A"}
+!19 = !{!20}
+!20 = !{!"0x101\00p1\0016777222\000", !15, !16, !"_ZTS1A"} ; [ DW_TAG_arg_variable ] [p1] [line 6]
+!21 = !{!"0x2e\00fn4\00fn4\00_Z3fn4v\0011\000\001\000\000\00256\001\0011", !5, !16, !22, null, void ()* @_Z3fn4v, null, null, !2} ; [ DW_TAG_subprogram ] [line 11] [def] [fn4]
+!22 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = !{null}
+!24 = !{!"0x2e\00fn5\00fn5\00_Z3fn5v\0013\000\001\000\000\00256\001\0013", !5, !16, !22, null, void ()* @_Z3fn5v, null, null, !2} ; [ DW_TAG_subprogram ] [line 13] [def] [fn5]
+!25 = !{!26, !27}
+!26 = !{!"0x34\00a\00a\00\004\000\001", null, !16, !"_ZTS1A", %struct.A* @a, null} ; [ DW_TAG_variable ] [a] [line 4] [def]
+!27 = !{!"0x34\00b\00b\00\004\000\001", null, !16, !"_ZTS1A", %struct.A* @b, null} ; [ DW_TAG_variable ] [b] [line 4] [def]
+!28 = !{i32 2, !"Dwarf Version", i32 4}
+!29 = !{i32 2, !"Debug Info Version", i32 2}
+!30 = !{!"clang version 3.7.0 (trunk 227480) (llvm/trunk 227517)"}
+!31 = !{!"0x102\006"}                             ; [ DW_TAG_expression ] [DW_OP_deref]
+!32 = !MDLocation(line: 6, scope: !15)
+!33 = !MDLocation(line: 7, scope: !34)
+!34 = !{!"0xb\007\000\000", !5, !15}              ; [ DW_TAG_lexical_block ] [/test.cpp]
+!35 = !{!36, !37, i64 0}
+!36 = !{!"_ZTS1A", !37, i64 0, !38, i64 8}
+!37 = !{!"int", !38, i64 0}
+!38 = !{!"omnipotent char", !39, i64 0}
+!39 = !{!"Simple C/C++ TBAA"}
+!40 = !MDLocation(line: 7, scope: !15)
+!41 = !MDLocation(line: 8, scope: !34)
+!42 = !{i64 0, i64 4, !43, i64 8, i64 16, !44}
+!43 = !{!37, !37, i64 0}
+!44 = !{!38, !38, i64 0}
+!45 = !MDLocation(line: 9, scope: !15)
+!46 = !{!"0x101\00p1\0016777222\000", !15, !16, !"_ZTS1A", !47} ; [ DW_TAG_arg_variable ] [p1] [line 6]
+!47 = distinct !MDLocation(line: 11, scope: !21)
+!48 = !{!"0x102\00147\004\0020"}                  ; [ DW_TAG_expression ] [DW_OP_piece offset=4, size=20]
+!49 = !MDLocation(line: 6, scope: !15, inlinedAt: !47)
+!50 = !MDLocation(line: 11, scope: !21)
+!51 = !{!"0x102\00147\000\004"}                   ; [ DW_TAG_expression ] [DW_OP_piece offset=0, size=4]
+!52 = !MDLocation(line: 7, scope: !34, inlinedAt: !47)
+!53 = !MDLocation(line: 7, scope: !15, inlinedAt: !47)
+!54 = !MDLocation(line: 8, scope: !34, inlinedAt: !47)
+!55 = !MDLocation(line: 14, scope: !24)
+!56 = !MDLocation(line: 15, scope: !24)
diff --git a/test/Transforms/Inline/debug-info-duplicate-calls.ll b/test/Transforms/Inline/debug-info-duplicate-calls.ll
new file mode 100644
index 0000000..2363693
--- /dev/null
+++ b/test/Transforms/Inline/debug-info-duplicate-calls.ll
@@ -0,0 +1,121 @@
+; RUN: opt < %s -always-inline -S | FileCheck %s
+
+; Original input generated from clang -emit-llvm -S -c -mllvm -disable-llvm-optzns
+;
+; #define CALLS1 f2(); f2();
+; #define CALLS2 f4(); f4();
+; void f1();
+; inline __attribute__((always_inline)) void f2() {
+;   f1();
+; }
+; inline __attribute__((always_inline)) void f3() {
+;   CALLS1
+; }
+; inline __attribute__((always_inline)) void f4() {
+;   f3();
+; }
+; void f() {
+;   CALLS2
+; }
+
+; There should be unique locations for all 4 of these instructions, correctly
+; describing the inlining that has occurred, even in the face of duplicate call
+; site locations.
+
+; The nomenclature used for the tags here is <function name>[cs<number>] where
+; 'cs' is an abbreviation for 'call site' and the number indicates which call
+; site from within the named function this is. (so, given the above inlining, we
+; should have 4 calls to 'f1', two from the first call to f4 and two from the
+; second call to f4)
+
+; CHECK: call void @_Z2f1v(), !dbg [[fcs1_f4_f3cs1_f2:![0-9]+]]
+; CHECK: call void @_Z2f1v(), !dbg [[fcs1_f4_f3cs2_f2:![0-9]+]]
+; CHECK: call void @_Z2f1v(), !dbg [[fcs2_f4_f3cs1_f2:![0-9]+]]
+; CHECK: call void @_Z2f1v(), !dbg [[fcs2_f4_f3cs2_f2:![0-9]+]]
+
+; CHECK-DAG: [[F:![0-9]+]] = {{.*}} ; [ DW_TAG_subprogram ] {{.*}} [f]
+; CHECK-DAG: [[F2:![0-9]+]] = {{.*}} ; [ DW_TAG_subprogram ] {{.*}} [f2]
+; CHECK-DAG: [[F3:![0-9]+]] = {{.*}} ; [ DW_TAG_subprogram ] {{.*}} [f3]
+; CHECK-DAG: [[F4:![0-9]+]] = {{.*}} ; [ DW_TAG_subprogram ] {{.*}} [f4]
+
+; CHECK: [[fcs1_f4_f3cs1_f2]] = {{.*}}, scope: [[F2]], inlinedAt: [[fcs1_f4_f3cs1:![0-9]+]])
+; CHECK: [[fcs1_f4_f3cs1]] = {{.*}}, scope: [[F3]], inlinedAt: [[fcs1_f4:![0-9]+]])
+; CHECK: [[fcs1_f4]] = {{.*}}, scope: [[F4]], inlinedAt: [[fcs1:![0-9]+]])
+; CHECK: [[fcs1]] = {{.*}}, scope: [[F]])
+; CHECK: [[fcs1_f4_f3cs2_f2]] = {{.*}}, scope: [[F2]], inlinedAt: [[fcs1_f4_f3cs2:![0-9]+]])
+; CHECK: [[fcs1_f4_f3cs2]] = {{.*}}, scope: [[F3]], inlinedAt: [[fcs1_f4]])
+
+; CHECK: [[fcs2_f4_f3cs1_f2]] = {{.*}}, scope: [[F2]], inlinedAt: [[fcs2_f4_f3cs1:![0-9]+]])
+; CHECK: [[fcs2_f4_f3cs1]] = {{.*}}, scope: [[F3]], inlinedAt: [[fcs2_f4:![0-9]+]])
+; CHECK: [[fcs2_f4]] = {{.*}}, scope: [[F4]], inlinedAt: [[fcs2:![0-9]+]])
+; CHECK: [[fcs2]] = {{.*}}, scope: [[F]])
+; CHECK: [[fcs2_f4_f3cs2_f2]] = {{.*}}, scope: [[F2]], inlinedAt: [[fcs2_f4_f3cs2:![0-9]+]])
+; CHECK: [[fcs2_f4_f3cs2]] = {{.*}}, scope: [[F3]], inlinedAt: [[fcs2_f4]])
+
+$_Z2f4v = comdat any
+
+$_Z2f3v = comdat any
+
+$_Z2f2v = comdat any
+
+; Function Attrs: uwtable
+define void @_Z1fv() #0 {
+entry:
+  call void @_Z2f4v(), !dbg !13
+  call void @_Z2f4v(), !dbg !13
+  ret void, !dbg !14
+}
+
+; Function Attrs: alwaysinline inlinehint uwtable
+define linkonce_odr void @_Z2f4v() #1 comdat {
+entry:
+  call void @_Z2f3v(), !dbg !15
+  ret void, !dbg !16
+}
+
+; Function Attrs: alwaysinline inlinehint uwtable
+define linkonce_odr void @_Z2f3v() #1 comdat {
+entry:
+  call void @_Z2f2v(), !dbg !17
+  call void @_Z2f2v(), !dbg !17
+  ret void, !dbg !18
+}
+
+; Function Attrs: alwaysinline inlinehint uwtable
+define linkonce_odr void @_Z2f2v() #1 comdat {
+entry:
+  call void @_Z2f1v(), !dbg !19
+  ret void, !dbg !20
+}
+
+declare void @_Z2f1v() #2
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { alwaysinline inlinehint uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10, !11}
+!llvm.ident = !{!12}
+
+!0 = !{!"0x11\004\00clang version 3.7.0 (trunk 226474) (llvm/trunk 226478)\000\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/debug-info-duplicate-calls.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"debug-info-duplicate-calls.cpp", !"/tmp/dbginfo"}
+!2 = !{}
+!3 = !{!4, !7, !8, !9}
+!4 = !{!"0x2e\00f\00f\00\0013\000\001\000\000\00256\000\0013", !1, !5, !6, null, void ()* @_Z1fv, null, null, !2} ; [ DW_TAG_subprogram ] [line 13] [def] [f]
+!5 = !{!"0x29", !1}                               ; [ DW_TAG_file_type ] [/tmp/dbginfo/debug-info-duplicate-calls.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!"0x2e\00f4\00f4\00\0010\000\001\000\000\00256\000\0010", !1, !5, !6, null, void ()* @_Z2f4v, null, null, !2} ; [ DW_TAG_subprogram ] [line 10] [def] [f4]
+!8 = !{!"0x2e\00f3\00f3\00\007\000\001\000\000\00256\000\007", !1, !5, !6, null, void ()* @_Z2f3v, null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [f3]
+!9 = !{!"0x2e\00f2\00f2\00\004\000\001\000\000\00256\000\004", !1, !5, !6, null, void ()* @_Z2f2v, null, null, !2} ; [ DW_TAG_subprogram ] [line 4] [def] [f2]
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 2, !"Debug Info Version", i32 2}
+!12 = !{!"clang version 3.7.0 (trunk 226474) (llvm/trunk 226478)"}
+!13 = !MDLocation(line: 14, column: 3, scope: !4)
+!14 = !MDLocation(line: 15, column: 1, scope: !4)
+!15 = !MDLocation(line: 11, column: 3, scope: !7)
+!16 = !MDLocation(line: 12, column: 1, scope: !7)
+!17 = !MDLocation(line: 8, column: 3, scope: !8)
+!18 = !MDLocation(line: 9, column: 1, scope: !8)
+!19 = !MDLocation(line: 5, column: 3, scope: !9)
+!20 = !MDLocation(line: 6, column: 1, scope: !9)
diff --git a/test/Transforms/Inline/debug-invoke.ll b/test/Transforms/Inline/debug-invoke.ll
index 0de2d22..74ba9dc 100644
--- a/test/Transforms/Inline/debug-invoke.ll
+++ b/test/Transforms/Inline/debug-invoke.ll
@@ -4,9 +4,9 @@
 
 ; CHECK: invoke void @test()
 ; CHECK-NEXT: to label {{.*}} unwind label {{.*}}, !dbg [[INL_LOC:!.*]]
-; CHECK: [[EMPTY:.*]] = metadata !{}
-; CHECK: [[INL_LOC]] = metadata !{i32 1, i32 0, metadata [[EMPTY]], metadata [[INL_AT:.*]]}
-; CHECK: [[INL_AT]] = metadata !{i32 2, i32 0, metadata [[EMPTY]], null}
+; CHECK: [[EMPTY:.*]] = !{}
+; CHECK: [[INL_LOC]] = !MDLocation(line: 1, scope: [[EMPTY]], inlinedAt: [[INL_AT:.*]])
+; CHECK: [[INL_AT]] = distinct !MDLocation(line: 2, scope: [[EMPTY]])
 
 declare void @test()
 declare i32 @__gxx_personality_v0(...)
@@ -31,7 +31,7 @@ lpad:
 }
 
 !llvm.module.flags = !{!1}
-!1 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!2 = metadata !{}
-!3 = metadata !{i32 1, i32 0, metadata !2, null}
-!4 = metadata !{i32 2, i32 0, metadata !2, null}
+!1 = !{i32 2, !"Debug Info Version", i32 2}
+!2 = !{}
+!3 = !MDLocation(line: 1, scope: !2)
+!4 = !MDLocation(line: 2, scope: !2)
diff --git a/test/Transforms/Inline/ignore-debug-info.ll b/test/Transforms/Inline/ignore-debug-info.ll
index 428b5d5..8bd6e7c 100644
--- a/test/Transforms/Inline/ignore-debug-info.ll
+++ b/test/Transforms/Inline/ignore-debug-info.ll
@@ -47,9 +47,9 @@ attributes #0 = { nounwind readnone }
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{metadata !"0x11\004\00\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !{}, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"", metadata !""}
-!2 = metadata !{i32 0}
-!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!5 = metadata !{metadata !""}
+!0 = !{!"0x11\004\00\000\00\000\00\000", !1, !2, !2, !{}, !2, !2} ; [ DW_TAG_compile_unit ]
+!1 = !{!"", !""}
+!2 = !{i32 0}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 1, !"Debug Info Version", i32 2}
+!5 = !{!""}
diff --git a/test/Transforms/Inline/inline-fast-math-flags.ll b/test/Transforms/Inline/inline-fast-math-flags.ll
new file mode 100644
index 0000000..c6a1487
--- /dev/null
+++ b/test/Transforms/Inline/inline-fast-math-flags.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -S -inline -inline-threshold=20 | FileCheck %s
+; Check that we don't drop FastMathFlag when estimating inlining profitability.
+;
+; In this test we should inline 'foo'  to 'boo', because it'll fold to a
+; constant.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define float @foo(float* %a, float %b) {
+entry:
+  %a0 = load float* %a, align 4
+  %mul = fmul fast float %a0, %b
+  %tobool = fcmp une float %mul, 0.000000e+00
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %a1 = load float* %a, align 8
+  %arrayidx1 = getelementptr inbounds float* %a, i64 1
+  %a2 = load float* %arrayidx1, align 4
+  %add = fadd fast float %a1, %a2
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %storemerge = phi float [ %add, %if.then ], [ 1.000000e+00, %entry ]
+  ret float %storemerge
+}
+
+; CHECK-LABEL: @boo
+; CHECK-NOT: call float @foo
+define float @boo(float* %a) {
+entry:
+  %call = call float @foo(float* %a, float 0.000000e+00)
+  ret float %call
+}
diff --git a/test/Transforms/Inline/inline-fp.ll b/test/Transforms/Inline/inline-fp.ll
new file mode 100644
index 0000000..4d18ce8
--- /dev/null
+++ b/test/Transforms/Inline/inline-fp.ll
@@ -0,0 +1,136 @@
+; RUN: opt -S -inline < %s | FileCheck %s
+; Make sure that soft float implementations are calculated as being more expensive
+; to the inliner.
+
+define i32 @test_nofp() #0 {
+; f_nofp() has the "use-soft-float" attribute, so it should never get inlined.
+; CHECK-LABEL: test_nofp
+; CHECK: call float @f_nofp 
+entry:
+  %responseX = alloca i32, align 4
+  %responseY = alloca i32, align 4
+  %responseZ = alloca i32, align 4
+  %valueX = alloca i8, align 1
+  %valueY = alloca i8, align 1
+  %valueZ = alloca i8, align 1
+
+  call void @getX(i32* %responseX, i8* %valueX)
+  call void @getY(i32* %responseY, i8* %valueY)
+  call void @getZ(i32* %responseZ, i8* %valueZ)
+
+  %0 = load i32* %responseX
+  %1 = load i8* %valueX
+  %call = call float @f_nofp(i32 %0, i8 zeroext %1)
+  %2 = load i32* %responseZ
+  %3 = load i8* %valueZ
+  %call2 = call float @f_nofp(i32 %2, i8 zeroext %3)
+  %call3 = call float @fabsf(float %call)
+  %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
+  br i1 %cmp, label %if.end12, label %if.else
+
+if.else:                                          ; preds = %entry
+  %4 = load i32* %responseY
+  %5 = load i8* %valueY
+  %call1 = call float @f_nofp(i32 %4, i8 zeroext %5)
+  %call4 = call float @fabsf(float %call1)
+  %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
+  br i1 %cmp5, label %if.end12, label %if.else7
+
+if.else7:                                         ; preds = %if.else
+  %call8 = call float @fabsf(float %call2)
+  %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
+  br i1 %cmp9, label %if.then10, label %if.end12
+
+if.then10:                                        ; preds = %if.else7
+  br label %if.end12
+
+if.end12:                                         ; preds = %if.else, %entry, %if.then10, %if.else7
+  %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
+  ret i32 %success.0
+}
+
+define i32 @test_hasfp() #0 {
+; f_hasfp()  does not have the "use-soft-float" attribute, so it should get inlined.
+; CHECK-LABEL: test_hasfp
+; CHECK-NOT: call float @f_hasfp 
+entry:
+  %responseX = alloca i32, align 4
+  %responseY = alloca i32, align 4
+  %responseZ = alloca i32, align 4
+  %valueX = alloca i8, align 1
+  %valueY = alloca i8, align 1
+  %valueZ = alloca i8, align 1
+
+  call void @getX(i32* %responseX, i8* %valueX)
+  call void @getY(i32* %responseY, i8* %valueY)
+  call void @getZ(i32* %responseZ, i8* %valueZ)
+
+  %0 = load i32* %responseX
+  %1 = load i8* %valueX
+  %call = call float @f_hasfp(i32 %0, i8 zeroext %1)
+  %2 = load i32* %responseZ
+  %3 = load i8* %valueZ
+  %call2 = call float @f_hasfp(i32 %2, i8 zeroext %3)
+  %call3 = call float @fabsf(float %call)
+  %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
+  br i1 %cmp, label %if.end12, label %if.else
+
+if.else:                                          ; preds = %entry
+  %4 = load i32* %responseY
+  %5 = load i8* %valueY
+  %call1 = call float @f_hasfp(i32 %4, i8 zeroext %5)
+  %call4 = call float @fabsf(float %call1)
+  %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
+  br i1 %cmp5, label %if.end12, label %if.else7
+
+if.else7:                                         ; preds = %if.else
+  %call8 = call float @fabsf(float %call2)
+  %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
+  br i1 %cmp9, label %if.then10, label %if.end12
+
+if.then10:                                        ; preds = %if.else7
+  br label %if.end12
+
+if.end12:                                         ; preds = %if.else, %entry, %if.then10, %if.else7
+  %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
+  ret i32 %success.0
+}
+
+declare void @getX(i32*, i8*) #0
+
+declare void @getY(i32*, i8*) #0
+
+declare void @getZ(i32*, i8*) #0
+
+define internal float @f_hasfp(i32 %response, i8 zeroext %value1) #0 {
+entry:
+  %conv = zext i8 %value1 to i32
+  %sub = add nsw i32 %conv, -1
+  %conv1 = sitofp i32 %sub to float
+  %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+  %mul = fmul float %0, 2.620000e+03
+  %conv2 = sitofp i32 %response to float
+  %sub3 = fsub float %conv2, %mul
+  %div = fdiv float %sub3, %mul
+  ret float %div
+}
+
+define internal float @f_nofp(i32 %response, i8 zeroext %value1) #1 {
+entry:
+  %conv = zext i8 %value1 to i32
+  %sub = add nsw i32 %conv, -1
+  %conv1 = sitofp i32 %sub to float
+  %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+  %mul = fmul float %0, 2.620000e+03
+  %conv2 = sitofp i32 %response to float
+  %sub3 = fsub float %conv2, %mul
+  %div = fdiv float %sub3, %mul
+  ret float %div
+}
+
+declare float @fabsf(float) optsize minsize
+
+declare float @llvm.pow.f32(float, float) optsize minsize
+
+attributes #0 = { minsize optsize }
+attributes #1 = { minsize optsize "use-soft-float"="true" }
diff --git a/test/Transforms/Inline/inline-indirect.ll b/test/Transforms/Inline/inline-indirect.ll
new file mode 100644
index 0000000..f6eb528
--- /dev/null
+++ b/test/Transforms/Inline/inline-indirect.ll
@@ -0,0 +1,19 @@
+; RUN: opt < %s -inline -disable-output 2>/dev/null
+; This test used to trigger an assertion in the assumption cache when
+; inlining the indirect call
+declare void @llvm.assume(i1)
+
+define void @foo() {
+  ret void
+}
+
+define void @bar(void ()*) {
+  call void @llvm.assume(i1 true)
+  call void %0();
+  ret void
+}
+
+define void @baz() {
+  call void @bar(void ()* @foo)
+  ret void
+}
diff --git a/test/Transforms/Inline/inline-vla.ll b/test/Transforms/Inline/inline-vla.ll
index dc9deaa..7da448b 100644
--- a/test/Transforms/Inline/inline-vla.ll
+++ b/test/Transforms/Inline/inline-vla.ll
@@ -35,4 +35,4 @@ attributes #2 = { nounwind }
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.5.0 (trunk 205695) (llvm/trunk 205706)"}
+!0 = !{!"clang version 3.5.0 (trunk 205695) (llvm/trunk 205706)"}
diff --git a/test/Transforms/Inline/inline_dbg_declare.ll b/test/Transforms/Inline/inline_dbg_declare.ll
new file mode 100644
index 0000000..d296678
--- /dev/null
+++ b/test/Transforms/Inline/inline_dbg_declare.ll
@@ -0,0 +1,97 @@
+; RUN: opt < %s -S -inline | FileCheck %s
+;
+; The purpose of this test is to check that inline pass preserves debug info
+; for variable using the dbg.declare intrinsic.
+;
+;; This test was generated by running this command:
+;; clang.exe -S -O0 -emit-llvm -g foo.c
+;;
+;; foo.c
+;; ==========================
+;; float foo(float x)
+;; {
+;;    return x;
+;; }
+;;
+;; void bar(float *dst)
+;; {
+;;    dst[0] = foo(dst[0]);
+;; }
+;; ==========================
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+; Function Attrs: nounwind
+define float @foo(float %x) #0 {
+entry:
+  %x.addr = alloca float, align 4
+  store float %x, float* %x.addr, align 4
+  call void @llvm.dbg.declare(metadata float* %x.addr, metadata !16, metadata !17), !dbg !18
+  %0 = load float* %x.addr, align 4, !dbg !19
+  ret float %0, !dbg !19
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; CHECK: define void @bar
+
+; Function Attrs: nounwind
+define void @bar(float* %dst) #0 {
+entry:
+
+; CHECK: [[x_addr_i:%[a-zA-Z0-9.]+]] = alloca float, align 4
+
+  %dst.addr = alloca float*, align 4
+  store float* %dst, float** %dst.addr, align 4
+  call void @llvm.dbg.declare(metadata float** %dst.addr, metadata !20, metadata !17), !dbg !21
+  %0 = load float** %dst.addr, align 4, !dbg !22
+  %arrayidx = getelementptr inbounds float* %0, i32 0, !dbg !22
+  %1 = load float* %arrayidx, align 4, !dbg !22
+  %call = call float @foo(float %1), !dbg !22
+
+; CHECK-NOT: call float @foo
+; CHECK: void @llvm.dbg.declare(metadata float* [[x_addr_i]], metadata [[m23:![0-9]+]], metadata !17), !dbg [[m24:![0-9]+]]
+
+  %2 = load float** %dst.addr, align 4, !dbg !22
+  %arrayidx1 = getelementptr inbounds float* %2, i32 0, !dbg !22
+  store float %call, float* %arrayidx1, align 4, !dbg !22
+  ret void, !dbg !23
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!13, !14}
+!llvm.ident = !{!15}
+
+!0 = !{!"0x11\0012\00clang version 3.6.0 (trunk)\000\00\000\00\001", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [foo.c] [DW_LANG_C99]
+!1 = !{!"foo.c", !""}
+!2 = !{}
+!3 = !{!4, !9}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\000\00256\000\002", !1, !5, !6, null, float (float)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [foo]
+!5 = !{!"0x29", !1}    ; [ DW_TAG_file_type ] [foo.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8}
+!8 = !{!"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!9 = !{!"0x2e\00bar\00bar\00\006\000\001\000\000\00256\000\007", !1, !5, !10, null, void (float*)* @bar, null, null, !2} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 7] [bar]
+!10 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{null, !12}
+!12 = !{!"0xf\00\000\0032\0032\000\000", null, null, !8} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from float]
+!13 = !{i32 2, !"Dwarf Version", i32 4}
+!14 = !{i32 2, !"Debug Info Version", i32 2}
+!15 = !{!"clang version 3.6.0 (trunk)"}
+!16 = !{!"0x101\00x\0016777217\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [x] [line 1]
+!17 = !{!"0x102"}               ; [ DW_TAG_expression ]
+!18 = !MDLocation(line: 1, column: 17, scope: !4)
+!19 = !MDLocation(line: 3, column: 5, scope: !4)
+!20 = !{!"0x101\00dst\0016777222\000", !9, !5, !12} ; [ DW_TAG_arg_variable ] [dst] [line 6]
+!21 = !MDLocation(line: 6, column: 17, scope: !9)
+!22 = !MDLocation(line: 8, column: 14, scope: !9)
+!23 = !MDLocation(line: 9, column: 1, scope: !9)
+
+; CHECK: [[CALL_SITE:![0-9]+]] = distinct !MDLocation(line: 8, column: 14, scope: !9)
+; CHECK: [[m23]] = !{!"0x101\00x\0016777217\000", !4, !5, !8, [[CALL_SITE]]} ; [ DW_TAG_arg_variable ] [x] [line 1]
+; CHECK: [[m24]] = !MDLocation(line: 1, column: 17, scope: !4, inlinedAt: [[CALL_SITE]])
diff --git a/test/Transforms/Inline/noalias-calls.ll b/test/Transforms/Inline/noalias-calls.ll
index 13408e4..c09d2a6 100644
--- a/test/Transforms/Inline/noalias-calls.ll
+++ b/test/Transforms/Inline/noalias-calls.ll
@@ -35,10 +35,10 @@ entry:
 attributes #0 = { nounwind }
 attributes #1 = { nounwind uwtable }
 
-; CHECK: !0 = metadata !{metadata !1}
-; CHECK: !1 = metadata !{metadata !1, metadata !2, metadata !"hello: %c"}
-; CHECK: !2 = metadata !{metadata !2, metadata !"hello"}
-; CHECK: !3 = metadata !{metadata !4}
-; CHECK: !4 = metadata !{metadata !4, metadata !2, metadata !"hello: %a"}
-; CHECK: !5 = metadata !{metadata !4, metadata !1}
+; CHECK: !0 = !{!1}
+; CHECK: !1 = distinct !{!1, !2, !"hello: %c"}
+; CHECK: !2 = distinct !{!2, !"hello"}
+; CHECK: !3 = !{!4}
+; CHECK: !4 = distinct !{!4, !2, !"hello: %a"}
+; CHECK: !5 = !{!4, !1}
 
diff --git a/test/Transforms/Inline/noalias-cs.ll b/test/Transforms/Inline/noalias-cs.ll
index acd9021..da5ddd6 100644
--- a/test/Transforms/Inline/noalias-cs.ll
+++ b/test/Transforms/Inline/noalias-cs.ll
@@ -46,39 +46,39 @@ entry:
 
 attributes #0 = { nounwind uwtable }
 
-!0 = metadata !{metadata !1}
-!1 = metadata !{metadata !1, metadata !2, metadata !"hello: %a"}
-!2 = metadata !{metadata !2, metadata !"hello"}
-!3 = metadata !{metadata !4, metadata !6}
-!4 = metadata !{metadata !4, metadata !5, metadata !"hello2: %a"}
-!5 = metadata !{metadata !5, metadata !"hello2"}
-!6 = metadata !{metadata !6, metadata !5, metadata !"hello2: %b"}
-!7 = metadata !{metadata !4}
-!8 = metadata !{metadata !6}
+!0 = !{!1}
+!1 = distinct !{!1, !2, !"hello: %a"}
+!2 = distinct !{!2, !"hello"}
+!3 = !{!4, !6}
+!4 = distinct !{!4, !5, !"hello2: %a"}
+!5 = distinct !{!5, !"hello2"}
+!6 = distinct !{!6, !5, !"hello2: %b"}
+!7 = !{!4}
+!8 = !{!6}
 
-; CHECK: !0 = metadata !{metadata !1, metadata !3}
-; CHECK: !1 = metadata !{metadata !1, metadata !2, metadata !"hello2: %a"}
-; CHECK: !2 = metadata !{metadata !2, metadata !"hello2"}
-; CHECK: !3 = metadata !{metadata !3, metadata !2, metadata !"hello2: %b"}
-; CHECK: !4 = metadata !{metadata !1}
-; CHECK: !5 = metadata !{metadata !3}
-; CHECK: !6 = metadata !{metadata !7, metadata !9, metadata !10}
-; CHECK: !7 = metadata !{metadata !7, metadata !8, metadata !"hello2: %a"}
-; CHECK: !8 = metadata !{metadata !8, metadata !"hello2"}
-; CHECK: !9 = metadata !{metadata !9, metadata !8, metadata !"hello2: %b"}
-; CHECK: !10 = metadata !{metadata !10, metadata !11, metadata !"hello: %a"}
-; CHECK: !11 = metadata !{metadata !11, metadata !"hello"}
-; CHECK: !12 = metadata !{metadata !7}
-; CHECK: !13 = metadata !{metadata !9, metadata !10}
-; CHECK: !14 = metadata !{metadata !9}
-; CHECK: !15 = metadata !{metadata !7, metadata !10}
-; CHECK: !16 = metadata !{metadata !10}
-; CHECK: !17 = metadata !{metadata !18, metadata !20}
-; CHECK: !18 = metadata !{metadata !18, metadata !19, metadata !"hello2: %a"}
-; CHECK: !19 = metadata !{metadata !19, metadata !"hello2"}
-; CHECK: !20 = metadata !{metadata !20, metadata !19, metadata !"hello2: %b"}
-; CHECK: !21 = metadata !{metadata !18, metadata !10}
-; CHECK: !22 = metadata !{metadata !20}
-; CHECK: !23 = metadata !{metadata !20, metadata !10}
-; CHECK: !24 = metadata !{metadata !18}
+; CHECK: !0 = !{!1, !3}
+; CHECK: !1 = distinct !{!1, !2, !"hello2: %a"}
+; CHECK: !2 = distinct !{!2, !"hello2"}
+; CHECK: !3 = distinct !{!3, !2, !"hello2: %b"}
+; CHECK: !4 = !{!1}
+; CHECK: !5 = !{!3}
+; CHECK: !6 = !{!7, !9, !10}
+; CHECK: !7 = distinct !{!7, !8, !"hello2: %a"}
+; CHECK: !8 = distinct !{!8, !"hello2"}
+; CHECK: !9 = distinct !{!9, !8, !"hello2: %b"}
+; CHECK: !10 = distinct !{!10, !11, !"hello: %a"}
+; CHECK: !11 = distinct !{!11, !"hello"}
+; CHECK: !12 = !{!7}
+; CHECK: !13 = !{!9, !10}
+; CHECK: !14 = !{!9}
+; CHECK: !15 = !{!7, !10}
+; CHECK: !16 = !{!10}
+; CHECK: !17 = !{!18, !20}
+; CHECK: !18 = distinct !{!18, !19, !"hello2: %a"}
+; CHECK: !19 = distinct !{!19, !"hello2"}
+; CHECK: !20 = distinct !{!20, !19, !"hello2: %b"}
+; CHECK: !21 = !{!18, !10}
+; CHECK: !22 = !{!20}
+; CHECK: !23 = !{!20, !10}
+; CHECK: !24 = !{!18}
 
diff --git a/test/Transforms/Inline/noalias.ll b/test/Transforms/Inline/noalias.ll
index 7a54d5d..674da1e 100644
--- a/test/Transforms/Inline/noalias.ll
+++ b/test/Transforms/Inline/noalias.ll
@@ -64,13 +64,13 @@ entry:
 
 attributes #0 = { nounwind uwtable }
 
-; CHECK: !0 = metadata !{metadata !1}
-; CHECK: !1 = metadata !{metadata !1, metadata !2, metadata !"hello: %a"}
-; CHECK: !2 = metadata !{metadata !2, metadata !"hello"}
-; CHECK: !3 = metadata !{metadata !4, metadata !6}
-; CHECK: !4 = metadata !{metadata !4, metadata !5, metadata !"hello2: %a"}
-; CHECK: !5 = metadata !{metadata !5, metadata !"hello2"}
-; CHECK: !6 = metadata !{metadata !6, metadata !5, metadata !"hello2: %b"}
-; CHECK: !7 = metadata !{metadata !4}
-; CHECK: !8 = metadata !{metadata !6}
+; CHECK: !0 = !{!1}
+; CHECK: !1 = distinct !{!1, !2, !"hello: %a"}
+; CHECK: !2 = distinct !{!2, !"hello"}
+; CHECK: !3 = !{!4, !6}
+; CHECK: !4 = distinct !{!4, !5, !"hello2: %a"}
+; CHECK: !5 = distinct !{!5, !"hello2"}
+; CHECK: !6 = distinct !{!6, !5, !"hello2: %b"}
+; CHECK: !7 = !{!4}
+; CHECK: !8 = !{!6}
 
diff --git a/test/Transforms/Inline/noalias2.ll b/test/Transforms/Inline/noalias2.ll
index a4b38b0..9c8f8e2 100644
--- a/test/Transforms/Inline/noalias2.ll
+++ b/test/Transforms/Inline/noalias2.ll
@@ -71,27 +71,27 @@ entry:
 ; CHECK:   ret void
 ; CHECK: }
 
-; CHECK: !0 = metadata !{metadata !1}
-; CHECK: !1 = metadata !{metadata !1, metadata !2, metadata !"hello: %c"}
-; CHECK: !2 = metadata !{metadata !2, metadata !"hello"}
-; CHECK: !3 = metadata !{metadata !4}
-; CHECK: !4 = metadata !{metadata !4, metadata !2, metadata !"hello: %a"}
-; CHECK: !5 = metadata !{metadata !6, metadata !8}
-; CHECK: !6 = metadata !{metadata !6, metadata !7, metadata !"hello: %c"}
-; CHECK: !7 = metadata !{metadata !7, metadata !"hello"}
-; CHECK: !8 = metadata !{metadata !8, metadata !9, metadata !"foo: %c"}
-; CHECK: !9 = metadata !{metadata !9, metadata !"foo"}
-; CHECK: !10 = metadata !{metadata !11, metadata !12}
-; CHECK: !11 = metadata !{metadata !11, metadata !7, metadata !"hello: %a"}
-; CHECK: !12 = metadata !{metadata !12, metadata !9, metadata !"foo: %a"}
-; CHECK: !13 = metadata !{metadata !8}
-; CHECK: !14 = metadata !{metadata !12}
-; CHECK: !15 = metadata !{metadata !16, metadata !18}
-; CHECK: !16 = metadata !{metadata !16, metadata !17, metadata !"hello2: %a"}
-; CHECK: !17 = metadata !{metadata !17, metadata !"hello2"}
-; CHECK: !18 = metadata !{metadata !18, metadata !17, metadata !"hello2: %b"}
-; CHECK: !19 = metadata !{metadata !16}
-; CHECK: !20 = metadata !{metadata !18}
+; CHECK: !0 = !{!1}
+; CHECK: !1 = distinct !{!1, !2, !"hello: %c"}
+; CHECK: !2 = distinct !{!2, !"hello"}
+; CHECK: !3 = !{!4}
+; CHECK: !4 = distinct !{!4, !2, !"hello: %a"}
+; CHECK: !5 = !{!6, !8}
+; CHECK: !6 = distinct !{!6, !7, !"hello: %c"}
+; CHECK: !7 = distinct !{!7, !"hello"}
+; CHECK: !8 = distinct !{!8, !9, !"foo: %c"}
+; CHECK: !9 = distinct !{!9, !"foo"}
+; CHECK: !10 = !{!11, !12}
+; CHECK: !11 = distinct !{!11, !7, !"hello: %a"}
+; CHECK: !12 = distinct !{!12, !9, !"foo: %a"}
+; CHECK: !13 = !{!8}
+; CHECK: !14 = !{!12}
+; CHECK: !15 = !{!16, !18}
+; CHECK: !16 = distinct !{!16, !17, !"hello2: %a"}
+; CHECK: !17 = distinct !{!17, !"hello2"}
+; CHECK: !18 = distinct !{!18, !17, !"hello2: %b"}
+; CHECK: !19 = !{!16}
+; CHECK: !20 = !{!18}
 
 attributes #0 = { nounwind uwtable }
 
diff --git a/test/Transforms/Inline/optimization-remarks.ll b/test/Transforms/Inline/optimization-remarks.ll
index 9108f3a..fb1b047 100644
--- a/test/Transforms/Inline/optimization-remarks.ll
+++ b/test/Transforms/Inline/optimization-remarks.ll
@@ -57,4 +57,4 @@ attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.5.0 "}
+!0 = !{!"clang version 3.5.0 "}
diff --git a/test/Transforms/Inline/pr21206.ll b/test/Transforms/Inline/pr21206.ll
index 1a4366e..e460030 100644
--- a/test/Transforms/Inline/pr21206.ll
+++ b/test/Transforms/Inline/pr21206.ll
@@ -3,15 +3,15 @@
 $c = comdat any
 ; CHECK: $c = comdat any
 
-define linkonce_odr void @foo() comdat $c {
+define linkonce_odr void @foo() comdat($c) {
   ret void
 }
-; CHECK: define linkonce_odr void @foo() comdat $c
+; CHECK: define linkonce_odr void @foo() comdat($c)
 
-define linkonce_odr void @bar() comdat $c {
+define linkonce_odr void @bar() comdat($c) {
   ret void
 }
-; CHECK: define linkonce_odr void @bar() comdat $c
+; CHECK: define linkonce_odr void @bar() comdat($c)
 
 define void()* @zed()  {
   ret void()* @foo
diff --git a/test/Transforms/InstCombine/2008-05-23-CompareFold.ll b/test/Transforms/InstCombine/2008-05-23-CompareFold.ll
index acb259b..b729677 100644
--- a/test/Transforms/InstCombine/2008-05-23-CompareFold.ll
+++ b/test/Transforms/InstCombine/2008-05-23-CompareFold.ll
@@ -1,5 +1,8 @@
-; RUN: opt < %s -instcombine -S | grep "ret i1 false"
+; RUN: opt -instcombine -S < %s | FileCheck %s
 ; PR2359
+
+; CHECK-LABEL: @f(
+; CHECK: ret i1 false
 define i1 @f(i8* %x) {
 entry:
        %tmp462 = load i8* %x, align 1          ; <i8> [#uses=1]
diff --git a/test/Transforms/InstCombine/2008-11-08-FCmp.ll b/test/Transforms/InstCombine/2008-11-08-FCmp.ll
index f33a1f5..f1af7ce 100644
--- a/test/Transforms/InstCombine/2008-11-08-FCmp.ll
+++ b/test/Transforms/InstCombine/2008-11-08-FCmp.ll
@@ -4,6 +4,7 @@
 ; When inst combining an FCMP with the LHS coming from a uitofp instruction, we
 ; can't lower it to signed ICMP instructions.
 
+; CHECK-LABEL: @test1(
 define i1 @test1(i32 %val) {
   %1 = uitofp i32 %val to double
   %2 = fcmp ole double %1, 0.000000e+00
@@ -11,6 +12,7 @@ define i1 @test1(i32 %val) {
   ret i1 %2
 }
 
+; CHECK-LABEL: @test2(
 define i1 @test2(i32 %val) {
   %1 = uitofp i32 %val to double
   %2 = fcmp olt double %1, 0.000000e+00
@@ -18,6 +20,7 @@ define i1 @test2(i32 %val) {
 ; CHECK: ret i1 false
 }
 
+; CHECK-LABEL: @test3(
 define i1 @test3(i32 %val) {
   %1 = uitofp i32 %val to double
   %2 = fcmp oge double %1, 0.000000e+00
@@ -25,6 +28,7 @@ define i1 @test3(i32 %val) {
 ; CHECK: ret i1 true
 }
 
+; CHECK-LABEL: @test4(
 define i1 @test4(i32 %val) {
   %1 = uitofp i32 %val to double
   %2 = fcmp ogt double %1, 0.000000e+00
@@ -32,6 +36,7 @@ define i1 @test4(i32 %val) {
   ret i1 %2
 }
 
+; CHECK-LABEL: @test5(
 define i1 @test5(i32 %val) {
   %1 = uitofp i32 %val to double
   %2 = fcmp ogt double %1, -4.400000e+00
@@ -39,6 +44,7 @@ define i1 @test5(i32 %val) {
 ; CHECK: ret i1 true
 }
 
+; CHECK-LABEL: @test6(
 define i1 @test6(i32 %val) {
   %1 = uitofp i32 %val to double
   %2 = fcmp olt double %1, -4.400000e+00
@@ -48,6 +54,7 @@ define i1 @test6(i32 %val) {
 
 ; Check that optimizing unsigned >= comparisons correctly distinguishes
 ; positive and negative constants.  <rdar://problem/12029145>
+; CHECK-LABEL: @test7(
 define i1 @test7(i32 %val) {
   %1 = uitofp i32 %val to double
   %2 = fcmp oge double %1, 3.200000e+00
diff --git a/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll b/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll
index 895b260..c8f0351 100644
--- a/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll
+++ b/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll
@@ -50,7 +50,8 @@ define void @fu2(i32 %parm) nounwind ssp {
   %7 = add  i32 %6, 2048
 ; CHECK: alloca i8
   %8 = alloca i8, i32 %7
-; CHECK-NEXT: bitcast i8*
+; CHECK-NEXT: bitcast double**
+; CHECK-NEXT: store i8*
   %9 = bitcast i8* %8 to double*
   store double* %9, double** %ptr, align 4
   br label %10
diff --git a/test/Transforms/InstCombine/AddOverFlow.ll b/test/Transforms/InstCombine/AddOverFlow.ll
index 8f3d429..bebfd62 100644
--- a/test/Transforms/InstCombine/AddOverFlow.ll
+++ b/test/Transforms/InstCombine/AddOverFlow.ll
@@ -36,8 +36,8 @@ define i16 @zero_sign_bit2(i16 %a, i16 %b) {
 
 declare i16 @bounded(i16 %input);
 declare i32 @__gxx_personality_v0(...);
-!0 = metadata !{i16 0, i16 32768} ; [0, 32767]
-!1 = metadata !{i16 0, i16 32769} ; [0, 32768]
+!0 = !{i16 0, i16 32768} ; [0, 32767]
+!1 = !{i16 0, i16 32769} ; [0, 32768]
 
 define i16 @add_bounded_values(i16 %a, i16 %b) {
 ; CHECK-LABEL: @add_bounded_values(
diff --git a/test/Transforms/InstCombine/LandingPadClauses.ll b/test/Transforms/InstCombine/LandingPadClauses.ll
index 10af4bc..0d42f7c 100644
--- a/test/Transforms/InstCombine/LandingPadClauses.ll
+++ b/test/Transforms/InstCombine/LandingPadClauses.ll
@@ -7,6 +7,7 @@
 declare i32 @generic_personality(i32, i64, i8*, i8*)
 declare i32 @__gxx_personality_v0(i32, i64, i8*, i8*)
 declare i32 @__objc_personality_v0(i32, i64, i8*, i8*)
+declare i32 @__C_specific_handler(...)
 
 declare void @bar()
 
@@ -231,3 +232,54 @@ lpad.d:
 ; CHECK-NEXT: null
 ; CHECK-NEXT: unreachable
 }
+
+define void @foo_seh() {
+; CHECK-LABEL: @foo_seh(
+  invoke void @bar()
+    to label %cont.a unwind label %lpad.a
+cont.a:
+  invoke void @bar()
+    to label %cont.b unwind label %lpad.b
+cont.b:
+  invoke void @bar()
+    to label %cont.c unwind label %lpad.c
+cont.c:
+  invoke void @bar()
+    to label %cont.d unwind label %lpad.d
+cont.d:
+  ret void
+
+lpad.a:
+  %a = landingpad { i8*, i32 } personality i32 (...)* @__C_specific_handler
+          catch i32* null
+          catch i32* @T1
+  unreachable
+; CHECK: %a = landingpad
+; CHECK-NEXT: null
+; CHECK-NEXT: unreachable
+
+lpad.b:
+  %b = landingpad { i8*, i32 } personality i32 (...)* @__C_specific_handler
+          filter [1 x i32*] zeroinitializer
+  unreachable
+; CHECK: %b = landingpad
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: unreachable
+
+lpad.c:
+  %c = landingpad { i8*, i32 } personality i32 (...)* @__C_specific_handler
+          filter [2 x i32*] [i32* @T1, i32* null]
+  unreachable
+; CHECK: %c = landingpad
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: unreachable
+
+lpad.d:
+  %d = landingpad { i8*, i32 } personality i32 (...)* @__C_specific_handler
+          cleanup
+          catch i32* null
+  unreachable
+; CHECK: %d = landingpad
+; CHECK-NEXT: null
+; CHECK-NEXT: unreachable
+}
diff --git a/test/Transforms/InstCombine/add2.ll b/test/Transforms/InstCombine/add2.ll
index a166e5f..fbbba59 100644
--- a/test/Transforms/InstCombine/add2.ll
+++ b/test/Transforms/InstCombine/add2.ll
@@ -219,7 +219,7 @@ define i16 @mul_add_to_mul_1(i16 %x) {
  %add2 = add nsw i16 %x, %mul1
  ret i16 %add2
 ; CHECK-LABEL: @mul_add_to_mul_1(
-; CHECK-NEXT: %add2 = mul i16 %x, 9
+; CHECK-NEXT: %add2 = mul nsw i16 %x, 9
 ; CHECK-NEXT: ret i16 %add2
 }
 
@@ -228,7 +228,7 @@ define i16 @mul_add_to_mul_2(i16 %x) {
  %add2 = add nsw i16 %mul1, %x
  ret i16 %add2
 ; CHECK-LABEL: @mul_add_to_mul_2(
-; CHECK-NEXT: %add2 = mul i16 %x, 9
+; CHECK-NEXT: %add2 = mul nsw i16 %x, 9
 ; CHECK-NEXT: ret i16 %add2
 }
 
@@ -248,7 +248,7 @@ define i16 @mul_add_to_mul_4(i16 %a) {
  %add = add nsw i16 %mul1, %mul2
  ret i16 %add
 ; CHECK-LABEL: @mul_add_to_mul_4(
-; CHECK-NEXT: %add = mul i16 %a, 9
+; CHECK-NEXT: %add = mul nsw i16 %a, 9
 ; CHECK-NEXT: ret i16 %add
 }
 
@@ -294,7 +294,7 @@ define i16 @add_cttz(i16 %a) {
   ret i16 %b
 }
 declare i16 @llvm.cttz.i16(i16, i1)
-!0 = metadata !{i16 0, i16 8}
+!0 = !{i16 0, i16 8}
 
 ; Similar to @add_cttz, but in this test, the range implied by the
 ; intrinsic is more strict. Therefore, ValueTracking uses that range.
@@ -312,7 +312,7 @@ define i16 @add_cttz_2(i16 %a) {
 ; CHECK: or i16 %cttz, -16
   ret i16 %b
 }
-!1 = metadata !{i16 0, i16 32}
+!1 = !{i16 0, i16 32}
 
 define i32 @add_or_and(i32 %x, i32 %y) {
   %or = or i32 %x, %y
diff --git a/test/Transforms/InstCombine/addnegneg.ll b/test/Transforms/InstCombine/addnegneg.ll
index ad8791d..90f6baf 100644
--- a/test/Transforms/InstCombine/addnegneg.ll
+++ b/test/Transforms/InstCombine/addnegneg.ll
@@ -9,4 +9,3 @@ entry:
 	%sub6 = add i32 %sub4, %d		; <i32> [#uses=1]
 	ret i32 %sub6
 }
-
diff --git a/test/Transforms/InstCombine/alias-recursion.ll b/test/Transforms/InstCombine/alias-recursion.ll
new file mode 100644
index 0000000..fa63726
--- /dev/null
+++ b/test/Transforms/InstCombine/alias-recursion.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+%class.A = type { i32 (...)** }
+
+@0 = constant [1 x i8*] zeroinitializer
+
+@vtbl = alias getelementptr inbounds ([1 x i8*]* @0, i32 0, i32 0)
+
+define i32 (%class.A*)* @test() {
+; CHECK-LABEL: test
+entry:
+  br i1 undef, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body, %entry
+  br i1 undef, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  %A = phi i32 (%class.A*)** [ bitcast (i8** @vtbl to i32 (%class.A*)**), %for.body ], [ null, %entry ]
+  %B = load i32 (%class.A*)** %A
+  ret i32 (%class.A*)* %B
+}
diff --git a/test/Transforms/InstCombine/aligned-altivec.ll b/test/Transforms/InstCombine/aligned-altivec.ll
new file mode 100644
index 0000000..6ac2691
--- /dev/null
+++ b/test/Transforms/InstCombine/aligned-altivec.ll
@@ -0,0 +1,131 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <4 x i32> @llvm.ppc.altivec.lvx(i8*) #1
+
+define <4 x i32> @test1(<4 x i32>* %h) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  %vl = call <4 x i32> @llvm.ppc.altivec.lvx(i8* %hv)
+
+; CHECK-LABEL: @test1
+; CHECK: @llvm.ppc.altivec.lvx
+; CHECK: ret <4 x i32>
+
+  %v0 = load <4 x i32>* %h, align 8
+  %a = add <4 x i32> %v0, %vl
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @test1a(<4 x i32>* align 16 %h) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  %vl = call <4 x i32> @llvm.ppc.altivec.lvx(i8* %hv)
+
+; CHECK-LABEL: @test1a
+; CHECK-NOT: @llvm.ppc.altivec.lvx
+; CHECK: ret <4 x i32>
+
+  %v0 = load <4 x i32>* %h, align 8
+  %a = add <4 x i32> %v0, %vl
+  ret <4 x i32> %a
+}
+
+declare void @llvm.ppc.altivec.stvx(<4 x i32>, i8*) #0
+
+define <4 x i32> @test2(<4 x i32>* %h, <4 x i32> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  call void @llvm.ppc.altivec.stvx(<4 x i32> %d, i8* %hv)
+
+  %v0 = load <4 x i32>* %h, align 8
+  ret <4 x i32> %v0
+
+; CHECK-LABEL: @test2
+; CHECK: @llvm.ppc.altivec.stvx
+; CHECK: ret <4 x i32>
+}
+
+define <4 x i32> @test2a(<4 x i32>* align 16 %h, <4 x i32> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  call void @llvm.ppc.altivec.stvx(<4 x i32> %d, i8* %hv)
+
+  %v0 = load <4 x i32>* %h, align 8
+  ret <4 x i32> %v0
+
+; CHECK-LABEL: @test2
+; CHECK-NOT: @llvm.ppc.altivec.stvx
+; CHECK: ret <4 x i32>
+}
+
+declare <4 x i32> @llvm.ppc.altivec.lvxl(i8*) #1
+
+define <4 x i32> @test1l(<4 x i32>* %h) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  %vl = call <4 x i32> @llvm.ppc.altivec.lvxl(i8* %hv)
+
+; CHECK-LABEL: @test1l
+; CHECK: @llvm.ppc.altivec.lvxl
+; CHECK: ret <4 x i32>
+
+  %v0 = load <4 x i32>* %h, align 8
+  %a = add <4 x i32> %v0, %vl
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @test1la(<4 x i32>* align 16 %h) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  %vl = call <4 x i32> @llvm.ppc.altivec.lvxl(i8* %hv)
+
+; CHECK-LABEL: @test1la
+; CHECK-NOT: @llvm.ppc.altivec.lvxl
+; CHECK: ret <4 x i32>
+
+  %v0 = load <4 x i32>* %h, align 8
+  %a = add <4 x i32> %v0, %vl
+  ret <4 x i32> %a
+}
+
+declare void @llvm.ppc.altivec.stvxl(<4 x i32>, i8*) #0
+
+define <4 x i32> @test2l(<4 x i32>* %h, <4 x i32> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  call void @llvm.ppc.altivec.stvxl(<4 x i32> %d, i8* %hv)
+
+  %v0 = load <4 x i32>* %h, align 8
+  ret <4 x i32> %v0
+
+; CHECK-LABEL: @test2l
+; CHECK: @llvm.ppc.altivec.stvxl
+; CHECK: ret <4 x i32>
+}
+
+define <4 x i32> @test2la(<4 x i32>* align 16 %h, <4 x i32> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  call void @llvm.ppc.altivec.stvxl(<4 x i32> %d, i8* %hv)
+
+  %v0 = load <4 x i32>* %h, align 8
+  ret <4 x i32> %v0
+
+; CHECK-LABEL: @test2l
+; CHECK-NOT: @llvm.ppc.altivec.stvxl
+; CHECK: ret <4 x i32>
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+
diff --git a/test/Transforms/InstCombine/aligned-qpx.ll b/test/Transforms/InstCombine/aligned-qpx.ll
new file mode 100644
index 0000000..c8a1f6f
--- /dev/null
+++ b/test/Transforms/InstCombine/aligned-qpx.ll
@@ -0,0 +1,162 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <4 x double> @llvm.ppc.qpx.qvlfs(i8*) #1
+
+define <4 x double> @test1(<4 x float>* %h) #0 {
+entry:
+  %h1 = getelementptr <4 x float>* %h, i64 1
+  %hv = bitcast <4 x float>* %h1 to i8*
+  %vl = call <4 x double> @llvm.ppc.qpx.qvlfs(i8* %hv)
+
+; CHECK-LABEL: @test1
+; CHECK: @llvm.ppc.qpx.qvlfs
+; CHECK: ret <4 x double>
+
+  %v0 = load <4 x float>* %h, align 8
+  %v0e = fpext <4 x float> %v0 to <4 x double>
+  %a = fadd <4 x double> %v0e, %vl
+  ret <4 x double> %a
+}
+
+define <4 x double> @test1a(<4 x float>* align 16 %h) #0 {
+entry:
+  %h1 = getelementptr <4 x float>* %h, i64 1
+  %hv = bitcast <4 x float>* %h1 to i8*
+  %vl = call <4 x double> @llvm.ppc.qpx.qvlfs(i8* %hv)
+
+; CHECK-LABEL: @test1a
+; CHECK-NOT: @llvm.ppc.qpx.qvlfs
+; CHECK: ret <4 x double>
+
+  %v0 = load <4 x float>* %h, align 8
+  %v0e = fpext <4 x float> %v0 to <4 x double>
+  %a = fadd <4 x double> %v0e, %vl
+  ret <4 x double> %a
+}
+
+declare void @llvm.ppc.qpx.qvstfs(<4 x double>, i8*) #0
+
+define <4 x float> @test2(<4 x float>* %h, <4 x double> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x float>* %h, i64 1
+  %hv = bitcast <4 x float>* %h1 to i8*
+  call void @llvm.ppc.qpx.qvstfs(<4 x double> %d, i8* %hv)
+
+  %v0 = load <4 x float>* %h, align 8
+  ret <4 x float> %v0
+
+; CHECK-LABEL: @test2
+; CHECK: @llvm.ppc.qpx.qvstfs
+; CHECK: ret <4 x float>
+}
+
+define <4 x float> @test2a(<4 x float>* align 16 %h, <4 x double> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x float>* %h, i64 1
+  %hv = bitcast <4 x float>* %h1 to i8*
+  call void @llvm.ppc.qpx.qvstfs(<4 x double> %d, i8* %hv)
+
+  %v0 = load <4 x float>* %h, align 8
+  ret <4 x float> %v0
+
+; CHECK-LABEL: @test2
+; CHECK-NOT: @llvm.ppc.qpx.qvstfs
+; CHECK: ret <4 x float>
+}
+
+declare <4 x double> @llvm.ppc.qpx.qvlfd(i8*) #1
+
+define <4 x double> @test1l(<4 x double>* %h) #0 {
+entry:
+  %h1 = getelementptr <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv)
+
+; CHECK-LABEL: @test1l
+; CHECK: @llvm.ppc.qpx.qvlfd
+; CHECK: ret <4 x double>
+
+  %v0 = load <4 x double>* %h, align 8
+  %a = fadd <4 x double> %v0, %vl
+  ret <4 x double> %a
+}
+
+define <4 x double> @test1ln(<4 x double>* align 16 %h) #0 {
+entry:
+  %h1 = getelementptr <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv)
+
+; CHECK-LABEL: @test1ln
+; CHECK: @llvm.ppc.qpx.qvlfd
+; CHECK: ret <4 x double>
+
+  %v0 = load <4 x double>* %h, align 8
+  %a = fadd <4 x double> %v0, %vl
+  ret <4 x double> %a
+}
+
+define <4 x double> @test1la(<4 x double>* align 32 %h) #0 {
+entry:
+  %h1 = getelementptr <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv)
+
+; CHECK-LABEL: @test1la
+; CHECK-NOT: @llvm.ppc.qpx.qvlfd
+; CHECK: ret <4 x double>
+
+  %v0 = load <4 x double>* %h, align 8
+  %a = fadd <4 x double> %v0, %vl
+  ret <4 x double> %a
+}
+
+declare void @llvm.ppc.qpx.qvstfd(<4 x double>, i8*) #0
+
+define <4 x double> @test2l(<4 x double>* %h, <4 x double> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv)
+
+  %v0 = load <4 x double>* %h, align 8
+  ret <4 x double> %v0
+
+; CHECK-LABEL: @test2l
+; CHECK: @llvm.ppc.qpx.qvstfd
+; CHECK: ret <4 x double>
+}
+
+define <4 x double> @test2ln(<4 x double>* align 16 %h, <4 x double> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv)
+
+  %v0 = load <4 x double>* %h, align 8
+  ret <4 x double> %v0
+
+; CHECK-LABEL: @test2ln
+; CHECK: @llvm.ppc.qpx.qvstfd
+; CHECK: ret <4 x double>
+}
+
+define <4 x double> @test2la(<4 x double>* align 32 %h, <4 x double> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv)
+
+  %v0 = load <4 x double>* %h, align 8
+  ret <4 x double> %v0
+
+; CHECK-LABEL: @test2l
+; CHECK-NOT: @llvm.ppc.qpx.qvstfd
+; CHECK: ret <4 x double>
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+
diff --git a/test/Transforms/InstCombine/and-compare.ll b/test/Transforms/InstCombine/and-compare.ll
index c30a245..037641b 100644
--- a/test/Transforms/InstCombine/and-compare.ll
+++ b/test/Transforms/InstCombine/and-compare.ll
@@ -1,11 +1,15 @@
 ; RUN: opt < %s -instcombine -S | \
-; RUN:    grep and | count 1
+; RUN: FileCheck %s
 
 ; Should be optimized to one and.
 define i1 @test1(i32 %a, i32 %b) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: %1 = xor i32 %a, %b
+; CHECK-NEXT: %2 = and i32 %1, 65280
+; CHECK-NEXT: %tmp = icmp ne i32 %2, 0
+; CHECK-NEXT: ret i1 %tmp
         %tmp1 = and i32 %a, 65280               ; <i32> [#uses=1]
         %tmp3 = and i32 %b, 65280               ; <i32> [#uses=1]
         %tmp = icmp ne i32 %tmp1, %tmp3         ; <i1> [#uses=1]
         ret i1 %tmp
 }
-
diff --git a/test/Transforms/InstCombine/and-xor-merge.ll b/test/Transforms/InstCombine/and-xor-merge.ll
index e432a9a..b9a6a53 100644
--- a/test/Transforms/InstCombine/and-xor-merge.ll
+++ b/test/Transforms/InstCombine/and-xor-merge.ll
@@ -1,8 +1,11 @@
-; RUN: opt < %s -instcombine -S | grep and | count 1
-; RUN: opt < %s -instcombine -S | grep xor | count 2
+; RUN: opt < %s -instcombine -S | FileCheck %s
 
 ; (x&z) ^ (y&z) -> (x^y)&z
 define i32 @test1(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: %tmp61 = xor i32 %x, %y
+; CHECK-NEXT: %tmp7 = and i32 %tmp61, %z
+; CHECK-NEXT: ret i32 %tmp7
         %tmp3 = and i32 %z, %x
         %tmp6 = and i32 %z, %y
         %tmp7 = xor i32 %tmp3, %tmp6
@@ -11,9 +14,11 @@ define i32 @test1(i32 %x, i32 %y, i32 %z) {
 
 ; (x & y) ^ (x|y) -> x^y
 define i32 @test2(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: %tmp7 = xor i32 %y, %x
+; CHECK-NEXT: ret i32 %tmp7
         %tmp3 = and i32 %y, %x
         %tmp6 = or i32 %y, %x
         %tmp7 = xor i32 %tmp3, %tmp6
         ret i32 %tmp7
 }
-
diff --git a/test/Transforms/InstCombine/apint-call-cast-target.ll b/test/Transforms/InstCombine/apint-call-cast-target.ll
index 4e98f9b..f3a66c3 100644
--- a/test/Transforms/InstCombine/apint-call-cast-target.ll
+++ b/test/Transforms/InstCombine/apint-call-cast-target.ll
@@ -5,15 +5,18 @@ target triple = "i686-pc-linux-gnu"
 
 define i32 @main() {
 ; CHECK-LABEL: @main(
-; CHECK: call i32 bitcast
+; CHECK: %[[call:.*]] = call i7* @ctime(i999* null)
+; CHECK: %[[cast:.*]] = ptrtoint i7* %[[call]] to i32
+; CHECK: ret i32 %[[cast]]
 entry:
 	%tmp = call i32 bitcast (i7* (i999*)* @ctime to i32 (i99*)*)( i99* null )
 	ret i32 %tmp
 }
 
 define i7* @ctime(i999*) {
-; CHECK-LABEL: @ctime(
-; CHECK: call i7* bitcast
+; CHECK-LABEL: define i7* @ctime(
+; CHECK: %[[call:.*]] = call i32 @main()
+; CHECK: %[[cast:.*]] = inttoptr i32 %[[call]] to i7*
 entry:
 	%tmp = call i7* bitcast (i32 ()* @main to i7* ()*)( )
 	ret i7* %tmp
diff --git a/test/Transforms/InstCombine/bitcast-alias-function.ll b/test/Transforms/InstCombine/bitcast-alias-function.ll
index bc36b25..cfec092 100644
--- a/test/Transforms/InstCombine/bitcast-alias-function.ll
+++ b/test/Transforms/InstCombine/bitcast-alias-function.ll
@@ -94,7 +94,8 @@ entry:
 ; CHECK: load i32*
 ; CHECK-NOT: fptoui
 ; CHECK-NOT: uitofp
-; CHECK: bitcast i32 %call to float
+; CHECK: bitcast float* %dest to i32*
+; CHECK: store i32
   %tmp = load float* %source, align 8
   %call = call float @alias_i32_to_f32(float %tmp) nounwind
   store float %call, float* %dest, align 8
@@ -109,7 +110,8 @@ entry:
 ; CHECK: load <2 x i32>*
 ; CHECK-NOT: fptoui
 ; CHECK-NOT: uitofp
-; CHECK: bitcast <2 x i32> %call to <2 x float>
+; CHECK: bitcast <2 x float>* %dest to <2 x i32>*
+; CHECK: store <2 x i32>
   %tmp = load <2 x float>* %source, align 8
   %call = call <2 x float> @alias_v2i32_to_v2f32(<2 x float> %tmp) nounwind
   store <2 x float> %call, <2 x float>* %dest, align 8
@@ -123,7 +125,8 @@ entry:
 ; CHECK: bitcast <2 x float>* %source to i64*
 ; CHECK: load i64*
 ; CHECK: %call = call i64 @func_i64
-; CHECK: bitcast i64 %call to <2 x float>
+; CHECK: bitcast <2 x float>* %dest to i64*
+; CHECK: store i64
   %tmp = load <2 x float>* %source, align 8
   %call = call <2 x float> @alias_v2f32_to_i64(<2 x float> %tmp) nounwind
   store <2 x float> %call, <2 x float>* %dest, align 8
@@ -136,7 +139,8 @@ entry:
 ; CHECK: bitcast i64* %source to <2 x float>*
 ; CHECK: load <2 x float>*
 ; CHECK: call <2 x float> @func_v2f32
-; CHECK: bitcast <2 x float> %call to i64
+; CHECK: bitcast i64* %dest to <2 x float>*
+; CHECK: store <2 x float>
   %tmp = load i64* %source, align 8
   %call = call i64 @alias_i64_to_v2f32(i64 %tmp) nounwind
   store i64 %call, i64* %dest, align 8
@@ -149,7 +153,8 @@ entry:
 ; CHECK: bitcast <2 x i64*>* %source to <2 x i32*>*
 ; CHECK: load <2 x i32*>*
 ; CHECK: call <2 x i32*> @func_v2i32p
-; CHECK: bitcast <2 x i32*> %call to <2 x i64*>
+; CHECK: bitcast <2 x i64*>* %dest to <2 x i32*>*
+; CHECK: store <2 x i32*>
   %tmp = load <2 x i64*>* %source, align 8
   %call = call <2 x i64*> @alias_v2i32p_to_v2i64p(<2 x i64*> %tmp) nounwind
   store <2 x i64*> %call, <2 x i64*>* %dest, align 8
diff --git a/test/Transforms/InstCombine/bitcast-store.ll b/test/Transforms/InstCombine/bitcast-store.ll
index e46b5c8..ea4d680 100644
--- a/test/Transforms/InstCombine/bitcast-store.ll
+++ b/test/Transforms/InstCombine/bitcast-store.ll
@@ -10,11 +10,11 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 @G = external constant [5 x i8*]
 
 ; CHECK-LABEL: @foo
-; CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([5 x i8*]* @G, i64 0, i64 2) to i32 (...)**), i32 (...)*** %0, align 16, !tag !0
-define void @foo(%struct.A* %a) nounwind {
+; CHECK: store i32 %x, i32* %{{.*}}, align 16, !noalias !0
+define void @foo(i32 %x, float* %p) nounwind {
 entry:
-  %0 = bitcast %struct.A* %a to i8***
-  store i8** getelementptr inbounds ([5 x i8*]* @G, i64 0, i64 2), i8*** %0, align 16, !tag !0
+  %x.cast = bitcast i32 %x to float
+  store float %x.cast, float* %p, align 16, !noalias !0
   ret void
 }
 
@@ -32,4 +32,4 @@ entry:
   ret void
 }
 
-!0 = metadata !{metadata !"hello"}
+!0 = !{!0}
diff --git a/test/Transforms/InstCombine/bswap-fold.ll b/test/Transforms/InstCombine/bswap-fold.ll
index 442ce58..63b0775 100644
--- a/test/Transforms/InstCombine/bswap-fold.ll
+++ b/test/Transforms/InstCombine/bswap-fold.ll
@@ -1,63 +1,79 @@
-; RUN: opt < %s -instcombine -S | not grep call.*bswap
+; RUN: opt < %s -instcombine -S | FileCheck %s
 
 define i1 @test1(i16 %tmp2) {
+; CHECK-LABEL: @test1
+; CHECK-NEXT:  %tmp = icmp eq i16 %tmp2, 256
+; CHECK-NEXT:  ret i1 %tmp
         %tmp10 = call i16 @llvm.bswap.i16( i16 %tmp2 )
         %tmp = icmp eq i16 %tmp10, 1
         ret i1 %tmp
 }
 
 define i1 @test2(i32 %tmp) {
+; CHECK-LABEL: @test2
+; CHECK-NEXT:  %tmp.upgrd.1 = icmp eq i32 %tmp, 16777216
+; CHECK-NEXT:  ret i1 %tmp.upgrd.1
         %tmp34 = tail call i32 @llvm.bswap.i32( i32 %tmp )
         %tmp.upgrd.1 = icmp eq i32 %tmp34, 1
         ret i1 %tmp.upgrd.1
 }
 
-declare i32 @llvm.bswap.i32(i32)
-
 define i1 @test3(i64 %tmp) {
+; CHECK-LABEL: @test3
+; CHECK-NEXT:  %tmp.upgrd.2 = icmp eq i64 %tmp, 72057594037927936
+; CHECK-NEXT:  ret i1 %tmp.upgrd.2
         %tmp34 = tail call i64 @llvm.bswap.i64( i64 %tmp )
         %tmp.upgrd.2 = icmp eq i64 %tmp34, 1
         ret i1 %tmp.upgrd.2
 }
 
-declare i64 @llvm.bswap.i64(i64)
-
-declare i16 @llvm.bswap.i16(i16)
-
 ; rdar://5992453
 ; A & 255
 define i32 @test4(i32 %a) nounwind  {
-entry:
-	%tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )	
+; CHECK-LABEL: @test4
+; CHECK-NEXT:  %tmp2 = and i32 %a, 255
+; CHECK-NEXT:  ret i32 %tmp2
+	%tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
 	%tmp4 = lshr i32 %tmp2, 24
 	ret i32 %tmp4
 }
 
 ; A
-define i32 @test5(i32 %a) nounwind  {
-entry:
+define i32 @test5(i32 %a) nounwind {
+; CHECK-LABEL: @test5
+; CHECK-NEXT:  ret i32 %a
 	%tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
 	%tmp4 = tail call i32 @llvm.bswap.i32( i32 %tmp2 )
 	ret i32 %tmp4
 }
 
 ; a >> 24
-define i32 @test6(i32 %a) nounwind  {
-entry:
-	%tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )	
+define i32 @test6(i32 %a) nounwind {
+; CHECK-LABEL: @test6
+; CHECK-NEXT:  %tmp2 = lshr i32 %a, 24
+; CHECK-NEXT   ret i32 %tmp4
+	%tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
 	%tmp4 = and i32 %tmp2, 255
 	ret i32 %tmp4
 }
 
 ; PR5284
 define i16 @test7(i32 %A) {
-  %B = tail call i32 @llvm.bswap.i32(i32 %A) nounwind 
+; CHECK-LABEL: @test7
+; CHECK-NEXT:  %1 = lshr i32 %A, 16
+; CHECK-NEXT:  %D = trunc i32 %1 to i16
+; CHECK-NEXT   ret i16 %D
+  %B = tail call i32 @llvm.bswap.i32(i32 %A) nounwind
   %C = trunc i32 %B to i16
   %D = tail call i16 @llvm.bswap.i16(i16 %C) nounwind
   ret i16 %D
 }
 
 define i16 @test8(i64 %A) {
+; CHECK-LABEL: @test8
+; CHECK-NEXT:  %1 = lshr i64 %A, 48
+; CHECK-NEXT:  %D = trunc i64 %1 to i16
+; CHECK-NEXT   ret i16 %D
   %B = tail call i64 @llvm.bswap.i64(i64 %A) nounwind 
   %C = trunc i64 %B to i16
   %D = tail call i16 @llvm.bswap.i16(i16 %C) nounwind
@@ -66,6 +82,144 @@ define i16 @test8(i64 %A) {
 
 ; Misc: Fold bswap(undef) to undef.
 define i64 @foo() {
+; CHECK-LABEL: @foo
+; CHECK-NEXT: ret i64 undef
   %a = call i64 @llvm.bswap.i64(i64 undef)
   ret i64 %a
 }
+
+; PR15782
+; Fold: OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) )
+; Fold: OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) )
+define i16 @bs_and16i(i16 %a, i16 %b) #0 {
+; CHECK-LABEL: @bs_and16i
+; CHECK-NEXT:  %1 = and i16 %a, 4391
+; CHECK-NEXT:  %2 = call i16 @llvm.bswap.i16(i16 %1)
+; CHECK-NEXT:  ret i16 %2
+  %1 = tail call i16 @llvm.bswap.i16(i16 %a)
+  %2 = and i16 %1, 10001
+  ret i16 %2
+}
+
+define i16 @bs_and16(i16 %a, i16 %b) #0 {
+; CHECK-LABEL: @bs_and16
+; CHECK-NEXT:  %1 = and i16 %a, %b
+; CHECK-NEXT:  %2 = call i16 @llvm.bswap.i16(i16 %1)
+; CHECK-NEXT:  ret i16 %2
+  %tmp1 = tail call i16 @llvm.bswap.i16(i16 %a)
+  %tmp2 = tail call i16 @llvm.bswap.i16(i16 %b)
+  %tmp3 = and i16 %tmp1, %tmp2
+  ret i16 %tmp3
+}
+
+define i16 @bs_or16(i16 %a, i16 %b) #0 {
+; CHECK-LABEL: @bs_or16
+; CHECK-NEXT:  %1 = or i16 %a, %b
+; CHECK-NEXT:  %2 = call i16 @llvm.bswap.i16(i16 %1)
+; CHECK-NEXT:  ret i16 %2
+  %tmp1 = tail call i16 @llvm.bswap.i16(i16 %a)
+  %tmp2 = tail call i16 @llvm.bswap.i16(i16 %b)
+  %tmp3 = or i16 %tmp1, %tmp2
+  ret i16 %tmp3
+}
+
+define i16 @bs_xor16(i16 %a, i16 %b) #0 {
+; CHECK-LABEL: @bs_xor16
+; CHECK-NEXT:  %1 = xor i16 %a, %b
+; CHECK-NEXT:  %2 = call i16 @llvm.bswap.i16(i16 %1)
+; CHECK-NEXT:  ret i16 %2
+  %tmp1 = tail call i16 @llvm.bswap.i16(i16 %a)
+  %tmp2 = tail call i16 @llvm.bswap.i16(i16 %b)
+  %tmp3 = xor i16 %tmp1, %tmp2
+  ret i16 %tmp3
+}
+
+define i32 @bs_and32i(i32 %a, i32 %b) #0 {
+; CHECK-LABEL: @bs_and32i
+; CHECK-NEXT:  %1 = and i32 %a, -1585053440
+; CHECK-NEXT:  %2 = call i32 @llvm.bswap.i32(i32 %1)
+; CHECK-NEXT:  ret i32 %2
+  %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a)
+  %tmp2 = and i32 %tmp1, 100001
+  ret i32 %tmp2
+}
+
+define i32 @bs_and32(i32 %a, i32 %b) #0 {
+; CHECK-LABEL: @bs_and32
+; CHECK-NEXT:  %1 = and i32 %a, %b
+; CHECK-NEXT:  %2 = call i32 @llvm.bswap.i32(i32 %1)
+; CHECK-NEXT:  ret i32 %2
+  %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a)
+  %tmp2 = tail call i32 @llvm.bswap.i32(i32 %b)
+  %tmp3 = and i32 %tmp1, %tmp2
+  ret i32 %tmp3
+}
+
+define i32 @bs_or32(i32 %a, i32 %b) #0 {
+; CHECK-LABEL: @bs_or32
+; CHECK-NEXT:  %1 = or i32 %a, %b
+; CHECK-NEXT:  %2 = call i32 @llvm.bswap.i32(i32 %1)
+; CHECK-NEXT:  ret i32 %2
+  %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a)
+  %tmp2 = tail call i32 @llvm.bswap.i32(i32 %b)
+  %tmp3 = or i32 %tmp1, %tmp2
+  ret i32 %tmp3
+}
+
+define i32 @bs_xor32(i32 %a, i32 %b) #0 {
+; CHECK-LABEL: @bs_xor32
+; CHECK-NEXT:  %1 = xor i32 %a, %b
+; CHECK-NEXT:  %2 = call i32 @llvm.bswap.i32(i32 %1)
+; CHECK-NEXT:  ret i32 %2
+  %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a)
+  %tmp2 = tail call i32 @llvm.bswap.i32(i32 %b)
+  %tmp3 = xor i32 %tmp1, %tmp2
+  ret i32 %tmp3
+}
+
+define i64 @bs_and64i(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and64i
+; CHECK-NEXT:  %1 = and i64 %a, 129085117527228416
+; CHECK-NEXT:  %2 = call i64 @llvm.bswap.i64(i64 %1)
+; CHECK-NEXT:  ret i64 %2
+  %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %tmp2 = and i64 %tmp1, 1000000001
+  ret i64 %tmp2
+}
+
+define i64 @bs_and64(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and64
+; CHECK-NEXT:  %1 = and i64 %a, %b
+; CHECK-NEXT:  %2 = call i64 @llvm.bswap.i64(i64 %1)
+; CHECK-NEXT:  ret i64 %2
+  %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %tmp3 = and i64 %tmp1, %tmp2
+  ret i64 %tmp3
+}
+
+define i64 @bs_or64(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_or64
+; CHECK-NEXT:  %1 = or i64 %a, %b
+; CHECK-NEXT:  %2 = call i64 @llvm.bswap.i64(i64 %1)
+; CHECK-NEXT:  ret i64 %2
+  %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %tmp3 = or i64 %tmp1, %tmp2
+  ret i64 %tmp3
+}
+
+define i64 @bs_xor64(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_xor64
+; CHECK-NEXT:  %1 = xor i64 %a, %b
+; CHECK-NEXT:  %2 = call i64 @llvm.bswap.i64(i64 %1)
+; CHECK-NEXT:  ret i64 %2
+  %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %tmp3 = xor i64 %tmp1, %tmp2
+  ret i64 %tmp3
+}
+
+declare i16 @llvm.bswap.i16(i16)
+declare i32 @llvm.bswap.i32(i32)
+declare i64 @llvm.bswap.i64(i64)
diff --git a/test/Transforms/InstCombine/call-cast-target.ll b/test/Transforms/InstCombine/call-cast-target.ll
index 1af3317..4a5c949 100644
--- a/test/Transforms/InstCombine/call-cast-target.ll
+++ b/test/Transforms/InstCombine/call-cast-target.ll
@@ -5,7 +5,9 @@ target triple = "i686-pc-linux-gnu"
 
 define i32 @main() {
 ; CHECK-LABEL: @main
-; CHECK: call i32 bitcast
+; CHECK: %[[call:.*]] = call i8* @ctime(i32* null)
+; CHECK: %[[cast:.*]] = ptrtoint i8* %[[call]] to i32
+; CHECK: ret i32 %[[cast]]
 entry:
   %tmp = call i32 bitcast (i8* (i32*)* @ctime to i32 (i32*)*)( i32* null )          ; <i32> [#uses=1]
   ret i32 %tmp
@@ -25,3 +27,48 @@ entry:
   %0 = call { i8 } bitcast ({ i8 } (i32*)* @foo to { i8 } (i16*)*)(i16* null)
   ret void
 }
+
+declare i32 @fn1(i32)
+
+define i32 @test1(i32* %a) {
+; CHECK-LABEL: @test1
+; CHECK:      %[[cast:.*]] = ptrtoint i32* %a to i32
+; CHECK-NEXT: %[[call:.*]] = tail call i32 @fn1(i32 %[[cast]])
+; CHECK-NEXT: ret i32 %[[call]]
+entry:
+  %call = tail call i32 bitcast (i32 (i32)* @fn1 to i32 (i32*)*)(i32* %a)
+  ret i32 %call
+}
+
+declare i32 @fn2(i16)
+
+define i32 @test2(i32* %a) {
+; CHECK-LABEL: @test2
+; CHECK:      %[[call:.*]] = tail call i32 bitcast (i32 (i16)* @fn2 to i32 (i32*)*)(i32* %a)
+; CHECK-NEXT: ret i32 %[[call]]
+entry:
+  %call = tail call i32 bitcast (i32 (i16)* @fn2 to i32 (i32*)*)(i32* %a)
+  ret i32 %call
+}
+
+declare i32 @fn3(i64)
+
+define i32 @test3(i32* %a) {
+; CHECK-LABEL: @test3
+; CHECK:      %[[call:.*]] = tail call i32 bitcast (i32 (i64)* @fn3 to i32 (i32*)*)(i32* %a)
+; CHECK-NEXT: ret i32 %[[call]]
+entry:
+  %call = tail call i32 bitcast (i32 (i64)* @fn3 to i32 (i32*)*)(i32* %a)
+  ret i32 %call
+}
+
+declare i32 @fn4(i32) "thunk"
+
+define i32 @test4(i32* %a) {
+; CHECK-LABEL: @test4
+; CHECK:      %[[call:.*]] = tail call i32 bitcast (i32 (i32)* @fn4 to i32 (i32*)*)(i32* %a)
+; CHECK-NEXT: ret i32 %[[call]]
+entry:
+  %call = tail call i32 bitcast (i32 (i32)* @fn4 to i32 (i32*)*)(i32* %a)
+  ret i32 %call
+}
diff --git a/test/Transforms/InstCombine/canonicalize_branch.ll b/test/Transforms/InstCombine/canonicalize_branch.ll
index b62b143..29fd51a 100644
--- a/test/Transforms/InstCombine/canonicalize_branch.ll
+++ b/test/Transforms/InstCombine/canonicalize_branch.ll
@@ -57,10 +57,10 @@ F:
         ret i32 123
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 1, i32 2}
-!1 = metadata !{metadata !"branch_weights", i32 3, i32 4}
-!2 = metadata !{metadata !"branch_weights", i32 5, i32 6}
-!3 = metadata !{metadata !"branch_weights", i32 7, i32 8}
+!0 = !{!"branch_weights", i32 1, i32 2}
+!1 = !{!"branch_weights", i32 3, i32 4}
+!2 = !{!"branch_weights", i32 5, i32 6}
+!3 = !{!"branch_weights", i32 7, i32 8}
 ; Base case shouldn't change.
 ; CHECK: !0 = {{.*}} i32 1, i32 2}
 ; Ensure that the branch metadata is reversed to match the reversals above.
diff --git a/test/Transforms/InstCombine/cast-int-fcmp-eq-0.ll b/test/Transforms/InstCombine/cast-int-fcmp-eq-0.ll
new file mode 100644
index 0000000..551d0ef
--- /dev/null
+++ b/test/Transforms/InstCombine/cast-int-fcmp-eq-0.ll
@@ -0,0 +1,454 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_0_uitofp(
+; CHECK-NEXT: icmp eq i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_n0_uitofp(
+; CHECK: uitofp
+; CHECK: fcmp oeq
+define i1 @i32_cast_cmp_oeq_int_n0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp oeq float %f, -0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_0_sitofp(
+; CHECK-NEXT: icmp eq i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_n0_sitofp(
+; CHECK: sitofp
+; CHECK: fcmp oeq
+define i1 @i32_cast_cmp_oeq_int_n0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp oeq float %f, -0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_one_int_0_uitofp(
+; CHECK-NEXT: icmp ne i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_one_int_0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp one float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_one_int_n0_uitofp(
+; CHECK: uitofp
+; CHECK: fcmp one
+define i1 @i32_cast_cmp_one_int_n0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp one float %f, -0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_one_int_0_sitofp(
+; CHECK-NEXT: icmp ne i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_one_int_0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp one float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_one_int_n0_sitofp(
+; CHECK: sitofp
+; CHECK: fcmp one
+define i1 @i32_cast_cmp_one_int_n0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp one float %f, -0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ueq_int_0_uitofp(
+; CHECK-NEXT: icmp eq i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_ueq_int_0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp ueq float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ueq_int_n0_uitofp(
+; CHECK: uitofp
+; CHECK: fcmp ueq
+define i1 @i32_cast_cmp_ueq_int_n0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp ueq float %f, -0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ueq_int_0_sitofp(
+; CHECK-NEXT: icmp eq i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_ueq_int_0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp ueq float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ueq_int_n0_sitofp(
+; CHECK: sitofp
+; CHECK: fcmp ueq
+define i1 @i32_cast_cmp_ueq_int_n0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp ueq float %f, -0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_une_int_0_uitofp(
+; CHECK-NEXT: icmp ne i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_une_int_0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp une float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_une_int_n0_uitofp(
+; CHECK: uitofp
+; CHECK: fcmp une
+define i1 @i32_cast_cmp_une_int_n0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp une float %f, -0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_une_int_0_sitofp(
+; CHECK-NEXT: icmp ne i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_une_int_0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp une float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_une_int_n0_sitofp(
+; CHECK: sitofp
+; CHECK: fcmp une
+define i1 @i32_cast_cmp_une_int_n0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp une float %f, -0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ogt_int_0_uitofp(
+; CHECK: icmp ne i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_ogt_int_0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp ogt float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ogt_int_n0_uitofp(
+; CHECK: uitofp
+; CHECK: fcmp ogt
+define i1 @i32_cast_cmp_ogt_int_n0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp ogt float %f, -0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ogt_int_0_sitofp(
+; CHECK: icmp sgt i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_ogt_int_0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp ogt float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ogt_int_n0_sitofp(
+; CHECK: sitofp
+; CHECK: fcmp ogt
+define i1 @i32_cast_cmp_ogt_int_n0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp ogt float %f, -0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ole_int_0_uitofp(
+; CHECK: icmp eq i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_ole_int_0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp ole float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ole_int_0_sitofp(
+; CHECK: icmp slt i32 %i, 1
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_ole_int_0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp ole float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_olt_int_0_uitofp(
+; CHECK: ret i1 false
+define i1 @i32_cast_cmp_olt_int_0_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp olt float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_olt_int_0_sitofp(
+; CHECK: icmp slt i32 %i, 0
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_olt_int_0_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp olt float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i64_cast_cmp_oeq_int_0_uitofp(
+; CHECK-NEXT: icmp eq i64 %i, 0
+; CHECK-NEXT: ret
+define i1 @i64_cast_cmp_oeq_int_0_uitofp(i64 %i) {
+  %f = uitofp i64 %i to float
+  %cmp = fcmp oeq float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i64_cast_cmp_oeq_int_0_sitofp(
+; CHECK-NEXT: icmp eq i64 %i, 0
+; CHECK-NEXT: ret
+define i1 @i64_cast_cmp_oeq_int_0_sitofp(i64 %i) {
+  %f = sitofp i64 %i to float
+  %cmp = fcmp oeq float %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i64_cast_cmp_oeq_int_0_uitofp_half(
+; CHECK-NEXT: icmp eq i64 %i, 0
+; CHECK-NEXT: ret
+define i1 @i64_cast_cmp_oeq_int_0_uitofp_half(i64 %i) {
+  %f = uitofp i64 %i to half
+  %cmp = fcmp oeq half %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i64_cast_cmp_oeq_int_0_sitofp_half(
+; CHECK-NEXT: icmp eq i64 %i, 0
+; CHECK-NEXT: ret
+define i1 @i64_cast_cmp_oeq_int_0_sitofp_half(i64 %i) {
+  %f = sitofp i64 %i to half
+  %cmp = fcmp oeq half %f, 0.0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_0_uitofp_ppcf128(
+; CHECK: uitofp
+; CHECK: fcmp oeq
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_0_uitofp_ppcf128(i32 %i) {
+  %f = uitofp i32 %i to ppc_fp128
+  %cmp = fcmp oeq ppc_fp128 %f, 0xM00000000000000000000000000000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_i24max_uitofp(
+; CHECK: uitofp
+; CHECK: fcmp oeq
+
+; XCHECK: icmp eq i32 %i, 16777215
+; XCHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_i24max_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0x416FFFFFE0000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_i24max_sitofp(
+; CHECK: sitofp
+; CHECK: fcmp oeq
+
+; XCHECK: icmp eq i32 %i, 16777215
+; XCHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_i24max_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0x416FFFFFE0000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_i24maxp1_uitofp(
+; CHECK: uitofp
+; CHECK: fcmp oeq
+
+; XCHECK: icmp eq i32 %i, 16777216
+; XCHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_i24maxp1_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0x4170000000000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_i24maxp1_sitofp(
+; CHECK: sitofp
+; CHECK: fcmp oeq
+
+; XCHECK: icmp eq i32 %i, 16777216
+; XCHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_i24maxp1_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0x4170000000000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_i32umax_uitofp(
+; CHECK: uitofp
+; CHECK: fcmp oeq
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_i32umax_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0x41F0000000000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_i32umax_sitofp(
+; CHECK: sitofp
+; CHECK: fcmp oeq
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_i32umax_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0x41F0000000000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_i32imin_uitofp(
+; CHECK: uitofp
+; CHECK: fcmp oeq
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_i32imin_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0xC1E0000000000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_i32imin_sitofp(
+; CHECK: sitofp
+; CHECK: fcmp oeq
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_i32imin_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0xC1E0000000000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_i32imax_uitofp(
+; CHECK: uitofp
+; CHECK: fcmp oeq
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_i32imax_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0x41E0000000000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_i32imax_sitofp(
+; CHECK: sitofp
+; CHECK: fcmp oeq
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_i32imax_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0x41E0000000000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_negi32umax_uitofp(
+; CHECK: uitofp
+; CHECK: fcmp oeq
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_negi32umax_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0xC1F0000000000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_int_negi32umax_sitofp(
+; CHECK: sitofp
+; CHECK: fcmp oeq
+; CHECK-NEXT: ret
+define i1 @i32_cast_cmp_oeq_int_negi32umax_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0xC1F0000000000000
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_half_uitofp(
+; CHECK: ret i1 false
+define i1 @i32_cast_cmp_oeq_half_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0.5
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_oeq_half_sitofp(
+; CHECK: ret i1 false
+define i1 @i32_cast_cmp_oeq_half_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp oeq float %f, 0.5
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_one_half_uitofp(
+; CHECK: ret i1 true
+define i1 @i32_cast_cmp_one_half_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp one float %f, 0.5
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_one_half_sitofp(
+; CHECK: ret i1 true
+define i1 @i32_cast_cmp_one_half_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp one float %f, 0.5
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ueq_half_uitofp(
+; CHECK: ret i1 false
+define i1 @i32_cast_cmp_ueq_half_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp ueq float %f, 0.5
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_ueq_half_sitofp(
+; CHECK: ret i1 false
+define i1 @i32_cast_cmp_ueq_half_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp ueq float %f, 0.5
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_une_half_uitofp(
+; CHECK: ret i1 true
+define i1 @i32_cast_cmp_une_half_uitofp(i32 %i) {
+  %f = uitofp i32 %i to float
+  %cmp = fcmp une float %f, 0.5
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @i32_cast_cmp_une_half_sitofp(
+; CHECK: ret i1 true
+define i1 @i32_cast_cmp_une_half_sitofp(i32 %i) {
+  %f = sitofp i32 %i to float
+  %cmp = fcmp une float %f, 0.5
+  ret i1 %cmp
+}
diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index 578b16d..aac7a53 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll
@@ -99,6 +99,26 @@ define void @test11(i32* %P) {
 ; CHECK: ret void
 }
 
+declare i32 @__gxx_personality_v0(...)
+define void @test_invoke_vararg_cast(i32* %a, i32* %b) {
+entry:
+  %0 = bitcast i32* %b to i8*
+  %1 = bitcast i32* %a to i64*
+  invoke void (i32, ...)* @varargs(i32 1, i8* %0, i64* %1)
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  ret void
+
+lpad:                                             ; preds = %entry
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  ret void
+; CHECK-LABEL: test_invoke_vararg_cast
+; CHECK-LABEL: entry:
+; CHECK: invoke void (i32, ...)* @varargs(i32 1, i32* %b, i32* %a)
+}
+
 define i8* @test13(i64 %A) {
         %c = getelementptr [0 x i8]* bitcast ([32832 x i8]* @inbuf to [0 x i8]*), i64 0, i64 %A             ; <i8*> [#uses=1]
         ret i8* %c
@@ -199,15 +219,6 @@ define i1 @test24(i1 %C) {
 ; CHECK: ret i1 true
 }
 
-define void @test25(i32** %P) {
-        %c = bitcast i32** %P to float**                ; <float**> [#uses=1]
-        ;; Fold cast into null
-        store float* null, float** %c
-        ret void
-; CHECK: store i32* null, i32** %P
-; CHECK: ret void
-}
-
 define i32 @test26(float %F) {
         ;; no need to cast from float->double.
         %c = fpext float %F to double           ; <double> [#uses=1]
diff --git a/test/Transforms/InstCombine/cast_ptr.ll b/test/Transforms/InstCombine/cast_ptr.ll
index 23006a8..cc7a2bf 100644
--- a/test/Transforms/InstCombine/cast_ptr.ll
+++ b/test/Transforms/InstCombine/cast_ptr.ll
@@ -3,6 +3,8 @@
 
 target datalayout = "p:32:32-p1:32:32-p2:16:16"
 
+@global = global i8 0
+
 ; This shouldn't convert to getelementptr because the relationship
 ; between the arithmetic and the layout of allocated memory is
 ; entirely unknown.
@@ -47,10 +49,29 @@ define i1 @test2_as2_larger(i8 addrspace(2)* %a, i8 addrspace(2)* %b) {
   ret i1 %r
 }
 
+; These casts should not be folded away.
+; CHECK-LABEL: @test2_diff_as
+; CHECK: icmp sge i32 %i0, %i1
+define i1 @test2_diff_as(i8* %p, i8 addrspace(1)* %q) {
+  %i0 = ptrtoint i8* %p to i32
+  %i1 = ptrtoint i8 addrspace(1)* %q to i32
+  %r0 = icmp sge i32 %i0, %i1
+  ret i1 %r0
+}
+
+; These casts should not be folded away.
+; CHECK-LABEL: @test2_diff_as_global
+; CHECK: icmp sge i32 %i1
+define i1 @test2_diff_as_global(i8 addrspace(1)* %q) {
+  %i0 = ptrtoint i8* @global to i32
+  %i1 = ptrtoint i8 addrspace(1)* %q to i32
+  %r0 = icmp sge i32 %i1, %i0
+  ret i1 %r0
+}
+
 ; These casts should also be folded away.
 ; CHECK-LABEL: @test3(
 ; CHECK: icmp eq i8* %a, @global
-@global = global i8 0
 define i1 @test3(i8* %a) {
         %tmpa = ptrtoint i8* %a to i32
         %r = icmp eq i32 %tmpa, ptrtoint (i8* @global to i32)
diff --git a/test/Transforms/InstCombine/debug-line.ll b/test/Transforms/InstCombine/debug-line.ll
index 309843f..1946576 100644
--- a/test/Transforms/InstCombine/debug-line.ll
+++ b/test/Transforms/InstCombine/debug-line.ll
@@ -15,14 +15,14 @@ declare i32 @printf(i8*, ...)
 !llvm.module.flags = !{!10}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00\004\000\001\000\006\000\000\000", metadata !8, metadata !1, metadata !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang\001\00\000\00\000", metadata !8, metadata !4, metadata !4, metadata !9, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !8, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{i32 5, i32 2, metadata !6, null}
-!6 = metadata !{metadata !"0xb\004\0012\000", metadata !8, metadata !0} ; [ DW_TAG_lexical_block ]
-!7 = metadata !{i32 6, i32 1, metadata !6, null}
-!8 = metadata !{metadata !"m.c", metadata !"/private/tmp"}
-!9 = metadata !{metadata !0}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00foo\00foo\00\004\000\001\000\006\000\000\000", !8, !1, !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !8} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang\001\00\000\00\000", !8, !4, !4, !9, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !8, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!5 = !MDLocation(line: 5, column: 2, scope: !6)
+!6 = !{!"0xb\004\0012\000", !8, !0} ; [ DW_TAG_lexical_block ]
+!7 = !MDLocation(line: 6, column: 1, scope: !6)
+!8 = !{!"m.c", !"/private/tmp"}
+!9 = !{!0}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/InstCombine/debuginfo.ll b/test/Transforms/InstCombine/debuginfo.ll
index a7a491e..ae72f70 100644
--- a/test/Transforms/InstCombine/debuginfo.ll
+++ b/test/Transforms/InstCombine/debuginfo.ll
@@ -14,11 +14,11 @@ entry:
   store i8* %__dest, i8** %__dest.addr, align 8
 ; CHECK-NOT: call void @llvm.dbg.declare
 ; CHECK: call void @llvm.dbg.value
-  call void @llvm.dbg.declare(metadata !{i8** %__dest.addr}, metadata !0, metadata !{}), !dbg !16
+  call void @llvm.dbg.declare(metadata i8** %__dest.addr, metadata !0, metadata !{}), !dbg !16
   store i32 %__val, i32* %__val.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %__val.addr}, metadata !7, metadata !{}), !dbg !18
+  call void @llvm.dbg.declare(metadata i32* %__val.addr, metadata !7, metadata !{}), !dbg !18
   store i64 %__len, i64* %__len.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64* %__len.addr}, metadata !9, metadata !{}), !dbg !20
+  call void @llvm.dbg.declare(metadata i64* %__len.addr, metadata !9, metadata !{}), !dbg !20
   %tmp = load i8** %__dest.addr, align 8, !dbg !21
   %tmp1 = load i32* %__val.addr, align 4, !dbg !21
   %tmp2 = load i64* %__len.addr, align 8, !dbg !21
@@ -31,29 +31,29 @@ entry:
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!30}
 
-!0 = metadata !{metadata !"0x101\00__dest\0016777294\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00foobar\00foobar\00\0079\001\001\000\006\00256\001\0079", metadata !27, metadata !2, metadata !4, null, i8* (i8*, i32, i64)* @foobar, null, null, metadata !25} ; [ DW_TAG_subprogram ] [line 79] [local] [def] [foobar]
-!2 = metadata !{metadata !"0x29", metadata !27} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 127710)\001\00\000\00\000", metadata !28, metadata !29, metadata !29, metadata !24, null, null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !27, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !3, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{metadata !"0x101\00__val\0033554510\000", metadata !1, metadata !2, metadata !8} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !"0x101\00__len\0050331726\000", metadata !1, metadata !2, metadata !10} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{metadata !"0x16\00size_t\0080\000\000\000\000", metadata !27, metadata !3, metadata !11} ; [ DW_TAG_typedef ]
-!11 = metadata !{metadata !"0x16\00__darwin_size_t\0090\000\000\000\000", metadata !27, metadata !3, metadata !12} ; [ DW_TAG_typedef ]
-!12 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, metadata !3} ; [ DW_TAG_base_type ]
-!16 = metadata !{i32 78, i32 28, metadata !1, null}
-!18 = metadata !{i32 78, i32 40, metadata !1, null}
-!20 = metadata !{i32 78, i32 54, metadata !1, null}
-!21 = metadata !{i32 80, i32 3, metadata !22, null}
-!22 = metadata !{metadata !"0xb\0080\003\007", metadata !27, metadata !23} ; [ DW_TAG_lexical_block ]
-!23 = metadata !{metadata !"0xb\0079\001\006", metadata !27, metadata !1} ; [ DW_TAG_lexical_block ]
-!24 = metadata !{metadata !1}
-!25 = metadata !{metadata !0, metadata !7, metadata !9}
-!26 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
-!27 = metadata !{metadata !"string.h", metadata !"Game"}
-!28 = metadata !{metadata !"bits.c", metadata !"Game"}
-!29 = metadata !{i32 0}
-!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00__dest\0016777294\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00foobar\00foobar\00\0079\001\001\000\006\00256\001\0079", !27, !2, !4, null, i8* (i8*, i32, i64)* @foobar, null, null, !25} ; [ DW_TAG_subprogram ] [line 79] [local] [def] [foobar]
+!2 = !{!"0x29", !27} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\0012\00clang version 3.0 (trunk 127710)\001\00\000\00\000", !28, !29, !29, !24, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !27, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6}
+!6 = !{!"0xf\00\000\0064\0064\000\000", null, !3, null} ; [ DW_TAG_pointer_type ]
+!7 = !{!"0x101\00__val\0033554510\000", !1, !2, !8} ; [ DW_TAG_arg_variable ]
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !3} ; [ DW_TAG_base_type ]
+!9 = !{!"0x101\00__len\0050331726\000", !1, !2, !10} ; [ DW_TAG_arg_variable ]
+!10 = !{!"0x16\00size_t\0080\000\000\000\000", !27, !3, !11} ; [ DW_TAG_typedef ]
+!11 = !{!"0x16\00__darwin_size_t\0090\000\000\000\000", !27, !3, !12} ; [ DW_TAG_typedef ]
+!12 = !{!"0x24\00long unsigned int\000\0064\0064\000\000\007", null, !3} ; [ DW_TAG_base_type ]
+!16 = !MDLocation(line: 78, column: 28, scope: !1)
+!18 = !MDLocation(line: 78, column: 40, scope: !1)
+!20 = !MDLocation(line: 78, column: 54, scope: !1)
+!21 = !MDLocation(line: 80, column: 3, scope: !22)
+!22 = !{!"0xb\0080\003\007", !27, !23} ; [ DW_TAG_lexical_block ]
+!23 = !{!"0xb\0079\001\006", !27, !1} ; [ DW_TAG_lexical_block ]
+!24 = !{!1}
+!25 = !{!0, !7, !9}
+!26 = !{!"0x29", !28} ; [ DW_TAG_file_type ]
+!27 = !{!"string.h", !"Game"}
+!28 = !{!"bits.c", !"Game"}
+!29 = !{i32 0}
+!30 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/InstCombine/div.ll b/test/Transforms/InstCombine/div.ll
index 2841043..e0ff07b 100644
--- a/test/Transforms/InstCombine/div.ll
+++ b/test/Transforms/InstCombine/div.ll
@@ -217,7 +217,7 @@ define i32 @test25(i32 %a) {
   %div = sdiv i32 %shl, 2
   ret i32 %div
 ; CHECK-LABEL: @test25(
-; CHECK-NEXT: %div = shl i32 %a, 1
+; CHECK-NEXT: %div = shl nsw i32 %a, 1
 ; CHECK-NEXT: ret i32 %div
 }
 
@@ -226,7 +226,7 @@ define i32 @test26(i32 %a) {
   %div = sdiv i32 %mul, 3
   ret i32 %div
 ; CHECK-LABEL: @test26(
-; CHECK-NEXT: %div = shl i32 %a, 2
+; CHECK-NEXT: %div = shl nsw i32 %a, 2
 ; CHECK-NEXT: ret i32 %div
 }
 
@@ -286,3 +286,42 @@ define i32 @test32(i32 %a, i32 %b) {
 ; CHECK-NEXT: %[[div:.*]] = udiv i32 %a, %[[shr]]
 ; CHECK-NEXT: ret i32
 }
+
+define <2 x i64> @test33(<2 x i64> %x) nounwind {
+  %shr = lshr exact <2 x i64> %x, <i64 5, i64 5>
+  %div = udiv exact <2 x i64> %shr, <i64 6, i64 6>
+  ret <2 x i64> %div
+; CHECK-LABEL: @test33(
+; CHECK-NEXT: udiv exact <2 x i64> %x, <i64 192, i64 192>
+; CHECK-NEXT: ret <2 x i64>
+}
+
+define <2 x i64> @test34(<2 x i64> %x) nounwind {
+  %neg = sub nsw <2 x i64> zeroinitializer, %x
+  %div = sdiv exact <2 x i64> %neg, <i64 3, i64 4>
+  ret <2 x i64> %div
+; CHECK-LABEL: @test34(
+; CHECK-NEXT: sdiv exact <2 x i64> %x, <i64 -3, i64 -4>
+; CHECK-NEXT: ret <2 x i64>
+}
+
+define i32 @test35(i32 %A) {
+  %and = and i32 %A, 2147483647
+  %mul = sdiv exact i32 %and, 2147483647
+  ret i32 %mul
+; CHECK-LABEL: @test35(
+; CHECK-NEXT: %[[and:.*]]  = and i32 %A, 2147483647
+; CHECK-NEXT: %[[udiv:.*]] = udiv exact i32 %[[and]], 2147483647
+; CHECK-NEXT: ret i32 %[[udiv]]
+}
+
+define i32 @test36(i32 %A) {
+  %and = and i32 %A, 2147483647
+  %shl = shl nsw i32 1, %A
+  %mul = sdiv exact i32 %and, %shl
+  ret i32 %mul
+; CHECK-LABEL: @test36(
+; CHECK-NEXT: %[[and:.*]] = and i32 %A, 2147483647
+; CHECK-NEXT: %[[shr:.*]] = lshr exact i32 %[[and]], %A
+; CHECK-NEXT: ret i32 %[[shr]]
+}
diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll
index b0ec895..c6081c3 100644
--- a/test/Transforms/InstCombine/fast-math.ll
+++ b/test/Transforms/InstCombine/fast-math.ll
@@ -93,7 +93,7 @@ define float @fold9(float %f1, float %f2) {
   ret float %t3
 
 ; CHECK-LABEL: @fold9(
-; CHECK: fsub fast float 0.000000e+00, %f2
+; CHECK: fsub fast float -0.000000e+00, %f2
 }
 
 ; Let C3 = C1 + C2. (f1 + C1) + (f2 + C2) => (f1 + f2) + C3 instead of
@@ -322,6 +322,14 @@ define float @fneg1(float %f1, float %f2) {
 ; CHECK: fmul float %f1, %f2
 }
 
+define float @fneg2(float %x) {
+  %sub = fsub nsz float 0.0, %x
+  ret float %sub
+; CHECK-LABEL: @fneg2(
+; CHECK-NEXT: fsub nsz float -0.000000e+00, %x
+; CHECK-NEXT: ret float 
+}
+
 ; =========================================================================
 ;
 ;   Testing-cases about div
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index afc6782..ee39d10 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
+declare double @llvm.fabs.f64(double) nounwind readnone
+
 define i1 @test1(float %x, float %y) nounwind {
   %ext1 = fpext float %x to double
   %ext2 = fpext float %y to double
@@ -81,6 +83,16 @@ define i32 @test9(double %a) nounwind {
 ; CHECK: ret i32 0
 }
 
+define i32 @test9_intrinsic(double %a) nounwind {
+  %call = tail call double @llvm.fabs.f64(double %a) nounwind
+  %cmp = fcmp olt double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: @test9_intrinsic(
+; CHECK-NOT: fabs
+; CHECK: ret i32 0
+}
+
 define i32 @test10(double %a) nounwind {
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp ole double %call, 0.000000e+00
@@ -91,6 +103,16 @@ define i32 @test10(double %a) nounwind {
 ; CHECK: fcmp oeq double %a, 0.000000e+00
 }
 
+define i32 @test10_intrinsic(double %a) nounwind {
+  %call = tail call double @llvm.fabs.f64(double %a) nounwind
+  %cmp = fcmp ole double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: @test10_intrinsic(
+; CHECK-NOT: fabs
+; CHECK: fcmp oeq double %a, 0.000000e+00
+}
+
 define i32 @test11(double %a) nounwind {
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp ogt double %call, 0.000000e+00
@@ -101,6 +123,16 @@ define i32 @test11(double %a) nounwind {
 ; CHECK: fcmp one double %a, 0.000000e+00
 }
 
+define i32 @test11_intrinsic(double %a) nounwind {
+  %call = tail call double @llvm.fabs.f64(double %a) nounwind
+  %cmp = fcmp ogt double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: @test11_intrinsic(
+; CHECK-NOT: fabs
+; CHECK: fcmp one double %a, 0.000000e+00
+}
+
 define i32 @test12(double %a) nounwind {
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp oge double %call, 0.000000e+00
@@ -111,6 +143,16 @@ define i32 @test12(double %a) nounwind {
 ; CHECK: fcmp ord double %a, 0.000000e+00
 }
 
+define i32 @test12_intrinsic(double %a) nounwind {
+  %call = tail call double @llvm.fabs.f64(double %a) nounwind
+  %cmp = fcmp oge double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: @test12_intrinsic(
+; CHECK-NOT: fabs
+; CHECK: fcmp ord double %a, 0.000000e+00
+}
+
 define i32 @test13(double %a) nounwind {
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp une double %call, 0.000000e+00
@@ -121,6 +163,16 @@ define i32 @test13(double %a) nounwind {
 ; CHECK: fcmp une double %a, 0.000000e+00
 }
 
+define i32 @test13_intrinsic(double %a) nounwind {
+  %call = tail call double @llvm.fabs.f64(double %a) nounwind
+  %cmp = fcmp une double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: @test13_intrinsic(
+; CHECK-NOT: fabs
+; CHECK: fcmp une double %a, 0.000000e+00
+}
+
 define i32 @test14(double %a) nounwind {
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp oeq double %call, 0.000000e+00
@@ -131,6 +183,16 @@ define i32 @test14(double %a) nounwind {
 ; CHECK: fcmp oeq double %a, 0.000000e+00
 }
 
+define i32 @test14_intrinsic(double %a) nounwind {
+  %call = tail call double @llvm.fabs.f64(double %a) nounwind
+  %cmp = fcmp oeq double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: @test14_intrinsic(
+; CHECK-NOT: fabs
+; CHECK: fcmp oeq double %a, 0.000000e+00
+}
+
 define i32 @test15(double %a) nounwind {
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp one double %call, 0.000000e+00
@@ -141,6 +203,16 @@ define i32 @test15(double %a) nounwind {
 ; CHECK: fcmp one double %a, 0.000000e+00
 }
 
+define i32 @test15_intrinsic(double %a) nounwind {
+  %call = tail call double @llvm.fabs.f64(double %a) nounwind
+  %cmp = fcmp one double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: @test15_intrinsic(
+; CHECK-NOT: fabs
+; CHECK: fcmp one double %a, 0.000000e+00
+}
+
 define i32 @test16(double %a) nounwind {
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp ueq double %call, 0.000000e+00
@@ -151,6 +223,16 @@ define i32 @test16(double %a) nounwind {
 ; CHECK: fcmp ueq double %a, 0.000000e+00
 }
 
+define i32 @test16_intrinsic(double %a) nounwind {
+  %call = tail call double @llvm.fabs.f64(double %a) nounwind
+  %cmp = fcmp ueq double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: @test16_intrinsic(
+; CHECK-NOT: fabs
+; CHECK: fcmp ueq double %a, 0.000000e+00
+}
+
 ; Don't crash.
 define i32 @test17(double %a, double (double)* %p) nounwind {
   %call = tail call double %p(double %a) nounwind
diff --git a/test/Transforms/InstCombine/float-shrink-compare.ll b/test/Transforms/InstCombine/float-shrink-compare.ll
index e500467..a08f953 100644
--- a/test/Transforms/InstCombine/float-shrink-compare.ll
+++ b/test/Transforms/InstCombine/float-shrink-compare.ll
@@ -222,8 +222,46 @@ define i32 @test18(float %x, float %y, float %z) nounwind uwtable {
 ; CHECK-NEXT: fcmp oeq float %fmaxf, %z
 }
 
+define i32 @test19(float %x, float %y, float %z) nounwind uwtable {
+  %1 = fpext float %x to double
+  %2 = fpext float %y to double
+  %3 = call double @copysign(double %1, double %2) nounwind
+  %4 = fpext float %z to double
+  %5 = fcmp oeq double %3, %4
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+; CHECK-LABEL: @test19(
+; CHECK-NEXT: %copysignf = call float @copysignf(float %x, float %y)
+; CHECK-NEXT: fcmp oeq float %copysignf, %z
+}
+
+define i32 @test20(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %y to double
+  %2 = fpext float %x to double
+  %3 = call double @fmin(double 1.000000e+00, double %2) nounwind
+  %4 = fcmp oeq double %1, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK-LABEL: @test20(
+; CHECK-NEXT: %fminf = call float @fminf(float 1.000000e+00, float %x)
+; CHECK-NEXT: fcmp oeq float %fminf, %y
+}
+
+define i32 @test21(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %y to double
+  %2 = fpext float %x to double
+  %3 = call double @fmin(double 1.300000e+00, double %2) nounwind
+  %4 = fcmp oeq double %1, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; should not be changed to fminf as the constant would loose precision
+; CHECK-LABEL: @test21(
+; CHECK: %3 = call double @fmin(double 1.300000e+00, double %2)
+}
+
 declare double @fabs(double) nounwind readnone
 declare double @ceil(double) nounwind readnone
+declare double @copysign(double, double) nounwind readnone
 declare double @floor(double) nounwind readnone
 declare double @nearbyint(double) nounwind readnone
 declare double @rint(double) nounwind readnone
diff --git a/test/Transforms/InstCombine/fpcast.ll b/test/Transforms/InstCombine/fpcast.ll
index ac03402..8319624 100644
--- a/test/Transforms/InstCombine/fpcast.ll
+++ b/test/Transforms/InstCombine/fpcast.ll
@@ -73,3 +73,15 @@ define float @test7(double %V) {
 ; CHECK-NEXT: %[[trunc:.*]] = fptrunc double %frem to float
 ; CHECK-NEXT: ret float %trunc
 }
+
+define float @test8(float %V) {
+  %fext = fpext float %V to double
+  %frem = frem double %fext, 1.000000e-01
+  %trunc = fptrunc double %frem to float
+  ret float %trunc
+; CHECK-LABEL: @test8
+; CHECK-NEXT: %[[fext:.*]]  = fpext float %V to double
+; CHECK-NEXT: %[[frem:.*]]  = frem double %fext, 1.000000e-01
+; CHECK-NEXT: %[[trunc:.*]] = fptrunc double %frem to float
+; CHECK-NEXT: ret float %trunc
+}
diff --git a/test/Transforms/InstCombine/gc.relocate.ll b/test/Transforms/InstCombine/gc.relocate.ll
new file mode 100644
index 0000000..d10ef5f
--- /dev/null
+++ b/test/Transforms/InstCombine/gc.relocate.ll
@@ -0,0 +1,20 @@
+; RUN: opt < %s -datalayout -instcombine -S | FileCheck %s
+
+; Uses InstCombine with DataLayout to propagate dereferenceable
+; attribute via gc.relocate: if the derived ptr is dereferenceable(N),
+; then the return attribute of gc.relocate is dereferenceable(N).
+
+declare zeroext i1 @return_i1()
+declare i32 @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()*, i32, i32, ...)
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32, i32, i32)
+
+define i32 addrspace(1)* @deref(i32 addrspace(1)* dereferenceable(8) %dparam) {
+; Checks that a dereferenceabler pointer
+; CHECK-LABEL: @deref
+; CHECK: call dereferenceable(8)
+entry:
+    %load = load i32 addrspace(1)* %dparam
+    %tok = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 addrspace(1)* %dparam)
+    %relocate = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %tok, i32 4, i32 4)
+    ret i32 addrspace(1)* %relocate
+}
+\ No newline at end of file
diff --git a/test/Transforms/InstCombine/gep-sext.ll b/test/Transforms/InstCombine/gep-sext.ll
new file mode 100644
index 0000000..3d23dab
--- /dev/null
+++ b/test/Transforms/InstCombine/gep-sext.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-pc-win32"
+
+declare void @use(i32) readonly
+
+; We prefer to canonicalize the machine width gep indices early
+define void @test(i32* %p, i32 %index) {
+; CHECK-LABEL: @test
+; CHECK-NEXT: %1 = sext i32 %index to i64
+; CHECK-NEXT: %addr = getelementptr i32* %p, i64 %1
+  %addr = getelementptr i32* %p, i32 %index
+  %val = load i32* %addr
+  call void @use(i32 %val)
+  ret void
+}
+; If they've already been canonicalized via zext, that's fine
+define void @test2(i32* %p, i32 %index) {
+; CHECK-LABEL: @test2
+; CHECK-NEXT: %i = zext i32 %index to i64
+; CHECK-NEXT: %addr = getelementptr i32* %p, i64 %i
+  %i = zext i32 %index to i64
+  %addr = getelementptr i32* %p, i64 %i
+  %val = load i32* %addr
+  call void @use(i32 %val)
+  ret void
+}
+; If we can use a zext, we prefer that.  This requires
+; knowing that the index is positive.
+define void @test3(i32* %p, i32 %index) {
+; CHECK-LABEL: @test3
+; CHECK:   zext
+; CHECK-NOT: sext
+  %addr_begin = getelementptr i32* %p, i64 40
+  %addr_fixed = getelementptr i32* %addr_begin, i64 48
+  %val_fixed = load i32* %addr_fixed, !range !0
+  %addr = getelementptr i32* %addr_begin, i32 %val_fixed
+  %val = load i32* %addr
+  call void @use(i32 %val)
+  ret void
+}
+; Replace sext with zext where possible
+define void @test4(i32* %p, i32 %index) {
+; CHECK-LABEL: @test4
+; CHECK:   zext
+; CHECK-NOT: sext
+  %addr_begin = getelementptr i32* %p, i64 40
+  %addr_fixed = getelementptr i32* %addr_begin, i64 48
+  %val_fixed = load i32* %addr_fixed, !range !0
+  %i = sext i32 %val_fixed to i64
+  %addr = getelementptr i32* %addr_begin, i64 %i
+  %val = load i32* %addr
+  call void @use(i32 %val)
+  ret void
+}
+
+;;  !range !0
+!0 = !{i32 0, i32 2147483647}
+
+
+
diff --git a/test/Transforms/InstCombine/gepphigep.ll b/test/Transforms/InstCombine/gepphigep.ll
index 9aab609..86295e4 100644
--- a/test/Transforms/InstCombine/gepphigep.ll
+++ b/test/Transforms/InstCombine/gepphigep.ll
@@ -2,6 +2,8 @@
 
 %struct1 = type { %struct2*, i32, i32, i32 }
 %struct2 = type { i32, i32 }
+%struct3 = type { i32, %struct4, %struct4 }
+%struct4 = type { %struct2, %struct2 }
 
 define i32 @test1(%struct1* %dm, i1 %tmp4, i64 %tmp9, i64 %tmp19) {
 bb:
@@ -54,3 +56,45 @@ bb:
 ; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp19, i32 0
 ; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp9, i32 1
 }
+
+; Check that instcombine doesn't insert GEPs before landingpad.
+
+define i32 @test3(%struct3* %dm, i1 %tmp4, i64 %tmp9, i64 %tmp19, i64 %tmp20, i64 %tmp21) {
+bb:
+  %tmp = getelementptr inbounds %struct3* %dm, i64 0
+  br i1 %tmp4, label %bb1, label %bb2
+
+bb1:
+  %tmp1 = getelementptr inbounds %struct3* %tmp, i64 %tmp19, i32 1
+  %tmp11 = getelementptr inbounds %struct4* %tmp1, i64 0, i32 0, i32 0
+  store i32 0, i32* %tmp11, align 4
+  br label %bb3
+
+bb2:
+  %tmp2 = getelementptr inbounds %struct3* %tmp, i64 %tmp20, i32 1
+  %tmp12 = getelementptr inbounds %struct4* %tmp2, i64 0, i32 0, i32 1
+  store i32 0, i32* %tmp12, align 4
+  br label %bb3
+
+bb3:
+  %phi = phi %struct4* [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
+  %tmp22 = invoke i32 @foo1(i32 11) to label %bb4 unwind label %bb5
+
+bb4:
+  ret i32 0
+
+bb5:
+  %tmp27 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) catch i8* bitcast (i8** @_ZTIi to i8*)
+  %tmp34 = getelementptr inbounds %struct4* %phi, i64 %tmp21, i32 1
+  %tmp35 = getelementptr inbounds %struct2* %tmp34, i64 0, i32 1
+  %tmp25 = load i32* %tmp35, align 4
+  ret i32 %tmp25
+
+; CHECK-LABEL: @test3(
+; CHECK: bb5:
+; CHECK-NEXT: {{.*}}landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+}
+
+@_ZTIi = external constant i8*
+declare i32 @__gxx_personality_v0(...)
+declare i32 @foo1(i32)
diff --git a/test/Transforms/InstCombine/getelementptr.ll b/test/Transforms/InstCombine/getelementptr.ll
index bb46662..94cc180 100644
--- a/test/Transforms/InstCombine/getelementptr.ll
+++ b/test/Transforms/InstCombine/getelementptr.ll
@@ -602,8 +602,8 @@ entry:
 	%C = load i8** %B, align 8
 	ret i8* %C
 ; CHECK-LABEL: @test34(
-; CHECK: %V.c = inttoptr i64 %V to i8*
-; CHECK: ret i8* %V.c
+; CHECK: %[[C:.*]] = inttoptr i64 %V to i8*
+; CHECK: ret i8* %[[C]]
 }
 
 %t0 = type { i8*, [19 x i8] }
diff --git a/test/Transforms/InstCombine/icmp-range.ll b/test/Transforms/InstCombine/icmp-range.ll
index 97d231f..0911ab0 100644
--- a/test/Transforms/InstCombine/icmp-range.ll
+++ b/test/Transforms/InstCombine/icmp-range.ll
@@ -55,7 +55,7 @@ define i1 @test_nonzero6(i8* %argw) {
 }
 
 
-!0 = metadata !{i32 1, i32 6} 
-!1 = metadata !{i32 0, i32 6} 
-!2 = metadata !{i8 0, i8 1} 
-!3 = metadata !{i8 0, i8 6} 
+!0 = !{i32 1, i32 6} 
+!1 = !{i32 0, i32 6} 
+!2 = !{i8 0, i8 1} 
+!3 = !{i8 0, i8 6} 
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 279d86d..64741c5 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -1522,3 +1522,54 @@ define zeroext i1 @icmp_cmpxchg_strong(i32* %sc, i32 %old_val, i32 %new_val) {
   %icmp = icmp eq i32 %xtrc, %old_val
   ret i1 %icmp
 }
+
+; CHECK-LABEL: @f1
+; CHECK-NEXT: %[[cmp:.*]] = icmp sge i64 %a, %b
+; CHECK-NEXT: ret i1 %[[cmp]]
+define i1 @f1(i64 %a, i64 %b) {
+  %t = sub nsw i64 %a, %b
+  %v = icmp sge i64 %t, 0
+  ret i1 %v
+}
+
+; CHECK-LABEL: @f2
+; CHECK-NEXT: %[[cmp:.*]] = icmp sgt i64 %a, %b
+; CHECK-NEXT: ret i1 %[[cmp]]
+define i1 @f2(i64 %a, i64 %b) {
+  %t = sub nsw i64 %a, %b
+  %v = icmp sgt i64 %t, 0
+  ret i1 %v
+}
+
+; CHECK-LABEL: @f3
+; CHECK-NEXT: %[[cmp:.*]] = icmp slt i64 %a, %b
+; CHECK-NEXT: ret i1 %[[cmp]]
+define i1 @f3(i64 %a, i64 %b) {
+  %t = sub nsw i64 %a, %b
+  %v = icmp slt i64 %t, 0
+  ret i1 %v
+}
+
+; CHECK-LABEL: @f4
+; CHECK-NEXT: %[[cmp:.*]] = icmp sle i64 %a, %b
+; CHECK-NEXT: ret i1 %[[cmp]]
+define i1 @f4(i64 %a, i64 %b) {
+  %t = sub nsw i64 %a, %b
+  %v = icmp sle i64 %t, 0
+  ret i1 %v
+}
+
+; CHECK-LABEL: @f5
+; CHECK: %[[cmp:.*]] = icmp slt i32 %[[sub:.*]], 0
+; CHECK: %[[neg:.*]] = sub nsw i32 0, %[[sub]]
+; CHECK: %[[sel:.*]] = select i1 %[[cmp]], i32 %[[neg]], i32 %[[sub]]
+; CHECK: ret i32 %[[sel]]
+define i32 @f5(i8 %a, i8 %b) {
+  %conv = zext i8 %a to i32
+  %conv3 = zext i8 %b to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %cmp4 = icmp slt i32 %sub, 0
+  %sub7 = sub nsw i32 0, %sub
+  %sub7.sub = select i1 %cmp4, i32 %sub7, i32 %sub
+  ret i32 %sub7.sub
+}
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index 9b58d93..2791adf 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -1,10 +1,17 @@
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 
 %overflow.result = type {i8, i1}
+%ov.result.32 = type { i32, i1 }
+
 
-declare %overflow.result @llvm.uadd.with.overflow.i8(i8, i8)
-declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
-declare %overflow.result @llvm.umul.with.overflow.i8(i8, i8)
+declare %overflow.result @llvm.uadd.with.overflow.i8(i8, i8) nounwind readnone
+declare %overflow.result @llvm.umul.with.overflow.i8(i8, i8) nounwind readnone
+declare %ov.result.32 @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+declare %ov.result.32 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+declare %ov.result.32 @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
+declare %ov.result.32 @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
+declare %ov.result.32 @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
+declare %ov.result.32 @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
 declare double @llvm.powi.f64(double, i32) nounwind readonly
 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
@@ -91,17 +98,92 @@ define i8 @uaddtest7(i8 %A, i8 %B) {
 }
 
 ; PR20194
-define { i32, i1 } @saddtest1(i8 %a, i8 %b) {
+define %ov.result.32 @saddtest_nsw(i8 %a, i8 %b) {
   %A = sext i8 %a to i32
   %B = sext i8 %b to i32
-  %x = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %A, i32 %B)
-  ret { i32, i1 } %x
-; CHECK-LABEL: @saddtest1
+  %x = call %ov.result.32 @llvm.sadd.with.overflow.i32(i32 %A, i32 %B)
+  ret %ov.result.32 %x
+; CHECK-LABEL: @saddtest_nsw
 ; CHECK: %x = add nsw i32 %A, %B
-; CHECK-NEXT: %1 = insertvalue { i32, i1 } { i32 undef, i1 false }, i32 %x, 0
-; CHECK-NEXT:  ret { i32, i1 } %1
+; CHECK-NEXT: %1 = insertvalue %ov.result.32 { i32 undef, i1 false }, i32 %x, 0
+; CHECK-NEXT:  ret %ov.result.32 %1
 }
 
+define %ov.result.32 @uaddtest_nuw(i32 %a, i32 %b) {
+  %A = and i32 %a, 2147483647
+  %B = and i32 %b, 2147483647
+  %x = call %ov.result.32 @llvm.uadd.with.overflow.i32(i32 %A, i32 %B)
+  ret %ov.result.32 %x
+; CHECK-LABEL: @uaddtest_nuw
+; CHECK: %x = add nuw i32 %A, %B
+; CHECK-NEXT: %1 = insertvalue %ov.result.32 { i32 undef, i1 false }, i32 %x, 0
+; CHECK-NEXT:  ret %ov.result.32 %1
+}
+
+define %ov.result.32 @ssubtest_nsw(i8 %a, i8 %b) {
+  %A = sext i8 %a to i32
+  %B = sext i8 %b to i32
+  %x = call %ov.result.32 @llvm.ssub.with.overflow.i32(i32 %A, i32 %B)
+  ret %ov.result.32 %x
+; CHECK-LABEL: @ssubtest_nsw
+; CHECK: %x = sub nsw i32 %A, %B
+; CHECK-NEXT: %1 = insertvalue %ov.result.32 { i32 undef, i1 false }, i32 %x, 0
+; CHECK-NEXT:  ret %ov.result.32 %1
+}
+
+define %ov.result.32 @usubtest_nuw(i32 %a, i32 %b) {
+  %A = or i32 %a, 2147483648
+  %B = and i32 %b, 2147483647
+  %x = call %ov.result.32 @llvm.usub.with.overflow.i32(i32 %A, i32 %B)
+  ret %ov.result.32 %x
+; CHECK-LABEL: @usubtest_nuw
+; CHECK: %x = sub nuw i32 %A, %B
+; CHECK-NEXT: %1 = insertvalue %ov.result.32 { i32 undef, i1 false }, i32 %x, 0
+; CHECK-NEXT:  ret %ov.result.32 %1
+}
+
+define %ov.result.32 @smultest1_nsw(i32 %a, i32 %b) {
+  %A = and i32 %a, 4095 ; 0xfff
+  %B = and i32 %b, 524287; 0x7ffff
+  %x = call %ov.result.32 @llvm.smul.with.overflow.i32(i32 %A, i32 %B)
+  ret %ov.result.32 %x
+; CHECK-LABEL: @smultest1_nsw
+; CHECK: %x = mul nuw nsw i32 %A, %B
+; CHECK-NEXT: %1 = insertvalue %ov.result.32 { i32 undef, i1 false }, i32 %x, 0
+; CHECK-NEXT:  ret %ov.result.32 %1
+}
+
+define %ov.result.32 @smultest2_nsw(i32 %a, i32 %b) {
+  %A = ashr i32 %a, 16
+  %B = ashr i32 %b, 16
+  %x = call %ov.result.32 @llvm.smul.with.overflow.i32(i32 %A, i32 %B)
+  ret %ov.result.32 %x
+; CHECK-LABEL: @smultest2_nsw
+; CHECK: %x = mul nsw i32 %A, %B
+; CHECK-NEXT: %1 = insertvalue %ov.result.32 { i32 undef, i1 false }, i32 %x, 0
+; CHECK-NEXT:  ret %ov.result.32 %1
+}
+
+define %ov.result.32 @smultest3_sw(i32 %a, i32 %b) {
+  %A = ashr i32 %a, 16
+  %B = ashr i32 %b, 15
+  %x = call %ov.result.32 @llvm.smul.with.overflow.i32(i32 %A, i32 %B)
+  ret %ov.result.32 %x
+; CHECK-LABEL: @smultest3_sw
+; CHECK: %x = call %ov.result.32 @llvm.smul.with.overflow.i32(i32 %A, i32 %B)
+; CHECK-NEXT:  ret %ov.result.32 %x
+}
+
+define %ov.result.32 @umultest_nuw(i32 %a, i32 %b) {
+  %A = and i32 %a, 65535 ; 0xffff
+  %B = and i32 %b, 65535 ; 0xffff
+  %x = call %ov.result.32 @llvm.umul.with.overflow.i32(i32 %A, i32 %B)
+  ret %ov.result.32 %x
+; CHECK-LABEL: @umultest_nuw
+; CHECK: %x = mul nuw i32 %A, %B
+; CHECK-NEXT: %1 = insertvalue %ov.result.32 { i32 undef, i1 false }, i32 %x, 0
+; CHECK-NEXT:  ret %ov.result.32 %1
+}
 
 define i8 @umultest1(i8 %A, i1* %overflowPtr) {
   %x = call %overflow.result @llvm.umul.with.overflow.i8(i8 0, i8 %A)
@@ -125,9 +207,6 @@ define i8 @umultest2(i8 %A, i1* %overflowPtr) {
 ; CHECK-NEXT: ret i8 %A
 }
 
-%ov.result.32 = type { i32, i1 }
-declare %ov.result.32 @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
-
 define i32 @umultest3(i32 %n) nounwind {
   %shr = lshr i32 %n, 2
   %mul = call %ov.result.32 @llvm.umul.with.overflow.i32(i32 %shr, i32 3)
@@ -152,6 +231,19 @@ define i32 @umultest4(i32 %n) nounwind {
 ; CHECK: umul.with.overflow
 }
 
+define %ov.result.32 @umultest5(i32 %x, i32 %y) nounwind {
+  %or_x = or i32 %x, 2147483648
+  %or_y = or i32 %y, 2147483648
+  %mul = call %ov.result.32 @llvm.umul.with.overflow.i32(i32 %or_x, i32 %or_y)
+  ret %ov.result.32 %mul
+; CHECK-LABEL: @umultest5(
+; CHECK-NEXT: %[[or_x:.*]] = or i32 %x, -2147483648
+; CHECK-NEXT: %[[or_y:.*]] = or i32 %y, -2147483648
+; CHECK-NEXT: %[[mul:.*]] = mul i32 %[[or_x]], %[[or_y]]
+; CHECK-NEXT: %[[ret:.*]] = insertvalue %ov.result.32 { i32 undef, i1 true }, i32 %[[mul]], 0
+; CHECK-NEXT: ret %ov.result.32 %[[ret]]
+}
+
 define void @powi(double %V, double *%P) {
 entry:
   %A = tail call double @llvm.powi.f64(double %V, i32 -1) nounwind
@@ -257,7 +349,8 @@ define i32 @ctlz_select(i32 %Value) nounwind {
   ret i32 %s
 
 ; CHECK-LABEL: @ctlz_select(
-; CHECK: select i1 %tobool, i32 %ctlz, i32 32
+; CHECK-NEXT: call i32 @llvm.ctlz.i32(i32 %Value, i1 false)
+; CHECK-NEXT: ret i32
 }
 
 define i32 @cttz_select(i32 %Value) nounwind {
@@ -267,5 +360,6 @@ define i32 @cttz_select(i32 %Value) nounwind {
   ret i32 %s
 
 ; CHECK-LABEL: @cttz_select(
-; CHECK: select i1 %tobool, i32 %cttz, i32 32
+; CHECK-NEXT: call i32 @llvm.cttz.i32(i32 %Value, i1 false)
+; CHECK-NEXT: ret i32
 }
diff --git a/test/Transforms/InstCombine/load-cmp.ll b/test/Transforms/InstCombine/load-cmp.ll
index 9810026..40673a7 100644
--- a/test/Transforms/InstCombine/load-cmp.ll
+++ b/test/Transforms/InstCombine/load-cmp.ll
@@ -230,7 +230,7 @@ define i1 @test10_struct(i32 %x) {
 ; NODL: getelementptr inbounds %Foo* @GS, i32 %x, i32 0
 
 ; P32-LABEL: @test10_struct(
-; P32: getelementptr inbounds %Foo* @GS, i32 %x, i32 0
+; P32: ret i1 false
   %p = getelementptr inbounds %Foo* @GS, i32 %x, i32 0
   %q = load i32* %p
   %r = icmp eq i32 %q, 9
@@ -256,8 +256,7 @@ define i1 @test10_struct_i16(i16 %x){
 ; NODL: getelementptr inbounds %Foo* @GS, i16 %x, i32 0
 
 ; P32-LABEL: @test10_struct_i16(
-; P32: %1 = sext i16 %x to i32
-; P32: getelementptr inbounds %Foo* @GS, i32 %1, i32 0
+; P32: ret i1 false
   %p = getelementptr inbounds %Foo* @GS, i16 %x, i32 0
   %q = load i32* %p
   %r = icmp eq i32 %q, 0
@@ -271,8 +270,7 @@ define i1 @test10_struct_i64(i64 %x){
 ; NODL: getelementptr inbounds %Foo* @GS, i64 %x, i32 0
 
 ; P32-LABEL: @test10_struct_i64(
-; P32: %1 = trunc i64 %x to i32
-; P32: getelementptr inbounds %Foo* @GS, i32 %1, i32 0
+; P32: ret i1 false
   %p = getelementptr inbounds %Foo* @GS, i64 %x, i32 0
   %q = load i32* %p
   %r = icmp eq i32 %q, 0
diff --git a/test/Transforms/InstCombine/load.ll b/test/Transforms/InstCombine/load.ll
index b4b7558..624083b 100644
--- a/test/Transforms/InstCombine/load.ll
+++ b/test/Transforms/InstCombine/load.ll
@@ -1,8 +1,9 @@
 ; RUN: opt -instcombine -S < %s | FileCheck %s
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
 
 ; This test makes sure that these instructions are properly eliminated.
 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target datalayout = "e-m:e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
 @X = constant i32 42		; <i32*> [#uses=2]
 @X2 = constant i32 47		; <i32*> [#uses=1]
@@ -150,3 +151,53 @@ define i8 @test15(i8 %x, i32 %y) {
   %r = load i8* %g.i8
   ret i8 %r
 }
+
+define void @test16(i8* %x, i8* %a, i8* %b, i8* %c) {
+; Check that we canonicalize loads which are only stored to use integer types
+; when there is a valid integer type.
+; CHECK-LABEL: @test16(
+; CHECK: %[[L1:.*]] = load i32*
+; CHECK-NOT: load
+; CHECK: store i32 %[[L1]], i32*
+; CHECK: store i32 %[[L1]], i32*
+; CHECK-NOT: store
+; CHECK: %[[L1:.*]] = load i32*
+; CHECK-NOT: load
+; CHECK: store i32 %[[L1]], i32*
+; CHECK: store i32 %[[L1]], i32*
+; CHECK-NOT: store
+; CHECK: ret
+
+entry:
+  %x.cast = bitcast i8* %x to float*
+  %a.cast = bitcast i8* %a to float*
+  %b.cast = bitcast i8* %b to float*
+  %c.cast = bitcast i8* %c to i32*
+
+  %x1 = load float* %x.cast
+  store float %x1, float* %a.cast
+  store float %x1, float* %b.cast
+
+  %x2 = load float* %x.cast
+  store float %x2, float* %b.cast
+  %x2.cast = bitcast float %x2 to i32
+  store i32 %x2.cast, i32* %c.cast
+
+  ret void
+}
+
+define void @test17(i8** %x, i8 %y) {
+; Check that in cases similar to @test16 we don't try to rewrite a load when
+; its only use is a store but it is used as the pointer to that store rather
+; than the value.
+;
+; CHECK-LABEL: @test17(
+; CHECK: %[[L:.*]] = load i8**
+; CHECK: store i8 %y, i8* %[[L]]
+
+entry:
+  %x.load = load i8** %x
+  store i8 %y, i8* %x.load
+
+  ret void
+}
diff --git a/test/Transforms/InstCombine/loadstore-metadata.ll b/test/Transforms/InstCombine/loadstore-metadata.ll
index 863edae..be55fa6 100644
--- a/test/Transforms/InstCombine/loadstore-metadata.ll
+++ b/test/Transforms/InstCombine/loadstore-metadata.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 
+target datalayout = "e-m:e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
 define i32 @test_load_cast_combine_tbaa(float* %ptr) {
 ; Ensure (cast (load (...))) -> (load (cast (...))) preserves TBAA.
 ; CHECK-LABEL: @test_load_cast_combine_tbaa(
@@ -78,9 +80,34 @@ exit:
   ret void
 }
 
-!0 = metadata !{ metadata !1, metadata !1, i64 0 }
-!1 = metadata !{ metadata !1 }
-!2 = metadata !{ metadata !2, metadata !1 }
-!3 = metadata !{ }
-!4 = metadata !{ i32 1 }
-!5 = metadata !{ i32 0, i32 42 }
+define void @test_load_cast_combine_nonnull(float** %ptr) {
+; We can't preserve nonnull metadata when converting a load of a pointer to
+; a load of an integer. Instead, we translate it to range metadata.
+; FIXME: We should also transform range metadata back into nonnull metadata.
+; FIXME: This test is very fragile. If any LABEL lines are added after
+; this point, the test will fail, because this test depends on a metadata tuple,
+; which is always emitted at the end of the file. At some point, we should
+; consider an option to the IR printer to emit MD tuples after the function
+; that first uses them--this will allow us to refer to them like this and not
+; have the tests break. For now, this function must always come last in this
+; file, and no LABEL lines are to be added after this point.
+;
+; CHECK-LABEL: @test_load_cast_combine_nonnull(
+; CHECK: %[[V:.*]] = load i64* %{{.*}}, !range ![[MD:[0-9]+]]
+; CHECK-NOT: !nonnull
+; CHECK: store i64 %[[V]], i64*
+entry:
+  %p = load float** %ptr, !nonnull !3
+  %gep = getelementptr float** %ptr, i32 42
+  store float* %p, float** %gep
+  ret void
+}
+
+; This is the metadata tuple that we reference above:
+; CHECK: ![[MD]] = !{i64 1, i64 0}
+!0 = !{ !1, !1, i64 0 }
+!1 = !{ !1 }
+!2 = !{ !2, !1 }
+!3 = !{ }
+!4 = !{ i32 1 }
+!5 = !{ i32 0, i32 42 }
diff --git a/test/Transforms/InstCombine/malloc-free-delete.ll b/test/Transforms/InstCombine/malloc-free-delete.ll
index ed25e4e..765c8c3 100644
--- a/test/Transforms/InstCombine/malloc-free-delete.ll
+++ b/test/Transforms/InstCombine/malloc-free-delete.ll
@@ -146,17 +146,36 @@ lpad.i:                                           ; preds = %entry
 }
 
 declare i8* @_Znwm(i64) nobuiltin
-declare void @_ZdlPvm(i8*, i64) nobuiltin
 declare i8* @_Znwj(i32) nobuiltin
-declare void @_ZdlPvj(i8*, i32) nobuiltin
 declare i8* @_Znam(i64) nobuiltin
-declare void @_ZdaPvm(i8*, i64) nobuiltin
 declare i8* @_Znaj(i32) nobuiltin
-declare void @_ZdaPvj(i8*, i32) nobuiltin
+declare void @_ZdlPv(i8*) nobuiltin
+declare void @_ZdaPv(i8*) nobuiltin
+
+define linkonce void @_ZdlPvm(i8* %p, i64) nobuiltin {
+  call void @_ZdlPv(i8* %p)
+  ret void
+}
+define linkonce void @_ZdlPvj(i8* %p, i32) nobuiltin {
+  call void @_ZdlPv(i8* %p)
+  ret void
+}
+define linkonce void @_ZdaPvm(i8* %p, i64) nobuiltin {
+  call void @_ZdaPv(i8* %p)
+  ret void
+}
+define linkonce void @_ZdaPvj(i8* %p, i32) nobuiltin {
+  call void @_ZdaPv(i8* %p)
+  ret void
+}
 
 ; CHECK-LABEL: @test8(
 define void @test8() {
   ; CHECK-NOT: call
+  %nw = call i8* @_Znwm(i64 32) builtin
+  call void @_ZdlPv(i8* %nw) builtin
+  %na = call i8* @_Znam(i64 32) builtin
+  call void @_ZdaPv(i8* %na) builtin
   %nwm = call i8* @_Znwm(i64 32) builtin
   call void @_ZdlPvm(i8* %nwm, i64 32) builtin
   %nwj = call i8* @_Znwj(i32 32) builtin
diff --git a/test/Transforms/InstCombine/max-of-nots.ll b/test/Transforms/InstCombine/max-of-nots.ll
new file mode 100644
index 0000000..41e3038
--- /dev/null
+++ b/test/Transforms/InstCombine/max-of-nots.ll
@@ -0,0 +1,68 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+define i32 @compute_min_2(i32 %x, i32 %y) {
+; CHECK-LABEL: compute_min_2
+ entry:
+  %not_x = sub i32 -1, %x
+  %not_y = sub i32 -1, %y
+  %cmp = icmp sgt i32 %not_x, %not_y
+  %not_min = select i1 %cmp, i32 %not_x, i32 %not_y
+  %min = sub i32 -1, %not_min
+  ret i32 %min
+
+; CHECK: %0 = icmp slt i32 %x, %y
+; CHECK-NEXT: %1 = select i1 %0, i32 %x, i32 %y
+; CHECK-NEXT: ret i32 %1
+}
+
+define i32 @compute_min_3(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: compute_min_3
+ entry:
+  %not_x = sub i32 -1, %x
+  %not_y = sub i32 -1, %y
+  %not_z = sub i32 -1, %z
+  %cmp_1 = icmp sgt i32 %not_x, %not_y
+  %not_min_1 = select i1 %cmp_1, i32 %not_x, i32 %not_y
+  %cmp_2 = icmp sgt i32 %not_min_1, %not_z
+  %not_min_2 = select i1 %cmp_2, i32 %not_min_1, i32 %not_z
+  %min = sub i32 -1, %not_min_2
+  ret i32 %min
+
+; CHECK: %0 = icmp slt i32 %x, %y
+; CHECK-NEXT: %1 = select i1 %0, i32 %x, i32 %y
+; CHECK-NEXT: %2 = icmp slt i32 %1, %z
+; CHECK-NEXT: %3 = select i1 %2, i32 %1, i32 %z
+; CHECK-NEXT: ret i32 %3
+}
+
+define i32 @compute_min_arithmetic(i32 %x, i32 %y) {
+; CHECK-LABEL: compute_min_arithmetic
+ entry:
+  %not_value = sub i32 3, %x
+  %not_y = sub i32 -1, %y
+  %cmp = icmp sgt i32 %not_value, %not_y
+  %not_min = select i1 %cmp, i32 %not_value, i32 %not_y
+  ret i32 %not_min
+
+; CHECK: %0 = add i32 %x, -4
+; CHECK-NEXT: %1 = icmp slt i32 %0, %y
+; CHECK-NEXT: %2 = select i1 %1, i32 %0, i32 %y
+; CHECK-NEXT: %3 = xor i32 %2, -1
+; CHECK-NEXT: ret i32 %3
+}
+
+declare void @fake_use(i32)
+
+define i32 @compute_min_pessimization(i32 %x, i32 %y) {
+; CHECK-LABEL: compute_min_pessimization
+ entry:
+  %not_value = sub i32 3, %x
+  call void @fake_use(i32 %not_value)
+  %not_y = sub i32 -1, %y
+  %cmp = icmp sgt i32 %not_value, %not_y
+; CHECK: %not_value = sub i32 3, %x
+; CHECK: %cmp = icmp sgt i32 %not_value, %not_y
+  %not_min = select i1 %cmp, i32 %not_value, i32 %not_y
+  %min = sub i32 -1, %not_min
+  ret i32 %min
+}
diff --git a/test/Transforms/InstCombine/mem-gep-zidx.ll b/test/Transforms/InstCombine/mem-gep-zidx.ll
new file mode 100644
index 0000000..9141d99
--- /dev/null
+++ b/test/Transforms/InstCombine/mem-gep-zidx.ll
@@ -0,0 +1,48 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
+@f.b = private unnamed_addr constant [1 x i32] [i32 55], align 4
+
+define signext i32 @test1(i32 signext %x) #0 {
+entry:
+  %idxprom = sext i32 %x to i64
+  %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %idxprom
+  %0 = load i32* %arrayidx, align 4
+  ret i32 %0
+
+; CHECK-LABEL: @test1
+; CHECK: ret i32 12
+}
+
+declare void @foo(i64* %p)
+define void @test2(i32 signext %x, i64 %v) #0 {
+entry:
+  %p = alloca i64
+  %idxprom = sext i32 %x to i64
+  %arrayidx = getelementptr inbounds i64* %p, i64 %idxprom
+  store i64 %v, i64* %arrayidx
+  call void @foo(i64* %p)
+  ret void
+
+; CHECK-LABEL: @test2
+; CHECK: %p = alloca i64
+; CHECK: store i64 %v, i64* %p
+; CHECK: ret void
+}
+
+define signext i32 @test3(i32 signext %x, i1 %y) #0 {
+entry:
+  %idxprom = sext i32 %x to i64
+  %p = select i1 %y, [1 x i32]* @f.a, [1 x i32]* @f.b
+  %arrayidx = getelementptr inbounds [1 x i32]* %p, i64 0, i64 %idxprom
+  %0 = load i32* %arrayidx, align 4
+  ret i32 %0
+
+; CHECK-LABEL: @test3
+; CHECK: getelementptr inbounds [1 x i32]* %p, i64 0, i64 0
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/Transforms/InstCombine/memcpy_chk-1.ll b/test/Transforms/InstCombine/memcpy_chk-1.ll
index 9216ae7..ddaaf82 100644
--- a/test/Transforms/InstCombine/memcpy_chk-1.ll
+++ b/test/Transforms/InstCombine/memcpy_chk-1.ll
@@ -15,46 +15,63 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ; Check cases where dstlen >= len.
 
-define void @test_simplify1() {
+define i8* @test_simplify1() {
 ; CHECK-LABEL: @test_simplify1(
   %dst = bitcast %struct.T1* @t1 to i8*
   %src = bitcast %struct.T2* @t2 to i8*
 
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64
-  call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
-  ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T2* @t2 to i8*), i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T1* @t1 to i8*)
+  %ret = call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
+  ret i8* %ret
 }
 
-define void @test_simplify2() {
+define i8* @test_simplify2() {
 ; CHECK-LABEL: @test_simplify2(
   %dst = bitcast %struct.T1* @t1 to i8*
   %src = bitcast %struct.T3* @t3 to i8*
 
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64
-  call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
-  ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T3* @t3 to i8*), i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T1* @t1 to i8*)
+  %ret = call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
+  ret i8* %ret
 }
 
 ; Check cases where dstlen < len.
 
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
 ; CHECK-LABEL: @test_no_simplify1(
   %dst = bitcast %struct.T3* @t3 to i8*
   %src = bitcast %struct.T1* @t1 to i8*
 
-; CHECK-NEXT: call i8* @__memcpy_chk
-  call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
-  ret void
+; CHECK-NEXT: %ret = call i8* @__memcpy_chk(i8* bitcast (%struct.T3* @t3 to i8*), i8* bitcast (%struct.T1* @t1 to i8*), i64 2848, i64 1824)
+; CHECK-NEXT: ret i8* %ret
+  %ret = call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
+  ret i8* %ret
 }
 
-define void @test_no_simplify2() {
+define i8* @test_no_simplify2() {
 ; CHECK-LABEL: @test_no_simplify2(
   %dst = bitcast %struct.T1* @t1 to i8*
   %src = bitcast %struct.T2* @t2 to i8*
 
-; CHECK-NEXT: call i8* @__memcpy_chk
-  call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1024, i64 0)
-  ret void
+; CHECK-NEXT: %ret = call i8* @__memcpy_chk(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T2* @t2 to i8*), i64 1024, i64 0)
+; CHECK-NEXT: ret i8* %ret
+  %ret = call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1024, i64 0)
+  ret i8* %ret
+}
+
+define i8* @test_simplify_return_indcall(i8* ()* %alloc) {
+; CHECK-LABEL: @test_simplify_return_indcall(
+  %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: %dst = call i8* %alloc()
+  %dst = call i8* %alloc()
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64
+  %ret = call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
+; CHECK-NEXT: ret i8* %dst
+  ret i8* %ret
 }
 
 declare i8* @__memcpy_chk(i8*, i8*, i64, i64)
diff --git a/test/Transforms/InstCombine/memmove_chk-1.ll b/test/Transforms/InstCombine/memmove_chk-1.ll
index 6d93bbb..e4e1f6e 100644
--- a/test/Transforms/InstCombine/memmove_chk-1.ll
+++ b/test/Transforms/InstCombine/memmove_chk-1.ll
@@ -15,46 +15,50 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ; Check cases where dstlen >= len.
 
-define void @test_simplify1() {
+define i8* @test_simplify1() {
 ; CHECK-LABEL: @test_simplify1(
   %dst = bitcast %struct.T1* @t1 to i8*
   %src = bitcast %struct.T2* @t2 to i8*
 
-; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64
-  call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
-  ret void
+; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T2* @t2 to i8*), i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T1* @t1 to i8*)
+  %ret = call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
+  ret i8* %ret
 }
 
-define void @test_simplify2() {
+define i8* @test_simplify2() {
 ; CHECK-LABEL: @test_simplify2(
   %dst = bitcast %struct.T1* @t1 to i8*
   %src = bitcast %struct.T3* @t3 to i8*
 
-; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64
-  call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
-  ret void
+; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T3* @t3 to i8*), i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T1* @t1 to i8*)
+  %ret = call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
+  ret i8* %ret
 }
 
 ; Check cases where dstlen < len.
 
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
 ; CHECK-LABEL: @test_no_simplify1(
   %dst = bitcast %struct.T3* @t3 to i8*
   %src = bitcast %struct.T1* @t1 to i8*
 
-; CHECK-NEXT: call i8* @__memmove_chk
-  call i8* @__memmove_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
-  ret void
+; CHECK-NEXT: %ret = call i8* @__memmove_chk(i8* bitcast (%struct.T3* @t3 to i8*), i8* bitcast (%struct.T1* @t1 to i8*), i64 2848, i64 1824)
+; CHECK-NEXT: ret i8* %ret
+  %ret = call i8* @__memmove_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
+  ret i8* %ret
 }
 
-define void @test_no_simplify2() {
+define i8* @test_no_simplify2() {
 ; CHECK-LABEL: @test_no_simplify2(
   %dst = bitcast %struct.T1* @t1 to i8*
   %src = bitcast %struct.T2* @t2 to i8*
 
-; CHECK-NEXT: call i8* @__memmove_chk
-  call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1024, i64 0)
-  ret void
+; CHECK-NEXT: %ret = call i8* @__memmove_chk(i8* bitcast (%struct.T1* @t1 to i8*), i8* bitcast (%struct.T2* @t2 to i8*), i64 1024, i64 0)
+; CHECK-NEXT: ret i8* %ret
+  %ret = call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1024, i64 0)
+  ret i8* %ret
 }
 
 declare i8* @__memmove_chk(i8*, i8*, i64, i64)
diff --git a/test/Transforms/InstCombine/memset_chk-1.ll b/test/Transforms/InstCombine/memset_chk-1.ll
index 47cc7db..27f7293 100644
--- a/test/Transforms/InstCombine/memset_chk-1.ll
+++ b/test/Transforms/InstCombine/memset_chk-1.ll
@@ -11,51 +11,56 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ; Check cases where dstlen >= len.
 
-define void @test_simplify1() {
+define i8* @test_simplify1() {
 ; CHECK-LABEL: @test_simplify1(
   %dst = bitcast %struct.T* @t to i8*
 
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64
-  call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 1824)
-  ret void
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.T* @t to i8*), i8 0, i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T* @t to i8*)
+  %ret = call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 1824)
+  ret i8* %ret
 }
 
-define void @test_simplify2() {
+define i8* @test_simplify2() {
 ; CHECK-LABEL: @test_simplify2(
   %dst = bitcast %struct.T* @t to i8*
 
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64
-  call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 3648)
-  ret void
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.T* @t to i8*), i8 0, i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T* @t to i8*)
+  %ret = call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 3648)
+  ret i8* %ret
 }
 
-define void @test_simplify3() {
+define i8* @test_simplify3() {
 ; CHECK-LABEL: @test_simplify3(
   %dst = bitcast %struct.T* @t to i8*
 
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64
-  call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 -1)
-  ret void
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.T* @t to i8*), i8 0, i64 1824, i32 4, i1 false)
+; CHECK-NEXT: ret i8* bitcast (%struct.T* @t to i8*)
+  %ret = call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 -1)
+  ret i8* %ret
 }
 
 ; Check cases where dstlen < len.
 
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
 ; CHECK-LABEL: @test_no_simplify1(
   %dst = bitcast %struct.T* @t to i8*
 
-; CHECK-NEXT: call i8* @__memset_chk
-  call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 400)
-  ret void
+; CHECK-NEXT: %ret = call i8* @__memset_chk(i8* bitcast (%struct.T* @t to i8*), i32 0, i64 1824, i64 400)
+; CHECK-NEXT: ret i8* %ret
+  %ret = call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 400)
+  ret i8* %ret
 }
 
-define void @test_no_simplify2() {
+define i8* @test_no_simplify2() {
 ; CHECK-LABEL: @test_no_simplify2(
   %dst = bitcast %struct.T* @t to i8*
 
-; CHECK-NEXT: call i8* @__memset_chk
-  call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 0)
-  ret void
+; CHECK-NEXT: %ret = call i8* @__memset_chk(i8* bitcast (%struct.T* @t to i8*), i32 0, i64 1824, i64 0)
+; CHECK-NEXT: ret i8* %ret
+  %ret = call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 0)
+  ret i8* %ret
 }
 
 declare i8* @__memset_chk(i8*, i32, i64, i64)
diff --git a/test/Transforms/InstCombine/minnum.ll b/test/Transforms/InstCombine/minnum.ll
index 57d6e16..f7494e7 100644
--- a/test/Transforms/InstCombine/minnum.ll
+++ b/test/Transforms/InstCombine/minnum.ll
@@ -7,7 +7,7 @@ declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0
 declare double @llvm.minnum.f64(double, double) #0
 declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0
 
-declare float @llvm.fmax.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
 
 ; CHECK-LABEL: @constant_fold_minnum_f32
 ; CHECK-NEXT: ret float 1.000000e+00
@@ -206,23 +206,23 @@ define float @minnum4(float %x, float %y, float %z, float %w) #0 {
   ret float %c
 }
 
-; CHECK-LABEL: @minnum_x_fmax_x_y
-; CHECK-NEXT: call float @llvm.fmax.f32
+; CHECK-LABEL: @minnum_x_maxnum_x_y
+; CHECK-NEXT: call float @llvm.maxnum.f32
 ; CHECK-NEXT: call float @llvm.minnum.f32
 ; CHECK-NEXT: ret float
-define float @minnum_x_fmax_x_y(float %x, float %y) #0 {
-  %a = call float @llvm.fmax.f32(float %x, float %y) #0
+define float @minnum_x_maxnum_x_y(float %x, float %y) #0 {
+  %a = call float @llvm.maxnum.f32(float %x, float %y) #0
   %b = call float @llvm.minnum.f32(float %x, float %a) #0
   ret float %b
 }
 
-; CHECK-LABEL: @fmax_x_minnum_x_y
+; CHECK-LABEL: @maxnum_x_minnum_x_y
 ; CHECK-NEXT: call float @llvm.minnum.f32
-; CHECK-NEXT: call float @llvm.fmax.f32
+; CHECK-NEXT: call float @llvm.maxnum.f32
 ; CHECK-NEXT: ret float
-define float @fmax_x_minnum_x_y(float %x, float %y) #0 {
+define float @maxnum_x_minnum_x_y(float %x, float %y) #0 {
   %a = call float @llvm.minnum.f32(float %x, float %y) #0
-  %b = call float @llvm.fmax.f32(float %x, float %a) #0
+  %b = call float @llvm.maxnum.f32(float %x, float %a) #0
   ret float %b
 }
 
diff --git a/test/Transforms/InstCombine/mul.ll b/test/Transforms/InstCombine/mul.ll
index d19bedc..4d1e6c7 100644
--- a/test/Transforms/InstCombine/mul.ll
+++ b/test/Transforms/InstCombine/mul.ll
@@ -197,3 +197,94 @@ define <2 x i1> @test21(<2 x i1> %A, <2 x i1> %B) {
         ret <2 x i1> %C
 ; CHECK: %C = and <2 x i1> %A, %B
 }
+
+define i32 @test22(i32 %A) {
+; CHECK-LABEL: @test22(
+        %B = mul nsw i32 %A, -1
+        ret i32 %B
+; CHECK: sub nsw i32 0, %A
+}
+
+define i32 @test23(i32 %A) {
+; CHECK-LABEL: @test23(
+        %B = shl nuw i32 %A, 1
+        %C = mul nuw i32 %B, 3
+        ret i32 %C
+; CHECK: mul nuw i32 %A, 6
+}
+
+define i32 @test24(i32 %A) {
+; CHECK-LABEL: @test24(
+        %B = shl nsw i32 %A, 1
+        %C = mul nsw i32 %B, 3
+        ret i32 %C
+; CHECK: mul nsw i32 %A, 6
+}
+
+define i32 @test25(i32 %A, i32 %B) {
+; CHECK-LABEL: @test25(
+        %C = sub nsw i32 0, %A
+        %D = sub nsw i32 0, %B
+        %E = mul nsw i32 %C, %D
+        ret i32 %E
+; CHECK: mul nsw i32 %A, %B
+}
+
+define i32 @test26(i32 %A, i32 %B) {
+; CHECK-LABEL: @test26(
+        %C = shl nsw i32 1, %B
+        %D = mul nsw i32 %A, %C
+        ret i32 %D
+; CHECK: shl nsw i32 %A, %B
+}
+
+define i32 @test27(i32 %A, i32 %B) {
+; CHECK-LABEL: @test27(
+        %C = shl i32 1, %B
+        %D = mul nuw i32 %A, %C
+        ret i32 %D
+; CHECK: shl nuw i32 %A, %B
+}
+
+define i32 @test28(i32 %A) {
+; CHECK-LABEL: @test28(
+        %B = shl i32 1, %A
+        %C = mul nsw i32 %B, %B
+        ret i32 %C
+; CHECK:      %[[shl1:.*]] = shl i32 1, %A
+; CHECK-NEXT: %[[shl2:.*]] = shl i32 %[[shl1]], %A
+; CHECK-NEXT: ret i32 %[[shl2]]
+}
+
+define i64 @test29(i31 %A, i31 %B) {
+; CHECK-LABEL: @test29(
+        %C = sext i31 %A to i64
+        %D = sext i31 %B to i64
+        %E = mul i64 %C, %D
+        ret i64 %E
+; CHECK:      %[[sext1:.*]] = sext i31 %A to i64
+; CHECK-NEXT: %[[sext2:.*]] = sext i31 %B to i64
+; CHECK-NEXT: %[[mul:.*]] = mul nsw i64 %[[sext1]], %[[sext2]]
+; CHECK-NEXT: ret i64 %[[mul]]
+}
+
+define i64 @test30(i32 %A, i32 %B) {
+; CHECK-LABEL: @test30(
+        %C = zext i32 %A to i64
+        %D = zext i32 %B to i64
+        %E = mul i64 %C, %D
+        ret i64 %E
+; CHECK:      %[[zext1:.*]] = zext i32 %A to i64
+; CHECK-NEXT: %[[zext2:.*]] = zext i32 %B to i64
+; CHECK-NEXT: %[[mul:.*]] = mul nuw i64 %[[zext1]], %[[zext2]]
+; CHECK-NEXT: ret i64 %[[mul]]
+}
+
+@PR22087 = external global i32
+define i32 @test31(i32 %V) {
+; CHECK-LABEL: @test31
+  %mul = mul i32 %V, shl (i32 1, i32 zext (i1 icmp ne (i32* inttoptr (i64 1 to i32*), i32* @PR22087) to i32))
+  ret i32 %mul
+; CHECK:      %[[mul:.*]] = shl i32 %V, zext (i1 icmp ne (i32* inttoptr (i64 1 to i32*), i32* @PR22087) to i32)
+; CHECK-NEXT: ret i32 %[[mul]]
+}
diff --git a/test/Transforms/InstCombine/narrow-switch.ll b/test/Transforms/InstCombine/narrow-switch.ll
index 7646189..f3f19ba 100644
--- a/test/Transforms/InstCombine/narrow-switch.ll
+++ b/test/Transforms/InstCombine/narrow-switch.ll
@@ -91,3 +91,33 @@ return:
   %retval.0 = phi i32 [ 24, %sw.default ], [ 123, %sw.bb2 ], [ 213, %sw.bb1 ], [ 231, %entry ]
   ret i32 %retval.0
 }
+
+; Make sure to avoid assertion crashes and use the type before
+; truncation to generate the sub constant expressions that leads
+; to the recomputed condition.
+;
+; CHECK-LABEL: @trunc64to59
+; CHECK: switch i59
+; CHECK: i59 0, label
+; CHECK: i59 18717182647723699, label
+
+define void @trunc64to59(i64 %a) {
+entry:
+  %tmp0 = and i64 %a, 15
+  %tmp1 = mul i64 %tmp0, -6425668444178048401
+  %tmp2 = add i64 %tmp1, 5170979678563097242
+  %tmp3 = mul i64 %tmp2, 1627972535142754813
+  switch i64 %tmp3, label %sw.default [
+    i64 847514119312061490, label %sw.bb1
+    i64 866231301959785189, label %sw.bb2
+  ]
+
+sw.bb1:
+  br label %sw.default
+
+sw.bb2:
+  br label %sw.default
+
+sw.default:
+  ret void
+}
diff --git a/test/Transforms/InstCombine/not-fcmp.ll b/test/Transforms/InstCombine/not-fcmp.ll
index ad01a6b..9718e0b 100644
--- a/test/Transforms/InstCombine/not-fcmp.ll
+++ b/test/Transforms/InstCombine/not-fcmp.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep "fcmp uge"
+; RUN: opt < %s -instcombine -S | FileCheck %s
 ; PR1570
 
 define i1 @f(float %X, float %Y) {
@@ -6,5 +6,8 @@ entry:
         %tmp3 = fcmp olt float %X, %Y           ; <i1> [#uses=1]
         %toBoolnot5 = xor i1 %tmp3, true                ; <i1> [#uses=1]
         ret i1 %toBoolnot5
+; CHECK-LABEL: @f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %toBoolnot5 = fcmp uge float %X, %Y
+; CHECK-NEXT: ret i1 %toBoolnot5
 }
-
diff --git a/test/Transforms/InstCombine/not.ll b/test/Transforms/InstCombine/not.ll
index 4a8825b..9d59edd 100644
--- a/test/Transforms/InstCombine/not.ll
+++ b/test/Transforms/InstCombine/not.ll
@@ -1,7 +1,8 @@
 ; This test makes sure that these instructions are properly eliminated.
 ;
 
-; RUN: opt < %s -instcombine -S | not grep xor
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK-NOT: xor
 
 define i32 @test1(i32 %A) {
         %B = xor i32 %A, -1             ; <i32> [#uses=1]
@@ -52,3 +53,8 @@ entry:
 	ret i8 %retval67
 }
 
+define <2 x i1> @test7(<2 x i32> %A, <2 x i32> %B) {
+        %cond = icmp sle <2 x i32> %A, %B
+        %Ret = xor <2 x i1> %cond, <i1 true, i1 true>
+        ret <2 x i1> %Ret
+}
diff --git a/test/Transforms/InstCombine/or-xor.ll b/test/Transforms/InstCombine/or-xor.ll
index 670e3e0..546b777 100644
--- a/test/Transforms/InstCombine/or-xor.ll
+++ b/test/Transforms/InstCombine/or-xor.ll
@@ -103,81 +103,75 @@ define i32 @test10(i32 %A, i32 %B) {
 ; CHECK-NEXT: ret i32 -1
 }
 
-define i32 @test11(i32 %A, i32 %B) {
-  %xor1 = xor i32 %B, %A
-  %not = xor i32 %A, -1
-  %xor2 = xor i32 %not, %B
-  %or = or i32 %xor1, %xor2
-  ret i32 %or
-; CHECK-LABEL: @test11(
-; CHECK-NEXT: ret i32 -1
-}
-
 ; (x | y) & ((~x) ^ y) -> (x & y)
-define i32 @test12(i32 %x, i32 %y) {
+define i32 @test11(i32 %x, i32 %y) {
  %or = or i32 %x, %y
  %neg = xor i32 %x, -1
  %xor = xor i32 %neg, %y
  %and = and i32 %or, %xor
  ret i32 %and
-; CHECK-LABEL: @test12(
+; CHECK-LABEL: @test11(
 ; CHECK-NEXT: %and = and i32 %x, %y
 ; CHECK-NEXT: ret i32 %and
 }
 
 ; ((~x) ^ y) & (x | y) -> (x & y)
-define i32 @test13(i32 %x, i32 %y) {
+define i32 @test12(i32 %x, i32 %y) {
  %neg = xor i32 %x, -1
  %xor = xor i32 %neg, %y
  %or = or i32 %x, %y
  %and = and i32 %xor, %or
  ret i32 %and
-; CHECK-LABEL: @test13(
+; CHECK-LABEL: @test12(
 ; CHECK-NEXT: %and = and i32 %x, %y
 ; CHECK-NEXT: ret i32 %and
 }
 
 ; ((x | y) ^ (x ^ y)) -> (x & y)
-define i32 @test15(i32 %x, i32 %y) {
+define i32 @test13(i32 %x, i32 %y) {
   %1 = xor i32 %y, %x
   %2 = or i32 %y, %x
   %3 = xor i32 %2, %1
   ret i32 %3
-; CHECK-LABEL: @test15(
+; CHECK-LABEL: @test13(
 ; CHECK-NEXT: %1 = and i32 %y, %x
 ; CHECK-NEXT: ret i32 %1
 }
 
 ; ((x | ~y) ^ (~x | y)) -> x ^ y
-define i32 @test16(i32 %x, i32 %y) {
+define i32 @test14(i32 %x, i32 %y) {
   %noty = xor i32 %y, -1
   %notx = xor i32 %x, -1
   %or1 = or i32 %x, %noty
   %or2 = or i32 %notx, %y
   %xor = xor i32 %or1, %or2
   ret i32 %xor
-; CHECK-LABEL: @test16(
+; CHECK-LABEL: @test14(
 ; CHECK-NEXT: %xor = xor i32 %x, %y
 ; CHECK-NEXT: ret i32 %xor
 }
 
 ; ((x & ~y) ^ (~x & y)) -> x ^ y
-define i32 @test17(i32 %x, i32 %y) {
+define i32 @test15(i32 %x, i32 %y) {
   %noty = xor i32 %y, -1
   %notx = xor i32 %x, -1
   %and1 = and i32 %x, %noty
   %and2 = and i32 %notx, %y
   %xor = xor i32 %and1, %and2
   ret i32 %xor
-; CHECK-LABEL: @test17(
+; CHECK-LABEL: @test15(
 ; CHECK-NEXT: %xor = xor i32 %x, %y
 ; CHECK-NEXT: ret i32 %xor
 }
 
-define i32 @test18(i32 %a, i32 %b) {
+define i32 @test16(i32 %a, i32 %b) {
   %or = xor i32 %a, %b
   %and1 = and i32 %or, 1
   %and2 = and i32 %b, -2
   %xor = or i32 %and1, %and2
   ret i32 %xor
+; CHECK-LABEL: @test16(
+; CHECK-NEXT: %1 = and i32 %a, 1
+; CHECK-NEXT: %xor = xor i32 %1, %b
+; CHECK-NEXT: ret i32 %xor
 }
diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll
index 23dad21..f604baf 100644
--- a/test/Transforms/InstCombine/or.ll
+++ b/test/Transforms/InstCombine/or.ll
@@ -506,3 +506,13 @@ define i1 @test47(i8 signext %c)  {
 ; CHECK-NEXT:  add i8 %1, -65
 ; CHECK-NEXT:  icmp ult i8 %2, 27
 }
+
+define i1 @test48(i64 %x, i1 %b) {
+  %1 = icmp ult i64 %x, 2305843009213693952
+  %2 = icmp ugt i64 %x, 2305843009213693951
+  %.b = or i1 %2, %b
+  %3 = or i1 %1, %.b
+  ret i1 %3
+; CHECK-LABEL: @test48(
+; CHECK-NEXT:  ret i1 true
+}
diff --git a/test/Transforms/InstCombine/pr12251.ll b/test/Transforms/InstCombine/pr12251.ll
index 74a41eb..8c382bb 100644
--- a/test/Transforms/InstCombine/pr12251.ll
+++ b/test/Transforms/InstCombine/pr12251.ll
@@ -12,4 +12,4 @@ entry:
 ; CHECK-NEXT: %tobool = icmp ne i8 %a, 0
 ; CHECK-NEXT: ret i1 %tobool
 
-!0 = metadata !{i8 0, i8 2}
+!0 = !{i8 0, i8 2}
diff --git a/test/Transforms/InstCombine/pr12338.ll b/test/Transforms/InstCombine/pr12338.ll
index 614387a..7e0bf59 100644
--- a/test/Transforms/InstCombine/pr12338.ll
+++ b/test/Transforms/InstCombine/pr12338.ll
@@ -4,6 +4,7 @@ define void @entry() nounwind {
 entry:
   br label %for.cond
 
+; CHECK: br label %for.cond
 for.cond:
   %local = phi <1 x i32> [ <i32 0>, %entry ], [ %phi2, %cond.end47 ]
   %phi3 = sub <1 x i32> zeroinitializer, %local
@@ -18,7 +19,6 @@ cond.end:
 
 cond.end47:
   %sum = add <1 x i32> %cond, <i32 92>
-; CHECK: sub <1 x i32> <i32 -92>, %cond
   %phi2 = sub <1 x i32> zeroinitializer, %sum
   br label %for.cond
 }
diff --git a/test/Transforms/InstCombine/pr21199.ll b/test/Transforms/InstCombine/pr21199.ll
new file mode 100644
index 0000000..e6599fb
--- /dev/null
+++ b/test/Transforms/InstCombine/pr21199.ll
@@ -0,0 +1,25 @@
+; do not replace a 'select' with 'or' in 'select - cmp - br' sequence
+; RUN: opt -instcombine -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @f(i32)
+
+define void @test(i32 %len) {
+entry:
+  %cmp = icmp ult i32 %len, 8
+  %cond = select i1 %cmp, i32 %len, i32 8
+  %cmp11 = icmp ult i32 0, %cond
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  tail call void @f(i32 %cond)
+  %inc = add i32 %i.02, 1
+  %cmp1 = icmp ult i32 %inc, %cond
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+; CHECK: select
+}
diff --git a/test/Transforms/InstCombine/pr21210.ll b/test/Transforms/InstCombine/pr21210.ll
new file mode 100644
index 0000000..1db8794
--- /dev/null
+++ b/test/Transforms/InstCombine/pr21210.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -instcombine -S | FileCheck %s
+; Checks that the select-icmp optimization is safe in two cases
+declare void @foo(i32)
+declare i32 @bar(i32)
+
+; don't replace 'cond' by 'len' in the home block ('bb') that
+; contains the select
+define void @test1(i32 %len) {
+entry:
+  br label %bb
+
+bb:
+  %cmp = icmp ult i32 %len, 8
+  %cond = select i1 %cmp, i32 %len, i32 8
+  call void @foo(i32 %cond)
+  %cmp11 = icmp eq i32 %cond, 8
+  br i1 %cmp11, label %for.end, label %bb
+
+for.end:
+  ret void
+; CHECK: select
+; CHECK: icmp eq i32 %cond, 8
+}
+
+; don't replace 'cond' by 'len' in a block ('b1') that dominates all uses
+; of the select outside the home block ('bb'), but can be reached from the home
+; block on another path ('bb -> b0 -> b1')
+define void @test2(i32 %len) {
+entry:
+  %0 = call i32 @bar(i32 %len);
+  %cmp = icmp ult i32 %len, 4
+  br i1 %cmp, label %bb, label %b1
+bb:
+  %cond = select i1 %cmp, i32 %len, i32 8
+  %cmp11 = icmp eq i32 %cond, 8
+  br i1 %cmp11, label %b0, label %b1
+
+b0:
+  call void @foo(i32 %len)
+  br label %b1
+
+b1:
+; CHECK: phi i32 [ %cond, %bb ], [ undef, %b0 ], [ %0, %entry ]
+  %1 = phi i32 [ %cond, %bb ], [ undef, %b0 ], [ %0, %entry ]
+  br label %ret
+
+ret:
+  call void @foo(i32 %1)
+  ret void
+}
diff --git a/test/Transforms/InstCombine/pr21651.ll b/test/Transforms/InstCombine/pr21651.ll
new file mode 100644
index 0000000..914785f
--- /dev/null
+++ b/test/Transforms/InstCombine/pr21651.ll
@@ -0,0 +1,20 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define void @PR21651() {
+  switch i2 0, label %out [
+    i2 0, label %out
+    i2 1, label %out
+  ]
+
+out:
+  ret void
+}
+
+; CHECK-LABEL: define void @PR21651(
+; CHECK:   switch i2 0, label %out [
+; CHECK:     i2 0, label %out
+; CHECK:     i2 1, label %out
+; CHECK:   ]
+; CHECK: out:                                              ; preds = %0, %0, %0
+; CHECK:   ret void
+; CHECK: }
diff --git a/test/Transforms/InstCombine/pr21891.ll b/test/Transforms/InstCombine/pr21891.ll
new file mode 100644
index 0000000..8194976
--- /dev/null
+++ b/test/Transforms/InstCombine/pr21891.ll
@@ -0,0 +1,18 @@
+; RUN: opt %s -instcombine
+
+define i32 @f(i32 %theNumber) {
+entry:
+  %cmp = icmp sgt i32 %theNumber, -1
+  call void @llvm.assume(i1 %cmp)
+  br i1 true, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %shl = shl nuw i32 %theNumber, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %phi = phi i32 [ %shl, %if.then ], [ undef, %entry ]
+  ret i32 %phi
+}
+
+declare void @llvm.assume(i1)
diff --git a/test/Transforms/InstCombine/range-check.ll b/test/Transforms/InstCombine/range-check.ll
new file mode 100644
index 0000000..35f11dd
--- /dev/null
+++ b/test/Transforms/InstCombine/range-check.ll
@@ -0,0 +1,159 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Check simplification of
+; (icmp sgt x, -1) & (icmp sgt/sge n, x) --> icmp ugt/uge n, x
+
+; CHECK-LABEL: define i1 @test_and1
+; CHECK: [[R:%[0-9]+]] = icmp ugt i32 %nn, %x
+; CHECK: ret i1 [[R]]
+define i1 @test_and1(i32 %x, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp sge i32 %x, 0
+  %b = icmp slt i32 %x, %nn
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+; CHECK-LABEL: define i1 @test_and2
+; CHECK: [[R:%[0-9]+]] = icmp uge i32 %nn, %x
+; CHECK: ret i1 [[R]]
+define i1 @test_and2(i32 %x, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp sgt i32 %x, -1
+  %b = icmp sle i32 %x, %nn
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+; CHECK-LABEL: define i1 @test_and3
+; CHECK: [[R:%[0-9]+]] = icmp ugt i32 %nn, %x
+; CHECK: ret i1 [[R]]
+define i1 @test_and3(i32 %x, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp sgt i32 %nn, %x
+  %b = icmp sge i32 %x, 0
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+; CHECK-LABEL: define i1 @test_and4
+; CHECK: [[R:%[0-9]+]] = icmp uge i32 %nn, %x
+; CHECK: ret i1 [[R]]
+define i1 @test_and4(i32 %x, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp sge i32 %nn, %x
+  %b = icmp sge i32 %x, 0
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+; CHECK-LABEL: define i1 @test_or1
+; CHECK: [[R:%[0-9]+]] = icmp ule i32 %nn, %x
+; CHECK: ret i1 [[R]]
+define i1 @test_or1(i32 %x, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp slt i32 %x, 0
+  %b = icmp sge i32 %x, %nn
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+; CHECK-LABEL: define i1 @test_or2
+; CHECK: [[R:%[0-9]+]] = icmp ult i32 %nn, %x
+; CHECK: ret i1 [[R]]
+define i1 @test_or2(i32 %x, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp sle i32 %x, -1
+  %b = icmp sgt i32 %x, %nn
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+; CHECK-LABEL: define i1 @test_or3
+; CHECK: [[R:%[0-9]+]] = icmp ule i32 %nn, %x
+; CHECK: ret i1 [[R]]
+define i1 @test_or3(i32 %x, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp sle i32 %nn, %x
+  %b = icmp slt i32 %x, 0
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+; CHECK-LABEL: define i1 @test_or4
+; CHECK: [[R:%[0-9]+]] = icmp ult i32 %nn, %x
+; CHECK: ret i1 [[R]]
+define i1 @test_or4(i32 %x, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp slt i32 %nn, %x
+  %b = icmp slt i32 %x, 0
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+; Negative tests
+
+; CHECK-LABEL: define i1 @negative1
+; CHECK: %a = icmp
+; CHECK: %b = icmp
+; CHECK: %c = and i1 %a, %b
+; CHECK: ret i1 %c
+define i1 @negative1(i32 %x, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp slt i32 %x, %nn
+  %b = icmp sgt i32 %x, 0      ; should be: icmp sge
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+; CHECK-LABEL: define i1 @negative2
+; CHECK: %a = icmp
+; CHECK: %b = icmp
+; CHECK: %c = and i1 %a, %b
+; CHECK: ret i1 %c
+define i1 @negative2(i32 %x, i32 %n) {
+  %a = icmp slt i32 %x, %n     ; n can be negative
+  %b = icmp sge i32 %x, 0
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+; CHECK-LABEL: define i1 @negative3
+; CHECK: %a = icmp
+; CHECK: %b = icmp
+; CHECK: %c = and i1 %a, %b
+; CHECK: ret i1 %c
+define i1 @negative3(i32 %x, i32 %y, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp slt i32 %x, %nn
+  %b = icmp sge i32 %y, 0      ; should compare %x and not %y
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+; CHECK-LABEL: define i1 @negative4
+; CHECK: %a = icmp
+; CHECK: %b = icmp
+; CHECK: %c = and i1 %a, %b
+; CHECK: ret i1 %c
+define i1 @negative4(i32 %x, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp ne i32 %x, %nn     ; should be: icmp slt/sle
+  %b = icmp sge i32 %x, 0
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+; CHECK-LABEL: define i1 @negative5
+; CHECK: %a = icmp
+; CHECK: %b = icmp
+; CHECK: %c = or i1 %a, %b
+; CHECK: ret i1 %c
+define i1 @negative5(i32 %x, i32 %n) {
+  %nn = and i32 %n, 2147483647
+  %a = icmp slt i32 %x, %nn
+  %b = icmp sge i32 %x, 0
+  %c = or i1 %a, %b            ; should be: and
+  ret i1 %c
+}
+
diff --git a/test/Transforms/InstCombine/select-cmp-br.ll b/test/Transforms/InstCombine/select-cmp-br.ll
new file mode 100644
index 0000000..f10d587
--- /dev/null
+++ b/test/Transforms/InstCombine/select-cmp-br.ll
@@ -0,0 +1,155 @@
+; Replace a 'select' with 'or' in 'select - cmp [eq|ne] - br' sequence
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+%C = type <{ %struct.S }>
+%struct.S = type { i64*, i32, i32 }
+
+declare void @bar(%struct.S *) #1
+declare void @foobar()
+
+define void @test1(%C*) {
+entry:
+  %1 = getelementptr inbounds %C* %0, i64 0, i32 0, i32 0
+  %m = load i64** %1, align 8
+  %2 = getelementptr inbounds %C* %0, i64 1, i32 0, i32 0
+  %n = load i64** %2, align 8
+  %3 = getelementptr inbounds i64* %m, i64 9
+  %4 = bitcast i64* %3 to i64 (%C*)**
+  %5 = load i64 (%C*)** %4, align 8
+  %6 = icmp eq i64* %m, %n
+  %7 = select i1 %6, %C* %0, %C* null
+  %8 = icmp eq %C* %7, null
+  br i1 %8, label %12, label %10
+
+; <label>:9                                       ; preds = %10, %12
+  ret void
+
+; <label>:10                                      ; preds = %entry
+  %11 = getelementptr inbounds %C* %7, i64 0, i32 0
+  tail call void @bar(%struct.S* %11)
+  br label %9
+
+; <label>:12                                      ; preds = %entry
+  %13 = tail call i64 %5(%C* %0)
+  br label %9
+; CHECK-LABEL: @test1(
+; CHECK-NOT: select
+; CHECK: or
+; CHECK-NOT: select
+}
+
+define void @test2(%C*) {
+entry:
+  %1 = getelementptr inbounds %C* %0, i64 0, i32 0, i32 0
+  %m = load i64** %1, align 8
+  %2 = getelementptr inbounds %C* %0, i64 1, i32 0, i32 0
+  %n = load i64** %2, align 8
+  %3 = getelementptr inbounds i64* %m, i64 9
+  %4 = bitcast i64* %3 to i64 (%C*)**
+  %5 = load i64 (%C*)** %4, align 8
+  %6 = icmp eq i64* %m, %n
+  %7 = select i1 %6, %C* null, %C* %0
+  %8 = icmp eq %C* %7, null
+  br i1 %8, label %12, label %10
+
+; <label>:9                                       ; preds = %10, %12
+  ret void
+
+; <label>:10                                      ; preds = %entry
+  %11 = getelementptr inbounds %C* %7, i64 0, i32 0
+  tail call void @bar(%struct.S* %11)
+  br label %9
+
+; <label>:12                                      ; preds = %entry
+  %13 = tail call i64 %5(%C* %0)
+  br label %9
+; CHECK-LABEL: @test2(
+; CHECK-NOT: select
+; CHECK: or
+; CHECK-NOT: select
+}
+
+define void @test3(%C*) {
+entry:
+  %1 = getelementptr inbounds %C* %0, i64 0, i32 0, i32 0
+  %m = load i64** %1, align 8
+  %2 = getelementptr inbounds %C* %0, i64 1, i32 0, i32 0
+  %n = load i64** %2, align 8
+  %3 = getelementptr inbounds i64* %m, i64 9
+  %4 = bitcast i64* %3 to i64 (%C*)**
+  %5 = load i64 (%C*)** %4, align 8
+  %6 = icmp eq i64* %m, %n
+  %7 = select i1 %6, %C* %0, %C* null
+  %8 = icmp ne %C* %7, null
+  br i1 %8, label %10, label %12
+
+; <label>:9                                       ; preds = %10, %12
+  ret void
+
+; <label>:10                                      ; preds = %entry
+  %11 = getelementptr inbounds %C* %7, i64 0, i32 0
+  tail call void @bar(%struct.S* %11)
+  br label %9
+
+; <label>:12                                      ; preds = %entry
+  %13 = tail call i64 %5(%C* %0)
+  br label %9
+; CHECK-LABEL: @test3(
+; CHECK-NOT: select
+; CHECK: or
+; CHECK-NOT: select
+}
+
+define void @test4(%C*) {
+entry:
+  %1 = getelementptr inbounds %C* %0, i64 0, i32 0, i32 0
+  %m = load i64** %1, align 8
+  %2 = getelementptr inbounds %C* %0, i64 1, i32 0, i32 0
+  %n = load i64** %2, align 8
+  %3 = getelementptr inbounds i64* %m, i64 9
+  %4 = bitcast i64* %3 to i64 (%C*)**
+  %5 = load i64 (%C*)** %4, align 8
+  %6 = icmp eq i64* %m, %n
+  %7 = select i1 %6, %C* null, %C* %0
+  %8 = icmp ne %C* %7, null
+  br i1 %8, label %10, label %12
+
+; <label>:9                                       ; preds = %10, %12
+  ret void
+
+; <label>:10                                      ; preds = %entry
+  %11 = getelementptr inbounds %C* %7, i64 0, i32 0
+  tail call void @bar(%struct.S* %11)
+  br label %9
+
+; <label>:12                                      ; preds = %entry
+  %13 = tail call i64 %5(%C* %0)
+  br label %9
+; CHECK-LABEL: @test4(
+; CHECK-NOT: select
+; CHECK: or
+; CHECK-NOT: select
+}
+
+define void @test5(%C*, i1) {
+entry:
+  %2 = select i1 %1, %C* null, %C* %0
+  %3 = icmp ne %C* %2, null
+  br i1 %3, label %5, label %7
+
+; <label>:4                                       ; preds = %10, %12
+  ret void
+
+; <label>:5                                      ; preds = %entry
+  %6 = getelementptr inbounds %C* %2, i64 0, i32 0
+  tail call void @bar(%struct.S* %6)
+  br label %4
+
+; <label>:7                                      ; preds = %entry
+  tail call void @foobar()
+  br label %4
+; CHECK-LABEL: @test5(
+; CHECK-NOT: select
+; CHECK: or
+; CHECK-NOT: select
+}
diff --git a/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll b/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
new file mode 100644
index 0000000..894bf6d
--- /dev/null
+++ b/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
@@ -0,0 +1,327 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+; This test is to verify that the instruction combiner is able to fold
+; a cttz/ctlz followed by a icmp + select into a single cttz/ctlz with
+; the 'is_zero_undef' flag cleared.
+
+define i16 @test1(i16 %x) {
+; CHECK-LABEL: @test1(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i16 @llvm.ctlz.i16(i16 %x, i1 false)
+; CHECK-NEXT: ret i16 [[VAR]]
+entry:
+  %0 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
+  %tobool = icmp ne i16 %x, 0
+  %cond = select i1 %tobool, i16 %0, i16 16
+  ret i16 %cond
+}
+
+define i32 @test2(i32 %x) {
+; CHECK-LABEL: @test2(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+; CHECK-NEXT: ret i32 [[VAR]]
+entry:
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  %tobool = icmp ne i32 %x, 0
+  %cond = select i1 %tobool, i32 %0, i32 32
+  ret i32 %cond
+}
+
+define i64 @test3(i64 %x) {
+; CHECK-LABEL: @test3(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
+; CHECK-NEXT: ret i64 [[VAR]]
+entry:
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+  %tobool = icmp ne i64 %x, 0
+  %cond = select i1 %tobool, i64 %0, i64 64
+  ret i64 %cond
+}
+
+define i16 @test4(i16 %x) {
+; CHECK-LABEL: @test4(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i16 @llvm.ctlz.i16(i16 %x, i1 false)
+; CHECK-NEXT: ret i16 [[VAR]]
+entry:
+  %0 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
+  %tobool = icmp eq i16 %x, 0
+  %cond = select i1 %tobool, i16 16, i16 %0
+  ret i16 %cond
+}
+
+define i32 @test5(i32 %x) {
+; CHECK-LABEL: @test5(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+; CHECK-NEXT: ret i32 [[VAR]]
+entry:
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  %tobool = icmp eq i32 %x, 0
+  %cond = select i1 %tobool, i32 32, i32 %0
+  ret i32 %cond
+}
+
+define i64 @test6(i64 %x) {
+; CHECK-LABEL: @test6(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
+; CHECK-NEXT: ret i64 [[VAR]]
+entry:
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+  %tobool = icmp eq i64 %x, 0
+  %cond = select i1 %tobool, i64 64, i64 %0
+  ret i64 %cond
+}
+
+define i16 @test1b(i16 %x) {
+; CHECK-LABEL: @test1b(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
+; CHECK-NEXT: ret i16 [[VAR]]
+entry:
+  %0 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
+  %tobool = icmp ne i16 %x, 0
+  %cond = select i1 %tobool, i16 %0, i16 16
+  ret i16 %cond
+}
+
+define i32 @test2b(i32 %x) {
+; CHECK-LABEL: @test2b(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+; CHECK-NEXT: ret i32 [[VAR]]
+entry:
+  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %tobool = icmp ne i32 %x, 0
+  %cond = select i1 %tobool, i32 %0, i32 32
+  ret i32 %cond
+}
+
+define i64 @test3b(i64 %x) {
+; CHECK-LABEL: @test3b(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
+; CHECK-NEXT: ret i64 [[VAR]]
+entry:
+  %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %tobool = icmp ne i64 %x, 0
+  %cond = select i1 %tobool, i64 %0, i64 64
+  ret i64 %cond
+}
+
+define i16 @test4b(i16 %x) {
+; CHECK-LABEL: @test4b(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
+; CHECK-NEXT: ret i16 [[VAR]]
+entry:
+  %0 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
+  %tobool = icmp eq i16 %x, 0
+  %cond = select i1 %tobool, i16 16, i16 %0
+  ret i16 %cond
+}
+
+define i32 @test5b(i32 %x) {
+; CHECK-LABEL: @test5b(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+; CHECK-NEXT: ret i32 [[VAR]]
+entry:
+  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %tobool = icmp eq i32 %x, 0
+  %cond = select i1 %tobool, i32 32, i32 %0
+  ret i32 %cond
+}
+
+define i64 @test6b(i64 %x) {
+; CHECK-LABEL: @test6b(
+; CHECK: [[VAR:%[a-zA-Z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
+; CHECK-NEXT: ret i64 [[VAR]]
+entry:
+  %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %tobool = icmp eq i64 %x, 0
+  %cond = select i1 %tobool, i64 64, i64 %0
+  ret i64 %cond
+}
+
+define i32 @test1c(i16 %x) {
+; CHECK-LABEL: @test1c(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i16 [[VAR1]] to i32
+; CHECK-NEXT: ret i32 [[VAR2]]
+entry:
+  %0 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
+  %cast2 = zext i16 %0 to i32
+  %tobool = icmp ne i16 %x, 0
+  %cond = select i1 %tobool, i32 %cast2, i32 16
+  ret i32 %cond
+}
+
+define i64 @test2c(i16 %x) {
+; CHECK-LABEL: @test2c(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i16 [[VAR1]] to i64
+; CHECK-NEXT: ret i64 [[VAR2]]
+entry:
+  %0 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
+  %conv = zext i16 %0 to i64
+  %tobool = icmp ne i16 %x, 0
+  %cond = select i1 %tobool, i64 %conv, i64 16
+  ret i64 %cond
+}
+
+define i64 @test3c(i32 %x) {
+; CHECK-LABEL: @test3c(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i32 [[VAR1]] to i64
+; CHECK-NEXT: ret i64 [[VAR2]]
+entry:
+  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %conv = zext i32 %0 to i64
+  %tobool = icmp ne i32 %x, 0
+  %cond = select i1 %tobool, i64 %conv, i64 32
+  ret i64 %cond
+}
+
+define i32 @test4c(i16 %x) {
+; CHECK-LABEL: @test4c(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i16 @llvm.ctlz.i16(i16 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i16 [[VAR1]] to i32
+; CHECK-NEXT: ret i32 [[VAR2]]
+entry:
+  %0 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
+  %cast = zext i16 %0 to i32
+  %tobool = icmp ne i16 %x, 0
+  %cond = select i1 %tobool, i32 %cast, i32 16
+  ret i32 %cond
+}
+
+define i64 @test5c(i16 %x) {
+; CHECK-LABEL: @test5c(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i16 @llvm.ctlz.i16(i16 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i16 [[VAR1]] to i64
+; CHECK-NEXT: ret i64 [[VAR2]]
+entry:
+  %0 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
+  %cast = zext i16 %0 to i64
+  %tobool = icmp ne i16 %x, 0
+  %cond = select i1 %tobool, i64 %cast, i64 16
+  ret i64 %cond
+}
+
+define i64 @test6c(i32 %x) {
+; CHECK-LABEL: @test6c(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i32 [[VAR1]] to i64
+; CHECK-NEXT: ret i64 [[VAR2]]
+entry:
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  %cast = zext i32 %0 to i64
+  %tobool = icmp ne i32 %x, 0
+  %cond = select i1 %tobool, i64 %cast, i64 32
+  ret i64 %cond
+}
+
+define i16 @test1d(i64 %x) {
+; CHECK-LABEL: @test1d(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i64 [[VAR1]] to i16
+; CHECK-NEXT: ret i16 [[VAR2]]
+entry:
+  %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %conv = trunc i64 %0 to i16
+  %tobool = icmp ne i64 %x, 0
+  %cond = select i1 %tobool, i16 %conv, i16 64
+  ret i16 %cond
+}
+
+define i32 @test2d(i64 %x) {
+; CHECK-LABEL: @test2d(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i64 [[VAR1]] to i32
+; CHECK-NEXT: ret i32 [[VAR2]]
+entry:
+  %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %cast = trunc i64 %0 to i32
+  %tobool = icmp ne i64 %x, 0
+  %cond = select i1 %tobool, i32 %cast, i32 64
+  ret i32 %cond
+}
+
+define i16 @test3d(i32 %x) {
+; CHECK-LABEL: @test3d(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i32 [[VAR1]] to i16
+; CHECK-NEXT: ret i16 [[VAR2]]
+entry:
+  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %cast = trunc i32 %0 to i16
+  %tobool = icmp ne i32 %x, 0
+  %cond = select i1 %tobool, i16 %cast, i16 32
+  ret i16 %cond
+}
+
+define i16 @test4d(i64 %x) {
+; CHECK-LABEL: @test4d(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i64 [[VAR1]] to i16
+; CHECK-NEXT: ret i16 [[VAR2]]
+entry:
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+  %cast = trunc i64 %0 to i16
+  %tobool = icmp ne i64 %x, 0
+  %cond = select i1 %tobool, i16 %cast, i16 64
+  ret i16 %cond
+}
+
+define i32 @test5d(i64 %x) {
+; CHECK-LABEL: @test5d(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i64 [[VAR1]] to i32
+; CHECK-NEXT: ret i32 [[VAR2]]
+entry:
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+  %cast = trunc i64 %0 to i32
+  %tobool = icmp ne i64 %x, 0
+  %cond = select i1 %tobool, i32 %cast, i32 64
+  ret i32 %cond
+}
+
+define i16 @test6d(i32 %x) {
+; CHECK-LABEL: @test6d(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i32 [[VAR1]] to i16
+; CHECK-NEXT: ret i16 [[VAR2]]
+entry:
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  %cast = trunc i32 %0 to i16
+  %tobool = icmp ne i32 %x, 0
+  %cond = select i1 %tobool, i16 %cast, i16 32
+  ret i16 %cond
+}
+
+define i64 @select_bug1(i32 %x) {
+; CHECK-LABEL: @select_bug1(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = zext i32 [[VAR1]] to i64
+; CHECK-NEXT: ret i64 [[VAR2]]
+entry:
+  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+  %conv = zext i32 %0 to i64
+  %tobool = icmp ne i32 %x, 0
+  %cond = select i1 %tobool, i64 %conv, i64 32
+  ret i64 %cond
+}
+
+define i16 @select_bug2(i32 %x) {
+; CHECK-LABEL: @select_bug2(
+; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+; CHECK-NEXT: [[VAR2:%[a-zA-Z0-9]+]] = trunc i32 [[VAR1]] to i16
+; CHECK-NEXT: ret i16 [[VAR2]]
+entry:
+  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+  %conv = trunc i32 %0 to i16
+  %tobool = icmp ne i32 %x, 0
+  %cond = select i1 %tobool, i16 %conv, i16 32
+  ret i16 %cond
+}
+
+
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index 6cf9f0f..a6a7aa9 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -308,6 +308,26 @@ define i32 @test16(i1 %C, i32* %P) {
 ; CHECK: ret i32 %V
 }
 
+;; It may be legal to load from a null address in a non-zero address space
+define i32 @test16_neg(i1 %C, i32 addrspace(1)* %P) {
+        %P2 = select i1 %C, i32 addrspace(1)* %P, i32 addrspace(1)* null
+        %V = load i32 addrspace(1)* %P2
+        ret i32 %V
+; CHECK-LABEL: @test16_neg
+; CHECK-NEXT: %P2 = select i1 %C, i32 addrspace(1)* %P, i32 addrspace(1)* null
+; CHECK-NEXT: %V = load i32 addrspace(1)* %P2
+; CHECK: ret i32 %V
+}
+define i32 @test16_neg2(i1 %C, i32 addrspace(1)* %P) {
+        %P2 = select i1 %C, i32 addrspace(1)* null, i32 addrspace(1)* %P
+        %V = load i32 addrspace(1)* %P2
+        ret i32 %V
+; CHECK-LABEL: @test16_neg2
+; CHECK-NEXT: %P2 = select i1 %C, i32 addrspace(1)* null, i32 addrspace(1)* %P
+; CHECK-NEXT: %V = load i32 addrspace(1)* %P2
+; CHECK: ret i32 %V
+}
+
 define i1 @test17(i32* %X, i1 %C) {
         %R = select i1 %C, i32* %X, i32* null           
         %RV = icmp eq i32* %R, null             
@@ -997,17 +1017,6 @@ define <2 x i32> @select_icmp_eq_and_1_0_or_vector_of_2s(i32 %x, <2 x i32> %y) {
   ret <2 x i32> %select
 }
 
-; CHECK-LABEL: @select_icmp_and_8_eq_0_or_8(
-; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 %x, 8
-; CHECK-NEXT: ret i32 [[OR]]
-define i32 @select_icmp_and_8_eq_0_or_8(i32 %x) {
-  %and = and i32 %x, 8
-  %cmp = icmp eq i32 %and, 0
-  %or = or i32 %x, 8
-  %or.x = select i1 %cmp, i32 %or, i32 %x
-  ret i32 %or.x
-}
-
 ; CHECK-LABEL: @select_icmp_and_8_ne_0_xor_8(
 ; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, -9
 ; CHECK-NEXT: ret i32 [[AND]]
@@ -1030,27 +1039,6 @@ define i32 @select_icmp_and_8_eq_0_xor_8(i32 %x) {
   ret i32 %xor.x
 }
 
-; CHECK-LABEL: @select_icmp_and_8_ne_0_and_not_8(
-; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, -9
-; CHECK-NEXT: ret i32 [[AND]]
-define i32 @select_icmp_and_8_ne_0_and_not_8(i32 %x) {
-  %and = and i32 %x, 8
-  %cmp = icmp eq i32 %and, 0
-  %and1 = and i32 %x, -9
-  %x.and1 = select i1 %cmp, i32 %x, i32 %and1
-  ret i32 %x.and1
-}
-
-; CHECK-LABEL: @select_icmp_and_8_eq_0_and_not_8(
-; CHECK-NEXT: ret i32 %x
-define i32 @select_icmp_and_8_eq_0_and_not_8(i32 %x) {
-  %and = and i32 %x, 8
-  %cmp = icmp eq i32 %and, 0
-  %and1 = and i32 %x, -9
-  %and1.x = select i1 %cmp, i32 %and1, i32 %x
-  ret i32 %and1.x
-}
-
 ; CHECK-LABEL: @select_icmp_x_and_8_eq_0_y_xor_8(
 ; CHECK: select i1 %cmp, i64 %y, i64 %xor
 define i64 @select_icmp_x_and_8_eq_0_y_xor_8(i32 %x, i64 %y) {
@@ -1061,16 +1049,6 @@ define i64 @select_icmp_x_and_8_eq_0_y_xor_8(i32 %x, i64 %y) {
   ret i64 %y.xor
 }
 
-; CHECK-LABEL: @select_icmp_x_and_8_eq_0_y_and_not_8(
-; CHECK: select i1 %cmp, i64 %y, i64 %and1
-define i64 @select_icmp_x_and_8_eq_0_y_and_not_8(i32 %x, i64 %y) {
-  %and = and i32 %x, 8
-  %cmp = icmp eq i32 %and, 0
-  %and1 = and i64 %y, -9
-  %y.and1 = select i1 %cmp, i64 %y, i64 %and1
-  ret i64 %y.and1
-}
-
 ; CHECK-LABEL: @select_icmp_x_and_8_ne_0_y_xor_8(
 ; CHECK: select i1 %cmp, i64 %xor, i64 %y
 define i64 @select_icmp_x_and_8_ne_0_y_xor_8(i32 %x, i64 %y) {
@@ -1081,16 +1059,6 @@ define i64 @select_icmp_x_and_8_ne_0_y_xor_8(i32 %x, i64 %y) {
   ret i64 %xor.y
 }
 
-; CHECK-LABEL: @select_icmp_x_and_8_ne_0_y_and_not_8(
-; CHECK: select i1 %cmp, i64 %and1, i64 %y
-define i64 @select_icmp_x_and_8_ne_0_y_and_not_8(i32 %x, i64 %y) {
-  %and = and i32 %x, 8
-  %cmp = icmp eq i32 %and, 0
-  %and1 = and i64 %y, -9
-  %and1.y = select i1 %cmp, i64 %and1, i64 %y
-  ret i64 %and1.y
-}
-
 ; CHECK-LABEL: @select_icmp_x_and_8_ne_0_y_or_8(
 ; CHECK: xor i64 %1, 8
 ; CHECK: or i64 %2, %y
@@ -1102,6 +1070,39 @@ define i64 @select_icmp_x_and_8_ne_0_y_or_8(i32 %x, i64 %y) {
   ret i64 %or.y
 }
 
+; CHECK-LABEL: @select_icmp_and_2147483648_ne_0_xor_2147483648(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, 2147483647
+; CHECK-NEXT: ret i32 [[AND]]
+define i32 @select_icmp_and_2147483648_ne_0_xor_2147483648(i32 %x) {
+  %and = and i32 %x, 2147483648
+  %cmp = icmp eq i32 %and, 0
+  %xor = xor i32 %x, 2147483648
+  %x.xor = select i1 %cmp, i32 %x, i32 %xor
+  ret i32 %x.xor
+}
+
+; CHECK-LABEL: @select_icmp_and_2147483648_eq_0_xor_2147483648(
+; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 %x, -2147483648
+; CHECK-NEXT: ret i32 [[OR]]
+define i32 @select_icmp_and_2147483648_eq_0_xor_2147483648(i32 %x) {
+  %and = and i32 %x, 2147483648
+  %cmp = icmp eq i32 %and, 0
+  %xor = xor i32 %x, 2147483648
+  %xor.x = select i1 %cmp, i32 %xor, i32 %x
+  ret i32 %xor.x
+}
+
+; CHECK-LABEL: @select_icmp_x_and_2147483648_ne_0_or_2147483648(
+; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 %x, -2147483648
+; CHECK-NEXT: ret i32 [[OR]]
+define i32 @select_icmp_x_and_2147483648_ne_0_or_2147483648(i32 %x) {
+  %and = and i32 %x, 2147483648
+  %cmp = icmp eq i32 %and, 0
+  %or = or i32 %x, 2147483648
+  %or.x = select i1 %cmp, i32 %or, i32 %x
+  ret i32 %or.x
+}
+
 define i32 @test65(i64 %x) {
   %1 = and i64 %x, 16
   %2 = icmp ne i64 %1, 0
@@ -1256,7 +1257,7 @@ define i32 @test76(i1 %flag, i32* %x) {
   ret i32 %v
 }
 
-declare void @scribble_on_memory(i32*)
+declare void @scribble_on_i32(i32*)
 
 define i32 @test77(i1 %flag, i32* %x) {
 ; The load here must not be speculated around the select. One side of the
@@ -1264,13 +1265,13 @@ define i32 @test77(i1 %flag, i32* %x) {
 ; load does.
 ; CHECK-LABEL: @test77(
 ; CHECK: %[[A:.*]] = alloca i32, align 1
-; CHECK: call void @scribble_on_memory(i32* %[[A]])
+; CHECK: call void @scribble_on_i32(i32* %[[A]])
 ; CHECK: store i32 0, i32* %x
 ; CHECK: %[[P:.*]] = select i1 %flag, i32* %[[A]], i32* %x
 ; CHECK: load i32* %[[P]]
 
   %under_aligned = alloca i32, align 1
-  call void @scribble_on_memory(i32* %under_aligned)
+  call void @scribble_on_i32(i32* %under_aligned)
   store i32 0, i32* %x
   %p = select i1 %flag, i32* %under_aligned, i32* %x
   %v = load i32* %p
@@ -1327,8 +1328,8 @@ define i32 @test80(i1 %flag) {
 entry:
   %x = alloca i32
   %y = alloca i32
-  call void @scribble_on_memory(i32* %x)
-  call void @scribble_on_memory(i32* %y)
+  call void @scribble_on_i32(i32* %x)
+  call void @scribble_on_i32(i32* %y)
   %tmp = load i32* %x
   store i32 %tmp, i32* %y
   %p = select i1 %flag, i32* %x, i32* %y
@@ -1351,8 +1352,8 @@ entry:
   %y = alloca i32
   %x1 = bitcast float* %x to i32*
   %y1 = bitcast i32* %y to float*
-  call void @scribble_on_memory(i32* %x1)
-  call void @scribble_on_memory(i32* %y)
+  call void @scribble_on_i32(i32* %x1)
+  call void @scribble_on_i32(i32* %y)
   %tmp = load i32* %x1
   store i32 %tmp, i32* %y
   %p = select i1 %flag, float* %x, float* %y1
@@ -1377,11 +1378,145 @@ entry:
   %y = alloca i32
   %x1 = bitcast float* %x to i32*
   %y1 = bitcast i32* %y to float*
-  call void @scribble_on_memory(i32* %x1)
-  call void @scribble_on_memory(i32* %y)
+  call void @scribble_on_i32(i32* %x1)
+  call void @scribble_on_i32(i32* %y)
   %tmp = load float* %x
   store float %tmp, float* %y1
   %p = select i1 %flag, i32* %x1, i32* %y
   %v = load i32* %p
   ret i32 %v
 }
+
+declare void @scribble_on_i64(i64*)
+declare void @scribble_on_i128(i128*)
+
+define i8* @test83(i1 %flag) {
+; Test that we can speculate the load around the select even though they use
+; differently typed pointers and requires inttoptr casts.
+; CHECK-LABEL: @test83(
+; CHECK:         %[[X:.*]] = alloca i8*
+; CHECK-NEXT:    %[[Y:.*]] = alloca i8*
+; CHECK-DAG:     %[[X2:.*]] = bitcast i8** %[[X]] to i64*
+; CHECK-DAG:     %[[Y2:.*]] = bitcast i8** %[[Y]] to i64*
+; CHECK:         %[[V:.*]] = load i64* %[[X2]]
+; CHECK-NEXT:    store i64 %[[V]], i64* %[[Y2]]
+; CHECK-NEXT:    %[[C:.*]] = inttoptr i64 %[[V]] to i8*
+; CHECK-NEXT:    ret i8* %[[S]]
+entry:
+  %x = alloca i8*
+  %y = alloca i64
+  %x1 = bitcast i8** %x to i64*
+  %y1 = bitcast i64* %y to i8**
+  call void @scribble_on_i64(i64* %x1)
+  call void @scribble_on_i64(i64* %y)
+  %tmp = load i64* %x1
+  store i64 %tmp, i64* %y
+  %p = select i1 %flag, i8** %x, i8** %y1
+  %v = load i8** %p
+  ret i8* %v
+}
+
+define i64 @test84(i1 %flag) {
+; Test that we can speculate the load around the select even though they use
+; differently typed pointers and requires a ptrtoint cast.
+; CHECK-LABEL: @test84(
+; CHECK:         %[[X:.*]] = alloca i8*
+; CHECK-NEXT:    %[[Y:.*]] = alloca i8*
+; CHECK:         %[[V:.*]] = load i8** %[[X]]
+; CHECK-NEXT:    store i8* %[[V]], i8** %[[Y]]
+; CHECK-NEXT:    %[[C:.*]] = ptrtoint i8* %[[V]] to i64
+; CHECK-NEXT:    ret i64 %[[C]]
+entry:
+  %x = alloca i8*
+  %y = alloca i64
+  %x1 = bitcast i8** %x to i64*
+  %y1 = bitcast i64* %y to i8**
+  call void @scribble_on_i64(i64* %x1)
+  call void @scribble_on_i64(i64* %y)
+  %tmp = load i8** %x
+  store i8* %tmp, i8** %y1
+  %p = select i1 %flag, i64* %x1, i64* %y
+  %v = load i64* %p
+  ret i64 %v
+}
+
+define i8* @test85(i1 %flag) {
+; Test that we can't speculate the load around the select. The load of the
+; pointer doesn't load all of the stored integer bits. We could fix this, but it
+; would require endianness checks and other nastiness.
+; CHECK-LABEL: @test85(
+; CHECK:         %[[T:.*]] = load i128*
+; CHECK-NEXT:    store i128 %[[T]], i128*
+; CHECK-NEXT:    %[[X:.*]] = load i8**
+; CHECK-NEXT:    %[[Y:.*]] = load i8**
+; CHECK-NEXT:    %[[V:.*]] = select i1 %flag, i8* %[[X]], i8* %[[Y]]
+; CHECK-NEXT:    ret i8* %[[V]]
+entry:
+  %x = alloca [2 x i8*]
+  %y = alloca i128
+  %x1 = bitcast [2 x i8*]* %x to i8**
+  %x2 = bitcast i8** %x1 to i128*
+  %y1 = bitcast i128* %y to i8**
+  call void @scribble_on_i128(i128* %x2)
+  call void @scribble_on_i128(i128* %y)
+  %tmp = load i128* %x2
+  store i128 %tmp, i128* %y
+  %p = select i1 %flag, i8** %x1, i8** %y1
+  %v = load i8** %p
+  ret i8* %v
+}
+
+define i128 @test86(i1 %flag) {
+; Test that we can't speculate the load around the select when the integer size
+; is larger than the pointer size. The store of the pointer doesn't store to all
+; the bits of the integer.
+;
+; CHECK-LABEL: @test86(
+; CHECK:         %[[T:.*]] = load i8**
+; CHECK-NEXT:    store i8* %[[T]], i8**
+; CHECK-NEXT:    %[[X:.*]] = load i128*
+; CHECK-NEXT:    %[[Y:.*]] = load i128*
+; CHECK-NEXT:    %[[V:.*]] = select i1 %flag, i128 %[[X]], i128 %[[Y]]
+; CHECK-NEXT:    ret i128 %[[V]]
+entry:
+  %x = alloca [2 x i8*]
+  %y = alloca i128
+  %x1 = bitcast [2 x i8*]* %x to i8**
+  %x2 = bitcast i8** %x1 to i128*
+  %y1 = bitcast i128* %y to i8**
+  call void @scribble_on_i128(i128* %x2)
+  call void @scribble_on_i128(i128* %y)
+  %tmp = load i8** %x1
+  store i8* %tmp, i8** %y1
+  %p = select i1 %flag, i128* %x2, i128* %y
+  %v = load i128* %p
+  ret i128 %v
+}
+
+define i32 @test_select_select0(i32 %a, i32 %r0, i32 %r1, i32 %v1, i32 %v2) {
+  ; CHECK-LABEL: @test_select_select0(
+  ; CHECK: %[[C0:.*]] = icmp sge i32 %a, %v1
+  ; CHECK-NEXT: %[[C1:.*]] = icmp slt i32 %a, %v2
+  ; CHECK-NEXT: %[[C:.*]] = and i1 %[[C1]], %[[C0]]
+  ; CHECK-NEXT: %[[SEL:.*]] = select i1 %[[C]], i32 %r0, i32 %r1
+  ; CHECK-NEXT: ret i32 %[[SEL]]
+  %c0 = icmp sge i32 %a, %v1
+  %s0 = select i1 %c0, i32 %r0, i32 %r1
+  %c1 = icmp slt i32 %a, %v2
+  %s1 = select i1 %c1, i32 %s0, i32 %r1
+  ret i32 %s1
+}
+
+define i32 @test_select_select1(i32 %a, i32 %r0, i32 %r1, i32 %v1, i32 %v2) {
+  ; CHECK-LABEL: @test_select_select1(
+  ; CHECK: %[[C0:.*]] = icmp sge i32 %a, %v1
+  ; CHECK-NEXT: %[[C1:.*]] = icmp slt i32 %a, %v2
+  ; CHECK-NEXT: %[[C:.*]] = or i1 %[[C1]], %[[C0]]
+  ; CHECK-NEXT: %[[SEL:.*]] = select i1 %[[C]], i32 %r0, i32 %r1
+  ; CHECK-NEXT: ret i32 %[[SEL]]
+  %c0 = icmp sge i32 %a, %v1
+  %s0 = select i1 %c0, i32 %r0, i32 %r1
+  %c1 = icmp slt i32 %a, %v2
+  %s1 = select i1 %c1, i32 %r0, i32 %s0
+  ret i32 %s1
+}
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index 5586bb6..0b5b5de 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -57,7 +57,7 @@ define <4 x i32> @test5_zero_vector(<4 x i32> %A) {
 define <4 x i32> @test5_non_splat_vector(<4 x i32> %A) {
 ; CHECK-LABEL: @test5_non_splat_vector(
 ; CHECK-NOT: ret <4 x i32> undef
-  %B = shl <4 x i32> %A, <i32 32, i32 1, i32 2, i32 3>
+  %B = lshr <4 x i32> %A, <i32 32, i32 1, i32 2, i32 3>
   ret <4 x i32> %B
 }
 
@@ -84,14 +84,14 @@ define <4 x i32> @test5a_non_splat_vector(<4 x i32> %A) {
 
 define i32 @test5b() {
 ; CHECK-LABEL: @test5b(
-; CHECK: ret i32 -1
+; CHECK: ret i32 0
         %B = ashr i32 undef, 2  ;; top two bits must be equal, so not undef
         ret i32 %B
 }
 
 define i32 @test5b2(i32 %A) {
 ; CHECK-LABEL: @test5b2(
-; CHECK: ret i32 -1
+; CHECK: ret i32 0
         %B = ashr i32 undef, %A  ;; top %A bits must be equal, so not undef
         ret i32 %B
 }
@@ -738,67 +738,49 @@ define i32 @test56(i32 %x) {
 
 
 define i32 @test57(i32 %x) {
-  %shr = lshr i32 %x, 1
-  %shl = shl i32 %shr, 4
-  %and = and i32 %shl, 16
-  ret i32 %and
-; CHECK-LABEL: @test57(
-; CHECK: shl i32 %x, 3
-}
-
-define i32 @test58(i32 %x) {
-  %shr = lshr i32 %x, 1
-  %shl = shl i32 %shr, 4
-  %or = or i32 %shl, 8
-  ret i32 %or
-; CHECK-LABEL: @test58(
-; CHECK: shl i32 %x, 3
-}
-
-define i32 @test59(i32 %x) {
   %shr = ashr i32 %x, 1
   %shl = shl i32 %shr, 4
   %or = or i32 %shl, 7
   ret i32 %or
-; CHECK-LABEL: @test59(
+; CHECK-LABEL: @test57(
 ; CHECK: %shl = shl i32 %shr1, 4
 }
 
 
-define i32 @test60(i32 %x) {
+define i32 @test58(i32 %x) {
   %shr = ashr i32 %x, 4
   %shl = shl i32 %shr, 1
   %or = or i32 %shl, 1
   ret i32 %or
-; CHECK-LABEL: @test60(
+; CHECK-LABEL: @test58(
 ; CHECK: ashr i32 %x, 3
 }
 
 
-define i32 @test61(i32 %x) {
+define i32 @test59(i32 %x) {
   %shr = ashr i32 %x, 4
   %shl = shl i32 %shr, 1
   %or = or i32 %shl, 2
   ret i32 %or
-; CHECK-LABEL: @test61(
+; CHECK-LABEL: @test59(
 ; CHECK: ashr i32 %x, 4
 }
 
 ; propagate "exact" trait
-define i32 @test62(i32 %x) {
+define i32 @test60(i32 %x) {
   %shr = ashr exact i32 %x, 4
   %shl = shl i32 %shr, 1
   %or = or i32 %shl, 1
   ret i32 %or
-; CHECK-LABEL: @test62(
+; CHECK-LABEL: @test60(
 ; CHECK: ashr exact i32 %x, 3
 }
 
 ; PR17026
-; CHECK-LABEL: @test63(
+; CHECK-LABEL: @test61(
 ; CHECK-NOT: sh
 ; CHECK: ret
-define void @test63(i128 %arg) {
+define void @test61(i128 %arg) {
 bb:
   br i1 undef, label %bb1, label %bb12
 
@@ -830,29 +812,29 @@ bb12:                                             ; preds = %bb11, %bb8, %bb
   ret void
 }
 
-define i32 @test64(i32 %a) {
-; CHECK-LABEL: @test64(
+define i32 @test62(i32 %a) {
+; CHECK-LABEL: @test62(
 ; CHECK-NEXT: ret i32 undef
   %b = ashr i32 %a, 32  ; shift all bits out
   ret i32 %b
 }
 
-define <4 x i32> @test64_splat_vector(<4 x i32> %a) {
-; CHECK-LABEL: @test64_splat_vector
+define <4 x i32> @test62_splat_vector(<4 x i32> %a) {
+; CHECK-LABEL: @test62_splat_vector
 ; CHECK-NEXT: ret <4 x i32> undef
   %b = ashr <4 x i32> %a, <i32 32, i32 32, i32 32, i32 32>  ; shift all bits out
   ret <4 x i32> %b
 }
 
-define <4 x i32> @test64_non_splat_vector(<4 x i32> %a) {
-; CHECK-LABEL: @test64_non_splat_vector
+define <4 x i32> @test62_non_splat_vector(<4 x i32> %a) {
+; CHECK-LABEL: @test62_non_splat_vector
 ; CHECK-NOT: ret <4 x i32> undef
   %b = ashr <4 x i32> %a, <i32 32, i32 0, i32 1, i32 2>  ; shift all bits out
   ret <4 x i32> %b
 }
 
-define <2 x i65> @test_65(<2 x i64> %t) {
-; CHECK-LABEL: @test_65
+define <2 x i65> @test_63(<2 x i64> %t) {
+; CHECK-LABEL: @test_63
   %a = zext <2 x i64> %t to <2 x i65>
   %sext = shl <2 x i65> %a, <i65 33, i65 33>
   %b = ashr <2 x i65> %sext, <i65 33, i65 33>
diff --git a/test/Transforms/InstCombine/signext.ll b/test/Transforms/InstCombine/signext.ll
index d700497..3a714d7 100644
--- a/test/Transforms/InstCombine/signext.ll
+++ b/test/Transforms/InstCombine/signext.ll
@@ -34,54 +34,45 @@ define i32 @test3(i16 %P) {
 ; CHECK: ret i32 %tmp.5
 }
 
-define i32 @test4(i16 %P) {
-        %tmp.1 = zext i16 %P to i32             ; <i32> [#uses=1]
-        %tmp.4 = xor i32 %tmp.1, 32768          ; <i32> [#uses=1]
-        %tmp.5 = add i32 %tmp.4, -32768         ; <i32> [#uses=1]
-        ret i32 %tmp.5
-; CHECK-LABEL: @test4(
-; CHECK: %tmp.5 = sext i16 %P to i32
-; CHECK: ret i32 %tmp.5
-}
-
-define i32 @test5(i32 %x) {
+define i32 @test4(i32 %x) {
         %tmp.1 = and i32 %x, 255                ; <i32> [#uses=1]
         %tmp.2 = xor i32 %tmp.1, 128            ; <i32> [#uses=1]
         %tmp.3 = add i32 %tmp.2, -128           ; <i32> [#uses=1]
         ret i32 %tmp.3
-; CHECK-LABEL: @test5(
+; CHECK-LABEL: @test4(
 ; CHECK: %sext = shl i32 %x, 24
 ; CHECK: %tmp.3 = ashr exact i32 %sext, 24
 ; CHECK: ret i32 %tmp.3
 }
 
-define i32 @test6(i32 %x) {
+define i32 @test5(i32 %x) {
         %tmp.2 = shl i32 %x, 16         ; <i32> [#uses=1]
         %tmp.4 = ashr i32 %tmp.2, 16            ; <i32> [#uses=1]
         ret i32 %tmp.4
-; CHECK-LABEL: @test6(
+; CHECK-LABEL: @test5(
 ; CHECK: %tmp.2 = shl i32 %x, 16
 ; CHECK: %tmp.4 = ashr exact i32 %tmp.2, 16
 ; CHECK: ret i32 %tmp.4
 }
 
-define i32 @test7(i16 %P) {
+define i32 @test6(i16 %P) {
   %tmp.1 = zext i16 %P to i32                     ; <i32> [#uses=1]
   %sext1 = shl i32 %tmp.1, 16                     ; <i32> [#uses=1]
   %tmp.5 = ashr i32 %sext1, 16                    ; <i32> [#uses=1]
   ret i32 %tmp.5
-; CHECK-LABEL: @test7(
+; CHECK-LABEL: @test6(
 ; CHECK: %tmp.5 = sext i16 %P to i32
 ; CHECK: ret i32 %tmp.5
 }
 
-define i32 @test8(i32 %x) nounwind readnone {
+define i32 @test7(i32 %x) nounwind readnone {
 entry:
   %shr = lshr i32 %x, 5                           ; <i32> [#uses=1]
   %xor = xor i32 %shr, 67108864                   ; <i32> [#uses=1]
   %sub = add i32 %xor, -67108864                  ; <i32> [#uses=1]
   ret i32 %sub
-; CHECK-LABEL: @test8(
+; CHECK-LABEL: @test7(
 ; CHECK: %sub = ashr i32 %x, 5
 ; CHECK: ret i32 %sub
 }
+
diff --git a/test/Transforms/InstCombine/sitofp.ll b/test/Transforms/InstCombine/sitofp.ll
index bd31b89..8209778 100644
--- a/test/Transforms/InstCombine/sitofp.ll
+++ b/test/Transforms/InstCombine/sitofp.ll
@@ -1,28 +1,40 @@
-; RUN: opt < %s -instcombine -S | not grep itofp
+; RUN: opt < %s -instcombine -S | FileCheck %s
 
+; CHECK-LABEL: test1
+; CHECK: ret i1 true
 define i1 @test1(i8 %A) {
   %B = sitofp i8 %A to double
   %C = fcmp ult double %B, 128.0
-  ret i1 %C  ;  True!
+  ret i1 %C
 }
+
+; CHECK-LABEL: test2
+; CHECK: ret i1 true
 define i1 @test2(i8 %A) {
   %B = sitofp i8 %A to double
   %C = fcmp ugt double %B, -128.1
-  ret i1 %C  ;  True!
+  ret i1 %C
 }
 
+; CHECK-LABEL: test3
+; CHECK: ret i1 true
 define i1 @test3(i8 %A) {
   %B = sitofp i8 %A to double
   %C = fcmp ule double %B, 127.0
-  ret i1 %C  ;  true!
+  ret i1 %C
 }
 
+; CHECK-LABEL: test4
+; CHECK: icmp ne i8 %A, 127
+; CHECK-NEXT: ret i1
 define i1 @test4(i8 %A) {
   %B = sitofp i8 %A to double
   %C = fcmp ult double %B, 127.0
-  ret i1 %C  ;  A != 127
+  ret i1 %C
 }
 
+; CHECK-LABEL: test5
+; CHECK: ret i32
 define i32 @test5(i32 %A) {
   %B = sitofp i32 %A to double
   %C = fptosi double %B to i32
@@ -31,25 +43,142 @@ define i32 @test5(i32 %A) {
   ret i32 %E
 }
 
+; CHECK-LABEL: test6
+; CHECK: and i32 %A, 39
+; CHECK-NEXT: ret i32
 define i32 @test6(i32 %A) {
-	%B = and i32 %A, 7		; <i32> [#uses=1]
-	%C = and i32 %A, 32		; <i32> [#uses=1]
-	%D = sitofp i32 %B to double		; <double> [#uses=1]
-	%E = sitofp i32 %C to double		; <double> [#uses=1]
-	%F = fadd double %D, %E		; <double> [#uses=1]
-	%G = fptosi double %F to i32		; <i32> [#uses=1]
-	ret i32 %G
+  %B = and i32 %A, 7
+  %C = and i32 %A, 32
+  %D = sitofp i32 %B to double
+  %E = sitofp i32 %C to double
+  %F = fadd double %D, %E
+  %G = fptosi double %F to i32
+  ret i32 %G
+}
+
+; CHECK-LABEL: test7
+; CHECK: ret i32
+define i32 @test7(i32 %A) nounwind {
+  %B = sitofp i32 %A to double
+  %C = fptoui double %B to i32
+  ret i32 %C
+}
+
+; CHECK-LABEL: test8
+; CHECK: ret i32
+define i32 @test8(i32 %A) nounwind {
+  %B = uitofp i32 %A to double
+  %C = fptosi double %B to i32
+  ret i32 %C
+}
+
+; CHECK-LABEL: test9
+; CHECK: zext i8
+; CHECK-NEXT: ret i32
+define i32 @test9(i8 %A) nounwind {
+  %B = sitofp i8 %A to float
+  %C = fptoui float %B to i32
+  ret i32 %C
+}
+
+; CHECK-LABEL: test10
+; CHECK: sext i8
+; CHECK-NEXT: ret i32
+define i32 @test10(i8 %A) nounwind {
+  %B = sitofp i8 %A to float
+  %C = fptosi float %B to i32
+  ret i32 %C
+}
+
+; If the input value is outside of the range of the output cast, it's
+; undefined behavior, so we can assume it fits.
+; CHECK-LABEL: test11
+; CHECK: trunc
+; CHECK-NEXT: ret i8
+define i8 @test11(i32 %A) nounwind {
+  %B = sitofp i32 %A to float
+  %C = fptosi float %B to i8
+  ret i8 %C
+}
+
+; If the input value is negative, it'll be outside the range of the
+; output cast, and thus undefined behavior.
+; CHECK-LABEL: test12
+; CHECK: zext i8
+; CHECK-NEXT: ret i32
+define i32 @test12(i8 %A) nounwind {
+  %B = sitofp i8 %A to float
+  %C = fptoui float %B to i32
+  ret i32 %C
+}
+
+; This can't fold because the 25-bit input doesn't fit in the mantissa.
+; CHECK-LABEL: test13
+; CHECK: uitofp
+; CHECK-NEXT: fptoui
+define i32 @test13(i25 %A) nounwind {
+  %B = uitofp i25 %A to float
+  %C = fptoui float %B to i32
+  ret i32 %C
+}
+
+; But this one can.
+; CHECK-LABEL: test14
+; CHECK: zext i24
+; CHECK-NEXT: ret i32
+define i32 @test14(i24 %A) nounwind {
+  %B = uitofp i24 %A to float
+  %C = fptoui float %B to i32
+  ret i32 %C
+}
+
+; And this one can too.
+; CHECK-LABEL: test15
+; CHECK: trunc i32
+; CHECK-NEXT: ret i24
+define i24 @test15(i32 %A) nounwind {
+  %B = uitofp i32 %A to float
+  %C = fptoui float %B to i24
+  ret i24 %C
+}
+
+; This can fold because the 25-bit input is signed and we disard the sign bit.
+; CHECK-LABEL: test16
+; CHECK: zext
+define i32 @test16(i25 %A) nounwind {
+ %B = sitofp i25 %A to float
+ %C = fptoui float %B to i32
+ ret i32 %C
+}
+
+; This can't fold because the 26-bit input won't fit the mantissa
+; even after disarding the signed bit.
+; CHECK-LABEL: test17
+; CHECK: sitofp
+; CHECK-NEXT: fptoui
+define i32 @test17(i26 %A) nounwind {
+ %B = sitofp i26 %A to float
+ %C = fptoui float %B to i32
+ ret i32 %C
 }
 
-define i32 @test7(i32 %a) nounwind {
-	%b = sitofp i32 %a to double		; <double> [#uses=1]
-	%c = fptoui double %b to i32		; <i32> [#uses=1]
-	ret i32 %c
+; This can fold because the 54-bit output is signed and we disard the sign bit.
+; CHECK-LABEL: test18
+; CHECK: trunc
+define i54 @test18(i64 %A) nounwind {
+ %B = sitofp i64 %A to double
+ %C = fptosi double %B to i54
+ ret i54 %C
 }
 
-define i32 @test8(i32 %a) nounwind {
-	%b = uitofp i32 %a to double		; <double> [#uses=1]
-	%c = fptosi double %b to i32		; <i32> [#uses=1]
-	ret i32 %c
+; This can't fold because the 55-bit output won't fit the mantissa
+; even after disarding the sign bit.
+; CHECK-LABEL: test19
+; CHECK: sitofp
+; CHECK-NEXT: fptosi
+define i55 @test19(i64 %A) nounwind {
+ %B = sitofp i64 %A to double
+ %C = fptosi double %B to i55
+ ret i55 %C
 }
 
diff --git a/test/Transforms/InstCombine/statepoint.ll b/test/Transforms/InstCombine/statepoint.ll
new file mode 100644
index 0000000..bee219d
--- /dev/null
+++ b/test/Transforms/InstCombine/statepoint.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; These tests check the optimizations specific to
+; pointers being relocated at a statepoint.
+
+
+declare void @func()
+
+define i1 @test_negative(i32 addrspace(1)* %p) {
+entry:
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* %p)
+  %pnew = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 4, i32 4)
+  %cmp = icmp eq i32 addrspace(1)* %pnew, null
+  ret i1 %cmp
+; CHECK-LABEL: test_negative
+; CHECK: %pnew = call i32 addrspace(1)*
+; CHECK: ret i1 %cmp
+}
+
+define i1 @test_nonnull(i32 addrspace(1)* nonnull %p) {
+entry:
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* %p)
+  %pnew = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 4, i32 4)
+  %cmp = icmp eq i32 addrspace(1)* %pnew, null
+  ret i1 %cmp
+; CHECK-LABEL: test_nonnull
+; CHECK: ret i1 false
+}
+
+define i1 @test_null() {
+entry:
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* null)
+  %pnew = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 4, i32 4)
+  %cmp = icmp eq i32 addrspace(1)* %pnew, null
+  ret i1 %cmp
+; CHECK-LABEL: test_null
+; CHECK-NOT: %pnew
+; CHECK: ret i1 true
+}
+
+define i1 @test_undef() {
+entry:
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* undef)
+  %pnew = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 4, i32 4)
+  %cmp = icmp eq i32 addrspace(1)* %pnew, null
+  ret i1 %cmp
+; CHECK-LABEL: test_undef
+; CHECK-NOT: %pnew
+; CHECK: ret i1 undef
+}
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()*, i32, i32, ...)
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32, i32, i32) #3
diff --git a/test/Transforms/InstCombine/store.ll b/test/Transforms/InstCombine/store.ll
index b64c800..0bb1759 100644
--- a/test/Transforms/InstCombine/store.ll
+++ b/test/Transforms/InstCombine/store.ll
@@ -113,8 +113,8 @@ for.end:                                          ; preds = %for.cond
 ; CHECK-NEXT: store i32 %storemerge, i32* %gi, align 4, !tbaa !0
 }
 
-!0 = metadata !{metadata !4, metadata !4, i64 0}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"float", metadata !1}
-!4 = metadata !{metadata !"int", metadata !1}
+!0 = !{!4, !4, i64 0}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"float", !1}
+!4 = !{!"int", !1}
diff --git a/test/Transforms/InstCombine/stpcpy_chk-1.ll b/test/Transforms/InstCombine/stpcpy_chk-1.ll
index 8a02529..393c5d9 100644
--- a/test/Transforms/InstCombine/stpcpy_chk-1.ll
+++ b/test/Transforms/InstCombine/stpcpy_chk-1.ll
@@ -11,46 +11,50 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 
 ; Check cases where slen >= strlen (src).
 
-define void @test_simplify1() {
+define i8* @test_simplify1() {
 ; CHECK-LABEL: @test_simplify1(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
-  call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 60)
-  ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
+  %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 60)
+  ret i8* %ret
 }
 
-define void @test_simplify2() {
+define i8* @test_simplify2() {
 ; CHECK-LABEL: @test_simplify2(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
-  call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 12)
-  ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
+  %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 12)
+  ret i8* %ret
 }
 
-define void @test_simplify3() {
+define i8* @test_simplify3() {
 ; CHECK-LABEL: @test_simplify3(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
-  call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
-  ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
+  %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
+  ret i8* %ret
 }
 
 ; Check cases where there are no string constants.
 
-define void @test_simplify4() {
+define i8* @test_simplify4() {
 ; CHECK-LABEL: @test_simplify4(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
 
-; CHECK-NEXT: call i8* @stpcpy
-  call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
-  ret void
+; CHECK-NEXT: %stpcpy = call i8* @stpcpy(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0))
+; CHECK-NEXT: ret i8* %stpcpy
+  %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
+  ret i8* %ret
 }
 
 ; Check case where the string length is not constant.
@@ -60,10 +64,11 @@ define i8* @test_simplify5() {
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK: @__memcpy_chk
+; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i1 false)
+; CHECK-NEXT: %1 = call i8* @__memcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 %len)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
   %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
   %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 %len)
-; CHECK: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
   ret i8* %ret
 }
 
@@ -73,8 +78,9 @@ define i8* @test_simplify6() {
 ; CHECK-LABEL: @test_simplify6(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
 
-; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen
-; CHECK-NEXT: getelementptr inbounds [60 x i8]* @a, i32 0, i32 [[LEN]]
+; CHECK-NEXT: %strlen = call i32 @strlen(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0))
+; CHECK-NEXT: %1 = getelementptr inbounds [60 x i8]* @a, i32 0, i32 %strlen
+; CHECK-NEXT: ret i8* %1
   %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
   %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %dst, i32 %len)
   ret i8* %ret
@@ -82,14 +88,15 @@ define i8* @test_simplify6() {
 
 ; Check case where slen < strlen (src).
 
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
 ; CHECK-LABEL: @test_no_simplify1(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
 
-; CHECK-NEXT: call i8* @__stpcpy_chk
-  call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 8)
-  ret void
+; CHECK-NEXT: %ret = call i8* @__stpcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0), i32 8)
+; CHECK-NEXT: ret i8* %ret
+  %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 8)
+  ret i8* %ret
 }
 
 declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind
diff --git a/test/Transforms/InstCombine/strcpy_chk-1.ll b/test/Transforms/InstCombine/strcpy_chk-1.ll
index 8e7fec7..e3f163f 100644
--- a/test/Transforms/InstCombine/strcpy_chk-1.ll
+++ b/test/Transforms/InstCombine/strcpy_chk-1.ll
@@ -11,59 +11,65 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 
 ; Check cases where slen >= strlen (src).
 
-define void @test_simplify1() {
+define i8* @test_simplify1() {
 ; CHECK-LABEL: @test_simplify1(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
-  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 60)
-  ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+  %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 60)
+  ret i8* %ret
 }
 
-define void @test_simplify2() {
+define i8* @test_simplify2() {
 ; CHECK-LABEL: @test_simplify2(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
-  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 12)
-  ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+  %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 12)
+  ret i8* %ret
 }
 
-define void @test_simplify3() {
+define i8* @test_simplify3() {
 ; CHECK-LABEL: @test_simplify3(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
-  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
-  ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+  %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
+  ret i8* %ret
 }
 
 ; Check cases where there are no string constants.
 
-define void @test_simplify4() {
+define i8* @test_simplify4() {
 ; CHECK-LABEL: @test_simplify4(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
 
-; CHECK-NEXT: call i8* @strcpy
-  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
-  ret void
+; CHECK-NEXT: %strcpy = call i8* @strcpy(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0))
+; CHECK-NEXT: ret i8* %strcpy
+  %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
+  ret i8* %ret
 }
 
 ; Check case where the string length is not constant.
 
-define void @test_simplify5() {
+define i8* @test_simplify5() {
 ; CHECK-LABEL: @test_simplify5(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK: @__memcpy_chk
+; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i1 false)
+; CHECK-NEXT: %1 = call i8* @__memcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 %len)
+; CHECK-NEXT: ret i8* %1
   %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
-  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len)
-  ret void
+  %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len)
+  ret i8* %ret
 }
 
 ; Check case where the source and destination are the same.
@@ -72,7 +78,9 @@ define i8* @test_simplify6() {
 ; CHECK-LABEL: @test_simplify6(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
 
-; CHECK: getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i1 false)
+; CHECK-NEXT: %ret = call i8* @__strcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i32 %len)
+; CHECK-NEXT: ret i8* %ret
   %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
   %ret = call i8* @__strcpy_chk(i8* %dst, i8* %dst, i32 %len)
   ret i8* %ret
@@ -80,14 +88,15 @@ define i8* @test_simplify6() {
 
 ; Check case where slen < strlen (src).
 
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
 ; CHECK-LABEL: @test_no_simplify1(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
 
-; CHECK-NEXT: call i8* @__strcpy_chk
-  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8)
-  ret void
+; CHECK-NEXT: %ret = call i8* @__strcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0), i32 8)
+; CHECK-NEXT: ret i8* %ret
+  %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8)
+  ret i8* %ret
 }
 
 declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
diff --git a/test/Transforms/InstCombine/strncpy_chk-1.ll b/test/Transforms/InstCombine/strncpy_chk-1.ll
index 90b4173..9242a8a 100644
--- a/test/Transforms/InstCombine/strncpy_chk-1.ll
+++ b/test/Transforms/InstCombine/strncpy_chk-1.ll
@@ -11,56 +11,61 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 
 ; Check cases where dstlen >= len
 
-define void @test_simplify1() {
+define i8* @test_simplify1() {
 ; CHECK-LABEL: @test_simplify1(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
-  call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
-  ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+  %ret = call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
+  ret i8* %ret
 }
 
-define void @test_simplify2() {
+define i8* @test_simplify2() {
 ; CHECK-LABEL: @test_simplify2(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
-  call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 12)
-  ret void
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 12, i32 1, i1 false)
+; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+  %ret = call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 12)
+  ret i8* %ret
 }
 
-define void @test_simplify3() {
+define i8* @test_simplify3() {
 ; CHECK-LABEL: @test_simplify3(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
 
-; CHECK-NEXT: call i8* @strncpy
-  call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
-  ret void
+; CHECK-NEXT: %strncpy = call i8* @strncpy(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0), i32 12)
+; CHECK-NEXT: ret i8* %strncpy
+  %ret = call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
+  ret i8* %ret
 }
 
 ; Check cases where dstlen < len
 
-define void @test_no_simplify1() {
+define i8* @test_no_simplify1() {
 ; CHECK-LABEL: @test_no_simplify1(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK-NEXT: call i8* @__strncpy_chk
-  call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 4)
-  ret void
+; CHECK-NEXT: %ret = call i8* @__strncpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), i32 8, i32 4)
+; CHECK-NEXT: ret i8* %ret
+  %ret = call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 4)
+  ret i8* %ret
 }
 
-define void @test_no_simplify2() {
+define i8* @test_no_simplify2() {
 ; CHECK-LABEL: @test_no_simplify2(
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
 
-; CHECK-NEXT: call i8* @__strncpy_chk
-  call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 0)
-  ret void
+; CHECK-NEXT: %ret = call i8* @__strncpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8]* @b, i32 0, i32 0), i32 8, i32 0)
+; CHECK-NEXT: ret i8* %ret
+  %ret = call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 0)
+  ret i8* %ret
 }
 
 declare i8* @__strncpy_chk(i8*, i8*, i32, i32)
diff --git a/test/Transforms/InstCombine/struct-assign-tbaa.ll b/test/Transforms/InstCombine/struct-assign-tbaa.ll
index c80e31a..e949640 100644
--- a/test/Transforms/InstCombine/struct-assign-tbaa.ll
+++ b/test/Transforms/InstCombine/struct-assign-tbaa.ll
@@ -10,8 +10,8 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32,
 %struct.test1 = type { float }
 
 ; CHECK: @test
-; CHECK: %2 = load float* %0, align 4, !tbaa !0
-; CHECK: store float %2, float* %1, align 4, !tbaa !0
+; CHECK: %[[LOAD:.*]] = load i32* %{{.*}}, align 4, !tbaa !0
+; CHECK: store i32 %[[LOAD:.*]], i32* %{{.*}}, align 4, !tbaa !0
 ; CHECK: ret
 define void @test1(%struct.test1* nocapture %a, %struct.test1* nocapture %b) {
 entry:
@@ -35,12 +35,12 @@ define i32 (i8*, i32*, double*)*** @test2() {
   ret i32 (i8*, i32*, double*)*** %tmp2
 }
 
-; CHECK: !0 = metadata !{metadata !1, metadata !1, i64 0}
-; CHECK: !1 = metadata !{metadata !"float", metadata !2}
+; CHECK: !0 = !{!1, !1, i64 0}
+; CHECK: !1 = !{!"float", !2}
 
-!0 = metadata !{metadata !"Simple C/C++ TBAA"}
-!1 = metadata !{metadata !"omnipotent char", metadata !0}
-!2 = metadata !{metadata !5, metadata !5, i64 0}
-!3 = metadata !{i64 0, i64 4, metadata !2}
-!4 = metadata !{i64 0, i64 8, null}
-!5 = metadata !{metadata !"float", metadata !0}
+!0 = !{!"Simple C/C++ TBAA"}
+!1 = !{!"omnipotent char", !0}
+!2 = !{!5, !5, i64 0}
+!3 = !{i64 0, i64 4, !2}
+!4 = !{i64 0, i64 8, null}
+!5 = !{!"float", !0}
diff --git a/test/Transforms/InstCombine/type_pun.ll b/test/Transforms/InstCombine/type_pun.ll
new file mode 100644
index 0000000..33143ef
--- /dev/null
+++ b/test/Transforms/InstCombine/type_pun.ll
@@ -0,0 +1,137 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Ensure that type punning using a union of vector and same-sized array
+; generates an extract instead of a shuffle with an uncommon vector size:
+;
+;   typedef uint32_t v4i32 __attribute__((vector_size(16)));
+;   union { v4i32 v; uint32_t a[4]; };
+;
+; This cleans up behind SROA, which inserts the uncommon vector size when
+; cleaning up the alloca/store/GEP/load.
+
+
+; Extracting the zeroth element in an i32 array.
+define i32 @type_pun_zeroth(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_zeroth(
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0
+; CHECK-NEXT: ret i32 %[[EXT]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <4 x i8> %sroa to i32
+  ret i32 %1
+}
+
+; Extracting the first element in an i32 array.
+define i32 @type_pun_first(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_first(
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 1
+; CHECK-NEXT: ret i32 %[[EXT]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <4 x i8> %sroa to i32
+  ret i32 %1
+}
+
+; Extracting an i32 that isn't aligned to any natural boundary.
+define i32 @type_pun_misaligned(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_misaligned(
+; CHECK-NEXT: %[[SHUF:.*]] = shufflevector <16 x i8> %in, <16 x i8> undef, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %[[SHUF]] to <4 x i32>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0
+; CHECK-NEXT: ret i32 %[[EXT]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+  %1 = bitcast <4 x i8> %sroa to i32
+  ret i32 %1
+}
+
+; Type punning to an array of pointers.
+define i32* @type_pun_pointer(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_pointer(
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0
+; CHECK-NEXT: %[[I2P:.*]] = inttoptr i32 %[[EXT]] to i32*
+; CHECK-NEXT: ret i32* %[[I2P]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <4 x i8> %sroa to i32
+  %2 = inttoptr i32 %1 to i32*
+  ret i32* %2
+}
+
+; Type punning to an array of 32-bit floating-point values.
+define float @type_pun_float(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_float(
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x float>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x float> %[[BC]], i32 0
+; CHECK-NEXT: ret float %[[EXT]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <4 x i8> %sroa to float
+  ret float %1
+}
+
+; Type punning to an array of 64-bit floating-point values.
+define double @type_pun_double(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_double(
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <2 x double>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <2 x double> %[[BC]], i32 0
+; CHECK-NEXT: ret double %[[EXT]]
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <8 x i8> %sroa to double
+  ret double %1
+}
+
+; Type punning to same-size floating-point and integer values.
+; Verify that multiple uses with different bitcast types are properly handled.
+define { float, i32 } @type_pun_float_i32(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_float_i32(
+; CHECK-NEXT: %[[BCI:.*]] = bitcast <16 x i8> %in to <4 x i32>
+; CHECK-NEXT: %[[EXTI:.*]] = extractelement <4 x i32> %[[BCI]], i32 0
+; CHECK-NEXT: %[[BCF:.*]] = bitcast <16 x i8> %in to <4 x float>
+; CHECK-NEXT: %[[EXTF:.*]] = extractelement <4 x float> %[[BCF]], i32 0
+; CHECK-NEXT: %1 = insertvalue { float, i32 } undef, float %[[EXTF]], 0
+; CHECK-NEXT: %2 = insertvalue { float, i32 } %1, i32 %[[EXTI]], 1
+; CHECK-NEXT: ret { float, i32 } %2
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %f = bitcast <4 x i8> %sroa to float
+  %i = bitcast <4 x i8> %sroa to i32
+  %1 = insertvalue { float, i32 } undef, float %f, 0
+  %2 = insertvalue { float, i32 } %1, i32 %i, 1
+  ret { float, i32 } %2
+}
+
+; Type punning two i32 values, with control flow.
+; Verify that the bitcast is shared and dominates usage.
+define i32 @type_pun_i32_ctrl(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_i32_ctrl(
+entry: ; CHECK-NEXT: entry:
+; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32>
+; CHECK-NEXT: br
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  br i1 undef, label %left, label %right
+left: ; CHECK: left:
+; CHECK-NEXT: %[[EXTL:.*]] = extractelement <4 x i32> %[[BC]], i32 0
+; CHECK-NEXT: br
+  %lhs = bitcast <4 x i8> %sroa to i32
+  br label %tail
+right: ; CHECK: right:
+; CHECK-NEXT: %[[EXTR:.*]] = extractelement <4 x i32> %[[BC]], i32 0
+; CHECK-NEXT: br
+  %rhs = bitcast <4 x i8> %sroa to i32
+  br label %tail
+tail: ; CHECK: tail:
+; CHECK-NEXT: %i = phi i32 [ %[[EXTL]], %left ], [ %[[EXTR]], %right ]
+; CHECK-NEXT: ret i32 %i
+  %i = phi i32 [ %lhs, %left ], [ %rhs, %right ]
+  ret i32 %i
+}
+
+; Extracting a type that won't fit in a vector isn't handled. The function
+; should stay the same.
+define i40 @type_pun_unhandled(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_unhandled(
+; CHECK-NEXT: %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT: %1 = bitcast <5 x i8> %sroa to i40
+; CHECK-NEXT: ret i40 %1
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8>
+  %1 = bitcast <5 x i8> %sroa to i40
+  ret i40 %1
+}
diff --git a/test/Transforms/InstCombine/unordered-fcmp-select.ll b/test/Transforms/InstCombine/unordered-fcmp-select.ll
new file mode 100644
index 0000000..0eb7290
--- /dev/null
+++ b/test/Transforms/InstCombine/unordered-fcmp-select.ll
@@ -0,0 +1,125 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; CHECK-LABEL: @select_max_ugt(
+; CHECK: %cmp.inv = fcmp ole float %a, %b
+; CHECK-NEXT: %sel = select i1 %cmp.inv, float %b, float %a
+; CHECK-NEXT: ret float %sel
+define float @select_max_ugt(float %a, float %b) {
+  %cmp = fcmp ugt float %a, %b
+  %sel = select i1 %cmp, float %a, float %b
+  ret float %sel
+}
+
+; CHECK-LABEL: @select_max_uge(
+; CHECK: %cmp.inv = fcmp olt float %a, %b
+; CHECK-NEXT: %sel = select i1 %cmp.inv, float %b, float %a
+; CHECK-NEXT: ret float %sel
+define float @select_max_uge(float %a, float %b) {
+  %cmp = fcmp uge float %a, %b
+  %sel = select i1 %cmp, float %a, float %b
+  ret float %sel
+}
+
+; CHECK-LABEL: @select_min_ugt(
+; CHECK: %cmp.inv = fcmp ole float %a, %b
+; CHECK-NEXT: %sel = select i1 %cmp.inv, float %a, float %b
+; CHECK-NEXT: ret float %sel
+define float @select_min_ugt(float %a, float %b) {
+  %cmp = fcmp ugt float %a, %b
+  %sel = select i1 %cmp, float %b, float %a
+  ret float %sel
+}
+
+; CHECK-LABEL: @select_min_uge(
+; CHECK: %cmp.inv = fcmp olt float %a, %b
+; CHECK-NEXT: %sel = select i1 %cmp.inv, float %a, float %b
+; CHECK-NEXT: ret float %sel
+define float @select_min_uge(float %a, float %b) {
+  %cmp = fcmp uge float %a, %b
+  %sel = select i1 %cmp, float %b, float %a
+  ret float %sel
+}
+
+; CHECK-LABEL: @select_max_ult(
+; CHECK: %cmp.inv = fcmp oge float %a, %b
+; CHECK-NEXT: %sel = select i1 %cmp.inv, float %a, float %b
+; CHECK-NEXT: ret float %sel
+define float @select_max_ult(float %a, float %b) {
+  %cmp = fcmp ult float %a, %b
+  %sel = select i1 %cmp, float %b, float %a
+  ret float %sel
+}
+
+; CHECK-LABEL: @select_max_ule(
+; CHECK: %cmp.inv = fcmp ogt float %a, %b
+; CHECK-NEXT: %sel = select i1 %cmp.inv, float %a, float %b
+; CHECK: ret float %sel
+define float @select_max_ule(float %a, float %b) {
+  %cmp = fcmp ule float %a, %b
+  %sel = select i1 %cmp, float %b, float %a
+  ret float %sel
+}
+
+; CHECK-LABEL: @select_min_ult(
+; CHECK: %cmp.inv = fcmp oge float %a, %b
+; CHECK-NEXT: %sel = select i1 %cmp.inv, float %b, float %a
+; CHECK-NEXT: ret float %sel
+define float @select_min_ult(float %a, float %b) {
+  %cmp = fcmp ult float %a, %b
+  %sel = select i1 %cmp, float %a, float %b
+  ret float %sel
+}
+
+; CHECK-LABEL: @select_min_ule(
+; CHECK: %cmp.inv = fcmp ogt float %a, %b
+; CHECK-NEXT: %sel = select i1 %cmp.inv, float %b, float %a
+; CHECK-NEXT: ret float %sel
+define float @select_min_ule(float %a, float %b) {
+  %cmp = fcmp ule float %a, %b
+  %sel = select i1 %cmp, float %a, float %b
+  ret float %sel
+}
+
+; CHECK-LABEL: @select_fcmp_une(
+; CHECK:  %cmp.inv = fcmp oeq float %a, %b
+; CHECK-NEXT:  %sel = select i1 %cmp.inv, float %b, float %a
+; CHECK-NEXT: ret float %sel
+define float @select_fcmp_une(float %a, float %b) {
+  %cmp = fcmp une float %a, %b
+  %sel = select i1 %cmp, float %a, float %b
+  ret float %sel
+}
+
+; CHECK-LABEL: @select_fcmp_ueq
+; CHECK:  %cmp.inv = fcmp one float %a, %b
+; CHECK-NEXT:  %sel = select i1 %cmp.inv, float %b, float %a
+; CHECK-NEXT: ret float %sel
+define float @select_fcmp_ueq(float %a, float %b) {
+  %cmp = fcmp ueq float %a, %b
+  %sel = select i1 %cmp, float %a, float %b
+  ret float %sel
+}
+
+declare void @foo(i1)
+
+; CHECK-LABEL: @select_max_ugt_2_use_cmp(
+; CHECK: fcmp ugt
+; CHECK-NOT: fcmp
+; CHECK: ret
+define float @select_max_ugt_2_use_cmp(float %a, float %b) {
+  %cmp = fcmp ugt float %a, %b
+  call void @foo(i1 %cmp)
+  %sel = select i1 %cmp, float %a, float %b
+  ret float %sel
+}
+
+; CHECK-LABEL: @select_min_uge_2_use_cmp(
+; CHECK: fcmp uge
+; CHECK-NOT: fcmp
+; CHECK: ret
+define float @select_min_uge_2_use_cmp(float %a, float %b) {
+  %cmp = fcmp uge float %a, %b
+  call void @foo(i1 %cmp)
+  %sel = select i1 %cmp, float %b, float %a
+  ret float %sel
+}
diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index 41d2b29..00a029a 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -303,6 +303,33 @@ define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
   ret <2 x i64> %2
 }
 
+; CHECK: define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> %i
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0)
+  ret <2 x i64> %1
+}
+
+; CHECK: define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16)
+  ret <2 x i64> %1
+}
+
+; CHECK: define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32)
+  ret <2 x i64> %1
+}
+
+; CHECK: define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> undef
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16)
+  ret <2 x i64> %1
+}
 
 ; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
 declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
diff --git a/test/Transforms/InstCombine/xor.ll b/test/Transforms/InstCombine/xor.ll
index 3722697..c8debcb 100644
--- a/test/Transforms/InstCombine/xor.ll
+++ b/test/Transforms/InstCombine/xor.ll
@@ -1,50 +1,70 @@
 ; This test makes sure that these instructions are properly eliminated.
 ;
 ; RUN: opt < %s -instcombine -S | \
-; RUN:    not grep "xor "
+; RUN:    FileCheck %s
 ; END.
 @G1 = global i32 0		; <i32*> [#uses=1]
 @G2 = global i32 0		; <i32*> [#uses=1]
 
 define i1 @test0(i1 %A) {
+; CHECK-LABEL: @test0(
+; CHECK-NEXT: ret i1 %A
 	%B = xor i1 %A, false		; <i1> [#uses=1]
 	ret i1 %B
 }
 
 define i32 @test1(i32 %A) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: ret i32 %A
 	%B = xor i32 %A, 0		; <i32> [#uses=1]
 	ret i32 %B
 }
 
 define i1 @test2(i1 %A) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: ret i1 false
 	%B = xor i1 %A, %A		; <i1> [#uses=1]
 	ret i1 %B
 }
 
 define i32 @test3(i32 %A) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT: ret i32 0
 	%B = xor i32 %A, %A		; <i32> [#uses=1]
 	ret i32 %B
 }
 
 define i32 @test4(i32 %A) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT: ret i32 -1
 	%NotA = xor i32 -1, %A		; <i32> [#uses=1]
 	%B = xor i32 %A, %NotA		; <i32> [#uses=1]
 	ret i32 %B
 }
 
 define i32 @test5(i32 %A) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT: %1 = and i32 %A, -124
+; CHECK-NEXT: ret i32 %1
 	%t1 = or i32 %A, 123		; <i32> [#uses=1]
 	%r = xor i32 %t1, 123		; <i32> [#uses=1]
 	ret i32 %r
 }
 
 define i8 @test6(i8 %A) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT: ret i8 %A
 	%B = xor i8 %A, 17		; <i8> [#uses=1]
 	%C = xor i8 %B, 17		; <i8> [#uses=1]
 	ret i8 %C
 }
 
 define i32 @test7(i32 %A, i32 %B) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT: %A1 = and i32 %A, 7
+; CHECK-NEXT: %B1 = and i32 %B, 128
+; CHECK-NEXT: %C11 = or i32 %A1, %B1
+; CHECK-NEXT: ret i32 %C11
 	%A1 = and i32 %A, 7		; <i32> [#uses=1]
 	%B1 = and i32 %B, 128		; <i32> [#uses=1]
 	%C1 = xor i32 %A1, %B1		; <i32> [#uses=1]
@@ -52,6 +72,8 @@ define i32 @test7(i32 %A, i32 %B) {
 }
 
 define i8 @test8(i1 %c) {
+; CHECK-LABEL: @test8(
+; CHECK: br i1 %c, label %False, label %True
 	%d = xor i1 %c, true		; <i1> [#uses=1]
 	br i1 %d, label %True, label %False
 
@@ -63,30 +85,47 @@ False:		; preds = %0
 }
 
 define i1 @test9(i8 %A) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT: %C = icmp eq i8 %A, 89
+; CHECK-NEXT: ret i1 %C
 	%B = xor i8 %A, 123		; <i8> [#uses=1]
 	%C = icmp eq i8 %B, 34		; <i1> [#uses=1]
 	ret i1 %C
 }
 
 define i8 @test10(i8 %A) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT: %B = and i8 %A, 3
+; CHECK-NEXT: %C1 = or i8 %B, 4
+; CHECK-NEXT: ret i8 %C1
 	%B = and i8 %A, 3		; <i8> [#uses=1]
 	%C = xor i8 %B, 4		; <i8> [#uses=1]
 	ret i8 %C
 }
 
 define i8 @test11(i8 %A) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT: %B = and i8 %A, -13
+; CHECK-NEXT: %1 = or i8 %B, 8
+; CHECK-NEXT: ret i8 %1
 	%B = or i8 %A, 12		; <i8> [#uses=1]
 	%C = xor i8 %B, 4		; <i8> [#uses=1]
 	ret i8 %C
 }
 
 define i1 @test12(i8 %A) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT: %c = icmp ne i8 %A, 4
+; CHECK-NEXT: ret i1 %c
 	%B = xor i8 %A, 4		; <i8> [#uses=1]
 	%c = icmp ne i8 %B, 0		; <i1> [#uses=1]
 	ret i1 %c
 }
 
 define i1 @test13(i8 %A, i8 %B) {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT: %1 = icmp ne i8 %A, %B
+; CHECK-NEXT: ret i1 %1
 	%C = icmp ult i8 %A, %B		; <i1> [#uses=1]
 	%D = icmp ugt i8 %A, %B		; <i1> [#uses=1]
 	%E = xor i1 %C, %D		; <i1> [#uses=1]
@@ -94,6 +133,8 @@ define i1 @test13(i8 %A, i8 %B) {
 }
 
 define i1 @test14(i8 %A, i8 %B) {
+; CHECK-LABEL: @test14(
+; CHECK-NEXT: ret i1 true
 	%C = icmp eq i8 %A, %B		; <i1> [#uses=1]
 	%D = icmp ne i8 %B, %A		; <i1> [#uses=1]
 	%E = xor i1 %C, %D		; <i1> [#uses=1]
@@ -101,36 +142,54 @@ define i1 @test14(i8 %A, i8 %B) {
 }
 
 define i32 @test15(i32 %A) {
+; CHECK-LABEL: @test15(
+; CHECK-NEXT: %C = sub i32 0, %A
+; CHECK-NEXT: ret i32 %C
 	%B = add i32 %A, -1		; <i32> [#uses=1]
 	%C = xor i32 %B, -1		; <i32> [#uses=1]
 	ret i32 %C
 }
 
 define i32 @test16(i32 %A) {
+; CHECK-LABEL: @test16(
+; CHECK-NEXT: %C = sub i32 -124, %A
+; CHECK-NEXT: ret i32 %C
 	%B = add i32 %A, 123		; <i32> [#uses=1]
 	%C = xor i32 %B, -1		; <i32> [#uses=1]
 	ret i32 %C
 }
 
 define i32 @test17(i32 %A) {
+; CHECK-LABEL: @test17(
+; CHECK-NEXT: %C = add i32 %A, -124
+; CHECK-NEXT: ret i32 %C
 	%B = sub i32 123, %A		; <i32> [#uses=1]
 	%C = xor i32 %B, -1		; <i32> [#uses=1]
 	ret i32 %C
 }
 
 define i32 @test18(i32 %A) {
+; CHECK-LABEL: @test18(
+; CHECK-NEXT: %C = add i32 %A, 124
+; CHECK-NEXT: ret i32 %C
 	%B = xor i32 %A, -1		; <i32> [#uses=1]
 	%C = sub i32 123, %B		; <i32> [#uses=1]
 	ret i32 %C
 }
 
 define i32 @test19(i32 %A, i32 %B) {
+; CHECK-LABEL: @test19(
+; CHECK-NEXT: ret i32 %B
 	%C = xor i32 %A, %B		; <i32> [#uses=1]
 	%D = xor i32 %C, %A		; <i32> [#uses=1]
 	ret i32 %D
 }
 
 define void @test20(i32 %A, i32 %B) {
+; CHECK-LABEL: @test20(
+; CHECK-NEXT: store i32 %B, i32* @G1
+; CHECK-NEXT: store i32 %A, i32* @G2
+; CHECK-NEXT: ret void
 	%tmp.2 = xor i32 %B, %A		; <i32> [#uses=2]
 	%tmp.5 = xor i32 %tmp.2, %B		; <i32> [#uses=2]
 	%tmp.8 = xor i32 %tmp.5, %tmp.2		; <i32> [#uses=1]
@@ -140,12 +199,18 @@ define void @test20(i32 %A, i32 %B) {
 }
 
 define i32 @test21(i1 %C, i32 %A, i32 %B) {
+; CHECK-LABEL: @test21(
+; CHECK-NEXT: %D = select i1 %C, i32 %B, i32 %A
+; CHECK-NEXT: ret i32 %D
 	%C2 = xor i1 %C, true		; <i1> [#uses=1]
 	%D = select i1 %C2, i32 %A, i32 %B		; <i32> [#uses=1]
 	ret i32 %D
 }
 
 define i32 @test22(i1 %X) {
+; CHECK-LABEL: @test22(
+; CHECK-NEXT: %1 = zext i1 %X to i32
+; CHECK-NEXT: ret i32 %1
 	%Y = xor i1 %X, true		; <i1> [#uses=1]
 	%Z = zext i1 %Y to i32		; <i32> [#uses=1]
 	%Q = xor i32 %Z, 1		; <i32> [#uses=1]
@@ -153,18 +218,27 @@ define i32 @test22(i1 %X) {
 }
 
 define i1 @test23(i32 %a, i32 %b) {
+; CHECK-LABEL: @test23(
+; CHECK-NEXT: %tmp.4 = icmp eq i32 %b, 0
+; CHECK-NEXT: ret i1 %tmp.4
 	%tmp.2 = xor i32 %b, %a		; <i32> [#uses=1]
 	%tmp.4 = icmp eq i32 %tmp.2, %a		; <i1> [#uses=1]
 	ret i1 %tmp.4
 }
 
 define i1 @test24(i32 %c, i32 %d) {
+; CHECK-LABEL: @test24(
+; CHECK-NEXT: %tmp.4 = icmp ne i32 %d, 0
+; CHECK-NEXT: ret i1 %tmp.4
 	%tmp.2 = xor i32 %d, %c		; <i32> [#uses=1]
 	%tmp.4 = icmp ne i32 %tmp.2, %c		; <i1> [#uses=1]
 	ret i1 %tmp.4
 }
 
 define i32 @test25(i32 %g, i32 %h) {
+; CHECK-LABEL: @test25(
+; CHECK-NEXT: %tmp4 = and i32 %h, %g
+; CHECK-NEXT: ret i32 %tmp4
 	%h2 = xor i32 %h, -1		; <i32> [#uses=1]
 	%tmp2 = and i32 %h2, %g		; <i32> [#uses=1]
 	%tmp4 = xor i32 %tmp2, %g		; <i32> [#uses=1]
@@ -172,6 +246,9 @@ define i32 @test25(i32 %g, i32 %h) {
 }
 
 define i32 @test26(i32 %a, i32 %b) {
+; CHECK-LABEL: @test26(
+; CHECK-NEXT: %tmp4 = and i32 %a, %b
+; CHECK-NEXT: ret i32 %tmp4
 	%b2 = xor i32 %b, -1		; <i32> [#uses=1]
 	%tmp2 = xor i32 %a, %b2		; <i32> [#uses=1]
 	%tmp4 = and i32 %tmp2, %a		; <i32> [#uses=1]
@@ -179,6 +256,10 @@ define i32 @test26(i32 %a, i32 %b) {
 }
 
 define i32 @test27(i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @test27(
+; CHECK-NEXT: %tmp = icmp eq i32 %b, %c
+; CHECK-NEXT: %tmp6 = zext i1 %tmp to i32
+; CHECK-NEXT: ret i32 %tmp6
 	%tmp2 = xor i32 %d, %b		; <i32> [#uses=1]
 	%tmp5 = xor i32 %d, %c		; <i32> [#uses=1]
 	%tmp = icmp eq i32 %tmp2, %tmp5		; <i1> [#uses=1]
@@ -187,6 +268,9 @@ define i32 @test27(i32 %b, i32 %c, i32 %d) {
 }
 
 define i32 @test28(i32 %indvar) {
+; CHECK-LABEL: @test28(
+; CHECK-NEXT: %tmp214 = add i32 %indvar, 1
+; CHECK-NEXT: ret i32 %tmp214
 	%tmp7 = add i32 %indvar, -2147483647		; <i32> [#uses=1]
 	%tmp214 = xor i32 %tmp7, -2147483648		; <i32> [#uses=1]
 	ret i32 %tmp214
diff --git a/test/Transforms/InstMerge/st_sink_barrier_call.ll b/test/Transforms/InstMerge/st_sink_barrier_call.ll
new file mode 100644
index 0000000..c158b00
--- /dev/null
+++ b/test/Transforms/InstMerge/st_sink_barrier_call.ll
@@ -0,0 +1,43 @@
+; Test to make sure that a function call that needs to be a barrier to sinking stores is indeed a barrier.
+; Stores sunks into the footer.
+; RUN: opt -basicaa -memdep -mldst-motion -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+%struct.node = type { i32, %struct.node*, %struct.node*, %struct.node*, i32, i32, i32, i32 }
+
+declare i32 @foo(i32 %x)
+
+; Function Attrs: nounwind uwtable
+define void @sink_store(%struct.node* nocapture %r, i32 %index) {
+entry:
+  %node.0.in16 = getelementptr inbounds %struct.node* %r, i64 0, i32 2
+  %node.017 = load %struct.node** %node.0.in16, align 8
+  %index.addr = alloca i32, align 4
+  store i32 %index, i32* %index.addr, align 4
+  %0 = load i32* %index.addr, align 4
+  %cmp = icmp slt i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then
+if.then:                                          ; preds = %entry
+  %1 = load i32* %index.addr, align 4
+  %p1 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK: store i32
+  store i32 %1, i32* %p1, align 4
+  br label %if.end
+  
+; CHECK: if.else
+if.else:                                          ; preds = %entry
+  %2 = load i32* %index.addr, align 4
+  %add = add nsw i32 %2, 1
+  %p3 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK: store i32
+  store i32 %add, i32* %p3, align 4
+  call i32 @foo(i32 5)				  ;barrier
+  br label %if.end
+
+; CHECK: if.end
+if.end:                                           ; preds = %if.else, %if.then
+; CHECK-NOT: store
+  ret void
+}
diff --git a/test/Transforms/InstMerge/st_sink_bugfix_22613.ll b/test/Transforms/InstMerge/st_sink_bugfix_22613.ll
new file mode 100644
index 0000000..34e3fdb
--- /dev/null
+++ b/test/Transforms/InstMerge/st_sink_bugfix_22613.ll
@@ -0,0 +1,106 @@
+; ModuleID = 'bug.c'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+; CHECK_LABEL: main
+; CHECK: if.end
+; CHECK: store
+; CHECK: memset
+; CHECK: if.then
+; CHECK: store
+; CHECK: memset
+
+@d = common global i32 0, align 4
+@b = common global i32 0, align 4
+@f = common global [1 x [3 x i8]] zeroinitializer, align 1
+@e = common global i32 0, align 4
+@c = common global i32 0, align 4
+@a = common global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define void @fn1() {
+entry:
+  store i32 0, i32* @d, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc8, %entry
+  %0 = load i32* @d, align 4
+  %cmp = icmp slt i32 %0, 2
+  br i1 %cmp, label %for.body, label %for.end10
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* @d, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32* @b, align 4
+  %idxprom1 = sext i32 %2 to i64
+  %arrayidx = getelementptr inbounds [1 x [3 x i8]]* @f, i32 0, i64 %idxprom1
+  %arrayidx2 = getelementptr inbounds [3 x i8]* %arrayidx, i32 0, i64 %idxprom
+  store i8 0, i8* %arrayidx2, align 1
+  store i32 0, i32* @e, align 4
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %for.inc, %for.body
+  %3 = load i32* @e, align 4
+  %cmp4 = icmp slt i32 %3, 3
+  br i1 %cmp4, label %for.body5, label %for.end
+
+for.body5:                                        ; preds = %for.cond3
+  %4 = load i32* @c, align 4
+  %tobool = icmp ne i32 %4, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body5
+  %5 = load i32* @a, align 4
+  %dec = add nsw i32 %5, -1
+  store i32 %dec, i32* @a, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body5
+  %6 = load i32* @e, align 4
+  %idxprom6 = sext i32 %6 to i64
+  %arrayidx7 = getelementptr inbounds [3 x i8]* getelementptr inbounds ([1 x [3 x i8]]* @f, i32 0, i64 0), i32 0, i64 %idxprom6
+  store i8 1, i8* %arrayidx7, align 1
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %7 = load i32* @e, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* @e, align 4
+  br label %for.cond3
+
+for.end:                                          ; preds = %for.cond3
+  br label %for.inc8
+
+for.inc8:                                         ; preds = %for.end
+  %8 = load i32* @d, align 4
+  %inc9 = add nsw i32 %8, 1
+  store i32 %inc9, i32* @d, align 4
+  br label %for.cond
+
+for.end10:                                        ; preds = %for.cond
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  call void @fn1()
+  %0 = load i8* getelementptr inbounds ([1 x [3 x i8]]* @f, i32 0, i64 0, i64 1), align 1
+  %conv = sext i8 %0 to i32
+  %cmp = icmp ne i32 %conv, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @abort()
+  unreachable
+
+if.end:                                           ; preds = %entry
+  ret i32 0
+}
+
+; Function Attrs: noreturn nounwind
+declare void @abort()
diff --git a/test/Transforms/InstMerge/st_sink_no_barrier_call.ll b/test/Transforms/InstMerge/st_sink_no_barrier_call.ll
new file mode 100644
index 0000000..72f1fdf
--- /dev/null
+++ b/test/Transforms/InstMerge/st_sink_no_barrier_call.ll
@@ -0,0 +1,45 @@
+; Test to make sure that stores in a diamond get merged with a non barrier function call after the store instruction
+; Stores sunks into the footer.
+; RUN: opt -basicaa -memdep -mldst-motion -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+%struct.node = type { i32, %struct.node*, %struct.node*, %struct.node*, i32, i32, i32, i32 }
+
+declare i32 @foo(i32 %x) #0
+
+; Function Attrs: nounwind uwtable
+define void @sink_store(%struct.node* nocapture %r, i32 %index) {
+entry:
+  %node.0.in16 = getelementptr inbounds %struct.node* %r, i64 0, i32 2
+  %node.017 = load %struct.node** %node.0.in16, align 8
+  %index.addr = alloca i32, align 4
+  store i32 %index, i32* %index.addr, align 4
+  %0 = load i32* %index.addr, align 4
+  %cmp = icmp slt i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then
+if.then:                                          ; preds = %entry
+  %1 = load i32* %index.addr, align 4
+  %p1 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK-NOT: store i32
+  store i32 %1, i32* %p1, align 4
+  br label %if.end
+  
+; CHECK: if.else
+if.else:                                          ; preds = %entry
+  %2 = load i32* %index.addr, align 4
+  %add = add nsw i32 %2, 1
+  %p3 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK-NOT: store i32
+  store i32 %add, i32* %p3, align 4
+  call i32 @foo(i32 5)				  ;not a barrier
+  br label %if.end
+
+; CHECK: if.end
+if.end:                                           ; preds = %if.else, %if.then
+; CHECK: store
+  ret void
+}
+
+attributes #0 = { readnone } 
diff --git a/test/Transforms/InstMerge/st_sink_no_barrier_load.ll b/test/Transforms/InstMerge/st_sink_no_barrier_load.ll
new file mode 100644
index 0000000..5be0c25
--- /dev/null
+++ b/test/Transforms/InstMerge/st_sink_no_barrier_load.ll
@@ -0,0 +1,43 @@
+; Test to make sure that stores in a diamond get merged with a non barrier load after the store instruction
+; Stores sunks into the footer.
+; RUN: opt -basicaa -memdep -mldst-motion -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+%struct.node = type { i32, %struct.node*, %struct.node*, %struct.node*, i32, i32, i32, i32 }
+
+; Function Attrs: nounwind uwtable
+define void @sink_store(%struct.node* nocapture %r, i32 %index) {
+entry:
+  %node.0.in16 = getelementptr inbounds %struct.node* %r, i64 0, i32 2
+  %node.017 = load %struct.node** %node.0.in16, align 8
+  %index.addr = alloca i32, align 4
+  store i32 %index, i32* %index.addr, align 4
+  %0 = load i32* %index.addr, align 4
+  %cmp = icmp slt i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then
+if.then:                                          ; preds = %entry
+  %1 = load i32* %index.addr, align 4
+  %p1 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK-NOT: store i32
+  store i32 %1, i32* %p1, align 4
+  %p2 = getelementptr inbounds %struct.node* %node.017, i32 5, i32 6
+  ; CHECK: load i32*
+  %not_barrier = load i32 * %p2, align 4
+  br label %if.end
+
+; CHECK: if.else
+if.else:                                          ; preds = %entry
+  %2 = load i32* %index.addr, align 4
+  %add = add nsw i32 %2, 1
+  %p3 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK-NOT: store i32
+  store i32 %add, i32* %p3, align 4
+  br label %if.end
+
+; CHECK: if.end
+if.end:                                           ; preds = %if.else, %if.then
+; CHECK: store
+  ret void
+}
diff --git a/test/Transforms/InstMerge/st_sink_no_barrier_store.ll b/test/Transforms/InstMerge/st_sink_no_barrier_store.ll
new file mode 100644
index 0000000..06e2b63
--- /dev/null
+++ b/test/Transforms/InstMerge/st_sink_no_barrier_store.ll
@@ -0,0 +1,42 @@
+; Test to make sure that stores in a diamond get merged with a non barrier store after the store instruction to be sunk
+; Stores sunks into the footer.
+; RUN: opt -basicaa -memdep -mldst-motion -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+%struct.node = type { i32, %struct.node*, %struct.node*, %struct.node*, i32, i32, i32, i32 }
+
+; Function Attrs: nounwind uwtable
+define void @sink_store(%struct.node* nocapture %r, i32 %index) {
+entry:
+  %node.0.in16 = getelementptr inbounds %struct.node* %r, i64 0, i32 2
+  %node.017 = load %struct.node** %node.0.in16, align 8
+  %index.addr = alloca i32, align 4
+  store i32 %index, i32* %index.addr, align 4
+  %0 = load i32* %index.addr, align 4
+  %cmp = icmp slt i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then
+if.then:                                          ; preds = %entry
+  %1 = load i32* %index.addr, align 4
+  %p1 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK-NOT: store i32
+  store i32 %1, i32* %p1, align 4
+  br label %if.end
+
+; CHECK: if.else
+if.else:                                          ; preds = %entry
+  %2 = load i32* %index.addr, align 4
+  %add = add nsw i32 %2, 1
+  %p2 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  store i32 %add, i32* %p2, align 4
+  %p3 = getelementptr inbounds %struct.node* %node.017, i32 5, i32 6
+  ; CHECK: store i32
+  store i32 %add, i32* %p3, align 4  			  ; This is not a barrier
+  br label %if.end
+
+; CHECK: if.end
+if.end:                                           ; preds = %if.else, %if.then
+; CHECK: store
+  ret void
+}
diff --git a/test/Transforms/InstMerge/st_sink_two_stores.ll b/test/Transforms/InstMerge/st_sink_two_stores.ll
new file mode 100644
index 0000000..1f7c6aa
--- /dev/null
+++ b/test/Transforms/InstMerge/st_sink_two_stores.ll
@@ -0,0 +1,47 @@
+; Test to make sure that stores in a diamond get merged
+; Stores sunks into the footer.
+; RUN: opt -basicaa -memdep -mldst-motion -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+%struct.node = type { i32, %struct.node*, %struct.node*, %struct.node*, i32, i32, i32, i32 }
+
+; Function Attrs: nounwind uwtable
+define void @sink_store(%struct.node* nocapture %r, i32 %index) {
+entry:
+  %node.0.in16 = getelementptr inbounds %struct.node* %r, i64 0, i32 2
+  %node.017 = load %struct.node** %node.0.in16, align 8
+  %index.addr = alloca i32, align 4
+  store i32 %index, i32* %index.addr, align 4
+  %0 = load i32* %index.addr, align 4
+  %cmp = icmp slt i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then
+if.then:                                          ; preds = %entry
+  %1 = load i32* %index.addr, align 4
+  %p1 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK-NOT: store i32
+  store i32 %1, i32* %p1, align 4
+  %p2 = getelementptr inbounds %struct.node* %node.017, i32 4, i32 6
+  ; CHECK-NOT: store i32
+  store i32 %1, i32* %p2, align 4
+  br label %if.end
+
+; CHECK: if.else
+if.else:                                          ; preds = %entry
+  %2 = load i32* %index.addr, align 4
+  %add = add nsw i32 %2, 1
+  %p3 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK-NOT: store i32
+  store i32 %add, i32* %p3, align 4
+  %p4 = getelementptr inbounds %struct.node* %node.017, i32 4, i32 6
+  ; CHECK-NOT: store i32
+  store i32 %2, i32* %p4, align 4  
+  br label %if.end
+
+; CHECK: if.end
+if.end:                                           ; preds = %if.else, %if.then
+; CHECK: store
+; CHECK: store
+  ret void
+}
diff --git a/test/Transforms/InstMerge/st_sink_with_barrier.ll b/test/Transforms/InstMerge/st_sink_with_barrier.ll
new file mode 100644
index 0000000..d4efaa7
--- /dev/null
+++ b/test/Transforms/InstMerge/st_sink_with_barrier.ll
@@ -0,0 +1,42 @@
+; Test to make sure that load from the same address as a store and appears after the store prevents the store from being sunk
+; RUN: opt -basicaa -memdep -mldst-motion -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+%struct.node = type { i32, %struct.node*, %struct.node*, %struct.node*, i32, i32, i32, i32 }
+
+; Function Attrs: nounwind uwtable
+define void @sink_store(%struct.node* nocapture %r, i32 %index) {
+entry:
+  %node.0.in16 = getelementptr inbounds %struct.node* %r, i64 0, i32 2
+  %node.017 = load %struct.node** %node.0.in16, align 8
+  %index.addr = alloca i32, align 4
+  store i32 %index, i32* %index.addr, align 4
+  %0 = load i32* %index.addr, align 4
+  %cmp = icmp slt i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then
+if.then:                                          ; preds = %entry
+  %1 = load i32* %index.addr, align 4
+  %p1 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK: store i32
+  store i32 %1, i32* %p1, align 4
+  %p2 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK: load i32*
+  %barrier = load i32 * %p2, align 4
+  br label %if.end
+
+; CHECK: if.else
+if.else:                                          ; preds = %entry
+  %2 = load i32* %index.addr, align 4
+  %add = add nsw i32 %2, 1
+  %p3 = getelementptr inbounds %struct.node* %node.017, i32 0, i32 6
+  ; CHECK: store i32
+  store i32 %add, i32* %p3, align 4
+  br label %if.end
+
+; CHECK: if.end
+if.end:                                           ; preds = %if.else, %if.then
+; CHECK-NOT: store
+  ret void
+}
diff --git a/test/Transforms/InstSimplify/AndOrXor.ll b/test/Transforms/InstSimplify/AndOrXor.ll
index 8ed06e8..ce3c2aa 100644
--- a/test/Transforms/InstSimplify/AndOrXor.ll
+++ b/test/Transforms/InstSimplify/AndOrXor.ll
@@ -148,3 +148,58 @@ define i1 @or_of_icmps5(i32 %b) {
   ret i1 %cmp
 ; CHECK: ret i1 true
 }
+
+define i32 @neg_nuw(i32 %x) {
+; CHECK-LABEL: @neg_nuw(
+  %neg = sub nuw i32 0, %x
+  ret i32 %neg
+; CHECK: ret i32 0
+}
+
+define i1 @and_icmp1(i32 %x, i32 %y) {
+  %1 = icmp ult i32 %x, %y
+  %2 = icmp ne i32 %y, 0
+  %3 = and i1 %1, %2
+  ret i1 %3
+}
+; CHECK-LABEL: @and_icmp1(
+; CHECK: %[[cmp:.*]] = icmp ult i32 %x, %y
+; CHECK: ret i1 %[[cmp]]
+
+define i1 @and_icmp2(i32 %x, i32 %y) {
+  %1 = icmp ult i32 %x, %y
+  %2 = icmp eq i32 %y, 0
+  %3 = and i1 %1, %2
+  ret i1 %3
+}
+; CHECK-LABEL: @and_icmp2(
+; CHECK: ret i1 false
+
+define i1 @or_icmp1(i32 %x, i32 %y) {
+  %1 = icmp ult i32 %x, %y
+  %2 = icmp ne i32 %y, 0
+  %3 = or i1 %1, %2
+  ret i1 %3
+}
+; CHECK-LABEL: @or_icmp1(
+; CHECK: %[[cmp:.*]] = icmp ne i32 %y, 0
+; CHECK: ret i1 %[[cmp]]
+
+define i1 @or_icmp2(i32 %x, i32 %y) {
+  %1 = icmp uge i32 %x, %y
+  %2 = icmp ne i32 %y, 0
+  %3 = or i1 %1, %2
+  ret i1 %3
+}
+; CHECK-LABEL: @or_icmp2(
+; CHECK: ret i1 true
+
+define i1 @or_icmp3(i32 %x, i32 %y) {
+  %1 = icmp uge i32 %x, %y
+  %2 = icmp eq i32 %y, 0
+  %3 = or i1 %1, %2
+  ret i1 %3
+}
+; CHECK-LABEL: @or_icmp3(
+; CHECK: %[[cmp:.*]] = icmp uge i32 %x, %y
+; CHECK: ret i1 %[[cmp]]
diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll
index 38fd747..10c7ca6 100644
--- a/test/Transforms/InstSimplify/compare.ll
+++ b/test/Transforms/InstSimplify/compare.ll
@@ -1100,3 +1100,67 @@ define i1 @icmp_shl_1_V_ne_31(i32 %V) {
 ; CHECK-LABEL: @icmp_shl_1_V_ne_31(
 ; CHECK-NEXT: ret i1 true
 }
+
+define i1 @tautological1(i32 %A, i32 %B) {
+  %C = and i32 %A, %B
+  %D = icmp ugt i32 %C, %A
+  ret i1 %D
+; CHECK-LABEL: @tautological1(
+; CHECK: ret i1 false
+}
+
+define i1 @tautological2(i32 %A, i32 %B) {
+  %C = and i32 %A, %B
+  %D = icmp ule i32 %C, %A
+  ret i1 %D
+; CHECK-LABEL: @tautological2(
+; CHECK: ret i1 true
+}
+
+define i1 @tautological3(i32 %A, i32 %B) {
+  %C = or i32 %A, %B
+  %D = icmp ule i32 %A, %C
+  ret i1 %D
+; CHECK-LABEL: @tautological3(
+; CHECK: ret i1 true
+}
+
+define i1 @tautological4(i32 %A, i32 %B) {
+  %C = or i32 %A, %B
+  %D = icmp ugt i32 %A, %C
+  ret i1 %D
+; CHECK-LABEL: @tautological4(
+; CHECK: ret i1 false
+}
+
+define i1 @tautological5(i32 %A, i32 %B) {
+  %C = or i32 %A, %B
+  %D = icmp ult i32 %C, %A
+  ret i1 %D
+; CHECK-LABEL: @tautological5(
+; CHECK: ret i1 false
+}
+
+define i1 @tautological6(i32 %A, i32 %B) {
+  %C = or i32 %A, %B
+  %D = icmp uge i32 %C, %A
+  ret i1 %D
+; CHECK-LABEL: @tautological6(
+; CHECK: ret i1 true
+}
+
+define i1 @tautological7(i32 %A, i32 %B) {
+  %C = and i32 %A, %B
+  %D = icmp uge i32 %A, %C
+  ret i1 %D
+; CHECK-LABEL: @tautological7(
+; CHECK: ret i1 true
+}
+
+define i1 @tautological8(i32 %A, i32 %B) {
+  %C = and i32 %A, %B
+  %D = icmp ult i32 %A, %C
+  ret i1 %D
+; CHECK-LABEL: @tautological8(
+; CHECK: ret i1 false
+}
diff --git a/test/Transforms/InstSimplify/fast-math.ll b/test/Transforms/InstSimplify/fast-math.ll
index 71d1ed8..e7fb14d 100644
--- a/test/Transforms/InstSimplify/fast-math.ll
+++ b/test/Transforms/InstSimplify/fast-math.ll
@@ -105,3 +105,12 @@ define float @nofold_fadd_x_0(float %a) {
 ; CHECK: ret float %no_zero
   ret float %no_zero
 }
+
+; fdiv nsz nnan 0, X ==> 0
+define double @fdiv_zero_by_x(double %X) {
+; CHECK-LABEL: @fdiv_zero_by_x(
+; 0 / X -> 0
+  %r = fdiv nnan nsz double 0.0, %X
+  ret double %r
+; CHECK: ret double 0
+}
diff --git a/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
index 8177440..b0957a8 100644
--- a/test/Transforms/InstSimplify/floating-point-arithmetic.ll
+++ b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
@@ -33,3 +33,29 @@ define double @fmul_X_1(double %a) {
   ; CHECK: ret double %a
   ret double %b
 }
+
+; We can't optimize away the fadd in this test because the input
+; value to the function and subsequently to the fadd may be -0.0. 
+; In that one special case, the result of the fadd should be +0.0
+; rather than the first parameter of the fadd.
+
+; Fragile test warning: We need 6 sqrt calls to trigger the bug 
+; because the internal logic has a magic recursion limit of 6. 
+; This is presented without any explanation or ability to customize.
+
+declare float @sqrtf(float)
+
+define float @PR22688(float %x) {
+  %1 = call float @sqrtf(float %x)
+  %2 = call float @sqrtf(float %1)
+  %3 = call float @sqrtf(float %2)
+  %4 = call float @sqrtf(float %3)
+  %5 = call float @sqrtf(float %4)
+  %6 = call float @sqrtf(float %5)
+  %7 = fadd float %6, 0.0
+  ret float %7
+
+; CHECK-LABEL: @PR22688(
+; CHECK: fadd float %6, 0.0
+}
+
diff --git a/test/Transforms/InstSimplify/floating-point-compare.ll b/test/Transforms/InstSimplify/floating-point-compare.ll
new file mode 100644
index 0000000..af48d06
--- /dev/null
+++ b/test/Transforms/InstSimplify/floating-point-compare.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; These tests choose arbitrarily between float and double,
+; and between uge and olt, to give reasonble coverage 
+; without combinatorial explosion.
+
+declare float @llvm.fabs.f32(float)
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.powi.f64(double,i32)
+declare float @llvm.exp.f32(float)
+declare double @llvm.exp2.f64(double)
+declare float @llvm.fma.f32(float,float,float)
+
+declare void @expect_equal(i1,i1)
+
+; CHECK-LABEL: @orderedLessZeroTree(
+define i1 @orderedLessZeroTree(float,float,float,float) {
+  %square = fmul float %0, %0
+  %abs = call float @llvm.fabs.f32(float %1)
+  %sqrt = call float @llvm.sqrt.f32(float %2)
+  %fma = call float @llvm.fma.f32(float %3, float %3, float %sqrt)
+  %div = fdiv float %square, %abs
+  %rem = frem float %sqrt, %fma
+  %add = fadd float %div, %rem
+  %uge = fcmp uge float %add, 0.000000e+00
+; CHECK: ret i1 true
+  ret i1 %uge
+}
+
+; CHECK-LABEL: @orderedLessZeroExpExt(
+define i1 @orderedLessZeroExpExt(float) {
+  %a = call float @llvm.exp.f32(float %0)
+  %b = fpext float %a to double
+  %uge = fcmp uge double %b, 0.000000e+00
+; CHECK: ret i1 true
+  ret i1 %uge
+}
+
+; CHECK-LABEL: @orderedLessZeroExp2Trunc(
+define i1 @orderedLessZeroExp2Trunc(double) {
+  %a = call double @llvm.exp2.f64(double %0)
+  %b = fptrunc double %a to float
+  %olt = fcmp olt float %b, 0.000000e+00
+; CHECK: ret i1 false
+  ret i1 %olt
+}
+
+; CHECK-LABEL: @orderedLessZeroPowi(
+define i1 @orderedLessZeroPowi(double,double) {
+  ; Even constant exponent
+  %a = call double @llvm.powi.f64(double %0, i32 2)
+  %square = fmul double %1, %1
+  ; Odd constant exponent with provably non-negative base
+  %b = call double @llvm.powi.f64(double %square, i32 3)
+  %c = fadd double %a, %b
+  %olt = fcmp olt double %b, 0.000000e+00
+; CHECK: ret i1 false
+  ret i1 %olt
+}
+
diff --git a/test/Transforms/InstSimplify/load.ll b/test/Transforms/InstSimplify/load.ll
new file mode 100644
index 0000000..92953cd
--- /dev/null
+++ b/test/Transforms/InstSimplify/load.ll
@@ -0,0 +1,19 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+@zeroinit = constant {} zeroinitializer
+@undef = constant {} undef
+
+define i32 @crash_on_zeroinit() {
+; CHECK-LABEL: @crash_on_zeroinit
+; CHECK: ret i32 0
+  %load = load i32* bitcast ({}* @zeroinit to i32*)
+  ret i32 %load
+}
+
+define i32 @crash_on_undef() {
+; CHECK-LABEL: @crash_on_undef
+; CHECK: ret i32 undef
+  %load = load i32* bitcast ({}* @undef to i32*)
+  ret i32 %load
+}
+
diff --git a/test/Transforms/InstSimplify/noalias-ptr.ll b/test/Transforms/InstSimplify/noalias-ptr.ll
new file mode 100644
index 0000000..7693e55
--- /dev/null
+++ b/test/Transforms/InstSimplify/noalias-ptr.ll
@@ -0,0 +1,259 @@
+; RUN: opt -instsimplify -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@g1 = global i32 0, align 4
+@g2 = internal global i32 0, align 4
+@g3 = unnamed_addr global i32 0, align 4
+@g4 = hidden global i32 0, align 4
+@g5 = protected global i32 0, align 4
+@g6 = thread_local unnamed_addr global i32 0, align 4
+
+; Make sure we can simplify away a pointer comparison between
+; dynamically-allocated memory and a local stack allocation.
+;   void p()
+;   {
+;     int *mData;
+;     int mStackData[10];
+;     mData = new int[12];
+;     if (mData != mStackData) {
+;       delete[] mData;
+;     }
+;   }
+
+define void @_Z2p1v() #0 {
+  %mStackData = alloca [10 x i32], align 16
+  %1 = bitcast [10 x i32]* %mStackData to i8*
+  %2 = tail call noalias i8* @_Znam(i64 48) #4
+  %3 = bitcast i8* %2 to i32*
+  %4 = getelementptr inbounds [10 x i32]* %mStackData, i64 0, i64 0
+  %5 = icmp eq i32* %3, %4
+  br i1 %5, label %7, label %6
+
+; CHECK-LABEL: @_Z2p1v
+; CHECK-NOT: icmp
+; CHECK: ret void
+
+; <label>:6                                       ; preds = %0
+  call void @_ZdaPv(i8* %2) #5
+  br label %7
+
+; <label>:7                                       ; preds = %0, %6
+  ret void
+}
+
+; Also check a more-complicated case with multiple underlying objects.
+
+define void @_Z2p2bb(i1 zeroext %b1, i1 zeroext %b2) #0 {
+  %mStackData = alloca [10 x i32], align 16
+  %1 = bitcast [10 x i32]* %mStackData to i8*
+  %2 = getelementptr inbounds [10 x i32]* %mStackData, i64 0, i64 0
+  %3 = select i1 %b1, i32* %2, i32* @g2
+  %4 = tail call noalias i8* @_Znam(i64 48) #4
+  %5 = tail call noalias i8* @_Znam(i64 48) #4
+  %.v = select i1 %b2, i8* %4, i8* %5
+  %6 = bitcast i8* %.v to i32*
+  %7 = icmp eq i32* %6, %3
+  br i1 %7, label %9, label %8
+
+; CHECK-LABEL: @_Z2p2bb
+; CHECK-NOT: icmp
+; CHECK: ret void
+
+; <label>:8                                       ; preds = %0
+  call void @_ZdaPv(i8* %4) #5
+  call void @_ZdaPv(i8* %5) #5
+  br label %9
+
+; <label>:9                                       ; preds = %0, %8
+  ret void
+}
+
+define void @_Z2p4bb(i1 zeroext %b1, i1 zeroext %b2) #0 {
+  %mStackData = alloca [10 x i32], align 16
+  %1 = bitcast [10 x i32]* %mStackData to i8*
+  %2 = getelementptr inbounds [10 x i32]* %mStackData, i64 0, i64 0
+  %3 = select i1 %b1, i32* %2, i32* @g3
+  %4 = tail call noalias i8* @_Znam(i64 48) #4
+  %5 = tail call noalias i8* @_Znam(i64 48) #4
+  %.v = select i1 %b2, i8* %4, i8* %5
+  %6 = bitcast i8* %.v to i32*
+  %7 = icmp eq i32* %6, %3
+  br i1 %7, label %9, label %8
+
+; CHECK-LABEL: @_Z2p4bb
+; CHECK-NOT: icmp
+; CHECK: ret void
+
+; <label>:8                                       ; preds = %0
+  call void @_ZdaPv(i8* %4) #5
+  call void @_ZdaPv(i8* %5) #5
+  br label %9
+
+; <label>:9                                       ; preds = %0, %8
+  ret void
+}
+
+define void @_Z2p5bb(i1 zeroext %b1, i1 zeroext %b2) #0 {
+  %mStackData = alloca [10 x i32], align 16
+  %1 = bitcast [10 x i32]* %mStackData to i8*
+  %2 = getelementptr inbounds [10 x i32]* %mStackData, i64 0, i64 0
+  %3 = select i1 %b1, i32* %2, i32* @g4
+  %4 = tail call noalias i8* @_Znam(i64 48) #4
+  %5 = tail call noalias i8* @_Znam(i64 48) #4
+  %.v = select i1 %b2, i8* %4, i8* %5
+  %6 = bitcast i8* %.v to i32*
+  %7 = icmp eq i32* %6, %3
+  br i1 %7, label %9, label %8
+
+; CHECK-LABEL: @_Z2p5bb
+; CHECK-NOT: icmp
+; CHECK: ret void
+
+; <label>:8                                       ; preds = %0
+  call void @_ZdaPv(i8* %4) #5
+  call void @_ZdaPv(i8* %5) #5
+  br label %9
+
+; <label>:9                                       ; preds = %0, %8
+  ret void
+}
+
+define void @_Z2p6bb(i1 zeroext %b1, i1 zeroext %b2) #0 {
+  %mStackData = alloca [10 x i32], align 16
+  %1 = bitcast [10 x i32]* %mStackData to i8*
+  %2 = getelementptr inbounds [10 x i32]* %mStackData, i64 0, i64 0
+  %3 = select i1 %b1, i32* %2, i32* @g5
+  %4 = tail call noalias i8* @_Znam(i64 48) #4
+  %5 = tail call noalias i8* @_Znam(i64 48) #4
+  %.v = select i1 %b2, i8* %4, i8* %5
+  %6 = bitcast i8* %.v to i32*
+  %7 = icmp eq i32* %6, %3
+  br i1 %7, label %9, label %8
+
+; CHECK-LABEL: @_Z2p6bb
+; CHECK-NOT: icmp
+; CHECK: ret void
+
+; <label>:8                                       ; preds = %0
+  call void @_ZdaPv(i8* %4) #5
+  call void @_ZdaPv(i8* %5) #5
+  br label %9
+
+; <label>:9                                       ; preds = %0, %8
+  ret void
+}
+
+; Here's another case involving multiple underlying objects, but this time we
+; must keep the comparison (it might involve a regular pointer-typed function
+; argument).
+
+define void @_Z4nopebbPi(i1 zeroext %b1, i1 zeroext %b2, i32* readnone %q) #0 {
+  %mStackData = alloca [10 x i32], align 16
+  %1 = bitcast [10 x i32]* %mStackData to i8*
+  %2 = getelementptr inbounds [10 x i32]* %mStackData, i64 0, i64 0
+  %3 = select i1 %b1, i32* %2, i32* %q
+  %4 = tail call noalias i8* @_Znam(i64 48) #4
+  %5 = tail call noalias i8* @_Znam(i64 48) #4
+  %.v = select i1 %b2, i8* %4, i8* %5
+  %6 = bitcast i8* %.v to i32*
+  %7 = icmp eq i32* %6, %3
+  br i1 %7, label %9, label %8
+
+; CHECK-LABEL: @_Z4nopebbPi
+; CHECK: icmp
+; CHECK: ret void
+
+; <label>:8                                       ; preds = %0
+  call void @_ZdaPv(i8* %4) #5
+  call void @_ZdaPv(i8* %5) #5
+  br label %9
+
+; <label>:9                                       ; preds = %0, %8
+  ret void
+}
+
+define void @_Z2p3bb(i1 zeroext %b1, i1 zeroext %b2) #0 {
+  %mStackData = alloca [10 x i32], align 16
+  %1 = bitcast [10 x i32]* %mStackData to i8*
+  %2 = getelementptr inbounds [10 x i32]* %mStackData, i64 0, i64 0
+  %3 = select i1 %b1, i32* %2, i32* @g1
+  %4 = tail call noalias i8* @_Znam(i64 48) #4
+  %5 = tail call noalias i8* @_Znam(i64 48) #4
+  %.v = select i1 %b2, i8* %4, i8* %5
+  %6 = bitcast i8* %.v to i32*
+  %7 = icmp eq i32* %6, %3
+  br i1 %7, label %9, label %8
+
+; CHECK-LABEL: @_Z2p3bb
+; CHECK: icmp
+; CHECK: ret void
+
+; <label>:8                                       ; preds = %0
+  call void @_ZdaPv(i8* %4) #5
+  call void @_ZdaPv(i8* %5) #5
+  br label %9
+
+; <label>:9                                       ; preds = %0, %8
+  ret void
+}
+
+define void @_Z2p7bb(i1 zeroext %b1, i1 zeroext %b2) #0 {
+  %mStackData = alloca [10 x i32], align 16
+  %1 = bitcast [10 x i32]* %mStackData to i8*
+  %2 = getelementptr inbounds [10 x i32]* %mStackData, i64 0, i64 0
+  %3 = select i1 %b1, i32* %2, i32* @g6
+  %4 = tail call noalias i8* @_Znam(i64 48) #4
+  %5 = tail call noalias i8* @_Znam(i64 48) #4
+  %.v = select i1 %b2, i8* %4, i8* %5
+  %6 = bitcast i8* %.v to i32*
+  %7 = icmp eq i32* %6, %3
+  br i1 %7, label %9, label %8
+
+; CHECK-LABEL: @_Z2p7bb
+; CHECK: icmp
+; CHECK: ret void
+
+; <label>:8                                       ; preds = %0
+  call void @_ZdaPv(i8* %4) #5
+  call void @_ZdaPv(i8* %5) #5
+  br label %9
+
+; <label>:9                                       ; preds = %0, %8
+  ret void
+}
+
+define void @_Z2p2v(i32 %c) #0 {
+  %mStackData = alloca [10 x i32], i32 %c, align 16
+  %1 = bitcast [10 x i32]* %mStackData to i8*
+  %2 = tail call noalias i8* @_Znam(i64 48) #4
+  %3 = bitcast i8* %2 to i32*
+  %4 = getelementptr inbounds [10 x i32]* %mStackData, i64 0, i64 0
+  %5 = icmp eq i32* %3, %4
+  br i1 %5, label %7, label %6
+
+; CHECK-LABEL: @_Z2p2v
+; CHECK: icmp
+; CHECK: ret void
+
+; <label>:6                                       ; preds = %0
+  call void @_ZdaPv(i8* %2) #5
+  br label %7
+
+; <label>:7                                       ; preds = %0, %6
+  ret void
+}
+
+; Function Attrs: nobuiltin
+declare noalias i8* @_Znam(i64) #2
+
+; Function Attrs: nobuiltin nounwind
+declare void @_ZdaPv(i8*) #3
+
+attributes #0 = { uwtable }
+attributes #1 = { nounwind }
+attributes #2 = { nobuiltin }
+attributes #3 = { nobuiltin nounwind }
+attributes #4 = { builtin }
+attributes #5 = { builtin nounwind }
+
diff --git a/test/Transforms/InstSimplify/select.ll b/test/Transforms/InstSimplify/select.ll
new file mode 100644
index 0000000..1d45e57
--- /dev/null
+++ b/test/Transforms/InstSimplify/select.ll
@@ -0,0 +1,161 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define i32 @test1(i32 %x) {
+  %and = and i32 %x, 1
+  %cmp = icmp eq i32 %and, 0
+  %and1 = and i32 %x, -2
+  %and1.x = select i1 %cmp, i32 %and1, i32 %x
+  ret i32 %and1.x
+; CHECK-LABEL: @test1(
+; CHECK: ret i32 %x
+}
+
+define i32 @test2(i32 %x) {
+  %and = and i32 %x, 1
+  %cmp = icmp ne i32 %and, 0
+  %and1 = and i32 %x, -2
+  %and1.x = select i1 %cmp, i32 %x, i32 %and1
+  ret i32 %and1.x
+; CHECK-LABEL: @test2(
+; CHECK: ret i32 %x
+}
+
+define i32 @test3(i32 %x) {
+  %and = and i32 %x, 1
+  %cmp = icmp ne i32 %and, 0
+  %and1 = and i32 %x, -2
+  %and1.x = select i1 %cmp, i32 %and1, i32 %x
+  ret i32 %and1.x
+; CHECK-LABEL: @test3(
+; CHECK: %[[and:.*]] = and i32 %x, -2
+; CHECK: ret i32 %[[and]]
+}
+
+define i32 @test4(i32 %X) {
+  %cmp = icmp slt i32 %X, 0
+  %or = or i32 %X, -2147483648
+  %cond = select i1 %cmp, i32 %X, i32 %or
+  ret i32 %cond
+; CHECK-LABEL: @test4
+; CHECK: %[[or:.*]] = or i32 %X, -2147483648
+; CHECK: ret i32 %[[or]]
+}
+
+define i32 @test5(i32 %X) {
+  %cmp = icmp slt i32 %X, 0
+  %or = or i32 %X, -2147483648
+  %cond = select i1 %cmp, i32 %or, i32 %X
+  ret i32 %cond
+; CHECK-LABEL: @test5
+; CHECK: ret i32 %X
+}
+
+define i32 @test6(i32 %X) {
+  %cmp = icmp slt i32 %X, 0
+  %and = and i32 %X, 2147483647
+  %cond = select i1 %cmp, i32 %and, i32 %X
+  ret i32 %cond
+; CHECK-LABEL: @test6
+; CHECK: %[[and:.*]] = and i32 %X, 2147483647
+; CHECK: ret i32 %[[and]]
+}
+
+define i32 @test7(i32 %X) {
+  %cmp = icmp slt i32 %X, 0
+  %and = and i32 %X, 2147483647
+  %cond = select i1 %cmp, i32 %X, i32 %and
+  ret i32 %cond
+; CHECK-LABEL: @test7
+; CHECK: ret i32 %X
+}
+
+define i32 @test8(i32 %X) {
+  %cmp = icmp sgt i32 %X, -1
+  %or = or i32 %X, -2147483648
+  %cond = select i1 %cmp, i32 %X, i32 %or
+  ret i32 %cond
+; CHECK-LABEL: @test8
+; CHECK: ret i32 %X
+}
+
+define i32 @test9(i32 %X) {
+  %cmp = icmp sgt i32 %X, -1
+  %or = or i32 %X, -2147483648
+  %cond = select i1 %cmp, i32 %or, i32 %X
+  ret i32 %cond
+; CHECK-LABEL: @test9
+; CHECK: %[[or:.*]] = or i32 %X, -2147483648
+; CHECK: ret i32 %[[or]]
+}
+
+define i32 @test10(i32 %X) {
+  %cmp = icmp sgt i32 %X, -1
+  %and = and i32 %X, 2147483647
+  %cond = select i1 %cmp, i32 %and, i32 %X
+  ret i32 %cond
+; CHECK-LABEL: @test10
+; CHECK: ret i32 %X
+}
+
+define i32 @test11(i32 %X) {
+  %cmp = icmp sgt i32 %X, -1
+  %and = and i32 %X, 2147483647
+  %cond = select i1 %cmp, i32 %X, i32 %and
+  ret i32 %cond
+; CHECK-LABEL: @test11
+; CHECK: %[[and:.*]] = and i32 %X, 2147483647
+; CHECK: ret i32 %[[and]]
+}
+
+; CHECK-LABEL: @select_icmp_and_8_eq_0_or_8(
+; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 %x, 8
+; CHECK-NEXT: ret i32 [[OR]]
+define i32 @select_icmp_and_8_eq_0_or_8(i32 %x) {
+  %and = and i32 %x, 8
+  %cmp = icmp eq i32 %and, 0
+  %or = or i32 %x, 8
+  %or.x = select i1 %cmp, i32 %or, i32 %x
+  ret i32 %or.x
+}
+
+; CHECK-LABEL: @select_icmp_and_8_ne_0_and_not_8(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, -9
+; CHECK-NEXT: ret i32 [[AND]]
+define i32 @select_icmp_and_8_ne_0_and_not_8(i32 %x) {
+  %and = and i32 %x, 8
+  %cmp = icmp eq i32 %and, 0
+  %and1 = and i32 %x, -9
+  %x.and1 = select i1 %cmp, i32 %x, i32 %and1
+  ret i32 %x.and1
+}
+
+; CHECK-LABEL: @select_icmp_and_8_eq_0_and_not_8(
+; CHECK-NEXT: ret i32 %x
+define i32 @select_icmp_and_8_eq_0_and_not_8(i32 %x) {
+  %and = and i32 %x, 8
+  %cmp = icmp eq i32 %and, 0
+  %and1 = and i32 %x, -9
+  %and1.x = select i1 %cmp, i32 %and1, i32 %x
+  ret i32 %and1.x
+}
+
+; CHECK-LABEL: @select_icmp_x_and_8_eq_0_y_and_not_8(
+; CHECK: select i1 %cmp, i64 %y, i64 %and1
+define i64 @select_icmp_x_and_8_eq_0_y_and_not_8(i32 %x, i64 %y) {
+  %and = and i32 %x, 8
+  %cmp = icmp eq i32 %and, 0
+  %and1 = and i64 %y, -9
+  %y.and1 = select i1 %cmp, i64 %y, i64 %and1
+  ret i64 %y.and1
+}
+
+; CHECK-LABEL: @select_icmp_x_and_8_ne_0_y_and_not_8(
+; CHECK: select i1 %cmp, i64 %and1, i64 %y
+define i64 @select_icmp_x_and_8_ne_0_y_and_not_8(i32 %x, i64 %y) {
+  %and = and i32 %x, 8
+  %cmp = icmp eq i32 %and, 0
+  %and1 = and i64 %y, -9
+  %and1.y = select i1 %cmp, i64 %and1, i64 %y
+  ret i64 %and1.y
+}
+
diff --git a/test/Transforms/InstSimplify/undef.ll b/test/Transforms/InstSimplify/undef.ll
index 181c2ef..e8b49b6 100644
--- a/test/Transforms/InstSimplify/undef.ll
+++ b/test/Transforms/InstSimplify/undef.ll
@@ -160,3 +160,108 @@ define <4 x i8> @test19(<4 x i8> %a) {
   %b = shl <4 x i8> %a, <i8 8, i8 9, i8 undef, i8 -1>
   ret <4 x i8> %b
 }
+
+; CHECK-LABEL: @test20
+; CHECK: ret i32 undef
+define i32 @test20(i32 %a) {
+  %b = udiv i32 %a, 0
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test21
+; CHECK: ret i32 undef
+define i32 @test21(i32 %a) {
+  %b = sdiv i32 %a, 0
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test22
+; CHECK: ret i32 undef
+define i32 @test22(i32 %a) {
+  %b = ashr exact i32 undef, %a
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test23
+; CHECK: ret i32 undef
+define i32 @test23(i32 %a) {
+  %b = lshr exact i32 undef, %a
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test24
+; CHECK: ret i32 undef
+define i32 @test24() {
+  %b = udiv i32 undef, 0
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test25
+; CHECK: ret i32 undef
+define i32 @test25() {
+  %b = lshr i32 0, undef
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test26
+; CHECK: ret i32 undef
+define i32 @test26() {
+  %b = ashr i32 0, undef
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test27
+; CHECK: ret i32 undef
+define i32 @test27() {
+  %b = shl i32 0, undef
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test28
+; CHECK: ret i32 undef
+define i32 @test28(i32 %a) {
+  %b = shl nsw i32 undef, %a
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test29
+; CHECK: ret i32 undef
+define i32 @test29(i32 %a) {
+  %b = shl nuw i32 undef, %a
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test30
+; CHECK: ret i32 undef
+define i32 @test30(i32 %a) {
+  %b = shl nsw nuw i32 undef, %a
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test31
+; CHECK: ret i32 0
+define i32 @test31(i32 %a) {
+  %b = shl i32 undef, %a
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test32
+; CHECK: ret i32 undef
+define i32 @test32(i32 %a) {
+  %b = shl i32 undef, 0
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test33
+; CHECK: ret i32 undef
+define i32 @test33(i32 %a) {
+  %b = ashr i32 undef, 0
+  ret i32 %b
+}
+
+; CHECK-LABEL: @test34
+; CHECK: ret i32 undef
+define i32 @test34(i32 %a) {
+  %b = lshr i32 undef, 0
+  ret i32 %b
+}
diff --git a/test/Transforms/JumpThreading/conservative-lvi.ll b/test/Transforms/JumpThreading/conservative-lvi.ll
new file mode 100644
index 0000000..1ea8cdc
--- /dev/null
+++ b/test/Transforms/JumpThreading/conservative-lvi.ll
@@ -0,0 +1,58 @@
+; RUN: opt -jump-threading -S %s | FileCheck %s
+
+; Check that we thread arg2neg -> checkpos -> end.
+;
+; LazyValueInfo would previously fail to analyze the value of %arg in arg2neg
+; because its predecessing blocks (checkneg) hadn't been processed yet (PR21238)
+
+; CHECK-LABEL: @test_jump_threading
+; CHECK: arg2neg:
+; CHECK-NEXT: br i1 %arg1, label %end, label %checkpos.thread
+; CHECK: checkpos.thread:
+; CHECK-NEXT: br label %end
+
+define i32 @test_jump_threading(i1 %arg1, i32 %arg2) {
+checkneg:
+  %cmp = icmp slt i32 %arg2, 0
+  br i1 %cmp, label %arg2neg, label %checkpos
+
+arg2neg:
+  br i1 %arg1, label %end, label %checkpos
+
+checkpos:
+  %cmp2 = icmp sgt i32 %arg2, 0
+  br i1 %cmp2, label %arg2pos, label %end
+
+arg2pos:
+  br label %end
+
+end:
+  %0 = phi i32 [ 1, %arg2neg ], [ 2, %checkpos ], [ 3, %arg2pos ]
+  ret i32 %0
+}
+
+
+; arg2neg has an edge back to itself. If LazyValueInfo is not careful when
+; visiting predecessors, it could get into an infinite loop.
+
+; CHECK-LABEL: test_infinite_loop
+
+define i32 @test_infinite_loop(i1 %arg1, i32 %arg2) {
+checkneg:
+  %cmp = icmp slt i32 %arg2, 0
+  br i1 %cmp, label %arg2neg, label %checkpos
+
+arg2neg:
+  br i1 %arg1, label %arg2neg, label %checkpos
+
+checkpos:
+  %cmp2 = icmp sgt i32 %arg2, 0
+  br i1 %cmp2, label %arg2pos, label %end
+
+arg2pos:
+  br label %end
+
+end:
+  %0 = phi i32 [ 2, %checkpos ], [ 3, %arg2pos ]
+  ret i32 %0
+}
diff --git a/test/Transforms/JumpThreading/phi-eq.ll b/test/Transforms/JumpThreading/phi-eq.ll
index e05d5ee..3dd2c36 100644
--- a/test/Transforms/JumpThreading/phi-eq.ll
+++ b/test/Transforms/JumpThreading/phi-eq.ll
@@ -101,7 +101,7 @@ get_filter_list.exit6:                            ; preds = %sw.bb3.i4, %sw.bb2.
   %2 = load %struct._GList** %1, align 8
 ; We should have jump-threading insert an additional load here for the value
 ; coming out of the first switch, which is picked up by a subsequent phi
-; CHECK: {{%\.pr = load %[^%]* %0}}
+; CHECK: %.pr = load %struct._GList** %0
 ; CHECK-NEXT:  br label %while.cond
   br label %while.cond
 
diff --git a/test/Transforms/JumpThreading/pr22086.ll b/test/Transforms/JumpThreading/pr22086.ll
new file mode 100644
index 0000000..35d9aa5
--- /dev/null
+++ b/test/Transforms/JumpThreading/pr22086.ll
@@ -0,0 +1,28 @@
+; RUN: opt -S -jump-threading < %s | FileCheck %s
+
+
+; CHECK-LABEL: @f(
+; CHECK-LABEL: entry:
+; CHECK-NEXT:  br label %[[loop:.*]]
+; CHECK:       [[loop]]:
+; CHECK-NEXT:  br label %[[loop]]
+
+define void @f() {
+entry:
+  br label %for.cond1
+
+if.end16:
+  %phi1 = phi i32 [ undef, %for.cond1 ]
+  %g.3 = phi i32 [ %g.1, %for.cond1 ]
+  %sext = shl i32 %g.3, 16
+  %conv20 = ashr exact i32 %sext, 16
+  %tobool21 = icmp eq i32 %phi1, 0
+  br i1 %tobool21, label %lor.rhs, label %for.cond1
+
+for.cond1:
+  %g.1 = phi i32 [ 0, %entry ], [ 0, %lor.rhs ], [ %g.3, %if.end16 ]
+  br i1 undef, label %lor.rhs, label %if.end16
+
+lor.rhs:
+  br label %for.cond1
+}
diff --git a/test/Transforms/JumpThreading/thread-loads.ll b/test/Transforms/JumpThreading/thread-loads.ll
index b13b767..4351f99 100644
--- a/test/Transforms/JumpThreading/thread-loads.ll
+++ b/test/Transforms/JumpThreading/thread-loads.ll
@@ -106,7 +106,7 @@ return:
   ret i32 13
 }
 
-!0 = metadata !{metadata !3, metadata !3, i64 0}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"int", metadata !1}
+!0 = !{!3, !3, i64 0}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA", null}
+!3 = !{!"int", !1}
diff --git a/test/Transforms/LCSSA/indirectbr.ll b/test/Transforms/LCSSA/indirectbr.ll
index 9656448..345395b 100644
--- a/test/Transforms/LCSSA/indirectbr.ll
+++ b/test/Transforms/LCSSA/indirectbr.ll
@@ -1,11 +1,11 @@
-; RUN: opt < %s -lcssa -verify-loop-info -verify-dom-info -disable-output
-; PR5437
+; RUN: opt < %s -loop-simplify -lcssa -verify-loop-info -verify-dom-info -S | FileCheck %s
 
 ; LCSSA should work correctly in the case of an indirectbr that exits
 ; the loop, and the loop has exits with predecessors not within the loop
 ; (and btw these edges are unsplittable due to the indirectbr).
-
-define i32 @js_Interpret() nounwind {
+; PR5437
+define i32 @test0() nounwind {
+; CHECK-LABEL: @test0
 entry:
   br i1 undef, label %"4", label %"3"
 
@@ -540,3 +540,35 @@ entry:
 "1862":                                           ; preds = %"1836", %"692"
   unreachable
 }
+
+; An exit for Loop L1 may be the header of a disjoint Loop L2.  Thus, when we
+; create PHIs in one of such exits we are also inserting PHIs in L2 header. This
+; could break LCSSA form for L2 because these inserted PHIs can also have uses
+; in L2 exits. Test that we don't assert/crash on that.
+define void @test1() {
+; CHECK-LABEL: @test1
+  br label %lab1
+
+lab1:
+  %tmp21 = add i32 undef, 677038203
+  br i1 undef, label %lab2, label %exit
+
+lab2:
+  indirectbr i8* undef, [label %lab1, label %lab3]
+
+lab3:
+; CHECK: %tmp21.lcssa1 = phi i32 [ %tmp21.lcssa1, %lab4 ], [ %tmp21, %lab2 ]
+  %tmp12 = phi i32 [ %tmp21, %lab2 ], [ %tmp12, %lab4 ]
+  br i1 undef, label %lab5, label %lab4
+
+lab4:
+  br label %lab3
+
+lab5:
+; CHECK:  %tmp21.lcssa1.lcssa = phi i32 [ %tmp21.lcssa1, %lab3 ]
+  %tmp15 = add i32 %tmp12, undef
+  br label %exit
+
+exit:
+  ret void
+}
diff --git a/test/Transforms/LCSSA/unreachable-use.ll b/test/Transforms/LCSSA/unreachable-use.ll
index 71ae134..2ea7aeb 100644
--- a/test/Transforms/LCSSA/unreachable-use.ll
+++ b/test/Transforms/LCSSA/unreachable-use.ll
@@ -1,9 +1,11 @@
-; RUN: opt < %s -lcssa -S -verify-loop-info | grep "[%]tmp33 = load i1\*\* [%]tmp"
+; RUN: opt < %s -lcssa -S -verify-loop-info | FileCheck %s
 ; PR6546
 
 ; LCSSA doesn't need to transform uses in blocks not reachable
 ; from the entry block.
 
+; CHECK: %tmp33 = load i1** %tmp
+
 define fastcc void @dfs() nounwind {
 bb:
   br label %bb44
diff --git a/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll b/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll
index 7cf7a32..5587142 100644
--- a/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll
+++ b/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll
@@ -30,10 +30,10 @@ for.end:                                          ; preds = %for.inc
   ret void
 }
 
-!0 = metadata !{metadata !5, metadata !5, i64 0}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"short", metadata !1}
-!4 = metadata !{metadata !6, metadata !6, i64 0}
-!5 = metadata !{metadata !"any pointer", metadata !1}
-!6 = metadata !{metadata !"int", metadata !1}
+!0 = !{!5, !5, i64 0}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA", null}
+!3 = !{!"short", !1}
+!4 = !{!6, !6, i64 0}
+!5 = !{!"any pointer", !1}
+!6 = !{!"int", !1}
diff --git a/test/Transforms/LICM/constexpr.ll b/test/Transforms/LICM/constexpr.ll
new file mode 100644
index 0000000..f788787
--- /dev/null
+++ b/test/Transforms/LICM/constexpr.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -S -basicaa -licm | FileCheck %s
+; This fixes PR22460
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+@in = internal unnamed_addr global i32* null, align 8
+@out = internal unnamed_addr global i32* null, align 8
+
+; CHECK-LABEL: @bar
+; CHECK: entry:
+; CHECK: load i64* bitcast (i32** @in to i64*)
+; CHECK: do.body:
+; CHECK-NOT: load
+
+define i64 @bar(i32 %N) {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %l2, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %l2 ]
+  %total = phi i64 [ 0, %entry ], [ %next, %l2 ]
+  %c = icmp eq i32 %N, 6
+  br i1 %c, label %l1, label %do.body.l2_crit_edge
+
+do.body.l2_crit_edge:                             ; preds = %do.body
+  %inval.pre = load i32** @in, align 8
+  br label %l2
+
+l1:                                               ; preds = %do.body
+  %v1 = load i64* bitcast (i32** @in to i64*), align 8
+  store i64 %v1, i64* bitcast (i32** @out to i64*), align 8
+  %0 = inttoptr i64 %v1 to i32*
+  br label %l2
+
+l2:                                               ; preds = %do.body.l2_crit_edge, %l1
+  %inval = phi i32* [ %inval.pre, %do.body.l2_crit_edge ], [ %0, %l1 ]
+  %int = ptrtoint i32* %inval to i64
+  %next = add i64 %total, %int
+  %inc = add nsw i32 %i.0, 1
+  %cmp = icmp slt i32 %inc, %N
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %l2
+  ret i64 %total
+}
diff --git a/test/Transforms/LICM/debug-value.ll b/test/Transforms/LICM/debug-value.ll
index 0e0cd39..b49c559 100644
--- a/test/Transforms/LICM/debug-value.ll
+++ b/test/Transforms/LICM/debug-value.ll
@@ -15,7 +15,7 @@ if.then:                                          ; preds = %for.body
 
 if.then27:                                        ; preds = %if.then
 ; CHECK: tail call void @llvm.dbg.value
-  tail call void @llvm.dbg.value(metadata !18, i64 0, metadata !19, metadata !{}), !dbg !21
+  tail call void @llvm.dbg.value(metadata double undef, i64 0, metadata !19, metadata !{}), !dbg !21
   br label %for.body61.us
 
 if.end.if.end.split_crit_edge.critedge:           ; preds = %if.then
@@ -36,30 +36,30 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.module.flags = !{!26}
 !llvm.dbg.sp = !{!0, !6, !9, !10}
 
-!0 = metadata !{metadata !"0x2e\00idamax\00idamax\00\00112\000\001\000\006\00256\000\000", metadata !25, metadata !1, metadata !3, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !25} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 127169)\001\00\000\00\000", metadata !25, metadata !8, metadata !8, metadata !8, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !25, metadata !1, null, metadata !4, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x2e\00dscal\00dscal\00\00206\000\001\000\006\00256\000\000", metadata !25, metadata !1, metadata !7, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !25, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null}
-!9 = metadata !{metadata !"0x2e\00daxpy\00daxpy\00\00230\000\001\000\006\00256\000\000", metadata !25, metadata !1, metadata !7, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
-!10 = metadata !{metadata !"0x2e\00dgefa\00dgefa\00\00267\000\001\000\006\00256\000\000", metadata !25, metadata !1, metadata !7, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 267] [def] [scope 0] [dgefa]
-!11 = metadata !{i32 281, i32 9, metadata !12, null}
-!12 = metadata !{metadata !"0xb\00272\005\0032", metadata !25, metadata !13} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{metadata !"0xb\00271\005\0031", metadata !25, metadata !14} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{metadata !"0xb\00267\001\0030", metadata !25, metadata !10} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{i32 271, i32 5, metadata !14, null}
-!16 = metadata !{i32 284, i32 10, metadata !17, null}
-!17 = metadata !{metadata !"0xb\00282\009\0033", metadata !25, metadata !12} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{double undef}
-!19 = metadata !{metadata !"0x100\00temp\00268\000", metadata !14, metadata !1, metadata !20} ; [ DW_TAG_auto_variable ]
-!20 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
-!21 = metadata !{i32 286, i32 14, metadata !22, null}
-!22 = metadata !{metadata !"0xb\00285\0013\0034", metadata !25, metadata !17} ; [ DW_TAG_lexical_block ]
-!23 = metadata !{i32 296, i32 13, metadata !17, null}
-!24 = metadata !{i32 313, i32 1, metadata !14, null}
-!25 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/Benchmarks/CoyoteBench/lpbench.c", metadata !"/private/tmp"}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00idamax\00idamax\00\00112\000\001\000\006\00256\000\000", !25, !1, !3, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !25} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.9 (trunk 127169)\001\00\000\00\000", !25, !8, !8, !8, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !25, !1, null, !4, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0x2e\00dscal\00dscal\00\00206\000\001\000\006\00256\000\000", !25, !1, !7, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", !25, !1, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{null}
+!9 = !{!"0x2e\00daxpy\00daxpy\00\00230\000\001\000\006\00256\000\000", !25, !1, !7, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = !{!"0x2e\00dgefa\00dgefa\00\00267\000\001\000\006\00256\000\000", !25, !1, !7, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 267] [def] [scope 0] [dgefa]
+!11 = !MDLocation(line: 281, column: 9, scope: !12)
+!12 = !{!"0xb\00272\005\0032", !25, !13} ; [ DW_TAG_lexical_block ]
+!13 = !{!"0xb\00271\005\0031", !25, !14} ; [ DW_TAG_lexical_block ]
+!14 = !{!"0xb\00267\001\0030", !25, !10} ; [ DW_TAG_lexical_block ]
+!15 = !MDLocation(line: 271, column: 5, scope: !14)
+!16 = !MDLocation(line: 284, column: 10, scope: !17)
+!17 = !{!"0xb\00282\009\0033", !25, !12} ; [ DW_TAG_lexical_block ]
+!18 = !{double undef}
+!19 = !{!"0x100\00temp\00268\000", !14, !1, !20} ; [ DW_TAG_auto_variable ]
+!20 = !{!"0x24\00double\000\0064\0064\000\000\004", null, !2} ; [ DW_TAG_base_type ]
+!21 = !MDLocation(line: 286, column: 14, scope: !22)
+!22 = !{!"0xb\00285\0013\0034", !25, !17} ; [ DW_TAG_lexical_block ]
+!23 = !MDLocation(line: 296, column: 13, scope: !17)
+!24 = !MDLocation(line: 313, column: 1, scope: !14)
+!25 = !{!"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/Benchmarks/CoyoteBench/lpbench.c", !"/private/tmp"}
+!26 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/LICM/hoist-invariant-load.ll b/test/Transforms/LICM/hoist-invariant-load.ll
index 1ba94d6..59904ba 100644
--- a/test/Transforms/LICM/hoist-invariant-load.ll
+++ b/test/Transforms/LICM/hoist-invariant-load.ll
@@ -37,4 +37,4 @@ for.end:                                          ; preds = %for.cond
 
 declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
 
-!0 = metadata !{}
+!0 = !{}
diff --git a/test/Transforms/LICM/preheader-safe.ll b/test/Transforms/LICM/preheader-safe.ll
new file mode 100644
index 0000000..260a5f6
--- /dev/null
+++ b/test/Transforms/LICM/preheader-safe.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -licm < %s | FileCheck %s
+
+declare void @use_nothrow(i64 %a) nounwind
+declare void @use(i64 %a)
+
+define void @nothrow(i64 %x, i64 %y, i1* %cond) {
+; CHECK-LABEL: nothrow
+; CHECK-LABEL: entry
+; CHECK: %div = udiv i64 %x, %y
+; CHECK-LABEL: loop
+; CHECK: call void @use_nothrow(i64 %div)
+entry:
+  br label %loop
+
+loop:                                         ; preds = %entry, %for.inc
+  %div = udiv i64 %x, %y
+  call void @use_nothrow(i64 %div)
+  br label %loop
+}
+; Negative test
+define void @throw_header(i64 %x, i64 %y, i1* %cond) {
+; CHECK-LABEL: throw_header
+; CHECK-LABEL: loop
+; CHECK: %div = udiv i64 %x, %y
+; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+
+loop:                                         ; preds = %entry, %for.inc
+  %div = udiv i64 %x, %y
+  call void @use(i64 %div)
+  br label %loop
+}
+
+; The header is known no throw, but the loop is not.  We can
+; still lift out of the header.
+define void @nothrow_header(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: nothrow_header
+; CHECK-LABEL: entry
+; CHECK: %div = udiv i64 %x, %y
+; CHECK-LABEL: loop
+; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+loop:                                         ; preds = %entry, %for.inc
+  %div = udiv i64 %x, %y
+  br i1 %cond, label %loop-if, label %exit
+loop-if:
+  call void @use(i64 %div)
+  br label %loop
+exit:
+  ret void
+}
+; Negative test - can't move out of throwing block
+define void @nothrow_header_neg(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: nothrow_header_neg
+; CHECK-LABEL: entry
+; CHECK-LABEL: loop
+; CHECK: %div = udiv i64 %x, %y
+; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+loop:                                         ; preds = %entry, %for.inc
+  br label %loop-if
+loop-if:
+  %div = udiv i64 %x, %y
+  call void @use(i64 %div)
+  br label %loop
+}
diff --git a/test/Transforms/LICM/promote-order.ll b/test/Transforms/LICM/promote-order.ll
index 86f11fe..a189cf2 100644
--- a/test/Transforms/LICM/promote-order.ll
+++ b/test/Transforms/LICM/promote-order.ll
@@ -36,8 +36,8 @@ for.end:                                          ; preds = %for.cond.for.end_cr
   ret i32* %r.0.lcssa
 }
 
-!0 = metadata !{metadata !"minimal TBAA"}
-!1 = metadata !{metadata !3, metadata !3, i64 0}
-!2 = metadata !{metadata !4, metadata !4, i64 0}
-!3 = metadata !{metadata !"float", metadata !0}
-!4 = metadata !{metadata !"int", metadata !0}
+!0 = !{!"minimal TBAA"}
+!1 = !{!3, !3, i64 0}
+!2 = !{!4, !4, i64 0}
+!3 = !{!"float", !0}
+!4 = !{!"int", !0}
diff --git a/test/Transforms/LICM/scalar_promote.ll b/test/Transforms/LICM/scalar_promote.ll
index d7e7c6e..80afb3c 100644
--- a/test/Transforms/LICM/scalar_promote.ll
+++ b/test/Transforms/LICM/scalar_promote.ll
@@ -185,9 +185,9 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 ; CHECK-NEXT:  store i32 %[[LCSSAPHI]], i32* %gi, align 4, !tbaa !0
 }
 
-!0 = metadata !{metadata !4, metadata !4, i64 0}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !5, metadata !5, i64 0}
-!4 = metadata !{metadata !"int", metadata !1}
-!5 = metadata !{metadata !"float", metadata !1}
+!0 = !{!4, !4, i64 0}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!5, !5, i64 0}
+!4 = !{!"int", !1}
+!5 = !{!"float", !1}
diff --git a/test/Transforms/LICM/sinking.ll b/test/Transforms/LICM/sinking.ll
index ccc9186..d7a8fcd 100644
--- a/test/Transforms/LICM/sinking.ll
+++ b/test/Transforms/LICM/sinking.ll
@@ -314,6 +314,84 @@ exit:
   ret i32 %lcssa
 }
 
+; Can't sink stores out of exit blocks containing indirectbr instructions
+; because loop simplify does not create dedicated exits for such blocks. Test
+; that by sinking the store from lab21 to lab22, but not further.
+define void @test12() {
+; CHECK-LABEL: @test12
+  br label %lab4
+
+lab4:
+  br label %lab20
+
+lab5:
+  br label %lab20
+
+lab6:
+  br label %lab4
+
+lab7:
+  br i1 undef, label %lab8, label %lab13
+
+lab8:
+  br i1 undef, label %lab13, label %lab10
+
+lab10:
+  br label %lab7
+
+lab13:
+  ret void
+
+lab20:
+  br label %lab21
+
+lab21:
+; CHECK: lab21:
+; CHECK-NOT: store
+; CHECK: br i1 false, label %lab21, label %lab22
+  store i32 36127957, i32* undef, align 4
+  br i1 undef, label %lab21, label %lab22
+
+lab22:
+; CHECK: lab22:
+; CHECK: store
+; CHECK-NEXT: indirectbr i8* undef
+  indirectbr i8* undef, [label %lab5, label %lab6, label %lab7]
+}
+
+; Test that we don't crash when trying to sink stores and there's no preheader
+; available (which is used for creating loads that may be used by the SSA
+; updater)
+define void @test13() {
+; CHECK-LABEL: @test13
+  br label %lab59
+
+lab19:
+  br i1 undef, label %lab20, label %lab38
+
+lab20:
+  br label %lab60
+
+lab21:
+  br i1 undef, label %lab22, label %lab38
+
+lab22:
+  br label %lab38
+
+lab38:
+  ret void
+
+lab59:
+  indirectbr i8* undef, [label %lab60, label %lab38]
+
+lab60:
+; CHECK: lab60:
+; CHECK: store
+; CHECK-NEXT: indirectbr
+  store i32 2145244101, i32* undef, align 4
+  indirectbr i8* undef, [label %lab21, label %lab19]
+}
+
 declare void @f(i32*)
 
 declare void @g()
diff --git a/test/Transforms/LoopIdiom/debug-line.ll b/test/Transforms/LoopIdiom/debug-line.ll
index ea3c4de..863df3c 100644
--- a/test/Transforms/LoopIdiom/debug-line.ll
+++ b/test/Transforms/LoopIdiom/debug-line.ll
@@ -5,8 +5,8 @@ target triple = "x86_64-apple-darwin10.0.0"
 
 define void @foo(double* nocapture %a) nounwind ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{double* %a}, i64 0, metadata !5, metadata !{}), !dbg !8
-  tail call void @llvm.dbg.value(metadata !9, i64 0, metadata !10, metadata !{}), !dbg !14
+  tail call void @llvm.dbg.value(metadata double* %a, i64 0, metadata !5, metadata !{}), !dbg !8
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !10, metadata !{}), !dbg !14
   br label %for.body
 
 for.body:                                         ; preds = %entry, %for.body
@@ -30,23 +30,23 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.module.flags = !{!19}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\00256\000\000", metadata !18, metadata !1, metadata !3, null, void (double*)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!1 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 127165:127174)\001\00\000\00\000", metadata !18, metadata !9, metadata !9, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{metadata !"0x101\00a\0016777218\000", metadata !0, metadata !1, metadata !6} ; [ DW_TAG_arg_variable ]
-!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 2, i32 18, metadata !0, null}
-!9 = metadata !{i32 0}
-!10 = metadata !{metadata !"0x100\00i\003\000", metadata !11, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{metadata !"0xb\003\003\001", metadata !18, metadata !12} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{metadata !"0xb\002\0021\000", metadata !18, metadata !0} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 3, i32 3, metadata !12, null}
-!15 = metadata !{i32 4, i32 5, metadata !11, null}
-!16 = metadata !{i32 3, i32 29, metadata !11, null}
-!17 = metadata !{i32 5, i32 1, metadata !12, null}
-!18 = metadata !{metadata !"li.c", metadata !"/private/tmp"}
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00foo\00foo\00\002\000\001\000\006\00256\000\000", !18, !1, !3, null, void (double*)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!1 = !{!"0x29", !18} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.9 (trunk 127165:127174)\001\00\000\00\000", !18, !9, !9, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !18, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!5 = !{!"0x101\00a\0016777218\000", !0, !1, !6} ; [ DW_TAG_arg_variable ]
+!6 = !{!"0xf\00\000\0064\0064\000\000", null, !2, !7} ; [ DW_TAG_pointer_type ]
+!7 = !{!"0x24\00double\000\0064\0064\000\000\004", null, !2} ; [ DW_TAG_base_type ]
+!8 = !MDLocation(line: 2, column: 18, scope: !0)
+!9 = !{i32 0}
+!10 = !{!"0x100\00i\003\000", !11, !1, !13} ; [ DW_TAG_auto_variable ]
+!11 = !{!"0xb\003\003\001", !18, !12} ; [ DW_TAG_lexical_block ]
+!12 = !{!"0xb\002\0021\000", !18, !0} ; [ DW_TAG_lexical_block ]
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!14 = !MDLocation(line: 3, column: 3, scope: !12)
+!15 = !MDLocation(line: 4, column: 5, scope: !11)
+!16 = !MDLocation(line: 3, column: 29, scope: !11)
+!17 = !MDLocation(line: 5, column: 1, scope: !12)
+!18 = !{!"li.c", !"/private/tmp"}
+!19 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/LoopReroll/basic.ll b/test/Transforms/LoopReroll/basic.ll
index 3bd6d7a..7533461 100644
--- a/test/Transforms/LoopReroll/basic.ll
+++ b/test/Transforms/LoopReroll/basic.ll
@@ -322,6 +322,260 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
+; void multi1(int *x) {
+;   y = foo(0)
+;   for (int i = 0; i < 500; ++i) {
+;     x[3*i] = y;
+;     x[3*i+1] = y;
+;     x[3*i+2] = y;
+;     x[3*i+6] = y;
+;     x[3*i+7] = y;
+;     x[3*i+8] = y;
+;   }
+; }
+
+; Function Attrs: nounwind uwtable
+define void @multi1(i32* nocapture %x) #0 {
+entry:
+  %call = tail call i32 @foo(i32 0) #1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = mul nsw i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds i32* %x, i64 %0
+  store i32 %call, i32* %arrayidx, align 4
+  %1 = add nsw i64 %0, 1
+  %arrayidx4 = getelementptr inbounds i32* %x, i64 %1
+  store i32 %call, i32* %arrayidx4, align 4
+  %2 = add nsw i64 %0, 2
+  %arrayidx9 = getelementptr inbounds i32* %x, i64 %2
+  store i32 %call, i32* %arrayidx9, align 4
+  %3 = add nsw i64 %0, 6
+  %arrayidx6 = getelementptr inbounds i32* %x, i64 %3
+  store i32 %call, i32* %arrayidx6, align 4
+  %4 = add nsw i64 %0, 7
+  %arrayidx7 = getelementptr inbounds i32* %x, i64 %4
+  store i32 %call, i32* %arrayidx7, align 4
+  %5 = add nsw i64 %0, 8
+  %arrayidx8 = getelementptr inbounds i32* %x, i64 %5
+  store i32 %call, i32* %arrayidx8, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 500
+  br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK-LABEL: @multi1
+
+; CHECK:for.body:
+; CHECK:  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK:  %0 = add i64 %indvars.iv, 6
+; CHECK:  %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv
+; CHECK:  store i32 %call, i32* %arrayidx, align 4
+; CHECK:  %arrayidx6 = getelementptr inbounds i32* %x, i64 %0
+; CHECK:  store i32 %call, i32* %arrayidx6, align 4
+; CHECK:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK:  %exitcond2 = icmp eq i64 %0, 1505
+; CHECK:  br i1 %exitcond2, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; void multi2(int *x) {
+;   y = foo(0)
+;   for (int i = 0; i < 500; ++i) {
+;     x[3*i] = y;
+;     x[3*i+1] = y;
+;     x[3*i+2] = y;
+;     x[3*(i+1)] = y;
+;     x[3*(i+1)+1] = y;
+;     x[3*(i+1)+2] = y;
+;   }
+; }
+
+; Function Attrs: nounwind uwtable
+define void @multi2(i32* nocapture %x) #0 {
+entry:
+  %call = tail call i32 @foo(i32 0) #1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = mul nsw i64 %indvars.iv, 3
+  %add = add nsw i64 %indvars.iv, 1
+  %newmul = mul nsw i64 %add, 3
+  %arrayidx = getelementptr inbounds i32* %x, i64 %0
+  store i32 %call, i32* %arrayidx, align 4
+  %1 = add nsw i64 %0, 1
+  %arrayidx4 = getelementptr inbounds i32* %x, i64 %1
+  store i32 %call, i32* %arrayidx4, align 4
+  %2 = add nsw i64 %0, 2
+  %arrayidx9 = getelementptr inbounds i32* %x, i64 %2
+  store i32 %call, i32* %arrayidx9, align 4
+  %arrayidx6 = getelementptr inbounds i32* %x, i64 %newmul
+  store i32 %call, i32* %arrayidx6, align 4
+  %3 = add nsw i64 %newmul, 1
+  %arrayidx7 = getelementptr inbounds i32* %x, i64 %3
+  store i32 %call, i32* %arrayidx7, align 4
+  %4 = add nsw i64 %newmul, 2
+  %arrayidx8 = getelementptr inbounds i32* %x, i64 %4
+  store i32 %call, i32* %arrayidx8, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 500
+  br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK-LABEL: @multi2
+
+; CHECK:for.body:
+; CHECK:  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK:  %0 = add i64 %indvars.iv, 3
+; CHECK:  %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv
+; CHECK:  store i32 %call, i32* %arrayidx, align 4
+; CHECK:  %arrayidx6 = getelementptr inbounds i32* %x, i64 %0
+; CHECK:  store i32 %call, i32* %arrayidx6, align 4
+; CHECK:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK:  %exitcond2 = icmp eq i64 %indvars.iv, 1499
+; CHECK:  br i1 %exitcond2, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; void multi3(int *x) {
+;   y = foo(0)
+;   for (int i = 0; i < 500; ++i) {
+;     // Note: No zero index
+;     x[3*i+3] = y;
+;     x[3*i+4] = y;
+;     x[3*i+5] = y;
+;   }
+; }
+
+; Function Attrs: nounwind uwtable
+define void @multi3(i32* nocapture %x) #0 {
+entry:
+  %call = tail call i32 @foo(i32 0) #1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = mul nsw i64 %indvars.iv, 3
+  %x0 = add nsw i64 %0, 3
+  %add = add nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32* %x, i64 %x0
+  store i32 %call, i32* %arrayidx, align 4
+  %1 = add nsw i64 %0, 4
+  %arrayidx4 = getelementptr inbounds i32* %x, i64 %1
+  store i32 %call, i32* %arrayidx4, align 4
+  %2 = add nsw i64 %0, 5
+  %arrayidx9 = getelementptr inbounds i32* %x, i64 %2
+  store i32 %call, i32* %arrayidx9, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 500
+  br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK-LABEL: @multi3
+; CHECK: for.body:
+; CHECK:   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK:   %0 = add i64 %indvars.iv, 3
+; CHECK:   %arrayidx = getelementptr inbounds i32* %x, i64 %0
+; CHECK:   store i32 %call, i32* %arrayidx, align 4
+; CHECK:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK:   %exitcond1 = icmp eq i64 %0, 1502
+; CHECK:   br i1 %exitcond1, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; int foo(int a);
+; void bar2(int *x, int y, int z) {
+;   for (int i = 0; i < 500; i += 3) {
+;     foo(i+y+i*z); // Slightly reordered instruction order
+;     foo(i+1+y+(i+1)*z);
+;     foo(i+2+y+(i+2)*z);
+;   }
+; }
+
+; Function Attrs: nounwind uwtable
+define void @bar2(i32* nocapture readnone %x, i32 %y, i32 %z) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+
+  %tmp1 = add i32 %i.08, %y
+  %tmp2 = mul i32 %i.08, %z
+  %tmp3 = add i32 %tmp2, %tmp1
+  %call = tail call i32 @foo(i32 %tmp3) #1
+
+  %add = add nsw i32 %i.08, 1
+  %tmp2a = mul i32 %add, %z
+  %tmp1a = add i32 %add, %y
+  %tmp3a = add i32 %tmp2a, %tmp1a
+  %calla = tail call i32 @foo(i32 %tmp3a) #1
+  
+  %add2 = add nsw i32 %i.08, 2
+  %tmp2b = mul i32 %add2, %z
+  %tmp1b = add i32 %add2, %y
+  %tmp3b = add i32 %tmp2b, %tmp1b
+  %callb = tail call i32 @foo(i32 %tmp3b) #1
+
+  %add3 = add nsw i32 %i.08, 3
+
+  %exitcond = icmp eq i32 %add3, 500
+  br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK-LABEL: @bar2
+
+; CHECK: for.body:
+; CHECK: %indvar = phi i32 [ %indvar.next, %for.body ], [ 0, %entry ]
+; CHECK: %tmp1 = add i32 %indvar, %y
+; CHECK: %tmp2 = mul i32 %indvar, %z
+; CHECK: %tmp3 = add i32 %tmp2, %tmp1
+; CHECK: %call = tail call i32 @foo(i32 %tmp3) #1
+; CHECK: %indvar.next = add i32 %indvar, 1
+; CHECK: %exitcond1 = icmp eq i32 %indvar, 497
+; CHECK: br i1 %exitcond1, label %for.end, label %for.body
+
+; CHECK: ret
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+%struct.s = type { i32, i32 }
+
+; Function Attrs: nounwind uwtable
+define void @gep1(%struct.s* nocapture %x) #0 {
+entry:
+  %call = tail call i32 @foo(i32 0) #1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = mul nsw i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds %struct.s* %x, i64 %0, i32 0
+  store i32 %call, i32* %arrayidx, align 4
+  %1 = add nsw i64 %0, 1
+  %arrayidx4 = getelementptr inbounds %struct.s* %x, i64 %1, i32 0
+  store i32 %call, i32* %arrayidx4, align 4
+  %2 = add nsw i64 %0, 2
+  %arrayidx9 = getelementptr inbounds %struct.s* %x, i64 %2, i32 0
+  store i32 %call, i32* %arrayidx9, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 500
+  br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK-LABEL: @gep1
+; This test is a crash test only.
+; CHECK: ret
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
 attributes #0 = { nounwind uwtable }
 attributes #1 = { nounwind }
 
diff --git a/test/Transforms/LoopReroll/reduction.ll b/test/Transforms/LoopReroll/reduction.ll
index c9991c7..a4f168a 100644
--- a/test/Transforms/LoopReroll/reduction.ll
+++ b/test/Transforms/LoopReroll/reduction.ll
@@ -92,5 +92,41 @@ for.end:                                          ; preds = %for.body
   ret float %add12
 }
 
+define i32 @foo_unusedphi(i32* nocapture readonly %x) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.029 = phi i32 [ 0, %entry ], [ %add12, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %0
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds i32* %x, i64 %1
+  %2 = load i32* %arrayidx3, align 4
+  %add4 = add nsw i32 %add, %2
+  %3 = or i64 %indvars.iv, 2
+  %arrayidx7 = getelementptr inbounds i32* %x, i64 %3
+  %4 = load i32* %arrayidx7, align 4
+  %add8 = add nsw i32 %add4, %4
+  %5 = or i64 %indvars.iv, 3
+  %arrayidx11 = getelementptr inbounds i32* %x, i64 %5
+  %6 = load i32* %arrayidx11, align 4
+  %add12 = add nsw i32 %add8, %6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %7 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %7, 400
+  br i1 %cmp, label %for.body, label %for.end
+
+; CHECK-LABEL: @foo_unusedphi
+; The above is just testing for a crash - no specific output expected.
+
+; CHECK: ret
+
+for.end:                                          ; preds = %for.body
+  ret i32 %add12
+}
+
 attributes #0 = { nounwind readonly uwtable }
 
diff --git a/test/Transforms/LoopRotate/crash.ll b/test/Transforms/LoopRotate/crash.ll
index fd922cb..e95f9a1 100644
--- a/test/Transforms/LoopRotate/crash.ll
+++ b/test/Transforms/LoopRotate/crash.ll
@@ -153,3 +153,21 @@ entry:
 "5":                                              ; preds = %"3", %entry
   ret void
 }
+
+; PR21968
+define void @test8(i1 %C, i8* %P) #0 {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  br i1 %C, label %l_bad, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  indirectbr i8* %P, [label %for.inc, label %l_bad]
+
+for.inc:                                          ; preds = %for.body
+  br label %for.cond
+
+l_bad:                                            ; preds = %for.body, %for.cond
+  ret void
+}
diff --git a/test/Transforms/LoopRotate/dbgvalue.ll b/test/Transforms/LoopRotate/dbgvalue.ll
index 4da0776..846b366 100644
--- a/test/Transforms/LoopRotate/dbgvalue.ll
+++ b/test/Transforms/LoopRotate/dbgvalue.ll
@@ -6,7 +6,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 define i32 @tak(i32 %x, i32 %y, i32 %z) nounwind ssp {
 ; CHECK-LABEL: define i32 @tak(
 ; CHECK: entry
-; CHECK-NEXT: call void @llvm.dbg.value(metadata !{i32 %x}
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 %x
 
 entry:
   br label %tailrecurse
@@ -15,9 +15,9 @@ tailrecurse:                                      ; preds = %if.then, %entry
   %x.tr = phi i32 [ %x, %entry ], [ %call, %if.then ]
   %y.tr = phi i32 [ %y, %entry ], [ %call9, %if.then ]
   %z.tr = phi i32 [ %z, %entry ], [ %call14, %if.then ]
-  tail call void @llvm.dbg.value(metadata !{i32 %x.tr}, i64 0, metadata !6, metadata !{}), !dbg !7
-  tail call void @llvm.dbg.value(metadata !{i32 %y.tr}, i64 0, metadata !8, metadata !{}), !dbg !9
-  tail call void @llvm.dbg.value(metadata !{i32 %z.tr}, i64 0, metadata !10, metadata !{}), !dbg !11
+  tail call void @llvm.dbg.value(metadata i32 %x.tr, i64 0, metadata !6, metadata !{}), !dbg !7
+  tail call void @llvm.dbg.value(metadata i32 %y.tr, i64 0, metadata !8, metadata !{}), !dbg !9
+  tail call void @llvm.dbg.value(metadata i32 %z.tr, i64 0, metadata !10, metadata !{}), !dbg !11
   %cmp = icmp slt i32 %y.tr, %x.tr, !dbg !12
   br i1 %cmp, label %if.then, label %if.end, !dbg !12
 
@@ -72,7 +72,7 @@ for.body:
 
 for.inc:
   %dec = add i64 %i.0, -1
-  tail call void @llvm.dbg.value(metadata !{i64 %dec}, i64 0, metadata !{metadata !"undef"}, metadata !{})
+  tail call void @llvm.dbg.value(metadata i64 %dec, i64 0, metadata !{!"undef"}, metadata !{})
   br label %for.cond
 
 for.end:
@@ -84,24 +84,24 @@ for.end:
 !llvm.module.flags = !{!20}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{metadata !"0x2e\00tak\00tak\00\0032\000\001\000\006\00256\000\000", metadata !18, metadata !1, metadata !3, null, i32 (i32, i32, i32)* @tak, null, null, null} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 0] [tak]
-!1 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 125492)\001\00\000\00\000", metadata !18, metadata !19, metadata !19, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x101\00x\0032\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 32, i32 13, metadata !0, null}
-!8 = metadata !{metadata !"0x101\00y\0032\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 32, i32 20, metadata !0, null}
-!10 = metadata !{metadata !"0x101\00z\0032\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 32, i32 27, metadata !0, null}
-!12 = metadata !{i32 33, i32 3, metadata !13, null}
-!13 = metadata !{metadata !"0xb\0032\0030\006", metadata !18, metadata !0} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{i32 34, i32 5, metadata !15, null}
-!15 = metadata !{metadata !"0xb\0033\0014\007", metadata !18, metadata !13} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 36, i32 3, metadata !13, null}
-!17 = metadata !{i32 37, i32 1, metadata !13, null}
-!18 = metadata !{metadata !"/Volumes/Lalgate/cj/llvm/projects/llvm-test/SingleSource/Benchmarks/BenchmarkGame/recursive.c", metadata !"/Volumes/Lalgate/cj/D/projects/llvm-test/SingleSource/Benchmarks/BenchmarkGame"}
-!19 = metadata !{i32 0}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00tak\00tak\00\0032\000\001\000\006\00256\000\000", !18, !1, !3, null, i32 (i32, i32, i32)* @tak, null, null, null} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 0] [tak]
+!1 = !{!"0x29", !18} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.9 (trunk 125492)\001\00\000\00\000", !18, !19, !19, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !18, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0x101\00x\0032\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!7 = !MDLocation(line: 32, column: 13, scope: !0)
+!8 = !{!"0x101\00y\0032\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!9 = !MDLocation(line: 32, column: 20, scope: !0)
+!10 = !{!"0x101\00z\0032\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!11 = !MDLocation(line: 32, column: 27, scope: !0)
+!12 = !MDLocation(line: 33, column: 3, scope: !13)
+!13 = !{!"0xb\0032\0030\006", !18, !0} ; [ DW_TAG_lexical_block ]
+!14 = !MDLocation(line: 34, column: 5, scope: !15)
+!15 = !{!"0xb\0033\0014\007", !18, !13} ; [ DW_TAG_lexical_block ]
+!16 = !MDLocation(line: 36, column: 3, scope: !13)
+!17 = !MDLocation(line: 37, column: 1, scope: !13)
+!18 = !{!"/Volumes/Lalgate/cj/llvm/projects/llvm-test/SingleSource/Benchmarks/BenchmarkGame/recursive.c", !"/Volumes/Lalgate/cj/D/projects/llvm-test/SingleSource/Benchmarks/BenchmarkGame"}
+!19 = !{i32 0}
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/LoopRotate/pr22337.ll b/test/Transforms/LoopRotate/pr22337.ll
new file mode 100644
index 0000000..c2893db
--- /dev/null
+++ b/test/Transforms/LoopRotate/pr22337.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -loop-rotate -S | FileCheck %s
+
+@a = external global i8, align 4
+@tmp = global i8* @a
+
+define void @f() {
+; CHECK-LABEL: define void @f(
+; CHECK: getelementptr i8* @a, i32 0
+entry:
+  br label %for.preheader
+
+for.preheader:
+  br i1 undef, label %if.then8, label %for.body
+
+for.body:
+  br i1 undef, label %if.end, label %if.then8
+
+if.end:
+  %arrayidx = getelementptr i8* @a, i32 0
+  br label %for.preheader
+
+if.then8:
+  unreachable
+}
diff --git a/test/Transforms/LoopSimplify/2011-12-14-LandingpadHeader.ll b/test/Transforms/LoopSimplify/2011-12-14-LandingpadHeader.ll
index 173a582..39471eb 100644
--- a/test/Transforms/LoopSimplify/2011-12-14-LandingpadHeader.ll
+++ b/test/Transforms/LoopSimplify/2011-12-14-LandingpadHeader.ll
@@ -12,7 +12,7 @@ entry:
 ; CHECK-NEXT: landingpad
 ; CHECK: br label %catch
 
-; CHECK: catch.split-lp:
+; CHECK: catch.preheader.split-lp:
 ; CHECK-NEXT: landingpad
 ; CHECK: br label %catch
 
diff --git a/test/Transforms/LoopStrengthReduce/AArch64/lsr-memset.ll b/test/Transforms/LoopStrengthReduce/AArch64/lsr-memset.ll
index 10b2c3a..48b7094 100644
--- a/test/Transforms/LoopStrengthReduce/AArch64/lsr-memset.ll
+++ b/test/Transforms/LoopStrengthReduce/AArch64/lsr-memset.ll
@@ -96,6 +96,6 @@ done:                                             ; preds = %while.cond, %while.
   ret i8* %dest
 }
 
-!0 = metadata !{metadata !"omnipotent char", metadata !1}
-!1 = metadata !{metadata !"Simple C/C++ TBAA"}
-!2 = metadata !{metadata !"long long", metadata !0}
+!0 = !{!"omnipotent char", !1}
+!1 = !{!"Simple C/C++ TBAA"}
+!2 = !{!"long long", !0}
diff --git a/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
index f4edf09..26b2940 100644
--- a/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
+++ b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
@@ -201,7 +201,7 @@ for.end:                                          ; preds = %for.body
 ;
 ; Currently we have three extra add.w's that keep the store address
 ; live past the next increment because ISEL is unfortunately undoing
-; the store chain. ISEL also fails to convert the stores to
+; the store chain. ISEL also fails to convert all but one of the stores to
 ; post-increment addressing. However, the loads should use
 ; post-increment addressing, no add's or add.w's beyond the three
 ; mentioned. Most importantly, there should be no spills or reloads!
@@ -210,7 +210,7 @@ for.end:                                          ; preds = %for.body
 ; A9: %.lr.ph
 ; A9-NOT: lsl.w
 ; A9-NOT: {{ldr|str|adds|add r}}
-; A9: add.w r
+; A9: vst1.8 {{.*}} [r{{[0-9]+}}]!
 ; A9-NOT: {{ldr|str|adds|add r}}
 ; A9: add.w r
 ; A9-NOT: {{ldr|str|adds|add r}}
diff --git a/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
index 937791d..d8636a8 100644
--- a/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -59,7 +59,9 @@ exit:
 ;
 ; X32: @user
 ; expensive address computation in the preheader
-; X32: imul
+; X32: shll $4
+; X32: lea
+; X32: lea
 ; X32: %loop
 ; complex address modes
 ; X32: (%{{[^)]+}},%{{[^)]+}},
diff --git a/test/Transforms/LoopStrengthReduce/count-to-zero.ll b/test/Transforms/LoopStrengthReduce/count-to-zero.ll
index feb79f8..0e96f02 100644
--- a/test/Transforms/LoopStrengthReduce/count-to-zero.ll
+++ b/test/Transforms/LoopStrengthReduce/count-to-zero.ll
@@ -19,7 +19,7 @@ bb3:                                              ; preds = %bb1
   %tmp4 = add i32 %c_addr.1, -1                   ; <i32> [#uses=1]
   %c_addr.1.be = select i1 %tmp2, i32 %tmp3, i32 %tmp4 ; <i32> [#uses=1]
   %indvar.next = add i32 %indvar, 1               ; <i32> [#uses=1]
-; CHECK: add i32 %lsr.iv, -1
+; CHECK: add nsw i32 %lsr.iv, -1
   br label %bb6
 
 bb6:                                              ; preds = %bb3, %entry
diff --git a/test/Transforms/LoopStrengthReduce/pr12018.ll b/test/Transforms/LoopStrengthReduce/pr12018.ll
index 1e3df6c..e493cf8 100644
--- a/test/Transforms/LoopStrengthReduce/pr12018.ll
+++ b/test/Transforms/LoopStrengthReduce/pr12018.ll
@@ -16,7 +16,7 @@ for.body:                                         ; preds = %_ZN8nsTArray9Elemen
   %tmp = bitcast %struct.nsTArrayHeader* %add.ptr.i to %struct.nsTArray*
   %arrayidx = getelementptr inbounds %struct.nsTArray* %tmp, i32 %i.06
   %add = add nsw i32 %i.06, 1
-  call void @llvm.dbg.value(metadata !{%struct.nsTArray* %aValues}, i64 0, metadata !0, metadata !{}) nounwind
+  call void @llvm.dbg.value(metadata %struct.nsTArray* %aValues, i64 0, metadata !0, metadata !{}) nounwind
   br label %_ZN8nsTArray9ElementAtEi.exit
 
 _ZN8nsTArray9ElementAtEi.exit:                    ; preds = %for.body
@@ -35,4 +35,4 @@ declare %struct.nsTArrayHeader* @_ZN8nsTArray4Hdr2Ev()
 
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
-!0 = metadata !{metadata !"0x101"}                       ; [ DW_TAG_arg_variable ]
+!0 = !{!"0x101"}                       ; [ DW_TAG_arg_variable ]
diff --git a/test/Transforms/LoopStrengthReduce/pr18165.ll b/test/Transforms/LoopStrengthReduce/pr18165.ll
index c38d6a6..cc878c4 100644
--- a/test/Transforms/LoopStrengthReduce/pr18165.ll
+++ b/test/Transforms/LoopStrengthReduce/pr18165.ll
@@ -77,12 +77,12 @@ attributes #2 = { nounwind optsize }
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.5 "}
-!1 = metadata !{metadata !2, metadata !3, i64 0}
-!2 = metadata !{metadata !"", metadata !3, i64 0, metadata !3, i64 4, metadata !3, i64 8}
-!3 = metadata !{metadata !"int", metadata !4, i64 0}
-!4 = metadata !{metadata !"omnipotent char", metadata !5, i64 0}
-!5 = metadata !{metadata !"Simple C/C++ TBAA"}
-!6 = metadata !{metadata !2, metadata !3, i64 8}
-!7 = metadata !{metadata !3, metadata !3, i64 0}
-!8 = metadata !{metadata !2, metadata !3, i64 4}
+!0 = !{!"clang version 3.5 "}
+!1 = !{!2, !3, i64 0}
+!2 = !{!"", !3, i64 0, !3, i64 4, !3, i64 8}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = !{!2, !3, i64 8}
+!7 = !{!3, !3, i64 0}
+!8 = !{!2, !3, i64 4}
diff --git a/test/Transforms/LoopStrengthReduce/uglygep.ll b/test/Transforms/LoopStrengthReduce/uglygep.ll
index 4562d29..5155087 100644
--- a/test/Transforms/LoopStrengthReduce/uglygep.ll
+++ b/test/Transforms/LoopStrengthReduce/uglygep.ll
@@ -59,7 +59,7 @@ bb:
 ; CHECK: loop0:
 ; Induction variable is initialized to -2.
 ; CHECK-NEXT: [[PHIIV:%[^ ]+]] = phi i32 [ [[IVNEXT:%[^ ]+]], %loop0 ], [ -2, %bb ]
-; CHECK-NEXT: [[IVNEXT]] = add i32 [[PHIIV]], 1
+; CHECK-NEXT: [[IVNEXT]] = add nuw nsw i32 [[PHIIV]], 1
 ; CHECK-NEXT: br i1 false, label %loop0, label %bb0
 loop0:                                            ; preds = %loop0, %bb
   %i0 = phi i32 [ %i0.next, %loop0 ], [ 0, %bb ]  ; <i32> [#uses=2]
diff --git a/test/Transforms/LoopUnroll/PowerPC/p7-unrolling.ll b/test/Transforms/LoopUnroll/PowerPC/p7-unrolling.ll
new file mode 100644
index 0000000..7a50fc0
--- /dev/null
+++ b/test/Transforms/LoopUnroll/PowerPC/p7-unrolling.ll
@@ -0,0 +1,99 @@
+; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -loop-unroll | FileCheck %s
+define void @unroll_opt_for_size() nounwind optsize {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
+  %inc = add i32 %iv, 1
+  %exitcnd = icmp uge i32 %inc, 1024
+  br i1 %exitcnd, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: @unroll_opt_for_size
+; CHECK:      add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: icmp
+
+define void @unroll_default() nounwind {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
+  %inc = add i32 %iv, 1
+  %exitcnd = icmp uge i32 %inc, 1024
+  br i1 %exitcnd, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: @unroll_default
+; CHECK:      add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: icmp
+
diff --git a/test/Transforms/LoopUnroll/full-unroll-heuristics.ll b/test/Transforms/LoopUnroll/full-unroll-heuristics.ll
new file mode 100644
index 0000000..a1bb4c5
--- /dev/null
+++ b/test/Transforms/LoopUnroll/full-unroll-heuristics.ll
@@ -0,0 +1,62 @@
+; In this test we check how heuristics for complete unrolling work. We have
+; three knobs:
+;  1) -unroll-threshold
+;  2) -unroll-absolute-threshold and
+;  3) -unroll-percent-of-optimized-for-complete-unroll
+;
+; They control loop-unrolling according to the following rules:
+;  * If size of unrolled loop exceeds the absoulte threshold, we don't unroll
+;    this loop under any circumstances.
+;  * If size of unrolled loop is below the '-unroll-threshold', then we'll
+;    consider this loop as a very small one, and completely unroll it.
+;  * If a loop size is between these two tresholds, we only do complete unroll
+;    it if estimated number of potentially optimized instructions is high (we
+;    specify the minimal percent of such instructions).
+
+; In this particular test-case, complete unrolling will allow later
+; optimizations to remove ~55% of the instructions, the loop body size is 9,
+; and unrolled size is 65.
+
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-absolute-threshold=10  -unroll-threshold=10  -unroll-percent-of-optimized-for-complete-unroll=30 | FileCheck %s -check-prefix=TEST1
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-absolute-threshold=100 -unroll-threshold=10  -unroll-percent-of-optimized-for-complete-unroll=30 | FileCheck %s -check-prefix=TEST2
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-absolute-threshold=100 -unroll-threshold=10  -unroll-percent-of-optimized-for-complete-unroll=80 | FileCheck %s -check-prefix=TEST3
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-absolute-threshold=100 -unroll-threshold=100 -unroll-percent-of-optimized-for-complete-unroll=80 | FileCheck %s -check-prefix=TEST4
+
+; If the absolute threshold is too low, or if we can't optimize away requested
+; percent of instructions, we shouldn't unroll:
+; TEST1: %array_const_idx = getelementptr inbounds [9 x i32]* @known_constant, i64 0, i64 %iv
+; TEST3: %array_const_idx = getelementptr inbounds [9 x i32]* @known_constant, i64 0, i64 %iv
+
+; Otherwise, we should:
+; TEST2-NOT: %array_const_idx = getelementptr inbounds [9 x i32]* @known_constant, i64 0, i64 %iv
+
+; Also, we should unroll if the 'unroll-threshold' is big enough:
+; TEST4-NOT: %array_const_idx = getelementptr inbounds [9 x i32]* @known_constant, i64 0, i64 %iv
+
+; And check that we don't crash when we're not allowed to do any analysis.
+; RUN: opt < %s -loop-unroll -unroll-max-iteration-count-to-analyze=0 -disable-output
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16
+
+define i32 @foo(i32* noalias nocapture readonly %src) {
+entry:
+  br label %loop
+
+loop:                                                ; preds = %loop, %entry
+  %iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
+  %r  = phi i32 [ 0, %entry ], [ %add, %loop ]
+  %arrayidx = getelementptr inbounds i32* %src, i64 %iv
+  %src_element = load i32* %arrayidx, align 4
+  %array_const_idx = getelementptr inbounds [9 x i32]* @known_constant, i64 0, i64 %iv
+  %const_array_element = load i32* %array_const_idx, align 4
+  %mul = mul nsw i32 %src_element, %const_array_element
+  %add = add nsw i32 %mul, %r
+  %inc = add nuw nsw i64 %iv, 1
+  %exitcond86.i = icmp eq i64 %inc, 9
+  br i1 %exitcond86.i, label %loop.end, label %loop
+
+loop.end:                                            ; preds = %loop
+  %r.lcssa = phi i32 [ %r, %loop ]
+  ret i32 %r.lcssa
+}
diff --git a/test/Transforms/LoopUnroll/partial-unroll-optsize.ll b/test/Transforms/LoopUnroll/partial-unroll-optsize.ll
index 3179d55..a650317 100644
--- a/test/Transforms/LoopUnroll/partial-unroll-optsize.ll
+++ b/test/Transforms/LoopUnroll/partial-unroll-optsize.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -S -loop-unroll -unroll-allow-partial | FileCheck %s
 ; Loop size = 3, when the function has the optsize attribute, the
 ; OptSizeUnrollThreshold, i.e. 50, is used, hence the loop should be unrolled
-; by 16 times because 3 * 16 < 50.
+; by 32 times because (1 * 32) + 2 < 50 (whereas (1 * 64 + 2) is not).
 define void @unroll_opt_for_size() nounwind optsize {
 entry:
   br label %loop
@@ -32,4 +32,21 @@ exit:
 ; CHECK-NEXT: add
 ; CHECK-NEXT: add
 ; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
 ; CHECK-NEXT: icmp
+
diff --git a/test/Transforms/LoopUnroll/runtime-loop.ll b/test/Transforms/LoopUnroll/runtime-loop.ll
index 05d03f2..80571ec 100644
--- a/test/Transforms/LoopUnroll/runtime-loop.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop.ll
@@ -4,9 +4,7 @@
 
 ; CHECK: %xtraiter = and i32 %n
 ; CHECK:  %lcmp.mod = icmp ne i32 %xtraiter, 0
-; CHECK:  %lcmp.overflow = icmp eq i32 %n, 0
-; CHECK:  %lcmp.or = or i1 %lcmp.overflow, %lcmp.mod
-; CHECK:  br i1 %lcmp.or, label %for.body.prol, label %for.body.preheader.split
+; CHECK:  br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split
 
 ; CHECK: for.body.prol:
 ; CHECK: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.preheader ]
@@ -115,6 +113,6 @@ for.end:                                          ; preds = %for.cond.for.end_cr
   ret i16 %res.0.lcssa
 }
 
-; CHECK: !0 = metadata !{metadata !0, metadata !1}
-; CHECK: !1 = metadata !{metadata !"llvm.loop.unroll.disable"}
+; CHECK: !0 = distinct !{!0, !1}
+; CHECK: !1 = !{!"llvm.loop.unroll.disable"}
 
diff --git a/test/Transforms/LoopUnroll/runtime-loop1.ll b/test/Transforms/LoopUnroll/runtime-loop1.ll
index 38b4f32..5ff75e3 100644
--- a/test/Transforms/LoopUnroll/runtime-loop1.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop1.ll
@@ -3,7 +3,7 @@
 ; This tests that setting the unroll count works
 
 ; CHECK: for.body.prol:
-; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split
+; CHECK: br label %for.body.preheader.split
 ; CHECK: for.body:
 ; CHECK: br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body
 ; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
diff --git a/test/Transforms/LoopUnroll/runtime-loop2.ll b/test/Transforms/LoopUnroll/runtime-loop2.ll
index 7205c68..176362a 100644
--- a/test/Transforms/LoopUnroll/runtime-loop2.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop2.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -loop-unroll -unroll-threshold=50 -unroll-runtime -unroll-count=8 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-count=8 | FileCheck %s
 
 ; Choose a smaller, power-of-two, unroll count if the loop is too large.
 ; This test makes sure we're not unrolling 'odd' counts
diff --git a/test/Transforms/LoopUnroll/tripcount-overflow.ll b/test/Transforms/LoopUnroll/tripcount-overflow.ll
index d593685..052077c 100644
--- a/test/Transforms/LoopUnroll/tripcount-overflow.ll
+++ b/test/Transforms/LoopUnroll/tripcount-overflow.ll
@@ -1,19 +1,28 @@
 ; RUN: opt < %s -S -unroll-runtime -unroll-count=2 -loop-unroll | FileCheck %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
-; When prologue is fully unrolled, the branch on its end is unconditional.
-; Unrolling it is illegal if we can't prove that trip-count+1 doesn't overflow,
-; like in this example, where it comes from an argument.
-;
-; This test is based on an example from here:
-; http://stackoverflow.com/questions/23838661/why-is-clang-optimizing-this-code-out
-;
+; This test case documents how runtime loop unrolling handles the case
+; when the backedge-count is -1.
+
+; If %N, the backedge-taken count, is -1 then %0 unsigned-overflows
+; and is 0.  %xtraiter too is 0, signifying that the total trip-count
+; is divisible by 2.  The prologue then branches to the unrolled loop
+; and executes the 2^32 iterations there, in groups of 2.
+
+
+; CHECK: entry:
+; CHECK-NEXT: %0 = add i32 %N, 1
+; CHECK-NEXT: %xtraiter = and i32 %0, 1
+; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, 0
+; CHECK-NEXT: br i1 %lcmp.mod, label %while.body.prol, label %entry.split
+
 ; CHECK: while.body.prol:
-; CHECK: br i1
+; CHECK: br label %entry.split
+
 ; CHECK: entry.split:
 
 ; Function Attrs: nounwind readnone ssp uwtable
-define i32 @foo(i32 %N) #0 {
+define i32 @foo(i32 %N) {
 entry:
   br label %while.body
 
@@ -26,5 +35,3 @@ while.body:                                       ; preds = %while.body, %entry
 while.end:                                        ; preds = %while.body
   ret i32 %i
 }
-
-attributes #0 = { nounwind readnone ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/LoopUnroll/unroll-pragmas-disabled.ll b/test/Transforms/LoopUnroll/unroll-pragmas-disabled.ll
index db18f25..4f934a6 100644
--- a/test/Transforms/LoopUnroll/unroll-pragmas-disabled.ll
+++ b/test/Transforms/LoopUnroll/unroll-pragmas-disabled.ll
@@ -30,10 +30,10 @@ for.body:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.body
   ret void
 }
-!1 = metadata !{metadata !1, metadata !2, metadata !3, metadata !4}
-!2 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
-!3 = metadata !{metadata !"llvm.loop.unroll.count", i32 4}
-!4 = metadata !{metadata !"llvm.loop.vectorize.width", i32 8}
+!1 = !{!1, !2, !3, !4}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+!3 = !{!"llvm.loop.unroll.count", i32 4}
+!4 = !{!"llvm.loop.vectorize.width", i32 8}
 
 ; #pragma clang loop unroll(full)
 ;
@@ -63,8 +63,8 @@ for.body:                                         ; preds = %entry, %for.body
 for.end:                                          ; preds = %for.body, %entry
   ret void
 }
-!5 = metadata !{metadata !5, metadata !6}
-!6 = metadata !{metadata !"llvm.loop.unroll.full"}
+!5 = !{!5, !6}
+!6 = !{!"llvm.loop.unroll.full"}
 
 ; #pragma clang loop unroll(disable)
 ;
@@ -89,8 +89,8 @@ for.body:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.body
   ret void
 }
-!7 = metadata !{metadata !7, metadata !8}
-!8 = metadata !{metadata !"llvm.loop.unroll.disable"}
+!7 = !{!7, !8}
+!8 = !{!"llvm.loop.unroll.disable"}
 
 ; This function contains two loops which share the same llvm.loop metadata node
 ; with an llvm.loop.unroll.count 2 hint.  Both loops should be unrolled.  This
@@ -134,16 +134,16 @@ for.body3.1:                                      ; preds = %for.body3.1.prehead
 for.inc5.1:                                       ; preds = %for.body3.1
   ret void
 }
-!9 = metadata !{metadata !9, metadata !10}
-!10 = metadata !{metadata !"llvm.loop.unroll.count", i32 2}
-
-
-; CHECK: ![[LOOP_1]] = metadata !{metadata ![[LOOP_1]], metadata ![[VEC_ENABLE:.*]], metadata ![[WIDTH_8:.*]], metadata ![[UNROLL_DISABLE:.*]]}
-; CHECK: ![[VEC_ENABLE]] = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
-; CHECK: ![[WIDTH_8]] = metadata !{metadata !"llvm.loop.vectorize.width", i32 8}
-; CHECK: ![[UNROLL_DISABLE]] = metadata !{metadata !"llvm.loop.unroll.disable"}
-; CHECK: ![[LOOP_2]] = metadata !{metadata ![[LOOP_2]], metadata ![[UNROLL_FULL:.*]]}
-; CHECK: ![[UNROLL_FULL]] = metadata !{metadata !"llvm.loop.unroll.full"}
-; CHECK: ![[LOOP_3]] = metadata !{metadata ![[LOOP_3]], metadata ![[UNROLL_DISABLE:.*]]}
-; CHECK: ![[LOOP_4]] = metadata !{metadata ![[LOOP_4]], metadata ![[UNROLL_DISABLE:.*]]}
-; CHECK: ![[LOOP_5]] = metadata !{metadata ![[LOOP_5]], metadata ![[UNROLL_DISABLE:.*]]}
+!9 = !{!9, !10}
+!10 = !{!"llvm.loop.unroll.count", i32 2}
+
+
+; CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], ![[VEC_ENABLE:.*]], ![[WIDTH_8:.*]], ![[UNROLL_DISABLE:.*]]}
+; CHECK: ![[VEC_ENABLE]] = !{!"llvm.loop.vectorize.enable", i1 true}
+; CHECK: ![[WIDTH_8]] = !{!"llvm.loop.vectorize.width", i32 8}
+; CHECK: ![[UNROLL_DISABLE]] = !{!"llvm.loop.unroll.disable"}
+; CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2]], ![[UNROLL_FULL:.*]]}
+; CHECK: ![[UNROLL_FULL]] = !{!"llvm.loop.unroll.full"}
+; CHECK: ![[LOOP_3]] = distinct !{![[LOOP_3]], ![[UNROLL_DISABLE:.*]]}
+; CHECK: ![[LOOP_4]] = distinct !{![[LOOP_4]], ![[UNROLL_DISABLE:.*]]}
+; CHECK: ![[LOOP_5]] = distinct !{![[LOOP_5]], ![[UNROLL_DISABLE:.*]]}
diff --git a/test/Transforms/LoopUnroll/unroll-pragmas.ll b/test/Transforms/LoopUnroll/unroll-pragmas.ll
index 1ca249d..5831557 100644
--- a/test/Transforms/LoopUnroll/unroll-pragmas.ll
+++ b/test/Transforms/LoopUnroll/unroll-pragmas.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -loop-unroll -S | FileCheck %s
-; RUN: opt < %s -loop-unroll -loop-unroll -S | FileCheck %s
+; RUN: opt < %s -loop-unroll -pragma-unroll-threshold=1024 -S | FileCheck %s
+; RUN: opt < %s -loop-unroll -loop-unroll -pragma-unroll-threshold=1024 -S | FileCheck %s
 ;
 ; Run loop unrolling twice to verify that loop unrolling metadata is properly
 ; removed and further unrolling is disabled after the pass is run once.
@@ -54,8 +54,8 @@ for.body:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.body
   ret void
 }
-!1 = metadata !{metadata !1, metadata !2}
-!2 = metadata !{metadata !"llvm.loop.unroll.disable"}
+!1 = !{!1, !2}
+!2 = !{!"llvm.loop.unroll.disable"}
 
 ; loop64 has a high enough count that it should *not* be unrolled by
 ; the default unrolling heuristic.  It serves as the control for the
@@ -105,8 +105,8 @@ for.body:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.body
   ret void
 }
-!3 = metadata !{metadata !3, metadata !4}
-!4 = metadata !{metadata !"llvm.loop.unroll.full"}
+!3 = !{!3, !4}
+!4 = !{!"llvm.loop.unroll.full"}
 
 ; #pragma clang loop unroll_count(4)
 ; Loop should be unrolled 4 times.
@@ -135,8 +135,8 @@ for.body:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.body
   ret void
 }
-!5 = metadata !{metadata !5, metadata !6}
-!6 = metadata !{metadata !"llvm.loop.unroll.count", i32 4}
+!5 = !{!5, !6}
+!6 = !{!"llvm.loop.unroll.count", i32 4}
 
 ; #pragma clang loop unroll(full)
 ; Full unrolling is requested, but loop has a dynamic trip count so
@@ -165,7 +165,7 @@ for.body:                                         ; preds = %entry, %for.body
 for.end:                                          ; preds = %for.body, %entry
   ret void
 }
-!8 = metadata !{metadata !8, metadata !4}
+!8 = !{!8, !4}
 
 ; #pragma clang loop unroll_count(4)
 ; Loop has a dynamic trip count.  Unrolling should occur, but no
@@ -202,7 +202,7 @@ for.body:                                         ; preds = %entry, %for.body
 for.end:                                          ; preds = %for.body, %entry
   ret void
 }
-!9 = metadata !{metadata !9, metadata !6}
+!9 = !{!9, !6}
 
 ; #pragma clang loop unroll_count(1)
 ; Loop should not be unrolled
@@ -228,8 +228,8 @@ for.body:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.body
   ret void
 }
-!10 = metadata !{metadata !10, metadata !11}
-!11 = metadata !{metadata !"llvm.loop.unroll.count", i32 1}
+!10 = !{!10, !11}
+!11 = !{!"llvm.loop.unroll.count", i32 1}
 
 ; #pragma clang loop unroll(full)
 ; Loop has very high loop count (1 million) and full unrolling was requested.
@@ -256,4 +256,4 @@ for.body:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.body
   ret void
 }
-!12 = metadata !{metadata !12, metadata !4}
+!12 = !{!12, !4}
diff --git a/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll b/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
new file mode 100644
index 0000000..95734bf
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
@@ -0,0 +1,150 @@
+; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s
+; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnueabi"
+
+; Test integer induction variable of step 2:
+;   for (int i = 0; i < 1024; i+=2) {
+;     int tmp = *A++;
+;     sum += i * tmp;
+;   }
+
+; CHECK-LABEL: @ind_plus2(
+; CHECK: load <4 x i32>*
+; CHECK: load <4 x i32>*
+; CHECK: mul nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: %index.next = add i64 %index, 8
+; CHECK: icmp eq i64 %index.next, 512
+
+; FORCE-VEC-LABEL: @ind_plus2(
+; FORCE-VEC: %wide.load = load <2 x i32>*
+; FORCE-VEC: mul nsw <2 x i32>
+; FORCE-VEC: add nsw <2 x i32>
+; FORCE-VEC: %index.next = add i64 %index, 2
+; FORCE-VEC: icmp eq i64 %index.next, 512
+define i32 @ind_plus2(i32* %A) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr, %for.body ]
+  %i = phi i32 [ 0, %entry ], [ %add1, %for.body ]
+  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %inc.ptr = getelementptr inbounds i32* %A.addr, i64 1
+  %0 = load i32* %A.addr, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add1 = add nsw i32 %i, 2
+  %cmp = icmp slt i32 %add1, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+}
+
+
+; Test integer induction variable of step -2:
+;   for (int i = 1024; i > 0; i-=2) {
+;     int tmp = *A++;
+;     sum += i * tmp;
+;   }
+
+; CHECK-LABEL: @ind_minus2(
+; CHECK: load <4 x i32>*
+; CHECK: load <4 x i32>*
+; CHECK: mul nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: %index.next = add i64 %index, 8
+; CHECK: icmp eq i64 %index.next, 512
+
+; FORCE-VEC-LABEL: @ind_minus2(
+; FORCE-VEC: %wide.load = load <2 x i32>*
+; FORCE-VEC: mul nsw <2 x i32>
+; FORCE-VEC: add nsw <2 x i32>
+; FORCE-VEC: %index.next = add i64 %index, 2
+; FORCE-VEC: icmp eq i64 %index.next, 512
+define i32 @ind_minus2(i32* %A) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr, %for.body ]
+  %i = phi i32 [ 1024, %entry ], [ %sub, %for.body ]
+  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %inc.ptr = getelementptr inbounds i32* %A.addr, i64 1
+  %0 = load i32* %A.addr, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %sub = add nsw i32 %i, -2
+  %cmp = icmp sgt i32 %i, 2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+}
+
+
+; Test pointer induction variable of step 2. As currently we don't support
+; masked load/store, vectorization is possible but not beneficial. If loop
+; vectorization is not enforced, LV will only do interleave.
+;   for (int i = 0; i < 1024; i++) {
+;     int tmp0 = *A++;
+;     int tmp1 = *A++;
+;     sum += tmp0 * tmp1;
+;   }
+
+; CHECK-LABEL: @ptr_ind_plus2(
+; CHECK: load i32*
+; CHECK: load i32*
+; CHECK: load i32*
+; CHECK: load i32*
+; CHECK: mul nsw i32
+; CHECK: mul nsw i32
+; CHECK: add nsw i32
+; CHECK: add nsw i32
+; CHECK: %index.next = add i64 %index, 2
+; CHECK: %21 = icmp eq i64 %index.next, 1024
+
+; FORCE-VEC-LABEL: @ptr_ind_plus2(
+; FORCE-VEC: load i32*
+; FORCE-VEC: insertelement <2 x i32>
+; FORCE-VEC: load i32*
+; FORCE-VEC: insertelement <2 x i32>
+; FORCE-VEC: load i32*
+; FORCE-VEC: insertelement <2 x i32>
+; FORCE-VEC: load i32*
+; FORCE-VEC: insertelement <2 x i32>
+; FORCE-VEC: mul nsw <2 x i32>
+; FORCE-VEC: add nsw <2 x i32>
+; FORCE-VEC: %index.next = add i64 %index, 2
+; FORCE-VEC: icmp eq i64 %index.next, 1024
+define i32 @ptr_ind_plus2(i32* %A) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr1, %for.body ]
+  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc.ptr = getelementptr inbounds i32* %A.addr, i64 1
+  %0 = load i32* %A.addr, align 4
+  %inc.ptr1 = getelementptr inbounds i32* %A.addr, i64 2
+  %1 = load i32* %inc.ptr, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %sum
+  %inc = add nsw i32 %i, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll b/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
new file mode 100644
index 0000000..25e7d24
--- /dev/null
+++ b/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; CHECK: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT-NOT: fadd
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-ibm-linux-gnu"
+
+define void @test(double* nocapture readonly %arr, i32 signext %len) #0 {
+entry:
+  %cmp4 = icmp sgt i32 %len, 0
+  br i1 %cmp4, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = add i32 %len, -1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %redx.05 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds double* %arr, i64 %indvars.iv
+  %1 = load double* %arrayidx, align 8
+  %add = fadd fast double %1, %redx.05
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %0
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi double [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %redx.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
index 9c69ba8..29d74a0 100644
--- a/test/Transforms/LoopVectorize/X86/already-vectorized.ll
+++ b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
@@ -39,8 +39,8 @@ for.end:                                          ; preds = %for.body
 }
 
 ; Now, we check for the Hint metadata
-; CHECK: [[vect]] = metadata !{metadata [[vect]], metadata [[width:![0-9]+]], metadata [[unroll:![0-9]+]]}
-; CHECK: [[width]] = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
-; CHECK: [[unroll]] = metadata !{metadata !"llvm.loop.interleave.count", i32 1}
-; CHECK: [[scalar]] = metadata !{metadata [[scalar]], metadata [[width]], metadata [[unroll]]}
+; CHECK: [[vect]] = distinct !{[[vect]], [[width:![0-9]+]], [[unroll:![0-9]+]]}
+; CHECK: [[width]] = !{!"llvm.loop.vectorize.width", i32 1}
+; CHECK: [[unroll]] = !{!"llvm.loop.interleave.count", i32 1}
+; CHECK: [[scalar]] = distinct !{[[scalar]], [[width]], [[unroll]]}
 
diff --git a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 0650d94..46efaf0 100644
--- a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -50,7 +50,7 @@ for.end15:                                        ; preds = %for.end.us, %entry
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
-!3 = metadata !{metadata !4, metadata !5}
-!4 = metadata !{metadata !4}
-!5 = metadata !{metadata !5}
+!3 = !{!4, !5}
+!4 = !{!4}
+!5 = !{!5}
 
diff --git a/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/test/Transforms/LoopVectorize/X86/masked_load_store.ll
new file mode 100644
index 0000000..9e2de80
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -0,0 +1,502 @@
+; RUN: opt < %s  -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1
+; RUN: opt < %s  -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2
+; RUN: opt < %s  -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
+
+;AVX1-NOT: llvm.masked
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc_linux"
+
+; The source code:
+;
+;void foo1(int *A, int *B, int *trigger) {
+;
+;  for (int i=0; i<10000; i++) {
+;    if (trigger[i] < 100) {
+;          A[i] = B[i] + trigger[i];
+;    }
+;  }
+;}
+
+;AVX2-LABEL: @foo1
+;AVX2: icmp slt <8 x i32> %wide.load, <i32 100, i32 100, i32 100
+;AVX2: call <8 x i32> @llvm.masked.load.v8i32
+;AVX2: add nsw <8 x i32>
+;AVX2: call void @llvm.masked.store.v8i32
+;AVX2: ret void
+
+;AVX512-LABEL: @foo1
+;AVX512: icmp slt <16 x i32> %wide.load, <i32 100, i32 100, i32 100
+;AVX512: call <16 x i32> @llvm.masked.load.v16i32
+;AVX512: add nsw <16 x i32>
+;AVX512: call void @llvm.masked.store.v16i32
+;AVX512: ret void
+
+; Function Attrs: nounwind uwtable
+define void @foo1(i32* %A, i32* %B, i32* %trigger) {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store i32* %A, i32** %A.addr, align 8
+  store i32* %B, i32** %B.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
+  %3 = load i32* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %3, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load i32** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds i32* %5, i64 %idxprom2
+  %6 = load i32* %arrayidx3, align 4
+  %7 = load i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
+  %9 = load i32* %arrayidx5, align 4
+  %add = add nsw i32 %6, %9
+  %10 = load i32* %i, align 4
+  %idxprom6 = sext i32 %10 to i64
+  %11 = load i32** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds i32* %11, i64 %idxprom6
+  store i32 %add, i32* %arrayidx7, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %12 = load i32* %i, align 4
+  %inc = add nsw i32 %12, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; The source code:
+;
+;void foo2(float *A, float *B, int *trigger) {
+;
+;  for (int i=0; i<10000; i++) {
+;    if (trigger[i] < 100) {
+;          A[i] = B[i] + trigger[i];
+;    }
+;  }
+;}
+
+;AVX2-LABEL: @foo2
+;AVX2: icmp slt <8 x i32> %wide.load, <i32 100, i32 100, i32 100
+;AVX2: call <8 x float> @llvm.masked.load.v8f32
+;AVX2: fadd <8 x float>
+;AVX2: call void @llvm.masked.store.v8f32
+;AVX2: ret void
+
+;AVX512-LABEL: @foo2
+;AVX512: icmp slt <16 x i32> %wide.load, <i32 100, i32 100, i32 100
+;AVX512: call <16 x float> @llvm.masked.load.v16f32
+;AVX512: fadd <16 x float>
+;AVX512: call void @llvm.masked.store.v16f32
+;AVX512: ret void
+
+; Function Attrs: nounwind uwtable
+define void @foo2(float* %A, float* %B, i32* %trigger) {
+entry:
+  %A.addr = alloca float*, align 8
+  %B.addr = alloca float*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store float* %A, float** %A.addr, align 8
+  store float* %B, float** %B.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
+  %3 = load i32* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %3, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load float** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds float* %5, i64 %idxprom2
+  %6 = load float* %arrayidx3, align 4
+  %7 = load i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
+  %9 = load i32* %arrayidx5, align 4
+  %conv = sitofp i32 %9 to float
+  %add = fadd float %6, %conv
+  %10 = load i32* %i, align 4
+  %idxprom6 = sext i32 %10 to i64
+  %11 = load float** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds float* %11, i64 %idxprom6
+  store float %add, float* %arrayidx7, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %12 = load i32* %i, align 4
+  %inc = add nsw i32 %12, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; The source code:
+;
+;void foo3(double *A, double *B, int *trigger) {
+;
+;  for (int i=0; i<10000; i++) {
+;    if (trigger[i] < 100) {
+;          A[i] = B[i] + trigger[i];
+;    }
+;  }
+;}
+
+;AVX2-LABEL: @foo3
+;AVX2: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
+;AVX2: call <4 x double> @llvm.masked.load.v4f64
+;AVX2: sitofp <4 x i32> %wide.load to <4 x double>
+;AVX2: fadd <4 x double>
+;AVX2: call void @llvm.masked.store.v4f64
+;AVX2: ret void
+
+;AVX512-LABEL: @foo3
+;AVX512: icmp slt <8 x i32> %wide.load, <i32 100, i32 100,
+;AVX512: call <8 x double> @llvm.masked.load.v8f64
+;AVX512: sitofp <8 x i32> %wide.load to <8 x double>
+;AVX512: fadd <8 x double>
+;AVX512: call void @llvm.masked.store.v8f64
+;AVX512: ret void
+
+
+; Function Attrs: nounwind uwtable
+define void @foo3(double* %A, double* %B, i32* %trigger) #0 {
+entry:
+  %A.addr = alloca double*, align 8
+  %B.addr = alloca double*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store double* %A, double** %A.addr, align 8
+  store double* %B, double** %B.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
+  %3 = load i32* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %3, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load double** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2
+  %6 = load double* %arrayidx3, align 8
+  %7 = load i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
+  %9 = load i32* %arrayidx5, align 4
+  %conv = sitofp i32 %9 to double
+  %add = fadd double %6, %conv
+  %10 = load i32* %i, align 4
+  %idxprom6 = sext i32 %10 to i64
+  %11 = load double** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds double* %11, i64 %idxprom6
+  store double %add, double* %arrayidx7, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %12 = load i32* %i, align 4
+  %inc = add nsw i32 %12, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; The source code:
+;
+;void foo4(double *A, double *B, int *trigger) {
+;
+;  for (int i=0; i<10000; i++) {
+;    if (trigger[i] < 100) {
+;          A[i] = B[i*2] + trigger[i]; << non-cosecutive access
+;    }
+;  }
+;}
+
+;AVX2-LABEL: @foo4
+;AVX2-NOT: llvm.masked
+;AVX2: ret void
+
+;AVX512-LABEL: @foo4
+;AVX512-NOT: llvm.masked
+;AVX512: ret void
+
+; Function Attrs: nounwind uwtable
+define void @foo4(double* %A, double* %B, i32* %trigger)  {
+entry:
+  %A.addr = alloca double*, align 8
+  %B.addr = alloca double*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store double* %A, double** %A.addr, align 8
+  store double* %B, double** %B.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
+  %3 = load i32* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %3, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32* %i, align 4
+  %mul = mul nsw i32 %4, 2
+  %idxprom2 = sext i32 %mul to i64
+  %5 = load double** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2
+  %6 = load double* %arrayidx3, align 8
+  %7 = load i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
+  %9 = load i32* %arrayidx5, align 4
+  %conv = sitofp i32 %9 to double
+  %add = fadd double %6, %conv
+  %10 = load i32* %i, align 4
+  %idxprom6 = sext i32 %10 to i64
+  %11 = load double** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds double* %11, i64 %idxprom6
+  store double %add, double* %arrayidx7, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %12 = load i32* %i, align 4
+  %inc = add nsw i32 %12, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+@a = common global [1 x i32*] zeroinitializer, align 8
+@c = common global i32* null, align 8
+
+; The loop here should not be vectorized due to trapping
+; constant expression
+;AVX2-LABEL: @foo5
+;AVX2-NOT: llvm.masked
+;AVX2: store i32 sdiv
+;AVX2: ret void
+
+;AVX512-LABEL: @foo5
+;AVX512-NOT: llvm.masked
+;AVX512: store i32 sdiv
+;AVX512: ret void
+
+; Function Attrs: nounwind uwtable
+define void @foo5(i32* %A, i32* %B, i32* %trigger) {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store i32* %A, i32** %A.addr, align 8
+  store i32* %B, i32** %B.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
+  %3 = load i32* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %3, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load i32** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds i32* %5, i64 %idxprom2
+  %6 = load i32* %arrayidx3, align 4
+  %7 = load i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
+  %9 = load i32* %arrayidx5, align 4
+  %add = add nsw i32 %6, %9
+  %10 = load i32* %i, align 4
+  %idxprom6 = sext i32 %10 to i64
+  %11 = load i32** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds i32* %11, i64 %idxprom6
+  store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32)), i32* %arrayidx7, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %12 = load i32* %i, align 4
+  %inc = add nsw i32 %12, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Reverse loop
+;void foo6(double *in, double *out, unsigned size, int *trigger) {
+;
+;  for (int i=SIZE-1; i>=0; i--) {
+;    if (trigger[i] > 0) {
+;      out[i] = in[i] + (double) 0.5;
+;    }
+;  }
+;}
+;AVX2-LABEL: @foo6
+;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer
+;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> <i32 3, i32 2, i32 1, i32 0>
+;AVX2: call <4 x double> @llvm.masked.load.v4f64
+;AVX2: fadd <4 x double>
+;AVX2: call void @llvm.masked.store.v4f64
+;AVX2: ret void
+
+;AVX512-LABEL: @foo6
+;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer
+;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
+;AVX512: call <8 x double> @llvm.masked.load.v8f64
+;AVX512: fadd <8 x double>
+;AVX512: call void @llvm.masked.store.v8f64
+;AVX512: ret void
+
+
+define void @foo6(double* %in, double* %out, i32 %size, i32* %trigger) {
+entry:
+  %in.addr = alloca double*, align 8
+  %out.addr = alloca double*, align 8
+  %size.addr = alloca i32, align 4
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store double* %in, double** %in.addr, align 8
+  store double* %out, double** %out.addr, align 8
+  store i32 %size, i32* %size.addr, align 4
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 4095, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp sge i32 %0, 0
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
+  %3 = load i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load double** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2
+  %6 = load double* %arrayidx3, align 8
+  %add = fadd double %6, 5.000000e-01
+  %7 = load i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load double** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds double* %8, i64 %idxprom4
+  store double %add, double* %arrayidx5, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32* %i, align 4
+  %dec = add nsw i32 %9, -1
+  store i32 %dec, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+
diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index 8e0ca41..7feb66c 100644
--- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -170,7 +170,7 @@ for.end:                                          ; preds = %for.body
   ret i32 %1
 }
 
-!0 = metadata !{metadata !0, metadata !1}
-!1 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 1}
-!2 = metadata !{metadata !2, metadata !3}
-!3 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 0}
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 1}
+!2 = !{!2, !3}
+!3 = !{!"llvm.loop.vectorize.enable", i1 0}
diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
index 0b542a9..ad01044 100644
--- a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
+++ b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
@@ -46,4 +46,4 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
-!3 = metadata !{metadata !3}
+!3 = !{!3}
diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/test/Transforms/LoopVectorize/X86/parallel-loops.ll
index b580d73..22ab521 100644
--- a/test/Transforms/LoopVectorize/X86/parallel-loops.ll
+++ b/test/Transforms/LoopVectorize/X86/parallel-loops.ll
@@ -104,8 +104,8 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
-!3 = metadata !{metadata !3}
-!4 = metadata !{metadata !4}
-!5 = metadata !{metadata !3, metadata !4}
-!6 = metadata !{metadata !6}
-!7 = metadata !{metadata !7}
+!3 = !{!3}
+!4 = !{!4}
+!5 = !{!3, !4}
+!6 = !{!6}
+!7 = !{!7}
diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll
index f9a0281..8c7a881 100644
--- a/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -139,7 +139,7 @@ define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
   ret void
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!0 = !{!"branch_weights", i32 64, i32 4}
 
 ; We can't vectorize this one because we need a runtime ptr check.
 ;CHECK-LABEL: @example23(
diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
index 074313b..a781fbe 100644
--- a/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
+++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
@@ -52,8 +52,8 @@ for.end:
   ret void
 }
 
-!1 = metadata !{metadata !1, metadata !2}
-!2 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
+!1 = !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 ;
 ; This method will not be vectorized, as scalar cost is lower than any of vector costs.
@@ -89,5 +89,5 @@ for.end:
 declare float @llvm.sin.f32(float) nounwind readnone
 
 ; Dummy metadata
-!3 = metadata !{metadata !3}
+!3 = !{!3}
 
diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index 97c31a1..e39e6b5 100644
--- a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -43,8 +43,8 @@ for.end:
   ret void
 }
 
-!1 = metadata !{metadata !1, metadata !2}
-!2 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
+!1 = !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 ;
 ; This loop will not be vectorized as the trip count is below the threshold.
@@ -69,5 +69,5 @@ for.end:
   ret void
 }
 
-!3 = metadata !{metadata !3}
+!3 = !{!3}
 
diff --git a/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
index 3b3a787..ece9895 100644
--- a/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
+++ b/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
@@ -50,8 +50,8 @@ define void @vectorselect(i1 %cond) {
   %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
   %8 = icmp ult i64 %indvars.iv, 8
 
-; A vector select has a cost of 4 on core2
-; CHECK: cost of 4 for VF 2 {{.*}}  select i1 %8, i32 %6, i32 0
+; A vector select has a cost of 1 on core2
+; CHECK: cost of 1 for VF 2 {{.*}}  select i1 %8, i32 %6, i32 0
 
   %sel = select i1 %8, i32 %6, i32 zeroinitializer
   store i32 %sel, i32* %7, align 4
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index 7bce11d..011ce8e 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -122,40 +122,40 @@ attributes #0 = { nounwind }
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0\001\00\006\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./source.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"source.cpp", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !7, metadata !8}
-!4 = metadata !{metadata !"0x2e\00test\00test\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*, i32)* @_Z4testPii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [test]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./source.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !"0x2e\00test_disabled\00test_disabled\00\0010\000\001\000\006\00256\001\0010", metadata !1, metadata !5, metadata !6, null, void (i32*, i32)* @_Z13test_disabledPii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [test_disabled]
-!8 = metadata !{metadata !"0x2e\00test_array_bounds\00test_array_bounds\00\0016\000\001\000\006\00256\001\0016", metadata !1, metadata !5, metadata !6, null, void (i32*, i32*, i32)* @_Z17test_array_boundsPiS_i, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 16] [def] [test_array_bounds]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!11 = metadata !{metadata !"clang version 3.5.0"}
-!12 = metadata !{i32 3, i32 8, metadata !13, null}
-!13 = metadata !{metadata !"0xb\003\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{metadata !14, metadata !15, metadata !15}
-!15 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
-!16 = metadata !{i32 4, i32 5, metadata !17, null}
-!17 = metadata !{metadata !"0xb\003\0036\000", metadata !1, metadata !13} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{metadata !19, metadata !19, i64 0}
-!19 = metadata !{metadata !"int", metadata !20, i64 0}
-!20 = metadata !{metadata !"omnipotent char", metadata !21, i64 0}
-!21 = metadata !{metadata !"Simple C/C++ TBAA"}
-!22 = metadata !{i32 5, i32 9, metadata !23, null}
-!23 = metadata !{metadata !"0xb\005\009\000", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ]
-!24 = metadata !{i32 8, i32 1, metadata !4, null}
-!25 = metadata !{i32 12, i32 8, metadata !26, null}
-!26 = metadata !{metadata !"0xb\0012\003\000", metadata !1, metadata !7} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{metadata !27, metadata !28, metadata !29}
-!28 = metadata !{metadata !"llvm.loop.interleave.count", i32 1}
-!29 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
-!30 = metadata !{i32 13, i32 5, metadata !26, null}
-!31 = metadata !{i32 14, i32 1, metadata !7, null}
-!32 = metadata !{i32 18, i32 8, metadata !33, null}
-!33 = metadata !{metadata !"0xb\0018\003\000", metadata !1, metadata !8} ; [ DW_TAG_lexical_block ]
-!34 = metadata !{metadata !34, metadata !15}
-!35 = metadata !{i32 19, i32 5, metadata !33, null}
-!36 = metadata !{i32 20, i32 1, metadata !8, null}
+!0 = !{!"0x11\004\00clang version 3.5.0\001\00\006\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [./source.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"source.cpp", !"."}
+!2 = !{}
+!3 = !{!4, !7, !8}
+!4 = !{!"0x2e\00test\00test\00\001\000\001\000\006\00256\001\001", !1, !5, !6, null, void (i32*, i32)* @_Z4testPii, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [test]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [./source.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!"0x2e\00test_disabled\00test_disabled\00\0010\000\001\000\006\00256\001\0010", !1, !5, !6, null, void (i32*, i32)* @_Z13test_disabledPii, null, null, !2} ; [ DW_TAG_subprogram ] [line 10] [def] [test_disabled]
+!8 = !{!"0x2e\00test_array_bounds\00test_array_bounds\00\0016\000\001\000\006\00256\001\0016", !1, !5, !6, null, void (i32*, i32*, i32)* @_Z17test_array_boundsPiS_i, null, null, !2} ; [ DW_TAG_subprogram ] [line 16] [def] [test_array_bounds]
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 2}
+!11 = !{!"clang version 3.5.0"}
+!12 = !MDLocation(line: 3, column: 8, scope: !13)
+!13 = !{!"0xb\003\003\000", !1, !4} ; [ DW_TAG_lexical_block ]
+!14 = !{!14, !15, !15}
+!15 = !{!"llvm.loop.vectorize.enable", i1 true}
+!16 = !MDLocation(line: 4, column: 5, scope: !17)
+!17 = !{!"0xb\003\0036\000", !1, !13} ; [ DW_TAG_lexical_block ]
+!18 = !{!19, !19, i64 0}
+!19 = !{!"int", !20, i64 0}
+!20 = !{!"omnipotent char", !21, i64 0}
+!21 = !{!"Simple C/C++ TBAA"}
+!22 = !MDLocation(line: 5, column: 9, scope: !23)
+!23 = !{!"0xb\005\009\000", !1, !17} ; [ DW_TAG_lexical_block ]
+!24 = !MDLocation(line: 8, column: 1, scope: !4)
+!25 = !MDLocation(line: 12, column: 8, scope: !26)
+!26 = !{!"0xb\0012\003\000", !1, !7} ; [ DW_TAG_lexical_block ]
+!27 = !{!27, !28, !29}
+!28 = !{!"llvm.loop.interleave.count", i32 1}
+!29 = !{!"llvm.loop.vectorize.width", i32 1}
+!30 = !MDLocation(line: 13, column: 5, scope: !26)
+!31 = !MDLocation(line: 14, column: 1, scope: !7)
+!32 = !MDLocation(line: 18, column: 8, scope: !33)
+!33 = !{!"0xb\0018\003\000", !1, !8} ; [ DW_TAG_lexical_block ]
+!34 = !{!34, !15}
+!35 = !MDLocation(line: 19, column: 5, scope: !33)
+!36 = !MDLocation(line: 20, column: 1, scope: !8)
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
index 14e541a..16fe370 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
@@ -49,26 +49,26 @@ declare void @ibar(i32*) #1
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!1 = metadata !{metadata !"vectorization-remarks.c", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\005\000\001\000\006\00256\001\006", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 6] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./vectorization-remarks.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5.0 "}
-!10 = metadata !{i32 8, i32 3, metadata !4, null}
-!11 = metadata !{metadata !12, metadata !12, i64 0}
-!12 = metadata !{metadata !"int", metadata !13, i64 0}
-!13 = metadata !{metadata !"omnipotent char", metadata !14, i64 0}
-!14 = metadata !{metadata !"Simple C/C++ TBAA"}
-!15 = metadata !{i32 17, i32 8, metadata !16, null}
-!16 = metadata !{metadata !"0xb\0017\008\002", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
-!17 = metadata !{metadata !"0xb\0017\008\001", metadata !1, metadata !18} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
-!18 = metadata !{metadata !"0xb\0017\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
-!19 = metadata !{i32 18, i32 5, metadata !20, null}
-!20 = metadata !{metadata !"0xb\0017\0027\000", metadata !1, metadata !18} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
-!21 = metadata !{metadata !13, metadata !13, i64 0}
-!22 = metadata !{i32 20, i32 3, metadata !4, null}
-!23 = metadata !{i32 21, i32 3, metadata !4, null}
+!1 = !{!"vectorization-remarks.c", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\005\000\001\000\006\00256\001\006", !1, !5, !6, null, i32 (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 6] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [./vectorization-remarks.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5.0 "}
+!10 = !MDLocation(line: 8, column: 3, scope: !4)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !MDLocation(line: 17, column: 8, scope: !16)
+!16 = !{!"0xb\0017\008\002", !1, !17} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
+!17 = !{!"0xb\0017\008\001", !1, !18} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
+!18 = !{!"0xb\0017\003\000", !1, !4} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
+!19 = !MDLocation(line: 18, column: 5, scope: !20)
+!20 = !{!"0xb\0017\0027\000", !1, !18} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
+!21 = !{!13, !13, i64 0}
+!22 = !MDLocation(line: 20, column: 3, scope: !4)
+!23 = !MDLocation(line: 21, column: 3, scope: !4)
diff --git a/test/Transforms/LoopVectorize/conditional-assignment.ll b/test/Transforms/LoopVectorize/conditional-assignment.ll
index 50fa329..38e9c4f 100644
--- a/test/Transforms/LoopVectorize/conditional-assignment.ll
+++ b/test/Transforms/LoopVectorize/conditional-assignment.ll
@@ -36,23 +36,23 @@ attributes #0 = { nounwind }
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0\001\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"source.c", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00conditional_store\00conditional_store\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*)* @conditional_store, null, null, metadata !2} ; [ DW_TAG_subprogram ]
-!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.6.0"}
-!10 = metadata !{i32 2, i32 8, metadata !11, null}
-!11 = metadata !{metadata !"0xb\002\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 3, i32 9, metadata !13, null}
-!13 = metadata !{metadata !"0xb\003\009\000", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{metadata !15, metadata !15, i64 0}
-!15 = metadata !{metadata !"int", metadata !16, i64 0}
-!16 = metadata !{metadata !"omnipotent char", metadata !17, i64 0}
-!17 = metadata !{metadata !"Simple C/C++ TBAA"}
-!18 = metadata !{i32 3, i32 29, metadata !13, null}
-!19 = metadata !{i32 4, i32 1, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.6.0\001\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ]
+!1 = !{!"source.c", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00conditional_store\00conditional_store\00\001\000\001\000\006\00256\001\001", !1, !5, !6, null, void (i32*)* @conditional_store, null, null, !2} ; [ DW_TAG_subprogram ]
+!5 = !{!"0x29", !1} ; [ DW_TAG_file_type ]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ]
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.6.0"}
+!10 = !MDLocation(line: 2, column: 8, scope: !11)
+!11 = !{!"0xb\002\003\000", !1, !4} ; [ DW_TAG_lexical_block ]
+!12 = !MDLocation(line: 3, column: 9, scope: !13)
+!13 = !{!"0xb\003\009\000", !1, !11} ; [ DW_TAG_lexical_block ]
+!14 = !{!15, !15, i64 0}
+!15 = !{!"int", !16, i64 0}
+!16 = !{!"omnipotent char", !17, i64 0}
+!17 = !{!"Simple C/C++ TBAA"}
+!18 = !MDLocation(line: 3, column: 29, scope: !13)
+!19 = !MDLocation(line: 4, column: 1, scope: !4)
diff --git a/test/Transforms/LoopVectorize/control-flow.ll b/test/Transforms/LoopVectorize/control-flow.ll
index 452b7ae..1882c3f 100644
--- a/test/Transforms/LoopVectorize/control-flow.ll
+++ b/test/Transforms/LoopVectorize/control-flow.ll
@@ -55,24 +55,24 @@ attributes #0 = { nounwind }
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0\001\00\006\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./source.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"source.cpp", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00test\00test\00\001\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, i32 (i32*, i32)* @_Z4testPii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [test]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./source.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5.0"}
-!10 = metadata !{i32 3, i32 8, metadata !11, null}
-!11 = metadata !{metadata !"0xb\003\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 5, i32 9, metadata !13, null}
-!13 = metadata !{metadata !"0xb\005\009\000", metadata !1, metadata !14} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{metadata !"0xb\004\003\000", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{metadata !16, metadata !16, i64 0}
-!16 = metadata !{metadata !"int", metadata !17, i64 0}
-!17 = metadata !{metadata !"omnipotent char", metadata !18, i64 0}
-!18 = metadata !{metadata !"Simple C/C++ TBAA"}
-!19 = metadata !{i32 8, i32 7, metadata !13, null}
-!20 = metadata !{i32 12, i32 3, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0\001\00\006\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [./source.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"source.cpp", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00test\00test\00\001\000\001\000\006\00256\001\002", !1, !5, !6, null, i32 (i32*, i32)* @_Z4testPii, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [test]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [./source.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5.0"}
+!10 = !MDLocation(line: 3, column: 8, scope: !11)
+!11 = !{!"0xb\003\003\000", !1, !4} ; [ DW_TAG_lexical_block ]
+!12 = !MDLocation(line: 5, column: 9, scope: !13)
+!13 = !{!"0xb\005\009\000", !1, !14} ; [ DW_TAG_lexical_block ]
+!14 = !{!"0xb\004\003\000", !1, !11} ; [ DW_TAG_lexical_block ]
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !17, i64 0}
+!17 = !{!"omnipotent char", !18, i64 0}
+!18 = !{!"Simple C/C++ TBAA"}
+!19 = !MDLocation(line: 8, column: 7, scope: !13)
+!20 = !MDLocation(line: 12, column: 3, scope: !4)
diff --git a/test/Transforms/LoopVectorize/dbg.value.ll b/test/Transforms/LoopVectorize/dbg.value.ll
index 91d07d4..92d3154 100644
--- a/test/Transforms/LoopVectorize/dbg.value.ll
+++ b/test/Transforms/LoopVectorize/dbg.value.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 ; CHECK-LABEL: @test(
 define i32 @test() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !9, metadata !{}), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !9, metadata !{}), !dbg !18
   br label %for.body, !dbg !18
 
 for.body:
@@ -44,27 +44,27 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!26}
 
-!0 = metadata !{metadata !"0x11\004\00clang\001\00\000\00\000", metadata !25, metadata !1, metadata !1, metadata !2, metadata !11, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 0}
-!2 = metadata !{metadata !3}
-!3 = metadata !{metadata !"0x2e\00test\00test\00test\005\000\001\000\006\00256\001\005", metadata !25, metadata !4, metadata !5, null, i32 ()* @test, null, null, metadata !8} ; [ DW_TAG_subprogram ]
-!4 = metadata !{metadata !"0x29", metadata !25} ; [ DW_TAG_file_type ]
-!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!6 = metadata !{metadata !7}
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x100\00i\006\000", metadata !10, metadata !4, metadata !7} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{metadata !"0xb\006\000\000", metadata !25, metadata !3} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{metadata !12, metadata !16, metadata !17}
-!12 = metadata !{metadata !"0x34\00A\00A\00\001\000\001", null, metadata !4, metadata !13, [1024 x i32]* @A, null} ; [ DW_TAG_variable ]
-!13 = metadata !{metadata !"0x1\00\000\0032768\0032\000\000", null, null, metadata !7, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32768, align 32, offset 0] [from int]
-!14 = metadata !{metadata !15}
-!15 = metadata !{i32 786465, i64 0, i64 1024}
-!16 = metadata !{metadata !"0x34\00B\00B\00\002\000\001", null, metadata !4, metadata !13, [1024 x i32]* @B, null} ; [ DW_TAG_variable ]
-!17 = metadata !{metadata !"0x34\00C\00C\00\003\000\001", null, metadata !4, metadata !13, [1024 x i32]* @C, null} ; [ DW_TAG_variable ]
-!18 = metadata !{i32 6, i32 0, metadata !10, null}
-!19 = metadata !{i32 7, i32 0, metadata !20, null}
-!20 = metadata !{metadata !"0xb\006\000\001", metadata !25, metadata !10} ; [ DW_TAG_lexical_block ]
-!24 = metadata !{i32 9, i32 0, metadata !3, null}
-!25 = metadata !{metadata !"test", metadata !"/path/to/somewhere"}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang\001\00\000\00\000", !25, !1, !1, !2, !11, null} ; [ DW_TAG_compile_unit ]
+!1 = !{i32 0}
+!2 = !{!3}
+!3 = !{!"0x2e\00test\00test\00test\005\000\001\000\006\00256\001\005", !25, !4, !5, null, i32 ()* @test, null, null, !8} ; [ DW_TAG_subprogram ]
+!4 = !{!"0x29", !25} ; [ DW_TAG_file_type ]
+!5 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = !{!7}
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!8 = !{!9}
+!9 = !{!"0x100\00i\006\000", !10, !4, !7} ; [ DW_TAG_auto_variable ]
+!10 = !{!"0xb\006\000\000", !25, !3} ; [ DW_TAG_lexical_block ]
+!11 = !{!12, !16, !17}
+!12 = !{!"0x34\00A\00A\00\001\000\001", null, !4, !13, [1024 x i32]* @A, null} ; [ DW_TAG_variable ]
+!13 = !{!"0x1\00\000\0032768\0032\000\000", null, null, !7, !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32768, align 32, offset 0] [from int]
+!14 = !{!15}
+!15 = !{i32 786465, i64 0, i64 1024}
+!16 = !{!"0x34\00B\00B\00\002\000\001", null, !4, !13, [1024 x i32]* @B, null} ; [ DW_TAG_variable ]
+!17 = !{!"0x34\00C\00C\00\003\000\001", null, !4, !13, [1024 x i32]* @C, null} ; [ DW_TAG_variable ]
+!18 = !MDLocation(line: 6, scope: !10)
+!19 = !MDLocation(line: 7, scope: !20)
+!20 = !{!"0xb\006\000\001", !25, !10} ; [ DW_TAG_lexical_block ]
+!24 = !MDLocation(line: 9, scope: !3)
+!25 = !{!"test", !"/path/to/somewhere"}
+!26 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/LoopVectorize/debugloc.ll b/test/Transforms/LoopVectorize/debugloc.ll
index 6350296..634bf79 100644
--- a/test/Transforms/LoopVectorize/debugloc.ll
+++ b/test/Transforms/LoopVectorize/debugloc.ll
@@ -19,10 +19,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 define i32 @f(i32* nocapture %a, i32 %size) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !13, metadata !{}), !dbg !19
-  tail call void @llvm.dbg.value(metadata !{i32 %size}, i64 0, metadata !14, metadata !{}), !dbg !19
-  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !{}), !dbg !20
-  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !16, metadata !{}), !dbg !21
+  tail call void @llvm.dbg.value(metadata i32* %a, i64 0, metadata !13, metadata !{}), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 %size, i64 0, metadata !14, metadata !{}), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !{}), !dbg !20
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !{}), !dbg !21
   %cmp4 = icmp eq i32 %size, 0, !dbg !21
   br i1 %cmp4, label %for.end, label %for.body.lr.ph, !dbg !21
 
@@ -35,7 +35,7 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
   %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv, !dbg !22
   %0 = load i32* %arrayidx, align 4, !dbg !22
   %add = add i32 %0, %sum.05, !dbg !22
-  tail call void @llvm.dbg.value(metadata !{i32 %add.lcssa}, i64 0, metadata !15, metadata !{}), !dbg !22
+  tail call void @llvm.dbg.value(metadata i32 %add.lcssa, i64 0, metadata !15, metadata !{}), !dbg !22
   %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !21
   tail call void @llvm.dbg.value(metadata !{null}, i64 0, metadata !16, metadata !{}), !dbg !21
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21
@@ -63,28 +63,28 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18, !27}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 185038) (llvm/trunk 185097)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Volumes/Data/backedup/dev/os/llvm/debug/-] [DW_LANG_C99]
-!1 = metadata !{metadata !"-", metadata !"/Volumes/Data/backedup/dev/os/llvm/debug"}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\00256\001\003", metadata !5, metadata !6, metadata !7, null, i32 (i32*, i32)* @f, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!5 = metadata !{metadata !"<stdin>", metadata !"/Volumes/Data/backedup/dev/os/llvm/debug"}
-!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/Volumes/Data/backedup/dev/os/llvm/debug/<stdin>]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9, metadata !10, metadata !11}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!11 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
-!12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !16}
-!13 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !4, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ] [a] [line 3]
-!14 = metadata !{metadata !"0x101\00size\0033554435\000", metadata !4, metadata !6, metadata !11} ; [ DW_TAG_arg_variable ] [size] [line 3]
-!15 = metadata !{metadata !"0x100\00sum\004\000", metadata !4, metadata !6, metadata !11} ; [ DW_TAG_auto_variable ] [sum] [line 4]
-!16 = metadata !{metadata !"0x100\00i\005\000", metadata !17, metadata !6, metadata !11} ; [ DW_TAG_auto_variable ] [i] [line 5]
-!17 = metadata !{metadata !"0xb\005\000\000", metadata !5, metadata !4} ; [ DW_TAG_lexical_block ] [/Volumes/Data/backedup/dev/os/llvm/debug/<stdin>]
-!18 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!19 = metadata !{i32 3, i32 0, metadata !4, null}
-!20 = metadata !{i32 4, i32 0, metadata !4, null}
-!21 = metadata !{i32 5, i32 0, metadata !17, null}
-!22 = metadata !{i32 6, i32 0, metadata !17, null}
-!26 = metadata !{i32 7, i32 0, metadata !4, null}
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 (trunk 185038) (llvm/trunk 185097)\001\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/Volumes/Data/backedup/dev/os/llvm/debug/-] [DW_LANG_C99]
+!1 = !{!"-", !"/Volumes/Data/backedup/dev/os/llvm/debug"}
+!2 = !{i32 0}
+!3 = !{!4}
+!4 = !{!"0x2e\00f\00f\00\003\000\001\000\006\00256\001\003", !5, !6, !7, null, i32 (i32*, i32)* @f, null, null, !12} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!5 = !{!"<stdin>", !"/Volumes/Data/backedup/dev/os/llvm/debug"}
+!6 = !{!"0x29", !5}          ; [ DW_TAG_file_type ] [/Volumes/Data/backedup/dev/os/llvm/debug/<stdin>]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9, !10, !11}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!11 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!12 = !{!13, !14, !15, !16}
+!13 = !{!"0x101\00a\0016777219\000", !4, !6, !10} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!14 = !{!"0x101\00size\0033554435\000", !4, !6, !11} ; [ DW_TAG_arg_variable ] [size] [line 3]
+!15 = !{!"0x100\00sum\004\000", !4, !6, !11} ; [ DW_TAG_auto_variable ] [sum] [line 4]
+!16 = !{!"0x100\00i\005\000", !17, !6, !11} ; [ DW_TAG_auto_variable ] [i] [line 5]
+!17 = !{!"0xb\005\000\000", !5, !4} ; [ DW_TAG_lexical_block ] [/Volumes/Data/backedup/dev/os/llvm/debug/<stdin>]
+!18 = !{i32 2, !"Dwarf Version", i32 3}
+!19 = !MDLocation(line: 3, scope: !4)
+!20 = !MDLocation(line: 4, scope: !4)
+!21 = !MDLocation(line: 5, scope: !17)
+!22 = !MDLocation(line: 6, scope: !17)
+!26 = !MDLocation(line: 7, scope: !4)
+!27 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/LoopVectorize/duplicated-metadata.ll b/test/Transforms/LoopVectorize/duplicated-metadata.ll
index 8353dca..bf2f899 100644
--- a/test/Transforms/LoopVectorize/duplicated-metadata.ll
+++ b/test/Transforms/LoopVectorize/duplicated-metadata.ll
@@ -24,7 +24,7 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
-!0 = metadata !{metadata !0, metadata !1}
-!1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 4}
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
 ; CHECK-NOT: !{metadata !"llvm.loop.vectorize.width", i32 4}
-; CHECK: !{metadata !"llvm.loop.interleave.count", i32 1}
+; CHECK: !{!"llvm.loop.interleave.count", i32 1}
diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll
index 6c8af0b..6a2c2c6 100644
--- a/test/Transforms/LoopVectorize/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -388,9 +388,8 @@ define void @example12() nounwind uwtable ssp {
   ret void
 }
 
-; Can't vectorize because of reductions.
 ;CHECK-LABEL: @example13(
-;CHECK-NOT: <4 x i32>
+;CHECK: <4 x i32>
 ;CHECK: ret void
 define void @example13(i32** nocapture %A, i32** nocapture %B, i32* nocapture %out) nounwind uwtable ssp {
   br label %.preheader
diff --git a/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll b/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll
index 27c274d..8b8408b 100644
--- a/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll
+++ b/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll
@@ -20,7 +20,10 @@ entry:
   br i1 %cmp88, label %for.body.lr.ph, label %for.end
 
 for.body.lr.ph:
-  %0 = load i32** @b, align 8  %1 = load i32** @a, align 8  %2 = load i32** @c, align 8  br label %for.body
+  %0 = load i32** @b, align 8
+  %1 = load i32** @a, align 8
+  %2 = load i32** @c, align 8
+  br label %for.body
 
 for.body:
   %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %_ZL3fn3ii.exit58 ]
diff --git a/test/Transforms/LoopVectorize/if-conversion.ll b/test/Transforms/LoopVectorize/if-conversion.ll
index 9e18528..a220203 100644
--- a/test/Transforms/LoopVectorize/if-conversion.ll
+++ b/test/Transforms/LoopVectorize/if-conversion.ll
@@ -19,9 +19,9 @@ target triple = "x86_64-apple-macosx10.9.0"
 
 ;CHECK-LABEL: @function0(
 ;CHECK: load <4 x i32>
-;CHECK: icmp sgt <4 x i32>
 ;CHECK: mul <4 x i32>
 ;CHECK: add <4 x i32>
+;CHECK: icmp sle <4 x i32>
 ;CHECK: select <4 x i1>
 ;CHECK: ret i32
 define i32 @function0(i32* nocapture %a, i32* nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
@@ -72,8 +72,8 @@ for.end:
 
 ;CHECK-LABEL: @reduction_func(
 ;CHECK: load <4 x i32>
-;CHECK: icmp sgt <4 x i32>
 ;CHECK: add <4 x i32>
+;CHECK: icmp sle <4 x i32>
 ;CHECK: select <4 x i1>
 ;CHECK: ret i32
 define i32 @reduction_func(i32* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
diff --git a/test/Transforms/LoopVectorize/incorrect-dom-info.ll b/test/Transforms/LoopVectorize/incorrect-dom-info.ll
index 624ee7e..b8624fd 100644
--- a/test/Transforms/LoopVectorize/incorrect-dom-info.ll
+++ b/test/Transforms/LoopVectorize/incorrect-dom-info.ll
@@ -139,4 +139,4 @@ attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-po
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.6.0 "}
+!0 = !{!"clang version 3.6.0 "}
diff --git a/test/Transforms/LoopVectorize/loop-form.ll b/test/Transforms/LoopVectorize/loop-form.ll
new file mode 100644
index 0000000..138df1d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/loop-form.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that we vectorize only bottom-tested loops.
+; This is a reduced testcase from PR21302.
+;
+; rdar://problem/18886083
+
+%struct.X = type { i32, i16 }
+; CHECK-LABEL: @foo(
+; CHECK-NOT: vector.body
+
+define void @foo(i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i, %n
+  br i1 %cmp, label %for.body, label %if.end
+
+for.body:
+  %iprom = sext i32 %i to i64
+  %b = getelementptr inbounds %struct.X* undef, i64 %iprom, i32 1
+  store i16 0, i16* %b, align 4
+  %inc = add nsw i32 %i, 1
+  br label %for.cond
+
+if.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/loop-vect-memdep.ll b/test/Transforms/LoopVectorize/loop-vect-memdep.ll
new file mode 100644
index 0000000..e2c7524
--- /dev/null
+++ b/test/Transforms/LoopVectorize/loop-vect-memdep.ll
@@ -0,0 +1,26 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; RUN: opt < %s -S -loop-vectorize -debug-only=loop-vectorize 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; CHECK: LV: Can't vectorize due to memory conflicts
+
+define void @test_loop_novect(double** %arr, i64 %n) {
+for.body.lr.ph:
+  %t = load double** %arr, align 8
+  br label %for.body
+
+for.body:                                      ; preds = %for.body, %for.body.lr.ph
+  %i = phi i64 [ 0, %for.body.lr.ph ], [ %i.next, %for.body ]
+  %a = getelementptr inbounds double* %t, i64 %i
+  %i.next = add nuw nsw i64 %i, 1
+  %a.next = getelementptr inbounds double* %t, i64 %i.next
+  %t1 = load double* %a, align 8
+  %t2 = load double* %a.next, align 8
+  store double %t1, double* %a.next, align 8
+  store double %t2, double* %a, align 8
+  %c = icmp eq i64 %i, %n
+  br i1 %c, label %final, label %for.body
+
+final:                                   ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/metadata-unroll.ll b/test/Transforms/LoopVectorize/metadata-unroll.ll
index 848f1f9..36a2314 100644
--- a/test/Transforms/LoopVectorize/metadata-unroll.ll
+++ b/test/Transforms/LoopVectorize/metadata-unroll.ll
@@ -37,5 +37,5 @@ define void @inc(i32 %n) nounwind uwtable noinline ssp {
   ret void
 }
 
-!0 = metadata !{metadata !0, metadata !1}
-!1 = metadata !{metadata !"llvm.loop.interleave.count", i32 2}
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.interleave.count", i32 2}
diff --git a/test/Transforms/LoopVectorize/metadata-width.ll b/test/Transforms/LoopVectorize/metadata-width.ll
index da0c622..dee4fee 100644
--- a/test/Transforms/LoopVectorize/metadata-width.ll
+++ b/test/Transforms/LoopVectorize/metadata-width.ll
@@ -27,5 +27,5 @@ for.end:                                          ; preds = %for.body, %entry
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
-!0 = metadata !{metadata !0, metadata !1}
-!1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 8}
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 8}
diff --git a/test/Transforms/LoopVectorize/metadata.ll b/test/Transforms/LoopVectorize/metadata.ll
index 14f60b3..a258f7c 100644
--- a/test/Transforms/LoopVectorize/metadata.ll
+++ b/test/Transforms/LoopVectorize/metadata.ll
@@ -27,18 +27,18 @@ for.end:                                          ; preds = %for.body
 ; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa ![[TINT:[0-9]+]]
 ; CHECK: ret i32 0
 
-; CHECK-DAG: ![[TFLT]] = metadata !{metadata ![[TFLT1:[0-9]+]]
-; CHECK-DAG: ![[TFLT1]] = metadata !{metadata !"float"
+; CHECK-DAG: ![[TFLT]] = !{![[TFLT1:[0-9]+]]
+; CHECK-DAG: ![[TFLT1]] = !{!"float"
 
-; CHECK-DAG: ![[TINT]] = metadata !{metadata ![[TINT1:[0-9]+]]
-; CHECK-DAG: ![[TINT1]] = metadata !{metadata !"int"
+; CHECK-DAG: ![[TINT]] = !{![[TINT1:[0-9]+]]
+; CHECK-DAG: ![[TINT1]] = !{!"int"
 
 attributes #0 = { nounwind uwtable }
 
-!0 = metadata !{metadata !1, metadata !1, i64 0}
-!1 = metadata !{metadata !"float", metadata !2, i64 0}
-!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
-!3 = metadata !{metadata !"Simple C/C++ TBAA"}
-!4 = metadata !{metadata !5, metadata !5, i64 0}
-!5 = metadata !{metadata !"int", metadata !2, i64 0}
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !2, i64 0}
 
diff --git a/test/Transforms/LoopVectorize/minmax_reduction.ll b/test/Transforms/LoopVectorize/minmax_reduction.ll
index e73e69d..1984cdd 100644
--- a/test/Transforms/LoopVectorize/minmax_reduction.ll
+++ b/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -516,7 +516,7 @@ for.end:
 }
 
 ; CHECK-LABEL: @unordered_max_red_float(
-; CHECK: fcmp ugt <2 x float>
+; CHECK: fcmp ole <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
@@ -542,7 +542,7 @@ for.end:
 }
 
 ; CHECK-LABEL: @unordered_max_red_float_ge(
-; CHECK: fcmp uge <2 x float>
+; CHECK: fcmp olt <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
@@ -568,7 +568,7 @@ for.end:
 }
 
 ; CHECK-LABEL: @inverted_unordered_max_red_float(
-; CHECK: fcmp ult <2 x float>
+; CHECK: fcmp oge <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
@@ -594,7 +594,7 @@ for.end:
 }
 
 ; CHECK-LABEL: @inverted_unordered_max_red_float_le(
-; CHECK: fcmp ule <2 x float>
+; CHECK: fcmp ogt <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
@@ -727,7 +727,7 @@ for.end:
 }
 
 ; CHECK-LABEL: @unordered_min_red_float(
-; CHECK: fcmp ult <2 x float>
+; CHECK: fcmp oge <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
@@ -753,7 +753,7 @@ for.end:
 }
 
 ; CHECK-LABEL: @unordered_min_red_float_le(
-; CHECK: fcmp ule <2 x float>
+; CHECK: fcmp ogt <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
@@ -779,7 +779,7 @@ for.end:
 }
 
 ; CHECK-LABEL: @inverted_unordered_min_red_float(
-; CHECK: fcmp ugt <2 x float>
+; CHECK: fcmp ole <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
@@ -805,7 +805,7 @@ for.end:
 }
 
 ; CHECK-LABEL: @inverted_unordered_min_red_float_ge(
-; CHECK: fcmp uge <2 x float>
+; CHECK: fcmp olt <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
diff --git a/test/Transforms/LoopVectorize/no_array_bounds.ll b/test/Transforms/LoopVectorize/no_array_bounds.ll
index a39b44f..d3bd755 100644
--- a/test/Transforms/LoopVectorize/no_array_bounds.ll
+++ b/test/Transforms/LoopVectorize/no_array_bounds.ll
@@ -72,30 +72,30 @@ attributes #0 = { nounwind }
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0\001\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"no_array_bounds.cpp", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00test\00test\00\001\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, void (i32*, i32*, i32)* @_Z4testPiS_i, null, null, metadata !2} ; [ DW_TAG_subprogram ]
-!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5.0"}
-!10 = metadata !{i32 4, i32 8, metadata !11, null}
-!11 = metadata !{metadata !"0xb\004\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{metadata !12, metadata !13}
-!13 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
-!14 = metadata !{i32 5, i32 5, metadata !15, null}
-!15 = metadata !{metadata !"0xb\004\0036\000", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 9, i32 8, metadata !17, null}
-!17 = metadata !{metadata !"0xb\009\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{metadata !18, metadata !13, metadata !19}
-!19 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
-!20 = metadata !{i32 10, i32 5, metadata !21, null}
-!21 = metadata !{metadata !"0xb\009\0036\000", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ]
-!22 = metadata !{metadata !23, metadata !23, i64 0}
-!23 = metadata !{metadata !"int", metadata !24, i64 0}
-!24 = metadata !{metadata !"omnipotent char", metadata !25, i64 0}
-!25 = metadata !{metadata !"Simple C/C++ TBAA"}
-!26 = metadata !{i32 12, i32 1, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0\001\00\000\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ]
+!1 = !{!"no_array_bounds.cpp", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00test\00test\00\001\000\001\000\006\00256\001\002", !1, !5, !6, null, void (i32*, i32*, i32)* @_Z4testPiS_i, null, null, !2} ; [ DW_TAG_subprogram ]
+!5 = !{!"0x29", !1} ; [ DW_TAG_file_type ]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ]
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5.0"}
+!10 = !MDLocation(line: 4, column: 8, scope: !11)
+!11 = !{!"0xb\004\003\000", !1, !4} ; [ DW_TAG_lexical_block ]
+!12 = !{!12, !13}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
+!14 = !MDLocation(line: 5, column: 5, scope: !15)
+!15 = !{!"0xb\004\0036\000", !1, !11} ; [ DW_TAG_lexical_block ]
+!16 = !MDLocation(line: 9, column: 8, scope: !17)
+!17 = !{!"0xb\009\003\000", !1, !4} ; [ DW_TAG_lexical_block ]
+!18 = !{!18, !13, !19}
+!19 = !{!"llvm.loop.vectorize.width", i32 1}
+!20 = !MDLocation(line: 10, column: 5, scope: !21)
+!21 = !{!"0xb\009\0036\000", !1, !17} ; [ DW_TAG_lexical_block ]
+!22 = !{!23, !23, i64 0}
+!23 = !{!"int", !24, i64 0}
+!24 = !{!"omnipotent char", !25, i64 0}
+!25 = !{!"Simple C/C++ TBAA"}
+!26 = !MDLocation(line: 12, column: 1, scope: !4)
diff --git a/test/Transforms/LoopVectorize/no_switch.ll b/test/Transforms/LoopVectorize/no_switch.ll
index c989c6b..64aab37 100644
--- a/test/Transforms/LoopVectorize/no_switch.ll
+++ b/test/Transforms/LoopVectorize/no_switch.ll
@@ -59,28 +59,28 @@ attributes #0 = { nounwind }
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0\001\00\006\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./source.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"source.cpp", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00test_switch\00test_switch\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*, i32)* @_Z11test_switchPii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [test_switch]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./source.cpp]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5.0"}
-!10 = metadata !{i32 3, i32 8, metadata !11, null}
-!11 = metadata !{metadata !"0xb\003\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{metadata !12, metadata !13, metadata !13}
-!13 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
-!14 = metadata !{i32 4, i32 5, metadata !15, null}
-!15 = metadata !{metadata !"0xb\003\0036\000", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{metadata !17, metadata !17, i64 0}
-!17 = metadata !{metadata !"int", metadata !18, i64 0}
-!18 = metadata !{metadata !"omnipotent char", metadata !19, i64 0}
-!19 = metadata !{metadata !"Simple C/C++ TBAA"}
-!20 = metadata !{i32 6, i32 7, metadata !21, null}
-!21 = metadata !{metadata !"0xb\004\0018\000", metadata !1, metadata !15} ; [ DW_TAG_lexical_block ]
-!22 = metadata !{i32 7, i32 5, metadata !21, null}
-!23 = metadata !{i32 9, i32 7, metadata !21, null}
-!24 = metadata !{i32 14, i32 1, metadata !4, null}
+!0 = !{!"0x11\004\00clang version 3.5.0\001\00\006\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [./source.cpp] [DW_LANG_C_plus_plus]
+!1 = !{!"source.cpp", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00test_switch\00test_switch\00\001\000\001\000\006\00256\001\001", !1, !5, !6, null, void (i32*, i32)* @_Z11test_switchPii, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [test_switch]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [./source.cpp]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5.0"}
+!10 = !MDLocation(line: 3, column: 8, scope: !11)
+!11 = !{!"0xb\003\003\000", !1, !4} ; [ DW_TAG_lexical_block ]
+!12 = !{!12, !13, !13}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
+!14 = !MDLocation(line: 4, column: 5, scope: !15)
+!15 = !{!"0xb\003\0036\000", !1, !11} ; [ DW_TAG_lexical_block ]
+!16 = !{!17, !17, i64 0}
+!17 = !{!"int", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C/C++ TBAA"}
+!20 = !MDLocation(line: 6, column: 7, scope: !21)
+!21 = !{!"0xb\004\0018\000", !1, !15} ; [ DW_TAG_lexical_block ]
+!22 = !MDLocation(line: 7, column: 5, scope: !21)
+!23 = !MDLocation(line: 9, column: 7, scope: !21)
+!24 = !MDLocation(line: 14, column: 1, scope: !4)
diff --git a/test/Transforms/LoopVectorize/reverse_induction.ll b/test/Transforms/LoopVectorize/reverse_induction.ll
index da02d01..d379606 100644
--- a/test/Transforms/LoopVectorize/reverse_induction.ll
+++ b/test/Transforms/LoopVectorize/reverse_induction.ll
@@ -97,7 +97,7 @@ loopend:
 ; CHECK: vector.body
 ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %normalized.idx = sub i64 %index, 0
-; CHECK: %reverse.idx = sub i64 1023, %normalized.idx
+; CHECK: %offset.idx = sub i64 1023, %normalized.idx
 ; CHECK: trunc i64 %index to i8
 
 define void @reverse_forward_induction_i64_i8() {
@@ -124,7 +124,7 @@ while.end:
 ; CHECK: vector.body:
 ; CHECK:  %index = phi i64 [ 129, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK:  %normalized.idx = sub i64 %index, 129
-; CHECK:  %reverse.idx = sub i64 1023, %normalized.idx
+; CHECK:  %offset.idx = sub i64 1023, %normalized.idx
 ; CHECK:  trunc i64 %index to i8
 
 define void @reverse_forward_induction_i64_i8_signed() {
diff --git a/test/Transforms/LoopVectorize/runtime-check-address-space.ll b/test/Transforms/LoopVectorize/runtime-check-address-space.ll
index 34bbe52..ec56f80 100644
--- a/test/Transforms/LoopVectorize/runtime-check-address-space.ll
+++ b/test/Transforms/LoopVectorize/runtime-check-address-space.ll
@@ -31,25 +31,23 @@ define void @foo(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %n) #0 {
 ; CHECK: ret
 
 entry:
-  br label %for.cond
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body, label %for.end
 
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp slt i32 %i.0, %n
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %idxprom = sext i32 %i.0 to i64
+for.body:                                         ; preds = %entry, %for.body
+  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %idxprom = sext i32 %i.02 to i64
   %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %idxprom
   %0 = load i32 addrspace(1)* %arrayidx, align 4
   %mul = mul nsw i32 %0, 3
-  %idxprom1 = sext i32 %i.0 to i64
+  %idxprom1 = sext i32 %i.02 to i64
   %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %a, i64 %idxprom1
   store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
-  %inc = add nsw i32 %i.0, 1
-  br label %for.cond
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
@@ -60,25 +58,23 @@ define void @bar0(i32* %a, i32 addrspace(1)* %b, i32 %n) #0 {
 ; CHECK: ret
 
 entry:
-  br label %for.cond
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body, label %for.end
 
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp slt i32 %i.0, %n
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %idxprom = sext i32 %i.0 to i64
+for.body:                                         ; preds = %entry, %for.body
+  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %idxprom = sext i32 %i.02 to i64
   %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %idxprom
   %0 = load i32 addrspace(1)* %arrayidx, align 4
   %mul = mul nsw i32 %0, 3
-  %idxprom1 = sext i32 %i.0 to i64
+  %idxprom1 = sext i32 %i.02 to i64
   %arrayidx2 = getelementptr inbounds i32* %a, i64 %idxprom1
   store i32 %mul, i32* %arrayidx2, align 4
-  %inc = add nsw i32 %i.0, 1
-  br label %for.cond
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
@@ -89,25 +85,23 @@ define void @bar1(i32 addrspace(1)* %a, i32* %b, i32 %n) #0 {
 ; CHECK: ret
 
 entry:
-  br label %for.cond
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body, label %for.end
 
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp slt i32 %i.0, %n
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %idxprom = sext i32 %i.0 to i64
+for.body:                                         ; preds = %entry, %for.body
+  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %idxprom = sext i32 %i.02 to i64
   %arrayidx = getelementptr inbounds i32* %b, i64 %idxprom
   %0 = load i32* %arrayidx, align 4
   %mul = mul nsw i32 %0, 3
-  %idxprom1 = sext i32 %i.0 to i64
+  %idxprom1 = sext i32 %i.02 to i64
   %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %a, i64 %idxprom1
   store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
-  %inc = add nsw i32 %i.0, 1
-  br label %for.cond
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
@@ -119,25 +113,23 @@ define void @bar2(i32* noalias %a, i32 addrspace(1)* noalias %b, i32 %n) #0 {
 ; CHECK: ret
 
 entry:
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp slt i32 %i.0, %n
-  br i1 %cmp, label %for.body, label %for.end
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body, label %for.end
 
-for.body:                                         ; preds = %for.cond
-  %idxprom = sext i32 %i.0 to i64
+for.body:                                         ; preds = %entry, %for.body
+  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %idxprom = sext i32 %i.02 to i64
   %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %idxprom
   %0 = load i32 addrspace(1)* %arrayidx, align 4
   %mul = mul nsw i32 %0, 3
-  %idxprom1 = sext i32 %i.0 to i64
+  %idxprom1 = sext i32 %i.02 to i64
   %arrayidx2 = getelementptr inbounds i32* %a, i64 %idxprom1
   store i32 %mul, i32* %arrayidx2, align 4
-  %inc = add nsw i32 %i.0, 1
-  br label %for.cond
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
@@ -149,25 +141,23 @@ define void @arst0(i32* %b, i32 %n) #0 {
 ; CHECK: ret
 
 entry:
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp slt i32 %i.0, %n
-  br i1 %cmp, label %for.body, label %for.end
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body, label %for.end
 
-for.body:                                         ; preds = %for.cond
-  %idxprom = sext i32 %i.0 to i64
+for.body:                                         ; preds = %entry, %for.body
+  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %idxprom = sext i32 %i.02 to i64
   %arrayidx = getelementptr inbounds i32* %b, i64 %idxprom
   %0 = load i32* %arrayidx, align 4
   %mul = mul nsw i32 %0, 3
-  %idxprom1 = sext i32 %i.0 to i64
+  %idxprom1 = sext i32 %i.02 to i64
   %arrayidx2 = getelementptr inbounds [1024 x i32] addrspace(1)* @g_as1, i64 0, i64 %idxprom1
   store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
-  %inc = add nsw i32 %i.0, 1
-  br label %for.cond
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
@@ -180,25 +170,23 @@ define void @arst1(i32* %b, i32 %n) #0 {
 ; CHECK: ret
 
 entry:
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp slt i32 %i.0, %n
-  br i1 %cmp, label %for.body, label %for.end
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body, label %for.end
 
-for.body:                                         ; preds = %for.cond
-  %idxprom = sext i32 %i.0 to i64
+for.body:                                         ; preds = %entry, %for.body
+  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %idxprom = sext i32 %i.02 to i64
   %arrayidx = getelementptr inbounds [1024 x i32] addrspace(1)* @g_as1, i64 0, i64 %idxprom
   %0 = load i32 addrspace(1)* %arrayidx, align 4
   %mul = mul nsw i32 %0, 3
-  %idxprom1 = sext i32 %i.0 to i64
+  %idxprom1 = sext i32 %i.02 to i64
   %arrayidx2 = getelementptr inbounds i32* %b, i64 %idxprom1
   store i32 %mul, i32* %arrayidx2, align 4
-  %inc = add nsw i32 %i.0, 1
-  br label %for.cond
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
@@ -210,25 +198,23 @@ define void @aoeu(i32 %n) #0 {
 ; CHECK: ret
 
 entry:
-  br label %for.cond
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body, label %for.end
 
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp slt i32 %i.0, %n
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %idxprom = sext i32 %i.0 to i64
+for.body:                                         ; preds = %entry, %for.body
+  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %idxprom = sext i32 %i.02 to i64
   %arrayidx = getelementptr inbounds [1024 x i32] addrspace(2)* @q_as2, i64 0, i64 %idxprom
   %0 = load i32 addrspace(2)* %arrayidx, align 4
   %mul = mul nsw i32 %0, 3
-  %idxprom1 = sext i32 %i.0 to i64
+  %idxprom1 = sext i32 %i.02 to i64
   %arrayidx2 = getelementptr inbounds [1024 x i32] addrspace(1)* @g_as1, i64 0, i64 %idxprom1
   store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
-  %inc = add nsw i32 %i.0, 1
-  br label %for.cond
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll b/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
index 56f1f99..12ba3ce 100644
--- a/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
+++ b/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
@@ -8,26 +8,24 @@ define void @add_ints_1_1_1(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addr
 ; CHECK-LABEL: @add_ints_1_1_1(
 ; CHECK: <4 x i32>
 ; CHECK: ret
-entry:
-  br label %for.cond
 
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp ult i64 %i.0, 200
-  br i1 %cmp, label %for.body, label %for.end
+entry:
+  br label %for.body
 
-for.body:                                         ; preds = %for.cond
-  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.0
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.01
   %0 = load i32 addrspace(1)* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %c, i64 %i.0
+  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %c, i64 %i.01
   %1 = load i32 addrspace(1)* %arrayidx1, align 4
   %add = add nsw i32 %0, %1
-  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %a, i64 %i.0
+  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %a, i64 %i.01
   store i32 %add, i32 addrspace(1)* %arrayidx2, align 4
-  %inc = add i64 %i.0, 1
-  br label %for.cond
+  %inc = add i64 %i.01, 1
+  %cmp = icmp ult i64 %inc, 200
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body
   ret void
 }
 
@@ -35,26 +33,24 @@ define void @add_ints_as_1_0_0(i32 addrspace(1)* %a, i32* %b, i32* %c) #0 {
 ; CHECK-LABEL: @add_ints_as_1_0_0(
 ; CHECK-NOT: <4 x i32>
 ; CHECK: ret
-entry:
-  br label %for.cond
 
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp ult i64 %i.0, 200
-  br i1 %cmp, label %for.body, label %for.end
+entry:
+  br label %for.body
 
-for.body:                                         ; preds = %for.cond
-  %arrayidx = getelementptr inbounds i32* %b, i64 %i.0
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %b, i64 %i.01
   %0 = load i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32* %c, i64 %i.0
+  %arrayidx1 = getelementptr inbounds i32* %c, i64 %i.01
   %1 = load i32* %arrayidx1, align 4
   %add = add nsw i32 %0, %1
-  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %a, i64 %i.0
+  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %a, i64 %i.01
   store i32 %add, i32 addrspace(1)* %arrayidx2, align 4
-  %inc = add i64 %i.0, 1
-  br label %for.cond
+  %inc = add i64 %i.01, 1
+  %cmp = icmp ult i64 %inc, 200
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body
   ret void
 }
 
@@ -62,26 +58,24 @@ define void @add_ints_as_0_1_0(i32* %a, i32 addrspace(1)* %b, i32* %c) #0 {
 ; CHECK-LABEL: @add_ints_as_0_1_0(
 ; CHECK-NOT: <4 x i32>
 ; CHECK: ret
-entry:
-  br label %for.cond
 
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp ult i64 %i.0, 200
-  br i1 %cmp, label %for.body, label %for.end
+entry:
+  br label %for.body
 
-for.body:                                         ; preds = %for.cond
-  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.0
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.01
   %0 = load i32 addrspace(1)* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32* %c, i64 %i.0
+  %arrayidx1 = getelementptr inbounds i32* %c, i64 %i.01
   %1 = load i32* %arrayidx1, align 4
   %add = add nsw i32 %0, %1
-  %arrayidx2 = getelementptr inbounds i32* %a, i64 %i.0
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %i.01
   store i32 %add, i32* %arrayidx2, align 4
-  %inc = add i64 %i.0, 1
-  br label %for.cond
+  %inc = add i64 %i.01, 1
+  %cmp = icmp ult i64 %inc, 200
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body
   ret void
 }
 
@@ -89,26 +83,24 @@ define void @add_ints_as_0_1_1(i32* %a, i32 addrspace(1)* %b, i32 addrspace(1)*
 ; CHECK-LABEL: @add_ints_as_0_1_1(
 ; CHECK-NOT: <4 x i32>
 ; CHECK: ret
-entry:
-  br label %for.cond
 
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp ult i64 %i.0, 200
-  br i1 %cmp, label %for.body, label %for.end
+entry:
+  br label %for.body
 
-for.body:                                         ; preds = %for.cond
-  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.0
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.01
   %0 = load i32 addrspace(1)* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %c, i64 %i.0
+  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %c, i64 %i.01
   %1 = load i32 addrspace(1)* %arrayidx1, align 4
   %add = add nsw i32 %0, %1
-  %arrayidx2 = getelementptr inbounds i32* %a, i64 %i.0
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %i.01
   store i32 %add, i32* %arrayidx2, align 4
-  %inc = add i64 %i.0, 1
-  br label %for.cond
+  %inc = add i64 %i.01, 1
+  %cmp = icmp ult i64 %inc, 200
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body
   ret void
 }
 
@@ -116,26 +108,24 @@ define void @add_ints_as_0_1_2(i32* %a, i32 addrspace(1)* %b, i32 addrspace(2)*
 ; CHECK-LABEL: @add_ints_as_0_1_2(
 ; CHECK-NOT: <4 x i32>
 ; CHECK: ret
-entry:
-  br label %for.cond
 
-for.cond:                                         ; preds = %for.body, %entry
-  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
-  %cmp = icmp ult i64 %i.0, 200
-  br i1 %cmp, label %for.body, label %for.end
+entry:
+  br label %for.body
 
-for.body:                                         ; preds = %for.cond
-  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.0
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.01
   %0 = load i32 addrspace(1)* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32 addrspace(2)* %c, i64 %i.0
+  %arrayidx1 = getelementptr inbounds i32 addrspace(2)* %c, i64 %i.01
   %1 = load i32 addrspace(2)* %arrayidx1, align 4
   %add = add nsw i32 %0, %1
-  %arrayidx2 = getelementptr inbounds i32* %a, i64 %i.0
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %i.01
   store i32 %add, i32* %arrayidx2, align 4
-  %inc = add i64 %i.0, 1
-  br label %for.cond
+  %inc = add i64 %i.01, 1
+  %cmp = icmp ult i64 %inc, 200
+  br i1 %cmp, label %for.body, label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.body
   ret void
 }
 
diff --git a/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
index 1bce3f8..5154771 100644
--- a/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
+++ b/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
@@ -106,9 +106,9 @@ declare i32 @fn2(double) #1
 attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"double", metadata !1}
-!4 = metadata !{metadata !0, metadata !0, i64 0}
-!5 = metadata !{metadata !3, metadata !3, i64 0}
+!0 = !{!"int", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"double", !1}
+!4 = !{!0, !0, i64 0}
+!5 = !{!3, !3, i64 0}
diff --git a/test/Transforms/LoopVectorize/tbaa-nodep.ll b/test/Transforms/LoopVectorize/tbaa-nodep.ll
index 5cd104c..be3e74f 100644
--- a/test/Transforms/LoopVectorize/tbaa-nodep.ll
+++ b/test/Transforms/LoopVectorize/tbaa-nodep.ll
@@ -93,10 +93,10 @@ for.end:                                          ; preds = %for.body
 
 attributes #0 = { nounwind uwtable }
 
-!0 = metadata !{metadata !1, metadata !1, i64 0}
-!1 = metadata !{metadata !"float", metadata !2, i64 0}
-!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
-!3 = metadata !{metadata !"Simple C/C++ TBAA"}
-!4 = metadata !{metadata !5, metadata !5, i64 0}
-!5 = metadata !{metadata !"int", metadata !2, i64 0}
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !2, i64 0}
 
diff --git a/test/Transforms/LoopVectorize/vect.omp.persistence.ll b/test/Transforms/LoopVectorize/vect.omp.persistence.ll
index b0fe7a5..ea7be9c 100644
--- a/test/Transforms/LoopVectorize/vect.omp.persistence.ll
+++ b/test/Transforms/LoopVectorize/vect.omp.persistence.ll
@@ -61,8 +61,8 @@ for.end:
   ret void
 }
 
-!1 = metadata !{metadata !1, metadata !2}
-!2 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
+!1 = !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 ;
 ; Test #2
@@ -84,5 +84,5 @@ return:
   ret i32 0
 }
 
-!3 = metadata !{metadata !3, metadata !4}
-!4 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
+!3 = !{!3, !4}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/test/Transforms/LoopVectorize/vect.stats.ll b/test/Transforms/LoopVectorize/vect.stats.ll
index 556da45..c5b6e64 100644
--- a/test/Transforms/LoopVectorize/vect.stats.ll
+++ b/test/Transforms/LoopVectorize/vect.stats.ll
@@ -13,53 +13,47 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define void @vectorized(float* nocapture %a, i64 %size) {
 entry:
-  %cmp1 = icmp sgt i64 %size, 0
-  br i1 %cmp1, label %for.header, label %for.end
-
-for.header:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %cmp2 = icmp sgt i64 %indvars.iv, %size
-  br i1 %cmp2, label %for.end, label %for.body
-
-for.body:
-
-  %arrayidx = getelementptr inbounds float* %a, i64 %indvars.iv
+  %cmp1 = icmp sle i64 %size, 0
+  %cmp21 = icmp sgt i64 0, %size
+  %or.cond = or i1 %cmp1, %cmp21
+  br i1 %or.cond, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv2 = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float* %a, i64 %indvars.iv2
   %0 = load float* %arrayidx, align 4
   %mul = fmul float %0, %0
   store float %mul, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv2, 1
+  %cmp2 = icmp sgt i64 %indvars.iv.next, %size
+  br i1 %cmp2, label %for.end, label %for.body
 
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  br label %for.header
-
-for.end:
+for.end:                                          ; preds = %entry, %for.body
   ret void
 }
 
 define void @not_vectorized(float* nocapture %a, i64 %size) {
 entry:
-  %cmp1 = icmp sgt i64 %size, 0
-  br i1 %cmp1, label %for.header, label %for.end
-
-for.header:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %cmp2 = icmp sgt i64 %indvars.iv, %size
-  br i1 %cmp2, label %for.end, label %for.body
-
-for.body:
-
-  %0 = add nsw i64 %indvars.iv, -5
+  %cmp1 = icmp sle i64 %size, 0
+  %cmp21 = icmp sgt i64 0, %size
+  %or.cond = or i1 %cmp1, %cmp21
+  br i1 %or.cond, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv2 = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %0 = add nsw i64 %indvars.iv2, -5
   %arrayidx = getelementptr inbounds float* %a, i64 %0
   %1 = load float* %arrayidx, align 4
-  %2 = add nsw i64 %indvars.iv, 2
+  %2 = add nsw i64 %indvars.iv2, 2
   %arrayidx2 = getelementptr inbounds float* %a, i64 %2
   %3 = load float* %arrayidx2, align 4
   %mul = fmul float %1, %3
-  %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv
+  %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv2
   store float %mul, float* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv2, 1
+  %cmp2 = icmp sgt i64 %indvars.iv.next, %size
+  br i1 %cmp2, label %for.end, label %for.body
 
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  br label %for.header
-
-for.end:
+for.end:                                          ; preds = %entry, %for.body
   ret void
 }
diff --git a/test/Transforms/LoopVectorize/vectorize-once.ll b/test/Transforms/LoopVectorize/vectorize-once.ll
index cee4b16..a9b2a53 100644
--- a/test/Transforms/LoopVectorize/vectorize-once.ll
+++ b/test/Transforms/LoopVectorize/vectorize-once.ll
@@ -68,10 +68,10 @@ _ZSt10accumulateIPiiET0_T_S2_S1_.exit:            ; preds = %for.body.i, %entry
 
 attributes #0 = { nounwind readonly ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="pic" "ssp-buffers-size"="8" }
 
-; CHECK: !0 = metadata !{metadata !0, metadata !1, metadata !2}
-; CHECK: !1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
-; CHECK: !2 = metadata !{metadata !"llvm.loop.interleave.count", i32 1}
-; CHECK: !3 = metadata !{metadata !3, metadata !1, metadata !2}
+; CHECK: !0 = distinct !{!0, !1, !2}
+; CHECK: !1 = !{!"llvm.loop.vectorize.width", i32 1}
+; CHECK: !2 = !{!"llvm.loop.interleave.count", i32 1}
+; CHECK: !3 = distinct !{!3, !1, !2}
 
-!0 = metadata !{metadata !0, metadata !1}
-!1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 1}
diff --git a/test/Transforms/LoopVectorize/version-mem-access.ll b/test/Transforms/LoopVectorize/version-mem-access.ll
index 7ac2fca..37145aa 100644
--- a/test/Transforms/LoopVectorize/version-mem-access.ll
+++ b/test/Transforms/LoopVectorize/version-mem-access.ll
@@ -2,10 +2,16 @@
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
+; Check that we version this loop with speculating the value 1 for symbolic
+; strides.  This also checks that the symbolic stride information is correctly
+; propagated to the memcheck generation.  Without this the loop wouldn't
+; vectorize because we couldn't determine the array bounds for the required
+; memchecks.
+
 ; CHECK-LABEL: test
-define void @test(i32* noalias %A, i64 %AStride,
-                  i32* noalias %B, i32 %BStride,
-                  i32* noalias %C, i64 %CStride, i32 %N) {
+define void @test(i32*  %A, i64 %AStride,
+                  i32*  %B, i32 %BStride,
+                  i32*  %C, i64 %CStride, i32 %N) {
 entry:
   %cmp13 = icmp eq i32 %N, 0
   br i1 %cmp13, label %for.end, label %for.body.preheader
diff --git a/test/Transforms/LowerBitSets/constant.ll b/test/Transforms/LowerBitSets/constant.ll
new file mode 100644
index 0000000..230c57c
--- /dev/null
+++ b/test/Transforms/LowerBitSets/constant.ll
@@ -0,0 +1,34 @@
+; RUN: opt -S -lowerbitsets < %s | FileCheck %s
+
+target datalayout = "e-p:32:32"
+
+@a = constant i32 1
+@b = constant [2 x i32] [i32 2, i32 3]
+
+!0 = !{!"bitset1", i32* @a, i32 0}
+!1 = !{!"bitset1", [2 x i32]* @b, i32 4}
+
+!llvm.bitsets = !{ !0, !1 }
+
+declare i1 @llvm.bitset.test(i8* %ptr, metadata %bitset) nounwind readnone
+
+; CHECK: @foo(
+define i1 @foo() {
+  ; CHECK: ret i1 true
+  %x = call i1 @llvm.bitset.test(i8* bitcast (i32* @a to i8*), metadata !"bitset1")
+  ret i1 %x
+}
+
+; CHECK: @bar(
+define i1 @bar() {
+  ; CHECK: ret i1 true
+  %x = call i1 @llvm.bitset.test(i8* bitcast (i32* getelementptr ([2 x i32]* @b, i32 0, i32 1) to i8*), metadata !"bitset1")
+  ret i1 %x
+}
+
+; CHECK: @baz(
+define i1 @baz() {
+  ; CHECK-NOT: ret i1 true
+  %x = call i1 @llvm.bitset.test(i8* bitcast (i32* getelementptr ([2 x i32]* @b, i32 0, i32 0) to i8*), metadata !"bitset1")
+  ret i1 %x
+}
diff --git a/test/Transforms/LowerBitSets/layout.ll b/test/Transforms/LowerBitSets/layout.ll
new file mode 100644
index 0000000..a0c6e77
--- /dev/null
+++ b/test/Transforms/LowerBitSets/layout.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -lowerbitsets < %s | FileCheck %s
+
+target datalayout = "e-p:32:32"
+
+; Tests that this set of globals is laid out according to our layout algorithm
+; (see GlobalLayoutBuilder in include/llvm/Transforms/IPO/LowerBitSets.h).
+; The chosen layout in this case is a, e, b, d, c.
+
+; CHECK: private constant { i32, [0 x i8], i32, [0 x i8], i32, [0 x i8], i32, [0 x i8], i32 } { i32 1, [0 x i8] zeroinitializer, i32 5, [0 x i8] zeroinitializer, i32 2, [0 x i8] zeroinitializer, i32 4, [0 x i8] zeroinitializer, i32 3 }
+@a = constant i32 1
+@b = constant i32 2
+@c = constant i32 3
+@d = constant i32 4
+@e = constant i32 5
+
+!0 = !{!"bitset1", i32* @a, i32 0}
+!1 = !{!"bitset1", i32* @b, i32 0}
+!2 = !{!"bitset1", i32* @c, i32 0}
+
+!3 = !{!"bitset2", i32* @b, i32 0}
+!4 = !{!"bitset2", i32* @d, i32 0}
+
+!5 = !{!"bitset3", i32* @a, i32 0}
+!6 = !{!"bitset3", i32* @e, i32 0}
+
+!llvm.bitsets = !{ !0, !1, !2, !3, !4, !5, !6 }
+
+declare i1 @llvm.bitset.test(i8* %ptr, metadata %bitset) nounwind readnone
+
+define void @foo() {
+  %x = call i1 @llvm.bitset.test(i8* undef, metadata !"bitset1")
+  %y = call i1 @llvm.bitset.test(i8* undef, metadata !"bitset2")
+  %z = call i1 @llvm.bitset.test(i8* undef, metadata !"bitset3")
+  ret void
+}
diff --git a/test/Transforms/LowerBitSets/simple.ll b/test/Transforms/LowerBitSets/simple.ll
new file mode 100644
index 0000000..0928524
--- /dev/null
+++ b/test/Transforms/LowerBitSets/simple.ll
@@ -0,0 +1,122 @@
+; RUN: opt -S -lowerbitsets < %s | FileCheck %s
+; RUN: opt -S -O3 < %s | FileCheck -check-prefix=CHECK-NODISCARD %s
+
+target datalayout = "e-p:32:32"
+
+; CHECK: [[G:@[^ ]*]] = private constant { i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] } { i32 1, [0 x i8] zeroinitializer, [63 x i32] zeroinitializer, [4 x i8] zeroinitializer, i32 3, [0 x i8] zeroinitializer, [2 x i32] [i32 4, i32 5] }
+@a = constant i32 1
+@b = constant [63 x i32] zeroinitializer
+@c = constant i32 3
+@d = constant [2 x i32] [i32 4, i32 5]
+
+; Offset 0, 4 byte alignment
+; CHECK: @bitset1.bits = private constant [9 x i8] c"\03\00\00\00\00\00\00\00\08"
+!0 = !{!"bitset1", i32* @a, i32 0}
+; CHECK-NODISCARD-DAG: !{!"bitset1", i32* @a, i32 0}
+!1 = !{!"bitset1", [63 x i32]* @b, i32 0}
+; CHECK-NODISCARD-DAG: !{!"bitset1", [63 x i32]* @b, i32 0}
+!2 = !{!"bitset1", [2 x i32]* @d, i32 4}
+; CHECK-NODISCARD-DAG: !{!"bitset1", [2 x i32]* @d, i32 4}
+
+; Offset 4, 256 byte alignment
+; CHECK: @bitset2.bits = private constant [1 x i8] c"\03"
+!3 = !{!"bitset2", [63 x i32]* @b, i32 0}
+; CHECK-NODISCARD-DAG: !{!"bitset2", [63 x i32]* @b, i32 0}
+!4 = !{!"bitset2", i32* @c, i32 0}
+; CHECK-NODISCARD-DAG: !{!"bitset2", i32* @c, i32 0}
+
+; Offset 0, 4 byte alignment
+; CHECK: @bitset3.bits = private constant [9 x i8] c"\01\00\00\00\00\00\00\00\02"
+!5 = !{!"bitset3", i32* @a, i32 0}
+; CHECK-NODISCARD-DAG: !{!"bitset3", i32* @a, i32 0}
+!6 = !{!"bitset3", i32* @c, i32 0}
+; CHECK-NODISCARD-DAG: !{!"bitset3", i32* @c, i32 0}
+
+; Entries whose second operand is null (the result of a global being DCE'd)
+; should be ignored.
+!7 = !{!"bitset2", null, i32 0}
+
+!llvm.bitsets = !{ !0, !1, !2, !3, !4, !5, !6, !7 }
+
+; CHECK: @a = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 0)
+; CHECK: @b = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 2)
+; CHECK: @c = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 4)
+; CHECK: @d = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 6)
+
+declare i1 @llvm.bitset.test(i8* %ptr, metadata %bitset) nounwind readnone
+
+; CHECK: @foo(i32* [[A0:%[^ ]*]])
+define i1 @foo(i32* %p) {
+  ; CHECK-NOT: llvm.bitset.test
+
+  ; CHECK: [[R0:%[^ ]*]] = bitcast i32* [[A0]] to i8*
+  %pi8 = bitcast i32* %p to i8*
+  ; CHECK: [[R1:%[^ ]*]] = ptrtoint i8* [[R0]] to i32
+  ; CHECK: [[R2:%[^ ]*]] = sub i32 [[R1]], ptrtoint ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]] to i32)
+  ; CHECK: [[R3:%[^ ]*]] = lshr i32 [[R2]], 2
+  ; CHECK: [[R4:%[^ ]*]] = shl i32 [[R2]], 30
+  ; CHECK: [[R5:%[^ ]*]] = or i32 [[R3]], [[R4]]
+  ; CHECK: [[R6:%[^ ]*]] = icmp ult i32 [[R5]], 68
+  ; CHECK: br i1 [[R6]]
+
+  ; CHECK: [[R8:%[^ ]*]] = lshr i32 [[R5]], 5
+  ; CHECK: [[R9:%[^ ]*]] = getelementptr i32* bitcast ([9 x i8]* @bitset1.bits to i32*), i32 [[R8]]
+  ; CHECK: [[R10:%[^ ]*]] = load i32* [[R9]]
+  ; CHECK: [[R11:%[^ ]*]] = and i32 [[R5]], 31
+  ; CHECK: [[R12:%[^ ]*]] = shl i32 1, [[R11]]
+  ; CHECK: [[R13:%[^ ]*]] = and i32 [[R10]], [[R12]]
+  ; CHECK: [[R14:%[^ ]*]] = icmp ne i32 [[R13]], 0
+
+  ; CHECK: [[R16:%[^ ]*]] = phi i1 [ false, {{%[^ ]*}} ], [ [[R14]], {{%[^ ]*}} ]
+  %x = call i1 @llvm.bitset.test(i8* %pi8, metadata !"bitset1")
+
+  ; CHECK-NOT: llvm.bitset.test
+  %y = call i1 @llvm.bitset.test(i8* %pi8, metadata !"bitset1")
+
+  ; CHECK: ret i1 [[R16]]
+  ret i1 %x
+}
+
+; CHECK: @bar(i32* [[B0:%[^ ]*]])
+define i1 @bar(i32* %p) {
+  ; CHECK: [[S0:%[^ ]*]] = bitcast i32* [[B0]] to i8*
+  %pi8 = bitcast i32* %p to i8*
+  ; CHECK: [[S1:%[^ ]*]] = ptrtoint i8* [[S0]] to i32
+  ; CHECK: [[S2:%[^ ]*]] = sub i32 [[S1]], add (i32 ptrtoint ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]] to i32), i32 4)
+  ; CHECK: [[S3:%[^ ]*]] = lshr i32 [[S2]], 8
+  ; CHECK: [[S4:%[^ ]*]] = shl i32 [[S2]], 24
+  ; CHECK: [[S5:%[^ ]*]] = or i32 [[S3]], [[S4]]
+  ; CHECK: [[S6:%[^ ]*]] = icmp ult i32 [[S5]], 2
+  %x = call i1 @llvm.bitset.test(i8* %pi8, metadata !"bitset2")
+
+  ; CHECK: ret i1 [[S6]]
+  ret i1 %x
+}
+
+; CHECK: @baz(i32* [[C0:%[^ ]*]])
+define i1 @baz(i32* %p) {
+  ; CHECK: [[T0:%[^ ]*]] = bitcast i32* [[C0]] to i8*
+  %pi8 = bitcast i32* %p to i8*
+  ; CHECK: [[T1:%[^ ]*]] = ptrtoint i8* [[T0]] to i32
+  ; CHECK: [[T2:%[^ ]*]] = sub i32 [[T1]], ptrtoint ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]] to i32)
+  ; CHECK: [[T3:%[^ ]*]] = lshr i32 [[T2]], 2
+  ; CHECK: [[T4:%[^ ]*]] = shl i32 [[T2]], 30
+  ; CHECK: [[T5:%[^ ]*]] = or i32 [[T3]], [[T4]]
+  ; CHECK: [[T6:%[^ ]*]] = icmp ult i32 [[T5]], 66
+  ; CHECK: br i1 [[T6]]
+
+  ; CHECK: [[T8:%[^ ]*]] = lshr i32 [[T5]], 5
+  ; CHECK: [[T9:%[^ ]*]] = getelementptr i32* bitcast ([9 x i8]* @bitset3.bits to i32*), i32 [[T8]]
+  ; CHECK: [[T10:%[^ ]*]] = load i32* [[T9]]
+  ; CHECK: [[T11:%[^ ]*]] = and i32 [[T5]], 31
+  ; CHECK: [[T12:%[^ ]*]] = shl i32 1, [[T11]]
+  ; CHECK: [[T13:%[^ ]*]] = and i32 [[T10]], [[T12]]
+  ; CHECK: [[T14:%[^ ]*]] = icmp ne i32 [[T13]], 0
+
+  ; CHECK: [[T16:%[^ ]*]] = phi i1 [ false, {{%[^ ]*}} ], [ [[T14]], {{%[^ ]*}} ]
+  %x = call i1 @llvm.bitset.test(i8* %pi8, metadata !"bitset3")
+  ; CHECK: ret i1 [[T16]]
+  ret i1 %x
+}
+
+; CHECK-NOT: !llvm.bitsets
diff --git a/test/Transforms/LowerBitSets/single-offset.ll b/test/Transforms/LowerBitSets/single-offset.ll
new file mode 100644
index 0000000..57194f4
--- /dev/null
+++ b/test/Transforms/LowerBitSets/single-offset.ll
@@ -0,0 +1,40 @@
+; RUN: opt -S -lowerbitsets < %s | FileCheck %s
+
+target datalayout = "e-p:32:32"
+
+; CHECK: [[G:@[^ ]*]] = private constant { i32, [0 x i8], i32 }
+@a = constant i32 1
+@b = constant i32 2
+
+!0 = !{!"bitset1", i32* @a, i32 0}
+!1 = !{!"bitset1", i32* @b, i32 0}
+!2 = !{!"bitset2", i32* @a, i32 0}
+!3 = !{!"bitset3", i32* @b, i32 0}
+
+!llvm.bitsets = !{ !0, !1, !2, !3 }
+
+declare i1 @llvm.bitset.test(i8* %ptr, metadata %bitset) nounwind readnone
+
+; CHECK: @foo(i8* [[A0:%[^ ]*]])
+define i1 @foo(i8* %p) {
+  ; CHECK: [[R0:%[^ ]*]] = ptrtoint i8* [[A0]] to i32
+  ; CHECK: [[R1:%[^ ]*]] = icmp eq i32 [[R0]], ptrtoint ({ i32, [0 x i8], i32 }* [[G]] to i32)
+  %x = call i1 @llvm.bitset.test(i8* %p, metadata !"bitset2")
+  ; CHECK: ret i1 [[R1]]
+  ret i1 %x
+}
+
+; CHECK: @bar(i8* [[B0:%[^ ]*]])
+define i1 @bar(i8* %p) {
+  ; CHECK: [[S0:%[^ ]*]] = ptrtoint i8* [[B0]] to i32
+  ; CHECK: [[S1:%[^ ]*]] = icmp eq i32 [[S0]], add (i32 ptrtoint ({ i32, [0 x i8], i32 }* [[G]] to i32), i32 4)
+  %x = call i1 @llvm.bitset.test(i8* %p, metadata !"bitset3")
+  ; CHECK: ret i1 [[S1]]
+  ret i1 %x
+}
+
+; CHECK: @x(
+define i1 @x(i8* %p) {
+  %x = call i1 @llvm.bitset.test(i8* %p, metadata !"bitset1")
+  ret i1 %x
+}
diff --git a/test/Transforms/LowerExpectIntrinsic/basic.ll b/test/Transforms/LowerExpectIntrinsic/basic.ll
index e184cb0..f4326c8 100644
--- a/test/Transforms/LowerExpectIntrinsic/basic.ll
+++ b/test/Transforms/LowerExpectIntrinsic/basic.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -lower-expect -strip-dead-prototypes -S -o - < %s | FileCheck %s
+; RUN: opt -S -passes=lower-expect < %s | opt -strip-dead-prototypes -S | FileCheck %s
 
 ; CHECK-LABEL: @test1(
 define i32 @test1(i32 %x) nounwind uwtable ssp {
@@ -274,7 +275,7 @@ return:                                           ; preds = %if.end, %if.then
 
 declare i1 @llvm.expect.i1(i1, i1) nounwind readnone
 
-; CHECK: !0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
-; CHECK: !1 = metadata !{metadata !"branch_weights", i32 4, i32 64}
-; CHECK: !2 = metadata !{metadata !"branch_weights", i32 4, i32 64, i32 4}
-; CHECK: !3 = metadata !{metadata !"branch_weights", i32 64, i32 4, i32 4}
+; CHECK: !0 = !{!"branch_weights", i32 64, i32 4}
+; CHECK: !1 = !{!"branch_weights", i32 4, i32 64}
+; CHECK: !2 = !{!"branch_weights", i32 4, i32 64, i32 4}
+; CHECK: !3 = !{!"branch_weights", i32 64, i32 4, i32 4}
diff --git a/test/Transforms/LowerSwitch/2014-06-11-SwitchDefaultUnreachableOpt.ll b/test/Transforms/LowerSwitch/2014-06-11-SwitchDefaultUnreachableOpt.ll
index 0f73721..ecdd767 100644
--- a/test/Transforms/LowerSwitch/2014-06-11-SwitchDefaultUnreachableOpt.ll
+++ b/test/Transforms/LowerSwitch/2014-06-11-SwitchDefaultUnreachableOpt.ll
@@ -1,5 +1,8 @@
 ; RUN: opt < %s -lowerswitch -S | FileCheck %s
-; CHECK-NOT: {{.*}}icmp eq{{.*}}
+;
+; The switch is lowered with a single icmp.
+; CHECK: icmp
+; CHECK-NOT: icmp
 ;
 ;int foo(int a) {
 ;
@@ -14,7 +17,7 @@
 ;
 ;}
 
-define i32 @foo(i32 %a) nounwind ssp uwtable {
+define i32 @foo(i32 %a) {
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
   store i32 %a, i32* %2, align 4
diff --git a/test/Transforms/LowerSwitch/fold-popular-case-to-unreachable-default.ll b/test/Transforms/LowerSwitch/fold-popular-case-to-unreachable-default.ll
new file mode 100644
index 0000000..54929c5
--- /dev/null
+++ b/test/Transforms/LowerSwitch/fold-popular-case-to-unreachable-default.ll
@@ -0,0 +1,110 @@
+; RUN: opt %s -lowerswitch -S | FileCheck %s
+
+define void @foo(i32 %x, i32* %p) {
+; Cases 2 and 4 are removed and become the new default case.
+; It is now enough to use two icmps to lower the switch.
+;
+; CHECK-LABEL: @foo
+; CHECK:       icmp slt i32 %x, 5
+; CHECK:       icmp eq i32 %x, 1
+; CHECK-NOT:   icmp
+;
+entry:
+  switch i32 %x, label %default [
+    i32 1, label %bb0
+    i32 2, label %popular
+    i32 4, label %popular
+    i32 5, label %bb1
+  ]
+bb0:
+  store i32 0, i32* %p
+  br label %exit
+bb1:
+  store i32 1, i32* %p
+  br label %exit
+popular:
+  store i32 2, i32* %p
+  br label %exit
+exit:
+  ret void
+default:
+  unreachable
+}
+
+define void @unreachable_gap(i64 %x, i32* %p) {
+; Cases 6 and INT64_MAX become the new default, but we still exploit the fact
+; that 3-4 is unreachable, so four icmps is enough.
+
+; CHECK-LABEL: @unreachable_gap
+; CHECK:       icmp slt i64 %x, 2
+; CHECK:       icmp slt i64 %x, 5
+; CHECK:       icmp eq  i64 %x, 5
+; CHECK:       icmp slt i64 %x, 1
+; CHECK-NOT:   icmp
+
+entry:
+  switch i64 %x, label %default [
+    i64 -9223372036854775808, label %bb0
+    i64 1, label %bb1
+    i64 2, label %bb2
+    i64 5, label %bb3
+    i64 6, label %bb4
+    i64 9223372036854775807, label %bb4
+  ]
+bb0:
+  store i32 0, i32* %p
+  br label %exit
+bb1:
+  store i32 1, i32* %p
+  br label %exit
+bb2:
+  store i32 2, i32* %p
+  br label %exit
+bb3:
+  store i32 3, i32* %p
+  br label %exit
+bb4:
+  store i32 4, i32* %p
+  br label %exit
+exit:
+  ret void
+default:
+  unreachable
+}
+
+
+
+define void @nocases(i32 %x, i32* %p) {
+; Don't fall over when there are no cases.
+;
+; CHECK-LABEL: @nocases
+; CHECK-LABEL: entry
+; CHECK-NEXT:  br label %default
+;
+entry:
+  switch i32 %x, label %default [
+  ]
+default:
+  unreachable
+}
+
+define void @nocasesleft(i32 %x, i32* %p) {
+; Cases 2 and 4 are removed and we are left with no cases.
+;
+; CHECK-LABEL: @nocasesleft
+; CHECK-LABEL: entry
+; CHECK-NEXT:  br label %popular
+;
+entry:
+  switch i32 %x, label %default [
+    i32 2, label %popular
+    i32 4, label %popular
+  ]
+popular:
+  store i32 2, i32* %p
+  br label %exit
+exit:
+  ret void
+default:
+  unreachable
+}
diff --git a/test/Transforms/Mem2Reg/ConvertDebugInfo.ll b/test/Transforms/Mem2Reg/ConvertDebugInfo.ll
index b2d094f..a7369c0 100644
--- a/test/Transforms/Mem2Reg/ConvertDebugInfo.ll
+++ b/test/Transforms/Mem2Reg/ConvertDebugInfo.ll
@@ -7,13 +7,13 @@ entry:
   %retval = alloca double                         ; <double*> [#uses=2]
   %0 = alloca double                              ; <double*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{i32* %i_addr}, metadata !0, metadata !{}), !dbg !8
-; CHECK: call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata ![[IVAR:[0-9]*]], metadata {{.*}})
-; CHECK: call void @llvm.dbg.value(metadata !{double %j}, i64 0, metadata ![[JVAR:[0-9]*]], metadata {{.*}})
+  call void @llvm.dbg.declare(metadata i32* %i_addr, metadata !0, metadata !{}), !dbg !8
+; CHECK: call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata ![[IVAR:[0-9]*]], metadata {{.*}})
+; CHECK: call void @llvm.dbg.value(metadata double %j, i64 0, metadata ![[JVAR:[0-9]*]], metadata {{.*}})
 ; CHECK: ![[IVAR]] = {{.*}} ; [ DW_TAG_arg_variable ] [i]
 ; CHECK: ![[JVAR]] = {{.*}} ; [ DW_TAG_arg_variable ] [j]
   store i32 %i, i32* %i_addr
-  call void @llvm.dbg.declare(metadata !{double* %j_addr}, metadata !9, metadata !{}), !dbg !8
+  call void @llvm.dbg.declare(metadata double* %j_addr, metadata !9, metadata !{}), !dbg !8
   store double %j, double* %j_addr
   %1 = load i32* %i_addr, align 4, !dbg !10       ; <i32> [#uses=1]
   %2 = add nsw i32 %1, 1, !dbg !10                ; <i32> [#uses=1]
@@ -35,18 +35,18 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!14}
 
-!0 = metadata !{metadata !"0x101\00i\002\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00testfunc\00testfunc\00testfunc\002\000\001\000\006\000\000\002", metadata !12, metadata !2, metadata !4, null, double (i32, double)* @testfunc, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !12, metadata !13, metadata !13, null, null, null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6, metadata !7, metadata !6}
-!6 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !12, metadata !2} ; [ DW_TAG_base_type ]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !2} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 2, i32 0, metadata !1, null}
-!9 = metadata !{metadata !"0x101\00j\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{i32 3, i32 0, metadata !11, null}
-!11 = metadata !{metadata !"0xb\002\000\000", metadata !12, metadata !1} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{metadata !"testfunc.c", metadata !"/tmp"}
-!13 = metadata !{i32 0}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00i\002\000", !1, !2, !7} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00testfunc\00testfunc\00testfunc\002\000\001\000\006\000\000\002", !12, !2, !4, null, double (i32, double)* @testfunc, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !12} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !12, !13, !13, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !12, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6, !7, !6}
+!6 = !{!"0x24\00double\000\0064\0064\000\000\004", !12, !2} ; [ DW_TAG_base_type ]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", !12, !2} ; [ DW_TAG_base_type ]
+!8 = !MDLocation(line: 2, scope: !1)
+!9 = !{!"0x101\00j\002\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!10 = !MDLocation(line: 3, scope: !11)
+!11 = !{!"0xb\002\000\000", !12, !1} ; [ DW_TAG_lexical_block ]
+!12 = !{!"testfunc.c", !"/tmp"}
+!13 = !{i32 0}
+!14 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll b/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll
index b7b9dc7..76d2a1a 100644
--- a/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll
+++ b/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll
@@ -11,14 +11,14 @@ entry:
   %z_addr.i = alloca i8*                          ; <i8**> [#uses=2]
   %a_addr = alloca i32                            ; <i32*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{i32* %a_addr}, metadata !0, metadata !{}), !dbg !7
+  call void @llvm.dbg.declare(metadata i32* %a_addr, metadata !0, metadata !{}), !dbg !7
   store i32 %a, i32* %a_addr
   %0 = load i32* %a_addr, align 4, !dbg !8        ; <i32> [#uses=1]
-  call void @llvm.dbg.declare(metadata !{i32* %x_addr.i}, metadata !9, metadata !{}) nounwind, !dbg !15
+  call void @llvm.dbg.declare(metadata i32* %x_addr.i, metadata !9, metadata !{}) nounwind, !dbg !15
   store i32 %0, i32* %x_addr.i
-  call void @llvm.dbg.declare(metadata !{i64* %y_addr.i}, metadata !16, metadata !{}) nounwind, !dbg !15
+  call void @llvm.dbg.declare(metadata i64* %y_addr.i, metadata !16, metadata !{}) nounwind, !dbg !15
   store i64 55, i64* %y_addr.i
-  call void @llvm.dbg.declare(metadata !{i8** %z_addr.i}, metadata !17, metadata !{}) nounwind, !dbg !15
+  call void @llvm.dbg.declare(metadata i8** %z_addr.i, metadata !17, metadata !{}) nounwind, !dbg !15
   store i8* bitcast (void (i32)* @baz to i8*), i8** %z_addr.i
   %1 = load i32* %x_addr.i, align 4, !dbg !18     ; <i32> [#uses=1]
   %2 = load i64* %y_addr.i, align 8, !dbg !18     ; <i64> [#uses=1]
@@ -32,26 +32,26 @@ return:                                           ; preds = %entry
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!22}
-!0 = metadata !{metadata !"0x101\00a\008\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{metadata !"0x2e\00baz\00baz\00baz\008\000\001\000\006\000\000\008", metadata !20, metadata !2, metadata !4, null, void (i32)* @baz, null, null, null} ; [ DW_TAG_subprogram ]
-!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !20, metadata !21, metadata !21, null, null, null} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{null, metadata !6}
-!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !20, metadata !2} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 8, i32 0, metadata !1, null}
-!8 = metadata !{i32 9, i32 0, metadata !1, null}
-!9 = metadata !{metadata !"0x101\00x\004\000", metadata !10, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\001\001\000\006\000\000\004", metadata !20, metadata !2, metadata !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{null, metadata !6, metadata !13, metadata !14}
-!13 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", metadata !20, metadata !2} ; [ DW_TAG_base_type ]
-!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !20, metadata !2, null} ; [ DW_TAG_pointer_type ]
-!15 = metadata !{i32 4, i32 0, metadata !10, metadata !8}
-!16 = metadata !{metadata !"0x101\00y\004\000", metadata !10, metadata !2, metadata !13} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{metadata !"0x101\00z\004\000", metadata !10, metadata !2, metadata !14} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 5, i32 0, metadata !10, metadata !8}
-!19 = metadata !{i32 10, i32 0, metadata !1, null}
-!20 = metadata !{metadata !"bar.c", metadata !"/tmp/"}
-!21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x101\00a\008\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00baz\00baz\00baz\008\000\001\000\006\000\000\008", !20, !2, !4, null, void (i32)* @baz, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !20, !21, !21, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !20, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{null, !6}
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", !20, !2} ; [ DW_TAG_base_type ]
+!7 = !MDLocation(line: 8, scope: !1)
+!8 = !MDLocation(line: 9, scope: !1)
+!9 = !{!"0x101\00x\004\000", !10, !2, !6} ; [ DW_TAG_arg_variable ]
+!10 = !{!"0x2e\00bar\00bar\00bar\004\001\001\000\006\000\000\004", !20, !2, !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = !{!"0x15\00\000\000\000\000\000\000", !20, !2, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{null, !6, !13, !14}
+!13 = !{!"0x24\00long int\000\0064\0064\000\000\005", !20, !2} ; [ DW_TAG_base_type ]
+!14 = !{!"0xf\00\000\0064\0064\000\000", !20, !2, null} ; [ DW_TAG_pointer_type ]
+!15 = !MDLocation(line: 4, scope: !10, inlinedAt: !8)
+!16 = !{!"0x101\00y\004\000", !10, !2, !13} ; [ DW_TAG_arg_variable ]
+!17 = !{!"0x101\00z\004\000", !10, !2, !14} ; [ DW_TAG_arg_variable ]
+!18 = !MDLocation(line: 5, scope: !10, inlinedAt: !8)
+!19 = !MDLocation(line: 10, scope: !1)
+!20 = !{!"bar.c", !"/tmp/"}
+!21 = !{i32 0}
+!22 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/MemCpyOpt/callslot_aa.ll b/test/Transforms/MemCpyOpt/callslot_aa.ll
new file mode 100644
index 0000000..b6ea129
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/callslot_aa.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -S -basicaa -memcpyopt | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%T = type { i64, i64 }
+
+define void @test(i8* %src) {
+  %tmp = alloca i8
+  %dst = alloca i8
+; CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 1, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* %src, i64 1, i32 8, i1 false), !noalias !2
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %tmp, i64 1, i32 8, i1 false)
+
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
+
+; Check that the noalias for "dst" was removed by checking that the metadata is gone
+; CHECK-NOT: "dst"
+!0 = !{!0}
+!1 = distinct !{!1, !0, !"dst"}
+!2 = distinct !{!1}
diff --git a/test/Transforms/MemCpyOpt/form-memset.ll b/test/Transforms/MemCpyOpt/form-memset.ll
index d980b7f..0a40943 100644
--- a/test/Transforms/MemCpyOpt/form-memset.ll
+++ b/test/Transforms/MemCpyOpt/form-memset.ll
@@ -284,3 +284,18 @@ define void @test10(i8* nocapture %P) nounwind {
 ; CHECK-NOT: memset
 ; CHECK: ret void
 }
+
+; Memset followed by odd store.
+define void @test11(i32* nocapture %P) nounwind ssp {
+entry:
+  %add.ptr = getelementptr inbounds i32* %P, i64 3
+  %0 = bitcast i32* %add.ptr to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %0, i8 1, i64 11, i32 1, i1 false)
+  %arrayidx = getelementptr inbounds i32* %P, i64 0
+  %arrayidx.cast = bitcast i32* %arrayidx to i96*
+  store i96 310698676526526814092329217, i96* %arrayidx.cast, align 4
+  ret void
+; CHECK-LABEL: @test11(
+; CHECK-NOT: store
+; CHECK: call void @llvm.memset.p0i8.i64(i8* %1, i8 1, i64 23, i32 4, i1 false)
+}
diff --git a/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll b/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
index 9878b47..b2083cb 100644
--- a/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
+++ b/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
@@ -87,5 +87,5 @@ lpad:
 declare i8 @dummy();
 declare i32 @__gxx_personality_v0(...)
 
-!0 = metadata !{i8 0, i8 2}
-!1 = metadata !{i8 5, i8 7}
-\ No newline at end of file
+!0 = !{i8 0, i8 2}
+!1 = !{i8 5, i8 7}
diff --git a/test/Transforms/MergeFunc/ranges.ll b/test/Transforms/MergeFunc/ranges.ll
index e25ff1d..d3e4d94 100644
--- a/test/Transforms/MergeFunc/ranges.ll
+++ b/test/Transforms/MergeFunc/ranges.ll
@@ -39,5 +39,5 @@ define i1 @cmp_with_same_range(i8*, i8*) {
   ret i1 %out
 }
 
-!0 = metadata !{i8 0, i8 2}
-!1 = metadata !{i8 5, i8 7}
+!0 = !{i8 0, i8 2}
+!1 = !{i8 5, i8 7}
diff --git a/test/Transforms/ObjCARC/allocas.ll b/test/Transforms/ObjCARC/allocas.ll
index d2e7841..7b671df 100644
--- a/test/Transforms/ObjCARC/allocas.ll
+++ b/test/Transforms/ObjCARC/allocas.ll
@@ -495,6 +495,6 @@ arraydestroy.done1:
   ret void
 }
 
-!0 = metadata !{}
+!0 = !{}
 
 declare i32 @__gxx_personality_v0(...)
diff --git a/test/Transforms/ObjCARC/arc-annotations.ll b/test/Transforms/ObjCARC/arc-annotations.ll
index f76ba3b..c0ce44f 100644
--- a/test/Transforms/ObjCARC/arc-annotations.ll
+++ b/test/Transforms/ObjCARC/arc-annotations.ll
@@ -73,11 +73,11 @@ return:
   ret void
 }
 
-!0 = metadata !{}
+!0 = !{}
 
-; CHECK: ![[ANN0]] = metadata !{metadata !"(test0,%x)", metadata !"S_Use", metadata !"S_None"}
-; CHECK: ![[ANN1]] = metadata !{metadata !"(test0,%x)", metadata !"S_None", metadata !"S_Retain"}
-; CHECK: ![[ANN2]] = metadata !{metadata !"(test0,%x)", metadata !"S_Release", metadata !"S_Use"}
-; CHECK: ![[ANN3]] = metadata !{metadata !"(test0,%x)", metadata !"S_None", metadata !"S_Release"}
-; CHECK: ![[ANN4]] = metadata !{metadata !"(test0,%x)", metadata !"S_Retain", metadata !"S_None"}
+; CHECK: ![[ANN0]] = !{!"(test0,%x)", !"S_Use", !"S_None"}
+; CHECK: ![[ANN1]] = !{!"(test0,%x)", !"S_None", !"S_Retain"}
+; CHECK: ![[ANN2]] = !{!"(test0,%x)", !"S_Release", !"S_Use"}
+; CHECK: ![[ANN3]] = !{!"(test0,%x)", !"S_None", !"S_Release"}
+; CHECK: ![[ANN4]] = !{!"(test0,%x)", !"S_Retain", !"S_None"}
 
diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll
index a1ee956..7bc58c4 100644
--- a/test/Transforms/ObjCARC/basic.ll
+++ b/test/Transforms/ObjCARC/basic.ll
@@ -2679,8 +2679,8 @@ define {<2 x float>, <2 x float>} @"\01-[A z]"({}* %self, i8* nocapture %_cmd) n
 invoke.cont:
   %0 = bitcast {}* %self to i8*
   %1 = tail call i8* @objc_retain(i8* %0) nounwind
-  tail call void @llvm.dbg.value(metadata !{{}* %self}, i64 0, metadata !0, metadata !{})
-  tail call void @llvm.dbg.value(metadata !{{}* %self}, i64 0, metadata !0, metadata !{})
+  tail call void @llvm.dbg.value(metadata {}* %self, i64 0, metadata !0, metadata !{})
+  tail call void @llvm.dbg.value(metadata {}* %self, i64 0, metadata !0, metadata !{})
   %ivar = load i64* @"OBJC_IVAR_$_A.myZ", align 8
   %add.ptr = getelementptr i8* %0, i64 %ivar
   %tmp1 = bitcast i8* %add.ptr to float*
@@ -3011,9 +3011,9 @@ define void @test67(i8* %x) {
 
 !llvm.module.flags = !{!1}
 
-!0 = metadata !{}
-!1 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{}
+!1 = !{i32 1, !"Debug Info Version", i32 2}
 
 ; CHECK: attributes #0 = { nounwind readnone }
 ; CHECK: attributes [[NUW]] = { nounwind }
-; CHECK: ![[RELEASE]] = metadata !{}
+; CHECK: ![[RELEASE]] = !{}
diff --git a/test/Transforms/ObjCARC/cfg-hazards.ll b/test/Transforms/ObjCARC/cfg-hazards.ll
index 61e5a3b..746d56d 100644
--- a/test/Transforms/ObjCARC/cfg-hazards.ll
+++ b/test/Transforms/ObjCARC/cfg-hazards.ll
@@ -432,4 +432,4 @@ exit:
 
 ; CHECK: attributes [[NUW]] = { nounwind }
 
-!0 = metadata !{}
+!0 = !{}
diff --git a/test/Transforms/ObjCARC/contract-marker.ll b/test/Transforms/ObjCARC/contract-marker.ll
index 55a1b28..a828260 100644
--- a/test/Transforms/ObjCARC/contract-marker.ll
+++ b/test/Transforms/ObjCARC/contract-marker.ll
@@ -22,6 +22,6 @@ declare void @bar(i8*)
 
 !clang.arc.retainAutoreleasedReturnValueMarker = !{!0}
 
-!0 = metadata !{metadata !"mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue"}
+!0 = !{!"mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue"}
 
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/ObjCARC/contract-storestrong.ll b/test/Transforms/ObjCARC/contract-storestrong.ll
index 50a2d97..c218e33 100644
--- a/test/Transforms/ObjCARC/contract-storestrong.ll
+++ b/test/Transforms/ObjCARC/contract-storestrong.ll
@@ -24,7 +24,7 @@ entry:
 
 ; Don't do this if the load is volatile.
 
-;      CHECK: define void @test1(i8* %p) {
+; CHECK-LABEL: define void @test1(i8* %p) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT:   %0 = tail call i8* @objc_retain(i8* %p) [[NUW]]
 ; CHECK-NEXT:   %tmp = load volatile i8** @x, align 8
@@ -43,7 +43,7 @@ entry:
 
 ; Don't do this if the store is volatile.
 
-;      CHECK: define void @test2(i8* %p) {
+; CHECK-LABEL: define void @test2(i8* %p) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT:   %0 = tail call i8* @objc_retain(i8* %p) [[NUW]]
 ; CHECK-NEXT:   %tmp = load i8** @x, align 8
@@ -63,15 +63,15 @@ entry:
 ; Don't do this if there's a use of the old pointer value between the store
 ; and the release.
 
-; CHECK:      define void @test3(i8* %newValue) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   %x0 = tail call i8* @objc_retain(i8* %newValue) [[NUW]]
-; CHECK-NEXT:   %x1 = load i8** @x, align 8
-; CHECK-NEXT:   store i8* %x0, i8** @x, align 8
-; CHECK-NEXT:   tail call void @use_pointer(i8* %x1), !clang.arc.no_objc_arc_exceptions !0
-; CHECK-NEXT:   tail call void @objc_release(i8* %x1) [[NUW]], !clang.imprecise_release !0
-; CHECK-NEXT:   ret void
-; CHECK-NEXT: }
+; CHECK-LABEL: define void @test3(i8* %newValue) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %x0 = tail call i8* @objc_retain(i8* %newValue) [[NUW]]
+; CHECK-NEXT:    %x1 = load i8** @x, align 8
+; CHECK-NEXT:    store i8* %x0, i8** @x, align 8
+; CHECK-NEXT:    tail call void @use_pointer(i8* %x1), !clang.arc.no_objc_arc_exceptions !0
+; CHECK-NEXT:    tail call void @objc_release(i8* %x1) [[NUW]], !clang.imprecise_release !0
+; CHECK-NEXT:    ret void
+; CHECK-NEXT:  }
 define void @test3(i8* %newValue) {
 entry:
   %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
@@ -84,15 +84,15 @@ entry:
 
 ; Like test3, but with an icmp use instead of a call, for good measure.
 
-; CHECK:      define i1 @test4(i8* %newValue, i8* %foo) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   %x0 = tail call i8* @objc_retain(i8* %newValue) [[NUW]]
-; CHECK-NEXT:   %x1 = load i8** @x, align 8
-; CHECK-NEXT:   store i8* %x0, i8** @x, align 8
-; CHECK-NEXT:   %t = icmp eq i8* %x1, %foo
-; CHECK-NEXT:   tail call void @objc_release(i8* %x1) [[NUW]], !clang.imprecise_release !0
-; CHECK-NEXT:   ret i1 %t
-; CHECK-NEXT: }
+; CHECK-LABEL:  define i1 @test4(i8* %newValue, i8* %foo) {
+; CHECK-NEXT:   entry:
+; CHECK-NEXT:     %x0 = tail call i8* @objc_retain(i8* %newValue) [[NUW]]
+; CHECK-NEXT:     %x1 = load i8** @x, align 8
+; CHECK-NEXT:     store i8* %x0, i8** @x, align 8
+; CHECK-NEXT:     %t = icmp eq i8* %x1, %foo
+; CHECK-NEXT:     tail call void @objc_release(i8* %x1) [[NUW]], !clang.imprecise_release !0
+; CHECK-NEXT:     ret i1 %t
+; CHECK-NEXT:   }
 define i1 @test4(i8* %newValue, i8* %foo) {
 entry:
   %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
@@ -105,7 +105,7 @@ entry:
 
 ; Do form an objc_storeStrong here, because the use is before the store.
 
-; CHECK: define i1 @test5(i8* %newValue, i8* %foo) {
+; CHECK-LABEL: define i1 @test5(i8* %newValue, i8* %foo) {
 ; CHECK: %t = icmp eq i8* %x1, %foo
 ; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %newValue) [[NUW]]
 ; CHECK: }
@@ -121,7 +121,7 @@ entry:
 
 ; Like test5, but the release is before the store.
 
-; CHECK: define i1 @test6(i8* %newValue, i8* %foo) {
+; CHECK-LABEL: define i1 @test6(i8* %newValue, i8* %foo) {
 ; CHECK: %t = icmp eq i8* %x1, %foo
 ; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %newValue) [[NUW]]
 ; CHECK: }
@@ -137,7 +137,7 @@ entry:
 
 ; Like test0, but there's no store, so don't form an objc_storeStrong.
 
-;      CHECK-LABEL: define void @test7(
+; CHECK-LABEL: define void @test7(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT:   %0 = tail call i8* @objc_retain(i8* %p) [[NUW]]
 ; CHECK-NEXT:   %tmp = load i8** @x, align 8
@@ -154,7 +154,7 @@ entry:
 
 ; Like test0, but there's no retain, so don't form an objc_storeStrong.
 
-;      CHECK-LABEL: define void @test8(
+; CHECK-LABEL: define void @test8(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT:   %tmp = load i8** @x, align 8
 ; CHECK-NEXT:   store i8* %p, i8** @x, align 8
@@ -169,6 +169,54 @@ entry:
   ret void
 }
 
-!0 = metadata !{}
+; Make sure that we properly handle release that *may* release our new
+; value in between the retain and the store. We need to be sure that
+; this we can safely move the retain to the store. This specific test
+; makes sure that we properly handled a release of an unrelated
+; pointer.
+;
+; CHECK-LABEL: define i1 @test9(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
+; CHECK-NOT: objc_storeStrong
+define i1 @test9(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
+entry:
+  %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+  tail call void @objc_release(i8* %unrelated_ptr) nounwind, !clang.imprecise_release !0
+  %x1 = load i8** @x, align 8
+  tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+  %t = icmp eq i8* %x1, %foo
+  store i8* %newValue, i8** @x, align 8
+  ret i1 %t  
+}
+
+; Make sure that we don't perform the optimization when we just have a call.
+;
+; CHECK-LABEL: define i1 @test10(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
+; CHECK-NOT: objc_storeStrong
+define i1 @test10(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
+entry:
+  %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+  call void @use_pointer(i8* %unrelated_ptr)
+  %x1 = load i8** @x, align 8
+  tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+  %t = icmp eq i8* %x1, %foo
+  store i8* %newValue, i8** @x, align 8
+  ret i1 %t
+}
+
+; Make sure we form the store strong if the use in between the retain
+; and the store does not touch reference counts.
+; CHECK-LABEL: define i1 @test11(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
+; CHECK: objc_storeStrong
+define i1 @test11(i8* %newValue, i8* %foo, i8* %unrelated_ptr) {
+entry:
+  %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind
+  %t = icmp eq i8* %newValue, %foo
+  %x1 = load i8** @x, align 8
+  tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0
+  store i8* %newValue, i8** @x, align 8
+  ret i1 %t
+}
+
+!0 = !{}
 
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/ObjCARC/contract-testcases.ll b/test/Transforms/ObjCARC/contract-testcases.ll
index 0bf63a6..74a4a7f 100644
--- a/test/Transforms/ObjCARC/contract-testcases.ll
+++ b/test/Transforms/ObjCARC/contract-testcases.ll
@@ -89,7 +89,7 @@ lpad:                                             ; preds = %entry
 
 !clang.arc.retainAutoreleasedReturnValueMarker = !{!0}
 
-!0 = metadata !{metadata !"mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue"}
+!0 = !{!"mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue"}
 
 ; CHECK: attributes #0 = { optsize }
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/ObjCARC/empty-block.ll b/test/Transforms/ObjCARC/empty-block.ll
index 0440ab8..cc82d10 100644
--- a/test/Transforms/ObjCARC/empty-block.ll
+++ b/test/Transforms/ObjCARC/empty-block.ll
@@ -56,4 +56,4 @@ define %0* @test1() nounwind {
 
 declare %0* @foo()
 
-!0 = metadata !{}
+!0 = !{}
diff --git a/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
index 03af93e..c72566c 100644
--- a/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
+++ b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
@@ -41,10 +41,10 @@ entry:
   %tmp2 = bitcast %struct._class_t* %tmp to i8*, !dbg !37
 ; CHECK: call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp2, i8* %tmp1)
   %call = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp2, i8* %tmp1), !dbg !37, !clang.arc.no_objc_arc_exceptions !38
-  call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !12, metadata !{}), !dbg !37
+  call void @llvm.dbg.value(metadata i8* %call, i64 0, metadata i32 02, metadata !{}), !dbg !37
 ; CHECK: call i8* @objc_retain(i8* %call) [[NUW:#[0-9]+]]
   %tmp3 = call i8* @objc_retain(i8* %call) nounwind, !dbg !39
-  call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !25, metadata !{}), !dbg !39
+  call void @llvm.dbg.value(metadata i8* %call, i64 0, metadata !25, metadata !{}), !dbg !39
   invoke fastcc void @ThrowFunc(i8* %call)
           to label %eh.cont unwind label %lpad, !dbg !40, !clang.arc.no_objc_arc_exceptions !38
 
@@ -58,7 +58,7 @@ lpad:                                             ; preds = %entry
           catch i8* null, !dbg !40
   %tmp5 = extractvalue { i8*, i32 } %tmp4, 0, !dbg !40
   %exn.adjusted = call i8* @objc_begin_catch(i8* %tmp5) nounwind, !dbg !44
-  call void @llvm.dbg.value(metadata !45, i64 0, metadata !21, metadata !{}), !dbg !46
+  call void @llvm.dbg.value(metadata i8 0, i64 0, metadata !21, metadata !{}), !dbg !46
   call void @objc_end_catch(), !dbg !49, !clang.arc.no_objc_arc_exceptions !38
 ; CHECK: call void @objc_release(i8* %call)
   call void @objc_release(i8* %call) nounwind, !dbg !42, !clang.imprecise_release !38
@@ -87,7 +87,7 @@ declare void @objc_exception_rethrow()
 define internal fastcc void @ThrowFunc(i8* %obj) uwtable noinline ssp {
 entry:
   %tmp = call i8* @objc_retain(i8* %obj) nounwind
-  call void @llvm.dbg.value(metadata !{i8* %obj}, i64 0, metadata !32, metadata !{}), !dbg !55
+  call void @llvm.dbg.value(metadata i8* %obj, i64 0, metadata !32, metadata !{}), !dbg !55
   %tmp1 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_1", align 8, !dbg !56
   %tmp2 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_5", align 8, !dbg !56, !invariant.load !38
   %tmp3 = bitcast %struct._class_t* %tmp1 to i8*, !dbg !56
@@ -113,62 +113,62 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33, !34, !35, !36, !61}
 
-!0 = metadata !{metadata !"0x11\0016\00clang version 3.3 \001\00\002\00\000", metadata !60, metadata !1, metadata !1, metadata !3, metadata !1, null} ; [ DW_TAG_compile_unit ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] [DW_LANG_ObjC]
-!1 = metadata !{i32 0}
-!3 = metadata !{metadata !5, metadata !27}
-!5 = metadata !{metadata !"0x2e\00main\00main\00\009\000\001\000\006\000\001\0010", metadata !60, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 9] [def] [scope 10] [main]
-!6 = metadata !{metadata !"0x29", metadata !60} ; [ DW_TAG_file_type ]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !11}
-!11 = metadata !{metadata !12, metadata !21, metadata !25}
-!12 = metadata !{metadata !"0x100\00obj\0011\000", metadata !13, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [obj] [line 11]
-!13 = metadata !{metadata !"0xb\0010\000\000", metadata !60, metadata !5} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!14 = metadata !{metadata !"0x16\00id\0011\000\000\000\000", metadata !60, null, metadata !15} ; [ DW_TAG_typedef ] [id] [line 11, size 0, align 0, offset 0] [from ]
-!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !60, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
-!16 = metadata !{metadata !"0x13\00objc_object\000\000\000\000\000\000", metadata !60, null, null, metadata !17, null, i32 0, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
-!17 = metadata !{metadata !18}
-!18 = metadata !{metadata !"0xd\00isa\000\0064\000\000\000", metadata !60, metadata !16, metadata !19} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
-!19 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
-!20 = metadata !{metadata !"0x13\00objc_class\000\000\000\000\004\000", metadata !60, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!21 = metadata !{metadata !"0x100\00ok\0013\000", metadata !22, metadata !6, metadata !23} ; [ DW_TAG_auto_variable ] [ok] [line 13]
-!22 = metadata !{metadata !"0xb\0012\000\001", metadata !60, metadata !13} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!23 = metadata !{metadata !"0x16\00BOOL\0062\000\000\000\000", metadata !60, null, metadata !24} ; [ DW_TAG_typedef ] [BOOL] [line 62, size 0, align 0, offset 0] [from signed char]
-!24 = metadata !{metadata !"0x24\00signed char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!25 = metadata !{metadata !"0x100\00obj2\0015\000", metadata !26, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [obj2] [line 15]
-!26 = metadata !{metadata !"0xb\0014\000\002", metadata !60, metadata !22} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!27 = metadata !{metadata !"0x2e\00ThrowFunc\00ThrowFunc\00\004\001\001\000\006\00256\001\005", metadata !60, metadata !6, metadata !28, null, void (i8*)* @ThrowFunc, null, null, metadata !30} ; [ DW_TAG_subprogram ] [line 4] [local] [def] [scope 5] [ThrowFunc]
-!28 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!29 = metadata !{null, metadata !14}
-!30 = metadata !{metadata !31}
-!31 = metadata !{metadata !32}
-!32 = metadata !{metadata !"0x101\00obj\0016777220\000", metadata !27, metadata !6, metadata !14} ; [ DW_TAG_arg_variable ] [obj] [line 4]
-!33 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
-!34 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
-!35 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!36 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
-!37 = metadata !{i32 11, i32 0, metadata !13, null}
-!38 = metadata !{}
-!39 = metadata !{i32 15, i32 0, metadata !26, null}
-!40 = metadata !{i32 17, i32 0, metadata !41, null}
-!41 = metadata !{metadata !"0xb\0016\000\003", metadata !60, metadata !26} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!42 = metadata !{i32 22, i32 0, metadata !26, null}
-!43 = metadata !{i32 23, i32 0, metadata !22, null}
-!44 = metadata !{i32 19, i32 0, metadata !41, null}
-!45 = metadata !{i8 0}
-!46 = metadata !{i32 20, i32 0, metadata !47, null}
-!47 = metadata !{metadata !"0xb\0019\000\005", metadata !60, metadata !48} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!48 = metadata !{metadata !"0xb\0019\000\004", metadata !60, metadata !26} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!49 = metadata !{i32 21, i32 0, metadata !47, null}
-!50 = metadata !{i32 24, i32 0, metadata !51, null}
-!51 = metadata !{metadata !"0xb\0023\000\006", metadata !60, metadata !22} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!52 = metadata !{i32 25, i32 0, metadata !51, null}
-!53 = metadata !{i32 27, i32 0, metadata !13, null}
-!54 = metadata !{i32 28, i32 0, metadata !13, null}
-!55 = metadata !{i32 4, i32 0, metadata !27, null}
-!56 = metadata !{i32 6, i32 0, metadata !57, null}
-!57 = metadata !{metadata !"0xb\005\000\007", metadata !60, metadata !27} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!58 = metadata !{i32 7, i32 0, metadata !57, null}
-!60 = metadata !{metadata !"test.m", metadata !"/Volumes/Files/gottesmmcab/Radar/12906997"}
-!61 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0016\00clang version 3.3 \001\00\002\00\000", !60, !1, !1, !3, !1, null} ; [ DW_TAG_compile_unit ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] [DW_LANG_ObjC]
+!1 = !{i32 0}
+!3 = !{!5, !27}
+!5 = !{!"0x2e\00main\00main\00\009\000\001\000\006\000\001\0010", !60, !6, !7, null, i32 ()* @main, null, null, !10} ; [ DW_TAG_subprogram ] [line 9] [def] [scope 10] [main]
+!6 = !{!"0x29", !60} ; [ DW_TAG_file_type ]
+!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!11}
+!11 = !{!12, !21, !25}
+!12 = !{!"0x100\00obj\0011\000", !13, !6, !14} ; [ DW_TAG_auto_variable ] [obj] [line 11]
+!13 = !{!"0xb\0010\000\000", !60, !5} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!14 = !{!"0x16\00id\0011\000\000\000\000", !60, null, !15} ; [ DW_TAG_typedef ] [id] [line 11, size 0, align 0, offset 0] [from ]
+!15 = !{!"0xf\00\000\0064\0064\000\000", !60, null, !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
+!16 = !{!"0x13\00objc_object\000\000\000\000\000\000", !60, null, null, !17, null, i32 0, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
+!17 = !{!18}
+!18 = !{!"0xd\00isa\000\0064\000\000\000", !60, !16, !19} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
+!19 = !{!"0xf\00\000\0064\000\000\000", null, null, !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
+!20 = !{!"0x13\00objc_class\000\000\000\000\004\000", !60, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!21 = !{!"0x100\00ok\0013\000", !22, !6, !23} ; [ DW_TAG_auto_variable ] [ok] [line 13]
+!22 = !{!"0xb\0012\000\001", !60, !13} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!23 = !{!"0x16\00BOOL\0062\000\000\000\000", !60, null, !24} ; [ DW_TAG_typedef ] [BOOL] [line 62, size 0, align 0, offset 0] [from signed char]
+!24 = !{!"0x24\00signed char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!25 = !{!"0x100\00obj2\0015\000", !26, !6, !14} ; [ DW_TAG_auto_variable ] [obj2] [line 15]
+!26 = !{!"0xb\0014\000\002", !60, !22} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!27 = !{!"0x2e\00ThrowFunc\00ThrowFunc\00\004\001\001\000\006\00256\001\005", !60, !6, !28, null, void (i8*)* @ThrowFunc, null, null, !30} ; [ DW_TAG_subprogram ] [line 4] [local] [def] [scope 5] [ThrowFunc]
+!28 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!29 = !{null, !14}
+!30 = !{!31}
+!31 = !{!32}
+!32 = !{!"0x101\00obj\0016777220\000", !27, !6, !14} ; [ DW_TAG_arg_variable ] [obj] [line 4]
+!33 = !{i32 1, !"Objective-C Version", i32 2}
+!34 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!35 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!36 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
+!37 = !MDLocation(line: 11, scope: !13)
+!38 = !{}
+!39 = !MDLocation(line: 15, scope: !26)
+!40 = !MDLocation(line: 17, scope: !41)
+!41 = !{!"0xb\0016\000\003", !60, !26} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!42 = !MDLocation(line: 22, scope: !26)
+!43 = !MDLocation(line: 23, scope: !22)
+!44 = !MDLocation(line: 19, scope: !41)
+!45 = !{i8 0}
+!46 = !MDLocation(line: 20, scope: !47)
+!47 = !{!"0xb\0019\000\005", !60, !48} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!48 = !{!"0xb\0019\000\004", !60, !26} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!49 = !MDLocation(line: 21, scope: !47)
+!50 = !MDLocation(line: 24, scope: !51)
+!51 = !{!"0xb\0023\000\006", !60, !22} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!52 = !MDLocation(line: 25, scope: !51)
+!53 = !MDLocation(line: 27, scope: !13)
+!54 = !MDLocation(line: 28, scope: !13)
+!55 = !MDLocation(line: 4, scope: !27)
+!56 = !MDLocation(line: 6, scope: !57)
+!57 = !{!"0xb\005\000\007", !60, !27} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!58 = !MDLocation(line: 7, scope: !57)
+!60 = !{!"test.m", !"/Volumes/Files/gottesmmcab/Radar/12906997"}
+!61 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/ObjCARC/escape.ll b/test/Transforms/ObjCARC/escape.ll
index 28f2e80..357f759 100644
--- a/test/Transforms/ObjCARC/escape.ll
+++ b/test/Transforms/ObjCARC/escape.ll
@@ -128,7 +128,7 @@ declare i8* @objc_storeWeak(i8**, i8*)
 declare i8* @not_really_objc_storeWeak(i8**, i8*)
 declare void @objc_release(i8*)
 
-!0 = metadata !{}
+!0 = !{}
 
 ; CHECK: attributes [[NUW]] = { nounwind }
 ; CHECK: attributes #1 = { nounwind ssp }
diff --git a/test/Transforms/ObjCARC/intrinsic-use.ll b/test/Transforms/ObjCARC/intrinsic-use.ll
index f3833cb..b1e56c8 100644
--- a/test/Transforms/ObjCARC/intrinsic-use.ll
+++ b/test/Transforms/ObjCARC/intrinsic-use.ll
@@ -112,5 +112,5 @@ entry:
 }
 
 
-!0 = metadata !{}
+!0 = !{}
 
diff --git a/test/Transforms/ObjCARC/invoke.ll b/test/Transforms/ObjCARC/invoke.ll
index 04d057b..5ef5184 100644
--- a/test/Transforms/ObjCARC/invoke.ll
+++ b/test/Transforms/ObjCARC/invoke.ll
@@ -221,4 +221,4 @@ declare i32 @__objc_personality_v0(...)
 
 ; CHECK: attributes [[NUW]] = { nounwind }
 
-!0 = metadata !{}
+!0 = !{}
diff --git a/test/Transforms/ObjCARC/nested.ll b/test/Transforms/ObjCARC/nested.ll
index 2eeb4fc..7d72e37 100644
--- a/test/Transforms/ObjCARC/nested.ll
+++ b/test/Transforms/ObjCARC/nested.ll
@@ -21,7 +21,7 @@ declare void @__crasher_block_invoke(i8* nocapture)
 declare i8* @objc_retainBlock(i8*)
 declare void @__crasher_block_invoke1(i8* nocapture)
 
-!0 = metadata !{}
+!0 = !{}
 
 ; Delete a nested retain+release pair.
 
diff --git a/test/Transforms/ObjCARC/path-overflow.ll b/test/Transforms/ObjCARC/path-overflow.ll
index 3c14353..d239653 100644
--- a/test/Transforms/ObjCARC/path-overflow.ll
+++ b/test/Transforms/ObjCARC/path-overflow.ll
@@ -2190,4 +2190,4 @@ return:                                           ; No predecessors!
 }
 
 
-!0 = metadata !{}
+!0 = !{}
diff --git a/test/Transforms/ObjCARC/retain-not-declared.ll b/test/Transforms/ObjCARC/retain-not-declared.ll
index 3a2bd03..4162022 100644
--- a/test/Transforms/ObjCARC/retain-not-declared.ll
+++ b/test/Transforms/ObjCARC/retain-not-declared.ll
@@ -64,6 +64,6 @@ lpad100:                                          ; preds = %invoke.cont93
 
 declare i32 @__gxx_personality_v0(...)
 
-!0 = metadata !{}
+!0 = !{}
 
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/ObjCARC/split-backedge.ll b/test/Transforms/ObjCARC/split-backedge.ll
index 1b7cf44..2507173 100644
--- a/test/Transforms/ObjCARC/split-backedge.ll
+++ b/test/Transforms/ObjCARC/split-backedge.ll
@@ -45,6 +45,6 @@ declare void @objc_release(i8*)
 declare i8* @objc_retain(i8*)
 declare void @use_pointer(i8*)
 
-!0 = metadata !{}
+!0 = !{}
 
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/ObjCARC/weak-copies.ll b/test/Transforms/ObjCARC/weak-copies.ll
index 5dab4e0..13d0b0a 100644
--- a/test/Transforms/ObjCARC/weak-copies.ll
+++ b/test/Transforms/ObjCARC/weak-copies.ll
@@ -86,4 +86,4 @@ declare void @objc_destroyWeak(i8**)
 
 ; CHECK: attributes [[NUW]] = { nounwind }
 
-!0 = metadata !{}
+!0 = !{}
diff --git a/test/Transforms/PlaceSafepoints/basic.ll b/test/Transforms/PlaceSafepoints/basic.ll
new file mode 100644
index 0000000..ca63da4
--- /dev/null
+++ b/test/Transforms/PlaceSafepoints/basic.ll
@@ -0,0 +1,94 @@
+; RUN: opt %s -S -place-safepoints | FileCheck %s
+
+
+; Do we insert a simple entry safepoint?
+define void @test_entry() gc "statepoint-example" {
+; CHECK-LABEL: @test_entry
+entry:
+; CHECK-LABEL: entry
+; CHECK: statepoint
+  ret void
+}
+
+; On a non-gc function, we should NOT get an entry safepoint
+define void @test_negative() {
+; CHECK-LABEL: @test_negative
+entry:
+; CHECK-NOT: statepoint
+  ret void
+}
+
+; Do we insert a backedge safepoint in a statically
+; infinite loop?
+define void @test_backedge() gc "statepoint-example" {
+; CHECK-LABEL: test_backedge
+entry:
+; CHECK-LABEL: entry
+; This statepoint is technically not required, but we don't exploit that yet.
+; CHECK: statepoint
+  br label %other
+
+; CHECK-LABEL: other
+; CHECK: statepoint
+other:
+  call void undef()
+  br label %other
+}
+
+; Check that we remove an unreachable block rather than trying
+; to insert a backedge safepoint
+define void @test_unreachable() gc "statepoint-example" {
+; CHECK-LABEL: test_unreachable
+entry:
+; CHECK-LABEL: entry
+; CHECK: statepoint
+  ret void
+
+; CHECK-NOT: other
+; CHECK-NOT: statepoint
+other:
+  br label %other
+}
+
+declare void @foo()
+
+; Do we turn a call into it's own statepoint
+define void @test_simple_call() gc "statepoint-example" {
+; CHECK-LABEL: test_simple_call
+entry:
+  br label %other
+other:
+; CHECK-LABEL: other
+; CHECK: statepoint
+; CHECK-NOT: gc.result
+  call void @foo()
+  ret void
+}
+
+declare zeroext i1 @i1_return_i1(i1)
+
+define i1 @test_call_with_result() gc "statepoint-example" {
+; CHECK-LABEL: test_call_with_result
+; This is checking that a statepoint_poll + statepoint + result is
+; inserted for a function that takes 1 argument.
+; CHECK: gc.statepoint.p0f_isVoidf
+; CHECK: gc.statepoint.p0f_i1i1f
+; CHECK: (i1 (i1)* @i1_return_i1, i32 1, i32 0, i1 false, i32 0)
+; CHECK: %call12 = call i1 @llvm.experimental.gc.result.i1
+entry:
+  %call1 = tail call i1 (i1)* @i1_return_i1(i1 false)
+  ret i1 %call1
+}
+
+; This function is inlined when inserting a poll.  To avoid recursive 
+; issues, make sure we don't place safepoints in it.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+; CHECK-LABEL: entry
+; CHECK-NEXT: do_safepoint
+; CHECK-NEXT: ret void 
+entry:
+  call void @do_safepoint()
+  ret void
+}
diff --git a/test/Transforms/PlaceSafepoints/call-in-loop.ll b/test/Transforms/PlaceSafepoints/call-in-loop.ll
new file mode 100644
index 0000000..a220fc9
--- /dev/null
+++ b/test/Transforms/PlaceSafepoints/call-in-loop.ll
@@ -0,0 +1,31 @@
+; If there's a call in the loop which dominates the backedge, we 
+; don't need a safepoint poll (since the callee must contain a 
+; poll test).
+;; RUN: opt %s -place-safepoints -S | FileCheck %s
+
+declare void @foo()
+
+define void @test1() gc "statepoint-example" {
+; CHECK-LABEL: test1
+
+entry:
+; CHECK-LABEL: entry
+; CHECK: statepoint
+  br label %loop
+
+loop:
+; CHECK-LABEL: loop
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo
+; CHECK-NOT: statepoint
+  call void @foo()
+  br label %loop
+}
+
+; This function is inlined when inserting a poll.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+entry:
+  call void @do_safepoint()
+  ret void
+}
diff --git a/test/Transforms/PlaceSafepoints/finite-loops.ll b/test/Transforms/PlaceSafepoints/finite-loops.ll
new file mode 100644
index 0000000..8b64d24
--- /dev/null
+++ b/test/Transforms/PlaceSafepoints/finite-loops.ll
@@ -0,0 +1,80 @@
+; Tests to ensure that we are not placing backedge safepoints in
+; loops which are clearly finite.
+;; RUN: opt %s -place-safepoints -S | FileCheck %s
+
+
+; A simple counted loop with trivially known range
+define void @test1(i32) gc "statepoint-example" {
+; CHECK-LABEL: test1
+; CHECK-LABEL: entry
+; CHECK: statepoint
+; CHECK-LABEL: loop
+; CHECK-NOT: statepoint
+
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0 , %entry ], [ %counter.inc , %loop ]
+  %counter.inc = add i32 %counter, 1
+  %counter.cmp = icmp slt i32 %counter.inc, 16
+  br i1 %counter.cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The same counted loop, but with an unknown early exit
+define void @test2(i32) gc "statepoint-example" {
+; CHECK-LABEL: test2
+; CHECK-LABEL: entry
+; CHECK: statepoint
+; CHECK-LABEL: loop
+; CHECK-NOT: statepoint
+
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0 , %entry ], [ %counter.inc , %continue ]
+  %counter.inc = add i32 %counter, 1
+  %counter.cmp = icmp slt i32 %counter.inc, 16
+  br i1 undef, label %continue, label %exit
+
+continue:
+  br i1 %counter.cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The range is a 8 bit value and we can't overflow
+define void @test3(i8 %upper) gc "statepoint-example" {
+; CHECK-LABEL: test3
+; CHECK-LABEL: entry
+; CHECK: statepoint
+; CHECK-LABEL: loop
+; CHECK-NOT: statepoint
+
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i8 [ 0 , %entry ], [ %counter.inc , %loop ]
+  %counter.inc = add nsw i8 %counter, 1
+  %counter.cmp = icmp slt i8 %counter.inc, %upper
+  br i1 %counter.cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+
+; This function is inlined when inserting a poll.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+entry:
+  call void @do_safepoint()
+  ret void
+}
diff --git a/test/Transforms/PlaceSafepoints/invokes.ll b/test/Transforms/PlaceSafepoints/invokes.ll
new file mode 100644
index 0000000..5fd5bea
--- /dev/null
+++ b/test/Transforms/PlaceSafepoints/invokes.ll
@@ -0,0 +1,110 @@
+; RUN: opt %s -S -place-safepoints | FileCheck %s
+
+declare i64 addrspace(1)* @"some_call"(i64 addrspace(1)*)
+declare i32 @"personality_function"()
+
+define i64 addrspace(1)* @test_basic(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1) gc "statepoint-example" {
+; CHECK-LABEL: entry:
+entry:
+  ; CHECK: invoke
+  ; CHECK: statepoint
+  ; CHECK: some_call
+  %ret_val = invoke i64 addrspace(1)* @"some_call"(i64 addrspace(1)* %obj)
+               to label %normal_return unwind label %exceptional_return
+
+; CHECK-LABEL: normal_return:
+; CHECK: gc.result
+; CHECK: ret i64
+
+normal_return:
+  ret i64 addrspace(1)* %ret_val
+
+; CHECK-LABEL: exceptional_return:
+; CHECK: landingpad
+; CHECK: ret i64
+
+exceptional_return:
+  %landing_pad4 = landingpad {i8*, i32} personality i32 ()* @"personality_function"
+          cleanup
+  ret i64 addrspace(1)* %obj1
+}
+
+define i64 addrspace(1)* @test_two_invokes(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1) gc "statepoint-example" {
+; CHECK-LABEL: entry:
+entry:
+  ; CHECK: invoke 
+  ; CHECK: statepoint
+  ; CHECK: some_call
+  %ret_val1 = invoke i64 addrspace(1)* @"some_call"(i64 addrspace(1)* %obj)
+               to label %second_invoke unwind label %exceptional_return
+
+; CHECK-LABEL: second_invoke:
+second_invoke:
+  ; CHECK: invoke
+  ; CHECK: statepoint
+  ; CHECK: some_call
+  %ret_val2 = invoke i64 addrspace(1)* @"some_call"(i64 addrspace(1)* %ret_val1)
+                to label %normal_return unwind label %exceptional_return
+
+; CHECK-LABEL: normal_return:
+normal_return:
+  ; CHECK: gc.result
+  ; CHECK: ret i64
+  ret i64 addrspace(1)* %ret_val2
+
+; CHECK: exceptional_return:
+; CHECK: ret i64
+
+exceptional_return:
+  %landing_pad4 = landingpad {i8*, i32} personality i32 ()* @"personality_function"
+          cleanup
+  ret i64 addrspace(1)* %obj1
+}
+
+define i64 addrspace(1)* @test_phi_node(i1 %cond, i64 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: entry:
+entry:
+  br i1 %cond, label %left, label %right
+
+left:
+  %ret_val_left = invoke i64 addrspace(1)* @"some_call"(i64 addrspace(1)* %obj)
+                    to label %merge unwind label %exceptional_return
+
+right:
+  %ret_val_right = invoke i64 addrspace(1)* @"some_call"(i64 addrspace(1)* %obj)
+                     to label %merge unwind label %exceptional_return
+
+; CHECK-LABEL: merge1:
+; CHECK: gc.result
+; CHECK: br label %merge
+
+; CHECK-LABEL: merge3:
+; CHECK: gc.result
+; CHECK: br label %merge
+
+; CHECK-LABEL: merge:
+; CHECK: phi
+; CHECK: ret i64 addrspace(1)* %ret_val
+merge:
+  %ret_val = phi i64 addrspace(1)* [%ret_val_left, %left], [%ret_val_right, %right]
+  ret i64 addrspace(1)* %ret_val
+
+; CHECK-LABEL: exceptional_return:
+; CHECK: ret i64 addrspace(1)*
+
+exceptional_return:
+  %landing_pad4 = landingpad {i8*, i32} personality i32 ()* @"personality_function"
+          cleanup
+  ret i64 addrspace(1)* %obj
+}
+
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+; CHECK-LABEL: entry
+; CHECK-NEXT: do_safepoint
+; CHECK-NEXT: ret void 
+entry:
+  call void @do_safepoint()
+  ret void
+}
diff --git a/test/Transforms/PlaceSafepoints/split-backedge.ll b/test/Transforms/PlaceSafepoints/split-backedge.ll
new file mode 100644
index 0000000..176b54f
--- /dev/null
+++ b/test/Transforms/PlaceSafepoints/split-backedge.ll
@@ -0,0 +1,46 @@
+;; A very basic test to make sure that splitting the backedge keeps working
+;; RUN: opt -place-safepoints -spp-split-backedge=1 -S %s | FileCheck %s
+
+define void @test(i32, i1 %cond) gc "statepoint-example" {
+; CHECK-LABEL: @test
+; CHECK-LABEL: loop.loop_crit_edge
+; CHECK: gc.statepoint
+; CHECK-NEXT: br label %loop
+entry:
+  br label %loop
+
+loop:
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Test for the case where a single conditional branch jumps to two
+; different loop header blocks.  Since we're currently using LoopSimplfy
+; this doesn't hit the interesting case, but once we remove that, we need
+; to be sure this keeps working.
+define void @test2(i32, i1 %cond) gc "statepoint-example" {
+; CHECK-LABEL: @test2
+; CHECK-LABE: loop.loopexit.split
+; CHECK: gc.statepoint
+; CHECK-NEXT: br label %loop
+; CHECK-LABEL: loop2.loop2_crit_edge
+; CHECK: gc.statepoint
+; CHECK-NEXT: br label %loop2
+entry:
+  br label %loop
+
+loop:
+  br label %loop2
+
+loop2:
+  br i1 %cond, label %loop, label %loop2
+}
+
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+entry:
+  call void @do_safepoint()
+  ret void
+}
diff --git a/test/Transforms/PruneEH/2003-11-21-PHIUpdate.ll b/test/Transforms/PruneEH/2003-11-21-PHIUpdate.ll
deleted file mode 100644
index a010703..0000000
--- a/test/Transforms/PruneEH/2003-11-21-PHIUpdate.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: opt < %s -prune-eh -disable-output
-
-define internal void @callee() {
-	ret void
-}
-
-define i32 @caller() {
-; <label>:0
-	invoke void @callee( )
-			to label %E unwind label %E
-E:		; preds = %0, %0
-	%X = phi i32 [ 0, %0 ], [ 0, %0 ]		; <i32> [#uses=1]
-	ret i32 %X
-}
-
diff --git a/test/Transforms/PruneEH/recursivetest.ll b/test/Transforms/PruneEH/recursivetest.ll
index 724c7cf..bc002ae 100644
--- a/test/Transforms/PruneEH/recursivetest.ll
+++ b/test/Transforms/PruneEH/recursivetest.ll
@@ -6,6 +6,8 @@ define internal i32 @foo() {
 Normal:		; preds = %0
 	ret i32 12
 Except:		; preds = %0
+        landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
+                catch i8* null
 	ret i32 123
 }
 
@@ -15,6 +17,9 @@ define i32 @caller() {
 Normal:		; preds = %0
 	ret i32 0
 Except:		; preds = %0
+        landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
+                catch i8* null
 	ret i32 1
 }
 
+declare i32 @__gxx_personality_v0(...)
diff --git a/test/Transforms/PruneEH/seh-nounwind.ll b/test/Transforms/PruneEH/seh-nounwind.ll
new file mode 100644
index 0000000..4b69ae4
--- /dev/null
+++ b/test/Transforms/PruneEH/seh-nounwind.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -prune-eh < %s | FileCheck %s
+
+; Don't remove invokes of nounwind functions if the personality handles async
+; exceptions. The @div function in this test can fault, even though it can't
+; throw a synchronous exception.
+
+define i32 @div(i32 %n, i32 %d) nounwind {
+entry:
+  %div = sdiv i32 %n, %d
+  ret i32 %div
+}
+
+define i32 @main() nounwind {
+entry:
+  %call = invoke i32 @div(i32 10, i32 0)
+          to label %__try.cont unwind label %lpad
+
+lpad:
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          catch i8* null
+  br label %__try.cont
+
+__try.cont:
+  %retval.0 = phi i32 [ %call, %entry ], [ 0, %lpad ]
+  ret i32 %retval.0
+}
+
+; CHECK-LABEL: define i32 @main()
+; CHECK: invoke i32 @div(i32 10, i32 0)
+
+declare i32 @__C_specific_handler(...)
diff --git a/test/Transforms/PruneEH/simpletest.ll b/test/Transforms/PruneEH/simpletest.ll
index 77c429d..6154a80 100644
--- a/test/Transforms/PruneEH/simpletest.ll
+++ b/test/Transforms/PruneEH/simpletest.ll
@@ -15,5 +15,9 @@ Normal:		; preds = %0
 	ret i32 0
 
 Except:		; preds = %0
+        landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
+                catch i8* null
 	ret i32 1
 }
+
+declare i32 @__gxx_personality_v0(...)
diff --git a/test/Transforms/Reassociate/crash2.ll b/test/Transforms/Reassociate/crash2.ll
new file mode 100644
index 0000000..b51a88c
--- /dev/null
+++ b/test/Transforms/Reassociate/crash2.ll
@@ -0,0 +1,25 @@
+; RUN: opt -reassociate %s -S -o - | FileCheck %s
+
+; Reassociate pass used to crash on these example
+
+
+define float @undef1() {
+wrapper_entry:
+; CHECK-LABEL: @undef1
+; CHECK: ret float fadd (float undef, float fadd (float undef, float fadd (float fsub (float -0.000000e+00, float undef), float fsub (float -0.000000e+00, float undef))))
+  %0 = fadd fast float undef, undef
+  %1 = fsub fast float undef, %0
+  %2 = fadd fast float undef, %1
+  ret float %2
+}
+
+define void @undef2() {
+wrapper_entry:
+; CHECK-LABEL: @undef2
+; CHECK: unreachable
+  %0 = fadd fast float undef, undef
+  %1 = fadd fast float %0, 1.000000e+00
+  %2 = fsub fast float %0, %1
+  %3 = fmul fast float %2, 2.000000e+00
+  unreachable
+}
diff --git a/test/Transforms/Reassociate/min_int.ll b/test/Transforms/Reassociate/min_int.ll
new file mode 100644
index 0000000..52dab3a
--- /dev/null
+++ b/test/Transforms/Reassociate/min_int.ll
@@ -0,0 +1,13 @@
+; RUN: opt < %s -reassociate -dce -S | FileCheck %s
+
+; MIN_INT cannot be negated during reassociation
+
+define i32 @minint(i32 %i) {
+; CHECK:  %mul = mul i32 %i, -2147483648
+; CHECK-NEXT:  %add = add i32 %mul, 1
+; CHECK-NEXT:  ret i32 %add
+  %mul = mul i32 %i, -2147483648
+  %add = add i32 %mul, 1
+  ret i32 %add
+}
+
diff --git a/test/Transforms/RewriteStatepointsForGC/basics.ll b/test/Transforms/RewriteStatepointsForGC/basics.ll
new file mode 100644
index 0000000..ec522ab
--- /dev/null
+++ b/test/Transforms/RewriteStatepointsForGC/basics.ll
@@ -0,0 +1,88 @@
+; This is a collection of really basic tests for gc.statepoint rewriting.
+; RUN:  opt %s -rewrite-statepoints-for-gc -S | FileCheck %s
+
+declare void @foo()
+
+; Trivial relocation over a single call
+define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: @test1
+; CHECK-LABEL: entry:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+entry:
+  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  ret i8 addrspace(1)* %obj
+}
+
+; Two safepoints in a row (i.e. consistent liveness)
+define i8 addrspace(1)* @test2(i8 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: @test2
+; CHECK-LABEL: entry:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated1 = call coldcc i8 addrspace(1)*
+entry:
+  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  ret i8 addrspace(1)* %obj
+}
+
+; A simple derived pointer
+define i8 @test3(i8 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: entry:
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %derived.relocated = call coldcc i8 addrspace(1)*
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+; CHECK-NEXT: load i8 addrspace(1)* %derived.relocated
+; CHECK-NEXT: load i8 addrspace(1)* %obj.relocated
+entry:
+  %derived = getelementptr i8 addrspace(1)* %obj, i64 10
+  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+
+  %a = load i8 addrspace(1)* %derived
+  %b = load i8 addrspace(1)* %obj
+  %c = sub i8 %a, %b
+  ret i8 %c
+}
+
+; Tests to make sure we visit both the taken and untaken predeccessor 
+; of merge.  This was a bug in the dataflow liveness at one point.
+define i8 addrspace(1)* @test4(i1 %cmp, i8 addrspace(1)* %obj) gc "statepoint-example" {
+entry:
+  br i1 %cmp, label %taken, label %untaken
+
+taken:
+; CHECK-LABEL: taken:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  br label %merge
+
+untaken:
+; CHECK-LABEL: untaken:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated1 = call coldcc i8 addrspace(1)*
+  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  br label %merge
+
+merge:
+; CHECK-LABEL: merge:
+; CHECK-NEXT: %.0 = phi i8 addrspace(1)* [ %obj.relocated, %taken ], [ %obj.relocated1, %untaken ]
+; CHECK-NEXT: ret i8 addrspace(1)* %.0
+  ret i8 addrspace(1)* %obj
+}
+
+; When run over a function which doesn't opt in, should do nothing!
+define i8 addrspace(1)* @test5(i8 addrspace(1)* %obj) {
+; CHECK-LABEL: @test5
+; CHECK-LABEL: entry:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NOT: %obj.relocated = call coldcc i8 addrspace(1)*
+entry:
+  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @foo, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  ret i8 addrspace(1)* %obj
+}
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()*, i32, i32, ...)
diff --git a/test/Transforms/SLPVectorizer/X86/addsub.ll b/test/Transforms/SLPVectorizer/X86/addsub.ll
index 174d400..d082b07 100644
--- a/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ b/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -10,6 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @fb = common global [4 x float] zeroinitializer, align 16
 @fc = common global [4 x float] zeroinitializer, align 16
 @fa = common global [4 x float] zeroinitializer, align 16
+@fd = common global [4 x float] zeroinitializer, align 16
 
 ; CHECK-LABEL: @addsub
 ; CHECK: %5 = add nsw <4 x i32> %3, %4
@@ -177,5 +178,137 @@ entry:
   ret void
 }
 
+; Check vectorization of following code for float data type-
+;  fc[0] = fb[0]+fa[0]; //swapped fb and fa
+;  fc[1] = fa[1]-fb[1];
+;  fc[2] = fa[2]+fb[2];
+;  fc[3] = fa[3]-fb[3];
+
+; CHECK-LABEL: @reorder_alt
+; CHECK: %3 = fadd <4 x float> %1, %2
+; CHECK: %4 = fsub <4 x float> %1, %2
+; CHECK: %5 = shufflevector <4 x float> %3, <4 x float> %4, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+define void @reorder_alt() #0 {
+  %1 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+  %2 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+  %3 = fadd float %1, %2
+  store float %3, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+  %4 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+  %5 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+  %6 = fsub float %4, %5
+  store float %6, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+  %7 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+  %8 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+  %9 = fadd float %7, %8
+  store float %9, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+  %10 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+  %11 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+  %12 = fsub float %10, %11
+  store float %12, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+  ret void
+}
+
+; Check vectorization of following code for float data type-
+;  fc[0] = fa[0]+(fb[0]-fd[0]);
+;  fc[1] = fa[1]-(fb[1]+fd[1]);
+;  fc[2] = fa[2]+(fb[2]-fd[2]);
+;  fc[3] = fa[3]-(fd[3]+fb[3]); //swapped fd and fb 
+
+; CHECK-LABEL: @reorder_alt_subTree
+; CHECK: %4 = fsub <4 x float> %3, %2
+; CHECK: %5 = fadd <4 x float> %3, %2
+; CHECK: %6 = shufflevector <4 x float> %4, <4 x float> %5, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK: %7 = fadd <4 x float> %1, %6
+; CHECK: %8 = fsub <4 x float> %1, %6
+; CHECK: %9 = shufflevector <4 x float> %7, <4 x float> %8, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+define void @reorder_alt_subTree() #0 {
+  %1 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+  %3 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 0), align 4
+  %4 = fsub float %2, %3
+  %5 = fadd float %1, %4
+  store float %5, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+  %6 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+  %7 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+  %8 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 1), align 4
+  %9 = fadd float %7, %8
+  %10 = fsub float %6, %9
+  store float %10, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+  %11 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+  %12 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+  %13 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 2), align 4
+  %14 = fsub float %12, %13
+  %15 = fadd float %11, %14
+  store float %15, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+  %16 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+  %17 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 3), align 4
+  %18 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+  %19 = fadd float %17, %18
+  %20 = fsub float %16, %19
+  store float %20, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+  ret void
+}
+
+; Check vectorization of following code for double data type-
+;  c[0] = (a[0]+b[0])-d[0];
+;  c[1] = d[1]+(a[1]+b[1]); //swapped d[1] and (a[1]+b[1]) 
+
+; CHECK-LABEL: @reorder_alt_rightsubTree
+; CHECK: fadd <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: shufflevector <2 x double> 
+define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %d) {
+  %1 = load double* %a
+  %2 = load double* %b
+  %3 = fadd double %1, %2
+  %4 = load double* %d
+  %5 = fsub double %3, %4
+  store double %5, double* %c
+  %6 = getelementptr inbounds double* %d, i64 1
+  %7 = load double* %6
+  %8 = getelementptr inbounds double* %a, i64 1
+  %9 = load double* %8
+  %10 = getelementptr inbounds double* %b, i64 1
+  %11 = load double* %10
+  %12 = fadd double %9, %11
+  %13 = fadd double %7, %12
+  %14 = getelementptr inbounds double* %c, i64 1
+  store double %13, double* %14
+  ret void
+}
+
+; Dont vectorization of following code for float data type as sub is not commutative-
+;  fc[0] = fb[0]+fa[0];
+;  fc[1] = fa[1]-fb[1];
+;  fc[2] = fa[2]+fb[2];
+;  fc[3] = fb[3]-fa[3];
+;  In the above code we can swap the 1st and 2nd operation as fadd is commutative
+;  but not 2nd or 4th as fsub is not commutative. 
+
+; CHECK-LABEL: @no_vec_shuff_reorder
+; CHECK-NOT: fadd <4 x float>
+; CHECK-NOT: fsub <4 x float>
+; CHECK-NOT: shufflevector
+define void @no_vec_shuff_reorder() #0 {
+  %1 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+  %2 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+  %3 = fadd float %1, %2
+  store float %3, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+  %4 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+  %5 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+  %6 = fsub float %4, %5
+  store float %6, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+  %7 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+  %8 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+  %9 = fadd float %7, %8
+  store float %9, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+  %10 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+  %11 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+  %12 = fsub float %10, %11
+  store float %12, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+  ret void
+}
+
+
 attributes #0 = { nounwind }
 
diff --git a/test/Transforms/SLPVectorizer/X86/atomics.ll b/test/Transforms/SLPVectorizer/X86/atomics.ll
new file mode 100644
index 0000000..6cb322e
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/atomics.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S |FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@x = global [4 x i32] zeroinitializer, align 16
+@a = global [4 x i32] zeroinitializer, align 16
+
+; The SLPVectorizer should not vectorize atomic stores and it should not
+; schedule regular stores around atomic stores.
+
+; CHECK-LABEL: test
+; CHECK: store i32
+; CHECK: store atomic i32
+; CHECK: store i32
+; CHECK: store atomic i32
+; CHECK: store i32
+; CHECK: store atomic i32
+; CHECK: store i32
+; CHECK: store atomic i32
+define void @test() {
+entry:
+  store i32 0, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 0), align 16
+  store atomic i32 0, i32* getelementptr inbounds ([4 x i32]* @x, i64 0, i64 0) release, align 16
+  store i32 0, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 1), align 4
+  store atomic i32 1, i32* getelementptr inbounds ([4 x i32]* @x, i64 0, i64 1) release, align 4
+  store i32 0, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 2), align 8
+  store atomic i32 2, i32* getelementptr inbounds ([4 x i32]* @x, i64 0, i64 2) release, align 8
+  store i32 0, i32* getelementptr inbounds ([4 x i32]* @a, i64 0, i64 3), align 4
+  store atomic i32 3, i32* getelementptr inbounds ([4 x i32]* @x, i64 0, i64 3) release, align 4
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/bad_types.ll b/test/Transforms/SLPVectorizer/X86/bad_types.ll
new file mode 100644
index 0000000..38ed18d
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/bad_types.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test1(x86_mmx %a, x86_mmx %b, i64* %ptr) {
+; Ensure we can handle x86_mmx values which are primitive and can be bitcast
+; with integer types but can't be put into a vector.
+;
+; CHECK-LABEL: @test1
+; CHECK:         store i64
+; CHECK:         store i64
+; CHECK:         ret void
+entry:
+  %a.cast = bitcast x86_mmx %a to i64
+  %b.cast = bitcast x86_mmx %b to i64
+  %a.and = and i64 %a.cast, 42
+  %b.and = and i64 %b.cast, 42
+  %gep = getelementptr i64* %ptr, i32 1
+  store i64 %a.and, i64* %ptr
+  store i64 %b.and, i64* %gep
+  ret void
+}
+
+define void @test2(x86_mmx %a, x86_mmx %b) {
+; Same as @test1 but using phi-input vectorization instead of store
+; vectorization.
+;
+; CHECK-LABEL: @test2
+; CHECK:         and i64
+; CHECK:         and i64
+; CHECK:         ret void
+entry:
+  br i1 undef, label %if.then, label %exit
+
+if.then:
+  %a.cast = bitcast x86_mmx %a to i64
+  %b.cast = bitcast x86_mmx %b to i64
+  %a.and = and i64 %a.cast, 42
+  %b.and = and i64 %b.cast, 42
+  br label %exit
+
+exit:
+  %a.phi = phi i64 [ 0, %entry ], [ %a.and, %if.then ]
+  %b.phi = phi i64 [ 0, %entry ], [ %b.and, %if.then ]
+  tail call void @f(i64 %a.phi, i64 %b.phi)
+  ret void
+}
+
+declare void @f(i64, i64)
diff --git a/test/Transforms/SLPVectorizer/X86/consecutive-access.ll b/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
index f4f112f..aa59429 100644
--- a/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
+++ b/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
@@ -172,4 +172,4 @@ attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-po
 
 !llvm.ident = !{!0}
 
-!0 = metadata !{metadata !"clang version 3.5.0 "}
+!0 = !{!"clang version 3.5.0 "}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
new file mode 100644
index 0000000..18a96e5
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
@@ -0,0 +1,56 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %acc1.056 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ]
+  %s1.055 = phi float [ 0.000000e+00, %entry ], [ %cond.i40, %for.body ]
+  %s0.054 = phi float [ 0.000000e+00, %entry ], [ %cond.i44, %for.body ]
+  %arrayidx = getelementptr inbounds float* %src, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds float* %dest, i64 %indvars.iv
+  store float %acc1.056, float* %arrayidx2, align 4
+  %add = fadd float %s0.054, %0
+  %add3 = fadd float %s1.055, %0
+  %mul = fmul float %s0.054, 0.000000e+00
+  %add4 = fadd float %mul, %add3
+  %mul5 = fmul float %s1.055, 0.000000e+00
+  %add6 = fadd float %mul5, %add
+  %cmp.i = fcmp olt float %add6, 1.000000e+00
+  %cond.i = select i1 %cmp.i, float %add6, float 1.000000e+00
+  %cmp.i51 = fcmp olt float %cond.i, -1.000000e+00
+  %cmp.i49 = fcmp olt float %add4, 1.000000e+00
+  %cond.i50 = select i1 %cmp.i49, float %add4, float 1.000000e+00
+  %cmp.i47 = fcmp olt float %cond.i50, -1.000000e+00
+  %cond.i.op = fmul float %cond.i, 0.000000e+00
+  %mul10 = select i1 %cmp.i51, float -0.000000e+00, float %cond.i.op
+  %cond.i50.op = fmul float %cond.i50, 0.000000e+00
+  %mul11 = select i1 %cmp.i47, float -0.000000e+00, float %cond.i50.op
+  %add13 = fadd float %mul10, %mul11
+
+  ; The SLPVectorizer crashed in vectorizeChainsInBlock() because it tried
+  ; to access the second operand of the following cmp after the cmp itself
+  ; was already vectorized and deleted.
+  %cmp.i45 = fcmp olt float %add13, 1.000000e+00
+
+  %cond.i46 = select i1 %cmp.i45, float %add13, float 1.000000e+00
+  %cmp.i43 = fcmp olt float %cond.i46, -1.000000e+00
+  %cond.i44 = select i1 %cmp.i43, float -1.000000e+00, float %cond.i46
+  %cmp.i41 = fcmp olt float %mul11, 1.000000e+00
+  %cond.i42 = select i1 %cmp.i41, float %mul11, float 1.000000e+00
+  %cmp.i39 = fcmp olt float %cond.i42, -1.000000e+00
+  %cond.i40 = select i1 %cmp.i39, float -1.000000e+00, float %cond.i42
+  %exitcond = icmp eq i64 %indvars.iv.next, 32
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll b/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll
index dddc1be..e6cc0f7 100644
--- a/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll
+++ b/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll
@@ -43,5 +43,5 @@ return:
 
 declare i32 @_xfn(<2 x double>) #4
 
-!3 = metadata !{metadata !"int", metadata !4, i64 0}
-!4 = metadata !{metadata !3, metadata !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!3, !3, i64 0}
diff --git a/test/Transforms/SLPVectorizer/X86/debug_info.ll b/test/Transforms/SLPVectorizer/X86/debug_info.ll
index 1046087..21f51d7 100644
--- a/test/Transforms/SLPVectorizer/X86/debug_info.ll
+++ b/test/Transforms/SLPVectorizer/X86/debug_info.ll
@@ -18,16 +18,16 @@ target triple = "x86_64-apple-macosx10.7.0"
 ;CHECK: load <2 x double>* {{.*}}, !dbg ![[LOC]]
 ;CHECK: store <2 x double> {{.*}}, !dbg ![[LOC2:[0-9]+]]
 ;CHECK: ret
-;CHECK: ![[LOC]] = metadata !{i32 4, i32 0,
-;CHECK: ![[LOC2]] = metadata !{i32 7, i32 0,
+;CHECK: ![[LOC]] = !MDLocation(line: 4, scope:
+;CHECK: ![[LOC2]] = !MDLocation(line: 7, scope:
 
 define i32 @depth(double* nocapture %A, i32 %m) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{double* %A}, i64 0, metadata !12, metadata !{}), !dbg !19
-  tail call void @llvm.dbg.value(metadata !{i32 %m}, i64 0, metadata !13, metadata !{}), !dbg !19
-  tail call void @llvm.dbg.value(metadata !20, i64 0, metadata !14, metadata !{}), !dbg !21
-  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !15, metadata !{}), !dbg !21
-  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !16, metadata !{}), !dbg !23
+  tail call void @llvm.dbg.value(metadata double* %A, i64 0, metadata !12, metadata !{}), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 %m, i64 0, metadata !13, metadata !{}), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 00, i64 0, metadata !14, metadata !{}), !dbg !21
+  tail call void @llvm.dbg.value(metadata i32 02, i64 0, metadata !15, metadata !{}), !dbg !21
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !{}), !dbg !23
   %cmp8 = icmp sgt i32 %m, 0, !dbg !23
   br i1 %cmp8, label %for.body.lr.ph, label %for.end, !dbg !23
 
@@ -57,33 +57,33 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18, !32}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 187335) (llvm/trunk 187335:187340M)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/nadav/file.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"file.c", metadata !"/Users/nadav"}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00depth\00depth\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, i32 (double*, i32)* @depth, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 1] [def] [depth]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/Users/nadav/file.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !9, metadata !8}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
-!10 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!11 = metadata !{metadata !12, metadata !13, metadata !14, metadata !15, metadata !16}
-!12 = metadata !{metadata !"0x101\00A\0016777217\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [A] [line 1]
-!13 = metadata !{metadata !"0x101\00m\0033554433\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [m] [line 1]
-!14 = metadata !{metadata !"0x100\00y0\002\000", metadata !4, metadata !5, metadata !10} ; [ DW_TAG_auto_variable ] [y0] [line 2]
-!15 = metadata !{metadata !"0x100\00y1\002\000", metadata !4, metadata !5, metadata !10} ; [ DW_TAG_auto_variable ] [y1] [line 2]
-!16 = metadata !{metadata !"0x100\00i\003\000", metadata !17, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3]
-!17 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/Users/nadav/file.c]
-!18 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!19 = metadata !{i32 1, i32 0, metadata !4, null}
-!20 = metadata !{double 0.000000e+00}
-!21 = metadata !{i32 2, i32 0, metadata !4, null}
-!22 = metadata !{double 1.000000e+00}
-!23 = metadata !{i32 3, i32 0, metadata !17, null}
-!24 = metadata !{i32 4, i32 0, metadata !25, null}
-!25 = metadata !{metadata !"0xb\003\000\001", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [/Users/nadav/file.c]
-!29 = metadata !{i32 5, i32 0, metadata !25, null}
-!30 = metadata !{i32 7, i32 0, metadata !4, null}
-!31 = metadata !{i32 8, i32 0, metadata !4, null}
-!32 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 (trunk 187335) (llvm/trunk 187335:187340M)\001\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/Users/nadav/file.c] [DW_LANG_C99]
+!1 = !{!"file.c", !"/Users/nadav"}
+!2 = !{i32 0}
+!3 = !{!4}
+!4 = !{!"0x2e\00depth\00depth\00\001\000\001\000\006\00256\001\001", !1, !5, !6, null, i32 (double*, i32)* @depth, null, null, !11} ; [ DW_TAG_subprogram ] [line 1] [def] [depth]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/Users/nadav/file.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !9, !8}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0xf\00\000\0064\0064\000\000", null, null, !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
+!10 = !{!"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!11 = !{!12, !13, !14, !15, !16}
+!12 = !{!"0x101\00A\0016777217\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [A] [line 1]
+!13 = !{!"0x101\00m\0033554433\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [m] [line 1]
+!14 = !{!"0x100\00y0\002\000", !4, !5, !10} ; [ DW_TAG_auto_variable ] [y0] [line 2]
+!15 = !{!"0x100\00y1\002\000", !4, !5, !10} ; [ DW_TAG_auto_variable ] [y1] [line 2]
+!16 = !{!"0x100\00i\003\000", !17, !5, !8} ; [ DW_TAG_auto_variable ] [i] [line 3]
+!17 = !{!"0xb\003\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [/Users/nadav/file.c]
+!18 = !{i32 2, !"Dwarf Version", i32 2}
+!19 = !MDLocation(line: 1, scope: !4)
+!20 = !{double 0.000000e+00}
+!21 = !MDLocation(line: 2, scope: !4)
+!22 = !{double 1.000000e+00}
+!23 = !MDLocation(line: 3, scope: !17)
+!24 = !MDLocation(line: 4, scope: !25)
+!25 = !{!"0xb\003\000\001", !1, !17} ; [ DW_TAG_lexical_block ] [/Users/nadav/file.c]
+!29 = !MDLocation(line: 5, scope: !25)
+!30 = !MDLocation(line: 7, scope: !4)
+!31 = !MDLocation(line: 8, scope: !4)
+!32 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/SLPVectorizer/X86/metadata.ll b/test/Transforms/SLPVectorizer/X86/metadata.ll
index 5bd2fa4..e021cca 100644
--- a/test/Transforms/SLPVectorizer/X86/metadata.ll
+++ b/test/Transforms/SLPVectorizer/X86/metadata.ll
@@ -51,11 +51,11 @@ entry:
   ret void
 }
 
-;CHECK-DAG: ![[TBAA]] = metadata !{metadata [[TYPEC:!.*]], metadata [[TYPEC]], i64 0}
-;CHECK-DAG: ![[FP1]] = metadata !{float 5.000000e+00}
-;CHECK-DAG: ![[FP2]] = metadata !{float 2.500000e+00}
-!0 = metadata !{ float 5.0 }
-!1 = metadata !{ float 2.5 }
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"omnipotent char", metadata !2}
-!4 = metadata !{metadata !"double", metadata !3}
+;CHECK-DAG: ![[TBAA]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
+;CHECK-DAG: ![[FP1]] = !{float 5.000000e+00}
+;CHECK-DAG: ![[FP2]] = !{float 2.500000e+00}
+!0 = !{ float 5.0 }
+!1 = !{ float 2.5 }
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"omnipotent char", !2}
+!4 = !{!"double", !3}
diff --git a/test/Transforms/SLPVectorizer/X86/operandorder.ll b/test/Transforms/SLPVectorizer/X86/operandorder.ll
index c5322a8..cd446f0 100644
--- a/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -232,3 +232,113 @@ for.body3:
 for.end:
   ret void
 }
+
+; Check vectorization of following code for double data type-
+;  c[0] = a[0]+b[0];
+;  c[1] = b[1]+a[1]; // swapped b[1] and a[1]
+
+; CHECK-LABEL: load_reorder_double
+; CHECK: load <2 x double>*
+; CHECK: fadd <2 x double>
+define void @load_reorder_double(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b){
+  %1 = load double* %a
+  %2 = load double* %b
+  %3 = fadd double %1, %2
+  store double %3, double* %c
+  %4 = getelementptr inbounds double* %b, i64 1
+  %5 = load double* %4
+  %6 = getelementptr inbounds double* %a, i64 1
+  %7 = load double* %6
+  %8 = fadd double %5, %7
+  %9 = getelementptr inbounds double* %c, i64 1
+  store double %8, double* %9
+  ret void
+}
+
+; Check vectorization of following code for float data type-
+;  c[0] = a[0]+b[0];
+;  c[1] = b[1]+a[1]; // swapped b[1] and a[1]
+;  c[2] = a[2]+b[2];
+;  c[3] = a[3]+b[3];
+
+; CHECK-LABEL: load_reorder_float
+; CHECK: load <4 x float>*
+; CHECK: fadd <4 x float>
+define void @load_reorder_float(float* nocapture %c, float* noalias nocapture readonly %a, float* noalias nocapture readonly %b){
+  %1 = load float* %a
+  %2 = load float* %b
+  %3 = fadd float %1, %2
+  store float %3, float* %c
+  %4 = getelementptr inbounds float* %b, i64 1
+  %5 = load float* %4
+  %6 = getelementptr inbounds float* %a, i64 1
+  %7 = load float* %6
+  %8 = fadd float %5, %7
+  %9 = getelementptr inbounds float* %c, i64 1
+  store float %8, float* %9
+  %10 = getelementptr inbounds float* %a, i64 2
+  %11 = load float* %10
+  %12 = getelementptr inbounds float* %b, i64 2
+  %13 = load float* %12
+  %14 = fadd float %11, %13
+  %15 = getelementptr inbounds float* %c, i64 2
+  store float %14, float* %15
+  %16 = getelementptr inbounds float* %a, i64 3
+  %17 = load float* %16
+  %18 = getelementptr inbounds float* %b, i64 3
+  %19 = load float* %18
+  %20 = fadd float %17, %19
+  %21 = getelementptr inbounds float* %c, i64 3
+  store float %20, float* %21
+  ret void
+}
+
+; Check we properly reorder the below code so that it gets vectorized optimally-
+; a[0] = (b[0]+c[0])+d[0];
+; a[1] = d[1]+(b[1]+c[1]);
+; a[2] = (b[2]+c[2])+d[2];
+; a[3] = (b[3]+c[3])+d[3];
+
+; CHECK-LABEL: opcode_reorder
+; CHECK: load <4 x float>*
+; CHECK: fadd <4 x float>
+define void @opcode_reorder(float* noalias nocapture %a, float* noalias nocapture readonly %b, 
+                            float* noalias nocapture readonly %c,float* noalias nocapture readonly %d){
+  %1 = load float* %b
+  %2 = load float* %c
+  %3 = fadd float %1, %2
+  %4 = load float* %d
+  %5 = fadd float %3, %4
+  store float %5, float* %a
+  %6 = getelementptr inbounds float* %d, i64 1
+  %7 = load float* %6
+  %8 = getelementptr inbounds float* %b, i64 1
+  %9 = load float* %8
+  %10 = getelementptr inbounds float* %c, i64 1
+  %11 = load float* %10
+  %12 = fadd float %9, %11
+  %13 = fadd float %7, %12
+  %14 = getelementptr inbounds float* %a, i64 1
+  store float %13, float* %14
+  %15 = getelementptr inbounds float* %b, i64 2
+  %16 = load float* %15
+  %17 = getelementptr inbounds float* %c, i64 2
+  %18 = load float* %17
+  %19 = fadd float %16, %18
+  %20 = getelementptr inbounds float* %d, i64 2
+  %21 = load float* %20
+  %22 = fadd float %19, %21
+  %23 = getelementptr inbounds float* %a, i64 2
+  store float %22, float* %23
+  %24 = getelementptr inbounds float* %b, i64 3
+  %25 = load float* %24
+  %26 = getelementptr inbounds float* %c, i64 3
+  %27 = load float* %26
+  %28 = fadd float %25, %27
+  %29 = getelementptr inbounds float* %d, i64 3
+  %30 = load float* %29
+  %31 = fadd float %28, %30
+  %32 = getelementptr inbounds float* %a, i64 3
+  store float %31, float* %32
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/pr16899.ll b/test/Transforms/SLPVectorizer/X86/pr16899.ll
index 8631bc9..c642f3c 100644
--- a/test/Transforms/SLPVectorizer/X86/pr16899.ll
+++ b/test/Transforms/SLPVectorizer/X86/pr16899.ll
@@ -23,9 +23,9 @@ do.body:                                          ; preds = %do.body, %entry
 
 attributes #0 = { noreturn nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"int", metadata !1}
-!4 = metadata !{metadata !0, metadata !0, i64 0}
-!5 = metadata !{metadata !3, metadata !3, i64 0}
+!0 = !{!"any pointer", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"int", !1}
+!4 = !{!0, !0, i64 0}
+!5 = !{!3, !3, i64 0}
diff --git a/test/Transforms/SROA/alignment.ll b/test/Transforms/SROA/alignment.ll
index 5fa78766..4f4e40c 100644
--- a/test/Transforms/SROA/alignment.ll
+++ b/test/Transforms/SROA/alignment.ll
@@ -85,15 +85,18 @@ entry:
 }
 
 define void @test5() {
-; Test that we preserve underaligned loads and stores when splitting.
+; Test that we preserve underaligned loads and stores when splitting. The use
+; of volatile in this test case is just to force the loads and stores to not be
+; split or promoted out of existence.
+;
 ; CHECK-LABEL: @test5(
 ; CHECK: alloca [9 x i8]
 ; CHECK: alloca [9 x i8]
 ; CHECK: store volatile double 0.0{{.*}}, double* %{{.*}}, align 1
-; CHECK: load i16* %{{.*}}, align 1
+; CHECK: load volatile i16* %{{.*}}, align 1
 ; CHECK: load double* %{{.*}}, align 1
 ; CHECK: store volatile double %{{.*}}, double* %{{.*}}, align 1
-; CHECK: load i16* %{{.*}}, align 1
+; CHECK: load volatile i16* %{{.*}}, align 1
 ; CHECK: ret void
 
 entry:
@@ -103,7 +106,7 @@ entry:
   store volatile double 0.0, double* %ptr1, align 1
   %weird_gep1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 7
   %weird_cast1 = bitcast i8* %weird_gep1 to i16*
-  %weird_load1 = load i16* %weird_cast1, align 1
+  %weird_load1 = load volatile i16* %weird_cast1, align 1
 
   %raw2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 9
   %ptr2 = bitcast i8* %raw2 to double*
@@ -111,7 +114,7 @@ entry:
   store volatile double %d1, double* %ptr2, align 1
   %weird_gep2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 16
   %weird_cast2 = bitcast i8* %weird_gep2 to i16*
-  %weird_load2 = load i16* %weird_cast2, align 1
+  %weird_load2 = load volatile i16* %weird_cast2, align 1
 
   ret void
 }
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
index dc2b165..e3f762a 100644
--- a/test/Transforms/SROA/basictest.ll
+++ b/test/Transforms/SROA/basictest.ll
@@ -1440,3 +1440,158 @@ entry:
   ret void
 }
 
+define float @test25() {
+; Check that we split up stores in order to promote the smaller SSA values.. These types
+; of patterns can arise because LLVM maps small memcpy's to integer load and
+; stores. If we get a memcpy of an aggregate (such as C and C++ frontends would
+; produce, but so might any language frontend), this will in many cases turn into
+; an integer load and store. SROA needs to be extremely powerful to correctly
+; handle these cases and form splitable and promotable SSA values.
+;
+; CHECK-LABEL: @test25(
+; CHECK-NOT: alloca
+; CHECK: %[[F1:.*]] = bitcast i32 0 to float
+; CHECK: %[[F2:.*]] = bitcast i32 1065353216 to float
+; CHECK: %[[SUM:.*]] = fadd float %[[F1]], %[[F2]]
+; CHECK: ret float %[[SUM]]
+
+entry:
+  %a = alloca i64
+  %b = alloca i64
+  %a.cast = bitcast i64* %a to [2 x float]*
+  %a.gep1 = getelementptr [2 x float]* %a.cast, i32 0, i32 0
+  %a.gep2 = getelementptr [2 x float]* %a.cast, i32 0, i32 1
+  %b.cast = bitcast i64* %b to [2 x float]*
+  %b.gep1 = getelementptr [2 x float]* %b.cast, i32 0, i32 0
+  %b.gep2 = getelementptr [2 x float]* %b.cast, i32 0, i32 1
+  store float 0.0, float* %a.gep1
+  store float 1.0, float* %a.gep2
+  %v = load i64* %a
+  store i64 %v, i64* %b
+  %f1 = load float* %b.gep1
+  %f2 = load float* %b.gep2
+  %ret = fadd float %f1, %f2
+  ret float %ret
+}
+
+@complex1 = external global [2 x float]
+@complex2 = external global [2 x float]
+
+define void @test26() {
+; Test a case of splitting up loads and stores against a globals.
+;
+; CHECK-LABEL: @test26(
+; CHECK-NOT: alloca
+; CHECK: %[[L1:.*]] = load i32* bitcast
+; CHECK: %[[L2:.*]] = load i32* bitcast
+; CHECK: %[[F1:.*]] = bitcast i32 %[[L1]] to float
+; CHECK: %[[F2:.*]] = bitcast i32 %[[L2]] to float
+; CHECK: %[[SUM:.*]] = fadd float %[[F1]], %[[F2]]
+; CHECK: %[[C1:.*]] = bitcast float %[[SUM]] to i32
+; CHECK: %[[C2:.*]] = bitcast float %[[SUM]] to i32
+; CHECK: store i32 %[[C1]], i32* bitcast
+; CHECK: store i32 %[[C2]], i32* bitcast
+; CHECK: ret void
+
+entry:
+  %a = alloca i64
+  %a.cast = bitcast i64* %a to [2 x float]*
+  %a.gep1 = getelementptr [2 x float]* %a.cast, i32 0, i32 0
+  %a.gep2 = getelementptr [2 x float]* %a.cast, i32 0, i32 1
+  %v1 = load i64* bitcast ([2 x float]* @complex1 to i64*)
+  store i64 %v1, i64* %a
+  %f1 = load float* %a.gep1
+  %f2 = load float* %a.gep2
+  %sum = fadd float %f1, %f2
+  store float %sum, float* %a.gep1
+  store float %sum, float* %a.gep2
+  %v2 = load i64* %a
+  store i64 %v2, i64* bitcast ([2 x float]* @complex2 to i64*)
+  ret void
+}
+
+define float @test27() {
+; Another, more complex case of splittable i64 loads and stores. This example
+; is a particularly challenging one because the load and store both point into
+; the alloca SROA is processing, and they overlap but at an offset.
+;
+; CHECK-LABEL: @test27(
+; CHECK-NOT: alloca
+; CHECK: %[[F1:.*]] = bitcast i32 0 to float
+; CHECK: %[[F2:.*]] = bitcast i32 1065353216 to float
+; CHECK: %[[SUM:.*]] = fadd float %[[F1]], %[[F2]]
+; CHECK: ret float %[[SUM]]
+
+entry:
+  %a = alloca [12 x i8]
+  %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+  %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+  %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+  %iptr1 = bitcast i8* %gep1 to i64*
+  %iptr2 = bitcast i8* %gep2 to i64*
+  %fptr1 = bitcast i8* %gep1 to float*
+  %fptr2 = bitcast i8* %gep2 to float*
+  %fptr3 = bitcast i8* %gep3 to float*
+  store float 0.0, float* %fptr1
+  store float 1.0, float* %fptr2
+  %v = load i64* %iptr1
+  store i64 %v, i64* %iptr2
+  %f1 = load float* %fptr2
+  %f2 = load float* %fptr3
+  %ret = fadd float %f1, %f2
+  ret float %ret
+}
+
+define i32 @PR22093() {
+; Test that we don't try to pre-split a splittable store of a splittable but
+; not pre-splittable load over the same alloca. We "handle" this case when the
+; load is unsplittable but unrelated to this alloca by just generating extra
+; loads without touching the original, but when the original load was out of
+; this alloca we need to handle it specially to ensure the splits line up
+; properly for rewriting.
+;
+; CHECK-LABEL: @PR22093(
+; CHECK-NOT: alloca
+; CHECK: alloca i16
+; CHECK-NOT: alloca
+; CHECK: store volatile i16
+
+entry:
+  %a = alloca i32
+  %a.cast = bitcast i32* %a to i16*
+  store volatile i16 42, i16* %a.cast
+  %load = load i32* %a
+  store i32 %load, i32* %a
+  ret i32 %load
+}
+
+define void @PR22093.2() {
+; Another way that we end up being unable to split a particular set of loads
+; and stores can even have ordering importance. Here we have a load which is
+; pre-splittable by itself, and the first store is also compatible. But the
+; second store of the load makes the load unsplittable because of a mismatch of
+; splits. Because this makes the load unsplittable, we also have to go back and
+; remove the first store from the presplit candidates as its load won't be
+; presplit.
+;
+; CHECK-LABEL: @PR22093.2(
+; CHECK-NOT: alloca
+; CHECK: alloca i16
+; CHECK-NEXT: alloca i8
+; CHECK-NOT: alloca
+; CHECK: store volatile i16
+; CHECK: store volatile i8
+
+entry:
+  %a = alloca i64
+  %a.cast1 = bitcast i64* %a to i32*
+  %a.cast2 = bitcast i64* %a to i16*
+  store volatile i16 42, i16* %a.cast2
+  %load = load i32* %a.cast1
+  store i32 %load, i32* %a.cast1
+  %a.gep1 = getelementptr i32* %a.cast1, i32 1
+  %a.cast3 = bitcast i32* %a.gep1 to i8*
+  store volatile i8 13, i8* %a.cast3
+  store i32 %load, i32* %a.gep1
+  ret void
+}
diff --git a/test/Transforms/SROA/vector-promotion.ll b/test/Transforms/SROA/vector-promotion.ll
index 830a22a..c20c635 100644
--- a/test/Transforms/SROA/vector-promotion.ll
+++ b/test/Transforms/SROA/vector-promotion.ll
@@ -604,3 +604,22 @@ entry:
   ret <2 x float> %result
 ; CHECK-NEXT: ret <2 x float> %[[V4]]
 }
+
+define <4 x float> @test12() {
+; CHECK-LABEL: @test12(
+  %a = alloca <3 x i32>, align 16
+; CHECK-NOT: alloca
+
+  %cast1 = bitcast <3 x i32>* %a to <4 x i32>*
+  store <4 x i32> undef, <4 x i32>* %cast1, align 16
+; CHECK-NOT: store
+
+  %cast2 = bitcast <3 x i32>* %a to <3 x float>*
+  %cast3 = bitcast <3 x float>* %cast2 to <4 x float>*
+  %vec = load <4 x float>* %cast3
+; CHECK-NOT: load
+
+; CHECK:      %[[ret:.*]] = bitcast <4 x i32> undef to <4 x float>
+; CHECK-NEXT: ret <4 x float> %[[ret]]
+  ret <4 x float> %vec
+}
diff --git a/test/Transforms/SampleProfile/branch.ll b/test/Transforms/SampleProfile/branch.ll
index e646609..6391fc5 100644
--- a/test/Transforms/SampleProfile/branch.ll
+++ b/test/Transforms/SampleProfile/branch.ll
@@ -32,8 +32,8 @@ define i32 @main(i32 %argc, i8** nocapture readonly %argv) #0 {
 ; CHECK: Printing analysis 'Branch Probability Analysis' for function 'main':
 
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !13, metadata !{}), !dbg !27
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !14, metadata !{}), !dbg !27
+  tail call void @llvm.dbg.value(metadata i32 %argc, i64 0, metadata !13, metadata !{}), !dbg !27
+  tail call void @llvm.dbg.value(metadata i8** %argv, i64 0, metadata !14, metadata !{}), !dbg !27
   %cmp = icmp slt i32 %argc, 2, !dbg !28
   br i1 %cmp, label %return, label %if.end, !dbg !28
 ; CHECK: edge entry -> return probability is 1 / 2 = 50%
@@ -43,7 +43,7 @@ if.end:                                           ; preds = %entry
   %arrayidx = getelementptr inbounds i8** %argv, i64 1, !dbg !30
   %0 = load i8** %arrayidx, align 8, !dbg !30, !tbaa !31
   %call = tail call i32 @atoi(i8* %0) #4, !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !17, metadata !{}), !dbg !30
+  tail call void @llvm.dbg.value(metadata i32 %call, i64 0, metadata !17, metadata !{}), !dbg !30
   %cmp1 = icmp sgt i32 %call, 100, !dbg !35
   br i1 %cmp1, label %for.body, label %if.end6, !dbg !35
 ; CHECK: edge if.end -> for.body probability is 1 / 2 = 50%
@@ -55,14 +55,14 @@ for.body:                                         ; preds = %if.end, %for.body
   %add = fadd double %s.015, 3.049000e+00, !dbg !36
   %conv = sitofp i32 %u.016 to double, !dbg !36
   %add4 = fadd double %add, %conv, !dbg !36
-  tail call void @llvm.dbg.value(metadata !{double %add4}, i64 0, metadata !18, metadata !{}), !dbg !36
+  tail call void @llvm.dbg.value(metadata double %add4, i64 0, metadata !18, metadata !{}), !dbg !36
   %div = fdiv double 3.940000e+00, %s.015, !dbg !37
   %mul = fmul double %div, 3.200000e-01, !dbg !37
   %add5 = fadd double %add4, %mul, !dbg !37
   %sub = fsub double %add4, %add5, !dbg !37
-  tail call void @llvm.dbg.value(metadata !{double %sub}, i64 0, metadata !18, metadata !{}), !dbg !37
+  tail call void @llvm.dbg.value(metadata double %sub, i64 0, metadata !18, metadata !{}), !dbg !37
   %inc = add nsw i32 %u.016, 1, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !21, metadata !{}), !dbg !38
+  tail call void @llvm.dbg.value(metadata i32 %inc, i64 0, metadata !21, metadata !{}), !dbg !38
   %exitcond = icmp eq i32 %inc, %call, !dbg !38
   br i1 %exitcond, label %if.end6, label %for.body, !dbg !38
 ; CHECK: edge for.body -> if.end6 probability is 1 / 10227 = 0.00977804
@@ -98,46 +98,46 @@ attributes #4 = { nounwind readonly }
 !llvm.module.flags = !{!25, !42}
 !llvm.ident = !{!26}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 192896) (llvm/trunk 192895)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./branch.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"branch.cc", metadata !"."}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00main\00main\00\004\000\001\000\006\00256\001\004", metadata !1, metadata !5, metadata !6, null, i32 (i32, i8**)* @main, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 4] [def] [main]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./branch.cc]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8, metadata !8, metadata !9}
-!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!11 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !17, metadata !18, metadata !21, metadata !23}
-!13 = metadata !{metadata !"0x101\00argc\0016777220\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [argc] [line 4]
-!14 = metadata !{metadata !"0x101\00argv\0033554436\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [argv] [line 4]
-!15 = metadata !{metadata !"0x100\00result\007\000", metadata !4, metadata !5, metadata !16} ; [ DW_TAG_auto_variable ] [result] [line 7]
-!16 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!17 = metadata !{metadata !"0x100\00limit\008\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [limit] [line 8]
-!18 = metadata !{metadata !"0x100\00s\0010\000", metadata !19, metadata !5, metadata !16} ; [ DW_TAG_auto_variable ] [s] [line 10]
-!19 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !20} ; [ DW_TAG_lexical_block ] [./branch.cc]
-!20 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./branch.cc]
-!21 = metadata !{metadata !"0x100\00u\0011\000", metadata !22, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [u] [line 11]
-!22 = metadata !{metadata !"0xb\0011\000\000", metadata !1, metadata !19} ; [ DW_TAG_lexical_block ] [./branch.cc]
-!23 = metadata !{metadata !"0x100\00x\0012\000", metadata !24, metadata !5, metadata !16} ; [ DW_TAG_auto_variable ] [x] [line 12]
-!24 = metadata !{metadata !"0xb\0011\000\000", metadata !1, metadata !22} ; [ DW_TAG_lexical_block ] [./branch.cc]
-!25 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!26 = metadata !{metadata !"clang version 3.4 (trunk 192896) (llvm/trunk 192895)"}
-!27 = metadata !{i32 4, i32 0, metadata !4, null}
-!28 = metadata !{i32 5, i32 0, metadata !29, null}
-!29 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./branch.cc]
-!30 = metadata !{i32 8, i32 0, metadata !4, null}
-!31 = metadata !{metadata !32, metadata !32, i64 0}
-!32 = metadata !{metadata !"any pointer", metadata !33, i64 0}
-!33 = metadata !{metadata !"omnipotent char", metadata !34, i64 0}
-!34 = metadata !{metadata !"Simple C/C++ TBAA"}
-!35 = metadata !{i32 9, i32 0, metadata !20, null}
-!36 = metadata !{i32 13, i32 0, metadata !24, null}
-!37 = metadata !{i32 14, i32 0, metadata !24, null}
-!38 = metadata !{i32 11, i32 0, metadata !22, null}
-!39 = metadata !{i32 20, i32 0, metadata !4, null}
-!40 = metadata !{i32 21, i32 0, metadata !4, null}
-!41 = metadata !{i32 22, i32 0, metadata !4, null}
-!42 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\004\00clang version 3.4 (trunk 192896) (llvm/trunk 192895)\001\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [./branch.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"branch.cc", !"."}
+!2 = !{i32 0}
+!3 = !{!4}
+!4 = !{!"0x2e\00main\00main\00\004\000\001\000\006\00256\001\004", !1, !5, !6, null, i32 (i32, i8**)* @main, null, null, !12} ; [ DW_TAG_subprogram ] [line 4] [def] [main]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [./branch.cc]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!8, !8, !9}
+!8 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = !{!"0xf\00\000\0064\0064\000\000", null, null, !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!10 = !{!"0xf\00\000\0064\0064\000\000", null, null, !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!11 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!12 = !{!13, !14, !15, !17, !18, !21, !23}
+!13 = !{!"0x101\00argc\0016777220\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [argc] [line 4]
+!14 = !{!"0x101\00argv\0033554436\000", !4, !5, !9} ; [ DW_TAG_arg_variable ] [argv] [line 4]
+!15 = !{!"0x100\00result\007\000", !4, !5, !16} ; [ DW_TAG_auto_variable ] [result] [line 7]
+!16 = !{!"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!17 = !{!"0x100\00limit\008\000", !4, !5, !8} ; [ DW_TAG_auto_variable ] [limit] [line 8]
+!18 = !{!"0x100\00s\0010\000", !19, !5, !16} ; [ DW_TAG_auto_variable ] [s] [line 10]
+!19 = !{!"0xb\009\000\000", !1, !20} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!20 = !{!"0xb\009\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!21 = !{!"0x100\00u\0011\000", !22, !5, !8} ; [ DW_TAG_auto_variable ] [u] [line 11]
+!22 = !{!"0xb\0011\000\000", !1, !19} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!23 = !{!"0x100\00x\0012\000", !24, !5, !16} ; [ DW_TAG_auto_variable ] [x] [line 12]
+!24 = !{!"0xb\0011\000\000", !1, !22} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!25 = !{i32 2, !"Dwarf Version", i32 4}
+!26 = !{!"clang version 3.4 (trunk 192896) (llvm/trunk 192895)"}
+!27 = !MDLocation(line: 4, scope: !4)
+!28 = !MDLocation(line: 5, scope: !29)
+!29 = !{!"0xb\005\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!30 = !MDLocation(line: 8, scope: !4)
+!31 = !{!32, !32, i64 0}
+!32 = !{!"any pointer", !33, i64 0}
+!33 = !{!"omnipotent char", !34, i64 0}
+!34 = !{!"Simple C/C++ TBAA"}
+!35 = !MDLocation(line: 9, scope: !20)
+!36 = !MDLocation(line: 13, scope: !24)
+!37 = !MDLocation(line: 14, scope: !24)
+!38 = !MDLocation(line: 11, scope: !22)
+!39 = !MDLocation(line: 20, scope: !4)
+!40 = !MDLocation(line: 21, scope: !4)
+!41 = !MDLocation(line: 22, scope: !4)
+!42 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/SampleProfile/calls.ll b/test/Transforms/SampleProfile/calls.ll
index c39472b..d566609 100644
--- a/test/Transforms/SampleProfile/calls.ll
+++ b/test/Transforms/SampleProfile/calls.ll
@@ -92,29 +92,29 @@ declare i32 @printf(i8*, ...) #2
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./calls.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"calls.cc", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !7}
-!4 = metadata !{metadata !"0x2e\00sum\00sum\00\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, i32 (i32, i32)* @_Z3sumii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [sum]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./calls.cc]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.5 "}
-!11 = metadata !{i32 4, i32 0, metadata !4, null}
-!12 = metadata !{i32 8, i32 0, metadata !7, null}
-!13 = metadata !{i32 9, i32 0, metadata !7, null}
-!14 = metadata !{i32 9, i32 0, metadata !15, null}
-!15 = metadata !{metadata !"0xb\001", metadata !1, metadata !7} ; [ DW_TAG_lexical_block ] [./calls.cc]
-!16 = metadata !{i32 10, i32 0, metadata !17, null}
-!17 = metadata !{metadata !"0xb\0010\000\000", metadata !1, metadata !7} ; [ DW_TAG_lexical_block ] [./calls.cc]
-!18 = metadata !{i32 10, i32 0, metadata !19, null}
-!19 = metadata !{metadata !"0xb\001", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [./calls.cc]
-!20 = metadata !{i32 10, i32 0, metadata !21, null}
-!21 = metadata !{metadata !"0xb\002", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [./calls.cc]
-!22 = metadata !{i32 10, i32 0, metadata !23, null}
-!23 = metadata !{metadata !"0xb\003", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [./calls.cc]
-!24 = metadata !{i32 11, i32 0, metadata !7, null}
-!25 = metadata !{i32 12, i32 0, metadata !7, null}
+!0 = !{!"0x11\004\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [./calls.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"calls.cc", !"."}
+!2 = !{}
+!3 = !{!4, !7}
+!4 = !{!"0x2e\00sum\00sum\00\003\000\001\000\006\00256\000\003", !1, !5, !6, null, i32 (i32, i32)* @_Z3sumii, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [sum]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [./calls.cc]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.5 "}
+!11 = !MDLocation(line: 4, scope: !4)
+!12 = !MDLocation(line: 8, scope: !7)
+!13 = !MDLocation(line: 9, scope: !7)
+!14 = !MDLocation(line: 9, scope: !15)
+!15 = !{!"0xb\001", !1, !7} ; [ DW_TAG_lexical_block ] [./calls.cc]
+!16 = !MDLocation(line: 10, scope: !17)
+!17 = !{!"0xb\0010\000\000", !1, !7} ; [ DW_TAG_lexical_block ] [./calls.cc]
+!18 = !MDLocation(line: 10, scope: !19)
+!19 = !{!"0xb\001", !1, !17} ; [ DW_TAG_lexical_block ] [./calls.cc]
+!20 = !MDLocation(line: 10, scope: !21)
+!21 = !{!"0xb\002", !1, !17} ; [ DW_TAG_lexical_block ] [./calls.cc]
+!22 = !MDLocation(line: 10, scope: !23)
+!23 = !{!"0xb\003", !1, !17} ; [ DW_TAG_lexical_block ] [./calls.cc]
+!24 = !MDLocation(line: 11, scope: !7)
+!25 = !MDLocation(line: 12, scope: !7)
diff --git a/test/Transforms/SampleProfile/discriminator.ll b/test/Transforms/SampleProfile/discriminator.ll
index 73c73d1..cafc69d 100644
--- a/test/Transforms/SampleProfile/discriminator.ll
+++ b/test/Transforms/SampleProfile/discriminator.ll
@@ -66,25 +66,25 @@ while.end:                                        ; preds = %while.cond
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [discriminator.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"discriminator.c", metadata !"."}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [discriminator.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!9 = metadata !{metadata !"clang version 3.5 "}
-!10 = metadata !{i32 2, i32 0, metadata !4, null}
-!11 = metadata !{i32 3, i32 0, metadata !4, null}
-!12 = metadata !{i32 3, i32 0, metadata !13, null}
-!13 = metadata !{metadata !"0xb\001", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [discriminator.c]
-!14 = metadata !{i32 4, i32 0, metadata !15, null}
-!15 = metadata !{metadata !"0xb\004\000\001", metadata !1, metadata !16} ; [ DW_TAG_lexical_block ] [discriminator.c]
-!16 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [discriminator.c]
-!17 = metadata !{i32 4, i32 0, metadata !18, null}
-!18 = metadata !{metadata !"0xb\001", metadata !1, metadata !15} ; [ DW_TAG_lexical_block ] [discriminator.c]
-!19 = metadata !{i32 5, i32 0, metadata !16, null}
-!20 = metadata !{i32 6, i32 0, metadata !16, null}
-!21 = metadata !{i32 7, i32 0, metadata !4, null}
+!0 = !{!"0x11\0012\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [discriminator.c] [DW_LANG_C99]
+!1 = !{!"discriminator.c", !"."}
+!2 = !{}
+!3 = !{!4}
+!4 = !{!"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", !1, !5, !6, null, i32 (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [discriminator.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 2}
+!9 = !{!"clang version 3.5 "}
+!10 = !MDLocation(line: 2, scope: !4)
+!11 = !MDLocation(line: 3, scope: !4)
+!12 = !MDLocation(line: 3, scope: !13)
+!13 = !{!"0xb\001", !1, !4} ; [ DW_TAG_lexical_block ] [discriminator.c]
+!14 = !MDLocation(line: 4, scope: !15)
+!15 = !{!"0xb\004\000\001", !1, !16} ; [ DW_TAG_lexical_block ] [discriminator.c]
+!16 = !{!"0xb\003\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [discriminator.c]
+!17 = !MDLocation(line: 4, scope: !18)
+!18 = !{!"0xb\001", !1, !15} ; [ DW_TAG_lexical_block ] [discriminator.c]
+!19 = !MDLocation(line: 5, scope: !16)
+!20 = !MDLocation(line: 6, scope: !16)
+!21 = !MDLocation(line: 7, scope: !4)
diff --git a/test/Transforms/SampleProfile/fnptr.ll b/test/Transforms/SampleProfile/fnptr.ll
index f78123c..096033b 100644
--- a/test/Transforms/SampleProfile/fnptr.ll
+++ b/test/Transforms/SampleProfile/fnptr.ll
@@ -127,29 +127,29 @@ declare i32 @printf(i8* nocapture readonly, ...) #1
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
 
-!0 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
-!1 = metadata !{metadata !"clang version 3.6.0 "}
-!2 = metadata !{i32 9, i32 3, metadata !3, null}
-!3 = metadata !{metadata !"0x2e\00foo\00foo\00\008\000\001\000\000\00256\001\008", metadata !4, metadata !5, metadata !6, null, double (i32)* @_Z3fooi, null, null, metadata !7} ; [ DW_TAG_subprogram ] [line 8] [def] [foo]
-!4 = metadata !{metadata !"fnptr.cc", metadata !"."}
-!5 = metadata !{metadata !"0x29", metadata !4}    ; [ DW_TAG_file_type ] [./fnptr.cc]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{}
-!8 = metadata !{i32 9, i32 14, metadata !3, null}
-!9 = metadata !{i32 13, i32 3, metadata !10, null}
-!10 = metadata !{metadata !"0x2e\00bar\00bar\00\0012\000\001\000\000\00256\001\0012", metadata !4, metadata !5, metadata !6, null, double (i32)* @_Z3bari, null, null, metadata !7} ; [ DW_TAG_subprogram ] [line 12] [def] [bar]
-!11 = metadata !{i32 13, i32 14, metadata !10, null}
-!12 = metadata !{i32 19, i32 3, metadata !13, null}
-!13 = metadata !{metadata !"0x2e\00main\00main\00\0016\000\001\000\000\00256\001\0016", metadata !4, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !7} ; [ DW_TAG_subprogram ] [line 16] [def] [main]
-!14 = metadata !{i32 20, i32 5, metadata !13, null}
-!15 = metadata !{i32 21, i32 15, metadata !13, null}
-!16 = metadata !{i32 22, i32 11, metadata !13, null}
-!17 = metadata !{metadata !"branch_weights", i32 534, i32 2064}
-!18 = metadata !{i32 23, i32 14, metadata !13, null}
-!19 = metadata !{i32 25, i32 14, metadata !13, null}
-!20 = metadata !{i32 20, i32 28, metadata !13, null}
-!21 = metadata !{metadata !"branch_weights", i32 0, i32 1075}
-!22 = metadata !{i32 19, i32 26, metadata !13, null}
-!23 = metadata !{metadata !"branch_weights", i32 0, i32 534}
-!24 = metadata !{i32 27, i32 3, metadata !13, null}
-!25 = metadata !{i32 28, i32 3, metadata !13, null}
+!0 = !{i32 2, !"Debug Info Version", i32 2}
+!1 = !{!"clang version 3.6.0 "}
+!2 = !MDLocation(line: 9, column: 3, scope: !3)
+!3 = !{!"0x2e\00foo\00foo\00\008\000\001\000\000\00256\001\008", !4, !5, !6, null, double (i32)* @_Z3fooi, null, null, !7} ; [ DW_TAG_subprogram ] [line 8] [def] [foo]
+!4 = !{!"fnptr.cc", !"."}
+!5 = !{!"0x29", !4}    ; [ DW_TAG_file_type ] [./fnptr.cc]
+!6 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{}
+!8 = !MDLocation(line: 9, column: 14, scope: !3)
+!9 = !MDLocation(line: 13, column: 3, scope: !10)
+!10 = !{!"0x2e\00bar\00bar\00\0012\000\001\000\000\00256\001\0012", !4, !5, !6, null, double (i32)* @_Z3bari, null, null, !7} ; [ DW_TAG_subprogram ] [line 12] [def] [bar]
+!11 = !MDLocation(line: 13, column: 14, scope: !10)
+!12 = !MDLocation(line: 19, column: 3, scope: !13)
+!13 = !{!"0x2e\00main\00main\00\0016\000\001\000\000\00256\001\0016", !4, !5, !6, null, i32 ()* @main, null, null, !7} ; [ DW_TAG_subprogram ] [line 16] [def] [main]
+!14 = !MDLocation(line: 20, column: 5, scope: !13)
+!15 = !MDLocation(line: 21, column: 15, scope: !13)
+!16 = !MDLocation(line: 22, column: 11, scope: !13)
+!17 = !{!"branch_weights", i32 534, i32 2064}
+!18 = !MDLocation(line: 23, column: 14, scope: !13)
+!19 = !MDLocation(line: 25, column: 14, scope: !13)
+!20 = !MDLocation(line: 20, column: 28, scope: !13)
+!21 = !{!"branch_weights", i32 0, i32 1075}
+!22 = !MDLocation(line: 19, column: 26, scope: !13)
+!23 = !{!"branch_weights", i32 0, i32 534}
+!24 = !MDLocation(line: 27, column: 3, scope: !13)
+!25 = !MDLocation(line: 28, column: 3, scope: !13)
diff --git a/test/Transforms/SampleProfile/propagate.ll b/test/Transforms/SampleProfile/propagate.ll
index 9ee8ec5..594645f 100644
--- a/test/Transforms/SampleProfile/propagate.ll
+++ b/test/Transforms/SampleProfile/propagate.ll
@@ -198,46 +198,46 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [propagate.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"propagate.cc", metadata !"."}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4, metadata !7}
-!4 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, i64 (i32, i32, i64)* @_Z3fooiil, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [propagate.cc]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !"0x2e\00main\00main\00\0024\000\001\000\006\00256\000\0024", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 24] [def] [main]
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
-!10 = metadata !{metadata !"clang version 3.5 "}
-!11 = metadata !{i32 4, i32 0, metadata !12, null}
-!12 = metadata !{metadata !"0xb\004\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!13 = metadata !{i32 5, i32 0, metadata !14, null}
-!14 = metadata !{metadata !"0xb\004\000\000", metadata !1, metadata !12} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!15 = metadata !{i32 7, i32 0, metadata !16, null}
-!16 = metadata !{metadata !"0xb\007\000\000", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!17 = metadata !{metadata !"0xb\006\000\000", metadata !1, metadata !12} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!18 = metadata !{i32 8, i32 0, metadata !19, null}
-!19 = metadata !{metadata !"0xb\008\000\000", metadata !1, metadata !20} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!20 = metadata !{metadata !"0xb\007\000\000", metadata !1, metadata !16} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!21 = metadata !{i32 9, i32 0, metadata !19, null}
-!22 = metadata !{i32 10, i32 0, metadata !23, null}
-!23 = metadata !{metadata !"0xb\0010\000\000", metadata !1, metadata !20} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!24 = metadata !{i32 11, i32 0, metadata !25, null}
-!25 = metadata !{metadata !"0xb\0010\000\000", metadata !1, metadata !23} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!26 = metadata !{i32 12, i32 0, metadata !25, null}
-!27 = metadata !{i32 13, i32 0, metadata !25, null}
-!28 = metadata !{i32 14, i32 0, metadata !29, null}
-!29 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !30} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!30 = metadata !{metadata !"0xb\0013\000\000", metadata !1, metadata !23} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!31 = metadata !{i32 15, i32 0, metadata !32, null}
-!32 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !29} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!33 = metadata !{i32 16, i32 0, metadata !32, null}
-!34 = metadata !{i32 17, i32 0, metadata !32, null}
-!35 = metadata !{i32 19, i32 0, metadata !20, null}
-!36 = metadata !{i32 21, i32 0, metadata !4, null}
-!37 = metadata !{i32 22, i32 0, metadata !4, null}
-!38 = metadata !{i32 25, i32 0, metadata !7, null}
-!39 = metadata !{i32 26, i32 0, metadata !7, null}
-!40 = metadata !{i32 27, i32 0, metadata !7, null}
-!41 = metadata !{i32 28, i32 0, metadata !7, null}
-!42 = metadata !{i32 29, i32 0, metadata !7, null}
+!0 = !{!"0x11\004\00clang version 3.5 \000\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [propagate.cc] [DW_LANG_C_plus_plus]
+!1 = !{!"propagate.cc", !"."}
+!2 = !{i32 0}
+!3 = !{!4, !7}
+!4 = !{!"0x2e\00foo\00foo\00\003\000\001\000\006\00256\000\003", !1, !5, !6, null, i64 (i32, i32, i64)* @_Z3fooiil, null, null, !2} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [propagate.cc]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{!"0x2e\00main\00main\00\0024\000\001\000\006\00256\000\0024", !1, !5, !6, null, i32 ()* @main, null, null, !2} ; [ DW_TAG_subprogram ] [line 24] [def] [main]
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 2}
+!10 = !{!"clang version 3.5 "}
+!11 = !MDLocation(line: 4, scope: !12)
+!12 = !{!"0xb\004\000\000", !1, !4} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!13 = !MDLocation(line: 5, scope: !14)
+!14 = !{!"0xb\004\000\000", !1, !12} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!15 = !MDLocation(line: 7, scope: !16)
+!16 = !{!"0xb\007\000\000", !1, !17} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!17 = !{!"0xb\006\000\000", !1, !12} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!18 = !MDLocation(line: 8, scope: !19)
+!19 = !{!"0xb\008\000\000", !1, !20} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!20 = !{!"0xb\007\000\000", !1, !16} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!21 = !MDLocation(line: 9, scope: !19)
+!22 = !MDLocation(line: 10, scope: !23)
+!23 = !{!"0xb\0010\000\000", !1, !20} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!24 = !MDLocation(line: 11, scope: !25)
+!25 = !{!"0xb\0010\000\000", !1, !23} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!26 = !MDLocation(line: 12, scope: !25)
+!27 = !MDLocation(line: 13, scope: !25)
+!28 = !MDLocation(line: 14, scope: !29)
+!29 = !{!"0xb\0014\000\000", !1, !30} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!30 = !{!"0xb\0013\000\000", !1, !23} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!31 = !MDLocation(line: 15, scope: !32)
+!32 = !{!"0xb\0014\000\000", !1, !29} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!33 = !MDLocation(line: 16, scope: !32)
+!34 = !MDLocation(line: 17, scope: !32)
+!35 = !MDLocation(line: 19, scope: !20)
+!36 = !MDLocation(line: 21, scope: !4)
+!37 = !MDLocation(line: 22, scope: !4)
+!38 = !MDLocation(line: 25, scope: !7)
+!39 = !MDLocation(line: 26, scope: !7)
+!40 = !MDLocation(line: 27, scope: !7)
+!41 = !MDLocation(line: 28, scope: !7)
+!42 = !MDLocation(line: 29, scope: !7)
diff --git a/test/Transforms/ScalarRepl/debuginfo-preserved.ll b/test/Transforms/ScalarRepl/debuginfo-preserved.ll
index eb660d2..b0c459e 100644
--- a/test/Transforms/ScalarRepl/debuginfo-preserved.ll
+++ b/test/Transforms/ScalarRepl/debuginfo-preserved.ll
@@ -17,10 +17,10 @@ entry:
   %b.addr = alloca i32, align 4
   %c = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !6, metadata !{}), !dbg !7
+  call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !6, metadata !{}), !dbg !7
   store i32 %b, i32* %b.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !8, metadata !{}), !dbg !9
-  call void @llvm.dbg.declare(metadata !{i32* %c}, metadata !10, metadata !{}), !dbg !12
+  call void @llvm.dbg.declare(metadata i32* %b.addr, metadata !8, metadata !{}), !dbg !9
+  call void @llvm.dbg.declare(metadata i32* %c, metadata !10, metadata !{}), !dbg !12
   %tmp = load i32* %a.addr, align 4, !dbg !13
   store i32 %tmp, i32* %c, align 4, !dbg !13
   %tmp1 = load i32* %a.addr, align 4, !dbg !14
@@ -42,24 +42,24 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 131941)\000\00\000\00\000", metadata !18, metadata !19, metadata !19, metadata !17, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\000\001", metadata !18, metadata !2, metadata !3, null, i32 (i32, i32)* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 1, i32 11, metadata !1, null}
-!8 = metadata !{metadata !"0x101\00b\0033554433\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 1, i32 18, metadata !1, null}
-!10 = metadata !{metadata !"0x100\00c\002\000", metadata !11, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{metadata !"0xb\001\0021\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 2, i32 9, metadata !11, null}
-!13 = metadata !{i32 2, i32 14, metadata !11, null}
-!14 = metadata !{i32 3, i32 5, metadata !11, null}
-!15 = metadata !{i32 4, i32 5, metadata !11, null}
-!16 = metadata !{i32 5, i32 5, metadata !11, null}
-!17 = metadata !{metadata !1}
-!18 = metadata !{metadata !"/d/j/debug-test.c", metadata !"/Volumes/Data/b"}
-!19 = metadata !{i32 0}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.0 (trunk 131941)\000\00\000\00\000", !18, !19, !19, !17, null, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x2e\00f\00f\00\001\000\001\000\006\00256\000\001", !18, !2, !3, null, i32 (i32, i32)* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!2 = !{!"0x29", !18} ; [ DW_TAG_file_type ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !18, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ]
+!6 = !{!"0x101\00a\0016777217\000", !1, !2, !5} ; [ DW_TAG_arg_variable ]
+!7 = !MDLocation(line: 1, column: 11, scope: !1)
+!8 = !{!"0x101\00b\0033554433\000", !1, !2, !5} ; [ DW_TAG_arg_variable ]
+!9 = !MDLocation(line: 1, column: 18, scope: !1)
+!10 = !{!"0x100\00c\002\000", !11, !2, !5} ; [ DW_TAG_auto_variable ]
+!11 = !{!"0xb\001\0021\000", !18, !1} ; [ DW_TAG_lexical_block ]
+!12 = !MDLocation(line: 2, column: 9, scope: !11)
+!13 = !MDLocation(line: 2, column: 14, scope: !11)
+!14 = !MDLocation(line: 3, column: 5, scope: !11)
+!15 = !MDLocation(line: 4, column: 5, scope: !11)
+!16 = !MDLocation(line: 5, column: 5, scope: !11)
+!17 = !{!1}
+!18 = !{!"/d/j/debug-test.c", !"/Volumes/Data/b"}
+!19 = !{i32 0}
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/Scalarizer/basic.ll b/test/Transforms/Scalarizer/basic.ll
index 1cfc0dd..a94cf9f 100644
--- a/test/Transforms/Scalarizer/basic.ll
+++ b/test/Transforms/Scalarizer/basic.ll
@@ -443,9 +443,9 @@ exit:
   ret <4 x float> %next_acc
 }
 
-!0 = metadata !{ metadata !"root" }
-!1 = metadata !{ metadata !"set1", metadata !0 }
-!2 = metadata !{ metadata !"set2", metadata !0 }
-!3 = metadata !{ metadata !3 }
-!4 = metadata !{ float 4.0 }
-!5 = metadata !{ i64 0, i64 8, null }
+!0 = !{ !"root" }
+!1 = !{ !"set1", !0 }
+!2 = !{ !"set2", !0 }
+!3 = !{ !3 }
+!4 = !{ float 4.0 }
+!5 = !{ i64 0, i64 8, null }
diff --git a/test/Transforms/Scalarizer/dbginfo.ll b/test/Transforms/Scalarizer/dbginfo.ll
index ee7182b..ed65aaa 100644
--- a/test/Transforms/Scalarizer/dbginfo.ll
+++ b/test/Transforms/Scalarizer/dbginfo.ll
@@ -16,9 +16,9 @@ define void @f1(<4 x i32>* nocapture %a, <4 x i32>* nocapture readonly %b, <4 x
 ; CHECK: %b.i1 = getelementptr i32* %b.i0, i32 1
 ; CHECK: %b.i2 = getelementptr i32* %b.i0, i32 2
 ; CHECK: %b.i3 = getelementptr i32* %b.i0, i32 3
-; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %a}, i64 0, metadata !{{[0-9]+}}, metadata {{.*}}), !dbg !{{[0-9]+}}
-; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %b}, i64 0, metadata !{{[0-9]+}}, metadata {{.*}}), !dbg !{{[0-9]+}}
-; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %c}, i64 0, metadata !{{[0-9]+}}, metadata {{.*}}), !dbg !{{[0-9]+}}
+; CHECK: tail call void @llvm.dbg.value(metadata <4 x i32>* %a, i64 0, metadata !{{[0-9]+}}, metadata {{.*}}), !dbg !{{[0-9]+}}
+; CHECK: tail call void @llvm.dbg.value(metadata <4 x i32>* %b, i64 0, metadata !{{[0-9]+}}, metadata {{.*}}), !dbg !{{[0-9]+}}
+; CHECK: tail call void @llvm.dbg.value(metadata <4 x i32>* %c, i64 0, metadata !{{[0-9]+}}, metadata {{.*}}), !dbg !{{[0-9]+}}
 ; CHECK: %bval.i0 = load i32* %b.i0, align 16, !dbg ![[TAG1:[0-9]+]], !tbaa ![[TAG2:[0-9]+]]
 ; CHECK: %bval.i1 = load i32* %b.i1, align 4, !dbg ![[TAG1]], !tbaa ![[TAG2]]
 ; CHECK: %bval.i2 = load i32* %b.i2, align 8, !dbg ![[TAG1]], !tbaa ![[TAG2]]
@@ -37,9 +37,9 @@ define void @f1(<4 x i32>* nocapture %a, <4 x i32>* nocapture readonly %b, <4 x
 ; CHECK: store i32 %add.i3, i32* %a.i3, align 4, !dbg ![[TAG1]], !tbaa ![[TAG2]]
 ; CHECK: ret void
 entry:
-  tail call void @llvm.dbg.value(metadata !{<4 x i32>* %a}, i64 0, metadata !15, metadata !{}), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{<4 x i32>* %b}, i64 0, metadata !16, metadata !{}), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{<4 x i32>* %c}, i64 0, metadata !17, metadata !{}), !dbg !20
+  tail call void @llvm.dbg.value(metadata <4 x i32>* %a, i64 0, metadata !15, metadata !{}), !dbg !20
+  tail call void @llvm.dbg.value(metadata <4 x i32>* %b, i64 0, metadata !16, metadata !{}), !dbg !20
+  tail call void @llvm.dbg.value(metadata <4 x i32>* %c, i64 0, metadata !17, metadata !{}), !dbg !20
   %bval = load <4 x i32>* %b, align 16, !dbg !21, !tbaa !22
   %cval = load <4 x i32>* %c, align 16, !dbg !21, !tbaa !22
   %add = add <4 x i32> %bval, %cval, !dbg !21
@@ -57,30 +57,30 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!18, !26}
 !llvm.ident = !{!19}
 
-!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 194134) (llvm/trunk 194126)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/home/richards/llvm/build//tmp/add.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"/tmp/add.c", metadata !"/home/richards/llvm/build"}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !"0x2e\00f1\00f1\00\003\000\001\000\006\00256\001\004", metadata !1, metadata !5, metadata !6, null, void (<4 x i32>*, <4 x i32>*, <4 x i32>*)* @f1, null, null, metadata !14} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [f]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/richards/llvm/build//tmp/add.c]
-!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !8, metadata !8, metadata !8}
-!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from V4SI]
-!9 = metadata !{metadata !"0x16\00V4SI\001\000\000\000\000", metadata !1, null, metadata !10} ; [ DW_TAG_typedef ] [V4SI] [line 1, size 0, align 0, offset 0] [from ]
-!10 = metadata !{metadata !"0x1\00\000\00128\00128\000\002048", null, null, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int]
-!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{metadata !13}
-!13 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
-!14 = metadata !{metadata !15, metadata !16, metadata !17}
-!15 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 3]
-!16 = metadata !{metadata !"0x101\00b\0033554435\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [b] [line 3]
-!17 = metadata !{metadata !"0x101\00c\0050331651\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [c] [line 3]
-!18 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!19 = metadata !{metadata !"clang version 3.4 (trunk 194134) (llvm/trunk 194126)"}
-!20 = metadata !{i32 3, i32 0, metadata !4, null}
-!21 = metadata !{i32 5, i32 0, metadata !4, null}
-!22 = metadata !{metadata !23, metadata !23, i64 0}
-!23 = metadata !{metadata !"omnipotent char", metadata !24, i64 0}
-!24 = metadata !{metadata !"Simple C/C++ TBAA"}
-!25 = metadata !{i32 6, i32 0, metadata !4, null}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\0012\00clang version 3.4 (trunk 194134) (llvm/trunk 194126)\001\00\000\00\000", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [/home/richards/llvm/build//tmp/add.c] [DW_LANG_C99]
+!1 = !{!"/tmp/add.c", !"/home/richards/llvm/build"}
+!2 = !{i32 0}
+!3 = !{!4}
+!4 = !{!"0x2e\00f1\00f1\00\003\000\001\000\006\00256\001\004", !1, !5, !6, null, void (<4 x i32>*, <4 x i32>*, <4 x i32>*)* @f1, null, null, !14} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [f]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/home/richards/llvm/build//tmp/add.c]
+!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !8, !8, !8}
+!8 = !{!"0xf\00\000\0064\0064\000\000", null, null, !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from V4SI]
+!9 = !{!"0x16\00V4SI\001\000\000\000\000", !1, null, !10} ; [ DW_TAG_typedef ] [V4SI] [line 1, size 0, align 0, offset 0] [from ]
+!10 = !{!"0x1\00\000\00128\00128\000\002048", null, null, !11, !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int]
+!11 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = !{!13}
+!13 = !{!"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
+!14 = !{!15, !16, !17}
+!15 = !{!"0x101\00a\0016777219\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!16 = !{!"0x101\00b\0033554435\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [b] [line 3]
+!17 = !{!"0x101\00c\0050331651\000", !4, !5, !8} ; [ DW_TAG_arg_variable ] [c] [line 3]
+!18 = !{i32 2, !"Dwarf Version", i32 4}
+!19 = !{!"clang version 3.4 (trunk 194134) (llvm/trunk 194126)"}
+!20 = !MDLocation(line: 3, scope: !4)
+!21 = !MDLocation(line: 5, scope: !4)
+!22 = !{!23, !23, i64 0}
+!23 = !{!"omnipotent char", !24, i64 0}
+!24 = !{!"Simple C/C++ TBAA"}
+!25 = !MDLocation(line: 6, scope: !4)
+!26 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/SimplifyCFG/2007-11-22-InvokeNoUnwind.ll b/test/Transforms/SimplifyCFG/2007-11-22-InvokeNoUnwind.ll
index a90e072..76f41e8 100644
--- a/test/Transforms/SimplifyCFG/2007-11-22-InvokeNoUnwind.ll
+++ b/test/Transforms/SimplifyCFG/2007-11-22-InvokeNoUnwind.ll
@@ -12,5 +12,9 @@ Cont:		; preds = %0
 	ret i32 0
 
 Other:		; preds = %0
+	landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
+		catch i8* null
 	ret i32 1
 }
+
+declare i32 @__gxx_personality_v0(...)
diff --git a/test/Transforms/SimplifyCFG/2008-01-02-hoist-fp-add.ll b/test/Transforms/SimplifyCFG/2008-01-02-hoist-fp-add.ll
index cf29b71..8e15637 100644
--- a/test/Transforms/SimplifyCFG/2008-01-02-hoist-fp-add.ll
+++ b/test/Transforms/SimplifyCFG/2008-01-02-hoist-fp-add.ll
@@ -1,27 +1,27 @@
-; The phi should not be eliminated in this case, because the fp op could trap.
+; The phi should not be eliminated in this case, because the divide op could trap.
 ; RUN: opt < %s -simplifycfg -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin8"
-@G = weak global double 0.000000e+00, align 8		; <double*> [#uses=2]
+@G = weak global i32 0, align 8		; <i32*> [#uses=2]
 
-define void @test(i32 %X, i32 %Y, double %Z) {
+define void @test(i32 %X, i32 %Y, i32 %Z) {
 entry:
 	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
-	%tmp = load double* @G, align 8		; <double> [#uses=2]
+	%tmp = load i32* @G, align 8		; <i32> [#uses=2]
 	%tmp3 = icmp eq i32 %X, %Y		; <i1> [#uses=1]
 	%tmp34 = zext i1 %tmp3 to i8		; <i8> [#uses=1]
 	%toBool = icmp ne i8 %tmp34, 0		; <i1> [#uses=1]
 	br i1 %toBool, label %cond_true, label %cond_next
 
 cond_true:		; preds = %entry
-	%tmp7 = fadd double %tmp, %Z		; <double> [#uses=1]
+	%tmp7 = udiv i32 %tmp, %Z		; <i32> [#uses=1]
 	br label %cond_next
 
 cond_next:		; preds = %cond_true, %entry
-; CHECK: = phi double
-	%F.0 = phi double [ %tmp, %entry ], [ %tmp7, %cond_true ]		; <double> [#uses=1]
-	store double %F.0, double* @G, align 8
+; CHECK: = phi i32
+	%F.0 = phi i32 [ %tmp, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
+	store i32 %F.0, i32* @G, align 8
 	ret void
 }
 
diff --git a/test/Transforms/SimplifyCFG/AArch64/lit.local.cfg b/test/Transforms/SimplifyCFG/AArch64/lit.local.cfg
new file mode 100644
index 0000000..6642d28
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/AArch64/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll']
+
+targets = set(config.root.targets_to_build.split())
+if not 'AArch64' in targets:
+    config.unsupported = True
diff --git a/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll b/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll
new file mode 100644
index 0000000..076cb58
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll
@@ -0,0 +1,72 @@
+; RUN: opt < %s -mtriple=aarch64-linux-gnu -simplifycfg -enable-unsafe-fp-math -S >%t
+; RUN: FileCheck %s < %t
+; ModuleID = 't.cc'
+
+; Function Attrs: nounwind
+define double @_Z3fooRdS_S_S_(double* dereferenceable(8) %x, double* dereferenceable(8) %y, double* dereferenceable(8) %a) #0 {
+entry:
+  %0 = load double* %y, align 8
+  %cmp = fcmp oeq double %0, 0.000000e+00
+  %1 = load double* %x, align 8
+  br i1 %cmp, label %if.then, label %if.else
+
+; fadd (const, (fmul x, y))
+if.then:                                          ; preds = %entry
+; CHECK-LABEL: if.then:
+; CHECK:   %3 = fmul fast double %1, %2
+; CHECK-NEXT:   %mul = fadd fast double 1.000000e+00, %3
+  %2 = load double* %a, align 8
+  %3 = fmul fast double %1, %2
+  %mul = fadd fast double 1.000000e+00, %3
+  store double %mul, double* %y, align 8
+  br label %if.end
+
+; fsub ((fmul x, y), z)
+if.else:                                          ; preds = %entry
+; CHECK-LABEL: if.else:
+; CHECK:   %mul1 = fmul fast double %1, %2
+; CHECK-NEXT:   %sub1 = fsub fast double %mul1, %0
+  %4 = load double* %a, align 8
+  %mul1 = fmul fast double %1, %4
+  %sub1 = fsub fast double %mul1, %0
+  store double %sub1, double* %y, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %5 = load double* %y, align 8
+  %cmp2 = fcmp oeq double %5, 2.000000e+00
+  %6 = load double* %x, align 8
+  br i1 %cmp2, label %if.then2, label %if.else2
+
+; fsub (x, (fmul y, z))
+if.then2:                                         ; preds = %entry
+; CHECK-LABEL: if.then2:
+; CHECK:   %7 = fmul fast double %5, 3.000000e+00
+; CHECK-NEXT:   %mul2 = fsub fast double %6, %7
+  %7 = load double* %a, align 8
+  %8 = fmul fast double %6, 3.0000000e+00
+  %mul2 = fsub fast double %7, %8
+  store double %mul2, double* %y, align 8
+  br label %if.end2
+
+; fsub (fneg((fmul x, y)), const)
+if.else2:                                         ; preds = %entry
+; CHECK-LABEL: if.else2:
+; CHECK:   %mul3 = fmul fast double %5, 3.000000e+00
+; CHECK-NEXT:   %neg = fsub fast double 0.000000e+00, %mul3
+; CHECK-NEXT:   %sub2 = fsub fast double %neg, 3.000000e+00
+  %mul3 = fmul fast double %6, 3.0000000e+00
+  %neg = fsub fast double 0.0000000e+00, %mul3
+  %sub2 = fsub fast double %neg, 3.0000000e+00
+  store double %sub2, double* %y, align 8
+  br label %if.end2
+
+if.end2:                                           ; preds = %if.else, %if.then
+  %9 = load double* %x, align 8
+  %10 = load double* %y, align 8
+  %add = fadd fast double %9, %10
+  %11 = load double* %a, align 8
+  %add2 = fadd fast double %add, %11
+  ret double %add2
+}
+
diff --git a/test/Transforms/SimplifyCFG/PhiBlockMerge.ll b/test/Transforms/SimplifyCFG/PhiBlockMerge.ll
index 36b52f5..5550829 100644
--- a/test/Transforms/SimplifyCFG/PhiBlockMerge.ll
+++ b/test/Transforms/SimplifyCFG/PhiBlockMerge.ll
@@ -4,9 +4,7 @@
 ;
 
 define i32 @test(i1 %a, i1 %b) {
-; CHECK: br i1 %a
         br i1 %a, label %M, label %O
-; CHECK: O:
 O:              ; preds = %0
 ; CHECK: select i1 %b, i32 0, i32 1
 ; CHECK-NOT: phi
@@ -18,9 +16,9 @@ N:              ; preds = %Q, %O
         %Wp = phi i32 [ 0, %O ], [ 1, %Q ]              ; <i32> [#uses=1]
         br label %M
 M:              ; preds = %N, %0
-; CHECK: %W = phi i32
         %W = phi i32 [ %Wp, %N ], [ 2, %0 ]             ; <i32> [#uses=1]
         %R = add i32 %W, 1              ; <i32> [#uses=1]
         ret i32 %R
+; CHECK: ret
 }
 
diff --git a/test/Transforms/SimplifyCFG/PowerPC/cttz-ctlz-spec.ll b/test/Transforms/SimplifyCFG/PowerPC/cttz-ctlz-spec.ll
new file mode 100644
index 0000000..fa74549
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/PowerPC/cttz-ctlz-spec.ll
@@ -0,0 +1,45 @@
+; RUN: opt -S -simplifycfg < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i64 @test1(i64 %A) {
+; CHECK-LABEL: @test1(
+; CHECK: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i64 %A, 0
+; CHECK-NEXT: [[CTLZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+; CHECK-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i64 64, i64 [[CTLZ]]
+; CHECK-NEXT: ret i64 [[SEL]]
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
+  ret i64 %cond
+}
+
+define i64 @test1b(i64 %A) {
+; CHECK-LABEL: @test1b(
+; CHECK: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i64 %A, 0
+; CHECK-NEXT: [[CTTZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+; CHECK-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i64 64, i64 [[CTLZ]]
+; CHECK-NEXT: ret i64 [[SEL]]
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
+  ret i64 %cond
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+
diff --git a/test/Transforms/SimplifyCFG/PowerPC/lit.local.cfg b/test/Transforms/SimplifyCFG/PowerPC/lit.local.cfg
new file mode 100644
index 0000000..0913324
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/PowerPC/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'PowerPC' in config.root.targets:
+    config.unsupported = True
diff --git a/test/Transforms/SimplifyCFG/R600/cttz-ctlz.ll b/test/Transforms/SimplifyCFG/R600/cttz-ctlz.ll
new file mode 100644
index 0000000..5b27994
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/R600/cttz-ctlz.ll
@@ -0,0 +1,249 @@
+; RUN: opt -S -simplifycfg -mtriple=r600-unknown-unknown -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=ALL %s
+; RUN: opt -S -simplifycfg -mtriple=r600-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=ALL %s
+
+
+define i64 @test1(i64 %A) {
+; ALL-LABEL: @test1(
+; SI: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i64 %A, 0
+; SI-NEXT: [[CTLZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+; SI-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i64 64, i64 [[CTLZ]]
+; SI-NEXT: ret i64 [[SEL]]
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
+  ret i64 %cond
+}
+
+
+define i32 @test2(i32 %A) {
+; ALL-LABEL: @test2(
+; SI: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i32 %A, 0
+; SI-NEXT: [[CTLZ:%[A-Za-z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+; SI-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i32 32, i32 [[CTLZ]]
+; SI-NEXT: ret i32 [[SEL]]
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3(i16 signext %A) {
+; ALL-LABEL: @test3(
+; SI: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i16 %A, 0
+; SI-NEXT: [[CTLZ:%[A-Za-z0-9]+]] = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+; SI-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i16 16, i16 [[CTLZ]]
+; SI-NEXT: ret i16 [[SEL]]
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
+  ret i16 %cond
+}
+
+
+define i64 @test1b(i64 %A) {
+; ALL-LABEL: @test1b(
+; SI: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i64 %A, 0
+; SI-NEXT: [[CTTZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+; SI-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i64 64, i64 [[CTTZ]]
+; SI-NEXT: ret i64 [[SEL]]
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
+  ret i64 %cond
+}
+
+
+define i32 @test2b(i32 %A) {
+; ALL-LABEL: @test2b(
+; SI: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i32 %A, 0
+; SI-NEXT: [[CTTZ:%[A-Za-z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
+; SI-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i32 32, i32 [[CTTZ]]
+; SI-NEXT: ret i32 [[SEL]]
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3b(i16 signext %A) {
+; ALL-LABEL: @test3b(
+; SI: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i16 %A, 0
+; SI-NEXT: [[CTTZ:%[A-Za-z0-9]+]] = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
+; SI-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i16 16, i16 [[CTTZ]]
+; SI-NEXT: ret i16 [[SEL]]
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
+  ret i16 %cond
+}
+
+
+define i64 @test1c(i64 %A) {
+; ALL-LABEL: @test1c(
+; ALL: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i64 %A, 0
+; ALL-NEXT: [[CTLZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+; ALL-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i64 63, i64 [[CTLZ]]
+; ALL-NEXT: ret i64 [[SEL]]
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ]
+  ret i64 %cond
+}
+
+define i32 @test2c(i32 %A) {
+; ALL-LABEL: @test2c(
+; ALL: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i32 %A, 0
+; ALL-NEXT: [[CTLZ:%[A-Za-z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+; ALL-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i32 31, i32 [[CTLZ]]
+; ALL-NEXT: ret i32 [[SEL]]
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3c(i16 signext %A) {
+; ALL-LABEL: @test3c(
+; ALL: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i16 %A, 0
+; ALL-NEXT: [[CTLZ:%[A-Za-z0-9]+]] = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+; ALL-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i16 15, i16 [[CTLZ]]
+; ALL-NEXT: ret i16 [[SEL]]
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]
+  ret i16 %cond
+}
+
+
+define i64 @test1d(i64 %A) {
+; ALL-LABEL: @test1d(
+; ALL: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i64 %A, 0
+; ALL-NEXT: [[CTTZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+; ALL-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i64 63, i64 [[CTTZ]]
+; ALL-NEXT: ret i64 [[SEL]]
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ]
+  ret i64 %cond
+}
+
+
+define i32 @test2d(i32 %A) {
+; ALL-LABEL: @test2d(
+; ALL: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i32 %A, 0
+; ALL-NEXT: [[CTTZ:%[A-Za-z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
+; ALL-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i32 31, i32 [[CTTZ]]
+; ALL-NEXT: ret i32 [[SEL]]
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3d(i16 signext %A) {
+; ALL-LABEL: @test3d(
+; ALL: [[ICMP:%[A-Za-z0-9]+]] = icmp eq i16 %A, 0
+; ALL-NEXT: [[CTTZ:%[A-Za-z0-9]+]] = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
+; ALL-NEXT: [[SEL:%[A-Za-z0-9.]+]] = select i1 [[ICMP]], i16 15, i16 [[CTTZ]]
+; ALL-NEXT: ret i16 [[SEL]]
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]
+  ret i16 %cond
+}
+
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
diff --git a/test/Transforms/SimplifyCFG/R600/lit.local.cfg b/test/Transforms/SimplifyCFG/R600/lit.local.cfg
new file mode 100644
index 0000000..ad9ce25
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/R600/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'R600' in config.root.targets:
+    config.unsupported = True
diff --git a/test/Transforms/SimplifyCFG/SpeculativeExec.ll b/test/Transforms/SimplifyCFG/SpeculativeExec.ll
index 83fa419..31de3c8 100644
--- a/test/Transforms/SimplifyCFG/SpeculativeExec.ll
+++ b/test/Transforms/SimplifyCFG/SpeculativeExec.ll
@@ -28,22 +28,6 @@ bb3:		; preds = %bb2, %entry
 	ret i32 %tmp5
 }
 
-declare i8 @llvm.cttz.i8(i8, i1)
-
-define i8 @test2(i8 %a) {
-; CHECK-LABEL: @test2(
-  br i1 undef, label %bb_true, label %bb_false
-bb_true:
-  %b = tail call i8 @llvm.cttz.i8(i8 %a, i1 false)
-  br label %join
-bb_false:
-  br label %join
-join:
-  %c = phi i8 [%b, %bb_true], [%a, %bb_false]
-; CHECK: select
-  ret i8 %c
-}
-
 define i8* @test4(i1* %dummy, i8* %a, i8* %b) {
 ; Test that we don't speculate an arbitrarily large number of unfolded constant
 ; expressions.
diff --git a/test/Transforms/SimplifyCFG/UnreachableEliminate.ll b/test/Transforms/SimplifyCFG/UnreachableEliminate.ll
index 21428c6..22b144b 100644
--- a/test/Transforms/SimplifyCFG/UnreachableEliminate.ll
+++ b/test/Transforms/SimplifyCFG/UnreachableEliminate.ll
@@ -46,32 +46,6 @@ T:
         ret i32 2
 }
 
-; PR9450
-define i32 @test4(i32 %v, i32 %w) {
-; CHECK: entry:
-; CHECK-NEXT:  switch i32 %v, label %T [
-; CHECK-NEXT:    i32 3, label %V
-; CHECK-NEXT:    i32 2, label %U
-; CHECK-NEXT:  ]
-
-entry:
-        br label %SWITCH
-V:
-        ret i32 7
-SWITCH:
-        switch i32 %v, label %default [
-                 i32 1, label %T
-                 i32 2, label %U
-                 i32 3, label %V
-        ]
-default:
-        unreachable
-U:
-        ret i32 %w
-T:
-        ret i32 2
-}
-
 
 ;; We can either convert the following control-flow to a select or remove the
 ;; unreachable control flow because of the undef store of null. Make sure we do
diff --git a/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll b/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll
new file mode 100644
index 0000000..69f6c69
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll
@@ -0,0 +1,330 @@
+; RUN: opt -S -simplifycfg -mtriple=x86_64-unknown-unknown -mattr=+bmi < %s | FileCheck %s --check-prefix=ALL --check-prefix=BMI
+; RUN: opt -S -simplifycfg -mtriple=x86_64-unknown-unknown -mattr=+lzcnt < %s | FileCheck %s --check-prefix=ALL --check-prefix=LZCNT
+; RUN: opt -S -simplifycfg -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=ALL --check-prefix=GENERIC
+
+
+define i64 @test1(i64 %A) {
+; ALL-LABEL: @test1(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i64 %A, 0
+; ALL: [[CTLZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+; LZCNT-NEXT: select i1 [[COND]], i64 64, i64 [[CTLZ]]
+; BMI-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
+  ret i64 %cond
+}
+
+define i32 @test2(i32 %A) {
+; ALL-LABEL: @test2(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i32 %A, 0
+; ALL: [[CTLZ:%[A-Za-z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+; LZCNT-NEXT: select i1 [[COND]], i32 32, i32 [[CTLZ]]
+; BMI-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3(i16 signext %A) {
+; ALL-LABEL: @test3(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i16 %A, 0
+; ALL: [[CTLZ:%[A-Za-z0-9]+]] = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+; LZCNT-NEXT: select i1 [[COND]], i16 16, i16 [[CTLZ]]
+; BMI-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
+  ret i16 %cond
+}
+
+
+define i64 @test1b(i64 %A) {
+; ALL-LABEL: @test1b(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i64 %A, 0
+; ALL: [[CTTZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+; BMI-NEXT: select i1 [[COND]], i64 64, i64 [[CTTZ]]
+; LZCNT-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
+  ret i64 %cond
+}
+
+
+define i32 @test2b(i32 %A) {
+; ALL-LABEL: @test2b(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i32 %A, 0
+; ALL: [[CTTZ:%[A-Za-z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
+; BMI-NEXT: select i1 [[COND]], i32 32, i32 [[CTTZ]]
+; LZCNT-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3b(i16 signext %A) {
+; ALL-LABEL: @test3b(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i16 %A, 0
+; ALL: [[CTTZ:%[A-Za-z0-9]+]] = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
+; BMI-NEXT: select i1 [[COND]], i16 16, i16 [[CTTZ]]
+; LZCNT-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
+  ret i16 %cond
+}
+
+; The following tests verify that calls to cttz/ctlz are speculated even if
+; basic block %cond.true has an extra zero extend/truncate which is "free"
+; for the target.
+
+define i64 @test1e(i32 %x) {
+; ALL-LABEL: @test1e(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i32 %x, 0
+; ALL: [[CTTZ:%[A-Za-z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+; ALL: [[ZEXT:%[A-Za-z0-9]+]] = zext i32 [[CTTZ]] to i64
+; BMI-NEXT: select i1 [[COND]], i64 32, i64 [[ZEXT]]
+; LZCNT-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i32 %x, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %phitmp2 = zext i32 %0 to i64
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %phitmp2, %cond.true ], [ 32, %entry ]
+  ret i64 %cond
+}
+
+define i32 @test2e(i64 %x) {
+; ALL-LABEL: @test2e(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i64 %x, 0
+; ALL: [[CTTZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+; ALL: [[TRUNC:%[A-Za-z0-9]+]] = trunc i64 [[CTTZ]] to i32
+; BMI-NEXT: select i1 [[COND]], i32 64, i32 [[TRUNC]]
+; LZCNT-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i64 %x, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %cast = trunc i64 %0 to i32
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %cast, %cond.true ], [ 64, %entry ]
+  ret i32 %cond
+}
+
+define i64 @test3e(i32 %x) {
+; ALL-LABEL: @test3e(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i32 %x, 0
+; ALL: [[CTLZ:%[A-Za-z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+; ALL: [[ZEXT:%[A-Za-z0-9]+]] = zext i32 [[CTLZ]] to i64
+; LZCNT-NEXT: select i1 [[COND]], i64 32, i64 [[ZEXT]]
+; BMI-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i32 %x, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  %phitmp2 = zext i32 %0 to i64
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %phitmp2, %cond.true ], [ 32, %entry ]
+  ret i64 %cond
+}
+
+define i32 @test4e(i64 %x) {
+; ALL-LABEL: @test4e(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i64 %x, 0
+; ALL: [[CTLZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+; ALL: [[TRUNC:%[A-Za-z0-9]+]] = trunc i64 [[CTLZ]] to i32
+; LZCNT-NEXT: select i1 [[COND]], i32 64, i32 [[TRUNC]]
+; BMI-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i64 %x, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+  %cast = trunc i64 %0 to i32
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %cast, %cond.true ], [ 64, %entry ]
+  ret i32 %cond
+}
+
+define i16 @test5e(i64 %x) {
+; ALL-LABEL: @test5e(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i64 %x, 0
+; ALL: [[CTLZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+; ALL: [[TRUNC:%[A-Za-z0-9]+]] = trunc i64 [[CTLZ]] to i16
+; LZCNT-NEXT: select i1 [[COND]], i16 64, i16 [[TRUNC]]
+; BMI-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i64 %x, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+  %cast = trunc i64 %0 to i16
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %cast, %cond.true ], [ 64, %entry ]
+  ret i16 %cond
+}
+
+define i16 @test6e(i32 %x) {
+; ALL-LABEL: @test6e(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i32 %x, 0
+; ALL: [[CTLZ:%[A-Za-z0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+; ALL: [[TRUNC:%[A-Za-z0-9]+]] = trunc i32 [[CTLZ]] to i16
+; LZCNT-NEXT: select i1 [[COND]], i16 32, i16 [[TRUNC]]
+; BMI-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i32 %x, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  %cast = trunc i32 %0 to i16
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %cast, %cond.true ], [ 32, %entry ]
+  ret i16 %cond
+}
+
+define i16 @test7e(i64 %x) {
+; ALL-LABEL: @test7e(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i64 %x, 0
+; ALL: [[CTTZ:%[A-Za-z0-9]+]] = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+; ALL: [[TRUNC:%[A-Za-z0-9]+]] = trunc i64 [[CTTZ]] to i16
+; BMI-NEXT: select i1 [[COND]], i16 64, i16 [[TRUNC]]
+; LZCNT-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i64 %x, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %cast = trunc i64 %0 to i16
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %cast, %cond.true ], [ 64, %entry ]
+  ret i16 %cond
+}
+
+define i16 @test8e(i32 %x) {
+; ALL-LABEL: @test8e(
+; ALL: [[COND:%[A-Za-z0-9]+]] = icmp eq i32 %x, 0
+; ALL: [[CTTZ:%[A-Za-z0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+; ALL: [[TRUNC:%[A-Za-z0-9]+]] = trunc i32 [[CTTZ]] to i16
+; BMI-NEXT: select i1 [[COND]], i16 32, i16 [[TRUNC]]
+; LZCNT-NOT: select
+; GENERIC-NOT: select
+; ALL: ret
+entry:
+  %tobool = icmp eq i32 %x, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %cast = trunc i32 %0 to i16
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %cast, %cond.true ], [ 32, %entry ]
+  ret i16 %cond
+}
+
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
diff --git a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
index fc22e7e..ea3b575 100644
--- a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
+++ b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -21,8 +21,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; The table for @cprop
 ; CHECK: @switch.table5 = private unnamed_addr constant [7 x i32] [i32 5, i32 42, i32 126, i32 -452, i32 128, i32 6, i32 7]
 
-; The table for @unreachable
-; CHECK: @switch.table6 = private unnamed_addr constant [5 x i32] [i32 0, i32 0, i32 0, i32 1, i32 -1]
+; The table for @unreachable_case
+; CHECK: @switch.table6 = private unnamed_addr constant [9 x i32] [i32 0, i32 0, i32 0, i32 2, i32 -1, i32 1, i32 1, i32 1, i32 1]
 
 ; A simple int-to-int selection switch.
 ; It is dense enough to be replaced by table lookup.
@@ -752,7 +752,7 @@ return:
 ; CHECK: %switch.gep = getelementptr inbounds [7 x i32]* @switch.table5, i32 0, i32 %switch.tableidx
 }
 
-define i32 @unreachable(i32 %x)  {
+define i32 @unreachable_case(i32 %x)  {
 entry:
   switch i32 %x, label %sw.default [
     i32 0, label %sw.bb
@@ -770,19 +770,47 @@ sw.bb: br label %return
 sw.bb1: unreachable
 sw.bb2: br label %return
 sw.bb3: br label %return
-sw.default: unreachable
+sw.default: br label %return
 
 return:
-  %retval.0 = phi i32 [ 1, %sw.bb3 ], [ -1, %sw.bb2 ], [ 0, %sw.bb ]
+  %retval.0 = phi i32 [ 1, %sw.bb3 ], [ -1, %sw.bb2 ], [ 0, %sw.bb ], [ 2, %sw.default ]
   ret i32 %retval.0
 
-; CHECK-LABEL: @unreachable(
+; CHECK-LABEL: @unreachable_case(
 ; CHECK: switch.lookup:
-; CHECK: getelementptr inbounds [5 x i32]* @switch.table6, i32 0, i32 %switch.tableidx
+; CHECK: getelementptr inbounds [9 x i32]* @switch.table6, i32 0, i32 %switch.tableidx
+}
+
+define i32 @unreachable_default(i32 %x)  {
+entry:
+  switch i32 %x, label %default [
+    i32 0, label %bb0
+    i32 1, label %bb1
+    i32 2, label %bb2
+    i32 3, label %bb3
+  ]
+
+bb0: br label %return
+bb1: br label %return
+bb2: br label %return
+bb3: br label %return
+default: unreachable
+
+return:
+  %retval = phi i32 [ 42, %bb0 ], [ 52, %bb1 ], [ 1, %bb2 ], [ 2, %bb3 ]
+  ret i32 %retval
+
+; CHECK-LABEL: @unreachable_default(
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0
+; CHECK-NOT: icmp
+; CHECK-NOT: br 1i
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x i32]* @switch.table7, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load i32* %switch.gep
+; CHECK-NEXT: ret i32 %switch.load
 }
 
 ; Don't create a table with illegal type
-; rdar://12779436
 define i96 @illegaltype(i32 %c) {
 entry:
   switch i32 %c, label %sw.default [
@@ -1078,3 +1106,170 @@ return:
 ; CHECK-NEXT: ret i8 %switch.idx.cast
 }
 
+; Reuse the inverted table range compare.
+define i32 @reuse_cmp1(i32 %x) {
+entry:
+  switch i32 %x, label %sw.default [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+sw.bb: br label %sw.epilog
+sw.bb1: br label %sw.epilog
+sw.bb2: br label %sw.epilog
+sw.bb3: br label %sw.epilog
+sw.default: br label %sw.epilog
+sw.epilog:
+  %r.0 = phi i32 [ 0, %sw.default ], [ 13, %sw.bb3 ], [ 12, %sw.bb2 ], [ 11, %sw.bb1 ], [ 10, %sw.bb ]
+  %cmp = icmp eq i32 %r.0, 0       ; This compare can be "replaced".
+  br i1 %cmp, label %if.then, label %if.end
+if.then: br label %return
+if.end: br label %return
+return:
+  %retval.0 = phi i32 [ 100, %if.then ], [ %r.0, %if.end ]
+  ret i32 %retval.0
+; CHECK-LABEL: @reuse_cmp1(
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0
+; CHECK-NEXT: [[C:%.+]] = icmp ult i32 %switch.tableidx, 4
+; CHECK-NEXT: %inverted.cmp = xor i1 [[C]], true
+; CHECK:      [[R:%.+]] = select i1 %inverted.cmp, i32 100, i32 {{.*}}
+; CHECK-NEXT: ret i32 [[R]]
+}
+
+; Reuse the table range compare.
+define i32 @reuse_cmp2(i32 %x) {
+entry:
+  switch i32 %x, label %sw.default [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+sw.bb: br label %sw.epilog
+sw.bb1: br label %sw.epilog
+sw.bb2: br label %sw.epilog
+sw.bb3: br label %sw.epilog
+sw.default: br label %sw.epilog
+sw.epilog:
+  %r.0 = phi i32 [ 4, %sw.default ], [ 3, %sw.bb3 ], [ 2, %sw.bb2 ], [ 1, %sw.bb1 ], [ 0, %sw.bb ]
+  %cmp = icmp ne i32 %r.0, 4       ; This compare can be "replaced".
+  br i1 %cmp, label %if.then, label %if.end
+if.then: br label %return
+if.end: br label %return
+return:
+  %retval.0 = phi i32 [ %r.0, %if.then ], [ 100, %if.end ]
+  ret i32 %retval.0
+; CHECK-LABEL: @reuse_cmp2(
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0
+; CHECK-NEXT: [[C:%.+]] = icmp ult i32 %switch.tableidx, 4
+; CHECK:      [[R:%.+]] = select i1 [[C]], i32 {{.*}}, i32 100
+; CHECK-NEXT: ret i32 [[R]]
+}
+
+; Cannot reuse the table range compare, because the default value is the same
+; as one of the case values.
+define i32 @no_reuse_cmp(i32 %x) {
+entry:
+  switch i32 %x, label %sw.default [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+sw.bb: br label %sw.epilog
+sw.bb1: br label %sw.epilog
+sw.bb2: br label %sw.epilog
+sw.bb3: br label %sw.epilog
+sw.default: br label %sw.epilog
+sw.epilog:
+  %r.0 = phi i32 [ 12, %sw.default ], [ 13, %sw.bb3 ], [ 12, %sw.bb2 ], [ 11, %sw.bb1 ], [ 10, %sw.bb ]
+  %cmp = icmp ne i32 %r.0, 0
+  br i1 %cmp, label %if.then, label %if.end
+if.then: br label %return
+if.end: br label %return
+return:
+  %retval.0 = phi i32 [ %r.0, %if.then ], [ 100, %if.end ]
+  ret i32 %retval.0
+; CHECK-LABEL: @no_reuse_cmp(
+; CHECK:  [[S:%.+]] = select 
+; CHECK-NEXT:  %cmp = icmp ne i32 [[S]], 0
+; CHECK-NEXT:  [[R:%.+]] = select i1 %cmp, i32 [[S]], i32 100
+; CHECK-NEXT:  ret i32 [[R]]
+}
+
+; Cannot reuse the table range compare, because the phi at the switch merge
+; point is not dominated by the switch.
+define i32 @no_reuse_cmp2(i32 %x, i32 %y) {
+entry:
+  %ec = icmp ne i32 %y, 0
+  br i1 %ec, label %switch.entry, label %sw.epilog
+switch.entry:
+  switch i32 %x, label %sw.default [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+sw.bb: br label %sw.epilog
+sw.bb1: br label %sw.epilog
+sw.bb2: br label %sw.epilog
+sw.bb3: br label %sw.epilog
+sw.default: br label %sw.epilog
+sw.epilog:
+  %r.0 = phi i32 [100, %entry], [ 0, %sw.default ], [ 13, %sw.bb3 ], [ 12, %sw.bb2 ], [ 11, %sw.bb1 ], [ 10, %sw.bb ]
+  %cmp = icmp eq i32 %r.0, 0       ; This compare can be "replaced".
+  br i1 %cmp, label %if.then, label %if.end
+if.then: br label %return
+if.end: br label %return
+return:
+  %retval.0 = phi i32 [ 100, %if.then ], [ %r.0, %if.end ]
+  ret i32 %retval.0
+; CHECK-LABEL: @no_reuse_cmp2(
+; CHECK:  %r.0 = phi
+; CHECK-NEXT:  %cmp = icmp eq i32 %r.0, 0
+; CHECK-NEXT:  [[R:%.+]] = select i1 %cmp
+; CHECK-NEXT:  ret i32 [[R]]
+}
+
+define void @pr20210(i8 %x, i1 %y) {
+; %z has uses outside of its BB or the phi it feeds into,
+; so doing a table lookup and jumping directly to while.cond would
+; cause %z to cease dominating all its uses.
+
+entry:
+  br i1 %y, label %sw, label %intermediate
+
+sw:
+  switch i8 %x, label %end [
+    i8 7, label %intermediate
+    i8 3, label %intermediate
+    i8 2, label %intermediate
+    i8 1, label %intermediate
+    i8 0, label %intermediate
+  ]
+
+intermediate:
+  %z = zext i8 %x to i32
+  br label %while.cond
+
+while.cond:
+  %i = phi i32 [ %z, %intermediate ], [ %j, %while.body ]
+  %b = icmp ne i32 %i, 7
+  br i1 %b, label %while.body, label %while.end
+
+while.body:
+  %j = add i32 %i, 1
+  br label %while.cond
+
+while.end:
+  call void @exit(i32 %z)
+  unreachable
+
+end:
+  ret void
+; CHECK-LABEL: @pr20210
+; CHECK: switch i8 %x
+}
diff --git a/test/Transforms/SimplifyCFG/basictest.ll b/test/Transforms/SimplifyCFG/basictest.ll
index d6958a9..5d9dad4 100644
--- a/test/Transforms/SimplifyCFG/basictest.ll
+++ b/test/Transforms/SimplifyCFG/basictest.ll
@@ -1,6 +1,7 @@
 ; Test CFG simplify removal of branch instructions.
 ;
 ; RUN: opt < %s -simplifycfg -S | FileCheck %s
+; RUN: opt < %s -passes=simplify-cfg -S | FileCheck %s
 
 define void @test1() {
         br label %1
@@ -68,6 +69,6 @@ bb3:
 }
 declare i8 @test6g(i8*)
 
-!0 = metadata !{metadata !1, metadata !1, i64 0}
-!1 = metadata !{metadata !"foo"}
-!2 = metadata !{i8 0, i8 2}
+!0 = !{!1, !1, i64 0}
+!1 = !{!"foo"}
+!2 = !{i8 0, i8 2}
diff --git a/test/Transforms/SimplifyCFG/branch-fold-dbg.ll b/test/Transforms/SimplifyCFG/branch-fold-dbg.ll
index 9235f62..f715a0c 100644
--- a/test/Transforms/SimplifyCFG/branch-fold-dbg.ll
+++ b/test/Transforms/SimplifyCFG/branch-fold-dbg.ll
@@ -25,7 +25,7 @@ BB2:                                              ; preds = %BB1
 
 BB3:                                              ; preds = %BB2
   %6 = getelementptr inbounds [5 x %0]* @0, i32 0, i32 %0, !dbg !6
-  call void @llvm.dbg.value(metadata !{%0* %6}, i64 0, metadata !7, metadata !{}), !dbg !12
+  call void @llvm.dbg.value(metadata %0* %6, i64 0, metadata !7, metadata !{}), !dbg !12
   %7 = icmp eq %0* %6, null, !dbg !13
   br i1 %7, label %BB5, label %BB4, !dbg !13
 
@@ -41,19 +41,19 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00\00231\000\001\000\006\00256\000\000", metadata !15, metadata !1, metadata !3, null, void (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 231] [def] [scope 0] [foo]
-!1 = metadata !{metadata !"0x29", metadata !15} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang (trunk 129006)\001\00\000\00\000", metadata !15, metadata !4, metadata !4, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !15, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{i32 131, i32 2, metadata !0, null}
-!6 = metadata !{i32 134, i32 2, metadata !0, null}
-!7 = metadata !{metadata !"0x100\00bar\00232\000", metadata !8, metadata !1, metadata !9} ; [ DW_TAG_auto_variable ]
-!8 = metadata !{metadata !"0xb\00231\001\003", metadata !15, metadata !0} ; [ DW_TAG_lexical_block ]
-!9 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, metadata !2, metadata !11} ; [ DW_TAG_const_type ]
-!11 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !2} ; [ DW_TAG_base_type ]
-!12 = metadata !{i32 232, i32 40, metadata !8, null}
-!13 = metadata !{i32 234, i32 2, metadata !8, null}
-!14 = metadata !{i32 274, i32 1, metadata !8, null}
-!15 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
+!0 = !{!"0x2e\00foo\00foo\00\00231\000\001\000\006\00256\000\000", !15, !1, !3, null, void (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 231] [def] [scope 0] [foo]
+!1 = !{!"0x29", !15} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang (trunk 129006)\001\00\000\00\000", !15, !4, !4, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !15, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!5 = !MDLocation(line: 131, column: 2, scope: !0)
+!6 = !MDLocation(line: 134, column: 2, scope: !0)
+!7 = !{!"0x100\00bar\00232\000", !8, !1, !9} ; [ DW_TAG_auto_variable ]
+!8 = !{!"0xb\00231\001\003", !15, !0} ; [ DW_TAG_lexical_block ]
+!9 = !{!"0xf\00\000\0032\0032\000\000", null, !2, !10} ; [ DW_TAG_pointer_type ]
+!10 = !{!"0x26\00\000\000\000\000\000", null, !2, !11} ; [ DW_TAG_const_type ]
+!11 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, !2} ; [ DW_TAG_base_type ]
+!12 = !MDLocation(line: 232, column: 40, scope: !8)
+!13 = !MDLocation(line: 234, column: 2, scope: !8)
+!14 = !MDLocation(line: 274, column: 1, scope: !8)
+!15 = !{!"a.c", !"/private/tmp"}
diff --git a/test/Transforms/SimplifyCFG/clamp.ll b/test/Transforms/SimplifyCFG/clamp.ll
new file mode 100644
index 0000000..d21894a
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/clamp.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+define float @clamp(float %a, float %b, float %c) {
+; CHECK-LABEL: @clamp
+; CHECK:  %cmp = fcmp ogt float %a, %c
+; CHECK:  %cmp1 = fcmp olt float %a, %b
+; CHECK:  %cond = select i1 %cmp1, float %b, float %a
+; CHECK:  %cond5 = select i1 %cmp, float %c, float %cond
+; CHECK:  ret float %cond5
+entry:
+  %cmp = fcmp ogt float %a, %c
+  br i1 %cmp, label %cond.end4, label %cond.false
+
+cond.false:                                       ; preds = %entry
+  %cmp1 = fcmp olt float %a, %b
+  %cond = select i1 %cmp1, float %b, float %a
+  br label %cond.end4
+
+cond.end4:                                        ; preds = %entry, %cond.false
+  %cond5 = phi float [ %cond, %cond.false ], [ %c, %entry ]
+  ret float %cond5
+}
diff --git a/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll b/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
index cc382be..869ce09 100644
--- a/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
+++ b/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
@@ -1,8 +1,8 @@
 ; RUN: opt -simplifycfg -S < %s | FileCheck %s
 
 define i32 @foo(i32 %i) nounwind ssp {
-  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6, metadata !{}), !dbg !7
-  call void @llvm.dbg.value(metadata !8, i64 0, metadata !9, metadata !{}), !dbg !11
+  call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !6, metadata !{}), !dbg !7
+  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !9, metadata !{}), !dbg !11
   %1 = icmp ne i32 %i, 0, !dbg !12
 ;CHECK: call i32 (...)* @bar()
 ;CHECK-NEXT: llvm.dbg.value
@@ -10,12 +10,12 @@ define i32 @foo(i32 %i) nounwind ssp {
 
 ; <label>:2                                       ; preds = %0
   %3 = call i32 (...)* @bar(), !dbg !13
-  call void @llvm.dbg.value(metadata !{i32 %3}, i64 0, metadata !9, metadata !{}), !dbg !13
+  call void @llvm.dbg.value(metadata i32 %3, i64 0, metadata !9, metadata !{}), !dbg !13
   br label %6, !dbg !15
 
 ; <label>:4                                       ; preds = %0
   %5 = call i32 (...)* @bar(), !dbg !16
-  call void @llvm.dbg.value(metadata !{i32 %5}, i64 0, metadata !9, metadata !{}), !dbg !16
+  call void @llvm.dbg.value(metadata i32 %5, i64 0, metadata !9, metadata !{}), !dbg !16
   br label %6, !dbg !18
 
 ; <label>:6                                       ; preds = %4, %2
@@ -32,25 +32,25 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.module.flags = !{!21}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\00256\000\000", metadata !20, metadata !1, metadata !3, null, i32 (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!1 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang\001\00\000\00\000", metadata !20, metadata !8, metadata !8, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x101\00i\0016777218\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 2, i32 13, metadata !0, null}
-!8 = metadata !{i32 0}
-!9 = metadata !{metadata !"0x100\00k\003\000", metadata !10, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{metadata !"0xb\002\0016\000", metadata !20, metadata !0} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 3, i32 12, metadata !10, null}
-!12 = metadata !{i32 4, i32 3, metadata !10, null}
-!13 = metadata !{i32 5, i32 5, metadata !14, null}
-!14 = metadata !{metadata !"0xb\004\0010\001", metadata !20, metadata !10} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{i32 6, i32 3, metadata !14, null}
-!16 = metadata !{i32 7, i32 5, metadata !17, null}
-!17 = metadata !{metadata !"0xb\006\0010\002", metadata !20, metadata !10} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{i32 8, i32 3, metadata !17, null}
-!19 = metadata !{i32 9, i32 3, metadata !10, null}
-!20 = metadata !{metadata !"b.c", metadata !"/private/tmp"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00foo\00foo\00\002\000\001\000\006\00256\000\000", !20, !1, !3, null, i32 (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!1 = !{!"0x29", !20} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang\001\00\000\00\000", !20, !8, !8, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !20, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0x101\00i\0016777218\000", !0, !1, !5} ; [ DW_TAG_arg_variable ]
+!7 = !MDLocation(line: 2, column: 13, scope: !0)
+!8 = !{i32 0}
+!9 = !{!"0x100\00k\003\000", !10, !1, !5} ; [ DW_TAG_auto_variable ]
+!10 = !{!"0xb\002\0016\000", !20, !0} ; [ DW_TAG_lexical_block ]
+!11 = !MDLocation(line: 3, column: 12, scope: !10)
+!12 = !MDLocation(line: 4, column: 3, scope: !10)
+!13 = !MDLocation(line: 5, column: 5, scope: !14)
+!14 = !{!"0xb\004\0010\001", !20, !10} ; [ DW_TAG_lexical_block ]
+!15 = !MDLocation(line: 6, column: 3, scope: !14)
+!16 = !MDLocation(line: 7, column: 5, scope: !17)
+!17 = !{!"0xb\006\0010\002", !20, !10} ; [ DW_TAG_lexical_block ]
+!18 = !MDLocation(line: 8, column: 3, scope: !17)
+!19 = !MDLocation(line: 9, column: 3, scope: !10)
+!20 = !{!"b.c", !"/private/tmp"}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/SimplifyCFG/hoist-with-range.ll b/test/Transforms/SimplifyCFG/hoist-with-range.ll
index 362aa9a..7ca3ff2 100644
--- a/test/Transforms/SimplifyCFG/hoist-with-range.ll
+++ b/test/Transforms/SimplifyCFG/hoist-with-range.ll
@@ -3,7 +3,7 @@
 define void @foo(i1 %c, i8* %p) {
 ; CHECK: if:
 ; CHECK-NEXT: load i8* %p, !range !0
-; CHECK: !0 = metadata !{i8 0, i8 1, i8 3, i8 5}
+; CHECK: !0 = !{i8 0, i8 1, i8 3, i8 5}
 if:
   br i1 %c, label %then, label %else
 then:
@@ -16,5 +16,5 @@ out:
   ret void
 }
 
-!0 = metadata !{ i8 0, i8 1 }
-!1 = metadata !{ i8 3, i8 5 }
+!0 = !{ i8 0, i8 1 }
+!1 = !{ i8 3, i8 5 }
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll b/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll
index 8cc07e3..b2b3841 100644
--- a/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll
@@ -34,4 +34,4 @@ if.end:
   ret void
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 1, i32 0}
+!0 = !{!"branch_weights", i32 1, i32 0}
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll b/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll
index 941f5ad..32a30c3 100644
--- a/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll
@@ -129,12 +129,12 @@ sw.epilog:
   ret void
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
-!1 = metadata !{metadata !"branch_weights", i32 4, i32 64}
-; CHECK: !0 = metadata !{metadata !"branch_weights", i32 256, i32 4352, i32 16}
-!2 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 8}
-!3 = metadata !{metadata !"branch_weights", i32 8, i32 8, i32 4}
-; CHECK: !1 = metadata !{metadata !"branch_weights", i32 32, i32 48, i32 96, i32 16}
-!4 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 3}
-!5 = metadata !{metadata !"branch_weights", i32 17, i32 13, i32 9}
-; CHECK: !3 = metadata !{metadata !"branch_weights", i32 7, i32 3, i32 4, i32 6}
+!0 = !{!"branch_weights", i32 64, i32 4}
+!1 = !{!"branch_weights", i32 4, i32 64}
+; CHECK: !0 = !{!"branch_weights", i32 256, i32 4352, i32 16}
+!2 = !{!"branch_weights", i32 4, i32 4, i32 8}
+!3 = !{!"branch_weights", i32 8, i32 8, i32 4}
+; CHECK: !1 = !{!"branch_weights", i32 32, i32 48, i32 96, i32 16}
+!4 = !{!"branch_weights", i32 7, i32 6, i32 4, i32 3}
+!5 = !{!"branch_weights", i32 17, i32 13, i32 9}
+; CHECK: !3 = !{!"branch_weights", i32 7, i32 3, i32 4, i32 6}
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights.ll b/test/Transforms/SimplifyCFG/preserve-branchweights.ll
index bdd25ba..7802a05 100644
--- a/test/Transforms/SimplifyCFG/preserve-branchweights.ll
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights.ll
@@ -364,29 +364,29 @@ for.exit:
   ret void
 }
 
-!0 = metadata !{metadata !"branch_weights", i32 3, i32 5}
-!1 = metadata !{metadata !"branch_weights", i32 1, i32 1}
-!2 = metadata !{metadata !"branch_weights", i32 1, i32 2}
-!3 = metadata !{metadata !"branch_weights", i32 4, i32 3, i32 2, i32 1}
-!4 = metadata !{metadata !"branch_weights", i32 4, i32 3, i32 2, i32 1}
-!5 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 5}
-!6 = metadata !{metadata !"branch_weights", i32 1, i32 3}
-!7 = metadata !{metadata !"branch_weights", i32 33, i32 9, i32 8, i32 7}
-!8 = metadata !{metadata !"branch_weights", i32 33, i32 9, i32 8}
-!9 = metadata !{metadata !"branch_weights", i32 7, i32 6}
-!10 = metadata !{metadata !"branch_weights", i32 672646, i32 21604207}
-!11 = metadata !{metadata !"branch_weights", i32 6960, i32 21597248}
-
-; CHECK: !0 = metadata !{metadata !"branch_weights", i32 5, i32 11}
-; CHECK: !1 = metadata !{metadata !"branch_weights", i32 1, i32 5}
-; CHECK: !2 = metadata !{metadata !"branch_weights", i32 7, i32 1, i32 2}
-; CHECK: !3 = metadata !{metadata !"branch_weights", i32 49, i32 12, i32 24, i32 35}
-; CHECK: !4 = metadata !{metadata !"branch_weights", i32 11, i32 5}
-; CHECK: !5 = metadata !{metadata !"branch_weights", i32 17, i32 15} 
-; CHECK: !6 = metadata !{metadata !"branch_weights", i32 9, i32 7}
-; CHECK: !7 = metadata !{metadata !"branch_weights", i32 17, i32 9, i32 8, i32 7, i32 17}
-; CHECK: !8 = metadata !{metadata !"branch_weights", i32 24, i32 33}
-; CHECK: !9 = metadata !{metadata !"branch_weights", i32 8, i32 33}
+!0 = !{!"branch_weights", i32 3, i32 5}
+!1 = !{!"branch_weights", i32 1, i32 1}
+!2 = !{!"branch_weights", i32 1, i32 2}
+!3 = !{!"branch_weights", i32 4, i32 3, i32 2, i32 1}
+!4 = !{!"branch_weights", i32 4, i32 3, i32 2, i32 1}
+!5 = !{!"branch_weights", i32 7, i32 6, i32 5}
+!6 = !{!"branch_weights", i32 1, i32 3}
+!7 = !{!"branch_weights", i32 33, i32 9, i32 8, i32 7}
+!8 = !{!"branch_weights", i32 33, i32 9, i32 8}
+!9 = !{!"branch_weights", i32 7, i32 6}
+!10 = !{!"branch_weights", i32 672646, i32 21604207}
+!11 = !{!"branch_weights", i32 6960, i32 21597248}
+
+; CHECK: !0 = !{!"branch_weights", i32 5, i32 11}
+; CHECK: !1 = !{!"branch_weights", i32 1, i32 5}
+; CHECK: !2 = !{!"branch_weights", i32 7, i32 1, i32 2}
+; CHECK: !3 = !{!"branch_weights", i32 49, i32 12, i32 24, i32 35}
+; CHECK: !4 = !{!"branch_weights", i32 11, i32 5}
+; CHECK: !5 = !{!"branch_weights", i32 17, i32 15} 
+; CHECK: !6 = !{!"branch_weights", i32 9, i32 7}
+; CHECK: !7 = !{!"branch_weights", i32 17, i32 9, i32 8, i32 7, i32 17}
+; CHECK: !8 = !{!"branch_weights", i32 24, i32 33}
+; CHECK: !9 = !{!"branch_weights", i32 8, i32 33}
 ;; The false weight prints out as a negative integer here, but inside llvm, we
 ;; treat the weight as an unsigned integer.
-; CHECK: !10 = metadata !{metadata !"branch_weights", i32 112017436, i32 -735157296}
+; CHECK: !10 = !{!"branch_weights", i32 112017436, i32 -735157296}
diff --git a/test/Transforms/SimplifyCFG/seh-nounwind.ll b/test/Transforms/SimplifyCFG/seh-nounwind.ll
new file mode 100644
index 0000000..3845e31
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/seh-nounwind.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -simplifycfg < %s | FileCheck %s
+
+; Don't remove invokes of nounwind functions if the personality handles async
+; exceptions. The @div function in this test can fault, even though it can't
+; throw a synchronous exception.
+
+define i32 @div(i32 %n, i32 %d) nounwind {
+entry:
+  %div = sdiv i32 %n, %d
+  ret i32 %div
+}
+
+define i32 @main() nounwind {
+entry:
+  %call = invoke i32 @div(i32 10, i32 0)
+          to label %__try.cont unwind label %lpad
+
+lpad:
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+          catch i8* null
+  br label %__try.cont
+
+__try.cont:
+  %retval.0 = phi i32 [ %call, %entry ], [ 0, %lpad ]
+  ret i32 %retval.0
+}
+
+; CHECK-LABEL: define i32 @main()
+; CHECK: invoke i32 @div(i32 10, i32 0)
+
+declare i32 @__C_specific_handler(...)
diff --git a/test/Transforms/SimplifyCFG/select-gep.ll b/test/Transforms/SimplifyCFG/select-gep.ll
index 96c214c..43e46ca 100644
--- a/test/Transforms/SimplifyCFG/select-gep.ll
+++ b/test/Transforms/SimplifyCFG/select-gep.ll
@@ -1,27 +1,8 @@
 ; RUN: opt -S -simplifycfg < %s | FileCheck %s
 
-define i8* @test1(i8* %x, i64 %y) nounwind {
-entry:
-  %tmp1 = load i8* %x, align 1
-  %cmp = icmp eq i8 %tmp1, 47
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:
-  %incdec.ptr = getelementptr inbounds i8* %x, i64 %y
-  br label %if.end
-
-if.end:
-  %x.addr = phi i8* [ %incdec.ptr, %if.then ], [ %x, %entry ]
-  ret i8* %x.addr
-
-; CHECK-LABEL: @test1(
-; CHECK-NOT: select
-; CHECK: ret i8* %x.addr
-}
-
 %ST = type { i8, i8 }
 
-define i8* @test2(%ST* %x, i8* %y) nounwind {
+define i8* @test1(%ST* %x, i8* %y) nounwind {
 entry:
   %cmp = icmp eq %ST* %x, null
   br i1 %cmp, label %if.then, label %if.end
@@ -34,7 +15,7 @@ if.end:
   %x.addr = phi i8* [ %incdec.ptr, %if.then ], [ %y, %entry ]
   ret i8* %x.addr
 
-; CHECK-LABEL: @test2(
+; CHECK-LABEL: @test1(
 ; CHECK: %incdec.ptr.y = select i1 %cmp, i8* %incdec.ptr, i8* %y
 ; CHECK: ret i8* %incdec.ptr.y
 }
diff --git a/test/Transforms/SimplifyCFG/sink-common-code.ll b/test/Transforms/SimplifyCFG/sink-common-code.ll
index 28d7279..cdb6ed2 100644
--- a/test/Transforms/SimplifyCFG/sink-common-code.ll
+++ b/test/Transforms/SimplifyCFG/sink-common-code.ll
@@ -4,7 +4,7 @@ define zeroext i1 @test1(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
 entry:
   br i1 %flag, label %if.then, label %if.else
 
-; CHECK: test1
+; CHECK-LABEL: test1
 ; CHECK: add
 ; CHECK: select
 ; CHECK: icmp
@@ -30,7 +30,7 @@ define zeroext i1 @test2(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
 entry:
   br i1 %flag, label %if.then, label %if.else
 
-; CHECK: test2
+; CHECK-LABEL: test2
 ; CHECK: add
 ; CHECK: select
 ; CHECK: icmp
@@ -51,3 +51,33 @@ if.end:
   %tobool4 = icmp ne i8 %obeys.0, 0
   ret i1 %tobool4
 }
+
+declare i32 @foo(i32, i32) nounwind readnone
+
+define i32 @test3(i1 zeroext %flag, i32 %x, i32 %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %x0 = call i32 @foo(i32 %x, i32 0) nounwind readnone
+  %y0 = call i32 @foo(i32 %x, i32 1) nounwind readnone
+  br label %if.end
+
+if.else:
+  %x1 = call i32 @foo(i32 %y, i32 0) nounwind readnone
+  %y1 = call i32 @foo(i32 %y, i32 1) nounwind readnone
+  br label %if.end
+
+if.end:
+  %xx = phi i32 [ %x0, %if.then ], [ %x1, %if.else ]
+  %yy = phi i32 [ %y0, %if.then ], [ %y1, %if.else ]
+  %ret = add i32 %xx, %yy
+  ret i32 %ret
+}
+
+; CHECK-LABEL: test3
+; CHECK: select
+; CHECK: call
+; CHECK: call
+; CHECK: add
+; CHECK-NOT: br
diff --git a/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll b/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
new file mode 100644
index 0000000..a109b31
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
@@ -0,0 +1,77 @@
+; RUN: opt %s -simplifycfg -S | FileCheck %s
+
+declare i32 @f(i32)
+
+define i32 @basic(i32 %x) {
+; CHECK-LABEL: @basic
+; CHECK: x.off = add i32 %x, -5
+; CHECK: %switch = icmp ult i32 %x.off, 3
+; CHECK: br i1 %switch, label %a, label %default
+
+entry:
+  switch i32 %x, label %default [
+    i32 5, label %a
+    i32 6, label %a
+    i32 7, label %a
+  ]
+default:
+  %0 = call i32 @f(i32 0)
+  ret i32 %0
+a:
+  %1 = call i32 @f(i32 1)
+  ret i32 %1
+}
+
+
+define i32 @unreachable(i32 %x) {
+; CHECK-LABEL: @unreachable
+; CHECK: x.off = add i32 %x, -5
+; CHECK: %switch = icmp ult i32 %x.off, 3
+; CHECK: br i1 %switch, label %a, label %b
+
+entry:
+  switch i32 %x, label %unreachable [
+    i32 5, label %a
+    i32 6, label %a
+    i32 7, label %a
+    i32 10, label %b
+    i32 20, label %b
+    i32 30, label %b
+    i32 40, label %b
+  ]
+unreachable:
+  unreachable
+a:
+  %0 = call i32 @f(i32 0)
+  ret i32 %0
+b:
+  %1 = call i32 @f(i32 1)
+  ret i32 %1
+}
+
+
+define i32 @unreachable2(i32 %x) {
+; CHECK-LABEL: @unreachable2
+; CHECK: x.off = add i32 %x, -5
+; CHECK: %switch = icmp ult i32 %x.off, 3
+; CHECK: br i1 %switch, label %a, label %b
+
+entry:
+  ; Note: folding the most popular case destination into the default
+  ; would prevent switch-to-icmp here.
+  switch i32 %x, label %unreachable [
+    i32 5, label %a
+    i32 6, label %a
+    i32 7, label %a
+    i32 10, label %b
+    i32 20, label %b
+  ]
+unreachable:
+  unreachable
+a:
+  %0 = call i32 @f(i32 0)
+  ret i32 %0
+b:
+  %1 = call i32 @f(i32 1)
+  ret i32 %1
+}
diff --git a/test/Transforms/SimplifyCFG/switch-to-br.ll b/test/Transforms/SimplifyCFG/switch-to-br.ll
new file mode 100644
index 0000000..01484cd
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/switch-to-br.ll
@@ -0,0 +1,64 @@
+; RUN: opt %s -simplifycfg -S | FileCheck %s
+
+declare i32 @f(i32)
+
+define i32 @basic(i32 %x) {
+; CHECK-LABEL: @basic
+; CHECK-LABEL: entry:
+; CHECK-NEXT:  call i32 @f(i32 0)
+; CHECK-NEXT:  ret i32 %0
+
+entry:
+  switch i32 %x, label %default [
+    i32 5, label %default
+    i32 6, label %default
+    i32 7, label %default
+  ]
+default:
+  %0 = call i32 @f(i32 0)
+  ret i32 %0
+}
+
+
+define i32 @constant() {
+; CHECK-LABEL: @constant
+; CHECK-LABEL: entry:
+; CHECK-NEXT:  call i32 @f(i32 1)
+; CHECK-NEXT:  ret i32 %0
+
+entry:
+  switch i32 42, label %default [
+    i32 41, label %default
+    i32 42, label %a
+    i32 43, label %b
+  ]
+default:
+  %0 = call i32 @f(i32 0)
+  ret i32 %0
+a:
+  %1 = call i32 @f(i32 1)
+  ret i32 %1
+b:
+  %2 = call i32 @f(i32 2)
+  ret i32 %2
+}
+
+
+define i32 @unreachable(i32 %x) {
+; CHECK-LABEL: @unreachable
+; CHECK-LABEL: entry:
+; CHECK-NEXT:  call i32 @f(i32 0)
+; CHECK-NEXT:  ret i32 %0
+
+entry:
+  switch i32 %x, label %unreachable [
+    i32 5, label %a
+    i32 6, label %a
+    i32 7, label %a
+  ]
+unreachable:
+  unreachable
+a:
+  %0 = call i32 @f(i32 0)
+  ret i32 %0
+}
diff --git a/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll b/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
index 69f97e5..f4d171a 100644
--- a/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
+++ b/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
@@ -35,38 +35,3 @@ return:
   %retval.0 = phi i32 [ 4, %sw.epilog ], [ 2, %sw.bb1 ], [ 10, %sw.bb ]
   ret i32 %retval.0
 }
-
-; int foo1_without_default(int a) {
-;   switch(a) {
-;     case 10:
-;       return 10;
-;     case 20:
-;       return 2;
-;   }
-;   __builtin_unreachable();
-; }
-
-define i32 @foo1_without_default(i32 %a) {
-; CHECK-LABEL: @foo1_without_default
-; CHECK: %switch.selectcmp = icmp eq i32 %a, 10
-; CHECK-NEXT: %switch.select = select i1 %switch.selectcmp, i32 10, i32 2
-; CHECK-NOT: %switch.selectcmp1
-entry:
-  switch i32 %a, label %sw.epilog [
-    i32 10, label %sw.bb
-    i32 20, label %sw.bb1
-  ]
-
-sw.bb:
-  br label %return
-
-sw.bb1:
-  br label %return
-
-sw.epilog:
-  unreachable
-
-return:
-  %retval.0 = phi i32 [ 2, %sw.bb1 ], [ 10, %sw.bb ]
-  ret i32 %retval.0
-}
diff --git a/test/Transforms/SimplifyCFG/trap-debugloc.ll b/test/Transforms/SimplifyCFG/trap-debugloc.ll
index adf4215..24a286f 100644
--- a/test/Transforms/SimplifyCFG/trap-debugloc.ll
+++ b/test/Transforms/SimplifyCFG/trap-debugloc.ll
@@ -11,14 +11,14 @@ define void @foo() nounwind ssp {
 !llvm.module.flags = !{!10}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\000\000\000", metadata !8, metadata !1, metadata !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [foo]
-!1 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-206.1) (based on LLVM 3.0svn)\001\00\000\00\000", metadata !8, metadata !4, metadata !4, metadata !9, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !8, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{i32 4, i32 2, metadata !6, null}
-!6 = metadata !{metadata !"0xb\003\0012\000", metadata !8, metadata !0} ; [ DW_TAG_lexical_block ]
-!7 = metadata !{i32 5, i32 1, metadata !6, null}
-!8 = metadata !{metadata !"foo.c", metadata !"/private/tmp"}
-!9 = metadata !{metadata !0}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00foo\00foo\00\003\000\001\000\006\000\000\000", !8, !1, !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [foo]
+!1 = !{!"0x29", !8} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-206.1) (based on LLVM 3.0svn)\001\00\000\00\000", !8, !4, !4, !9, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !8, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!5 = !MDLocation(line: 4, column: 2, scope: !6)
+!6 = !{!"0xb\003\0012\000", !8, !0} ; [ DW_TAG_lexical_block ]
+!7 = !MDLocation(line: 5, column: 1, scope: !6)
+!8 = !{!"foo.c", !"/private/tmp"}
+!9 = !{!0}
+!10 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/SimplifyCFG/trivial-throw.ll b/test/Transforms/SimplifyCFG/trivial-throw.ll
deleted file mode 100644
index ca2b569..0000000
--- a/test/Transforms/SimplifyCFG/trivial-throw.ll
+++ /dev/null
@@ -1,77 +0,0 @@
-; RUN: opt -simplifycfg -S < %s | FileCheck %s
-; <rdar://problem/13360379>
-
-@_ZTVN10__cxxabiv117__class_type_infoE = external global i8*
-@_ZTS13TestException = linkonce_odr constant [16 x i8] c"13TestException\00"
-@_ZTI13TestException = linkonce_odr unnamed_addr constant { i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8** @_ZTVN10__cxxabiv117__class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([16 x i8]* @_ZTS13TestException, i32 0, i32 0) }
-
-define void @throw(i32 %n) #0 {
-entry:
-  %exception = call i8* @__cxa_allocate_exception(i64 1) #4
-  call void @__cxa_throw(i8* %exception, i8* bitcast ({ i8*, i8* }* @_ZTI13TestException to i8*), i8* null) #2
-  unreachable
-}
-
-define void @func() #0 {
-entry:
-; CHECK: func()
-; CHECK: invoke void @throw
-; CHECK-NOT: call void @throw
-  invoke void @throw(i32 42) #0
-          to label %exit unwind label %lpad
-
-lpad:
-  %tmp0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
-          cleanup
-  resume { i8*, i32 } %tmp0
-
-exit:
-  invoke void @abort() #2
-          to label %invoke.cont unwind label %lpad1
-
-invoke.cont:
-  unreachable
-
-lpad1:
-  %tmp1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
-          catch i8* bitcast ({ i8*, i8* }* @_ZTI13TestException to i8*)
-  %tmp2 = extractvalue { i8*, i32 } %tmp1, 1
-  %tmp3 = call i32 @llvm.eh.typeid.for(i8* bitcast ({ i8*, i8* }* @_ZTI13TestException to i8*)) #4
-  %matches = icmp eq i32 %tmp2, %tmp3
-  br i1 %matches, label %catch, label %eh.resume
-
-catch:
-  ret void
-
-eh.resume:
-  resume { i8*, i32 } %tmp1
-}
-
-define linkonce_odr hidden void @__clang_call_terminate(i8*) #1 {
-  %2 = call i8* @__cxa_begin_catch(i8* %0) #4
-  call void @_ZSt9terminatev() #5
-  unreachable
-}
-
-declare void @abort() #2
-
-declare i32 @llvm.eh.typeid.for(i8*) #3
-
-declare void @__cxa_end_catch()
-
-declare i8* @__cxa_allocate_exception(i64)
-
-declare i32 @__gxx_personality_v0(...)
-
-declare void @__cxa_throw(i8*, i8*, i8*)
-
-declare i8* @__cxa_begin_catch(i8*)
-
-declare void @_ZSt9terminatev()
-
-attributes #0 = { ssp uwtable }
-attributes #1 = { noinline noreturn nounwind }
-attributes #2 = { noreturn }
-attributes #3 = { nounwind readnone }
-attributes #4 = { nounwind }
-attributes #5 = { noreturn nounwind }
diff --git a/test/Transforms/SimplifyCFG/volatile-phioper.ll b/test/Transforms/SimplifyCFG/volatile-phioper.ll
index 1ef3a7c..6367451 100644
--- a/test/Transforms/SimplifyCFG/volatile-phioper.ll
+++ b/test/Transforms/SimplifyCFG/volatile-phioper.ll
@@ -45,4 +45,4 @@ attributes #0 = { nounwind ssp uwtable "fp-contract-model"="standard" "no-frame-
 attributes #1 = { "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "relocation-model"="pic" "ssp-buffers-size"="8" }
 attributes #2 = { nounwind }
 
-!0 = metadata !{i32 1039}
+!0 = !{i32 1039}
diff --git a/test/Transforms/StraightLineStrengthReduce/slsr.ll b/test/Transforms/StraightLineStrengthReduce/slsr.ll
new file mode 100644
index 0000000..951cbb0
--- /dev/null
+++ b/test/Transforms/StraightLineStrengthReduce/slsr.ll
@@ -0,0 +1,119 @@
+; RUN: opt < %s -slsr -gvn -dce -S | FileCheck %s
+
+declare i32 @foo(i32 %a)
+
+define i32 @slsr1(i32 %b, i32 %s) {
+; CHECK-LABEL: @slsr1(
+  ; v0 = foo(b * s);
+  %mul0 = mul i32 %b, %s
+; CHECK: mul i32
+; CHECK-NOT: mul i32
+  %v0 = call i32 @foo(i32 %mul0)
+
+  ; v1 = foo((b + 1) * s);
+  %b1 = add i32 %b, 1
+  %mul1 = mul i32 %b1, %s
+  %v1 = call i32 @foo(i32 %mul1)
+
+  ; v2 = foo((b + 2) * s);
+  %b2 = add i32 %b, 2
+  %mul2 = mul i32 %b2, %s
+  %v2 = call i32 @foo(i32 %mul2)
+
+  ; return v0 + v1 + v2;
+  %1 = add i32 %v0, %v1
+  %2 = add i32 %1, %v2
+  ret i32 %2
+}
+
+; v0 = foo(a * b)
+; v1 = foo((a + 1) * b)
+; v2 = foo(a * (b + 1))
+; v3 = foo((a + 1) * (b + 1))
+define i32 @slsr2(i32 %a, i32 %b) {
+; CHECK-LABEL: @slsr2(
+  %a1 = add i32 %a, 1
+  %b1 = add i32 %b, 1
+  %mul0 = mul i32 %a, %b
+; CHECK: mul i32
+; CHECK-NOT: mul i32
+  %mul1 = mul i32 %a1, %b
+  %mul2 = mul i32 %a, %b1
+  %mul3 = mul i32 %a1, %b1
+
+  %v0 = call i32 @foo(i32 %mul0)
+  %v1 = call i32 @foo(i32 %mul1)
+  %v2 = call i32 @foo(i32 %mul2)
+  %v3 = call i32 @foo(i32 %mul3)
+
+  %1 = add i32 %v0, %v1
+  %2 = add i32 %1, %v2
+  %3 = add i32 %2, %v3
+  ret i32 %3
+}
+
+; The bump is a multiple of the stride.
+;
+; v0 = foo(b * s);
+; v1 = foo((b + 2) * s);
+; v2 = foo((b + 4) * s);
+; return v0 + v1 + v2;
+;
+; ==>
+;
+; mul0 = b * s;
+; v0 = foo(mul0);
+; bump = s * 2;
+; mul1 = mul0 + bump; // GVN ensures mul1 and mul2 use the same bump.
+; v1 = foo(mul1);
+; mul2 = mul1 + bump;
+; v2 = foo(mul2);
+; return v0 + v1 + v2;
+define i32 @slsr3(i32 %b, i32 %s) {
+; CHECK-LABEL: @slsr3(
+  %mul0 = mul i32 %b, %s
+; CHECK: mul i32
+  %v0 = call i32 @foo(i32 %mul0)
+
+  %b1 = add i32 %b, 2
+  %mul1 = mul i32 %b1, %s
+; CHECK: [[BUMP:%[a-zA-Z0-9]+]] = mul i32 %s, 2
+; CHECK: %mul1 = add i32 %mul0, [[BUMP]]
+  %v1 = call i32 @foo(i32 %mul1)
+
+  %b2 = add i32 %b, 4
+  %mul2 = mul i32 %b2, %s
+; CHECK: %mul2 = add i32 %mul1, [[BUMP]]
+  %v2 = call i32 @foo(i32 %mul2)
+
+  %1 = add i32 %v0, %v1
+  %2 = add i32 %1, %v2
+  ret i32 %2
+}
+
+; Do not rewrite a candidate if its potential basis does not dominate it.
+; v0 = 0;
+; if (cond)
+;   v0 = foo(a * b);
+; v1 = foo((a + 1) * b);
+; return v0 + v1;
+define i32 @not_dominate(i1 %cond, i32 %a, i32 %b) {
+; CHECK-LABEL: @not_dominate(
+entry:
+  %a1 = add i32 %a, 1
+  br i1 %cond, label %then, label %merge
+
+then:
+  %mul0 = mul i32 %a, %b
+; CHECK: %mul0 = mul i32 %a, %b
+  %v0 = call i32 @foo(i32 %mul0)
+  br label %merge
+
+merge:
+  %v0.phi = phi i32 [ 0, %entry ], [ %mul0, %then ]
+  %mul1 = mul i32 %a1, %b
+; CHECK: %mul1 = mul i32 %a1, %b
+  %v1 = call i32 @foo(i32 %mul1)
+  %sum = add i32 %v0.phi, %v1
+  ret i32 %sum
+}
diff --git a/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll b/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll
index 6100a6a..f2c705a 100644
--- a/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll
+++ b/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll
@@ -6,7 +6,7 @@
 
 define void @foo() nounwind readnone optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !9, i64 0, metadata !5, metadata !{}), !dbg !10
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !5, metadata !{}), !dbg !10
   ret void, !dbg !11
 }
 
@@ -18,17 +18,17 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !llvm.dbg.lv.foo = !{!5}
 !llvm.dbg.gv = !{!8}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\001\000", metadata !12, metadata !1, metadata !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !12, metadata !4, metadata !4, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{metadata !"0x100\00y\003\000", metadata !6, metadata !1, metadata !7} ; [ DW_TAG_auto_variable ]
-!6 = metadata !{metadata !"0xb\002\000\000", metadata !12, metadata !0} ; [ DW_TAG_lexical_block ]
-!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !1} ; [ DW_TAG_base_type ]
-!8 = metadata !{metadata !"0x34\00x\00x\00\001\000\001", metadata !1, metadata !1, metadata !7, i32* @x} ; [ DW_TAG_variable ]
-!9 = metadata !{i32 0}
-!10 = metadata !{i32 3, i32 0, metadata !6, null}
-!11 = metadata !{i32 4, i32 0, metadata !6, null}
-!12 = metadata !{metadata !"b.c", metadata !"/tmp"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\001\000", !12, !1, !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !12} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !12, !4, !4, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !12, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{null}
+!5 = !{!"0x100\00y\003\000", !6, !1, !7} ; [ DW_TAG_auto_variable ]
+!6 = !{!"0xb\002\000\000", !12, !0} ; [ DW_TAG_lexical_block ]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", !12, !1} ; [ DW_TAG_base_type ]
+!8 = !{!"0x34\00x\00x\00\001\000\001", !1, !1, !7, i32* @x} ; [ DW_TAG_variable ]
+!9 = !{i32 0}
+!10 = !MDLocation(line: 3, scope: !6)
+!11 = !MDLocation(line: 4, scope: !6)
+!12 = !{!"b.c", !"/tmp"}
+!13 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/StripSymbols/2010-08-25-crash.ll b/test/Transforms/StripSymbols/2010-08-25-crash.ll
index c211dc1..1534647 100644
--- a/test/Transforms/StripSymbols/2010-08-25-crash.ll
+++ b/test/Transforms/StripSymbols/2010-08-25-crash.ll
@@ -7,18 +7,18 @@ entry:
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!14}
 
-!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\003\000\001\000\006\000\000\000", metadata !10, metadata !1, metadata !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
-!1 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
-!2 = metadata !{metadata !"0x11\0012\00clang version 2.8 (trunk 112062)\001\00\000\00\001", metadata !10, metadata !11, metadata !11, metadata !12, metadata !13, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !10, metadata !1} ; [ DW_TAG_base_type ]
-!6 = metadata !{metadata !"0x34\00i\00i\00i\002\001\001", metadata !1, metadata !1, metadata !7, i32 0, null} ; [ DW_TAG_variable ]
-!7 = metadata !{metadata !"0x26\00\000\000\000\000\000", metadata !10, metadata !1, metadata !5} ; [ DW_TAG_const_type ]
-!8 = metadata !{i32 3, i32 13, metadata !9, null}
-!9 = metadata !{metadata !"0xb\003\0011\000", metadata !10, metadata !0} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{metadata !"/tmp/a.c", metadata !"/Volumes/Lalgate/clean/D.CW"}
-!11 = metadata !{i32 0}
-!12 = metadata !{metadata !0}
-!13 = metadata !{metadata !6}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x2e\00foo\00foo\00foo\003\000\001\000\006\000\000\000", !10, !1, !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = !{!"0x29", !10} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\0012\00clang version 2.8 (trunk 112062)\001\00\000\00\001", !10, !11, !11, !12, !13, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", !10, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5}
+!5 = !{!"0x24\00int\000\0032\0032\000\000\005", !10, !1} ; [ DW_TAG_base_type ]
+!6 = !{!"0x34\00i\00i\00i\002\001\001", !1, !1, !7, i32 0, null} ; [ DW_TAG_variable ]
+!7 = !{!"0x26\00\000\000\000\000\000", !10, !1, !5} ; [ DW_TAG_const_type ]
+!8 = !MDLocation(line: 3, column: 13, scope: !9)
+!9 = !{!"0xb\003\0011\000", !10, !0} ; [ DW_TAG_lexical_block ]
+!10 = !{!"/tmp/a.c", !"/Volumes/Lalgate/clean/D.CW"}
+!11 = !{i32 0}
+!12 = !{!0}
+!13 = !{!6}
+!14 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/StripSymbols/strip-dead-debug-info.ll b/test/Transforms/StripSymbols/strip-dead-debug-info.ll
index 04a3f32..aca7cd6 100644
--- a/test/Transforms/StripSymbols/strip-dead-debug-info.ll
+++ b/test/Transforms/StripSymbols/strip-dead-debug-info.ll
@@ -18,7 +18,7 @@ entry:
 ; Function Attrs: nounwind readonly ssp
 define i32 @foo(i32 %i) #2 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !15, metadata !{}), !dbg !20
+  tail call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !15, metadata !{}), !dbg !20
   %.0 = load i32* @xyz, align 4
   ret i32 %.0, !dbg !21
 }
@@ -30,29 +30,29 @@ attributes #2 = { nounwind readonly ssp }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!25}
 
-!0 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !23, metadata !24, null} ; [ DW_TAG_compile_unit ] [/tmp//g.c] [DW_LANG_C89]
-!1 = metadata !{metadata !"g.c", metadata !"/tmp/"}
-!2 = metadata !{null}
-!3 = metadata !{metadata !"0x2e\00bar\00bar\00\005\001\001\000\006\000\001\000", metadata !1, null, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [scope 0] [bar]
-!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !1, metadata !5, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp//g.c]
-!6 = metadata !{metadata !"0x2e\00fn\00fn\00fn\006\000\001\000\006\000\001\000", metadata !1, null, metadata !7, null, i32 ()* @fn, null, null, null} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [fn]
-!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !1, metadata !5, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !1, metadata !5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{metadata !"0x2e\00foo\00foo\00foo\007\000\001\000\006\000\001\000", metadata !1, null, metadata !11, null, i32 (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 0] [foo]
-!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !1, metadata !5, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !9, metadata !9}
-!13 = metadata !{metadata !"0x100\00bb\005\000", metadata !14, metadata !5, metadata !9} ; [ DW_TAG_auto_variable ]
-!14 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !3} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
-!15 = metadata !{metadata !"0x101\00i\007\000", metadata !10, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{metadata !"0x34\00abcd\00abcd\00\002\001\001", metadata !5, metadata !5, metadata !9, null, null} ; [ DW_TAG_variable ]
-!17 = metadata !{metadata !"0x34\00xyz\00xyz\00\003\000\001", metadata !5, metadata !5, metadata !9, i32* @xyz, null} ; [ DW_TAG_variable ]
-!18 = metadata !{i32 6, i32 0, metadata !19, null}
-!19 = metadata !{metadata !"0xb\006\000\000", metadata !1, metadata !6} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
-!20 = metadata !{i32 7, i32 0, metadata !10, null}
-!21 = metadata !{i32 10, i32 0, metadata !22, null}
-!22 = metadata !{metadata !"0xb\007\000\000", metadata !1, metadata !10} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
-!23 = metadata !{metadata !3, metadata !6, metadata !10}
-!24 = metadata !{metadata !16, metadata !17}
-!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!0 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", !1, !2, !2, !23, !24, null} ; [ DW_TAG_compile_unit ] [/tmp//g.c] [DW_LANG_C89]
+!1 = !{!"g.c", !"/tmp/"}
+!2 = !{null}
+!3 = !{!"0x2e\00bar\00bar\00\005\001\001\000\006\000\001\000", !1, null, !4, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [scope 0] [bar]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !1, !5, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [/tmp//g.c]
+!6 = !{!"0x2e\00fn\00fn\00fn\006\000\001\000\006\000\001\000", !1, null, !7, null, i32 ()* @fn, null, null, null} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [fn]
+!7 = !{!"0x15\00\000\000\000\000\000\000", !1, !5, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = !{!9}
+!9 = !{!"0x24\00int\000\0032\0032\000\000\005", !1, !5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = !{!"0x2e\00foo\00foo\00foo\007\000\001\000\006\000\001\000", !1, null, !11, null, i32 (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 0] [foo]
+!11 = !{!"0x15\00\000\000\000\000\000\000", !1, !5, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!9, !9}
+!13 = !{!"0x100\00bb\005\000", !14, !5, !9} ; [ DW_TAG_auto_variable ]
+!14 = !{!"0xb\005\000\000", !1, !3} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
+!15 = !{!"0x101\00i\007\000", !10, !5, !9} ; [ DW_TAG_arg_variable ]
+!16 = !{!"0x34\00abcd\00abcd\00\002\001\001", !5, !5, !9, null, null} ; [ DW_TAG_variable ]
+!17 = !{!"0x34\00xyz\00xyz\00\003\000\001", !5, !5, !9, i32* @xyz, null} ; [ DW_TAG_variable ]
+!18 = !MDLocation(line: 6, scope: !19)
+!19 = !{!"0xb\006\000\000", !1, !6} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
+!20 = !MDLocation(line: 7, scope: !10)
+!21 = !MDLocation(line: 10, scope: !22)
+!22 = !{!"0xb\007\000\000", !1, !10} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
+!23 = !{!3, !6, !10}
+!24 = !{!16, !17}
+!25 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/Transforms/StructurizeCFG/nested-loop-order.ll b/test/Transforms/StructurizeCFG/nested-loop-order.ll
new file mode 100644
index 0000000..fee1ff0
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/nested-loop-order.ll
@@ -0,0 +1,79 @@
+; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
+
+define void @main(float addrspace(1)* %out) {
+
+; CHECK: main_body:
+; CHECK: br label %LOOP.outer
+main_body:
+  br label %LOOP.outer
+
+; CHECK: LOOP.outer:
+; CHECK: br label %LOOP
+LOOP.outer:                                       ; preds = %ENDIF28, %main_body
+  %temp8.0.ph = phi float [ 0.000000e+00, %main_body ], [ %tmp35, %ENDIF28 ]
+  %temp4.0.ph = phi i32 [ 0, %main_body ], [ %tmp20, %ENDIF28 ]
+  br label %LOOP
+
+; CHECK: LOOP:
+; br i1 %{{[0-9]+}}, label %ENDIF, label %Flow
+LOOP:                                             ; preds = %IF29, %LOOP.outer
+  %temp4.0 = phi i32 [ %temp4.0.ph, %LOOP.outer ], [ %tmp20, %IF29 ]
+  %tmp20 = add i32 %temp4.0, 1
+  %tmp22 = icmp sgt i32 %tmp20, 3
+  br i1 %tmp22, label %ENDLOOP, label %ENDIF
+
+; CHECK: Flow3
+; CHECK: br i1 %{{[0-9]+}}, label %ENDLOOP, label %LOOP.outer
+
+; CHECK: ENDLOOP:
+; CHECK: ret void
+ENDLOOP:                                          ; preds = %ENDIF28, %IF29, %LOOP
+  %temp8.1 = phi float [ %temp8.0.ph, %LOOP ], [ %temp8.0.ph, %IF29 ], [ %tmp35, %ENDIF28 ]
+  %tmp23 = icmp eq i32 %tmp20, 3
+  %.45 = select i1 %tmp23, float 0.000000e+00, float 1.000000e+00
+  store float %.45, float addrspace(1)* %out
+  ret void
+
+; CHECK: ENDIF:
+; CHECK: br i1 %tmp31, label %IF29, label %Flow1
+ENDIF:                                            ; preds = %LOOP
+  %tmp31 = icmp sgt i32 %tmp20, 1
+  br i1 %tmp31, label %IF29, label %ENDIF28
+
+; CHECK: Flow:
+; CHECK br i1 %{{[0-9]+}}, label %Flow, label %LOOP
+
+; CHECK: IF29:
+; CHECK: br label %Flow1
+IF29:                                             ; preds = %ENDIF
+  %tmp32 = icmp sgt i32 %tmp20, 2
+  br i1 %tmp32, label %ENDLOOP, label %LOOP
+
+; CHECK: Flow1:
+; CHECK: br label %Flow
+
+; CHECK: Flow2:
+; CHECK: br i1 %{{[0-9]+}}, label %ENDIF28, label %Flow3
+
+; CHECK: ENDIF28:
+; CHECK: br label %Flow3
+ENDIF28:                                          ; preds = %ENDIF
+  %tmp35 = fadd float %temp8.0.ph, 1.0
+  %tmp36 = icmp sgt i32 %tmp20, 2
+  br i1 %tmp36, label %ENDLOOP, label %LOOP.outer
+}
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.clamp.(float, float, float) #2
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readnone }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
diff --git a/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll b/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll
new file mode 100644
index 0000000..668a1e9
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll
@@ -0,0 +1,42 @@
+; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
+
+; CHECK-NOT: br i1 true
+
+define void @blam(i32 addrspace(1)* nocapture %arg, float %arg1, float %arg2) {
+; CHECK: bb:
+bb:
+  br label %bb3
+
+; CHECK: bb3:
+bb3:                                              ; preds = %bb7, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ]
+  %tmp4 = fcmp ult float %arg1, 3.500000e+00
+; CHECK: %0 = xor i1 %tmp4, true
+; CHECK: br i1 %0, label %bb5, label %Flow
+  br i1 %tmp4, label %bb7, label %bb5
+
+; CHECK: bb5:
+bb5:                                              ; preds = %bb3
+  %tmp6 = fcmp olt float 0.000000e+00, %arg2
+; CHECK: br label %Flow
+  br i1 %tmp6, label %bb10, label %bb7
+
+; CHECK: Flow:
+; CHECK: br i1 %3, label %bb7, label %Flow1
+
+; CHECK: bb7
+bb7:                                              ; preds = %bb5, %bb3
+  %tmp8 = add nuw nsw i64 %tmp, 1
+  %tmp9 = icmp slt i64 %tmp8, 5
+; CHECK: br label %Flow1
+  br i1 %tmp9, label %bb3, label %bb10
+
+; CHECK: Flow1:
+; CHECK: br i1 %7, label %bb10, label %bb3
+
+; CHECK: bb10
+bb10:                                             ; preds = %bb7, %bb5
+  %tmp11 = phi i32 [ 15, %bb5 ], [ 255, %bb7 ]
+  store i32 %tmp11, i32 addrspace(1)* %arg, align 4
+  ret void
+}
diff --git a/test/Transforms/StructurizeCFG/post-order-traversal-bug.ll b/test/Transforms/StructurizeCFG/post-order-traversal-bug.ll
new file mode 100644
index 0000000..740b3d1
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/post-order-traversal-bug.ll
@@ -0,0 +1,100 @@
+; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
+
+; The structurize cfg pass used to do a post-order traversal to generate a list
+; of ; basic blocks and then operate on the list in reverse.  This led to bugs,
+; because sometimes successors would be visited before their predecessors.
+; The fix for this was to do a reverse post-order traversal which is what the
+; algorithm requires.
+
+; Function Attrs: nounwind
+define void @test(float* nocapture %out, i32 %K1, float* nocapture readonly %nr) {
+
+; CHECK: entry:
+; CHECK: br label %for.body
+entry:
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: br i1 %{{[0-9]+}}, label %lor.lhs.false, label %Flow
+for.body:                                         ; preds = %for.body.backedge, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.be, %for.body.backedge ], [ 1, %entry ]
+  %best_val.027 = phi float [ %best_val.027.be, %for.body.backedge ], [ 5.000000e+01, %entry ]
+  %prev_start.026 = phi i32 [ %tmp26, %for.body.backedge ], [ 0, %entry ]
+  %best_count.025 = phi i32 [ %best_count.025.be, %for.body.backedge ], [ 0, %entry ]
+  %tmp0 = trunc i64 %indvars.iv to i32
+  %cmp1 = icmp eq i32 %tmp0, %K1
+  br i1 %cmp1, label %if.then, label %lor.lhs.false
+
+; CHECK: lor.lhs.false:
+; CHECK: br label %Flow
+lor.lhs.false:                                    ; preds = %for.body
+  %arrayidx = getelementptr inbounds float* %nr, i64 %indvars.iv
+  %tmp1 = load float* %arrayidx, align 4
+  %tmp2 = add nsw i64 %indvars.iv, -1
+  %arrayidx2 = getelementptr inbounds float* %nr, i64 %tmp2
+  %tmp3 = load float* %arrayidx2, align 4
+  %cmp3 = fcmp une float %tmp1, %tmp3
+  br i1 %cmp3, label %if.then, label %for.body.1
+
+; CHECK: Flow:
+; CHECK: br i1 %{{[0-9]+}}, label %if.then, label %Flow1
+
+; CHECK: if.then:
+; CHECK: br label %Flow1
+if.then:                                          ; preds = %lor.lhs.false, %for.body
+  %sub4 = sub nsw i32 %tmp0, %prev_start.026
+  %tmp4 = add nsw i64 %indvars.iv, -1
+  %arrayidx8 = getelementptr inbounds float* %nr, i64 %tmp4
+  %tmp5 = load float* %arrayidx8, align 4
+  br i1 %cmp1, label %for.end, label %for.body.1
+
+; CHECK: for.end:
+; CHECK: ret void
+for.end:                                          ; preds = %for.body.1, %if.then
+  %best_val.0.lcssa = phi float [ %best_val.233, %for.body.1 ], [ %tmp5, %if.then ]
+  store float %best_val.0.lcssa, float* %out, align 4
+  ret void
+
+; CHECK: Flow1
+; CHECK: br i1 %{{[0-9]}}, label %for.body.1, label %Flow2
+
+; CHECK: for.body.1:
+; CHECK: br i1 %{{[0-9]+}}, label %for.body.6, label %Flow3
+for.body.1:                                       ; preds = %if.then, %lor.lhs.false
+  %best_val.233 = phi float [ %tmp5, %if.then ], [ %best_val.027, %lor.lhs.false ]
+  %best_count.231 = phi i32 [ %sub4, %if.then ], [ %best_count.025, %lor.lhs.false ]
+  %indvars.iv.next.454 = add nsw i64 %indvars.iv, 5
+  %tmp22 = trunc i64 %indvars.iv.next.454 to i32
+  %cmp1.5 = icmp eq i32 %tmp22, %K1
+  br i1 %cmp1.5, label %for.end, label %for.body.6
+
+; CHECK: Flow2:
+; CHECK: br i1 %{{[0-9]+}}, label %for.end, label %for.body
+
+; CHECK: for.body.6:
+; CHECK: br i1 %cmp5.6, label %if.then6.6, label %for.body.backedge
+for.body.6:                                       ; preds = %for.body.1
+  %indvars.iv.next.559 = add nsw i64 %indvars.iv, 6
+  %tmp26 = trunc i64 %indvars.iv.next.559 to i32
+  %sub4.6 = sub nsw i32 %tmp26, %tmp22
+  %cmp5.6 = icmp slt i32 %best_count.231, %sub4.6
+  br i1 %cmp5.6, label %if.then6.6, label %for.body.backedge
+
+; CHECK: if.then6.6
+; CHECK: br label %for.body.backedge
+if.then6.6:                                       ; preds = %for.body.6
+  %arrayidx8.6 = getelementptr inbounds float* %nr, i64 %indvars.iv.next.454
+  %tmp29 = load float* %arrayidx8.6, align 4
+  br label %for.body.backedge
+
+; CHECK: Flow3:
+; CHECK: br label %Flow2
+
+; CHECK: for.body.backedge:
+; CHECK: br label %Flow3
+for.body.backedge:                                ; preds = %if.then6.6, %for.body.6
+  %best_val.027.be = phi float [ %tmp29, %if.then6.6 ], [ %best_val.233, %for.body.6 ]
+  %best_count.025.be = phi i32 [ %sub4.6, %if.then6.6 ], [ %best_count.231, %for.body.6 ]
+  %indvars.iv.be = add nsw i64 %indvars.iv, 7
+  br label %for.body
+}
diff --git a/test/Transforms/Util/combine-alias-scope-metadata.ll b/test/Transforms/Util/combine-alias-scope-metadata.ll
new file mode 100644
index 0000000..fd0a3d5
--- /dev/null
+++ b/test/Transforms/Util/combine-alias-scope-metadata.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -S -basicaa -memcpyopt | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @test(i8* noalias dereferenceable(1) %in, i8* noalias dereferenceable(1) %out) {
+  %tmp = alloca i8
+  %tmp2 = alloca i8
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 1, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* %in, i64 1, i32 8, i1 false), !alias.scope !4
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp2, i8* %tmp, i64 1, i32 8, i1 false), !alias.scope !5
+
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %tmp2, i64 1, i32 8, i1 false), !noalias !6
+
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
+
+!0 = !{!0}
+!1 = distinct !{!1, !0, !"in"}
+!2 = distinct !{!2, !0, !"tmp"}
+!3 = distinct !{!3, !0, !"tmp2"}
+!4 = distinct !{!1, !2}
+!5 = distinct !{!2, !3}
+!6 = distinct !{!1, !2}
diff --git a/test/Transforms/Util/lowerswitch.ll b/test/Transforms/Util/lowerswitch.ll
index 06bd4cc..17c1202 100644
--- a/test/Transforms/Util/lowerswitch.ll
+++ b/test/Transforms/Util/lowerswitch.ll
@@ -1,8 +1,8 @@
 ; RUN: opt -lowerswitch -S < %s | FileCheck %s
 
 ; Test that we don't crash and have a different basic block for each incoming edge.
-define void @test_lower_switch() {
-; CHECK-LABEL: @test_lower_switch
+define void @test0() {
+; CHECK-LABEL: @test0
 ; CHECK: %merge = phi i64 [ 1, %BB3 ], [ 0, %NewDefault ], [ 0, %NodeBlock5 ], [ 0, %LeafBlock1 ]
 BB1:
   switch i32 undef, label %BB2 [
@@ -20,3 +20,35 @@ BB2:
 BB3:
   br label %BB2
 }
+
+; Test switch cases that are merged into a single case during lowerswitch
+; (take 84 and 85 below) - check that the number of incoming phi values match
+; the number of branches.
+define void @test1() {
+; CHECK-LABEL: @test1
+entry:
+  br label %bb1
+
+bb1:
+  switch i32 undef, label %bb1 [
+    i32 84, label %bb3
+    i32 85, label %bb3
+    i32 86, label %bb2
+    i32 78, label %exit
+    i32 99, label %bb3
+  ]
+
+bb2:
+  br label %bb3
+
+bb3:
+; CHECK-LABEL: bb3
+; CHECK: %tmp = phi i32 [ 1, %NodeBlock ], [ 0, %bb2 ], [ 1, %LeafBlock3 ]
+  %tmp = phi i32 [ 1, %bb1 ], [ 0, %bb2 ], [ 1, %bb1 ], [ 1, %bb1 ]
+; CHECK-NEXT: %tmp2 = phi i32 [ 2, %NodeBlock ], [ 5, %bb2 ], [ 2, %LeafBlock3 ]
+  %tmp2 = phi i32 [ 2, %bb1 ], [ 2, %bb1 ], [ 5, %bb2 ], [ 2, %bb1 ]
+  br label %exit
+
+exit:
+  ret void
+}
diff --git a/test/Verifier/2008-03-01-AllocaSized.ll b/test/Verifier/2008-03-01-AllocaSized.ll
index fc12a96..7478334 100644
--- a/test/Verifier/2008-03-01-AllocaSized.ll
+++ b/test/Verifier/2008-03-01-AllocaSized.ll
@@ -1,5 +1,5 @@
 ; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
-; CHECK: Cannot allocate unsized type
+; CHECK: invalid type for alloca
 ; PR2113
 
 define void @test() {
diff --git a/test/Verifier/comdat.ll b/test/Verifier/comdat.ll
index ca47429..dcf67d8 100644
--- a/test/Verifier/comdat.ll
+++ b/test/Verifier/comdat.ll
@@ -1,5 +1,5 @@
 ; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
 
 $v = comdat any
-@v = common global i32 0, comdat $v
+@v = common global i32 0, comdat($v)
 ; CHECK: 'common' global may not be in a Comdat!
diff --git a/test/Verifier/comdat2.ll b/test/Verifier/comdat2.ll
index d78030c..9d892b9 100644
--- a/test/Verifier/comdat2.ll
+++ b/test/Verifier/comdat2.ll
@@ -1,5 +1,5 @@
 ; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
 
 $v = comdat any
-@v = private global i32 0, comdat $v
+@v = private global i32 0, comdat($v)
 ; CHECK: comdat global value has private linkage
diff --git a/test/Verifier/comdat3.ll b/test/Verifier/comdat3.ll
new file mode 100644
index 0000000..28df930
--- /dev/null
+++ b/test/Verifier/comdat3.ll
@@ -0,0 +1,5 @@
+; This used to be invalid, but now it's valid.  Ensure the verifier
+; doesn't reject it.
+; RUN: llvm-as %s -o /dev/null
+
+$v = comdat largest
diff --git a/test/Verifier/fpmath.ll b/test/Verifier/fpmath.ll
index 7002c5c..2689b69 100644
--- a/test/Verifier/fpmath.ll
+++ b/test/Verifier/fpmath.ll
@@ -22,10 +22,10 @@ define void @fpmath1(i32 %i, float %f, <2 x float> %g) {
   ret void
 }
 
-!0 = metadata !{ float 1.0 }
-!1 = metadata !{ }
-!2 = metadata !{ float 1.0, float 1.0 }
-!3 = metadata !{ i32 1 }
-!4 = metadata !{ float -1.0 }
-!5 = metadata !{ float 0.0 }
-!6 = metadata !{ float 0x7FFFFFFF00000000 }
+!0 = !{ float 1.0 }
+!1 = !{ }
+!2 = !{ float 1.0, float 1.0 }
+!3 = !{ i32 1 }
+!4 = !{ float -1.0 }
+!5 = !{ float 0.0 }
+!6 = !{ float 0x7FFFFFFF00000000 }
diff --git a/test/Verifier/frameallocate.ll b/test/Verifier/frameallocate.ll
new file mode 100644
index 0000000..e3018db
--- /dev/null
+++ b/test/Verifier/frameallocate.ll
@@ -0,0 +1,48 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+declare i8* @llvm.frameallocate(i32)
+declare i8* @llvm.framerecover(i8*, i8*)
+
+define internal void @f() {
+  call i8* @llvm.frameallocate(i32 4)
+  call i8* @llvm.frameallocate(i32 4)
+  ret void
+}
+; CHECK: multiple calls to llvm.frameallocate in one function
+
+define internal void @f_a(i32 %n) {
+  call i8* @llvm.frameallocate(i32 %n)
+  ret void
+}
+; CHECK: llvm.frameallocate argument must be constant integer size
+
+define internal void @g() {
+entry:
+  br label %not_entry
+not_entry:
+  call i8* @llvm.frameallocate(i32 4)
+  ret void
+}
+; CHECK: llvm.frameallocate used outside of entry block
+
+define internal void @h() {
+  call i8* @llvm.framerecover(i8* null, i8* null)
+  ret void
+}
+; CHECK: llvm.framerecover first argument must be function defined in this module
+
+@global = constant i8 0
+
+declare void @declaration()
+
+define internal void @i() {
+  call i8* @llvm.framerecover(i8* @global, i8* null)
+  ret void
+}
+; CHECK: llvm.framerecover first argument must be function defined in this module
+
+define internal void @j() {
+  call i8* @llvm.framerecover(i8* bitcast(void()* @declaration to i8*), i8* null)
+  ret void
+}
+; CHECK: llvm.framerecover first argument must be function defined in this module
diff --git a/test/Verifier/ident-meta1.ll b/test/Verifier/ident-meta1.ll
index fb247a8..3202fbd 100644
--- a/test/Verifier/ident-meta1.ll
+++ b/test/Verifier/ident-meta1.ll
@@ -4,9 +4,9 @@
 ; Each metadata entry can have only one string.
 
 !llvm.ident = !{!0, !1}
-!0 = metadata !{metadata !"version string"}
-!1 = metadata !{metadata !"string1", metadata !"string2"}
+!0 = !{!"version string"}
+!1 = !{!"string1", !"string2"}
 ; CHECK: assembly parsed, but does not verify as correct!
 ; CHECK-NEXT: incorrect number of operands in llvm.ident metadata
-; CHECK-NEXT: metadata !1
+; CHECK-NEXT: !1
 
diff --git a/test/Verifier/ident-meta2.ll b/test/Verifier/ident-meta2.ll
index e86f18a..1c11e1b 100644
--- a/test/Verifier/ident-meta2.ll
+++ b/test/Verifier/ident-meta2.ll
@@ -4,10 +4,10 @@
 ; Each metadata entry can contain one string only.
 
 !llvm.ident = !{!0, !1, !2, !3}
-!0 = metadata !{metadata !"str1"}
-!1 = metadata !{metadata !"str2"}
-!2 = metadata !{metadata !"str3"}
-!3 = metadata !{i32 1}
+!0 = !{!"str1"}
+!1 = !{!"str2"}
+!2 = !{!"str3"}
+!3 = !{i32 1}
 ; CHECK: assembly parsed, but does not verify as correct!
 ; CHECK-NEXT: invalid value for llvm.ident metadata entry operand(the operand should be a string)
 ; CHECK-NEXT: i32 1
diff --git a/test/Verifier/ident-meta3.ll b/test/Verifier/ident-meta3.ll
index a847b46..91e1cfe 100644
--- a/test/Verifier/ident-meta3.ll
+++ b/test/Verifier/ident-meta3.ll
@@ -4,7 +4,7 @@
 ; Each metadata entry can contain one string only.
 
 !llvm.ident = !{!0}
-!0 = metadata !{metadata !{metadata !"nested metadata"}}
+!0 = !{!{!"nested metadata"}}
 ; CHECK: assembly parsed, but does not verify as correct!
 ; CHECK-NEXT: invalid value for llvm.ident metadata entry operand(the operand should be a string)
-; CHECK-NEXT: metadata !1
+; CHECK-NEXT: !1
diff --git a/test/Verifier/ident-meta4.ll b/test/Verifier/ident-meta4.ll
new file mode 100644
index 0000000..f44dbd5
--- /dev/null
+++ b/test/Verifier/ident-meta4.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+; Verify that llvm.ident is properly structured.
+; llvm.ident takes a list of metadata entries.
+; Each metadata entry can contain one string only.
+
+!llvm.ident = !{!0}
+!0 = !{null}
+; CHECK: assembly parsed, but does not verify as correct!
+; CHECK-NEXT: invalid value for llvm.ident metadata entry operand(the operand should be a string)
diff --git a/test/Verifier/module-flags-1.ll b/test/Verifier/module-flags-1.ll
index e5feaf3..36bcb33 100644
--- a/test/Verifier/module-flags-1.ll
+++ b/test/Verifier/module-flags-1.ll
@@ -3,57 +3,54 @@
 ; Check that module flags are structurally correct.
 ;
 ; CHECK: incorrect number of operands in module flag
-; CHECK: metadata !0
-!0 = metadata !{ i32 1 }
+; CHECK: !0
+!0 = !{i32 1}
 ; CHECK: invalid behavior operand in module flag (expected constant integer)
-; CHECK: metadata !"foo"
-!1 = metadata !{ metadata !"foo", metadata !"foo", i32 42 }
+; CHECK: !"foo"
+!1 = !{!"foo", !"foo", i32 42}
 ; CHECK: invalid behavior operand in module flag (unexpected constant)
 ; CHECK: i32 999
-!2 = metadata !{ i32 999, metadata !"foo", i32 43 }
+!2 = !{i32 999, !"foo", i32 43}
 ; CHECK: invalid ID operand in module flag (expected metadata string)
 ; CHECK: i32 1
-!3 = metadata !{ i32 1, i32 1, i32 44 }
+!3 = !{i32 1, i32 1, i32 44}
 ; CHECK: invalid value for 'require' module flag (expected metadata pair)
 ; CHECK: i32 45
-!4 = metadata !{ i32 3, metadata !"bla", i32 45 }
+!4 = !{i32 3, !"bla", i32 45}
 ; CHECK: invalid value for 'require' module flag (expected metadata pair)
-; CHECK: metadata !
-!5 = metadata !{ i32 3, metadata !"bla", metadata !{ i32 46 } }
+; CHECK: !
+!5 = !{i32 3, !"bla", !{i32 46}}
 ; CHECK: invalid value for 'require' module flag (first value operand should be a string)
 ; CHECK: i32 47
-!6 = metadata !{ i32 3, metadata !"bla", metadata !{ i32 47, i32 48 } }
+!6 = !{i32 3, !"bla", !{i32 47, i32 48}}
 
 ; Check that module flags only have unique IDs.
 ;
 ; CHECK: module flag identifiers must be unique (or of 'require' type)
-!7 = metadata !{ i32 1, metadata !"foo", i32 49 }
-!8 = metadata !{ i32 2, metadata !"foo", i32 50 }
+!7 = !{i32 1, !"foo", i32 49}
+!8 = !{i32 2, !"foo", i32 50}
 ; CHECK-NOT: module flag identifiers must be unique
-!9 = metadata !{ i32 2, metadata !"bar", i32 51 }
-!10 = metadata !{ i32 3, metadata !"bar", metadata !{ metadata !"bar", i32 51 } }
+!9 = !{i32 2, !"bar", i32 51}
+!10 = !{i32 3, !"bar", !{!"bar", i32 51}}
 
 ; Check that any 'append'-type module flags are valid.
 ; CHECK: invalid value for 'append'-type module flag (expected a metadata node)
-!16 = metadata !{ i32 5, metadata !"flag-2", i32 56 }
+!16 = !{i32 5, !"flag-2", i32 56}
 ; CHECK: invalid value for 'append'-type module flag (expected a metadata node)
-!17 = metadata !{ i32 5, metadata !"flag-3", i32 57 }
+!17 = !{i32 5, !"flag-3", i32 57}
 ; CHECK-NOT: invalid value for 'append'-type module flag (expected a metadata node)
-!18 = metadata !{ i32 5, metadata !"flag-4", metadata !{ i32 57 } }
+!18 = !{i32 5, !"flag-4", !{i32 57}}
 
 ; Check that any 'require' module flags are valid.
 ; CHECK: invalid requirement on flag, flag is not present in module
-!11 = metadata !{ i32 3, metadata !"bar",
-     metadata !{ metadata !"no-such-flag", i32 52 } }
+!11 = !{i32 3, !"bar", !{!"no-such-flag", i32 52}}
 ; CHECK: invalid requirement on flag, flag does not have the required value
-!12 = metadata !{ i32 1, metadata !"flag-0", i32 53 }
-!13 = metadata !{ i32 3, metadata !"bar",
-     metadata !{ metadata !"flag-0", i32 54 } }
+!12 = !{i32 1, !"flag-0", i32 53}
+!13 = !{i32 3, !"bar", !{!"flag-0", i32 54}}
 ; CHECK-NOT: invalid requirement on flag, flag is not present in module
 ; CHECK-NOT: invalid requirement on flag, flag does not have the required value
-!14 = metadata !{ i32 1, metadata !"flag-1", i32 55 }
-!15 = metadata !{ i32 3, metadata !"bar",
-     metadata !{ metadata !"flag-1", i32 55 } }
+!14 = !{i32 1, !"flag-1", i32 55}
+!15 = !{i32 3, !"bar", !{!"flag-1", i32 55}}
 
 !llvm.module.flags = !{
   !0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15,
diff --git a/test/Verifier/module-flags-2.ll b/test/Verifier/module-flags-2.ll
new file mode 100644
index 0000000..8582162
--- /dev/null
+++ b/test/Verifier/module-flags-2.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+!llvm.module.flags = !{!0}
+!0 = !{null, null, null}
+
+; CHECK: invalid behavior operand in module flag (expected constant integer)
diff --git a/test/Verifier/module-flags-3.ll b/test/Verifier/module-flags-3.ll
new file mode 100644
index 0000000..64ab57e
--- /dev/null
+++ b/test/Verifier/module-flags-3.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, null, null}
+
+; CHECK: invalid ID operand in module flag (expected metadata string)
diff --git a/test/Verifier/range-1.ll b/test/Verifier/range-1.ll
index 0b20ca2..fda65cb 100644
--- a/test/Verifier/range-1.ll
+++ b/test/Verifier/range-1.ll
@@ -5,7 +5,7 @@ entry:
   store i8 0, i8* %x, align 1, !range !0
   ret void
 }
-!0 = metadata !{i8 0, i8 1}
+!0 = !{i8 0, i8 1}
 ; CHECK: Ranges are only for loads, calls and invokes!
 ; CHECK-NEXT: store i8 0, i8* %x, align 1, !range !0
 
@@ -14,16 +14,15 @@ entry:
   %y = load i8* %x, align 1, !range !1
   ret i8 %y
 }
-!1 = metadata !{}
+!1 = !{}
 ; CHECK: It should have at least one range!
-; CHECK-NEXT: metadata
 
 define i8 @f3(i8* %x) {
 entry:
   %y = load i8* %x, align 1, !range !2
   ret i8 %y
 }
-!2 = metadata !{i8 0}
+!2 = !{i8 0}
 ; CHECK: Unfinished range!
 
 define i8 @f4(i8* %x) {
@@ -31,7 +30,7 @@ entry:
   %y = load i8* %x, align 1, !range !3
   ret i8 %y
 }
-!3 = metadata !{double 0.0, i8 0}
+!3 = !{double 0.0, i8 0}
 ; CHECK: The lower limit must be an integer!
 
 define i8 @f5(i8* %x) {
@@ -39,7 +38,7 @@ entry:
   %y = load i8* %x, align 1, !range !4
   ret i8 %y
 }
-!4 = metadata !{i8 0, double 0.0}
+!4 = !{i8 0, double 0.0}
 ; CHECK: The upper limit must be an integer!
 
 define i8 @f6(i8* %x) {
@@ -47,7 +46,7 @@ entry:
   %y = load i8* %x, align 1, !range !5
   ret i8 %y
 }
-!5 = metadata !{i32 0, i8 0}
+!5 = !{i32 0, i8 0}
 ; CHECK: Range types must match instruction type!
 ; CHECK:  %y = load
 
@@ -56,7 +55,7 @@ entry:
   %y = load i8* %x, align 1, !range !6
   ret i8 %y
 }
-!6 = metadata !{i8 0, i32 0}
+!6 = !{i8 0, i32 0}
 ; CHECK: Range types must match instruction type!
 ; CHECK:  %y = load
 
@@ -65,7 +64,7 @@ entry:
   %y = load i8* %x, align 1, !range !7
   ret i8 %y
 }
-!7 = metadata !{i32 0, i32 0}
+!7 = !{i32 0, i32 0}
 ; CHECK: Range types must match instruction type!
 ; CHECK:  %y = load
 
@@ -74,7 +73,7 @@ entry:
   %y = load i8* %x, align 1, !range !8
   ret i8 %y
 }
-!8 = metadata !{i8 0, i8 0}
+!8 = !{i8 0, i8 0}
 ; CHECK: Range must not be empty!
 
 define i8 @f10(i8* %x) {
@@ -82,7 +81,7 @@ entry:
   %y = load i8* %x, align 1, !range !9
   ret i8 %y
 }
-!9 = metadata !{i8 0, i8 2, i8 1, i8 3}
+!9 = !{i8 0, i8 2, i8 1, i8 3}
 ; CHECK: Intervals are overlapping
 
 define i8 @f11(i8* %x) {
@@ -90,7 +89,7 @@ entry:
   %y = load i8* %x, align 1, !range !10
   ret i8 %y
 }
-!10 = metadata !{i8 0, i8 2, i8 2, i8 3}
+!10 = !{i8 0, i8 2, i8 2, i8 3}
 ; CHECK: Intervals are contiguous
 
 define i8 @f12(i8* %x) {
@@ -98,7 +97,7 @@ entry:
   %y = load i8* %x, align 1, !range !11
   ret i8 %y
 }
-!11 = metadata !{i8 1, i8 2, i8 -1, i8 0}
+!11 = !{i8 1, i8 2, i8 -1, i8 0}
 ; CHECK: Intervals are not in order
 
 define i8 @f13(i8* %x) {
@@ -106,7 +105,7 @@ entry:
   %y = load i8* %x, align 1, !range !12
   ret i8 %y
 }
-!12 = metadata !{i8 1, i8 3, i8 5, i8 1}
+!12 = !{i8 1, i8 3, i8 5, i8 1}
 ; CHECK: Intervals are contiguous
 
 define i8 @f14(i8* %x) {
@@ -114,7 +113,7 @@ entry:
   %y = load i8* %x, align 1, !range !13
   ret i8 %y
 }
-!13 = metadata !{i8 1, i8 3, i8 5, i8 2}
+!13 = !{i8 1, i8 3, i8 5, i8 2}
 ; CHECK: Intervals are overlapping
 
 define i8 @f15(i8* %x) {
@@ -122,7 +121,7 @@ entry:
   %y = load i8* %x, align 1, !range !14
   ret i8 %y
 }
-!14 = metadata !{i8 10, i8 1, i8 12, i8 13}
+!14 = !{i8 10, i8 1, i8 12, i8 13}
 ; CHECK: Intervals are overlapping
 
 define i8 @f16(i8* %x) {
@@ -130,7 +129,7 @@ entry:
   %y = load i8* %x, align 1, !range !16
   ret i8 %y
 }
-!16 = metadata !{i8 1, i8 3, i8 4, i8 5, i8 6, i8 2}
+!16 = !{i8 1, i8 3, i8 4, i8 5, i8 6, i8 2}
 ; CHECK: Intervals are overlapping
 
 define i8 @f17(i8* %x) {
@@ -138,7 +137,7 @@ entry:
   %y = load i8* %x, align 1, !range !17
   ret i8 %y
 }
-!17 = metadata !{i8 1, i8 3, i8 4, i8 5, i8 6, i8 1}
+!17 = !{i8 1, i8 3, i8 4, i8 5, i8 6, i8 1}
 ; CHECK: Intervals are contiguous
 
 define i8 @f18() {
@@ -146,6 +145,5 @@ entry:
   %y = call i8 undef(), !range !18
   ret i8 %y
 }
-!18 = metadata !{}
+!18 = !{}
 ; CHECK: It should have at least one range!
-; CHECK-NEXT: metadata
diff --git a/test/Verifier/range-2.ll b/test/Verifier/range-2.ll
index 1d2e057..f8891c8 100644
--- a/test/Verifier/range-2.ll
+++ b/test/Verifier/range-2.ll
@@ -5,35 +5,35 @@ entry:
   %y = load i8* %x, align 1, !range !0
   ret i8 %y
 }
-!0 = metadata !{i8 0, i8 1}
+!0 = !{i8 0, i8 1}
 
 define i8 @f2(i8* %x) {
 entry:
   %y = load i8* %x, align 1, !range !1
   ret i8 %y
 }
-!1 = metadata !{i8 255, i8 1}
+!1 = !{i8 255, i8 1}
 
 define i8 @f3(i8* %x) {
 entry:
   %y = load i8* %x, align 1, !range !2
   ret i8 %y
 }
-!2 = metadata !{i8 1, i8 3, i8 5, i8 42}
+!2 = !{i8 1, i8 3, i8 5, i8 42}
 
 define i8 @f4(i8* %x) {
 entry:
   %y = load i8* %x, align 1, !range !3
   ret i8 %y
 }
-!3 = metadata !{i8 -1, i8 0, i8 1, i8 2}
+!3 = !{i8 -1, i8 0, i8 1, i8 2}
 
 define i8 @f5(i8* %x) {
 entry:
   %y = load i8* %x, align 1, !range !4
   ret i8 %y
 }
-!4 = metadata !{i8 -1, i8 0, i8 1, i8 -2}
+!4 = !{i8 -1, i8 0, i8 1, i8 -2}
 
 ; We can annotate the range of the return value of a CALL.
 define void @call_all(i8* %x) {
diff --git a/test/Verifier/statepoint.ll b/test/Verifier/statepoint.ll
new file mode 100644
index 0000000..9342309
--- /dev/null
+++ b/test/Verifier/statepoint.ll
@@ -0,0 +1,83 @@
+; RUN: opt -S %s -verify | FileCheck %s
+
+declare void @use(...)
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32, i32, i32)
+declare i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(i32, i32, i32)
+declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()*, i32, i32, ...)
+declare i32 @"personality_function"()
+
+;; Basic usage
+define i64 addrspace(1)* @test1(i8 addrspace(1)* %arg) gc "statepoint-example" {
+entry:
+  %cast = bitcast i8 addrspace(1)* %arg to i64 addrspace(1)*
+  %safepoint_token = call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 10, i32 0, i8 addrspace(1)* %arg, i64 addrspace(1)* %cast, i8 addrspace(1)* %arg, i8 addrspace(1)* %arg)
+  %reloc = call i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(i32 %safepoint_token, i32 9, i32 10)
+  ;; It is perfectly legal to relocate the same value multiple times...
+  %reloc2 = call i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(i32 %safepoint_token, i32 9, i32 10)
+  %reloc3 = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32 %safepoint_token, i32 10, i32 9)
+  ret i64 addrspace(1)* %reloc
+; CHECK-LABEL: test1
+; CHECK: statepoint
+; CHECK: gc.relocate
+; CHECK: gc.relocate
+; CHECK: gc.relocate
+; CHECK: ret i64 addrspace(1)* %reloc
+}
+
+; This test catches two cases where the verifier was too strict:
+; 1) A base doesn't need to be relocated if it's never used again
+; 2) A value can be replaced by one which is known equal.  This
+; means a potentially derived pointer can be known base and that
+; we can't check that derived pointer are never bases.
+define void @test2(i8 addrspace(1)* %arg, i64 addrspace(1)* %arg2) gc "statepoint-example" {
+entry:
+  %cast = bitcast i8 addrspace(1)* %arg to i64 addrspace(1)*
+  %c = icmp eq i64 addrspace(1)* %cast,  %arg2
+  br i1 %c, label %equal, label %notequal
+
+notequal:
+  ret void
+
+equal:
+  %safepoint_token = call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 10, i32 0, i8 addrspace(1)* %arg, i64 addrspace(1)* %cast, i8 addrspace(1)* %arg, i8 addrspace(1)* %arg)
+  %reloc = call i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(i32 %safepoint_token, i32 9, i32 10)
+  call void undef(i64 addrspace(1)* %reloc)
+  ret void
+; CHECK-LABEL: test2
+; CHECK-LABEL: equal
+; CHECK: statepoint
+; CHECK-NEXT: %reloc = call 
+; CHECK-NEXT: call
+; CHECK-NEXT: ret voi
+}
+
+; Basic test for invoke statepoints
+define i8 addrspace(1)* @test3(i8 addrspace(1)* %obj, i8 addrspace(1)* %obj1) gc "statepoint-example" {
+; CHECK-LABEL: test3
+entry:
+  ; CHECK-LABEL: entry
+  ; CHECK: statepoint
+  %0 = invoke i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i8 addrspace(1)* %obj, i8 addrspace(1)* %obj1)
+          to label %normal_dest unwind label %exceptional_return
+
+normal_dest:
+  ; CHECK-LABEL: normal_dest:
+  ; CHECK: gc.relocate
+  ; CHECK: gc.relocate
+  ; CHECK: ret
+  %obj.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32 %0, i32 9, i32 9)
+  %obj1.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32 %0, i32 10, i32 10)
+  ret i8 addrspace(1)* %obj.relocated
+
+exceptional_return:
+  ; CHECK-LABEL: exceptional_return
+  ; CHECK: gc.relocate
+  ; CHECK: gc.relocate
+  %landing_pad = landingpad { i8*, i32 } personality i32 ()* @"personality_function"
+          cleanup
+  %relocate_token = extractvalue { i8*, i32 } %landing_pad, 1
+  %obj.relocated1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32 %relocate_token, i32 9, i32 9)
+  %obj1.relocated1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32 %relocate_token, i32 10, i32 10)
+  ret i8 addrspace(1)* %obj1.relocated1
+}
+
diff --git a/test/lit.cfg b/test/lit.cfg
index 372e091..7b7a269 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -176,7 +176,7 @@ lli = 'lli'
 # we don't support COFF in MCJIT well enough for the tests, force ELF format on
 # Windows.  FIXME: the process target triple should be used here, but this is
 # difficult to obtain on Windows.
-if re.search(r'cygwin|mingw32|win32', config.host_triple):
+if re.search(r'cygwin|mingw32|windows-gnu|win32', config.host_triple):
   lli += ' -mtriple='+config.host_triple+'-elf'
 config.substitutions.append( ('%lli', lli ) )
 
@@ -187,6 +187,7 @@ if re.search(r'win32', config.target_triple):
 config.substitutions.append( ('%llc_dwarf', llc_dwarf) )
 
 # Add site-specific substitutions.
+config.substitutions.append( ('%gold', config.gold_executable) )
 config.substitutions.append( ('%go', config.go_executable) )
 config.substitutions.append( ('%llvmshlibdir', config.llvm_shlib_dir) )
 config.substitutions.append( ('%shlibext', config.llvm_shlib_ext) )
@@ -196,7 +197,8 @@ config.substitutions.append( ('%python', config.python_executable) )
 # OCaml substitutions.
 # Support tests for both native and bytecode builds.
 config.substitutions.append( ('%ocamlc',
-    "%s ocamlc %s" % (config.ocamlfind_executable, config.ocaml_flags)) )
+    "%s ocamlc -cclib -L%s %s" %
+        (config.ocamlfind_executable, llvm_lib_dir, config.ocaml_flags)) )
 if config.have_ocamlopt in ('1', 'TRUE'):
     config.substitutions.append( ('%ocamlopt',
         "%s ocamlopt -cclib -L%s -cclib -Wl,-rpath,%s %s" %
@@ -228,6 +230,7 @@ for pattern in [r"\bbugpoint\b(?!-)",
                 r"\bllvm-cov\b",
                 r"\bllvm-diff\b",
                 r"\bllvm-dis\b",
+                r"\bllvm-dsymutil\b",
                 r"\bllvm-dwarfdump\b",
                 r"\bllvm-extract\b",
                 r"\bllvm-go\b",
@@ -268,6 +271,10 @@ for pattern in [r"\bbugpoint\b(?!-)",
         # Warn, but still provide a substitution.
         lit_config.note('Did not find ' + tool_name + ' in ' + llvm_tools_dir)
         tool_path = llvm_tools_dir + '/' + tool_name
+    if (tool_name == "llc" and
+       'LLVM_ENABLE_MACHINE_VERIFIER' in os.environ and
+       os.environ['LLVM_ENABLE_MACHINE_VERIFIER'] == "1"):
+        tool_path += " -verify-machineinstrs"
     config.substitutions.append((pattern, tool_pipe + tool_path))
 
 ### Targets
@@ -307,7 +314,7 @@ else:
 
 # Direct object generation
 # Suppress x86_64-mingw32 while investigating since r219108.
-if not 'hexagon' in config.target_triple and not re.match(r'^x86_64.*-(mingw32|win32)', config.target_triple):
+if not 'hexagon' in config.target_triple and not re.match(r'^x86_64.*-(mingw32|windows-gnu|win32)', config.target_triple):
     config.available_features.add("object-emission")
 
 if config.have_zlib == "1":
@@ -327,8 +334,8 @@ def have_ld_plugin_support():
     if not os.path.exists(os.path.join(config.llvm_shlib_dir, 'LLVMgold.so')):
         return False
 
-    ld_cmd = subprocess.Popen(['ld', '--help'], stdout = subprocess.PIPE)
-    ld_out = ld_cmd.stdout.read()
+    ld_cmd = subprocess.Popen([config.gold_executable, '--help'], stdout = subprocess.PIPE)
+    ld_out = ld_cmd.stdout.read().decode()
     ld_cmd.wait()
 
     if not '-plugin' in ld_out:
@@ -346,8 +353,8 @@ def have_ld_plugin_support():
     if 'elf32ppc' not in emulations or 'elf_x86_64' not in emulations:
         return False
 
-    ld_version = subprocess.Popen(['ld', '--version'], stdout = subprocess.PIPE)
-    if not 'GNU gold' in ld_version.stdout.read():
+    ld_version = subprocess.Popen([config.gold_executable, '--version'], stdout = subprocess.PIPE)
+    if not 'GNU gold' in ld_version.stdout.read().decode():
         return False
     ld_version.wait()
 
@@ -382,7 +389,7 @@ if 'darwin' == sys.platform:
     sysctl_cmd.wait()
 
 # .debug_frame is not emitted for targeting Windows x64.
-if not re.match(r'^x86_64.*-(mingw32|win32)', config.target_triple):
+if not re.match(r'^x86_64.*-(mingw32|windows-gnu|win32)', config.target_triple):
     config.available_features.add('debug_frame')
 
 # Check if we should use gmalloc.
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 7d2c833..1c19fd1 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -7,12 +7,13 @@ config.target_triple = "@TARGET_TRIPLE@"
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
-config.llvm_lib_dir = "@LIBDIR@"
+config.llvm_lib_dir = "@LLVM_LIBRARY_DIR@"
 config.llvm_shlib_dir = "@SHLIBDIR@"
 config.llvm_shlib_ext = "@SHLIBEXT@"
 config.llvm_exe_ext = "@EXEEXT@"
 config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
 config.python_executable = "@PYTHON_EXECUTABLE@"
+config.gold_executable = "@GOLD_EXECUTABLE@"
 config.ocamlfind_executable = "@OCAMLFIND@"
 config.have_ocamlopt = "@HAVE_OCAMLOPT@"
 config.have_ocaml_ounit = "@HAVE_OCAML_OUNIT@"
@@ -30,6 +31,7 @@ config.host_ldflags = "@HOST_LDFLAGS@"
 config.llvm_use_intel_jitevents = "@LLVM_USE_INTEL_JITEVENTS@"
 config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
 config.have_zlib = "@HAVE_LIBZ@"
+config.have_dia_sdk = @HAVE_DIA_SDK@
 config.enable_ffi = "@LLVM_ENABLE_FFI@"
 
 # Support substitution of the tools_dir with user parameters. This is
diff --git a/test/tools/dsymutil/Inputs/basic-archive.macho.x86_64 b/test/tools/dsymutil/Inputs/basic-archive.macho.x86_64
new file mode 100755
index 0000000..abffb06
--- /dev/null
+++ b/test/tools/dsymutil/Inputs/basic-archive.macho.x86_64
diff --git a/test/tools/dsymutil/Inputs/basic-lto.macho.x86_64 b/test/tools/dsymutil/Inputs/basic-lto.macho.x86_64
new file mode 100755
index 0000000..b5ffb03
--- /dev/null
+++ b/test/tools/dsymutil/Inputs/basic-lto.macho.x86_64
diff --git a/test/tools/dsymutil/Inputs/basic-lto.macho.x86_64.o b/test/tools/dsymutil/Inputs/basic-lto.macho.x86_64.o
new file mode 100644
index 0000000..c68e15a
--- /dev/null
+++ b/test/tools/dsymutil/Inputs/basic-lto.macho.x86_64.o
diff --git a/test/tools/dsymutil/Inputs/basic.macho.x86_64 b/test/tools/dsymutil/Inputs/basic.macho.x86_64
new file mode 100755
index 0000000..8b3a34a
--- /dev/null
+++ b/test/tools/dsymutil/Inputs/basic.macho.x86_64
diff --git a/test/tools/dsymutil/Inputs/basic1.c b/test/tools/dsymutil/Inputs/basic1.c
new file mode 100644
index 0000000..cedf83a
--- /dev/null
+++ b/test/tools/dsymutil/Inputs/basic1.c
@@ -0,0 +1,28 @@
+/* This is the main file used to produce the basic* objects that are
+   used for the dsymutil tests.
+
+   These are compiled in a couple of different ways (always on a
+   Darwin system):
+   Basic compilation:
+      for FILE in basic1.c basic2.c basic3.c; do
+         clang -g -c $FILE -o ${FILE%.c}.macho.x86_64.o
+      done
+      clang basic1.macho.x86_64.o basic2.macho.x86_64.o basic3.macho.x86_64.o -o basic.macho.x86_64 -Wl,-dead_strip
+
+   LTO compilation:
+      for FILE in basic1.c basic2.c basic3.c; do
+         clang -g -c -flto $FILE -o ${FILE%.c}-lto.o
+      done
+      clang basic1-lto.o basic2-lto.o basic3-lto.o -o basic-lto.macho.x86_64 -Wl,-object_path_lto,$PWD/basic-lto.macho.x86_64.o -Wl,-dead_strip
+      rm basic1-lto.o basic2-lto.o basic3-lto.o
+
+   Archive compilation (after basic compilation):
+      ar -q libbasic.a basic2.macho.x86_64.o basic3.macho.x86_64.o
+      clang basic1.macho.x86_64.o -lbasic -o basic-archive.macho.x86_64 -Wl,-dead_strip -L.
+*/
+
+int foo(int);
+
+int main(int argc, const char *argv[]) {
+  return foo(argc);
+}
diff --git a/test/tools/dsymutil/Inputs/basic1.macho.x86_64.o b/test/tools/dsymutil/Inputs/basic1.macho.x86_64.o
new file mode 100644
index 0000000..d7b5000
--- /dev/null
+++ b/test/tools/dsymutil/Inputs/basic1.macho.x86_64.o
diff --git a/test/tools/dsymutil/Inputs/basic2.c b/test/tools/dsymutil/Inputs/basic2.c
new file mode 100644
index 0000000..13c6d07
--- /dev/null
+++ b/test/tools/dsymutil/Inputs/basic2.c
@@ -0,0 +1,22 @@
+/* For compilation instructions see basic1.c. */
+
+static int baz = 42;
+static int private_int;
+extern volatile int val;
+int unused_data = 1;
+
+int bar(int);
+
+void unused1() {
+  bar(baz);
+}
+
+static int inc() {
+  return ++private_int;
+}
+
+__attribute__((noinline))
+int foo(int arg) {
+  return bar(arg+val) + inc() + baz++;
+}
+
diff --git a/test/tools/dsymutil/Inputs/basic2.macho.x86_64.o b/test/tools/dsymutil/Inputs/basic2.macho.x86_64.o
new file mode 100644
index 0000000..bdd8225
--- /dev/null
+++ b/test/tools/dsymutil/Inputs/basic2.macho.x86_64.o
diff --git a/test/tools/dsymutil/Inputs/basic3.c b/test/tools/dsymutil/Inputs/basic3.c
new file mode 100644
index 0000000..f20998a
--- /dev/null
+++ b/test/tools/dsymutil/Inputs/basic3.c
@@ -0,0 +1,20 @@
+/* For compilation instructions see basic1.c. */
+
+volatile int val;
+
+extern int foo(int);
+
+int unused2() {
+  return foo(val);
+}
+
+static int inc() {
+  return ++val;
+}
+
+__attribute__((noinline))
+int bar(int arg) {
+  if (arg > 42)
+    return inc();
+  return foo(val + arg);
+}
diff --git a/test/tools/dsymutil/Inputs/basic3.macho.x86_64.o b/test/tools/dsymutil/Inputs/basic3.macho.x86_64.o
new file mode 100644
index 0000000..3c1c639
--- /dev/null
+++ b/test/tools/dsymutil/Inputs/basic3.macho.x86_64.o
diff --git a/test/tools/dsymutil/Inputs/libbasic.a b/test/tools/dsymutil/Inputs/libbasic.a
new file mode 100644
index 0000000..9657e78
--- /dev/null
+++ b/test/tools/dsymutil/Inputs/libbasic.a
diff --git a/test/tools/dsymutil/basic-linking.test b/test/tools/dsymutil/basic-linking.test
new file mode 100644
index 0000000..5de6c13
--- /dev/null
+++ b/test/tools/dsymutil/basic-linking.test
@@ -0,0 +1,149 @@
+RUN: llvm-dsymutil -v -oso-prepend-path=%p %p/Inputs/basic.macho.x86_64 | FileCheck %s
+RUN: llvm-dsymutil -v -oso-prepend-path=%p %p/Inputs/basic-lto.macho.x86_64 | FileCheck %s --check-prefix=CHECK-LTO
+RUN: llvm-dsymutil -v -oso-prepend-path=%p %p/Inputs/basic-archive.macho.x86_64 | FileCheck %s --check-prefix=CHECK-ARCHIVE
+
+This test check the basic Dwarf linking process through the debug dumps.
+
+================================= Simple link ================================
+CHECK: DEBUG MAP OBJECT: {{.*}}basic1.macho.x86_64.o
+CHECK: Input compilation unit:
+CHECK-NEXT: TAG_compile_unit
+CHECK-NOT: TAG
+CHECK: AT_name {{.*}}basic1.c
+CHECK-NOT: Found valid debug map entry
+CHECK: Found valid debug map entry: _main 	0000000000000000 => 0000000100000ea0
+CHECK-NEXT: DW_TAG_subprogram
+CHECK-NEXT:   DW_AT_name{{.*}}"main"
+
+CHECK: DEBUG MAP OBJECT: {{.*}}basic2.macho.x86_64.o
+CHECK: Input compilation unit:
+CHECK-NEXT: TAG_compile_unit
+CHECK-NOT: TAG
+CHECK: AT_name {{.*}}basic2.c
+CHECK-NOT: Found valid debug map entry
+CHECK: Found valid debug map entry: _private_int 	0000000000000560 => 0000000100001008
+CHECK-NEXT: DW_TAG_variable
+CHECK-NEXT:   DW_AT_name {{.*}}"private_int"
+CHECK-NOT: Found valid debug map entry
+CHECK: Found valid debug map entry: _baz 	0000000000000310 => 0000000100001000
+CHECK-NEXT: DW_TAG_variable
+CHECK-NEXT:   DW_AT_name {{.*}}"baz"
+CHECK-NOT: Found valid debug map entry
+CHECK: Found valid debug map entry: _foo 	0000000000000020 => 0000000100000ed0
+CHECK-NEXT: DW_TAG_subprogram
+CHECK-NEXT:   DW_AT_name {{.*}}"foo"
+CHECK-NOT: Found valid debug map entry
+CHECK: Found valid debug map entry: _inc 	0000000000000070 => 0000000100000f20
+CHECK-NEXT: DW_TAG_subprogram
+CHECK-NEXT:   DW_AT_name {{.*}}"inc"
+
+CHECK: DEBUG MAP OBJECT: {{.*}}basic3.macho.x86_64.o
+CHECK: Input compilation unit:
+CHECK-NEXT: TAG_compile_unit
+CHECK-NOT: TAG
+CHECK: AT_name {{.*}}basic3.c
+CHECK-NOT: Found valid debug map entry
+CHECK: Found valid debug map entry: _val 	0000000000000004 => 0000000100001004
+CHECK-NEXT: DW_TAG_variable
+CHECK-NEXT:   DW_AT_name {{.*}}"val"
+CHECK-NOT: Found valid debug map entry
+CHECK: Found valid debug map entry: _bar 	0000000000000020 => 0000000100000f40
+CHECK-NEXT: DW_TAG_subprogram
+CHECK-NEXT:   DW_AT_name {{.*}}"bar"
+CHECK-NOT: Found valid debug map entry
+CHECK: Found valid debug map entry: _inc 	0000000000000070 => 0000000100000f90
+CHECK-NEXT: DW_TAG_subprogram
+CHECK-NEXT:   DW_AT_name {{.*}}"inc")
+
+
+================================= LTO link ================================
+CHECK-LTO: DEBUG MAP OBJECT: {{.*}}basic-lto.macho.x86_64.o
+CHECK-LTO: Input compilation unit:
+CHECK-LTO-NEXT: TAG_compile_unit
+CHECK-LTO-NOT: TAG
+CHECK-LTO: AT_name {{.*}}basic1.c
+CHECK-LTO: Input compilation unit:
+CHECK-LTO-NEXT: TAG_compile_unit
+CHECK-LTO-NOT: TAG
+CHECK-LTO: AT_name {{.*}}basic2.c
+CHECK-LTO: Input compilation unit:
+CHECK-LTO-NEXT: TAG_compile_unit
+CHECK-LTO-NOT: TAG
+CHECK-LTO: AT_name {{.*}}basic3.c
+
+CHECK-LTO-NOT: Found valid debug map entry
+CHECK-LTO: Found valid debug map entry: _main 	0000000000000000 => 0000000100000f40
+CHECK-LTO-NEXT: DW_TAG_subprogram
+CHECK-LTO-NEXT:   DW_AT_name {{.*}}"main"
+CHECK-LTO-NOT: Found valid debug map entry
+CHECK-LTO: Found valid debug map entry: _private_int 	00000000000008e8 => 0000000100001008
+CHECK-LTO-NEXT: DW_TAG_variable
+CHECK-LTO-NEXT:   DW_AT_name {{.*}}"private_int"
+CHECK-LTO-NOT: Found valid debug map entry
+CHECK-LTO: Found valid debug map entry: _baz 	0000000000000658 => 0000000100001000
+CHECK-LTO-NEXT: DW_TAG_variable
+CHECK-LTO-NEXT:   DW_AT_name {{.*}} "baz"
+CHECK-LTO-NOT: Found valid debug map entry
+CHECK-LTO: Found valid debug map entry: _foo 	0000000000000010 => 0000000100000f50
+CHECK-LTO-NEXT: DW_TAG_subprogram
+CHECK-LTO-NEXT:   DW_AT_name {{.*}}"foo"
+CHECK-LTO-NOT: Found valid debug map entry
+CHECK-LTO: Found valid debug map entry: _val 	00000000000008ec => 0000000100001004
+CHECK-LTO-NEXT: DW_TAG_variable
+CHECK-LTO-NEXT:   DW_AT_name {{.*}}"val"
+CHECK-LTO-NOT: Found valid debug map entry
+CHECK-LTO: Found valid debug map entry: _bar 	0000000000000050 => 0000000100000f90
+CHECK-LTO-NEXT: DW_TAG_subprogram
+CHECK-LTO-NEXT:   DW_AT_name {{.*}}"bar"
+
+
+================================= Archive link ================================
+CHECK-ARCHIVE: DEBUG MAP OBJECT: {{.*}}basic1.macho.x86_64.o
+CHECK-ARCHIVE: Input compilation unit:
+CHECK-ARCHIVE-NEXT: TAG_compile_unit
+CHECK-ARCHIVE-NOT: TAG
+CHECK-ARCHIVE: AT_name {{.*}}basic1.c
+CHECK-ARCHIVE-NOT: Found valid debug map entry
+CHECK-ARCHIVE: Found valid debug map entry: _main 	0000000000000000 => 0000000100000ea0
+CHECK-ARCHIVE-NEXT: DW_TAG_subprogram
+CHECK-ARCHIVE-NEXT:   DW_AT_name{{.*}}"main"
+
+CHECK-ARCHIVE: DEBUG MAP OBJECT: {{.*}}libbasic.a(basic2.macho.x86_64.o)
+CHECK-ARCHIVE: Input compilation unit:
+CHECK-ARCHIVE-NEXT: TAG_compile_unit
+CHECK-ARCHIVE-NOT: TAG
+CHECK-ARCHIVE: AT_name {{.*}}basic2.c
+CHECK-ARCHIVE-NOT: Found valid debug map entry
+CHECK-ARCHIVE: Found valid debug map entry: _private_int 	0000000000000560 => 0000000100001004
+CHECK-ARCHIVE-NEXT: DW_TAG_variable
+CHECK-ARCHIVE-NEXT:   DW_AT_name {{.*}}"private_int"
+CHECK-ARCHIVE-NOT: Found valid debug map entry
+CHECK-ARCHIVE: Found valid debug map entry: _baz 	0000000000000310 => 0000000100001000
+CHECK-ARCHIVE-NEXT: DW_TAG_variable
+CHECK-ARCHIVE-NEXT:   DW_AT_name {{.*}}"baz"
+CHECK-ARCHIVE-NOT: Found valid debug map entry
+CHECK-ARCHIVE: Found valid debug map entry: _foo 	0000000000000020 => 0000000100000ed0
+CHECK-ARCHIVE-NEXT: DW_TAG_subprogram
+CHECK-ARCHIVE-NEXT:   DW_AT_name {{.*}}"foo"
+CHECK-ARCHIVE-NOT: Found valid debug map entry
+CHECK-ARCHIVE: Found valid debug map entry: _inc 	0000000000000070 => 0000000100000f20
+CHECK-ARCHIVE-NEXT: DW_TAG_subprogram
+CHECK-ARCHIVE-NEXT:   DW_AT_name {{.*}}"inc"
+
+CHECK-ARCHIVE: DEBUG MAP OBJECT: {{.*}}libbasic.a(basic3.macho.x86_64.o)
+CHECK-ARCHIVE: Input compilation unit:
+CHECK-ARCHIVE-NEXT: TAG_compile_unit
+CHECK-ARCHIVE-NOT: TAG
+CHECK-ARCHIVE: AT_name {{.*}}basic3.c
+CHECK-ARCHIVE-NOT: Found valid debug map entry
+CHECK-ARCHIVE: Found valid debug map entry: _val 	0000000000000004 => 0000000100001008
+CHECK-ARCHIVE-NEXT: DW_TAG_variable
+CHECK-ARCHIVE-NEXT:   DW_AT_name {{.*}}"val"
+CHECK-ARCHIVE-NOT: Found valid debug map entry
+CHECK-ARCHIVE: Found valid debug map entry: _bar 	0000000000000020 => 0000000100000f40
+CHECK-ARCHIVE-NEXT: DW_TAG_subprogram
+CHECK-ARCHIVE-NEXT:   DW_AT_name {{.*}}"bar"
+CHECK-ARCHIVE-NOT: Found valid debug map entry
+CHECK-ARCHIVE: Found valid debug map entry: _inc 	0000000000000070 => 0000000100000f90
+CHECK-ARCHIVE-NEXT: DW_TAG_subprogram
+CHECK-ARCHIVE-NEXT:   DW_AT_name {{.*}}"inc")
diff --git a/test/tools/dsymutil/debug-map-parsing.test b/test/tools/dsymutil/debug-map-parsing.test
new file mode 100644
index 0000000..b64ad9f
--- /dev/null
+++ b/test/tools/dsymutil/debug-map-parsing.test
@@ -0,0 +1,79 @@
+RUN: llvm-dsymutil -v -parse-only -oso-prepend-path=%p %p/Inputs/basic.macho.x86_64 | FileCheck %s
+RUN: llvm-dsymutil -v -parse-only -oso-prepend-path=%p %p/Inputs/basic-lto.macho.x86_64 | FileCheck %s --check-prefix=CHECK-LTO
+RUN: llvm-dsymutil -v -parse-only -oso-prepend-path=%p %p/Inputs/basic-archive.macho.x86_64 | FileCheck %s --check-prefix=CHECK-ARCHIVE
+RUN: llvm-dsymutil -v -parse-only %p/Inputs/basic.macho.x86_64 2>&1 | FileCheck %s --check-prefix=NOT-FOUND
+RUN: not llvm-dsymutil -v -parse-only %p/Inputs/inexistant 2>&1 | FileCheck %s --check-prefix=NO-EXECUTABLE
+
+
+Check that We can parse the debug map of the basic executable.
+
+CHECK-NOT: error
+CHECK: DEBUG MAP: x86_64-unknown-unknown-macho
+CHECK: /Inputs/basic1.macho.x86_64.o:
+CHECK: 	0000000000000000 => 0000000100000ea0	_main
+CHECK: /Inputs/basic2.macho.x86_64.o:
+CHECK: 	0000000000000310 => 0000000100001000	_baz
+CHECK: 	0000000000000020 => 0000000100000ed0	_foo
+CHECK: 	0000000000000070 => 0000000100000f20	_inc
+CHECK: 	0000000000000560 => 0000000100001008	_private_int
+CHECK: /Inputs/basic3.macho.x86_64.o:
+CHECK: 	0000000000000020 => 0000000100000f40	_bar
+CHECK: 	0000000000000070 => 0000000100000f90	_inc
+CHECK: 	0000000000000004 => 0000000100001004	_val
+CHECK: END DEBUG MAP
+
+
+Check that we can parse the debug-map of the basic-lto executable
+
+CHECK-LTO-NOT: error
+CHECK-LTO: DEBUG MAP: x86_64-unknown-unknown-macho
+CHECK-LTO: /Inputs/basic-lto.macho.x86_64.o:
+CHECK-LTO: 	0000000000000050 => 0000000100000f90	_bar
+CHECK-LTO: 	0000000000000658 => 0000000100001000	_baz
+CHECK-LTO: 	0000000000000010 => 0000000100000f50	_foo
+CHECK-LTO: 	0000000000000000 => 0000000100000f40	_main
+CHECK-LTO: 	00000000000008e8 => 0000000100001008	_private_int
+CHECK-LTO: 	00000000000008ec => 0000000100001004	_val
+CHECK-LTO: END DEBUG MAP
+
+Check thet we correctly handle debug maps with archive members (including only
+opening the archive once if mulitple of its members are used).
+
+CHECK-ARCHIVE:      trying to open {{.*}}basic-archive.macho.x86_64'
+CHECK-ARCHIVE-NEXT: 	loaded file.
+CHECK-ARCHIVE-NEXT: trying to open {{.*}}/Inputs/basic1.macho.x86_64.o'
+CHECK-ARCHIVE-NEXT: 	loaded file.
+CHECK-ARCHIVE-NEXT: trying to open {{.*}}/libbasic.a(basic2.macho.x86_64.o)'
+CHECK-ARCHIVE-NEXT: 	opened new archive {{.*}}/libbasic.a'
+CHECK-ARCHIVE-NEXT: 	found member in current archive.
+CHECK-ARCHIVE-NEXT: trying to open {{.*}}/libbasic.a(basic3.macho.x86_64.o)'
+CHECK-ARCHIVE-NEXT: 	found member in current archive.
+CHECK-ARCHIVE: DEBUG MAP: x86_64-unknown-unknown-macho
+CHECK-ARCHIVE:       object addr =>  executable addr	symbol name
+CHECK-ARCHIVE: /Inputs/basic1.macho.x86_64.o:
+CHECK-ARCHIVE: 	0000000000000000 => 0000000100000ea0	_main
+CHECK-ARCHIVE: /Inputs/./libbasic.a(basic2.macho.x86_64.o):
+CHECK-ARCHIVE: 	0000000000000310 => 0000000100001000	_baz
+CHECK-ARCHIVE: 	0000000000000020 => 0000000100000ed0	_foo
+CHECK-ARCHIVE: 	0000000000000070 => 0000000100000f20	_inc
+CHECK-ARCHIVE: 	0000000000000560 => 0000000100001004	_private_int
+CHECK-ARCHIVE: /Inputs/./libbasic.a(basic3.macho.x86_64.o):
+CHECK-ARCHIVE: 	0000000000000020 => 0000000100000f40	_bar
+CHECK-ARCHIVE: 	0000000000000070 => 0000000100000f90	_inc
+CHECK-ARCHIVE: 	0000000000000004 => 0000000100001008	_val
+CHECK-ARCHIVE: END DEBUG MAP
+
+Check that we warn about missing object files (this presumes that the files aren't
+present in the machine's /Inputs/ folder, which should be a pretty safe bet).
+
+NOT-FOUND: cannot open{{.*}}"/Inputs/basic1.macho.x86_64.o": {{[Nn]o}} such file
+NOT-FOUND: cannot open{{.*}}"/Inputs/basic2.macho.x86_64.o": {{[Nn]o}} such file
+NOT-FOUND: cannot open{{.*}}"/Inputs/basic3.macho.x86_64.o": {{[Nn]o}} such file
+NOT-FOUND: DEBUG MAP:
+NOT-FOUND-NEXT:  object addr =>  executable addr	symbol name
+NOT-FOUND-NEXT: END DEBUG MAP
+
+Check that we correctly error out on invalid executatble.
+
+NO-EXECUTABLE: cannot parse{{.*}}/inexistant": {{[Nn]o}} such file
+NO-EXECUTABLE-NOT: DEBUG MAP
diff --git a/test/tools/gold/Inputs/comdat.ll b/test/tools/gold/Inputs/comdat.ll
index e9e4704..464aefa 100644
--- a/test/tools/gold/Inputs/comdat.ll
+++ b/test/tools/gold/Inputs/comdat.ll
@@ -1,7 +1,12 @@
 $c2 = comdat any
+$c1 = comdat any
 
-@v1 = weak_odr global i32 41, comdat $c2
-define weak_odr protected i32 @f1(i8* %this) comdat $c2 {
+; This is only present in this file. The linker will keep $c1 from the first
+; file and this will be undefined.
+@will_be_undefined = global i32 1, comdat($c1)
+
+@v1 = weak_odr global i32 41, comdat($c2)
+define weak_odr protected i32 @f1(i8* %this) comdat($c2) {
 bb20:
   store i8* %this, i8** null
   br label %bb21
diff --git a/test/tools/gold/Inputs/drop-debug.bc b/test/tools/gold/Inputs/drop-debug.bc
new file mode 100644
index 0000000..f9c471f
--- /dev/null
+++ b/test/tools/gold/Inputs/drop-debug.bc
diff --git a/test/tools/gold/alias.ll b/test/tools/gold/alias.ll
index dbf3af5..b4edb05 100644
--- a/test/tools/gold/alias.ll
+++ b/test/tools/gold/alias.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as %s -o %t.o
 ; RUN: llvm-as %p/Inputs/alias-1.ll -o %t2.o
-; RUN: ld -shared -o %t3.o -plugin %llvmshlibdir/LLVMgold.so %t2.o %t.o \
+; RUN: %gold -shared -o %t3.o -plugin %llvmshlibdir/LLVMgold.so %t2.o %t.o \
 ; RUN:  -plugin-opt=emit-llvm
 ; RUN: llvm-dis < %t3.o -o - | FileCheck %s
 
diff --git a/test/tools/gold/bad-alias.ll b/test/tools/gold/bad-alias.ll
index e0fc788..a98bf71 100644
--- a/test/tools/gold/bad-alias.ll
+++ b/test/tools/gold/bad-alias.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as %s -o %t.o
 
-; RUN: not ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: not %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    --plugin-opt=emit-llvm \
 ; RUN:    -shared %t.o -o %t2.o 2>&1 | FileCheck %s
 
diff --git a/test/tools/gold/bcsection.ll b/test/tools/gold/bcsection.ll
index 8565d9d..37d2994 100644
--- a/test/tools/gold/bcsection.ll
+++ b/test/tools/gold/bcsection.ll
@@ -2,7 +2,7 @@
 
 ; RUN: llvm-mc -I=%T -filetype=obj -o %T/bcsection.bco %p/Inputs/bcsection.s
 ; RUN: llvm-nm -no-llvm-bc %T/bcsection.bco | count 0
-; RUN: ld -r -o %T/bcsection.o -plugin %llvmshlibdir/LLVMgold.so %T/bcsection.bco
+; RUN: %gold -r -o %T/bcsection.o -plugin %llvmshlibdir/LLVMgold.so %T/bcsection.bco
 ; RUN: llvm-nm -no-llvm-bc %T/bcsection.o | FileCheck %s
 
 ; CHECK: main
diff --git a/test/tools/gold/coff.ll b/test/tools/gold/coff.ll
new file mode 100644
index 0000000..5d8a1c9
--- /dev/null
+++ b/test/tools/gold/coff.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as %s -o %t.o
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so -plugin-opt=emit-llvm \
+; RUN:    -shared %t.o -o %t2.o
+; RUN: llvm-dis %t2.o -o - | FileCheck %s
+
+
+target datalayout = "m:w"
+
+; CHECK: define void @f() {
+define void @f() {
+  ret void
+}
+
+; CHECK: define internal void @g() {
+define hidden void @g() {
+  ret void
+}
+
+; CHECK: define internal void @h() {
+define linkonce_odr void @h() {
+  ret void
+}
diff --git a/test/tools/gold/comdat.ll b/test/tools/gold/comdat.ll
index ba3abce..370bf56 100644
--- a/test/tools/gold/comdat.ll
+++ b/test/tools/gold/comdat.ll
@@ -1,13 +1,13 @@
 ; RUN: llvm-as %s -o %t.o
 ; RUN: llvm-as %p/Inputs/comdat.ll -o %t2.o
-; RUN: ld -shared -o %t3.o -plugin %llvmshlibdir/LLVMgold.so %t.o %t2.o \
+; RUN: %gold -shared -o %t3.o -plugin %llvmshlibdir/LLVMgold.so %t.o %t2.o \
 ; RUN:  -plugin-opt=emit-llvm
 ; RUN: llvm-dis %t3.o -o - | FileCheck %s
 
 $c1 = comdat any
 
-@v1 = weak_odr global i32 42, comdat $c1
-define weak_odr i32 @f1(i8*) comdat $c1 {
+@v1 = weak_odr global i32 42, comdat($c1)
+define weak_odr i32 @f1(i8*) comdat($c1) {
 bb10:
   br label %bb11
 bb11:
@@ -27,7 +27,7 @@ bb11:
 ; CHECK: $c1 = comdat any
 ; CHECK: $c2 = comdat any
 
-; CHECK: @v1 = weak_odr global i32 42, comdat $c1
+; CHECK: @v1 = weak_odr global i32 42, comdat($c1)
 
 ; CHECK: @r11 = global i32* @v1{{$}}
 ; CHECK: @r12 = global i32 (i8*)* @f1{{$}}
@@ -35,7 +35,7 @@ bb11:
 ; CHECK: @r21 = global i32* @v1{{$}}
 ; CHECK: @r22 = global i32 (i8*)* @f1{{$}}
 
-; CHECK: @v11 = internal global i32 41, comdat $c2
+; CHECK: @v11 = internal global i32 41, comdat($c2)
 
 ; CHECK: @a11 = alias i32* @v1{{$}}
 ; CHECK: @a12 = alias bitcast (i32* @v1 to i16*)
@@ -49,14 +49,14 @@ bb11:
 ; CHECK: @a23 = alias i32 (i8*)* @f12{{$}}
 ; CHECK: @a24 = alias bitcast (i32 (i8*)* @f12 to i16*)
 
-; CHECK:      define weak_odr protected i32 @f1(i8*) comdat $c1 {
+; CHECK:      define weak_odr protected i32 @f1(i8*) comdat($c1) {
 ; CHECK-NEXT: bb10:
 ; CHECK-NEXT:   br label %bb11{{$}}
 ; CHECK:      bb11:
 ; CHECK-NEXT:   ret i32 42
 ; CHECK-NEXT: }
 
-; CHECK:      define internal i32 @f12(i8* %this) comdat $c2 {
+; CHECK:      define internal i32 @f12(i8* %this) comdat($c2) {
 ; CHECK-NEXT: bb20:
 ; CHECK-NEXT:   store i8* %this, i8** null
 ; CHECK-NEXT:   br label %bb21
diff --git a/test/tools/gold/common.ll b/test/tools/gold/common.ll
index f309231..ef18e68 100644
--- a/test/tools/gold/common.ll
+++ b/test/tools/gold/common.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-as %s -o %t1.o
 ; RUN: llvm-as %p/Inputs/common.ll -o %t2.o
 
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    --plugin-opt=emit-llvm \
 ; RUN:    -shared %t1.o %t2.o -o %t3.o
 ; RUN: llvm-dis %t3.o -o - | FileCheck %s
@@ -11,7 +11,7 @@
 ; Shared library case, we merge @a as common and keep it for the symbol table.
 ; CHECK: @a = common global i16 0, align 8
 
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    --plugin-opt=emit-llvm \
 ; RUN:    %t1.o %t2.o -o %t3.o
 ; RUN: llvm-dis %t3.o -o - | FileCheck --check-prefix=EXEC %s
@@ -20,7 +20,7 @@
 ; EXEC: @a = internal global i16 0, align 8
 
 ; RUN: llc %p/Inputs/common.ll -o %t2.o -filetype=obj
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    --plugin-opt=emit-llvm \
 ; RUN:    %t1.o %t2.o -o %t3.o
 ; RUN: llvm-dis %t3.o -o - | FileCheck --check-prefix=MIXED %s
diff --git a/test/tools/gold/drop-debug.ll b/test/tools/gold/drop-debug.ll
new file mode 100644
index 0000000..b8c4d8c
--- /dev/null
+++ b/test/tools/gold/drop-debug.ll
@@ -0,0 +1,8 @@
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=emit-llvm -shared %p/Inputs/drop-debug.bc \
+; RUN:    -o t2.bc 2>&1 | FileCheck %s
+
+; drop-debug.bc was created from "void f(void) {}" with clang 3.5 and
+; -gline-tables-only, so it contains old debug info.
+
+; CHECK: warning: LLVM gold plugin: ignoring debug info with an invalid version (1) in {{.*}}/Inputs/drop-debug.bc
diff --git a/test/tools/gold/emit-llvm.ll b/test/tools/gold/emit-llvm.ll
index 0a6dcfc..f851fbf 100644
--- a/test/tools/gold/emit-llvm.ll
+++ b/test/tools/gold/emit-llvm.ll
@@ -1,20 +1,31 @@
 ; RUN: llvm-as %s -o %t.o
 
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    --plugin-opt=emit-llvm \
 ; RUN:    --plugin-opt=generate-api-file \
 ; RUN:    -shared %t.o -o %t2.o
 ; RUN: llvm-dis %t2.o -o - | FileCheck %s
 ; RUN: FileCheck --check-prefix=API %s < %T/../apifile.txt
 
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:     -m elf_x86_64 --plugin-opt=save-temps \
 ; RUN:    -shared %t.o -o %t3.o
 ; RUN: llvm-dis %t3.o.bc -o - | FileCheck %s
 ; RUN: llvm-dis %t3.o.opt.bc -o - | FileCheck --check-prefix=OPT %s
 
+; RUN: rm -f %t4.o
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:     -m elf_x86_64 --plugin-opt=disable-output \
+; RUN:    -shared %t.o -o %t4.o
+; RUN: not test -a %t4.o
+
 target triple = "x86_64-unknown-linux-gnu"
 
+@g7 = extern_weak global i32
+; CHECK-DAG: @g7 = extern_weak global i32
+
+@g8 = external global i32
+
 ; CHECK: define internal void @f1()
 ; OPT-NOT: @f1
 define hidden void @f1() {
@@ -56,6 +67,13 @@ define linkonce_odr void @f6() unnamed_addr {
 }
 @g6 = global void()* @f6
 
+define i32* @f7() {
+  ret i32* @g7
+}
+
+define i32* @f8() {
+  ret i32* @g8
+}
 
 ; API: f1 PREVAILING_DEF_IRONLY
 ; API: f2 PREVAILING_DEF_IRONLY
@@ -63,5 +81,9 @@ define linkonce_odr void @f6() unnamed_addr {
 ; API: f4 PREVAILING_DEF_IRONLY_EXP
 ; API: f5 PREVAILING_DEF_IRONLY_EXP
 ; API: f6 PREVAILING_DEF_IRONLY_EXP
+; API: f7 PREVAILING_DEF_IRONLY_EXP
+; API: f8 PREVAILING_DEF_IRONLY_EXP
+; API: g7 UNDEF
+; API: g8 UNDEF
 ; API: g5 PREVAILING_DEF_IRONLY_EXP
 ; API: g6 PREVAILING_DEF_IRONLY_EXP
diff --git a/test/tools/gold/invalid.ll b/test/tools/gold/invalid.ll
index 8db7644..858cd47 100644
--- a/test/tools/gold/invalid.ll
+++ b/test/tools/gold/invalid.ll
@@ -1,4 +1,4 @@
-; RUN: not ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: not %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    %p/Inputs/invalid.bc -o %t2 2>&1 | FileCheck %s
 
 ; test that only one error gets printed
diff --git a/test/tools/gold/linker-script.ll b/test/tools/gold/linker-script.ll
index 35a7694..7c88b0f 100644
--- a/test/tools/gold/linker-script.ll
+++ b/test/tools/gold/linker-script.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as %s -o %t.o
 
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    --plugin-opt=emit-llvm \
 ; RUN:    -shared %t.o -o %t2.o \
 ; RUN:    -version-script=%p/Inputs/linker-script.export
diff --git a/test/tools/gold/linkonce-weak.ll b/test/tools/gold/linkonce-weak.ll
index 765275b..a0cccea 100644
--- a/test/tools/gold/linkonce-weak.ll
+++ b/test/tools/gold/linkonce-weak.ll
@@ -1,12 +1,12 @@
 ; RUN: llvm-as %s -o %t.o
 ; RUN: llvm-as %p/Inputs/linkonce-weak.ll -o %t2.o
 
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    --plugin-opt=emit-llvm \
 ; RUN:    -shared %t.o %t2.o -o %t3.o
 ; RUN: llvm-dis %t3.o -o - | FileCheck %s
 
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    --plugin-opt=emit-llvm \
 ; RUN:    -shared %t2.o %t.o -o %t3.o
 ; RUN: llvm-dis %t3.o -o - | FileCheck %s
diff --git a/test/tools/gold/mtriple.ll b/test/tools/gold/mtriple.ll
index 6395af6..94211ed 100644
--- a/test/tools/gold/mtriple.ll
+++ b/test/tools/gold/mtriple.ll
@@ -1,5 +1,5 @@
 ; RUN: llvm-as %s -o %t.o
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so -m elf32ppc \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so -m elf32ppc \
 ; RUN:    -plugin-opt=mtriple=powerpc-linux-gnu \
 ; RUN:    -plugin-opt=obj-path=%t3.o \
 ; RUN:    -shared %t.o -o %t2
diff --git a/test/tools/gold/no-map-whole-file.ll b/test/tools/gold/no-map-whole-file.ll
new file mode 100644
index 0000000..4c261d7
--- /dev/null
+++ b/test/tools/gold/no-map-whole-file.ll
@@ -0,0 +1,9 @@
+; RUN: llvm-as -o %t.bc %s
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so -plugin-opt=emit-llvm \
+; RUN:    --no-map-whole-files -r -o %t2.bc %t.bc
+; RUN: llvm-dis < %t2.bc -o - | FileCheck %s
+
+; CHECK: main
+define i32 @main() {
+  ret i32 0
+}
diff --git a/test/tools/gold/option.ll b/test/tools/gold/option.ll
index 8154e43..59e3f1e 100644
--- a/test/tools/gold/option.ll
+++ b/test/tools/gold/option.ll
@@ -1,5 +1,5 @@
 ; RUN: llvm-as %s -o %t.o
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so -m elf_x86_64 \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so -m elf_x86_64 \
 ; RUN:    --plugin-opt=-jump-table-type=arity \
 ; RUN:    --plugin-opt=-mattr=+aes \
 ; RUN:    --plugin-opt=mcpu=core-avx2 \
diff --git a/test/tools/gold/pr19901.ll b/test/tools/gold/pr19901.ll
index 304246b..2a7dea1 100644
--- a/test/tools/gold/pr19901.ll
+++ b/test/tools/gold/pr19901.ll
@@ -1,6 +1,6 @@
 ; RUN: llc %s -o %t.o -filetype=obj -relocation-model=pic
 ; RUN: llvm-as %p/Inputs/pr19901-1.ll -o %t2.o
-; RUN: ld -shared -o %t.so -plugin %llvmshlibdir/LLVMgold.so %t2.o %t.o
+; RUN: %gold -shared -o %t.so -plugin %llvmshlibdir/LLVMgold.so %t2.o %t.o
 ; RUN: llvm-readobj -t %t.so | FileCheck %s
 
 ; CHECK:       Symbol {
@@ -9,7 +9,7 @@
 ; CHECK-NEXT:    Size:
 ; CHECK-NEXT:    Binding: Local
 ; CHECK-NEXT:    Type: Function
-; CHECK-NEXT:    Other: 2
+; CHECK-NEXT:    Other: {{2|0}}
 ; CHECK-NEXT:    Section: .text
 ; CHECK-NEXT:  }
 
diff --git a/test/tools/gold/slp-vectorize.ll b/test/tools/gold/slp-vectorize.ll
index d378902..d39aa76 100644
--- a/test/tools/gold/slp-vectorize.ll
+++ b/test/tools/gold/slp-vectorize.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as %s -o %t.o
 
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    --plugin-opt=save-temps \
 ; RUN:    -shared %t.o -o %t2.o
 ; RUN: llvm-dis %t2.o.opt.bc -o - | FileCheck %s
diff --git a/test/tools/gold/stats.ll b/test/tools/gold/stats.ll
new file mode 100644
index 0000000..b3c8297
--- /dev/null
+++ b/test/tools/gold/stats.ll
@@ -0,0 +1,7 @@
+; REQUIRES: asserts
+
+; RUN: llvm-as %s -o %t.o
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so  -shared \
+; RUN:    -plugin-opt=-stats %t.o -o %t2 2>&1 | FileCheck %s
+
+; CHECK: Statistics Collected
diff --git a/test/tools/gold/vectorize.ll b/test/tools/gold/vectorize.ll
index 3d305db..c1626d7 100644
--- a/test/tools/gold/vectorize.ll
+++ b/test/tools/gold/vectorize.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as %s -o %t.o
 
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    --plugin-opt=save-temps \
 ; RUN:    -shared %t.o -o %t2.o
 ; RUN: llvm-dis %t2.o.opt.bc -o - | FileCheck %s
diff --git a/test/tools/gold/weak.ll b/test/tools/gold/weak.ll
index e05e905..6d8d7a8 100644
--- a/test/tools/gold/weak.ll
+++ b/test/tools/gold/weak.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-as %s -o %t.o
 ; RUN: llvm-as %p/Inputs/weak.ll -o %t2.o
 
-; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:    --plugin-opt=emit-llvm \
 ; RUN:    -shared %t.o %t2.o -o %t3.o
 ; RUN: llvm-dis %t3.o -o - | FileCheck %s
diff --git a/test/tools/llvm-cov/Inputs/highlightedRanges.covmapping b/test/tools/llvm-cov/Inputs/highlightedRanges.covmapping
index 20eb0d7..e97320b 100644
--- a/test/tools/llvm-cov/Inputs/highlightedRanges.covmapping
+++ b/test/tools/llvm-cov/Inputs/highlightedRanges.covmapping
diff --git a/test/tools/llvm-cov/Inputs/regionMarkers.covmapping b/test/tools/llvm-cov/Inputs/regionMarkers.covmapping
index 3ebcb07..501cba2 100644
--- a/test/tools/llvm-cov/Inputs/regionMarkers.covmapping
+++ b/test/tools/llvm-cov/Inputs/regionMarkers.covmapping
diff --git a/test/tools/llvm-cov/Inputs/report.covmapping b/test/tools/llvm-cov/Inputs/report.covmapping
index 32d84bc..5d0bfc1 100644
--- a/test/tools/llvm-cov/Inputs/report.covmapping
+++ b/test/tools/llvm-cov/Inputs/report.covmapping
diff --git a/test/tools/llvm-cov/Inputs/showExpansions.covmapping b/test/tools/llvm-cov/Inputs/showExpansions.covmapping
index b8c7d97..e02a728 100644
--- a/test/tools/llvm-cov/Inputs/showExpansions.covmapping
+++ b/test/tools/llvm-cov/Inputs/showExpansions.covmapping
diff --git a/test/tools/llvm-cov/report.cpp b/test/tools/llvm-cov/report.cpp
index 297322a..570012e 100644
--- a/test/tools/llvm-cov/report.cpp
+++ b/test/tools/llvm-cov/report.cpp
@@ -1,7 +1,21 @@
-// RUN: llvm-cov report %S/Inputs/report.covmapping -instr-profile %S/Inputs/report.profdata -no-colors 2>&1 | FileCheck %s
+// RUN: llvm-cov report %S/Inputs/report.covmapping -instr-profile %S/Inputs/report.profdata -no-colors -filename-equivalence 2>&1 | FileCheck %s
+// RUN: llvm-cov report %S/Inputs/report.covmapping -instr-profile %S/Inputs/report.profdata -no-colors -filename-equivalence report.cpp 2>&1 | FileCheck -check-prefix=FILT-NEXT %s
 
-// CHECK: Filename                    Regions    Miss   Cover Functions  Executed
-// CHECK: TOTAL                             5       2  60.00%         4    75.00%
+// CHECK:      Filename   Regions  Miss   Cover  Functions  Executed
+// CHECK-NEXT: ---
+// CHECK-NEXT: report.cpp       5     2  60.00%          4    75.00%
+// CHECK-NEXT: ---
+// CHECK-NEXT: TOTAL            5     2  60.00%          4    75.00%
+
+// FILT: File 'report.cpp':
+// FILT-NEXT: Name        Regions  Miss   Cover  Lines  Miss   Cover
+// FILT-NEXT: ---
+// FILT-NEXT: _Z3foob           2     1  50.00%      4     2  50.00%
+// FILT-NEXT: _Z3barv           1     0 100.00%      2     0 100.00%
+// FILT-NEXT: _Z4funcv          1     1   0.00%      2     2   0.00%
+// FILT-NEXT: main              1     0 100.00%      5     0 100.00%
+// FILT-NEXT: ---
+// FILT-NEXT: TOTAL             5     2  60.00%     13     4  69.23%
 
 void foo(bool cond) {
   if (cond) {
diff --git a/test/tools/llvm-cov/showHighlightedRanges.cpp b/test/tools/llvm-cov/showHighlightedRanges.cpp
index cec7308..1ff7929 100644
--- a/test/tools/llvm-cov/showHighlightedRanges.cpp
+++ b/test/tools/llvm-cov/showHighlightedRanges.cpp
@@ -2,17 +2,17 @@
 
 void func() {
   return;
-  int i = 0;                     // CHECK: Highlighted line [[@LINE]], 3 -> 12
-}
+  int i = 0;                     // CHECK: Highlighted line [[@LINE]], 3 -> ?
+}                                // CHECK: Highlighted line [[@LINE]], 1 -> 2
 
 void func2(int x) {
   if(x > 5) {
     while(x >= 9) {
       return;
-      --x;                       // CHECK: Highlighted line [[@LINE]], 7 -> 10
-    }
-    int i = 0;                   // CHECK: Highlighted line [[@LINE]], 5 -> 14
-  }
+      --x;                       // CHECK: Highlighted line [[@LINE]], 7 -> ?
+    }                            // CHECK: Highlighted line [[@LINE]], 1 -> 6
+    int i = 0;                   // CHECK: Highlighted line [[@LINE]], 5 -> ?
+  }                              // CHECK: Highlighted line [[@LINE]], 1 -> 4
 }
 
 void test() {
diff --git a/test/tools/llvm-cov/showLineExecutionCounts.cpp b/test/tools/llvm-cov/showLineExecutionCounts.cpp
index 34baa57..625c3f2 100644
--- a/test/tools/llvm-cov/showLineExecutionCounts.cpp
+++ b/test/tools/llvm-cov/showLineExecutionCounts.cpp
@@ -12,11 +12,11 @@ int main() {                             // CHECK:   1| [[@LINE]]|int main(
     x = 1;                               // CHECK:   1| [[@LINE]]|    x = 1
   }                                      // CHECK:   1| [[@LINE]]|  }
                                          // CHECK:   1| [[@LINE]]|
-  for (int i = 0; i < 100; ++i) {        // CHECK: 100| [[@LINE]]|  for (
+  for (int i = 0; i < 100; ++i) {        // CHECK: 101| [[@LINE]]|  for (
     x = 1;                               // CHECK: 100| [[@LINE]]|    x = 1
   }                                      // CHECK: 100| [[@LINE]]|  }
                                          // CHECK:   1| [[@LINE]]|
-  x = x < 10 ? x + 1 : x - 1;            // CHECK:   0| [[@LINE]]|  x =
+  x = x < 10 ? x + 1 : x - 1;            // CHECK:   1| [[@LINE]]|  x =
   x = x > 10 ?                           // CHECK:   1| [[@LINE]]|  x =
         x - 1:                           // CHECK:   0| [[@LINE]]|        x
         x + 1;                           // CHECK:   1| [[@LINE]]|        x
diff --git a/test/tools/llvm-objdump/AArch64/Inputs/link-opt-hints.macho-aarch64 b/test/tools/llvm-objdump/AArch64/Inputs/link-opt-hints.macho-aarch64
new file mode 100644
index 0000000..12d33fc
--- /dev/null
+++ b/test/tools/llvm-objdump/AArch64/Inputs/link-opt-hints.macho-aarch64
diff --git a/test/tools/llvm-objdump/AArch64/macho-link-opt-hints.test b/test/tools/llvm-objdump/AArch64/macho-link-opt-hints.test
new file mode 100644
index 0000000..932bc4f
--- /dev/null
+++ b/test/tools/llvm-objdump/AArch64/macho-link-opt-hints.test
@@ -0,0 +1,11 @@
+RUN: llvm-objdump -m -link-opt-hints %p/Inputs/link-opt-hints.macho-aarch64 | FileCheck %s
+
+CHECK: Linker optimiztion hints (8 total bytes)
+CHECK:     identifier 8 AdrpLdrGot
+CHECK:     narguments 2
+CHECK: 	value 0x18
+CHECK: 	value 0x1c
+CHECK:     identifier 7 AdrpAdd
+CHECK:     narguments 2
+CHECK: 	value 0x6c
+CHECK: 	value 0x70
diff --git a/test/tools/llvm-objdump/AArch64/macho-private-headers.test b/test/tools/llvm-objdump/AArch64/macho-private-headers.test
new file mode 100644
index 0000000..cdf98b1
--- /dev/null
+++ b/test/tools/llvm-objdump/AArch64/macho-private-headers.test
@@ -0,0 +1,312 @@
+// RUN: llvm-objdump -p %p/Inputs/hello.obj.macho-aarch64 | FileCheck %s
+// RUN: llvm-objdump -p %p/Inputs/hello.exe.macho-aarch64 \
+// RUN:     | FileCheck %s -check-prefix=EXE
+ 
+CHECK: Mach header
+CHECK:       magic cputype cpusubtype  caps    filetype ncmds sizeofcmds      flags
+CHECK: MH_MAGIC_64   ARM64        ALL  0x00      OBJECT     4        352 SUBSECTIONS_VIA_SYMBOLS
+CHECK: Load command 0
+CHECK:       cmd LC_SEGMENT_64
+CHECK:   cmdsize 232
+CHECK:   segname 
+CHECK:    vmaddr 0x0000000000000000
+CHECK:    vmsize 0x000000000000004d
+CHECK:   fileoff 384
+CHECK:  filesize 77
+CHECK:   maxprot rwx
+CHECK:  initprot rwx
+CHECK:    nsects 2
+CHECK:     flags (none)
+CHECK: Section
+CHECK:   sectname __text
+CHECK:    segname __TEXT
+CHECK:       addr 0x0000000000000000
+CHECK:       size 0x0000000000000040
+CHECK:     offset 384
+CHECK:      align 2^2 (4)
+CHECK:     reloff 464
+CHECK:     nreloc 3
+CHECK:       type S_REGULAR
+CHECK: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+CHECK:  reserved1 0
+CHECK:  reserved2 0
+CHECK: Section
+CHECK:   sectname __cstring
+CHECK:    segname __TEXT
+CHECK:       addr 0x0000000000000040
+CHECK:       size 0x000000000000000d
+CHECK:     offset 448
+CHECK:      align 2^0 (1)
+CHECK:     reloff 0
+CHECK:     nreloc 0
+CHECK:       type S_CSTRING_LITERALS
+CHECK: attributes (none)
+CHECK:  reserved1 0
+CHECK:  reserved2 0
+CHECK: Load command 1
+CHECK:       cmd LC_VERSION_MIN_IPHONEOS
+CHECK:   cmdsize 16
+CHECK:   version 9.0
+CHECK:       sdk n/a
+CHECK: Load command 2
+CHECK:      cmd LC_SYMTAB
+CHECK:  cmdsize 24
+CHECK:   symoff 488
+CHECK:    nsyms 5
+CHECK:   stroff 568
+CHECK:  strsize 36
+CHECK: Load command 3
+CHECK:             cmd LC_DYSYMTAB
+CHECK:         cmdsize 80
+CHECK:       ilocalsym 0
+CHECK:       nlocalsym 3
+CHECK:      iextdefsym 3
+CHECK:      nextdefsym 1
+CHECK:       iundefsym 4
+CHECK:       nundefsym 1
+CHECK:          tocoff 0
+CHECK:            ntoc 0
+CHECK:       modtaboff 0
+CHECK:         nmodtab 0
+CHECK:    extrefsymoff 0
+CHECK:     nextrefsyms 0
+CHECK:  indirectsymoff 0
+CHECK:   nindirectsyms 0
+CHECK:       extreloff 0
+CHECK:         nextrel 0
+CHECK:       locreloff 0
+CHECK:         nlocrel 0
+
+EXE: Mach header
+EXE:       magic cputype cpusubtype  caps    filetype ncmds sizeofcmds      flags
+EXE: MH_MAGIC_64   ARM64        ALL  0x00     EXECUTE    17       1240   NOUNDEFS DYLDLINK TWOLEVEL PIE
+EXE: Load command 0
+EXE:       cmd LC_SEGMENT_64
+EXE:   cmdsize 72
+EXE:   segname __PAGEZERO
+EXE:    vmaddr 0x0000000000000000
+EXE:    vmsize 0x0000000100000000
+EXE:   fileoff 0
+EXE:  filesize 0
+EXE:   maxprot ---
+EXE:  initprot ---
+EXE:    nsects 0
+EXE:     flags (none)
+EXE: Load command 1
+EXE:       cmd LC_SEGMENT_64
+EXE:   cmdsize 472
+EXE:   segname __TEXT
+EXE:    vmaddr 0x0000000100000000
+EXE:    vmsize 0x0000000000008000
+EXE:   fileoff 0
+EXE:  filesize 32768
+EXE:   maxprot r-x
+EXE:  initprot r-x
+EXE:    nsects 5
+EXE:     flags (none)
+EXE: Section
+EXE:   sectname __text
+EXE:    segname __TEXT
+EXE:       addr 0x0000000100007f38
+EXE:       size 0x0000000000000040
+EXE:     offset 32568
+EXE:      align 2^2 (4)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_REGULAR
+EXE: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __stubs
+EXE:    segname __TEXT
+EXE:       addr 0x0000000100007f78
+EXE:       size 0x000000000000000c
+EXE:     offset 32632
+EXE:      align 2^1 (2)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_SYMBOL_STUBS
+EXE: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+EXE:  reserved1 0 (index into indirect symbol table)
+EXE:  reserved2 12 (size of stubs)
+EXE: Section
+EXE:   sectname __stub_helper
+EXE:    segname __TEXT
+EXE:       addr 0x0000000100007f84
+EXE:       size 0x0000000000000024
+EXE:     offset 32644
+EXE:      align 2^2 (4)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_REGULAR
+EXE: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __cstring
+EXE:    segname __TEXT
+EXE:       addr 0x0000000100007fa8
+EXE:       size 0x000000000000000d
+EXE:     offset 32680
+EXE:      align 2^0 (1)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_CSTRING_LITERALS
+EXE: attributes (none)
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __unwind_info
+EXE:    segname __TEXT
+EXE:       addr 0x0000000100007fb8
+EXE:       size 0x0000000000000048
+EXE:     offset 32696
+EXE:      align 2^2 (4)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_REGULAR
+EXE: attributes (none)
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Load command 2
+EXE:       cmd LC_SEGMENT_64
+EXE:   cmdsize 232
+EXE:   segname __DATA
+EXE:    vmaddr 0x0000000100008000
+EXE:    vmsize 0x0000000000004000
+EXE:   fileoff 32768
+EXE:  filesize 16384
+EXE:   maxprot rw-
+EXE:  initprot rw-
+EXE:    nsects 2
+EXE:     flags (none)
+EXE: Section
+EXE:   sectname __got
+EXE:    segname __DATA
+EXE:       addr 0x0000000100008000
+EXE:       size 0x0000000000000010
+EXE:     offset 32768
+EXE:      align 2^3 (8)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_NON_LAZY_SYMBOL_POINTERS
+EXE: attributes (none)
+EXE:  reserved1 1 (index into indirect symbol table)
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __la_symbol_ptr
+EXE:    segname __DATA
+EXE:       addr 0x0000000100008010
+EXE:       size 0x0000000000000008
+EXE:     offset 32784
+EXE:      align 2^3 (8)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_LAZY_SYMBOL_POINTERS
+EXE: attributes (none)
+EXE:  reserved1 3 (index into indirect symbol table)
+EXE:  reserved2 0
+EXE: Load command 3
+EXE:       cmd LC_SEGMENT_64
+EXE:   cmdsize 72
+EXE:   segname __LINKEDIT
+EXE:    vmaddr 0x000000010000c000
+EXE:    vmsize 0x0000000000004000
+EXE:   fileoff 49152
+EXE:  filesize 264
+EXE:   maxprot r--
+EXE:  initprot r--
+EXE:    nsects 0
+EXE:     flags (none)
+EXE: Load command 4
+EXE:             cmd LC_DYLD_INFO_ONLY
+EXE:         cmdsize 48
+EXE:      rebase_off 49152
+EXE:     rebase_size 8
+EXE:        bind_off 49160
+EXE:       bind_size 24
+EXE:   weak_bind_off 0
+EXE:  weak_bind_size 0
+EXE:   lazy_bind_off 49184
+EXE:  lazy_bind_size 16
+EXE:      export_off 49200
+EXE:     export_size 48
+EXE: Load command 5
+EXE:      cmd LC_SYMTAB
+EXE:  cmdsize 24
+EXE:   symoff 49280
+EXE:    nsyms 4
+EXE:   stroff 49360
+EXE:  strsize 56
+EXE: Load command 6
+EXE:             cmd LC_DYSYMTAB
+EXE:         cmdsize 80
+EXE:       ilocalsym 0
+EXE:       nlocalsym 0
+EXE:      iextdefsym 0
+EXE:      nextdefsym 2
+EXE:       iundefsym 2
+EXE:       nundefsym 2
+EXE:          tocoff 0
+EXE:            ntoc 0
+EXE:       modtaboff 0
+EXE:         nmodtab 0
+EXE:    extrefsymoff 0
+EXE:     nextrefsyms 0
+EXE:  indirectsymoff 49344
+EXE:   nindirectsyms 4
+EXE:       extreloff 0
+EXE:         nextrel 0
+EXE:       locreloff 0
+EXE:         nlocrel 0
+EXE: Load command 7
+EXE:           cmd LC_LOAD_DYLINKER
+EXE:       cmdsize 32
+EXE:          name /usr/lib/dyld (offset 12)
+EXE: Load command 8
+EXE:      cmd LC_UUID
+EXE:  cmdsize 24
+EXE:     uuid D687F888-CD3F-3276-8C94-BA3CCA21D820
+EXE: Load command 9
+EXE:       cmd LC_VERSION_MIN_IPHONEOS
+EXE:   cmdsize 16
+EXE:   version 9.0
+EXE:       sdk 9.0
+EXE: Load command 10
+EXE:       cmd LC_SOURCE_VERSION
+EXE:   cmdsize 16
+EXE:   version 0.0
+EXE: Load command 11
+EXE:        cmd LC_MAIN
+EXE:    cmdsize 24
+EXE:   entryoff 32568
+EXE:  stacksize 0
+EXE: Load command 12
+EXE:           cmd LC_ENCRYPTION_INFO_64
+EXE:       cmdsize 24
+EXE:      cryptoff 16384
+EXE:     cryptsize 16384
+EXE:       cryptid 0
+EXE:           pad 0
+EXE: Load command 13
+EXE:           cmd LC_LOAD_DYLIB
+EXE:       cmdsize 56
+EXE:          name /usr/lib/libSystem.B.dylib (offset 24)
+EXE:       current version 1215.0.0
+EXE: compatibility version 1.0.0
+EXE: Load command 14
+EXE:       cmd LC_FUNCTION_STARTS
+EXE:   cmdsize 16
+EXE:   dataoff 49248
+EXE:  datasize 8
+EXE: Load command 15
+EXE:       cmd LC_DATA_IN_CODE
+EXE:   cmdsize 16
+EXE:   dataoff 49256
+EXE:  datasize 0
+EXE: Load command 16
+EXE:       cmd LC_DYLIB_CODE_SIGN_DRS
+EXE:   cmdsize 16
+EXE:   dataoff 49256
+EXE:  datasize 24
diff --git a/test/tools/llvm-objdump/ARM/Inputs/data-in-code.macho-arm b/test/tools/llvm-objdump/ARM/Inputs/data-in-code.macho-arm
new file mode 100644
index 0000000..e826f29
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/Inputs/data-in-code.macho-arm
diff --git a/test/tools/llvm-objdump/ARM/macho-data-in-code.test b/test/tools/llvm-objdump/ARM/macho-data-in-code.test
new file mode 100644
index 0000000..1814dc0
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/macho-data-in-code.test
@@ -0,0 +1,8 @@
+RUN: llvm-objdump -m -data-in-code %p/Inputs/data-in-code.macho-arm | FileCheck %s
+
+CHECK: Data in code table (4 entries)
+CHECK: offset     length kind
+CHECK: 0x00000000      4 DATA
+CHECK: 0x00000004      4 JUMP_TABLE32
+CHECK: 0x00000008      2 JUMP_TABLE16
+CHECK: 0x0000000a      1 JUMP_TABLE8
diff --git a/test/tools/llvm-objdump/ARM/macho-private-headers.test b/test/tools/llvm-objdump/ARM/macho-private-headers.test
new file mode 100644
index 0000000..4ab3043
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/macho-private-headers.test
@@ -0,0 +1,345 @@
+// RUN: llvm-objdump -p %p/Inputs/hello.obj.macho-arm | FileCheck %s
+// RUN: llvm-objdump -p %p/Inputs/hello.exe.macho-arm \
+// RUN:     | FileCheck %s -check-prefix=EXE
+
+CHECK: Mach header
+CHECK:       magic cputype cpusubtype  caps    filetype ncmds sizeofcmds      flags
+CHECK:    MH_MAGIC     ARM         V7  0x00      OBJECT     3        568 SUBSECTIONS_VIA_SYMBOLS
+CHECK: Load command 0
+CHECK:       cmd LC_SEGMENT
+CHECK:   cmdsize 464
+CHECK:   segname 
+CHECK:    vmaddr 0x00000000
+CHECK:    vmsize 0x00000043
+CHECK:   fileoff 596
+CHECK:  filesize 67
+CHECK:   maxprot rwx
+CHECK:  initprot rwx
+CHECK:    nsects 6
+CHECK:     flags (none)
+CHECK: Section
+CHECK:   sectname __text
+CHECK:    segname __TEXT
+CHECK:       addr 0x00000000
+CHECK:       size 0x00000036
+CHECK:     offset 596
+CHECK:      align 2^2 (4)
+CHECK:     reloff 664
+CHECK:     nreloc 5
+CHECK:       type S_REGULAR
+CHECK: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+CHECK:  reserved1 0
+CHECK:  reserved2 0
+CHECK: Section
+CHECK:   sectname __textcoal_nt
+CHECK:    segname __TEXT
+CHECK:       addr 0x00000036
+CHECK:       size 0x00000000
+CHECK:     offset 650
+CHECK:      align 2^0 (1)
+CHECK:     reloff 0
+CHECK:     nreloc 0
+CHECK:       type S_COALESCED
+CHECK: attributes PURE_INSTRUCTIONS
+CHECK:  reserved1 0
+CHECK:  reserved2 0
+CHECK: Section
+CHECK:   sectname __const_coal
+CHECK:    segname __TEXT
+CHECK:       addr 0x00000036
+CHECK:       size 0x00000000
+CHECK:     offset 650
+CHECK:      align 2^0 (1)
+CHECK:     reloff 0
+CHECK:     nreloc 0
+CHECK:       type S_COALESCED
+CHECK: attributes (none)
+CHECK:  reserved1 0
+CHECK:  reserved2 0
+CHECK: Section
+CHECK:   sectname __picsymbolstub4
+CHECK:    segname __TEXT
+CHECK:       addr 0x00000036
+CHECK:       size 0x00000000
+CHECK:     offset 650
+CHECK:      align 2^0 (1)
+CHECK:     reloff 0
+CHECK:     nreloc 0
+CHECK:       type S_SYMBOL_STUBS
+CHECK: attributes (none)
+CHECK:  reserved1 0 (index into indirect symbol table)
+CHECK:  reserved2 16 (size of stubs)
+CHECK: Section
+CHECK:   sectname __StaticInit
+CHECK:    segname __TEXT
+CHECK:       addr 0x00000036
+CHECK:       size 0x00000000
+CHECK:     offset 650
+CHECK:      align 2^0 (1)
+CHECK:     reloff 0
+CHECK:     nreloc 0
+CHECK:       type S_REGULAR
+CHECK: attributes PURE_INSTRUCTIONS
+CHECK:  reserved1 0
+CHECK:  reserved2 0
+CHECK: Section
+CHECK:   sectname __cstring
+CHECK:    segname __TEXT
+CHECK:       addr 0x00000036
+CHECK:       size 0x0000000d
+CHECK:     offset 650
+CHECK:      align 2^0 (1)
+CHECK:     reloff 0
+CHECK:     nreloc 0
+CHECK:       type S_CSTRING_LITERALS
+CHECK: attributes (none)
+CHECK:  reserved1 0
+CHECK:  reserved2 0
+CHECK: Load command 1
+CHECK:      cmd LC_SYMTAB
+CHECK:  cmdsize 24
+CHECK:   symoff 704
+CHECK:    nsyms 2
+CHECK:   stroff 728
+CHECK:  strsize 16
+CHECK: Load command 2
+CHECK:             cmd LC_DYSYMTAB
+CHECK:         cmdsize 80
+CHECK:       ilocalsym 0
+CHECK:       nlocalsym 0
+CHECK:      iextdefsym 0
+CHECK:      nextdefsym 1
+CHECK:       iundefsym 1
+CHECK:       nundefsym 1
+CHECK:          tocoff 0
+CHECK:            ntoc 0
+CHECK:       modtaboff 0
+CHECK:         nmodtab 0
+CHECK:    extrefsymoff 0
+CHECK:     nextrefsyms 0
+CHECK:  indirectsymoff 0
+CHECK:   nindirectsyms 0
+CHECK:       extreloff 0
+CHECK:         nextrel 0
+CHECK:       locreloff 0
+CHECK:         nlocrel 0
+
+EXE: Mach header
+EXE:       magic cputype cpusubtype  caps    filetype ncmds sizeofcmds      flags
+EXE:    MH_MAGIC     ARM         V7  0x00     EXECUTE    17       1012   NOUNDEFS DYLDLINK TWOLEVEL PIE
+EXE: Load command 0
+EXE:       cmd LC_SEGMENT
+EXE:   cmdsize 56
+EXE:   segname __PAGEZERO
+EXE:    vmaddr 0x00000000
+EXE:    vmsize 0x00004000
+EXE:   fileoff 0
+EXE:  filesize 0
+EXE:   maxprot ---
+EXE:  initprot ---
+EXE:    nsects 0
+EXE:     flags (none)
+EXE: Load command 1
+EXE:       cmd LC_SEGMENT
+EXE:   cmdsize 328
+EXE:   segname __TEXT
+EXE:    vmaddr 0x00004000
+EXE:    vmsize 0x00008000
+EXE:   fileoff 0
+EXE:  filesize 32768
+EXE:   maxprot r-x
+EXE:  initprot r-x
+EXE:    nsects 4
+EXE:     flags (none)
+EXE: Section
+EXE:   sectname __text
+EXE:    segname __TEXT
+EXE:       addr 0x0000bf84
+EXE:       size 0x00000036
+EXE:     offset 32644
+EXE:      align 2^2 (4)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_REGULAR
+EXE: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __stub_helper
+EXE:    segname __TEXT
+EXE:       addr 0x0000bfbc
+EXE:       size 0x00000030
+EXE:     offset 32700
+EXE:      align 2^2 (4)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_REGULAR
+EXE: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __cstring
+EXE:    segname __TEXT
+EXE:       addr 0x0000bfec
+EXE:       size 0x0000000d
+EXE:     offset 32748
+EXE:      align 2^0 (1)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_CSTRING_LITERALS
+EXE: attributes (none)
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __symbolstub1
+EXE:    segname __TEXT
+EXE:       addr 0x0000bffc
+EXE:       size 0x00000004
+EXE:     offset 32764
+EXE:      align 2^2 (4)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_SYMBOL_STUBS
+EXE: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+EXE:  reserved1 0 (index into indirect symbol table)
+EXE:  reserved2 4 (size of stubs)
+EXE: Load command 2
+EXE:       cmd LC_SEGMENT
+EXE:   cmdsize 192
+EXE:   segname __DATA
+EXE:    vmaddr 0x0000c000
+EXE:    vmsize 0x00004000
+EXE:   fileoff 32768
+EXE:  filesize 16384
+EXE:   maxprot rw-
+EXE:  initprot rw-
+EXE:    nsects 2
+EXE:     flags (none)
+EXE: Section
+EXE:   sectname __lazy_symbol
+EXE:    segname __DATA
+EXE:       addr 0x0000c000
+EXE:       size 0x00000004
+EXE:     offset 32768
+EXE:      align 2^2 (4)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_LAZY_SYMBOL_POINTERS
+EXE: attributes (none)
+EXE:  reserved1 1 (index into indirect symbol table)
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __nl_symbol_ptr
+EXE:    segname __DATA
+EXE:       addr 0x0000c004
+EXE:       size 0x00000008
+EXE:     offset 32772
+EXE:      align 2^2 (4)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_NON_LAZY_SYMBOL_POINTERS
+EXE: attributes (none)
+EXE:  reserved1 2 (index into indirect symbol table)
+EXE:  reserved2 0
+EXE: Load command 3
+EXE:       cmd LC_SEGMENT
+EXE:   cmdsize 56
+EXE:   segname __LINKEDIT
+EXE:    vmaddr 0x00010000
+EXE:    vmsize 0x00004000
+EXE:   fileoff 49152
+EXE:  filesize 256
+EXE:   maxprot r--
+EXE:  initprot r--
+EXE:    nsects 0
+EXE:     flags (none)
+EXE: Load command 4
+EXE:             cmd LC_DYLD_INFO_ONLY
+EXE:         cmdsize 48
+EXE:      rebase_off 49152
+EXE:     rebase_size 4
+EXE:        bind_off 49156
+EXE:       bind_size 24
+EXE:   weak_bind_off 0
+EXE:  weak_bind_size 0
+EXE:   lazy_bind_off 49180
+EXE:  lazy_bind_size 16
+EXE:      export_off 49196
+EXE:     export_size 44
+EXE: Load command 5
+EXE:      cmd LC_SYMTAB
+EXE:  cmdsize 24
+EXE:   symoff 49264
+EXE:    nsyms 5
+EXE:   stroff 49340
+EXE:  strsize 68
+EXE: Load command 6
+EXE:             cmd LC_DYSYMTAB
+EXE:         cmdsize 80
+EXE:       ilocalsym 0
+EXE:       nlocalsym 1
+EXE:      iextdefsym 1
+EXE:      nextdefsym 2
+EXE:       iundefsym 3
+EXE:       nundefsym 2
+EXE:          tocoff 0
+EXE:            ntoc 0
+EXE:       modtaboff 0
+EXE:         nmodtab 0
+EXE:    extrefsymoff 0
+EXE:     nextrefsyms 0
+EXE:  indirectsymoff 49324
+EXE:   nindirectsyms 4
+EXE:       extreloff 0
+EXE:         nextrel 0
+EXE:       locreloff 0
+EXE:         nlocrel 0
+EXE: Load command 7
+EXE:           cmd LC_LOAD_DYLINKER
+EXE:       cmdsize 28
+EXE:          name /usr/lib/dyld (offset 12)
+EXE: Load command 8
+EXE:      cmd LC_UUID
+EXE:  cmdsize 24
+EXE:     uuid C2D9351C-1EF1-330B-A2AB-EED6CF7D2C5D
+EXE: Load command 9
+EXE:      cmd LC_VERSION_MIN_IPHONEOS
+EXE:  cmdsize 16
+EXE:  version 8.0
+EXE:      sdk 8.0
+EXE: Load command 10
+EXE:       cmd LC_SOURCE_VERSION
+EXE:   cmdsize 16
+EXE:   version 0.0
+EXE: Load command 11
+EXE:        cmd LC_MAIN
+EXE:    cmdsize 24
+EXE:   entryoff 32645
+EXE:  stacksize 0
+EXE: Load command 12
+EXE:          cmd LC_ENCRYPTION_INFO
+EXE:      cmdsize 20
+EXE:     cryptoff 16384
+EXE:    cryptsize 16384
+EXE:      cryptid 0
+EXE: Load command 13
+EXE:           cmd LC_LOAD_DYLIB
+EXE:       cmdsize 52
+EXE:          name /usr/lib/libSystem.B.dylib (offset 24)
+EXE:       current version 1213.0.0
+EXE: compatibility version 1.0.0
+EXE: Load command 14
+EXE:       cmd LC_FUNCTION_STARTS
+EXE:   cmdsize 16
+EXE:   dataoff 49240
+EXE:  datasize 4
+EXE: Load command 15
+EXE:       cmd LC_DATA_IN_CODE
+EXE:   cmdsize 16
+EXE:   dataoff 49244
+EXE:  datasize 0
+EXE: Load command 16
+EXE:       cmd LC_DYLIB_CODE_SIGN_DRS
+EXE:   cmdsize 16
+EXE:   dataoff 49244
+EXE:  datasize 20
diff --git a/test/tools/llvm-objdump/Inputs/common-symbol-elf b/test/tools/llvm-objdump/Inputs/common-symbol-elf
new file mode 100644
index 0000000..9609edb
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/common-symbol-elf
diff --git a/test/tools/llvm-objdump/Inputs/proc-specific-section-elf b/test/tools/llvm-objdump/Inputs/proc-specific-section-elf
new file mode 100644
index 0000000..7c3d613
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/proc-specific-section-elf
diff --git a/test/tools/llvm-objdump/X86/Inputs/dylibModInit.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/dylibModInit.macho-x86_64
new file mode 100755
index 0000000..a39424a
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/dylibModInit.macho-x86_64
diff --git a/test/tools/llvm-objdump/X86/Inputs/dylibRoutines.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/dylibRoutines.macho-x86_64
new file mode 100755
index 0000000..3568045
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/dylibRoutines.macho-x86_64
diff --git a/test/tools/llvm-objdump/X86/Inputs/dylibSubClient.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/dylibSubClient.macho-x86_64
new file mode 100755
index 0000000..e7f9542
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/dylibSubClient.macho-x86_64
diff --git a/test/tools/llvm-objdump/X86/Inputs/dylibSubFramework.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/dylibSubFramework.macho-x86_64
new file mode 100755
index 0000000..3036c27
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/dylibSubFramework.macho-x86_64
diff --git a/test/tools/llvm-objdump/X86/Inputs/dylibSubLibrary.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/dylibSubLibrary.macho-x86_64
new file mode 100755
index 0000000..dafee5f
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/dylibSubLibrary.macho-x86_64
diff --git a/test/tools/llvm-objdump/X86/Inputs/dylibSubUmbrella.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/dylibSubUmbrella.macho-x86_64
new file mode 100755
index 0000000..1e42a4f
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/dylibSubUmbrella.macho-x86_64
diff --git a/test/tools/llvm-objdump/X86/Inputs/exeThread.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/exeThread.macho-x86_64
new file mode 100755
index 0000000..93fe1db
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/exeThread.macho-x86_64
diff --git a/test/tools/llvm-objdump/X86/Inputs/linkerOption.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/linkerOption.macho-x86_64
new file mode 100644
index 0000000..38053c5
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/linkerOption.macho-x86_64
diff --git a/test/tools/llvm-objdump/X86/Inputs/macho-universal-archive.x86_64.i386 b/test/tools/llvm-objdump/X86/Inputs/macho-universal-archive.x86_64.i386
new file mode 100644
index 0000000..1660714
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/macho-universal-archive.x86_64.i386
diff --git a/test/tools/llvm-objdump/X86/Inputs/macho-universal.x86_64.i386 b/test/tools/llvm-objdump/X86/Inputs/macho-universal.x86_64.i386
new file mode 100755
index 0000000..36d5fc2
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/macho-universal.x86_64.i386
diff --git a/test/tools/llvm-objdump/X86/macho-archive-headers.test b/test/tools/llvm-objdump/X86/macho-archive-headers.test
new file mode 100644
index 0000000..3d9043e
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-archive-headers.test
@@ -0,0 +1,10 @@
+RUN: llvm-objdump %p/Inputs/macho-universal-archive.x86_64.i386 -macho -archive-headers -arch all \
+RUN: | FileCheck %s
+
+# Note the date as printed by ctime(3) is time zone dependent and not checked.
+CHECK: Archive : {{.*}}/macho-universal-archive.x86_64.i386 (architecture x86_64)
+CHECK: -rw-r--r--124/11     44 {{.*}} __.SYMDEF SORTED
+CHECK: -rw-r--r--124/0     860 {{.*}} hello.o
+CHECK: Archive : {{.*}}/macho-universal-archive.x86_64.i386 (architecture i386)
+CHECK: -rw-r--r--124/11     60 {{.*}} __.SYMDEF SORTED
+CHECK: -rw-r--r--124/0     388 {{.*}} foo.o
diff --git a/test/tools/llvm-objdump/X86/macho-cstring-dump.test b/test/tools/llvm-objdump/X86/macho-cstring-dump.test
new file mode 100644
index 0000000..3dfa4e3
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-cstring-dump.test
@@ -0,0 +1,4 @@
+RUN: llvm-objdump -m -section __TEXT,__cstring %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
+
+CHECK: Contents of (__TEXT,__cstring) section
+CHECK: 000000000000003b  Hello world\n
diff --git a/test/tools/llvm-objdump/X86/macho-indirect-symbols.test b/test/tools/llvm-objdump/X86/macho-indirect-symbols.test
new file mode 100644
index 0000000..4f3af18
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-indirect-symbols.test
@@ -0,0 +1,12 @@
+RUN: llvm-objdump -macho -indirect-symbols %p/Inputs/hello.exe.macho-x86_64 | FileCheck %s
+
+CHECK: Indirect symbols for (__TEXT,__stubs) 1 entries
+CHECK: address            index name
+CHECK: 0x0000000100000f6c     2 _printf
+CHECK: Indirect symbols for (__DATA,__nl_symbol_ptr) 2 entries
+CHECK: address            index name
+CHECK: 0x0000000100001000     3 dyld_stub_binder
+CHECK: 0x0000000100001008 ABSOLUTE
+CHECK: Indirect symbols for (__DATA,__la_symbol_ptr) 1 entries
+CHECK: address            index name
+CHECK: 0x0000000100001010     2 _printf
diff --git a/test/tools/llvm-objdump/X86/macho-literal-pointers-i386.test b/test/tools/llvm-objdump/X86/macho-literal-pointers-i386.test
new file mode 100644
index 0000000..0069668
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-literal-pointers-i386.test
@@ -0,0 +1,34 @@
+# RUN: llvm-mc < %s -triple i386-apple-darwin -filetype=obj | llvm-objdump -m -section __DATA,__litp - | FileCheck %s
+
+.cstring
+L1: .asciz "Hello world\n"
+
+.literal4
+.align 2
+L2: .float 4.0
+
+.literal8
+.align 3
+L3: .double 8.0
+
+.literal16
+.align 4
+L4: .long 0x10000016, 0x20000016, 0x30000016, 0x40000016
+
+.const
+L5: .asciz "const non-literal string"
+
+.section __DATA,__litp, literal_pointers
+.align 2
+.long L1
+.long L2
+.long L3
+.long L4
+.long L5
+
+# CHECK: Contents of (__DATA,__litp) section
+# CHECK: 0000004c  __TEXT:__cstring:Hello world\n
+# CHECK: 00000050  __TEXT:__literal4:0x40800000
+# CHECK: 00000054  __TEXT:__literal8:0x00000000 0x40200000
+# CHECK: 00000058  __TEXT:__literal16:0x10000016 0x20000016 0x30000016 0x40000016
+# CHECK: 0000005c  0x30 (not in a literal section)
diff --git a/test/tools/llvm-objdump/X86/macho-literal-pointers-x86_64.test b/test/tools/llvm-objdump/X86/macho-literal-pointers-x86_64.test
new file mode 100644
index 0000000..b403b81
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-literal-pointers-x86_64.test
@@ -0,0 +1,34 @@
+# RUN: llvm-mc < %s -triple x86_64-apple-darwin -filetype=obj | llvm-objdump -m -section __DATA,__litp - | FileCheck %s
+
+.cstring
+L1: .asciz "Hello world\n"
+
+.literal4
+.align 2
+L2: .float 4.0
+
+.literal8
+.align 3
+L3: .double 8.0
+
+.literal16
+.align 4
+L4: .long 0x10000016, 0x20000016, 0x30000016, 0x40000016
+
+.const
+L5: .asciz "const non-literal string"
+
+.section __DATA,__litp, literal_pointers
+.align 3
+.quad L1
+.quad L2
+.quad L3
+.quad L4
+.quad L5
+
+# CHECK: Contents of (__DATA,__litp) section
+# CHECK: 0000000000000050  __TEXT:__cstring:Hello world\n
+# CHECK: 0000000000000058  __TEXT:__literal4:0x40800000
+# CHECK: 0000000000000060  __TEXT:__literal8:0x00000000 0x40200000
+# CHECK: 0000000000000068  __TEXT:__literal16:0x10000016 0x20000016 0x30000016 0x40000016
+# CHECK: 0000000000000070  0x30 (not in a literal section)
diff --git a/test/tools/llvm-objdump/X86/macho-literals.test b/test/tools/llvm-objdump/X86/macho-literals.test
new file mode 100644
index 0000000..4824453
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-literals.test
@@ -0,0 +1,48 @@
+# RUN: llvm-mc < %s -triple x86_64-apple-darwin -filetype=obj | llvm-objdump -m -section __TEXT,__literal4 - | FileCheck %s -check-prefix=CHECK-LIT4
+# RUN: llvm-mc < %s -triple x86_64-apple-darwin -filetype=obj | llvm-objdump -m -section __TEXT,__literal8 - | FileCheck %s -check-prefix=CHECK-LIT8
+# RUN: llvm-mc < %s -triple x86_64-apple-darwin -filetype=obj | llvm-objdump -m -section __TEXT,__literal16 - | FileCheck %s -check-prefix=CHECK-LIT16
+
+.literal4
+.float 2.5
+.float 8.25
+.long 0x7f800000
+.long 0xff800000
+.long 0x7fc00000
+.long 0x7f800001
+
+# CHECK-LIT4: Contents of (__TEXT,__literal4) section
+# CHECK-LIT4: 0000000000000000  0x40200000
+# CHECK-LIT4: 0000000000000004  0x41040000
+# CHECK-LIT4: 0000000000000008  0x7f800000
+# CHECK-LIT4: 000000000000000c  0xff800000
+# CHECK-LIT4: 0000000000000010  0x7fc00000
+# CHECK-LIT4: 0000000000000014  0x7f800001
+
+.literal8
+.double 2.5
+.double 8.25
+.long 0
+.long 0x7ff00000
+.long 0
+.long 0xfff00000
+.long 0
+.long 0x7ff80000
+.long 1
+.long 0x7ff00000
+
+# CHECK-LIT8: Contents of (__TEXT,__literal8) section
+# CHECK-LIT8: 0000000000000018  0x00000000 0x40040000
+# CHECK-LIT8: 0000000000000020  0x00000000 0x40208000
+# CHECK-LIT8: 0000000000000028  0x00000000 0x7ff00000
+# CHECK-LIT8: 0000000000000030  0x00000000 0xfff00000
+# CHECK-LIT8: 0000000000000038  0x00000000 0x7ff80000
+# CHECK-LIT8: 0000000000000040  0x00000001 0x7ff00000
+
+.literal16
+.long 1
+.long 2
+.long 3
+.long 4
+
+# CHECK-LIT16: Contents of (__TEXT,__literal16) section
+# CHECK-LIT16: 0000000000000050  0x00000001 0x00000002 0x00000003 0x00000004
diff --git a/test/tools/llvm-objdump/X86/macho-nontext-disasm.test b/test/tools/llvm-objdump/X86/macho-nontext-disasm.test
new file mode 100644
index 0000000..27b7bb4
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-nontext-disasm.test
@@ -0,0 +1,9 @@
+# RUN: llvm-mc < %s -triple x86_64-apple-darwin -filetype=obj | llvm-objdump -m -section __FOO,__bar -full-leading-addr -print-imm-hex -no-show-raw-insn - | FileCheck %s
+
+.section __FOO, __bar
+_foo:
+	nop
+
+# CHECK: Contents of (__FOO,__bar) section
+# CHECK: _foo:
+# CHECK: 0000000000000000	nop
diff --git a/test/tools/llvm-objdump/X86/macho-private-headers.test b/test/tools/llvm-objdump/X86/macho-private-headers.test
index 685b4f7..c80bb08 100644
--- a/test/tools/llvm-objdump/X86/macho-private-headers.test
+++ b/test/tools/llvm-objdump/X86/macho-private-headers.test
@@ -3,6 +3,22 @@
 // RUN:     | FileCheck %s -check-prefix=EXE
 // RUN: llvm-objdump -p %p/Inputs/dylibLoadKinds.macho-x86_64 \
 // RUN:     | FileCheck %s -check-prefix=LOAD
+// RUN: llvm-objdump -p %p/Inputs/linkerOption.macho-x86_64 \
+// RUN:     | FileCheck %s -check-prefix=LD_OPT
+// RUN: llvm-objdump -p %p/Inputs/dylibSubFramework.macho-x86_64 \
+// RUN:     | FileCheck %s -check-prefix=SUB_FRAME
+// RUN: llvm-objdump -p %p/Inputs/dylibSubUmbrella.macho-x86_64 \
+// RUN:     | FileCheck %s -check-prefix=SUB_UMB
+// RUN: llvm-objdump -p %p/Inputs/dylibSubLibrary.macho-x86_64 \
+// RUN:     | FileCheck %s -check-prefix=SUB_LIB
+// RUN: llvm-objdump -p %p/Inputs/dylibSubClient.macho-x86_64 \
+// RUN:     | FileCheck %s -check-prefix=SUB_CLI
+// RUN: llvm-objdump -p %p/Inputs/dylibRoutines.macho-x86_64 \
+// RUN:     | FileCheck %s -check-prefix=ROUTINE
+// RUN: llvm-objdump -p %p/Inputs/exeThread.macho-x86_64 \
+// RUN:     | FileCheck %s -check-prefix=THREAD
+// RUN: llvm-objdump -macho -p -arch i386 %p/Inputs/macho-universal.x86_64.i386 \
+// RUN:     | FileCheck %s -check-prefix=FATi386
 
 CHECK: Mach header
 CHECK:       magic cputype cpusubtype  caps    filetype ncmds sizeofcmds      flags
@@ -366,3 +382,64 @@ LOAD:          name /usr/lib/foo4.dylib (offset 24)
 LOAD:       current version 0.0.0
 LOAD: compatibility version 0.0.0
 
+LD_OPT: Load command 4
+LD_OPT:      cmd LC_LINKER_OPTION
+LD_OPT:  cmdsize 24
+LD_OPT:    count 1
+LD_OPT:   string #1 -lc++
+LD_OPT: Load command 5
+LD_OPT:      cmd LC_LINKER_OPTION
+LD_OPT:  cmdsize 40
+LD_OPT:    count 2
+LD_OPT:   string #1 -framework
+LD_OPT:   string #2 Foundation
+
+SUB_FRAME: Load command 10
+SUB_FRAME:           cmd LC_SUB_FRAMEWORK
+SUB_FRAME:       cmdsize 16
+SUB_FRAME:      umbrella Bar (offset 12)
+
+SUB_UMB: Load command 5
+SUB_UMB:           cmd LC_SUB_UMBRELLA
+SUB_UMB:       cmdsize 16
+SUB_UMB:  sub_umbrella Foo (offset 12)
+
+SUB_LIB: Load command 5
+SUB_LIB:           cmd LC_SUB_LIBRARY
+SUB_LIB:       cmdsize 20
+SUB_LIB:   sub_library libfoo (offset 12)
+
+SUB_CLI: Load command 10
+SUB_CLI:           cmd LC_SUB_CLIENT
+SUB_CLI:       cmdsize 16
+SUB_CLI:        client bar (offset 12)
+
+ROUTINE: Load command 6
+ROUTINE:           cmd LC_ROUTINES_64
+ROUTINE:       cmdsize 72
+ROUTINE:  init_address 0x0000000000000f80
+ROUTINE:   init_module 0
+ROUTINE:     reserved1 0
+ROUTINE:     reserved2 0
+ROUTINE:     reserved3 0
+ROUTINE:     reserved4 0
+ROUTINE:     reserved5 0
+ROUTINE:     reserved6 0
+
+THREAD: Load command 10
+THREAD:         cmd LC_UNIXTHREAD
+THREAD:     cmdsize 184
+THREAD:      flavor x86_THREAD_STATE64
+THREAD:       count x86_THREAD_STATE64_COUNT
+THREAD:    rax  0x0000000000000000 rbx 0x0000000000000000 rcx  0x0000000000000000
+THREAD:    rdx  0x0000000000000000 rdi 0x0000000000000000 rsi  0x0000000000000000
+THREAD:    rbp  0x0000000000000000 rsp 0x0000000000000000 r8   0x0000000000000000
+THREAD:     r9  0x0000000000000000 r10 0x0000000000000000 r11  0x0000000000000000
+THREAD:    r12  0x0000000000000000 r13 0x0000000000000000 r14  0x0000000000000000
+THREAD:    r15  0x0000000000000000 rip 0x0000000100000d00
+THREAD: rflags  0x0000000000000000 cs  0x0000000000000000 fs   0x0000000000000000
+THREAD:     gs  0x0000000000000000
+
+FATi386: Mach header
+FATi386:       magic cputype cpusubtype  caps    filetype ncmds sizeofcmds      flags
+FATi386:    MH_MAGIC    I386        ALL  0x00     EXECUTE    16        716   NOUNDEFS DYLDLINK TWOLEVEL PIE MH_NO_HEAP_EXECUTION
diff --git a/test/tools/llvm-objdump/X86/macho-relocations.test b/test/tools/llvm-objdump/X86/macho-relocations.test
new file mode 100644
index 0000000..536aec8
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-relocations.test
@@ -0,0 +1,7 @@
+RUN: llvm-objdump -macho -r %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
+
+CHECK: RELOCATION RECORDS FOR [__text]:
+CHECK: 0000000000000027 X86_64_RELOC_BRANCH _printf
+CHECK: 000000000000000b X86_64_RELOC_SIGNED L_.str
+CHECK: RELOCATION RECORDS FOR [__compact_unwind]:
+CHECK: 0000000000000000 X86_64_RELOC_UNSIGNED __text
diff --git a/test/tools/llvm-objdump/X86/macho-section-contents.test b/test/tools/llvm-objdump/X86/macho-section-contents.test
new file mode 100644
index 0000000..f62b5a7
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-section-contents.test
@@ -0,0 +1,17 @@
+RUN: llvm-objdump -macho -s %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
+
+CHECK: Contents of section __text:
+CHECK:  0000 554889e5 4883ec20 488d0500 000000c7  UH..H.. H.......
+CHECK:  0010 45fc0000 0000897d f8488975 f0488955  E......}.H.u.H.U
+CHECK:  0020 e84889c7 b000e800 000000b9 00000000  .H..............
+CHECK:  0030 8945e489 c84883c4 205dc3             .E...H.. ].
+CHECK: Contents of section __cstring:
+CHECK:  003b 48656c6c 6f20776f 726c640a 00        Hello world..
+CHECK: Contents of section __compact_unwind:
+CHECK:  0048 00000000 00000000 3b000000 00000001  ........;.......
+CHECK:  0058 00000000 00000000 00000000 00000000  ................
+CHECK: Contents of section __eh_frame:
+CHECK:  0068 14000000 00000000 017a5200 01781001  .........zR..x..
+CHECK:  0078 100c0708 90010000 24000000 1c000000  ........$.......
+CHECK:  0088 78ffffff ffffffff 3b000000 00000000  x.......;.......
+CHECK:  0098 00410e10 8602430d 06000000 00000000  .A....C.........
diff --git a/test/tools/llvm-objdump/X86/macho-section-headers.test b/test/tools/llvm-objdump/X86/macho-section-headers.test
new file mode 100644
index 0000000..5159d18
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-section-headers.test
@@ -0,0 +1,8 @@
+RUN: llvm-objdump -macho -h %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
+
+CHECK: Sections:
+CHECK: Idx Name          Size      Address          Type
+CHECK:   0 __text        0000003b 0000000000000000 TEXT 
+CHECK:   1 __cstring     0000000d 000000000000003b DATA 
+CHECK:   2 __compact_unwind 00000020 0000000000000048 DATA 
+CHECK:   3 __eh_frame    00000040 0000000000000068 DATA 
diff --git a/test/tools/llvm-objdump/X86/macho-section.test b/test/tools/llvm-objdump/X86/macho-section.test
new file mode 100644
index 0000000..720b9a4
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-section.test
@@ -0,0 +1,4 @@
+// RUN: llvm-objdump -m -section __DATA,__mod_init_func %p/Inputs/dylibModInit.macho-x86_64 | FileCheck %s -check-prefix=MODINIT
+
+MODINIT: Contents of (__DATA,__mod_init_func) section
+MODINIT: 0x0000000000001000 0x0000000000000f30 _libinit
diff --git a/test/tools/llvm-objdump/X86/macho-symbol-table.test b/test/tools/llvm-objdump/X86/macho-symbol-table.test
new file mode 100644
index 0000000..5b9a4f8
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-symbol-table.test
@@ -0,0 +1,8 @@
+RUN: llvm-objdump -macho -t %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
+
+CHECK: SYMBOL TABLE:
+CHECK: 000000000000003b l     F __TEXT,__cstring	0000000d L_.str
+CHECK: 0000000000000068 l     F __TEXT,__eh_frame	00000018 EH_frame0
+CHECK: 0000000000000000 g     F __TEXT,__text	0000003b _main
+CHECK: 0000000000000080 g     F __TEXT,__eh_frame	00000028 _main.eh
+CHECK: 0000000000000000         *UND*	00000000 _printf
diff --git a/test/tools/llvm-objdump/X86/macho-universal-x86_64.i386.test b/test/tools/llvm-objdump/X86/macho-universal-x86_64.i386.test
new file mode 100644
index 0000000..e4fd37a
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-universal-x86_64.i386.test
@@ -0,0 +1,44 @@
+RUN: llvm-objdump %p/Inputs/macho-universal.x86_64.i386 -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex -arch all \
+RUN: | FileCheck %s -check-prefix UEXE-all
+RUN: llvm-objdump %p/Inputs/macho-universal-archive.x86_64.i386 -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex -arch i386 \
+RUN: | FileCheck %s -check-prefix UArchive-i386
+RUN: llvm-objdump %p/Inputs/macho-universal.x86_64.i386 -universal-headers -m \
+RUN: | FileCheck %s -check-prefix FAT
+
+UEXE-all: macho-universal.x86_64.i386 (architecture x86_64):
+UEXE-all: (__TEXT,__text) section
+UEXE-all: _main:
+UEXE-all: 0000000100000f60	pushq	%rbp
+UEXE-all: 0000000100000f61	movq	%rsp, %rbp
+UEXE-all: macho-universal.x86_64.i386 (architecture i386):
+UEXE-all: (__TEXT,__text) section
+UEXE-all: _main:
+UEXE-all: 00001fa0	pushl	%ebp
+UEXE-all: 00001fa1	movl	%esp, %ebp
+
+UArchive-i386: Archive : {{.*}}/macho-universal-archive.x86_64.i386
+UArchive-i386: macho-universal-archive.x86_64.i386(foo.o):
+UArchive-i386: (__TEXT,__text) section
+UArchive-i386: _foo:
+UArchive-i386: 00000000	pushl	%ebp
+UArchive-i386: 00000001	movl	%esp, %ebp
+UArchive-i386: 00000003	popl	%ebp
+UArchive-i386: 00000004	retl
+
+FAT: Fat headers
+FAT: fat_magic FAT_MAGIC
+FAT: nfat_arch 2
+FAT: architecture x86_64
+FAT:     cputype CPU_TYPE_X86_64
+FAT:     cpusubtype CPU_SUBTYPE_X86_64_ALL
+FAT:     capabilities CPU_SUBTYPE_LIB64
+FAT:     offset 4096
+FAT:     size 4360
+FAT:     align 2^12 (4096)
+FAT: architecture i386
+FAT:     cputype CPU_TYPE_I386
+FAT:     cpusubtype CPU_SUBTYPE_I386_ALL
+FAT:     capabilities 0x0
+FAT:     offset 12288
+FAT:     size 4336
+FAT:     align 2^12 (4096)
diff --git a/test/tools/llvm-objdump/X86/macho-unwind-info.test b/test/tools/llvm-objdump/X86/macho-unwind-info.test
new file mode 100644
index 0000000..33db84f
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-unwind-info.test
@@ -0,0 +1,7 @@
+RUN: llvm-objdump -macho -u %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
+
+CHECK: Contents of __compact_unwind section:
+CHECK:   Entry at offset 0x0:
+CHECK:     start:                0x0 _main
+CHECK:     length:               0x3b
+CHECK:     compact encoding:     0x01000000
diff --git a/test/tools/llvm-objdump/common-symbol-elf.test b/test/tools/llvm-objdump/common-symbol-elf.test
new file mode 100644
index 0000000..32df05a
--- /dev/null
+++ b/test/tools/llvm-objdump/common-symbol-elf.test
@@ -0,0 +1,3 @@
+// RUN: llvm-objdump -t %p/Inputs/common-symbol-elf | FileCheck %s
+
+CHECK: 00000400 g       *COM*  00000008 common_symbol
diff --git a/test/tools/llvm-objdump/proc-specific-section-elf.test b/test/tools/llvm-objdump/proc-specific-section-elf.test
new file mode 100644
index 0000000..b3067d4
--- /dev/null
+++ b/test/tools/llvm-objdump/proc-specific-section-elf.test
@@ -0,0 +1,3 @@
+// RUN: llvm-objdump -t %p/Inputs/proc-specific-section-elf | FileCheck %s
+
+CHECK: 00000000         *UND*  00000000 print
diff --git a/test/tools/llvm-readobj/ARM/attribute-0.s b/test/tools/llvm-readobj/ARM/attribute-0.s
new file mode 100644
index 0000000..b761dd8
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-0.s
@@ -0,0 +1,234 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 0
+@CHECK:   .eabi_attribute 6, 0
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: Pre-v4
+
+.eabi_attribute  Tag_CPU_arch_profile, 0
+@CHECK:   .eabi_attribute 7, 0
+@CHECK-OBJ: Tag: 7
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: CPU_arch_profile
+@CHECK-OBJ-NEXT: Description: None
+
+.eabi_attribute  Tag_ARM_ISA_use, 0
+@CHECK:   .eabi_attribute 8, 0
+@CHECK-OBJ: Tag: 8
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ARM_ISA_use
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_THUMB_ISA_use, 0
+@CHECK:   .eabi_attribute 9, 0
+@CHECK-OBJ: Tag: 9
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: THUMB_ISA_use
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_FP_arch, 0
+@CHECK:   .eabi_attribute 10, 0
+@CHECK-OBJ: Tag: 10
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: FP_arch
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_WMMX_arch, 0
+@CHECK:   .eabi_attribute 11, 0
+@CHECK-OBJ: Tag: 11
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: WMMX_arch
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_Advanced_SIMD_arch, 0
+@CHECK:   .eabi_attribute 12, 0
+@CHECK-OBJ: Tag: 12
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: Advanced_SIMD_arch
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_PCS_config, 0
+@CHECK:   .eabi_attribute 13, 0
+@CHECK-OBJ: Tag: 13
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: PCS_config
+@CHECK-OBJ-NEXT: Description: None
+
+.eabi_attribute  Tag_ABI_PCS_R9_use, 0
+@CHECK:   .eabi_attribute 14, 0
+@CHECK-OBJ: Tag: 14
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_R9_use
+@CHECK-OBJ-NEXT: Description: v6
+
+.eabi_attribute  Tag_ABI_PCS_RW_data, 0
+@CHECK:   .eabi_attribute 15, 0
+@CHECK-OBJ: Tag: 15
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_RW_data
+@CHECK-OBJ-NEXT: Description: Absolute
+
+.eabi_attribute  Tag_ABI_PCS_RO_data, 0
+@CHECK:   .eabi_attribute 16, 0
+@CHECK-OBJ: Tag: 16
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_RO_data
+@CHECK-OBJ-NEXT: Description: Absolute
+
+.eabi_attribute  Tag_ABI_PCS_GOT_use, 0
+@CHECK:   .eabi_attribute 17, 0
+@CHECK-OBJ: Tag: 17
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_GOT_use
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_ABI_PCS_wchar_t, 0
+@CHECK:   .eabi_attribute 18, 0
+@CHECK-OBJ: Tag: 18
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_wchar_t
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_ABI_FP_rounding, 0
+@CHECK:   .eabi_attribute 19, 0
+@CHECK-OBJ: Tag: 19
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_FP_rounding
+@CHECK-OBJ-NEXT: Description: IEEE-754
+
+.eabi_attribute  Tag_ABI_FP_denormal, 0
+@CHECK:   .eabi_attribute 20, 0
+@CHECK-OBJ: Tag: 20
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_FP_denormal
+@CHECK-OBJ-NEXT: Description: Unsupported
+
+.eabi_attribute  Tag_ABI_FP_exceptions, 0
+@CHECK:   .eabi_attribute 21, 0
+@CHECK-OBJ: Tag: 21
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_FP_exceptions
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_ABI_FP_user_exceptions, 0
+@CHECK:   .eabi_attribute 22, 0
+@CHECK-OBJ: Tag: 22
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_FP_user_exceptions
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_ABI_FP_number_model, 0
+@CHECK:   .eabi_attribute 23, 0
+@CHECK-OBJ: Tag: 23
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_FP_number_model
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_ABI_align_needed, 0
+@CHECK:   .eabi_attribute 24, 0
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_ABI_align_preserved, 0
+@CHECK:   .eabi_attribute 25, 0
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: Not Required
+
+.eabi_attribute  Tag_ABI_enum_size, 0
+@CHECK:   .eabi_attribute 26, 0
+@CHECK-OBJ: Tag: 26
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_enum_size
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_ABI_HardFP_use, 0
+@CHECK:   .eabi_attribute 27, 0
+@CHECK-OBJ: Tag: 27
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_HardFP_use
+@CHECK-OBJ-NEXT: Description: Tag_FP_arch
+
+.eabi_attribute  Tag_ABI_VFP_args, 0
+@CHECK:   .eabi_attribute 28, 0
+@CHECK-OBJ: Tag: 28
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_VFP_args
+@CHECK-OBJ-NEXT: Description: AAPCS
+
+.eabi_attribute  Tag_ABI_WMMX_args, 0
+@CHECK:   .eabi_attribute 29, 0
+@CHECK-OBJ: Tag: 29
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_WMMX_args
+@CHECK-OBJ-NEXT: Description: AAPCS
+
+.eabi_attribute  Tag_ABI_optimization_goals, 0
+@CHECK:   .eabi_attribute 30, 0
+@CHECK-OBJ: Tag: 30
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_optimization_goals
+@CHECK-OBJ-NEXT: Description: None
+
+.eabi_attribute  Tag_ABI_FP_optimization_goals, 0
+@CHECK:   .eabi_attribute 31, 0
+@CHECK-OBJ: Tag: 31
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_FP_optimization_goals
+@CHECK-OBJ-NEXT: Description: None
+
+.eabi_attribute  Tag_compatibility, 0, "ARM"
+@CHECK:   .eabi_attribute 32, 0
+@CHECK-OBJ: Tag: 32
+@CHECK-OBJ-NEXT: Value: 0, ARM
+@CHECK-OBJ-NEXT: TagName: compatibility
+@CHECK-OBJ-NEXT: Description: No Specific Requirements
+
+.eabi_attribute  Tag_CPU_unaligned_access, 0
+@CHECK:   .eabi_attribute 34, 0
+@CHECK-OBJ: Tag: 34
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: CPU_unaligned_access
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_FP_HP_extension, 0
+@CHECK:   .eabi_attribute 36, 0
+@CHECK-OBJ: Tag: 36
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: FP_HP_extension
+@CHECK-OBJ-NEXT: Description: If Available
+
+.eabi_attribute  Tag_ABI_FP_16bit_format, 0
+@CHECK:   .eabi_attribute 38, 0
+@CHECK-OBJ: Tag: 38
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: ABI_FP_16bit_format
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_MPextension_use, 0
+@CHECK:   .eabi_attribute 42, 0
+@CHECK-OBJ: Tag: 42
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: MPextension_use
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_DIV_use, 0
+@CHECK:   .eabi_attribute 44, 0
+@CHECK-OBJ: Tag: 44
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: DIV_use
+@CHECK-OBJ-NEXT: Description: If Available
+
+.eabi_attribute  Tag_Virtualization_use, 0
+@CHECK:   .eabi_attribute 68, 0
+@CHECK-OBJ: Tag: 68
+@CHECK-OBJ-NEXT: Value: 0
+@CHECK-OBJ-NEXT: TagName: Virtualization_use
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-1.s b/test/tools/llvm-readobj/ARM/attribute-1.s
new file mode 100644
index 0000000..f433cbc
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-1.s
@@ -0,0 +1,220 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 1
+@CHECK:   .eabi_attribute 6, 1
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v4
+
+.eabi_attribute  Tag_ARM_ISA_use, 1
+@CHECK:   .eabi_attribute 8, 1
+@CHECK-OBJ: Tag: 8
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ARM_ISA_use
+@CHECK-OBJ-NEXT: Description: Permitted
+
+.eabi_attribute  Tag_THUMB_ISA_use, 1
+@CHECK:   .eabi_attribute 9, 1
+@CHECK-OBJ: Tag: 9
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: THUMB_ISA_use
+@CHECK-OBJ-NEXT: Description: Thumb-1
+
+.eabi_attribute  Tag_FP_arch, 1
+@CHECK:   .eabi_attribute 10, 1
+@CHECK-OBJ: Tag: 10
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: FP_arch
+@CHECK-OBJ-NEXT: Description: VFPv1
+
+.eabi_attribute  Tag_WMMX_arch, 1
+@CHECK:   .eabi_attribute 11, 1
+@CHECK-OBJ: Tag: 11
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: WMMX_arch
+@CHECK-OBJ-NEXT: Description: WMMXv1
+
+.eabi_attribute  Tag_Advanced_SIMD_arch, 1
+@CHECK:   .eabi_attribute 12, 1
+@CHECK-OBJ: Tag: 12
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: Advanced_SIMD_arch
+@CHECK-OBJ-NEXT: Description: NEONv1
+
+.eabi_attribute  Tag_PCS_config, 1
+@CHECK:   .eabi_attribute 13, 1
+@CHECK-OBJ: Tag: 13
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: PCS_config
+@CHECK-OBJ-NEXT: Description: Bare Platform
+
+.eabi_attribute  Tag_ABI_PCS_R9_use, 1
+@CHECK:   .eabi_attribute 14, 1
+@CHECK-OBJ: Tag: 14
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_R9_use
+@CHECK-OBJ-NEXT: Description: Static Base
+
+.eabi_attribute  Tag_ABI_PCS_RW_data, 1
+@CHECK:   .eabi_attribute 15, 1
+@CHECK-OBJ: Tag: 15
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_RW_data
+@CHECK-OBJ-NEXT: Description: PC-relative
+
+.eabi_attribute  Tag_ABI_PCS_RO_data, 1
+@CHECK:   .eabi_attribute 16, 1
+@CHECK-OBJ: Tag: 16
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_RO_data
+@CHECK-OBJ-NEXT: Description: PC-relative
+
+.eabi_attribute  Tag_ABI_PCS_GOT_use, 1
+@CHECK:   .eabi_attribute 17, 1
+@CHECK-OBJ: Tag: 17
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_GOT_use
+@CHECK-OBJ-NEXT: Description: Direct
+
+.eabi_attribute  Tag_ABI_FP_rounding, 1
+@CHECK:   .eabi_attribute 19, 1
+@CHECK-OBJ: Tag: 19
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_FP_rounding
+@CHECK-OBJ-NEXT: Description: Runtime
+
+.eabi_attribute  Tag_ABI_FP_denormal, 1
+@CHECK:   .eabi_attribute 20, 1
+@CHECK-OBJ: Tag: 20
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_FP_denormal
+@CHECK-OBJ-NEXT: Description: IEEE-754
+
+.eabi_attribute  Tag_ABI_FP_exceptions, 1
+@CHECK:   .eabi_attribute 21, 1
+@CHECK-OBJ: Tag: 21
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_FP_exceptions
+@CHECK-OBJ-NEXT: Description: IEEE-754
+
+.eabi_attribute  Tag_ABI_FP_user_exceptions, 1
+@CHECK:   .eabi_attribute 22, 1
+@CHECK-OBJ: Tag: 22
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_FP_user_exceptions
+@CHECK-OBJ-NEXT: Description: IEEE-754
+
+.eabi_attribute  Tag_ABI_FP_number_model, 1
+@CHECK:   .eabi_attribute 23, 1
+@CHECK-OBJ: Tag: 23
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_FP_number_model
+@CHECK-OBJ-NEXT: Description: Finite Only
+
+.eabi_attribute  Tag_ABI_align_needed, 1
+@CHECK:   .eabi_attribute 24, 1
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: 8-byte alignment
+
+.eabi_attribute  Tag_ABI_align_preserved, 1
+@CHECK:   .eabi_attribute 25, 1
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: 8-byte data alignment
+
+.eabi_attribute  Tag_ABI_enum_size, 1
+@CHECK:   .eabi_attribute 26, 1
+@CHECK-OBJ: Tag: 26
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_enum_size
+@CHECK-OBJ-NEXT: Description: Packed
+
+.eabi_attribute  Tag_ABI_HardFP_use, 1
+@CHECK:   .eabi_attribute 27, 1
+@CHECK-OBJ: Tag: 27
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_HardFP_use
+@CHECK-OBJ-NEXT: Description: Single-Precision
+
+.eabi_attribute  Tag_ABI_VFP_args, 1
+@CHECK:   .eabi_attribute 28, 1
+@CHECK-OBJ: Tag: 28
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_VFP_args
+@CHECK-OBJ-NEXT: Description: AAPCS VFP
+
+.eabi_attribute  Tag_ABI_WMMX_args, 1
+@CHECK:   .eabi_attribute 29, 1
+@CHECK-OBJ: Tag: 29
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_WMMX_args
+@CHECK-OBJ-NEXT: Description: iWMMX
+
+.eabi_attribute  Tag_ABI_optimization_goals, 1
+@CHECK:   .eabi_attribute 30, 1
+@CHECK-OBJ: Tag: 30
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_optimization_goals
+@CHECK-OBJ-NEXT: Description: Speed
+
+.eabi_attribute  Tag_ABI_FP_optimization_goals, 1
+@CHECK:   .eabi_attribute 31, 1
+@CHECK-OBJ: Tag: 31
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_FP_optimization_goals
+@CHECK-OBJ-NEXT: Description: Speed
+
+.eabi_attribute  Tag_compatibility, 1, ""
+@CHECK:   .eabi_attribute 32, 1
+@CHECK-OBJ: Tag: 32
+@CHECK-OBJ-NEXT: Value: 1,
+@CHECK-OBJ-NEXT: TagName: compatibility
+@CHECK-OBJ-NEXT: Description: AEABI Conformant
+
+.eabi_attribute  Tag_CPU_unaligned_access, 1
+@CHECK:   .eabi_attribute 34, 1
+@CHECK-OBJ: Tag: 34
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: CPU_unaligned_access
+@CHECK-OBJ-NEXT: Description: v6-style
+
+.eabi_attribute  Tag_FP_HP_extension, 1
+@CHECK:   .eabi_attribute 36, 1
+@CHECK-OBJ: Tag: 36
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: FP_HP_extension
+@CHECK-OBJ-NEXT: Description: Permitted
+
+.eabi_attribute  Tag_ABI_FP_16bit_format, 1
+@CHECK:   .eabi_attribute 38, 1
+@CHECK-OBJ: Tag: 38
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: ABI_FP_16bit_format
+@CHECK-OBJ-NEXT: Description: IEEE-754
+
+.eabi_attribute  Tag_MPextension_use, 1
+@CHECK:   .eabi_attribute 42, 1
+@CHECK-OBJ: Tag: 42
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: MPextension_use
+@CHECK-OBJ-NEXT: Description: Permitted
+
+.eabi_attribute  Tag_DIV_use, 1
+@CHECK:   .eabi_attribute 44, 1
+@CHECK-OBJ: Tag: 44
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: DIV_use
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_Virtualization_use, 1
+@CHECK:   .eabi_attribute 68, 1
+@CHECK-OBJ: Tag: 68
+@CHECK-OBJ-NEXT: Value: 1
+@CHECK-OBJ-NEXT: TagName: Virtualization_use
+@CHECK-OBJ-NEXT: Description: TrustZone
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-10.s b/test/tools/llvm-readobj/ARM/attribute-10.s
new file mode 100644
index 0000000..667db8c
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-10.s
@@ -0,0 +1,24 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 10
+@CHECK:   .eabi_attribute 6, 10
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 10
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v7
+
+.eabi_attribute  Tag_ABI_align_needed, 10
+@CHECK:   .eabi_attribute 24, 10
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 10
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: 8-byte alignment, 1024-byte extended alignment
+
+.eabi_attribute  Tag_ABI_align_preserved, 10
+@CHECK:   .eabi_attribute 25, 10
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 10
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: 8-byte stack alignment, 1024-byte data alignment
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-11.s b/test/tools/llvm-readobj/ARM/attribute-11.s
new file mode 100644
index 0000000..2d8e43b
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-11.s
@@ -0,0 +1,24 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 11
+@CHECK:   .eabi_attribute 6, 11
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 11
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v6-M
+
+.eabi_attribute  Tag_ABI_align_needed, 11
+@CHECK:   .eabi_attribute 24, 11
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 11
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: 8-byte alignment, 2048-byte extended alignment
+
+.eabi_attribute  Tag_ABI_align_preserved, 11
+@CHECK:   .eabi_attribute 25, 11
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 11
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: 8-byte stack alignment, 2048-byte data alignment
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-12.s b/test/tools/llvm-readobj/ARM/attribute-12.s
new file mode 100644
index 0000000..4387527
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-12.s
@@ -0,0 +1,24 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 12
+@CHECK:   .eabi_attribute 6, 12
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 12
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v6S-M
+
+.eabi_attribute  Tag_ABI_align_needed, 12
+@CHECK:   .eabi_attribute 24, 12
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 12
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: 8-byte alignment, 4096-byte extended alignment
+
+.eabi_attribute  Tag_ABI_align_preserved, 12
+@CHECK:   .eabi_attribute 25, 12
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 12
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: 8-byte stack alignment, 4096-byte data alignment
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-13.s b/test/tools/llvm-readobj/ARM/attribute-13.s
new file mode 100644
index 0000000..25ac5f1
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-13.s
@@ -0,0 +1,10 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 13
+@CHECK:   .eabi_attribute 6, 13
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 13
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v7E-M
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-136.s b/test/tools/llvm-readobj/ARM/attribute-136.s
new file mode 100644
index 0000000..a2d2a9a
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-136.s
@@ -0,0 +1,10 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_compatibility, 136, "Foo Corp"
+@CHECK:   .eabi_attribute 32, 136
+@CHECK-OBJ: Tag: 32
+@CHECK-OBJ-NEXT: Value: 136, Foo Corp
+@CHECK-OBJ-NEXT: TagName: compatibility
+@CHECK-OBJ-NEXT: Description: AEABI Non-Conformant
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-14.s b/test/tools/llvm-readobj/ARM/attribute-14.s
new file mode 100644
index 0000000..e0d8e46
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-14.s
@@ -0,0 +1,10 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 14
+@CHECK:   .eabi_attribute 6, 14
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 14
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v8
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-15.s b/test/tools/llvm-readobj/ARM/attribute-15.s
new file mode 100644
index 0000000..7877ce7
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-15.s
@@ -0,0 +1,10 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_compatibility, 15, "Longer Corporation NaMe"
+@CHECK:   .eabi_attribute 32, 15
+@CHECK-OBJ: Tag: 32
+@CHECK-OBJ-NEXT: Value: 15, Longer Corporation NaMe
+@CHECK-OBJ-NEXT: TagName: compatibility
+@CHECK-OBJ-NEXT: Description: AEABI Non-Conformant
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-2.s b/test/tools/llvm-readobj/ARM/attribute-2.s
new file mode 100644
index 0000000..21ee41f
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-2.s
@@ -0,0 +1,178 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 2
+@CHECK:   .eabi_attribute 6, 2
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v4T
+
+.eabi_attribute  Tag_THUMB_ISA_use, 2
+@CHECK:   .eabi_attribute 9, 2
+@CHECK-OBJ: Tag: 9
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: THUMB_ISA_use
+@CHECK-OBJ-NEXT: Description: Thumb-2
+
+.eabi_attribute  Tag_FP_arch, 2
+@CHECK:   .eabi_attribute 10, 2
+@CHECK-OBJ: Tag: 10
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: FP_arch
+@CHECK-OBJ-NEXT: Description: VFPv2
+
+.eabi_attribute  Tag_WMMX_arch, 2
+@CHECK:   .eabi_attribute 11, 2
+@CHECK-OBJ: Tag: 11
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: WMMX_arch
+@CHECK-OBJ-NEXT: Description: WMMXv2
+
+.eabi_attribute  Tag_Advanced_SIMD_arch, 2
+@CHECK:   .eabi_attribute 12, 2
+@CHECK-OBJ: Tag: 12
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: Advanced_SIMD_arch
+@CHECK-OBJ-NEXT: Description: NEONv2+FMA
+
+.eabi_attribute  Tag_PCS_config, 2
+@CHECK:   .eabi_attribute 13, 2
+@CHECK-OBJ: Tag: 13
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: PCS_config
+@CHECK-OBJ-NEXT: Description: Linux Application
+
+.eabi_attribute  Tag_ABI_PCS_R9_use, 2
+@CHECK:   .eabi_attribute 14, 2
+@CHECK-OBJ: Tag: 14
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_R9_use
+@CHECK-OBJ-NEXT: Description: TLS
+
+.eabi_attribute  Tag_ABI_PCS_RW_data, 2
+@CHECK:   .eabi_attribute 15, 2
+@CHECK-OBJ: Tag: 15
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_RW_data
+@CHECK-OBJ-NEXT: Description: SB-relative
+
+.eabi_attribute  Tag_ABI_PCS_RO_data, 2
+@CHECK:   .eabi_attribute 16, 2
+@CHECK-OBJ: Tag: 16
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_RO_data
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_ABI_PCS_GOT_use, 2
+@CHECK:   .eabi_attribute 17, 2
+@CHECK-OBJ: Tag: 17
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_GOT_use
+@CHECK-OBJ-NEXT: Description: GOT-Indirect
+
+.eabi_attribute  Tag_ABI_PCS_wchar_t, 2
+@CHECK:   .eabi_attribute 18, 2
+@CHECK-OBJ: Tag: 18
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_wchar_t
+@CHECK-OBJ-NEXT: Description: 2-byte
+
+.eabi_attribute  Tag_ABI_FP_denormal, 2
+@CHECK:   .eabi_attribute 20, 2
+@CHECK-OBJ: Tag: 20
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_FP_denormal
+@CHECK-OBJ-NEXT: Description: Sign Only
+
+.eabi_attribute  Tag_ABI_FP_number_model, 2
+@CHECK:   .eabi_attribute 23, 2
+@CHECK-OBJ: Tag: 23
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_FP_number_model
+@CHECK-OBJ-NEXT: Description: RTABI
+
+.eabi_attribute  Tag_ABI_align_needed, 2
+@CHECK:   .eabi_attribute 24, 2
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: 4-byte alignment
+
+.eabi_attribute  Tag_ABI_align_preserved, 2
+@CHECK:   .eabi_attribute 25, 2
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: 8-byte data and code alignment
+
+.eabi_attribute  Tag_ABI_enum_size, 2
+@CHECK:   .eabi_attribute 26, 2
+@CHECK-OBJ: Tag: 26
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_enum_size
+@CHECK-OBJ-NEXT: Description: Int32
+
+.eabi_attribute  Tag_ABI_HardFP_use, 2
+@CHECK:   .eabi_attribute 27, 2
+@CHECK-OBJ: Tag: 27
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_HardFP_use
+@CHECK-OBJ-NEXT: Description: Reserved
+
+.eabi_attribute  Tag_ABI_VFP_args, 2
+@CHECK:   .eabi_attribute 28, 2
+@CHECK-OBJ: Tag: 28
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_VFP_args
+@CHECK-OBJ-NEXT: Description: Custom
+
+.eabi_attribute  Tag_ABI_WMMX_args, 2
+@CHECK:   .eabi_attribute 29, 2
+@CHECK-OBJ: Tag: 29
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_WMMX_args
+@CHECK-OBJ-NEXT: Description: Custom
+
+.eabi_attribute  Tag_ABI_optimization_goals, 2
+@CHECK:   .eabi_attribute 30, 2
+@CHECK-OBJ: Tag: 30
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_optimization_goals
+@CHECK-OBJ-NEXT: Description: Aggressive Speed
+
+.eabi_attribute  Tag_ABI_FP_optimization_goals, 2
+@CHECK:   .eabi_attribute 31, 2
+@CHECK-OBJ: Tag: 31
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_FP_optimization_goals
+@CHECK-OBJ-NEXT: Description: Aggressive Speed
+
+.eabi_attribute  Tag_compatibility, 2, ""
+@CHECK:   .eabi_attribute 32, 2
+@CHECK-OBJ: Tag: 32
+@CHECK-OBJ-NEXT: Value: 2,
+@CHECK-OBJ-NEXT: TagName: compatibility
+@CHECK-OBJ-NEXT: Description: AEABI Non-Conformant
+
+.eabi_attribute  Tag_ABI_FP_16bit_format, 2
+@CHECK:   .eabi_attribute 38, 2
+@CHECK-OBJ: Tag: 38
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: ABI_FP_16bit_format
+@CHECK-OBJ-NEXT: Description: VFPv3
+
+.eabi_attribute  Tag_DIV_use, 2
+@CHECK:   .eabi_attribute 44, 2
+@CHECK-OBJ: Tag: 44
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: DIV_use
+@CHECK-OBJ-NEXT: Description: Permitted
+
+.eabi_attribute  Tag_Virtualization_use, 2
+@CHECK:   .eabi_attribute 68, 2
+@CHECK-OBJ: Tag: 68
+@CHECK-OBJ-NEXT: Value: 2
+@CHECK-OBJ-NEXT: TagName: Virtualization_use
+@CHECK-OBJ-NEXT: Description: Virtualization Extensions
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-3.s b/test/tools/llvm-readobj/ARM/attribute-3.s
new file mode 100644
index 0000000..ad2de25
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-3.s
@@ -0,0 +1,108 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 3
+@CHECK:   .eabi_attribute 6, 3
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v5T
+
+.eabi_attribute  Tag_FP_arch, 3
+@CHECK:   .eabi_attribute 10, 3
+@CHECK-OBJ: Tag: 10
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: FP_arch
+@CHECK-OBJ-NEXT: Description: VFPv3
+
+.eabi_attribute  Tag_Advanced_SIMD_arch, 3
+@CHECK:   .eabi_attribute 12, 3
+@CHECK-OBJ: Tag: 12
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: Advanced_SIMD_arch
+@CHECK-OBJ-NEXT: Description: ARMv8-a NEON
+
+.eabi_attribute  Tag_PCS_config, 3
+@CHECK:   .eabi_attribute 13, 3
+@CHECK-OBJ: Tag: 13
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: PCS_config
+@CHECK-OBJ-NEXT: Description: Linux DSO
+
+.eabi_attribute  Tag_ABI_PCS_R9_use, 3
+@CHECK:   .eabi_attribute 14, 3
+@CHECK-OBJ: Tag: 14
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_R9_use
+@CHECK-OBJ-NEXT: Description: Unused
+
+.eabi_attribute  Tag_ABI_PCS_RW_data, 3
+@CHECK:   .eabi_attribute 15, 3
+@CHECK-OBJ: Tag: 15
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_RW_data
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_ABI_FP_number_model, 3
+@CHECK:   .eabi_attribute 23, 3
+@CHECK-OBJ: Tag: 23
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: ABI_FP_number_model
+@CHECK-OBJ-NEXT: Description: IEEE-754
+
+.eabi_attribute  Tag_ABI_align_needed, 3
+@CHECK:   .eabi_attribute 24, 3
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: Reserved
+
+.eabi_attribute  Tag_ABI_align_preserved, 3
+@CHECK:   .eabi_attribute 25, 3
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: Reserved
+
+.eabi_attribute  Tag_ABI_enum_size, 3
+@CHECK:   .eabi_attribute 26, 3
+@CHECK-OBJ: Tag: 26
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: ABI_enum_size
+@CHECK-OBJ-NEXT: Description: External Int32
+
+.eabi_attribute  Tag_ABI_HardFP_use, 3
+@CHECK:   .eabi_attribute 27, 3
+@CHECK-OBJ: Tag: 27
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: ABI_HardFP_use
+@CHECK-OBJ-NEXT: Description: Tag_FP_arch (deprecated)
+
+.eabi_attribute  Tag_ABI_VFP_args, 3
+@CHECK:   .eabi_attribute 28, 3
+@CHECK-OBJ: Tag: 28
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: ABI_VFP_args
+@CHECK-OBJ-NEXT: Description: Not Permitted
+
+.eabi_attribute  Tag_ABI_optimization_goals, 3
+@CHECK:   .eabi_attribute 30, 3
+@CHECK-OBJ: Tag: 30
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: ABI_optimization_goals
+@CHECK-OBJ-NEXT: Description: Size
+
+.eabi_attribute  Tag_ABI_FP_optimization_goals, 3
+@CHECK:   .eabi_attribute 31, 3
+@CHECK-OBJ: Tag: 31
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: ABI_FP_optimization_goals
+@CHECK-OBJ-NEXT: Description: Size
+
+.eabi_attribute  Tag_Virtualization_use, 3
+@CHECK:   .eabi_attribute 68, 3
+@CHECK-OBJ: Tag: 68
+@CHECK-OBJ-NEXT: Value: 3
+@CHECK-OBJ-NEXT: TagName: Virtualization_use
+@CHECK-OBJ-NEXT: Description: TrustZone + Virtualization Extensions
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-4.s b/test/tools/llvm-readobj/ARM/attribute-4.s
new file mode 100644
index 0000000..dd0a4a6
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-4.s
@@ -0,0 +1,59 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 4
+@CHECK:   .eabi_attribute 6, 4
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 4
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v5TE
+
+.eabi_attribute  Tag_FP_arch, 4
+@CHECK:   .eabi_attribute 10, 4
+@CHECK-OBJ: Tag: 10
+@CHECK-OBJ-NEXT: Value: 4
+@CHECK-OBJ-NEXT: TagName: FP_arch
+@CHECK-OBJ-NEXT: Description: VFPv3-D16
+
+.eabi_attribute  Tag_PCS_config, 4
+@CHECK:   .eabi_attribute 13, 4
+@CHECK-OBJ: Tag: 13
+@CHECK-OBJ-NEXT: Value: 4
+@CHECK-OBJ-NEXT: TagName: PCS_config
+@CHECK-OBJ-NEXT: Description: Palm OS 2004
+
+.eabi_attribute  Tag_ABI_PCS_wchar_t, 4
+@CHECK:   .eabi_attribute 18, 4
+@CHECK-OBJ: Tag: 18
+@CHECK-OBJ-NEXT: Value: 4
+@CHECK-OBJ-NEXT: TagName: ABI_PCS_wchar_t
+@CHECK-OBJ-NEXT: Description: 4-byte
+
+.eabi_attribute  Tag_ABI_align_needed, 4
+@CHECK:   .eabi_attribute 24, 4
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 4
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: 8-byte alignment, 16-byte extended alignment
+
+.eabi_attribute  Tag_ABI_align_preserved, 4
+@CHECK:   .eabi_attribute 25, 4
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 4
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: 8-byte stack alignment, 16-byte data alignment
+
+.eabi_attribute  Tag_ABI_optimization_goals, 4
+@CHECK:   .eabi_attribute 30, 4
+@CHECK-OBJ: Tag: 30
+@CHECK-OBJ-NEXT: Value: 4
+@CHECK-OBJ-NEXT: TagName: ABI_optimization_goals
+@CHECK-OBJ-NEXT: Description: Aggressive Size
+
+.eabi_attribute  Tag_ABI_FP_optimization_goals, 4
+@CHECK:   .eabi_attribute 31, 4
+@CHECK-OBJ: Tag: 31
+@CHECK-OBJ-NEXT: Value: 4
+@CHECK-OBJ-NEXT: TagName: ABI_FP_optimization_goals
+@CHECK-OBJ-NEXT: Description: Aggressive Size
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-5.s b/test/tools/llvm-readobj/ARM/attribute-5.s
new file mode 100644
index 0000000..97e37e2
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-5.s
@@ -0,0 +1,52 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 5
+@CHECK:   .eabi_attribute 6, 5
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 5
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v5TEJ
+
+.eabi_attribute  Tag_FP_arch, 5
+@CHECK:   .eabi_attribute 10, 5
+@CHECK-OBJ: Tag: 10
+@CHECK-OBJ-NEXT: Value: 5
+@CHECK-OBJ-NEXT: TagName: FP_arch
+@CHECK-OBJ-NEXT: Description: VFPv4
+
+.eabi_attribute  Tag_PCS_config, 5
+@CHECK:   .eabi_attribute 13, 5
+@CHECK-OBJ: Tag: 13
+@CHECK-OBJ-NEXT: Value: 5
+@CHECK-OBJ-NEXT: TagName: PCS_config
+@CHECK-OBJ-NEXT: Description: Reserved (Palm OS)
+
+.eabi_attribute  Tag_ABI_align_needed, 5
+@CHECK:   .eabi_attribute 24, 5
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 5
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: 8-byte alignment, 32-byte extended alignment
+
+.eabi_attribute  Tag_ABI_align_preserved, 5
+@CHECK:   .eabi_attribute 25, 5
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 5
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: 8-byte stack alignment, 32-byte data alignment
+
+.eabi_attribute  Tag_ABI_optimization_goals, 5
+@CHECK:   .eabi_attribute 30, 5
+@CHECK-OBJ: Tag: 30
+@CHECK-OBJ-NEXT: Value: 5
+@CHECK-OBJ-NEXT: TagName: ABI_optimization_goals
+@CHECK-OBJ-NEXT: Description: Debugging
+
+.eabi_attribute  Tag_ABI_FP_optimization_goals, 5
+@CHECK:   .eabi_attribute 31, 5
+@CHECK-OBJ: Tag: 31
+@CHECK-OBJ-NEXT: Value: 5
+@CHECK-OBJ-NEXT: TagName: ABI_FP_optimization_goals
+@CHECK-OBJ-NEXT: Description: Accuracy
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-6.s b/test/tools/llvm-readobj/ARM/attribute-6.s
new file mode 100644
index 0000000..8da7b99
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-6.s
@@ -0,0 +1,52 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 6
+@CHECK:   .eabi_attribute 6, 6
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 6
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v6
+
+.eabi_attribute  Tag_FP_arch, 6
+@CHECK:   .eabi_attribute 10, 6
+@CHECK-OBJ: Tag: 10
+@CHECK-OBJ-NEXT: Value: 6
+@CHECK-OBJ-NEXT: TagName: FP_arch
+@CHECK-OBJ-NEXT: Description: VFPv4-D16
+
+.eabi_attribute  Tag_PCS_config, 6
+@CHECK:   .eabi_attribute 13, 6
+@CHECK-OBJ: Tag: 13
+@CHECK-OBJ-NEXT: Value: 6
+@CHECK-OBJ-NEXT: TagName: PCS_config
+@CHECK-OBJ-NEXT: Description: Symbian OS 2004
+
+.eabi_attribute  Tag_ABI_align_needed, 6
+@CHECK:   .eabi_attribute 24, 6
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 6
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: 8-byte alignment, 64-byte extended alignment
+
+.eabi_attribute  Tag_ABI_align_preserved, 6
+@CHECK:   .eabi_attribute 25, 6
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 6
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: 8-byte stack alignment, 64-byte data alignment
+
+.eabi_attribute  Tag_ABI_optimization_goals, 6
+@CHECK:   .eabi_attribute 30, 6
+@CHECK-OBJ: Tag: 30
+@CHECK-OBJ-NEXT: Value: 6
+@CHECK-OBJ-NEXT: TagName: ABI_optimization_goals
+@CHECK-OBJ-NEXT: Description: Best Debugging
+
+.eabi_attribute  Tag_ABI_FP_optimization_goals, 6
+@CHECK:   .eabi_attribute 31, 6
+@CHECK-OBJ: Tag: 31
+@CHECK-OBJ-NEXT: Value: 6
+@CHECK-OBJ-NEXT: TagName: ABI_FP_optimization_goals
+@CHECK-OBJ-NEXT: Description: Best Accuracy
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-7.s b/test/tools/llvm-readobj/ARM/attribute-7.s
new file mode 100644
index 0000000..2fd1b20
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-7.s
@@ -0,0 +1,38 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 7
+@CHECK:   .eabi_attribute 6, 7
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 7
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v6KZ
+
+.eabi_attribute  Tag_FP_arch, 7
+@CHECK:   .eabi_attribute 10, 7
+@CHECK-OBJ: Tag: 10
+@CHECK-OBJ-NEXT: Value: 7
+@CHECK-OBJ-NEXT: TagName: FP_arch
+@CHECK-OBJ-NEXT: Description: ARMv8-a FP
+
+.eabi_attribute  Tag_PCS_config, 7
+@CHECK:   .eabi_attribute 13, 7
+@CHECK-OBJ: Tag: 13
+@CHECK-OBJ-NEXT: Value: 7
+@CHECK-OBJ-NEXT: TagName: PCS_config
+@CHECK-OBJ-NEXT: Description: Reserved (Symbian OS)
+
+.eabi_attribute  Tag_ABI_align_needed, 7
+@CHECK:   .eabi_attribute 24, 7
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 7
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: 8-byte alignment, 128-byte extended alignment
+
+.eabi_attribute  Tag_ABI_align_preserved, 7
+@CHECK:   .eabi_attribute 25, 7
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 7
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: 8-byte stack alignment, 128-byte data alignment
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-8.s b/test/tools/llvm-readobj/ARM/attribute-8.s
new file mode 100644
index 0000000..ac3e3a0
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-8.s
@@ -0,0 +1,31 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 8
+@CHECK:   .eabi_attribute 6, 8
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 8
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v6T2
+
+.eabi_attribute  Tag_FP_arch, 8
+@CHECK:   .eabi_attribute 10, 8
+@CHECK-OBJ: Tag: 10
+@CHECK-OBJ-NEXT: Value: 8
+@CHECK-OBJ-NEXT: TagName: FP_arch
+@CHECK-OBJ-NEXT: Description: ARMv8-a FP-D16
+
+.eabi_attribute  Tag_ABI_align_needed, 8
+@CHECK:   .eabi_attribute 24, 8
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 8
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: 8-byte alignment, 256-byte extended alignment
+
+.eabi_attribute  Tag_ABI_align_preserved, 8
+@CHECK:   .eabi_attribute 25, 8
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 8
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: 8-byte stack alignment, 256-byte data alignment
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-9.s b/test/tools/llvm-readobj/ARM/attribute-9.s
new file mode 100644
index 0000000..68f6ccb
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-9.s
@@ -0,0 +1,24 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch, 9
+@CHECK:   .eabi_attribute 6, 9
+@CHECK-OBJ: Tag: 6
+@CHECK-OBJ-NEXT: Value: 9
+@CHECK-OBJ-NEXT: TagName: CPU_arch
+@CHECK-OBJ-NEXT: Description: ARM v6K
+
+.eabi_attribute  Tag_ABI_align_needed, 9
+@CHECK:   .eabi_attribute 24, 9
+@CHECK-OBJ: Tag: 24
+@CHECK-OBJ-NEXT: Value: 9
+@CHECK-OBJ-NEXT: TagName: ABI_align_needed
+@CHECK-OBJ-NEXT: Description: 8-byte alignment, 512-byte extended alignment
+
+.eabi_attribute  Tag_ABI_align_preserved, 9
+@CHECK:   .eabi_attribute 25, 9
+@CHECK-OBJ: Tag: 25
+@CHECK-OBJ-NEXT: Value: 9
+@CHECK-OBJ-NEXT: TagName: ABI_align_preserved
+@CHECK-OBJ-NEXT: Description: 8-byte stack alignment, 512-byte data alignment
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-A.s b/test/tools/llvm-readobj/ARM/attribute-A.s
new file mode 100644
index 0000000..720f56e
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-A.s
@@ -0,0 +1,10 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch_profile, 'A'
+@CHECK:   .eabi_attribute 7, 65
+@CHECK-OBJ: Tag: 7
+@CHECK-OBJ-NEXT: Value: 65
+@CHECK-OBJ-NEXT: TagName: CPU_arch_profile
+@CHECK-OBJ-NEXT: Description: Application
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-M.s b/test/tools/llvm-readobj/ARM/attribute-M.s
new file mode 100644
index 0000000..7d1e1ef
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-M.s
@@ -0,0 +1,10 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch_profile, 'M'
+@CHECK:   .eabi_attribute 7, 77
+@CHECK-OBJ: Tag: 7
+@CHECK-OBJ-NEXT: Value: 77
+@CHECK-OBJ-NEXT: TagName: CPU_arch_profile
+@CHECK-OBJ-NEXT: Description: Microcontroller
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-R.s b/test/tools/llvm-readobj/ARM/attribute-R.s
new file mode 100644
index 0000000..096d557
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-R.s
@@ -0,0 +1,10 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch_profile, 'R'
+@CHECK:   .eabi_attribute 7, 82
+@CHECK-OBJ: Tag: 7
+@CHECK-OBJ-NEXT: Value: 82
+@CHECK-OBJ-NEXT: TagName: CPU_arch_profile
+@CHECK-OBJ-NEXT: Description: Real-time
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-S.s b/test/tools/llvm-readobj/ARM/attribute-S.s
new file mode 100644
index 0000000..cb90958
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-S.s
@@ -0,0 +1,10 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_CPU_arch_profile, 'S'
+@CHECK:   .eabi_attribute 7, 83
+@CHECK-OBJ: Tag: 7
+@CHECK-OBJ-NEXT: Value: 83
+@CHECK-OBJ-NEXT: TagName: CPU_arch_profile
+@CHECK-OBJ-NEXT: Description: Classic
+
diff --git a/test/tools/llvm-readobj/ARM/attribute-conformance-1.s b/test/tools/llvm-readobj/ARM/attribute-conformance-1.s
new file mode 100644
index 0000000..daa44c1
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-conformance-1.s
@@ -0,0 +1,8 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_conformance, "0"
+@CHECK:   .eabi_attribute 67, "0"
+@CHECK-OBJ: Tag: 67
+@CHECK-OBJ-NEXT: TagName: conformance
+@CHECK-OBJ-NEXT: Value: 0
diff --git a/test/tools/llvm-readobj/ARM/attribute-conformance-2.s b/test/tools/llvm-readobj/ARM/attribute-conformance-2.s
new file mode 100644
index 0000000..47c83c0
--- /dev/null
+++ b/test/tools/llvm-readobj/ARM/attribute-conformance-2.s
@@ -0,0 +1,8 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
+@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ
+.eabi_attribute  Tag_conformance, "A.long--non numeric oddity...!!"
+@CHECK:   .eabi_attribute 67, "A.long--non numeric oddity...!!"
+@CHECK-OBJ: Tag: 67
+@CHECK-OBJ-NEXT: TagName: conformance
+@CHECK-OBJ-NEXT: Value: A.long--non numeric oddity...!!
diff --git a/test/tools/llvm-readobj/ARM/attributes.s b/test/tools/llvm-readobj/ARM/attributes.s
deleted file mode 100644
index 594bab8..0000000
--- a/test/tools/llvm-readobj/ARM/attributes.s
+++ /dev/null
@@ -1,287 +0,0 @@
-@ RUN: llvm-mc -triple armv7-eabi -filetype obj -o - %s \
-@ RUN:   | llvm-readobj -arm-attributes - | FileCheck %s
-
-	.syntax unified
-
-	.cpu cortex-a8
-	.fpu neon
-
-	.eabi_attribute Tag_CPU_raw_name, "Cortex-A9"
-	.eabi_attribute Tag_CPU_name, "cortex-a9"
-	.eabi_attribute Tag_CPU_arch, 10
-	.eabi_attribute Tag_CPU_arch_profile, 'A'
-	.eabi_attribute Tag_ARM_ISA_use, 0
-	.eabi_attribute Tag_THUMB_ISA_use, 2
-	.eabi_attribute Tag_FP_arch, 3
-	.eabi_attribute Tag_WMMX_arch, 0
-	.eabi_attribute Tag_Advanced_SIMD_arch, 1
-	.eabi_attribute Tag_PCS_config, 2
-	.eabi_attribute Tag_ABI_PCS_R9_use, 0
-	.eabi_attribute Tag_ABI_PCS_RW_data, 0
-	.eabi_attribute Tag_ABI_PCS_RO_data, 0
-	.eabi_attribute Tag_ABI_PCS_GOT_use, 0
-	.eabi_attribute Tag_ABI_PCS_wchar_t, 4
-	.eabi_attribute Tag_ABI_FP_rounding, 1
-	.eabi_attribute Tag_ABI_FP_denormal, 2
-	.eabi_attribute Tag_ABI_FP_exceptions, 1
-	.eabi_attribute Tag_ABI_FP_user_exceptions, 1
-	.eabi_attribute Tag_ABI_FP_number_model, 3
-	.eabi_attribute Tag_ABI_align_needed, 1
-	.eabi_attribute Tag_ABI_align_preserved, 2
-	.eabi_attribute Tag_ABI_enum_size, 3
-	.eabi_attribute Tag_ABI_HardFP_use, 0
-	.eabi_attribute Tag_ABI_VFP_args, 1
-	.eabi_attribute Tag_ABI_WMMX_args, 0
-	.eabi_attribute Tag_ABI_optimization_goals, 2
-	.eabi_attribute Tag_ABI_FP_optimization_goals, 2
-	.eabi_attribute Tag_compatibility, 1
-	.eabi_attribute Tag_compatibility, 1, "aeabi"
-	.eabi_attribute Tag_CPU_unaligned_access, 0
-	.eabi_attribute Tag_FP_HP_extension, 0
-	.eabi_attribute Tag_ABI_FP_16bit_format, 0
-	.eabi_attribute Tag_MPextension_use, 0
-	.eabi_attribute Tag_DIV_use, 0
-	.eabi_attribute Tag_nodefaults, 0
-	.eabi_attribute Tag_also_compatible_with, "gnu"
-	.eabi_attribute Tag_T2EE_use, 0
-	.eabi_attribute Tag_conformance, "2.09"
-	.eabi_attribute Tag_Virtualization_use, 0
-
-@ CHECK: BuildAttributes {
-@ CHECK:   Section 1 {
-@ CHECK:     Tag: Tag_File (0x1)
-@ CHECK:     FileAttributes {
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 4
-@ CHECK:         TagName: CPU_raw_name
-@ CHECK:         Value: CORTEX-A9
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 5
-@ CHECK:         TagName: CPU_name
-@ CHECK:         Value: CORTEX-A9
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 6
-@ CHECK:         Value: 10
-@ CHECK:         TagName: CPU_arch
-@ CHECK:         Description: ARM v7
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 7
-@ CHECK:         Value: 65
-@ CHECK:         TagName: CPU_arch_profile
-@ CHECK:         Description: Application
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 8
-@ CHECK:         Value: 0
-@ CHECK:         TagName: ARM_ISA_use
-@ CHECK:         Description: Not Permitted
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 9
-@ CHECK:         Value: 2
-@ CHECK:         TagName: THUMB_ISA_use
-@ CHECK:         Description: Thumb-2
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 10
-@ CHECK:         Value: 3
-@ CHECK:         TagName: FP_arch
-@ CHECK:         Description: VFPv3
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 11
-@ CHECK:         Value: 0
-@ CHECK:         TagName: WMMX_arch
-@ CHECK:         Description: Not Permitted
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 12
-@ CHECK:         Value: 1
-@ CHECK:         TagName: Advanced_SIMD_arch
-@ CHECK:         Description: NEONv1
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 13
-@ CHECK:         Value: 2
-@ CHECK:         TagName: PCS_config
-@ CHECK:         Description: Linux Application
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 14
-@ CHECK:         Value: 0
-@ CHECK:         TagName: ABI_PCS_R9_use
-@ CHECK:         Description: v6
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 15
-@ CHECK:         Value: 0
-@ CHECK:         TagName: ABI_PCS_RW_data
-@ CHECK:         Description: Absolute
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 16
-@ CHECK:         Value: 0
-@ CHECK:         TagName: ABI_PCS_RO_data
-@ CHECK:         Description: Absolute
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 17
-@ CHECK:         Value: 0
-@ CHECK:         TagName: ABI_PCS_GOT_use
-@ CHECK:         Description: Not Permitted
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 18
-@ CHECK:         Value: 4
-@ CHECK:         TagName: ABI_PCS_wchar_t
-@ CHECK:         Description: 4-byte
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 19
-@ CHECK:         Value: 1
-@ CHECK:         TagName: ABI_FP_rounding
-@ CHECK:         Description: Runtime
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 20
-@ CHECK:         Value: 2
-@ CHECK:         TagName: ABI_FP_denormal
-@ CHECK:         Description: Sign Only
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 21
-@ CHECK:         Value: 1
-@ CHECK:         TagName: ABI_FP_exceptions
-@ CHECK:         Description: IEEE-754
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 22
-@ CHECK:         Value: 1
-@ CHECK:         TagName: ABI_FP_user_exceptions
-@ CHECK:         Description: IEEE-754
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 23
-@ CHECK:         Value: 3
-@ CHECK:         TagName: ABI_FP_number_model
-@ CHECK:         Description: IEEE-754
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 24
-@ CHECK:         Value: 1
-@ CHECK:         TagName: ABI_align_needed
-@ CHECK:         Description: 8-byte alignment
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 25
-@ CHECK:         Value: 2
-@ CHECK:         TagName: ABI_align_preserved
-@ CHECK:         Description: 8-byte data and code alignment
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 26
-@ CHECK:         Value: 3
-@ CHECK:         TagName: ABI_enum_size
-@ CHECK:         Description: External Int32
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 27
-@ CHECK:         Value: 0
-@ CHECK:         TagName: ABI_HardFP_use
-@ CHECK:         Description: Tag_FP_arch
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 28
-@ CHECK:         Value: 1
-@ CHECK:         TagName: ABI_VFP_args
-@ CHECK:         Description: AAPCS VFP
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 29
-@ CHECK:         Value: 0
-@ CHECK:         TagName: ABI_WMMX_args
-@ CHECK:         Description: AAPCS
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 30
-@ CHECK:         Value: 2
-@ CHECK:         TagName: ABI_optimization_goals
-@ CHECK:         Description: Aggressive Speed
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 31
-@ CHECK:         Value: 2
-@ CHECK:         TagName: ABI_FP_optimization_goals
-@ CHECK:         Description: Aggressive Speed
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 32
-@ CHECK:         Value: 1, AEABI
-@ CHECK:         TagName: compatibility
-@ CHECK:         Description: AEABI Conformant
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 34
-@ CHECK:         Value: 0
-@ CHECK:         TagName: CPU_unaligned_access
-@ CHECK:         Description: Not Permitted
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 36
-@ CHECK:         Value: 0
-@ CHECK:         TagName: FP_HP_extension
-@ CHECK:         Description: If Available
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 38
-@ CHECK:         Value: 0
-@ CHECK:         TagName: ABI_FP_16bit_format
-@ CHECK:         Description: Not Permitted
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 42
-@ CHECK:         Value: 0
-@ CHECK:         TagName: MPextension_use
-@ CHECK:         Description: Not Permitted
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 44
-@ CHECK:         Value: 0
-@ CHECK:         TagName: DIV_use
-@ CHECK:         Description: If Available
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 64
-@ CHECK:         Value: 0
-@ CHECK:         TagName: nodefaults
-@ CHECK:         Description: Unspecified Tags UNDEFINED
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 65
-@ CHECK:         TagName: also_compatible_with
-@ CHECK:         Value: GNU
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 66
-@ CHECK:         Value: 0
-@ CHECK:         TagName: T2EE_use
-@ CHECK:         Description: Not Permitted
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 67
-@ CHECK:         TagName: conformance
-@ CHECK:         Value: 2.09
-@ CHECK:       }
-@ CHECK:       Attribute {
-@ CHECK:         Tag: 68
-@ CHECK:         Value: 0
-@ CHECK:         TagName: Virtualization_use
-@ CHECK:         Description: Not Permitted
-@ CHECK:       }
-@ CHECK:     }
-@ CHECK:   }
-@ CHECK: }
-
diff --git a/test/tools/llvm-readobj/Inputs/export-arm.dll b/test/tools/llvm-readobj/Inputs/export-arm.dll
new file mode 100755
index 0000000..a555562
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/export-arm.dll
diff --git a/test/tools/llvm-readobj/Inputs/export-x64.dll b/test/tools/llvm-readobj/Inputs/export-x64.dll
new file mode 100755
index 0000000..10b14e8
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/export-x64.dll
diff --git a/test/tools/llvm-readobj/Inputs/export-x86.dll b/test/tools/llvm-readobj/Inputs/export-x86.dll
new file mode 100755
index 0000000..9efcd31
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/export-x86.dll
diff --git a/test/tools/llvm-readobj/Inputs/relocs-no-symtab.obj.coff-i386 b/test/tools/llvm-readobj/Inputs/relocs-no-symtab.obj.coff-i386
new file mode 100644
index 0000000..5882daf
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/relocs-no-symtab.obj.coff-i386
diff --git a/test/tools/llvm-readobj/Inputs/relocs.obj.elf-aarch64 b/test/tools/llvm-readobj/Inputs/relocs.obj.elf-aarch64
index a1034cb..658b0ea 100644
--- a/test/tools/llvm-readobj/Inputs/relocs.obj.elf-aarch64
+++ b/test/tools/llvm-readobj/Inputs/relocs.obj.elf-aarch64
diff --git a/test/tools/llvm-readobj/Inputs/relocs.obj.elf-arm b/test/tools/llvm-readobj/Inputs/relocs.obj.elf-arm
index 908507d..206c933 100644
--- a/test/tools/llvm-readobj/Inputs/relocs.obj.elf-arm
+++ b/test/tools/llvm-readobj/Inputs/relocs.obj.elf-arm
diff --git a/test/tools/llvm-readobj/Inputs/relocs.py b/test/tools/llvm-readobj/Inputs/relocs.py
index ffddf3d..62dbd62 100644
--- a/test/tools/llvm-readobj/Inputs/relocs.py
+++ b/test/tools/llvm-readobj/Inputs/relocs.py
@@ -591,7 +591,7 @@ class Relocs_Elf_PPC64(Enum):
   R_PPC64_TLSLD               = 108
 
 class Relocs_Elf_AArch64(Enum):
-  R_AARCH64_NONE                        = 0x100
+  R_AARCH64_NONE                        = 0
   R_AARCH64_ABS64                       = 0x101
   R_AARCH64_ABS32                       = 0x102
   R_AARCH64_ABS16                       = 0x103
@@ -611,6 +611,7 @@ class Relocs_Elf_AArch64(Enum):
   R_AARCH64_LD_PREL_LO19                = 0x111
   R_AARCH64_ADR_PREL_LO21               = 0x112
   R_AARCH64_ADR_PREL_PG_HI21            = 0x113
+  R_AARCH64_ADR_PREL_PG_HI21_NC         = 0x114
   R_AARCH64_ADD_ABS_LO12_NC             = 0x115
   R_AARCH64_LDST8_ABS_LO12_NC           = 0x116
   R_AARCH64_TSTBR14                     = 0x117
@@ -620,11 +621,39 @@ class Relocs_Elf_AArch64(Enum):
   R_AARCH64_LDST16_ABS_LO12_NC          = 0x11c
   R_AARCH64_LDST32_ABS_LO12_NC          = 0x11d
   R_AARCH64_LDST64_ABS_LO12_NC          = 0x11e
+  R_AARCH64_MOVW_PREL_G0                = 0x11f
+  R_AARCH64_MOVW_PREL_G0_NC             = 0x120
+  R_AARCH64_MOVW_PREL_G1                = 0x121
+  R_AARCH64_MOVW_PREL_G1_NC             = 0x122
+  R_AARCH64_MOVW_PREL_G2                = 0x123
+  R_AARCH64_MOVW_PREL_G2_NC             = 0x124
+  R_AARCH64_MOVW_PREL_G3                = 0x125
   R_AARCH64_LDST128_ABS_LO12_NC         = 0x12b
+  R_AARCH64_MOVW_GOTOFF_G0              = 0x12c
+  R_AARCH64_MOVW_GOTOFF_G0_NC           = 0x12d
+  R_AARCH64_MOVW_GOTOFF_G1              = 0x12e
+  R_AARCH64_MOVW_GOTOFF_G1_NC           = 0x12f
+  R_AARCH64_MOVW_GOTOFF_G2              = 0x130
+  R_AARCH64_MOVW_GOTOFF_G2_NC           = 0x131
+  R_AARCH64_MOVW_GOTOFF_G3              = 0x132
   R_AARCH64_GOTREL64                    = 0x133
   R_AARCH64_GOTREL32                    = 0x134
+  R_AARCH64_GOT_LD_PREL19               = 0x135
+  R_AARCH64_LD64_GOTOFF_LO15            = 0x136
   R_AARCH64_ADR_GOT_PAGE                = 0x137
   R_AARCH64_LD64_GOT_LO12_NC            = 0x138
+  R_AARCH64_LD64_GOTPAGE_LO15           = 0x139
+  R_AARCH64_TLSGD_ADR_PREL21            = 0x200
+  R_AARCH64_TLSGD_ADR_PAGE21            = 0x201
+  R_AARCH64_TLSGD_ADD_LO12_NC           = 0x202
+  R_AARCH64_TLSGD_MOVW_G1               = 0x203
+  R_AARCH64_TLSGD_MOVW_G0_NC            = 0x204
+  R_AARCH64_TLSLD_ADR_PREL21            = 0x205
+  R_AARCH64_TLSLD_ADR_PAGE21            = 0x206
+  R_AARCH64_TLSLD_ADD_LO12_NC           = 0x207
+  R_AARCH64_TLSLD_MOVW_G1               = 0x208
+  R_AARCH64_TLSLD_MOVW_G0_NC            = 0x209
+  R_AARCH64_TLSLD_LD_PREL19             = 0x20a
   R_AARCH64_TLSLD_MOVW_DTPREL_G2        = 0x20b
   R_AARCH64_TLSLD_MOVW_DTPREL_G1        = 0x20c
   R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC     = 0x20d
@@ -662,10 +691,20 @@ class Relocs_Elf_AArch64(Enum):
   R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC  = 0x22d
   R_AARCH64_TLSLE_LDST64_TPREL_LO12     = 0x22e
   R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC  = 0x22f
-  R_AARCH64_TLSDESC_ADR_PAGE            = 0x232
+  R_AARCH64_TLSDESC_LD_PREL19           = 0x230
+  R_AARCH64_TLSDESC_ADR_PREL21          = 0x231
+  R_AARCH64_TLSDESC_ADR_PAGE21          = 0x232
   R_AARCH64_TLSDESC_LD64_LO12_NC        = 0x233
   R_AARCH64_TLSDESC_ADD_LO12_NC         = 0x234
+  R_AARCH64_TLSDESC_OFF_G1              = 0x235
+  R_AARCH64_TLSDESC_OFF_G0_NC           = 0x236
+  R_AARCH64_TLSDESC_LDR                 = 0x237
+  R_AARCH64_TLSDESC_ADD                 = 0x238
   R_AARCH64_TLSDESC_CALL                = 0x239
+  R_AARCH64_TLSLE_LDST128_TPREL_LO12    = 0x23a
+  R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC = 0x23b
+  R_AARCH64_TLSLD_LDST128_DTPREL_LO12   = 0x23c
+  R_AARCH64_TLSLD_LDST128_DTPREL_LO12_NC = 0x23d
   R_AARCH64_COPY                        = 0x400
   R_AARCH64_GLOB_DAT                    = 0x401
   R_AARCH64_JUMP_SLOT                   = 0x402
@@ -808,6 +847,7 @@ class Relocs_Elf_ARM(Enum):
   R_ARM_ME_TOO                = 0x80
   R_ARM_THM_TLS_DESCSEQ16     = 0x81
   R_ARM_THM_TLS_DESCSEQ32     = 0x82
+  R_ARM_IRELATIVE             = 0xa0
 
 class Relocs_Elf_Mips(Enum):
   R_MIPS_NONE              =  0
diff --git a/test/tools/llvm-readobj/Inputs/trivial.exe.coff-arm b/test/tools/llvm-readobj/Inputs/trivial.exe.coff-arm
new file mode 100755
index 0000000..121d820
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/trivial.exe.coff-arm
diff --git a/test/tools/llvm-readobj/codeview-linetables.test b/test/tools/llvm-readobj/codeview-linetables.test
index e5e344b..b2acee1 100644
--- a/test/tools/llvm-readobj/codeview-linetables.test
+++ b/test/tools/llvm-readobj/codeview-linetables.test
@@ -18,16 +18,16 @@
 ;     z();
 ;   }
 ; using 32-/64-bit versions of CL v17.00.61030 and v18.00.21005.1 respectively.
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifunction-linetables.obj.coff-2012-i368 \
+RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/multifunction-linetables.obj.coff-2012-i368 \
 RUN:   | FileCheck %s -check-prefix MFUN32
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifunction-linetables.obj.coff-2013-i368 \
+RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/multifunction-linetables.obj.coff-2013-i368 \
 RUN:   | FileCheck %s -check-prefix MFUN32
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifunction-linetables.obj.coff-2012-x86_64 \
+RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/multifunction-linetables.obj.coff-2012-x86_64 \
 RUN:   | FileCheck %s -check-prefix MFUN64
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifunction-linetables.obj.coff-2013-x86_64 \
+RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/multifunction-linetables.obj.coff-2013-x86_64 \
 RUN:   | FileCheck %s -check-prefix MFUN64
 
-MFUN32:      CodeViewLineTables [
+MFUN32:      CodeViewDebugInfo [
 MFUN32-NEXT:   Magic: 0x4
 MFUN32-NEXT:   Subsection [
 MFUN32-NEXT:     Type: 0xF1
@@ -136,7 +136,7 @@ MFUN32-NEXT:     ]
 MFUN32-NEXT:   ]
 MFUN32-NEXT: ]
 
-MFUN64:      CodeViewLineTables [
+MFUN64:      CodeViewDebugInfo [
 MFUN64-NEXT:   Magic: 0x4
 MFUN64-NEXT:   Subsection [
 MFUN64-NEXT:     Type: 0xF1
@@ -248,16 +248,16 @@ MFUN64-NEXT: ]
 ;     g();
 ;   }
 ; using 32-/64-bit versions of CL v17.00.61030 and v18.00.21005.1 respectively.
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifile-linetables.obj.coff-2012-i368 \
+RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/multifile-linetables.obj.coff-2012-i368 \
 RUN:   | FileCheck %s -check-prefix MFILE32
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifile-linetables.obj.coff-2013-i368 \
+RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/multifile-linetables.obj.coff-2013-i368 \
 RUN:   | FileCheck %s -check-prefix MFILE32
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifile-linetables.obj.coff-2012-x86_64 \
+RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/multifile-linetables.obj.coff-2012-x86_64 \
 RUN:   | FileCheck %s -check-prefix MFILE64
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifile-linetables.obj.coff-2013-x86_64 \
+RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/multifile-linetables.obj.coff-2013-x86_64 \
 RUN:   | FileCheck %s -check-prefix MFILE64
 
-MFILE32:      CodeViewLineTables [
+MFILE32:      CodeViewDebugInfo [
 MFILE32-NEXT:   Magic: 0x4
 MFILE32-NEXT:   Subsection [
 MFILE32-NEXT:     Type: 0xF1
@@ -317,7 +317,7 @@ MFILE32-NEXT:     ]
 MFILE32-NEXT:   ]
 MFILE32-NEXT: ]
 
-MFILE64:      CodeViewLineTables [
+MFILE64:      CodeViewDebugInfo [
 MFILE64-NEXT:   Magic: 0x4
 MFILE64-NEXT:   Subsection [
 MFILE64-NEXT:     Type: 0xF1
@@ -387,9 +387,9 @@ MFILE64-NEXT: ]
 ;     return 0;
 ;   }
 ; using 32-version of CL v17.00.61030 and v18.00.21005.1 respectively.
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/comdat-function-linetables.obj.coff-2012-i386 \
+RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/comdat-function-linetables.obj.coff-2012-i386 \
 RUN:   | FileCheck %s -check-prefix MCOMDAT
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/comdat-function-linetables.obj.coff-2013-i386 \
+RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/comdat-function-linetables.obj.coff-2013-i386 \
 RUN:   | FileCheck %s -check-prefix MCOMDAT
 
 MCOMDAT:      ProcStart {
diff --git a/test/tools/llvm-readobj/coff-arm-baserelocs.test b/test/tools/llvm-readobj/coff-arm-baserelocs.test
new file mode 100644
index 0000000..c0febd7
--- /dev/null
+++ b/test/tools/llvm-readobj/coff-arm-baserelocs.test
@@ -0,0 +1,7 @@
+# RUN: llvm-readobj -coff-basereloc %p/Inputs/trivial.exe.coff-arm | FileCheck %s
+
+# CHECK: Entry {
+# CHECK:  Type: ARM_MOV32(T)
+# CHECK:  Address: 0x9390
+# CHECK: }
+
diff --git a/test/tools/llvm-readobj/coff-exports.test b/test/tools/llvm-readobj/coff-exports.test
new file mode 100644
index 0000000..54b42fe
--- /dev/null
+++ b/test/tools/llvm-readobj/coff-exports.test
@@ -0,0 +1,11 @@
+RUN: llvm-readobj -coff-exports %p/Inputs/export-x86.dll | FileCheck %s -check-prefix CHECK -check-prefix CHECK-X86
+RUN: llvm-readobj -coff-exports %p/Inputs/export-x64.dll | FileCheck %s -check-prefix CHECK -check-prefix CHECK-X64
+RUN: llvm-readobj -coff-exports %p/Inputs/export-arm.dll | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM
+
+CHECK: Export {
+CHECK:   Ordinal: 1
+CHECK:   Name: function
+CHECK-X86:   RVA: 0x1000
+CHECK-X64:   RVA: 0x1000
+CHECK-ARM:   RVA: 0x1001
+CHECK: }
diff --git a/test/tools/llvm-readobj/reloc-types.test b/test/tools/llvm-readobj/reloc-types.test
index 20c2538..36e2f70 100644
--- a/test/tools/llvm-readobj/reloc-types.test
+++ b/test/tools/llvm-readobj/reloc-types.test
@@ -149,7 +149,7 @@ ELF-PPC64: Type: R_PPC64_GOT_TPREL16_HA (90)
 ELF-PPC64: Type: R_PPC64_TLSGD (107)
 ELF-PPC64: Type: R_PPC64_TLSLD (108)
 
-ELF-AARCH64: Type: R_AARCH64_NONE (256)
+ELF-AARCH64: Type: R_AARCH64_NONE (0)
 ELF-AARCH64: Type: R_AARCH64_ABS64 (257)
 ELF-AARCH64: Type: R_AARCH64_ABS32 (258)
 ELF-AARCH64: Type: R_AARCH64_ABS16 (259)
@@ -169,6 +169,7 @@ ELF-AARCH64: Type: R_AARCH64_MOVW_SABS_G2 (272)
 ELF-AARCH64: Type: R_AARCH64_LD_PREL_LO19 (273)
 ELF-AARCH64: Type: R_AARCH64_ADR_PREL_LO21 (274)
 ELF-AARCH64: Type: R_AARCH64_ADR_PREL_PG_HI21 (275)
+ELF-AARCH64: Type: R_AARCH64_ADR_PREL_PG_HI21_NC (276)
 ELF-AARCH64: Type: R_AARCH64_ADD_ABS_LO12_NC (277)
 ELF-AARCH64: Type: R_AARCH64_LDST8_ABS_LO12_NC (278)
 ELF-AARCH64: Type: R_AARCH64_TSTBR14 (279)
@@ -178,11 +179,39 @@ ELF-AARCH64: Type: R_AARCH64_CALL26 (283)
 ELF-AARCH64: Type: R_AARCH64_LDST16_ABS_LO12_NC (284)
 ELF-AARCH64: Type: R_AARCH64_LDST32_ABS_LO12_NC (285)
 ELF-AARCH64: Type: R_AARCH64_LDST64_ABS_LO12_NC (286)
+ELF-AARCH64: Type: R_AARCH64_MOVW_PREL_G0 (287)
+ELF-AARCH64: Type: R_AARCH64_MOVW_PREL_G0_NC (288)
+ELF-AARCH64: Type: R_AARCH64_MOVW_PREL_G1 (289)
+ELF-AARCH64: Type: R_AARCH64_MOVW_PREL_G1_NC (290)
+ELF-AARCH64: Type: R_AARCH64_MOVW_PREL_G2 (291)
+ELF-AARCH64: Type: R_AARCH64_MOVW_PREL_G2_NC (292)
+ELF-AARCH64: Type: R_AARCH64_MOVW_PREL_G3 (293)
 ELF-AARCH64: Type: R_AARCH64_LDST128_ABS_LO12_NC (299)
+ELF-AARCH64: Type: R_AARCH64_MOVW_GOTOFF_G0 (300)
+ELF-AARCH64: Type: R_AARCH64_MOVW_GOTOFF_G0_NC (301)
+ELF-AARCH64: Type: R_AARCH64_MOVW_GOTOFF_G1 (302)
+ELF-AARCH64: Type: R_AARCH64_MOVW_GOTOFF_G1_NC (303)
+ELF-AARCH64: Type: R_AARCH64_MOVW_GOTOFF_G2 (304)
+ELF-AARCH64: Type: R_AARCH64_MOVW_GOTOFF_G2_NC (305)
+ELF-AARCH64: Type: R_AARCH64_MOVW_GOTOFF_G3 (306)
 ELF-AARCH64: Type: R_AARCH64_GOTREL64 (307)
 ELF-AARCH64: Type: R_AARCH64_GOTREL32 (308)
+ELF-AARCH64: Type: R_AARCH64_GOT_LD_PREL19 (309)
+ELF-AARCH64: Type: R_AARCH64_LD64_GOTOFF_LO15 (310)
 ELF-AARCH64: Type: R_AARCH64_ADR_GOT_PAGE (311)
 ELF-AARCH64: Type: R_AARCH64_LD64_GOT_LO12_NC (312)
+ELF-AARCH64: Type: R_AARCH64_LD64_GOTPAGE_LO15 (313)
+ELF-AARCH64: Type: R_AARCH64_TLSGD_ADR_PREL21 (512)
+ELF-AARCH64: Type: R_AARCH64_TLSGD_ADR_PAGE21 (513)
+ELF-AARCH64: Type: R_AARCH64_TLSGD_ADD_LO12_NC (514)
+ELF-AARCH64: Type: R_AARCH64_TLSGD_MOVW_G1 (515)
+ELF-AARCH64: Type: R_AARCH64_TLSGD_MOVW_G0_NC (516)
+ELF-AARCH64: Type: R_AARCH64_TLSLD_ADR_PREL21 (517)
+ELF-AARCH64: Type: R_AARCH64_TLSLD_ADR_PAGE21 (518)
+ELF-AARCH64: Type: R_AARCH64_TLSLD_ADD_LO12_NC (519)
+ELF-AARCH64: Type: R_AARCH64_TLSLD_MOVW_G1 (520)
+ELF-AARCH64: Type: R_AARCH64_TLSLD_MOVW_G0_NC (521)
+ELF-AARCH64: Type: R_AARCH64_TLSLD_LD_PREL19 (522)
 ELF-AARCH64: Type: R_AARCH64_TLSLD_MOVW_DTPREL_G2 (523)
 ELF-AARCH64: Type: R_AARCH64_TLSLD_MOVW_DTPREL_G1 (524)
 ELF-AARCH64: Type: R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC (525)
@@ -220,10 +249,20 @@ ELF-AARCH64: Type: R_AARCH64_TLSLE_LDST32_TPREL_LO12 (556)
 ELF-AARCH64: Type: R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC (557)
 ELF-AARCH64: Type: R_AARCH64_TLSLE_LDST64_TPREL_LO12 (558)
 ELF-AARCH64: Type: R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC (559)
-ELF-AARCH64: Type: R_AARCH64_TLSDESC_ADR_PAGE (562)
+ELF-AARCH64: Type: R_AARCH64_TLSDESC_LD_PREL19 (560)
+ELF-AARCH64: Type: R_AARCH64_TLSDESC_ADR_PREL21 (561)
+ELF-AARCH64: Type: R_AARCH64_TLSDESC_ADR_PAGE21 (562)
 ELF-AARCH64: Type: R_AARCH64_TLSDESC_LD64_LO12_NC (563)
 ELF-AARCH64: Type: R_AARCH64_TLSDESC_ADD_LO12_NC (564)
+ELF-AARCH64: Type: R_AARCH64_TLSDESC_OFF_G1 (565)
+ELF-AARCH64: Type: R_AARCH64_TLSDESC_OFF_G0_NC (566)
+ELF-AARCH64: Type: R_AARCH64_TLSDESC_LDR (567)
+ELF-AARCH64: Type: R_AARCH64_TLSDESC_ADD (568)
 ELF-AARCH64: Type: R_AARCH64_TLSDESC_CALL (569)
+ELF-AARCH64: Type: R_AARCH64_TLSLE_LDST128_TPREL_LO12 (570)
+ELF-AARCH64: Type: R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC (571)
+ELF-AARCH64: Type: R_AARCH64_TLSLD_LDST128_DTPREL_LO12 (572)
+ELF-AARCH64: Type: R_AARCH64_TLSLD_LDST128_DTPREL_LO12_NC (573)
 ELF-AARCH64: Type: R_AARCH64_COPY (1024)
 ELF-AARCH64: Type: R_AARCH64_GLOB_DAT (1025)
 ELF-AARCH64: Type: R_AARCH64_JUMP_SLOT (1026)
@@ -364,6 +403,7 @@ ELF-ARM: Type: R_ARM_PRIVATE_15 (127)
 ELF-ARM: Type: R_ARM_ME_TOO (128)
 ELF-ARM: Type: R_ARM_THM_TLS_DESCSEQ16 (129)
 ELF-ARM: Type: R_ARM_THM_TLS_DESCSEQ32 (130)
+ELF-ARM: Type: R_ARM_IRELATIVE (160)
 
 ELF-MIPS: Type: R_MIPS_NONE (0)
 ELF-MIPS: Type: R_MIPS_16 (1)
diff --git a/test/tools/llvm-readobj/relocations.test b/test/tools/llvm-readobj/relocations.test
index 222dcf1..2e11aa2 100644
--- a/test/tools/llvm-readobj/relocations.test
+++ b/test/tools/llvm-readobj/relocations.test
@@ -2,6 +2,8 @@ RUN: llvm-readobj -r %p/Inputs/trivial.obj.coff-i386 \
 RUN:   | FileCheck %s -check-prefix COFF
 RUN: llvm-readobj -r %p/Inputs/bad-relocs.obj.coff-i386 \
 RUN:   | FileCheck %s -check-prefix BAD-COFF-RELOCS
+RUN: llvm-readobj -r %p/Inputs/relocs-no-symtab.obj.coff-i386 \
+RUN:   | FileCheck %s -check-prefix BAD-COFF-RELOCS
 RUN: llvm-readobj -r %p/Inputs/trivial.obj.elf-i386 \
 RUN:   | FileCheck %s -check-prefix ELF
 RUN: llvm-readobj -r %p/Inputs/trivial.obj.macho-i386 \
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index fd761ec..1f415b6 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -36,6 +36,7 @@ add_llvm_tool_subdirectory(llvm-objdump)
 add_llvm_tool_subdirectory(llvm-readobj)
 add_llvm_tool_subdirectory(llvm-rtdyld)
 add_llvm_tool_subdirectory(llvm-dwarfdump)
+add_llvm_tool_subdirectory(dsymutil)
 add_llvm_tool_subdirectory(llvm-vtabledump)
 if( LLVM_USE_INTEL_JITEVENTS )
   add_llvm_tool_subdirectory(llvm-jitlistener)
@@ -60,6 +61,8 @@ add_llvm_tool_subdirectory(yaml2obj)
 
 add_llvm_tool_subdirectory(llvm-go)
 
+add_llvm_tool_subdirectory(llvm-pdbdump)
+
 if(NOT CYGWIN AND LLVM_ENABLE_PIC)
   add_llvm_tool_subdirectory(lto)
   add_llvm_tool_subdirectory(llvm-lto)
@@ -68,19 +71,10 @@ else()
   ignore_llvm_tool_subdirectory(llvm-lto)
 endif()
 
-if( LLVM_ENABLE_PIC )
-  # TODO: support other systems:
-  if( (CMAKE_SYSTEM_NAME STREQUAL "Linux")
-      OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") )
-    add_llvm_tool_subdirectory(gold)
-  else()
-    ignore_llvm_tool_subdirectory(gold)
-  endif()
-else()
-  ignore_llvm_tool_subdirectory(gold)
-endif()
+add_llvm_tool_subdirectory(gold)
 
 add_llvm_external_project(clang)
+add_llvm_external_project(llgo)
 
 if( NOT LLVM_INCLUDE_TOOLS STREQUAL "bootstrap-only" )
   add_llvm_external_project(lld)
diff --git a/tools/LLVMBuild.txt b/tools/LLVMBuild.txt
index 13a08b2..e7b81bb 100644
--- a/tools/LLVMBuild.txt
+++ b/tools/LLVMBuild.txt
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = bugpoint llc lli llvm-ar llvm-as llvm-bcanalyzer llvm-cov llvm-diff llvm-dis llvm-dwarfdump llvm-extract llvm-jitlistener llvm-link llvm-lto llvm-mc llvm-nm llvm-objdump llvm-profdata llvm-rtdyld llvm-size macho-dump opt llvm-mcmarkup verify-uselistorder
+subdirectories = bugpoint llc lli llvm-ar llvm-as llvm-bcanalyzer llvm-cov llvm-diff llvm-dis llvm-dwarfdump llvm-extract llvm-jitlistener llvm-link llvm-lto llvm-mc llvm-nm llvm-objdump llvm-pdbdump llvm-profdata llvm-rtdyld llvm-size macho-dump opt llvm-mcmarkup verify-uselistorder dsymutil
 
 [component_0]
 type = Group
diff --git a/tools/Makefile b/tools/Makefile
index 4b8923a..fcb6c64 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -33,7 +33,7 @@ PARALLEL_DIRS := opt llvm-as llvm-dis llc llvm-ar llvm-nm llvm-link \
                  macho-dump llvm-objdump llvm-readobj llvm-rtdyld \
                  llvm-dwarfdump llvm-cov llvm-size llvm-stress llvm-mcmarkup \
                  llvm-profdata llvm-symbolizer obj2yaml yaml2obj llvm-c-test \
-                 llvm-vtabledump verify-uselistorder
+                 llvm-vtabledump verify-uselistorder dsymutil llvm-pdbdump
 
 # If Intel JIT Events support is configured, build an extra tool to test it.
 ifeq ($(USE_INTEL_JITEVENTS), 1)
diff --git a/tools/bugpoint/CrashDebugger.cpp b/tools/bugpoint/CrashDebugger.cpp
index bac948a..ee18169 100644
--- a/tools/bugpoint/CrashDebugger.cpp
+++ b/tools/bugpoint/CrashDebugger.cpp
@@ -19,11 +19,11 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Pass.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileUtilities.h"
 #include "llvm/Transforms/Scalar.h"
@@ -407,7 +407,7 @@ bool ReduceCrashingInstructions::TestInsts(std::vector<const Instruction*>
       }
 
   // Verify that this is still valid.
-  PassManager Passes;
+  legacy::PassManager Passes;
   Passes.add(createVerifierPass());
   Passes.add(createDebugInfoVerifierPass());
   Passes.run(*M);
diff --git a/tools/bugpoint/ExtractFunction.cpp b/tools/bugpoint/ExtractFunction.cpp
index 34fe53c..238cbbc 100644
--- a/tools/bugpoint/ExtractFunction.cpp
+++ b/tools/bugpoint/ExtractFunction.cpp
@@ -17,10 +17,10 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Pass.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileUtilities.h"
@@ -195,9 +195,9 @@ static Constant *GetTorInit(std::vector<std::pair<Function*, int> > &TorList) {
   assert(!TorList.empty() && "Don't create empty tor list!");
   std::vector<Constant*> ArrayElts;
   Type *Int32Ty = Type::getInt32Ty(TorList[0].first->getContext());
-  
+
   StructType *STy =
-    StructType::get(Int32Ty, TorList[0].first->getType(), NULL);
+      StructType::get(Int32Ty, TorList[0].first->getType(), nullptr);
   for (unsigned i = 0, e = TorList.size(); i != e; ++i) {
     Constant *Elts[] = {
       ConstantInt::get(Int32Ty, TorList[i].second),
diff --git a/tools/bugpoint/OptimizerDriver.cpp b/tools/bugpoint/OptimizerDriver.cpp
index f197cc5..481f343 100644
--- a/tools/bugpoint/OptimizerDriver.cpp
+++ b/tools/bugpoint/OptimizerDriver.cpp
@@ -18,9 +18,9 @@
 #include "BugDriver.h"
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileUtilities.h"
diff --git a/tools/bugpoint/bugpoint.cpp b/tools/bugpoint/bugpoint.cpp
index d0bade5..0ee3784 100644
--- a/tools/bugpoint/bugpoint.cpp
+++ b/tools/bugpoint/bugpoint.cpp
@@ -16,10 +16,10 @@
 #include "BugDriver.h"
 #include "ToolRunner.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/LegacyPassNameParser.h"
 #include "llvm/LinkAllIR.h"
 #include "llvm/LinkAllPasses.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/PluginLoader.h"
@@ -92,7 +92,7 @@ static void BugpointInterruptFunction() {
 
 // Hack to capture a pass list.
 namespace {
-  class AddToDriver : public FunctionPassManager {
+  class AddToDriver : public legacy::FunctionPassManager {
     BugDriver &D;
   public:
     AddToDriver(BugDriver &_D) : FunctionPassManager(nullptr), D(_D) {}
diff --git a/tools/dsymutil/BinaryHolder.cpp b/tools/dsymutil/BinaryHolder.cpp
new file mode 100644
index 0000000..ad66105
--- /dev/null
+++ b/tools/dsymutil/BinaryHolder.cpp
@@ -0,0 +1,111 @@
+//===-- BinaryHolder.cpp --------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This program is a utility that aims to be a dropin replacement for
+// Darwin's dsymutil.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryHolder.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace dsymutil {
+
+ErrorOr<MemoryBufferRef>
+BinaryHolder::GetMemoryBufferForFile(StringRef Filename) {
+  if (Verbose)
+    outs() << "trying to open '" << Filename << "'\n";
+
+  // Try that first as it doesn't involve any filesystem access.
+  if (auto ErrOrArchiveMember = GetArchiveMemberBuffer(Filename))
+    return *ErrOrArchiveMember;
+
+  // If the name ends with a closing paren, there is a huge chance
+  // it is an archive member specification.
+  if (Filename.endswith(")"))
+    if (auto ErrOrArchiveMember = MapArchiveAndGetMemberBuffer(Filename))
+      return *ErrOrArchiveMember;
+
+  // Otherwise, just try opening a standard file. If this is an
+  // archive member specifiaction and any of the above didn't handle it
+  // (either because the archive is not there anymore, or because the
+  // archive doesn't contain the requested member), this will still
+  // provide a sensible error message.
+  auto ErrOrFile = MemoryBuffer::getFileOrSTDIN(Filename);
+  if (auto Err = ErrOrFile.getError())
+    return Err;
+
+  if (Verbose)
+    outs() << "\tloaded file.\n";
+  CurrentArchive.reset();
+  CurrentMemoryBuffer = std::move(ErrOrFile.get());
+  return CurrentMemoryBuffer->getMemBufferRef();
+}
+
+ErrorOr<MemoryBufferRef>
+BinaryHolder::GetArchiveMemberBuffer(StringRef Filename) {
+  if (!CurrentArchive)
+    return make_error_code(errc::no_such_file_or_directory);
+
+  StringRef CurArchiveName = CurrentArchive->getFileName();
+  if (!Filename.startswith(Twine(CurArchiveName, "(").str()))
+    return make_error_code(errc::no_such_file_or_directory);
+
+  // Remove the archive name and the parens around the archive member name.
+  Filename = Filename.substr(CurArchiveName.size() + 1).drop_back();
+
+  for (const auto &Child : CurrentArchive->children()) {
+    if (auto NameOrErr = Child.getName())
+      if (*NameOrErr == Filename) {
+        if (Verbose)
+          outs() << "\tfound member in current archive.\n";
+        return Child.getMemoryBufferRef();
+      }
+  }
+
+  return make_error_code(errc::no_such_file_or_directory);
+}
+
+ErrorOr<MemoryBufferRef>
+BinaryHolder::MapArchiveAndGetMemberBuffer(StringRef Filename) {
+  StringRef ArchiveFilename = Filename.substr(0, Filename.find('('));
+
+  auto ErrOrBuff = MemoryBuffer::getFileOrSTDIN(ArchiveFilename);
+  if (auto Err = ErrOrBuff.getError())
+    return Err;
+
+  if (Verbose)
+    outs() << "\topened new archive '" << ArchiveFilename << "'\n";
+  auto ErrOrArchive = object::Archive::create((*ErrOrBuff)->getMemBufferRef());
+  if (auto Err = ErrOrArchive.getError())
+    return Err;
+
+  CurrentArchive = std::move(*ErrOrArchive);
+  CurrentMemoryBuffer = std::move(*ErrOrBuff);
+
+  return GetArchiveMemberBuffer(Filename);
+}
+
+ErrorOr<const object::ObjectFile &>
+BinaryHolder::GetObjectFile(StringRef Filename) {
+  auto ErrOrMemBufferRef = GetMemoryBufferForFile(Filename);
+  if (auto Err = ErrOrMemBufferRef.getError())
+    return Err;
+
+  auto ErrOrObjectFile =
+      object::ObjectFile::createObjectFile(*ErrOrMemBufferRef);
+  if (auto Err = ErrOrObjectFile.getError())
+    return Err;
+
+  CurrentObjectFile = std::move(*ErrOrObjectFile);
+  return *CurrentObjectFile;
+}
+}
+}
diff --git a/tools/dsymutil/BinaryHolder.h b/tools/dsymutil/BinaryHolder.h
new file mode 100644
index 0000000..04871b5
--- /dev/null
+++ b/tools/dsymutil/BinaryHolder.h
@@ -0,0 +1,104 @@
+//===-- BinaryHolder.h - Utility class for accessing binaries -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This program is a utility that aims to be a dropin replacement for
+// Darwin's dsymutil.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TOOLS_DSYMUTIL_BINARYHOLDER_H
+#define LLVM_TOOLS_DSYMUTIL_BINARYHOLDER_H
+
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/ErrorOr.h"
+
+namespace llvm {
+namespace dsymutil {
+
+/// \brief The BinaryHolder class is responsible for creating and
+/// owning ObjectFile objects and their underlying MemoryBuffer. This
+/// is different from a simple OwningBinary in that it handles
+/// accessing to archive members.
+///
+/// As an optimization, this class will reuse an already mapped and
+/// parsed Archive object if 2 successive requests target the same
+/// archive file (Which is always the case in debug maps).
+/// Currently it only owns one memory buffer at any given time,
+/// meaning that a mapping request will invalidate the previous memory
+/// mapping.
+class BinaryHolder {
+  std::unique_ptr<object::Archive> CurrentArchive;
+  std::unique_ptr<MemoryBuffer> CurrentMemoryBuffer;
+  std::unique_ptr<object::ObjectFile> CurrentObjectFile;
+  bool Verbose;
+
+  /// \brief Get the MemoryBufferRef for the file specification in \p
+  /// Filename from the current archive.
+  ///
+  /// This function performs no system calls, it just looks up a
+  /// potential match for the given \p Filename in the currently
+  /// mapped archive if there is one.
+  ErrorOr<MemoryBufferRef> GetArchiveMemberBuffer(StringRef Filename);
+
+  /// \brief Interpret Filename as an archive member specification,
+  /// map the corresponding archive to memory and return the
+  /// MemoryBufferRef corresponding to the described member.
+  ErrorOr<MemoryBufferRef> MapArchiveAndGetMemberBuffer(StringRef Filename);
+
+  /// \brief Return the MemoryBufferRef that holds the memory
+  /// mapping for the given \p Filename. This function will try to
+  /// parse archive member specifications of the form
+  /// /path/to/archive.a(member.o).
+  ///
+  /// The returned MemoryBufferRef points to a buffer owned by this
+  /// object. The buffer is valid until the next call to
+  /// GetMemoryBufferForFile() on this object.
+  ErrorOr<MemoryBufferRef> GetMemoryBufferForFile(StringRef Filename);
+
+public:
+  BinaryHolder(bool Verbose) : Verbose(Verbose) {}
+
+  /// \brief Get the ObjectFile designated by the \p Filename. This
+  /// might be an archive member specification of the form
+  /// /path/to/archive.a(member.o).
+  ///
+  /// Calling this function invalidates the previous mapping owned by
+  /// the BinaryHolder.
+  ErrorOr<const object::ObjectFile &> GetObjectFile(StringRef Filename);
+
+  /// \brief Wraps GetObjectFile() to return a derived ObjectFile type.
+  template <typename ObjectFileType>
+  ErrorOr<const ObjectFileType &> GetFileAs(StringRef Filename) {
+    auto ErrOrObjFile = GetObjectFile(Filename);
+    if (auto Err = ErrOrObjFile.getError())
+      return Err;
+    if (const auto *Derived = dyn_cast<ObjectFileType>(CurrentObjectFile.get()))
+      return *Derived;
+    return make_error_code(object::object_error::invalid_file_type);
+  }
+
+  /// \brief Access the currently owned ObjectFile. As successfull
+  /// call to GetObjectFile() or GetFileAs() must have been performed
+  /// before calling this.
+  const object::ObjectFile &Get() {
+    assert(CurrentObjectFile);
+    return *CurrentObjectFile;
+  }
+
+  /// \brief Access to a derived version of the currently owned
+  /// ObjectFile. The conversion must be known to be valid.
+  template <typename ObjectFileType> const ObjectFileType &GetAs() {
+    return cast<ObjectFileType>(*CurrentObjectFile);
+  }
+};
+}
+}
+#endif
diff --git a/tools/dsymutil/CMakeLists.txt b/tools/dsymutil/CMakeLists.txt
new file mode 100644
index 0000000..5e1f37f
--- /dev/null
+++ b/tools/dsymutil/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(LLVM_LINK_COMPONENTS
+  DebugInfoDWARF
+  Object
+  Support
+  )
+
+add_llvm_tool(llvm-dsymutil
+  dsymutil.cpp
+  BinaryHolder.cpp
+  DebugMap.cpp
+  DwarfLinker.cpp
+  MachODebugMapParser.cpp
+  )
+
diff --git a/tools/dsymutil/DebugMap.cpp b/tools/dsymutil/DebugMap.cpp
new file mode 100644
index 0000000..c04b2fe
--- /dev/null
+++ b/tools/dsymutil/DebugMap.cpp
@@ -0,0 +1,92 @@
+//===- tools/dsymutil/DebugMap.cpp - Generic debug map representation -----===//
+//
+//                             The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "DebugMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+namespace llvm {
+namespace dsymutil {
+
+using namespace llvm::object;
+
+DebugMapObject::DebugMapObject(StringRef ObjectFilename)
+    : Filename(ObjectFilename) {}
+
+bool DebugMapObject::addSymbol(StringRef Name, uint64_t ObjectAddress,
+                               uint64_t LinkedAddress) {
+  auto InsertResult = Symbols.insert(
+      std::make_pair(Name, SymbolMapping(ObjectAddress, LinkedAddress)));
+
+  if (InsertResult.second)
+    AddressToMapping[ObjectAddress] = &*InsertResult.first;
+  return InsertResult.second;
+}
+
+void DebugMapObject::print(raw_ostream &OS) const {
+  OS << getObjectFilename() << ":\n";
+  // Sort the symbols in alphabetical order, like llvm-nm (and to get
+  // deterministic output for testing).
+  typedef std::pair<StringRef, SymbolMapping> Entry;
+  std::vector<Entry> Entries;
+  Entries.reserve(Symbols.getNumItems());
+  for (const auto &Sym : make_range(Symbols.begin(), Symbols.end()))
+    Entries.push_back(std::make_pair(Sym.getKey(), Sym.getValue()));
+  std::sort(
+      Entries.begin(), Entries.end(),
+      [](const Entry &LHS, const Entry &RHS) { return LHS.first < RHS.first; });
+  for (const auto &Sym : Entries) {
+    OS << format("\t%016" PRIx64 " => %016" PRIx64 "\t%s\n",
+                 Sym.second.ObjectAddress, Sym.second.BinaryAddress,
+                 Sym.first.data());
+  }
+  OS << '\n';
+}
+
+#ifndef NDEBUG
+void DebugMapObject::dump() const { print(errs()); }
+#endif
+
+DebugMapObject &DebugMap::addDebugMapObject(StringRef ObjectFilePath) {
+  Objects.emplace_back(new DebugMapObject(ObjectFilePath));
+  return *Objects.back();
+}
+
+const DebugMapObject::DebugMapEntry *
+DebugMapObject::lookupSymbol(StringRef SymbolName) const {
+  StringMap<SymbolMapping>::const_iterator Sym = Symbols.find(SymbolName);
+  if (Sym == Symbols.end())
+    return nullptr;
+  return &*Sym;
+}
+
+const DebugMapObject::DebugMapEntry *
+DebugMapObject::lookupObjectAddress(uint64_t Address) const {
+  auto Mapping = AddressToMapping.find(Address);
+  if (Mapping == AddressToMapping.end())
+    return nullptr;
+  return Mapping->getSecond();
+}
+
+void DebugMap::print(raw_ostream &OS) const {
+  OS << "DEBUG MAP: " << BinaryTriple.getTriple()
+     << "\n\tobject addr =>  executable addr\tsymbol name\n";
+  for (const auto &Obj : objects())
+    Obj->print(OS);
+  OS << "END DEBUG MAP\n";
+}
+
+#ifndef NDEBUG
+void DebugMap::dump() const { print(errs()); }
+#endif
+}
+}
diff --git a/tools/dsymutil/DebugMap.h b/tools/dsymutil/DebugMap.h
new file mode 100644
index 0000000..ff2b27e
--- /dev/null
+++ b/tools/dsymutil/DebugMap.h
@@ -0,0 +1,142 @@
+//===- tools/dsymutil/DebugMap.h - Generic debug map representation -------===//
+//
+//                             The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+///
+/// This file contains the class declaration of the DebugMap
+/// entity. A DebugMap lists all the object files linked together to
+/// produce an executable along with the linked address of all the
+/// atoms used in these object files.
+/// The DebugMap is an input to the DwarfLinker class that will
+/// extract the Dwarf debug information from the referenced object
+/// files and link their usefull debug info together.
+///
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TOOLS_DSYMUTIL_DEBUGMAP_H
+#define LLVM_TOOLS_DSYMUTIL_DEBUGMAP_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/Format.h"
+#include <vector>
+
+namespace llvm {
+class raw_ostream;
+
+namespace dsymutil {
+class DebugMapObject;
+
+/// \brief The DebugMap object stores the list of object files to
+/// query for debug information along with the mapping between the
+/// symbols' addresses in the object file to their linked address in
+/// the linked binary.
+///
+/// A DebugMap producer could look like this:
+/// DebugMap *DM = new DebugMap();
+/// for (const auto &Obj: LinkedObjects) {
+///     DebugMapObject &DMO = DM->addDebugMapObject(Obj.getPath());
+///     for (const auto &Sym: Obj.getLinkedSymbols())
+///         DMO.addSymbol(Sym.getName(), Sym.getObjectFileAddress(),
+///                       Sym.getBinaryAddress());
+/// }
+///
+/// A DebugMap consumer can then use the map to link the debug
+/// information. For example something along the lines of:
+/// for (const auto &DMO: DM->objects()) {
+///     auto Obj = createBinary(DMO.getObjectFilename());
+///     for (auto &DIE: Obj.getDwarfDIEs()) {
+///         if (SymbolMapping *Sym = DMO.lookup(DIE.getName()))
+///             DIE.relocate(Sym->ObjectAddress, Sym->BinaryAddress);
+///         else
+///             DIE.discardSubtree();
+///     }
+/// }
+class DebugMap {
+  Triple BinaryTriple;
+  typedef std::vector<std::unique_ptr<DebugMapObject>> ObjectContainer;
+  ObjectContainer Objects;
+
+public:
+  DebugMap(const Triple &BinaryTriple) : BinaryTriple(BinaryTriple) {}
+
+  typedef ObjectContainer::const_iterator const_iterator;
+
+  iterator_range<const_iterator> objects() const {
+    return make_range(begin(), end());
+  }
+
+  const_iterator begin() const { return Objects.begin(); }
+
+  const_iterator end() const { return Objects.end(); }
+
+  /// This function adds an DebugMapObject to the list owned by this
+  /// debug map.
+  DebugMapObject &addDebugMapObject(StringRef ObjectFilePath);
+
+  const Triple &getTriple() { return BinaryTriple; }
+
+  void print(raw_ostream &OS) const;
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+/// \brief The DebugMapObject represents one object file described by
+/// the DebugMap. It contains a list of mappings between addresses in
+/// the object file and in the linked binary for all the linked atoms
+/// in this object file.
+class DebugMapObject {
+public:
+  struct SymbolMapping {
+    uint64_t ObjectAddress;
+    uint64_t BinaryAddress;
+    SymbolMapping(uint64_t ObjectAddress, uint64_t BinaryAddress)
+        : ObjectAddress(ObjectAddress), BinaryAddress(BinaryAddress) {}
+  };
+
+  typedef StringMapEntry<SymbolMapping> DebugMapEntry;
+
+  /// \brief Adds a symbol mapping to this DebugMapObject.
+  /// \returns false if the symbol was already registered. The request
+  /// is discarded in this case.
+  bool addSymbol(llvm::StringRef SymName, uint64_t ObjectAddress,
+                 uint64_t LinkedAddress);
+
+  /// \brief Lookup a symbol mapping.
+  /// \returns null if the symbol isn't found.
+  const DebugMapEntry *lookupSymbol(StringRef SymbolName) const;
+
+  /// \brief Lookup an objectfile address.
+  /// \returns null if the address isn't found.
+  const DebugMapEntry *lookupObjectAddress(uint64_t Address) const;
+
+  llvm::StringRef getObjectFilename() const { return Filename; }
+
+  void print(raw_ostream &OS) const;
+#ifndef NDEBUG
+  void dump() const;
+#endif
+private:
+  friend class DebugMap;
+  /// DebugMapObjects can only be constructed by the owning DebugMap.
+  DebugMapObject(StringRef ObjectFilename);
+
+  std::string Filename;
+  StringMap<SymbolMapping> Symbols;
+  DenseMap<uint64_t, DebugMapEntry *> AddressToMapping;
+};
+}
+}
+
+#endif // LLVM_TOOLS_DSYMUTIL_DEBUGMAP_H
diff --git a/tools/dsymutil/DwarfLinker.cpp b/tools/dsymutil/DwarfLinker.cpp
new file mode 100644
index 0000000..3c0bc0b
--- /dev/null
+++ b/tools/dsymutil/DwarfLinker.cpp
@@ -0,0 +1,667 @@
+//===- tools/dsymutil/DwarfLinker.cpp - Dwarf debug info linker -----------===//
+//
+//                             The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "DebugMap.h"
+#include "BinaryHolder.h"
+#include "DebugMap.h"
+#include "dsymutil.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/LEB128.h"
+#include <string>
+
+namespace llvm {
+namespace dsymutil {
+
+namespace {
+
+/// \brief Stores all information relating to a compile unit, be it in
+/// its original instance in the object file to its brand new cloned
+/// and linked DIE tree.
+class CompileUnit {
+public:
+  /// \brief Information gathered about a DIE in the object file.
+  struct DIEInfo {
+    uint64_t Address;   ///< Linked address of the described entity.
+    uint32_t ParentIdx; ///< The index of this DIE's parent.
+    bool Keep;          ///< Is the DIE part of the linked output?
+    bool InDebugMap;    ///< Was this DIE's entity found in the map?
+  };
+
+  CompileUnit(DWARFUnit &OrigUnit) : OrigUnit(OrigUnit) {
+    Info.resize(OrigUnit.getNumDIEs());
+  }
+
+  DWARFUnit &getOrigUnit() const { return OrigUnit; }
+
+  DIEInfo &getInfo(unsigned Idx) { return Info[Idx]; }
+  const DIEInfo &getInfo(unsigned Idx) const { return Info[Idx]; }
+
+private:
+  DWARFUnit &OrigUnit;
+  std::vector<DIEInfo> Info; ///< DIE info indexed by DIE index.
+};
+
+/// \brief The core of the Dwarf linking logic.
+///
+/// The link of the dwarf information from the object files will be
+/// driven by the selection of 'root DIEs', which are DIEs that
+/// describe variables or functions that are present in the linked
+/// binary (and thus have entries in the debug map). All the debug
+/// information that will be linked (the DIEs, but also the line
+/// tables, ranges, ...) is derived from that set of root DIEs.
+///
+/// The root DIEs are identified because they contain relocations that
+/// correspond to a debug map entry at specific places (the low_pc for
+/// a function, the location for a variable). These relocations are
+/// called ValidRelocs in the DwarfLinker and are gathered as a very
+/// first step when we start processing a DebugMapObject.
+class DwarfLinker {
+public:
+  DwarfLinker(StringRef OutputFilename, bool Verbose)
+      : OutputFilename(OutputFilename), Verbose(Verbose), BinHolder(Verbose) {}
+
+  /// \brief Link the contents of the DebugMap.
+  bool link(const DebugMap &);
+
+private:
+  /// \brief Called at the start of a debug object link.
+  void startDebugObject(DWARFContext &);
+
+  /// \brief Called at the end of a debug object link.
+  void endDebugObject();
+
+  /// \defgroup FindValidRelocations Translate debug map into a list
+  /// of relevant relocations
+  ///
+  /// @{
+  struct ValidReloc {
+    uint32_t Offset;
+    uint32_t Size;
+    uint64_t Addend;
+    const DebugMapObject::DebugMapEntry *Mapping;
+
+    ValidReloc(uint32_t Offset, uint32_t Size, uint64_t Addend,
+               const DebugMapObject::DebugMapEntry *Mapping)
+        : Offset(Offset), Size(Size), Addend(Addend), Mapping(Mapping) {}
+
+    bool operator<(const ValidReloc &RHS) const { return Offset < RHS.Offset; }
+  };
+
+  /// \brief The valid relocations for the current DebugMapObject.
+  /// This vector is sorted by relocation offset.
+  std::vector<ValidReloc> ValidRelocs;
+
+  /// \brief Index into ValidRelocs of the next relocation to
+  /// consider. As we walk the DIEs in acsending file offset and as
+  /// ValidRelocs is sorted by file offset, keeping this index
+  /// uptodate is all we have to do to have a cheap lookup during the
+  /// root DIE selection.
+  unsigned NextValidReloc;
+
+  bool findValidRelocsInDebugInfo(const object::ObjectFile &Obj,
+                                  const DebugMapObject &DMO);
+
+  bool findValidRelocs(const object::SectionRef &Section,
+                       const object::ObjectFile &Obj,
+                       const DebugMapObject &DMO);
+
+  void findValidRelocsMachO(const object::SectionRef &Section,
+                            const object::MachOObjectFile &Obj,
+                            const DebugMapObject &DMO);
+  /// @}
+
+  /// \defgroup FindRootDIEs Find DIEs corresponding to debug map entries.
+  ///
+  /// @{
+  /// \brief Recursively walk the \p DIE tree and look for DIEs to
+  /// keep. Store that information in \p CU's DIEInfo.
+  void lookForDIEsToKeep(const DWARFDebugInfoEntryMinimal &DIE,
+                         const DebugMapObject &DMO, CompileUnit &CU,
+                         unsigned Flags);
+
+  /// \brief Flags passed to DwarfLinker::lookForDIEsToKeep
+  enum TravesalFlags {
+    TF_Keep = 1 << 0,            ///< Mark the traversed DIEs as kept.
+    TF_InFunctionScope = 1 << 1, ///< Current scope is a fucntion scope.
+    TF_DependencyWalk = 1 << 2,  ///< Walking the dependencies of a kept DIE.
+    TF_ParentWalk = 1 << 3,      ///< Walking up the parents of a kept DIE.
+  };
+
+  /// \brief Mark the passed DIE as well as all the ones it depends on
+  /// as kept.
+  void keepDIEAndDenpendencies(const DWARFDebugInfoEntryMinimal &DIE,
+                               CompileUnit::DIEInfo &MyInfo,
+                               const DebugMapObject &DMO, CompileUnit &CU,
+                               unsigned Flags);
+
+  unsigned shouldKeepDIE(const DWARFDebugInfoEntryMinimal &DIE,
+                         CompileUnit &Unit, CompileUnit::DIEInfo &MyInfo,
+                         unsigned Flags);
+
+  unsigned shouldKeepVariableDIE(const DWARFDebugInfoEntryMinimal &DIE,
+                                 CompileUnit &Unit,
+                                 CompileUnit::DIEInfo &MyInfo, unsigned Flags);
+
+  unsigned shouldKeepSubprogramDIE(const DWARFDebugInfoEntryMinimal &DIE,
+                                   CompileUnit &Unit,
+                                   CompileUnit::DIEInfo &MyInfo,
+                                   unsigned Flags);
+
+  bool hasValidRelocation(uint32_t StartOffset, uint32_t EndOffset,
+                          CompileUnit::DIEInfo &Info);
+  /// @}
+
+  /// \defgroup Helpers Various helper methods.
+  ///
+  /// @{
+  const DWARFDebugInfoEntryMinimal *
+  resolveDIEReference(DWARFFormValue &RefValue, const DWARFUnit &Unit,
+                      const DWARFDebugInfoEntryMinimal &DIE,
+                      CompileUnit *&ReferencedCU);
+
+  CompileUnit *getUnitForOffset(unsigned Offset);
+
+  void reportWarning(const Twine &Warning, const DWARFUnit *Unit = nullptr,
+                     const DWARFDebugInfoEntryMinimal *DIE = nullptr);
+  /// @}
+
+private:
+  std::string OutputFilename;
+  bool Verbose;
+  BinaryHolder BinHolder;
+
+  /// The units of the current debug map object.
+  std::vector<CompileUnit> Units;
+
+  /// The debug map object curently under consideration.
+  DebugMapObject *CurrentDebugObject;
+};
+
+/// \brief Similar to DWARFUnitSection::getUnitForOffset(), but
+/// returning our CompileUnit object instead.
+CompileUnit *DwarfLinker::getUnitForOffset(unsigned Offset) {
+  auto CU =
+      std::upper_bound(Units.begin(), Units.end(), Offset,
+                       [](uint32_t LHS, const CompileUnit &RHS) {
+                         return LHS < RHS.getOrigUnit().getNextUnitOffset();
+                       });
+  return CU != Units.end() ? &*CU : nullptr;
+}
+
+/// \brief Resolve the DIE attribute reference that has been
+/// extracted in \p RefValue. The resulting DIE migh be in another
+/// CompileUnit which is stored into \p ReferencedCU.
+/// \returns null if resolving fails for any reason.
+const DWARFDebugInfoEntryMinimal *DwarfLinker::resolveDIEReference(
+    DWARFFormValue &RefValue, const DWARFUnit &Unit,
+    const DWARFDebugInfoEntryMinimal &DIE, CompileUnit *&RefCU) {
+  assert(RefValue.isFormClass(DWARFFormValue::FC_Reference));
+  uint64_t RefOffset = *RefValue.getAsReference(&Unit);
+
+  if ((RefCU = getUnitForOffset(RefOffset)))
+    if (const auto *RefDie = RefCU->getOrigUnit().getDIEForOffset(RefOffset))
+      return RefDie;
+
+  reportWarning("could not find referenced DIE", &Unit, &DIE);
+  return nullptr;
+}
+
+/// \brief Report a warning to the user, optionaly including
+/// information about a specific \p DIE related to the warning.
+void DwarfLinker::reportWarning(const Twine &Warning, const DWARFUnit *Unit,
+                                const DWARFDebugInfoEntryMinimal *DIE) {
+  if (CurrentDebugObject)
+    errs() << Twine("while processing ") +
+                  CurrentDebugObject->getObjectFilename() + ":\n";
+  errs() << Twine("warning: ") + Warning + "\n";
+
+  if (!Verbose || !DIE)
+    return;
+
+  errs() << "    in DIE:\n";
+  DIE->dump(errs(), const_cast<DWARFUnit *>(Unit), 0 /* RecurseDepth */,
+            6 /* Indent */);
+}
+
+/// \brief Recursive helper to gather the child->parent relationships in the
+/// original compile unit.
+static void gatherDIEParents(const DWARFDebugInfoEntryMinimal *DIE,
+                             unsigned ParentIdx, CompileUnit &CU) {
+  unsigned MyIdx = CU.getOrigUnit().getDIEIndex(DIE);
+  CU.getInfo(MyIdx).ParentIdx = ParentIdx;
+
+  if (DIE->hasChildren())
+    for (auto *Child = DIE->getFirstChild(); Child && !Child->isNULL();
+         Child = Child->getSibling())
+      gatherDIEParents(Child, MyIdx, CU);
+}
+
+static bool dieNeedsChildrenToBeMeaningful(uint32_t Tag) {
+  switch (Tag) {
+  default:
+    return false;
+  case dwarf::DW_TAG_subprogram:
+  case dwarf::DW_TAG_lexical_block:
+  case dwarf::DW_TAG_subroutine_type:
+  case dwarf::DW_TAG_structure_type:
+  case dwarf::DW_TAG_class_type:
+  case dwarf::DW_TAG_union_type:
+    return true;
+  }
+  llvm_unreachable("Invalid Tag");
+}
+
+void DwarfLinker::startDebugObject(DWARFContext &Dwarf) {
+  Units.reserve(Dwarf.getNumCompileUnits());
+  NextValidReloc = 0;
+}
+
+void DwarfLinker::endDebugObject() {
+  Units.clear();
+  ValidRelocs.clear();
+}
+
+/// \brief Iterate over the relocations of the given \p Section and
+/// store the ones that correspond to debug map entries into the
+/// ValidRelocs array.
+void DwarfLinker::findValidRelocsMachO(const object::SectionRef &Section,
+                                       const object::MachOObjectFile &Obj,
+                                       const DebugMapObject &DMO) {
+  StringRef Contents;
+  Section.getContents(Contents);
+  DataExtractor Data(Contents, Obj.isLittleEndian(), 0);
+
+  for (const object::RelocationRef &Reloc : Section.relocations()) {
+    object::DataRefImpl RelocDataRef = Reloc.getRawDataRefImpl();
+    MachO::any_relocation_info MachOReloc = Obj.getRelocation(RelocDataRef);
+    unsigned RelocSize = 1 << Obj.getAnyRelocationLength(MachOReloc);
+    uint64_t Offset64;
+    if ((RelocSize != 4 && RelocSize != 8) || Reloc.getOffset(Offset64)) {
+      reportWarning(" unsupported relocation in debug_info section.");
+      continue;
+    }
+    uint32_t Offset = Offset64;
+    // Mach-o uses REL relocations, the addend is at the relocation offset.
+    uint64_t Addend = Data.getUnsigned(&Offset, RelocSize);
+
+    auto Sym = Reloc.getSymbol();
+    if (Sym != Obj.symbol_end()) {
+      StringRef SymbolName;
+      if (Sym->getName(SymbolName)) {
+        reportWarning("error getting relocation symbol name.");
+        continue;
+      }
+      if (const auto *Mapping = DMO.lookupSymbol(SymbolName))
+        ValidRelocs.emplace_back(Offset64, RelocSize, Addend, Mapping);
+    } else if (const auto *Mapping = DMO.lookupObjectAddress(Addend)) {
+      // Do not store the addend. The addend was the address of the
+      // symbol in the object file, the address in the binary that is
+      // stored in the debug map doesn't need to be offseted.
+      ValidRelocs.emplace_back(Offset64, RelocSize, 0, Mapping);
+    }
+  }
+}
+
+/// \brief Dispatch the valid relocation finding logic to the
+/// appropriate handler depending on the object file format.
+bool DwarfLinker::findValidRelocs(const object::SectionRef &Section,
+                                  const object::ObjectFile &Obj,
+                                  const DebugMapObject &DMO) {
+  // Dispatch to the right handler depending on the file type.
+  if (auto *MachOObj = dyn_cast<object::MachOObjectFile>(&Obj))
+    findValidRelocsMachO(Section, *MachOObj, DMO);
+  else
+    reportWarning(Twine("unsupported object file type: ") + Obj.getFileName());
+
+  if (ValidRelocs.empty())
+    return false;
+
+  // Sort the relocations by offset. We will walk the DIEs linearly in
+  // the file, this allows us to just keep an index in the relocation
+  // array that we advance during our walk, rather than resorting to
+  // some associative container. See DwarfLinker::NextValidReloc.
+  std::sort(ValidRelocs.begin(), ValidRelocs.end());
+  return true;
+}
+
+/// \brief Look for relocations in the debug_info section that match
+/// entries in the debug map. These relocations will drive the Dwarf
+/// link by indicating which DIEs refer to symbols present in the
+/// linked binary.
+/// \returns wether there are any valid relocations in the debug info.
+bool DwarfLinker::findValidRelocsInDebugInfo(const object::ObjectFile &Obj,
+                                             const DebugMapObject &DMO) {
+  // Find the debug_info section.
+  for (const object::SectionRef &Section : Obj.sections()) {
+    StringRef SectionName;
+    Section.getName(SectionName);
+    SectionName = SectionName.substr(SectionName.find_first_not_of("._"));
+    if (SectionName != "debug_info")
+      continue;
+    return findValidRelocs(Section, Obj, DMO);
+  }
+  return false;
+}
+
+/// \brief Checks that there is a relocation against an actual debug
+/// map entry between \p StartOffset and \p NextOffset.
+///
+/// This function must be called with offsets in strictly ascending
+/// order because it never looks back at relocations it already 'went past'.
+/// \returns true and sets Info.InDebugMap if it is the case.
+bool DwarfLinker::hasValidRelocation(uint32_t StartOffset, uint32_t EndOffset,
+                                     CompileUnit::DIEInfo &Info) {
+  assert(NextValidReloc == 0 ||
+         StartOffset > ValidRelocs[NextValidReloc - 1].Offset);
+  if (NextValidReloc >= ValidRelocs.size())
+    return false;
+
+  uint64_t RelocOffset = ValidRelocs[NextValidReloc].Offset;
+
+  // We might need to skip some relocs that we didn't consider. For
+  // example the high_pc of a discarded DIE might contain a reloc that
+  // is in the list because it actually corresponds to the start of a
+  // function that is in the debug map.
+  while (RelocOffset < StartOffset && NextValidReloc < ValidRelocs.size() - 1)
+    RelocOffset = ValidRelocs[++NextValidReloc].Offset;
+
+  if (RelocOffset < StartOffset || RelocOffset >= EndOffset)
+    return false;
+
+  const auto &ValidReloc = ValidRelocs[NextValidReloc++];
+  if (Verbose)
+    outs() << "Found valid debug map entry: " << ValidReloc.Mapping->getKey()
+           << " " << format("\t%016" PRIx64 " => %016" PRIx64,
+                            ValidReloc.Mapping->getValue().ObjectAddress,
+                            ValidReloc.Mapping->getValue().BinaryAddress);
+
+  Info.Address =
+      ValidReloc.Mapping->getValue().BinaryAddress + ValidReloc.Addend;
+  Info.InDebugMap = true;
+  return true;
+}
+
+/// \brief Get the starting and ending (exclusive) offset for the
+/// attribute with index \p Idx descibed by \p Abbrev. \p Offset is
+/// supposed to point to the position of the first attribute described
+/// by \p Abbrev.
+/// \return [StartOffset, EndOffset) as a pair.
+static std::pair<uint32_t, uint32_t>
+getAttributeOffsets(const DWARFAbbreviationDeclaration *Abbrev, unsigned Idx,
+                    unsigned Offset, const DWARFUnit &Unit) {
+  DataExtractor Data = Unit.getDebugInfoExtractor();
+
+  for (unsigned i = 0; i < Idx; ++i)
+    DWARFFormValue::skipValue(Abbrev->getFormByIndex(i), Data, &Offset, &Unit);
+
+  uint32_t End = Offset;
+  DWARFFormValue::skipValue(Abbrev->getFormByIndex(Idx), Data, &End, &Unit);
+
+  return std::make_pair(Offset, End);
+}
+
+/// \brief Check if a variable describing DIE should be kept.
+/// \returns updated TraversalFlags.
+unsigned DwarfLinker::shouldKeepVariableDIE(
+    const DWARFDebugInfoEntryMinimal &DIE, CompileUnit &Unit,
+    CompileUnit::DIEInfo &MyInfo, unsigned Flags) {
+  const auto *Abbrev = DIE.getAbbreviationDeclarationPtr();
+
+  // Global variables with constant value can always be kept.
+  if (!(Flags & TF_InFunctionScope) &&
+      Abbrev->findAttributeIndex(dwarf::DW_AT_const_value) != -1U) {
+    MyInfo.InDebugMap = true;
+    return Flags | TF_Keep;
+  }
+
+  uint32_t LocationIdx = Abbrev->findAttributeIndex(dwarf::DW_AT_location);
+  if (LocationIdx == -1U)
+    return Flags;
+
+  uint32_t Offset = DIE.getOffset() + getULEB128Size(Abbrev->getCode());
+  const DWARFUnit &OrigUnit = Unit.getOrigUnit();
+  uint32_t LocationOffset, LocationEndOffset;
+  std::tie(LocationOffset, LocationEndOffset) =
+      getAttributeOffsets(Abbrev, LocationIdx, Offset, OrigUnit);
+
+  // See if there is a relocation to a valid debug map entry inside
+  // this variable's location. The order is important here. We want to
+  // always check in the variable has a valid relocation, so that the
+  // DIEInfo is filled. However, we don't want a static variable in a
+  // function to force us to keep the enclosing function.
+  if (!hasValidRelocation(LocationOffset, LocationEndOffset, MyInfo) ||
+      (Flags & TF_InFunctionScope))
+    return Flags;
+
+  if (Verbose)
+    DIE.dump(outs(), const_cast<DWARFUnit *>(&OrigUnit), 0, 8 /* Indent */);
+
+  return Flags | TF_Keep;
+}
+
+/// \brief Check if a function describing DIE should be kept.
+/// \returns updated TraversalFlags.
+unsigned DwarfLinker::shouldKeepSubprogramDIE(
+    const DWARFDebugInfoEntryMinimal &DIE, CompileUnit &Unit,
+    CompileUnit::DIEInfo &MyInfo, unsigned Flags) {
+  const auto *Abbrev = DIE.getAbbreviationDeclarationPtr();
+
+  Flags |= TF_InFunctionScope;
+
+  uint32_t LowPcIdx = Abbrev->findAttributeIndex(dwarf::DW_AT_low_pc);
+  if (LowPcIdx == -1U)
+    return Flags;
+
+  uint32_t Offset = DIE.getOffset() + getULEB128Size(Abbrev->getCode());
+  const DWARFUnit &OrigUnit = Unit.getOrigUnit();
+  uint32_t LowPcOffset, LowPcEndOffset;
+  std::tie(LowPcOffset, LowPcEndOffset) =
+      getAttributeOffsets(Abbrev, LowPcIdx, Offset, OrigUnit);
+
+  uint64_t LowPc =
+      DIE.getAttributeValueAsAddress(&OrigUnit, dwarf::DW_AT_low_pc, -1ULL);
+  assert(LowPc != -1ULL && "low_pc attribute is not an address.");
+  if (LowPc == -1ULL ||
+      !hasValidRelocation(LowPcOffset, LowPcEndOffset, MyInfo))
+    return Flags;
+
+  if (Verbose)
+    DIE.dump(outs(), const_cast<DWARFUnit *>(&OrigUnit), 0, 8 /* Indent */);
+
+  return Flags | TF_Keep;
+}
+
+/// \brief Check if a DIE should be kept.
+/// \returns updated TraversalFlags.
+unsigned DwarfLinker::shouldKeepDIE(const DWARFDebugInfoEntryMinimal &DIE,
+                                    CompileUnit &Unit,
+                                    CompileUnit::DIEInfo &MyInfo,
+                                    unsigned Flags) {
+  switch (DIE.getTag()) {
+  case dwarf::DW_TAG_constant:
+  case dwarf::DW_TAG_variable:
+    return shouldKeepVariableDIE(DIE, Unit, MyInfo, Flags);
+  case dwarf::DW_TAG_subprogram:
+    return shouldKeepSubprogramDIE(DIE, Unit, MyInfo, Flags);
+  case dwarf::DW_TAG_module:
+  case dwarf::DW_TAG_imported_module:
+  case dwarf::DW_TAG_imported_declaration:
+  case dwarf::DW_TAG_imported_unit:
+    // We always want to keep these.
+    return Flags | TF_Keep;
+  }
+
+  return Flags;
+}
+
+
+/// \brief Mark the passed DIE as well as all the ones it depends on
+/// as kept.
+///
+/// This function is called by lookForDIEsToKeep on DIEs that are
+/// newly discovered to be needed in the link. It recursively calls
+/// back to lookForDIEsToKeep while adding TF_DependencyWalk to the
+/// TraversalFlags to inform it that it's not doing the primary DIE
+/// tree walk.
+void DwarfLinker::keepDIEAndDenpendencies(const DWARFDebugInfoEntryMinimal &DIE,
+                                          CompileUnit::DIEInfo &MyInfo,
+                                          const DebugMapObject &DMO,
+                                          CompileUnit &CU, unsigned Flags) {
+  const DWARFUnit &Unit = CU.getOrigUnit();
+  MyInfo.Keep = true;
+
+  // First mark all the parent chain as kept.
+  unsigned AncestorIdx = MyInfo.ParentIdx;
+  while (!CU.getInfo(AncestorIdx).Keep) {
+    lookForDIEsToKeep(*Unit.getDIEAtIndex(AncestorIdx), DMO, CU,
+                      TF_ParentWalk | TF_Keep | TF_DependencyWalk);
+    AncestorIdx = CU.getInfo(AncestorIdx).ParentIdx;
+  }
+
+  // Then we need to mark all the DIEs referenced by this DIE's
+  // attributes as kept.
+  DataExtractor Data = Unit.getDebugInfoExtractor();
+  const auto *Abbrev = DIE.getAbbreviationDeclarationPtr();
+  uint32_t Offset = DIE.getOffset() + getULEB128Size(Abbrev->getCode());
+
+  // Mark all DIEs referenced through atttributes as kept.
+  for (const auto &AttrSpec : Abbrev->attributes()) {
+    DWARFFormValue Val(AttrSpec.Form);
+
+    if (!Val.isFormClass(DWARFFormValue::FC_Reference)) {
+      DWARFFormValue::skipValue(AttrSpec.Form, Data, &Offset, &Unit);
+      continue;
+    }
+
+    Val.extractValue(Data, &Offset, &Unit);
+    CompileUnit *ReferencedCU;
+    if (const auto *RefDIE = resolveDIEReference(Val, Unit, DIE, ReferencedCU))
+      lookForDIEsToKeep(*RefDIE, DMO, *ReferencedCU,
+                        TF_Keep | TF_DependencyWalk);
+  }
+}
+
+/// \brief Recursively walk the \p DIE tree and look for DIEs to
+/// keep. Store that information in \p CU's DIEInfo.
+///
+/// This function is the entry point of the DIE selection
+/// algorithm. It is expected to walk the DIE tree in file order and
+/// (though the mediation of its helper) call hasValidRelocation() on
+/// each DIE that might be a 'root DIE' (See DwarfLinker class
+/// comment).
+/// While walking the dependencies of root DIEs, this function is
+/// also called, but during these dependency walks the file order is
+/// not respected. The TF_DependencyWalk flag tells us which kind of
+/// traversal we are currently doing.
+void DwarfLinker::lookForDIEsToKeep(const DWARFDebugInfoEntryMinimal &DIE,
+                                    const DebugMapObject &DMO, CompileUnit &CU,
+                                    unsigned Flags) {
+  unsigned Idx = CU.getOrigUnit().getDIEIndex(&DIE);
+  CompileUnit::DIEInfo &MyInfo = CU.getInfo(Idx);
+  bool AlreadyKept = MyInfo.Keep;
+
+  // If the Keep flag is set, we are marking a required DIE's
+  // dependencies. If our target is already marked as kept, we're all
+  // set.
+  if ((Flags & TF_DependencyWalk) && AlreadyKept)
+    return;
+
+  // We must not call shouldKeepDIE while called from keepDIEAndDenpendencies,
+  // because it would screw up the relocation finding logic.
+  if (!(Flags & TF_DependencyWalk))
+    Flags = shouldKeepDIE(DIE, CU, MyInfo, Flags);
+
+  // If it is a newly kept DIE mark it as well as all its dependencies as kept.
+  if (!AlreadyKept && (Flags & TF_Keep))
+    keepDIEAndDenpendencies(DIE, MyInfo, DMO, CU, Flags);
+
+  // The TF_ParentWalk flag tells us that we are currently walking up
+  // the parent chain of a required DIE, and we don't want to mark all
+  // the children of the parents as kept (consider for example a
+  // DW_TAG_namespace node in the parent chain). There are however a
+  // set of DIE types for which we want to ignore that directive and still
+  // walk their children.
+  if (dieNeedsChildrenToBeMeaningful(DIE.getTag()))
+    Flags &= ~TF_ParentWalk;
+
+  if (!DIE.hasChildren() || (Flags & TF_ParentWalk))
+    return;
+
+  for (auto *Child = DIE.getFirstChild(); Child && !Child->isNULL();
+       Child = Child->getSibling())
+    lookForDIEsToKeep(*Child, DMO, CU, Flags);
+}
+
+bool DwarfLinker::link(const DebugMap &Map) {
+
+  if (Map.begin() == Map.end()) {
+    errs() << "Empty debug map.\n";
+    return false;
+  }
+
+  for (const auto &Obj : Map.objects()) {
+    CurrentDebugObject = Obj.get();
+
+    if (Verbose)
+      outs() << "DEBUG MAP OBJECT: " << Obj->getObjectFilename() << "\n";
+    auto ErrOrObj = BinHolder.GetObjectFile(Obj->getObjectFilename());
+    if (std::error_code EC = ErrOrObj.getError()) {
+      reportWarning(Twine(Obj->getObjectFilename()) + ": " + EC.message());
+      continue;
+    }
+
+    // Look for relocations that correspond to debug map entries.
+    if (!findValidRelocsInDebugInfo(*ErrOrObj, *Obj)) {
+      if (Verbose)
+        outs() << "No valid relocations found. Skipping.\n";
+      continue;
+    }
+
+    // Setup access to the debug info.
+    DWARFContextInMemory DwarfContext(*ErrOrObj);
+    startDebugObject(DwarfContext);
+
+    // In a first phase, just read in the debug info and store the DIE
+    // parent links that we will use during the next phase.
+    for (const auto &CU : DwarfContext.compile_units()) {
+      auto *CUDie = CU->getCompileUnitDIE(false);
+      if (Verbose) {
+        outs() << "Input compilation unit:";
+        CUDie->dump(outs(), CU.get(), 0);
+      }
+      Units.emplace_back(*CU);
+      gatherDIEParents(CUDie, 0, Units.back());
+    }
+
+    // Then mark all the DIEs that need to be present in the linked
+    // output and collect some information about them. Note that this
+    // loop can not be merged with the previous one becaue cross-cu
+    // references require the ParentIdx to be setup for every CU in
+    // the object file before calling this.
+    for (auto &CurrentUnit : Units)
+      lookForDIEsToKeep(*CurrentUnit.getOrigUnit().getCompileUnitDIE(), *Obj,
+                        CurrentUnit, 0);
+
+    // Clean-up before starting working on the next object.
+    endDebugObject();
+  }
+
+  return true;
+}
+}
+
+bool linkDwarf(StringRef OutputFilename, const DebugMap &DM, bool Verbose) {
+  DwarfLinker Linker(OutputFilename, Verbose);
+  return Linker.link(DM);
+}
+}
+}
diff --git a/tools/dsymutil/LLVMBuild.txt b/tools/dsymutil/LLVMBuild.txt
new file mode 100644
index 0000000..c995291
--- /dev/null
+++ b/tools/dsymutil/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./tools/dsymutil/LLVMBuild.txt ---------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Tool
+name = llvm-dsymutil
+parent = Tools
+required_libraries = DebugInfoDWARF Object Support
diff --git a/tools/dsymutil/MachODebugMapParser.cpp b/tools/dsymutil/MachODebugMapParser.cpp
new file mode 100644
index 0000000..7bb0011
--- /dev/null
+++ b/tools/dsymutil/MachODebugMapParser.cpp
@@ -0,0 +1,241 @@
+//===- tools/dsymutil/MachODebugMapParser.cpp - Parse STABS debug maps ----===//
+//
+//                             The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryHolder.h"
+#include "DebugMap.h"
+#include "dsymutil.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace {
+using namespace llvm;
+using namespace llvm::dsymutil;
+using namespace llvm::object;
+
+class MachODebugMapParser {
+public:
+  MachODebugMapParser(StringRef BinaryPath, StringRef PathPrefix = "",
+                      bool Verbose = false)
+      : BinaryPath(BinaryPath), PathPrefix(PathPrefix),
+        MainBinaryHolder(Verbose), CurrentObjectHolder(Verbose),
+        CurrentDebugMapObject(nullptr) {}
+
+  /// \brief Parses and returns the DebugMap of the input binary.
+  /// \returns an error in case the provided BinaryPath doesn't exist
+  /// or isn't of a supported type.
+  ErrorOr<std::unique_ptr<DebugMap>> parse();
+
+private:
+  std::string BinaryPath;
+  std::string PathPrefix;
+
+  /// Owns the MemoryBuffer for the main binary.
+  BinaryHolder MainBinaryHolder;
+  /// Map of the binary symbol addresses.
+  StringMap<uint64_t> MainBinarySymbolAddresses;
+  StringRef MainBinaryStrings;
+  /// The constructed DebugMap.
+  std::unique_ptr<DebugMap> Result;
+
+  /// Owns the MemoryBuffer for the currently handled object file.
+  BinaryHolder CurrentObjectHolder;
+  /// Map of the currently processed object file symbol addresses.
+  StringMap<uint64_t> CurrentObjectAddresses;
+  /// Element of the debug map corresponfing to the current object file.
+  DebugMapObject *CurrentDebugMapObject;
+
+  void switchToNewDebugMapObject(StringRef Filename);
+  void resetParserState();
+  uint64_t getMainBinarySymbolAddress(StringRef Name);
+  void loadMainBinarySymbols();
+  void loadCurrentObjectFileSymbols();
+  void handleStabSymbolTableEntry(uint32_t StringIndex, uint8_t Type,
+                                  uint8_t SectionIndex, uint16_t Flags,
+                                  uint64_t Value);
+
+  template <typename STEType> void handleStabDebugMapEntry(const STEType &STE) {
+    handleStabSymbolTableEntry(STE.n_strx, STE.n_type, STE.n_sect, STE.n_desc,
+                               STE.n_value);
+  }
+};
+
+static void Warning(const Twine &Msg) { errs() << "warning: " + Msg + "\n"; }
+}
+
+/// Reset the parser state coresponding to the current object
+/// file. This is to be called after an object file is finished
+/// processing.
+void MachODebugMapParser::resetParserState() {
+  CurrentObjectAddresses.clear();
+  CurrentDebugMapObject = nullptr;
+}
+
+/// Create a new DebugMapObject. This function resets the state of the
+/// parser that was referring to the last object file and sets
+/// everything up to add symbols to the new one.
+void MachODebugMapParser::switchToNewDebugMapObject(StringRef Filename) {
+  resetParserState();
+
+  SmallString<80> Path(PathPrefix);
+  sys::path::append(Path, Filename);
+
+  auto MachOOrError = CurrentObjectHolder.GetFileAs<MachOObjectFile>(Path);
+  if (auto Error = MachOOrError.getError()) {
+    Warning(Twine("cannot open debug object \"") + Path.str() + "\": " +
+            Error.message() + "\n");
+    return;
+  }
+
+  loadCurrentObjectFileSymbols();
+  CurrentDebugMapObject = &Result->addDebugMapObject(Path);
+}
+
+static Triple getTriple(const object::MachOObjectFile &Obj) {
+  Triple TheTriple("unknown-unknown-unknown");
+  TheTriple.setArch(Triple::ArchType(Obj.getArch()));
+  TheTriple.setObjectFormat(Triple::MachO);
+  return TheTriple;
+}
+
+/// This main parsing routine tries to open the main binary and if
+/// successful iterates over the STAB entries. The real parsing is
+/// done in handleStabSymbolTableEntry.
+ErrorOr<std::unique_ptr<DebugMap>> MachODebugMapParser::parse() {
+  auto MainBinOrError = MainBinaryHolder.GetFileAs<MachOObjectFile>(BinaryPath);
+  if (auto Error = MainBinOrError.getError())
+    return Error;
+
+  const MachOObjectFile &MainBinary = *MainBinOrError;
+  loadMainBinarySymbols();
+  Result = make_unique<DebugMap>(getTriple(MainBinary));
+  MainBinaryStrings = MainBinary.getStringTableData();
+  for (const SymbolRef &Symbol : MainBinary.symbols()) {
+    const DataRefImpl &DRI = Symbol.getRawDataRefImpl();
+    if (MainBinary.is64Bit())
+      handleStabDebugMapEntry(MainBinary.getSymbol64TableEntry(DRI));
+    else
+      handleStabDebugMapEntry(MainBinary.getSymbolTableEntry(DRI));
+  }
+
+  resetParserState();
+  return std::move(Result);
+}
+
+/// Interpret the STAB entries to fill the DebugMap.
+void MachODebugMapParser::handleStabSymbolTableEntry(uint32_t StringIndex,
+                                                     uint8_t Type,
+                                                     uint8_t SectionIndex,
+                                                     uint16_t Flags,
+                                                     uint64_t Value) {
+  if (!(Type & MachO::N_STAB))
+    return;
+
+  const char *Name = &MainBinaryStrings.data()[StringIndex];
+
+  // An N_OSO entry represents the start of a new object file description.
+  if (Type == MachO::N_OSO)
+    return switchToNewDebugMapObject(Name);
+
+  // If the last N_OSO object file wasn't found,
+  // CurrentDebugMapObject will be null. Do not update anything
+  // until we find the next valid N_OSO entry.
+  if (!CurrentDebugMapObject)
+    return;
+
+  switch (Type) {
+  case MachO::N_GSYM:
+    // This is a global variable. We need to query the main binary
+    // symbol table to find its address as it might not be in the
+    // debug map (for common symbols).
+    Value = getMainBinarySymbolAddress(Name);
+    if (Value == UnknownAddressOrSize)
+      return;
+    break;
+  case MachO::N_FUN:
+    // Functions are scopes in STABS. They have an end marker that we
+    // need to ignore.
+    if (Name[0] == '\0')
+      return;
+    break;
+  case MachO::N_STSYM:
+    break;
+  default:
+    return;
+  }
+
+  auto ObjectSymIt = CurrentObjectAddresses.find(Name);
+  if (ObjectSymIt == CurrentObjectAddresses.end())
+    return Warning("could not find object file symbol for symbol " +
+                   Twine(Name));
+  if (!CurrentDebugMapObject->addSymbol(Name, ObjectSymIt->getValue(), Value))
+    return Warning(Twine("failed to insert symbol '") + Name +
+                   "' in the debug map.");
+}
+
+/// Load the current object file symbols into CurrentObjectAddresses.
+void MachODebugMapParser::loadCurrentObjectFileSymbols() {
+  CurrentObjectAddresses.clear();
+
+  for (auto Sym : CurrentObjectHolder.Get().symbols()) {
+    StringRef Name;
+    uint64_t Addr;
+    if (Sym.getAddress(Addr) || Addr == UnknownAddressOrSize ||
+        Sym.getName(Name))
+      continue;
+    CurrentObjectAddresses[Name] = Addr;
+  }
+}
+
+/// Lookup a symbol address in the main binary symbol table. The
+/// parser only needs to query common symbols, thus not every symbol's
+/// address is available through this function.
+uint64_t MachODebugMapParser::getMainBinarySymbolAddress(StringRef Name) {
+  auto Sym = MainBinarySymbolAddresses.find(Name);
+  if (Sym == MainBinarySymbolAddresses.end())
+    return UnknownAddressOrSize;
+  return Sym->second;
+}
+
+/// Load the interesting main binary symbols' addresses into
+/// MainBinarySymbolAddresses.
+void MachODebugMapParser::loadMainBinarySymbols() {
+  const MachOObjectFile &MainBinary = MainBinaryHolder.GetAs<MachOObjectFile>();
+  section_iterator Section = MainBinary.section_end();
+  for (const auto &Sym : MainBinary.symbols()) {
+    SymbolRef::Type Type;
+    // Skip undefined and STAB entries.
+    if (Sym.getType(Type) || (Type & SymbolRef::ST_Debug) ||
+        (Type & SymbolRef::ST_Unknown))
+      continue;
+    StringRef Name;
+    uint64_t Addr;
+    // The only symbols of interest are the global variables. These
+    // are the only ones that need to be queried because the address
+    // of common data won't be described in the debug map. All other
+    // addresses should be fetched for the debug map.
+    if (Sym.getAddress(Addr) || Addr == UnknownAddressOrSize ||
+        !(Sym.getFlags() & SymbolRef::SF_Global) || Sym.getSection(Section) ||
+        Section->isText() || Sym.getName(Name) || Name.size() == 0 ||
+        Name[0] == '\0')
+      continue;
+    MainBinarySymbolAddresses[Name] = Addr;
+  }
+}
+
+namespace llvm {
+namespace dsymutil {
+llvm::ErrorOr<std::unique_ptr<DebugMap>> parseDebugMap(StringRef InputFile,
+                                                       StringRef PrependPath,
+                                                       bool Verbose) {
+  MachODebugMapParser Parser(InputFile, PrependPath, Verbose);
+  return Parser.parse();
+}
+}
+}
diff --git a/tools/dsymutil/Makefile b/tools/dsymutil/Makefile
new file mode 100644
index 0000000..e8dc569
--- /dev/null
+++ b/tools/dsymutil/Makefile
@@ -0,0 +1,17 @@
+##===- tools/dsymutil/Makefile -----------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../..
+TOOLNAME := llvm-dsymutil
+LINK_COMPONENTS := DebugInfoDWARF Object Support
+
+# This tool has no plugins, optimize startup time.
+TOOL_NO_EXPORTS := 1
+
+include $(LEVEL)/Makefile.common
diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp
new file mode 100644
index 0000000..2b4fcfe
--- /dev/null
+++ b/tools/dsymutil/dsymutil.cpp
@@ -0,0 +1,71 @@
+//===-- dsymutil.cpp - Debug info dumping utility for llvm ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This program is a utility that aims to be a dropin replacement for
+// Darwin's dsymutil.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DebugMap.h"
+#include "dsymutil.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Options.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+using namespace llvm::dsymutil;
+
+namespace {
+using namespace llvm::cl;
+
+static opt<std::string> InputFile(Positional, desc("<input file>"),
+                                  init("a.out"));
+
+static opt<std::string> OsoPrependPath("oso-prepend-path",
+                                       desc("Specify a directory to prepend "
+                                            "to the paths of object files."),
+                                       value_desc("path"));
+
+static opt<bool> Verbose("v", desc("Verbosity level"), init(false));
+
+static opt<bool>
+    ParseOnly("parse-only",
+              desc("Only parse the debug map, do not actaully link "
+                   "the DWARF."),
+              init(false));
+}
+
+int main(int argc, char **argv) {
+  llvm::sys::PrintStackTraceOnErrorSignal();
+  llvm::PrettyStackTraceProgram StackPrinter(argc, argv);
+  llvm::llvm_shutdown_obj Shutdown;
+
+  llvm::cl::ParseCommandLineOptions(argc, argv, "llvm dsymutil\n");
+  auto DebugMapPtrOrErr = parseDebugMap(InputFile, OsoPrependPath, Verbose);
+
+  if (auto EC = DebugMapPtrOrErr.getError()) {
+    llvm::errs() << "error: cannot parse the debug map for \"" << InputFile
+                 << "\": " << EC.message() << '\n';
+    return 1;
+  }
+
+  if (Verbose)
+    (*DebugMapPtrOrErr)->print(llvm::outs());
+
+  if (ParseOnly)
+    return 0;
+
+  std::string OutputBasename(InputFile);
+  if (OutputBasename == "-")
+    OutputBasename = "a.out";
+
+  return !linkDwarf(OutputBasename + ".dwarf", **DebugMapPtrOrErr, Verbose);
+}
diff --git a/tools/dsymutil/dsymutil.h b/tools/dsymutil/dsymutil.h
new file mode 100644
index 0000000..9203bea
--- /dev/null
+++ b/tools/dsymutil/dsymutil.h
@@ -0,0 +1,39 @@
+//===- tools/dsymutil/dsymutil.h - dsymutil high-level functionality ------===//
+//
+//                             The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+///
+/// This file contains the class declaration for the code that parses STABS
+/// debug maps that are embedded in the binaries symbol tables.
+///
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TOOLS_DSYMUTIL_DSYMUTIL_H
+#define LLVM_TOOLS_DSYMUTIL_DSYMUTIL_H
+
+#include "DebugMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ErrorOr.h"
+#include <memory>
+
+namespace llvm {
+namespace dsymutil {
+/// \brief Extract the DebugMap from the given file.
+/// The file has to be a MachO object file.
+llvm::ErrorOr<std::unique_ptr<DebugMap>>
+parseDebugMap(StringRef InputFile, StringRef PrependPath = "",
+              bool Verbose = false);
+
+/// \brief Link the Dwarf debuginfo as directed by the passed DebugMap
+/// \p DM into a DwarfFile named \p OutputFilename.
+/// \returns false if the link failed.
+bool linkDwarf(StringRef OutputFilename, const DebugMap &DM,
+               bool Verbose = false);
+}
+}
+#endif // LLVM_TOOLS_DSYMUTIL_DSYMUTIL_H
diff --git a/tools/gold/CMakeLists.txt b/tools/gold/CMakeLists.txt
index 3033010..1a6169d 100644
--- a/tools/gold/CMakeLists.txt
+++ b/tools/gold/CMakeLists.txt
@@ -1,13 +1,6 @@
-set(LLVM_BINUTILS_INCDIR "" CACHE PATH
-  "PATH to binutils/include containing plugin-api.h for gold plugin.")
-
 set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/gold.exports)
 
-if( NOT LLVM_BINUTILS_INCDIR )
-  # Nothing to say.
-elseif( NOT EXISTS "${LLVM_BINUTILS_INCDIR}/plugin-api.h" )
-  message(STATUS "plugin-api.h not found. gold plugin excluded from the build.")
-else()
+if( LLVM_ENABLE_PIC AND LLVM_BINUTILS_INCDIR )
   include_directories( ${LLVM_BINUTILS_INCDIR} )
 
   # Because off_t is used in the public API, the largefile parts are required for
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index cfda6d2..e3a57b5 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -15,23 +15,28 @@
 #include "llvm/Config/config.h" // plugin-api.h requires HAVE_STDINT_H
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringSet.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/IR/AutoUpgrade.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Object/IRObjectFile.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/Host.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Utils/GlobalStatus.h"
@@ -78,9 +83,14 @@ static std::vector<std::string> Cleanup;
 static llvm::TargetOptions TargetOpts;
 
 namespace options {
-  enum generate_bc { BC_NO, BC_ONLY, BC_SAVE_TEMPS };
+  enum OutputType {
+    OT_NORMAL,
+    OT_DISABLE,
+    OT_BC_ONLY,
+    OT_SAVE_TEMPS
+  };
   static bool generate_api_file = false;
-  static generate_bc generate_bc_file = BC_NO;
+  static OutputType TheOutputType = OT_NORMAL;
   static std::string obj_path;
   static std::string extra_library_path;
   static std::string triple;
@@ -109,9 +119,11 @@ namespace options {
     } else if (opt.startswith("obj-path=")) {
       obj_path = opt.substr(strlen("obj-path="));
     } else if (opt == "emit-llvm") {
-      generate_bc_file = BC_ONLY;
+      TheOutputType = OT_BC_ONLY;
     } else if (opt == "save-temps") {
-      generate_bc_file = BC_SAVE_TEMPS;
+      TheOutputType = OT_SAVE_TEMPS;
+    } else if (opt == "disable-output") {
+      TheOutputType = OT_DISABLE;
     } else {
       // Save this option to pass to the code generator.
       // ParseCommandLineOptions() expects argv[0] to be program name. Lazily
@@ -253,6 +265,44 @@ static const GlobalObject *getBaseObject(const GlobalValue &GV) {
   return cast<GlobalObject>(&GV);
 }
 
+static bool shouldSkip(uint32_t Symflags) {
+  if (!(Symflags & object::BasicSymbolRef::SF_Global))
+    return true;
+  if (Symflags & object::BasicSymbolRef::SF_FormatSpecific)
+    return true;
+  return false;
+}
+
+static void diagnosticHandler(const DiagnosticInfo &DI, void *Context) {
+  if (const auto *BDI = dyn_cast<BitcodeDiagnosticInfo>(&DI)) {
+    std::error_code EC = BDI->getError();
+    if (EC == BitcodeError::InvalidBitcodeSignature)
+      return;
+  }
+
+  std::string ErrStorage;
+  {
+    raw_string_ostream OS(ErrStorage);
+    DiagnosticPrinterRawOStream DP(OS);
+    DI.print(DP);
+  }
+  ld_plugin_level Level;
+  switch (DI.getSeverity()) {
+  case DS_Error:
+    message(LDPL_FATAL, "LLVM gold plugin has failed to create LTO module: %s",
+            ErrStorage.c_str());
+    llvm_unreachable("Fatal doesn't return.");
+  case DS_Warning:
+    Level = LDPL_WARNING;
+    break;
+  case DS_Remark:
+  case DS_Note:
+    Level = LDPL_INFO;
+    break;
+  }
+  message(Level, "LLVM gold plugin: %s",  ErrStorage.c_str());
+}
+
 /// Called by gold to see whether this file is one that our plugin can handle.
 /// We'll try to open it and register all the symbols with add_symbol if
 /// possible.
@@ -286,11 +336,11 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
     BufferRef = Buffer->getMemBufferRef();
   }
 
+  Context.setDiagnosticHandler(diagnosticHandler);
   ErrorOr<std::unique_ptr<object::IRObjectFile>> ObjOrErr =
-      object::IRObjectFile::createIRObjectFile(BufferRef, Context);
+      object::IRObjectFile::create(BufferRef, Context);
   std::error_code EC = ObjOrErr.getError();
-  if (EC == BitcodeError::InvalidBitcodeSignature ||
-      EC == object::object_error::invalid_file_type ||
+  if (EC == object::object_error::invalid_file_type ||
       EC == object::object_error::bitcode_section_not_found)
     return LDPS_OK;
 
@@ -310,10 +360,7 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
 
   for (auto &Sym : Obj->symbols()) {
     uint32_t Symflags = Sym.getFlags();
-    if (!(Symflags & object::BasicSymbolRef::SF_Global))
-      continue;
-
-    if (Symflags & object::BasicSymbolRef::SF_FormatSpecific)
+    if (shouldSkip(Symflags))
       continue;
 
     cf.syms.push_back(ld_plugin_symbol());
@@ -467,51 +514,10 @@ static const char *getResolutionName(ld_plugin_symbol_resolution R) {
   llvm_unreachable("Unknown resolution");
 }
 
-static GlobalObject *makeInternalReplacement(GlobalObject *GO) {
-  Module *M = GO->getParent();
-  GlobalObject *Ret;
-  if (auto *F = dyn_cast<Function>(GO)) {
-    if (F->materialize())
-      message(LDPL_FATAL, "LLVM gold plugin has failed to read a function");
-
-    auto *NewF = Function::Create(F->getFunctionType(), F->getLinkage(),
-                                  F->getName(), M);
-
-    ValueToValueMapTy VM;
-    Function::arg_iterator NewI = NewF->arg_begin();
-    for (auto &Arg : F->args()) {
-      NewI->setName(Arg.getName());
-      VM[&Arg] = NewI;
-      ++NewI;
-    }
-
-    NewF->getBasicBlockList().splice(NewF->end(), F->getBasicBlockList());
-    for (auto &BB : *NewF) {
-      for (auto &Inst : BB)
-        RemapInstruction(&Inst, VM, RF_IgnoreMissingEntries);
-    }
-
-    Ret = NewF;
-    F->deleteBody();
-  } else {
-    auto *Var = cast<GlobalVariable>(GO);
-    Ret = new GlobalVariable(
-        *M, Var->getType()->getElementType(), Var->isConstant(),
-        Var->getLinkage(), Var->getInitializer(), Var->getName(),
-        nullptr, Var->getThreadLocalMode(), Var->getType()->getAddressSpace(),
-        Var->isExternallyInitialized());
-    Var->setInitializer(nullptr);
-  }
-  Ret->copyAttributesFrom(GO);
-  Ret->setLinkage(GlobalValue::InternalLinkage);
-  Ret->setComdat(GO->getComdat());
-
-  return Ret;
-}
-
 namespace {
 class LocalValueMaterializer : public ValueMaterializer {
   DenseSet<GlobalValue *> &Dropped;
+  DenseMap<GlobalObject *, GlobalObject *> LocalVersions;
 
 public:
   LocalValueMaterializer(DenseSet<GlobalValue *> &Dropped) : Dropped(Dropped) {}
@@ -520,13 +526,39 @@ public:
 }
 
 Value *LocalValueMaterializer::materializeValueFor(Value *V) {
-  auto *GV = dyn_cast<GlobalValue>(V);
-  if (!GV)
+  auto *GO = dyn_cast<GlobalObject>(V);
+  if (!GO)
     return nullptr;
-  if (!Dropped.count(GV))
+
+  auto I = LocalVersions.find(GO);
+  if (I != LocalVersions.end())
+    return I->second;
+
+  if (!Dropped.count(GO))
     return nullptr;
-  assert(!isa<GlobalAlias>(GV) && "Found alias point to weak alias.");
-  return makeInternalReplacement(cast<GlobalObject>(GV));
+
+  Module &M = *GO->getParent();
+  GlobalValue::LinkageTypes L = GO->getLinkage();
+  GlobalObject *Declaration;
+  if (auto *F = dyn_cast<Function>(GO)) {
+    Declaration = Function::Create(F->getFunctionType(), L, "", &M);
+  } else {
+    auto *Var = cast<GlobalVariable>(GO);
+    Declaration = new GlobalVariable(M, Var->getType()->getElementType(),
+                                     Var->isConstant(), L,
+                                     /*Initializer*/ nullptr);
+  }
+  Declaration->takeName(GO);
+  Declaration->copyAttributesFrom(GO);
+
+  GO->setLinkage(GlobalValue::InternalLinkage);
+  GO->setName(Declaration->getName());
+  Dropped.erase(GO);
+  GO->replaceAllUsesWith(Declaration);
+
+  LocalVersions[Declaration] = GO;
+
+  return GO;
 }
 
 static Constant *mapConstantToLocalCopy(Constant *C, ValueToValueMapTy &VM,
@@ -534,12 +566,17 @@ static Constant *mapConstantToLocalCopy(Constant *C, ValueToValueMapTy &VM,
   return MapValue(C, VM, RF_IgnoreMissingEntries, nullptr, Materializer);
 }
 
+static void freeSymName(ld_plugin_symbol &Sym) {
+  free(Sym.name);
+  free(Sym.comdat_key);
+  Sym.name = nullptr;
+  Sym.comdat_key = nullptr;
+}
+
 static std::unique_ptr<Module>
-getModuleForFile(LLVMContext &Context, claimed_file &F, raw_fd_ostream *ApiFile,
+getModuleForFile(LLVMContext &Context, claimed_file &F,
+                 ld_plugin_input_file &Info, raw_fd_ostream *ApiFile,
                  StringSet<> &Internalize, StringSet<> &Maybe) {
-  ld_plugin_input_file File;
-  if (get_input_file(F.handle, &File) != LDPS_OK)
-    message(LDPL_FATAL, "Failed to get file information");
 
   if (get_symbols(F.handle, F.syms.size(), &F.syms[0]) != LDPS_OK)
     message(LDPL_FATAL, "Failed to get symbol information");
@@ -548,42 +585,45 @@ getModuleForFile(LLVMContext &Context, claimed_file &F, raw_fd_ostream *ApiFile,
   if (get_view(F.handle, &View) != LDPS_OK)
     message(LDPL_FATAL, "Failed to get a view of file");
 
-  llvm::ErrorOr<MemoryBufferRef> MBOrErr =
-      object::IRObjectFile::findBitcodeInMemBuffer(
-          MemoryBufferRef(StringRef((const char *)View, File.filesize), ""));
-  if (std::error_code EC = MBOrErr.getError())
+  MemoryBufferRef BufferRef(StringRef((const char *)View, Info.filesize),
+                            Info.name);
+  ErrorOr<std::unique_ptr<object::IRObjectFile>> ObjOrErr =
+      object::IRObjectFile::create(BufferRef, Context);
+
+  if (std::error_code EC = ObjOrErr.getError())
     message(LDPL_FATAL, "Could not read bitcode from file : %s",
             EC.message().c_str());
 
-  std::unique_ptr<MemoryBuffer> Buffer =
-      MemoryBuffer::getMemBuffer(MBOrErr->getBuffer(), "", false);
-
-  if (release_input_file(F.handle) != LDPS_OK)
-    message(LDPL_FATAL, "Failed to release file information");
-
-  ErrorOr<Module *> MOrErr = getLazyBitcodeModule(std::move(Buffer), Context);
+  object::IRObjectFile &Obj = **ObjOrErr;
 
-  if (std::error_code EC = MOrErr.getError())
-    message(LDPL_FATAL, "Could not read bitcode from file : %s",
-            EC.message().c_str());
+  Module &M = Obj.getModule();
 
-  std::unique_ptr<Module> M(MOrErr.get());
+  UpgradeDebugInfo(M);
 
   SmallPtrSet<GlobalValue *, 8> Used;
-  collectUsedGlobalVariables(*M, Used, /*CompilerUsed*/ false);
+  collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false);
 
   DenseSet<GlobalValue *> Drop;
   std::vector<GlobalAlias *> KeptAliases;
-  for (ld_plugin_symbol &Sym : F.syms) {
+
+  unsigned SymNum = 0;
+  for (auto &ObjSym : Obj.symbols()) {
+    if (shouldSkip(ObjSym.getFlags()))
+      continue;
+    ld_plugin_symbol &Sym = F.syms[SymNum];
+    ++SymNum;
+
     ld_plugin_symbol_resolution Resolution =
         (ld_plugin_symbol_resolution)Sym.resolution;
 
     if (options::generate_api_file)
       *ApiFile << Sym.name << ' ' << getResolutionName(Resolution) << '\n';
 
-    GlobalValue *GV = M->getNamedValue(Sym.name);
-    if (!GV)
+    GlobalValue *GV = Obj.getSymbolGV(ObjSym.getRawDataRefImpl());
+    if (!GV) {
+      freeSymName(Sym);
       continue; // Asm symbol.
+    }
 
     if (Resolution != LDPR_PREVAILING_DEF_IRONLY && GV->hasCommonLinkage()) {
       // Common linkage is special. There is no single symbol that wins the
@@ -591,6 +631,7 @@ getModuleForFile(LLVMContext &Context, claimed_file &F, raw_fd_ostream *ApiFile,
       // The IR linker does that for us if we just pass it every common GV.
       // We still have to keep track of LDPR_PREVAILING_DEF_IRONLY so we
       // internalize once the IR linker has done its job.
+      freeSymName(Sym);
       continue;
     }
 
@@ -601,17 +642,23 @@ getModuleForFile(LLVMContext &Context, claimed_file &F, raw_fd_ostream *ApiFile,
     case LDPR_RESOLVED_IR:
     case LDPR_RESOLVED_EXEC:
     case LDPR_RESOLVED_DYN:
-    case LDPR_UNDEF:
       assert(GV->isDeclarationForLinker());
       break;
 
+    case LDPR_UNDEF:
+      if (!GV->isDeclarationForLinker()) {
+        assert(GV->hasComdat());
+        Drop.insert(GV);
+      }
+      break;
+
     case LDPR_PREVAILING_DEF_IRONLY: {
       keepGlobalValue(*GV, KeptAliases);
       if (!Used.count(GV)) {
         // Since we use the regular lib/Linker, we cannot just internalize GV
         // now or it will not be copied to the merged module. Instead we force
         // it to be copied and then internalize it.
-        Internalize.insert(Sym.name);
+        Internalize.insert(GV->getName());
       }
       break;
     }
@@ -624,7 +671,7 @@ getModuleForFile(LLVMContext &Context, claimed_file &F, raw_fd_ostream *ApiFile,
       // Gold might have selected a linkonce_odr and preempted a weak_odr.
       // In that case we have to make sure we don't end up internalizing it.
       if (!GV->isDiscardableIfUnused())
-        Maybe.erase(Sym.name);
+        Maybe.erase(GV->getName());
 
       // fall-through
     case LDPR_PREEMPTED_REG:
@@ -637,16 +684,13 @@ getModuleForFile(LLVMContext &Context, claimed_file &F, raw_fd_ostream *ApiFile,
       // and in that module the address might be significant, but that
       // copy will be LDPR_PREEMPTED_IR.
       if (GV->hasLinkOnceODRLinkage())
-        Maybe.insert(Sym.name);
+        Maybe.insert(GV->getName());
       keepGlobalValue(*GV, KeptAliases);
       break;
     }
     }
 
-    free(Sym.name);
-    free(Sym.comdat_key);
-    Sym.name = nullptr;
-    Sym.comdat_key = nullptr;
+    freeSymName(Sym);
   }
 
   ValueToValueMapTy VM;
@@ -656,26 +700,31 @@ getModuleForFile(LLVMContext &Context, claimed_file &F, raw_fd_ostream *ApiFile,
     // expression is being dropped. If that is the case, that GV must be copied.
     Constant *Aliasee = GA->getAliasee();
     Constant *Replacement = mapConstantToLocalCopy(Aliasee, VM, &Materializer);
-    if (Aliasee != Replacement)
-      GA->setAliasee(Replacement);
+    GA->setAliasee(Replacement);
   }
 
   for (auto *GV : Drop)
     drop(*GV);
 
-  return M;
+  return Obj.takeModule();
 }
 
 static void runLTOPasses(Module &M, TargetMachine &TM) {
-  PassManager passes;
+  if (const DataLayout *DL = TM.getDataLayout())
+    M.setDataLayout(DL);
+
+  legacy::PassManager passes;
+  passes.add(new DataLayoutPass());
+  passes.add(createTargetTransformInfoWrapperPass(TM.getTargetIRAnalysis()));
+
   PassManagerBuilder PMB;
-  PMB.LibraryInfo = new TargetLibraryInfo(Triple(TM.getTargetTriple()));
+  PMB.LibraryInfo = new TargetLibraryInfoImpl(Triple(TM.getTargetTriple()));
   PMB.Inliner = createFunctionInliningPass();
   PMB.VerifyInput = true;
   PMB.VerifyOutput = true;
   PMB.LoopVectorize = true;
   PMB.SLPVectorize = true;
-  PMB.populateLTOPassManager(passes, &TM);
+  PMB.populateLTOPassManager(passes);
   passes.run(M);
 }
 
@@ -711,10 +760,10 @@ static void codegen(Module &M) {
 
   runLTOPasses(M, *TM);
 
-  if (options::generate_bc_file == options::BC_SAVE_TEMPS)
+  if (options::TheOutputType == options::OT_SAVE_TEMPS)
     saveBCFile(output_name + ".opt.bc", M);
 
-  PassManager CodeGenPasses;
+  legacy::PassManager CodeGenPasses;
   CodeGenPasses.add(new DataLayoutPass());
 
   SmallString<128> Filename;
@@ -760,6 +809,8 @@ static ld_plugin_status allSymbolsReadHook(raw_fd_ostream *ApiFile) {
     return LDPS_OK;
 
   LLVMContext Context;
+  Context.setDiagnosticHandler(diagnosticHandler);
+
   std::unique_ptr<Module> Combined(new Module("ld-temp.o", Context));
   Linker L(Combined.get());
 
@@ -768,8 +819,11 @@ static ld_plugin_status allSymbolsReadHook(raw_fd_ostream *ApiFile) {
   StringSet<> Internalize;
   StringSet<> Maybe;
   for (claimed_file &F : Modules) {
+    ld_plugin_input_file File;
+    if (get_input_file(F.handle, &File) != LDPS_OK)
+      message(LDPL_FATAL, "Failed to get file information");
     std::unique_ptr<Module> M =
-        getModuleForFile(Context, F, ApiFile, Internalize, Maybe);
+        getModuleForFile(Context, F, File, ApiFile, Internalize, Maybe);
     if (!options::triple.empty())
       M->setTargetTriple(options::triple.c_str());
     else if (M->getTargetTriple().empty()) {
@@ -778,6 +832,8 @@ static ld_plugin_status allSymbolsReadHook(raw_fd_ostream *ApiFile) {
 
     if (L.linkInModule(M.get()))
       message(LDPL_FATAL, "Failed to link module");
+    if (release_input_file(F.handle) != LDPS_OK)
+      message(LDPL_FATAL, "Failed to release file information");
   }
 
   for (const auto &Name : Internalize) {
@@ -795,14 +851,17 @@ static ld_plugin_status allSymbolsReadHook(raw_fd_ostream *ApiFile) {
       internalize(*GV);
   }
 
-  if (options::generate_bc_file != options::BC_NO) {
+  if (options::TheOutputType == options::OT_DISABLE)
+    return LDPS_OK;
+
+  if (options::TheOutputType != options::OT_NORMAL) {
     std::string path;
-    if (options::generate_bc_file == options::BC_ONLY)
+    if (options::TheOutputType == options::OT_BC_ONLY)
       path = output_name;
     else
       path = output_name + ".bc";
     saveBCFile(path, *L.getModule());
-    if (options::generate_bc_file == options::BC_ONLY)
+    if (options::TheOutputType == options::OT_BC_ONLY)
       return LDPS_OK;
   }
 
@@ -828,8 +887,16 @@ static ld_plugin_status all_symbols_read_hook(void) {
     Ret = allSymbolsReadHook(&ApiFile);
   }
 
-  if (options::generate_bc_file == options::BC_ONLY)
+  llvm_shutdown();
+
+  if (options::TheOutputType == options::OT_BC_ONLY ||
+      options::TheOutputType == options::OT_DISABLE) {
+    if (options::TheOutputType == options::OT_DISABLE)
+      // Remove the output file here since ld.bfd creates the output file
+      // early.
+      sys::fs::remove(output_name);
     exit(0);
+  }
 
   return Ret;
 }
diff --git a/tools/llc/CMakeLists.txt b/tools/llc/CMakeLists.txt
index 393d64c..484ff40 100644
--- a/tools/llc/CMakeLists.txt
+++ b/tools/llc/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  Analysis
   AsmPrinter
   CodeGen
   Core
@@ -8,7 +9,6 @@ set(LLVM_LINK_COMPONENTS
   ScalarOpts
   SelectionDAG
   Support
-  Target
   )
 
 # Support plugins.
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index fe4d9ac..efa1422 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -15,17 +15,18 @@
 
 
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/CodeGen/LinkAllAsmWriterComponents.h"
 #include "llvm/CodeGen/LinkAllCodegenComponents.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Pass.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
@@ -39,7 +40,6 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <memory>
@@ -95,9 +95,9 @@ static cl::opt<bool> AsmVerbose("asm-verbose",
 
 static int compileModule(char **, LLVMContext &);
 
-static tool_output_file *GetOutputStream(const char *TargetName,
-                                         Triple::OSType OS,
-                                         const char *ProgName) {
+static std::unique_ptr<tool_output_file>
+GetOutputStream(const char *TargetName, Triple::OSType OS,
+                const char *ProgName) {
   // If we don't yet have an output filename, make one.
   if (OutputFilename.empty()) {
     if (InputFilename == "-")
@@ -151,10 +151,10 @@ static tool_output_file *GetOutputStream(const char *TargetName,
   sys::fs::OpenFlags OpenFlags = sys::fs::F_None;
   if (!Binary)
     OpenFlags |= sys::fs::F_Text;
-  tool_output_file *FDOut = new tool_output_file(OutputFilename, EC, OpenFlags);
+  auto FDOut = llvm::make_unique<tool_output_file>(OutputFilename, EC,
+                                                   OpenFlags);
   if (EC) {
     errs() << EC.message() << '\n';
-    delete FDOut;
     return nullptr;
   }
 
@@ -205,7 +205,6 @@ static int compileModule(char **argv, LLVMContext &Context) {
   // Load the module to be compiled...
   SMDiagnostic Err;
   std::unique_ptr<Module> M;
-  Module *mod = nullptr;
   Triple TheTriple;
 
   bool SkipModule = MCPU == "help" ||
@@ -220,16 +219,15 @@ static int compileModule(char **argv, LLVMContext &Context) {
   // If user just wants to list available options, skip module loading
   if (!SkipModule) {
     M = parseIRFile(InputFilename, Err, Context);
-    mod = M.get();
-    if (mod == nullptr) {
+    if (!M) {
       Err.print(argv[0], errs());
       return 1;
     }
 
     // If we are supposed to override the target triple, do so now.
     if (!TargetTriple.empty())
-      mod->setTargetTriple(Triple::normalize(TargetTriple));
-    TheTriple = Triple(mod->getTargetTriple());
+      M->setTargetTriple(Triple::normalize(TargetTriple));
+    TheTriple = Triple(M->getTargetTriple());
   } else {
     TheTriple = Triple(Triple::normalize(TargetTriple));
   }
@@ -273,10 +271,10 @@ static int compileModule(char **argv, LLVMContext &Context) {
   Options.MCOptions.MCUseDwarfDirectory = EnableDwarfDirectory;
   Options.MCOptions.AsmVerbose = AsmVerbose;
 
-  std::unique_ptr<TargetMachine> target(
+  std::unique_ptr<TargetMachine> Target(
       TheTarget->createTargetMachine(TheTriple.getTriple(), MCPU, FeaturesStr,
                                      Options, RelocModel, CMModel, OLvl));
-  assert(target.get() && "Could not allocate target machine!");
+  assert(Target && "Could not allocate target machine!");
 
   // If we don't have a module then just exit now. We do this down
   // here since the CPU/Feature help is underneath the target machine
@@ -284,29 +282,30 @@ static int compileModule(char **argv, LLVMContext &Context) {
   if (SkipModule)
     return 0;
 
-  assert(mod && "Should have exited if we didn't have a module!");
-  TargetMachine &Target = *target.get();
+  assert(M && "Should have exited if we didn't have a module!");
 
   if (GenerateSoftFloatCalls)
     FloatABIForCalls = FloatABI::Soft;
 
   // Figure out where we are going to send the output.
-  std::unique_ptr<tool_output_file> Out(
-      GetOutputStream(TheTarget->getName(), TheTriple.getOS(), argv[0]));
+  std::unique_ptr<tool_output_file> Out =
+      GetOutputStream(TheTarget->getName(), TheTriple.getOS(), argv[0]);
   if (!Out) return 1;
 
   // Build up all of the passes that we want to do to the module.
-  PassManager PM;
+  legacy::PassManager PM;
 
   // Add an appropriate TargetLibraryInfo pass for the module's triple.
-  TargetLibraryInfo *TLI = new TargetLibraryInfo(TheTriple);
+  TargetLibraryInfoImpl TLII(Triple(M->getTargetTriple()));
+
+  // The -disable-simplify-libcalls flag actually disables all builtin optzns.
   if (DisableSimplifyLibCalls)
-    TLI->disableAllFunctions();
-  PM.add(TLI);
+    TLII.disableAllFunctions();
+  PM.add(new TargetLibraryInfoWrapperPass(TLII));
 
   // Add the target data from the target machine, if it exists, or the module.
-  if (const DataLayout *DL = Target.getSubtargetImpl()->getDataLayout())
-    mod->setDataLayout(DL);
+  if (const DataLayout *DL = Target->getDataLayout())
+    M->setDataLayout(DL);
   PM.add(new DataLayoutPass());
 
   if (RelaxAll.getNumOccurrences() > 0 &&
@@ -338,8 +337,8 @@ static int compileModule(char **argv, LLVMContext &Context) {
     }
 
     // Ask the target to add backend passes as necessary.
-    if (Target.addPassesToEmitFile(PM, FOS, FileType, NoVerify,
-                                   StartAfterID, StopAfterID)) {
+    if (Target->addPassesToEmitFile(PM, FOS, FileType, NoVerify,
+                                    StartAfterID, StopAfterID)) {
       errs() << argv[0] << ": target does not support generation of this"
              << " file type!\n";
       return 1;
@@ -348,7 +347,7 @@ static int compileModule(char **argv, LLVMContext &Context) {
     // Before executing passes, print the final values of the LLVM options.
     cl::PrintOptionValues();
 
-    PM.run(*mod);
+    PM.run(*M);
   }
 
   // Declare success.
diff --git a/tools/lli/Android.mk b/tools/lli/Android.mk
index 1b09102..d771e56 100644
--- a/tools/lli/Android.mk
+++ b/tools/lli/Android.mk
@@ -46,7 +46,6 @@ lli_STATIC_LIBRARIES := \
   libLLVMSelectionDAG \
   libLLVMCodeGen \
   libLLVMInstrumentation \
-  libLLVMExecutionEngine \
   libLLVMLinker \
   libLLVMInterpreter \
   libLLVMScalarOpts \
@@ -55,6 +54,8 @@ lli_STATIC_LIBRARIES := \
   libLLVMTarget \
   libLLVMMC \
   libLLVMMCJIT \
+  libLLVMOrcJIT \
+  libLLVMExecutionEngine \
   libLLVMRuntimeDyld \
   libLLVMMCParser \
   libLLVMObject \
diff --git a/tools/lli/CMakeLists.txt b/tools/lli/CMakeLists.txt
index 3610d76..463c853 100644
--- a/tools/lli/CMakeLists.txt
+++ b/tools/lli/CMakeLists.txt
@@ -10,6 +10,8 @@ set(LLVM_LINK_COMPONENTS
   MC
   MCJIT
   Object
+  OrcJIT
+  RuntimeDyld
   SelectionDAG
   Support
   native
@@ -25,7 +27,7 @@ endif( LLVM_USE_OPROFILE )
 if( LLVM_USE_INTEL_JITEVENTS )
   set(LLVM_LINK_COMPONENTS
     ${LLVM_LINK_COMPONENTS}
-    DebugInfo
+    DebugInfoDWARF
     IntelJITEvents
     Object
     )
diff --git a/tools/lli/ChildTarget/ChildTarget.cpp b/tools/lli/ChildTarget/ChildTarget.cpp
index 0d71b17..6c537d4 100644
--- a/tools/lli/ChildTarget/ChildTarget.cpp
+++ b/tools/lli/ChildTarget/ChildTarget.cpp
@@ -97,15 +97,15 @@ void LLIChildTarget::handleMessage(LLIMessageType messageType) {
 // Incoming message handlers
 void LLIChildTarget::handleAllocateSpace() {
   // Read and verify the message data size.
-  uint32_t DataSize;
+  uint32_t DataSize = 0;
   int rc = ReadBytes(&DataSize, 4);
   (void)rc;
   assert(rc == 4);
   assert(DataSize == 8);
 
   // Read the message arguments.
-  uint32_t Alignment;
-  uint32_t AllocSize;
+  uint32_t Alignment = 0;
+  uint32_t AllocSize = 0;
   rc = ReadBytes(&Alignment, 4);
   assert(rc == 4);
   rc = ReadBytes(&AllocSize, 4);
@@ -121,13 +121,13 @@ void LLIChildTarget::handleAllocateSpace() {
 
 void LLIChildTarget::handleLoadSection(bool IsCode) {
   // Read the message data size.
-  uint32_t DataSize;
+  uint32_t DataSize = 0;
   int rc = ReadBytes(&DataSize, 4);
   (void)rc;
   assert(rc == 4);
 
   // Read the target load address.
-  uint64_t Addr;
+  uint64_t Addr = 0;
   rc = ReadBytes(&Addr, 8);
   assert(rc == 8);
   size_t BufferSize = DataSize - 8;
@@ -150,14 +150,14 @@ void LLIChildTarget::handleLoadSection(bool IsCode) {
 
 void LLIChildTarget::handleExecute() {
   // Read the message data size.
-  uint32_t DataSize;
+  uint32_t DataSize = 0;
   int rc = ReadBytes(&DataSize, 4);
   (void)rc;
   assert(rc == 4);
   assert(DataSize == 8);
 
   // Read the target address.
-  uint64_t Addr;
+  uint64_t Addr = 0;
   rc = ReadBytes(&Addr, 8);
   assert(rc == 8);
 
diff --git a/tools/lli/Makefile b/tools/lli/Makefile
index 94d6f06..70d8c80 100644
--- a/tools/lli/Makefile
+++ b/tools/lli/Makefile
@@ -14,12 +14,12 @@ PARALLEL_DIRS := ChildTarget
 
 include $(LEVEL)/Makefile.config
 
-LINK_COMPONENTS := mcjit instrumentation interpreter nativecodegen bitreader asmparser irreader selectiondag native
+LINK_COMPONENTS := mcjit orcjit instrumentation interpreter nativecodegen bitreader asmparser irreader selectiondag native
 
 # If Intel JIT Events support is confiured, link against the LLVM Intel JIT
 # Events interface library
 ifeq ($(USE_INTEL_JITEVENTS), 1)
-  LINK_COMPONENTS += debuginfo inteljitevents object
+  LINK_COMPONENTS += debuginfodwarf inteljitevents object
 endif
 
 # If oprofile support is confiured, link against the LLVM oprofile interface
diff --git a/tools/lli/RemoteMemoryManager.cpp b/tools/lli/RemoteMemoryManager.cpp
index 5a135ea..47da8fb 100644
--- a/tools/lli/RemoteMemoryManager.cpp
+++ b/tools/lli/RemoteMemoryManager.cpp
@@ -14,7 +14,6 @@
 
 #include "RemoteMemoryManager.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/ObjectImage.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Format.h"
 
@@ -78,7 +77,7 @@ sys::MemoryBlock RemoteMemoryManager::allocateSection(uintptr_t Size) {
 }
 
 void RemoteMemoryManager::notifyObjectLoaded(ExecutionEngine *EE,
-                                                const ObjectImage *Obj) {
+                                             const object::ObjectFile &Obj) {
   // The client should have called setRemoteTarget() before triggering any
   // code generation.
   assert(Target);
diff --git a/tools/lli/RemoteMemoryManager.h b/tools/lli/RemoteMemoryManager.h
index 0bdb4e2..895bcda 100644
--- a/tools/lli/RemoteMemoryManager.h
+++ b/tools/lli/RemoteMemoryManager.h
@@ -80,7 +80,8 @@ public:
   // symbols from Modules it contains.
   uint64_t getSymbolAddress(const std::string &Name) override { return 0; }
 
-  void notifyObjectLoaded(ExecutionEngine *EE, const ObjectImage *Obj) override;
+  void notifyObjectLoaded(ExecutionEngine *EE,
+                          const object::ObjectFile &Obj) override;
 
   bool finalizeMemory(std::string *ErrMsg) override;
 
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index 276740b..9c2b781 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ExecutionEngine/JITEventListener.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/ExecutionEngine/ObjectCache.h"
+#include "llvm/ExecutionEngine/OrcMCJITReplacement.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
@@ -74,9 +75,12 @@ namespace {
                                  cl::desc("Force interpretation: disable JIT"),
                                  cl::init(false));
 
-  cl::opt<bool> DebugIR(
-    "debug-ir", cl::desc("Generate debug information to allow debugging IR."),
-    cl::init(false));
+  cl::opt<bool> UseOrcMCJITReplacement("use-orcmcjit",
+                                       cl::desc("Use the experimental "
+                                                "OrcMCJITReplacement as a "
+                                                "drop-in replacement for "
+                                                "MCJIT."),
+                                       cl::init(false));
 
   // The MCJIT supports building for a target address space separate from
   // the JIT compilation process. Use a forked process and a copying
@@ -414,11 +418,6 @@ int main(int argc, char **argv, char * const *envp) {
     }
   }
 
-  if (DebugIR) {
-    ModulePass *DebugIRPass = createDebugIRPass();
-    DebugIRPass->runOnModule(*Mod);
-  }
-
   std::string ErrorMsg;
   EngineBuilder builder(std::move(Owner));
   builder.setMArch(MArch);
@@ -430,6 +429,7 @@ int main(int argc, char **argv, char * const *envp) {
   builder.setEngineKind(ForceInterpreter
                         ? EngineKind::Interpreter
                         : EngineKind::JIT);
+  builder.setUseOrcMCJITReplacement(UseOrcMCJITReplacement);
 
   // If we are supposed to override the target triple, do so now.
   if (!TargetTriple.empty())
@@ -442,7 +442,11 @@ int main(int argc, char **argv, char * const *envp) {
       RTDyldMM = new RemoteMemoryManager();
     else
       RTDyldMM = new SectionMemoryManager();
-    builder.setMCJITMemoryManager(RTDyldMM);
+
+    // Deliberately construct a temp std::unique_ptr to pass in. Do not null out
+    // RTDyldMM: We still use it below, even though we don't own it.
+    builder.setMCJITMemoryManager(
+      std::unique_ptr<RTDyldMemoryManager>(RTDyldMM));
   } else if (RemoteMCJIT) {
     errs() << "error: Remote process execution does not work with the "
               "interpreter.\n";
@@ -561,7 +565,7 @@ int main(int argc, char **argv, char * const *envp) {
   // If the user specifically requested an argv[0] to pass into the program,
   // do it now.
   if (!FakeArgv0.empty()) {
-    InputFile = FakeArgv0;
+    InputFile = static_cast<std::string>(FakeArgv0);
   } else {
     // Otherwise, if there is a .bc suffix on the executable strip it off, it
     // might confuse the program.
@@ -593,7 +597,7 @@ int main(int argc, char **argv, char * const *envp) {
     // function later on to make an explicit call, so get the function now.
     Constant *Exit = Mod->getOrInsertFunction("exit", Type::getVoidTy(Context),
                                                       Type::getInt32Ty(Context),
-                                                      NULL);
+                                                      nullptr);
 
     // Run static constructors.
     if (!ForceInterpreter) {
diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt
index 5193def..3782c87 100644
--- a/tools/llvm-ar/CMakeLists.txt
+++ b/tools/llvm-ar/CMakeLists.txt
@@ -9,9 +9,6 @@ add_llvm_tool(llvm-ar
   llvm-ar.cpp
   )
 
-# FIXME: this is duplicated from the clang CMakeLists.txt
-# FIXME: bin/llvm-ranlib is not a valid build target with this setup (pr17024)
-
 if(UNIX)
   set(LLVM_LINK_OR_COPY create_symlink)
   set(llvm_ar_binary "llvm-ar${CMAKE_EXECUTABLE_SUFFIX}")
@@ -21,10 +18,12 @@ else()
 endif()
 
 set(llvm_ranlib "${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-ranlib${CMAKE_EXECUTABLE_SUFFIX}")
-add_custom_command(TARGET llvm-ar POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${llvm_ar_binary}" "${llvm_ranlib}")
 
-set_property(DIRECTORY APPEND
-  PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${llvm_ranlib})
+add_custom_command(OUTPUT ${llvm_ranlib}
+                   COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${llvm_ar_binary}" "${llvm_ranlib}"
+                   DEPENDS llvm-ar)
+
+add_custom_target(llvm-ranlib ALL DEPENDS ${llvm_ranlib})
+set_target_properties(llvm-ranlib PROPERTIES FOLDER Tools)
 
-# TODO: Support check-local.
+install(SCRIPT install_symlink.cmake -DCMAKE_INSTALL_PREFIX=\"${CMAKE_INSTALL_PREFIX}\")
diff --git a/tools/llvm-ar/install_symlink.cmake b/tools/llvm-ar/install_symlink.cmake
new file mode 100644
index 0000000..e313897
--- /dev/null
+++ b/tools/llvm-ar/install_symlink.cmake
@@ -0,0 +1,25 @@
+# We need to execute this script at installation time because the
+# DESTDIR environment variable may be unset at configuration time.
+# See PR8397.
+
+if(UNIX)
+  set(LINK_OR_COPY create_symlink)
+  set(DESTDIR $ENV{DESTDIR})
+else()
+  set(LINK_OR_COPY copy)
+endif()
+
+# CMAKE_EXECUTABLE_SUFFIX is undefined on cmake scripts. See PR9286.
+if( WIN32 )
+  set(EXECUTABLE_SUFFIX ".exe")
+else()
+  set(EXECUTABLE_SUFFIX "")
+endif()
+
+set(bindir "${DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/")
+
+message("Creating llvm-ranlib")
+
+execute_process(
+  COMMAND "${CMAKE_COMMAND}" -E ${LINK_OR_COPY} "llvm-ar${EXECUTABLE_SUFFIX}" "llvm-ranlib${EXECUTABLE_SUFFIX}"
+  WORKING_DIRECTORY "${bindir}")
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index 8ee66f6..2f1eb3b 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -92,7 +92,6 @@ static cl::extrahelp MoreHelp(
   "  [a] - put file(s) after [relpos]\n"
   "  [b] - put file(s) before [relpos] (same as [i])\n"
   "  [i] - put file(s) before [relpos] (same as [b])\n"
-  "  [N] - use instance [count] of name\n"
   "  [o] - preserve original dates\n"
   "  [s] - create an archive index (cf. ranlib)\n"
   "  [S] - do not build a symbol table\n"
@@ -413,8 +412,6 @@ class NewArchiveIterator {
   object::Archive::child_iterator OldI;
 
   StringRef NewFilename;
-  mutable int NewFD;
-  mutable sys::fs::file_status NewStatus;
 
 public:
   NewArchiveIterator(object::Archive::child_iterator I, StringRef Name);
@@ -426,7 +423,7 @@ public:
   object::Archive::child_iterator getOld() const;
 
   StringRef getNew() const;
-  int getFD() const;
+  int getFD(sys::fs::file_status &NewStatus) const;
   const sys::fs::file_status &getStatus() const;
 };
 }
@@ -438,7 +435,7 @@ NewArchiveIterator::NewArchiveIterator(object::Archive::child_iterator I,
     : IsNewMember(false), Name(Name), OldI(I) {}
 
 NewArchiveIterator::NewArchiveIterator(StringRef NewFilename, StringRef Name)
-    : IsNewMember(true), Name(Name), NewFilename(NewFilename), NewFD(-1) {}
+    : IsNewMember(true), Name(Name), NewFilename(NewFilename) {}
 
 StringRef NewArchiveIterator::getName() const { return Name; }
 
@@ -454,10 +451,9 @@ StringRef NewArchiveIterator::getNew() const {
   return NewFilename;
 }
 
-int NewArchiveIterator::getFD() const {
+int NewArchiveIterator::getFD(sys::fs::file_status &NewStatus) const {
   assert(IsNewMember);
-  if (NewFD != -1)
-    return NewFD;
+  int NewFD;
   failIfError(sys::fs::openFileForRead(NewFilename, NewFD), NewFilename);
   assert(NewFD != -1);
 
@@ -472,12 +468,6 @@ int NewArchiveIterator::getFD() const {
   return NewFD;
 }
 
-const sys::fs::file_status &NewArchiveIterator::getStatus() const {
-  assert(IsNewMember);
-  assert(NewFD != -1 && "Must call getFD first");
-  return NewStatus;
-}
-
 template <typename T>
 void addMember(std::vector<NewArchiveIterator> &Members, T I, StringRef Name,
                int Pos = -1) {
@@ -694,10 +684,11 @@ static void writeStringTable(raw_fd_ostream &Out,
   Out.seek(Pos);
 }
 
-static void
-writeSymbolTable(raw_fd_ostream &Out, ArrayRef<NewArchiveIterator> Members,
-                 ArrayRef<MemoryBufferRef> Buffers,
-                 std::vector<std::pair<unsigned, unsigned>> &MemberOffsetRefs) {
+// Returns the offset of the first reference to a member offset.
+static unsigned writeSymbolTable(raw_fd_ostream &Out,
+                                 ArrayRef<NewArchiveIterator> Members,
+                                 ArrayRef<MemoryBufferRef> Buffers,
+                                 std::vector<unsigned> &MemberOffsetRefs) {
   unsigned StartOffset = 0;
   unsigned MemberNum = 0;
   std::string NameBuf;
@@ -732,14 +723,14 @@ writeSymbolTable(raw_fd_ostream &Out, ArrayRef<NewArchiveIterator> Members,
       failIfError(S.printName(NameOS));
       NameOS << '\0';
       ++NumSyms;
-      MemberOffsetRefs.push_back(std::make_pair(Out.tell(), MemberNum));
+      MemberOffsetRefs.push_back(MemberNum);
       print32BE(Out, 0);
     }
   }
   Out << NameOS.str();
 
   if (StartOffset == 0)
-    return;
+    return 0;
 
   if (Out.tell() % 2)
     Out << '\0';
@@ -750,6 +741,7 @@ writeSymbolTable(raw_fd_ostream &Out, ArrayRef<NewArchiveIterator> Members,
   Out.seek(StartOffset);
   print32BE(Out, NumSyms);
   Out.seek(Pos);
+  return StartOffset + 4;
 }
 
 static void
@@ -764,10 +756,11 @@ performWriteOperation(ArchiveOperation Operation, object::Archive *OldArchive,
   raw_fd_ostream &Out = Output.os();
   Out << "!<arch>\n";
 
-  std::vector<std::pair<unsigned, unsigned> > MemberOffsetRefs;
+  std::vector<unsigned> MemberOffsetRefs;
 
   std::vector<std::unique_ptr<MemoryBuffer>> Buffers;
   std::vector<MemoryBufferRef> Members;
+  std::vector<sys::fs::file_status> NewMemberStatus;
 
   for (unsigned I = 0, N = NewMembers.size(); I < N; ++I) {
     NewArchiveIterator &Member = NewMembers[I];
@@ -775,11 +768,14 @@ performWriteOperation(ArchiveOperation Operation, object::Archive *OldArchive,
 
     if (Member.isNewMember()) {
       StringRef Filename = Member.getNew();
-      int FD = Member.getFD();
-      const sys::fs::file_status &Status = Member.getStatus();
+      NewMemberStatus.resize(NewMemberStatus.size() + 1);
+      sys::fs::file_status &Status = NewMemberStatus.back();
+      int FD = Member.getFD(Status);
       ErrorOr<std::unique_ptr<MemoryBuffer>> MemberBufferOrErr =
           MemoryBuffer::getOpenFile(FD, Filename, Status.getSize(), false);
       failIfError(MemberBufferOrErr.getError(), Filename);
+      if (close(FD) != 0)
+        fail("Could not close file");
       Buffers.push_back(std::move(MemberBufferOrErr.get()));
       MemberRef = Buffers.back()->getMemBufferRef();
     } else {
@@ -792,35 +788,31 @@ performWriteOperation(ArchiveOperation Operation, object::Archive *OldArchive,
     Members.push_back(MemberRef);
   }
 
+  unsigned MemberReferenceOffset = 0;
   if (Symtab) {
-    writeSymbolTable(Out, NewMembers, Members, MemberOffsetRefs);
+    MemberReferenceOffset =
+        writeSymbolTable(Out, NewMembers, Members, MemberOffsetRefs);
   }
 
   std::vector<unsigned> StringMapIndexes;
   writeStringTable(Out, NewMembers, StringMapIndexes);
 
-  std::vector<std::pair<unsigned, unsigned> >::iterator MemberRefsI =
-      MemberOffsetRefs.begin();
-
   unsigned MemberNum = 0;
   unsigned LongNameMemberNum = 0;
+  unsigned NewMemberNum = 0;
+  std::vector<unsigned> MemberOffset;
   for (std::vector<NewArchiveIterator>::iterator I = NewMembers.begin(),
                                                  E = NewMembers.end();
        I != E; ++I, ++MemberNum) {
 
     unsigned Pos = Out.tell();
-    while (MemberRefsI != MemberOffsetRefs.end() &&
-           MemberRefsI->second == MemberNum) {
-      Out.seek(MemberRefsI->first);
-      print32BE(Out, Pos);
-      ++MemberRefsI;
-    }
-    Out.seek(Pos);
+    MemberOffset.push_back(Pos);
 
     MemoryBufferRef File = Members[MemberNum];
     if (I->isNewMember()) {
       StringRef FileName = I->getNew();
-      const sys::fs::file_status &Status = I->getStatus();
+      const sys::fs::file_status &Status = NewMemberStatus[NewMemberNum];
+      NewMemberNum++;
 
       StringRef Name = sys::path::filename(FileName);
       if (Name.size() < 16)
@@ -853,6 +845,12 @@ performWriteOperation(ArchiveOperation Operation, object::Archive *OldArchive,
       Out << '\n';
   }
 
+  if (MemberReferenceOffset) {
+    Out.seek(MemberReferenceOffset);
+    for (unsigned MemberNum : MemberOffsetRefs)
+      print32BE(Out, MemberOffset[MemberNum]);
+  }
+
   Output.keep();
   Out.close();
   sys::fs::rename(TemporaryOutput, ArchiveName);
diff --git a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
index f95b272..8aa5697 100644
--- a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
+++ b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
@@ -28,10 +28,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Bitcode/BitstreamReader.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/Bitcode/LLVMBitCodes.h"
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/ADT/Optional.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/ManagedStatic.h"
@@ -222,8 +222,10 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
 
     case bitc::FUNC_CODE_INST_BINOP:        return "INST_BINOP";
     case bitc::FUNC_CODE_INST_CAST:         return "INST_CAST";
-    case bitc::FUNC_CODE_INST_GEP:          return "INST_GEP";
-    case bitc::FUNC_CODE_INST_INBOUNDS_GEP: return "INST_INBOUNDS_GEP";
+    case bitc::FUNC_CODE_INST_GEP_OLD:
+      return "INST_GEP_OLD";
+    case bitc::FUNC_CODE_INST_INBOUNDS_GEP_OLD:
+      return "INST_INBOUNDS_GEP_OLD";
     case bitc::FUNC_CODE_INST_SELECT:       return "INST_SELECT";
     case bitc::FUNC_CODE_INST_EXTRACTELT:   return "INST_EXTRACTELT";
     case bitc::FUNC_CODE_INST_INSERTELT:    return "INST_INSERTELT";
@@ -248,6 +250,8 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
     case bitc::FUNC_CODE_DEBUG_LOC_AGAIN:   return "DEBUG_LOC_AGAIN";
     case bitc::FUNC_CODE_INST_CALL:         return "INST_CALL";
     case bitc::FUNC_CODE_DEBUG_LOC:         return "DEBUG_LOC";
+    case bitc::FUNC_CODE_INST_GEP:
+      return "INST_GEP";
     }
   case bitc::VALUE_SYMTAB_BLOCK_ID:
     switch (CodeID) {
@@ -267,7 +271,9 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
     case bitc::METADATA_NAME:        return "METADATA_NAME";
     case bitc::METADATA_KIND:        return "METADATA_KIND";
     case bitc::METADATA_NODE:        return "METADATA_NODE";
-    case bitc::METADATA_FN_NODE:     return "METADATA_FN_NODE";
+    case bitc::METADATA_VALUE:       return "METADATA_VALUE";
+    case bitc::METADATA_OLD_NODE:    return "METADATA_OLD_NODE";
+    case bitc::METADATA_OLD_FN_NODE: return "METADATA_OLD_FN_NODE";
     case bitc::METADATA_NAMED_NODE:  return "METADATA_NAMED_NODE";
     }
   case bitc::USELIST_BLOCK_ID:
diff --git a/tools/llvm-c-test/Android.mk b/tools/llvm-c-test/Android.mk
index 3ab8830..6b978c0 100644
--- a/tools/llvm-c-test/Android.mk
+++ b/tools/llvm-c-test/Android.mk
@@ -13,6 +13,7 @@ llvm_c_test_SRC_FILES := \
   helpers.c              \
   include-all.c          \
   main.c                 \
+  metadata.c             \
   module.c               \
   object.c               \
   targets.c              \
diff --git a/tools/llvm-c-test/CMakeLists.txt b/tools/llvm-c-test/CMakeLists.txt
index 989678b..f22dffb 100644
--- a/tools/llvm-c-test/CMakeLists.txt
+++ b/tools/llvm-c-test/CMakeLists.txt
@@ -7,7 +7,26 @@ set(LLVM_LINK_COMPONENTS
   Target
   )
 
-if(TARGET LLVM)
+# We should only have llvm-c-test use libLLVM if libLLVM is built with the
+# default list of components. Using libLLVM with custom components can result in
+# build failures.
+
+set (USE_LLVM_DYLIB FALSE)
+
+if (TARGET LLVM)
+  set (USE_LLVM_DYLIB TRUE)
+  if (DEFINED LLVM_DYLIB_COMPONENTS)
+    foreach(c in ${LLVM_LINK_COMPONENTS})
+      list(FIND LLVM_DYLIB_COMPONENTS ${c} C_IDX)
+      if (C_IDX EQUAL -1)
+        set(USE_LLVM_DYLIB FALSE)
+        break()
+      endif()
+    endforeach()
+  endif()
+endif()
+
+if(USE_LLVM_DYLIB)
   set(LLVM_LINK_COMPONENTS)
 endif()
 
@@ -22,11 +41,11 @@ add_llvm_tool(llvm-c-test
   include-all.c
   main.c
   module.c
+  metadata.c
   object.c
   targets.c
   )
 
-# Use libLLVM.so if it is available.
-if(TARGET LLVM)
+if(USE_LLVM_DYLIB)
   target_link_libraries(llvm-c-test LLVM)
 endif()
diff --git a/tools/llvm-c-test/llvm-c-test.h b/tools/llvm-c-test/llvm-c-test.h
index 0a25aa6..1b4976a 100644
--- a/tools/llvm-c-test/llvm-c-test.h
+++ b/tools/llvm-c-test/llvm-c-test.h
@@ -27,6 +27,10 @@ int calc(void);
 // disassemble.c
 int disassemble(void);
 
+// metadata.c
+int add_named_metadata_operand(void);
+int set_metadata(void);
+
 // object.c
 int object_list_sections(void);
 int object_list_symbols(void);
diff --git a/tools/llvm-c-test/main.c b/tools/llvm-c-test/main.c
index 72f8b04..59cc749 100644
--- a/tools/llvm-c-test/main.c
+++ b/tools/llvm-c-test/main.c
@@ -65,6 +65,10 @@ int main(int argc, char **argv) {
     return disassemble();
   } else if (argc == 2 && !strcmp(argv[1], "--calc")) {
     return calc();
+  } else if (argc == 2 && !strcmp(argv[1], "--add-named-metadata-operand")) {
+    return add_named_metadata_operand();
+  } else if (argc == 2 && !strcmp(argv[1], "--set-metadata")) {
+    return set_metadata();
   } else {
     print_usage();
   }
diff --git a/tools/llvm-c-test/metadata.c b/tools/llvm-c-test/metadata.c
new file mode 100644
index 0000000..b64a696
--- /dev/null
+++ b/tools/llvm-c-test/metadata.c
@@ -0,0 +1,43 @@
+/*===-- object.c - tool for testing libLLVM and llvm-c API ----------------===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file implements the --add-named-metadata-operand and --set-metadata   *|
+|* commands in llvm-c-test.                                                   *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c-test.h"
+#include "llvm-c/Core.h"
+
+int add_named_metadata_operand(void) {
+  LLVMModuleRef m = LLVMModuleCreateWithName("Mod");
+  LLVMValueRef values[] = { LLVMConstInt(LLVMInt32Type(), 0, 0) };
+
+  // This used to trigger an assertion
+  LLVMAddNamedMetadataOperand(m, "name", LLVMMDNode(values, 1));
+
+  LLVMDisposeModule(m);
+
+  return 0;
+}
+
+int set_metadata(void) {
+  LLVMBuilderRef b = LLVMCreateBuilder();
+  LLVMValueRef values[] = { LLVMConstInt(LLVMInt32Type(), 0, 0) };
+
+  // This used to trigger an assertion
+  LLVMSetMetadata(
+      LLVMBuildRetVoid(b),
+      LLVMGetMDKindID("kind", 4),
+      LLVMMDNode(values, 1));
+
+  LLVMDisposeBuilder(b);
+
+  return 0;
+}
diff --git a/tools/llvm-config/BuildVariables.inc.in b/tools/llvm-config/BuildVariables.inc.in
index 59b5835..f741663 100644
--- a/tools/llvm-config/BuildVariables.inc.in
+++ b/tools/llvm-config/BuildVariables.inc.in
@@ -23,5 +23,6 @@
 #define LLVM_LDFLAGS ""
 #define LLVM_CXXFLAGS " -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wmissing-field-initializers -pedantic -Wno-long-long -Wcovered-switch-default -Wnon-virtual-dtor -std=c++11 -fcolor-diagnostics -ffunction-sections -fdata-sections   -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS"
 #define LLVM_BUILDMODE ""
+#define LLVM_LIBDIR_SUFFIX ""
 #define LLVM_TARGETS_BUILT "X86 AArch64 ARM Mips"
 #define LLVM_SYSTEM_LIBS "-lrt -ldl -ltinfo -lpthread"
diff --git a/tools/llvm-config/Makefile b/tools/llvm-config/Makefile
index b78551e..1ff8b6f 100644
--- a/tools/llvm-config/Makefile
+++ b/tools/llvm-config/Makefile
@@ -59,6 +59,8 @@ $(ObjDir)/BuildVariables.inc: $(BUILDVARIABLES_SRCPATH) Makefile $(ObjDir)/.dir
 	  >> temp.sed
 	$(Verb) $(ECHO) 's/@LLVM_BUILDMODE@/$(subst /,\/,$(BuildMode))/' \
 	  >> temp.sed
+	$(Verb) $(ECHO) 's/@LLVM_LIBDIR_SUFFIX@//' \
+	  >> temp.sed
 	$(Verb) $(ECHO) 's/@LLVM_SYSTEM_LIBS@/$(subst /,\/,$(LLVM_SYSTEM_LIBS))/' \
 	  >> temp.sed
 	$(Verb) $(ECHO) 's/@LLVM_TARGETS_BUILT@/$(subst /,\/,$(TARGETS_TO_BUILD))/' \
diff --git a/tools/llvm-config/llvm-config.cpp b/tools/llvm-config/llvm-config.cpp
index ed1c8c3..224035a 100644
--- a/tools/llvm-config/llvm-config.cpp
+++ b/tools/llvm-config/llvm-config.cpp
@@ -243,16 +243,18 @@ int main(int argc, char **argv) {
     case MakefileStyle:
       ActivePrefix = ActiveObjRoot;
       ActiveBinDir = ActiveObjRoot + "/" + build_mode + "/bin";
-      ActiveLibDir = ActiveObjRoot + "/" + build_mode + "/lib";
+      ActiveLibDir =
+          ActiveObjRoot + "/" + build_mode + "/lib" + LLVM_LIBDIR_SUFFIX;
       break;
     case CMakeStyle:
       ActiveBinDir = ActiveObjRoot + "/bin";
-      ActiveLibDir = ActiveObjRoot + "/lib";
+      ActiveLibDir = ActiveObjRoot + "/lib" + LLVM_LIBDIR_SUFFIX;
       break;
     case CMakeBuildModeStyle:
       ActivePrefix = ActiveObjRoot;
       ActiveBinDir = ActiveObjRoot + "/bin/" + build_mode;
-      ActiveLibDir = ActiveObjRoot + "/lib/" + build_mode;
+      ActiveLibDir =
+          ActiveObjRoot + "/lib" + LLVM_LIBDIR_SUFFIX + "/" + build_mode;
       break;
     }
 
@@ -263,7 +265,7 @@ int main(int argc, char **argv) {
     ActivePrefix = CurrentExecPrefix;
     ActiveIncludeDir = ActivePrefix + "/include";
     ActiveBinDir = ActivePrefix + "/bin";
-    ActiveLibDir = ActivePrefix + "/lib";
+    ActiveLibDir = ActivePrefix + "/lib" + LLVM_LIBDIR_SUFFIX;
     ActiveIncludeOption = "-I" + ActiveIncludeDir;
   }
 
diff --git a/tools/llvm-cov/Android.mk b/tools/llvm-cov/Android.mk
index d76c940..798e601 100644
--- a/tools/llvm-cov/Android.mk
+++ b/tools/llvm-cov/Android.mk
@@ -11,7 +11,6 @@ llvm_cov_SRC_FILES := \
   CodeCoverage.cpp \
   CoverageFilters.cpp \
   CoverageReport.cpp \
-  CoverageSummary.cpp \
   CoverageSummaryInfo.cpp \
   gcov.cpp \
   llvm-cov.cpp \
diff --git a/tools/llvm-cov/CMakeLists.txt b/tools/llvm-cov/CMakeLists.txt
index b2d2b89..193218a 100644
--- a/tools/llvm-cov/CMakeLists.txt
+++ b/tools/llvm-cov/CMakeLists.txt
@@ -6,7 +6,6 @@ add_llvm_tool(llvm-cov
   CodeCoverage.cpp
   CoverageFilters.cpp
   CoverageReport.cpp
-  CoverageSummary.cpp
   CoverageSummaryInfo.cpp
   SourceCoverageView.cpp
   TestingSupport.cpp
diff --git a/tools/llvm-cov/CodeCoverage.cpp b/tools/llvm-cov/CodeCoverage.cpp
index 093525e..cf8ab33 100644
--- a/tools/llvm-cov/CodeCoverage.cpp
+++ b/tools/llvm-cov/CodeCoverage.cpp
@@ -14,24 +14,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "RenderingSupport.h"
-#include "CoverageViewOptions.h"
 #include "CoverageFilters.h"
-#include "SourceCoverageView.h"
-#include "CoverageSummary.h"
 #include "CoverageReport.h"
-#include "llvm/ADT/StringRef.h"
+#include "CoverageViewOptions.h"
+#include "SourceCoverageView.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ProfileData/CoverageMapping.h"
-#include "llvm/ProfileData/CoverageMappingReader.h"
+#include "llvm/ProfileData/InstrProfReader.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/Signals.h"
 #include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
 #include <functional>
 #include <system_error>
 
@@ -327,10 +324,11 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
 
     for (const auto &File : InputSourceFiles) {
       SmallString<128> Path(File);
-      if (std::error_code EC = sys::fs::make_absolute(Path)) {
-        errs() << "error: " << File << ": " << EC.message();
-        return 1;
-      }
+      if (!CompareFilenamesOnly)
+        if (std::error_code EC = sys::fs::make_absolute(Path)) {
+          errs() << "error: " << File << ": " << EC.message();
+          return 1;
+        }
       SourceFiles.push_back(Path.str());
     }
     return 0;
@@ -461,15 +459,11 @@ int CodeCoverageTool::report(int argc, const char **argv,
   if (!Coverage)
     return 1;
 
-  CoverageSummary Summarizer;
-  Summarizer.createSummaries(*Coverage);
-  CoverageReport Report(ViewOpts, Summarizer);
-  if (SourceFiles.empty() && Filters.empty()) {
+  CoverageReport Report(ViewOpts, std::move(Coverage));
+  if (SourceFiles.empty())
     Report.renderFileReports(llvm::outs());
-    return 0;
-  }
-
-  Report.renderFunctionReports(llvm::outs());
+  else
+    Report.renderFunctionReports(SourceFiles, llvm::outs());
   return 0;
 }
 
diff --git a/tools/llvm-cov/CoverageFilters.h b/tools/llvm-cov/CoverageFilters.h
index e543005..dc5dc98 100644
--- a/tools/llvm-cov/CoverageFilters.h
+++ b/tools/llvm-cov/CoverageFilters.h
@@ -15,8 +15,8 @@
 #define LLVM_COV_COVERAGEFILTERS_H
 
 #include "llvm/ProfileData/CoverageMapping.h"
-#include <vector>
 #include <memory>
+#include <vector>
 
 namespace llvm {
 
diff --git a/tools/llvm-cov/CoverageReport.cpp b/tools/llvm-cov/CoverageReport.cpp
index 7ac9355..497c2f8 100644
--- a/tools/llvm-cov/CoverageReport.cpp
+++ b/tools/llvm-cov/CoverageReport.cpp
@@ -12,10 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "CoverageReport.h"
-#include "CoverageSummary.h"
 #include "RenderingSupport.h"
-#include "llvm/Support/Format.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
 
 using namespace llvm;
 namespace {
@@ -156,14 +155,15 @@ void CoverageReport::render(const FunctionCoverageSummary &Function,
   OS << "\n";
 }
 
-void CoverageReport::renderFunctionReports(raw_ostream &OS) {
+void CoverageReport::renderFunctionReports(ArrayRef<std::string> Files,
+                                           raw_ostream &OS) {
   bool isFirst = true;
-  for (const auto &File : Summary.getFileSummaries()) {
+  for (StringRef Filename : Files) {
     if (isFirst)
       isFirst = false;
     else
       OS << "\n";
-    OS << "File '" << File.Name << "':\n";
+    OS << "File '" << Filename << "':\n";
     OS << column("Name", FunctionReportColumns[0])
        << column("Regions", FunctionReportColumns[1], Column::RightAlignment)
        << column("Miss", FunctionReportColumns[2], Column::RightAlignment)
@@ -174,13 +174,19 @@ void CoverageReport::renderFunctionReports(raw_ostream &OS) {
     OS << "\n";
     renderDivider(FunctionReportColumns, OS);
     OS << "\n";
-    for (const auto &Function : File.FunctionSummaries)
+    FunctionCoverageSummary Totals("TOTAL");
+    for (const auto &F : Coverage->getCoveredFunctions(Filename)) {
+      FunctionCoverageSummary Function = FunctionCoverageSummary::get(F);
+      ++Totals.ExecutionCount;
+      Totals.RegionCoverage += Function.RegionCoverage;
+      Totals.LineCoverage += Function.LineCoverage;
       render(Function, OS);
-    renderDivider(FunctionReportColumns, OS);
-    OS << "\n";
-    render(FunctionCoverageSummary("TOTAL", /*ExecutionCount=*/0,
-                                   File.RegionCoverage, File.LineCoverage),
-           OS);
+    }
+    if (Totals.ExecutionCount) {
+      renderDivider(FunctionReportColumns, OS);
+      OS << "\n";
+      render(Totals, OS);
+    }
   }
 }
 
@@ -194,9 +200,17 @@ void CoverageReport::renderFileReports(raw_ostream &OS) {
      << "\n";
   renderDivider(FileReportColumns, OS);
   OS << "\n";
-  for (const auto &File : Summary.getFileSummaries())
-    render(File, OS);
+  FileCoverageSummary Totals("TOTAL");
+  for (StringRef Filename : Coverage->getUniqueSourceFiles()) {
+    FileCoverageSummary Summary(Filename);
+    for (const auto &F : Coverage->getCoveredFunctions(Filename)) {
+      FunctionCoverageSummary Function = FunctionCoverageSummary::get(F);
+      Summary.addFunction(Function);
+      Totals.addFunction(Function);
+    }
+    render(Summary, OS);
+  }
   renderDivider(FileReportColumns, OS);
   OS << "\n";
-  render(Summary.getCombinedFileSummaries(), OS);
+  render(Totals, OS);
 }
diff --git a/tools/llvm-cov/CoverageReport.h b/tools/llvm-cov/CoverageReport.h
index e8d34f2..7ec3df9 100644
--- a/tools/llvm-cov/CoverageReport.h
+++ b/tools/llvm-cov/CoverageReport.h
@@ -14,24 +14,25 @@
 #ifndef LLVM_COV_COVERAGEREPORT_H
 #define LLVM_COV_COVERAGEREPORT_H
 
+#include "CoverageSummaryInfo.h"
 #include "CoverageViewOptions.h"
-#include "CoverageSummary.h"
 
 namespace llvm {
 
 /// \brief Displays the code coverage report.
 class CoverageReport {
   const CoverageViewOptions &Options;
-  CoverageSummary &Summary;
+  std::unique_ptr<coverage::CoverageMapping> Coverage;
 
   void render(const FileCoverageSummary &File, raw_ostream &OS);
   void render(const FunctionCoverageSummary &Function, raw_ostream &OS);
 
 public:
-  CoverageReport(const CoverageViewOptions &Options, CoverageSummary &Summary)
-      : Options(Options), Summary(Summary) {}
+  CoverageReport(const CoverageViewOptions &Options,
+                 std::unique_ptr<coverage::CoverageMapping> Coverage)
+      : Options(Options), Coverage(std::move(Coverage)) {}
 
-  void renderFunctionReports(raw_ostream &OS);
+  void renderFunctionReports(ArrayRef<std::string> Files, raw_ostream &OS);
 
   void renderFileReports(raw_ostream &OS);
 };
diff --git a/tools/llvm-cov/CoverageSummary.cpp b/tools/llvm-cov/CoverageSummary.cpp
deleted file mode 100644
index 059c8c8..0000000
--- a/tools/llvm-cov/CoverageSummary.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-//===- CoverageSummary.cpp - Code coverage summary ------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class implements data management and rendering for the code coverage
-// summaries of all files and functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "CoverageSummary.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Format.h"
-
-using namespace llvm;
-
-unsigned CoverageSummary::getFileID(StringRef Filename) {
-  for (unsigned I = 0, E = Filenames.size(); I < E; ++I) {
-    if (sys::fs::equivalent(Filenames[I], Filename))
-      return I;
-  }
-  Filenames.push_back(Filename);
-  return Filenames.size() - 1;
-}
-
-void
-CoverageSummary::createSummaries(const coverage::CoverageMapping &Coverage) {
-  for (StringRef Filename : Coverage.getUniqueSourceFiles()) {
-    size_t PrevSize = FunctionSummaries.size();
-    for (const auto &F : Coverage.getCoveredFunctions(Filename))
-      FunctionSummaries.push_back(FunctionCoverageSummary::get(F));
-    size_t Count = FunctionSummaries.size() - PrevSize;
-    if (Count == 0)
-      continue;
-    FileSummaries.push_back(FileCoverageSummary::get(
-        Filename, makeArrayRef(FunctionSummaries.data() + PrevSize, Count)));
-  }
-}
-
-FileCoverageSummary CoverageSummary::getCombinedFileSummaries() {
-  size_t NumRegions = 0, CoveredRegions = 0;
-  size_t NumLines = 0, NonCodeLines = 0, CoveredLines = 0;
-  size_t NumFunctionsExecuted = 0, NumFunctions = 0;
-  for (const auto &File : FileSummaries) {
-    NumRegions += File.RegionCoverage.NumRegions;
-    CoveredRegions += File.RegionCoverage.Covered;
-
-    NumLines += File.LineCoverage.NumLines;
-    NonCodeLines += File.LineCoverage.NonCodeLines;
-    CoveredLines += File.LineCoverage.Covered;
-
-    NumFunctionsExecuted += File.FunctionCoverage.Executed;
-    NumFunctions += File.FunctionCoverage.NumFunctions;
-  }
-  return FileCoverageSummary(
-      "TOTAL", RegionCoverageInfo(CoveredRegions, NumRegions),
-      LineCoverageInfo(CoveredLines, NonCodeLines, NumLines),
-      FunctionCoverageInfo(NumFunctionsExecuted, NumFunctions),
-      None);
-}
diff --git a/tools/llvm-cov/CoverageSummary.h b/tools/llvm-cov/CoverageSummary.h
deleted file mode 100644
index 9dbebde..0000000
--- a/tools/llvm-cov/CoverageSummary.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===- CoverageSummary.h - Code coverage summary --------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class implements data management and rendering for the code coverage
-// summaries of all files and functions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_COV_COVERAGESUMMARY_H
-#define LLVM_COV_COVERAGESUMMARY_H
-
-#include "CoverageSummaryInfo.h"
-#include <vector>
-
-namespace llvm {
-
-/// \brief Manager for the function and file code coverage summaries.
-class CoverageSummary {
-  std::vector<StringRef> Filenames;
-  std::vector<FunctionCoverageSummary> FunctionSummaries;
-  std::vector<std::pair<unsigned, unsigned>> FunctionSummariesFileIDs;
-  std::vector<FileCoverageSummary> FileSummaries;
-
-  unsigned getFileID(StringRef Filename);
-
-public:
-  void createSummaries(const coverage::CoverageMapping &Coverage);
-
-  ArrayRef<FileCoverageSummary> getFileSummaries() { return FileSummaries; }
-
-  FileCoverageSummary getCombinedFileSummaries();
-
-  void render(const FunctionCoverageSummary &Summary, raw_ostream &OS);
-
-  void render(raw_ostream &OS);
-};
-}
-
-#endif // LLVM_COV_COVERAGESUMMARY_H
diff --git a/tools/llvm-cov/CoverageSummaryInfo.cpp b/tools/llvm-cov/CoverageSummaryInfo.cpp
index dd78ace..de89750 100644
--- a/tools/llvm-cov/CoverageSummaryInfo.cpp
+++ b/tools/llvm-cov/CoverageSummaryInfo.cpp
@@ -69,28 +69,3 @@ FunctionCoverageSummary::get(const coverage::FunctionRecord &Function) {
       RegionCoverageInfo(CoveredRegions, NumCodeRegions),
       LineCoverageInfo(CoveredLines, 0, NumLines));
 }
-
-FileCoverageSummary
-FileCoverageSummary::get(StringRef Name,
-                         ArrayRef<FunctionCoverageSummary> FunctionSummaries) {
-  size_t NumRegions = 0, CoveredRegions = 0;
-  size_t NumLines = 0, NonCodeLines = 0, CoveredLines = 0;
-  size_t NumFunctionsExecuted = 0;
-  for (const auto &Func : FunctionSummaries) {
-    CoveredRegions += Func.RegionCoverage.Covered;
-    NumRegions += Func.RegionCoverage.NumRegions;
-
-    CoveredLines += Func.LineCoverage.Covered;
-    NonCodeLines += Func.LineCoverage.NonCodeLines;
-    NumLines += Func.LineCoverage.NumLines;
-
-    if (Func.ExecutionCount != 0)
-      ++NumFunctionsExecuted;
-  }
-
-  return FileCoverageSummary(
-      Name, RegionCoverageInfo(CoveredRegions, NumRegions),
-      LineCoverageInfo(CoveredLines, NonCodeLines, NumLines),
-      FunctionCoverageInfo(NumFunctionsExecuted, FunctionSummaries.size()),
-      FunctionSummaries);
-}
diff --git a/tools/llvm-cov/CoverageSummaryInfo.h b/tools/llvm-cov/CoverageSummaryInfo.h
index 0036032..c393b00 100644
--- a/tools/llvm-cov/CoverageSummaryInfo.h
+++ b/tools/llvm-cov/CoverageSummaryInfo.h
@@ -31,10 +31,19 @@ struct RegionCoverageInfo {
   /// \brief The total number of regions in a function/file.
   size_t NumRegions;
 
+  RegionCoverageInfo() : Covered(0), NotCovered(0), NumRegions(0) {}
+
   RegionCoverageInfo(size_t Covered, size_t NumRegions)
       : Covered(Covered), NotCovered(NumRegions - Covered),
         NumRegions(NumRegions) {}
 
+  RegionCoverageInfo &operator+=(const RegionCoverageInfo &RHS) {
+    Covered += RHS.Covered;
+    NotCovered += RHS.NotCovered;
+    NumRegions += RHS.NumRegions;
+    return *this;
+  }
+
   bool isFullyCovered() const { return Covered == NumRegions; }
 
   double getPercentCovered() const {
@@ -56,10 +65,21 @@ struct LineCoverageInfo {
   /// \brief The total number of lines in a function/file.
   size_t NumLines;
 
+  LineCoverageInfo()
+      : Covered(0), NotCovered(0), NonCodeLines(0), NumLines(0) {}
+
   LineCoverageInfo(size_t Covered, size_t NumNonCodeLines, size_t NumLines)
       : Covered(Covered), NotCovered(NumLines - NumNonCodeLines - Covered),
         NonCodeLines(NumNonCodeLines), NumLines(NumLines) {}
 
+  LineCoverageInfo &operator+=(const LineCoverageInfo &RHS) {
+    Covered += RHS.Covered;
+    NotCovered += RHS.NotCovered;
+    NonCodeLines += RHS.NonCodeLines;
+    NumLines += RHS.NumLines;
+    return *this;
+  }
+
   bool isFullyCovered() const { return Covered == (NumLines - NonCodeLines); }
 
   double getPercentCovered() const {
@@ -75,9 +95,17 @@ struct FunctionCoverageInfo {
   /// \brief The total number of functions in this file.
   size_t NumFunctions;
 
+  FunctionCoverageInfo() : Executed(0), NumFunctions(0) {}
+
   FunctionCoverageInfo(size_t Executed, size_t NumFunctions)
       : Executed(Executed), NumFunctions(NumFunctions) {}
 
+  void addFunction(bool Covered) {
+    if (Covered)
+      ++Executed;
+    ++NumFunctions;
+  }
+
   bool isFullyCovered() const { return Executed == NumFunctions; }
 
   double getPercentCovered() const {
@@ -92,6 +120,8 @@ struct FunctionCoverageSummary {
   RegionCoverageInfo RegionCoverage;
   LineCoverageInfo LineCoverage;
 
+  FunctionCoverageSummary(StringRef Name) : Name(Name), ExecutionCount(0) {}
+
   FunctionCoverageSummary(StringRef Name, uint64_t ExecutionCount,
                           const RegionCoverageInfo &RegionCoverage,
                           const LineCoverageInfo &LineCoverage)
@@ -111,21 +141,14 @@ struct FileCoverageSummary {
   RegionCoverageInfo RegionCoverage;
   LineCoverageInfo LineCoverage;
   FunctionCoverageInfo FunctionCoverage;
-  /// \brief The summary of every function
-  /// in this file.
-  ArrayRef<FunctionCoverageSummary> FunctionSummaries;
-
-  FileCoverageSummary(StringRef Name, const RegionCoverageInfo &RegionCoverage,
-                      const LineCoverageInfo &LineCoverage,
-                      const FunctionCoverageInfo &FunctionCoverage,
-                      ArrayRef<FunctionCoverageSummary> FunctionSummaries)
-      : Name(Name), RegionCoverage(RegionCoverage), LineCoverage(LineCoverage),
-        FunctionCoverage(FunctionCoverage),
-        FunctionSummaries(FunctionSummaries) {}
-
-  /// \brief Compute the code coverage summary for a file.
-  static FileCoverageSummary
-  get(StringRef Name, ArrayRef<FunctionCoverageSummary> FunctionSummaries);
+
+  FileCoverageSummary(StringRef Name) : Name(Name) {}
+
+  void addFunction(const FunctionCoverageSummary &Function) {
+    RegionCoverage += Function.RegionCoverage;
+    LineCoverage += Function.LineCoverage;
+    FunctionCoverage.addFunction(/*Covered=*/Function.ExecutionCount > 0);
+  }
 };
 
 } // namespace llvm
diff --git a/tools/llvm-cov/RenderingSupport.h b/tools/llvm-cov/RenderingSupport.h
index 0271329..3ef155d 100644
--- a/tools/llvm-cov/RenderingSupport.h
+++ b/tools/llvm-cov/RenderingSupport.h
@@ -18,7 +18,7 @@ namespace llvm {
 /// \brief A helper class that resets the output stream's color if needed
 /// when destroyed.
 class ColoredRawOstream {
-  ColoredRawOstream(const ColoredRawOstream &OS) LLVM_DELETED_FUNCTION;
+  ColoredRawOstream(const ColoredRawOstream &OS) = delete;
 
 public:
   raw_ostream &OS;
diff --git a/tools/llvm-cov/SourceCoverageView.h b/tools/llvm-cov/SourceCoverageView.h
index d92a748..9e6fe5f 100644
--- a/tools/llvm-cov/SourceCoverageView.h
+++ b/tools/llvm-cov/SourceCoverageView.h
@@ -90,15 +90,14 @@ private:
     bool hasMultipleRegions() const { return RegionCount > 1; }
 
     void addRegionStartCount(uint64_t Count) {
-      Mapped = true;
-      ExecutionCount = Count;
+      // The max of all region starts is the most interesting value.
+      addRegionCount(RegionCount ? std::max(ExecutionCount, Count) : Count);
       ++RegionCount;
     }
 
     void addRegionCount(uint64_t Count) {
       Mapped = true;
-      if (!RegionCount)
-        ExecutionCount = Count;
+      ExecutionCount = Count;
     }
   };
 
diff --git a/tools/llvm-cov/TestingSupport.cpp b/tools/llvm-cov/TestingSupport.cpp
index 537f133..6959897 100644
--- a/tools/llvm-cov/TestingSupport.cpp
+++ b/tools/llvm-cov/TestingSupport.cpp
@@ -8,15 +8,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/LEB128.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/LEB128.h"
 #include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MemoryObject.h"
-#include "llvm/Support/Signals.h"
 #include "llvm/Support/PrettyStackTrace.h"
-#include <system_error>
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/raw_ostream.h"
 #include <functional>
+#include <system_error>
 
 using namespace llvm;
 using namespace object;
diff --git a/tools/llvm-cov/gcov.cpp b/tools/llvm-cov/gcov.cpp
index 4c9195a..c0a48f8 100644
--- a/tools/llvm-cov/gcov.cpp
+++ b/tools/llvm-cov/gcov.cpp
@@ -17,7 +17,6 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/GCOV.h"
 #include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Signals.h"
@@ -81,7 +80,7 @@ void reportCoverage(StringRef SourceFile, StringRef ObjectDir,
 
   FileInfo FI(Options);
   GF.collectLineCounts(FI);
-  FI.print(SourceFile, GCNO, GCDA);
+  FI.print(llvm::outs(), SourceFile, GCNO, GCDA);
 }
 
 int gcovMain(int argc, const char *argv[]) {
diff --git a/tools/llvm-cov/llvm-cov.cpp b/tools/llvm-cov/llvm-cov.cpp
index a67859e..86ec26d 100644
--- a/tools/llvm-cov/llvm-cov.cpp
+++ b/tools/llvm-cov/llvm-cov.cpp
@@ -13,8 +13,8 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
 #include <string>
 
 using namespace llvm;
diff --git a/tools/llvm-dis/llvm-dis.cpp b/tools/llvm-dis/llvm-dis.cpp
index fb73717..1c3a9ce 100644
--- a/tools/llvm-dis/llvm-dis.cpp
+++ b/tools/llvm-dis/llvm-dis.cpp
@@ -20,6 +20,8 @@
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/IR/AssemblyAnnotationWriter.h"
 #include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -112,6 +114,24 @@ public:
 
 } // end anon namespace
 
+static void diagnosticHandler(const DiagnosticInfo &DI, void *Context) {
+  raw_ostream &OS = errs();
+  OS << (char *)Context << ": ";
+  switch (DI.getSeverity()) {
+  case DS_Error: OS << "error: "; break;
+  case DS_Warning: OS << "warning: "; break;
+  case DS_Remark: OS << "remark: "; break;
+  case DS_Note: OS << "note: "; break;
+  }
+
+  DiagnosticPrinterRawOStream DP(OS);
+  DI.print(DP);
+  OS << '\n';
+
+  if (DI.getSeverity() == DS_Error)
+    exit(1);
+}
+
 int main(int argc, char **argv) {
   // Print a stack trace if we signal out.
   sys::PrintStackTraceOnErrorSignal();
@@ -120,6 +140,7 @@ int main(int argc, char **argv) {
   LLVMContext &Context = getGlobalContext();
   llvm_shutdown_obj Y;  // Call llvm_shutdown() on exit.
 
+  Context.setDiagnosticHandler(diagnosticHandler, argv[0]);
 
   cl::ParseCommandLineOptions(argc, argv, "llvm .bc -> .ll disassembler\n");
 
@@ -127,30 +148,17 @@ int main(int argc, char **argv) {
   std::unique_ptr<Module> M;
 
   // Use the bitcode streaming interface
-  DataStreamer *streamer = getDataFileStreamer(InputFilename, &ErrorMessage);
-  if (streamer) {
+  DataStreamer *Streamer = getDataFileStreamer(InputFilename, &ErrorMessage);
+  if (Streamer) {
     std::string DisplayFilename;
     if (InputFilename == "-")
       DisplayFilename = "<stdin>";
     else
       DisplayFilename = InputFilename;
-    M.reset(getStreamedBitcodeModule(DisplayFilename, streamer, Context,
-                                     &ErrorMessage));
-    if(M.get()) {
-      if (std::error_code EC = M->materializeAllPermanently()) {
-        ErrorMessage = EC.message();
-        M.reset();
-      }
-    }
-  }
-
-  if (!M.get()) {
-    errs() << argv[0] << ": ";
-    if (ErrorMessage.size())
-      errs() << ErrorMessage << "\n";
-    else
-      errs() << "bitcode didn't read correctly.\n";
-    return 1;
+    ErrorOr<std::unique_ptr<Module>> MOrErr =
+        getStreamedBitcodeModule(DisplayFilename, Streamer, Context);
+    M = std::move(*MOrErr);
+    M->materializeAllPermanently();
   }
 
   // Just use stdout.  We won't actually print anything on it.
diff --git a/tools/llvm-dwarfdump/Android.mk b/tools/llvm-dwarfdump/Android.mk
index 61049e8..225004a 100644
--- a/tools/llvm-dwarfdump/Android.mk
+++ b/tools/llvm-dwarfdump/Android.mk
@@ -11,7 +11,8 @@ llvm_dwarfdump_SRC_FILES := \
   llvm-dwarfdump.cpp
 
 llvm_dwarfdump_STATIC_LIBRARIES := \
-  libLLVMDebugInfo                 \
+  libLLVMDebugInfoDWARF            \
+  libLLVMDebugInfoPDB              \
   libLLVMObject                    \
   libLLVMBitReader                 \
   libLLVMMC                        \
diff --git a/tools/llvm-dwarfdump/CMakeLists.txt b/tools/llvm-dwarfdump/CMakeLists.txt
index 288b323..086b139 100644
--- a/tools/llvm-dwarfdump/CMakeLists.txt
+++ b/tools/llvm-dwarfdump/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(LLVM_LINK_COMPONENTS
-  DebugInfo
+  DebugInfoDWARF
   Object
   Support
   )
diff --git a/tools/llvm-dwarfdump/LLVMBuild.txt b/tools/llvm-dwarfdump/LLVMBuild.txt
index 28b7c4c..a9dca3e 100644
--- a/tools/llvm-dwarfdump/LLVMBuild.txt
+++ b/tools/llvm-dwarfdump/LLVMBuild.txt
@@ -19,4 +19,4 @@
 type = Tool
 name = llvm-dwarfdump
 parent = Tools
-required_libraries = DebugInfo Object
+required_libraries = DebugInfoDWARF Object
diff --git a/tools/llvm-dwarfdump/Makefile b/tools/llvm-dwarfdump/Makefile
index 7ca1a8d..00b18c6 100644
--- a/tools/llvm-dwarfdump/Makefile
+++ b/tools/llvm-dwarfdump/Makefile
@@ -9,7 +9,7 @@
 
 LEVEL := ../..
 TOOLNAME := llvm-dwarfdump
-LINK_COMPONENTS := DebugInfo Object
+LINK_COMPONENTS := DebugInfoDWARF Object
 
 # This tool has no plugins, optimize startup time.
 TOOL_NO_EXPORTS := 1
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index 1c540c9..a99f57f 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -13,7 +13,7 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DIContext.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Object/RelocVisitor.h"
 #include "llvm/Support/CommandLine.h"
@@ -21,7 +21,6 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/tools/llvm-extract/llvm-extract.cpp b/tools/llvm-extract/llvm-extract.cpp
index 53b2f0d..8cbcdc5 100644
--- a/tools/llvm-extract/llvm-extract.cpp
+++ b/tools/llvm-extract/llvm-extract.cpp
@@ -20,7 +20,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
@@ -246,7 +246,7 @@ int main(int argc, char **argv) {
 
   // In addition to deleting all other functions, we also want to spiff it
   // up a little bit.  Do this now.
-  PassManager Passes;
+  legacy::PassManager Passes;
   Passes.add(new DataLayoutPass()); // Use correct DataLayout
 
   std::vector<GlobalValue*> Gvs(GVs.begin(), GVs.end());
diff --git a/tools/llvm-go/llvm-go.go b/tools/llvm-go/llvm-go.go
index 47f9481..c5c3fd2 100644
--- a/tools/llvm-go/llvm-go.go
+++ b/tools/llvm-go/llvm-go.go
@@ -30,6 +30,7 @@ type pkg struct {
 
 var packages = []pkg{
 	{"bindings/go/llvm", "llvm.org/llvm/bindings/go/llvm"},
+	{"tools/llgo", "llvm.org/llgo"},
 }
 
 type compilerFlags struct {
@@ -45,7 +46,7 @@ var components = []string{
 	"bitwriter",
 	"codegen",
 	"core",
-	"debuginfo",
+	"debuginfodwarf",
 	"executionengine",
 	"instrumentation",
 	"interpreter",
@@ -136,7 +137,7 @@ type (run_build_sh int)
 `, flags.cpp, flags.cxx, flags.ld)
 }
 
-func runGoWithLLVMEnv(args []string, cc, cxx, cppflags, cxxflags, ldflags string) {
+func runGoWithLLVMEnv(args []string, cc, cxx, gocmd, llgo, cppflags, cxxflags, ldflags string) {
 	args = addTag(args, "byollvm")
 
 	srcdir := llvmConfig("--src-root")
@@ -159,6 +160,8 @@ func runGoWithLLVMEnv(args []string, cc, cxx, cppflags, cxxflags, ldflags string
 		}
 	}
 
+	newpath := os.Getenv("PATH")
+
 	newgopathlist := []string{tmpgopath}
 	newgopathlist = append(newgopathlist, filepath.SplitList(os.Getenv("GOPATH"))...)
 	newgopath := strings.Join(newgopathlist, string(filepath.ListSeparator))
@@ -172,24 +175,31 @@ func runGoWithLLVMEnv(args []string, cc, cxx, cppflags, cxxflags, ldflags string
 		"CGO_CXXFLAGS=" + flags.cxx + " " + cxxflags,
 		"CGO_LDFLAGS=" + flags.ld + " " + ldflags,
 		"GOPATH=" + newgopath,
+		"PATH=" + newpath,
+	}
+	if llgo != "" {
+		newenv = append(newenv, "GCCGO=" + llgo)
 	}
+
 	for _, v := range os.Environ() {
 		if !strings.HasPrefix(v, "CC=") &&
 			!strings.HasPrefix(v, "CXX=") &&
 			!strings.HasPrefix(v, "CGO_CPPFLAGS=") &&
 			!strings.HasPrefix(v, "CGO_CXXFLAGS=") &&
 			!strings.HasPrefix(v, "CGO_LDFLAGS=") &&
-			!strings.HasPrefix(v, "GOPATH=") {
+			!strings.HasPrefix(v, "GCCGO=") &&
+			!strings.HasPrefix(v, "GOPATH=") &&
+			!strings.HasPrefix(v, "PATH=") {
 			newenv = append(newenv, v)
 		}
 	}
 
-	gocmdpath, err := exec.LookPath("go")
+	gocmdpath, err := exec.LookPath(gocmd)
 	if err != nil {
 		panic(err.Error())
 	}
 
-	proc, err := os.StartProcess(gocmdpath, append([]string{"go"}, args...),
+	proc, err := os.StartProcess(gocmdpath, append([]string{gocmd}, args...),
 		&os.ProcAttr{
 			Env:   newenv,
 			Files: []*os.File{os.Stdin, os.Stdout, os.Stderr},
@@ -222,6 +232,8 @@ func main() {
 	cppflags := os.Getenv("CGO_CPPFLAGS")
 	cxxflags := os.Getenv("CGO_CXXFLAGS")
 	ldflags := os.Getenv("CGO_LDFLAGS")
+	gocmd := "go"
+	llgo := ""
 
 	args := os.Args[1:]
 	DONE: for {
@@ -234,6 +246,12 @@ func main() {
 		case strings.HasPrefix(args[0], "cxx="):
 			cxx = args[0][4:]
 			args = args[1:]
+		case strings.HasPrefix(args[0], "go="):
+			gocmd = args[0][3:]
+			args = args[1:]
+		case strings.HasPrefix(args[0], "llgo="):
+			llgo = args[0][5:]
+			args = args[1:]
 		case strings.HasPrefix(args[0], "cppflags="):
 			cppflags = args[0][9:]
 			args = args[1:]
@@ -250,7 +268,7 @@ func main() {
 
 	switch args[0] {
 	case "build", "get", "install", "run", "test":
-		runGoWithLLVMEnv(args, cc, cxx, cppflags, cxxflags, ldflags)
+		runGoWithLLVMEnv(args, cc, cxx, gocmd, llgo, cppflags, cxxflags, ldflags)
 	case "print-components":
 		printComponents()
 	case "print-config":
diff --git a/tools/llvm-jitlistener/Makefile b/tools/llvm-jitlistener/Makefile
index 6d72427..06132f9 100644
--- a/tools/llvm-jitlistener/Makefile
+++ b/tools/llvm-jitlistener/Makefile
@@ -18,7 +18,7 @@ LINK_COMPONENTS := mcjit interpreter nativecodegen bitreader asmparser irreader
 # Events interface library.  If not, this tool will do nothing useful, but it
 # will build correctly.
 ifeq ($(USE_INTEL_JITEVENTS), 1)
-  LINK_COMPONENTS += debuginfo inteljitevents
+  LINK_COMPONENTS += debuginfodwarf inteljitevents
 endif
 
 # This tool has no plugins, optimize startup time.
diff --git a/tools/llvm-jitlistener/llvm-jitlistener.cpp b/tools/llvm-jitlistener/llvm-jitlistener.cpp
index 0bb6e8b..c3091a5 100644
--- a/tools/llvm-jitlistener/llvm-jitlistener.cpp
+++ b/tools/llvm-jitlistener/llvm-jitlistener.cpp
@@ -17,12 +17,12 @@
 #include "../../lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ExecutionEngine/JITEventListener.h"
-#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
-#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -140,7 +140,7 @@ protected:
     TheJIT.reset(EngineBuilder(std::move(TheModule))
       .setEngineKind(EngineKind::JIT)
       .setErrorStr(&Error)
-      .setMCJITMemoryManager(MemMgr)
+      .setMCJITMemoryManager(std::unique_ptr<RTDyldMemoryManager>(MemMgr))
       .create());
     if (Error.empty() == false)
       errs() << Error;
diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp
index 3c950ba..32b3134 100644
--- a/tools/llvm-lto/llvm-lto.cpp
+++ b/tools/llvm-lto/llvm-lto.cpp
@@ -65,6 +65,14 @@ DSOSymbols("dso-symbol",
   cl::desc("Symbol to put in the symtab in the resulting dso"),
   cl::ZeroOrMore);
 
+static cl::opt<bool> ListSymbolsOnly(
+    "list-symbols-only", cl::init(false),
+    cl::desc("Instead of running LTO, list the symbols in each IR file"));
+
+static cl::opt<bool> SetMergedModule(
+    "set-merged-module", cl::init(false),
+    cl::desc("Use the first input module as the merged module"));
+
 namespace {
 struct ModuleInfo {
   std::vector<bool> CanBeHidden;
@@ -90,6 +98,46 @@ void handleDiagnostics(lto_codegen_diagnostic_severity_t Severity,
   errs() << Msg << "\n";
 }
 
+std::unique_ptr<LTOModule>
+getLocalLTOModule(StringRef Path, std::unique_ptr<MemoryBuffer> &Buffer,
+                  const TargetOptions &Options, std::string &Error) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+      MemoryBuffer::getFile(Path);
+  if (std::error_code EC = BufferOrErr.getError()) {
+    Error = EC.message();
+    return nullptr;
+  }
+  Buffer = std::move(BufferOrErr.get());
+  return std::unique_ptr<LTOModule>(LTOModule::createInLocalContext(
+      Buffer->getBufferStart(), Buffer->getBufferSize(), Options, Error, Path));
+}
+
+/// \brief List symbols in each IR file.
+///
+/// The main point here is to provide lit-testable coverage for the LTOModule
+/// functionality that's exposed by the C API to list symbols.  Moreover, this
+/// provides testing coverage for modules that have been created in their own
+/// contexts.
+int listSymbols(StringRef Command, const TargetOptions &Options) {
+  for (auto &Filename : InputFilenames) {
+    std::string Error;
+    std::unique_ptr<MemoryBuffer> Buffer;
+    std::unique_ptr<LTOModule> Module =
+        getLocalLTOModule(Filename, Buffer, Options, Error);
+    if (!Module) {
+      errs() << Command << ": error loading file '" << Filename
+             << "': " << Error << "\n";
+      return 1;
+    }
+
+    // List the symbols.
+    outs() << Filename << ":\n";
+    for (int I = 0, E = Module->getSymbolCount(); I != E; ++I)
+      outs() << Module->getSymbolName(I) << "\n";
+  }
+  return 0;
+}
+
 int main(int argc, char **argv) {
   // Print a stack trace if we signal out.
   sys::PrintStackTraceOnErrorSignal();
@@ -107,6 +155,9 @@ int main(int argc, char **argv) {
   // set up the TargetOptions for the machine
   TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
 
+  if (ListSymbolsOnly)
+    return listSymbols(argv[0], Options);
+
   unsigned BaseArg = 0;
 
   LTOCodeGenerator CodeGen;
@@ -147,15 +198,22 @@ int main(int argc, char **argv) {
       return 1;
     }
 
-    if (!CodeGen.addModule(Module.get()))
+    LTOModule *LTOMod = Module.get();
+
+    // We use the first input module as the destination module when
+    // SetMergedModule is true.
+    if (SetMergedModule && i == BaseArg) {
+      // Transfer ownership to the code generator.
+      CodeGen.setModule(Module.release());
+    } else if (!CodeGen.addModule(Module.get()))
       return 1;
 
-    unsigned NumSyms = Module->getSymbolCount();
+    unsigned NumSyms = LTOMod->getSymbolCount();
     for (unsigned I = 0; I < NumSyms; ++I) {
-      StringRef Name = Module->getSymbolName(I);
+      StringRef Name = LTOMod->getSymbolName(I);
       if (!DSOSymbolsSet.count(Name))
         continue;
-      lto_symbol_attributes Attrs = Module->getSymbolAttributes(I);
+      lto_symbol_attributes Attrs = LTOMod->getSymbolAttributes(I);
       unsigned Scope = Attrs & LTO_SYMBOL_SCOPE_MASK;
       if (Scope != LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN)
         KeptDSOSyms.push_back(Name);
@@ -170,6 +228,9 @@ int main(int argc, char **argv) {
   for (unsigned i = 0; i < KeptDSOSyms.size(); ++i)
     CodeGen.addMustPreserveSymbol(KeptDSOSyms[i].c_str());
 
+  // Set cpu and attrs strings for the default target/subtarget.
+  CodeGen.setCpu(MCPU.c_str());
+
   std::string attrs;
   for (unsigned i = 0; i < MAttrs.size(); ++i) {
     if (i > 0)
diff --git a/tools/llvm-mc/Android.mk b/tools/llvm-mc/Android.mk
index e6de9eb..ad83637 100644
--- a/tools/llvm-mc/Android.mk
+++ b/tools/llvm-mc/Android.mk
@@ -37,6 +37,7 @@ llvm_mc_STATIC_LIBRARIES := \
   libLLVMX86CodeGen \
   libLLVMAsmPrinter \
   libLLVMCodeGen \
+  libLLVMTransformUtils \
   libLLVMAnalysis \
   libLLVMTarget \
   libLLVMMC \
diff --git a/tools/llvm-mc/Disassembler.cpp b/tools/llvm-mc/Disassembler.cpp
index 95d146a..5ffeffc 100644
--- a/tools/llvm-mc/Disassembler.cpp
+++ b/tools/llvm-mc/Disassembler.cpp
@@ -36,7 +36,6 @@ static bool PrintInsts(const MCDisassembler &DisAsm,
                        SourceMgr &SM, raw_ostream &Out,
                        MCStreamer &Streamer, bool InAtomicBlock,
                        const MCSubtargetInfo &STI) {
-  // Wrap the vector in a MemoryObject.
   ArrayRef<uint8_t> Data(Bytes.first.data(), Bytes.first.size());
 
   // Disassemble it to strings.
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 5da9e86..91bacc0 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -204,16 +204,15 @@ static const Target *GetTarget(const char *ProgName) {
   return TheTarget;
 }
 
-static tool_output_file *GetOutputStream() {
+static std::unique_ptr<tool_output_file> GetOutputStream() {
   if (OutputFilename == "")
     OutputFilename = "-";
 
   std::error_code EC;
-  tool_output_file *Out =
-      new tool_output_file(OutputFilename, EC, sys::fs::F_None);
+  auto Out = llvm::make_unique<tool_output_file>(OutputFilename, EC,
+                                                 sys::fs::F_None);
   if (EC) {
     errs() << EC.message() << '\n';
-    delete Out;
     return nullptr;
   }
 
@@ -239,7 +238,7 @@ static void setDwarfDebugProducer(void) {
 }
 
 static int AsLexInput(SourceMgr &SrcMgr, MCAsmInfo &MAI,
-                      tool_output_file *Out) {
+                      raw_ostream &OS) {
 
   AsmLexer Lexer(MAI);
   Lexer.setBuffer(SrcMgr.getMemoryBuffer(SrcMgr.getMainFileID())->getBuffer());
@@ -258,60 +257,60 @@ static int AsLexInput(SourceMgr &SrcMgr, MCAsmInfo &MAI,
       Error = true; // error already printed.
       break;
     case AsmToken::Identifier:
-      Out->os() << "identifier: " << Lexer.getTok().getString();
+      OS << "identifier: " << Lexer.getTok().getString();
       break;
     case AsmToken::Integer:
-      Out->os() << "int: " << Lexer.getTok().getString();
+      OS << "int: " << Lexer.getTok().getString();
       break;
     case AsmToken::Real:
-      Out->os() << "real: " << Lexer.getTok().getString();
+      OS << "real: " << Lexer.getTok().getString();
       break;
     case AsmToken::String:
-      Out->os() << "string: " << Lexer.getTok().getString();
+      OS << "string: " << Lexer.getTok().getString();
       break;
 
-    case AsmToken::Amp:            Out->os() << "Amp"; break;
-    case AsmToken::AmpAmp:         Out->os() << "AmpAmp"; break;
-    case AsmToken::At:             Out->os() << "At"; break;
-    case AsmToken::Caret:          Out->os() << "Caret"; break;
-    case AsmToken::Colon:          Out->os() << "Colon"; break;
-    case AsmToken::Comma:          Out->os() << "Comma"; break;
-    case AsmToken::Dollar:         Out->os() << "Dollar"; break;
-    case AsmToken::Dot:            Out->os() << "Dot"; break;
-    case AsmToken::EndOfStatement: Out->os() << "EndOfStatement"; break;
-    case AsmToken::Eof:            Out->os() << "Eof"; break;
-    case AsmToken::Equal:          Out->os() << "Equal"; break;
-    case AsmToken::EqualEqual:     Out->os() << "EqualEqual"; break;
-    case AsmToken::Exclaim:        Out->os() << "Exclaim"; break;
-    case AsmToken::ExclaimEqual:   Out->os() << "ExclaimEqual"; break;
-    case AsmToken::Greater:        Out->os() << "Greater"; break;
-    case AsmToken::GreaterEqual:   Out->os() << "GreaterEqual"; break;
-    case AsmToken::GreaterGreater: Out->os() << "GreaterGreater"; break;
-    case AsmToken::Hash:           Out->os() << "Hash"; break;
-    case AsmToken::LBrac:          Out->os() << "LBrac"; break;
-    case AsmToken::LCurly:         Out->os() << "LCurly"; break;
-    case AsmToken::LParen:         Out->os() << "LParen"; break;
-    case AsmToken::Less:           Out->os() << "Less"; break;
-    case AsmToken::LessEqual:      Out->os() << "LessEqual"; break;
-    case AsmToken::LessGreater:    Out->os() << "LessGreater"; break;
-    case AsmToken::LessLess:       Out->os() << "LessLess"; break;
-    case AsmToken::Minus:          Out->os() << "Minus"; break;
-    case AsmToken::Percent:        Out->os() << "Percent"; break;
-    case AsmToken::Pipe:           Out->os() << "Pipe"; break;
-    case AsmToken::PipePipe:       Out->os() << "PipePipe"; break;
-    case AsmToken::Plus:           Out->os() << "Plus"; break;
-    case AsmToken::RBrac:          Out->os() << "RBrac"; break;
-    case AsmToken::RCurly:         Out->os() << "RCurly"; break;
-    case AsmToken::RParen:         Out->os() << "RParen"; break;
-    case AsmToken::Slash:          Out->os() << "Slash"; break;
-    case AsmToken::Star:           Out->os() << "Star"; break;
-    case AsmToken::Tilde:          Out->os() << "Tilde"; break;
+    case AsmToken::Amp:            OS << "Amp"; break;
+    case AsmToken::AmpAmp:         OS << "AmpAmp"; break;
+    case AsmToken::At:             OS << "At"; break;
+    case AsmToken::Caret:          OS << "Caret"; break;
+    case AsmToken::Colon:          OS << "Colon"; break;
+    case AsmToken::Comma:          OS << "Comma"; break;
+    case AsmToken::Dollar:         OS << "Dollar"; break;
+    case AsmToken::Dot:            OS << "Dot"; break;
+    case AsmToken::EndOfStatement: OS << "EndOfStatement"; break;
+    case AsmToken::Eof:            OS << "Eof"; break;
+    case AsmToken::Equal:          OS << "Equal"; break;
+    case AsmToken::EqualEqual:     OS << "EqualEqual"; break;
+    case AsmToken::Exclaim:        OS << "Exclaim"; break;
+    case AsmToken::ExclaimEqual:   OS << "ExclaimEqual"; break;
+    case AsmToken::Greater:        OS << "Greater"; break;
+    case AsmToken::GreaterEqual:   OS << "GreaterEqual"; break;
+    case AsmToken::GreaterGreater: OS << "GreaterGreater"; break;
+    case AsmToken::Hash:           OS << "Hash"; break;
+    case AsmToken::LBrac:          OS << "LBrac"; break;
+    case AsmToken::LCurly:         OS << "LCurly"; break;
+    case AsmToken::LParen:         OS << "LParen"; break;
+    case AsmToken::Less:           OS << "Less"; break;
+    case AsmToken::LessEqual:      OS << "LessEqual"; break;
+    case AsmToken::LessGreater:    OS << "LessGreater"; break;
+    case AsmToken::LessLess:       OS << "LessLess"; break;
+    case AsmToken::Minus:          OS << "Minus"; break;
+    case AsmToken::Percent:        OS << "Percent"; break;
+    case AsmToken::Pipe:           OS << "Pipe"; break;
+    case AsmToken::PipePipe:       OS << "PipePipe"; break;
+    case AsmToken::Plus:           OS << "Plus"; break;
+    case AsmToken::RBrac:          OS << "RBrac"; break;
+    case AsmToken::RCurly:         OS << "RCurly"; break;
+    case AsmToken::RParen:         OS << "RParen"; break;
+    case AsmToken::Slash:          OS << "Slash"; break;
+    case AsmToken::Star:           OS << "Star"; break;
+    case AsmToken::Tilde:          OS << "Tilde"; break;
     }
 
     // Print the token string.
-    Out->os() << " (\"";
-    Out->os().write_escaped(Tok.getString());
-    Out->os() << "\")\n";
+    OS << " (\"";
+    OS.write_escaped(Tok.getString());
+    OS << "\")\n";
   }
 
   return Error;
@@ -333,7 +332,7 @@ static int AssembleInput(const char *ProgName, const Target *TheTarget,
   }
 
   Parser->setShowParsedOperands(ShowInstOperands);
-  Parser->setTargetParser(*TAP.get());
+  Parser->setTargetParser(*TAP);
 
   int Res = Parser->Run(NoInitialTextSection);
 
@@ -435,7 +434,7 @@ int main(int argc, char **argv) {
     FeaturesStr = Features.getString();
   }
 
-  std::unique_ptr<tool_output_file> Out(GetOutputStream());
+  std::unique_ptr<tool_output_file> Out = GetOutputStream();
   if (!Out)
     return 1;
 
@@ -466,7 +465,7 @@ int main(int argc, char **argv) {
                                            MAB, ShowInst));
 
   } else if (FileType == OFT_Null) {
-    Str.reset(createNullStreamer(Ctx));
+    Str.reset(TheTarget->createNullStreamer(Ctx));
   } else {
     assert(FileType == OFT_ObjectFile && "Invalid file type!");
     MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, *STI, Ctx);
@@ -481,7 +480,7 @@ int main(int argc, char **argv) {
   bool disassemble = false;
   switch (Action) {
   case AC_AsLex:
-    Res = AsLexInput(SrcMgr, *MAI, Out.get());
+    Res = AsLexInput(SrcMgr, *MAI, Out->os());
     break;
   case AC_Assemble:
     Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, *Str, *MAI, *STI,
diff --git a/tools/llvm-nm/Android.mk b/tools/llvm-nm/Android.mk
index a17ca4d..e3b66f0 100644
--- a/tools/llvm-nm/Android.mk
+++ b/tools/llvm-nm/Android.mk
@@ -38,6 +38,7 @@ llvm_nm_STATIC_LIBRARIES := \
   libLLVMX86Utils \
   libLLVMX86Disassembler \
   libLLVMCodeGen \
+  libLLVMTransformUtils \
   libLLVMAnalysis \
   libLLVMTarget \
   libLLVMObject             \
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index be2c4fa..f911c72 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -36,8 +36,8 @@
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/Signals.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cctype>
 #include <cerrno>
@@ -1069,7 +1069,6 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
             ArchFound = true;
             ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr =
                 I->getAsObjectFile();
-            std::unique_ptr<Archive> A;
             std::string ArchiveName;
             std::string ArchitectureName;
             ArchiveName.clear();
@@ -1086,7 +1085,9 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
               }
               dumpSymbolNamesFromObject(Obj, false, ArchiveName,
                                         ArchitectureName);
-            } else if (!I->getAsArchive(A)) {
+            } else if (ErrorOr<std::unique_ptr<Archive>> AOrErr =
+                           I->getAsArchive()) {
+              std::unique_ptr<Archive> &A = *AOrErr;
               for (Archive::child_iterator AI = A->child_begin(),
                                            AE = A->child_end();
                    AI != AE; ++AI) {
@@ -1133,13 +1134,14 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
            I != E; ++I) {
         if (HostArchName == I->getArchTypeName()) {
           ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr = I->getAsObjectFile();
-          std::unique_ptr<Archive> A;
           std::string ArchiveName;
           ArchiveName.clear();
           if (ObjOrErr) {
             ObjectFile &Obj = *ObjOrErr.get();
             dumpSymbolNamesFromObject(Obj, false);
-          } else if (!I->getAsArchive(A)) {
+          } else if (ErrorOr<std::unique_ptr<Archive>> AOrErr =
+                         I->getAsArchive()) {
+            std::unique_ptr<Archive> &A = *AOrErr;
             for (Archive::child_iterator AI = A->child_begin(),
                                          AE = A->child_end();
                  AI != AE; ++AI) {
@@ -1170,7 +1172,6 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
                                                E = UB->end_objects();
          I != E; ++I) {
       ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr = I->getAsObjectFile();
-      std::unique_ptr<Archive> A;
       std::string ArchiveName;
       std::string ArchitectureName;
       ArchiveName.clear();
@@ -1189,7 +1190,8 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
           outs() << ":\n";
         }
         dumpSymbolNamesFromObject(Obj, false, ArchiveName, ArchitectureName);
-      } else if (!I->getAsArchive(A)) {
+      } else if (ErrorOr<std::unique_ptr<Archive>> AOrErr = I->getAsArchive()) {
+        std::unique_ptr<Archive> &A = *AOrErr;
         for (Archive::child_iterator AI = A->child_begin(), AE = A->child_end();
              AI != AE; ++AI) {
           ErrorOr<std::unique_ptr<Binary>> ChildOrErr =
diff --git a/tools/llvm-objdump/Android.mk b/tools/llvm-objdump/Android.mk
index 077e0ee..3b5eafd 100644
--- a/tools/llvm-objdump/Android.mk
+++ b/tools/llvm-objdump/Android.mk
@@ -14,7 +14,8 @@ llvm_objdump_SRC_FILES := \
   MachODump.cpp \
 
 llvm_objdump_STATIC_LIBRARIES := \
-  libLLVMDebugInfo \
+  libLLVMDebugInfoDWARF \
+  libLLVMDebugInfoPDB \
   libLLVMARMAsmParser \
   libLLVMARMInfo \
   libLLVMARMDesc \
@@ -40,6 +41,7 @@ llvm_objdump_STATIC_LIBRARIES := \
   libLLVMX86Disassembler \
   libLLVMAsmPrinter \
   libLLVMCodeGen \
+  libLLVMTransformUtils \
   libLLVMAnalysis \
   libLLVMTarget \
   libLLVMObject \
diff --git a/tools/llvm-objdump/CMakeLists.txt b/tools/llvm-objdump/CMakeLists.txt
index 61bf3b3..d717653 100644
--- a/tools/llvm-objdump/CMakeLists.txt
+++ b/tools/llvm-objdump/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
-  DebugInfo
+  DebugInfoDWARF
   MC
   MCDisassembler
   Object
diff --git a/tools/llvm-objdump/LLVMBuild.txt b/tools/llvm-objdump/LLVMBuild.txt
index d16c501..4232873 100644
--- a/tools/llvm-objdump/LLVMBuild.txt
+++ b/tools/llvm-objdump/LLVMBuild.txt
@@ -19,4 +19,4 @@
 type = Tool
 name = llvm-objdump
 parent = Tools
-required_libraries = DebugInfo MC MCDisassembler MCParser Object all-targets
+required_libraries = DebugInfoDWARF MC MCDisassembler MCParser Object all-targets
diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp
index 3a28703..cae4356 100644
--- a/tools/llvm-objdump/MachODump.cpp
+++ b/tools/llvm-objdump/MachODump.cpp
@@ -17,7 +17,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Config/config.h"
-#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DIContext.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler.h"
@@ -28,15 +28,17 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Object/MachO.h"
+#include "llvm/Object/MachOUniversal.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/LEB128.h"
 #include "llvm/Support/MachO.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/raw_ostream.h"
@@ -65,6 +67,40 @@ static cl::opt<bool>
     PrintImmHex("print-imm-hex",
                 cl::desc("Use hex format for immediate values"));
 
+cl::opt<bool> llvm::UniversalHeaders("universal-headers",
+                                     cl::desc("Print Mach-O universal headers "
+                                              "(requires -macho)"));
+
+cl::opt<bool>
+    llvm::ArchiveHeaders("archive-headers",
+                         cl::desc("Print archive headers for Mach-O archives "
+                                  "(requires -macho)"));
+
+cl::opt<bool>
+    llvm::IndirectSymbols("indirect-symbols",
+                          cl::desc("Print indirect symbol table for Mach-O "
+                                   "objects (requires -macho)"));
+
+cl::opt<bool>
+    llvm::DataInCode("data-in-code",
+                     cl::desc("Print the data in code table for Mach-O objects "
+                              "(requires -macho)"));
+
+cl::opt<bool>
+    llvm::LinkOptHints("link-opt-hints",
+                       cl::desc("Print the linker optimization hints for "
+                                "Mach-O objects (requires -macho)"));
+
+cl::list<std::string>
+    llvm::DumpSections("section",
+                       cl::desc("Prints the specified segment,section for "
+                                "Mach-O objects (requires -macho)"));
+
+static cl::list<std::string>
+    ArchFlags("arch", cl::desc("architecture(s) from a Mach-O file to dump"),
+              cl::ZeroOrMore);
+bool ArchAll = false;
+
 static std::string ThumbTripleName;
 
 static const Target *GetTarget(const MachOObjectFile *MachOObj,
@@ -205,8 +241,12 @@ static void getSectionsAndSymbols(const MachO::mach_header Header,
                                   std::vector<SymbolRef> &Symbols,
                                   SmallVectorImpl<uint64_t> &FoundFns,
                                   uint64_t &BaseSegmentAddress) {
-  for (const SymbolRef &Symbol : MachOObj->symbols())
-    Symbols.push_back(Symbol);
+  for (const SymbolRef &Symbol : MachOObj->symbols()) {
+    StringRef SymName;
+    Symbol.getName(SymName);
+    if (!SymName.startswith("ltmp"))
+      Symbols.push_back(Symbol);
+  }
 
   for (const SectionRef &Section : MachOObj->sections()) {
     StringRef SectName;
@@ -241,25 +281,1298 @@ static void getSectionsAndSymbols(const MachO::mach_header Header,
   }
 }
 
-static void DisassembleInputMachO2(StringRef Filename,
-                                   MachOObjectFile *MachOOF);
+static void PrintIndirectSymbolTable(MachOObjectFile *O, bool verbose,
+                                     uint32_t n, uint32_t count,
+                                     uint32_t stride, uint64_t addr) {
+  MachO::dysymtab_command Dysymtab = O->getDysymtabLoadCommand();
+  uint32_t nindirectsyms = Dysymtab.nindirectsyms;
+  if (n > nindirectsyms)
+    outs() << " (entries start past the end of the indirect symbol "
+              "table) (reserved1 field greater than the table size)";
+  else if (n + count > nindirectsyms)
+    outs() << " (entries extends past the end of the indirect symbol "
+              "table)";
+  outs() << "\n";
+  uint32_t cputype = O->getHeader().cputype;
+  if (cputype & MachO::CPU_ARCH_ABI64)
+    outs() << "address            index";
+  else
+    outs() << "address    index";
+  if (verbose)
+    outs() << " name\n";
+  else
+    outs() << "\n";
+  for (uint32_t j = 0; j < count && n + j < nindirectsyms; j++) {
+    if (cputype & MachO::CPU_ARCH_ABI64)
+      outs() << format("0x%016" PRIx64, addr + j * stride) << " ";
+    else
+      outs() << format("0x%08" PRIx32, addr + j * stride) << " ";
+    MachO::dysymtab_command Dysymtab = O->getDysymtabLoadCommand();
+    uint32_t indirect_symbol = O->getIndirectSymbolTableEntry(Dysymtab, n + j);
+    if (indirect_symbol == MachO::INDIRECT_SYMBOL_LOCAL) {
+      outs() << "LOCAL\n";
+      continue;
+    }
+    if (indirect_symbol ==
+        (MachO::INDIRECT_SYMBOL_LOCAL | MachO::INDIRECT_SYMBOL_ABS)) {
+      outs() << "LOCAL ABSOLUTE\n";
+      continue;
+    }
+    if (indirect_symbol == MachO::INDIRECT_SYMBOL_ABS) {
+      outs() << "ABSOLUTE\n";
+      continue;
+    }
+    outs() << format("%5u ", indirect_symbol);
+    MachO::symtab_command Symtab = O->getSymtabLoadCommand();
+    if (indirect_symbol < Symtab.nsyms) {
+      symbol_iterator Sym = O->getSymbolByIndex(indirect_symbol);
+      SymbolRef Symbol = *Sym;
+      StringRef SymName;
+      Symbol.getName(SymName);
+      outs() << SymName;
+    } else {
+      outs() << "?";
+    }
+    outs() << "\n";
+  }
+}
 
-void llvm::DisassembleInputMachO(StringRef Filename) {
-  ErrorOr<std::unique_ptr<MemoryBuffer>> BuffOrErr =
-      MemoryBuffer::getFileOrSTDIN(Filename);
-  if (std::error_code EC = BuffOrErr.getError()) {
-    errs() << "llvm-objdump: " << Filename << ": " << EC.message() << "\n";
-    return;
+static void PrintIndirectSymbols(MachOObjectFile *O, bool verbose) {
+  uint32_t LoadCommandCount = O->getHeader().ncmds;
+  MachOObjectFile::LoadCommandInfo Load = O->getFirstLoadCommandInfo();
+  for (unsigned I = 0;; ++I) {
+    if (Load.C.cmd == MachO::LC_SEGMENT_64) {
+      MachO::segment_command_64 Seg = O->getSegment64LoadCommand(Load);
+      for (unsigned J = 0; J < Seg.nsects; ++J) {
+        MachO::section_64 Sec = O->getSection64(Load, J);
+        uint32_t section_type = Sec.flags & MachO::SECTION_TYPE;
+        if (section_type == MachO::S_NON_LAZY_SYMBOL_POINTERS ||
+            section_type == MachO::S_LAZY_SYMBOL_POINTERS ||
+            section_type == MachO::S_LAZY_DYLIB_SYMBOL_POINTERS ||
+            section_type == MachO::S_THREAD_LOCAL_VARIABLE_POINTERS ||
+            section_type == MachO::S_SYMBOL_STUBS) {
+          uint32_t stride;
+          if (section_type == MachO::S_SYMBOL_STUBS)
+            stride = Sec.reserved2;
+          else
+            stride = 8;
+          if (stride == 0) {
+            outs() << "Can't print indirect symbols for (" << Sec.segname << ","
+                   << Sec.sectname << ") "
+                   << "(size of stubs in reserved2 field is zero)\n";
+            continue;
+          }
+          uint32_t count = Sec.size / stride;
+          outs() << "Indirect symbols for (" << Sec.segname << ","
+                 << Sec.sectname << ") " << count << " entries";
+          uint32_t n = Sec.reserved1;
+          PrintIndirectSymbolTable(O, verbose, n, count, stride, Sec.addr);
+        }
+      }
+    } else if (Load.C.cmd == MachO::LC_SEGMENT) {
+      MachO::segment_command Seg = O->getSegmentLoadCommand(Load);
+      for (unsigned J = 0; J < Seg.nsects; ++J) {
+        MachO::section Sec = O->getSection(Load, J);
+        uint32_t section_type = Sec.flags & MachO::SECTION_TYPE;
+        if (section_type == MachO::S_NON_LAZY_SYMBOL_POINTERS ||
+            section_type == MachO::S_LAZY_SYMBOL_POINTERS ||
+            section_type == MachO::S_LAZY_DYLIB_SYMBOL_POINTERS ||
+            section_type == MachO::S_THREAD_LOCAL_VARIABLE_POINTERS ||
+            section_type == MachO::S_SYMBOL_STUBS) {
+          uint32_t stride;
+          if (section_type == MachO::S_SYMBOL_STUBS)
+            stride = Sec.reserved2;
+          else
+            stride = 4;
+          if (stride == 0) {
+            outs() << "Can't print indirect symbols for (" << Sec.segname << ","
+                   << Sec.sectname << ") "
+                   << "(size of stubs in reserved2 field is zero)\n";
+            continue;
+          }
+          uint32_t count = Sec.size / stride;
+          outs() << "Indirect symbols for (" << Sec.segname << ","
+                 << Sec.sectname << ") " << count << " entries";
+          uint32_t n = Sec.reserved1;
+          PrintIndirectSymbolTable(O, verbose, n, count, stride, Sec.addr);
+        }
+      }
+    }
+    if (I == LoadCommandCount - 1)
+      break;
+    else
+      Load = O->getNextLoadCommandInfo(Load);
   }
-  std::unique_ptr<MemoryBuffer> Buff = std::move(BuffOrErr.get());
+}
 
-  std::unique_ptr<MachOObjectFile> MachOOF = std::move(
-      ObjectFile::createMachOObjectFile(Buff.get()->getMemBufferRef()).get());
+static void PrintDataInCodeTable(MachOObjectFile *O, bool verbose) {
+  MachO::linkedit_data_command DIC = O->getDataInCodeLoadCommand();
+  uint32_t nentries = DIC.datasize / sizeof(struct MachO::data_in_code_entry);
+  outs() << "Data in code table (" << nentries << " entries)\n";
+  outs() << "offset     length kind\n";
+  for (dice_iterator DI = O->begin_dices(), DE = O->end_dices(); DI != DE;
+       ++DI) {
+    uint32_t Offset;
+    DI->getOffset(Offset);
+    outs() << format("0x%08" PRIx32, Offset) << " ";
+    uint16_t Length;
+    DI->getLength(Length);
+    outs() << format("%6u", Length) << " ";
+    uint16_t Kind;
+    DI->getKind(Kind);
+    if (verbose) {
+      switch (Kind) {
+      case MachO::DICE_KIND_DATA:
+        outs() << "DATA";
+        break;
+      case MachO::DICE_KIND_JUMP_TABLE8:
+        outs() << "JUMP_TABLE8";
+        break;
+      case MachO::DICE_KIND_JUMP_TABLE16:
+        outs() << "JUMP_TABLE16";
+        break;
+      case MachO::DICE_KIND_JUMP_TABLE32:
+        outs() << "JUMP_TABLE32";
+        break;
+      case MachO::DICE_KIND_ABS_JUMP_TABLE32:
+        outs() << "ABS_JUMP_TABLE32";
+        break;
+      default:
+        outs() << format("0x%04" PRIx32, Kind);
+        break;
+      }
+    } else
+      outs() << format("0x%04" PRIx32, Kind);
+    outs() << "\n";
+  }
+}
+
+static void PrintLinkOptHints(MachOObjectFile *O) {
+  MachO::linkedit_data_command LohLC = O->getLinkOptHintsLoadCommand();
+  const char *loh = O->getData().substr(LohLC.dataoff, 1).data();
+  uint32_t nloh = LohLC.datasize;
+  outs() << "Linker optimiztion hints (" << nloh << " total bytes)\n";
+  for (uint32_t i = 0; i < nloh;) {
+    unsigned n;
+    uint64_t identifier = decodeULEB128((const uint8_t *)(loh + i), &n);
+    i += n;
+    outs() << "    identifier " << identifier << " ";
+    if (i >= nloh)
+      return;
+    switch (identifier) {
+    case 1:
+      outs() << "AdrpAdrp\n";
+      break;
+    case 2:
+      outs() << "AdrpLdr\n";
+      break;
+    case 3:
+      outs() << "AdrpAddLdr\n";
+      break;
+    case 4:
+      outs() << "AdrpLdrGotLdr\n";
+      break;
+    case 5:
+      outs() << "AdrpAddStr\n";
+      break;
+    case 6:
+      outs() << "AdrpLdrGotStr\n";
+      break;
+    case 7:
+      outs() << "AdrpAdd\n";
+      break;
+    case 8:
+      outs() << "AdrpLdrGot\n";
+      break;
+    default:
+      outs() << "Unknown identifier value\n";
+      break;
+    }
+    uint64_t narguments = decodeULEB128((const uint8_t *)(loh + i), &n);
+    i += n;
+    outs() << "    narguments " << narguments << "\n";
+    if (i >= nloh)
+      return;
 
-  DisassembleInputMachO2(Filename, MachOOF.get());
+    for (uint32_t j = 0; j < narguments; j++) {
+      uint64_t value = decodeULEB128((const uint8_t *)(loh + i), &n);
+      i += n;
+      outs() << "\tvalue " << format("0x%" PRIx64, value) << "\n";
+      if (i >= nloh)
+        return;
+    }
+  }
 }
 
 typedef DenseMap<uint64_t, StringRef> SymbolAddressMap;
+
+static void CreateSymbolAddressMap(MachOObjectFile *O,
+                                   SymbolAddressMap *AddrMap) {
+  // Create a map of symbol addresses to symbol names.
+  for (const SymbolRef &Symbol : O->symbols()) {
+    SymbolRef::Type ST;
+    Symbol.getType(ST);
+    if (ST == SymbolRef::ST_Function || ST == SymbolRef::ST_Data ||
+        ST == SymbolRef::ST_Other) {
+      uint64_t Address;
+      Symbol.getAddress(Address);
+      StringRef SymName;
+      Symbol.getName(SymName);
+      (*AddrMap)[Address] = SymName;
+    }
+  }
+}
+
+// GuessSymbolName is passed the address of what might be a symbol and a
+// pointer to the SymbolAddressMap.  It returns the name of a symbol
+// with that address or nullptr if no symbol is found with that address.
+static const char *GuessSymbolName(uint64_t value, SymbolAddressMap *AddrMap) {
+  const char *SymbolName = nullptr;
+  // A DenseMap can't lookup up some values.
+  if (value != 0xffffffffffffffffULL && value != 0xfffffffffffffffeULL) {
+    StringRef name = AddrMap->lookup(value);
+    if (!name.empty())
+      SymbolName = name.data();
+  }
+  return SymbolName;
+}
+
+static void DumpCstringChar(const char c) {
+  char p[2];
+  p[0] = c;
+  p[1] = '\0';
+  outs().write_escaped(p);
+}
+
+static void DumpCstringSection(MachOObjectFile *O, const char *sect,
+                               uint32_t sect_size, uint64_t sect_addr,
+                               bool print_addresses) {
+  for (uint32_t i = 0; i < sect_size; i++) {
+    if (print_addresses) {
+      if (O->is64Bit())
+        outs() << format("%016" PRIx64, sect_addr + i) << "  ";
+      else
+        outs() << format("%08" PRIx64, sect_addr + i) << "  ";
+    }
+    for (; i < sect_size && sect[i] != '\0'; i++)
+      DumpCstringChar(sect[i]);
+    if (i < sect_size && sect[i] == '\0')
+      outs() << "\n";
+  }
+}
+
+static void DumpLiteral4(uint32_t l, float f) {
+  outs() << format("0x%08" PRIx32, l);
+  if ((l & 0x7f800000) != 0x7f800000)
+    outs() << format(" (%.16e)\n", f);
+  else {
+    if (l == 0x7f800000)
+      outs() << " (+Infinity)\n";
+    else if (l == 0xff800000)
+      outs() << " (-Infinity)\n";
+    else if ((l & 0x00400000) == 0x00400000)
+      outs() << " (non-signaling Not-a-Number)\n";
+    else
+      outs() << " (signaling Not-a-Number)\n";
+  }
+}
+
+static void DumpLiteral4Section(MachOObjectFile *O, const char *sect,
+                                uint32_t sect_size, uint64_t sect_addr,
+                                bool print_addresses) {
+  for (uint32_t i = 0; i < sect_size; i += sizeof(float)) {
+    if (print_addresses) {
+      if (O->is64Bit())
+        outs() << format("%016" PRIx64, sect_addr + i) << "  ";
+      else
+        outs() << format("%08" PRIx64, sect_addr + i) << "  ";
+    }
+    float f;
+    memcpy(&f, sect + i, sizeof(float));
+    if (O->isLittleEndian() != sys::IsLittleEndianHost)
+      sys::swapByteOrder(f);
+    uint32_t l;
+    memcpy(&l, sect + i, sizeof(uint32_t));
+    if (O->isLittleEndian() != sys::IsLittleEndianHost)
+      sys::swapByteOrder(l);
+    DumpLiteral4(l, f);
+  }
+}
+
+static void DumpLiteral8(MachOObjectFile *O, uint32_t l0, uint32_t l1,
+                         double d) {
+  outs() << format("0x%08" PRIx32, l0) << " " << format("0x%08" PRIx32, l1);
+  uint32_t Hi, Lo;
+  if (O->isLittleEndian()) {
+    Hi = l1;
+    Lo = l0;
+  } else {
+    Hi = l0;
+    Lo = l1;
+  }
+  // Hi is the high word, so this is equivalent to if(isfinite(d))
+  if ((Hi & 0x7ff00000) != 0x7ff00000)
+    outs() << format(" (%.16e)\n", d);
+  else {
+    if (Hi == 0x7ff00000 && Lo == 0)
+      outs() << " (+Infinity)\n";
+    else if (Hi == 0xfff00000 && Lo == 0)
+      outs() << " (-Infinity)\n";
+    else if ((Hi & 0x00080000) == 0x00080000)
+      outs() << " (non-signaling Not-a-Number)\n";
+    else
+      outs() << " (signaling Not-a-Number)\n";
+  }
+}
+
+static void DumpLiteral8Section(MachOObjectFile *O, const char *sect,
+                                uint32_t sect_size, uint64_t sect_addr,
+                                bool print_addresses) {
+  for (uint32_t i = 0; i < sect_size; i += sizeof(double)) {
+    if (print_addresses) {
+      if (O->is64Bit())
+        outs() << format("%016" PRIx64, sect_addr + i) << "  ";
+      else
+        outs() << format("%08" PRIx64, sect_addr + i) << "  ";
+    }
+    double d;
+    memcpy(&d, sect + i, sizeof(double));
+    if (O->isLittleEndian() != sys::IsLittleEndianHost)
+      sys::swapByteOrder(d);
+    uint32_t l0, l1;
+    memcpy(&l0, sect + i, sizeof(uint32_t));
+    memcpy(&l1, sect + i + sizeof(uint32_t), sizeof(uint32_t));
+    if (O->isLittleEndian() != sys::IsLittleEndianHost) {
+      sys::swapByteOrder(l0);
+      sys::swapByteOrder(l1);
+    }
+    DumpLiteral8(O, l0, l1, d);
+  }
+}
+
+static void DumpLiteral16(uint32_t l0, uint32_t l1, uint32_t l2, uint32_t l3) {
+  outs() << format("0x%08" PRIx32, l0) << " ";
+  outs() << format("0x%08" PRIx32, l1) << " ";
+  outs() << format("0x%08" PRIx32, l2) << " ";
+  outs() << format("0x%08" PRIx32, l3) << "\n";
+}
+
+static void DumpLiteral16Section(MachOObjectFile *O, const char *sect,
+                                 uint32_t sect_size, uint64_t sect_addr,
+                                 bool print_addresses) {
+  for (uint32_t i = 0; i < sect_size; i += 16) {
+    if (print_addresses) {
+      if (O->is64Bit())
+        outs() << format("%016" PRIx64, sect_addr + i) << "  ";
+      else
+        outs() << format("%08" PRIx64, sect_addr + i) << "  ";
+    }
+    uint32_t l0, l1, l2, l3;
+    memcpy(&l0, sect + i, sizeof(uint32_t));
+    memcpy(&l1, sect + i + sizeof(uint32_t), sizeof(uint32_t));
+    memcpy(&l2, sect + i + 2 * sizeof(uint32_t), sizeof(uint32_t));
+    memcpy(&l3, sect + i + 3 * sizeof(uint32_t), sizeof(uint32_t));
+    if (O->isLittleEndian() != sys::IsLittleEndianHost) {
+      sys::swapByteOrder(l0);
+      sys::swapByteOrder(l1);
+      sys::swapByteOrder(l2);
+      sys::swapByteOrder(l3);
+    }
+    DumpLiteral16(l0, l1, l2, l3);
+  }
+}
+
+static void DumpLiteralPointerSection(MachOObjectFile *O,
+                                      const SectionRef &Section,
+                                      const char *sect, uint32_t sect_size,
+                                      uint64_t sect_addr,
+                                      bool print_addresses) {
+  // Collect the literal sections in this Mach-O file.
+  std::vector<SectionRef> LiteralSections;
+  for (const SectionRef &Section : O->sections()) {
+    DataRefImpl Ref = Section.getRawDataRefImpl();
+    uint32_t section_type;
+    if (O->is64Bit()) {
+      const MachO::section_64 Sec = O->getSection64(Ref);
+      section_type = Sec.flags & MachO::SECTION_TYPE;
+    } else {
+      const MachO::section Sec = O->getSection(Ref);
+      section_type = Sec.flags & MachO::SECTION_TYPE;
+    }
+    if (section_type == MachO::S_CSTRING_LITERALS ||
+        section_type == MachO::S_4BYTE_LITERALS ||
+        section_type == MachO::S_8BYTE_LITERALS ||
+        section_type == MachO::S_16BYTE_LITERALS)
+      LiteralSections.push_back(Section);
+  }
+
+  // Set the size of the literal pointer.
+  uint32_t lp_size = O->is64Bit() ? 8 : 4;
+
+  // Collect the external relocation symbols for the the literal pointers.
+  std::vector<std::pair<uint64_t, SymbolRef>> Relocs;
+  for (const RelocationRef &Reloc : Section.relocations()) {
+    DataRefImpl Rel;
+    MachO::any_relocation_info RE;
+    bool isExtern = false;
+    Rel = Reloc.getRawDataRefImpl();
+    RE = O->getRelocation(Rel);
+    isExtern = O->getPlainRelocationExternal(RE);
+    if (isExtern) {
+      uint64_t RelocOffset;
+      Reloc.getOffset(RelocOffset);
+      symbol_iterator RelocSym = Reloc.getSymbol();
+      Relocs.push_back(std::make_pair(RelocOffset, *RelocSym));
+    }
+  }
+  array_pod_sort(Relocs.begin(), Relocs.end());
+
+  // Dump each literal pointer.
+  for (uint32_t i = 0; i < sect_size; i += lp_size) {
+    if (print_addresses) {
+      if (O->is64Bit())
+        outs() << format("%016" PRIx64, sect_addr + i) << "  ";
+      else
+        outs() << format("%08" PRIx64, sect_addr + i) << "  ";
+    }
+    uint64_t lp;
+    if (O->is64Bit()) {
+      memcpy(&lp, sect + i, sizeof(uint64_t));
+      if (O->isLittleEndian() != sys::IsLittleEndianHost)
+        sys::swapByteOrder(lp);
+    } else {
+      uint32_t li;
+      memcpy(&li, sect + i, sizeof(uint32_t));
+      if (O->isLittleEndian() != sys::IsLittleEndianHost)
+        sys::swapByteOrder(li);
+      lp = li;
+    }
+
+    // First look for an external relocation entry for this literal pointer.
+    bool reloc_found = false;
+    for (unsigned j = 0, e = Relocs.size(); j != e; ++j) {
+      if (Relocs[i].first == i) {
+        symbol_iterator RelocSym = Relocs[j].second;
+        StringRef SymName;
+        RelocSym->getName(SymName);
+        outs() << "external relocation entry for symbol:" << SymName << "\n";
+        reloc_found = true;
+      }
+    }
+    if (reloc_found == true)
+      continue;
+
+    // For local references see what the section the literal pointer points to.
+    bool found = false;
+    for (unsigned SectIdx = 0; SectIdx != LiteralSections.size(); SectIdx++) {
+      uint64_t SectAddress = LiteralSections[SectIdx].getAddress();
+      uint64_t SectSize = LiteralSections[SectIdx].getSize();
+      if (lp >= SectAddress && lp < SectAddress + SectSize) {
+        found = true;
+
+        StringRef SectName;
+        LiteralSections[SectIdx].getName(SectName);
+        DataRefImpl Ref = LiteralSections[SectIdx].getRawDataRefImpl();
+        StringRef SegmentName = O->getSectionFinalSegmentName(Ref);
+        outs() << SegmentName << ":" << SectName << ":";
+
+        uint32_t section_type;
+        if (O->is64Bit()) {
+          const MachO::section_64 Sec = O->getSection64(Ref);
+          section_type = Sec.flags & MachO::SECTION_TYPE;
+        } else {
+          const MachO::section Sec = O->getSection(Ref);
+          section_type = Sec.flags & MachO::SECTION_TYPE;
+        }
+
+        StringRef BytesStr;
+        LiteralSections[SectIdx].getContents(BytesStr);
+        const char *Contents = reinterpret_cast<const char *>(BytesStr.data());
+
+        switch (section_type) {
+        case MachO::S_CSTRING_LITERALS:
+          for (uint64_t i = lp - SectAddress;
+               i < SectSize && Contents[i] != '\0'; i++) {
+            DumpCstringChar(Contents[i]);
+          }
+          outs() << "\n";
+          break;
+        case MachO::S_4BYTE_LITERALS:
+          float f;
+          memcpy(&f, Contents + (lp - SectAddress), sizeof(float));
+          uint32_t l;
+          memcpy(&l, Contents + (lp - SectAddress), sizeof(uint32_t));
+          if (O->isLittleEndian() != sys::IsLittleEndianHost) {
+            sys::swapByteOrder(f);
+            sys::swapByteOrder(l);
+          }
+          DumpLiteral4(l, f);
+          break;
+        case MachO::S_8BYTE_LITERALS: {
+          double d;
+          memcpy(&d, Contents + (lp - SectAddress), sizeof(double));
+          uint32_t l0, l1;
+          memcpy(&l0, Contents + (lp - SectAddress), sizeof(uint32_t));
+          memcpy(&l1, Contents + (lp - SectAddress) + sizeof(uint32_t),
+                 sizeof(uint32_t));
+          if (O->isLittleEndian() != sys::IsLittleEndianHost) {
+            sys::swapByteOrder(f);
+            sys::swapByteOrder(l0);
+            sys::swapByteOrder(l1);
+          }
+          DumpLiteral8(O, l0, l1, d);
+          break;
+        }
+        case MachO::S_16BYTE_LITERALS: {
+          uint32_t l0, l1, l2, l3;
+          memcpy(&l0, Contents + (lp - SectAddress), sizeof(uint32_t));
+          memcpy(&l1, Contents + (lp - SectAddress) + sizeof(uint32_t),
+                 sizeof(uint32_t));
+          memcpy(&l2, Contents + (lp - SectAddress) + 2 * sizeof(uint32_t),
+                 sizeof(uint32_t));
+          memcpy(&l3, Contents + (lp - SectAddress) + 3 * sizeof(uint32_t),
+                 sizeof(uint32_t));
+          if (O->isLittleEndian() != sys::IsLittleEndianHost) {
+            sys::swapByteOrder(l0);
+            sys::swapByteOrder(l1);
+            sys::swapByteOrder(l2);
+            sys::swapByteOrder(l3);
+          }
+          DumpLiteral16(l0, l1, l2, l3);
+          break;
+        }
+        }
+      }
+    }
+    if (found == false)
+      outs() << format("0x%" PRIx64, lp) << " (not in a literal section)\n";
+  }
+}
+
+static void DumpInitTermPointerSection(MachOObjectFile *O, const char *sect,
+                                       uint32_t sect_size, uint64_t sect_addr,
+                                       SymbolAddressMap *AddrMap,
+                                       bool verbose) {
+  uint32_t stride;
+  if (O->is64Bit())
+    stride = sizeof(uint64_t);
+  else
+    stride = sizeof(uint32_t);
+  for (uint32_t i = 0; i < sect_size; i += stride) {
+    const char *SymbolName = nullptr;
+    if (O->is64Bit()) {
+      outs() << format("0x%016" PRIx64, sect_addr + i * stride) << " ";
+      uint64_t pointer_value;
+      memcpy(&pointer_value, sect + i, stride);
+      if (O->isLittleEndian() != sys::IsLittleEndianHost)
+        sys::swapByteOrder(pointer_value);
+      outs() << format("0x%016" PRIx64, pointer_value);
+      if (verbose)
+        SymbolName = GuessSymbolName(pointer_value, AddrMap);
+    } else {
+      outs() << format("0x%08" PRIx64, sect_addr + i * stride) << " ";
+      uint32_t pointer_value;
+      memcpy(&pointer_value, sect + i, stride);
+      if (O->isLittleEndian() != sys::IsLittleEndianHost)
+        sys::swapByteOrder(pointer_value);
+      outs() << format("0x%08" PRIx32, pointer_value);
+      if (verbose)
+        SymbolName = GuessSymbolName(pointer_value, AddrMap);
+    }
+    if (SymbolName)
+      outs() << " " << SymbolName;
+    outs() << "\n";
+  }
+}
+
+static void DumpRawSectionContents(MachOObjectFile *O, const char *sect,
+                                   uint32_t size, uint64_t addr) {
+  uint32_t cputype = O->getHeader().cputype;
+  if (cputype == MachO::CPU_TYPE_I386 || cputype == MachO::CPU_TYPE_X86_64) {
+    uint32_t j;
+    for (uint32_t i = 0; i < size; i += j, addr += j) {
+      if (O->is64Bit())
+        outs() << format("%016" PRIx64, addr) << "\t";
+      else
+        outs() << format("%08" PRIx64, sect) << "\t";
+      for (j = 0; j < 16 && i + j < size; j++) {
+        uint8_t byte_word = *(sect + i + j);
+        outs() << format("%02" PRIx32, (uint32_t)byte_word) << " ";
+      }
+      outs() << "\n";
+    }
+  } else {
+    uint32_t j;
+    for (uint32_t i = 0; i < size; i += j, addr += j) {
+      if (O->is64Bit())
+        outs() << format("%016" PRIx64, addr) << "\t";
+      else
+        outs() << format("%08" PRIx64, sect) << "\t";
+      for (j = 0; j < 4 * sizeof(int32_t) && i + j < size;
+           j += sizeof(int32_t)) {
+        if (i + j + sizeof(int32_t) < size) {
+          uint32_t long_word;
+          memcpy(&long_word, sect + i + j, sizeof(int32_t));
+          if (O->isLittleEndian() != sys::IsLittleEndianHost)
+            sys::swapByteOrder(long_word);
+          outs() << format("%08" PRIx32, long_word) << " ";
+        } else {
+          for (uint32_t k = 0; i + j + k < size; k++) {
+            uint8_t byte_word = *(sect + i + j);
+            outs() << format("%02" PRIx32, (uint32_t)byte_word) << " ";
+          }
+        }
+      }
+      outs() << "\n";
+    }
+  }
+}
+
+static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
+                             StringRef DisSegName, StringRef DisSectName);
+
+static void DumpSectionContents(StringRef Filename, MachOObjectFile *O,
+                                bool verbose) {
+  SymbolAddressMap AddrMap;
+  if (verbose)
+    CreateSymbolAddressMap(O, &AddrMap);
+
+  for (unsigned i = 0; i < DumpSections.size(); ++i) {
+    StringRef DumpSection = DumpSections[i];
+    std::pair<StringRef, StringRef> DumpSegSectName;
+    DumpSegSectName = DumpSection.split(',');
+    StringRef DumpSegName, DumpSectName;
+    if (DumpSegSectName.second.size()) {
+      DumpSegName = DumpSegSectName.first;
+      DumpSectName = DumpSegSectName.second;
+    } else {
+      DumpSegName = "";
+      DumpSectName = DumpSegSectName.first;
+    }
+    for (const SectionRef &Section : O->sections()) {
+      StringRef SectName;
+      Section.getName(SectName);
+      DataRefImpl Ref = Section.getRawDataRefImpl();
+      StringRef SegName = O->getSectionFinalSegmentName(Ref);
+      if ((DumpSegName.empty() || SegName == DumpSegName) &&
+          (SectName == DumpSectName)) {
+        outs() << "Contents of (" << SegName << "," << SectName
+               << ") section\n";
+        uint32_t section_flags;
+        if (O->is64Bit()) {
+          const MachO::section_64 Sec = O->getSection64(Ref);
+          section_flags = Sec.flags;
+
+        } else {
+          const MachO::section Sec = O->getSection(Ref);
+          section_flags = Sec.flags;
+        }
+        uint32_t section_type = section_flags & MachO::SECTION_TYPE;
+
+        StringRef BytesStr;
+        Section.getContents(BytesStr);
+        const char *sect = reinterpret_cast<const char *>(BytesStr.data());
+        uint32_t sect_size = BytesStr.size();
+        uint64_t sect_addr = Section.getAddress();
+
+        if (verbose) {
+          if ((section_flags & MachO::S_ATTR_PURE_INSTRUCTIONS) ||
+              (section_flags & MachO::S_ATTR_SOME_INSTRUCTIONS)) {
+            DisassembleMachO(Filename, O, SegName, SectName);
+            continue;
+          }
+          switch (section_type) {
+          case MachO::S_REGULAR:
+            DumpRawSectionContents(O, sect, sect_size, sect_addr);
+            break;
+          case MachO::S_ZEROFILL:
+            outs() << "zerofill section and has no contents in the file\n";
+            break;
+          case MachO::S_CSTRING_LITERALS:
+            DumpCstringSection(O, sect, sect_size, sect_addr, verbose);
+            break;
+          case MachO::S_4BYTE_LITERALS:
+            DumpLiteral4Section(O, sect, sect_size, sect_addr, verbose);
+            break;
+          case MachO::S_8BYTE_LITERALS:
+            DumpLiteral8Section(O, sect, sect_size, sect_addr, verbose);
+            break;
+          case MachO::S_16BYTE_LITERALS:
+            DumpLiteral16Section(O, sect, sect_size, sect_addr, verbose);
+            break;
+          case MachO::S_LITERAL_POINTERS:
+            DumpLiteralPointerSection(O, Section, sect, sect_size, sect_addr,
+                                      verbose);
+            break;
+          case MachO::S_MOD_INIT_FUNC_POINTERS:
+          case MachO::S_MOD_TERM_FUNC_POINTERS:
+            DumpInitTermPointerSection(O, sect, sect_size, sect_addr, &AddrMap,
+                                       verbose);
+            break;
+          default:
+            outs() << "Unknown section type ("
+                   << format("0x%08" PRIx32, section_type) << ")\n";
+            DumpRawSectionContents(O, sect, sect_size, sect_addr);
+            break;
+          }
+        } else {
+          if (section_type == MachO::S_ZEROFILL)
+            outs() << "zerofill section and has no contents in the file\n";
+          else
+            DumpRawSectionContents(O, sect, sect_size, sect_addr);
+        }
+      }
+    }
+  }
+}
+
+// checkMachOAndArchFlags() checks to see if the ObjectFile is a Mach-O file
+// and if it is and there is a list of architecture flags is specified then
+// check to make sure this Mach-O file is one of those architectures or all
+// architectures were specified.  If not then an error is generated and this
+// routine returns false.  Else it returns true.
+static bool checkMachOAndArchFlags(ObjectFile *O, StringRef Filename) {
+  if (isa<MachOObjectFile>(O) && !ArchAll && ArchFlags.size() != 0) {
+    MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(O);
+    bool ArchFound = false;
+    MachO::mach_header H;
+    MachO::mach_header_64 H_64;
+    Triple T;
+    if (MachO->is64Bit()) {
+      H_64 = MachO->MachOObjectFile::getHeader64();
+      T = MachOObjectFile::getArch(H_64.cputype, H_64.cpusubtype);
+    } else {
+      H = MachO->MachOObjectFile::getHeader();
+      T = MachOObjectFile::getArch(H.cputype, H.cpusubtype);
+    }
+    unsigned i;
+    for (i = 0; i < ArchFlags.size(); ++i) {
+      if (ArchFlags[i] == T.getArchName())
+        ArchFound = true;
+      break;
+    }
+    if (!ArchFound) {
+      errs() << "llvm-objdump: file: " + Filename + " does not contain "
+             << "architecture: " + ArchFlags[i] + "\n";
+      return false;
+    }
+  }
+  return true;
+}
+
+// ProcessMachO() is passed a single opened Mach-O file, which may be an
+// archive member and or in a slice of a universal file.  It prints the
+// the file name and header info and then processes it according to the
+// command line options.
+static void ProcessMachO(StringRef Filename, MachOObjectFile *MachOOF,
+                         StringRef ArchiveMemberName = StringRef(),
+                         StringRef ArchitectureName = StringRef()) {
+  // If we are doing some processing here on the Mach-O file print the header
+  // info.  And don't print it otherwise like in the case of printing the
+  // UniversalHeaders or ArchiveHeaders.
+  if (Disassemble || PrivateHeaders || ExportsTrie || Rebase || Bind ||
+      LazyBind || WeakBind || IndirectSymbols || DataInCode || LinkOptHints ||
+      DumpSections.size() != 0) {
+    outs() << Filename;
+    if (!ArchiveMemberName.empty())
+      outs() << '(' << ArchiveMemberName << ')';
+    if (!ArchitectureName.empty())
+      outs() << " (architecture " << ArchitectureName << ")";
+    outs() << ":\n";
+  }
+
+  if (Disassemble)
+    DisassembleMachO(Filename, MachOOF, "__TEXT", "__text");
+  if (IndirectSymbols)
+    PrintIndirectSymbols(MachOOF, true);
+  if (DataInCode)
+    PrintDataInCodeTable(MachOOF, true);
+  if (LinkOptHints)
+    PrintLinkOptHints(MachOOF);
+  if (Relocations)
+    PrintRelocations(MachOOF);
+  if (SectionHeaders)
+    PrintSectionHeaders(MachOOF);
+  if (SectionContents)
+    PrintSectionContents(MachOOF);
+  if (DumpSections.size() != 0)
+    DumpSectionContents(Filename, MachOOF, true);
+  if (SymbolTable)
+    PrintSymbolTable(MachOOF);
+  if (UnwindInfo)
+    printMachOUnwindInfo(MachOOF);
+  if (PrivateHeaders)
+    printMachOFileHeader(MachOOF);
+  if (ExportsTrie)
+    printExportsTrie(MachOOF);
+  if (Rebase)
+    printRebaseTable(MachOOF);
+  if (Bind)
+    printBindTable(MachOOF);
+  if (LazyBind)
+    printLazyBindTable(MachOOF);
+  if (WeakBind)
+    printWeakBindTable(MachOOF);
+}
+
+// printUnknownCPUType() helps print_fat_headers for unknown CPU's.
+static void printUnknownCPUType(uint32_t cputype, uint32_t cpusubtype) {
+  outs() << "    cputype (" << cputype << ")\n";
+  outs() << "    cpusubtype (" << cpusubtype << ")\n";
+}
+
+// printCPUType() helps print_fat_headers by printing the cputype and
+// pusubtype (symbolically for the one's it knows about).
+static void printCPUType(uint32_t cputype, uint32_t cpusubtype) {
+  switch (cputype) {
+  case MachO::CPU_TYPE_I386:
+    switch (cpusubtype) {
+    case MachO::CPU_SUBTYPE_I386_ALL:
+      outs() << "    cputype CPU_TYPE_I386\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_I386_ALL\n";
+      break;
+    default:
+      printUnknownCPUType(cputype, cpusubtype);
+      break;
+    }
+    break;
+  case MachO::CPU_TYPE_X86_64:
+    switch (cpusubtype) {
+    case MachO::CPU_SUBTYPE_X86_64_ALL:
+      outs() << "    cputype CPU_TYPE_X86_64\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_X86_64_ALL\n";
+      break;
+    case MachO::CPU_SUBTYPE_X86_64_H:
+      outs() << "    cputype CPU_TYPE_X86_64\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_X86_64_H\n";
+      break;
+    default:
+      printUnknownCPUType(cputype, cpusubtype);
+      break;
+    }
+    break;
+  case MachO::CPU_TYPE_ARM:
+    switch (cpusubtype) {
+    case MachO::CPU_SUBTYPE_ARM_ALL:
+      outs() << "    cputype CPU_TYPE_ARM\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM_ALL\n";
+      break;
+    case MachO::CPU_SUBTYPE_ARM_V4T:
+      outs() << "    cputype CPU_TYPE_ARM\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM_V4T\n";
+      break;
+    case MachO::CPU_SUBTYPE_ARM_V5TEJ:
+      outs() << "    cputype CPU_TYPE_ARM\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM_V5TEJ\n";
+      break;
+    case MachO::CPU_SUBTYPE_ARM_XSCALE:
+      outs() << "    cputype CPU_TYPE_ARM\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM_XSCALE\n";
+      break;
+    case MachO::CPU_SUBTYPE_ARM_V6:
+      outs() << "    cputype CPU_TYPE_ARM\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM_V6\n";
+      break;
+    case MachO::CPU_SUBTYPE_ARM_V6M:
+      outs() << "    cputype CPU_TYPE_ARM\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM_V6M\n";
+      break;
+    case MachO::CPU_SUBTYPE_ARM_V7:
+      outs() << "    cputype CPU_TYPE_ARM\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM_V7\n";
+      break;
+    case MachO::CPU_SUBTYPE_ARM_V7EM:
+      outs() << "    cputype CPU_TYPE_ARM\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM_V7EM\n";
+      break;
+    case MachO::CPU_SUBTYPE_ARM_V7K:
+      outs() << "    cputype CPU_TYPE_ARM\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM_V7K\n";
+      break;
+    case MachO::CPU_SUBTYPE_ARM_V7M:
+      outs() << "    cputype CPU_TYPE_ARM\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM_V7M\n";
+      break;
+    case MachO::CPU_SUBTYPE_ARM_V7S:
+      outs() << "    cputype CPU_TYPE_ARM\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM_V7S\n";
+      break;
+    default:
+      printUnknownCPUType(cputype, cpusubtype);
+      break;
+    }
+    break;
+  case MachO::CPU_TYPE_ARM64:
+    switch (cpusubtype & ~MachO::CPU_SUBTYPE_MASK) {
+    case MachO::CPU_SUBTYPE_ARM64_ALL:
+      outs() << "    cputype CPU_TYPE_ARM64\n";
+      outs() << "    cpusubtype CPU_SUBTYPE_ARM64_ALL\n";
+      break;
+    default:
+      printUnknownCPUType(cputype, cpusubtype);
+      break;
+    }
+    break;
+  default:
+    printUnknownCPUType(cputype, cpusubtype);
+    break;
+  }
+}
+
+static void printMachOUniversalHeaders(const object::MachOUniversalBinary *UB,
+                                       bool verbose) {
+  outs() << "Fat headers\n";
+  if (verbose)
+    outs() << "fat_magic FAT_MAGIC\n";
+  else
+    outs() << "fat_magic " << format("0x%" PRIx32, MachO::FAT_MAGIC) << "\n";
+
+  uint32_t nfat_arch = UB->getNumberOfObjects();
+  StringRef Buf = UB->getData();
+  uint64_t size = Buf.size();
+  uint64_t big_size = sizeof(struct MachO::fat_header) +
+                      nfat_arch * sizeof(struct MachO::fat_arch);
+  outs() << "nfat_arch " << UB->getNumberOfObjects();
+  if (nfat_arch == 0)
+    outs() << " (malformed, contains zero architecture types)\n";
+  else if (big_size > size)
+    outs() << " (malformed, architectures past end of file)\n";
+  else
+    outs() << "\n";
+
+  for (uint32_t i = 0; i < nfat_arch; ++i) {
+    MachOUniversalBinary::ObjectForArch OFA(UB, i);
+    uint32_t cputype = OFA.getCPUType();
+    uint32_t cpusubtype = OFA.getCPUSubType();
+    outs() << "architecture ";
+    for (uint32_t j = 0; i != 0 && j <= i - 1; j++) {
+      MachOUniversalBinary::ObjectForArch other_OFA(UB, j);
+      uint32_t other_cputype = other_OFA.getCPUType();
+      uint32_t other_cpusubtype = other_OFA.getCPUSubType();
+      if (cputype != 0 && cpusubtype != 0 && cputype == other_cputype &&
+          (cpusubtype & ~MachO::CPU_SUBTYPE_MASK) ==
+              (other_cpusubtype & ~MachO::CPU_SUBTYPE_MASK)) {
+        outs() << "(illegal duplicate architecture) ";
+        break;
+      }
+    }
+    if (verbose) {
+      outs() << OFA.getArchTypeName() << "\n";
+      printCPUType(cputype, cpusubtype & ~MachO::CPU_SUBTYPE_MASK);
+    } else {
+      outs() << i << "\n";
+      outs() << "    cputype " << cputype << "\n";
+      outs() << "    cpusubtype " << (cpusubtype & ~MachO::CPU_SUBTYPE_MASK)
+             << "\n";
+    }
+    if (verbose &&
+        (cpusubtype & MachO::CPU_SUBTYPE_MASK) == MachO::CPU_SUBTYPE_LIB64)
+      outs() << "    capabilities CPU_SUBTYPE_LIB64\n";
+    else
+      outs() << "    capabilities "
+             << format("0x%" PRIx32,
+                       (cpusubtype & MachO::CPU_SUBTYPE_MASK) >> 24) << "\n";
+    outs() << "    offset " << OFA.getOffset();
+    if (OFA.getOffset() > size)
+      outs() << " (past end of file)";
+    if (OFA.getOffset() % (1 << OFA.getAlign()) != 0)
+      outs() << " (not aligned on it's alignment (2^" << OFA.getAlign() << ")";
+    outs() << "\n";
+    outs() << "    size " << OFA.getSize();
+    big_size = OFA.getOffset() + OFA.getSize();
+    if (big_size > size)
+      outs() << " (past end of file)";
+    outs() << "\n";
+    outs() << "    align 2^" << OFA.getAlign() << " (" << (1 << OFA.getAlign())
+           << ")\n";
+  }
+}
+
+static void printArchiveChild(Archive::Child &C, bool verbose,
+                              bool print_offset) {
+  if (print_offset)
+    outs() << C.getChildOffset() << "\t";
+  sys::fs::perms Mode = C.getAccessMode();
+  if (verbose) {
+    // FIXME: this first dash, "-", is for (Mode & S_IFMT) == S_IFREG.
+    // But there is nothing in sys::fs::perms for S_IFMT or S_IFREG.
+    outs() << "-";
+    if (Mode & sys::fs::owner_read)
+      outs() << "r";
+    else
+      outs() << "-";
+    if (Mode & sys::fs::owner_write)
+      outs() << "w";
+    else
+      outs() << "-";
+    if (Mode & sys::fs::owner_exe)
+      outs() << "x";
+    else
+      outs() << "-";
+    if (Mode & sys::fs::group_read)
+      outs() << "r";
+    else
+      outs() << "-";
+    if (Mode & sys::fs::group_write)
+      outs() << "w";
+    else
+      outs() << "-";
+    if (Mode & sys::fs::group_exe)
+      outs() << "x";
+    else
+      outs() << "-";
+    if (Mode & sys::fs::others_read)
+      outs() << "r";
+    else
+      outs() << "-";
+    if (Mode & sys::fs::others_write)
+      outs() << "w";
+    else
+      outs() << "-";
+    if (Mode & sys::fs::others_exe)
+      outs() << "x";
+    else
+      outs() << "-";
+  } else {
+    outs() << format("0%o ", Mode);
+  }
+
+  unsigned UID = C.getUID();
+  outs() << format("%3d/", UID);
+  unsigned GID = C.getGID();
+  outs() << format("%-3d ", GID);
+  uint64_t Size = C.getRawSize();
+  outs() << format("%5" PRId64, Size) << " ";
+
+  StringRef RawLastModified = C.getRawLastModified();
+  if (verbose) {
+    unsigned Seconds;
+    if (RawLastModified.getAsInteger(10, Seconds))
+      outs() << "(date: \"%s\" contains non-decimal chars) " << RawLastModified;
+    else {
+      // Since cime(3) returns a 26 character string of the form:
+      // "Sun Sep 16 01:03:52 1973\n\0"
+      // just print 24 characters.
+      time_t t = Seconds;
+      outs() << format("%.24s ", ctime(&t));
+    }
+  } else {
+    outs() << RawLastModified << " ";
+  }
+
+  if (verbose) {
+    ErrorOr<StringRef> NameOrErr = C.getName();
+    if (NameOrErr.getError()) {
+      StringRef RawName = C.getRawName();
+      outs() << RawName << "\n";
+    } else {
+      StringRef Name = NameOrErr.get();
+      outs() << Name << "\n";
+    }
+  } else {
+    StringRef RawName = C.getRawName();
+    outs() << RawName << "\n";
+  }
+}
+
+static void printArchiveHeaders(Archive *A, bool verbose, bool print_offset) {
+  if (A->hasSymbolTable()) {
+    Archive::child_iterator S = A->getSymbolTableChild();
+    Archive::Child C = *S;
+    printArchiveChild(C, verbose, print_offset);
+  }
+  for (Archive::child_iterator I = A->child_begin(), E = A->child_end(); I != E;
+       ++I) {
+    Archive::Child C = *I;
+    printArchiveChild(C, verbose, print_offset);
+  }
+}
+
+// ParseInputMachO() parses the named Mach-O file in Filename and handles the
+// -arch flags selecting just those slices as specified by them and also parses
+// archive files.  Then for each individual Mach-O file ProcessMachO() is
+// called to process the file based on the command line options.
+void llvm::ParseInputMachO(StringRef Filename) {
+  // Check for -arch all and verifiy the -arch flags are valid.
+  for (unsigned i = 0; i < ArchFlags.size(); ++i) {
+    if (ArchFlags[i] == "all") {
+      ArchAll = true;
+    } else {
+      if (!MachOObjectFile::isValidArch(ArchFlags[i])) {
+        errs() << "llvm-objdump: Unknown architecture named '" + ArchFlags[i] +
+                      "'for the -arch option\n";
+        return;
+      }
+    }
+  }
+
+  // Attempt to open the binary.
+  ErrorOr<OwningBinary<Binary>> BinaryOrErr = createBinary(Filename);
+  if (std::error_code EC = BinaryOrErr.getError()) {
+    errs() << "llvm-objdump: '" << Filename << "': " << EC.message() << ".\n";
+    return;
+  }
+  Binary &Bin = *BinaryOrErr.get().getBinary();
+
+  if (Archive *A = dyn_cast<Archive>(&Bin)) {
+    outs() << "Archive : " << Filename << "\n";
+    if (ArchiveHeaders)
+      printArchiveHeaders(A, true, false);
+    for (Archive::child_iterator I = A->child_begin(), E = A->child_end();
+         I != E; ++I) {
+      ErrorOr<std::unique_ptr<Binary>> ChildOrErr = I->getAsBinary();
+      if (ChildOrErr.getError())
+        continue;
+      if (MachOObjectFile *O = dyn_cast<MachOObjectFile>(&*ChildOrErr.get())) {
+        if (!checkMachOAndArchFlags(O, Filename))
+          return;
+        ProcessMachO(Filename, O, O->getFileName());
+      }
+    }
+    return;
+  }
+  if (UniversalHeaders) {
+    if (MachOUniversalBinary *UB = dyn_cast<MachOUniversalBinary>(&Bin))
+      printMachOUniversalHeaders(UB, true);
+  }
+  if (MachOUniversalBinary *UB = dyn_cast<MachOUniversalBinary>(&Bin)) {
+    // If we have a list of architecture flags specified dump only those.
+    if (!ArchAll && ArchFlags.size() != 0) {
+      // Look for a slice in the universal binary that matches each ArchFlag.
+      bool ArchFound;
+      for (unsigned i = 0; i < ArchFlags.size(); ++i) {
+        ArchFound = false;
+        for (MachOUniversalBinary::object_iterator I = UB->begin_objects(),
+                                                   E = UB->end_objects();
+             I != E; ++I) {
+          if (ArchFlags[i] == I->getArchTypeName()) {
+            ArchFound = true;
+            ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr =
+                I->getAsObjectFile();
+            std::string ArchitectureName = "";
+            if (ArchFlags.size() > 1)
+              ArchitectureName = I->getArchTypeName();
+            if (ObjOrErr) {
+              ObjectFile &O = *ObjOrErr.get();
+              if (MachOObjectFile *MachOOF = dyn_cast<MachOObjectFile>(&O))
+                ProcessMachO(Filename, MachOOF, "", ArchitectureName);
+            } else if (ErrorOr<std::unique_ptr<Archive>> AOrErr =
+                           I->getAsArchive()) {
+              std::unique_ptr<Archive> &A = *AOrErr;
+              outs() << "Archive : " << Filename;
+              if (!ArchitectureName.empty())
+                outs() << " (architecture " << ArchitectureName << ")";
+              outs() << "\n";
+              if (ArchiveHeaders)
+                printArchiveHeaders(A.get(), true, false);
+              for (Archive::child_iterator AI = A->child_begin(),
+                                           AE = A->child_end();
+                   AI != AE; ++AI) {
+                ErrorOr<std::unique_ptr<Binary>> ChildOrErr = AI->getAsBinary();
+                if (ChildOrErr.getError())
+                  continue;
+                if (MachOObjectFile *O =
+                        dyn_cast<MachOObjectFile>(&*ChildOrErr.get()))
+                  ProcessMachO(Filename, O, O->getFileName(), ArchitectureName);
+              }
+            }
+          }
+        }
+        if (!ArchFound) {
+          errs() << "llvm-objdump: file: " + Filename + " does not contain "
+                 << "architecture: " + ArchFlags[i] + "\n";
+          return;
+        }
+      }
+      return;
+    }
+    // No architecture flags were specified so if this contains a slice that
+    // matches the host architecture dump only that.
+    if (!ArchAll) {
+      for (MachOUniversalBinary::object_iterator I = UB->begin_objects(),
+                                                 E = UB->end_objects();
+           I != E; ++I) {
+        if (MachOObjectFile::getHostArch().getArchName() ==
+            I->getArchTypeName()) {
+          ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr = I->getAsObjectFile();
+          std::string ArchiveName;
+          ArchiveName.clear();
+          if (ObjOrErr) {
+            ObjectFile &O = *ObjOrErr.get();
+            if (MachOObjectFile *MachOOF = dyn_cast<MachOObjectFile>(&O))
+              ProcessMachO(Filename, MachOOF);
+          } else if (ErrorOr<std::unique_ptr<Archive>> AOrErr =
+                         I->getAsArchive()) {
+            std::unique_ptr<Archive> &A = *AOrErr;
+            outs() << "Archive : " << Filename << "\n";
+            if (ArchiveHeaders)
+              printArchiveHeaders(A.get(), true, false);
+            for (Archive::child_iterator AI = A->child_begin(),
+                                         AE = A->child_end();
+                 AI != AE; ++AI) {
+              ErrorOr<std::unique_ptr<Binary>> ChildOrErr = AI->getAsBinary();
+              if (ChildOrErr.getError())
+                continue;
+              if (MachOObjectFile *O =
+                      dyn_cast<MachOObjectFile>(&*ChildOrErr.get()))
+                ProcessMachO(Filename, O, O->getFileName());
+            }
+          }
+          return;
+        }
+      }
+    }
+    // Either all architectures have been specified or none have been specified
+    // and this does not contain the host architecture so dump all the slices.
+    bool moreThanOneArch = UB->getNumberOfObjects() > 1;
+    for (MachOUniversalBinary::object_iterator I = UB->begin_objects(),
+                                               E = UB->end_objects();
+         I != E; ++I) {
+      ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr = I->getAsObjectFile();
+      std::string ArchitectureName = "";
+      if (moreThanOneArch)
+        ArchitectureName = I->getArchTypeName();
+      if (ObjOrErr) {
+        ObjectFile &Obj = *ObjOrErr.get();
+        if (MachOObjectFile *MachOOF = dyn_cast<MachOObjectFile>(&Obj))
+          ProcessMachO(Filename, MachOOF, "", ArchitectureName);
+      } else if (ErrorOr<std::unique_ptr<Archive>> AOrErr = I->getAsArchive()) {
+        std::unique_ptr<Archive> &A = *AOrErr;
+        outs() << "Archive : " << Filename;
+        if (!ArchitectureName.empty())
+          outs() << " (architecture " << ArchitectureName << ")";
+        outs() << "\n";
+        if (ArchiveHeaders)
+          printArchiveHeaders(A.get(), true, false);
+        for (Archive::child_iterator AI = A->child_begin(), AE = A->child_end();
+             AI != AE; ++AI) {
+          ErrorOr<std::unique_ptr<Binary>> ChildOrErr = AI->getAsBinary();
+          if (ChildOrErr.getError())
+            continue;
+          if (MachOObjectFile *O =
+                  dyn_cast<MachOObjectFile>(&*ChildOrErr.get())) {
+            if (MachOObjectFile *MachOOF = dyn_cast<MachOObjectFile>(O))
+              ProcessMachO(Filename, MachOOF, MachOOF->getFileName(),
+                           ArchitectureName);
+          }
+        }
+      }
+    }
+    return;
+  }
+  if (ObjectFile *O = dyn_cast<ObjectFile>(&Bin)) {
+    if (!checkMachOAndArchFlags(O, Filename))
+      return;
+    if (MachOObjectFile *MachOOF = dyn_cast<MachOObjectFile>(&*O)) {
+      ProcessMachO(Filename, MachOOF);
+    } else
+      errs() << "llvm-objdump: '" << Filename << "': "
+             << "Object is not a Mach-O file type.\n";
+  } else
+    errs() << "llvm-objdump: '" << Filename << "': "
+           << "Unrecognized file type.\n";
+}
+
 typedef std::pair<uint64_t, const char *> BindInfoEntry;
 typedef std::vector<BindInfoEntry> BindTable;
 typedef BindTable::iterator bind_table_iterator;
@@ -280,21 +1593,6 @@ struct DisassembleInfo {
   BindTable *bindtable;
 };
 
-// GuessSymbolName is passed the address of what might be a symbol and a
-// pointer to the DisassembleInfo struct.  It returns the name of a symbol
-// with that address or nullptr if no symbol is found with that address.
-static const char *GuessSymbolName(uint64_t value,
-                                   struct DisassembleInfo *info) {
-  const char *SymbolName = nullptr;
-  // A DenseMap can't lookup up some values.
-  if (value != 0xffffffffffffffffULL && value != 0xfffffffffffffffeULL) {
-    StringRef name = info->AddrMap->lookup(value);
-    if (!name.empty())
-      SymbolName = name.data();
-  }
-  return SymbolName;
-}
-
 // SymbolizerGetOpInfo() is the operand information call back function.
 // This is called to get the symbolic information for operand(s) of an
 // instruction when it is being done.  This routine does this from
@@ -385,8 +1683,8 @@ int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset,
     }
     if (reloc_found && (r_type == MachO::GENERIC_RELOC_SECTDIFF ||
                         r_type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF)) {
-      const char *add = GuessSymbolName(r_value, info);
-      const char *sub = GuessSymbolName(pair_r_value, info);
+      const char *add = GuessSymbolName(r_value, info->AddrMap);
+      const char *sub = GuessSymbolName(pair_r_value, info->AddrMap);
       uint32_t offset = value - (r_value - pair_r_value);
       op_info->AddSymbol.Present = 1;
       if (add != nullptr)
@@ -527,34 +1825,18 @@ int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset,
       const char *name = SymName.data();
       op_info->AddSymbol.Present = 1;
       op_info->AddSymbol.Name = name;
-      if (value != 0) {
-        switch (r_type) {
-        case MachO::ARM_RELOC_HALF:
-          if ((r_length & 0x1) == 1) {
-            op_info->Value = value << 16 | other_half;
-            op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_HI16;
-          } else {
-            op_info->Value = other_half << 16 | value;
-            op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_LO16;
-          }
-          break;
-        default:
-          break;
-        }
-      } else {
-        switch (r_type) {
-        case MachO::ARM_RELOC_HALF:
-          if ((r_length & 0x1) == 1) {
-            op_info->Value = value << 16 | other_half;
-            op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_HI16;
-          } else {
-            op_info->Value = other_half << 16 | value;
-            op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_LO16;
-          }
-          break;
-        default:
-          break;
+      switch (r_type) {
+      case MachO::ARM_RELOC_HALF:
+        if ((r_length & 0x1) == 1) {
+          op_info->Value = value << 16 | other_half;
+          op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_HI16;
+        } else {
+          op_info->Value = other_half << 16 | value;
+          op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_LO16;
         }
+        break;
+      default:
+        break;
       }
       return 1;
     }
@@ -587,8 +1869,8 @@ int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset,
         op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_HI16;
       else
         op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_LO16;
-      const char *add = GuessSymbolName(r_value, info);
-      const char *sub = GuessSymbolName(pair_r_value, info);
+      const char *add = GuessSymbolName(r_value, info->AddrMap);
+      const char *sub = GuessSymbolName(pair_r_value, info->AddrMap);
       int32_t offset = value - (r_value - pair_r_value);
       op_info->AddSymbol.Present = 1;
       if (add != nullptr)
@@ -617,7 +1899,7 @@ int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset,
           op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_LO16;
       }
     }
-    const char *add = GuessSymbolName(value, info);
+    const char *add = GuessSymbolName(value, info->AddrMap);
     if (add != nullptr) {
       op_info->AddSymbol.Name = add;
       return 1;
@@ -1078,7 +2360,7 @@ const char *get_symbol_64(uint32_t sect_offset, SectionRef S,
   //
   // NOTE: need add passing the ReferenceValue to this routine.  Then that code
   // would simply be this:
-  // SymbolName = GuessSymbolName(ReferenceValue, info);
+  // SymbolName = GuessSymbolName(ReferenceValue, info->AddrMap);
 
   return SymbolName;
 }
@@ -1421,7 +2703,7 @@ const char *SymbolizerSymbolLookUp(void *DisInfo, uint64_t ReferenceValue,
     return nullptr;
   }
 
-  const char *SymbolName = GuessSymbolName(ReferenceValue, info);
+  const char *SymbolName = GuessSymbolName(ReferenceValue, info->AddrMap);
 
   if (*ReferenceType == LLVMDisassembler_ReferenceType_In_Branch) {
     *ReferenceName = GuessIndirectSymbol(ReferenceValue, info);
@@ -1585,8 +2867,8 @@ static void emitComments(raw_svector_ostream &CommentStream,
   CommentStream.resync();
 }
 
-static void DisassembleInputMachO2(StringRef Filename,
-                                   MachOObjectFile *MachOOF) {
+static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
+                             StringRef DisSegName, StringRef DisSectName) {
   const char *McpuDefault = nullptr;
   const Target *ThumbTarget = nullptr;
   const Target *TheTarget = GetTarget(MachOOF, &McpuDefault, &ThumbTarget);
@@ -1628,7 +2910,7 @@ static void DisassembleInputMachO2(StringRef Filename,
   if (RelInfo) {
     Symbolizer.reset(TheTarget->createMCSymbolizer(
         TripleName, SymbolizerGetOpInfo, SymbolizerSymbolLookUp,
-        &SymbolizerInfo, &Ctx, RelInfo.release()));
+        &SymbolizerInfo, &Ctx, std::move(RelInfo)));
     DisAsm->setSymbolizer(std::move(Symbolizer));
   }
   int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
@@ -1639,6 +2921,12 @@ static void DisassembleInputMachO2(StringRef Filename,
   // Comment stream and backing vector.
   SmallString<128> CommentsToEmit;
   raw_svector_ostream CommentStream(CommentsToEmit);
+  // FIXME: Setting the CommentStream in the InstPrinter is problematic in that
+  // if it is done then arm64 comments for string literals don't get printed
+  // and some constant get printed instead and not setting it causes intel
+  // (32-bit and 64-bit) comments printed with different spacing before the
+  // comment causing different diffs with the 'C' disassembler library API.
+  // IP->setCommentStream(CommentStream);
 
   if (!AsmInfo || !STI || !DisAsm || !IP) {
     errs() << "error: couldn't initialize disassembler for target "
@@ -1670,7 +2958,7 @@ static void DisassembleInputMachO2(StringRef Filename,
     if (ThumbRelInfo) {
       ThumbSymbolizer.reset(ThumbTarget->createMCSymbolizer(
           ThumbTripleName, SymbolizerGetOpInfo, SymbolizerSymbolLookUp,
-          &ThumbSymbolizerInfo, PtrThumbCtx, ThumbRelInfo.release()));
+          &ThumbSymbolizerInfo, PtrThumbCtx, std::move(ThumbRelInfo)));
       ThumbDisAsm->setSymbolizer(std::move(ThumbSymbolizer));
     }
     int ThumbAsmPrinterVariant = ThumbAsmInfo->getAssemblerDialect();
@@ -1687,8 +2975,6 @@ static void DisassembleInputMachO2(StringRef Filename,
     return;
   }
 
-  outs() << '\n' << Filename << ":\n\n";
-
   MachO::mach_header Header = MachOOF->getHeader();
 
   // FIXME: Using the -cfg command line option, this code used to be able to
@@ -1750,20 +3036,18 @@ static void DisassembleInputMachO2(StringRef Filename,
     diContext.reset(DIContext::getDWARFContext(*DbgObj));
   }
 
-  for (unsigned SectIdx = 0; SectIdx != Sections.size(); SectIdx++) {
-
-    bool SectIsText = Sections[SectIdx].isText();
-    if (SectIsText == false)
-      continue;
+  if (DumpSections.size() == 0)
+    outs() << "(" << DisSegName << "," << DisSectName << ") section\n";
 
+  for (unsigned SectIdx = 0; SectIdx != Sections.size(); SectIdx++) {
     StringRef SectName;
-    if (Sections[SectIdx].getName(SectName) || SectName != "__text")
-      continue; // Skip non-text sections
+    if (Sections[SectIdx].getName(SectName) || SectName != DisSectName)
+      continue;
 
     DataRefImpl DR = Sections[SectIdx].getRawDataRefImpl();
 
     StringRef SegmentName = MachOOF->getSectionFinalSegmentName(DR);
-    if (SegmentName != "__TEXT")
+    if (SegmentName != DisSegName)
       continue;
 
     StringRef BytesStr;
@@ -2013,6 +3297,11 @@ static void DisassembleInputMachO2(StringRef Filename,
         }
       }
     }
+    // The TripleName's need to be reset if we are called again for a different
+    // archtecture.
+    TripleName = "";
+    ThumbTripleName = "";
+
     if (SymbolizerInfo.method != nullptr)
       free(SymbolizerInfo.method);
     if (SymbolizerInfo.demangled_name != nullptr)
@@ -2504,12 +3793,17 @@ static void PrintMachHeader(uint32_t magic, uint32_t cputype,
       break;
     case MachO::CPU_TYPE_X86_64:
       outs() << "  X86_64";
-    case MachO::CPU_SUBTYPE_X86_64_ALL:
-      outs() << "        ALL";
-      break;
-    case MachO::CPU_SUBTYPE_X86_64_H:
-      outs() << "    Haswell";
-      outs() << format(" %10d", cpusubtype & ~MachO::CPU_SUBTYPE_MASK);
+      switch (cpusubtype & ~MachO::CPU_SUBTYPE_MASK) {
+      case MachO::CPU_SUBTYPE_X86_64_ALL:
+        outs() << "        ALL";
+        break;
+      case MachO::CPU_SUBTYPE_X86_64_H:
+        outs() << "    Haswell";
+        break;
+      default:
+        outs() << format(" %10d", cpusubtype & ~MachO::CPU_SUBTYPE_MASK);
+        break;
+      }
       break;
     case MachO::CPU_TYPE_ARM:
       outs() << "     ARM";
@@ -2774,8 +4068,8 @@ static void PrintSegmentCommand(uint32_t cmd, uint32_t cmdsize,
     outs() << "   vmaddr " << format("0x%016" PRIx64, vmaddr) << "\n";
     outs() << "   vmsize " << format("0x%016" PRIx64, vmsize) << "\n";
   } else {
-    outs() << "   vmaddr " << format("0x%08" PRIx32, vmaddr) << "\n";
-    outs() << "   vmsize " << format("0x%08" PRIx32, vmsize) << "\n";
+    outs() << "   vmaddr " << format("0x%08" PRIx64, vmaddr) << "\n";
+    outs() << "   vmsize " << format("0x%08" PRIx64, vmsize) << "\n";
   }
   outs() << "  fileoff " << fileoff;
   if (fileoff > object_size)
@@ -2878,8 +4172,8 @@ static void PrintSection(const char *sectname, const char *segname,
     outs() << "      addr " << format("0x%016" PRIx64, addr) << "\n";
     outs() << "      size " << format("0x%016" PRIx64, size);
   } else {
-    outs() << "      addr " << format("0x%08" PRIx32, addr) << "\n";
-    outs() << "      size " << format("0x%08" PRIx32, size);
+    outs() << "      addr " << format("0x%08" PRIx64, addr) << "\n";
+    outs() << "      size " << format("0x%08" PRIx64, size);
   }
   if ((flags & MachO::S_ZEROFILL) != 0 && offset + size > object_size)
     outs() << " (past end of file)\n";
@@ -3299,6 +4593,21 @@ static void PrintUuidLoadCommand(MachO::uuid_command uuid) {
   outs() << "\n";
 }
 
+static void PrintRpathLoadCommand(MachO::rpath_command rpath, const char *Ptr) {
+  outs() << "          cmd LC_RPATH\n";
+  outs() << "      cmdsize " << rpath.cmdsize;
+  if (rpath.cmdsize < sizeof(struct MachO::rpath_command))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  if (rpath.path >= rpath.cmdsize)
+    outs() << "         path ?(bad offset " << rpath.path << ")\n";
+  else {
+    const char *P = (const char *)(Ptr) + rpath.path;
+    outs() << "         path " << P << " (offset " << rpath.path << ")\n";
+  }
+}
+
 static void PrintVersionMinLoadCommand(MachO::version_min_command vd) {
   if (vd.cmd == MachO::LC_VERSION_MIN_MACOSX)
     outs() << "      cmd LC_VERSION_MIN_MACOSX\n";
@@ -3317,7 +4626,7 @@ static void PrintVersionMinLoadCommand(MachO::version_min_command vd) {
     outs() << "." << (vd.version & 0xff);
   outs() << "\n";
   if (vd.sdk == 0)
-    outs() << "      sdk n/a\n";
+    outs() << "      sdk n/a";
   else {
     outs() << "      sdk " << ((vd.sdk >> 16) & 0xffff) << "."
            << ((vd.sdk >> 8) & 0xff);
@@ -3360,6 +4669,525 @@ static void PrintEntryPointCommand(MachO::entry_point_command ep) {
   outs() << " stacksize " << ep.stacksize << "\n";
 }
 
+static void PrintEncryptionInfoCommand(MachO::encryption_info_command ec,
+                                       uint32_t object_size) {
+  outs() << "          cmd LC_ENCRYPTION_INFO\n";
+  outs() << "      cmdsize " << ec.cmdsize;
+  if (ec.cmdsize != sizeof(struct MachO::encryption_info_command))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  outs() << "     cryptoff " << ec.cryptoff;
+  if (ec.cryptoff > object_size)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+  outs() << "    cryptsize " << ec.cryptsize;
+  if (ec.cryptsize > object_size)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+  outs() << "      cryptid " << ec.cryptid << "\n";
+}
+
+static void PrintEncryptionInfoCommand64(MachO::encryption_info_command_64 ec,
+                                         uint32_t object_size) {
+  outs() << "          cmd LC_ENCRYPTION_INFO_64\n";
+  outs() << "      cmdsize " << ec.cmdsize;
+  if (ec.cmdsize != sizeof(struct MachO::encryption_info_command_64))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  outs() << "     cryptoff " << ec.cryptoff;
+  if (ec.cryptoff > object_size)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+  outs() << "    cryptsize " << ec.cryptsize;
+  if (ec.cryptsize > object_size)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+  outs() << "      cryptid " << ec.cryptid << "\n";
+  outs() << "          pad " << ec.pad << "\n";
+}
+
+static void PrintLinkerOptionCommand(MachO::linker_option_command lo,
+                                     const char *Ptr) {
+  outs() << "     cmd LC_LINKER_OPTION\n";
+  outs() << " cmdsize " << lo.cmdsize;
+  if (lo.cmdsize < sizeof(struct MachO::linker_option_command))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  outs() << "   count " << lo.count << "\n";
+  const char *string = Ptr + sizeof(struct MachO::linker_option_command);
+  uint32_t left = lo.cmdsize - sizeof(struct MachO::linker_option_command);
+  uint32_t i = 0;
+  while (left > 0) {
+    while (*string == '\0' && left > 0) {
+      string++;
+      left--;
+    }
+    if (left > 0) {
+      i++;
+      outs() << "  string #" << i << " " << format("%.*s\n", left, string);
+      uint32_t NullPos = StringRef(string, left).find('\0');
+      uint32_t len = std::min(NullPos, left) + 1;
+      string += len;
+      left -= len;
+    }
+  }
+  if (lo.count != i)
+    outs() << "   count " << lo.count << " does not match number of strings "
+           << i << "\n";
+}
+
+static void PrintSubFrameworkCommand(MachO::sub_framework_command sub,
+                                     const char *Ptr) {
+  outs() << "          cmd LC_SUB_FRAMEWORK\n";
+  outs() << "      cmdsize " << sub.cmdsize;
+  if (sub.cmdsize < sizeof(struct MachO::sub_framework_command))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  if (sub.umbrella < sub.cmdsize) {
+    const char *P = Ptr + sub.umbrella;
+    outs() << "     umbrella " << P << " (offset " << sub.umbrella << ")\n";
+  } else {
+    outs() << "     umbrella ?(bad offset " << sub.umbrella << ")\n";
+  }
+}
+
+static void PrintSubUmbrellaCommand(MachO::sub_umbrella_command sub,
+                                    const char *Ptr) {
+  outs() << "          cmd LC_SUB_UMBRELLA\n";
+  outs() << "      cmdsize " << sub.cmdsize;
+  if (sub.cmdsize < sizeof(struct MachO::sub_umbrella_command))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  if (sub.sub_umbrella < sub.cmdsize) {
+    const char *P = Ptr + sub.sub_umbrella;
+    outs() << " sub_umbrella " << P << " (offset " << sub.sub_umbrella << ")\n";
+  } else {
+    outs() << " sub_umbrella ?(bad offset " << sub.sub_umbrella << ")\n";
+  }
+}
+
+static void PrintSubLibraryCommand(MachO::sub_library_command sub,
+                                   const char *Ptr) {
+  outs() << "          cmd LC_SUB_LIBRARY\n";
+  outs() << "      cmdsize " << sub.cmdsize;
+  if (sub.cmdsize < sizeof(struct MachO::sub_library_command))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  if (sub.sub_library < sub.cmdsize) {
+    const char *P = Ptr + sub.sub_library;
+    outs() << "  sub_library " << P << " (offset " << sub.sub_library << ")\n";
+  } else {
+    outs() << "  sub_library ?(bad offset " << sub.sub_library << ")\n";
+  }
+}
+
+static void PrintSubClientCommand(MachO::sub_client_command sub,
+                                  const char *Ptr) {
+  outs() << "          cmd LC_SUB_CLIENT\n";
+  outs() << "      cmdsize " << sub.cmdsize;
+  if (sub.cmdsize < sizeof(struct MachO::sub_client_command))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  if (sub.client < sub.cmdsize) {
+    const char *P = Ptr + sub.client;
+    outs() << "       client " << P << " (offset " << sub.client << ")\n";
+  } else {
+    outs() << "       client ?(bad offset " << sub.client << ")\n";
+  }
+}
+
+static void PrintRoutinesCommand(MachO::routines_command r) {
+  outs() << "          cmd LC_ROUTINES\n";
+  outs() << "      cmdsize " << r.cmdsize;
+  if (r.cmdsize != sizeof(struct MachO::routines_command))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  outs() << " init_address " << format("0x%08" PRIx32, r.init_address) << "\n";
+  outs() << "  init_module " << r.init_module << "\n";
+  outs() << "    reserved1 " << r.reserved1 << "\n";
+  outs() << "    reserved2 " << r.reserved2 << "\n";
+  outs() << "    reserved3 " << r.reserved3 << "\n";
+  outs() << "    reserved4 " << r.reserved4 << "\n";
+  outs() << "    reserved5 " << r.reserved5 << "\n";
+  outs() << "    reserved6 " << r.reserved6 << "\n";
+}
+
+static void PrintRoutinesCommand64(MachO::routines_command_64 r) {
+  outs() << "          cmd LC_ROUTINES_64\n";
+  outs() << "      cmdsize " << r.cmdsize;
+  if (r.cmdsize != sizeof(struct MachO::routines_command_64))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  outs() << " init_address " << format("0x%016" PRIx64, r.init_address) << "\n";
+  outs() << "  init_module " << r.init_module << "\n";
+  outs() << "    reserved1 " << r.reserved1 << "\n";
+  outs() << "    reserved2 " << r.reserved2 << "\n";
+  outs() << "    reserved3 " << r.reserved3 << "\n";
+  outs() << "    reserved4 " << r.reserved4 << "\n";
+  outs() << "    reserved5 " << r.reserved5 << "\n";
+  outs() << "    reserved6 " << r.reserved6 << "\n";
+}
+
+static void Print_x86_thread_state64_t(MachO::x86_thread_state64_t &cpu64) {
+  outs() << "   rax  " << format("0x%016" PRIx64, cpu64.rax);
+  outs() << " rbx " << format("0x%016" PRIx64, cpu64.rbx);
+  outs() << " rcx  " << format("0x%016" PRIx64, cpu64.rcx) << "\n";
+  outs() << "   rdx  " << format("0x%016" PRIx64, cpu64.rdx);
+  outs() << " rdi " << format("0x%016" PRIx64, cpu64.rdi);
+  outs() << " rsi  " << format("0x%016" PRIx64, cpu64.rsi) << "\n";
+  outs() << "   rbp  " << format("0x%016" PRIx64, cpu64.rbp);
+  outs() << " rsp " << format("0x%016" PRIx64, cpu64.rsp);
+  outs() << " r8   " << format("0x%016" PRIx64, cpu64.r8) << "\n";
+  outs() << "    r9  " << format("0x%016" PRIx64, cpu64.r9);
+  outs() << " r10 " << format("0x%016" PRIx64, cpu64.r10);
+  outs() << " r11  " << format("0x%016" PRIx64, cpu64.r11) << "\n";
+  outs() << "   r12  " << format("0x%016" PRIx64, cpu64.r12);
+  outs() << " r13 " << format("0x%016" PRIx64, cpu64.r13);
+  outs() << " r14  " << format("0x%016" PRIx64, cpu64.r14) << "\n";
+  outs() << "   r15  " << format("0x%016" PRIx64, cpu64.r15);
+  outs() << " rip " << format("0x%016" PRIx64, cpu64.rip) << "\n";
+  outs() << "rflags  " << format("0x%016" PRIx64, cpu64.rflags);
+  outs() << " cs  " << format("0x%016" PRIx64, cpu64.cs);
+  outs() << " fs   " << format("0x%016" PRIx64, cpu64.fs) << "\n";
+  outs() << "    gs  " << format("0x%016" PRIx64, cpu64.gs) << "\n";
+}
+
+static void Print_mmst_reg(MachO::mmst_reg_t &r) {
+  uint32_t f;
+  outs() << "\t      mmst_reg  ";
+  for (f = 0; f < 10; f++)
+    outs() << format("%02" PRIx32, (r.mmst_reg[f] & 0xff)) << " ";
+  outs() << "\n";
+  outs() << "\t      mmst_rsrv ";
+  for (f = 0; f < 6; f++)
+    outs() << format("%02" PRIx32, (r.mmst_rsrv[f] & 0xff)) << " ";
+  outs() << "\n";
+}
+
+static void Print_xmm_reg(MachO::xmm_reg_t &r) {
+  uint32_t f;
+  outs() << "\t      xmm_reg ";
+  for (f = 0; f < 16; f++)
+    outs() << format("%02" PRIx32, (r.xmm_reg[f] & 0xff)) << " ";
+  outs() << "\n";
+}
+
+static void Print_x86_float_state_t(MachO::x86_float_state64_t &fpu) {
+  outs() << "\t    fpu_reserved[0] " << fpu.fpu_reserved[0];
+  outs() << " fpu_reserved[1] " << fpu.fpu_reserved[1] << "\n";
+  outs() << "\t    control: invalid " << fpu.fpu_fcw.invalid;
+  outs() << " denorm " << fpu.fpu_fcw.denorm;
+  outs() << " zdiv " << fpu.fpu_fcw.zdiv;
+  outs() << " ovrfl " << fpu.fpu_fcw.ovrfl;
+  outs() << " undfl " << fpu.fpu_fcw.undfl;
+  outs() << " precis " << fpu.fpu_fcw.precis << "\n";
+  outs() << "\t\t     pc ";
+  if (fpu.fpu_fcw.pc == MachO::x86_FP_PREC_24B)
+    outs() << "FP_PREC_24B ";
+  else if (fpu.fpu_fcw.pc == MachO::x86_FP_PREC_53B)
+    outs() << "FP_PREC_53B ";
+  else if (fpu.fpu_fcw.pc == MachO::x86_FP_PREC_64B)
+    outs() << "FP_PREC_64B ";
+  else
+    outs() << fpu.fpu_fcw.pc << " ";
+  outs() << "rc ";
+  if (fpu.fpu_fcw.rc == MachO::x86_FP_RND_NEAR)
+    outs() << "FP_RND_NEAR ";
+  else if (fpu.fpu_fcw.rc == MachO::x86_FP_RND_DOWN)
+    outs() << "FP_RND_DOWN ";
+  else if (fpu.fpu_fcw.rc == MachO::x86_FP_RND_UP)
+    outs() << "FP_RND_UP ";
+  else if (fpu.fpu_fcw.rc == MachO::x86_FP_CHOP)
+    outs() << "FP_CHOP ";
+  outs() << "\n";
+  outs() << "\t    status: invalid " << fpu.fpu_fsw.invalid;
+  outs() << " denorm " << fpu.fpu_fsw.denorm;
+  outs() << " zdiv " << fpu.fpu_fsw.zdiv;
+  outs() << " ovrfl " << fpu.fpu_fsw.ovrfl;
+  outs() << " undfl " << fpu.fpu_fsw.undfl;
+  outs() << " precis " << fpu.fpu_fsw.precis;
+  outs() << " stkflt " << fpu.fpu_fsw.stkflt << "\n";
+  outs() << "\t            errsumm " << fpu.fpu_fsw.errsumm;
+  outs() << " c0 " << fpu.fpu_fsw.c0;
+  outs() << " c1 " << fpu.fpu_fsw.c1;
+  outs() << " c2 " << fpu.fpu_fsw.c2;
+  outs() << " tos " << fpu.fpu_fsw.tos;
+  outs() << " c3 " << fpu.fpu_fsw.c3;
+  outs() << " busy " << fpu.fpu_fsw.busy << "\n";
+  outs() << "\t    fpu_ftw " << format("0x%02" PRIx32, fpu.fpu_ftw);
+  outs() << " fpu_rsrv1 " << format("0x%02" PRIx32, fpu.fpu_rsrv1);
+  outs() << " fpu_fop " << format("0x%04" PRIx32, fpu.fpu_fop);
+  outs() << " fpu_ip " << format("0x%08" PRIx32, fpu.fpu_ip) << "\n";
+  outs() << "\t    fpu_cs " << format("0x%04" PRIx32, fpu.fpu_cs);
+  outs() << " fpu_rsrv2 " << format("0x%04" PRIx32, fpu.fpu_rsrv2);
+  outs() << " fpu_dp " << format("0x%08" PRIx32, fpu.fpu_dp);
+  outs() << " fpu_ds " << format("0x%04" PRIx32, fpu.fpu_ds) << "\n";
+  outs() << "\t    fpu_rsrv3 " << format("0x%04" PRIx32, fpu.fpu_rsrv3);
+  outs() << " fpu_mxcsr " << format("0x%08" PRIx32, fpu.fpu_mxcsr);
+  outs() << " fpu_mxcsrmask " << format("0x%08" PRIx32, fpu.fpu_mxcsrmask);
+  outs() << "\n";
+  outs() << "\t    fpu_stmm0:\n";
+  Print_mmst_reg(fpu.fpu_stmm0);
+  outs() << "\t    fpu_stmm1:\n";
+  Print_mmst_reg(fpu.fpu_stmm1);
+  outs() << "\t    fpu_stmm2:\n";
+  Print_mmst_reg(fpu.fpu_stmm2);
+  outs() << "\t    fpu_stmm3:\n";
+  Print_mmst_reg(fpu.fpu_stmm3);
+  outs() << "\t    fpu_stmm4:\n";
+  Print_mmst_reg(fpu.fpu_stmm4);
+  outs() << "\t    fpu_stmm5:\n";
+  Print_mmst_reg(fpu.fpu_stmm5);
+  outs() << "\t    fpu_stmm6:\n";
+  Print_mmst_reg(fpu.fpu_stmm6);
+  outs() << "\t    fpu_stmm7:\n";
+  Print_mmst_reg(fpu.fpu_stmm7);
+  outs() << "\t    fpu_xmm0:\n";
+  Print_xmm_reg(fpu.fpu_xmm0);
+  outs() << "\t    fpu_xmm1:\n";
+  Print_xmm_reg(fpu.fpu_xmm1);
+  outs() << "\t    fpu_xmm2:\n";
+  Print_xmm_reg(fpu.fpu_xmm2);
+  outs() << "\t    fpu_xmm3:\n";
+  Print_xmm_reg(fpu.fpu_xmm3);
+  outs() << "\t    fpu_xmm4:\n";
+  Print_xmm_reg(fpu.fpu_xmm4);
+  outs() << "\t    fpu_xmm5:\n";
+  Print_xmm_reg(fpu.fpu_xmm5);
+  outs() << "\t    fpu_xmm6:\n";
+  Print_xmm_reg(fpu.fpu_xmm6);
+  outs() << "\t    fpu_xmm7:\n";
+  Print_xmm_reg(fpu.fpu_xmm7);
+  outs() << "\t    fpu_xmm8:\n";
+  Print_xmm_reg(fpu.fpu_xmm8);
+  outs() << "\t    fpu_xmm9:\n";
+  Print_xmm_reg(fpu.fpu_xmm9);
+  outs() << "\t    fpu_xmm10:\n";
+  Print_xmm_reg(fpu.fpu_xmm10);
+  outs() << "\t    fpu_xmm11:\n";
+  Print_xmm_reg(fpu.fpu_xmm11);
+  outs() << "\t    fpu_xmm12:\n";
+  Print_xmm_reg(fpu.fpu_xmm12);
+  outs() << "\t    fpu_xmm13:\n";
+  Print_xmm_reg(fpu.fpu_xmm13);
+  outs() << "\t    fpu_xmm14:\n";
+  Print_xmm_reg(fpu.fpu_xmm14);
+  outs() << "\t    fpu_xmm15:\n";
+  Print_xmm_reg(fpu.fpu_xmm15);
+  outs() << "\t    fpu_rsrv4:\n";
+  for (uint32_t f = 0; f < 6; f++) {
+    outs() << "\t            ";
+    for (uint32_t g = 0; g < 16; g++)
+      outs() << format("%02" PRIx32, fpu.fpu_rsrv4[f * g]) << " ";
+    outs() << "\n";
+  }
+  outs() << "\t    fpu_reserved1 " << format("0x%08" PRIx32, fpu.fpu_reserved1);
+  outs() << "\n";
+}
+
+static void Print_x86_exception_state_t(MachO::x86_exception_state64_t &exc64) {
+  outs() << "\t    trapno " << format("0x%08" PRIx32, exc64.trapno);
+  outs() << " err " << format("0x%08" PRIx32, exc64.err);
+  outs() << " faultvaddr " << format("0x%016" PRIx64, exc64.faultvaddr) << "\n";
+}
+
+static void PrintThreadCommand(MachO::thread_command t, const char *Ptr,
+                               bool isLittleEndian, uint32_t cputype) {
+  if (t.cmd == MachO::LC_THREAD)
+    outs() << "        cmd LC_THREAD\n";
+  else if (t.cmd == MachO::LC_UNIXTHREAD)
+    outs() << "        cmd LC_UNIXTHREAD\n";
+  else
+    outs() << "        cmd " << t.cmd << " (unknown)\n";
+  outs() << "    cmdsize " << t.cmdsize;
+  if (t.cmdsize < sizeof(struct MachO::thread_command) + 2 * sizeof(uint32_t))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+
+  const char *begin = Ptr + sizeof(struct MachO::thread_command);
+  const char *end = Ptr + t.cmdsize;
+  uint32_t flavor, count, left;
+  if (cputype == MachO::CPU_TYPE_X86_64) {
+    while (begin < end) {
+      if (end - begin > (ptrdiff_t)sizeof(uint32_t)) {
+        memcpy((char *)&flavor, begin, sizeof(uint32_t));
+        begin += sizeof(uint32_t);
+      } else {
+        flavor = 0;
+        begin = end;
+      }
+      if (isLittleEndian != sys::IsLittleEndianHost)
+        sys::swapByteOrder(flavor);
+      if (end - begin > (ptrdiff_t)sizeof(uint32_t)) {
+        memcpy((char *)&count, begin, sizeof(uint32_t));
+        begin += sizeof(uint32_t);
+      } else {
+        count = 0;
+        begin = end;
+      }
+      if (isLittleEndian != sys::IsLittleEndianHost)
+        sys::swapByteOrder(count);
+      if (flavor == MachO::x86_THREAD_STATE64) {
+        outs() << "     flavor x86_THREAD_STATE64\n";
+        if (count == MachO::x86_THREAD_STATE64_COUNT)
+          outs() << "      count x86_THREAD_STATE64_COUNT\n";
+        else
+          outs() << "      count " << count
+                 << " (not x86_THREAD_STATE64_COUNT)\n";
+        MachO::x86_thread_state64_t cpu64;
+        left = end - begin;
+        if (left >= sizeof(MachO::x86_thread_state64_t)) {
+          memcpy(&cpu64, begin, sizeof(MachO::x86_thread_state64_t));
+          begin += sizeof(MachO::x86_thread_state64_t);
+        } else {
+          memset(&cpu64, '\0', sizeof(MachO::x86_thread_state64_t));
+          memcpy(&cpu64, begin, left);
+          begin += left;
+        }
+        if (isLittleEndian != sys::IsLittleEndianHost)
+          swapStruct(cpu64);
+        Print_x86_thread_state64_t(cpu64);
+      } else if (flavor == MachO::x86_THREAD_STATE) {
+        outs() << "     flavor x86_THREAD_STATE\n";
+        if (count == MachO::x86_THREAD_STATE_COUNT)
+          outs() << "      count x86_THREAD_STATE_COUNT\n";
+        else
+          outs() << "      count " << count
+                 << " (not x86_THREAD_STATE_COUNT)\n";
+        struct MachO::x86_thread_state_t ts;
+        left = end - begin;
+        if (left >= sizeof(MachO::x86_thread_state_t)) {
+          memcpy(&ts, begin, sizeof(MachO::x86_thread_state_t));
+          begin += sizeof(MachO::x86_thread_state_t);
+        } else {
+          memset(&ts, '\0', sizeof(MachO::x86_thread_state_t));
+          memcpy(&ts, begin, left);
+          begin += left;
+        }
+        if (isLittleEndian != sys::IsLittleEndianHost)
+          swapStruct(ts);
+        if (ts.tsh.flavor == MachO::x86_THREAD_STATE64) {
+          outs() << "\t    tsh.flavor x86_THREAD_STATE64 ";
+          if (ts.tsh.count == MachO::x86_THREAD_STATE64_COUNT)
+            outs() << "tsh.count x86_THREAD_STATE64_COUNT\n";
+          else
+            outs() << "tsh.count " << ts.tsh.count
+                   << " (not x86_THREAD_STATE64_COUNT\n";
+          Print_x86_thread_state64_t(ts.uts.ts64);
+        } else {
+          outs() << "\t    tsh.flavor " << ts.tsh.flavor << "  tsh.count "
+                 << ts.tsh.count << "\n";
+        }
+      } else if (flavor == MachO::x86_FLOAT_STATE) {
+        outs() << "     flavor x86_FLOAT_STATE\n";
+        if (count == MachO::x86_FLOAT_STATE_COUNT)
+          outs() << "      count x86_FLOAT_STATE_COUNT\n";
+        else
+          outs() << "      count " << count << " (not x86_FLOAT_STATE_COUNT)\n";
+        struct MachO::x86_float_state_t fs;
+        left = end - begin;
+        if (left >= sizeof(MachO::x86_float_state_t)) {
+          memcpy(&fs, begin, sizeof(MachO::x86_float_state_t));
+          begin += sizeof(MachO::x86_float_state_t);
+        } else {
+          memset(&fs, '\0', sizeof(MachO::x86_float_state_t));
+          memcpy(&fs, begin, left);
+          begin += left;
+        }
+        if (isLittleEndian != sys::IsLittleEndianHost)
+          swapStruct(fs);
+        if (fs.fsh.flavor == MachO::x86_FLOAT_STATE64) {
+          outs() << "\t    fsh.flavor x86_FLOAT_STATE64 ";
+          if (fs.fsh.count == MachO::x86_FLOAT_STATE64_COUNT)
+            outs() << "fsh.count x86_FLOAT_STATE64_COUNT\n";
+          else
+            outs() << "fsh.count " << fs.fsh.count
+                   << " (not x86_FLOAT_STATE64_COUNT\n";
+          Print_x86_float_state_t(fs.ufs.fs64);
+        } else {
+          outs() << "\t    fsh.flavor " << fs.fsh.flavor << "  fsh.count "
+                 << fs.fsh.count << "\n";
+        }
+      } else if (flavor == MachO::x86_EXCEPTION_STATE) {
+        outs() << "     flavor x86_EXCEPTION_STATE\n";
+        if (count == MachO::x86_EXCEPTION_STATE_COUNT)
+          outs() << "      count x86_EXCEPTION_STATE_COUNT\n";
+        else
+          outs() << "      count " << count
+                 << " (not x86_EXCEPTION_STATE_COUNT)\n";
+        struct MachO::x86_exception_state_t es;
+        left = end - begin;
+        if (left >= sizeof(MachO::x86_exception_state_t)) {
+          memcpy(&es, begin, sizeof(MachO::x86_exception_state_t));
+          begin += sizeof(MachO::x86_exception_state_t);
+        } else {
+          memset(&es, '\0', sizeof(MachO::x86_exception_state_t));
+          memcpy(&es, begin, left);
+          begin += left;
+        }
+        if (isLittleEndian != sys::IsLittleEndianHost)
+          swapStruct(es);
+        if (es.esh.flavor == MachO::x86_EXCEPTION_STATE64) {
+          outs() << "\t    esh.flavor x86_EXCEPTION_STATE64\n";
+          if (es.esh.count == MachO::x86_EXCEPTION_STATE64_COUNT)
+            outs() << "\t    esh.count x86_EXCEPTION_STATE64_COUNT\n";
+          else
+            outs() << "\t    esh.count " << es.esh.count
+                   << " (not x86_EXCEPTION_STATE64_COUNT\n";
+          Print_x86_exception_state_t(es.ues.es64);
+        } else {
+          outs() << "\t    esh.flavor " << es.esh.flavor << "  esh.count "
+                 << es.esh.count << "\n";
+        }
+      } else {
+        outs() << "     flavor " << flavor << " (unknown)\n";
+        outs() << "      count " << count << "\n";
+        outs() << "      state (unknown)\n";
+        begin += count * sizeof(uint32_t);
+      }
+    }
+  } else {
+    while (begin < end) {
+      if (end - begin > (ptrdiff_t)sizeof(uint32_t)) {
+        memcpy((char *)&flavor, begin, sizeof(uint32_t));
+        begin += sizeof(uint32_t);
+      } else {
+        flavor = 0;
+        begin = end;
+      }
+      if (isLittleEndian != sys::IsLittleEndianHost)
+        sys::swapByteOrder(flavor);
+      if (end - begin > (ptrdiff_t)sizeof(uint32_t)) {
+        memcpy((char *)&count, begin, sizeof(uint32_t));
+        begin += sizeof(uint32_t);
+      } else {
+        count = 0;
+        begin = end;
+      }
+      if (isLittleEndian != sys::IsLittleEndianHost)
+        sys::swapByteOrder(count);
+      outs() << "     flavor " << flavor << "\n";
+      outs() << "      count " << count << "\n";
+      outs() << "      state (Unknown cputype/cpusubtype)\n";
+      begin += count * sizeof(uint32_t);
+    }
+  }
+}
+
 static void PrintDylibCommand(MachO::dylib_command dl, const char *Ptr) {
   if (dl.cmd == MachO::LC_ID_DYLIB)
     outs() << "          cmd LC_ID_DYLIB\n";
@@ -3443,6 +5271,8 @@ static void PrintLinkEditDataCommand(MachO::linkedit_data_command ld,
 static void PrintLoadCommands(const MachOObjectFile *Obj, uint32_t ncmds,
                               uint32_t filetype, uint32_t cputype,
                               bool verbose) {
+  if (ncmds == 0)
+    return;
   StringRef Buf = Obj->getData();
   MachOObjectFile::LoadCommandInfo Command = Obj->getFirstLoadCommandInfo();
   for (unsigned i = 0;; ++i) {
@@ -3455,7 +5285,7 @@ static void PrintLoadCommands(const MachOObjectFile *Obj, uint32_t ncmds,
                           SLC.initprot, SLC.nsects, SLC.flags, Buf.size(),
                           verbose);
       for (unsigned j = 0; j < SLC.nsects; j++) {
-        MachO::section_64 S = Obj->getSection64(Command, j);
+        MachO::section S = Obj->getSection(Command, j);
         PrintSection(S.sectname, S.segname, S.addr, S.size, S.offset, S.align,
                      S.reloff, S.nreloc, S.flags, S.reserved1, S.reserved2,
                      SLC.cmd, sg_segname, filetype, Buf.size(), verbose);
@@ -3494,7 +5324,11 @@ static void PrintLoadCommands(const MachOObjectFile *Obj, uint32_t ncmds,
     } else if (Command.C.cmd == MachO::LC_UUID) {
       MachO::uuid_command Uuid = Obj->getUuidCommand(Command);
       PrintUuidLoadCommand(Uuid);
-    } else if (Command.C.cmd == MachO::LC_VERSION_MIN_MACOSX) {
+    } else if (Command.C.cmd == MachO::LC_RPATH) {
+      MachO::rpath_command Rpath = Obj->getRpathCommand(Command);
+      PrintRpathLoadCommand(Rpath, Command.Ptr);
+    } else if (Command.C.cmd == MachO::LC_VERSION_MIN_MACOSX ||
+               Command.C.cmd == MachO::LC_VERSION_MIN_IPHONEOS) {
       MachO::version_min_command Vd = Obj->getVersionMinLoadCommand(Command);
       PrintVersionMinLoadCommand(Vd);
     } else if (Command.C.cmd == MachO::LC_SOURCE_VERSION) {
@@ -3503,6 +5337,40 @@ static void PrintLoadCommands(const MachOObjectFile *Obj, uint32_t ncmds,
     } else if (Command.C.cmd == MachO::LC_MAIN) {
       MachO::entry_point_command Ep = Obj->getEntryPointCommand(Command);
       PrintEntryPointCommand(Ep);
+    } else if (Command.C.cmd == MachO::LC_ENCRYPTION_INFO) {
+      MachO::encryption_info_command Ei =
+          Obj->getEncryptionInfoCommand(Command);
+      PrintEncryptionInfoCommand(Ei, Buf.size());
+    } else if (Command.C.cmd == MachO::LC_ENCRYPTION_INFO_64) {
+      MachO::encryption_info_command_64 Ei =
+          Obj->getEncryptionInfoCommand64(Command);
+      PrintEncryptionInfoCommand64(Ei, Buf.size());
+    } else if (Command.C.cmd == MachO::LC_LINKER_OPTION) {
+      MachO::linker_option_command Lo =
+          Obj->getLinkerOptionLoadCommand(Command);
+      PrintLinkerOptionCommand(Lo, Command.Ptr);
+    } else if (Command.C.cmd == MachO::LC_SUB_FRAMEWORK) {
+      MachO::sub_framework_command Sf = Obj->getSubFrameworkCommand(Command);
+      PrintSubFrameworkCommand(Sf, Command.Ptr);
+    } else if (Command.C.cmd == MachO::LC_SUB_UMBRELLA) {
+      MachO::sub_umbrella_command Sf = Obj->getSubUmbrellaCommand(Command);
+      PrintSubUmbrellaCommand(Sf, Command.Ptr);
+    } else if (Command.C.cmd == MachO::LC_SUB_LIBRARY) {
+      MachO::sub_library_command Sl = Obj->getSubLibraryCommand(Command);
+      PrintSubLibraryCommand(Sl, Command.Ptr);
+    } else if (Command.C.cmd == MachO::LC_SUB_CLIENT) {
+      MachO::sub_client_command Sc = Obj->getSubClientCommand(Command);
+      PrintSubClientCommand(Sc, Command.Ptr);
+    } else if (Command.C.cmd == MachO::LC_ROUTINES) {
+      MachO::routines_command Rc = Obj->getRoutinesCommand(Command);
+      PrintRoutinesCommand(Rc);
+    } else if (Command.C.cmd == MachO::LC_ROUTINES_64) {
+      MachO::routines_command_64 Rc = Obj->getRoutinesCommand64(Command);
+      PrintRoutinesCommand64(Rc);
+    } else if (Command.C.cmd == MachO::LC_THREAD ||
+               Command.C.cmd == MachO::LC_UNIXTHREAD) {
+      MachO::thread_command Tc = Obj->getThreadCommand(Command);
+      PrintThreadCommand(Tc, Command.Ptr, Obj->isLittleEndian(), cputype);
     } else if (Command.C.cmd == MachO::LC_LOAD_DYLIB ||
                Command.C.cmd == MachO::LC_ID_DYLIB ||
                Command.C.cmd == MachO::LC_LOAD_WEAK_DYLIB ||
diff --git a/tools/llvm-objdump/Makefile b/tools/llvm-objdump/Makefile
index 4616b78..7c16523 100644
--- a/tools/llvm-objdump/Makefile
+++ b/tools/llvm-objdump/Makefile
@@ -9,7 +9,7 @@
 
 LEVEL := ../..
 TOOLNAME := llvm-objdump
-LINK_COMPONENTS := all-targets DebugInfo MC MCParser MCDisassembler Object
+LINK_COMPONENTS := all-targets DebugInfoDWARF MC MCParser MCDisassembler Object
 
 # This tool has no plugins, optimize startup time.
 TOOL_NO_EXPORTS := 1
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 8903bff..6e62aaa 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -44,7 +44,6 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/SourceMgr.h"
@@ -62,36 +61,36 @@ using namespace object;
 static cl::list<std::string>
 InputFilenames(cl::Positional, cl::desc("<input object files>"),cl::ZeroOrMore);
 
-static cl::opt<bool>
-Disassemble("disassemble",
+cl::opt<bool>
+llvm::Disassemble("disassemble",
   cl::desc("Display assembler mnemonics for the machine instructions"));
 static cl::alias
 Disassembled("d", cl::desc("Alias for --disassemble"),
              cl::aliasopt(Disassemble));
 
-static cl::opt<bool>
-Relocations("r", cl::desc("Display the relocation entries in the file"));
+cl::opt<bool>
+llvm::Relocations("r", cl::desc("Display the relocation entries in the file"));
 
-static cl::opt<bool>
-SectionContents("s", cl::desc("Display the content of each section"));
+cl::opt<bool>
+llvm::SectionContents("s", cl::desc("Display the content of each section"));
 
-static cl::opt<bool>
-SymbolTable("t", cl::desc("Display the symbol table"));
+cl::opt<bool>
+llvm::SymbolTable("t", cl::desc("Display the symbol table"));
 
-static cl::opt<bool>
-ExportsTrie("exports-trie", cl::desc("Display mach-o exported symbols"));
+cl::opt<bool>
+llvm::ExportsTrie("exports-trie", cl::desc("Display mach-o exported symbols"));
 
-static cl::opt<bool>
-Rebase("rebase", cl::desc("Display mach-o rebasing info"));
+cl::opt<bool>
+llvm::Rebase("rebase", cl::desc("Display mach-o rebasing info"));
 
-static cl::opt<bool>
-Bind("bind", cl::desc("Display mach-o binding info"));
+cl::opt<bool>
+llvm::Bind("bind", cl::desc("Display mach-o binding info"));
 
-static cl::opt<bool>
-LazyBind("lazy-bind", cl::desc("Display mach-o lazy binding info"));
+cl::opt<bool>
+llvm::LazyBind("lazy-bind", cl::desc("Display mach-o lazy binding info"));
 
-static cl::opt<bool>
-WeakBind("weak-bind", cl::desc("Display mach-o weak binding info"));
+cl::opt<bool>
+llvm::WeakBind("weak-bind", cl::desc("Display mach-o weak binding info"));
 
 static cl::opt<bool>
 MachOOpt("macho", cl::desc("Use MachO specific object file parser"));
@@ -109,12 +108,12 @@ llvm::MCPU("mcpu",
      cl::init(""));
 
 cl::opt<std::string>
-llvm::ArchName("arch", cl::desc("Target arch to disassemble for, "
+llvm::ArchName("arch-name", cl::desc("Target arch to disassemble for, "
                                 "see -version for available targets"));
 
-static cl::opt<bool>
-SectionHeaders("section-headers", cl::desc("Display summaries of the headers "
-                                           "for each section."));
+cl::opt<bool>
+llvm::SectionHeaders("section-headers", cl::desc("Display summaries of the "
+                                                 "headers for each section."));
 static cl::alias
 SectionHeadersShort("headers", cl::desc("Alias for --section-headers"),
                     cl::aliasopt(SectionHeaders));
@@ -133,22 +132,23 @@ llvm::NoShowRawInsn("no-show-raw-insn", cl::desc("When disassembling "
                                                  "instructions, do not print "
                                                  "the instruction bytes."));
 
-static cl::opt<bool>
-UnwindInfo("unwind-info", cl::desc("Display unwind information"));
+cl::opt<bool>
+llvm::UnwindInfo("unwind-info", cl::desc("Display unwind information"));
 
 static cl::alias
 UnwindInfoShort("u", cl::desc("Alias for --unwind-info"),
                 cl::aliasopt(UnwindInfo));
 
-static cl::opt<bool>
-PrivateHeaders("private-headers",
-               cl::desc("Display format specific file headers"));
+cl::opt<bool>
+llvm::PrivateHeaders("private-headers",
+                     cl::desc("Display format specific file headers"));
 
 static cl::alias
 PrivateHeadersShort("p", cl::desc("Alias for --private-headers"),
                     cl::aliasopt(PrivateHeaders));
 
 static StringRef ToolName;
+static int ReturnValue = EXIT_SUCCESS;
 
 bool llvm::error(std::error_code EC) {
   if (!EC)
@@ -156,6 +156,7 @@ bool llvm::error(std::error_code EC) {
 
   outs() << ToolName << ": error reading file: " << EC.message() << ".\n";
   outs().flush();
+  ReturnValue = EXIT_FAILURE;
   return true;
 }
 
@@ -452,7 +453,7 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
   }
 }
 
-static void PrintRelocations(const ObjectFile *Obj) {
+void llvm::PrintRelocations(const ObjectFile *Obj) {
   StringRef Fmt = Obj->getBytesInAddress() > 4 ? "%016" PRIx64 :
                                                  "%08" PRIx64;
   // Regular objdump doesn't print relocations in non-relocatable object
@@ -489,7 +490,7 @@ static void PrintRelocations(const ObjectFile *Obj) {
   }
 }
 
-static void PrintSectionHeaders(const ObjectFile *Obj) {
+void llvm::PrintSectionHeaders(const ObjectFile *Obj) {
   outs() << "Sections:\n"
             "Idx Name          Size      Address          Type\n";
   unsigned i = 0;
@@ -510,7 +511,7 @@ static void PrintSectionHeaders(const ObjectFile *Obj) {
   }
 }
 
-static void PrintSectionContents(const ObjectFile *Obj) {
+void llvm::PrintSectionContents(const ObjectFile *Obj) {
   std::error_code EC;
   for (const SectionRef &Section : Obj->sections()) {
     StringRef Name;
@@ -613,7 +614,7 @@ static void PrintCOFFSymbolTable(const COFFObjectFile *coff) {
   }
 }
 
-static void PrintSymbolTable(const ObjectFile *o) {
+void llvm::PrintSymbolTable(const ObjectFile *o) {
   outs() << "SYMBOL TABLE:\n";
 
   if (const COFFObjectFile *coff = dyn_cast<const COFFObjectFile>(o)) {
@@ -641,7 +642,15 @@ static void PrintSymbolTable(const ObjectFile *o) {
     bool Global = Flags & SymbolRef::SF_Global;
     bool Weak = Flags & SymbolRef::SF_Weak;
     bool Absolute = Flags & SymbolRef::SF_Absolute;
-
+    bool Common = Flags & SymbolRef::SF_Common;
+
+    if (Common) {
+      uint32_t Alignment;
+      if (error(Symbol.getAlignment(Alignment)))
+        Alignment = 0;
+      Address = Size;
+      Size = Alignment;
+    }
     if (Address == UnknownAddressOrSize)
       Address = 0;
     if (Size == UnknownAddressOrSize)
@@ -671,6 +680,8 @@ static void PrintSymbolTable(const ObjectFile *o) {
            << ' ';
     if (Absolute) {
       outs() << "*ABS*";
+    } else if (Common) {
+      outs() << "*COM*";
     } else if (Section == o->section_end()) {
       outs() << "*UND*";
     } else {
@@ -707,7 +718,7 @@ static void PrintUnwindInfo(const ObjectFile *o) {
   }
 }
 
-static void printExportsTrie(const ObjectFile *o) {
+void llvm::printExportsTrie(const ObjectFile *o) {
   outs() << "Exports trie:\n";
   if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachOExportsTrie(MachO);
@@ -718,7 +729,7 @@ static void printExportsTrie(const ObjectFile *o) {
   }
 }
 
-static void printRebaseTable(const ObjectFile *o) {
+void llvm::printRebaseTable(const ObjectFile *o) {
   outs() << "Rebase table:\n";
   if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachORebaseTable(MachO);
@@ -729,7 +740,7 @@ static void printRebaseTable(const ObjectFile *o) {
   }
 }
 
-static void printBindTable(const ObjectFile *o) {
+void llvm::printBindTable(const ObjectFile *o) {
   outs() << "Bind table:\n";
   if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachOBindTable(MachO);
@@ -740,7 +751,7 @@ static void printBindTable(const ObjectFile *o) {
   }
 }
 
-static void printLazyBindTable(const ObjectFile *o) {
+void llvm::printLazyBindTable(const ObjectFile *o) {
   outs() << "Lazy bind table:\n";
   if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachOLazyBindTable(MachO);
@@ -751,7 +762,7 @@ static void printLazyBindTable(const ObjectFile *o) {
   }
 }
 
-static void printWeakBindTable(const ObjectFile *o) {
+void llvm::printWeakBindTable(const ObjectFile *o) {
   outs() << "Weak bind table:\n";
   if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachOWeakBindTable(MachO);
@@ -831,8 +842,11 @@ static void DumpInput(StringRef file) {
     return;
   }
 
-  if (MachOOpt && Disassemble) {
-    DisassembleInputMachO(file);
+  // If we are using the Mach-O specific object file parser, then let it parse
+  // the file and process the command line options.  So the -arch flags can
+  // be used to select specific slices, etc.
+  if (MachOOpt) {
+    ParseInputMachO(file);
     return;
   }
 
@@ -887,7 +901,13 @@ int main(int argc, char **argv) {
       && !Rebase
       && !Bind
       && !LazyBind
-      && !WeakBind) {
+      && !WeakBind
+      && !(UniversalHeaders && MachOOpt)
+      && !(ArchiveHeaders && MachOOpt)
+      && !(IndirectSymbols && MachOOpt)
+      && !(DataInCode && MachOOpt)
+      && !(LinkOptHints && MachOOpt)
+      && !(DumpSections.size() != 0 && MachOOpt)) {
     cl::PrintHelpMessage();
     return 2;
   }
@@ -895,5 +915,5 @@ int main(int argc, char **argv) {
   std::for_each(InputFilenames.begin(), InputFilenames.end(),
                 DumpInput);
 
-  return 0;
+  return ReturnValue;
 }
diff --git a/tools/llvm-objdump/llvm-objdump.h b/tools/llvm-objdump/llvm-objdump.h
index ef1509f..19f842f 100644
--- a/tools/llvm-objdump/llvm-objdump.h
+++ b/tools/llvm-objdump/llvm-objdump.h
@@ -26,13 +26,31 @@ extern cl::opt<std::string> TripleName;
 extern cl::opt<std::string> ArchName;
 extern cl::opt<std::string> MCPU;
 extern cl::list<std::string> MAttrs;
+extern cl::list<std::string> DumpSections;
+extern cl::opt<bool> Disassemble;
 extern cl::opt<bool> NoShowRawInsn;
+extern cl::opt<bool> PrivateHeaders;
+extern cl::opt<bool> ExportsTrie;
+extern cl::opt<bool> Rebase;
+extern cl::opt<bool> Bind;
+extern cl::opt<bool> LazyBind;
+extern cl::opt<bool> WeakBind;
+extern cl::opt<bool> UniversalHeaders;
+extern cl::opt<bool> ArchiveHeaders;
+extern cl::opt<bool> IndirectSymbols;
+extern cl::opt<bool> DataInCode;
+extern cl::opt<bool> LinkOptHints;
+extern cl::opt<bool> Relocations;
+extern cl::opt<bool> SectionHeaders;
+extern cl::opt<bool> SectionContents;
+extern cl::opt<bool> SymbolTable;
+extern cl::opt<bool> UnwindInfo;
 
 // Various helper functions.
 bool error(std::error_code ec);
 bool RelocAddressLess(object::RelocationRef a, object::RelocationRef b);
 void DumpBytes(StringRef bytes);
-void DisassembleInputMachO(StringRef Filename);
+void ParseInputMachO(StringRef Filename);
 void printCOFFUnwindInfo(const object::COFFObjectFile* o);
 void printMachOUnwindInfo(const object::MachOObjectFile* o);
 void printMachOExportsTrie(const object::MachOObjectFile* o);
@@ -43,6 +61,15 @@ void printMachOWeakBindTable(const object::MachOObjectFile* o);
 void printELFFileHeader(const object::ObjectFile *o);
 void printCOFFFileHeader(const object::ObjectFile *o);
 void printMachOFileHeader(const object::ObjectFile *o);
+void printExportsTrie(const object::ObjectFile *o);
+void printRebaseTable(const object::ObjectFile *o);
+void printBindTable(const object::ObjectFile *o);
+void printLazyBindTable(const object::ObjectFile *o);
+void printWeakBindTable(const object::ObjectFile *o);
+void PrintRelocations(const object::ObjectFile *o);
+void PrintSectionHeaders(const object::ObjectFile *o);
+void PrintSectionContents(const object::ObjectFile *o);
+void PrintSymbolTable(const object::ObjectFile *o);
 
 } // end namespace llvm
 
diff --git a/tools/llvm-pdbdump/CMakeLists.txt b/tools/llvm-pdbdump/CMakeLists.txt
new file mode 100644
index 0000000..0519bf0
--- /dev/null
+++ b/tools/llvm-pdbdump/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(LLVM_LINK_COMPONENTS
+  Support
+  DebugInfoPDB
+  )
+
+add_llvm_tool(llvm-pdbdump
+  llvm-pdbdump.cpp
+  ClassDefinitionDumper.cpp
+  CompilandDumper.cpp
+  FunctionDumper.cpp
+  TypeDumper.cpp
+  TypedefDumper.cpp
+  VariableDumper.cpp
+  )
diff --git a/tools/llvm-pdbdump/ClassDefinitionDumper.cpp b/tools/llvm-pdbdump/ClassDefinitionDumper.cpp
new file mode 100644
index 0000000..edf6eb4
--- /dev/null
+++ b/tools/llvm-pdbdump/ClassDefinitionDumper.cpp
@@ -0,0 +1,152 @@
+//===- ClassDefinitionDumper.cpp --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ClassDefinitionDumper.h"
+#include "FunctionDumper.h"
+#include "llvm-pdbdump.h"
+#include "TypedefDumper.h"
+#include "VariableDumper.h"
+
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+ClassDefinitionDumper::ClassDefinitionDumper() : PDBSymDumper(true) {}
+
+void ClassDefinitionDumper::start(const PDBSymbolTypeUDT &Class,
+                                  raw_ostream &OS, int Indent) {
+  OS << "class " << Class.getName() << " {";
+  auto Children = Class.findAllChildren();
+  if (Children->getChildCount() == 0) {
+    OS << "}";
+    return;
+  }
+
+  // Try to dump symbols organized by member access level.  Public members
+  // first, then protected, then private.  This might be slow, so it's worth
+  // reconsidering the value of this if performance of large PDBs is a problem.
+  // NOTE: Access level of nested types is not recorded in the PDB, so we have
+  // a special case for them.
+  SymbolGroupByAccess Groups;
+  Groups.insert(std::make_pair(0, SymbolGroup()));
+  Groups.insert(std::make_pair((int)PDB_MemberAccess::Private, SymbolGroup()));
+  Groups.insert(
+      std::make_pair((int)PDB_MemberAccess::Protected, SymbolGroup()));
+  Groups.insert(std::make_pair((int)PDB_MemberAccess::Public, SymbolGroup()));
+
+  while (auto Child = Children->getNext()) {
+    PDB_MemberAccess Access = Child->getRawSymbol().getAccess();
+    if (isa<PDBSymbolTypeBaseClass>(*Child))
+      continue;
+
+    auto &AccessGroup = Groups.find((int)Access)->second;
+
+    if (auto Func = dyn_cast<PDBSymbolFunc>(Child.get())) {
+      if (Func->isCompilerGenerated())
+        continue;
+      if (Func->getLength() == 0 && !Func->isPureVirtual())
+        continue;
+      Child.release();
+      AccessGroup.Functions.push_back(std::unique_ptr<PDBSymbolFunc>(Func));
+    } else if (auto Data = dyn_cast<PDBSymbolData>(Child.get())) {
+      Child.release();
+      AccessGroup.Data.push_back(std::unique_ptr<PDBSymbolData>(Data));
+    } else {
+      AccessGroup.Unknown.push_back(std::move(Child));
+    }
+  }
+
+  int Count = 0;
+  Count += dumpAccessGroup((PDB_MemberAccess)0, Groups[0], OS, Indent);
+  Count += dumpAccessGroup(PDB_MemberAccess::Public,
+                           Groups[(int)PDB_MemberAccess::Public], OS, Indent);
+  Count +=
+      dumpAccessGroup(PDB_MemberAccess::Protected,
+                      Groups[(int)PDB_MemberAccess::Protected], OS, Indent);
+  Count += dumpAccessGroup(PDB_MemberAccess::Private,
+                           Groups[(int)PDB_MemberAccess::Private], OS, Indent);
+
+  if (Count > 0)
+    OS << newline(Indent);
+  OS << "}";
+}
+
+int ClassDefinitionDumper::dumpAccessGroup(PDB_MemberAccess Access,
+                                           const SymbolGroup &Group,
+                                           raw_ostream &OS, int Indent) {
+  if (Group.Functions.empty() && Group.Data.empty() && Group.Unknown.empty())
+    return 0;
+
+  int Count = 0;
+  if (Access == PDB_MemberAccess::Private)
+    OS << newline(Indent) << "private:";
+  else if (Access == PDB_MemberAccess::Protected)
+    OS << newline(Indent) << "protected:";
+  else if (Access == PDB_MemberAccess::Public)
+    OS << newline(Indent) << "public:";
+  for (auto iter = Group.Functions.begin(), end = Group.Functions.end();
+       iter != end; ++iter) {
+    ++Count;
+    (*iter)->dump(OS, Indent + 2, *this);
+  }
+  for (auto iter = Group.Data.begin(), end = Group.Data.end(); iter != end;
+       ++iter) {
+    ++Count;
+    (*iter)->dump(OS, Indent + 2, *this);
+  }
+  for (auto iter = Group.Unknown.begin(), end = Group.Unknown.end();
+       iter != end; ++iter) {
+    ++Count;
+    (*iter)->dump(OS, Indent + 2, *this);
+  }
+  return Count;
+}
+
+void ClassDefinitionDumper::dump(const PDBSymbolTypeBaseClass &Symbol,
+                                 raw_ostream &OS, int Indent) {}
+
+void ClassDefinitionDumper::dump(const PDBSymbolData &Symbol, raw_ostream &OS,
+                                 int Indent) {
+  VariableDumper Dumper;
+  Dumper.start(Symbol, OS, Indent);
+}
+
+void ClassDefinitionDumper::dump(const PDBSymbolFunc &Symbol, raw_ostream &OS,
+                                 int Indent) {
+  FunctionDumper Dumper;
+  Dumper.start(Symbol, FunctionDumper::PointerType::None, OS, Indent);
+}
+
+void ClassDefinitionDumper::dump(const PDBSymbolTypeVTable &Symbol,
+                                 raw_ostream &OS, int Indent) {}
+
+void ClassDefinitionDumper::dump(const PDBSymbolTypeEnum &Symbol,
+                                 raw_ostream &OS, int Indent) {
+  OS << newline(Indent) << "enum " << Symbol.getName();
+}
+
+void ClassDefinitionDumper::dump(const PDBSymbolTypeTypedef &Symbol,
+                                 raw_ostream &OS, int Indent) {
+  OS << newline(Indent);
+  TypedefDumper Dumper;
+  Dumper.start(Symbol, OS, Indent);
+}
+
+void ClassDefinitionDumper::dump(const PDBSymbolTypeUDT &Symbol,
+                                 raw_ostream &OS, int Indent) {}
diff --git a/tools/llvm-pdbdump/ClassDefinitionDumper.h b/tools/llvm-pdbdump/ClassDefinitionDumper.h
new file mode 100644
index 0000000..aaf0376
--- /dev/null
+++ b/tools/llvm-pdbdump/ClassDefinitionDumper.h
@@ -0,0 +1,64 @@
+//===- ClassDefinitionDumper.h - --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMPDBDUMP_CLASSDEFINITIONDUMPER_H
+#define LLVM_TOOLS_LLVMPDBDUMP_CLASSDEFINITIONDUMPER_H
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+
+#include <list>
+#include <memory>
+#include <unordered_map>
+
+namespace llvm {
+
+class ClassDefinitionDumper : public PDBSymDumper {
+public:
+  ClassDefinitionDumper();
+
+  void start(const PDBSymbolTypeUDT &Exe, raw_ostream &OS, int Indent);
+
+  void dump(const PDBSymbolTypeBaseClass &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolData &Symbol, raw_ostream &OS, int Indent) override;
+  void dump(const PDBSymbolTypeEnum &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolFunc &Symbol, raw_ostream &OS, int Indent) override;
+  void dump(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeUDT &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeVTable &Symbol, raw_ostream &OS,
+            int Indent) override;
+
+private:
+  struct SymbolGroup {
+    SymbolGroup() {}
+    SymbolGroup(SymbolGroup &&Other) {
+      Functions = std::move(Other.Functions);
+      Data = std::move(Other.Data);
+      Unknown = std::move(Other.Unknown);
+    }
+
+    std::list<std::unique_ptr<PDBSymbolFunc>> Functions;
+    std::list<std::unique_ptr<PDBSymbolData>> Data;
+    std::list<std::unique_ptr<PDBSymbol>> Unknown;
+    SymbolGroup(const SymbolGroup &other) = delete;
+    SymbolGroup &operator=(const SymbolGroup &other) = delete;
+  };
+  typedef std::unordered_map<int, SymbolGroup> SymbolGroupByAccess;
+
+  int dumpAccessGroup(PDB_MemberAccess Access, const SymbolGroup &Group,
+                      raw_ostream &OS, int Indent);
+};
+}
+
+#endif
diff --git a/tools/llvm-pdbdump/CompilandDumper.cpp b/tools/llvm-pdbdump/CompilandDumper.cpp
new file mode 100644
index 0000000..852ddfa
--- /dev/null
+++ b/tools/llvm-pdbdump/CompilandDumper.cpp
@@ -0,0 +1,117 @@
+//===- CompilandDumper.cpp - llvm-pdbdump compiland symbol dumper *- C++ *-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CompilandDumper.h"
+#include "llvm-pdbdump.h"
+
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolLabel.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolThunk.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "FunctionDumper.h"
+
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+CompilandDumper::CompilandDumper() : PDBSymDumper(true) {}
+
+void CompilandDumper::dump(const PDBSymbolCompilandDetails &Symbol,
+                           raw_ostream &OS, int Indent) {}
+
+void CompilandDumper::dump(const PDBSymbolCompilandEnv &Symbol, raw_ostream &OS,
+                           int Indent) {}
+
+void CompilandDumper::start(const PDBSymbolCompiland &Symbol, raw_ostream &OS,
+                            int Indent, bool Children) {
+  std::string FullName = Symbol.getName();
+  OS << newline(Indent) << FullName;
+  if (!Children)
+    return;
+
+  auto ChildrenEnum = Symbol.findAllChildren();
+  while (auto Child = ChildrenEnum->getNext())
+    Child->dump(OS, Indent + 2, *this);
+}
+
+void CompilandDumper::dump(const PDBSymbolData &Symbol, raw_ostream &OS,
+                           int Indent) {
+  OS << newline(Indent);
+  switch (auto LocType = Symbol.getLocationType()) {
+  case PDB_LocType::Static:
+    OS << "data: [";
+    OS << format_hex(Symbol.getRelativeVirtualAddress(), 10);
+    OS << "]";
+    break;
+  case PDB_LocType::Constant:
+    OS << "constant: [" << Symbol.getValue() << "]";
+    break;
+  default:
+    OS << "data(unexpected type=" << LocType << ")";
+  }
+
+  OS << " " << Symbol.getName();
+}
+
+void CompilandDumper::dump(const PDBSymbolFunc &Symbol, raw_ostream &OS,
+                           int Indent) {
+  if (Symbol.getLength() == 0)
+    return;
+
+  FunctionDumper Dumper;
+  Dumper.start(Symbol, FunctionDumper::PointerType::None, OS, Indent);
+}
+
+void CompilandDumper::dump(const PDBSymbolLabel &Symbol, raw_ostream &OS,
+                           int Indent) {
+  OS << newline(Indent);
+  OS << "label [" << format_hex(Symbol.getRelativeVirtualAddress(), 10) << "] "
+     << Symbol.getName();
+}
+
+void CompilandDumper::dump(const PDBSymbolThunk &Symbol, raw_ostream &OS,
+                           int Indent) {
+  OS << newline(Indent) << "thunk ";
+  PDB_ThunkOrdinal Ordinal = Symbol.getThunkOrdinal();
+  uint32_t RVA = Symbol.getRelativeVirtualAddress();
+  if (Ordinal == PDB_ThunkOrdinal::TrampIncremental) {
+    OS << format_hex(RVA, 10);
+    OS << " -> " << format_hex(Symbol.getTargetRelativeVirtualAddress(), 10);
+  } else {
+    OS << "[" << format_hex(RVA, 10);
+    OS << " - " << format_hex(RVA + Symbol.getLength(), 10) << "]";
+  }
+  OS << " (" << Ordinal << ") ";
+  std::string Name = Symbol.getName();
+  if (!Name.empty())
+    OS << Name;
+}
+
+void CompilandDumper::dump(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+                           int Indent) {}
+
+void CompilandDumper::dump(const PDBSymbolUnknown &Symbol, raw_ostream &OS,
+                           int Indent) {
+  OS << newline(Indent);
+  OS << "unknown (" << Symbol.getSymTag() << ")";
+}
diff --git a/tools/llvm-pdbdump/CompilandDumper.h b/tools/llvm-pdbdump/CompilandDumper.h
new file mode 100644
index 0000000..abcebc2
--- /dev/null
+++ b/tools/llvm-pdbdump/CompilandDumper.h
@@ -0,0 +1,39 @@
+//===- CompilandDumper.h - llvm-pdbdump compiland symbol dumper *- C++ --*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMPDBDUMP_COMPILANDDUMPER_H
+#define LLVM_TOOLS_LLVMPDBDUMP_COMPILANDDUMPER_H
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+namespace llvm {
+
+class CompilandDumper : public PDBSymDumper {
+public:
+  CompilandDumper();
+
+  void start(const PDBSymbolCompiland &Symbol, raw_ostream &OS, int Indent,
+             bool Children);
+
+  void dump(const PDBSymbolCompilandDetails &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolCompilandEnv &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolData &Symbol, raw_ostream &OS, int Indent) override;
+  void dump(const PDBSymbolFunc &Symbol, raw_ostream &OS, int Indent) override;
+  void dump(const PDBSymbolLabel &Symbol, raw_ostream &OS, int Indent) override;
+  void dump(const PDBSymbolThunk &Symbol, raw_ostream &OS, int Indent) override;
+  void dump(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolUnknown &Symbol, raw_ostream &OS,
+            int Indent) override;
+};
+}
+
+#endif
diff --git a/tools/llvm-pdbdump/FunctionDumper.cpp b/tools/llvm-pdbdump/FunctionDumper.cpp
new file mode 100644
index 0000000..e659830
--- /dev/null
+++ b/tools/llvm-pdbdump/FunctionDumper.cpp
@@ -0,0 +1,243 @@
+//===- FunctionDumper.cpp ------------------------------------ *- C++ *-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FunctionDumper.h"
+#include "llvm-pdbdump.h"
+
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+namespace {
+template <class T>
+void dumpClassParentWithScopeOperator(const T &Symbol, llvm::raw_ostream &OS,
+                                      llvm::FunctionDumper &Dumper) {
+  uint32_t ClassParentId = Symbol.getClassParentId();
+  auto ClassParent =
+      Symbol.getSession().template getConcreteSymbolById<PDBSymbolTypeUDT>(
+          ClassParentId);
+  if (!ClassParent)
+    return;
+
+  OS << ClassParent->getName() << "::";
+}
+}
+
+FunctionDumper::FunctionDumper() : PDBSymDumper(true) {}
+
+void FunctionDumper::start(const PDBSymbolTypeFunctionSig &Symbol,
+                           PointerType Pointer, raw_ostream &OS) {
+  auto ReturnType = Symbol.getReturnType();
+  ReturnType->dump(OS, 0, *this);
+  OS << " ";
+  uint32_t ClassParentId = Symbol.getClassParentId();
+  auto ClassParent =
+      Symbol.getSession().getConcreteSymbolById<PDBSymbolTypeUDT>(
+          ClassParentId);
+
+  PDB_CallingConv CC = Symbol.getCallingConvention();
+  bool ShouldDumpCallingConvention = true;
+  if ((ClassParent && CC == PDB_CallingConv::Thiscall) ||
+      (!ClassParent && CC == PDB_CallingConv::NearStdcall)) {
+    ShouldDumpCallingConvention = false;
+  }
+
+  if (Pointer == PointerType::None) {
+    if (ShouldDumpCallingConvention)
+      OS << CC << " ";
+    if (ClassParent)
+      OS << "(" << ClassParent->getName() << "::)";
+  } else {
+    OS << "(";
+    if (ShouldDumpCallingConvention)
+      OS << CC << " ";
+    OS << Symbol.getCallingConvention() << " ";
+    if (ClassParent)
+      OS << ClassParent->getName() << "::";
+    if (Pointer == PointerType::Reference)
+      OS << "&";
+    else
+      OS << "*";
+    OS << ")";
+  }
+
+  OS << "(";
+  if (auto ChildEnum = Symbol.getArguments()) {
+    uint32_t Index = 0;
+    while (auto Arg = ChildEnum->getNext()) {
+      Arg->dump(OS, 0, *this);
+      if (++Index < ChildEnum->getChildCount())
+        OS << ", ";
+    }
+  }
+  OS << ")";
+
+  if (Symbol.isConstType())
+    OS << " const";
+  if (Symbol.isVolatileType())
+    OS << " volatile";
+}
+
+void FunctionDumper::start(const PDBSymbolFunc &Symbol, PointerType Pointer,
+                           raw_ostream &OS, int Indent) {
+  uint32_t FuncStart = Symbol.getRelativeVirtualAddress();
+  uint32_t FuncEnd = FuncStart + Symbol.getLength();
+
+  OS << newline(Indent);
+
+  OS << "func [" << format_hex(FuncStart, 8);
+  if (auto DebugStart = Symbol.findOneChild<PDBSymbolFuncDebugStart>())
+    OS << "+" << DebugStart->getRelativeVirtualAddress() - FuncStart;
+  OS << " - " << format_hex(FuncEnd, 8);
+  if (auto DebugEnd = Symbol.findOneChild<PDBSymbolFuncDebugEnd>())
+    OS << "-" << FuncEnd - DebugEnd->getRelativeVirtualAddress();
+  OS << "] ";
+
+  if (Symbol.hasFramePointer())
+    OS << "(" << Symbol.getLocalBasePointerRegisterId() << ")";
+  else
+    OS << "(FPO)";
+
+  OS << " ";
+  if (Symbol.isVirtual() || Symbol.isPureVirtual())
+    OS << "virtual ";
+
+  auto Signature = Symbol.getSignature();
+  if (!Signature) {
+    OS << Symbol.getName();
+    if (Pointer == PointerType::Pointer)
+      OS << "*";
+    else if (Pointer == FunctionDumper::PointerType::Reference)
+      OS << "&";
+    return;
+  }
+
+  auto ReturnType = Signature->getReturnType();
+  ReturnType->dump(OS, 0, *this);
+  OS << " ";
+
+  auto ClassParent = Symbol.getClassParent();
+  PDB_CallingConv CC = Signature->getCallingConvention();
+  if (Pointer != FunctionDumper::PointerType::None)
+    OS << "(";
+
+  if ((ClassParent && CC != PDB_CallingConv::Thiscall) ||
+      (!ClassParent && CC != PDB_CallingConv::NearStdcall))
+    OS << Signature->getCallingConvention() << " ";
+  OS << Symbol.getName();
+  if (Pointer != FunctionDumper::PointerType::None) {
+    if (Pointer == PointerType::Pointer)
+      OS << "*";
+    else if (Pointer == FunctionDumper::PointerType::Reference)
+      OS << "&";
+    OS << ")";
+  }
+
+  OS << "(";
+  if (auto Arguments = Symbol.getArguments()) {
+    uint32_t Index = 0;
+    while (auto Arg = Arguments->getNext()) {
+      auto ArgType = Arg->getType();
+      ArgType->dump(OS, 0, *this);
+      OS << " " << Arg->getName();
+      if (++Index < Arguments->getChildCount())
+        OS << ", ";
+    }
+  }
+  OS << ")";
+  if (Symbol.isConstType())
+    OS << " const";
+  if (Symbol.isVolatileType())
+    OS << " volatile";
+  if (Symbol.isPureVirtual())
+    OS << " = 0";
+}
+
+void FunctionDumper::dump(const PDBSymbolTypeArray &Symbol, raw_ostream &OS,
+                          int Indent) {
+  uint32_t ElementTypeId = Symbol.getTypeId();
+  auto ElementType = Symbol.getSession().getSymbolById(ElementTypeId);
+  if (!ElementType)
+    return;
+
+  ElementType->dump(OS, 0, *this);
+  OS << "[" << Symbol.getLength() << "]";
+}
+
+void FunctionDumper::dump(const PDBSymbolTypeBuiltin &Symbol, raw_ostream &OS,
+                          int Indent) {
+  PDB_BuiltinType Type = Symbol.getBuiltinType();
+  OS << Type;
+  if (Type == PDB_BuiltinType::UInt || Type == PDB_BuiltinType::Int)
+    OS << (8 * Symbol.getLength()) << "_t";
+}
+
+void FunctionDumper::dump(const PDBSymbolTypeEnum &Symbol, raw_ostream &OS,
+                          int Indent) {
+  dumpClassParentWithScopeOperator(Symbol, OS, *this);
+  OS << Symbol.getName();
+}
+
+void FunctionDumper::dump(const PDBSymbolTypeFunctionArg &Symbol,
+                          raw_ostream &OS, int Indent) {
+  // PDBSymbolTypeFunctionArg is just a shim over the real argument.  Just drill
+  // through to the real thing and dump it.
+  Symbol.defaultDump(OS, Indent);
+  uint32_t TypeId = Symbol.getTypeId();
+  auto Type = Symbol.getSession().getSymbolById(TypeId);
+  if (!Type)
+    return;
+  Type->dump(OS, 0, *this);
+}
+
+void FunctionDumper::dump(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+                          int Indent) {
+  dumpClassParentWithScopeOperator(Symbol, OS, *this);
+  OS << Symbol.getName();
+}
+
+void FunctionDumper::dump(const PDBSymbolTypePointer &Symbol, raw_ostream &OS,
+                          int Indent) {
+  uint32_t PointeeId = Symbol.getTypeId();
+  auto PointeeType = Symbol.getSession().getSymbolById(PointeeId);
+  if (!PointeeType)
+    return;
+
+  if (auto FuncSig = dyn_cast<PDBSymbolTypeFunctionSig>(PointeeType.get())) {
+    FunctionDumper NestedDumper;
+    PointerType Pointer =
+        Symbol.isReference() ? PointerType::Reference : PointerType::Pointer;
+    NestedDumper.start(*FuncSig, Pointer, OS);
+  } else {
+    if (Symbol.isConstType())
+      OS << "const ";
+    if (Symbol.isVolatileType())
+      OS << "volatile ";
+    PointeeType->dump(OS, Indent, *this);
+    OS << (Symbol.isReference() ? "&" : "*");
+  }
+}
+
+void FunctionDumper::dump(const PDBSymbolTypeUDT &Symbol, raw_ostream &OS,
+                          int Indent) {
+  OS << Symbol.getName();
+}
diff --git a/tools/llvm-pdbdump/FunctionDumper.h b/tools/llvm-pdbdump/FunctionDumper.h
new file mode 100644
index 0000000..f9338cb
--- /dev/null
+++ b/tools/llvm-pdbdump/FunctionDumper.h
@@ -0,0 +1,45 @@
+//===- FunctionDumper.h --------------------------------------- *- C++ --*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMPDBDUMP_FUNCTIONDUMPER_H
+#define LLVM_TOOLS_LLVMPDBDUMP_FUNCTIONDUMPER_H
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+namespace llvm {
+
+class FunctionDumper : public PDBSymDumper {
+public:
+  FunctionDumper();
+
+  enum class PointerType { None, Pointer, Reference };
+
+  void start(const PDBSymbolTypeFunctionSig &Symbol, PointerType Pointer,
+             raw_ostream &OS);
+  void start(const PDBSymbolFunc &Symbol, PointerType Pointer, raw_ostream &OS,
+             int Indent);
+
+  void dump(const PDBSymbolTypeArray &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeBuiltin &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeEnum &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeFunctionArg &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypePointer &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeUDT &Symbol, raw_ostream &OS,
+            int Indent) override;
+};
+}
+
+#endif
diff --git a/tools/llvm-pdbdump/LLVMBuild.txt b/tools/llvm-pdbdump/LLVMBuild.txt
new file mode 100644
index 0000000..4877689
--- /dev/null
+++ b/tools/llvm-pdbdump/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./tools/llvm-pdbdump/LLVMBuild.txt -----------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Tool
+name = llvm-pdbdump
+parent = Tools
+required_libraries = DebugInfoPDB
+
diff --git a/tools/llvm-pdbdump/Makefile b/tools/llvm-pdbdump/Makefile
new file mode 100644
index 0000000..18aafc4
--- /dev/null
+++ b/tools/llvm-pdbdump/Makefile
@@ -0,0 +1,17 @@
+##===- tools/llvm-pdbdump/Makefile -------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../..
+TOOLNAME := llvm-pdbdump
+LINK_COMPONENTS := DebugInfoPDB Object
+
+# This tool has no plugins, optimize startup time.
+TOOL_NO_EXPORTS := 1
+
+include $(LEVEL)/Makefile.common
diff --git a/tools/llvm-pdbdump/TypeDumper.cpp b/tools/llvm-pdbdump/TypeDumper.cpp
new file mode 100644
index 0000000..3131e9f
--- /dev/null
+++ b/tools/llvm-pdbdump/TypeDumper.cpp
@@ -0,0 +1,96 @@
+//===- TypeDumper.cpp - PDBSymDumper implementation for types *----- C++ *-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TypeDumper.h"
+
+#include "ClassDefinitionDumper.h"
+#include "FunctionDumper.h"
+#include "llvm-pdbdump.h"
+#include "TypedefDumper.h"
+
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+
+using namespace llvm;
+
+TypeDumper::TypeDumper(bool Inline, bool ClassDefs)
+    : PDBSymDumper(true), InlineDump(Inline), FullClassDefs(ClassDefs) {}
+
+void TypeDumper::start(const PDBSymbolExe &Exe, raw_ostream &OS, int Indent) {
+  auto Enums = Exe.findAllChildren<PDBSymbolTypeEnum>();
+  OS << newline(Indent) << "Enums: (" << Enums->getChildCount() << " items)";
+  while (auto Enum = Enums->getNext())
+    Enum->dump(OS, Indent + 2, *this);
+
+  auto FuncSigs = Exe.findAllChildren<PDBSymbolTypeFunctionSig>();
+  OS << newline(Indent);
+  OS << "Function Signatures: (" << FuncSigs->getChildCount() << " items)";
+  while (auto Sig = FuncSigs->getNext())
+    Sig->dump(OS, Indent + 2, *this);
+
+  auto Typedefs = Exe.findAllChildren<PDBSymbolTypeTypedef>();
+  OS << newline(Indent) << "Typedefs: (" << Typedefs->getChildCount()
+     << " items)";
+  while (auto Typedef = Typedefs->getNext())
+    Typedef->dump(OS, Indent + 2, *this);
+
+  auto Classes = Exe.findAllChildren<PDBSymbolTypeUDT>();
+  OS << newline(Indent) << "Classes: (" << Classes->getChildCount()
+     << " items)";
+  while (auto Class = Classes->getNext())
+    Class->dump(OS, Indent + 2, *this);
+}
+
+void TypeDumper::dump(const PDBSymbolTypeEnum &Symbol, raw_ostream &OS,
+                      int Indent) {
+  if (Symbol.getUnmodifiedTypeId() != 0)
+    return;
+
+  if (!InlineDump)
+    OS << newline(Indent);
+
+  OS << "enum " << Symbol.getName();
+}
+
+void TypeDumper::dump(const PDBSymbolTypeFunctionSig &Symbol, raw_ostream &OS,
+                      int Indent) {
+  if (!InlineDump)
+    OS << newline(Indent);
+
+  FunctionDumper Dumper;
+  Dumper.start(Symbol, FunctionDumper::PointerType::None, OS);
+}
+
+void TypeDumper::dump(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+                      int Indent) {
+  if (!InlineDump)
+    OS << newline(Indent);
+
+  TypedefDumper Dumper;
+  Dumper.start(Symbol, OS, Indent);
+}
+
+void TypeDumper::dump(const PDBSymbolTypeUDT &Symbol, raw_ostream &OS,
+                      int Indent) {
+  if (Symbol.getUnmodifiedTypeId() != 0)
+    return;
+  if (!InlineDump)
+    OS << newline(Indent);
+
+  if (FullClassDefs) {
+    ClassDefinitionDumper Dumper;
+    Dumper.start(Symbol, OS, Indent);
+  } else {
+    OS << "class " << Symbol.getName();
+  }
+}
diff --git a/tools/llvm-pdbdump/TypeDumper.h b/tools/llvm-pdbdump/TypeDumper.h
new file mode 100644
index 0000000..d96c24c
--- /dev/null
+++ b/tools/llvm-pdbdump/TypeDumper.h
@@ -0,0 +1,38 @@
+//===- TypeDumper.h - PDBSymDumper implementation for types *- C++ ------*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMPDBDUMP_TYPEDUMPER_H
+#define LLVM_TOOLS_LLVMPDBDUMP_TYPEDUMPER_H
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+namespace llvm {
+
+class TypeDumper : public PDBSymDumper {
+public:
+  TypeDumper(bool Inline, bool ClassDefs);
+
+  void start(const PDBSymbolExe &Exe, raw_ostream &OS, int Indent);
+
+  void dump(const PDBSymbolTypeEnum &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeFunctionSig &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeUDT &Symbol, raw_ostream &OS,
+            int Indent) override;
+
+private:
+  bool InlineDump;
+  bool FullClassDefs;
+};
+}
+
+#endif
diff --git a/tools/llvm-pdbdump/TypedefDumper.cpp b/tools/llvm-pdbdump/TypedefDumper.cpp
new file mode 100644
index 0000000..6eea6b6
--- /dev/null
+++ b/tools/llvm-pdbdump/TypedefDumper.cpp
@@ -0,0 +1,84 @@
+//===- TypedefDumper.cpp - PDBSymDumper impl for typedefs -------- * C++ *-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TypedefDumper.h"
+
+#include "FunctionDumper.h"
+#include "llvm-pdbdump.h"
+
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+
+using namespace llvm;
+
+TypedefDumper::TypedefDumper() : PDBSymDumper(true) {}
+
+void TypedefDumper::start(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+                          int Indent) {
+  OS << "typedef ";
+  uint32_t TargetId = Symbol.getTypeId();
+  if (auto TypeSymbol = Symbol.getSession().getSymbolById(TargetId))
+    TypeSymbol->dump(OS, 0, *this);
+  OS << " " << Symbol.getName();
+}
+
+void TypedefDumper::dump(const PDBSymbolTypeArray &Symbol, raw_ostream &OS,
+                         int Indent) {}
+
+void TypedefDumper::dump(const PDBSymbolTypeBuiltin &Symbol, raw_ostream &OS,
+                         int Indent) {
+  PDB_BuiltinType Type = Symbol.getBuiltinType();
+  OS << Type;
+  if (Type == PDB_BuiltinType::UInt || Type == PDB_BuiltinType::Int)
+    OS << (8 * Symbol.getLength()) << "_t";
+}
+
+void TypedefDumper::dump(const PDBSymbolTypeEnum &Symbol, raw_ostream &OS,
+                         int Indent) {
+  OS << "enum " << Symbol.getName();
+}
+
+void TypedefDumper::dump(const PDBSymbolTypePointer &Symbol, raw_ostream &OS,
+                         int Indent) {
+  if (Symbol.isConstType())
+    OS << "const ";
+  if (Symbol.isVolatileType())
+    OS << "volatile ";
+  uint32_t PointeeId = Symbol.getTypeId();
+  auto PointeeType = Symbol.getSession().getSymbolById(PointeeId);
+  if (!PointeeType)
+    return;
+  if (auto FuncSig = dyn_cast<PDBSymbolTypeFunctionSig>(PointeeType.get())) {
+    FunctionDumper::PointerType Pointer = FunctionDumper::PointerType::Pointer;
+    if (Symbol.isReference())
+      Pointer = FunctionDumper::PointerType::Reference;
+    FunctionDumper NestedDumper;
+    NestedDumper.start(*FuncSig, Pointer, OS);
+  } else {
+    PointeeType->dump(OS, Indent, *this);
+    OS << ((Symbol.isReference()) ? "&" : "*");
+  }
+}
+
+void TypedefDumper::dump(const PDBSymbolTypeFunctionSig &Symbol,
+                         raw_ostream &OS, int Indent) {
+  FunctionDumper Dumper;
+  Dumper.start(Symbol, FunctionDumper::PointerType::None, OS);
+}
+
+void TypedefDumper::dump(const PDBSymbolTypeUDT &Symbol, raw_ostream &OS,
+                         int Indent) {
+  OS << "class " << Symbol.getName();
+}
diff --git a/tools/llvm-pdbdump/TypedefDumper.h b/tools/llvm-pdbdump/TypedefDumper.h
new file mode 100644
index 0000000..e6211a8
--- /dev/null
+++ b/tools/llvm-pdbdump/TypedefDumper.h
@@ -0,0 +1,38 @@
+//===- TypedefDumper.h - llvm-pdbdump typedef dumper ---------*- C++ ----*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMPDBDUMP_TYPEDEFDUMPER_H
+#define LLVM_TOOLS_LLVMPDBDUMP_TYPEDEFDUMPER_H
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+
+namespace llvm {
+
+class TypedefDumper : public PDBSymDumper {
+public:
+  TypedefDumper();
+
+  void start(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS, int Indent);
+
+  void dump(const PDBSymbolTypeArray &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeBuiltin &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeEnum &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeFunctionSig &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypePointer &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeUDT &Symbol, raw_ostream &OS,
+            int Indent) override;
+};
+}
+
+#endif
diff --git a/tools/llvm-pdbdump/VariableDumper.cpp b/tools/llvm-pdbdump/VariableDumper.cpp
new file mode 100644
index 0000000..913cfee
--- /dev/null
+++ b/tools/llvm-pdbdump/VariableDumper.cpp
@@ -0,0 +1,120 @@
+//===- VariableDumper.cpp - -------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "VariableDumper.h"
+
+#include "llvm-pdbdump.h"
+#include "FunctionDumper.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+VariableDumper::VariableDumper() : PDBSymDumper(true) {}
+
+void VariableDumper::start(const PDBSymbolData &Var, raw_ostream &OS,
+                           int Indent) {
+  OS << newline(Indent);
+  OS << "data ";
+
+  auto VarType = Var.getType();
+
+  switch (auto LocType = Var.getLocationType()) {
+  case PDB_LocType::Static:
+    OS << "[" << format_hex(Var.getRelativeVirtualAddress(), 10) << "] ";
+    OS << "static ";
+    dumpSymbolTypeAndName(*VarType, Var.getName(), OS);
+    break;
+  case PDB_LocType::Constant:
+    OS << "const ";
+    dumpSymbolTypeAndName(*VarType, Var.getName(), OS);
+    OS << "[" << Var.getValue() << "]";
+    break;
+  case PDB_LocType::ThisRel:
+    OS << "+" << format_hex(Var.getOffset(), 4) << " ";
+    dumpSymbolTypeAndName(*VarType, Var.getName(), OS);
+    break;
+  default:
+    break;
+    OS << "unknown(" << LocType << ") " << Var.getName();
+  }
+}
+
+void VariableDumper::dump(const PDBSymbolTypeBuiltin &Symbol, raw_ostream &OS,
+                          int Indent) {
+  OS << Symbol.getBuiltinType();
+}
+
+void VariableDumper::dump(const PDBSymbolTypeEnum &Symbol, raw_ostream &OS,
+                          int Indent) {
+  OS << Symbol.getName();
+}
+
+void VariableDumper::dump(const PDBSymbolTypeFunctionSig &Symbol,
+                          raw_ostream &OS, int Indent) {}
+
+void VariableDumper::dump(const PDBSymbolTypePointer &Symbol, raw_ostream &OS,
+                          int Indent) {
+  auto PointeeType = Symbol.getPointeeType();
+  if (!PointeeType)
+    return;
+
+  if (auto Func = dyn_cast<PDBSymbolFunc>(PointeeType.get())) {
+    FunctionDumper NestedDumper;
+    FunctionDumper::PointerType Pointer =
+        Symbol.isReference() ? FunctionDumper::PointerType::Reference
+                             : FunctionDumper::PointerType::Pointer;
+    NestedDumper.start(*Func, Pointer, OS, Indent);
+  } else {
+    if (Symbol.isConstType())
+      OS << "const ";
+    if (Symbol.isVolatileType())
+      OS << "volatile ";
+    PointeeType->dump(OS, Indent, *this);
+    OS << (Symbol.isReference() ? "&" : "*");
+  }
+}
+
+void VariableDumper::dump(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+                          int Indent) {
+  OS << "typedef " << Symbol.getName();
+}
+
+void VariableDumper::dump(const PDBSymbolTypeUDT &Symbol, raw_ostream &OS,
+                          int Indent) {
+  OS << Symbol.getName();
+}
+
+void VariableDumper::dumpSymbolTypeAndName(const PDBSymbol &Type,
+                                           StringRef Name, raw_ostream &OS) {
+  if (auto *ArrayType = dyn_cast<PDBSymbolTypeArray>(&Type)) {
+    std::string IndexSpec;
+    raw_string_ostream IndexStream(IndexSpec);
+    std::unique_ptr<PDBSymbol> ElementType = ArrayType->getElementType();
+    while (auto NestedArray = dyn_cast<PDBSymbolTypeArray>(ElementType.get())) {
+      IndexStream << "[" << NestedArray->getCount() << "]";
+      ElementType = NestedArray->getElementType();
+    }
+    IndexStream << "[" << ArrayType->getCount() << "]";
+    ElementType->dump(OS, 0, *this);
+    OS << " " << Name << IndexStream.str();
+  } else {
+    Type.dump(OS, 0, *this);
+    OS << " " << Name;
+  }
+}
diff --git a/tools/llvm-pdbdump/VariableDumper.h b/tools/llvm-pdbdump/VariableDumper.h
new file mode 100644
index 0000000..e6e71fa
--- /dev/null
+++ b/tools/llvm-pdbdump/VariableDumper.h
@@ -0,0 +1,43 @@
+//===- VariableDumper.h - PDBSymDumper implementation for types -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMPDBDUMP_VARIABLEDUMPER_H
+#define LLVM_TOOLS_LLVMPDBDUMP_VARIABLEDUMPER_H
+
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+
+class VariableDumper : public PDBSymDumper {
+public:
+  VariableDumper();
+
+  void start(const PDBSymbolData &Var, raw_ostream &OS, int Indent);
+
+  void dump(const PDBSymbolTypeBuiltin &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeEnum &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeFunctionSig &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypePointer &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeTypedef &Symbol, raw_ostream &OS,
+            int Indent) override;
+  void dump(const PDBSymbolTypeUDT &Symbol, raw_ostream &OS,
+            int Indent) override;
+
+private:
+  void dumpSymbolTypeAndName(const PDBSymbol &Type, StringRef Name,
+                             raw_ostream &OS);
+};
+}
+
+#endif
diff --git a/tools/llvm-pdbdump/llvm-pdbdump.cpp b/tools/llvm-pdbdump/llvm-pdbdump.cpp
new file mode 100644
index 0000000..e33e715
--- /dev/null
+++ b/tools/llvm-pdbdump/llvm-pdbdump.cpp
@@ -0,0 +1,134 @@
+//===- llvm-pdbdump.cpp - Dump debug info from a PDB file -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Dumps debug information present in PDB files.  This utility makes use of
+// the Microsoft Windows SDK, so will not compile or run on non-Windows
+// platforms.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-pdbdump.h"
+#include "CompilandDumper.h"
+#include "TypeDumper.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/config.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDB.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Signals.h"
+
+#if defined(HAVE_DIA_SDK)
+#include <Windows.h>
+#endif
+
+using namespace llvm;
+
+namespace opts {
+
+enum class PDB_DumpType { ByType, ByObjFile, Both };
+
+cl::list<std::string> InputFilenames(cl::Positional,
+                                     cl::desc("<input PDB files>"),
+                                     cl::OneOrMore);
+
+cl::opt<bool> DumpCompilands("compilands", cl::desc("Display compilands"));
+cl::opt<bool> DumpSymbols("symbols",
+                          cl::desc("Display symbols (implies --compilands"));
+cl::opt<bool> DumpTypes("types", cl::desc("Display types"));
+cl::opt<bool> DumpClassDefs("class-definitions",
+                            cl::desc("Display full class definitions"));
+}
+
+static void dumpInput(StringRef Path) {
+  std::unique_ptr<IPDBSession> Session(
+      llvm::createPDBReader(PDB_ReaderType::DIA, Path));
+  if (!Session) {
+    outs() << "Unable to create PDB reader.  Check that a valid implementation";
+    outs() << " is available for your platform.";
+    return;
+  }
+
+  auto GlobalScope(Session->getGlobalScope());
+  std::string FileName(GlobalScope->getSymbolsFileName());
+
+  outs() << "Summary for " << FileName;
+  uint64_t FileSize = 0;
+  if (!llvm::sys::fs::file_size(FileName, FileSize))
+    outs() << newline(2) << "Size: " << FileSize << " bytes";
+  else
+    outs() << newline(2) << "Size: (Unable to obtain file size)";
+
+  outs() << newline(2) << "Guid: " << GlobalScope->getGuid();
+  outs() << newline(2) << "Age: " << GlobalScope->getAge();
+  outs() << newline(2) << "Attributes: ";
+  if (GlobalScope->hasCTypes())
+    outs() << "HasCTypes ";
+  if (GlobalScope->hasPrivateSymbols())
+    outs() << "HasPrivateSymbols ";
+
+  if (opts::DumpTypes) {
+    outs() << "\nDumping types";
+    TypeDumper Dumper(false, opts::DumpClassDefs);
+    Dumper.start(*GlobalScope, outs(), 2);
+  }
+
+  if (opts::DumpSymbols || opts::DumpCompilands) {
+    outs() << "\nDumping compilands";
+    auto Compilands = GlobalScope->findAllChildren<PDBSymbolCompiland>();
+    CompilandDumper Dumper;
+    while (auto Compiland = Compilands->getNext())
+      Dumper.start(*Compiland, outs(), 2, opts::DumpSymbols);
+  }
+  outs().flush();
+}
+
+int main(int argc_, const char *argv_[]) {
+  // Print a stack trace if we signal out.
+  sys::PrintStackTraceOnErrorSignal();
+  PrettyStackTraceProgram X(argc_, argv_);
+
+  SmallVector<const char *, 256> argv;
+  llvm::SpecificBumpPtrAllocator<char> ArgAllocator;
+  std::error_code EC = llvm::sys::Process::GetArgumentVector(
+      argv, llvm::makeArrayRef(argv_, argc_), ArgAllocator);
+  if (EC) {
+    llvm::errs() << "error: couldn't get arguments: " << EC.message() << '\n';
+    return 1;
+  }
+
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
+  cl::ParseCommandLineOptions(argv.size(), argv.data(), "LLVM PDB Dumper\n");
+
+#if defined(HAVE_DIA_SDK)
+  CoInitializeEx(nullptr, COINIT_MULTITHREADED);
+#endif
+
+  std::for_each(opts::InputFilenames.begin(), opts::InputFilenames.end(),
+                dumpInput);
+
+#if defined(HAVE_DIA_SDK)
+  CoUninitialize();
+#endif
+
+  return 0;
+}
diff --git a/tools/llvm-pdbdump/llvm-pdbdump.h b/tools/llvm-pdbdump/llvm-pdbdump.h
new file mode 100644
index 0000000..74a1718
--- /dev/null
+++ b/tools/llvm-pdbdump/llvm-pdbdump.h
@@ -0,0 +1,28 @@
+//===- llvm-pdbdump.h ----------------------------------------- *- C++ --*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMPDBDUMP_LLVMPDBDUMP_H
+#define LLVM_TOOLS_LLVMPDBDUMP_LLVMPDBDUMP_H
+
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+struct newline {
+  newline(int IndentWidth) : Width(IndentWidth) {}
+  int Width;
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const newline &Indent) {
+  OS << "\n";
+  OS.indent(Indent.Width);
+  return OS;
+}
+}
+
+#endif
+\ No newline at end of file
diff --git a/tools/llvm-profdata/llvm-profdata.cpp b/tools/llvm-profdata/llvm-profdata.cpp
index e977799..0137e35 100644
--- a/tools/llvm-profdata/llvm-profdata.cpp
+++ b/tools/llvm-profdata/llvm-profdata.cpp
@@ -12,11 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/ProfileData/InstrProfReader.h"
 #include "llvm/ProfileData/InstrProfWriter.h"
 #include "llvm/ProfileData/SampleProfReader.h"
 #include "llvm/ProfileData/SampleProfWriter.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
@@ -38,7 +38,8 @@ static void exitWithError(const Twine &Message, StringRef Whence = "") {
 
 enum ProfileKinds { instr, sample };
 
-void mergeInstrProfile(cl::list<std::string> Inputs, StringRef OutputFilename) {
+void mergeInstrProfile(const cl::list<std::string> &Inputs,
+                       StringRef OutputFilename) {
   if (OutputFilename.compare("-") == 0)
     exitWithError("Cannot write indexed profdata format to stdout.");
 
@@ -64,7 +65,8 @@ void mergeInstrProfile(cl::list<std::string> Inputs, StringRef OutputFilename) {
   Writer.write(Output);
 }
 
-void mergeSampleProfile(cl::list<std::string> Inputs, StringRef OutputFilename,
+void mergeSampleProfile(const cl::list<std::string> &Inputs,
+                        StringRef OutputFilename,
                         sampleprof::SampleProfileFormat OutputFormat) {
   using namespace sampleprof;
   auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat);
diff --git a/tools/llvm-readobj/ARMAttributeParser.cpp b/tools/llvm-readobj/ARMAttributeParser.cpp
index d35cd14..e2d7191 100644
--- a/tools/llvm-readobj/ARMAttributeParser.cpp
+++ b/tools/llvm-readobj/ARMAttributeParser.cpp
@@ -141,7 +141,7 @@ void ARMAttributeParser::CPU_arch_profile(AttrType Tag, const uint8_t *Data,
   case 'R': Profile = "Real-time"; break;
   case 'M': Profile = "Microcontroller"; break;
   case 'S': Profile = "Classic"; break;
-  case '0': Profile = "None"; break;
+  case 0: Profile = "None"; break;
   }
 
   PrintAttribute(Tag, Encoded, Profile);
diff --git a/tools/llvm-readobj/ARMWinEHPrinter.cpp b/tools/llvm-readobj/ARMWinEHPrinter.cpp
index ede36d1..62252fc 100644
--- a/tools/llvm-readobj/ARMWinEHPrinter.cpp
+++ b/tools/llvm-readobj/ARMWinEHPrinter.cpp
@@ -64,8 +64,8 @@
 
 #include "ARMWinEHPrinter.h"
 #include "Error.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/ARMWinEH.h"
 #include "llvm/Support/Format.h"
 
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index 5276428..1412111 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -57,6 +57,7 @@ public:
   void printDynamicSymbols() override;
   void printUnwindInfo() override;
   void printCOFFImports() override;
+  void printCOFFExports() override;
   void printCOFFDirectives() override;
   void printCOFFBaseReloc() override;
 
@@ -70,7 +71,7 @@ private:
   void printBaseOfDataField(const pe32_header *Hdr);
   void printBaseOfDataField(const pe32plus_header *Hdr);
 
-  void printCodeViewLineTables(const SectionRef &Section);
+  void printCodeViewDebugInfo(const SectionRef &Section);
 
   void printCodeViewSymbolsSubsection(StringRef Subsection,
                                       const SectionRef &Section,
@@ -468,7 +469,7 @@ void COFFDumper::printBaseOfDataField(const pe32_header *Hdr) {
 
 void COFFDumper::printBaseOfDataField(const pe32plus_header *) {}
 
-void COFFDumper::printCodeViewLineTables(const SectionRef &Section) {
+void COFFDumper::printCodeViewDebugInfo(const SectionRef &Section) {
   StringRef Data;
   if (error(Section.getContents(Data)))
     return;
@@ -476,7 +477,7 @@ void COFFDumper::printCodeViewLineTables(const SectionRef &Section) {
   SmallVector<StringRef, 10> FunctionNames;
   StringMap<StringRef> FunctionLineTables;
 
-  ListScope D(W, "CodeViewLineTables");
+  ListScope D(W, "CodeViewDebugInfo");
   {
     // FIXME: Add more offset correctness checks.
     DataExtractor DE(Data, true, 4);
@@ -502,14 +503,17 @@ void COFFDumper::printCodeViewLineTables(const SectionRef &Section) {
         return;
       }
 
-      // Print the raw contents to simplify debugging if anything goes wrong
-      // afterwards.
       StringRef Contents = Data.substr(Offset, PayloadSize);
-      W.printBinaryBlock("Contents", Contents);
+      if (opts::CodeViewSubsectionBytes) {
+        // Print the raw contents to simplify debugging if anything goes wrong
+        // afterwards.
+        W.printBinaryBlock("Contents", Contents);
+      }
 
       switch (SubSectionType) {
       case COFF::DEBUG_SYMBOL_SUBSECTION:
-        printCodeViewSymbolsSubsection(Contents, Section, Offset);
+        if (opts::SectionSymbols)
+          printCodeViewSymbolsSubsection(Contents, Section, Offset);
         break;
       case COFF::DEBUG_LINE_TABLE_SUBSECTION: {
         // Holds a PC to file:line table.  Some data to parse this subsection is
@@ -694,10 +698,20 @@ void COFFDumper::printCodeViewSymbolsSubsection(StringRef Subsection,
       InFunctionScope = false;
       break;
     }
-    default:
+    default: {
+      if (opts::CodeViewSubsectionBytes) {
+        ListScope S(W, "Record");
+        W.printHex("Size", Size);
+        W.printHex("Type", Type);
+
+        StringRef Contents = DE.getData().substr(Offset, Size);
+        W.printBinaryBlock("Contents", Contents);
+      }
+
       Offset += Size;
       break;
     }
+    }
   }
 
   if (InFunctionScope)
@@ -746,8 +760,8 @@ void COFFDumper::printSections() {
       }
     }
 
-    if (Name == ".debug$S" && opts::CodeViewLineTables)
-      printCodeViewLineTables(Sec);
+    if (Name == ".debug$S" && opts::CodeView)
+      printCodeViewDebugInfo(Sec);
 
     if (opts::SectionData &&
         !(Section->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)) {
@@ -1062,6 +1076,26 @@ void COFFDumper::printCOFFImports() {
   }
 }
 
+void COFFDumper::printCOFFExports() {
+  for (const ExportDirectoryEntryRef &E : Obj->export_directories()) {
+    DictScope Export(W, "Export");
+
+    StringRef Name;
+    uint32_t Ordinal, RVA;
+
+    if (error(E.getSymbolName(Name)))
+      continue;
+    if (error(E.getOrdinal(Ordinal)))
+      continue;
+    if (error(E.getExportRVA(RVA)))
+      continue;
+
+    W.printNumber("Ordinal", Ordinal);
+    W.printString("Name", Name);
+    W.printHex("RVA", RVA);
+  }
+}
+
 void COFFDumper::printCOFFDirectives() {
   for (const SectionRef &Section : Obj->sections()) {
     StringRef Contents;
@@ -1086,6 +1120,7 @@ static StringRef getBaseRelocTypeName(uint8_t Type) {
   case COFF::IMAGE_REL_BASED_LOW: return "LOW";
   case COFF::IMAGE_REL_BASED_HIGHLOW: return "HIGHLOW";
   case COFF::IMAGE_REL_BASED_HIGHADJ: return "HIGHADJ";
+  case COFF::IMAGE_REL_BASED_ARM_MOV32T: return "ARM_MOV32(T)";
   case COFF::IMAGE_REL_BASED_DIR64: return "DIR64";
   default: return "unknown (" + llvm::utostr(Type) + ")";
   }
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index d68c786..e4b7601 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -372,9 +372,10 @@ static const EnumEntry<unsigned> ElfMachineType[] = {
 };
 
 static const EnumEntry<unsigned> ElfSymbolBindings[] = {
-  { "Local",  ELF::STB_LOCAL  },
-  { "Global", ELF::STB_GLOBAL },
-  { "Weak",   ELF::STB_WEAK   }
+  { "Local",  ELF::STB_LOCAL        },
+  { "Global", ELF::STB_GLOBAL       },
+  { "Weak",   ELF::STB_WEAK         },
+  { "Unique", ELF::STB_GNU_UNIQUE   }
 };
 
 static const EnumEntry<unsigned> ElfSymbolTypes[] = {
diff --git a/tools/llvm-readobj/ObjDumper.h b/tools/llvm-readobj/ObjDumper.h
index a34e091..27e658f 100644
--- a/tools/llvm-readobj/ObjDumper.h
+++ b/tools/llvm-readobj/ObjDumper.h
@@ -45,6 +45,7 @@ public:
 
   // Only implemented for PE/COFF.
   virtual void printCOFFImports() { }
+  virtual void printCOFFExports() { }
   virtual void printCOFFDirectives() { }
   virtual void printCOFFBaseReloc() { }
 
diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp
index d08f186..f8f3086 100644
--- a/tools/llvm-readobj/llvm-readobj.cpp
+++ b/tools/llvm-readobj/llvm-readobj.cpp
@@ -127,9 +127,14 @@ namespace opts {
   cl::opt<bool> ExpandRelocs("expand-relocs",
     cl::desc("Expand each shown relocation to multiple lines"));
 
-  // -codeview-linetables
-  cl::opt<bool> CodeViewLineTables("codeview-linetables",
-    cl::desc("Display CodeView line table information"));
+  // -codeview
+  cl::opt<bool> CodeView("codeview",
+                         cl::desc("Display CodeView debug information"));
+
+  // -codeview-subsection-bytes
+  cl::opt<bool> CodeViewSubsectionBytes(
+      "codeview-subsection-bytes",
+      cl::desc("Dump raw contents of codeview debug sections and records"));
 
   // -arm-attributes, -a
   cl::opt<bool> ARMAttributes("arm-attributes",
@@ -146,6 +151,10 @@ namespace opts {
   cl::opt<bool>
   COFFImports("coff-imports", cl::desc("Display the PE/COFF import table"));
 
+  // -coff-exports
+  cl::opt<bool>
+  COFFExports("coff-exports", cl::desc("Display the PE/COFF export table"));
+
   // -coff-directives
   cl::opt<bool>
   COFFDirectives("coff-directives",
@@ -282,6 +291,8 @@ static void dumpObject(const ObjectFile *Obj) {
       Dumper->printMipsPLTGOT();
   if (opts::COFFImports)
     Dumper->printCOFFImports();
+  if (opts::COFFExports)
+    Dumper->printCOFFExports();
   if (opts::COFFDirectives)
     Dumper->printCOFFDirectives();
   if (opts::COFFBaseRelocs)
diff --git a/tools/llvm-readobj/llvm-readobj.h b/tools/llvm-readobj/llvm-readobj.h
index 1c33417..74b9a60 100644
--- a/tools/llvm-readobj/llvm-readobj.h
+++ b/tools/llvm-readobj/llvm-readobj.h
@@ -36,7 +36,8 @@ namespace opts {
   extern llvm::cl::opt<bool> DynamicSymbols;
   extern llvm::cl::opt<bool> UnwindInfo;
   extern llvm::cl::opt<bool> ExpandRelocs;
-  extern llvm::cl::opt<bool> CodeViewLineTables;
+  extern llvm::cl::opt<bool> CodeView;
+  extern llvm::cl::opt<bool> CodeViewSubsectionBytes;
   extern llvm::cl::opt<bool> ARMAttributes;
   extern llvm::cl::opt<bool> MipsPLTGOT;
 } // namespace opts
diff --git a/tools/llvm-rtdyld/Android.mk b/tools/llvm-rtdyld/Android.mk
index 4f4fb4c..853c71a 100644
--- a/tools/llvm-rtdyld/Android.mk
+++ b/tools/llvm-rtdyld/Android.mk
@@ -37,7 +37,8 @@ llvm_rtdyld_STATIC_LIBRARIES := \
   libLLVMX86AsmParser \
   libLLVMX86Utils \
   libLLVMX86Disassembler \
-  libLLVMDebugInfo          \
+  libLLVMDebugInfoDWARF \
+  libLLVMDebugInfoPDB \
   libLLVMExecutionEngine    \
   libLLVMCodeGen \
   libLLVMObject             \
diff --git a/tools/llvm-rtdyld/CMakeLists.txt b/tools/llvm-rtdyld/CMakeLists.txt
index feb2134..c1acbe5 100644
--- a/tools/llvm-rtdyld/CMakeLists.txt
+++ b/tools/llvm-rtdyld/CMakeLists.txt
@@ -1,8 +1,9 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
-  DebugInfo
+  DebugInfoDWARF
   ExecutionEngine
   MC
+  Object
   RuntimeDyld
   Support
   )
diff --git a/tools/llvm-rtdyld/Makefile b/tools/llvm-rtdyld/Makefile
index 9de753e..3e868b9 100644
--- a/tools/llvm-rtdyld/Makefile
+++ b/tools/llvm-rtdyld/Makefile
@@ -9,7 +9,7 @@
 
 LEVEL := ../..
 TOOLNAME := llvm-rtdyld
-LINK_COMPONENTS := all-targets support MC object RuntimeDyld MCJIT debuginfo
+LINK_COMPONENTS := all-targets support MC object RuntimeDyld MCJIT DebugInfoDWARF
 
 # This tool has no plugins, optimize startup time.
 TOOL_NO_EXPORTS := 1
diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index 87d381e..58bf206 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -12,16 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/StringMap.h"
-#include "llvm/DebugInfo/DIContext.h"
-#include "llvm/ExecutionEngine/ObjectBuffer.h"
-#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/DebugInfo/DWARF/DIContext.h"
 #include "llvm/ExecutionEngine/RuntimeDyld.h"
 #include "llvm/ExecutionEngine/RuntimeDyldChecker.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler.h"
-#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Object/MachO.h"
 #include "llvm/Support/CommandLine.h"
@@ -30,10 +28,10 @@
 #include "llvm/Support/Memory.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/raw_ostream.h"
 #include <list>
 #include <system_error>
 
@@ -207,23 +205,32 @@ static int printLineInfoForInput() {
     if (std::error_code EC = InputBuffer.getError())
       return Error("unable to read input: '" + EC.message() + "'");
 
-    std::unique_ptr<ObjectImage> LoadedObject;
+    ErrorOr<std::unique_ptr<ObjectFile>> MaybeObj(
+      ObjectFile::createObjectFile((*InputBuffer)->getMemBufferRef()));
+
+    if (std::error_code EC = MaybeObj.getError())
+      return Error("unable to create object file: '" + EC.message() + "'");
+
+    ObjectFile &Obj = **MaybeObj;
+
     // Load the object file
-    LoadedObject = Dyld.loadObject(
-        llvm::make_unique<ObjectBuffer>(std::move(*InputBuffer)));
-    if (!LoadedObject) {
+    std::unique_ptr<RuntimeDyld::LoadedObjectInfo> LoadedObjInfo =
+      Dyld.loadObject(Obj);
+
+    if (Dyld.hasError())
       return Error(Dyld.getErrorString());
-    }
 
     // Resolve all the relocations we can.
     Dyld.resolveRelocations();
 
+    OwningBinary<ObjectFile> DebugObj = LoadedObjInfo->getObjectForDebug(Obj);
+
     std::unique_ptr<DIContext> Context(
-        DIContext::getDWARFContext(*LoadedObject->getObjectFile()));
+      DIContext::getDWARFContext(*DebugObj.getBinary()));
 
     // Use symbol info to iterate functions in the object.
-    for (object::symbol_iterator I = LoadedObject->begin_symbols(),
-                                 E = LoadedObject->end_symbols();
+    for (object::symbol_iterator I = DebugObj.getBinary()->symbol_begin(),
+                                 E = DebugObj.getBinary()->symbol_end();
          I != E; ++I) {
       object::SymbolRef::Type SymType;
       if (I->getType(SymType)) continue;
@@ -268,11 +275,17 @@ static int executeInput() {
         MemoryBuffer::getFileOrSTDIN(InputFileList[i]);
     if (std::error_code EC = InputBuffer.getError())
       return Error("unable to read input: '" + EC.message() + "'");
-    std::unique_ptr<ObjectImage> LoadedObject;
+    ErrorOr<std::unique_ptr<ObjectFile>> MaybeObj(
+      ObjectFile::createObjectFile((*InputBuffer)->getMemBufferRef()));
+
+    if (std::error_code EC = MaybeObj.getError())
+      return Error("unable to create object file: '" + EC.message() + "'");
+
+    ObjectFile &Obj = **MaybeObj;
+
     // Load the object file
-    LoadedObject = Dyld.loadObject(
-        llvm::make_unique<ObjectBuffer>(std::move(*InputBuffer)));
-    if (!LoadedObject) {
+    Dyld.loadObject(Obj);
+    if (Dyld.hasError()) {
       return Error(Dyld.getErrorString());
     }
   }
@@ -512,14 +525,21 @@ static int linkAndVerify() {
     // Load the input memory buffer.
     ErrorOr<std::unique_ptr<MemoryBuffer>> InputBuffer =
         MemoryBuffer::getFileOrSTDIN(InputFileList[i]);
+
     if (std::error_code EC = InputBuffer.getError())
       return Error("unable to read input: '" + EC.message() + "'");
 
-    std::unique_ptr<ObjectImage> LoadedObject;
+    ErrorOr<std::unique_ptr<ObjectFile>> MaybeObj(
+      ObjectFile::createObjectFile((*InputBuffer)->getMemBufferRef()));
+
+    if (std::error_code EC = MaybeObj.getError())
+      return Error("unable to create object file: '" + EC.message() + "'");
+
+    ObjectFile &Obj = **MaybeObj;
+
     // Load the object file
-    LoadedObject = Dyld.loadObject(
-        llvm::make_unique<ObjectBuffer>(std::move(*InputBuffer)));
-    if (!LoadedObject) {
+    Dyld.loadObject(Obj);
+    if (Dyld.hasError()) {
       return Error(Dyld.getErrorString());
     }
   }
diff --git a/tools/llvm-shlib/CMakeLists.txt b/tools/llvm-shlib/CMakeLists.txt
index 100c184..9a8cd4a 100644
--- a/tools/llvm-shlib/CMakeLists.txt
+++ b/tools/llvm-shlib/CMakeLists.txt
@@ -57,7 +57,7 @@ if(NOT DEFINED LLVM_EXPORTED_SYMBOL_FILE)
 
   foreach (lib ${LIB_NAMES})
     
-    set(LIB_DIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib)
+    set(LIB_DIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX})
     set(LIB_NAME ${LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${lib})
     set(LIB_PATH ${LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX})
     set(LIB_EXPORTS_PATH ${LIB_NAME}.exports)
@@ -78,6 +78,8 @@ if(NOT DEFINED LLVM_EXPORTED_SYMBOL_FILE)
     DEPENDS ${LLVM_DYLIB_REQUIRED_EXPORTS}
     COMMENT "Generating combined export list...")
 
+  add_custom_target(libLLVMExports DEPENDS ${LLVM_EXPORTED_SYMBOL_FILE})
+
 endif()
 
 add_llvm_library(LLVM SHARED ${SOURCES})
@@ -90,7 +92,7 @@ endif()
 
 target_link_libraries(LLVM ${cmake_2_8_12_PRIVATE} ${LIB_NAMES})
 
-add_dependencies(LLVM ${LLVM_EXPORTED_SYMBOL_FILE})
+add_dependencies(LLVM libLLVMExports)
 
 if (APPLE)
   set_property(TARGET LLVM APPEND_STRING PROPERTY
diff --git a/tools/llvm-shlib/libllvm.cpp b/tools/llvm-shlib/libllvm.cpp
index 40b4f66..8424d66 100644
--- a/tools/llvm-shlib/libllvm.cpp
+++ b/tools/llvm-shlib/libllvm.cpp
@@ -11,3 +11,10 @@
 // you can't define a target with no sources.
 //
 //===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+
+#if defined(DISABLE_LLVM_DYLIB_ATEXIT)
+extern "C" int __cxa_atexit();
+extern "C" int __cxa_atexit() { return 0; }
+#endif
diff --git a/tools/llvm-size/llvm-size.cpp b/tools/llvm-size/llvm-size.cpp
index 59a5f20..0e0dd59 100644
--- a/tools/llvm-size/llvm-size.cpp
+++ b/tools/llvm-size/llvm-size.cpp
@@ -15,9 +15,9 @@
 
 #include "llvm/ADT/APInt.h"
 #include "llvm/Object/Archive.h"
-#include "llvm/Object/ObjectFile.h"
 #include "llvm/Object/MachO.h"
 #include "llvm/Object/MachOUniversal.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
@@ -484,7 +484,6 @@ static void PrintFileSectionSizes(StringRef file) {
           if (ArchFlags[i] == I->getArchTypeName()) {
             ArchFound = true;
             ErrorOr<std::unique_ptr<ObjectFile>> UO = I->getAsObjectFile();
-            std::unique_ptr<Archive> UA;
             if (UO) {
               if (ObjectFile *o = dyn_cast<ObjectFile>(&*UO.get())) {
                 MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
@@ -503,7 +502,9 @@ static void PrintFileSectionSizes(StringRef file) {
                   outs() << "\n";
                 }
               }
-            } else if (!I->getAsArchive(UA)) {
+            } else if (ErrorOr<std::unique_ptr<Archive>> AOrErr =
+                           I->getAsArchive()) {
+              std::unique_ptr<Archive> &UA = *AOrErr;
               // This is an archive. Iterate over each member and display its
               // sizes.
               for (object::Archive::child_iterator i = UA->child_begin(),
@@ -560,7 +561,6 @@ static void PrintFileSectionSizes(StringRef file) {
            I != E; ++I) {
         if (HostArchName == I->getArchTypeName()) {
           ErrorOr<std::unique_ptr<ObjectFile>> UO = I->getAsObjectFile();
-          std::unique_ptr<Archive> UA;
           if (UO) {
             if (ObjectFile *o = dyn_cast<ObjectFile>(&*UO.get())) {
               MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
@@ -579,7 +579,9 @@ static void PrintFileSectionSizes(StringRef file) {
                 outs() << "\n";
               }
             }
-          } else if (!I->getAsArchive(UA)) {
+          } else if (ErrorOr<std::unique_ptr<Archive>> AOrErr =
+                         I->getAsArchive()) {
+            std::unique_ptr<Archive> &UA = *AOrErr;
             // This is an archive. Iterate over each member and display its
             // sizes.
             for (object::Archive::child_iterator i = UA->child_begin(),
@@ -623,7 +625,6 @@ static void PrintFileSectionSizes(StringRef file) {
                                                E = UB->end_objects();
          I != E; ++I) {
       ErrorOr<std::unique_ptr<ObjectFile>> UO = I->getAsObjectFile();
-      std::unique_ptr<Archive> UA;
       if (UO) {
         if (ObjectFile *o = dyn_cast<ObjectFile>(&*UO.get())) {
           MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
@@ -643,7 +644,9 @@ static void PrintFileSectionSizes(StringRef file) {
             outs() << "\n";
           }
         }
-      } else if (!I->getAsArchive(UA)) {
+      } else if (ErrorOr<std::unique_ptr<Archive>> AOrErr =
+                         I->getAsArchive()) {
+        std::unique_ptr<Archive> &UA = *AOrErr;
         // This is an archive. Iterate over each member and display its sizes.
         for (object::Archive::child_iterator i = UA->child_begin(),
                                              e = UA->child_end();
@@ -706,7 +709,7 @@ int main(int argc, char **argv) {
 
   ToolName = argv[0];
   if (OutputFormatShort.getNumOccurrences())
-    OutputFormat = OutputFormatShort;
+    OutputFormat = static_cast<OutputFormatTy>(OutputFormatShort);
   if (RadixShort.getNumOccurrences())
     Radix = RadixShort;
 
diff --git a/tools/llvm-stress/llvm-stress.cpp b/tools/llvm-stress/llvm-stress.cpp
index 21a79e3..05ceeb5 100644
--- a/tools/llvm-stress/llvm-stress.cpp
+++ b/tools/llvm-stress/llvm-stress.cpp
@@ -20,7 +20,7 @@
 #include "llvm/IR/LegacyPassNameParser.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
@@ -711,7 +711,7 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  PassManager Passes;
+  legacy::PassManager Passes;
   Passes.add(createVerifierPass());
   Passes.add(createDebugInfoVerifierPass());
   Passes.add(createPrintModulePass(Out->os()));
diff --git a/tools/llvm-symbolizer/Android.mk b/tools/llvm-symbolizer/Android.mk
index 6a2d1d0..9f376c2 100644
--- a/tools/llvm-symbolizer/Android.mk
+++ b/tools/llvm-symbolizer/Android.mk
@@ -16,7 +16,8 @@ LOCAL_SRC_FILES := $(llvm_symbolizer_SRC_FILES)
 LOCAL_LDLIBS += -lpthread -lm -ldl
 
 LOCAL_STATIC_LIBRARIES := \
-  libLLVMDebugInfo \
+  libLLVMDebugInfoDWARF \
+  libLLVMDebugInfoPDB \
   libLLVMObject \
   libLLVMBitReader \
   libLLVMMC \
diff --git a/tools/llvm-symbolizer/CMakeLists.txt b/tools/llvm-symbolizer/CMakeLists.txt
index 9e76248..6968f1c 100644
--- a/tools/llvm-symbolizer/CMakeLists.txt
+++ b/tools/llvm-symbolizer/CMakeLists.txt
@@ -4,7 +4,7 @@
 # targets as well. Currently, there is no support for such a build strategy.
 
 set(LLVM_LINK_COMPONENTS
-  DebugInfo
+  DebugInfoDWARF
   Object
   Support
   )
diff --git a/tools/llvm-symbolizer/LLVMSymbolize.h b/tools/llvm-symbolizer/LLVMSymbolize.h
index ff848fc..c109221 100644
--- a/tools/llvm-symbolizer/LLVMSymbolize.h
+++ b/tools/llvm-symbolizer/LLVMSymbolize.h
@@ -14,7 +14,7 @@
 #define LLVM_TOOLS_LLVM_SYMBOLIZER_LLVMSYMBOLIZE_H
 
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DIContext.h"
 #include "llvm/Object/MachOUniversal.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/DataExtractor.h"
diff --git a/tools/llvm-symbolizer/Makefile b/tools/llvm-symbolizer/Makefile
index 5ac83a5..c5d2cf7 100644
--- a/tools/llvm-symbolizer/Makefile
+++ b/tools/llvm-symbolizer/Makefile
@@ -9,7 +9,7 @@
 
 LEVEL := ../..
 TOOLNAME := llvm-symbolizer
-LINK_COMPONENTS := DebugInfo Object
+LINK_COMPONENTS := DebugInfoDWARF Object
 
 # This tool has no plugins, optimize startup time.
 TOOL_NO_EXPORTS := 1
diff --git a/tools/lto/CMakeLists.txt b/tools/lto/CMakeLists.txt
index 559b22b..87f42e8 100644
--- a/tools/lto/CMakeLists.txt
+++ b/tools/lto/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  Core
   LTO
   MC
   MCDisassembler
diff --git a/tools/lto/lto.cpp b/tools/lto/lto.cpp
index ef37c90..714b8e0 100644
--- a/tools/lto/lto.cpp
+++ b/tools/lto/lto.cpp
@@ -14,9 +14,11 @@
 
 #include "llvm-c/lto.h"
 #include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/LTO/LTOCodeGenerator.h"
 #include "llvm/LTO/LTOModule.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Signals.h"
 #include "llvm/Support/TargetSelect.h"
 
 // extra command-line flags needed for LTOCodeGenerator
@@ -50,6 +52,12 @@ static bool parsedOptions = false;
 // Initialize the configured targets if they have not been initialized.
 static void lto_initialize() {
   if (!initialized) {
+#ifdef LLVM_ON_WIN32
+    // Dialog box on crash disabling doesn't work across DLL boundaries, so do
+    // it here.
+    llvm::sys::DisableSystemDialogsOnCrash();
+#endif
+
     InitializeAllTargetInfos();
     InitializeAllTargets();
     InitializeAllTargetMCs();
@@ -213,23 +221,37 @@ void lto_codegen_set_diagnostic_handler(lto_code_gen_t cg,
   unwrap(cg)->setDiagnosticHandler(diag_handler, ctxt);
 }
 
-lto_code_gen_t lto_codegen_create(void) {
+static lto_code_gen_t createCodeGen(bool InLocalContext) {
   lto_initialize();
 
   TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
 
-  LTOCodeGenerator *CodeGen = new LTOCodeGenerator();
+  LTOCodeGenerator *CodeGen =
+      InLocalContext ? new LTOCodeGenerator(make_unique<LLVMContext>())
+                     : new LTOCodeGenerator();
   if (CodeGen)
     CodeGen->setTargetOptions(Options);
   return wrap(CodeGen);
 }
 
+lto_code_gen_t lto_codegen_create(void) {
+  return createCodeGen(/* InLocalContext */ false);
+}
+
+lto_code_gen_t lto_codegen_create_in_local_context(void) {
+  return createCodeGen(/* InLocalContext */ true);
+}
+
 void lto_codegen_dispose(lto_code_gen_t cg) { delete unwrap(cg); }
 
 bool lto_codegen_add_module(lto_code_gen_t cg, lto_module_t mod) {
   return !unwrap(cg)->addModule(unwrap(mod));
 }
 
+void lto_codegen_set_module(lto_code_gen_t cg, lto_module_t mod) {
+  unwrap(cg)->setModule(unwrap(mod));
+}
+
 bool lto_codegen_set_debug_model(lto_code_gen_t cg, lto_debug_model debug) {
   unwrap(cg)->setDebugInfo(debug);
   return false;
@@ -278,6 +300,26 @@ const void *lto_codegen_compile(lto_code_gen_t cg, size_t *length) {
                              sLastErrorString);
 }
 
+bool lto_codegen_optimize(lto_code_gen_t cg) {
+  if (!parsedOptions) {
+    unwrap(cg)->parseCodeGenDebugOptions();
+    lto_add_attrs(cg);
+    parsedOptions = true;
+  }
+  return !unwrap(cg)->optimize(DisableOpt, DisableInline,
+                               DisableGVNLoadPRE, DisableLTOVectorization,
+                               sLastErrorString);
+}
+
+const void *lto_codegen_compile_optimized(lto_code_gen_t cg, size_t *length) {
+  if (!parsedOptions) {
+    unwrap(cg)->parseCodeGenDebugOptions();
+    lto_add_attrs(cg);
+    parsedOptions = true;
+  }
+  return unwrap(cg)->compileOptimized(length, sLastErrorString);
+}
+
 bool lto_codegen_compile_to_file(lto_code_gen_t cg, const char **name) {
   if (!parsedOptions) {
     unwrap(cg)->parseCodeGenDebugOptions();
@@ -292,3 +334,5 @@ bool lto_codegen_compile_to_file(lto_code_gen_t cg, const char **name) {
 void lto_codegen_debug_options(lto_code_gen_t cg, const char *opt) {
   unwrap(cg)->setCodeGenDebugOptions(opt);
 }
+
+unsigned int lto_api_version() { return LTO_API_VERSION; }
diff --git a/tools/lto/lto.exports b/tools/lto/lto.exports
index b10ab1a..8729050 100644
--- a/tools/lto/lto.exports
+++ b/tools/lto/lto.exports
@@ -6,6 +6,8 @@ lto_module_create_from_fd
 lto_module_create_from_fd_at_offset
 lto_module_create_from_memory
 lto_module_create_from_memory_with_path
+lto_module_create_in_local_context
+lto_module_create_in_codegen_context
 lto_module_get_deplib
 lto_module_get_linkeropt
 lto_module_get_num_deplibs
@@ -20,11 +22,14 @@ lto_module_is_object_file_for_target
 lto_module_is_object_file_in_memory
 lto_module_is_object_file_in_memory_for_target
 lto_module_dispose
+lto_api_version
 lto_codegen_set_diagnostic_handler
 lto_codegen_add_module
+lto_codegen_set_module
 lto_codegen_add_must_preserve_symbol
 lto_codegen_compile
 lto_codegen_create
+lto_codegen_create_in_local_context
 lto_codegen_dispose
 lto_codegen_set_debug_model
 lto_codegen_set_pic_model
@@ -34,6 +39,8 @@ lto_codegen_set_assembler_args
 lto_codegen_set_assembler_path
 lto_codegen_set_cpu
 lto_codegen_compile_to_file
+lto_codegen_optimize
+lto_codegen_compile_optimized
 LLVMCreateDisasm
 LLVMCreateDisasmCPU
 LLVMDisasmDispose
diff --git a/tools/macho-dump/macho-dump.cpp b/tools/macho-dump/macho-dump.cpp
index aac720d..604f93a 100644
--- a/tools/macho-dump/macho-dump.cpp
+++ b/tools/macho-dump/macho-dump.cpp
@@ -300,12 +300,12 @@ DumpDataInCodeDataCommand(const MachOObjectFile &Obj,
 static int
 DumpLinkerOptionsCommand(const MachOObjectFile &Obj,
                          const MachOObjectFile::LoadCommandInfo &LCI) {
-  MachO::linker_options_command LOLC = Obj.getLinkerOptionsLoadCommand(LCI);
+  MachO::linker_option_command LOLC = Obj.getLinkerOptionLoadCommand(LCI);
   outs() << "  ('count', " << LOLC.count << ")\n"
          << "  ('_strings', [\n";
 
-  uint64_t DataSize = LOLC.cmdsize - sizeof(MachO::linker_options_command);
-  const char *P = LCI.Ptr + sizeof(MachO::linker_options_command);
+  uint64_t DataSize = LOLC.cmdsize - sizeof(MachO::linker_option_command);
+  const char *P = LCI.Ptr + sizeof(MachO::linker_option_command);
   StringRef Data(P, DataSize);
   for (unsigned i = 0; i != LOLC.count; ++i) {
     std::pair<StringRef,StringRef> Split = Data.split('\0');
@@ -356,7 +356,7 @@ static int DumpLoadCommand(const MachOObjectFile &Obj,
     return DumpLinkeditDataCommand(Obj, LCI);
   case MachO::LC_DATA_IN_CODE:
     return DumpDataInCodeDataCommand(Obj, LCI);
-  case MachO::LC_LINKER_OPTIONS:
+  case MachO::LC_LINKER_OPTION:
     return DumpLinkerOptionsCommand(Obj, LCI);
   case MachO::LC_VERSION_MIN_IPHONEOS:
   case MachO::LC_VERSION_MIN_MACOSX:
diff --git a/tools/obj2yaml/elf2yaml.cpp b/tools/obj2yaml/elf2yaml.cpp
index d770ce1..e606df2 100644
--- a/tools/obj2yaml/elf2yaml.cpp
+++ b/tools/obj2yaml/elf2yaml.cpp
@@ -21,8 +21,10 @@ namespace {
 
 template <class ELFT>
 class ELFDumper {
+  typedef object::Elf_Sym_Impl<ELFT> Elf_Sym;
   typedef typename object::ELFFile<ELFT>::Elf_Shdr Elf_Shdr;
   typedef typename object::ELFFile<ELFT>::Elf_Sym_Iter Elf_Sym_Iter;
+  typedef typename object::ELFFile<ELFT>::Elf_Word Elf_Word;
 
   const object::ELFFile<ELFT> &Obj;
 
@@ -38,6 +40,7 @@ class ELFDumper {
   ErrorOr<ELFYAML::RelocationSection *> dumpRelaSection(const Elf_Shdr *Shdr);
   ErrorOr<ELFYAML::RawContentSection *>
   dumpContentSection(const Elf_Shdr *Shdr);
+  ErrorOr<ELFYAML::Group *> dumpGroup(const Elf_Shdr *Shdr);
 
 public:
   ELFDumper(const object::ELFFile<ELFT> &O);
@@ -86,7 +89,13 @@ ErrorOr<ELFYAML::Object *> ELFDumper<ELFT>::dump() {
       Y->Sections.push_back(std::unique_ptr<ELFYAML::Section>(S.get()));
       break;
     }
-    // FIXME: Support SHT_GROUP section format.
+    case ELF::SHT_GROUP: {
+      ErrorOr<ELFYAML::Group *> G = dumpGroup(&Sec);
+      if (std::error_code EC = G.getError())
+        return EC;
+      Y->Sections.push_back(std::unique_ptr<ELFYAML::Section>(G.get()));
+      break;
+    }
     default: {
       ErrorOr<ELFYAML::RawContentSection *> S = dumpContentSection(&Sec);
       if (std::error_code EC = S.getError())
@@ -275,6 +284,41 @@ ELFDumper<ELFT>::dumpContentSection(const Elf_Shdr *Shdr) {
 }
 
 template <class ELFT>
+ErrorOr<ELFYAML::Group *> ELFDumper<ELFT>::dumpGroup(const Elf_Shdr *Shdr) {
+  auto S = make_unique<ELFYAML::Group>();
+
+  if (std::error_code EC = dumpCommonSection(Shdr, *S))
+    return EC;
+  // Get sh_info which is the signature.
+  const Elf_Sym *symbol = Obj.getSymbol(Shdr->sh_info);
+  const Elf_Shdr *symtab = Obj.getSection(Shdr->sh_link);
+  auto sectionContents = Obj.getSectionContents(Shdr);
+  if (std::error_code ec = sectionContents.getError())
+    return ec;
+  ErrorOr<StringRef> symbolName = Obj.getSymbolName(symtab, symbol);
+  if (std::error_code EC = symbolName.getError())
+    return EC;
+  S->Info = *symbolName;
+  const Elf_Word *groupMembers =
+      reinterpret_cast<const Elf_Word *>(sectionContents->data());
+  const long count = (Shdr->sh_size) / sizeof(Elf_Word);
+  ELFYAML::SectionOrType s;
+  for (int i = 0; i < count; i++) {
+    if (groupMembers[i] == llvm::ELF::GRP_COMDAT) {
+      s.sectionNameOrType = "GRP_COMDAT";
+    } else {
+      const Elf_Shdr *sHdr = Obj.getSection(groupMembers[i]);
+      ErrorOr<StringRef> sectionName = Obj.getSectionName(sHdr);
+      if (std::error_code ec = sectionName.getError())
+        return ec;
+      s.sectionNameOrType = *sectionName;
+    }
+    S->Members.push_back(s);
+  }
+  return S.release();
+}
+
+template <class ELFT>
 static std::error_code elf2yaml(raw_ostream &Out,
                                 const object::ELFFile<ELFT> &Obj) {
   ELFDumper<ELFT> Dumper(Obj);
diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp
index 8076ff4..a73750d 100644
--- a/tools/opt/NewPMDriver.cpp
+++ b/tools/opt/NewPMDriver.cpp
@@ -17,8 +17,8 @@
 #include "Passes.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
-#include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Bitcode/BitcodeWriterPass.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
@@ -27,28 +27,29 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
 using namespace opt_tool;
 
-bool llvm::runPassPipeline(StringRef Arg0, LLVMContext &Context, Module &M,
-                           tool_output_file *Out, StringRef PassPipeline,
-                           OutputKind OK, VerifierKind VK) {
-  FunctionAnalysisManager FAM;
-  CGSCCAnalysisManager CGAM;
-  ModuleAnalysisManager MAM;
+static cl::opt<bool>
+    DebugPM("debug-pass-manager", cl::Hidden,
+            cl::desc("Print pass management debugging information"));
 
-#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
-  MAM.registerPass(CREATE_PASS);
-#include "PassRegistry.def"
+bool llvm::runPassPipeline(StringRef Arg0, LLVMContext &Context, Module &M,
+                           TargetMachine *TM, tool_output_file *Out,
+                           StringRef PassPipeline, OutputKind OK,
+                           VerifierKind VK) {
+  Passes P(TM);
 
-#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \
-  CGAM.registerPass(CREATE_PASS);
-#include "PassRegistry.def"
+  FunctionAnalysisManager FAM(DebugPM);
+  CGSCCAnalysisManager CGAM(DebugPM);
+  ModuleAnalysisManager MAM(DebugPM);
 
-#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
-  FAM.registerPass(CREATE_PASS);
-#include "PassRegistry.def"
+  // Register all the basic analyses with the managers.
+  P.registerModuleAnalyses(MAM);
+  P.registerCGSCCAnalyses(CGAM);
+  P.registerFunctionAnalyses(FAM);
 
   // Cross register the analysis managers through their proxies.
   MAM.registerPass(FunctionAnalysisManagerModuleProxy(FAM));
@@ -58,11 +59,12 @@ bool llvm::runPassPipeline(StringRef Arg0, LLVMContext &Context, Module &M,
   FAM.registerPass(CGSCCAnalysisManagerFunctionProxy(CGAM));
   FAM.registerPass(ModuleAnalysisManagerFunctionProxy(MAM));
 
-  ModulePassManager MPM;
+  ModulePassManager MPM(DebugPM);
   if (VK > VK_NoVerifier)
     MPM.addPass(VerifierPass());
 
-  if (!parsePassPipeline(MPM, PassPipeline, VK == VK_VerifyEachPass)) {
+  if (!P.parsePassPipeline(MPM, PassPipeline, VK == VK_VerifyEachPass,
+                           DebugPM)) {
     errs() << Arg0 << ": unable to parse pass pipeline description.\n";
     return false;
   }
@@ -86,7 +88,7 @@ bool llvm::runPassPipeline(StringRef Arg0, LLVMContext &Context, Module &M,
   cl::PrintOptionValues();
 
   // Now that we have all of the passes ready, run them.
-  MPM.run(&M, &MAM);
+  MPM.run(M, &MAM);
 
   // Declare success.
   if (OK != OK_NoOutput)
diff --git a/tools/opt/NewPMDriver.h b/tools/opt/NewPMDriver.h
index f977bac..5384fe2 100644
--- a/tools/opt/NewPMDriver.h
+++ b/tools/opt/NewPMDriver.h
@@ -26,6 +26,7 @@
 namespace llvm {
 class LLVMContext;
 class Module;
+class TargetMachine;
 class tool_output_file;
 
 namespace opt_tool {
@@ -48,8 +49,9 @@ enum VerifierKind {
 /// file. It's interface is consequentially somewhat ad-hoc, but will go away
 /// when the transition finishes.
 bool runPassPipeline(StringRef Arg0, LLVMContext &Context, Module &M,
-                     tool_output_file *Out, StringRef PassPipeline,
-                     opt_tool::OutputKind OK, opt_tool::VerifierKind VK);
+                     TargetMachine *TM, tool_output_file *Out,
+                     StringRef PassPipeline, opt_tool::OutputKind OK,
+                     opt_tool::VerifierKind VK);
 }
 
 #endif
diff --git a/tools/opt/PassRegistry.def b/tools/opt/PassRegistry.def
index e1e4900..d768a3a 100644
--- a/tools/opt/PassRegistry.def
+++ b/tools/opt/PassRegistry.def
@@ -20,32 +20,58 @@
 #define MODULE_ANALYSIS(NAME, CREATE_PASS)
 #endif
 MODULE_ANALYSIS("lcg", LazyCallGraphAnalysis())
+MODULE_ANALYSIS("no-op-module", NoOpModuleAnalysis())
+MODULE_ANALYSIS("targetlibinfo", TargetLibraryAnalysis())
 #undef MODULE_ANALYSIS
 
 #ifndef MODULE_PASS
 #define MODULE_PASS(NAME, CREATE_PASS)
 #endif
+MODULE_PASS("invalidate<all>", InvalidateAllAnalysesPass())
+MODULE_PASS("no-op-module", NoOpModulePass())
 MODULE_PASS("print", PrintModulePass(dbgs()))
 MODULE_PASS("print-cg", LazyCallGraphPrinterPass(dbgs()))
+MODULE_PASS("verify", VerifierPass())
 #undef MODULE_PASS
 
 #ifndef CGSCC_ANALYSIS
 #define CGSCC_ANALYSIS(NAME, CREATE_PASS)
 #endif
+CGSCC_ANALYSIS("no-op-cgscc", NoOpCGSCCAnalysis())
 #undef CGSCC_ANALYSIS
 
 #ifndef CGSCC_PASS
 #define CGSCC_PASS(NAME, CREATE_PASS)
 #endif
+CGSCC_PASS("invalidate<all>", InvalidateAllAnalysesPass())
+CGSCC_PASS("no-op-cgscc", NoOpCGSCCPass())
 #undef CGSCC_PASS
 
 #ifndef FUNCTION_ANALYSIS
 #define FUNCTION_ANALYSIS(NAME, CREATE_PASS)
 #endif
+FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis())
+FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis())
+FUNCTION_ANALYSIS("loops", LoopAnalysis())
+FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis())
+FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis())
+FUNCTION_ANALYSIS("targetir",
+                  TM ? TM->getTargetIRAnalysis() : TargetIRAnalysis())
 #undef FUNCTION_ANALYSIS
 
 #ifndef FUNCTION_PASS
 #define FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
+FUNCTION_PASS("early-cse", EarlyCSEPass())
+FUNCTION_PASS("instcombine", InstCombinePass())
+FUNCTION_PASS("invalidate<all>", InvalidateAllAnalysesPass())
+FUNCTION_PASS("no-op-function", NoOpFunctionPass())
+FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass())
 FUNCTION_PASS("print", PrintFunctionPass(dbgs()))
+FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs()))
+FUNCTION_PASS("print<domtree>", DominatorTreePrinterPass(dbgs()))
+FUNCTION_PASS("print<loops>", LoopPrinterPass(dbgs()))
+FUNCTION_PASS("simplify-cfg", SimplifyCFGPass())
+FUNCTION_PASS("verify", VerifierPass())
+FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
 #undef FUNCTION_PASS
diff --git a/tools/opt/Passes.cpp b/tools/opt/Passes.cpp
index a171f42..e5ad5c0 100644
--- a/tools/opt/Passes.cpp
+++ b/tools/opt/Passes.cpp
@@ -15,12 +15,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "Passes.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
 
 using namespace llvm;
 
@@ -28,112 +38,184 @@ namespace {
 
 /// \brief No-op module pass which does nothing.
 struct NoOpModulePass {
-  PreservedAnalyses run(Module *M) { return PreservedAnalyses::all(); }
+  PreservedAnalyses run(Module &M) { return PreservedAnalyses::all(); }
   static StringRef name() { return "NoOpModulePass"; }
 };
 
+/// \brief No-op module analysis.
+struct NoOpModuleAnalysis {
+  struct Result {};
+  Result run(Module &) { return Result(); }
+  static StringRef name() { return "NoOpModuleAnalysis"; }
+  static void *ID() { return (void *)&PassID; }
+private:
+  static char PassID;
+};
+
+char NoOpModuleAnalysis::PassID;
+
 /// \brief No-op CGSCC pass which does nothing.
 struct NoOpCGSCCPass {
-  PreservedAnalyses run(LazyCallGraph::SCC *C) {
+  PreservedAnalyses run(LazyCallGraph::SCC &C) {
     return PreservedAnalyses::all();
   }
   static StringRef name() { return "NoOpCGSCCPass"; }
 };
 
+/// \brief No-op CGSCC analysis.
+struct NoOpCGSCCAnalysis {
+  struct Result {};
+  Result run(LazyCallGraph::SCC &) { return Result(); }
+  static StringRef name() { return "NoOpCGSCCAnalysis"; }
+  static void *ID() { return (void *)&PassID; }
+private:
+  static char PassID;
+};
+
+char NoOpCGSCCAnalysis::PassID;
+
 /// \brief No-op function pass which does nothing.
 struct NoOpFunctionPass {
-  PreservedAnalyses run(Function *F) { return PreservedAnalyses::all(); }
+  PreservedAnalyses run(Function &F) { return PreservedAnalyses::all(); }
   static StringRef name() { return "NoOpFunctionPass"; }
 };
 
+/// \brief No-op function analysis.
+struct NoOpFunctionAnalysis {
+  struct Result {};
+  Result run(Function &) { return Result(); }
+  static StringRef name() { return "NoOpFunctionAnalysis"; }
+  static void *ID() { return (void *)&PassID; }
+private:
+  static char PassID;
+};
+
+char NoOpFunctionAnalysis::PassID;
+
 } // End anonymous namespace.
 
-static bool isModulePassName(StringRef Name) {
-  if (Name == "no-op-module") return true;
+void Passes::registerModuleAnalyses(ModuleAnalysisManager &MAM) {
+#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
+  MAM.registerPass(CREATE_PASS);
+#include "PassRegistry.def"
+}
+
+void Passes::registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM) {
+#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \
+  CGAM.registerPass(CREATE_PASS);
+#include "PassRegistry.def"
+}
+
+void Passes::registerFunctionAnalyses(FunctionAnalysisManager &FAM) {
+#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
+  FAM.registerPass(CREATE_PASS);
+#include "PassRegistry.def"
+}
 
+#ifndef NDEBUG
+static bool isModulePassName(StringRef Name) {
 #define MODULE_PASS(NAME, CREATE_PASS) if (Name == NAME) return true;
+#define MODULE_ANALYSIS(NAME, CREATE_PASS)                                     \
+  if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
+    return true;
 #include "PassRegistry.def"
 
   return false;
 }
+#endif
 
 static bool isCGSCCPassName(StringRef Name) {
-  if (Name == "no-op-cgscc") return true;
-
 #define CGSCC_PASS(NAME, CREATE_PASS) if (Name == NAME) return true;
+#define CGSCC_ANALYSIS(NAME, CREATE_PASS)                                      \
+  if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
+    return true;
 #include "PassRegistry.def"
 
   return false;
 }
 
 static bool isFunctionPassName(StringRef Name) {
-  if (Name == "no-op-function") return true;
-
 #define FUNCTION_PASS(NAME, CREATE_PASS) if (Name == NAME) return true;
+#define FUNCTION_ANALYSIS(NAME, CREATE_PASS)                                   \
+  if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
+    return true;
 #include "PassRegistry.def"
 
   return false;
 }
 
-static bool parseModulePassName(ModulePassManager &MPM, StringRef Name) {
-  if (Name == "no-op-module") {
-    MPM.addPass(NoOpModulePass());
-    return true;
-  }
-
+bool Passes::parseModulePassName(ModulePassManager &MPM, StringRef Name) {
 #define MODULE_PASS(NAME, CREATE_PASS)                                         \
   if (Name == NAME) {                                                          \
     MPM.addPass(CREATE_PASS);                                                  \
     return true;                                                               \
   }
+#define MODULE_ANALYSIS(NAME, CREATE_PASS)                                     \
+  if (Name == "require<" NAME ">") {                                           \
+    MPM.addPass(RequireAnalysisPass<decltype(CREATE_PASS)>());                 \
+    return true;                                                               \
+  }                                                                            \
+  if (Name == "invalidate<" NAME ">") {                                        \
+    MPM.addPass(InvalidateAnalysisPass<decltype(CREATE_PASS)>());              \
+    return true;                                                               \
+  }
 #include "PassRegistry.def"
 
   return false;
 }
 
-static bool parseCGSCCPassName(CGSCCPassManager &CGPM, StringRef Name) {
-  if (Name == "no-op-cgscc") {
-    CGPM.addPass(NoOpCGSCCPass());
-    return true;
-  }
-
+bool Passes::parseCGSCCPassName(CGSCCPassManager &CGPM, StringRef Name) {
 #define CGSCC_PASS(NAME, CREATE_PASS)                                          \
   if (Name == NAME) {                                                          \
     CGPM.addPass(CREATE_PASS);                                                 \
     return true;                                                               \
   }
+#define CGSCC_ANALYSIS(NAME, CREATE_PASS)                                      \
+  if (Name == "require<" NAME ">") {                                           \
+    CGPM.addPass(RequireAnalysisPass<decltype(CREATE_PASS)>());                \
+    return true;                                                               \
+  }                                                                            \
+  if (Name == "invalidate<" NAME ">") {                                        \
+    CGPM.addPass(InvalidateAnalysisPass<decltype(CREATE_PASS)>());             \
+    return true;                                                               \
+  }
 #include "PassRegistry.def"
 
   return false;
 }
 
-static bool parseFunctionPassName(FunctionPassManager &FPM, StringRef Name) {
-  if (Name == "no-op-function") {
-    FPM.addPass(NoOpFunctionPass());
-    return true;
-  }
-
+bool Passes::parseFunctionPassName(FunctionPassManager &FPM, StringRef Name) {
 #define FUNCTION_PASS(NAME, CREATE_PASS)                                       \
   if (Name == NAME) {                                                          \
     FPM.addPass(CREATE_PASS);                                                  \
     return true;                                                               \
   }
+#define FUNCTION_ANALYSIS(NAME, CREATE_PASS)                                   \
+  if (Name == "require<" NAME ">") {                                           \
+    FPM.addPass(RequireAnalysisPass<decltype(CREATE_PASS)>());                 \
+    return true;                                                               \
+  }                                                                            \
+  if (Name == "invalidate<" NAME ">") {                                        \
+    FPM.addPass(InvalidateAnalysisPass<decltype(CREATE_PASS)>());              \
+    return true;                                                               \
+  }
 #include "PassRegistry.def"
 
   return false;
 }
 
-static bool parseFunctionPassPipeline(FunctionPassManager &FPM,
-                                      StringRef &PipelineText,
-                                      bool VerifyEachPass) {
+bool Passes::parseFunctionPassPipeline(FunctionPassManager &FPM,
+                                       StringRef &PipelineText,
+                                       bool VerifyEachPass, bool DebugLogging) {
   for (;;) {
     // Parse nested pass managers by recursing.
     if (PipelineText.startswith("function(")) {
-      FunctionPassManager NestedFPM;
+      FunctionPassManager NestedFPM(DebugLogging);
 
       // Parse the inner pipeline inte the nested manager.
       PipelineText = PipelineText.substr(strlen("function("));
-      if (!parseFunctionPassPipeline(NestedFPM, PipelineText, VerifyEachPass) ||
+      if (!parseFunctionPassPipeline(NestedFPM, PipelineText, VerifyEachPass,
+                                     DebugLogging) ||
           PipelineText.empty())
         return false;
       assert(PipelineText[0] == ')');
@@ -160,17 +242,18 @@ static bool parseFunctionPassPipeline(FunctionPassManager &FPM,
   }
 }
 
-static bool parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
-                                      StringRef &PipelineText,
-                                      bool VerifyEachPass) {
+bool Passes::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
+                                    StringRef &PipelineText,
+                                    bool VerifyEachPass, bool DebugLogging) {
   for (;;) {
     // Parse nested pass managers by recursing.
     if (PipelineText.startswith("cgscc(")) {
-      CGSCCPassManager NestedCGPM;
+      CGSCCPassManager NestedCGPM(DebugLogging);
 
       // Parse the inner pipeline into the nested manager.
       PipelineText = PipelineText.substr(strlen("cgscc("));
-      if (!parseCGSCCPassPipeline(NestedCGPM, PipelineText, VerifyEachPass) ||
+      if (!parseCGSCCPassPipeline(NestedCGPM, PipelineText, VerifyEachPass,
+                                  DebugLogging) ||
           PipelineText.empty())
         return false;
       assert(PipelineText[0] == ')');
@@ -179,11 +262,12 @@ static bool parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
       // Add the nested pass manager with the appropriate adaptor.
       CGPM.addPass(std::move(NestedCGPM));
     } else if (PipelineText.startswith("function(")) {
-      FunctionPassManager NestedFPM;
+      FunctionPassManager NestedFPM(DebugLogging);
 
       // Parse the inner pipeline inte the nested manager.
       PipelineText = PipelineText.substr(strlen("function("));
-      if (!parseFunctionPassPipeline(NestedFPM, PipelineText, VerifyEachPass) ||
+      if (!parseFunctionPassPipeline(NestedFPM, PipelineText, VerifyEachPass,
+                                     DebugLogging) ||
           PipelineText.empty())
         return false;
       assert(PipelineText[0] == ')');
@@ -209,17 +293,18 @@ static bool parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
   }
 }
 
-static bool parseModulePassPipeline(ModulePassManager &MPM,
-                                    StringRef &PipelineText,
-                                    bool VerifyEachPass) {
+bool Passes::parseModulePassPipeline(ModulePassManager &MPM,
+                                     StringRef &PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   for (;;) {
     // Parse nested pass managers by recursing.
     if (PipelineText.startswith("module(")) {
-      ModulePassManager NestedMPM;
+      ModulePassManager NestedMPM(DebugLogging);
 
       // Parse the inner pipeline into the nested manager.
       PipelineText = PipelineText.substr(strlen("module("));
-      if (!parseModulePassPipeline(NestedMPM, PipelineText, VerifyEachPass) ||
+      if (!parseModulePassPipeline(NestedMPM, PipelineText, VerifyEachPass,
+                                   DebugLogging) ||
           PipelineText.empty())
         return false;
       assert(PipelineText[0] == ')');
@@ -228,11 +313,12 @@ static bool parseModulePassPipeline(ModulePassManager &MPM,
       // Now add the nested manager as a module pass.
       MPM.addPass(std::move(NestedMPM));
     } else if (PipelineText.startswith("cgscc(")) {
-      CGSCCPassManager NestedCGPM;
+      CGSCCPassManager NestedCGPM(DebugLogging);
 
       // Parse the inner pipeline inte the nested manager.
       PipelineText = PipelineText.substr(strlen("cgscc("));
-      if (!parseCGSCCPassPipeline(NestedCGPM, PipelineText, VerifyEachPass) ||
+      if (!parseCGSCCPassPipeline(NestedCGPM, PipelineText, VerifyEachPass,
+                                  DebugLogging) ||
           PipelineText.empty())
         return false;
       assert(PipelineText[0] == ')');
@@ -242,11 +328,12 @@ static bool parseModulePassPipeline(ModulePassManager &MPM,
       MPM.addPass(
           createModuleToPostOrderCGSCCPassAdaptor(std::move(NestedCGPM)));
     } else if (PipelineText.startswith("function(")) {
-      FunctionPassManager NestedFPM;
+      FunctionPassManager NestedFPM(DebugLogging);
 
       // Parse the inner pipeline inte the nested manager.
       PipelineText = PipelineText.substr(strlen("function("));
-      if (!parseFunctionPassPipeline(NestedFPM, PipelineText, VerifyEachPass) ||
+      if (!parseFunctionPassPipeline(NestedFPM, PipelineText, VerifyEachPass,
+                                     DebugLogging) ||
           PipelineText.empty())
         return false;
       assert(PipelineText[0] == ')');
@@ -276,48 +363,39 @@ static bool parseModulePassPipeline(ModulePassManager &MPM,
 // Primary pass pipeline description parsing routine.
 // FIXME: Should this routine accept a TargetMachine or require the caller to
 // pre-populate the analysis managers with target-specific stuff?
-bool llvm::parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
-                             bool VerifyEachPass) {
-  // Look at the first entry to figure out which layer to start parsing at.
-  if (PipelineText.startswith("module("))
-    return parseModulePassPipeline(MPM, PipelineText, VerifyEachPass) &&
-           PipelineText.empty();
-  if (PipelineText.startswith("cgscc(")) {
-    CGSCCPassManager CGPM;
-    if (!parseCGSCCPassPipeline(CGPM, PipelineText, VerifyEachPass) ||
-        !PipelineText.empty())
-      return false;
-    MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
-    return true;
-  }
-  if (PipelineText.startswith("function(")) {
-    FunctionPassManager FPM;
-    if (!parseFunctionPassPipeline(FPM, PipelineText, VerifyEachPass) ||
-        !PipelineText.empty())
-      return false;
-    MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-    return true;
-  }
-
-  // This isn't a direct pass manager name, look for the end of a pass name.
+bool Passes::parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
+                               bool VerifyEachPass, bool DebugLogging) {
+  // By default, try to parse the pipeline as-if it were within an implicit
+  // 'module(...)' pass pipeline. If this will parse at all, it needs to
+  // consume the entire string.
+  if (parseModulePassPipeline(MPM, PipelineText, VerifyEachPass, DebugLogging))
+    return PipelineText.empty();
+
+  // This isn't parsable as a module pipeline, look for the end of a pass name
+  // and directly drop down to that layer.
   StringRef FirstName =
       PipelineText.substr(0, PipelineText.find_first_of(",)"));
-  if (isModulePassName(FirstName))
-    return parseModulePassPipeline(MPM, PipelineText, VerifyEachPass) &&
-           PipelineText.empty();
+  assert(!isModulePassName(FirstName) &&
+         "Already handled all module pipeline options.");
 
+  // If this looks like a CGSCC pass, parse the whole thing as a CGSCC
+  // pipeline.
   if (isCGSCCPassName(FirstName)) {
-    CGSCCPassManager CGPM;
-    if (!parseCGSCCPassPipeline(CGPM, PipelineText, VerifyEachPass) ||
+    CGSCCPassManager CGPM(DebugLogging);
+    if (!parseCGSCCPassPipeline(CGPM, PipelineText, VerifyEachPass,
+                                DebugLogging) ||
         !PipelineText.empty())
       return false;
     MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
     return true;
   }
 
+  // Similarly, if this looks like a Function pass, parse the whole thing as
+  // a Function pipelien.
   if (isFunctionPassName(FirstName)) {
-    FunctionPassManager FPM;
-    if (!parseFunctionPassPipeline(FPM, PipelineText, VerifyEachPass) ||
+    FunctionPassManager FPM(DebugLogging);
+    if (!parseFunctionPassPipeline(FPM, PipelineText, VerifyEachPass,
+                                   DebugLogging) ||
         !PipelineText.empty())
       return false;
     MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
diff --git a/tools/opt/Passes.h b/tools/opt/Passes.h
index 3bd6752..d3cb628 100644
--- a/tools/opt/Passes.h
+++ b/tools/opt/Passes.h
@@ -1,4 +1,4 @@
-//===- Passes.h - Parsing, selection, and running of passes -----*- C++ -*-===//
+//===- Passes.h - Utilities for manipulating all passes ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 /// \file
 ///
-/// Interfaces for producing common pass manager configurations and parsing
-/// textual pass specifications.
+/// Interfaces for registering passes, producing common pass manager
+/// configurations, and parsing of pass pipelines.
 ///
 //===----------------------------------------------------------------------===//
 
@@ -17,40 +17,88 @@
 #define LLVM_TOOLS_OPT_PASSES_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/IR/PassManager.h"
 
 namespace llvm {
-class ModulePassManager;
+class TargetMachine;
 
-/// \brief Parse a textual pass pipeline description into a \c ModulePassManager.
+/// \brief This class provides access to all of LLVM's passes.
 ///
-/// The format of the textual pass pipeline description looks something like:
-///
-///   module(function(instcombine,sroa),dce,cgscc(inliner,function(...)),...)
-///
-/// Pass managers have ()s describing the nest structure of passes. All passes
-/// are comma separated. As a special shortcut, if the very first pass is not
-/// a module pass (as a module pass manager is), this will automatically form
-/// the shortest stack of pass managers that allow inserting that first pass.
-/// So, assuming function passes 'fpassN', CGSCC passes 'cgpassN', and loop passes
-/// 'lpassN', all of these are valid:
-///
-///   fpass1,fpass2,fpass3
-///   cgpass1,cgpass2,cgpass3
-///   lpass1,lpass2,lpass3
-///
-/// And they are equivalent to the following (resp.):
-///
-///   module(function(fpass1,fpass2,fpass3))
-///   module(cgscc(cgpass1,cgpass2,cgpass3))
-///   module(function(loop(lpass1,lpass2,lpass3)))
-///
-/// This shortcut is especially useful for debugging and testing small pass
-/// combinations. Note that these shortcuts don't introduce any other magic. If
-/// the sequence of passes aren't all the exact same kind of pass, it will be
-/// an error. You cannot mix different levels implicitly, you must explicitly
-/// form a pass manager in which to nest passes.
-bool parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
-                       bool VerifyEachPass = true);
+/// It's members provide the baseline state available to passes during their
+/// construction. The \c PassRegistry.def file specifies how to construct all
+/// of the built-in passes, and those may reference these members during
+/// construction.
+class Passes {
+  TargetMachine *TM;
+
+public:
+  explicit Passes(TargetMachine *TM = nullptr) : TM(TM) {}
+
+  /// \brief Registers all available module analysis passes.
+  ///
+  /// This is an interface that can be used to populate a \c
+  /// ModuleAnalysisManager with all registered module analyses. Callers can
+  /// still manually register any additional analyses.
+  void registerModuleAnalyses(ModuleAnalysisManager &MAM);
+
+  /// \brief Registers all available CGSCC analysis passes.
+  ///
+  /// This is an interface that can be used to populate a \c CGSCCAnalysisManager
+  /// with all registered CGSCC analyses. Callers can still manually register any
+  /// additional analyses.
+  void registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM);
+
+  /// \brief Registers all available function analysis passes.
+  ///
+  /// This is an interface that can be used to populate a \c
+  /// FunctionAnalysisManager with all registered function analyses. Callers can
+  /// still manually register any additional analyses.
+  void registerFunctionAnalyses(FunctionAnalysisManager &FAM);
+
+  /// \brief Parse a textual pass pipeline description into a \c ModulePassManager.
+  ///
+  /// The format of the textual pass pipeline description looks something like:
+  ///
+  ///   module(function(instcombine,sroa),dce,cgscc(inliner,function(...)),...)
+  ///
+  /// Pass managers have ()s describing the nest structure of passes. All passes
+  /// are comma separated. As a special shortcut, if the very first pass is not
+  /// a module pass (as a module pass manager is), this will automatically form
+  /// the shortest stack of pass managers that allow inserting that first pass.
+  /// So, assuming function passes 'fpassN', CGSCC passes 'cgpassN', and loop passes
+  /// 'lpassN', all of these are valid:
+  ///
+  ///   fpass1,fpass2,fpass3
+  ///   cgpass1,cgpass2,cgpass3
+  ///   lpass1,lpass2,lpass3
+  ///
+  /// And they are equivalent to the following (resp.):
+  ///
+  ///   module(function(fpass1,fpass2,fpass3))
+  ///   module(cgscc(cgpass1,cgpass2,cgpass3))
+  ///   module(function(loop(lpass1,lpass2,lpass3)))
+  ///
+  /// This shortcut is especially useful for debugging and testing small pass
+  /// combinations. Note that these shortcuts don't introduce any other magic. If
+  /// the sequence of passes aren't all the exact same kind of pass, it will be
+  /// an error. You cannot mix different levels implicitly, you must explicitly
+  /// form a pass manager in which to nest passes.
+  bool parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
+                         bool VerifyEachPass = true, bool DebugLogging = false);
+
+private:
+  bool parseModulePassName(ModulePassManager &MPM, StringRef Name);
+  bool parseCGSCCPassName(CGSCCPassManager &CGPM, StringRef Name);
+  bool parseFunctionPassName(FunctionPassManager &FPM, StringRef Name);
+  bool parseFunctionPassPipeline(FunctionPassManager &FPM,
+                                 StringRef &PipelineText, bool VerifyEachPass,
+                                 bool DebugLogging);
+  bool parseCGSCCPassPipeline(CGSCCPassManager &CGPM, StringRef &PipelineText,
+                              bool VerifyEachPass, bool DebugLogging);
+  bool parseModulePassPipeline(ModulePassManager &MPM, StringRef &PipelineText,
+                               bool VerifyEachPass, bool DebugLogging);
+};
 
 }
 
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index cdd22e4..d952525 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -20,6 +20,8 @@
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/RegionPass.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeWriterPass.h"
 #include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/IR/DataLayout.h"
@@ -33,7 +35,7 @@
 #include "llvm/LinkAllIR.h"
 #include "llvm/LinkAllPasses.h"
 #include "llvm/MC/SubtargetFeature.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
@@ -45,7 +47,6 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include <algorithm>
@@ -179,7 +180,7 @@ DefaultDataLayout("default-data-layout",
 
 
 
-static inline void addPass(PassManagerBase &PM, Pass *P) {
+static inline void addPass(legacy::PassManagerBase &PM, Pass *P) {
   // Add the pass to the pass manager...
   PM.add(P);
 
@@ -194,7 +195,8 @@ static inline void addPass(PassManagerBase &PM, Pass *P) {
 /// OptLevel.
 ///
 /// OptLevel - Optimization Level
-static void AddOptimizationPasses(PassManagerBase &MPM,FunctionPassManager &FPM,
+static void AddOptimizationPasses(legacy::PassManagerBase &MPM,
+                                  legacy::FunctionPassManager &FPM,
                                   unsigned OptLevel, unsigned SizeLevel) {
   FPM.add(createVerifierPass());          // Verify that input is correct
   MPM.add(createDebugInfoVerifierPass()); // Verify that debug info is correct
@@ -229,7 +231,7 @@ static void AddOptimizationPasses(PassManagerBase &MPM,FunctionPassManager &FPM,
   Builder.populateModulePassManager(MPM);
 }
 
-static void AddStandardLinkPasses(PassManagerBase &PM) {
+static void AddStandardLinkPasses(legacy::PassManagerBase &PM) {
   PassManagerBuilder Builder;
   Builder.VerifyInput = true;
   Builder.StripDebug = StripDebug;
@@ -307,7 +309,6 @@ int main(int argc, char **argv) {
   // Initialize passes
   PassRegistry &Registry = *PassRegistry::getPassRegistry();
   initializeCore(Registry);
-  initializeDebugIRPass(Registry);
   initializeScalarOpts(Registry);
   initializeObjCARCOpts(Registry);
   initializeVectorization(Registry);
@@ -323,6 +324,8 @@ int main(int argc, char **argv) {
   initializeCodeGenPreparePass(Registry);
   initializeAtomicExpandPass(Registry);
   initializeRewriteSymbolsPass(Registry);
+  initializeWinEHPreparePass(Registry);
+  initializeDwarfEHPreparePass(Registry);
 
 #ifdef LINK_POLLY_INTO_TOOLS
   polly::initializePollyPasses(Registry);
@@ -341,7 +344,7 @@ int main(int argc, char **argv) {
   // Load the input module...
   std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
 
-  if (!M.get()) {
+  if (!M) {
     Err.print(argv[0], errs());
     return 1;
   }
@@ -369,6 +372,12 @@ int main(int argc, char **argv) {
     }
   }
 
+  Triple ModuleTriple(M->getTargetTriple());
+  TargetMachine *Machine = nullptr;
+  if (ModuleTriple.getArch())
+    Machine = GetTargetMachine(ModuleTriple);
+  std::unique_ptr<TargetMachine> TM(Machine);
+
   // If the output is set to be emitted to standard out, and standard out is a
   // console, print out a warning message and refuse to do it.  We don't
   // impress anyone by spewing tons of binary goo to a terminal.
@@ -390,8 +399,8 @@ int main(int argc, char **argv) {
     // The user has asked to use the new pass manager and provided a pipeline
     // string. Hand off the rest of the functionality to the new code for that
     // layer.
-    return runPassPipeline(argv[0], Context, *M.get(), Out.get(), PassPipeline,
-                           OK, VK)
+    return runPassPipeline(argv[0], Context, *M, TM.get(), Out.get(),
+                           PassPipeline, OK, VK)
                ? 0
                : 1;
   }
@@ -399,44 +408,37 @@ int main(int argc, char **argv) {
   // Create a PassManager to hold and optimize the collection of passes we are
   // about to build.
   //
-  PassManager Passes;
+  legacy::PassManager Passes;
 
   // Add an appropriate TargetLibraryInfo pass for the module's triple.
-  TargetLibraryInfo *TLI = new TargetLibraryInfo(Triple(M->getTargetTriple()));
+  TargetLibraryInfoImpl TLII(ModuleTriple);
 
   // The -disable-simplify-libcalls flag actually disables all builtin optzns.
   if (DisableSimplifyLibCalls)
-    TLI->disableAllFunctions();
-  Passes.add(TLI);
+    TLII.disableAllFunctions();
+  Passes.add(new TargetLibraryInfoWrapperPass(TLII));
 
   // Add an appropriate DataLayout instance for this module.
-  const DataLayout *DL = M.get()->getDataLayout();
+  const DataLayout *DL = M->getDataLayout();
   if (!DL && !DefaultDataLayout.empty()) {
     M->setDataLayout(DefaultDataLayout);
-    DL = M.get()->getDataLayout();
+    DL = M->getDataLayout();
   }
 
   if (DL)
     Passes.add(new DataLayoutPass());
 
-  Triple ModuleTriple(M->getTargetTriple());
-  TargetMachine *Machine = nullptr;
-  if (ModuleTriple.getArch())
-    Machine = GetTargetMachine(Triple(ModuleTriple));
-  std::unique_ptr<TargetMachine> TM(Machine);
-
   // Add internal analysis passes from the target machine.
-  if (TM.get())
-    TM->addAnalysisPasses(Passes);
+  Passes.add(createTargetTransformInfoWrapperPass(TM ? TM->getTargetIRAnalysis()
+                                                     : TargetIRAnalysis()));
 
-  std::unique_ptr<FunctionPassManager> FPasses;
+  std::unique_ptr<legacy::FunctionPassManager> FPasses;
   if (OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || OptLevelO3) {
-    FPasses.reset(new FunctionPassManager(M.get()));
+    FPasses.reset(new legacy::FunctionPassManager(M.get()));
     if (DL)
       FPasses->add(new DataLayoutPass());
-    if (TM.get())
-      TM->addAnalysisPasses(*FPasses);
-
+    FPasses->add(createTargetTransformInfoWrapperPass(
+        TM ? TM->getTargetIRAnalysis() : TargetIRAnalysis()));
   }
 
   if (PrintBreakpoints) {
@@ -446,7 +448,8 @@ int main(int argc, char **argv) {
         OutputFilename = "-";
 
       std::error_code EC;
-      Out.reset(new tool_output_file(OutputFilename, EC, sys::fs::F_None));
+      Out = llvm::make_unique<tool_output_file>(OutputFilename, EC,
+                                                sys::fs::F_None);
       if (EC) {
         errs() << EC.message() << '\n';
         return 1;
@@ -556,8 +559,8 @@ int main(int argc, char **argv) {
 
   if (OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || OptLevelO3) {
     FPasses->doInitialization();
-    for (Module::iterator F = M->begin(), E = M->end(); F != E; ++F)
-      FPasses->run(*F);
+    for (Function &F : *M)
+      FPasses->run(F);
     FPasses->doFinalization();
   }
 
@@ -579,7 +582,7 @@ int main(int argc, char **argv) {
   cl::PrintOptionValues();
 
   // Now that we have all of the passes ready, run them.
-  Passes.run(*M.get());
+  Passes.run(*M);
 
   // Declare success.
   if (!NoOutput || PrintBreakpoints)
diff --git a/tools/verify-uselistorder/verify-uselistorder.cpp b/tools/verify-uselistorder/verify-uselistorder.cpp
index 992a5b0..a653608 100644
--- a/tools/verify-uselistorder/verify-uselistorder.cpp
+++ b/tools/verify-uselistorder/verify-uselistorder.cpp
@@ -197,9 +197,12 @@ ValueMapping::ValueMapping(const Module &M) {
       map(G.getInitializer());
   for (const GlobalAlias &A : M.aliases())
     map(A.getAliasee());
-  for (const Function &F : M)
+  for (const Function &F : M) {
     if (F.hasPrefixData())
       map(F.getPrefixData());
+    if (F.hasPrologueData())
+      map(F.getPrologueData());
+  }
 
   // Function bodies.
   for (const Function &F : M) {
@@ -463,9 +466,12 @@ static void changeUseLists(Module &M, Changer changeValueUseList) {
       changeValueUseList(G.getInitializer());
   for (GlobalAlias &A : M.aliases())
     changeValueUseList(A.getAliasee());
-  for (Function &F : M)
+  for (Function &F : M) {
     if (F.hasPrefixData())
       changeValueUseList(F.getPrefixData());
+    if (F.hasPrologueData())
+      changeValueUseList(F.getPrologueData());
+  }
 
   // Function bodies.
   for (Function &F : M) {
diff --git a/tools/yaml2obj/yaml2coff.cpp b/tools/yaml2obj/yaml2coff.cpp
index 6983e9d..8322b2f 100644
--- a/tools/yaml2obj/yaml2coff.cpp
+++ b/tools/yaml2obj/yaml2coff.cpp
@@ -13,13 +13,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "yaml2obj.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Object/COFFYAML.h"
 #include "llvm/Object/COFF.h"
+#include "llvm/Object/COFFYAML.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
diff --git a/tools/yaml2obj/yaml2elf.cpp b/tools/yaml2obj/yaml2elf.cpp
index 44c8c12..5d4c263 100644
--- a/tools/yaml2obj/yaml2elf.cpp
+++ b/tools/yaml2obj/yaml2elf.cpp
@@ -131,6 +131,8 @@ class ELFState {
   bool writeSectionContent(Elf_Shdr &SHeader,
                            const ELFYAML::RelocationSection &Section,
                            ContiguousBlobAccumulator &CBA);
+  bool writeSectionContent(Elf_Shdr &SHeader, const ELFYAML::Group &Group,
+                           ContiguousBlobAccumulator &CBA);
 
   // - SHT_NULL entry (placed first, i.e. 0'th entry)
   // - symbol table (.symtab) (placed third to last)
@@ -223,6 +225,16 @@ bool ELFState<ELFT>::initSectionHeaders(std::vector<Elf_Shdr> &SHeaders,
 
       if (!writeSectionContent(SHeader, *S, CBA))
         return false;
+    } else if (auto S = dyn_cast<ELFYAML::Group>(Sec.get())) {
+      unsigned SymIdx;
+      if (SymN2I.lookup(S->Info, SymIdx)) {
+        errs() << "error: Unknown symbol referenced: '" << S->Info
+               << "' at YAML section '" << S->Name << "'.\n";
+        return false;
+      }
+      SHeader.sh_info = SymIdx;
+      if (!writeSectionContent(SHeader, *S, CBA))
+        return false;
     } else
       llvm_unreachable("Unknown section type");
 
@@ -321,6 +333,12 @@ ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
   SHeader.sh_size = Section.Size;
 }
 
+static bool isMips64EL(const ELFYAML::Object &Doc) {
+  return Doc.Header.Machine == ELFYAML::ELF_EM(llvm::ELF::EM_MIPS) &&
+         Doc.Header.Class == ELFYAML::ELF_ELFCLASS(ELF::ELFCLASS64) &&
+         Doc.Header.Data == ELFYAML::ELF_ELFDATA(ELF::ELFDATA2LSB);
+}
+
 template <class ELFT>
 bool
 ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
@@ -351,19 +369,51 @@ ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
       zero(REntry);
       REntry.r_offset = Rel.Offset;
       REntry.r_addend = Rel.Addend;
-      REntry.setSymbolAndType(SymIdx, Rel.Type);
+      REntry.setSymbolAndType(SymIdx, Rel.Type, isMips64EL(Doc));
       OS.write((const char *)&REntry, sizeof(REntry));
     } else {
       Elf_Rel REntry;
       zero(REntry);
       REntry.r_offset = Rel.Offset;
-      REntry.setSymbolAndType(SymIdx, Rel.Type);
+      REntry.setSymbolAndType(SymIdx, Rel.Type, isMips64EL(Doc));
       OS.write((const char *)&REntry, sizeof(REntry));
     }
   }
   return true;
 }
 
+template <class ELFT>
+bool ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
+                                         const ELFYAML::Group &Section,
+                                         ContiguousBlobAccumulator &CBA) {
+  typedef typename object::ELFFile<ELFT>::Elf_Word Elf_Word;
+  if (Section.Type != llvm::ELF::SHT_GROUP) {
+    errs() << "error: Invalid section type.\n";
+    return false;
+  }
+
+  SHeader.sh_entsize = sizeof(Elf_Word);
+  SHeader.sh_size = SHeader.sh_entsize * Section.Members.size();
+
+  auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset);
+
+  for (auto member : Section.Members) {
+    Elf_Word SIdx;
+    unsigned int sectionIndex = 0;
+    if (member.sectionNameOrType == "GRP_COMDAT")
+      sectionIndex = llvm::ELF::GRP_COMDAT;
+    else if (SN2I.lookup(member.sectionNameOrType, sectionIndex)) {
+      errs() << "error: Unknown section referenced: '"
+             << member.sectionNameOrType << "' at YAML section' "
+             << Section.Name << "\n";
+      return false;
+    }
+    SIdx = sectionIndex;
+    OS.write((const char *)&SIdx, sizeof(SIdx));
+  }
+  return true;
+}
+
 template <class ELFT> bool ELFState<ELFT>::buildSectionIndex() {
   SN2I.addName(".symtab", getDotSymTabSecNo());
   SN2I.addName(".strtab", getDotStrTabSecNo());
diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp
index c7ec16b..8b82fb2 100644
--- a/unittests/ADT/APFloatTest.cpp
+++ b/unittests/ADT/APFloatTest.cpp
@@ -475,6 +475,47 @@ TEST(APFloatTest, FMA) {
     EXPECT_EQ(12.0f, f1.convertToFloat());
   }
 
+  // Test for correct zero sign when answer is exactly zero.
+  // fma(1.0, -1.0, 1.0) -> +ve 0.
+  {
+    APFloat f1(1.0);
+    APFloat f2(-1.0);
+    APFloat f3(1.0);
+    f1.fusedMultiplyAdd(f2, f3, APFloat::rmNearestTiesToEven);
+    EXPECT_TRUE(!f1.isNegative() && f1.isZero());
+  }
+
+  // Test for correct zero sign when answer is exactly zero and rounding towards
+  // negative.
+  // fma(1.0, -1.0, 1.0) -> +ve 0.
+  {
+    APFloat f1(1.0);
+    APFloat f2(-1.0);
+    APFloat f3(1.0);
+    f1.fusedMultiplyAdd(f2, f3, APFloat::rmTowardNegative);
+    EXPECT_TRUE(f1.isNegative() && f1.isZero());
+  }
+
+  // Test for correct (in this case -ve) sign when adding like signed zeros.
+  // Test fma(0.0, -0.0, -0.0) -> -ve 0.
+  {
+    APFloat f1(0.0);
+    APFloat f2(-0.0);
+    APFloat f3(-0.0);
+    f1.fusedMultiplyAdd(f2, f3, APFloat::rmNearestTiesToEven);
+    EXPECT_TRUE(f1.isNegative() && f1.isZero());
+  }
+
+  // Test -ve sign preservation when small negative results underflow.
+  {
+    APFloat f1(APFloat::IEEEdouble,  "-0x1p-1074");
+    APFloat f2(APFloat::IEEEdouble, "+0x1p-1074");
+    APFloat f3(0.0);
+    f1.fusedMultiplyAdd(f2, f3, APFloat::rmNearestTiesToEven);
+    EXPECT_TRUE(f1.isNegative() && f1.isZero());
+  }
+
+  // Test x87 extended precision case from http://llvm.org/PR20728.
   {
     APFloat M1(APFloat::x87DoubleExtended, 1.0);
     APFloat M2(APFloat::x87DoubleExtended, 1.0);
diff --git a/unittests/ADT/APIntTest.cpp b/unittests/ADT/APIntTest.cpp
index 8198c71..3b7ac5b 100644
--- a/unittests/ADT/APIntTest.cpp
+++ b/unittests/ADT/APIntTest.cpp
@@ -678,6 +678,14 @@ TEST(APIntTest, nearestLogBase2) {
   EXPECT_EQ(A9.nearestLogBase2(), UINT32_MAX);
 }
 
+#if defined(__clang__)
+// Disable the pragma warning from versions of Clang without -Wself-move
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunknown-pragmas"
+// Disable the warning that triggers on exactly what is being tested.
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wself-move"
+#endif
 TEST(APIntTest, SelfMoveAssignment) {
   APInt X(32, 0xdeadbeef);
   X = std::move(X);
@@ -694,5 +702,8 @@ TEST(APIntTest, SelfMoveAssignment) {
   EXPECT_EQ(0xdeadbeefdeadbeefULL, Raw[0]);
   EXPECT_EQ(0xdeadbeefdeadbeefULL, Raw[1]);
 }
-
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#pragma clang diagnostic pop
+#endif
 }
diff --git a/unittests/ADT/APSIntTest.cpp b/unittests/ADT/APSIntTest.cpp
index eef9c8a..5e4e874 100644
--- a/unittests/ADT/APSIntTest.cpp
+++ b/unittests/ADT/APSIntTest.cpp
@@ -41,4 +41,106 @@ TEST(APSIntTest, MoveTest) {
   EXPECT_EQ(Bits, A.getRawData()); // Verify that "Wide" was really moved.
 }
 
+TEST(APSIntTest, get) {
+  EXPECT_TRUE(APSInt::get(7).isSigned());
+  EXPECT_EQ(64u, APSInt::get(7).getBitWidth());
+  EXPECT_EQ(7u, APSInt::get(7).getZExtValue());
+  EXPECT_EQ(7, APSInt::get(7).getSExtValue());
+  EXPECT_TRUE(APSInt::get(-7).isSigned());
+  EXPECT_EQ(64u, APSInt::get(-7).getBitWidth());
+  EXPECT_EQ(-7, APSInt::get(-7).getSExtValue());
+  EXPECT_EQ(UINT64_C(0) - 7, APSInt::get(-7).getZExtValue());
+}
+
+TEST(APSIntTest, getUnsigned) {
+  EXPECT_TRUE(APSInt::getUnsigned(7).isUnsigned());
+  EXPECT_EQ(64u, APSInt::getUnsigned(7).getBitWidth());
+  EXPECT_EQ(7u, APSInt::getUnsigned(7).getZExtValue());
+  EXPECT_EQ(7, APSInt::getUnsigned(7).getSExtValue());
+  EXPECT_TRUE(APSInt::getUnsigned(-7).isUnsigned());
+  EXPECT_EQ(64u, APSInt::getUnsigned(-7).getBitWidth());
+  EXPECT_EQ(-7, APSInt::getUnsigned(-7).getSExtValue());
+  EXPECT_EQ(UINT64_C(0) - 7, APSInt::getUnsigned(-7).getZExtValue());
+}
+
+TEST(APSIntTest, getExtValue) {
+  EXPECT_TRUE(APSInt(APInt(3, 7), true).isUnsigned());
+  EXPECT_TRUE(APSInt(APInt(3, 7), false).isSigned());
+  EXPECT_TRUE(APSInt(APInt(4, 7), true).isUnsigned());
+  EXPECT_TRUE(APSInt(APInt(4, 7), false).isSigned());
+  EXPECT_TRUE(APSInt(APInt(4, -7), true).isUnsigned());
+  EXPECT_TRUE(APSInt(APInt(4, -7), false).isSigned());
+  EXPECT_EQ(7, APSInt(APInt(3, 7), true).getExtValue());
+  EXPECT_EQ(-1, APSInt(APInt(3, 7), false).getExtValue());
+  EXPECT_EQ(7, APSInt(APInt(4, 7), true).getExtValue());
+  EXPECT_EQ(7, APSInt(APInt(4, 7), false).getExtValue());
+  EXPECT_EQ(9, APSInt(APInt(4, -7), true).getExtValue());
+  EXPECT_EQ(-7, APSInt(APInt(4, -7), false).getExtValue());
+}
+
+TEST(APSIntTest, compareValues) {
+  auto U = [](uint64_t V) { return APSInt::getUnsigned(V); };
+  auto S = [](int64_t V) { return APSInt::get(V); };
+
+  // Bit-width matches and is-signed.
+  EXPECT_TRUE(APSInt::compareValues(S(7), S(8)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(S(8), S(7)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(S(7), S(7)) == 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7), S(8)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(S(8), S(-7)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7), S(-7)) == 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7), S(-8)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-8), S(-7)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7), S(-7)) == 0);
+
+  // Bit-width matches and not is-signed.
+  EXPECT_TRUE(APSInt::compareValues(U(7), U(8)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(U(8), U(7)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(U(7), U(7)) == 0);
+
+  // Bit-width matches and mixed signs.
+  EXPECT_TRUE(APSInt::compareValues(U(7), S(8)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(U(8), S(7)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(U(7), S(7)) == 0);
+  EXPECT_TRUE(APSInt::compareValues(U(8), S(-7)) > 0);
+
+  // Bit-width mismatch and is-signed.
+  EXPECT_TRUE(APSInt::compareValues(S(7).trunc(32), S(8)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(S(8).trunc(32), S(7)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(S(7).trunc(32), S(7)) == 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7).trunc(32), S(8)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(S(8).trunc(32), S(-7)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7).trunc(32), S(-7)) == 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7).trunc(32), S(-8)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-8).trunc(32), S(-7)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7).trunc(32), S(-7)) == 0);
+  EXPECT_TRUE(APSInt::compareValues(S(7), S(8).trunc(32)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(S(8), S(7).trunc(32)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(S(7), S(7).trunc(32)) == 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7), S(8).trunc(32)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(S(8), S(-7).trunc(32)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7), S(-7).trunc(32)) == 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7), S(-8).trunc(32)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-8), S(-7).trunc(32)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(S(-7), S(-7).trunc(32)) == 0);
+
+  // Bit-width mismatch and not is-signed.
+  EXPECT_TRUE(APSInt::compareValues(U(7), U(8).trunc(32)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(U(8), U(7).trunc(32)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(U(7), U(7).trunc(32)) == 0);
+  EXPECT_TRUE(APSInt::compareValues(U(7).trunc(32), U(8)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(U(8).trunc(32), U(7)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(U(7).trunc(32), U(7)) == 0);
+
+  // Bit-width mismatch and mixed signs.
+  EXPECT_TRUE(APSInt::compareValues(U(7).trunc(32), S(8)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(U(8).trunc(32), S(7)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(U(7).trunc(32), S(7)) == 0);
+  EXPECT_TRUE(APSInt::compareValues(U(8).trunc(32), S(-7)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(U(7), S(8).trunc(32)) < 0);
+  EXPECT_TRUE(APSInt::compareValues(U(8), S(7).trunc(32)) > 0);
+  EXPECT_TRUE(APSInt::compareValues(U(7), S(7).trunc(32)) == 0);
+  EXPECT_TRUE(APSInt::compareValues(U(8), S(-7).trunc(32)) > 0);
+}
+
 }
diff --git a/unittests/ADT/ArrayRefTest.cpp b/unittests/ADT/ArrayRefTest.cpp
index f9c98a5..70f8208 100644
--- a/unittests/ADT/ArrayRefTest.cpp
+++ b/unittests/ADT/ArrayRefTest.cpp
@@ -11,6 +11,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/raw_ostream.h"
 #include "gtest/gtest.h"
+#include <vector>
 using namespace llvm;
 
 // Check that the ArrayRef-of-pointer converting constructor only allows adding
@@ -90,4 +91,24 @@ TEST(ArrayRefTest, ConstConvert) {
   a = ArrayRef<int *>(A);
 }
 
+static std::vector<int> ReturnTest12() { return {1, 2}; }
+static void ArgTest12(ArrayRef<int> A) {
+  EXPECT_EQ(2U, A.size());
+  EXPECT_EQ(1, A[0]);
+  EXPECT_EQ(2, A[1]);
+}
+
+TEST(ArrayRefTest, InitializerList) {
+  ArrayRef<int> A = { 0, 1, 2, 3, 4 };
+  for (int i = 0; i < 5; ++i)
+    EXPECT_EQ(i, A[i]);
+
+  std::vector<int> B = ReturnTest12();
+  A = B;
+  EXPECT_EQ(1, A[0]);
+  EXPECT_EQ(2, A[1]);
+
+  ArgTest12({1, 2});
+}
+
 } // end anonymous namespace
diff --git a/unittests/ADT/HashingTest.cpp b/unittests/ADT/HashingTest.cpp
index acaa83c..34eb5a5 100644
--- a/unittests/ADT/HashingTest.cpp
+++ b/unittests/ADT/HashingTest.cpp
@@ -421,4 +421,29 @@ TEST(HashingTest, HashCombineBasicTest) {
             hash_combine(bigarr[0], l2, bigarr[9], l3, bigarr[18], bigarr[19]));
 }
 
+TEST(HashingTest, HashCombineArgs18) {
+  // This tests that we can pass in up to 18 args.
+#define CHECK_SAME(...)                                                        \
+  EXPECT_EQ(hash_combine(__VA_ARGS__), hash_combine(__VA_ARGS__))
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18);
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9);
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8);
+  CHECK_SAME(1, 2, 3, 4, 5, 6, 7);
+  CHECK_SAME(1, 2, 3, 4, 5, 6);
+  CHECK_SAME(1, 2, 3, 4, 5);
+  CHECK_SAME(1, 2, 3, 4);
+  CHECK_SAME(1, 2, 3);
+  CHECK_SAME(1, 2);
+  CHECK_SAME(1);
+#undef CHECK_SAME
+}
+
 }
diff --git a/unittests/ADT/MapVectorTest.cpp b/unittests/ADT/MapVectorTest.cpp
index 8919799..2caa8c7 100644
--- a/unittests/ADT/MapVectorTest.cpp
+++ b/unittests/ADT/MapVectorTest.cpp
@@ -122,3 +122,221 @@ TEST(MapVectorTest, iteration_test) {
     count--;
   }
 }
+
+TEST(SmallMapVectorSmallTest, insert_pop) {
+  SmallMapVector<int, int, 32> MV;
+  std::pair<SmallMapVector<int, int, 32>::iterator, bool> R;
+
+  R = MV.insert(std::make_pair(1, 2));
+  ASSERT_EQ(R.first, MV.begin());
+  EXPECT_EQ(R.first->first, 1);
+  EXPECT_EQ(R.first->second, 2);
+  EXPECT_TRUE(R.second);
+
+  R = MV.insert(std::make_pair(1, 3));
+  ASSERT_EQ(R.first, MV.begin());
+  EXPECT_EQ(R.first->first, 1);
+  EXPECT_EQ(R.first->second, 2);
+  EXPECT_FALSE(R.second);
+
+  R = MV.insert(std::make_pair(4, 5));
+  ASSERT_NE(R.first, MV.end());
+  EXPECT_EQ(R.first->first, 4);
+  EXPECT_EQ(R.first->second, 5);
+  EXPECT_TRUE(R.second);
+
+  EXPECT_EQ(MV.size(), 2u);
+  EXPECT_EQ(MV[1], 2);
+  EXPECT_EQ(MV[4], 5);
+
+  MV.pop_back();
+  EXPECT_EQ(MV.size(), 1u);
+  EXPECT_EQ(MV[1], 2);
+
+  R = MV.insert(std::make_pair(4, 7));
+  ASSERT_NE(R.first, MV.end());
+  EXPECT_EQ(R.first->first, 4);
+  EXPECT_EQ(R.first->second, 7);
+  EXPECT_TRUE(R.second);
+
+  EXPECT_EQ(MV.size(), 2u);
+  EXPECT_EQ(MV[1], 2);
+  EXPECT_EQ(MV[4], 7);
+}
+
+TEST(SmallMapVectorSmallTest, erase) {
+  SmallMapVector<int, int, 32> MV;
+
+  MV.insert(std::make_pair(1, 2));
+  MV.insert(std::make_pair(3, 4));
+  MV.insert(std::make_pair(5, 6));
+  ASSERT_EQ(MV.size(), 3u);
+
+  MV.erase(MV.find(1));
+  ASSERT_EQ(MV.size(), 2u);
+  ASSERT_EQ(MV.find(1), MV.end());
+  ASSERT_EQ(MV[3], 4);
+  ASSERT_EQ(MV[5], 6);
+
+  ASSERT_EQ(MV.erase(3), 1u);
+  ASSERT_EQ(MV.size(), 1u);
+  ASSERT_EQ(MV.find(3), MV.end());
+  ASSERT_EQ(MV[5], 6);
+
+  ASSERT_EQ(MV.erase(79), 0u);
+  ASSERT_EQ(MV.size(), 1u);
+}
+
+TEST(SmallMapVectorSmallTest, remove_if) {
+  SmallMapVector<int, int, 32> MV;
+
+  MV.insert(std::make_pair(1, 11));
+  MV.insert(std::make_pair(2, 12));
+  MV.insert(std::make_pair(3, 13));
+  MV.insert(std::make_pair(4, 14));
+  MV.insert(std::make_pair(5, 15));
+  MV.insert(std::make_pair(6, 16));
+  ASSERT_EQ(MV.size(), 6u);
+
+  MV.remove_if([](const std::pair<int, int> &Val) { return Val.second % 2; });
+  ASSERT_EQ(MV.size(), 3u);
+  ASSERT_EQ(MV.find(1), MV.end());
+  ASSERT_EQ(MV.find(3), MV.end());
+  ASSERT_EQ(MV.find(5), MV.end());
+  ASSERT_EQ(MV[2], 12);
+  ASSERT_EQ(MV[4], 14);
+  ASSERT_EQ(MV[6], 16);
+}
+
+TEST(SmallMapVectorSmallTest, iteration_test) {
+  SmallMapVector<int, int, 32> MV;
+
+  MV.insert(std::make_pair(1, 11));
+  MV.insert(std::make_pair(2, 12));
+  MV.insert(std::make_pair(3, 13));
+  MV.insert(std::make_pair(4, 14));
+  MV.insert(std::make_pair(5, 15));
+  MV.insert(std::make_pair(6, 16));
+  ASSERT_EQ(MV.size(), 6u);
+
+  int count = 1;
+  for (auto P : make_range(MV.begin(), MV.end())) {
+    ASSERT_EQ(P.first, count);
+    count++;
+  }
+
+  count = 6;
+  for (auto P : make_range(MV.rbegin(), MV.rend())) {
+    ASSERT_EQ(P.first, count);
+    count--;
+  }
+}
+
+TEST(SmallMapVectorLargeTest, insert_pop) {
+  SmallMapVector<int, int, 1> MV;
+  std::pair<SmallMapVector<int, int, 1>::iterator, bool> R;
+
+  R = MV.insert(std::make_pair(1, 2));
+  ASSERT_EQ(R.first, MV.begin());
+  EXPECT_EQ(R.first->first, 1);
+  EXPECT_EQ(R.first->second, 2);
+  EXPECT_TRUE(R.second);
+
+  R = MV.insert(std::make_pair(1, 3));
+  ASSERT_EQ(R.first, MV.begin());
+  EXPECT_EQ(R.first->first, 1);
+  EXPECT_EQ(R.first->second, 2);
+  EXPECT_FALSE(R.second);
+
+  R = MV.insert(std::make_pair(4, 5));
+  ASSERT_NE(R.first, MV.end());
+  EXPECT_EQ(R.first->first, 4);
+  EXPECT_EQ(R.first->second, 5);
+  EXPECT_TRUE(R.second);
+
+  EXPECT_EQ(MV.size(), 2u);
+  EXPECT_EQ(MV[1], 2);
+  EXPECT_EQ(MV[4], 5);
+
+  MV.pop_back();
+  EXPECT_EQ(MV.size(), 1u);
+  EXPECT_EQ(MV[1], 2);
+
+  R = MV.insert(std::make_pair(4, 7));
+  ASSERT_NE(R.first, MV.end());
+  EXPECT_EQ(R.first->first, 4);
+  EXPECT_EQ(R.first->second, 7);
+  EXPECT_TRUE(R.second);
+
+  EXPECT_EQ(MV.size(), 2u);
+  EXPECT_EQ(MV[1], 2);
+  EXPECT_EQ(MV[4], 7);
+}
+
+TEST(SmallMapVectorLargeTest, erase) {
+  SmallMapVector<int, int, 1> MV;
+
+  MV.insert(std::make_pair(1, 2));
+  MV.insert(std::make_pair(3, 4));
+  MV.insert(std::make_pair(5, 6));
+  ASSERT_EQ(MV.size(), 3u);
+
+  MV.erase(MV.find(1));
+  ASSERT_EQ(MV.size(), 2u);
+  ASSERT_EQ(MV.find(1), MV.end());
+  ASSERT_EQ(MV[3], 4);
+  ASSERT_EQ(MV[5], 6);
+
+  ASSERT_EQ(MV.erase(3), 1u);
+  ASSERT_EQ(MV.size(), 1u);
+  ASSERT_EQ(MV.find(3), MV.end());
+  ASSERT_EQ(MV[5], 6);
+
+  ASSERT_EQ(MV.erase(79), 0u);
+  ASSERT_EQ(MV.size(), 1u);
+}
+
+TEST(SmallMapVectorLargeTest, remove_if) {
+  SmallMapVector<int, int, 1> MV;
+
+  MV.insert(std::make_pair(1, 11));
+  MV.insert(std::make_pair(2, 12));
+  MV.insert(std::make_pair(3, 13));
+  MV.insert(std::make_pair(4, 14));
+  MV.insert(std::make_pair(5, 15));
+  MV.insert(std::make_pair(6, 16));
+  ASSERT_EQ(MV.size(), 6u);
+
+  MV.remove_if([](const std::pair<int, int> &Val) { return Val.second % 2; });
+  ASSERT_EQ(MV.size(), 3u);
+  ASSERT_EQ(MV.find(1), MV.end());
+  ASSERT_EQ(MV.find(3), MV.end());
+  ASSERT_EQ(MV.find(5), MV.end());
+  ASSERT_EQ(MV[2], 12);
+  ASSERT_EQ(MV[4], 14);
+  ASSERT_EQ(MV[6], 16);
+}
+
+TEST(SmallMapVectorLargeTest, iteration_test) {
+  SmallMapVector<int, int, 1> MV;
+
+  MV.insert(std::make_pair(1, 11));
+  MV.insert(std::make_pair(2, 12));
+  MV.insert(std::make_pair(3, 13));
+  MV.insert(std::make_pair(4, 14));
+  MV.insert(std::make_pair(5, 15));
+  MV.insert(std::make_pair(6, 16));
+  ASSERT_EQ(MV.size(), 6u);
+
+  int count = 1;
+  for (auto P : make_range(MV.begin(), MV.end())) {
+    ASSERT_EQ(P.first, count);
+    count++;
+  }
+
+  count = 6;
+  for (auto P : make_range(MV.rbegin(), MV.rend())) {
+    ASSERT_EQ(P.first, count);
+    count--;
+  }
+}
diff --git a/unittests/ADT/OptionalTest.cpp b/unittests/ADT/OptionalTest.cpp
index cadadce..92c4eec 100644
--- a/unittests/ADT/OptionalTest.cpp
+++ b/unittests/ADT/OptionalTest.cpp
@@ -183,10 +183,10 @@ struct MultiArgConstructor {
   explicit MultiArgConstructor(int x, bool positive)
     : x(x), y(positive ? x : -x) {}
 
-  MultiArgConstructor(const MultiArgConstructor &) LLVM_DELETED_FUNCTION;
-  MultiArgConstructor(MultiArgConstructor &&) LLVM_DELETED_FUNCTION;
-  MultiArgConstructor &operator=(const MultiArgConstructor &) LLVM_DELETED_FUNCTION;
-  MultiArgConstructor &operator=(MultiArgConstructor &&) LLVM_DELETED_FUNCTION;
+  MultiArgConstructor(const MultiArgConstructor &) = delete;
+  MultiArgConstructor(MultiArgConstructor &&) = delete;
+  MultiArgConstructor &operator=(const MultiArgConstructor &) = delete;
+  MultiArgConstructor &operator=(MultiArgConstructor &&) = delete;
 
   static unsigned Destructions;
   ~MultiArgConstructor() {
@@ -340,7 +340,7 @@ struct Immovable {
   }
 private:
   // This should disable all move/copy operations.
-  Immovable(Immovable&& other) LLVM_DELETED_FUNCTION;
+  Immovable(Immovable&& other) = delete;
 };
 
 unsigned Immovable::Constructions = 0;
diff --git a/unittests/ADT/PointerIntPairTest.cpp b/unittests/ADT/PointerIntPairTest.cpp
index 296d475..0bbcd9f 100644
--- a/unittests/ADT/PointerIntPairTest.cpp
+++ b/unittests/ADT/PointerIntPairTest.cpp
@@ -42,7 +42,6 @@ TEST_F(PointerIntPairTest, DefaultInitialize) {
   EXPECT_EQ(0U, Pair.getInt());
 }
 
-#if !(defined(_MSC_VER) && _MSC_VER==1700)
 TEST_F(PointerIntPairTest, ManyUnusedBits) {
   // In real code this would be a word-sized integer limited to 31 bits.
   struct Fixnum31 {
@@ -71,6 +70,5 @@ TEST_F(PointerIntPairTest, ManyUnusedBits) {
   EXPECT_EQ(FixnumPointerTraits::NumLowBitsAvailable - 1,
             PointerLikeTypeTraits<decltype(pair)>::NumLowBitsAvailable);
 }
-#endif
 
 } // end anonymous namespace
diff --git a/unittests/ADT/SmallVectorTest.cpp b/unittests/ADT/SmallVectorTest.cpp
index 95bf33e..2bbb4ed 100644
--- a/unittests/ADT/SmallVectorTest.cpp
+++ b/unittests/ADT/SmallVectorTest.cpp
@@ -144,8 +144,8 @@ struct NonCopyable {
   NonCopyable(NonCopyable &&) {}
   NonCopyable &operator=(NonCopyable &&) { return *this; }
 private:
-  NonCopyable(const NonCopyable &) LLVM_DELETED_FUNCTION;
-  NonCopyable &operator=(const NonCopyable &) LLVM_DELETED_FUNCTION;
+  NonCopyable(const NonCopyable &) = delete;
+  NonCopyable &operator=(const NonCopyable &) = delete;
 };
 
 LLVM_ATTRIBUTE_USED void CompileTest() {
@@ -153,17 +153,14 @@ LLVM_ATTRIBUTE_USED void CompileTest() {
   V.resize(42);
 }
 
-// Test fixture class
-template <typename VectorT>
-class SmallVectorTest : public testing::Test {
+class SmallVectorTestBase : public testing::Test {
 protected:
-  VectorT theVector;
-  VectorT otherVector;
 
   void SetUp() {
     Constructable::reset();
   }
 
+  template <typename VectorT>
   void assertEmpty(VectorT & v) {
     // Size tests
     EXPECT_EQ(0u, v.size());
@@ -173,7 +170,8 @@ protected:
     EXPECT_TRUE(v.begin() == v.end());
   }
 
-  // Assert that theVector contains the specified values, in order.
+  // Assert that v contains the specified values, in order.
+  template <typename VectorT>
   void assertValuesInOrder(VectorT & v, size_t size, ...) {
     EXPECT_EQ(size, v.size());
 
@@ -188,6 +186,7 @@ protected:
   }
 
   // Generate a sequence of values to initialize the vector.
+  template <typename VectorT>
   void makeSequence(VectorT & v, int start, int end) {
     for (int i = start; i <= end; ++i) {
       v.push_back(Constructable(i));
@@ -195,6 +194,15 @@ protected:
   }
 };
 
+// Test fixture class
+template <typename VectorT>
+class SmallVectorTest : public SmallVectorTestBase {
+protected:
+  VectorT theVector;
+  VectorT otherVector;
+};
+
+
 typedef ::testing::Types<SmallVector<Constructable, 0>,
                          SmallVector<Constructable, 1>,
                          SmallVector<Constructable, 2>,
@@ -664,6 +672,67 @@ TYPED_TEST(SmallVectorTest, IteratorTest) {
   this->theVector.insert(this->theVector.end(), L.begin(), L.end());
 }
 
+template <typename InvalidType> class DualSmallVectorsTest;
+
+template <typename VectorT1, typename VectorT2>
+class DualSmallVectorsTest<std::pair<VectorT1, VectorT2>> : public SmallVectorTestBase {
+protected:
+  VectorT1 theVector;
+  VectorT2 otherVector;
+
+  template <typename T, unsigned N>
+  static unsigned NumBuiltinElts(const SmallVector<T, N>&) { return N; }
+};
+
+typedef ::testing::Types<
+    // Small mode -> Small mode.
+    std::pair<SmallVector<Constructable, 4>, SmallVector<Constructable, 4>>,
+    // Small mode -> Big mode.
+    std::pair<SmallVector<Constructable, 4>, SmallVector<Constructable, 2>>,
+    // Big mode -> Small mode.
+    std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 4>>,
+    // Big mode -> Big mode.
+    std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 2>>
+  > DualSmallVectorTestTypes;
+
+TYPED_TEST_CASE(DualSmallVectorsTest, DualSmallVectorTestTypes);
+
+TYPED_TEST(DualSmallVectorsTest, MoveAssignment) {
+  SCOPED_TRACE("MoveAssignTest-DualVectorTypes");
+
+  // Set up our vector with four elements.
+  for (unsigned I = 0; I < 4; ++I)
+    this->otherVector.push_back(Constructable(I));
+
+  const Constructable *OrigDataPtr = this->otherVector.data();
+
+  // Move-assign from the other vector.
+  this->theVector =
+    std::move(static_cast<SmallVectorImpl<Constructable>&>(this->otherVector));
+
+  // Make sure we have the right result.
+  this->assertValuesInOrder(this->theVector, 4u, 0, 1, 2, 3);
+
+  // Make sure the # of constructor/destructor calls line up. There
+  // are two live objects after clearing the other vector.
+  this->otherVector.clear();
+  EXPECT_EQ(Constructable::getNumConstructorCalls()-4,
+            Constructable::getNumDestructorCalls());
+
+  // If the source vector (otherVector) was in small-mode, assert that we just
+  // moved the data pointer over.
+  EXPECT_TRUE(this->NumBuiltinElts(this->otherVector) == 4 ||
+              this->theVector.data() == OrigDataPtr);
+
+  // There shouldn't be any live objects any more.
+  this->theVector.clear();
+  EXPECT_EQ(Constructable::getNumConstructorCalls(),
+            Constructable::getNumDestructorCalls());
+
+  // We shouldn't have copied anything in this whole process.
+  EXPECT_EQ(Constructable::getNumCopyConstructorCalls(), 0);
+}
+
 struct notassignable {
   int &x;
   notassignable(int &x) : x(x) {}
@@ -699,4 +768,142 @@ TEST(SmallVectorTest, MidInsert) {
     EXPECT_TRUE(m.hasValue);
 }
 
+enum EmplaceableArgState {
+  EAS_Defaulted,
+  EAS_Arg,
+  EAS_LValue,
+  EAS_RValue,
+  EAS_Failure
+};
+template <int I> struct EmplaceableArg {
+  EmplaceableArgState State;
+  EmplaceableArg() : State(EAS_Defaulted) {}
+  EmplaceableArg(EmplaceableArg &&X)
+      : State(X.State == EAS_Arg ? EAS_RValue : EAS_Failure) {}
+  EmplaceableArg(EmplaceableArg &X)
+      : State(X.State == EAS_Arg ? EAS_LValue : EAS_Failure) {}
+
+  explicit EmplaceableArg(bool) : State(EAS_Arg) {}
+
+private:
+  EmplaceableArg &operator=(EmplaceableArg &&) = delete;
+  EmplaceableArg &operator=(const EmplaceableArg &) = delete;
+};
+
+enum EmplaceableState { ES_Emplaced, ES_Moved };
+struct Emplaceable {
+  EmplaceableArg<0> A0;
+  EmplaceableArg<1> A1;
+  EmplaceableArg<2> A2;
+  EmplaceableArg<3> A3;
+  EmplaceableState State;
+
+  Emplaceable() : State(ES_Emplaced) {}
+
+  template <class A0Ty>
+  explicit Emplaceable(A0Ty &&A0)
+      : A0(std::forward<A0Ty>(A0)), State(ES_Emplaced) {}
+
+  template <class A0Ty, class A1Ty>
+  Emplaceable(A0Ty &&A0, A1Ty &&A1)
+      : A0(std::forward<A0Ty>(A0)), A1(std::forward<A1Ty>(A1)),
+        State(ES_Emplaced) {}
+
+  template <class A0Ty, class A1Ty, class A2Ty>
+  Emplaceable(A0Ty &&A0, A1Ty &&A1, A2Ty &&A2)
+      : A0(std::forward<A0Ty>(A0)), A1(std::forward<A1Ty>(A1)),
+        A2(std::forward<A2Ty>(A2)), State(ES_Emplaced) {}
+
+  template <class A0Ty, class A1Ty, class A2Ty, class A3Ty>
+  Emplaceable(A0Ty &&A0, A1Ty &&A1, A2Ty &&A2, A3Ty &&A3)
+      : A0(std::forward<A0Ty>(A0)), A1(std::forward<A1Ty>(A1)),
+        A2(std::forward<A2Ty>(A2)), A3(std::forward<A3Ty>(A3)),
+        State(ES_Emplaced) {}
+
+  Emplaceable(Emplaceable &&) : State(ES_Moved) {}
+  Emplaceable &operator=(Emplaceable &&) {
+    State = ES_Moved;
+    return *this;
+  }
+
+private:
+  Emplaceable(const Emplaceable &) = delete;
+  Emplaceable &operator=(const Emplaceable &) = delete;
+};
+
+TEST(SmallVectorTest, EmplaceBack) {
+  EmplaceableArg<0> A0(true);
+  EmplaceableArg<1> A1(true);
+  EmplaceableArg<2> A2(true);
+  EmplaceableArg<3> A3(true);
+  {
+    SmallVector<Emplaceable, 3> V;
+    V.emplace_back();
+    EXPECT_TRUE(V.size() == 1);
+    EXPECT_TRUE(V.back().State == ES_Emplaced);
+    EXPECT_TRUE(V.back().A0.State == EAS_Defaulted);
+    EXPECT_TRUE(V.back().A1.State == EAS_Defaulted);
+    EXPECT_TRUE(V.back().A2.State == EAS_Defaulted);
+    EXPECT_TRUE(V.back().A3.State == EAS_Defaulted);
+  }
+  {
+    SmallVector<Emplaceable, 3> V;
+    V.emplace_back(std::move(A0));
+    EXPECT_TRUE(V.size() == 1);
+    EXPECT_TRUE(V.back().State == ES_Emplaced);
+    EXPECT_TRUE(V.back().A0.State == EAS_RValue);
+    EXPECT_TRUE(V.back().A1.State == EAS_Defaulted);
+    EXPECT_TRUE(V.back().A2.State == EAS_Defaulted);
+    EXPECT_TRUE(V.back().A3.State == EAS_Defaulted);
+  }
+  {
+    SmallVector<Emplaceable, 3> V;
+    V.emplace_back(A0);
+    EXPECT_TRUE(V.size() == 1);
+    EXPECT_TRUE(V.back().State == ES_Emplaced);
+    EXPECT_TRUE(V.back().A0.State == EAS_LValue);
+    EXPECT_TRUE(V.back().A1.State == EAS_Defaulted);
+    EXPECT_TRUE(V.back().A2.State == EAS_Defaulted);
+    EXPECT_TRUE(V.back().A3.State == EAS_Defaulted);
+  }
+  {
+    SmallVector<Emplaceable, 3> V;
+    V.emplace_back(A0, A1);
+    EXPECT_TRUE(V.size() == 1);
+    EXPECT_TRUE(V.back().State == ES_Emplaced);
+    EXPECT_TRUE(V.back().A0.State == EAS_LValue);
+    EXPECT_TRUE(V.back().A1.State == EAS_LValue);
+    EXPECT_TRUE(V.back().A2.State == EAS_Defaulted);
+    EXPECT_TRUE(V.back().A3.State == EAS_Defaulted);
+  }
+  {
+    SmallVector<Emplaceable, 3> V;
+    V.emplace_back(std::move(A0), std::move(A1));
+    EXPECT_TRUE(V.size() == 1);
+    EXPECT_TRUE(V.back().State == ES_Emplaced);
+    EXPECT_TRUE(V.back().A0.State == EAS_RValue);
+    EXPECT_TRUE(V.back().A1.State == EAS_RValue);
+    EXPECT_TRUE(V.back().A2.State == EAS_Defaulted);
+    EXPECT_TRUE(V.back().A3.State == EAS_Defaulted);
+  }
+  {
+    SmallVector<Emplaceable, 3> V;
+    V.emplace_back(std::move(A0), A1, std::move(A2), A3);
+    EXPECT_TRUE(V.size() == 1);
+    EXPECT_TRUE(V.back().State == ES_Emplaced);
+    EXPECT_TRUE(V.back().A0.State == EAS_RValue);
+    EXPECT_TRUE(V.back().A1.State == EAS_LValue);
+    EXPECT_TRUE(V.back().A2.State == EAS_RValue);
+    EXPECT_TRUE(V.back().A3.State == EAS_LValue);
+  }
+  {
+    SmallVector<int, 1> V;
+    V.emplace_back();
+    V.emplace_back(42);
+    EXPECT_EQ(2U, V.size());
+    EXPECT_EQ(0, V[0]);
+    EXPECT_EQ(42, V[1]);
+  }
 }
+
+} // end namespace
diff --git a/unittests/ADT/StringMapTest.cpp b/unittests/ADT/StringMapTest.cpp
index 33d668f..4ed0b76 100644
--- a/unittests/ADT/StringMapTest.cpp
+++ b/unittests/ADT/StringMapTest.cpp
@@ -244,7 +244,7 @@ TEST_F(StringMapTest, InsertRehashingPairTest) {
 // Create a non-default constructable value
 struct StringMapTestStruct {
   StringMapTestStruct(int i) : i(i) {}
-  StringMapTestStruct() LLVM_DELETED_FUNCTION;
+  StringMapTestStruct() = delete;
   int i;
 };
 
@@ -258,7 +258,7 @@ TEST_F(StringMapTest, NonDefaultConstructable) {
 
 struct Immovable {
   Immovable() {}
-  Immovable(Immovable&&) LLVM_DELETED_FUNCTION; // will disable the other special members
+  Immovable(Immovable&&) = delete; // will disable the other special members
 };
 
 struct MoveOnly {
@@ -272,8 +272,8 @@ struct MoveOnly {
   }
 
 private:
-  MoveOnly(const MoveOnly &) LLVM_DELETED_FUNCTION;
-  MoveOnly &operator=(const MoveOnly &) LLVM_DELETED_FUNCTION;
+  MoveOnly(const MoveOnly &) = delete;
+  MoveOnly &operator=(const MoveOnly &) = delete;
 };
 
 TEST_F(StringMapTest, MoveOnly) {
diff --git a/unittests/ADT/TinyPtrVectorTest.cpp b/unittests/ADT/TinyPtrVectorTest.cpp
index ec868d4..294dfac 100644
--- a/unittests/ADT/TinyPtrVectorTest.cpp
+++ b/unittests/ADT/TinyPtrVectorTest.cpp
@@ -412,3 +412,50 @@ TYPED_TEST(TinyPtrVectorTest, InsertRange) {
 }
 
 }
+
+TEST(TinyPtrVectorTest, SingleEltCtorTest) {
+  int v = 55;
+  TinyPtrVector<int *> V(&v);
+
+  EXPECT_TRUE(V.size() == 1);
+  EXPECT_FALSE(V.empty());
+  EXPECT_TRUE(V.front() == &v);
+}
+
+TEST(TinyPtrVectorTest, ArrayRefCtorTest) {
+  int data_array[128];
+  std::vector<int *> data;
+
+  for (unsigned i = 0, e = 128; i != e; ++i) {
+    data_array[i] = 324 - int(i);
+    data.push_back(&data_array[i]);
+  }
+
+  TinyPtrVector<int *> V(data);
+  EXPECT_TRUE(V.size() == 128);
+  EXPECT_FALSE(V.empty());
+  for (unsigned i = 0, e = 128; i != e; ++i) {
+    EXPECT_TRUE(V[i] == data[i]);
+  }
+}
+
+TEST(TinyPtrVectorTest, MutableArrayRefTest) {
+  int data_array[128];
+  std::vector<int *> data;
+
+  for (unsigned i = 0, e = 128; i != e; ++i) {
+    data_array[i] = 324 - int(i);
+    data.push_back(&data_array[i]);
+  }
+
+  TinyPtrVector<int *> V(data);
+  EXPECT_TRUE(V.size() == 128);
+  EXPECT_FALSE(V.empty());
+
+  MutableArrayRef<int *> mut_array = V;
+  for (unsigned i = 0, e = 128; i != e; ++i) {
+    EXPECT_TRUE(mut_array[i] == data[i]);
+    mut_array[i] = 324 + mut_array[i];
+    EXPECT_TRUE(mut_array[i] == (324 + data[i]));
+  }
+}
diff --git a/unittests/ADT/TripleTest.cpp b/unittests/ADT/TripleTest.cpp
index cacbde6..1f9aa32 100644
--- a/unittests/ADT/TripleTest.cpp
+++ b/unittests/ADT/TripleTest.cpp
@@ -665,3 +665,20 @@ TEST(TripleTest, getARMCPUForArch) {
   }
 }
 }
+
+TEST(TripleTest, NormalizeARM) {
+  EXPECT_EQ("armv6--netbsd-eabi", Triple::normalize("armv6-netbsd-eabi"));
+  EXPECT_EQ("armv7--netbsd-eabi", Triple::normalize("armv7-netbsd-eabi"));
+  EXPECT_EQ("armv6eb--netbsd-eabi", Triple::normalize("armv6eb-netbsd-eabi"));
+  EXPECT_EQ("armv7eb--netbsd-eabi", Triple::normalize("armv7eb-netbsd-eabi"));
+  EXPECT_EQ("armv6--netbsd-eabihf", Triple::normalize("armv6-netbsd-eabihf"));
+  EXPECT_EQ("armv7--netbsd-eabihf", Triple::normalize("armv7-netbsd-eabihf"));
+  EXPECT_EQ("armv6eb--netbsd-eabihf", Triple::normalize("armv6eb-netbsd-eabihf"));
+  EXPECT_EQ("armv7eb--netbsd-eabihf", Triple::normalize("armv7eb-netbsd-eabihf"));
+
+  Triple T;
+  T = Triple("armv6--netbsd-eabi");
+  EXPECT_EQ(Triple::arm, T.getArch());
+  T = Triple("armv6eb--netbsd-eabi");
+  EXPECT_EQ(Triple::armeb, T.getArch());
+}
diff --git a/unittests/Analysis/CFGTest.cpp b/unittests/Analysis/CFGTest.cpp
index dba9d49..4b4ebb6 100644
--- a/unittests/Analysis/CFGTest.cpp
+++ b/unittests/Analysis/CFGTest.cpp
@@ -16,7 +16,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
@@ -72,7 +72,7 @@ protected:
         PassInfo *PI = new PassInfo("isPotentiallyReachable testing pass",
                                     "", &ID, nullptr, true, true);
         PassRegistry::getPassRegistry()->registerPass(*PI, false);
-        initializeLoopInfoPass(*PassRegistry::getPassRegistry());
+        initializeLoopInfoWrapperPassPass(*PassRegistry::getPassRegistry());
         initializeDominatorTreeWrapperPassPass(
             *PassRegistry::getPassRegistry());
         return 0;
@@ -80,7 +80,7 @@ protected:
 
       void getAnalysisUsage(AnalysisUsage &AU) const {
         AU.setPreservesAll();
-        AU.addRequired<LoopInfo>();
+        AU.addRequired<LoopInfoWrapperPass>();
         AU.addRequired<DominatorTreeWrapperPass>();
       }
 
@@ -88,7 +88,7 @@ protected:
         if (!F.hasName() || F.getName() != "test")
           return false;
 
-        LoopInfo *LI = &getAnalysis<LoopInfo>();
+        LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
         DominatorTree *DT =
             &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
         EXPECT_EQ(isPotentiallyReachable(A, B, nullptr, nullptr),
@@ -107,7 +107,7 @@ protected:
 
     IsPotentiallyReachableTestPass *P =
         new IsPotentiallyReachableTestPass(ExpectedResult, A, B);
-    PassManager PM;
+    legacy::PassManager PM;
     PM.add(P);
     PM.run(*M);
   }
diff --git a/unittests/Analysis/MixedTBAATest.cpp b/unittests/Analysis/MixedTBAATest.cpp
index d7935e3..7b8a25c 100644
--- a/unittests/Analysis/MixedTBAATest.cpp
+++ b/unittests/Analysis/MixedTBAATest.cpp
@@ -13,7 +13,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "gtest/gtest.h"
 
@@ -27,7 +27,7 @@ protected:
   LLVMContext C;
   Module M;
   MDBuilder MD;
-  PassManager PM;
+  legacy::PassManager PM;
 };
 
 TEST_F(MixedTBAATest, MixedTBAA) {
diff --git a/unittests/Analysis/ScalarEvolutionTest.cpp b/unittests/Analysis/ScalarEvolutionTest.cpp
index 90f6997..876a264 100644
--- a/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -14,7 +14,7 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "gtest/gtest.h"
 
 namespace llvm {
@@ -32,7 +32,7 @@ protected:
   }
   LLVMContext Context;
   Module M;
-  PassManager PM;
+  legacy::PassManager PM;
   ScalarEvolution &SE;
 };
 
diff --git a/unittests/Bitcode/BitReaderTest.cpp b/unittests/Bitcode/BitReaderTest.cpp
index 6eb40d6..1f28ec6 100644
--- a/unittests/Bitcode/BitReaderTest.cpp
+++ b/unittests/Bitcode/BitReaderTest.cpp
@@ -8,9 +8,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/SmallString.h"
+#include "llvm/AsmParser/Parser.h"
 #include "llvm/Bitcode/BitstreamWriter.h"
 #include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index 65930b5..a1272ed 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -4,7 +4,7 @@ set_target_properties(UnitTests PROPERTIES FOLDER "Tests")
 if (APPLE)
   set(CMAKE_INSTALL_RPATH "@executable_path/../../lib")
 else(UNIX)
-  set(CMAKE_INSTALL_RPATH "\$ORIGIN/../../lib")
+  set(CMAKE_INSTALL_RPATH "\$ORIGIN/../../lib${LLVM_LIBDIR_SUFFIX}")
 endif()
 
 function(add_llvm_unittest test_dirname)
@@ -22,5 +22,6 @@ add_subdirectory(LineEditor)
 add_subdirectory(Linker)
 add_subdirectory(MC)
 add_subdirectory(Option)
+add_subdirectory(ProfileData)
 add_subdirectory(Support)
 add_subdirectory(Transforms)
diff --git a/unittests/CodeGen/DIEHashTest.cpp b/unittests/CodeGen/DIEHashTest.cpp
index 04c5a8a..efb0920 100644
--- a/unittests/CodeGen/DIEHashTest.cpp
+++ b/unittests/CodeGen/DIEHashTest.cpp
@@ -7,12 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "../lib/CodeGen/AsmPrinter/DIE.h"
+#include "llvm/CodeGen/DIE.h"
 #include "../lib/CodeGen/AsmPrinter/DIEHash.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Format.h"
-#include "llvm/ADT/STLExtras.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
diff --git a/unittests/DebugInfo/CMakeLists.txt b/unittests/DebugInfo/CMakeLists.txt
index e844e95..dae472b 100644
--- a/unittests/DebugInfo/CMakeLists.txt
+++ b/unittests/DebugInfo/CMakeLists.txt
@@ -1,11 +1,3 @@
-set(LLVM_LINK_COMPONENTS
-  DebugInfo
-  )
 
-set(DebugInfoSources
-  DWARFFormValueTest.cpp
-  )
-
-add_llvm_unittest(DebugInfoTests
-  ${DebugInfoSources}
-  )
+add_subdirectory(DWARF)
+add_subdirectory(PDB)
diff --git a/unittests/DebugInfo/DWARF/CMakeLists.txt b/unittests/DebugInfo/DWARF/CMakeLists.txt
new file mode 100644
index 0000000..4bec17c
--- /dev/null
+++ b/unittests/DebugInfo/DWARF/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(LLVM_LINK_COMPONENTS
+  DebugInfoDWARF
+  )
+
+set(DebugInfoSources
+  DWARFFormValueTest.cpp
+  )
+
+add_llvm_unittest(DebugInfoDWARFTests
+  ${DebugInfoSources}
+  )
diff --git a/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp b/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp
new file mode 100644
index 0000000..4521132
--- /dev/null
+++ b/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp
@@ -0,0 +1,49 @@
+//===- llvm/unittest/DebugInfo/DWARFFormValueTest.cpp ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/Support/Dwarf.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+using namespace dwarf;
+
+namespace {
+
+TEST(DWARFFormValue, FixedFormSizes) {
+  // Size of DW_FORM_addr and DW_FORM_ref_addr are equal in DWARF2,
+  // DW_FORM_ref_addr is always 4 bytes in DWARF32 starting from DWARF3.
+  ArrayRef<uint8_t> sizes = DWARFFormValue::getFixedFormSizes(4, 2);
+  EXPECT_EQ(sizes[DW_FORM_addr], sizes[DW_FORM_ref_addr]);
+  sizes = DWARFFormValue::getFixedFormSizes(8, 2);
+  EXPECT_EQ(sizes[DW_FORM_addr], sizes[DW_FORM_ref_addr]);
+  sizes = DWARFFormValue::getFixedFormSizes(8, 3);
+  EXPECT_EQ(4, sizes[DW_FORM_ref_addr]);
+  // Check that we don't have fixed form sizes for weird address sizes.
+  EXPECT_EQ(0U, DWARFFormValue::getFixedFormSizes(16, 2).size());
+}
+
+bool isFormClass(uint16_t Form, DWARFFormValue::FormClass FC) {
+  return DWARFFormValue(Form).isFormClass(FC);
+}
+
+TEST(DWARFFormValue, FormClass) {
+  EXPECT_TRUE(isFormClass(DW_FORM_addr, DWARFFormValue::FC_Address));
+  EXPECT_FALSE(isFormClass(DW_FORM_data8, DWARFFormValue::FC_Address));
+  EXPECT_TRUE(isFormClass(DW_FORM_data8, DWARFFormValue::FC_Constant));
+  EXPECT_TRUE(isFormClass(DW_FORM_data8, DWARFFormValue::FC_SectionOffset));
+  EXPECT_TRUE(
+      isFormClass(DW_FORM_sec_offset, DWARFFormValue::FC_SectionOffset));
+  EXPECT_TRUE(isFormClass(DW_FORM_GNU_str_index, DWARFFormValue::FC_String));
+  EXPECT_TRUE(isFormClass(DW_FORM_GNU_addr_index, DWARFFormValue::FC_Address));
+  EXPECT_FALSE(isFormClass(DW_FORM_ref_addr, DWARFFormValue::FC_Address));
+  EXPECT_TRUE(isFormClass(DW_FORM_ref_addr, DWARFFormValue::FC_Reference));
+  EXPECT_TRUE(isFormClass(DW_FORM_ref_sig8, DWARFFormValue::FC_Reference));
+}
+
+} // end anonymous namespace
diff --git a/unittests/DebugInfo/DWARF/Makefile b/unittests/DebugInfo/DWARF/Makefile
new file mode 100644
index 0000000..b0f40e1
--- /dev/null
+++ b/unittests/DebugInfo/DWARF/Makefile
@@ -0,0 +1,16 @@
+##===- unittests/DebugInfo/Makefile ------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+TESTNAME = DebugInfoDWARF
+LINK_COMPONENTS := DebugInfoDWARF object support
+
+include $(LEVEL)/Makefile.config
+
+include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/DebugInfo/DWARFFormValueTest.cpp b/unittests/DebugInfo/DWARFFormValueTest.cpp
deleted file mode 100644
index 38b932e..0000000
--- a/unittests/DebugInfo/DWARFFormValueTest.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//===- llvm/unittest/DebugInfo/DWARFFormValueTest.cpp ---------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/DWARFFormValue.h"
-#include "llvm/Support/Dwarf.h"
-#include "gtest/gtest.h"
-using namespace llvm;
-using namespace dwarf;
-
-namespace {
-
-TEST(DWARFFormValue, FixedFormSizes) {
-  // Size of DW_FORM_addr and DW_FORM_ref_addr are equal in DWARF2,
-  // DW_FORM_ref_addr is always 4 bytes in DWARF32 starting from DWARF3.
-  ArrayRef<uint8_t> sizes = DWARFFormValue::getFixedFormSizes(4, 2);
-  EXPECT_EQ(sizes[DW_FORM_addr], sizes[DW_FORM_ref_addr]);
-  sizes = DWARFFormValue::getFixedFormSizes(8, 2);
-  EXPECT_EQ(sizes[DW_FORM_addr], sizes[DW_FORM_ref_addr]);
-  sizes = DWARFFormValue::getFixedFormSizes(8, 3);
-  EXPECT_EQ(4, sizes[DW_FORM_ref_addr]);
-  // Check that we don't have fixed form sizes for weird address sizes.
-  EXPECT_EQ(0U, DWARFFormValue::getFixedFormSizes(16, 2).size());
-}
-
-bool isFormClass(uint16_t Form, DWARFFormValue::FormClass FC) {
-  return DWARFFormValue(Form).isFormClass(FC);
-}
-
-TEST(DWARFFormValue, FormClass) {
-  EXPECT_TRUE(isFormClass(DW_FORM_addr, DWARFFormValue::FC_Address));
-  EXPECT_FALSE(isFormClass(DW_FORM_data8, DWARFFormValue::FC_Address));
-  EXPECT_TRUE(isFormClass(DW_FORM_data8, DWARFFormValue::FC_Constant));
-  EXPECT_TRUE(isFormClass(DW_FORM_data8, DWARFFormValue::FC_SectionOffset));
-  EXPECT_TRUE(
-      isFormClass(DW_FORM_sec_offset, DWARFFormValue::FC_SectionOffset));
-  EXPECT_TRUE(isFormClass(DW_FORM_GNU_str_index, DWARFFormValue::FC_String));
-  EXPECT_TRUE(isFormClass(DW_FORM_GNU_addr_index, DWARFFormValue::FC_Address));
-  EXPECT_FALSE(isFormClass(DW_FORM_ref_addr, DWARFFormValue::FC_Address));
-  EXPECT_TRUE(isFormClass(DW_FORM_ref_addr, DWARFFormValue::FC_Reference));
-  EXPECT_TRUE(isFormClass(DW_FORM_ref_sig8, DWARFFormValue::FC_Reference));
-}
-
-} // end anonymous namespace
diff --git a/unittests/DebugInfo/Makefile b/unittests/DebugInfo/Makefile
index 999ded9..1889ad2 100644
--- a/unittests/DebugInfo/Makefile
+++ b/unittests/DebugInfo/Makefile
@@ -6,11 +6,10 @@
 # License. See LICENSE.TXT for details.
 #
 ##===----------------------------------------------------------------------===##
-
 LEVEL = ../..
-TESTNAME = DebugInfo
-LINK_COMPONENTS := debuginfo object support
 
 include $(LEVEL)/Makefile.config
 
+PARALLEL_DIRS := DWARF PDB
+
 include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/DebugInfo/PDB/CMakeLists.txt b/unittests/DebugInfo/PDB/CMakeLists.txt
new file mode 100644
index 0000000..91924a5
--- /dev/null
+++ b/unittests/DebugInfo/PDB/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(LLVM_LINK_COMPONENTS
+  DebugInfoPDB
+  )
+
+set(DebugInfoPDBSources
+  PDBApiTest.cpp
+  )
+
+add_llvm_unittest(DebugInfoPDBTests
+  ${DebugInfoPDBSources}
+  )
diff --git a/unittests/DebugInfo/PDB/Makefile b/unittests/DebugInfo/PDB/Makefile
new file mode 100644
index 0000000..eb118a3
--- /dev/null
+++ b/unittests/DebugInfo/PDB/Makefile
@@ -0,0 +1,16 @@
+##===- unittests/DebugInfo/PDB/Makefile -------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+TESTNAME = DebugInfoPDB
+LINK_COMPONENTS := DebugInfoPDB support
+
+include $(LEVEL)/Makefile.config
+
+include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/DebugInfo/PDB/PDBApiTest.cpp b/unittests/DebugInfo/PDB/PDBApiTest.cpp
new file mode 100644
index 0000000..629c5f8
--- /dev/null
+++ b/unittests/DebugInfo/PDB/PDBApiTest.cpp
@@ -0,0 +1,387 @@
+//===- llvm/unittest/DebugInfo/PDB/PDBApiTest.cpp -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include <unordered_map>
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolAnnotation.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolBlock.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCustom.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolLabel.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolThunk.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+
+namespace {
+
+#define MOCK_SYMBOL_ACCESSOR(Func)                                             \
+  decltype(std::declval<IPDBRawSymbol>().Func()) Func() const override {       \
+    typedef decltype(IPDBRawSymbol::Func()) ReturnType;                        \
+    return ReturnType();                                                       \
+  }
+
+class MockSession : public IPDBSession {
+  uint64_t getLoadAddress() const override { return 0; }
+  void setLoadAddress(uint64_t Address) override {}
+  std::unique_ptr<PDBSymbolExe> getGlobalScope() const override {
+    return nullptr;
+  }
+  std::unique_ptr<PDBSymbol> getSymbolById(uint32_t SymbolId) const override {
+    return nullptr;
+  }
+  std::unique_ptr<IPDBSourceFile>
+  getSourceFileById(uint32_t SymbolId) const override {
+    return nullptr;
+  }
+  std::unique_ptr<IPDBEnumSourceFiles> getAllSourceFiles() const override {
+    return nullptr;
+  }
+  std::unique_ptr<IPDBEnumSourceFiles> getSourceFilesForCompiland(
+      const PDBSymbolCompiland &Compiland) const override {
+    return nullptr;
+  }
+
+  std::unique_ptr<IPDBEnumDataStreams> getDebugStreams() const override {
+    return nullptr;
+  }
+};
+
+class MockRawSymbol : public IPDBRawSymbol {
+public:
+  MockRawSymbol(PDB_SymType SymType)
+      : Type(SymType) {}
+
+  void dump(raw_ostream &OS, int Indent) const override {}
+
+  std::unique_ptr<IPDBEnumSymbols>
+  findChildren(PDB_SymType Type) const override {
+    return nullptr;
+  }
+  std::unique_ptr<IPDBEnumSymbols>
+  findChildren(PDB_SymType Type, StringRef Name,
+               PDB_NameSearchFlags Flags) const override {
+    return nullptr;
+  }
+  std::unique_ptr<IPDBEnumSymbols>
+  findChildrenByRVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags,
+                    uint32_t RVA) const override {
+    return nullptr;
+  }
+  std::unique_ptr<IPDBEnumSymbols>
+  findInlineFramesByRVA(uint32_t RVA) const override {
+    return nullptr;
+  }
+
+  void getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) const override {}
+  void getFrontEndVersion(VersionInfo &Version) const override {}
+  void getBackEndVersion(VersionInfo &Version) const override {}
+
+  PDB_SymType getSymTag() const override { return Type; }
+
+  MOCK_SYMBOL_ACCESSOR(getAccess)
+  MOCK_SYMBOL_ACCESSOR(getAddressOffset)
+  MOCK_SYMBOL_ACCESSOR(getAddressSection)
+  MOCK_SYMBOL_ACCESSOR(getAge)
+  MOCK_SYMBOL_ACCESSOR(getArrayIndexTypeId)
+  MOCK_SYMBOL_ACCESSOR(getBaseDataOffset)
+  MOCK_SYMBOL_ACCESSOR(getBaseDataSlot)
+  MOCK_SYMBOL_ACCESSOR(getBaseSymbolId)
+  MOCK_SYMBOL_ACCESSOR(getBuiltinType)
+  MOCK_SYMBOL_ACCESSOR(getBitPosition)
+  MOCK_SYMBOL_ACCESSOR(getCallingConvention)
+  MOCK_SYMBOL_ACCESSOR(getClassParentId)
+  MOCK_SYMBOL_ACCESSOR(getCompilerName)
+  MOCK_SYMBOL_ACCESSOR(getCount)
+  MOCK_SYMBOL_ACCESSOR(getCountLiveRanges)
+  MOCK_SYMBOL_ACCESSOR(getLanguage)
+  MOCK_SYMBOL_ACCESSOR(getLexicalParentId)
+  MOCK_SYMBOL_ACCESSOR(getLibraryName)
+  MOCK_SYMBOL_ACCESSOR(getLiveRangeStartAddressOffset)
+  MOCK_SYMBOL_ACCESSOR(getLiveRangeStartAddressSection)
+  MOCK_SYMBOL_ACCESSOR(getLiveRangeStartRelativeVirtualAddress)
+  MOCK_SYMBOL_ACCESSOR(getLocalBasePointerRegisterId)
+  MOCK_SYMBOL_ACCESSOR(getLowerBoundId)
+  MOCK_SYMBOL_ACCESSOR(getMemorySpaceKind)
+  MOCK_SYMBOL_ACCESSOR(getName)
+  MOCK_SYMBOL_ACCESSOR(getNumberOfAcceleratorPointerTags)
+  MOCK_SYMBOL_ACCESSOR(getNumberOfColumns)
+  MOCK_SYMBOL_ACCESSOR(getNumberOfModifiers)
+  MOCK_SYMBOL_ACCESSOR(getNumberOfRegisterIndices)
+  MOCK_SYMBOL_ACCESSOR(getNumberOfRows)
+  MOCK_SYMBOL_ACCESSOR(getObjectFileName)
+  MOCK_SYMBOL_ACCESSOR(getOemId)
+  MOCK_SYMBOL_ACCESSOR(getOemSymbolId)
+  MOCK_SYMBOL_ACCESSOR(getOffsetInUdt)
+  MOCK_SYMBOL_ACCESSOR(getPlatform)
+  MOCK_SYMBOL_ACCESSOR(getRank)
+  MOCK_SYMBOL_ACCESSOR(getRegisterId)
+  MOCK_SYMBOL_ACCESSOR(getRegisterType)
+  MOCK_SYMBOL_ACCESSOR(getRelativeVirtualAddress)
+  MOCK_SYMBOL_ACCESSOR(getSamplerSlot)
+  MOCK_SYMBOL_ACCESSOR(getSignature)
+  MOCK_SYMBOL_ACCESSOR(getSizeInUdt)
+  MOCK_SYMBOL_ACCESSOR(getSlot)
+  MOCK_SYMBOL_ACCESSOR(getSourceFileName)
+  MOCK_SYMBOL_ACCESSOR(getStride)
+  MOCK_SYMBOL_ACCESSOR(getSubTypeId)
+  MOCK_SYMBOL_ACCESSOR(getSymbolsFileName)
+  MOCK_SYMBOL_ACCESSOR(getSymIndexId)
+  MOCK_SYMBOL_ACCESSOR(getTargetOffset)
+  MOCK_SYMBOL_ACCESSOR(getTargetRelativeVirtualAddress)
+  MOCK_SYMBOL_ACCESSOR(getTargetVirtualAddress)
+  MOCK_SYMBOL_ACCESSOR(getTargetSection)
+  MOCK_SYMBOL_ACCESSOR(getTextureSlot)
+  MOCK_SYMBOL_ACCESSOR(getTimeStamp)
+  MOCK_SYMBOL_ACCESSOR(getToken)
+  MOCK_SYMBOL_ACCESSOR(getTypeId)
+  MOCK_SYMBOL_ACCESSOR(getUavSlot)
+  MOCK_SYMBOL_ACCESSOR(getUndecoratedName)
+  MOCK_SYMBOL_ACCESSOR(getUnmodifiedTypeId)
+  MOCK_SYMBOL_ACCESSOR(getUpperBoundId)
+  MOCK_SYMBOL_ACCESSOR(getVirtualBaseDispIndex)
+  MOCK_SYMBOL_ACCESSOR(getVirtualBaseOffset)
+  MOCK_SYMBOL_ACCESSOR(getVirtualTableShapeId)
+  MOCK_SYMBOL_ACCESSOR(getDataKind)
+  MOCK_SYMBOL_ACCESSOR(getGuid)
+  MOCK_SYMBOL_ACCESSOR(getOffset)
+  MOCK_SYMBOL_ACCESSOR(getThisAdjust)
+  MOCK_SYMBOL_ACCESSOR(getVirtualBasePointerOffset)
+  MOCK_SYMBOL_ACCESSOR(getLocationType)
+  MOCK_SYMBOL_ACCESSOR(getMachineType)
+  MOCK_SYMBOL_ACCESSOR(getThunkOrdinal)
+  MOCK_SYMBOL_ACCESSOR(getLength)
+  MOCK_SYMBOL_ACCESSOR(getLiveRangeLength)
+  MOCK_SYMBOL_ACCESSOR(getVirtualAddress)
+  MOCK_SYMBOL_ACCESSOR(getUdtKind)
+  MOCK_SYMBOL_ACCESSOR(hasConstructor)
+  MOCK_SYMBOL_ACCESSOR(hasCustomCallingConvention)
+  MOCK_SYMBOL_ACCESSOR(hasFarReturn)
+  MOCK_SYMBOL_ACCESSOR(isCode)
+  MOCK_SYMBOL_ACCESSOR(isCompilerGenerated)
+  MOCK_SYMBOL_ACCESSOR(isConstType)
+  MOCK_SYMBOL_ACCESSOR(isEditAndContinueEnabled)
+  MOCK_SYMBOL_ACCESSOR(isFunction)
+  MOCK_SYMBOL_ACCESSOR(getAddressTaken)
+  MOCK_SYMBOL_ACCESSOR(getNoStackOrdering)
+  MOCK_SYMBOL_ACCESSOR(hasAlloca)
+  MOCK_SYMBOL_ACCESSOR(hasAssignmentOperator)
+  MOCK_SYMBOL_ACCESSOR(hasCTypes)
+  MOCK_SYMBOL_ACCESSOR(hasCastOperator)
+  MOCK_SYMBOL_ACCESSOR(hasDebugInfo)
+  MOCK_SYMBOL_ACCESSOR(hasEH)
+  MOCK_SYMBOL_ACCESSOR(hasEHa)
+  MOCK_SYMBOL_ACCESSOR(hasFramePointer)
+  MOCK_SYMBOL_ACCESSOR(hasInlAsm)
+  MOCK_SYMBOL_ACCESSOR(hasInlineAttribute)
+  MOCK_SYMBOL_ACCESSOR(hasInterruptReturn)
+  MOCK_SYMBOL_ACCESSOR(hasLongJump)
+  MOCK_SYMBOL_ACCESSOR(hasManagedCode)
+  MOCK_SYMBOL_ACCESSOR(hasNestedTypes)
+  MOCK_SYMBOL_ACCESSOR(hasNoInlineAttribute)
+  MOCK_SYMBOL_ACCESSOR(hasNoReturnAttribute)
+  MOCK_SYMBOL_ACCESSOR(hasOptimizedCodeDebugInfo)
+  MOCK_SYMBOL_ACCESSOR(hasOverloadedOperator)
+  MOCK_SYMBOL_ACCESSOR(hasSEH)
+  MOCK_SYMBOL_ACCESSOR(hasSecurityChecks)
+  MOCK_SYMBOL_ACCESSOR(hasSetJump)
+  MOCK_SYMBOL_ACCESSOR(hasStrictGSCheck)
+  MOCK_SYMBOL_ACCESSOR(isAcceleratorGroupSharedLocal)
+  MOCK_SYMBOL_ACCESSOR(isAcceleratorPointerTagLiveRange)
+  MOCK_SYMBOL_ACCESSOR(isAcceleratorStubFunction)
+  MOCK_SYMBOL_ACCESSOR(isAggregated)
+  MOCK_SYMBOL_ACCESSOR(isIntroVirtualFunction)
+  MOCK_SYMBOL_ACCESSOR(isCVTCIL)
+  MOCK_SYMBOL_ACCESSOR(isConstructorVirtualBase)
+  MOCK_SYMBOL_ACCESSOR(isCxxReturnUdt)
+  MOCK_SYMBOL_ACCESSOR(isDataAligned)
+  MOCK_SYMBOL_ACCESSOR(isHLSLData)
+  MOCK_SYMBOL_ACCESSOR(isHotpatchable)
+  MOCK_SYMBOL_ACCESSOR(isIndirectVirtualBaseClass)
+  MOCK_SYMBOL_ACCESSOR(isInterfaceUdt)
+  MOCK_SYMBOL_ACCESSOR(isIntrinsic)
+  MOCK_SYMBOL_ACCESSOR(isLTCG)
+  MOCK_SYMBOL_ACCESSOR(isLocationControlFlowDependent)
+  MOCK_SYMBOL_ACCESSOR(isMSILNetmodule)
+  MOCK_SYMBOL_ACCESSOR(isMatrixRowMajor)
+  MOCK_SYMBOL_ACCESSOR(isManagedCode)
+  MOCK_SYMBOL_ACCESSOR(isMSILCode)
+  MOCK_SYMBOL_ACCESSOR(isMultipleInheritance)
+  MOCK_SYMBOL_ACCESSOR(isNaked)
+  MOCK_SYMBOL_ACCESSOR(isNested)
+  MOCK_SYMBOL_ACCESSOR(isOptimizedAway)
+  MOCK_SYMBOL_ACCESSOR(isPacked)
+  MOCK_SYMBOL_ACCESSOR(isPointerBasedOnSymbolValue)
+  MOCK_SYMBOL_ACCESSOR(isPointerToDataMember)
+  MOCK_SYMBOL_ACCESSOR(isPointerToMemberFunction)
+  MOCK_SYMBOL_ACCESSOR(isPureVirtual)
+  MOCK_SYMBOL_ACCESSOR(isRValueReference)
+  MOCK_SYMBOL_ACCESSOR(isRefUdt)
+  MOCK_SYMBOL_ACCESSOR(isReference)
+  MOCK_SYMBOL_ACCESSOR(isRestrictedType)
+  MOCK_SYMBOL_ACCESSOR(isReturnValue)
+  MOCK_SYMBOL_ACCESSOR(isSafeBuffers)
+  MOCK_SYMBOL_ACCESSOR(isScoped)
+  MOCK_SYMBOL_ACCESSOR(isSdl)
+  MOCK_SYMBOL_ACCESSOR(isSingleInheritance)
+  MOCK_SYMBOL_ACCESSOR(isSplitted)
+  MOCK_SYMBOL_ACCESSOR(isStatic)
+  MOCK_SYMBOL_ACCESSOR(hasPrivateSymbols)
+  MOCK_SYMBOL_ACCESSOR(isUnalignedType)
+  MOCK_SYMBOL_ACCESSOR(isUnreached)
+  MOCK_SYMBOL_ACCESSOR(isValueUdt)
+  MOCK_SYMBOL_ACCESSOR(isVirtual)
+  MOCK_SYMBOL_ACCESSOR(isVirtualBaseClass)
+  MOCK_SYMBOL_ACCESSOR(isVirtualInheritance)
+  MOCK_SYMBOL_ACCESSOR(isVolatileType)
+  MOCK_SYMBOL_ACCESSOR(getValue)
+  MOCK_SYMBOL_ACCESSOR(wasInlined)
+  MOCK_SYMBOL_ACCESSOR(getUnused)
+
+private:
+  PDB_SymType Type;
+};
+
+class PDBApiTest : public testing::Test {
+public:
+  std::unordered_map<PDB_SymType, std::unique_ptr<PDBSymbol>> SymbolMap;
+
+  void SetUp() override {
+    Session.reset(new MockSession());
+
+    InsertItemWithTag(PDB_SymType::None);
+    InsertItemWithTag(PDB_SymType::Exe);
+    InsertItemWithTag(PDB_SymType::Compiland);
+    InsertItemWithTag(PDB_SymType::CompilandDetails);
+    InsertItemWithTag(PDB_SymType::CompilandEnv);
+    InsertItemWithTag(PDB_SymType::Function);
+    InsertItemWithTag(PDB_SymType::Block);
+    InsertItemWithTag(PDB_SymType::Data);
+    InsertItemWithTag(PDB_SymType::Annotation);
+    InsertItemWithTag(PDB_SymType::Label);
+    InsertItemWithTag(PDB_SymType::PublicSymbol);
+    InsertItemWithTag(PDB_SymType::UDT);
+    InsertItemWithTag(PDB_SymType::Enum);
+    InsertItemWithTag(PDB_SymType::FunctionSig);
+    InsertItemWithTag(PDB_SymType::PointerType);
+    InsertItemWithTag(PDB_SymType::ArrayType);
+    InsertItemWithTag(PDB_SymType::BuiltinType);
+    InsertItemWithTag(PDB_SymType::Typedef);
+    InsertItemWithTag(PDB_SymType::BaseClass);
+    InsertItemWithTag(PDB_SymType::Friend);
+    InsertItemWithTag(PDB_SymType::FunctionArg);
+    InsertItemWithTag(PDB_SymType::FuncDebugStart);
+    InsertItemWithTag(PDB_SymType::FuncDebugEnd);
+    InsertItemWithTag(PDB_SymType::UsingNamespace);
+    InsertItemWithTag(PDB_SymType::VTableShape);
+    InsertItemWithTag(PDB_SymType::VTable);
+    InsertItemWithTag(PDB_SymType::Custom);
+    InsertItemWithTag(PDB_SymType::Thunk);
+    InsertItemWithTag(PDB_SymType::CustomType);
+    InsertItemWithTag(PDB_SymType::ManagedType);
+    InsertItemWithTag(PDB_SymType::Dimension);
+    InsertItemWithTag(PDB_SymType::Max);
+  }
+
+  template <class ExpectedType> void VerifyDyncast(PDB_SymType Tag) {
+    for (auto item = SymbolMap.begin(); item != SymbolMap.end(); ++item) {
+      EXPECT_EQ(item->first == Tag, llvm::isa<ExpectedType>(*item->second));
+    }
+  }
+
+  void VerifyUnknownDyncasts() {
+    for (auto item = SymbolMap.begin(); item != SymbolMap.end(); ++item) {
+      bool should_match = false;
+      if (item->first == PDB_SymType::None || item->first >= PDB_SymType::Max)
+        should_match = true;
+
+      EXPECT_EQ(should_match, llvm::isa<PDBSymbolUnknown>(*item->second));
+    }
+  }
+
+private:
+  std::unique_ptr<IPDBSession> Session;
+
+  void InsertItemWithTag(PDB_SymType Tag) {
+    auto RawSymbol = llvm::make_unique<MockRawSymbol>(Tag);
+    auto Symbol = PDBSymbol::create(*Session, std::move(RawSymbol));
+    SymbolMap.insert(std::make_pair(Tag, std::move(Symbol)));
+  }
+};
+
+TEST_F(PDBApiTest, Dyncast) {
+
+  // Most of the types have a one-to-one mapping between Tag and concrete type.
+  VerifyDyncast<PDBSymbolExe>(PDB_SymType::Exe);
+  VerifyDyncast<PDBSymbolCompiland>(PDB_SymType::Compiland);
+  VerifyDyncast<PDBSymbolCompilandDetails>(PDB_SymType::CompilandDetails);
+  VerifyDyncast<PDBSymbolCompilandEnv>(PDB_SymType::CompilandEnv);
+  VerifyDyncast<PDBSymbolFunc>(PDB_SymType::Function);
+  VerifyDyncast<PDBSymbolBlock>(PDB_SymType::Block);
+  VerifyDyncast<PDBSymbolData>(PDB_SymType::Data);
+  VerifyDyncast<PDBSymbolAnnotation>(PDB_SymType::Annotation);
+  VerifyDyncast<PDBSymbolLabel>(PDB_SymType::Label);
+  VerifyDyncast<PDBSymbolPublicSymbol>(PDB_SymType::PublicSymbol);
+  VerifyDyncast<PDBSymbolTypeUDT>(PDB_SymType::UDT);
+  VerifyDyncast<PDBSymbolTypeEnum>(PDB_SymType::Enum);
+  VerifyDyncast<PDBSymbolTypeFunctionSig>(PDB_SymType::FunctionSig);
+  VerifyDyncast<PDBSymbolTypePointer>(PDB_SymType::PointerType);
+  VerifyDyncast<PDBSymbolTypeArray>(PDB_SymType::ArrayType);
+  VerifyDyncast<PDBSymbolTypeBuiltin>(PDB_SymType::BuiltinType);
+  VerifyDyncast<PDBSymbolTypeTypedef>(PDB_SymType::Typedef);
+  VerifyDyncast<PDBSymbolTypeBaseClass>(PDB_SymType::BaseClass);
+  VerifyDyncast<PDBSymbolTypeFriend>(PDB_SymType::Friend);
+  VerifyDyncast<PDBSymbolTypeFunctionArg>(PDB_SymType::FunctionArg);
+  VerifyDyncast<PDBSymbolFuncDebugStart>(PDB_SymType::FuncDebugStart);
+  VerifyDyncast<PDBSymbolFuncDebugEnd>(PDB_SymType::FuncDebugEnd);
+  VerifyDyncast<PDBSymbolUsingNamespace>(PDB_SymType::UsingNamespace);
+  VerifyDyncast<PDBSymbolTypeVTableShape>(PDB_SymType::VTableShape);
+  VerifyDyncast<PDBSymbolTypeVTable>(PDB_SymType::VTable);
+  VerifyDyncast<PDBSymbolCustom>(PDB_SymType::Custom);
+  VerifyDyncast<PDBSymbolThunk>(PDB_SymType::Thunk);
+  VerifyDyncast<PDBSymbolTypeCustom>(PDB_SymType::CustomType);
+  VerifyDyncast<PDBSymbolTypeManaged>(PDB_SymType::ManagedType);
+  VerifyDyncast<PDBSymbolTypeDimension>(PDB_SymType::Dimension);
+
+  VerifyUnknownDyncasts();
+}
+
+} // end anonymous namespace
diff --git a/unittests/ExecutionEngine/CMakeLists.txt b/unittests/ExecutionEngine/CMakeLists.txt
index 783c9b5..302de99 100644
--- a/unittests/ExecutionEngine/CMakeLists.txt
+++ b/unittests/ExecutionEngine/CMakeLists.txt
@@ -3,6 +3,8 @@ set(LLVM_LINK_COMPONENTS
   ExecutionEngine
   Interpreter
   MC
+  OrcJIT
+  RuntimeDyld
   Support
   )
 
@@ -10,6 +12,8 @@ add_llvm_unittest(ExecutionEngineTests
   ExecutionEngineTest.cpp
   )
 
+add_subdirectory(Orc)
+
 # Include MCJIT tests only if native arch is a built JIT target.
 list(FIND LLVM_TARGETS_TO_BUILD "${LLVM_NATIVE_ARCH}" build_idx)
 list(FIND LLVM_TARGETS_WITH_JIT "${LLVM_NATIVE_ARCH}" jit_idx)
diff --git a/unittests/ExecutionEngine/MCJIT/CMakeLists.txt b/unittests/ExecutionEngine/MCJIT/CMakeLists.txt
index b10cbb4..e29787f 100644
--- a/unittests/ExecutionEngine/MCJIT/CMakeLists.txt
+++ b/unittests/ExecutionEngine/MCJIT/CMakeLists.txt
@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
   IPO
   MC
   MCJIT
+  RuntimeDyld
   ScalarOpts
   Support
   Target
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
index c80b88b..f2a3000 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
@@ -347,6 +347,38 @@ TEST_F(MCJITCAPITest, simple_function) {
   EXPECT_EQ(42, functionPointer.usable());
 }
 
+TEST_F(MCJITCAPITest, gva) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  Module = LLVMModuleCreateWithName("simple_module");
+  LLVMSetTarget(Module, HostTriple.c_str());
+  LLVMValueRef GlobalVar = LLVMAddGlobal(Module, LLVMInt32Type(), "simple_value");
+  LLVMSetInitializer(GlobalVar, LLVMConstInt(LLVMInt32Type(), 42, 0));
+
+  buildMCJITOptions();
+  buildMCJITEngine();
+  buildAndRunPasses();
+
+  uint64_t raw = LLVMGetGlobalValueAddress(Engine, "simple_value");
+  int32_t *usable  = (int32_t *) raw;
+
+  EXPECT_EQ(42, *usable);
+}
+
+TEST_F(MCJITCAPITest, gfa) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  buildSimpleFunction();
+  buildMCJITOptions();
+  buildMCJITEngine();
+  buildAndRunPasses();
+
+  uint64_t raw = LLVMGetFunctionAddress(Engine, "simple_function");
+  int (*usable)() = (int (*)()) raw;
+
+  EXPECT_EQ(42, usable());
+}
+
 TEST_F(MCJITCAPITest, custom_memory_manager) {
   SKIP_UNSUPPORTED_PLATFORM;
   
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITMultipleModuleTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITMultipleModuleTest.cpp
index b0d1bb3..da6e25a 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITMultipleModuleTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITMultipleModuleTest.cpp
@@ -392,4 +392,23 @@ TEST_F(MCJITMultipleModuleTest, cross_module_dependency_case3) {
   ptr = TheJIT->getFunctionAddress(FB2->getName().str());
   checkAccumulate(ptr);
 }
+
+// Test that FindFunctionNamed finds the definition of
+// a function in the correct module. We check two functions
+// in two different modules, to make sure that for at least
+// one of them MCJIT had to ignore the extern declaration.
+TEST_F(MCJITMultipleModuleTest, FindFunctionNamed_test) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  std::unique_ptr<Module> A, B;
+  Function *FA, *FB1, *FB2;
+  createCrossModuleRecursiveCase(A, FA, B, FB1, FB2);
+
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
+
+  EXPECT_EQ(FA, TheJIT->FindFunctionNamed(FA->getName().data()));
+  EXPECT_EQ(FB1, TheJIT->FindFunctionNamed(FB1->getName().data()));
+}
+
 }
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp
index 2736383..2e38dd8 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp
@@ -163,7 +163,7 @@ TEST_F(MCJITObjectCacheTest, VerifyLoadFromCache) {
   TheJIT.reset();
 
   // Create a new memory manager.
-  MM = new SectionMemoryManager;
+  MM.reset(new SectionMemoryManager());
 
   // Create a new module and save it. Use a different return code so we can
   // tell if MCJIT compiled this module or used the cache.
@@ -197,7 +197,7 @@ TEST_F(MCJITObjectCacheTest, VerifyNonLoadFromCache) {
   TheJIT.reset();
 
   // Create a new memory manager.
-  MM = new SectionMemoryManager;
+  MM.reset(new SectionMemoryManager());
 
   // Create a new module and save it. Use a different return code so we can
   // tell if MCJIT compiled this module or used the cache. Note that we use
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h b/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h
index eea88bb..35af417 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h
+++ b/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h
@@ -325,7 +325,7 @@ protected:
     EngineBuilder EB(std::move(M));
     std::string Error;
     TheJIT.reset(EB.setEngineKind(EngineKind::JIT)
-                 .setMCJITMemoryManager(MM)
+                 .setMCJITMemoryManager(std::move(MM))
                  .setErrorStr(&Error)
                  .setOptLevel(CodeGenOpt::None)
                  .setCodeModel(CodeModel::JITDefault)
@@ -344,7 +344,7 @@ protected:
   StringRef MArch;
   SmallVector<std::string, 1> MAttrs;
   std::unique_ptr<ExecutionEngine> TheJIT;
-  RTDyldMemoryManager *MM;
+  std::unique_ptr<RTDyldMemoryManager> MM;
 
   std::unique_ptr<Module> M;
 };
diff --git a/unittests/ExecutionEngine/Makefile b/unittests/ExecutionEngine/Makefile
index 8ecb883..c19f8d6 100644
--- a/unittests/ExecutionEngine/Makefile
+++ b/unittests/ExecutionEngine/Makefile
@@ -13,8 +13,10 @@ LINK_COMPONENTS :=interpreter
 
 include $(LEVEL)/Makefile.config
 
+PARALLEL_DIRS = Orc
+
 ifeq ($(TARGET_HAS_JIT),1)
-  PARALLEL_DIRS = MCJIT
+  PARALLEL_DIRS += MCJIT
 endif
 
 include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/ExecutionEngine/Orc/CMakeLists.txt b/unittests/ExecutionEngine/Orc/CMakeLists.txt
new file mode 100644
index 0000000..7bc13a1
--- /dev/null
+++ b/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  Support
+  )
+
+add_llvm_unittest(OrcJITTests
+  LazyEmittingLayerTest.cpp
+  )
diff --git a/unittests/ExecutionEngine/Orc/LazyEmittingLayerTest.cpp b/unittests/ExecutionEngine/Orc/LazyEmittingLayerTest.cpp
new file mode 100644
index 0000000..b0ff127
--- /dev/null
+++ b/unittests/ExecutionEngine/Orc/LazyEmittingLayerTest.cpp
@@ -0,0 +1,30 @@
+//===- LazyEmittingLayerTest.cpp - Unit tests for the lazy emitting layer -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+struct MockBaseLayer {
+  typedef int ModuleSetHandleT;
+  ModuleSetHandleT addModuleSet(std::list<std::unique_ptr<llvm::Module>>,
+                                std::unique_ptr<llvm::RTDyldMemoryManager> x) {
+    EXPECT_FALSE(x);
+    return 42;
+  }
+};
+
+TEST(LazyEmittingLayerTest, Empty) {
+  MockBaseLayer M;
+  llvm::orc::LazyEmittingLayer<MockBaseLayer> L(M);
+  L.addModuleSet(std::list<std::unique_ptr<llvm::Module>>(), nullptr);
+}
+
+}
diff --git a/unittests/ExecutionEngine/Orc/Makefile b/unittests/ExecutionEngine/Orc/Makefile
new file mode 100644
index 0000000..c899728
--- /dev/null
+++ b/unittests/ExecutionEngine/Orc/Makefile
@@ -0,0 +1,16 @@
+##===- unittests/ExecutionEngine/Orc/Makefile --------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+TESTNAME = OrcJIT
+LINK_COMPONENTS := core ipo mcjit orcjit native support
+
+include $(LEVEL)/Makefile.config
+include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
+
diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt
index a046209..0c29796 100644
--- a/unittests/IR/CMakeLists.txt
+++ b/unittests/IR/CMakeLists.txt
@@ -14,7 +14,6 @@ set(IRSources
   DominatorTreeTest.cpp
   IRBuilderTest.cpp
   InstructionsTest.cpp
-  LeakDetectorTest.cpp
   LegacyPassManagerTest.cpp
   MDBuilderTest.cpp
   MetadataTest.cpp
diff --git a/unittests/IR/ConstantsTest.cpp b/unittests/IR/ConstantsTest.cpp
index 5414b25..5d271e2 100644
--- a/unittests/IR/ConstantsTest.cpp
+++ b/unittests/IR/ConstantsTest.cpp
@@ -7,12 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 
 namespace llvm {
@@ -347,5 +349,19 @@ TEST(ConstantsTest, GEPReplaceWithConstant) {
   ASSERT_EQ(GEP->getOperand(0), Alias);
 }
 
+TEST(ConstantsTest, AliasCAPI) {
+  LLVMContext Context;
+  SMDiagnostic Error;
+  std::unique_ptr<Module> M =
+      parseAssemblyString("@g = global i32 42", Error, Context);
+  GlobalVariable *G = M->getGlobalVariable("g");
+  Type *I16Ty = Type::getInt16Ty(Context);
+  Type *I16PTy = PointerType::get(I16Ty, 0);
+  Constant *Aliasee = ConstantExpr::getBitCast(G, I16PTy);
+  LLVMValueRef AliasRef =
+      LLVMAddAlias(wrap(M.get()), wrap(I16PTy), wrap(Aliasee), "a");
+  ASSERT_EQ(unwrap<GlobalAlias>(AliasRef)->getAliasee(), Aliasee);
+}
+
 }  // end anonymous namespace
 }  // end namespace llvm
diff --git a/unittests/IR/DebugInfoTest.cpp b/unittests/IR/DebugInfoTest.cpp
index 1fa851e..3c6c786 100644
--- a/unittests/IR/DebugInfoTest.cpp
+++ b/unittests/IR/DebugInfoTest.cpp
@@ -65,4 +65,71 @@ TEST(DebugInfoTest, DIHeaderFieldIterator) {
   ASSERT_EQ("stuff", *++MAKE_FIELD_ITERATOR("\0stuff"));
 }
 
+TEST(DIDescriptorTest, getFlag) {
+  // Some valid flags.
+  EXPECT_EQ(DIDescriptor::FlagPublic, DIDescriptor::getFlag("DIFlagPublic"));
+  EXPECT_EQ(DIDescriptor::FlagProtected,
+            DIDescriptor::getFlag("DIFlagProtected"));
+  EXPECT_EQ(DIDescriptor::FlagPrivate, DIDescriptor::getFlag("DIFlagPrivate"));
+  EXPECT_EQ(DIDescriptor::FlagVector, DIDescriptor::getFlag("DIFlagVector"));
+  EXPECT_EQ(DIDescriptor::FlagRValueReference,
+            DIDescriptor::getFlag("DIFlagRValueReference"));
+
+  // FlagAccessibility shouldn't work.
+  EXPECT_EQ(0u, DIDescriptor::getFlag("DIFlagAccessibility"));
+
+  // Some other invalid strings.
+  EXPECT_EQ(0u, DIDescriptor::getFlag("FlagVector"));
+  EXPECT_EQ(0u, DIDescriptor::getFlag("Vector"));
+  EXPECT_EQ(0u, DIDescriptor::getFlag("other things"));
+  EXPECT_EQ(0u, DIDescriptor::getFlag("DIFlagOther"));
+}
+
+TEST(DIDescriptorTest, getFlagString) {
+  // Some valid flags.
+  EXPECT_EQ(StringRef("DIFlagPublic"),
+            DIDescriptor::getFlagString(DIDescriptor::FlagPublic));
+  EXPECT_EQ(StringRef("DIFlagProtected"),
+            DIDescriptor::getFlagString(DIDescriptor::FlagProtected));
+  EXPECT_EQ(StringRef("DIFlagPrivate"),
+            DIDescriptor::getFlagString(DIDescriptor::FlagPrivate));
+  EXPECT_EQ(StringRef("DIFlagVector"),
+            DIDescriptor::getFlagString(DIDescriptor::FlagVector));
+  EXPECT_EQ(StringRef("DIFlagRValueReference"),
+            DIDescriptor::getFlagString(DIDescriptor::FlagRValueReference));
+
+  // FlagAccessibility actually equals FlagPublic.
+  EXPECT_EQ(StringRef("DIFlagPublic"),
+            DIDescriptor::getFlagString(DIDescriptor::FlagAccessibility));
+
+  // Some other invalid flags.
+  EXPECT_EQ(StringRef(), DIDescriptor::getFlagString(DIDescriptor::FlagPublic |
+                                                     DIDescriptor::FlagVector));
+  EXPECT_EQ(StringRef(),
+            DIDescriptor::getFlagString(DIDescriptor::FlagFwdDecl |
+                                        DIDescriptor::FlagArtificial));
+  EXPECT_EQ(StringRef(), DIDescriptor::getFlagString(0xffff));
+}
+
+TEST(DIDescriptorTest, splitFlags) {
+  // Some valid flags.
+#define CHECK_SPLIT(FLAGS, VECTOR, REMAINDER)                                  \
+  {                                                                            \
+    SmallVector<unsigned, 8> V;                                                \
+    EXPECT_EQ(REMAINDER, DIDescriptor::splitFlags(FLAGS, V));                  \
+    EXPECT_TRUE(makeArrayRef(V).equals VECTOR);                                \
+  }
+  CHECK_SPLIT(DIDescriptor::FlagPublic, (DIDescriptor::FlagPublic), 0u);
+  CHECK_SPLIT(DIDescriptor::FlagProtected, (DIDescriptor::FlagProtected), 0u);
+  CHECK_SPLIT(DIDescriptor::FlagPrivate, (DIDescriptor::FlagPrivate), 0u);
+  CHECK_SPLIT(DIDescriptor::FlagVector, (DIDescriptor::FlagVector), 0u);
+  CHECK_SPLIT(DIDescriptor::FlagRValueReference, (DIDescriptor::FlagRValueReference), 0u);
+  CHECK_SPLIT(DIDescriptor::FlagFwdDecl | DIDescriptor::FlagVector,
+              (DIDescriptor::FlagFwdDecl, DIDescriptor::FlagVector), 0u);
+  CHECK_SPLIT(0x100000u, (), 0x100000u);
+  CHECK_SPLIT(0x100000u | DIDescriptor::FlagVector, (DIDescriptor::FlagVector),
+              0x100000u);
+#undef CHECK_SPLIT
+}
+
 } // end namespace
diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp
index 6c43d6f..8d2dc41 100644
--- a/unittests/IR/DominatorTreeTest.cpp
+++ b/unittests/IR/DominatorTreeTest.cpp
@@ -13,7 +13,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 
@@ -218,7 +218,7 @@ namespace llvm {
     TEST(DominatorTree, Unreachable) {
       DPass *P = new DPass();
       std::unique_ptr<Module> M = makeLLVMModule(P);
-      PassManager Passes;
+      legacy::PassManager Passes;
       Passes.add(P);
       Passes.run(*M);
     }
diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp
index df5c840..08a729a 100644
--- a/unittests/IR/IRBuilderTest.cpp
+++ b/unittests/IR/IRBuilderTest.cpp
@@ -10,6 +10,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
@@ -287,5 +288,22 @@ TEST_F(IRBuilderTest, RAIIHelpersTest) {
   EXPECT_EQ(BB, Builder.GetInsertBlock());
 }
 
+TEST_F(IRBuilderTest, DIBuilder) {
+  IRBuilder<> Builder(BB);
+  DIBuilder DIB(*M);
+  auto File = DIB.createFile("F.CBL", "/");
+  auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74, "F.CBL", "/",
+                                  "llvm-cobol74", true, "", 0);
+  auto Type = DIB.createSubroutineType(File, DIB.getOrCreateTypeArray(None));
+  auto SP = DIB.createFunction(CU, "foo", "", File, 1, Type,
+                               false, true, 1, 0, true, F);
+  EXPECT_TRUE(SP.Verify());
+  AllocaInst *I = Builder.CreateAlloca(Builder.getInt8Ty());
+  auto BadScope = DIB.createLexicalBlockFile(DIDescriptor(), File, 0);
+  I->setDebugLoc(DebugLoc::get(2, 0, BadScope));
+  EXPECT_FALSE(SP.Verify());
+  DIB.finalize();
+}
+
 
 }
diff --git a/unittests/IR/LeakDetectorTest.cpp b/unittests/IR/LeakDetectorTest.cpp
deleted file mode 100644
index 94eed4c..0000000
--- a/unittests/IR/LeakDetectorTest.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-//===- LeakDetectorTest.cpp -----------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/IR/LeakDetector.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-
-namespace {
-
-#ifdef GTEST_HAS_DEATH_TEST
-#ifndef NDEBUG
-TEST(LeakDetector, Death1) {
-  LeakDetector::addGarbageObject((void*) 1);
-  LeakDetector::addGarbageObject((void*) 2);
-
-  EXPECT_DEATH(LeakDetector::addGarbageObject((void*) 1),
-               ".*Ts.count\\(o\\) == 0 && \"Object already in set!\"");
-  EXPECT_DEATH(LeakDetector::addGarbageObject((void*) 2),
-               "Cache != o && \"Object already in set!\"");
-}
-#endif
-#endif
-
-}
diff --git a/unittests/IR/LegacyPassManagerTest.cpp b/unittests/IR/LegacyPassManagerTest.cpp
index 4efc2f5..9cb9414 100644
--- a/unittests/IR/LegacyPassManagerTest.cpp
+++ b/unittests/IR/LegacyPassManagerTest.cpp
@@ -8,13 +8,12 @@
 //===----------------------------------------------------------------------===//
 //
 // This unit test exercises the legacy pass manager infrastructure. We use the
-// old names as well to ensure that the source-level compatibility wrapper
-// works for out-of-tree code that expects to include llvm/PassManager.h and
-// subclass the core pass classes.
+// old names as well to ensure that the source-level compatibility is preserved
+// where possible.
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -302,7 +301,7 @@ namespace llvm {
 
       mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0;
 
-      PassManager Passes;
+      legacy::PassManager Passes;
       Passes.add(new DataLayoutPass());
       Passes.add(mNDM2);
       Passes.add(mNDM);
@@ -326,7 +325,7 @@ namespace llvm {
 
       mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0;
 
-      PassManager Passes;
+      legacy::PassManager Passes;
       Passes.add(new DataLayoutPass());
       Passes.add(mNDM);
       Passes.add(mNDNM);
@@ -348,7 +347,7 @@ namespace llvm {
     void MemoryTestHelper(int run) {
       std::unique_ptr<Module> M(makeLLVMModule());
       T *P = new T();
-      PassManager Passes;
+      legacy::PassManager Passes;
       Passes.add(new DataLayoutPass());
       Passes.add(P);
       Passes.run(*M);
@@ -359,7 +358,7 @@ namespace llvm {
     void MemoryTestHelper(int run, int N) {
       Module *M = makeLLVMModule();
       T *P = new T();
-      PassManager Passes;
+      legacy::PassManager Passes;
       Passes.add(new DataLayoutPass());
       Passes.add(P);
       Passes.run(*M);
@@ -397,7 +396,7 @@ namespace llvm {
       {
         SCOPED_TRACE("Running OnTheFlyTest");
         struct OnTheFlyTest *O = new OnTheFlyTest();
-        PassManager Passes;
+        legacy::PassManager Passes;
         Passes.add(new DataLayoutPass());
         Passes.add(O);
         Passes.run(*M);
@@ -554,6 +553,6 @@ INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
 INITIALIZE_PASS_END(CGPass, "cgp","cgp", false, false)
 INITIALIZE_PASS(FPass, "fp","fp", false, false)
 INITIALIZE_PASS_BEGIN(LPass, "lp","lp", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(LPass, "lp","lp", false, false)
 INITIALIZE_PASS(BPass, "bp","bp", false, false)
diff --git a/unittests/IR/MDBuilderTest.cpp b/unittests/IR/MDBuilderTest.cpp
index fc4674e..ab2d34e 100644
--- a/unittests/IR/MDBuilderTest.cpp
+++ b/unittests/IR/MDBuilderTest.cpp
@@ -36,10 +36,10 @@ TEST_F(MDBuilderTest, createFPMath) {
   EXPECT_EQ(MD0, (MDNode *)nullptr);
   EXPECT_NE(MD1, (MDNode *)nullptr);
   EXPECT_EQ(MD1->getNumOperands(), 1U);
-  Value *Op = MD1->getOperand(0);
-  EXPECT_TRUE(isa<ConstantFP>(Op));
-  EXPECT_TRUE(Op->getType()->isFloatingPointTy());
-  ConstantFP *Val = cast<ConstantFP>(Op);
+  Metadata *Op = MD1->getOperand(0);
+  EXPECT_TRUE(mdconst::hasa<ConstantFP>(Op));
+  ConstantFP *Val = mdconst::extract<ConstantFP>(Op);
+  EXPECT_TRUE(Val->getType()->isFloatingPointTy());
   EXPECT_TRUE(Val->isExactlyValue(1.0));
 }
 TEST_F(MDBuilderTest, createRangeMetadata) {
@@ -50,10 +50,10 @@ TEST_F(MDBuilderTest, createRangeMetadata) {
   EXPECT_EQ(R0, (MDNode *)nullptr);
   EXPECT_NE(R1, (MDNode *)nullptr);
   EXPECT_EQ(R1->getNumOperands(), 2U);
-  EXPECT_TRUE(isa<ConstantInt>(R1->getOperand(0)));
-  EXPECT_TRUE(isa<ConstantInt>(R1->getOperand(1)));
-  ConstantInt *C0 = cast<ConstantInt>(R1->getOperand(0));
-  ConstantInt *C1 = cast<ConstantInt>(R1->getOperand(1));
+  EXPECT_TRUE(mdconst::hasa<ConstantInt>(R1->getOperand(0)));
+  EXPECT_TRUE(mdconst::hasa<ConstantInt>(R1->getOperand(1)));
+  ConstantInt *C0 = mdconst::extract<ConstantInt>(R1->getOperand(0));
+  ConstantInt *C1 = mdconst::extract<ConstantInt>(R1->getOperand(1));
   EXPECT_EQ(C0->getValue(), A);
   EXPECT_EQ(C1->getValue(), B);
 }
@@ -101,7 +101,8 @@ TEST_F(MDBuilderTest, createTBAANode) {
   EXPECT_EQ(N0->getOperand(1), R);
   EXPECT_EQ(N1->getOperand(1), R);
   EXPECT_EQ(N2->getOperand(1), R);
-  EXPECT_TRUE(isa<ConstantInt>(N2->getOperand(2)));
-  EXPECT_EQ(cast<ConstantInt>(N2->getOperand(2))->getZExtValue(), 1U);
+  EXPECT_TRUE(mdconst::hasa<ConstantInt>(N2->getOperand(2)));
+  EXPECT_EQ(mdconst::extract<ConstantInt>(N2->getOperand(2))->getZExtValue(),
+            1U);
 }
 }
diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp
index 4f7bd72..6f2372d 100644
--- a/unittests/IR/MetadataTest.cpp
+++ b/unittests/IR/MetadataTest.cpp
@@ -1,4 +1,4 @@
-//===- llvm/unittest/IR/Metadata.cpp - Metadata unit tests ----------------===//
+//===- unittests/IR/MetadataTest.cpp - Metadata unit tests ----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,22 +7,67 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/Metadata.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
-#include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include "gtest/gtest.h"
 using namespace llvm;
 
 namespace {
 
+TEST(ContextAndReplaceableUsesTest, FromContext) {
+  LLVMContext Context;
+  ContextAndReplaceableUses CRU(Context);
+  EXPECT_EQ(&Context, &CRU.getContext());
+  EXPECT_FALSE(CRU.hasReplaceableUses());
+  EXPECT_FALSE(CRU.getReplaceableUses());
+}
+
+TEST(ContextAndReplaceableUsesTest, FromReplaceableUses) {
+  LLVMContext Context;
+  ContextAndReplaceableUses CRU(make_unique<ReplaceableMetadataImpl>(Context));
+  EXPECT_EQ(&Context, &CRU.getContext());
+  EXPECT_TRUE(CRU.hasReplaceableUses());
+  EXPECT_TRUE(CRU.getReplaceableUses());
+}
+
+TEST(ContextAndReplaceableUsesTest, makeReplaceable) {
+  LLVMContext Context;
+  ContextAndReplaceableUses CRU(Context);
+  CRU.makeReplaceable(make_unique<ReplaceableMetadataImpl>(Context));
+  EXPECT_EQ(&Context, &CRU.getContext());
+  EXPECT_TRUE(CRU.hasReplaceableUses());
+  EXPECT_TRUE(CRU.getReplaceableUses());
+}
+
+TEST(ContextAndReplaceableUsesTest, takeReplaceableUses) {
+  LLVMContext Context;
+  auto ReplaceableUses = make_unique<ReplaceableMetadataImpl>(Context);
+  auto *Ptr = ReplaceableUses.get();
+  ContextAndReplaceableUses CRU(std::move(ReplaceableUses));
+  ReplaceableUses = CRU.takeReplaceableUses();
+  EXPECT_EQ(&Context, &CRU.getContext());
+  EXPECT_FALSE(CRU.hasReplaceableUses());
+  EXPECT_FALSE(CRU.getReplaceableUses());
+  EXPECT_EQ(Ptr, ReplaceableUses.get());
+}
+
 class MetadataTest : public testing::Test {
 protected:
   LLVMContext Context;
+  MDNode *getNode() { return MDNode::get(Context, None); }
+  MDNode *getNode(Metadata *MD) { return MDNode::get(Context, MD); }
+  MDNode *getNode(Metadata *MD1, Metadata *MD2) {
+    Metadata *MDs[] = {MD1, MD2};
+    return MDNode::get(Context, MDs);
+  }
 };
 typedef MetadataTest MDStringTest;
 
@@ -58,7 +103,7 @@ TEST_F(MDStringTest, PrintingSimple) {
   std::string Str;
   raw_string_ostream oss(Str);
   s->print(oss);
-  EXPECT_STREQ("metadata !\"testing 1 2 3\"", oss.str().c_str());
+  EXPECT_STREQ("!\"testing 1 2 3\"", oss.str().c_str());
 }
 
 // Test printing of MDString with non-printable characters.
@@ -68,7 +113,7 @@ TEST_F(MDStringTest, PrintingComplex) {
   std::string Str;
   raw_string_ostream oss(Str);
   s->print(oss);
-  EXPECT_STREQ("metadata !\"\\00\\0A\\22\\5C\\FF\"", oss.str().c_str());
+  EXPECT_STREQ("!\"\\00\\0A\\22\\5C\\FF\"", oss.str().c_str());
 }
 
 typedef MetadataTest MDNodeTest;
@@ -80,30 +125,27 @@ TEST_F(MDNodeTest, Simple) {
 
   MDString *s1 = MDString::get(Context, StringRef(&x[0], 3));
   MDString *s2 = MDString::get(Context, StringRef(&y[0], 3));
-  ConstantInt *CI = ConstantInt::get(getGlobalContext(), APInt(8, 0));
+  ConstantAsMetadata *CI = ConstantAsMetadata::get(
+      ConstantInt::get(getGlobalContext(), APInt(8, 0)));
 
-  std::vector<Value *> V;
+  std::vector<Metadata *> V;
   V.push_back(s1);
   V.push_back(CI);
   V.push_back(s2);
 
   MDNode *n1 = MDNode::get(Context, V);
-  Value *const c1 = n1;
+  Metadata *const c1 = n1;
   MDNode *n2 = MDNode::get(Context, c1);
-  Value *const c2 = n2;
+  Metadata *const c2 = n2;
   MDNode *n3 = MDNode::get(Context, V);
   MDNode *n4 = MDNode::getIfExists(Context, V);
   MDNode *n5 = MDNode::getIfExists(Context, c1);
   MDNode *n6 = MDNode::getIfExists(Context, c2);
   EXPECT_NE(n1, n2);
-#ifdef ENABLE_MDNODE_UNIQUING
   EXPECT_EQ(n1, n3);
-#else
-  (void) n3;
-#endif
   EXPECT_EQ(n4, n1);
   EXPECT_EQ(n5, n2);
-  EXPECT_EQ(n6, (Value*)nullptr);
+  EXPECT_EQ(n6, (Metadata *)nullptr);
 
   EXPECT_EQ(3u, n1->getNumOperands());
   EXPECT_EQ(s1, n1->getOperand(0));
@@ -118,22 +160,1713 @@ TEST_F(MDNodeTest, Delete) {
   Constant *C = ConstantInt::get(Type::getInt32Ty(getGlobalContext()), 1);
   Instruction *I = new BitCastInst(C, Type::getInt32Ty(getGlobalContext()));
 
-  Value *const V = I;
+  Metadata *const V = LocalAsMetadata::get(I);
   MDNode *n = MDNode::get(Context, V);
-  WeakVH wvh = n;
+  TrackingMDRef wvh(n);
 
   EXPECT_EQ(n, wvh);
 
   delete I;
 }
 
+TEST_F(MDNodeTest, SelfReference) {
+  // !0 = !{!0}
+  // !1 = !{!0}
+  {
+    auto Temp = MDNode::getTemporary(Context, None);
+    Metadata *Args[] = {Temp.get()};
+    MDNode *Self = MDNode::get(Context, Args);
+    Self->replaceOperandWith(0, Self);
+    ASSERT_EQ(Self, Self->getOperand(0));
+
+    // Self-references should be distinct, so MDNode::get() should grab a
+    // uniqued node that references Self, not Self.
+    Args[0] = Self;
+    MDNode *Ref1 = MDNode::get(Context, Args);
+    MDNode *Ref2 = MDNode::get(Context, Args);
+    EXPECT_NE(Self, Ref1);
+    EXPECT_EQ(Ref1, Ref2);
+  }
+
+  // !0 = !{!0, !{}}
+  // !1 = !{!0, !{}}
+  {
+    auto Temp = MDNode::getTemporary(Context, None);
+    Metadata *Args[] = {Temp.get(), MDNode::get(Context, None)};
+    MDNode *Self = MDNode::get(Context, Args);
+    Self->replaceOperandWith(0, Self);
+    ASSERT_EQ(Self, Self->getOperand(0));
+
+    // Self-references should be distinct, so MDNode::get() should grab a
+    // uniqued node that references Self, not Self itself.
+    Args[0] = Self;
+    MDNode *Ref1 = MDNode::get(Context, Args);
+    MDNode *Ref2 = MDNode::get(Context, Args);
+    EXPECT_NE(Self, Ref1);
+    EXPECT_EQ(Ref1, Ref2);
+  }
+}
+
+TEST_F(MDNodeTest, Print) {
+  Constant *C = ConstantInt::get(Type::getInt32Ty(Context), 7);
+  MDString *S = MDString::get(Context, "foo");
+  MDNode *N0 = getNode();
+  MDNode *N1 = getNode(N0);
+  MDNode *N2 = getNode(N0, N1);
+
+  Metadata *Args[] = {ConstantAsMetadata::get(C), S, nullptr, N0, N1, N2};
+  MDNode *N = MDNode::get(Context, Args);
+
+  std::string Expected;
+  {
+    raw_string_ostream OS(Expected);
+    OS << "!{";
+    C->printAsOperand(OS);
+    OS << ", ";
+    S->printAsOperand(OS);
+    OS << ", null";
+    MDNode *Nodes[] = {N0, N1, N2};
+    for (auto *Node : Nodes)
+      OS << ", <" << (void *)Node << ">";
+    OS << "}";
+  }
+
+  std::string Actual;
+  {
+    raw_string_ostream OS(Actual);
+    N->print(OS);
+  }
+
+  EXPECT_EQ(Expected, Actual);
+}
+
+TEST_F(MDNodeTest, NullOperand) {
+  // metadata !{}
+  MDNode *Empty = MDNode::get(Context, None);
+
+  // metadata !{metadata !{}}
+  Metadata *Ops[] = {Empty};
+  MDNode *N = MDNode::get(Context, Ops);
+  ASSERT_EQ(Empty, N->getOperand(0));
+
+  // metadata !{metadata !{}} => metadata !{null}
+  N->replaceOperandWith(0, nullptr);
+  ASSERT_EQ(nullptr, N->getOperand(0));
+
+  // metadata !{null}
+  Ops[0] = nullptr;
+  MDNode *NullOp = MDNode::get(Context, Ops);
+  ASSERT_EQ(nullptr, NullOp->getOperand(0));
+  EXPECT_EQ(N, NullOp);
+}
+
+TEST_F(MDNodeTest, DistinctOnUniquingCollision) {
+  // !{}
+  MDNode *Empty = MDNode::get(Context, None);
+  ASSERT_TRUE(Empty->isResolved());
+  EXPECT_FALSE(Empty->isDistinct());
+
+  // !{!{}}
+  Metadata *Wrapped1Ops[] = {Empty};
+  MDNode *Wrapped1 = MDNode::get(Context, Wrapped1Ops);
+  ASSERT_EQ(Empty, Wrapped1->getOperand(0));
+  ASSERT_TRUE(Wrapped1->isResolved());
+  EXPECT_FALSE(Wrapped1->isDistinct());
+
+  // !{!{!{}}}
+  Metadata *Wrapped2Ops[] = {Wrapped1};
+  MDNode *Wrapped2 = MDNode::get(Context, Wrapped2Ops);
+  ASSERT_EQ(Wrapped1, Wrapped2->getOperand(0));
+  ASSERT_TRUE(Wrapped2->isResolved());
+  EXPECT_FALSE(Wrapped2->isDistinct());
+
+  // !{!{!{}}} => !{!{}}
+  Wrapped2->replaceOperandWith(0, Empty);
+  ASSERT_EQ(Empty, Wrapped2->getOperand(0));
+  EXPECT_TRUE(Wrapped2->isDistinct());
+  EXPECT_FALSE(Wrapped1->isDistinct());
+}
+
+TEST_F(MDNodeTest, getDistinct) {
+  // !{}
+  MDNode *Empty = MDNode::get(Context, None);
+  ASSERT_TRUE(Empty->isResolved());
+  ASSERT_FALSE(Empty->isDistinct());
+  ASSERT_EQ(Empty, MDNode::get(Context, None));
+
+  // distinct !{}
+  MDNode *Distinct1 = MDNode::getDistinct(Context, None);
+  MDNode *Distinct2 = MDNode::getDistinct(Context, None);
+  EXPECT_TRUE(Distinct1->isResolved());
+  EXPECT_TRUE(Distinct2->isDistinct());
+  EXPECT_NE(Empty, Distinct1);
+  EXPECT_NE(Empty, Distinct2);
+  EXPECT_NE(Distinct1, Distinct2);
+
+  // !{}
+  ASSERT_EQ(Empty, MDNode::get(Context, None));
+}
+
+TEST_F(MDNodeTest, isUniqued) {
+  MDNode *U = MDTuple::get(Context, None);
+  MDNode *D = MDTuple::getDistinct(Context, None);
+  auto T = MDTuple::getTemporary(Context, None);
+  EXPECT_TRUE(U->isUniqued());
+  EXPECT_FALSE(D->isUniqued());
+  EXPECT_FALSE(T->isUniqued());
+}
+
+TEST_F(MDNodeTest, isDistinct) {
+  MDNode *U = MDTuple::get(Context, None);
+  MDNode *D = MDTuple::getDistinct(Context, None);
+  auto T = MDTuple::getTemporary(Context, None);
+  EXPECT_FALSE(U->isDistinct());
+  EXPECT_TRUE(D->isDistinct());
+  EXPECT_FALSE(T->isDistinct());
+}
+
+TEST_F(MDNodeTest, isTemporary) {
+  MDNode *U = MDTuple::get(Context, None);
+  MDNode *D = MDTuple::getDistinct(Context, None);
+  auto T = MDTuple::getTemporary(Context, None);
+  EXPECT_FALSE(U->isTemporary());
+  EXPECT_FALSE(D->isTemporary());
+  EXPECT_TRUE(T->isTemporary());
+}
+
+TEST_F(MDNodeTest, getDistinctWithUnresolvedOperands) {
+  // temporary !{}
+  auto Temp = MDTuple::getTemporary(Context, None);
+  ASSERT_FALSE(Temp->isResolved());
+
+  // distinct !{temporary !{}}
+  Metadata *Ops[] = {Temp.get()};
+  MDNode *Distinct = MDNode::getDistinct(Context, Ops);
+  EXPECT_TRUE(Distinct->isResolved());
+  EXPECT_EQ(Temp.get(), Distinct->getOperand(0));
+
+  // temporary !{} => !{}
+  MDNode *Empty = MDNode::get(Context, None);
+  Temp->replaceAllUsesWith(Empty);
+  EXPECT_EQ(Empty, Distinct->getOperand(0));
+}
+
+TEST_F(MDNodeTest, handleChangedOperandRecursion) {
+  // !0 = !{}
+  MDNode *N0 = MDNode::get(Context, None);
+
+  // !1 = !{!3, null}
+  auto Temp3 = MDTuple::getTemporary(Context, None);
+  Metadata *Ops1[] = {Temp3.get(), nullptr};
+  MDNode *N1 = MDNode::get(Context, Ops1);
+
+  // !2 = !{!3, !0}
+  Metadata *Ops2[] = {Temp3.get(), N0};
+  MDNode *N2 = MDNode::get(Context, Ops2);
+
+  // !3 = !{!2}
+  Metadata *Ops3[] = {N2};
+  MDNode *N3 = MDNode::get(Context, Ops3);
+  Temp3->replaceAllUsesWith(N3);
+
+  // !4 = !{!1}
+  Metadata *Ops4[] = {N1};
+  MDNode *N4 = MDNode::get(Context, Ops4);
+
+  // Confirm that the cycle prevented RAUW from getting dropped.
+  EXPECT_TRUE(N0->isResolved());
+  EXPECT_FALSE(N1->isResolved());
+  EXPECT_FALSE(N2->isResolved());
+  EXPECT_FALSE(N3->isResolved());
+  EXPECT_FALSE(N4->isResolved());
+
+  // Create a couple of distinct nodes to observe what's going on.
+  //
+  // !5 = distinct !{!2}
+  // !6 = distinct !{!3}
+  Metadata *Ops5[] = {N2};
+  MDNode *N5 = MDNode::getDistinct(Context, Ops5);
+  Metadata *Ops6[] = {N3};
+  MDNode *N6 = MDNode::getDistinct(Context, Ops6);
+
+  // Mutate !2 to look like !1, causing a uniquing collision (and an RAUW).
+  // This will ripple up, with !3 colliding with !4, and RAUWing.  Since !2
+  // references !3, this can cause a re-entry of handleChangedOperand() when !3
+  // is not ready for it.
+  //
+  // !2->replaceOperandWith(1, nullptr)
+  // !2: !{!3, !0} => !{!3, null}
+  // !2->replaceAllUsesWith(!1)
+  // !3: !{!2] => !{!1}
+  // !3->replaceAllUsesWith(!4)
+  N2->replaceOperandWith(1, nullptr);
+
+  // If all has gone well, N2 and N3 will have been RAUW'ed and deleted from
+  // under us.  Just check that the other nodes are sane.
+  //
+  // !1 = !{!4, null}
+  // !4 = !{!1}
+  // !5 = distinct !{!1}
+  // !6 = distinct !{!4}
+  EXPECT_EQ(N4, N1->getOperand(0));
+  EXPECT_EQ(N1, N4->getOperand(0));
+  EXPECT_EQ(N1, N5->getOperand(0));
+  EXPECT_EQ(N4, N6->getOperand(0));
+}
+
+TEST_F(MDNodeTest, replaceResolvedOperand) {
+  // Check code for replacing one resolved operand with another.  If doing this
+  // directly (via replaceOperandWith()) becomes illegal, change the operand to
+  // a global value that gets RAUW'ed.
+  //
+  // Use a temporary node to keep N from being resolved.
+  auto Temp = MDTuple::getTemporary(Context, None);
+  Metadata *Ops[] = {nullptr, Temp.get()};
+
+  MDNode *Empty = MDTuple::get(Context, ArrayRef<Metadata *>());
+  MDNode *N = MDTuple::get(Context, Ops);
+  EXPECT_EQ(nullptr, N->getOperand(0));
+  ASSERT_FALSE(N->isResolved());
+
+  // Check code for replacing resolved nodes.
+  N->replaceOperandWith(0, Empty);
+  EXPECT_EQ(Empty, N->getOperand(0));
+
+  // Check code for adding another unresolved operand.
+  N->replaceOperandWith(0, Temp.get());
+  EXPECT_EQ(Temp.get(), N->getOperand(0));
+
+  // Remove the references to Temp; required for teardown.
+  Temp->replaceAllUsesWith(nullptr);
+}
+
+TEST_F(MDNodeTest, replaceWithUniqued) {
+  auto *Empty = MDTuple::get(Context, None);
+  MDTuple *FirstUniqued;
+  {
+    Metadata *Ops[] = {Empty};
+    auto Temp = MDTuple::getTemporary(Context, Ops);
+    EXPECT_TRUE(Temp->isTemporary());
+
+    // Don't expect a collision.
+    auto *Current = Temp.get();
+    FirstUniqued = MDNode::replaceWithUniqued(std::move(Temp));
+    EXPECT_TRUE(FirstUniqued->isUniqued());
+    EXPECT_TRUE(FirstUniqued->isResolved());
+    EXPECT_EQ(Current, FirstUniqued);
+  }
+  {
+    Metadata *Ops[] = {Empty};
+    auto Temp = MDTuple::getTemporary(Context, Ops);
+    EXPECT_TRUE(Temp->isTemporary());
+
+    // Should collide with Uniqued above this time.
+    auto *Uniqued = MDNode::replaceWithUniqued(std::move(Temp));
+    EXPECT_TRUE(Uniqued->isUniqued());
+    EXPECT_TRUE(Uniqued->isResolved());
+    EXPECT_EQ(FirstUniqued, Uniqued);
+  }
+  {
+    auto Unresolved = MDTuple::getTemporary(Context, None);
+    Metadata *Ops[] = {Unresolved.get()};
+    auto Temp = MDTuple::getTemporary(Context, Ops);
+    EXPECT_TRUE(Temp->isTemporary());
+
+    // Shouldn't be resolved.
+    auto *Uniqued = MDNode::replaceWithUniqued(std::move(Temp));
+    EXPECT_TRUE(Uniqued->isUniqued());
+    EXPECT_FALSE(Uniqued->isResolved());
+
+    // Should be a different node.
+    EXPECT_NE(FirstUniqued, Uniqued);
+
+    // Should resolve when we update its node (note: be careful to avoid a
+    // collision with any other nodes above).
+    Uniqued->replaceOperandWith(0, nullptr);
+    EXPECT_TRUE(Uniqued->isResolved());
+  }
+}
+
+TEST_F(MDNodeTest, replaceWithDistinct) {
+  {
+    auto *Empty = MDTuple::get(Context, None);
+    Metadata *Ops[] = {Empty};
+    auto Temp = MDTuple::getTemporary(Context, Ops);
+    EXPECT_TRUE(Temp->isTemporary());
+
+    // Don't expect a collision.
+    auto *Current = Temp.get();
+    auto *Distinct = MDNode::replaceWithDistinct(std::move(Temp));
+    EXPECT_TRUE(Distinct->isDistinct());
+    EXPECT_TRUE(Distinct->isResolved());
+    EXPECT_EQ(Current, Distinct);
+  }
+  {
+    auto Unresolved = MDTuple::getTemporary(Context, None);
+    Metadata *Ops[] = {Unresolved.get()};
+    auto Temp = MDTuple::getTemporary(Context, Ops);
+    EXPECT_TRUE(Temp->isTemporary());
+
+    // Don't expect a collision.
+    auto *Current = Temp.get();
+    auto *Distinct = MDNode::replaceWithDistinct(std::move(Temp));
+    EXPECT_TRUE(Distinct->isDistinct());
+    EXPECT_TRUE(Distinct->isResolved());
+    EXPECT_EQ(Current, Distinct);
+
+    // Cleanup; required for teardown.
+    Unresolved->replaceAllUsesWith(nullptr);
+  }
+}
+
+TEST_F(MDNodeTest, replaceWithPermanent) {
+  Metadata *Ops[] = {nullptr};
+  auto Temp = MDTuple::getTemporary(Context, Ops);
+  auto *T = Temp.get();
+
+  // U is a normal, uniqued node that references T.
+  auto *U = MDTuple::get(Context, T);
+  EXPECT_TRUE(U->isUniqued());
+
+  // Make Temp self-referencing.
+  Temp->replaceOperandWith(0, T);
+
+  // Try to uniquify Temp.  This should, despite the name in the API, give a
+  // 'distinct' node, since self-references aren't allowed to be uniqued.
+  //
+  // Since it's distinct, N should have the same address as when it was a
+  // temporary (i.e., be equal to T not U).
+  auto *N = MDNode::replaceWithPermanent(std::move(Temp));
+  EXPECT_EQ(N, T);
+  EXPECT_TRUE(N->isDistinct());
+
+  // U should be the canonical unique node with N as the argument.
+  EXPECT_EQ(U, MDTuple::get(Context, N));
+  EXPECT_TRUE(U->isUniqued());
+
+  // This temporary should collide with U when replaced, but it should still be
+  // uniqued.
+  EXPECT_EQ(U, MDNode::replaceWithPermanent(MDTuple::getTemporary(Context, N)));
+  EXPECT_TRUE(U->isUniqued());
+
+  // This temporary should become a new uniqued node.
+  auto Temp2 = MDTuple::getTemporary(Context, U);
+  auto *V = Temp2.get();
+  EXPECT_EQ(V, MDNode::replaceWithPermanent(std::move(Temp2)));
+  EXPECT_TRUE(V->isUniqued());
+  EXPECT_EQ(U, V->getOperand(0));
+}
+
+TEST_F(MDNodeTest, deleteTemporaryWithTrackingRef) {
+  TrackingMDRef Ref;
+  EXPECT_EQ(nullptr, Ref.get());
+  {
+    auto Temp = MDTuple::getTemporary(Context, None);
+    Ref.reset(Temp.get());
+    EXPECT_EQ(Temp.get(), Ref.get());
+  }
+  EXPECT_EQ(nullptr, Ref.get());
+}
+
+typedef MetadataTest MDLocationTest;
+
+TEST_F(MDLocationTest, Overflow) {
+  MDNode *N = MDNode::get(Context, None);
+  {
+    MDLocation *L = MDLocation::get(Context, 2, 7, N);
+    EXPECT_EQ(2u, L->getLine());
+    EXPECT_EQ(7u, L->getColumn());
+  }
+  unsigned U16 = 1u << 16;
+  {
+    MDLocation *L = MDLocation::get(Context, UINT32_MAX, U16 - 1, N);
+    EXPECT_EQ(UINT32_MAX, L->getLine());
+    EXPECT_EQ(U16 - 1, L->getColumn());
+  }
+  {
+    MDLocation *L = MDLocation::get(Context, UINT32_MAX, U16, N);
+    EXPECT_EQ(UINT32_MAX, L->getLine());
+    EXPECT_EQ(0u, L->getColumn());
+  }
+  {
+    MDLocation *L = MDLocation::get(Context, UINT32_MAX, U16 + 1, N);
+    EXPECT_EQ(UINT32_MAX, L->getLine());
+    EXPECT_EQ(0u, L->getColumn());
+  }
+}
+
+TEST_F(MDLocationTest, getDistinct) {
+  MDNode *N = MDNode::get(Context, None);
+  MDLocation *L0 = MDLocation::getDistinct(Context, 2, 7, N);
+  EXPECT_TRUE(L0->isDistinct());
+  MDLocation *L1 = MDLocation::get(Context, 2, 7, N);
+  EXPECT_FALSE(L1->isDistinct());
+  EXPECT_EQ(L1, MDLocation::get(Context, 2, 7, N));
+}
+
+TEST_F(MDLocationTest, getTemporary) {
+  MDNode *N = MDNode::get(Context, None);
+  auto L = MDLocation::getTemporary(Context, 2, 7, N);
+  EXPECT_TRUE(L->isTemporary());
+  EXPECT_FALSE(L->isResolved());
+}
+
+typedef MetadataTest GenericDebugNodeTest;
+
+TEST_F(GenericDebugNodeTest, get) {
+  StringRef Header = "header";
+  auto *Empty = MDNode::get(Context, None);
+  Metadata *Ops1[] = {Empty};
+  auto *N = GenericDebugNode::get(Context, 15, Header, Ops1);
+  EXPECT_EQ(15u, N->getTag());
+  EXPECT_EQ(2u, N->getNumOperands());
+  EXPECT_EQ(Header, N->getHeader());
+  EXPECT_EQ(MDString::get(Context, Header), N->getOperand(0));
+  EXPECT_EQ(1u, N->getNumDwarfOperands());
+  EXPECT_EQ(Empty, N->getDwarfOperand(0));
+  EXPECT_EQ(Empty, N->getOperand(1));
+  ASSERT_TRUE(N->isUniqued());
+
+  EXPECT_EQ(N, GenericDebugNode::get(Context, 15, Header, Ops1));
+
+  N->replaceOperandWith(1, nullptr);
+  EXPECT_EQ(15u, N->getTag());
+  EXPECT_EQ(Header, N->getHeader());
+  EXPECT_EQ(nullptr, N->getDwarfOperand(0));
+  ASSERT_TRUE(N->isUniqued());
+
+  Metadata *Ops2[] = {nullptr};
+  EXPECT_EQ(N, GenericDebugNode::get(Context, 15, Header, Ops2));
+
+  N->replaceDwarfOperandWith(0, Empty);
+  EXPECT_EQ(15u, N->getTag());
+  EXPECT_EQ(Header, N->getHeader());
+  EXPECT_EQ(Empty, N->getDwarfOperand(0));
+  ASSERT_TRUE(N->isUniqued());
+  EXPECT_EQ(N, GenericDebugNode::get(Context, 15, Header, Ops1));
+
+  TempGenericDebugNode Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+TEST_F(GenericDebugNodeTest, getEmptyHeader) {
+  // Canonicalize !"" to null.
+  auto *N = GenericDebugNode::get(Context, 15, StringRef(), None);
+  EXPECT_EQ(StringRef(), N->getHeader());
+  EXPECT_EQ(nullptr, N->getOperand(0));
+}
+
+typedef MetadataTest MDSubrangeTest;
+
+TEST_F(MDSubrangeTest, get) {
+  auto *N = MDSubrange::get(Context, 5, 7);
+  EXPECT_EQ(dwarf::DW_TAG_subrange_type, N->getTag());
+  EXPECT_EQ(5, N->getCount());
+  EXPECT_EQ(7, N->getLo());
+  EXPECT_EQ(N, MDSubrange::get(Context, 5, 7));
+  EXPECT_EQ(MDSubrange::get(Context, 5, 0), MDSubrange::get(Context, 5));
+
+  TempMDSubrange Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+TEST_F(MDSubrangeTest, getEmptyArray) {
+  auto *N = MDSubrange::get(Context, -1, 0);
+  EXPECT_EQ(dwarf::DW_TAG_subrange_type, N->getTag());
+  EXPECT_EQ(-1, N->getCount());
+  EXPECT_EQ(0, N->getLo());
+  EXPECT_EQ(N, MDSubrange::get(Context, -1, 0));
+}
+
+typedef MetadataTest MDEnumeratorTest;
+
+TEST_F(MDEnumeratorTest, get) {
+  auto *N = MDEnumerator::get(Context, 7, "name");
+  EXPECT_EQ(dwarf::DW_TAG_enumerator, N->getTag());
+  EXPECT_EQ(7, N->getValue());
+  EXPECT_EQ("name", N->getName());
+  EXPECT_EQ(N, MDEnumerator::get(Context, 7, "name"));
+
+  EXPECT_NE(N, MDEnumerator::get(Context, 8, "name"));
+  EXPECT_NE(N, MDEnumerator::get(Context, 7, "nam"));
+
+  TempMDEnumerator Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+typedef MetadataTest MDBasicTypeTest;
+
+TEST_F(MDBasicTypeTest, get) {
+  auto *N =
+      MDBasicType::get(Context, dwarf::DW_TAG_base_type, "special", 33, 26, 7);
+  EXPECT_EQ(dwarf::DW_TAG_base_type, N->getTag());
+  EXPECT_EQ("special", N->getName());
+  EXPECT_EQ(33u, N->getSizeInBits());
+  EXPECT_EQ(26u, N->getAlignInBits());
+  EXPECT_EQ(7u, N->getEncoding());
+  EXPECT_EQ(0u, N->getLine());
+  EXPECT_EQ(N, MDBasicType::get(Context, dwarf::DW_TAG_base_type, "special", 33,
+                                26, 7));
+
+  EXPECT_NE(N, MDBasicType::get(Context, dwarf::DW_TAG_unspecified_type,
+                                "special", 33, 26, 7));
+  EXPECT_NE(N,
+            MDBasicType::get(Context, dwarf::DW_TAG_base_type, "s", 33, 26, 7));
+  EXPECT_NE(N, MDBasicType::get(Context, dwarf::DW_TAG_base_type, "special", 32,
+                                26, 7));
+  EXPECT_NE(N, MDBasicType::get(Context, dwarf::DW_TAG_base_type, "special", 33,
+                                25, 7));
+  EXPECT_NE(N, MDBasicType::get(Context, dwarf::DW_TAG_base_type, "special", 33,
+                                26, 6));
+
+  TempMDBasicType Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+TEST_F(MDBasicTypeTest, getWithLargeValues) {
+  auto *N = MDBasicType::get(Context, dwarf::DW_TAG_base_type, "special",
+                             UINT64_MAX, UINT64_MAX - 1, 7);
+  EXPECT_EQ(UINT64_MAX, N->getSizeInBits());
+  EXPECT_EQ(UINT64_MAX - 1, N->getAlignInBits());
+}
+
+typedef MetadataTest MDDerivedTypeTest;
+
+TEST_F(MDDerivedTypeTest, get) {
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  Metadata *BaseType = MDTuple::getDistinct(Context, None);
+  Metadata *ExtraData = MDTuple::getDistinct(Context, None);
+
+  auto *N = MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "something",
+                               File, 1, Scope, BaseType, 2, 3, 4, 5, ExtraData);
+  EXPECT_EQ(dwarf::DW_TAG_pointer_type, N->getTag());
+  EXPECT_EQ("something", N->getName());
+  EXPECT_EQ(File, N->getFile());
+  EXPECT_EQ(1u, N->getLine());
+  EXPECT_EQ(Scope, N->getScope());
+  EXPECT_EQ(BaseType, N->getBaseType());
+  EXPECT_EQ(2u, N->getSizeInBits());
+  EXPECT_EQ(3u, N->getAlignInBits());
+  EXPECT_EQ(4u, N->getOffsetInBits());
+  EXPECT_EQ(5u, N->getFlags());
+  EXPECT_EQ(ExtraData, N->getExtraData());
+  EXPECT_EQ(N, MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
+                                  "something", File, 1, Scope, BaseType, 2, 3,
+                                  4, 5, ExtraData));
+
+  EXPECT_NE(N, MDDerivedType::get(Context, dwarf::DW_TAG_reference_type,
+                                  "something", File, 1, Scope, BaseType, 2, 3,
+                                  4, 5, ExtraData));
+  EXPECT_NE(N, MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "else",
+                                  File, 1, Scope, BaseType, 2, 3, 4, 5,
+                                  ExtraData));
+  EXPECT_NE(N, MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
+                                  "something", Scope, 1, Scope, BaseType, 2, 3,
+                                  4, 5, ExtraData));
+  EXPECT_NE(N, MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
+                                  "something", File, 2, Scope, BaseType, 2, 3,
+                                  4, 5, ExtraData));
+  EXPECT_NE(N,
+            MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "something",
+                               File, 1, File, BaseType, 2, 3, 4, 5, ExtraData));
+  EXPECT_NE(N,
+            MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "something",
+                               File, 1, Scope, File, 2, 3, 4, 5, ExtraData));
+  EXPECT_NE(N, MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
+                                  "something", File, 1, Scope, BaseType, 3, 3,
+                                  4, 5, ExtraData));
+  EXPECT_NE(N, MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
+                                  "something", File, 1, Scope, BaseType, 2, 2,
+                                  4, 5, ExtraData));
+  EXPECT_NE(N, MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
+                                  "something", File, 1, Scope, BaseType, 2, 3,
+                                  5, 5, ExtraData));
+  EXPECT_NE(N, MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
+                                  "something", File, 1, Scope, BaseType, 2, 3,
+                                  4, 4, ExtraData));
+  EXPECT_NE(N,
+            MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "something",
+                               File, 1, Scope, BaseType, 2, 3, 4, 5, File));
+
+  TempMDDerivedType Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+TEST_F(MDDerivedTypeTest, getWithLargeValues) {
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  Metadata *BaseType = MDTuple::getDistinct(Context, None);
+  Metadata *ExtraData = MDTuple::getDistinct(Context, None);
+
+  auto *N = MDDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "something",
+                               File, 1, Scope, BaseType, UINT64_MAX,
+                               UINT64_MAX - 1, UINT64_MAX - 2, 5, ExtraData);
+  EXPECT_EQ(UINT64_MAX, N->getSizeInBits());
+  EXPECT_EQ(UINT64_MAX - 1, N->getAlignInBits());
+  EXPECT_EQ(UINT64_MAX - 2, N->getOffsetInBits());
+}
+
+typedef MetadataTest MDCompositeTypeTest;
+
+TEST_F(MDCompositeTypeTest, get) {
+  unsigned Tag = dwarf::DW_TAG_structure_type;
+  StringRef Name = "some name";
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  unsigned Line = 1;
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  Metadata *BaseType = MDTuple::getDistinct(Context, None);
+  uint64_t SizeInBits = 2;
+  uint64_t AlignInBits = 3;
+  uint64_t OffsetInBits = 4;
+  unsigned Flags = 5;
+  Metadata *Elements = MDTuple::getDistinct(Context, None);
+  unsigned RuntimeLang = 6;
+  Metadata *VTableHolder = MDTuple::getDistinct(Context, None);
+  Metadata *TemplateParams = MDTuple::getDistinct(Context, None);
+  StringRef Identifier = "some id";
+
+  auto *N = MDCompositeType::get(Context, Tag, Name, File, Line, Scope,
+                                 BaseType, SizeInBits, AlignInBits,
+                                 OffsetInBits, Flags, Elements, RuntimeLang,
+                                 VTableHolder, TemplateParams, Identifier);
+  EXPECT_EQ(Tag, N->getTag());
+  EXPECT_EQ(Name, N->getName());
+  EXPECT_EQ(File, N->getFile());
+  EXPECT_EQ(Line, N->getLine());
+  EXPECT_EQ(Scope, N->getScope());
+  EXPECT_EQ(BaseType, N->getBaseType());
+  EXPECT_EQ(SizeInBits, N->getSizeInBits());
+  EXPECT_EQ(AlignInBits, N->getAlignInBits());
+  EXPECT_EQ(OffsetInBits, N->getOffsetInBits());
+  EXPECT_EQ(Flags, N->getFlags());
+  EXPECT_EQ(Elements, N->getElements());
+  EXPECT_EQ(RuntimeLang, N->getRuntimeLang());
+  EXPECT_EQ(VTableHolder, N->getVTableHolder());
+  EXPECT_EQ(TemplateParams, N->getTemplateParams());
+  EXPECT_EQ(Identifier, N->getIdentifier());
+
+  EXPECT_EQ(N, MDCompositeType::get(Context, Tag, Name, File, Line, Scope,
+                                    BaseType, SizeInBits, AlignInBits,
+                                    OffsetInBits, Flags, Elements, RuntimeLang,
+                                    VTableHolder, TemplateParams, Identifier));
+
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag + 1, Name, File, Line, Scope,
+                                    BaseType, SizeInBits, AlignInBits,
+                                    OffsetInBits, Flags, Elements, RuntimeLang,
+                                    VTableHolder, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag, "abc", File, Line, Scope,
+                                    BaseType, SizeInBits, AlignInBits,
+                                    OffsetInBits, Flags, Elements, RuntimeLang,
+                                    VTableHolder, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag, Name, Scope, Line, Scope,
+                                    BaseType, SizeInBits, AlignInBits,
+                                    OffsetInBits, Flags, Elements, RuntimeLang,
+                                    VTableHolder, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag, Name, File, Line + 1, Scope,
+                                    BaseType, SizeInBits, AlignInBits,
+                                    OffsetInBits, Flags, Elements, RuntimeLang,
+                                    VTableHolder, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag, Name, File, Line, File,
+                                    BaseType, SizeInBits, AlignInBits,
+                                    OffsetInBits, Flags, Elements, RuntimeLang,
+                                    VTableHolder, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag, Name, File, Line, Scope, File,
+                                    SizeInBits, AlignInBits, OffsetInBits,
+                                    Flags, Elements, RuntimeLang, VTableHolder,
+                                    TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag, Name, File, Line, Scope,
+                                    BaseType, SizeInBits + 1, AlignInBits,
+                                    OffsetInBits, Flags, Elements, RuntimeLang,
+                                    VTableHolder, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag, Name, File, Line, Scope,
+                                    BaseType, SizeInBits, AlignInBits + 1,
+                                    OffsetInBits, Flags, Elements, RuntimeLang,
+                                    VTableHolder, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(
+                   Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                   AlignInBits, OffsetInBits + 1, Flags, Elements, RuntimeLang,
+                   VTableHolder, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(
+                   Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                   AlignInBits, OffsetInBits, Flags + 1, Elements, RuntimeLang,
+                   VTableHolder, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag, Name, File, Line, Scope,
+                                    BaseType, SizeInBits, AlignInBits,
+                                    OffsetInBits, Flags, File, RuntimeLang,
+                                    VTableHolder, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(
+                   Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                   AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang + 1,
+                   VTableHolder, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag, Name, File, Line, Scope,
+                                    BaseType, SizeInBits, AlignInBits,
+                                    OffsetInBits, Flags, Elements, RuntimeLang,
+                                    File, TemplateParams, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag, Name, File, Line, Scope,
+                                    BaseType, SizeInBits, AlignInBits,
+                                    OffsetInBits, Flags, Elements, RuntimeLang,
+                                    VTableHolder, File, Identifier));
+  EXPECT_NE(N, MDCompositeType::get(Context, Tag, Name, File, Line, Scope,
+                                    BaseType, SizeInBits, AlignInBits,
+                                    OffsetInBits, Flags, Elements, RuntimeLang,
+                                    VTableHolder, TemplateParams, "other"));
+
+  // Be sure that missing identifiers get null pointers.
+  EXPECT_FALSE(MDCompositeType::get(
+                   Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                   AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
+                   VTableHolder, TemplateParams, "")->getRawIdentifier());
+  EXPECT_FALSE(MDCompositeType::get(
+                   Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                   AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
+                   VTableHolder, TemplateParams)->getRawIdentifier());
+
+  TempMDCompositeType Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+TEST_F(MDCompositeTypeTest, getWithLargeValues) {
+  unsigned Tag = dwarf::DW_TAG_structure_type;
+  StringRef Name = "some name";
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  unsigned Line = 1;
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  Metadata *BaseType = MDTuple::getDistinct(Context, None);
+  uint64_t SizeInBits = UINT64_MAX;
+  uint64_t AlignInBits = UINT64_MAX - 1;
+  uint64_t OffsetInBits = UINT64_MAX - 2;
+  unsigned Flags = 5;
+  Metadata *Elements = MDTuple::getDistinct(Context, None);
+  unsigned RuntimeLang = 6;
+  Metadata *VTableHolder = MDTuple::getDistinct(Context, None);
+  Metadata *TemplateParams = MDTuple::getDistinct(Context, None);
+  StringRef Identifier = "some id";
+
+  auto *N = MDCompositeType::get(Context, Tag, Name, File, Line, Scope,
+                                 BaseType, SizeInBits, AlignInBits,
+                                 OffsetInBits, Flags, Elements, RuntimeLang,
+                                 VTableHolder, TemplateParams, Identifier);
+  EXPECT_EQ(SizeInBits, N->getSizeInBits());
+  EXPECT_EQ(AlignInBits, N->getAlignInBits());
+  EXPECT_EQ(OffsetInBits, N->getOffsetInBits());
+}
+
+TEST_F(MDCompositeTypeTest, replaceOperands) {
+  unsigned Tag = dwarf::DW_TAG_structure_type;
+  StringRef Name = "some name";
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  unsigned Line = 1;
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  Metadata *BaseType = MDTuple::getDistinct(Context, None);
+  uint64_t SizeInBits = 2;
+  uint64_t AlignInBits = 3;
+  uint64_t OffsetInBits = 4;
+  unsigned Flags = 5;
+  unsigned RuntimeLang = 6;
+  StringRef Identifier = "some id";
+
+  auto *N = MDCompositeType::get(Context, Tag, Name, File, Line, Scope,
+                                 BaseType, SizeInBits, AlignInBits,
+                                 OffsetInBits, Flags, nullptr, RuntimeLang,
+                                 nullptr, nullptr, Identifier);
+
+  auto *Elements = MDTuple::getDistinct(Context, None);
+  EXPECT_EQ(nullptr, N->getElements());
+  N->replaceElements(Elements);
+  EXPECT_EQ(Elements, N->getElements());
+  N->replaceElements(nullptr);
+  EXPECT_EQ(nullptr, N->getElements());
+
+  auto *VTableHolder = MDTuple::getDistinct(Context, None);
+  EXPECT_EQ(nullptr, N->getVTableHolder());
+  N->replaceVTableHolder(VTableHolder);
+  EXPECT_EQ(VTableHolder, N->getVTableHolder());
+  N->replaceVTableHolder(nullptr);
+  EXPECT_EQ(nullptr, N->getVTableHolder());
+
+  auto *TemplateParams = MDTuple::getDistinct(Context, None);
+  EXPECT_EQ(nullptr, N->getTemplateParams());
+  N->replaceTemplateParams(TemplateParams);
+  EXPECT_EQ(TemplateParams, N->getTemplateParams());
+  N->replaceTemplateParams(nullptr);
+  EXPECT_EQ(nullptr, N->getTemplateParams());
+}
+
+typedef MetadataTest MDSubroutineTypeTest;
+
+TEST_F(MDSubroutineTypeTest, get) {
+  unsigned Flags = 1;
+  Metadata *TypeArray = MDTuple::getDistinct(Context, None);
+
+  auto *N = MDSubroutineType::get(Context, Flags, TypeArray);
+  EXPECT_EQ(dwarf::DW_TAG_subroutine_type, N->getTag());
+  EXPECT_EQ(Flags, N->getFlags());
+  EXPECT_EQ(TypeArray, N->getTypeArray());
+  EXPECT_EQ(N, MDSubroutineType::get(Context, Flags, TypeArray));
+
+  EXPECT_NE(N, MDSubroutineType::get(Context, Flags + 1, TypeArray));
+  EXPECT_NE(N, MDSubroutineType::get(Context, Flags,
+                                     MDTuple::getDistinct(Context, None)));
+
+  TempMDSubroutineType Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+
+  // Test always-empty operands.
+  EXPECT_EQ(nullptr, N->getScope());
+  EXPECT_EQ(nullptr, N->getFile());
+  EXPECT_EQ("", N->getName());
+  EXPECT_EQ(nullptr, N->getBaseType());
+  EXPECT_EQ(nullptr, N->getVTableHolder());
+  EXPECT_EQ(nullptr, N->getTemplateParams());
+  EXPECT_EQ("", N->getIdentifier());
+}
+
+typedef MetadataTest MDFileTest;
+
+TEST_F(MDFileTest, get) {
+  StringRef Filename = "file";
+  StringRef Directory = "dir";
+  auto *N = MDFile::get(Context, Filename, Directory);
+
+  EXPECT_EQ(dwarf::DW_TAG_file_type, N->getTag());
+  EXPECT_EQ(Filename, N->getFilename());
+  EXPECT_EQ(Directory, N->getDirectory());
+  EXPECT_EQ(N, MDFile::get(Context, Filename, Directory));
+
+  EXPECT_NE(N, MDFile::get(Context, "other", Directory));
+  EXPECT_NE(N, MDFile::get(Context, Filename, "other"));
+
+  TempMDFile Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+typedef MetadataTest MDCompileUnitTest;
+
+TEST_F(MDCompileUnitTest, get) {
+  unsigned SourceLanguage = 1;
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  StringRef Producer = "some producer";
+  bool IsOptimized = false;
+  StringRef Flags = "flag after flag";
+  unsigned RuntimeVersion = 2;
+  StringRef SplitDebugFilename = "another/file";
+  unsigned EmissionKind = 3;
+  Metadata *EnumTypes = MDTuple::getDistinct(Context, None);
+  Metadata *RetainedTypes = MDTuple::getDistinct(Context, None);
+  Metadata *Subprograms = MDTuple::getDistinct(Context, None);
+  Metadata *GlobalVariables = MDTuple::getDistinct(Context, None);
+  Metadata *ImportedEntities = MDTuple::getDistinct(Context, None);
+  auto *N = MDCompileUnit::get(
+      Context, SourceLanguage, File, Producer, IsOptimized, Flags,
+      RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
+      RetainedTypes, Subprograms, GlobalVariables, ImportedEntities);
+
+  EXPECT_EQ(dwarf::DW_TAG_compile_unit, N->getTag());
+  EXPECT_EQ(SourceLanguage, N->getSourceLanguage());
+  EXPECT_EQ(File, N->getFile());
+  EXPECT_EQ(Producer, N->getProducer());
+  EXPECT_EQ(IsOptimized, N->isOptimized());
+  EXPECT_EQ(Flags, N->getFlags());
+  EXPECT_EQ(RuntimeVersion, N->getRuntimeVersion());
+  EXPECT_EQ(SplitDebugFilename, N->getSplitDebugFilename());
+  EXPECT_EQ(EmissionKind, N->getEmissionKind());
+  EXPECT_EQ(EnumTypes, N->getEnumTypes());
+  EXPECT_EQ(RetainedTypes, N->getRetainedTypes());
+  EXPECT_EQ(Subprograms, N->getSubprograms());
+  EXPECT_EQ(GlobalVariables, N->getGlobalVariables());
+  EXPECT_EQ(ImportedEntities, N->getImportedEntities());
+  EXPECT_EQ(N, MDCompileUnit::get(Context, SourceLanguage, File, Producer,
+                                  IsOptimized, Flags, RuntimeVersion,
+                                  SplitDebugFilename, EmissionKind, EnumTypes,
+                                  RetainedTypes, Subprograms, GlobalVariables,
+                                  ImportedEntities));
+
+  EXPECT_NE(N, MDCompileUnit::get(Context, SourceLanguage + 1, File, Producer,
+                                  IsOptimized, Flags, RuntimeVersion,
+                                  SplitDebugFilename, EmissionKind, EnumTypes,
+                                  RetainedTypes, Subprograms, GlobalVariables,
+                                  ImportedEntities));
+  EXPECT_NE(N, MDCompileUnit::get(Context, SourceLanguage, EnumTypes, Producer,
+                                  IsOptimized, Flags, RuntimeVersion,
+                                  SplitDebugFilename, EmissionKind, EnumTypes,
+                                  RetainedTypes, Subprograms, GlobalVariables,
+                                  ImportedEntities));
+  EXPECT_NE(N, MDCompileUnit::get(Context, SourceLanguage, File, "other",
+                                  IsOptimized, Flags, RuntimeVersion,
+                                  SplitDebugFilename, EmissionKind, EnumTypes,
+                                  RetainedTypes, Subprograms, GlobalVariables,
+                                  ImportedEntities));
+  EXPECT_NE(N, MDCompileUnit::get(Context, SourceLanguage, File, Producer,
+                                  !IsOptimized, Flags, RuntimeVersion,
+                                  SplitDebugFilename, EmissionKind, EnumTypes,
+                                  RetainedTypes, Subprograms, GlobalVariables,
+                                  ImportedEntities));
+  EXPECT_NE(N, MDCompileUnit::get(Context, SourceLanguage, File, Producer,
+                                  IsOptimized, "other", RuntimeVersion,
+                                  SplitDebugFilename, EmissionKind, EnumTypes,
+                                  RetainedTypes, Subprograms, GlobalVariables,
+                                  ImportedEntities));
+  EXPECT_NE(N, MDCompileUnit::get(Context, SourceLanguage, File, Producer,
+                                  IsOptimized, Flags, RuntimeVersion + 1,
+                                  SplitDebugFilename, EmissionKind, EnumTypes,
+                                  RetainedTypes, Subprograms, GlobalVariables,
+                                  ImportedEntities));
+  EXPECT_NE(N,
+            MDCompileUnit::get(Context, SourceLanguage, File, Producer,
+                               IsOptimized, Flags, RuntimeVersion, "other",
+                               EmissionKind, EnumTypes, RetainedTypes,
+                               Subprograms, GlobalVariables, ImportedEntities));
+  EXPECT_NE(N, MDCompileUnit::get(Context, SourceLanguage, File, Producer,
+                                  IsOptimized, Flags, RuntimeVersion,
+                                  SplitDebugFilename, EmissionKind + 1,
+                                  EnumTypes, RetainedTypes, Subprograms,
+                                  GlobalVariables, ImportedEntities));
+  EXPECT_NE(N, MDCompileUnit::get(Context, SourceLanguage, File, Producer,
+                                  IsOptimized, Flags, RuntimeVersion,
+                                  SplitDebugFilename, EmissionKind, File,
+                                  RetainedTypes, Subprograms, GlobalVariables,
+                                  ImportedEntities));
+  EXPECT_NE(N, MDCompileUnit::get(
+                   Context, SourceLanguage, File, Producer, IsOptimized, Flags,
+                   RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
+                   File, Subprograms, GlobalVariables, ImportedEntities));
+  EXPECT_NE(N, MDCompileUnit::get(
+                   Context, SourceLanguage, File, Producer, IsOptimized, Flags,
+                   RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
+                   RetainedTypes, File, GlobalVariables, ImportedEntities));
+  EXPECT_NE(N, MDCompileUnit::get(
+                   Context, SourceLanguage, File, Producer, IsOptimized, Flags,
+                   RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
+                   RetainedTypes, Subprograms, File, ImportedEntities));
+  EXPECT_NE(N, MDCompileUnit::get(
+                   Context, SourceLanguage, File, Producer, IsOptimized, Flags,
+                   RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
+                   RetainedTypes, Subprograms, GlobalVariables, File));
+
+  TempMDCompileUnit Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+TEST_F(MDCompileUnitTest, replaceArrays) {
+  unsigned SourceLanguage = 1;
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  StringRef Producer = "some producer";
+  bool IsOptimized = false;
+  StringRef Flags = "flag after flag";
+  unsigned RuntimeVersion = 2;
+  StringRef SplitDebugFilename = "another/file";
+  unsigned EmissionKind = 3;
+  Metadata *EnumTypes = MDTuple::getDistinct(Context, None);
+  Metadata *RetainedTypes = MDTuple::getDistinct(Context, None);
+  Metadata *ImportedEntities = MDTuple::getDistinct(Context, None);
+  auto *N = MDCompileUnit::get(
+      Context, SourceLanguage, File, Producer, IsOptimized, Flags,
+      RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
+      RetainedTypes, nullptr, nullptr, ImportedEntities);
+
+  auto *Subprograms = MDTuple::getDistinct(Context, None);
+  EXPECT_EQ(nullptr, N->getSubprograms());
+  N->replaceSubprograms(Subprograms);
+  EXPECT_EQ(Subprograms, N->getSubprograms());
+  N->replaceSubprograms(nullptr);
+  EXPECT_EQ(nullptr, N->getSubprograms());
+
+  auto *GlobalVariables = MDTuple::getDistinct(Context, None);
+  EXPECT_EQ(nullptr, N->getGlobalVariables());
+  N->replaceGlobalVariables(GlobalVariables);
+  EXPECT_EQ(GlobalVariables, N->getGlobalVariables());
+  N->replaceGlobalVariables(nullptr);
+  EXPECT_EQ(nullptr, N->getGlobalVariables());
+}
+
+typedef MetadataTest MDSubprogramTest;
+
+TEST_F(MDSubprogramTest, get) {
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  StringRef Name = "name";
+  StringRef LinkageName = "linkage";
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  unsigned Line = 2;
+  Metadata *Type = MDTuple::getDistinct(Context, None);
+  bool IsLocalToUnit = false;
+  bool IsDefinition = true;
+  unsigned ScopeLine = 3;
+  Metadata *ContainingType = MDTuple::getDistinct(Context, None);
+  unsigned Virtuality = 4;
+  unsigned VirtualIndex = 5;
+  unsigned Flags = 6;
+  bool IsOptimized = false;
+  Metadata *Function = MDTuple::getDistinct(Context, None);
+  Metadata *TemplateParams = MDTuple::getDistinct(Context, None);
+  Metadata *Declaration = MDTuple::getDistinct(Context, None);
+  Metadata *Variables = MDTuple::getDistinct(Context, None);
+
+  auto *N = MDSubprogram::get(
+      Context, Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
+      IsDefinition, ScopeLine, ContainingType, Virtuality, VirtualIndex, Flags,
+      IsOptimized, Function, TemplateParams, Declaration, Variables);
+
+  EXPECT_EQ(dwarf::DW_TAG_subprogram, N->getTag());
+  EXPECT_EQ(Scope, N->getScope());
+  EXPECT_EQ(Name, N->getName());
+  EXPECT_EQ(LinkageName, N->getLinkageName());
+  EXPECT_EQ(File, N->getFile());
+  EXPECT_EQ(Line, N->getLine());
+  EXPECT_EQ(Type, N->getType());
+  EXPECT_EQ(IsLocalToUnit, N->isLocalToUnit());
+  EXPECT_EQ(IsDefinition, N->isDefinition());
+  EXPECT_EQ(ScopeLine, N->getScopeLine());
+  EXPECT_EQ(ContainingType, N->getContainingType());
+  EXPECT_EQ(Virtuality, N->getVirtuality());
+  EXPECT_EQ(VirtualIndex, N->getVirtualIndex());
+  EXPECT_EQ(Flags, N->getFlags());
+  EXPECT_EQ(IsOptimized, N->isOptimized());
+  EXPECT_EQ(Function, N->getFunction());
+  EXPECT_EQ(TemplateParams, N->getTemplateParams());
+  EXPECT_EQ(Declaration, N->getDeclaration());
+  EXPECT_EQ(Variables, N->getVariables());
+  EXPECT_EQ(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+
+  EXPECT_NE(N, MDSubprogram::get(Context, File, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, "other", LinkageName, File,
+                                 Line, Type, IsLocalToUnit, IsDefinition,
+                                 ScopeLine, ContainingType, Virtuality,
+                                 VirtualIndex, Flags, IsOptimized, Function,
+                                 TemplateParams, Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, "other", File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, Scope, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File,
+                                 Line + 1, Type, IsLocalToUnit, IsDefinition,
+                                 ScopeLine, ContainingType, Virtuality,
+                                 VirtualIndex, Flags, IsOptimized, Function,
+                                 TemplateParams, Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Scope, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, !IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, !IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition,
+                                 ScopeLine + 1, ContainingType, Virtuality,
+                                 VirtualIndex, Flags, IsOptimized, Function,
+                                 TemplateParams, Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 Type, Virtuality, VirtualIndex, Flags,
+                                 IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality + 1, VirtualIndex,
+                                 Flags, IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex + 1,
+                                 Flags, IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 ~Flags, IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, !IsOptimized, Function, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, IsOptimized, Type, TemplateParams,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, IsOptimized, Function, Type,
+                                 Declaration, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, IsOptimized, Function, TemplateParams,
+                                 Type, Variables));
+  EXPECT_NE(N, MDSubprogram::get(Context, Scope, Name, LinkageName, File, Line,
+                                 Type, IsLocalToUnit, IsDefinition, ScopeLine,
+                                 ContainingType, Virtuality, VirtualIndex,
+                                 Flags, IsOptimized, Function, TemplateParams,
+                                 Declaration, Type));
+
+  TempMDSubprogram Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+TEST_F(MDSubprogramTest, replaceFunction) {
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  StringRef Name = "name";
+  StringRef LinkageName = "linkage";
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  unsigned Line = 2;
+  Metadata *Type = MDTuple::getDistinct(Context, None);
+  bool IsLocalToUnit = false;
+  bool IsDefinition = true;
+  unsigned ScopeLine = 3;
+  Metadata *ContainingType = MDTuple::getDistinct(Context, None);
+  unsigned Virtuality = 4;
+  unsigned VirtualIndex = 5;
+  unsigned Flags = 6;
+  bool IsOptimized = false;
+  Metadata *TemplateParams = MDTuple::getDistinct(Context, None);
+  Metadata *Declaration = MDTuple::getDistinct(Context, None);
+  Metadata *Variables = MDTuple::getDistinct(Context, None);
+
+  auto *N = MDSubprogram::get(
+      Context, Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
+      IsDefinition, ScopeLine, ContainingType, Virtuality, VirtualIndex, Flags,
+      IsOptimized, nullptr, TemplateParams, Declaration, Variables);
+
+  EXPECT_EQ(nullptr, N->getFunction());
+
+  std::unique_ptr<Function> F(
+      Function::Create(FunctionType::get(Type::getVoidTy(Context), false),
+                       GlobalValue::ExternalLinkage));
+  N->replaceFunction(F.get());
+  EXPECT_EQ(ConstantAsMetadata::get(F.get()), N->getFunction());
+
+  N->replaceFunction(nullptr);
+  EXPECT_EQ(nullptr, N->getFunction());
+}
+
+typedef MetadataTest MDLexicalBlockTest;
+
+TEST_F(MDLexicalBlockTest, get) {
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  unsigned Line = 5;
+  unsigned Column = 8;
+
+  auto *N = MDLexicalBlock::get(Context, Scope, File, Line, Column);
+
+  EXPECT_EQ(dwarf::DW_TAG_lexical_block, N->getTag());
+  EXPECT_EQ(Scope, N->getScope());
+  EXPECT_EQ(File, N->getFile());
+  EXPECT_EQ(Line, N->getLine());
+  EXPECT_EQ(Column, N->getColumn());
+  EXPECT_EQ(N, MDLexicalBlock::get(Context, Scope, File, Line, Column));
+
+  EXPECT_NE(N, MDLexicalBlock::get(Context, File, File, Line, Column));
+  EXPECT_NE(N, MDLexicalBlock::get(Context, Scope, Scope, Line, Column));
+  EXPECT_NE(N, MDLexicalBlock::get(Context, Scope, File, Line + 1, Column));
+  EXPECT_NE(N, MDLexicalBlock::get(Context, Scope, File, Line, Column + 1));
+
+  TempMDLexicalBlock Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+typedef MetadataTest MDLexicalBlockFileTest;
+
+TEST_F(MDLexicalBlockFileTest, get) {
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  unsigned Discriminator = 5;
+
+  auto *N = MDLexicalBlockFile::get(Context, Scope, File, Discriminator);
+
+  EXPECT_EQ(dwarf::DW_TAG_lexical_block, N->getTag());
+  EXPECT_EQ(Scope, N->getScope());
+  EXPECT_EQ(File, N->getFile());
+  EXPECT_EQ(Discriminator, N->getDiscriminator());
+  EXPECT_EQ(N, MDLexicalBlockFile::get(Context, Scope, File, Discriminator));
+
+  EXPECT_NE(N, MDLexicalBlockFile::get(Context, File, File, Discriminator));
+  EXPECT_NE(N, MDLexicalBlockFile::get(Context, Scope, Scope, Discriminator));
+  EXPECT_NE(N,
+            MDLexicalBlockFile::get(Context, Scope, File, Discriminator + 1));
+
+  TempMDLexicalBlockFile Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+typedef MetadataTest MDNamespaceTest;
+
+TEST_F(MDNamespaceTest, get) {
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  StringRef Name = "namespace";
+  unsigned Line = 5;
+
+  auto *N = MDNamespace::get(Context, Scope, File, Name, Line);
+
+  EXPECT_EQ(dwarf::DW_TAG_namespace, N->getTag());
+  EXPECT_EQ(Scope, N->getScope());
+  EXPECT_EQ(File, N->getFile());
+  EXPECT_EQ(Name, N->getName());
+  EXPECT_EQ(Line, N->getLine());
+  EXPECT_EQ(N, MDNamespace::get(Context, Scope, File, Name, Line));
+
+  EXPECT_NE(N, MDNamespace::get(Context, File, File, Name, Line));
+  EXPECT_NE(N, MDNamespace::get(Context, Scope, Scope, Name, Line));
+  EXPECT_NE(N, MDNamespace::get(Context, Scope, File, "other", Line));
+  EXPECT_NE(N, MDNamespace::get(Context, Scope, File, Name, Line + 1));
+
+  TempMDNamespace Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+typedef MetadataTest MDTemplateTypeParameterTest;
+
+TEST_F(MDTemplateTypeParameterTest, get) {
+  StringRef Name = "template";
+  Metadata *Type = MDTuple::getDistinct(Context, None);
+  Metadata *Other = MDTuple::getDistinct(Context, None);
+
+  auto *N = MDTemplateTypeParameter::get(Context, Name, Type);
+
+  EXPECT_EQ(dwarf::DW_TAG_template_type_parameter, N->getTag());
+  EXPECT_EQ(Name, N->getName());
+  EXPECT_EQ(Type, N->getType());
+  EXPECT_EQ(N, MDTemplateTypeParameter::get(Context, Name, Type));
+
+  EXPECT_NE(N, MDTemplateTypeParameter::get(Context, "other", Type));
+  EXPECT_NE(N, MDTemplateTypeParameter::get(Context, Name, Other));
+
+  TempMDTemplateTypeParameter Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+typedef MetadataTest MDTemplateValueParameterTest;
+
+TEST_F(MDTemplateValueParameterTest, get) {
+  unsigned Tag = dwarf::DW_TAG_template_value_parameter;
+  StringRef Name = "template";
+  Metadata *Type = MDTuple::getDistinct(Context, None);
+  Metadata *Value = MDTuple::getDistinct(Context, None);
+  Metadata *Other = MDTuple::getDistinct(Context, None);
+
+  auto *N = MDTemplateValueParameter::get(Context, Tag, Name, Type, Value);
+  EXPECT_EQ(Tag, N->getTag());
+  EXPECT_EQ(Name, N->getName());
+  EXPECT_EQ(Type, N->getType());
+  EXPECT_EQ(Value, N->getValue());
+  EXPECT_EQ(N, MDTemplateValueParameter::get(Context, Tag, Name, Type, Value));
+
+  EXPECT_NE(N, MDTemplateValueParameter::get(
+                   Context, dwarf::DW_TAG_GNU_template_template_param, Name,
+                   Type, Value));
+  EXPECT_NE(N, MDTemplateValueParameter::get(Context, Tag,  "other", Type,
+                                             Value));
+  EXPECT_NE(N, MDTemplateValueParameter::get(Context, Tag,  Name, Other,
+                                             Value));
+  EXPECT_NE(N, MDTemplateValueParameter::get(Context, Tag, Name, Type, Other));
+
+  TempMDTemplateValueParameter Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+typedef MetadataTest MDGlobalVariableTest;
+
+TEST_F(MDGlobalVariableTest, get) {
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  StringRef Name = "name";
+  StringRef LinkageName = "linkage";
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  unsigned Line = 5;
+  Metadata *Type = MDTuple::getDistinct(Context, None);
+  bool IsLocalToUnit = false;
+  bool IsDefinition = true;
+  Metadata *Variable = MDTuple::getDistinct(Context, None);
+  Metadata *StaticDataMemberDeclaration = MDTuple::getDistinct(Context, None);
+
+  auto *N = MDGlobalVariable::get(Context, Scope, Name, LinkageName, File, Line,
+                                  Type, IsLocalToUnit, IsDefinition, Variable,
+                                  StaticDataMemberDeclaration);
+  EXPECT_EQ(dwarf::DW_TAG_variable, N->getTag());
+  EXPECT_EQ(Scope, N->getScope());
+  EXPECT_EQ(Name, N->getName());
+  EXPECT_EQ(LinkageName, N->getLinkageName());
+  EXPECT_EQ(File, N->getFile());
+  EXPECT_EQ(Line, N->getLine());
+  EXPECT_EQ(Type, N->getType());
+  EXPECT_EQ(IsLocalToUnit, N->isLocalToUnit());
+  EXPECT_EQ(IsDefinition, N->isDefinition());
+  EXPECT_EQ(Variable, N->getVariable());
+  EXPECT_EQ(StaticDataMemberDeclaration, N->getStaticDataMemberDeclaration());
+  EXPECT_EQ(N, MDGlobalVariable::get(Context, Scope, Name, LinkageName, File,
+                                     Line, Type, IsLocalToUnit, IsDefinition,
+                                     Variable, StaticDataMemberDeclaration));
+
+  EXPECT_NE(N, MDGlobalVariable::get(Context, File, Name, LinkageName, File,
+                                     Line, Type, IsLocalToUnit, IsDefinition,
+                                     Variable, StaticDataMemberDeclaration));
+  EXPECT_NE(N, MDGlobalVariable::get(Context, Scope, "other", LinkageName, File,
+                                     Line, Type, IsLocalToUnit, IsDefinition,
+                                     Variable, StaticDataMemberDeclaration));
+  EXPECT_NE(N, MDGlobalVariable::get(Context, Scope, Name, "other", File, Line,
+                                     Type, IsLocalToUnit, IsDefinition,
+                                     Variable, StaticDataMemberDeclaration));
+  EXPECT_NE(N, MDGlobalVariable::get(Context, Scope, Name, LinkageName, Scope,
+                                     Line, Type, IsLocalToUnit, IsDefinition,
+                                     Variable, StaticDataMemberDeclaration));
+  EXPECT_NE(N,
+            MDGlobalVariable::get(Context, Scope, Name, LinkageName, File,
+                                  Line + 1, Type, IsLocalToUnit, IsDefinition,
+                                  Variable, StaticDataMemberDeclaration));
+  EXPECT_NE(N, MDGlobalVariable::get(Context, Scope, Name, LinkageName, File,
+                                     Line, Scope, IsLocalToUnit, IsDefinition,
+                                     Variable, StaticDataMemberDeclaration));
+  EXPECT_NE(N, MDGlobalVariable::get(Context, Scope, Name, LinkageName, File,
+                                     Line, Type, !IsLocalToUnit, IsDefinition,
+                                     Variable, StaticDataMemberDeclaration));
+  EXPECT_NE(N, MDGlobalVariable::get(Context, Scope, Name, LinkageName, File,
+                                     Line, Type, IsLocalToUnit, !IsDefinition,
+                                     Variable, StaticDataMemberDeclaration));
+  EXPECT_NE(N, MDGlobalVariable::get(Context, Scope, Name, LinkageName, File,
+                                     Line, Type, IsLocalToUnit, IsDefinition,
+                                     Type, StaticDataMemberDeclaration));
+  EXPECT_NE(N, MDGlobalVariable::get(Context, Scope, Name, LinkageName, File,
+                                     Line, Type, IsLocalToUnit, IsDefinition,
+                                     Variable, Type));
+
+  TempMDGlobalVariable Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+typedef MetadataTest MDLocalVariableTest;
+
+TEST_F(MDLocalVariableTest, get) {
+  unsigned Tag = dwarf::DW_TAG_arg_variable;
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  StringRef Name = "name";
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  unsigned Line = 5;
+  Metadata *Type = MDTuple::getDistinct(Context, None);
+  unsigned Arg = 6;
+  unsigned Flags = 7;
+  Metadata *InlinedAt = MDTuple::getDistinct(Context, None);
+
+  auto *N = MDLocalVariable::get(Context, Tag, Scope, Name, File, Line, Type,
+                                 Arg, Flags, InlinedAt);
+  EXPECT_EQ(Tag, N->getTag());
+  EXPECT_EQ(Scope, N->getScope());
+  EXPECT_EQ(Name, N->getName());
+  EXPECT_EQ(File, N->getFile());
+  EXPECT_EQ(Line, N->getLine());
+  EXPECT_EQ(Type, N->getType());
+  EXPECT_EQ(Arg, N->getArg());
+  EXPECT_EQ(Flags, N->getFlags());
+  EXPECT_EQ(InlinedAt, N->getInlinedAt());
+  EXPECT_EQ(N, MDLocalVariable::get(Context, Tag, Scope, Name, File, Line, Type,
+                                    Arg, Flags, InlinedAt));
+
+  EXPECT_NE(N, MDLocalVariable::get(Context, dwarf::DW_TAG_auto_variable, Scope,
+                                    Name, File, Line, Type, Arg, Flags,
+                                    InlinedAt));
+  EXPECT_NE(N, MDLocalVariable::get(Context, Tag, File, Name, File, Line,
+                                    Type, Arg, Flags, InlinedAt));
+  EXPECT_NE(N, MDLocalVariable::get(Context, Tag, Scope, "other", File, Line,
+                                    Type, Arg, Flags, InlinedAt));
+  EXPECT_NE(N, MDLocalVariable::get(Context, Tag, Scope, Name, Scope, Line,
+                                    Type, Arg, Flags, InlinedAt));
+  EXPECT_NE(N, MDLocalVariable::get(Context, Tag, Scope, Name, File, Line + 1,
+                                    Type, Arg, Flags, InlinedAt));
+  EXPECT_NE(N, MDLocalVariable::get(Context, Tag, Scope, Name, File, Line,
+                                    Scope, Arg, Flags, InlinedAt));
+  EXPECT_NE(N, MDLocalVariable::get(Context, Tag, Scope, Name, File, Line, Type,
+                                    Arg + 1, Flags, InlinedAt));
+  EXPECT_NE(N, MDLocalVariable::get(Context, Tag, Scope, Name, File, Line, Type,
+                                    Arg, ~Flags, InlinedAt));
+  EXPECT_NE(N, MDLocalVariable::get(Context, Tag, Scope, Name, File, Line, Type,
+                                    Arg, Flags, Scope));
+
+  TempMDLocalVariable Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+typedef MetadataTest MDExpressionTest;
+
+TEST_F(MDExpressionTest, get) {
+  uint64_t Elements[] = {2, 6, 9, 78, 0};
+  auto *N = MDExpression::get(Context, Elements);
+  EXPECT_EQ(makeArrayRef(Elements), N->getElements());
+  EXPECT_EQ(N, MDExpression::get(Context, Elements));
+
+  EXPECT_EQ(5u, N->getNumElements());
+  EXPECT_EQ(2u, N->getElement(0));
+  EXPECT_EQ(6u, N->getElement(1));
+  EXPECT_EQ(9u, N->getElement(2));
+  EXPECT_EQ(78u, N->getElement(3));
+  EXPECT_EQ(0u, N->getElement(4));
+
+  TempMDExpression Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+TEST_F(MDExpressionTest, isValid) {
+#define EXPECT_VALID(...)                                                      \
+  do {                                                                         \
+    uint64_t Elements[] = {__VA_ARGS__};                                       \
+    EXPECT_TRUE(MDExpression::get(Context, Elements)->isValid());              \
+  } while (false)
+#define EXPECT_INVALID(...)                                                    \
+  do {                                                                         \
+    uint64_t Elements[] = {__VA_ARGS__};                                       \
+    EXPECT_FALSE(MDExpression::get(Context, Elements)->isValid());             \
+  } while (false)
+
+  // Empty expression should be valid.
+  EXPECT_TRUE(MDExpression::get(Context, None));
+
+  // Valid constructions.
+  EXPECT_VALID(dwarf::DW_OP_plus, 6);
+  EXPECT_VALID(dwarf::DW_OP_deref);
+  EXPECT_VALID(dwarf::DW_OP_bit_piece, 3, 7);
+  EXPECT_VALID(dwarf::DW_OP_plus, 6, dwarf::DW_OP_deref);
+  EXPECT_VALID(dwarf::DW_OP_deref, dwarf::DW_OP_plus, 6);
+  EXPECT_VALID(dwarf::DW_OP_deref, dwarf::DW_OP_bit_piece, 3, 7);
+  EXPECT_VALID(dwarf::DW_OP_deref, dwarf::DW_OP_plus, 6, dwarf::DW_OP_bit_piece, 3, 7);
+
+  // Invalid constructions.
+  EXPECT_INVALID(~0u);
+  EXPECT_INVALID(dwarf::DW_OP_plus);
+  EXPECT_INVALID(dwarf::DW_OP_bit_piece);
+  EXPECT_INVALID(dwarf::DW_OP_bit_piece, 3);
+  EXPECT_INVALID(dwarf::DW_OP_bit_piece, 3, 7, dwarf::DW_OP_plus, 3);
+  EXPECT_INVALID(dwarf::DW_OP_bit_piece, 3, 7, dwarf::DW_OP_deref);
+
+#undef EXPECT_VALID
+#undef EXPECT_INVALID
+}
+
+typedef MetadataTest MDObjCPropertyTest;
+
+TEST_F(MDObjCPropertyTest, get) {
+  StringRef Name = "name";
+  Metadata *File = MDTuple::getDistinct(Context, None);
+  unsigned Line = 5;
+  StringRef GetterName = "getter";
+  StringRef SetterName = "setter";
+  unsigned Attributes = 7;
+  Metadata *Type = MDTuple::getDistinct(Context, None);
+
+  auto *N = MDObjCProperty::get(Context, Name, File, Line, GetterName,
+                                SetterName, Attributes, Type);
+
+  EXPECT_EQ(dwarf::DW_TAG_APPLE_property, N->getTag());
+  EXPECT_EQ(Name, N->getName());
+  EXPECT_EQ(File, N->getFile());
+  EXPECT_EQ(Line, N->getLine());
+  EXPECT_EQ(GetterName, N->getGetterName());
+  EXPECT_EQ(SetterName, N->getSetterName());
+  EXPECT_EQ(Attributes, N->getAttributes());
+  EXPECT_EQ(Type, N->getType());
+  EXPECT_EQ(N, MDObjCProperty::get(Context, Name, File, Line, GetterName,
+                                   SetterName, Attributes, Type));
+
+  EXPECT_NE(N, MDObjCProperty::get(Context, "other", File, Line, GetterName,
+                                   SetterName, Attributes, Type));
+  EXPECT_NE(N, MDObjCProperty::get(Context, Name, Type, Line, GetterName,
+                                   SetterName, Attributes, Type));
+  EXPECT_NE(N, MDObjCProperty::get(Context, Name, File, Line + 1, GetterName,
+                                   SetterName, Attributes, Type));
+  EXPECT_NE(N, MDObjCProperty::get(Context, Name, File, Line, "other",
+                                   SetterName, Attributes, Type));
+  EXPECT_NE(N, MDObjCProperty::get(Context, Name, File, Line, GetterName,
+                                   "other", Attributes, Type));
+  EXPECT_NE(N, MDObjCProperty::get(Context, Name, File, Line, GetterName,
+                                   SetterName, Attributes + 1, Type));
+  EXPECT_NE(N, MDObjCProperty::get(Context, Name, File, Line, GetterName,
+                                   SetterName, Attributes, File));
+
+  TempMDObjCProperty Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+typedef MetadataTest MDImportedEntityTest;
+
+TEST_F(MDImportedEntityTest, get) {
+  unsigned Tag = dwarf::DW_TAG_imported_module;
+  Metadata *Scope = MDTuple::getDistinct(Context, None);
+  Metadata *Entity = MDTuple::getDistinct(Context, None);
+  unsigned Line = 5;
+  StringRef Name = "name";
+
+  auto *N = MDImportedEntity::get(Context, Tag, Scope, Entity, Line, Name);
+
+  EXPECT_EQ(Tag, N->getTag());
+  EXPECT_EQ(Scope, N->getScope());
+  EXPECT_EQ(Entity, N->getEntity());
+  EXPECT_EQ(Line, N->getLine());
+  EXPECT_EQ(Name, N->getName());
+  EXPECT_EQ(N, MDImportedEntity::get(Context, Tag, Scope, Entity, Line, Name));
+
+  EXPECT_NE(N,
+            MDImportedEntity::get(Context, dwarf::DW_TAG_imported_declaration,
+                                  Scope, Entity, Line, Name));
+  EXPECT_NE(N, MDImportedEntity::get(Context, Tag, Entity, Entity, Line, Name));
+  EXPECT_NE(N, MDImportedEntity::get(Context, Tag, Scope, Scope, Line, Name));
+  EXPECT_NE(N,
+            MDImportedEntity::get(Context, Tag, Scope, Entity, Line + 1, Name));
+  EXPECT_NE(N,
+            MDImportedEntity::get(Context, Tag, Scope, Entity, Line, "other"));
+
+  TempMDImportedEntity Temp = N->clone();
+  EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+}
+
+typedef MetadataTest MetadataAsValueTest;
+
+TEST_F(MetadataAsValueTest, MDNode) {
+  MDNode *N = MDNode::get(Context, None);
+  auto *V = MetadataAsValue::get(Context, N);
+  EXPECT_TRUE(V->getType()->isMetadataTy());
+  EXPECT_EQ(N, V->getMetadata());
+
+  auto *V2 = MetadataAsValue::get(Context, N);
+  EXPECT_EQ(V, V2);
+}
+
+TEST_F(MetadataAsValueTest, MDNodeMDNode) {
+  MDNode *N = MDNode::get(Context, None);
+  Metadata *Ops[] = {N};
+  MDNode *N2 = MDNode::get(Context, Ops);
+  auto *V = MetadataAsValue::get(Context, N2);
+  EXPECT_TRUE(V->getType()->isMetadataTy());
+  EXPECT_EQ(N2, V->getMetadata());
+
+  auto *V2 = MetadataAsValue::get(Context, N2);
+  EXPECT_EQ(V, V2);
+
+  auto *V3 = MetadataAsValue::get(Context, N);
+  EXPECT_TRUE(V3->getType()->isMetadataTy());
+  EXPECT_NE(V, V3);
+  EXPECT_EQ(N, V3->getMetadata());
+}
+
+TEST_F(MetadataAsValueTest, MDNodeConstant) {
+  auto *C = ConstantInt::getTrue(Context);
+  auto *MD = ConstantAsMetadata::get(C);
+  Metadata *Ops[] = {MD};
+  auto *N = MDNode::get(Context, Ops);
+
+  auto *V = MetadataAsValue::get(Context, MD);
+  EXPECT_TRUE(V->getType()->isMetadataTy());
+  EXPECT_EQ(MD, V->getMetadata());
+
+  auto *V2 = MetadataAsValue::get(Context, N);
+  EXPECT_EQ(MD, V2->getMetadata());
+  EXPECT_EQ(V, V2);
+}
+
+typedef MetadataTest ValueAsMetadataTest;
+
+TEST_F(ValueAsMetadataTest, UpdatesOnRAUW) {
+  Type *Ty = Type::getInt1PtrTy(Context);
+  std::unique_ptr<GlobalVariable> GV0(
+      new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage));
+  auto *MD = ValueAsMetadata::get(GV0.get());
+  EXPECT_TRUE(MD->getValue() == GV0.get());
+  ASSERT_TRUE(GV0->use_empty());
+
+  std::unique_ptr<GlobalVariable> GV1(
+      new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage));
+  GV0->replaceAllUsesWith(GV1.get());
+  EXPECT_TRUE(MD->getValue() == GV1.get());
+}
+
+TEST_F(ValueAsMetadataTest, CollidingDoubleUpdates) {
+  // Create a constant.
+  ConstantAsMetadata *CI = ConstantAsMetadata::get(
+      ConstantInt::get(getGlobalContext(), APInt(8, 0)));
+
+  // Create a temporary to prevent nodes from resolving.
+  auto Temp = MDTuple::getTemporary(Context, None);
+
+  // When the first operand of N1 gets reset to nullptr, it'll collide with N2.
+  Metadata *Ops1[] = {CI, CI, Temp.get()};
+  Metadata *Ops2[] = {nullptr, CI, Temp.get()};
+
+  auto *N1 = MDTuple::get(Context, Ops1);
+  auto *N2 = MDTuple::get(Context, Ops2);
+  ASSERT_NE(N1, N2);
+
+  // Tell metadata that the constant is getting deleted.
+  //
+  // After this, N1 will be invalid, so don't touch it.
+  ValueAsMetadata::handleDeletion(CI->getValue());
+  EXPECT_EQ(nullptr, N2->getOperand(0));
+  EXPECT_EQ(nullptr, N2->getOperand(1));
+  EXPECT_EQ(Temp.get(), N2->getOperand(2));
+
+  // Clean up Temp for teardown.
+  Temp->replaceAllUsesWith(nullptr);
+}
+
+typedef MetadataTest TrackingMDRefTest;
+
+TEST_F(TrackingMDRefTest, UpdatesOnRAUW) {
+  Type *Ty = Type::getInt1PtrTy(Context);
+  std::unique_ptr<GlobalVariable> GV0(
+      new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage));
+  TypedTrackingMDRef<ValueAsMetadata> MD(ValueAsMetadata::get(GV0.get()));
+  EXPECT_TRUE(MD->getValue() == GV0.get());
+  ASSERT_TRUE(GV0->use_empty());
+
+  std::unique_ptr<GlobalVariable> GV1(
+      new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage));
+  GV0->replaceAllUsesWith(GV1.get());
+  EXPECT_TRUE(MD->getValue() == GV1.get());
+
+  // Reset it, so we don't inadvertently test deletion.
+  MD.reset();
+}
+
+TEST_F(TrackingMDRefTest, UpdatesOnDeletion) {
+  Type *Ty = Type::getInt1PtrTy(Context);
+  std::unique_ptr<GlobalVariable> GV(
+      new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage));
+  TypedTrackingMDRef<ValueAsMetadata> MD(ValueAsMetadata::get(GV.get()));
+  EXPECT_TRUE(MD->getValue() == GV.get());
+  ASSERT_TRUE(GV->use_empty());
+
+  GV.reset();
+  EXPECT_TRUE(!MD);
+}
+
 TEST(NamedMDNodeTest, Search) {
   LLVMContext Context;
-  Constant *C = ConstantInt::get(Type::getInt32Ty(Context), 1);
-  Constant *C2 = ConstantInt::get(Type::getInt32Ty(Context), 2);
+  ConstantAsMetadata *C =
+      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), 1));
+  ConstantAsMetadata *C2 =
+      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), 2));
 
-  Value *const V = C;
-  Value *const V2 = C2;
+  Metadata *const V = C;
+  Metadata *const V2 = C2;
   MDNode *n = MDNode::get(Context, V);
   MDNode *n2 = MDNode::get(Context, V2);
 
diff --git a/unittests/IR/PassManagerTest.cpp b/unittests/IR/PassManagerTest.cpp
index d493156..41af0b0 100644
--- a/unittests/IR/PassManagerTest.cpp
+++ b/unittests/IR/PassManagerTest.cpp
@@ -29,13 +29,16 @@ public:
   /// \brief Returns an opaque, unique ID for this pass type.
   static void *ID() { return (void *)&PassID; }
 
+  /// \brief Returns the name of the analysis.
+  static StringRef name() { return "TestFunctionAnalysis"; }
+
   TestFunctionAnalysis(int &Runs) : Runs(Runs) {}
 
   /// \brief Run the analysis pass over the function and return a result.
-  Result run(Function *F, FunctionAnalysisManager *AM) {
+  Result run(Function &F, FunctionAnalysisManager *AM) {
     ++Runs;
     int Count = 0;
-    for (Function::iterator BBI = F->begin(), BBE = F->end(); BBI != BBE; ++BBI)
+    for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI)
       for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
            ++II)
         ++Count;
@@ -60,12 +63,14 @@ public:
 
   static void *ID() { return (void *)&PassID; }
 
+  static StringRef name() { return "TestModuleAnalysis"; }
+
   TestModuleAnalysis(int &Runs) : Runs(Runs) {}
 
-  Result run(Module *M, ModuleAnalysisManager *AM) {
+  Result run(Module &M, ModuleAnalysisManager *AM) {
     ++Runs;
     int Count = 0;
-    for (Module::iterator I = M->begin(), E = M->end(); I != E; ++I)
+    for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
       ++Count;
     return Result(Count);
   }
@@ -81,7 +86,7 @@ char TestModuleAnalysis::PassID;
 struct TestModulePass {
   TestModulePass(int &RunCount) : RunCount(RunCount) {}
 
-  PreservedAnalyses run(Module *M) {
+  PreservedAnalyses run(Module &M) {
     ++RunCount;
     return PreservedAnalyses::none();
   }
@@ -92,13 +97,13 @@ struct TestModulePass {
 };
 
 struct TestPreservingModulePass {
-  PreservedAnalyses run(Module *M) { return PreservedAnalyses::all(); }
+  PreservedAnalyses run(Module &M) { return PreservedAnalyses::all(); }
 
   static StringRef name() { return "TestPreservingModulePass"; }
 };
 
 struct TestMinPreservingModulePass {
-  PreservedAnalyses run(Module *M, ModuleAnalysisManager *AM) {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager *AM) {
     PreservedAnalyses PA;
 
     // Force running an analysis.
@@ -119,13 +124,13 @@ struct TestFunctionPass {
         AnalyzedFunctionCount(AnalyzedFunctionCount),
         OnlyUseCachedResults(OnlyUseCachedResults) {}
 
-  PreservedAnalyses run(Function *F, FunctionAnalysisManager *AM) {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager *AM) {
     ++RunCount;
 
     const ModuleAnalysisManager &MAM =
         AM->getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
     if (TestModuleAnalysis::Result *TMA =
-            MAM.getCachedResult<TestModuleAnalysis>(F->getParent()))
+            MAM.getCachedResult<TestModuleAnalysis>(*F.getParent()))
       AnalyzedFunctionCount += TMA->FunctionCount;
 
     if (OnlyUseCachedResults) {
@@ -155,9 +160,9 @@ struct TestFunctionPass {
 struct TestInvalidationFunctionPass {
   TestInvalidationFunctionPass(StringRef FunctionName) : Name(FunctionName) {}
 
-  PreservedAnalyses run(Function *F) {
-    return F->getName() == Name ? PreservedAnalyses::none()
-                                : PreservedAnalyses::all();
+  PreservedAnalyses run(Function &F) {
+    return F.getName() == Name ? PreservedAnalyses::none()
+                               : PreservedAnalyses::all();
   }
 
   static StringRef name() { return "TestInvalidationFunctionPass"; }
@@ -165,10 +170,10 @@ struct TestInvalidationFunctionPass {
   StringRef Name;
 };
 
-Module *parseIR(const char *IR) {
+std::unique_ptr<Module> parseIR(const char *IR) {
   LLVMContext &C = getGlobalContext();
   SMDiagnostic Err;
-  return parseAssemblyString(IR, Err, C).release();
+  return parseAssemblyString(IR, Err, C);
 }
 
 class PassManagerTest : public ::testing::Test {
@@ -310,7 +315,7 @@ TEST_F(PassManagerTest, Basic) {
     MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
   }
 
-  MPM.run(M.get(), &MAM);
+  MPM.run(*M, &MAM);
 
   // Validate module pass counters.
   EXPECT_EQ(1, ModulePassRunCount);
diff --git a/unittests/IR/UseTest.cpp b/unittests/IR/UseTest.cpp
index 3f33ca6..d9d20af 100644
--- a/unittests/IR/UseTest.cpp
+++ b/unittests/IR/UseTest.cpp
@@ -9,10 +9,10 @@
 
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/User.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
@@ -50,7 +50,7 @@ TEST(UseTest, sort) {
   });
   unsigned I = 0;
   for (User *U : X.users()) {
-    snprintf(vnbuf, sizeof(vnbuf), "v%u", I++);
+    format("v%u", I++).snprint(vnbuf, sizeof(vnbuf));
     EXPECT_EQ(vnbuf, U->getName());
   }
   ASSERT_EQ(8u, I);
@@ -60,7 +60,7 @@ TEST(UseTest, sort) {
   });
   I = 0;
   for (User *U : X.users()) {
-    snprintf(vnbuf, sizeof(vnbuf), "v%u", (7 - I++));
+    format("v%u", (7 - I++)).snprint(vnbuf, sizeof(vnbuf));
     EXPECT_EQ(vnbuf, U->getName());
   }
   ASSERT_EQ(8u, I);
@@ -95,7 +95,7 @@ TEST(UseTest, reverse) {
   });
   unsigned I = 0;
   for (User *U : X.users()) {
-    snprintf(vnbuf, sizeof(vnbuf), "v%u", I++);
+    format("v%u", I++).snprint(vnbuf, sizeof(vnbuf));
     EXPECT_EQ(vnbuf, U->getName());
   }
   ASSERT_EQ(8u, I);
@@ -103,7 +103,7 @@ TEST(UseTest, reverse) {
   X.reverseUseList();
   I = 0;
   for (User *U : X.users()) {
-    snprintf(vnbuf, sizeof(vnbuf), "v%u", (7 - I++));
+    format("v%u", (7 - I++)).snprint(vnbuf, sizeof(vnbuf));
     EXPECT_EQ(vnbuf, U->getName());
   }
   ASSERT_EQ(8u, I);
diff --git a/unittests/IR/UserTest.cpp b/unittests/IR/UserTest.cpp
index 5572424..56b054b 100644
--- a/unittests/IR/UserTest.cpp
+++ b/unittests/IR/UserTest.cpp
@@ -9,10 +9,10 @@
 
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/User.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 using namespace llvm;
diff --git a/unittests/IR/WaymarkTest.cpp b/unittests/IR/WaymarkTest.cpp
index 8e3cd45..a8924ef 100644
--- a/unittests/IR/WaymarkTest.cpp
+++ b/unittests/IR/WaymarkTest.cpp
@@ -29,8 +29,9 @@ TEST(WaymarkTest, NativeArray) {
   Value * values[22];
   std::transform(tail, tail + 22, values, char2constant);
   FunctionType *FT = FunctionType::get(Type::getVoidTy(getGlobalContext()), true);
-  Function *F = Function::Create(FT, GlobalValue::ExternalLinkage);
-  const CallInst *A = CallInst::Create(F, makeArrayRef(values));
+  std::unique_ptr<Function> F(
+      Function::Create(FT, GlobalValue::ExternalLinkage));
+  const CallInst *A = CallInst::Create(F.get(), makeArrayRef(values));
   ASSERT_NE(A, (const CallInst*)nullptr);
   ASSERT_EQ(1U + 22, A->getNumOperands());
   const Use *U = &A->getOperandUse(0);
diff --git a/unittests/Linker/LinkModulesTest.cpp b/unittests/Linker/LinkModulesTest.cpp
index b15d180..91f97d7 100644
--- a/unittests/Linker/LinkModulesTest.cpp
+++ b/unittests/Linker/LinkModulesTest.cpp
@@ -8,12 +8,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/AsmParser/Parser.h"
-#include "llvm/Linker/Linker.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Linker/Linker.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 
diff --git a/unittests/MC/CMakeLists.txt b/unittests/MC/CMakeLists.txt
index c82bcde..f83eaf4 100644
--- a/unittests/MC/CMakeLists.txt
+++ b/unittests/MC/CMakeLists.txt
@@ -10,9 +10,3 @@ add_llvm_unittest(MCTests
   StringTableBuilderTest.cpp
   YAMLTest.cpp
   )
-
-foreach(t ${LLVM_TARGETS_TO_BUILD})
-  if (IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/${t}")
-    add_subdirectory(${t})
-  endif (IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/${t}")
-endforeach()
diff --git a/unittests/MC/Hexagon/CMakeLists.txt b/unittests/MC/Hexagon/CMakeLists.txt
deleted file mode 100644
index 6d4ee93..0000000
--- a/unittests/MC/Hexagon/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-set(LLVM_LINK_COMPONENTS
-  HexagonCodeGen
-  HexagonDesc
-  HexagonInfo
-  MC
-  Support
-  )
-
-include_directories (${LLVM_MAIN_SRC_DIR}/lib/Target/Hexagon)
-include_directories (${LLVM_BINARY_DIR}/lib/Target/Hexagon)
-
-add_llvm_unittest(HexagonTests
-  HexagonMCCodeEmitterTest.cpp
-  )
diff --git a/unittests/MC/Hexagon/HexagonMCCodeEmitterTest.cpp b/unittests/MC/Hexagon/HexagonMCCodeEmitterTest.cpp
deleted file mode 100644
index 958a21f..0000000
--- a/unittests/MC/Hexagon/HexagonMCCodeEmitterTest.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-#include "gtest/gtest.h"
-
-#include <memory>
-
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/TargetSelect.h"
-
-#include "MCTargetDesc/HexagonMCInst.h"
-#include "MCTargetDesc/HexagonMCTargetDesc.h"
-
-namespace {
-class TestEmitter {
-public:
-  TestEmitter() : Triple("hexagon-unknown-elf") {
-    LLVMInitializeHexagonTargetInfo();
-    LLVMInitializeHexagonTarget();
-    LLVMInitializeHexagonTargetMC();
-    std::string error;
-    Target = llvm::TargetRegistry::lookupTarget("hexagon", error);
-    assert(Target != nullptr && "Expected to find target");
-    assert(error.empty() && "Error should be empty if we have a target");
-    RegisterInfo = Target->createMCRegInfo(Triple);
-    assert(RegisterInfo != nullptr && "Expecting to find register info");
-    AsmInfo = Target->createMCAsmInfo(*RegisterInfo, Triple);
-    assert(AsmInfo != nullptr && "Expecting to find asm info");
-    Context = new llvm::MCContext(AsmInfo, RegisterInfo, nullptr);
-    assert(Context != nullptr && "Expecting to create a context");
-    Subtarget = Target->createMCSubtargetInfo(Triple, "hexagonv4", "");
-    assert(Subtarget != nullptr && "Expecting to find a subtarget");
-    InstrInfo = Target->createMCInstrInfo();
-    assert(InstrInfo != nullptr && "Expecting to find instr info");
-    Emitter = Target->createMCCodeEmitter(*InstrInfo, *RegisterInfo, *Subtarget,
-                                          *Context);
-    assert(Emitter != nullptr);
-  }
-  std::string Triple;
-  llvm::Target const *Target;
-  llvm::MCRegisterInfo *RegisterInfo;
-  llvm::MCAsmInfo *AsmInfo;
-  llvm::MCContext *Context;
-  llvm::MCSubtargetInfo *Subtarget;
-  llvm::MCInstrInfo *InstrInfo;
-  llvm::MCCodeEmitter *Emitter;
-};
-TestEmitter Emitter;
-}
-
-TEST(HexagonMCCodeEmitter, emitter_creation) {
-  ASSERT_NE(nullptr, Emitter.Emitter);
-}
diff --git a/unittests/MC/StringTableBuilderTest.cpp b/unittests/MC/StringTableBuilderTest.cpp
index 6b5185c..716b9c7 100644
--- a/unittests/MC/StringTableBuilderTest.cpp
+++ b/unittests/MC/StringTableBuilderTest.cpp
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/StringTableBuilder.h"
-#include "gtest/gtest.h"
 #include "llvm/Support/Endian.h"
+#include "gtest/gtest.h"
 #include <string>
 
 using namespace llvm;
diff --git a/unittests/Makefile b/unittests/Makefile
index 603e7d5..fefef07 100644
--- a/unittests/Makefile
+++ b/unittests/Makefile
@@ -10,7 +10,7 @@
 LEVEL = ..
 
 PARALLEL_DIRS = ADT Analysis Bitcode CodeGen DebugInfo ExecutionEngine IR \
-		LineEditor Linker MC Option Support Transforms
+		LineEditor Linker MC Option ProfileData Support Transforms
 
 include $(LEVEL)/Makefile.config
 include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/ProfileData/CMakeLists.txt b/unittests/ProfileData/CMakeLists.txt
new file mode 100644
index 0000000..79137c9
--- /dev/null
+++ b/unittests/ProfileData/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  ProfileData
+  Support
+  )
+
+add_llvm_unittest(ProfileDataTests
+  CoverageMappingTest.cpp
+  InstrProfTest.cpp
+  )
diff --git a/unittests/ProfileData/CoverageMappingTest.cpp b/unittests/ProfileData/CoverageMappingTest.cpp
new file mode 100644
index 0000000..9fceacb
--- /dev/null
+++ b/unittests/ProfileData/CoverageMappingTest.cpp
@@ -0,0 +1,262 @@
+//===- unittest/ProfileData/CoverageMappingTest.cpp -------------------------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ProfileData/CoverageMapping.h"
+#include "llvm/ProfileData/CoverageMappingReader.h"
+#include "llvm/ProfileData/CoverageMappingWriter.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ProfileData/InstrProfWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "gtest/gtest.h"
+
+#include <sstream>
+
+using namespace llvm;
+using namespace coverage;
+
+static ::testing::AssertionResult NoError(std::error_code EC) {
+  if (!EC)
+    return ::testing::AssertionSuccess();
+  return ::testing::AssertionFailure() << "error " << EC.value()
+                                       << ": " << EC.message();
+}
+
+namespace llvm {
+namespace coverage {
+void PrintTo(const Counter &C, ::std::ostream *os) {
+  if (C.isZero())
+    *os << "Zero";
+  else if (C.isExpression())
+    *os << "Expression " << C.getExpressionID();
+  else
+    *os << "Counter " << C.getCounterID();
+}
+
+void PrintTo(const CoverageSegment &S, ::std::ostream *os) {
+  *os << "CoverageSegment(" << S.Line << ", " << S.Col << ", ";
+  if (S.HasCount)
+    *os << S.Count << ", ";
+  *os << (S.IsRegionEntry ? "true" : "false") << ")";
+}
+}
+}
+
+namespace {
+
+struct OneFunctionCoverageReader : CoverageMappingReader {
+  StringRef Name;
+  uint64_t Hash;
+  std::vector<StringRef> Filenames;
+  ArrayRef<CounterMappingRegion> Regions;
+  bool Done;
+
+  OneFunctionCoverageReader(StringRef Name, uint64_t Hash,
+                            ArrayRef<StringRef> Filenames,
+                            ArrayRef<CounterMappingRegion> Regions)
+      : Name(Name), Hash(Hash), Filenames(Filenames), Regions(Regions),
+        Done(false) {}
+
+  std::error_code readNextRecord(CoverageMappingRecord &Record) override {
+    if (Done)
+      return instrprof_error::eof;
+    Done = true;
+
+    Record.FunctionName = Name;
+    Record.FunctionHash = Hash;
+    Record.Filenames = Filenames;
+    Record.Expressions = {};
+    Record.MappingRegions = Regions;
+    return instrprof_error::success;
+  }
+};
+
+struct CoverageMappingTest : ::testing::Test {
+  StringMap<unsigned> Files;
+  unsigned NextFile;
+  std::vector<CounterMappingRegion> InputCMRs;
+
+  std::vector<StringRef> OutputFiles;
+  std::vector<CounterExpression> OutputExpressions;
+  std::vector<CounterMappingRegion> OutputCMRs;
+
+  InstrProfWriter ProfileWriter;
+  std::unique_ptr<IndexedInstrProfReader> ProfileReader;
+
+  std::unique_ptr<CoverageMapping> LoadedCoverage;
+
+  void SetUp() override {
+    NextFile = 0;
+  }
+
+  unsigned getFile(StringRef Name) {
+    auto R = Files.find(Name);
+    if (R != Files.end())
+      return R->second;
+    Files[Name] = NextFile;
+    return NextFile++;
+  }
+
+  void addCMR(Counter C, StringRef File, unsigned LS, unsigned CS, unsigned LE,
+              unsigned CE) {
+    InputCMRs.push_back(
+        CounterMappingRegion::makeRegion(C, getFile(File), LS, CS, LE, CE));
+  }
+
+  void addExpansionCMR(StringRef File, StringRef ExpandedFile, unsigned LS,
+                       unsigned CS, unsigned LE, unsigned CE) {
+    InputCMRs.push_back(CounterMappingRegion::makeExpansion(
+        getFile(File), getFile(ExpandedFile), LS, CS, LE, CE));
+  }
+
+  std::string writeCoverageRegions() {
+    SmallVector<unsigned, 8> FileIDs;
+    for (const auto &E : Files)
+      FileIDs.push_back(E.getValue());
+    std::string Coverage;
+    llvm::raw_string_ostream OS(Coverage);
+    CoverageMappingWriter(FileIDs, None, InputCMRs).write(OS);
+    return OS.str();
+  }
+
+  void readCoverageRegions(std::string Coverage) {
+    SmallVector<StringRef, 8> Filenames;
+    for (const auto &E : Files)
+      Filenames.push_back(E.getKey());
+    RawCoverageMappingReader Reader(Coverage, Filenames, OutputFiles,
+                                    OutputExpressions, OutputCMRs);
+    ASSERT_TRUE(NoError(Reader.read()));
+  }
+
+  void readProfCounts() {
+    auto Profile = ProfileWriter.writeBuffer();
+    auto ReaderOrErr = IndexedInstrProfReader::create(std::move(Profile));
+    ASSERT_TRUE(NoError(ReaderOrErr.getError()));
+    ProfileReader = std::move(ReaderOrErr.get());
+  }
+
+  void loadCoverageMapping(StringRef FuncName, uint64_t Hash) {
+    std::string Regions = writeCoverageRegions();
+    readCoverageRegions(Regions);
+
+    SmallVector<StringRef, 8> Filenames;
+    for (const auto &E : Files)
+      Filenames.push_back(E.getKey());
+    OneFunctionCoverageReader CovReader(FuncName, Hash, Filenames, OutputCMRs);
+    auto CoverageOrErr = CoverageMapping::load(CovReader, *ProfileReader);
+    ASSERT_TRUE(NoError(CoverageOrErr.getError()));
+    LoadedCoverage = std::move(CoverageOrErr.get());
+  }
+};
+
+TEST_F(CoverageMappingTest, basic_write_read) {
+  addCMR(Counter::getCounter(0), "foo", 1, 1, 1, 1);
+  addCMR(Counter::getCounter(1), "foo", 2, 1, 2, 2);
+  addCMR(Counter::getZero(),     "foo", 3, 1, 3, 4);
+  addCMR(Counter::getCounter(2), "foo", 4, 1, 4, 8);
+  addCMR(Counter::getCounter(3), "bar", 1, 2, 3, 4);
+  std::string Coverage = writeCoverageRegions();
+  readCoverageRegions(Coverage);
+
+  size_t N = makeArrayRef(InputCMRs).size();
+  ASSERT_EQ(N, OutputCMRs.size());
+  for (size_t I = 0; I < N; ++I) {
+    ASSERT_EQ(InputCMRs[I].Count,      OutputCMRs[I].Count);
+    ASSERT_EQ(InputCMRs[I].FileID,     OutputCMRs[I].FileID);
+    ASSERT_EQ(InputCMRs[I].startLoc(), OutputCMRs[I].startLoc());
+    ASSERT_EQ(InputCMRs[I].endLoc(),   OutputCMRs[I].endLoc());
+    ASSERT_EQ(InputCMRs[I].Kind,       OutputCMRs[I].Kind);
+  }
+}
+
+TEST_F(CoverageMappingTest, expansion_gets_first_counter) {
+  addCMR(Counter::getCounter(1), "foo", 10, 1, 10, 2);
+  // This starts earlier in "foo", so the expansion should get its counter.
+  addCMR(Counter::getCounter(2), "foo", 1, 1, 20, 1);
+  addExpansionCMR("bar", "foo", 3, 3, 3, 3);
+  std::string Coverage = writeCoverageRegions();
+  readCoverageRegions(Coverage);
+
+  ASSERT_EQ(CounterMappingRegion::ExpansionRegion, OutputCMRs[2].Kind);
+  ASSERT_EQ(Counter::getCounter(2), OutputCMRs[2].Count);
+  ASSERT_EQ(3U, OutputCMRs[2].LineStart);
+}
+
+TEST_F(CoverageMappingTest, basic_coverage_iteration) {
+  ProfileWriter.addFunctionCounts("func", 0x1234, {30, 20, 10, 0});
+  readProfCounts();
+
+  addCMR(Counter::getCounter(0), "file1", 1, 1, 9, 9);
+  addCMR(Counter::getCounter(1), "file1", 1, 1, 4, 7);
+  addCMR(Counter::getCounter(2), "file1", 5, 8, 9, 1);
+  addCMR(Counter::getCounter(3), "file1", 10, 10, 11, 11);
+  loadCoverageMapping("func", 0x1234);
+
+  CoverageData Data = LoadedCoverage->getCoverageForFile("file1");
+  std::vector<CoverageSegment> Segments(Data.begin(), Data.end());
+  ASSERT_EQ(7U, Segments.size());
+  ASSERT_EQ(CoverageSegment(1, 1, 20, true),  Segments[0]);
+  ASSERT_EQ(CoverageSegment(4, 7, 30, false), Segments[1]);
+  ASSERT_EQ(CoverageSegment(5, 8, 10, true),  Segments[2]);
+  ASSERT_EQ(CoverageSegment(9, 1, 30, false), Segments[3]);
+  ASSERT_EQ(CoverageSegment(9, 9, false),     Segments[4]);
+  ASSERT_EQ(CoverageSegment(10, 10, 0, true), Segments[5]);
+  ASSERT_EQ(CoverageSegment(11, 11, false),   Segments[6]);
+}
+
+TEST_F(CoverageMappingTest, uncovered_function) {
+  readProfCounts();
+
+  addCMR(Counter::getZero(), "file1", 1, 2, 3, 4);
+  loadCoverageMapping("func", 0x1234);
+
+  CoverageData Data = LoadedCoverage->getCoverageForFile("file1");
+  std::vector<CoverageSegment> Segments(Data.begin(), Data.end());
+  ASSERT_EQ(2U, Segments.size());
+  ASSERT_EQ(CoverageSegment(1, 2, 0, true), Segments[0]);
+  ASSERT_EQ(CoverageSegment(3, 4, false),   Segments[1]);
+}
+
+TEST_F(CoverageMappingTest, combine_regions) {
+  ProfileWriter.addFunctionCounts("func", 0x1234, {10, 20, 30});
+  readProfCounts();
+
+  addCMR(Counter::getCounter(0), "file1", 1, 1, 9, 9);
+  addCMR(Counter::getCounter(1), "file1", 3, 3, 4, 4);
+  addCMR(Counter::getCounter(2), "file1", 3, 3, 4, 4);
+  loadCoverageMapping("func", 0x1234);
+
+  CoverageData Data = LoadedCoverage->getCoverageForFile("file1");
+  std::vector<CoverageSegment> Segments(Data.begin(), Data.end());
+  ASSERT_EQ(4U, Segments.size());
+  ASSERT_EQ(CoverageSegment(1, 1, 10, true), Segments[0]);
+  ASSERT_EQ(CoverageSegment(3, 3, 50, true), Segments[1]);
+  ASSERT_EQ(CoverageSegment(4, 4, 10, false), Segments[2]);
+  ASSERT_EQ(CoverageSegment(9, 9, false), Segments[3]);
+}
+
+TEST_F(CoverageMappingTest, dont_combine_expansions) {
+  ProfileWriter.addFunctionCounts("func", 0x1234, {10, 20});
+  readProfCounts();
+
+  addCMR(Counter::getCounter(0), "file1", 1, 1, 9, 9);
+  addCMR(Counter::getCounter(1), "file1", 3, 3, 4, 4);
+  addCMR(Counter::getCounter(1), "include1", 6, 6, 7, 7);
+  addExpansionCMR("file1", "include1", 3, 3, 4, 4);
+  loadCoverageMapping("func", 0x1234);
+
+  CoverageData Data = LoadedCoverage->getCoverageForFile("file1");
+  std::vector<CoverageSegment> Segments(Data.begin(), Data.end());
+  ASSERT_EQ(4U, Segments.size());
+  ASSERT_EQ(CoverageSegment(1, 1, 10, true), Segments[0]);
+  ASSERT_EQ(CoverageSegment(3, 3, 20, true), Segments[1]);
+  ASSERT_EQ(CoverageSegment(4, 4, 10, false), Segments[2]);
+  ASSERT_EQ(CoverageSegment(9, 9, false), Segments[3]);
+}
+
+} // end anonymous namespace
diff --git a/unittests/ProfileData/InstrProfTest.cpp b/unittests/ProfileData/InstrProfTest.cpp
new file mode 100644
index 0000000..26ea0e4
--- /dev/null
+++ b/unittests/ProfileData/InstrProfTest.cpp
@@ -0,0 +1,98 @@
+//===- unittest/ProfileData/InstrProfTest.cpp -------------------------------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ProfileData/InstrProfWriter.h"
+#include "gtest/gtest.h"
+
+#include <cstdarg>
+
+using namespace llvm;
+
+static ::testing::AssertionResult NoError(std::error_code EC) {
+  if (!EC)
+    return ::testing::AssertionSuccess();
+  return ::testing::AssertionFailure() << "error " << EC.value()
+                                       << ": " << EC.message();
+}
+
+static ::testing::AssertionResult ErrorEquals(std::error_code Expected,
+                                              std::error_code Found) {
+  if (Expected == Found)
+    return ::testing::AssertionSuccess();
+  return ::testing::AssertionFailure() << "error " << Found.value()
+                                       << ": " << Found.message();
+}
+
+namespace {
+
+struct InstrProfTest : ::testing::Test {
+  InstrProfWriter Writer;
+  std::unique_ptr<IndexedInstrProfReader> Reader;
+
+  void readProfile(std::unique_ptr<MemoryBuffer> Profile) {
+    auto ReaderOrErr = IndexedInstrProfReader::create(std::move(Profile));
+    ASSERT_TRUE(NoError(ReaderOrErr.getError()));
+    Reader = std::move(ReaderOrErr.get());
+  }
+};
+
+TEST_F(InstrProfTest, write_and_read_empty_profile) {
+  auto Profile = Writer.writeBuffer();
+  readProfile(std::move(Profile));
+  ASSERT_TRUE(Reader->begin() == Reader->end());
+}
+
+TEST_F(InstrProfTest, write_and_read_one_function) {
+  Writer.addFunctionCounts("foo", 0x1234, {1, 2, 3, 4});
+  auto Profile = Writer.writeBuffer();
+  readProfile(std::move(Profile));
+
+  auto I = Reader->begin(), E = Reader->end();
+  ASSERT_TRUE(I != E);
+  ASSERT_EQ(StringRef("foo"), I->Name);
+  ASSERT_EQ(0x1234U, I->Hash);
+  ASSERT_EQ(4U, I->Counts.size());
+  ASSERT_EQ(1U, I->Counts[0]);
+  ASSERT_EQ(2U, I->Counts[1]);
+  ASSERT_EQ(3U, I->Counts[2]);
+  ASSERT_EQ(4U, I->Counts[3]);
+  ASSERT_TRUE(++I == E);
+}
+
+TEST_F(InstrProfTest, get_function_counts) {
+  Writer.addFunctionCounts("foo", 0x1234, {1, 2});
+  auto Profile = Writer.writeBuffer();
+  readProfile(std::move(Profile));
+
+  std::vector<uint64_t> Counts;
+  ASSERT_TRUE(NoError(Reader->getFunctionCounts("foo", 0x1234, Counts)));
+  ASSERT_EQ(2U, Counts.size());
+  ASSERT_EQ(1U, Counts[0]);
+  ASSERT_EQ(2U, Counts[1]);
+
+  std::error_code EC;
+  EC = Reader->getFunctionCounts("foo", 0x5678, Counts);
+  ASSERT_TRUE(ErrorEquals(instrprof_error::hash_mismatch, EC));
+
+  EC = Reader->getFunctionCounts("bar", 0x1234, Counts);
+  ASSERT_TRUE(ErrorEquals(instrprof_error::unknown_function, EC));
+}
+
+TEST_F(InstrProfTest, get_max_function_count) {
+  Writer.addFunctionCounts("foo", 0x1234, {1ULL << 31, 2});
+  Writer.addFunctionCounts("bar", 0, {1ULL << 63});
+  Writer.addFunctionCounts("baz", 0x5678, {0, 0, 0, 0});
+  auto Profile = Writer.writeBuffer();
+  readProfile(std::move(Profile));
+
+  ASSERT_EQ(1ULL << 63, Reader->getMaximumFunctionCount());
+}
+
+} // end anonymous namespace
diff --git a/unittests/ProfileData/Makefile b/unittests/ProfileData/Makefile
new file mode 100644
index 0000000..d017c15
--- /dev/null
+++ b/unittests/ProfileData/Makefile
@@ -0,0 +1,15 @@
+##===- unittests/ProfileData/Makefile ----------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+TESTNAME = ProfileData
+LINK_COMPONENTS := ProfileData Core Support
+
+include $(LEVEL)/Makefile.config
+include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/Support/AlignOfTest.cpp b/unittests/Support/AlignOfTest.cpp
index 40f7295..1119019 100644
--- a/unittests/Support/AlignOfTest.cpp
+++ b/unittests/Support/AlignOfTest.cpp
@@ -22,13 +22,13 @@ namespace {
 
 // Suppress direct base '{anonymous}::S1' inaccessible in '{anonymous}::D9'
 // due to ambiguity warning.
-//
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wunknown-pragmas"
+#pragma clang diagnostic ignored "-Winaccessible-base"
+#elif ((__GNUC__ * 100) + __GNUC_MINOR__) >= 402
 // Pragma based warning suppression was introduced in GGC 4.2.  Additionally
 // this warning is "enabled by default".  The warning still appears if -Wall is
 // suppressed.  Apparently GCC suppresses it when -w is specifed, which is odd.
-// At any rate, clang on the other hand gripes about -Wunknown-pragma, so
-// leaving it out of this.
-#if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 402 && !defined(__clang__)
 #pragma GCC diagnostic warning "-w"
 #endif
 
diff --git a/unittests/Support/BlockFrequencyTest.cpp b/unittests/Support/BlockFrequencyTest.cpp
index f6e3537..0eac8bb 100644
--- a/unittests/Support/BlockFrequencyTest.cpp
+++ b/unittests/Support/BlockFrequencyTest.cpp
@@ -1,3 +1,12 @@
+//===- unittests/Support/BlockFrequencyTest.cpp - BlockFrequency tests ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
 #include "llvm/Support/BlockFrequency.h"
 #include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/DataTypes.h"
diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt
index 7abdd8a..f3b55c3 100644
--- a/unittests/Support/CMakeLists.txt
+++ b/unittests/Support/CMakeLists.txt
@@ -13,6 +13,8 @@ add_llvm_unittest(SupportTests
   CompressionTest.cpp
   ConvertUTFTest.cpp
   DataExtractorTest.cpp
+  DwarfTest.cpp
+  EndianStreamTest.cpp
   EndianTest.cpp
   ErrorOrTest.cpp
   FileOutputBufferTest.cpp
@@ -32,6 +34,7 @@ add_llvm_unittest(SupportTests
   ScaledNumberTest.cpp
   SourceMgrTest.cpp
   SpecialCaseListTest.cpp
+  StreamingMemoryObject.cpp
   StringPool.cpp
   SwapByteOrderTest.cpp
   ThreadLocalTest.cpp
diff --git a/unittests/Support/Casting.cpp b/unittests/Support/Casting.cpp
index 88c7d19..e6c35fc 100644
--- a/unittests/Support/Casting.cpp
+++ b/unittests/Support/Casting.cpp
@@ -232,3 +232,99 @@ namespace TemporaryCast {
 struct pod {};
 IllegalCast *testIllegalCast() { return cast<foo>(pod()); }
 }
+
+namespace {
+namespace pointer_wrappers {
+
+struct Base {
+  bool IsDerived;
+  Base(bool IsDerived = false) : IsDerived(IsDerived) {}
+};
+
+struct Derived : Base {
+  Derived() : Base(true) {}
+  static bool classof(const Base *B) { return B->IsDerived; }
+};
+
+class PTy {
+  Base *B;
+public:
+  PTy(Base *B) : B(B) {}
+  explicit operator bool() const { return get(); }
+  Base *get() const { return B; }
+};
+
+} // end namespace pointer_wrappers
+} // end namespace
+
+namespace llvm {
+
+template <> struct simplify_type<pointer_wrappers::PTy> {
+  typedef pointer_wrappers::Base *SimpleType;
+  static SimpleType getSimplifiedValue(pointer_wrappers::PTy &P) {
+    return P.get();
+  }
+};
+template <> struct simplify_type<const pointer_wrappers::PTy> {
+  typedef pointer_wrappers::Base *SimpleType;
+  static SimpleType getSimplifiedValue(const pointer_wrappers::PTy &P) {
+    return P.get();
+  }
+};
+
+} // end namespace llvm
+
+namespace {
+namespace pointer_wrappers {
+
+// Some objects.
+pointer_wrappers::Base B;
+pointer_wrappers::Derived D;
+
+// Mutable "smart" pointers.
+pointer_wrappers::PTy MN(nullptr);
+pointer_wrappers::PTy MB(&B);
+pointer_wrappers::PTy MD(&D);
+
+// Const "smart" pointers.
+const pointer_wrappers::PTy CN(nullptr);
+const pointer_wrappers::PTy CB(&B);
+const pointer_wrappers::PTy CD(&D);
+
+TEST(CastingTest, smart_isa) {
+  EXPECT_TRUE(!isa<pointer_wrappers::Derived>(MB));
+  EXPECT_TRUE(!isa<pointer_wrappers::Derived>(CB));
+  EXPECT_TRUE(isa<pointer_wrappers::Derived>(MD));
+  EXPECT_TRUE(isa<pointer_wrappers::Derived>(CD));
+}
+
+TEST(CastingTest, smart_cast) {
+  EXPECT_TRUE(cast<pointer_wrappers::Derived>(MD) == &D);
+  EXPECT_TRUE(cast<pointer_wrappers::Derived>(CD) == &D);
+}
+
+TEST(CastingTest, smart_cast_or_null) {
+  EXPECT_TRUE(cast_or_null<pointer_wrappers::Derived>(MN) == nullptr);
+  EXPECT_TRUE(cast_or_null<pointer_wrappers::Derived>(CN) == nullptr);
+  EXPECT_TRUE(cast_or_null<pointer_wrappers::Derived>(MD) == &D);
+  EXPECT_TRUE(cast_or_null<pointer_wrappers::Derived>(CD) == &D);
+}
+
+TEST(CastingTest, smart_dyn_cast) {
+  EXPECT_TRUE(dyn_cast<pointer_wrappers::Derived>(MB) == nullptr);
+  EXPECT_TRUE(dyn_cast<pointer_wrappers::Derived>(CB) == nullptr);
+  EXPECT_TRUE(dyn_cast<pointer_wrappers::Derived>(MD) == &D);
+  EXPECT_TRUE(dyn_cast<pointer_wrappers::Derived>(CD) == &D);
+}
+
+TEST(CastingTest, smart_dyn_cast_or_null) {
+  EXPECT_TRUE(dyn_cast_or_null<pointer_wrappers::Derived>(MN) == nullptr);
+  EXPECT_TRUE(dyn_cast_or_null<pointer_wrappers::Derived>(CN) == nullptr);
+  EXPECT_TRUE(dyn_cast_or_null<pointer_wrappers::Derived>(MB) == nullptr);
+  EXPECT_TRUE(dyn_cast_or_null<pointer_wrappers::Derived>(CB) == nullptr);
+  EXPECT_TRUE(dyn_cast_or_null<pointer_wrappers::Derived>(MD) == &D);
+  EXPECT_TRUE(dyn_cast_or_null<pointer_wrappers::Derived>(CD) == &D);
+}
+
+} // end namespace pointer_wrappers
+} // end namespace
diff --git a/unittests/Support/CommandLineTest.cpp b/unittests/Support/CommandLineTest.cpp
index ac8d3d8..9d7679d 100644
--- a/unittests/Support/CommandLineTest.cpp
+++ b/unittests/Support/CommandLineTest.cpp
@@ -35,6 +35,8 @@ class TempEnvVar {
 #if HAVE_SETENV
     // Assume setenv and unsetenv come together.
     unsetenv(name);
+#else
+    (void)name; // Suppress -Wunused-private-field.
 #endif
   }
 
@@ -70,14 +72,14 @@ public:
 
 
 cl::OptionCategory TestCategory("Test Options", "Description");
-cl::opt<int> TestOption("test-option", cl::desc("old description"));
 TEST(CommandLineTest, ModifyExisitingOption) {
+  StackOption<int> TestOption("test-option", cl::desc("old description"));
+
   const char Description[] = "New description";
   const char ArgString[] = "new-test-option";
   const char ValueString[] = "Integer";
 
-  StringMap<cl::Option*> Map;
-  cl::getRegisteredOptions(Map);
+  StringMap<cl::Option *> &Map = cl::getRegisteredOptions();
 
   ASSERT_TRUE(Map.count("test-option") == 1) <<
     "Could not find option in map.";
@@ -230,5 +232,44 @@ TEST(CommandLineTest, AliasRequired) {
   testAliasRequired(array_lengthof(opts2), opts2);
 }
 
+TEST(CommandLineTest, HideUnrelatedOptions) {
+  StackOption<int> TestOption1("hide-option-1");
+  StackOption<int> TestOption2("hide-option-2", cl::cat(TestCategory));
+
+  cl::HideUnrelatedOptions(TestCategory);
+
+  ASSERT_EQ(cl::ReallyHidden, TestOption1.getOptionHiddenFlag())
+      << "Failed to hide extra option.";
+  ASSERT_EQ(cl::NotHidden, TestOption2.getOptionHiddenFlag())
+      << "Hid extra option that should be visable.";
+
+  StringMap<cl::Option *> &Map = cl::getRegisteredOptions();
+  ASSERT_EQ(cl::NotHidden, Map["help"]->getOptionHiddenFlag())
+      << "Hid default option that should be visable.";
+}
+
+cl::OptionCategory TestCategory2("Test Options set 2", "Description");
+
+TEST(CommandLineTest, HideUnrelatedOptionsMulti) {
+  StackOption<int> TestOption1("multi-hide-option-1");
+  StackOption<int> TestOption2("multi-hide-option-2", cl::cat(TestCategory));
+  StackOption<int> TestOption3("multi-hide-option-3", cl::cat(TestCategory2));
+
+  const cl::OptionCategory *VisibleCategories[] = {&TestCategory,
+                                                   &TestCategory2};
+
+  cl::HideUnrelatedOptions(makeArrayRef(VisibleCategories));
+
+  ASSERT_EQ(cl::ReallyHidden, TestOption1.getOptionHiddenFlag())
+      << "Failed to hide extra option.";
+  ASSERT_EQ(cl::NotHidden, TestOption2.getOptionHiddenFlag())
+      << "Hid extra option that should be visable.";
+  ASSERT_EQ(cl::NotHidden, TestOption3.getOptionHiddenFlag())
+      << "Hid extra option that should be visable.";
+
+  StringMap<cl::Option *> &Map = cl::getRegisteredOptions();
+  ASSERT_EQ(cl::NotHidden, Map["help"]->getOptionHiddenFlag())
+      << "Hid default option that should be visable.";
+}
 
 }  // anonymous namespace
diff --git a/unittests/Support/CompressionTest.cpp b/unittests/Support/CompressionTest.cpp
index 698ae3a..36b84d8 100644
--- a/unittests/Support/CompressionTest.cpp
+++ b/unittests/Support/CompressionTest.cpp
@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Compression.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Config/config.h"
 #include "gtest/gtest.h"
 
diff --git a/unittests/Support/ConvertUTFTest.cpp b/unittests/Support/ConvertUTFTest.cpp
index 510b1da..d436fc0 100644
--- a/unittests/Support/ConvertUTFTest.cpp
+++ b/unittests/Support/ConvertUTFTest.cpp
@@ -8,10 +8,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Format.h"
 #include "gtest/gtest.h"
 #include <string>
-#include <vector>
 #include <utility>
+#include <vector>
 
 using namespace llvm;
 
@@ -37,6 +38,19 @@ TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
   EXPECT_EQ(Expected, Result);
 }
 
+TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
+  // Src is the look of disapproval.
+  static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
+  StringRef Ref(Src, sizeof(Src) - 1);
+  SmallVector<UTF16, 5> Result;
+  bool Success = convertUTF8ToUTF16String(Ref, Result);
+  EXPECT_TRUE(Success);
+  static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
+  ASSERT_EQ(3u, Result.size());
+  for (int I = 0, E = 3; I != E; ++I)
+    EXPECT_EQ(Expected[I], Result[I]);
+}
+
 TEST(ConvertUTFTest, OddLengthInput) {
   std::string Result;
   bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);
@@ -141,8 +155,8 @@ CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
   if (!Partial)
     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
   else
-
     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
+
   if (Expected.ErrorCode != ErrorCode)
     return ::testing::AssertionFailure() << "Expected error code "
                                          << Expected.ErrorCode << ", actual "
diff --git a/unittests/Support/DwarfTest.cpp b/unittests/Support/DwarfTest.cpp
new file mode 100644
index 0000000..74fcc98
--- /dev/null
+++ b/unittests/Support/DwarfTest.cpp
@@ -0,0 +1,141 @@
+//===- unittest/Support/DwarfTest.cpp - Dwarf support tests ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Dwarf.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::dwarf;
+
+namespace {
+
+TEST(DwarfTest, TagStringOnInvalid) {
+  // This is invalid, so it shouldn't be stringified.
+  EXPECT_EQ(nullptr, TagString(DW_TAG_invalid));
+
+  // These aren't really tags: they describe ranges within tags.  They
+  // shouldn't be stringified either.
+  EXPECT_EQ(nullptr, TagString(DW_TAG_lo_user));
+  EXPECT_EQ(nullptr, TagString(DW_TAG_hi_user));
+  EXPECT_EQ(nullptr, TagString(DW_TAG_user_base));
+}
+
+TEST(DwarfTest, getTag) {
+  // A couple of valid tags.
+  EXPECT_EQ(DW_TAG_array_type, getTag("DW_TAG_array_type"));
+  EXPECT_EQ(DW_TAG_module, getTag("DW_TAG_module"));
+
+  // Invalid tags.
+  EXPECT_EQ(DW_TAG_invalid, getTag("DW_TAG_invalid"));
+  EXPECT_EQ(DW_TAG_invalid, getTag("DW_TAG_madeuptag"));
+  EXPECT_EQ(DW_TAG_invalid, getTag("something else"));
+
+  // Tag range markers should not be recognized.
+  EXPECT_EQ(DW_TAG_invalid, getTag("DW_TAG_lo_user"));
+  EXPECT_EQ(DW_TAG_invalid, getTag("DW_TAG_hi_user"));
+  EXPECT_EQ(DW_TAG_invalid, getTag("DW_TAG_user_base"));
+}
+
+TEST(DwarfTest, getOperationEncoding) {
+  // Some valid ops.
+  EXPECT_EQ(DW_OP_deref, getOperationEncoding("DW_OP_deref"));
+  EXPECT_EQ(DW_OP_bit_piece, getOperationEncoding("DW_OP_bit_piece"));
+
+  // Invalid ops.
+  EXPECT_EQ(0u, getOperationEncoding("DW_OP_otherthings"));
+  EXPECT_EQ(0u, getOperationEncoding("other"));
+
+  // Markers shouldn't be recognized.
+  EXPECT_EQ(0u, getOperationEncoding("DW_OP_lo_user"));
+  EXPECT_EQ(0u, getOperationEncoding("DW_OP_hi_user"));
+}
+
+TEST(DwarfTest, LanguageStringOnInvalid) {
+  // This is invalid, so it shouldn't be stringified.
+  EXPECT_EQ(nullptr, LanguageString(0));
+
+  // These aren't really tags: they describe ranges within tags.  They
+  // shouldn't be stringified either.
+  EXPECT_EQ(nullptr, LanguageString(DW_LANG_lo_user));
+  EXPECT_EQ(nullptr, LanguageString(DW_LANG_hi_user));
+}
+
+TEST(DwarfTest, getLanguage) {
+  // A couple of valid languages.
+  EXPECT_EQ(DW_LANG_C89, getLanguage("DW_LANG_C89"));
+  EXPECT_EQ(DW_LANG_C_plus_plus_11, getLanguage("DW_LANG_C_plus_plus_11"));
+  EXPECT_EQ(DW_LANG_OCaml, getLanguage("DW_LANG_OCaml"));
+  EXPECT_EQ(DW_LANG_Mips_Assembler, getLanguage("DW_LANG_Mips_Assembler"));
+
+  // Invalid languages.
+  EXPECT_EQ(0u, getLanguage("DW_LANG_invalid"));
+  EXPECT_EQ(0u, getLanguage("DW_TAG_array_type"));
+  EXPECT_EQ(0u, getLanguage("something else"));
+
+  // Language range markers should not be recognized.
+  EXPECT_EQ(0u, getLanguage("DW_LANG_lo_user"));
+  EXPECT_EQ(0u, getLanguage("DW_LANG_hi_user"));
+}
+
+TEST(DwarfTest, AttributeEncodingStringOnInvalid) {
+  // This is invalid, so it shouldn't be stringified.
+  EXPECT_EQ(nullptr, AttributeEncodingString(0));
+
+  // These aren't really tags: they describe ranges within tags.  They
+  // shouldn't be stringified either.
+  EXPECT_EQ(nullptr, AttributeEncodingString(DW_ATE_lo_user));
+  EXPECT_EQ(nullptr, AttributeEncodingString(DW_ATE_hi_user));
+}
+
+TEST(DwarfTest, getAttributeEncoding) {
+  // A couple of valid languages.
+  EXPECT_EQ(DW_ATE_boolean, getAttributeEncoding("DW_ATE_boolean"));
+  EXPECT_EQ(DW_ATE_imaginary_float,
+            getAttributeEncoding("DW_ATE_imaginary_float"));
+
+  // Invalid languages.
+  EXPECT_EQ(0u, getAttributeEncoding("DW_ATE_invalid"));
+  EXPECT_EQ(0u, getAttributeEncoding("DW_TAG_array_type"));
+  EXPECT_EQ(0u, getAttributeEncoding("something else"));
+
+  // AttributeEncoding range markers should not be recognized.
+  EXPECT_EQ(0u, getAttributeEncoding("DW_ATE_lo_user"));
+  EXPECT_EQ(0u, getAttributeEncoding("DW_ATE_hi_user"));
+}
+
+TEST(DwarfTest, VirtualityString) {
+  EXPECT_EQ(StringRef("DW_VIRTUALITY_none"),
+            VirtualityString(DW_VIRTUALITY_none));
+  EXPECT_EQ(StringRef("DW_VIRTUALITY_virtual"),
+            VirtualityString(DW_VIRTUALITY_virtual));
+  EXPECT_EQ(StringRef("DW_VIRTUALITY_pure_virtual"),
+            VirtualityString(DW_VIRTUALITY_pure_virtual));
+
+  // DW_VIRTUALITY_max should be pure virtual.
+  EXPECT_EQ(StringRef("DW_VIRTUALITY_pure_virtual"),
+            VirtualityString(DW_VIRTUALITY_max));
+
+  // Invalid numbers shouldn't be stringified.
+  EXPECT_EQ(nullptr, VirtualityString(DW_VIRTUALITY_max + 1));
+  EXPECT_EQ(nullptr, VirtualityString(DW_VIRTUALITY_max + 77));
+}
+
+TEST(DwarfTest, getVirtuality) {
+  EXPECT_EQ(DW_VIRTUALITY_none, getVirtuality("DW_VIRTUALITY_none"));
+  EXPECT_EQ(DW_VIRTUALITY_virtual, getVirtuality("DW_VIRTUALITY_virtual"));
+  EXPECT_EQ(DW_VIRTUALITY_pure_virtual,
+            getVirtuality("DW_VIRTUALITY_pure_virtual"));
+
+  // Invalid strings.
+  EXPECT_EQ(DW_VIRTUALITY_invalid, getVirtuality("DW_VIRTUALITY_invalid"));
+  EXPECT_EQ(DW_VIRTUALITY_invalid, getVirtuality("DW_VIRTUALITY_max"));
+  EXPECT_EQ(DW_VIRTUALITY_invalid, getVirtuality("something else"));
+}
+
+} // end namespace
diff --git a/unittests/Support/EndianStreamTest.cpp b/unittests/Support/EndianStreamTest.cpp
new file mode 100644
index 0000000..6a69be5
--- /dev/null
+++ b/unittests/Support/EndianStreamTest.cpp
@@ -0,0 +1,157 @@
+//===- unittests/Support/EndianStreamTest.cpp - EndianStream.h tests ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/DataTypes.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+using namespace support;
+
+namespace {
+
+TEST(EndianStream, WriteInt32LE) {
+  SmallString<16> data;
+
+  {
+    raw_svector_ostream OS(data);
+    endian::Writer<little> LE(OS);
+    LE.write(static_cast<int32_t>(-1362446643));
+  }
+
+  EXPECT_EQ(static_cast<uint8_t>(data[0]), 0xCD);
+  EXPECT_EQ(static_cast<uint8_t>(data[1]), 0xB6);
+  EXPECT_EQ(static_cast<uint8_t>(data[2]), 0xCA);
+  EXPECT_EQ(static_cast<uint8_t>(data[3]), 0xAE);
+}
+
+TEST(EndianStream, WriteInt32BE) {
+  SmallVector<char, 16> data;
+
+  {
+    raw_svector_ostream OS(data);
+    endian::Writer<big> BE(OS);
+    BE.write(static_cast<int32_t>(-1362446643));
+  }
+
+  EXPECT_EQ(static_cast<uint8_t>(data[0]), 0xAE);
+  EXPECT_EQ(static_cast<uint8_t>(data[1]), 0xCA);
+  EXPECT_EQ(static_cast<uint8_t>(data[2]), 0xB6);
+  EXPECT_EQ(static_cast<uint8_t>(data[3]), 0xCD);
+}
+
+
+TEST(EndianStream, WriteFloatLE) {
+  SmallString<16> data;
+
+  {
+    raw_svector_ostream OS(data);
+    endian::Writer<little> LE(OS);
+    LE.write(12345.0f);
+  }
+
+  EXPECT_EQ(static_cast<uint8_t>(data[0]), 0x00);
+  EXPECT_EQ(static_cast<uint8_t>(data[1]), 0xE4);
+  EXPECT_EQ(static_cast<uint8_t>(data[2]), 0x40);
+  EXPECT_EQ(static_cast<uint8_t>(data[3]), 0x46);
+}
+
+TEST(EndianStream, WriteFloatBE) {
+  SmallVector<char, 16> data;
+
+  {
+    raw_svector_ostream OS(data);
+    endian::Writer<big> BE(OS);
+    BE.write(12345.0f);
+  }
+
+  EXPECT_EQ(static_cast<uint8_t>(data[0]), 0x46);
+  EXPECT_EQ(static_cast<uint8_t>(data[1]), 0x40);
+  EXPECT_EQ(static_cast<uint8_t>(data[2]), 0xE4);
+  EXPECT_EQ(static_cast<uint8_t>(data[3]), 0x00);
+}
+
+TEST(EndianStream, WriteInt64LE) {
+  SmallString<16> data;
+
+  {
+    raw_svector_ostream OS(data);
+    endian::Writer<little> LE(OS);
+    LE.write(static_cast<int64_t>(-136244664332342323));
+  }
+
+  EXPECT_EQ(static_cast<uint8_t>(data[0]), 0xCD);
+  EXPECT_EQ(static_cast<uint8_t>(data[1]), 0xAB);
+  EXPECT_EQ(static_cast<uint8_t>(data[2]), 0xED);
+  EXPECT_EQ(static_cast<uint8_t>(data[3]), 0x1B);
+  EXPECT_EQ(static_cast<uint8_t>(data[4]), 0x33);
+  EXPECT_EQ(static_cast<uint8_t>(data[5]), 0xF6);
+  EXPECT_EQ(static_cast<uint8_t>(data[6]), 0x1B);
+  EXPECT_EQ(static_cast<uint8_t>(data[7]), 0xFE);
+}
+
+TEST(EndianStream, WriteInt64BE) {
+  SmallVector<char, 16> data;
+
+  {
+    raw_svector_ostream OS(data);
+    endian::Writer<big> BE(OS);
+    BE.write(static_cast<int64_t>(-136244664332342323));
+  }
+
+  EXPECT_EQ(static_cast<uint8_t>(data[0]), 0xFE);
+  EXPECT_EQ(static_cast<uint8_t>(data[1]), 0x1B);
+  EXPECT_EQ(static_cast<uint8_t>(data[2]), 0xF6);
+  EXPECT_EQ(static_cast<uint8_t>(data[3]), 0x33);
+  EXPECT_EQ(static_cast<uint8_t>(data[4]), 0x1B);
+  EXPECT_EQ(static_cast<uint8_t>(data[5]), 0xED);
+  EXPECT_EQ(static_cast<uint8_t>(data[6]), 0xAB);
+  EXPECT_EQ(static_cast<uint8_t>(data[7]), 0xCD);
+}
+
+TEST(EndianStream, WriteDoubleLE) {
+  SmallString<16> data;
+
+  {
+    raw_svector_ostream OS(data);
+    endian::Writer<little> LE(OS);
+    LE.write(-2349214918.58107);
+  }
+
+  EXPECT_EQ(static_cast<uint8_t>(data[0]), 0x20);
+  EXPECT_EQ(static_cast<uint8_t>(data[1]), 0x98);
+  EXPECT_EQ(static_cast<uint8_t>(data[2]), 0xD2);
+  EXPECT_EQ(static_cast<uint8_t>(data[3]), 0x98);
+  EXPECT_EQ(static_cast<uint8_t>(data[4]), 0xC5);
+  EXPECT_EQ(static_cast<uint8_t>(data[5]), 0x80);
+  EXPECT_EQ(static_cast<uint8_t>(data[6]), 0xE1);
+  EXPECT_EQ(static_cast<uint8_t>(data[7]), 0xC1);
+}
+
+TEST(EndianStream, WriteDoubleBE) {
+  SmallVector<char, 16> data;
+
+  {
+    raw_svector_ostream OS(data);
+    endian::Writer<big> BE(OS);
+    BE.write(-2349214918.58107);
+  }
+
+  EXPECT_EQ(static_cast<uint8_t>(data[0]), 0xC1);
+  EXPECT_EQ(static_cast<uint8_t>(data[1]), 0xE1);
+  EXPECT_EQ(static_cast<uint8_t>(data[2]), 0x80);
+  EXPECT_EQ(static_cast<uint8_t>(data[3]), 0xC5);
+  EXPECT_EQ(static_cast<uint8_t>(data[4]), 0x98);
+  EXPECT_EQ(static_cast<uint8_t>(data[5]), 0xD2);
+  EXPECT_EQ(static_cast<uint8_t>(data[6]), 0x98);
+  EXPECT_EQ(static_cast<uint8_t>(data[7]), 0x20);
+}
+
+
+} // end anon namespace
diff --git a/unittests/Support/FileOutputBufferTest.cpp b/unittests/Support/FileOutputBufferTest.cpp
index 911d516..c7e1006 100644
--- a/unittests/Support/FileOutputBufferTest.cpp
+++ b/unittests/Support/FileOutputBufferTest.cpp
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Errc.h"
-#include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
@@ -80,14 +80,13 @@ TEST(FileOutputBuffer, Test) {
     memcpy(Buffer->getBufferStart(), "AABBCCDDEEFFGGHHIIJJ", 20);
     // Write to end of buffer to verify it is writable.
     memcpy(Buffer->getBufferEnd() - 20, "AABBCCDDEEFFGGHHIIJJ", 20);
-    // Commit buffer, but size down to smaller size
-    ASSERT_NO_ERROR(Buffer->commit(5000));
+    ASSERT_NO_ERROR(Buffer->commit());
   }
 
   // Verify file is correct size.
   uint64_t File3Size;
   ASSERT_NO_ERROR(fs::file_size(Twine(File3), File3Size));
-  ASSERT_EQ(File3Size, 5000ULL);
+  ASSERT_EQ(File3Size, 8192000ULL);
   ASSERT_NO_ERROR(fs::remove(File3.str()));
 
   // TEST 4: Verify file can be made executable.
diff --git a/unittests/Support/MathExtrasTest.cpp b/unittests/Support/MathExtrasTest.cpp
index 93a38cb..5c95b50 100644
--- a/unittests/Support/MathExtrasTest.cpp
+++ b/unittests/Support/MathExtrasTest.cpp
@@ -141,21 +141,18 @@ TEST(MathExtras, ByteSwap_64) {
   EXPECT_EQ(0x1100FFEEDDCCBBAAULL, ByteSwap_64(0xAABBCCDDEEFF0011LL));
 }
 
-TEST(MathExtras, CountLeadingOnes_32) {
+TEST(MathExtras, countLeadingOnes) {
   for (int i = 30; i >= 0; --i) {
     // Start with all ones and unset some bit.
-    EXPECT_EQ(31u - i, CountLeadingOnes_32(0xFFFFFFFF ^ (1 << i)));
+    EXPECT_EQ(31u - i, countLeadingOnes(0xFFFFFFFF ^ (1 << i)));
   }
-}
-
-TEST(MathExtras, CountLeadingOnes_64) {
   for (int i = 62; i >= 0; --i) {
     // Start with all ones and unset some bit.
-    EXPECT_EQ(63u - i, CountLeadingOnes_64(0xFFFFFFFFFFFFFFFFLL ^ (1LL << i)));
+    EXPECT_EQ(63u - i, countLeadingOnes(0xFFFFFFFFFFFFFFFFULL ^ (1LL << i)));
   }
   for (int i = 30; i >= 0; --i) {
     // Start with all ones and unset some bit.
-    EXPECT_EQ(31u - i, CountLeadingOnes_32(0xFFFFFFFF ^ (1 << i)));
+    EXPECT_EQ(31u - i, countLeadingOnes(0xFFFFFFFF ^ (1 << i)));
   }
 }
 
diff --git a/unittests/Support/MemoryTest.cpp b/unittests/Support/MemoryTest.cpp
index 8ad90e0..f439cb2 100644
--- a/unittests/Support/MemoryTest.cpp
+++ b/unittests/Support/MemoryTest.cpp
@@ -21,7 +21,7 @@ class MappedMemoryTest : public ::testing::TestWithParam<unsigned> {
 public:
   MappedMemoryTest() {
     Flags = GetParam();
-    PageSize = sys::process::get_self()->page_size();
+    PageSize = sys::Process::getPageSize();
   }
 
 protected:
diff --git a/unittests/Support/Path.cpp b/unittests/Support/Path.cpp
index 502cda2..00af989 100644
--- a/unittests/Support/Path.cpp
+++ b/unittests/Support/Path.cpp
@@ -557,6 +557,7 @@ const char macho_dynamically_linked_shared_lib[] =
 const char macho_dynamic_linker[] = "\xfe\xed\xfa\xce..........\x00\x07";
 const char macho_bundle[] = "\xfe\xed\xfa\xce..........\x00\x08";
 const char macho_dsym_companion[] = "\xfe\xed\xfa\xce..........\x00\x0a";
+const char macho_kext_bundle[] = "\xfe\xed\xfa\xce..........\x00\x0b";
 const char windows_resource[] = "\x00\x00\x00\x00\x020\x00\x00\x00\xff";
 const char macho_dynamically_linked_shared_lib_stub[] =
     "\xfe\xed\xfa\xce..........\x00\x09";
@@ -587,6 +588,7 @@ TEST_F(FileSystemTest, Magic) {
     DEFINE(macho_bundle),
     DEFINE(macho_dynamically_linked_shared_lib_stub),
     DEFINE(macho_dsym_companion),
+    DEFINE(macho_kext_bundle),
     DEFINE(windows_resource)
 #undef DEFINE
     };
@@ -638,22 +640,31 @@ TEST_F(FileSystemTest, CarriageReturn) {
 }
 #endif
 
+TEST_F(FileSystemTest, Resize) {
+  int FD;
+  SmallString<64> TempPath;
+  ASSERT_NO_ERROR(fs::createTemporaryFile("prefix", "temp", FD, TempPath));
+  ASSERT_NO_ERROR(fs::resize_file(FD, 123));
+  fs::file_status Status;
+  ASSERT_NO_ERROR(fs::status(FD, Status));
+  ASSERT_EQ(Status.getSize(), 123U);
+}
+
 TEST_F(FileSystemTest, FileMapping) {
   // Create a temp file.
   int FileDescriptor;
   SmallString<64> TempPath;
   ASSERT_NO_ERROR(
       fs::createTemporaryFile("prefix", "temp", FileDescriptor, TempPath));
+  unsigned Size = 4096;
+  ASSERT_NO_ERROR(fs::resize_file(FileDescriptor, Size));
+
   // Map in temp file and add some content
   std::error_code EC;
   StringRef Val("hello there");
   {
     fs::mapped_file_region mfr(FileDescriptor,
-                               true,
-                               fs::mapped_file_region::readwrite,
-                               4096,
-                               0,
-                               EC);
+                               fs::mapped_file_region::readwrite, Size, 0, EC);
     ASSERT_NO_ERROR(EC);
     std::copy(Val.begin(), Val.end(), mfr.data());
     // Explicitly add a 0.
@@ -662,27 +673,19 @@ TEST_F(FileSystemTest, FileMapping) {
   }
 
   // Map it back in read-only
-  fs::mapped_file_region mfr(Twine(TempPath),
-                             fs::mapped_file_region::readonly,
-                             0,
-                             0,
-                             EC);
+  int FD;
+  EC = fs::openFileForRead(Twine(TempPath), FD);
+  ASSERT_NO_ERROR(EC);
+  fs::mapped_file_region mfr(FD, fs::mapped_file_region::readonly, Size, 0, EC);
   ASSERT_NO_ERROR(EC);
 
   // Verify content
   EXPECT_EQ(StringRef(mfr.const_data()), Val);
 
   // Unmap temp file
-
-  fs::mapped_file_region m(Twine(TempPath),
-                             fs::mapped_file_region::readonly,
-                             0,
-                             0,
-                             EC);
+  fs::mapped_file_region m(FD, fs::mapped_file_region::readonly, Size, 0, EC);
   ASSERT_NO_ERROR(EC);
-  const char *Data = m.const_data();
-  fs::mapped_file_region mfrrv(std::move(m));
-  EXPECT_EQ(mfrrv.const_data(), Data);
+  ASSERT_EQ(close(FD), 0);
 }
 
 TEST(Support, NormalizePath) {
diff --git a/unittests/Support/ProcessTest.cpp b/unittests/Support/ProcessTest.cpp
index 3045c30..298a0a3 100644
--- a/unittests/Support/ProcessTest.cpp
+++ b/unittests/Support/ProcessTest.cpp
@@ -19,26 +19,6 @@ namespace {
 using namespace llvm;
 using namespace sys;
 
-TEST(ProcessTest, SelfProcess) {
-  EXPECT_TRUE(process::get_self());
-  EXPECT_EQ(process::get_self(), process::get_self());
-
-#if defined(LLVM_ON_UNIX)
-  EXPECT_EQ(getpid(), process::get_self()->get_id());
-#elif defined(LLVM_ON_WIN32)
-  EXPECT_EQ(GetCurrentProcessId(), process::get_self()->get_id());
-#endif
-
-  EXPECT_LT(1u, process::get_self()->page_size());
-
-  EXPECT_LT(TimeValue::MinTime(), process::get_self()->get_user_time());
-  EXPECT_GT(TimeValue::MaxTime(), process::get_self()->get_user_time());
-  EXPECT_LT(TimeValue::MinTime(), process::get_self()->get_system_time());
-  EXPECT_GT(TimeValue::MaxTime(), process::get_self()->get_system_time());
-  EXPECT_LT(TimeValue::MinTime(), process::get_self()->get_wall_time());
-  EXPECT_GT(TimeValue::MaxTime(), process::get_self()->get_wall_time());
-}
-
 TEST(ProcessTest, GetRandomNumberTest) {
   const unsigned r1 = Process::GetRandomNumber();
   const unsigned r2 = Process::GetRandomNumber();
diff --git a/unittests/Support/ProgramTest.cpp b/unittests/Support/ProgramTest.cpp
index c0e6e80..0feed47 100644
--- a/unittests/Support/ProgramTest.cpp
+++ b/unittests/Support/ProgramTest.cpp
@@ -70,6 +70,56 @@ static void CopyEnvironment(std::vector<const char *> &out) {
   }
 }
 
+#ifdef LLVM_ON_WIN32
+TEST(ProgramTest, CreateProcessLongPath) {
+  if (getenv("LLVM_PROGRAM_TEST_LONG_PATH"))
+    exit(0);
+
+  // getMainExecutable returns an absolute path; prepend the long-path prefix.
+  std::string MyAbsExe =
+      sys::fs::getMainExecutable(TestMainArgv0, &ProgramTestStringArg1);
+  std::string MyExe;
+  if (!StringRef(MyAbsExe).startswith("\\\\?\\"))
+    MyExe.append("\\\\?\\");
+  MyExe.append(MyAbsExe);
+
+  const char *ArgV[] = {
+    MyExe.c_str(),
+    "--gtest_filter=ProgramTest.CreateProcessLongPath",
+    nullptr
+  };
+
+  // Add LLVM_PROGRAM_TEST_LONG_PATH to the environment of the child.
+  std::vector<const char *> EnvP;
+  CopyEnvironment(EnvP);
+  EnvP.push_back("LLVM_PROGRAM_TEST_LONG_PATH=1");
+  EnvP.push_back(nullptr);
+
+  // Redirect stdout to a long path.
+  SmallString<128> TestDirectory;
+  ASSERT_NO_ERROR(
+    fs::createUniqueDirectory("program-redirect-test", TestDirectory));
+  SmallString<256> LongPath(TestDirectory);
+  LongPath.push_back('\\');
+  // MAX_PATH = 260
+  LongPath.append(260 - TestDirectory.size(), 'a');
+  StringRef LongPathRef(LongPath);
+
+  std::string Error;
+  bool ExecutionFailed;
+  const StringRef *Redirects[] = { nullptr, &LongPathRef, nullptr };
+  int RC = ExecuteAndWait(MyExe, ArgV, &EnvP[0], Redirects,
+    /*secondsToWait=*/ 10, /*memoryLimit=*/ 0, &Error,
+    &ExecutionFailed);
+  EXPECT_FALSE(ExecutionFailed) << Error;
+  EXPECT_EQ(0, RC);
+
+  // Remove the long stdout.
+  ASSERT_NO_ERROR(fs::remove(Twine(LongPath)));
+  ASSERT_NO_ERROR(fs::remove(Twine(TestDirectory)));
+}
+#endif
+
 TEST(ProgramTest, CreateProcessTrailingSlash) {
   if (getenv("LLVM_PROGRAM_TEST_CHILD")) {
     if (ProgramTestStringArg1 == "has\\\\ trailing\\" &&
diff --git a/unittests/Support/ScaledNumberTest.cpp b/unittests/Support/ScaledNumberTest.cpp
index 7bbef7e..d8d6e31 100644
--- a/unittests/Support/ScaledNumberTest.cpp
+++ b/unittests/Support/ScaledNumberTest.cpp
@@ -8,7 +8,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/ScaledNumber.h"
-
 #include "llvm/Support/DataTypes.h"
 #include "gtest/gtest.h"
 
diff --git a/unittests/Support/SpecialCaseListTest.cpp b/unittests/Support/SpecialCaseListTest.cpp
index 740dbfe..0657f80 100644
--- a/unittests/Support/SpecialCaseListTest.cpp
+++ b/unittests/Support/SpecialCaseListTest.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SpecialCaseList.h"
 #include "gtest/gtest.h"
@@ -30,6 +31,16 @@ protected:
     assert(Error == "");
     return SCL;
   }
+
+  std::string makeSpecialCaseListFile(StringRef Contents) {
+    int FD;
+    SmallString<64> Path;
+    sys::fs::createTemporaryFile("SpecialCaseListTest", "temp", FD, Path);
+    raw_fd_ostream OF(FD, true, true);
+    OF << Contents;
+    OF.close();
+    return Path.str();
+  }
 };
 
 TEST_F(SpecialCaseListTest, Basic) {
@@ -86,17 +97,18 @@ TEST_F(SpecialCaseListTest, Substring) {
 TEST_F(SpecialCaseListTest, InvalidSpecialCaseList) {
   std::string Error;
   EXPECT_EQ(nullptr, makeSpecialCaseList("badline", Error));
-  EXPECT_EQ("Malformed line 1: 'badline'", Error);
+  EXPECT_EQ("malformed line 1: 'badline'", Error);
   EXPECT_EQ(nullptr, makeSpecialCaseList("src:bad[a-", Error));
-  EXPECT_EQ("Malformed regex in line 1: 'bad[a-': invalid character range",
+  EXPECT_EQ("malformed regex in line 1: 'bad[a-': invalid character range",
             Error);
   EXPECT_EQ(nullptr, makeSpecialCaseList("src:a.c\n"
                                    "fun:fun(a\n",
                                    Error));
-  EXPECT_EQ("Malformed regex in line 2: 'fun(a': parentheses not balanced",
+  EXPECT_EQ("malformed regex in line 2: 'fun(a': parentheses not balanced",
             Error);
-  EXPECT_EQ(nullptr, SpecialCaseList::create("unexisting", Error));
-  EXPECT_EQ(0U, Error.find("Can't open file 'unexisting':"));
+  std::vector<std::string> Files(1, "unexisting");
+  EXPECT_EQ(nullptr, SpecialCaseList::create(Files, Error));
+  EXPECT_EQ(0U, Error.find("can't open file 'unexisting':"));
 }
 
 TEST_F(SpecialCaseListTest, EmptySpecialCaseList) {
@@ -104,6 +116,20 @@ TEST_F(SpecialCaseListTest, EmptySpecialCaseList) {
   EXPECT_FALSE(SCL->inSection("foo", "bar"));
 }
 
+TEST_F(SpecialCaseListTest, MultipleBlacklists) {
+  std::vector<std::string> Files;
+  Files.push_back(makeSpecialCaseListFile("src:bar\n"
+                                          "src:*foo*\n"
+                                          "src:ban=init\n"));
+  Files.push_back(makeSpecialCaseListFile("src:baz\n"
+                                          "src:*fog*\n"));
+  auto SCL = SpecialCaseList::createOrDie(Files);
+  EXPECT_TRUE(SCL->inSection("src", "bar"));
+  EXPECT_TRUE(SCL->inSection("src", "baz"));
+  EXPECT_FALSE(SCL->inSection("src", "ban"));
+  EXPECT_TRUE(SCL->inSection("src", "ban", "init"));
+  EXPECT_TRUE(SCL->inSection("src", "tomfoolery"));
+  EXPECT_TRUE(SCL->inSection("src", "tomfoglery"));
 }
 
-
+}
diff --git a/unittests/Support/StreamingMemoryObject.cpp b/unittests/Support/StreamingMemoryObject.cpp
new file mode 100644
index 0000000..2013649
--- /dev/null
+++ b/unittests/Support/StreamingMemoryObject.cpp
@@ -0,0 +1,29 @@
+//===- llvm/unittest/Support/StreamingMemoryObject.cpp - unit tests -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/StreamingMemoryObject.h"
+#include "gtest/gtest.h"
+#include <string.h>
+
+using namespace llvm;
+
+namespace {
+class NullDataStreamer : public DataStreamer {
+  size_t GetBytes(unsigned char *buf, size_t len) override {
+    memset(buf, 0, len);
+    return len;
+  }
+};
+}
+
+TEST(StreamingMemoryObject, Test) {
+  auto *DS = new NullDataStreamer();
+  StreamingMemoryObject O(DS);
+  EXPECT_TRUE(O.isValidAddress(32 * 1024));
+}
diff --git a/unittests/Support/StringPool.cpp b/unittests/Support/StringPool.cpp
index 7b7805f..ac39fec 100644
--- a/unittests/Support/StringPool.cpp
+++ b/unittests/Support/StringPool.cpp
@@ -1,6 +1,6 @@
-//===- llvm/unittest/Support/ThreadLocalTest.cpp - Therad Local tests   ---===//
+//===- llvm/unittest/Support/StringPoiil.cpp - StringPool tests -----------===//
 //
-//		       The LLVM Compiler Infrastructure
+//                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
diff --git a/unittests/Support/ThreadLocalTest.cpp b/unittests/Support/ThreadLocalTest.cpp
index ea751be..e71c7db 100644
--- a/unittests/Support/ThreadLocalTest.cpp
+++ b/unittests/Support/ThreadLocalTest.cpp
@@ -1,6 +1,6 @@
-//===- llvm/unittest/Support/ThreadLocalTest.cpp - Therad Local tests   ---===//
+//===- llvm/unittest/Support/ThreadLocalTest.cpp - ThreadLocal tests ------===//
 //
-//		       The LLVM Compiler Infrastructure
+//                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
@@ -9,6 +9,7 @@
 
 #include "llvm/Support/ThreadLocal.h"
 #include "gtest/gtest.h"
+#include <type_traits>
 
 using namespace llvm;
 using namespace sys;
@@ -25,6 +26,10 @@ struct S {
 TEST_F(ThreadLocalTest, Basics) {
   ThreadLocal<const S> x;
 
+  static_assert(
+      std::is_const<std::remove_pointer<decltype(x.get())>::type>::value,
+      "ThreadLocal::get didn't return a pointer to const object");
+
   EXPECT_EQ(nullptr, x.get());
 
   S s;
@@ -33,6 +38,20 @@ TEST_F(ThreadLocalTest, Basics) {
 
   x.erase();
   EXPECT_EQ(nullptr, x.get());
+
+  ThreadLocal<S> y;
+
+  static_assert(
+      !std::is_const<std::remove_pointer<decltype(y.get())>::type>::value,
+      "ThreadLocal::get returned a pointer to const object");
+
+  EXPECT_EQ(nullptr, y.get());
+
+  y.set(&s);
+  EXPECT_EQ(&s, y.get());
+
+  y.erase();
+  EXPECT_EQ(nullptr, y.get());
 }
 
 }
diff --git a/unittests/Support/raw_ostream_test.cpp b/unittests/Support/raw_ostream_test.cpp
index 39cfaf0..ff98602 100644
--- a/unittests/Support/raw_ostream_test.cpp
+++ b/unittests/Support/raw_ostream_test.cpp
@@ -162,6 +162,8 @@ TEST(raw_ostreamTest, FormatHex) {
   EXPECT_EQ("0x1",        printToString(format_hex(1, 3), 3));
   EXPECT_EQ("0x12",       printToString(format_hex(0x12, 3), 4));
   EXPECT_EQ("0x123",      printToString(format_hex(0x123, 3), 5));
+  EXPECT_EQ("FF",         printToString(format_hex_no_prefix(0xFF, 2, true), 4));
+  EXPECT_EQ("ABCD",       printToString(format_hex_no_prefix(0xABCD, 2, true), 4));
   EXPECT_EQ("0xffffffffffffffff",     
                           printToString(format_hex(UINT64_MAX, 18), 18));
   EXPECT_EQ("0x8000000000000000",     
diff --git a/unittests/Transforms/CMakeLists.txt b/unittests/Transforms/CMakeLists.txt
index 8ec56f1..5d3b29c 100644
--- a/unittests/Transforms/CMakeLists.txt
+++ b/unittests/Transforms/CMakeLists.txt
@@ -1,2 +1,2 @@
-add_subdirectory(DebugIR)
+add_subdirectory(IPO)
 add_subdirectory(Utils)
diff --git a/unittests/Transforms/DebugIR/CMakeLists.txt b/unittests/Transforms/DebugIR/CMakeLists.txt
deleted file mode 100644
index 88734d2..0000000
--- a/unittests/Transforms/DebugIR/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-set(LLVM_LINK_COMPONENTS
-  Core
-  Instrumentation
-  Support
-  )
-
-add_llvm_unittest(DebugIRTests
-  DebugIR.cpp
-  )
diff --git a/unittests/Transforms/DebugIR/DebugIR.cpp b/unittests/Transforms/DebugIR/DebugIR.cpp
deleted file mode 100644
index 41df147..0000000
--- a/unittests/Transforms/DebugIR/DebugIR.cpp
+++ /dev/null
@@ -1,308 +0,0 @@
-//===- DebugIR.cpp - Unit tests for the DebugIR pass ----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The tests in this file verify the DebugIR pass that generates debug metadata
-// for LLVM IR.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/Triple.h"
-#include "../lib/Transforms/Instrumentation/DebugIR.h"
-#include "llvm/Config/config.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Transforms/Instrumentation.h"
-
-// These tests do not depend on MCJIT, but we use the TrivialModuleBuilder
-// helper class to construct some trivial Modules.
-#include "../unittests/ExecutionEngine/MCJIT/MCJITTestBase.h"
-
-#include <string>
-
-#include "gtest/gtest.h"
-
-#if defined(LLVM_ON_WIN32)
-#include <direct.h>
-#define getcwd_impl _getcwd
-#elif defined (HAVE_GETCWD)
-#include <unistd.h>
-#define getcwd_impl getcwd
-#endif // LLVM_ON_WIN32
-
-using namespace llvm;
-using namespace std;
-
-namespace {
-
-/// Insert a mock CUDescriptor with the specified producer
-void insertCUDescriptor(Module *M, StringRef File, StringRef Dir,
-                        StringRef Producer) {
-  DIBuilder B(*M);
-  B.createCompileUnit(dwarf::DW_LANG_C99, File, Dir, Producer, false, "", 0);
-  B.finalize();
-}
-
-/// Attempts to remove file at Path and returns true if it existed, or false if
-/// it did not.
-bool removeIfExists(StringRef Path) {
-  // This is an approximation, on error we don't know in general if the file
-  // existed or not.
-  std::error_code EC = sys::fs::remove(Path, false);
-  return EC != llvm::errc::no_such_file_or_directory;
-}
-
-char * current_dir() {
-#if defined(LLVM_ON_WIN32) || defined(HAVE_GETCWD)
-  // calling getcwd (or _getcwd() on windows) with a null buffer makes it
-  // allocate a sufficiently sized buffer to store the current working dir.
-  return getcwd_impl(nullptr, 0);
-#else
-  return 0;
-#endif
-}
-
-class TestDebugIR : public ::testing::Test, public TrivialModuleBuilder {
-protected:
-  TestDebugIR()
-      : TrivialModuleBuilder(sys::getProcessTriple())
-      , cwd(current_dir()) {}
-
-  ~TestDebugIR() { free(cwd); }
-
-  /// Returns a concatenated path string consisting of Dir and Filename
-  string getPath(const string &Dir, const string &Filename) {
-    SmallVector<char, 8> Path;
-    sys::path::append(Path, Dir, Filename);
-    Path.resize(Dir.size() + Filename.size() + 2);
-    Path[Dir.size() + Filename.size() + 1] = '\0';
-    return string(Path.data());
-  }
-
-  LLVMContext Context;
-  char *cwd;
-  std::unique_ptr<Module> M;
-  std::unique_ptr<DebugIR> D;
-};
-
-// Test empty named Module that is not supposed to be output to disk.
-TEST_F(TestDebugIR, EmptyNamedModuleNoWrite) {
-  string Dir = "MadeUpDirectory";
-  string File = "empty_module.ll";
-  string Path(getPath(Dir, File));
-
-  M.reset(createEmptyModule(Path));
-
-  // constructing DebugIR with no args should not result in any file generated.
-  D.reset(static_cast<DebugIR *>(llvm::createDebugIRPass()));
-  D->runOnModule(*M);
-
-  // verify DebugIR did not generate a file
-  ASSERT_FALSE(removeIfExists(Path)) << "Unexpected file " << Path;
-}
-
-// Test a non-empty unnamed module that is output to an autogenerated file name.
-TEST_F(TestDebugIR, NonEmptyUnnamedModuleWriteToAutogeneratedFile) {
-  M.reset(createEmptyModule());
-  insertAddFunction(M.get());
-  D.reset(static_cast<DebugIR *>(llvm::createDebugIRPass(true, true)));
-
-  string Path;
-  D->runOnModule(*M, Path);
-
-  // verify DebugIR generated a file, and clean it up
-  ASSERT_TRUE(removeIfExists(Path)) << "Missing expected file at " << Path;
-}
-
-// Test not specifying a name in the module -- DebugIR should generate a name
-// and write the file contents.
-TEST_F(TestDebugIR, EmptyModuleWriteAnonymousFile) {
-  M.reset(createEmptyModule());
-  D.reset(static_cast<DebugIR *>(llvm::createDebugIRPass(false, false)));
-
-  string Path;
-  D->runOnModule(*M, Path);
-
-  // verify DebugIR generated a file and clean it up
-  ASSERT_TRUE(removeIfExists(Path)) << "Missing expected file at " << Path;
-}
-
-#ifdef HAVE_GETCWD // These tests require get_current_dir_name()
-
-// Test empty named Module that is to be output to path specified at Module
-// construction.
-TEST_F(TestDebugIR, EmptyNamedModuleWriteFile) {
-  string Filename("NamedFile1");
-  string ExpectedPath(getPath(cwd, Filename));
-
-  M.reset(createEmptyModule(ExpectedPath));
-  D.reset(static_cast<DebugIR *>(llvm::createDebugIRPass(true, true)));
-
-  string Path;
-  D->runOnModule(*M, Path);
-
-  // verify DebugIR was able to correctly parse the file name from module ID
-  ASSERT_EQ(ExpectedPath, Path);
-
-  // verify DebugIR generated a file, and clean it up
-  ASSERT_TRUE(removeIfExists(Path)) << "Missing expected file at " << Path;
-}
-
-// Test an empty unnamed module generates an output file whose path is specified
-// at DebugIR construction.
-TEST_F(TestDebugIR, EmptyUnnamedModuleWriteNamedFile) {
-  string Filename("NamedFile2");
-
-  M.reset(createEmptyModule());
-  D.reset(static_cast<DebugIR *>(llvm::createDebugIRPass(
-      false, false, StringRef(cwd), StringRef(Filename))));
-  string Path;
-  D->runOnModule(*M, Path);
-
-  string ExpectedPath(getPath(cwd, Filename));
-  ASSERT_EQ(ExpectedPath, Path);
-
-  // verify DebugIR generated a file, and clean it up
-  ASSERT_TRUE(removeIfExists(Path)) << "Missing expected file at " << Path;
-}
-
-// Test an empty named module generates an output file at the path specified
-// during DebugIR construction.
-TEST_F(TestDebugIR, EmptyNamedModuleWriteNamedFile) {
-  string Filename("NamedFile3");
-
-  string UnexpectedPath(getPath(cwd, "UnexpectedFilename"));
-  M.reset(createEmptyModule(UnexpectedPath));
-
-  D.reset(static_cast<DebugIR *>(llvm::createDebugIRPass(
-      false, false, StringRef(cwd), StringRef(Filename))));
-  string Path;
-  D->runOnModule(*M, Path);
-
-  string ExpectedPath(getPath(cwd, Filename));
-  ASSERT_EQ(ExpectedPath, Path);
-
-  // verify DebugIR generated a file, and clean it up
-  ASSERT_TRUE(removeIfExists(Path)) << "Missing expected file at " << Path;
-
-  // verify DebugIR did not generate a file at the path specified at Module
-  // construction.
-  ASSERT_FALSE(removeIfExists(UnexpectedPath)) << "Unexpected file " << Path;
-}
-
-// Test a non-empty named module that is not supposed to be output to disk
-TEST_F(TestDebugIR, NonEmptyNamedModuleNoWrite) {
-  string Filename("NamedFile4");
-  string ExpectedPath(getPath(cwd, Filename));
-
-  M.reset(createEmptyModule(ExpectedPath));
-  insertAddFunction(M.get());
-
-  D.reset(static_cast<DebugIR *>(llvm::createDebugIRPass()));
-
-  string Path;
-  D->runOnModule(*M, Path);
-  ASSERT_EQ(ExpectedPath, Path);
-
-  // verify DebugIR did not generate a file
-  ASSERT_FALSE(removeIfExists(Path)) << "Unexpected file " << Path;
-}
-
-// Test a non-empty named module that is output to disk.
-TEST_F(TestDebugIR, NonEmptyNamedModuleWriteFile) {
-  string Filename("NamedFile5");
-  string ExpectedPath(getPath(cwd, Filename));
-
-  M.reset(createEmptyModule(ExpectedPath));
-  insertAddFunction(M.get());
-
-  D.reset(static_cast<DebugIR *>(llvm::createDebugIRPass(true, true)));
-
-  string Path;
-  D->runOnModule(*M, Path);
-  ASSERT_EQ(ExpectedPath, Path);
-
-  // verify DebugIR generated a file, and clean it up
-  ASSERT_TRUE(removeIfExists(Path)) << "Missing expected file at " << Path;
-}
-
-// Test a non-empty unnamed module is output to a path specified at DebugIR
-// construction.
-TEST_F(TestDebugIR, NonEmptyUnnamedModuleWriteToNamedFile) {
-  string Filename("NamedFile6");
-
-  M.reset(createEmptyModule());
-  insertAddFunction(M.get());
-
-  D.reset(static_cast<DebugIR *>(
-      llvm::createDebugIRPass(true, true, cwd, Filename)));
-  string Path;
-  D->runOnModule(*M, Path);
-
-  string ExpectedPath(getPath(cwd, Filename));
-  ASSERT_EQ(ExpectedPath, Path);
-
-  // verify DebugIR generated a file, and clean it up
-  ASSERT_TRUE(removeIfExists(Path)) << "Missing expected file at " << Path;
-}
-
-// Test that information inside existing debug metadata is retained
-TEST_F(TestDebugIR, ExistingMetadataRetained) {
-  string Filename("NamedFile7");
-  string ExpectedPath(getPath(cwd, Filename));
-
-  M.reset(createEmptyModule(ExpectedPath));
-  insertAddFunction(M.get());
-
-  StringRef Producer("TestProducer");
-  insertCUDescriptor(M.get(), Filename, cwd, Producer);
-
-  DebugInfoFinder Finder;
-  Finder.processModule(*M);
-  ASSERT_EQ((unsigned)1, Finder.compile_unit_count());
-  D.reset(static_cast<DebugIR *>(llvm::createDebugIRPass()));
-
-  string Path;
-  D->runOnModule(*M, Path);
-  ASSERT_EQ(ExpectedPath, Path);
-
-  // verify DebugIR did not generate a file
-  ASSERT_FALSE(removeIfExists(Path)) << "Unexpected file " << Path;
-
-  DICompileUnit CU(*Finder.compile_units().begin());
-
-  // Verify original CU information is retained
-  ASSERT_EQ(Filename, CU.getFilename());
-  ASSERT_EQ(cwd, CU.getDirectory());
-  ASSERT_EQ(Producer, CU.getProducer());
-}
-
-#endif // HAVE_GETCWD
-
-#ifdef GTEST_HAS_DEATH_TEST
-
-// Test a non-empty unnamed module that is not supposed to be output to disk
-// NOTE: this test is expected to die with LLVM_ERROR, and such depends on
-// google test's "death test" mode.
-TEST_F(TestDebugIR, NonEmptyUnnamedModuleNoWrite) {
-  M.reset(createEmptyModule(StringRef()));
-  insertAddFunction(M.get());
-  D.reset(static_cast<DebugIR *>(llvm::createDebugIRPass()));
-
-  // No name in module or on DebugIR construction ==> DebugIR should assert
-  EXPECT_DEATH(D->runOnModule(*M),
-               "DebugIR unable to determine file name in input.");
-}
-
-#endif // GTEST_HAS_DEATH_TEST
-}
diff --git a/unittests/Transforms/DebugIR/Makefile b/unittests/Transforms/DebugIR/Makefile
deleted file mode 100644
index 9ace8c3..0000000
--- a/unittests/Transforms/DebugIR/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- unittests/Transforms/Utils/Makefile -----------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-TESTNAME = DebugIR
-LINK_COMPONENTS := Instrumentation
-
-include $(LEVEL)/Makefile.config
-include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/Transforms/IPO/CMakeLists.txt b/unittests/Transforms/IPO/CMakeLists.txt
new file mode 100644
index 0000000..58b71b2
--- /dev/null
+++ b/unittests/Transforms/IPO/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  Support
+  IPO
+  )
+
+add_llvm_unittest(IPOTests
+  LowerBitSets.cpp
+  )
diff --git a/unittests/Transforms/IPO/LowerBitSets.cpp b/unittests/Transforms/IPO/LowerBitSets.cpp
new file mode 100644
index 0000000..26a4252
--- /dev/null
+++ b/unittests/Transforms/IPO/LowerBitSets.cpp
@@ -0,0 +1,95 @@
+//===- LowerBitSets.cpp - Unit tests for bitset lowering ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/LowerBitSets.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+TEST(LowerBitSets, BitSetBuilder) {
+  struct {
+    std::vector<uint64_t> Offsets;
+    std::vector<uint8_t> Bits;
+    uint64_t ByteOffset;
+    uint64_t BitSize;
+    unsigned AlignLog2;
+    bool IsSingleOffset;
+    bool IsAllOnes;
+  } BSBTests[] = {
+      {{}, {0}, 0, 1, 0, false, false},
+      {{0}, {1}, 0, 1, 0, true, true},
+      {{4}, {1}, 4, 1, 0, true, true},
+      {{37}, {1}, 37, 1, 0, true, true},
+      {{0, 1}, {3}, 0, 2, 0, false, true},
+      {{0, 4}, {3}, 0, 2, 2, false, true},
+      {{0, uint64_t(1) << 33}, {3}, 0, 2, 33, false, true},
+      {{3, 7}, {3}, 3, 2, 2, false, true},
+      {{0, 1, 7}, {131}, 0, 8, 0, false, false},
+      {{0, 2, 14}, {131}, 0, 8, 1, false, false},
+      {{0, 1, 8}, {3, 1}, 0, 9, 0, false, false},
+      {{0, 2, 16}, {3, 1}, 0, 9, 1, false, false},
+      {{0, 1, 2, 3, 4, 5, 6, 7}, {255}, 0, 8, 0, false, true},
+      {{0, 1, 2, 3, 4, 5, 6, 7, 8}, {255, 1}, 0, 9, 0, false, true},
+  };
+
+  for (auto &&T : BSBTests) {
+    BitSetBuilder BSB;
+    for (auto Offset : T.Offsets)
+      BSB.addOffset(Offset);
+
+    BitSetInfo BSI = BSB.build();
+
+    EXPECT_EQ(T.Bits, BSI.Bits);
+    EXPECT_EQ(T.ByteOffset, BSI.ByteOffset);
+    EXPECT_EQ(T.BitSize, BSI.BitSize);
+    EXPECT_EQ(T.AlignLog2, BSI.AlignLog2);
+    EXPECT_EQ(T.IsSingleOffset, BSI.isSingleOffset());
+    EXPECT_EQ(T.IsAllOnes, BSI.isAllOnes());
+
+    for (auto Offset : T.Offsets)
+      EXPECT_TRUE(BSI.containsGlobalOffset(Offset));
+
+    auto I = T.Offsets.begin();
+    for (uint64_t NonOffset = 0; NonOffset != 256; ++NonOffset) {
+      if (I != T.Offsets.end() && *I == NonOffset) {
+        ++I;
+        continue;
+      }
+
+      EXPECT_FALSE(BSI.containsGlobalOffset(NonOffset));
+    }
+  }
+}
+
+TEST(LowerBitSets, GlobalLayoutBuilder) {
+  struct {
+    uint64_t NumObjects;
+    std::vector<std::set<uint64_t>> Fragments;
+    std::vector<uint64_t> WantLayout;
+  } GLBTests[] = {
+    {0, {}, {}},
+    {4, {{0, 1}, {2, 3}}, {0, 1, 2, 3}},
+    {3, {{0, 1}, {1, 2}}, {0, 1, 2}},
+    {4, {{0, 1}, {1, 2}, {2, 3}}, {0, 1, 2, 3}},
+    {4, {{0, 1}, {2, 3}, {1, 2}}, {0, 1, 2, 3}},
+    {6, {{2, 5}, {0, 1, 2, 3, 4, 5}}, {0, 1, 2, 5, 3, 4}},
+  };
+
+  for (auto &&T : GLBTests) {
+    GlobalLayoutBuilder GLB(T.NumObjects);
+    for (auto &&F : T.Fragments)
+      GLB.addFragment(F);
+
+    std::vector<uint64_t> ComputedLayout;
+    for (auto &&F : GLB.Fragments)
+      ComputedLayout.insert(ComputedLayout.end(), F.begin(), F.end());
+
+    EXPECT_EQ(T.WantLayout, ComputedLayout);
+  }
+}
diff --git a/unittests/Transforms/IPO/Makefile b/unittests/Transforms/IPO/Makefile
new file mode 100644
index 0000000..f807879
--- /dev/null
+++ b/unittests/Transforms/IPO/Makefile
@@ -0,0 +1,15 @@
+##===- unittests/Transforms/IPO/Makefile -------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+TESTNAME = IPO
+LINK_COMPONENTS := IPO
+
+include $(LEVEL)/Makefile.config
+include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/Transforms/Makefile b/unittests/Transforms/Makefile
index d5cca39..3a2cdfc 100644
--- a/unittests/Transforms/Makefile
+++ b/unittests/Transforms/Makefile
@@ -9,7 +9,7 @@
 
 LEVEL = ../..
 
-PARALLEL_DIRS = DebugIR Utils
+PARALLEL_DIRS = IPO Utils
 
 include $(LEVEL)/Makefile.common
 
diff --git a/unittests/Transforms/Utils/Cloning.cpp b/unittests/Transforms/Utils/Cloning.cpp
index c779979..1d22d5b 100644
--- a/unittests/Transforms/Utils/Cloning.cpp
+++ b/unittests/Transforms/Utils/Cloning.cpp
@@ -13,16 +13,15 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/Constant.h"
-#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Module.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index 59affa1..146b512 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -73,6 +73,7 @@ namespace Check {
     CheckNone = 0,
     CheckPlain,
     CheckNext,
+    CheckSame,
     CheckNot,
     CheckDAG,
     CheckLabel,
@@ -620,6 +621,9 @@ struct CheckString {
   /// CheckNext - Verify there is a single line in the given buffer.
   bool CheckNext(const SourceMgr &SM, StringRef Buffer) const;
 
+  /// CheckSame - Verify there is no newline in the given buffer.
+  bool CheckSame(const SourceMgr &SM, StringRef Buffer) const;
+
   /// CheckNot - Verify there's no "not strings" in the given buffer.
   bool CheckNot(const SourceMgr &SM, StringRef Buffer,
                 const std::vector<const Pattern *> &NotStrings,
@@ -683,6 +687,9 @@ static size_t CheckTypeSize(Check::CheckType Ty) {
   case Check::CheckNext:
     return sizeof("-NEXT:") - 1;
 
+  case Check::CheckSame:
+    return sizeof("-SAME:") - 1;
+
   case Check::CheckNot:
     return sizeof("-NOT:") - 1;
 
@@ -713,6 +720,9 @@ static Check::CheckType FindCheckType(StringRef Buffer, StringRef Prefix) {
   if (Rest.startswith("NEXT:"))
     return Check::CheckNext;
 
+  if (Rest.startswith("SAME:"))
+    return Check::CheckSame;
+
   if (Rest.startswith("NOT:"))
     return Check::CheckNot;
 
@@ -919,10 +929,12 @@ static bool ReadCheckFile(SourceMgr &SM,
     Buffer = Buffer.substr(EOL);
 
     // Verify that CHECK-NEXT lines have at least one CHECK line before them.
-    if ((CheckTy == Check::CheckNext) && CheckStrings.empty()) {
+    if ((CheckTy == Check::CheckNext || CheckTy == Check::CheckSame) &&
+        CheckStrings.empty()) {
+      StringRef Type = CheckTy == Check::CheckNext ? "NEXT" : "SAME";
       SM.PrintMessage(SMLoc::getFromPointer(UsedPrefixStart),
                       SourceMgr::DK_Error,
-                      "found '" + UsedPrefix + "-NEXT:' without previous '"
+                      "found '" + UsedPrefix + "-" + Type + "' without previous '"
                       + UsedPrefix + ": line");
       return true;
     }
@@ -1053,6 +1065,11 @@ size_t CheckString::Check(const SourceMgr &SM, StringRef Buffer,
     if (CheckNext(SM, SkippedRegion))
       return StringRef::npos;
 
+    // If this check is a "CHECK-SAME", verify that the previous match was on
+    // the same line (i.e. that there is no newline between them).
+    if (CheckSame(SM, SkippedRegion))
+      return StringRef::npos;
+
     // If this match had "not strings", verify that they don't exist in the
     // skipped region.
     if (CheckNot(SM, SkippedRegion, NotStrings, VariableTable))
@@ -1101,6 +1118,34 @@ bool CheckString::CheckNext(const SourceMgr &SM, StringRef Buffer) const {
   return false;
 }
 
+bool CheckString::CheckSame(const SourceMgr &SM, StringRef Buffer) const {
+  if (CheckTy != Check::CheckSame)
+    return false;
+
+  // Count the number of newlines between the previous match and this one.
+  assert(Buffer.data() !=
+             SM.getMemoryBuffer(SM.FindBufferContainingLoc(
+                                    SMLoc::getFromPointer(Buffer.data())))
+                 ->getBufferStart() &&
+         "CHECK-SAME can't be the first check in a file");
+
+  const char *FirstNewLine = nullptr;
+  unsigned NumNewLines = CountNumNewlinesBetween(Buffer, FirstNewLine);
+
+  if (NumNewLines != 0) {
+    SM.PrintMessage(Loc, SourceMgr::DK_Error,
+                    Prefix +
+                        "-SAME: is not on the same line as the previous match");
+    SM.PrintMessage(SMLoc::getFromPointer(Buffer.end()), SourceMgr::DK_Note,
+                    "'next' match was here");
+    SM.PrintMessage(SMLoc::getFromPointer(Buffer.data()), SourceMgr::DK_Note,
+                    "previous match ended here");
+    return true;
+  }
+
+  return false;
+}
+
 bool CheckString::CheckNot(const SourceMgr &SM, StringRef Buffer,
                            const std::vector<const Pattern *> &NotStrings,
                            StringMap<StringRef> &VariableTable) const {
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index 891328f..65a34d3 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -115,6 +115,7 @@
 #include <map>
 #include <set>
 #include <sstream>
+#include <forward_list>
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-matcher-emitter"
@@ -255,9 +256,8 @@ public:
       return true;
 
     // ... or if any of its super classes are a subset of RHS.
-    for (std::vector<ClassInfo*>::const_iterator it = SuperClasses.begin(),
-           ie = SuperClasses.end(); it != ie; ++it)
-      if ((*it)->isSubsetOf(RHS))
+    for (const ClassInfo *CI : SuperClasses)
+      if (CI->isSubsetOf(RHS))
         return true;
 
     return false;
@@ -391,6 +391,10 @@ struct MatchableInfo {
   /// AsmVariantID - Target's assembly syntax variant no.
   int AsmVariantID;
 
+  /// AsmString - The assembly string for this instruction (with variants
+  /// removed), e.g. "movsx $src, $dst".
+  std::string AsmString;
+
   /// TheDef - This is the definition of the instruction or InstAlias that this
   /// matchable came from.
   Record *const TheDef;
@@ -408,10 +412,6 @@ struct MatchableInfo {
   /// MCInst.
   SmallVector<ResOperand, 8> ResOperands;
 
-  /// AsmString - The assembly string for this instruction (with variants
-  /// removed), e.g. "movsx $src, $dst".
-  std::string AsmString;
-
   /// Mnemonic - This is the first token of the matched instruction, its
   /// mnemonic.
   StringRef Mnemonic;
@@ -423,7 +423,7 @@ struct MatchableInfo {
   SmallVector<AsmOperand, 8> AsmOperands;
 
   /// Predicates - The required subtarget features to match this instruction.
-  SmallVector<SubtargetFeatureInfo*, 4> RequiredFeatures;
+  SmallVector<const SubtargetFeatureInfo *, 4> RequiredFeatures;
 
   /// ConversionFnKind - The enum value which is passed to the generated
   /// convertToMCInst to convert parsed operands into an MCInst for this
@@ -434,13 +434,15 @@ struct MatchableInfo {
   bool HasDeprecation;
 
   MatchableInfo(const CodeGenInstruction &CGI)
-    : AsmVariantID(0), TheDef(CGI.TheDef), DefRec(&CGI),
-      AsmString(CGI.AsmString) {
+    : AsmVariantID(0), AsmString(CGI.AsmString), TheDef(CGI.TheDef), DefRec(&CGI) {
   }
 
-  MatchableInfo(const CodeGenInstAlias *Alias)
-    : AsmVariantID(0), TheDef(Alias->TheDef), DefRec(Alias),
-      AsmString(Alias->AsmString) {
+  MatchableInfo(std::unique_ptr<const CodeGenInstAlias> Alias)
+    : AsmVariantID(0), AsmString(Alias->AsmString), TheDef(Alias->TheDef), DefRec(Alias.release()) {
+  }
+
+  ~MatchableInfo() {
+    delete DefRec.dyn_cast<const CodeGenInstAlias*>();
   }
 
   // Two-operand aliases clone from the main matchable, but mark the second
@@ -516,7 +518,7 @@ struct MatchableInfo {
   /// couldMatchAmbiguouslyWith - Check whether this matchable could
   /// ambiguously match the same set of operands as \p RHS (without being a
   /// strictly superior match).
-  bool couldMatchAmbiguouslyWith(const MatchableInfo &RHS) {
+  bool couldMatchAmbiguouslyWith(const MatchableInfo &RHS) const {
     // The primary comparator is the instruction mnemonic.
     if (Mnemonic != RHS.Mnemonic)
       return false;
@@ -552,7 +554,7 @@ struct MatchableInfo {
     return !(HasLT ^ HasGT);
   }
 
-  void dump();
+  void dump() const;
 
 private:
   void tokenizeAsmString(const AsmMatcherInfo &Info);
@@ -574,7 +576,7 @@ struct SubtargetFeatureInfo {
     return "Feature_" + TheDef->getName();
   }
 
-  void dump() {
+  void dump() const {
     errs() << getEnumName() << " " << Index << "\n";
     TheDef->dump();
   }
@@ -582,10 +584,10 @@ struct SubtargetFeatureInfo {
 
 struct OperandMatchEntry {
   unsigned OperandMask;
-  MatchableInfo* MI;
+  const MatchableInfo* MI;
   ClassInfo *CI;
 
-  static OperandMatchEntry create(MatchableInfo* mi, ClassInfo *ci,
+  static OperandMatchEntry create(const MatchableInfo *mi, ClassInfo *ci,
                                   unsigned opMask) {
     OperandMatchEntry X;
     X.OperandMask = opMask;
@@ -608,10 +610,10 @@ public:
   CodeGenTarget &Target;
 
   /// The classes which are needed for matching.
-  std::vector<ClassInfo*> Classes;
+  std::forward_list<ClassInfo> Classes;
 
   /// The information on the matchables to match.
-  std::vector<MatchableInfo*> Matchables;
+  std::vector<std::unique_ptr<MatchableInfo>> Matchables;
 
   /// Info for custom matching operands by user defined methods.
   std::vector<OperandMatchEntry> OperandMatchInfo;
@@ -621,7 +623,7 @@ public:
   RegisterClassesTy RegisterClasses;
 
   /// Map of Predicate records to their subtarget information.
-  std::map<Record*, SubtargetFeatureInfo*, LessRecordByID> SubtargetFeatures;
+  std::map<Record *, SubtargetFeatureInfo, LessRecordByID> SubtargetFeatures;
 
   /// Map of AsmOperandClass records to their class information.
   std::map<Record*, ClassInfo*> AsmOperandClasses;
@@ -669,11 +671,10 @@ public:
 
   /// getSubtargetFeature - Lookup or create the subtarget feature info for the
   /// given operand.
-  SubtargetFeatureInfo *getSubtargetFeature(Record *Def) const {
+  const SubtargetFeatureInfo *getSubtargetFeature(Record *Def) const {
     assert(Def->isSubClassOf("Predicate") && "Invalid predicate type!");
-    std::map<Record*, SubtargetFeatureInfo*, LessRecordByID>::const_iterator I =
-      SubtargetFeatures.find(Def);
-    return I == SubtargetFeatures.end() ? nullptr : I->second;
+    const auto &I = SubtargetFeatures.find(Def);
+    return I == SubtargetFeatures.end() ? nullptr : &I->second;
   }
 
   RecordKeeper &getRecords() const {
@@ -683,11 +684,11 @@ public:
 
 } // End anonymous namespace
 
-void MatchableInfo::dump() {
+void MatchableInfo::dump() const {
   errs() << TheDef->getName() << " -- " << "flattened:\"" << AsmString <<"\"\n";
 
   for (unsigned i = 0, e = AsmOperands.size(); i != e; ++i) {
-    AsmOperand &Op = AsmOperands[i];
+    const AsmOperand &Op = AsmOperands[i];
     errs() << "  op[" << i << "] = " << Op.Class->ClassName << " - ";
     errs() << '\"' << Op.Token << "\"\n";
   }
@@ -777,8 +778,8 @@ void MatchableInfo::initialize(const AsmMatcherInfo &Info,
   // Compute the require features.
   std::vector<Record*> Predicates =TheDef->getValueAsListOfDefs("Predicates");
   for (unsigned i = 0, e = Predicates.size(); i != e; ++i)
-    if (SubtargetFeatureInfo *Feature =
-        Info.getSubtargetFeature(Predicates[i]))
+    if (const SubtargetFeatureInfo *Feature =
+            Info.getSubtargetFeature(Predicates[i]))
       RequiredFeatures.push_back(Feature);
 
   // Collect singleton registers, if used.
@@ -978,6 +979,7 @@ static std::string getEnumNameForToken(StringRef Str) {
     case '.': Res += "_DOT_"; break;
     case '<': Res += "_LT_"; break;
     case '>': Res += "_GT_"; break;
+    case '-': Res += "_MINUS_"; break;
     default:
       if ((*it >= 'A' && *it <= 'Z') ||
           (*it >= 'a' && *it <= 'z') ||
@@ -995,7 +997,8 @@ ClassInfo *AsmMatcherInfo::getTokenClass(StringRef Token) {
   ClassInfo *&Entry = TokenClasses[Token];
 
   if (!Entry) {
-    Entry = new ClassInfo();
+    Classes.emplace_front();
+    Entry = &Classes.front();
     Entry->Kind = ClassInfo::Token;
     Entry->ClassName = "Token";
     Entry->Name = "MCK_" + getEnumNameForToken(Token);
@@ -1004,7 +1007,6 @@ ClassInfo *AsmMatcherInfo::getTokenClass(StringRef Token) {
     Entry->RenderMethod = "<invalid>";
     Entry->ParserMethod = "";
     Entry->DiagnosticType = "";
-    Classes.push_back(Entry);
   }
 
   return Entry;
@@ -1076,10 +1078,8 @@ struct LessRegisterSet {
 
 void AsmMatcherInfo::
 buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
-  const std::vector<CodeGenRegister*> &Registers =
-    Target.getRegBank().getRegisters();
-  ArrayRef<CodeGenRegisterClass*> RegClassList =
-    Target.getRegBank().getRegClasses();
+  const auto &Registers = Target.getRegBank().getRegisters();
+  auto &RegClassList = Target.getRegBank().getRegClasses();
 
   typedef std::set<RegisterSet, LessRegisterSet> RegisterSetSet;
 
@@ -1087,15 +1087,12 @@ buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
   RegisterSetSet RegisterSets;
 
   // Gather the defined sets.
-  for (ArrayRef<CodeGenRegisterClass*>::const_iterator it =
-         RegClassList.begin(), ie = RegClassList.end(); it != ie; ++it)
-    RegisterSets.insert(RegisterSet(
-        (*it)->getOrder().begin(), (*it)->getOrder().end()));
+  for (const CodeGenRegisterClass &RC : RegClassList)
+    RegisterSets.insert(
+        RegisterSet(RC.getOrder().begin(), RC.getOrder().end()));
 
   // Add any required singleton sets.
-  for (SmallPtrSetImpl<Record*>::iterator it = SingletonRegisters.begin(),
-       ie = SingletonRegisters.end(); it != ie; ++it) {
-    Record *Rec = *it;
+  for (Record *Rec : SingletonRegisters) {
     RegisterSets.insert(RegisterSet(&Rec, &Rec + 1));
   }
 
@@ -1103,19 +1100,16 @@ buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
   // a unique register set class), and build the mapping of registers to the set
   // they should classify to.
   std::map<Record*, RegisterSet> RegisterMap;
-  for (std::vector<CodeGenRegister*>::const_iterator it = Registers.begin(),
-         ie = Registers.end(); it != ie; ++it) {
-    const CodeGenRegister &CGR = **it;
+  for (const CodeGenRegister &CGR : Registers) {
     // Compute the intersection of all sets containing this register.
     RegisterSet ContainingSet;
 
-    for (RegisterSetSet::iterator it = RegisterSets.begin(),
-           ie = RegisterSets.end(); it != ie; ++it) {
-      if (!it->count(CGR.TheDef))
+    for (const RegisterSet &RS : RegisterSets) {
+      if (!RS.count(CGR.TheDef))
         continue;
 
       if (ContainingSet.empty()) {
-        ContainingSet = *it;
+        ContainingSet = RS;
         continue;
       }
 
@@ -1123,7 +1117,7 @@ buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
       std::swap(Tmp, ContainingSet);
       std::insert_iterator<RegisterSet> II(ContainingSet,
                                            ContainingSet.begin());
-      std::set_intersection(Tmp.begin(), Tmp.end(), it->begin(), it->end(), II,
+      std::set_intersection(Tmp.begin(), Tmp.end(), RS.begin(), RS.end(), II,
                             LessRecordByID());
     }
 
@@ -1136,39 +1130,35 @@ buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
   // Construct the register classes.
   std::map<RegisterSet, ClassInfo*, LessRegisterSet> RegisterSetClasses;
   unsigned Index = 0;
-  for (RegisterSetSet::iterator it = RegisterSets.begin(),
-         ie = RegisterSets.end(); it != ie; ++it, ++Index) {
-    ClassInfo *CI = new ClassInfo();
+  for (const RegisterSet &RS : RegisterSets) {
+    Classes.emplace_front();
+    ClassInfo *CI = &Classes.front();
     CI->Kind = ClassInfo::RegisterClass0 + Index;
     CI->ClassName = "Reg" + utostr(Index);
     CI->Name = "MCK_Reg" + utostr(Index);
     CI->ValueName = "";
     CI->PredicateMethod = ""; // unused
     CI->RenderMethod = "addRegOperands";
-    CI->Registers = *it;
+    CI->Registers = RS;
     // FIXME: diagnostic type.
     CI->DiagnosticType = "";
-    Classes.push_back(CI);
-    RegisterSetClasses.insert(std::make_pair(*it, CI));
+    RegisterSetClasses.insert(std::make_pair(RS, CI));
+    ++Index;
   }
 
   // Find the superclasses; we could compute only the subgroup lattice edges,
   // but there isn't really a point.
-  for (RegisterSetSet::iterator it = RegisterSets.begin(),
-         ie = RegisterSets.end(); it != ie; ++it) {
-    ClassInfo *CI = RegisterSetClasses[*it];
-    for (RegisterSetSet::iterator it2 = RegisterSets.begin(),
-           ie2 = RegisterSets.end(); it2 != ie2; ++it2)
-      if (*it != *it2 &&
-          std::includes(it2->begin(), it2->end(), it->begin(), it->end(),
+  for (const RegisterSet &RS : RegisterSets) {
+    ClassInfo *CI = RegisterSetClasses[RS];
+    for (const RegisterSet &RS2 : RegisterSets)
+      if (RS != RS2 &&
+          std::includes(RS2.begin(), RS2.end(), RS.begin(), RS.end(),
                         LessRecordByID()))
-        CI->SuperClasses.push_back(RegisterSetClasses[*it2]);
+        CI->SuperClasses.push_back(RegisterSetClasses[RS2]);
   }
 
   // Name the register classes which correspond to a user defined RegisterClass.
-  for (ArrayRef<CodeGenRegisterClass*>::const_iterator
-       it = RegClassList.begin(), ie = RegClassList.end(); it != ie; ++it) {
-    const CodeGenRegisterClass &RC = **it;
+  for (const CodeGenRegisterClass &RC : RegClassList) {
     // Def will be NULL for non-user defined register classes.
     Record *Def = RC.getDef();
     if (!Def)
@@ -1191,9 +1181,7 @@ buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
     RegisterClasses[it->first] = RegisterSetClasses[it->second];
 
   // Name the register classes which correspond to singleton registers.
-  for (SmallPtrSetImpl<Record*>::iterator it = SingletonRegisters.begin(),
-         ie = SingletonRegisters.end(); it != ie; ++it) {
-    Record *Rec = *it;
+  for (Record *Rec : SingletonRegisters) {
     ClassInfo *CI = RegisterClasses[Rec];
     assert(CI && "Missing singleton register class info!");
 
@@ -1211,36 +1199,36 @@ void AsmMatcherInfo::buildOperandClasses() {
     Records.getAllDerivedDefinitions("AsmOperandClass");
 
   // Pre-populate AsmOperandClasses map.
-  for (std::vector<Record*>::iterator it = AsmOperands.begin(),
-         ie = AsmOperands.end(); it != ie; ++it)
-    AsmOperandClasses[*it] = new ClassInfo();
+  for (Record *Rec : AsmOperands) {
+    Classes.emplace_front();
+    AsmOperandClasses[Rec] = &Classes.front();
+  }
 
   unsigned Index = 0;
-  for (std::vector<Record*>::iterator it = AsmOperands.begin(),
-         ie = AsmOperands.end(); it != ie; ++it, ++Index) {
-    ClassInfo *CI = AsmOperandClasses[*it];
+  for (Record *Rec : AsmOperands) {
+    ClassInfo *CI = AsmOperandClasses[Rec];
     CI->Kind = ClassInfo::UserClass0 + Index;
 
-    ListInit *Supers = (*it)->getValueAsListInit("SuperClasses");
+    ListInit *Supers = Rec->getValueAsListInit("SuperClasses");
     for (unsigned i = 0, e = Supers->getSize(); i != e; ++i) {
       DefInit *DI = dyn_cast<DefInit>(Supers->getElement(i));
       if (!DI) {
-        PrintError((*it)->getLoc(), "Invalid super class reference!");
+        PrintError(Rec->getLoc(), "Invalid super class reference!");
         continue;
       }
 
       ClassInfo *SC = AsmOperandClasses[DI->getDef()];
       if (!SC)
-        PrintError((*it)->getLoc(), "Invalid super class reference!");
+        PrintError(Rec->getLoc(), "Invalid super class reference!");
       else
         CI->SuperClasses.push_back(SC);
     }
-    CI->ClassName = (*it)->getValueAsString("Name");
+    CI->ClassName = Rec->getValueAsString("Name");
     CI->Name = "MCK_" + CI->ClassName;
-    CI->ValueName = (*it)->getName();
+    CI->ValueName = Rec->getName();
 
     // Get or construct the predicate method name.
-    Init *PMName = (*it)->getValueInit("PredicateMethod");
+    Init *PMName = Rec->getValueInit("PredicateMethod");
     if (StringInit *SI = dyn_cast<StringInit>(PMName)) {
       CI->PredicateMethod = SI->getValue();
     } else {
@@ -1249,7 +1237,7 @@ void AsmMatcherInfo::buildOperandClasses() {
     }
 
     // Get or construct the render method name.
-    Init *RMName = (*it)->getValueInit("RenderMethod");
+    Init *RMName = Rec->getValueInit("RenderMethod");
     if (StringInit *SI = dyn_cast<StringInit>(RMName)) {
       CI->RenderMethod = SI->getValue();
     } else {
@@ -1258,18 +1246,17 @@ void AsmMatcherInfo::buildOperandClasses() {
     }
 
     // Get the parse method name or leave it as empty.
-    Init *PRMName = (*it)->getValueInit("ParserMethod");
+    Init *PRMName = Rec->getValueInit("ParserMethod");
     if (StringInit *SI = dyn_cast<StringInit>(PRMName))
       CI->ParserMethod = SI->getValue();
 
     // Get the diagnostic type or leave it as empty.
     // Get the parse method name or leave it as empty.
-    Init *DiagnosticType = (*it)->getValueInit("DiagnosticType");
+    Init *DiagnosticType = Rec->getValueInit("DiagnosticType");
     if (StringInit *SI = dyn_cast<StringInit>(DiagnosticType))
       CI->DiagnosticType = SI->getValue();
 
-    AsmOperandClasses[*it] = CI;
-    Classes.push_back(CI);
+    ++Index;
   }
 }
 
@@ -1288,16 +1275,13 @@ void AsmMatcherInfo::buildOperandMatchInfo() {
   typedef std::map<ClassInfo *, unsigned, less_ptr<ClassInfo>> OpClassMaskTy;
   OpClassMaskTy OpClassMask;
 
-  for (std::vector<MatchableInfo*>::const_iterator it =
-       Matchables.begin(), ie = Matchables.end();
-       it != ie; ++it) {
-    MatchableInfo &II = **it;
+  for (const auto &MI : Matchables) {
     OpClassMask.clear();
 
     // Keep track of all operands of this instructions which belong to the
     // same class.
-    for (unsigned i = 0, e = II.AsmOperands.size(); i != e; ++i) {
-      MatchableInfo::AsmOperand &Op = II.AsmOperands[i];
+    for (unsigned i = 0, e = MI->AsmOperands.size(); i != e; ++i) {
+      const MatchableInfo::AsmOperand &Op = MI->AsmOperands[i];
       if (Op.Class->ParserMethod.empty())
         continue;
       unsigned &OperandMask = OpClassMask[Op.Class];
@@ -1305,11 +1289,11 @@ void AsmMatcherInfo::buildOperandMatchInfo() {
     }
 
     // Generate operand match info for each mnemonic/operand class pair.
-    for (OpClassMaskTy::iterator iit = OpClassMask.begin(),
-         iie = OpClassMask.end(); iit != iie; ++iit) {
-      unsigned OpMask = iit->second;
-      ClassInfo *CI = iit->first;
-      OperandMatchInfo.push_back(OperandMatchEntry::create(&II, CI, OpMask));
+    for (const auto &OCM : OpClassMask) {
+      unsigned OpMask = OCM.second;
+      ClassInfo *CI = OCM.first;
+      OperandMatchInfo.push_back(OperandMatchEntry::create(MI.get(), CI,
+                                                           OpMask));
     }
   }
 }
@@ -1327,10 +1311,10 @@ void AsmMatcherInfo::buildInfo() {
     if (Pred->getName().empty())
       PrintFatalError(Pred->getLoc(), "Predicate has no name!");
 
-    uint64_t FeatureNo = SubtargetFeatures.size();
-    SubtargetFeatures[Pred] = new SubtargetFeatureInfo(Pred, FeatureNo);
-    DEBUG(SubtargetFeatures[Pred]->dump());
-    assert(FeatureNo < 64 && "Too many subtarget features!");
+    SubtargetFeatures.insert(std::make_pair(
+        Pred, SubtargetFeatureInfo(Pred, SubtargetFeatures.size())));
+    DEBUG(SubtargetFeatures.find(Pred)->second.dump());
+    assert(SubtargetFeatures.size() <= 64 && "Too many subtarget features!");
   }
 
   // Parse the instructions; we need to do this first so that we can gather the
@@ -1344,20 +1328,18 @@ void AsmMatcherInfo::buildInfo() {
     std::string RegisterPrefix = AsmVariant->getValueAsString("RegisterPrefix");
     int AsmVariantNo = AsmVariant->getValueAsInt("Variant");
 
-    for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
-           E = Target.inst_end(); I != E; ++I) {
-      const CodeGenInstruction &CGI = **I;
+    for (const CodeGenInstruction *CGI : Target.instructions()) {
 
       // If the tblgen -match-prefix option is specified (for tblgen hackers),
       // filter the set of instructions we consider.
-      if (!StringRef(CGI.TheDef->getName()).startswith(MatchPrefix))
+      if (!StringRef(CGI->TheDef->getName()).startswith(MatchPrefix))
         continue;
 
       // Ignore "codegen only" instructions.
-      if (CGI.TheDef->getValueAsBit("isCodeGenOnly"))
+      if (CGI->TheDef->getValueAsBit("isCodeGenOnly"))
         continue;
 
-      std::unique_ptr<MatchableInfo> II(new MatchableInfo(CGI));
+      std::unique_ptr<MatchableInfo> II(new MatchableInfo(*CGI));
 
       II->initialize(*this, SingletonRegisters, AsmVariantNo, RegisterPrefix);
 
@@ -1366,14 +1348,7 @@ void AsmMatcherInfo::buildInfo() {
       if (!II->validate(CommentDelimiter, true))
         continue;
 
-      // Ignore "Int_*" and "*_Int" instructions, which are internal aliases.
-      //
-      // FIXME: This is a total hack.
-      if (StringRef(II->TheDef->getName()).startswith("Int_") ||
-          StringRef(II->TheDef->getName()).endswith("_Int"))
-        continue;
-
-      Matchables.push_back(II.release());
+      Matchables.push_back(std::move(II));
     }
 
     // Parse all of the InstAlias definitions and stick them in the list of
@@ -1381,8 +1356,8 @@ void AsmMatcherInfo::buildInfo() {
     std::vector<Record*> AllInstAliases =
       Records.getAllDerivedDefinitions("InstAlias");
     for (unsigned i = 0, e = AllInstAliases.size(); i != e; ++i) {
-      CodeGenInstAlias *Alias =
-          new CodeGenInstAlias(AllInstAliases[i], AsmVariantNo, Target);
+      auto Alias = llvm::make_unique<CodeGenInstAlias>(AllInstAliases[i],
+                                                       AsmVariantNo, Target);
 
       // If the tblgen -match-prefix option is specified (for tblgen hackers),
       // filter the set of instruction aliases we consider, based on the target
@@ -1391,14 +1366,14 @@ void AsmMatcherInfo::buildInfo() {
             .startswith( MatchPrefix))
         continue;
 
-      std::unique_ptr<MatchableInfo> II(new MatchableInfo(Alias));
+      std::unique_ptr<MatchableInfo> II(new MatchableInfo(std::move(Alias)));
 
       II->initialize(*this, SingletonRegisters, AsmVariantNo, RegisterPrefix);
 
       // Validate the alias definitions.
       II->validate(CommentDelimiter, false);
 
-      Matchables.push_back(II.release());
+      Matchables.push_back(std::move(II));
     }
   }
 
@@ -1410,11 +1385,8 @@ void AsmMatcherInfo::buildInfo() {
 
   // Build the information about matchables, now that we have fully formed
   // classes.
-  std::vector<MatchableInfo*> NewMatchables;
-  for (std::vector<MatchableInfo*>::iterator it = Matchables.begin(),
-         ie = Matchables.end(); it != ie; ++it) {
-    MatchableInfo *II = *it;
-
+  std::vector<std::unique_ptr<MatchableInfo>> NewMatchables;
+  for (auto &II : Matchables) {
     // Parse the tokens after the mnemonic.
     // Note: buildInstructionOperandReference may insert new AsmOperands, so
     // don't precompute the loop bound.
@@ -1449,9 +1421,9 @@ void AsmMatcherInfo::buildInfo() {
         OperandName = Token.substr(1);
 
       if (II->DefRec.is<const CodeGenInstruction*>())
-        buildInstructionOperandReference(II, OperandName, i);
+        buildInstructionOperandReference(II.get(), OperandName, i);
       else
-        buildAliasOperandReference(II, OperandName, Op);
+        buildAliasOperandReference(II.get(), OperandName, Op);
     }
 
     if (II->DefRec.is<const CodeGenInstruction*>()) {
@@ -1469,14 +1441,14 @@ void AsmMatcherInfo::buildInfo() {
         AliasII->formTwoOperandAlias(Constraint);
 
         // Add the alias to the matchables list.
-        NewMatchables.push_back(AliasII.release());
+        NewMatchables.push_back(std::move(AliasII));
       }
     } else
       II->buildAliasResultOperands();
   }
   if (!NewMatchables.empty())
-    Matchables.insert(Matchables.end(), NewMatchables.begin(),
-                      NewMatchables.end());
+    std::move(NewMatchables.begin(), NewMatchables.end(),
+              std::back_inserter(Matchables));
 
   // Process token alias definitions and set up the associated superclass
   // information.
@@ -1493,7 +1465,7 @@ void AsmMatcherInfo::buildInfo() {
   }
 
   // Reorder classes so that classes precede super classes.
-  std::sort(Classes.begin(), Classes.end(), less_ptr<ClassInfo>());
+  Classes.sort();
 }
 
 /// buildInstructionOperandReference - The specified operand is a reference to a
@@ -1703,7 +1675,7 @@ static unsigned getConverterOperandID(const std::string &Name,
 
 
 static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
-                             std::vector<MatchableInfo*> &Infos,
+                             std::vector<std::unique_ptr<MatchableInfo>> &Infos,
                              raw_ostream &OS) {
   SetVector<std::string> OperandConversionKinds;
   SetVector<std::string> InstructionConversionKinds;
@@ -1767,16 +1739,13 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
   OperandConversionKinds.insert("CVT_Tied");
   enum { CVT_Done, CVT_Reg, CVT_Tied };
 
-  for (std::vector<MatchableInfo*>::const_iterator it = Infos.begin(),
-         ie = Infos.end(); it != ie; ++it) {
-    MatchableInfo &II = **it;
-
+  for (auto &II : Infos) {
     // Check if we have a custom match function.
     std::string AsmMatchConverter =
-      II.getResultInst()->TheDef->getValueAsString("AsmMatchConverter");
+      II->getResultInst()->TheDef->getValueAsString("AsmMatchConverter");
     if (!AsmMatchConverter.empty()) {
       std::string Signature = "ConvertCustom_" + AsmMatchConverter;
-      II.ConversionFnKind = Signature;
+      II->ConversionFnKind = Signature;
 
       // Check if we have already generated this signature.
       if (!InstructionConversionKinds.insert(Signature))
@@ -1808,16 +1777,17 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
     std::vector<uint8_t> ConversionRow;
 
     // Compute the convert enum and the case body.
-    MaxRowLength = std::max(MaxRowLength, II.ResOperands.size()*2 + 1 );
+    MaxRowLength = std::max(MaxRowLength, II->ResOperands.size()*2 + 1 );
 
-    for (unsigned i = 0, e = II.ResOperands.size(); i != e; ++i) {
-      const MatchableInfo::ResOperand &OpInfo = II.ResOperands[i];
+    for (unsigned i = 0, e = II->ResOperands.size(); i != e; ++i) {
+      const MatchableInfo::ResOperand &OpInfo = II->ResOperands[i];
 
       // Generate code to populate each result operand.
       switch (OpInfo.Kind) {
       case MatchableInfo::ResOperand::RenderAsmOperand: {
         // This comes from something we parsed.
-        MatchableInfo::AsmOperand &Op = II.AsmOperands[OpInfo.AsmOperandNum];
+        const MatchableInfo::AsmOperand &Op =
+          II->AsmOperands[OpInfo.AsmOperandNum];
 
         // Registers are always converted the same, don't duplicate the
         // conversion function based on them.
@@ -1879,6 +1849,7 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
       case MatchableInfo::ResOperand::ImmOperand: {
         int64_t Val = OpInfo.ImmVal;
         std::string Ty = "imm_" + itostr(Val);
+        Ty = getEnumNameForToken(Ty);
         Signature += "__" + Ty;
 
         std::string Name = "CVT_" + Ty;
@@ -1940,7 +1911,7 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
     if (Signature == "Convert")
       Signature += "_NoOperands";
 
-    II.ConversionFnKind = Signature;
+    II->ConversionFnKind = Signature;
 
     // Save the signature. If we already have it, don't add a new row
     // to the table.
@@ -2003,7 +1974,7 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
 
 /// emitMatchClassEnumeration - Emit the enumeration for match class kinds.
 static void emitMatchClassEnumeration(CodeGenTarget &Target,
-                                      std::vector<ClassInfo*> &Infos,
+                                      std::forward_list<ClassInfo> &Infos,
                                       raw_ostream &OS) {
   OS << "namespace {\n\n";
 
@@ -2011,9 +1982,7 @@ static void emitMatchClassEnumeration(CodeGenTarget &Target,
      << "/// instruction matching.\n";
   OS << "enum MatchClassKind {\n";
   OS << "  InvalidMatchClass = 0,\n";
-  for (std::vector<ClassInfo*>::iterator it = Infos.begin(),
-         ie = Infos.end(); it != ie; ++it) {
-    ClassInfo &CI = **it;
+  for (const auto &CI : Infos) {
     OS << "  " << CI.Name << ", // ";
     if (CI.Kind == ClassInfo::Token) {
       OS << "'" << CI.ValueName << "'\n";
@@ -2053,10 +2022,7 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info,
 
   // Check the user classes. We don't care what order since we're only
   // actually matching against one of them.
-  for (std::vector<ClassInfo*>::iterator it = Info.Classes.begin(),
-         ie = Info.Classes.end(); it != ie; ++it) {
-    ClassInfo &CI = **it;
-
+  for (const auto &CI : Info.Classes) {
     if (!CI.isUserClass())
       continue;
 
@@ -2075,11 +2041,9 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info,
   OS << "    MatchClassKind OpKind;\n";
   OS << "    switch (Operand.getReg()) {\n";
   OS << "    default: OpKind = InvalidMatchClass; break;\n";
-  for (AsmMatcherInfo::RegisterClassesTy::iterator
-         it = Info.RegisterClasses.begin(), ie = Info.RegisterClasses.end();
-       it != ie; ++it)
+  for (const auto &RC : Info.RegisterClasses)
     OS << "    case " << Info.Target.getName() << "::"
-       << it->first->getName() << ": OpKind = " << it->second->Name
+       << RC.first->getName() << ": OpKind = " << RC.second->Name
        << "; break;\n";
   OS << "    }\n";
   OS << "    return isSubclass(OpKind, Kind) ? "
@@ -2094,7 +2058,7 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info,
 
 /// emitIsSubclass - Emit the subclass predicate function.
 static void emitIsSubclass(CodeGenTarget &Target,
-                           std::vector<ClassInfo*> &Infos,
+                           std::forward_list<ClassInfo> &Infos,
                            raw_ostream &OS) {
   OS << "/// isSubclass - Compute whether \\p A is a subclass of \\p B.\n";
   OS << "static bool isSubclass(MatchClassKind A, MatchClassKind B) {\n";
@@ -2107,15 +2071,9 @@ static void emitIsSubclass(CodeGenTarget &Target,
   SS << "  switch (A) {\n";
   SS << "  default:\n";
   SS << "    return false;\n";
-  for (std::vector<ClassInfo*>::iterator it = Infos.begin(),
-         ie = Infos.end(); it != ie; ++it) {
-    ClassInfo &A = **it;
-
+  for (const auto &A : Infos) {
     std::vector<StringRef> SuperClasses;
-    for (std::vector<ClassInfo*>::iterator it = Infos.begin(),
-         ie = Infos.end(); it != ie; ++it) {
-      ClassInfo &B = **it;
-
+    for (const auto &B : Infos) {
       if (&A != &B && A.isSubsetOf(B))
         SuperClasses.push_back(B.Name);
     }
@@ -2157,17 +2115,14 @@ static void emitIsSubclass(CodeGenTarget &Target,
 /// emitMatchTokenString - Emit the function to match a token string to the
 /// appropriate match class value.
 static void emitMatchTokenString(CodeGenTarget &Target,
-                                 std::vector<ClassInfo*> &Infos,
+                                 std::forward_list<ClassInfo> &Infos,
                                  raw_ostream &OS) {
   // Construct the match list.
   std::vector<StringMatcher::StringPair> Matches;
-  for (std::vector<ClassInfo*>::iterator it = Infos.begin(),
-         ie = Infos.end(); it != ie; ++it) {
-    ClassInfo &CI = **it;
-
+  for (const auto &CI : Infos) {
     if (CI.Kind == ClassInfo::Token)
-      Matches.push_back(StringMatcher::StringPair(CI.ValueName,
-                                                  "return " + CI.Name + ";"));
+      Matches.push_back(
+          StringMatcher::StringPair(CI.ValueName, "return " + CI.Name + ";"));
   }
 
   OS << "static MatchClassKind matchTokenString(StringRef Name) {\n";
@@ -2184,16 +2139,14 @@ static void emitMatchRegisterName(CodeGenTarget &Target, Record *AsmParser,
                                   raw_ostream &OS) {
   // Construct the match list.
   std::vector<StringMatcher::StringPair> Matches;
-  const std::vector<CodeGenRegister*> &Regs =
-    Target.getRegBank().getRegisters();
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    const CodeGenRegister *Reg = Regs[i];
-    if (Reg->TheDef->getValueAsString("AsmName").empty())
+  const auto &Regs = Target.getRegBank().getRegisters();
+  for (const CodeGenRegister &Reg : Regs) {
+    if (Reg.TheDef->getValueAsString("AsmName").empty())
       continue;
 
-    Matches.push_back(StringMatcher::StringPair(
-                                     Reg->TheDef->getValueAsString("AsmName"),
-                                     "return " + utostr(Reg->EnumValue) + ";"));
+    Matches.push_back(
+        StringMatcher::StringPair(Reg.TheDef->getValueAsString("AsmName"),
+                                  "return " + utostr(Reg.EnumValue) + ";"));
   }
 
   OS << "static unsigned MatchRegisterName(StringRef Name) {\n";
@@ -2230,10 +2183,8 @@ static void emitSubtargetFeatureFlagEnumeration(AsmMatcherInfo &Info,
      << "instruction matching.\n";
   OS << "enum SubtargetFeatureFlag : " << getMinimalRequiredFeaturesType(Info)
      << " {\n";
-  for (std::map<Record*, SubtargetFeatureInfo*, LessRecordByID>::const_iterator
-         it = Info.SubtargetFeatures.begin(),
-         ie = Info.SubtargetFeatures.end(); it != ie; ++it) {
-    SubtargetFeatureInfo &SFI = *it->second;
+  for (const auto &SF : Info.SubtargetFeatures) {
+    const SubtargetFeatureInfo &SFI = SF.second;
     OS << "  " << SFI.getEnumName() << " = (1ULL << " << SFI.Index << "),\n";
   }
   OS << "  Feature_None = 0\n";
@@ -2268,10 +2219,8 @@ static void emitGetSubtargetFeatureName(AsmMatcherInfo &Info, raw_ostream &OS) {
      << "static const char *getSubtargetFeatureName(uint64_t Val) {\n";
   if (!Info.SubtargetFeatures.empty()) {
     OS << "  switch(Val) {\n";
-    typedef std::map<Record*, SubtargetFeatureInfo*, LessRecordByID> RecFeatMap;
-    for (RecFeatMap::const_iterator it = Info.SubtargetFeatures.begin(),
-             ie = Info.SubtargetFeatures.end(); it != ie; ++it) {
-      SubtargetFeatureInfo &SFI = *it->second;
+    for (const auto &SF : Info.SubtargetFeatures) {
+      const SubtargetFeatureInfo &SFI = SF.second;
       // FIXME: Totally just a placeholder name to get the algorithm working.
       OS << "  case " << SFI.getEnumName() << ": return \""
          << SFI.TheDef->getValueAsString("PredicateName") << "\";\n";
@@ -2295,10 +2244,8 @@ static void emitComputeAvailableFeatures(AsmMatcherInfo &Info,
   OS << "uint64_t " << Info.Target.getName() << ClassName << "::\n"
      << "ComputeAvailableFeatures(uint64_t FB) const {\n";
   OS << "  uint64_t Features = 0;\n";
-  for (std::map<Record*, SubtargetFeatureInfo*, LessRecordByID>::const_iterator
-         it = Info.SubtargetFeatures.begin(),
-         ie = Info.SubtargetFeatures.end(); it != ie; ++it) {
-    SubtargetFeatureInfo &SFI = *it->second;
+  for (const auto &SF : Info.SubtargetFeatures) {
+    const SubtargetFeatureInfo &SFI = SF.second;
 
     OS << "  if (";
     std::string CondStorage =
@@ -2344,7 +2291,7 @@ static std::string GetAliasRequiredFeatures(Record *R,
   std::string Result;
   unsigned NumFeatures = 0;
   for (unsigned i = 0, e = ReqFeatures.size(); i != e; ++i) {
-    SubtargetFeatureInfo *F = Info.getSubtargetFeature(ReqFeatures[i]);
+    const SubtargetFeatureInfo *F = Info.getSubtargetFeature(ReqFeatures[i]);
 
     if (!F)
       PrintFatalError(R->getLoc(), "Predicate '" + ReqFeatures[i]->getName() +
@@ -2488,8 +2435,8 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
                << " RequiredFeatures;\n";
   OS << "    " << getMinimalTypeForRange(MaxMnemonicIndex)
                << " Mnemonic;\n";
-  OS << "    " << getMinimalTypeForRange(Info.Classes.size())
-               << " Class;\n";
+  OS << "    " << getMinimalTypeForRange(std::distance(
+                      Info.Classes.begin(), Info.Classes.end())) << " Class;\n";
   OS << "    " << getMinimalTypeForRange(MaxMask)
                << " OperandMask;\n\n";
   OS << "    StringRef getMnemonic() const {\n";
@@ -2566,13 +2513,11 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
      << " &Operands,\n                      unsigned MCK) {\n\n"
      << "  switch(MCK) {\n";
 
-  for (std::vector<ClassInfo*>::const_iterator it = Info.Classes.begin(),
-       ie = Info.Classes.end(); it != ie; ++it) {
-    ClassInfo *CI = *it;
-    if (CI->ParserMethod.empty())
+  for (const auto &CI : Info.Classes) {
+    if (CI.ParserMethod.empty())
       continue;
-    OS << "  case " << CI->Name << ":\n"
-       << "    return " << CI->ParserMethod << "(Operands);\n";
+    OS << "  case " << CI.Name << ":\n"
+       << "    return " << CI.ParserMethod << "(Operands);\n";
   }
 
   OS << "  default:\n";
@@ -2651,22 +2596,23 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   // stable_sort to ensure that ambiguous instructions are still
   // deterministically ordered.
   std::stable_sort(Info.Matchables.begin(), Info.Matchables.end(),
-                   less_ptr<MatchableInfo>());
+                   [](const std::unique_ptr<MatchableInfo> &a,
+                      const std::unique_ptr<MatchableInfo> &b){
+                     return *a < *b;});
 
   DEBUG_WITH_TYPE("instruction_info", {
-      for (std::vector<MatchableInfo*>::iterator
-             it = Info.Matchables.begin(), ie = Info.Matchables.end();
-           it != ie; ++it)
-        (*it)->dump();
+      for (const auto &MI : Info.Matchables)
+        MI->dump();
     });
 
   // Check for ambiguous matchables.
   DEBUG_WITH_TYPE("ambiguous_instrs", {
     unsigned NumAmbiguous = 0;
-    for (unsigned i = 0, e = Info.Matchables.size(); i != e; ++i) {
-      for (unsigned j = i + 1; j != e; ++j) {
-        MatchableInfo &A = *Info.Matchables[i];
-        MatchableInfo &B = *Info.Matchables[j];
+    for (auto I = Info.Matchables.begin(), E = Info.Matchables.end(); I != E;
+         ++I) {
+      for (auto J = std::next(I); J != E; ++J) {
+        const MatchableInfo &A = **I;
+        const MatchableInfo &B = **J;
 
         if (A.couldMatchAmbiguouslyWith(B)) {
           errs() << "warning: ambiguous matchables:\n";
@@ -2701,15 +2647,13 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   OS << "  void convertToMapAndConstraints(unsigned Kind,\n                ";
   OS << "           const OperandVector &Operands) override;\n";
   OS << "  bool mnemonicIsValid(StringRef Mnemonic, unsigned VariantID) override;\n";
-  OS << "  unsigned MatchInstructionImpl(\n";
-  OS.indent(27);
-  OS << "const OperandVector &Operands,\n"
+  OS << "  unsigned MatchInstructionImpl(const OperandVector &Operands,\n"
      << "                                MCInst &Inst,\n"
      << "                                uint64_t &ErrorInfo,"
      << " bool matchingInlineAsm,\n"
      << "                                unsigned VariantID = 0);\n";
 
-  if (Info.OperandMatchInfo.size()) {
+  if (!Info.OperandMatchInfo.empty()) {
     OS << "\n  enum OperandMatchResultTy {\n";
     OS << "    MatchOperand_Success,    // operand matched successfully\n";
     OS << "    MatchOperand_NoMatch,    // operand did not match\n";
@@ -2786,15 +2730,12 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   size_t MaxNumOperands = 0;
   unsigned MaxMnemonicIndex = 0;
   bool HasDeprecation = false;
-  for (std::vector<MatchableInfo*>::const_iterator it =
-         Info.Matchables.begin(), ie = Info.Matchables.end();
-       it != ie; ++it) {
-    MatchableInfo &II = **it;
-    MaxNumOperands = std::max(MaxNumOperands, II.AsmOperands.size());
-    HasDeprecation |= II.HasDeprecation;
+  for (const auto &MI : Info.Matchables) {
+    MaxNumOperands = std::max(MaxNumOperands, MI->AsmOperands.size());
+    HasDeprecation |= MI->HasDeprecation;
 
     // Store a pascal-style length byte in the mnemonic.
-    std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.str();
+    std::string LenMnemonic = char(MI->Mnemonic.size()) + MI->Mnemonic.str();
     MaxMnemonicIndex = std::max(MaxMnemonicIndex,
                         StringTable.GetOrAddStringOffset(LenMnemonic, false));
   }
@@ -2822,8 +2763,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
                << " ConvertFn;\n";
   OS << "    " << getMinimalRequiredFeaturesType(Info)
                << " RequiredFeatures;\n";
-  OS << "    " << getMinimalTypeForRange(Info.Classes.size())
-               << " Classes[" << MaxNumOperands << "];\n";
+  OS << "    " << getMinimalTypeForRange(
+                      std::distance(Info.Classes.begin(), Info.Classes.end()))
+     << " Classes[" << MaxNumOperands << "];\n";
   OS << "    StringRef getMnemonic() const {\n";
   OS << "      return StringRef(MnemonicTable + Mnemonic + 1,\n";
   OS << "                       MnemonicTable[Mnemonic]);\n";
@@ -2852,33 +2794,30 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
 
     OS << "static const MatchEntry MatchTable" << VC << "[] = {\n";
 
-    for (std::vector<MatchableInfo*>::const_iterator it =
-         Info.Matchables.begin(), ie = Info.Matchables.end();
-         it != ie; ++it) {
-      MatchableInfo &II = **it;
-      if (II.AsmVariantID != AsmVariantNo)
+    for (const auto &MI : Info.Matchables) {
+      if (MI->AsmVariantID != AsmVariantNo)
         continue;
 
       // Store a pascal-style length byte in the mnemonic.
-      std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.str();
+      std::string LenMnemonic = char(MI->Mnemonic.size()) + MI->Mnemonic.str();
       OS << "  { " << StringTable.GetOrAddStringOffset(LenMnemonic, false)
-         << " /* " << II.Mnemonic << " */, "
+         << " /* " << MI->Mnemonic << " */, "
          << Target.getName() << "::"
-         << II.getResultInst()->TheDef->getName() << ", "
-         << II.ConversionFnKind << ", ";
+         << MI->getResultInst()->TheDef->getName() << ", "
+         << MI->ConversionFnKind << ", ";
 
       // Write the required features mask.
-      if (!II.RequiredFeatures.empty()) {
-        for (unsigned i = 0, e = II.RequiredFeatures.size(); i != e; ++i) {
+      if (!MI->RequiredFeatures.empty()) {
+        for (unsigned i = 0, e = MI->RequiredFeatures.size(); i != e; ++i) {
           if (i) OS << "|";
-          OS << II.RequiredFeatures[i]->getEnumName();
+          OS << MI->RequiredFeatures[i]->getEnumName();
         }
       } else
         OS << "0";
 
       OS << ", { ";
-      for (unsigned i = 0, e = II.AsmOperands.size(); i != e; ++i) {
-        MatchableInfo::AsmOperand &Op = II.AsmOperands[i];
+      for (unsigned i = 0, e = MI->AsmOperands.size(); i != e; ++i) {
+        const MatchableInfo::AsmOperand &Op = MI->AsmOperands[i];
 
         if (i) OS << ", ";
         OS << Op.Class->Name;
@@ -2895,7 +2834,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   OS << "  // Find the appropriate table for this asm variant.\n";
   OS << "  const MatchEntry *Start, *End;\n";
   OS << "  switch (VariantID) {\n";
-  OS << "  default: // unreachable\n";
+  OS << "  default: llvm_unreachable(\"invalid variant!\");\n";
   for (unsigned VC = 0; VC != VariantCount; ++VC) {
     Record *AsmVariant = Target.getAsmParserVariant(VC);
     int AsmVariantNo = AsmVariant->getValueAsInt("Variant");
@@ -2911,10 +2850,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
 
   // Finally, build the match function.
   OS << "unsigned " << Target.getName() << ClassName << "::\n"
-     << "MatchInstructionImpl(const OperandVector"
-     << " &Operands,\n";
-  OS << "                     MCInst &Inst,\n"
-     << "uint64_t &ErrorInfo, bool matchingInlineAsm, unsigned VariantID) {\n";
+     << "MatchInstructionImpl(const OperandVector &Operands,\n";
+  OS << "                     MCInst &Inst, uint64_t &ErrorInfo,\n"
+     << "                     bool matchingInlineAsm, unsigned VariantID) {\n";
 
   OS << "  // Eliminate obvious mismatches.\n";
   OS << "  if (Operands.size() > " << (MaxNumOperands+1) << ") {\n";
@@ -2949,7 +2887,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   OS << "  // Find the appropriate table for this asm variant.\n";
   OS << "  const MatchEntry *Start, *End;\n";
   OS << "  switch (VariantID) {\n";
-  OS << "  default: // unreachable\n";
+  OS << "  default: llvm_unreachable(\"invalid variant!\");\n";
   for (unsigned VC = 0; VC != VariantCount; ++VC) {
     Record *AsmVariant = Target.getAsmParserVariant(VC);
     int AsmVariantNo = AsmVariant->getValueAsInt("Variant");
@@ -3018,12 +2956,13 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   OS << "      HadMatchOtherThanFeatures = true;\n";
   OS << "      uint64_t NewMissingFeatures = it->RequiredFeatures & "
         "~AvailableFeatures;\n";
-  OS << "      if (CountPopulation_64(NewMissingFeatures) <=\n"
-        "          CountPopulation_64(MissingFeatures))\n";
+  OS << "      if (countPopulation(NewMissingFeatures) <=\n"
+        "          countPopulation(MissingFeatures))\n";
   OS << "        MissingFeatures = NewMissingFeatures;\n";
   OS << "      continue;\n";
   OS << "    }\n";
   OS << "\n";
+  OS << "    Inst.clear();\n\n";
   OS << "    if (matchingInlineAsm) {\n";
   OS << "      Inst.setOpcode(it->Opcode);\n";
   OS << "      convertToMapAndConstraints(it->ConvertFn, Operands);\n";
@@ -3072,7 +3011,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   OS << "  return Match_MissingFeature;\n";
   OS << "}\n\n";
 
-  if (Info.OperandMatchInfo.size())
+  if (!Info.OperandMatchInfo.empty())
     emitCustomOperandParsing(OS, Target, Info, ClassName, StringTable,
                              MaxMnemonicIndex);
 
diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index 7ef70d3..587de26 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp
@@ -208,9 +208,6 @@ FindUniqueOperandCommands(std::vector<std::string> &UniqueOperandCommands,
       // Otherwise, scan to see if all of the other instructions in this command
       // set share the operand.
       bool AllSame = true;
-      // Keep track of the maximum, number of operands or any
-      // instruction we see in the group.
-      size_t MaxSize = FirstInst->Operands.size();
 
       for (NIT = std::find(NIT+1, InstIdxs.end(), CommandIdx);
            NIT != InstIdxs.end();
@@ -220,10 +217,6 @@ FindUniqueOperandCommands(std::vector<std::string> &UniqueOperandCommands,
         const AsmWriterInst *OtherInst =
           getAsmWriterInstByID(NIT-InstIdxs.begin());
 
-        if (OtherInst &&
-            OtherInst->Operands.size() > FirstInst->Operands.size())
-          MaxSize = std::max(MaxSize, OtherInst->Operands.size());
-
         if (!OtherInst || OtherInst->Operands.size() == Op ||
             OtherInst->Operands[Op] != FirstInst->Operands[Op]) {
           AllSame = false;
@@ -350,7 +343,7 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
   // in the opcode-indexed table.
   unsigned BitsLeft = 64-AsmStrBits;
 
-  std::vector<std::vector<std::string> > TableDrivenOperandPrinters;
+  std::vector<std::vector<std::string>> TableDrivenOperandPrinters;
 
   while (1) {
     std::vector<std::string> UniqueOperandCommands;
@@ -393,7 +386,7 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
     }
 
     // Remember the handlers for this set of operands.
-    TableDrivenOperandPrinters.push_back(UniqueOperandCommands);
+    TableDrivenOperandPrinters.push_back(std::move(UniqueOperandCommands));
   }
 
 
@@ -474,7 +467,7 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
       O << "  switch ((Bits >> "
         << (64-BitsLeft) << ") & "
         << ((1 << NumBits)-1) << ") {\n"
-        << "  default:   // unreachable.\n";
+        << "  default: llvm_unreachable(\"Invalid command number.\");\n";
 
       // Print out all the cases.
       for (unsigned i = 0, e = Commands.size(); i != e; ++i) {
@@ -520,14 +513,23 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
   O << "}\n";
 }
 
+static const char *getMinimalTypeForRange(uint64_t Range) {
+  assert(Range < 0xFFFFFFFFULL && "Enum too large");
+  if (Range > 0xFFFF)
+    return "uint32_t";
+  if (Range > 0xFF)
+    return "uint16_t";
+  return "uint8_t";
+}
+
 static void
 emitRegisterNameString(raw_ostream &O, StringRef AltName,
-                       const std::vector<CodeGenRegister*> &Registers) {
+                       const std::deque<CodeGenRegister> &Registers) {
   SequenceToOffsetTable<std::string> StringTable;
   SmallVector<std::string, 4> AsmNames(Registers.size());
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
-    const CodeGenRegister &Reg = *Registers[i];
-    std::string &AsmName = AsmNames[i];
+  unsigned i = 0;
+  for (const auto &Reg : Registers) {
+    std::string &AsmName = AsmNames[i++];
 
     // "NoRegAltName" is special. We don't need to do a lookup for that,
     // as it's just a reference to the default register name.
@@ -564,7 +566,8 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName,
   StringTable.emit(O, printChar);
   O << "  };\n\n";
 
-  O << "  static const uint32_t RegAsmOffset" << AltName << "[] = {";
+  O << "  static const " << getMinimalTypeForRange(StringTable.size()-1)
+    << " RegAsmOffset" << AltName << "[] = {";
   for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
     if ((i % 14) == 0)
       O << "\n    ";
@@ -577,8 +580,7 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName,
 void AsmWriterEmitter::EmitGetRegisterName(raw_ostream &O) {
   Record *AsmWriter = Target.getAsmWriter();
   std::string ClassName = AsmWriter->getValueAsString("AsmWriterClassName");
-  const std::vector<CodeGenRegister*> &Registers =
-    Target.getRegBank().getRegisters();
+  const auto &Registers = Target.getRegBank().getRegisters();
   std::vector<Record*> AltNameIndices = Target.getRegAltNameIndices();
   bool hasAltNames = AltNameIndices.size() > 1;
 
@@ -602,26 +604,25 @@ void AsmWriterEmitter::EmitGetRegisterName(raw_ostream &O) {
     emitRegisterNameString(O, "", Registers);
 
   if (hasAltNames) {
-    O << "  const uint32_t *RegAsmOffset;\n"
-      << "  const char *AsmStrs;\n"
-      << "  switch(AltIdx) {\n"
+    O << "  switch(AltIdx) {\n"
       << "  default: llvm_unreachable(\"Invalid register alt name index!\");\n";
     for (unsigned i = 0, e = AltNameIndices.size(); i < e; ++i) {
       std::string Namespace = AltNameIndices[1]->getValueAsString("Namespace");
       std::string AltName(AltNameIndices[i]->getName());
-      O << "  case " << Namespace << "::" << AltName
-        << ":\n"
-        << "    AsmStrs = AsmStrs" << AltName  << ";\n"
-        << "    RegAsmOffset = RegAsmOffset" << AltName << ";\n"
-        << "    break;\n";
+      O << "  case " << Namespace << "::" << AltName << ":\n"
+        << "    assert(*(AsmStrs" << AltName << "+RegAsmOffset"
+        << AltName << "[RegNo-1]) &&\n"
+        << "           \"Invalid alt name index for register!\");\n"
+        << "    return AsmStrs" << AltName << "+RegAsmOffset"
+        << AltName << "[RegNo-1];\n";
     }
-    O << "}\n";
+    O << "  }\n";
+  } else {
+    O << "  assert (*(AsmStrs+RegAsmOffset[RegNo-1]) &&\n"
+      << "          \"Invalid alt name index for register!\");\n"
+      << "  return AsmStrs+RegAsmOffset[RegNo-1];\n";
   }
-
-  O << "  assert (*(AsmStrs+RegAsmOffset[RegNo-1]) &&\n"
-    << "          \"Invalid alt name index for register!\");\n"
-    << "  return AsmStrs+RegAsmOffset[RegNo-1];\n"
-    << "}\n";
+  O << "}\n";
 }
 
 namespace {
@@ -654,20 +655,26 @@ public:
   std::pair<StringRef, StringRef::iterator> parseName(StringRef::iterator Start,
                                                       StringRef::iterator End) {
     StringRef::iterator I = Start;
+    StringRef::iterator Next;
     if (*I == '{') {
       // ${some_name}
       Start = ++I;
       while (I != End && *I != '}')
         ++I;
+      Next = I;
+      // eat the final '}'
+      if (Next != End)
+        ++Next;
     } else {
       // $name, just eat the usual suspects.
       while (I != End &&
              ((*I >= 'a' && *I <= 'z') || (*I >= 'A' && *I <= 'Z') ||
               (*I >= '0' && *I <= '9') || *I == '_'))
         ++I;
+      Next = I;
     }
 
-    return std::make_pair(StringRef(Start, I - Start), I);
+    return std::make_pair(StringRef(Start, I - Start), Next);
   }
 
   void print(raw_ostream &O) {
@@ -991,7 +998,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
     return;
   }
 
-  if (MCOpPredicates.size())
+  if (!MCOpPredicates.empty())
     O << "static bool " << Target.getName() << ClassName
       << "ValidateMCOperand(\n"
       << "       const MCOperand &MCOp, unsigned PredicateIndex);\n";
@@ -1057,7 +1064,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
   }    
   O << "}\n\n";
 
-  if (MCOpPredicates.size()) {
+  if (!MCOpPredicates.empty()) {
     O << "static bool " << Target.getName() << ClassName
       << "ValidateMCOperand(\n"
       << "       const MCOperand &MCOp, unsigned PredicateIndex) {\n"
@@ -1084,13 +1091,10 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
 
 AsmWriterEmitter::AsmWriterEmitter(RecordKeeper &R) : Records(R), Target(R) {
   Record *AsmWriter = Target.getAsmWriter();
-  for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
-                                    E = Target.inst_end();
-       I != E; ++I)
-    if (!(*I)->AsmString.empty() && (*I)->TheDef->getName() != "PHI")
+  for (const CodeGenInstruction *I : Target.instructions())
+    if (!I->AsmString.empty() && I->TheDef->getName() != "PHI")
       Instructions.push_back(
-          AsmWriterInst(**I, AsmWriter->getValueAsInt("Variant"),
-                        AsmWriter->getValueAsInt("OperandSpacing")));
+          AsmWriterInst(*I, AsmWriter->getValueAsInt("Variant")));
 
   // Get the instruction numbering.
   NumberedInstructions = &Target.getInstructionsByEnumValue();
diff --git a/utils/TableGen/AsmWriterInst.cpp b/utils/TableGen/AsmWriterInst.cpp
index 5d9ead1..6ddc510 100644
--- a/utils/TableGen/AsmWriterInst.cpp
+++ b/utils/TableGen/AsmWriterInst.cpp
@@ -48,9 +48,7 @@ std::string AsmWriterOperand::getCode() const {
 /// ParseAsmString - Parse the specified Instruction's AsmString into this
 /// AsmWriterInst.
 ///
-AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI,
-                             unsigned Variant,
-                             int OperandSpacing) {
+AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI, unsigned Variant) {
   this->CGI = &CGI;
 
   // NOTE: Any extensions to this code need to be mirrored in the
diff --git a/utils/TableGen/AsmWriterInst.h b/utils/TableGen/AsmWriterInst.h
index fd77982..6a900b7 100644
--- a/utils/TableGen/AsmWriterInst.h
+++ b/utils/TableGen/AsmWriterInst.h
@@ -88,8 +88,7 @@ namespace llvm {
     const CodeGenInstruction *CGI;
 
     AsmWriterInst(const CodeGenInstruction &CGI,
-                  unsigned Variant,
-                  int OperandSpacing);
+                  unsigned Variant);
 
     /// MatchesAllButOneOp - If this instruction is exactly identical to the
     /// specified instruction except for one differing operand, return the
diff --git a/utils/TableGen/CallingConvEmitter.cpp b/utils/TableGen/CallingConvEmitter.cpp
index 6a65e5e..051a7e9 100644
--- a/utils/TableGen/CallingConvEmitter.cpp
+++ b/utils/TableGen/CallingConvEmitter.cpp
@@ -124,7 +124,7 @@ void CallingConvEmitter::EmitAction(Record *Action,
         }
         O << "\n" << IndentStr << "};\n";
         O << IndentStr << "if (unsigned Reg = State.AllocateReg(RegList"
-          << Counter << ", " << RegList->getSize() << ")) {\n";
+          << Counter << ")) {\n";
       }
       O << IndentStr << "  State.addLoc(CCValAssign::getReg(ValNo, ValVT, "
         << "Reg, LocVT, LocInfo));\n";
@@ -166,7 +166,7 @@ void CallingConvEmitter::EmitAction(Record *Action,
 
         O << IndentStr << "if (unsigned Reg = State.AllocateReg(RegList"
           << RegListNumber << ", " << "RegList" << ShadowRegListNumber
-          << ", " << RegList->getSize() << ")) {\n";
+          << ")) {\n";
       }
       O << IndentStr << "  State.addLoc(CCValAssign::getReg(ValNo, ValVT, "
         << "Reg, LocVT, LocInfo));\n";
@@ -182,14 +182,14 @@ void CallingConvEmitter::EmitAction(Record *Action,
         O << Size << ", ";
       else
         O << "\n" << IndentStr
-          << "  State.getMachineFunction().getSubtarget().getDataLayout()"
+          << "  State.getMachineFunction().getTarget().getDataLayout()"
              "->getTypeAllocSize(EVT(LocVT).getTypeForEVT(State.getContext())),"
              " ";
       if (Align)
         O << Align;
       else
         O << "\n" << IndentStr
-          << "  State.getMachineFunction().getSubtarget().getDataLayout()"
+          << "  State.getMachineFunction().getTarget().getDataLayout()"
              "->getABITypeAlignment(EVT(LocVT).getTypeForEVT(State.getContext()"
              "))";
       O << ");\n" << IndentStr
@@ -215,8 +215,7 @@ void CallingConvEmitter::EmitAction(Record *Action,
       O << IndentStr << "unsigned Offset" << ++Counter
         << " = State.AllocateStack("
         << Size << ", " << Align << ", "
-        << "ShadowRegList" << ShadowRegListNumber << ", "
-        << ShadowRegList->getSize() << ");\n";
+        << "ShadowRegList" << ShadowRegListNumber << ");\n";
       O << IndentStr << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset"
         << Counter << ", LocVT, LocInfo));\n";
       O << IndentStr << "return false;\n";
diff --git a/utils/TableGen/CodeEmitterGen.cpp b/utils/TableGen/CodeEmitterGen.cpp
index 11911b6..46fcdf5 100644
--- a/utils/TableGen/CodeEmitterGen.cpp
+++ b/utils/TableGen/CodeEmitterGen.cpp
@@ -96,7 +96,7 @@ AddCodeToMergeInOperand(Record *R, BitsInit *BI, const std::string &VarName,
     /// generated emitter, skip it.
     while (NumberedOp < NumberOps &&
            (CGI.Operands.isFlatOperandNotEmitted(NumberedOp) ||
-              (NamedOpIndices.size() && NamedOpIndices.count(
+              (!NamedOpIndices.empty() && NamedOpIndices.count(
                 CGI.Operands.getSubOperandNumber(NumberedOp).first)))) {
       ++NumberedOp;
 
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index a750aa9..4e3e588 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -53,7 +53,7 @@ EEVT::TypeSet::TypeSet(MVT::SimpleValueType VT, TreePattern &TP) {
     EnforceVector(TP);
   else {
     assert((VT < MVT::LAST_VALUETYPE || VT == MVT::iPTR ||
-            VT == MVT::iPTRAny) && "Not a concrete type!");
+            VT == MVT::iPTRAny || VT == MVT::Any) && "Not a concrete type!");
     TypeVec.push_back(VT);
   }
 }
@@ -1113,6 +1113,8 @@ static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) {
 
     // FIXME: Should allow access to all the results here.
     unsigned NumDefsToAdd = InstInfo.Operands.NumDefs ? 1 : 0;
+    if (InstInfo.hasTwoExplicitDefs)
+      ++NumDefsToAdd;
 
     // Add on one implicit def if it has a resolvable type.
     if (InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()) !=MVT::Other)
@@ -1539,6 +1541,22 @@ static bool isOperandClass(const TreePatternNode *N, StringRef Class) {
 
   return false;
 }
+
+static void emitTooManyOperandsError(TreePattern &TP,
+                                     StringRef InstName,
+                                     unsigned Expected,
+                                     unsigned Actual) {
+  TP.error("Instruction '" + InstName + "' was provided " + Twine(Actual) +
+           " operands but expected only " + Twine(Expected) + "!");
+}
+
+static void emitTooFewOperandsError(TreePattern &TP,
+                                    StringRef InstName,
+                                    unsigned Actual) {
+  TP.error("Instruction '" + InstName +
+           "' expects more than the provided " + Twine(Actual) + " operands!");
+}
+
 /// ApplyTypeConstraints - Apply all of the type constraints relevant to
 /// this node and its children in the tree.  This returns true if it makes a
 /// change, false otherwise.  If a type contradiction is found, flag an error.
@@ -1593,11 +1611,20 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
     assert(getNumTypes() == 0 && "Set doesn't produce a value");
     assert(getNumChildren() >= 2 && "Missing RHS of a set?");
     unsigned NC = getNumChildren();
+    unsigned NumOfSrcs = NC-1;
 
+    // destination
     TreePatternNode *SetVal = getChild(NC-1);
     bool MadeChange = SetVal->ApplyTypeConstraints(TP, NotRegisters);
 
-    for (unsigned i = 0; i < NC-1; ++i) {
+    // second explicit destination
+    if (TP.getRecord()->getValueAsBit("hasTwoExplicitDefs")) {
+      TreePatternNode *Set2Val = getChild(NC-2);
+      MadeChange = Set2Val->ApplyTypeConstraints(TP, NotRegisters);
+      NumOfSrcs --;
+    }
+    
+    for (unsigned i = 0; i < NumOfSrcs; ++i) {
       TreePatternNode *Child = getChild(i);
       MadeChange |= Child->ApplyTypeConstraints(TP, NotRegisters);
 
@@ -1741,8 +1768,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
 
       // Verify that we didn't run out of provided operands.
       if (ChildNo >= getNumChildren()) {
-        TP.error("Instruction '" + getOperator()->getName() +
-                 "' expects more operands than were provided.");
+        emitTooFewOperandsError(TP, getOperator()->getName(), getNumChildren());
         return false;
       }
 
@@ -1766,8 +1792,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
             // And the remaining sub-operands against subsequent children.
             for (unsigned Arg = 1; Arg < NumArgs; ++Arg) {
               if (ChildNo >= getNumChildren()) {
-                TP.error("Instruction '" + getOperator()->getName() +
-                         "' expects more operands than were provided.");
+                emitTooFewOperandsError(TP, getOperator()->getName(),
+                                        getNumChildren());
                 return false;
               }
               Child = getChild(ChildNo++);
@@ -1787,8 +1813,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
     }
 
     if (!InstInfo.Operands.isVariadic && ChildNo != getNumChildren()) {
-      TP.error("Instruction '" + getOperator()->getName() +
-               "' was provided too many operands!");
+      emitTooManyOperandsError(TP, getOperator()->getName(),
+                               ChildNo, getNumChildren());
       return false;
     }
 
@@ -2554,8 +2580,10 @@ FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
       I->error("set destination should be a register!");
 
     DefInit *Val = dyn_cast<DefInit>(Dest->getLeafValue());
-    if (!Val)
+    if (!Val) {
       I->error("set destination should be a register!");
+      continue;
+    }
 
     if (Val->getDef()->isSubClassOf("RegisterClass") ||
         Val->getDef()->isSubClassOf("ValueType") ||
@@ -2839,7 +2867,7 @@ const DAGInstruction &CodeGenDAGPatterns::parseInstructionPattern(
 
     // Check that all of the results occur first in the list.
     std::vector<Record*> Results;
-    TreePatternNode *Res0Node = nullptr;
+    SmallVector<TreePatternNode *, 2> ResNode;
     for (unsigned i = 0; i != NumResults; ++i) {
       if (i == CGI.Operands.size())
         I->error("'" + InstResults.begin()->first +
@@ -2851,8 +2879,7 @@ const DAGInstruction &CodeGenDAGPatterns::parseInstructionPattern(
       if (!RNode)
         I->error("Operand $" + OpName + " does not exist in operand list!");
 
-      if (i == 0)
-        Res0Node = RNode;
+      ResNode.push_back(RNode);
       Record *R = cast<DefInit>(RNode->getLeafValue())->getDef();
       if (!R)
         I->error("Operand $" + OpName + " should be a set destination: all "
@@ -2929,7 +2956,7 @@ const DAGInstruction &CodeGenDAGPatterns::parseInstructionPattern(
                           GetNumNodeResults(I->getRecord(), *this));
     // Copy fully inferred output node type to instruction result pattern.
     for (unsigned i = 0; i != NumResults; ++i)
-      ResultPattern->setType(i, Res0Node->getExtType(i));
+      ResultPattern->setType(i, ResNode[i]->getExtType(0));
 
     // Create and insert the instruction.
     // FIXME: InstImpResults should not be part of DAGInstruction.
@@ -3111,13 +3138,6 @@ void CodeGenDAGPatterns::InferInstructionFlags() {
     CodeGenInstruction &InstInfo =
       const_cast<CodeGenInstruction &>(*Instructions[i]);
 
-    // Treat neverHasSideEffects = 1 as the equivalent of hasSideEffects = 0.
-    // This flag is obsolete and will be removed.
-    if (InstInfo.neverHasSideEffects) {
-      assert(!InstInfo.hasSideEffects);
-      InstInfo.hasSideEffects_Unset = false;
-    }
-
     // Get the primary instruction pattern.
     const TreePattern *Pattern = getInstruction(InstInfo.TheDef).getPattern();
     if (!Pattern) {
diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp
index d567dde..b1e4318 100644
--- a/utils/TableGen/CodeGenInstruction.cpp
+++ b/utils/TableGen/CodeGenInstruction.cpp
@@ -68,10 +68,13 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
     std::string PrintMethod = "printOperand";
     std::string EncoderMethod;
     std::string OperandType = "OPERAND_UNKNOWN";
+    std::string OperandNamespace = "MCOI";
     unsigned NumOps = 1;
     DagInit *MIOpInfo = nullptr;
     if (Rec->isSubClassOf("RegisterOperand")) {
       PrintMethod = Rec->getValueAsString("PrintMethod");
+      OperandType = Rec->getValueAsString("OperandType");
+      OperandNamespace = Rec->getValueAsString("OperandNamespace");
     } else if (Rec->isSubClassOf("Operand")) {
       PrintMethod = Rec->getValueAsString("PrintMethod");
       OperandType = Rec->getValueAsString("OperandType");
@@ -113,8 +116,8 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
                       Twine(i) + " has the same name as a previous operand!");
 
     OperandList.push_back(OperandInfo(Rec, ArgName, PrintMethod, EncoderMethod,
-                                      OperandType, MIOperandNo, NumOps,
-                                      MIOpInfo));
+                                      OperandNamespace + "::" + OperandType,
+				      MIOperandNo, NumOps, MIOpInfo));
     MIOperandNo += NumOps;
   }
 
@@ -317,6 +320,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
   isRegSequence = R->getValueAsBit("isRegSequence");
   isExtractSubreg = R->getValueAsBit("isExtractSubreg");
   isInsertSubreg = R->getValueAsBit("isInsertSubreg");
+  hasTwoExplicitDefs = R->getValueAsBit("hasTwoExplicitDefs");
 
   bool Unset;
   mayLoad      = R->getValueAsBitOrUnset("mayLoad", Unset);
@@ -325,7 +329,6 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
   mayStore_Unset = Unset;
   hasSideEffects = R->getValueAsBitOrUnset("hasSideEffects", Unset);
   hasSideEffects_Unset = Unset;
-  neverHasSideEffects = R->getValueAsBit("neverHasSideEffects");
 
   isAsCheapAsAMove = R->getValueAsBit("isAsCheapAsAMove");
   hasExtraSrcRegAllocReq = R->getValueAsBit("hasExtraSrcRegAllocReq");
@@ -335,9 +338,6 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
   ImplicitDefs = R->getValueAsListOfDefs("Defs");
   ImplicitUses = R->getValueAsListOfDefs("Uses");
 
-  if (neverHasSideEffects + hasSideEffects > 1)
-    PrintFatalError(R->getName() + ": multiple conflicting side-effect flags set!");
-
   // Parse Constraints.
   ParseConstraints(R->getValueAsString("Constraints"), Operands);
 
@@ -541,7 +541,7 @@ bool CodeGenInstAlias::tryAliasOpMatch(DagInit *Result, unsigned AliasOpNo,
   // If both are Operands with the same MVT, allow the conversion. It's
   // up to the user to make sure the values are appropriate, just like
   // for isel Pat's.
-  if (InstOpRec->isSubClassOf("Operand") &&
+  if (InstOpRec->isSubClassOf("Operand") && ADI &&
       ADI->getDef()->isSubClassOf("Operand")) {
     // FIXME: What other attributes should we check here? Identical
     // MIOperandInfo perhaps?
diff --git a/utils/TableGen/CodeGenInstruction.h b/utils/TableGen/CodeGenInstruction.h
index 92aac5f..82b23f4 100644
--- a/utils/TableGen/CodeGenInstruction.h
+++ b/utils/TableGen/CodeGenInstruction.h
@@ -247,7 +247,6 @@ namespace llvm {
     bool isNotDuplicable : 1;
     bool hasSideEffects : 1;
     bool hasSideEffects_Unset : 1;
-    bool neverHasSideEffects : 1;
     bool isAsCheapAsAMove : 1;
     bool hasExtraSrcRegAllocReq : 1;
     bool hasExtraDefRegAllocReq : 1;
@@ -256,6 +255,7 @@ namespace llvm {
     bool isRegSequence : 1;
     bool isExtractSubreg : 1;
     bool isInsertSubreg : 1;
+    bool hasTwoExplicitDefs : 1;
 
     std::string DeprecatedReason;
     bool HasComplexDeprecationPredicate;
diff --git a/utils/TableGen/CodeGenMapTable.cpp b/utils/TableGen/CodeGenMapTable.cpp
index 7e5aa9c..8abea48 100644
--- a/utils/TableGen/CodeGenMapTable.cpp
+++ b/utils/TableGen/CodeGenMapTable.cpp
@@ -376,7 +376,7 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
     std::vector<Record*> ColInstrs = MapTable[CurInstr];
     std::string OutStr("");
     unsigned RelExists = 0;
-    if (ColInstrs.size()) {
+    if (!ColInstrs.empty()) {
       for (unsigned j = 0; j < NumCol; j++) {
         if (ColInstrs[j] != nullptr) {
           RelExists = 1;
@@ -567,7 +567,7 @@ void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
   std::vector<Record*> InstrMapVec;
   InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
 
-  if (!InstrMapVec.size())
+  if (InstrMapVec.empty())
     return;
 
   OS << "#ifdef GET_INSTRMAP_INFO\n";
diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
index 678222f..37f8905 100644
--- a/utils/TableGen/CodeGenRegisters.cpp
+++ b/utils/TableGen/CodeGenRegisters.cpp
@@ -82,7 +82,7 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) {
   }
 }
 
-unsigned CodeGenSubRegIndex::computeLaneMask() {
+unsigned CodeGenSubRegIndex::computeLaneMask() const {
   // Already computed?
   if (LaneMask)
     return LaneMask;
@@ -92,8 +92,8 @@ unsigned CodeGenSubRegIndex::computeLaneMask() {
 
   // The lane mask is simply the union of all sub-indices.
   unsigned M = 0;
-  for (CompMap::iterator I = Composed.begin(), E = Composed.end(); I != E; ++I)
-    M |= I->second->computeLaneMask();
+  for (const auto &C : Composed)
+    M |= C.second->computeLaneMask();
   assert(M && "Missing lane mask, sub-register cycle?");
   LaneMask = M;
   return LaneMask;
@@ -108,7 +108,6 @@ CodeGenRegister::CodeGenRegister(Record *R, unsigned Enum)
     EnumValue(Enum),
     CostPerUse(R->getValueAsInt("CostPerUse")),
     CoveredBySubRegs(R->getValueAsBit("CoveredBySubRegs")),
-    NumNativeRegUnits(0),
     SubRegsComplete(false),
     SuperRegsComplete(false),
     TopoSig(~0u)
@@ -146,17 +145,18 @@ void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) {
 }
 
 const std::string &CodeGenRegister::getName() const {
+  assert(TheDef && "no def");
   return TheDef->getName();
 }
 
 namespace {
 // Iterate over all register units in a set of registers.
 class RegUnitIterator {
-  CodeGenRegister::Set::const_iterator RegI, RegE;
-  CodeGenRegister::RegUnitList::const_iterator UnitI, UnitE;
+  CodeGenRegister::Vec::const_iterator RegI, RegE;
+  CodeGenRegister::RegUnitList::iterator UnitI, UnitE;
 
 public:
-  RegUnitIterator(const CodeGenRegister::Set &Regs):
+  RegUnitIterator(const CodeGenRegister::Vec &Regs):
     RegI(Regs.begin()), RegE(Regs.end()), UnitI(), UnitE() {
 
     if (RegI != RegE) {
@@ -191,32 +191,23 @@ protected:
 };
 } // namespace
 
-// Merge two RegUnitLists maintaining the order and removing duplicates.
-// Overwrites MergedRU in the process.
-static void mergeRegUnits(CodeGenRegister::RegUnitList &MergedRU,
-                          const CodeGenRegister::RegUnitList &RRU) {
-  CodeGenRegister::RegUnitList LRU = MergedRU;
-  MergedRU.clear();
-  std::set_union(LRU.begin(), LRU.end(), RRU.begin(), RRU.end(),
-                 std::back_inserter(MergedRU));
-}
-
 // Return true of this unit appears in RegUnits.
 static bool hasRegUnit(CodeGenRegister::RegUnitList &RegUnits, unsigned Unit) {
-  return std::count(RegUnits.begin(), RegUnits.end(), Unit);
+  return RegUnits.test(Unit);
 }
 
 // Inherit register units from subregisters.
 // Return true if the RegUnits changed.
 bool CodeGenRegister::inheritRegUnits(CodeGenRegBank &RegBank) {
-  unsigned OldNumUnits = RegUnits.size();
+  bool changed = false;
   for (SubRegMap::const_iterator I = SubRegs.begin(), E = SubRegs.end();
        I != E; ++I) {
     CodeGenRegister *SR = I->second;
     // Merge the subregister's units into this register's RegUnits.
-    mergeRegUnits(RegUnits, SR->RegUnits);
+    changed |= (RegUnits |= SR->RegUnits);
   }
-  return OldNumUnits != RegUnits.size();
+
+  return changed;
 }
 
 const CodeGenRegister::SubRegMap &
@@ -362,14 +353,8 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
   // sub-registers, the other registers won't contribute any more units.
   for (unsigned i = 0, e = ExplicitSubRegs.size(); i != e; ++i) {
     CodeGenRegister *SR = ExplicitSubRegs[i];
-    // Explicit sub-registers are usually disjoint, so this is a good way of
-    // computing the union. We may pick up a few duplicates that will be
-    // eliminated below.
-    unsigned N = RegUnits.size();
-    RegUnits.append(SR->RegUnits.begin(), SR->RegUnits.end());
-    std::inplace_merge(RegUnits.begin(), RegUnits.begin() + N, RegUnits.end());
+    RegUnits |= SR->RegUnits;
   }
-  RegUnits.erase(std::unique(RegUnits.begin(), RegUnits.end()), RegUnits.end());
 
   // Absent any ad hoc aliasing, we create one register unit per leaf register.
   // These units correspond to the maximal cliques in the register overlap
@@ -388,19 +373,19 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
     // Create a RegUnit representing this alias edge, and add it to both
     // registers.
     unsigned Unit = RegBank.newRegUnit(this, AR);
-    RegUnits.push_back(Unit);
-    AR->RegUnits.push_back(Unit);
+    RegUnits.set(Unit);
+    AR->RegUnits.set(Unit);
   }
 
   // Finally, create units for leaf registers without ad hoc aliases. Note that
   // a leaf register with ad hoc aliases doesn't get its own unit - it isn't
   // necessary. This means the aliasing leaf registers can share a single unit.
   if (RegUnits.empty())
-    RegUnits.push_back(RegBank.newRegUnit(this));
+    RegUnits.set(RegBank.newRegUnit(this));
 
   // We have now computed the native register units. More may be adopted later
   // for balancing purposes.
-  NumNativeRegUnits = RegUnits.size();
+  NativeRegUnits = RegUnits;
 
   return SubRegs;
 }
@@ -534,7 +519,7 @@ CodeGenRegister::addSubRegsPreOrder(SetVector<const CodeGenRegister*> &OSet,
 // Get the sum of this register's unit weights.
 unsigned CodeGenRegister::getWeight(const CodeGenRegBank &RegBank) const {
   unsigned Weight = 0;
-  for (RegUnitList::const_iterator I = RegUnits.begin(), E = RegUnits.end();
+  for (RegUnitList::iterator I = RegUnits.begin(), E = RegUnits.end();
        I != E; ++I) {
     Weight += RegBank.getRegUnit(*I).Weight;
   }
@@ -657,17 +642,21 @@ struct TupleExpander : SetTheory::Expander {
 //                            CodeGenRegisterClass
 //===----------------------------------------------------------------------===//
 
+static void sortAndUniqueRegisters(CodeGenRegister::Vec &M) {
+  std::sort(M.begin(), M.end(), deref<llvm::less>());
+  M.erase(std::unique(M.begin(), M.end(), deref<llvm::equal>()), M.end());
+}
+
 CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
   : TheDef(R),
     Name(R->getName()),
     TopoSigs(RegBank.getNumTopoSigs()),
-    EnumValue(-1) {
+    EnumValue(-1),
+    LaneMask(0) {
   // Rename anonymous register classes.
   if (R->getName().size() > 9 && R->getName()[9] == '.') {
     static unsigned AnonCounter = 0;
-    R->setName("AnonRegClass_" + utostr(AnonCounter));
-    // MSVC2012 ICEs if AnonCounter++ is directly passed to utostr.
-    ++AnonCounter;
+    R->setName("AnonRegClass_" + utostr(AnonCounter++));
   }
 
   std::vector<Record*> TypeList = R->getValueAsListOfDefs("RegTypes");
@@ -689,9 +678,10 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
   for (unsigned i = 0, e = Elements->size(); i != e; ++i) {
     Orders[0].push_back((*Elements)[i]);
     const CodeGenRegister *Reg = RegBank.getReg((*Elements)[i]);
-    Members.insert(Reg);
+    Members.push_back(Reg);
     TopoSigs.set(Reg->getTopoSig());
   }
+  sortAndUniqueRegisters(Members);
 
   // Alternative allocation orders may be subsets.
   SetTheory::RecSet Order;
@@ -733,9 +723,8 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank,
     SpillAlignment(Props.SpillAlignment),
     CopyCost(0),
     Allocatable(true) {
-  for (CodeGenRegister::Set::iterator I = Members.begin(), E = Members.end();
-       I != E; ++I)
-    TopoSigs.set((*I)->getTopoSig());
+  for (const auto R : Members)
+    TopoSigs.set(R->getTopoSig());
 }
 
 // Compute inherited propertied for a synthesized register class.
@@ -764,15 +753,15 @@ void CodeGenRegisterClass::inheritProperties(CodeGenRegBank &RegBank) {
 }
 
 bool CodeGenRegisterClass::contains(const CodeGenRegister *Reg) const {
-  return Members.count(Reg);
+  return std::binary_search(Members.begin(), Members.end(), Reg,
+                            deref<llvm::less>());
 }
 
 namespace llvm {
   raw_ostream &operator<<(raw_ostream &OS, const CodeGenRegisterClass::Key &K) {
     OS << "{ S=" << K.SpillSize << ", A=" << K.SpillAlignment;
-    for (CodeGenRegister::Set::const_iterator I = K.Members->begin(),
-         E = K.Members->end(); I != E; ++I)
-      OS << ", " << (*I)->getName();
+    for (const auto R : *K.Members)
+      OS << ", " << R->getName();
     return OS << " }";
   }
 }
@@ -798,10 +787,10 @@ operator<(const CodeGenRegisterClass::Key &B) const {
 static bool testSubClass(const CodeGenRegisterClass *A,
                          const CodeGenRegisterClass *B) {
   return A->SpillAlignment && B->SpillAlignment % A->SpillAlignment == 0 &&
-    A->SpillSize <= B->SpillSize &&
-    std::includes(A->getMembers().begin(), A->getMembers().end(),
-                  B->getMembers().begin(), B->getMembers().end(),
-                  CodeGenRegister::Less());
+         A->SpillSize <= B->SpillSize &&
+         std::includes(A->getMembers().begin(), A->getMembers().end(),
+                       B->getMembers().begin(), B->getMembers().end(),
+                       deref<llvm::less>());
 }
 
 /// Sorting predicate for register classes.  This provides a topological
@@ -810,34 +799,34 @@ static bool testSubClass(const CodeGenRegisterClass *A,
 /// Register classes with the same registers, spill size, and alignment form a
 /// clique.  They will be ordered alphabetically.
 ///
-static int TopoOrderRC(CodeGenRegisterClass *const *PA,
-                       CodeGenRegisterClass *const *PB) {
-  const CodeGenRegisterClass *A = *PA;
-  const CodeGenRegisterClass *B = *PB;
+static bool TopoOrderRC(const CodeGenRegisterClass &PA,
+                        const CodeGenRegisterClass &PB) {
+  auto *A = &PA;
+  auto *B = &PB;
   if (A == B)
     return 0;
 
   // Order by ascending spill size.
   if (A->SpillSize < B->SpillSize)
-    return -1;
+    return true;
   if (A->SpillSize > B->SpillSize)
-    return 1;
+    return false;
 
   // Order by ascending spill alignment.
   if (A->SpillAlignment < B->SpillAlignment)
-    return -1;
+    return true;
   if (A->SpillAlignment > B->SpillAlignment)
-    return 1;
+    return false;
 
   // Order by descending set size.  Note that the classes' allocation order may
   // not have been computed yet.  The Members set is always vaild.
   if (A->getMembers().size() > B->getMembers().size())
-    return -1;
+    return true;
   if (A->getMembers().size() < B->getMembers().size())
-    return 1;
+    return false;
 
   // Finally order by name as a tie breaker.
-  return StringRef(A->getName()).compare(B->getName());
+  return StringRef(A->getName()) < B->getName();
 }
 
 std::string CodeGenRegisterClass::getQualifiedName() const {
@@ -850,55 +839,56 @@ std::string CodeGenRegisterClass::getQualifiedName() const {
 // Compute sub-classes of all register classes.
 // Assume the classes are ordered topologically.
 void CodeGenRegisterClass::computeSubClasses(CodeGenRegBank &RegBank) {
-  ArrayRef<CodeGenRegisterClass*> RegClasses = RegBank.getRegClasses();
+  auto &RegClasses = RegBank.getRegClasses();
 
   // Visit backwards so sub-classes are seen first.
-  for (unsigned rci = RegClasses.size(); rci; --rci) {
-    CodeGenRegisterClass &RC = *RegClasses[rci - 1];
+  for (auto I = RegClasses.rbegin(), E = RegClasses.rend(); I != E; ++I) {
+    CodeGenRegisterClass &RC = *I;
     RC.SubClasses.resize(RegClasses.size());
     RC.SubClasses.set(RC.EnumValue);
 
     // Normally, all subclasses have IDs >= rci, unless RC is part of a clique.
-    for (unsigned s = rci; s != RegClasses.size(); ++s) {
-      if (RC.SubClasses.test(s))
+    for (auto I2 = I.base(), E2 = RegClasses.end(); I2 != E2; ++I2) {
+      CodeGenRegisterClass &SubRC = *I2;
+      if (RC.SubClasses.test(SubRC.EnumValue))
         continue;
-      CodeGenRegisterClass *SubRC = RegClasses[s];
-      if (!testSubClass(&RC, SubRC))
+      if (!testSubClass(&RC, &SubRC))
         continue;
       // SubRC is a sub-class. Grap all its sub-classes so we won't have to
       // check them again.
-      RC.SubClasses |= SubRC->SubClasses;
+      RC.SubClasses |= SubRC.SubClasses;
     }
 
     // Sweep up missed clique members.  They will be immediately preceding RC.
-    for (unsigned s = rci - 1; s && testSubClass(&RC, RegClasses[s - 1]); --s)
-      RC.SubClasses.set(s - 1);
+    for (auto I2 = std::next(I); I2 != E && testSubClass(&RC, &*I2); ++I2)
+      RC.SubClasses.set(I2->EnumValue);
   }
 
   // Compute the SuperClasses lists from the SubClasses vectors.
-  for (unsigned rci = 0; rci != RegClasses.size(); ++rci) {
-    const BitVector &SC = RegClasses[rci]->getSubClasses();
-    for (int s = SC.find_first(); s >= 0; s = SC.find_next(s)) {
-      if (unsigned(s) == rci)
+  for (auto &RC : RegClasses) {
+    const BitVector &SC = RC.getSubClasses();
+    auto I = RegClasses.begin();
+    for (int s = 0, next_s = SC.find_first(); next_s != -1;
+         next_s = SC.find_next(s)) {
+      std::advance(I, next_s - s);
+      s = next_s;
+      if (&*I == &RC)
         continue;
-      RegClasses[s]->SuperClasses.push_back(RegClasses[rci]);
+      I->SuperClasses.push_back(&RC);
     }
   }
 
   // With the class hierarchy in place, let synthesized register classes inherit
   // properties from their closest super-class. The iteration order here can
   // propagate properties down multiple levels.
-  for (unsigned rci = 0; rci != RegClasses.size(); ++rci)
-    if (!RegClasses[rci]->getDef())
-      RegClasses[rci]->inheritProperties(RegBank);
+  for (auto &RC : RegClasses)
+    if (!RC.getDef())
+      RC.inheritProperties(RegBank);
 }
 
-void
-CodeGenRegisterClass::getSuperRegClasses(CodeGenSubRegIndex *SubIdx,
-                                         BitVector &Out) const {
-  DenseMap<CodeGenSubRegIndex*,
-           SmallPtrSet<CodeGenRegisterClass*, 8> >::const_iterator
-    FindI = SuperRegClasses.find(SubIdx);
+void CodeGenRegisterClass::getSuperRegClasses(const CodeGenSubRegIndex *SubIdx,
+                                              BitVector &Out) const {
+  auto FindI = SuperRegClasses.find(SubIdx);
   if (FindI == SuperRegClasses.end())
     return;
   for (CodeGenRegisterClass *RC : FindI->second)
@@ -933,13 +923,12 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records) {
   for (unsigned i = 0, e = SRIs.size(); i != e; ++i)
     getSubRegIdx(SRIs[i]);
   // Build composite maps from ComposedOf fields.
-  for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i)
-    SubRegIndices[i]->updateComponents(*this);
+  for (auto &Idx : SubRegIndices)
+    Idx.updateComponents(*this);
 
   // Read in the register definitions.
   std::vector<Record*> Regs = Records.getAllDerivedDefinitions("Register");
   std::sort(Regs.begin(), Regs.end(), LessRecordRegister());
-  Registers.reserve(Regs.size());
   // Assign the enumeration values.
   for (unsigned i = 0, e = Regs.size(); i != e; ++i)
     getReg(Regs[i]);
@@ -948,45 +937,41 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records) {
   std::vector<Record*> Tups =
     Records.getAllDerivedDefinitions("RegisterTuples");
 
-  std::vector<Record*> TupRegsCopy;
-  for (unsigned i = 0, e = Tups.size(); i != e; ++i) {
-    const std::vector<Record*> *TupRegs = Sets.expand(Tups[i]);
-    TupRegsCopy.reserve(TupRegs->size());
-    TupRegsCopy.assign(TupRegs->begin(), TupRegs->end());
-    std::sort(TupRegsCopy.begin(), TupRegsCopy.end(), LessRecordRegister());
-    for (unsigned j = 0, je = TupRegsCopy.size(); j != je; ++j)
-      getReg((TupRegsCopy)[j]);
-    TupRegsCopy.clear();
+  for (Record *R : Tups) {
+    std::vector<Record *> TupRegs = *Sets.expand(R);
+    std::sort(TupRegs.begin(), TupRegs.end(), LessRecordRegister());
+    for (Record *RC : TupRegs)
+      getReg(RC);
   }
 
   // Now all the registers are known. Build the object graph of explicit
   // register-register references.
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i)
-    Registers[i]->buildObjectGraph(*this);
+  for (auto &Reg : Registers)
+    Reg.buildObjectGraph(*this);
 
   // Compute register name map.
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i)
+  for (auto &Reg : Registers)
     // FIXME: This could just be RegistersByName[name] = register, except that
     // causes some failures in MIPS - perhaps they have duplicate register name
     // entries? (or maybe there's a reason for it - I don't know much about this
     // code, just drive-by refactoring)
-    RegistersByName.insert(std::make_pair(
-        Registers[i]->TheDef->getValueAsString("AsmName"), Registers[i]));
+    RegistersByName.insert(
+        std::make_pair(Reg.TheDef->getValueAsString("AsmName"), &Reg));
 
   // Precompute all sub-register maps.
   // This will create Composite entries for all inferred sub-register indices.
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i)
-    Registers[i]->computeSubRegs(*this);
+  for (auto &Reg : Registers)
+    Reg.computeSubRegs(*this);
 
   // Infer even more sub-registers by combining leading super-registers.
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i)
-    if (Registers[i]->CoveredBySubRegs)
-      Registers[i]->computeSecondarySubRegs(*this);
+  for (auto &Reg : Registers)
+    if (Reg.CoveredBySubRegs)
+      Reg.computeSecondarySubRegs(*this);
 
   // After the sub-register graph is complete, compute the topologically
   // ordered SuperRegs list.
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i)
-    Registers[i]->computeSuperRegs(*this);
+  for (auto &Reg : Registers)
+    Reg.computeSuperRegs(*this);
 
   // Native register units are associated with a leaf register. They've all been
   // discovered now.
@@ -998,35 +983,35 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records) {
     PrintFatalError("No 'RegisterClass' subclasses defined!");
 
   // Allocate user-defined register classes.
-  RegClasses.reserve(RCs.size());
-  for (unsigned i = 0, e = RCs.size(); i != e; ++i)
-    addToMaps(new CodeGenRegisterClass(*this, RCs[i]));
+  for (auto *RC : RCs) {
+    RegClasses.push_back(CodeGenRegisterClass(*this, RC));
+    addToMaps(&RegClasses.back());
+  }
 
   // Infer missing classes to create a full algebra.
   computeInferredRegisterClasses();
 
   // Order register classes topologically and assign enum values.
-  array_pod_sort(RegClasses.begin(), RegClasses.end(), TopoOrderRC);
-  for (unsigned i = 0, e = RegClasses.size(); i != e; ++i)
-    RegClasses[i]->EnumValue = i;
+  RegClasses.sort(TopoOrderRC);
+  unsigned i = 0;
+  for (auto &RC : RegClasses)
+    RC.EnumValue = i++;
   CodeGenRegisterClass::computeSubClasses(*this);
 }
 
 // Create a synthetic CodeGenSubRegIndex without a corresponding Record.
 CodeGenSubRegIndex*
 CodeGenRegBank::createSubRegIndex(StringRef Name, StringRef Namespace) {
-  CodeGenSubRegIndex *Idx = new CodeGenSubRegIndex(Name, Namespace,
-                                                   SubRegIndices.size() + 1);
-  SubRegIndices.push_back(Idx);
-  return Idx;
+  SubRegIndices.emplace_back(Name, Namespace, SubRegIndices.size() + 1);
+  return &SubRegIndices.back();
 }
 
 CodeGenSubRegIndex *CodeGenRegBank::getSubRegIdx(Record *Def) {
   CodeGenSubRegIndex *&Idx = Def2SubRegIdx[Def];
   if (Idx)
     return Idx;
-  Idx = new CodeGenSubRegIndex(Def, SubRegIndices.size() + 1);
-  SubRegIndices.push_back(Idx);
+  SubRegIndices.emplace_back(Def, SubRegIndices.size() + 1);
+  Idx = &SubRegIndices.back();
   return Idx;
 }
 
@@ -1034,14 +1019,12 @@ CodeGenRegister *CodeGenRegBank::getReg(Record *Def) {
   CodeGenRegister *&Reg = Def2Reg[Def];
   if (Reg)
     return Reg;
-  Reg = new CodeGenRegister(Def, Registers.size() + 1);
-  Registers.push_back(Reg);
+  Registers.emplace_back(Def, Registers.size() + 1);
+  Reg = &Registers.back();
   return Reg;
 }
 
 void CodeGenRegBank::addToMaps(CodeGenRegisterClass *RC) {
-  RegClasses.push_back(RC);
-
   if (Record *Def = RC->getDef())
     Def2RC.insert(std::make_pair(Def, RC));
 
@@ -1054,7 +1037,7 @@ void CodeGenRegBank::addToMaps(CodeGenRegisterClass *RC) {
 // Create a synthetic sub-class if it is missing.
 CodeGenRegisterClass*
 CodeGenRegBank::getOrCreateSubClass(const CodeGenRegisterClass *RC,
-                                    const CodeGenRegister::Set *Members,
+                                    const CodeGenRegister::Vec *Members,
                                     StringRef Name) {
   // Synthetic sub-class has the same size and alignment as RC.
   CodeGenRegisterClass::Key K(Members, RC->SpillSize, RC->SpillAlignment);
@@ -1063,9 +1046,9 @@ CodeGenRegBank::getOrCreateSubClass(const CodeGenRegisterClass *RC,
     return FoundI->second;
 
   // Sub-class doesn't exist, create a new one.
-  CodeGenRegisterClass *NewRC = new CodeGenRegisterClass(*this, Name, K);
-  addToMaps(NewRC);
-  return NewRC;
+  RegClasses.push_back(CodeGenRegisterClass(*this, Name, K));
+  addToMaps(&RegClasses.back());
+  return &RegClasses.back();
 }
 
 CodeGenRegisterClass *CodeGenRegBank::getRegClass(Record *Def) {
@@ -1126,21 +1109,19 @@ void CodeGenRegBank::computeComposites() {
   // and many registers will share TopoSigs on regular architectures.
   BitVector TopoSigs(getNumTopoSigs());
 
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
-    CodeGenRegister *Reg1 = Registers[i];
-
+  for (const auto &Reg1 : Registers) {
     // Skip identical subreg structures already processed.
-    if (TopoSigs.test(Reg1->getTopoSig()))
+    if (TopoSigs.test(Reg1.getTopoSig()))
       continue;
-    TopoSigs.set(Reg1->getTopoSig());
+    TopoSigs.set(Reg1.getTopoSig());
 
-    const CodeGenRegister::SubRegMap &SRM1 = Reg1->getSubRegs();
+    const CodeGenRegister::SubRegMap &SRM1 = Reg1.getSubRegs();
     for (CodeGenRegister::SubRegMap::const_iterator i1 = SRM1.begin(),
          e1 = SRM1.end(); i1 != e1; ++i1) {
       CodeGenSubRegIndex *Idx1 = i1->first;
       CodeGenRegister *Reg2 = i1->second;
       // Ignore identity compositions.
-      if (Reg1 == Reg2)
+      if (&Reg1 == Reg2)
         continue;
       const CodeGenRegister::SubRegMap &SRM2 = Reg2->getSubRegs();
       // Try composing Idx1 with another SubRegIndex.
@@ -1152,7 +1133,7 @@ void CodeGenRegBank::computeComposites() {
         if (Reg2 == Reg3)
           continue;
         // OK Reg1:IdxPair == Reg3. Find the index with Reg:Idx == Reg3.
-        CodeGenSubRegIndex *Idx3 = Reg1->getSubRegIndex(Reg3);
+        CodeGenSubRegIndex *Idx3 = Reg1.getSubRegIndex(Reg3);
         assert(Idx3 && "Sub-register doesn't have an index");
 
         // Conflicting composition? Emit a warning but allow it.
@@ -1173,15 +1154,14 @@ void CodeGenRegBank::computeComposites() {
 //
 // Conservatively share a lane mask bit if two sub-register indices overlap in
 // some registers, but not in others. That shouldn't happen a lot.
-void CodeGenRegBank::computeSubRegIndexLaneMasks() {
+void CodeGenRegBank::computeSubRegLaneMasks() {
   // First assign individual bits to all the leaf indices.
   unsigned Bit = 0;
   // Determine mask of lanes that cover their registers.
   CoveringLanes = ~0u;
-  for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
-    CodeGenSubRegIndex *Idx = SubRegIndices[i];
-    if (Idx->getComposites().empty()) {
-      Idx->LaneMask = 1u << Bit;
+  for (auto &Idx : SubRegIndices) {
+    if (Idx.getComposites().empty()) {
+      Idx.LaneMask = 1u << Bit;
       // Share bit 31 in the unlikely case there are more than 32 leafs.
       //
       // Sharing bits is harmless; it allows graceful degradation in targets
@@ -1196,7 +1176,71 @@ void CodeGenRegBank::computeSubRegIndexLaneMasks() {
         // is no longer covering its registers.
         CoveringLanes &= ~(1u << Bit);
     } else {
-      Idx->LaneMask = 0;
+      Idx.LaneMask = 0;
+    }
+  }
+
+  // Compute transformation sequences for composeSubRegIndexLaneMask. The idea
+  // here is that for each possible target subregister we look at the leafs
+  // in the subregister graph that compose for this target and create
+  // transformation sequences for the lanemasks. Each step in the sequence
+  // consists of a bitmask and a bitrotate operation. As the rotation amounts
+  // are usually the same for many subregisters we can easily combine the steps
+  // by combining the masks.
+  for (const auto &Idx : SubRegIndices) {
+    const auto &Composites = Idx.getComposites();
+    auto &LaneTransforms = Idx.CompositionLaneMaskTransform;
+    // Go through all leaf subregisters and find the ones that compose with Idx.
+    // These make out all possible valid bits in the lane mask we want to
+    // transform. Looking only at the leafs ensure that only a single bit in
+    // the mask is set.
+    unsigned NextBit = 0;
+    for (auto &Idx2 : SubRegIndices) {
+      // Skip non-leaf subregisters.
+      if (!Idx2.getComposites().empty())
+        continue;
+      // Replicate the behaviour from the lane mask generation loop above.
+      unsigned SrcBit = NextBit;
+      unsigned SrcMask = 1u << SrcBit;
+      if (NextBit < 31)
+        ++NextBit;
+      assert(Idx2.LaneMask == SrcMask);
+
+      // Get the composed subregister if there is any.
+      auto C = Composites.find(&Idx2);
+      if (C == Composites.end())
+        continue;
+      const CodeGenSubRegIndex *Composite = C->second;
+      // The Composed subreg should be a leaf subreg too
+      assert(Composite->getComposites().empty());
+
+      // Create Mask+Rotate operation and merge with existing ops if possible.
+      unsigned DstBit = Log2_32(Composite->LaneMask);
+      int Shift = DstBit - SrcBit;
+      uint8_t RotateLeft = Shift >= 0 ? (uint8_t)Shift : 32+Shift;
+      for (auto &I : LaneTransforms) {
+        if (I.RotateLeft == RotateLeft) {
+          I.Mask |= SrcMask;
+          SrcMask = 0;
+        }
+      }
+      if (SrcMask != 0) {
+        MaskRolPair MaskRol = { SrcMask, RotateLeft };
+        LaneTransforms.push_back(MaskRol);
+      }
+    }
+    // Optimize if the transformation consists of one step only: Set mask to
+    // 0xffffffff (including some irrelevant invalid bits) so that it should
+    // merge with more entries later while compressing the table.
+    if (LaneTransforms.size() == 1)
+      LaneTransforms[0].Mask = ~0u;
+
+    // Further compression optimization: For invalid compositions resulting
+    // in a sequence with 0 entries we can just pick any other. Choose
+    // Mask 0xffffffff with Rotation 0.
+    if (LaneTransforms.size() == 0) {
+      MaskRolPair P = { ~0u, 0 };
+      LaneTransforms.push_back(P);
     }
   }
 
@@ -1204,13 +1248,24 @@ void CodeGenRegBank::computeSubRegIndexLaneMasks() {
   // by the sub-register graph? This doesn't occur in any known targets.
 
   // Inherit lanes from composites.
-  for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
-    unsigned Mask = SubRegIndices[i]->computeLaneMask();
+  for (const auto &Idx : SubRegIndices) {
+    unsigned Mask = Idx.computeLaneMask();
     // If some super-registers without CoveredBySubRegs use this index, we can
     // no longer assume that the lanes are covering their registers.
-    if (!SubRegIndices[i]->AllSuperRegsCovered)
+    if (!Idx.AllSuperRegsCovered)
       CoveringLanes &= ~Mask;
   }
+
+  // Compute lane mask combinations for register classes.
+  for (auto &RegClass : RegClasses) {
+    unsigned LaneMask = 0;
+    for (const auto &SubRegIndex : SubRegIndices) {
+      if (RegClass.getSubClassWithSubReg(&SubRegIndex) != &RegClass)
+        continue;
+      LaneMask |= SubRegIndex.LaneMask;
+    }
+    RegClass.LaneMask = LaneMask;
+  }
 }
 
 namespace {
@@ -1231,7 +1286,7 @@ namespace {
 // for which the unit weight equals the set weight. These units should not have
 // their weight increased.
 struct UberRegSet {
-  CodeGenRegister::Set Regs;
+  CodeGenRegister::Vec Regs;
   unsigned Weight;
   CodeGenRegister::RegUnitList SingularDeterminants;
 
@@ -1247,22 +1302,20 @@ static void computeUberSets(std::vector<UberRegSet> &UberSets,
                             std::vector<UberRegSet*> &RegSets,
                             CodeGenRegBank &RegBank) {
 
-  const std::vector<CodeGenRegister*> &Registers = RegBank.getRegisters();
+  const auto &Registers = RegBank.getRegisters();
 
   // The Register EnumValue is one greater than its index into Registers.
-  assert(Registers.size() == Registers[Registers.size()-1]->EnumValue &&
+  assert(Registers.size() == Registers.back().EnumValue &&
          "register enum value mismatch");
 
   // For simplicitly make the SetID the same as EnumValue.
   IntEqClasses UberSetIDs(Registers.size()+1);
   std::set<unsigned> AllocatableRegs;
-  for (unsigned i = 0, e = RegBank.getRegClasses().size(); i != e; ++i) {
-
-    CodeGenRegisterClass *RegClass = RegBank.getRegClasses()[i];
-    if (!RegClass->Allocatable)
+  for (auto &RegClass : RegBank.getRegClasses()) {
+    if (!RegClass.Allocatable)
       continue;
 
-    const CodeGenRegister::Set &Regs = RegClass->getMembers();
+    const CodeGenRegister::Vec &Regs = RegClass.getMembers();
     if (Regs.empty())
       continue;
 
@@ -1270,15 +1323,14 @@ static void computeUberSets(std::vector<UberRegSet> &UberSets,
     assert(USetID && "register number 0 is invalid");
 
     AllocatableRegs.insert((*Regs.begin())->EnumValue);
-    for (CodeGenRegister::Set::const_iterator I = std::next(Regs.begin()),
-           E = Regs.end(); I != E; ++I) {
+    for (auto I = std::next(Regs.begin()), E = Regs.end(); I != E; ++I) {
       AllocatableRegs.insert((*I)->EnumValue);
       UberSetIDs.join(USetID, (*I)->EnumValue);
     }
   }
   // Combine non-allocatable regs.
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
-    unsigned RegNum = Registers[i]->EnumValue;
+  for (const auto &Reg : Registers) {
+    unsigned RegNum = Reg.EnumValue;
     if (AllocatableRegs.count(RegNum))
       continue;
 
@@ -1292,17 +1344,18 @@ static void computeUberSets(std::vector<UberRegSet> &UberSets,
   // Insert Registers into the UberSets formed by union-find.
   // Do not resize after this.
   UberSets.resize(UberSetIDs.getNumClasses());
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
-    const CodeGenRegister *Reg = Registers[i];
-    unsigned USetID = UberSetIDs[Reg->EnumValue];
+  unsigned i = 0;
+  for (const CodeGenRegister &Reg : Registers) {
+    unsigned USetID = UberSetIDs[Reg.EnumValue];
     if (!USetID)
       USetID = ZeroID;
     else if (USetID == ZeroID)
       USetID = 0;
 
     UberRegSet *USet = &UberSets[USetID];
-    USet->Regs.insert(Reg);
-    RegSets[i] = USet;
+    USet->Regs.push_back(&Reg);
+    sortAndUniqueRegisters(USet->Regs);
+    RegSets[i++] = USet;
   }
 }
 
@@ -1335,22 +1388,18 @@ static void computeUberWeights(std::vector<UberRegSet> &UberSets,
     if (I->Weight != MaxWeight) {
       DEBUG(
         dbgs() << "UberSet " << I - UberSets.begin() << " Weight " << MaxWeight;
-        for (CodeGenRegister::Set::iterator
-               UnitI = I->Regs.begin(), UnitE = I->Regs.end();
-             UnitI != UnitE; ++UnitI) {
-          dbgs() << " " << (*UnitI)->getName();
-        }
+        for (auto &Unit : I->Regs)
+          dbgs() << " " << Unit->getName();
         dbgs() << "\n");
       // Update the set weight.
       I->Weight = MaxWeight;
     }
 
     // Find singular determinants.
-    for (CodeGenRegister::Set::iterator RegI = I->Regs.begin(),
-           RegE = I->Regs.end(); RegI != RegE; ++RegI) {
-      if ((*RegI)->getRegUnits().size() == 1
-          && (*RegI)->getWeight(RegBank) == I->Weight)
-        mergeRegUnits(I->SingularDeterminants, (*RegI)->getRegUnits());
+    for (const auto R : I->Regs) {
+      if (R->getRegUnits().count() == 1 && R->getWeight(RegBank) == I->Weight) {
+        I->SingularDeterminants |= R->getRegUnits();
+      }
     }
   }
 }
@@ -1368,13 +1417,14 @@ static void computeUberWeights(std::vector<UberRegSet> &UberSets,
 static bool normalizeWeight(CodeGenRegister *Reg,
                             std::vector<UberRegSet> &UberSets,
                             std::vector<UberRegSet*> &RegSets,
-                            std::set<unsigned> &NormalRegs,
+                            SparseBitVector<> &NormalRegs,
                             CodeGenRegister::RegUnitList &NormalUnits,
                             CodeGenRegBank &RegBank) {
-  bool Changed = false;
-  if (!NormalRegs.insert(Reg->EnumValue).second)
-    return Changed;
+  if (NormalRegs.test(Reg->EnumValue))
+    return false;
+  NormalRegs.set(Reg->EnumValue);
 
+  bool Changed = false;
   const CodeGenRegister::SubRegMap &SRM = Reg->getSubRegs();
   for (CodeGenRegister::SubRegMap::const_iterator SRI = SRM.begin(),
          SRE = SRM.end(); SRI != SRE; ++SRI) {
@@ -1398,8 +1448,8 @@ static bool normalizeWeight(CodeGenRegister *Reg,
     // A register unit's weight can be adjusted only if it is the singular unit
     // for this register, has not been used to normalize a subregister's set,
     // and has not already been used to singularly determine this UberRegSet.
-    unsigned AdjustUnit = Reg->getRegUnits().front();
-    if (Reg->getRegUnits().size() != 1
+    unsigned AdjustUnit = *Reg->getRegUnits().begin();
+    if (Reg->getRegUnits().count() != 1
         || hasRegUnit(NormalUnits, AdjustUnit)
         || hasRegUnit(UberSet->SingularDeterminants, AdjustUnit)) {
       // We don't have an adjustable unit, so adopt a new one.
@@ -1417,7 +1467,7 @@ static bool normalizeWeight(CodeGenRegister *Reg,
   }
 
   // Mark these units normalized so superregisters can't change their weights.
-  mergeRegUnits(NormalUnits, Reg->getRegUnits());
+  NormalUnits |= Reg->getRegUnits();
 
   return Changed;
 }
@@ -1440,11 +1490,11 @@ void CodeGenRegBank::computeRegUnitWeights() {
   for (bool Changed = true; Changed; ++NumIters) {
     assert(NumIters <= NumNativeRegUnits && "Runaway register unit weights");
     Changed = false;
-    for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
+    for (auto &Reg : Registers) {
       CodeGenRegister::RegUnitList NormalUnits;
-      std::set<unsigned> NormalRegs;
-      Changed |= normalizeWeight(Registers[i], UberSets, RegSets,
-                                 NormalRegs, NormalUnits, *this);
+      SparseBitVector<> NormalRegs;
+      Changed |= normalizeWeight(&Reg, UberSets, RegSets, NormalRegs,
+                                 NormalUnits, *this);
     }
   }
 }
@@ -1535,18 +1585,17 @@ void CodeGenRegBank::computeRegUnitSets() {
   assert(RegUnitSets.empty() && "dirty RegUnitSets");
 
   // Compute a unique RegUnitSet for each RegClass.
-  ArrayRef<CodeGenRegisterClass*> RegClasses = getRegClasses();
-  unsigned NumRegClasses = RegClasses.size();
-  for (unsigned RCIdx = 0, RCEnd = NumRegClasses; RCIdx != RCEnd; ++RCIdx) {
-    if (!RegClasses[RCIdx]->Allocatable)
+  auto &RegClasses = getRegClasses();
+  for (auto &RC : RegClasses) {
+    if (!RC.Allocatable)
       continue;
 
     // Speculatively grow the RegUnitSets to hold the new set.
     RegUnitSets.resize(RegUnitSets.size() + 1);
-    RegUnitSets.back().Name = RegClasses[RCIdx]->getName();
+    RegUnitSets.back().Name = RC.getName();
 
     // Compute a sorted list of units in this class.
-    RegClasses[RCIdx]->buildRegUnitSet(RegUnitSets.back().Units);
+    RC.buildRegUnitSet(RegUnitSets.back().Units);
 
     // Find an existing RegUnitSet.
     std::vector<RegUnitSet>::const_iterator SetI =
@@ -1560,9 +1609,8 @@ void CodeGenRegBank::computeRegUnitSets() {
              USIdx < USEnd; ++USIdx) {
           dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name
                  << ":";
-          ArrayRef<unsigned> Units = RegUnitSets[USIdx].Units;
-          for (unsigned i = 0, e = Units.size(); i < e; ++i)
-            dbgs() << " " << RegUnits[Units[i]].Roots[0]->getName();
+          for (auto &U : RegUnitSets[USIdx].Units)
+            dbgs() << " " << RegUnits[U].Roots[0]->getName();
           dbgs() << "\n";
         });
 
@@ -1574,9 +1622,8 @@ void CodeGenRegBank::computeRegUnitSets() {
              USIdx < USEnd; ++USIdx) {
           dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name
                  << ":";
-          ArrayRef<unsigned> Units = RegUnitSets[USIdx].Units;
-          for (unsigned i = 0, e = Units.size(); i < e; ++i)
-            dbgs() << " " << RegUnits[Units[i]].Roots[0]->getName();
+          for (auto &U : RegUnitSets[USIdx].Units)
+            dbgs() << " " << RegUnits[U].Roots[0]->getName();
           dbgs() << "\n";
         }
         dbgs() << "\nUnion sets:\n");
@@ -1621,9 +1668,8 @@ void CodeGenRegBank::computeRegUnitSets() {
       else {
         DEBUG(dbgs() << "UnitSet " << RegUnitSets.size()-1
               << " " << RegUnitSets.back().Name << ":";
-              ArrayRef<unsigned> Units = RegUnitSets.back().Units;
-              for (unsigned i = 0, e = Units.size(); i < e; ++i)
-                dbgs() << " " << RegUnits[Units[i]].Roots[0]->getName();
+              for (auto &U : RegUnitSets.back().Units)
+                dbgs() << " " << RegUnits[U].Roots[0]->getName();
               dbgs() << "\n";);
       }
     }
@@ -1637,29 +1683,30 @@ void CodeGenRegBank::computeRegUnitSets() {
              USIdx < USEnd; ++USIdx) {
           dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name
                  << ":";
-          ArrayRef<unsigned> Units = RegUnitSets[USIdx].Units;
-          for (unsigned i = 0, e = Units.size(); i < e; ++i)
-            dbgs() << " " << RegUnits[Units[i]].Roots[0]->getName();
+          for (auto &U : RegUnitSets[USIdx].Units)
+            dbgs() << " " << RegUnits[U].Roots[0]->getName();
           dbgs() << "\n";
         });
 
   // For each register class, list the UnitSets that are supersets.
-  RegClassUnitSets.resize(NumRegClasses);
-  for (unsigned RCIdx = 0, RCEnd = NumRegClasses; RCIdx != RCEnd; ++RCIdx) {
-    if (!RegClasses[RCIdx]->Allocatable)
+  RegClassUnitSets.resize(RegClasses.size());
+  int RCIdx = -1;
+  for (auto &RC : RegClasses) {
+    ++RCIdx;
+    if (!RC.Allocatable)
       continue;
 
     // Recompute the sorted list of units in this class.
     std::vector<unsigned> RCRegUnits;
-    RegClasses[RCIdx]->buildRegUnitSet(RCRegUnits);
+    RC.buildRegUnitSet(RCRegUnits);
 
     // Don't increase pressure for unallocatable regclasses.
     if (RCRegUnits.empty())
       continue;
 
-    DEBUG(dbgs() << "RC " << RegClasses[RCIdx]->getName() << " Units: \n";
-          for (unsigned i = 0, e = RCRegUnits.size(); i < e; ++i)
-            dbgs() << RegUnits[RCRegUnits[i]].getRoots()[0]->getName() << " ";
+    DEBUG(dbgs() << "RC " << RC.getName() << " Units: \n";
+          for (auto &U : RCRegUnits)
+            dbgs() << RegUnits[U].getRoots()[0]->getName() << " ";
           dbgs() << "\n  UnitSetIDs:");
 
     // Find all supersets.
@@ -1704,9 +1751,46 @@ void CodeGenRegBank::computeRegUnitSets() {
   }
 }
 
+void CodeGenRegBank::computeRegUnitLaneMasks() {
+  for (auto &Register : Registers) {
+    // Create an initial lane mask for all register units.
+    const auto &RegUnits = Register.getRegUnits();
+    CodeGenRegister::RegUnitLaneMaskList RegUnitLaneMasks(RegUnits.count(), 0);
+    // Iterate through SubRegisters.
+    typedef CodeGenRegister::SubRegMap SubRegMap;
+    const SubRegMap &SubRegs = Register.getSubRegs();
+    for (SubRegMap::const_iterator S = SubRegs.begin(),
+         SE = SubRegs.end(); S != SE; ++S) {
+      CodeGenRegister *SubReg = S->second;
+      // Ignore non-leaf subregisters, their lane masks are fully covered by
+      // the leaf subregisters anyway.
+      if (SubReg->getSubRegs().size() != 0)
+        continue;
+      CodeGenSubRegIndex *SubRegIndex = S->first;
+      const CodeGenRegister *SubRegister = S->second;
+      unsigned LaneMask = SubRegIndex->LaneMask;
+      // Distribute LaneMask to Register Units touched.
+      for (const auto &SUI : SubRegister->getRegUnits()) {
+        bool Found = false;
+        unsigned u = 0;
+        for (unsigned RU : RegUnits) {
+          if (SUI == RU) {
+            RegUnitLaneMasks[u] |= LaneMask;
+            assert(!Found);
+            Found = true;
+          }
+          ++u;
+        }
+        assert(Found);
+      }
+    }
+    Register.setRegUnitLaneMasks(RegUnitLaneMasks);
+  }
+}
+
 void CodeGenRegBank::computeDerivedInfo() {
   computeComposites();
-  computeSubRegIndexLaneMasks();
+  computeSubRegLaneMasks();
 
   // Compute a weight for each register unit created during getSubRegs.
   // This may create adopted register units (with unit # >= NumNativeRegUnits).
@@ -1716,6 +1800,8 @@ void CodeGenRegBank::computeDerivedInfo() {
   // supersets for the union of overlapping sets.
   computeRegUnitSets();
 
+  computeRegUnitLaneMasks();
+
   // Get the weight of each set.
   for (unsigned Idx = 0, EndIdx = RegUnitSets.size(); Idx != EndIdx; ++Idx)
     RegUnitSets[Idx].Weight = getRegUnitSetWeight(RegUnitSets[Idx].Units);
@@ -1742,20 +1828,23 @@ void CodeGenRegBank::computeDerivedInfo() {
 // returns a maximal register class for all X.
 //
 void CodeGenRegBank::inferCommonSubClass(CodeGenRegisterClass *RC) {
-  for (unsigned rci = 0, rce = RegClasses.size(); rci != rce; ++rci) {
+  assert(!RegClasses.empty());
+  // Stash the iterator to the last element so that this loop doesn't visit
+  // elements added by the getOrCreateSubClass call within it.
+  for (auto I = RegClasses.begin(), E = std::prev(RegClasses.end());
+       I != std::next(E); ++I) {
     CodeGenRegisterClass *RC1 = RC;
-    CodeGenRegisterClass *RC2 = RegClasses[rci];
+    CodeGenRegisterClass *RC2 = &*I;
     if (RC1 == RC2)
       continue;
 
     // Compute the set intersection of RC1 and RC2.
-    const CodeGenRegister::Set &Memb1 = RC1->getMembers();
-    const CodeGenRegister::Set &Memb2 = RC2->getMembers();
-    CodeGenRegister::Set Intersection;
-    std::set_intersection(Memb1.begin(), Memb1.end(),
-                          Memb2.begin(), Memb2.end(),
-                          std::inserter(Intersection, Intersection.begin()),
-                          CodeGenRegister::Less());
+    const CodeGenRegister::Vec &Memb1 = RC1->getMembers();
+    const CodeGenRegister::Vec &Memb2 = RC2->getMembers();
+    CodeGenRegister::Vec Intersection;
+    std::set_intersection(
+        Memb1.begin(), Memb1.end(), Memb2.begin(), Memb2.end(),
+        std::inserter(Intersection, Intersection.begin()), deref<llvm::less>());
 
     // Skip disjoint class pairs.
     if (Intersection.empty())
@@ -1781,37 +1870,38 @@ void CodeGenRegBank::inferCommonSubClass(CodeGenRegisterClass *RC) {
 //
 void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) {
   // Map SubRegIndex to set of registers in RC supporting that SubRegIndex.
-  typedef std::map<CodeGenSubRegIndex*, CodeGenRegister::Set,
-                   CodeGenSubRegIndex::Less> SubReg2SetMap;
+  typedef std::map<const CodeGenSubRegIndex *, CodeGenRegister::Vec,
+                   deref<llvm::less>> SubReg2SetMap;
 
   // Compute the set of registers supporting each SubRegIndex.
   SubReg2SetMap SRSets;
-  for (CodeGenRegister::Set::const_iterator RI = RC->getMembers().begin(),
-       RE = RC->getMembers().end(); RI != RE; ++RI) {
-    const CodeGenRegister::SubRegMap &SRM = (*RI)->getSubRegs();
+  for (const auto R : RC->getMembers()) {
+    const CodeGenRegister::SubRegMap &SRM = R->getSubRegs();
     for (CodeGenRegister::SubRegMap::const_iterator I = SRM.begin(),
          E = SRM.end(); I != E; ++I)
-      SRSets[I->first].insert(*RI);
+      SRSets[I->first].push_back(R);
   }
 
+  for (auto I : SRSets)
+    sortAndUniqueRegisters(I.second);
+
   // Find matching classes for all SRSets entries.  Iterate in SubRegIndex
   // numerical order to visit synthetic indices last.
-  for (unsigned sri = 0, sre = SubRegIndices.size(); sri != sre; ++sri) {
-    CodeGenSubRegIndex *SubIdx = SubRegIndices[sri];
-    SubReg2SetMap::const_iterator I = SRSets.find(SubIdx);
+  for (const auto &SubIdx : SubRegIndices) {
+    SubReg2SetMap::const_iterator I = SRSets.find(&SubIdx);
     // Unsupported SubRegIndex. Skip it.
     if (I == SRSets.end())
       continue;
     // In most cases, all RC registers support the SubRegIndex.
     if (I->second.size() == RC->getMembers().size()) {
-      RC->setSubClassWithSubReg(SubIdx, RC);
+      RC->setSubClassWithSubReg(&SubIdx, RC);
       continue;
     }
     // This is a real subset.  See if we have a matching class.
     CodeGenRegisterClass *SubRC =
       getOrCreateSubClass(RC, &I->second,
                           RC->getName() + "_with_" + I->first->getName());
-    RC->setSubClassWithSubReg(SubIdx, SubRC);
+    RC->setSubClassWithSubReg(&SubIdx, SubRC);
   }
 }
 
@@ -1823,27 +1913,24 @@ void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) {
 //
 
 void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC,
-                                                unsigned FirstSubRegRC) {
+                                                std::list<CodeGenRegisterClass>::iterator FirstSubRegRC) {
   SmallVector<std::pair<const CodeGenRegister*,
                         const CodeGenRegister*>, 16> SSPairs;
   BitVector TopoSigs(getNumTopoSigs());
 
   // Iterate in SubRegIndex numerical order to visit synthetic indices last.
-  for (unsigned sri = 0, sre = SubRegIndices.size(); sri != sre; ++sri) {
-    CodeGenSubRegIndex *SubIdx = SubRegIndices[sri];
+  for (auto &SubIdx : SubRegIndices) {
     // Skip indexes that aren't fully supported by RC's registers. This was
     // computed by inferSubClassWithSubReg() above which should have been
     // called first.
-    if (RC->getSubClassWithSubReg(SubIdx) != RC)
+    if (RC->getSubClassWithSubReg(&SubIdx) != RC)
       continue;
 
     // Build list of (Super, Sub) pairs for this SubIdx.
     SSPairs.clear();
     TopoSigs.reset();
-    for (CodeGenRegister::Set::const_iterator RI = RC->getMembers().begin(),
-         RE = RC->getMembers().end(); RI != RE; ++RI) {
-      const CodeGenRegister *Super = *RI;
-      const CodeGenRegister *Sub = Super->getSubRegs().find(SubIdx)->second;
+    for (const auto Super : RC->getMembers()) {
+      const CodeGenRegister *Sub = Super->getSubRegs().find(&SubIdx)->second;
       assert(Sub && "Missing sub-register");
       SSPairs.push_back(std::make_pair(Super, Sub));
       TopoSigs.set(Sub->getTopoSig());
@@ -1851,29 +1938,36 @@ void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC,
 
     // Iterate over sub-register class candidates.  Ignore classes created by
     // this loop. They will never be useful.
-    for (unsigned rci = FirstSubRegRC, rce = RegClasses.size(); rci != rce;
-         ++rci) {
-      CodeGenRegisterClass *SubRC = RegClasses[rci];
+    // Store an iterator to the last element (not end) so that this loop doesn't
+    // visit newly inserted elements.
+    assert(!RegClasses.empty());
+    for (auto I = FirstSubRegRC, E = std::prev(RegClasses.end());
+         I != std::next(E); ++I) {
+      CodeGenRegisterClass &SubRC = *I;
       // Topological shortcut: SubRC members have the wrong shape.
-      if (!TopoSigs.anyCommon(SubRC->getTopoSigs()))
+      if (!TopoSigs.anyCommon(SubRC.getTopoSigs()))
         continue;
       // Compute the subset of RC that maps into SubRC.
-      CodeGenRegister::Set SubSet;
+      CodeGenRegister::Vec SubSetVec;
       for (unsigned i = 0, e = SSPairs.size(); i != e; ++i)
-        if (SubRC->contains(SSPairs[i].second))
-          SubSet.insert(SSPairs[i].first);
-      if (SubSet.empty())
+        if (SubRC.contains(SSPairs[i].second))
+          SubSetVec.push_back(SSPairs[i].first);
+
+      if (SubSetVec.empty())
         continue;
+
       // RC injects completely into SubRC.
-      if (SubSet.size() == SSPairs.size()) {
-        SubRC->addSuperRegClass(SubIdx, RC);
+      sortAndUniqueRegisters(SubSetVec);
+      if (SubSetVec.size() == SSPairs.size()) {
+        SubRC.addSuperRegClass(&SubIdx, RC);
         continue;
       }
+
       // Only a subset of RC maps into SubRC. Make sure it is represented by a
       // class.
-      getOrCreateSubClass(RC, &SubSet, RC->getName() +
-                          "_with_" + SubIdx->getName() +
-                          "_in_" + SubRC->getName());
+      getOrCreateSubClass(RC, &SubSetVec, RC->getName() + "_with_" +
+                                          SubIdx.getName() + "_in_" +
+                                          SubRC.getName());
     }
   }
 }
@@ -1883,14 +1977,19 @@ void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC,
 // Infer missing register classes.
 //
 void CodeGenRegBank::computeInferredRegisterClasses() {
+  assert(!RegClasses.empty());
   // When this function is called, the register classes have not been sorted
   // and assigned EnumValues yet.  That means getSubClasses(),
   // getSuperClasses(), and hasSubClass() functions are defunct.
-  unsigned FirstNewRC = RegClasses.size();
+
+  // Use one-before-the-end so it doesn't move forward when new elements are
+  // added.
+  auto FirstNewRC = std::prev(RegClasses.end());
 
   // Visit all register classes, including the ones being added by the loop.
-  for (unsigned rci = 0; rci != RegClasses.size(); ++rci) {
-    CodeGenRegisterClass *RC = RegClasses[rci];
+  // Watch out for iterator invalidation here.
+  for (auto I = RegClasses.begin(), E = RegClasses.end(); I != E; ++I) {
+    CodeGenRegisterClass *RC = &*I;
 
     // Synthesize answers for getSubClassWithSubReg().
     inferSubClassWithSubReg(RC);
@@ -1907,10 +2006,11 @@ void CodeGenRegBank::computeInferredRegisterClasses() {
     // after inferMatchingSuperRegClass was called.  At this point,
     // inferMatchingSuperRegClass has checked SuperRC = [0..rci] with SubRC =
     // [0..FirstNewRC).  We need to cover SubRC = [FirstNewRC..rci].
-    if (rci + 1 == FirstNewRC) {
-      unsigned NextNewRC = RegClasses.size();
-      for (unsigned rci2 = 0; rci2 != FirstNewRC; ++rci2)
-        inferMatchingSuperRegClass(RegClasses[rci2], FirstNewRC);
+    if (I == FirstNewRC) {
+      auto NextNewRC = std::prev(RegClasses.end());
+      for (auto I2 = RegClasses.begin(), E2 = std::next(FirstNewRC); I2 != E2;
+           ++I2)
+        inferMatchingSuperRegClass(&*I2, E2);
       FirstNewRC = NextNewRC;
     }
   }
@@ -1924,10 +2024,8 @@ void CodeGenRegBank::computeInferredRegisterClasses() {
 const CodeGenRegisterClass*
 CodeGenRegBank::getRegClassForRegister(Record *R) {
   const CodeGenRegister *Reg = getReg(R);
-  ArrayRef<CodeGenRegisterClass*> RCs = getRegClasses();
   const CodeGenRegisterClass *FoundRC = nullptr;
-  for (unsigned i = 0, e = RCs.size(); i != e; ++i) {
-    const CodeGenRegisterClass &RC = *RCs[i];
+  for (const auto &RC : getRegClasses()) {
     if (!RC.contains(Reg))
       continue;
 
diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h
index c1e37fa..7c723d9 100644
--- a/utils/TableGen/CodeGenRegisters.h
+++ b/utils/TableGen/CodeGenRegisters.h
@@ -19,19 +19,36 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SparseBitVector.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/SetTheory.h"
 #include <cstdlib>
+#include <list>
 #include <map>
 #include <set>
 #include <string>
 #include <vector>
+#include <deque>
 
 namespace llvm {
   class CodeGenRegBank;
 
+  /// Used to encode a step in a register lane mask transformation.
+  /// Mask the bits specified in Mask, then rotate them Rol bits to the left
+  /// assuming a wraparound at 32bits.
+  struct MaskRolPair {
+    unsigned Mask;
+    uint8_t RotateLeft;
+    bool operator==(const MaskRolPair Other) {
+      return Mask == Other.Mask && RotateLeft == Other.RotateLeft;
+    }
+    bool operator!=(const MaskRolPair Other) {
+      return Mask != Other.Mask || RotateLeft != Other.RotateLeft;
+    }
+  };
+
   /// CodeGenSubRegIndex - Represents a sub-register index.
   class CodeGenSubRegIndex {
     Record *const TheDef;
@@ -42,7 +59,8 @@ namespace llvm {
     uint16_t Size;
     uint16_t Offset;
     const unsigned EnumValue;
-    unsigned LaneMask;
+    mutable unsigned LaneMask;
+    mutable SmallVector<MaskRolPair,1> CompositionLaneMaskTransform;
 
     // Are all super-registers containing this SubRegIndex covered by their
     // sub-registers?
@@ -55,17 +73,9 @@ namespace llvm {
     const std::string &getNamespace() const { return Namespace; }
     std::string getQualifiedName() const;
 
-    // Order CodeGenSubRegIndex pointers by EnumValue.
-    struct Less {
-      bool operator()(const CodeGenSubRegIndex *A,
-                      const CodeGenSubRegIndex *B) const {
-        assert(A && B);
-        return A->EnumValue < B->EnumValue;
-      }
-    };
-
     // Map of composite subreg indices.
-    typedef std::map<CodeGenSubRegIndex*, CodeGenSubRegIndex*, Less> CompMap;
+    typedef std::map<CodeGenSubRegIndex *, CodeGenSubRegIndex *,
+                     deref<llvm::less>> CompMap;
 
     // Returns the subreg index that results from composing this with Idx.
     // Returns NULL if this and Idx don't compose.
@@ -101,12 +111,17 @@ namespace llvm {
     const CompMap &getComposites() const { return Composed; }
 
     // Compute LaneMask from Composed. Return LaneMask.
-    unsigned computeLaneMask();
+    unsigned computeLaneMask() const;
 
   private:
     CompMap Composed;
   };
 
+  inline bool operator<(const CodeGenSubRegIndex &A,
+                        const CodeGenSubRegIndex &B) {
+    return A.EnumValue < B.EnumValue;
+  }
+
   /// CodeGenRegister - Represents a register definition.
   struct CodeGenRegister {
     Record *TheDef;
@@ -115,8 +130,8 @@ namespace llvm {
     bool CoveredBySubRegs;
 
     // Map SubRegIndex -> Register.
-    typedef std::map<CodeGenSubRegIndex*, CodeGenRegister*,
-                     CodeGenSubRegIndex::Less> SubRegMap;
+    typedef std::map<CodeGenSubRegIndex *, CodeGenRegister *, deref<llvm::less>>
+        SubRegMap;
 
     CodeGenRegister(Record *R, unsigned Enum);
 
@@ -180,18 +195,27 @@ namespace llvm {
     }
 
     // List of register units in ascending order.
-    typedef SmallVector<unsigned, 16> RegUnitList;
+    typedef SparseBitVector<> RegUnitList;
+    typedef SmallVector<unsigned, 16> RegUnitLaneMaskList;
 
     // How many entries in RegUnitList are native?
-    unsigned NumNativeRegUnits;
+    RegUnitList NativeRegUnits;
 
     // Get the list of register units.
     // This is only valid after computeSubRegs() completes.
     const RegUnitList &getRegUnits() const { return RegUnits; }
 
+    ArrayRef<unsigned> getRegUnitLaneMasks() const {
+      return makeArrayRef(RegUnitLaneMasks).slice(0, NativeRegUnits.count());
+    }
+
     // Get the native register units. This is a prefix of getRegUnits().
-    ArrayRef<unsigned> getNativeRegUnits() const {
-      return makeArrayRef(RegUnits).slice(0, NumNativeRegUnits);
+    RegUnitList getNativeRegUnits() const {
+      return NativeRegUnits;
+    }
+
+    void setRegUnitLaneMasks(const RegUnitLaneMaskList &LaneMasks) {
+      RegUnitLaneMasks = LaneMasks;
     }
 
     // Inherit register units from subregisters.
@@ -199,23 +223,14 @@ namespace llvm {
     bool inheritRegUnits(CodeGenRegBank &RegBank);
 
     // Adopt a register unit for pressure tracking.
-    // A unit is adopted iff its unit number is >= NumNativeRegUnits.
-    void adoptRegUnit(unsigned RUID) { RegUnits.push_back(RUID); }
+    // A unit is adopted iff its unit number is >= NativeRegUnits.count().
+    void adoptRegUnit(unsigned RUID) { RegUnits.set(RUID); }
 
     // Get the sum of this register's register unit weights.
     unsigned getWeight(const CodeGenRegBank &RegBank) const;
 
-    // Order CodeGenRegister pointers by EnumValue.
-    struct Less {
-      bool operator()(const CodeGenRegister *A,
-                      const CodeGenRegister *B) const {
-        assert(A && B);
-        return A->EnumValue < B->EnumValue;
-      }
-    };
-
     // Canonically ordered set.
-    typedef std::set<const CodeGenRegister*, Less> Set;
+    typedef std::vector<const CodeGenRegister*> Vec;
 
   private:
     bool SubRegsComplete;
@@ -236,11 +251,19 @@ namespace llvm {
     SuperRegList SuperRegs;
     DenseMap<const CodeGenRegister*, CodeGenSubRegIndex*> SubReg2Idx;
     RegUnitList RegUnits;
+    RegUnitLaneMaskList RegUnitLaneMasks;
   };
 
+  inline bool operator<(const CodeGenRegister &A, const CodeGenRegister &B) {
+    return A.EnumValue < B.EnumValue;
+  }
+
+  inline bool operator==(const CodeGenRegister &A, const CodeGenRegister &B) {
+    return A.EnumValue == B.EnumValue;
+  }
 
   class CodeGenRegisterClass {
-    CodeGenRegister::Set Members;
+    CodeGenRegister::Vec Members;
     // Allocation orders. Order[0] always contains all registers in Members.
     std::vector<SmallVector<Record*, 16> > Orders;
     // Bit mask of sub-classes including this, indexed by their EnumValue.
@@ -257,15 +280,16 @@ namespace llvm {
 
     // Map SubRegIndex -> sub-class.  This is the largest sub-class where all
     // registers have a SubRegIndex sub-register.
-    DenseMap<CodeGenSubRegIndex*, CodeGenRegisterClass*> SubClassWithSubReg;
+    DenseMap<const CodeGenSubRegIndex *, CodeGenRegisterClass *>
+        SubClassWithSubReg;
 
     // Map SubRegIndex -> set of super-reg classes.  This is all register
     // classes SuperRC such that:
     //
     //   R:SubRegIndex in this RC for all R in SuperRC.
     //
-    DenseMap<CodeGenSubRegIndex*,
-             SmallPtrSet<CodeGenRegisterClass*, 8> > SuperRegClasses;
+    DenseMap<const CodeGenSubRegIndex *, SmallPtrSet<CodeGenRegisterClass *, 8>>
+        SuperRegClasses;
 
     // Bit vector of TopoSigs for the registers in this class. This will be
     // very sparse on regular architectures.
@@ -280,6 +304,8 @@ namespace llvm {
     int CopyCost;
     bool Allocatable;
     std::string AltOrderSelect;
+    /// Contains the combination of the lane masks of all subregisters.
+    unsigned LaneMask;
 
     // Return the Record that defined this class, or NULL if the class was
     // created by TableGen.
@@ -314,19 +340,20 @@ namespace llvm {
 
     // getSubClassWithSubReg - Returns the largest sub-class where all
     // registers have a SubIdx sub-register.
-    CodeGenRegisterClass*
-    getSubClassWithSubReg(CodeGenSubRegIndex *SubIdx) const {
+    CodeGenRegisterClass *
+    getSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx) const {
       return SubClassWithSubReg.lookup(SubIdx);
     }
 
-    void setSubClassWithSubReg(CodeGenSubRegIndex *SubIdx,
+    void setSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx,
                                CodeGenRegisterClass *SubRC) {
       SubClassWithSubReg[SubIdx] = SubRC;
     }
 
     // getSuperRegClasses - Returns a bit vector of all register classes
     // containing only SubIdx super-registers of this class.
-    void getSuperRegClasses(CodeGenSubRegIndex *SubIdx, BitVector &Out) const;
+    void getSuperRegClasses(const CodeGenSubRegIndex *SubIdx,
+                            BitVector &Out) const;
 
     // addSuperRegClass - Add a class containing only SudIdx super-registers.
     void addSuperRegClass(CodeGenSubRegIndex *SubIdx,
@@ -357,7 +384,7 @@ namespace llvm {
 
     // Get the set of registers.  This set contains the same registers as
     // getOrder(0).
-    const CodeGenRegister::Set &getMembers() const { return Members; }
+    const CodeGenRegister::Vec &getMembers() const { return Members; }
 
     // Get a bit vector of TopoSigs present in this register class.
     const BitVector &getTopoSigs() const { return TopoSigs; }
@@ -371,11 +398,11 @@ namespace llvm {
     // sub-classes.  Note the ordering provided by this key is not the same as
     // the topological order used for the EnumValues.
     struct Key {
-      const CodeGenRegister::Set *Members;
+      const CodeGenRegister::Vec *Members;
       unsigned SpillSize;
       unsigned SpillAlignment;
 
-      Key(const CodeGenRegister::Set *M, unsigned S = 0, unsigned A = 0)
+      Key(const CodeGenRegister::Vec *M, unsigned S = 0, unsigned A = 0)
         : Members(M), SpillSize(S), SpillAlignment(A) {}
 
       Key(const CodeGenRegisterClass &RC)
@@ -446,8 +473,7 @@ namespace llvm {
   class CodeGenRegBank {
     SetTheory Sets;
 
-    // SubRegIndices.
-    std::vector<CodeGenSubRegIndex*> SubRegIndices;
+    std::deque<CodeGenSubRegIndex> SubRegIndices;
     DenseMap<Record*, CodeGenSubRegIndex*> Def2SubRegIdx;
 
     CodeGenSubRegIndex *createSubRegIndex(StringRef Name, StringRef NameSpace);
@@ -457,7 +483,7 @@ namespace llvm {
     ConcatIdxMap ConcatIdx;
 
     // Registers.
-    std::vector<CodeGenRegister*> Registers;
+    std::deque<CodeGenRegister> Registers;
     StringMap<CodeGenRegister*> RegistersByName;
     DenseMap<Record*, CodeGenRegister*> Def2Reg;
     unsigned NumNativeRegUnits;
@@ -468,7 +494,7 @@ namespace llvm {
     SmallVector<RegUnit, 8> RegUnits;
 
     // Register classes.
-    std::vector<CodeGenRegisterClass*> RegClasses;
+    std::list<CodeGenRegisterClass> RegClasses;
     DenseMap<Record*, CodeGenRegisterClass*> Def2RC;
     typedef std::map<CodeGenRegisterClass::Key, CodeGenRegisterClass*> RCKeyMap;
     RCKeyMap Key2RC;
@@ -494,15 +520,20 @@ namespace llvm {
 
     // Create a synthetic sub-class if it is missing.
     CodeGenRegisterClass *getOrCreateSubClass(const CodeGenRegisterClass *RC,
-                                              const CodeGenRegister::Set *Membs,
+                                              const CodeGenRegister::Vec *Membs,
                                               StringRef Name);
 
     // Infer missing register classes.
     void computeInferredRegisterClasses();
     void inferCommonSubClass(CodeGenRegisterClass *RC);
     void inferSubClassWithSubReg(CodeGenRegisterClass *RC);
-    void inferMatchingSuperRegClass(CodeGenRegisterClass *RC,
-                                    unsigned FirstSubRegRC = 0);
+    void inferMatchingSuperRegClass(CodeGenRegisterClass *RC) {
+      inferMatchingSuperRegClass(RC, RegClasses.begin());
+    }
+
+    void inferMatchingSuperRegClass(
+        CodeGenRegisterClass *RC,
+        std::list<CodeGenRegisterClass>::iterator FirstSubRegRC);
 
     // Iteratively prune unit sets.
     void pruneUnitSets();
@@ -517,7 +548,11 @@ namespace llvm {
     void computeComposites();
 
     // Compute a lane mask for each sub-register index.
-    void computeSubRegIndexLaneMasks();
+    void computeSubRegLaneMasks();
+
+    /// Computes a lane mask for each register unit enumerated by a physical
+    /// register.
+    void computeRegUnitLaneMasks();
 
   public:
     CodeGenRegBank(RecordKeeper&);
@@ -527,7 +562,9 @@ namespace llvm {
     // Sub-register indices. The first NumNamedIndices are defined by the user
     // in the .td files. The rest are synthesized such that all sub-registers
     // have a unique name.
-    ArrayRef<CodeGenSubRegIndex*> getSubRegIndices() { return SubRegIndices; }
+    const std::deque<CodeGenSubRegIndex> &getSubRegIndices() const {
+      return SubRegIndices;
+    }
 
     // Find a SubRegIndex form its Record def.
     CodeGenSubRegIndex *getSubRegIdx(Record*);
@@ -547,7 +584,7 @@ namespace llvm {
       ConcatIdx.insert(std::make_pair(Parts, Idx));
     }
 
-    const std::vector<CodeGenRegister*> &getRegisters() { return Registers; }
+    const std::deque<CodeGenRegister> &getRegisters() { return Registers; }
     const StringMap<CodeGenRegister*> &getRegistersByName() {
       return RegistersByName;
     }
@@ -604,7 +641,9 @@ namespace llvm {
     RegUnit &getRegUnit(unsigned RUID) { return RegUnits[RUID]; }
     const RegUnit &getRegUnit(unsigned RUID) const { return RegUnits[RUID]; }
 
-    ArrayRef<CodeGenRegisterClass*> getRegClasses() const {
+    std::list<CodeGenRegisterClass> &getRegClasses() { return RegClasses; }
+
+    const std::list<CodeGenRegisterClass> &getRegClasses() const {
       return RegClasses;
     }
 
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index 4cf7b5f..bfdf8dc 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -74,11 +74,10 @@ struct InstRegexOp : public SetTheory::Operator {
       }
       RegexList.push_back(Regex(pat));
     }
-    for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
-           E = Target.inst_end(); I != E; ++I) {
+    for (const CodeGenInstruction *Inst : Target.instructions()) {
       for (auto &R : RegexList) {
-        if (R.match((*I)->TheDef->getName()))
-          Elts.insert((*I)->TheDef);
+        if (R.match(Inst->TheDef->getName()))
+          Elts.insert(Inst->TheDef);
       }
     }
   }
@@ -214,9 +213,8 @@ void CodeGenSchedModels::collectSchedRW() {
 
   // Find all SchedReadWrites referenced by instruction defs.
   RecVec SWDefs, SRDefs;
-  for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
-         E = Target.inst_end(); I != E; ++I) {
-    Record *SchedDef = (*I)->TheDef;
+  for (const CodeGenInstruction *Inst : Target.instructions()) {
+    Record *SchedDef = Inst->TheDef;
     if (SchedDef->isValueUnset("SchedRW"))
       continue;
     RecVec RWs = SchedDef->getValueAsListOfDefs("SchedRW");
@@ -509,18 +507,17 @@ void CodeGenSchedModels::collectSchedClasses() {
 
   // Create a SchedClass for each unique combination of itinerary class and
   // SchedRW list.
-  for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
-         E = Target.inst_end(); I != E; ++I) {
-    Record *ItinDef = (*I)->TheDef->getValueAsDef("Itinerary");
+  for (const CodeGenInstruction *Inst : Target.instructions()) {
+    Record *ItinDef = Inst->TheDef->getValueAsDef("Itinerary");
     IdxVec Writes, Reads;
-    if (!(*I)->TheDef->isValueUnset("SchedRW"))
-      findRWs((*I)->TheDef->getValueAsListOfDefs("SchedRW"), Writes, Reads);
+    if (!Inst->TheDef->isValueUnset("SchedRW"))
+      findRWs(Inst->TheDef->getValueAsListOfDefs("SchedRW"), Writes, Reads);
 
     // ProcIdx == 0 indicates the class applies to all processors.
     IdxVec ProcIndices(1, 0);
 
     unsigned SCIdx = addSchedClass(ItinDef, Writes, Reads, ProcIndices);
-    InstrClassMap[(*I)->TheDef] = SCIdx;
+    InstrClassMap[Inst->TheDef] = SCIdx;
   }
   // Create classes for InstRW defs.
   RecVec InstRWDefs = Records.getAllDerivedDefinitions("InstRW");
@@ -535,18 +532,16 @@ void CodeGenSchedModels::collectSchedClasses() {
   if (!EnableDump)
     return;
 
-  for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
-         E = Target.inst_end(); I != E; ++I) {
-
-    std::string InstName = (*I)->TheDef->getName();
-    unsigned SCIdx = InstrClassMap.lookup((*I)->TheDef);
+  for (const CodeGenInstruction *Inst : Target.instructions()) {
+    std::string InstName = Inst->TheDef->getName();
+    unsigned SCIdx = InstrClassMap.lookup(Inst->TheDef);
     if (!SCIdx) {
-      dbgs() << "No machine model for " << (*I)->TheDef->getName() << '\n';
+      dbgs() << "No machine model for " << Inst->TheDef->getName() << '\n';
       continue;
     }
     CodeGenSchedClass &SC = getSchedClass(SCIdx);
     if (SC.ProcIndices[0] != 0)
-      PrintFatalError((*I)->TheDef->getLoc(), "Instruction's sched class "
+      PrintFatalError(Inst->TheDef->getLoc(), "Instruction's sched class "
                       "must not be subtarget specific.");
 
     IdxVec ProcIndices;
@@ -584,7 +579,7 @@ void CodeGenSchedModels::collectSchedClasses() {
     for (std::vector<CodeGenProcModel>::iterator PI = ProcModels.begin(),
            PE = ProcModels.end(); PI != PE; ++PI) {
       if (!std::count(ProcIndices.begin(), ProcIndices.end(), PI->Index))
-        dbgs() << "No machine model for " << (*I)->TheDef->getName()
+        dbgs() << "No machine model for " << Inst->TheDef->getName()
                << " on processor " << PI->ModelName << '\n';
     }
   }
@@ -781,9 +776,7 @@ bool CodeGenSchedModels::hasItineraries() const {
 
 // Gather the processor itineraries.
 void CodeGenSchedModels::collectProcItins() {
-  for (std::vector<CodeGenProcModel>::iterator PI = ProcModels.begin(),
-         PE = ProcModels.end(); PI != PE; ++PI) {
-    CodeGenProcModel &ProcModel = *PI;
+  for (CodeGenProcModel &ProcModel : ProcModels) {
     if (!ProcModel.hasItineraries())
       continue;
 
@@ -1502,8 +1495,7 @@ void CodeGenSchedModels::collectProcResources() {
       PM.ProcResourceDefs.push_back(*RI);
   }
   // Finalize each ProcModel by sorting the record arrays.
-  for (unsigned PIdx = 0, PEnd = ProcModels.size(); PIdx != PEnd; ++PIdx) {
-    CodeGenProcModel &PM = ProcModels[PIdx];
+  for (CodeGenProcModel &PM : ProcModels) {
     std::sort(PM.WriteResDefs.begin(), PM.WriteResDefs.end(),
               LessRecord());
     std::sort(PM.ReadAdvanceDefs.begin(), PM.ReadAdvanceDefs.end(),
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index 597da68..47c8e0b 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -57,6 +57,7 @@ std::string llvm::getEnumName(MVT::SimpleValueType T) {
   case MVT::i32:      return "MVT::i32";
   case MVT::i64:      return "MVT::i64";
   case MVT::i128:     return "MVT::i128";
+  case MVT::Any:      return "MVT::Any";
   case MVT::iAny:     return "MVT::iAny";
   case MVT::fAny:     return "MVT::fAny";
   case MVT::vAny:     return "MVT::vAny";
@@ -133,7 +134,7 @@ std::string llvm::getQualifiedName(const Record *R) {
 /// getTarget - Return the current instance of the Target class.
 ///
 CodeGenTarget::CodeGenTarget(RecordKeeper &records)
-  : Records(records), RegBank(nullptr), SchedModels(nullptr) {
+  : Records(records) {
   std::vector<Record*> Targets = Records.getAllDerivedDefinitions("Target");
   if (Targets.size() == 0)
     PrintFatalError("ERROR: No 'Target' subclasses defined!");
@@ -143,9 +144,6 @@ CodeGenTarget::CodeGenTarget(RecordKeeper &records)
 }
 
 CodeGenTarget::~CodeGenTarget() {
-  DeleteContainerSeconds(Instructions);
-  delete RegBank;
-  delete SchedModels;
 }
 
 const std::string &CodeGenTarget::getName() const {
@@ -153,11 +151,11 @@ const std::string &CodeGenTarget::getName() const {
 }
 
 std::string CodeGenTarget::getInstNamespace() const {
-  for (inst_iterator i = inst_begin(), e = inst_end(); i != e; ++i) {
+  for (const CodeGenInstruction *Inst : instructions()) {
     // Make sure not to pick up "TargetOpcode" by accidentally getting
     // the namespace off the PHI instruction or something.
-    if ((*i)->Namespace != "TargetOpcode")
-      return (*i)->Namespace;
+    if (Inst->Namespace != "TargetOpcode")
+      return Inst->Namespace;
   }
 
   return "";
@@ -211,7 +209,7 @@ Record *CodeGenTarget::getAsmWriter() const {
 
 CodeGenRegBank &CodeGenTarget::getRegBank() const {
   if (!RegBank)
-    RegBank = new CodeGenRegBank(Records);
+    RegBank = llvm::make_unique<CodeGenRegBank>(Records);
   return *RegBank;
 }
 
@@ -234,9 +232,7 @@ std::vector<MVT::SimpleValueType> CodeGenTarget::
 getRegisterVTs(Record *R) const {
   const CodeGenRegister *Reg = getRegBank().getReg(R);
   std::vector<MVT::SimpleValueType> Result;
-  ArrayRef<CodeGenRegisterClass*> RCs = getRegBank().getRegClasses();
-  for (unsigned i = 0, e = RCs.size(); i != e; ++i) {
-    const CodeGenRegisterClass &RC = *RCs[i];
+  for (const auto &RC : getRegBank().getRegClasses()) {
     if (RC.contains(Reg)) {
       ArrayRef<MVT::SimpleValueType> InVTs = RC.getValueTypes();
       Result.insert(Result.end(), InVTs.begin(), InVTs.end());
@@ -251,10 +247,8 @@ getRegisterVTs(Record *R) const {
 
 
 void CodeGenTarget::ReadLegalValueTypes() const {
-  ArrayRef<CodeGenRegisterClass*> RCs = getRegBank().getRegClasses();
-  for (unsigned i = 0, e = RCs.size(); i != e; ++i)
-    for (unsigned ri = 0, re = RCs[i]->VTs.size(); ri != re; ++ri)
-      LegalValueTypes.push_back(RCs[i]->VTs[ri]);
+  for (const auto &RC : getRegBank().getRegClasses())
+    LegalValueTypes.insert(LegalValueTypes.end(), RC.VTs.begin(), RC.VTs.end());
 
   // Remove duplicates.
   std::sort(LegalValueTypes.begin(), LegalValueTypes.end());
@@ -265,7 +259,7 @@ void CodeGenTarget::ReadLegalValueTypes() const {
 
 CodeGenSchedModels &CodeGenTarget::getSchedModels() const {
   if (!SchedModels)
-    SchedModels = new CodeGenSchedModels(Records, *this);
+    SchedModels = llvm::make_unique<CodeGenSchedModels>(Records, *this);
   return *SchedModels;
 }
 
@@ -276,20 +270,20 @@ void CodeGenTarget::ReadInstructions() const {
 
   // Parse the instructions defined in the .td file.
   for (unsigned i = 0, e = Insts.size(); i != e; ++i)
-    Instructions[Insts[i]] = new CodeGenInstruction(Insts[i]);
+    Instructions[Insts[i]] = llvm::make_unique<CodeGenInstruction>(Insts[i]);
 }
 
 static const CodeGenInstruction *
 GetInstByName(const char *Name,
-              const DenseMap<const Record*, CodeGenInstruction*> &Insts,
+              const DenseMap<const Record*,
+                             std::unique_ptr<CodeGenInstruction>> &Insts,
               RecordKeeper &Records) {
   const Record *Rec = Records.getDef(Name);
 
-  DenseMap<const Record*, CodeGenInstruction*>::const_iterator
-    I = Insts.find(Rec);
+  const auto I = Insts.find(Rec);
   if (!Rec || I == Insts.end())
     PrintFatalError(Twine("Could not find '") + Name + "' instruction!");
-  return I->second;
+  return I->second.get();
 }
 
 /// \brief Return all of the instructions defined by the target, ordered by
@@ -302,8 +296,9 @@ void CodeGenTarget::ComputeInstrsByEnum() const {
       "IMPLICIT_DEF", "SUBREG_TO_REG", "COPY_TO_REGCLASS", "DBG_VALUE",
       "REG_SEQUENCE", "COPY",          "BUNDLE",           "LIFETIME_START",
       "LIFETIME_END", "STACKMAP",      "PATCHPOINT",       "LOAD_STACK_GUARD",
+      "STATEPOINT",   "FRAME_ALLOC",
       nullptr};
-  const DenseMap<const Record*, CodeGenInstruction*> &Insts = getInstructions();
+  const auto &Insts = getInstructions();
   for (const char *const *p = FixedInstrs; *p; ++p) {
     const CodeGenInstruction *Instr = GetInstByName(*p, Insts, Records);
     assert(Instr && "Missing target independent instruction");
@@ -312,9 +307,8 @@ void CodeGenTarget::ComputeInstrsByEnum() const {
   }
   unsigned EndOfPredefines = InstrsByEnum.size();
 
-  for (DenseMap<const Record*, CodeGenInstruction*>::const_iterator
-       I = Insts.begin(), E = Insts.end(); I != E; ++I) {
-    const CodeGenInstruction *CGI = I->second;
+  for (const auto &I : Insts) {
+    const CodeGenInstruction *CGI = I.second.get();
     if (CGI->Namespace != "TargetOpcode")
       InstrsByEnum.push_back(CGI);
   }
@@ -344,9 +338,7 @@ void CodeGenTarget::reverseBitsForLittleEndianEncoding() {
     return;
 
   std::vector<Record*> Insts = Records.getAllDerivedDefinitions("Instruction");
-  for (std::vector<Record*>::iterator I = Insts.begin(), E = Insts.end();
-       I != E; ++I) {
-    Record *R = *I;
+  for (Record *R : Insts) {
     if (R->getValueAsString("Namespace") == "TargetOpcode" ||
         R->getValueAsBit("isPseudo"))
       continue;
@@ -539,7 +531,9 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
       // variants with iAny types; otherwise, if the intrinsic is not
       // overloaded, all the types can be specified directly.
       assert(((!TyEl->isSubClassOf("LLVMExtendedType") &&
-               !TyEl->isSubClassOf("LLVMTruncatedType")) ||
+               !TyEl->isSubClassOf("LLVMTruncatedType") &&
+               !TyEl->isSubClassOf("LLVMVectorSameWidth") &&
+               !TyEl->isSubClassOf("LLVMPointerToElt")) ||
               VT == MVT::iAny || VT == MVT::vAny) &&
              "Expected iAny or vAny type");
     } else
diff --git a/utils/TableGen/CodeGenTarget.h b/utils/TableGen/CodeGenTarget.h
index f4e1b6a..24b3851 100644
--- a/utils/TableGen/CodeGenTarget.h
+++ b/utils/TableGen/CodeGenTarget.h
@@ -65,15 +65,16 @@ class CodeGenTarget {
   RecordKeeper &Records;
   Record *TargetRec;
 
-  mutable DenseMap<const Record*, CodeGenInstruction*> Instructions;
-  mutable CodeGenRegBank *RegBank;
+  mutable DenseMap<const Record*,
+                   std::unique_ptr<CodeGenInstruction>> Instructions;
+  mutable std::unique_ptr<CodeGenRegBank> RegBank;
   mutable std::vector<Record*> RegAltNameIndices;
   mutable SmallVector<MVT::SimpleValueType, 8> LegalValueTypes;
   void ReadRegAltNameIndices() const;
   void ReadInstructions() const;
   void ReadLegalValueTypes() const;
 
-  mutable CodeGenSchedModels *SchedModels;
+  mutable std::unique_ptr<CodeGenSchedModels> SchedModels;
 
   mutable std::vector<const CodeGenInstruction*> InstrsByEnum;
 public:
@@ -146,7 +147,8 @@ public:
   CodeGenSchedModels &getSchedModels() const;
 
 private:
-  DenseMap<const Record*, CodeGenInstruction*> &getInstructions() const {
+  DenseMap<const Record*, std::unique_ptr<CodeGenInstruction>> &
+  getInstructions() const {
     if (Instructions.empty()) ReadInstructions();
     return Instructions;
   }
@@ -154,8 +156,7 @@ public:
 
   CodeGenInstruction &getInstruction(const Record *InstRec) const {
     if (Instructions.empty()) ReadInstructions();
-    DenseMap<const Record*, CodeGenInstruction*>::iterator I =
-      Instructions.find(InstRec);
+    auto I = Instructions.find(InstRec);
     assert(I != Instructions.end() && "Not an instruction");
     return *I->second;
   }
diff --git a/utils/TableGen/DAGISelEmitter.cpp b/utils/TableGen/DAGISelEmitter.cpp
index e2e6ab1..0fe3bbd 100644
--- a/utils/TableGen/DAGISelEmitter.cpp
+++ b/utils/TableGen/DAGISelEmitter.cpp
@@ -157,12 +157,12 @@ void DAGISelEmitter::run(raw_ostream &OS) {
     }
   }
 
-  Matcher *TheMatcher = new ScopeMatcher(PatternMatchers);
+  std::unique_ptr<Matcher> TheMatcher =
+    llvm::make_unique<ScopeMatcher>(PatternMatchers);
 
-  TheMatcher = OptimizeMatcher(TheMatcher, CGP);
+  OptimizeMatcher(TheMatcher, CGP);
   //Matcher->dump();
-  EmitMatcherTable(TheMatcher, CGP, OS);
-  delete TheMatcher;
+  EmitMatcherTable(TheMatcher.get(), CGP, OS);
 }
 
 namespace llvm {
diff --git a/utils/TableGen/DAGISelMatcher.h b/utils/TableGen/DAGISelMatcher.h
index b9cb267..9df3b41 100644
--- a/utils/TableGen/DAGISelMatcher.h
+++ b/utils/TableGen/DAGISelMatcher.h
@@ -30,7 +30,8 @@ namespace llvm {
 
 Matcher *ConvertPatternToMatcher(const PatternToMatch &Pattern,unsigned Variant,
                                  const CodeGenDAGPatterns &CGP);
-Matcher *OptimizeMatcher(Matcher *Matcher, const CodeGenDAGPatterns &CGP);
+void OptimizeMatcher(std::unique_ptr<Matcher> &Matcher,
+                     const CodeGenDAGPatterns &CGP);
 void EmitMatcherTable(const Matcher *Matcher, const CodeGenDAGPatterns &CGP,
                       raw_ostream &OS);
 
diff --git a/utils/TableGen/DAGISelMatcherGen.cpp b/utils/TableGen/DAGISelMatcherGen.cpp
index 4a73b00..eb80619 100644
--- a/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/utils/TableGen/DAGISelMatcherGen.cpp
@@ -27,10 +27,8 @@ static MVT::SimpleValueType getRegisterValueType(Record *R,
   bool FoundRC = false;
   MVT::SimpleValueType VT = MVT::Other;
   const CodeGenRegister *Reg = T.getRegBank().getReg(R);
-  ArrayRef<CodeGenRegisterClass*> RCs = T.getRegBank().getRegClasses();
 
-  for (unsigned rc = 0, e = RCs.size(); rc != e; ++rc) {
-    const CodeGenRegisterClass &RC = *RCs[rc];
+  for (const auto &RC : T.getRegBank().getRegClasses()) {
     if (!RC.contains(Reg))
       continue;
 
diff --git a/utils/TableGen/DAGISelMatcherOpt.cpp b/utils/TableGen/DAGISelMatcherOpt.cpp
index 7a22764..c9ee371 100644
--- a/utils/TableGen/DAGISelMatcherOpt.cpp
+++ b/utils/TableGen/DAGISelMatcherOpt.cpp
@@ -511,11 +511,10 @@ static void FactorNodes(std::unique_ptr<Matcher> &MatcherPtr) {
     Scope->resetChild(i, NewOptionsToMatch[i]);
 }
 
-Matcher *llvm::OptimizeMatcher(Matcher *TheMatcher,
-                               const CodeGenDAGPatterns &CGP) {
-  std::unique_ptr<Matcher> MatcherPtr(TheMatcher);
+void
+llvm::OptimizeMatcher(std::unique_ptr<Matcher> &MatcherPtr,
+                      const CodeGenDAGPatterns &CGP) {
   ContractNodes(MatcherPtr, CGP);
   SinkPatternPredicates(MatcherPtr);
   FactorNodes(MatcherPtr);
-  return MatcherPtr.release();
 }
diff --git a/utils/TableGen/DFAPacketizerEmitter.cpp b/utils/TableGen/DFAPacketizerEmitter.cpp
index ea14cb9..5060b6e 100644
--- a/utils/TableGen/DFAPacketizerEmitter.cpp
+++ b/utils/TableGen/DFAPacketizerEmitter.cpp
@@ -472,7 +472,7 @@ void DFAPacketizerEmitter::run(raw_ostream &OS) {
           current->canAddInsnClass(InsnClass)) {
         const State *NewState;
         current->AddInsnClass(InsnClass, NewStateResources);
-        assert(NewStateResources.size() && "New states must be generated");
+        assert(!NewStateResources.empty() && "New states must be generated");
 
         //
         // If we have seen this state before, then do not create a new state.
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index bd83b6c..292a2b1 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -333,8 +333,8 @@ protected:
   // Parent emitter
   const FixedLenDecoderEmitter *Emitter;
 
-  FilterChooser(const FilterChooser &) LLVM_DELETED_FUNCTION;
-  void operator=(const FilterChooser &) LLVM_DELETED_FUNCTION;
+  FilterChooser(const FilterChooser &) = delete;
+  void operator=(const FilterChooser &) = delete;
 public:
 
   FilterChooser(const std::vector<const CodeGenInstruction*> &Insts,
@@ -537,12 +537,10 @@ Filter::~Filter() {
 // instructions.  In order to unambiguously decode the singleton, we need to
 // match the remaining undecoded encoding bits against the singleton.
 void Filter::recurse() {
-  std::map<uint64_t, std::vector<unsigned> >::const_iterator mapIterator;
-
   // Starts by inheriting our parent filter chooser's filter bit values.
   std::vector<bit_value_t> BitValueArray(Owner->FilterBitValues);
 
-  if (VariableInstructions.size()) {
+  if (!VariableInstructions.empty()) {
     // Conservatively marks each segment position as BIT_UNSET.
     for (unsigned bitIndex = 0; bitIndex < NumBits; ++bitIndex)
       BitValueArray[StartBit + bitIndex] = BIT_UNSET;
@@ -564,13 +562,11 @@ void Filter::recurse() {
   }
 
   // Otherwise, create sub choosers.
-  for (mapIterator = FilteredInstructions.begin();
-       mapIterator != FilteredInstructions.end();
-       mapIterator++) {
+  for (const auto &Inst : FilteredInstructions) {
 
     // Marks all the segment positions with either BIT_TRUE or BIT_FALSE.
     for (unsigned bitIndex = 0; bitIndex < NumBits; ++bitIndex) {
-      if (mapIterator->first & (1ULL << bitIndex))
+      if (Inst.first & (1ULL << bitIndex))
         BitValueArray[StartBit + bitIndex] = BIT_TRUE;
       else
         BitValueArray[StartBit + bitIndex] = BIT_FALSE;
@@ -579,8 +575,8 @@ void Filter::recurse() {
     // Delegates to an inferior filter chooser for further processing on this
     // category of instructions.
     FilterChooserMap.insert(std::make_pair(
-        mapIterator->first, llvm::make_unique<FilterChooser>(
-                                Owner->AllInstructions, mapIterator->second,
+        Inst.first, llvm::make_unique<FilterChooser>(
+                                Owner->AllInstructions, Inst.second,
                                 Owner->Operands, BitValueArray, *Owner)));
   }
 }
@@ -616,19 +612,14 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const {
   // A new filter entry begins a new scope for fixup resolution.
   TableInfo.FixupStack.push_back(FixupList());
 
-  std::map<unsigned,
-           std::unique_ptr<const FilterChooser>>::const_iterator filterIterator;
-
   DecoderTable &Table = TableInfo.Table;
 
   size_t PrevFilter = 0;
   bool HasFallthrough = false;
-  for (filterIterator = FilterChooserMap.begin();
-       filterIterator != FilterChooserMap.end();
-       filterIterator++) {
+  for (auto &Filter : FilterChooserMap) {
     // Field value -1 implies a non-empty set of variable instructions.
     // See also recurse().
-    if (filterIterator->first == (unsigned)-1) {
+    if (Filter.first == (unsigned)-1) {
       HasFallthrough = true;
 
       // Each scope should always have at least one filter value to check
@@ -643,7 +634,7 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const {
       Table.push_back(MCD::OPC_FilterValue);
       // Encode and emit the value to filter against.
       uint8_t Buffer[8];
-      unsigned Len = encodeULEB128(filterIterator->first, Buffer);
+      unsigned Len = encodeULEB128(Filter.first, Buffer);
       Table.insert(Table.end(), Buffer, Buffer + Len);
       // Reserve space for the NumToSkip entry. We'll backpatch the value
       // later.
@@ -656,7 +647,7 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const {
     // Now delegate to the sub filter chooser for further decodings.
     // The case may fallthrough, which happens if the remaining well-known
     // encoding bits do not match exactly.
-    filterIterator->second->emitTableEntries(TableInfo);
+    Filter.second->emitTableEntries(TableInfo);
 
     // Now that we've emitted the body of the handler, update the NumToSkip
     // of the filter itself to be able to skip forward when false. Subtract
@@ -685,7 +676,7 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const {
 // Returns the number of fanout produced by the filter.  More fanout implies
 // the filter distinguishes more categories of instructions.
 unsigned Filter::usefulness() const {
-  if (VariableInstructions.size())
+  if (!VariableInstructions.empty())
     return FilteredInstructions.size();
   else
     return FilteredInstructions.size() + 1;
@@ -863,10 +854,9 @@ emitPredicateFunction(formatted_raw_ostream &OS, PredicateSet &Predicates,
     OS.indent(Indentation) << "switch (Idx) {\n";
     OS.indent(Indentation) << "default: llvm_unreachable(\"Invalid index!\");\n";
     unsigned Index = 0;
-    for (PredicateSet::const_iterator I = Predicates.begin(), E = Predicates.end();
-         I != E; ++I, ++Index) {
-      OS.indent(Indentation) << "case " << Index << ":\n";
-      OS.indent(Indentation+2) << "return (" << *I << ");\n";
+    for (const auto &Predicate : Predicates) {
+      OS.indent(Indentation) << "case " << Index++ << ":\n";
+      OS.indent(Indentation+2) << "return (" << Predicate << ");\n";
     }
     OS.indent(Indentation) << "}\n";
   } else {
@@ -892,10 +882,9 @@ emitDecoderFunction(formatted_raw_ostream &OS, DecoderSet &Decoders,
   OS.indent(Indentation) << "switch (Idx) {\n";
   OS.indent(Indentation) << "default: llvm_unreachable(\"Invalid index!\");\n";
   unsigned Index = 0;
-  for (DecoderSet::const_iterator I = Decoders.begin(), E = Decoders.end();
-       I != E; ++I, ++Index) {
-    OS.indent(Indentation) << "case " << Index << ":\n";
-    OS << *I;
+  for (const auto &Decoder : Decoders) {
+    OS.indent(Indentation) << "case " << Index++ << ":\n";
+    OS << Decoder;
     OS.indent(Indentation+2) << "return S;\n";
   }
   OS.indent(Indentation) << "}\n";
@@ -1071,20 +1060,16 @@ void FilterChooser::emitBinaryParser(raw_ostream &o, unsigned &Indentation,
 
 void FilterChooser::emitDecoder(raw_ostream &OS, unsigned Indentation,
                                 unsigned Opc) const {
-  std::map<unsigned, std::vector<OperandInfo> >::const_iterator OpIter =
-    Operands.find(Opc);
-  const std::vector<OperandInfo>& InsnOperands = OpIter->second;
-  for (std::vector<OperandInfo>::const_iterator
-       I = InsnOperands.begin(), E = InsnOperands.end(); I != E; ++I) {
+  for (const auto &Op : Operands.find(Opc)->second) {
     // If a custom instruction decoder was specified, use that.
-    if (I->numFields() == 0 && I->Decoder.size()) {
-      OS.indent(Indentation) << Emitter->GuardPrefix << I->Decoder
+    if (Op.numFields() == 0 && Op.Decoder.size()) {
+      OS.indent(Indentation) << Emitter->GuardPrefix << Op.Decoder
         << "(MI, insn, Address, Decoder)"
         << Emitter->GuardPostfix << "\n";
       break;
     }
 
-    emitBinaryParser(OS, Indentation, *I);
+    emitBinaryParser(OS, Indentation, Op);
   }
 }
 
@@ -1795,7 +1780,7 @@ static bool populateInstruction(CodeGenTarget &Target,
       unsigned NumberOps = CGI.Operands.size();
       while (NumberedOp < NumberOps &&
              (CGI.Operands.isFlatOperandNotEmitted(NumberedOp) ||
-              (NamedOpIndices.size() && NamedOpIndices.count(
+              (!NamedOpIndices.empty() && NamedOpIndices.count(
                 CGI.Operands.getSubOperandNumber(NumberedOp).first))))
         ++NumberedOp;
 
@@ -1864,20 +1849,20 @@ static bool populateInstruction(CodeGenTarget &Target,
   }
 
   // For each operand, see if we can figure out where it is encoded.
-  for (std::vector<std::pair<Init*, std::string> >::const_iterator
-       NI = InOutOperands.begin(), NE = InOutOperands.end(); NI != NE; ++NI) {
-    if (!NumberedInsnOperands[NI->second].empty()) {
+  for (const auto &Op : InOutOperands) {
+    if (!NumberedInsnOperands[Op.second].empty()) {
       InsnOperands.insert(InsnOperands.end(),
-                          NumberedInsnOperands[NI->second].begin(),
-                          NumberedInsnOperands[NI->second].end());
+                          NumberedInsnOperands[Op.second].begin(),
+                          NumberedInsnOperands[Op.second].end());
       continue;
-    } else if (!NumberedInsnOperands[TiedNames[NI->second]].empty()) {
-      if (!NumberedInsnOperandsNoTie.count(TiedNames[NI->second])) {
+    }
+    if (!NumberedInsnOperands[TiedNames[Op.second]].empty()) {
+      if (!NumberedInsnOperandsNoTie.count(TiedNames[Op.second])) {
         // Figure out to which (sub)operand we're tied.
-        unsigned i = CGI.Operands.getOperandNamed(TiedNames[NI->second]);
+        unsigned i = CGI.Operands.getOperandNamed(TiedNames[Op.second]);
         int tiedTo = CGI.Operands[i].getTiedRegister();
         if (tiedTo == -1) {
-          i = CGI.Operands.getOperandNamed(NI->second);
+          i = CGI.Operands.getOperandNamed(Op.second);
           tiedTo = CGI.Operands[i].getTiedRegister();
         }
 
@@ -1885,7 +1870,7 @@ static bool populateInstruction(CodeGenTarget &Target,
           std::pair<unsigned, unsigned> SO =
             CGI.Operands.getSubOperandNumber(tiedTo);
 
-          InsnOperands.push_back(NumberedInsnOperands[TiedNames[NI->second]]
+          InsnOperands.push_back(NumberedInsnOperands[TiedNames[Op.second]]
                                    [SO.second]);
         }
       }
@@ -1899,7 +1884,7 @@ static bool populateInstruction(CodeGenTarget &Target,
     // for decoding register classes.
     // FIXME: This need to be extended to handle instructions with custom
     // decoder methods, and operands with (simple) MIOperandInfo's.
-    TypedInit *TI = cast<TypedInit>(NI->first);
+    TypedInit *TI = cast<TypedInit>(Op.first);
     RecordRecTy *Type = cast<RecordRecTy>(TI->getType());
     Record *TypeRecord = Type->getRecord();
     bool isReg = false;
@@ -1943,8 +1928,8 @@ static bool populateInstruction(CodeGenTarget &Target,
         continue;
       }
 
-      if (Var->getName() != NI->second &&
-          Var->getName() != TiedNames[NI->second]) {
+      if (Var->getName() != Op.second &&
+          Var->getName() != TiedNames[Op.second]) {
         if (Base != ~0U) {
           OpInfo.addField(Base, Width, Offset);
           Base = ~0U;
@@ -2181,12 +2166,10 @@ void FixedLenDecoderEmitter::run(raw_ostream &o) {
   }
 
   DecoderTableInfo TableInfo;
-  for (std::map<std::pair<std::string, unsigned>,
-                std::vector<unsigned> >::const_iterator
-       I = OpcMap.begin(), E = OpcMap.end(); I != E; ++I) {
+  for (const auto &Opc : OpcMap) {
     // Emit the decoder for this namespace+width combination.
-    FilterChooser FC(*NumberedInstructions, I->second, Operands,
-                     8*I->first.second, this);
+    FilterChooser FC(*NumberedInstructions, Opc.second, Operands,
+                     8*Opc.first.second, this);
 
     // The decode table is cleared for each top level decoder function. The
     // predicates and decoders themselves, however, are shared across all
@@ -2207,7 +2190,7 @@ void FixedLenDecoderEmitter::run(raw_ostream &o) {
     TableInfo.Table.push_back(MCD::OPC_Fail);
 
     // Print the table to the output stream.
-    emitTable(OS, TableInfo.Table, 0, FC.getBitWidth(), I->first.first);
+    emitTable(OS, TableInfo.Table, 0, FC.getBitWidth(), Opc.first.first);
     OS.flush();
   }
 
diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp
index 6fdf22d..c5999de 100644
--- a/utils/TableGen/InstrInfoEmitter.cpp
+++ b/utils/TableGen/InstrInfoEmitter.cpp
@@ -143,7 +143,7 @@ InstrInfoEmitter::GetOperandInfo(const CodeGenInstruction &Inst) {
         Res += "|(1<<MCOI::OptionalDef)";
 
       // Fill in operand type.
-      Res += ", MCOI::";
+      Res += ", ";
       assert(!Op.OperandType.empty() && "Invalid operand type.");
       Res += Op.OperandType;
 
@@ -476,7 +476,7 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
      << SchedModels.getSchedClassIdx(Inst) << ",\t"
      << Inst.TheDef->getValueAsInt("Size") << ",\t0";
 
-  // Emit all of the target indepedent flags...
+  // Emit all of the target independent flags...
   if (Inst.isPseudo)           OS << "|(1<<MCID::Pseudo)";
   if (Inst.isReturn)           OS << "|(1<<MCID::Return)";
   if (Inst.isBranch)           OS << "|(1<<MCID::Branch)";
@@ -587,14 +587,16 @@ void InstrInfoEmitter::emitEnums(raw_ostream &OS) {
   for (const CodeGenInstruction *Inst : NumberedInstructions)
     OS << "    " << Inst->TheDef->getName() << "\t= " << Num++ << ",\n";
   OS << "    INSTRUCTION_LIST_END = " << NumberedInstructions.size() << "\n";
-  OS << "  };\n";
+  OS << "  };\n\n";
   OS << "namespace Sched {\n";
   OS << "  enum {\n";
   Num = 0;
   for (const auto &Class : SchedModels.explicit_classes())
     OS << "    " << Class.Name << "\t= " << Num++ << ",\n";
   OS << "    SCHED_LIST_END = " << SchedModels.numInstrSchedClasses() << "\n";
-  OS << "  };\n}\n}\n";
+  OS << "  };\n";
+  OS << "} // End Sched namespace\n";
+  OS << "} // End " << Namespace << " namespace\n";
   OS << "} // End llvm namespace \n";
 
   OS << "#endif // GET_INSTRINFO_ENUM\n\n";
diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp
index 37f6de0..e71c695 100644
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@@ -257,7 +257,10 @@ enum IIT_Info {
   IIT_ANYPTR = 26,
   IIT_V1   = 27,
   IIT_VARARG = 28,
-  IIT_HALF_VEC_ARG = 29
+  IIT_HALF_VEC_ARG = 29,
+  IIT_SAME_VEC_WIDTH_ARG = 30,
+  IIT_PTR_TO_ARG = 31,
+  IIT_VEC_OF_PTRS_TO_ELT = 32
 };
 
 
@@ -289,7 +292,7 @@ static void EncodeFixedValueType(MVT::SimpleValueType VT,
   }
 }
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
 #pragma optimize("",off) // MSVC 2010 optimizer can't deal with this function.
 #endif
 
@@ -305,9 +308,20 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
       Sig.push_back(IIT_TRUNC_ARG);
     else if (R->isSubClassOf("LLVMHalfElementsVectorType"))
       Sig.push_back(IIT_HALF_VEC_ARG);
+    else if (R->isSubClassOf("LLVMVectorSameWidth")) {
+      Sig.push_back(IIT_SAME_VEC_WIDTH_ARG);
+      Sig.push_back((Number << 3) | ArgCodes[Number]);
+      MVT::SimpleValueType VT = getValueType(R->getValueAsDef("ElTy"));
+      EncodeFixedValueType(VT, Sig);
+      return;
+    }
+    else if (R->isSubClassOf("LLVMPointerTo"))
+      Sig.push_back(IIT_PTR_TO_ARG);
+    else if (R->isSubClassOf("LLVMVectorOfPointersToElt"))
+      Sig.push_back(IIT_VEC_OF_PTRS_TO_ELT);
     else
       Sig.push_back(IIT_ARG);
-    return Sig.push_back((Number << 2) | ArgCodes[Number]);
+    return Sig.push_back((Number << 3) | ArgCodes[Number]);
   }
 
   MVT::SimpleValueType VT = getValueType(R->getValueAsDef("VT"));
@@ -318,7 +332,8 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
   case MVT::iPTRAny: ++Tmp; // FALL THROUGH.
   case MVT::vAny: ++Tmp; // FALL THROUGH.
   case MVT::fAny: ++Tmp; // FALL THROUGH.
-  case MVT::iAny: {
+  case MVT::iAny: ++Tmp; // FALL THROUGH.
+  case MVT::Any: {
     // If this is an "any" valuetype, then the type is the type of the next
     // type in the list specified to getIntrinsic().
     Sig.push_back(IIT_ARG);
@@ -327,8 +342,8 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
     unsigned ArgNo = ArgCodes.size();
     ArgCodes.push_back(Tmp);
 
-    // Encode what sort of argument it must be in the low 2 bits of the ArgNo.
-    return Sig.push_back((ArgNo << 2) | Tmp);
+    // Encode what sort of argument it must be in the low 3 bits of the ArgNo.
+    return Sig.push_back((ArgNo << 3) | Tmp);
   }
 
   case MVT::iPTR: {
@@ -366,7 +381,7 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
   EncodeFixedValueType(VT, Sig);
 }
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
 #pragma optimize("",on)
 #endif
 
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 79d08a9..5a6694e 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -53,27 +53,30 @@ public:
   void run(raw_ostream &o);
 
 private:
-  void EmitRegMapping(raw_ostream &o,
-                      const std::vector<CodeGenRegister*> &Regs, bool isCtor);
+  void EmitRegMapping(raw_ostream &o, const std::deque<CodeGenRegister> &Regs,
+                      bool isCtor);
   void EmitRegMappingTables(raw_ostream &o,
-                            const std::vector<CodeGenRegister*> &Regs,
+                            const std::deque<CodeGenRegister> &Regs,
                             bool isCtor);
   void EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
                            const std::string &ClassName);
   void emitComposeSubRegIndices(raw_ostream &OS, CodeGenRegBank &RegBank,
                                 const std::string &ClassName);
+  void emitComposeSubRegIndexLaneMask(raw_ostream &OS, CodeGenRegBank &RegBank,
+                                      const std::string &ClassName);
 };
 } // End anonymous namespace
 
 // runEnums - Print out enum values for all of the registers.
 void RegisterInfoEmitter::runEnums(raw_ostream &OS,
                                    CodeGenTarget &Target, CodeGenRegBank &Bank) {
-  const std::vector<CodeGenRegister*> &Registers = Bank.getRegisters();
+  const auto &Registers = Bank.getRegisters();
 
   // Register enums are stored as uint16_t in the tables. Make sure we'll fit.
   assert(Registers.size() <= 0xffff && "Too many regs to fit in tables");
 
-  std::string Namespace = Registers[0]->TheDef->getValueAsString("Namespace");
+  std::string Namespace =
+      Registers.front().TheDef->getValueAsString("Namespace");
 
   emitSourceFileHeader("Target Register Enum Values", OS);
 
@@ -90,17 +93,16 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
     OS << "namespace " << Namespace << " {\n";
   OS << "enum {\n  NoRegister,\n";
 
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i)
-    OS << "  " << Registers[i]->getName() << " = " <<
-      Registers[i]->EnumValue << ",\n";
-  assert(Registers.size() == Registers[Registers.size()-1]->EnumValue &&
+  for (const auto &Reg : Registers)
+    OS << "  " << Reg.getName() << " = " << Reg.EnumValue << ",\n";
+  assert(Registers.size() == Registers.back().EnumValue &&
          "Register enum value mismatch!");
   OS << "  NUM_TARGET_REGS \t// " << Registers.size()+1 << "\n";
   OS << "};\n";
   if (!Namespace.empty())
     OS << "}\n";
 
-  ArrayRef<CodeGenRegisterClass*> RegisterClasses = Bank.getRegClasses();
+  const auto &RegisterClasses = Bank.getRegClasses();
   if (!RegisterClasses.empty()) {
 
     // RegisterClass enums are stored as uint16_t in the tables.
@@ -111,11 +113,9 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
     if (!Namespace.empty())
       OS << "namespace " << Namespace << " {\n";
     OS << "enum {\n";
-    for (unsigned i = 0, e = RegisterClasses.size(); i != e; ++i) {
-      if (i) OS << ",\n";
-      OS << "  " << RegisterClasses[i]->getName() << "RegClassID";
-      OS << " = " << i;
-    }
+    for (const auto &RC : RegisterClasses)
+      OS << "  " << RC.getName() << "RegClassID"
+         << " = " << RC.EnumValue << ",\n";
     OS << "\n  };\n";
     if (!Namespace.empty())
       OS << "}\n";
@@ -137,25 +137,38 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
       OS << "}\n";
   }
 
-  ArrayRef<CodeGenSubRegIndex*> SubRegIndices = Bank.getSubRegIndices();
+  auto &SubRegIndices = Bank.getSubRegIndices();
   if (!SubRegIndices.empty()) {
     OS << "\n// Subregister indices\n";
-    std::string Namespace =
-      SubRegIndices[0]->getNamespace();
+    std::string Namespace = SubRegIndices.front().getNamespace();
     if (!Namespace.empty())
       OS << "namespace " << Namespace << " {\n";
     OS << "enum {\n  NoSubRegister,\n";
-    for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i)
-      OS << "  " << SubRegIndices[i]->getName() << ",\t// " << i+1 << "\n";
+    unsigned i = 0;
+    for (const auto &Idx : SubRegIndices)
+      OS << "  " << Idx.getName() << ",\t// " << ++i << "\n";
     OS << "  NUM_TARGET_SUBREGS\n};\n";
     if (!Namespace.empty())
       OS << "}\n";
   }
 
-  OS << "} // End llvm namespace \n";
+  OS << "} // End llvm namespace\n";
   OS << "#endif // GET_REGINFO_ENUM\n\n";
 }
 
+static void printInt(raw_ostream &OS, int Val) {
+  OS << Val;
+}
+
+static const char *getMinimalTypeForRange(uint64_t Range) {
+  assert(Range < 0xFFFFFFFFULL && "Enum too large");
+  if (Range > 0xFFFF)
+    return "uint32_t";
+  if (Range > 0xFF)
+    return "uint16_t";
+  return "uint8_t";
+}
+
 void RegisterInfoEmitter::
 EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
                     const std::string &ClassName) {
@@ -166,9 +179,8 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
      << "const RegClassWeight &" << ClassName << "::\n"
      << "getRegClassWeight(const TargetRegisterClass *RC) const {\n"
      << "  static const RegClassWeight RCWeightTable[] = {\n";
-  for (unsigned i = 0, e = NumRCs; i != e; ++i) {
-    const CodeGenRegisterClass &RC = *RegBank.getRegClasses()[i];
-    const CodeGenRegister::Set &Regs = RC.getMembers();
+  for (const auto &RC : RegBank.getRegClasses()) {
+    const CodeGenRegister::Vec &Regs = RC.getMembers();
     if (Regs.empty())
       OS << "    {0, 0";
     else {
@@ -179,7 +191,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
     }
     OS << "},  \t// " << RC.getName() << "\n";
   }
-  OS << "    {0, 0} };\n"
+  OS << "  };\n"
      << "  return RCWeightTable[RC->getID()];\n"
      << "}\n\n";
 
@@ -204,7 +216,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
       assert(RU.Weight < 256 && "RegUnit too heavy");
       OS << RU.Weight << ", ";
     }
-    OS << "0 };\n"
+    OS << "};\n"
        << "  return RUWeightTable[RegUnit];\n";
   }
   else {
@@ -222,8 +234,11 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
      << "const char *" << ClassName << "::\n"
      << "getRegPressureSetName(unsigned Idx) const {\n"
      << "  static const char *PressureNameTable[] = {\n";
+  unsigned MaxRegUnitWeight = 0;
   for (unsigned i = 0; i < NumSets; ++i ) {
-    OS << "    \"" << RegBank.getRegSetAt(i).Name << "\",\n";
+    const RegUnitSet &RegUnits = RegBank.getRegSetAt(i);
+    MaxRegUnitWeight = std::max(MaxRegUnitWeight, RegUnits.Weight);
+    OS << "    \"" << RegUnits.Name << "\",\n";
   }
   OS << "    nullptr };\n"
      << "  return PressureNameTable[Idx];\n"
@@ -233,63 +248,54 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
      << "// This limit must be adjusted dynamically for reserved registers.\n"
      << "unsigned " << ClassName << "::\n"
      << "getRegPressureSetLimit(unsigned Idx) const {\n"
-     << "  static const unsigned PressureLimitTable[] = {\n";
+     << "  static const " << getMinimalTypeForRange(MaxRegUnitWeight)
+     << " PressureLimitTable[] = {\n";
   for (unsigned i = 0; i < NumSets; ++i ) {
     const RegUnitSet &RegUnits = RegBank.getRegSetAt(i);
     OS << "    " << RegUnits.Weight << ",  \t// " << i << ": "
        << RegUnits.Name << "\n";
   }
-  OS << "    0 };\n"
+  OS << "  };\n"
      << "  return PressureLimitTable[Idx];\n"
      << "}\n\n";
 
+  SequenceToOffsetTable<std::vector<int>> PSetsSeqs;
+
   // This table may be larger than NumRCs if some register units needed a list
   // of unit sets that did not correspond to a register class.
   unsigned NumRCUnitSets = RegBank.getNumRegClassPressureSetLists();
-  OS << "/// Table of pressure sets per register class or unit.\n"
-     << "static const int RCSetsTable[] = {\n    ";
-  std::vector<unsigned> RCSetStarts(NumRCUnitSets);
-  for (unsigned i = 0, StartIdx = 0, e = NumRCUnitSets; i != e; ++i) {
-    RCSetStarts[i] = StartIdx;
+  std::vector<std::vector<int>> PSets(NumRCUnitSets);
+
+  for (unsigned i = 0, e = NumRCUnitSets; i != e; ++i) {
     ArrayRef<unsigned> PSetIDs = RegBank.getRCPressureSetIDs(i);
-    std::vector<unsigned> PSets;
-    PSets.reserve(PSetIDs.size());
+    PSets[i].reserve(PSetIDs.size());
     for (ArrayRef<unsigned>::iterator PSetI = PSetIDs.begin(),
            PSetE = PSetIDs.end(); PSetI != PSetE; ++PSetI) {
-      PSets.push_back(RegBank.getRegPressureSet(*PSetI).Order);
-    }
-    std::sort(PSets.begin(), PSets.end());
-    for (unsigned j = 0, e = PSets.size(); j < e; ++j) {
-      OS << PSets[j] << ",  ";
-      ++StartIdx;
+      PSets[i].push_back(RegBank.getRegPressureSet(*PSetI).Order);
     }
-    OS << "-1,  \t// #" << RCSetStarts[i] << " ";
-    if (i < NumRCs)
-      OS << RegBank.getRegClasses()[i]->getName();
-    else {
-      OS << "inferred";
-      for (ArrayRef<unsigned>::iterator PSetI = PSetIDs.begin(),
-             PSetE = PSetIDs.end(); PSetI != PSetE; ++PSetI) {
-        OS << "~" << RegBank.getRegSetAt(*PSetI).Name;
-      }
-    }
-    OS << "\n    ";
-    ++StartIdx;
+    std::sort(PSets[i].begin(), PSets[i].end());
+    PSetsSeqs.add(PSets[i]);
   }
-  OS << "-1 };\n\n";
+
+  PSetsSeqs.layout();
+
+  OS << "/// Table of pressure sets per register class or unit.\n"
+     << "static const int RCSetsTable[] = {\n";
+  PSetsSeqs.emit(OS, printInt, "-1");
+  OS << "};\n\n";
 
   OS << "/// Get the dimensions of register pressure impacted by this "
      << "register class.\n"
      << "/// Returns a -1 terminated array of pressure set IDs\n"
      << "const int* " << ClassName << "::\n"
      << "getRegClassPressureSets(const TargetRegisterClass *RC) const {\n";
-  OS << "  static const unsigned RCSetStartTable[] = {\n    ";
+  OS << "  static const " << getMinimalTypeForRange(PSetsSeqs.size()-1)
+     << " RCSetStartTable[] = {\n    ";
   for (unsigned i = 0, e = NumRCs; i != e; ++i) {
-    OS << RCSetStarts[i] << ",";
+    OS << PSetsSeqs.get(PSets[i]) << ",";
   }
-  OS << "0 };\n"
-     << "  unsigned SetListStart = RCSetStartTable[RC->getID()];\n"
-     << "  return &RCSetsTable[SetListStart];\n"
+  OS << "};\n"
+     << "  return &RCSetsTable[RCSetStartTable[RC->getID()]];\n"
      << "}\n\n";
 
   OS << "/// Get the dimensions of register pressure impacted by this "
@@ -299,29 +305,28 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
      << "getRegUnitPressureSets(unsigned RegUnit) const {\n"
      << "  assert(RegUnit < " << RegBank.getNumNativeRegUnits()
      << " && \"invalid register unit\");\n";
-  OS << "  static const unsigned RUSetStartTable[] = {\n    ";
+  OS << "  static const " << getMinimalTypeForRange(PSetsSeqs.size()-1)
+     << " RUSetStartTable[] = {\n    ";
   for (unsigned UnitIdx = 0, UnitEnd = RegBank.getNumNativeRegUnits();
        UnitIdx < UnitEnd; ++UnitIdx) {
-    OS << RCSetStarts[RegBank.getRegUnit(UnitIdx).RegClassUnitSetsIdx] << ",";
+    OS << PSetsSeqs.get(PSets[RegBank.getRegUnit(UnitIdx).RegClassUnitSetsIdx])
+       << ",";
   }
-  OS << "0 };\n"
-     << "  unsigned SetListStart = RUSetStartTable[RegUnit];\n"
-     << "  return &RCSetsTable[SetListStart];\n"
+  OS << "};\n"
+     << "  return &RCSetsTable[RUSetStartTable[RegUnit]];\n"
      << "}\n\n";
 }
 
-void
-RegisterInfoEmitter::EmitRegMappingTables(raw_ostream &OS,
-                                       const std::vector<CodeGenRegister*> &Regs,
-                                          bool isCtor) {
+void RegisterInfoEmitter::EmitRegMappingTables(
+    raw_ostream &OS, const std::deque<CodeGenRegister> &Regs, bool isCtor) {
   // Collect all information about dwarf register numbers
   typedef std::map<Record*, std::vector<int64_t>, LessRecordRegister> DwarfRegNumsMapTy;
   DwarfRegNumsMapTy DwarfRegNums;
 
   // First, just pull all provided information to the map
   unsigned maxLength = 0;
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    Record *Reg = Regs[i]->TheDef;
+  for (auto &RE : Regs) {
+    Record *Reg = RE.TheDef;
     std::vector<int64_t> RegNums = Reg->getValueAsListOfInts("DwarfNumbers");
     maxLength = std::max((size_t)maxLength, RegNums.size());
     if (DwarfRegNums.count(Reg))
@@ -339,7 +344,7 @@ RegisterInfoEmitter::EmitRegMappingTables(raw_ostream &OS,
     for (unsigned i = I->second.size(), e = maxLength; i != e; ++i)
       I->second.push_back(-1);
 
-  std::string Namespace = Regs[0]->TheDef->getValueAsString("Namespace");
+  std::string Namespace = Regs.front().TheDef->getValueAsString("Namespace");
 
   OS << "// " << Namespace << " Dwarf<->LLVM register mappings.\n";
 
@@ -387,8 +392,8 @@ RegisterInfoEmitter::EmitRegMappingTables(raw_ostream &OS,
     }
   }
 
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    Record *Reg = Regs[i]->TheDef;
+  for (auto &RE : Regs) {
+    Record *Reg = RE.TheDef;
     const RecordVal *V = Reg->getValue("DwarfAlias");
     if (!V || !V->getValue())
       continue;
@@ -435,15 +440,13 @@ RegisterInfoEmitter::EmitRegMappingTables(raw_ostream &OS,
   }
 }
 
-void
-RegisterInfoEmitter::EmitRegMapping(raw_ostream &OS,
-                                    const std::vector<CodeGenRegister*> &Regs,
-                                    bool isCtor) {
+void RegisterInfoEmitter::EmitRegMapping(
+    raw_ostream &OS, const std::deque<CodeGenRegister> &Regs, bool isCtor) {
   // Emit the initializer so the tables from EmitRegMappingTables get wired up
   // to the MCRegisterInfo object.
   unsigned maxLength = 0;
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    Record *Reg = Regs[i]->TheDef;
+  for (auto &RE : Regs) {
+    Record *Reg = RE.TheDef;
     maxLength = std::max((size_t)maxLength,
                          Reg->getValueAsListOfInts("DwarfNumbers").size());
   }
@@ -451,7 +454,7 @@ RegisterInfoEmitter::EmitRegMapping(raw_ostream &OS,
   if (!maxLength)
     return;
 
-  std::string Namespace = Regs[0]->TheDef->getValueAsString("Namespace");
+  std::string Namespace = Regs.front().TheDef->getValueAsString("Namespace");
 
   // Emit reverse information about the dwarf register numbers.
   for (unsigned j = 0; j < 2; ++j) {
@@ -565,15 +568,16 @@ static void printSubRegIndex(raw_ostream &OS, const CodeGenSubRegIndex *Idx) {
 // 0 differential which means we can't encode repeated elements.
 
 typedef SmallVector<uint16_t, 4> DiffVec;
+typedef SmallVector<unsigned, 4> MaskVec;
 
 // Differentially encode a sequence of numbers into V. The starting value and
 // terminating 0 are not added to V, so it will have the same size as List.
 static
-DiffVec &diffEncode(DiffVec &V, unsigned InitVal, ArrayRef<unsigned> List) {
+DiffVec &diffEncode(DiffVec &V, unsigned InitVal, SparseBitVector<> List) {
   assert(V.empty() && "Clear DiffVec before diffEncode.");
   uint16_t Val = uint16_t(InitVal);
-  for (unsigned i = 0; i != List.size(); ++i) {
-    uint16_t Cur = List[i];
+
+  for (uint16_t Cur : List) {
     V.push_back(Cur - Val);
     Val = Cur;
   }
@@ -597,6 +601,10 @@ static void printDiff16(raw_ostream &OS, uint16_t Val) {
   OS << Val;
 }
 
+static void printMask(raw_ostream &OS, unsigned Val) {
+  OS << format("0x%08X", Val);
+}
+
 // Try to combine Idx's compose map into Vec if it is compatible.
 // Return false if it's not possible.
 static bool combine(const CodeGenSubRegIndex *Idx,
@@ -616,20 +624,11 @@ static bool combine(const CodeGenSubRegIndex *Idx,
   return true;
 }
 
-static const char *getMinimalTypeForRange(uint64_t Range) {
-  assert(Range < 0xFFFFFFFFULL && "Enum too large");
-  if (Range > 0xFFFF)
-    return "uint32_t";
-  if (Range > 0xFF)
-    return "uint16_t";
-  return "uint8_t";
-}
-
 void
 RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
                                               CodeGenRegBank &RegBank,
                                               const std::string &ClName) {
-  ArrayRef<CodeGenSubRegIndex*> SubRegIndices = RegBank.getSubRegIndices();
+  const auto &SubRegIndices = RegBank.getSubRegIndices();
   OS << "unsigned " << ClName
      << "::composeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const {\n";
 
@@ -644,10 +643,12 @@ RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
   SmallVector<unsigned, 4> RowMap;
   SmallVector<SmallVector<CodeGenSubRegIndex*, 4>, 4> Rows;
 
-  for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
+  auto SubRegIndicesSize =
+      std::distance(SubRegIndices.begin(), SubRegIndices.end());
+  for (const auto &Idx : SubRegIndices) {
     unsigned Found = ~0u;
     for (unsigned r = 0, re = Rows.size(); r != re; ++r) {
-      if (combine(SubRegIndices[i], Rows[r])) {
+      if (combine(&Idx, Rows[r])) {
         Found = r;
         break;
       }
@@ -655,27 +656,27 @@ RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
     if (Found == ~0u) {
       Found = Rows.size();
       Rows.resize(Found + 1);
-      Rows.back().resize(SubRegIndices.size());
-      combine(SubRegIndices[i], Rows.back());
+      Rows.back().resize(SubRegIndicesSize);
+      combine(&Idx, Rows.back());
     }
     RowMap.push_back(Found);
   }
 
   // Output the row map if there is multiple rows.
   if (Rows.size() > 1) {
-    OS << "  static const " << getMinimalTypeForRange(Rows.size())
-       << " RowMap[" << SubRegIndices.size() << "] = {\n    ";
-    for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i)
+    OS << "  static const " << getMinimalTypeForRange(Rows.size()) << " RowMap["
+       << SubRegIndicesSize << "] = {\n    ";
+    for (unsigned i = 0, e = SubRegIndicesSize; i != e; ++i)
       OS << RowMap[i] << ", ";
     OS << "\n  };\n";
   }
 
   // Output the rows.
-  OS << "  static const " << getMinimalTypeForRange(SubRegIndices.size()+1)
-     << " Rows[" << Rows.size() << "][" << SubRegIndices.size() << "] = {\n";
+  OS << "  static const " << getMinimalTypeForRange(SubRegIndicesSize + 1)
+     << " Rows[" << Rows.size() << "][" << SubRegIndicesSize << "] = {\n";
   for (unsigned r = 0, re = Rows.size(); r != re; ++r) {
     OS << "    { ";
-    for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i)
+    for (unsigned i = 0, e = SubRegIndicesSize; i != e; ++i)
       if (Rows[r][i])
         OS << Rows[r][i]->EnumValue << ", ";
       else
@@ -684,8 +685,8 @@ RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
   }
   OS << "  };\n\n";
 
-  OS << "  --IdxA; assert(IdxA < " << SubRegIndices.size() << ");\n"
-     << "  --IdxB; assert(IdxB < " << SubRegIndices.size() << ");\n";
+  OS << "  --IdxA; assert(IdxA < " << SubRegIndicesSize << ");\n"
+     << "  --IdxB; assert(IdxB < " << SubRegIndicesSize << ");\n";
   if (Rows.size() > 1)
     OS << "  return Rows[RowMap[IdxA]][IdxB];\n";
   else
@@ -693,6 +694,95 @@ RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
   OS << "}\n\n";
 }
 
+void
+RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
+                                                    CodeGenRegBank &RegBank,
+                                                    const std::string &ClName) {
+  // See the comments in computeSubRegLaneMasks() for our goal here.
+  const auto &SubRegIndices = RegBank.getSubRegIndices();
+
+  // Create a list of Mask+Rotate operations, with equivalent entries merged.
+  SmallVector<unsigned, 4> SubReg2SequenceIndexMap;
+  SmallVector<SmallVector<MaskRolPair, 1>, 4> Sequences;
+  for (const auto &Idx : SubRegIndices) {
+    const SmallVector<MaskRolPair, 1> &IdxSequence
+      = Idx.CompositionLaneMaskTransform;
+
+    unsigned Found = ~0u;
+    unsigned SIdx = 0;
+    unsigned NextSIdx;
+    for (size_t s = 0, se = Sequences.size(); s != se; ++s, SIdx = NextSIdx) {
+      SmallVectorImpl<MaskRolPair> &Sequence = Sequences[s];
+      NextSIdx = SIdx + Sequence.size() + 1;
+      if (Sequence.size() != IdxSequence.size())
+        continue;
+      bool Identical = true;
+      for (size_t o = 0, oe = Sequence.size(); o != oe; ++o) {
+        if (Sequence[o] != IdxSequence[o]) {
+          Identical = false;
+          break;
+        }
+      }
+      if (Identical) {
+        Found = SIdx;
+        break;
+      }
+    }
+    if (Found == ~0u) {
+      Sequences.push_back(IdxSequence);
+      Found = SIdx;
+    }
+    SubReg2SequenceIndexMap.push_back(Found);
+  }
+
+  OS << "unsigned " << ClName
+     << "::composeSubRegIndexLaneMaskImpl(unsigned IdxA, unsigned LaneMask)"
+        " const {\n";
+
+  OS << "  struct MaskRolOp {\n"
+        "    unsigned Mask;\n"
+        "    uint8_t  RotateLeft;\n"
+        "  };\n"
+        "  static const MaskRolOp Seqs[] = {\n";
+  unsigned Idx = 0;
+  for (size_t s = 0, se = Sequences.size(); s != se; ++s) {
+    OS << "    ";
+    const SmallVectorImpl<MaskRolPair> &Sequence = Sequences[s];
+    for (size_t p = 0, pe = Sequence.size(); p != pe; ++p) {
+      const MaskRolPair &P = Sequence[p];
+      OS << format("{ 0x%08X, %2u }, ", P.Mask, P.RotateLeft);
+    }
+    OS << "{ 0, 0 }";
+    if (s+1 != se)
+      OS << ", ";
+    OS << "  // Sequence " << Idx << "\n";
+    Idx += Sequence.size() + 1;
+  }
+  OS << "  };\n"
+        "  static const MaskRolOp *CompositeSequences[] = {\n";
+  for (size_t i = 0, e = SubRegIndices.size(); i != e; ++i) {
+    OS << "    ";
+    unsigned Idx = SubReg2SequenceIndexMap[i];
+    OS << format("&Seqs[%u]", Idx);
+    if (i+1 != e)
+      OS << ",";
+    OS << " // to " << SubRegIndices[i].getName() << "\n";
+  }
+  OS << "  };\n\n";
+
+  OS << "  --IdxA; assert(IdxA < " << SubRegIndices.size()
+     << " && \"Subregister index out of bounds\");\n"
+        "  unsigned Result = 0;\n"
+        "  for (const MaskRolOp *Ops = CompositeSequences[IdxA]; Ops->Mask != 0; ++Ops)"
+        " {\n"
+        "    unsigned Masked = LaneMask & Ops->Mask;\n"
+        "    Result |= (Masked << Ops->RotateLeft) & 0xFFFFFFFF;\n"
+        "    Result |= (Masked >> ((32 - Ops->RotateLeft) & 0x1F));\n"
+        "  }\n"
+        "  return Result;\n"
+        "}\n";
+}
+
 //
 // runMCDesc - Print out MC register descriptions.
 //
@@ -704,9 +794,9 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   OS << "\n#ifdef GET_REGINFO_MC_DESC\n";
   OS << "#undef GET_REGINFO_MC_DESC\n";
 
-  const std::vector<CodeGenRegister*> &Regs = RegBank.getRegisters();
+  const auto &Regs = RegBank.getRegisters();
 
-  ArrayRef<CodeGenSubRegIndex*> SubRegIndices = RegBank.getSubRegIndices();
+  auto &SubRegIndices = RegBank.getSubRegIndices();
   // The lists of sub-registers and super-registers go in the same array.  That
   // allows us to share suffixes.
   typedef std::vector<const CodeGenRegister*> RegVec;
@@ -718,36 +808,40 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   SmallVector<DiffVec, 4> RegUnitLists(Regs.size());
   SmallVector<unsigned, 4> RegUnitInitScale(Regs.size());
 
+  // List of lane masks accompanying register unit sequences.
+  SequenceToOffsetTable<MaskVec> LaneMaskSeqs;
+  SmallVector<MaskVec, 4> RegUnitLaneMasks(Regs.size());
+
   // Keep track of sub-register names as well. These are not differentially
   // encoded.
   typedef SmallVector<const CodeGenSubRegIndex*, 4> SubRegIdxVec;
-  SequenceToOffsetTable<SubRegIdxVec, CodeGenSubRegIndex::Less> SubRegIdxSeqs;
+  SequenceToOffsetTable<SubRegIdxVec, deref<llvm::less>> SubRegIdxSeqs;
   SmallVector<SubRegIdxVec, 4> SubRegIdxLists(Regs.size());
 
   SequenceToOffsetTable<std::string> RegStrings;
 
   // Precompute register lists for the SequenceToOffsetTable.
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    const CodeGenRegister *Reg = Regs[i];
-
-    RegStrings.add(Reg->getName());
+  unsigned i = 0;
+  for (auto I = Regs.begin(), E = Regs.end(); I != E; ++I, ++i) {
+    const auto &Reg = *I;
+    RegStrings.add(Reg.getName());
 
     // Compute the ordered sub-register list.
     SetVector<const CodeGenRegister*> SR;
-    Reg->addSubRegsPreOrder(SR, RegBank);
-    diffEncode(SubRegLists[i], Reg->EnumValue, SR.begin(), SR.end());
+    Reg.addSubRegsPreOrder(SR, RegBank);
+    diffEncode(SubRegLists[i], Reg.EnumValue, SR.begin(), SR.end());
     DiffSeqs.add(SubRegLists[i]);
 
     // Compute the corresponding sub-register indexes.
     SubRegIdxVec &SRIs = SubRegIdxLists[i];
     for (unsigned j = 0, je = SR.size(); j != je; ++j)
-      SRIs.push_back(Reg->getSubRegIndex(SR[j]));
+      SRIs.push_back(Reg.getSubRegIndex(SR[j]));
     SubRegIdxSeqs.add(SRIs);
 
     // Super-registers are already computed.
-    const RegVec &SuperRegList = Reg->getSuperRegs();
-    diffEncode(SuperRegLists[i], Reg->EnumValue,
-               SuperRegList.begin(), SuperRegList.end());
+    const RegVec &SuperRegList = Reg.getSuperRegs();
+    diffEncode(SuperRegLists[i], Reg.EnumValue, SuperRegList.begin(),
+               SuperRegList.end());
     DiffSeqs.add(SuperRegLists[i]);
 
     // Differentially encode the register unit list, seeded by register number.
@@ -762,22 +856,36 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
     //
     // Check the neighboring registers for arithmetic progressions.
     unsigned ScaleA = ~0u, ScaleB = ~0u;
-    ArrayRef<unsigned> RUs = Reg->getNativeRegUnits();
-    if (i > 0 && Regs[i-1]->getNativeRegUnits().size() == RUs.size())
-      ScaleB = RUs.front() - Regs[i-1]->getNativeRegUnits().front();
-    if (i+1 != Regs.size() &&
-        Regs[i+1]->getNativeRegUnits().size() == RUs.size())
-      ScaleA = Regs[i+1]->getNativeRegUnits().front() - RUs.front();
+    SparseBitVector<> RUs = Reg.getNativeRegUnits();
+    if (I != Regs.begin() &&
+        std::prev(I)->getNativeRegUnits().count() == RUs.count())
+      ScaleB = *RUs.begin() - *std::prev(I)->getNativeRegUnits().begin();
+    if (std::next(I) != Regs.end() &&
+        std::next(I)->getNativeRegUnits().count() == RUs.count())
+      ScaleA = *std::next(I)->getNativeRegUnits().begin() - *RUs.begin();
     unsigned Scale = std::min(ScaleB, ScaleA);
     // Default the scale to 0 if it can't be encoded in 4 bits.
     if (Scale >= 16)
       Scale = 0;
     RegUnitInitScale[i] = Scale;
-    DiffSeqs.add(diffEncode(RegUnitLists[i], Scale * Reg->EnumValue, RUs));
+    DiffSeqs.add(diffEncode(RegUnitLists[i], Scale * Reg.EnumValue, RUs));
+
+    const auto &RUMasks = Reg.getRegUnitLaneMasks();
+    MaskVec &LaneMaskVec = RegUnitLaneMasks[i];
+    assert(LaneMaskVec.empty());
+    LaneMaskVec.insert(LaneMaskVec.begin(), RUMasks.begin(), RUMasks.end());
+    // Terminator mask should not be used inside of the list.
+#ifndef NDEBUG
+    for (unsigned M : LaneMaskVec) {
+      assert(M != ~0u && "terminator mask should not be part of the list");
+    }
+#endif
+    LaneMaskSeqs.add(LaneMaskVec);
   }
 
   // Compute the final layout of the sequence table.
   DiffSeqs.layout();
+  LaneMaskSeqs.layout();
   SubRegIdxSeqs.layout();
 
   OS << "namespace llvm {\n\n";
@@ -789,6 +897,11 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   DiffSeqs.emit(OS, printDiff16);
   OS << "};\n\n";
 
+  // Emit the shared table of regunit lane mask sequences.
+  OS << "extern const unsigned " << TargetName << "LaneMaskLists[] = {\n";
+  LaneMaskSeqs.emit(OS, printMask, "~0u");
+  OS << "};\n\n";
+
   // Emit the table of sub-register indexes.
   OS << "extern const uint16_t " << TargetName << "SubRegIdxLists[] = {\n";
   SubRegIdxSeqs.emit(OS, printSubRegIndex);
@@ -798,12 +911,9 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   OS << "extern const MCRegisterInfo::SubRegCoveredBits "
      << TargetName << "SubRegIdxRanges[] = {\n";
   OS << "  { " << (uint16_t)-1 << ", " << (uint16_t)-1 << " },\n";
-  for (ArrayRef<CodeGenSubRegIndex*>::const_iterator
-         SRI = SubRegIndices.begin(), SRE = SubRegIndices.end();
-         SRI != SRE; ++SRI) {
-    OS << "  { " << (*SRI)->Offset << ", "
-                 << (*SRI)->Size
-       << " },\t// " << (*SRI)->getName() << "\n";
+  for (const auto &Idx : SubRegIndices) {
+    OS << "  { " << Idx.Offset << ", " << Idx.Size << " },\t// "
+       << Idx.getName() << "\n";
   }
   OS << "};\n\n";
 
@@ -815,16 +925,17 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
 
   OS << "extern const MCRegisterDesc " << TargetName
      << "RegDesc[] = { // Descriptors\n";
-  OS << "  { " << RegStrings.get("") << ", 0, 0, 0, 0 },\n";
+  OS << "  { " << RegStrings.get("") << ", 0, 0, 0, 0, 0 },\n";
 
   // Emit the register descriptors now.
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    const CodeGenRegister *Reg = Regs[i];
-    OS << "  { " << RegStrings.get(Reg->getName()) << ", "
-       << DiffSeqs.get(SubRegLists[i]) << ", "
-       << DiffSeqs.get(SuperRegLists[i]) << ", "
-       << SubRegIdxSeqs.get(SubRegIdxLists[i]) << ", "
-       << (DiffSeqs.get(RegUnitLists[i])*16 + RegUnitInitScale[i]) << " },\n";
+  i = 0;
+  for (const auto &Reg : Regs) {
+    OS << "  { " << RegStrings.get(Reg.getName()) << ", "
+       << DiffSeqs.get(SubRegLists[i]) << ", " << DiffSeqs.get(SuperRegLists[i])
+       << ", " << SubRegIdxSeqs.get(SubRegIdxLists[i]) << ", "
+       << (DiffSeqs.get(RegUnitLists[i]) * 16 + RegUnitInitScale[i]) << ", "
+       << LaneMaskSeqs.get(RegUnitLaneMasks[i]) << " },\n";
+    ++i;
   }
   OS << "};\n\n";      // End of register descriptors...
 
@@ -842,7 +953,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   }
   OS << "};\n\n";
 
-  ArrayRef<CodeGenRegisterClass*> RegisterClasses = RegBank.getRegClasses();
+  const auto &RegisterClasses = RegBank.getRegClasses();
 
   // Loop over all of the register classes... emitting each one.
   OS << "namespace {     // Register classes...\n";
@@ -850,8 +961,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   SequenceToOffsetTable<std::string> RegClassStrings;
 
   // Emit the register enum value arrays for each RegisterClass
-  for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
-    const CodeGenRegisterClass &RC = *RegisterClasses[rc];
+  for (const auto &RC : RegisterClasses) {
     ArrayRef<Record*> Order = RC.getOrder();
 
     // Give the register class a legal C name if it's anonymous.
@@ -891,9 +1001,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   OS << "extern const MCRegisterClass " << TargetName
      << "MCRegisterClasses[] = {\n";
 
-  for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
-    const CodeGenRegisterClass &RC = *RegisterClasses[rc];
-
+  for (const auto &RC : RegisterClasses) {
     // Asserts to make sure values will fit in table assuming types from
     // MCRegisterInfo.h
     assert((RC.SpillSize/8) <= 0xffff && "SpillSize too large.");
@@ -919,8 +1027,8 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   OS << "RegEncodingTable[] = {\n";
   // Add entry for NoRegister
   OS << "  0,\n";
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    Record *Reg = Regs[i]->TheDef;
+  for (const auto &RE : Regs) {
+    Record *Reg = RE.TheDef;
     BitsInit *BI = Reg->getValueAsBitsInit("HWEncoding");
     uint64_t Value = 0;
     for (unsigned b = 0, be = BI->getNumBits(); b != be; ++b) {
@@ -934,25 +1042,23 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   // MCRegisterInfo initialization routine.
   OS << "static inline void Init" << TargetName
      << "MCRegisterInfo(MCRegisterInfo *RI, unsigned RA, "
-     << "unsigned DwarfFlavour = 0, unsigned EHFlavour = 0, unsigned PC = 0) {\n"
+     << "unsigned DwarfFlavour = 0, unsigned EHFlavour = 0, unsigned PC = 0) "
+        "{\n"
      << "  RI->InitMCRegisterInfo(" << TargetName << "RegDesc, "
-     << Regs.size()+1 << ", RA, PC, " << TargetName << "MCRegisterClasses, "
-     << RegisterClasses.size() << ", "
-     << TargetName << "RegUnitRoots, "
-     << RegBank.getNumNativeRegUnits() << ", "
-     << TargetName << "RegDiffLists, "
-     << TargetName << "RegStrings, "
-     << TargetName << "RegClassStrings, "
-     << TargetName << "SubRegIdxLists, "
-     << (SubRegIndices.size() + 1) << ",\n"
-     << TargetName << "SubRegIdxRanges, "
-     << TargetName << "RegEncodingTable);\n\n";
+     << Regs.size() + 1 << ", RA, PC, " << TargetName << "MCRegisterClasses, "
+     << RegisterClasses.size() << ", " << TargetName << "RegUnitRoots, "
+     << RegBank.getNumNativeRegUnits() << ", " << TargetName << "RegDiffLists, "
+     << TargetName << "LaneMaskLists, " << TargetName << "RegStrings, "
+     << TargetName << "RegClassStrings, " << TargetName << "SubRegIdxLists, "
+     << (std::distance(SubRegIndices.begin(), SubRegIndices.end()) + 1) << ",\n"
+     << TargetName << "SubRegIdxRanges, " << TargetName
+     << "RegEncodingTable);\n\n";
 
   EmitRegMapping(OS, Regs, false);
 
   OS << "}\n\n";
 
-  OS << "} // End llvm namespace \n";
+  OS << "} // End llvm namespace\n";
   OS << "#endif // GET_REGINFO_MC_DESC\n\n";
 }
 
@@ -979,6 +1085,8 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
   if (!RegBank.getSubRegIndices().empty()) {
     OS << "  unsigned composeSubRegIndicesImpl"
        << "(unsigned, unsigned) const override;\n"
+       << "  unsigned composeSubRegIndexLaneMaskImpl"
+       << "(unsigned, unsigned) const override;\n"
        << "  const TargetRegisterClass *getSubClassWithSubReg"
        << "(const TargetRegisterClass*, unsigned) const override;\n";
   }
@@ -994,14 +1102,13 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
      << "unsigned RegUnit) const override;\n"
      << "};\n\n";
 
-  ArrayRef<CodeGenRegisterClass*> RegisterClasses = RegBank.getRegClasses();
+  const auto &RegisterClasses = RegBank.getRegClasses();
 
   if (!RegisterClasses.empty()) {
-    OS << "namespace " << RegisterClasses[0]->Namespace
+    OS << "namespace " << RegisterClasses.front().Namespace
        << " { // Register classes\n";
 
-    for (unsigned i = 0, e = RegisterClasses.size(); i != e; ++i) {
-      const CodeGenRegisterClass &RC = *RegisterClasses[i];
+    for (const auto &RC : RegisterClasses) {
       const std::string &Name = RC.getName();
 
       // Output the extern for the instance.
@@ -1009,7 +1116,7 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
     }
     OS << "} // end of namespace " << TargetName << "\n\n";
   }
-  OS << "} // End llvm namespace \n";
+  OS << "} // End llvm namespace\n";
   OS << "#endif // GET_REGINFO_HEADER\n\n";
 }
 
@@ -1031,15 +1138,14 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
      << "MCRegisterClasses[];\n";
 
   // Start out by emitting each of the register classes.
-  ArrayRef<CodeGenRegisterClass*> RegisterClasses = RegBank.getRegClasses();
-  ArrayRef<CodeGenSubRegIndex*> SubRegIndices = RegBank.getSubRegIndices();
+  const auto &RegisterClasses = RegBank.getRegClasses();
+  const auto &SubRegIndices = RegBank.getSubRegIndices();
 
   // Collect all registers belonging to any allocatable class.
   std::set<Record*> AllocatableRegs;
 
   // Collect allocatable registers.
-  for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
-    const CodeGenRegisterClass &RC = *RegisterClasses[rc];
+  for (const auto &RC : RegisterClasses) {
     ArrayRef<Record*> Order = RC.getOrder();
 
     if (RC.Allocatable)
@@ -1048,8 +1154,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
 
   // Build a shared array of value types.
   SequenceToOffsetTable<SmallVector<MVT::SimpleValueType, 4> > VTSeqs;
-  for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc)
-    VTSeqs.add(RegisterClasses[rc]->VTs);
+  for (const auto &RC : RegisterClasses)
+    VTSeqs.add(RC.VTs);
   VTSeqs.layout();
   OS << "\nstatic const MVT::SimpleValueType VTLists[] = {\n";
   VTSeqs.emit(OS, printSimpleValueType, "MVT::Other");
@@ -1057,18 +1163,17 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
 
   // Emit SubRegIndex names, skipping 0.
   OS << "\nstatic const char *const SubRegIndexNameTable[] = { \"";
-  for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
-    OS << SubRegIndices[i]->getName();
-    if (i + 1 != e)
-      OS << "\", \"";
+
+  for (const auto &Idx : SubRegIndices) {
+    OS << Idx.getName();
+    OS << "\", \"";
   }
   OS << "\" };\n\n";
 
   // Emit SubRegIndex lane masks, including 0.
   OS << "\nstatic const unsigned SubRegIndexLaneMaskTable[] = {\n  ~0u,\n";
-  for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
-    OS << format("  0x%08x, // ", SubRegIndices[i]->LaneMask)
-       << SubRegIndices[i]->getName() << '\n';
+  for (const auto &Idx : SubRegIndices) {
+    OS << format("  0x%08x, // ", Idx.LaneMask) << Idx.getName() << '\n';
   }
   OS << " };\n\n";
 
@@ -1100,27 +1205,25 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
     // Compress the sub-reg index lists.
     typedef std::vector<const CodeGenSubRegIndex*> IdxList;
     SmallVector<IdxList, 8> SuperRegIdxLists(RegisterClasses.size());
-    SequenceToOffsetTable<IdxList, CodeGenSubRegIndex::Less> SuperRegIdxSeqs;
+    SequenceToOffsetTable<IdxList, deref<llvm::less>> SuperRegIdxSeqs;
     BitVector MaskBV(RegisterClasses.size());
 
-    for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
-      const CodeGenRegisterClass &RC = *RegisterClasses[rc];
+    for (const auto &RC : RegisterClasses) {
       OS << "static const uint32_t " << RC.getName() << "SubClassMask[] = {\n  ";
       printBitVectorAsHex(OS, RC.getSubClasses(), 32);
 
       // Emit super-reg class masks for any relevant SubRegIndices that can
       // project into RC.
-      IdxList &SRIList = SuperRegIdxLists[rc];
-      for (unsigned sri = 0, sre = SubRegIndices.size(); sri != sre; ++sri) {
-        CodeGenSubRegIndex *Idx = SubRegIndices[sri];
+      IdxList &SRIList = SuperRegIdxLists[RC.EnumValue];
+      for (auto &Idx : SubRegIndices) {
         MaskBV.reset();
-        RC.getSuperRegClasses(Idx, MaskBV);
+        RC.getSuperRegClasses(&Idx, MaskBV);
         if (MaskBV.none())
           continue;
-        SRIList.push_back(Idx);
+        SRIList.push_back(&Idx);
         OS << "\n  ";
         printBitVectorAsHex(OS, MaskBV, 32);
-        OS << "// " << Idx->getName();
+        OS << "// " << Idx.getName();
       }
       SuperRegIdxSeqs.add(SRIList);
       OS << "\n};\n\n";
@@ -1132,8 +1235,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
     OS << "};\n\n";
 
     // Emit NULL terminated super-class lists.
-    for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
-      const CodeGenRegisterClass &RC = *RegisterClasses[rc];
+    for (const auto &RC : RegisterClasses) {
       ArrayRef<CodeGenRegisterClass*> Supers = RC.getSuperClasses();
 
       // Skip classes without supers.  We can reuse NullRegClasses.
@@ -1142,14 +1244,13 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
 
       OS << "static const TargetRegisterClass *const "
          << RC.getName() << "Superclasses[] = {\n";
-      for (unsigned i = 0; i != Supers.size(); ++i)
-        OS << "  &" << Supers[i]->getQualifiedName() << "RegClass,\n";
+      for (const auto *Super : Supers)
+        OS << "  &" << Super->getQualifiedName() << "RegClass,\n";
       OS << "  nullptr\n};\n\n";
     }
 
     // Emit methods.
-    for (unsigned i = 0, e = RegisterClasses.size(); i != e; ++i) {
-      const CodeGenRegisterClass &RC = *RegisterClasses[i];
+    for (const auto &RC : RegisterClasses) {
       if (!RC.AltOrderSelect.empty()) {
         OS << "\nstatic inline unsigned " << RC.getName()
            << "AltOrderSelect(const MachineFunction &MF) {"
@@ -1177,22 +1278,21 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
         OS << ")\n  };\n  const unsigned Select = " << RC.getName()
            << "AltOrderSelect(MF);\n  assert(Select < " << RC.getNumOrders()
            << ");\n  return Order[Select];\n}\n";
-        }
+      }
     }
 
     // Now emit the actual value-initialized register class instances.
-    OS << "namespace " << RegisterClasses[0]->Namespace
+    OS << "\nnamespace " << RegisterClasses.front().Namespace
        << " {   // Register class instances\n";
 
-    for (unsigned i = 0, e = RegisterClasses.size(); i != e; ++i) {
-      const CodeGenRegisterClass &RC = *RegisterClasses[i];
-      OS << "  extern const TargetRegisterClass "
-         << RegisterClasses[i]->getName() << "RegClass = {\n    "
-         << '&' << Target.getName() << "MCRegisterClasses[" << RC.getName()
-         << "RegClassID],\n    "
-         << "VTLists + " << VTSeqs.get(RC.VTs) << ",\n    "
-         << RC.getName() << "SubClassMask,\n    SuperRegIdxSeqs + "
-         << SuperRegIdxSeqs.get(SuperRegIdxLists[i]) << ",\n    ";
+    for (const auto &RC : RegisterClasses) {
+      OS << "  extern const TargetRegisterClass " << RC.getName()
+         << "RegClass = {\n    " << '&' << Target.getName()
+         << "MCRegisterClasses[" << RC.getName() << "RegClassID],\n    "
+         << "VTLists + " << VTSeqs.get(RC.VTs) << ",\n    " << RC.getName()
+         << "SubClassMask,\n    SuperRegIdxSeqs + "
+         << SuperRegIdxSeqs.get(SuperRegIdxLists[RC.EnumValue]) << ",\n    "
+         << format("0x%08x,\n    ", RC.LaneMask);
       if (RC.getSuperClasses().empty())
         OS << "NullRegClasses,\n    ";
       else
@@ -1209,9 +1309,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
 
   OS << "\nnamespace {\n";
   OS << "  const TargetRegisterClass* const RegisterClasses[] = {\n";
-  for (unsigned i = 0, e = RegisterClasses.size(); i != e; ++i)
-    OS << "    &" << RegisterClasses[i]->getQualifiedName()
-       << "RegClass,\n";
+  for (const auto &RC : RegisterClasses)
+    OS << "    &" << RC.getQualifiedName() << "RegClass,\n";
   OS << "  };\n";
   OS << "}\n";       // End of anonymous namespace...
 
@@ -1221,9 +1320,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
      << TargetName << "RegInfoDesc[] = { // Extra Descriptors\n";
   OS << "  { 0, 0 },\n";
 
-  const std::vector<CodeGenRegister*> &Regs = RegBank.getRegisters();
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    const CodeGenRegister &Reg = *Regs[i];
+  const auto &Regs = RegBank.getRegisters();
+  for (const auto &Reg : Regs) {
     OS << "  { ";
     OS << Reg.CostPerUse << ", "
        << int(AllocatableRegs.count(Reg.TheDef)) << " },\n";
@@ -1233,8 +1331,13 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
 
   std::string ClassName = Target.getName() + "GenRegisterInfo";
 
-  if (!SubRegIndices.empty())
+  auto SubRegIndicesSize =
+      std::distance(SubRegIndices.begin(), SubRegIndices.end());
+
+  if (!SubRegIndices.empty()) {
     emitComposeSubRegIndices(OS, RegBank, ClassName);
+    emitComposeSubRegIndexLaneMask(OS, RegBank, ClassName);
+  }
 
   // Emit getSubClassWithSubReg.
   if (!SubRegIndices.empty()) {
@@ -1249,23 +1352,21 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
       OS << "  static const uint16_t Table[";
     else
       PrintFatalError("Too many register classes.");
-    OS << RegisterClasses.size() << "][" << SubRegIndices.size() << "] = {\n";
-    for (unsigned rci = 0, rce = RegisterClasses.size(); rci != rce; ++rci) {
-      const CodeGenRegisterClass &RC = *RegisterClasses[rci];
+    OS << RegisterClasses.size() << "][" << SubRegIndicesSize << "] = {\n";
+    for (const auto &RC : RegisterClasses) {
       OS << "    {\t// " << RC.getName() << "\n";
-      for (unsigned sri = 0, sre = SubRegIndices.size(); sri != sre; ++sri) {
-        CodeGenSubRegIndex *Idx = SubRegIndices[sri];
-        if (CodeGenRegisterClass *SRC = RC.getSubClassWithSubReg(Idx))
-          OS << "      " << SRC->EnumValue + 1 << ",\t// " << Idx->getName()
+      for (auto &Idx : SubRegIndices) {
+        if (CodeGenRegisterClass *SRC = RC.getSubClassWithSubReg(&Idx))
+          OS << "      " << SRC->EnumValue + 1 << ",\t// " << Idx.getName()
              << " -> " << SRC->getName() << "\n";
         else
-          OS << "      0,\t// " << Idx->getName() << "\n";
+          OS << "      0,\t// " << Idx.getName() << "\n";
       }
       OS << "    },\n";
     }
     OS << "  };\n  assert(RC && \"Missing regclass\");\n"
        << "  if (!Idx) return RC;\n  --Idx;\n"
-       << "  assert(Idx < " << SubRegIndices.size() << " && \"Bad subreg\");\n"
+       << "  assert(Idx < " << SubRegIndicesSize << " && \"Bad subreg\");\n"
        << "  unsigned TV = Table[RC->getID()][Idx];\n"
        << "  return TV ? getRegClass(TV - 1) : nullptr;\n}\n\n";
   }
@@ -1275,6 +1376,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
   // Emit the constructor of the class...
   OS << "extern const MCRegisterDesc " << TargetName << "RegDesc[];\n";
   OS << "extern const MCPhysReg " << TargetName << "RegDiffLists[];\n";
+  OS << "extern const unsigned " << TargetName << "LaneMaskLists[];\n";
   OS << "extern const char " << TargetName << "RegStrings[];\n";
   OS << "extern const char " << TargetName << "RegClassStrings[];\n";
   OS << "extern const MCPhysReg " << TargetName << "RegUnitRoots[][2];\n";
@@ -1292,16 +1394,17 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
      << "             SubRegIndexNameTable, SubRegIndexLaneMaskTable, 0x";
   OS.write_hex(RegBank.CoveringLanes);
   OS << ") {\n"
-     << "  InitMCRegisterInfo(" << TargetName << "RegDesc, "
-     << Regs.size()+1 << ", RA, PC,\n                     " << TargetName
+     << "  InitMCRegisterInfo(" << TargetName << "RegDesc, " << Regs.size() + 1
+     << ", RA, PC,\n                     " << TargetName
      << "MCRegisterClasses, " << RegisterClasses.size() << ",\n"
      << "                     " << TargetName << "RegUnitRoots,\n"
      << "                     " << RegBank.getNumNativeRegUnits() << ",\n"
      << "                     " << TargetName << "RegDiffLists,\n"
+     << "                     " << TargetName << "LaneMaskLists,\n"
      << "                     " << TargetName << "RegStrings,\n"
      << "                     " << TargetName << "RegClassStrings,\n"
      << "                     " << TargetName << "SubRegIdxLists,\n"
-     << "                     " << SubRegIndices.size() + 1 << ",\n"
+     << "                     " << SubRegIndicesSize + 1 << ",\n"
      << "                     " << TargetName << "SubRegIdxRanges,\n"
      << "                     " << TargetName << "RegEncodingTable);\n\n";
 
@@ -1345,7 +1448,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
   }
   OS << "\n\n";
 
-  OS << "} // End llvm namespace \n";
+  OS << "} // End llvm namespace\n";
   OS << "#endif // GET_REGINFO_TARGET_DESC\n\n";
 }
 
diff --git a/utils/TableGen/SequenceToOffsetTable.h b/utils/TableGen/SequenceToOffsetTable.h
index b58de4c..66506ea 100644
--- a/utils/TableGen/SequenceToOffsetTable.h
+++ b/utils/TableGen/SequenceToOffsetTable.h
@@ -84,6 +84,11 @@ public:
 
   bool empty() const { return Seqs.empty(); }
 
+  unsigned size() const {
+    assert(Entries && "Call layout() before size()");
+    return Entries;
+  }
+
   /// layout - Computes the final table layout.
   void layout() {
     assert(Entries == 0 && "Can only call layout() once");
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 9f2fc92..d8cf0d1 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -404,7 +404,7 @@ EmitStageAndOperandCycleData(raw_ostream &OS,
     OS << "}\n";
 
     std::vector<Record*> BPs = PI->ItinsDef->getValueAsListOfDefs("BP");
-    if (BPs.size()) {
+    if (!BPs.empty()) {
       OS << "\n// Pipeline forwarding pathes for itineraries \"" << Name
          << "\"\n" << "namespace " << Name << "Bypass {\n";
 
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index bbd61f5..02fe4dc 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -143,9 +143,8 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
     break;
   case PrintEnums:
   {
-    std::vector<Record*> Recs = Records.getAllDerivedDefinitions(Class);
-    for (unsigned i = 0, e = Recs.size(); i != e; ++i)
-      OS << Recs[i]->getName() << ", ";
+    for (Record *Rec : Records.getAllDerivedDefinitions(Class))
+      OS << Rec->getName() << ", ";
     OS << "\n";
     break;
   }
@@ -153,13 +152,12 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   {
     SetTheory Sets;
     Sets.addFieldExpander("Set", "Elements");
-    std::vector<Record*> Recs = Records.getAllDerivedDefinitions("Set");
-    for (unsigned i = 0, e = Recs.size(); i != e; ++i) {
-      OS << Recs[i]->getName() << " = [";
-      const std::vector<Record*> *Elts = Sets.expand(Recs[i]);
+    for (Record *Rec : Records.getAllDerivedDefinitions("Set")) {
+      OS << Rec->getName() << " = [";
+      const std::vector<Record*> *Elts = Sets.expand(Rec);
       assert(Elts && "Couldn't expand Set instance");
-      for (unsigned ei = 0, ee = Elts->size(); ei != ee; ++ei)
-        OS << ' ' << (*Elts)[ei]->getName();
+      for (Record *Elt : *Elts)
+        OS << ' ' << Elt->getName();
       OS << " ]\n";
     }
     break;
diff --git a/utils/TableGen/X86DisassemblerTables.cpp b/utils/TableGen/X86DisassemblerTables.cpp
index d7e981c..fbe5502 100644
--- a/utils/TableGen/X86DisassemblerTables.cpp
+++ b/utils/TableGen/X86DisassemblerTables.cpp
@@ -75,13 +75,13 @@ static inline const char* stringForOperandEncoding(OperandEncoding encoding) {
 /// @return       - True if child is a subset of parent, false otherwise.
 static inline bool inheritsFrom(InstructionContext child,
                                 InstructionContext parent,
-                                bool VEX_LIG = false) {
+                                bool VEX_LIG = false, bool AdSize64 = false) {
   if (child == parent)
     return true;
 
   switch (parent) {
   case IC:
-    return(inheritsFrom(child, IC_64BIT) ||
+    return(inheritsFrom(child, IC_64BIT, AdSize64) ||
            inheritsFrom(child, IC_OPSIZE) ||
            inheritsFrom(child, IC_ADSIZE) ||
            inheritsFrom(child, IC_XD) ||
@@ -89,13 +89,19 @@ static inline bool inheritsFrom(InstructionContext child,
   case IC_64BIT:
     return(inheritsFrom(child, IC_64BIT_REXW)   ||
            inheritsFrom(child, IC_64BIT_OPSIZE) ||
-           inheritsFrom(child, IC_64BIT_ADSIZE) ||
+           (!AdSize64 && inheritsFrom(child, IC_64BIT_ADSIZE)) ||
            inheritsFrom(child, IC_64BIT_XD)     ||
            inheritsFrom(child, IC_64BIT_XS));
   case IC_OPSIZE:
-    return inheritsFrom(child, IC_64BIT_OPSIZE);
+    return inheritsFrom(child, IC_64BIT_OPSIZE) ||
+           inheritsFrom(child, IC_OPSIZE_ADSIZE);
   case IC_ADSIZE:
+    return inheritsFrom(child, IC_OPSIZE_ADSIZE);
+  case IC_OPSIZE_ADSIZE:
+    return false;
   case IC_64BIT_ADSIZE:
+    return inheritsFrom(child, IC_64BIT_OPSIZE_ADSIZE);
+  case IC_64BIT_OPSIZE_ADSIZE:
     return false;
   case IC_XD:
     return inheritsFrom(child, IC_64BIT_XD);
@@ -108,9 +114,12 @@ static inline bool inheritsFrom(InstructionContext child,
   case IC_64BIT_REXW:
     return(inheritsFrom(child, IC_64BIT_REXW_XS) ||
            inheritsFrom(child, IC_64BIT_REXW_XD) ||
-           inheritsFrom(child, IC_64BIT_REXW_OPSIZE));
+           inheritsFrom(child, IC_64BIT_REXW_OPSIZE) ||
+           (!AdSize64 && inheritsFrom(child, IC_64BIT_REXW_ADSIZE)));
   case IC_64BIT_OPSIZE:
-    return(inheritsFrom(child, IC_64BIT_REXW_OPSIZE));
+    return inheritsFrom(child, IC_64BIT_REXW_OPSIZE) ||
+           (!AdSize64 && inheritsFrom(child, IC_64BIT_OPSIZE_ADSIZE)) ||
+           (!AdSize64 && inheritsFrom(child, IC_64BIT_REXW_ADSIZE));
   case IC_64BIT_XD:
     return(inheritsFrom(child, IC_64BIT_REXW_XD));
   case IC_64BIT_XS:
@@ -121,6 +130,7 @@ static inline bool inheritsFrom(InstructionContext child,
   case IC_64BIT_REXW_XD:
   case IC_64BIT_REXW_XS:
   case IC_64BIT_REXW_OPSIZE:
+  case IC_64BIT_REXW_ADSIZE:
     return false;
   case IC_VEX:
     return (VEX_LIG && inheritsFrom(child, IC_VEX_L_W)) ||
@@ -713,6 +723,9 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
     else if ((index & ATTR_64BIT) && (index & ATTR_REXW) &&
              (index & ATTR_OPSIZE))
       o << "IC_64BIT_REXW_OPSIZE";
+    else if ((index & ATTR_64BIT) && (index & ATTR_REXW) &&
+             (index & ATTR_ADSIZE))
+      o << "IC_64BIT_REXW_ADSIZE";
     else if ((index & ATTR_64BIT) && (index & ATTR_XD) && (index & ATTR_OPSIZE))
       o << "IC_64BIT_XD_OPSIZE";
     else if ((index & ATTR_64BIT) && (index & ATTR_XS) && (index & ATTR_OPSIZE))
@@ -721,6 +734,9 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
       o << "IC_64BIT_XS";
     else if ((index & ATTR_64BIT) && (index & ATTR_XD))
       o << "IC_64BIT_XD";
+    else if ((index & ATTR_64BIT) && (index & ATTR_OPSIZE) &&
+             (index & ATTR_ADSIZE))
+      o << "IC_64BIT_OPSIZE_ADSIZE";
     else if ((index & ATTR_64BIT) && (index & ATTR_OPSIZE))
       o << "IC_64BIT_OPSIZE";
     else if ((index & ATTR_64BIT) && (index & ATTR_ADSIZE))
@@ -737,6 +753,8 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
       o << "IC_XS";
     else if (index & ATTR_XD)
       o << "IC_XD";
+    else if ((index & ATTR_OPSIZE) && (index & ATTR_ADSIZE))
+      o << "IC_OPSIZE_ADSIZE";
     else if (index & ATTR_OPSIZE)
       o << "IC_OPSIZE";
     else if (index & ATTR_ADSIZE)
@@ -822,15 +840,6 @@ void DisassemblerTables::setTableFields(ModRMDecision     &decision,
         InstructionSpecifier &previousInfo =
           InstructionSpecifiers[decision.instructionIDs[index]];
 
-        // Instructions such as MOV8ao8 and MOV8ao8_16 differ only in the
-        // presence of the AdSize prefix. However, the disassembler doesn't
-        // care about that difference in the instruction definition; it
-        // handles 16-bit vs. 32-bit addressing for itself based purely
-        // on the 0x67 prefix and the CPU mode. So there's no need to
-        // disambiguate between them; just let them conflict/coexist.
-        if (previousInfo.name + "_16" == newInfo.name)
-          continue;
-
         if(previousInfo.name == "NOOP" && (newInfo.name == "XCHG16ar" ||
                                            newInfo.name == "XCHG32ar" ||
                                            newInfo.name == "XCHG32ar64" ||
@@ -862,15 +871,19 @@ void DisassemblerTables::setTableFields(OpcodeType          type,
                                         const ModRMFilter   &filter,
                                         InstrUID            uid,
                                         bool                is32bit,
-                                        bool                ignoresVEX_L) {
+                                        bool                ignoresVEX_L,
+                                        unsigned            addressSize) {
   ContextDecision &decision = *Tables[type];
 
   for (unsigned index = 0; index < IC_max; ++index) {
-    if (is32bit && inheritsFrom((InstructionContext)index, IC_64BIT))
+    if ((is32bit || addressSize == 16) &&
+        inheritsFrom((InstructionContext)index, IC_64BIT))
       continue;
 
+    bool adSize64 = addressSize == 64;
     if (inheritsFrom((InstructionContext)index,
-                     InstructionSpecifiers[uid].insnContext, ignoresVEX_L))
+                     InstructionSpecifiers[uid].insnContext, ignoresVEX_L,
+                     adSize64))
       setTableFields(decision.opcodeDecisions[index].modRMDecisions[opcode],
                      filter,
                      uid,
diff --git a/utils/TableGen/X86DisassemblerTables.h b/utils/TableGen/X86DisassemblerTables.h
index d86b926..5a8688b 100644
--- a/utils/TableGen/X86DisassemblerTables.h
+++ b/utils/TableGen/X86DisassemblerTables.h
@@ -245,13 +245,15 @@ public:
   /// @param uid          - The unique ID of the instruction.
   /// @param is32bit      - Instructon is only 32-bit
   /// @param ignoresVEX_L - Instruction ignores VEX.L
+  /// @param AddrSize     - Instructions address size 16/32/64. 0 is unspecified
   void setTableFields(OpcodeType type,
                       InstructionContext insnContext,
                       uint8_t opcode,
                       const ModRMFilter &filter,
                       InstrUID uid,
                       bool is32bit,
-                      bool ignoresVEX_L);
+                      bool ignoresVEX_L,
+                      unsigned AddrSize);
 
   /// specForUID - Returns the instruction specifier for a given unique
   ///   instruction ID.  Used when resolving collisions.
diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp
index 9b8092f..5f12965 100644
--- a/utils/TableGen/X86RecognizableInstr.cpp
+++ b/utils/TableGen/X86RecognizableInstr.cpp
@@ -28,54 +28,65 @@ using namespace llvm;
   MAP(C2, 34)           \
   MAP(C3, 35)           \
   MAP(C4, 36)           \
-  MAP(C8, 37)           \
-  MAP(C9, 38)           \
-  MAP(CA, 39)           \
-  MAP(CB, 40)           \
-  MAP(CF, 41)           \
-  MAP(D0, 42)           \
-  MAP(D1, 43)           \
-  MAP(D4, 44)           \
-  MAP(D5, 45)           \
-  MAP(D6, 46)           \
-  MAP(D7, 47)           \
-  MAP(D8, 48)           \
-  MAP(D9, 49)           \
-  MAP(DA, 50)           \
-  MAP(DB, 51)           \
-  MAP(DC, 52)           \
-  MAP(DD, 53)           \
-  MAP(DE, 54)           \
-  MAP(DF, 55)           \
-  MAP(E0, 56)           \
-  MAP(E1, 57)           \
-  MAP(E2, 58)           \
-  MAP(E3, 59)           \
-  MAP(E4, 60)           \
-  MAP(E5, 61)           \
-  MAP(E8, 62)           \
-  MAP(E9, 63)           \
-  MAP(EA, 64)           \
-  MAP(EB, 65)           \
-  MAP(EC, 66)           \
-  MAP(ED, 67)           \
-  MAP(EE, 68)           \
-  MAP(F0, 69)           \
-  MAP(F1, 70)           \
-  MAP(F2, 71)           \
-  MAP(F3, 72)           \
-  MAP(F4, 73)           \
-  MAP(F5, 74)           \
-  MAP(F6, 75)           \
-  MAP(F7, 76)           \
-  MAP(F8, 77)           \
-  MAP(F9, 78)           \
-  MAP(FA, 79)           \
-  MAP(FB, 80)           \
-  MAP(FC, 81)           \
-  MAP(FD, 82)           \
-  MAP(FE, 83)           \
-  MAP(FF, 84)
+  MAP(C5, 37)           \
+  MAP(C6, 38)           \
+  MAP(C7, 39)           \
+  MAP(C8, 40)           \
+  MAP(C9, 41)           \
+  MAP(CA, 42)           \
+  MAP(CB, 43)           \
+  MAP(CC, 44)           \
+  MAP(CD, 45)           \
+  MAP(CE, 46)           \
+  MAP(CF, 47)           \
+  MAP(D0, 48)           \
+  MAP(D1, 49)           \
+  MAP(D2, 50)           \
+  MAP(D3, 51)           \
+  MAP(D4, 52)           \
+  MAP(D5, 53)           \
+  MAP(D6, 54)           \
+  MAP(D7, 55)           \
+  MAP(D8, 56)           \
+  MAP(D9, 57)           \
+  MAP(DA, 58)           \
+  MAP(DB, 59)           \
+  MAP(DC, 60)           \
+  MAP(DD, 61)           \
+  MAP(DE, 62)           \
+  MAP(DF, 63)           \
+  MAP(E0, 64)           \
+  MAP(E1, 65)           \
+  MAP(E2, 66)           \
+  MAP(E3, 67)           \
+  MAP(E4, 68)           \
+  MAP(E5, 69)           \
+  MAP(E6, 70)           \
+  MAP(E7, 71)           \
+  MAP(E8, 72)           \
+  MAP(E9, 73)           \
+  MAP(EA, 74)           \
+  MAP(EB, 75)           \
+  MAP(EC, 76)           \
+  MAP(ED, 77)           \
+  MAP(EE, 78)           \
+  MAP(EF, 79)           \
+  MAP(F0, 80)           \
+  MAP(F1, 81)           \
+  MAP(F2, 82)           \
+  MAP(F3, 83)           \
+  MAP(F4, 84)           \
+  MAP(F5, 85)           \
+  MAP(F6, 86)           \
+  MAP(F7, 87)           \
+  MAP(F8, 88)           \
+  MAP(F9, 89)           \
+  MAP(FA, 90)           \
+  MAP(FB, 91)           \
+  MAP(FC, 92)           \
+  MAP(FD, 93)           \
+  MAP(FE, 94)           \
+  MAP(FF, 95)
 
 // A clone of X86 since we can't depend on something that is generated.
 namespace X86Local {
@@ -119,6 +130,10 @@ namespace X86Local {
   enum {
     OpSize16 = 1, OpSize32 = 2
   };
+
+  enum {
+    AdSize16 = 1, AdSize32 = 2, AdSize64 = 3
+  };
 }
 
 using namespace X86Disassembler;
@@ -194,7 +209,7 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables,
   Encoding = byteFromRec(Rec, "OpEncBits");
 
   OpSize           = byteFromRec(Rec, "OpSizeBits");
-  HasAdSizePrefix  = Rec->getValueAsBit("hasAdSizePrefix");
+  AdSize           = byteFromRec(Rec, "AdSizeBits");
   HasREX_WPrefix   = Rec->getValueAsBit("hasREX_WPrefix");
   HasVEX_4V        = Rec->getValueAsBit("hasVEX_4V");
   HasVEX_4VOp3     = Rec->getValueAsBit("hasVEX_4VOp3");
@@ -401,16 +416,20 @@ InstructionContext RecognizableInstr::insnContext() const {
       errs() << "Instruction does not use a prefix: " << Name << "\n";
       llvm_unreachable("Invalid prefix");
     }
-  } else if (Is64Bit || HasREX_WPrefix) {
+  } else if (Is64Bit || HasREX_WPrefix || AdSize == X86Local::AdSize64) {
     if (HasREX_WPrefix && (OpSize == X86Local::OpSize16 || OpPrefix == X86Local::PD))
       insnContext = IC_64BIT_REXW_OPSIZE;
+    else if (HasREX_WPrefix && AdSize == X86Local::AdSize32)
+      insnContext = IC_64BIT_REXW_ADSIZE;
     else if (OpSize == X86Local::OpSize16 && OpPrefix == X86Local::XD)
       insnContext = IC_64BIT_XD_OPSIZE;
     else if (OpSize == X86Local::OpSize16 && OpPrefix == X86Local::XS)
       insnContext = IC_64BIT_XS_OPSIZE;
+    else if (OpSize == X86Local::OpSize16 && AdSize == X86Local::AdSize32)
+      insnContext = IC_64BIT_OPSIZE_ADSIZE;
     else if (OpSize == X86Local::OpSize16 || OpPrefix == X86Local::PD)
       insnContext = IC_64BIT_OPSIZE;
-    else if (HasAdSizePrefix)
+    else if (AdSize == X86Local::AdSize32)
       insnContext = IC_64BIT_ADSIZE;
     else if (HasREX_WPrefix && OpPrefix == X86Local::XS)
       insnContext = IC_64BIT_REXW_XS;
@@ -429,9 +448,11 @@ InstructionContext RecognizableInstr::insnContext() const {
       insnContext = IC_XD_OPSIZE;
     else if (OpSize == X86Local::OpSize16 && OpPrefix == X86Local::XS)
       insnContext = IC_XS_OPSIZE;
+    else if (OpSize == X86Local::OpSize16 && AdSize == X86Local::AdSize16)
+      insnContext = IC_OPSIZE_ADSIZE;
     else if (OpSize == X86Local::OpSize16 || OpPrefix == X86Local::PD)
       insnContext = IC_OPSIZE;
-    else if (HasAdSizePrefix)
+    else if (AdSize == X86Local::AdSize16)
       insnContext = IC_ADSIZE;
     else if (OpPrefix == X86Local::XD)
       insnContext = IC_XD;
@@ -504,7 +525,7 @@ void RecognizableInstr::emitInstructionSpecifier() {
   assert(numOperands <= X86_MAX_OPERANDS && "X86_MAX_OPERANDS is not large enough");
 
   for (unsigned operandIndex = 0; operandIndex < numOperands; ++operandIndex) {
-    if (OperandList[operandIndex].Constraints.size()) {
+    if (!OperandList[operandIndex].Constraints.empty()) {
       const CGIOperandList::ConstraintInfo &Constraint =
         OperandList[operandIndex].Constraints[0];
       if (Constraint.isTied()) {
@@ -793,9 +814,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
   // Special cases where the LLVM tables are not complete
 
 #define MAP(from, to)                     \
-  case X86Local::MRM_##from:              \
-    filter = new ExactFilter(0x##from);   \
-    break;
+  case X86Local::MRM_##from:
 
   OpcodeType    opcodeType  = (OpcodeType)-1;
 
@@ -844,12 +863,21 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
       filter = new ExtendedFilter(false, Form - X86Local::MRM0m);
       break;
     MRM_MAPPING
+      filter = new ExactFilter(0xC0 + Form - X86Local::MRM_C0);   \
+      break;
     } // switch (Form)
 
     opcodeToSet = Opcode;
     break;
   } // switch (OpMap)
 
+  unsigned AddressSize = 0;
+  switch (AdSize) {
+  case X86Local::AdSize16: AddressSize = 16; break;
+  case X86Local::AdSize32: AddressSize = 32; break;
+  case X86Local::AdSize64: AddressSize = 64; break;
+  }
+
   assert(opcodeType != (OpcodeType)-1 &&
          "Opcode type not set");
   assert(filter && "Filter not set");
@@ -867,13 +895,13 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
                             insnContext(),
                             currentOpcode,
                             *filter,
-                            UID, Is32Bit, IgnoresVEX_L);
+                            UID, Is32Bit, IgnoresVEX_L, AddressSize);
   } else {
     tables.setTableFields(opcodeType,
                           insnContext(),
                           opcodeToSet,
                           *filter,
-                          UID, Is32Bit, IgnoresVEX_L);
+                          UID, Is32Bit, IgnoresVEX_L, AddressSize);
   }
 
   delete filter;
@@ -915,6 +943,8 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
   TYPE("GR64",                TYPE_R64)
   TYPE("i8mem",               TYPE_M8)
   TYPE("i8imm",               TYPE_IMM8)
+  TYPE("u8imm",               TYPE_UIMM8)
+  TYPE("i32u8imm",            TYPE_UIMM8)
   TYPE("GR8",                 TYPE_R8)
   TYPE("VR128",               TYPE_XMM128)
   TYPE("VR128X",              TYPE_XMM128)
@@ -937,17 +967,19 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
   TYPE("i16imm_pcrel",        TYPE_REL16)
   TYPE("i32imm_pcrel",        TYPE_REL32)
   TYPE("SSECC",               TYPE_IMM3)
+  TYPE("XOPCC",               TYPE_IMM3)
   TYPE("AVXCC",               TYPE_IMM5)
+  TYPE("AVX512ICC",           TYPE_AVX512ICC)
   TYPE("AVX512RC",            TYPE_IMM32)
-  TYPE("brtarget",            TYPE_RELv)
-  TYPE("uncondbrtarget",      TYPE_RELv)
+  TYPE("brtarget32",          TYPE_RELv)
+  TYPE("brtarget16",          TYPE_RELv)
   TYPE("brtarget8",           TYPE_REL8)
   TYPE("f80mem",              TYPE_M80FP)
-  TYPE("lea32mem",            TYPE_LEA)
   TYPE("lea64_32mem",         TYPE_LEA)
   TYPE("lea64mem",            TYPE_LEA)
   TYPE("VR64",                TYPE_MM64)
   TYPE("i64imm",              TYPE_IMMv)
+  TYPE("anymem",              TYPE_M)
   TYPE("opaque32mem",         TYPE_M1616)
   TYPE("opaque48mem",         TYPE_M1632)
   TYPE("opaque80mem",         TYPE_M1664)
@@ -963,10 +995,17 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
   TYPE("dstidx16",            TYPE_DSTIDX16)
   TYPE("dstidx32",            TYPE_DSTIDX32)
   TYPE("dstidx64",            TYPE_DSTIDX64)
-  TYPE("offset8",             TYPE_MOFFS8)
-  TYPE("offset16",            TYPE_MOFFS16)
-  TYPE("offset32",            TYPE_MOFFS32)
-  TYPE("offset64",            TYPE_MOFFS64)
+  TYPE("offset16_8",          TYPE_MOFFS8)
+  TYPE("offset16_16",         TYPE_MOFFS16)
+  TYPE("offset16_32",         TYPE_MOFFS32)
+  TYPE("offset32_8",          TYPE_MOFFS8)
+  TYPE("offset32_16",         TYPE_MOFFS16)
+  TYPE("offset32_32",         TYPE_MOFFS32)
+  TYPE("offset32_64",         TYPE_MOFFS64)
+  TYPE("offset64_8",          TYPE_MOFFS8)
+  TYPE("offset64_16",         TYPE_MOFFS16)
+  TYPE("offset64_32",         TYPE_MOFFS32)
+  TYPE("offset64_64",         TYPE_MOFFS64)
   TYPE("VR256",               TYPE_XMM256)
   TYPE("VR256X",              TYPE_XMM256)
   TYPE("VR512",               TYPE_XMM512)
@@ -1010,7 +1049,9 @@ RecognizableInstr::immediateEncodingFromString(const std::string &s,
   }
   ENCODING("i32i8imm",        ENCODING_IB)
   ENCODING("SSECC",           ENCODING_IB)
+  ENCODING("XOPCC",           ENCODING_IB)
   ENCODING("AVXCC",           ENCODING_IB)
+  ENCODING("AVX512ICC",       ENCODING_IB)
   ENCODING("AVX512RC",        ENCODING_IB)
   ENCODING("i16imm",          ENCODING_Iv)
   ENCODING("i16i8imm",        ENCODING_IB)
@@ -1018,6 +1059,8 @@ RecognizableInstr::immediateEncodingFromString(const std::string &s,
   ENCODING("i64i32imm",       ENCODING_ID)
   ENCODING("i64i8imm",        ENCODING_IB)
   ENCODING("i8imm",           ENCODING_IB)
+  ENCODING("u8imm",           ENCODING_IB)
+  ENCODING("i32u8imm",        ENCODING_IB)
   // This is not a typo.  Instructions like BLENDVPD put
   // register IDs in 8-bit immediates nowadays.
   ENCODING("FR32",            ENCODING_IB)
@@ -1153,9 +1196,9 @@ RecognizableInstr::memoryEncodingFromString(const std::string &s,
   ENCODING("i256mem",         ENCODING_RM)
   ENCODING("i512mem",         ENCODING_RM)
   ENCODING("f80mem",          ENCODING_RM)
-  ENCODING("lea32mem",        ENCODING_RM)
   ENCODING("lea64_32mem",     ENCODING_RM)
   ENCODING("lea64mem",        ENCODING_RM)
+  ENCODING("anymem",          ENCODING_RM)
   ENCODING("opaque32mem",     ENCODING_RM)
   ENCODING("opaque48mem",     ENCODING_RM)
   ENCODING("opaque80mem",     ENCODING_RM)
@@ -1186,16 +1229,26 @@ RecognizableInstr::relocationEncodingFromString(const std::string &s,
   ENCODING("i64i32imm",       ENCODING_ID)
   ENCODING("i64i8imm",        ENCODING_IB)
   ENCODING("i8imm",           ENCODING_IB)
+  ENCODING("u8imm",           ENCODING_IB)
+  ENCODING("i32u8imm",        ENCODING_IB)
   ENCODING("i64i32imm_pcrel", ENCODING_ID)
   ENCODING("i16imm_pcrel",    ENCODING_IW)
   ENCODING("i32imm_pcrel",    ENCODING_ID)
-  ENCODING("brtarget",        ENCODING_Iv)
+  ENCODING("brtarget32",      ENCODING_Iv)
+  ENCODING("brtarget16",      ENCODING_Iv)
   ENCODING("brtarget8",       ENCODING_IB)
   ENCODING("i64imm",          ENCODING_IO)
-  ENCODING("offset8",         ENCODING_Ia)
-  ENCODING("offset16",        ENCODING_Ia)
-  ENCODING("offset32",        ENCODING_Ia)
-  ENCODING("offset64",        ENCODING_Ia)
+  ENCODING("offset16_8",      ENCODING_Ia)
+  ENCODING("offset16_16",     ENCODING_Ia)
+  ENCODING("offset16_32",     ENCODING_Ia)
+  ENCODING("offset32_8",      ENCODING_Ia)
+  ENCODING("offset32_16",     ENCODING_Ia)
+  ENCODING("offset32_32",     ENCODING_Ia)
+  ENCODING("offset32_64",     ENCODING_Ia)
+  ENCODING("offset64_8",      ENCODING_Ia)
+  ENCODING("offset64_16",     ENCODING_Ia)
+  ENCODING("offset64_32",     ENCODING_Ia)
+  ENCODING("offset64_64",     ENCODING_Ia)
   ENCODING("srcidx8",         ENCODING_SI)
   ENCODING("srcidx16",        ENCODING_SI)
   ENCODING("srcidx32",        ENCODING_SI)
diff --git a/utils/TableGen/X86RecognizableInstr.h b/utils/TableGen/X86RecognizableInstr.h
index 95d7a40..28e1053 100644
--- a/utils/TableGen/X86RecognizableInstr.h
+++ b/utils/TableGen/X86RecognizableInstr.h
@@ -50,8 +50,8 @@ private:
   uint8_t Encoding;
   /// The OpSize field from the record
   uint8_t OpSize;
-  /// The hasAdSizePrefix field from the record
-  bool HasAdSizePrefix;
+  /// The AdSize field from the record
+  uint8_t AdSize;
   /// The hasREX_WPrefix field from the record
   bool HasREX_WPrefix;
   /// The hasVEX_4V field from the record
diff --git a/utils/emacs/emacs.el b/utils/emacs/emacs.el
index 969f538..4a3a503 100644
--- a/utils/emacs/emacs.el
+++ b/utils/emacs/emacs.el
@@ -1,39 +1,22 @@
 ;; LLVM coding style guidelines in emacs
 ;; Maintainer: LLVM Team, http://llvm.org/
-;; Modified:   2009-07-28
 
-;; Max 80 cols per line, indent by two spaces, no tabs.
-;; Apparently, this does not affect tabs in Makefiles.
-(custom-set-variables
-  '(fill-column 80)
-  '(c++-indent-level 2)
-  '(c-basic-offset 2)
-  '(indent-tabs-mode nil))
-
-
-;; Alternative to setting the global style.  Only files with "llvm" in
-;; their names will automatically set to the llvm.org coding style.
+;; Add a cc-mode style for editing LLVM C and C++ code
 (c-add-style "llvm.org"
-             '((fill-column . 80)
+             '("gnu"
+	       (fill-column . 80)
 	       (c++-indent-level . 2)
 	       (c-basic-offset . 2)
 	       (indent-tabs-mode . nil)
-               (c-offsets-alist . ((innamespace 0)))))
-
-(add-hook 'c-mode-hook
-	  (function
-	   (lambda nil 
-	     (if (string-match "llvm" buffer-file-name)
-		 (progn
-		   (c-set-style "llvm.org")
-		   )
-	       ))))
+	       (c-offsets-alist . ((arglist-intro . ++)
+				   (innamespace . 0)
+				   (member-init-intro . ++)))))
 
-(add-hook 'c++-mode-hook
+;; Files with "llvm" in their names will automatically be set to the
+;; llvm.org coding style.
+(add-hook 'c-mode-common-hook
 	  (function
 	   (lambda nil 
 	     (if (string-match "llvm" buffer-file-name)
 		 (progn
-		   (c-set-style "llvm.org")
-		   )
-	       ))))
+		   (c-set-style "llvm.org"))))))
diff --git a/utils/emacs/llvm-mode.el b/utils/emacs/llvm-mode.el
index 61127b6..5fb1eb3 100644
--- a/utils/emacs/llvm-mode.el
+++ b/utils/emacs/llvm-mode.el
@@ -1,8 +1,13 @@
+;;; llvm-mode.el --- Major mode for the LLVM assembler language.
+
 ;; Maintainer:  The LLVM team, http://llvm.org/
-;; Description: Major mode for the LLVM assembler language.
-;; Updated:     2007-09-19
 
-;; Create mode-specific tables.
+;;; Commentary:
+
+;; Major mode for editing LLVM IR files.
+
+;;; Code:
+
 (defvar llvm-mode-syntax-table nil
   "Syntax table used while in LLVM mode.")
 (defvar llvm-font-lock-keywords
@@ -18,7 +23,7 @@
    ;; Unnamed variable slots
    '("%[-]?[0-9]+" . font-lock-variable-name-face)
    ;; Types
-   `(,(regexp-opt '("void" "i[0-9]+" "float" "double" "type" "label" "opaque") 'words) . font-lock-type-face)
+   `(,(regexp-opt '("void" "i1" "i8" "i16" "i32" "i64" "i128" "float" "double" "type" "label" "opaque") 'symbol) . font-lock-type-face)
    ;; Integer literals
    '("\\b[-]?[0-9]+\\b" . font-lock-preprocessor-face)
    ;; Floating point constants
@@ -30,29 +35,31 @@
                     "define" "global" "constant" "const" "internal" "linkonce" "linkonce_odr"
                     "weak" "weak_odr" "appending" "uninitialized" "implementation" "..."
                     "null" "undef" "to" "except" "not" "target" "endian" "little" "big"
-                    "pointersize" "volatile" "fastcc" "coldcc" "cc") 'words) . font-lock-keyword-face)
+                    "pointersize" "volatile" "fastcc" "coldcc" "cc" "personality") 'symbols) . font-lock-keyword-face)
    ;; Arithmetic and Logical Operators
    `(,(regexp-opt '("add" "sub" "mul" "sdiv" "udiv" "urem" "srem" "and" "or" "xor"
-                    "setne" "seteq" "setlt" "setgt" "setle" "setge") 'words) . font-lock-keyword-face)
+                    "setne" "seteq" "setlt" "setgt" "setle" "setge") 'symbols) . font-lock-keyword-face)
    ;; Floating-point operators
-   `(,(regexp-opt '("fadd" "fsub" "fmul" "fdiv" "frem") 'words) . font-lock-keyword-face)
+   `(,(regexp-opt '("fadd" "fsub" "fmul" "fdiv" "frem") 'symbols) . font-lock-keyword-face)
    ;; Special instructions
-   `(,(regexp-opt '("phi" "tail" "call" "select" "to" "shl" "lshr" "ashr" "fcmp" "icmp" "va_arg" "landingpad") 'words) . font-lock-keyword-face)
+   `(,(regexp-opt '("phi" "tail" "call" "select" "to" "shl" "lshr" "ashr" "fcmp" "icmp" "va_arg" "landingpad") 'symbols) . font-lock-keyword-face)
    ;; Control instructions
-   `(,(regexp-opt '("ret" "br" "switch" "invoke" "resume" "unwind" "unreachable" "indirectbr") 'words) . font-lock-keyword-face)
+   `(,(regexp-opt '("ret" "br" "switch" "invoke" "resume" "unwind" "unreachable" "indirectbr") 'symbols) . font-lock-keyword-face)
    ;; Memory operators
-   `(,(regexp-opt '("malloc" "alloca" "free" "load" "store" "getelementptr" "fence" "cmpxchg" "atomicrmw") 'words) . font-lock-keyword-face)
+   `(,(regexp-opt '("malloc" "alloca" "free" "load" "store" "getelementptr" "fence" "cmpxchg" "atomicrmw") 'symbols) . font-lock-keyword-face)
    ;; Casts
-   `(,(regexp-opt '("bitcast" "inttoptr" "ptrtoint" "trunc" "zext" "sext" "fptrunc" "fpext" "fptoui" "fptosi" "uitofp" "sitofp" "addrspacecast") 'words) . font-lock-keyword-face)
+   `(,(regexp-opt '("bitcast" "inttoptr" "ptrtoint" "trunc" "zext" "sext" "fptrunc" "fpext" "fptoui" "fptosi" "uitofp" "sitofp" "addrspacecast") 'symbols) . font-lock-keyword-face)
    ;; Vector ops
-   `(,(regexp-opt '("extractelement" "insertelement" "shufflevector") 'words) . font-lock-keyword-face)
+   `(,(regexp-opt '("extractelement" "insertelement" "shufflevector") 'symbols) . font-lock-keyword-face)
    ;; Aggregate ops
-   `(,(regexp-opt '("extractvalue" "insertvalue") 'words) . font-lock-keyword-face)
+   `(,(regexp-opt '("extractvalue" "insertvalue") 'symbols) . font-lock-keyword-face)
+   ;; Metadata types
+   `(,(regexp-opt '("distinct") 'symbols) . font-lock-keyword-face)
    ;; Use-list order directives
-   `(,(regexp-opt '("uselistorder" "uselistorder_bb") 'words) . font-lock-keyword-face)
+   `(,(regexp-opt '("uselistorder" "uselistorder_bb") 'symbols) . font-lock-keyword-face)
 
    )
-  "Syntax highlighting for LLVM"
+  "Syntax highlighting for LLVM."
   )
 
 ;; ---------------------- Syntax table ---------------------------
@@ -62,40 +69,33 @@
 (if (not llvm-mode-syntax-table)
     (progn
       (setq llvm-mode-syntax-table (make-syntax-table))
-      (mapcar (function (lambda (n)
-                          (modify-syntax-entry (aref n 0)
-                                               (aref n 1)
-                                               llvm-mode-syntax-table)))
-              '(
-                ;; whitespace (` ')
-                [?\^m " "]
-                [?\f  " "]
-                [?\n  " "]
-                [?\t  " "]
-                [?\   " "]
-                ;; word constituents (`w')
-                ;;[?<  "w"]
-                ;;[?>  "w"]
-                [?\%  "w"]
-                ;;[?_  "w  "]
-                ;; comments
-                [?\;  "< "]
-                [?\n  "> "]
-                ;;[?\r  "> "]
-                ;;[?\^m "> "]
-                ;; symbol constituents (`_')
-                ;; punctuation (`.')
-                ;; open paren (`(')
-                [?\( "("]
-                [?\[ "("]
-                [?\{ "("]
-                ;; close paren (`)')
-                [?\) ")"]
-                [?\] ")"]
-                [?\} ")"]
-                ;; string quote ('"')
-                [?\" "\""]
-                ))))
+      (mapc (function (lambda (n)
+                        (modify-syntax-entry (aref n 0)
+                                             (aref n 1)
+                                             llvm-mode-syntax-table)))
+            '(
+              ;; whitespace (` ')
+              [?\^m " "]
+              [?\f  " "]
+              [?\n  " "]
+              [?\t  " "]
+              [?\   " "]
+              ;; word constituents (`w')
+              ;;[?<  "w"]
+              ;;[?>  "w"]
+              [?%  "w"]
+              ;;[?_  "w  "]
+              ;; comments
+              [?\;  "< "]
+              [?\n  "> "]
+              ;;[?\r  "> "]
+              ;;[?\^m "> "]
+              ;; symbol constituents (`_')
+              ;; punctuation (`.')
+              ;; open paren (`(')
+              ;; close paren (`)')
+              ;; string quote ('"')
+              [?\" "\""]))))
 
 ;; --------------------- Abbrev table -----------------------------
 
@@ -113,11 +113,11 @@
   (define-key llvm-mode-map "\es" 'center-line)
   (define-key llvm-mode-map "\eS" 'center-paragraph))
 
-
+;;;###autoload
 (defun llvm-mode ()
   "Major mode for editing LLVM source files.
-  \\{llvm-mode-map}
-  Runs llvm-mode-hook on startup."
+\\{llvm-mode-map}
+  Runs `llvm-mode-hook' on startup."
   (interactive)
   (kill-all-local-variables)
   (use-local-map llvm-mode-map)         ; Provides the local keymap.
@@ -136,8 +136,9 @@
                                         ;   customize the mode with a hook.
 
 ;; Associate .ll files with llvm-mode
-(setq auto-mode-alist
-   (append '(("\\.ll$" . llvm-mode)) auto-mode-alist))
+;;;###autoload
+(add-to-list 'auto-mode-alist (cons (purecopy "\\.ll\\'")  'llvm-mode))
 
 (provide 'llvm-mode)
-;; end of llvm-mode.el
+
+;;; llvm-mode.el ends here
diff --git a/utils/emacs/tablegen-mode.el b/utils/emacs/tablegen-mode.el
index c0ae751..035455d 100644
--- a/utils/emacs/tablegen-mode.el
+++ b/utils/emacs/tablegen-mode.el
@@ -1,12 +1,17 @@
+;;; tablegen-mode.el --- Major mode for TableGen description files (part of LLVM project)
+
 ;; Maintainer:  The LLVM team, http://llvm.org/
-;; Description: Major mode for TableGen description files (part of LLVM project)
-;; Updated:     2007-12-18
+
+;;; Commentary:
+;; A major mode for TableGen description files in LLVM.
 
 (require 'comint)
 (require 'custom)
 (require 'ansi-color)
 
 ;; Create mode-specific tables.
+;;; Code:
+
 (defvar td-decorators-face 'td-decorators-face
   "Face method decorators.")
 (make-face 'td-decorators-face)
@@ -93,10 +98,11 @@
   (define-key tablegen-mode-map "\es" 'center-line)
   (define-key tablegen-mode-map "\eS" 'center-paragraph))
 
+;;;###autoload
 (defun tablegen-mode ()
   "Major mode for editing TableGen description files.
-  \\{tablegen-mode-map}
-  Runs tablegen-mode-hook on startup."
+\\{tablegen-mode-map}
+  Runs `tablegen-mode-hook' on startup."
   (interactive)
   (kill-all-local-variables)
   (use-local-map tablegen-mode-map)      ; Provides the local keymap.
@@ -117,7 +123,9 @@
                                          ;   customize the mode with a hook.
 
 ;; Associate .td files with tablegen-mode
-(setq auto-mode-alist (append '(("\\.td$" . tablegen-mode)) auto-mode-alist))
+;;;###autoload
+(add-to-list 'auto-mode-alist (cons (purecopy "\\.td\\'")  'tablegen-mode))
 
 (provide 'tablegen-mode)
-;; end of tablegen-mode.el
+
+;;; tablegen-mode.el ends here
diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py
index b0dde5d..b818380 100644
--- a/utils/lit/lit/LitConfig.py
+++ b/utils/lit/lit/LitConfig.py
@@ -76,7 +76,6 @@ class LitConfig:
             self.bashPath = lit.util.which('bash')
 
         if self.bashPath is None:
-            self.warning("Unable to find 'bash'.")
             self.bashPath = ''
 
         return self.bashPath
@@ -91,7 +90,6 @@ class LitConfig:
         # bash
         self.bashPath = lit.util.which('bash', dir)
         if self.bashPath is None:
-            self.note("Unable to find 'bash.exe'.")
             self.bashPath = ''
 
         return dir
diff --git a/utils/lit/lit/Test.py b/utils/lit/lit/Test.py
index b810230..682d04f 100644
--- a/utils/lit/lit/Test.py
+++ b/utils/lit/lit/Test.py
@@ -91,7 +91,8 @@ class JSONMetricValue(MetricValue):
         self.value = value
 
     def format(self):
-        return str(self.value)
+        e = JSONEncoder(indent=2, sort_keys=True)
+        return e.encode(self.value)
 
     def todata(self):
         return self.value
@@ -252,4 +253,4 @@ class Test:
             xml += "\n\t</failure>\n</testcase>"
         else:
             xml += "/>"
-        return xml
-\ No newline at end of file
+        return xml
diff --git a/utils/lit/lit/TestingConfig.py b/utils/lit/lit/TestingConfig.py
index 52cebbf..c7ef94d 100644
--- a/utils/lit/lit/TestingConfig.py
+++ b/utils/lit/lit/TestingConfig.py
@@ -26,7 +26,11 @@ class TestingConfig:
                      'LD_PRELOAD', 'ASAN_OPTIONS', 'UBSAN_OPTIONS',
                      'LSAN_OPTIONS']
         for var in pass_vars:
-            environment[var] = os.environ.get(var, '')
+            val = os.environ.get(var, '')
+            # Check for empty string as some variables such as LD_PRELOAD cannot be empty
+            # ('') for OS's such as OpenBSD.
+            if val:
+                environment[var] = val
 
         if sys.platform == 'win32':
             environment.update({
diff --git a/utils/lit/lit/formats/googletest.py b/utils/lit/lit/formats/googletest.py
index 1b5b785..59ac3c5 100644
--- a/utils/lit/lit/formats/googletest.py
+++ b/utils/lit/lit/formats/googletest.py
@@ -31,7 +31,6 @@ class GoogleTest(TestFormat):
         try:
             lines = lit.util.capture([path, '--gtest_list_tests'],
                                      env=localConfig.environment)
-            lines = lines.decode('utf-8')
             if kIsWindows:
               lines = lines.replace('\r', '')
             lines = lines.split('\n')
diff --git a/utils/lit/lit/util.py b/utils/lit/lit/util.py
index ca1aeb6..08f7b71 100644
--- a/utils/lit/lit/util.py
+++ b/utils/lit/lit/util.py
@@ -16,6 +16,12 @@ def to_string(bytes):
         return bytes
     return to_bytes(bytes)
 
+def convert_string(bytes):
+    try:
+        return to_string(bytes.decode('utf-8'))
+    except UnicodeError:
+        return str(bytes)
+
 def detectCPUs():
     """
     Detects the number of CPUs on a system. Cribbed from pp.
@@ -60,7 +66,7 @@ def capture(args, env=None):
     p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                          env=env)
     out,_ = p.communicate()
-    return out
+    return convert_string(out)
 
 def which(command, paths = None):
     """which(command, [paths]) - Look up the given command in the paths string
@@ -166,14 +172,8 @@ def executeCommand(command, cwd=None, env=None):
         raise KeyboardInterrupt
 
     # Ensure the resulting output is always of string type.
-    try:
-        out = to_string(out.decode('utf-8'))
-    except:
-        out = str(out)
-    try:
-        err = to_string(err.decode('utf-8'))
-    except:
-        err = str(err)
+    out = convert_string(out)
+    err = convert_string(err)
 
     return out, err, exitCode
 
diff --git a/utils/lit/utils/check-coverage b/utils/lit/utils/check-coverage
index 128e827..cded7a2 100755
--- a/utils/lit/utils/check-coverage
+++ b/utils/lit/utils/check-coverage
@@ -23,7 +23,7 @@ fi
 # sitecustomize.
 if ! python -c \
       'import sitecustomize, sys; sys.exit("coverage" not in dir(sitecustomize))' \
-      &> /dev/null; then
+      >/dev/null 2>&1; then
     printf 1>&2 "error: active python does not appear to enable coverage in its 'sitecustomize.py'\n"
     exit 1
 fi
diff --git a/utils/lldbDataFormatters.py b/utils/lldbDataFormatters.py
index f570fb4..687729f 100644
--- a/utils/lldbDataFormatters.py
+++ b/utils/lldbDataFormatters.py
@@ -15,6 +15,9 @@ def __lldb_init_module(debugger, internal_dict):
     debugger.HandleCommand('type synthetic add -w llvm '
                            '-l lldbDataFormatters.ArrayRefSynthProvider '
                            '-x "^llvm::ArrayRef<.+>$"')
+    debugger.HandleCommand('type summary add -w llvm '
+                           '-F lldbDataFormatters.OptionalSummaryProvider '
+                           '-x "^llvm::Optional<.+>$"')
 
 # Pretty printer for llvm::SmallVector/llvm::SmallVectorImpl
 class SmallVectorSynthProvider:
@@ -86,3 +89,10 @@ class ArrayRefSynthProvider:
         self.data_type = self.data.GetType().GetPointeeType()
         self.type_size = self.data_type.GetByteSize()
         assert self.type_size != 0
+
+def OptionalSummaryProvider(valobj, internal_dict):
+    if not valobj.GetChildMemberWithName('hasVal').GetValueAsUnsigned(0):
+        return 'None'
+    underlying_type = valobj.GetType().GetTemplateArgumentType(0)
+    storage = valobj.GetChildMemberWithName('storage')
+    return str(storage.Cast(underlying_type))
diff --git a/utils/not/not.cpp b/utils/not/not.cpp
index 2adeded..23062fb 100644
--- a/utils/not/not.cpp
+++ b/utils/not/not.cpp
@@ -6,6 +6,11 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+// Usage:
+//   not cmd
+//     Will return true if cmd doesn't crash and returns false.
+//   not --crash cmd
+//     Will return true if cmd crashes (e.g. for testing crash reporting).
 
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Program.h"
diff --git a/utils/release/merge.sh b/utils/release/merge.sh
index 2cf39b2..949c298 100755
--- a/utils/release/merge.sh
+++ b/utils/release/merge.sh
@@ -66,9 +66,11 @@ svn log -c $rev http://llvm.org/svn/llvm-project/$proj/trunk >> $tempfile 2>&1
 cd $proj.src
 echo "# Updating tree"
 svn up
-echo "# Merging r$rev into $proj"
+echo "# Merging r$rev into $proj locally"
 svn merge -c $rev https://llvm.org/svn/llvm-project/$proj/trunk . || exit 1
-echo "# Committing changes"
-svn commit -F $tempfile || exit 1
-rm -f $tempfile
+
+echo
+echo "# To commit the merge, run the following in $proj.src/:"
+echo svn commit -F $tempfile
+
 exit 0
diff --git a/utils/release/tag.sh b/utils/release/tag.sh
index 390acaf..503413b 100755
--- a/utils/release/tag.sh
+++ b/utils/release/tag.sh
@@ -18,17 +18,21 @@ release=""
 rc=""
 rebranch="no"
 projects="llvm cfe dragonegg test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp"
+dryrun=""
+revision="HEAD"
 
 base_url="https://llvm.org/svn/llvm-project"
 
 function usage() {
-    echo "usage: `basename $0` -release <num> [-rebranch]"
-    echo "usage: `basename $0` -release <num> -rc <num>"
+    echo "usage: `basename $0` -release <num> [-rebranch] [-revision <num>] [-dry-run]"
+    echo "usage: `basename $0` -release <num> -rc <num> [-dry-run]"
     echo " "
-    echo "  -release <num>  The version number of the release"
-    echo "  -rc <num>       The release candidate number"
-    echo "  -rebranch       Remove existing branch, if present, before branching"
-    echo "  -final          Tag final release candidate"
+    echo "  -release <num>   The version number of the release"
+    echo "  -rc <num>        The release candidate number"
+    echo "  -rebranch        Remove existing branch, if present, before branching"
+    echo "  -final           Tag final release candidate"
+    echo "  -revision <num>  Revision to branch off (default: HEAD)"
+    echo "  -dry-run         Make no changes to the repository, just print the commands"
 }
 
 function tag_version() {
@@ -38,10 +42,11 @@ function tag_version() {
             if [ $rebranch = "no" ]; then
                 continue
             fi
-            svn remove -m "Removing old release_$branch_release branch for rebranching." \
+            ${dryrun} svn remove -m "Removing old release_$branch_release branch for rebranching." \
                 $base_url/$proj/branches/release_$branch_release
         fi
-        svn copy -m "Creating release_$branch_release branch" \
+        ${dryrun} svn copy -m "Creating release_$branch_release branch off revision ${revision}" \
+            -r ${revision} \
             $base_url/$proj/trunk \
             $base_url/$proj/branches/release_$branch_release
     done
@@ -52,10 +57,10 @@ function tag_release_candidate() {
     set -x
     for proj in $projects ; do
         if ! svn ls $base_url/$proj/tags/RELEASE_$tag_release > /dev/null 2>&1 ; then
-            svn mkdir -m "Creating release directory for release_$tag_release." $base_url/$proj/tags/RELEASE_$tag_release
+            ${dryrun} svn mkdir -m "Creating release directory for release_$tag_release." $base_url/$proj/tags/RELEASE_$tag_release
         fi
         if ! svn ls $base_url/$proj/tags/RELEASE_$tag_release/$rc > /dev/null 2>&1 ; then
-            svn copy -m "Creating release candidate $rc from release_$tag_release branch" \
+            ${dryrun} svn copy -m "Creating release candidate $rc from release_$tag_release branch" \
                 $base_url/$proj/branches/release_$branch_release \
                 $base_url/$proj/tags/RELEASE_$tag_release/$rc
         fi
@@ -79,6 +84,13 @@ while [ $# -gt 0 ]; do
         -final | --final )
             rc="final"
             ;;
+        -revision | --revision )
+            shift
+            revision="$1"
+            ;;
+        -dry-run | --dry-run )
+            dryrun="echo"
+            ;;
         -h | --help | -help )
             usage
             exit 0
@@ -105,6 +117,13 @@ tag_release=`echo $release | sed -e 's,\.,,g'`
 if [ "x$rc" = "x" ]; then
     tag_version
 else
+    if [ "$revision" != "HEAD" ]; then
+        echo "error: cannot use -revision with -rc"
+        echo
+        usage
+        exit 1
+    fi
+
     tag_release_candidate
 fi
 
diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh
index b028924..20f8d97 100755
--- a/utils/release/test-release.sh
+++ b/utils/release/test-release.sh
@@ -39,27 +39,30 @@ do_debug="no"
 do_asserts="no"
 do_compare="yes"
 BuildDir="`pwd`"
+BuildTriple=""
 
 function usage() {
     echo "usage: `basename $0` -release X.Y -rc NUM [OPTIONS]"
     echo ""
-    echo " -release X.Y      The release number to test."
-    echo " -rc NUM           The pre-release candidate number."
-    echo " -final            The final release candidate."
-    echo " -triple TRIPLE    The target triple for this machine."
-    echo " -j NUM            Number of compile jobs to run. [default: 3]"
-    echo " -build-dir DIR    Directory to perform testing in. [default: pwd]"
-    echo " -no-checkout      Don't checkout the sources from SVN."
-    echo " -no-64bit         Don't test the 64-bit version. [default: yes]"
-    echo " -enable-ada       Build Ada. [default: disable]"
-    echo " -disable-clang    Do not test clang. [default: enable]"
-    echo " -enable-dragonegg Test dragonegg. [default: disable]"
-    echo " -enable-fortran   Enable Fortran build. [default: disable]"
-    echo " -disable-objc     Disable ObjC build. [default: enable]"
-    echo " -test-debug       Test the debug build. [default: no]"
-    echo " -test-asserts     Test with asserts on. [default: no]"
-    echo " -no-compare-files Don't test that phase 2 and 3 files are identical."
-    echo " -use-gzip         Use gzip instead of xz."
+    echo " -release X.Y         The release number to test."
+    echo " -rc NUM              The pre-release candidate number."
+    echo " -final               The final release candidate."
+    echo " -triple TRIPLE       The target triple for this machine."
+    echo " -j NUM               Number of compile jobs to run. [default: 3]"
+    echo " -build-dir DIR       Directory to perform testing in. [default: pwd]"
+    echo " -no-checkout         Don't checkout the sources from SVN."
+    echo " -no-64bit            Don't test the 64-bit version. [default: yes]"
+    echo " -enable-ada          Build Ada. [default: disable]"
+    echo " -disable-clang       Do not test clang. [default: enable]"
+    echo " -enable-dragonegg    Test dragonegg. [default: disable]"
+    echo " -enable-fortran      Enable Fortran build. [default: disable]"
+    echo " -disable-objc        Disable ObjC build. [default: enable]"
+    echo " -test-debug          Test the debug build. [default: no]"
+    echo " -test-asserts        Test with asserts on. [default: no]"
+    echo " -no-compare-files    Don't test that phase 2 and 3 files are identical."
+    echo " -use-gzip            Use gzip instead of xz."
+    echo " -build-triple TRIPLE The build triple for this machine"
+    echo "                      [default: use config.guess]"
 }
 
 while [ $# -gt 0 ]; do
@@ -80,6 +83,10 @@ while [ $# -gt 0 ]; do
             shift
             Triple="$1"
             ;;
+        -build-triple | --build-triple )
+            shift
+            BuildTriple="$1"
+            ;;
         -j* )
             NumJobs="`echo $1 | sed -e 's,-j\([0-9]*\),\1,g'`"
             if [ -z "$NumJobs" ]; then
@@ -295,16 +302,21 @@ function configure_llvmCore() {
     echo "# Using C compiler: $c_compiler"
     echo "# Using C++ compiler: $cxx_compiler"
 
+    build_triple_option="${BuildTriple:+--build=$BuildTriple}"
+
     cd $ObjDir
     echo "# Configuring llvm $Release-$RC $Flavor"
     echo "# $BuildDir/llvm.src/configure --prefix=$InstallDir \
         --enable-optimized=$Optimized \
-        --enable-assertions=$Assertions"
+        --enable-assertions=$Assertions \
+        --disable-timestamps \
+        $build_triple_option"
     env CC="$c_compiler" CXX="$cxx_compiler" \
     $BuildDir/llvm.src/configure --prefix=$InstallDir \
         --enable-optimized=$Optimized \
         --enable-assertions=$Assertions \
         --disable-timestamps \
+        $build_triple_option \
         2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log
     cd $BuildDir
 }
diff --git a/utils/shuffle_fuzz.py b/utils/shuffle_fuzz.py
index 384a93a..985d1da 100755
--- a/utils/shuffle_fuzz.py
+++ b/utils/shuffle_fuzz.py
@@ -173,7 +173,7 @@ entry:""" % dict(subst,
   # Generate some string constants that we can use to report errors.
   for i, r in enumerate(result):
     if r != -1:
-      s = ('FAIL(%(seed)s): lane %(lane)d, expected %(result)d, found %%d\\0A' %
+      s = ('FAIL(%(seed)s): lane %(lane)d, expected %(result)d, found %%d\n\\0A' %
            {'seed': args.seed, 'lane': i, 'result': r})
       s += ''.join(['\\00' for _ in itertools.repeat(None, 128 - len(s) + 2)])
       print """
@@ -235,8 +235,7 @@ die.%(i)d:
   %%bad.%(i)d = trunc i2048 %%tmp.%(i)d to i32
   call i32 (i8*, i8*, ...)* @sprintf(i8* %%str.ptr, i8* getelementptr inbounds ([128 x i8]* @error.%(i)d, i32 0, i32 0), i32 %%bad.%(i)d)
   %%length.%(i)d = call i32 @strlen(i8* %%str.ptr)
-  %%size.%(i)d = add i32 %%length.%(i)d, 1
-  call i32 @write(i32 2, i8* %%str.ptr, i32 %%size.%(i)d)
+  call i32 @write(i32 2, i8* %%str.ptr, i32 %%length.%(i)d)
   call void @llvm.trap()
   unreachable
 """ % dict(subst, i=i, next_i=i + 1, r=r)
diff --git a/utils/unittest/CMakeLists.txt b/utils/unittest/CMakeLists.txt
index b6d2d6d..b34e22a 100644
--- a/utils/unittest/CMakeLists.txt
+++ b/utils/unittest/CMakeLists.txt
@@ -32,17 +32,20 @@ if (NOT LLVM_ENABLE_THREADS)
   add_definitions( -DGTEST_HAS_PTHREAD=0 )
 endif()
 
-# Visual Studio 2012 only supports up to 8 template parameters in
-# std::tr1::tuple by default, but gtest requires 10
-if(MSVC AND MSVC_VERSION EQUAL 1700)
-  add_definitions(-D_VARIADIC_MAX=10)
-endif ()
+set(LIBS
+  LLVMSupport # Depends on llvm::raw_ostream
+)
+
+find_library(PTHREAD_LIBRARY_PATH pthread)
+if (PTHREAD_LIBRARY_PATH)
+  list(APPEND LIBS pthread)
+endif()
 
 add_llvm_library(gtest
   googletest/src/gtest-all.cc
 
   LINK_LIBS
-  LLVMSupport # Depends on llvm::raw_ostream
-  )
+  ${LIBS}
+)
 
 add_subdirectory(UnitTestMain)
diff --git a/utils/update_llc_test_checks.py b/utils/update_llc_test_checks.py
new file mode 100755
index 0000000..df01d89
--- /dev/null
+++ b/utils/update_llc_test_checks.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python2.7
+
+"""A test case update script.
+
+This script is a utility to update LLVM X86 'llc' based test cases with new
+FileCheck patterns. It can either update all of the tests in the file or
+a single test function.
+"""
+
+import argparse
+import itertools
+import string
+import subprocess
+import sys
+import tempfile
+import re
+
+
+def llc(args, cmd_args, ir):
+  with open(ir) as ir_file:
+    stdout = subprocess.check_output(args.llc_binary + ' ' + cmd_args,
+                                     shell=True, stdin=ir_file)
+  return stdout
+
+
+ASM_SCRUB_WHITESPACE_RE = re.compile(r'(?!^(|  \w))[ \t]+', flags=re.M)
+ASM_SCRUB_TRAILING_WHITESPACE_RE = re.compile(r'[ \t]+$', flags=re.M)
+ASM_SCRUB_SHUFFLES_RE = (
+    re.compile(
+        r'^(\s*\w+) [^#\n]+#+ ((?:[xyz]mm\d+|mem) = .*)$',
+        flags=re.M))
+ASM_SCRUB_SP_RE = re.compile(r'\d+\(%(esp|rsp)\)')
+ASM_SCRUB_RIP_RE = re.compile(r'[.\w]+\(%rip\)')
+ASM_SCRUB_KILL_COMMENT_RE = re.compile(r'^ *#+ +kill:.*\n')
+
+
+def scrub_asm(asm):
+  # Scrub runs of whitespace out of the assembly, but leave the leading
+  # whitespace in place.
+  asm = ASM_SCRUB_WHITESPACE_RE.sub(r' ', asm)
+  # Expand the tabs used for indentation.
+  asm = string.expandtabs(asm, 2)
+  # Detect shuffle asm comments and hide the operands in favor of the comments.
+  asm = ASM_SCRUB_SHUFFLES_RE.sub(r'\1 {{.*#+}} \2', asm)
+  # Generically match the stack offset of a memory operand.
+  asm = ASM_SCRUB_SP_RE.sub(r'{{[0-9]+}}(%\1)', asm)
+  # Generically match a RIP-relative memory operand.
+  asm = ASM_SCRUB_RIP_RE.sub(r'{{.*}}(%rip)', asm)
+  # Strip kill operands inserted into the asm.
+  asm = ASM_SCRUB_KILL_COMMENT_RE.sub('', asm)
+  # Strip trailing whitespace.
+  asm = ASM_SCRUB_TRAILING_WHITESPACE_RE.sub(r'', asm)
+  return asm
+
+
+def main():
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.add_argument('-v', '--verbose', action='store_true',
+                      help='Show verbose output')
+  parser.add_argument('--llc-binary', default='llc',
+                      help='The "llc" binary to use to generate the test case')
+  parser.add_argument(
+      '--function', help='The function in the test file to update')
+  parser.add_argument('tests', nargs='+')
+  args = parser.parse_args()
+
+  run_line_re = re.compile('^\s*;\s*RUN:\s*(.*)$')
+  ir_function_re = re.compile('^\s*define\s+(?:internal\s+)?[^@]*@(\w+)\s*\(')
+  asm_function_re = re.compile(
+      r'^_?(?P<f>[^:]+):[ \t]*#+[ \t]*@(?P=f)\n[^:]*?'
+      r'(?P<body>^##?[ \t]+[^:]+:.*?)\s*'
+      r'^\s*(?:[^:\n]+?:\s*\n\s*\.size|\.cfi_endproc|\.globl|\.comm|\.(?:sub)?section)',
+      flags=(re.M | re.S))
+  check_prefix_re = re.compile('--check-prefix=(\S+)')
+  check_re = re.compile(r'^\s*;\s*([^:]+?)(?:-NEXT|-NOT|-DAG|-LABEL)?:')
+
+  for test in args.tests:
+    if args.verbose:
+      print >>sys.stderr, 'Scanning for RUN lines in test file: %s' % (test,)
+    with open(test) as f:
+      test_lines = [l.rstrip() for l in f]
+
+    run_lines = [m.group(1)
+                 for m in [run_line_re.match(l) for l in test_lines] if m]
+    if args.verbose:
+      print >>sys.stderr, 'Found %d RUN lines:' % (len(run_lines),)
+      for l in run_lines:
+        print >>sys.stderr, '  RUN: ' + l
+
+    checks = []
+    for l in run_lines:
+      (llc_cmd, filecheck_cmd) = tuple([cmd.strip() for cmd in l.split('|', 1)])
+      if not llc_cmd.startswith('llc '):
+        print >>sys.stderr, 'WARNING: Skipping non-llc RUN line: ' + l
+        continue
+
+      if not filecheck_cmd.startswith('FileCheck '):
+        print >>sys.stderr, 'WARNING: Skipping non-FileChecked RUN line: ' + l
+        continue
+
+      llc_cmd_args = llc_cmd[len('llc'):].strip()
+      llc_cmd_args = llc_cmd_args.replace('< %s', '').replace('%s', '').strip()
+
+      check_prefixes = [m.group(1)
+                        for m in check_prefix_re.finditer(filecheck_cmd)]
+      if not check_prefixes:
+        check_prefixes = ['CHECK']
+
+      # FIXME: We should use multiple check prefixes to common check lines. For
+      # now, we just ignore all but the last.
+      checks.append((check_prefixes, llc_cmd_args))
+
+    asm = {}
+    for prefixes, _ in checks:
+      for prefix in prefixes:
+        asm.update({prefix: dict()})
+    for prefixes, llc_args in checks:
+      if args.verbose:
+        print >>sys.stderr, 'Extracted LLC cmd: llc ' + llc_args
+        print >>sys.stderr, 'Extracted FileCheck prefixes: ' + str(prefixes)
+      raw_asm = llc(args, llc_args, test)
+      # Build up a dictionary of all the function bodies.
+      for m in asm_function_re.finditer(raw_asm):
+        if not m:
+          continue
+        f = m.group('f')
+        f_asm = scrub_asm(m.group('body'))
+        if f.startswith('stress'):
+          # We only use the last line of the asm for stress tests.
+          f_asm = '\n'.join(f_asm.splitlines()[-1:])
+        if args.verbose:
+          print >>sys.stderr, 'Processing asm for function: ' + f
+          for l in f_asm.splitlines():
+            print >>sys.stderr, '  ' + l
+        for prefix in prefixes:
+          if f in asm[prefix] and asm[prefix][f] != f_asm:
+            if prefix == prefixes[-1]:
+              print >>sys.stderr, ('WARNING: Found conflicting asm under the '
+                                   'same prefix!')
+            else:
+              asm[prefix][f] = None
+              continue
+
+          asm[prefix][f] = f_asm
+
+    is_in_function = False
+    is_in_function_start = False
+    prefix_set = set([prefix for prefixes, _ in checks for prefix in prefixes])
+    if args.verbose:
+      print >>sys.stderr, 'Rewriting FileCheck prefixes: %s' % (prefix_set,)
+    fixed_lines = []
+    for l in test_lines:
+      if is_in_function_start:
+        if l.lstrip().startswith(';'):
+          m = check_re.match(l)
+          if not m or m.group(1) not in prefix_set:
+            fixed_lines.append(l)
+            continue
+
+        # Print out the various check lines here
+        printed_prefixes = []
+        for prefixes, _ in checks:
+          for prefix in prefixes:
+            if prefix in printed_prefixes:
+              break
+            if not asm[prefix][name]:
+              continue
+            if len(printed_prefixes) != 0:
+              fixed_lines.append(';')
+            printed_prefixes.append(prefix)
+            fixed_lines.append('; %s-LABEL: %s:' % (prefix, name))
+            asm_lines = asm[prefix][name].splitlines()
+            fixed_lines.append('; %s:       %s' % (prefix, asm_lines[0]))
+            for asm_line in asm_lines[1:]:
+              fixed_lines.append('; %s-NEXT:  %s' % (prefix, asm_line))
+            break
+        is_in_function_start = False
+
+      if is_in_function:
+        # Skip any blank comment lines in the IR.
+        if l.strip() == ';':
+          continue
+        # And skip any CHECK lines. We'll build our own.
+        m = check_re.match(l)
+        if m and m.group(1) in prefix_set:
+          continue
+        # Collect the remaining lines in the function body and look for the end
+        # of the function.
+        fixed_lines.append(l)
+        if l.strip() == '}':
+          is_in_function = False
+        continue
+
+      fixed_lines.append(l)
+
+      m = ir_function_re.match(l)
+      if not m:
+        continue
+      name = m.group(1)
+      if args.function is not None and name != args.function:
+        # When filtering on a specific function, skip all others.
+        continue
+      is_in_function = is_in_function_start = True
+
+    if args.verbose:
+      print>>sys.stderr, 'Writing %d fixed lines to %s...' % (
+          len(fixed_lines), test)
+    with open(test, 'w') as f:
+      f.writelines([l + '\n' for l in fixed_lines])
+
+
+if __name__ == '__main__':
+  main()
diff --git a/utils/vim/llvm.vim b/utils/vim/llvm.vim
index e8273dd..913d0f5 100644
--- a/utils/vim/llvm.vim
+++ b/utils/vim/llvm.vim
@@ -41,29 +41,28 @@ syn keyword llvmKeyword alignstack alwaysinline appending arm_aapcs_vfpcc
 syn keyword llvmKeyword arm_aapcscc arm_apcscc asm atomic available_externally
 syn keyword llvmKeyword blockaddress byval c catch cc ccc cleanup coldcc common
 syn keyword llvmKeyword constant datalayout declare default define deplibs
-syn keyword llvmKeyword dllexport dllimport except extern_weak external fastcc
-syn keyword llvmKeyword filter gc global hidden initialexec inlinehint inreg
-syn keyword llvmKeyword intel_ocl_bicc inteldialect internal
-syn keyword llvmKeyword linkonce linkonce_odr
-syn keyword llvmKeyword localdynamic localexec minsize module monotonic
-syn keyword llvmKeyword msp430_intrcc naked nest noalias nocapture
-syn keyword llvmKeyword noimplicitfloat noinline nonlazybind noredzone noreturn
-syn keyword llvmKeyword nounwind optnone optsize personality private protected
-syn keyword llvmKeyword ptx_device ptx_kernel readnone readonly release
-syn keyword llvmKeyword returns_twice sanitize_thread sanitize_memory
-syn keyword llvmKeyword section seq_cst sideeffect signext singlethread
-syn keyword llvmKeyword spir_func spir_kernel sret ssp sspreq sspstrong
-syn keyword llvmKeyword tail target thread_local to triple unnamed_addr
-syn keyword llvmKeyword unordered uwtable volatile weak weak_odr
-syn keyword llvmKeyword x86_fastcallcc x86_stdcallcc x86_thiscallcc x86_64_sysvcc
-syn keyword llvmKeyword x86_64_win64cc zeroext
-syn keyword llvmKeyword uselistorder uselistorder_bb
+syn keyword llvmKeyword distinct dllexport dllimport except extern_weak external
+syn keyword llvmKeyword externally_initialized fastcc filter gc global hidden
+syn keyword llvmKeyword initialexec inlinehint inreg intel_ocl_bicc inteldialect
+syn keyword llvmKeyword internal linkonce linkonce_odr localdynamic localexec
+syn keyword llvmKeyword minsize module monotonic msp430_intrcc naked nest
+syn keyword llvmKeyword noalias nocapture noimplicitfloat noinline nonlazybind
+syn keyword llvmKeyword noredzone noreturn nounwind optnone optsize personality
+syn keyword llvmKeyword private protected ptx_device ptx_kernel readnone
+syn keyword llvmKeyword readonly release returns_twice sanitize_thread
+syn keyword llvmKeyword sanitize_memory section seq_cst sideeffect signext
+syn keyword llvmKeyword singlethread spir_func spir_kernel sret ssp sspreq
+syn keyword llvmKeyword sspstrong tail target thread_local to triple
+syn keyword llvmKeyword unnamed_addr unordered uwtable volatile weak weak_odr
+syn keyword llvmKeyword x86_fastcallcc x86_stdcallcc x86_thiscallcc
+syn keyword llvmKeyword x86_64_sysvcc x86_64_win64cc zeroext uselistorder
+syn keyword llvmKeyword uselistorder_bb
 
 " Obsolete keywords.
 syn keyword llvmError  getresult begin end
 
 " Misc syntax.
-syn match   llvmNoName /[%@]\d\+\>/
+syn match   llvmNoName /[%@!]\d\+\>/
 syn match   llvmNumber /-\?\<\d\+\>/
 syn match   llvmFloat  /-\?\<\d\+\.\d*\(e[+-]\d\+\)\?\>/
 syn match   llvmFloat  /\<0x\x\+\>/
@@ -74,6 +73,17 @@ syn region  llvmString start=/"/ skip=/\\"/ end=/"/
 syn match   llvmLabel /[-a-zA-Z$._][-a-zA-Z$._0-9]*:/
 syn match   llvmIdentifier /[%@][-a-zA-Z$._][-a-zA-Z$._0-9]*/
 
+" Named metadata and specialized metadata keywords.
+syn match   llvmIdentifier /![-a-zA-Z$._][-a-zA-Z$._0-9]*\ze\s*$/
+syn match   llvmIdentifier /![-a-zA-Z$._][-a-zA-Z$._0-9]*\ze\s*[=!]/
+syn match   llvmType /!\zs\a\+\ze\s*(/
+syn match   llvmConstant /\<DW_TAG_[a-z_]\+\>/
+syn match   llvmConstant /\<DW_ATE_[a-zA-Z_]\+\>/
+syn match   llvmConstant /\<DW_OP_[a-zA-Z0-9_]\+\>/
+syn match   llvmConstant /\<DW_LANG_[a-zA-Z0-9_]\+\>/
+syn match   llvmConstant /\<DW_VIRTUALITY_[a-z_]\+\>/
+syn match   llvmConstant /\<DIFlag[A-Za-z]\+\>/
+
 " Syntax-highlight dejagnu test commands.
 syn match  llvmSpecialComment /;\s*RUN:.*$/
 syn match  llvmSpecialComment /;\s*PR\d*\s*$/